diff --git a/.bazelrc b/.bazelrc
deleted file mode 100644
index d5d20309df82498a552df759e3d200a914a4cfb7..0000000000000000000000000000000000000000
--- a/.bazelrc
+++ /dev/null
@@ -1,88 +0,0 @@
-# Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
-# target CPU to build transient dependencies correctly. See
-# https://docs.bazel.build/versions/master/user-manual.html#flag--fat_apk_cpu
-build:android --crosstool_top=//external:android/crosstool
-build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
-build:android_arm --config=android
-build:android_arm --cpu=armeabi-v7a
-build:android_arm --fat_apk_cpu=armeabi-v7a
-build:android_arm64 --config=android
-build:android_arm64 --cpu=arm64-v8a
-build:android_arm64 --fat_apk_cpu=arm64-v8a
-
-# Config to use a mostly-static build and disable modular op registration
-# support (this will revert to loading TensorFlow with RTLD_GLOBAL in Python).
-# By default, TensorFlow will build with a dependence on
-# //tensorflow:libtensorflow_framework.so.
-build:monolithic --define framework_shared_object=false
-
-# For projects which use TensorFlow as part of a Bazel build process, putting
-# nothing in a bazelrc will default to a monolithic build. The following line
-# opts in to modular op registration support by default.
-build --define framework_shared_object=true
-
-# Please note that MKL on MacOS or windows is still not supported.
-# If you would like to use a local MKL instead of downloading, please set the
-# environment variable "TF_MKL_ROOT" every time before build.
-build:mkl --define=build_with_mkl=true --define=enable_mkl=true
-build:mkl -c opt
-
-# This config option is used to enable MKL-DNN open source library only,
-# without depending on MKL binary version.
-build:mkl_open_source_only --define=build_with_mkl_dnn_only=true 
-build:mkl_open_source_only --define=build_with_mkl=true --define=enable_mkl=true
-
-build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
-build:download_clang --define=using_clang=true
-# Instruct clang to use LLD for linking.
-# This only works with GPU builds currently, since Bazel sets -B/usr/bin in
-# auto-generated CPU crosstool, forcing /usr/bin/ld.lld to be preferred over
-# the downloaded one.
-build:download_clang_use_lld --linkopt='-fuse-ld=lld'
-
-build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
-build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
-
-build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain
-build:rocm --define=using_rocm=true --define=using_rocm_hipcc=true
-
-build:cuda_clang --crosstool_top=@local_config_cuda//crosstool:toolchain
-build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true --define=using_clang=true
-
-build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl --define=using_sycl=true --define=using_trisycl=false
-
-build:sycl_nodouble --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_nodouble --define=using_sycl=true --cxxopt -DTENSORFLOW_SYCL_NO_DOUBLE
-
-build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_asan --define=using_sycl=true --define=using_trisycl=false --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
-
-build:sycl_trisycl --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_trisycl --define=using_sycl=true --define=using_trisycl=true
-
-# Options extracted from configure script
-build:gdr --define=with_gdr_support=true
-build:ngraph --define=with_ngraph_support=true
-build:verbs --define=with_verbs_support=true
-
-build --define=use_fast_cpp_protos=true
-build --define=allow_oversize_protos=true
-build --define=grpc_no_ares=true
-
-build --spawn_strategy=standalone
-build --genrule_strategy=standalone
-build -c opt
-
-# Other build flags.
-build --define=grpc_no_ares=true
-
-# Modular TF build options
-build:dynamic_kernels --define=dynamic_loaded_kernels=true
-
-# Default paths for TF_SYSTEM_LIBS
-build --define=PREFIX=/usr
-build --define=LIBDIR=$(PREFIX)/lib
-build --define=INCLUDEDIR=$(PREFIX)/include
-
-# Do not commit the tf_configure.bazelrc line
diff --git a/.github/ISSUE_TEMPLATE/00-bug-performance-issue.md b/.github/ISSUE_TEMPLATE/00-bug-performance-issue.md
new file mode 100644
index 0000000000000000000000000000000000000000..34ba4cf96017bb0dc15e74eee5d6ce211cf1058d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/00-bug-performance-issue.md
@@ -0,0 +1,34 @@
+---
+name: Bug/Performance Issue
+about: Use this template for reporting a bug or a performance issue.
+
+---
+
+<em>Please make sure that this is a bug. As per our [GitHub Policy](https://github.com/tensorflow/tensorflow/blob/master/ISSUES.md), we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:bug_template</em>
+
+**System information**
+- Have I written custom code (as opposed to using a stock example script provided in TensorFlow):
+- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
+- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device:
+- TensorFlow installed from (source or binary):
+- TensorFlow version (use command below):
+- Python version:
+- Bazel version (if compiling from source):
+- GCC/Compiler version (if compiling from source):
+- CUDA/cuDNN version:
+- GPU model and memory:
+
+
+You can collect some of this information using our environment capture [script](https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh)
+You can also obtain the TensorFlow version with
+python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"
+
+**Describe the current behavior**
+
+**Describe the expected behavior**
+
+**Code to reproduce the issue**
+Provide a reproducible test case that is the bare minimum necessary to generate the problem.
+
+**Other info / logs**
+Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
diff --git a/.github/ISSUE_TEMPLATE/10-build-installation-issue.md b/.github/ISSUE_TEMPLATE/10-build-installation-issue.md
new file mode 100644
index 0000000000000000000000000000000000000000..99c2fe61271fb51cce8aaf94d06d9d4a633aede4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/10-build-installation-issue.md
@@ -0,0 +1,29 @@
+---
+name: Build/Installation Issue
+about: Use this template for build/installation issues
+
+---
+
+<em>Please make sure that this is a build/installation issue. As per our [GitHub Policy](https://github.com/tensorflow/tensorflow/blob/master/ISSUES.md), we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:build_template</em>
+
+**System information**
+- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
+- Mobile device (e.g. iPhone 8, Pixel 2, Samsung Galaxy) if the issue happens on mobile device:
+- TensorFlow installed from (source or binary):
+- TensorFlow version:
+- Python version:
+- Installed using virtualenv? pip? conda?:
+- Bazel version (if compiling from source):
+- GCC/Compiler version (if compiling from source):
+- CUDA/cuDNN version:
+- GPU model and memory:
+
+
+
+**Describe the problem**
+
+**Provide the exact sequence of commands / steps that you executed before running into the problem**
+
+
+**Any other info / logs**
+Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
diff --git a/.github/ISSUE_TEMPLATE/20-documentation-issue.md b/.github/ISSUE_TEMPLATE/20-documentation-issue.md
new file mode 100644
index 0000000000000000000000000000000000000000..7123ca6d6c507315dd3470e1813ac9dd17ba8fcd
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/20-documentation-issue.md
@@ -0,0 +1,17 @@
+---
+name: Documentation Issue
+about: Use this template for documentation related issues
+
+---
+
+<em>Please make sure that this is a documentation issue. As per our [GitHub Policy](https://github.com/tensorflow/tensorflow/blob/master/ISSUES.md), we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:doc_template</em>
+
+
+**System information**
+- TensorFlow version:
+- Doc Link:
+
+
+**Describe the documentation issue**
+
+**We welcome contributions by users. Will you be able to update submit a PR (use the [doc style guide](https://www.tensorflow.org/community/documentation)) to fix the doc Issue?**
diff --git a/.github/ISSUE_TEMPLATE/30-feature-request.md b/.github/ISSUE_TEMPLATE/30-feature-request.md
new file mode 100644
index 0000000000000000000000000000000000000000..71df2e5e49f9e42a23a8c453da5335cfbbbb6211
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/30-feature-request.md
@@ -0,0 +1,22 @@
+---
+name: Feature Request
+about: Use this template for raising a feature request
+
+---
+
+<em>Please make sure that this is a feature request. As per our [GitHub Policy](https://github.com/tensorflow/tensorflow/blob/master/ISSUES.md), we only address code/doc bugs, performance issues, feature requests and build/installation issues on GitHub. tag:feature_template</em>
+
+
+**System information**
+- TensorFlow version (you are using):
+- Are you willing to contribute it (Yes/No):
+
+
+
+**Describe the feature and the current behavior/state.**
+
+**Will this change the current api? How?**
+
+**Who will benefit with this feature?**
+
+**Any Other info.**
diff --git a/.github/ISSUE_TEMPLATE/40-tflite-op-request.md b/.github/ISSUE_TEMPLATE/40-tflite-op-request.md
new file mode 100644
index 0000000000000000000000000000000000000000..7b391279e479ade4ed5327728f19be8752e11507
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/40-tflite-op-request.md
@@ -0,0 +1,24 @@
+---
+name: TensorFlow Lite Op Request
+about: Use this template for reporting ops you are using or missing.
+
+---
+
+
+**System information**
+- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
+- TensorFlow installed from (source or binary):
+- TensorFlow version (or github SHA if from source):
+
+
+**Provide the text output from tflite_convert**
+
+```
+# Copy and paste here
+```
+
+Also, please include a link to a GraphDef or the model if possible.
+
+**Any other info / logs**
+
+Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
diff --git a/.github/ISSUE_TEMPLATE/50-other-issues.md b/.github/ISSUE_TEMPLATE/50-other-issues.md
new file mode 100644
index 0000000000000000000000000000000000000000..2d78d9818bb69ebc7b0807afe5297051494c991e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/50-other-issues.md
@@ -0,0 +1,13 @@
+---
+name: Other Issues
+about: Use this template for any other non-support related issues
+
+---
+
+This template is for miscellaneous issues not covered by the other issue categories.
+
+For questions on how to work with TensorFlow, or support for problems that are not verified bugs in TensorFlow, please go to [StackOverflow](https://stackoverflow.com/questions/tagged/tensorflow).
+
+If you are reporting a vulnerability, please use the [dedicated reporting process](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md).
+
+For high-level discussions about TensorFlow, please post to discuss@tensorflow.org, for questions about the development or internal workings of TensorFlow, or if you would like to know how to contribute to TensorFlow, please post to developers@tensorflow.org.
diff --git a/.gitignore b/.gitignore
index cb65f447d4a551266e237714a16d71b58bcfc51d..90324058600bee46af56e49028977971848a80de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 .DS_Store
 .ipynb_checkpoints
 node_modules
+/.bazelrc
 /.tf_configure.bazelrc
 /bazel-*
 /bazel_pip
@@ -23,10 +24,10 @@ Pods
 Podfile.lock
 *.pbxproj
 *.xcworkspacedata
-/tensorflow/contrib/lite/downloads/**
-/tensorflow/contrib/lite/gen/**
-/tensorflow/contrib/lite/examples/ios/simple/data/*.txt
-/tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
+/tensorflow/lite/tools/make/downloads/**
+/tensorflow/lite/gen/**
+/tensorflow/lite/examples/ios/simple/data/*.txt
+/tensorflow/lite/examples/ios/simple/data/*.tflite
 xcuserdata/**
 /api_init_files_list.txt
 /estimator_api_init_files_list.txt
diff --git a/BUILD b/BUILD
index 4bf647e47aa56cff0b3fd5af7d5df99d8b70549b..1200cf5f7103cad12ab9693c339c372f4f3bc0fb 100644
--- a/BUILD
+++ b/BUILD
@@ -2,5 +2,7 @@ exports_files(
     [
         "LICENSE",
         "ACKNOWLEDGEMENTS",
+        "configure",
+        "configure.py",
     ],
 )
diff --git a/CODEOWNERS b/CODEOWNERS
index 94cc865479cd6ab5cdb589490d3a2d650f06b160..cb3fa2312405ce44d5dfc30ea4164740f436e07e 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,6 +1,7 @@
 # Where component owners are known, add them here.
 
 /tenosrflow/core/debug @caisq
+/tensorflow/core/nccl/ @azaks2 @chsigg
 /tensorflow/core/platform/windows/ @mrry
 /tensorflow/core/platform/s3 @yongtang
 /tensorflow/go @asimshankar
@@ -46,18 +47,17 @@
 /tensorflow/contrib/losses/ @alextp @ispirmustafa
 /tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg
 /tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa
-/tensorflow/contrib/nccl/ @cwhipkey @zheng-xq
 /tensorflow/contrib/opt/ @strategist333 @alextp
 /tensorflow/contrib/pi_examples/ @maciekcc
 /tensorflow/contrib/quantization/ @petewarden
 /tensorflow/contrib/rnn/ @ebrevdo @scottzhu
-/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenl
+/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenlavoie
 /tensorflow/contrib/seq2seq/ @ebrevdo @lmthang
 /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
 /tensorflow/contrib/slim/ @sguada @thenbasilmanran
 /tensorflow/contrib/stateless/ @girving @alextp
 /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank
-/tensorflow/contrib/tensorrt/ @aaroey
+/tensorflow/contrib/tensorrt/ @aaroey @smit-hinsu @azaks2
 # NEED OWNER: /tensorflow/contrib/testing/
 /tensorflow/contrib/timeseries/ @allenlavoie
 /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu @sourabhbajaj
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 5fff9d05a1c589636bc9c711e6eb7cc4aba86b2f..a4647020ff76830badd75f3d3f76a41a637159bb 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -7,19 +7,22 @@ In the interest of fostering an open and welcoming environment, we as contributo
 
 Examples of behavior that contributes to creating a positive environment include:
 
-* Using welcoming and inclusive language
-* Being respectful of differing viewpoints and experiences
-* Gracefully accepting constructive criticism
-* Focusing on what is best for the community
-* Showing empathy towards other community members
+*   Using welcoming and inclusive language.
+*   Being respectful of differing viewpoints and experiences.
+*   Gracefully accepting constructive criticism.
+*   Focusing on what is best for the community.
+*   Showing empathy towards other community members.
 
 Examples of unacceptable behavior by participants include:
 
-* The use of sexualized language or imagery and unwelcome sexual attention or advances
-* Trolling, insulting/derogatory comments, and personal or political attacks
-* Public or private harassment
-* Publishing others' private information, such as a physical or electronic address, without explicit permission
-* Conduct which could reasonably be considered inappropriate for the forum in which it occurs. 
+*   The use of sexualized language or imagery and unwelcome sexual attention or
+    advances.
+*   Trolling, insulting/derogatory comments, and personal or political attacks.
+*   Public or private harassment.
+*   Publishing others' private information, such as a physical or electronic
+    address, without explicit permission.
+*   Conduct which could reasonably be considered inappropriate for the forum in
+    which it occurs.
 
 All TensorFlow forums and spaces are meant for professional interactions, and any behavior which could reasonably be considered inappropriate in a professional setting is unacceptable.
 
@@ -48,10 +51,12 @@ However, for the vast majority of issues, we aim to empower individuals to first
 
 If you are experiencing or witnessing conflict, we ask you to use the following escalation strategy to address the conflict:
 
-1. Address the perceived conflict directly with those involved, preferably in a real-time medium. 
-2. If this fails, get a third party (e.g. a mutual friend, and/or someone with background on the issue, but not involved in conflict) to intercede.
-3. If you are still unable to resolve the conflict, and you believe it rises to harassment or another code of conduct violation, report it.
-
+1.  Address the perceived conflict directly with those involved, preferably in a
+    real-time medium.
+2.  If this fails, get a third party (e.g. a mutual friend, and/or someone with
+    background on the issue, but not involved in the conflict) to intercede.
+3.  If you are still unable to resolve the conflict, and you believe it rises to
+    harassment or another code of conduct violation, report it.
 
 ## Reporting Violations
 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f598999f351c10f8bd01dfbd3ad8897f19d570e8..4a296f265f7b9521c46d350cec26ff199f43eb6c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -31,8 +31,12 @@ Follow either of the two links above to access the appropriate CLA and instructi
 If you have improvements to TensorFlow, send us your pull requests! For those
 just getting started, Github has a [howto](https://help.github.com/articles/using-pull-requests/).
 
-TensorFlow team members will be assigned to review your pull requests. Once the pull requests are approved and pass continuous integration checks, we will merge the pull requests.
-For some pull requests, we will apply the patch for each pull request to our internal version control system first, and export the change out as a new commit later, at which point the original pull request will be closed. The commits in the pull request will be squashed into a single commit with the pull request creator as the author. These pull requests will be labeled as pending merge internally.
+TensorFlow team members will be assigned to review your pull requests. Once the
+pull requests are approved and pass continuous integration checks, a TensorFlow
+team member will apply `ready to pull` label to your change. This means we are
+working on getting your pull request submitted to our internal repository. After
+the change has been submitted internally, your pull request will be merged
+automatically on GitHub.
 
 If you want to contribute but you're not sure where to start, take a look at the
 [issues with the "contributions welcome" label](https://github.com/tensorflow/tensorflow/labels/stat%3Acontributions%20welcome).
diff --git a/ISSUES.md b/ISSUES.md
new file mode 100644
index 0000000000000000000000000000000000000000..2b330e8e0a8a3f64753cfb7a2e2362222439312d
--- /dev/null
+++ b/ISSUES.md
@@ -0,0 +1,9 @@
+If you open a GitHub Issue, here is our policy: 1. It must be a bug/performance
+issue or a feature request or a build issue or a documentation issue (for small
+doc fixes please send a PR instead). 2. Make sure the Issue Template is filled
+out. 3. The issue should be related to the repo it is created in.
+
+**Here's why we have this policy:** We want to focus on the work that benefits
+the whole community, e.g., fixing bugs and adding features. Individual support
+should be seeked on StackOverflow or other non-GitHub channels. It helps us to
+address bugs and feature requests in a timely manner.
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 52faed9297cfcaf8c93bb9c79686c9258a53c560..b3d84ad8c948df9459a8e8afb029785d6f6ad335 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -29,9 +29,11 @@ You can collect some of this information using our environment capture script:
 
 https://github.com/tensorflow/tensorflow/tree/master/tools/tf_env_collect.sh
 
-You can obtain the TensorFlow version with
+You can obtain the TensorFlow version with:
 
+```bash
 python -c "import tensorflow as tf; print(tf.GIT_VERSION, tf.VERSION)"
+```
 
 ### Describe the problem
 Describe the problem clearly here. Be sure to convey here why it's a bug in TensorFlow or a feature request.
diff --git a/README.md b/README.md
index 57efb876c9afaf9fe76c4ced4e6a1572e9241edf..044174947a094d43a51f7140dd40ec0f17801d40 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,14 @@
 |-----------------|
 | [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) |
 
-**TensorFlow** is an open source software library for numerical computation using
-data flow graphs.  The graph nodes represent mathematical operations, while
+**TensorFlow** is an open source software library for numerical computation
+using data flow graphs. The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
-between them.  This flexible architecture enables you to deploy computation to one
-or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
-code.  TensorFlow also includes [TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard), a data visualization toolkit.
+between them. This flexible architecture enables you to deploy computation to
+one or more CPUs or GPUs in a desktop, server, or mobile device without
+rewriting code. TensorFlow also includes
+[TensorBoard](https://github.com/tensorflow/tensorboard), a data visualization
+toolkit.
 
 TensorFlow was originally developed by researchers and engineers
 working on the Google Brain team within Google's Machine Intelligence Research
@@ -29,7 +31,21 @@ subscribing to
 [announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce).
 
 ## Installation
-*See [Installing TensorFlow](https://www.tensorflow.org/install) for instructions on how to install our release binaries or how to build from source.*
+
+To install the current release for CPU-only:
+
+```
+pip install tensorflow
+```
+
+Use the GPU package for CUDA-enabled GPU cards:
+
+```
+pip install tensorflow-gpu
+```
+
+*See [Installing TensorFlow](https://www.tensorflow.org/install) for detailed
+instructions, and how to build from source.*
 
 People who are a little more adventurous can also try our nightly binaries:
 
@@ -65,9 +81,10 @@ guidelines](CONTRIBUTING.md). This project adheres to TensorFlow's
 uphold this code.**
 
 **We use [GitHub issues](https://github.com/tensorflow/tensorflow/issues) for
-tracking requests and bugs. So please see
-[TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss) for general questions
-and discussion, and please direct specific questions to [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
+tracking requests and bugs, so please see
+[TensorFlow Discuss](https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss)
+for general questions and discussion, and please direct specific questions to
+[Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).**
 
 The TensorFlow project strives to abide by generally accepted best practices in open-source software development:
 
@@ -93,25 +110,27 @@ The TensorFlow project strives to abide by generally accepted best practices in
 
 ### Community Supported Builds
 
-| Build Type      | Status | Artifacts |
-| ---             | ---    | ---       |
-| **IBM s390x**       | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | TBA |
-| **IBM ppc64le CPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/) | TBA |
-| **IBM ppc64le GPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_PPC64LE_GPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_PPC64LE_GPU/) | TBA |
-| **Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) |
-| **Linux CPU with Intel® MKL-DNN** Python 2.7<br> **Linux CPU with Intel® MKL-DNN** Python 3.5<br>  **Linux CPU with Intel® MKL-DNN** Python 3.6 | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild)|[1.10.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.10.0-cp27-cp27mu-linux_x86_64.whl)<br>[1.10.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.10.0-cp35-cp35m-linux_x86_64.whl)<br>[1.10.0 py3.6](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.10.0-cp36-cp36m-linux_x86_64.whl) |
-
+Build Type                                                                                                                                                                                      | Status                                                                                                                                                                                   | Artifacts
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
+**IBM s390x**                                                                                                                                                                                   | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/)                                                        | TBA
+**IBM ppc64le CPU**                                                                                                                                                                             | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/)                                    | TBA
+**IBM ppc64le GPU** Nightly                                                                                                                                                                     | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)            | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
+**IBM ppc64le GPU** Stable Release                                                                                                                                                              | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)
+**Linux CPU with Intel® MKL-DNN** Nightly                                                                                                                                                       | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/)                                | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/)
+**Linux CPU with Intel® MKL-DNN** Python 2.7<br> **Linux CPU with Intel® MKL-DNN** Python 3.4<br> **Linux CPU with Intel® MKL-DNN** Python 3.5<br> **Linux CPU with Intel® MKL-DNN** Python 3.6 | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.11.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp27-cp27mu-linux_x86_64.whl)<br>[1.11.0 py3.4](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp34-cp34m-linux_x86_64.whl)<br>[1.11.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp35-cp35m-linux_x86_64.whl)<br>[1.11.0 py3.6](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp36-cp36m-linux_x86_64.whl)
 
 ## For more information
-* [TensorFlow Website](https://www.tensorflow.org)
-* [TensorFlow Tutorials](https://www.tensorflow.org/tutorials/)
-* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
-* [TensorFlow Twitter](https://twitter.com/tensorflow)
-* [TensorFlow Blog](https://medium.com/tensorflow)
-* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
-* [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap)
-* [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
-* [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ)
+
+*   [TensorFlow Website](https://www.tensorflow.org)
+*   [TensorFlow Tutorials](https://www.tensorflow.org/tutorials/)
+*   [TensorFlow Model Zoo](https://github.com/tensorflow/models)
+*   [TensorFlow Twitter](https://twitter.com/tensorflow)
+*   [TensorFlow Blog](https://medium.com/tensorflow)
+*   [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
+*   [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap)
+*   [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
+*   [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ)
+*   [TensorFlow Visualization Toolkit](https://github.com/tensorflow/tensorboard)
 
 Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate.
 
diff --git a/RELEASE.md b/RELEASE.md
index 20e1d9217b7684e696d0abf427eef9ab9548d1b7..b13b071bd6cf4d3a260c8e248a67d23e1a688498 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,74 @@
+# Release 1.12.0
+
+## Major Features and Improvements
+
+*   Keras models can now be directly exported to the SavedModel
+    format(`tf.contrib.saved_model.save_keras_model()`) and used with Tensorflow
+    Serving.
+*   Keras models now support evaluating with a `tf.data.Dataset`.
+*   TensorFlow binaries are built with XLA support linked in by default.
+
+## Bug Fixes and Other Changes
+
+*   tf.data:
+    *   tf.data users can now represent, get, and set options of TensorFlow
+        input pipelines using `tf.data.Options()`, `tf.data.Dataset.options()`,
+        and `tf.data.Dataset.with_options()` respectively.
+    *   New `tf.data.Dataset.reduce()` API allows users to reduce a finite
+        dataset to a single element using a user-provided reduce function.
+    *   New `tf.data.Dataset.window()` API allows users to create finite windows
+        of input dataset; when combined with the `tf.data.Dataset.reduce()` API,
+        this allows users to implement customized batching.
+    *   All C++ code moves to the `tensorflow::data` namespace.
+    *   Add support for `num_parallel_calls` to `tf.data.Dataset.interleave`.
+*   `tf.contrib`:
+    *   Remove `tf.contrib.linalg`. `tf.linalg` should be used instead.
+    *   Replace any calls to `tf.contrib.get_signature_def_by_key(metagraph_def,
+        signature_def_key)` with
+        `meta_graph_def.signature_def[signature_def_key]`. Catching a ValueError
+        exception thrown by `tf.contrib.get_signature_def_by_key` should be
+        replaced by catching a KeyError exception.
+*   `tf.contrib.data`
+    *   Deprecate, and replace by tf.data.experimental.
+*   Other:
+    *   Instead of jemalloc, revert back to using system malloc since it
+        simplifies build and has comparable performance.
+    *   Remove integer types from `tf.nn.softplus` and `tf.nn.softsign` OpDefs.
+        This is a bugfix; these ops were never meant to support integers.
+    *   Allow subslicing Tensors with a single dimension.
+    *   Add option to calculate string length in Unicode characters
+    *   Add functionality to SubSlice a tensor.
+    *   Add searchsorted (ie lower/upper_bound) op.
+    *   Add model explainability to Boosted Trees.
+    *   Support negative positions for tf.substr
+    *   There was previously a bug in the bijector_impl where the
+        _reduce_jacobian_det_over_event does not handle scalar ILDJ
+        implementations properly.
+    *   In tf eager execution, allow re-entering a GradientTape context
+    *   Add tf_api_version flag. If --define=tf_api_version=2 flag is passed in,
+        then bazel will build TensorFlow API version 2.0. Note that TensorFlow
+        2.0 is under active development and has no guarantees at this point.
+    *   Add additional compression options to TfRecordWriter
+    *   Performance improvements for regex full match operations.
+    *   Replace tf.GraphKeys.VARIABLES with `tf.GraphKeys.GLOBAL_VARIABLES`
+    *   Remove unused dynamic learning rate support.
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+(David) Siu-Kei Muk, Ag Ramesh, Anton Dmitriev, Artem Sobolev, Avijit-Nervana,
+Bairen Yi, Bruno Goncalves, By Shen, candy.dc, Cheng Chen, Clayne Robison,
+coder3101, Dao Zhang, Elms, Fei Hu, feiquan, Geoffrey Irving, Guozhong Zhuang,
+hellcom, Hoeseong Kim, imsheridan, Jason Furmanek, Jason Zaman, Jenny Sahng,
+jiefangxuanyan, Johannes Bannhofer, Jonathan Homer, Koan-Sin Tan, kouml, Loo
+Rong Jie, Lukas Geiger, manipopopo, Ming Li, Moritz KröGer, Naurril, Niranjan
+Hasabnis, Pan Daoxin, Peng Yu, pengwa, rasmi, Roger Xin, Roland Fernandez, Sami
+Kama, Samuel Matzek, Sangjung Woo, Sergei Lebedev, Sergii Khomenko, shaohua,
+Shaohua Zhang, Shujian2015, Sunitha Kambhampati, tomguluson92, ViníCius Camargo,
+wangsiyu, weidankong, Wen-Heng (Jack) Chung, William D. Irons, Xin Jin, Yan
+Facai (颜发才), Yanbo Liang, Yash Katariya, Yong Tang, 在原佐为
+
 # Release 1.11.0
 
 ## Major Features and Improvements
@@ -20,51 +91,84 @@
 
 ## Bug Fixes and Other Changes
 
-* C++:
-  * Changed the signature of SessionFactory::NewSession so that it can return a meaningful error message on failure.
-* tf.data:
-  * Remove `num_parallel_parser_calls` argument from `tf.contrib.data.make_csv_dataset()`. [tf.data] Remove `num_parallel_parser_calls` argument from `tf.contrib.data.make_csv_dataset()`.
-  * `tf.data.Dataset.list_files()` raises an exception at initialization time if the argument matches no files.
-  * Renamed BigTable class to BigtableTable for clarity
-  * Document use of the Cloud Bigtable API
-  * Adding `tf.contrib.data.reduce_dataset` which can be used to reduce a dataset to a single element.
-  * Generalization of `tf.contrib.data.sliding_window_batch`.
-* INC:
-  * Runtime improvements to triangular solve.
-* `tf.contrib`:
-  * Add an `implementation` argument to `tf.keras.layers.LocallyConnected2D` and `tf.keras.layers.LocallyConnected1D`. The new mode (`implementation=2`) performs forward pass as a single dense matrix multiplication, allowing dramatic speedups in certain scenarios (but worse performance in others - see docstring). The option also allows to use `padding=same`.
-  * Add documentation clarifying the differences between tf.fill and tf.constant.
-  * Add experimental IndexedDatasets.
-  * Add selective registration target using the lite proto runtime.
-  * Add simple Tensor and DataType classes to TensorFlow Lite Java
-  * Add support for bitcasting to/from uint32 and uint64.
-  * Added a subclass of Estimator that can be created from a SavedModel (SavedModelEstimator).
-  * Adds leaf index modes as an argument.
-  * Allow a different output shape from the input in tf.contrib.image.transform.
-  * Change the state_size order of the StackedRNNCell to be natural order. To keep the existing behavior, user can add reverse_state_order=True when constructing the StackedRNNCells.
-  * Deprecate self.test_session() in favor of self.session() or self.cached_session().
-  * Directly import tensor.proto.h (the transitive import will be removed from tensor.h soon)
-  * Estimator.train() now supports tf.contrib.summary.\* summaries out of the box; each call to .train() will now create a separate tfevents file rather than re-using a shared one.
-  * Fix FTRL L2-shrinkage behavior: the gradient from the L2 shrinkage term should not end up in the accumulator.
-  * Fix toco compilation/execution on Windows
-  * GoogleZoneProvider class added to detect  which Google Cloud Engine zone tensorflow is running in.
-  * It is now safe to call any of the C API's TF_Delete\* functions on nullptr
-  * Log some errors on Android to logcat
-  * Match FakeQuant numerics in TFLite to improve accuracy of TFLite quantized inference models.
-  * Optional bucket location check for the GCS Filesystem.
-  * Performance enhancements for StringSplitOp & StringSplitV2Op.
-  * Performance improvements for regex replace operations.
-  * TFRecordWriter now raises an error if .write() fails.
-  * TPU: More helpful error messages in TPUClusterResolvers.
-  * The legacy_init_op argument to SavedModelBuilder methods for adding MetaGraphs has been deprecated. Please use the equivalent main_op argument instead. As part of this, we now explicitly check for a single main_op or legacy_init_op at the time of SavedModel building, whereas the check on main_op was previously only done at load time.
-  * The protocol used for Estimator training is now configurable in RunConfig.
-  * Triangular solve performance improvements.
-  * Unify RNN cell interface between TF and Keras. Add new get_initial_state() to Keras and TF RNN cell, which will use to replace the existing zero_state() method.
-  * Update initialization of variables in Keras.
-  * Updates to "constrained_optimization" in tensorflow/contrib.
-  * boosted trees: adding pruning mode
-  * tf.train.Checkpoint does not delete old checkpoints by default.
-  * tfdbg: Limit the total disk space occupied by dumped tensor data to 100 GBytes. Add environment variable `TFDBG_DISK_BYTES_LIMIT` to allow adjustment of this upper limit.
+*   C++:
+    *   Changed the signature of SessionFactory::NewSession so that it can
+        return a meaningful error message on failure.
+*   tf.data:
+    *   Remove `num_parallel_parser_calls` argument from
+        `tf.contrib.data.make_csv_dataset()`. [tf.data] Remove
+        `num_parallel_parser_calls` argument from
+        `tf.contrib.data.make_csv_dataset()`.
+    *   `tf.data.Dataset.list_files()` raises an exception at initialization
+        time if the argument matches no files.
+    *   Renamed BigTable class to BigtableTable for clarity
+    *   Document use of the Cloud Bigtable API
+    *   Add `tf.contrib.data.reduce_dataset` which can be used to reduce a
+        dataset to a single element.
+    *   Generalization of `tf.contrib.data.sliding_window_batch`.
+*   INC:
+    *   Runtime improvements to triangular solve.
+*   `tf.contrib`:
+    *   Add an `implementation` argument to `tf.keras.layers.LocallyConnected2D`
+        and `tf.keras.layers.LocallyConnected1D`. The new mode
+        (`implementation=2`) performs forward pass as a single dense matrix
+        multiplication, allowing dramatic speedups in certain scenarios (but
+        worse performance in others - see docstring). The option also allows to
+        use `padding=same`.
+    *   Add documentation clarifying the differences between tf.fill and
+        tf.constant.
+    *   Add experimental IndexedDatasets.
+    *   Add selective registration target using the lite proto runtime.
+    *   Add simple Tensor and DataType classes to TensorFlow Lite Java
+    *   Add support for bitcasting to/from uint32 and uint64.
+    *   Added a subclass of Estimator that can be created from a SavedModel
+        (SavedModelEstimator).
+    *   Adds leaf index modes as an argument.
+    *   Allow a different output shape from the input in
+        tf.contrib.image.transform.
+    *   Change the state_size order of the StackedRNNCell to be natural order.
+        To keep the existing behavior, user can add reverse_state_order=True
+        when constructing the StackedRNNCells.
+    *   Deprecate self.test_session() in favor of self.session() or
+        self.cached_session().
+    *   Directly import tensor.proto.h (the transitive import will be removed
+        from tensor.h soon)
+    *   Estimator.train() now supports tf.contrib.summary.\* summaries out of
+        the box; each call to .train() will now create a separate tfevents file
+        rather than re-using a shared one.
+    *   Fix FTRL L2-shrinkage behavior: the gradient from the L2 shrinkage term
+        should not end up in the accumulator.
+    *   Fix toco compilation/execution on Windows
+    *   GoogleZoneProvider class added to detect which Google Cloud Engine zone
+        tensorflow is running in.
+    *   It is now safe to call any of the C API's TF_Delete\* functions on
+        nullptr
+    *   Log some errors on Android to logcat
+    *   Match FakeQuant numerics in TFLite to improve accuracy of TFLite
+        quantized inference models.
+    *   Optional bucket location check for the GCS Filesystem.
+    *   Performance enhancements for StringSplitOp & StringSplitV2Op.
+    *   Performance improvements for regex replace operations.
+    *   TFRecordWriter now raises an error if .write() fails.
+    *   TPU: More helpful error messages in TPUClusterResolvers.
+    *   The legacy_init_op argument to SavedModelBuilder methods for adding
+        MetaGraphs has been deprecated. Please use the equivalent main_op
+        argument instead. As part of this, we now explicitly check for a single
+        main_op or legacy_init_op at the time of SavedModel building, whereas
+        the check on main_op was previously only done at load time.
+    *   The protocol used for Estimator training is now configurable in
+        RunConfig.
+    *   Triangular solve performance improvements.
+    *   Unify RNN cell interface between TF and Keras. Add new
+        get_initial_state() to Keras and TF RNN cell, which will use to replace
+        the existing zero_state() method.
+    *   Update initialization of variables in Keras.
+    *   Updates to "constrained_optimization" in tensorflow/contrib.
+    *   boosted trees: adding pruning mode
+    *   tf.train.Checkpoint does not delete old checkpoints by default.
+    *   tfdbg: Limit the total disk space occupied by dumped tensor data to 100
+        GBytes. Add environment variable `TFDBG_DISK_BYTES_LIMIT` to allow
+        adjustment of this upper limit.
 
 ## Thanks to our Contributors
 
@@ -154,8 +258,8 @@ Ag Ramesh, Alex Wiltschko, Alexander Pantyukhin, Amogh Mannekote, An Jiaoyang, A
 * Update `tf.keras` to the Keras 2.1.6 API.
 * Added [`tf.keras.layers.CuDNNGRU`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/keras/layers/CuDNNGRU) and [`tf.keras.layers.CuDNNLSTM`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/keras/layers/CuDNNLSTM) layers. [Try it](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb?linkId=53292082).
 * Adding support of core [feature columns](https://www.tensorflow.org/get_started/feature_columns) and [losses](https://www.tensorflow.org/api_docs/python/tf/losses) to [gradient boosted trees estimators](https://github.com/tensorflow/models/tree/master/official/boosted_trees).
-* The [python interface](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/contrib/lite)
-  for the [TFLite Optimizing Converter](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/toco/README.md)
+* The [python interface](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/lite)
+  for the [TFLite Optimizing Converter](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/toco/README.md)
   has been expanded, and the command line interface (AKA: `toco`, `tflite_convert`) is once again
   included in the standard `pip` installation.
 * Improved data-loading and text processing with:
@@ -458,7 +562,7 @@ Yoni Tsafir, yordun, Yuan (Terry) Tang, Yuxin Wu, zhengdi, Zhengsheng Wei, 田
 ## Major Features And Improvements
 * [Eager execution](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/eager)
   preview version is now available.
-* [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/contrib/lite)
+* [TensorFlow Lite](https://github.com/tensorflow/tensorflow/tree/r1.5/tensorflow/lite)
   dev preview is now available.
 * CUDA 9.0 and cuDNN 7 support.
 * Accelerated Linear Algebra (XLA):
@@ -805,7 +909,7 @@ See also [TensorBoard 0.1.4](https://github.com/tensorflow/tensorboard/releases/
 * Adds tf.contrib.nn.rank_sampled_softmax_loss, a sampled-softmax variant that can improve rank loss.
 * `tf.contrib.metrics`.{streaming_covariance,streaming_pearson_correlation} modified to return nan when they have seen less or equal to 1 unit of weight.
 * Adds time series models to contrib. See contrib/timeseries/README.md for details.
-* Adds FULLY_CONNECTED Op to tensorflow/contrib/lite/schema.fbs
+* Adds FULLY_CONNECTED Op to tensorflow/lite/schema.fbs
 
 ## Known Issues
 * Tensorflow_gpu compilation fails with Bazel 0.5.3.
diff --git a/WORKSPACE b/WORKSPACE
index 17961829a605c2d1f2d2ba86a7c30c47618c139b..7cc08e0164a202581ad7ebbe107a9e19410e70e4 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,5 +1,7 @@
 workspace(name = "org_tensorflow")
 
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
 http_archive(
     name = "io_bazel_rules_closure",
     sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
@@ -14,6 +16,33 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 
 closure_repositories()
 
+http_archive(
+    name = "base_images_docker",
+    sha256 = "e2b1b7254270bb7605e814a9dbf6d1e4ae04a11136ff1714fbfdabe3f87f7cf9",
+    strip_prefix = "base-images-docker-12801524f867e657fbb5d1a74f31618aff181ac6",
+    urls = ["https://github.com/GoogleCloudPlatform/base-images-docker/archive/12801524f867e657fbb5d1a74f31618aff181ac6.tar.gz"],
+)
+
+http_archive(
+    name = "bazel_toolchains",
+    sha256 = "15b5858b1b5541ec44df31b94c3b8672815b31d71215a98398761ea9f4c4eedb",
+    strip_prefix = "bazel-toolchains-6200b238c9c2d137c0d9a7262c80cc71d98e692b",
+    urls = [
+        "https://github.com/bazelbuild/bazel-toolchains/archive/6200b238c9c2d137c0d9a7262c80cc71d98e692b.tar.gz",
+    ],
+)
+
+http_archive(
+    name = "io_bazel_rules_docker",
+    sha256 = "29d109605e0d6f9c892584f07275b8c9260803bf0c6fcb7de2623b2bedc910bd",
+    strip_prefix = "rules_docker-0.5.1",
+    urls = ["https://github.com/bazelbuild/rules_docker/archive/v0.5.1.tar.gz"],
+)
+
+load("//third_party/toolchains/preconfig/generate:workspace.bzl", "remote_config_workspace")
+
+remote_config_workspace()
+
 # We must check the bazel version before trying to parse any other BUILD
 # files, in case the parsing of those build files depends on the bazel
 # version we require here.
@@ -30,9 +59,9 @@ android_workspace()
 # Please add all new TensorFlow dependencies in workspace.bzl.
 tf_workspace()
 
-new_http_archive(
+http_archive(
     name = "inception_v1",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "7efe12a8363f09bc24d7b7a450304a15655a57a7751929b2c1593a71183bb105",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip",
@@ -40,9 +69,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "mobile_ssd",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "bddd81ea5c80a97adfac1c9f770e6f55cbafd7cce4d3bbe15fbeb041e6b8f3e8",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip",
@@ -50,9 +79,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "mobile_multibox",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip",
@@ -60,9 +89,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "stylize",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip",
@@ -70,12 +99,13 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "speech_commands",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "c3ec4fea3158eb111f1d932336351edfe8bd515bb6e87aad4f25dbad0a600d0c",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip",
         "http://download.tensorflow.org/models/speech_commands_v0.01.zip",
     ],
 )
+
diff --git a/configure.py b/configure.py
index a88fdb3555531a13300a0aabe36e2cc65a969daa..6c905a0be3d685b5921dfbc5bddfbe6471a82625 100644
--- a/configure.py
+++ b/configure.py
@@ -35,7 +35,6 @@ except ImportError:
 
 _DEFAULT_CUDA_VERSION = '9.0'
 _DEFAULT_CUDNN_VERSION = '7'
-_DEFAULT_NCCL_VERSION = '2.2'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
@@ -44,7 +43,7 @@ _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
 _DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
-_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16]
+_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16, 17, 18]
 
 _DEFAULT_PROMPT_ASK_ATTEMPTS = 10
 
@@ -239,6 +238,13 @@ def setup_python(environ_cp):
   write_to_bazelrc('build --python_path=\"%s"' % python_bin_path)
   environ_cp['PYTHON_BIN_PATH'] = python_bin_path
 
+  # If choosen python_lib_path is from a path specified in the PYTHONPATH
+  # variable, need to tell bazel to include PYTHONPATH
+  if environ_cp.get('PYTHONPATH'):
+    python_paths = environ_cp.get('PYTHONPATH').split(':')
+    if python_lib_path in python_paths:
+      write_action_env_to_bazelrc('PYTHONPATH', environ_cp.get('PYTHONPATH'))
+
   # Write tools/python_bin_path.sh
   with open(
       os.path.join(_TF_WORKSPACE_ROOT, 'tools', 'python_bin_path.sh'),
@@ -384,7 +390,9 @@ def set_build_var(environ_cp,
   var = str(int(get_var(environ_cp, var_name, query_item, enabled_by_default)))
   environ_cp[var_name] = var
   if var == '1':
-    write_to_bazelrc('build --define %s=true' % option_name)
+    write_to_bazelrc(
+        'build:%s --define %s=true' % (bazel_config_name, option_name))
+    write_to_bazelrc('build --config=%s' % bazel_config_name)
   elif bazel_config_name is not None:
     # TODO(mikecase): Migrate all users of configure.py to use --config Bazel
     # options and not to set build configs through environment variables.
@@ -444,11 +452,12 @@ def convert_version_to_int(version):
   return int(version_str)
 
 
-def check_bazel_version(min_version):
-  """Check installed bazel version is at least min_version.
+def check_bazel_version(min_version, max_version):
+  """Check installed bazel version is between min_version and max_version.
 
   Args:
     min_version: string for minimum bazel version.
+    max_version: string for maximum bazel version.
 
   Returns:
     The bazel version detected.
@@ -466,6 +475,7 @@ def check_bazel_version(min_version):
 
   min_version_int = convert_version_to_int(min_version)
   curr_version_int = convert_version_to_int(curr_version)
+  max_version_int = convert_version_to_int(max_version)
 
   # Check if current bazel version can be detected properly.
   if not curr_version_int:
@@ -479,6 +489,10 @@ def check_bazel_version(min_version):
     print('Please upgrade your bazel installation to version %s or higher to '
           'build TensorFlow!' % min_version)
     sys.exit(0)
+  if curr_version_int > max_version_int:
+    print('Please downgrade your bazel installation to version %s or lower to '
+          'build TensorFlow!' % max_version)
+    sys.exit(0)
   return curr_version
 
 
@@ -496,7 +510,7 @@ def set_cc_opt_flags(environ_cp):
   elif is_windows():
     default_cc_opt_flags = '/arch:AVX'
   else:
-    default_cc_opt_flags = '-march=native'
+    default_cc_opt_flags = '-march=native -Wno-sign-compare'
   question = ('Please specify optimization flags to use during compilation when'
               ' bazel option "--config=opt" is specified [Default is %s]: '
              ) % default_cc_opt_flags
@@ -858,7 +872,7 @@ def set_tf_cuda_version(environ_cp):
     cuda_toolkit_paths_full = [
         os.path.join(cuda_toolkit_path, x) for x in cuda_rt_lib_paths
     ]
-    if any([os.path.exists(x) for x in cuda_toolkit_paths_full]):
+    if any(os.path.exists(x) for x in cuda_toolkit_paths_full):
       break
 
     # Reset and retry
@@ -1109,18 +1123,17 @@ def set_tf_nccl_install_path(environ_cp):
     raise ValueError('Currently NCCL is only supported on Linux platforms.')
 
   ask_nccl_version = (
-      'Please specify the NCCL version you want to use. If NCCL %s is not '
-      'installed, then you can use version 1.3 that can be fetched '
-      'automatically but it may have worse performance with multiple GPUs. '
-      '[Default is %s]: ') % (_DEFAULT_NCCL_VERSION, _DEFAULT_NCCL_VERSION)
+      'Please specify the locally installed NCCL version you want to use. '
+      '[Default is to use https://github.com/nvidia/nccl]: ')
 
   for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
     tf_nccl_version = get_from_env_or_user_or_default(
-        environ_cp, 'TF_NCCL_VERSION', ask_nccl_version, _DEFAULT_NCCL_VERSION)
-    tf_nccl_version = reformat_version_sequence(str(tf_nccl_version), 1)
+        environ_cp, 'TF_NCCL_VERSION', ask_nccl_version, '')
 
-    if tf_nccl_version == '1':
-      break  # No need to get install path, NCCL 1 is a GitHub repo.
+    if not tf_nccl_version:
+      break  # No need to get install path, building the open source code.
+
+    tf_nccl_version = reformat_version_sequence(str(tf_nccl_version), 1)
 
     # Look with ldconfig first if we can find the library in paths
     # like /usr/lib/x86_64-linux-gnu and the header file in the corresponding
@@ -1182,6 +1195,7 @@ def set_tf_nccl_install_path(environ_cp):
       if is_windows() or is_cygwin():
         nccl_install_path = cygpath(nccl_install_path)
 
+      nccl_lib_path = ''
       if is_windows():
         nccl_lib_path = 'lib/x64/nccl.lib'
       elif is_linux():
@@ -1232,7 +1246,6 @@ def set_tf_nccl_install_path(environ_cp):
   environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
   write_action_env_to_bazelrc('TF_NCCL_VERSION', tf_nccl_version)
 
-
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1418,11 +1431,16 @@ def set_mpi_home(environ_cp):
   def valid_mpi_path(mpi_home):
     exists = (
         os.path.exists(os.path.join(mpi_home, 'include')) and
-        os.path.exists(os.path.join(mpi_home, 'lib')))
+        (os.path.exists(os.path.join(mpi_home, 'lib')) or
+         os.path.exists(os.path.join(mpi_home, 'lib64')) or
+         os.path.exists(os.path.join(mpi_home, 'lib32'))))
     if not exists:
-      print('Invalid path to the MPI Toolkit. %s or %s cannot be found' %
-            (os.path.join(mpi_home, 'include'),
-             os.path.exists(os.path.join(mpi_home, 'lib'))))
+      print(
+          'Invalid path to the MPI Toolkit. %s or %s or %s or %s cannot be found'
+          % (os.path.join(mpi_home, 'include'),
+             os.path.exists(os.path.join(mpi_home, 'lib')),
+             os.path.exists(os.path.join(mpi_home, 'lib64')),
+             os.path.exists(os.path.join(mpi_home, 'lib32'))))
     return exists
 
   _ = prompt_loop_or_load_from_env(
@@ -1463,8 +1481,17 @@ def set_other_mpi_vars(environ_cp):
   if os.path.exists(os.path.join(mpi_home, 'lib/libmpi.so')):
     symlink_force(
         os.path.join(mpi_home, 'lib/libmpi.so'), 'third_party/mpi/libmpi.so')
+  elif os.path.exists(os.path.join(mpi_home, 'lib64/libmpi.so')):
+    symlink_force(
+        os.path.join(mpi_home, 'lib64/libmpi.so'), 'third_party/mpi/libmpi.so')
+  elif os.path.exists(os.path.join(mpi_home, 'lib32/libmpi.so')):
+    symlink_force(
+        os.path.join(mpi_home, 'lib32/libmpi.so'), 'third_party/mpi/libmpi.so')
+
   else:
-    raise ValueError('Cannot find the MPI library file in %s/lib' % mpi_home)
+    raise ValueError(
+        'Cannot find the MPI library file in %s/lib or %s/lib64 or %s/lib32' %
+        mpi_home, mpi_home, mpi_home)
 
 
 def set_system_libs_flag(environ_cp):
@@ -1499,14 +1526,6 @@ def set_windows_build_flags(environ_cp):
   # TODO(pcloudy): Remove this flag when upgrading Bazel to 0.16.0
   # Short object file path will be enabled by default.
   write_to_bazelrc('build --experimental_shortened_obj_file_path=true')
-  # When building zip file for some py_binary and py_test targets, don't
-  # include its dependencies. This is for:
-  #   1. Running python tests against the system installed TF pip package.
-  #   2. Avoiding redundant files in
-  #      //tensorflow/tools/pip_package:simple_console_windows,
-  #      which is a py_binary used during creating TF pip package.
-  #      See https://github.com/tensorflow/tensorflow/issues/22390
-  write_to_bazelrc('build --define=no_tensorflow_py_deps=true')
 
   if get_var(
       environ_cp, 'TF_OVERRIDE_EIGEN_STRONG_INLINE', 'Eigen strong inline',
@@ -1546,9 +1565,12 @@ def main():
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.15.0')
+  check_bazel_version('0.15.0', '0.20.0')
 
   reset_tf_configure_bazelrc()
+  # Explicitly import tools/bazel.rc, this is needed for Bazel 0.19.0 or later
+  write_to_bazelrc('import %workspace%/tools/bazel.rc')
+
   cleanup_makefile()
   setup_python(environ_cp)
 
@@ -1561,13 +1583,11 @@ def main():
     # TODO(ibiryukov): Investigate using clang as a cpu or cuda compiler on
     # Windows.
     environ_cp['TF_DOWNLOAD_CLANG'] = '0'
-    environ_cp['TF_ENABLE_XLA'] = '0'
     environ_cp['TF_NEED_MPI'] = '0'
     environ_cp['TF_SET_ANDROID_WORKSPACE'] = '0'
 
   if is_macos():
     environ_cp['TF_NEED_TENSORRT'] = '0'
-    environ_cp['TF_ENABLE_XLA'] = '0'
 
   # The numpy package on ppc64le uses OpenBLAS which has multi-threading
   # issues that lead to incorrect answers.  Set OMP_NUM_THREADS=1 at
@@ -1576,10 +1596,9 @@ def main():
   if is_ppc64le():
     write_action_env_to_bazelrc('OMP_NUM_THREADS', 1)
 
-  set_build_var(environ_cp, 'TF_NEED_IGNITE', 'Apache Ignite',
-                'with_ignite_support', True, 'ignite')
+  xla_enabled_by_default = is_linux()
   set_build_var(environ_cp, 'TF_ENABLE_XLA', 'XLA JIT', 'with_xla_support',
-                True, 'xla')
+                xla_enabled_by_default, 'xla')
 
   set_action_env_var(environ_cp, 'TF_NEED_OPENCL_SYCL', 'OpenCL SYCL', False)
   if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1':
@@ -1671,18 +1690,24 @@ def main():
     create_android_ndk_rule(environ_cp)
     create_android_sdk_rule(environ_cp)
 
-  # On Windows, we don't have MKL support and the build is always monolithic.
-  # So no need to print the following message.
-  # TODO(pcloudy): remove the following if check when they make sense on Windows
-  if not is_windows():
-    print('Preconfigured Bazel build configs. You can use any of the below by '
-          'adding "--config=<>" to your build command. See .bazelrc for more '
-          'details.')
-    config_info_line('mkl', 'Build with MKL support.')
-    config_info_line('monolithic', 'Config for mostly static monolithic build.')
-    config_info_line('gdr', 'Build with GDR support.')
-    config_info_line('verbs', 'Build with libverbs support.')
-    config_info_line('ngraph', 'Build with Intel nGraph support.')
+  print('Preconfigured Bazel build configs. You can use any of the below by '
+        'adding "--config=<>" to your build command. See .bazelrc for more '
+        'details.')
+  config_info_line('mkl', 'Build with MKL support.')
+  config_info_line('monolithic', 'Config for mostly static monolithic build.')
+  config_info_line('gdr', 'Build with GDR support.')
+  config_info_line('verbs', 'Build with libverbs support.')
+  config_info_line('ngraph', 'Build with Intel nGraph support.')
+  config_info_line('dynamic_kernels',
+                   '(Experimental) Build kernels into separate shared objects.')
+
+  print('Preconfigured Bazel build configs to DISABLE default on features:')
+  config_info_line('noaws', 'Disable AWS S3 filesystem support.')
+  config_info_line('nogcp', 'Disable GCP support.')
+  config_info_line('nohdfs', 'Disable HDFS support.')
+  config_info_line('noignite', 'Disable Apacha Ignite support.')
+  config_info_line('nokafka', 'Disable Apache Kafka support.')
+  config_info_line('nonccl', 'Disable NVIDIA NCCL support.')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 9b62a504525d5377d4836e92bdf0e46f7fc3ef38..fd4b94202aad24a82abef8abd16431f61a8326f0 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -43,6 +43,11 @@ TENSORFLOW_API_INIT_FILES_V2 = (
     TENSORFLOW_API_INIT_FILES + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
 )
 
+# @unused
+TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT = (
+    TENSORFLOW_API_INIT_FILES_V1 + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
+)
+
 # Config setting used when building for products
 # which requires restricted licenses to be avoided.
 config_setting(
@@ -209,12 +214,46 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+# Features that are default ON are handled differently below.
+#
+config_setting(
+    name = "no_aws_support",
+    define_values = {"no_aws_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "no_gcp_support",
+    define_values = {"no_gcp_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "no_hdfs_support",
+    define_values = {"no_hdfs_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "no_ignite_support",
+    define_values = {"no_ignite_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
-    name = "with_ignite_support",
-    define_values = {"with_ignite_support": "true"},
+    name = "no_kafka_support",
+    define_values = {"no_kafka_support": "true"},
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "no_nccl_support",
+    define_values = {"no_nccl_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
+# Crosses between platforms and file system libraries not supported on those
+# platforms due to limitations in nested select() statements.
 config_setting(
     name = "with_cuda_support_windows_override",
     define_values = {"using_cuda_nvcc": "true"},
@@ -322,8 +361,9 @@ package_group(
         "-//third_party/tensorflow/python/estimator",
         "//learning/meta_rank/...",
         "//tensorflow/...",
-        "//tensorflow_estimator/...",
+        "//tensorflow_estimator/contrib/...",
         "//tensorflow_fold/llgtm/...",
+        "//tensorflow_text/...",
         "//third_party/py/tensor2tensor/...",
     ],
 )
@@ -525,35 +565,45 @@ genrule(
     }),
     outs = ["__init__.py"],
     cmd = select({
-        "api_version_2": "cp $(@D)/_api/v2/__init__.py $(OUTS)",
-        "//conditions:default": "cp $(@D)/_api/v1/__init__.py $(OUTS)",
+        "api_version_2": "cp $(@D)/_api/v2/v2.py $(OUTS)",
+        "//conditions:default": "cp $(@D)/_api/v1/v1.py $(OUTS)",
     }),
 )
 
 gen_api_init_files(
     name = "tf_python_api_gen_v1",
-    srcs = ["api_template.__init__.py"],
+    srcs = [
+        "api_template_v1.__init__.py",
+        "compat_template_v1.__init__.py",
+    ],
     api_version = 1,
+    compat_api_versions = [1],
+    compat_init_templates = ["compat_template_v1.__init__.py"],
     output_dir = "_api/v1/",
-    output_files = TENSORFLOW_API_INIT_FILES_V1,
+    output_files = TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT,
     output_package = "tensorflow._api.v1",
-    root_init_template = "api_template.__init__.py",
+    root_file_name = "v1.py",
+    root_init_template = "api_template_v1.__init__.py",
 )
 
 gen_api_init_files(
     name = "tf_python_api_gen_v2",
-    srcs = ["api_template.__init__.py"],
+    srcs = [
+        "api_template.__init__.py",
+        "compat_template_v1.__init__.py",
+    ],
     api_version = 2,
     compat_api_versions = [1],
+    compat_init_templates = ["compat_template_v1.__init__.py"],
     output_dir = "_api/v2/",
     output_files = TENSORFLOW_API_INIT_FILES_V2,
     output_package = "tensorflow._api.v2",
+    root_file_name = "v2.py",
     root_init_template = "api_template.__init__.py",
 )
 
 py_library(
     name = "tensorflow_py",
-    srcs = ["//tensorflow/python/estimator/api:estimator_python_api_gen"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 2de740e145f93b151faf5c987808dbdf73fb4fd7..d81cf067eb07e88e2b8a86cf5643674235eb3f3b 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -21,41 +21,24 @@ from __future__ import print_function as _print_function
 import os as _os
 
 # pylint: disable=g-bad-import-order
-from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
-
-try:
-  # Add `estimator` attribute to allow access to estimator APIs via
-  # "tf.estimator..."
-  from tensorflow.python.estimator.api import estimator  # pylint: disable=g-import-not-at-top
-
-  # Add `estimator` to the __path__ to allow "from tensorflow.estimator..."
-  # style imports.
-  from tensorflow.python.estimator import api as estimator_api  # pylint: disable=g-import-not-at-top
-  __path__ += [_os.path.dirname(estimator_api.__file__)]
-  del estimator_api
-except (ImportError, AttributeError):
-  print('tf.estimator package not installed.')
+from tensorflow.python.tools import component_api_helper as _component_api_helper
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
 
 # API IMPORTS PLACEHOLDER
 
-from tensorflow.python.util.lazy_loader import LazyLoader  # pylint: disable=g-import-not-at-top
-contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
-del LazyLoader
-# The templated code that replaces the placeholder above sometimes
-# sets the __all__ variable. If it does, we have to be sure to add
-# "contrib".
-if '__all__' in vars():
-  vars()['__all__'].append('contrib')
-
-from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
-app.flags = flags  # pylint: disable=undefined-variable
-
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(app.__file__))  # pylint: disable=undefined-variable
+# We're using bitwise, but there's nothing special about that.
+_tf_api_dir = _os.path.dirname(_os.path.dirname(bitwise.__file__))  # pylint: disable=undefined-variable
 if _tf_api_dir not in __path__:
   __path__.append(_tf_api_dir)
 
+# Enable TF2 behaviors
+from tensorflow.python.compat import compat as _compat  # pylint: disable=g-import-not-at-top
+_compat.enable_v2_behavior()
+
 # These symbols appear because we import the python package which
 # in turn imports from tensorflow.core and tensorflow.python. They
 # must come from this module. So python adds these symbols for the
@@ -66,7 +49,14 @@ try:
   del core
 except NameError:
   # Don't fail if these modules are not available.
-  # For e.g. we are using this file for compat.v1 module as well and
-  # 'python', 'core' directories are not under compat/v1.
+  # For e.g. this file will be originally placed under tensorflow/_api/v1 which
+  # does not have 'python', 'core' directories. Then, it will be copied
+  # to tensorflow/ which does have these two directories.
+  pass
+# Similarly for compiler. Do it separately to make sure we do this even if the
+# others don't exist.
+try:
+  del compiler
+except NameError:
   pass
 # pylint: enable=undefined-variable
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..65bdb6cb1b5e6fb0656a12b932d767aeacfccd29
--- /dev/null
+++ b/tensorflow/api_template_v1.__init__.py
@@ -0,0 +1,72 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bring in all of the public TensorFlow interface into this module."""
+
+from __future__ import absolute_import as _absolute_import
+from __future__ import division as _division
+from __future__ import print_function as _print_function
+
+import os as _os
+
+# pylint: disable=g-bad-import-order
+from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
+
+from tensorflow.python.tools import component_api_helper as _component_api_helper
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
+
+# API IMPORTS PLACEHOLDER
+
+from tensorflow.python.util.lazy_loader import LazyLoader  # pylint: disable=g-import-not-at-top
+contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
+del LazyLoader
+# The templated code that replaces the placeholder above sometimes
+# sets the __all__ variable. If it does, we have to be sure to add
+# "contrib".
+if '__all__' in vars():
+  vars()['__all__'].append('contrib')
+
+from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+app.flags = flags  # pylint: disable=undefined-variable
+
+# Make sure directory containing top level submodules is in
+# the __path__ so that "from tensorflow.foo import bar" works.
+_tf_api_dir = _os.path.dirname(_os.path.dirname(app.__file__))  # pylint: disable=undefined-variable
+if _tf_api_dir not in __path__:
+  __path__.append(_tf_api_dir)
+
+
+# These symbols appear because we import the python package which
+# in turn imports from tensorflow.core and tensorflow.python. They
+# must come from this module. So python adds these symbols for the
+# resolution to succeed.
+# pylint: disable=undefined-variable
+try:
+  del python
+  del core
+except NameError:
+  # Don't fail if these modules are not available.
+  # For e.g. this file will be originally placed under tensorflow/_api/v1 which
+  # does not have 'python', 'core' directories. Then, it will be copied
+  # to tensorflow/ which does have these two directories.
+  pass
+# Similarly for compiler. Do it separately to make sure we do this even if the
+# others don't exist.
+try:
+  del compiler
+except NameError:
+  pass
+# pylint: enable=undefined-variable
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 17e2e292eb19029d279bc12a8328edadf96f1bb8..25df970ecab0757f23465ab19e7f45de0c759458 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -6,11 +6,12 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
-    "tf_cuda_cc_test",
     "tf_copts",
     "tf_cuda_library",
     "tf_custom_op_library",
+    "tf_kernel_library",
 )
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 
 # -----------------------------------------------------------------------------
 # Public targets
@@ -59,6 +60,7 @@ tf_cuda_library(
             "//tensorflow/core:framework",
             "//tensorflow/core:lib",
             "//tensorflow/core:op_gen_lib",
+            "//tensorflow/core/distributed_runtime:server_lib",
         ],
     }),
 )
@@ -94,6 +96,7 @@ tf_cuda_library(
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
+            "//tensorflow/core/distributed_runtime:server_lib",
         ],
     }) + select({
         "//tensorflow:with_xla_support": [
@@ -118,13 +121,15 @@ tf_cuda_library(
         ":c_api",
         ":c_api_internal",
         "//tensorflow/c/eager:c_api",
-        "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
+        "//tensorflow/c/eager:c_api_internal",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/contrib/tpu:all_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/common_runtime/eager:attr_builder",
     ],
 )
 
@@ -170,6 +175,60 @@ tf_cuda_library(
     ],
 )
 
+tf_cuda_library(
+    name = "env",
+    srcs = [
+        "env.cc",
+    ],
+    hdrs = [
+        "env.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:android": [
+            ":c_api",
+            ":tf_status_helper",
+            "//tensorflow/core:android_tensorflow_lib_lite",
+            "//tensorflow/core:platform_env",
+            "//tensorflow/core:lib",
+        ],
+        "//conditions:default": [
+            ":c_api",
+            ":tf_status_helper",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:platform_env",
+            "//tensorflow/core:lib",
+        ],
+    }) + [":c_api_internal"],
+)
+
+tf_cuda_library(
+    name = "kernels",
+    srcs = [
+        "kernels.cc",
+    ],
+    hdrs = [
+        "kernels.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:android": [
+            ":c_api",
+            ":c_api_internal",
+            ":tf_status_helper",
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            ":c_api",
+            ":c_api_internal",
+            ":tf_status_helper",
+            "//tensorflow/core:framework",
+        ],
+    }),
+)
+
 # -----------------------------------------------------------------------------
 # Tests
 
@@ -197,14 +256,18 @@ tf_cuda_cc_test(
     size = "small",
     srcs = ["c_api_test.cc"],
     data = [
-        ":test_op.so",
+        ":test_op1.so",
         "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     ],
+    kernels = [":test_op_kernel"],
     linkopts = select({
         "//tensorflow:darwin": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
-    tags = ["noasan"],
+    tags = [
+        "no_oss",  # http://b/119522529
+        "noasan",
+    ],
     # We must ensure that the dependencies can be dynamically linked since
     # the shared library must be able to use core:framework.
     # linkstatic = tf_kernel_tests_linkstatic(),
@@ -215,6 +278,7 @@ tf_cuda_cc_test(
         "//tensorflow/cc:grad_ops",
         "//tensorflow/cc/saved_model:signature_constants",
         "//tensorflow/cc/saved_model:tag_constants",
+        "//tensorflow/compiler/jit",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
@@ -232,7 +296,7 @@ tf_cuda_cc_test(
 
 tf_cc_test(
     name = "c_api_experimental_test",
-    size = "small",
+    size = "medium",
     srcs = ["c_api_experimental_test.cc"],
     data = ["testdata/tf_record"],
     linkopts = select({
@@ -243,8 +307,11 @@ tf_cc_test(
     # the shared library must be able to use core:framework.
     # linkstatic = tf_kernel_tests_linkstatic(),
     deps = [
+        ":c_api",
         ":c_api_experimental",
         ":c_test_util",
+        "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -281,8 +348,63 @@ tf_cc_test(
 )
 
 tf_custom_op_library(
-    name = "test_op.so",
+    name = "test_op1.so",
+    srcs = ["test_op1.cc"],
+)
+
+tf_kernel_library(
+    name = "test_op_kernel",
     srcs = ["test_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
+tf_cuda_cc_test(
+    name = "env_test",
+    size = "small",
+    srcs = ["env_test.cc"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    tags = ["noasan"],
+    # We must ensure that the dependencies can be dynamically linked since
+    # the shared library must be able to use core:framework.
+    # linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":c_api",
+        ":env",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "kernels_test",
+    size = "small",
+    srcs = ["kernels_test.cc"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    tags = ["noasan"],
+    # We must ensure that the dependencies can be dynamically linked since
+    # the shared library must be able to use core:framework.
+    # linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":c_api",
+        ":kernels",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
 )
 
 # -----------------------------------------------------------------------------
diff --git a/tensorflow/c/README.md b/tensorflow/c/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b386998ceaf3e91daba04125fe83e2f3bdd508e5
--- /dev/null
+++ b/tensorflow/c/README.md
@@ -0,0 +1,7 @@
+# TensorFlow C API
+
+- See [www.tensorflow.org/install/lang_c](https://www.tensorflow.org/install/lang_c)
+- Nightly builds:
+  - [Linux CPU-only](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-cpu-linux-x86_64.tar.gz)
+  - [Linux GPU](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-gpu-linux-x86_64.tar.gz)
+  - [MacOS CPU-only](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-cpu-darwin-x86_64.tar.gz)
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 79811ceae57e0bddeb2a6f32bad7003e14e23422..94d18eb8b04e3534be547aca5cfbb32da40ffbf6 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -136,16 +136,22 @@ const char* TF_Message(const TF_Status* s) {
 namespace {
 class TF_ManagedBuffer : public TensorBuffer {
  public:
-  void* data_;
-  size_t len_;
-  void (*deallocator_)(void* data, size_t len, void* arg);
-  void* deallocator_arg_;
+  TF_ManagedBuffer(void* data, size_t len,
+                   void (*deallocator)(void* data, size_t len, void* arg),
+                   void* deallocator_arg)
+      : TensorBuffer(data),
+        len_(len),
+        deallocator_(deallocator),
+        deallocator_arg_(deallocator_arg) {}
+
+  const size_t len_;
+  void (*const deallocator_)(void* data, size_t len, void* arg);
+  void* const deallocator_arg_;
 
   ~TF_ManagedBuffer() override {
-    (*deallocator_)(data_, len_, deallocator_arg_);
+    (*deallocator_)(data(), len_, deallocator_arg_);
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return len_; }
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
@@ -199,8 +205,7 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
     dimvec[i] = static_cast<tensorflow::int64>(dims[i]);
   }
 
-  TF_ManagedBuffer* buf = new TF_ManagedBuffer;
-  buf->len_ = len;
+  TF_ManagedBuffer* buf = nullptr;
   if (dtype != TF_STRING && dtype != TF_RESOURCE &&
       tensorflow::DataTypeCanUseMemcpy(static_cast<DataType>(dtype)) &&
       reinterpret_cast<intptr_t>(data) % std::max(1, EIGEN_MAX_ALIGN_BYTES) !=
@@ -212,17 +217,15 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
     //
     // Other types have the same representation, so copy only if it is safe to
     // do so.
-    buf->data_ = allocate_tensor("TF_NewTensor", len);
-    std::memcpy(buf->data_, data, len);
-    buf->deallocator_ = deallocate_buffer;
-    buf->deallocator_arg_ = nullptr;
+    buf = new TF_ManagedBuffer(allocate_tensor("TF_NewTensor", len), len,
+                               deallocate_buffer, nullptr);
+    std::memcpy(buf->data(), data, len);
     // Free the original buffer.
     deallocator(data, len, deallocator_arg);
   } else {
-    buf->data_ = data;
-    buf->deallocator_ = deallocator;
-    buf->deallocator_arg_ = deallocator_arg;
+    buf = new TF_ManagedBuffer(data, len, deallocator, deallocator_arg);
   }
+
   TF_Tensor* ret = new TF_Tensor{dtype, TensorShape(dimvec), buf};
   size_t elem_size = TF_DataTypeSize(dtype);
   if (elem_size > 0 && len < (elem_size * ret->shape.num_elements())) {
@@ -477,9 +480,9 @@ static TF_Tensor* EmptyTensor(TF_DataType dtype, const TensorShape& shape) {
   CHECK_EQ(nelems, 0);
   static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
                 "64-bit int types should match in size");
-  return TF_NewTensor(dtype, reinterpret_cast<const int64_t*>(dims.data()),
-                      shape.dims(), reinterpret_cast<void*>(&empty), 0,
-                      [](void*, size_t, void*) {}, nullptr);
+  return TF_NewTensor(
+      dtype, reinterpret_cast<const int64_t*>(dims.data()), shape.dims(),
+      reinterpret_cast<void*>(&empty), 0, [](void*, size_t, void*) {}, nullptr);
 }
 
 // Non-static for testing.
@@ -1592,18 +1595,20 @@ TF_AttrMetadata TF_OperationGetAttrMetadata(TF_Operation* oper,
     break;                                            \
   }
 
-      LIST_CASE(s, TF_ATTR_STRING, metadata.total_size = 0;
-                for (int i = 0; i < attr->list().s_size();
-                     ++i) { metadata.total_size += attr->list().s(i).size(); });
+      LIST_CASE(
+          s, TF_ATTR_STRING, metadata.total_size = 0;
+          for (int i = 0; i < attr->list().s_size();
+               ++i) { metadata.total_size += attr->list().s(i).size(); });
       LIST_CASE(i, TF_ATTR_INT);
       LIST_CASE(f, TF_ATTR_FLOAT);
       LIST_CASE(b, TF_ATTR_BOOL);
       LIST_CASE(type, TF_ATTR_TYPE);
-      LIST_CASE(shape, TF_ATTR_SHAPE, metadata.total_size = 0;
-                for (int i = 0; i < attr->list().shape_size(); ++i) {
-                  const auto& s = attr->list().shape(i);
-                  metadata.total_size += s.unknown_rank() ? 0 : s.dim_size();
-                });
+      LIST_CASE(
+          shape, TF_ATTR_SHAPE, metadata.total_size = 0;
+          for (int i = 0; i < attr->list().shape_size(); ++i) {
+            const auto& s = attr->list().shape(i);
+            metadata.total_size += s.unknown_rank() ? 0 : s.dim_size();
+          });
       LIST_CASE(tensor, TF_ATTR_TENSOR);
       LIST_CASE(tensor, TF_ATTR_FUNC);
 #undef LIST_CASE
@@ -1942,6 +1947,10 @@ void TF_ImportGraphDefOptionsSetPrefix(TF_ImportGraphDefOptions* opts,
                                        const char* prefix) {
   opts->opts.prefix = prefix;
 }
+void TF_ImportGraphDefOptionsSetDefaultDevice(TF_ImportGraphDefOptions* opts,
+                                              const char* device) {
+  opts->opts.default_device = device;
+}
 
 void TF_ImportGraphDefOptionsSetUniquifyNames(TF_ImportGraphDefOptions* opts,
                                               unsigned char uniquify_names) {
@@ -2770,6 +2779,9 @@ TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, const char* name,
   }
   string name_str(name, name_len);
   const auto* api_def = api_def_map->api_def_map.GetApiDef(name_str);
+  if (api_def == nullptr) {
+    return nullptr;
+  }
 
   TF_Buffer* ret = TF_NewBuffer();
   status->status = MessageToBuffer(*api_def, ret);
@@ -2803,4 +2815,71 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) {
   }
   return ret;
 }
+
+// TF_Server functions ----------------------------------------------
+
+#ifndef __ANDROID__
+TF_Server::TF_Server(std::unique_ptr<tensorflow::ServerInterface> server)
+    : target(server->target()), server(std::move(server)) {}
+#endif  // __ANDROID__
+
+TF_Server* TF_NewServer(const void* proto, size_t proto_len,
+                        TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Server functionality is not supported in Android");
+  return nullptr;
+#else
+  tensorflow::ServerDef server_def;
+  if (!server_def.ParseFromArray(proto, static_cast<int>(proto_len))) {
+    status->status = InvalidArgument(
+        "Could not parse provided bytes into a ServerDef protocol buffer");
+    return nullptr;
+  }
+
+  std::unique_ptr<tensorflow::ServerInterface> out_server;
+  status->status = tensorflow::NewServer(server_def, &out_server);
+  if (!status->status.ok()) return nullptr;
+
+  return new TF_Server(std::move(out_server));
+#endif
+}
+
+void TF_ServerStart(TF_Server* server, TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Server functionality is not supported in Android");
+#else
+  status->status = server->server->Start();
+#endif
+}
+
+void TF_ServerStop(TF_Server* server, TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Server functionality is not supported in Android");
+#else
+  status->status = server->server->Stop();
+#endif
+}
+
+void TF_ServerJoin(TF_Server* server, TF_Status* status) {
+#ifdef __ANDROID__
+  status->status = tensorflow::errors::Unimplemented(
+      "Server functionality is not supported in Android");
+#else
+  status->status = server->server->Join();
+#endif
+}
+
+const char* TF_ServerTarget(TF_Server* server) {
+#ifdef __ANDROID__
+  return nullptr;
+#else
+  return server->target.c_str();
+#endif
+}
+
+void TF_DeleteServer(TF_Server* server) { delete server; }
+
 }  // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index 850f6ecd637d768bca99720e0add07680829e17a..c7abba85521fccec07983cd5ab4f94a8368d6181 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -91,7 +91,7 @@ extern "C" {
 // --------------------------------------------------------------------------
 // TF_Version returns a string describing version information of the
 // TensorFlow library. TensorFlow using semantic versioning.
-TF_CAPI_EXPORT extern const char* TF_Version();
+TF_CAPI_EXPORT extern const char* TF_Version(void);
 
 // --------------------------------------------------------------------------
 // TF_DataType holds the type for a scalar value.  E.g., one slot in a tensor.
@@ -157,7 +157,7 @@ typedef enum TF_Code {
 typedef struct TF_Status TF_Status;
 
 // Return a new status object.
-TF_CAPI_EXPORT extern TF_Status* TF_NewStatus();
+TF_CAPI_EXPORT extern TF_Status* TF_NewStatus(void);
 
 // Delete a previously created status object.
 TF_CAPI_EXPORT extern void TF_DeleteStatus(TF_Status*);
@@ -196,7 +196,7 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_NewBufferFromString(const void* proto,
                                                         size_t proto_len);
 
 // Useful for passing *out* a protobuf.
-TF_CAPI_EXPORT extern TF_Buffer* TF_NewBuffer();
+TF_CAPI_EXPORT extern TF_Buffer* TF_NewBuffer(void);
 
 TF_CAPI_EXPORT extern void TF_DeleteBuffer(TF_Buffer*);
 
@@ -305,7 +305,7 @@ TF_CAPI_EXPORT extern size_t TF_StringEncodedSize(size_t len);
 typedef struct TF_SessionOptions TF_SessionOptions;
 
 // Return a new options object.
-TF_CAPI_EXPORT extern TF_SessionOptions* TF_NewSessionOptions();
+TF_CAPI_EXPORT extern TF_SessionOptions* TF_NewSessionOptions(void);
 
 // Set the target in TF_SessionOptions.options.
 // target can be empty, a single entry, or a comma separated list of entries.
@@ -338,7 +338,7 @@ TF_CAPI_EXPORT extern void TF_DeleteSessionOptions(TF_SessionOptions*);
 typedef struct TF_Graph TF_Graph;
 
 // Return a new graph object.
-TF_CAPI_EXPORT extern TF_Graph* TF_NewGraph();
+TF_CAPI_EXPORT extern TF_Graph* TF_NewGraph(void);
 
 // Destroy an options object.  Graph will be deleted once no more
 // TFSession's are referencing it.
@@ -890,7 +890,8 @@ TF_CAPI_EXPORT extern void TF_GraphVersions(TF_Graph* graph,
 // TF_GraphImportGraphDef.
 typedef struct TF_ImportGraphDefOptions TF_ImportGraphDefOptions;
 
-TF_CAPI_EXPORT extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions();
+TF_CAPI_EXPORT extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions(
+    void);
 TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions(
     TF_ImportGraphDefOptions* opts);
 
@@ -900,6 +901,12 @@ TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions(
 TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetPrefix(
     TF_ImportGraphDefOptions* opts, const char* prefix);
 
+// Set the execution device for nodes in `graph_def`.
+// Only applies to nodes where a device was not already explicitly specified.
+// `device` is copied and has no lifetime requirements.
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetDefaultDevice(
+    TF_ImportGraphDefOptions* opts, const char* device);
+
 // Set whether to uniquify imported operation names. If true, imported operation
 // names will be modified if their name already exists in the graph. If false,
 // conflicting names will be treated as an error. Note that this option has no
@@ -1605,7 +1612,7 @@ TF_CAPI_EXPORT extern void TF_DeleteLibraryHandle(TF_Library* lib_handle);
 //
 // The data in the buffer will be the serialized OpList proto for ops registered
 // in this address space.
-TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList();
+TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList(void);
 
 // TF_ApiDefMap encapsulates a collection of API definitions for an operation.
 //
@@ -1662,6 +1669,47 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status);
 TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp(
     const char* name, TF_Status* status);
 
+// --------------------------------------------------------------------------
+// In-process TensorFlow server functionality, for use in distributed training.
+// A Server instance encapsulates a set of devices and a Session target that
+// can participate in distributed training. A server belongs to a cluster
+// (specified by a ClusterSpec), and corresponds to a particular task in a
+// named job. The server can communicate with any other server in the same
+// cluster.
+
+// In-process TensorFlow server.
+typedef struct TF_Server TF_Server;
+
+// Creates a new in-process TensorFlow server configured using a serialized
+// ServerDef protocol buffer provided via `proto` and `proto_len`.
+//
+// The server will not serve any requests until TF_ServerStart is invoked.
+// The server will stop serving requests once TF_ServerStop or
+// TF_DeleteServer is invoked.
+TF_CAPI_EXPORT extern TF_Server* TF_NewServer(const void* proto,
+                                              size_t proto_len,
+                                              TF_Status* status);
+
+// Starts an in-process TensorFlow server.
+TF_CAPI_EXPORT extern void TF_ServerStart(TF_Server* server, TF_Status* status);
+
+// Stops an in-process TensorFlow server.
+TF_CAPI_EXPORT extern void TF_ServerStop(TF_Server* server, TF_Status* status);
+
+// Blocks until the server has been successfully stopped (via TF_ServerStop or
+// TF_ServerClose).
+TF_CAPI_EXPORT extern void TF_ServerJoin(TF_Server* server, TF_Status* status);
+
+// Returns the target string that can be provided to TF_SetTarget() to connect
+// a TF_Session to `server`.
+//
+// The returned string is valid only until TF_DeleteServer is invoked.
+TF_CAPI_EXPORT extern const char* TF_ServerTarget(TF_Server* server);
+
+// Destroy an in-process TensorFlow server, frees memory. If server is running
+// it will be stopped and joined.
+TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index d4b78138e93624a7e41e917f8210281b500661bc..38e29aa74a90f4e85d1369b6928a5a58c531b2da 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -15,12 +15,18 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_experimental.h"
 
+#include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/net.h"
 #include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
@@ -50,8 +56,8 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
     // These XLA flags are needed to trigger XLA properly from C (more generally
     // non-Python) clients. If this API is called again with `enable` set to
     // false, it is safe to keep these flag values as is.
-    tensorflow::legacy_flags::MarkForCompilationPassFlags* flags =
-        tensorflow::legacy_flags::GetMarkForCompilationPassFlags();
+    tensorflow::MarkForCompilationPassFlags* flags =
+        tensorflow::GetMarkForCompilationPassFlags();
     flags->tf_xla_cpu_global_jit = true;
     flags->tf_xla_min_cluster_size = 1;
   } else {
@@ -70,8 +76,8 @@ TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation,
     // These XLA flags are needed to trigger XLA properly from C (more generally
     // non-Python) clients. If this API is called again with `enable` set to
     // false, it is safe to keep these flag values as is.
-    tensorflow::legacy_flags::MarkForCompilationPassFlags* flags =
-        tensorflow::legacy_flags::GetMarkForCompilationPassFlags();
+    tensorflow::MarkForCompilationPassFlags* flags =
+        tensorflow::GetMarkForCompilationPassFlags();
     flags->tf_xla_cpu_global_jit = true;
     flags->tf_xla_min_cluster_size = 1;
   } else {
@@ -6524,7 +6530,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/cycle_length"
+      name: "ExperimentalParallelInterleaveDataset/cycle_length"
       op: "Const"
       attr {
         key: "dtype"
@@ -6545,7 +6551,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/block_length"
+      name: "ExperimentalParallelInterleaveDataset/block_length"
       op: "Const"
       attr {
         key: "dtype"
@@ -6566,7 +6572,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/sloppy"
+      name: "ExperimentalParallelInterleaveDataset/sloppy"
       op: "Const"
       attr {
         key: "dtype"
@@ -6587,7 +6593,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/buffer_output_elements"
+      name: "ExperimentalParallelInterleaveDataset/buffer_output_elements"
       op: "Const"
       attr {
         key: "dtype"
@@ -6608,7 +6614,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/prefetch_input_elements"
+      name: "ExperimentalParallelInterleaveDataset/prefetch_input_elements"
       op: "Const"
       attr {
         key: "dtype"
@@ -6629,14 +6635,14 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset"
-      op: "ParallelInterleaveDataset"
+      name: "ExperimentalParallelInterleaveDataset"
+      op: "ExperimentalParallelInterleaveDataset"
       input: "RepeatDataset:handle:0"
-      input: "ParallelInterleaveDataset/cycle_length:output:0"
-      input: "ParallelInterleaveDataset/block_length:output:0"
-      input: "ParallelInterleaveDataset/sloppy:output:0"
-      input: "ParallelInterleaveDataset/buffer_output_elements:output:0"
-      input: "ParallelInterleaveDataset/prefetch_input_elements:output:0"
+      input: "ExperimentalParallelInterleaveDataset/cycle_length:output:0"
+      input: "ExperimentalParallelInterleaveDataset/block_length:output:0"
+      input: "ExperimentalParallelInterleaveDataset/sloppy:output:0"
+      input: "ExperimentalParallelInterleaveDataset/buffer_output_elements:output:0"
+      input: "ExperimentalParallelInterleaveDataset/prefetch_input_elements:output:0"
       attr {
         key: "Targuments"
         value {
@@ -6736,7 +6742,7 @@ library {
     node_def {
       name: "ShuffleDataset_2"
       op: "ShuffleDataset"
-      input: "ParallelInterleaveDataset:handle:0"
+      input: "ExperimentalParallelInterleaveDataset:handle:0"
       input: "ShuffleDataset_2/buffer_size_1:output:0"
       input: "ShuffleDataset_2/seed_2:output:0"
       input: "ShuffleDataset_2/seed2_2:output:0"
@@ -8738,7 +8744,145 @@ void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   TF_DeleteStatus(status);
 }
 
-TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status,
-                                                      const char* errMsg) {
+struct TFE_ExecuteOpNotification {
+  TFE_ExecuteOpNotification() : status(TF_NewStatus(), TF_DeleteStatus) {}
+  tensorflow::Notification n;
+  std::unique_ptr<tensorflow::Thread> thread;
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status;
+};
+
+TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread(TFE_Op* op,
+                                                    TFE_TensorHandle** retvals,
+                                                    int* num_retvals,
+                                                    TF_Status* status) {
+  TFE_ExecuteOpNotification* n = new TFE_ExecuteOpNotification;
+
+  n->thread.reset(op->operation.EagerContext()->TFEnv()->StartThread(
+      tensorflow::ThreadOptions(), "ExecuteOpThread",
+      [op, retvals, num_retvals, n]() {
+        TFE_Execute(op, retvals, num_retvals, n->status.get());
+        n->n.Notify();
+      }));
+
+  return n;
+}
+
+void TFE_ExecuteOpNotificationWaitAndDelete(
+    TFE_ExecuteOpNotification* notification, TF_Status* status) {
+  if (notification == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Passed in notification is a nullptr.");
+
+    return;
+  }
+  if (notification->thread == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Passed in notification didn't start a thread correctly. Cleaning up "
+        "this notification. Please re-execute the operation to get a new "
+        "notification.");
+
+    delete notification;
+    return;
+  }
+
+  notification->n.WaitForNotification();
+
+  status->status = notification->status->status;
+
+  delete notification;
+}
+
+void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg) {
   status->status = tensorflow::errors::Internal(errMsg);
 }
+
+// This builder is used in the eager API to build a NodeDef.
+struct TF_AttrBuilder : public tensorflow::AttrBuilder {
+  using tensorflow::AttrBuilder::AttrBuilder;
+  // The string buffers to make sure that any `attr_name` we pass into
+  // `builder->Set()` will outlive the subsequent
+  // `TF_AttrBuilderCheckCanRunOnDevice()` call(s) on the same `builder`.
+  std::set<std::string> attr_names;
+};
+
+TF_AttrBuilder* TF_NewAttrBuilder(const char* op_name) {
+  return new TF_AttrBuilder(op_name);
+}
+
+void TF_DeleteAttrBuilder(TF_AttrBuilder* builder) { delete builder; }
+
+void TF_AttrBuilderSetType(TF_AttrBuilder* builder, const char* attr_name,
+                           TF_DataType value) {
+  auto iter = builder->attr_names.insert(attr_name).first;
+  builder->Set((*iter).c_str(), static_cast<tensorflow::DataType>(value));
+}
+
+void TF_AttrBuilderSetTypeList(TF_AttrBuilder* builder, const char* attr_name,
+                               const TF_DataType* values, int num_values) {
+  auto iter = builder->attr_names.insert(attr_name).first;
+  builder->Set(
+      (*iter).c_str(),
+      tensorflow::gtl::ArraySlice<const tensorflow::DataType>(
+          reinterpret_cast<const tensorflow::DataType*>(values), num_values));
+}
+
+void TF_AttrBuilderCheckCanRunOnDevice(TF_AttrBuilder* builder,
+                                       const char* device_type,
+                                       TF_Status* status) {
+  status->status = tensorflow::FindKernelDef(
+      tensorflow::DeviceType(device_type), builder->BuildNodeDef(),
+      /* def = */ nullptr, /* kernel_class_name = */ nullptr);
+}
+
+const char* TF_GetNumberAttrForOpListInput(const char* op_name, int input_index,
+                                           TF_Status* status) {
+  const tensorflow::OpDef* op_def = nullptr;
+  status->status =
+      tensorflow::OpRegistry::Global()->LookUpOpDef(op_name, &op_def);
+  if (!status->status.ok()) return nullptr;
+
+  if (input_index >= op_def->input_arg_size() || input_index < 0) {
+    status->status = tensorflow::errors::InvalidArgument(
+        input_index, " out of range for ", op_name);
+    return nullptr;
+  }
+
+  const tensorflow::OpDef_ArgDef& input_arg = op_def->input_arg()[input_index];
+
+  if (input_arg.number_attr().empty()) {
+    status->status = tensorflow::errors::NotFound(
+        op_name, " does not have number_attr() defined.");
+    return nullptr;
+  }
+
+  // The returned string is owned by OpRegistry, so liveness is not a concern.
+  return input_arg.number_attr().c_str();
+}
+
+int TF_OpIsStateful(const char* op_type, TF_Status* status) {
+  const tensorflow::OpRegistrationData* op_reg_data;
+  status->status =
+      tensorflow::OpRegistry::Global()->LookUp(op_type, &op_reg_data);
+  if (!status->status.ok()) {
+    return 0;
+  }
+  return op_reg_data->op_def.is_stateful();
+}
+
+void TF_InitMain(const char* usage, int* argc, char*** argv) {
+  tensorflow::port::InitMain(usage, argc, argv);
+}
+
+int TF_PickUnusedPortOrDie() {
+  return tensorflow::internal::PickUnusedPortOrDie();
+}
+
+TFE_TensorHandle* TFE_NewTensorHandleFromScalar(TF_DataType dtype_arg,
+                                                void* data, size_t len) {
+  auto dtype = static_cast<tensorflow::DataType>(dtype_arg);
+  DCHECK(tensorflow::DataTypeCanUseMemcpy(dtype));
+
+  tensorflow::Tensor tensor(dtype, tensorflow::TensorShape({}));
+  std::memcpy(tensorflow::TensorCApi::Buffer(tensor)->data(), data, len);
+  return new TFE_TensorHandle(tensor, nullptr, nullptr);
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index d98d532e32e891e21f5b7ba360c74c3256fb1947..3e3a485eb763b871b0551414c4ef04746b2ed9a3 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -180,9 +180,72 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor(
 TF_CAPI_EXPORT extern void TFE_TensorHandlePrintDebugString(
     TFE_TensorHandle* handle);
 
+typedef struct TFE_ExecuteOpNotification TFE_ExecuteOpNotification;
+
+// Allows invoking a kernel asynchronously, and explicitly returns a
+// notification that can be waited upon. This always executes the kernel in a
+// new thread.
+// 1. `retvals` and `num_retvals` can only be consumed after
+// `TFE_ExecuteOp` returns successfully. They shouldn't be used
+// if the return is unsuccessful
+// 2. These new APIs cannot be used together with the TFE context level async
+// support.
+TF_CAPI_EXPORT extern TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread(
+    TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
+    TF_Status* status);
+
+// Waits to complete the op execution, and cleans up the notification.
+// Errors reported by op execution are set in `status`.
+TF_CAPI_EXPORT extern void TFE_ExecuteOpNotificationWaitAndDelete(
+    TFE_ExecuteOpNotification* notification, TF_Status* status);
+
 TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status,
                                                       const char* errMsg);
 
+// TF_NewAttrBuilder() returns an object that you can set attributes on as
+// though it were an op. This allows querying properties of that op for
+// type-checking purposes like if the op will run on a particular device type.
+typedef struct TF_AttrBuilder TF_AttrBuilder;
+TF_CAPI_EXPORT extern TF_AttrBuilder* TF_NewAttrBuilder(const char* op_name);
+TF_CAPI_EXPORT extern void TF_DeleteAttrBuilder(TF_AttrBuilder* builder);
+TF_CAPI_EXPORT extern void TF_AttrBuilderSetType(TF_AttrBuilder* builder,
+                                                 const char* attr_name,
+                                                 TF_DataType value);
+TF_CAPI_EXPORT extern void TF_AttrBuilderSetTypeList(TF_AttrBuilder* builder,
+                                                     const char* attr_name,
+                                                     const TF_DataType* values,
+                                                     int num_values);
+
+// Checks the tensorflow::NodeDef built via the methods above to see if it can
+// run on device_type.
+TF_CAPI_EXPORT extern void TF_AttrBuilderCheckCanRunOnDevice(
+    TF_AttrBuilder* builder, const char* device_type, TF_Status* status);
+
+// For argument number input_index, fetch the corresponding number_attr that
+// needs to be updated with the argument length of the input list.
+// Returns nullptr if there is any problem like op_name is not found, or the
+// argument does not support this attribute type.
+TF_CAPI_EXPORT extern const char* TF_GetNumberAttrForOpListInput(
+    const char* op_name, int input_index, TF_Status* status);
+
+// Returns 1 if the op is stateful, 0 otherwise. The return value is undefined
+// if the status is not ok.
+TF_CAPI_EXPORT extern int TF_OpIsStateful(const char* op_type,
+                                          TF_Status* status);
+
+// Platform specific initialization routine. Very few platforms actually require
+// this to be called.
+TF_CAPI_EXPORT void TF_InitMain(const char* usage, int* argc, char*** argv);
+
+// Platform-specific implementation to return an unused port. (This should used
+// in tests only.)
+TF_CAPI_EXPORT int TF_PickUnusedPortOrDie(void);
+
+// Fast path method that makes constructing a single scalar tensor require less
+// overhead and copies.
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromScalar(
+    TF_DataType dtype, void* scalar, size_t len);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
index c6effd39697e0397278770b53e98508074f99862..daa7701b7fe7e8ce757b6504329cf6434ad39778 100644
--- a/tensorflow/c/c_api_experimental_test.cc
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_experimental.h"
 #include "tensorflow/c/c_test_util.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -162,5 +164,137 @@ protocol: "grpc"
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI_EXPERIMENTAL, IsStateful) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  int assign = TF_OpIsStateful("AssignAddVariableOp", status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  EXPECT_EQ(assign, 1);
+  int id = TF_OpIsStateful("Identity", status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  EXPECT_EQ(id, 0);
+}
+
+TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Simple) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+
+  TFE_Op* matmul_op = MatMulOp(ctx, m, m);
+
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+
+  auto* r =
+      TFE_ExecuteOpInNewThread(matmul_op, &retvals[0], &num_retvals, status);
+
+  TFE_ExecuteOpNotificationWaitAndDelete(r, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+
+  TFE_DeleteOp(matmul_op);
+  TFE_DeleteTensorHandle(m);
+
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx);
+  TF_DeleteStatus(status);
+}
+
+// Perform a send/recv test. Recv blocks, so they need to be executed
+// asynchronously.
+TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Blocking) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  // Returns a 2x2 float32 Tensor on the CPU, with data 1., 2., 3., 4.
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+
+  // Build a send op.
+  TFE_Op* send_op = TFE_NewOp(ctx, "_Send", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(send_op, m, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  string tensor_name = "Tensor";
+  TFE_OpSetAttrType(send_op, "T", TF_FLOAT);
+  TFE_OpSetAttrString(send_op, "tensor_name", tensor_name.c_str(),
+                      tensor_name.size());
+  string send_device = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_OpSetAttrString(send_op, "send_device", send_device.c_str(),
+                      send_device.size());
+  TFE_OpSetAttrInt(send_op, "send_device_incarnation", 1234);
+  string recv_device = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_OpSetAttrString(send_op, "recv_device", recv_device.c_str(),
+                      recv_device.size());
+  TFE_OpSetAttrBool(send_op, "client_terminated", true);
+
+  // Build a recv op.
+  TFE_Op* recv_op = TFE_NewOp(ctx, "_Recv", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_OpSetAttrType(recv_op, "tensor_type", TF_FLOAT);
+  TFE_OpSetAttrString(recv_op, "tensor_name", tensor_name.c_str(),
+                      tensor_name.size());
+  TFE_OpSetAttrString(recv_op, "send_device", send_device.c_str(),
+                      send_device.size());
+  TFE_OpSetAttrInt(recv_op, "send_device_incarnation", 1234);
+  TFE_OpSetAttrString(recv_op, "recv_device", recv_device.c_str(),
+                      recv_device.size());
+  TFE_OpSetAttrBool(recv_op, "client_terminated", true);
+
+  TFE_TensorHandle* send_retvals;
+  int send_num_retvals = 0;
+  auto* send_result = TFE_ExecuteOpInNewThread(send_op, &send_retvals,
+                                               &send_num_retvals, status);
+
+  TFE_TensorHandle* recv_retvals[1] = {nullptr};
+  int recv_num_retvals = 1;
+  auto* recv_result = TFE_ExecuteOpInNewThread(recv_op, &recv_retvals[0],
+                                               &recv_num_retvals, status);
+
+  TFE_ExecuteOpNotificationWaitAndDelete(send_result, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_ExecuteOpNotificationWaitAndDelete(recv_result, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(recv_retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(1, product[0]);
+  EXPECT_EQ(2, product[1]);
+  EXPECT_EQ(3, product[2]);
+  EXPECT_EQ(4, product[3]);
+
+  TFE_DeleteOp(send_op);
+  TFE_DeleteOp(recv_op);
+  TFE_DeleteTensorHandle(m);
+
+  TFE_DeleteTensorHandle(recv_retvals[0]);
+  TFE_DeleteContext(ctx);
+  TF_DeleteStatus(status);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index f68f8a3e90a971b5e4a024feaf26ba498afc48da..28b9f8df9c873ee394eb6a241dd9ac06ba6c8796 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -392,26 +392,26 @@ Status ProcessInputs(
     EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
   input_tensors->reserve(ninputs);
   for (int i = 0; i < ninputs; ++i) {
-    const Node& node = inputs[i].oper->node;
+    Node* node = &inputs[i].oper->node;
     int idx = inputs[i].index;
 
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        fn_body->graph.IsValidOutputTensor(&node, idx),
+        fn_body->graph.IsValidOutputTensor(node, idx),
         "Encountered while processing input ", i, " into function '", fn_name,
         "'");
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(&node, idx),
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(node, idx),
                                     "Encountered while processing input ", i,
                                     " into function '", fn_name, "'");
 
-    input_tensors->emplace_back(&node, idx);
+    input_tensors->emplace_back(node, idx);
 
-    const auto& iter = input_nodes->find(&node);
+    const auto& iter = input_nodes->find(node);
     if (iter == input_nodes->end()) {
-      input_nodes->insert({&node, {idx}});
+      input_nodes->insert({node, {idx}});
     } else {
       auto& indices = iter->second;
       if (std::find(indices.begin(), indices.end(), idx) != indices.end()) {
-        return InvalidArgument("TF_Output ", node.name(), ":", idx,
+        return InvalidArgument("TF_Output ", node->name(), ":", idx,
                                " appears more than once in the input list");
       }
       indices.push_back(idx);
@@ -428,16 +428,16 @@ Status ProcessOutputs(const TF_Graph* fn_body, const char* fn_name,
     EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
   output_tensors->reserve(noutputs);
   for (int i = 0; i < noutputs; ++i) {
-    const Node& node = outputs[i].oper->node;
+    Node* node = &outputs[i].oper->node;
     int idx = outputs[i].index;
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        fn_body->graph.IsValidOutputTensor(&node, idx),
+        fn_body->graph.IsValidOutputTensor(node, idx),
         "Encountered while processing output ", i, " from function '", fn_name,
         "'");
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(&node, idx),
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(node, idx),
                                     "Encountered while creating function '",
                                     fn_name, "'");
-    output_tensors->emplace_back(&node, idx);
+    output_tensors->emplace_back(node, idx);
   }
   return Status::OK();
 }
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 95652a11378d6276b5ba6540a07baa15aa77cc1c..5ba26d3c585350aa510f9970cbfc246a9a108543 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <vector>
 
 #ifndef __ANDROID__
+#include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
 #endif
 #include "tensorflow/core/common_runtime/shape_refiner.h"
@@ -179,6 +180,15 @@ struct TF_ApiDefMap {
   tensorflow::mutex lock;
 };
 
+#ifndef __ANDROID__
+struct TF_Server {
+  TF_Server(std::unique_ptr<tensorflow::ServerInterface> server);
+
+  const tensorflow::string target;
+  std::unique_ptr<tensorflow::ServerInterface> server;
+};
+#endif
+
 namespace tensorflow {
 
 class TensorCApi {
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 03516c39dc970aa23967107d3a0446da94669465..d5934a10395ae094f65d3bc8b6cd7b94dbd32410 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -186,23 +187,40 @@ TEST(CAPI, LibraryLoadFunctions) {
   // tf_cuda_cc_test() bazel rule and remove the next line.
   if (!GPUDeviceName().empty()) return;
 
-  // Load the library.
-  TF_Status* status = TF_NewStatus();
-  TF_Library* lib =
-      TF_LoadLibrary("tensorflow/c/test_op.so", status);
-  TF_Code code = TF_GetCode(status);
-  string status_msg(TF_Message(status));
-  TF_DeleteStatus(status);
-  ASSERT_EQ(TF_OK, code) << status_msg;
-
-  // Test op list.
-  TF_Buffer op_list_buf = TF_GetOpList(lib);
-  tensorflow::OpList op_list;
-  EXPECT_TRUE(op_list.ParseFromArray(op_list_buf.data, op_list_buf.length));
-  ASSERT_EQ(op_list.op_size(), 1);
-  EXPECT_EQ("TestCApi", op_list.op(0).name());
-
-  TF_DeleteLibraryHandle(lib);
+#if !defined(TENSORFLOW_NO_SHARED_OBJECTS)
+  {
+    // Load the library.
+    TF_Status* status = TF_NewStatus();
+    TF_Library* lib =
+        TF_LoadLibrary("tensorflow/c/test_op1.so", status);
+    TF_Code code = TF_GetCode(status);
+    string status_msg(TF_Message(status));
+    TF_DeleteStatus(status);
+    ASSERT_EQ(TF_OK, code) << status_msg;
+
+    // Test op list.
+    TF_Buffer op_list_buf = TF_GetOpList(lib);
+    tensorflow::OpList op_list;
+    EXPECT_TRUE(op_list.ParseFromArray(op_list_buf.data, op_list_buf.length));
+    ASSERT_EQ(op_list.op_size(), 1);
+    EXPECT_EQ("TestCApi1", op_list.op(0).name());
+    TF_DeleteLibraryHandle(lib);
+  }
+#endif  // !defined(TENSORFLOW_NO_SHARED_OBJECTS)
+  {
+    TF_Buffer* op_list_buffer = TF_GetAllOpList();
+    tensorflow::OpList op_list;
+    op_list.ParseFromArray(op_list_buffer->data, op_list_buffer->length);
+    ASSERT_GE(op_list.op_size(), 1);
+    typedef tensorflow::protobuf::RepeatedPtrField<tensorflow::OpDef> OpDefs;
+    const OpDefs& ops = op_list.op();
+    bool found = std::find_if(ops.begin(), ops.end(),
+                              [](const tensorflow::OpDef& op_def) {
+                                return op_def.name() == "TestCApi";
+                              }) != ops.end();
+    EXPECT_TRUE(found);
+    TF_DeleteBuffer(op_list_buffer);
+  }
 }
 
 void TestEncodeDecode(int line, const std::vector<string>& data) {
@@ -2329,15 +2347,9 @@ TEST(TestApiDef, TestCreateApiDef) {
   // tf_cuda_cc_test() bazel rule and remove the next line.
   if (!GPUDeviceName().empty()) return;
 
+  TF_Buffer* op_list_buf = TF_GetAllOpList();
   TF_Status* status = TF_NewStatus();
-  TF_Library* lib =
-      TF_LoadLibrary("tensorflow/c/test_op.so", status);
-  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-  TF_DeleteStatus(status);
-
-  TF_Buffer op_list_buf = TF_GetOpList(lib);
-  status = TF_NewStatus();
-  auto* api_def_map = TF_NewApiDefMap(&op_list_buf, status);
+  auto* api_def_map = TF_NewApiDefMap(op_list_buf, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 
@@ -2355,7 +2367,7 @@ TEST(TestApiDef, TestCreateApiDef) {
 
   TF_DeleteBuffer(api_def_buf);
   TF_DeleteApiDefMap(api_def_map);
-  TF_DeleteLibraryHandle(lib);
+  TF_DeleteBuffer(op_list_buf);
 }
 
 TEST(TestApiDef, TestCreateApiDefWithOverwrites) {
@@ -2363,15 +2375,9 @@ TEST(TestApiDef, TestCreateApiDefWithOverwrites) {
   // tf_cuda_cc_test() bazel rule and remove the next line.
   if (!GPUDeviceName().empty()) return;
 
+  TF_Buffer* op_list_buf = TF_GetAllOpList();
   TF_Status* status = TF_NewStatus();
-  TF_Library* lib =
-      TF_LoadLibrary("tensorflow/c/test_op.so", status);
-  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-  TF_DeleteStatus(status);
-
-  TF_Buffer op_list_buf = TF_GetOpList(lib);
-  status = TF_NewStatus();
-  auto* api_def_map = TF_NewApiDefMap(&op_list_buf, status);
+  auto* api_def_map = TF_NewApiDefMap(op_list_buf, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 
@@ -2400,7 +2406,7 @@ TEST(TestApiDef, TestCreateApiDefWithOverwrites) {
 
   TF_DeleteBuffer(api_def_buf);
   TF_DeleteApiDefMap(api_def_map);
-  TF_DeleteLibraryHandle(lib);
+  TF_DeleteBuffer(op_list_buf);
 }
 
 class DummyKernel : public tensorflow::OpKernel {
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 3ee31a6a7ac641bbd3fc4c05568b61e433a1d523..c34a84fcfee9b6ba9a7be86ae16e2856a2d343c7 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -50,6 +50,7 @@ tf_cuda_library(
         ],
         "//conditions:default": [],
     }) + [
+        "@com_google_absl//absl/memory",
         "//tensorflow/core/common_runtime/eager:eager_operation",
         "//tensorflow/core/distributed_runtime/eager:eager_client",
         "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
@@ -69,7 +70,7 @@ tf_cuda_library(
     name = "c_api_internal",
     hdrs = ["c_api_internal.h"],
     visibility = [
-        "//learning/deepmind/courier:__pkg__",
+        "//learning/deepmind/courier:__subpackages__",
         "//tensorflow:internal",
     ],
     deps = [
@@ -143,6 +144,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 3554ec0bf3202b54bfc38d67e51b89df19832302..027d752f420238da867cb9d8c116640e1730caaa 100755
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -21,9 +21,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/core/platform/host_info.h"
 #ifdef TENSORFLOW_EAGER_USE_XLA
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #endif  // TENSORFLOW_EAGER_USE_XLA
@@ -79,7 +81,7 @@ tensorflow::Status GetAllRemoteDevices(
     const std::vector<string>& remote_workers,
     tensorflow::WorkerCacheInterface* worker_cache,
     std::unique_ptr<tensorflow::DeviceMgr>* device_mgr) {
-  std::vector<tensorflow::Device*> remote_devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> remote_devices;
   tensorflow::Status status;
   // TODO(nareshmodi) do this in parallel instead of serially.
   for (const string& remote_worker : remote_workers) {
@@ -92,7 +94,7 @@ tensorflow::Status GetAllRemoteDevices(
           status = s;
           if (s.ok()) {
             for (tensorflow::Device* d : *devices) {
-              remote_devices.push_back(d);
+              remote_devices.emplace_back(d);
             }
           }
           n.Notify();
@@ -100,7 +102,7 @@ tensorflow::Status GetAllRemoteDevices(
     n.WaitForNotification();
   }
   std::unique_ptr<tensorflow::DeviceMgr> remote_device_mgr(
-      new tensorflow::DeviceMgr(remote_devices));
+      new tensorflow::DeviceMgr(std::move(remote_devices)));
 
   TF_RETURN_IF_ERROR(status);
 
@@ -261,13 +263,13 @@ TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx,
 void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; }
 
 TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   status->status = tensorflow::DeviceFactory::AddDevices(
       opts->session_options.options, "/job:localhost/replica:0/task:0",
       &devices);
   if (!status->status.ok()) return nullptr;
   std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
-      new tensorflow::DeviceMgr(devices));
+      new tensorflow::DeviceMgr(std::move(devices)));
 
   tensorflow::Rendezvous* r =
       new tensorflow::IntraProcessRendezvous(device_mgr.get());
@@ -404,8 +406,19 @@ const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) {
         "The passed in handle is a nullptr");
     return nullptr;
   }
-  tensorflow::Device* d = nullptr;
-  status->status = h->handle->OpDevice(&d);
+  tensorflow::Device* d = h->handle->op_device();
+  return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0"
+                        : d->name().c_str();
+}
+
+const char* TFE_TensorHandleBackingDeviceName(TFE_TensorHandle* h,
+                                              TF_Status* status) {
+  if (h == nullptr || h->handle == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "The passed in handle is a nullptr");
+    return nullptr;
+  }
+  tensorflow::Device* d = h->handle->device();
   return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0"
                         : d->name().c_str();
 }
@@ -459,13 +472,20 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
                   TF_Status* status) {
   const char* name = op_or_function_name;  // Shorthand
   const tensorflow::AttrTypeMap* types;
-  status->status = tensorflow::AttrTypeMapForOp(name, &types);
-  if (status->status.ok()) return new TFE_Op(ctx, name, types);
-  if (TF_GetCode(status) == TF_NOT_FOUND) {
-    if (ctx->context.FindFunctionByName(name)) {
-      status->status = tensorflow::Status::OK();
-      return new TFE_Op(ctx, name, nullptr);
+  bool is_function = false;
+  status->status = tensorflow::AttrTypeMapForOp(name, &types, &is_function);
+  if (status->status.ok()) {
+    if (is_function && !ctx->context.FindFunctionByName(name)) {
+      status->status = tensorflow::errors::NotFound(
+          "'", name,
+          "' is neither a type of a primitive operation nor a name "
+          "of a function registered in binary running on ",
+          tensorflow::port::Hostname(),
+          ". Make sure the operation or function is "
+          "registered in the binary running in this process.");
+      return nullptr;
     }
+    return new TFE_Op(ctx, name, is_function, types);
   }
   return nullptr;
 }
@@ -498,12 +518,6 @@ void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
 TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
                               unsigned char* is_list, TF_Status* status) {
   TF_AttrType ret;
-  if (op->operation.is_function()) {
-    status->status = tensorflow::errors::Unimplemented(
-        "TODO(apassos): Support for attributes for TensorFlow functions is not "
-        "ready yet.");
-    return TF_ATTR_INT;  // The compiler requires that we return something.
-  }
   status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(),
                                               attr_name, &ret, is_list);
   return ret;
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index b2454d872207e26feb3764671474a5d87c01f84d..f80ae5a6d02d4d613c95cf8486e0fc0aeed3affc 100755
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -48,7 +48,7 @@ extern "C" {
 typedef struct TFE_ContextOptions TFE_ContextOptions;
 
 // Return a new options object.
-TF_CAPI_EXPORT extern TFE_ContextOptions* TFE_NewContextOptions();
+TF_CAPI_EXPORT extern TFE_ContextOptions* TFE_NewContextOptions(void);
 
 // Set the config in TF_ContextOptions.options.
 // config should be a serialized tensorflow.ConfigProto proto.
@@ -169,10 +169,33 @@ TF_CAPI_EXPORT extern int64_t TFE_TensorHandleNumElements(TFE_TensorHandle* h,
 TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
                                                   int dim_index,
                                                   TF_Status* status);
+
+// Returns the device of the operation that produced `h`.
+// If `h` was produced by a copy, returns the destination device of
+// the copy. Note that returned device name is not always the device
+// holding the tensor handle's memory. If you want the latter, use
+// TFE_TensorHandleBackingDeviceName.
+// This function will block till the operation that produces `h` has completed.
+//
+// Device on which the kernel of the operation that produced `h` ran.
+//
+// If `h` was produced by a copy, returns the destination device of
+// the copy.
+//
+// Note that returned device name is not always the device that owns the memory
+// that backs the tensor handle. For the latter see
+// TFE_TensorHandleBackingDeviceName.
+//
 // This function will block till the operation that produces `h` has completed.
 TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
     TFE_TensorHandle* h, TF_Status* status);
 
+// Returns the name of the device in whose memory `h` resides.
+//
+// This function will block till the operation that produces `h` has completed.
+TF_CAPI_EXPORT extern const char* TFE_TensorHandleBackingDeviceName(
+    TFE_TensorHandle* h, TF_Status* status);
+
 // Return a pointer to a new TFE_TensorHandle that shares the underlying tensor
 // with `h`. On success, `status` is set to OK. On failure, `status` reflects
 // the error and a nullptr is returned.
diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc
index 5006b76f1981d068e99a2c081115ebb3a66d8c7f..52b0824552855860dfb138f3ac9a5d3afa7dc965 100644
--- a/tensorflow/c/eager/c_api_debug.cc
+++ b/tensorflow/c/eager/c_api_debug.cc
@@ -57,13 +57,9 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
     return nullptr;
   }
 
-  tensorflow::Device* device;
-  status->status = handle->handle->Device(&device);
-  if (!status->status.ok()) {
-    return nullptr;
-  }
-
 #ifdef TENSORFLOW_EAGER_USE_XLA
+  tensorflow::Device* device = handle->handle->device();
+
   // If tensor resides on an XLA device, use XLA device's PaddedShapeFn.
   tensorflow::XlaDevice* xla_device =
       dynamic_cast<tensorflow::XlaDevice*>(device);
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 104d52430cf7aa14d4d2a335a1b96e667f21ce87..67bc1bcd24605f8363d6a7c8d5d6a0836a42fc82 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -79,10 +79,6 @@ struct TFE_TensorHandle {
                    tensorflow::Device* op_device)
       : handle(new tensorflow::TensorHandle(t, d, op_device, nullptr)) {}
 
-  TFE_TensorHandle(tensorflow::uint64 node_id, tensorflow::DataType dtype,
-                   tensorflow::EagerContext* ctx)
-      : handle(new tensorflow::TensorHandle(node_id, dtype, ctx)) {}
-
   TFE_TensorHandle(tensorflow::TensorHandle* handle) : handle(handle) {}
 
   tensorflow::TensorHandle* handle;
@@ -97,10 +93,9 @@ struct TFE_TensorDebugInfo {
 };
 
 struct TFE_Op {
-  // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a
-  // primitive operation.
-  TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t)
-      : operation(&ctx->context, op, t) {}
+  TFE_Op(TFE_Context* ctx, const char* op, bool is_function,
+         const tensorflow::AttrTypeMap* t)
+      : operation(&ctx->context, op, is_function, t) {}
 
   tensorflow::EagerOperation operation;
 };
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 55331022b9dbd0696928fa44430f340f371432ac..6b39b79ee82f9c7baaf856e573a42b7da65691e5 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/eager/c_api.h"
 
 #include <string.h>
+#include "absl/strings/match.h"
 #include "tensorflow/c/eager/c_api_test_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -589,9 +590,22 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) {
   TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
   const int num_devices = TF_DeviceListCount(devices);
+  bool has_gpu0 = false;
+  bool has_gpu1 = false;
+  for (int i = 0; i < num_devices; ++i) {
+    const char* dev = TF_DeviceListName(devices, i, status.get());
+    ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+    string device_name(dev);
+    if (device_name.find("GPU:0") != string::npos) {
+      has_gpu0 = true;
+    }
+    if (device_name.find("GPU:1") != string::npos) {
+      has_gpu1 = true;
+    }
+  }
 
   const char* kCPUDevice = "CPU:0";
-  if (num_devices < 3) {
+  if (!has_gpu0 || !has_gpu1) {
     TF_DeleteDeviceList(devices);
     TF_DeleteTensor(t);
     TFE_DeleteTensorHandle(hcpu);
@@ -781,6 +795,14 @@ TEST(CAPI, TensorHandleNullptr) {
 
   TF_SetStatus(status.get(), TF_OK, "");
 
+  device_name = TFE_TensorHandleBackingDeviceName(h, status.get());
+  ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get()));
+  ASSERT_EQ(device_name, nullptr);
+  ASSERT_EQ("The passed in handle is a nullptr",
+            string(TF_Message(status.get())));
+
+  TF_SetStatus(status.get(), TF_OK, "");
+
   int num_dims = TFE_TensorHandleNumDims(h, status.get());
   ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get()));
   ASSERT_EQ(num_dims, -1);
@@ -796,6 +818,62 @@ TEST(CAPI, TensorHandleNullptr) {
             string(TF_Message(status.get())));
 }
 
+TEST(CAPI, TensorHandleDevices) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
+  const char* device_name = TFE_TensorHandleDeviceName(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_TRUE(absl::StrContains(device_name, "CPU:0")) << device_name;
+  const char* backing_device_name =
+      TFE_TensorHandleBackingDeviceName(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_TRUE(absl::StrContains(backing_device_name, "CPU:0"))
+      << backing_device_name;
+
+  // Disable the test if no GPU is present.
+  string gpu_device_name;
+  if (GetDeviceName(ctx, &gpu_device_name, "GPU")) {
+    TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
+        hcpu, ctx, gpu_device_name.c_str(), status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    TFE_Op* shape_op = ShapeOp(ctx, hgpu);
+    TFE_OpSetDevice(shape_op, gpu_device_name.c_str(), status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    TFE_TensorHandle* retvals[1];
+    int num_retvals = 1;
+    TFE_Execute(shape_op, &retvals[0], &num_retvals, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    // .device of shape is GPU since the op is executed on GPU
+    device_name = TFE_TensorHandleDeviceName(retvals[0], status.get());
+    ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+    ASSERT_TRUE(absl::StrContains(device_name, "GPU:0")) << device_name;
+
+    // .backing_device of shape is CPU since the tensor is backed by CPU
+    backing_device_name =
+        TFE_TensorHandleBackingDeviceName(retvals[0], status.get());
+    ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+    ASSERT_TRUE(absl::StrContains(backing_device_name, "CPU:0"))
+        << backing_device_name;
+
+    TFE_DeleteOp(shape_op);
+    TFE_DeleteTensorHandle(retvals[0]);
+    TFE_DeleteTensorHandle(hgpu);
+  }
+
+  TFE_DeleteTensorHandle(hcpu);
+  TFE_ContextAsyncWait(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_DeleteContext(ctx);
+}
+
 void Execute_MatMul_CPU(bool async) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc
index 5607c9dcb0bbec72b2f86def3dd4e6590d73197b..bd38127d50c171af801dd1b937acefdba491b4a6 100644
--- a/tensorflow/c/eager/c_api_test_util.cc
+++ b/tensorflow/c/eager/c_api_test_util.cc
@@ -99,8 +99,19 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) {
   TFE_OpAddInput(op, b, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
-  TFE_OpSetAttrBool(op, "transpose_a", 0);
-  TFE_OpSetAttrBool(op, "transpose_b", 0);
+  TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a));
+
+  return op;
+}
+
+TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a) {
+  TF_Status* status = TF_NewStatus();
+
+  TFE_Op* op = TFE_NewOp(ctx, "Shape", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(op, a, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
   TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a));
 
   return op;
diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h
index 474cae67c89249af3a62707f0db00ba458ca8f31..75ef9459e93b4f2ed471c423a34565594efc1714 100644
--- a/tensorflow/c/eager/c_api_test_util.h
+++ b/tensorflow/c/eager/c_api_test_util.h
@@ -37,6 +37,9 @@ TFE_TensorHandle* TestMatrixTensorHandle3X2();
 // Return a matmul op multiplying `a` by `b`.
 TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b);
 
+// Return a shape op fetching the shape of `a`.
+TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a);
+
 // Return an 1-D INT32 tensor containing a single value 1.
 TFE_TensorHandle* TestAxisTensorHandle();
 
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 5ba55a203ff70cc64c07e96b5a869a1f11c9334e..5c11f51e8749de84547ae873f5f55ebd42bc4b3d 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -141,8 +141,9 @@ class GradientTape {
   // null. The result is populated with one tensor per target element.
   Status ComputeGradient(
       const VSpace<Gradient, BackwardFunction, TapeTensor>& vspace,
-      gtl::ArraySlice<int64> target_tensor_ids,
-      gtl::ArraySlice<int64> source_tensor_id,
+      const gtl::ArraySlice<int64> target_tensor_ids,
+      const gtl::ArraySlice<int64> source_tensor_ids,
+      const gtl::FlatMap<int64, TapeTensor> sources_that_are_targets,
       gtl::ArraySlice<Gradient*> output_gradients,
       std::vector<Gradient*>* result);
 
@@ -396,6 +397,7 @@ template <typename Gradient, typename BackwardFunction, typename TapeTensor>
 Status InitialGradients(
     const VSpace<Gradient, BackwardFunction, TapeTensor>& vspace,
     gtl::ArraySlice<int64> target_tensor_ids,
+    gtl::FlatMap<int64, TapeTensor> sources_that_are_targets,
     gtl::ArraySlice<Gradient*> output_gradients, const TensorTape& tensor_tape,
     const OpTape<BackwardFunction, TapeTensor>& op_tape,
     gtl::FlatMap<int64, std::vector<Gradient*>>* result) {
@@ -425,8 +427,13 @@ Status InitialGradients(
               "none of operations outputs match expected tensor");
         }
       } else {
-        // No record of the target tensor found on the tape, so no gradient
-        // needs to be computed from it. Do nothing.
+        // This target tensor was not generated by any operation recorded on
+        // the tape, so no gradient needs to be computed from it unless this
+        // target is also a source.
+        auto source_tensor = sources_that_are_targets.find(id);
+        if (source_tensor != sources_that_are_targets.end()) {
+          (*result)[id].push_back(vspace.Ones(source_tensor->second));
+        }
       }
     } else {
       (*result)[id].push_back(output_gradients[i]);
@@ -467,8 +474,9 @@ constexpr int kMinAggregateBytes = 128 * 1024 * 1024;
 template <typename Gradient, typename BackwardFunction, typename TapeTensor>
 Status GradientTape<Gradient, BackwardFunction, TapeTensor>::ComputeGradient(
     const VSpace<Gradient, BackwardFunction, TapeTensor>& vspace,
-    gtl::ArraySlice<int64> target_tensor_ids,
-    gtl::ArraySlice<int64> source_tensor_ids,
+    const gtl::ArraySlice<int64> target_tensor_ids,
+    const gtl::ArraySlice<int64> source_tensor_ids,
+    const gtl::FlatMap<int64, TapeTensor> sources_that_are_targets,
     gtl::ArraySlice<Gradient*> output_gradients,
     std::vector<Gradient*>* result) {
   gtl::FlatSet<int64> sources_set(source_tensor_ids.begin(),
@@ -478,7 +486,8 @@ Status GradientTape<Gradient, BackwardFunction, TapeTensor>::ComputeGradient(
   std::vector<int64> op_stack =
       InitialStack(state.op_tape, state.op_missing_tensor);
   gtl::FlatMap<int64, std::vector<Gradient*>> gradients;
-  Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
+  Status s = InitialGradients(vspace, target_tensor_ids,
+                              sources_that_are_targets, output_gradients,
                               tensor_tape_, state.op_tape, &gradients);
   auto cleanup = [this, &state]() {
     if (!persistent_) {
diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07b9e8b940c55caf62ae0b81b884bf313d335459
--- /dev/null
+++ b/tensorflow/c/env.cc
@@ -0,0 +1,161 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/env.h"
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+struct TF_StringStream {
+  std::vector<::tensorflow::string>* list;
+  size_t position;
+};
+
+void TF_CreateDir(const char* dirname, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->CreateDir(dirname));
+}
+
+void TF_DeleteDir(const char* dirname, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->DeleteDir(dirname));
+}
+
+void TF_DeleteRecursively(const char* dirname, int64_t* undeleted_file_count,
+                          int64_t* undeleted_dir_count, TF_Status* status) {
+  ::tensorflow::int64 f, d;
+
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->DeleteRecursively(dirname, &f, &d));
+  *undeleted_file_count = f;
+  *undeleted_dir_count = d;
+}
+
+void TF_FileStat(const char* filename, TF_FileStatistics* stats,
+                 TF_Status* status) {
+  ::tensorflow::FileStatistics cc_stats;
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Status s =
+      ::tensorflow::Env::Default()->Stat(filename, &cc_stats);
+  ::tensorflow::Set_TF_Status_from_Status(status, s);
+  if (s.ok()) {
+    stats->length = cc_stats.length;
+    stats->mtime_nsec = cc_stats.mtime_nsec;
+    stats->is_directory = cc_stats.is_directory;
+  }
+}
+
+void TF_NewWritableFile(const char* filename, TF_WritableFileHandle** handle,
+                        TF_Status* status) {
+  std::unique_ptr<::tensorflow::WritableFile> f;
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Status s =
+      ::tensorflow::Env::Default()->NewWritableFile(filename, &f);
+  ::tensorflow::Set_TF_Status_from_Status(status, s);
+
+  if (s.ok()) {
+    *handle = reinterpret_cast<TF_WritableFileHandle*>(f.release());
+  }
+}
+
+void TF_CloseWritableFile(TF_WritableFileHandle* handle, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, cc_file->Close());
+  delete cc_file;
+}
+
+void TF_SyncWritableFile(TF_WritableFileHandle* handle, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, cc_file->Sync());
+}
+
+void TF_FlushWritableFile(TF_WritableFileHandle* handle, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, cc_file->Flush());
+}
+
+void TF_AppendWritableFile(TF_WritableFileHandle* handle, const char* data,
+                           size_t length, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, cc_file->Append(::tensorflow::StringPiece{data, length}));
+}
+
+void TF_DeleteFile(const char* filename, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->DeleteFile(filename));
+}
+
+bool TF_StringStreamNext(TF_StringStream* list, const char** result) {
+  if (list->position >= list->list->size()) {
+    *result = nullptr;
+    return false;
+  }
+
+  *result = list->list->at(list->position++).c_str();
+  return true;
+}
+
+void TF_StringStreamDone(TF_StringStream* list) {
+  delete list->list;
+  delete list;
+}
+TF_StringStream* TF_GetChildren(const char* dirname, TF_Status* status) {
+  auto* children = new std::vector<::tensorflow::string>;
+
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->GetChildren(dirname, children));
+
+  auto* list = new TF_StringStream;
+  list->list = children;
+  list->position = 0;
+  return list;
+}
+
+TF_StringStream* TF_GetLocalTempDirectories() {
+  auto* tmpdirs = new std::vector<::tensorflow::string>;
+
+  ::tensorflow::Env::Default()->GetLocalTempDirectories(tmpdirs);
+
+  auto* list = new TF_StringStream;
+  list->list = tmpdirs;
+  list->position = 0;
+  return list;
+}
+
+TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void) {
+  return ::tensorflow::Env::Default()->NowNanos();
+}
+
+// Returns the number of microseconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowMicros(void) {
+  return ::tensorflow::Env::Default()->NowMicros();
+}
+
+// Returns the number of seconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowSeconds(void) {
+  return ::tensorflow::Env::Default()->NowSeconds();
+}
diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d27c5da37735042c7476b591e57486dbde33152
--- /dev/null
+++ b/tensorflow/c/env.h
@@ -0,0 +1,157 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_C_ENV_H_
+#define TENSORFLOW_C_ENV_H_
+
+#include "tensorflow/c/c_api.h"
+
+// --------------------------------------------------------------------------
+// C API for tensorflow::Env.
+
+struct TF_WritableFileHandle;
+struct TF_StringStream;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct TF_FileStatistics {
+  // The length of the file in bytes.
+  int64_t length;
+  // The last modified time in nanoseconds.
+  int64_t mtime_nsec;
+  // Whether the name refers to a directory.
+  bool is_directory;
+} TF_FileStatistics;
+
+// Creates the specified directory. Typical status code are:
+//  * TF_OK - successfully created the directory
+//  * TF_ALREADY_EXISTS - directory already exists
+//  * TF_PERMISSION_DENIED - dirname is not writable
+TF_CAPI_EXPORT extern void TF_CreateDir(const char* dirname, TF_Status* status);
+
+// Deletes the specified directory. Typical status codes are:
+//  * TF_OK - successfully deleted the directory
+//  * TF_FAILED_PRECONDITION - the directory is not empty
+TF_CAPI_EXPORT extern void TF_DeleteDir(const char* dirname, TF_Status* status);
+
+// Deletes the specified directory and all subdirectories and files underneath
+// it. This is accomplished by traversing the directory tree rooted at dirname
+// and deleting entries as they are encountered.
+//
+// If dirname itself is not readable or does not exist, *undeleted_dir_count is
+// set to 1, *undeleted_file_count is set to 0 and an appropriate status (e.g.
+// TF_NOT_FOUND) is returned.
+//
+// If dirname and all its descendants were successfully deleted, TF_OK is
+// returned and both error counters are set to zero.
+//
+// Otherwise, while traversing the tree, undeleted_file_count and
+// undeleted_dir_count are updated if an entry of the corresponding type could
+// not be deleted. The returned error status represents the reason that any one
+// of these entries could not be deleted.
+//
+// Typical status codes:
+//  * TF_OK - dirname exists and we were able to delete everything underneath
+//  * TF_NOT_FOUND - dirname doesn't exist
+//  * TF_PERMISSION_DENIED - dirname or some descendant is not writable
+//  * TF_UNIMPLEMENTED - some underlying functions (like Delete) are not
+//    implemented
+TF_CAPI_EXPORT extern void TF_DeleteRecursively(const char* dirname,
+                                                int64_t* undeleted_file_count,
+                                                int64_t* undeleted_dir_count,
+                                                TF_Status* status);
+
+// Obtains statistics for the given path. If status is TF_OK, *stats is
+// updated, otherwise it is not touched.
+TF_CAPI_EXPORT extern void TF_FileStat(const char* filename,
+                                       TF_FileStatistics* stats,
+                                       TF_Status* status);
+
+// Creates or truncates the given filename and returns a handle to be used for
+// appending data to the file. If status is TF_OK, *handle is updated and the
+// caller is responsible for freeing it (see TF_CloseWritableFile).
+TF_CAPI_EXPORT extern void TF_NewWritableFile(const char* filename,
+                                              TF_WritableFileHandle** handle,
+                                              TF_Status* status);
+
+// Closes the given handle and frees its memory. If there was a problem closing
+// the file, it is indicated by status. Memory is freed in any case.
+TF_CAPI_EXPORT extern void TF_CloseWritableFile(TF_WritableFileHandle* handle,
+                                                TF_Status* status);
+
+// Syncs content of the handle to the filesystem. Blocks waiting for the
+// filesystem to indicate that the content has been persisted.
+TF_CAPI_EXPORT extern void TF_SyncWritableFile(TF_WritableFileHandle* handle,
+                                               TF_Status* status);
+
+// Flush local buffers to the filesystem. If the process terminates after a
+// successful flush, the contents may still be persisted, since the underlying
+// filesystem may eventually flush the contents.  If the OS or machine crashes
+// after a successful flush, the contents may or may not be persisted, depending
+// on the implementation.
+TF_CAPI_EXPORT extern void TF_FlushWritableFile(TF_WritableFileHandle* handle,
+                                                TF_Status* status);
+
+// Appends the given bytes to the file. Any failure to do so is indicated in
+// status.
+TF_CAPI_EXPORT extern void TF_AppendWritableFile(TF_WritableFileHandle* handle,
+                                                 const char* data,
+                                                 size_t length,
+                                                 TF_Status* status);
+
+// Deletes the named file and indicates whether successful in *status.
+TF_CAPI_EXPORT extern void TF_DeleteFile(const char* filename,
+                                         TF_Status* status);
+
+// Retrieves the next item from the given TF_StringStream and places a pointer
+// to it in *result. If no more items are in the list, *result is set to NULL
+// and false is returned.
+//
+// Ownership of the items retrieved with this function remains with the library.
+// Item points are invalidated after a call to TF_StringStreamDone.
+TF_CAPI_EXPORT extern bool TF_StringStreamNext(TF_StringStream* list,
+                                               const char** result);
+
+// Frees the resources associated with given string list. All pointers returned
+// by TF_StringStreamNext are invalid after this call.
+TF_CAPI_EXPORT extern void TF_StringStreamDone(TF_StringStream* list);
+
+// Retrieves the list of children of the given directory. You can iterate
+// through the list with TF_StringStreamNext. The caller is responsible for
+// freeing the list (see TF_StringStreamDone).
+TF_CAPI_EXPORT extern TF_StringStream* TF_GetChildren(const char* filename,
+                                                      TF_Status* status);
+
+// Retrieves a list of directory names on the local machine that may be used for
+// temporary storage. You can iterate through the list with TF_StringStreamNext.
+// The caller is responsible for freeing the list (see TF_StringStreamDone).
+TF_CAPI_EXPORT extern TF_StringStream* TF_GetLocalTempDirectories(void);
+
+// Returns the number of nanoseconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void);
+
+// Returns the number of microseconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowMicros(void);
+
+// Returns the number of seconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowSeconds(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TENSORFLOW_C_ENV_H_
diff --git a/tensorflow/c/env_test.cc b/tensorflow/c/env_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2206c6befd2167346c64032940d6e8c631e4a3e
--- /dev/null
+++ b/tensorflow/c/env_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/env.h"
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+#define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x))
+
+TEST(TestEnv, TestDirHandling) {
+  TF_StringStream* tempdirs = TF_GetLocalTempDirectories();
+  const char* tempdir;
+  bool found = false;
+  while (TF_StringStreamNext(tempdirs, &tempdir)) {
+    found = true;
+
+    TF_Status* s = TF_NewStatus();
+
+    ::tensorflow::string dirpath =
+        ::tensorflow::io::JoinPath(tempdir, "somedir");
+    TF_CreateDir(dirpath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_CreateDir failed for " << dirpath << ": "
+                    << TF_Message(s);
+
+    ::tensorflow::string filepath =
+        ::tensorflow::io::JoinPath(dirpath, "somefile.txt");
+    TF_WritableFileHandle* handle;
+    TF_NewWritableFile(filepath.c_str(), &handle, s);
+    ASSERT_TF_OK(s) << "NewWritableFile failed for " << filepath << ": "
+                    << TF_Message(s);
+
+    const char* data = "Hello, world!\n";
+    TF_AppendWritableFile(handle, data, strlen(data), s);
+    ASSERT_TF_OK(s) << "TF_AppendWritableFile failed to append data to file at "
+                    << filepath << ": " << TF_Message(s);
+
+    TF_CloseWritableFile(handle, s);
+    ASSERT_TF_OK(s) << "TF_CloseWritableFile failed to close handle to "
+                    << filepath << ": " << TF_Message(s);
+
+    TF_StringStream* children = TF_GetChildren(dirpath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_GetChildren failed for " << dirpath;
+    const char* childpath;
+    ASSERT_TRUE(TF_StringStreamNext(children, &childpath));
+    ASSERT_EQ(::tensorflow::string(childpath), "somefile.txt");
+    // There should only be one file in this directory.
+    ASSERT_FALSE(TF_StringStreamNext(children, &childpath));
+    ASSERT_EQ(childpath, nullptr);
+    TF_StringStreamDone(children);
+
+    TF_FileStatistics stats;
+    TF_FileStat(filepath.c_str(), &stats, s);
+    ASSERT_EQ(stats.length, strlen(data));
+    ASSERT_FALSE(stats.is_directory);
+    ASSERT_GT(stats.mtime_nsec, 0);
+
+    // Trying to delete a non-empty directory should fail.
+    TF_DeleteDir(dirpath.c_str(), s);
+    ASSERT_NE(TF_OK, TF_GetCode(s))
+        << "TF_DeleteDir unexpectedly succeeded with a non-empty directory "
+        << dirpath;
+
+    TF_DeleteFile(filepath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_DeleteFile failed for " << filepath << ": "
+                    << TF_Message(s);
+
+    // Now deleting the directory should work.
+    TF_DeleteDir(dirpath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_DeleteDir failed for " << dirpath << ": "
+                    << TF_Message(s);
+
+    TF_DeleteStatus(s);
+    break;
+  }
+
+  ASSERT_TRUE(found) << "expected at least one temp dir";
+
+  TF_StringStreamDone(tempdirs);
+}
+
+TEST(TestEnv, TestTimeFunctions) {
+  ASSERT_GE(TF_NowSeconds(), 946684800);  // Midnight Jan 1, 2000
+  ASSERT_GE(TF_NowMicros(), 946684800 * 1e6);
+  ASSERT_GE(TF_NowNanos(), 946684800 * 1e9);
+}
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a4eaecb6cf2740a522b1e849d1306ebde6c4577
--- /dev/null
+++ b/tensorflow/c/kernels.cc
@@ -0,0 +1,160 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/c/kernels.h"
+#include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+// This file forms the basis of a stable ABI for third-party kernel
+// implementations. It is crucial that changes to this file are made cautiously
+// and with a focus on maintaining both source and binary compatibility.
+
+struct TF_KernelBuilder {
+  ::tensorflow::KernelDefBuilder* cc_builder;
+
+  void* (*create_function)(TF_OpKernelConstruction*);
+  void (*compute_function)(void*, TF_OpKernelContext*);
+  void (*delete_function)(void*);
+};
+
+TF_KernelBuilder* TF_NewKernelBuilder(
+    const char* op_name, const char* device_name,
+    void* (*create_func)(TF_OpKernelConstruction*),
+    void (*compute_func)(void*, TF_OpKernelContext*),
+    void (*delete_func)(void*)) {
+  TF_KernelBuilder* result = new TF_KernelBuilder;
+  result->cc_builder = new ::tensorflow::KernelDefBuilder(op_name);
+  result->cc_builder->Device(device_name);
+  result->create_function = create_func;
+  result->compute_function = compute_func;
+  result->delete_function = delete_func;
+  return result;
+}
+
+void TF_DeleteKernelBuilder(TF_KernelBuilder* builder) {
+  DCHECK_NE(builder, nullptr);
+  delete builder->cc_builder;
+  delete builder;
+}
+
+namespace tensorflow {
+namespace {
+
+// An OpKernel whose methods delegate to C function pointers.
+class COpKernel : public OpKernel {
+ public:
+  explicit COpKernel(OpKernelConstruction* ctx,
+                     void* (*create_func)(TF_OpKernelConstruction*),
+                     void (*compute_func)(void*, TF_OpKernelContext*),
+                     void (*delete_func)(void*))
+      : OpKernel(ctx), compute_func_(compute_func), delete_func_(delete_func) {
+    if (create_func != nullptr) {
+      c_kernel_ =
+          (*create_func)(reinterpret_cast<TF_OpKernelConstruction*>(ctx));
+    } else {
+      c_kernel_ = nullptr;
+    }
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    (*compute_func_)(c_kernel_, reinterpret_cast<TF_OpKernelContext*>(ctx));
+  }
+
+  ~COpKernel() override {
+    if (delete_func_ != nullptr) {
+      (*delete_func_)(c_kernel_);
+    }
+  }
+
+ private:
+  void (*compute_func_)(void*, TF_OpKernelContext* context);
+  void (*delete_func_)(void*);
+  void* c_kernel_;
+};
+
+// A KernelFactory that returns COpKernel instances.
+class KernelBuilderFactory
+    : public ::tensorflow::kernel_factory::OpKernelFactory {
+ public:
+  explicit KernelBuilderFactory(TF_KernelBuilder* builder)
+      : builder_(builder) {}
+  ::tensorflow::OpKernel* Create(
+      ::tensorflow::OpKernelConstruction* context) override {
+    return new ::tensorflow::COpKernel(context, builder_->create_function,
+                                       builder_->compute_function,
+                                       builder_->delete_function);
+  }
+  ~KernelBuilderFactory() override { TF_DeleteKernelBuilder(builder_); }
+
+ private:
+  TF_KernelBuilder* builder_;
+};
+}  // namespace
+}  // namespace tensorflow
+
+void TF_RegisterKernelBuilder(const char* name, TF_KernelBuilder* builder,
+                              TF_Status* status) {
+  using tensorflow::register_kernel::Name;
+
+  tensorflow::kernel_factory::OpKernelRegistrar(
+      builder->cc_builder->Build(), name,
+      absl::make_unique<tensorflow::KernelBuilderFactory>(builder));
+
+  TF_SetStatus(status, TF_OK, "");
+}
+
+int TF_NumInputs(TF_OpKernelContext* ctx) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  return cc_ctx->num_inputs();
+}
+
+int TF_NumOutputs(TF_OpKernelContext* ctx) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  return cc_ctx->num_outputs();
+}
+
+void TF_GetInput(TF_OpKernelContext* ctx, int i, TF_Tensor** tensor,
+                 TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  if (i < 0 || i >= cc_ctx->num_inputs()) {
+    TF_SetStatus(status, TF_OUT_OF_RANGE, "input index out of range");
+    return;
+  }
+  const ::tensorflow::Tensor& cc_tensor(cc_ctx->input(i));
+  TF_Tensor* result = ::tensorflow::TF_TensorFromTensor(cc_tensor, status);
+  if (TF_GetCode(status) == TF_OK) {
+    *tensor = result;
+  }
+}
+
+void TF_SetOutput(TF_OpKernelContext* ctx, int i, const TF_Tensor* tensor,
+                  TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  if (i < 0 || i >= cc_ctx->num_inputs()) {
+    TF_SetStatus(status, TF_OUT_OF_RANGE, "input index out of range");
+    return;
+  }
+  ::tensorflow::Tensor cc_tensor;
+  ::tensorflow::Status s = ::tensorflow::TF_TensorToTensor(tensor, &cc_tensor);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, s);
+  if (s.ok()) {
+    cc_ctx->set_output(i, cc_tensor);
+  }
+}
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a91aa184f11ac8e45b38a1d106c7b445747a7c1
--- /dev/null
+++ b/tensorflow/c/kernels.h
@@ -0,0 +1,118 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_C_KERNELS_H_
+#define TENSORFLOW_C_KERNELS_H_
+
+#include "tensorflow/c/c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// --------------------------------------------------------------------------
+// C API for TensorFlow Kernels.
+//
+// This API allows developers to register custom kernel implementations for
+// TensorFlow.
+//
+// See c_api.h header comments for a discussion about API conventions.
+//
+// Users wishing to extend TensorFlow with new kernels will call
+// `TF_NewKernelBuilder`. The resulting kernel builder can be registered with
+// `TF_RegisterKernelBuilder`, which will allow TF to construct user-provided
+// kernels when necessary.
+
+struct TF_KernelBuilder;
+struct TF_OpKernelConstruction;
+struct TF_OpKernelContext;
+
+// Allocates a new kernel builder and returns a pointer to it.
+//
+// If non-null, TensorFlow will call create_func when it needs to instantiate
+// the kernel. The pointer returned by create_func will be passed to
+// compute_func and delete_func, thereby functioning as a "this" pointer for
+// referring to kernel instances.
+//
+// The TF_OpKernelConstruction pointer passed to create_func is owned by
+// TensorFlow and will be deleted once create_func returns. It must not be used
+// after this.
+//
+// When TensorFlow needs to perform a computation with this kernel, it will
+// call compute_func. This function will receive the pointer returned by
+// create_func (or null if no create_func was provided), along with the inputs
+// to the computation.
+//
+// The TF_OpKernelContext pointer received by compute_func is owned by
+// TensorFlow and will be deleted once compute_func returns. It must not be used
+// after this.
+//
+// Finally, when TensorFlow no longer needs the kernel, it will call
+// delete_func if one is provided. This function will receive the pointer
+// returned in `create_func` or nullptr if no `create_func` was provided.
+//
+// The caller should pass the result of this function to
+// TF_RegisterKernelBuilder, which will take ownership of the pointer. If, for
+// some reason, the kernel builder will not be registered, the caller should
+// delete it with TF_DeleteKernelBuilder.
+TF_CAPI_EXPORT extern TF_KernelBuilder* TF_NewKernelBuilder(
+    const char* op_name, const char* device_name,
+    void* (*create_func)(TF_OpKernelConstruction*),
+    void (*compute_func)(void*, TF_OpKernelContext*),
+    void (*delete_func)(void*));
+
+// Register the given kernel builder with the TensorFlow runtime. If
+// registration fails, the given status will be populated.
+//
+// This call takes ownership of the `builder` pointer.
+TF_CAPI_EXPORT extern void TF_RegisterKernelBuilder(const char* kernel_name,
+                                                    TF_KernelBuilder* builder,
+                                                    TF_Status* status);
+
+// Deletes the given TF_KernelBuilder. This should be called only if the kernel
+// builder is not registered with TensorFlow via TF_RegisterKernelBuilder.
+TF_CAPI_EXPORT extern void TF_DeleteKernelBuilder(TF_KernelBuilder* builder);
+
+// --------------------------------------------------------------------------
+// OpKernelContext routines
+
+// TF_NumInputs returns the number of inputs available in ctx.
+TF_CAPI_EXPORT extern int TF_NumInputs(TF_OpKernelContext* ctx);
+
+// TF_NumOutputs returns the number of outputs to be placed in *ctx by the
+// kernel.
+TF_CAPI_EXPORT extern int TF_NumOutputs(TF_OpKernelContext* ctx);
+
+// Retrieves the ith input from ctx. If TF_GetCode(status) is TF_OK, *tensor is
+// populated and its ownership is passed to the caller. In any other case,
+// *tensor is not modified.
+//
+// If i < 0 or i >= TF_NumInputs(ctx), *status is set to TF_OUT_OF_RANGE.
+TF_CAPI_EXPORT extern void TF_GetInput(TF_OpKernelContext* ctx, int i,
+                                       TF_Tensor** tensor, TF_Status* status);
+
+// Sets the ith output of ctx to tensor. If TF_GetCode(status) is anything but
+// TF_OK, ctx is left unmodified.
+//
+// If i < 0 or i >= TF_NumOutputs(ctx), *status is set to TF_OUT_OF_RANGE.
+TF_CAPI_EXPORT extern void TF_SetOutput(TF_OpKernelContext* ctx, int i,
+                                        const TF_Tensor* tensor,
+                                        TF_Status* status);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+
+#endif  // TENSORFLOW_C_KERNELS_H_
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e659ee3c3d258a626ccf03a782ec031b5a703a48
--- /dev/null
+++ b/tensorflow/c/kernels_test.cc
@@ -0,0 +1,203 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/kernels.h"
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/node_def.pb_text.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+struct MyCustomKernel {
+  bool created;
+  bool compute_called;
+};
+
+static bool delete_called = false;
+
+static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
+  struct MyCustomKernel* s = new struct MyCustomKernel;
+  s->created = true;
+  s->compute_called = false;
+  return s;
+}
+
+static void MyComputeFunc(void* kernel, TF_OpKernelContext* ctx) {
+  struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
+  s->compute_called = true;
+}
+
+static void MyDeleteFunc(void* kernel) {
+  struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
+  EXPECT_TRUE(s->created);
+  EXPECT_TRUE(s->compute_called);
+  delete_called = true;
+  delete s;
+}
+
+namespace tensorflow {
+
+static std::unique_ptr<OpKernel> GetFakeKernel(const char* device_name,
+                                               const char* op_name,
+                                               Status* status) {
+  NodeDef def;
+  def.set_op(op_name);
+  def.set_device(device_name);
+  def.add_input("input1");
+  def.add_input("input2");
+  return CreateOpKernel(DeviceType(device_name), nullptr, nullptr, def, 1,
+                        status);
+}
+
+// Tests registration of a single C kernel and checks that calls through the
+// C/C++ boundary are being made.
+TEST(TestKernel, TestRegisterKernelBuilder) {
+  const char* kernel_name = "SomeKernelName";
+  const char* op_name = "FooOp";
+  const char* device_name = "FakeDeviceName1";
+
+  REGISTER_OP(op_name)
+      .Input("input1: double")
+      .Input("input2: uint8")
+      .Output("output1: uint8");
+
+  TF_KernelBuilder* builder = TF_NewKernelBuilder(
+      op_name, device_name, &MyCreateFunc, &MyComputeFunc, &MyDeleteFunc);
+
+  {
+    TF_Status* status = TF_NewStatus();
+    TF_RegisterKernelBuilder(kernel_name, builder, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    TF_Buffer* buf = TF_GetRegisteredKernelsForOp(op_name, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    KernelList list;
+    list.ParseFromArray(buf->data, buf->length);
+    ASSERT_EQ(1, list.kernel_size());
+    ASSERT_EQ(device_name, list.kernel(0).device_type());
+    TF_DeleteBuffer(buf);
+    TF_DeleteStatus(status);
+  }
+
+  {
+    Status status;
+    std::unique_ptr<OpKernel> kernel =
+        GetFakeKernel(device_name, op_name, &status);
+    TF_EXPECT_OK(status);
+    ASSERT_NE(nullptr, kernel.get());
+    kernel->Compute(nullptr);
+  }
+
+  ASSERT_TRUE(delete_called);
+}
+
+class DummyDevice : public DeviceBase {
+ public:
+  DummyDevice(Env* env, bool save) : DeviceBase(env), save_(save) {}
+  bool RequiresRecordingAccessedTensors() const override { return save_; }
+  Allocator* GetAllocator(AllocatorAttributes /*attr*/) override {
+    return cpu_allocator();
+  }
+
+ private:
+  bool save_;
+};
+
+TEST(TestKernel, TestInputAndOutputCount) {
+  const char* kernel_name = "InputOutputCounterKernel";
+  const char* op_name = "BarOp";
+  const char* device_name = "FakeDeviceName2";
+
+  REGISTER_OP(op_name)
+      .Input("input1: double")
+      .Input("input2: uint8")
+      .Output("output1: uint8");
+
+  static int num_inputs = 0;
+  static int num_outputs = 0;
+
+  // A kernel whose Compute function has a side-effect of updating num_inputs
+  // and num_outputs. Various functions on TF_OpKernelContext are also
+  // exercised.
+  auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) {
+    num_inputs = TF_NumInputs(ctx);
+    num_outputs = TF_NumOutputs(ctx);
+
+    TF_Tensor* input = nullptr;
+    TF_Status* s = TF_NewStatus();
+    TF_GetInput(ctx, 0, &input, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s)) << "Failed to get input: " << TF_Message(s);
+    EXPECT_EQ(123, *static_cast<tensorflow::uint8*>(TF_TensorData(input)));
+    TF_GetInput(ctx, -1, &input, s);
+    EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
+    TF_GetInput(ctx, 3, &input, s);
+    EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
+
+    // Copy the input tensor to output.
+    TF_SetOutput(ctx, 0, input, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s));
+
+    TF_SetOutput(ctx, 24, input, s);
+    EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
+
+    TF_DeleteStatus(s);
+    if (input != nullptr) {
+      TF_DeleteTensor(input);
+    }
+  };
+
+  TF_KernelBuilder* builder = TF_NewKernelBuilder(op_name, device_name, nullptr,
+                                                  my_compute_func, nullptr);
+
+  {
+    TF_Status* status = TF_NewStatus();
+    TF_RegisterKernelBuilder(kernel_name, builder, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    TF_DeleteStatus(status);
+  }
+
+  {
+    OpKernelContext::Params p;
+    DummyDevice dummy_device(nullptr, false);
+    p.device = &dummy_device;
+
+    Tensor t(tensorflow::uint8(123));
+
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    // Simulate 2 inputs
+    inputs.emplace_back(&t);
+    inputs.emplace_back();
+    p.inputs = &inputs;
+
+    Status status;
+    std::unique_ptr<OpKernel> kernel =
+        GetFakeKernel(device_name, op_name, &status);
+    TF_EXPECT_OK(status);
+    ASSERT_NE(nullptr, kernel.get());
+
+    p.op_kernel = kernel.get();
+    OpKernelContext ctx(&p);
+    kernel->Compute(&ctx);
+
+    ASSERT_EQ(2, num_inputs);
+    ASSERT_EQ(1, num_outputs);
+    ASSERT_EQ(123, ctx.mutable_output(0)->scalar<tensorflow::uint8>()());
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index 247236b760dd8c07bbb08426100b6a4d34296d2e..98d8393332269ae349cf8aa5c0b612c6f17172e6 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -160,4 +160,17 @@ void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto,
   ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
 }
 
+void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst,
+                       TF_Status* status) {
+  mutex_lock l(graph->mu);
+  status->status = graph->graph.AddWhileInputHack(&new_src.oper->node,
+                                                  new_src.index, &dst->node);
+  if (status->status.ok()) {
+    // This modification only updates the destination node for
+    // the purposes of running this graph in a session. Thus, we don't
+    // record the source node as being modified.
+    RecordMutation(graph, *dst, "adding input tensor");
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index 5cce84020bc68d912d259f51512341eb5f464a2c..44779ca656165dd65590cb5e9ea3ccf71165ed63 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -34,6 +34,7 @@ void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
 
 void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device);
 
+// Updates 'dst' to consume 'new_src'.
 void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
                 TF_Status* status);
 
@@ -65,6 +66,13 @@ std::string GetHandleShapeAndType(TF_Graph* graph, TF_Output output);
 // because I couldn't get SWIG to work otherwise.
 void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto,
                            size_t proto_len, TF_Status* status);
+
+// This method is used to add a new input edge to 'dst', which must be a While
+// op. The While op's "T" attribute must have already been updated to include
+// the new edge. This is used to construct tf.while_loop gradients.
+void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst,
+                       TF_Status* status);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/c/test_op1.cc b/tensorflow/c/test_op1.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b22cc9aef2b344282f45340ff12ee849935a26f9
--- /dev/null
+++ b/tensorflow/c/test_op1.cc
@@ -0,0 +1,23 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+REGISTER_OP("TestCApi1").Doc(R"doc(Used to test C API)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index b587e63227708427e7fae47f8f4a7b524d963ed9..a09becc49b10d2c58f98fbcc11df5190f794c1d4 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -170,6 +170,7 @@ cc_library_with_android_deps(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -411,6 +412,7 @@ tf_cc_test(
     srcs = ["gradients/nn_grad_test.cc"],
     deps = [
         ":cc_ops",
+        ":cc_ops_internal",
         ":grad_op_registry",
         ":grad_testutil",
         ":gradient_checker",
@@ -453,11 +455,33 @@ tf_cc_test(
     ],
 )
 
+# Generates separate libraries for array_ops and math_ops to reduce the dependency count of targets that depend on only these
 tf_gen_op_wrappers_cc(
-    name = "cc_ops",
+    name = "math_ops",
+    api_def_srcs = ["//tensorflow/core/api_def:base_api_def"],
+    op_lib_names = [
+        "math_ops",
+    ],
+    pkg = "//tensorflow/core",
+)
+
+tf_gen_op_wrappers_cc(
+    name = "array_ops",
     api_def_srcs = ["//tensorflow/core/api_def:base_api_def"],
     op_lib_names = [
         "array_ops",
+    ],
+    pkg = "//tensorflow/core",
+)
+
+tf_gen_op_wrappers_cc(
+    name = "cc_ops",
+    api_def_srcs = ["//tensorflow/core/api_def:base_api_def"],
+    deps_internal = [
+        ":array_ops_internal",
+        ":math_ops_internal",
+    ],
+    op_lib_names = [
         "audio_ops",
         "candidate_sampling_ops",
         "control_flow_ops",
@@ -465,10 +489,10 @@ tf_gen_op_wrappers_cc(
         "image_ops",
         "io_ops",
         "linalg_ops",
+        "list_ops",
         "logging_ops",
         "lookup_ops",
         "manip_ops",
-        "math_ops",
         "nn_ops",
         "no_op",
         "parsing_ops",
@@ -480,10 +504,23 @@ tf_gen_op_wrappers_cc(
         "user_ops",
     ],
     other_hdrs = [
+        "ops/array_ops.h",
         "ops/const_op.h",
+        "ops/math_ops.h",
         "ops/standard_ops.h",
     ],
+    other_hdrs_internal = [
+        "ops/array_ops_internal.h",
+        "ops/math_ops_internal.h",
+    ],
     pkg = "//tensorflow/core",
+    deps = [
+        ":array_ops",
+        ":const_op",
+        ":math_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+    ],
 )
 
 tf_cc_test(
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 6abc9e268e3ac97379954a34017ddffa010db67f..81785b2d89b3d36b46992b7ae376b5175a806027 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -95,6 +95,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ScopeName, const string& name,
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
       assigned_device_(other.impl()->assigned_device_),
+      xla_cluster_(other.impl()->xla_cluster_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -112,6 +113,7 @@ Scope::Impl::Impl(const Scope& other, Tags::OpName, const string& name,
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
       assigned_device_(other.impl()->assigned_device_),
+      xla_cluster_(other.impl()->xla_cluster_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -135,6 +137,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ControlDeps,
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
       assigned_device_(other.impl()->assigned_device_),
+      xla_cluster_(other.impl()->xla_cluster_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -167,6 +170,7 @@ Scope::Impl::Impl(const Scope& other, Tags::SingleUseScope,
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
       assigned_device_(other.impl()->assigned_device_),
+      xla_cluster_(other.impl()->xla_cluster_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -183,6 +187,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ExitOnError)
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
       assigned_device_(other.impl()->assigned_device_),
+      xla_cluster_(other.impl()->xla_cluster_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -200,6 +205,7 @@ Scope::Impl::Impl(const Scope& other, Tags::KernelLabel,
       kernel_label_(kernel_label),
       device_(other.impl()->device_),
       assigned_device_(other.impl()->assigned_device_),
+      xla_cluster_(other.impl()->xla_cluster_),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -217,6 +223,7 @@ Scope::Impl::Impl(const Scope& other, Tags::Colocate,
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
       assigned_device_(other.impl()->assigned_device_),
+      xla_cluster_(other.impl()->xla_cluster_),
       colocation_constraints_(
           clear_colocations
               ? std::unordered_set<string>()
@@ -237,6 +244,25 @@ Scope::Impl::Impl(const Scope& other, Tags::AssignedDevice,
       kernel_label_(other.impl()->kernel_label_),
       device_(other.impl()->device_),
       assigned_device_(assigned_device),
+      xla_cluster_(other.impl()->xla_cluster_),
+      colocation_constraints_(other.impl()->colocation_constraints_),
+      disable_shape_inference_(other.impl()->disable_shape_inference_) {}
+
+Scope::Impl::Impl(const Scope& other, Tags::XlaCluster,
+                  const string& xla_cluster)
+    : graph_(other.impl()->graph_),
+      status_(other.impl()->status_),
+      name_map_(other.impl()->name_map_),
+      refiner_(other.impl()->refiner_),
+      scope_used_(other.impl()->scope_used_),
+      control_deps_(other.impl()->control_deps_),
+      name_(other.impl()->name_),
+      op_name_(other.impl()->op_name_),
+      exit_on_error_(other.impl()->exit_on_error_),
+      kernel_label_(other.impl()->kernel_label_),
+      device_(other.impl()->device_),
+      assigned_device_(other.impl()->assigned_device_),
+      xla_cluster_(xla_cluster),
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
@@ -326,6 +352,9 @@ void Scope::UpdateBuilder(NodeBuilder* builder) const {
   if (!impl()->assigned_device_.empty()) {
     builder->AssignedDevice(impl()->assigned_device_);
   }
+  if (!impl()->xla_cluster_.empty()) {
+    builder->XlaCluster(impl()->xla_cluster_);
+  }
 }
 
 string Scope::Impl::GetUniqueName(const string& prefix,
@@ -388,7 +417,7 @@ Scope Scope::NewSubScope(const string& child_scope_name) const {
                         false /* copy_names */));
 }
 
-Scope Scope::WithOpName(const string& op_name) const {
+Scope Scope::WithOpNameImpl(const string& op_name) const {
   if (impl()->single_use_scope()) {
     UpdateStatus(errors::InvalidArgument("Cannot set op name ", op_name,
                                          " on this scope"));
@@ -425,6 +454,10 @@ Scope Scope::WithAssignedDevice(const string& assigned_device) const {
   return Scope(new Impl(*this, Impl::Tags::AssignedDevice(), assigned_device));
 }
 
+Scope Scope::WithXlaCluster(const string& xla_cluster) const {
+  return Scope(new Impl(*this, Impl::Tags::XlaCluster(), xla_cluster));
+}
+
 Scope Scope::ColocateWith(const Operation& op) const {
   return Scope(new Impl(*this, Impl::Tags::Colocate(), op,
                         /* clear_colocations */ false));
diff --git a/tensorflow/cc/framework/scope.h b/tensorflow/cc/framework/scope.h
index e307d8989b6647dfac8d2691ed2171c86b7f3a7c..0a75f23725c143e6b22ee6dffae1428ed8209fe8 100644
--- a/tensorflow/cc/framework/scope.h
+++ b/tensorflow/cc/framework/scope.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -69,8 +70,9 @@ struct CompositeOpScopes;
 ///     // W will be named "linear/W"
 ///     auto W = Variable(linear.WithOpName("W"),
 ///                       {2, 2}, DT_FLOAT);
-///     // b will be named "linear/b"
-///     auto b = Variable(linear.WithOpName("b"),
+///     // b will be named "linear/b_3"
+///     int idx = 3;
+///     auto b = Variable(linear.WithOpName("b_", idx),
 ///                       {2}, DT_FLOAT);
 ///     auto x = Const(linear, {...});  // name: "linear/Const"
 ///     auto m = MatMul(linear, x, W);  // name: "linear/MatMul"
@@ -113,8 +115,11 @@ class Scope {
   Scope NewSubScope(const string& child_scope_name) const;
 
   /// Return a new scope. All ops created within the returned scope will have
-  /// names of the form `name/op_name[_suffix]`.
-  Scope WithOpName(const string& op_name) const;
+  /// names of the form `name/StrCat(fragments...)[_suffix]`
+  template <typename... Ty>
+  Scope WithOpName(Ty... fragments) const {
+    return WithOpNameImpl(absl::StrCat(fragments...));
+  }
 
   /// Return a new scope. All ops created within the returned scope will have as
   /// control dependencies the union of operations in the control_deps vector
@@ -137,6 +142,10 @@ class Scope {
   /// their assigned device set to `assigned_device`.
   Scope WithAssignedDevice(const string& assigned_device) const;
 
+  /// Returns a new scope.  All ops created within the returned scope will have
+  /// their _XlaCluster attribute set to `xla_cluster`.
+  Scope WithXlaCluster(const string& xla_cluster) const;
+
   /// Return a new scope. All ops created within the returned scope will be
   /// co-located on the device where op is placed.
   /// NOTE: This function is intended to be use internal libraries only for
@@ -227,6 +236,8 @@ class Scope {
   // END_SKIP_DOXYGEN
 
  private:
+  Scope WithOpNameImpl(const string& op_name) const;
+
   friend class InternalScope;
   std::unique_ptr<Impl> impl_;
   explicit Scope(Impl*);
diff --git a/tensorflow/cc/framework/scope_internal.h b/tensorflow/cc/framework/scope_internal.h
index 514e02e84146b6d95147d83182e5d9a07509cfa1..5db7eab2b819c2c5d8fc358953d4607848f1cba5 100644
--- a/tensorflow/cc/framework/scope_internal.h
+++ b/tensorflow/cc/framework/scope_internal.h
@@ -61,6 +61,7 @@ class Scope::Impl {
     enum class KernelLabel;
     enum class Colocate;
     enum class AssignedDevice;
+    enum class XlaCluster;
   };
 
   Impl(Graph* graph, Status* status, NameMap* name_map, ShapeRefiner* refiner,
@@ -78,6 +79,7 @@ class Scope::Impl {
   Impl(const Scope& other, Tags::Colocate, const Operation& colocate_with_op,
        bool clear_colocations);
   Impl(const Scope& other, Tags::AssignedDevice, const string& assigned_device);
+  Impl(const Scope& other, Tags::XlaCluster, const string& xla_cluster);
 
   std::unordered_set<string> GetColocationConstraints(
       const Operation& colocate_with_op) const;
@@ -112,6 +114,7 @@ class Scope::Impl {
   const string kernel_label_ = "";
   const string device_ = "";
   const string assigned_device_ = "";
+  const string xla_cluster_ = "";
   const std::unordered_set<string> colocation_constraints_;
 
   // If true, Scope::DoShapeInference() always returns Status:OK().
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 588e96cb196189780037f66266484962ba0385e4..2a32a2ed6f7862a29f4ce3d1aba5fdbc86adc670 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -143,6 +143,33 @@ Status Relu6GradHelper(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Relu6", Relu6GradHelper);
 
+Status LeakyReluGradHelper(const Scope& scope, const Operation& op,
+                           const std::vector<Output>& grad_inputs,
+                           std::vector<Output>* grad_outputs) {
+  float alpha;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "alpha", &alpha));
+  internal::LeakyReluGrad::Attrs attrs;
+  auto dx = internal::LeakyReluGrad(scope, grad_inputs[0], op.input(0),
+                                    attrs.Alpha(alpha));
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("LeakyRelu", LeakyReluGradHelper);
+
+Status LeakyReluGradGradHelper(const Scope& scope, const Operation& op,
+                               const std::vector<Output>& grad_inputs,
+                               std::vector<Output>* grad_outputs) {
+  float alpha;
+  TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "alpha", &alpha));
+  internal::LeakyReluGrad::Attrs attrs;
+  auto dx = internal::LeakyReluGrad(scope, grad_inputs[0], op.input(1),
+                                    attrs.Alpha(alpha));
+  grad_outputs->push_back(dx);
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("LeakyReluGrad", LeakyReluGradGradHelper);
+
 Status EluGradHelper(const Scope& scope, const Operation& op,
                      const std::vector<Output>& grad_inputs,
                      std::vector<Output>* grad_outputs) {
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index aa72cf7ba2a958f54d50b59f0edaefb27edf0e86..f5a09e09dcda3e06c71d44d5fa5a1b121a9ade58 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/gradient_checker.h"
 #include "tensorflow/cc/framework/testutil.h"
 #include "tensorflow/cc/gradients/grad_testutil.h"
+#include "tensorflow/cc/ops/nn_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -160,6 +161,32 @@ TEST_F(NNGradTest, Relu6Grad) {
   RunTest(x, x_init_value, y, shape);
 }
 
+TEST_F(NNGradTest, LeakyReluGrad) {
+  TensorShape shape({5, 2});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  auto y = ops::internal::LeakyRelu(scope_, x);
+  // Avoid input values where Leaky ReLU gradient is not well defined (around
+  // zero).
+  Tensor x_init_value = test::AsTensor<float>(
+      {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
+      {5, 2});
+  RunTest(x, x_init_value, y, shape);
+}
+
+TEST_F(NNGradTest, LeakyReluGradGrad) {
+  TensorShape shape({5, 2});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+  // Avoid input values where Leaky ReLU gradient is not well defined (around
+  // zero).
+  Tensor x_init_value = test::AsTensor<float>(
+      {2.3f, 1.9f, 1.5f, 1.1f, 0.7f, 0.3f, -0.1f, -0.5f, -0.9f, -1.3f}, {5, 2});
+  Tensor features = test::AsTensor<float>(
+      {-0.9f, -0.7f, -0.5f, -0.3f, -0.1f, 0.1f, 0.3f, 0.5f, 0.7f, 0.9f},
+      {5, 2});
+  auto y = ops::internal::LeakyReluGrad(scope_, x, features);
+  RunTest(x, x_init_value, y, shape);
+}
+
 TEST_F(NNGradTest, EluGrad) {
   TensorShape shape({5, 2});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index 3d3895c8fa82c3c0e2974228e9cad767d0e00df4..52345a376cc29ee47ccb9888c9bb26292468b5a9 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -133,5 +133,6 @@ filegroup(
         "testdata/half_plus_two_pbtxt/**",
         "testdata/half_plus_two_main_op/**",
         "testdata/half_plus_two/**",
+        "testdata/half_plus_two_v2/**",
     ]),
 )
diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h
index 645a3f101d1ae7dda88ec4ca622c694dc5a7a919..6f00dc324bd7054b28de2c35023581e1666bfa01 100644
--- a/tensorflow/cc/saved_model/constants.h
+++ b/tensorflow/cc/saved_model/constants.h
@@ -33,10 +33,10 @@ constexpr char kSavedModelFilenamePb[] = "saved_model.pb";
 /// SavedModel text format proto filename.
 constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt";
 
-/// SavedModel legacy init op key.
+/// SavedModel legacy init op collection key. Used in v1 SavedModels.
 constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op";
 
-/// SavedModel main op key.
+/// SavedModel main op collection key. Used in v1 SavedModels.
 constexpr char kSavedModelMainOpKey[] = "saved_model_main_op";
 
 /// Directory in which to save the SavedModel variables.
@@ -45,6 +45,11 @@ constexpr char kSavedModelVariablesDirectory[] = "variables";
 /// SavedModel variables filename.
 constexpr char kSavedModelVariablesFilename[] = "variables";
 
+/// SavedModel SignatureDef keys for the initialization and train ops. Used in
+/// V2 SavedModels.
+constexpr char kSavedModelInitOpSignatureKey[] = "__saved_model_init_op";
+constexpr char kSavedModelTrainOpSignatureKey[] = "__saved_model_train_op";
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index c6abe2f41b9b5ec2faee6f65b429ff606f8ac08e..85d3dd01fa51b3c3ba6fcbf5faac03f1ff5630e2 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -122,34 +122,54 @@ Status RunOnce(const RunOptions& run_options,
   return run_status;
 }
 
-bool HasMainOp(const MetaGraphDef& meta_graph_def) {
+// RunInitOp will return OK if the initialization op was run successfully.
+// An empty init_op_name indicates that there are no init ops to run.
+Status RunInitOp(const RunOptions& run_options, const string& export_dir,
+                 const MetaGraphDef& meta_graph_def,
+                 const std::vector<AssetFileDef>& asset_file_defs,
+                 Session* session, const string& init_op_name) {
+  if (!init_op_name.empty()) {
+    LOG(INFO) << "Running initialization op on SavedModel bundle.";
+    std::vector<std::pair<string, Tensor>> inputs;
+    AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
+    RunMetadata run_metadata;
+    return RunOnce(run_options, inputs, {}, {init_op_name},
+                   nullptr /* outputs */, &run_metadata, session);
+  }
+  return Status::OK();
+}
+
+// A SavedModel may store the name of the initialization op to run in the
+// in the SignatureDef (v2) or a collection (v1). If an init_op collection
+// exists, then the collection must contain exactly one op.
+Status GetInitOp(const string& export_dir, const MetaGraphDef& meta_graph_def,
+                 string* init_op_name) {
+  const auto& sig_def_map = meta_graph_def.signature_def();
+  const auto& init_op_sig_it =
+      meta_graph_def.signature_def().find(kSavedModelInitOpSignatureKey);
+  if (init_op_sig_it != sig_def_map.end()) {
+    *init_op_name = init_op_sig_it->second.outputs()
+                        .find(kSavedModelInitOpSignatureKey)
+                        ->second.name();
+    return Status::OK();
+  }
+
   const auto& collection_def_map = meta_graph_def.collection_def();
+  string init_op_collection_key;
   if (collection_def_map.find(kSavedModelMainOpKey) !=
       collection_def_map.end()) {
-    return true;
+    init_op_collection_key = kSavedModelMainOpKey;
+  } else {
+    init_op_collection_key = kSavedModelLegacyInitOpKey;
   }
-  return false;
-}
 
-Status RunMainOp(const RunOptions& run_options, const string& export_dir,
-                 const MetaGraphDef& meta_graph_def,
-                 const std::vector<AssetFileDef>& asset_file_defs,
-                 Session* session, const string& main_op_key) {
-  LOG(INFO) << "Running MainOp with key " << main_op_key
-            << " on SavedModel bundle.";
-  const auto& collection_def_map = meta_graph_def.collection_def();
-  const auto main_op_it = collection_def_map.find(main_op_key);
-  if (main_op_it != collection_def_map.end()) {
-    if (main_op_it->second.node_list().value_size() != 1) {
+  const auto init_op_it = collection_def_map.find(init_op_collection_key);
+  if (init_op_it != collection_def_map.end()) {
+    if (init_op_it->second.node_list().value_size() != 1) {
       return errors::FailedPrecondition(
           strings::StrCat("Expected exactly one main op in : ", export_dir));
     }
-    std::vector<std::pair<string, Tensor>> inputs;
-    AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
-    RunMetadata run_metadata;
-    const StringPiece main_op_name = main_op_it->second.node_list().value(0);
-    return RunOnce(run_options, inputs, {}, {string(main_op_name)},
-                   nullptr /* outputs */, &run_metadata, session);
+    *init_op_name = init_op_it->second.node_list().value(0);
   }
   return Status::OK();
 }
@@ -193,6 +213,15 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir,
 
 Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def,
                         std::vector<AssetFileDef>* asset_file_defs) {
+  // With SavedModel v2, we write asset file def into metagraph instead of
+  // collection, so read from metagraph first.
+  if (meta_graph_def.asset_file_def_size() > 0) {
+    for (const auto& asset : meta_graph_def.asset_file_def()) {
+      asset_file_defs->push_back(asset);
+    }
+    return Status::OK();
+  }
+  // Fall back to read from collection to be backward compatible with v1.
   const auto& collection_def_map = meta_graph_def.collection_def();
   const auto assets_it = collection_def_map.find(kSavedModelAssetsKey);
   if (assets_it == collection_def_map.end()) {
@@ -227,15 +256,12 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
                  bundle->meta_graph_def.saver_def().restore_op_name(),
                  bundle->meta_graph_def.saver_def().filename_tensor_name(),
                  asset_file_defs, bundle->session.get()));
-  if (HasMainOp(bundle->meta_graph_def)) {
-    TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir,
-                                 bundle->meta_graph_def, asset_file_defs,
-                                 bundle->session.get(), kSavedModelMainOpKey));
-  } else {
-    TF_RETURN_IF_ERROR(RunMainOp(
-        run_options, export_dir, bundle->meta_graph_def, asset_file_defs,
-        bundle->session.get(), kSavedModelLegacyInitOpKey));
-  }
+  string init_op_name;
+  TF_RETURN_IF_ERROR(
+      GetInitOp(export_dir, bundle->meta_graph_def, &init_op_name));
+  TF_RETURN_IF_ERROR(RunInitOp(run_options, export_dir, bundle->meta_graph_def,
+                               asset_file_defs, bundle->session.get(),
+                               init_op_name));
   return Status::OK();
 }
 
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 72b8bc18710b0ee77cb01ed3ad0c2abb5183efb2..597e42bb65ab5536664089f7e65ec52d77fc8f23 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -36,6 +36,8 @@ constexpr char kTestDataMainOp[] =
     "cc/saved_model/testdata/half_plus_two_main_op/00000123";
 constexpr char kTestDataSharded[] =
     "cc/saved_model/testdata/half_plus_two/00000123";
+constexpr char kTestDataInitOpV2[] =
+    "cc/saved_model/testdata/half_plus_two_v2/00000123";
 
 class LoaderTest : public ::testing::Test {
  protected:
@@ -227,5 +229,17 @@ TEST_F(LoaderTest, MaybeSavedModelDirectory) {
   EXPECT_FALSE(MaybeSavedModelDirectory(invalid_export_dir));
 }
 
+TEST_F(LoaderTest, SavedModelInitOpV2Format) {
+  SavedModelBundle bundle;
+  SessionOptions session_options;
+  RunOptions run_options;
+
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataInitOpV2);
+  TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir,
+                              {kSavedModelTagServe}, &bundle));
+  CheckSavedModelBundle(export_dir, bundle);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/assets/foo.txt b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/assets/foo.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f9ff036688007836524129e23f5cf82edd1e8910
--- /dev/null
+++ b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/assets/foo.txt
@@ -0,0 +1 @@
+asset-file-contents
\ No newline at end of file
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/saved_model.pb b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/saved_model.pb
new file mode 100644
index 0000000000000000000000000000000000000000..a10bbf8fb6bca0fcee6414b2927d2f706de85ebc
Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/saved_model.pb differ
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..15b75d6ef6bffc336d138d923badb3928b8c4c13
Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.data-00000-of-00001 differ
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.index b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.index
new file mode 100644
index 0000000000000000000000000000000000000000..7ec9fb4fe2dd21d0a6c324aecd7658fc37cf2326
Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.index differ
diff --git a/tensorflow/compat_template_v1.__init__.py b/tensorflow/compat_template_v1.__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7df80ec01245a7fe820c79d5879458c4cd0a93cb
--- /dev/null
+++ b/tensorflow/compat_template_v1.__init__.py
@@ -0,0 +1,34 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bring in all of the public TensorFlow interface into this module."""
+
+from __future__ import absolute_import as _absolute_import
+from __future__ import division as _division
+from __future__ import print_function as _print_function
+
+import os as _os
+
+# pylint: disable=g-bad-import-order
+from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
+
+from tensorflow.python.tools import component_api_helper as _component_api_helper
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
+
+# API IMPORTS PLACEHOLDER
+
+from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+app.flags = flags  # pylint: disable=undefined-variable
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 6c29f09cde7ee17c11cb44ce48d8e9128daae4d0..16151e77737429f4fbf690fc34b12a70bacebdc4 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -93,7 +93,7 @@ cc_library(
         ":tfcompile_lib",
         "//tensorflow/compiler/tf2xla:tf2xla_proto",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index b17bc658fa06b9feb7edb292bd89ef31e6309169..ab1c1be344e2257721507543bc7647d4ff4becb2 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -164,7 +164,8 @@ string RewriteWithName(const string& name, string code,
 }
 
 // Generate methods for args (inputs).
-Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps,
+Status GenArgMethods(const tf2xla::Config& config,
+                     const xla::ProgramShapeProto& ps,
                      const CompileResult& compile_result, string* methods) {
   size_t num_args = ps.parameters_size();
   if (config.feed_size() != num_args) {
@@ -174,9 +175,10 @@ Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps,
   }
   for (int i = 0; i < num_args; ++i) {
     std::vector<std::pair<string, string>> rewrites;
-    TF_RETURN_IF_ERROR(AddRewritesForShape(i, ps.parameters(i), &rewrites));
+    TF_RETURN_IF_ERROR(
+        AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites));
     const string code = R"(
-  void set_arg{{NAME}}_data(void* data) {
+  void set_arg{{NAME}}_data(const void* data) {
     set_arg_data({{I}}, data);
   }
   {{TYPE}}* arg{{NAME}}_data() {
@@ -204,7 +206,7 @@ Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps,
 
 // Generate methods for results (outputs).
 Status GenResultMethods(const tf2xla::Config& config,
-                        const xla::ProgramShape& ps, string* methods) {
+                        const xla::ProgramShapeProto& ps, string* methods) {
   if (ps.result().element_type() != xla::TUPLE) {
     // The XlaCompiler we use to build the xla computation always generates a
     // tuple result, and we rely on this to simplify code generation.
@@ -217,8 +219,8 @@ Status GenResultMethods(const tf2xla::Config& config,
   }
   for (int i = 0; i < ps.result().tuple_shapes_size(); ++i) {
     std::vector<std::pair<string, string>> rewrites;
-    TF_RETURN_IF_ERROR(
-        AddRewritesForShape(i, ps.result().tuple_shapes(i), &rewrites));
+    TF_RETURN_IF_ERROR(AddRewritesForShape(
+        i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites));
     string code = R"(
   {{TYPE}}* result{{NAME}}_data() {
     return static_cast<{{TYPE}}*>(result_data({{I}}));
@@ -336,7 +338,7 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
       ExtractEntryParamBufferInfos(buffer_infos);
   std::vector<BufferInfo> buffer_infos_for_temps =
       ExtractTempBufferInfos(buffer_infos);
-  const xla::ProgramShape& ps = compile_result.program_shape;
+  const xla::ProgramShapeProto& ps = compile_result.program_shape;
   string methods_arg, methods_result;
   TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg));
   TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result));
@@ -548,8 +550,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
   static const char** StaticResultNames() {{RESULT_NAMES_CODE}}
 
   // Shape of the args and results.
-  static const xla::ProgramShape* StaticProgramShape() {
-    static const xla::ProgramShape* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}};
+  static const xla::ProgramShapeProto* StaticProgramShape() {
+    static const xla::ProgramShapeProto* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}};
     return kShape;
   }
 
@@ -587,7 +589,7 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       {"{{METHODS_RESULT}}\n", methods_result},
       {"{{NS_END}}\n", ns_end},
       {"{{NS_START}}\n", ns_start},
-      {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(ps)},
+      {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(xla::ProgramShape(ps))},
       {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}",
        metadata_result.program_shape_access_shim},
       {"{{RESULT_INDEX}}", absl::StrCat(result_index)},
@@ -615,11 +617,11 @@ static string CreateUniqueIdentifier(const CodegenOpts& opts,
 Status GenerateMetadata(const CodegenOpts& opts,
                         const CompileResult& compile_result,
                         MetadataResult* metadata_result) {
-  std::unique_ptr<xla::ProgramShape> program_shape;
+  std::unique_ptr<xla::ProgramShapeProto> program_shape;
 
   if (opts.gen_program_shape) {
     program_shape =
-        absl::make_unique<xla::ProgramShape>(compile_result.program_shape);
+        absl::make_unique<xla::ProgramShapeProto>(compile_result.program_shape);
 
     // The parameter names are currently meaningless, and redundant with the
     // rest of our metadata, so clear them out to avoid confusion and save
@@ -631,8 +633,8 @@ Status GenerateMetadata(const CodegenOpts& opts,
   // a shim that evaluates to nullptr, which is what we want.
 
   ProtobufToEmbed program_shape_protobuf{
-      CreateUniqueIdentifier(opts, "ProgramShape"), "xla::ProgramShape",
-      program_shape.get()};
+      CreateUniqueIdentifier(opts, "ProgramShapeProto"),
+      "xla::ProgramShapeProto", program_shape.get()};
 
   ProtobufToEmbed hlo_profile_printer_data_protobuf{
       CreateUniqueIdentifier(opts, "HloProfilePrinterData"),
diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h
index 90410c46a8e36e44454f1219ad76d0fb0937070d..9485e86b10e225a3c9c12eafd9905bdf7c15c9fa 100644
--- a/tensorflow/compiler/aot/codegen.h
+++ b/tensorflow/compiler/aot/codegen.h
@@ -57,7 +57,7 @@ struct MetadataResult {
   std::vector<string> header_variable_decls;
 
   // program_shape_access_shim is a C++ expression that constructs the
-  // xla::ProgramShape instance for the CompileResult passed to
+  // xla::ProgramShapeProto instance for the CompileResult passed to
   // GenerateMetadata.
   string program_shape_access_shim;
 
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index bb288d23000527be74f01630d20bbf82e50007ce..c1788ca32a1d099284eeb870f9513891051fd29e 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -181,13 +181,15 @@ TEST(CodegenTest, Golden) {
        BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/1),
        BufferInfo::MakeTempBuffer(3), BufferInfo::MakeTempBuffer(120)},
       5, {}));
-  compile_result.program_shape = xla::ShapeUtil::MakeProgramShape(
-      {
-          xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
-          xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
-      },
-      xla::ShapeUtil::MakeTupleShape(
-          {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}));
+  compile_result.program_shape =
+      xla::ShapeUtil::MakeProgramShape(
+          {
+              xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
+              xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
+          },
+          xla::ShapeUtil::MakeTupleShape(
+              {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}))
+          .ToProto();
   compile_result.entry_point = "entry_point";
   compile_result.pointer_size = 8;
 
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index e4d8a02877c75fa72c5747650ab9c7ac229955b3..968afad65ed6d4b5510687df484b7ce6743f6a85 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -22,7 +22,7 @@ extern "C" void entry_point(
     void* result, const xla::ExecutableRunOptions* run_options,
     const void** args, void** temps, tensorflow::int64* profile_counters);
 
-extern "C" char __tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[];
+extern "C" char __tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[];
 
 
 namespace foo {
@@ -114,7 +114,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   //   with dim indices specifying which value. No bounds checking is performed
   //   on dim indices.
 
-  void set_arg0_data(void* data) {
+  void set_arg0_data(const void* data) {
     set_arg_data(0, data);
   }
   float* arg0_data() {
@@ -132,7 +132,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
         arg_data(0)))[dim0][dim1];
   }
 
-  void set_arg_myfeed_data(void* data) {
+  void set_arg_myfeed_data(const void* data) {
     set_arg_data(0, data);
   }
   float* arg_myfeed_data() {
@@ -150,7 +150,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
         arg_data(0)))[dim0][dim1];
   }
 
-  void set_arg1_data(void* data) {
+  void set_arg1_data(const void* data) {
     set_arg_data(1, data);
   }
   tensorflow::int64* arg1_data() {
@@ -253,10 +253,10 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   }
 
   // Shape of the args and results.
-  static const xla::ProgramShape* StaticProgramShape() {
-    static const xla::ProgramShape* kShape = []() {
-    xla::ProgramShape* proto = new xla::ProgramShape;
-    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[0], 52);
+  static const xla::ProgramShapeProto* StaticProgramShape() {
+    static const xla::ProgramShapeProto* kShape = []() {
+    xla::ProgramShapeProto* proto = new xla::ProgramShapeProto;
+    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 52);
     return proto;
   }();
     return kShape;
diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden
index eb001c5d45bdfefc76629d7303d89f5480432235..ce8e5ec8c96a2c3696f14b8eea206d648182ecb5 100644
Binary files a/tensorflow/compiler/aot/codegen_test_o.golden and b/tensorflow/compiler/aot/codegen_test_o.golden differ
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 2b5f97b34cd928d32eb220536342c715d91d45bb..9fc223bdc7c0e207ce2005cb86250aa77e709df8 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -56,17 +56,23 @@ Status CompileXla(xla::CompileOnlyClient* client,
     return errors::Unknown("Couldn't get XLA program shape: ",
                            pshape_or.status().error_message());
   }
-  compile_result->program_shape = *pshape_or.ValueOrDie();
-  xla::ProgramShape* pshape = &compile_result->program_shape;
-  std::vector<const xla::Shape*> arg_layouts;
-  arg_layouts.reserve(pshape->parameters_size());
+  compile_result->program_shape = pshape_or.ValueOrDie()->ToProto();
+  xla::ProgramShapeProto* pshape = &compile_result->program_shape;
+
+  // AotXlaComputationInstance::argument_layouts is a vector of Shape
+  // pointers. Accumulate the Shape objects themselves in a separate vector
+  // while building the vector of pointers.
+  std::vector<const xla::Shape*> arg_layout_ptrs(pshape->parameters_size());
+  std::vector<xla::Shape> arg_layouts(pshape->parameters_size());
   for (int i = 0; i < pshape->parameters_size(); ++i) {
-    arg_layouts.push_back(pshape->mutable_parameters(i));
+    arg_layouts[i] = xla::Shape(*pshape->mutable_parameters(i));
+    arg_layout_ptrs[i] = &arg_layouts[i];
   }
   xla::CompileOnlyClient::AotXlaComputationInstance instance;
   instance.computation = &computation;
-  instance.argument_layouts = std::move(arg_layouts);
-  instance.result_layout = &pshape->result();
+  instance.argument_layouts = std::move(arg_layout_ptrs);
+  xla::Shape result_shape(pshape->result());
+  instance.result_layout = &result_shape;
   xla::StatusOr<std::vector<std::unique_ptr<xla::AotCompilationResult>>>
       aot_or = client->CompileAheadOfTime({instance}, aot_opts);
   if (!aot_or.ok()) {
diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h
index e03c5b1aa77c1262ed903aae3072ef65f34d80a2..ee7bb26fabd2d897b85b62f38778ecbfe2238eb6 100644
--- a/tensorflow/compiler/aot/compile.h
+++ b/tensorflow/compiler/aot/compile.h
@@ -33,9 +33,9 @@ namespace tfcompile {
 struct CompileResult {
   // Contains object file and meta-info.
   std::unique_ptr<xla::cpu::CpuAotCompilationResult> aot;
-  xla::ProgramShape program_shape;  // Static shape of args and results.
-  string entry_point;               // Name of generated function.
-  int pointer_size = 0;             // Size of a pointer in bytes.
+  xla::ProgramShapeProto program_shape;  // Static shape of args and results.
+  string entry_point;                    // Name of generated function.
+  int pointer_size = 0;                  // Size of a pointer in bytes.
 };
 
 // CompileGraph compiles the graph_def into an object file containing a function
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index f10852c7850f61bfd8b99fa9f1648202d182085e..4dd79e5882d7da61be029735ef2b165908c599f9 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -526,13 +526,15 @@ TEST(TFCompileTest, ProgramShape) {
 
   // muladd has the program shape defined.
   MatMulAndAddComp muladd;
-  const xla::ProgramShape* muladd_shape = muladd.ProgramShape();
+  const xla::ProgramShapeProto* muladd_shape = muladd.ProgramShape();
   ASSERT_TRUE(muladd_shape != nullptr);
   ASSERT_EQ(muladd_shape->parameters_size(), 2);
-  EXPECT_TRUE(ShapeUtil::Compatible(muladd_shape->parameters(0), f32_2x2));
-  EXPECT_TRUE(ShapeUtil::Compatible(muladd_shape->parameters(1), f32_2x2));
+  EXPECT_TRUE(
+      ShapeUtil::Compatible(xla::Shape(muladd_shape->parameters(0)), f32_2x2));
+  EXPECT_TRUE(
+      ShapeUtil::Compatible(xla::Shape(muladd_shape->parameters(1)), f32_2x2));
 
-  const xla::Shape& muladd_result = muladd_shape->result();
+  const xla::Shape muladd_result(muladd_shape->result());
   ASSERT_EQ(muladd_result.element_type(), xla::TUPLE);
   ASSERT_EQ(ShapeUtil::TupleElementCount(muladd_result), 2);
   const xla::Shape& muladd_result0 =
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 859c84bb91657422b830255b0217f8946d351458..2dc3e8c9113b37bf9d575ad66783f4ab49478af4 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -390,6 +390,7 @@ def target_llvm_triple():
         "//tensorflow:android_arm": "armv7-none-android",
         "//tensorflow:android_arm64": "aarch64-none-android",
         "//tensorflow:android_x86": "i686-none-android",
+        "//tensorflow:ios": "arm64-none-ios",
         "//tensorflow:linux_ppc64le": "ppc64le-ibm-linux-gnu",
         "//tensorflow:darwin": "x86_64-none-darwin",
         "//conditions:default": "x86_64-pc-linux",
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index b95b063348c5cdfdcaed635ba527e9f0bfd6092d..d548de8c44285f6d21dd778db464a31e1b19645b 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -26,7 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/aot/flags.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -103,7 +103,7 @@ Status Main(const MainFlags& flags) {
     return errors::InvalidArgument("Must specify --cpp_class");
   }
   codegen_opts.gen_hlo_profile_printer_data =
-      xla::legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile();
+      xla::GetDebugOptionsFromFlags().xla_hlo_profile();
   TF_RETURN_IF_ERROR(ParseCppClass(flags.cpp_class, &codegen_opts.class_name,
                                    &codegen_opts.namespaces));
 
@@ -132,7 +132,7 @@ int main(int argc, char** argv) {
 
   std::vector<tensorflow::Flag> flag_list;
   AppendMainFlags(&flag_list, &flags);
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
 
   tensorflow::string usage = tensorflow::tfcompile::kUsageHeader;
   usage += tensorflow::Flags::Usage(argv[0], flag_list);
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 661b444a42eefadf52739d84483e8e26c07fadf5..15dcbb2641eca031e82db9aa58dee6a14ab0a2cc 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -21,10 +21,8 @@ package(
 )
 
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
@@ -39,7 +37,7 @@ cc_library(
         ":xla_cpu_device",
         ":xla_cpu_jit",
         "//tensorflow/compiler/plugin",
-    ] + if_cuda_is_configured([
+    ] + if_cuda([
         ":xla_gpu_device",
         ":xla_gpu_jit",
     ]),
@@ -52,6 +50,8 @@ cc_library(
     deps = [
         ":jit_compilation_passes",
         "//tensorflow/compiler/jit/kernels:xla_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:cpu_plugin",
     ],
@@ -65,6 +65,7 @@ cc_library(
         ":jit_compilation_passes",
         "//tensorflow/compiler/jit/kernels:xla_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/xla/service:gpu_plugin",
     ]),
     alwayslink = 1,
@@ -75,15 +76,17 @@ cc_library(
     srcs = ["xla_cpu_device.cc"],
     visibility = [":friends"],
     deps = [
+        ":create_xla_launch_op",  # buildcleaner: keep
+        ":flags",
         ":jit_compilation_passes",
         ":xla_device",
         "//tensorflow/compiler/jit/kernels:xla_ops",
-        "//tensorflow/compiler/jit/legacy_flags:xla_device_flags",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:cpu_plugin",  # buildcleaner: keep
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
     alwayslink = 1,
 )
@@ -93,6 +96,7 @@ cc_library(
     srcs = ["xla_gpu_device.cc"],
     visibility = [":friends"],
     deps = [
+        ":create_xla_launch_op",  # buildcleaner: keep
         ":jit_compilation_passes",
         ":xla_device",
         "//tensorflow/compiler/jit/kernels:xla_ops",
@@ -101,6 +105,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:gpu_plugin",  # buildcleaner: keep
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
 )
@@ -116,7 +122,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:interpreter_plugin",  # buildcleaner: keep
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/memory",
     ],
     alwayslink = 1,
 )
@@ -188,11 +194,13 @@ cc_library(
         "//tensorflow/core/kernels:resource_variable_ops",
         "//tensorflow/core/kernels:sendrecv_ops",
         "//tensorflow/core/kernels:shape_ops",
+        "//tensorflow/core/kernels:stack",
         "//tensorflow/core/kernels:variable_ops",
         "//tensorflow/core/kernels/data:generator_dataset_op",
         "//tensorflow/core/kernels/data:iterator_ops",
         "//tensorflow/core/kernels/data:prefetch_dataset_op",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -205,6 +213,18 @@ cc_library(
 
 # Internal targets below this point.
 
+cc_library(
+    name = "flags",
+    srcs = ["flags.cc"],
+    hdrs = ["flags.h"],
+    visibility = [":friends"],
+    deps = [
+        "//tensorflow/compiler/xla:parse_flags_from_env",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "common",
     srcs = [
@@ -237,6 +257,8 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:variable_ops",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -249,6 +271,8 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/core:core_cpu",
@@ -259,6 +283,22 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:variable_ops",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "xla_compilation_cache_test",
+    srcs = [
+        "xla_compilation_cache_test.cc",
+    ],
+    deps = [
+        ":xla_compilation_cache",
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
     ],
 )
 
@@ -324,7 +364,6 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -360,6 +399,83 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "shape_inference",
+    srcs = ["shape_inference.cc"],
+    hdrs = ["shape_inference.h"],
+    deps = [
+        ":shape_inference_helpers",
+        "//tensorflow/compiler/tf2xla:dump_graph",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "test_util",
+    testonly = 1,
+    srcs = ["test_util.cc"],
+    hdrs = ["test_util.h"],
+    deps = [
+        ":shape_inference",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "shape_inference_test",
+    srcs = ["shape_inference_test.cc"],
+    deps = [
+        ":shape_inference",
+        ":test_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/kernels:constant_op",
+    ],
+)
+
+cc_library(
+    name = "encapsulate_util",
+    srcs = ["encapsulate_util.cc"],
+    hdrs = ["encapsulate_util.h"],
+    deps = [
+        ":shape_inference",
+        "//tensorflow/compiler/tf2xla:tf2xla_util",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+tf_cc_test(
+    name = "encapsulate_util_test",
+    srcs = ["encapsulate_util_test.cc"],
+    deps = [
+        ":encapsulate_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "compilation_passes",
     srcs = [
@@ -368,6 +484,8 @@ cc_library(
         "deadness_analysis_internal.h",
         "encapsulate_subgraphs_pass.cc",
         "encapsulate_xla_computations_pass.cc",
+        "extract_outside_compilation_pass.cc",
+        "increase_dynamism_for_auto_jit_pass.cc",
         "mark_for_compilation_pass.cc",
         "mark_for_compilation_pass_test_helper.cc",
         "partially_decluster_pass.cc",
@@ -377,12 +495,16 @@ cc_library(
         "deadness_analysis.h",
         "encapsulate_subgraphs_pass.h",
         "encapsulate_xla_computations_pass.h",
+        "extract_outside_compilation_pass.h",
+        "increase_dynamism_for_auto_jit_pass.h",
         "mark_for_compilation_pass.h",
         "mark_for_compilation_pass_test_helper.h",
         "partially_decluster_pass.h",
     ],
     deps = [
         ":common",
+        ":encapsulate_util",
+        ":flags",
         ":shape_inference_helpers",
         ":union_find",
         ":xla_cluster_util",
@@ -390,12 +512,13 @@ cc_library(
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope_internal",
         "//tensorflow/compiler/jit/graphcycles",
-        "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:resource_operation_table",
+        "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/cc:xla_jit_ops",
+        "//tensorflow/compiler/tf2xla/cc:xla_ops",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:core_cpu",
@@ -409,8 +532,10 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -435,25 +560,6 @@ cc_library(
     hdrs = ["union_find.h"],
 )
 
-cc_library(
-    name = "producer_consumer_queue",
-    hdrs = ["producer_consumer_queue.h"],
-    deps = ["//tensorflow/core:lib"],
-)
-
-tf_cc_test(
-    name = "producer_consumer_queue_test",
-    size = "small",
-    srcs = ["producer_consumer_queue_test.cc"],
-    deps = [
-        ":producer_consumer_queue",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
 tf_cc_test(
     name = "deadness_analysis_test",
     size = "small",
@@ -491,30 +597,39 @@ tf_cc_test(
         "build_xla_ops_pass_test.cc",
         "encapsulate_subgraphs_pass_test.cc",
         "encapsulate_xla_computations_pass_test.cc",
+        "extract_outside_compilation_pass_test.cc",
+        "increase_dynamism_for_auto_jit_pass_test.cc",
         "mark_for_compilation_pass_test.cc",
         "partially_decluster_pass_test.cc",
     ],
     deps = [
         ":common",
         ":compilation_passes",
+        ":encapsulate_util",
         ":node_matchers",
         ":xla_cluster_util",
+        ":xla_cpu_device",
         ":xla_gpu_device",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:resource_variable_ops",
+        "//tensorflow/cc:scope",
         "//tensorflow/cc:sendrecv_ops",
         "//tensorflow/compiler/jit/kernels:xla_ops",
+        "//tensorflow/compiler/tf2xla:side_effect_util",
         "//tensorflow/compiler/tf2xla:test_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/cc:xla_jit_ops",
+        "//tensorflow/compiler/tf2xla/cc:xla_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -551,31 +666,6 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "xla_launch_util_test",
-    size = "small",
-    srcs = ["xla_launch_util_test.cc"],
-    deps = [
-        ":common",
-        ":xla_compilation_cache",
-        ":xla_launch_util",
-        ":xla_tensor",
-        "//tensorflow/compiler/tf2xla:common",
-        "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_runtime",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core/kernels:variable_ops",
-    ],
-)
-
 cc_library(
     name = "xla_fusion_optimizer",
     srcs = ["xla_fusion_optimizer.cc"],
@@ -621,6 +711,7 @@ cc_library(
     deps = [
         "//tensorflow/cc:ops",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/algorithm:container",
@@ -636,6 +727,7 @@ tf_cc_test(
     deps = [
         ":node_matchers",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:ops",
         "//tensorflow/core:ops",
         "//tensorflow/core:test_main",
@@ -648,7 +740,10 @@ tf_custom_op_py_library(
     visibility = [
         ":friends",
     ],
-    deps = ["//tensorflow/compiler/jit/ops:xla_ops_wrapper_py"],
+    deps = [
+        "//tensorflow/compiler/jit/ops:xla_ops_grad",
+        "//tensorflow/compiler/jit/ops:xla_ops_wrapper_py",
+    ],
 )
 
 # This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library.
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc
index 5974696b7751d69eb27141173fdab14313925ee9..9f4042630edaec1b9519b6434d859a48372e8b15 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc
@@ -15,10 +15,16 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/build_xla_ops_pass.h"
 #include "absl/algorithm/container.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope_internal.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/control_flow_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -48,6 +54,88 @@ void MoveOutgoingEdges(Graph* g, Node* old_node, Node* new_node) {
   }
 }
 
+// Returns a data value that is dead iff `control` is dead.
+Output ControlToData(const Scope& scope, Node* control) {
+  Output data = ops::Const(scope.WithOpName("ctrl_as_data"),
+                           Tensor(DT_BOOL, TensorShape({0})));
+  scope.graph()->AddControlEdge(control, data.node());
+  return Output(data.node());
+}
+
+// Returns an operation that can be control-depended on that is dead iff `data`
+// is dead.
+Operation DataToControl(const Scope& scope, Output data) {
+  return Operation(
+      ops::Identity(scope.WithOpName("data_as_ctrl"), data).node());
+}
+
+// Replaces each outgoing edge from `old_node` with a merge node that merges in
+// the corresponding output from `new_node`.
+void MergeOutgoingDataEdges(const Scope& s, Node* old_node, Node* new_node) {
+  if (!s.status().ok()) {
+    return;
+  }
+
+  std::vector<Output> merged_outputs(old_node->num_outputs(), Output(nullptr));
+
+  std::vector<const Edge*> data_edges;
+  absl::c_copy_if(old_node->out_edges(), std::back_inserter(data_edges),
+                  [](const Edge* e) { return !e->IsControlEdge(); });
+
+  for (const Edge* e : data_edges) {
+    int oidx = e->src_output();
+    Output merged_output = merged_outputs[oidx];
+    if (merged_output.node() == nullptr) {
+      ops::Merge merge_op(s.WithOpName(absl::StrCat("merge_oidx_", oidx)),
+                          {Output(old_node, oidx), Output(new_node, oidx)});
+      merged_output = merged_outputs[oidx] = merge_op.output;
+    }
+
+    Node* dst = e->dst();
+    int dst_idx = e->dst_input();
+
+    s.graph()->RemoveEdge(e);
+    s.graph()->AddEdge(merged_output.node(), merged_output.index(), dst,
+                       dst_idx);
+  }
+}
+
+// Replaces each control successor of `old_node` to execute whenever either
+// `old_node` or `new_node` is executed.
+void MergeOutgoingControlEdges(const Scope& s, Node* old_node, Node* new_node) {
+  if (!s.status().ok()) {
+    return;
+  }
+
+  std::vector<const Edge*> ctrl_edges;
+  absl::c_copy_if(old_node->out_edges(), std::back_inserter(ctrl_edges),
+                  [](const Edge* e) { return e->IsControlEdge(); });
+
+  if (ctrl_edges.empty()) {
+    return;
+  }
+
+  // We can't merge control edges directly so we instead first "convert" them to
+  // normal values that can be merged, merge the values and then "convert" the
+  // merged value back into control.
+  //
+  // NB! We need to copy out the outgoing control edges before constructing
+  // old_ctrl_as_data otherwise the control edge from old_node to the constant
+  // in ControlToData will be present in ctrl_edges.
+
+  Output old_ctrl_as_data = ControlToData(s, old_node);
+  Output new_ctrl_as_data = ControlToData(s, new_node);
+
+  ops::Merge ctrl_merge_as_data(s.WithOpName("ctrl_merge"),
+                                {old_ctrl_as_data, new_ctrl_as_data});
+  Operation ctrl_merge = DataToControl(s, ctrl_merge_as_data.output);
+
+  for (const Edge* e : ctrl_edges) {
+    s.graph()->AddControlEdge(ctrl_merge.node(), e->dst());
+    s.graph()->RemoveControlEdge(e);
+  }
+}
+
 struct XlaClusterInfo {
   std::vector<Output> constant_inputs;
   std::vector<Output> non_constant_inputs;
@@ -107,7 +195,39 @@ Status CopyIncomingControlEdges(Graph* g, Node* from, Node* to) {
   return Status::OK();
 }
 
-Status ReplaceNodeWithXlaCompileAndXlaRun(Graph* g, Node* n) {
+void RemoveAllIncomingControlEdges(Graph* g, Node* n) {
+  std::vector<const Edge*> incoming_ctrl_edges;
+  absl::c_copy_if(n->in_edges(), std::back_inserter(incoming_ctrl_edges),
+                  [](const Edge* e) { return e->IsControlEdge(); });
+  for (const Edge* e : incoming_ctrl_edges) {
+    g->RemoveControlEdge(e);
+  }
+}
+
+// Returns true (into `result`) if `node` must be compiled.
+Status NodeRequiresCompilation(Node* n, bool* result) {
+  DeviceType device_type("");
+  TF_RETURN_IF_ERROR(
+      DeviceToDeviceType(n->assigned_device_name(), &device_type));
+  const XlaOpRegistry::DeviceRegistration* registration = nullptr;
+  if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
+    return errors::Internal("Could not find compilation device ",
+                            device_type.type());
+  }
+  *result = registration->autoclustering_policy ==
+            XlaOpRegistry::AutoclusteringPolicy::kAlways;
+  return Status::OK();
+}
+
+Status ReplaceNodeWithXlaCompileAndXlaRun(
+    const FunctionLibraryDefinition& flib_def, bool lazy_compilation_enabled,
+    Graph* g, Node* n) {
+  bool requires_compilation;
+  TF_RETURN_IF_ERROR(NodeRequiresCompilation(n, &requires_compilation));
+  if (!lazy_compilation_enabled) {
+    requires_compilation = true;
+  }
+
   Status status;
   Scope root = NewInternalScope(g, &status, /*refiner=*/nullptr)
                    .NewSubScope(n->name())
@@ -121,18 +241,63 @@ Status ReplaceNodeWithXlaCompileAndXlaRun(Graph* g, Node* n) {
                                /*constants=*/cluster_info.constant_inputs,
                                /*args=*/cluster_info.non_constant_inputs,
                                /*resources=*/cluster_info.resource_inputs,
+                               /*must_compile=*/requires_compilation,
                                cluster_info.function);
   TF_RETURN_IF_ERROR(
       CopyIncomingControlEdges(g, /*from=*/n, /*to=*/xla_compile.key.node()));
 
-  std::vector<Output> xla_run_args = cluster_info.non_constant_inputs;
-  absl::c_copy(cluster_info.resource_inputs, std::back_inserter(xla_run_args));
-  ops::_XlaRun xla_run(root.WithOpName("xla_run"), xla_run_args,
-                       xla_compile.key, n->output_types());
+  if (requires_compilation) {
+    // "Strict" compilation:  every _XlaCompile invocation must compile the
+    // cluster.
+    std::vector<Output> xla_run_args = cluster_info.non_constant_inputs;
+    absl::c_copy(cluster_info.resource_inputs,
+                 std::back_inserter(xla_run_args));
+    ops::_XlaRun xla_run(root.WithOpName("xla_run"), xla_run_args,
+                         xla_compile.key, n->output_types());
+
+    MoveOutgoingEdges(g, /*old_node=*/n,
+                      /*new_node=*/xla_run.operation.node());
+    g->RemoveNode(n);
+  } else {
+    // "Lazy" compilation: an _XlaCompile invocation may decide not to compile
+    // the cluster based on profitability heuristics.
 
-  MoveOutgoingEdges(g, /*old_node=*/n,
-                    /*new_node=*/xla_run.operation.node());
-  g->RemoveNode(n);
+    // We generate the following graph:
+    //
+    //   (use_tf_call, use_xla_run) =
+    //       Switch(pred=xla_compile.compilation_successful,
+    //              value=xla_compile.key)
+    //
+    //   tf_call_outputs = cluster_N(..., ^use_tf_call)
+    //   xla_run_outputs = _XlaRun(..., key=use_xla_run)
+    //   outputs = Merge(tf_call_outputs, xla_run_outputs).
+    ops::Switch s(root.WithOpName("predicated_compilation_key"),
+                  xla_compile.key, xla_compile.compilation_successful);
+    Output predicated_compilation_key = s.output_true;
+    Output inverse_predicated_compilation_key = s.output_false;
+
+    std::vector<Output> xla_run_args = cluster_info.non_constant_inputs;
+    absl::c_copy(cluster_info.resource_inputs,
+                 std::back_inserter(xla_run_args));
+    ops::_XlaRun xla_run(root.WithOpName("xla_run"), xla_run_args,
+                         predicated_compilation_key, n->output_types());
+
+    MergeOutgoingControlEdges(root, /*old_node=*/n,
+                              /*new_node=*/xla_run.operation.node());
+
+    MergeOutgoingDataEdges(root, /*old_node=*/n,
+                           /*new_node=*/xla_run.operation.node());
+
+    TF_RETURN_IF_ERROR(root.status());
+
+    // We already have a TensorFlow function call into the cluster -- the
+    // original node we set out to rewrite.  We just wire in the correct control
+    // deps and we're done.
+    RemoveAllIncomingControlEdges(g, n);
+    g->AddControlEdge(
+        DataToControl(root, inverse_predicated_compilation_key).node(), n);
+    n->ClearAttr(kXlaCompiledKernelAttr);
+  }
 
   return Status::OK();
 }
@@ -141,22 +306,34 @@ Status ReplaceNodeWithXlaCompileAndXlaRun(Graph* g, Node* n) {
 Status BuildXlaOpsPass::Run(const GraphOptimizationPassOptions& options) {
   Graph* graph = options.graph->get();
 
-  for (Node* n : graph->op_nodes()) {
-    // In all cases, only try to compile computational nodes.
-    if (n->IsSend() || n->IsRecv() || n->IsControlFlow()) {
-      continue;
-    }
+  // Copy out the nodes we want to rewrite to avoid modifying the graph while we
+  // iterate on graph->op_nodes().
+  std::vector<Node*> xla_compiled_kernels;
+  absl::c_copy_if(graph->op_nodes(), std::back_inserter(xla_compiled_kernels),
+                  [](const Node* n) {
+                    if (n->IsSend() || n->IsRecv() || n->IsControlFlow()) {
+                      return false;
+                    }
 
-    // Only compile nodes that are marked for compilation by the
-    // compilation-marking pass (via 'attr_name').
-    if (IsXlaCompiledKernel(*n)) {
-      TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun(graph, n));
-    }
+                    // Only compile nodes that are marked for compilation by the
+                    // compilation-marking pass (via 'attr_name').
+                    return IsXlaCompiledKernel(*n);
+                  });
+
+  bool lazy_compilation_enabled =
+      enable_lazy_compilation_
+          ? *enable_lazy_compilation_
+          : GetBuildXlaOpsPassFlags().tf_xla_enable_lazy_compilation;
+
+  for (Node* n : xla_compiled_kernels) {
+    TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun(
+        *options.flib_def, lazy_compilation_enabled, graph, n));
   }
 
   if (VLOG_IS_ON(1)) {
     dump_graph::DumpGraphToFile("build_xla_ops", *graph, options.flib_def);
   }
+
   return Status::OK();
 }
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.h b/tensorflow/compiler/jit/build_xla_ops_pass.h
index 1dd38fa95186dfbe458166caa23a131fbe3c9510..58f7c4b3a0d1472f602e8234f9f08c23dfe78a34 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.h
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_BUILD_XLA_OPS_PASS_H_
 #define TENSORFLOW_COMPILER_JIT_BUILD_XLA_OPS_PASS_H_
 
+#include "absl/types/optional.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -25,7 +26,17 @@ namespace tensorflow {
 // executes (using XLA) TF function calls marked with "_XlaCompiledKernel".
 class BuildXlaOpsPass : public GraphOptimizationPass {
  public:
+  // If enable_lazy_compilation is not nullopt then *enable_lazy_compilation
+  // overrides --tf_xla_enable_lazy_compilation flag in deciding whether lazy
+  // compilation is enabled.
+  explicit BuildXlaOpsPass(
+      absl::optional<bool> enable_lazy_compilation = absl::nullopt)
+      : enable_lazy_compilation_(enable_lazy_compilation) {}
+
   Status Run(const GraphOptimizationPassOptions& options) override;
+
+ private:
+  absl::optional<bool> enable_lazy_compilation_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
index 9d56db7b6bc12938b2de9df02b97ff0ca6a42e54..48a23a4c1711ac88a329723c46559112d5a39dbd 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
@@ -22,18 +22,38 @@ limitations under the License.
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/node_matchers.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace {
 
+class BuildXlaOpsTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // This is needed to register the XLA_* devices.
+    CHECK(DeviceFactory::AddDevices(
+              SessionOptions(), "/job:localhost/replica:0/task:0", &devices_)
+              .ok());
+  }
+
+ private:
+  std::vector<std::unique_ptr<Device>> devices_;
+};
+
 using ::tensorflow::testing::FindNodeByName;
+using ::tensorflow::testing::matchers::Attr;
 using ::tensorflow::testing::matchers::CtrlDeps;
+using ::tensorflow::testing::matchers::Inputs;
 using ::tensorflow::testing::matchers::NodeWith;
 using ::tensorflow::testing::matchers::Op;
+using ::tensorflow::testing::matchers::Out;
+using ::testing::_;
 
 Status BuildXlaOps(const Scope& s, std::unique_ptr<Graph>* result) {
   auto graph = absl::make_unique<Graph>(OpRegistry::Global());
@@ -42,15 +62,18 @@ Status BuildXlaOps(const Scope& s, std::unique_ptr<Graph>* result) {
   // Assign all nodes to the CPU device.
   static const char* kCpuDevice = "/job:localhost/replica:0/task:0/cpu:0";
   for (Node* n : graph->nodes()) {
-    if (n->assigned_device_name().empty()) {
+    if (n->requested_device().empty()) {
       n->set_assigned_device_name(kCpuDevice);
+    } else {
+      n->set_assigned_device_name(n->requested_device());
     }
   }
 
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = &graph;
-  BuildXlaOpsPass pass;
+  BuildXlaOpsPass pass(/*enable_lazy_compilation=*/true);
   TF_RETURN_IF_ERROR(pass.Run(opt_options));
+  VLOG(3) << graph->ToGraphDefDebug().DebugString();
   *result = std::move(graph);
   return Status::OK();
 }
@@ -76,16 +99,19 @@ Status MakeXlaCompiledKernel(Graph* graph, const string& callee_name,
                                result);
 }
 
-Node* MakeWrite(const Scope& scope, const string& id) {
-  Output var_handle =
-      ops::VarHandleOp(scope.WithOpName("Var" + id), DT_FLOAT, TensorShape({}));
-  Output value_to_write =
-      ops::Const(scope.WithOpName("ValueToAssign" + id), 1.0f);
-  ops::AssignVariableOp assign_op(scope.WithOpName("Assignee" + id), var_handle,
-                                  value_to_write);
+Node* MakeWrite(const Scope& scope, Output value_to_write, const string& id) {
+  Output var_handle = ops::VarHandleOp(scope.WithOpName("Var_" + id), DT_FLOAT,
+                                       TensorShape({}));
+  ops::AssignVariableOp assign_op(scope.WithOpName("Assignee_" + id),
+                                  var_handle, value_to_write);
   return assign_op.operation.node();
 }
 
+Node* MakeWrite(const Scope& scope, const string& id) {
+  return MakeWrite(
+      scope, ops::Const(scope.WithOpName("ValueToAssign" + id), 1.0f), id);
+}
+
 FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) {
   FunctionDefLibrary flib_def;
   FunctionDef func = FunctionDefHelper::Create(
@@ -97,14 +123,16 @@ FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) {
   return flib_def;
 }
 
-TEST(BuildXlaOps, ControlDepsPreserved) {
-  Scope root = Scope::NewRootScope().ExitOnError();
+TEST_F(BuildXlaOpsTest, ControlDepsPreserved) {
+  const char* kXlaDeviceName = "/job:worker/replica:0/task:0/device:XLA_CPU:0";
+  Scope root = Scope::NewRootScope().WithDevice(kXlaDeviceName).ExitOnError();
 
   FunctionDefLibrary flib_def =
       CreateFunctionDefLibWithConstFunction("cluster_0");
   TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
   Node* call;
   TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
+  call->set_requested_device(kXlaDeviceName);
   Node* write_op = MakeWrite(root, "write");
   root.graph()->AddControlEdge(call, write_op);
 
@@ -116,15 +144,17 @@ TEST(BuildXlaOps, ControlDepsPreserved) {
   EXPECT_THAT(write_op_new, NodeWith(CtrlDeps(NodeWith(Op("_XlaRun")))));
 }
 
-TEST(BuildXlaOps, CleanFailureOnBogusAttr) {
+TEST_F(BuildXlaOpsTest, CleanFailureOnBogusAttr) {
   Scope root = Scope::NewRootScope().ExitOnError();
 
   FunctionDefLibrary flib_def =
       CreateFunctionDefLibWithConstFunction("cluster_0");
   TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+
   Node* call;
   TF_ASSERT_OK(
       MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", 100, 100, &call));
+
   Node* write_op = MakeWrite(root, "write");
   root.graph()->AddControlEdge(call, write_op);
 
@@ -134,5 +164,65 @@ TEST(BuildXlaOps, CleanFailureOnBogusAttr) {
   EXPECT_EQ(failure_status.code(), error::INVALID_ARGUMENT);
 }
 
+TEST_F(BuildXlaOpsTest, OnNonXlaDevice) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("cluster_0");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+
+  Node* call;
+  TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
+  TF_ASSERT_OK(root.DoShapeInference(call));
+
+  Node* write_op = MakeWrite(root, Output(call), "write_result");
+
+  auto xla_compile = NodeWith(Op("_XlaCompile"), Attr("must_compile", false));
+  auto predicated_compilation_key =
+      NodeWith(Op("Switch"), Inputs(Out(0, xla_compile), Out(1, xla_compile)));
+  auto xla_run =
+      NodeWith(Op("_XlaRun"), Inputs(Out(1, predicated_compilation_key)));
+  auto tf_call =
+      NodeWith(Op("cluster_0"),
+               CtrlDeps(NodeWith(Op("Identity"),
+                                 Inputs(Out(0, predicated_compilation_key)))));
+  auto merge = NodeWith(Op("Merge"), Inputs(Out(tf_call), Out(xla_run)));
+  auto assign_var = NodeWith(Op("AssignVariableOp"), Inputs(_, Out(merge)));
+
+  std::unique_ptr<Graph> graph;
+  TF_ASSERT_OK(BuildXlaOps(root, &graph));
+
+  Node* write_op_new = FindNodeByName(graph.get(), write_op->name());
+  ASSERT_NE(write_op_new, nullptr);
+  EXPECT_THAT(write_op_new, assign_var);
+}
+
+TEST_F(BuildXlaOpsTest, OnXlaDevice) {
+  const char* kXlaDeviceName = "/job:worker/replica:0/task:0/device:XLA_CPU:0";
+  Scope root = Scope::NewRootScope().WithDevice(kXlaDeviceName).ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("cluster_0");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+
+  Node* call;
+  TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
+  call->set_requested_device(kXlaDeviceName);
+  TF_ASSERT_OK(root.DoShapeInference(call));
+
+  Node* write_op = MakeWrite(root, Output(call), "write_result");
+
+  std::unique_ptr<Graph> graph;
+  TF_ASSERT_OK(BuildXlaOps(root, &graph));
+
+  auto xla_op =
+      NodeWith(Op("_XlaRun"), Inputs(Out(NodeWith(Op("_XlaCompile")))));
+  auto assign_var =
+      NodeWith(Op("AssignVariableOp"), Inputs(Out(NodeWith()), Out(xla_op)));
+
+  Node* write_op_new = FindNodeByName(graph.get(), write_op->name());
+  ASSERT_NE(write_op_new, nullptr);
+  EXPECT_THAT(write_op_new, assign_var);
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/create_xla_launch_op_test.cc b/tensorflow/compiler/jit/create_xla_launch_op_test.cc
index 73866607621cd745f6e640a14405daebf0dd9985..0f872a480f4d4843217f1df3452c4dc62531264e 100644
--- a/tensorflow/compiler/jit/create_xla_launch_op_test.cc
+++ b/tensorflow/compiler/jit/create_xla_launch_op_test.cc
@@ -59,8 +59,9 @@ class CreateXlaLaunchOpTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 1});
+    std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(
-        options, "/job:localhost/replica:0/task:0", &devices_));
+        options, "/job:localhost/replica:0/task:0", &devices));
 
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) {
@@ -69,7 +70,7 @@ class CreateXlaLaunchOpTest : public ::testing::Test {
     lib_def_ = absl::make_unique<FunctionLibraryDefinition>(
         OpRegistry::Global(), proto);
     OptimizerOptions opts;
-    device_mgr_ = absl::make_unique<DeviceMgr>(devices_);
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
     pflr_ = absl::make_unique<ProcessFunctionLibraryRuntime>(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
         opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
@@ -77,7 +78,6 @@ class CreateXlaLaunchOpTest : public ::testing::Test {
   }
 
   FunctionLibraryRuntime* flr_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index b7ae7fbeb3912882368dc828e8d6fcd50735b04e..0562838f628c66b1eb03af9d2a5139c01dca31c5 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -525,7 +525,6 @@ Predicate* PredicateFactory::MakeAndOrImpl(
                                    op->GetOperands().begin(),
                                    op->GetOperands().end());
     } else {
-      std::vector<Predicate*> sub_ops_intersection;
       common_inner_operands.clear();
       absl::c_copy_if(op->GetOperands(),
                       std::back_inserter(common_inner_operands),
@@ -696,8 +695,8 @@ Status CreateMultipleNextIterationInputsError(Node* merge) {
     }
   }
   return errors::InvalidArgument(
-      "Multiple NextIteration inputs to merge node ", SummarizeNode(*merge),
-      ": \n", absl::StrJoin(backedges, "\n"),
+      "Multiple NextIteration inputs to merge node ",
+      FormatNodeForError(*merge), ": \n", absl::StrJoin(backedges, "\n"),
       "\nMerge nodes can have at most one incoming NextIteration edge.");
 }
 
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 617e31488c7daeb714c0ff7056b786e4eaf7873f..8a73101c184e6190921fd7729742922bd96f4bcf 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -127,7 +127,8 @@ InductionVarInfo CreateInductionVariable(const Scope& root,
   Output loop_cond =
       ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr);
   ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
-  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output);
+  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"),
+                           latch.output_false);
   Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"),
                             latch.output_true, increment_by);
   Output next_iteration =
@@ -191,7 +192,8 @@ DependentInductionVar CreateDependentLoopInvariantValue(
                                             value, frame_name);
   ops::Merge iv(root.WithOpName(prefix + "/iv"), {enter_value, enter_value});
   ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
-  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"), iv.output);
+  ops::internal::Exit exit(root.WithOpName(prefix + "/exit"),
+                           latch.output_false);
   Output next_iteration = ops::NextIteration(
       root.WithOpName(prefix + "/next_iteration"), latch.output_true);
   CHECK(root.graph()
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index da27f837e88fc3f57f865211929ec9cb1a1af779..f478832781cb1dc045d9163d4a6f5e5f64a8a705 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -1109,6 +1109,9 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
   function_def_name_ = name;
 
   FunctionDef fdef;
+  // Verify that the graph has well-formed control flow structure.
+  std::vector<ControlFlowInfo> dummy;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph_.get(), &dummy));
   TF_RETURN_IF_ERROR(GraphToFunctionDef(*graph_, name, &fdef));
 
   if (VLOG_IS_ON(1)) {
@@ -1119,8 +1122,11 @@ Status Encapsulator::Subgraph::BuildFunctionDef(
                                       fdef);
   }
 
-  if (!reuse_existing_functions || library->Find(name) == nullptr) {
+  const FunctionDef* original_fdef = library->Find(name);
+  if (!reuse_existing_functions || original_fdef == nullptr) {
     TF_RETURN_IF_ERROR(library->AddFunctionDef(fdef));
+  } else if (!FunctionDefsEqual(*original_fdef, fdef)) {
+    TF_RETURN_IF_ERROR(library->ReplaceFunction(name, fdef));
   }
   return Status::OK();
 }
@@ -1531,9 +1537,6 @@ Status Encapsulator::SplitIntoSubgraphs(FunctionLibraryDefinition* library) {
   for (auto& entry : subgraphs_) {
     Subgraph& subgraph = entry.second;
     FixupSourceAndSinkEdges(subgraph.GetGraph());
-    // Verify that the graph has well-formed control flow structure.
-    std::vector<ControlFlowInfo> dummy;
-    TF_RETURN_IF_ERROR(BuildControlFlowInfo(subgraph.GetGraph(), &dummy));
   }
 
   if (VLOG_IS_ON(1)) {
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 49958093b8dcf35e8adcdfd2f7dfce8558d5db6f..de89be9a3555960dabe7bacd17226c15ae888ae6 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -16,16 +16,20 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
-#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 
 #include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/encapsulate_util.h"
+#include "tensorflow/compiler/jit/extract_outside_compilation_pass.h"
+#include "tensorflow/compiler/tf2xla/side_effect_util.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/equal_graph_def.h"
@@ -406,8 +410,8 @@ Node* KeyPlaceholderShape(const GraphDefBuilder::Options& opts) {
 Node* KeyPlaceholder(const string& call_node,
                      const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
-  NodeBuilder node_builder(opts.GetNameForOp("Placeholder"), "Placeholder",
-                           opts.op_registry());
+  NodeBuilder node_builder(absl::StrCat(call_node, "_key_placeholder"),
+                           "Placeholder", opts.op_registry());
   TensorShapeProto shape;
   shape.add_dim()->set_size(2);
   return opts.WithAttr("shape", shape)
@@ -494,7 +498,8 @@ Node* RetOp(int index, ops::NodeOut a, const GraphDefBuilder::Options& opts) {
   return opts.FinalizeBuilder(&node_builder);
 }
 
-Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) {
+Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
+                   const std::vector<string>& encapsulated_functions) {
   Status s;
   // Convert the GraphDef to a Graph
   std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -505,11 +510,39 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) {
   s = ConvertGraphDefToGraph(options, *graphdef, graph.get());
   if (!s.ok()) return s;
 
+  s = PerformStaticShapeInferenceBeforeEncapsulation(
+      graph.get(), "_encapsulate", "_outside");
+  if (!s.ok()) return s;
+
+  s = PreprocessForEncapsulation(graph.get(), "_encapsulate", "_outside");
+  if (!s.ok()) return s;
+
   std::unique_ptr<Graph> graph_out;
-  s = EncapsulateSubgraphsInFunctions("_encapsulate", "_outside", *graph,
-                                      /*rewrite_subgraph_fn=*/{},
-                                      /*reuse_existing_functions=*/false,
-                                      &graph_out, lib_def.get());
+  s = EncapsulateSubgraphsInFunctions(
+      "_encapsulate", /*outside_compilation_attribute=*/"", *graph,
+      /*rewrite_subgraph_fn=*/{},
+      /*reuse_existing_functions=*/false, &graph_out, lib_def.get());
+  if (!s.ok()) return s;
+
+  std::unordered_map<string, XlaClusterInfo> clusters;
+  for (const auto& func : encapsulated_functions) {
+    Node* xla_computation_node;
+    for (Node* n : graph_out->nodes()) {
+      if (n->name() == func) {
+        xla_computation_node = n;
+      }
+    }
+    if (!xla_computation_node) {
+      return errors::Internal("Cannot find node ", func);
+    }
+    NameAttrList func_name_attrs;
+    func_name_attrs.set_name(func);
+    clusters.emplace(func,
+                     XlaClusterInfo{func, func_name_attrs, xla_computation_node,
+                                    std::map<string, int>{}});
+  }
+  s = ExtractOutsideCompilation("_encapsulate", "_outside", clusters,
+                                graph_out.get(), lib_def.get());
   if (!s.ok()) return s;
 
   GraphDef graphdef_out;
@@ -520,6 +553,11 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) {
   return s;
 }
 
+Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library) {
+  std::vector<string> encapsulated_functions;
+  return Encapsulate(graphdef, library, encapsulated_functions);
+}
+
 // If there are no marked nodes, funcification should be a no-op.
 TEST(EncapsulateSubgraphsTest, NoFunctions) {
   GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
@@ -703,7 +741,7 @@ TEST(EncapsulateSubgraphsTest, InputDeduplication) {
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
   std::unique_ptr<Graph> graph;
   TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
-      "_cluster", "_outside", graph_before_encapsulation,
+      "_cluster", "", graph_before_encapsulation,
       /*rewrite_subgraph_fn=*/{},
       /*reuse_existing_functions=*/false, &graph, &library));
 
@@ -755,7 +793,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) {
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
   int guaranteed_consts = 0;
   TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
-      "_encapsulate", "_outside", graph_before,
+      "_encapsulate", "", graph_before,
       /*rewrite_subgraph_fn=*/
       [&guaranteed_consts](const std::vector<OutputTensor>& arg_source_tensors,
                            std::unique_ptr<Graph>* graph_ptr,
@@ -800,7 +838,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) {
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
   int guaranteed_consts = 0;
   TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
-      "_encapsulate", "_outside", graph_before,
+      "_encapsulate", "", graph_before,
       /*rewrite_subgraph_fn=*/
       [&guaranteed_consts](const std::vector<OutputTensor>& arg_source_tensors,
                            std::unique_ptr<Graph>* graph_ptr,
@@ -854,15 +892,15 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
+    Node* key_constant = KeyPlaceholder("F1", shape.opts());
     Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
                             {DT_FLOAT, DT_FLOAT}, shape.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
@@ -877,7 +915,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
 
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"c"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}, {"C"}},
@@ -899,7 +937,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
             {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
-      {{"f_0_retval", "F:o:0"}});
+      {{"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -975,15 +1013,15 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
     Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
                             {DT_FLOAT, DT_FLOAT}, shape1.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
@@ -998,8 +1036,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
 
   {
     GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0"));
+    Node* key_constant = KeyPlaceholder("F1", shape2.opts());
     Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
                              {DT_FLOAT, DT_FLOAT}, shape2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
@@ -1020,7 +1057,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   }
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}},
@@ -1037,14 +1074,13 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
            {"F:o:0", "D:o:0"},
            {{"Tinputs", absl::Span<const DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
-            {"ancestors",
-             absl::Span<const string>({"outside_compilation_O1_host_compute"})},
+            {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O2"},
             {"shapes", absl::Span<const DataType>({})},
             {"_outside_compilation_subgraph", "O2"}},
-           {"F", "outside_compilation_O1_host_compute"}},
+           {"F"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
            {"C:o:0", "D:o:0"},
@@ -1058,7 +1094,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
-      {{"i_0_retval", "I:o:0"}});
+      {{"i_0_retval_retval", "I:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1149,33 +1185,18 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1", "F2"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
-  {
-    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT, DT_FLOAT}, shape.opts());
-    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts()
-                         .WithName("E")
-                         .WithAttr("_encapsulate", "F1")
-                         .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
-    TF_EXPECT_OK(
-        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
-  }
-
   TensorShapeProto shape_proto_expected;
   shape_proto_expected.add_dim()->set_size(2);
 
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"},
-      {"f_0_retval:float", "d_0_retval:float"}, {},
+      {"f_0_retval_retval:float", "d_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1191,19 +1212,19 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", absl::Span<const DataType>({})},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
-      {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}});
+      {{"d_0_retval_retval", "D:o:0"}, {"f_0_retval_retval", "F:o:0"}});
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F2", {"e_0_arg:float", "f_0_arg:float"},
-      {"g_0_retval:float", "i_0_retval:float"}, {},
+      "F2", {"f_0_arg:float", "bridge_e_g_0_arg:float"},
+      {"i_0_retval_retval:float", "g_0_retval_retval:float"}, {},
       {
-          {{"G"}, "BinaryTest", {"e_0_arg", "f_0_arg"}},
+          {{"G"}, "BinaryTest", {"bridge_e_g_0_arg", "f_0_arg"}},
           {{"I"},
            "BinaryTest",
            {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}},
@@ -1219,7 +1240,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}});
+      {{"i_0_retval_retval", "I:o:0"}, {"g_0_retval_retval", "G:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1265,11 +1286,11 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
         b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}),
         "F2");
     NodeBuilder node_builder2("F2", "F2", lib_def.get());
-    node_builder2.Input(e).Input(call1);
+    node_builder2.Input(call1).Input(e);
     Node* call2 = b2.opts()
                       .WithControlInputs({s2, e, call1})
                       .FinalizeBuilder(&node_builder2);
-    Binary(call2, ops::NodeOut(call2, 1), b2.opts().WithName("J"));
+    Binary(ops::NodeOut(call2, 1), call2, b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1312,44 +1333,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1", "F2"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
-
-  {
-    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT, DT_FLOAT}, shape.opts());
-    Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts()
-                         .WithName("E")
-                         .WithAttr("_encapsulate", "F1")
-                         .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
-    TF_EXPECT_OK(
-        AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
-  }
-
-  {
-    GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F2", "O1",
-                            {DT_FLOAT}, shape.opts());
-    Node* h = Unary(recv, shape.opts()
-                              .WithName("H")
-                              .WithAttr("_encapsulate", "F2")
-                              .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F2", "O1", {h}, shape.opts());
-    TF_EXPECT_OK(
-        AddGraphDefToFunctionLibrary(shape, "F2_O1", &library_expected));
-  }
+  TensorShapeProto shape_proto_expected;
+  shape_proto_expected.add_dim()->set_size(2);
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1365,16 +1358,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", absl::Span<const TensorShapeProto>({})},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
-      {{"f_0_retval", "F:o:0"}});
+      {{"f_0_retval_retval", "F:o:0"}});
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F2", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval:float"}, {},
+      "F2", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval_retval:float"}, {},
       {
           {{"G"}, "BinaryTest", {"a_0_arg", "b_0_arg"}},
           {{"I"},
@@ -1387,12 +1380,12 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F2_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F2_O1"},
-            {"shapes", absl::Span<const TensorShapeProto>({})},
+            {"shape_inference_graph", ""},
+            {"shapes",
+             absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"i_0_retval", "I:o:0"}});
+      {{"i_0_retval_retval", "I:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1439,9 +1432,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
         "F2");
     NodeBuilder node_builder2("F2", "F2", lib_def.get());
     node_builder2.Input(a).Input(b);
-    Node* call2 = b2.opts()
-                      .WithControlInputs({s2, call1})
-                      .FinalizeBuilder(&node_builder2);
+    Node* call2 =
+        b2.opts().WithControlInputs({s2}).FinalizeBuilder(&node_builder2);
     Binary(call1, call2, b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
@@ -1473,7 +1465,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
@@ -1482,7 +1475,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
   shape_proto_expected.add_dim()->set_size(2);
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1501,7 +1494,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"f_0_retval", "F:o:0"}});
+      {{"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1557,7 +1550,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
@@ -1566,7 +1560,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
   shape_proto_expected.add_dim()->set_size(2);
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1586,7 +1580,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
             {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
-      {{"f_0_retval", "F:o:0"}});
+      {{"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1644,13 +1638,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1666,7 +1661,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
             {"shapes", absl::Span<const TensorShapeProto>({})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"f_0_retval", "F:o:0"}});
+      {{"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1721,13 +1716,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1747,7 +1743,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
             {"shapes", absl::Span<const TensorShapeProto>({})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"f_0_retval", "F:o:0"}});
+      {{"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1811,15 +1807,15 @@ TEST(EncapsulateSubgraphsTest,
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
   {
     GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0"));
+    Node* key_constant = KeyPlaceholder("F1", shape2.opts());
     Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
                              {DT_FLOAT}, shape2.opts());
     Node* g = Unary(ops::NodeOut(recv2, 0), shape2.opts()
@@ -1832,7 +1828,7 @@ TEST(EncapsulateSubgraphsTest,
   }
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1852,7 +1848,7 @@ TEST(EncapsulateSubgraphsTest,
             {"shapes", absl::Span<const TensorShapeProto>({})},
             {"_outside_compilation_subgraph", "O2"}}},
       },
-      {{"h_0_retval", "H:o:0"}});
+      {{"h_0_retval_retval", "H:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1920,15 +1916,15 @@ TEST(EncapsulateSubgraphsTest,
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
     Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
                              {DT_FLOAT}, shape1.opts());
     Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
@@ -1941,7 +1937,7 @@ TEST(EncapsulateSubgraphsTest,
   }
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1961,7 +1957,7 @@ TEST(EncapsulateSubgraphsTest,
             {"shapes", absl::Span<const TensorShapeProto>({})},
             {"_outside_compilation_subgraph", "O1"}}},
       },
-      {{"h_0_retval", "H:o:0"}});
+      {{"h_0_retval_retval", "H:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -2034,15 +2030,15 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
     Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
                              {DT_FLOAT}, shape1.opts());
     Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
@@ -2055,7 +2051,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
   }
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval_retval:float"}, {},
       {{{"C"}, "UnaryTest", {"a_0_arg"}},
        {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
        {{"F"}, "UnaryTest", {"outside_compilation_O1_host_compute:outputs:0"}},
@@ -2076,28 +2072,24 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
         {"D:o:0"},
         {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
          {"Toutputs", absl::Span<const DataType>({})},
-         {"ancestors",
-          absl::Span<const string>({"outside_compilation_O1_host_compute"})},
+         {"ancestors", absl::Span<const string>({})},
          {"key", "host_compute_channel_F1_O2"},
          {"shape_inference_graph", ""},
          {"shapes", absl::Span<const TensorShapeProto>({})},
          {"_outside_compilation_subgraph", "O2"}},
-        {"outside_compilation_O1_host_compute"}},
+        {}},
        {{"outside_compilation_O3_host_compute"},
         "XlaHostCompute",
         {"D:o:0"},
         {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
          {"Toutputs", absl::Span<const DataType>({})},
-         {"ancestors",
-          absl::Span<const string>({"outside_compilation_O1_host_compute",
-                                    "outside_compilation_O2_host_compute"})},
+         {"ancestors", absl::Span<const string>({})},
          {"key", "host_compute_channel_F1_O3"},
          {"shape_inference_graph", ""},
          {"shapes", absl::Span<const TensorShapeProto>({})},
          {"_outside_compilation_subgraph", "O3"}},
-        {"outside_compilation_O1_host_compute",
-         "outside_compilation_O2_host_compute"}}},
-      {{"h_0_retval", "H:o:0"}});
+        {}}},
+      {{"h_0_retval_retval", "H:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -2169,19 +2161,20 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"}, "UnaryTest", {"D:o:0"}},
       },
-      {{"f_0_retval", "F:o:0"}});
+      {{"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -2234,19 +2227,20 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
   }
 
-  TF_EXPECT_OK(Encapsulate(&graphdef, &library));
+  std::vector<string> encapsulated_functions{"F1"};
+  TF_EXPECT_OK(Encapsulate(&graphdef, &library, encapsulated_functions));
 
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
-    Node* key_constant =
-        KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
-    Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_1"));
+    Node* key_constant = KeyPlaceholder("F1", shape.opts());
     Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
                             {DT_FLOAT}, shape.opts());
-    Node* e = BinaryUnknownShape(known, recv,
+    Node* a = InputShaped(shape.opts().WithName("A"));
+    Node* c = Unary(a, shape.opts().WithName("C"));
+    Node* e = BinaryUnknownShape(c, recv,
                                  shape.opts()
                                      .WithName("E")
                                      .WithAttr("_encapsulate", "F1")
@@ -2258,7 +2252,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
 
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval:float"}, {},
+      "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval_retval:float"}, {},
       {
           {{"c"}, "UnaryTest", {"b_0_arg"}, {}, {}},
           {{"F"},
@@ -2279,7 +2273,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
             {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
-      {{"f_0_retval", "F:o:0"}});
+      {{"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1f4b9c90a4ff0b1166cdb7b5942771b350740ef3
--- /dev/null
+++ b/tensorflow/compiler/jit/encapsulate_util.cc
@@ -0,0 +1,955 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/encapsulate_util.h"
+#include <algorithm>
+#include <iterator>
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/jit/shape_inference.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Returns string attribute value for the node if the attribute is present,
+// otherwise returns empty optional value.
+absl::optional<string> GetStringAttr(const Node& n, const string& attr_name) {
+  auto attr = n.attrs().Find(attr_name);
+  if (!attr) {
+    return absl::nullopt;
+  } else {
+    return attr->s();
+  }
+}
+
+// Adds a value to the node's list attribute.
+template <typename T>
+Status AppendToListAttr(Node* n, const string& attr_name, const string& value) {
+  std::vector<T> attr_value;
+  Status s = GetNodeAttr(n->attrs(), attr_name, &attr_value);
+  if (!s.ok() && s.code() != error::NOT_FOUND) {
+    return s;
+  }
+
+  n->ClearAttr(attr_name);
+  attr_value.push_back(value);
+  n->AddAttr(attr_name, attr_value);
+  return Status::OK();
+}
+
+// Replaces attribute value.
+template <typename T>
+void ReplaceAttr(Node* n, const string& attr_name, const T& value) {
+  n->ClearAttr(attr_name);
+  n->AddAttr(attr_name, value);
+}
+
+// Step 1a ~ 1d for PreprocessForEncapsulation(). See comments of
+// PreprocessForEncapsulation() for details.
+Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name,
+                           const string& outside_compilation_attr_name) {
+  // Gather edges to remove. We should not remove the edge while iterating.
+  std::vector<const Edge*> edges_to_remove;
+  for (const Edge* e : g->edges()) {
+    if (!e->IsControlEdge()) {
+      continue;
+    }
+
+    auto src_xla_computation =
+        GetStringAttr(*e->src(), xla_computation_attr_name);
+    auto dst_xla_computation =
+        GetStringAttr(*e->dst(), xla_computation_attr_name);
+    auto src_outside_compilation =
+        GetStringAttr(*e->src(), outside_compilation_attr_name);
+    auto dst_outside_compilation =
+        GetStringAttr(*e->dst(), outside_compilation_attr_name);
+
+    if (!src_xla_computation && !dst_xla_computation) {
+      continue;
+    } else if (src_xla_computation && !dst_xla_computation) {
+      if (src_outside_compilation) {
+        // Case 1c: outside compilation to host computation control edge.
+        edges_to_remove.push_back(e);
+
+        TF_RETURN_IF_ERROR(AppendToListAttr<string>(
+            e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
+      }
+    } else if (!src_xla_computation && dst_xla_computation) {
+      if (dst_outside_compilation) {
+        // Case 1c: host computation control to outside compilation edge.
+        edges_to_remove.push_back(e);
+
+        TF_RETURN_IF_ERROR(AppendToListAttr<string>(
+            e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
+      }
+    } else {  // src_xla_computation && dst_xla_computation
+      if (*src_xla_computation != *dst_xla_computation) {
+        if (src_outside_compilation && dst_outside_compilation) {
+          // Case 1b: outside compilation to outside compilation control edge.
+          edges_to_remove.push_back(e);
+
+          TF_RETURN_IF_ERROR(AppendToListAttr<string>(
+              e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
+        } else if (src_outside_compilation && !dst_outside_compilation) {
+          // Case 1a: outside compilation to another XLA computaition control
+          // edge.
+          TF_RETURN_IF_ERROR(AppendToListAttr<string>(
+              e->src(), kXlaConnectedToOtherXlaComputationAttrName,
+              *dst_xla_computation));
+        } else if (!src_outside_compilation && dst_outside_compilation) {
+          // Case 1a: another XLA computaition to outside compilation control
+          // edge.
+          TF_RETURN_IF_ERROR(AppendToListAttr<string>(
+              e->dst(), kXlaConnectedFromOtherXlaComputationAttrName,
+              *src_xla_computation));
+        }
+      }
+    }
+  }
+
+  for (auto e : edges_to_remove) {
+    g->RemoveEdge(e);
+  }
+  return Status::OK();
+}
+
+// Step 2 for PreprocessForEncapsulation(). See comments of
+// PreprocessForEncapsulation() for details.
+Status ProcessXlaToXlaDataEdges(Graph* g,
+                                const string& xla_computation_attr_name,
+                                const string& outside_compilation_attr_name) {
+  // Gather edges between XLA computations. Notice that we do not store `Edge*`
+  // directly because we remove some nodes while adding Identity nodes, and
+  // those Edge pointers might be invalidated.
+  struct EdgeInfo {
+    int dst_input, dst_node_id;
+  };
+  std::vector<EdgeInfo> edges;
+  for (const Edge* e : g->edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    auto src_xla_computation =
+        GetStringAttr(*e->src(), xla_computation_attr_name);
+    auto dst_xla_computation =
+        GetStringAttr(*e->dst(), xla_computation_attr_name);
+    auto src_outside_compilation =
+        GetStringAttr(*e->src(), outside_compilation_attr_name);
+    auto dst_outside_compilation =
+        GetStringAttr(*e->dst(), outside_compilation_attr_name);
+    if (!src_xla_computation || !dst_xla_computation) {
+      continue;
+    }
+
+    if (*src_xla_computation != *dst_xla_computation) {
+      if (src_outside_compilation || dst_outside_compilation) {
+        edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
+        VLOG(4) << "XLA -> XLA edge: " << e->DebugString();
+      }
+    }
+  }
+
+  // For each XLA -> XLA edge, add an Identity node between src and dst.
+  for (int i = 0; i < edges.size(); i++) {
+    Node* dst = g->FindNodeId(edges[i].dst_node_id);
+    const Edge* e;
+    TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e));
+    Node* src = e->src();
+    int src_output = e->src_output(), dst_input = e->dst_input();
+    g->RemoveEdge(e);
+
+    // Create Identity node, and connect it between `src` and `dst`.
+    string identity_node_name =
+        absl::StrCat("bridge_", src->name(), "_", dst->name());
+    DataType dtype = src->output_type(src_output);
+    TF_ASSIGN_OR_RETURN(Node * identity_node,
+                        BuildIdentityNode(g, identity_node_name, dtype, src,
+                                          /*requested_device=*/absl::nullopt));
+    identity_node->AddAttr(kBridgeSourceNodeAttrName, src->name());
+    g->AddEdge(src, src_output, identity_node, 0);
+    g->AddEdge(identity_node, 0, dst, dst_input);
+
+    // Replace `e->dst()` because its input node changed.
+    NodeDef new_def = dst->def();
+    *new_def.mutable_input(dst_input) = identity_node->name();
+    TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def));
+
+    // Other edge in `edges` might have `e->dst()` as src or dst
+    // node. Before removing `e->dst()`, replace those edges with corresponding
+    // edges for `dst_replace_node`.
+    for (int j = i + 1; j < edges.size(); j++) {
+      if (edges[j].dst_node_id == edges[i].dst_node_id) {
+        edges[j].dst_node_id = dst_replace_node->id();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Step 3 for PreprocessForEncapsulation(). See comments of
+// PreprocessForEncapsulation() for details.
+Status ProcessDataEdgeBetweenOutsideCompilationAndHostComputation(
+    Graph* g, const string& xla_computation_attr_name,
+    const string& outside_compilation_attr_name) {
+  // Gather edges between outside compilation and host computation. Notice that
+  // we do not store `Edge*` directly because we remove some nodes while adding
+  // Identity nodes, and those Edge pointers might be invalidated.
+  struct EdgeInfo {
+    int dst_input, dst_node_id;
+    bool is_host_to_outside_compilation;
+  };
+  std::vector<EdgeInfo> edges;
+  for (const Edge* e : g->edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    if (e->src()->attrs().Find(xla_computation_attr_name) == nullptr &&
+        e->dst()->attrs().Find(xla_computation_attr_name) != nullptr &&
+        e->dst()->attrs().Find(outside_compilation_attr_name) != nullptr) {
+      edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id(),
+                               /*is_host_to_outside_compilation=*/true});
+      VLOG(4) << "Host -> oc edge: " << e->DebugString();
+    } else if (e->dst()->attrs().Find(xla_computation_attr_name) == nullptr &&
+               e->src()->attrs().Find(xla_computation_attr_name) != nullptr &&
+               e->src()->attrs().Find(outside_compilation_attr_name) !=
+                   nullptr) {
+      edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id(),
+                               /*is_host_to_outside_compilation=*/false});
+      VLOG(4) << "Oc -> host edge: " << e->DebugString();
+    }
+  }
+
+  // Remove the edge from host to outside compilation. Add a placeholder as
+  // outside compilation node input.
+  std::map<std::pair<string, int>, Node*> placeholders;
+  for (int i = 0; i < edges.size(); i++) {
+    Node* dst = g->FindNodeId(edges[i].dst_node_id);
+    const Edge* e;
+    TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e));
+    Node* src = e->src();
+    int src_output = e->src_output(), dst_input = e->dst_input();
+    g->RemoveEdge(e);
+
+    // Find or create placeholder node.
+    string new_name =
+        edges[i].is_host_to_outside_compilation
+            ? absl::StrCat(src->name(), "_host_to_oc_placeholder_", src_output)
+            : absl::StrCat(src->name(), "_oc_to_host_placeholder_", src_output);
+    auto placeholder_index = std::make_pair(src->name(), src_output);
+    auto iter = placeholders.find(placeholder_index);
+    Node* placeholder_node;
+    if (iter == placeholders.end()) {
+      NodeDefBuilder placeholder_builder(new_name, "Placeholder");
+      placeholder_builder.Attr("dtype", src->output_type(src_output));
+      if (edges[i].is_host_to_outside_compilation) {
+        placeholder_builder.Attr(kHostToOutsideCompilationOriginalNodeAttrName,
+                                 src->name());
+        placeholder_builder.Attr(kHostToOutsideCompilationSrcOutputAttrName,
+                                 src_output);
+        // If this placeholder node is in outside compilation, we need to set
+        // `xla_computation_attr_name` and `outside_compilation_attr_name`.
+        string xla_computation_attr, outside_compilation_attr;
+        TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(), xla_computation_attr_name,
+                                       &xla_computation_attr));
+        TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(),
+                                       outside_compilation_attr_name,
+                                       &outside_compilation_attr));
+        placeholder_builder.Attr(xla_computation_attr_name,
+                                 xla_computation_attr);
+        placeholder_builder.Attr(outside_compilation_attr_name,
+                                 outside_compilation_attr);
+      } else {
+        placeholder_builder.Attr(kOutsideCompilationToHostOriginalNodeAttrName,
+                                 src->name());
+        placeholder_builder.Attr(kOutsideCompilationToHostSrcOutputAttrName,
+                                 src_output);
+      }
+      NodeDef placeholder_def;
+      TF_RETURN_IF_ERROR(placeholder_builder.Finalize(&placeholder_def));
+      Status s;
+      placeholder_node = g->AddNode(placeholder_def, &s);
+      TF_RETURN_IF_ERROR(s);
+      placeholders[placeholder_index] = placeholder_node;
+    } else {
+      placeholder_node = iter->second;
+    }
+    g->AddEdge(placeholder_node, 0, dst, dst_input);
+
+    // Replace `e->dst()` because its input node changed.
+    NodeDef new_def = dst->def();
+    *new_def.mutable_input(dst_input) = placeholder_node->name();
+    TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def));
+
+    // Other edge in `edges` might have `e->dst()` as src or dst
+    // node. Before removing `e->dst()`, replace those edges with corresponding
+    // edges for `dst_replace_node`.
+    for (int j = i + 1; j < edges.size(); j++) {
+      if (edges[j].dst_node_id == edges[i].dst_node_id) {
+        edges[j].dst_node_id = dst_replace_node->id();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Step 1 for `PostprocessForEncapsulation`. See comments of
+// `PostprocessForEncapsulation` for details.
+Status RemovePlaceholderBetweenOutsideCompilationAndHostComputation(Graph* g) {
+  // Gather all outside compilation to host computation nodes.
+  struct PlaceHolderNodeInfo {
+    Node* n;
+    bool is_host_to_oc;
+  };
+  std::vector<PlaceHolderNodeInfo> placeholder_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "Placeholder") {
+      if (HasNodeAttr(n->def(),
+                      kOutsideCompilationToHostOriginalNodeAttrName)) {
+        placeholder_nodes.push_back({n, false});
+      } else if (HasNodeAttr(n->def(),
+                             kHostToOutsideCompilationOriginalNodeAttrName)) {
+        placeholder_nodes.push_back({n, true});
+      }
+    }
+  }
+
+  // Remove the placeholder nodes, and reconnect original edge.
+  auto node_name_index = g->BuildNodeNameIndex();
+  for (auto placeholder_iter : placeholder_nodes) {
+    Node* n = placeholder_iter.n;
+
+    string node_name;
+    int node_src_output;
+    if (placeholder_iter.is_host_to_oc) {
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(n->attrs(), kHostToOutsideCompilationOriginalNodeAttrName,
+                      &node_name));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(),
+                                     kHostToOutsideCompilationSrcOutputAttrName,
+                                     &node_src_output));
+    } else {
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(n->attrs(), kOutsideCompilationToHostOriginalNodeAttrName,
+                      &node_name));
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(),
+                                     kOutsideCompilationToHostSrcOutputAttrName,
+                                     &node_src_output));
+    }
+    auto iter = node_name_index.find(node_name);
+    if (iter == node_name_index.end()) {
+      return errors::Internal(
+          "Cannot find original node for oc -> host placeholder node ",
+          node_name);
+    }
+
+    // Change all usage node to use the original node instead.
+    Node* original_node = iter->second;
+    std::vector<const Edge*> control_edges;
+    std::vector<OutEdgeInfo> data_edges;
+    for (auto e : n->out_edges()) {
+      if (e->IsControlEdge()) {
+        control_edges.push_back(e);
+      } else {
+        data_edges.push_back({e->dst(), e->src_output(), e->dst_input()});
+      }
+    }
+    for (const Edge* e : control_edges) {
+      g->AddControlEdge(original_node, e->dst());
+      g->RemoveEdge(e);
+    }
+    for (int i = 0; i < data_edges.size(); i++) {
+      Node* dst = data_edges[i].dst;
+      NodeDef new_def = dst->def();
+      int dst_input = data_edges[i].dst_input;
+      *new_def.mutable_input(dst_input) =
+          absl::StrCat(original_node->name(), ":", node_src_output);
+      TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def));
+
+      const Edge* edge_to_replace = nullptr;
+      TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace));
+      g->RemoveEdge(edge_to_replace);
+      g->AddEdge(original_node, node_src_output, replace_node, dst_input);
+
+      // Other edges might have `dst` as dst node. Update those edges with
+      // `replace_node`.
+      for (int j = i + 1; j < data_edges.size(); j++) {
+        if (data_edges[j].dst == dst) {
+          data_edges[j].dst = replace_node;
+        }
+      }
+
+      // Other placeholder node might have `dst` as original node. Update
+      // `node_name_index` with `replace_node`.
+      node_name_index[replace_node->name()] = replace_node;
+    }
+
+    // Remove placeholder node.
+    g->RemoveNode(n);
+  }
+  return Status::OK();
+}
+
+// Step 2 for `PostprocessForEncapsulation`. See comments of
+// `PostprocessForEncapsulation` for details.
+Status RemoveIdentityBetweenDifferentXlaComputation(Graph* g) {
+  // Gather Identity nodes to remove.
+  std::vector<Node*> bridge_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "Identity" &&
+        HasNodeAttr(n->def(), kBridgeSourceNodeAttrName)) {
+      bridge_nodes.push_back(n);
+    }
+  }
+
+  // Remove the identity nodes, and reconnect the original edge.
+  for (int i = 0; i < bridge_nodes.size(); i++) {
+    Node* n = bridge_nodes[i];
+    const Edge* src_edge = nullptr;
+    TF_RETURN_IF_ERROR(n->input_edge(0, &src_edge));
+
+    // Change all usage node to use the original node instead.
+    std::vector<const Edge*> control_edges;
+    std::vector<OutEdgeInfo> data_edges;
+    for (auto e : n->out_edges()) {
+      if (e->IsControlEdge()) {
+        control_edges.push_back(e);
+      } else {
+        data_edges.push_back({e->dst(), e->src_output(), e->dst_input()});
+      }
+    }
+    for (const Edge* e : control_edges) {
+      g->AddControlEdge(src_edge->src(), e->dst());
+      g->RemoveEdge(e);
+    }
+    for (int j = 0; j < data_edges.size(); j++) {
+      Node* dst = data_edges[j].dst;
+      NodeDef new_def = dst->def();
+      int dst_input = data_edges[j].dst_input;
+      *new_def.mutable_input(dst_input) =
+          absl::StrCat(src_edge->src()->name(), ":", src_edge->src_output());
+      TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def));
+
+      const Edge* edge_to_replace = nullptr;
+      TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace));
+      g->RemoveEdge(edge_to_replace);
+      g->AddEdge(src_edge->src(), src_edge->src_output(), replace_node,
+                 dst_input);
+
+      // Other edges might have `dst` as dst node. Update those edges with
+      // `replace_node`.
+      for (int k = j + 1; k < data_edges.size(); k++) {
+        if (data_edges[k].dst == dst) {
+          data_edges[k].dst = replace_node;
+        }
+      }
+
+      // The node we replaced might be in `bridge_nodes`. If so, update
+      // `bridge_nodes` to use the replaced node.
+      for (int k = i + 1; k < bridge_nodes.size(); k++) {
+        if (bridge_nodes[k] == dst) {
+          bridge_nodes[k] = replace_node;
+        }
+      }
+    }
+
+    // Remove Identity node.
+    g->RemoveNode(n);
+  }
+  return Status::OK();
+}
+
+// Step 3 for `PostprocessForEncapsulation`. See comments of
+// `PostprocessForEncapsulation` for details.
+// We do not need to worry about removed nodes in step 1 and 2;
+// `PreprocessForEncapsulation` will not record control dependencies for those
+// remvoed nodes in the first place.
+Status AddControlDependencies(
+    Graph* g, const std::unordered_map<string, string>& cluster_node_names) {
+  auto node_name_index = g->BuildNodeNameIndex();
+
+  // Reconnect outside compilation to outside compilation control edge.
+  for (Node* n : g->nodes()) {
+    std::vector<string> control_deps;
+    Status s =
+        GetNodeAttr(n->attrs(), kXlaControlDependenciesAttrName, &control_deps);
+    if (!s.ok()) {
+      if (s.code() != error::NOT_FOUND) {
+        return s;
+      } else {
+        continue;
+      }
+    } else {
+      n->ClearAttr(kXlaControlDependenciesAttrName);
+      for (const string& control_input : control_deps) {
+        auto iter = node_name_index.find(control_input);
+        if (iter == node_name_index.end()) {
+          return errors::Internal("Cannot find original node for ",
+                                  control_input);
+        }
+        g->AddControlEdge(iter->second, n);
+      }
+    }
+  }
+
+  // Reconnect outside compilation to XLA computation control edge.
+  for (Node* n : g->nodes()) {
+    std::vector<string> control_deps;
+    Status s = GetNodeAttr(
+        n->attrs(), kXlaConnectedToOtherXlaComputationAttrName, &control_deps);
+    if (!s.ok()) {
+      if (s.code() != error::NOT_FOUND) {
+        return s;
+      } else {
+        continue;
+      }
+    } else {
+      n->ClearAttr(kXlaConnectedToOtherXlaComputationAttrName);
+      for (const string& control_input : control_deps) {
+        auto iter = cluster_node_names.find(control_input);
+        if (iter == cluster_node_names.end()) {
+          return errors::Internal("Cannot find cluster node for ",
+                                  control_input);
+        }
+        auto iter2 = node_name_index.find(iter->second);
+        if (iter2 == node_name_index.end()) {
+          return errors::Internal("Cannot find cluster node for ",
+                                  iter->second);
+        }
+        g->AddControlEdge(n, iter2->second);
+      }
+    }
+  }
+
+  // Reconnect XLA computation to outside compilation control edge.
+  for (Node* n : g->nodes()) {
+    std::vector<string> control_deps;
+    Status s =
+        GetNodeAttr(n->attrs(), kXlaConnectedFromOtherXlaComputationAttrName,
+                    &control_deps);
+    if (!s.ok()) {
+      if (s.code() != error::NOT_FOUND) {
+        return s;
+      } else {
+        continue;
+      }
+    } else {
+      n->ClearAttr(kXlaConnectedFromOtherXlaComputationAttrName);
+      for (const string& control_input : control_deps) {
+        auto iter = cluster_node_names.find(control_input);
+        if (iter == cluster_node_names.end()) {
+          return errors::Internal("Cannot find cluster node for ",
+                                  control_input);
+        }
+        auto iter2 = node_name_index.find(iter->second);
+        if (iter2 == node_name_index.end()) {
+          return errors::Internal("Cannot find cluster node for ",
+                                  iter->second);
+        }
+        g->AddControlEdge(iter2->second, n);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+// Step 1 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PreprocessEdgesBetweenOutsideCompilations` for details.
+Status PreprocessControlEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Gather edges to remove. We should not remove the edge while iterating.
+  std::vector<const Edge*> edges_to_remove;
+  for (const Edge* e : g->edges()) {
+    if (!e->IsControlEdge()) {
+      continue;
+    }
+
+    auto src_outside_compilation =
+        GetStringAttr(*e->src(), outside_compilation_attr_name);
+    auto dst_outside_compilation =
+        GetStringAttr(*e->dst(), outside_compilation_attr_name);
+
+    if (src_outside_compilation && dst_outside_compilation) {
+      if (*src_outside_compilation != *dst_outside_compilation) {
+        // Case 1a: outside compilation to outside compilation control edge.
+        edges_to_remove.push_back(e);
+
+        TF_RETURN_IF_ERROR(AppendToListAttr<string>(
+            e->dst(), kXlaControlDependenciesWithinXlaClusterAttrName,
+            e->src()->name()));
+      }
+    } else if (src_outside_compilation && !dst_outside_compilation) {
+      // Case 1b: outside compilation to its XLA computation control edge.
+      ReplaceAttr(e->src(), kXlaConnectedToXlaComputationAttrName, true);
+    } else if (!src_outside_compilation && dst_outside_compilation) {
+      // Case 1b: XLA computation to outside compilation in it control edge.
+      ReplaceAttr(e->dst(), kXlaConnectedFromXlaComputationAttrName, true);
+    }
+  }
+
+  for (auto e : edges_to_remove) {
+    g->RemoveEdge(e);
+  }
+  return Status::OK();
+}
+
+// Step 2 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PreprocessEdgesBetweenOutsideCompilations` for details.
+Status PreprocessDataEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Gather edges between outside compilation and host computation. Notice that
+  // we do not store `Edge*` directly because we remove some nodes while adding
+  // Identity nodes, and those Edge pointers might be invalidated.
+  struct EdgeInfo {
+    int dst_input, dst_node_id;
+  };
+  std::vector<EdgeInfo> edges;
+  for (const Edge* e : g->edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    auto src_outside_compilation =
+        GetStringAttr(*e->src(), outside_compilation_attr_name);
+    auto dst_outside_compilation =
+        GetStringAttr(*e->dst(), outside_compilation_attr_name);
+
+    if (src_outside_compilation && dst_outside_compilation &&
+        *src_outside_compilation != *dst_outside_compilation) {
+      edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
+      VLOG(4) << "Oc -> oc edge: " << e->DebugString();
+    }
+  }
+
+  // Remove the edge from host to outside compilation. Add a placeholder as
+  // outside compilation node input.
+  std::map<std::pair<string, int>, Node*> placeholders;
+  for (int i = 0; i < edges.size(); i++) {
+    Node* dst = g->FindNodeId(edges[i].dst_node_id);
+    const Edge* e;
+    TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e));
+    Node* src = e->src();
+    int src_output = e->src_output(), dst_input = e->dst_input();
+    g->RemoveEdge(e);
+
+    // Find or create placeholder node.
+    string new_name =
+        absl::StrCat(src->name(), "_oc_to_oc_placeholder_", src_output);
+    auto placeholder_index = std::make_pair(src->name(), src_output);
+    auto iter = placeholders.find(placeholder_index);
+    Node* placeholder_node;
+    if (iter == placeholders.end()) {
+      NodeDefBuilder placeholder_builder(new_name, "Placeholder");
+      placeholder_builder.Attr("dtype", src->output_type(src_output));
+      string outside_compilation_attr;
+      TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(),
+                                     outside_compilation_attr_name,
+                                     &outside_compilation_attr));
+      placeholder_builder.Attr(outside_compilation_attr_name,
+                               outside_compilation_attr);
+      placeholder_builder.Attr(kOutsideCompilationOriginalNodeAttrName,
+                               src->name());
+      placeholder_builder.Attr(kOutsideCompilationSrcOutputAttrName,
+                               src_output);
+      NodeDef placeholder_def;
+      TF_RETURN_IF_ERROR(placeholder_builder.Finalize(&placeholder_def));
+      Status s;
+      placeholder_node = g->AddNode(placeholder_def, &s);
+      TF_RETURN_IF_ERROR(s);
+      placeholders[placeholder_index] = placeholder_node;
+    } else {
+      placeholder_node = iter->second;
+    }
+    g->AddEdge(placeholder_node, 0, dst, dst_input);
+
+    // Replace `e->dst()` because its input node changed.
+    NodeDef new_def = dst->def();
+    *new_def.mutable_input(dst_input) = placeholder_node->name();
+    TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def));
+
+    // Other edge in `edges` might have `e->dst()` as src or dst
+    // node. Before removing `e->dst()`, replace those edges with
+    // corresponding edges for `dst_replace_node`.
+    for (int j = i + 1; j < edges.size(); j++) {
+      if (edges[j].dst_node_id == edges[i].dst_node_id) {
+        edges[j].dst_node_id = dst_replace_node->id();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Step 1 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PostprocessEdgesBetweenOutsideCompilations` for details.
+Status PostprocessDataEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Gather all outside compilation to outside compilation nodes.
+  std::vector<Node*> placeholder_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "Placeholder" &&
+        HasNodeAttr(n->def(), kOutsideCompilationOriginalNodeAttrName)) {
+      placeholder_nodes.push_back(n);
+    }
+  }
+
+  // Remove the placeholder nodes, and reconnect original edge.
+  auto node_name_index = g->BuildNodeNameIndex();
+  for (auto n : placeholder_nodes) {
+    string node_name;
+    int node_src_output;
+    TF_RETURN_IF_ERROR(GetNodeAttr(
+        n->attrs(), kOutsideCompilationOriginalNodeAttrName, &node_name));
+    TF_RETURN_IF_ERROR(GetNodeAttr(
+        n->attrs(), kOutsideCompilationSrcOutputAttrName, &node_src_output));
+    auto iter = node_name_index.find(node_name);
+    if (iter == node_name_index.end()) {
+      return errors::Internal(
+          "Cannot find original node for oc -> host placeholder node ",
+          node_name);
+    }
+
+    // Change all usage node to use the original node instead.
+    Node* original_node = iter->second;
+    std::vector<const Edge*> control_edges;
+    std::vector<OutEdgeInfo> data_edges;
+    for (auto e : n->out_edges()) {
+      if (e->IsControlEdge()) {
+        control_edges.push_back(e);
+      } else {
+        data_edges.push_back({e->dst(), e->src_output(), e->dst_input()});
+      }
+    }
+    for (const Edge* e : control_edges) {
+      g->AddControlEdge(original_node, e->dst());
+      g->RemoveEdge(e);
+    }
+    for (int i = 0; i < data_edges.size(); i++) {
+      Node* dst = data_edges[i].dst;
+      NodeDef new_def = dst->def();
+      int dst_input = data_edges[i].dst_input;
+      *new_def.mutable_input(dst_input) =
+          absl::StrCat(original_node->name(), ":", node_src_output);
+      TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def));
+
+      const Edge* edge_to_replace = nullptr;
+      TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace));
+      g->RemoveEdge(edge_to_replace);
+      g->AddEdge(original_node, node_src_output, replace_node, dst_input);
+
+      // Other edges might have `dst` as dst node. Update those edges with
+      // `replace_node`.
+      for (int j = i + 1; j < data_edges.size(); j++) {
+        if (data_edges[j].dst == dst) {
+          data_edges[j].dst = replace_node;
+        }
+      }
+
+      // Other placeholder node might have `dst` as original node. Update
+      // `node_name_index` with `replace_node`.
+      node_name_index[replace_node->name()] = replace_node;
+    }
+
+    // Remove placeholder node.
+    g->RemoveNode(n);
+  }
+  return Status::OK();
+}
+
+// Step 2 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PostprocessEdgesBetweenOutsideCompilations` for details.
+Status PostprocessControlEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  auto node_name_index = g->BuildNodeNameIndex();
+
+  // Reconnect outside compilation to outside compilation control edge.
+  for (Node* n : g->nodes()) {
+    std::vector<string> control_deps;
+    Status s =
+        GetNodeAttr(n->attrs(), kXlaControlDependenciesWithinXlaClusterAttrName,
+                    &control_deps);
+    if (!s.ok()) {
+      if (s.code() != error::NOT_FOUND) {
+        return s;
+      } else {
+        continue;
+      }
+    } else {
+      n->ClearAttr(kXlaControlDependenciesWithinXlaClusterAttrName);
+      for (const string& control_input : control_deps) {
+        auto iter = node_name_index.find(control_input);
+        if (iter == node_name_index.end()) {
+          return errors::Internal("Cannot find original node for ",
+                                  control_input);
+        }
+        g->AddControlEdge(iter->second, n);
+      }
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
+const char kXlaInferredShapesAttrName[] = "_xla_inferred_shapes";
+
+const char kXlaConnectedToOtherXlaComputationAttrName[] =
+    "_xla_connected_to_other_xla_computation";
+const char kXlaConnectedFromOtherXlaComputationAttrName[] =
+    "_xla_connected_from_other_xla_computation";
+const char kXlaControlDependenciesAttrName[] = "_xla_control_dependencies";
+const char kBridgeSourceNodeAttrName[] = "_xla_bridge_src";
+const char kOutsideCompilationToHostOriginalNodeAttrName[] =
+    "_xla_oc_to_host_node_name";
+const char kOutsideCompilationToHostSrcOutputAttrName[] =
+    "_xla_oc_to_host_src_output";
+const char kHostToOutsideCompilationOriginalNodeAttrName[] =
+    "_xla_host_to_oc_node_name";
+const char kHostToOutsideCompilationSrcOutputAttrName[] =
+    "_xla_host_to_oc_src_output";
+const char kXlaConnectedToXlaComputationAttrName[] =
+    "_xla_connected_to_xla_computation";
+const char kXlaConnectedFromXlaComputationAttrName[] =
+    "_xla_connected_from_xla_computation";
+const char kOutsideCompilationOriginalNodeAttrName[] =
+    "_xla_oc_to_oc_node_name";
+const char kOutsideCompilationSrcOutputAttrName[] = "_xla_oc_to_oc_src_output";
+const char kXlaControlDependenciesWithinXlaClusterAttrName[] =
+    "_xla_control_dependencies_within_xla_cluster";
+
+Status PerformStaticShapeInferenceBeforeEncapsulation(
+    Graph* g, const string& xla_computation_attr_name,
+    const string& outside_compilation_attr_name) {
+  // Find all outside compilation to XLA computation data edges.
+  std::unordered_set<Node*> outside_compilation_send_nodes;
+  for (auto e : g->edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    auto src_computation = GetStringAttr(*e->src(), xla_computation_attr_name);
+    auto dst_computation = GetStringAttr(*e->dst(), xla_computation_attr_name);
+    if (!src_computation || !dst_computation ||
+        *src_computation != *dst_computation) {
+      continue;
+    }
+
+    auto src_outside_compilation =
+        GetStringAttr(*e->src(), outside_compilation_attr_name);
+    auto dst_outside_compilation =
+        GetStringAttr(*e->dst(), outside_compilation_attr_name);
+    if (src_outside_compilation && !dst_outside_compilation) {
+      outside_compilation_send_nodes.insert(e->src());
+    }
+  }
+
+  // Perform shape inference.
+  std::map<int, InferredShape> arg_shapes;
+  GraphShapeInfo shape_info;
+  TF_RETURN_IF_ERROR(
+      InferShapes(g, arg_shapes, /*fnlib_def=*/nullptr, &shape_info));
+
+  // Add attribute for output shapes.
+  for (Node* n : outside_compilation_send_nodes) {
+    auto iter = shape_info.find(n->name());
+    if (iter == shape_info.end()) {
+      continue;
+    }
+
+    std::vector<PartialTensorShape> output_shapes;
+    std::transform(iter->second.begin(), iter->second.end(),
+                   std::back_inserter(output_shapes),
+                   [](const InferredShape& inferred_shape) {
+                     return inferred_shape.shape;
+                   });
+    n->AddAttr(kXlaInferredShapesAttrName, output_shapes);
+  }
+
+  return Status::OK();
+}
+
+Status PreprocessForEncapsulation(Graph* g,
+                                  const string& xla_computation_attr_name,
+                                  const string& outside_compilation_attr_name) {
+  TF_RETURN_IF_ERROR(ProcessControlEdges(g, xla_computation_attr_name,
+                                         outside_compilation_attr_name));
+  TF_RETURN_IF_ERROR(ProcessXlaToXlaDataEdges(g, xla_computation_attr_name,
+                                              outside_compilation_attr_name));
+  TF_RETURN_IF_ERROR(ProcessDataEdgeBetweenOutsideCompilationAndHostComputation(
+      g, xla_computation_attr_name, outside_compilation_attr_name));
+  return Status::OK();
+}
+
+Status PostprocessForEncapsulation(
+    Graph* g, const string& xla_computation_attr_name,
+    const string& outside_compilation_attr_name,
+    const std::unordered_map<string, XlaClusterInfo>& clusters) {
+  // The `node` pointer in `XlaClusterInfo` might be invalidated in step 1/2,
+  // but the node name won't change. Record cluster node name for
+  // `AddControlDependencies`.
+  std::unordered_map<string, string> cluster_node_names;
+  for (const auto& iter : clusters) {
+    cluster_node_names[iter.first] = iter.second.node->name();
+  }
+
+  TF_RETURN_IF_ERROR(
+      RemovePlaceholderBetweenOutsideCompilationAndHostComputation(g));
+  TF_RETURN_IF_ERROR(RemoveIdentityBetweenDifferentXlaComputation(g));
+  TF_RETURN_IF_ERROR(AddControlDependencies(g, cluster_node_names));
+  return Status::OK();
+}
+
+Status PreprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Remove edges from source node to outside compilation nodes, and edges
+  // from outside compilation nodes to sink node.
+  std::vector<const Edge*> edges_to_remove;
+  for (const Edge* e : g->source_node()->out_edges()) {
+    if (HasNodeAttr(e->dst()->def(), outside_compilation_attr_name)) {
+      edges_to_remove.push_back(e);
+    }
+  }
+  for (const Edge* e : g->sink_node()->in_edges()) {
+    if (HasNodeAttr(e->src()->def(), outside_compilation_attr_name)) {
+      edges_to_remove.push_back(e);
+    }
+  }
+  for (auto e : edges_to_remove) {
+    g->RemoveEdge(e);
+  }
+
+  TF_RETURN_IF_ERROR(PreprocessControlEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  TF_RETURN_IF_ERROR(PreprocessDataEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  return Status::OK();
+}
+
+Status PostprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  TF_RETURN_IF_ERROR(PostprocessDataEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  TF_RETURN_IF_ERROR(PostprocessControlEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..e363bc5754ac395bae262dc67a780a0173efaf5e
--- /dev/null
+++ b/tensorflow/compiler/jit/encapsulate_util.h
@@ -0,0 +1,210 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file contains some utility functions for encapsulating XLA computation
+// in host graph and encapsulating outside compilation in XLA computation.
+
+#ifndef TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_
+#define TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Attribute marking output tensor shapes inferred by XLA. Attribute value is
+// a list of PartialTensorShape objects.
+extern const char kXlaInferredShapesAttrName[];
+
+// Infer output shapes for outside compilation nodes which have output data
+// edges to XLA computation nodes. These shapes will be used later by XLA
+// compiler as output shapes of the outside compilation's XlaHostCompute op.
+// XLA computation nodes will be mark by attr `xla_computation_attr_name`;
+// outside compilation nodes will be marked by both attr
+// `xla_computation_attr_name` and `outside_compilation_attr_name`.
+//
+// Those outside compilation nodes will be marked with attribute
+// `kXlaInferredShapesAttrName`.
+//
+// We have to perform shape inference before encapsulation because after
+// encapsulation, some nodes will be encapsulated into function call, and shape
+// inference does not handle function call at the moment.
+Status PerformStaticShapeInferenceBeforeEncapsulation(
+    Graph* g, const string& xla_computation_attr_name,
+    const string& outside_compilation_attr_name);
+
+// Attribute indicating that some ops in other XLA computation has control
+// dependency on this node. Attribute value will be a list of string (XLA
+// computation names).
+extern const char kXlaConnectedToOtherXlaComputationAttrName[];
+
+// Attribute indicating that this node has control dependency on some ops in
+// other XLA computation. Attribute value will be a list of string (XLA
+// computation names).
+extern const char kXlaConnectedFromOtherXlaComputationAttrName[];
+
+// Attribute indicating that this node has control dependencies on some other
+// nodes. Attribute value will be a list of string (node names).
+extern const char kXlaControlDependenciesAttrName[];
+
+// Attribute indicating that this is an Identity node added to act as a bridge
+// between different XLA computations. Attribute value will be string (source
+// node name).
+extern const char kBridgeSourceNodeAttrName[];
+
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for an outside compilation node. Attribute value will be
+// string (original input node name).
+extern const char kOutsideCompilationToHostOriginalNodeAttrName[];
+
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for an outside compilation node. Attribute value will be
+// int (src_output for original edge).
+extern const char kOutsideCompilationToHostSrcOutputAttrName[];
+
+// Attribute indicating that some ops in this node's XLA computation has control
+// dependency on this node. Attribute value will always be "true".
+extern const char kXlaConnectedToXlaComputationAttrName[];
+
+// Attribute indicating that this node has control dependency on some ops in
+// this node's XLA computation. Attribute value will always be "true".
+extern const char kXlaConnectedFromXlaComputationAttrName[];
+
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for an host node. Attribute value will be string
+// (original input node name).
+extern const char kHostToOutsideCompilationOriginalNodeAttrName[];
+
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for a host node. Attribute value will be int (src_output
+// for original edge).
+extern const char kHostToOutsideCompilationSrcOutputAttrName[];
+
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for an outside compilation node. Attribute value will be
+// string (original input node name).
+extern const char kOutsideCompilationOriginalNodeAttrName[];
+
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for an outside compilation node. Attribute value will be
+// int (src_output for original edge).
+extern const char kOutsideCompilationSrcOutputAttrName[];
+
+// Attribute indicating that this node has control dependencies on some other
+// nodes within the same XLA cluster. Attribute value will be a list of string
+// (node names).
+extern const char kXlaControlDependenciesWithinXlaClusterAttrName[];
+
+// Preprocesses edges between different XLA clusters for encapsulation. It will
+// perform the following operations in order:
+//
+// 1a. For control edges between outside compilation and another XLA
+//     computation, add attr "kXlaConnected{From, To}OtherXlaComputationAttrName
+//     = XLA computation node name" to the outside compilation node.
+// 1b. For control edges between different outside compilations (in different
+//     XLA computations), remove the edge and add attr
+//     "kXlaControlDependenciesAttrName = src node name" to dst node.
+// 1c. For control edges between outside compilation and host computation,
+//     remove the edge and add attr "kXlaControlDependenciesAttrName = src node
+//     name" to dst node.
+// 2. For data edges between different XLA computations, if either src or dst
+//    is outside compilation, add an Identity node in between the edge. The
+//    identity node will have attr kBridgeSourceNodeAttrName.
+// 3. For data edges between outside compilation and host computation, remove
+//    the edge and create a Placeholder node as dst node's input.
+Status PreprocessForEncapsulation(Graph* g,
+                                  const string& xla_computation_attr_name,
+                                  const string& outside_compilation_attr_name);
+
+// Information for XLA computation.
+struct XlaClusterInfo {
+  // Add an explicitly-defined default constructor for this class.
+  //
+  // The compiler may delete the default constructor here because
+  // host_compute_core is a const member whose type (std::map) doesn't
+  // necessarily have a user provided constructor -- while libc++ and
+  // libstdc++ 4.8 provide a user defined default constructor, libstdc++ at
+  // least >= 7.3 does not. See also c++11 [class.ctor] p5.
+  //
+  // TODO(klimek): In c++17 we'll be able to initialize host_compute_core
+  // without losing aggregate initialization, which allows us to get rid of
+  // the constructor definitions again.
+  XlaClusterInfo() {}
+  XlaClusterInfo(const string& cluster_name,
+                 const NameAttrList& func_name_attrs, Node* node,
+                 const std::map<string, int>& host_compute_core)
+      : cluster_name(cluster_name),
+        func_name_attrs(func_name_attrs),
+        node(node),
+        host_compute_core(host_compute_core) {}
+  // XLA cluster name. It might be different from `func_name`.
+  const string cluster_name;
+  // Name and attributes of XLA computation function.
+  const NameAttrList func_name_attrs;
+  // The XLA computation node in the graph.
+  Node* node;
+  // A mapping from outside compilation cluster name to its device assignment.
+  const std::map<string, int> host_compute_core;
+};
+
+// Postprocesses edges between different XLA clusters for encapsulation. This
+// function reverts what `PreprocessForEncapsulation` did. It will perform the
+// following operations in order:
+//
+// 1. Remove Placeholder nodes between outside compilation and host computation
+//     (created in `PreprocessForEncapsulation` step 3).
+// 2. Remove Identity nodes created in `PreprocessForEncapsulation` step 2.
+// 3a. Reconnect control edges between outside compilation and another XLA
+//     computation (marked by `PreprocessForEncapsulation` step 1a).
+// 3b. Reconnect control edges between different outside compilations (marked by
+//     `PreprocessForEncapsulation` step 1b).
+// 3c. Reconnect control edges between outside compilation and host computation
+//     (marked by `PreprocessForEncapsulation` step 1c).
+Status PostprocessForEncapsulation(
+    Graph* g, const string& xla_computation_attr_name,
+    const string& outside_compilation_attr_name,
+    const std::unordered_map<string, XlaClusterInfo>& clusters);
+
+// Preprocesses edges within the same XLA cluster. It will perform the following
+// operations in order:
+//
+// 0.  Remove edges from source node to outside compilation nodes, and edges
+//     from outside compilation nodes to sink node.
+// 1a. For edges between different outside compilation clusters, remove the edge
+//     and add attr "kXlaControlDependenciesWithinXlaClusterAttrName = src node
+//     name" to dst node.
+// 1b. For control edges between outside compilation and its XLA computation,
+//     add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the
+//     outside compilation node.
+// 2.  For data edges between different outside compilations, remove the edge
+//     and create a Placeholder node as dst node's input.
+Status PreprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name);
+
+// Postprocesses edges within the same XLA cluster. This function reverts what
+// `PreprocessEdgesBetweenOutsideCompilations` did. It will perform the
+// following operations in order:
+//
+// 1. Remove Placeholder nodes between different outside compilations (created
+//    in `PreprocessEdgesBetweenOutsideCompilations` step 2).
+// 2a. Reconnect control edges between different outside compilations (marked by
+//     `PreprocessEdgesBetweenOutsideCompilations` step 1a).
+// Notice that control edges marked by
+// `PreprocessEdgesBetweenOutsideCompilations` step 1b are not handled here.
+// They are handled in `RewriteOutsideCompilationSubgraphFn`.
+Status PostprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_
diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b8b49cb92f3e453883a8e64e12ce3748a5173f6
--- /dev/null
+++ b/tensorflow/compiler/jit/encapsulate_util_test.cc
@@ -0,0 +1,394 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/encapsulate_util.h"
+
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+TEST(PerformStaticShapeInferenceBeforeEncapsulationTest, Basic) {
+  // Build the graph:
+  // "add" = "const_0" + "const_1"
+  // "identity" = "add"
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output const_0 = ops::Const(s.WithOpName("const_0"), 1, {2});
+  Output const_1 = ops::Const(s.WithOpName("const_1"), 2, {2});
+  Output add = ops::Add(s.WithOpName("add"), const_0, const_1);
+  Output identity = ops::Identity(s.WithOpName("identity"), add);
+  Graph g(OpRegistry::Global());
+  TF_CHECK_OK(s.ToGraph(&g));
+
+  // "add" node is outside compilation node, "identity" node is XLA node.
+  auto node_index = g.BuildNodeNameIndex();
+  Node *add_node = node_index["add"], *identity_node = node_index["identity"];
+  add_node->AddAttr("_xla", "cluster");
+  add_node->AddAttr("_oc", "cluster");
+  identity_node->AddAttr("_xla", "cluster");
+  TF_CHECK_OK(
+      PerformStaticShapeInferenceBeforeEncapsulation(&g, "_xla", "_oc"));
+
+  // Check that only "add" node now has _xla_inferred_shapes attr.
+  std::vector<Node *> nodes_with_inferred_shape;
+  for (Node *n : g.nodes()) {
+    if (HasNodeAttr(n->def(), kXlaInferredShapesAttrName)) {
+      nodes_with_inferred_shape.push_back(n);
+    }
+  }
+  EXPECT_EQ(nodes_with_inferred_shape.size(), 1);
+  EXPECT_EQ(nodes_with_inferred_shape[0], add_node);
+  std::vector<PartialTensorShape> output_shapes;
+  TF_CHECK_OK(GetNodeAttr(add_node->attrs(), kXlaInferredShapesAttrName,
+                          &output_shapes));
+  EXPECT_EQ(output_shapes.size(), 1);
+  TensorShapeProto shape_proto;
+  output_shapes[0].AsProto(&shape_proto);
+  EXPECT_EQ(shape_proto.dim_size(), 1);
+  EXPECT_EQ(shape_proto.dim(0).size(), 2);
+}
+
+TEST(PreprocessForEncapsulationTest, ControlEdges) {
+  // Build the graph:
+  // "const_0" and "const_1" in host computation
+  // "add" = "const_0" + "const_1" in XLA computation 0
+  // "identity0" = "add" in XLA computation 0 & outside compilation 0
+  // "identity1" = "identity0" in XLA computation 0
+  // "identity2" = "identity1" in host computation
+  // "identity3" = "identity2" in XLA computation 1
+  // "identity4" = "identity3" in XLA computation 1 & outside compilation 1
+  // "identity5" = "identity4" in XLA computation 1
+  // "identity6" = "identity5" in host computation
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output const_0 = ops::Const(s.WithOpName("const_0"), 1, {});
+  Output const_1 = ops::Const(s.WithOpName("const_1"), 2, {});
+  Output add = ops::Add(s.WithOpName("add"), const_0, const_1);
+  Output identity0 = ops::Identity(s.WithOpName("identity0"), add);
+  Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0);
+  Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
+  Output identity3 = ops::Identity(s.WithOpName("identity3"), identity2);
+  Output identity4 = ops::Identity(s.WithOpName("identity4"), identity3);
+  Output identity5 = ops::Identity(s.WithOpName("identity5"), identity4);
+  Graph g(OpRegistry::Global());
+  TF_CHECK_OK(s.ToGraph(&g));
+  auto node_index = g.BuildNodeNameIndex();
+
+  // Set XLA computation/outside compilation attr, and add control edges.
+  Node *const0_node = node_index["const_0"], *add_node = node_index["add"],
+       *identity0_node = node_index["identity0"],
+       *identity1_node = node_index["identity1"],
+       *identity2_node = node_index["identity2"],
+       *identity3_node = node_index["identity3"],
+       *identity4_node = node_index["identity4"],
+       *identity5_node = node_index["identity5"];
+  add_node->AddAttr("_xla", "0");
+  identity0_node->AddAttr("_xla", "0");
+  identity0_node->AddAttr("_oc", "0");
+  identity1_node->AddAttr("_xla", "0");
+  identity3_node->AddAttr("_xla", "1");
+  identity4_node->AddAttr("_xla", "1");
+  identity4_node->AddAttr("_oc", "0");
+  identity5_node->AddAttr("_xla", "1");
+  // Case 1a: control edges between outside compilation and another XLA
+  // computation.
+  g.AddControlEdge(identity0_node, identity3_node);
+  g.AddControlEdge(identity1_node, identity4_node);
+  // Case 1b: control edges between different outside compilations.
+  g.AddControlEdge(identity0_node, identity4_node);
+  // Case 1c: control edges between outside compilation and host computation.
+  g.AddControlEdge(const0_node, identity0_node);
+  g.AddControlEdge(identity0_node, identity2_node);
+
+  TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc"));
+
+  // Case 1a: add attr "_xla_control_deps_{from/to} = XLA computation node name"
+  // to the outside compilation node.
+  std::vector<string> attr;
+  TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
+                          kXlaConnectedToOtherXlaComputationAttrName, &attr));
+  EXPECT_EQ(attr.size(), 1);
+  EXPECT_EQ(attr[0], "1");
+  attr.clear();
+  TF_CHECK_OK(GetNodeAttr(identity4_node->def(),
+                          kXlaConnectedFromOtherXlaComputationAttrName, &attr));
+  EXPECT_EQ(attr.size(), 1);
+  EXPECT_EQ(attr[0], "0");
+  // Case 1b: add attr "_xla_control_deps = src node name" to dst node.
+  attr.clear();
+  TF_CHECK_OK(GetNodeAttr(identity4_node->def(),
+                          kXlaControlDependenciesAttrName, &attr));
+  EXPECT_EQ(attr.size(), 1);
+  EXPECT_EQ(attr[0], "identity0");
+  // Case 1c: add attr "_xla_control_deps = src node name" to dst node.
+  attr.clear();
+  TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
+                          kXlaControlDependenciesAttrName, &attr));
+  EXPECT_EQ(attr.size(), 1);
+  EXPECT_EQ(attr[0], "const_0");
+  attr.clear();
+  TF_CHECK_OK(GetNodeAttr(identity2_node->def(),
+                          kXlaControlDependenciesAttrName, &attr));
+  EXPECT_EQ(attr.size(), 1);
+  EXPECT_EQ(attr[0], "identity0");
+}
+
+TEST(PreprocessForEncapsulationTest, DataEdges) {
+  // Build the graph:
+  // "const_0" and "const_1" in host computation
+  // "identityn0" = ("const_0", "const_1") in host computation 0
+  // "add0" = "const_0" + "const_1" in XLA computation 0
+  // "add1" = "add0" + "const_0" in XLA computation 0 & outside compilation 0
+  // "identity0" = "add1" in XLA computation 0
+  // "add2" = "add1" + "identity0" in host computation
+  // "add3" = "add1" + "add2" in XLA computation 1
+  // "add4" = "identity0" + "add2" in XLA computation 1 & outside compilation 0
+  // "add5" = "identityn0"[0] + "identityn0"[1] in XLA computation 1 &
+  //                                               outside compilation 0
+  // "identityn1" = ("identityn0"[0], "identityn0"[1]) in XLA computation 1 &
+  //                                                   outside compilation 0
+  // "identity1" = "add4" in XLA computation 1
+  // "identity2" = "identity1" in host computation
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output const_0 = ops::Const(s.WithOpName("const_0"), 1, {});
+  Output const_1 = ops::Const(s.WithOpName("const_1"), 2, {});
+  auto identityn0 =
+      ops::IdentityN(s.WithOpName("identityn_0"), {const_0, const_1});
+  Output add0 = ops::Add(s.WithOpName("add0"), const_0, const_1);
+  Output add1 = ops::Add(s.WithOpName("add1"), add0, const_0);
+  Output identity0 = ops::Identity(s.WithOpName("identity0"), add1);
+  Output add2 = ops::Add(s.WithOpName("add2"), add1, identity0);
+  Output add3 = ops::Add(s.WithOpName("add3"), add1, add2);
+  Output add4 = ops::Add(s.WithOpName("add4"), identity0, add2);
+  Output add5 = ops::Add(s.WithOpName("add5"), identityn0[0], identityn0[1]);
+  auto identityn1 = ops::IdentityN(s.WithOpName("identityn_1"),
+                                   {identityn0[0], identityn0[1]});
+  Output identity1 = ops::Identity(s.WithOpName("identity1"), add4);
+  Output identity2 = ops::Identity(s.WithOpName("identity2"), add4);
+  Graph g(OpRegistry::Global());
+  TF_CHECK_OK(s.ToGraph(&g));
+  auto node_index = g.BuildNodeNameIndex();
+
+  // Set XLA computation/outside compilation attr.
+  Node *add0_node = node_index["add0"], *add1_node = node_index["add1"],
+       *identity0_node = node_index["identity0"],
+       *add3_node = node_index["add3"], *add4_node = node_index["add4"],
+       *add5_node = node_index["add5"],
+       *identityn1_node = node_index["identityn_1"],
+       *identity1_node = node_index["identity1"];
+  add0_node->AddAttr("_xla", "0");
+  add1_node->AddAttr("_xla", "0");
+  add1_node->AddAttr("_oc", "0");
+  identity0_node->AddAttr("_xla", "0");
+  add3_node->AddAttr("_xla", "1");
+  add4_node->AddAttr("_xla", "1");
+  add4_node->AddAttr("_oc", "0");
+  add5_node->AddAttr("_xla", "1");
+  add5_node->AddAttr("_oc", "0");
+  identityn1_node->AddAttr("_xla", "1");
+  identityn1_node->AddAttr("_oc", "0");
+  identity1_node->AddAttr("_xla", "1");
+
+  TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc"));
+
+  // Check input nodes for related data edges.
+  node_index = g.BuildNodeNameIndex();
+  // Step 2: add an Identity node between different XLA computations.
+  Node *bridge_add1_add3 = node_index["bridge_add1_add3"];
+  EXPECT_NE(bridge_add1_add3, nullptr);
+  string str;
+  TF_CHECK_OK(
+      GetNodeAttr(bridge_add1_add3->attrs(), kBridgeSourceNodeAttrName, &str));
+  EXPECT_EQ(str, "add1");
+  Node *bridge_identity0_add4 = node_index["bridge_identity0_add4"];
+  EXPECT_NE(bridge_identity0_add4, nullptr);
+  // Step 3: add placeholder for edges between host computation and outside
+  // compilation.
+  EXPECT_EQ(bridge_add1_add3->def().input(0), "add1_oc_to_host_placeholder_0");
+  Node *add1_oc_to_host_placeholder =
+      node_index["add1_oc_to_host_placeholder_0"];
+  TF_CHECK_OK(GetNodeAttr(add1_oc_to_host_placeholder->attrs(),
+                          kOutsideCompilationToHostOriginalNodeAttrName, &str));
+  EXPECT_EQ(str, "add1");
+  int i;
+  TF_CHECK_OK(GetNodeAttr(add1_oc_to_host_placeholder->attrs(),
+                          kOutsideCompilationToHostSrcOutputAttrName, &i));
+  EXPECT_EQ(i, 0);
+  add4_node = node_index["add4"];
+  ASSERT_NE(add4_node, nullptr);
+  EXPECT_EQ(add4_node->def().input(0),
+            "bridge_identity0_add4_host_to_oc_placeholder_0");
+  Node *identity0_host_to_oc_placeholder =
+      node_index["bridge_identity0_add4_host_to_oc_placeholder_0"];
+  TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(),
+                          kHostToOutsideCompilationOriginalNodeAttrName, &str));
+  EXPECT_EQ(str, "bridge_identity0_add4");
+  TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(),
+                          kHostToOutsideCompilationSrcOutputAttrName, &i));
+  EXPECT_EQ(i, 0);
+
+  // Check different placeholder nodes are created for different src_output.
+  Node *placeholder0 = node_index["identityn_0_host_to_oc_placeholder_0"],
+       *placeholder1 = node_index["identityn_0_host_to_oc_placeholder_1"];
+  EXPECT_NE(placeholder0, nullptr);
+  EXPECT_NE(placeholder1, nullptr);
+  // Check we only have 2 placeholder nodes created for "identityn_0".
+  int placeholder_count = 0;
+  for (Node *n : g.nodes()) {
+    if (HasNodeAttr(n->def(), kHostToOutsideCompilationOriginalNodeAttrName)) {
+      string attr;
+      TF_CHECK_OK(GetNodeAttr(
+          n->attrs(), kHostToOutsideCompilationOriginalNodeAttrName, &attr));
+      if (attr == "identityn_0") {
+        ++placeholder_count;
+      }
+    }
+  }
+  EXPECT_EQ(placeholder_count, 2);
+}
+
+TEST(PostprocessForEncapsulationTest, ControlEdges) {
+  // Build the graph:
+  // "const0"
+  // "identity0" = "const0" (XLA computation 0)
+  // "identity1" = "identity0"
+  // "identity2" = "identity1" (XLA computation 1)
+  // "identity3" = "identity2"
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output const0 = ops::Const(s.WithOpName("const0"), 1, {});
+  Output identity0 = ops::Identity(s.WithOpName("identity0"), const0);
+  Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0);
+  Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
+  Output identity3 = ops::Identity(s.WithOpName("identity3"), identity2);
+  Graph g(OpRegistry::Global());
+  TF_CHECK_OK(s.ToGraph(&g));
+  auto node_index = g.BuildNodeNameIndex();
+
+  // Set XLA computation/outside compilation attr, and add control edges.
+  Node *const0_node = node_index["const0"],
+       *identity0_node = node_index["identity0"],
+       *identity1_node = node_index["identity1"],
+       *identity2_node = node_index["identity2"],
+       *identity3_node = node_index["identity3"];
+  identity1_node->AddAttr(kXlaConnectedFromOtherXlaComputationAttrName,
+                          std::vector<string>{"0"});
+  identity1_node->AddAttr(kXlaConnectedToOtherXlaComputationAttrName,
+                          std::vector<string>{"1"});
+  identity3_node->AddAttr(kXlaControlDependenciesAttrName,
+                          std::vector<string>{"const0", "identity1"});
+
+  std::unordered_map<string, XlaClusterInfo> clusters;
+  clusters["0"].node = identity0_node;
+  clusters["1"].node = identity2_node;
+  TF_CHECK_OK(PostprocessForEncapsulation(&g, "_xla", "_oc", clusters));
+
+  // Case 3a: we have control edge identity0 -> identity1, and identity1 ->
+  // identity2.
+  bool edge_identity0_identity1 = false, edge_identity1_identity2 = false;
+  for (const Edge *e : g.edges()) {
+    if (!e->IsControlEdge()) {
+      continue;
+    }
+    if (e->src() == identity0_node && e->dst() == identity1_node) {
+      edge_identity0_identity1 = true;
+    } else if (e->src() == identity1_node && e->dst() == identity2_node) {
+      edge_identity1_identity2 = true;
+    }
+  }
+  EXPECT_TRUE(edge_identity0_identity1);
+  EXPECT_TRUE(edge_identity1_identity2);
+  // Case 3b: we have control edge const0 -> identity3, and identity1 ->
+  // identity3.
+  bool edge_const0_identity3 = false, edge_identity1_identity3 = false;
+  for (const Edge *e : g.edges()) {
+    if (!e->IsControlEdge()) {
+      continue;
+    }
+    if (e->src() == const0_node && e->dst() == identity3_node) {
+      edge_const0_identity3 = true;
+    } else if (e->src() == identity1_node && e->dst() == identity3_node) {
+      edge_identity1_identity3 = true;
+    }
+  }
+  EXPECT_TRUE(edge_const0_identity3);
+  EXPECT_TRUE(edge_identity1_identity3);
+}
+
+TEST(PostprocessForEncapsulationTest, DataEdges) {
+  // Build the graph:
+  // "const0" in outside compilation "0"
+  // "placeholder0" (for "const0") in host computation
+  // "add0" = "placeholder0" + "placeholder0" in host computation
+  // "placeholder1" (for "add0") in outside compilation 1
+  // "add1" = "placeholder1" + "placeholder1" in outside compilation 1
+  //
+  // "bridge" = "placeholder0" in host computation
+  // "placeholder2" (for "bridge") in outside compilation 1
+  // "add2" = "placeholder2" + "placeholder2" in outside compilation 1
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output const0 = ops::Const(s.WithOpName("const0"), 1, {});
+  Output placeholder0 =
+      ops::Placeholder(s.WithOpName("placeholder0"), DT_INT32);
+  Output add0 = ops::Add(s.WithOpName("add0"), placeholder0, placeholder0);
+  Output placeholder1 =
+      ops::Placeholder(s.WithOpName("placeholder1"), DT_INT32);
+  Output add1 = ops::Add(s.WithOpName("add1"), placeholder1, placeholder1);
+  Output bridge = ops::Identity(s.WithOpName("bridge"), placeholder0);
+  Output placeholder2 =
+      ops::Placeholder(s.WithOpName("placeholder2"), DT_INT32);
+  Output add2 = ops::Add(s.WithOpName("add2"), placeholder2, placeholder2);
+  Graph g(OpRegistry::Global());
+  TF_CHECK_OK(s.ToGraph(&g));
+  auto node_index = g.BuildNodeNameIndex();
+
+  // Set related attributes.
+  Node *placeholder0_node = node_index["placeholder0"];
+  placeholder0_node->AddAttr(kOutsideCompilationToHostOriginalNodeAttrName,
+                             "const0");
+  placeholder0_node->AddAttr(kOutsideCompilationToHostSrcOutputAttrName, 0);
+  Node *placeholder1_node = node_index["placeholder1"];
+  placeholder1_node->AddAttr(kHostToOutsideCompilationOriginalNodeAttrName,
+                             "add0");
+  placeholder1_node->AddAttr(kHostToOutsideCompilationSrcOutputAttrName, 0);
+  Node *bridge_node = node_index["bridge"];
+  bridge_node->AddAttr(kBridgeSourceNodeAttrName, "const0");
+  Node *placeholder2_node = node_index["placeholder2"];
+  placeholder2_node->AddAttr(kHostToOutsideCompilationOriginalNodeAttrName,
+                             "bridge");
+  placeholder2_node->AddAttr(kHostToOutsideCompilationSrcOutputAttrName, 0);
+
+  std::unordered_map<string, XlaClusterInfo> clusters;
+  TF_CHECK_OK(PostprocessForEncapsulation(&g, "_xla", "_oc", clusters));
+
+  // Result graph should be:
+  // "add0" = "const0" + "const0"
+  // "add1" = "add0" + "add0"
+  // "add2" = "const0" + "const0"
+  node_index = g.BuildNodeNameIndex();
+  EXPECT_EQ(node_index.size(), 6);
+  EXPECT_EQ(node_index["add0"]->def().input(0), "const0:0");
+  EXPECT_EQ(node_index["add0"]->def().input(1), "const0:0");
+  EXPECT_EQ(node_index["add1"]->def().input(0), "add0:0");
+  EXPECT_EQ(node_index["add1"]->def().input(1), "add0:0");
+  EXPECT_EQ(node_index["add2"]->def().input(0), "const0:0");
+  EXPECT_EQ(node_index["add2"]->def().input(1), "const0:0");
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
index 2ce6fa73fc448ca83fa392aa909cb385453eb8b6..d334100aa4a915a87fb05d371e0e3379a7ee05f2 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
@@ -195,8 +195,11 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
         e->dst()->attrs().Find(kXlaClusterAttr) == nullptr &&
         e->dst()->type_string() != kXlaClusterOutput) {
       return errors::InvalidArgument(
-          "Undeclared output of XLA computation. A common cause of this error "
-          "is variable initializers that depend on the XLA computation. Edge: ",
+          "Undeclared output of XLA computation. Some common causes of this "
+          "error are: 1) variable initializers that depend on the XLA "
+          "computation; 2) gradient computations that depend on the XLA "
+          "computation, which can be mitigated by moving gradient computations "
+          "inside XLA computation. Offending edge: ",
           e->src()->name(), ":", e->src_output(), " -> ", e->dst()->name(), ":",
           e->dst_input());
     }
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
index 22531a4acea3f130175c7cb2e03fcb7570926094..192e1c7b32467d80cef6ff61a1c7078f8dea9dfb 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
@@ -256,7 +256,7 @@ TEST(EncapsulateXlaComputations, Encapsulate) {
 
   TF_ASSERT_OK(EncapsulateXlaComputationsPass::Encapsulate(&graph, &flib_def));
 
-  std::unordered_map<string, Node*> index = BuildNodeIndex(*graph);
+  std::unordered_map<string, Node*> index = graph->BuildNodeNameIndex();
   string function = index.at("launch0")->type_string();
 
   // Tests the outer graph is as expected.
@@ -291,7 +291,8 @@ TEST(EncapsulateXlaComputations, Encapsulate) {
   // function. Encapsulation should be deterministic to avoid recompilation.
   TF_ASSERT_OK(
       EncapsulateXlaComputationsPass::Encapsulate(&graph_copy, &flib_def));
-  std::unordered_map<string, Node*> index_copy = BuildNodeIndex(*graph_copy);
+  std::unordered_map<string, Node*> index_copy =
+      graph_copy->BuildNodeNameIndex();
   string function_copy = index_copy.at("launch0")->type_string();
   EXPECT_EQ(function, function_copy);
 }
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e3c7e2f89be9b37b51a633dabb099969c181013f
--- /dev/null
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
@@ -0,0 +1,941 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/extract_outside_compilation_pass.h"
+
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
+#include "tensorflow/compiler/jit/encapsulate_util.h"
+#include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Add a key placeholder node to the graph. The key placeholder node will be
+// used as input for XlaRecvAtHost/XlaSendFromHost nodes.
+xla::StatusOr<Node*> AddHostComputeKeyPlaceholder(
+    const string& xla_cluster_name, Graph* g) {
+  NodeDef key_def;
+  NodeDefBuilder builder(absl::StrCat(xla_cluster_name, "_key_placeholder"),
+                         "Placeholder");
+  builder.Attr("dtype", DT_STRING);
+  builder.Attr("shape", PartialTensorShape({2}));
+  builder.Attr("_host_compute_call_node", xla_cluster_name);
+  Status s = builder.Finalize(&key_def);
+  if (!s.ok()) return s;
+
+  Node* n = g->AddNode(key_def, &s);
+  if (!s.ok()) return s;
+  return n;
+}
+
+// Returns if the node is a XLA computation key placeholder.
+bool IsKeyPlaceholderNode(const Node& n) {
+  return n.type_string() == "Placeholder" &&
+         absl::EndsWith(n.name(), "_key_placeholder");
+}
+
+// Returns nodes with given type.
+std::vector<Node*> GatherNodesWithType(const Graph& g, const string& type) {
+  std::vector<Node*> result;
+  for (Node* n : g.nodes()) {
+    if (n->type_string() == type) {
+      result.push_back(n);
+    }
+  }
+  return result;
+}
+
+// Gets data types from `arg_nodes` and fills them into `recv_at_host_dtypes`.
+Status GetArgDataTypes(const std::vector<Node*>& arg_nodes,
+                       std::vector<DataType>* recv_at_host_dtypes) {
+  recv_at_host_dtypes->resize(arg_nodes.size(), DT_INVALID);
+  for (auto* n : arg_nodes) {
+    int index;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+    DataType dtype;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "T", &dtype));
+    (*recv_at_host_dtypes)[index] = dtype;
+  }
+  for (int i = 0; i < recv_at_host_dtypes->size(); i++) {
+    if ((*recv_at_host_dtypes)[i] == DT_INVALID) {
+      return errors::Internal("Cannot get datatype for input ", i);
+    }
+  }
+  return Status::OK();
+}
+
+// Builds XlaRecvAtHost node.
+xla::StatusOr<Node*> BuildRecvAtHostNode(
+    Graph* g, const string& oc_cluster_name,
+    const std::vector<DataType>& recv_at_host_dtypes, Node* key_placeholder) {
+  NodeDefBuilder recv_at_host_builder(
+      absl::StrCat("outside_compilation_", oc_cluster_name, "_recv"),
+      "_XlaRecvAtHost");
+  NodeDef recv_at_host_def;
+  recv_at_host_builder.Attr("Toutputs", recv_at_host_dtypes);
+  // The correct device_ordinal will be inserted during replication in a
+  // subsequent rewrite.
+  recv_at_host_builder.Attr("device_ordinal", 0);
+  recv_at_host_builder.Attr(
+      "key", absl::StrCat("host_compute_channel_", oc_cluster_name));
+  recv_at_host_builder.Input(key_placeholder->name(), 0, DT_STRING);
+  TF_RETURN_IF_ERROR(recv_at_host_builder.Finalize(&recv_at_host_def));
+  Status s;
+  Node* recv_at_host_node = g->AddNode(recv_at_host_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  return recv_at_host_node;
+}
+
+// Builds XlaRecvAtHost node, and replaces all _Arg nodes with it.
+xla::StatusOr<Node*> ReplaceArgNodesWithRecvAtHostNode(
+    Graph* g, const string& oc_cluster_name,
+    std::vector<DataType>* recv_at_host_dtypes, Node* key_placeholder) {
+  // TODO(b/77601805): use out nodes for source node, instead of traversing all
+  // nodes.
+  std::vector<Node*> arg_nodes = GatherNodesWithType(*g, "_Arg");
+  TF_RETURN_IF_ERROR(GetArgDataTypes(arg_nodes, recv_at_host_dtypes));
+  TF_ASSIGN_OR_RETURN(
+      Node * recv_at_host_node,
+      BuildRecvAtHostNode(g, oc_cluster_name, *recv_at_host_dtypes,
+                          key_placeholder));
+  for (auto* n : arg_nodes) {
+    int index;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+    // Record out edges and remove `n` before adding those edges to RecvAtHost.
+    // This is to avoid multiple producers.
+    std::vector<OutEdgeInfo> out_edge_info;
+    for (auto edge : n->out_edges()) {
+      out_edge_info.push_back(
+          {edge->dst(), edge->src_output(), edge->dst_input()});
+    }
+    g->RemoveNode(n);
+    for (const OutEdgeInfo& edge : out_edge_info) {
+      if (edge.dst_input == Graph::kControlSlot) {
+        g->AddControlEdge(recv_at_host_node, edge.dst);
+      } else {
+        g->AddEdge(recv_at_host_node, index, edge.dst, edge.dst_input);
+      }
+    }
+
+    // Rewrite dst nodes because their input changed.
+    for (int i = 0; i < out_edge_info.size(); i++) {
+      const OutEdgeInfo edge = out_edge_info[i];
+      if (edge.dst_input == Graph::kControlSlot) {
+        continue;
+      }
+
+      Node* dst = edge.dst;
+      NodeDef new_def = dst->def();
+      *new_def.mutable_input(edge.dst_input) =
+          absl::StrCat(recv_at_host_node->name(), ":", index);
+      TF_ASSIGN_OR_RETURN(Node * dst_replace, ReplaceNode(g, dst, new_def));
+
+      // Other edges might have `dst` as dst node as well. Update those edges
+      // with `dst_replace`.
+      for (int j = i + 1; j < out_edge_info.size(); j++) {
+        if (out_edge_info[j].dst == dst) {
+          out_edge_info[j].dst = dst_replace;
+        }
+      }
+    }
+  }
+  g->AddEdge(key_placeholder, 0, recv_at_host_node, 0);
+  return recv_at_host_node;
+}
+
+// Gets data types from `ret_nodes` and fills them into `send_from_host_dtypes`.
+Status GetRetDataTypes(const std::vector<Node*>& ret_nodes,
+                       std::vector<DataType>* send_from_host_dtypes) {
+  send_from_host_dtypes->resize(ret_nodes.size(), DT_INVALID);
+  for (auto* n : ret_nodes) {
+    int index;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+    DataType dtype;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "T", &dtype));
+    (*send_from_host_dtypes)[index] = dtype;
+  }
+  for (int i = 0; i < send_from_host_dtypes->size(); i++) {
+    if ((*send_from_host_dtypes)[i] == DT_INVALID) {
+      return errors::Internal("Cannot get datatype for output ", i);
+    }
+  }
+  return Status::OK();
+}
+
+// Builds XlaSendFromHost node.
+xla::StatusOr<Node*> BuildSendFromHostNode(
+    Graph* g, const string& oc_cluster_name,
+    const std::vector<Node*>& ret_nodes,
+    const std::vector<DataType>& send_from_host_dtypes, Node* key_placeholder) {
+  NodeDefBuilder send_from_host_builder(
+      absl::StrCat("outside_compilation_", oc_cluster_name, "_send"),
+      "_XlaSendFromHost");
+  NodeDef send_from_host_def;
+  send_from_host_builder.Attr("Tinputs", send_from_host_dtypes);
+  // The correct device_ordinal will be inserted during replication in a
+  // subsequent rewrite.
+  send_from_host_builder.Attr("device_ordinal", 0);
+  send_from_host_builder.Attr(
+      "key", absl::StrCat("host_compute_channel_", oc_cluster_name));
+  std::vector<NodeDefBuilder::NodeOut> inputs(send_from_host_dtypes.size());
+  for (auto* n : ret_nodes) {
+    int index;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+    if (index < 0 || index >= send_from_host_dtypes.size()) {
+      return errors::Internal("Invalid _Retval index: ", index);
+    }
+    for (auto edge : n->in_edges()) {
+      inputs[index] =
+          NodeDefBuilder::NodeOut{edge->src()->name(), edge->src_output(),
+                                  edge->src()->output_type(edge->src_output())};
+    }
+  }
+  send_from_host_builder.Input(inputs);
+  send_from_host_builder.Input(key_placeholder->name(), 0, DT_STRING);
+  TF_RETURN_IF_ERROR(send_from_host_builder.Finalize(&send_from_host_def));
+  Status s;
+  Node* send_from_host_node = g->AddNode(send_from_host_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  return send_from_host_node;
+}
+
+// Builds XlaSendFromHost node, and replaces all _Retval nodes with it.
+xla::StatusOr<Node*> ReplaceRetNodesWithSendFromHostNode(
+    Graph* g, const string& oc_cluster_name,
+    std::vector<DataType>* send_from_host_dtypes, Node* key_placeholder) {
+  // TODO(b/77601805): use in nodes for sink node, instead of traversing all
+  // nodes.
+  std::vector<Node*> ret_nodes = GatherNodesWithType(*g, "_Retval");
+  TF_RETURN_IF_ERROR(GetRetDataTypes(ret_nodes, send_from_host_dtypes));
+  TF_ASSIGN_OR_RETURN(
+      Node * send_from_host_node,
+      BuildSendFromHostNode(g, oc_cluster_name, ret_nodes,
+                            *send_from_host_dtypes, key_placeholder));
+  for (auto* n : ret_nodes) {
+    int index;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+    for (auto edge : n->in_edges()) {
+      if (edge->src_output() == Graph::kControlSlot) {
+        g->AddControlEdge(edge->src(), send_from_host_node);
+      } else {
+        g->AddEdge(edge->src(), edge->src_output(), send_from_host_node, index);
+      }
+    }
+    g->RemoveNode(n);
+  }
+  g->AddEdge(key_placeholder, 0, send_from_host_node,
+             send_from_host_dtypes->size());
+  return send_from_host_node;
+}
+
+// Returns input shapes (excluding key placeholder) for `send_from_host_node`
+// if they are all fully defined; absl::nullopt otherwise.
+absl::optional<std::vector<PartialTensorShape>> GetInferredInputShapes(
+    int num_inputs, Node* send_from_host_node) {
+  std::vector<PartialTensorShape> results(num_inputs);
+  for (int i = 0; i < num_inputs; i++) {
+    const Edge* e;
+    if (!send_from_host_node->input_edge(i, &e).ok()) {
+      return absl::nullopt;
+    }
+
+    std::vector<PartialTensorShape> shapes;
+    if (!GetNodeAttr(e->src()->attrs(), kXlaInferredShapesAttrName, &shapes)
+             .ok()) {
+      return absl::nullopt;
+    }
+
+    const PartialTensorShape shape = shapes[e->src_output()];
+    if (!shape.IsFullyDefined()) {
+      return absl::nullopt;
+    }
+
+    results[e->dst_input()] = shape;
+  }
+  return results;
+}
+
+// Builds XlaHostCompute NodeDef from the outside compilation call node.
+xla::StatusOr<NodeDef> BuildXlaHostComputeNodeDef(
+    const Node* call_node, const std::map<string, int>& host_compute_core) {
+  string original_oc_name;
+  TF_RETURN_IF_ERROR(GetNodeAttr(
+      call_node->attrs(), "_outside_compilation_subgraph", &original_oc_name));
+  NodeDefBuilder host_compute_builder(
+      absl::StrCat("outside_compilation_", original_oc_name, "_host_compute"),
+      "XlaHostCompute");
+
+  // Copy all attributes.
+  for (auto attr : call_node->attrs()) {
+    host_compute_builder.Attr(attr.first, attr.second);
+  }
+
+  // Populate tpu_core assignment.
+  const auto iter = host_compute_core.find(original_oc_name);
+  if (iter != host_compute_core.end()) {
+    int core = iter->second;
+    host_compute_builder.Attr("tpu_core", core);
+  }
+
+  // Populate inputs.
+  std::vector<DataType> input_dtypes;
+  TF_RETURN_IF_ERROR(GetNodeAttr(call_node->attrs(), "Tinputs", &input_dtypes));
+  std::vector<NodeDefBuilder::NodeOut> inputs(input_dtypes.size());
+  for (auto e : call_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    if (e->dst_input() < 0 || e->dst_input() >= input_dtypes.size()) {
+      return errors::Internal("Invalid dst_input: ", e->dst_input());
+    }
+    inputs[e->dst_input()] = NodeDefBuilder::NodeOut{
+        e->src()->name(), e->src_output(), input_dtypes[e->dst_input()]};
+  }
+  host_compute_builder.Input(inputs);
+
+  NodeDef new_def;
+  TF_RETURN_IF_ERROR(host_compute_builder.Finalize(&new_def));
+  return new_def;
+}
+
+// Replace outside compilation function call node with XlaHostCompute node.
+// If the function call node has no input/output edges, we will just remove it
+// and not create a XlaHostCompute node.
+Status ReplaceOrRemoveOutsideCompilationCallNode(
+    Graph* g, Node* call_node, const std::map<string, int>& host_compute_core) {
+  // If the function call node has no input/output edges, just remove it.
+  bool has_edge = false;
+  for (auto e : call_node->in_edges()) {
+    if (!e->IsControlEdge() || e->src() != g->source_node()) {
+      has_edge = true;
+      break;
+    }
+  }
+  for (auto e : call_node->out_edges()) {
+    if (!e->IsControlEdge() || e->dst() != g->sink_node()) {
+      has_edge = true;
+      break;
+    }
+  }
+  if (!has_edge) {
+    VLOG(4) << "Did not add HostCompute node for " << call_node->DebugString();
+    g->RemoveNode(call_node);
+    return Status::OK();
+  }
+
+  // Build XlaHostCompute NodeDef.
+  TF_ASSIGN_OR_RETURN(NodeDef node_def,
+                      BuildXlaHostComputeNodeDef(call_node, host_compute_core));
+  TF_ASSIGN_OR_RETURN(Node * host_compute_node,
+                      ReplaceNode(g, call_node, node_def));
+  VLOG(4) << "Added HostCompute node: " << host_compute_node->DebugString();
+
+  return Status::OK();
+}
+
+// For an XLA computation, builds host side graph given all outside compilation
+// graphs inside it. The host side graph contains:
+// 1) a "sequencer" node (we will add control edge between XlaRecvAtHost and
+//    XlaSendFromHost to this sequencer node, so all outside compilation nodes
+//    will be executed *before* this sequencer).
+// 2) a "key placeholder" node. Later in ExpandHostGraphIntoMainGraph(), we will
+//    replace this node with compilation result node.
+// 3) all outside compilation graphs.
+Status ConstructHostGraph(
+    const string& xla_cluster_name, const string& outside_compilation_attr_name,
+    const std::vector<string>& outside_compilation_host_graphs,
+    FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph) {
+  host_graph->reset(new Graph(fld));
+
+  // Create sequencer node in host graph.
+  NodeDefBuilder sequencer_builder(absl::StrCat(xla_cluster_name, "_sequencer"),
+                                   "NoOp");
+  sequencer_builder.Attr("_xla_host_transfer_sequencer", xla_cluster_name);
+  NodeDef sequencer_def;
+  TF_RETURN_IF_ERROR(sequencer_builder.Finalize(&sequencer_def));
+  Status s;
+  Node* sequencer = (*host_graph)->AddNode(sequencer_def, &s);
+  TF_RETURN_IF_ERROR(s);
+
+  // Create key placeholder in host graph.
+  TF_ASSIGN_OR_RETURN(
+      Node * key_placeholder,
+      AddHostComputeKeyPlaceholder(xla_cluster_name, host_graph->get()));
+
+  // For each outside compilation graph, copy them to host graph with the
+  // following changes:
+  // a) Use key_placeholder in host graph instead of its own.
+  // b) Add control edge from RecvAtHost/SendFromHost to sequencer.
+  // c) Clear node_def.device(), so device placer won't get confused.
+  for (const string& host_func : outside_compilation_host_graphs) {
+    VLOG(4) << "Expanding host graph " << host_func;
+    FunctionBody* host_fbody = nullptr;
+    TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+        *fld->Find(host_func), AttrSlice(), fld,
+        [&](const string& op, const OpDef** sig) {
+          return fld->LookUpOpDef(op, sig);
+        },
+        &host_fbody));
+    std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+
+    // We use ReverseDFS() to copy nodes. Make sure all nodes are reverse
+    // reachable from sink node so all nodes will be copied.
+    // TODO(b/77601805): consolidate copy graph functions.
+    FixupSourceAndSinkEdges(host_fbody->graph);
+
+    std::map<const Node*, Node*> node_map;
+    node_map[host_fbody->graph->source_node()] = (*host_graph)->source_node();
+    node_map[host_fbody->graph->sink_node()] = (*host_graph)->sink_node();
+    Status s;
+    ReverseDFS(
+        *host_fbody->graph, /*enter=*/nullptr,
+        [&](const Node* n) {
+          if (!s.ok()) {
+            return;
+          }
+
+          Node* copy;
+          if (node_map.find(n) != node_map.end()) {
+            // Already copied this node.
+            copy = node_map.at(n);
+          } else if (IsKeyPlaceholderNode(*n)) {
+            // Change a).
+            copy = key_placeholder;
+            node_map[n] = copy;
+          } else {
+            // Copy the node.
+            NodeDef copy_def = n->def();
+            // Change c).
+            copy_def.clear_device();
+            copy = (*host_graph)->AddNode(copy_def, &s);
+            if (!s.ok()) {
+              return;
+            }
+            node_map[n] = copy;
+          }
+
+          // Only handle input edges. Output edges will be added later as
+          // its output nodes' input edges.
+          for (auto e : n->in_edges()) {
+            if (node_map.find(e->src()) == node_map.end()) {
+              s = errors::Internal("Cannot find node image for ",
+                                   e->src()->DebugString());
+              return;
+            }
+            (*host_graph)
+                ->AddEdge(node_map[e->src()], e->src_output(), copy,
+                          e->dst_input());
+          }
+
+          // Change b).
+          if (copy->type_string() == "_XlaRecvAtHost" ||
+              copy->type_string() == "_XlaSendFromHost") {
+            (*host_graph)->AddControlEdge(copy, sequencer);
+          }
+        },
+        NodeComparatorID());
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  // sequencer and key_placeholder might be dead nodes. Prune them if necessary.
+  // - sequencer should be pruned iff it has no input control edges from
+  //   RecvAtHost/SendFromHost. If it has input control edge, we connect it to
+  //   sink node so it won't be pruned.
+  // - key_placeholder should be pruned iff there's no RecvAtHost/SendFromHost.
+  //   We don't need to do anything special.
+  if (!sequencer->in_edges().empty()) {
+    (*host_graph)->AddControlEdge(sequencer, (*host_graph)->sink_node());
+  }
+  PruneForReverseReachability(
+      host_graph->get(),
+      std::unordered_set<const Node*>{(*host_graph)->sink_node()});
+
+  // Postprocess edges between different outside compilations.
+  TF_RETURN_IF_ERROR(PostprocessEdgesBetweenOutsideCompilations(
+      host_graph->get(), outside_compilation_attr_name));
+
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("extract_outside_compilation_host_graph_for_",
+                     xla_cluster_name),
+        **host_graph, fld);
+  }
+
+  return Status::OK();
+}
+
+// Expand XLA computation's outside compilation host side graph into main graph.
+// Add a control edge between sequencer node and the XLA computation node.
+Status ExpandHostGraphIntoMainGraph(Graph* main_graph, Graph* host_graph,
+                                    Node* xla_computation_node) {
+  // We use ReverseDFS() to copy nodes. Make sure all nodes are reverse
+  // reachable from sink node so all nodes will be copied.
+  // TODO(b/77601805): consolidate copy graph functions.
+  FixupSourceAndSinkEdges(host_graph);
+
+  // Copy all nodes.
+  std::map<const Node*, Node*> node_map;
+  node_map[host_graph->source_node()] = main_graph->source_node();
+  node_map[host_graph->sink_node()] = main_graph->sink_node();
+  Status s = Status::OK();
+  auto copy_node_fn = [&](const Node* n) {
+    if (!s.ok()) {
+      return;
+    }
+
+    Node* copy;
+    if (node_map.find(n) != node_map.end()) {
+      // Already copied this node.
+      copy = node_map.at(n);
+    } else {
+      // Copy the node.
+      NodeDef copy_def = n->def();
+      copy = main_graph->AddNode(copy_def, &s);
+      if (!s.ok()) {
+        return;
+      }
+      node_map[n] = copy;
+    }
+
+    // Only handle input edges. Output edges will be added later as its output
+    // nodes' input edges.
+    for (auto e : n->in_edges()) {
+      if (node_map.find(e->src()) == node_map.end()) {
+        s = errors::Internal("Cannot find node image for ",
+                             e->src()->DebugString());
+        return;
+      }
+      main_graph->AddEdge(node_map[e->src()], e->src_output(), copy,
+                          e->dst_input());
+    }
+
+    // Add control edge from sequencer to XLA computation node.
+    if (copy->type_string() == "NoOp" &&
+        HasNodeAttr(copy->def(), "_xla_host_transfer_sequencer")) {
+      main_graph->AddControlEdge(copy, xla_computation_node);
+    }
+  };
+  ReverseDFS(*host_graph, /*enter=*/nullptr, copy_node_fn, NodeComparatorID());
+  return s;
+}
+
+// Rewrites shape inference graph for outside compilation.
+// 1. If the outside compilation is a "top-level" one (not in a function of any
+//    If/While/etc.), this shape inference graph might have host computation to
+//    outside compilation placeholder nodes, which will cause shape inference to
+//    fail. However, those nodes are not in `host_graph` any more (because we
+//    have executed `PostprocessForEncapsultion`). In this case, we clear the
+//    graph, and copy SendFromHost with all its predecessors from `host_graph`.
+//    This case is detected by whether the SendFromHost node exists in
+//    `host_graph` as well.
+// 2. Remove control edges, and prune nodes that are not useful for shape
+//    inference.
+Status RewriteShapeInferenceGraph(const string& shape_inference_graph_name,
+                                  Graph* host_graph,
+                                  FunctionLibraryDefinition* fld) {
+  FunctionBody* fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(shape_inference_graph_name), AttrSlice(), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  Graph* g = fbody->graph;
+
+  // Find SendFromHost node.
+  Node* send_from_host = nullptr;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "_XlaSendFromHost") {
+      send_from_host = n;
+      break;
+    }
+  }
+  if (!send_from_host) {
+    return errors::Internal("Shape inference graph ",
+                            shape_inference_graph_name,
+                            " does not have _XlaSendFromHost node.");
+  }
+
+  // See if the SendFromHost node exists in `host_graph`.
+  Node* send_from_host_main_graph = nullptr;
+  for (Node* n : host_graph->nodes()) {
+    if (n->name() == send_from_host->name()) {
+      send_from_host_main_graph = n;
+      break;
+    }
+  }
+  if (send_from_host_main_graph) {
+    // This is an "top-level" outside compilation. Clear the graph, and copy
+    // SendFromHost and all its predecessors from `host_graph`.
+    std::vector<Node*> nodes;
+    for (Node* n : g->op_nodes()) {
+      nodes.push_back(n);
+    }
+    for (Node* n : nodes) {
+      g->RemoveNode(n);
+    }
+
+    std::map<const Node*, Node*> node_map;
+    node_map[host_graph->source_node()] = g->source_node();
+    Status s;
+    auto copy_node_fn = [&](const Node* n) {
+      if (!s.ok()) {
+        return;
+      }
+
+      if (node_map.find(n) != node_map.end()) {
+        return;
+      }
+
+      NodeDef copy_def = n->def();
+      Node* copy = g->AddNode(copy_def, &s);
+      if (!s.ok()) {
+        return;
+      }
+      for (auto e : n->in_edges()) {
+        if (node_map.find(e->src()) == node_map.end()) {
+          s = errors::Internal("Cannot find node image for ",
+                               e->src()->DebugString());
+          return;
+        }
+        g->AddEdge(node_map[e->src()], e->src_output(), copy, e->dst_input());
+      }
+
+      node_map[n] = copy;
+    };
+    // TODO(b/77601805): consolidate copy graph functions.
+    ReverseDFSFrom(*host_graph,
+                   std::vector<const Node*>{send_from_host_main_graph},
+                   /*enter=*/nullptr, copy_node_fn, NodeComparatorID());
+    if (!s.ok()) {
+      return s;
+    }
+
+    send_from_host = node_map[send_from_host_main_graph];
+  } else {
+    // This is an outside compilation embedded in If/While/gradient/etc.
+    // It will be enough for shape inference. Leave `g` unchanged.
+  }
+
+  // Control edges are not useful for shape inference. Remove them.
+  for (auto e : g->edges()) {
+    if (e->IsControlEdge()) {
+      g->RemoveEdge(e);
+    }
+  }
+  // Nodes that are not reverse reachable from SendFromHost are not useful for
+  // shape inference. Prune them.
+  PruneForReverseReachability(g,
+                              std::unordered_set<const Node*>{send_from_host});
+
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(shape_inference_graph_name, *g, fld);
+  }
+
+  // Replace original shape inference graph.
+  FunctionDef fdef_replace;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*g, shape_inference_graph_name, &fdef_replace));
+  TF_RETURN_IF_ERROR(
+      fld->ReplaceFunction(shape_inference_graph_name, fdef_replace));
+
+  return Status::OK();
+}
+
+}  // namespace
+
+Status RewriteOutsideCompilationSubgraphFn::operator()(
+    const std::vector<OutputTensor>& arg_source_tensors,
+    std::unique_ptr<Graph>* graph, std::vector<int>* input_permutation,
+    std::vector<int>* output_permutation, NodeDef* node_def) {
+  string old_name = node_def->op();
+  string new_name = absl::StrCat(xla_cluster_name_, "_", old_name);
+  node_def->set_op(new_name);
+  node_def->set_name(new_name);
+
+  // Later we will run PruneForReverseReachability(), so make sure all original
+  // nodes are reachable from sink node and won't be removed.
+  FixupSourceAndSinkEdges(graph->get());
+
+  // Step 1: create a key placeholder node.
+  TF_ASSIGN_OR_RETURN(
+      Node * key_placeholder,
+      AddHostComputeKeyPlaceholder(xla_cluster_name_, graph->get()));
+
+  // Step 2: build RecvAtHost node, and replace all _Arg nodes with it.
+  std::vector<DataType> recv_at_host_dtypes;
+  TF_ASSIGN_OR_RETURN(
+      Node * recv_at_host_node,
+      ReplaceArgNodesWithRecvAtHostNode(graph->get(), new_name,
+                                        &recv_at_host_dtypes, key_placeholder));
+
+  // Step 3: build SendFromHost node, and replace all _Retval nodes with it.
+  std::vector<DataType> send_from_host_dtypes;
+  TF_ASSIGN_OR_RETURN(
+      Node * send_from_host_node,
+      ReplaceRetNodesWithSendFromHostNode(
+          graph->get(), new_name, &send_from_host_dtypes, key_placeholder));
+
+  // Step 4: add XLA cluster and outside compilation attr.
+  for (Node* n : (*graph)->nodes()) {
+    if (IsKeyPlaceholderNode(*n)) {
+      continue;
+    }
+
+    n->AddAttr(xla_cluster_attr_name_, xla_cluster_name_);
+    n->AddAttr(outside_compilation_attr_name_, old_name);
+  }
+
+  // Check whether we have all input shapes for XlaSendFromHost. If we do, we
+  // will set `shapes` attr for the call node; otherwise we will save the
+  // shape inference graph and set `shape_inference_graph` for the call node.
+  absl::optional<std::vector<PartialTensorShape>> shapes =
+      GetInferredInputShapes(send_from_host_dtypes.size(), send_from_host_node);
+  for (Node* n : (*graph)->nodes()) {
+    n->ClearAttr(kXlaInferredShapesAttrName);
+  }
+
+  // Step 5: add control edges for originally XLA <-> outside compilation
+  // control edges.
+  for (Node* n : (*graph)->nodes()) {
+    if (HasNodeAttr(n->def(), kXlaConnectedToXlaComputationAttrName)) {
+      (*graph)->AddControlEdge(n, send_from_host_node);
+      n->ClearAttr(kXlaConnectedToXlaComputationAttrName);
+    }
+    if (HasNodeAttr(n->def(), kXlaConnectedFromXlaComputationAttrName)) {
+      (*graph)->AddControlEdge(recv_at_host_node, n);
+      n->ClearAttr(kXlaConnectedFromXlaComputationAttrName);
+    }
+  }
+
+  // Step 6: RecvAtHost/SendFromHost/key_placeholder might be dead nodes. Prune
+  // them if necessary.
+  // - RecvAtHost should be pruned iff it has no output data/control edges. If
+  //   it has any output edge, it will be reverse reachable from sink node. We
+  //   don't need to do anything special.
+  // - SendFromHost should be pruned iff it has no input data/control edges. If
+  //   it has input edges other than key_placeholder, we connect it to sink
+  //   node so it won't be pruned.
+  // - key_placeholder should be pruned iff RecvAtHost/SendFromHost are pruned.
+  //   We don't need to do anything special.
+  if (send_from_host_node->in_edges().size() > 1) {
+    (*graph)->AddControlEdge(send_from_host_node, (*graph)->sink_node());
+  }
+  PruneForReverseReachability(
+      graph->get(), std::unordered_set<const Node*>{(*graph)->sink_node()});
+
+  // Step 7: add necessary attributes to function call node, so we can replace
+  // it with HostCompute node later.
+  AddNodeAttr("_outside_compilation_subgraph", old_name, node_def);
+  if (shapes) {
+    AddNodeAttr("shape_inference_graph", "", node_def);
+    AddNodeAttr("shapes", *shapes, node_def);
+  } else {
+    string shape_inference_func_name =
+        absl::StrCat("_outside_compilation_shape_inference_", new_name);
+    AddNodeAttr("shape_inference_graph", shape_inference_func_name, node_def);
+    AddNodeAttr("shapes", std::vector<TensorShapeProto>{}, node_def);
+  }
+  AddNodeAttr("ancestors", std::vector<string>{}, node_def);
+  AddNodeAttr("Tinputs", recv_at_host_dtypes, node_def);
+  AddNodeAttr("Toutputs", send_from_host_dtypes, node_def);
+  AddNodeAttr("key", absl::StrCat("host_compute_channel_", new_name), node_def);
+
+  return Status::OK();
+}
+
+Status ExtractOutsideCompilationForFunction(
+    const string& xla_cluster_attr_name,
+    const string& outside_compilation_attr_name, const string& xla_cluster_name,
+    const NameAttrList& func_name_attrs, const string& new_func_name,
+    const std::map<string, int>& host_compute_core,
+    FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph,
+    std::vector<string>* shape_inference_graphs,
+    bool* has_outside_compilation) {
+  // Early return if function does not have any outside compilation nodes.
+  const string& func_name = func_name_attrs.name();
+  const FunctionDef* fdef = fld->Find(func_name);
+  if (!fdef) {
+    return errors::Internal("Cannot find function ", func_name);
+  }
+  *has_outside_compilation = false;
+  for (auto& node_def : fdef->node_def()) {
+    if (HasNodeAttr(node_def, outside_compilation_attr_name)) {
+      *has_outside_compilation = true;
+      break;
+    }
+  }
+  if (!has_outside_compilation) {
+    return Status::OK();
+  }
+
+  // Convert the function to graph.
+  FunctionBody* fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(func_name), AttrSlice(&func_name_attrs.attr()), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+
+  // Preprocess edges between different outside compilations. They will be
+  // restored in `ConstructHostGraph()`.
+  TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations(
+      fbody->graph, outside_compilation_attr_name));
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("extract_outside_compilation_for_func_before_", func_name),
+        *fbody->graph, fld);
+  }
+
+  // Encapsulate outside_compilation cluster into function call node.
+  std::unique_ptr<Graph> graph_out;
+  RewriteOutsideCompilationSubgraphFn rewrite_fn(
+      xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name);
+  TF_RETURN_IF_ERROR(EncapsulateSubgraphsInFunctions(
+      outside_compilation_attr_name, "", *fbody->graph, rewrite_fn,
+      /*reuse_existing_functions=*/true, &graph_out, fld));
+
+  // Replace outside_compilation function nodes with HostCompute ops.
+  std::vector<Node*> outside_compilation_nodes;
+  std::vector<string> outside_compilation_host_graphs;
+  for (Node* n : graph_out->nodes()) {
+    if (HasNodeAttr(n->def(), "_outside_compilation_subgraph")) {
+      outside_compilation_nodes.push_back(n);
+      outside_compilation_host_graphs.push_back(n->name());
+
+      // If we could not infer shapes for XlaSendFromHost inputs statically, we
+      // will set the "shape_inference_graph" attribute. In that case, copy
+      // outside compilation subgraph as shape inference graph in `fld`.
+      string shape_inference_graph;
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "shape_inference_graph",
+                                     &shape_inference_graph));
+      if (!shape_inference_graph.empty()) {
+        shape_inference_graphs->push_back(shape_inference_graph);
+
+        const FunctionDef* xla_fdef = fld->Find(n->name());
+        if (!xla_fdef) {
+          return errors::Internal("Cannot find XLA function ", n->name());
+        }
+        FunctionDef shape_inference_fdef = *xla_fdef;
+        shape_inference_fdef.mutable_signature()->set_name(
+            shape_inference_graph);
+        if (fld->Find(shape_inference_graph)) {
+          TF_RETURN_IF_ERROR(fld->ReplaceFunction(shape_inference_graph,
+                                                  shape_inference_fdef));
+        } else {
+          TF_RETURN_IF_ERROR(fld->AddFunctionDef(shape_inference_fdef));
+        }
+      }
+    }
+  }
+  for (Node* n : outside_compilation_nodes) {
+    TF_RETURN_IF_ERROR(ReplaceOrRemoveOutsideCompilationCallNode(
+        graph_out.get(), n, host_compute_core));
+  }
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("extract_outside_compilation_for_func_after_", func_name),
+        *graph_out, fld);
+  }
+
+  // Construct host graph.
+  if (!outside_compilation_host_graphs.empty()) {
+    TF_RETURN_IF_ERROR(
+        ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name,
+                           outside_compilation_host_graphs, fld, host_graph));
+  }
+
+  // Remove the outside compilation graphs from function library.
+  for (const string& func : outside_compilation_host_graphs) {
+    TF_RETURN_IF_ERROR(fld->RemoveFunction(func));
+  }
+
+  // Replace original function.
+  FunctionDef updated_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*graph_out, new_func_name, &updated_fdef));
+  if (fld->Find(new_func_name)) {
+    TF_RETURN_IF_ERROR(fld->ReplaceFunction(new_func_name, updated_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(updated_fdef));
+  }
+
+  return Status::OK();
+}
+
+Status ExtractOutsideCompilation(
+    const string& xla_cluster_attr_name,
+    const string& outside_compilation_attr_name,
+    const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g,
+    FunctionLibraryDefinition* fld) {
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile("extract_outside_compilation_before", *g, fld);
+  }
+
+  std::vector<string> shape_inference_graphs;
+  for (auto& iter : clusters) {
+    string xla_cluster_name = iter.first;
+    Node* n = iter.second.node;
+    auto const& func_name_attrs = iter.second.func_name_attrs;
+    auto const& host_compute_core = iter.second.host_compute_core;
+
+    bool has_outside_compilation;
+    std::unique_ptr<Graph> host_graph;
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        func_name_attrs, func_name_attrs.name(), host_compute_core, fld,
+        &host_graph, &shape_inference_graphs, &has_outside_compilation));
+    if (host_graph) {
+      TF_RETURN_IF_ERROR(ExpandHostGraphIntoMainGraph(g, host_graph.get(), n));
+    }
+  }
+
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile("extract_outside_compilation_expanded", *g,
+                                fld);
+  }
+
+  TF_RETURN_IF_ERROR(PostprocessForEncapsulation(
+      g, xla_cluster_attr_name, outside_compilation_attr_name, clusters));
+
+  for (auto shape_inference_graph_name : shape_inference_graphs) {
+    TF_RETURN_IF_ERROR(
+        RewriteShapeInferenceGraph(shape_inference_graph_name, g, fld));
+  }
+
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile("extract_outside_compilation_after", *g, fld);
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.h b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a4f07cca213d999202024294f5d8f94527059c3
--- /dev/null
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
@@ -0,0 +1,107 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_EXTRACT_OUTSIDE_COMPILATION_PASS_H_
+#define TENSORFLOW_COMPILER_JIT_EXTRACT_OUTSIDE_COMPILATION_PASS_H_
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/jit/encapsulate_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Rewrite function for outside compilation subgraphs. It will perform the
+// following steps:
+//
+// 1. Add a XLA computation key placeholder node (it will be used as input for
+//    XlaRecvAtHost and XlaSendFromHost);
+// 2. Replace all _Arg nodes with one single XlaRecvAtHost node;
+// 3. Replace all _Retval nodes with one single XlaSendFromHost node;
+// 4. Mark all nodes except key placeholder with attr `xla_cluster_attr_name`
+//    and `outside_compilation_attr_name`;
+// 5. For nodes marked with attr kXlaConnectedToXlaComputationAttrName, add a
+//    control edge from the node to XlaSendFromHost; for nodes marked with attr
+//    kXlaConnectedFromXlaComputationAttrName, add a control edge from
+//    XlaRecvAtHost node to the node;
+// 6. Try pruning XlaRecvAtHost/XlaSendFromHost/key placeholder node.
+// 7. Add necessary attributes to `node_def`, so we can replace it with a
+//    XlaHostCompute node later. If all input shapes for XlaSendFromHost are
+//    known, "shapes" attr will be set to the list of input shapes; otherwise
+//    "shape_inference_graph" attr will be set to shape inference function name.
+class RewriteOutsideCompilationSubgraphFn {
+ public:
+  RewriteOutsideCompilationSubgraphFn(
+      const string& xla_cluster_attr_name,
+      const string& outside_compilation_attr_name,
+      const string& xla_cluster_name)
+      : xla_cluster_attr_name_(xla_cluster_attr_name),
+        outside_compilation_attr_name_(outside_compilation_attr_name),
+        xla_cluster_name_(xla_cluster_name) {}
+
+  Status operator()(const std::vector<OutputTensor>&,
+                    std::unique_ptr<Graph>* graph,
+                    std::vector<int>* input_permutation,
+                    std::vector<int>* output_permutation, NodeDef* node_def);
+
+ private:
+  string xla_cluster_attr_name_;
+  string outside_compilation_attr_name_;
+  string xla_cluster_name_;
+};
+
+// For an XLA computation function, replace all outside compilations with
+// XlaHostCompute nodes. Each outside compilation subgraph will be rewritten by
+// `RewriteOutsideCompilationSubgraphFn`, and they will be merged into one
+// single host side graph (`host_graph`).
+//
+// xla_cluster_attr_name and outside_compilation_attr_name: attr name for XLA
+//   computation and outside compilation. Required for
+//   `RewriteOutsideCompilationSubgraphFn`.
+// xla_cluster_name: XLA cluster name for this XLA computation. We need it
+//   because XLA cluster name might be different from `func_name`.
+// func_name_attrs: they will be used to instantiate the XLA computation func.
+// new_func_name: new function name for rewritten XLA computation func.
+// host_compute_core: mapping from outside compilation cluster name to XLA
+//   device assignment.
+// fld: FunctionLibraryDefinition object.
+// host_graph: Graph object to store host side graph for all outside
+//   compilations within this XLA computation func. If there is no outside
+//   compilation, it will be empty.
+// shape_inference_graphs: a list of outside compilation shape inference
+//   function names. These functions need to be rewritten later.
+// has_outside_compilation: a bool indicating whether this function has any
+//   outside compilation nodes.
+Status ExtractOutsideCompilationForFunction(
+    const string& xla_cluster_attr_name,
+    const string& outside_compilation_attr_name, const string& xla_cluster_name,
+    const NameAttrList& func_name_attrs, const string& new_func_name,
+    const std::map<string, int>& host_compute_core,
+    FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph,
+    std::vector<string>* shape_inference_graphs, bool* has_outside_compilation);
+
+// Rewrites XLA computation in `clusters` to replace outside compilation nodes
+// with XlaHostCompute, and moves those outside compilations into `g`. If shapes
+// of outside compilation outputs cannot be determined now, we will store shape
+// inference graph into `fld`.
+Status ExtractOutsideCompilation(
+    const string& xla_cluster_attr_name,
+    const string& outside_compilation_attr_name,
+    const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g,
+    FunctionLibraryDefinition* fld);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_EXTRACT_OUTSIDE_COMPILATION_PASS_H_
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bff956100da661b679b4557fce53671e6cef88c5
--- /dev/null
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
@@ -0,0 +1,441 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/extract_outside_compilation_pass.h"
+
+#include "absl/strings/match.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/encapsulate_util.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) {
+  // Build the graph:
+  // "add" = "arg0" + "arg1"
+  // "ret0" = "add"
+  // "ret1" = "arg1"
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_INT32, 0);
+  Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_FLOAT, 1);
+  Output arg2 = ops::_Arg(s.WithOpName("arg2"), DT_INT32, 2);
+  Output add = ops::Add(s.WithOpName("add"), arg0, arg0);
+  auto ret0 = ops::_Retval(s.WithOpName("ret0"), add, 0);
+  auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg1, 1);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  TF_CHECK_OK(s.ToGraph(g.get()));
+  auto node_name_image = g->BuildNodeNameIndex();
+  Node *add_node = node_name_image["add"];
+  EXPECT_NE(add_node, nullptr);
+  add_node->AddAttr(kXlaConnectedToXlaComputationAttrName, "cluster");
+  add_node->AddAttr(kXlaConnectedFromXlaComputationAttrName, "cluster");
+
+  RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster");
+  std::vector<OutputTensor> arg_source_tensors;
+  NodeDef call_node_def;
+  call_node_def.set_op("0");
+  TF_CHECK_OK(
+      rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
+  node_name_image = g->BuildNodeNameIndex();
+
+  // Verify step 1: add key placeholder node.
+  Node *key_placeholder = node_name_image["cluster_key_placeholder"];
+  EXPECT_NE(key_placeholder, nullptr);
+  // Verify step 2: replace _Arg nodes with XlaRecvAtHost.
+  for (Node *n : g->nodes()) {
+    EXPECT_NE(n->type_string(), "_Arg");
+  }
+  Node *recv_at_host = node_name_image["outside_compilation_cluster_0_recv"];
+  EXPECT_NE(recv_at_host, nullptr);
+  std::vector<DataType> recv_at_host_dtypes;
+  TF_CHECK_OK(
+      GetNodeAttr(recv_at_host->attrs(), "Toutputs", &recv_at_host_dtypes));
+  EXPECT_EQ(recv_at_host_dtypes.size(), 3);
+  EXPECT_EQ(recv_at_host_dtypes[0], DT_INT32);
+  EXPECT_EQ(recv_at_host_dtypes[1], DT_FLOAT);
+  EXPECT_EQ(recv_at_host_dtypes[2], DT_INT32);
+  // Verify step 3: replace _Retval nodes with XlaSendFromHost.
+  for (Node *n : g->nodes()) {
+    EXPECT_NE(n->type_string(), "_Retval");
+  }
+  Node *send_from_host = node_name_image["outside_compilation_cluster_0_send"];
+  EXPECT_NE(send_from_host, nullptr);
+  std::vector<DataType> send_from_host_dtypes;
+  TF_CHECK_OK(
+      GetNodeAttr(send_from_host->attrs(), "Tinputs", &send_from_host_dtypes));
+  EXPECT_EQ(send_from_host_dtypes.size(), 2);
+  EXPECT_EQ(send_from_host_dtypes[0], DT_INT32);
+  EXPECT_EQ(send_from_host_dtypes[1], DT_FLOAT);
+  // Verify step 4: nodes marked with XLA cluster and outside compilation attr.
+  add_node = node_name_image["add"];
+  EXPECT_NE(add_node, nullptr);
+  EXPECT_TRUE(HasNodeAttr(add_node->def(), "_xla"));
+  EXPECT_TRUE(HasNodeAttr(add_node->def(), "_oc"));
+  // Verify step 5: control edges added.
+  bool has_control_edge_from_recv_at_host = false;
+  for (auto e : add_node->in_edges()) {
+    if (e->IsControlEdge() && e->src() == recv_at_host) {
+      has_control_edge_from_recv_at_host = true;
+    }
+  }
+  EXPECT_TRUE(has_control_edge_from_recv_at_host);
+  bool has_control_edge_to_send_from_host = false;
+  for (auto e : add_node->out_edges()) {
+    if (e->IsControlEdge() && e->dst() == send_from_host) {
+      has_control_edge_to_send_from_host = true;
+    }
+  }
+  EXPECT_TRUE(has_control_edge_to_send_from_host);
+  // Verify step 7: necessary attrs added to call_node_def.
+  string shape_inference_graph;
+  TF_CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()),
+                          "shape_inference_graph", &shape_inference_graph));
+  EXPECT_EQ(shape_inference_graph,
+            "_outside_compilation_shape_inference_cluster_0");
+}
+
+TEST(RewriteOutsideCompilationSubgraphFnTest, NoSendFromHost) {
+  // Build the graph: only 1 node: "arg0"
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_INT32, 0);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  TF_CHECK_OK(s.ToGraph(g.get()));
+
+  RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster");
+  std::vector<OutputTensor> arg_source_tensors;
+  NodeDef call_node_def;
+  call_node_def.set_op("0");
+  TF_CHECK_OK(
+      rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
+  auto node_name_image = g->BuildNodeNameIndex();
+
+  // Check key placeholder and RecvAtHost is present, but SendFromHost is not.
+  Node *key_placeholder = node_name_image["cluster_key_placeholder"];
+  EXPECT_NE(key_placeholder, nullptr);
+  Node *recv_at_host = node_name_image["outside_compilation_cluster_0_recv"];
+  EXPECT_NE(recv_at_host, nullptr);
+  Node *send_from_host = node_name_image["outside_compilation_cluster_0_send"];
+  EXPECT_EQ(send_from_host, nullptr);
+}
+
+TEST(RewriteOutsideCompilationSubgraphFnTest, NoRecvAtHost) {
+  // Build the graph:
+  // "ret" = "const0"
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
+  auto ret = ops::_Retval(s.WithOpName("ret"), const0, 0);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  TF_CHECK_OK(s.ToGraph(g.get()));
+
+  RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster");
+  std::vector<OutputTensor> arg_source_tensors;
+  NodeDef call_node_def;
+  call_node_def.set_op("0");
+  TF_CHECK_OK(
+      rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
+  auto node_name_image = g->BuildNodeNameIndex();
+
+  // Check key placeholder and SendFromHost is present, but RecvAtHost is not.
+  Node *key_placeholder = node_name_image["cluster_key_placeholder"];
+  EXPECT_NE(key_placeholder, nullptr);
+  Node *recv_at_host = node_name_image["outside_compilation_cluster_0_recv"];
+  EXPECT_EQ(recv_at_host, nullptr);
+  Node *send_from_host = node_name_image["outside_compilation_cluster_0_send"];
+  EXPECT_NE(send_from_host, nullptr);
+}
+
+TEST(RewriteOutsideCompilationSubgraphFnTest, NoKeyPlaceholder) {
+  // Build the graph: only 1 node: "const0"
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  TF_CHECK_OK(s.ToGraph(g.get()));
+
+  RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster");
+  std::vector<OutputTensor> arg_source_tensors;
+  NodeDef call_node_def;
+  call_node_def.set_op("0");
+  TF_CHECK_OK(
+      rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
+  auto node_name_image = g->BuildNodeNameIndex();
+
+  // Check key placeholder/RecvAtHost/SendFromHost are not present.
+  Node *key_placeholder = node_name_image["cluster_key_placeholder"];
+  EXPECT_EQ(key_placeholder, nullptr);
+  Node *recv_at_host = node_name_image["outside_compilation_cluster_0_recv"];
+  EXPECT_EQ(recv_at_host, nullptr);
+  Node *send_from_host = node_name_image["outside_compilation_cluster_0_send"];
+  EXPECT_EQ(send_from_host, nullptr);
+}
+
+TEST(RewriteOutsideCompilationSubgraphFnTest, ShapesInferred) {
+  // Build the graph:
+  // "ret" = "const0"
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
+  auto ret = ops::_Retval(s.WithOpName("ret"), const0, 0);
+  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  TF_CHECK_OK(s.ToGraph(g.get()));
+  auto node_name_image = g->BuildNodeNameIndex();
+  Node *const0_node = node_name_image["const0"];
+  EXPECT_NE(const0_node, nullptr);
+  PartialTensorShape shape({2});
+  const0_node->AddAttr(kXlaInferredShapesAttrName,
+                       std::vector<PartialTensorShape>{shape});
+
+  RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster");
+  std::vector<OutputTensor> arg_source_tensors;
+  NodeDef call_node_def;
+  call_node_def.set_op("0");
+  TF_CHECK_OK(
+      rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
+  node_name_image = g->BuildNodeNameIndex();
+
+  // Check "shape" attr is available in call_node_def.
+  std::vector<TensorShapeProto> shapes;
+  TF_CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()), "shapes", &shapes));
+  EXPECT_EQ(shapes.size(), 1);
+  EXPECT_EQ(shapes[0].dim_size(), 1);
+}
+
+TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
+  // Build the XLA computation func.
+  // "const0"
+  // "identity0" = "const0" (outside compilation cluster "0")
+  // "identity1" = "identity0" (outside compilation cluster "1")
+  // "identity2" = "identity1"
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
+    Output identity0 = ops::Identity(s.WithOpName("identity0"), const0);
+    Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0);
+    Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity0"]->AddAttr("_oc", "0");
+    node_name_image["identity1"]->AddAttr("_oc", "1");
+    PartialTensorShape shape({2});
+    node_name_image["identity1"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}};
+  std::unique_ptr<Graph> host_graph;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten",
+      host_compute_core, &fld, &host_graph, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Get rewritten XLA computation function.
+  FunctionBody *fbody = nullptr;
+  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
+                                      AttrSlice(), &fld,
+                                      [&](const string &op, const OpDef **sig) {
+                                        return fld.LookUpOpDef(op, sig);
+                                      },
+                                      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  auto node_name_index = fbody->graph->BuildNodeNameIndex();
+
+  // Check XlaHostCompute nodes.
+  Node *host_compute_0 = node_name_index["outside_compilation_0_host_compute"];
+  EXPECT_NE(host_compute_0, nullptr);
+  Node *host_compute_1 = node_name_index["outside_compilation_1_host_compute"];
+  EXPECT_NE(host_compute_1, nullptr);
+  // Check XlaHostCompute nodes' "tpu_core" attr.
+  int tpu_core;
+  TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "tpu_core", &tpu_core));
+  EXPECT_EQ(tpu_core, 1);
+  TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "tpu_core", &tpu_core));
+  EXPECT_EQ(tpu_core, 0);
+  // Check XlaHostCompute nodes' "shapes" attr. "0" should not have shapes, and
+  // "1" should have shapes.
+  std::vector<TensorShapeProto> shapes;
+  TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shapes", &shapes));
+  EXPECT_EQ(shapes.size(), 0);
+  TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shapes", &shapes));
+  EXPECT_EQ(shapes.size(), 1);
+  EXPECT_EQ(shapes[0].dim_size(), 1);
+  // Check XlaHostCompute nodes' "shape_inference_graph" attr. Both should have
+  // empty values.
+  string shape_inference_graph;
+  TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph",
+                          &shape_inference_graph));
+  EXPECT_EQ(shape_inference_graph, "");
+  TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph",
+                          &shape_inference_graph));
+  EXPECT_EQ(shape_inference_graph, "");
+
+  // Check `shape_inference_graphs`.
+  EXPECT_EQ(shape_inference_graphs.size(), 0);
+
+  // Check `host_graph`: verify we have key placeholder and sequencer.
+  Node *key_placeholder = nullptr, *sequencer = nullptr;
+  for (Node *n : host_graph->nodes()) {
+    if (n->type_string() == "Placeholder" &&
+        absl::EndsWith(n->name(), "_key_placeholder")) {
+      EXPECT_EQ(key_placeholder, nullptr);
+      key_placeholder = n;
+    } else if (HasNodeAttr(n->def(), "_xla_host_transfer_sequencer")) {
+      EXPECT_EQ(sequencer, nullptr);
+      sequencer = n;
+    }
+  }
+  EXPECT_NE(key_placeholder, nullptr);
+  EXPECT_NE(sequencer, nullptr);
+  // Check SendFromHost and RecvAtHost has key placeholder as input, and have
+  // control edge to sequencer.
+  int num_send_from_host = 0, num_recv_at_host = 0;
+  std::vector<Node *> send_recv_nodes;
+  for (Node *n : host_graph->nodes()) {
+    if (n->type_string() == "_XlaSendFromHost") {
+      num_send_from_host++;
+      send_recv_nodes.push_back(n);
+    } else if (n->type_string() == "_XlaRecvAtHost") {
+      num_recv_at_host++;
+      send_recv_nodes.push_back(n);
+    }
+  }
+  EXPECT_EQ(num_send_from_host, 1);
+  EXPECT_EQ(num_recv_at_host, 1);
+  for (Node *n : send_recv_nodes) {
+    Node *input_node;
+    TF_CHECK_OK(n->input_node(n->num_inputs() - 1, &input_node));
+    EXPECT_EQ(input_node, key_placeholder);
+
+    bool has_control_edge_to_sequencer = false;
+    for (const Edge *e : n->out_edges()) {
+      if (e->IsControlEdge() && e->dst() == sequencer) {
+        has_control_edge_to_sequencer = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_control_edge_to_sequencer);
+  }
+}
+
+TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
+  // Build the XLA computation func.
+  // "const0"
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}};
+  std::unique_ptr<Graph> host_graph;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten",
+      host_compute_core, &fld, &host_graph, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Check `host_graph` is empty.
+  EXPECT_FALSE(host_graph);
+}
+
+TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
+  // Build the XLA computation func.
+  // "const0"
+  // "const1" (outside compilation clsuter "0")
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
+    Output const1 = ops::Const(s.WithOpName("const1"), 1, {2});
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["const1"]->AddAttr("_oc", "0");
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}};
+  std::unique_ptr<Graph> host_graph;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten",
+      host_compute_core, &fld, &host_graph, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Check rewritten XLA graph: verify that we have no XlaHostCompute.
+  FunctionBody *fbody = nullptr;
+  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
+                                      AttrSlice(), &fld,
+                                      [&](const string &op, const OpDef **sig) {
+                                        return fld.LookUpOpDef(op, sig);
+                                      },
+                                      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  for (Node *n : fbody->graph->nodes()) {
+    EXPECT_NE(n->type_string(), "XlaHostCompute");
+  }
+
+  // Check `host_graph`: verify we have no placeholder, but we have "const1".
+  int num_key_placeholders = 0;
+  for (Node *n : host_graph->nodes()) {
+    if (n->type_string() == "Placeholder" &&
+        absl::EndsWith(n->name(), "_key_placeholder")) {
+      num_key_placeholders++;
+    }
+  }
+  EXPECT_EQ(num_key_placeholders, 0);
+  auto node_name_index = host_graph->BuildNodeNameIndex();
+  EXPECT_NE(node_name_index.find("const1"), node_name_index.end());
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98e344b3a080aa8aab27cd41564a90427bac151e
--- /dev/null
+++ b/tensorflow/compiler/jit/flags.cc
@@ -0,0 +1,152 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <mutex>  // NOLINT
+
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/xla/parse_flags_from_env.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace {
+
+BuildXlaOpsPassFlags* build_ops_flags;
+DumpGraphFlags* dump_graph_flags;
+MarkForCompilationPassFlags* mark_for_compilation_flags;
+XlaDeviceFlags* device_flags;
+XlaOpsCommonFlags* ops_flags;
+
+std::vector<Flag>* flag_list;
+std::once_flag flags_init;
+
+void AppendDumpGraphFlagsInternal(std::vector<Flag>* flag_list) {
+  std::vector<Flag> new_flags = {
+      Flag("tf_dump_graph_prefix", &dump_graph_flags->tf_dump_graph_prefix,
+           "Path prefix to which graphs dumped during debugging should be "
+           "written."),
+  };
+  flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
+}
+
+void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
+  std::vector<Flag> new_flags = {
+      Flag("tf_xla_auto_jit", &mark_for_compilation_flags->tf_xla_auto_jit,
+           "Control compilation of operators into XLA computations on CPU and "
+           "GPU devices.  0 = use ConfigProto setting; -1 = off; 1 = on for "
+           "things very likely to be improved; 2 = on for everything.  "
+           "Experimental."),
+      Flag("tf_xla_min_cluster_size",
+           &mark_for_compilation_flags->tf_xla_min_cluster_size,
+           "Minimum number of operators in an XLA compilation. Ignored for "
+           "operators placed on an XLA device or operators explicitly marked "
+           "for compilation."),
+      Flag("tf_xla_max_cluster_size",
+           &mark_for_compilation_flags->tf_xla_max_cluster_size,
+           "Maximum number of operators in an XLA compilation."),
+      Flag("tf_xla_clustering_debug",
+           &mark_for_compilation_flags->tf_xla_clustering_debug,
+           "Dump graphs during XLA compilation."),
+      Flag("tf_xla_cpu_global_jit",
+           &mark_for_compilation_flags->tf_xla_cpu_global_jit,
+           "Enables global JIT compilation for CPU via SessionOptions."),
+      Flag("tf_xla_clustering_fuel",
+           &mark_for_compilation_flags->tf_xla_clustering_fuel,
+           "Places an artificial limit on the number of ops marked as "
+           "eligible for clustering."),
+      Flag("tf_xla_fusion_only",
+           &mark_for_compilation_flags->tf_xla_fusion_only,
+           "enable fusion of element-wise operations only using XLA when "
+           "global_jit_level is ON*.")};
+  flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
+}
+
+void AllocateAndParseFlags() {
+  build_ops_flags = new BuildXlaOpsPassFlags;
+  build_ops_flags->tf_xla_enable_lazy_compilation = true;
+
+  dump_graph_flags = new DumpGraphFlags;
+  dump_graph_flags->tf_dump_graph_prefix = "/tmp/";
+
+  mark_for_compilation_flags = new MarkForCompilationPassFlags;
+  mark_for_compilation_flags->tf_xla_auto_jit = 0;
+  mark_for_compilation_flags->tf_xla_min_cluster_size = 2;
+  mark_for_compilation_flags->tf_xla_max_cluster_size =
+      std::numeric_limits<int32>::max();
+  mark_for_compilation_flags->tf_xla_clustering_debug = false;
+  mark_for_compilation_flags->tf_xla_cpu_global_jit = false;
+  mark_for_compilation_flags->tf_xla_clustering_fuel =
+      std::numeric_limits<int64>::max();
+  mark_for_compilation_flags->tf_xla_fusion_only = false;
+
+  device_flags = new XlaDeviceFlags;
+  device_flags->tf_xla_compile_on_demand = false;
+
+  ops_flags = new XlaOpsCommonFlags;
+  ops_flags->tf_xla_always_defer_compilation = false;
+
+  flag_list = new std::vector<Flag>({
+      Flag("tf_xla_enable_lazy_compilation",
+           &build_ops_flags->tf_xla_enable_lazy_compilation, ""),
+
+      Flag("tf_xla_compile_on_demand", &device_flags->tf_xla_compile_on_demand,
+           "Switch a device into 'on-demand' mode, where instead of "
+           "autoclustering ops are compiled one by one just-in-time."),
+
+      Flag("tf_xla_always_defer_compilation",
+           &ops_flags->tf_xla_always_defer_compilation, ""),
+  });
+  AppendDumpGraphFlagsInternal(flag_list);
+  AppendMarkForCompilationPassFlagsInternal(flag_list);
+  xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", *flag_list);
+}
+
+}  // namespace
+
+const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return *build_ops_flags;
+}
+
+DumpGraphFlags* GetDumpGraphFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return dump_graph_flags;
+}
+
+MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return mark_for_compilation_flags;
+}
+
+XlaDeviceFlags* GetXlaDeviceFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return device_flags;
+}
+
+const XlaOpsCommonFlags& GetXlaOpsCommonFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return *ops_flags;
+}
+
+void AppendMarkForCompilationPassFlags(std::vector<Flag>* flag_list) {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  AppendMarkForCompilationPassFlagsInternal(flag_list);
+}
+
+void AppendDumpGraphFlags(std::vector<Flag>* flag_list) {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  AppendDumpGraphFlagsInternal(flag_list);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ddea588eef5270880d91623dc05893da265960a
--- /dev/null
+++ b/tensorflow/compiler/jit/flags.h
@@ -0,0 +1,103 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_FLAGS_H_
+#define TENSORFLOW_COMPILER_JIT_FLAGS_H_
+
+#include <vector>
+
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+
+// Flags associated with the XLA bridge's mark_for_compilation_pass module.
+struct MarkForCompilationPassFlags {
+  int32 tf_xla_auto_jit;  // Control compilation of operators into XLA
+                          // computations on CPU and GPU devices.  0 = use
+                          // ConfigProto setting; -1 = off; 1 = on for things
+                          // very likely to be improved; 2 = on for everything.
+                          // Experimental.
+  int32 tf_xla_min_cluster_size;  // Minimum number of operators in an XLA
+                                  // compilation. Ignored for operators placed
+                                  // on an XLA device or operators explicitly
+                                  // marked for compilation.
+  int32 tf_xla_max_cluster_size;  // Maximum number of operators in an XLA
+                                  // compilation.
+  bool tf_xla_clustering_debug;   // Dump graphs during XLA compilation.
+  bool tf_xla_cpu_global_jit;     // Enables global JIT compilation for CPU
+                                  // via SessionOptions.
+  int64 tf_xla_clustering_fuel;   // "Compiler fuel" for clustering.  Only this
+                                  // many ops will be marked as eligible for
+                                  // clustering.
+  bool tf_xla_fusion_only;  // This flag is effective only when global_jit_level
+                            // is set to ON* and overrides its behavior. If
+                            // true, enable fusion of element-wise operations
+                            // only using XLA.
+};
+
+// Flags associated with the XLA bridge's xla_device module.
+struct XlaDeviceFlags {
+  // Switch the CPU device into "on-demand" mode, where instead of
+  // autoclustering ops are compiled one by one just-in-time.
+  // Enabling this mode by a legacy flag is a temporary mechanism. When this
+  // feature is battle-tested, we will switch this to be a session option.
+  bool tf_xla_compile_on_demand;
+};
+
+// Flags common to the _Xla* ops and their kernels.
+struct XlaOpsCommonFlags {
+  // If true, _XlaCompile always refuses to compile the cluster, which means the
+  // XLA clusters always run in the TF executor.  Defaults to false.
+  bool tf_xla_always_defer_compilation;
+};
+
+// Flags for the build_xla_ops pass.
+struct BuildXlaOpsPassFlags {
+  // Enables lazy compilation for TF/XLA (only when auto-clustering) if true.
+  // Defaults to true.
+  bool tf_xla_enable_lazy_compilation;
+};
+
+// Flags for the XLA bridge's dump_graph module.
+struct DumpGraphFlags {
+  // Path prefix to which graphs dumped during debugging should be written.
+  string tf_dump_graph_prefix;
+};
+
+// Return a pointer to the DumpGraphFlags struct;
+// repeated calls return the same pointer.
+// This should be called only after Flags::Parse() has returned.
+
+// Getters for flags structs defined above.  The first call to any of these
+// parses TF_XLA_FLAGS for all of them.  Those functions which return a pointer
+// always return the same pointer.
+MarkForCompilationPassFlags* GetMarkForCompilationPassFlags();
+const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags();
+XlaDeviceFlags* GetXlaDeviceFlags();
+const XlaOpsCommonFlags& GetXlaOpsCommonFlags();
+DumpGraphFlags* GetDumpGraphFlags();
+
+// Appends the flag definitions associated with
+// MarkForCompilationPassFlags/DumpGraphFlags to `flag_list`.
+//
+// Has the side-effect of parsing TF_XLA_FLAGS if that hasn't happened yet.
+void AppendMarkForCompilationPassFlags(
+    std::vector<tensorflow::Flag>* flag_list);
+void AppendDumpGraphFlags(std::vector<tensorflow::Flag>* flag_list);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_FLAGS_H_
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce53f70b79d97ab087fefe542920b33f883632a2
--- /dev/null
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
@@ -0,0 +1,364 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h"
+#include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
+#include "absl/types/optional.h"
+#include "tensorflow/cc/framework/scope_internal.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h"
+#include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+namespace {
+
+// StatusOrOptional<T> instances hold
+//
+//  - A non-OK Status to indicate an error that needs to be propagated out of
+//    this pass (e.g. the Graph is malformed).
+//
+//  - A nullopt to indicate the function that created the instance failed to do
+//    what it set out to do but this is not actually an error
+//    (e.g. TryToGetTensorFromConstOp was passed a non-Const node).
+//
+//  - A T to indicate a successful operation.
+template <class T>
+using StatusOrOptional = xla::StatusOr<absl::optional<T>>;
+
+StatusOrOptional<Tensor> TryToGetTensorFromConstOp(Node* n) {
+  if (n->type_string() != "Const") {
+    return {absl::nullopt};
+  }
+
+  const TensorProto* proto = nullptr;
+  TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "value", &proto));
+  Tensor tensor(proto->dtype());
+  TF_RET_CHECK(tensor.FromProto(*proto));
+  return {tensor};
+}
+
+struct SliceInputs {
+  Output slice_op;
+  Output input;
+  Output begin;
+  Output size;
+
+  // The size of the TF slice operation as a std::vector.  We can always compute
+  // this because we only manipulate slices with a Const size.
+  std::vector<int64> size_as_vector;
+};
+
+std::vector<int64> IntTensorAsVector(const Tensor& t) {
+  DCHECK(t.dtype() == DT_INT32 || t.dtype() == DT_INT64);
+  std::vector<int64> result;
+  result.reserve(t.NumElements());
+  for (int i = 0; i < t.NumElements(); i++) {
+    int64 element = t.dtype() == DT_INT32
+                        ? static_cast<int64>(t.flat<int32>()(i))
+                        : t.flat<int64>()(i);
+    result.push_back(element);
+  }
+  return result;
+}
+
+// Packages up the inputs to a Slice operation into an instance of
+// `SliceInputs`.
+StatusOrOptional<SliceInputs> GetSliceInputs(Node* slice) {
+  const int kSliceInputIndex = 0;
+  const int kSliceBeginIndex = 1;
+  const int kSliceSizeIndex = 2;
+
+  const Edge* slice_input_edge;
+  TF_RETURN_IF_ERROR(slice->input_edge(kSliceInputIndex, &slice_input_edge));
+  const Edge* slice_size_edge;
+  TF_RETURN_IF_ERROR(slice->input_edge(kSliceSizeIndex, &slice_size_edge));
+  const Edge* slice_begin_edge;
+  TF_RETURN_IF_ERROR(slice->input_edge(kSliceBeginIndex, &slice_begin_edge));
+
+  SliceInputs slice_inputs;
+  slice_inputs.input =
+      Output(slice_input_edge->src(), slice_input_edge->src_output());
+  slice_inputs.begin =
+      Output(slice_begin_edge->src(), slice_begin_edge->src_output());
+  slice_inputs.size =
+      Output(slice_size_edge->src(), slice_size_edge->src_output());
+
+  TF_ASSIGN_OR_RETURN(absl::optional<Tensor> tf_slice_size,
+                      TryToGetTensorFromConstOp(slice_inputs.size.node()));
+  if (!tf_slice_size.has_value()) {
+    return {absl::nullopt};
+  }
+
+  if (tf_slice_size->dims() != 1) {
+    return {absl::nullopt};
+  }
+
+  slice_inputs.size_as_vector = IntTensorAsVector(*tf_slice_size);
+  return {slice_inputs};
+}
+
+// Casts `x` to a DT_INT64 if it isn't one already.
+Output MakeInt64(const Scope& host_scope, absl::string_view name,
+                 const Output& x) {
+  return x.type() == DT_INT64
+             ? x
+             : ops::Cast(host_scope.WithOpName(name, "_s64"), x, DT_INT64);
+}
+
+// Returns `slice_inputs` with the index and size inputs cast to DT_INT64.
+SliceInputs MakeSliceIndexAndSizeInt64(const Scope& host_scope,
+                                       const SliceInputs& slice_inputs) {
+  SliceInputs result;
+  result.input = slice_inputs.input;
+  result.begin = MakeInt64(host_scope, "begin", slice_inputs.begin);
+  result.size = MakeInt64(host_scope, "size", slice_inputs.size);
+  result.size_as_vector = slice_inputs.size_as_vector;
+  return result;
+}
+
+// This class caches emitted constants to avoid creating multiple nodes for the
+// same constant value.  This helps make the generated GraphDef more readable.
+class ConstantCache {
+ public:
+  explicit ConstantCache(const Scope& s) : scope_(s) {}
+
+  Output Get1DHostConstant(int64 constant) {
+    auto it = cache_.find(constant);
+    if (it == cache_.end()) {
+      Output new_const =
+          ops::Const(scope_.WithOpName("const_", constant), {constant});
+      it = cache_.insert({constant, new_const}).first;
+    }
+    return it->second;
+  }
+
+ private:
+  Scope scope_;
+  std::unordered_map<int, Output> cache_;
+};
+
+// Returns a node computing the size of the Slice op with inputs `slice_inputs`.
+Status ComputeSliceSize(const Scope& host_scope,
+                        const SliceInputs& slice_inputs, Output* size) {
+  // If slice_size[i] >= 0 then slice_size[i] = slice_size[i].
+  //
+  // If slice_size[i] == -1 then slice_size[i] = input_size[i] -
+  // begin[i].
+  //
+  // If slice_size[i] < -1 then executing the slice will throw an error, and we
+  // don't do anything here.  We've already filtered these cases out in
+  // IsRewritableSlice.
+
+  if (absl::c_all_of(slice_inputs.size_as_vector,
+                     [](int64 i) { return i >= 0; })) {
+    *size = slice_inputs.size;
+    return Status::OK();
+  }
+
+  Output input_shape =
+      ops::Shape(host_scope.WithOpName("input_shape"), slice_inputs.input,
+                 ops::Shape::OutType(DT_INT64));
+
+  ConstantCache constant_pool(host_scope);
+
+  std::vector<Output> slice_size;
+  for (int i = 0; i < slice_inputs.size_as_vector.size(); i++) {
+    if (slice_inputs.size_as_vector[i] >= 0) {
+      slice_size.push_back(
+          constant_pool.Get1DHostConstant(slice_inputs.size_as_vector[i]));
+      continue;
+    }
+
+    DCHECK_EQ(slice_inputs.size_as_vector[i], -1);
+
+    Output begin_i = ops::Slice(
+        host_scope.WithOpName("begin_", i), slice_inputs.begin,
+        constant_pool.Get1DHostConstant(i), constant_pool.Get1DHostConstant(1));
+
+    Output input_shape_i = ops::Slice(
+        host_scope.WithOpName("input_shape_", i), input_shape,
+        constant_pool.Get1DHostConstant(i), constant_pool.Get1DHostConstant(1));
+
+    slice_size.push_back(ops::Sub(host_scope.WithOpName("slice_size_", i),
+                                  input_shape_i, begin_i));
+    DCHECK_EQ(slice_size.back().type(), DT_INT64);
+  }
+
+  // Trivial ConcatV2 nodes (with exactly one input) are disallowed.
+  *size =
+      slice_size.size() == 1
+          ? slice_size[0]
+          : ops::Concat(host_scope.WithOpName("slice_size"), slice_size,
+                        ops::Const(host_scope.WithOpName("concat_axis"), 0));
+  return Status::OK();
+}
+
+// Terminology: "static sized" slice is a slice with the
+// _XlaCompileTimeConstantInputs attribute set to {2}.  The output shape of
+// these slices can be solely determined by their "size" input.
+Status ConvertTensorFlowSliceToStaticShapedSlice(
+    Graph* g, Node* slice, const SliceInputs& slice_inputs,
+    absl::string_view cluster_name, Node** result) {
+  string host_name;
+  TF_RETURN_IF_ERROR(DeviceNameUtils::DeviceNameToCpuDeviceName(
+      slice->assigned_device_name(), &host_name));
+
+  Status status;
+  Scope main_scope =
+      NewInternalScope(g, &status, /*refiner=*/nullptr)
+          .WithXlaCluster(string(cluster_name))
+          .NewSubScope(absl::StrCat(slice->name(), "/static_shaped_slice"));
+  Scope host_scope = main_scope.WithAssignedDevice(host_name);
+
+  SliceInputs slice_inputs_int64 =
+      MakeSliceIndexAndSizeInt64(host_scope, slice_inputs);
+
+  Output slice_size;
+  TF_RETURN_IF_ERROR(
+      ComputeSliceSize(host_scope, slice_inputs_int64, &slice_size));
+
+  *result =
+      ops::Slice(main_scope.WithAssignedDevice(slice->assigned_device_name())
+                     .WithOpName("static_shaped_slice"),
+                 slice_inputs_int64.input, slice_inputs_int64.begin, slice_size)
+          .node();
+
+  TF_RETURN_IF_ERROR(main_scope.status());
+
+  std::vector<string> compile_time_const_inputs;
+  compile_time_const_inputs.push_back("size");
+  (*result)->AddAttr(kXlaCompileTimeConstantInputsAttr,
+                     compile_time_const_inputs);
+  return status;
+}
+
+void ReplaceTensorFlowSliceWithStaticShapedSlice(Graph* g, Node* slice,
+                                                 Node* static_shaped_slice) {
+  absl::InlinedVector<const Edge*, 6> edges_to_remove;
+  std::vector<const Edge*> slice_out_edges;
+  absl::c_copy(slice->out_edges(), std::back_inserter(slice_out_edges));
+  for (const Edge* e : slice_out_edges) {
+    DCHECK(e->src_output() == 0 || e->src_output() == Graph::kControlSlot);
+
+    int src_output = e->src_output();
+    int dst_input = e->dst_input();
+    Node* dst = e->dst();
+    g->RemoveEdge(e);
+    g->AddEdge(static_shaped_slice, src_output, dst, dst_input);
+  }
+
+  for (const Edge* e : slice->in_edges()) {
+    if (e->IsControlEdge()) {
+      g->AddControlEdge(e->src(), static_shaped_slice);
+    }
+  }
+
+  g->RemoveNode(slice);
+}
+
+Status RewriteSlice(Graph* g, Node* slice, const SliceInputs& slice_inputs,
+                    absl::string_view cluster_name) {
+  VLOG(3) << "Rewriting slice " << slice->name()
+          << " to a \"static shaped\" Slice";
+  Node* static_shaped_slice;
+  TF_RETURN_IF_ERROR(ConvertTensorFlowSliceToStaticShapedSlice(
+      g, slice, slice_inputs, cluster_name, &static_shaped_slice));
+  ReplaceTensorFlowSliceWithStaticShapedSlice(g, slice, static_shaped_slice);
+  return Status::OK();
+}
+
+// Return true if `n` is a slice we can rewrite to have a static shape
+// (i.e. have the output shape only depend on the "size" input).
+xla::StatusOr<bool> IsRewritableSlice(Node* n) {
+  if (n->type_string() != "Slice") {
+    return false;
+  }
+
+  if (!GetXlaClusterForNode(*n).has_value()) {
+    // There is no need to change slice ops outside XLA clusters.
+    return false;
+  }
+
+  TF_ASSIGN_OR_RETURN(absl::optional<SliceInputs> slice_inputs,
+                      GetSliceInputs(n));
+  if (!slice_inputs.has_value()) {
+    return false;
+  }
+
+  // If slice_size[i] < -1 for any i then executing the slice will throw an
+  // error, and we don't do anything here.
+  return absl::c_all_of(slice_inputs->size_as_vector,
+                        [](int64 size_i) { return size_i >= -1; });
+}
+
+Status FindAndRewriteSlices(Graph* g, bool* changed) {
+  std::vector<Node*> slices_to_rewrite;
+  for (Node* n : g->nodes()) {
+    TF_ASSIGN_OR_RETURN(bool is_rewritable, IsRewritableSlice(n));
+    if (is_rewritable) {
+      slices_to_rewrite.push_back(n);
+    }
+  }
+
+  for (Node* n : slices_to_rewrite) {
+    TF_ASSIGN_OR_RETURN(absl::optional<SliceInputs> slice_inputs,
+                        GetSliceInputs(n));
+    TF_RET_CHECK(slice_inputs.has_value());
+    TF_RETURN_IF_ERROR(
+        RewriteSlice(g, n, *slice_inputs, *GetXlaClusterForNode(*n)));
+  }
+
+  if (!slices_to_rewrite.empty()) {
+    // We've added constants to the graph; hook them up to _SOURCE.
+    FixupSourceAndSinkEdges(g);
+  }
+
+  *changed = !slices_to_rewrite.empty();
+
+  return Status::OK();
+}
+}  // namespace
+
+Status IncreaseDynamismForAutoJitPass::Run(
+    const GraphOptimizationPassOptions& options) {
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
+  if (flags->tf_xla_clustering_debug) {
+    dump_graph::DumpGraphToFile("before_increase_dynamism_for_auto_jit_pass",
+                                **options.graph, options.flib_def);
+  }
+
+  bool changed;
+  TF_RETURN_IF_ERROR(FindAndRewriteSlices(options.graph->get(), &changed));
+  if (changed && flags->tf_xla_clustering_debug) {
+    dump_graph::DumpGraphToFile("increase_dynamism_for_auto_jit_pass",
+                                **options.graph, options.flib_def);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..818ca948d64b0353b08f393c3bd7d874c9b2480b
--- /dev/null
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_INCREASE_DYNAMISM_FOR_AUTO_JIT_PASS_H_
+#define TENSORFLOW_COMPILER_JIT_INCREASE_DYNAMISM_FOR_AUTO_JIT_PASS_H_
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+
+namespace tensorflow {
+
+// Increases the amount of "dynamism" representable by XLA clusters by rewriting
+// the TensorFlow graph.  This pass does the following rewrites:
+//
+// Slice
+// -----
+//
+//   Slice(op, begin, size <must be constant>) =>
+//     Slice(op, begin, actual_size(op.shape(), size, begin));
+//       _XlaCompileTimeConstantInputs={2}
+//
+// where
+//
+//   actual_size(op_shape, size, begin)[i] =
+//     size[i] == -1 ? (op_shape[i] - size[i])
+//                   : size[i]
+//
+// This pass, combined with jit/partially_decluster_pass, reduces the number of
+// unnecessary cluster recompilations in some common cases.  After the rewrite
+// shown above jit/partially_decluster_pass extracts the actual_size(...)
+// computation to outside the XLA cluster, causing the cluster to be versioned
+// only on the actual size of the XlaDynamicSlice.  This avoids recompilation
+// due to superficial changes that don't affect tensor shapes.
+//
+// Future Work TODO(b/111210515)
+// -----------------------------
+//
+// In the future we will also translate StridedSlice and Pad a similar way.
+class IncreaseDynamismForAutoJitPass : public GraphOptimizationPass {
+ public:
+  Status Run(const GraphOptimizationPassOptions& options) override;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_INCREASE_DYNAMISM_FOR_AUTO_JIT_PASS_H_
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2f1b831ad7605237e23c15cc43b337e06265553
--- /dev/null
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
@@ -0,0 +1,405 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h"
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/compiler/jit/node_matchers.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+using ::testing::_;
+using testing::matchers::AssignedDevice;
+using testing::matchers::Attr;
+using testing::matchers::Const;
+using testing::matchers::CtrlDeps;
+using testing::matchers::Inputs;
+using testing::matchers::Name;
+using testing::matchers::NodeWith;
+using testing::matchers::Op;
+using testing::matchers::Out;
+
+// A fake device used to populate a DeviceSet.
+class FakeDevice : public Device {
+ public:
+  explicit FakeDevice(const DeviceAttributes& device_attributes)
+      : Device(nullptr, device_attributes) {}
+
+  Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; }
+
+  static std::unique_ptr<Device> Make(const string& name, const string& type) {
+    DeviceAttributes device_attributes;
+    device_attributes.set_name(name);
+    device_attributes.set_device_type(DeviceType(type).type());
+    return absl::make_unique<FakeDevice>(device_attributes);
+  }
+};
+
+const char* kHostName = "/job:worker/replica:0/task:0/device:CPU:0";
+const char* kDeviceName = "/job:worker/replica:0/task:0/device:GPU:0";
+
+Status IncreaseDynamismForAutoJit(const Scope& s,
+                                  std::unique_ptr<Graph>* result) {
+  std::vector<std::unique_ptr<Device>> devices;
+  devices.push_back(FakeDevice::Make(kDeviceName, DEVICE_GPU));
+  devices.push_back(FakeDevice::Make(kHostName, DEVICE_CPU));
+
+  std::unique_ptr<DeviceSet> device_set(new DeviceSet());
+  for (auto& device : devices) {
+    device_set->AddDevice(device.get());
+  }
+
+  auto graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SessionOptions session_options;
+  session_options.config.mutable_graph_options()
+      ->mutable_optimizer_options()
+      ->set_global_jit_level(OptimizerOptions::ON_2);
+  GraphOptimizationPassOptions options;
+  options.graph = &graph;
+  options.device_set = device_set.get();
+  options.session_options = &session_options;
+
+  // Scope::ToGraph seems to drop assigned devices, probably because it goes
+  // through a GraphDef.  So explicitly maintain the device assignment.
+  std::unordered_map<string, string> assigned_device_names;
+  for (Node* n : s.graph()->nodes()) {
+    assigned_device_names[n->name()] = n->assigned_device_name();
+  }
+  TF_RETURN_IF_ERROR(s.ToGraph(graph.get()));
+  for (Node* n : graph->nodes()) {
+    n->set_assigned_device_name(assigned_device_names[n->name()]);
+  }
+
+  IncreaseDynamismForAutoJitPass rewriter;
+  TF_RETURN_IF_ERROR(rewriter.Run(options));
+  *result = std::move(graph);
+  return Status::OK();
+}
+
+TEST(SliceToDynamicSliceRewriteTest, Basic) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size = ops::Const(root.WithOpName("size"), {-1, 500});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  const int64 zero_64 = 0;
+  const int32 zero_32 = 0;
+  const int64 one_64 = 1;
+
+  auto m_input = Out(NodeWith(Op("Placeholder"), Name("input")));
+  auto m_begin_s64 = Out(NodeWith(
+      Op("Cast"), Inputs(Out(NodeWith(Op("Placeholder"), Name("begin"))))));
+  auto m_input_shape = Out(NodeWith(Op("Shape"), Inputs(m_input)));
+  auto m_slice_size_0 = Out(NodeWith(
+      Op("Sub"), AssignedDevice(kHostName),
+      Inputs(
+          Out(NodeWith(Op("Slice"), AssignedDevice(kHostName),
+                       Inputs(m_input_shape, Const(zero_64), Const(one_64)))),
+          Out(NodeWith(Op("Slice"), AssignedDevice(kHostName),
+                       Inputs(m_begin_s64, Const(zero_64), Const(one_64)))))));
+  auto m_dynamic_slice_size = Out(NodeWith(
+      Op("ConcatV2"), AssignedDevice(kHostName),
+      Inputs(m_slice_size_0, Const(static_cast<int64>(500)), Const(zero_32))));
+
+  std::vector<string> compile_time_constant_inputs;
+  compile_time_constant_inputs.push_back("size");
+  auto m_dynamic_slice = NodeWith(
+      Op("Slice"), AssignedDevice(kDeviceName),
+      Attr(kXlaCompileTimeConstantInputsAttr, compile_time_constant_inputs),
+      Inputs(m_input, m_begin_s64, m_dynamic_slice_size));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(), "slice/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_THAT(static_shaped_slice, m_dynamic_slice);
+}
+
+TEST(SliceToDynamicSliceRewriteTest, SliceFromVector) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size = ops::Const(root.WithOpName("size"), {-1});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(), "slice/static_shaped_slice/static_shaped_slice");
+  EXPECT_NE(static_shaped_slice, nullptr);
+  EXPECT_THAT(result->nodes(), Not(Contains(NodeWith(Op("ConcatV2")))));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, ControlDependencePreserved) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size = ops::Const(root.WithOpName("size"), {-1, 500});
+  Output control_pred = ops::Placeholder(root.WithOpName("control"), DT_BOOL);
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+  root.graph()->AddControlEdge(control_pred.node(), slice.node());
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(), "slice/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_THAT(static_shaped_slice,
+              NodeWith(Op("Slice"),
+                       CtrlDeps(NodeWith(Op("Placeholder"), Name("control")))));
+}
+
+int64 ToInt64(int v) { return static_cast<int64>(v); }
+
+TEST(SliceToDynamicSliceRewriteTest, Int64Indices) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
+  Output size =
+      ops::Const(root.WithOpName("size"), {ToInt64(-1), ToInt64(500)});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  EXPECT_THAT(result->nodes(), Not(Contains(NodeWith(Op("Cast")))));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, DontRewriteInvalidSlice) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+
+  // The shape refiner throws an error if we use a bogus constant value for
+  // size.  So we first use a Placeholder to placate the shape refiner, and
+  // later replace it with a bogus constant.
+  Output size_placeholder =
+      ops::Placeholder(root.WithOpName("size_placeholder"), DT_INT32);
+  Output slice =
+      ops::Slice(root.WithOpName("slice"), input, begin, size_placeholder);
+
+  Output size = ops::Const(root.WithOpName("size"), {-8, 500});
+  TF_ASSERT_OK(root.graph()->UpdateEdge(/*new_src=*/size.node(),
+                                        /*new_src_index=*/0,
+                                        /*dst=*/slice.node(), /*dst_index=*/2));
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  EXPECT_THAT(result->nodes(),
+              Not(Contains(NodeWith(Op("Slice"),
+                                    Attr(kXlaCompileTimeConstantInputsAttr)))));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, DontRewriteUnclusteredSlice) {
+  Scope root =
+      Scope::NewRootScope().ExitOnError().WithAssignedDevice(kDeviceName);
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size = ops::Const(root.WithOpName("size"), {-1, 500});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  EXPECT_THAT(result->nodes(),
+              Not(Contains(NodeWith(Op("Slice"),
+                                    Attr(kXlaCompileTimeConstantInputsAttr)))));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, DontRewriteSliceWithNonConstSize) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
+  Output size = ops::Placeholder(root.WithOpName("size"), DT_INT64);
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  EXPECT_THAT(result->nodes(),
+              Not(Contains(NodeWith(Op("Slice"),
+                                    Attr(kXlaCompileTimeConstantInputsAttr)))));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, ScalarSlice) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
+  Output size = ops::Const<int64>(root.WithOpName("size"), {});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(), "slice/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_THAT(static_shaped_slice,
+              NodeWith(Op("Slice"), Attr(kXlaCompileTimeConstantInputsAttr),
+                       Inputs(_, _, Out(NodeWith(Name(size.node()->name()))))));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  auto ToInt64 = [](int v) { return static_cast<int64>(v); };
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
+
+  // The C++ node bindings immediately error out when we try construct a bogus
+  // slice so we first use a placeholder to construct the Slice and then replace
+  // the input.
+  Output size_placeholder = ops::Placeholder(root.WithOpName("size"), DT_INT64);
+  Output slice =
+      ops::Slice(root.WithOpName("slice"), input, begin, size_placeholder);
+
+  Output size =
+      ops::Const(root.WithOpName("size"), {{ToInt64(-1)}, {ToInt64(500)}});
+  TF_ASSERT_OK(root.graph()->UpdateEdge(size.node(), 0, slice.node(), 2));
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  EXPECT_THAT(result->nodes(),
+              Not(Contains(NodeWith(Op("Slice"),
+                                    Attr(kXlaCompileTimeConstantInputsAttr)))));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceInput) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size_a = ops::Const(root.WithOpName("size_a"), {-1, 500});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size_a);
+
+  Output size_b = ops::Const(root.WithOpName("size_a"), {-1, 200});
+  Output slice_with_slice_input = ops::Slice(
+      root.WithOpName("slice_with_slice_input"), slice, begin, size_b);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(),
+      "slice_with_slice_input/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_EQ(static_shaped_slice->output_type(0), DT_FLOAT)
+      << "Expected DT_FLOAT, was "
+      << DataType_Name(static_shaped_slice->output_type(0));
+  EXPECT_THAT(
+      static_shaped_slice,
+      NodeWith(
+          Op("Slice"),
+          Inputs(Out(NodeWith(
+                     Op("Slice"),
+                     Name("slice/static_shaped_slice/static_shaped_slice"))),
+                 _, _)));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceBegin) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input_float =
+      ops::Placeholder(root.WithOpName("input_float"), DT_FLOAT);
+  Output input_i64 = ops::Placeholder(root.WithOpName("input_i64"), DT_INT64);
+
+  Output begin_begin =
+      ops::Placeholder(root.WithOpName("begin_begin"), DT_INT32);
+  Output begin_size = ops::Const(root.WithOpName("begin_size"), {-1});
+  Output begin =
+      ops::Slice(root.WithOpName("begin"), input_i64, begin_begin, begin_size);
+
+  Output size =
+      ops::Const(root.WithOpName("size"), {ToInt64(-1), ToInt64(200)});
+  Output slice_with_slice_begin = ops::Slice(
+      root.WithOpName("slice_with_slice_begin"), input_float, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(),
+      "slice_with_slice_begin/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_EQ(static_shaped_slice->output_type(0), DT_FLOAT)
+      << "Expected DT_FLOAT, was "
+      << DataType_Name(static_shaped_slice->output_type(0));
+  EXPECT_THAT(
+      static_shaped_slice,
+      NodeWith(
+          Op("Slice"),
+          Inputs(_,
+                 Out(NodeWith(
+                     Op("Slice"),
+                     Name("begin/static_shaped_slice/static_shaped_slice"))),
+                 _)));
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
index 085c0e5adbb270e71ff3447a936555c99904e26c..f79bdc1e2e8d82c9144d1bb9923ad36d8541cbdb 100644
--- a/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
+++ b/tensorflow/compiler/jit/jit_compilation_pass_registration.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/build_xla_ops_pass.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h"
+#include "tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 #include "tensorflow/compiler/jit/partially_decluster_pass.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
@@ -44,17 +45,20 @@ REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 10,
                       MarkForCompilationPass);
 
 REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 20,
+                      IncreaseDynamismForAutoJitPass);
+
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30,
                       PartiallyDeclusterPass);
 
 // The EncapsulateSubgraphs pass must run after the MarkForCompilationPass. We
 // also need to run it after the graph been rewritten to have _Send nodes added
 // for fetches. Before the _Send nodes are added, fetch nodes are identified by
 // name, and encapsulation might remove that node from the graph.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 30,
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 40,
                       EncapsulateSubgraphsPass);
 
 // Must run after EncapsulateSubgraphsPass.
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 40,
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, 50,
                       BuildXlaOpsPass);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 26cb3af9d69ba1877c67853cde28d2477d394efc..0583774714c6db7a2fa515fc8a0d304e1898db97 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -12,6 +12,7 @@ cc_library(
     hdrs = ["xla_ops.h"],
     deps = [
         "//tensorflow/compiler/jit:common",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/compiler/jit:xla_compilation_cache",
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/jit:xla_launch_util",
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index accc86a86d9d3eca741994ee502bd7580ce49b2e..ad71df5a694a5f8da94675049df1062a7edb6253 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
@@ -38,12 +39,22 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
+// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that
+// in error case, it returns RET instead of void.
+#define OP_REQUIRES_OK_RETURN(CTX, RET, ...)                \
+  do {                                                      \
+    ::tensorflow::Status _s(__VA_ARGS__);                   \
+    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
+      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
+      return RET;                                           \
+    }                                                       \
+  } while (0)
+
 namespace tensorflow {
 
 namespace {
 
-Status PlatformInfoFromContext(OpKernelConstruction* ctx,
-                               XlaPlatformInfo* result) {
+XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) {
   DeviceType device_type = ctx->device_type();
   se::Platform::Id platform_id = nullptr;
   const XlaDevice::Metadata* xla_device_metadata = nullptr;
@@ -75,16 +86,16 @@ Status PlatformInfoFromContext(OpKernelConstruction* ctx,
   }
 
   if (!device_allocator) {
-    TF_ASSIGN_OR_RETURN(se::Platform* const platform,
-                        se::MultiPlatformManager::PlatformWithId(platform_id));
+    xla::StatusOr<se::Platform*> maybe_platform =
+        se::MultiPlatformManager::PlatformWithId(platform_id);
+    OP_REQUIRES_OK_RETURN(ctx, XlaPlatformInfo(), maybe_platform.status());
+
     xla_allocator = absl::make_unique<XlaAllocator>(
-        platform, ctx->device()->GetAllocator({}));
+        maybe_platform.ValueOrDie(), ctx->device()->GetAllocator({}));
   }
 
-  *result = XlaPlatformInfo(device_type, platform_id, xla_device_metadata,
-                            std::move(xla_allocator), device_allocator);
-
-  return Status::OK();
+  return XlaPlatformInfo(device_type, platform_id, xla_device_metadata,
+                         std::move(xla_allocator), device_allocator);
 }
 
 // A closure describing how to run a compiled version of a TensorFlow function.
@@ -178,9 +189,8 @@ XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx,
     : OpKernel(ctx),
       constants_(constants),
       resources_(resources),
-      function_(function) {
-  OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_));
-}
+      function_(function),
+      platform_info_(PlatformInfoFromContext(ctx)) {}
 
 static Status BuildCompilationCache(OpKernelContext* ctx,
                                     const XlaPlatformInfo& platform_info,
@@ -219,7 +229,7 @@ static Status BuildCompilationCache(OpKernelContext* ctx,
 static Status CompileToLocalExecutable(
     OpKernelContext* ctx, const NameAttrList& function,
     const XlaPlatformInfo& platform_info, absl::Span<const int> resources,
-    absl::Span<const int> constants, xla::LocalClient** client,
+    absl::Span<const int> constants, bool lazy, xla::LocalClient** client,
     std::map<int, OptionalTensor>* variables,
     const XlaCompiler::CompilationResult** kernel,
     xla::LocalExecutable** executable) {
@@ -241,7 +251,7 @@ static Status CompileToLocalExecutable(
   // this is more obviously correct.)
   core::ScopedUnref cache_ref(cache);
 
-  *variables = SnapshotResourceVariables(ctx, resources);
+  TF_RETURN_IF_ERROR(SnapshotResourceVariables(ctx, resources, variables));
   *client = static_cast<xla::LocalClient*>(cache->client());
 
   XlaCompiler::Options options;
@@ -276,8 +286,13 @@ static Status CompileToLocalExecutable(
   // rather than a one-element tuple.
   compile_options.always_return_tuple = false;
 
-  return cache->Compile(options, function, constant_args, *variables, ctx,
-                        compile_options, kernel, executable);
+  std::vector<XlaCompiler::Argument> args;
+  TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments(
+      constant_args, *variables, ctx, &args));
+  return cache->Compile(options, function, args, compile_options,
+                        lazy ? XlaCompilationCache::CompileMode::kLazy
+                             : XlaCompilationCache::CompileMode::kStrict,
+                        kernel, executable);
 }
 
 void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
@@ -291,8 +306,8 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
 
   OP_REQUIRES_OK(
       ctx, CompileToLocalExecutable(ctx, function_, platform_info_, resources_,
-                                    constants_, &client, &variables, &kernel,
-                                    &executable));
+                                    constants_, /*lazy=*/false, &client,
+                                    &variables, &kernel, &executable));
 
   se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
@@ -329,18 +344,6 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
 }
 
 namespace {
-
-// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that
-// in error case, it returns RET instead of void.
-#define OP_REQUIRES_OK_RETURN(CTX, RET, ...)                \
-  do {                                                      \
-    ::tensorflow::Status _s(__VA_ARGS__);                   \
-    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
-      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
-      return RET;                                           \
-    }                                                       \
-  } while (0)
-
 // Helper static functions to construct parameters for
 // XlaLocalLaunchBase constructor from OpKernelConstruction.
 std::vector<int> ConstantsVector(OpKernelConstruction* ctx) {
@@ -377,7 +380,12 @@ NameAttrList FunctionAttr(OpKernelConstruction* ctx) {
   return *func;
 }
 
-#undef OP_REQUIRES_OK_RETURN
+bool MustCompileAttr(OpKernelConstruction* ctx) {
+  bool must_compile;
+  OP_REQUIRES_OK_RETURN(ctx, false,
+                        ctx->GetAttr("must_compile", &must_compile));
+  return must_compile;
+}
 }  // namespace
 
 XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx)
@@ -392,20 +400,59 @@ XlaCompileOp::XlaCompileOp(OpKernelConstruction* ctx)
     : OpKernel(ctx),
       constants_(ConstantsVector(ctx)),
       resources_(ResourcesVector(ctx)),
-      function_(FunctionAttr(ctx)) {
-  OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_));
-}
+      function_(FunctionAttr(ctx)),
+      platform_info_(PlatformInfoFromContext(ctx)),
+      must_compile_(MustCompileAttr(ctx)) {}
 
 void XlaCompileOp::Compute(OpKernelContext* ctx) {
+  VLOG(3) << "XlaCompileOp " << def().name()
+          << (must_compile_ ? "(must-compile)" : "");
   xla::LocalClient* client;
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
   std::map<int, OptionalTensor> variables;
 
-  OP_REQUIRES_OK(
-      ctx, CompileToLocalExecutable(ctx, function_, platform_info_, resources_,
-                                    constants_, &client, &variables, &kernel,
-                                    &executable));
+  bool cannot_compile_cluster;
+  {
+    mutex_lock guard(cannot_compile_cluster_mu_);
+    cannot_compile_cluster = cannot_compile_cluster_;
+  }
+
+  if (GetXlaOpsCommonFlags().tf_xla_always_defer_compilation ||
+      cannot_compile_cluster) {
+    executable = nullptr;
+  } else {
+    Status status = CompileToLocalExecutable(
+        ctx, function_, platform_info_, resources_, constants_,
+        /*lazy=*/!must_compile_, &client, &variables, &kernel, &executable);
+    if (must_compile_ || status.code() != error::UNIMPLEMENTED) {
+      OP_REQUIRES_OK(ctx, status);
+    }
+
+    if (status.code() == error::UNIMPLEMENTED) {
+      LOG(WARNING) << "Compilation failed:" << status.ToString()
+                   << ".  Falling back to TF function call.";
+      executable = nullptr;
+      mutex_lock guard(cannot_compile_cluster_mu_);
+      cannot_compile_cluster_ = true;
+    }
+  }
+
+  AllocatorAttributes host_alloc_attrs;
+  host_alloc_attrs.set_gpu_compatible(true);
+  host_alloc_attrs.set_on_host(true);
+  Allocator* cpu_allocator = ctx->device()->GetAllocator(host_alloc_attrs);
+
+  if (!executable) {
+    DCHECK(!must_compile_);
+    Tensor compilation_key(cpu_allocator, DT_STRING, TensorShape({}));
+
+    Tensor compilation_successful(cpu_allocator, DT_BOOL, TensorShape({}));
+    compilation_successful.scalar<bool>()() = false;
+    ctx->set_output(0, Tensor(cpu_allocator, DT_STRING, TensorShape({})));
+    ctx->set_output(1, compilation_successful);
+    return;
+  }
 
   // Each execution of an XlaCompile op creates a new XlaExecutableClosure, even
   // if it didn't have to compile the cluster because of a compilation-cache
@@ -415,13 +462,6 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) {
       XlaExecutableClosureStore::Global()->Produce(XlaExecutableClosure(
           client, executable, kernel, std::move(variables), constants_.size()));
 
-  Allocator* cpu_allocator = [&] {
-    AllocatorAttributes host_alloc_attrs;
-    host_alloc_attrs.set_gpu_compatible(true);
-    host_alloc_attrs.set_on_host(true);
-    return ctx->device()->GetAllocator(host_alloc_attrs);
-  }();
-
   Tensor compilation_key(cpu_allocator, DT_STRING, TensorShape({}));
   compilation_key.flat<string>()(0) = key;
 
@@ -432,11 +472,11 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) {
   ctx->set_output(1, compilation_successful);
 }
 
-XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-  OP_REQUIRES_OK(ctx, PlatformInfoFromContext(ctx, &platform_info_));
-}
+XlaRunOp::XlaRunOp(OpKernelConstruction* ctx)
+    : OpKernel(ctx), platform_info_(PlatformInfoFromContext(ctx)) {}
 
 void XlaRunOp::Compute(OpKernelContext* ctx) {
+  VLOG(3) << "XlaRunOp " << def().name();
   Tensor key_tensor = ctx->input(ctx->num_inputs() - 1);
   const XlaExecutableClosureStore::KeyT& key = key_tensor.flat<string>()(0);
 
@@ -491,6 +531,8 @@ REGISTER_KERNEL_BUILDER(Name("_XlaCompile").Device(DEVICE_CPU), XlaCompileOp);
 REGISTER_KERNEL_BUILDER(Name("_XlaCompile")
                             .Device(DEVICE_GPU)
                             .HostMemory("constants")
+                            .HostMemory("key")
+                            .HostMemory("compilation_successful")
                             .HostMemory("resources"),
                         XlaCompileOp);
 
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h
index 489d26eb30a66646158f39ea3fc6f55759c7f88e..7b4d4b5b4737784d4fe277d5bbe9cab79cfaf4c9 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.h
+++ b/tensorflow/compiler/jit/kernels/xla_ops.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_
 #define TENSORFLOW_COMPILER_JIT_KERNELS_XLA_OPS_H_
 
+#include <atomic>
+
 #include "tensorflow/compiler/jit/xla_compilation_cache.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_launch_util.h"
@@ -33,6 +35,7 @@ namespace tensorflow {
 class XlaPlatformInfo {
  public:
   XlaPlatformInfo() : device_type_("") {}
+  XlaPlatformInfo(XlaPlatformInfo&&) = default;
   explicit XlaPlatformInfo(const DeviceType device_type,
                            se::Platform::Id platform_id,
                            const XlaDevice::Metadata* xla_device_metadata,
@@ -110,12 +113,12 @@ class XlaLocalLaunchBase : public OpKernel {
 
  protected:
   // Indexes of compile-time constant inputs
-  std::vector<int> constants_;
+  const std::vector<int> constants_;
   // Indexes of resource inputs
-  std::vector<int> resources_;
+  const std::vector<int> resources_;
 
-  NameAttrList function_;
-  XlaPlatformInfo platform_info_;
+  const NameAttrList function_;
+  const XlaPlatformInfo platform_info_;
 };
 
 // XlaLocalLaunchOp is used to replace a region of the TensorFlow graph
@@ -144,13 +147,23 @@ class XlaCompileOp : public OpKernel {
 
  private:
   // Indexes of compile-time constant inputs
-  std::vector<int> constants_;
+  const std::vector<int> constants_;
   // Indexes of resource inputs
-  std::vector<int> resources_;
+  const std::vector<int> resources_;
 
-  NameAttrList function_;
+  const NameAttrList function_;
 
   XlaPlatformInfo platform_info_;
+
+  const bool must_compile_;
+
+  // cannot_compile_cluster_ is set to true if XLA returns an Unimplemented
+  // error when compiling the cluster this _XlaCompile is supposed to compile.
+  // If `cannot_compile_cluster_` is true then we avoid compiling this cluster
+  // on any future calls to _XlaCompile.
+  bool cannot_compile_cluster_ GUARDED_BY(cannot_compile_cluster_mu_) = false;
+
+  mutex cannot_compile_cluster_mu_;
 };
 
 class XlaRunOp : public OpKernel {
@@ -160,7 +173,7 @@ class XlaRunOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 
  private:
-  XlaPlatformInfo platform_info_;
+  const XlaPlatformInfo platform_info_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/BUILD b/tensorflow/compiler/jit/legacy_flags/BUILD
deleted file mode 100644
index 07c5b2318851ed506711b9ee00c66fe680a3afd8..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/BUILD
+++ /dev/null
@@ -1,41 +0,0 @@
-# Legacy command line flags for the XLA bridge libraries.
-
-# Please do not add more flags to this package.
-
-# The XLA bridge libraries were written in an environment that allowed
-# command-line flags to be scattered freely throughout the libraries.  This
-# model, while initially convenient, leads to a proliferation in unused command
-# line flags in tests and binaries, and serious problems in servers, where one
-# might wish parameters to be different in independent RPC calls to the same
-# routine.
-#
-# Please don't add more flags.  If you're a library author, pass options and
-# parameters explicitly through the library's interface.
-
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//tensorflow:internal"])
-
-cc_library(
-    name = "mark_for_compilation_pass_flags",
-    srcs = ["mark_for_compilation_pass_flags.cc"],
-    hdrs = ["mark_for_compilation_pass_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "xla_device_flags",
-    srcs = ["xla_device_flags.cc"],
-    hdrs = ["xla_device_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
deleted file mode 100644
index 7277a1d1f8ad5fa045645ead839ab9efa01e89c7..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's mark_for_compilation_pass module.
-
-#include <mutex>
-#include <vector>
-
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static MarkForCompilationPassFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new MarkForCompilationPassFlags;
-  flags->tf_xla_auto_jit = 0;
-  flags->tf_xla_min_cluster_size = 2;
-  flags->tf_xla_max_cluster_size = std::numeric_limits<int32>::max();
-  flags->tf_xla_clustering_debug = false;
-  flags->tf_xla_cpu_global_jit = false;
-  flags->tf_xla_clustering_fuel = std::numeric_limits<int64>::max();
-  flags->tf_xla_fusion_only = false;
-  flag_list = new std::vector<Flag>(
-      {Flag("tf_xla_auto_jit", &flags->tf_xla_auto_jit,
-            "Control compilation of operators into XLA computations on CPU and "
-            "GPU devices.  0 = use ConfigProto setting; -1 = off; 1 = on for "
-            "things very likely to be improved; 2 = on for everything.  "
-            "Experimental."),
-       Flag("tf_xla_min_cluster_size", &flags->tf_xla_min_cluster_size,
-            "Minimum number of operators in an XLA compilation. Ignored for "
-            "operators placed on an XLA device or operators explicitly marked "
-            "for compilation."),
-       Flag("tf_xla_max_cluster_size", &flags->tf_xla_max_cluster_size,
-            "Maximum number of operators in an XLA compilation."),
-       Flag("tf_xla_clustering_debug", &flags->tf_xla_clustering_debug,
-            "Dump graphs during XLA compilation."),
-       Flag("tf_xla_cpu_global_jit", &flags->tf_xla_cpu_global_jit,
-            "Enables global JIT compilation for CPU via SessionOptions."),
-       Flag("tf_xla_clustering_fuel", &flags->tf_xla_clustering_fuel,
-            "Places an artificial limit on the number of ops marked as "
-            "eligible for clustering."),
-       Flag("tf_xla_fusion_only", &flags->tf_xla_fusion_only,
-            "enable fusion of element-wise operations only using XLA when "
-            "global_jit_level is ON*.")});
-  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with the XLA bridge's
-// mark_for_compilation_pass module.
-void AppendMarkForCompilationPassFlags(std::vector<Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the MarkForCompilationPassFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
deleted file mode 100644
index 2affda6ab4e0fbad32a246744fa5b38aeb629c1b..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_
-
-// Legacy flags for the XLA bridge's mark_for_compilation_pass module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with the XLA bridge's
-// mark_for_compilation_pass module.
-void AppendMarkForCompilationPassFlags(
-    std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with the XLA bridge's
-// mark_for_compilation_pass module.
-typedef struct {
-  int32 tf_xla_auto_jit;  // Control compilation of operators into XLA
-                          // computations on CPU and GPU devices.  0 = use
-                          // ConfigProto setting; -1 = off; 1 = on for things
-                          // very likely to be improved; 2 = on for everything.
-                          // Experimental.
-  int32 tf_xla_min_cluster_size;  // Minimum number of operators in an XLA
-                                  // compilation. Ignored for operators placed
-                                  // on an XLA device or operators explicitly
-                                  // marked for compilation.
-  int32 tf_xla_max_cluster_size;  // Maximum number of operators in an XLA
-                                  // compilation.
-  bool tf_xla_clustering_debug;   // Dump graphs during XLA compilation.
-  bool tf_xla_cpu_global_jit;     // Enables global JIT compilation for CPU
-                                  // via SessionOptions.
-  int64 tf_xla_clustering_fuel;   // "Compiler fuel" for clustering.  Only this
-                                  // many ops will be marked as eligible for
-                                  // clustering.
-  bool tf_xla_fusion_only;  // This flag is effective only when global_jit_level
-                            // is set to ON* and overrides its behavior. If
-                            // true, enable fusion of element-wise operations
-                            // only using XLA.
-} MarkForCompilationPassFlags;
-
-// Return a pointer to the MarkForCompilationPassFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-MarkForCompilationPassFlags* GetMarkForCompilationPassFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
deleted file mode 100644
index 1bb2fce2dbad5bffce2e33b665b7222090d0855a..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's xla_device module.
-
-#include <mutex>
-#include <vector>
-
-#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static XlaDeviceFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new XlaDeviceFlags;
-  flags->tf_xla_compile_on_demand = false;
-  flag_list = new std::vector<Flag>({
-      Flag("tf_xla_compile_on_demand", &flags->tf_xla_compile_on_demand,
-           "Switch a device into 'on-demand' mode, where instead of "
-           "autoclustering ops are compiled one by one just-in-time."),
-  });
-  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-// Return a pointer to the XlaDeviceFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-XlaDeviceFlags* GetXlaDeviceFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
deleted file mode 100644
index 27b22121ac1e089bd5d5a494e1e3fb60b05bc76d..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
-
-// Legacy flags for the XLA bridge's xla_device module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// The values of flags associated with the XLA bridge's
-// xla_device module.
-typedef struct {
-  // Switch the CPU device into "on-demand" mode, where instead of
-  // autoclustering ops are compiled one by one just-in-time.
-  // Enabling this mode by a legacy flag is a temporary mechanism. When this
-  // feature is battle-tested, we will switch this to be a session option.
-  bool tf_xla_compile_on_demand;
-} XlaDeviceFlags;
-
-// Return a pointer to the XlaDeviceFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-XlaDeviceFlags* GetXlaDeviceFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 4f0c370e65159c89c91ea58733f20f852d9acc99..6618e3a58ab7b6374ed775cd6e4e18a6a4975588 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -24,8 +24,8 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
@@ -49,6 +49,51 @@ limitations under the License.
 namespace tensorflow {
 
 namespace {
+// Aggregates information about what kinds of ops are allowed.
+struct OperationFilter {
+  // Whether resource variable ops are allowed.  We do not allow resource
+  // variable ops in called functions (either as direct TF calls or as higher
+  // order control flow ops) because we do not yet model their memory effects in
+  // jit/resource_variable_safety_analysis.
+  bool allow_resource_ops;
+
+  // Whether stateful RNG ops are allowed.  XLA's RNG does not have the same
+  // seeding behavior as TensorFlow's RNG (b/34749654).  So we avoid
+  // auto-clustering stateful RNG ops.
+  bool allow_stateful_rng_ops;
+
+  // TODO(b/118970344): Whether ControlTrigger ops are allowed.  It is unsound
+  // to cluster ControlTrigger because of how we use deadness analysis.
+  bool allow_control_trigger;
+
+  // Whether ops with dummy implementations are allowed. We avoid
+  // auto-clustering these ops so that the user is not surprised when XLA is
+  // implicitly enabled. If the user explicitly specifies to use XLA, it is fine
+  // to resort to a dummy implementation. Currently Assert and CheckNumerics ops
+  // have dummy XLA implementations.
+  bool allow_dummy_ops;
+
+  // Whether ops that produce or consume DT_VARIANT values are allowed.  We
+  // don't auto-cluster these ops because we don't yet support live-in or
+  // live-out DT_VARIANT values.
+  bool allow_ops_producing_or_consuming_variant;
+};
+
+bool IsDummyImplOp(absl::string_view op_name) {
+  return op_name == "Assert" || op_name == "CheckNumerics";
+}
+
+bool IsStatefulRandomOp(absl::string_view op_name) {
+  return op_name == "RandomUniform" || op_name == "RandomShuffle" ||
+         op_name == "RandomUniformInt" || op_name == "RandomStandardNormal" ||
+         op_name == "TruncatedNormal" || op_name == "Multinomial";
+}
+
+bool OpProducesOrConsumesVariant(const Node& node) {
+  auto is_variant = [](DataType dtype) { return dtype == DT_VARIANT; };
+  return absl::c_any_of(node.input_types(), is_variant) ||
+         absl::c_any_of(node.output_types(), is_variant);
+}
 
 bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
   // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient
@@ -101,7 +146,7 @@ const int kMaxRecursionDepth = 10;
 
 bool IsCompilableCall(const NodeDef& call_def,
                       const DeviceType& jit_device_type,
-                      bool allow_resource_ops, int depth,
+                      const OperationFilter& op_filter, int depth,
                       FunctionLibraryRuntime* lib_runtime);
 
 // Tests whether 'while_node' is a completely compilable loop.
@@ -109,7 +154,7 @@ bool IsCompilableCall(const NodeDef& call_def,
 // while loop to be compilable.
 bool IsCompilableWhile(const Node& while_node,
                        const DeviceType& jit_device_type,
-                       bool allow_resource_ops, int depth,
+                       const OperationFilter& op_filter, int depth,
                        FunctionLibraryRuntime* lib_runtime) {
   const NameAttrList* name_attr;
   NodeDef call;
@@ -124,7 +169,7 @@ bool IsCompilableWhile(const Node& while_node,
   call.set_name("while_cond");
   call.set_op(cond_func);
   *call.mutable_attr() = name_attr->attr();
-  if (!IsCompilableCall(call, jit_device_type, allow_resource_ops, depth + 1,
+  if (!IsCompilableCall(call, jit_device_type, op_filter, depth + 1,
                         lib_runtime)) {
     VLOG(2) << "Rejecting While " << while_node.name()
             << ": can't compile loop condition: " << cond_func;
@@ -140,7 +185,7 @@ bool IsCompilableWhile(const Node& while_node,
   call.set_name("while_body");
   call.set_op(body_func);
   *call.mutable_attr() = name_attr->attr();
-  if (!IsCompilableCall(call, jit_device_type, allow_resource_ops, depth + 1,
+  if (!IsCompilableCall(call, jit_device_type, op_filter, depth + 1,
                         lib_runtime)) {
     VLOG(2) << "Rejecting While " << while_node.name()
             << ": can't compile loop body: " << body_func;
@@ -154,7 +199,7 @@ bool IsCompilableWhile(const Node& while_node,
 // compilable.
 bool IsCompilableCall(const NodeDef& call_def,
                       const DeviceType& jit_device_type,
-                      bool allow_resource_ops, int depth,
+                      const OperationFilter& op_filter, int depth,
                       FunctionLibraryRuntime* lib_runtime) {
   if (depth > kMaxRecursionDepth) {
     VLOG(2) << "Rejecting " << call_def.op()
@@ -195,16 +240,30 @@ bool IsCompilableCall(const NodeDef& call_def,
       continue;
     if (node->type_string() == "While") {
       // Handle functional While loop.
-      return IsCompilableWhile(*node, jit_device_type, allow_resource_ops,
-                               depth + 1, lib_runtime);
+      return IsCompilableWhile(*node, jit_device_type, op_filter, depth + 1,
+                               lib_runtime);
     }
-    if (!allow_resource_ops &&
+    if (!op_filter.allow_resource_ops &&
         (HasResourceInput(*node) || HasResourceOutput(*node))) {
       return false;
     }
+    if (!op_filter.allow_stateful_rng_ops &&
+        IsStatefulRandomOp(node->type_string())) {
+      return false;
+    }
+    if (!op_filter.allow_control_trigger && node->IsControlTrigger()) {
+      return false;
+    }
+    if (!op_filter.allow_dummy_ops && IsDummyImplOp(node->type_string())) {
+      return false;
+    }
+    if (!op_filter.allow_ops_producing_or_consuming_variant &&
+        OpProducesOrConsumesVariant(*node)) {
+      return false;
+    }
     if (!HasXLAKernel(*node, jit_device_type) &&
-        !IsCompilableCall(node->def(), jit_device_type, allow_resource_ops,
-                          depth + 1, lib_runtime)) {
+        !IsCompilableCall(node->def(), jit_device_type, op_filter, depth + 1,
+                          lib_runtime)) {
       VLOG(2) << "Rejecting " << call_def.op() << ": unsupported op "
               << node->name() << ": " << node->def().ShortDebugString();
       return false;
@@ -383,8 +442,7 @@ Status FindCompilationCandidates(
       BackwardsConstAnalysis(graph, /*compile_time_const_arg_indices=*/nullptr,
                              &compile_time_const_nodes));
 
-  int64& fuel =
-      legacy_flags::GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel;
+  int64& fuel = GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel;
 
   // Iterate over nodes in sorted order so that compiler fuel is deterministic.
   // We can't simply pass op_nodes().begin() and op_nodes().end to the
@@ -426,14 +484,47 @@ Status FindCompilationCandidates(
     CHECK(
         XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration));
     DeviceType jit_device_type(registration->compilation_device_name);
+
+    bool always_auto_cluster = registration->autoclustering_policy ==
+                               XlaOpRegistry::AutoclusteringPolicy::kAlways;
+
+    OperationFilter op_filter;
+    op_filter.allow_resource_ops = registration->compile_resource_ops;
+    op_filter.allow_stateful_rng_ops = always_auto_cluster;
+    op_filter.allow_control_trigger = always_auto_cluster;
+    op_filter.allow_dummy_ops = always_auto_cluster;
+    op_filter.allow_ops_producing_or_consuming_variant = always_auto_cluster;
+
     if (!HasXLAKernel(*node, jit_device_type) &&
-        !IsCompilableCall(node->def(), jit_device_type,
-                          registration->compile_resource_ops, 0, lib_runtime)) {
+        !IsCompilableCall(node->def(), jit_device_type, op_filter, 0,
+                          lib_runtime)) {
       VLOG(2) << "Rejecting " << node->name() << ": unsupported op "
               << node->type_string();
       continue;
     }
-    if (!registration->compile_resource_ops &&
+
+    if (!op_filter.allow_stateful_rng_ops &&
+        IsStatefulRandomOp(node->type_string())) {
+      VLOG(2) << "Rejecting " << node->name() << ": stateful random operation";
+      continue;
+    }
+    if (!op_filter.allow_control_trigger && node->IsControlTrigger()) {
+      VLOG(2) << "Rejecting " << node->name() << ": is a control trigger op";
+      continue;
+    }
+    if (!op_filter.allow_dummy_ops && IsDummyImplOp(node->type_string())) {
+      VLOG(2) << "Rejecting " << node->name() << ": dummy op ("
+              << node->type_string() << ")";
+      continue;
+    }
+    if (!op_filter.allow_ops_producing_or_consuming_variant &&
+        OpProducesOrConsumesVariant(*node)) {
+      VLOG(2) << "Rejecting " << node->name()
+              << ": produces or consumes DT_VARIANT";
+      continue;
+    }
+
+    if (!op_filter.allow_resource_ops &&
         (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) {
       // We don't have a way of returning values of type DT_RESOURCE from XLA
       // computations so we avoid auto-clustering nodes producing DT_RESOURCE.
@@ -444,6 +535,7 @@ Status FindCompilationCandidates(
               << node->type_string();
       continue;
     }
+
     if (compile_time_const_nodes[node->id()]) {
       const OpDef* op_def;
       TF_RETURN_IF_ERROR(
@@ -501,9 +593,7 @@ Status FindCompilationCandidates(
     // registration->compile_resource_ops is true for XLA_CPU/XLA_GPU but not
     // for CPU/GPU.
     if (node->type_string() == "While" &&
-        !IsCompilableWhile(*node, jit_device_type,
-                           registration->compile_resource_ops, 0,
-                           lib_runtime)) {
+        !IsCompilableWhile(*node, jit_device_type, op_filter, 0, lib_runtime)) {
       continue;
     }
     // _Arg nodes in a top-level function represent feeds.
@@ -536,8 +626,7 @@ OptimizerOptions::GlobalJitLevel GetGlobalJitLevel(
     // To set compilation to be on by default, change the following line.
     global_jit_level = OptimizerOptions::OFF;
   }
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
   if (flags->tf_xla_auto_jit == -1 ||
       (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) {
     // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides
@@ -563,10 +652,16 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) {
                                             &registration));
   DeviceType jit_device_type(registration->compilation_device_name);
 
-  // We can always *compile* resource operations, even if we are sometimes
-  // unable to auto-cluster them.
-  const bool compile_resource_ops = true;
-  return IsCompilableCall(ndef, jit_device_type, compile_resource_ops, 0, flr);
+  // We can always *compile* resource operations, stateful RNGs and dummy ops,
+  // even if we are sometimes unable to auto-cluster them.
+  OperationFilter op_filter;
+  op_filter.allow_resource_ops = true;
+  op_filter.allow_stateful_rng_ops = true;
+  op_filter.allow_control_trigger = true;
+  op_filter.allow_dummy_ops = true;
+  op_filter.allow_ops_producing_or_consuming_variant = true;
+
+  return IsCompilableCall(ndef, jit_device_type, op_filter, 0, flr);
 }
 
 Status MarkForCompilationPass::Run(
@@ -575,12 +670,9 @@ Status MarkForCompilationPass::Run(
   // device ahead of time.
   OptimizerOptions::GlobalJitLevel global_jit_level =
       GetGlobalJitLevel(options);
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
-  bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
   bool fusion_only = flags->tf_xla_fusion_only;
 
-  VLOG(1) << "flags->tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit;
   VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only;
   VLOG(1) << "flags->tf_xla_auto_jit = " << flags->tf_xla_auto_jit;
   const FunctionLibraryDefinition* fld = options.flib_def;
@@ -599,9 +691,6 @@ Status MarkForCompilationPass::Run(
       return false;
     }
 
-    // If this device requires a JIT, we must say yes.
-    if (registration->requires_compilation) return true;
-
     // If there is a _XlaCompile annotation, use its value.
     bool compile = false;
     Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile);
@@ -638,18 +727,21 @@ Status MarkForCompilationPass::Run(
       return false;
     }
 
-    // Otherwise use the value of global_jit_level.
-    // Ignore enable_jit_by_default if global jit compilation for CPU
-    // is explicitly requested via tf_xla_cpu_global_jit flag
-    bool ignore_registration = cpu_global_jit && device_type == DEVICE_CPU;
+    // Otherwise use the value of global_jit_level and the device's
+    // autoclustering policy.
     bool should_compile =
-        (ignore_registration || registration->enable_jit_by_default) &&
-        global_jit_level != OptimizerOptions::OFF;
+        registration->autoclustering_policy ==
+            XlaOpRegistry::AutoclusteringPolicy::kAlways ||
+        (registration->autoclustering_policy ==
+             XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally &&
+         global_jit_level != OptimizerOptions::OFF);
     if (!should_compile) {
       if (global_jit_level == OptimizerOptions::OFF) {
         VLOG(2) << "Rejecting " << node->name() << ": global jit disabled.";
       } else {
-        VLOG(2) << "Rejecting " << node->name() << ": JIT for device disabled.";
+        VLOG(2)
+            << "Rejecting " << node->name()
+            << ": autoclustering for device only when requested explicitly.";
       }
     }
     return should_compile;
@@ -879,8 +971,7 @@ Status MarkForCompilationPass::RunImpl(
 
   OptimizerOptions::GlobalJitLevel global_jit_level =
       GetGlobalJitLevel(options);
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
 
   // Repeatedly contract edges between clusters that are on the same device,
   // provided the contraction would not create a cycle.
@@ -952,6 +1043,28 @@ Status MarkForCompilationPass::RunImpl(
         continue;
       }
 
+      // If any of the consumer's producers are on a different device, do not
+      // cluster these nodes. This prevents other work on this device from being
+      // delayed by work on other devices. We consider predecessors of the
+      // entire cluster rather than just the inputs to the node to prevent the
+      // cluster still being combined in cases where the 'to' cluster has
+      // multiple dependencies on the 'from' cluster and another dependency
+      // leads to a merging of the clusters.
+      //
+      // TODO(b/117085735): We probably want to handle the reciprocal of this
+      // case where a cluster is producing data for multiple devices.
+      bool found_split = false;
+      for (const auto& in_id : cycles.Predecessors(to)) {
+        if (in_id >= graph->num_node_ids()) continue;
+
+        Node* in = graph->FindNodeId(in_id);
+        if (compilation_candidates.find(in) != compilation_candidates.cend() &&
+            in->assigned_device_name() != node_to->assigned_device_name()) {
+          found_split = true;
+        }
+      }
+      if (found_split) continue;
+
       // If contracting the edge would create a cycle, bail out.
       // However, just because we can't merge the clusters now does not mean
       // we won't be able to merge them in the future.
@@ -1015,12 +1128,10 @@ Status MarkForCompilationPass::RunImpl(
     XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration);
 
     // Compile if this is a cluster of >= min_cluster_size compilable operators.
-    // Also, always compile if the operator is placed on a device that requires
-    // compilation, or if it contains at least one op that is marked for
+    // Also, always compile if it contains at least one op that is marked for
     // compilation that is not an Identity op.
     if (effective_cluster_sizes[cluster] >= min_cluster_size ||
-        (effective_cluster_sizes[cluster] > 0 && marked_for_compilation) ||
-        registration->requires_compilation) {
+        (effective_cluster_sizes[cluster] > 0 && marked_for_compilation)) {
       string& name = cluster_names[cluster];
 
       if (name.empty()) {
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 2a80c745e3fcebf97bcccb03551feb3d6fb9f831..bf2c5508ea9e987e80093f4c2e15d3ff5191126f 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/list_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
@@ -817,14 +818,10 @@ TEST(XlaCompilationTest, ClusterControlTrigger) {
 
   std::unordered_map<string, string> clusters = GetClusters(*graph);
 
-  ASSERT_FALSE(clusters.empty());
-  string cluster_name = clusters.begin()->second;
-
-  // ctrl_trigger_a has inputs with mismatching deadness so it won't be
-  // clustered.  ctrl_trigger_b is okay to cluster.
-  std::unordered_map<string, string> expected_clusters(
-      {{"const_a", cluster_name}, {"ctrl_trigger_b", cluster_name}});
-  EXPECT_EQ(clusters, expected_clusters);
+  // TODO(b/118970344): ctrl_trigger_a has inputs with mismatching deadness so
+  // it won't be clustered.  ctrl_trigger_b is okay to cluster but we don't
+  // cluster it because of b/118970344.
+  EXPECT_TRUE(clusters.empty());
 }
 
 TEST(XlaCompilationTest, RandomShape) {
@@ -923,9 +920,8 @@ TEST(XlaCompilationTest, RandomShapeOnXlaDevice) {
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
 
   std::unordered_map<string, string> clusters = GetClusters(*graph);
-  EXPECT_NE(clusters["test/shape_rng"], "");
-  EXPECT_NE(clusters["test/reshape"], "");
-  EXPECT_NE(clusters["test/shape_rng"], clusters["test/reshape"]);
+  EXPECT_EQ(clusters["test/shape_rng"], "");
+  EXPECT_EQ(clusters["test/reshape"], "");
 }
 
 TEST(XlaCompilationTest, TensorArrayShapeOnXlaDevice) {
@@ -961,5 +957,271 @@ TEST(XlaCompilationTest, TensorArrayShapeOnXlaDevice) {
   EXPECT_EQ(clusters["test/read"], clusters["test/reshape"]);
 }
 
+TEST(XlaCompilationTest, DontClusterMergingNodes) {
+  // MatMulCombined below takes data from nodes on GPU0 and GPU1 and is placed
+  // on GPU1. However, it should not be clustered with the previous node on
+  // GPU1, because that will serialize production of its inputs that should be
+  // done in parallel.
+  //
+  // This graph is:
+  // (Const0, Const0) -> MatMul0
+  // (Const1, Const1) -> MatMul1
+  // (MatMul0, MatMul1) -> MatMulCombined
+  //
+  // Device0: [Const0, Const0, MatMul0]
+  // Device1: [Const1, Const1, MatMul1, MatMulCombined]
+  //
+  // Cluster0: [Const0, Const0, MatMul0]
+  // Cluster1: [Const1, Const1, MatMul1]
+  // Cluster2: [MatMulCombined]
+  Scope root = Scope::NewRootScope().ExitOnError();
+  absl::string_view xla_gpu_dev0 =
+      "/job:worker/replica:0/task:0/device:XLA_GPU:0";
+  absl::string_view xla_gpu_dev1 =
+      "/job:worker/replica:0/task:0/device:XLA_GPU:1";
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Output a = ops::Const(root.WithOpName("A_dev0"), 1.0f, {2, 2});
+  Output b = ops::Const(root.WithOpName("B_dev1"), 1.0f, {2, 2});
+  Output matmul0 = ops::MatMul(root.WithOpName("MatMul0_dev0"), a, a);
+  Output matmul1 = ops::MatMul(root.WithOpName("MatMul1_dev1"), b, b);
+
+  Output combined =
+      ops::MatMul(root.WithOpName("MatMulCombined_dev1"), matmul0, matmul1);
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  for (Node* n : graph->nodes()) {
+    if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) {
+      n->set_assigned_device_name(string(xla_gpu_dev0));
+    } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) {
+      n->set_assigned_device_name(string(xla_gpu_dev1));
+    }
+  }
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  // Each of the MatMuls should be in a separate cluster.
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]);
+  EXPECT_NE(clusters["MatMulCombined_dev1"], clusters["MatMul0_dev0"]);
+  EXPECT_NE(clusters["MatMulCombined_dev1"], clusters["MatMul1_dev1"]);
+  EXPECT_EQ(clusters["A_dev0"], clusters["MatMul0_dev0"]);
+  EXPECT_EQ(clusters["B_dev1"], clusters["MatMul1_dev1"]);
+}
+
+// TODO(b/117085735): This form of clustering should be prevented.
+TEST(XlaCompilationTest, NOT_DontClusterSpreadingNodes) {
+  // MatMulSource below creates data for nodes on GPU0 and GPU1 and is placed
+  // on GPU0. However, it should not be clustered with the next node on
+  // GPU0, because that will prevent the node on GPU1 from beginning its work as
+  // soon as the data has been produced.
+  //
+  // This graph is:
+  // (Const0, Const0) -> MatMulSource
+  // MatMulSource -> (MatMul0, MatMul1)
+  //
+  // Device0: [Const0, Const1, MatMulSource, MatMul0]
+  // Device1: [MatMul1]
+  //
+  // Cluster0: [Const0, Const1, MatMulSource]
+  // Cluster1: [MatMul0]
+  // Cluster2: [MatMul1]
+  Scope root = Scope::NewRootScope().ExitOnError();
+  absl::string_view xla_gpu_dev0 =
+      "/job:worker/replica:0/task:0/device:XLA_GPU:0";
+  absl::string_view xla_gpu_dev1 =
+      "/job:worker/replica:0/task:0/device:XLA_GPU:1";
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Output a = ops::Const(root.WithOpName("A_dev0"), 1.0f, {2, 2});
+  Output matmul_source =
+      ops::MatMul(root.WithOpName("MatMulSource_dev0"), a, a);
+
+  Output matmul0 = ops::MatMul(root.WithOpName("MatMul0_dev0"), matmul_source,
+                               matmul_source);
+  Output matmul1 = ops::MatMul(root.WithOpName("MatMul1_dev1"), matmul_source,
+                               matmul_source);
+
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  for (Node* n : graph->nodes()) {
+    if (absl::EndsWith(n->name(), /*suffix=*/"dev0")) {
+      n->set_assigned_device_name(string(xla_gpu_dev0));
+    } else if (absl::EndsWith(n->name(), /*suffix=*/"dev1")) {
+      n->set_assigned_device_name(string(xla_gpu_dev1));
+    }
+  }
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["A_dev0"], clusters["MatMulSource_dev0"]);
+  EXPECT_NE(clusters["MatMul0_dev0"], clusters["MatMul1_dev1"]);
+  EXPECT_NE(clusters["MatMulSource_dev0"], clusters["MatMul1_dev1"]);
+
+  // Improved Heuristics should prevent this probably.
+  EXPECT_EQ(clusters["MatMulSource_dev0"], clusters["MatMul0_dev0"]);
+}
+
+TEST(XlaCompilationTest, ClusterStatefulRandomOpOnXlaDevice) {
+  absl::string_view xla_cpu_device =
+      "/job:worker/replica:0/task:0/device:XLA_CPU:0";
+
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output shape = ops::Const(root.WithOpName("test/shape_shape"), {200, 200});
+  Output a = ops::RandomUniform(root.WithOpName("test/a"), shape, DT_FLOAT);
+  Output b = ops::RandomUniform(root.WithOpName("test/b"), shape, DT_FLOAT);
+  Output c = ops::Add(root.WithOpName("test/c"), a, b);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  for (Node* n : graph->nodes()) {
+    if (absl::StartsWith(n->name(), /*prefix=*/"test/")) {
+      n->set_assigned_device_name(string(xla_cpu_device));
+    }
+  }
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_NE(clusters["test/a"], "");
+  EXPECT_NE(clusters["test/b"], "");
+  EXPECT_NE(clusters["test/c"], "");
+}
+
+TEST(XlaCompilationTest, DontAutoClusterStatefulRandomOp) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output shape = ops::Const(root.WithOpName("test/shape_shape"), {200, 200});
+  Output a = ops::RandomUniform(root.WithOpName("test/a"), shape, DT_FLOAT);
+  Output b = ops::RandomUniform(root.WithOpName("test/b"), shape, DT_FLOAT);
+  Output c = ops::Add(root.WithOpName("test/c"), a, b);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["test/a"], "");
+  EXPECT_EQ(clusters["test/b"], "");
+}
+
+TEST(XlaCompilationTest, ClusterDummyOpsOnXlaDevice) {
+  absl::string_view xla_cpu_device =
+      "/job:worker/replica:0/task:0/device:XLA_CPU:0";
+
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);
+  Output check =
+      ops::CheckNumerics(root.WithOpName("test/check"), a, "test/check");
+  Output ge = ops::GreaterEqual(root.WithOpName("test/greaterequal"), check, b);
+  Operation assert = ops::Assert(root.WithOpName("test/assert"), ge, {a, b});
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  for (Node* n : graph->nodes()) {
+    if (absl::StartsWith(n->name(), /*prefix=*/"test/")) {
+      n->set_assigned_device_name(string(xla_cpu_device));
+    }
+  }
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_NE(clusters["test/check"], "");
+  EXPECT_NE(clusters["test/greaterequal"], "");
+  EXPECT_NE(clusters["test/assert"], "");
+}
+
+TEST(XlaCompilationTest, DontAutoClusterDummyOps) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_FLOAT);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_FLOAT);
+  Output check =
+      ops::CheckNumerics(root.WithOpName("test/check"), a, "test/check");
+  Output ge = ops::GreaterEqual(root.WithOpName("test/greaterequal"), check, b);
+  Operation assert = ops::Assert(root.WithOpName("test/assert"), ge, {a, b});
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["test/assert"], "");
+  EXPECT_EQ(clusters["test/check"], "");
+}
+
+TEST(XlaCompilationTest, DontAutoClusterOpsProducingVariant) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64);
+
+  Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32);
+  Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32);
+
+  Output tensor_list_reserve = ops::TensorListReserve(
+      root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["test/tensor_list_reserve"], "");
+}
+
+TEST(XlaCompilationTest, DontAutoClusterOpsConsumingVariant) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output dummy_input =
+      ops::Placeholder(root.WithOpName("test/dummy_input"), DT_INT64);
+  Output variant_input =
+      ops::Placeholder(root.WithOpName("test/variant_input"), DT_VARIANT);
+
+  // Create one more node so that we don't avoid creating a cluster solely
+  // because it would be trivial.
+  Output dummy_cast =
+      ops::Cast(root.WithOpName("test/dummy_cast"), dummy_input, DT_INT32);
+
+  Output tensor_list_element_shape = ops::TensorListElementShape(
+      root.WithOpName("test/tensor_list_element_shape"), variant_input,
+      DT_INT32);
+
+  root.graph()->AddControlEdge(dummy_cast.node(),
+                               tensor_list_element_shape.node());
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["test/tensor_list_element_shape"], "");
+}
+
+TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64);
+
+  Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32);
+  Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32);
+
+  Output tensor_list_reserve = ops::TensorListReserve(
+      root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0";
+  for (Node* n : graph->nodes()) {
+    if (absl::StartsWith(n->name(), /*prefix=*/"test/")) {
+      n->set_assigned_device_name(xla_cpu_device);
+    }
+  }
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_NE(clusters["test/tensor_list_reserve"], "");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
index d56d0f8ccfcdab40003be38059228cb255921b64..64a3301745790132fe3149bf8fb52d6c45ecc3c1 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
@@ -34,15 +34,9 @@ namespace tensorflow {
   //
   // It may be worth refactoring out XlaOpRegistry::RegisterCompilationDevice to
   // make this more direct, but probably not worth it solely for this test.
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(*session_options, "", &devices));
 
-  auto delete_devices = gtl::MakeCleanup([&] {
-    for (Device* d : devices) {
-      delete d;
-    }
-  });
-
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = graph;
   opt_options.session_options = session_options;
diff --git a/tensorflow/compiler/jit/node_matchers.cc b/tensorflow/compiler/jit/node_matchers.cc
index d8ace628e6b76e011ecddd4d526efc4db9c9237e..c788091724e443ba1e3bcd60515d68e71e2e0824 100644
--- a/tensorflow/compiler/jit/node_matchers.cc
+++ b/tensorflow/compiler/jit/node_matchers.cc
@@ -19,7 +19,10 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
+#include "absl/strings/str_replace.h"
 #include "absl/strings/str_split.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 
 namespace tensorflow {
@@ -28,6 +31,7 @@ namespace matchers {
 namespace {
 
 using impl::NodeMatcherProperties;
+using impl::OutEdge;
 
 string IndentAllButFirstLine(absl::string_view text) {
   std::vector<std::string> lines = absl::StrSplit(text, '\n');
@@ -99,8 +103,6 @@ bool MatchAndExplainTensor(const Tensor& tensor, const Tensor& expected_tensor,
   }
 }
 
-using Input = std::pair<const Node*, int>;
-
 struct NodeMatcher : public ::testing::MatcherInterface<const Node*> {
   bool MatchAndExplain(
       const Node* node,
@@ -191,6 +193,30 @@ struct NodeMatcher : public ::testing::MatcherInterface<const Node*> {
       }
       return false;
     }
+
+    const AttrValueMap attr_value_map = node->def().attr();
+    for (const auto& attr_kv_pair : attrs) {
+      auto it = attr_value_map.find(attr_kv_pair.first);
+      if (it == attr_value_map.end()) {
+        if (listener->IsInterested()) {
+          *listener << "did not find attribute named \"" << attr_kv_pair.first
+                    << "\" in node";
+        }
+        return false;
+      }
+      if (attr_kv_pair.second &&
+          !AreAttrValuesEqual(it->second, *attr_kv_pair.second)) {
+        if (listener->IsInterested()) {
+          *listener << "attribute named " << attr_kv_pair.first
+                    << " does not match value; expected: \""
+                    << SummarizeAttrValue(*attr_kv_pair.second)
+                    << "\", found: \"" << SummarizeAttrValue(it->second)
+                    << "\"";
+        }
+        return false;
+      }
+    }
+
     return true;
   }
 
@@ -232,7 +258,7 @@ struct NodeMatcher : public ::testing::MatcherInterface<const Node*> {
         *os << "matching " << ss.str();
       } else {
         int edge_idx = 0;
-        for (const ::testing::Matcher<Input>& matcher : (*input_matchers)) {
+        for (const ::testing::Matcher<OutEdge>& matcher : (*input_matchers)) {
           *os << "\n  [" << edge_idx << "] matching (";
           ::std::stringstream ss;
           matcher.DescribeTo(&ss);
@@ -250,6 +276,21 @@ struct NodeMatcher : public ::testing::MatcherInterface<const Node*> {
       control_dep_set->DescribeTo(os);
     }
 
+    if (!attrs.empty()) {
+      printed_something = true;
+      std::vector<string> attrs_str;
+      absl::c_transform(
+          attrs, std::back_inserter(attrs_str),
+          [](const std::pair<string, absl::optional<AttrValue>>& attr_kv_pair) {
+            return absl::StrCat(attr_kv_pair.first, "->",
+                                attr_kv_pair.second
+                                    ? SummarizeAttrValue(*attr_kv_pair.second)
+                                    : "*");
+          });
+      *os << " and attr values matching [" << absl::StrJoin(attrs_str, ", ")
+          << "]";
+    }
+
     if (!printed_something) {
       *os << "is any node";
     }
@@ -266,7 +307,7 @@ struct NodeMatcher : public ::testing::MatcherInterface<const Node*> {
     }
 
     ::testing::StringMatchResultListener inner_listener;
-    Input input = {edge->src(), edge->src_output()};
+    OutEdge input = {edge->src(), edge->src_output()};
     if ((*input_matchers)[input_idx].MatchAndExplain(input, &inner_listener)) {
       return true;
     }
@@ -286,22 +327,24 @@ struct NodeMatcher : public ::testing::MatcherInterface<const Node*> {
   absl::optional<string> name;
   absl::optional<string> assigned_device;
   absl::optional<Tensor> constant_value;
-  absl::optional<std::vector<::testing::Matcher<Input>>> input_matchers;
+  absl::optional<std::vector<::testing::Matcher<OutEdge>>> input_matchers;
   absl::optional<::testing::Matcher<absl::Span<const Node* const>>>
       control_dep_set;
+  std::map<string, absl::optional<AttrValue>> attrs;
 };
 
 // Matches a dst and dst_output on an input edge.  Today we only use this with
 // dst_output=0 but we will eventually need to support multi-output operations.
-class InputMatcher : public ::testing::MatcherInterface<Input> {
+class OutEdgeMatcher : public ::testing::MatcherInterface<OutEdge> {
  public:
-  InputMatcher(::testing::Matcher<const Node*> src_matcher, int src_output)
-      : src_matcher_(std::move(src_matcher)), src_output_(src_output) {}
+  OutEdgeMatcher(::testing::Matcher<const Node*> src_matcher, int src_oidx)
+      : src_matcher_(std::move(src_matcher)), src_oidx_(src_oidx) {}
 
   bool MatchAndExplain(
-      Input input, ::testing::MatchResultListener* listener) const override {
+      OutEdge out_edge,
+      ::testing::MatchResultListener* listener) const override {
     ::testing::StringMatchResultListener inner_listener;
-    if (!src_matcher_.MatchAndExplain(input.first, &inner_listener)) {
+    if (!src_matcher_.MatchAndExplain(out_edge.first, &inner_listener)) {
       if (listener->IsInterested()) {
         *listener << "\nsource does not match expected ";
         src_matcher_.DescribeTo(listener->stream());
@@ -312,10 +355,10 @@ class InputMatcher : public ::testing::MatcherInterface<Input> {
       }
       return false;
     }
-    if (input.second != src_output_) {
+    if (out_edge.second != src_oidx_) {
       if (listener->IsInterested()) {
-        *listener << "\nexpected output slot to be " << src_output_
-                  << " but found " << input.second;
+        *listener << "\nexpected output slot to be " << src_oidx_
+                  << " but found " << out_edge.second;
       }
       return false;
     }
@@ -324,31 +367,21 @@ class InputMatcher : public ::testing::MatcherInterface<Input> {
   }
 
   void DescribeTo(::std::ostream* os) const override {
-    if (src_output_) {
-      *os << "output slot: " << src_output_ << ", source: (";
+    if (src_oidx_) {
+      *os << "output slot: " << src_oidx_ << ", source: (";
     }
 
     src_matcher_.DescribeTo(os);
 
-    if (src_output_) {
+    if (src_oidx_) {
       *os << ")";
     }
   }
 
  private:
   ::testing::Matcher<const Node*> src_matcher_;
-  int src_output_;
+  int src_oidx_;
 };
-
-std::vector<::testing::Matcher<Input>> NodeMatchersToInputMatchers(
-    absl::Span<const ::testing::Matcher<const Node*>> node_matchers) {
-  std::vector<::testing::Matcher<Input>> result;
-  absl::c_transform(node_matchers, std::back_inserter(result),
-                    [](::testing::Matcher<const Node*> n) {
-                      return ::testing::MakeMatcher(new InputMatcher(n, 0));
-                    });
-  return result;
-}
 }  // namespace
 
 ::testing::Matcher<const Node*> impl::NodeWith(
@@ -375,10 +408,9 @@ std::vector<::testing::Matcher<Input>> NodeMatchersToInputMatchers(
       matcher->assigned_device = prop.assigned_device();
     }
 
-    if (prop.input_nodes()) {
+    if (prop.inputs()) {
       DCHECK(!matcher->input_matchers);
-      matcher->input_matchers =
-          NodeMatchersToInputMatchers(*prop.input_nodes());
+      matcher->input_matchers = *prop.inputs();
     }
 
     if (prop.control_deps()) {
@@ -386,6 +418,11 @@ std::vector<::testing::Matcher<Input>> NodeMatchersToInputMatchers(
       matcher->control_dep_set =
           ::testing::UnorderedElementsAreArray(*prop.control_deps());
     }
+
+    if (prop.attr()) {
+      auto insert_result = matcher->attrs.insert(*prop.attr());
+      DCHECK(insert_result.second);
+    }
   }
 
   return ::testing::MakeMatcher(matcher);
@@ -412,12 +449,12 @@ impl::NodeMatcherProperties AssignedDevice(string assigned_device) {
 }
 
 impl::NodeMatcherProperties impl::Inputs(
-    absl::Span<const ::testing::Matcher<const Node*>> inputs) {
-  std::vector<::testing::Matcher<const Node*>> inputs_vector;
+    absl::Span<const ::testing::Matcher<OutEdge>> inputs) {
+  std::vector<::testing::Matcher<OutEdge>> inputs_vector;
   absl::c_copy(inputs, std::back_inserter(inputs_vector));
 
   impl::NodeMatcherProperties props;
-  props.set_input_nodes(std::move(inputs_vector));
+  props.set_inputs(std::move(inputs_vector));
   return props;
 }
 
@@ -431,6 +468,45 @@ impl::NodeMatcherProperties impl::CtrlDeps(
   return props;
 }
 
+std::pair<string, AttrValue> impl::AttrLiteralHelper(
+    const std::pair<string, bool>& bool_attr) {
+  AttrValue attr_value;
+  attr_value.set_b(bool_attr.second);
+  return {bool_attr.first, attr_value};
+}
+
+std::pair<string, AttrValue> impl::AttrLiteralHelper(
+    const std::pair<string, absl::Span<const int>>& int_list_attr) {
+  AttrValue attr_value;
+  AttrValue::ListValue* list = attr_value.mutable_list();
+  for (int i : int_list_attr.second) {
+    list->add_i(i);
+  }
+  return {int_list_attr.first, attr_value};
+}
+
+std::pair<string, AttrValue> impl::AttrLiteralHelper(
+    const std::pair<string, absl::Span<const string>>& string_list_attr) {
+  AttrValue attr_value;
+  AttrValue::ListValue* list = attr_value.mutable_list();
+  for (string s : string_list_attr.second) {
+    list->add_s(s);
+  }
+  return {string_list_attr.first, attr_value};
+}
+
+impl::NodeMatcherProperties impl::Attr(std::pair<string, AttrValue> attr) {
+  impl::NodeMatcherProperties props;
+  props.set_attr(std::move(attr));
+  return props;
+}
+
+impl::NodeMatcherProperties impl::Attr(string name) {
+  impl::NodeMatcherProperties props;
+  props.set_attr({std::move(name), absl::nullopt});
+  return props;
+}
+
 NodeMatcherProperties ConstantValue(
     const ::tensorflow::Input::Initializer& val) {
   TF_CHECK_OK(val.status);
@@ -439,9 +515,13 @@ NodeMatcherProperties ConstantValue(
   return props;
 }
 
-::testing::Matcher<const Node*> Const(
+::testing::Matcher<impl::OutEdge> Const(
     const ::tensorflow::Input::Initializer& val) {
-  return NodeWith(ConstantValue(val));
+  return Out(NodeWith(ConstantValue(val)));
+}
+::testing::Matcher<impl::OutEdge> Out(
+    int oidx, ::testing::Matcher<const Node*> node_matcher) {
+  return ::testing::MakeMatcher(new OutEdgeMatcher(node_matcher, oidx));
 }
 }  // namespace matchers
 
@@ -455,4 +535,7 @@ Node* FindNodeByName(Graph* g, absl::string_view name) {
   return nullptr;
 }
 }  // namespace testing
+
+void PrintTo(const Node* n, ::std::ostream* os) { *os << SummarizeNode(*n); }
+void PrintTo(Node* n, ::std::ostream* os) { *os << SummarizeNode(*n); }
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/node_matchers.h b/tensorflow/compiler/jit/node_matchers.h
index 0437a7e95c1eb3bdcdbe24a440dd90a5943c0894..0d4f02c236bba353799f75ee91cf03235b424b29 100644
--- a/tensorflow/compiler/jit/node_matchers.h
+++ b/tensorflow/compiler/jit/node_matchers.h
@@ -19,7 +19,7 @@ limitations under the License.
 //
 //  tensorflow::Node* node = ...;
 //  EXPECT_THAT(node, NodeWith(Name("name"), Op("op"),
-//                             Inputs(NodeWith(Name("input")))))
+//                             Inputs(Out(3, NodeWith(Name("input"))))))
 //
 // Matchable node properties (the expressions that go inside NodeWith(...))
 // are:
@@ -32,7 +32,8 @@ limitations under the License.
 //  - AssignedDevice(string): matches the assigned device exactly.
 //
 //  - Inputs(<ordered list>): matches the list of non-control inputs to the node
-//    exactly (i.e. does not match a suffix or a prefix).
+//    exactly (i.e. does not match a suffix or a prefix) where each element
+//    matches an output of a node (see Out(idx, node) below).
 //
 //  - CtrlDeps(<unordered list>): matches the list of control dependences on the
 //    node exactly but in any order.
@@ -40,10 +41,16 @@ limitations under the License.
 //  - ConstantValue(tensorflow::Input::Initializer init): matches a Const node
 //    with the constant value `init`.  Implies Op("Const").
 //
-// Node properties may not be repeated in a single NodeWith(...)  matcher.
-// E.g. NodeWith(Op("Foo"), Op("Bar")) will CHECK-fail.  Since ConstantValue
-// implies Op("Const"), a single NodeWith matcher can't have both
-// ConstantValue(...) and Op(...).
+//  - Attr(name, value): Matches a single attribute with name `name` and value
+//    `value`.  Right now only boolean values are supported.
+//
+// Overlapping node properties may not be repeated in a single NodeWith(...)
+// matcher.  E.g. NodeWith(Op("Foo"), Op("Bar")) will CHECK-fail.  Since
+// ConstantValue implies Op("Const"), a single NodeWith matcher can't have both
+// ConstantValue(...) and Op(...).  Multiple Attr() values can be combined as
+// long as the attribute names are different.
+//
+// Out(idx, node) matches the `idx`'th output of a node that matches `node`.
 
 #ifndef TENSORFLOW_COMPILER_JIT_NODE_MATCHERS_H_
 #define TENSORFLOW_COMPILER_JIT_NODE_MATCHERS_H_
@@ -66,6 +73,8 @@ namespace matchers {
 
 namespace impl {
 
+using OutEdge = std::pair<const Node*, int>;
+
 // -----------------------------------------------------------------------------
 // Implementation details.
 
@@ -74,6 +83,8 @@ namespace impl {
 class NodeMatcherProperties {
  public:
   using NodeSeqMatcher = std::vector<::testing::Matcher<const Node*>>;
+  using InputSeqMatcher = std::vector<::testing::Matcher<OutEdge>>;
+  using AttrKeyValuePair = std::pair<string, absl::optional<AttrValue>>;
 
   const absl::optional<string>& name() const { return name_; }
   const absl::optional<string>& op() const { return op_; }
@@ -83,12 +94,13 @@ class NodeMatcherProperties {
   const absl::optional<Tensor>& constant_value() const {
     return constant_value_;
   }
-  const absl::optional<NodeSeqMatcher>& input_nodes() const {
-    return input_nodes_;
+  const absl::optional<InputSeqMatcher>& inputs() const {
+    return input_matchers_;
   }
   const absl::optional<NodeSeqMatcher>& control_deps() const {
     return control_deps_;
   }
+  const absl::optional<AttrKeyValuePair>& attr() const { return attr_; }
 
   void set_name(string name) {
     DCHECK(IsEmpty());
@@ -111,9 +123,9 @@ class NodeMatcherProperties {
     op_ = "Const";
   }
 
-  void set_input_nodes(NodeSeqMatcher input_nodes) {
+  void set_inputs(InputSeqMatcher inputs) {
     DCHECK(IsEmpty());
-    input_nodes_ = std::move(input_nodes);
+    input_matchers_ = std::move(inputs);
   }
 
   void set_control_deps(NodeSeqMatcher control_deps) {
@@ -121,9 +133,14 @@ class NodeMatcherProperties {
     control_deps_ = std::move(control_deps);
   }
 
+  void set_attr(AttrKeyValuePair attr) {
+    DCHECK(IsEmpty());
+    attr_ = std::move(attr);
+  }
+
   bool IsEmpty() const {
-    return !name().has_value() && !op().has_value() &&
-           !input_nodes().has_value() && !control_deps().has_value();
+    return !name().has_value() && !op().has_value() && !inputs().has_value() &&
+           !control_deps().has_value() && !attr().has_value();
   }
 
  private:
@@ -131,18 +148,31 @@ class NodeMatcherProperties {
   absl::optional<string> op_;
   absl::optional<string> assigned_device_;
   absl::optional<Tensor> constant_value_;
-  absl::optional<NodeSeqMatcher> input_nodes_;
+  absl::optional<InputSeqMatcher> input_matchers_;
   absl::optional<NodeSeqMatcher> control_deps_;
+  absl::optional<AttrKeyValuePair> attr_;
 };
 
 ::testing::Matcher<const Node*> NodeWith(
     absl::Span<const NodeMatcherProperties> props);
 
 impl::NodeMatcherProperties Inputs(
-    absl::Span<const ::testing::Matcher<const Node*>> inputs);
+    absl::Span<const ::testing::Matcher<OutEdge>> inputs);
 
 impl::NodeMatcherProperties CtrlDeps(
     absl::Span<const ::testing::Matcher<const Node*>> control_deps);
+
+impl::NodeMatcherProperties Attr(std::pair<string, AttrValue> attrs);
+impl::NodeMatcherProperties Attr(string name);
+
+std::pair<string, AttrValue> AttrLiteralHelper(
+    const std::pair<string, bool>& bool_attr);
+
+std::pair<string, AttrValue> AttrLiteralHelper(
+    const std::pair<string, absl::Span<const int>>& int_list_attr);
+
+std::pair<string, AttrValue> AttrLiteralHelper(
+    const std::pair<string, absl::Span<const string>>& string_list_attr);
 }  // namespace impl
 
 // -----------------------------------------------------------------------------
@@ -157,6 +187,17 @@ impl::NodeMatcherProperties Op(string op);
 // Matches a node with assigned device `assigned_device`.
 impl::NodeMatcherProperties AssignedDevice(string assigned_device);
 
+// Matches a node with a boolean typed attrbute named `name` and with value
+// `value`.
+template <typename ValueTy>
+impl::NodeMatcherProperties Attr(const string& name, ValueTy value) {
+  return impl::Attr({impl::AttrLiteralHelper({name, value})});
+}
+
+inline impl::NodeMatcherProperties Attr(const string& name) {
+  return impl::Attr(name);
+}
+
 // Matches a node with inputs `inputs`.
 //
 // `inputs` are ordered; `inputs`[i] must match input i.
@@ -165,6 +206,16 @@ impl::NodeMatcherProperties Inputs(Ts... inputs) {
   return impl::Inputs({inputs...});
 }
 
+// Matches the `idx`'th output of a node that matches `node`.
+::testing::Matcher<impl::OutEdge> Out(int oidx,
+                                      ::testing::Matcher<const Node*> node);
+
+// Matches the first output of a node that matches `node`.
+inline ::testing::Matcher<impl::OutEdge> Out(
+    ::testing::Matcher<const Node*> node) {
+  return Out(0, node);
+}
+
 // Matches a node with control dependences `control_deps`.
 //
 // `control_deps` are unordered and will match the control deps of a node in any
@@ -185,13 +236,16 @@ template <typename... Ts>
   return impl::NodeWith(array);
 }
 
-::testing::Matcher<const Node*> Const(
+::testing::Matcher<impl::OutEdge> Const(
     const ::tensorflow::Input::Initializer& val);
 }  // namespace matchers
 
 // If `g` has a node named `name` returns it, otherwise returns null.
 Node* FindNodeByName(Graph* g, absl::string_view name);
 }  // namespace testing
+
+void PrintTo(const Node* n, ::std::ostream* os);
+void PrintTo(Node* n, ::std::ostream* os);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_NODE_MATCHERS_H_
diff --git a/tensorflow/compiler/jit/node_matchers_test.cc b/tensorflow/compiler/jit/node_matchers_test.cc
index 93a8994307b38ac240c22d0a18268638ac7620ae..c3f0dfece85573d71dbfa21eba5af70b674fe71e 100644
--- a/tensorflow/compiler/jit/node_matchers_test.cc
+++ b/tensorflow/compiler/jit/node_matchers_test.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/control_flow_ops.h"
+#include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/math_ops.h"
 
 namespace tensorflow {
@@ -27,12 +29,14 @@ namespace {
 using ::testing::_;
 
 using testing::matchers::AssignedDevice;
+using testing::matchers::Attr;
 using testing::matchers::ConstantValue;
 using testing::matchers::CtrlDeps;
 using testing::matchers::Inputs;
 using testing::matchers::Name;
 using testing::matchers::NodeWith;
 using testing::matchers::Op;
+using testing::matchers::Out;
 
 template <typename M, typename T>
 string Explain(const T& t, const M& m) {
@@ -61,7 +65,7 @@ TEST(NodeMatchers, CheckAgainstConstant) {
             "\nexpected op Add but found Placeholder");
   EXPECT_EQ(Explain(placeholder.node(), NodeWith(Name("add"))),
             "\nexpected name add but found placeholder");
-  EXPECT_EQ(Explain(placeholder.node(), NodeWith(Inputs(NodeWith()))),
+  EXPECT_EQ(Explain(placeholder.node(), NodeWith(Inputs(Out(NodeWith())))),
             "\nexpected 1 inputs but node has 0");
 }
 
@@ -74,18 +78,19 @@ TEST(NodeMatchers, CheckAgainstBinary) {
       ops::Placeholder(root.WithOpName("placeholder_b"), DT_FLOAT);
   Output add = ops::Add(root.WithOpName("add"), placeholder_a, placeholder_b);
 
-  EXPECT_THAT(add.node(), NodeWith(Op("Add"), Name("add"),
-                                   Inputs(NodeWith(Name("placeholder_a")),
-                                          NodeWith(Name("placeholder_b")))));
+  EXPECT_THAT(add.node(),
+              NodeWith(Op("Add"), Name("add"),
+                       Inputs(Out(NodeWith(Name("placeholder_a"))),
+                              Out(NodeWith(Name("placeholder_b"))))));
 
   EXPECT_EQ(Explain(add.node(), NodeWith(Inputs())),
             "\nexpected 0 inputs but node has 2");
   EXPECT_EQ(
-      Explain(add.node(), NodeWith(Inputs(NodeWith(Name("blah")), _))),
+      Explain(add.node(), NodeWith(Inputs(Out(NodeWith(Name("blah"))), _))),
       "\ninput 0 does not match expected:\nname: blah, \nsource does not match "
       "expected name: blah\n\t\nexpected name blah but found placeholder_a");
   EXPECT_EQ(
-      Explain(add.node(), NodeWith(Inputs(_, NodeWith(Name("blah"))))),
+      Explain(add.node(), NodeWith(Inputs(_, Out(NodeWith(Name("blah")))))),
       "\ninput 1 does not match expected:\nname: blah, \nsource does not match "
       "expected name: blah\n\t\nexpected name blah but found placeholder_b");
 }
@@ -174,6 +179,36 @@ TEST(NodeMatchers, AssignedDevice) {
             "/job:localhost/replica:0/task:0/device:CPU:0 but found \"\"");
 }
 
+TEST(NodeMatchers, OutputIndices) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output pred = ops::Placeholder(root.WithOpName("pred"), DT_BOOL);
+
+  Output data = ops::Placeholder(root.WithOpName("data"), DT_FLOAT);
+  ops::Switch sw(root.WithOpName("switch"), data, pred);
+  Output add = ops::Add(root.WithOpName("add"), sw.output_true,
+                        ops::Placeholder(root.WithOpName("addend"), DT_FLOAT));
+
+  EXPECT_THAT(add.node(), NodeWith(Inputs(Out(1, NodeWith(Op("Switch"))), _)));
+  EXPECT_EQ(
+      Explain(add.node(), NodeWith(Inputs(Out(0, NodeWith(Op("Switch"))), _))),
+      "\ninput 0 does not match expected:\nop: Switch, \nexpected output slot "
+      "to be 0 but found 1");
+}
+
+TEST(NodeMatchers, Attrs) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output enter = ops::internal::Enter(
+      root.WithOpName("enter"),
+      ops::Placeholder(root.WithOpName("data"), DT_FLOAT), "frame_name",
+      ops::internal::Enter::Attrs{}.IsConstant(true));
+  EXPECT_THAT(enter.node(), NodeWith(Attr("is_constant", true)));
+  EXPECT_EQ(Explain(enter.node(), NodeWith(Attr("is_constant", false))),
+            "attribute named is_constant does not match value; expected: "
+            "\"false\", found: \"true\"");
+  EXPECT_EQ(Explain(enter.node(), NodeWith(Attr("missing_attr", false))),
+            "did not find attribute named \"missing_attr\" in node");
+}
+
 }  // namespace
 }  // namespace testing
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD
index f72224545b25bc7100e0b6788e6fbf0a7ca63dad..64409d9334751e0edfce9091a4e5697dd2c712c5 100644
--- a/tensorflow/compiler/jit/ops/BUILD
+++ b/tensorflow/compiler/jit/ops/BUILD
@@ -18,3 +18,9 @@ tf_gen_op_wrapper_py(
     out = "xla_ops.py",
     deps = ["//tensorflow/compiler/jit/ops:xla_ops"],
 )
+
+py_library(
+    name = "xla_ops_grad",
+    srcs = ["xla_ops_grad.py"],
+    deps = ["//tensorflow/python:framework_ops"],
+)
diff --git a/tensorflow/compiler/jit/ops/xla_ops.cc b/tensorflow/compiler/jit/ops/xla_ops.cc
index bcd1a29b1ff789b5674a21ff66cc6d23a809afc5..95d12e95fd9a0d1cca513ee74a0651ea69eba89e 100644
--- a/tensorflow/compiler/jit/ops/xla_ops.cc
+++ b/tensorflow/compiler/jit/ops/xla_ops.cc
@@ -54,6 +54,7 @@ REGISTER_OP("XlaClusterOutput")
 REGISTER_OP("_XlaCompile")
     .Input("constants: Tconstants")
     .Attr("Tconstants: list(type) >= 0")
+    .Attr("must_compile: bool")
     .Input("args: Targs")
     .Attr("Targs: list(type) >= 0")
     .Input("resources: Nresources * resource")
@@ -71,8 +72,12 @@ that _XlaRun can use to look up the LocalExecutable and execute it.
 key: A key that can be used to look up the local executable compiled by the
    node and associated metadata.
 
-compilation_successful: True iff the compilation was successful.  Always true
-for now.
+compilation_successful: If the `must_compile` attr is false the _XlaCompile op
+   can decide not to compile the clusters based on some profitability
+   heuristics.  In that case `compilation_successful` is false if _XlaCompile
+   chose not to compile the cluster.  If the `must_compile` attr is true then
+   _XlaCompile always attempts to compile the cluster and
+   `compilation_successful` is always true.
 )");
 
 REGISTER_OP("_XlaRun")
diff --git a/tensorflow/compiler/jit/ops/xla_ops_grad.py b/tensorflow/compiler/jit/ops/xla_ops_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d31d8dc714307a48932d061fb1af643940a0872
--- /dev/null
+++ b/tensorflow/compiler/jit/ops/xla_ops_grad.py
@@ -0,0 +1,29 @@
+"""Gradients for XLA ops."""
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+
+
+@ops.RegisterGradient("XlaClusterOutput")
+def _XlaClusterOutputGrad(_, grad):
+  del grad  # unused
+  raise RuntimeError("Gradient computation of graph in xla.compile() is "
+                     "prohibited because it can cause performance degradation."
+                     "Please move gradient computation inside xla.compile().")
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
index b1f9e9088f391cb8813d2c82395ffcc0b2081cae..42ea3926e16ae791dbe1bede3b8742383db7667c 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -22,9 +22,14 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
 namespace {
+
+bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); }
+
+namespace reduce_device_to_host_copies {
 Status FindNodesToDecluster(const Graph& graph,
                             absl::flat_hash_set<Node*>* result,
                             absl::Span<Node* const> post_order) {
@@ -132,11 +137,13 @@ Status PartiallyDeclusterNode(Graph* graph, Node* n) {
     graph->RemoveEdge(out_edge_to_clone);
   }
 
+  if (n->out_edges().empty()) {
+    graph->RemoveNode(n);
+  }
+
   return Status::OK();
 }
 
-bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); }
-
 // Clones nodes to outside their cluster to avoid device-to-host copies.  For
 // instance, converts this:
 //
@@ -163,7 +170,7 @@ bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); }
 // where the ===> arrow has a hostmem source and destination and would entail a
 // device to host copy if the source and destination were not in the same XLA
 // cluster.
-Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) {
+Status PartiallyDeclusterGraph(Graph* graph) {
   // When deciding whether to decluster a particular node, we base our decision
   // on if we've decided that some of its consumers have to be declustered too.
   // Iterating the graph in post-order guarantees that consumers have been
@@ -190,6 +197,10 @@ Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) {
     }
   }
 
+  // Recompute post order since PartiallyDeclusterNode may have deleted nodes.
+  post_order.clear();
+  GetPostOrder(*graph, &post_order, /*stable_comparator=*/NodeComparatorName(),
+               /*edge_filter=*/NotBackedge);
   nodes_to_partially_decluster.clear();
   TF_RETURN_IF_ERROR(
       FindNodesToDecluster(*graph, &nodes_to_partially_decluster, post_order));
@@ -197,7 +208,9 @@ Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) {
 
   return Status::OK();
 }
+}  // namespace reduce_device_to_host_copies
 
+namespace reduce_recompilation {
 bool IsIntraClusterEdge(const Edge& edge) {
   absl::optional<absl::string_view> src_cluster_name =
       GetXlaClusterForNode(*edge.src());
@@ -206,18 +219,28 @@ bool IsIntraClusterEdge(const Edge& edge) {
   return src_cluster_name.has_value() && src_cluster_name == dst_cluster_name;
 }
 
-Status MustCompileNode(const Node* n, bool* result) {
+bool IsMustCompileDevice(const DeviceType& device_type) {
+  const XlaOpRegistry::DeviceRegistration* registration;
+  if (XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
+    return registration->autoclustering_policy ==
+           XlaOpRegistry::AutoclusteringPolicy::kAlways;
+  }
+
+  return false;
+}
+
+Status MustCompileNode(const Node* n, bool* must_compile) {
   DeviceType device_type("");
   TF_RETURN_IF_ERROR(
       DeviceToDeviceType(n->assigned_device_name(), &device_type));
 
-  const XlaOpRegistry::DeviceRegistration* registration;
-  if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration)) {
-    *result = false;
-  } else {
-    *result = registration->requires_compilation;
+  if (IsMustCompileDevice(device_type)) {
+    *must_compile = true;
+    return Status::OK();
   }
 
+  // We must compile `n` if it does not have a TensorFlow kernel.
+  *must_compile = !FindKernelDef(device_type, n->def(), nullptr, nullptr).ok();
   return Status::OK();
 }
 
@@ -250,7 +273,7 @@ Status MustCompileNode(const Node* n, bool* result) {
 // regress performance in any significant manner.  We will have to revisit this
 // algorith with a more complex cost model if this assumption turns out to be
 // incorrect.
-Status DeclusterNodesToReduceRecompilations(Graph* graph) {
+Status PartiallyDeclusterGraph(Graph* graph) {
   std::vector<bool> compile_time_const_nodes(graph->num_node_ids());
   TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
       *graph, nullptr, &compile_time_const_nodes, IsIntraClusterEdge));
@@ -303,7 +326,7 @@ Status DeclusterNodesToReduceRecompilations(Graph* graph) {
 
   return Status::OK();
 }
-
+}  // namespace reduce_recompilation
 }  // namespace
 
 Status PartiallyDeclusterPass::Run(
@@ -315,8 +338,9 @@ Status PartiallyDeclusterPass::Run(
 
   Graph* graph = options.graph->get();
 
-  TF_RETURN_IF_ERROR(PartiallyDeclusterToRemoveDeviceToHostCopies(graph));
-  TF_RETURN_IF_ERROR(DeclusterNodesToReduceRecompilations(graph));
+  TF_RETURN_IF_ERROR(
+      reduce_device_to_host_copies::PartiallyDeclusterGraph(graph));
+  TF_RETURN_IF_ERROR(reduce_recompilation::PartiallyDeclusterGraph(graph));
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
index 0feb73a89e7050e8c413e5a733da1d87775b0ba3..38a54cc5efae35ad77b6dc8039c653e920cfc071 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -385,7 +386,7 @@ TEST(PartiallyDeclusterPassTest, DontDeclusterXlaDeviceOps) {
   TF_ASSERT_OK(s.ToGraph(graph.get()));
 
   // This is needed to register the XLA_GPU device.
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_ASSERT_OK(DeviceFactory::AddDevices(
       SessionOptions(), "/job:localhost/replica:0/task:0", &devices));
 
@@ -399,10 +400,64 @@ TEST(PartiallyDeclusterPassTest, DontDeclusterXlaDeviceOps) {
   TF_ASSERT_OK(PartiallyDecluster(&graph));
 
   EXPECT_EQ(GetXlaClusterForNode(*n), "cluster_0");
+}
+
+TEST(PartiallyDeclusterPassTest, DontDeclusterNonTensorFlowOps) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output dynamic_slice_operand =
+      ops::Placeholder(s.WithOpName("dynamic_slice_operand"), DT_INT32,
+                       ops::Placeholder::Attrs{});
+  Output dynamic_slice_begin = ops::Placeholder(
+      s.WithOpName("dynamic_slice_begin"), DT_INT32, ops::Placeholder::Attrs{});
+  Output dynamic_slice_size = ops::Placeholder(
+      s.WithOpName("dynamic_slice_size"), DT_INT32, ops::Placeholder::Attrs{});
+  Output dynamic_slice =
+      ops::XlaDynamicSlice(s.WithOpName("dynamic_slice"), dynamic_slice_operand,
+                           dynamic_slice_begin, dynamic_slice_size);
+
+  Output reshape_input = ops::Placeholder(s.WithOpName("reshape_input"),
+                                          DT_FLOAT, ops::Placeholder::Attrs{});
+  Output reshape =
+      ops::Reshape(s.WithOpName("reshape"), reshape_input, dynamic_slice);
+
+  AddToCluster({dynamic_slice.node(), reshape.node()}, "cluster_0");
+
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  TF_ASSERT_OK(s.ToGraph(graph.get()));
+
+  Node* n = FindNodeByName(*graph, "dynamic_slice");
+  ASSERT_NE(n, nullptr);
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+
+  EXPECT_EQ(GetXlaClusterForNode(*n), "cluster_0");
+}
 
-  for (Device* d : devices) {
-    delete d;
+TEST(PartiallyDeclusterPassTest, EliminatedUnusedNodes) {
+  const char* const kClusteredProducer0Name = "ClusteredProducer0";
+  const char* const kClusteredProducer1Name = "ClusteredProducer1";
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* input =
+        ops::SourceOp("FakeNullary", builder.opts().WithName("Input"));
+    Node* clustered_producer_0 =
+        ops::BinaryOp("FakeBinary", input, input,
+                      builder.opts().WithName(kClusteredProducer0Name));
+    Node* clustered_producer_1 =
+        ops::BinaryOp("FakeBinary", clustered_producer_0, input,
+                      builder.opts().WithName(kClusteredProducer1Name));
+    ops::BinaryOp("FakeBinary", clustered_producer_1, input,
+                  builder.opts().WithName("UnclusteredConsumer"));
+    clustered_producer_0->AddAttr(kXlaClusterAttr, "cluster_0");
+    clustered_producer_1->AddAttr(kXlaClusterAttr, "cluster_0");
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
+
+  TF_ASSERT_OK(PartiallyDecluster(&graph));
+  EXPECT_EQ(FindNodeByName(*graph, kClusteredProducer0Name), nullptr);
+  EXPECT_EQ(FindNodeByName(*graph, kClusteredProducer1Name), nullptr);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/jit/producer_consumer_queue.h b/tensorflow/compiler/jit/producer_consumer_queue.h
deleted file mode 100644
index 7c8c04152d2f3a0fd46711df24756b7e68b967ea..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/producer_consumer_queue.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
-#define TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
-
-#include <deque>
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-
-// A thread-safe, first-in-first-out queue.
-template <typename T>
-class ProducerConsumerQueue {
- public:
-  ProducerConsumerQueue()
-      : capacity_(std::numeric_limits<std::size_t>::max()) {}
-  ~ProducerConsumerQueue() = default;
-
-  // Wait until the queue is non-full, then append a copy of v.
-  void Put(const T &v);
-
-  // Wait until the queue is non-empty, then remove and return the head value.
-  T Get();
-
-  // If the queue is non-empty, remove the head value, placing it in *pv, and
-  // return true; otherwise return false.
-  bool TryGet(T *pv);
-
-  // Set the capacity of the queue; the queue is full whenever count() >=
-  // capacity().  The initial value is the maximum size_t.  Requires size > 0.
-  void set_capacity(std::size_t size);
-
-  // Return the capacity of the queue.
-  std::size_t capacity() const;
-
-  // Return the number of elements in the queue.
-  std::size_t count() const;
-
-  // Implementation details follow.  Clients should ignore.
- private:
-  mutable tensorflow::mutex mu_;  // protects all fields below
-  tensorflow::condition_variable non_empty_ GUARDED_BY(mu_);
-  tensorflow::condition_variable non_full_ GUARDED_BY(mu_);
-  std::size_t capacity_ GUARDED_BY(mu_);
-  std::deque<T> queue_ GUARDED_BY(mu_);
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ProducerConsumerQueue);
-};
-
-// ------------------------------------------------------
-// Implementation details follow.  Clients should ignore.
-
-// Wait until the queue is non-full, then append a copy of v.
-template <typename T>
-void ProducerConsumerQueue<T>::Put(const T &v) {
-  mutex_lock lock(mu_);
-  while (queue_.size() >= capacity_) {
-    non_full_.wait(lock);
-  }
-  queue_.push_back(v);
-  non_empty_.notify_one();
-}
-
-// Wait until the queue is non-empty, then remove and return the head value.
-template <typename T>
-T ProducerConsumerQueue<T>::Get() {
-  mutex_lock lock(mu_);
-  while (queue_.empty()) {
-    non_empty_.wait(lock);
-  }
-  non_full_.notify_one();
-  T result_value = queue_.front();
-  queue_.pop_front();
-  return result_value;
-}
-
-// If the queue is non-empty, remove the head value, placing it in *pv, and
-// return true; otherwise return false.
-template <typename T>
-bool ProducerConsumerQueue<T>::TryGet(T *pv) {
-  mutex_lock lock(mu_);
-  bool got_element = !queue_.empty();
-  if (got_element) {
-    non_full_.notify_one();
-    *pv = queue_.front();
-    queue_.pop_front();
-  }
-  return got_element;
-}
-
-// Set the capacity of the queue; the queue is full whenever count() >=
-// capacity().  The initial value is the maximum size_t.  Requires size > 0.
-template <typename T>
-void ProducerConsumerQueue<T>::set_capacity(std::size_t size) {
-  mutex_lock lock(mu_);
-  CHECK_NE(size, 0);
-  capacity_ = size;
-  non_full_.notify_all();
-}
-
-// Return the capacity of the queue.
-template <typename T>
-std::size_t ProducerConsumerQueue<T>::capacity() const {
-  mutex_lock lock(mu_);
-  std::size_t max_elements = capacity_;
-  return max_elements;
-}
-
-// Return the number of elements in the queue.
-template <typename T>
-std::size_t ProducerConsumerQueue<T>::count() const {
-  mutex_lock lock(mu_);
-  std::size_t num_elements = queue_.size();
-  return num_elements;
-}
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
diff --git a/tensorflow/compiler/jit/producer_consumer_queue_test.cc b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
deleted file mode 100644
index f61260c6e52756ee039829afdc7452f5f760c221..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/producer_consumer_queue_test.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/jit/producer_consumer_queue.h"
-
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace {
-
-typedef ProducerConsumerQueue<int> IntQueue;
-
-// Insert integers between low inclusive and high exclusive into q.
-void PushRange(IntQueue *q, int low, int high) {
-  while (low != high) {
-    q->Put(low);
-    VLOG(2) << "Pushing " << low;
-    ++low;
-  }
-}
-
-// Push the numbers between 0 and 999 inclusive from several threads in the
-// pool.
-void PushRanges(IntQueue *queue, thread::ThreadPool *pool) {
-  VLOG(1) << "Adding 20-36";
-  pool->Schedule([queue] { PushRange(queue, 20, 36); });
-  VLOG(1) << "Adding 7-20";
-  pool->Schedule([queue] { PushRange(queue, 7, 20); });
-  VLOG(1) << "Adding 36-501";
-  pool->Schedule([queue] { PushRange(queue, 36, 501); });
-  VLOG(1) << "Adding 501-1000";
-  pool->Schedule([queue] { PushRange(queue, 501, 1000); });
-  VLOG(1) << "Adding 0-5";
-  pool->Schedule([queue] { PushRange(queue, 0, 5); });
-  VLOG(1) << "Adding 5-7";
-  pool->Schedule([queue] { PushRange(queue, 5, 7); });
-}
-
-// Pop elements from queue using Get().  Make sure that exactly <high> elements
-// were present and their values are all integers between 0 and high-1
-// inclusive.
-void GetRange(IntQueue *queue, int high) {
-  VLOG(1) << "Testing Wait";
-  std::vector<int> results;
-  for (int i = 0; i != high; ++i) {
-    int r = queue->Get();
-    VLOG(2) << "Waited and got " << r;
-    results.push_back(r);
-  }
-  CHECK_EQ(queue->count(), 0);
-  std::sort(results.begin(), results.end());
-  for (int i = 0; i != high; ++i) {
-    CHECK(results[i] == i);
-  }
-}
-
-// Pop elements from queue using TryGet().  Make sure that exactly <high>
-// elements were present and their values are all integers between 0 and high-1
-// inclusive.
-void TryGetRange(IntQueue *queue, int high) {
-  std::vector<int> results;
-  // Give up if we don't get all the elements back from the queue
-  // in 10 seconds.
-  int timeout = 10;
-  int r;
-  for (int i = 0; i != high; ++i) {
-    while (!queue->TryGet(&r)) {
-      if (!timeout--) {
-        LOG(FATAL) << "Can't find all elements in the queue";
-      }
-      VLOG(1) << "Sleeping for a second...";
-      sleep(1);
-    }
-    VLOG(2) << "Popped " << r;
-    results.push_back(r);
-  }
-  CHECK_EQ(queue->count(), 0);
-  CHECK(!queue->TryGet(&r));
-  std::sort(results.begin(), results.end());
-  for (int i = 0; i != high; ++i) {
-    CHECK_EQ(i, results[i]);
-  }
-}
-
-const int kNumThreads = 15;
-
-TEST(ProducerConsumerQueue, GetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    PushRanges(&queue, &pool);
-  }
-  GetRange(&queue, 1000);
-}
-
-TEST(ProducerConsumerQueue, TryGetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    PushRanges(&queue, &pool);
-  }
-  TryGetRange(&queue, 1000);
-}
-
-TEST(ProducerConsumerQueue, ParallelGetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    pool.Schedule([&queue] { GetRange(&queue, 1000); });
-    PushRanges(&queue, &pool);
-  }
-}
-
-TEST(ProducerConsumerQueue, ParallelTryGetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    pool.Schedule([&queue] { TryGetRange(&queue, 1000); });
-    PushRanges(&queue, &pool);
-  }
-}
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
index e039d46ec863920eb7deb5bc20525fdab866415c..c0897217bcbd895003ce3018835da93a779a51a2 100644
--- a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
+++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc
@@ -39,8 +39,7 @@ limitations under the License.
 //     resource variables).
 //
 // The result is incorrect around loops because we ignore edges from
-// NextIteration to Merge, but that should be fine because we don't cluster
-// these edges.  For instance, in:
+// NextIteration to Merge.  For instance, in:
 //
 // Init -----> Merge <-------+
 //               |           |
@@ -55,21 +54,20 @@ limitations under the License.
 //
 // we won't put (Read, Write) in the returned set.  This is fine if
 // auto-clustering can only cluster the Read->Write edge, but it is a problem if
-// it clusters the Write->NextIteration->Merge->Read edges instead.  The same
-// problem is present for the functional version of the loop above.  We rely on
-// auto-clustering to not cluster control flow edges like NextIteration->Merge.
-// This is enough to avoid the explicit-control-flow problem shown above.  One
-// way to think about this is that we only care about cases where two nodes, A
-// and B, would normally have been put in the same cluster but cannot legally be
-// in the same cluster because of resourcevar-dependencies.  If A and B would
+// it clusters the Write->NextIteration->Merge->Read edges instead.  So we rely
+// on auto-clustering to not cluster NextIteration->Merge edges.  The same
+// problem is present for the functional version of the loop above and we also
+// rely on auto-clustering not clustering functional while loops containing
+// resource operations.
+//
+// One way to think about this is that we only care about cases where two nodes,
+// A and B, would normally have been put in the same cluster but cannot legally
+// be in the same cluster because of resourcevar-dependencies.  If A and B would
 // normally have been put in the same cluster then all paths between A and B
 // would have to be clusterable (otherwise we'd have introduced a cycle).  Ergo
 // there could not have been a NextIteration->Merge edge between A and B since
 // we don't cluster these edges.
 //
-// We also rely on auto-clustering to not cluster functional control flow nodes
-// that contain resource operations.
-//
 // IMPLEMENTATION
 // --------------
 //
@@ -152,13 +150,12 @@ Status XlaResourceOpKindForNode(
 // can be represented by an XLA cluster and needs no special handling around
 // auto-jit.
 bool IsEdgeSafe(XlaResourceOpKind from, XlaResourceOpKind to) {
-  // XLA clusters forces all reads to happen before all writes, which means the
-  // kinds of edges it can faithfully represent are: Read->Write, Read->Modify,
-  // Modify->Write, Read->Read, Write->Write.
-  //
-  // TODO(b/112856632): We can, in theory, support Read->Read and Write->Write
-  // dependencies.
-  return from == XlaResourceOpKind::kRead && to == XlaResourceOpKind::kWrite;
+  // XLA clusters force all reads to happen before all writes.  Moreover the set
+  // of reads are executed as one atomic operation, and the set of writes are as
+  // another atomic operation.  This means we can faithfully represent the
+  // following edges: Read->*, *->Write.
+
+  return from == XlaResourceOpKind::kRead || to == XlaResourceOpKind::kWrite;
 }
 
 using ResourceOp = std::pair<int, XlaResourceOpKind>;
diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc
index e54b547abcfea698fe79e81dce547ea7858ff829..67304412fd384edde931fa2c5efb05f49e10411f 100644
--- a/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc
+++ b/tensorflow/compiler/jit/resource_operation_safety_analysis_test.cc
@@ -130,9 +130,7 @@ TEST(ResourceOperationSafetyAnalysisTest, ReadModify) {
   std::vector<std::pair<int, int>> incompatible_pairs;
   TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
 
-  EXPECT_EQ(incompatible_pairs.size(), 1);
-  std::pair<int, int> read_modify_pair = {read->id(), modify->id()};
-  EXPECT_EQ(incompatible_pairs[0], read_modify_pair);
+  EXPECT_EQ(incompatible_pairs.size(), 0);
 }
 
 TEST(ResourceOperationSafetyAnalysisTest, ModifyRead) {
@@ -162,9 +160,7 @@ TEST(ResourceOperationSafetyAnalysisTest, ModifyWrite) {
   std::vector<std::pair<int, int>> incompatible_pairs;
   TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
 
-  EXPECT_EQ(incompatible_pairs.size(), 1);
-  std::pair<int, int> modify_write_pair = {modify->id(), write->id()};
-  EXPECT_EQ(incompatible_pairs[0], modify_write_pair);
+  EXPECT_EQ(incompatible_pairs.size(), 0);
 }
 
 TEST(ResourceOperationSafetyAnalysisTest, WriteModify) {
@@ -196,11 +192,7 @@ TEST(ResourceOperationSafetyAnalysisTest, ReadModifyWrite) {
   std::vector<std::pair<int, int>> incompatible_pairs;
   TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
 
-  EXPECT_EQ(incompatible_pairs.size(), 2);
-  std::pair<int, int> modify_write_pair = {modify->id(), write->id()};
-  std::pair<int, int> read_modify_pair = {read->id(), modify->id()};
-  EXPECT_EQ(incompatible_pairs[0], read_modify_pair);
-  EXPECT_EQ(incompatible_pairs[1], modify_write_pair);
+  EXPECT_EQ(incompatible_pairs.size(), 0);
 }
 
 TEST(ResourceOperationSafetyAnalysisTest, WriteModifyRead) {
@@ -239,14 +231,12 @@ TEST(ResourceOperationSafetyAnalysisTest, WriteReadModify) {
   std::vector<std::pair<int, int>> incompatible_pairs;
   TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
 
-  ASSERT_EQ(incompatible_pairs.size(), 3);
+  ASSERT_EQ(incompatible_pairs.size(), 2);
 
   std::pair<int, int> write_modify_pair = {write->id(), modify->id()};
   std::pair<int, int> write_read_pair = {write->id(), read->id()};
-  std::pair<int, int> read_modify_pair = {read->id(), modify->id()};
-  EXPECT_EQ(incompatible_pairs[0], read_modify_pair);
-  EXPECT_EQ(incompatible_pairs[1], write_read_pair);
-  EXPECT_EQ(incompatible_pairs[2], write_modify_pair);
+  EXPECT_EQ(incompatible_pairs[0], write_read_pair);
+  EXPECT_EQ(incompatible_pairs[1], write_modify_pair);
 }
 
 FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) {
@@ -307,9 +297,7 @@ TEST(ResourceOperationSafetyAnalysisTest, ReadCall) {
   std::vector<std::pair<int, int>> incompatible_pairs;
   TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
 
-  ASSERT_EQ(incompatible_pairs.size(), 1);
-  std::pair<int, int> read_call_edge = {read->id(), call->id()};
-  EXPECT_EQ(incompatible_pairs[0], read_call_edge);
+  EXPECT_EQ(incompatible_pairs.size(), 0);
 }
 
 TEST(ResourceOperationSafetyAnalysisTest, CallWrite) {
@@ -329,9 +317,7 @@ TEST(ResourceOperationSafetyAnalysisTest, CallWrite) {
   std::vector<std::pair<int, int>> incompatible_pairs;
   TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
 
-  ASSERT_EQ(incompatible_pairs.size(), 1);
-  std::pair<int, int> call_write_edge = {call->id(), write->id()};
-  EXPECT_EQ(incompatible_pairs[0], call_write_edge);
+  EXPECT_EQ(incompatible_pairs.size(), 0);
 }
 
 TEST(ResourceOperationSafetyAnalysisTest, WriteCall) {
@@ -429,18 +415,14 @@ TEST(ResourceOperationSafetyAnalysisTest, ChainOfOps) {
   std::vector<std::pair<int, int>> incompatible_pairs;
   TF_ASSERT_OK(ComputeIncompatiblePairs(root.graph(), &incompatible_pairs));
 
-  ASSERT_EQ(incompatible_pairs.size(), 5);
+  ASSERT_EQ(incompatible_pairs.size(), 3);
   std::pair<int, int> write_0_read_0_pair = {write_0->id(), read_0->id()};
   std::pair<int, int> write_0_read_1_pair = {write_0->id(), read_1->id()};
   std::pair<int, int> write_1_read_1_pair = {write_1->id(), read_1->id()};
-  std::pair<int, int> write_0_write_1_pair = {write_0->id(), write_1->id()};
-  std::pair<int, int> read_0_read_1_pair = {read_0->id(), read_1->id()};
 
   EXPECT_EQ(incompatible_pairs[0], write_0_read_0_pair);
-  EXPECT_EQ(incompatible_pairs[1], write_0_write_1_pair);
-  EXPECT_EQ(incompatible_pairs[2], write_0_read_1_pair);
-  EXPECT_EQ(incompatible_pairs[3], read_0_read_1_pair);
-  EXPECT_EQ(incompatible_pairs[4], write_1_read_1_pair);
+  EXPECT_EQ(incompatible_pairs[1], write_0_read_1_pair);
+  EXPECT_EQ(incompatible_pairs[2], write_1_read_1_pair);
 }
 
 TEST(ResourceOperationSafetyAnalysisTest, DagOfOps) {
diff --git a/tensorflow/compiler/jit/shape_inference.cc b/tensorflow/compiler/jit/shape_inference.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80c691fe490c1092315708a2da754d367d585300
--- /dev/null
+++ b/tensorflow/compiler/jit/shape_inference.cc
@@ -0,0 +1,174 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/shape_inference.h"
+
+#include "tensorflow/compiler/jit/shape_inference_helpers.h"
+#include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/graph/algorithm.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Converts a shape inference handle to a PartialTensorShape.
+Status ShapeHandleToTensorShape(shape_inference::InferenceContext* context,
+                                const shape_inference::ShapeHandle& handle,
+                                PartialTensorShape* shape) {
+  // The default is already unknown
+  if (!context->RankKnown(handle)) return Status::OK();
+
+  std::vector<int64> dims(context->Rank(handle));
+  for (int32 i = 0; i < dims.size(); ++i) {
+    dims[i] = context->Value(context->Dim(handle, i));
+  }
+  return PartialTensorShape::MakePartialShape(dims.data(), dims.size(), shape);
+}
+
+Status PropagateShapes(const Graph& graph,
+                       const std::map<int, InferredShape>& arg_shapes,
+                       ShapeRefiner* shape_refiner) {
+  // Visits the nodes in topological order (reverse post-order), inferring
+  // shapes.
+  // TODO(phawkins): handle cyclic graphs.
+  std::vector<Node*> order;
+  GetReversePostOrder(graph, &order);
+
+  for (Node* n : order) {
+    // Ignore the status returned by the shape_refiner. We want the best effort
+    // shapes, even if no shape function is registered for a node.
+    Status status = shape_refiner->AddNode(n);
+    if (!status.ok()) {
+      VLOG(1) << "Shape inference failed for node: " << status;
+    }
+
+    if (n->type_string() == "_Arg") {
+      int index;
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+      auto it = arg_shapes.find(index);
+      if (it != arg_shapes.end()) {
+        const InferredShape& arg_shape = it->second;
+        shape_inference::InferenceContext* context =
+            shape_refiner->GetContext(n);
+
+        if (arg_shape.handle_type != DT_INVALID) {
+          shape_inference::ShapeHandle handle;
+          TF_RETURN_IF_ERROR(context->MakeShapeFromPartialTensorShape(
+              arg_shape.handle_shape, &handle));
+
+          // Sets the shape and type of the variable's value.
+          context->set_output_handle_shapes_and_types(
+              0, std::vector<shape_inference::ShapeAndType>{
+                     {handle, arg_shape.handle_type}});
+        }
+
+        shape_inference::ShapeHandle handle;
+        TF_RETURN_IF_ERROR(
+            context->MakeShapeFromPartialTensorShape(arg_shape.shape, &handle));
+        TF_RETURN_IF_ERROR(shape_refiner->SetShape(n, 0, handle));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Store the shapes of the output tensors in a map
+Status StoreOutputShapes(const Graph& graph, const ShapeRefiner& shape_refiner,
+                         GraphShapeInfo* shape_info) {
+  for (const Node* node : graph.nodes()) {
+    shape_inference::InferenceContext* context = shape_refiner.GetContext(node);
+    if (!context) continue;
+
+    auto& outputs = (*shape_info)[node->name()];
+    outputs.resize(context->num_outputs());
+    for (int i = 0; i < context->num_outputs(); ++i) {
+      auto& output = outputs[i];
+      TF_RETURN_IF_ERROR(
+          ShapeHandleToTensorShape(context, context->output(i), &output.shape));
+
+      const auto* handle_shapes_and_types =
+          context->output_handle_shapes_and_types(i);
+      if (handle_shapes_and_types != nullptr) {
+        if (handle_shapes_and_types->size() == 1) {
+          TF_RETURN_IF_ERROR(ShapeHandleToTensorShape(
+              context, (*handle_shapes_and_types)[0].shape,
+              &output.handle_shape));
+          output.handle_type = (*handle_shapes_and_types)[0].dtype;
+        } else {
+          // otherwise, it may be resource like a Queue, which can have
+          // multiple shapes and types represented by a single handle.
+        }
+      }
+      VLOG(4) << node->name() << " output " << i << " shape"
+              << output.shape.DebugString() << " handle_type "
+              << DataTypeString(output.handle_type) << " handle_shape "
+              << output.handle_shape.DebugString();
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status InferShapes(Graph* graph, const std::map<int, InferredShape>& arg_shapes,
+                   const tensorflow::FunctionLibraryDefinition* fnlib_def,
+                   GraphShapeInfo* shape_info) {
+  ShapeRefiner shape_refiner(graph->versions(), graph->op_registry());
+  shape_refiner.set_require_shape_inference_fns(false);
+  // TODO(dlibenzi): Verify if it is worth trying to infer shaped within
+  // functions. Some functions can be called at multiple locations with
+  // difference shapes, which will trigger a shape inference based on the
+  // arguments passed at the first call.
+  // shape_refiner.set_function_library_for_shape_inference(fnlib_def);
+
+  // ShapeRefiner requires that all inputs of a node are present when
+  // ShapeRefiner::AddNode is called. To get at least some shape information in
+  // loops, we temporarily remove loop backedges and add them back again after
+  // the shape inference is complete.
+  BackEdgeHelper back_edge;
+  TF_RETURN_IF_ERROR(back_edge.Remove(graph));
+  TF_RETURN_IF_ERROR(PropagateShapes(*graph, arg_shapes, &shape_refiner));
+  TF_RETURN_IF_ERROR(back_edge.Replace());
+
+  // Currently information does not flow "backward" from consumers to producers
+  // in the shape inference, but we consume the shapes in a second pass in case
+  // backward information flow is added in the future.
+  return StoreOutputShapes(*graph, shape_refiner, shape_info);
+}
+
+xla::StatusOr<InferredShape> MergeInferredShapes(const InferredShape& a,
+                                                 const InferredShape& b) {
+  InferredShape result;
+  TF_RETURN_IF_ERROR(a.shape.MergeWith(b.shape, &result.shape));
+
+  if (a.handle_type == DT_INVALID) {
+    result.handle_type = b.handle_type;
+  } else if (b.handle_type == DT_INVALID) {
+    result.handle_type = a.handle_type;
+  } else if (a.handle_type == b.handle_type) {
+    result.handle_type = a.handle_type;
+  } else {
+    return errors::InvalidArgument(
+        "Mismatched resource types: ", DataTypeString(a.handle_type), " vs. ",
+        DataTypeString(b.handle_type));
+  }
+  TF_RETURN_IF_ERROR(
+      a.handle_shape.MergeWith(b.handle_shape, &result.handle_shape));
+  return result;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/shape_inference.h b/tensorflow/compiler/jit/shape_inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..8668dbca55c2cf84729d81086bde45757e54f8ab
--- /dev/null
+++ b/tensorflow/compiler/jit/shape_inference.h
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_H_
+#define TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_H_
+
+#include <map>
+#include <vector>
+
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+struct InferredShape {
+  // Shape of the argument tensor.
+  PartialTensorShape shape;
+
+  // If the argument is a resource variable, the type and shape of the
+  // variable's value.
+  DataType handle_type = DT_INVALID;
+  PartialTensorShape handle_shape;
+};
+typedef std::unordered_map<string, std::vector<InferredShape>> GraphShapeInfo;
+
+// Infer shapes for all Tensors in a graph, and save them in a map.  The vector
+// for a Node contains the information about each of its outputs.
+// TODO(phawkins): this code does not infer accurate shapes for cyclic graphs.
+Status InferShapes(Graph* graph, const std::map<int, InferredShape>& arg_shapes,
+                   const tensorflow::FunctionLibraryDefinition* fnlib_def,
+                   GraphShapeInfo* shape_info);
+
+// Merges two InferredShapes. Return an error if the two shapes cannot be
+// merged.
+xla::StatusOr<InferredShape> MergeInferredShapes(const InferredShape& a,
+                                                 const InferredShape& b);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_SHAPE_INFERENCE_H_
diff --git a/tensorflow/compiler/jit/shape_inference_test.cc b/tensorflow/compiler/jit/shape_inference_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9268172b1c4a4a717b608a52041219d54383a3ff
--- /dev/null
+++ b/tensorflow/compiler/jit/shape_inference_test.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Tests for ShapeInference.
+
+#include "tensorflow/compiler/jit/shape_inference.h"
+
+#include <map>
+#include <vector>
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/test_util.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(ShapeInferenceTest, Basics) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  auto a = ops::Placeholder(root.WithOpName("A"), DT_FLOAT,
+                            ops::Placeholder::Shape({2, 3}));
+  auto b = ops::Placeholder(root.WithOpName("B"), DT_FLOAT,
+                            ops::Placeholder::Shape({3}));
+  auto c = ops::Placeholder(root.WithOpName("C"), DT_FLOAT);
+  auto d = ops::Add(root.WithOpName("D"), a, b);
+  auto e = ops::Add(root.WithOpName("E"), d, c);
+  auto f = ops::Neg(root.WithOpName("F"), e);
+  auto g = ops::AddN(root.WithOpName("G"), std::initializer_list<Output>{e, f});
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_CHECK_OK(root.ToGraph(graph.get()));
+
+  GraphShapeInfo shape_info;
+  TF_ASSERT_OK(InferShapes(graph.get(), /*arg_shapes=*/{},
+                           /*fnlib_def=*/nullptr, &shape_info));
+
+  std::map<string, std::vector<PartialTensorShape>> expected = {
+      {"A", {PartialTensorShape({2, 3})}}, {"B", {PartialTensorShape({3})}},
+      {"C", {PartialTensorShape()}},       {"D", {PartialTensorShape({2, 3})}},
+      {"E", {PartialTensorShape()}},       {"F", {PartialTensorShape()}},
+      {"G", {PartialTensorShape()}},
+  };
+  TF_EXPECT_OK(ShapeAnnotationsMatch(*graph, shape_info, expected));
+}
+
+TEST(ShapeInferenceTest, WhileLoop) {
+  // Graph:
+  // x = array_ops.placeholder(dtypes.int32)
+  // y = control_flow_ops.while_loop(lambda i: i < 10, lambda i: i + 1, [x])
+  Graph graph(OpRegistry::Global());
+  {
+    Scope scope = Scope::NewRootScope().ExitOnError();
+
+    auto dummy = ops::Placeholder(scope.WithOpName("Dummy"), DT_INT32,
+                                  ops::Placeholder::Shape({}));
+
+    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32,
+                                   ops::Placeholder::Shape({}));
+    auto enter =
+        ops::internal::Enter(scope.WithOpName("while/Enter"), source, "aloop");
+    // Add an unused Enter node. These should be ignored.
+    auto enter2 =
+        ops::internal::Enter(scope.WithOpName("while/Enter2"), source, "aloop");
+    auto merge = ops::Merge(scope.WithOpName("while/Merge"),
+                            std::initializer_list<Input>{enter, dummy});
+    auto ten = ops::Const<int32>(
+        scope.WithOpName("while/Less/y").WithControlDependencies(merge.output),
+        10);
+    auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten);
+    auto loop_cond = ops::LoopCond(scope.WithOpName("while/LoopCond"), less);
+    auto switch_node =
+        ops::Switch(scope.WithOpName("while/Switch"), merge.output, loop_cond);
+    auto exit = ops::internal::Exit(scope.WithOpName("while/Exit"),
+                                    switch_node.output_false);
+    auto identity = ops::Identity(scope.WithOpName("while/Identity"),
+                                  switch_node.output_true);
+    auto identity_shape =
+        ops::Const<int32>(scope.WithOpName("while/Identity/shape"), {});
+    auto identity_reshaped = ops::Reshape(
+        scope.WithOpName("while/Identity/reshaped"), identity, identity_shape);
+
+    auto one = ops::Const<int32>(
+        scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
+    auto add = ops::Add(scope.WithOpName("while/add"), identity_reshaped, one);
+    auto next_iteration =
+        ops::NextIteration(scope.WithOpName("while/NextIteration"), add);
+
+    auto sink = ops::Identity(scope.WithOpName("sink"), exit);
+
+    // Remove the dummy node and add the loop backedge.
+    scope.graph()->RemoveNode(dummy.node());
+    scope.graph()->AddEdge(next_iteration.node(), 0, merge.output.node(), 1);
+
+    TF_EXPECT_OK(scope.ToGraph(&graph));
+  }
+
+  GraphShapeInfo shape_info;
+  TF_ASSERT_OK(InferShapes(&graph, /*arg_shapes=*/{}, /*fnlib_def=*/nullptr,
+                           &shape_info));
+  std::map<string, std::vector<PartialTensorShape>> expected = {
+      {"while/Identity", {PartialTensorShape()}},
+      {"while/add", {PartialTensorShape({})}},
+  };
+  TF_EXPECT_OK(ShapeAnnotationsMatch(graph, shape_info, expected));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/test_util.cc b/tensorflow/compiler/jit/test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cada272090a1f613baea8f6d111866d8bb9cd55b
--- /dev/null
+++ b/tensorflow/compiler/jit/test_util.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/test_util.h"
+
+#include "tensorflow/compiler/jit/shape_inference.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace tensorflow {
+
+Status ShapeAnnotationsMatch(
+    const Graph& graph, const GraphShapeInfo& shape_info,
+    std::map<string, std::vector<PartialTensorShape>> expected_shapes) {
+  for (Node* node : graph.op_nodes()) {
+    auto sit = shape_info.find(node->name());
+    TF_RET_CHECK(sit != shape_info.end())
+        << "Missing shape information for node " << node->name();
+    std::vector<PartialTensorShape> shapes;
+    for (const auto& output : sit->second) shapes.push_back(output.shape);
+
+    auto it = expected_shapes.find(node->name());
+    if (it != expected_shapes.end()) {
+      if (!PartialTensorShapeUtils::AreIdentical(shapes, it->second)) {
+        return errors::InvalidArgument(
+            "Shape mismatch for ", node->name(), ". Expected: ",
+            PartialTensorShapeUtils::PartialShapeListString(it->second),
+            ", actual: ",
+            PartialTensorShapeUtils::PartialShapeListString(shapes));
+      }
+      expected_shapes.erase(it);
+    }
+  }
+  if (!expected_shapes.empty()) {
+    std::vector<string> missing;
+    missing.reserve(expected_shapes.size());
+    for (const auto& entry : expected_shapes) {
+      missing.push_back(entry.first);
+    }
+    return errors::InvalidArgument("Missing shapes for nodes: ",
+                                   str_util::Join(missing, ","));
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/test_util.h b/tensorflow/compiler/jit/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c9fee8f2446d41f792a6cfbf8fc808d9d679c09
--- /dev/null
+++ b/tensorflow/compiler/jit/test_util.h
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helper functions for tests.
+
+#ifndef TENSORFLOW_COMPILER_JIT_TEST_UTIL_H_
+#define TENSORFLOW_COMPILER_JIT_TEST_UTIL_H_
+
+#include <map>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/compiler/jit/shape_inference.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Tests that the shapes in 'shape_info' for the nodes in `graph` match
+// `expected_shapes`. Returns an error if there are nodes in `expected_shapes`
+// that do not have shape information. Ignores nodes in `graph` that do not have
+// `expected_shapes` entries.
+Status ShapeAnnotationsMatch(
+    const Graph& graph, const GraphShapeInfo& shape_info,
+    std::map<string, std::vector<PartialTensorShape>> expected_shapes);
+
+}  // namespace tensorflow
+
+
+#endif  // TENSORFLOW_COMPILER_JIT_TEST_UTIL_H_
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index f85121ca27ad3da918315f93b28e9000dfd65e67..fef28fc810cb4e544fe3f271f0b96cebd8a96779 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -28,6 +28,8 @@ namespace tensorflow {
 
 const char* const kXlaClusterAttr = "_XlaCluster";
 const char* const kXlaOutsideCompilationAttr = "_XlaOutsideCompilation";
+const char* const kXlaCompileTimeConstantInputsAttr =
+    "_XlaCompileTimeConstantInputs";
 
 namespace {
 // Returns a string describing how an edge from src to dst would
diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h
index ba218f3315d2607c47342fdade0403678faa2362..fa6eaab3900b37baf7271c8c431c8384ceeda59f 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.h
+++ b/tensorflow/compiler/jit/xla_cluster_util.h
@@ -32,6 +32,15 @@ extern const char* const kXlaClusterAttr;
 // compilation by the encapsulate subgraphs pass.
 extern const char* const kXlaOutsideCompilationAttr;
 
+// The attribute that marks certain inputs to a Node as required to be a
+// constant at compile time.  If this attribute is present then the
+// CompileTimeConstantInput information in the corresponding XlaOpKernel is
+// ignored.
+//
+// The value for this attribute, if present, has to be a list of strings naming
+// the inputs to the node that must be constant.
+extern const char* const kXlaCompileTimeConstantInputsAttr;
+
 using OrderedNodeSet = std::set<Node*, NodeComparatorID>;
 
 // Returns the DeviceType corresponding to 'device'.
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 0471995015bb080016b523305c90a3e42163a039..3df5479a55e841380ca7b8cdd0add9fd17487091 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <numeric>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
@@ -40,6 +41,7 @@ namespace tensorflow {
 XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client,
                                          DeviceType device_type)
     : client_(client), device_type_(std::move(device_type)) {}
+
 XlaCompilationCache::~XlaCompilationCache() {
   // Ensure any use of our programs have completed by waiting for all stream
   // executors to complete.
@@ -64,14 +66,14 @@ string XlaCompilationCache::DebugString() {
 
 // Compute a string signature which encodes the shapes of the
 // arguments in the supplied list.
-string XlaCompilationCache::SignatureDebugString(const Signature& sig) {
-  string result = sig.name;
-  for (const auto& a : sig.arg_types) {
+string XlaCompilationCache::Signature::HumanString() const {
+  string result = name;
+  for (const auto& a : arg_types) {
     absl::StrAppend(&result, ",", DataTypeString(a.first),
                     a.second.DebugString());
   }
 
-  for (const auto& v : sig.arg_values) {
+  for (const auto& v : arg_values) {
     absl::StrAppend(&result, "; ", v.DebugString());
   }
   return result;
@@ -83,7 +85,9 @@ bool XlaCompilationCache::Signature::operator==(const Signature& other) const {
 
   if (arg_values.size() != other.arg_values.size()) return false;
   for (int i = 0; i < arg_values.size(); ++i) {
-    if (arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) {
+    if (arg_values[i].dtype() != other.arg_values[i].dtype() ||
+        arg_values[i].shape() != other.arg_values[i].shape() ||
+        arg_values[i].tensor_data() != other.arg_values[i].tensor_data()) {
       return false;
     }
   }
@@ -107,96 +111,30 @@ uint64 XlaCompilationCache::Signature::Hash::operator()(
   return h;
 }
 
-Status XlaCompilationCache::BuildSignature(
-    const NameAttrList& function, const std::map<int, Tensor>& constant_args,
-    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-    Signature* signature) {
-  signature->name = Canonicalize(function.name(), AttrSlice(&function.attr()));
-  signature->arg_values.reserve(constant_args.size());
-
-  signature->arg_types.reserve(ctx->num_inputs() - constant_args.size());
-
-  for (int i = 0; i < ctx->num_inputs(); ++i) {
-    if (constant_args.count(i) > 0) {
-      // Use the values of compile time constants in the signature.
-      signature->arg_values.push_back(constant_args.at(i));
-    } else if (variable_args.count(i) > 0) {
-      const OptionalTensor& variable = variable_args.at(i);
-      if (variable.present) {
-        signature->arg_types.emplace_back(variable.value.dtype(),
-                                          variable.value.shape());
-      } else {
-        signature->arg_types.emplace_back(DT_INVALID, TensorShape());
-      }
-    } else {
-      signature->arg_types.emplace_back(ctx->input_dtype(i),
-                                        ctx->input(i).shape());
-    }
-  }
-  return Status::OK();
-}
-
-namespace {
-
-// Builds a XlaCompiler::Argument vector from the arguments to the XlaLaunch op.
-Status BuildArguments(const std::map<int, Tensor>& constant_args,
-                      const std::map<int, OptionalTensor>& variable_args,
-                      OpKernelContext* ctx,
-                      std::vector<XlaCompiler::Argument>* args) {
-  args->resize(ctx->num_inputs());
-
-  for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) {
-    XlaCompiler::Argument& arg = (*args)[input_num];
-    if (constant_args.count(input_num) > 0) {
-      // Handles compile-time constants.
-      const Tensor& input = constant_args.at(input_num);
-      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
-      arg.kind = XlaCompiler::Argument::kConstant;
-      arg.type = input.dtype();
-      arg.shape = input.shape();
-      arg.constant_value = input;
-    } else if (variable_args.count(input_num) == 0) {
-      // Handles the non-constant arguments.
-      const Tensor& input = ctx->input(input_num);
-      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
-      if (input.NumElements() > 0) {
-        arg.kind = XlaCompiler::Argument::kParameter;
-      } else {
-        arg.kind = XlaCompiler::Argument::kConstant;
-        arg.constant_value = input;
-      }
-      arg.type = input.dtype();
-      arg.shape = input.shape();
-    } else {
-      // Handles resource variables.
-      const Tensor& input = ctx->input(input_num);
-      TF_RET_CHECK(input.dtype() == DT_RESOURCE);
-      const OptionalTensor& variable = variable_args.at(input_num);
-      arg.name = variable.name;
-      arg.kind = XlaCompiler::Argument::kResource;
-      arg.resource_kind = XlaResource::kVariable;
-      if (variable.present) {
-        const Tensor& value = variable.value;
-        arg.type = value.dtype();
-        arg.shape = value.shape();
-        arg.initialized = true;
-      } else {
-        // The values of uninitialized variables are not passed as inputs, since
-        // they are meaningless. However, it is legal to assign to a resource
-        // variable for the first time inside the XLA computation, so we do
-        // permit uninitialized variables.
-        arg.initialized = false;
-        arg.type = DT_INVALID;
-        arg.shape = TensorShape();
-      }
+xla::StatusOr<XlaCompilationCache::Signature>
+XlaCompilationCache::BuildSignature(
+    const NameAttrList& function,
+    absl::Span<const XlaCompiler::Argument> args) {
+  Signature signature;
+  signature.name = Canonicalize(function.name(), AttrSlice(&function.attr()));
+  for (const XlaCompiler::Argument& arg : args) {
+    switch (arg.kind) {
+      case XlaCompiler::Argument::kConstant:
+        signature.arg_values.push_back(arg.constant_value);
+        break;
+      case XlaCompiler::Argument::kParameter:
+      case XlaCompiler::Argument::kResource:
+        signature.arg_types.emplace_back(arg.type, arg.shape);
+        break;
+      default:
+        return errors::InvalidArgument(
+            "Unhandled argument kind in XlaCompilationCache: ",
+            arg.HumanString());
     }
   }
-
-  return Status::OK();
+  return std::move(signature);
 }
 
-}  // namespace
-
 Status XlaCompilationCache::BuildExecutable(
     const XlaCompiler::Options& options,
     const XlaCompiler::CompilationResult& result,
@@ -226,20 +164,38 @@ Status XlaCompilationCache::BuildExecutable(
 
 Status XlaCompilationCache::Compile(
     const XlaCompiler::Options& options, const NameAttrList& function,
-    const std::map<int, Tensor>& constant_args,
-    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    absl::Span<const XlaCompiler::Argument> args,
     const XlaCompiler::CompileOptions& compile_options,
+    CompileMode compile_mode,
     const XlaCompiler::CompilationResult** out_compilation_result,
     xla::LocalExecutable** out_executable) {
-  return CompileImpl(options, function, constant_args, variable_args, ctx,
-                     compile_options, /*compile_single_op=*/false,
+  absl::optional<int64> compile_threshold;
+  if (compile_mode == CompileMode::kLazy) {
+    compile_threshold = kDefaultCompilationThreshold;
+  }
+  auto compile_fn = [&](XlaCompiler* compiler,
+                        XlaCompiler::CompilationResult* result) {
+    return compiler->CompileFunction(compile_options, function, args, result);
+  };
+  return CompileImpl(options, function, args, compile_fn,
+                     /*compile_threshold=*/compile_threshold,
                      out_compilation_result, out_executable);
 }
 
+static bool IsMegamorphic(int64 compile_count, int64 execution_count) {
+  const int64 kCompileThreshold = 10;
+  const int64 kMinExecutionsPerCompile = 50;
+
+  // This heuristic is trying to capture the following property: have we sunk a
+  // certain minimum amount of compile time into the cluster that didn't quite
+  // "pay off"?
+  return compile_count > kCompileThreshold &&
+         execution_count < kMinExecutionsPerCompile * compile_count;
+}
+
 Status XlaCompilationCache::CompileSingleOp(
     const XlaCompiler::Options& options,
-    const std::map<int, Tensor>& constant_args,
-    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    absl::Span<const XlaCompiler::Argument> args, OpKernelContext* ctx,
     const XlaCompiler::CompileOptions& compile_options,
     const XlaCompiler::CompilationResult** out_compilation_result,
     xla::LocalExecutable** out_executable) {
@@ -247,52 +203,41 @@ Status XlaCompilationCache::CompileSingleOp(
   NameAttrList name;
   name.set_name(def.op());
   *name.mutable_attr() = def.attr();
-  return CompileImpl(
-      options, name, constant_args, variable_args, ctx, compile_options,
-      /*compile_single_op=*/true, out_compilation_result, out_executable);
+  auto compile_op = [&](XlaCompiler* compiler,
+                        XlaCompiler::CompilationResult* result) {
+    std::vector<DataType> result_dtypes(ctx->num_outputs());
+    for (int i = 0; i < result_dtypes.size(); ++i) {
+      result_dtypes[i] = ctx->expected_output_dtype(i);
+    }
+    return compiler->CompileSingleOp(compile_options, ctx->op_kernel().def(),
+                                     args, result_dtypes, result);
+  };
+  return CompileImpl(options, name, args, compile_op,
+                     /*compile_threshold=*/absl::nullopt,
+                     out_compilation_result, out_executable);
 }
 
 Status XlaCompilationCache::CompileImpl(
     const XlaCompiler::Options& options, const NameAttrList& function,
-    const std::map<int, Tensor>& constant_args,
-    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-    const XlaCompiler::CompileOptions& compile_options, bool compile_single_op,
+    absl::Span<const XlaCompiler::Argument> args,
+    const std::function<Status(XlaCompiler* compiler,
+                               XlaCompiler::CompilationResult*)>& compile_fn,
+    absl::optional<int64> compile_threshold,
     const XlaCompiler::CompilationResult** out_compilation_result,
     xla::LocalExecutable** out_executable) {
   DCHECK_NE(out_executable, nullptr);
   VLOG(2) << "XlaCompilationCache::Compile " << DebugString();
 
   if (VLOG_IS_ON(2)) {
-    VLOG(2) << "num_inputs=" << ctx->num_inputs()
-            << " num_constant_args=" << constant_args.size()
-            << " num_variable_args=" << variable_args.size();
-    for (int i = 0; i < ctx->num_inputs(); i++) {
-      TensorShape shape = ctx->input(i).shape();
-      VLOG(2) << i << ": dtype=" << DataTypeString(ctx->input_dtype(i))
-              << " present=" << ctx->has_input(i)
-              << " shape=" << shape.DebugString();
-    }
-    for (auto& iterator : variable_args) {
-      const OptionalTensor& variable = iterator.second;
-      VLOG(2) << "variable present=" << variable.present
-              << " type=" << DataTypeString(variable.value.dtype())
-              << " shape=" << variable.value.shape().DebugString()
-              << " TF arg= " << iterator.first;
-    }
-    VLOG(2) << "num_outputs = " << ctx->num_outputs();
-    for (int i = 0; i < ctx->num_outputs(); i++) {
-      VLOG(2) << i << ": dtype=" << ctx->expected_output_dtype(i);
+    VLOG(2) << "num_inputs=" << args.size();
+    for (int i = 0; i < args.size(); i++) {
+      VLOG(2) << i << ": " << args[i].HumanString();
     }
   }
 
-  TF_RET_CHECK(constant_args.size() + variable_args.size() <=
-               ctx->num_inputs());
-
-  Signature signature;
-  TF_RETURN_IF_ERROR(
-      BuildSignature(function, constant_args, variable_args, ctx, &signature));
+  TF_ASSIGN_OR_RETURN(Signature signature, BuildSignature(function, args));
+  VLOG(2) << "Signature: " << signature.HumanString();
 
-  VLOG(2) << "Signature: " << SignatureDebugString(signature);
   // The outer lock protects the existence of the cache entry. It does not
   // protect the contents of the cache entry.
   Entry* entry;
@@ -306,32 +251,87 @@ Status XlaCompilationCache::CompileImpl(
     entry = e.get();
   }
 
+  // We always compile a cluster the very first time it is executed.  This is an
+  // optimistic guess that pays off for statically shaped TensorFlow graphs
+  // (since they get the benefit of XLA right away without waiting for warmup)
+  // and doesn't hurt much for dynamically shaped TensorFlow graphs (we "pay" at
+  // most one cluster-compilation's worth of compile time).
+  bool is_first_execution;
+
+  // We avoid compiling clusters that have "gone megamorphic" i.e. have an
+  // excessive amount of shape dynamism.
+  bool is_megamorphic;
+
+  {
+    mutex_lock lock(cluster_compile_stats_mu_);
+    auto it =
+        cluster_compile_stats_.emplace(function.name(), ClusterCompileStats{})
+            .first;
+    is_first_execution = it->second.execution_count++ == 0;
+
+    // The is_megamorphic bit is "sticky".  We assume clusters that have been
+    // observed to be megamorphic once stay megamorphic forever.
+    it->second.is_megamorphic |=
+        IsMegamorphic(/*compile_count=*/it->second.compile_count,
+                      /*execution_count=*/it->second.execution_count);
+    is_megamorphic = it->second.is_megamorphic;
+  }
+
   // Acquire the cache entry lock and compile, if necessary.
   // TODO(phawkins): this locking will need to be restructured when we implement
   // cache eviction.
   mutex_lock entry_lock(entry->mu);
+  int64 current_request_count = ++entry->request_count;
+  VLOG(2) << "Compilation cache entry hit: " << entry->compiled
+          << " signature: " << signature.HumanString() << " with request count "
+          << current_request_count << " and compile threshold "
+          << compile_threshold.value_or(0);
   if (!entry->compiled) {
-    VLOG(2) << "Compilation cache miss for signature: "
-            << SignatureDebugString(signature);
+    const bool should_compile = [&] {
+      if (!compile_threshold.has_value()) {
+        // Lazy compilation is disabled.
+        return true;
+      }
+
+      if (is_megamorphic) {
+        VLOG(3) << "Not compiling cluster " << function.name()
+                << " because it is megamorphic.";
+        return false;
+      }
+
+      if (is_first_execution) {
+        return true;
+      }
+
+      bool reached_compile_threshold =
+          current_request_count >= *compile_threshold;
+      if (!reached_compile_threshold) {
+        VLOG(3)
+            << "Not compiling cluster " << function.name()
+            << " because it has not reached compile threshold; threshold is "
+            << *compile_threshold << " execution count "
+            << current_request_count << ".";
+      }
+      return reached_compile_threshold;
+    }();
+
+    if (!should_compile) {
+      VLOG(2) << "Not compiling for signature: " << signature.HumanString();
+      *out_compilation_result = nullptr;
+      *out_executable = nullptr;
+      return Status::OK();
+    }
+
     tensorflow::Env* env = tensorflow::Env::Default();
     const uint64 compile_start_us = env->NowMicros();
     // Do the actual JIT compilation without holding the lock (it can take
     // a long time.)
-    std::vector<XlaCompiler::Argument> args;
-    TF_RETURN_IF_ERROR(
-        BuildArguments(constant_args, variable_args, ctx, &args));
 
     XlaCompiler compiler(options);
     entry->compiled = true;
 
-    if (compile_single_op) {
-      entry->compilation_status =
-          compiler.CompileSingleOp(compile_options, signature.name, ctx, args,
-                                   &entry->compilation_result);
-    } else {
-      entry->compilation_status = compiler.CompileFunction(
-          compile_options, function, args, &entry->compilation_result);
-    }
+    entry->compilation_status =
+        compile_fn(&compiler, &entry->compilation_result);
     TF_RETURN_IF_ERROR(entry->compilation_status);
     CHECK_EQ(entry->executable.get(), nullptr);
     entry->compilation_status =
@@ -340,8 +340,8 @@ Status XlaCompilationCache::CompileImpl(
     const uint64 compile_end_us = env->NowMicros();
     const uint64 compile_time_us = compile_end_us - compile_start_us;
     {
-      mutex_lock lock(compile_stats_mu_);
-      auto it = compile_stats_.emplace(function.name(), CompileStats{}).first;
+      mutex_lock lock(cluster_compile_stats_mu_);
+      auto it = cluster_compile_stats_.find(function.name());
       it->second.compile_count++;
       it->second.cumulative_compile_time_us += compile_time_us;
       VLOG(1) << "compiled " << function.name() << " "
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 75c7758f730f9f2f8251c02e7fac1a01f8cc9c2b..846d0c963dbfdf55f51120f2f138d12f5f63839b 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -17,9 +17,12 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_JIT_XLA_COMPILATION_CACHE_H_
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -30,13 +33,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Struct that represents a possibly-absent Tensor.
-struct OptionalTensor {
-  string name;           // A descriptive name
-  bool present = false;  // Is the tensor present?
-  Tensor value;          // If present, what is the Tensor's value?
-};
-
 // The XlaCompilationCache class caches the results of the XlaCompiler class,
 // which converts a Tensorflow graph into a compiled XLA compilation.
 //
@@ -50,14 +46,23 @@ class XlaCompilationCache : public ResourceBase {
   XlaCompilationCache(xla::LocalClient* client, DeviceType device_type);
   ~XlaCompilationCache() override;
 
+  enum class CompileMode {
+    kLazy,
+    kStrict,
+  };
+
   // Compiles a function into a XlaCompiler::CompilationResult that can be used
   // to execute an XLA Computation. Compilation results are cached.
   // `function` is the name of a Tensorflow function to compile.
-  // `constant_args` is a map of tensorflow argument number to its constant
-  //  value.
-  // `variable_args` is a snapshot of the current values of the
-  // resource variable arguments to `function`; uninitialized variables are
-  // represented by an absent OptionalTensor.
+  // `args` is a description of the arguments to the computation.
+  //
+  // `compile_mode` controls the behavior of the compilation cache on a cache
+  // miss.  If `compile_mode` is `kLazy` then, based on some profitability
+  // heuristics, the compilation cache may decide not to compile the cluster at
+  // this time.  In this case it returns null into both `out_compilation_result`
+  // and `out_executable`.  If `compile_mode` is `kStrict` then the compilation
+  // cache always attempts the compilation on a cache miss.
+  //
   // The result of compilation is written to `*compilation_result`, which must
   // be non-null. If `executable` is non-null, also builds an
   // xla::LocalExecutable and sets `executable` to point to it. The resulting
@@ -65,10 +70,9 @@ class XlaCompilationCache : public ResourceBase {
   // outputs.
   Status Compile(const XlaCompiler::Options& options,
                  const NameAttrList& function,
-                 const std::map<int, Tensor>& constant_args,
-                 const std::map<int, OptionalTensor>& variable_args,
-                 OpKernelContext* ctx,
+                 absl::Span<const XlaCompiler::Argument> args,
                  const XlaCompiler::CompileOptions& compile_options,
+                 CompileMode compile_mode,
                  const XlaCompiler::CompilationResult** out_compilation_result,
                  xla::LocalExecutable** out_executable);
 
@@ -76,8 +80,7 @@ class XlaCompilationCache : public ResourceBase {
   // XlaCompiler::CompileFunction.
   Status CompileSingleOp(
       const XlaCompiler::Options& options,
-      const std::map<int, Tensor>& constant_args,
-      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+      absl::Span<const XlaCompiler::Argument> args, OpKernelContext* ctx,
       const XlaCompiler::CompileOptions& compile_options,
       const XlaCompiler::CompilationResult** out_compilation_result,
       xla::LocalExecutable** out_executable);
@@ -87,26 +90,6 @@ class XlaCompilationCache : public ResourceBase {
 
   string DebugString() override;
 
- private:
-  // Common implementation of Compile and CompileSingleOp.
-  Status CompileImpl(
-      const XlaCompiler::Options& options, const NameAttrList& function,
-      const std::map<int, Tensor>& constant_args,
-      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
-      const XlaCompiler::CompileOptions& compile_options,
-      bool compile_single_op,
-      const XlaCompiler::CompilationResult** out_compilation_result,
-      xla::LocalExecutable** out_executable);
-
-  // Takes `result` which has been compiled from a Tensorflow subgraph to a
-  // XLA computation already, and generates an XLA LocalExecutable `executable`.
-  Status BuildExecutable(const XlaCompiler::Options& options,
-                         const XlaCompiler::CompilationResult& result,
-                         std::unique_ptr<xla::LocalExecutable>* executable);
-
-  xla::LocalClient* const client_;
-  const DeviceType device_type_;
-
   // Describes the types, shapes and any compile-time constant arguments
   // to a kernel. Key that uniquely identifies a compilation output.
   struct Signature {
@@ -123,14 +106,35 @@ class XlaCompilationCache : public ResourceBase {
     struct Hash {
       uint64 operator()(const Signature& signature) const;
     };
+
+    // Returns a human-readable description of the signature.
+    string HumanString() const;
   };
-  static string SignatureDebugString(const Signature& sig);
 
   // Builds the signature for a compilation.
-  Status BuildSignature(const NameAttrList& function,
-                        const std::map<int, Tensor>& constant_args,
-                        const std::map<int, OptionalTensor>& variable_args,
-                        OpKernelContext* ctx, Signature* signature);
+  static xla::StatusOr<Signature> BuildSignature(
+      const NameAttrList& function,
+      absl::Span<const XlaCompiler::Argument> args);
+
+ private:
+  // Common implementation of Compile and CompileSingleOp.
+  Status CompileImpl(
+      const XlaCompiler::Options& options, const NameAttrList& function,
+      absl::Span<const XlaCompiler::Argument> args,
+      const std::function<Status(XlaCompiler* compiler,
+                                 XlaCompiler::CompilationResult*)>& compile_fn,
+      absl::optional<int64> compile_threshold,
+      const XlaCompiler::CompilationResult** out_compilation_result,
+      xla::LocalExecutable** out_executable);
+
+  // Takes `result` which has been compiled from a Tensorflow subgraph to a
+  // XLA computation already, and generates an XLA LocalExecutable `executable`.
+  Status BuildExecutable(const XlaCompiler::Options& options,
+                         const XlaCompiler::CompilationResult& result,
+                         std::unique_ptr<xla::LocalExecutable>* executable);
+
+  xla::LocalClient* const client_;
+  const DeviceType device_type_;
 
   // The value associated with a cache entry.
   struct Entry {
@@ -139,6 +143,9 @@ class XlaCompilationCache : public ResourceBase {
     // Have we tried compiling this entry?
     bool compiled = false;
 
+    // The number of times a compilation with this signature has been requested.
+    int64 request_count = 0;
+
     // Did compilation succeed?
     Status compilation_status GUARDED_BY(mu);
 
@@ -154,18 +161,31 @@ class XlaCompilationCache : public ResourceBase {
   absl::flat_hash_map<Signature, std::unique_ptr<Entry>, Signature::Hash> cache_
       GUARDED_BY(compile_cache_mu_);
 
-  struct CompileStats {
+  struct ClusterCompileStats {
     // Number of times the cluster has been (re-)compiled.
     int64 compile_count = 0;
 
+    // The number of times this cluster has been executed.
+    int64 execution_count = 0;
+
     // Cumulative time spent compiling the cluster.
     int64 cumulative_compile_time_us = 0;
+
+    // True if we have decided that this cluster is too dynamic (i.e. its shapes
+    // change too frequently) to profitably JIT compile.  Once a cluster is
+    // tagged megamorphic, it stays megamorphic forever.
+    bool is_megamorphic = false;
   };
-  mutex compile_stats_mu_;
+
+  mutex cluster_compile_stats_mu_;
 
   // Maps cluster names to compilation statistics for said cluster.
-  absl::flat_hash_map<string, CompileStats> compile_stats_
-      GUARDED_BY(compile_stats_mu_);
+  absl::flat_hash_map<string, ClusterCompileStats> cluster_compile_stats_
+      GUARDED_BY(cluster_compile_stats_mu_);
+
+  // The number of times a lazy compilation must be requested for a specific
+  // signature before  we attempt to compile it.
+  static constexpr int64 kDefaultCompilationThreshold = 2;
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaCompilationCache);
 };
diff --git a/tensorflow/compiler/jit/xla_compilation_cache_test.cc b/tensorflow/compiler/jit/xla_compilation_cache_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..018c7c219f445bdca17f4f8b060e3678fe1be9ee
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_compilation_cache_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/xla_compilation_cache.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(XlaCompilationCacheTest, SignatureEquality) {
+  NameAttrList fn;
+  fn.set_name("afunction");
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kConstant;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({4, 0});
+  args[0].constant_value = Tensor(DT_INT32, {4, 0});
+  TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s1,
+                          XlaCompilationCache::BuildSignature(fn, args));
+
+  args[0].type = DT_FLOAT;
+  args[0].constant_value = Tensor(DT_FLOAT, {4, 0});
+  TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s2,
+                          XlaCompilationCache::BuildSignature(fn, args));
+
+  args[0].shape = TensorShape({0, 4});
+  args[0].constant_value = Tensor(DT_FLOAT, {0, 4});
+  TF_ASSERT_OK_AND_ASSIGN(XlaCompilationCache::Signature s3,
+                          XlaCompilationCache::BuildSignature(fn, args));
+
+  std::vector<XlaCompilationCache::Signature> signatures = {s1, s2, s3};
+  for (int i = 0; i < signatures.size(); ++i) {
+    for (int j = 0; j < signatures.size(); ++j) {
+      EXPECT_EQ(i == j, signatures[i] == signatures[j])
+          << signatures[i].HumanString() << " " << signatures[j].HumanString();
+    }
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 79976c85dff200ce993ebb06e7a20a15b71f6085..c7e8d61d280a33a83c3386d8ef801018634d31ec 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -16,6 +16,8 @@ limitations under the License.
 // Defines the XlaCompileOnDemandOp.
 
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
+
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_launch_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
@@ -86,29 +88,26 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
   return Status::OK();
 }
 
-bool XlaCompileOnDemandOp::MustArgumentBeConstant(const OpKernel* op_kernel,
-                                                  int64 argument_idx) {
+Status XlaCompileOnDemandOp::MustArgumentBeConstant(const OpKernel* op_kernel,
+                                                    int64 argument_idx,
+                                                    bool* result) {
+  *result = false;
+
   // TODO(jmolloy): This could be expensive, so memoize.
-  auto* constant_inputs = tensorflow::XlaOpRegistry::CompileTimeConstantInputs(
-      op_kernel->def().op());
-  CHECK(constant_inputs);
-  std::set<int64> constant_input_indices;
-  for (const auto& name : *constant_inputs) {
-    int start, stop;
-    TF_CHECK_OK(op_kernel->InputRange(name, &start, &stop));
-    for (int i = start; i < stop; ++i) {
-      constant_input_indices.insert(i);
-    }
-  }
-  return constant_input_indices.count(argument_idx) > 0;
+  std::vector<int> constant_input_indices;
+  TF_RETURN_IF_ERROR(XlaOpRegistry::CompileTimeConstantInputs(
+      *op_kernel, &constant_input_indices));
+  *result = absl::c_binary_search(constant_input_indices, argument_idx);
+  return Status::OK();
 }
 
-bool XlaCompileOnDemandOp::ShouldArgumentBeConstant(const OpKernel* op_kernel,
-                                                    int64 argument_idx) {
+Status XlaCompileOnDemandOp::ShouldArgumentBeConstant(const OpKernel* op_kernel,
+                                                      int64 argument_idx,
+                                                      bool* result) {
   // Right now we only create kConstant arguments when absolutely required, but
   // there may be benefit in eagerly constant-folding a larger subset of
   // arguments in the future.
-  return MustArgumentBeConstant(op_kernel, argument_idx);
+  return MustArgumentBeConstant(op_kernel, argument_idx, result);
 }
 
 Status XlaCompileOnDemandOp::Compile(
@@ -119,27 +118,48 @@ Status XlaCompileOnDemandOp::Compile(
   for (int64 i = 0; i < ctx->num_inputs(); ++i) {
     const Tensor& device_tensor = ctx->input(i);
     if (const XlaTensor* xla_tensor = XlaTensor::FromTensor(&device_tensor)) {
-      if (xla_tensor->has_host_tensor() &&
-          ShouldArgumentBeConstant(&ctx->op_kernel(), i)) {
-        constant_arguments[i] = xla_tensor->host_tensor();
+      if (xla_tensor->has_host_tensor()) {
+        bool should_arg_be_const;
+        TF_RETURN_IF_ERROR(ShouldArgumentBeConstant(&ctx->op_kernel(), i,
+                                                    &should_arg_be_const));
+        if (should_arg_be_const) {
+          constant_arguments[i] = xla_tensor->host_tensor();
+        }
       }
     }
-    if (constant_arguments.count(i) == 0 &&
-        MustArgumentBeConstant(&ctx->op_kernel(), i)) {
-      // Slow path; the argument is not available as a host constant so we must
-      // fetch it synchronously.
-      Tensor host_tensor;
-      AllocatorAttributes attrs;
-      attrs.set_on_host(true);
-      TF_RETURN_IF_ERROR(ctx->allocate_temp(
-          device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs));
-      Notification n;
-      ctx->op_device_context()->CopyDeviceTensorToCPU(
-          &device_tensor, "ConstantArgument",
-          reinterpret_cast<Device*>(ctx->device()), &host_tensor,
-          [&](Status status) { n.Notify(); });
-      n.WaitForNotification();
-      constant_arguments[i] = host_tensor;
+
+    if (constant_arguments.count(i) == 0) {
+      bool must_argument_be_const;
+      TF_RETURN_IF_ERROR(MustArgumentBeConstant(&ctx->op_kernel(), i,
+                                                &must_argument_be_const));
+
+      if (must_argument_be_const) {
+        // Slow path; the argument is not available as a host constant so we
+        // must fetch it synchronously.
+        Tensor host_tensor;
+        AllocatorAttributes attrs;
+        attrs.set_on_host(true);
+        TF_RETURN_IF_ERROR(ctx->allocate_temp(
+            device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs));
+        Notification n;
+        Status status;
+        ctx->op_device_context()->CopyDeviceTensorToCPU(
+            &device_tensor, "ConstantArgument",
+            reinterpret_cast<Device*>(ctx->device()), &host_tensor,
+            [&](Status s) {
+              status = s;
+              n.Notify();
+            });
+        n.WaitForNotification();
+        if (!status.ok()) {
+          LOG(ERROR) << "Copying tensor of shape "
+                     << device_tensor.shape().DebugString() << " from "
+                     << ctx->device()->name() << "to CPU failed with "
+                     << status.ToString();
+          return status;
+        }
+        constant_arguments[i] = host_tensor;
+      }
     }
   }
 
@@ -164,8 +184,7 @@ Status XlaCompileOnDemandOp::Compile(
   XlaCompiler::Options options;
   options.device_type = metadata.jit_device_type();
   options.client = metadata.client();
-  options.flib_def =
-      new FunctionLibraryDefinition(OpRegistry::Global(), FunctionDefLibrary{});
+  options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.shape_representation_fn = metadata.shape_representation_fn();
 
   XlaCompiler::CompileOptions compile_options;
@@ -179,8 +198,14 @@ Status XlaCompileOnDemandOp::Compile(
   compile_options.always_return_tuple = false;
 
   std::map<int, OptionalTensor> variable_args = GetVariables(ctx);
-  return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx,
-                                compile_options, result, executable);
+
+  std::vector<XlaCompiler::Argument> args;
+
+  TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments(
+      constant_arguments, variable_args, ctx, &args));
+
+  return cache->CompileSingleOp(options, args, ctx, compile_options, result,
+                                executable);
 }
 
 void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) {
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h
index 7cc3d0e007ba2974fbfbe6fbabc4aa08f9fa910f..b93bb15ce34688f26316e22bf59f448e787df9fc 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.h
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h
@@ -38,8 +38,10 @@ class XlaCompileOnDemandOp : public OpKernel {
 
  private:
   XlaCompiler::Argument CreateCompilerArgument(OpKernelContext* ctx, int64 i);
-  bool ShouldArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx);
-  bool MustArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx);
+  Status ShouldArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx,
+                                  bool* result);
+  Status MustArgumentBeConstant(const OpKernel* op_kernel, int64 argument_idx,
+                                bool* result);
   Status Compile(OpKernelContext* ctx, const XlaDevice::Metadata& metadata,
                  const XlaCompiler::CompilationResult** result,
                  xla::LocalExecutable** executable);
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index 003c1d8081a3313fd042cdcaea14508ed1048da3..e9770647e7ba96cc1db026d12d5f11f52ce98d35 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -16,8 +16,9 @@ limitations under the License.
 // Registers the XLA_CPU device, which is an XlaDevice instantiation that runs
 // operators using XLA via the XLA "Host" (CPU) backend.
 
+#include "absl/memory/memory.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/kernels/xla_ops.h"
-#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
@@ -30,34 +31,51 @@ namespace tensorflow {
 class XlaCpuDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override;
+                       std::vector<std::unique_ptr<Device>>* devices) override;
 };
 
-Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& options,
-                                          const string& name_prefix,
-                                          std::vector<Device*>* devices) {
-  legacy_flags::XlaDeviceFlags* flags = legacy_flags::GetXlaDeviceFlags();
+Status XlaCpuDeviceFactory::CreateDevices(
+    const SessionOptions& session_options, const string& name_prefix,
+    std::vector<std::unique_ptr<Device>>* devices) {
+  XlaDeviceFlags* flags = GetXlaDeviceFlags();
   bool compile_on_demand = flags->tf_xla_compile_on_demand;
 
   XlaOpRegistry::DeviceRegistration registration;
   registration.compilation_device_name = DEVICE_CPU_XLA_JIT;
-  registration.requires_compilation = !compile_on_demand;
-  registration.enable_jit_by_default = false;
+  registration.autoclustering_policy =
+      compile_on_demand
+          ? XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested
+          : XlaOpRegistry::AutoclusteringPolicy::kAlways;
   registration.compile_resource_ops = true;
+  XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_CPU, registration);
 
   static XlaDeviceOpRegistrations* registrations =
       RegisterXlaDeviceKernels(DEVICE_XLA_CPU, DEVICE_CPU_XLA_JIT);
   (void)registrations;
 
-  std::unique_ptr<XlaDevice> device;
-  TF_RETURN_IF_ERROR(XlaDevice::Create("Host", DEVICE_XLA_CPU, 0,
-                                       DEVICE_CPU_XLA_JIT, options, name_prefix,
-                                       registration,
-                                       /*transfer_as_literal=*/false,
-                                       /*use_multiple_streams=*/false,
-                                       /*shape_representation_fn=*/{},
-                                       /*padded_shape_fn=*/{}, &device));
-  devices->push_back(device.release());
+  TF_ASSIGN_OR_RETURN(auto platform,
+                      se::MultiPlatformManager::PlatformWithName("Host"));
+
+  XlaDevice::Options options;
+  options.platform = platform;
+  options.device_name_prefix = name_prefix;
+  options.device_name = DEVICE_XLA_CPU;
+  options.device_ordinal = 0;
+  options.compilation_device_name = DEVICE_CPU_XLA_JIT;
+  options.use_multiple_streams = false;
+  auto device = absl::make_unique<XlaDevice>(session_options, options);
+
+  // Setting GpuDeviceInfo because eager runtime relies on the device
+  // context in tensorflow_gpu_device_info(). Also,
+  // tensorflow_gpu_device_info() == nullptr is used as an IsCPU test.
+  // We need XlaCpuDevice to be treated not as CPU because it allocates
+  // XlaTensors, not regular Tensors.
+  Status status = device->UseGpuDeviceInfo();
+  if (!status.ok()) {
+    errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT);
+    return status;
+  }
+  devices->push_back(std::move(device));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 0824c4644e3e5d8e1390b99f12de824bfcdfec24..4201ff91a89b1bee370e6a43337c51abe3bf974a 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -125,41 +125,17 @@ Status DefaultPaddedShapeFn(const Tensor& tensor, xla::Shape* shape) {
   return Status::OK();
 }
 
-}  // namespace
-
-/* static */ Status XlaDevice::Create(
-    const string& platform_name, const string& device_name, int device_ordinal,
-    const string& jit_device_name, const SessionOptions& options,
-    const string& name_prefix,
-    const XlaOpRegistry::DeviceRegistration& registration,
-    bool transfer_as_literal, bool use_multiple_streams,
-    const XlaCompiler::ShapeRepresentationFn& shape_representation_fn,
-    const PaddedShapeFn& padded_shape_fn, std::unique_ptr<XlaDevice>* device) {
-  VLOG(1) << "XlaDevice::Create " << platform_name << " " << device_name << ":"
-          << device_ordinal;
-
-  // These are no-ops if they have already been done previously for
-  // this device_name/compilation_device_name pair.
-  XlaOpRegistry::RegisterCompilationDevice(device_name, registration);
-
-  auto platform = se::MultiPlatformManager::PlatformWithName(platform_name);
-  if (!platform.ok()) {
-    return platform.status();
-  }
-
-  const DeviceAttributes attrs = Device::BuildDeviceAttributes(
+static DeviceAttributes BuildXlaDeviceAttributes(const string& name_prefix,
+                                                 const string& device_name,
+                                                 int device_ordinal) {
+  return Device::BuildDeviceAttributes(
       absl::StrCat(name_prefix, "/device:", device_name, ":", device_ordinal),
       DeviceType(device_name), Bytes(16ULL << 30), DeviceLocality(),
       absl::StrCat("device: ", device_name, " device"));
-
-  device->reset(
-      new XlaDevice(options, attrs, device_ordinal, DeviceType(jit_device_name),
-                    platform.ValueOrDie(), transfer_as_literal,
-                    use_multiple_streams, shape_representation_fn,
-                    padded_shape_fn ? padded_shape_fn : DefaultPaddedShapeFn));
-  return Status::OK();
 }
 
+}  // namespace
+
 XlaDevice::Metadata::Metadata(
     int device_ordinal, se::Platform* platform, const DeviceType& device_type,
     XlaCompiler::ShapeRepresentationFn shape_representation_fn,
@@ -209,30 +185,42 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const {
   return GetMetadataFromDevice(ctx->device(), metadata);
 }
 
-XlaDevice::XlaDevice(
-    const SessionOptions& options, const DeviceAttributes& attrs,
-    int device_ordinal, const DeviceType& jit_device_name,
-    se::Platform* platform, bool transfer_as_literal, bool use_multiple_streams,
-    const XlaCompiler::ShapeRepresentationFn& shape_representation_fn,
-    const PaddedShapeFn& padded_shape_fn)
-    : LocalDevice(options, attrs),
-      xla_metadata_(device_ordinal, platform, jit_device_name,
-                    shape_representation_fn, padded_shape_fn,
-                    use_multiple_streams),
-      device_ordinal_(device_ordinal),
-      jit_device_name_(jit_device_name),
-      platform_(platform),
-      use_multiple_streams_(use_multiple_streams),
-      transfer_as_literal_(transfer_as_literal),
-      shape_representation_fn_(shape_representation_fn) {
-  VLOG(1) << "Created XLA device " << jit_device_name << " " << this;
-  thread_pool_.reset(new thread::ThreadPool(options.env, "xla_device",
+XlaDevice::XlaDevice(const SessionOptions& session_options,
+                     const Options& options)
+    : LocalDevice(session_options,
+                  BuildXlaDeviceAttributes(options.device_name_prefix,
+                                           options.device_name,
+                                           options.device_ordinal)),
+      xla_metadata_(options.device_ordinal, options.platform,
+                    DeviceType(options.compilation_device_name),
+                    options.shape_representation_fn,
+                    options.padded_shape_fn ? options.padded_shape_fn
+                                            : DefaultPaddedShapeFn,
+                    options.use_multiple_streams),
+      device_ordinal_(options.device_ordinal),
+      jit_device_name_(options.compilation_device_name),
+      platform_(options.platform),
+      use_multiple_streams_(options.use_multiple_streams),
+      shape_representation_fn_(options.shape_representation_fn) {
+  VLOG(1) << "Created XLA device " << options.compilation_device_name << " "
+          << this;
+  thread_pool_.reset(new thread::ThreadPool(session_options.env, "xla_device",
                                             /*num_threads=*/1));
+
+  // We have multiple device to device streams to allow for some concurrency
+  // between transfers. The particular value of '4' is chosen fairly
+  // arbitrarily. It may be necessary to make this tunable via
+  // XlaDevice::Options.
+  static constexpr int kNumDeviceToDeviceStreams = 4;
+  device_to_device_streams_.resize(kNumDeviceToDeviceStreams);
 }
 
 XlaDevice::~XlaDevice() {
   VLOG(1) << "Destroying XLA device " << jit_device_name_ << " " << this;
   mutex_lock lock(mu_);
+  while (outstanding_asynchronous_operations_ > 0) {
+    outstanding_asynchronous_operations_cv_.wait(lock);
+  }
   if (device_context_) {
     device_context_->Unref();
   }
@@ -295,8 +283,9 @@ xla::StatusOr<XlaDeviceContext*> XlaDevice::GetDeviceContextLocked() {
   TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "stream", &stream_,
                                           &need_new_device_context));
 
-  std::shared_ptr<se::Stream> host_to_device_stream = stream_;
-  std::shared_ptr<se::Stream> device_to_host_stream = stream_;
+  std::shared_ptr<se::Stream> host_to_device_stream;
+  std::shared_ptr<se::Stream> device_to_host_stream;
+  std::vector<std::shared_ptr<se::Stream>> device_to_device_streams;
   if (use_multiple_streams_) {
     TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "host_to_device_stream",
                                             &host_to_device_stream_,
@@ -304,8 +293,18 @@ xla::StatusOr<XlaDeviceContext*> XlaDevice::GetDeviceContextLocked() {
     TF_RETURN_IF_ERROR(EnsureStreamOkLocked(backend, "device_to_host_stream",
                                             &device_to_host_stream_,
                                             &need_new_device_context));
+    for (std::shared_ptr<se::Stream>& stream : device_to_device_streams_) {
+      TF_RETURN_IF_ERROR(
+          EnsureStreamOkLocked(backend, "device_to_device_stream", &stream,
+                               &need_new_device_context));
+    }
     host_to_device_stream = host_to_device_stream_;
     device_to_host_stream = device_to_host_stream_;
+    device_to_device_streams = device_to_device_streams_;
+  } else {
+    host_to_device_stream = stream_;
+    device_to_host_stream = stream_;
+    device_to_device_streams = {stream_};
   }
 
   if (!need_new_device_context) {
@@ -323,8 +322,9 @@ xla::StatusOr<XlaDeviceContext*> XlaDevice::GetDeviceContextLocked() {
   // ensures that the streams remain live for the duration of a run, even if
   // an error is encountered and the streams are replaced with new ones.
   device_context_ = new XlaDeviceContext(
-      stream_, host_to_device_stream, device_to_host_stream, client(),
-      transfer_as_literal_, shape_representation_fn_, thread_pool_.get());
+      stream_, std::move(host_to_device_stream),
+      std::move(device_to_host_stream), std::move(device_to_device_streams),
+      client(), shape_representation_fn_, thread_pool_.get());
   VLOG(1) << "XlaDevice " << this << " new XlaDeviceContext "
           << device_context_;
 
@@ -387,6 +387,7 @@ void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
 
 Status XlaDevice::Sync() {
   VLOG(1) << "XlaDevice::Sync";
+  tracing::ScopedActivity activity("XlaDevice::Sync", /*is_expensive=*/true);
   std::shared_ptr<se::Stream> stream;
   {
     mutex_lock lock(mu_);
@@ -394,13 +395,46 @@ Status XlaDevice::Sync() {
   }
   if (!stream) return Status::OK();
 
-  if (!stream->parent()->SynchronizeAllActivity() || !stream->ok()) {
+  Status status = stream->BlockHostUntilDone();
+  {
+    mutex_lock lock(mu_);
+    while (outstanding_asynchronous_operations_ > 0) {
+      outstanding_asynchronous_operations_cv_.wait(lock);
+    }
+  }
+  TF_RETURN_IF_ERROR(status);
+  if (!stream->ok()) {
     return errors::Internal("XlaDevice::Sync() failed.");
   }
   VLOG(1) << "XlaDevice::Sync completed";
   return Status::OK();
 }
 
+void XlaDevice::Sync(const DoneCallback& done) {
+  VLOG(1) << "XlaDevice::Sync (asynchronous)";
+  std::shared_ptr<se::Stream> stream;
+  {
+    mutex_lock lock(mu_);
+    stream = stream_;
+  }
+  if (!stream) {
+    done(Status::OK());
+    return;
+  }
+
+  stream->ThenEnqueueOnBackgroundThread(
+      [this, stream, done](se::StreamExecutor*) {
+        tracing::ScopedActivity activity("XlaDevice::Sync::Callback",
+                                         /*is_expensive=*/true);
+        mutex_lock lock(mu_);
+        while (outstanding_asynchronous_operations_ > 0) {
+          outstanding_asynchronous_operations_cv_.wait(lock);
+        }
+        done(stream->ok() ? Status::OK()
+                          : errors::Internal("XlaDevice::Sync() failed."));
+      });
+}
+
 Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                       const AllocatorAttributes alloc_attrs,
                                       Tensor* tensor) {
@@ -444,12 +478,55 @@ bool XlaDevice::RequiresSyncOnCompletion() const {
   return sync_on_completion_;
 }
 
+XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
+    XlaDevice* device)
+    : device_(device) {
+  mutex_lock lock(device_->mu_);
+  ++device_->outstanding_asynchronous_operations_;
+}
+
+XlaDevice::AsynchronousOperationHandle::~AsynchronousOperationHandle() {
+  if (device_) {
+    mutex_lock lock(device_->mu_);
+    --device_->outstanding_asynchronous_operations_;
+    device_->outstanding_asynchronous_operations_cv_.notify_all();
+  }
+}
+
+XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
+    const XlaDevice::AsynchronousOperationHandle& other)
+    : device_(other.device_) {
+  mutex_lock lock(device_->mu_);
+  ++device_->outstanding_asynchronous_operations_;
+}
+
+XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
+    XlaDevice::AsynchronousOperationHandle&& other)
+    : device_(other.device_) {
+  other.device_ = nullptr;
+}
+
+XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
+operator=(const XlaDevice::AsynchronousOperationHandle& other) {
+  device_ = other.device_;
+  mutex_lock lock(device_->mu_);
+  ++device_->outstanding_asynchronous_operations_;
+  return *this;
+}
+
+XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
+operator=(XlaDevice::AsynchronousOperationHandle&& other) {
+  device_ = other.device_;
+  other.device_ = nullptr;
+  return *this;
+}
+
 XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device,
                                                    const char* jit_device) {
   // Any op assigned to the device that isn't rewritten by the graph rewriter
   // gets executed by a n XlaCompileOnDemandOp, which compiles it and executes
   // it just-in-time.
-  kernel_factory::OpKernelRegistrar::Factory factory =
+  OpKernel* (*factory)(OpKernelConstruction*) =
       [](OpKernelConstruction* context) -> OpKernel* {
     return new XlaCompileOnDemandOp(context);
   };
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 0f06b3fc80b7c844dae5643127bdabba8a53b35e..c8bb276cdb9673fdcba4cc15a9f33ecd3ae96dbb 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -92,34 +92,41 @@ class XlaDevice : public LocalDevice {
   static Status GetMetadata(OpKernelConstruction* ctx,
                             const Metadata** metadata);
 
-  // Factory function. 'platform_name' is the name of the XLA platform.
-  // 'device_name' is the name of the Tensorflow device to create.
-  // 'jit_device_name' is the name of the corresponding JIT device.
-  // 'transfer_as_literal' is true if device<->host transfers must be done using
-  // XLA's TransferLiteral{To,From}Device interface. If false, we can use
-  // ThenMemcpy instead.
-  // If 'use_multiple_streams' is true, we create separate streams for
-  // host-to-device and device-to-host communication.
-  // If padded_shape_fn is empty, a default implementation that returns
-  // the on-host shape is used.
-  static Status Create(
-      const string& platform_name, const string& device_name,
-      int device_ordinal, const string& jit_device_name,
-      const SessionOptions& options, const string& name_prefix,
-      const XlaOpRegistry::DeviceRegistration& registration,
-      bool transfer_as_literal, bool use_multiple_streams,
-      const XlaCompiler::ShapeRepresentationFn& shape_representation_fn,
-      const PaddedShapeFn& padded_shape_fn, std::unique_ptr<XlaDevice>* device);
+  struct Options {
+    // The StreamExecutor platform. Not owned. Must be non-null.
+    se::Platform* platform = nullptr;
+
+    // The device name's prefix (e.g., "/task:7")
+    string device_name_prefix;
+
+    // The name of the XLA device (e.g., "XLA_CPU")
+    string device_name;
+
+    // The number of the device.
+    int device_ordinal = -1;
+
+    // The name of the compilation device (e.g., "XLA_CPU_JIT");
+    string compilation_device_name;
+
+    // If 'use_multiple_streams' is true, we create separate streams for
+    // compute, host-to-device, and device-to-host communication.
+    bool use_multiple_streams = false;
+
+    // A function that describes how the on-host shapes of
+    // a) argument and return value, for entry computations
+    // b) variables, for all computations,
+    // should be represented in XLA. Parameters/return values will be shaped
+    // according to this function, and reshaped back to/from their declared
+    // shapes for computations. Must be non-null.
+    XlaCompiler::ShapeRepresentationFn shape_representation_fn;
+
+    // If padded_shape_fn is empty, a default implementation that returns
+    // the logical on-device shape without padding is used.
+    PaddedShapeFn padded_shape_fn;
+  };
 
   // Creates a new XLA Device.
-  // If padded_shape_fn is empty, a default implementation that returns
-  // the logical on-device shape without padding is used.
-  XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs,
-            int device_ordinal, const DeviceType& jit_device_name,
-            se::Platform* platform, bool transfer_as_literal,
-            bool use_multiple_streams,
-            const XlaCompiler::ShapeRepresentationFn& shape_representation_fn,
-            const PaddedShapeFn& padded_shape_fn);
+  XlaDevice(const SessionOptions& session_options, const Options& options);
   ~XlaDevice() override;
 
   Allocator* GetAllocator(AllocatorAttributes attr) override
@@ -128,6 +135,7 @@ class XlaDevice : public LocalDevice {
   void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                     AsyncOpKernel::DoneCallback done) override;
   Status Sync() override;
+  void Sync(const DoneCallback& done) override;
 
   Status FillContextMap(const Graph* graph,
                         DeviceContextMap* device_context_map) override
@@ -157,7 +165,30 @@ class XlaDevice : public LocalDevice {
 
   bool RequiresSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
 
+  // A simple RAII handle. On construction the device's
+  // outstanding_asynchronous_operations_ field is incremented; on destruction
+  // it is decremented.
+  class AsynchronousOperationHandle {
+   public:
+    AsynchronousOperationHandle(XlaDevice* device);
+    ~AsynchronousOperationHandle();
+    AsynchronousOperationHandle(const AsynchronousOperationHandle& other);
+    AsynchronousOperationHandle(AsynchronousOperationHandle&& other);
+    AsynchronousOperationHandle& operator=(
+        const AsynchronousOperationHandle& other);
+    AsynchronousOperationHandle& operator=(AsynchronousOperationHandle&& other);
+
+   private:
+    XlaDevice* device_ = nullptr;
+  };
+
+  AsynchronousOperationHandle CreateAsynchronousOperationHandle() {
+    return AsynchronousOperationHandle(this);
+  }
+
  private:
+  friend class AsynchronousOperationHandle;
+
   xla::LocalClient* client() const;
   Allocator* GetAllocatorLocked(AllocatorAttributes attr)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
@@ -182,6 +213,7 @@ class XlaDevice : public LocalDevice {
   se::Platform* const platform_;  // Not owned.
   // Memory allocator associated with this device.
   Allocator* xla_allocator_ GUARDED_BY(mu_) = nullptr;  // Not owned.
+
   // Stream associated with this device. Operations enqueued on this
   // stream are executed on the device. Operations include data
   // copying back and forth between CPU and the device, and
@@ -197,9 +229,11 @@ class XlaDevice : public LocalDevice {
   // If use_multiple_streams_, device to host transfers are performed using this
   // stream.
   std::shared_ptr<se::Stream> device_to_host_stream_ GUARDED_BY(mu_);
-  // Must we use XLA's transfer manager for correct host<->device transfers? if
-  // false, we can use ThenMemcpy() instead.
-  const bool transfer_as_literal_;
+  // If use_multiple_streams_, transfers between different devices are performed
+  // using these streams.
+  std::vector<std::shared_ptr<se::Stream>> device_to_device_streams_
+      GUARDED_BY(mu_);
+
   const XlaCompiler::ShapeRepresentationFn shape_representation_fn_;
 
   // The device context accessed by all users of the XlaDevice, set by calls to
@@ -217,6 +251,11 @@ class XlaDevice : public LocalDevice {
   // True if the device requires XlaDevice::Sync to be called on completion
   // regardless of status.
   bool sync_on_completion_ GUARDED_BY(mu_) = false;
+
+  // Count of outstanding asynchronous operations which must be zero on Sync()
+  // completion.
+  int64 outstanding_asynchronous_operations_ GUARDED_BY(mu_) = 0;
+  condition_variable outstanding_asynchronous_operations_cv_;
 };
 
 // Builds OpKernel registrations on 'device' for the JIT operators
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index af83c792e5e11d8596c521c6a3aed332a1f42e5b..6e6532731e64bd42ee56aa719748988f321e0f17 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -50,94 +50,39 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
 
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
 
-XlaTransferManager::XlaTransferManager(
+XlaDeviceContext::XlaDeviceContext(
     std::shared_ptr<se::Stream> compute_stream,
     std::shared_ptr<se::Stream> host_to_device_stream,
-    std::shared_ptr<se::Stream> device_to_host_stream, xla::LocalClient* client,
-    bool transfer_as_literal,
+    std::shared_ptr<se::Stream> device_to_host_stream,
+    std::vector<std::shared_ptr<se::Stream>> device_to_device_streams,
+    xla::LocalClient* client,
     XlaCompiler::ShapeRepresentationFn shape_representation_fn,
     thread::ThreadPool* thread_pool)
     : stream_(std::move(compute_stream)),
       host_to_device_stream_(std::move(host_to_device_stream)),
       device_to_host_stream_(std::move(device_to_host_stream)),
+      device_to_device_streams_(std::move(device_to_device_streams)),
       client_(client),
       transfer_manager_(client->backend().transfer_manager()),
-      transfer_as_literal_(transfer_as_literal),
       shape_representation_fn_(std::move(shape_representation_fn)),
       thread_pool_(thread_pool) {
   CHECK(host_to_device_stream_ != nullptr);
   CHECK(device_to_host_stream_ != nullptr);
   CHECK(stream_ != nullptr);
   if (!shape_representation_fn_) {
-    shape_representation_fn_ =
-        [](const TensorShape& shape,
-           DataType dtype) -> xla::StatusOr<TensorShape> { return shape; };
+    shape_representation_fn_ = [](const TensorShape& shape,
+                                  DataType dtype) -> xla::StatusOr<xla::Shape> {
+      xla::Shape xla_shape;
+      TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape));
+      return xla_shape;
+    };
   }
 }
 
-Status XlaTransferManager::TransferLiteralToDevice(
-    const Tensor& host_tensor, Tensor* device_tensor) const {
-  xla::Shape xla_shape;
-  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(),
-                                           host_tensor.shape(), &xla_shape));
-  // Create a reference to hold onto host_tensor until after the literal has
-  // been transferred. Also make sure the literal exists until the function
-  // asynchronously completes, as it will be wrapped in an xla::LiteralSlice.
-  TensorReference ref(host_tensor);
-  auto literal = std::make_shared<xla::BorrowingLiteral>(
-      static_cast<const char*>(DMAHelper::base(&host_tensor)), xla_shape);
-
-  XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
-  const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer();
-  VLOG(1) << "Transfer to device as literal: " << literal->ToString() << " "
-          << shaped_buffer.ToString();
-  if (UseMultipleStreams() && !transfer_manager_->CanShapedBufferBeAccessedNow(
-                                  stream_->parent(), shaped_buffer)) {
-    // Initially wait for the compute stream so that memory allocations are
-    // synchronized.
-    host_to_device_stream_->ThenWaitFor(stream_.get());
-  }
-  TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
-      host_to_device_stream_.get(), *literal, shaped_buffer));
-  if (UseMultipleStreams()) {
-    auto event = std::make_shared<se::Event>(stream_->parent());
-    TF_RET_CHECK(event->Init()) << "Event failed to initialize!";
-    host_to_device_stream_->ThenRecordEvent(event.get());
-    xla_tensor->SetDefinedOn(host_to_device_stream_.get(), std::move(event));
-  }
-  // Unref the host tensor, and capture the literal shared_ptr too so it goes
-  // out of scope when the lambda completes.
-  host_to_device_stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
-
-  return Status::OK();
-}
-
-void XlaTransferManager::TransferLiteralFromDevice(
-    Tensor* host_tensor, const Tensor& device_tensor,
-    const StatusCallback& done) const {
-  xla::MutableBorrowingLiteral literal;
-  TF_CHECK_OK(HostTensorToMutableBorrowingLiteral(host_tensor, &literal));
-
-  const xla::ShapedBuffer& shaped_buffer =
-      XlaTensor::FromTensor(&device_tensor)->shaped_buffer();
-
-  TensorReference ref(device_tensor);
-  transfer_manager_->TransferLiteralFromDevice(
-      device_to_host_stream_.get(), shaped_buffer, literal,
-      [=, &shaped_buffer](xla::Status status) {
-        ref.Unref();
-        done([&]() -> Status {
-          VLOG(1) << "Transfer from device as literal: "
-                  << shaped_buffer.ToString();
-          return status;
-        }());
-      });
-}
-
-void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
-                                               Device* device,
-                                               Tensor* device_tensor,
-                                               StatusCallback done) const {
+void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
+                                             Device* device,
+                                             Tensor* device_tensor,
+                                             StatusCallback done) const {
   if (cpu_tensor->NumElements() == 0) {
     VLOG(2) << "CopyCPUTensorToDevice empty tensor";
     done(Status::OK());
@@ -152,61 +97,85 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
           << cpu_tensor->shape().DebugString() << " "
           << device_tensor->shape().DebugString();
 
-  void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
-  const int64 total_bytes = cpu_tensor->TotalBytes();
 
   XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
   CHECK(xla_tensor);
 
-  xla::StatusOr<TensorShape> shape_or_status =
-      shape_representation_fn_(device_tensor->shape(), device_tensor->dtype());
-  if (!shape_or_status.ok()) {
-    done(shape_or_status.status());
-    return;
-  }
-  TensorShape shape = shape_or_status.ValueOrDie();
-  if (!xla_tensor->has_shaped_buffer()) {
-    Status s =
+  Status status = [&]() -> Status {
+    TF_ASSIGN_OR_RETURN(xla::Shape shape,
+                        shape_representation_fn_(device_tensor->shape(),
+                                                 device_tensor->dtype()));
+
+    // The device tensor should always be fresh.
+    TF_RET_CHECK(!xla_tensor->has_shaped_buffer());
+
+    xla_tensor->set_host_tensor(*cpu_tensor);
+    TF_RETURN_IF_ERROR(
         xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_,
-                                         stream_->parent()->device_ordinal());
-    if (!s.ok()) {
-      done(s);
-      return;
+                                         stream_->parent()->device_ordinal()));
+
+    // The cpu_tensor and literal that we created here hold the data of host
+    // tensor in descending layout. The layout could be different from layout in
+    // device_tensor (but the logical shape has to be the same). The
+    // transfer_manager is responsible to do corresponding transposing when
+    // transferring the data to device.
+    xla::BorrowingLiteral literal(
+        static_cast<const char*>(DMAHelper::base(cpu_tensor)),
+        xla::ShapeUtil::MakeShape(shape.element_type(),
+                                  xla::AsInt64Slice(shape.dimensions())));
+
+    VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " "
+            << xla_tensor->shaped_buffer().ToString();
+    if (UseMultipleStreams() &&
+        !transfer_manager_->CanShapedBufferBeAccessedNow(
+            stream_->parent(), xla_tensor->shaped_buffer())) {
+      // Initially wait for the compute stream so that memory allocations are
+      // synchronized.
+      host_to_device_stream_->ThenWaitFor(stream_.get());
     }
-  }
 
-  Status status;
-  if (transfer_as_literal_) {
-    Tensor reshaped_cpu_tensor;
-    if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) {
-      done(errors::Internal(
-          "Tensor::CopyFrom failed when copying from CPU to XLA device"));
-      return;
-    }
-    status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
-  } else {
-    se::DeviceMemoryBase dev_dst_ptr =
-        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
-    host_to_device_stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
-    // TODO(hpucha): Make this asynchronous.
-    Status block_status = host_to_device_stream_->BlockHostUntilDone();
-    if (!block_status.ok()) {
-      status = xla::InternalError(
-          "Failed to complete data transfer on stream %p: %s",
-          host_to_device_stream_.get(), block_status.error_message().c_str());
+    TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
+        host_to_device_stream_.get(), literal, xla_tensor->shaped_buffer()));
+
+    if (UseMultipleStreams()) {
+      auto event = std::make_shared<se::Event>(stream_->parent());
+      TF_RET_CHECK(event->Init()) << "Event failed to initialize!";
+      host_to_device_stream_->ThenRecordEvent(event.get());
+      xla_tensor->ResetDefinitionEvent(std::move(event),
+                                       host_to_device_stream_.get());
     }
+
+    return Status::OK();
+  }();
+  if (!status.ok()) {
+    done(status);
+    return;
   }
-  if (status.ok()) {
-    xla_tensor->set_host_tensor(*cpu_tensor);
+
+  // Create a reference to hold onto cpu_tensor until after the literal has
+  // been transferred
+  TensorReference ref(*cpu_tensor);
+  if (UseMultipleStreams()) {
+    // Unref the host tensor when the transfer completes.
+    // We don't defer the call to done() onto the stream here, and the reasons
+    // why this is correct are subtle. We assume that:
+    // a) all consumers of the device tensor will wait for its definition event.
+    // b) if the tensor is destroyed, then the memory allocator will not hand
+    //    out the same buffers until the transfer has completed.
+    host_to_device_stream_->ThenDoHostCallback([ref]() { ref.Unref(); });
+    done(status);
+  } else {
+    host_to_device_stream_->ThenDoHostCallback([ref, done]() {
+      ref.Unref();
+      done(Status::OK());
+    });
   }
-  done(status);
 }
 
-void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
-                                               absl::string_view tensor_name,
-                                               Device* device,
-                                               Tensor* cpu_tensor,
-                                               StatusCallback done) {
+void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
+                                             absl::string_view tensor_name,
+                                             Device* device, Tensor* cpu_tensor,
+                                             StatusCallback done) {
   if (device_tensor->NumElements() == 0) {
     VLOG(2) << "CopyDeviceTensorToCPU empty tensor";
     done(Status::OK());
@@ -220,136 +189,38 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
           << cpu_tensor->shape().DebugString() << " "
           << device_tensor->shape().DebugString();
 
-  const int64 total_bytes = cpu_tensor->TotalBytes();
-  se::DeviceMemoryBase dev_src_ptr =
-      XlaTensor::DeviceMemoryFromTensor(*device_tensor);
-  void* dst_ptr = DMAHelper::base(cpu_tensor);
   XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+  xla_tensor->WaitForDefinitionEventOnStream(device_to_host_stream_.get());
 
-  if (se::Event* event =
-          xla_tensor->GetDefinitionEvent(device_to_host_stream_.get())) {
-    device_to_host_stream_->ThenWaitFor(event);
-    xla_tensor->SetDefinedOn(device_to_host_stream_.get());
-  }
-
-  Status status;
-  if (transfer_as_literal_) {
-    TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
-    return;
-  } else {
-    device_to_host_stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
-    // TODO(hpucha): Make this asynchronous.
-    Status block_status = device_to_host_stream_->BlockHostUntilDone();
-    if (!block_status.ok()) {
-      status = xla::InternalError(
-          "Failed to complete data transfer on stream %p: %s", stream_.get(),
-          block_status.error_message().c_str());
-    }
-  }
-
-  done(status);
-}
-
-void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
-                                                  Tensor* dst_tensor,
-                                                  const StatusCallback& done) {
-  VLOG(2) << "CopyDeviceTensorToDevice "
-          << reinterpret_cast<const void*>(src_tensor.tensor_data().data())
-          << " "
-          << reinterpret_cast<const void*>(dst_tensor->tensor_data().data());
-  // Perform memory allocation now, and enqueue the device-to-device transfer.
-  Status status = [&]() -> Status {
-    if (src_tensor.NumElements() == 0) {
-      return Status::OK();
-    }
-    // TODO(jmolloy): We co-opt the device_to_host stream for device to device
-    // transfers; perhaps we should have a dedicated device to device stream? or
-    // one per device?
-    auto device_to_device_stream = stream_;
-    XlaTensor* xla_src = XlaTensor::FromTensor(&src_tensor);
-    XlaTensor* xla_dst = XlaTensor::FromTensor(dst_tensor);
-    CHECK(xla_src && xla_dst)
-        << "Missing destination tensor for device-to-device copy";
-    if (!xla_dst->has_shaped_buffer()) {
-      TF_ASSIGN_OR_RETURN(
-          TensorShape shape,
-          shape_representation_fn_(src_tensor.shape(), src_tensor.dtype()));
-      TF_RETURN_IF_ERROR(
-          xla_dst->AllocateShapedBuffer(src_tensor.dtype(), shape, client_,
-                                        stream_->parent()->device_ordinal()));
-      if (stream_ != device_to_device_stream) {
-        // Initially wait for the compute stream so that memory allocations are
-        // synchronized.
-        device_to_device_stream->ThenWaitFor(stream_.get());
-      }
-    }
-
-    if (se::Event* event =
-            xla_src->GetDefinitionEvent(device_to_device_stream.get())) {
-      device_to_device_stream->ThenWaitFor(event);
-      xla_src->SetDefinedOn(device_to_device_stream.get());
-    }
-
-    auto from_iter = xla_src->shaped_buffer().buffers().begin();
-    auto to_iter = xla_dst->shaped_buffer().buffers().begin();
-    for (auto end_iter = xla_src->shaped_buffer().buffers().end();
-         from_iter != end_iter; ++from_iter, ++to_iter) {
-      device_to_device_stream->ThenMemcpyD2D(
-          &to_iter->second, from_iter->second, to_iter->second.size());
-    }
-
-    if (UseMultipleStreams()) {
-      auto event = std::make_shared<se::Event>(stream_->parent());
-      TF_RET_CHECK(event->Init()) << "Event failed to initialize";
-      device_to_device_stream->ThenRecordEvent(event.get());
-      xla_dst->SetDefinedOn(device_to_device_stream.get(), std::move(event));
-    }
-    return Status::OK();
-  }();
-  if (!status.ok()) {
-    return done(status);
-  } else {
-    stream_->ThenDoHostCallback([this, done]() {
-      // We must not call the done closure directly from DoHostCallback to avoid
-      // a deadlock. If done() is the callback that ends an Executor's run, the
-      // Executor may call XlaDevice::Sync() inside the callback. This
-      // deadlocks, because XlaDevice::Sync() waits for all stream activity to
-      // complete.
-      thread_pool_->Schedule([done]() { done(Status::OK()); });
-    });
-  }
-}
-
-XlaDeviceContext::XlaDeviceContext(
-    std::shared_ptr<se::Stream> compute_stream,
-    std::shared_ptr<se::Stream> host_to_device_stream,
-    std::shared_ptr<se::Stream> device_to_host_stream, xla::LocalClient* client,
-    bool transfer_as_literal,
-    XlaCompiler::ShapeRepresentationFn shape_representation_fn,
-    thread::ThreadPool* thread_pool)
-    : manager_(std::move(compute_stream), std::move(host_to_device_stream),
-               std::move(device_to_host_stream), client, transfer_as_literal,
-               std::move(shape_representation_fn), thread_pool) {}
-
-void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
-                                             Device* device,
-                                             Tensor* device_tensor,
-                                             StatusCallback done) const {
-  manager_.CopyCPUTensorToDevice(cpu_tensor, device, device_tensor, done);
-}
+  // Transfer manager requires the shape of the shaped buffer to be the same as
+  // literal shape except for the layout.  Set the literal to use xla_tensor's
+  // shape as it is derived from the cpu_tensor's shape using
+  // shape_representation_fn_.
+  xla::MutableBorrowingLiteral literal;
+  TF_CHECK_OK(HostTensorToMutableBorrowingLiteral(
+      xla::LayoutUtil::GetWithDefaultLayout(
+          xla_tensor->shaped_buffer().on_host_shape()),
+      cpu_tensor, &literal));
 
-void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
-                                             absl::string_view tensor_name,
-                                             Device* device, Tensor* cpu_tensor,
-                                             StatusCallback done) {
-  manager_.CopyDeviceTensorToCPU(device_tensor, tensor_name, device, cpu_tensor,
-                                 done);
+  TensorReference ref(*device_tensor);
+  transfer_manager_->TransferLiteralFromDevice(
+      device_to_host_stream_.get(), xla_tensor->shaped_buffer(), literal,
+      [ref, xla_tensor, done](xla::Status status) {
+        done([&]() -> Status {
+          VLOG(1) << "Transfer from device as literal: "
+                  << xla_tensor->shaped_buffer().ToString();
+          return status;
+        }());
+        ref.Unref();
+      });
 }
 
-void XlaDeviceContext::CopyDeviceTensorToDevice(const Tensor& src_tensor,
-                                                Tensor* dst_tensor,
-                                                const StatusCallback& done) {
-  manager_.CopyDeviceTensorToDevice(src_tensor, dst_tensor, done);
+se::Stream* XlaDeviceContext::GetDeviceToDeviceStream() {
+  DCHECK_GT(device_to_device_streams_.size(), 0);
+  absl::MutexLock lock(&mu_);
+  int stream = next_stream_;
+  next_stream_ = (next_stream_ + 1) % device_to_device_streams_.size();
+  return device_to_device_stream(stream);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index df824212948ac96a5df5228cecd9a8c864bbec9a..1e18df197a2dd65590c5181b4dae4481dca36641 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
@@ -44,33 +45,44 @@ class XlaDeviceAllocator : public Allocator {
 };
 
 // Helper class for managing data transfers between host and XLA devices.
-class XlaTransferManager {
+class XlaDeviceContext : public DeviceContext {
  public:
-  explicit XlaTransferManager(
+  explicit XlaDeviceContext(
       std::shared_ptr<se::Stream> compute_stream,
       std::shared_ptr<se::Stream> host_to_device_stream,
       std::shared_ptr<se::Stream> device_to_host_stream,
-      xla::LocalClient* client, bool transfer_as_literal,
+      std::vector<std::shared_ptr<se::Stream>> device_to_device_streams,
+      xla::LocalClient* client,
       XlaCompiler::ShapeRepresentationFn shape_representation_fn,
       thread::ThreadPool* thread_pool);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
-                             Tensor* device_tensor, StatusCallback done) const;
+                             Tensor* device_tensor,
+                             StatusCallback done) const override;
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              absl::string_view tensor_name, Device* device,
-                             Tensor* cpu_tensor, StatusCallback done);
-
-  void CopyDeviceTensorToDevice(const Tensor& src_tensor, Tensor* dst_tensor,
-                                const StatusCallback& done);
+                             Tensor* cpu_tensor, StatusCallback done) override;
 
+  xla::LocalClient* client() const { return client_; }
   se::Stream* stream() const { return stream_.get(); }
+  se::Stream* host_to_device_stream() const {
+    return host_to_device_stream_.get();
+  }
+  se::Stream* device_to_host_stream() const {
+    return device_to_host_stream_.get();
+  }
+  se::Stream* device_to_device_stream(int index) const {
+    return device_to_device_streams_.at(index).get();
+  }
+  xla::TransferManager* transfer_manager() const { return transfer_manager_; }
+  const XlaCompiler::ShapeRepresentationFn& shape_representation_fn() const {
+    return shape_representation_fn_;
+  }
+
+  // Returns a device-to-device stream, in round-robin fashion.
+  se::Stream* GetDeviceToDeviceStream();
 
  private:
-  Status TransferLiteralToDevice(const Tensor& host_tensor,
-                                 Tensor* device_tensor) const;
-  void TransferLiteralFromDevice(Tensor* host_tensor,
-                                 const Tensor& device_tensor,
-                                 const StatusCallback& done) const;
   bool UseMultipleStreams() const { return stream_ != host_to_device_stream_; }
 
   // The main compute stream of the device, used to synchronize the transfer
@@ -82,44 +94,22 @@ class XlaTransferManager {
   // The stream to use for transferring data from device to host. Can be
   // idential to stream_, but must not be nullptr.
   std::shared_ptr<se::Stream> device_to_host_stream_;
+  // Streams to use for transferring data directly between different devices,
+  // e.g., over NVLINK.
+  std::vector<std::shared_ptr<se::Stream>> device_to_device_streams_;
+
   // For the underlying memory allocator and XLA's TransferManager.
   xla::LocalClient* client_;
   // Transfer manager, for marshalling data to and from the device.
   xla::TransferManager* transfer_manager_;
-  // True if we must use XLA's TransferManager for correct device transfers.
-  const bool transfer_as_literal_;
+
   XlaCompiler::ShapeRepresentationFn shape_representation_fn_;
 
   // Thread pool used for running closures
   thread::ThreadPool* thread_pool_;
-};
 
-// DeviceContext for operators assigned to XlaDevice devices. The
-// implementation must inherit from DeviceContext but otherwise just
-// wraps the methods in XlaTransferManager.
-class XlaDeviceContext : public DeviceContext {
- public:
-  explicit XlaDeviceContext(
-      std::shared_ptr<se::Stream> compute_stream,
-      std::shared_ptr<se::Stream> host_to_device_stream,
-      std::shared_ptr<se::Stream> device_to_host_stream,
-      xla::LocalClient* client, bool transfer_as_literal,
-      XlaCompiler::ShapeRepresentationFn shape_representation_fn,
-      thread::ThreadPool* thread_pool);
-
-  void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
-                             Tensor* device_tensor,
-                             StatusCallback done) const override;
-  void CopyDeviceTensorToCPU(const Tensor* device_tensor,
-                             absl::string_view tensor_name, Device* device,
-                             Tensor* cpu_tensor, StatusCallback done) override;
-  void CopyDeviceTensorToDevice(const Tensor& src_tensor, Tensor* dst_tensor,
-                                const StatusCallback& done);
-
-  se::Stream* stream() const override { return manager_.stream(); }
-
- private:
-  XlaTransferManager manager_;
+  absl::Mutex mu_;
+  int next_stream_ GUARDED_BY(mu_) = 0;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_device_ops.cc b/tensorflow/compiler/jit/xla_device_ops.cc
index 5ecb1afa7bcec910ca843ccd3a782745f2bb6ca8..f56c26ba0103fed152322f0c8971a449610cdc2b 100644
--- a/tensorflow/compiler/jit/xla_device_ops.cc
+++ b/tensorflow/compiler/jit/xla_device_ops.cc
@@ -30,81 +30,43 @@ void XlaDeviceDummyOp::Compute(OpKernelContext* ctx) {
 }
 
 XlaAssignVariableOp::XlaAssignVariableOp(OpKernelConstruction* c)
-    : AsyncOpKernel(c) {
+    : OpKernel(c) {
   OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_));
 }
 
-void XlaAssignVariableOp::ComputeAsync(OpKernelContext* context,
-                                       DoneCallback done) {
-  OP_REQUIRES_ASYNC(context, dtype_ == context->input(1).dtype(),
-                    errors::InvalidArgument(
-                        "Variable and value dtypes don't match; respectively, ",
-                        dtype_, " and ", context->input(1).dtype()),
-                    done);
+void XlaAssignVariableOp::Compute(OpKernelContext* context) {
+  OP_REQUIRES(context, dtype_ == context->input(1).dtype(),
+              errors::InvalidArgument(
+                  "Variable and value dtypes don't match; respectively, ",
+                  DataTypeString(dtype_), " and ",
+                  DataTypeString(context->input(1).dtype())));
   Var* variable = nullptr;
-  OP_REQUIRES_OK_ASYNC(
-      context,
-      LookupOrCreateResource<Var>(
-          context, HandleFromInput(context, 0), &variable,
-          [this, context](Var** ptr) {
-            *ptr = new Var(dtype_);
-            PersistentTensor unused;
-            Tensor* tmp;
-            AllocatorAttributes attr;
-            TF_RETURN_IF_ERROR(context->allocate_persistent(
-                dtype_, context->input(1).shape(), &unused, &tmp, attr));
-            *(*ptr)->tensor() = *tmp;
-            return Status::OK();
-          }),
-      done);
-  core::ScopedUnref s(variable);
-
-  OP_REQUIRES_ASYNC(context, variable->tensor()->dtype() == dtype_,
-                    errors::InvalidArgument(
-                        "Trying to assign variable with wrong dtype. Expected ",
-                        DataTypeString(variable->tensor()->dtype()), " got ",
-                        DataTypeString(dtype_)),
-                    done);
-
   const Tensor& value = context->input(1);
-  AllocatorAttributes attr;
-
-  // Copying is unnecessary if we are the last user of the value tensor, we can
-  // just adopt the input tensor's buffer instead.
-  std::unique_ptr<Tensor> input_alias = context->forward_input(
-      1, /*output_index=*/OpKernelContext::Params::kNoReservation, dtype_,
-      value.shape(), DEVICE_MEMORY, attr);
+  // Note: every resource-variable-manipulating op assumes copy-on-write
+  // semantics, and creates a copy of the variable's Tensor if its refcount is
+  // bigger than 1 when we try to modify it. This means we never need to copy
+  // the original tensor for AssignVariableOp; even if there are other live
+  // users of it we know none can modify it so this is always safe (even in
+  // esoteric cases where the same tensor is used to initialize multiple
+  // variables or the tensor is a constant this is safe, as future writes will
+  // trigger copies).
+  OP_REQUIRES_OK(context, LookupOrCreateResource<Var>(
+                              context, HandleFromInput(context, 0), &variable,
+                              [this, &value](Var** ptr) {
+                                *ptr = new Var(dtype_);
+                                *(*ptr)->tensor() = value;
+                                (*ptr)->is_initialized = true;
+                                return Status::OK();
+                              }));
+  core::ScopedUnref s(variable);
   mutex_lock ml(*variable->mu());
+  OP_REQUIRES(context, variable->tensor()->dtype() == dtype_,
+              errors::InvalidArgument(
+                  "Trying to assign variable with wrong dtype. Expected ",
+                  DataTypeString(variable->tensor()->dtype()), " got ",
+                  DataTypeString(dtype_)));
   variable->is_initialized = true;
-  if (input_alias) {
-    *variable->tensor() = *input_alias;
-    done();
-    return;
-  }
-
-  // Need to copy, but maybe we can re-use variable's buffer?
-  if (!XlaTensor::RefCountIsOne(*variable->tensor()) ||
-      !variable->tensor()->shape().IsSameSize(value.shape())) {
-    // Copy to new buffer
-    PersistentTensor unused;
-    Tensor* tmp;
-    OP_REQUIRES_OK_ASYNC(context,
-                         context->allocate_persistent(dtype_, value.shape(),
-                                                      &unused, &tmp, attr),
-                         done);
-    *variable->tensor() = *tmp;
-  }
-
-  XlaDeviceContext* device_context =
-      static_cast<XlaDeviceContext*>(context->op_device_context());
-
-  variable->Ref();
-  device_context->CopyDeviceTensorToDevice(
-      value, variable->tensor(), [context, variable, done](Status status) {
-        variable->Unref();
-        context->SetStatus(status);
-        done();
-      });
+  *variable->tensor() = value;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 6967ad1f03fb5dd962d5b41f0c7ab1dfa42fab94..927f983ba9ef23c8509523f42366c0c89c29db9f 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/resource_variable_ops.h"
 #include "tensorflow/core/kernels/sendrecv_ops.h"
 #include "tensorflow/core/kernels/shape_ops.h"
+#include "tensorflow/core/kernels/stack.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
 namespace tensorflow {
@@ -49,10 +50,10 @@ class XlaDeviceDummyOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 };
 
-class XlaAssignVariableOp : public AsyncOpKernel {
+class XlaAssignVariableOp : public OpKernel {
  public:
   explicit XlaAssignVariableOp(OpKernelConstruction* c);
-  void ComputeAsync(OpKernelContext* context, DoneCallback done) override;
+  void Compute(OpKernelContext* context) override;
 
  private:
   DataType dtype_;
@@ -65,11 +66,13 @@ class XlaAssignVariableOp : public AsyncOpKernel {
                               .HostMemory("resources"),   \
                           KERNEL);
 
-#define REGISTER_XLA_COMPILE_KERNEL(DEVICE, KERNEL, TYPES) \
-  REGISTER_KERNEL_BUILDER(Name("_XlaCompile")              \
-                              .Device(DEVICE)              \
-                              .HostMemory("constants")     \
-                              .HostMemory("resources"),    \
+#define REGISTER_XLA_COMPILE_KERNEL(DEVICE, KERNEL, TYPES)          \
+  REGISTER_KERNEL_BUILDER(Name("_XlaCompile")                       \
+                              .Device(DEVICE)                       \
+                              .HostMemory("constants")              \
+                              .HostMemory("key")                    \
+                              .HostMemory("compilation_successful") \
+                              .HostMemory("resources"),             \
                           KERNEL);
 
 #define REGISTER_XLA_RUN_KERNEL(DEVICE, KERNEL, TYPES) \
@@ -91,6 +94,9 @@ class XlaAssignVariableOp : public AsyncOpKernel {
       ConstantOp);                                                             \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("Identity").Device(DEVICE).TypeConstraint("T", TYPES), IdentityOp); \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("Identity").Device(DEVICE).TypeConstraint("T", DT_STRING),          \
+      IdentityOp);                                                             \
   REGISTER_KERNEL_BUILDER(Name("IdentityN").Device(DEVICE), IdentityNOp);      \
   REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE), PlaceholderOp);  \
   REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE),                \
@@ -197,6 +203,8 @@ class XlaAssignVariableOp : public AsyncOpKernel {
                               .HostMemory("output")                            \
                               .TypeConstraint<ResourceHandle>("T"),            \
                           ArgOp);                                              \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name(kArgOp).Device(DEVICE).TypeConstraint<Variant>("T"), ArgOp);        \
                                                                                \
   REGISTER_KERNEL_BUILDER(Name(kRetOp)                                         \
                               .Device(DEVICE)                                  \
@@ -208,6 +216,8 @@ class XlaAssignVariableOp : public AsyncOpKernel {
                               .TypeConstraint<ResourceHandle>("T")             \
                               .HostMemory("input"),                            \
                           RetvalOp);                                           \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name(kDeviceRetOp).Device(DEVICE).TypeConstraint<int32>("T"), RetvalOp); \
                                                                                \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("RemoteCall").Device(DEVICE).HostMemory("target"), RemoteCallOp);   \
@@ -250,9 +260,27 @@ class XlaAssignVariableOp : public AsyncOpKernel {
                               .Device(DEVICE)                                  \
                               .TypeConstraint<string>("T")                     \
                               .HostMemory("input"),                            \
-                          RetvalOp);
+                          RetvalOp);                                           \
+                                                                               \
+  REGISTER_KERNEL_BUILDER(Name("StackV2")                                      \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("max_size")                          \
+                              .HostMemory("handle"),                           \
+                          StackOp);                                            \
+  REGISTER_KERNEL_BUILDER(Name("StackPushV2")                                  \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("handle")                            \
+                              .TypeConstraint("T", TYPES),                     \
+                          TemplatedStackPushOp</*allow_swapping=*/false>);     \
+  REGISTER_KERNEL_BUILDER(Name("StackPopV2")                                   \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("handle")                            \
+                              .TypeConstraint("elem_type", TYPES),             \
+                          StackPopOp);                                         \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("StackCloseV2").Device(DEVICE).HostMemory("handle"), StackCloseOp);
 
-// TODO(phawkins): currently we do not register the QueueEnqueueMany,
+// TODO(b/118881356): currently we do not register the QueueEnqueueMany,
 // QueueDequeueMany, or QueueDequeueUpTo kernels because they attempt to read
 // and write the tensors they access in order to concatenate them into a batch.
 // We would need either to call out to an XLA computation to perform the
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 60979556a3245f4a9984cde889835ce31154fe18..0191315a66f4d331e54fadc9dc6a073a05fd67ef 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -16,6 +16,10 @@ limitations under the License.
 // Registers the XLA_GPU device, which is an XlaDevice instantiation that runs
 // operators using XLA via the XLA "CUDA" (GPU) backend.
 
+#include <set>
+#include "absl/memory/memory.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/jit/kernels/xla_ops.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
@@ -28,45 +32,76 @@ namespace tensorflow {
 class XlaGpuDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override;
+                       std::vector<std::unique_ptr<Device>>* devices) override;
 };
 
-Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options,
-                                          const string& name_prefix,
-                                          std::vector<Device*>* devices) {
+Status XlaGpuDeviceFactory::CreateDevices(
+    const SessionOptions& session_options, const string& name_prefix,
+    std::vector<std::unique_ptr<Device>>* devices) {
   XlaOpRegistry::DeviceRegistration registration;
   registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
-  registration.requires_compilation = true;
-  registration.enable_jit_by_default = false;
+  registration.autoclustering_policy =
+      XlaOpRegistry::AutoclusteringPolicy::kAlways;
   registration.compile_resource_ops = true;
+  XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_GPU, registration);
 
   static XlaDeviceOpRegistrations* registrations =
       RegisterXlaDeviceKernels(DEVICE_XLA_GPU, DEVICE_GPU_XLA_JIT);
   (void)registrations;
 
-  std::unique_ptr<XlaDevice> device;
-  Status status =
-      XlaDevice::Create("CUDA", DEVICE_XLA_GPU, 0, DEVICE_GPU_XLA_JIT, options,
-                        name_prefix, registration,
-                        /*transfer_as_literal=*/false,
-                        /*use_multiple_streams=*/false,
-                        /*shape_representation_fn=*/{},
-                        /*padded_shape_fn=*/{}, &device);
-  if (!status.ok()) {
+  auto platform = se::MultiPlatformManager::PlatformWithName("CUDA");
+  if (!platform.ok()) {
     // Treat failures as non-fatal; there might not be a GPU in the machine.
-    VLOG(1) << "Failed to create XLA_GPU device: " << status;
+    VLOG(1) << "Failed to create XLA_GPU device: " << platform.status();
     return Status::OK();
   }
-
-  // TODO(b/78468222): Uncomment after fixing this bug
-  // status = device->UseGpuDeviceInfo();
-  // if (!status.ok()) {
-  //  errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT,
-  //                          " device");
-  //  return status;
-  // }
-
-  devices->push_back(device.release());
+  string allowed_gpus =
+      session_options.config.gpu_options().visible_device_list();
+  std::set<int> gpu_ids;
+  int num_visible_devices = platform.ValueOrDie()->VisibleDeviceCount();
+  if (allowed_gpus.empty()) {
+    for (int i = 0; i < num_visible_devices; ++i) {
+      gpu_ids.insert(i);
+    }
+  } else {
+    // For loop below is copied from gpu/gpu_device.cc. It validates
+    // the visible_device_list and populates gpu_ids set.
+    const std::vector<string> visible_devices =
+        absl::StrSplit(allowed_gpus, ',');
+    for (const string& platform_gpu_id_str : visible_devices) {
+      int32 platform_gpu_id;
+      if (!absl::SimpleAtoi(platform_gpu_id_str, &platform_gpu_id)) {
+        return errors::InvalidArgument(
+            "Could not parse entry in 'visible_device_list': '",
+            platform_gpu_id_str, "'. visible_device_list = ", allowed_gpus);
+      }
+      if (platform_gpu_id < 0 || platform_gpu_id >= num_visible_devices) {
+        return errors::InvalidArgument(
+            "'visible_device_list' listed an invalid GPU id '", platform_gpu_id,
+            "' but visible device count is ", num_visible_devices);
+      }
+      gpu_ids.insert(platform_gpu_id);
+    }
+  }
+  for (int i : gpu_ids) {
+    XlaDevice::Options options;
+    options.platform = platform.ValueOrDie();
+    options.device_name_prefix = name_prefix;
+    options.device_name = DEVICE_XLA_GPU;
+    options.device_ordinal = i;
+    options.compilation_device_name = DEVICE_GPU_XLA_JIT;
+    options.use_multiple_streams = true;
+    auto device = absl::make_unique<XlaDevice>(session_options, options);
+
+    Status status = device->UseGpuDeviceInfo();
+    if (!status.ok()) {
+      errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT,
+                              " device number ", i);
+      return status;
+    }
+
+    devices->push_back(std::move(device));
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index 19e681af0c940023de2ce82b3b337babe2f3dd5a..4007309ed1c57b663dca5bac0df11260bf1327f3 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // Registers the XLA_INTERPRETER device which exposes the XLA Interpreter.
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/kernels/xla_ops.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
@@ -25,37 +26,43 @@ namespace tensorflow {
 const char* const DEVICE_XLA_INTERPRETER = "XLA_INTERPRETER";
 const char* const DEVICE_INTERPRETER_XLA_JIT = "XLA_INTERPRETER_JIT";
 
-constexpr std::array<DataType, 6> kExecAllTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 9> kExecAllTypes = {
+    {DT_INT8, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64,
+     DT_BOOL, DT_BFLOAT16}};
 
 class XlaInterpreterDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override;
+                       std::vector<std::unique_ptr<Device>>* devices) override;
 };
 
 Status XlaInterpreterDeviceFactory::CreateDevices(
-    const SessionOptions& options, const string& name_prefix,
-    std::vector<Device*>* devices) {
+    const SessionOptions& session_options, const string& name_prefix,
+    std::vector<std::unique_ptr<Device>>* devices) {
   static XlaDeviceOpRegistrations* registrations = RegisterXlaDeviceKernels(
       DEVICE_XLA_INTERPRETER, DEVICE_INTERPRETER_XLA_JIT);
   (void)registrations;
 
   XlaOpRegistry::DeviceRegistration registration;
   registration.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT;
-  registration.requires_compilation = true;
-  registration.enable_jit_by_default = false;
+  registration.autoclustering_policy =
+      XlaOpRegistry::AutoclusteringPolicy::kAlways;
   registration.compile_resource_ops = true;
+  XlaOpRegistry::RegisterCompilationDevice(DEVICE_XLA_INTERPRETER,
+                                           registration);
+
+  TF_ASSIGN_OR_RETURN(
+      auto platform, se::MultiPlatformManager::PlatformWithName("Interpreter"));
+
+  XlaDevice::Options options;
+  options.platform = platform;
+  options.device_name_prefix = name_prefix;
+  options.device_name = DEVICE_XLA_INTERPRETER;
+  options.device_ordinal = 0;
+  options.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT;
+  options.use_multiple_streams = false;
+  devices->push_back(absl::make_unique<XlaDevice>(session_options, options));
 
-  std::unique_ptr<XlaDevice> device;
-  TF_RETURN_IF_ERROR(XlaDevice::Create("Interpreter", DEVICE_XLA_INTERPRETER, 0,
-                                       DEVICE_INTERPRETER_XLA_JIT, options,
-                                       name_prefix, registration,
-                                       /*transfer_as_literal=*/false,
-                                       /*use_multiple_streams=*/false,
-                                       /*shape_representation_fn=*/{},
-                                       /*padded_shape_fn=*/{}, &device));
-  devices->push_back(device.release());
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 4f6fc4e068e3ba125ddbca264c1affa1f09f5896..3b0bda4caa161a7561a3098b89420329998ff8a7 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
@@ -41,22 +42,127 @@ using xla::ScopedShapedBuffer;
 using xla::ShapedBuffer;
 }  // anonymous namespace
 
-std::map<int, OptionalTensor> SnapshotResourceVariables(
-    OpKernelContext* ctx, absl::Span<const int> variables) {
-  std::map<int, OptionalTensor> snapshot;
-  for (int i : variables) {
-    Var* variable = nullptr;
-    ResourceHandle handle = HandleFromInput(ctx, i);
-    OptionalTensor& tensor = snapshot[i];
-    if (LookupResource(ctx, handle, &variable).ok()) {
-      core::ScopedUnref scoped_unref(variable);
-      tf_shared_lock lock(*variable->mu());
-      tensor.name = handle.name();
+VariableInfo::VariableInfo(int index, Var* var) : index_(index), var_(var) {}
+VariableInfo::VariableInfo(VariableInfo&& other)
+    : index_(other.index_), var_(other.var_), lock_held_(other.lock_held_) {
+  other.index_ = -1;
+  other.var_ = nullptr;
+}
+
+VariableInfo& VariableInfo::operator=(VariableInfo&& other) {
+  index_ = other.index_;
+  var_ = other.var_;
+  lock_held_ = other.lock_held_;
+
+  other.index_ = -1;
+  other.var_ = nullptr;
+
+  return *this;
+}
+
+VariableInfo::~VariableInfo() {
+  // Release the variable's lock if we hold it. Ensures that the lock is
+  // released even on error.  It does not matter in what order we release the
+  // locks.
+  if (var()) {
+    if (lock_held()) {
+      var()->mu()->unlock();
+    }
+
+    // Unref the variable so it can be released by ResourceManager.
+    var()->Unref();
+  }
+}
+
+// Returns a vector of VaribleInfo instances for the resource variable inputs to
+// the kernel with context `ctx`.  The input indices for the resource variable
+// inputs are in `variable_indices`.
+static Status GetVariableInfosFromCtxInputs(
+    OpKernelContext* ctx, absl::Span<const int> variable_indices,
+    std::vector<VariableInfo>* result) {
+  std::vector<const ResourceHandle*> resource_handles;
+  absl::c_transform(
+      variable_indices, std::back_inserter(resource_handles),
+      [&](int variable_idx) { return &HandleFromInput(ctx, variable_idx); });
+
+  std::vector<std::unique_ptr<Var, core::RefCountDeleter>> variables;
+  TF_RETURN_IF_ERROR(LookupResources(ctx, resource_handles, &variables));
+
+  result->clear();
+  result->reserve(variable_indices.size());
+  for (int i = 0; i < variable_indices.size(); i++) {
+    // *Release* the variable because we're going to unref it later in
+    // ~VariableInfo.
+    Var* variable = variables[i].release();
+    result->emplace_back(variable_indices[i], variable);
+  }
+
+  return Status::OK();
+}
+
+Status LockVariables(absl::Span<VariableInfo> variables) {
+  std::vector<int> lock_order(variables.size());
+  std::iota(lock_order.begin(), lock_order.end(), 0);
+
+  // VariableInfoComparator orders all empty VariableInfo instances as
+  // equivalent so it looks like we may want to stable sort these to maintain a
+  // deterministic order between the empty VariableInfo instances.  However
+  // since we're sorting by pointer value the sort is pretty non-deterministic
+  // anyway so we don't bother using std::stable_sort for now.
+  absl::c_sort(lock_order, [&](int a, int b) {
+    if (variables[a].var() && variables[b].var()) {
+      return variables[a].var()->mu() < variables[b].var()->mu();
+    }
+
+    // Move all the empty VariableInfo instances to the end.
+    return variables[a].var() != nullptr;
+  });
+
+  mutex* prev = nullptr;
+  for (int i : lock_order) {
+    Var* variable = variables[i].var();
+    if (variable == nullptr) {
+      // All empty VariableInfo instances are at the end of the order
+      // so we're done.
+      break;
+    }
+    mutex* mu = variable->mu();
+    if (prev == mu) {
+      // It is an error to pass the same variable handle twice to the same XLA
+      // cluster because we would not handle variable updates correctly.  Any
+      // locks we have already acquired will be released when the VariableInfo
+      // objects are destroyed.
+      return errors::Internal("Duplicate variable passed to XLA cluster");
+    }
+    VLOG(4) << "Acquiring lock for variable "
+            << reinterpret_cast<void*>(variable);
+    mu->lock();
+    variables[i].set_lock_held();
+    prev = mu;
+  }
+  VLOG(4) << "Finished acquiring variable locks.";
+  return Status::OK();
+}
+
+Status SnapshotResourceVariables(OpKernelContext* ctx,
+                                 absl::Span<const int> variable_indices,
+                                 std::map<int, OptionalTensor>* result) {
+  std::vector<VariableInfo> variable_infos;
+  TF_RETURN_IF_ERROR(
+      GetVariableInfosFromCtxInputs(ctx, variable_indices, &variable_infos));
+  TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos)));
+
+  for (int i = 0; i < variable_indices.size(); i++) {
+    if (variable_infos[i].var()) {
+      OptionalTensor& tensor = (*result)[variable_indices[i]];
+      tensor.name = HandleFromInput(ctx, variable_indices[i]).name();
       tensor.present = true;
-      tensor.value = *variable->tensor();
+      tensor.value = *variable_infos[i].var()->tensor();
+    } else {
+      (*result)[variable_indices[i]] = OptionalTensor();
     }
   }
-  return snapshot;
+  return Status::OK();
 }
 
 XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped)
@@ -85,40 +191,6 @@ Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
   return Status::OK();
 }
 
-namespace internal {
-// Return the 'index''th subtree of the given ShapedBuffer as a
-// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
-// subtree, and sets the input's buffer pointers to nullptr for the subtree.
-ScopedShapedBuffer ExtractSubShapedBuffer(
-    ShapedBuffer* shaped_buffer, int index,
-    xla::DeviceMemoryAllocator* allocator) {
-  const xla::Shape& on_host_shape = xla::ShapeUtil::GetTupleElementShape(
-      shaped_buffer->on_host_shape(), index);
-  const xla::Shape& on_device_shape = xla::ShapeUtil::GetTupleElementShape(
-      shaped_buffer->on_device_shape(), index);
-
-  ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
-                                 shaped_buffer->platform(),
-                                 shaped_buffer->device_ordinal());
-
-  auto& shape_tree = shaped_buffer->buffers();
-  auto& sub_shape_tree = sub_shaped_buffer.buffers();
-  sub_shape_tree.CopySubtreeFrom(shape_tree,
-                                 /*source_base_index=*/{index},
-                                 /*target_base_index=*/{});
-  shape_tree.ForEachMutableElement(
-      [index](const xla::ShapeIndex& shape_index,
-              tensorflow::se::DeviceMemoryBase* data) {
-        // shape_index is empty for the root node. Ignore that.
-        if (!shape_index.empty() && shape_index[0] == index) {
-          *data = tensorflow::se::DeviceMemoryBase(nullptr, 0);
-        }
-      });
-  return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator);
-}
-}  // namespace internal
-using internal::ExtractSubShapedBuffer;
-
 XlaComputationLaunchContext::XlaComputationLaunchContext(
     xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator,
     bool allocate_xla_tensors, bool use_multiple_streams)
@@ -160,10 +232,7 @@ void XlaComputationLaunchContext::PopulateInputs(
       CHECK(stream) << "Must have a stream available when using XLA tensors!";
       XlaTensor* xla_tensor = XlaTensor::FromTensor(t);
       CHECK(xla_tensor);
-      if (se::Event* event = xla_tensor->GetDefinitionEvent(stream)) {
-        stream->ThenWaitFor(event);
-        xla_tensor->SetDefinedOn(stream);
-      }
+      xla_tensor->WaitForDefinitionEventOnStream(stream);
     }
 
     const xla::Shape on_device_shape =
@@ -288,10 +357,9 @@ Status XlaComputationLaunchContext::PopulateOutputs(
           TF_RETURN_IF_ERROR(ctx->allocate_output(i, shape, &output_tensor));
           XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor);
           if (xla_tensor) {
-            xla_tensor->set_shaped_buffer(ScopedShapedBuffer(
-                ExtractSubShapedBuffer(&output, output_num, xla_allocator_)));
+            xla_tensor->set_shaped_buffer(output.TakeSubTree({output_num}));
             if (use_multiple_streams_) {
-              xla_tensor->SetDefinedOn(stream, definition_event);
+              xla_tensor->ResetDefinitionEvent(definition_event, stream);
             }
           } else {
             // xla_tensor wasn't valid, which must mean this is a zero-element
@@ -315,30 +383,35 @@ Status XlaComputationLaunchContext::PopulateOutputs(
 
   // Apply variable updates, if any.
   VLOG(2) << "Applying variable updates";
+  std::vector<VariableInfo> variable_infos;
+  variable_infos.reserve(kernel->resource_updates.size());
+
   for (int i = 0; i < kernel->resource_updates.size(); ++i) {
-    Allocator* allocator = ctx->device()->GetAllocator({});
     const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i];
     int actual_input_index = write.input_index - missing_ctx_input_prefix;
     if (actual_input_index < 0 || actual_input_index >= ctx->num_inputs()) {
       return errors::Internal("Invalid input index for variable write.");
     }
 
-    se::DeviceMemoryBase buffer = output.buffer({output_num});
-
-    Var* variable = nullptr;
     // TODO(b/35625933): tensorflow::Var should contain a PersistentTensor,
     // not a Tensor.
+    Var* variable = nullptr;
     TF_RETURN_IF_ERROR(LookupOrCreateResource<Var>(
         ctx, HandleFromInput(ctx, actual_input_index), &variable,
         [&write](Var** ptr) {
           *ptr = new Var(write.type);
           return Status::OK();
         }));
+    variable_infos.emplace_back(actual_input_index, variable);
+  }
 
-    core::ScopedUnref s(variable);
+  TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos)));
 
-    mutex_lock ml(*variable->mu());
-    if (variable->tensor()->dtype() != write.type) {
+  for (int i = 0; i < kernel->resource_updates.size(); ++i) {
+    Allocator* allocator = ctx->device()->GetAllocator({});
+    const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i];
+
+    if (variable_infos[i].var()->tensor()->dtype() != write.type) {
       return errors::Internal("Mismatched type in variable write");
     }
 
@@ -346,23 +419,81 @@ Status XlaComputationLaunchContext::PopulateOutputs(
       Tensor output_tensor;
       TF_RETURN_IF_ERROR(
           ctx->allocate_temp(write.type, write.shape, &output_tensor));
-      XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
-      CHECK(xla_tensor);
-      xla_tensor->set_shaped_buffer(
-          ExtractSubShapedBuffer(&output, output_num, xla_allocator_));
-      if (use_multiple_streams_) {
-        xla_tensor->SetDefinedOn(stream, definition_event);
+      if (write.shape.num_elements() > 0) {
+        XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
+        CHECK(xla_tensor);
+        xla_tensor->set_shaped_buffer(output.TakeSubTree({output_num}));
+        if (use_multiple_streams_) {
+          xla_tensor->ResetDefinitionEvent(definition_event, stream);
+        }
       }
-      *variable->tensor() = output_tensor;
+      *variable_infos[i].var()->tensor() = output_tensor;
     } else {
+      se::DeviceMemoryBase buffer = output.buffer({output_num});
+      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
-      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
-      *variable->tensor() = output_tensor;
+      *variable_infos[i].var()->tensor() = output_tensor;
     }
     ++output_num;
   }
   return Status::OK();
 }
 
+Status XlaComputationLaunchContext::BuildXlaCompilerArguments(
+    const std::map<int, Tensor>& constant_args,
+    const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+    std::vector<XlaCompiler::Argument>* args) {
+  args->resize(ctx->num_inputs());
+
+  for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) {
+    XlaCompiler::Argument& arg = (*args)[input_num];
+    if (constant_args.count(input_num) > 0) {
+      // Handles compile-time constants.
+      const Tensor& input = constant_args.at(input_num);
+      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
+      arg.kind = XlaCompiler::Argument::kConstant;
+      arg.type = input.dtype();
+      arg.shape = input.shape();
+      arg.constant_value = input;
+    } else if (variable_args.count(input_num) == 0) {
+      // Handles the non-constant arguments.
+      const Tensor& input = ctx->input(input_num);
+      TF_RET_CHECK(input.dtype() != DT_RESOURCE);
+      if (input.NumElements() > 0) {
+        arg.kind = XlaCompiler::Argument::kParameter;
+      } else {
+        arg.kind = XlaCompiler::Argument::kConstant;
+        arg.constant_value = input;
+      }
+      arg.type = input.dtype();
+      arg.shape = input.shape();
+    } else {
+      // Handles resource variables.
+      const Tensor& input = ctx->input(input_num);
+      TF_RET_CHECK(input.dtype() == DT_RESOURCE);
+      const OptionalTensor& variable = variable_args.at(input_num);
+      arg.name = variable.name;
+      arg.kind = XlaCompiler::Argument::kResource;
+      arg.resource_kind = XlaResource::kVariable;
+      if (variable.present) {
+        const Tensor& value = variable.value;
+        arg.type = value.dtype();
+        arg.shape = value.shape();
+        arg.initialized = true;
+      } else {
+        // The values of uninitialized variables are not passed as inputs, since
+        // they are meaningless. However, it is legal to assign to a resource
+        // variable for the first time inside the XLA computation, so we do
+        // permit uninitialized variables.
+        arg.initialized = false;
+        arg.type = DT_INVALID;
+        arg.shape = TensorShape();
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 326d70a027564343408df356833c97e131495da0..554227f09de0ab4d9e07f199b957657f3121ff06 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -18,6 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
 
+#include "absl/base/thread_annotations.h"
 #include "tensorflow/compiler/jit/xla_compilation_cache.h"
 #include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
@@ -34,17 +35,75 @@ limitations under the License.
 namespace tensorflow {
 class XlaAllocator;
 
-// Takes a snapshot of the values of resource variable arguments, whose
-// indices are specified in `variables` argument. We snapshot tensors that back
+// Struct that represents a possibly-absent Tensor.
+struct OptionalTensor {
+  string name;           // A descriptive name
+  bool present = false;  // Is the tensor present?
+  Tensor value;          // If present, what is the Tensor's value?
+};
+
+// Takes a snapshot of the values of resource variable arguments, whose indices
+// are specified in `variable_indices` argument. We snapshot tensors that back
 // resource variables since concurrent updates may modify the shape, and it is
 // important that the shapes used for compilation match the true shapes of the
 // buffers.
 //
+// We snapshot the entire set of resource variables as one atomic operation.
+// This models Read->* dependencies between resource variable operations.  See
+// jit/resource_operation_safety_analysis for details.
+//
 // Returns a map of TensorFlow argument index to resource variable. If a
 // resource variable is not initialized, the corresponding OptionalTensor
 // will have its `present` field set to false.
-std::map<int, OptionalTensor> SnapshotResourceVariables(
-    OpKernelContext* ctx, absl::Span<const int> variables);
+Status SnapshotResourceVariables(OpKernelContext* ctx,
+                                 absl::Span<const int> variable_indices,
+                                 std::map<int, OptionalTensor>* result);
+
+// Information about the state of a variable passed as input to the _XlaCompile
+// and _XlaRun operators.  Unlocks the resource variable and decrements its
+// refcount on destruction.
+class VariableInfo {
+ public:
+  explicit VariableInfo(int index, Var* var);
+  VariableInfo(VariableInfo&& other);
+
+  VariableInfo& operator=(VariableInfo&& other);
+
+  VariableInfo(const VariableInfo&) = delete;
+  VariableInfo& operator=(const VariableInfo&) = delete;
+
+  // The index of the DT_RESOURCE input to the _XlaCompile/_XlaRun operator.
+  // Note that the indices can be different between _XlaCompile and _XlaRun.
+  int index() const { return index_; }
+
+  // A pointer to the resource variable.  May be null if this VariableInfo is
+  // "empty", i.e. it does not track a resource variable.
+  Var* var() const { return var_; }
+
+  // Returns true if the resource variable lock was successfully acquired by
+  // this thread.
+  bool lock_held() const { return lock_held_; }
+  void set_lock_held() { lock_held_ = true; }
+
+  ~VariableInfo();
+
+ private:
+  int index_;
+  Var* var_;
+
+  // We can't use a optional<mutex_lock> here because it confuses the compiler's
+  // thread safety analysis. Instead we use a boolean flag and release the lock
+  // in the VariableInfo destructor.
+  bool lock_held_ = false;
+};
+
+// Acquires the mutexes for all the variables in `variables` using a
+// deadlock-safe protocol (acquire the mutexes in increasing-address order).
+//
+// `variables` is allowed to contain instances that don't track a resource
+// variable (i.e. variables[i].var() can be null for some i).
+Status LockVariables(absl::Span<VariableInfo> variables)
+    EXCLUSIVE_LOCK_FUNCTION();
 
 // Adapter class that wraps a Tensorflow allocator as an XLA allocator.
 // Assumes that the Tensorflow allocator permits asynchronous deallocation:
@@ -87,6 +146,13 @@ class XlaComputationLaunchContext {
                               bool allocate_xla_tensors,
                               bool use_multiple_streams);
 
+  // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch
+  // op.
+  static Status BuildXlaCompilerArguments(
+      const std::map<int, Tensor>& constant_args,
+      const std::map<int, OptionalTensor>& variable_args, OpKernelContext* ctx,
+      std::vector<XlaCompiler::Argument>* args);
+
   // Add all inputs within `ctx` as XLA arguments (returned by arguments()).
   // `variables` is a map from TensorFlow argument number to resource variable.
   //
@@ -99,7 +165,13 @@ class XlaComputationLaunchContext {
                       const std::map<int, OptionalTensor>& variables,
                       int missing_ctx_input_prefix);
 
-  // Given the XLA output in `output`, populate all outputs of `ctx`.
+  // Given the XLA output in `output`, populate all outputs of `ctx`.  Also
+  // writes out the resource variable updates.
+  //
+  // Updates to all resource variables are written in a single atomic operation.
+  // This models *->Write dependencies between resource variable operations.
+  // See jit/resource_operation_safety_analysis for details.
+  //
   //
   // Assumes that the first `missing_ctx_input_prefix` inputs to the kernel are
   // missing and adjusts input indices accordingly.
@@ -127,19 +199,17 @@ class XlaTensorBuffer : public TensorBuffer {
  public:
   XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size,
                   Allocator* allocator)
-      : expected_size_(expected_size),
+      : TensorBuffer(const_cast<void*>(ptr)),
+        expected_size_(expected_size),
         actual_size_(actual_size),
-        allocator_(allocator) {
-    data_ = const_cast<void*>(ptr);
-  }
+        allocator_(allocator) {}
 
   ~XlaTensorBuffer() override {
-    if (data_) {
-      allocator_->DeallocateRaw(data_);
+    if (data()) {
+      allocator_->DeallocateRaw(data());
     }
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return expected_size_; }
 
   TensorBuffer* root_buffer() override { return this; }
@@ -159,23 +229,11 @@ class XlaTensorBuffer : public TensorBuffer {
   }
 
  private:
-  void* data_;
   size_t expected_size_;
   size_t actual_size_;
   Allocator* allocator_;
 };
 
-// Exposed in this header file for microbenchmarking purposes, but this is an
-// internal implementation detail.
-namespace internal {
-// Return the 'index''th subtree of the given ShapedBuffer as a
-// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
-// subtree, and sets the input's buffer pointers to nullptr for the subtree.
-xla::ScopedShapedBuffer ExtractSubShapedBuffer(
-    xla::ShapedBuffer* shaped_buffer, int index,
-    xla::DeviceMemoryAllocator* allocator);
-}  // namespace internal
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc
deleted file mode 100644
index a45932403ec1760d6b985d5357fd6d84fbf257a2..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/xla_launch_util_test.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Contains microbenchmarks for performance critical functions in
-// xla_launch_util.cc.
-
-#include "tensorflow/compiler/jit/xla_launch_util.h"
-#include "tensorflow/compiler/tf2xla/shape_util.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/test_benchmark.h"
-
-// Test ExtractSubBuffer with different depths (depth of ShapeTree) and fan-outs
-// (cardinality of each non-leaf node's children).
-void BM_ExtractSubBuffer(int iters, int depth, int fan_out) {
-  tensorflow::testing::StopTiming();
-  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128});
-  for (int i = 0; i < depth; ++i) {
-    std::vector<xla::Shape> shapes(fan_out, shape);
-    shape = xla::ShapeUtil::MakeTupleShape(shapes);
-  }
-  xla::ShapedBuffer shaped_buffer(shape, shape, /*platform=*/nullptr,
-                                  /*device_ordinal=*/0);
-  tensorflow::testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    // Extract a buffer from approximately the middle of the first level of the
-    // tree.
-    (void)tensorflow::internal::ExtractSubShapedBuffer(&shaped_buffer,
-                                                       /*index=*/fan_out / 2,
-                                                       /*allocator=*/nullptr)
-        .release();
-  }
-}
-
-BENCHMARK(BM_ExtractSubBuffer)
-    ->ArgPair(1, 4)
-    ->ArgPair(1, 8)
-    ->ArgPair(1, 32)
-    ->ArgPair(1, 64)
-    ->ArgPair(1, 128)
-    ->ArgPair(1, 256)
-    ->ArgPair(1, 512)
-    ->ArgPair(2, 4)
-    ->ArgPair(2, 8)
-    ->ArgPair(2, 32)
-    ->ArgPair(2, 64)
-    ->ArgPair(2, 128);
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  tensorflow::testing::RunBenchmarks();
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index 92ba7de1b7d32fcf693cd12a380d7a1e0d861d71..d1f7f754c8338487557eda512c56be34c9e958b7 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -43,11 +43,10 @@ namespace tensorflow {
   }
 }
 
-Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+Status XlaTensor::AllocateShapedBuffer(DataType dtype,
+                                       const xla::Shape& on_host_shape,
                                        xla::LocalClient* client,
                                        int device_ordinal) {
-  xla::Shape on_host_shape;
-  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &on_host_shape));
   xla::Shape on_device_shape =
       client->backend().transfer_manager()->HostShapeToDeviceShape(
           on_host_shape);
@@ -73,10 +72,10 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
   return Status::OK();
 }
 
-se::Event* XlaTensor::GetDefinitionEvent(se::Stream* stream) {
+void XlaTensor::WaitForDefinitionEventOnStream(se::Stream* stream) {
   mutex_lock lock(mu_);
   if (!definition_event_) {
-    return nullptr;
+    return;
   }
 
   // The set of defined streams is expected to be very small indeed (usually
@@ -84,24 +83,20 @@ se::Event* XlaTensor::GetDefinitionEvent(se::Stream* stream) {
   if (std::find(streams_defined_on_.begin(), streams_defined_on_.end(),
                 stream) != streams_defined_on_.end()) {
     // stream is in streams_defined_on_; it doesn't need to be waited on.
-    return nullptr;
+    return;
   }
 
-  return definition_event_.get();
+  stream->ThenWaitFor(definition_event_.get());
+  streams_defined_on_.push_back(stream);
 }
 
-void XlaTensor::SetDefinedOn(se::Stream* stream,
-                             std::shared_ptr<se::Event> event) {
+void XlaTensor::ResetDefinitionEvent(std::shared_ptr<se::Event> event,
+                                     se::Stream* stream) {
   mutex_lock lock(mu_);
   definition_event_ = std::move(event);
   streams_defined_on_ = {stream};
 }
 
-void XlaTensor::SetDefinedOn(se::Stream* stream) {
-  mutex_lock lock(mu_);
-  streams_defined_on_.push_back(stream);
-}
-
 // The pointer tag, OR-ed into the XlaTensor's address to distinguish it from
 // device-side tensors, which are either CPU or GPU memory pointers. This works
 // because we're guaranteed that CPU and GPU pointers are aligned to > 1 bits.
diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h
index d95da63405889dfd0c279b17789a2195072c7277..77e80aa2527ecc2221ac61f7b7e6ebcce0982931 100644
--- a/tensorflow/compiler/jit/xla_tensor.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -50,7 +50,7 @@ class XlaTensor {
   // Assign the internal ShapedBuffer to new memory for the given dtype and
   // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it
   // is replaced and the managed memory deallocated.
-  Status AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+  Status AllocateShapedBuffer(DataType dtype, const xla::Shape& on_host_shape,
                               xla::LocalClient* client, int device_ordinal);
 
   // Some Tensors can have complex on-device shapes, including tuple shapes. To
@@ -88,23 +88,19 @@ class XlaTensor {
     host_tensor_.reset(new Tensor(tensor));
   }
 
-  // If the tensor's content is not yet defined on 'stream', and there exists an
-  // se::Event declaring when the tensor's content is defined, return it.
-  // Otherwise, return nullptr. If this function returns nullptr then the
-  // tensor's content can be read on 'stream' without additional
-  // synchronization.
-  se::Event* GetDefinitionEvent(se::Stream* stream);
-
-  // Assert that the tensor's content is defined on 'stream' by the time 'event'
-  // triggers.
-  void SetDefinedOn(se::Stream* stream, std::shared_ptr<se::Event> event);
-
-  // Assert that the tensor's content is defined on 'stream'. This version does
-  // not provide an event, and must be called *after* SetDefinedOn(Stream,
-  // Event). This call can be read as an assertion that the definition event has
-  // been waited on by 'stream', so further calls to GetDefinitionEvent(stream)
-  // do not need to also wait on the event.
-  void SetDefinedOn(se::Stream* stream);
+  // Adds synchronization events to 'stream' that wait for this tensor to be
+  // defined on 'stream'. Does nothing if the tensor is already defined on that
+  // stream.
+  void WaitForDefinitionEventOnStream(se::Stream* stream);
+
+  // (Re)sets the definition event of the tensor to 'event', and promises that
+  // the tensor has already been defined on stream. Removes any previous
+  // definition event or any previous promises about the tensor being defined on
+  // streams.
+  // It is legal to reset the definition event of a tensor when overwriting the
+  // tensor's value (at which point, it is effectively a new tensor once again.)
+  void ResetDefinitionEvent(std::shared_ptr<se::Event> event,
+                            se::Stream* stream);
 
   // Convert from a raw pointer to an XlaTensor, removing the pointer tag.
   static XlaTensor* FromOpaquePointer(void* ptr);
diff --git a/tensorflow/compiler/plugin/README.md b/tensorflow/compiler/plugin/README.md
index 9dd0d2bdab5e2c990fd547cef4b657253c545715..07465934aec0364eb03ddfb7f99ea54aaf084fff 100644
--- a/tensorflow/compiler/plugin/README.md
+++ b/tensorflow/compiler/plugin/README.md
@@ -1,5 +1,4 @@
-3rd party XLA devices
----------------------
+## 3rd party XLA devices
 
 This directory is intended as a place for 3rd party XLA devices which are _not_
 integrated into the public repository.
@@ -9,8 +8,5 @@ can be included as a dependency of the JIT subsystem.
 
 For integration into the unit test system, see the files:
 
-- tensorflow/compiler/tests/plugin.bzl
-- tensorflow/compiler/xla/tests/plugin.bzl
-
-
-- 
+-   tensorflow/compiler/tests/plugin.bzl
+-   tensorflow/compiler/xla/tests/plugin.bzl
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index ba2401ed2628beeba2be3bf59a067c3d87ca3f9f..093b61629cd0b04d5d8488139b8d7262b739f86d 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -294,33 +294,6 @@ tf_xla_py_test(
     ],
 )
 
-tf_xla_py_test(
-    name = "oom_test",
-    size = "medium",
-    srcs = ["oom_test.py"],
-    # TODO(b/80081500): Re-enable on GPU. Disabled on 2018-05-21.
-    disabled_backends = [
-        "cpu",
-        "cpu_ondemand",
-        "gpu",
-    ],
-    tags = [
-        # Allocates very large amounts of memory and does not work under TSAN.
-        "notsan",
-        "optonly",  # Times out frequently in fastbuild.
-    ],
-    deps = [
-        ":xla_test",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:array_ops_gen",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:gradient_checker",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 tf_xla_py_test(
     name = "conv2d_test",
     size = "medium",
@@ -435,13 +408,6 @@ tf_xla_py_test(
     name = "eager_test",
     size = "large",
     srcs = ["eager_test.py"],
-    disabled_backends = [
-        # TODO(b/78199195) Support XLA CPU devices in eager runtime
-        "cpu",
-        "cpu_ondemand",
-        # TODO(b/78468222) Enable GPU backend
-        "gpu",
-    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -476,12 +442,11 @@ tf_xla_py_test(
     tags = ["optonly"],
     deps = [
         ":xla_test",
-        "//tensorflow/contrib/signal:signal_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:extra_py_tests_deps",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:spectral_ops",
+        "//tensorflow/python/ops/signal",
     ],
 )
 
@@ -516,8 +481,6 @@ tf_xla_py_test(
     name = "function_test",
     size = "small",
     srcs = ["function_test.py"],
-    # Functions are not implemented in the on-demand compilation model yet.
-    disabled_backends = "cpu_ondemand",
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -707,9 +670,6 @@ tf_xla_py_test(
     name = "random_ops_test",
     size = "small",
     srcs = ["random_ops_test.py"],
-    disabled_backends = [
-        "cpu_ondemand",
-    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -740,7 +700,6 @@ tf_xla_py_test(
     name = "reduce_window_test",
     size = "small",
     srcs = ["reduce_window_test.py"],
-    disabled_backends = ["cpu_ondemand"],
     deps = [
         ":xla_test",
         "//tensorflow/compiler/tf2xla/python:xla",
@@ -849,8 +808,6 @@ tf_xla_py_test(
     name = "stack_ops_test",
     size = "small",
     srcs = ["stack_ops_test.py"],
-    # Stack ops are not implemented in the on-demand compilation model yet.
-    disabled_backends = "cpu_ondemand",
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -867,9 +824,9 @@ tf_xla_py_test(
     tags = ["optonly"],
     deps = [
         ":xla_test",
-        "//tensorflow/contrib/stateless",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:stateless_random_ops",
     ],
 )
 
@@ -878,7 +835,7 @@ tf_xla_py_test(
     size = "small",
     srcs = ["tensor_array_ops_test.py"],
     # TensorArray ops are not implemented in the on-demand compilation model yet.
-    disabled_backends = "cpu_ondemand",
+    disabled_backends = ["cpu_ondemand"],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -899,7 +856,7 @@ tf_xla_py_test(
     size = "small",
     srcs = ["tensor_list_ops_test.py"],
     # TensorList ops are not implemented in the on-demand compilation model yet.
-    disabled_backends = "cpu_ondemand",
+    disabled_backends = ["cpu_ondemand"],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -979,7 +936,6 @@ tf_xla_py_test(
     name = "while_test",
     size = "small",
     srcs = ["while_test.py"],
-    disabled_backends = ["cpu_ondemand"],
     deps = [
         ":xla_test",
         "//tensorflow/compiler/tf2xla/python:xla",
@@ -1089,6 +1045,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["jit_test.py"],
     additional_deps = [
+        ":test_utils",
         "//tensorflow/contrib/compiler:compiler_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -1107,6 +1064,7 @@ cuda_py_test(
     size = "small",
     srcs = ["dense_layer_test.py"],
     additional_deps = [
+        ":test_utils",
         "//tensorflow/contrib/compiler:compiler_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -1134,6 +1092,7 @@ cc_library(
         "//tensorflow/core:test",
         "//tensorflow/core:testlib",
         "//tensorflow/core/kernels:ops_util",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
@@ -1244,7 +1203,6 @@ tf_xla_py_test(
     name = "xla_ops_test",
     size = "medium",
     srcs = ["xla_ops_test.py"],
-    disabled_backends = ["cpu_ondemand"],
     deps = [
         ":xla_test",
         "//tensorflow/compiler/tf2xla/python:xla",
diff --git a/tensorflow/compiler/tests/adagrad_da_test.py b/tensorflow/compiler/tests/adagrad_da_test.py
index 69fb3ec2964a09508e612515b9e291fc14121d68..e9c2d363acab96c0fb968cb7f901ce105ea8703e 100644
--- a/tensorflow/compiler/tests/adagrad_da_test.py
+++ b/tensorflow/compiler/tests/adagrad_da_test.py
@@ -50,8 +50,8 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllClose([0.0, 0.0], var0.eval())
-        self.assertAllClose([0.0, 0.0], var1.eval())
+        self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+        self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
@@ -63,9 +63,9 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
         # For -0.1*3.0*(0.1 - 0)/(0 + sqrt(0.1 + 0.1*0.1)) = -0.904534
         # similarly for others.
         self.assertAllCloseAccordingToType(
-            np.array([-0.904534, -1.603567]), var0.eval())
+            np.array([-0.904534, -1.603567]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.094821, -0.189358]), var1.eval())
+            np.array([-0.094821, -0.189358]), self.evaluate(var1))
 
   def testAdagradDAwithoutRegularizationBasic2(self):
     for dtype in self.float_types:
@@ -87,16 +87,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
 
         self.assertAllCloseAccordingToType(
-            np.array([-0.904534, -1.603567]), var0.eval())
+            np.array([-0.904534, -1.603567]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.094821, -0.189358]), var1.eval())
+            np.array([-0.094821, -0.189358]), self.evaluate(var1))
 
   def testAdagradDAWithL1(self):
     for dtype in self.float_types:
@@ -118,16 +118,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
 
         self.assertAllCloseAccordingToType(
-            np.array([-0.895489, -1.59555]), var0.eval())
+            np.array([-0.895489, -1.59555]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.085339, -0.17989]), var1.eval())
+            np.array([-0.085339, -0.17989]), self.evaluate(var1))
 
   def testAdagradDAWithL1_L2(self):
     for dtype in self.float_types:
@@ -149,16 +149,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
 
         self.assertAllCloseAccordingToType(
-            np.array([-0.046907, -0.093659]), var0.eval())
+            np.array([-0.046907, -0.093659]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.004275, -0.009023]), var1.eval())
+            np.array([-0.004275, -0.009023]), self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py
index ab69319c59fb07e7ce56c3c287a50a6290effdfd..e26483303c3934fd51675cb1fbc998b276caf527 100644
--- a/tensorflow/compiler/tests/adagrad_test.py
+++ b/tensorflow/compiler/tests/adagrad_test.py
@@ -42,17 +42,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 3 steps of adagrad
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0),
             float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1),
             float_rtol=1e-5)
 
   def testTensorLearningRate(self):
@@ -68,17 +70,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 3 steps of adagrad
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0),
             float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1),
             float_rtol=1e-5)
 
   def testSharing(self):
@@ -103,18 +107,20 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values.
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Mix the first and the second adagrad for 3 steps.
         ada_update1.run()
         ada_update2.run()
         ada_update1.run()
         # Validate updated params (the same as with only 1 Adagrad).
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0),
             float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1),
             float_rtol=1e-5)
 
 
diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py
index 058576b3d4b695209952158769162bb24e7ccfce..8bcff9d379d34f8a6bb8b0fdc60b7588c6d80be9 100644
--- a/tensorflow/compiler/tests/adam_test.py
+++ b/tensorflow/compiler/tests/adam_test.py
@@ -75,23 +75,24 @@ class AdamOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testTensorLearningRate(self):
     for dtype in self.float_types:
@@ -117,23 +118,24 @@ class AdamOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testSharing(self):
     for dtype in self.float_types:
@@ -162,13 +164,14 @@ class AdamOptimizerTest(xla_test.XLATestCase):
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 3 steps of intertwined Adam1 and Adam2.
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           if t % 2 == 0:
             update1.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
           else:
@@ -178,8 +181,8 @@ class AdamOptimizerTest(xla_test.XLATestCase):
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/adamax_test.py b/tensorflow/compiler/tests/adamax_test.py
index 3ed1d41b7121f44dd7470f61180f7a7055369174..961b46375c941bdc3922e460a2f58345086dbceb 100644
--- a/tensorflow/compiler/tests/adamax_test.py
+++ b/tensorflow/compiler/tests/adamax_test.py
@@ -78,8 +78,8 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power = opt._get_beta_accumulators()
 
@@ -87,14 +87,17 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
         for t in range(1, 4):
           update.run()
 
-          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
 
           var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval(), rtol=1e-2)
-          self.assertAllCloseAccordingToType(var1_np, var1.eval(), rtol=1e-2)
+          self.assertAllCloseAccordingToType(
+              var0_np, self.evaluate(var0), rtol=1e-2)
+          self.assertAllCloseAccordingToType(
+              var1_np, self.evaluate(var1), rtol=1e-2)
           self.assertEqual("var0_%d/AdaMax:0" % (i,),
                            opt.get_slot(var=var0, name="m").name)
 
@@ -118,22 +121,23 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power = opt._get_beta_accumulators()
 
         # Run 3 steps of AdaMax
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
           update.run()
 
           var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/addsign_test.py b/tensorflow/compiler/tests/addsign_test.py
index 1bc07ace23ccdc83103abe71ee11b72994c75a6d..a37c97e6d374440aeb860b9d02f2d5dd95c91f62 100644
--- a/tensorflow/compiler/tests/addsign_test.py
+++ b/tensorflow/compiler/tests/addsign_test.py
@@ -90,8 +90,8 @@ class AddSignTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 7 steps of AddSign
         # first 4 steps with positive gradient
@@ -125,8 +125,8 @@ class AddSignTest(xla_test.XLATestCase):
 
           # Validate updated params
           self.assertAllCloseAccordingToType(
-              var0_np, var0.eval(), half_rtol=1e-2)
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+              var0_np, self.evaluate(var0), half_rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testDense(self):
     decay_steps = 10
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 1b39d53dc0908e1fa05f766ca1e601731b26846d..9a5423c1b2a5df7880453cbb328f6a8174066255 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+
 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
@@ -178,6 +180,13 @@ class BinaryOpsTest(xla_test.XLATestCase):
               [0, 0, 0, 0, 0, 0.1, 0.3, 0.5, 0.7, 0.9, 6.1, 10.0], dtype=dtype),
           expected=np.array([0, 0, 0, 0, 0, 6, 7, 8, 9, 10, 0, 0], dtype=dtype))
 
+      self._testBinary(
+          gen_nn_ops.leaky_relu_grad,
+          np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=dtype),
+          np.array([0, 0, 0, 0, 0, 0.1, 0.3, 0.5, 0.7, 0.9], dtype=dtype),
+          expected=np.array([0.2, 0.4, 0.6, 0.8, 1, 6, 7, 8, 9, 10],
+                            dtype=dtype))
+
       self._testBinary(
           gen_nn_ops.softmax_cross_entropy_with_logits,
           np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=dtype),
@@ -209,6 +218,21 @@ class BinaryOpsTest(xla_test.XLATestCase):
             ],
             equality_test=self.ListsAreClose)
 
+      # TF doesn't define these for bf16.
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        self._testBinary(
+            gen_math_ops.xdivy,
+            np.array([0, 4, 3, 2, 1, 0], dtype=dtype),
+            np.array([0, 5, 6, 7, 8, float("NaN")], dtype=dtype),
+            expected=np.array([0, 0.8, 0.5, 0.285714, 0.125, 0], dtype=dtype))
+
+        self._testBinary(
+            gen_math_ops.xlogy,
+            np.array([0, 4, 3, 2, 1, 0], dtype=dtype),
+            np.array([0, 5, 6, 7, 8, float("NaN")], dtype=dtype),
+            expected=np.array([0, 6.437752, 5.375278, 3.89182, 2.079442, 0],
+                              dtype=dtype))
+
   def testIntOps(self):
     for dtype in self.signed_int_types:
       self._testBinary(
@@ -960,7 +984,7 @@ class BinaryOpsTest(xla_test.XLATestCase):
       self._testBinary(
           array_ops.expand_dims,
           np.array([42], dtype=dtype),
-          np.int32(0),
+          np.array([0], dtype=np.int64),
           expected=np.array([[42]], dtype=dtype))
       self._testBinary(
           array_ops.expand_dims,
@@ -987,15 +1011,21 @@ class BinaryOpsTest(xla_test.XLATestCase):
           np.array([[[1, 2], [3, 4]]], dtype=dtype),
           np.int32(3),
           expected=np.array([[[[1], [2]], [[3], [4]]]], dtype=dtype))
+      self._testBinary(
+          array_ops.expand_dims,
+          np.array([[[1, 2], [3, 4]]], dtype=dtype),
+          np.array([2], dtype=np.int64),
+          expected=np.array([[[[1, 2]], [[3, 4]]]], dtype=dtype))
 
   def testPad(self):
-    for dtype in self.numeric_types:
+    for dtype, pad_type in itertools.product(
+        self.numeric_types, [np.int32, np.int64]):
       self._testBinary(
           array_ops.pad,
           np.array(
               [[1, 2, 3], [4, 5, 6]], dtype=dtype),
           np.array(
-              [[1, 2], [2, 1]], dtype=np.int32),
+              [[1, 2], [2, 1]], dtype=pad_type),
           expected=np.array(
               [[0, 0, 0, 0, 0, 0],
                [0, 0, 1, 2, 3, 0],
@@ -1009,7 +1039,7 @@ class BinaryOpsTest(xla_test.XLATestCase):
           np.array(
               [[1, 2, 3], [4, 5, 6]], dtype=dtype),
           np.array(
-              [[0, 3], [2, 1]], dtype=np.int32),
+              [[0, 3], [2, 1]], dtype=pad_type),
           expected=np.array(
               [[7, 7, 1, 2, 3, 7],
                [7, 7, 4, 5, 6, 7],
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 1d3979b21bfd915a641fabe1ef40301b3e5a17b4..447a7de2cb6526a5dcf7789d4f2bffb5e733e8c0 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -50,6 +50,8 @@ def tf_xla_py_test(
     """
     if disabled_backends == None:
         disabled_backends = []
+    if type(disabled_backends) != "list":
+        fail("disabled_backends must be a list of strings", "disabled_backends")
 
     enabled_backends = [b for b in all_backends() if b not in disabled_backends]
     test_names = []
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
index a57d1dc81ea2c9c188b0a3005904738aa8156bf3..5d5e486f616937601214aa169a4c329ab78932c8 100644
--- a/tensorflow/compiler/tests/categorical_op_test.py
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import stateless_random_ops
 from tensorflow.python.platform import googletest
 
 
@@ -56,11 +57,11 @@ class CategoricalTest(xla_test.XLATestCase):
     Returns:
       Frequencies from sampled classes; shape [batch_size, num_classes].
     """
-    with self.cached_session() as sess, self.test_scope():
+    with self.cached_session(), self.test_scope():
       random_seed.set_random_seed(1618)
       op = random_ops.multinomial(logits, num_samples,
                                   output_dtype=dtypes.int32)
-      d = sess.run(op)
+      d = self.evaluate(op)
 
     batch_size, num_classes = logits.shape
     freqs_mat = []
@@ -79,15 +80,15 @@ class CategoricalTest(xla_test.XLATestCase):
 
   def _testRngIsNotConstant(self, rng, dtype, output_dtype):
     # Tests that 'rng' does not always return the same value.
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         x = rng(dtype, output_dtype)
 
       # The random-number generator, if working correctly, should produce the
       # same output multiple times with low probability.
-      y = sess.run(x)
-      z = sess.run(x)
-      w = sess.run(x)
+      y = self.evaluate(x)
+      z = self.evaluate(x)
+      w = self.evaluate(x)
 
       # We use exact equality here. If the random-number generator is producing
       # deterministic output, all three outputs will be bitwise identical.
@@ -107,12 +108,12 @@ class CategoricalTest(xla_test.XLATestCase):
   def testCategoricalIsInRange(self):
     for dtype in self.float_types:
       for output_dtype in self.output_dtypes():
-        with self.cached_session() as sess:
+        with self.cached_session():
           with self.test_scope():
             x = random_ops.multinomial(
                 array_ops.ones(shape=[1, 20], dtype=dtype), 1000,
                 output_dtype=output_dtype)
-          y = sess.run(x)
+          y = self.evaluate(x)
           self.assertTrue((y >= 0).sum() == 1000)
           self.assertTrue((y < 20).sum() == 1000)
 
@@ -138,6 +139,57 @@ class CategoricalTest(xla_test.XLATestCase):
       chi2 = self._chi2(probs, freqs)
       self.assertLess(chi2, 1e-3)
 
+  def testStatelessMultinomialIsInRange(self):
+    for dtype in self.float_types:
+      for output_dtype in self.output_dtypes():
+        with self.cached_session() as sess:
+          with self.test_scope():
+            seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+            x = stateless_random_ops.stateless_multinomial(
+                array_ops.ones(shape=[1, 20], dtype=dtype),
+                1000,
+                seed_t,
+                output_dtype=output_dtype)
+          y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
+          self.assertTrue((y >= 0).sum() == 1000)
+          self.assertTrue((y < 20).sum() == 1000)
+
+  def testDeterminismMultinomial(self):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    num_samples = 10
+    with self.cached_session(), self.test_scope():
+      seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+      seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+      for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                [0.25, 0.75]]):
+        pure = stateless_random_ops.stateless_multinomial(
+            logits, num_samples, seed=seed_t)
+        values = [(seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds]
+        for s0, v0 in values:
+          for s1, v1 in values:
+            self.assertEqual(s0 == s1, np.all(v0 == v1))
+
+  def testEmpty(self):
+    with self.cached_session():
+      with self.test_scope():
+        x = random_ops.multinomial(
+            array_ops.zeros([42, 40]), 0, output_dtype=dtypes.int32)
+        y = self.evaluate(x)
+        self.assertEqual(y.shape, (42, 0))
+
+  def testEmptyStateless(self):
+    with self.cached_session() as sess:
+      with self.test_scope():
+        seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+        x = stateless_random_ops.stateless_multinomial(
+            array_ops.zeros([42, 40]),
+            0,
+            seed=seed_t,
+            output_dtype=dtypes.int32)
+        y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
+        self.assertEqual(y.shape, (42, 0))
+
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/compiler/tests/clustering_test.py b/tensorflow/compiler/tests/clustering_test.py
index 88bd58b2da6b2892f898ad10f3467d8ce39d6388..ef2d7af69deeebd5f4c4c7225d7027f8f76bf861 100644
--- a/tensorflow/compiler/tests/clustering_test.py
+++ b/tensorflow/compiler/tests/clustering_test.py
@@ -43,7 +43,7 @@ class ClusteringTest(xla_test.XLATestCase):
         input1 = constant_op.constant(val1, name="const1")
         input2 = constant_op.constant(val2, name="const2")
         output = math_ops.add(input1, input2)
-      result = output.eval()
+      result = self.evaluate(output)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testAddFromCpuMultiple(self):
@@ -57,7 +57,7 @@ class ClusteringTest(xla_test.XLATestCase):
       with self.test_scope():
         output = math_ops.add(input1, input2)
       for _ in xrange(10):
-        result = output.eval()
+        result = self.evaluate(output)
         self.assertAllClose(result, expected, rtol=1e-3)
 
   def testDeadlock(self):
diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py
index 2d225ad226cac368042b95eae8fc29e6fd8e82e0..2187f57960f80300d631bdc7eb8fe5e9c8dddeea 100644
--- a/tensorflow/compiler/tests/concat_ops_test.py
+++ b/tensorflow/compiler/tests/concat_ops_test.py
@@ -72,7 +72,7 @@ class ConcatTest(xla_test.XLATestCase):
       x2 = constant_op.constant(p2)
       with self.test_scope():
         c = array_ops.concat([x1, x2], 0)
-      result = c.eval()
+      result = self.evaluate(c)
     self.assertAllEqual(result[:2, :], p1)
     self.assertAllEqual(result[2:, :], p2)
 
@@ -150,7 +150,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, 1)
-      result = concated_grad.eval()
+      result = self.evaluate(concated_grad)
     self.assertAllEqual(result, grad_inp)
 
   def testGradientsSimpleAll(self):
@@ -177,7 +177,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, 0)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -205,7 +205,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, 2)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -242,7 +242,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, concat_dim)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -254,7 +254,7 @@ class ConcatTest(xla_test.XLATestCase):
   def DISABLED_testZeroSize(self):
     # Verify that concat doesn't crash and burn for zero size inputs
     np.random.seed(7)
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         for shape0 in (), (2,):
           axis = len(shape0)
@@ -270,7 +270,7 @@ class ConcatTest(xla_test.XLATestCase):
                 self.assertAllEqual(c.eval(), correct)
                 # Check gradients
                 dc = np.random.randn(*c.get_shape().as_list())
-                dxs = sess.run(gradients_impl.gradients(c, xs, dc))
+                dxs = self.evaluate(gradients_impl.gradients(c, xs, dc))
                 self.assertAllEqual(dc, np.concatenate(dxs, axis=axis))
 
   def testConcatTuple(self):
@@ -280,7 +280,7 @@ class ConcatTest(xla_test.XLATestCase):
       with self.test_scope():
         concat_list_t = array_ops.concat([c1, c2], 0)
         concat_tuple_t = array_ops.concat((c1, c2), 0)
-      self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval())
+      self.assertAllEqual(concat_list_t.eval(), self.evaluate(concat_tuple_t))
 
   def testConcatNoScalars(self):
     with self.cached_session():
@@ -330,47 +330,47 @@ class ConcatTest(xla_test.XLATestCase):
 class ConcatOffsetTest(xla_test.XLATestCase):
 
   def testBasic(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         cdim = constant_op.constant(1, dtypes.int32)
         s0 = constant_op.constant([2, 3, 5], dtypes.int32)
         s1 = constant_op.constant([2, 7, 5], dtypes.int32)
         s2 = constant_op.constant([2, 20, 5], dtypes.int32)
         off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-        ans = sess.run(off)
+        ans = self.evaluate(off)
         self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
 
 class PackTest(xla_test.XLATestCase):
 
   def testBasic(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         s0 = constant_op.constant([2, 3, 5], dtypes.int32)
         s1 = constant_op.constant([2, 7, 5], dtypes.int32)
         s2 = constant_op.constant([2, 20, 5], dtypes.int32)
         packed = array_ops.stack([s0, s1, s2])
-        ans = sess.run(packed)
+        ans = self.evaluate(packed)
         self.assertAllEqual(ans, [[2, 3, 5], [2, 7, 5], [2, 20, 5]])
 
   def testScalars(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         s0 = constant_op.constant(2, dtypes.int32)
         s1 = constant_op.constant(3, dtypes.int32)
         s2 = constant_op.constant(5, dtypes.int32)
         packed = array_ops.stack([s0, s1, s2])
-        ans = sess.run(packed)
+        ans = self.evaluate(packed)
         self.assertAllEqual(ans, [2, 3, 5])
 
   def testEmpty(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         s0 = constant_op.constant([[]], dtypes.int32)
         s1 = constant_op.constant([[]], dtypes.int32)
         s2 = constant_op.constant([[]], dtypes.int32)
         packed = array_ops.stack([s0, s1, s2])
-        ans = sess.run(packed)
+        ans = self.evaluate(packed)
         self.assertAllEqual(ans, [[[]], [[]], [[]]])
 
 
diff --git a/tensorflow/compiler/tests/conv3d_test.py b/tensorflow/compiler/tests/conv3d_test.py
index 33fd983b5485e503c2fcc96db2dfdecfc41e309f..01cc1b6392845be2418c50d55be97487eb290843 100644
--- a/tensorflow/compiler/tests/conv3d_test.py
+++ b/tensorflow/compiler/tests/conv3d_test.py
@@ -85,7 +85,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       # We count the number of cells being added at the locations in the output.
       # At the center, #cells = kernel_depth * kernel_height * kernel_width
@@ -135,7 +135,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       for n in xrange(x_shape[0]):
         for k in xrange(f_shape[3]):
@@ -173,7 +173,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
@@ -225,7 +225,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
       err = gradient_checker.compute_gradient_error([x, f], [x_shape, f_shape],
                                                     output, y_shape)
     print("conv3d_transpose gradient err = %g " % err)
-    err_tolerance = 0.0005
+    err_tolerance = 0.001
     self.assertLess(err, err_tolerance)
 
 
diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py
index 9390870e07d6b5bd90dbc5c04bac0946595dcf7f..bf5ea7b1fb6fb3c774c4db20d059f131990d20d3 100644
--- a/tensorflow/compiler/tests/dense_layer_test.py
+++ b/tensorflow/compiler/tests/dense_layer_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import numpy as np
 
+from tensorflow.compiler.tests import test_utils
 from tensorflow.contrib.compiler import jit
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.layers import layers
@@ -30,7 +31,6 @@ from tensorflow.python.platform import test
 
 jit_scope = jit.experimental_jit_scope
 
-
 def GetRunMetadataLabels(run_metadata):
   """Returns all labels in run_metadata."""
   labels = []
@@ -42,7 +42,7 @@ def GetRunMetadataLabels(run_metadata):
 
 def InLabels(labels, substr):
   """Returns true iff one of the labels contains substr."""
-  return any([substr in x for x in labels])
+  return any(substr in x for x in labels)
 
 
 class DenseLayerTest(test.TestCase):
@@ -68,13 +68,14 @@ class DenseLayerTest(test.TestCase):
     config.graph_options.optimizer_options.global_jit_level = (
         config_pb2.OptimizerOptions.ON_1)
 
-    with self.test_session(config=config) as sess:
+    with self.session(config=config) as sess:
       x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32)
       y = layers.dense(x, 3)
 
-      sess.run(variables.initialize_all_variables())
+      self.evaluate(variables.initialize_all_variables())
       run_metadata = config_pb2.RunMetadata()
-      sess.run(
+      test_utils.RunWithWarmup(
+          sess,
           y, {x: np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])},
           run_metadata=run_metadata,
           options=config_pb2.RunOptions(
@@ -96,9 +97,10 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      sess.run(variables.initialize_all_variables())
+      self.evaluate(variables.initialize_all_variables())
       run_metadata = config_pb2.RunMetadata()
-      sess.run(
+      test_utils.RunWithWarmup(
+          sess,
           y, {x: np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])},
           run_metadata=run_metadata,
           options=config_pb2.RunOptions(
@@ -124,9 +126,10 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      sess.run(variables.initialize_all_variables())
+      self.evaluate(variables.initialize_all_variables())
       run_metadata = config_pb2.RunMetadata()
-      sess.run(
+      test_utils.RunWithWarmup(
+          sess,
           y, {x: np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])},
           run_metadata=run_metadata,
           options=config_pb2.RunOptions(
@@ -138,4 +141,6 @@ class DenseLayerTest(test.TestCase):
 
 
 if __name__ == "__main__":
+  os.environ["TF_XLA_FLAGS"] = ("--tf_xla_enable_lazy_compilation=true " +
+                                os.environ.get("TF_XLA_FLAGS", ""))
   test.main()
diff --git a/tensorflow/compiler/tests/depthwise_conv_op_test.py b/tensorflow/compiler/tests/depthwise_conv_op_test.py
index 6ef8a68ca5d35d3d2f78f0cb491e7bb98ff97ac9..174bfa9efbcd7dcb4f895237eb01c17bc4a3a6b4 100644
--- a/tensorflow/compiler/tests/depthwise_conv_op_test.py
+++ b/tensorflow/compiler/tests/depthwise_conv_op_test.py
@@ -255,7 +255,7 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
             t1, t2, strides=[1, stride, stride, 1], padding=padding)
       value = sess.run(conv, {t1: x1, t2: x2})
     print("value = ", value)
-    self.assertArrayNear(expected, np.ravel(value), 1e-5)
+    self.assertArrayNear(expected, np.ravel(value), 1e-4)
     self.assertShapeEqual(value, conv)
 
   def testConv2D2x2Filter(self):
diff --git a/tensorflow/compiler/tests/dynamic_stitch_test.py b/tensorflow/compiler/tests/dynamic_stitch_test.py
index 50b04daa6b9f4159a3c4bdeecaf900a5b35a833c..e89cf975f5d889091ce92a35165aef55ee5ad4b0 100644
--- a/tensorflow/compiler/tests/dynamic_stitch_test.py
+++ b/tensorflow/compiler/tests/dynamic_stitch_test.py
@@ -58,6 +58,15 @@ class DynamicStitchTest(xla_test.XLATestCase):
         [idx1, idx2], [val1, val2],
         expected=np.array([[], [], [], []], np.int32))
 
+  def testEmptyIndex(self):
+    idx1 = np.array([], dtype=np.int32)
+    idx2 = np.array([[], []], dtype=np.int32)
+    val1 = np.ndarray(shape=(0, 9), dtype=np.int32)
+    val2 = np.ndarray(shape=(2, 0, 9), dtype=np.int32)
+    self._AssertDynamicStitchResultIs([idx1, idx2], [val1, val2],
+                                      expected=np.ndarray(
+                                          shape=(0, 9), dtype=np.int32))
+
   def testSimple1D(self):
     val1 = np.array([0, 4, 7], dtype=np.int32)
     val2 = np.array([1, 6, 2, 3, 5], dtype=np.int32)
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index 63cee550fde9d9d4314b1541fba191df776a4da2..2af32b537ba53723370faf81aebf308a465718c7 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -101,12 +101,12 @@ class EagerTest(xla_test.XLATestCase):
       self.assertAllEqual(15, product)
 
     # Run some ops graphly
-    with context.graph_mode(), self.cached_session() as sess:
+    with context.graph_mode(), self.cached_session():
       with self.test_scope():
         three = constant_op.constant(3)
         five = constant_op.constant(5)
         product = three * five
-        self.assertAllEqual(15, sess.run(product))
+        self.assertAllEqual(15, self.evaluate(product))
 
   def testDegenerateSlices(self):
     with self.test_scope():
diff --git a/tensorflow/compiler/tests/fft_test.py b/tensorflow/compiler/tests/fft_test.py
index b3e13fbaa6b33bdaa1be123be558059e96de282e..0edd0c35aa2d417a3ed24decbaa0b5d62d35bb62 100644
--- a/tensorflow/compiler/tests/fft_test.py
+++ b/tensorflow/compiler/tests/fft_test.py
@@ -24,11 +24,10 @@ import numpy as np
 import scipy.signal as sps
 
 from tensorflow.compiler.tests import xla_test
-from tensorflow.contrib.signal.python.ops import spectral_ops as signal
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.signal import signal
 from tensorflow.python.platform import googletest
 
 BATCH_DIMS = (3, 5)
@@ -107,39 +106,39 @@ class FFTTest(xla_test.XLATestCase):
 
   def testFFT(self):
     self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.fft,
-                          spectral_ops.fft)
+                          signal.fft)
 
   def testFFT2D(self):
     self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.fft2,
-                          spectral_ops.fft2d)
+                          signal.fft2d)
 
   def testFFT3D(self):
     self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x,
                           lambda x: np.fft.fftn(x, axes=(-3, -2, -1)),
-                          spectral_ops.fft3d)
+                          signal.fft3d)
 
   def testIFFT(self):
     self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.ifft,
-                          spectral_ops.ifft)
+                          signal.ifft)
 
   def testIFFT2D(self):
     self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.ifft2,
-                          spectral_ops.ifft2d)
+                          signal.ifft2d)
 
   def testIFFT3D(self):
     self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x,
                           lambda x: np.fft.ifftn(x, axes=(-3, -2, -1)),
-                          spectral_ops.ifft3d)
+                          signal.ifft3d)
 
   def testRFFT(self):
     self._VerifyFftMethod(
         INNER_DIMS_1D, np.real, lambda x: np.fft.rfft(x, n=x.shape[-1]),
-        lambda x: spectral_ops.rfft(x, fft_length=[x.shape[-1].value]))
+        lambda x: signal.rfft(x, fft_length=[x.shape[-1].value]))
 
   def testRFFT2D(self):
 
     def _tf_fn(x):
-      return spectral_ops.rfft2d(
+      return signal.rfft2d(
           x, fft_length=[x.shape[-2].value, x.shape[-1].value])
 
     self._VerifyFftMethod(
@@ -153,16 +152,33 @@ class FFTTest(xla_test.XLATestCase):
           x, axes=(-3, -2, -1), s=[x.shape[-3], x.shape[-2], x.shape[-1]])
 
     def _tf_fn(x):
-      return spectral_ops.rfft3d(
+      return signal.rfft3d(
           x,
           fft_length=[x.shape[-3].value, x.shape[-2].value, x.shape[-1].value])
 
     self._VerifyFftMethod(INNER_DIMS_3D, np.real, _to_expected, _tf_fn)
 
+  def testRFFT3DMismatchedSize(self):
+
+    def _to_expected(x):
+      return np.fft.rfftn(
+          x,
+          axes=(-3, -2, -1),
+          s=[x.shape[-3] // 2, x.shape[-2], x.shape[-1] * 2])
+
+    def _tf_fn(x):
+      return signal.rfft3d(
+          x,
+          fft_length=[
+              x.shape[-3].value // 2, x.shape[-2].value, x.shape[-1].value * 2
+          ])
+
+    self._VerifyFftMethod(INNER_DIMS_3D, np.real, _to_expected, _tf_fn)
+
   def testIRFFT(self):
 
     def _tf_fn(x):
-      return spectral_ops.irfft(x, fft_length=[2 * (x.shape[-1].value - 1)])
+      return signal.irfft(x, fft_length=[2 * (x.shape[-1].value - 1)])
 
     self._VerifyFftMethod(
         INNER_DIMS_1D, lambda x: np.fft.rfft(np.real(x), n=x.shape[-1]),
@@ -171,7 +187,7 @@ class FFTTest(xla_test.XLATestCase):
   def testIRFFT2D(self):
 
     def _tf_fn(x):
-      return spectral_ops.irfft2d(
+      return signal.irfft2d(
           x, fft_length=[x.shape[-2].value, 2 * (x.shape[-1].value - 1)])
 
     self._VerifyFftMethod(
@@ -195,7 +211,7 @@ class FFTTest(xla_test.XLATestCase):
           s=[x.shape[-3], x.shape[-2], 2 * (x.shape[-1] - 1)])
 
     def _tf_fn(x):
-      return spectral_ops.irfft3d(
+      return signal.irfft3d(
           x,
           fft_length=[
               x.shape[-3].value, x.shape[-2].value, 2 * (x.shape[-1].value - 1)
@@ -203,6 +219,30 @@ class FFTTest(xla_test.XLATestCase):
 
     self._VerifyFftMethod(INNER_DIMS_3D, _to_input, _to_expected, _tf_fn)
 
+  def testIRFFT3DMismatchedSize(self):
+
+    def _to_input(x):
+      return np.fft.rfftn(
+          np.real(x),
+          axes=(-3, -2, -1),
+          s=[x.shape[-3] // 2, x.shape[-2], x.shape[-1] * 2])
+
+    def _to_expected(x):
+      return np.fft.irfftn(
+          x,
+          axes=(-3, -2, -1),
+          s=[x.shape[-3] // 2, x.shape[-2], x.shape[-1] * 2])
+
+    def _tf_fn(x):
+      return signal.irfft3d(
+          x,
+          fft_length=[
+              x.shape[-3].value // 2, x.shape[-2].value, x.shape[-1].value * 2
+          ])
+
+    self._VerifyFftMethod(INNER_DIMS_3D, _to_input, _to_expected, _tf_fn)
+
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/fifo_queue_test.py b/tensorflow/compiler/tests/fifo_queue_test.py
index 8c7edfd277c992c35a81dd5f261256a86352254e..91d77d2f791834346f43aecb60d116ddbf2faa6e 100644
--- a/tensorflow/compiler/tests/fifo_queue_test.py
+++ b/tensorflow/compiler/tests/fifo_queue_test.py
@@ -129,7 +129,7 @@ class FIFOQueueTest(xla_test.XLATestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
@@ -192,9 +192,9 @@ class FIFOQueueTest(xla_test.XLATestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual(1, size.eval())
+      self.assertEqual(1, self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual(0, size.eval())
+      self.assertEqual(0, self.evaluate(size))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index f1b87a5ffb73bed62a80abaa152d335f64d970c5..b078053cdbd6d129645734492d34dd25d28ab3ef 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -50,14 +50,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run Ftrl for a few steps
     for _ in range(steps):
       ftrl_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def equivAdagradTest_AdagradPart(self, steps, dtype):
     var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype)
@@ -65,14 +65,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     adagrad_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run Adagrad for a few steps
     for _ in range(steps):
       adagrad_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def equivGradientDescentTest_FtrlPart(self, steps, dtype):
     var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype)
@@ -85,14 +85,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run Ftrl for a few steps
     for _ in range(steps):
       ftrl_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def equivGradientDescentTest_GradientDescentPart(self, steps, dtype):
     var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype)
@@ -100,14 +100,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     sgd_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run GradientDescent for a few steps
     for _ in range(steps):
       sgd_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def testFtrlwithoutRegularization(self):
     for dtype in self.float_types:
@@ -124,8 +124,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([0.0, 0.0], var0.eval())
-        self.assertAllClose([0.0, 0.0], var1.eval())
+        self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+        self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
         # Run 3 steps FTRL
         for _ in range(3):
@@ -134,12 +134,12 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         # Validate updated params
         self.assertAllCloseAccordingToType(
             np.array([-2.60260963, -4.29698515]),
-            var0.eval(),
-            float_rtol=1e-5,
+            self.evaluate(var0),
+            float_rtol=1e-4,
             half_rtol=1e-2)
         self.assertAllCloseAccordingToType(
             np.array([-0.28432083, -0.56694895]),
-            var1.eval(),
+            self.evaluate(var1),
             float_rtol=1e-5,
             half_rtol=1e-2)
 
@@ -158,8 +158,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
         # Run 3 steps FTRL
         for _ in range(3):
@@ -167,9 +167,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-2.55607247, -3.98729396]), var0.eval(), 1e-5, 1e-5)
+            np.array([-2.55607247, -3.98729396]),
+            self.evaluate(var0),
+            1e-5,
+            1e-5,
+            float_rtol=1e-4)
         self.assertAllCloseAccordingToType(
-            np.array([-0.28232238, -0.56096673]), var1.eval(), 1e-5, 1e-5)
+            np.array([-0.28232238, -0.56096673]), self.evaluate(var1), 1e-5,
+            1e-5)
 
   def testFtrlWithL1(self):
     for dtype in self.float_types:
@@ -186,8 +191,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -196,12 +201,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         # Validate updated params
         self.assertAllCloseAccordingToType(
             np.array([-7.66718769, -10.91273689]),
-            var0.eval(),
+            self.evaluate(var0),
             rtol=1e-4,
             bfloat16_rtol=1e-1,
             bfloat16_atol=1e-1)
         self.assertAllCloseAccordingToType(
-            np.array([-0.93460727, -1.86147261]), var1.eval(), rtol=1e-4)
+            np.array([-0.93460727, -1.86147261]),
+            self.evaluate(var1),
+            rtol=1e-4)
 
   def testFtrlWithL1_L2(self):
     for dtype in self.float_types:
@@ -218,8 +225,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -227,9 +234,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-0.24059935, -0.46829352]), var0.eval(), rtol=1e-5)
+            np.array([-0.24059935, -0.46829352]),
+            self.evaluate(var0),
+            rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([-0.02406147, -0.04830509]), var1.eval(), rtol=1e-5)
+            np.array([-0.02406147, -0.04830509]),
+            self.evaluate(var1),
+            rtol=1e-5)
 
   def testFtrlWithL1_L2_L2Shrinkage(self):
     """Test the new FTRL op with support for l2 shrinkage.
@@ -253,8 +264,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -262,9 +273,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-0.22578996, -0.44345799]), var0.eval(), rtol=1e-4)
+            np.array([-0.22578996, -0.44345799]),
+            self.evaluate(var0),
+            rtol=1e-4)
         self.assertAllCloseAccordingToType(
-            np.array([-0.14378493, -0.13229476]), var1.eval(), rtol=1e-4)
+            np.array([-0.14378493, -0.13229476]),
+            self.evaluate(var1),
+            rtol=1e-4)
 
   def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self):
     """Verifies that l2 shrinkage in FTRL does not change lr schedule."""
@@ -290,8 +305,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         update1 = opt1.apply_gradients([(grads1, var1)])
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([1.0, 2.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -300,7 +315,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # var0 is experiencing L2 shrinkage so it should be smaller than var1
         # in magnitude.
-        self.assertTrue((var0.eval()**2 < var1.eval()**2).all())
+        self.assertTrue((var0.eval()**2 < self.evaluate(var1)**2).all())
         accum0 = list(opt0._slots["accum"].values())[0].eval()
         accum1 = list(opt1._slots["accum"].values())[0].eval()
         # L2 shrinkage should not change how we update grad accumulator.
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index b1891b918c6584abce9da382088ed0037f5319fb..a61827c2ae44de117abad5b7db5c6bcd78fa171e 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -40,7 +40,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32)
     expected = APlus2B(aval, bval)
 
-    with self.cached_session() as sess:
+    with self.cached_session():
 
       @function.Defun(dtypes.float32, dtypes.float32)
       def Foo(a, b):
@@ -50,7 +50,7 @@ class FunctionTest(xla_test.XLATestCase):
       b = constant_op.constant(bval, name="b")
       with self.test_scope():
         call_f = Foo(a, b)
-      result = sess.run(call_f)
+      result = self.evaluate(call_f)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testNestedFunctions(self):
@@ -66,7 +66,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([4, 3, 2, 1]).reshape([2, 2]).astype(np.float32)
     expected = APlus2B(aval, bval)
 
-    with self.cached_session() as sess:
+    with self.cached_session():
 
       @function.Defun(dtypes.float32, dtypes.float32)
       def Foo(a, b):
@@ -76,7 +76,7 @@ class FunctionTest(xla_test.XLATestCase):
       b = constant_op.constant(bval, name="b")
       with self.test_scope():
         call_g = Foo(a, b)
-      result = sess.run(call_g)
+      result = self.evaluate(call_g)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testFunctionMultipleRetvals(self):
@@ -90,7 +90,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32)
     expected = Func(aval, bval)
 
-    with self.cached_session() as sess:
+    with self.cached_session():
 
       @function.Defun(dtypes.float32, dtypes.float32)
       def Foo(a, b):
@@ -100,7 +100,7 @@ class FunctionTest(xla_test.XLATestCase):
       b = constant_op.constant(bval, name="b")
       with self.test_scope():
         call_f = Foo(a, b)
-      result = sess.run(call_f)
+      result = self.evaluate(call_f)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testCompileTimeConstantsInDefun(self):
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 68fdb5caf4c2a496b5058cdda40ca650484a6e0e..0e2d840418156d825e2d141018e49f42374c8fee 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -26,7 +26,6 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.compiler.tests import xla_test
-from tensorflow.python.compat import compat
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -449,8 +448,8 @@ class ResizeBilinearTest(xla_test.XLATestCase):
     for dtype in self.float_types:
       self._assertForwardOpMatchesExpected(
           np.array([[1, 2]], dtype=dtype), [3, 3],
-          expected=np.array(
-              [[1, 1.5, 2], [1, 1.5, 2], [1, 1.5, 2]], dtype=np.float32))
+          expected=np.array([[1, 1.5, 2], [1, 1.5, 2], [1, 1.5, 2]],
+                            dtype=np.float32))
 
   def testAlignCorners1x2To3x2Grad(self):
     for dtype in self.float_types:
@@ -478,8 +477,8 @@ class ResizeBilinearTest(xla_test.XLATestCase):
     for dtype in self.float_types:
       self._assertForwardOpMatchesExpected(
           np.array([[1, 2], [3, 4]], dtype=dtype), [3, 3],
-          expected=np.array(
-              [[1, 1.5, 2], [2, 2.5, 3], [3, 3.5, 4]], dtype=np.float32))
+          expected=np.array([[1, 1.5, 2], [2, 2.5, 3], [3, 3.5, 4]],
+                            dtype=np.float32))
 
   def testAlignCorners2x2To3x3Grad(self):
     self._assertBackwardOpMatchesExpected(
@@ -499,8 +498,8 @@ class ResizeBilinearTest(xla_test.XLATestCase):
           np.array([[7, 13], [22, 4]], dtype=np.float32),
           input_shape=[3, 3],
           dtype=dtype,
-          expected=np.array(
-              [[7, 0, 13], [0, 0, 0], [22, 0, 4]], dtype=np.float32))
+          expected=np.array([[7, 0, 13], [0, 0, 0], [22, 0, 4]],
+                            dtype=np.float32))
 
   def testAlignCorners4x4To3x3(self):
     for dtype in self.float_types:
@@ -508,8 +507,8 @@ class ResizeBilinearTest(xla_test.XLATestCase):
           np.array(
               [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
               dtype=dtype), [3, 3],
-          expected=np.array(
-              [[1, 2.5, 4], [7, 8.5, 10], [13, 14.5, 16]], dtype=np.float32))
+          expected=np.array([[1, 2.5, 4], [7, 8.5, 10], [13, 14.5, 16]],
+                            dtype=np.float32))
 
   def testAlignCorners4x4To3x3Grad(self):
     for dtype in self.float_types:
@@ -517,41 +516,39 @@ class ResizeBilinearTest(xla_test.XLATestCase):
           np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32),
           input_shape=[4, 4],
           dtype=dtype,
-          expected=np.array(
-              [[1, 1, 1, 3], [2, 1.25, 1.25, 3], [2, 1.25, 1.25, 3],
-               [7, 4, 4, 9]],
-              dtype=np.float32))
+          expected=np.array([[1, 1, 1, 3], [2, 1.25, 1.25, 3],
+                             [2, 1.25, 1.25, 3], [7, 4, 4, 9]],
+                            dtype=np.float32))
 
   def testAlignCorners3x3To9x9(self):
     for dtype in self.float_types:
       self._assertForwardOpMatchesExpected(
           np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=dtype), [9, 9],
           expected=np.array(
-              [[1.0, 1.25, 1.50, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00], [
-                  1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50, 3.75
-              ], [2.50, 2.75, 3.00, 3.25, 3.50, 3.75, 4.00, 4.25, 4.50], [
-                  3.25, 3.50, 3.75, 4.00, 4.25, 4.50, 4.75, 5.00, 5.25
-              ], [4.00, 4.25, 4.50, 4.75, 5.00, 5.25, 5.50, 5.75, 6.00], [
-                  4.75, 5.00, 5.25, 5.50, 5.75, 6.00, 6.25, 6.50, 6.75
-              ], [5.50, 5.75, 6.00, 6.25, 6.50, 6.75, 7.00, 7.25, 7.50], [
-                  6.25, 6.50, 6.75, 7.00, 7.25, 7.50, 7.75, 8.00, 8.25
-              ], [7.00, 7.25, 7.50, 7.75, 8.00, 8.25, 8.50, 8.75, 9.00]],
+              [[1.0, 1.25, 1.50, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00],
+               [1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50, 3.75],
+               [2.50, 2.75, 3.00, 3.25, 3.50, 3.75, 4.00, 4.25, 4.50],
+               [3.25, 3.50, 3.75, 4.00, 4.25, 4.50, 4.75, 5.00, 5.25],
+               [4.00, 4.25, 4.50, 4.75, 5.00, 5.25, 5.50, 5.75, 6.00],
+               [4.75, 5.00, 5.25, 5.50, 5.75, 6.00, 6.25, 6.50, 6.75],
+               [5.50, 5.75, 6.00, 6.25, 6.50, 6.75, 7.00, 7.25, 7.50],
+               [6.25, 6.50, 6.75, 7.00, 7.25, 7.50, 7.75, 8.00, 8.25],
+               [7.00, 7.25, 7.50, 7.75, 8.00, 8.25, 8.50, 8.75, 9.00]],
               dtype=np.float32))
 
   def testAlignCorners3x3To9x9Grad(self):
     for dtype in self.float_types:
       self._assertBackwardOpMatchesExpected(
-          np.array(
-              [[1.00, 1.25, 1.50, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00], [
-                  1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50, 3.75
-              ], [2.50, 2.75, 3.00, 3.25, 3.50, 3.75, 4.00, 4.25, 4.50], [
-                  3.25, 3.50, 3.75, 4.00, 4.25, 4.50, 4.75, 5.00, 5.25
-              ], [4.00, 4.25, 4.50, 4.75, 5.00, 5.25, 5.50, 5.75, 6.00], [
-                  4.75, 5.00, 5.25, 5.50, 5.75, 6.00, 6.25, 6.50, 6.75
-              ], [5.50, 5.75, 6.00, 6.25, 6.50, 6.75, 7.00, 7.25, 7.50], [
-                  6.25, 6.50, 6.75, 7.00, 7.25, 7.50, 7.75, 8.00, 8.25
-              ], [7.00, 7.25, 7.50, 7.75, 8.00, 8.25, 8.50, 8.75, 9.00]],
-              dtype=np.float32),
+          np.array([[1.00, 1.25, 1.50, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00],
+                    [1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50, 3.75],
+                    [2.50, 2.75, 3.00, 3.25, 3.50, 3.75, 4.00, 4.25, 4.50],
+                    [3.25, 3.50, 3.75, 4.00, 4.25, 4.50, 4.75, 5.00, 5.25],
+                    [4.00, 4.25, 4.50, 4.75, 5.00, 5.25, 5.50, 5.75, 6.00],
+                    [4.75, 5.00, 5.25, 5.50, 5.75, 6.00, 6.25, 6.50, 6.75],
+                    [5.50, 5.75, 6.00, 6.25, 6.50, 6.75, 7.00, 7.25, 7.50],
+                    [6.25, 6.50, 6.75, 7.00, 7.25, 7.50, 7.75, 8.00, 8.25],
+                    [7.00, 7.25, 7.50, 7.75, 8.00, 8.25, 8.50, 8.75, 9.00]],
+                   dtype=np.float32),
           input_shape=[3, 3],
           dtype=dtype,
           expected=np.array(
@@ -572,12 +569,12 @@ class ResizeBilinearTest(xla_test.XLATestCase):
         (np.array([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=np.float32) + np.array(
             [[0], [1], [2], [3], [4], [5], [6], [7]], dtype=np.float32)) * 15.0,
         [16, 16],
-        expected=7 * (np.array(
-            [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]],
-            dtype=np.float32) + np.array(
-                [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11],
-                 [12], [13], [14], [15]],
-                dtype=np.float32)),
+        expected=7 *
+        (np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]],
+                  dtype=np.float32) +
+         np.array([[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11],
+                   [12], [13], [14], [15]],
+                  dtype=np.float32)),
         large_tolerance=True)
 
   def testNonAlignCorners3x2To6x4(self):
@@ -601,172 +598,230 @@ class ResizeBilinearTest(xla_test.XLATestCase):
           expected=np.array(expected_data, dtype=dtype),
           align_corners=False)
 
+  def testNonAlignCorners3x2To6x4Batch2(self):
+    input_data = [[[64, 32], [32, 64], [50, 100]], [[32, 16], [16, 32],
+                                                    [25, 50]]]
+    expected_data = [[[64.0, 48.0, 32.0, 32.0], [48.0, 48.0, 48.0, 48.0],
+                      [32.0, 48.0, 64.0, 64.0], [41.0, 61.5, 82.0, 82.0],
+                      [50.0, 75.0, 100.0, 100.0], [50.0, 75.0, 100.0, 100.0]],
+                     [[32.0, 24.0, 16.0, 16.0], [24.0, 24.0, 24.0, 24.0],
+                      [16.0, 24.0, 32.0, 32.0], [20.5, 30.75, 41.0, 41.0],
+                      [25.0, 37.5, 50.0, 50.0], [25.0, 37.5, 50.0, 50.0]]]
 
-class NonMaxSuppressionTest(xla_test.XLATestCase):
+    for dtype in self.float_types:
+      input_image = np.array(input_data, dtype=dtype)
+      expected = np.array(expected_data, dtype=dtype)
+      with self.cached_session() as sess, self.test_scope():
+        image = array_ops.placeholder(input_image.dtype)
+        resized = gen_image_ops.resize_bilinear(
+            image, [6, 4], align_corners=False)
+        out = sess.run(resized, {image: input_image[:, :, :, np.newaxis]})
+        self.assertAllClose(expected[:, :, :, np.newaxis], out)
 
-  def testNMS128From1024(self):
-    with compat.forward_compatibility_horizon(2018, 8, 8):
-      num_boxes = 1024
-      boxes_np = np.random.normal(50, 10, (num_boxes, 4)).astype("f4")
-      scores_np = np.random.normal(0.5, 0.1, (num_boxes,)).astype("f4")
 
-      max_output_size = 128
-      iou_threshold_np = np.array(0.5, dtype=np.float32)
-      score_threshold_np = np.array(0.0, dtype=np.float32)
+class NonMaxSuppressionTest(xla_test.XLATestCase):
 
-      with self.cached_session() as sess:
-        boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
-        scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
-        iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
-                                              iou_threshold_np.shape)
-        score_threshold = array_ops.placeholder(score_threshold_np.dtype,
-                                                score_threshold_np.shape)
-        with self.test_scope():
-          selected_indices = image_ops.non_max_suppression_padded(
-              boxes=boxes,
-              scores=scores,
-              max_output_size=max_output_size,
-              iou_threshold=iou_threshold,
-              score_threshold=score_threshold,
-              pad_to_max_output_size=True)
-        inputs_feed = {
-            boxes: boxes_np,
-            scores: scores_np,
-            score_threshold: score_threshold_np,
-            iou_threshold: iou_threshold_np
-        }
-        (indices_tf, _) = sess.run(selected_indices, feed_dict=inputs_feed)
-
-        self.assertEqual(indices_tf.size, max_output_size)
+  def testNMS128From1024(self):
+    num_boxes = 1024
+    boxes_np = np.random.normal(50, 10, (num_boxes, 4)).astype("f4")
+    scores_np = np.random.normal(0.5, 0.1, (num_boxes,)).astype("f4")
+
+    max_output_size = 128
+    iou_threshold_np = np.array(0.5, dtype=np.float32)
+    score_threshold_np = np.array(0.0, dtype=np.float32)
+
+    with self.cached_session() as sess:
+      boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
+      scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
+      iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
+                                            iou_threshold_np.shape)
+      score_threshold = array_ops.placeholder(score_threshold_np.dtype,
+                                              score_threshold_np.shape)
+      with self.test_scope():
+        selected_indices = image_ops.non_max_suppression_padded(
+            boxes=boxes,
+            scores=scores,
+            max_output_size=max_output_size,
+            iou_threshold=iou_threshold,
+            score_threshold=score_threshold,
+            pad_to_max_output_size=True)
+      inputs_feed = {
+          boxes: boxes_np,
+          scores: scores_np,
+          score_threshold: score_threshold_np,
+          iou_threshold: iou_threshold_np
+      }
+      (indices_tf, _) = sess.run(selected_indices, feed_dict=inputs_feed)
+
+      self.assertEqual(indices_tf.size, max_output_size)
 
   def testNMS3From6Boxes(self):
-    with compat.forward_compatibility_horizon(2018, 8, 8):
-      # Three boxes are selected based on IOU.
-      boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
-                    [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
-      boxes_np = np.array(boxes_data, dtype=np.float32)
-
-      scores_data = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3]
-      scores_np = np.array(scores_data, dtype=np.float32)
-
-      max_output_size = 3
-      iou_threshold_np = np.array(0.5, dtype=np.float32)
-      score_threshold_np = np.array(0.0, dtype=np.float32)
-
-      with self.cached_session() as sess:
-        boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
-        scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
-        iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
-                                              iou_threshold_np.shape)
-        score_threshold = array_ops.placeholder(score_threshold_np.dtype,
-                                                score_threshold_np.shape)
-        with self.test_scope():
-          selected_indices = image_ops.non_max_suppression_padded(
-              boxes=boxes,
-              scores=scores,
-              max_output_size=max_output_size,
-              iou_threshold=iou_threshold,
-              score_threshold=score_threshold,
-              pad_to_max_output_size=True)
-        inputs_feed = {
-            boxes: boxes_np,
-            scores: scores_np,
-            score_threshold: score_threshold_np,
-            iou_threshold: iou_threshold_np
-        }
-        (indices_tf, num_valid) = sess.run(
-            selected_indices, feed_dict=inputs_feed)
-
-        self.assertEqual(indices_tf.size, max_output_size)
-        self.assertEqual(num_valid, 3)
-        self.assertAllClose(indices_tf[:num_valid], [3, 0, 5])
+    # Three boxes are selected based on IOU.
+    boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
+                  [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
+    boxes_np = np.array(boxes_data, dtype=np.float32)
+
+    scores_data = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3]
+    scores_np = np.array(scores_data, dtype=np.float32)
+
+    max_output_size = 3
+    iou_threshold_np = np.array(0.5, dtype=np.float32)
+    score_threshold_np = np.array(0.0, dtype=np.float32)
+
+    with self.cached_session() as sess:
+      boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
+      scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
+      iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
+                                            iou_threshold_np.shape)
+      score_threshold = array_ops.placeholder(score_threshold_np.dtype,
+                                              score_threshold_np.shape)
+      with self.test_scope():
+        selected_indices = image_ops.non_max_suppression_padded(
+            boxes=boxes,
+            scores=scores,
+            max_output_size=max_output_size,
+            iou_threshold=iou_threshold,
+            score_threshold=score_threshold,
+            pad_to_max_output_size=True)
+      inputs_feed = {
+          boxes: boxes_np,
+          scores: scores_np,
+          score_threshold: score_threshold_np,
+          iou_threshold: iou_threshold_np
+      }
+      (indices_tf, num_valid) = sess.run(
+          selected_indices, feed_dict=inputs_feed)
+
+      self.assertEqual(indices_tf.size, max_output_size)
+      self.assertEqual(num_valid, 3)
+      self.assertAllClose(indices_tf[:num_valid], [3, 0, 5])
 
   def testNMS3Then2WithScoreThresh(self):
     # Three boxes are selected based on IOU.
     # One is filtered out by score threshold.
 
-    with compat.forward_compatibility_horizon(2018, 8, 8):
-      boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
-                    [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
-      boxes_np = np.array(boxes_data, dtype=np.float32)
-
-      scores_data = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3]
-      scores_np = np.array(scores_data, dtype=np.float32)
-      max_output_size = 3
-      iou_threshold_np = np.array(0.5, dtype=np.float32)
-      score_threshold_np = np.array(0.4, dtype=np.float32)
-
-      with self.cached_session() as sess:
-        boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
-        scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
-        iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
-                                              iou_threshold_np.shape)
-        score_threshold = array_ops.placeholder(score_threshold_np.dtype,
-                                                score_threshold_np.shape)
-        with self.test_scope():
-          selected_indices = image_ops.non_max_suppression_padded(
-              boxes=boxes,
-              scores=scores,
-              max_output_size=max_output_size,
-              iou_threshold=iou_threshold,
-              score_threshold=score_threshold,
-              pad_to_max_output_size=True)
-        inputs_feed = {
-            boxes: boxes_np,
-            scores: scores_np,
-            iou_threshold: iou_threshold_np,
-            score_threshold: score_threshold_np
-        }
-        (indices_tf, num_valid) = sess.run(
-            selected_indices, feed_dict=inputs_feed)
-
-        self.assertEqual(indices_tf.size, max_output_size)
-        self.assertEqual(num_valid, 2)
-        self.assertAllClose(indices_tf[:num_valid], [3, 0])
+    boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
+                  [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
+    boxes_np = np.array(boxes_data, dtype=np.float32)
+
+    scores_data = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3]
+    scores_np = np.array(scores_data, dtype=np.float32)
+    max_output_size = 3
+    iou_threshold_np = np.array(0.5, dtype=np.float32)
+    score_threshold_np = np.array(0.4, dtype=np.float32)
+
+    with self.cached_session() as sess:
+      boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
+      scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
+      iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
+                                            iou_threshold_np.shape)
+      score_threshold = array_ops.placeholder(score_threshold_np.dtype,
+                                              score_threshold_np.shape)
+      with self.test_scope():
+        selected_indices = image_ops.non_max_suppression_padded(
+            boxes=boxes,
+            scores=scores,
+            max_output_size=max_output_size,
+            iou_threshold=iou_threshold,
+            score_threshold=score_threshold,
+            pad_to_max_output_size=True)
+      inputs_feed = {
+          boxes: boxes_np,
+          scores: scores_np,
+          iou_threshold: iou_threshold_np,
+          score_threshold: score_threshold_np
+      }
+      (indices_tf, num_valid) = sess.run(
+          selected_indices, feed_dict=inputs_feed)
+
+      self.assertEqual(indices_tf.size, max_output_size)
+      self.assertEqual(num_valid, 2)
+      self.assertAllClose(indices_tf[:num_valid], [3, 0])
 
   def testNMS3Then1WithScoreMaxThresh(self):
     # Three boxes are selected based on IOU.
     # One is filtered out by score threshold.
     # One is filtered out by max_output_size.
 
-    with compat.forward_compatibility_horizon(2018, 8, 8):
-      boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
-                    [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
-      boxes_np = np.array(boxes_data, dtype=np.float32)
-
-      scores_data = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3]
-      scores_np = np.array(scores_data, dtype=np.float32)
-      max_output_size = 1
-      iou_threshold_np = np.array(0.5, dtype=np.float32)
-      score_threshold_np = np.array(0.4, dtype=np.float32)
+    boxes_data = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
+                  [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
+    boxes_np = np.array(boxes_data, dtype=np.float32)
+
+    scores_data = [0.9, 0.75, 0.6, 0.95, 0.5, 0.3]
+    scores_np = np.array(scores_data, dtype=np.float32)
+    max_output_size = 1
+    iou_threshold_np = np.array(0.5, dtype=np.float32)
+    score_threshold_np = np.array(0.4, dtype=np.float32)
+
+    with self.cached_session() as sess:
+      boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
+      scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
+      iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
+                                            iou_threshold_np.shape)
+      score_threshold = array_ops.placeholder(score_threshold_np.dtype,
+                                              score_threshold_np.shape)
+      with self.test_scope():
+        selected_indices = image_ops.non_max_suppression_padded(
+            boxes=boxes,
+            scores=scores,
+            max_output_size=max_output_size,
+            iou_threshold=iou_threshold,
+            score_threshold=score_threshold,
+            pad_to_max_output_size=True)
+      inputs_feed = {
+          boxes: boxes_np,
+          scores: scores_np,
+          iou_threshold: iou_threshold_np,
+          score_threshold: score_threshold_np
+      }
+      (indices_tf, num_valid) = sess.run(
+          selected_indices, feed_dict=inputs_feed)
+
+      self.assertEqual(indices_tf.size, max_output_size)
+      self.assertEqual(num_valid, 1)
+      self.assertAllClose(indices_tf[:num_valid], [3])
+
+  def testSelectFromContinuousOverLap(self):
+    # Tests that a suppressed box does not itself suppress other boxes.
+
+    boxes_data = [[0, 0, 1, 1], [0, 0.2, 1, 1.2], [0, 0.4, 1, 1.4],
+                  [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 3]]
+    boxes_np = np.array(boxes_data, dtype=np.float32)
+
+    scores_data = [0.9, 0.75, 0.6, 0.5, 0.4, 0.3]
+    scores_np = np.array(scores_data, dtype=np.float32)
+    max_output_size = 3
+    iou_threshold_np = np.array(0.5, dtype=np.float32)
+    score_threshold_np = np.array(0.1, dtype=np.float32)
+
+    with self.cached_session() as sess:
+      boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
+      scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
+      iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
+                                            iou_threshold_np.shape)
+      score_threshold = array_ops.placeholder(score_threshold_np.dtype,
+                                              score_threshold_np.shape)
+      with self.test_scope():
+        selected_indices = image_ops.non_max_suppression_padded(
+            boxes=boxes,
+            scores=scores,
+            max_output_size=max_output_size,
+            iou_threshold=iou_threshold,
+            score_threshold=score_threshold,
+            pad_to_max_output_size=True)
+      inputs_feed = {
+          boxes: boxes_np,
+          scores: scores_np,
+          iou_threshold: iou_threshold_np,
+          score_threshold: score_threshold_np
+      }
+      (indices_tf, num_valid) = sess.run(
+          selected_indices, feed_dict=inputs_feed)
+
+      self.assertEqual(indices_tf.size, max_output_size)
+      self.assertEqual(num_valid, 3)
+      self.assertAllClose(indices_tf[:num_valid], [0, 2, 4])
 
-      with self.cached_session() as sess:
-        boxes = array_ops.placeholder(boxes_np.dtype, shape=boxes_np.shape)
-        scores = array_ops.placeholder(scores_np.dtype, shape=scores_np.shape)
-        iou_threshold = array_ops.placeholder(iou_threshold_np.dtype,
-                                              iou_threshold_np.shape)
-        score_threshold = array_ops.placeholder(score_threshold_np.dtype,
-                                                score_threshold_np.shape)
-        with self.test_scope():
-          selected_indices = image_ops.non_max_suppression_padded(
-              boxes=boxes,
-              scores=scores,
-              max_output_size=max_output_size,
-              iou_threshold=iou_threshold,
-              score_threshold=score_threshold,
-              pad_to_max_output_size=True)
-        inputs_feed = {
-            boxes: boxes_np,
-            scores: scores_np,
-            iou_threshold: iou_threshold_np,
-            score_threshold: score_threshold_np
-        }
-        (indices_tf, num_valid) = sess.run(
-            selected_indices, feed_dict=inputs_feed)
-
-        self.assertEqual(indices_tf.size, max_output_size)
-        self.assertEqual(num_valid, 1)
-        self.assertAllClose(indices_tf[:num_valid], [3])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index de68ff0e32cd59e65094c0b7319f8ab213eed4db..dbea9849e217519874352b789588a2af62f1c826 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 import numpy as np
 
+from tensorflow.compiler.tests import test_utils
 from tensorflow.contrib.compiler import jit
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
@@ -36,8 +37,8 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
-jit_scope = jit.experimental_jit_scope
 
+jit_scope = jit.experimental_jit_scope
 
 # Disable rewrites to make sure we don't end up having to update this test
 # whenever we implement new ones.
@@ -74,14 +75,14 @@ def RunMetadataLabels(run_metadata):
 
 def InLabels(labels, substr):
   """Returns true iff one of the labels contains substr."""
-  return any([substr in x for x in labels])
+  return any(substr in x for x in labels)
 
 
-def MetadataHasXlaOp(run_metadata):
+def MetadataHasXlaRunOp(run_metadata):
   """Returns true if there are XlaRun kernels in run_metadata's timeline."""
 
   # TODO(phawkins): find a less hacky way to test whether a kernel ran.
-  return InLabels(RunMetadataLabels(run_metadata), "XlaRun")
+  return InLabels(RunMetadataLabels(run_metadata), "_XlaRun")
 
 
 class JitLaunchTest(test.TestCase):
@@ -108,15 +109,14 @@ class JitLaunchTest(test.TestCase):
       direct_op = fn(*placeholders)
 
       run_metadata = config_pb2.RunMetadata()
-      compiled = sess.run(compiled_op,
-                          feeds,
-                          run_metadata=run_metadata,
-                          options=config_pb2.RunOptions(
-                              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      compiled = test_utils.RunWithWarmup(
+          sess, compiled_op, feeds,
+          config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE),
+          run_metadata)
       print("Compiled Result {}".format(compiled))
 
       if require_kernel_launch:
-        self.assert_(MetadataHasXlaOp(run_metadata))
+        self.assert_(MetadataHasXlaRunOp(run_metadata))
 
         direct = sess.run(direct_op, feeds)
         print("Direct Result {}".format(direct))
@@ -137,7 +137,7 @@ class JitLaunchTest(test.TestCase):
         a = constant_op.constant(100)  # pylint: disable=unused-variable
 
       call = KernelWithNoOutputs()  # pylint: disable=assignment-from-no-return
-      sess.run(call, {})
+      test_utils.RunWithWarmup(sess, call, {})
 
   def testAliasing(self):
     """Regression test for compiled functions that return an aliased buffer.
@@ -250,17 +250,21 @@ class JitLaunchTest(test.TestCase):
       dx = np.random.random_sample((batch_size, image_size)).astype(np.float32)
       with session_lib.Session() as sess:
         run_metadata = config_pb2.RunMetadata()
-        output = sess.run(y, {x: dx,
-                              w: dw,
-                              b: db},
-                          run_metadata=run_metadata,
-                          options=config_pb2.RunOptions(
-                              trace_level=config_pb2.RunOptions.FULL_TRACE))
+        output = test_utils.RunWithWarmup(
+            sess,
+            y, {
+                x: dx,
+                w: dw,
+                b: db
+            },
+            run_metadata=run_metadata,
+            options=config_pb2.RunOptions(
+                trace_level=config_pb2.RunOptions.FULL_TRACE))
 
         # TODO(phawkins): really we would like to test that there were exactly
         # two kernel launches. However, we have no reliable way to determine
         # that.
-        self.assert_(MetadataHasXlaOp(run_metadata))
+        self.assert_(MetadataHasXlaRunOp(run_metadata))
 
         expected = np.square(np.dot(dx, dw) + db)
         self.assertAllClose(expected, output, rtol=1e-1)
@@ -272,7 +276,7 @@ class XlaCompilationTest(test.TestCase):
   def testReshape(self):
     """Tests an operator with compile-time constant and non-constant inputs."""
 
-    with self.test_session(config=NoRewriteSessionConfig()) as sess:
+    with self.session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -284,19 +288,22 @@ class XlaCompilationTest(test.TestCase):
         # statically known as part of the JIT compilation's input graph.
         z = array_ops.reshape(x, y)
       run_metadata = config_pb2.RunMetadata()
-      out = sess.run(z,
-                     {x: np.array([1, 2, 3, 4, 5, 6], np.float32),
-                      y: [-1, 3]},
-                     run_metadata=run_metadata,
-                     options=config_pb2.RunOptions(
-                         trace_level=config_pb2.RunOptions.FULL_TRACE))
-      self.assert_(MetadataHasXlaOp(run_metadata))
+      out = test_utils.RunWithWarmup(
+          sess,
+          z, {
+              x: np.array([1, 2, 3, 4, 5, 6], np.float32),
+              y: [-1, 3]
+          },
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assert_(MetadataHasXlaRunOp(run_metadata))
       self.assertAllClose(np.array([[1, 2, 3], [4, 5, 6]], np.float32), out)
 
   def testIgnoredArguments(self):
     """Tests that JIT computations can ignore formal parameters."""
 
-    with self.test_session(config=NoRewriteSessionConfig()) as sess:
+    with self.session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.int32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -309,18 +316,22 @@ class XlaCompilationTest(test.TestCase):
           t = math_ops.add(z, z)
 
       run_metadata = config_pb2.RunMetadata()
-      out = sess.run(t, {x: np.int32(7),
-                         y: np.int32(404)},
-                     run_metadata=run_metadata,
-                     options=config_pb2.RunOptions(
-                         trace_level=config_pb2.RunOptions.FULL_TRACE))
-      self.assert_(MetadataHasXlaOp(run_metadata))
+      out = test_utils.RunWithWarmup(
+          sess,
+          t, {
+              x: np.int32(7),
+              y: np.int32(404)
+          },
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assert_(MetadataHasXlaRunOp(run_metadata))
       self.assertAllClose(28, out)
 
   def testLoops(self):
     """Tests that compilation accepts computations containing loops."""
 
-    with self.test_session(config=NoRewriteSessionConfig()) as session:
+    with self.session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         c = lambda i, _: math_ops.less(i, 5)
@@ -332,13 +343,13 @@ class XlaCompilationTest(test.TestCase):
                            run_metadata=run_metadata,
                            options=config_pb2.RunOptions(
                                trace_level=config_pb2.RunOptions.FULL_TRACE))
-      self.assert_(MetadataHasXlaOp(run_metadata))
+      self.assert_(MetadataHasXlaRunOp(run_metadata))
       self.assertAllClose(result, np.float32(95), rtol=1e-1)
 
   def testCond(self):
     """Tests that compilation handles switch operators."""
 
-    with self.test_session(config=NoRewriteSessionConfig()) as session:
+    with self.session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.float32)
       c = array_ops.placeholder(dtypes.bool)
@@ -351,13 +362,17 @@ class XlaCompilationTest(test.TestCase):
       # deadlock.
 
       run_metadata = config_pb2.RunMetadata()
-      result = session.run(t, {x: np.float32(2),
-                               y: np.float32(4),
-                               c: True},
-                           run_metadata=run_metadata,
-                           options=config_pb2.RunOptions(
-                               trace_level=config_pb2.RunOptions.FULL_TRACE))
-      self.assert_(MetadataHasXlaOp(run_metadata))
+      result = test_utils.RunWithWarmup(
+          session,
+          t, {
+              x: np.float32(2),
+              y: np.float32(4),
+              c: True
+          },
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assert_(MetadataHasXlaRunOp(run_metadata))
       self.assertAllClose(result, np.float32(6), rtol=1e-1)
 
   def testNestedFunction(self):
@@ -379,7 +394,7 @@ class XlaCompilationTest(test.TestCase):
       inp = array_ops.placeholder(dtypes.float32)
       out = Entry(inp)
 
-    with self.test_session(
+    with self.session(
         config=NoRewriteSessionConfig(), graph=g, use_gpu=True) as sess:
       run_metadata = config_pb2.RunMetadata()
       val = sess.run(out,
@@ -392,7 +407,7 @@ class XlaCompilationTest(test.TestCase):
   def testLoopDeadlock(self):
     """Regression test for bug that caused deadlocks in graphs with loops."""
 
-    with self.test_session(config=NoRewriteSessionConfig()) as session:
+    with self.session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         y = x + 1.0
@@ -425,11 +440,13 @@ class XlaCompilationTest(test.TestCase):
       cfg.graph_options.optimizer_options.do_function_inlining = True
       with session_lib.Session(graph=g, config=cfg) as sess:
         run_metadata = config_pb2.RunMetadata()
-        dx_val = sess.run(dx,
-                          feed_dict={x: 100.},
-                          run_metadata=run_metadata,
-                          options=config_pb2.RunOptions(
-                              trace_level=config_pb2.RunOptions.FULL_TRACE))
+        dx_val = test_utils.RunWithWarmup(
+            sess,
+            dx,
+            feed_dict={x: 100.},
+            run_metadata=run_metadata,
+            options=config_pb2.RunOptions(
+                trace_level=config_pb2.RunOptions.FULL_TRACE))
       self.assertAllClose(dx_val, 0.01)
       return RunMetadataLabels(run_metadata)
 
@@ -475,7 +492,8 @@ class ElementWiseFusionTest(test.TestCase):
       a7 = a6 + a2
 
       run_metadata = config_pb2.RunMetadata()
-      output = sess.run(
+      output = test_utils.RunWithWarmup(
+          sess,
           a7, {
               a1: arg0,
               a2: arg1
@@ -509,5 +527,135 @@ class ElementWiseFusionTest(test.TestCase):
     self.assertAllClose(tf_op, tfef_op, rtol=1e-1)
 
 
+class LazyCompilationTest(test.TestCase):
+
+  def testLazyCompilation(self):
+
+    @function.Defun(compiled=True)
+    def CompiledFunction(x):
+      return math_ops.log(x)
+
+    with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
+      x = array_ops.placeholder(dtypes.float32)
+      y = CompiledFunction(x)
+
+      # The very first run of the cluster is always compiled (non-lazily).
+      run_metadata_for_first_run = config_pb2.RunMetadata()
+      sess.run(
+          y,
+          feed_dict={x: [2., 10., 19., 77., 100.]},
+          run_metadata=run_metadata_for_first_run,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assertTrue(
+          InLabels(
+              RunMetadataLabels(run_metadata_for_first_run), "_XlaCompile"))
+      self.assertTrue(
+          InLabels(RunMetadataLabels(run_metadata_for_first_run), "_XlaRun"))
+
+      run_metadata_before_warmup = config_pb2.RunMetadata()
+      sess.run(
+          y,
+          feed_dict={x: [2., 10.]},
+          run_metadata=run_metadata_before_warmup,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assertTrue(
+          InLabels(
+              RunMetadataLabels(run_metadata_before_warmup), "_XlaCompile"))
+      self.assertFalse(
+          InLabels(RunMetadataLabels(run_metadata_before_warmup), "_XlaRun"))
+
+      # We compile when we see the same shape a second time.
+
+      run_metadata_after_warmup = config_pb2.RunMetadata()
+      sess.run(
+          y,
+          feed_dict={x: [2., 10.]},
+          run_metadata=run_metadata_after_warmup,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assertTrue(
+          InLabels(RunMetadataLabels(run_metadata_after_warmup), "_XlaCompile"))
+      self.assertTrue(
+          InLabels(RunMetadataLabels(run_metadata_after_warmup), "_XlaRun"))
+
+      run_metadata_for_new_shape = config_pb2.RunMetadata()
+      sess.run(
+          y,
+          feed_dict={x: [2., 10., 12.]},
+          run_metadata=run_metadata_for_new_shape,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assertTrue(
+          InLabels(
+              RunMetadataLabels(run_metadata_for_new_shape), "_XlaCompile"))
+      self.assertFalse(
+          InLabels(RunMetadataLabels(run_metadata_for_new_shape), "_XlaRun"))
+
+  def testIsMegamorphic(self):
+
+    @function.Defun(compiled=True)
+    def CompiledFunction(x):
+      return math_ops.log(x)
+
+    with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
+      x = array_ops.placeholder(dtypes.float32)
+      y = CompiledFunction(x)
+
+      # Make the cluster go megamorphic by running it with lots of shape
+      # signatures where the cluster is executed with each signature only a few
+      # times.  Then check that we don't compile the cluster ever again.
+
+      for shape in range(10, 50):
+        for _ in range(0, 49):
+          sess.run(y, feed_dict={x: [0.] * shape})
+
+      for _ in range(0, 50):
+        run_metadata = config_pb2.RunMetadata()
+        sess.run(
+            y,
+            feed_dict={x: [0.] * 60},
+            run_metadata=run_metadata,
+            options=config_pb2.RunOptions(
+                trace_level=config_pb2.RunOptions.FULL_TRACE))
+        self.assertTrue(
+            InLabels(RunMetadataLabels(run_metadata), "_XlaCompile"))
+        self.assertFalse(InLabels(RunMetadataLabels(run_metadata), "_XlaRun"))
+
+  def testIsNotMegamorphic(self):
+
+    @function.Defun(compiled=True)
+    def CompiledFunction(x):
+      return math_ops.log(x)
+
+    with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
+      x = array_ops.placeholder(dtypes.float32)
+      y = CompiledFunction(x)
+
+      # Run the cluster with lots of shape signatures, but in a way that it
+      # isn't megamorphic (i.e. each shape signature sees a lot of executions).
+      # Then check that the cluster has not been marked as megamorphic.
+
+      for shape in range(10, 50):
+        for _ in range(0, 1000):
+          sess.run(y, feed_dict={x: [0.] * shape})
+
+      for _ in range(0, 10):
+        sess.run(y, feed_dict={x: [0.] * 60})
+
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(
+          y,
+          feed_dict={x: [0.] * 60},
+          run_metadata=run_metadata,
+          options=config_pb2.RunOptions(
+              trace_level=config_pb2.RunOptions.FULL_TRACE))
+      self.assertTrue(InLabels(RunMetadataLabels(run_metadata), "_XlaCompile"))
+      self.assertTrue(InLabels(RunMetadataLabels(run_metadata), "_XlaRun"))
+
+
 if __name__ == "__main__":
+  os.environ["TF_XLA_FLAGS"] = ("--tf_xla_enable_lazy_compilation=true " +
+                                os.environ.get("TF_XLA_FLAGS", ""))
   test.main()
diff --git a/tensorflow/compiler/tests/listdiff_op_test.py b/tensorflow/compiler/tests/listdiff_op_test.py
index 58622114e4f552fb71db9b040a39b57d7da0037c..0210201fa71a6e790e94667073ab4dba542537a5 100644
--- a/tensorflow/compiler/tests/listdiff_op_test.py
+++ b/tensorflow/compiler/tests/listdiff_op_test.py
@@ -33,13 +33,13 @@ class ListDiffTest(xla_test.XLATestCase):
   def _testListDiff(self, x, y, out, idx):
     for dtype in [dtypes.int32, dtypes.int64]:
       for index_dtype in [dtypes.int32, dtypes.int64]:
-        with self.cached_session() as sess:
+        with self.cached_session():
           x_tensor = ops.convert_to_tensor(x, dtype=dtype)
           y_tensor = ops.convert_to_tensor(y, dtype=dtype)
           with self.test_scope():
             out_tensor, idx_tensor = array_ops.listdiff(
                 x_tensor, y_tensor, out_idx=index_dtype)
-            tf_out, tf_idx = sess.run([out_tensor, idx_tensor])
+            tf_out, tf_idx = self.evaluate([out_tensor, idx_tensor])
         self.assertAllEqual(out, tf_out)
         self.assertAllEqual(idx, tf_idx)
         self.assertEqual(1, out_tensor.get_shape().ndims)
diff --git a/tensorflow/compiler/tests/lrn_ops_test.py b/tensorflow/compiler/tests/lrn_ops_test.py
index c6ad67993e8bc196a74c9a328df8c9200c92c575..5dddf6ae4e8c8a3d5e9eb7b2c62298df02a0093c 100644
--- a/tensorflow/compiler/tests/lrn_ops_test.py
+++ b/tensorflow/compiler/tests/lrn_ops_test.py
@@ -120,8 +120,8 @@ class LRNTest(xla_test.XLATestCase):
       with self.test_scope():
         actual = gen_nn_ops.lrn_grad(out_grads, in_image, out_image,
                                      depth_radius, bias, alpha, beta)
-      expected_val = expected.eval()
-      actual_val = actual.eval()
+      expected_val = self.evaluate(expected)
+      actual_val = self.evaluate(actual)
     self.assertAllClose(actual_val, expected_val, rtol=1e-3)
 
 
diff --git a/tensorflow/compiler/tests/lstm_test.py b/tensorflow/compiler/tests/lstm_test.py
index 265c0b6d1412de7be3a5bf5e79129cb330ceb162..776ed899e68ddd3893b8bb30b7c8034297aa6515 100644
--- a/tensorflow/compiler/tests/lstm_test.py
+++ b/tensorflow/compiler/tests/lstm_test.py
@@ -88,8 +88,8 @@ class LSTMTest(test.TestCase):
                  (basename, m_prev_scalar, c_prev_scalar, pad_scalar))
 
       # Initialize variables and run the unrolled LSTM step.
-      sess.run(variables.global_variables_initializer())
-      return sess.run([m, c])
+      self.evaluate(variables.global_variables_initializer())
+      return self.evaluate([m, c])
 
   def testLSTMCell(self):
     # Run with all-0 weights, no padding.
@@ -173,8 +173,8 @@ class LSTMTest(test.TestCase):
                  (basename, m_init_scalar, c_init_scalar, pad_scalar))
 
       # Initialize variables and run the unrolled LSTM layer.
-      sess.run(variables.global_variables_initializer())
-      return sess.run(out_seq)
+      self.evaluate(variables.global_variables_initializer())
+      return self.evaluate(out_seq)
 
   def testLSTMLayer(self):
     # Run with all-0 weights, no padding.
diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py
index f77521a7c49dba39849869ddceb7c0e885147722..3416f7dbd6bdd264bf79785084f981f5b07cb8a9 100644
--- a/tensorflow/compiler/tests/momentum_test.py
+++ b/tensorflow/compiler/tests/momentum_test.py
@@ -61,37 +61,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
         self.assertFalse(slot1 in variables.trainable_variables())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
   def testNesterovMomentum(self):
     for dtype in self.float_types:
@@ -115,8 +121,8 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
               var0_np, accum0_np, var0_np * 0.8, 0.1, 0.9)
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(
               var1_np, accum1_np, 0.9, 0.1, 0.9)
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in self.float_types:
@@ -141,37 +147,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
         self.assertFalse(slot1 in variables.trainable_variables())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py
deleted file mode 100644
index 7635f89249b7b71e5353e0b7cb1cea5c1f7bca1d..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tests/oom_test.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functional tests for out-of-memory conditions."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.compiler.tests import xla_test
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.platform import googletest
-
-
-class OutOfMemoryTest(xla_test.XLATestCase):
-
-  def testOutputOutOfMemory(self):
-    """Allocates tensors until out of memory.
-
-    Generates a large rank-1 tensor. The tensor is an output of an XLA
-    computation, not constant.
-
-    Check that a ResourceExhaustedError is raised and can be caught.
-
-    We spin in a loop generating larger and larger tensors until an OOM event
-    happens. We may be running sandboxed, so have a small host memory limit, so
-    any hardcoded value is unlikely to land in the sweet spot between device
-    memory size and host memory size with stability.
-    """
-
-    def test_loop():
-      size = int(2e8)
-      while True:
-        with self.cached_session():
-          # Force the compiled code to not be constant by feeding in a
-          # parameter.
-          p = array_ops.placeholder(dtypes.float32, shape=[2, 1, 1])
-          with self.test_scope():
-            # Create a computation that produces a large R1 tensor as an
-            # intermediate result.  Reduce it down so that if this file was
-            # compiled without --config=cuda, we don't force a D2H copy of a
-            # large tensor and potentially OOM the host.
-            #
-            # This is a bit tricky because XLA:GPU doesn't currently support RNG
-            # ops.  Here we rely on the fact that XLA doesn't do algebraic
-            # simplifications on conv(<ones>, <filter>).
-            c = math_ops.reduce_sum(
-                nn_ops.convolution(
-                    array_ops.ones([1, size, 1]),
-                    p,
-                    padding='SAME',
-                    data_format='NWC'))
-
-            c.eval(feed_dict={p: [[[1.0]], [[2.0]]]})
-            size *= 2
-
-    self.assertRaises(errors.ResourceExhaustedError, test_loop)
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/compiler/tests/permute_test.py b/tensorflow/compiler/tests/permute_test.py
index dbb9274df4f579fbc6076bf55c9307e4d1cb7768..e2f6de821b5fd4709d305bcd17ee6ba40b1443fd 100644
--- a/tensorflow/compiler/tests/permute_test.py
+++ b/tensorflow/compiler/tests/permute_test.py
@@ -40,40 +40,48 @@ class XlaPermuteOpTest(xla_test.XLATestCase):
     self.assertAllEqual(result, expected)
 
   def testNHWCToNCHW(self):
-    x = np.array([7, 4, 9, 3], dtype=np.int32)
-    self._runPermuteAndCompare(x, "NHWC", "NCHW", [7, 3, 4, 9])
+    for dtype in {np.int32, np.int64}:
+      x = np.array([7, 4, 9, 3], dtype=dtype)
+      self._runPermuteAndCompare(x, "NHWC", "NCHW", [7, 3, 4, 9])
 
   def testNCHWToNHWC(self):
-    x = np.array([7, 4, 9, 3], dtype=np.int32)
-    self._runPermuteAndCompare(x, "NCHW", "NHWC", [7, 9, 3, 4])
+    for dtype in {np.int32, np.int64}:
+      x = np.array([7, 4, 9, 3], dtype=dtype)
+      self._runPermuteAndCompare(x, "NCHW", "NHWC", [7, 9, 3, 4])
 
   def testNHWCToHWNC(self):
-    x = np.array([7, 4, 9, 3], dtype=np.int32)
-    self._runPermuteAndCompare(x, "NHWC", "HWNC", [4, 9, 7, 3])
+    for dtype in {np.int32, np.int64}:
+      x = np.array([7, 4, 9, 3], dtype=dtype)
+      self._runPermuteAndCompare(x, "NHWC", "HWNC", [4, 9, 7, 3])
 
   def testHWNCToNHWC(self):
-    x = np.array([7, 4, 9, 3], dtype=np.int32)
-    self._runPermuteAndCompare(x, "HWNC", "NHWC", [9, 7, 4, 3])
+    for dtype in {np.int32, np.int64}:
+      x = np.array([7, 4, 9, 3], dtype=dtype)
+      self._runPermuteAndCompare(x, "HWNC", "NHWC", [9, 7, 4, 3])
 
   def testNHWCToNCHW2D(self):
-    x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=np.int32)
-    self._runPermuteAndCompare(x, "NHWC", "NCHW",
-                               [[7, 4], [5, 1], [9, 3], [4, 5]])
+    for dtype in {np.int32, np.int64}:
+      x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=dtype)
+      self._runPermuteAndCompare(x, "NHWC", "NCHW",
+                                 [[7, 4], [5, 1], [9, 3], [4, 5]])
 
   def testNHWCToHWNC2D(self):
-    x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=np.int32)
-    self._runPermuteAndCompare(x, "NHWC", "HWNC",
-                               [[9, 3], [4, 5], [7, 4], [5, 1]])
+    for dtype in {np.int32, np.int64}:
+      x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=dtype)
+      self._runPermuteAndCompare(x, "NHWC", "HWNC",
+                                 [[9, 3], [4, 5], [7, 4], [5, 1]])
 
   def testHWNCToNHWC2D(self):
-    x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=np.int32)
-    self._runPermuteAndCompare(x, "HWNC", "NHWC",
-                               [[4, 5], [7, 4], [9, 3], [5, 1]])
+    for dtype in {np.int32, np.int64}:
+      x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=dtype)
+      self._runPermuteAndCompare(x, "HWNC", "NHWC",
+                                 [[4, 5], [7, 4], [9, 3], [5, 1]])
 
   def testNCHWToNHWC2D(self):
-    x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=np.int32)
-    self._runPermuteAndCompare(x, "NCHW", "NHWC",
-                               [[7, 4], [4, 5], [5, 1], [9, 3]])
+    for dtype in {np.int32, np.int64}:
+      x = np.array([[7, 4], [9, 3], [4, 5], [5, 1]], dtype=dtype)
+      self._runPermuteAndCompare(x, "NCHW", "NHWC",
+                                 [[7, 4], [4, 5], [5, 1], [9, 3]])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py
index 77bb839409f0c323ff6ed2c8d6bd105d3003b398..9671ae0ae973ff82d22744a1feb9b4293d94bbdd 100644
--- a/tensorflow/compiler/tests/placeholder_test.py
+++ b/tensorflow/compiler/tests/placeholder_test.py
@@ -33,7 +33,7 @@ class PlaceholderTest(xla_test.XLATestCase):
       ph = array_ops.placeholder_with_default(v, shape=[])
       out = ph * 2
       sess.run(variables.variables_initializer([v]))
-      self.assertEqual(8.0, sess.run(out))
+      self.assertEqual(8.0, self.evaluate(out))
 
   def test_placeholder_with_default_fed(self):
     with self.cached_session() as sess, self.test_scope():
diff --git a/tensorflow/compiler/tests/powersign_test.py b/tensorflow/compiler/tests/powersign_test.py
index 86536da7fed0e2309beb32fee9c7c605491592ed..5b35c20027700b34500a31e174061d7087094b61 100644
--- a/tensorflow/compiler/tests/powersign_test.py
+++ b/tensorflow/compiler/tests/powersign_test.py
@@ -91,8 +91,8 @@ class PowerSignTest(xla_test.XLATestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 7 steps of powersign
         # first 4 steps with positive gradient
@@ -125,8 +125,8 @@ class PowerSignTest(xla_test.XLATestCase):
           )
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testDense(self):
     decay_steps = 10
diff --git a/tensorflow/compiler/tests/proximal_adagrad_test.py b/tensorflow/compiler/tests/proximal_adagrad_test.py
index c41b4171e26af4f7ad0237d7407a5b3691299595..63cc51a470164915b2614a06d18ca1850bb64a3c 100644
--- a/tensorflow/compiler/tests/proximal_adagrad_test.py
+++ b/tensorflow/compiler/tests/proximal_adagrad_test.py
@@ -45,15 +45,17 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([0.0, 0.0], var0.eval())
-      self.assertAllClose([0.0, 0.0], var1.eval())
+      self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+      self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Adagrad.
       for _ in range(3):
         update.run()
 
-      self.assertAllClose(np.array([-2.60260963, -4.29698515]), var0.eval())
-      self.assertAllClose(np.array([-0.28432083, -0.56694895]), var1.eval())
+      self.assertAllClose(
+          np.array([-2.60260963, -4.29698515]), self.evaluate(var0))
+      self.assertAllClose(
+          np.array([-0.28432083, -0.56694895]), self.evaluate(var1))
       opt_vars = opt.variables()
       self.assertStartsWith(opt_vars[0].name, var0._shared_name)
       self.assertStartsWith(opt_vars[1].name, var1._shared_name)
@@ -74,14 +76,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Adagrad.
       for _ in range(3):
         update.run()
-      self.assertAllClose(np.array([-1.60261, -2.296985]), var0.eval())
-      self.assertAllClose(np.array([3.715679, 2.433051]), var1.eval())
+      self.assertAllClose(np.array([-1.60261, -2.296985]), self.evaluate(var0))
+      self.assertAllClose(np.array([3.715679, 2.433051]), self.evaluate(var1))
 
   def testProximalAdagradWithL1(self):
     with self.cached_session(), self.test_scope():
@@ -98,14 +100,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps Proximal Adagrad
       for _ in range(10):
         update.run()
-      self.assertAllClose(np.array([-6.663634, -9.190331]), var0.eval())
-      self.assertAllClose(np.array([2.959304, 1.029232]), var1.eval())
+      self.assertAllClose(np.array([-6.663634, -9.190331]), self.evaluate(var0))
+      self.assertAllClose(np.array([2.959304, 1.029232]), self.evaluate(var1))
 
   def testProximalAdagradWithL1_L2(self):
     with self.cached_session(), self.test_scope():
@@ -122,15 +124,15 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps Proximal Adagrad.
       for _ in range(10):
         update.run()
 
-      self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval())
-      self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval())
+      self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0))
+      self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1))
 
   def applyOptimizer(self, opt, steps=5):
     var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
@@ -141,14 +143,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
     update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
 
-    self.assertAllClose([1.0, 2.0], var0.eval())
-    self.assertAllClose([3.0, 4.0], var1.eval())
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
     # Run ProximalAdagrad for a few steps
     for _ in range(steps):
       update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def testEquivAdagradwithoutRegularization(self):
     with self.cached_session(), self.test_scope():
diff --git a/tensorflow/compiler/tests/proximal_gradient_descent_test.py b/tensorflow/compiler/tests/proximal_gradient_descent_test.py
index 3d808e6b8a71ef9fa60b671d07bfd907e9f58efc..5aec433be765dd0a04bd7ab10d5c39a5a7f48c5c 100644
--- a/tensorflow/compiler/tests/proximal_gradient_descent_test.py
+++ b/tensorflow/compiler/tests/proximal_gradient_descent_test.py
@@ -42,15 +42,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([0.0, 0.0], var0.eval())
-      self.assertAllClose([0.0, 0.0], var1.eval())
+      self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+      self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Gradient Descent.
       for _ in range(3):
         update.run()
 
-      self.assertAllClose(np.array([-0.9, -1.8]), var0.eval())
-      self.assertAllClose(np.array([-0.09, -0.18]), var1.eval())
+      self.assertAllClose(np.array([-0.9, -1.8]), self.evaluate(var0))
+      self.assertAllClose(np.array([-0.09, -0.18]), self.evaluate(var1))
 
   def testProximalGradientDescentwithoutRegularization2(self):
     with self.cached_session(), self.test_scope():
@@ -64,15 +64,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Gradient Descent
       for _ in range(3):
         update.run()
 
-      self.assertAllClose(np.array([0.1, 0.2]), var0.eval())
-      self.assertAllClose(np.array([3.91, 2.82]), var1.eval())
+      self.assertAllClose(np.array([0.1, 0.2]), self.evaluate(var0))
+      self.assertAllClose(np.array([3.91, 2.82]), self.evaluate(var1))
 
   def testProximalGradientDescentWithL1(self):
     with self.cached_session(), self.test_scope():
@@ -86,15 +86,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps proximal gradient descent.
       for _ in range(10):
         update.run()
 
-      self.assertAllClose(np.array([-1.988, -3.988001]), var0.eval())
-      self.assertAllClose(np.array([3.67, 2.37]), var1.eval())
+      self.assertAllClose(np.array([-1.988, -3.988001]), self.evaluate(var0))
+      self.assertAllClose(np.array([3.67, 2.37]), self.evaluate(var1))
 
   def testProximalGradientDescentWithL1_L2(self):
     with self.cached_session(), self.test_scope():
@@ -108,15 +108,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps Proximal Gradient Descent
       for _ in range(10):
         update.run()
 
-      self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval())
-      self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval())
+      self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0))
+      self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1))
 
   def applyOptimizer(self, opt, steps=5):
     var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
@@ -127,14 +127,14 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
     update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
 
-    self.assertAllClose([1.0, 2.0], var0.eval())
-    self.assertAllClose([3.0, 4.0], var1.eval())
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
     # Run ProximalAdagrad for a few steps
     for _ in range(steps):
       update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def testEquivGradientDescentwithoutRegularization(self):
     with self.cached_session(), self.test_scope():
diff --git a/tensorflow/compiler/tests/qr_op_test.py b/tensorflow/compiler/tests/qr_op_test.py
index 236b1b881dcaffc1a5b0c6395f0605c1d7ef0269..b4d4193e35f9e0e3b23d0242ed076dd811f4ee2b 100644
--- a/tensorflow/compiler/tests/qr_op_test.py
+++ b/tensorflow/compiler/tests/qr_op_test.py
@@ -63,7 +63,7 @@ class QrOpTest(xla_test.XLATestCase, parameterized.TestCase):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
     xx = math_ops.matmul(x, x, adjoint_a=True)
     identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0)
-    precision = self.AdjustedNorm(xx.eval() - identity.eval())
+    precision = self.AdjustedNorm(xx.eval() - self.evaluate(identity))
     self.assertTrue(np.all(precision < 5.0))
 
   def _test(self, dtype, shape, full_matrices):
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index 36ef6ed5fee78bad10bb1ee0bf3eb7824d05c206..97ffad34c00b8ec16eb1ec109ba5d980e0ce673d 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -46,9 +46,9 @@ class RandomOpsTest(xla_test.XLATestCase):
 
       # The random-number generator, if working correctly, should produce the
       # same output multiple times with low probability.
-      y = sess.run(x)
-      z = sess.run(x)
-      w = sess.run(x)
+      y = self.evaluate(x)
+      z = self.evaluate(x)
+      w = self.evaluate(x)
 
       # We use exact equality here. If the random-number generator is producing
       # deterministic output, all three outputs will be bitwise identical.
@@ -83,7 +83,7 @@ class RandomOpsTest(xla_test.XLATestCase):
         with self.test_scope():
           x = random_ops.random_uniform(
               shape=[1000], dtype=dtype, minval=-2, maxval=33)
-        y = sess.run(x)
+        y = self.evaluate(x)
         self.assertTrue((y >= -2).sum() == 1000)
         self.assertTrue((y < 33).sum() == 1000)
 
@@ -102,7 +102,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       with self.cached_session() as sess:
         with self.test_scope():
           x = random_ops.truncated_normal(shape=[count], dtype=dtype)
-        y = sess.run(x)
+        y = self.evaluate(x)
 
         def normal_cdf(x):
           return .5 * math.erfc(-x / math.sqrt(2))
@@ -111,7 +111,7 @@ class RandomOpsTest(xla_test.XLATestCase):
           return math.exp(-(x**2) / 2.) / math.sqrt(2 * math.pi)
 
         def probit(x, sess=sess):
-          return sess.run(special_math.ndtri(x))
+          return self.evaluate(special_math.ndtri(x))
 
         a = -2.
         b = 2.
@@ -148,7 +148,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       with self.test_scope():
         x = math_ops.range(1 << 16)
         shuffle = random_ops.random_shuffle(x)
-      result = sess.run(shuffle)
+      result = self.evaluate(shuffle)
       expected = range(1 << 16)
       # Compare sets to avoid randomness behavior changes but make sure still
       # have all the values.
@@ -159,7 +159,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       with self.test_scope():
         x = array_ops.diag(math_ops.range(20))
         shuffle = random_ops.random_shuffle(x)
-      result = sess.run(shuffle)
+      result = self.evaluate(shuffle)
       expected = np.diag(range(20)).flatten()
       # Compare sets to avoid randomness behavior changes but make sure still
       # have all the values.
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index dc119fb0f8a41a3772a8c9508bf2db657f57de88..d23fd125163d1afe8c7fd5e008d4b617ff4b2874 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include <random>
 #include <unordered_map>
 
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
@@ -2465,20 +2466,21 @@ TEST_F(OpTest, Pack) {
   });
 }
 
-// TODO(b/31741898): crashes on GPU.
 TEST_F(OpTest, Pad) {
   Repeatedly([this]() {
     auto type = Choose<DataType>(kAllXlaTypes);
     std::vector<int64> t_dims = RandomDims();
 
-    // TODO(b/31741996): re-enable DT_INT64 when bug is fixed.
-    // DataType tpaddings = Choose<DataType>({DT_INT32, DT_INT64});
-    DataType tpaddings = DT_INT32;
+    DataType tpaddings = Choose<DataType>({DT_INT32, DT_INT64});
     std::vector<int64> paddings_vec;
-    std::uniform_int_distribution<int> distribution(0, 7);
     for (int i = 0; i < t_dims.size(); ++i) {
-      paddings_vec.push_back(distribution(generator()));
-      paddings_vec.push_back(distribution(generator()));
+      std::uniform_int_distribution<int> pad_distribution(0, t_dims[i]);
+      int pad_size = pad_distribution(generator());
+      std::uniform_int_distribution<int> lower_distribution(0, pad_size);
+      int low_pad_size = lower_distribution(generator());
+      paddings_vec.push_back(low_pad_size);
+      paddings_vec.push_back(pad_size - low_pad_size);
+      t_dims[i] -= pad_size;
     }
     Tensor paddings;
     CHECK(
@@ -2687,6 +2689,37 @@ TEST_F(OpTest, Reverse) {
   });
 }
 
+TEST_F(OpTest, ReverseSequence) {
+  Repeatedly([this]() {
+    std::vector<int64> dims = RandomDims(/*min_rank=*/2);
+    auto type = Choose<DataType>(kAllXlaTypes);
+    int64 rank = dims.size();
+
+    // Choose random batch and sequence dimensions.
+    std::vector<int> shuffled_dim_ids(rank);
+    absl::c_iota(shuffled_dim_ids, 0);
+    absl::c_shuffle(shuffled_dim_ids, generator());
+    shuffled_dim_ids.resize(2);
+    int batch_dim = shuffled_dim_ids[0];
+    int seq_dim = shuffled_dim_ids[1];
+
+    int batch_size = dims[batch_dim];
+    int max_seq_len = dims[seq_dim];
+    std::vector<int32> seq_lens(batch_size);
+    std::uniform_int_distribution<int32> d(0, max_seq_len);
+    absl::c_generate(seq_lens, [&]() { return d(generator()); });
+
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("ReverseSequence")
+            .RandomInput(type, dims)
+            .Input(test::AsTensor<int32>(seq_lens))
+            .Attr("seq_dim", seq_dim)
+            .Attr("batch_dim", batch_dim)
+            .Attr("T", type)
+            .Attr("Tlen", DT_INT32));
+  });
+}
+
 TEST_F(OpTest, ReverseV2) {
   Repeatedly([this]() {
     auto type = Choose<DataType>(kAllXlaTypes);
@@ -3349,10 +3382,10 @@ int main(int argc, char** argv) {
   }
   // XLA devices register kernels at construction time; create all known devices
   // to make sure the kernels are registered.
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   TF_CHECK_OK(tensorflow::DeviceFactory::AddDevices(
       tensorflow::SessionOptions(), "", &devices));
-  tensorflow::DeviceMgr device_mgr(devices);
+  tensorflow::DeviceMgr device_mgr(std::move(devices));
 
   tensorflow::Device* ignored;
   TF_QCHECK_OK(
diff --git a/tensorflow/compiler/tests/reduce_ops_test.py b/tensorflow/compiler/tests/reduce_ops_test.py
index 132c59c32c9db0c8759bdbb31f8613c3ef88b485..e8fc81bbb5472669c408b8bbdbcdfcdcf461131f 100644
--- a/tensorflow/compiler/tests/reduce_ops_test.py
+++ b/tensorflow/compiler/tests/reduce_ops_test.py
@@ -91,6 +91,7 @@ class ReduceOpsTest(xla_test.XLATestCase, parameterized.TestCase):
       np.array([], dtype=np.bool).reshape(0, 3),
       np.array([[False, True, False], [True, True, False]]),
   ]
+  ONES = [np.ones([34000, 2])]
 
   def testReduceSumF32(self, index_dtype):
     self._testReduction(math_ops.reduce_sum, np.sum, np.float32, self.REAL_DATA,
@@ -149,6 +150,11 @@ class ReduceOpsTest(xla_test.XLATestCase, parameterized.TestCase):
     self._testReduction(math_ops.reduce_mean, np.mean, np.float32,
                         self.NONEMPTY_REAL_DATA, index_dtype)
 
+  def testReduceMeanF16(self, index_dtype):
+    if np.float16 in self.all_types:
+      self._testReduction(math_ops.reduce_mean, np.mean, np.float16, self.ONES,
+                          index_dtype)
+
   def testReduceMeanC64(self, index_dtype):
     self._testReduction(math_ops.reduce_mean, np.mean, np.complex64,
                         self.NONEMPTY_COMPLEX_DATA, index_dtype)
diff --git a/tensorflow/compiler/tests/rmsprop_test.py b/tensorflow/compiler/tests/rmsprop_test.py
index 8840a1329a907bddc6ef1cb6dd1c2a6d234def5c..dc3e90b4afa41c08d899ee195d42fb91678bad1c 100644
--- a/tensorflow/compiler/tests/rmsprop_test.py
+++ b/tensorflow/compiler/tests/rmsprop_test.py
@@ -76,7 +76,7 @@ class RmspropTest(xla_test.XLATestCase):
           rms_opt = rmsprop.RMSPropOptimizer(learning_rate, centered=centered)
           rms_update = rms_opt.apply_gradients(
               zip([grads0, grads1], [var0, var1]))
-          variables.global_variables_initializer().run()
+          self.evaluate(variables.global_variables_initializer())
 
           mg0 = rms_opt.get_slot(var0, "mg")
           self.assertEqual(mg0 is not None, centered)
@@ -92,12 +92,12 @@ class RmspropTest(xla_test.XLATestCase):
           self.assertTrue(mom1 is not None)
 
           # Fetch params to validate initial values
-          self.assertAllClose([1.0, 2.0], var0.eval())
-          self.assertAllClose([3.0, 4.0], var1.eval())
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
           # Run 3 steps of RMSProp
           for _ in range(3):
-            rms_update.run()
+            self.evaluate(rms_update)
 
             var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
                 var0_np,
@@ -118,14 +118,14 @@ class RmspropTest(xla_test.XLATestCase):
 
             # Validate updated params
             if centered:
-              self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-              self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-            self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-            self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-            self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-            self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-            self.assertAllCloseAccordingToType(var0_np, var0.eval())
-            self.assertAllCloseAccordingToType(var1_np, var1.eval())
+              self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+              self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+            self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+            self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+            self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+            self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py
index 897db384b7e8067b0460b5f344201f101a4d8479..17639bd8a755b9e9f5acc77979ac7a4149f112db 100644
--- a/tensorflow/compiler/tests/scan_ops_test.py
+++ b/tensorflow/compiler/tests/scan_ops_test.py
@@ -71,7 +71,7 @@ def handle_options(func, x, axis, exclusive, reverse):
 
 class CumsumTest(xla_test.XLATestCase):
 
-  valid_dtypes = [np.float32]
+  valid_dtypes = [np.float32, np.int32]
 
   def axis_dtypes(self):
     return set(self.int_types).intersection([np.int32, np.int64])
@@ -149,7 +149,7 @@ class CumsumTest(xla_test.XLATestCase):
 
 class CumprodTest(xla_test.XLATestCase):
 
-  valid_dtypes = [np.float32]
+  valid_dtypes = [np.float32, np.int32]
 
   def axis_dtypes(self):
     return set(self.int_types).intersection([np.int32, np.int64])
diff --git a/tensorflow/compiler/tests/sort_ops_test.py b/tensorflow/compiler/tests/sort_ops_test.py
index dbf4beb693ec1766e6b7b5daaed4be4e1d874fba..3e499c2fb176a6d63fe3590e18a4a90e461e096a 100644
--- a/tensorflow/compiler/tests/sort_ops_test.py
+++ b/tensorflow/compiler/tests/sort_ops_test.py
@@ -48,13 +48,32 @@ class XlaSortOpTest(xla_test.XLATestCase):
         self.assertAllClose(v, result, rtol=1e-3)
 
   def testSort(self):
-    supported_types = set([dtypes.bfloat16.as_numpy_dtype, np.float32])
+    supported_types = set(
+        [dtypes.bfloat16.as_numpy_dtype, np.float32, np.int32, np.uint32])
     for dtype in supported_types.intersection(self.numeric_types):
       x = np.arange(101, dtype=dtype)
       np.random.shuffle(x)
       self._assertOpOutputMatchesExpected(
           xla.sort, [x], expected=[np.arange(101, dtype=dtype)])
 
+  def testKeyValueSort(self):
+    supported_key_types = set(
+        [dtypes.bfloat16.as_numpy_dtype, np.float32, np.int32, np.uint32])
+    supported_value_types = set(
+        [dtypes.bfloat16.as_numpy_dtype, np.float32, np.int32, np.uint32,
+         dtypes.int64.as_numpy_dtype, dtypes.uint64.as_numpy_dtype])
+    for key_type in supported_key_types.intersection(self.numeric_types):
+      for value_type in supported_value_types.intersection(self.numeric_types):
+        x = np.arange(101, dtype=key_type)
+        np.random.shuffle(x)
+        y = (-x).astype(value_type)
+        self._assertOpOutputMatchesExpected(
+            xla.key_value_sort, [x, y],
+            expected=[
+                np.arange(101, dtype=key_type),
+                -np.arange(101, dtype=value_type)
+            ])
+
   def testTopK(self):
     supported_types = set(
         [dtypes.bfloat16.as_numpy_dtype, np.float32, np.int32, np.uint32])
diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
index e8741bc468585ff9fb049dcd87700f8048d74026..ee7ca7e6f196e114ff18e2597145e5c198980b08 100644
--- a/tensorflow/compiler/tests/stateless_random_ops_test.py
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -23,9 +23,9 @@ import math
 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
-from tensorflow.contrib import stateless
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import stateless_random_ops as stateless
 from tensorflow.python.ops.distributions import special_math
 from tensorflow.python.platform import test
 
@@ -33,8 +33,11 @@ from tensorflow.python.platform import test
 class StatelessRandomOpsTest(xla_test.XLATestCase):
   """Test cases for stateless random-number generator operators."""
 
-  def _random_types(self):
-    return self.float_types & {dtypes.float32, dtypes.float64}
+  def _random_types(self, include_int=False):
+    allowed_types = {dtypes.float32, dtypes.float64, dtypes.bfloat16}
+    if include_int:
+      allowed_types.update({dtypes.int32, dtypes.int64})
+    return self.all_tf_types & allowed_types
 
   def testDeterminism(self):
     # Stateless values should be equal iff the seeds are equal (roughly)
@@ -46,6 +49,11 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
       ]:
         for shape in (), (3,), (2, 5):
           for dtype in self._random_types():
+            # Skip bfloat16. The result of bfloat16 is truncated from 32-bit
+            # result. With different seeds, the 32-bit results are different,
+            # but the truncated 16-bit results might be the same.
+            if dtype == dtypes.bfloat16:
+              continue
             pure = stateless_op(shape, seed=seed_t, dtype=dtype)
             values = [(seed, pure.eval(feed_dict={
                 seed_t: seed
@@ -56,13 +64,16 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
 
   def testRandomUniformIsInRange(self):
     with self.cached_session() as sess, self.test_scope():
-      for dtype in self._random_types():
+      for dtype in self._random_types(include_int=True):
+        maxval = 1
+        if dtype.is_integer:
+          maxval = 100
         seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
         x = stateless.stateless_random_uniform(
-            shape=[1000], seed=seed_t, dtype=dtype)
+            shape=[1000], seed=seed_t, maxval=maxval, dtype=dtype)
         y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
         self.assertTrue(np.all(y >= 0))
-        self.assertTrue(np.all(y < 1))
+        self.assertTrue(np.all(y < maxval))
 
   def _chi_squared(self, x, bins):
     """Pearson's Chi-squared test."""
@@ -75,12 +86,18 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
   def testDistributionOfStatelessRandomUniform(self):
     """Use Pearson's Chi-squared test to test for uniformity."""
     with self.cached_session() as sess, self.test_scope():
-      for dtype in self._random_types():
+      for dtype in self._random_types(include_int=True):
         seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
         n = 1000
+        maxval = 1
+        if dtype.is_integer:
+          maxval = 100
         x = stateless.stateless_random_uniform(
-            shape=[n], seed=seed_t, dtype=dtype)
+            shape=[n], seed=seed_t, maxval=maxval, dtype=dtype)
         y = sess.run(x, {seed_t: [565656, 121212]})
+        if maxval > 1:
+          # Normalize y to range [0, 1).
+          y = y.astype(float) / maxval
         # Tests that the values are distributed amongst 10 bins with equal
         # probability. 16.92 is the Chi^2 value for 9 degrees of freedom with
         # p=0.05. This test is probabilistic and would be flaky if the random
@@ -121,7 +138,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
         # The constant 2.492 is the 5% critical value for the Anderson-Darling
         # test where the mean and variance are known. This test is probabilistic
         # so to avoid flakiness the seed is fixed.
-        self.assertTrue(self._anderson_darling(y) < 2.492)
+        self.assertTrue(self._anderson_darling(y.astype(float)) < 2.492)
 
   def testTruncatedNormalIsInRange(self):
     for dtype in self._random_types():
@@ -139,7 +156,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
           return math.exp(-(x**2) / 2.) / math.sqrt(2 * math.pi)
 
         def probit(x, sess=sess):
-          return sess.run(special_math.ndtri(x))
+          return self.evaluate(special_math.ndtri(x))
 
         a = -2.
         b = 2.
@@ -157,6 +174,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
         # Burkardt, John. "The Truncated Normal Distribution".
         # Department of Scientific Computing website. Florida State University.
         expected_mean = mu + (normal_pdf(alpha) - normal_pdf(beta)) / z * sigma
+        y = y.astype(float)
         actual_mean = np.mean(y)
         self.assertAllClose(actual_mean, expected_mean, atol=5e-4)
 
@@ -169,8 +187,8 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
             (alpha * normal_pdf(alpha) - beta * normal_pdf(beta)) / z) - (
                 (normal_pdf(alpha) - normal_pdf(beta)) / z)**2)
         actual_variance = np.var(y)
-        self.assertAllClose(actual_variance, expected_variance, rtol=1e-3)
-
+        self.assertAllClose(actual_variance, expected_variance,
+                            rtol=5e-3 if dtype == dtypes.bfloat16 else 1e-3)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index 78244d0b366d9128a4c59f786e4c5ac12e743b75..d7e26d79c4c054860ade5c8960a3bca984e020b0 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -79,7 +79,8 @@ class TensorArrayTest(xla_test.XLATestCase):
       c0 = w2.stack()
 
       self.assertAllEqual(
-          convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]), c0.eval())
+          convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]),
+          self.evaluate(c0))
 
   def testTensorArrayWritePack(self):
     for dtype in self.numeric_tf_types:
@@ -97,7 +98,7 @@ class TensorArrayTest(xla_test.XLATestCase):
 
       c0 = w2.stack()
 
-      self.assertAllEqual([3, 0, 1], c0.eval().shape)
+      self.assertAllEqual([3, 0, 1], self.evaluate(c0).shape)
 
   def _testTensorArrayWriteConcat(self, tf_dtype):
     with self.cached_session(), self.test_scope():
@@ -113,8 +114,8 @@ class TensorArrayTest(xla_test.XLATestCase):
       c0 = w2.concat()
 
       self.assertAllEqual(
-          convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0],
-                   [106.0, 107.0], [8.0, 9.0], [204.0, 205.0]]), c0.eval())
+          convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0], [106.0, 107.0],
+                   [8.0, 9.0], [204.0, 205.0]]), self.evaluate(c0))
 
   def testTensorArrayWriteConcat(self):
     for dtype in self.numeric_tf_types:
@@ -341,7 +342,7 @@ class TensorArrayTest(xla_test.XLATestCase):
         r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtype2, flow_in=w0.flow)
         with self.assertRaisesOpError("TensorArray dtype is "):
-          r0_bad.eval()
+          self.evaluate(r0_bad)
 
         # Test reading from a different index than the one we wrote to
         w0.read(1)
@@ -422,7 +423,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       w2 = h2.write(0, 5.0)
       r2 = w2.read(0)
       r = r1 + r2
-      self.assertAllClose(9.0, r.eval())
+      self.assertAllClose(9.0, self.evaluate(r))
 
   def _testTensorArrayGradientWriteReadType(self, dtype):
     with self.cached_session() as session, self.test_scope():
@@ -504,7 +505,7 @@ class TensorArrayTest(xla_test.XLATestCase):
                 [-0.5, 1.5],  # read(0) gradient
                 [20.0, 30.0, 40.0, 50.0],  # concat gradient
             ])
-      grad_vals = sess.run(grad_r)  # 2 + 2 entries
+      grad_vals = self.evaluate(grad_r)  # 2 + 2 entries
 
       self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0])
       self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1])
@@ -526,7 +527,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       with ops.control_dependencies([r0_readtwice]):
         r1_readtwice = w_readtwice.read(0)
 
-      self.assertAllEqual([1.0, -1.0], r1_readtwice.eval())
+      self.assertAllEqual([1.0, -1.0], self.evaluate(r1_readtwice))
 
   def _testTensorArrayGradientUnpackRead(self):
     with self.cached_session() as session, self.test_scope():
@@ -592,7 +593,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       s = ta.size()
-      self.assertAllEqual(3, s.eval())
+      self.assertAllEqual(3, self.evaluate(s))
 
   def testWriteCloseTensorArray(self):
     with self.cached_session(), self.test_scope():
@@ -722,7 +723,7 @@ class TensorArrayTest(xla_test.XLATestCase):
 
   #     r = acc2.stack()
   #     grad = gradients_impl.gradients(r, [x])[0]
-  #     self.assertAllClose(31.0, grad.eval())
+  #     self.assertAllClose(31.0, self.evaluate(grad))
 
   def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
     with self.cached_session() as session, self.test_scope():
@@ -912,7 +913,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertEqual(0, ta.size().eval())
       ta = ta.unstack(array_ops.zeros([0, 3, 5]))
       packed = ta.stack()
-      self.assertAllEqual([0, 3, 5], packed.eval().shape)
+      self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
       # Concatenating zero tensors along their first dimension gives a
       # first dimension of zero
       self.assertAllEqual([0, 5], ta.concat().eval().shape)
@@ -920,6 +921,34 @@ class TensorArrayTest(xla_test.XLATestCase):
   def testTensorArrayEvalEmptyWithDefault(self):
     self._testTensorArrayEvalEmptyWithDefault()
 
+  def _testTensorArrayScatterRead(self, tf_dtype):
+    with self.cached_session() as session, self.test_scope():
+      convert = _make_converter(tf_dtype)
+
+      ta = tensor_array_ops.TensorArray(
+          dtype=tf_dtype,
+          tensor_array_name="foo",
+          size=10)
+
+      indices = constant_op.constant([1, 8])
+      value = constant_op.constant(convert([[1.0, -1.0], [10.0, -10.0]]))
+      id0 = array_ops.placeholder(dtypes.int32)
+      id1 = array_ops.placeholder(dtypes.int32)
+
+      w = ta.scatter(indices, value)
+      r0 = w.read(id0)
+      r1 = w.read(id1)
+
+      # Test aggregation of read
+      read_vals = session.run([r0, r1], feed_dict={id0: 1, id1: 8})
+      self.assertAllEqual(convert([1.0, -1.0]), read_vals[0])
+      self.assertAllEqual(convert([10.0, -10.0]), read_vals[1])
+
+  def testTensorArrayScatterRead(self):
+    for dtype in self.numeric_tf_types:
+      self._testTensorArrayScatterRead(dtype)
+    self._testTensorArrayScatterRead(dtypes.bool)
+
   def testTensorArrayScatterReadAndGradients(self):
     with self.cached_session() as session, self.test_scope():
       ta = tensor_array_ops.TensorArray(
@@ -929,15 +958,18 @@ class TensorArrayTest(xla_test.XLATestCase):
 
       indices = constant_op.constant([1, 8])
       value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
+      id0 = array_ops.placeholder(dtypes.int32)
+      id1 = array_ops.placeholder(dtypes.int32)
 
       w = ta.scatter(indices, value)
-      r0 = w.read(1)
-      r1 = w.read(8)
+      r0 = w.read(id0)
+      r1 = w.read(id1)
 
       # Test combined gradients + aggregation of read(0).
       grad = gradients_impl.gradients(
           ys=[r0, r1], xs=[value], grad_ys=[[2.0, 3.0], [4.0, 5.0]])
-      read_vals, grad_vals = session.run([[r0, r1], grad])
+      read_vals, grad_vals = session.run([[r0, r1], grad],
+                                         feed_dict={id0: 1, id1: 8})
 
       self.assertEqual(len(read_vals), 2)
       self.assertEqual(len(grad_vals), 1)
@@ -1010,8 +1042,8 @@ class TensorArrayTest(xla_test.XLATestCase):
           (read0, read1, size0, size1))
 
       # Tests that the control dependencies was added and executed.
-      self.assertEqual(1, v0.eval())
-      self.assertEqual(1, v1.eval())
+      self.assertEqual(1, self.evaluate(v0))
+      self.assertEqual(1, self.evaluate(v1))
 
       # Tests correct TensorArray.
       self.assertEqual(read0_v, 0)
diff --git a/tensorflow/compiler/tests/tensor_list_ops_test.py b/tensorflow/compiler/tests/tensor_list_ops_test.py
index b556723eec77246c87cf88a48c17a307c35fd857..5c079d595c440cac644f5461154509abe7b1d1ed 100644
--- a/tensorflow/compiler/tests/tensor_list_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_list_ops_test.py
@@ -20,22 +20,13 @@ from __future__ import division
 from __future__ import print_function
 import numpy as np
 from tensorflow.compiler.tests import xla_test
-from tensorflow.python.client import session
-from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import list_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
 
 
 def scalar_shape():
diff --git a/tensorflow/compiler/tests/test_utils.py b/tensorflow/compiler/tests/test_utils.py
index 6abde18ea91f16d153a154b94effab037a911c6c..0e77dbf1a79d3dbacb77bab8b8e3df9bcc6287e1 100644
--- a/tensorflow/compiler/tests/test_utils.py
+++ b/tensorflow/compiler/tests/test_utils.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 
 def ConvertBetweenDataFormats(x, data_format_src, data_format_dst):
@@ -61,3 +62,14 @@ def PermuteDimsBetweenDataFormats(dims, data_format_src, data_format_dst):
   dim_map = {d: i for i, d in enumerate(data_format_src)}
   permuted_dims = [dims[dim_map[d]] for d in data_format_dst]
   return permuted_dims
+
+
+_JIT_WARMUP_ITERATIONS = 10
+
+
+def RunWithWarmup(sess, op_to_run, feed_dict, options=None, run_metadata=None):
+  """Runs a graph a few times to ensure that its clusters are compiled."""
+  for _ in xrange(0, _JIT_WARMUP_ITERATIONS):
+    sess.run(op_to_run, feed_dict, options=options)
+  return sess.run(
+      op_to_run, feed_dict, options=options, run_metadata=run_metadata)
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 77f6eee0cf8ddc9b76f150e1038bf66da34c5218..95c9e7ffd4651642781143c2c1940b0e51e1e470 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -358,6 +358,11 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([[-0.05, 6.05, 5]], dtype=dtype),
           expected=np.array([[0, 6, 5]], dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          nn_ops.leaky_relu,
+          np.array([[-2, -1, 0, 1, 2]], dtype=dtype),
+          expected=np.array([[-0.4, -0.2, 0.0, 1.0, 2.0]], dtype=dtype))
+
       self._assertOpOutputMatchesExpected(
           nn_ops.softmax,
           np.array([1, 2, 3, 4], dtype=dtype),
@@ -476,6 +481,72 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([-1, -0.5, 0, 0.3], dtype=dtype),
           expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype))
 
+      def quantize_and_dequantize_v2_round_half_up(x):
+        return array_ops.quantize_and_dequantize_v2(
+            x,
+            -1,
+            1.0,
+            signed_input=True,
+            num_bits=8,
+            range_given=True,
+            round_mode="HALF_UP")
+
+      self._assertOpOutputMatchesExpected(
+          quantize_and_dequantize_v2_round_half_up,
+          np.array([-0.8, -0.5, 0, 0.3, 0.8, -2, 33], dtype=dtype),
+          expected=np.array([
+              -102.0 / 127,
+              -63.0 / 127,
+              0,
+              38.0 / 127,
+              102.0 / 127,
+              -128.0 / 127,
+              1,
+          ],
+                            dtype=dtype))
+
+      def quantize_and_dequantize_v2_round_half_to_even(x):
+        return array_ops.quantize_and_dequantize_v2(
+            x,
+            -1.0,
+            1.0,
+            signed_input=True,
+            num_bits=8,
+            range_given=True,
+            round_mode="HALF_TO_EVEN")
+
+      self._assertOpOutputMatchesExpected(
+          quantize_and_dequantize_v2_round_half_to_even,
+          np.array(
+              [
+                  -0.8,
+                  # The -0.5 should become -63.5 after scaling and with
+                  # rounding this should become -64. But with the test
+                  # unary_ops_test_cpu_ondemand, this fails as the result
+                  # before scaling becomes -63.499996 and gets rounded to -63.
+                  # TODO(sreenik): Some one more familiar with this test needs
+                  # to take a look and resolve this. This works on all other
+                  # variations of the platform like cpu, and gpu.
+                  # -0.5,
+                  0,
+                  0.3,
+                  0.8,
+                  -2,
+                  33
+              ],
+              dtype=dtype),
+          expected=np.array(
+              [
+                  -102.0 / 127,
+                  # -64.0 / 127,
+                  0,
+                  38.0 / 127,
+                  102.0 / 127,
+                  -128.0 / 127,
+                  1,
+              ],
+              dtype=dtype))
+
       def quantize_and_dequantize_v3(x):
         return array_ops.quantize_and_dequantize_v3(
             x, -127, 127, num_bits=8, signed_input=True, range_given=False)
@@ -724,6 +795,15 @@ class UnaryOpsTest(xla_test.XLATestCase):
         lambda x: array_ops.bitcast(x, dtypes.int32),
         np.array([1e-45, 1.0], np.float32),
         expected=np.array([1, 0x3f800000], np.int32))
+    if np.int64 in self.numeric_types:
+      self._assertOpOutputMatchesExpected(
+          lambda x: array_ops.bitcast(x, dtypes.int64),
+          np.array([1, 0x100000003f800000], np.uint64),
+          expected=np.array([1, 0x100000003f800000], np.int64))
+      self._assertOpOutputMatchesExpected(
+          lambda x: array_ops.bitcast(x, dtypes.uint64),
+          np.array([1, 0x100000003f800000], np.int64),
+          expected=np.array([1, 0x100000003f800000], np.uint64))
 
   def testInvertPermutation(self):
     self._assertOpOutputMatchesExpected(
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index dd2c252d383bca9c59033ac07e442b487e4975a6..fcd7ac5ba1ca5049246e93e6f5f76746fb28c6b8 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -40,6 +40,19 @@ from tensorflow.python.training.gradient_descent import GradientDescentOptimizer
 class VariableOpsTest(xla_test.XLATestCase):
   """Test cases for resource variable operators."""
 
+  def testWriteEmptyShape(self):
+    # Verifies that we can pass an uninitialized variable with an empty shape,
+    # assign it a value, and successfully return it.
+    for dtype in self.numeric_types:
+      with self.test_session() as sess, self.test_scope():
+        zeros = np.zeros([3, 0], dtype=dtype)
+        v = resource_variable_ops.ResourceVariable(zeros)
+        p = array_ops.placeholder(dtype)
+        x = v.assign(p)
+        with ops.control_dependencies([x]):
+          y = v.read_value()
+        self.assertAllClose(zeros, sess.run(y, {p: zeros}))
+
   def testOneWriteOneOutput(self):
     # Regression test for a bug where computations with one non-constant
     # output and one variable update were mishandled.
@@ -64,7 +77,7 @@ class VariableOpsTest(xla_test.XLATestCase):
         sess.run(variables.variables_initializer([v]))
         x = v.sparse_read(2)
         self.assertAllClose(
-            np.array([8j, 9, 10, 11]).astype(dtype), sess.run(x))
+            np.array([8j, 9, 10, 11]).astype(dtype), self.evaluate(x))
 
   def testSparseRead1DIndices(self):
     for dtype in self.numeric_types:
@@ -76,7 +89,7 @@ class VariableOpsTest(xla_test.XLATestCase):
         x = v.sparse_read([2, 1])
         self.assertAllClose(
             np.array([[8, 9, 10, 11], [4, 5, 6j, 7]]).astype(dtype),
-            sess.run(x))
+            self.evaluate(x))
 
   def testSparseRead2DIndices(self):
     for dtype in self.numeric_types:
@@ -89,7 +102,7 @@ class VariableOpsTest(xla_test.XLATestCase):
         self.assertAllClose(
             np.array([[[8, 9, 10, 11], [4, 5, 6, 7]],
                       [[0, 1, 2j, 3], [8, 9, 10, 11]]]).astype(dtype),
-            sess.run(x))
+            self.evaluate(x))
 
   def testSparseRead2DIndices3DTensor(self):
     for dtype in self.numeric_types:
@@ -102,9 +115,9 @@ class VariableOpsTest(xla_test.XLATestCase):
         x = v.sparse_read([[2, 1], [3, 0]])
         self.assertAllClose(
             np.array(
-                [[[[20, 21, 22], [23, 24j, 25]], [[10, 11, 12], [13, 14, 15]]
-                 ], [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]]
-                ],).astype(dtype), sess.run(x))
+                [[[[20, 21, 22], [23, 24j, 25]], [[10, 11, 12], [13, 14, 15]]],
+                 [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]]
+                ],).astype(dtype), self.evaluate(x))
 
   def testShape(self):
     for dtype in self.numeric_types:
@@ -216,7 +229,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_add(
               handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertAllEqual(sess.run(read), [[3], [7]])
+      self.assertAllEqual(self.evaluate(read), [[3], [7]])
 
   def testScatterSub(self):
     with self.test_session() as sess, self.test_scope():
@@ -229,7 +242,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_sub(
               handle, [1], constant_op.constant([[2]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertAllEqual(sess.run(read), [[4], [-1]])
+      self.assertAllEqual(self.evaluate(read), [[4], [-1]])
 
   def testScatterMul(self):
     with self.test_session() as sess, self.test_scope():
@@ -242,7 +255,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_mul(
               handle, [0], constant_op.constant([[5]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[5]])
+      self.assertEqual(self.evaluate(read), [[5]])
 
   def testScatterDiv(self):
     with self.test_session() as sess, self.test_scope():
@@ -255,7 +268,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_div(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertAllEqual(sess.run(read), [[2]])
+      self.assertAllEqual(self.evaluate(read), [[2]])
 
   def testScatterMin(self):
     with self.test_session() as sess, self.test_scope():
@@ -268,7 +281,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_min(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterMax(self):
     with self.test_session() as sess, self.test_scope():
@@ -281,7 +294,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_max(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[6]])
+      self.assertEqual(self.evaluate(read), [[6]])
 
   def testScatterUpdate(self):
     with self.test_session() as sess, self.test_scope():
@@ -294,7 +307,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_update(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterAddScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -307,7 +320,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_add(
               handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterSubScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -320,7 +333,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_sub(
               handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[-1]])
+      self.assertEqual(self.evaluate(read), [[-1]])
 
   def testScatterMulScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -333,7 +346,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_mul(
               handle, [0], constant_op.constant(5, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[5]])
+      self.assertEqual(self.evaluate(read), [[5]])
 
   def testScatterDivScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -346,7 +359,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_div(
               handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[2]])
+      self.assertEqual(self.evaluate(read), [[2]])
 
   def testScatterMinScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -359,7 +372,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_min(
               handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterMaxScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -372,7 +385,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_max(
               handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[6]])
+      self.assertEqual(self.evaluate(read), [[6]])
 
   def testScatterNdAddOps(self):
     with self.test_session() as sess, self.test_scope():
@@ -387,7 +400,7 @@ class VariableOpsTest(xla_test.XLATestCase):
       sess.run(gen_state_ops.resource_scatter_nd_add(handle, indices, updates))
       read = resource_variable_ops.read_variable_op(
           handle, dtype=dtypes.float32)
-      self.assertAllClose(expected, sess.run(read))
+      self.assertAllClose(expected, self.evaluate(read))
 
   def testScatterNdUpdateAddOps(self):
     with self.test_session() as sess, self.test_scope():
@@ -403,7 +416,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           gen_state_ops.resource_scatter_nd_update(handle, indices, updates))
       read = resource_variable_ops.read_variable_op(
           handle, dtype=dtypes.float32)
-      self.assertAllClose(expected, sess.run(read))
+      self.assertAllClose(expected, self.evaluate(read))
 
 
 class StridedSliceAssignChecker(object):
diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py
index 28d61fb07dcb665fa0dbe3f3e566e291e24fa662..ef55292b1be91a731ec556d7efa9cdf1a696e5cc 100644
--- a/tensorflow/compiler/tests/xla_device_test.py
+++ b/tensorflow/compiler/tests/xla_device_test.py
@@ -81,7 +81,7 @@ class XlaDeviceTest(xla_test.XLATestCase):
     with self.cached_session() as sess:
       with self.test_scope():
         x = gen_control_flow_ops.control_trigger()
-      sess.run(x)
+      self.evaluate(x)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 3f631f91ec442c149b3ea4df3826d98b0419a76f..5a0d9b9af9d55a8dee809d3cf909bce39c3b8b6c 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -9,6 +9,7 @@ package_group(
         "//tensorflow/compiler/jit/...",
         "//tensorflow/compiler/tests/...",
         "//tensorflow/compiler/tf2xla/...",
+        "//tensorflow/contrib/compiler/...",
     ],
 )
 
@@ -166,6 +167,7 @@ cc_library(
         "xla_compilation_device.cc",
         "xla_compiler.cc",
         "xla_context.cc",
+        "xla_expression.cc",
         "xla_helpers.cc",
         "xla_op_kernel.cc",
         "xla_op_registry.cc",
@@ -180,6 +182,7 @@ cc_library(
         "xla_compilation_device.h",
         "xla_compiler.h",
         "xla_context.h",
+        "xla_expression.h",
         "xla_helpers.h",
         "xla_op_kernel.h",
         "xla_op_registry.h",
@@ -193,6 +196,8 @@ cc_library(
         ":sharding_util",
         ":side_effect_util",
         ":tf2xla_util",
+        "//tensorflow/compiler/jit:flags",
+        "//tensorflow/compiler/jit:xla_cluster_util",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -200,13 +205,13 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -214,7 +219,10 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
     alwayslink = 1,
@@ -308,6 +316,7 @@ tf_cc_test(
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
@@ -359,8 +368,12 @@ tf_cc_test(
 
 tf_cc_test(
     name = "xla_compiler_test",
-    srcs = ["xla_compiler_test.cc"],
+    srcs = [
+        "xla_compiler_test.cc",
+        "xla_expression_test.cc",
+    ],
     deps = [
+        ":common",
         ":side_effect_util",
         ":xla_compiler",
         "//tensorflow/cc:cc_ops",
@@ -383,6 +396,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -412,6 +426,7 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:ops",
+        "//tensorflow/compiler/jit:xla_cluster_util",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:ops",
@@ -424,21 +439,15 @@ cc_library(
     name = "dump_graph",
     srcs = [
         "dump_graph.cc",
-        "dump_graph_flags.cc",
-        "dump_graph_flags.h",
     ],
     hdrs = [
         "dump_graph.h",
     ],
     deps = [
-        "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/core:framework",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
+        "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
-        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -661,5 +670,6 @@ cc_library(
     hdrs = ["side_effect_util.h"],
     deps = [
         "//tensorflow/core:core_cpu",
+        "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index 027ca6d2d2f616177d91d9d57d1ff373bab2a754..a57095f91e43f6b31b58e5a5f36331241451b545 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 
+#include "absl/algorithm/container.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
@@ -67,25 +68,18 @@ Status BackwardsConstAnalysis(const Graph& g,
     }
 
     // Mark any compile-time constant operator arguments as const.
-    const std::unordered_set<string>* const_inputs =
-        XlaOpRegistry::CompileTimeConstantInputs(node->type_string());
-    if (!const_inputs || const_inputs->empty()) return;
+    std::vector<int> const_input_idxs;
+    status = XlaOpRegistry::CompileTimeConstantInputs(
+        node->def(), node->op_def(), &const_input_idxs);
 
-    NameRangeMap input_name_ranges;
-    status =
-        NameRangesForNode(*node, node->op_def(), &input_name_ranges, nullptr);
-    if (!status.ok()) return;
-
-    for (const string& input : *const_inputs) {
-      auto name_range = input_name_ranges.find(input);
-      if (name_range == input_name_ranges.end()) continue;
+    if (!status.ok()) {
+      return;
+    }
 
-      for (Edge const* edge : node->in_edges()) {
-        if (edge->dst_input() >= name_range->second.first &&
-            edge->dst_input() < name_range->second.second &&
-            edge_filter(*edge)) {
-          (*compile_time_const_nodes)[edge->src()->id()] = true;
-        }
+    for (Edge const* edge : node->in_edges()) {
+      if (absl::c_binary_search(const_input_idxs, edge->dst_input()) &&
+          edge_filter(*edge)) {
+        (*compile_time_const_nodes)[edge->src()->id()] = true;
       }
     }
   };
diff --git a/tensorflow/compiler/tf2xla/const_analysis_test.cc b/tensorflow/compiler/tf2xla/const_analysis_test.cc
index 56065be894697bc72ecc0089c665c19aafee7bf8..40c6d0e01701d9104a200d9ea27706a0a7c12146 100644
--- a/tensorflow/compiler/tf2xla/const_analysis_test.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -107,5 +108,54 @@ TEST(ConstAnalysisTest, DontFollowControlDependencies) {
   EXPECT_EQ(const_args, std::vector<bool>({false, true}));
 }
 
+TEST(ConstAnalysisTest, RespectExplicitAttr_0) {
+  Scope root = Scope::NewRootScope();
+
+  Output arg0 = ops::_Arg(root.WithOpName("Arg0"), DT_INT32, 0);
+  Output arg1 = ops::_Arg(root.WithOpName("Arg1"), DT_INT32, 1);
+  Output c1 =
+      ops::Const(root.WithOpName("c1").WithControlDependencies(arg0), 1, {1});
+  Output add = ops::Add(root, arg1, c1);
+
+  // Force const analysis to pretend that the shape argument to `reshape` does
+  // not need to be a constant.
+  Output reshape = ops::Reshape(root, arg1, add);
+  reshape.node()->AddAttr(kXlaCompileTimeConstantInputsAttr,
+                          std::vector<string>());
+
+  Graph graph(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph));
+
+  std::vector<bool> const_args(2, false);
+  TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args,
+                                      /*compile_time_const_nodes=*/nullptr));
+
+  EXPECT_EQ(const_args, std::vector<bool>({false, false}));
+}
+
+TEST(ConstAnalysisTest, RespectExplicitAttr_1) {
+  Scope root = Scope::NewRootScope();
+
+  Output arg0 = ops::_Arg(root.WithOpName("Arg0"), DT_INT32, 0);
+  Output c1 =
+      ops::Const(root.WithOpName("c1").WithControlDependencies(arg0), 1, {1});
+  Output add = ops::Add(root, arg0, c1);
+
+  // Force const analysis to pretend that the first argument to `add` needs to
+  // be a constant.
+  std::vector<string> add_constant_inputs;
+  add_constant_inputs.push_back("x");
+  add.node()->AddAttr(kXlaCompileTimeConstantInputsAttr, add_constant_inputs);
+
+  Graph graph(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph));
+
+  std::vector<bool> const_args(1, false);
+  TF_ASSERT_OK(BackwardsConstAnalysis(graph, &const_args,
+                                      /*compile_time_const_nodes=*/nullptr));
+
+  EXPECT_EQ(const_args, std::vector<bool>({true}));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
index 380c6a7e23da92d949b26876836b999bf6406c6c..64fdbbebc65bff4ed0b965fcdd534cc9696472b6 100644
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ b/tensorflow/compiler/tf2xla/dump_graph.cc
@@ -18,87 +18,26 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 
-#include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/tf2xla/dump_graph_flags.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 namespace dump_graph {
 
-namespace {
-
-struct NameCounts {
-  mutex counts_mutex;
-  std::unordered_map<string, int> counts;
-};
-
-string MakeUniqueFilename(string name) {
-  static NameCounts& instance = *new NameCounts;
-
-  // Remove illegal characters from `name`.
-  for (int i = 0; i < name.size(); ++i) {
-    char ch = name[i];
-    if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') {
-      name[i] = '_';
-    }
-  }
-
-  int count;
-  {
-    mutex_lock lock(instance.counts_mutex);
-    count = instance.counts[name]++;
-  }
-
-  string filename = name;
-  if (count > 0) {
-    absl::StrAppend(&filename, "_", count);
-  }
-  absl::StrAppend(&filename, ".pbtxt");
-  return filename;
-}
-
-string WriteTextProtoToUniqueFile(
-    Env* env, const string& name, const char* proto_type,
-    const ::tensorflow::protobuf::Message& proto) {
-  const string& dirname =
-      legacy_flags::GetDumpGraphFlags()->tf_dump_graph_prefix;
-  Status status = env->RecursivelyCreateDir(dirname);
-  if (!status.ok()) {
-    LOG(WARNING) << "Failed to create " << dirname << " for dumping "
-                 << proto_type << ": " << status;
-    return "(unavailable)";
-  }
-  string filepath = absl::StrCat(dirname, "/", MakeUniqueFilename(name));
-  status = WriteTextProto(Env::Default(), filepath, proto);
-  if (!status.ok()) {
-    LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath
-                 << " : " << status;
-    return "(unavailable)";
-  }
-  LOG(INFO) << "Dumped " << proto_type << " to " << filepath;
-  return filepath;
-}
-
-}  // anonymous namespace
-
 string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
-  return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef",
-                                    graph_def);
+  return tensorflow::DumpGraphDefToFile(
+      name, graph_def, GetDumpGraphFlags()->tf_dump_graph_prefix);
 }
 
 string DumpGraphToFile(const string& name, Graph const& graph,
                        const FunctionLibraryDefinition* flib_def) {
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-  if (flib_def) {
-    *graph_def.mutable_library() = flib_def->ToProto();
-  }
-  return DumpGraphDefToFile(name, graph_def);
+  return tensorflow::DumpGraphToFile(name, graph, flib_def,
+                                     GetDumpGraphFlags()->tf_dump_graph_prefix);
 }
 
 string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
-  return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef);
+  return tensorflow::DumpFunctionDefToFile(
+      name, fdef, GetDumpGraphFlags()->tf_dump_graph_prefix);
 }
 
 }  // namespace dump_graph
diff --git a/tensorflow/compiler/tf2xla/dump_graph_flags.cc b/tensorflow/compiler/tf2xla/dump_graph_flags.cc
deleted file mode 100644
index a6c908ba011afb90fabacc855df8c6afbb35d254..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/dump_graph_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's dump_graph module.
-
-#include <mutex>
-#include <vector>
-
-#include "tensorflow/compiler/tf2xla/dump_graph_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static DumpGraphFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new DumpGraphFlags;
-  flags->tf_dump_graph_prefix = "/tmp/";
-  flag_list = new std::vector<Flag>({
-      Flag("tf_dump_graph_prefix", &flags->tf_dump_graph_prefix,
-           "Path prefix to which graphs dumped during debugging should be "
-           "written."),
-  });
-  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with the XLA bridge's
-// dump_graph module.
-void AppendDumpGraphFlags(std::vector<Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the DumpGraphFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-DumpGraphFlags* GetDumpGraphFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/dump_graph_flags.h b/tensorflow/compiler/tf2xla/dump_graph_flags.h
deleted file mode 100644
index 80a3307d920f2cc3d668d507786a02e43589f86f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/dump_graph_flags.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_
-#define TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_
-
-// Legacy flags for the XLA bridge's dump_graph module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with the XLA bridge's
-// dump_graph module.
-void AppendDumpGraphFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with the XLA bridge's
-// dump_graph module.
-typedef struct {
-  string tf_dump_graph_prefix;  // Path prefix to which graphs dumped during
-                                // debugging should be written.
-} DumpGraphFlags;
-
-// Return a pointer to the DumpGraphFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-DumpGraphFlags* GetDumpGraphFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc
index db256e577a1f3dd38e04d102f60182023b9d43b2..c693e42d26712d55852f45c806215fc1f1b9a030 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc
@@ -339,6 +339,7 @@ Status Conditional::AddSwitch(Node* s) {
         DebugString(switch_predicate_), " vs ", DebugString(predicate), ").");
   }
   switches_.insert(s);
+  parent_->AddSwitchId(s->id());
   return Status::OK();
 }
 
@@ -695,6 +696,12 @@ Status Conditional::BuildIfNode(Graph* graph,
   VLOG(3) << "Build output type: " << DataTypeVectorString(out_type);
 
   builder.Attr("Tcond", DT_BOOL);
+  string outside_compilation;
+  if (GetNodeAttr(predicate_.node->def(), kXlaOutsideCompilationAttrName,
+                  &outside_compilation)
+          .ok()) {
+    builder.Attr(kXlaOutsideCompilationAttrName, outside_compilation);
+  }
   builder.Device(predicate_.node->assigned_device_name());
   // Conditional should be the first input ...
   builder.Input(NodeDefBuilder::NodeOut(predicate_.node->name(),
@@ -1179,7 +1186,7 @@ Status FunctionalizeCond::DetermineAncestorState(Node* dst) {
 }
 
 void FunctionalizeCond::DeleteReachableAndDeadNodes(
-    const std::vector<int>& switch_ids, const std::vector<Node*>& merge_order) {
+    const std::vector<Node*>& merge_order) {
   // Delete all nodes that have been extracted or are reachable from
   // deleted/dead nodes. The input and outgoing edges should have already been
   // removed.
@@ -1191,7 +1198,7 @@ void FunctionalizeCond::DeleteReachableAndDeadNodes(
 
   // All remaining Switch nodes are not reachable from a Merge node and
   // removed. This is to account for dead Switch nodes.
-  for (int s_id : switch_ids) {
+  for (int s_id : switch_ids_) {
     Node* s = graph_->FindNodeId(s_id);
     if (s == nullptr) continue;
     for (const Edge* e : s->out_edges()) {
@@ -1282,11 +1289,10 @@ Status FunctionalizeCond::FunctionalizeInternal() {
   //   reverse topological sorting);
   // * Record reverse topological for merge and switch nodes;
   std::vector<Node*> rev_topo_order;
-  std::vector<int> switch_ids;
   std::vector<Node*> merge_order;
   DFS(*graph_, nullptr, [&](Node* n) {
     if (IsSwitch(n)) {
-      switch_ids.push_back(n->id());
+      AddSwitchId(n->id());
     }
     if (IsMerge(n)) {
       merge_order.push_back(n);
@@ -1300,9 +1306,7 @@ Status FunctionalizeCond::FunctionalizeInternal() {
   if (merge_order.empty()) {
     // No merges mean no switch values consumed (as only considering values
     // fetchable as output of merge);
-    for (auto it = switch_ids.begin(); it != switch_ids.end(); ++it) {
-      graph_->RemoveNode(graph_->FindNodeId(*it));
-    }
+    DeleteReachableAndDeadNodes(merge_order);
     return Status::OK();
   }
 
@@ -1345,7 +1349,7 @@ Status FunctionalizeCond::FunctionalizeInternal() {
     if (VLOG_IS_ON(4)) DumpGraphWithCondState("after_extract");
   }
 
-  DeleteReachableAndDeadNodes(switch_ids, merge_order);
+  DeleteReachableAndDeadNodes(merge_order);
 
   return Status::OK();
 }
@@ -1365,6 +1369,10 @@ void FunctionalizeCond::DumpGraphWithCondState(const string& name) {
                    library_);
 }
 
+void FunctionalizeCond::AddSwitchId(int switch_id) {
+  switch_ids_.push_back(switch_id);
+}
+
 Status FunctionalizeCond::Functionalize(Graph* graph,
                                         FunctionLibraryDefinition* library) {
   VLOG(1) << "FunctionalizeCond::Functionalize";
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.h b/tensorflow/compiler/tf2xla/functionalize_cond.h
index 189980894073b1da1a12d1c284536336eb920900..8525d7af61b4471e53a9ae16b081060bfd234c9c 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond.h
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.h
@@ -166,6 +166,9 @@ class FunctionalizeCond {
   // Dump graph with the CondState annotated.
   void DumpGraphWithCondState(const string& name);
 
+  // Adds `switch_id` to the list of Switch node ids.
+  void AddSwitchId(int switch_id);
+
  private:
   FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library);
 
@@ -219,8 +222,7 @@ class FunctionalizeCond {
 
   // Deletes all nodes in/consumers reachable from switch/merge nodes that were
   // extracted.
-  void DeleteReachableAndDeadNodes(const std::vector<int>& switch_ids,
-                                   const std::vector<Node*>& merge_order);
+  void DeleteReachableAndDeadNodes(const std::vector<Node*>& merge_order);
 
   // Member used to unique the CondState to a unique CondId (AncestorState to a
   // unique AncestorId) and keep track of CondState/CondId
@@ -234,6 +236,8 @@ class FunctionalizeCond {
   Graph* graph_;
 
   friend class FunctionalizeCondTest;
+
+  std::vector<int> switch_ids_;
 };
 
 }  // namespace functionalize_cond
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 28e09d7b79a70bba7e05e9eccc26a65cc40324c6..3dfd3f854c8646ebbf06d3378201d22e8741b7eb 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -75,6 +75,25 @@ Status FunctionalizeControlFlow(Graph* graph,
   return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library);
 }
 
+Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def,
+                                           FunctionLibraryDefinition* library) {
+  return FunctionalizeControlFlowForGraphDef(/*lookup_library=*/nullptr,
+                                             graph_def, library);
+}
+
+Status FunctionalizeControlFlowForGraphDef(
+    const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def,
+    FunctionLibraryDefinition* library) {
+  FunctionDefLibrary function_lib = graph_def->library();
+  Graph graph(OpRegistry::Global());
+
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph({}, *graph_def, &graph));
+  TF_RETURN_IF_ERROR(FunctionalizeControlFlow(lookup_library, &graph, library));
+  graph.ToGraphDef(graph_def);
+  std::swap(*graph_def->mutable_library(), function_lib);
+  return Status::OK();
+}
+
 Status FunctionalizeControlFlowForFunction(
     const string& func_name, const string& new_func_name,
     const protobuf::Map<string, tensorflow::AttrValue>& attrs,
@@ -94,8 +113,9 @@ Status FunctionalizeControlFlowForFunction(
     }
   });
   const FunctionBody* body = flr->GetFunctionBody(handle);
+  Graph* g = body->graph;
 
-  // Check if the graph has Switch or Merge node before optimizing the graph.
+  // Check if the graph has Switch or Merge node.
   bool has_switch_or_merge = false;
   for (Node* n : body->graph->nodes()) {
     if (n->type_string() == "Switch" || n->type_string() == "Merge") {
@@ -108,59 +128,14 @@ Status FunctionalizeControlFlowForFunction(
   // in function body. We still need to rewrite those functions and modify
   // corresponding nodes.
 
-  // Call graph optimizer. The most important optimization we need is constant
-  // folding, which will replace ops like Shape/BroadcastGradientArgs with
-  // constant shape input. Without this optimization, those ops might become
-  // dynamic input for then/else body function and XLA will complain that input
-  // is not compile time constant. We enable function inlining as well, because
-  // otherwise we won't be able to infer shape for any node depending on
-  // function call nodes.
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("functionalize_control_flow_before_opt_", func_name),
-        *body->graph, fld);
-  }
-  // Optimizer accepts std::unique_ptr<Graph>* as input and might change
-  // underlying pointer, thus we create a new Graph and copy from body->graph.
-  std::unique_ptr<Graph> optimized_graph(new Graph(fld));
-  CopyGraph(*body->graph, optimized_graph.get());
-  OptimizerOptions opts;
-  opts.set_opt_level(OptimizerOptions::L0);
-  opts.set_do_function_inlining(true);
-  opts.set_do_constant_folding(true);
-  GraphOptimizer optimizer(opts);
-  auto cf_consider_fn = [](const Node* n) {
-    // Skip SymbolicGradient op when doing constant folding.
-    // Enabling SymbolicGradient op in constant folding requires
-    // flr->device() to be non-null, and here we have not constructed
-    // proper Device object yet (it will be constructed in XlaCompiler).
-    return n->type_string() != FunctionLibraryDefinition::kGradientOp;
-  };
-  optimizer.Optimize(flr, flr->env(),
-                     /*device=*/nullptr, &optimized_graph,
-                     /*shape_map=*/nullptr, /*cse_consider_fn=*/nullptr,
-                     cf_consider_fn);
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("functionalize_control_flow_after_opt_", func_name),
-        *optimized_graph, fld);
-  }
-  // Some inlined functions might have Switch/Merge nodes.
-  for (Node* n : optimized_graph->nodes()) {
-    if (n->type_string() == "Switch" || n->type_string() == "Merge") {
-      has_switch_or_merge = true;
-      break;
-    }
-  }
-
   // If any node has associated functions, functionalize them first.
   // Gather nodes with associated functions first, because rewriting those nodes
   // might involve node deletion/addition. Avoid modifying nodes while iterating
   // it.
   std::vector<std::pair<Node*, std::vector<AssociatedFunctionInfo>>>
       nodes_to_associated_functions;
-  for (auto* n : optimized_graph->nodes()) {
-    auto associated_functions = GetAssociatedFunctions(*n, flr);
+  for (auto* n : g->nodes()) {
+    auto associated_functions = GetAssociatedFunctions(*n, fld);
     if (!associated_functions.empty()) {
       nodes_to_associated_functions.push_back({n, associated_functions});
     }
@@ -215,7 +190,7 @@ Status FunctionalizeControlFlowForFunction(
         // pointer. That's fine because in that case, associated_functions will
         // only have one member and the loop will only run once.
         TF_RETURN_IF_ERROR(RewriteAssociatedFunction(
-            optimized_graph.get(), n, fld, associated_function, new_name));
+            g, n, fld, associated_function, new_name));
       }
     }
   }
@@ -227,21 +202,21 @@ Status FunctionalizeControlFlowForFunction(
     if (VLOG_IS_ON(4)) {
       dump_graph::DumpGraphToFile(
           absl::StrCat("functionalize_control_flow_before_fdef_", func_name),
-          *optimized_graph, fld);
+          *g, fld);
     }
-    TF_RETURN_IF_ERROR(FunctionalizeControlFlow(optimized_graph.get(), fld));
+    TF_RETURN_IF_ERROR(FunctionalizeControlFlow(g, fld));
     if (VLOG_IS_ON(4)) {
       dump_graph::DumpGraphToFile(
-          absl::StrCat("functionalize_control_flow_after_fdef_", func_name),
-          *optimized_graph, fld);
+          absl::StrCat("functionalize_control_flow_after_fdef_", func_name), *g,
+          fld);
     }
   }
 
   if (*modified) {
     // Add rewritten FunctionDef into library.
     FunctionDef functionalized_fdef;
-    TF_RETURN_IF_ERROR(GraphToFunctionDef(*optimized_graph, new_func_name,
-                                          &functionalized_fdef));
+    TF_RETURN_IF_ERROR(
+        GraphToFunctionDef(*g, new_func_name, &functionalized_fdef));
     if (func_name == new_func_name) {
       VLOG(2) << "Replacing function " << func_name;
       TF_RETURN_IF_ERROR(
@@ -270,9 +245,13 @@ Status FunctionalizeControlFlowPass::Run(
       pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
 
   // Find XLA compile ops and its corresponding FunctionDef.
+  // TPUCompile op is not in the map because graph rewriting might happen
+  // multiple times, and we want to avoid functionalize it again.
   static std::map<string, string>* kNodeTypeToFunctionAttrMapping =
       new std::map<string, string>{
-          {"TPUCompile", "function"},
+          // TPUReplicate ops are generated by EncapsulateTPUComputationsPass.
+          {"TPUReplicate", "computation"},
+          // XlaLaunch ops are generated by EncapsulateXlaComputationsPass.
           {"XlaLaunch", "function"},
       };
   std::map<string, absl::optional<string>> canonicalized_name_to_new_name;
@@ -282,23 +261,20 @@ Status FunctionalizeControlFlowPass::Run(
       continue;
     }
     const string func_attr = it->second;
-    if (kNodeTypeToFunctionAttrMapping->find(n->type_string()) !=
-        kNodeTypeToFunctionAttrMapping->end()) {
-      NameAttrList func;
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), func_attr, &func));
-      VLOG(2) << "Graph has node " << n->type_string()
-              << ". Corresponding function: " << func.name();
-      string new_func_name = options.flib_def->UniqueFunctionName(
-          absl::StrCat(func.name(), "_f15n_"));
-      bool modified;
-      TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
-          func.name(), new_func_name, func.attr(), options.flib_def, flr,
-          &canonicalized_name_to_new_name, &modified));
-      if (modified) {
-        n->ClearAttr(func_attr);
-        func.set_name(new_func_name);
-        n->AddAttr(func_attr, func);
-      }
+    NameAttrList func;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), func_attr, &func));
+    VLOG(2) << "Graph has node " << n->type_string()
+            << ". Corresponding function: " << func.name();
+    string new_func_name = options.flib_def->UniqueFunctionName(
+        absl::StrCat(func.name(), "_f15n_"));
+    bool modified;
+    TF_RETURN_IF_ERROR(FunctionalizeControlFlowForFunction(
+        func.name(), new_func_name, func.attr(), options.flib_def, flr,
+        &canonicalized_name_to_new_name, &modified));
+    if (modified) {
+      n->ClearAttr(func_attr);
+      func.set_name(new_func_name);
+      n->AddAttr(func_attr, func);
     }
   }
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
index ba99205640ccdc83a3a4d50e3ec474907894a835..91d33fa405834d7f1f8f66180583580f4f2e448a 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
@@ -33,6 +33,12 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
                                 Graph* graph,
                                 FunctionLibraryDefinition* library);
 
+Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def,
+                                           FunctionLibraryDefinition* library);
+Status FunctionalizeControlFlowForGraphDef(
+    const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def,
+    FunctionLibraryDefinition* library);
+
 // This pass looks at the graph and all associated FunctionDefs, and turns
 // traditional control flow structure (Switch/Merge/etc.) into functional
 // control flow structure (If/While).
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index c3841f996f801e855da75b23f01d41674ec51c4d..9784985af83a18619d837528f99a60b98a501ec5 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -95,77 +95,87 @@ TEST(FunctionalizeControlFlow, Conditional) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    string op_name;
+    NameAttrList then_fn;
+    NameAttrList else_fn;
+    TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn));
+    InstantiationResultForTest else_result;
+    TF_EXPECT_OK(
+        InstantiateFunctionForTest(else_fn.name(), library, &else_result));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
+      auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
+      auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
+      auto if_op = ops::If(scope.WithOpName(op_name), less,
+                           std::initializer_list<Input>{less, y, x}, {DT_INT32},
+                           then_fn, else_fn);
+      auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-  string op_name;
-  NameAttrList then_fn;
-  NameAttrList else_fn;
-  TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn));
-  InstantiationResultForTest else_result;
-  TF_EXPECT_OK(
-      InstantiateFunctionForTest(else_fn.name(), library, &else_result));
-
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
-    auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
-    auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
-    auto if_op = ops::If(scope.WithOpName(op_name), less,
-                         std::initializer_list<Input>{less, y, x}, {DT_INT32},
-                         then_fn, else_fn);
-    auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // then body.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
-    auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0);
-    auto cond = ops::Const(
-        scope.WithOpName("cond").WithControlDependencies(identity), 17);
-    auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond);
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(then_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
+    // then body.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
+      auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0);
+      auto cond = ops::Const(
+          scope.WithOpName("cond").WithControlDependencies(identity), 17);
+      auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond);
+      auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(then_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}),
+                result.arg_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-  // else body.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
-    auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0);
-    auto cond_1 = ops::Const(
-        scope.WithOpName("cond_1").WithControlDependencies(identity), 23);
-    auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1);
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(else_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // else body.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
+      auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0);
+      auto cond_1 = ops::Const(
+          scope.WithOpName("cond_1").WithControlDependencies(identity), 23);
+      auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1);
+      auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(else_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}),
+                result.arg_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
@@ -239,75 +249,77 @@ TEST(FunctionalizeControlFlow, OneLoopVar) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{source}, cond_fn, body_fn);
+      auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList cond_fn, body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
-
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{source}, cond_fn, body_fn);
-    auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Condition graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
-    auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    auto one = ops::Const<int32>(
-        scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
-    auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+    // Condition graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto ten = ops::Const<int32>(
+          scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
+      auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(cond_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+      auto one = ops::Const<int32>(
+          scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
+      auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
-// @function.Defun(noinline=True)
-// def increment_fn(x):
-//   return [x + 1]
-// Define the above function, and add it to the given graph. It's used as the
-// while loop body in NoinlineLoopBody test.
-Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
+FunctionDef GetNoinlineFunctionDef() {
   FunctionDef fdef = FunctionDefHelper::Create(
       "increment_fn", {"x:int32"}, {"add:int32"}, {},
       {
@@ -316,8 +328,17 @@ Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
       },
       {{"add", "add_0:z:0"}});
   (*fdef.mutable_attr())["_noinline"].set_b(true);
+  return fdef;
+}
+
+// @function.Defun(noinline=True)
+// def increment_fn(x):
+//   return [x + 1]
+// Define the above function, and add it to the given graph. It's used as the
+// while loop body in NoinlineLoopBody test.
+Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
   FunctionDefLibrary fdef_lib;
-  *(fdef_lib.add_function()) = fdef;
+  *(fdef_lib.add_function()) = GetNoinlineFunctionDef();
   TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdef_lib));
   NodeDef increment_fn;
   increment_fn.set_name(node_name);
@@ -376,55 +397,88 @@ TEST(FunctionalizeControlFlow, NoinlineLoopBody) {
   FunctionLibraryDefinition lookup_lib(graph.flib_def());
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
   // Function increment_fn will be copied from lookup_lib to library.
-  TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library));
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
+  *(optimized_graph_def.mutable_library()->add_function()) =
+      GetNoinlineFunctionDef();
 
-  NameAttrList cond_fn, body_fn;
-  TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+  TF_ASSERT_OK(FunctionalizeControlFlowForGraphDef(
+      &lookup_lib, &optimized_graph_def, &library));
+  TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{source}, cond_fn, body_fn);
+      GraphDef expected;
+      TF_ASSERT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{source}, cond_fn, body_fn);
-    GraphDef expected;
-    TF_ASSERT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      TF_ASSERT_OK(
+          AddNoinlineFunctionToGraph(noinline_node_name, scope.graph()));
+      auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+      NodeDef retval;
+      retval.set_name("_retval0_RetVal");
+      retval.set_op(FunctionLibraryDefinition::kRetOp);
+      *retval.add_input() = noinline_node_name;
+      (*retval.mutable_attr())["T"].set_type(DT_INT32);
+      (*retval.mutable_attr())["index"].set_i(0);
+      Status status;
+      scope.graph()->AddNode(retval, &status);
+      TF_ASSERT_OK(status);
+
+      GraphDef expected;
+      TF_ASSERT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      // Verify that increment_fn has been copied to library.
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      // Ignore the function library when comparing the graphs.
+      expected.clear_library();
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
+}
 
-  // Body graph.
+TEST(FunctionalizeControlFlow, MissingFunctionDefInLibrary) {
+  const string& noinline_node_name = "while/increment_fn";
+  Graph graph(OpRegistry::Global());
   {
     Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+    auto identity = ops::Identity(scope.WithOpName("while/Identity"), source);
     TF_ASSERT_OK(AddNoinlineFunctionToGraph(noinline_node_name, scope.graph()));
-    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    NodeDef retval;
-    retval.set_name("_retval0_RetVal");
-    retval.set_op(FunctionLibraryDefinition::kRetOp);
-    *retval.add_input() = noinline_node_name;
-    (*retval.mutable_attr())["T"].set_type(DT_INT32);
-    (*retval.mutable_attr())["index"].set_i(0);
-    Status status;
-    scope.graph()->AddNode(retval, &status);
-    TF_ASSERT_OK(status);
-
-    GraphDef expected;
-    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_ASSERT_OK(scope.ToGraph(&graph));
+  }
 
-    InstantiationResultForTest result;
-    // Verify that increment_fn has been copied to library.
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+  FunctionLibraryDefinition lookup_lib(graph.flib_def());
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef graph_def;
+  graph.ToGraphDef(&graph_def);
+  graph_def.clear_library();
 
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    // Ignore the function library when comparing the graphs.
-    expected.clear_library();
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
+  Status status =
+      FunctionalizeControlFlowForGraphDef(&lookup_lib, &graph_def, &library);
+  EXPECT_EQ(tensorflow::error::NOT_FOUND, status.code());
 }
 
 // Tests functionalizing OneLoopVar where the loop value is not used post the
@@ -467,65 +521,72 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{source}, cond_fn, body_fn);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList cond_fn, body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
-
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{source}, cond_fn, body_fn);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Condition graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
-    auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    auto one = ops::Const<int32>(
-        scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
-    auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+    // Condition graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto ten = ops::Const<int32>(
+          scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
+      auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(cond_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+      auto one = ops::Const<int32>(
+          scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
+      auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
@@ -608,86 +669,95 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32);
+      auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{x, y}, cond_fn, body_fn);
+      auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]);
+      auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList cond_fn, body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
-
-  // Outer graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32);
-    auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{x, y}, cond_fn, body_fn);
-    auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]);
-    auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Condition graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
+    // Condition graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
+                                         .WithControlDependencies(arg0.output),
+                                     3);
+      auto cond_add =
+          ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three);
+      auto ten = ops::Const<int32>(scope.WithOpName("while/cond/ten")
                                        .WithControlDependencies(arg0.output),
-                                   3);
-    auto cond_add =
-        ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three);
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("while/cond/ten").WithControlDependencies(arg0.output),
-        10);
-    auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-
-    auto identity_x = ops::Identity(scope.WithOpName("while/Identity/x"), arg0);
-    auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"), arg1);
-
-    auto one = ops::Const<int32>(
-        scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
-        1);
-    auto two = ops::Const<int32>(
-        scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
-        2);
+                                   10);
+      auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
 
-    auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one);
-    auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two);
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-    auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
 
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(cond_fn.name(), library, &result));
 
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+
+      auto identity_x =
+          ops::Identity(scope.WithOpName("while/Identity/x"), arg0);
+      auto identity_y =
+          ops::Identity(scope.WithOpName("while/Identity/y"), arg1);
+
+      auto one = ops::Const<int32>(
+          scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
+          1);
+      auto two = ops::Const<int32>(
+          scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
+          2);
+
+      auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one);
+      auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two);
+      auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+      auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
@@ -841,177 +911,192 @@ TEST(FunctionalizeControlFlow, Complex) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList outer_cond_fn, outer_body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn));
-
-  // Outer graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
-    auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
-    auto y = ops::Add(scope.WithOpName("y"), x, three);
-
-    auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
-                                TensorShape({}));
-
-    auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
-
-    auto while_op = ops::While(scope.WithOpName("outer/LoopCond"),
-                               std::initializer_list<Input>{zero, y, x, var},
-                               outer_cond_fn, outer_body_fn);
-    auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Outer condition graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output),
-        10);
-    auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(
-        InstantiateFunctionForTest(outer_cond_fn.name(), library, &result));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Outer body graph.
-  NameAttrList inner_cond_fn, inner_body_fn;
-  {
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(
-        InstantiateFunctionForTest(outer_body_fn.name(), library, &result));
-
-    // Find the inner condition and body names.
-    TF_EXPECT_OK(
-        FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn));
-
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0);
-    auto one_j = ops::Const<int32>(
-        scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
-    auto while_op =
-        ops::While(scope.WithOpName("outer/LoopCond_1"),
-                   std::initializer_list<Input>{one_j, arg1, arg2, arg3},
-                   inner_cond_fn, inner_body_fn);
-
-    auto one_outer = ops::Const<int32>(
-        scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1);
-    auto add_i =
-        ops::Add(scope.WithOpName("outer/add")
-                     .WithControlDependencies(absl::Span<const Operation>{
-                         while_op[0].op(), while_op[1].op()}),
-                 identity_i, one_outer);
-
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0);
-    auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1);
-    auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Inner condition graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto five = ops::Const<int32>(
-        scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), 5);
-    auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList outer_cond_fn, outer_body_fn;
     TF_EXPECT_OK(
-        InstantiateFunctionForTest(inner_cond_fn.name(), library, &result));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Inner body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto identity_j =
-        ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0);
-    auto identity_k =
-        ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1);
-
-    auto mul_jk =
-        ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k);
-    auto add_jkx = ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2);
-    auto assign = ops::AssignAddVariableOp(
-        scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx);
-
-    auto one = ops::Const<int32>(
-        scope.WithOpName("outer/inner/One")
-            .WithControlDependencies(
-                absl::Span<const Operation>{assign.operation}),
-        1);
-    auto add_j =
-        ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one);
+        FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn));
+
+    // Outer graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
+      auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
+      auto y = ops::Add(scope.WithOpName("y"), x, three);
+
+      auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
+                                  TensorShape({}));
+
+      auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
+
+      auto while_op = ops::While(scope.WithOpName("outer/LoopCond"),
+                                 std::initializer_list<Input>{zero, y, x, var},
+                                 outer_cond_fn, outer_body_fn);
+      auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0);
-    auto retval1 =
-        ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1);
-    auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+    // Outer condition graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto ten = ops::Const<int32>(
+          scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output),
+          10);
+      auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(outer_cond_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+    // Outer body graph.
+    NameAttrList inner_cond_fn, inner_body_fn;
+    {
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(outer_body_fn.name(), library, &result));
+
+      // Find the inner condition and body names.
+      TF_EXPECT_OK(
+          FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn));
+
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0);
+      auto one_j = ops::Const<int32>(
+          scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
+      auto while_op =
+          ops::While(scope.WithOpName("outer/LoopCond_1"),
+                     std::initializer_list<Input>{one_j, arg1, arg2, arg3},
+                     inner_cond_fn, inner_body_fn);
+
+      auto one_outer = ops::Const<int32>(
+          scope.WithOpName("outer/add/y").WithControlDependencies(identity_i),
+          1);
+      auto add_i =
+          ops::Add(scope.WithOpName("outer/add")
+                       .WithControlDependencies(absl::Span<const Operation>{
+                           while_op[0].op(), while_op[1].op()}),
+                   identity_i, one_outer);
+
+      auto retval0 =
+          ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0);
+      auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1);
+      auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}),
+                result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(
-        InstantiateFunctionForTest(inner_body_fn.name(), library, &result));
+    // Inner condition graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto five = ops::Const<int32>(
+          scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0),
+          5);
+      auto less_j =
+          ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five);
+      auto retval =
+          ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(inner_cond_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Inner body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto identity_j =
+          ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0);
+      auto identity_k =
+          ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1);
+
+      auto mul_jk =
+          ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k);
+      auto add_jkx =
+          ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2);
+      auto assign = ops::AssignAddVariableOp(
+          scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx);
+
+      auto one = ops::Const<int32>(
+          scope.WithOpName("outer/inner/One")
+              .WithControlDependencies(
+                  absl::Span<const Operation>{assign.operation}),
+          1);
+      auto add_j =
+          ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one);
+
+      auto retval0 =
+          ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0);
+      auto retval1 =
+          ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1);
+      auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(inner_body_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}),
+                result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc
index 7c3ad448ef546dd1ab2640a57d7d1d73ca3768ad..d87436a7b4ac37c74d0f0df921779c8716290013 100644
--- a/tensorflow/compiler/tf2xla/functionalize_while.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_while.cc
@@ -523,6 +523,12 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
   builder.Attr("T", arg_types);
   builder.Attr("cond", cond_name);
   builder.Attr("body", body_name);
+  string outside_compilation;
+  if (GetNodeAttr(frame->loop_cond->def(), kXlaOutsideCompilationAttrName,
+                  &outside_compilation)
+          .ok()) {
+    builder.Attr(kXlaOutsideCompilationAttrName, outside_compilation);
+  }
   std::vector<NodeDefBuilder::NodeOut> inputs;
   for (int i = 0; i < frame->args.size(); ++i) {
     const Arg& arg = frame->args[i];
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index c019a28e892ff89f559ddbec2360d6caa9c1808f..efb75749722893100494e089c0beb96944e9f1d4 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -23,9 +23,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/validate.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
@@ -51,12 +52,11 @@ namespace {
 Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
                         const std::vector<const XlaExpression*>& expressions,
                         std::vector<XlaCompiler::Argument>* args) {
-  auto builder = ctx->builder();
   auto client = ctx->compiler()->client();
-  std::vector<bool> compile_time_constant_flags(expressions.size());
+  std::vector<bool> arg_must_be_compile_time_constant(expressions.size());
 
   TF_RETURN_IF_ERROR(
-      BackwardsConstAnalysis(*graph, &compile_time_constant_flags,
+      BackwardsConstAnalysis(*graph, &arg_must_be_compile_time_constant,
                              /*compile_time_const_nodes=*/nullptr));
 
   args->resize(expressions.size());
@@ -65,24 +65,31 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
     arg.type = ctx->input_type(i);
     arg.shape = ctx->InputShape(i);
 
-    if (arg.type == DT_RESOURCE) {
-      return errors::InvalidArgument(
-          "Resource as function argument is not yet implemented.");
-    } else if (expressions[i]->has_constant_value()) {
-      arg.kind = XlaCompiler::Argument::kConstant;
-      arg.constant_value = expressions[i]->constant_value();
-    } else if (compile_time_constant_flags[i]) {
-      arg.kind = XlaCompiler::Argument::kConstant;
-      TF_RET_CHECK(expressions[i]->resource() == nullptr)
-          << "Input with resource is not yet implemented.";
-      TF_ASSIGN_OR_RETURN(auto constant_graph, builder->BuildConstantSubGraph(
-                                                   expressions[i]->handle()));
-      TF_ASSIGN_OR_RETURN(auto literal,
-                          client->ComputeConstant(constant_graph));
-      TF_RETURN_IF_ERROR(
-          LiteralToHostTensor(literal, arg.type, &arg.constant_value));
-    } else {
-      arg.kind = XlaCompiler::Argument::kParameter;
+    switch (expressions[i]->kind()) {
+      case XlaExpression::Kind::kConstant:
+        arg.kind = XlaCompiler::Argument::kConstant;
+        arg.constant_value = expressions[i]->constant_value();
+        break;
+      case XlaExpression::Kind::kXlaOp:
+        if (arg_must_be_compile_time_constant[i]) {
+          TF_ASSIGN_OR_RETURN(absl::optional<Tensor> value,
+                              expressions[i]->ResolveConstant(client));
+          if (!value.has_value()) {
+            return errors::InvalidArgument(
+                "Argument to function must be a compile-time constant, but "
+                "unable to resolve argument value to a constant.");
+          }
+          arg.kind = XlaCompiler::Argument::kConstant;
+          arg.constant_value = *value;
+        } else {
+          arg.kind = XlaCompiler::Argument::kParameter;
+        }
+        break;
+      case XlaExpression::Kind::kResource:
+        return errors::Unimplemented(
+            "Resource as function argument is not yet implemented.");
+      case XlaExpression::Kind::kInvalid:
+        return errors::InvalidArgument("Invalid function argument");
     }
   }
   return Status::OK();
@@ -164,7 +171,7 @@ Status GraphCompiler::Compile() {
       outputs[o] = op_context.release_output(o);
       if (outputs[o].tensor == nullptr) {
         return errors::Internal("Missing xla_context ", o, "-th output from ",
-                                SummarizeNode(*n));
+                                FormatNodeForError(*n));
       }
     }
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 95a010a119d13d4fdd35690d2b8ea708eafb221f..8bc329229648c5aced8d06c99b170803bb3a90f8 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -40,6 +40,7 @@ tf_kernel_library(
         "dynamic_stitch_op.cc",
         "elu_op.cc",
         "extract_image_patches_op.cc",
+        "fake_param_op.cc",
         "fake_quantize_ops.cc",
         "fft_ops.cc",
         "fill_op.cc",
@@ -120,12 +121,11 @@ tf_kernel_library(
         ":while_op",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/lib:batch_dot",
+        "//tensorflow/compiler/tf2xla/lib:broadcast",
         "//tensorflow/compiler/tf2xla/lib:cholesky",
         "//tensorflow/compiler/tf2xla/lib:qr",
         "//tensorflow/compiler/tf2xla/lib:random",
         "//tensorflow/compiler/tf2xla/lib:scatter",
-        "//tensorflow/compiler/tf2xla/lib:triangular_solve",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/tf2xla/lib:while_loop",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
@@ -142,10 +142,11 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:numeric",
+        "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/client/lib:pooling",
         "//tensorflow/compiler/xla/client/lib:prng",
         "//tensorflow/compiler/xla/client/lib:sorting",
+        "//tensorflow/compiler/xla/client/lib:triangular_solve",
         "//tensorflow/core:framework",
         "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
@@ -176,6 +177,31 @@ tf_kernel_library(
     ],
 )
 
+# A separate cc_library for resampler_ops is needed because resampler is in
+# contrib/, and thus the declaration of resampler cannot be pulled into the deps
+# of xla_ops. Therefore, resampler_ops is its own cc_library target, and its
+# corresponding tf_kernel_library is defined in contrib/resampler/BUILD.
+cc_library(
+    name = "resampler_ops",
+    srcs = ["resampler_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:array4d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "conv_op_helpers",
     srcs = ["conv_op_helpers.cc"],
@@ -188,7 +214,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
         "//tensorflow/core:framework",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:conv_ops",
diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
index 276d744c096f8996c774964204feaa3762bdb844..795ea09831e183a26fb3498b9bbaf9c3adaef9ed 100644
--- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
@@ -14,11 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
@@ -48,14 +50,10 @@ class XlaArgOp : public XlaOpKernel {
       return;
     }
 
-    const XlaExpression& arg = XlaContext::Get(ctx).args()[index_];
-    if (arg.resource() != nullptr) {
-      ctx->SetResourceOutput(0, arg.resource());
-    } else if (arg.has_constant_value()) {
-      ctx->SetConstantOutput(0, arg.constant_value());
-    } else {
-      ctx->SetOutput(0, arg.handle());
-    }
+    const XlaExpression& arg = ctx->xla_context()->args()[index_];
+    OP_REQUIRES(ctx, arg.kind() != XlaExpression::Kind::kInvalid,
+                errors::InvalidArgument("Invalid/missing argument expression"));
+    ctx->SetOutputExpression(0, arg);
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index 4cfe946b2e6146f034867c06e996ffae42b90705..1b254e328a8c71bd81a0ec700e2af1d81a5fa67a 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
+#include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 
 namespace tensorflow {
 namespace {
@@ -28,9 +30,11 @@ class BatchMatMulOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result = BatchDot(ctx->Input(0), ctx->Input(1),
-                           /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_,
-                           /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_);
+    auto result =
+        xla::BatchDot(MaybeTransposeInMinorDims(
+                          MaybeConjugate(ctx->Input(0), adj_x_), adj_x_),
+                      MaybeTransposeInMinorDims(
+                          MaybeConjugate(ctx->Input(1), adj_y_), adj_y_));
     ctx->SetOutput(0, result);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index a267c0c72fce67d7c22c55a57f8d5ac4ffd2b7e2..0e2f335f3354e3ae6008bdc0ac0b80683fe479c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -115,9 +115,9 @@ class FusedBatchNormGradOp : public XlaOpKernel {
     // operators. For now, cast everything to the statistics type (which
     // may be more precise than the input type).
     auto grad_backprop =
-        XlaHelpers::ConvertElementType(b, ctx->Input(0), scale_dtype);
+        XlaHelpers::ConvertElementType(ctx->Input(0), scale_dtype);
     auto activations =
-        XlaHelpers::ConvertElementType(b, ctx->Input(1), scale_dtype);
+        XlaHelpers::ConvertElementType(ctx->Input(1), scale_dtype);
     auto scale = ctx->Input(2);
     auto mean = ctx->Input(3);
     auto var = ctx->Input(4);
@@ -151,11 +151,11 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       const DataType accumulation_type =
           XlaHelpers::SumAccumulationType(scale_dtype);
       auto converted =
-          XlaHelpers::ConvertElementType(b, grad_backprop, accumulation_type);
+          XlaHelpers::ConvertElementType(grad_backprop, accumulation_type);
       auto reduce =
           xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                       *ctx->GetOrCreateAdd(accumulation_type), reduction_dims);
-      offset_backprop = XlaHelpers::ConvertElementType(b, reduce, scale_dtype);
+      offset_backprop = XlaHelpers::ConvertElementType(reduce, scale_dtype);
 
       // scratch1 = rsqrt(pop_var + epsilon)
       auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5);
@@ -165,19 +165,18 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       // scratch2 = sum(y_backprop * (x - mean))
       auto mul =
           xla::Mul(grad_backprop, xla::Sub(activations, mean, {feature_index}));
-      converted = XlaHelpers::ConvertElementType(b, mul, accumulation_type);
+      converted = XlaHelpers::ConvertElementType(mul, accumulation_type);
       reduce =
           xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                       *ctx->GetOrCreateAdd(accumulation_type), reduction_dims);
-      auto scratch2 = XlaHelpers::ConvertElementType(b, reduce, scale_dtype);
+      auto scratch2 = XlaHelpers::ConvertElementType(reduce, scale_dtype);
 
       x_backprop =
           xla::Mul(grad_backprop, xla::Mul(scratch1, scale), {feature_index});
       scale_backprop = xla::Mul(scratch1, scratch2);
     }
 
-    ctx->SetOutput(0,
-                   XlaHelpers::ConvertElementType(b, x_backprop, input_dtype));
+    ctx->SetOutput(0, XlaHelpers::ConvertElementType(x_backprop, input_dtype));
     ctx->SetOutput(1, scale_backprop);
     ctx->SetOutput(2, offset_backprop);
     ctx->SetConstantOutput(3, Tensor());
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index a18e04995b5e1e0b0374f7b0edd6f5e114cf994a..46e5d68c78fd9ff26a88dc2a1484c3a67b76f4f3 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -159,8 +159,8 @@ class BatchToSpaceNDOp : public XlaOpKernel {
   }
 };
 REGISTER_XLA_OP(Name("BatchToSpaceND")
-                    .CompileTimeConstInput("block_shape")
-                    .CompileTimeConstInput("crops"),
+                    .CompileTimeConstantInput("block_shape")
+                    .CompileTimeConstantInput("crops"),
                 BatchToSpaceNDOp);
 
 class BatchToSpaceOp : public XlaOpKernel {
@@ -183,7 +183,7 @@ class BatchToSpaceOp : public XlaOpKernel {
  private:
   int block_size_;
 };
-REGISTER_XLA_OP(Name("BatchToSpace").CompileTimeConstInput("crops"),
+REGISTER_XLA_OP(Name("BatchToSpace").CompileTimeConstantInput("crops"),
                 BatchToSpaceOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
index 182f7c99344845964f7010127718f876ab6e8a44..c022284fec6bc91951170e243ea3609c8d5d0c43 100644
--- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc
@@ -67,8 +67,8 @@ class BCastArgsOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(BCastArgsOp);
 };
 REGISTER_XLA_OP(Name("BroadcastArgs")
-                    .CompileTimeConstInput("s0")
-                    .CompileTimeConstInput("s1"),
+                    .CompileTimeConstantInput("s0")
+                    .CompileTimeConstantInput("s1"),
                 BCastArgsOp);
 
 // Given shapes of two tensors, computes the reduction indices for the
@@ -94,14 +94,10 @@ class BCastGradArgsOp : public XlaOpKernel {
       OP_REQUIRES(ctx, TensorShapeUtils::IsVector(in_shape),
                   errors::InvalidArgument("In[", i, "] must be a vector.",
                                           in_shape.DebugString()));
-      xla::Literal literal;
-      OP_REQUIRES_OK(ctx, ctx->ConstantInput(i, &literal));
-
-      BCast::Vec vec;
-      for (int64 i = 0; i < in_shape.num_elements(); ++i) {
-        vec.push_back(literal.Get<int>({i}));
-      }
-      shapes.push_back(vec);
+      std::vector<int64> vec;
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(i, &vec));
+
+      shapes.push_back(BCast::Vec(vec.begin(), vec.end()));
     }
     BCast bcast(shapes[0], shapes[1]);
     OP_REQUIRES(ctx, bcast.IsValid(),
@@ -126,8 +122,8 @@ class BCastGradArgsOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("BroadcastGradientArgs")
-                    .CompileTimeConstInput("s0")
-                    .CompileTimeConstInput("s1"),
+                    .CompileTimeConstantInput("s0")
+                    .CompileTimeConstantInput("s1"),
                 BCastGradArgsOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
index 41f540506ba41fbe7f91393e7b8e26a89e72ef0a..e7f369b761f36a717ea5fb536780af91a8955b1e 100644
--- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
@@ -107,11 +107,11 @@ class BiasAddGradOp : public XlaOpKernel {
     const DataType accumulation_type =
         XlaHelpers::SumAccumulationType(input_type(0));
     auto converted =
-        XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type);
+        XlaHelpers::ConvertElementType(ctx->Input(0), accumulation_type);
     auto reduce =
         xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                     *ctx->GetOrCreateAdd(accumulation_type), reduce_dims);
-    ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, reduce, input_type(0)));
+    ctx->SetOutput(0, XlaHelpers::ConvertElementType(reduce, input_type(0)));
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index a988d3c33ed808b022f67882c8ae5100b7e7a305..5e9280c1fe692037b0a842a92ef5a8c28b854a54 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -43,6 +43,9 @@ namespace {
         const std::vector<int64>& extend_dimensions) override {          \
       xla::XlaBuilder* b = ctx->builder();                               \
       (void)b;                                                           \
+      (void)lhs_shape;                                                   \
+      (void)rhs_shape;                                                   \
+      (void)extend_dimensions;                                           \
       return HLO;                                                        \
     }                                                                    \
   };                                                                     \
@@ -64,7 +67,7 @@ XLA_MAKE_BINARY(Complex, xla::Complex(lhs, rhs, extend_dimensions));
 // }
 static xla::XlaOp DivNoNanImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                                xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
   auto zero = XlaHelpers::Zero(b, dtype);
   auto y_equals_0 = xla::Eq(y, zero);
   auto zeros = xla::ZerosLike(x);
@@ -84,7 +87,7 @@ XLA_MAKE_BINARY(DivNoNan,
 // }
 static xla::XlaOp FloorDivImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                                xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
   if (DataTypeIsUnsigned(dtype)) {
     return xla::Div(x, y);
   }
@@ -103,30 +106,30 @@ static xla::XlaOp FloorDivImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
 XLA_MAKE_BINARY(FloorDiv,
                 FloorDivImpl(b, input_type(0), lhs, rhs, broadcast_helper));
 
-static xla::XlaOp XlogyImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
-                            xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
-  auto zero = XlaHelpers::Zero(b, dtype);
+xla::XlaOp XlogyImpl(xla::XlaOp x, xla::XlaOp y,
+                     const BCast& broadcast_helper) {
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
+  auto zero = xla::ZerosLike(x);
   auto is_zero = xla::Eq(x, zero);
   return xla::Select(is_zero, zero, xla::Mul(x, xla::Log(y)));
 }
-XLA_MAKE_BINARY(Xlogy, XlogyImpl(b, input_type(0), lhs, rhs, broadcast_helper));
+XLA_MAKE_BINARY(Xlogy, XlogyImpl(lhs, rhs, broadcast_helper));
 
-static xla::XlaOp XdivyImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
-                            xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
-  auto zero = XlaHelpers::Zero(b, dtype);
+xla::XlaOp XdivyImpl(xla::XlaOp x, xla::XlaOp y,
+                     const BCast& broadcast_helper) {
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
+  auto zero = xla::ZerosLike(x);
   auto is_zero = xla::Eq(x, zero);
   return xla::Select(is_zero, zero, xla::Div(x, y));
 }
-XLA_MAKE_BINARY(Xdivy, XdivyImpl(b, input_type(0), lhs, rhs, broadcast_helper));
+XLA_MAKE_BINARY(Xdivy, XdivyImpl(lhs, rhs, broadcast_helper));
 
 // Implementation of FloorMod. Pseudo-code:
 // T trunc_mod = std::fmod(x, y);
 // return (x < T(0)) == (y < T(0)) ? trunc_mod : std::fmod(trunc_mod + y, y);
 static xla::XlaOp FloorModImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
                                xla::XlaOp y, const BCast& broadcast_helper) {
-  std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper);
+  std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
   auto zero = XlaHelpers::Zero(b, dtype);
   auto same_sign = xla::Eq(xla::Lt(x, zero), xla::Lt(y, zero));
   auto trunc_mod = xla::Rem(x, y);
diff --git a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc
index 696c1c39befd5aa2972afb6cfa64905b57a5ab72..d7a8e67dd33aab5c32b7465ce505b745b5c1ca2f 100644
--- a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc
@@ -13,16 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "absl/algorithm/container.h"
-#include "tensorflow/compiler/tf2xla/shape_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/lib/broadcast.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 namespace {
@@ -37,63 +32,13 @@ class BroadcastToOp : public XlaOpKernel {
     TensorShape output_shape;
     OP_REQUIRES_OK(context, context->ConstantInputAsShape(1, &output_shape));
 
-    OP_REQUIRES(context, input_shape.dims() <= output_shape.dims(),
-                errors::InvalidArgument(
-                    "Input rank (", input_shape.dims(),
-                    ") must be less than or equal to the output rank (",
-                    output_shape.dims(), ")"));
-
-    auto input_dims = input_shape.dim_sizes();
-    auto output_dims = output_shape.dim_sizes();
-
-    // Broadcasting is done right-to-left on right-aligned dimensions; reverse
-    // the two vectors so elements to be broadcast are aligned.
-    absl::c_reverse(input_dims);
-    absl::c_reverse(output_dims);
-
-    std::vector<int64> broadcast_dims;
-    std::vector<int64> broadcast_shape;
-    for (int i = 0; i < output_shape.dims(); ++i) {
-      if (i < input_shape.dims()) {
-        OP_REQUIRES(
-            context,
-            (output_dims[i] == 0 && input_dims[i] == 0) ||
-                (input_dims[i] != 0 && output_dims[i] % input_dims[i] == 0),
-            errors::InvalidArgument("invalid shape to broadcast from ",
-                                    input_shape.DebugString(), " to ",
-                                    output_shape.DebugString()));
-
-        broadcast_dims.push_back(broadcast_shape.size());
-        if (output_dims[i] == input_dims[i]) {
-          broadcast_shape.push_back(output_dims[i]);
-        } else if (output_dims[i] != input_dims[i]) {
-          // Add dimensions [I, O/I], which we will later flatten to just
-          // [O]. We must do this in two phases since XLA broadcasting does not
-          // support tiling.
-          broadcast_shape.push_back(input_dims[i]);
-          broadcast_shape.push_back(output_dims[i] / input_dims[i]);
-        }
-      } else {
-        broadcast_shape.push_back(output_dims[i]);
-      }
-    }
-    absl::c_reverse(broadcast_dims);
-    int broadcast_shape_size = broadcast_shape.size();
-    for (int64& broadcast_dim : broadcast_dims) {
-      broadcast_dim = broadcast_shape_size - broadcast_dim - 1;
-    }
-    absl::c_reverse(broadcast_shape);
-    xla::XlaOp output = xla::Reshape(
-        xla::BroadcastInDim(context->Input(0),
-                            xla::ShapeUtil::MakeShape(
-                                context->input_xla_type(0), broadcast_shape),
-                            broadcast_dims),
-        output_shape.dim_sizes());
-    context->SetOutput(0, output);
+    auto output = BroadcastTo(context->Input(0), output_shape.dim_sizes());
+    OP_REQUIRES_OK(context, output.status());
+    context->SetOutput(0, output.ValueOrDie());
   }
 };
 
-REGISTER_XLA_OP(Name("BroadcastTo").CompileTimeConstInput("shape"),
+REGISTER_XLA_OP(Name("BroadcastTo").CompileTimeConstantInput("shape"),
                 BroadcastToOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index e7fef77edcba0ea5a521956a704225ac4f7fcb22..7199b9b6feb36dd45ef51f4c38463bc715fcc38a 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -21,10 +21,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/prng.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
 
 namespace tensorflow {
 namespace {
@@ -57,42 +60,114 @@ class CategoricalOp : public XlaOpKernel {
     const int64 batch_size = logits_shape.dim_size(0);
     const int64 num_classes = logits_shape.dim_size(1);
 
-    xla::XlaBuilder* builder = ctx->builder();
-
-    std::array<int64, 3> uniform_shape_array = {
-        {batch_size, num_samples, num_classes}};
-    xla::PrimitiveType uniform_xla_type;
-    OP_REQUIRES_OK(ctx,
-                   DataTypeToPrimitiveType(input_type(0), &uniform_xla_type));
-    xla::Shape uniform_shape =
-        xla::ShapeUtil::MakeShape(uniform_xla_type, uniform_shape_array);
-    auto uniforms =
-        xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)),
-                        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+    xla::Shape uniform_shape;
+    int class_dimension;
+    if (num_samples != 1) {
+      std::array<int64, 3> uniform_shape_array = {
+          {batch_size, num_samples, num_classes}};
+      xla::PrimitiveType uniform_xla_type;
+      OP_REQUIRES_OK(ctx,
+                     DataTypeToPrimitiveType(input_type(0), &uniform_xla_type));
+      uniform_shape =
+          xla::ShapeUtil::MakeShape(uniform_xla_type, uniform_shape_array);
+      class_dimension = 2;
+    } else {
+      // Have a special case for when we only need one sample, because
+      // dimensions may be padded on architectures with tiled memory layouts, so
+      // if the num_classes or batch size is large then this can lead to
+      // expensive wasted memory.
+      std::array<int64, 2> uniform_shape_array = {{batch_size, num_classes}};
+      xla::PrimitiveType uniform_xla_type;
+      OP_REQUIRES_OK(ctx,
+                     DataTypeToPrimitiveType(input_type(0), &uniform_xla_type));
+      uniform_shape =
+          xla::ShapeUtil::MakeShape(uniform_xla_type, uniform_shape_array);
+      class_dimension = 1;
+    }
+    xla::PrimitiveType type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(0), &type));
+    xla::XlaOp log_uniforms = GetLogUniforms(uniform_shape, type, ctx);
 
     // Use Gumbel softmax trick to generate categorical samples.
     // See:
     // https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/
     // TODO(b/68769470): Switch to using a cumulative sum approach.
-    auto softmax_entries = xla::Sub(logits, xla::Log(-xla::Log(uniforms)),
-                                    /*broadcast_dimensions=*/{0, 2});
+    auto softmax_entries =
+        xla::Sub(logits, log_uniforms,
+                 /*broadcast_dimensions=*/{0, class_dimension});
 
     xla::PrimitiveType xla_output_type;
     OP_REQUIRES_OK(ctx,
                    DataTypeToPrimitiveType(output_type(0), &xla_output_type));
-    xla::XlaOp argmax =
-        XlaHelpers::ArgMax(softmax_entries, xla_output_type, /*axis=*/2);
+    xla::XlaOp argmax = XlaHelpers::ArgMax(softmax_entries, xla_output_type,
+                                           /*axis=*/class_dimension);
+    if (num_samples == 1) {
+      argmax = xla::Reshape(argmax, {batch_size, 1});
+    }
 
     ctx->SetOutput(0, argmax);
   }
 
+  virtual xla::XlaOp GetLogUniforms(xla::Shape uniform_shape,
+                                    xla::PrimitiveType type,
+                                    XlaOpKernelContext* ctx) {
+    xla::XlaBuilder* builder = ctx->builder();
+    auto uniforms =
+        xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)),
+                        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+    return xla::Log(-xla::Log(uniforms));
+  }
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(CategoricalOp);
 };
 
 // TODO(b/68769717): Rename this sampler to Categorical.
-REGISTER_XLA_OP(Name("Multinomial").CompileTimeConstInput("num_samples"),
+REGISTER_XLA_OP(Name("Multinomial").CompileTimeConstantInput("num_samples"),
                 CategoricalOp);
 
+class StatelessCategoricalOp : public CategoricalOp {
+ public:
+  explicit StatelessCategoricalOp(OpKernelConstruction* ctx)
+      : CategoricalOp(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  }
+
+  xla::XlaOp GetLogUniforms(xla::Shape uniform_shape, xla::PrimitiveType type,
+                            XlaOpKernelContext* ctx) override {
+    xla::XlaOp seed = ctx->Input(2);
+    auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {});
+    auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {});
+
+    xla::XlaBuilder* builder = ctx->builder();
+    if (uniform_shape.element_type() == xla::BF16) {
+      uniform_shape.set_element_type(xla::F32);
+    }
+    auto uniforms = xla::StatelessRngUniform(
+        {seed0, seed1}, uniform_shape, XlaHelpers::Zero(builder, DT_FLOAT),
+        XlaHelpers::One(builder, DT_FLOAT));
+    return xla::ConvertElementType(xla::Log(-xla::Log(uniforms)), type);
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape seed_shape = ctx->InputShape(2);
+    OP_REQUIRES(ctx, seed_shape.dims() == 1 && seed_shape.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_shape.DebugString()));
+    CategoricalOp::Compile(ctx);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StatelessCategoricalOp);
+};
+
+REGISTER_XLA_OP(Name("StatelessMultinomial")
+                    .CompileTimeConstantInput("num_samples")
+                    .TypeConstraint("T", {DT_FLOAT, DT_BFLOAT16})
+                    .TypeConstraint("Tseed", DT_INT32),
+                StatelessCategoricalOp);
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index 0ae23aa6dfe49048ac5cb8ae00c12432b2e2a2fe..cd7c7f4a82df7a65829787efcb1fd2f77870e945 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
@@ -37,16 +38,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-// Used to determine the number of Tensors allowed in a Concat op to prevent
-// going over the max gpu parameter memory size. This is an issue because concat
-// is variadic and can have an unlimited number of arguments when called.
-// Concat ops with more Tensors than this will be split into multiple concat
-// ops.
-//
-// TODO(b/112613927): Remove the logic here and put it properly in an HLO pass
-// along with boxing large numbers of parameters.
-constexpr int64 kMaxConcatArgsPerOp = 500;
-
 // --------------------------------------------------------------------------
 class ConcatBaseOp : public XlaOpKernel {
  public:
@@ -55,15 +46,13 @@ class ConcatBaseOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape concat_dim_tensor_shape = ctx->InputShape(axis_index_);
-    OP_REQUIRES(
-        ctx, IsLegacyScalar(concat_dim_tensor_shape),
-        errors::InvalidArgument(
-            "Concat dim tensor should be a scalar integer, but got shape ",
-            concat_dim_tensor_shape.DebugString()));
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(axis_index_, &literal));
-    // TODO(annarev): add a helper to support int64 input.
-    const int32 concat_dim = literal.Get<int>({});
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(concat_dim_tensor_shape),
+                errors::InvalidArgument(
+                    "Concat dim tensor should be a scalar, but got shape ",
+                    concat_dim_tensor_shape.DebugString()));
+    int64 concat_dim;
+    OP_REQUIRES_OK(ctx,
+                   ctx->ConstantInputAsIntScalar(axis_index_, &concat_dim));
 
     std::vector<xla::XlaOp> values;
     std::vector<TensorShape> shapes;
@@ -73,9 +62,7 @@ class ConcatBaseOp : public XlaOpKernel {
     const TensorShape& input_shape = shapes[0];
 
     int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
-    OP_REQUIRES(ctx,
-                (0 <= axis && axis < input_dims) ||
-                    (allow_legacy_scalars() && concat_dim == 0),
+    OP_REQUIRES(ctx, 0 <= axis && axis < input_dims,
                 errors::InvalidArgument(
                     "ConcatOp : Expected concatenating dimensions in the range "
                     "[",
@@ -84,16 +71,12 @@ class ConcatBaseOp : public XlaOpKernel {
     // Make a vector holding the XlaOp for each of the inputs that has non-zero
     // elements.
     std::vector<xla::XlaOp> input_data;
-    std::vector<xla::XlaOp> partial_concats;
     int output_concat_dim = 0;
-    const bool input_is_scalar = IsLegacyScalar(input_shape);
     for (int i = 0; i < N; ++i) {
       xla::XlaOp handle = values[i];
       const TensorShape& in_shape = shapes[i];
-      const bool in_is_scalar = IsLegacyScalar(in_shape);
       OP_REQUIRES(
-          ctx,
-          in_shape.dims() == input_dims || (input_is_scalar && in_is_scalar),
+          ctx, in_shape.dims() == input_dims,
           errors::InvalidArgument(
               "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
               input_shape.DebugString(), " vs. shape[", i,
@@ -105,30 +88,10 @@ class ConcatBaseOp : public XlaOpKernel {
         input_data.push_back(handle);
       }
       output_concat_dim += in_shape.dims() > 0 ? in_shape.dim_size(axis) : 1;
-
-      // Concat is associative, so it can be split into many operations when too
-      // many arguments are in a single op. This is a temporary workaround for
-      // b/112613927 where too many parameters in an XlaLaunchOp later result in
-      // too many parameters to a single GPU kernel.
-      if (i && i % kMaxConcatArgsPerOp == 0) {
-        partial_concats.push_back(
-            xla::ConcatInDim(ctx->builder(), input_data, axis));
-        input_data.clear();
-      }
     }
-    // Add any inputs that have not been put into another concat yet.
-    partial_concats.insert(partial_concats.end(), input_data.begin(),
-                           input_data.end());
 
     VLOG(1) << "Concat dim " << concat_dim << " equivalent to " << axis;
-    // Don't add an additional "identity" concatenate for better readibility of
-    // IR.
-    if (partial_concats.size() == 1) {
-      ctx->SetOutput(0, partial_concats.front());
-    } else {
-      ctx->SetOutput(0,
-                     xla::ConcatInDim(ctx->builder(), partial_concats, axis));
-    }
+    ctx->SetOutput(0, xla::ConcatInDim(ctx->builder(), input_data, axis));
   }
 
  private:
@@ -149,10 +112,11 @@ class ConcatV2Op : public ConcatBaseOp {
       : ConcatBaseOp(c, /* axis_index */ c->num_inputs() - 1) {}
 };
 
-REGISTER_XLA_OP(Name("Concat").CompileTimeConstInput("concat_dim"), ConcatOp);
+REGISTER_XLA_OP(Name("Concat").CompileTimeConstantInput("concat_dim"),
+                ConcatOp);
 REGISTER_XLA_OP(Name("ConcatV2")
                     .TypeConstraint("Tidx", DT_INT32)
-                    .CompileTimeConstInput("axis"),
+                    .CompileTimeConstantInput("axis"),
                 ConcatV2Op);
 
 class ConcatOffsetOp : public XlaOpKernel {
@@ -161,11 +125,10 @@ class ConcatOffsetOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape concat_dim_shape = ctx->InputShape(0);
-    OP_REQUIRES(
-        ctx, IsLegacyScalar(concat_dim_shape),
-        errors::InvalidArgument(
-            "Concat dim tensor should be a scalar integer, but got shape ",
-            concat_dim_shape.DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(concat_dim_shape),
+                errors::InvalidArgument(
+                    "Concat dim tensor should be a scalar, but got shape ",
+                    concat_dim_shape.DebugString()));
     for (int i = 1; i < ctx->num_inputs(); ++i) {
       OP_REQUIRES(ctx, TensorShapeUtils::IsVector(ctx->InputShape(i)),
                   errors::InvalidArgument("input ", i,
@@ -192,39 +155,38 @@ class ConcatOffsetOp : public XlaOpKernel {
     //  [0, 5, 0, 0]
     const int32 N = ctx->num_inputs() - 1;
     const TensorShape inp0_shape = ctx->InputShape(1);
-    xla::Literal inp0_literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &inp0_literal));
-    const int64 dims = inp0_shape.num_elements();
+    std::vector<int64> inp0_dims;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &inp0_dims));
+    const int64 inp0_rank = inp0_shape.num_elements();
 
-    xla::Literal concat_dim_literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &concat_dim_literal));
-    const int64 cdim = concat_dim_literal.Get<int>({});
+    int64 cdim;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &cdim));
 
-    VLOG(1) << "ConcatOffset " << cdim << "," << dims;
-    int32 axis = cdim < 0 ? cdim + dims : cdim;
-    OP_REQUIRES(ctx, FastBoundsCheck(axis, dims),
+    VLOG(1) << "ConcatOffset " << cdim << "," << inp0_rank;
+    int32 axis = cdim < 0 ? cdim + inp0_rank : cdim;
+    OP_REQUIRES(ctx, FastBoundsCheck(axis, inp0_rank),
                 errors::InvalidArgument("Concat dim is out of range: ", axis,
-                                        " vs. ", dims));
+                                        " vs. ", inp0_rank));
     int32 offset = 0;
     for (int i = 0; i < N; ++i) {
       const TensorShape inp_shape = ctx->InputShape(1 + i);
-      OP_REQUIRES(ctx, dims == inp_shape.num_elements(),
-                  errors::InvalidArgument("input ", i, " should contain ", dims,
-                                          " elements, but got ",
+      OP_REQUIRES(ctx, inp0_rank == inp_shape.num_elements(),
+                  errors::InvalidArgument("input ", i, " should contain ",
+                                          inp0_rank, " elements, but got ",
                                           inp_shape.num_elements()));
-      xla::Literal inp_literal;
-      OP_REQUIRES_OK(ctx, ctx->ConstantInput(1 + i, &inp_literal));
+      std::vector<int64> inp_dims;
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1 + i, &inp_dims));
 
-      Tensor out_constant(DT_INT32, TensorShape({dims}));
+      Tensor out_constant(DT_INT32, TensorShape({inp0_rank}));
       auto out_vec = out_constant.vec<int32>();
-      for (int64 j = 0; j < dims; ++j) {
+      for (int64 j = 0; j < inp0_rank; ++j) {
         if (j == axis) {
           out_vec(j) = offset;
-          offset += inp_literal.Get<int>({j});
+          offset += inp_dims[j];
         } else {
-          const int32 inp0_element = inp0_literal.Get<int>({j});
-          const int32 inp_element = inp_literal.Get<int>({j});
-          OP_REQUIRES(ctx, (inp0_element == inp_element),
+          const int32 inp0_element = inp0_dims[j];
+          const int32 inp_element = inp_dims[j];
+          OP_REQUIRES(ctx, inp0_element == inp_element,
                       errors::InvalidArgument("input[", i, ",", j,
                                               "] mismatch: ", inp0_element,
                                               " vs. ", inp_element));
@@ -238,8 +200,8 @@ class ConcatOffsetOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("ConcatOffset")
-                    .CompileTimeConstInput("concat_dim")
-                    .CompileTimeConstInput("shape"),
+                    .CompileTimeConstantInput("concat_dim")
+                    .CompileTimeConstantInput("shape"),
                 ConcatOffsetOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index 2628ef8e2454976aeff3859fa5dc1d8e106f32e1..dff8af800229b9605bb93e0498bc5e5cf012f244 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -42,11 +42,6 @@ class ConstOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     TensorShape shape(proto_.tensor_shape());
 
-    if (proto_.dtype() == DT_STRING) {
-      LOG(WARNING) << "Not computing Const of type DT_STRING";
-      ctx->SetInvalidOutput(0);
-      return;
-    }
     xla::XlaBuilder* b = ctx->builder();
 
     // To avoid blowups for large constants filled with the same value,
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index c9a1be494066e4f935a1d818bc86c86333e34fae..641fefafb357f6ad10483c454600f3dadd4f8cb7 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -65,60 +64,63 @@ xla::Shape ExpandedFilterShapeForDepthwiseConvolution(const xla::Shape& shape) {
 //   0 0 1 1 0 0   0 0 1 1 0 0
 //   0 0 0 0 1 1   0 0 0 0 1 1
 //
-// The first step is to create a one tensor, A, that is [3]
-//   0 1 2
+// The first step is to create a iota A with iota_dimension = 2
+//   0 0 0 0 0 0   0 0 0 0 0 0
+//   1 1 1 1 1 1   1 1 1 1 1 1
+//   2 2 2 2 2 2   2 2 2 2 2 2
 //
-// and another tensor, B,  that is [3 * 2]
-//   0 1 2 3 4 5
+//   0 0 0 0 0 0   0 0 0 0 0 0
+//   1 1 1 1 1 1   1 1 1 1 1 1
+//   2 2 2 2 2 2   2 2 2 2 2 2
 //
-// and divide B it by 2 to get
-//   0 0 1 1 2 2
+// and another iota B with iota_dimension = 3
+//   0 1 2 3 4 5  0 1 2 3 4 5
+//   0 1 2 3 4 5  0 1 2 3 4 5
+//   0 1 2 3 4 5  0 1 2 3 4 5
 //
-// then we broadcast the B to [2, 2, 3, 3 * 2]
-//   0 0 1 1 2 2   0 0 1 1 2 2
-//   0 0 1 1 2 2   0 0 1 1 2 2
-//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 1 2 3 4 5  0 1 2 3 4 5
+//   0 1 2 3 4 5  0 1 2 3 4 5
+//   0 1 2 3 4 5  0 1 2 3 4 5
 //
-//   0 0 1 1 2 2   0 0 1 1 2 2
-//   0 0 1 1 2 2   0 0 1 1 2 2
-//   0 0 1 1 2 2   0 0 1 1 2 2
+// and divide B by 2 to get
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//   0 0 1 1 2 2  0 0 1 1 2 2
 //
-// Finally compare A and broadcasted B in dimension 2 amd return the result at
-// the beginning of the comment.
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//
+// Finally compare A and B and return the result at the beginning of the
+// comment.
 xla::XlaOp CreateExpandedFilterMask(const xla::Shape& filter_shape,
                                     xla::XlaBuilder* builder) {
   xla::Shape expanded_filter_shape =
       ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
   int64 depthwise_multiplier =
       filter_shape.dimensions(filter_shape.dimensions_size() - 1);
-  int64 input_feature =
-      filter_shape.dimensions(filter_shape.dimensions_size() - 2);
-
-  // Create a M sized linspace and an M*N sized linspace that will be
-  // broadcasted into perpendicular dimensions and compared.
-  xla::XlaOp input_feature_iota = xla::Iota(builder, xla::S32, input_feature);
-  xla::XlaOp expanded_feature_iota =
-      xla::Iota(builder, xla::S32, input_feature * depthwise_multiplier);
 
-  // Divide the M*N sized linspace by the depthwise_multiplier to create
-  // [0 0 1 1 2 2] in the example in the function comment.
+  // Create two iotas with the shape of the expanded filter, one of them with
+  // the iota dimension chosen as the feature dimension, and the other a iota
+  // with the iota dimension chosen as the expanded output feature dimension.
+  std::vector<int64> iota_dimensions(expanded_filter_shape.dimensions().begin(),
+                                     expanded_filter_shape.dimensions().end());
+  xla::Shape iota_shape = xla::ShapeUtil::MakeShape(xla::S32, iota_dimensions);
+  xla::XlaOp input_feature_iota = xla::Iota(
+      builder, iota_shape, /*iota_dimension=*/iota_dimensions.size() - 2);
+  xla::XlaOp expanded_feature_iota = xla::Iota(
+      builder, iota_shape, /*iota_dimension=*/iota_dimensions.size() - 1);
+
+  // Divide 'expanded_feature_iota' by the depthwise_multiplier to create
+  // [0 0 1 1 2 2] ... in the example in the function comment.
   expanded_feature_iota =
       xla::Div(expanded_feature_iota,
                XlaHelpers::IntegerLiteral(builder, DataType::DT_INT32,
                                           depthwise_multiplier));
 
-  // Broadcast the N*M linspace to [H, W, ..., M, M*N].
-  std::vector<int64> expanded_feature_broadcast_dims(
-      expanded_filter_shape.dimensions().begin(),
-      expanded_filter_shape.dimensions().end());
-  expanded_feature_broadcast_dims.pop_back();
-  auto broadcasted_expanded_feature_iota =
-      xla::Broadcast(expanded_feature_iota, expanded_feature_broadcast_dims);
-
-  // Compare the broadcasted linspace to the input feature linspace in the
-  // input feature dimension to create a diagonal predicate.
-  return xla::Eq(broadcasted_expanded_feature_iota, input_feature_iota,
-                 {expanded_filter_shape.dimensions_size() - 2});
+  // Compare 'input_feature_iota' with 'expanded_feature_iota' to create a
+  // diagonal predicate.
+  return xla::Eq(expanded_feature_iota, input_feature_iota);
 }
 
 // Reshapes a filter of shape [H, W, ..., M, N] to [H, W, ..., 1, M*N]. Used to
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index cd7c820be0b6029514ff74288e7bdd3f75b5d6b1..eafdba876ae9e2c38694f065cf83bb3725b8460e 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -124,7 +124,7 @@ class Conv2DBackpropInputOp : public ConvBackpropInputOp {
       : ConvBackpropInputOp(ctx, /*num_spatial_dims=*/2, /*depthwise=*/false) {}
 };
 REGISTER_XLA_OP(
-    Name("Conv2DBackpropInput").CompileTimeConstInput("input_sizes"),
+    Name("Conv2DBackpropInput").CompileTimeConstantInput("input_sizes"),
     Conv2DBackpropInputOp);
 
 class Conv3DBackpropInputOp : public ConvBackpropInputOp {
@@ -133,7 +133,7 @@ class Conv3DBackpropInputOp : public ConvBackpropInputOp {
       : ConvBackpropInputOp(ctx, /*num_spatial_dims=*/3, /*depthwise=*/false) {}
 };
 REGISTER_XLA_OP(
-    Name("Conv3DBackpropInputV2").CompileTimeConstInput("input_sizes"),
+    Name("Conv3DBackpropInputV2").CompileTimeConstantInput("input_sizes"),
     Conv3DBackpropInputOp);
 
 class DepthwiseConv2DBackpropInputOp : public ConvBackpropInputOp {
@@ -142,7 +142,7 @@ class DepthwiseConv2DBackpropInputOp : public ConvBackpropInputOp {
       : ConvBackpropInputOp(ctx, /*num_spatial_dims=*/2, /*depthwise=*/true) {}
 };
 REGISTER_XLA_OP(Name("DepthwiseConv2dNativeBackpropInput")
-                    .CompileTimeConstInput("input_sizes"),
+                    .CompileTimeConstantInput("input_sizes"),
                 DepthwiseConv2DBackpropInputOp);
 
 class ConvBackpropFilterOp : public XlaOpKernel {
@@ -183,7 +183,7 @@ class Conv2DBackpropFilterOp : public ConvBackpropFilterOp {
   }
 };
 REGISTER_XLA_OP(
-    Name("Conv2DBackpropFilter").CompileTimeConstInput("filter_sizes"),
+    Name("Conv2DBackpropFilter").CompileTimeConstantInput("filter_sizes"),
     Conv2DBackpropFilterOp);
 
 class Conv3DBackpropFilterOp : public ConvBackpropFilterOp {
@@ -193,7 +193,7 @@ class Conv3DBackpropFilterOp : public ConvBackpropFilterOp {
   }
 };
 REGISTER_XLA_OP(
-    Name("Conv3DBackpropFilterV2").CompileTimeConstInput("filter_sizes"),
+    Name("Conv3DBackpropFilterV2").CompileTimeConstantInput("filter_sizes"),
     Conv3DBackpropFilterOp);
 
 class DepthwiseConv2DBackpropFilterOp : public ConvBackpropFilterOp {
@@ -202,7 +202,7 @@ class DepthwiseConv2DBackpropFilterOp : public ConvBackpropFilterOp {
       : ConvBackpropFilterOp(ctx, /*num_spatial_dims=*/2, /*depthwise=*/true) {}
 };
 REGISTER_XLA_OP(Name("DepthwiseConv2dNativeBackpropFilter")
-                    .CompileTimeConstInput("filter_sizes"),
+                    .CompileTimeConstantInput("filter_sizes"),
                 DepthwiseConv2DBackpropFilterOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
index ef1015552d181a183d412f9c269dd5ec608b388f..234f7b4a019c9aac4bac4f906ddbae166ecd9a80 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 
+#include "tensorflow/compiler/tf2xla/lib/broadcast.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -39,7 +40,8 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) {
   // compute valid broadcast shapes, but rely below on XLA to
   // automatically perform the broadcast assuming its valid shapes are
   // a superset of TensorFlow's valid shapes.
-  BCast bcast(BCast::FromShape(lhs_shape), BCast::FromShape(rhs_shape));
+  BCast bcast(BCast::FromShape(lhs_shape), BCast::FromShape(rhs_shape),
+              /*fewer_dims_optimization=*/false);
   if (!bcast.IsValid()) {
     ctx->SetStatus(errors::InvalidArgument("Incompatible shapes: ",
                                            lhs_shape.DebugString(), " vs. ",
@@ -86,51 +88,18 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) {
 }
 
 /* static */ std::pair<xla::XlaOp, xla::XlaOp> XlaBinaryOp::Broadcast(
-    xla::XlaBuilder* builder, const xla::XlaOp& lhs, const xla::XlaOp& rhs,
-    const BCast& broadcast_helper) {
-  // Manually construct the broadcasting since MapN does not do
-  // automatic broadcasting. The bcast helper ensures that
-  // lhs.reshape(bcast.x_reshape()).broadcast(bcast.x_bcast()) and
-  // rhs.reshape(bcast.y_reshape()).broadcast(bcast.y_bcast()) have
-  // the same shape, so can be operated on by MapN.
-
-  // First reshape the inputs, which should be a metadata-only
-  // operation since we are flattening the dimensions in order.
-  auto lhs_shaped = xla::Reshape(lhs, broadcast_helper.x_reshape());
-  auto rhs_shaped = xla::Reshape(rhs, broadcast_helper.y_reshape());
-
-  // Next broadcast the necessary input dimensions. We rely on the
-  // XLA optimizer to be smart about the fact that we are asking
-  // it to broadcast size 1 on some of these dimensions, to avoid
-  // adding complexity to this code.
-  auto lhs_broadcast = xla::Broadcast(lhs_shaped, broadcast_helper.x_bcast());
-  int lhs_size = broadcast_helper.x_bcast().size();
-  auto rhs_broadcast = xla::Broadcast(rhs_shaped, broadcast_helper.y_bcast());
-  int rhs_size = broadcast_helper.y_bcast().size();
-
-  // Now reshape them to the correct output shape. After the
-  // broadcast each side is twice as wide as it should be, since the
-  // broadcast dimensions were prepended to the shape. Reshape
-  // flattening each original dimension with the prepended broadcast
-  // dimension. E.g. if we started out with lhs_shaped with shape
-  // [5,2,3] and x_bcast was [2,1,7] then lhs_broadcast would have
-  // shape [2,1,7,5,2,3] and we want to reshape it to [10,2,21].
-  std::vector<int64> lhs_reorder;
-  for (int i = 0; i < lhs_size; ++i) {
-    lhs_reorder.push_back(i);
-    lhs_reorder.push_back(i + lhs_size);
+    xla::XlaOp lhs, xla::XlaOp rhs, const BCast& broadcast_helper) {
+  auto lhs_output = BroadcastTo(lhs, broadcast_helper.output_shape());
+  if (!lhs_output.ok()) {
+    xla::XlaOp error = lhs.builder()->ReportError(lhs_output.status());
+    return {error, error};
   }
-  auto lhs_output =
-      xla::Reshape(lhs_broadcast, lhs_reorder, broadcast_helper.output_shape());
-  std::vector<int64> rhs_reorder;
-  for (int i = 0; i < rhs_size; ++i) {
-    rhs_reorder.push_back(i);
-    rhs_reorder.push_back(i + rhs_size);
+  auto rhs_output = BroadcastTo(rhs, broadcast_helper.output_shape());
+  if (!rhs_output.ok()) {
+    xla::XlaOp error = rhs.builder()->ReportError(rhs_output.status());
+    return {error, error};
   }
-  auto rhs_output =
-      xla::Reshape(rhs_broadcast, rhs_reorder, broadcast_helper.output_shape());
-
-  return {lhs_output, rhs_output};
+  return {lhs_output.ValueOrDie(), rhs_output.ValueOrDie()};
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
index 6653944a911588b7bc88d67b8cdd2c17850530f0..516ead4bfe89b4ddeee11dcc6410a838d04f28a9 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
@@ -67,8 +67,7 @@ class XlaBinaryOp : public XlaOpKernel {
   // 'broadcast_helper', yielding arguments 'lhs' and 'rhs' that have the same
   // shape.
   static std::pair<xla::XlaOp, xla::XlaOp> Broadcast(
-      xla::XlaBuilder* builder, const xla::XlaOp& lhs, const xla::XlaOp& rhs,
-      const BCast& broadcast_helper);
+      xla::XlaOp lhs, xla::XlaOp rhs, const BCast& broadcast_helper);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index 49c12fc232092873b69961644a059abc6035f64f..ee79cbc70da269be7586c47b4fd33c901f4fd581 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
index 4af1e8b44cbbd02d8e3ea5e42d841c92288b5d56..bb2c0d9ddb8504a1156a74b6ece5d41b620803c7 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
@@ -102,8 +102,9 @@ class DynamicSliceOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("XlaDynamicSlice").CompileTimeConstInput("size_indices"),
-                DynamicSliceOp);
+REGISTER_XLA_OP(
+    Name("XlaDynamicSlice").CompileTimeConstantInput("size_indices"),
+    DynamicSliceOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index cb73053666d4c32bc0a2ef19b174aee1a29f101e..6e6ba21daf5bf3eab5bfc15378e77b6dd253da7c 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -113,8 +113,20 @@ class DynamicStitchOp : public XlaOpKernel {
       }
     }
     int number_of_indices = max_index + 1;
-    OP_REQUIRES(ctx, number_of_indices > 0,
-                errors::InvalidArgument("no indices supplied"));
+    int64 result_rank = 1 + data0_shape.dims() - indices0_shape.dims();
+    if (number_of_indices == 0) {
+      std::vector<int64> result_shape(result_rank);
+      for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) {
+        result_shape[d - indices0_shape.dims() + 1] = data0_shape.dim_size(d);
+      }
+      xla::PrimitiveType element_type =
+          ctx->input_xla_type(ctx->num_inputs() - 1);
+      xla::Literal empty_literal = xla::Literal::CreateFromShape(
+          xla::ShapeUtil::MakeShape(element_type, result_shape));
+      ctx->SetOutput(0, xla::ConstantLiteral(ctx->builder(), empty_literal));
+      return;
+    }
+
     // Construct the reverse mapping, for each index, of which slice of which
     // input it comes from.
     std::vector<int32> src_input_vector(number_of_indices);
@@ -157,12 +169,9 @@ class DynamicStitchOp : public XlaOpKernel {
 
     // Set up the vectors for slicing: the first dimension will vary
     // slice by slice, and the rest take the full common extra shape.
-    std::vector<int64> slice_start(1 + data0_shape.dims() -
-                                   indices0_shape.dims());
-    std::vector<int64> slice_limit(1 + data0_shape.dims() -
-                                   indices0_shape.dims());
-    std::vector<int64> stride(1 + data0_shape.dims() - indices0_shape.dims(),
-                              1);
+    std::vector<int64> slice_start(result_rank);
+    std::vector<int64> slice_limit(result_rank);
+    std::vector<int64> stride(result_rank, 1);
     for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) {
       slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d);
     }
@@ -200,10 +209,11 @@ class DynamicStitchOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("DynamicStitch").CompileTimeConstInput("indices"),
-                DynamicStitchOp);
-REGISTER_XLA_OP(Name("ParallelDynamicStitch").CompileTimeConstInput("indices"),
+REGISTER_XLA_OP(Name("DynamicStitch").CompileTimeConstantInput("indices"),
                 DynamicStitchOp);
+REGISTER_XLA_OP(
+    Name("ParallelDynamicStitch").CompileTimeConstantInput("indices"),
+    DynamicStitchOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
index c68b0bfd7961892294c2931e5c4c44de534a7740..29687c7b82f92d9f336854c4575746589c63b64f 100644
--- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/tensor_format.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc b/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec3463bd58f55c1fc6a8f7c074c8e487d266d7b6
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/fake_param_op.cc
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+
+namespace tensorflow {
+
+// This OpKernel implements the FakeParam Op for XLA JIT devices. Create zeros
+// with the appropriate shape for FakeParam op.
+class XlaFakeParamOp : public XlaOpKernel {
+ public:
+  explicit XlaFakeParamOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    DataType dtype;
+    TensorShape tensor_shape;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &tensor_shape));
+    OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, tensor_shape, &shape_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    ctx->SetOutput(0, xla::Zeros(b, shape_));
+  }
+
+ private:
+  xla::Shape shape_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaFakeParamOp);
+};
+
+REGISTER_XLA_OP(Name("FakeParam"), XlaFakeParamOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
index cdba6680dee3fade5bdf0c453ed672b653072b0d..142be030f737f105980ab9c80a5a849e1ca6eb47 100644
--- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
@@ -260,19 +260,19 @@ class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel {
     xla::XlaOp below_min = xla::Lt(input, nudged_input_min);
     xla::XlaOp select1 = xla::Select(below_min, gradient, zeroes);
     xla::XlaOp reduce1 = xla::ReduceAll(
-        XlaHelpers::ConvertElementType(b, select1, accumulation_type),
+        XlaHelpers::ConvertElementType(select1, accumulation_type),
         XlaHelpers::Zero(b, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type));
-    xla::XlaOp output1 = XlaHelpers::ConvertElementType(b, reduce1, data_type);
+    xla::XlaOp output1 = XlaHelpers::ConvertElementType(reduce1, data_type);
     ctx->SetOutput(1, output1);
 
     xla::XlaOp above_max = xla::Gt(input, nudged_input_max);
     xla::XlaOp select2 = xla::Select(above_max, gradient, zeroes);
     xla::XlaOp reduce2 = xla::ReduceAll(
-        XlaHelpers::ConvertElementType(b, select2, accumulation_type),
+        XlaHelpers::ConvertElementType(select2, accumulation_type),
         XlaHelpers::Zero(b, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type));
-    xla::XlaOp output2 = XlaHelpers::ConvertElementType(b, reduce2, data_type);
+    xla::XlaOp output2 = XlaHelpers::ConvertElementType(reduce2, data_type);
     ctx->SetOutput(2, output2);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
index 80bcef966360ec9a1ca63a02741108ce41b31846..6df8b5367d2390e65995beb1583b225755e6ee9f 100644
--- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -50,11 +51,36 @@ class GenericFftOp : public XlaOpKernel {
         errors::InvalidArgument("input must be at least 1 dimensional"));
 
     std::vector<int64> fft_length;
+    xla::XlaOp input = ctx->Input(0);
     if (fft_type_ == FftType::RFFT || fft_type_ == FftType::IRFFT) {
       OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &fft_length));
       OP_REQUIRES(ctx, fft_length.size() == fft_rank_,
                   errors::InvalidArgument("fft_length must be length ",
                                           fft_rank_, " vector"));
+
+      // Zero pad or truncate the axes we're doing FFT on.
+      absl::InlinedVector<int64, 4> slice_sizes = input_shape.dim_sizes();
+      std::vector<std::pair<int64, int64>> padding_sizes(slice_sizes.size());
+      std::vector<int64> expected_sizes = fft_length;
+      // IRFFT wants the innermost axis to be n / 2 + 1.
+      if (fft_type_ == FftType::IRFFT) {
+        expected_sizes[fft_rank_ - 1] = fft_length[fft_rank_ - 1] / 2 + 1;
+      }
+      for (int i = 0; i < fft_rank_; i++) {
+        int index = input_shape.dims() - fft_rank_ + i;
+        if (input_shape.dim_size(index) > expected_sizes[i]) {
+          slice_sizes[index] = expected_sizes[i];
+        } else {
+          padding_sizes[index].second =
+              expected_sizes[i] - input_shape.dim_size(index);
+        }
+      }
+
+      std::vector<int64> start_indices(input_shape.dims(), 0);
+      std::vector<int64> strides(input_shape.dims(), 1);
+      input = xla::Pad(xla::Slice(input, start_indices, slice_sizes, strides),
+                       XlaHelpers::Zero(ctx->builder(), ctx->input_type(0)),
+                       xla::MakeEdgePaddingConfig(padding_sizes));
     } else {
       // Innermost axis provides the FFT length.
       for (int i = 0; i < fft_rank_; i++) {
@@ -63,7 +89,7 @@ class GenericFftOp : public XlaOpKernel {
       }
     }
 
-    xla::XlaOp fft = xla::Fft(ctx->Input(0), fft_type_, fft_length);
+    xla::XlaOp fft = xla::Fft(input, fft_type_, fft_length);
     ctx->SetOutput(0, fft);
   }
 
@@ -106,9 +132,11 @@ class RFFTOp : public GenericFftOp {
   explicit RFFTOp(OpKernelConstruction* ctx)
       : GenericFftOp(ctx, /*fft_type=*/FftType::RFFT, /*fft_rank=*/FFTRank) {}
 };
-REGISTER_XLA_OP(Name("RFFT").CompileTimeConstInput("fft_length"), RFFTOp<1>);
-REGISTER_XLA_OP(Name("RFFT2D").CompileTimeConstInput("fft_length"), RFFTOp<2>);
-REGISTER_XLA_OP(Name("RFFT3D").CompileTimeConstInput("fft_length"), RFFTOp<3>);
+REGISTER_XLA_OP(Name("RFFT").CompileTimeConstantInput("fft_length"), RFFTOp<1>);
+REGISTER_XLA_OP(Name("RFFT2D").CompileTimeConstantInput("fft_length"),
+                RFFTOp<2>);
+REGISTER_XLA_OP(Name("RFFT3D").CompileTimeConstantInput("fft_length"),
+                RFFTOp<3>);
 
 template <int FFTRank>
 class IRFFTOp : public GenericFftOp {
@@ -116,10 +144,11 @@ class IRFFTOp : public GenericFftOp {
   explicit IRFFTOp(OpKernelConstruction* ctx)
       : GenericFftOp(ctx, /*fft_type=*/FftType::IRFFT, /*fft_rank=*/FFTRank) {}
 };
-REGISTER_XLA_OP(Name("IRFFT").CompileTimeConstInput("fft_length"), IRFFTOp<1>);
-REGISTER_XLA_OP(Name("IRFFT2D").CompileTimeConstInput("fft_length"),
+REGISTER_XLA_OP(Name("IRFFT").CompileTimeConstantInput("fft_length"),
+                IRFFTOp<1>);
+REGISTER_XLA_OP(Name("IRFFT2D").CompileTimeConstantInput("fft_length"),
                 IRFFTOp<2>);
-REGISTER_XLA_OP(Name("IRFFT3D").CompileTimeConstInput("fft_length"),
+REGISTER_XLA_OP(Name("IRFFT3D").CompileTimeConstantInput("fft_length"),
                 IRFFTOp<3>);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
index 54b21a278229024e3e54e9135548be6b69b077e1..35e0625dbb0d4c696d36cce642d6f50f1d220c45 100644
--- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
 namespace {
@@ -33,44 +34,25 @@ class FillOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     // The output of this Op is a tensor of shape 'dims_shape' with each
     // element set to the scalar 'dims_literal'.
-    const TensorShape dims_shape = ctx->InputShape(0);
-    const TensorShape value_shape = ctx->InputShape(1);
+    const TensorShape dims_shape = ctx->InputShape("dims");
+    const TensorShape value_shape = ctx->InputShape("value");
     OP_REQUIRES(
-        ctx, IsLegacyVector(dims_shape),
+        ctx, TensorShapeUtils::IsVector(dims_shape),
         errors::InvalidArgument("dims must be a vector of int32, got shape ",
                                 dims_shape.DebugString()));
-    OP_REQUIRES(ctx, IsLegacyScalar(value_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(value_shape),
                 errors::InvalidArgument("value must be a scalar, got shape ",
                                         value_shape.DebugString()));
-    // Evaluate the 'dims' constant input, reshaping to a vector if it
-    // was a 'legacy' vector (secretly a scalar).
-    xla::Literal dims_literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(
-                            0, {dims_shape.num_elements()}, &dims_literal));
 
-    // Convert the dims literal into a vector that we can pass to
-    // XlaBuilder.
-    std::vector<int64> broadcast;
-    broadcast.reserve(dims_literal.shape().dimensions(0));
-    for (int i = 0; i < dims_literal.shape().dimensions(0); ++i) {
-      broadcast.push_back(dims_literal.Get<int>({i}));
-    }
-    // Look up the value input, reshaping to a scalar if it was a
-    // 'legacy' scalar (secretly a vector).
-    xla::XlaOp data = ctx->Input(1);
-    if (value_shape.dims() > 0) {
-      CHECK_EQ(value_shape.dims(), 1);
-      data = xla::Reshape(data, {});
-    }
-    // Emit the actual computation, which broadcasts the scalar to the
-    // desired shape.
-    auto result = xla::Broadcast(data, broadcast);
+    std::vector<int64> dims;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("dims", &dims));
 
+    auto result = xla::Broadcast(ctx->Input("value"), dims);
     ctx->SetOutput(0, result);
   }
 };
 
-REGISTER_XLA_OP(Name("Fill").CompileTimeConstInput("dims"), FillOp);
+REGISTER_XLA_OP(Name("Fill").CompileTimeConstantInput("dims"), FillOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 44140304fdf5cdf60d8ad8b85c532fcadff8ba86..20b0de193dc060197f3062d3be0b8d45f7dcb9b1 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -194,7 +194,7 @@ class GatherOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("Gather"), GatherOp);
-REGISTER_XLA_OP(Name("GatherV2").CompileTimeConstInput("axis"), GatherOp);
+REGISTER_XLA_OP(Name("GatherV2").CompileTimeConstantInput("axis"), GatherOp);
 
 class GatherNdOp : public XlaOpKernel {
  public:
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index 56da50f140893c68c8a1556853884720b21c7229..b5e083912555c865b5eadc7697075c9ca4451ca9 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -72,7 +72,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       arg.shape = resource->shape();
       OP_REQUIRES(ctx, arg.initialized,
                   errors::Unimplemented("Uninitialized arguments: ", arg.name));
-      arg.tensor_array_size = resource->tensor_array_size();
+      arg.max_array_size = resource->max_array_size();
       for (const auto& gradient : resource->tensor_array_gradients()) {
         arg.tensor_array_gradients.insert(gradient.first);
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index 921b4340c0ac674a5ad7d17aaf54f1cf36975151..e9bb0a77e99d144863b027bd214081316d61c314 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -189,12 +191,11 @@ class AdjustContrastOpV2 : public XlaOpKernel {
     DataType type = context->input_type(0);
 
     const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
-    auto converted =
-        XlaHelpers::ConvertElementType(b, input, accumulation_type);
+    auto converted = XlaHelpers::ConvertElementType(input, accumulation_type);
     auto reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                               *context->GetOrCreateAdd(accumulation_type),
                               {height_dim, width_dim});
-    auto output = XlaHelpers::ConvertElementType(b, reduce, type);
+    auto output = XlaHelpers::ConvertElementType(reduce, type);
     output =
         xla::Div(output, XlaHelpers::FloatLiteral(b, type, height * width));
 
@@ -316,6 +317,70 @@ class AdjustHueOp : public XlaOpKernel {
 };
 REGISTER_XLA_OP(Name("AdjustHue"), AdjustHueOp);
 
+struct WhileCondFn {
+  const int64 num_boxes;
+  const int64 output_size;
+
+  explicit WhileCondFn(int64 num_boxes, int64 output_size)
+      : num_boxes(num_boxes), output_size(output_size) {}
+
+  xla::StatusOr<xla::XlaOp> operator()(absl::Span<const xla::XlaOp> values,
+                                       xla::XlaBuilder* cond_builder) const {
+    xla::XlaOp row_idx = values[0];
+    xla::XlaOp row_in_bounds =
+        xla::Lt(row_idx, xla::ConstantR0<int32>(cond_builder, num_boxes));
+    xla::XlaOp num_outputs_so_far = values[1];
+    xla::XlaOp results_not_full = xla::Lt(
+        num_outputs_so_far, xla::ConstantR0<int32>(cond_builder, output_size));
+    return xla::And(row_in_bounds, results_not_full);
+  }
+};
+
+// Process the boxes one-by-one using the iou matrix mask.
+// This implementation uses a correct, but greedy, sequential algorithm
+// to ensure that suppressed boxes cannot themselves suppress other
+// boxes.
+struct SuppressBodyFn {
+  const int64 num_boxes;
+
+  explicit SuppressBodyFn(int64 num_boxes) : num_boxes(num_boxes) {}
+
+  xla::StatusOr<std::vector<xla::XlaOp>> operator()(
+      absl::Span<const xla::XlaOp> values, xla::XlaBuilder* builder) const {
+    auto row_idx = values[0];
+    auto num_outputs_so_far = values[1];
+    auto iou_mask = values[2];
+    auto included_iou = values[3];
+    auto zero_r1 = xla::ConstantR1<int32>(builder, {0});
+    // Determine if current elem is active using a slice.
+    auto row_idx_r1 = xla::Reshape(row_idx, {1});
+    auto active_elem = xla::DynamicSlice(included_iou, row_idx_r1, {1});
+    active_elem = xla::Reshape(active_elem, {});
+    // Increment output count iff current elem is not suppressed.
+    num_outputs_so_far = xla::Select(
+        active_elem, num_outputs_so_far + xla::ConstantR0<int32>(builder, 1),
+        num_outputs_so_far);
+    // Slice out the row_idx.
+    auto starts = xla::ConcatInDim(builder, {row_idx_r1, zero_r1}, 0);
+    auto row_iou = xla::DynamicSlice(iou_mask, starts, {1, num_boxes});
+    // Remove the diagonal from consideration. An elem cannot suppress
+    // itself.
+    auto update_starts = xla::ConcatInDim(builder, {zero_r1, row_idx_r1}, 0);
+    row_iou = xla::DynamicUpdateSlice(
+        row_iou, xla::ConstantR2FromArray2D<bool>(builder, {{false}}),
+        update_starts);
+    // Create a suppression by inverting polarity.
+    row_iou = xla::Reshape(row_iou, {num_boxes});
+    auto supp_mask = xla::Not(row_iou);
+    // Update mask iff current elem is not suppressed.
+    included_iou = xla::Select(xla::Broadcast(active_elem, {num_boxes}),
+                               xla::And(included_iou, supp_mask), included_iou);
+    row_idx = row_idx + xla::ConstantR0<int32>(builder, 1);
+    return std::vector<xla::XlaOp>{row_idx, num_outputs_so_far, iou_mask,
+                                   included_iou};
+  }
+};
+
 class NonMaxSuppressionOp : public XlaOpKernel {
  public:
   explicit NonMaxSuppressionOp(OpKernelConstruction* context)
@@ -326,14 +391,12 @@ class NonMaxSuppressionOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* context) override {
     // TODO(b/111646731): Improve scalability of this op, using blocking.
-    int num_boxes_dim = 0;
-    int coords_dim = 1;
     const TensorShape& boxes_shape = context->InputShape("boxes");
     OP_REQUIRES(context, TensorShapeUtils::IsMatrix(boxes_shape),
                 errors::InvalidArgument("boxes must be 2-D, currently: ",
                                         boxes_shape.DebugString()));
-    const int64 num_boxes = boxes_shape.dim_size(num_boxes_dim);
-    OP_REQUIRES(context, boxes_shape.dim_size(coords_dim) == 4,
+    const int64 num_boxes = boxes_shape.dim_size(0);
+    OP_REQUIRES(context, boxes_shape.dim_size(1) == 4,
                 errors::InvalidArgument("boxes must have 4 columns",
                                         boxes_shape.DebugString()));
     const TensorShape& scores_shape = context->InputShape("scores");
@@ -347,9 +410,13 @@ class NonMaxSuppressionOp : public XlaOpKernel {
     OP_REQUIRES(context, pad_to_max_output_size_,
                 errors::InvalidArgument(
                     "XLA compilation requires pad_to_max_output_size == True"));
+    OP_REQUIRES(context, num_boxes <= kint32max,
+                errors::InvalidArgument("XLA compilation requires number of "
+                                        "boxes to be <= kint32max, got ",
+                                        num_boxes));
 
-    xla::XlaOp boxes = context->Input("boxes");
-    xla::XlaOp scores = context->Input("scores");
+    const xla::XlaOp boxes_input = context->Input("boxes");
+    const xla::XlaOp scores_input = context->Input("scores");
     int64 output_size;
     OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &output_size));
     OP_REQUIRES(
@@ -358,90 +425,113 @@ class NonMaxSuppressionOp : public XlaOpKernel {
     OP_REQUIRES(context, output_size <= kint32max,
                 errors::InvalidArgument("Need output_size <= kint32Max, got ",
                                         output_size));
-    xla::XlaOp score_thresh = context->Input("score_threshold");
-    xla::XlaOp iou_thresh = context->Input("iou_threshold");
-
+    const xla::XlaOp score_thresh = context->Input("score_threshold");
+    const xla::XlaOp iou_thresh = context->Input("iou_threshold");
     xla::XlaBuilder* const builder = context->builder();
 
     // Choose a more convenient layout.
-    xla::XlaOp boxes_t = xla::Transpose(boxes, {1, 0});
-    coords_dim = 0;
-    num_boxes_dim = 1;
-
-    // Shapes are henceforth [1, num_boxes].
-    xla::XlaOp coord_y0 = xla::SliceInDim(boxes_t,
-                                          /*start_index=*/0,
-                                          /*limit_index=*/1,
-                                          /*stride=*/1,
-                                          /*dimno=*/coords_dim);
-    xla::XlaOp coord_x0 = xla::SliceInDim(boxes_t,
-                                          /*start_index=*/1,
-                                          /*limit_index=*/2,
-                                          /*stride=*/1,
-                                          /*dimno=*/coords_dim);
-    xla::XlaOp coord_y1 = xla::SliceInDim(boxes_t,
-                                          /*start_index=*/2,
-                                          /*limit_index=*/3,
-                                          /*stride=*/1,
-                                          /*dimno=*/coords_dim);
-    xla::XlaOp coord_x1 = xla::SliceInDim(boxes_t,
-                                          /*start_index=*/3,
-                                          /*limit_index=*/4,
-                                          /*stride=*/1,
-                                          /*dimno=*/coords_dim);
-    xla::XlaOp y1 =
-        xla::Select(xla::Le(coord_y0, coord_y1), coord_y0, coord_y1);
-    xla::XlaOp y2 =
-        xla::Select(xla::Le(coord_y0, coord_y1), coord_y1, coord_y0);
-    xla::XlaOp x1 =
-        xla::Select(xla::Le(coord_x0, coord_x1), coord_x0, coord_x1);
-    xla::XlaOp x2 =
-        xla::Select(xla::Le(coord_x0, coord_x1), coord_x1, coord_x0);
+    const xla::XlaOp boxes = xla::Transpose(boxes_input, {1, 0});
+    const xla::XlaOp boxes_sorted = xla::GetTupleElement(
+        xla::Sort(/*keys=*/-xla::Broadcast(scores_input, {4}),
+                  /*values=*/{boxes},
+                  /*dimension=*/1),
+        1);
+    // Track the mapping of indices into sorted domain.
+    const xla::XlaOp iota_indices = xla::Iota(builder, xla::S32, num_boxes);
+    const xla::XlaOp indices_sort = xla::Sort(-scores_input, {iota_indices});
+    const xla::XlaOp indices_sorted = xla::GetTupleElement(indices_sort, 1);
+    const xla::XlaOp scores = xla::Neg(xla::GetTupleElement(indices_sort, 0));
+
+    // Shapes are henceforth [1, num_boxes]. 'c_y0' denotes 'coordinate' y0.
+    const xla::XlaOp c_y0 = xla::Reshape(xla::SliceInDim(boxes_sorted,
+                                                         /*start_index=*/0,
+                                                         /*limit_index=*/1,
+                                                         /*stride=*/1,
+                                                         /*dimno=*/0),
+                                         {num_boxes});
+    const xla::XlaOp c_x0 = xla::Reshape(xla::SliceInDim(boxes_sorted,
+                                                         /*start_index=*/1,
+                                                         /*limit_index=*/2,
+                                                         /*stride=*/1,
+                                                         /*dimno=*/0),
+                                         {num_boxes});
+    const xla::XlaOp c_y1 = xla::Reshape(xla::SliceInDim(boxes_sorted,
+                                                         /*start_index=*/2,
+                                                         /*limit_index=*/3,
+                                                         /*stride=*/1,
+                                                         /*dimno=*/0),
+                                         {num_boxes});
+    const xla::XlaOp c_x1 = xla::Reshape(xla::SliceInDim(boxes_sorted,
+                                                         /*start_index=*/3,
+                                                         /*limit_index=*/4,
+                                                         /*stride=*/1,
+                                                         /*dimno=*/0),
+                                         {num_boxes});
+
+    xla::XlaOp y1 = xla::Select(xla::Le(c_y0, c_y1), c_y0, c_y1);
+    xla::XlaOp y2 = xla::Select(xla::Le(c_y0, c_y1), c_y1, c_y0);
+    xla::XlaOp x1 = xla::Select(xla::Le(c_x0, c_x1), c_x0, c_x1);
+    xla::XlaOp x2 = xla::Select(xla::Le(c_x0, c_x1), c_x1, c_x0);
     xla::XlaOp area = (y2 - y1) * (x2 - x1);
 
-    // Transpose the 1xN tensors, instead of the NxN tensors.
-    xla::XlaOp y1_t = xla::Transpose(y1, {1, 0});
-    xla::XlaOp y2_t = xla::Transpose(y2, {1, 0});
-    xla::XlaOp x1_t = xla::Transpose(x1, {1, 0});
-    xla::XlaOp x2_t = xla::Transpose(x2, {1, 0});
-    xla::XlaOp area_t = xla::Transpose(area, {1, 0});
+    // Shapes are henceforth [1, num_boxes].
+    y1 = xla::Broadcast(y1, {1});
+    y2 = xla::Broadcast(y2, {1});
+    x1 = xla::Broadcast(x1, {1});
+    x2 = xla::Broadcast(x2, {1});
+    area = xla::Broadcast(area, {1});
 
     // Shapes are henceforth [num_boxes, num_boxes].
-    xla::XlaOp i_xmin = xla::Max(x1, x1_t);
-    xla::XlaOp i_ymin = xla::Max(y1, y1_t);
-    xla::XlaOp i_xmax = xla::Min(x2, x2_t);
-    xla::XlaOp i_ymax = xla::Min(y2, y2_t);
+    xla::XlaOp i_xmin = xla::Max(x1, xla::Transpose(x1, {1, 0}));
+    xla::XlaOp i_ymin = xla::Max(y1, xla::Transpose(y1, {1, 0}));
+    xla::XlaOp i_xmax = xla::Min(x2, xla::Transpose(x2, {1, 0}));
+    xla::XlaOp i_ymax = xla::Min(y2, xla::Transpose(y2, {1, 0}));
     auto square_zero = xla::ZerosLike(i_xmin);
 
     xla::XlaOp i_area = xla::Max(i_xmax - i_xmin, square_zero) *
                         xla::Max(i_ymax - i_ymin, square_zero);
-    xla::XlaOp u_area = area + area_t - i_area;
+    xla::XlaOp u_area = area + xla::Transpose(area, {1, 0}) - i_area;
     xla::XlaOp iou = i_area / u_area;
 
     xla::XlaOp iou_thresh_mask = xla::Gt(iou, iou_thresh + square_zero);
-    xla::XlaOp scores_2d = xla::Reshape(scores, {num_boxes, 1});
-    xla::XlaOp score_cmp_mask =
-        xla::Gt(scores_2d, xla::Transpose(scores_2d, {1, 0}));
-    xla::XlaOp suppress = xla::And(iou_thresh_mask, score_cmp_mask);
-
-    // Shapes are [num_boxes] after the reduce.
-    xla::XlaOp included_iou = xla::Not(xla::Reduce(
-        suppress,
-        /*init_value=*/xla::ConstantR0<bool>(builder, false),
-        /*computation=*/CreateScalarOrComputation(xla::PRED, builder),
-        /*dimensions_to_reduce=*/{0}));
+    xla::XlaOp included_iou =
+        xla::Broadcast(xla::ConstantR0<bool>(builder, true), {num_boxes});
+
+    std::vector<xla::XlaOp> init_values;
+    init_values.reserve(4);
+    init_values.push_back(xla::ConstantR0<int32>(builder, 0));  // col_idx
+    init_values.push_back(xla::ConstantR0<int32>(builder, 0));  // num_outputs
+    init_values.push_back(iou_thresh_mask);
+    init_values.push_back(included_iou);
+
+    auto suppress_loop_result =
+        XlaWhileLoop(WhileCondFn(num_boxes, output_size),
+                     SuppressBodyFn(num_boxes), init_values, "suppress_loop",
+                     builder)
+            .ValueOrDie();
+
     xla::XlaOp included_score =
         xla::Gt(scores, xla::Broadcast(score_thresh, {num_boxes}));
-    xla::XlaOp included = xla::And(included_iou, included_score);
+    xla::XlaOp included = xla::And(included_score, suppress_loop_result[3]);
+
+    // Only consider boxes over which we have iterated. This allows for accurate
+    // counting. DynamicSlice would require knowledge of the size of the output.
+    auto valid_elem = xla::Lt(
+        iota_indices, xla::Broadcast(suppress_loop_result[0], {num_boxes}));
+    included = xla::And(included, valid_elem);
+
     xla::XlaOp neg_inf =
         xla::Broadcast(xla::MinValue(builder, xla::F32), {num_boxes});
     xla::XlaOp scores_included = xla::Select(included, scores, neg_inf);
-
+    xla::XlaOp output_tuple = TopK(scores_included, output_size);
+    xla::XlaOp selected_indices_sorted = xla::GetTupleElement(output_tuple, 1);
+    // Calculate num_valid.
+    // Note: num_valid cannot be taken from the loop outputs, because outputs
+    // can be suppressed by score threshold.
     xla::XlaOp ones_included = xla::Select(
         included,
         xla::Broadcast(xla::ConstantR0<int32>(builder, 1), {num_boxes}),
         xla::Broadcast(xla::ConstantR0<int32>(builder, 0), {num_boxes}));
-
     // num_valid is scalar. Value should be bound by output_size.
     xla::XlaOp num_valid_total = xla::Reduce(
         ones_included,
@@ -451,8 +541,17 @@ class NonMaxSuppressionOp : public XlaOpKernel {
     xla::XlaOp num_valid =
         xla::Min(num_valid_total, xla::ConstantR0<int32>(builder, output_size));
 
-    xla::XlaOp output_tuple = TopK(scores_included, output_size);
-    xla::XlaOp selected_indices = xla::GetTupleElement(output_tuple, 1);
+    // Re-index into the original scores input tensor, using a Gather.
+    // Boxes were suppressed in the sorted domain.
+    xla::XlaOp selected_indices;
+    DataType gather_type = context->expected_output_dtype(0);
+    OP_REQUIRES_OK(
+        context,
+        XlaGather(indices_sorted, scores_shape, selected_indices_sorted,
+                  TensorShape({output_size}),
+                  /*axis=*/0,
+                  /*indices_are_nd=*/false,
+                  /*dtype=*/gather_type, DT_INT32, builder, &selected_indices));
 
     context->SetOutput(0, selected_indices);
     context->SetOutput(1, num_valid);
@@ -463,7 +562,7 @@ class NonMaxSuppressionOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(
-    Name("NonMaxSuppressionV4").CompileTimeConstInput("max_output_size"),
+    Name("NonMaxSuppressionV4").CompileTimeConstantInput("max_output_size"),
     NonMaxSuppressionOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index 7b2bb4a7c50fc954237e09a32f71009f790b60d0..5a10c52ba8b6d4fab73f0dda67cbd52fd625e76b 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -231,20 +230,22 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
     num_extended[0] = upper_padding[0] / (dims.kernel_size[0]);
     num_extended[1] = upper_padding[1] / (dims.kernel_size[1]);
 
+    const int64 batch_dim_size =
+        builder->GetShape(input).ValueOrDie().dimensions(0);
     if (num_extended[0] > 0) {
-      auto slice =
-          xla::Slice(input_data, {0, in_size[0] - 1, 0, 0},
-                     {1, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
+      auto slice = xla::Slice(
+          input_data, {0, in_size[0] - 1, 0, 0},
+          {batch_dim_size, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
       for (int i = 0; i < num_extended[0]; i++) {
         input_data = xla::ConcatInDim(builder, {input_data, slice}, 1);
       }
     }
 
     if (num_extended[1] > 0) {
-      auto slice =
-          xla::Slice(input_data, {0, 0, in_size[1] - 1, 0},
-                     {1, in_size[0] + num_extended[0], in_size[1], channels},
-                     {1, 1, 1, 1});
+      auto slice = xla::Slice(
+          input_data, {0, 0, in_size[1] - 1, 0},
+          {batch_dim_size, in_size[0] + num_extended[0], in_size[1], channels},
+          {1, 1, 1, 1});
       for (int i = 0; i < num_extended[1]; i++) {
         input_data = xla::ConcatInDim(builder, {input_data, slice}, 2);
       }
@@ -511,7 +512,7 @@ class ResizeBilinearOp : public XlaOpKernel {
   bool align_corners_;
 };
 
-REGISTER_XLA_OP(Name("ResizeBilinear").CompileTimeConstInput("size"),
+REGISTER_XLA_OP(Name("ResizeBilinear").CompileTimeConstantInput("size"),
                 ResizeBilinearOp);
 
 class ResizeBilinearGradOp : public XlaOpKernel {
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
index f3964748587c1b31cf8b1b76643ff19a9044bf44..843b6bb4e658af16fd753c1a20b35dd3d18df027 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
@@ -78,7 +78,7 @@ XlaArgMaxOp::XlaArgMaxOp(OpKernelConstruction* ctx)
     : XlaArgMinMaxOp(ctx, /*is_min=*/false) {}
 REGISTER_XLA_OP(Name("ArgMax")
                     .Device(DEVICE_GPU_XLA_JIT)
-                    .CompileTimeConstInput("dimension"),
+                    .CompileTimeConstantInput("dimension"),
                 XlaArgMaxOp);
 
 namespace {
@@ -89,7 +89,8 @@ class XlaArgMinOp : public XlaArgMinMaxOp {
 };
 XlaArgMinOp::XlaArgMinOp(OpKernelConstruction* ctx)
     : XlaArgMinMaxOp(ctx, /*is_min=*/true) {}
-REGISTER_XLA_OP(Name("ArgMin").CompileTimeConstInput("dimension"), XlaArgMinOp);
+REGISTER_XLA_OP(Name("ArgMin").CompileTimeConstantInput("dimension"),
+                XlaArgMinOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
index 3d81ae9eb89a80e5b89b180ad77521c5ed15e79d..e2c05b648bb194b1b452c527ddb1a2c5995b1217 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
@@ -30,7 +30,9 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-// The logic below uses a custom-call to implement argmax.
+// The logic below uses a custom-call to implement argmax when possible. When
+// custom-call is not allowed or input shapes are not supported, this kernel
+// falls back to using XLA HLO native ArgMax.
 //
 // Also see b/29507024 for first-class XLA support for indexing ops.
 class ArgMaxCustomCallOp : public XlaOpKernel {
@@ -48,30 +50,42 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
     // We require that the dimension argument is a constant, since it lets us
     // dispatch to a specialized custom-call function without any run-time
     // overhead, when compiling ahead-of-time.
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &literal));
-    const int32 dim = literal.Get<int32>({});
-    OP_REQUIRES(ctx, dim >= 0, errors::InvalidArgument("dim must be >= 0"));
-    OP_REQUIRES(
-        ctx, dim < input_shape.dims(),
-        errors::InvalidArgument("dim must be < input rank (",
-                                input_shape.dims(), "), but got: ", dim));
-    const int64 dim_size = input_shape.dim_size(dim);
-    OP_REQUIRES(ctx, dim_size > 0,
+    int64 dim;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &dim));
+
+    const int input_dims = input_shape.dims();
+    const int axis = dim < 0 ? dim + input_dims : dim;
+    OP_REQUIRES(ctx, axis >= 0 && axis < input_dims,
+                errors::InvalidArgument("Expected dimension in the range [",
+                                        -input_dims, ", ", input_dims,
+                                        "), but got ", dim));
+
+    const int64 axis_size = input_shape.dim_size(axis);
+    OP_REQUIRES(ctx, axis_size > 0,
                 errors::InvalidArgument(
                     "Reduction axis ", dim,
                     " is empty in shape: ", input_shape.DebugString()));
 
-    // The output shape is the input shape contracted along dim.
+    const DataType dtype = output_type(0);
+    xla::PrimitiveType output_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype, &output_type));
+
+    // Fall back to XLA ArgMax HLO when CustomCall is not allowed or when input
+    // shape isn't supported.
+    if (!ctx->compiler()->options().allow_cpu_custom_calls ||
+        (input_dims != 1 && input_dims != 2)) {
+      xla::XlaOp output = XlaHelpers::ArgMax(ctx->Input(0), output_type, axis);
+      ctx->SetOutput(0, output);
+      return;
+    }
+
+    xla::XlaOp output;
+    // The output shape is the input shape contracted along axis.
     TensorShape output_shape;
     for (int d = 0; d < input_shape.dims() - 1; ++d) {
-      output_shape.AddDim(input_shape.dim_size((d < dim) ? d : d + 1));
+      output_shape.AddDim(input_shape.dim_size((d < axis) ? d : d + 1));
     }
 
-    // For now we use a custom-call, only for the 1d and 2d cases.
-    OP_REQUIRES(ctx, XlaContext::Get(ctx).allow_cpu_custom_calls(),
-                errors::InvalidArgument(
-                    "ArgMax implementation requires a CustomCall on CPU"));
     xla::XlaBuilder& b = *ctx->builder();
 
     // XLA passes <out> to the function, so it is not included here.
@@ -85,31 +99,32 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
       args.push_back(xla::ConstantLiteral(
           &b, xla::LiteralUtil::CreateR1<int64>(output_shape.dim_sizes())));
       args.push_back(
-          xla::ConstantLiteral(&b, xla::LiteralUtil::CreateR0<int32>(dim)));
+          xla::ConstantLiteral(&b, xla::LiteralUtil::CreateR0<int32>(axis)));
     }
 
-    xla::Shape xla_shape =
-        xla::ShapeUtil::MakeShape(xla::S64, output_shape.dim_sizes());
+    // The argmax function expects row-major layout.
+    xla::Shape xla_shape = xla::ShapeUtil::MakeShapeWithDescendingLayout(
+        xla::S64, output_shape.dim_sizes());
+    std::vector<xla::Shape> arg_shapes;
+    for (const xla::XlaOp& arg : args) {
+      auto shape_status = b.GetShape(arg);
+      OP_REQUIRES_OK(ctx, shape_status.status());
+      xla::Shape arg_shape = shape_status.ConsumeValueOrDie();
+      *arg_shape.mutable_layout() = xla::LayoutUtil::MakeDescendingLayout(
+          xla::ShapeUtil::Rank(arg_shape));
+      arg_shapes.push_back(std::move(arg_shape));
+    }
 
     // Tell XLA to call the custom code, defined in
-    // index_ops_kernel_argmax_float_1d.cc.
-    xla::XlaOp output;
-    switch (input_shape.dims()) {
-      case 1:
-        output =
-            xla::CustomCall(&b, "argmax_float_1d_xla_impl", args, xla_shape);
-        break;
-      case 2:
-        output =
-            xla::CustomCall(&b, "argmax_float_2d_xla_impl", args, xla_shape);
-        break;
-      default:
-        OP_REQUIRES(ctx, false,
-                    errors::Unimplemented(
-                        "Argmax is only implemented for 1d and 2d tensors"
-                        ", but got shape: ",
-                        input_shape.DebugString()));
+    // index_ops_kernel_argmax_float_{1, 2}d.cc.
+    if (input_dims == 1) {
+      output = xla::CustomCallWithLayout(&b, "argmax_float_1d_xla_impl", args,
+                                         xla_shape, arg_shapes);
+    } else {
+      output = xla::CustomCallWithLayout(&b, "argmax_float_2d_xla_impl", args,
+                                         xla_shape, arg_shapes);
     }
+    output = xla::ConvertElementType(output, output_type);
     ctx->SetOutput(0, output);
   }
 
@@ -120,7 +135,7 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("ArgMax")
                     .TypeConstraint("T", DT_FLOAT)
                     .Device(DEVICE_CPU_XLA_JIT)
-                    .CompileTimeConstInput("dimension"),
+                    .CompileTimeConstantInput("dimension"),
                 ArgMaxCustomCallOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
index f028e361bccd51de0bd69a1d2227c7afaed53455..93f029731c34e84000a3dc00df8af05654cccf2d 100644
--- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
@@ -37,12 +37,11 @@ class L2LossOp : public XlaOpKernel {
 
     //  output = sum(t ** 2) / 2
     const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype);
-    auto t =
-        XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type);
+    auto t = XlaHelpers::ConvertElementType(ctx->Input(0), accumulation_type);
     auto square = xla::Mul(t, t);
     auto reduce = xla::Reduce(square, XlaHelpers::Zero(b, accumulation_type),
                               *ctx->GetOrCreateAdd(accumulation_type), dims);
-    auto deconverted = XlaHelpers::ConvertElementType(b, reduce, dtype);
+    auto deconverted = XlaHelpers::ConvertElementType(reduce, dtype);
     auto two = XlaHelpers::IntegerLiteral(b, dtype, 2);
     ctx->SetOutput(0, xla::Div(deconverted, two));
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
index a11bbe918f7f8eb050aaa40d4344f9cc9e9a10a4..e46f4e72dc9cb245916b138d5365ee42371f0e4c 100644
--- a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
@@ -115,8 +115,8 @@ class ListDiffOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("ListDiff")
                     .TypeConstraint("T", kListDiffTypes)
-                    .CompileTimeConstInput("x")
-                    .CompileTimeConstInput("y"),
+                    .CompileTimeConstantInput("x")
+                    .CompileTimeConstantInput("y"),
                 ListDiffOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
index 87ee2d3aede50eb24e65570f106d49030e1d4236..987901d82b3f3798dd52f18ef2497b8f0cf80b11 100644
--- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
@@ -49,16 +49,14 @@ class LRNOp : public XlaOpKernel {
     // We use a window of depth_radius_ * 2 + 1, to account for the current
     // element and a depth_radius_ on either side.
     auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0));
-    auto converted =
-        XlaHelpers::ConvertElementType(builder, input, accumulation_type);
+    auto converted = XlaHelpers::ConvertElementType(input, accumulation_type);
     auto squared = xla::Mul(converted, converted);
     auto reduce = xla::ReduceWindow(
         squared, XlaHelpers::Zero(builder, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
-    auto sqr_sum =
-        XlaHelpers::ConvertElementType(builder, reduce, input_type(0));
+    auto sqr_sum = XlaHelpers::ConvertElementType(reduce, input_type(0));
 
     auto scale = xla::Pow(
         xla::Add(xla::ConstantR0<float>(builder, bias_),
@@ -138,15 +136,14 @@ class LRNGradOp : public XlaOpKernel {
 
     auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0));
     auto converted =
-        XlaHelpers::ConvertElementType(builder, in_image, accumulation_type);
+        XlaHelpers::ConvertElementType(in_image, accumulation_type);
     auto squared = xla::Mul(converted, converted);
     auto reduce = xla::ReduceWindow(
         squared, XlaHelpers::Zero(builder, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
-    auto sqr_sum =
-        XlaHelpers::ConvertElementType(builder, reduce, input_type(0));
+    auto sqr_sum = XlaHelpers::ConvertElementType(reduce, input_type(0));
 
     auto norm =
         xla::Add(xla::ConstantR0<float>(builder, bias_),
@@ -157,15 +154,13 @@ class LRNGradOp : public XlaOpKernel {
                  xla::Div(out_image, norm)),
         in_grads);
 
-    auto converted_dy =
-        XlaHelpers::ConvertElementType(builder, dy, accumulation_type);
+    auto converted_dy = XlaHelpers::ConvertElementType(dy, accumulation_type);
     auto dy_reduce = xla::ReduceWindow(
         converted_dy, XlaHelpers::Zero(builder, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
-    auto dy_reduced =
-        XlaHelpers::ConvertElementType(builder, dy_reduce, input_type(0));
+    auto dy_reduced = XlaHelpers::ConvertElementType(dy_reduce, input_type(0));
 
     xla::XlaOp gradients = xla::Add(
         xla::Mul(in_image, dy_reduced),
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
index 8dfd7de591c4a3c4768dd60b41e03d294ad49397..2dd0a710e47ec8cad6153402fdb3be59f5868205 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
@@ -61,11 +61,11 @@ class MatrixBandPartOp : public XlaOpKernel {
 
     // Compute 'offset', which is how many diagonals we are above/below the
     // diagonal.
-    xla::XlaOp iota_m = xla::Iota(builder, index_xla_type, m);
-    xla::XlaOp iota_n = xla::Iota(builder, index_xla_type, n);
+    xla::Shape iota_shape = xla::ShapeUtil::MakeShape(index_xla_type, {m, n});
+    xla::XlaOp iota_m = xla::Iota(builder, iota_shape, /*iota_dimension=*/0);
+    xla::XlaOp iota_n = xla::Iota(builder, iota_shape, /*iota_dimension=*/1);
 
-    auto offset = xla::Sub(xla::Broadcast(iota_n, {m}), iota_m,
-                           /*broadcast_dimensions=*/{0});
+    auto offset = xla::Sub(iota_n, iota_m);
 
     // If num_lower or num_upper are negative, include all lower/upper
     // diagonals.
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
index c0ca881ff82cee04e0c5e35f9a2d5732fabdd8a6..4f980b6d14ed667bdf4756ed740894098cae5919 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
index f4def11d08c31513aec5aad15187016a7294c2fd..90c0ebefb24ec2c4378782e9b15d3f57c33032a4 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 
 namespace tensorflow {
 namespace {
@@ -29,7 +29,7 @@ class MatrixTriangularSolveOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result = TriangularSolve(
+    auto result = xla::TriangularSolve(
         ctx->Input(0), ctx->Input(1), /*left_side=*/true,
         /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_);
     ctx->SetOutput(0, result);
diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
index 2a42eeaf76ab3aa88ff3a93ef7eb7ab217964bb6..f6b8534f4d7c537e5b708ee000e00cb92123584b 100644
--- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
@@ -41,10 +41,8 @@ class MirrorPadOp : public XlaOpKernel {
     for (int64 dimno = xla::ShapeUtil::Rank(original_shape) - 1; dimno >= 0;
          --dimno) {
       auto t_rev = xla::Rev(accum, {dimno});
-      TF_ASSIGN_OR_RETURN(int64 lhs_padding,
-                          pad_literal.GetIntegralAsS64({dimno, 0}));
-      TF_ASSIGN_OR_RETURN(int64 rhs_padding,
-                          pad_literal.GetIntegralAsS64({dimno, 1}));
+      int64 lhs_padding = pad_literal.Get<int64>({dimno, 0});
+      int64 rhs_padding = pad_literal.Get<int64>({dimno, 1});
       int64 dim_size = original_shape.dimensions(dimno);
 
       // Padding amounts on each side must be no more than the size of the
@@ -65,8 +63,8 @@ class MirrorPadOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
-    const TensorShape pad_shape = ctx->InputShape(1);
+    const TensorShape input_shape = ctx->InputShape("input");
+    const TensorShape pad_shape = ctx->InputShape("paddings");
 
     MirrorPadMode mode;
     OP_REQUIRES_OK(ctx, GetNodeAttr(def(), "mode", &mode));
@@ -81,23 +79,19 @@ class MirrorPadOp : public XlaOpKernel {
         TensorShapeUtils::IsMatrix(pad_shape) && pad_shape.dim_size(1) == 2,
         errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
                                 pad_shape.DebugString()));
-    const int fixed_dims =
-        (allow_legacy_scalars() && dims == 0 && pad_shape.dim_size(0) == 1)
-            ? 1
-            : dims;
     OP_REQUIRES(
-        ctx, fixed_dims == pad_shape.dim_size(0),
+        ctx, dims == pad_shape.dim_size(0),
         errors::InvalidArgument(
             "The first dimension of paddings must be the rank of inputs",
             pad_shape.DebugString(), " ", input_shape.DebugString()));
 
     // Evaluate the 'padding' constant input, reshaping to a matrix.
     xla::Literal pad_literal;
-    OP_REQUIRES_OK(
-        ctx, ctx->ConstantInputReshaped(1, {fixed_dims, 2}, &pad_literal));
+    OP_REQUIRES_OK(ctx,
+                   ctx->ConstantInputAsInt64Literal("paddings", &pad_literal));
 
     xla::XlaBuilder* b = ctx->builder();
-    auto in0 = ctx->Input(0);
+    auto in0 = ctx->Input("input");
     xla::StatusOr<xla::Shape> in0_shape = b->GetShape(in0);
     OP_REQUIRES(ctx, in0_shape.ok(), in0_shape.status());
     xla::StatusOr<xla::XlaOp> accum_status =
@@ -112,7 +106,7 @@ class MirrorPadOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(MirrorPadOp);
 };
 
-REGISTER_XLA_OP(Name("MirrorPad").CompileTimeConstInput("paddings"),
+REGISTER_XLA_OP(Name("MirrorPad").CompileTimeConstantInput("paddings"),
                 MirrorPadOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc
index cac2eea96eeed723b2a63bc9193070cad04b005d..aba54578d97c1e455f67efa2877ddc25dab68ac0 100644
--- a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc
@@ -76,7 +76,7 @@ class OneHotOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(OneHotOp);
 };
 
-REGISTER_XLA_OP(Name("OneHot").CompileTimeConstInput("depth"), OneHotOp);
+REGISTER_XLA_OP(Name("OneHot").CompileTimeConstantInput("depth"), OneHotOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
index e5937b56c17d01892928b073da09f38941ea1bbb..36ea70ac392ff18fb52d400efa886533f8335eba 100644
--- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
 namespace {
@@ -29,40 +30,36 @@ class PadOp : public XlaOpKernel {
   explicit PadOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
-    const TensorShape pad_shape = ctx->InputShape(1);
+    const TensorShape input_shape = ctx->InputShape("input");
+    const TensorShape pad_shape = ctx->InputShape("paddings");
     const int dims = input_shape.dims();
     OP_REQUIRES(
         ctx,
         TensorShapeUtils::IsMatrix(pad_shape) && pad_shape.dim_size(1) == 2,
         errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
                                 pad_shape.DebugString()));
-    const int fixed_dims =
-        (allow_legacy_scalars() && dims == 0 && pad_shape.dim_size(0) == 1)
-            ? 1
-            : dims;
     OP_REQUIRES(
-        ctx, fixed_dims == pad_shape.dim_size(0),
+        ctx, dims == pad_shape.dim_size(0),
         errors::InvalidArgument(
             "The first dimension of paddings must be the rank of inputs",
             pad_shape.DebugString(), " ", input_shape.DebugString()));
 
-    if (fixed_dims == 0) {
+    xla::XlaOp input = ctx->Input("input");
+    if (dims == 0) {
       // Tensor is rank 0. Return it unchanged.
-      ctx->SetOutput(0, ctx->Input(0));
+      ctx->SetOutput(0, input);
       return;
     }
 
-    // Evaluate the 'padding' constant input, reshaping to a matrix.
     xla::Literal pad_literal;
-    OP_REQUIRES_OK(
-        ctx, ctx->ConstantInputReshaped(1, {fixed_dims, 2}, &pad_literal));
+    OP_REQUIRES_OK(ctx,
+                   ctx->ConstantInputAsInt64Literal("paddings", &pad_literal));
 
     xla::PaddingConfig config;
-    for (int i = 0; i < fixed_dims; ++i) {
+    for (int i = 0; i < dims; ++i) {
       auto* dim = config.add_dimensions();
-      int before = pad_literal.Get<int32>({i, 0});
-      int after = pad_literal.Get<int32>({i, 1});
+      int before = pad_literal.Get<int64>({i, 0});
+      int after = pad_literal.Get<int64>({i, 1});
       OP_REQUIRES(ctx, before >= 0 && after >= 0,
                   errors::InvalidArgument(
                       "Paddings must be non-negative: ", before, " ", after));
@@ -73,18 +70,19 @@ class PadOp : public XlaOpKernel {
     // PadV2 added a "constant_values" input that indicates the pad value.
     xla::XlaOp constant_values;
     if (ctx->num_inputs() == 3) {
-      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(2)),
-                  errors::InvalidArgument("constant_values must be a scalar."));
-      ctx->SetOutput(0, xla::Pad(ctx->Input(0), ctx->Input(2), config));
+      OP_REQUIRES(
+          ctx, TensorShapeUtils::IsScalar(ctx->InputShape("constant_values")),
+          errors::InvalidArgument("constant_values must be a scalar."));
+      ctx->SetOutput(0, xla::Pad(input, ctx->Input("constant_values"), config));
     } else {
       auto zero = XlaHelpers::Zero(ctx->builder(), input_type(0));
-      ctx->SetOutput(0, xla::Pad(ctx->Input(0), zero, config));
+      ctx->SetOutput(0, xla::Pad(input, zero, config));
     }
   }
 };
 
-REGISTER_XLA_OP(Name("Pad").CompileTimeConstInput("paddings"), PadOp);
-REGISTER_XLA_OP(Name("PadV2").CompileTimeConstInput("paddings"), PadOp);
+REGISTER_XLA_OP(Name("Pad").CompileTimeConstantInput("paddings"), PadOp);
+REGISTER_XLA_OP(Name("PadV2").CompileTimeConstantInput("paddings"), PadOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/permute_op.cc b/tensorflow/compiler/tf2xla/kernels/permute_op.cc
index 0764e5503db583351e92a144b2c361e8875161d3..71920bf5c1e6aa5981aafa8b611cc01c0917e02b 100644
--- a/tensorflow/compiler/tf2xla/kernels/permute_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/permute_op.cc
@@ -75,10 +75,9 @@ class DataFormatVecPermuteOp : public XlaOpKernel {
     }
     auto keys = xla::ConstantR1(builder, absl::Span<const int32>(dst_indices));
     if (input_rank == 2) {
-      keys = xla::BroadcastInDim(
-          keys, xla::ShapeUtil::MakeShape(xla::S32, {4, 2}), {0});
+      keys = xla::BroadcastInDim(keys, {4, 2}, {0});
     }
-    auto sorted = xla::Sort(keys, ctx->Input(0), 0);
+    auto sorted = xla::Sort(keys, {ctx->Input(0)}, 0);
     auto output = xla::GetTupleElement(sorted, 1);
     ctx->SetOutput(0, output);
   }
@@ -90,9 +89,9 @@ class DataFormatVecPermuteOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(DataFormatVecPermuteOp);
 };
 
-// TODO(b/115384656): Support DT_INT64.
-REGISTER_XLA_OP(Name("DataFormatVecPermute").TypeConstraint("T", DT_INT32),
-                DataFormatVecPermuteOp);
+REGISTER_XLA_OP(
+    Name("DataFormatVecPermute").TypeConstraint("T", {DT_INT32, DT_INT64}),
+    DataFormatVecPermuteOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 27690c156e4da129ad139f3880bba3a208b5606d..06c6cc37ec90192486ba15010bfeb763a9ffb987 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -152,7 +152,12 @@ class MaxPoolOp : public PoolingOp {
  public:
   MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
-                  /*reduction_type=*/ctx->input_type(0)) {}
+                  /*reduction_type=*/ctx->input_type(0)) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     auto ksize_or_error = GetKernelSize(ctx);
@@ -180,16 +185,12 @@ class MaxPool2DOp : public MaxPoolOp {
  public:
   explicit MaxPool2DOp(OpKernelConstruction* ctx)
       : MaxPoolOp(ctx, /*num_spatial_dims=*/2) {
-    string data_format_str;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
   }
 };
 REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp);
 REGISTER_XLA_OP(Name("MaxPoolV2")
-                    .CompileTimeConstInput("ksize")
-                    .CompileTimeConstInput("strides"),
+                    .CompileTimeConstantInput("ksize")
+                    .CompileTimeConstantInput("strides"),
                 MaxPool2DOp);
 
 class MaxPool3DOp : public MaxPoolOp {
@@ -204,7 +205,12 @@ class AvgPoolOp : public PoolingOp {
   AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
                   /*reduction_type=*/
-                  XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
+                  XlaHelpers::SumAccumulationType(ctx->input_type(0))) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     auto ksize_or_error = GetKernelSize(ctx);
@@ -241,10 +247,6 @@ class AvgPool2DOp : public AvgPoolOp {
  public:
   explicit AvgPool2DOp(OpKernelConstruction* ctx)
       : AvgPoolOp(ctx, /*num_spatial_dims=*/2) {
-    string data_format_str;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
   }
 };
 REGISTER_XLA_OP(Name("AvgPool"), AvgPool2DOp);
@@ -360,8 +362,8 @@ class MaxPool2DGradOp : public MaxPoolGradOp {
 };
 REGISTER_XLA_OP(Name("MaxPoolGrad"), MaxPool2DGradOp);
 REGISTER_XLA_OP(Name("MaxPoolGradV2")
-                    .CompileTimeConstInput("ksize")
-                    .CompileTimeConstInput("strides"),
+                    .CompileTimeConstantInput("ksize")
+                    .CompileTimeConstantInput("strides"),
                 MaxPool2DGradOp);
 
 class MaxPool3DGradOp : public MaxPoolGradOp {
@@ -390,6 +392,11 @@ class AvgPoolGradOp : public XlaOpKernel {
     OP_REQUIRES(ctx, ksize_[0] == 1 && stride_[0] == 1,
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
+
+    string data_format;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
+    OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
   }
 
   int num_dims() const { return num_spatial_dims_ + 2; }
@@ -449,22 +456,20 @@ class AvgPool2DGradOp : public AvgPoolGradOp {
  public:
   explicit AvgPool2DGradOp(OpKernelConstruction* ctx)
       : AvgPoolGradOp(ctx, /*num_spatial_dims=*/2) {
-    string data_format;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
-    OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
   }
 };
-REGISTER_XLA_OP(Name("AvgPoolGrad").CompileTimeConstInput("orig_input_shape"),
-                AvgPool2DGradOp);
+REGISTER_XLA_OP(
+    Name("AvgPoolGrad").CompileTimeConstantInput("orig_input_shape"),
+    AvgPool2DGradOp);
 
 class AvgPool3DGradOp : public AvgPoolGradOp {
  public:
   explicit AvgPool3DGradOp(OpKernelConstruction* ctx)
       : AvgPoolGradOp(ctx, /*num_spatial_dims=*/3) {}
 };
-REGISTER_XLA_OP(Name("AvgPool3DGrad").CompileTimeConstInput("orig_input_shape"),
-                AvgPool3DGradOp);
+REGISTER_XLA_OP(
+    Name("AvgPool3DGrad").CompileTimeConstantInput("orig_input_shape"),
+    AvgPool3DGradOp);
 
 class MaxPoolGradGradOp : public XlaOpKernel {
  public:
@@ -632,8 +637,8 @@ REGISTER_XLA_OP(Name("MaxPoolGradGrad").TypeConstraint("T", DT_FLOAT),
                 MaxPool2DGradGradOp);
 REGISTER_XLA_OP(Name("MaxPoolGradGradV2")
                     .TypeConstraint("T", DT_FLOAT)
-                    .CompileTimeConstInput("ksize")
-                    .CompileTimeConstInput("strides"),
+                    .CompileTimeConstantInput("ksize")
+                    .CompileTimeConstantInput("strides"),
                 MaxPool2DGradGradOp);
 
 class MaxPool3DGradGradOp : public MaxPoolGradGradOp {
diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
index 6f4ed496a1774dde68dd9d5fbd37995d615b678c..7fe102428db1cc5ce16037f56fa301d1941da8e3 100644
--- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/core/platform/macros.h"
@@ -26,12 +27,26 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+enum QuantizerRoundMode {
+  // Round half up: if the fraction of y is exactly 0.5, then
+  // round(y) = y + 0.5
+  // E.g., -5.5 gets rounded to -5, -5.4 goes to -5,
+  // 5.4 goes to 5, and 5.5 goes to 6.
+  ROUND_HALF_UP,
+  // Round half to even: if the fraction of y is exactly 0.5, then round(y) is
+  // the nearest even integer to y.
+  // E.g., 23.5 gets rounded to 24, 24.5 gets rounded to 24, while -23.5 becomes
+  // -24, and -24.5 gets rounded to 24.
+  ROUND_HALF_TO_EVEN,
+};
+
 class QuantizeAndDequantizeOp : public XlaOpKernel {
  public:
   explicit QuantizeAndDequantizeOp(OpKernelConstruction* ctx)
       : XlaOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("signed_input", &signed_input_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_));
+    round_mode_ = ROUND_HALF_TO_EVEN;
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -117,8 +132,17 @@ class QuantizeAndDequantizeOp : public XlaOpKernel {
       // in that case they were measured from the tensor.
       input = Clamp(min_range, input, max_range);
     }
-    xla::XlaOp result =
-        Floor((input - min_range) * scale + half) * inverse_scale + min_range;
+    xla::XlaOp result;
+    switch (round_mode_) {
+      case ROUND_HALF_TO_EVEN: {
+        result = xla::RoundToEven(input * scale) * inverse_scale;
+        break;
+      }
+      case ROUND_HALF_UP: {
+        result = Floor(input * scale + half) * inverse_scale;
+        break;
+      }
+    }
     ctx->SetOutput(0, result);
   }
 
@@ -126,6 +150,7 @@ class QuantizeAndDequantizeOp : public XlaOpKernel {
   int64 num_bits_ = -1;
   bool signed_input_;
   bool range_given_;
+  QuantizerRoundMode round_mode_;
 };
 
 class QuantizeAndDequantizeV2Op : public QuantizeAndDequantizeOp {
@@ -136,6 +161,20 @@ class QuantizeAndDequantizeV2Op : public QuantizeAndDequantizeOp {
     OP_REQUIRES(ctx, num_bits_ > 0 && num_bits_ < (signed_input_ ? 62 : 63),
                 errors::InvalidArgument("num_bits is out of range: ", num_bits_,
                                         " with signed_input_ ", signed_input_));
+    string round_mode_string;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string));
+    OP_REQUIRES(
+        ctx,
+        (round_mode_string == "HALF_UP" || round_mode_string == "HALF_TO_EVEN"),
+        errors::InvalidArgument("Round mode string must be "
+                                "'HALF_UP' or "
+                                "'HALF_TO_EVEN', is '" +
+                                round_mode_string + "'"));
+    if (round_mode_string == "HALF_UP") {
+      round_mode_ = ROUND_HALF_UP;
+    } else if (round_mode_string == "HALF_TO_EVEN") {
+      round_mode_ = ROUND_HALF_TO_EVEN;
+    }
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index afd5986846705f66eb4c7ced9dbe2f4757f5af7f..8822e29f7e77b1cbc6fa6ca61d0062d9b1b0c36e 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -58,7 +57,7 @@ class RandomUniformOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RandomUniformOp);
 };
 
-REGISTER_XLA_OP(Name("RandomUniform").CompileTimeConstInput("shape"),
+REGISTER_XLA_OP(Name("RandomUniform").CompileTimeConstantInput("shape"),
                 RandomUniformOp);
 
 class RandomShuffleOp : public XlaOpKernel {
@@ -135,7 +134,7 @@ class RandomShuffleOp : public XlaOpKernel {
       xla::XlaOp curr = input;
       for (int i = 0; i < rounds; ++i) {
         xla::XlaOp keys = xla::RngUniform(zero, max_value, key_shape);
-        xla::XlaOp sorted = xla::Sort(keys, curr);
+        xla::XlaOp sorted = xla::Sort(keys, {curr});
         curr = xla::GetTupleElement(sorted, 1);
       }
 
@@ -227,7 +226,7 @@ class RandomUniformIntOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RandomUniformIntOp);
 };
 
-REGISTER_XLA_OP(Name("RandomUniformInt").CompileTimeConstInput("shape"),
+REGISTER_XLA_OP(Name("RandomUniformInt").CompileTimeConstantInput("shape"),
                 RandomUniformIntOp);
 
 class RandomStandardNormalOp : public XlaOpKernel {
@@ -256,7 +255,7 @@ class RandomStandardNormalOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RandomStandardNormalOp);
 };
 
-REGISTER_XLA_OP(Name("RandomStandardNormal").CompileTimeConstInput("shape"),
+REGISTER_XLA_OP(Name("RandomStandardNormal").CompileTimeConstantInput("shape"),
                 RandomStandardNormalOp);
 
 class TruncatedNormalOp : public XlaOpKernel {
@@ -282,7 +281,7 @@ class TruncatedNormalOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("TruncatedNormal")
-                    .CompileTimeConstInput("shape")
+                    .CompileTimeConstantInput("shape")
                     .TypeConstraint("dtype", DT_FLOAT),
                 TruncatedNormalOp);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
index 8102faad28db71075fb8da269c55edbdb667193e..dacdbc88e005304bc64ea35c8985711afca41eae 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
@@ -40,10 +40,16 @@ class ReduceWindowOp : public XlaOpKernel {
 
     std::vector<int64> window_dimensions;
     std::vector<int64> window_strides;
+    std::vector<int64> base_dilations;
+    std::vector<int64> window_dilations;
     OP_REQUIRES_OK(context, context->ConstantInputAsIntVector(
                                 "window_dimensions", &window_dimensions));
     OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("window_strides",
                                                               &window_strides));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector("base_dilations",
+                                                              &base_dilations));
+    OP_REQUIRES_OK(context, context->ConstantInputAsIntVector(
+                                "window_dilations", &window_dilations));
 
     const int rank = input_shape.dims();
     OP_REQUIRES(context, rank == window_dimensions.size(),
@@ -56,6 +62,16 @@ class ReduceWindowOp : public XlaOpKernel {
                     "The size of window_strides must be equal to the input "
                     "rank (",
                     window_strides.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == base_dilations.size(),
+                errors::InvalidArgument(
+                    "The size of base_dilations must be equal to the input "
+                    "rank (",
+                    base_dilations.size(), " vs. ", rank, ")"));
+    OP_REQUIRES(context, rank == window_dilations.size(),
+                errors::InvalidArgument(
+                    "The size of window_dilations must be equal to the input "
+                    "rank (",
+                    window_dilations.size(), " vs. ", rank, ")"));
 
     // Build the reducer function.
     XlaCompiler::Argument reducer_arg;
@@ -102,7 +118,8 @@ class ReduceWindowOp : public XlaOpKernel {
 
     xla::XlaOp output = xla::ReduceWindowWithGeneralPadding(
         context->Input(0), context->Input(1), *reducer.computation,
-        window_dimensions, window_strides, padding);
+        window_dimensions, window_strides, base_dilations, window_dilations,
+        padding);
     context->SetOutput(0, output);
   }
 
@@ -113,9 +130,11 @@ class ReduceWindowOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("XlaReduceWindow")
-                    .CompileTimeConstInput("window_dimensions")
-                    .CompileTimeConstInput("window_strides")
-                    .CompileTimeConstInput("padding"),
+                    .CompileTimeConstantInput("window_dimensions")
+                    .CompileTimeConstantInput("window_strides")
+                    .CompileTimeConstantInput("base_dilations")
+                    .CompileTimeConstantInput("window_dilations")
+                    .CompileTimeConstantInput("padding"),
                 ReduceWindowOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
index 0d260fa8fcaa513d7854c1e9215952404d555c70..65e158d64fdd7df62d50b81c9e488b2d03476fb7 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
@@ -41,7 +41,8 @@ class SumOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Sum").CompileTimeConstInput("reduction_indices"), SumOp);
+REGISTER_XLA_OP(Name("Sum").CompileTimeConstantInput("reduction_indices"),
+                SumOp);
 
 class ProdOp : public XlaReductionOp {
  public:
@@ -59,7 +60,7 @@ class ProdOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Prod").CompileTimeConstInput("reduction_indices"),
+REGISTER_XLA_OP(Name("Prod").CompileTimeConstantInput("reduction_indices"),
                 ProdOp);
 
 class MinOp : public XlaReductionOp {
@@ -77,7 +78,8 @@ class MinOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Min").CompileTimeConstInput("reduction_indices"), MinOp);
+REGISTER_XLA_OP(Name("Min").CompileTimeConstantInput("reduction_indices"),
+                MinOp);
 
 class MaxOp : public XlaReductionOp {
  public:
@@ -94,7 +96,8 @@ class MaxOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Max").CompileTimeConstInput("reduction_indices"), MaxOp);
+REGISTER_XLA_OP(Name("Max").CompileTimeConstantInput("reduction_indices"),
+                MaxOp);
 
 class MeanOp : public XlaReductionOp {
  public:
@@ -110,16 +113,25 @@ class MeanOp : public XlaReductionOp {
     xla::Add(scalar_lhs, scalar_rhs);
   }
 
-  xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder,
-                            const xla::XlaOp& reduce_output,
-                            int64 num_elements_reduced) override {
-    auto divisor = XlaHelpers::IntegerLiteral(builder, input_type(0),
-                                              num_elements_reduced);
-    return reduce_output / divisor;
+  xla::XlaOp BuildFinalizer(
+      xla::XlaBuilder* /*builder*/, const xla::XlaOp& input,
+      const xla::XlaOp& reduce_output,
+      const std::vector<int64>& dimensions_to_reduce) override {
+    if (dimensions_to_reduce.empty()) {
+      return reduce_output;
+    }
+    auto divisor = xla::GetDimensionSize(input, dimensions_to_reduce[0]);
+    for (int i = 1; i < dimensions_to_reduce.size(); i++) {
+      auto size = xla::GetDimensionSize(input, dimensions_to_reduce[i]);
+      divisor = xla::Mul(divisor, size);
+    }
+    divisor = xla::ConvertElementType(divisor, xla_reduction_type_);
+    return XlaHelpers::ConvertElementType(reduce_output / divisor,
+                                          input_type(0));
   }
 };
 
-REGISTER_XLA_OP(Name("Mean").CompileTimeConstInput("reduction_indices"),
+REGISTER_XLA_OP(Name("Mean").CompileTimeConstantInput("reduction_indices"),
                 MeanOp);
 
 class AllOp : public XlaReductionOp {
@@ -137,7 +149,8 @@ class AllOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("All").CompileTimeConstInput("reduction_indices"), AllOp);
+REGISTER_XLA_OP(Name("All").CompileTimeConstantInput("reduction_indices"),
+                AllOp);
 
 class AnyOp : public XlaReductionOp {
  public:
@@ -154,7 +167,8 @@ class AnyOp : public XlaReductionOp {
   }
 };
 
-REGISTER_XLA_OP(Name("Any").CompileTimeConstInput("reduction_indices"), AnyOp);
+REGISTER_XLA_OP(Name("Any").CompileTimeConstantInput("reduction_indices"),
+                AnyOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
index 466e79828d111ee7cadcf713703e8f252c63e62c..af716eab79886791e8507a84984b7ca60865d00e 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
@@ -48,13 +48,14 @@ class XlaReductionOp : public XlaOpKernel {
                             const xla::XlaOp& scalar_rhs) = 0;
 
   // Applies a transformation to the output of the reduction. The desired
-  // computation should be added to 'builder'. Argument 'reduce_output' is the
-  // output of the reduction. 'num_elements_reduced' is the number of elements
-  // that contributed to the reduction. Returns the transformed reduction
-  // output, Defaults to returning 'reduce_output' unchanged.
-  virtual xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder,
-                                    const xla::XlaOp& reduce_output,
-                                    int64 num_elements_reduced);
+  // computation should be added to 'builder'. Argument 'input' is the original
+  // input of the reduction; 'reduce_output' is the output of the reduction.
+  // Returns the transformed reduction output. Defaults to returning
+  // 'reduce_output' converted to the input type.
+  virtual xla::XlaOp BuildFinalizer(
+      xla::XlaBuilder* builder, const xla::XlaOp& input,
+      const xla::XlaOp& reduce_output,
+      const std::vector<int64>& dimensions_to_reduce);
 
   void Compile(XlaOpKernelContext* ctx) override;
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index 118f2798d559f43acb7f6394a7337426164325ef..2ca2a85244b4edfe75db3d4fff6c2058adc2bf71 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -35,12 +35,13 @@ XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx,
       ctx, DataTypeToPrimitiveType(reduction_type_, &xla_reduction_type_));
 }
 
-// Unless BuildFinalizer is overridden the reduction has no
-// finalizer.
-xla::XlaOp XlaReductionOp::BuildFinalizer(xla::XlaBuilder* builder,
-                                          const xla::XlaOp& reduce_output,
-                                          int64 num_elements_reduced) {
-  return reduce_output;
+// The default finalizer converts the results back into the input type. This can
+// be overridden.
+xla::XlaOp XlaReductionOp::BuildFinalizer(
+    xla::XlaBuilder* /*builder*/, const xla::XlaOp& /*input*/,
+    const xla::XlaOp& reduce_output,
+    const std::vector<int64>& /*dimensions_to_reduce*/) {
+  return XlaHelpers::ConvertElementType(reduce_output, input_type(0));
 }
 
 void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
@@ -71,7 +72,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
 
   absl::InlinedVector<bool, 4> bitmap(data_shape.dims(), false);
   std::vector<int64> xla_axes;
-  int64 num_elements_reduced = 1LL;
   for (int64 i = 0; i < axes_tensor_shape.num_elements(); ++i) {
     int64 index = axes[i];
     OP_REQUIRES(ctx,
@@ -82,7 +82,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
     index = (index + data_shape.dims()) % data_shape.dims();
     bitmap[index] = true;
     xla_axes.push_back(index);
-    num_elements_reduced *= data_shape.dim_size(index);
   }
 
   std::vector<int64> final_shape;
@@ -118,8 +117,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
   xla::XlaComputation reduction_computation = r.Build().ConsumeValueOrDie();
 
   auto reduce = xla::Reduce(data, initial, reduction_computation, xla_axes);
-  auto deconverted = XlaHelpers::ConvertElementType(b, reduce, input_type(0));
-  auto finalized = BuildFinalizer(b, deconverted, num_elements_reduced);
+  auto finalized = BuildFinalizer(b, data, reduce, xla_axes);
   auto result = keep_dims_ ? xla::Reshape(finalized, final_shape) : finalized;
   ctx->SetOutput(0, result);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index d35777ccb1271ec6a7c9972c714d06b2415d9c34..a8e230ba107ce8a73f3e80f0e55fa27eea31338f 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -15,14 +15,12 @@ limitations under the License.
 
 // Native XLA implementations of XLA Relu Ops
 
-#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/core/framework/kernel_def_builder.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/no_op.h"
 
 namespace tensorflow {
 namespace {
@@ -37,6 +35,7 @@ class ReluOp : public XlaOpKernel {
     ctx->SetOutput(0, xla::Max(zero, ctx->Input(0)));
   }
 };
+REGISTER_XLA_OP(Name("Relu"), ReluOp);
 
 class Relu6Op : public XlaOpKernel {
  public:
@@ -49,6 +48,22 @@ class Relu6Op : public XlaOpKernel {
     ctx->SetOutput(0, xla::Clamp(zero, ctx->Input(0), six));
   }
 };
+REGISTER_XLA_OP(Name("Relu6"), Relu6Op);
+
+class LeakyReluOp : public XlaOpKernel {
+ public:
+  explicit LeakyReluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", &alpha_));
+  }
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto features = ctx->Input("features");
+    auto output =
+        xla::Max(features, features * xla::ScalarLike(features, alpha_));
+    ctx->SetOutput(0, output);
+  }
+  float alpha_;
+};
+REGISTER_XLA_OP(Name("LeakyRelu"), LeakyReluOp);
 
 class ReluGradOp : public XlaOpKernel {
  public:
@@ -64,6 +79,7 @@ class ReluGradOp : public XlaOpKernel {
     ctx->SetOutput(0, xla::Select(pred, ctx->Input(0), zero));
   }
 };
+REGISTER_XLA_OP(Name("ReluGrad"), ReluGradOp);
 
 class Relu6GradOp : public XlaOpKernel {
  public:
@@ -83,11 +99,24 @@ class Relu6GradOp : public XlaOpKernel {
     ctx->SetOutput(0, out);
   }
 };
-
-REGISTER_XLA_OP(Name("Relu"), ReluOp);
-REGISTER_XLA_OP(Name("Relu6"), Relu6Op);
-REGISTER_XLA_OP(Name("ReluGrad"), ReluGradOp);
 REGISTER_XLA_OP(Name("Relu6Grad"), Relu6GradOp);
 
+class LeakyReluGradOp : public XlaOpKernel {
+ public:
+  explicit LeakyReluGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", &alpha_));
+  }
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto gradients = ctx->Input("gradients");
+    auto features = ctx->Input("features");
+    auto output =
+        xla::Select(xla::Gt(features, xla::ScalarLike(features, 0)), gradients,
+                    gradients * xla::ScalarLike(gradients, alpha_));
+    ctx->SetOutput(0, output);
+  }
+  float alpha_;
+};
+REGISTER_XLA_OP(Name("LeakyReluGrad"), LeakyReluGradOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54d34a38abc4948a1a08197d72e3e7f763649093
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
@@ -0,0 +1,576 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+using xla::XlaOp;
+
+// Calculates the bilinear weight tensor, given basis ratio (px, py) of the
+// sampling position:
+//    W = [(1-px)*(1-py), px*(1-py), (1-px)*py, px*py]
+// 'ratio' tensor has dimensions [batch, dim_0, ...dim_n, 2].
+//
+// The returned tensor has dimensions [batch, dim_0, ... dim_n, 4].
+XlaOp BilinearWeights(XlaOpKernelContext* ctx, XlaOp ratio,
+                      const TensorShape warp_shape,
+                      xla::PrimitiveType xla_type) {
+  auto first_term = xla::ConstantR2<float>(
+      ctx->builder(), {{1.0, 1.0}, {0.0, 1.0}, {1.0, 0.0}, {0.0, 0.0}});
+  first_term = xla::ConvertElementType(first_term, xla_type);
+
+  auto warp_dims = warp_shape.dim_sizes();
+  std::vector<int64> broadcast_dims(warp_dims.begin(), warp_dims.end() - 1);
+  broadcast_dims.push_back(4);
+  broadcast_dims.push_back(2);
+
+  const int64 broadcast_dims_size = broadcast_dims.size();
+
+  std::vector<int64> last_two_dims_indices = {(broadcast_dims_size - 2),
+                                              (broadcast_dims_size - 1)};
+
+  auto broadcast_first_term =
+      xla::BroadcastInDim(first_term, broadcast_dims, last_two_dims_indices);
+
+  // Ratio is of the same dimension as warp, which is [batch, dim_0,... dim_n,
+  // 2], we broadcast ratio tensor to 'broadcast_dim' by keeping the
+  // [batch, dim_0,...dim_n] dimensions and the [2] dimension as the last
+  // dimension.
+  std::vector<int64> ratio_broadcast_indices(broadcast_dims.size());
+  std::iota(ratio_broadcast_indices.begin(), ratio_broadcast_indices.end(), 0);
+  ratio_broadcast_indices.erase(ratio_broadcast_indices.end() - 2);
+
+  auto broadcast_ratio =
+      xla::BroadcastInDim(ratio, broadcast_dims, ratio_broadcast_indices);
+
+  auto first_term_subtract_weights = broadcast_first_term - broadcast_ratio;
+
+  // Now we have [(1-px, 1-py), (-px, 1-py), (1-px, -py), (px, py)], need to
+  // flip the signs of the second and the third term.
+  auto sign_change = xla::ConstantR2<float>(
+      ctx->builder(), {{1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {1.0, 1.0}});
+  sign_change = xla::ConvertElementType(sign_change, xla_type);
+
+  auto broadcast_sign_change =
+      xla::BroadcastInDim(sign_change, broadcast_dims, last_two_dims_indices);
+
+  auto flipped = first_term_subtract_weights * broadcast_sign_change;
+
+  // Build up the final bilinear weight tensor by multiply reduction, which
+  // gives:
+  //    [(1-px)*(1-py), px*(1-py), (1-px)*py, px*py]
+  // for each 4 neighboring pixels where px and py are the weight of the target
+  // pixel we are sampling from.
+  return xla::Reduce(
+      flipped, xla::One(ctx->builder(), xla_type),
+      xla::CreateScalarMultiplyComputation(xla_type, ctx->builder()),
+      {broadcast_dims_size - 1});
+}
+
+// Concatenates the batch indices to the (x, y) coordinate indices.
+// This is done by first creating an Iota tensor that represents the current
+// batch it is in, then concatenate with the givin (coordinate) indices.
+//
+// The resulting tensor has dimension (batch, dim_0, ... dim_n, 3) where
+// the last dimension of size 3 in turn is [batch_number, x, y].
+// The [batch_number, x, y] dimension is needed because the indices
+// [x,y] alone cannot allow the xla::Gather operation to gather from the input
+// data, which is of dimension [batch, height(y), width(x), channel] with
+// 'batch' being the first dimension.
+XlaOp ConcatenateIota(xla::XlaBuilder* b, XlaOp indices,
+                      const TensorShape& warp_shape) {
+  // We need to create an iota tensor with the same batch dimension.
+  std::vector<int64> dimensions;
+  for (auto dim : warp_shape) {
+    dimensions.push_back(dim.size);
+  }
+  // Except the last dimension, which is of size 1.
+  dimensions.back() = 1;
+
+  auto batch_indices =
+      xla::Iota(b, xla::ShapeUtil::MakeShape(xla::U32, dimensions),
+                /*iota_dimension=*/0);
+
+  return xla::ConcatInDim(b, {batch_indices, indices}, dimensions.size() - 1);
+}
+
+// Gathers the 2x2 neighbors of the input starting_indices, and return a
+// tensor of dimension [batch, dim_0, ... dim_n, 4, data_channels].
+// 'gather_indices' is of dimension [batch, dim_0, ..., dim_n, 3] where the last
+// dimension of size 3 is (batch_no, x, y).
+XlaOp Gather2by2Neighbors(xla::XlaBuilder* b, XlaOp data, XlaOp gather_indices,
+                          int64 data_channels, int warp_dims) {
+  xla::GatherDimensionNumbers gather_dim_numbers;
+  const int64 neighbor_data_dimensions = warp_dims + 2;
+  // Since the Gather output dimensions are [batch, dim_0, ... dim_n, 2, 2,
+  // data_channels], the offset dimensions for Gather is the last 3 dimensions.
+  gather_dim_numbers.add_offset_dims(neighbor_data_dimensions - 3);
+  gather_dim_numbers.add_offset_dims(neighbor_data_dimensions - 2);
+  gather_dim_numbers.add_offset_dims(neighbor_data_dimensions - 1);
+  // The last dimension of 'gather_indices' is the starting indices for gather.
+  gather_dim_numbers.set_index_vector_dim(warp_dims - 1);
+  gather_dim_numbers.add_collapsed_slice_dims(0);
+  gather_dim_numbers.add_start_index_map(0);
+  // Since input is of dimension [batch, height(y), width(x), channel], and warp
+  // is of dimension [batch, x, y], the ordering of x, y here needs to be
+  // swapped when gathering.
+  gather_dim_numbers.add_start_index_map(2);
+  gather_dim_numbers.add_start_index_map(1);
+  // Data dimensions are [batch, x, y, channel].
+  // Output dimensions are [batch, dim_0, ... dim_n, 2, 2, data_channels].
+  auto neighbors_data = xla::Gather(data, gather_indices, gather_dim_numbers,
+                                    /*slice_sizes=*/{1, 2, 2, data_channels});
+  // Collapse the ...,2,2,... dimensions into ...,4,...
+  return xla::Collapse(neighbors_data, {warp_dims - 1, warp_dims});
+}
+
+// Scatter 'updates' tensor to 'grad_data' based on 'indices'. Returns the
+// resulting tensor of dimension: [batch, dim_0, ...dim_n, 2, 2, data_channels].
+// This function can also be seen as the inverse of 'Gather2by2Neighbors'.
+XlaOp ScatterToGradData(XlaOpKernelContext* ctx, XlaOp grad_data, XlaOp indices,
+                        XlaOp updates, int64 warp_dims,
+                        xla::PrimitiveType xla_type) {
+  xla::ScatterDimensionNumbers scatter_dim_numbers;
+  const int64 neighbor_data_dimensions = warp_dims + 2;
+  // Since the Scatter output dimensions are [batch, dim_0, ... dim_n, 2, 2,
+  // data_channels], the update window dimensions is the last 3 dimensions.
+  scatter_dim_numbers.add_update_window_dims(neighbor_data_dimensions - 3);
+  scatter_dim_numbers.add_update_window_dims(neighbor_data_dimensions - 2);
+  scatter_dim_numbers.add_update_window_dims(neighbor_data_dimensions - 1);
+  scatter_dim_numbers.set_index_vector_dim(warp_dims - 1);
+
+  scatter_dim_numbers.add_inserted_window_dims(0);
+  scatter_dim_numbers.add_scatter_dims_to_operand_dims(0);
+  // Since input is of dimension [batch, height(y), width(x), channel], and warp
+  // is of dimension [batch, x, y], the ordering of x, y here needs to be
+  // swapped when scattering.
+  scatter_dim_numbers.add_scatter_dims_to_operand_dims(2);
+  scatter_dim_numbers.add_scatter_dims_to_operand_dims(1);
+
+  return xla::Scatter(grad_data, indices, updates,
+                      xla::CreateScalarAddComputation(xla_type, ctx->builder()),
+                      scatter_dim_numbers);
+}
+
+// Build computation the backprop into input 'data'.
+// Where input:
+// grad_output is of dimension [batch, dim_0, ...dim_n, channel]
+// ratio is of dimension [batch, dim_0, ...dim_n, 2]
+// gather_indices is of dimension [batch, dim_0, ...dim_n, 3]
+//
+// Output:
+// scatter-add to each 2x2 grad_data neighbor:
+//  grad_data[fx, fy, chan] += output_grad * dx * dy
+//  grad_data[cx, fy, chan] += output_grad * (1 - dx) * dy
+//  grad_data[fx, cy, chan] += output_grad * dx * (1 - dy)
+//  grad_data[cx, cy, chan] += output_grad * (1 - dx) * (1 - dy)
+// where (dx, dy) is (1 - ratio).
+XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
+                        XlaOp gather_indices, xla::PrimitiveType warp_type,
+                        TensorShape warp_shape, int64 data_channels,
+                        xla::Shape data_shape) {
+  // Weights tensor has dimension [batch, dim_0, ... dim_n, 4].
+  auto weights = BilinearWeights(ctx, ratio, warp_shape, warp_type);
+
+  auto warp_dims = warp_shape.dim_sizes();
+  std::vector<int64> warp_dims_without_last_dims(warp_dims.begin(),
+                                                 warp_dims.end() - 1);
+
+  std::vector<int64> reshaped_weights_dims = warp_dims_without_last_dims;
+  // Reshape the last dimension of size 4 to two dimensions [2, 2].
+  reshaped_weights_dims.push_back(2);
+  reshaped_weights_dims.push_back(2);
+  std::vector<int64> reshape_dims(warp_shape.dims());
+  std::iota(reshape_dims.begin(), reshape_dims.end(), 0);
+  // The dimension is [batch, dim_0,..., dim_n, 2, 2].
+  auto reshaped_weights = xla::Reshape(weights, /*dimensions=*/reshape_dims,
+                                       /*new_sizes=*/reshaped_weights_dims);
+
+  std::vector<int64> weights_with_channels_dims = reshaped_weights_dims;
+  weights_with_channels_dims.push_back(data_channels);
+  std::vector<int64> reshaped_weights_indices(reshaped_weights_dims.size());
+  std::iota(reshaped_weights_indices.begin(), reshaped_weights_indices.end(),
+            0);
+
+  // The dimension is [batch, dim_0, ..., dim_n, 2, 2, data_channel].
+  auto broadcast_reshaped_weights = xla::BroadcastInDim(
+      reshaped_weights, weights_with_channels_dims, reshaped_weights_indices);
+
+  std::vector<int64> grad_output_indices(warp_dims_without_last_dims.size());
+  std::iota(grad_output_indices.begin(), grad_output_indices.end(), 0);
+  grad_output_indices.push_back(weights_with_channels_dims.size() - 1);
+  XlaOp broadcast_grad_output = xla::BroadcastInDim(
+      grad_output, weights_with_channels_dims, grad_output_indices);
+
+  auto grad_output_multiply_weights =
+      broadcast_grad_output * broadcast_reshaped_weights;
+
+  auto grad_data = xla::ConstantLiteral(
+      ctx->builder(), xla::Literal::CreateFromShape(data_shape));
+
+  return ScatterToGradData(ctx, grad_data, gather_indices,
+                           grad_output_multiply_weights, warp_shape.dims(),
+                           warp_type);
+}
+
+// Build computation for the backprop into input 'warp'.
+// Where input:
+// warp is of dimension [batch, dim_0, ...dim_n, 2]
+// grad_output is of dimension [batch, dim_0, ...dim_n, channel]
+// ratio is of dimension [batch, dim_0, ...dim_n, 2]
+// gather_indices is of dimension [batch, dim_0, ...dim_n, 3]
+// data is of dimension [batch, x, y, channel]
+//
+// Output (simplified by ignoring the batch dimensions):
+// Since the forward path has:
+//    output = dot(weights * neighbors)
+// The backprop into warp will therefore be:
+//    grad_warp = output_grad * d_output / d_warp
+//              = output_grad * (d_weights / d_warp * neighbors + d_neighbors /
+//              d_warp * weight)
+// Where:
+//    d_weights / d_warp_x = [-(1 - py), (1 - py), -py, py]
+//    d_weights / d_warp_y = [-(1 - px), -px, (1-px), px]
+// and
+//    d_neighbors / d_warp_x = 0
+//
+// Therefore:
+//    grad_warp_x = py * (img_cxcy - img_fxcy) + (1-py) * (img_cxfy-img_fxfy)
+//    grad_warp_y = px * (img_cxcy - img_cxfy) + (1-px) * (img_fxcy-img_fxfy)
+//
+// where (px, py) is warp, (fx, fy) is the left top corner and (cx, cy) is the
+// bottom right corner in a 2x2 neighborhood.
+XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
+                        XlaOp gather_indices, XlaOp data,
+                        TensorShape warp_shape, int64 data_channels,
+                        xla::PrimitiveType data_type) {
+  auto warp_dims = warp_shape.dim_sizes();
+  std::vector<int64> warp_dims_without_last_dims(warp_dims.begin(),
+                                                 warp_dims.end() - 1);
+
+  // With dimension [batch, dim_0, ...dim_n, 4]
+  std::vector<int64> neighbor_broadcast_dims = warp_dims_without_last_dims;
+  neighbor_broadcast_dims.push_back(4);
+
+  // The dimension is [batch, dim_0, ... dim_n, 4, data_channels]
+  auto neighbors_data = Gather2by2Neighbors(
+      ctx->builder(), data, gather_indices, data_channels, warp_shape.dims());
+
+  const int64 last_warp_dim = warp_shape.dims() - 1;
+
+  // Since we will be creating the dot product of:
+  //  lhs: [batch, dim_0, ...dim_n, 4]
+  // and
+  //  rhs: [batch, dim_0, ...dim_n, 4, data_channels]
+  // we choose the last dimension of lhs and the second last dimension of rhs,
+  // with size 4, as the contracting dimension.
+  xla::DotDimensionNumbers dot_dims;
+  for (int i = 0; i < warp_shape.dims() - 1; ++i) {
+    dot_dims.add_lhs_batch_dimensions(i);
+    dot_dims.add_rhs_batch_dimensions(i);
+  }
+  dot_dims.add_lhs_contracting_dimensions(warp_shape.dims() - 1);
+  dot_dims.add_rhs_contracting_dimensions(warp_shape.dims() - 1);
+
+  // img_cxcy - img_fxcy
+  auto bottom_right_minus_bottom_left = xla::DotGeneral(
+      xla::BroadcastInDim(
+          xla::ConvertElementType(
+              xla::ConstantR1<float>(ctx->builder(), {0, 0, -1, 1}), data_type),
+          neighbor_broadcast_dims, {last_warp_dim}),
+      neighbors_data, dot_dims, /*precision_config=*/nullptr);
+
+  // img_cxfy - img_fxfy
+  auto top_right_minus_top_left = xla::DotGeneral(
+      xla::BroadcastInDim(
+          xla::ConvertElementType(
+              xla::ConstantR1<float>(ctx->builder(), {-1, 1, 0, 0}), data_type),
+          neighbor_broadcast_dims, {last_warp_dim}),
+      neighbors_data, dot_dims, /*precision_config=*/nullptr);
+
+  // img_cxcy - img_cxfy
+  auto bottom_right_minus_top_right = xla::DotGeneral(
+      xla::BroadcastInDim(
+          xla::ConvertElementType(
+              xla::ConstantR1<float>(ctx->builder(), {0, -1, 0, 1}), data_type),
+          neighbor_broadcast_dims, {last_warp_dim}),
+      neighbors_data, dot_dims, /*precision_config=*/nullptr);
+
+  // img_fxcy - img_fxfy
+  auto bottom_left_minus_top_left = xla::DotGeneral(
+      xla::BroadcastInDim(
+          xla::ConvertElementType(
+              xla::ConstantR1<float>(ctx->builder(), {-1, 0, 1, 0}), data_type),
+          neighbor_broadcast_dims, {last_warp_dim}),
+      neighbors_data, dot_dims, /*precision_config=*/nullptr);
+
+  // Slice out x and y.
+  auto weight_x = xla::SliceInDim(ratio, /*start_index=*/0, /*limit_index=*/1,
+                                  /*stride=*/1, /*dimno=*/last_warp_dim);
+  auto weight_y = xla::SliceInDim(ratio, /*start_index=*/1, /*limit_index=*/2,
+                                  /*stride=*/1, /*dimno=*/last_warp_dim);
+
+  // Build 1 - y and 1 - x.
+  auto one_minus_y = xla::One(ctx->builder(), data_type) - weight_y;
+  auto one_minus_x = xla::One(ctx->builder(), data_type) - weight_x;
+
+  auto x_before_reduce =
+      grad_output * weight_y * bottom_right_minus_bottom_left +
+      one_minus_y * top_right_minus_top_left;
+
+  std::vector<int64> reshaped_sizes = warp_dims_without_last_dims;
+  reshaped_sizes.push_back(1);
+
+  std::vector<int64> reshaped_dims(warp_dims_without_last_dims.size());
+  std::iota(reshaped_dims.begin(), reshaped_dims.end(), 0);
+
+  // Reduce-add along the channel dimension.
+  auto x_result =
+      xla::Reduce(x_before_reduce, xla::Zero(ctx->builder(), data_type),
+                  xla::CreateScalarAddComputation(data_type, ctx->builder()),
+                  {last_warp_dim});
+  // Reshape before concatenating with y values.
+  XlaOp reshaped_x = xla::Reshape(x_result, reshaped_dims, reshaped_sizes);
+
+  auto y_before_reduce = grad_output * weight_x * bottom_right_minus_top_right +
+                         one_minus_x * bottom_left_minus_top_left;
+  // Reduce-add along the channel dimension.
+  auto y_result =
+      xla::Reduce(y_before_reduce, xla::Zero(ctx->builder(), data_type),
+
+                  xla::CreateScalarAddComputation(data_type, ctx->builder()),
+                  {last_warp_dim});
+  XlaOp reshaped_y = xla::Reshape(y_result, reshaped_dims, reshaped_sizes);
+
+  return xla::ConcatInDim(ctx->builder(), {reshaped_x, reshaped_y},
+                          last_warp_dim);
+}
+
+class ResamplerOp : public XlaOpKernel {
+ public:
+  explicit ResamplerOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape data_shape = ctx->InputShape("data");
+    OP_REQUIRES(ctx, data_shape.dims() == 4,
+                errors::InvalidArgument("data must be 4-dimensional",
+                                        data_shape.DebugString()));
+    const int64 data_channels = data_shape.dim_size(3);
+    xla::PrimitiveType data_type = ctx->input_xla_type(0);
+
+    TensorShape warp_shape = ctx->InputShape("warp");
+    OP_REQUIRES(ctx, warp_shape.dims() >= 2,
+                errors::InvalidArgument("warp must be at least 2-dimensional",
+                                        warp_shape.DebugString()));
+    for (int size : warp_shape.dim_sizes()) {
+      OP_REQUIRES(ctx, size > 0,
+                  errors::InvalidArgument("warp sizes must be positive, got [",
+                                          size, "]"));
+    }
+    const int64 last_warp_dim = warp_shape.dims() - 1;
+    // Last dimension of warp shape must be of size 2.
+    OP_REQUIRES(ctx, warp_shape.dim_size(last_warp_dim) == 2,
+                errors::InvalidArgument(
+                    "the last dimension of warp must be exactly size 2."));
+    xla::PrimitiveType warp_type = ctx->input_xla_type(1);
+
+    XlaOp data = ctx->Input("data");
+    XlaOp warp = ctx->Input("warp");
+
+    // Find the coordinates of the top left corner for the 2x2 region to be
+    // sampled from. The dimensions are [batch, dim_0, ... dim_n, 2] where the
+    // last dimension of size 2 in turn is [x, y].
+    XlaOp top_left = xla::ConvertElementType(warp, xla::U32);
+
+    auto gather_indices = ConcatenateIota(ctx->builder(), top_left, warp_shape);
+
+    // The dimension is [batch, dim_0, ... dim_n, 4, data_channels]
+    auto neighbors_data = Gather2by2Neighbors(
+        ctx->builder(), data, gather_indices, data_channels, warp_shape.dims());
+
+    // Dimensions are [batch, dim_0, ... dim_n, 2].
+    XlaOp ratio = warp - xla::ConvertElementType(top_left, data_type);
+
+    // Obtain the bilinear blending weights, the dimension is [batch, dim_0,
+    // ...dim_n, 4].
+    auto weights = BilinearWeights(ctx, ratio, warp_shape, data_type);
+
+    // Since we will be creating the dot product of:
+    //  lhs: [batch, dim_0, ...dim_n, 4]
+    // and
+    //  rhs: [batch, dim_0, ...dim_n, 4, data_channels]
+    // we choose the last dimension of lhs and the second last dimension of rhs,
+    // with size 4, as the contracting dimension.
+    xla::DotDimensionNumbers dot_dims;
+    for (int i = 0; i < warp_shape.dims() - 1; ++i) {
+      dot_dims.add_lhs_batch_dimensions(i);
+      dot_dims.add_rhs_batch_dimensions(i);
+    }
+    dot_dims.add_lhs_contracting_dimensions(warp_shape.dims() - 1);
+    dot_dims.add_rhs_contracting_dimensions(warp_shape.dims() - 1);
+
+    // The dimension is [batch, dim_0, ...dim_n, data_channels].
+    auto blended_pixels = xla::DotGeneral(weights, neighbors_data, dot_dims,
+                                          /*precision_config=*/nullptr);
+
+    // Handle out of boundary cases by constructing a predicate mask array based
+    // on the in-bound condition, and output 0 for the blended pixel value if
+    // out-bound. The dimension is the same as top_left: [batch, dim_0,
+    // ...dim_n, 2] where the last dimension of size 2 is the [x, y] coordinate.
+
+    auto is_ge_zero = xla::Ge(warp, xla::ZerosLike(warp));
+
+    auto is_lt_image_size = xla::Lt(
+        warp,
+        xla::ConvertElementType(
+            xla::ConstantR1<float>(
+                ctx->builder(),
+                {/*width=*/static_cast<float>(data_shape.dim_size(2) - 1),
+                 /*height=*/static_cast<float>(data_shape.dim_size(1) - 1)}),
+            warp_type),
+        /*broadcast_dimensions=*/{warp_shape.dims() - 1});
+
+    auto is_in_bound_x_y = xla::And(is_ge_zero, is_lt_image_size);
+    // Reduce along last dimension. The resulting dimension is:
+    // [batch, dim_0, ...dim_n].
+    auto is_in_bound = xla::Reduce(
+        is_in_bound_x_y, xla::ConstantR0<bool>(ctx->builder(), true),
+        xla::CreateScalarAndComputation(xla::PrimitiveType::PRED,
+                                        ctx->builder()),
+        {last_warp_dim});
+
+    // Broadcast 'is_in_bound' to the same dimension as 'blended_pixels', which
+    // is the dimension of the result:
+    //  [batch, dim_0, ...dim_n, data_channels].
+    auto warp_dims = warp_shape.dim_sizes();
+    std::vector<int64> result_dims(warp_dims.begin(), warp_dims.end() - 1);
+    result_dims.push_back(data_channels);
+
+    std::vector<int64> broadcasted_dims(warp_dims.size() - 1);
+    std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0);
+    auto broadcasted_is_in_bound =
+        xla::BroadcastInDim(is_in_bound, result_dims, broadcasted_dims);
+
+    // Set out of bound samples to zero.
+    auto zeros =
+        xla::Broadcast(xla::Zero(ctx->builder(), data_type), result_dims);
+    auto result = xla::Select(broadcasted_is_in_bound, blended_pixels, zeros);
+
+    ctx->SetOutput(0, result);
+  }
+};
+
+REGISTER_XLA_OP(Name("Resampler"), ResamplerOp);
+
+class ResamplerGradOp : public XlaOpKernel {
+ public:
+  explicit ResamplerGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    DataType output_dtype;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &output_dtype));
+  }
+
+  // TODO(b/112295522): note that sampling from image boundary is not currently
+  // being handled properly.
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape data_shape_tf = ctx->InputShape("data");
+    OP_REQUIRES(ctx, data_shape_tf.dims() == 4,
+                errors::InvalidArgument("data must be 4-dimensional",
+                                        data_shape_tf.DebugString()));
+    const int64 data_channels = data_shape_tf.dim_size(3);
+    xla::PrimitiveType data_type = ctx->input_xla_type(0);
+
+    TensorShape warp_shape = ctx->InputShape("warp");
+    OP_REQUIRES(ctx, warp_shape.dims() >= 2,
+                errors::InvalidArgument("warp must be at least 2-dimensional",
+                                        warp_shape.DebugString()));
+    for (int size : warp_shape.dim_sizes()) {
+      OP_REQUIRES(ctx, size > 0,
+                  errors::InvalidArgument("warp sizes must be positive, got [",
+                                          size, "]"));
+    }
+    // Last dimension of warp shape must be of size 2.
+    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
+                errors::InvalidArgument(
+                    "the last dimension of warp must be exactly size 2."));
+    xla::PrimitiveType warp_type = ctx->input_xla_type(1);
+
+    TensorShape output_grad_shape = ctx->InputShape("grad_output");
+    OP_REQUIRES(
+        ctx, output_grad_shape.dims() >= 2,
+        errors::InvalidArgument("output_grad must be at least 2-dimensional",
+                                output_grad_shape.DebugString()));
+
+    // Dimensions are [batch, x, y, channel].
+    XlaOp data = ctx->Input("data");
+    xla::Shape data_shape = TensorShapeToXLAShape(data_type, data_shape_tf);
+
+    // Dimensions are [batch, dim_0, ...dim_n, 2].
+    XlaOp warp = ctx->Input("warp");
+    // Dimensions are [batch, dim_0, ...dim_n, channel].
+    XlaOp grad_output = ctx->Input("grad_output");
+
+    // Find the top left corner coordinate for the region to be sampled from.
+    // The dimensions are [batch, dim_0, ... dim_n, 2] where the last dimension
+    // of size 2 in turn is [x, y].
+    XlaOp top_left = xla::ConvertElementType(warp, xla::U32);
+
+    // Dimensions are [batch, dim_0, ... dim_n, 2]
+    XlaOp ratio = warp - xla::ConvertElementType(top_left, warp_type);
+
+    // Indices for gathering neighboring pixels.
+    auto gather_indices = ConcatenateIota(ctx->builder(), top_left, warp_shape);
+
+    auto grad_data =
+        CalculateGradData(ctx, grad_output, ratio, gather_indices, warp_type,
+                          warp_shape, data_channels, data_shape);
+
+    auto grad_warp =
+        CalculateGradWarp(ctx, grad_output, ratio, gather_indices, data,
+                          warp_shape, data_channels, data_type);
+
+    ctx->SetOutput(0, grad_data);
+    ctx->SetOutput(1, grad_warp);
+  }
+};
+
+REGISTER_XLA_OP(Name("ResamplerGrad"), ResamplerGradOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
index 366ce42866e9f1375ee0ff6f4985c8f461fc0885..fa1b6b91710f5507f41f3f69b0715398ae879aaf 100644
--- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
 namespace {
@@ -36,7 +37,7 @@ class ReshapeOp : public XlaOpKernel {
     const TensorShape input_shape = ctx->InputShape(0);
     const TensorShape sizes_shape = ctx->InputShape(1);
     // Preliminary validation of sizes.
-    OP_REQUIRES(ctx, IsLegacyVector(sizes_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(sizes_shape),
                 errors::InvalidArgument("sizes input must be 1-D, not shape ",
                                         sizes_shape.DebugString()));
     const int64 num_dims = sizes_shape.num_elements();
@@ -95,7 +96,7 @@ class ReshapeOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Reshape").CompileTimeConstInput("shape"), ReshapeOp);
+REGISTER_XLA_OP(Name("Reshape").CompileTimeConstantInput("shape"), ReshapeOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index e172c649325adb6f7761ce0be141f21e8d545bc1..e4046c795577983bff1a8053743bf4d3a258e583 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -46,61 +47,7 @@ class RetvalOp : public XlaOpKernel {
       // compilation.
       OP_REQUIRES_OK(ctx, frame->SetRetval(index_, input));
     } else {
-      xla::XlaOp input = ctx->Input(0);
-      const TensorShape input_shape = ctx->InputShape(0);
-      DataType input_type = ctx->input_type(0);
-      XlaContext& tc = XlaContext::Get(ctx);
-
-      if (input_type == DT_RESOURCE) {
-        XlaResource* resource;
-        OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource));
-        ctx->SetStatus(tc.AddResourceRetval(index_, resource));
-        return;
-      }
-
-      auto is_constant = ctx->builder()->IsConstant(input);
-      if (!is_constant.ok()) {
-        ctx->SetStatus(is_constant.status());
-        return;
-      }
-
-      if (tc.resolve_compile_time_constants() &&
-          (input_shape.num_elements() == 0 || is_constant.ValueOrDie())) {
-        xla::Literal literal;
-        OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal));
-        OP_REQUIRES_OK(ctx, tc.AddConstRetval(index_, dtype_, literal));
-      } else {
-        TensorShape shape = ctx->InputShape(0);
-        ctx->SetStatus(is_constant.status());
-        TensorShape representation_shape;
-        if (tc.is_entry_computation()) {
-          xla::StatusOr<TensorShape> shape_or_status =
-              tc.RepresentationShape(shape, ctx->input_type(0));
-          if (!shape_or_status.ok()) {
-            ctx->SetStatus(shape_or_status.status());
-            return;
-          } else {
-            representation_shape = shape_or_status.ValueOrDie();
-          }
-        } else {
-          representation_shape = shape;
-        }
-
-        xla::XlaOp output = input;
-        if (tc.is_entry_computation()) {
-          output = xla::Reshape(input, representation_shape.dim_sizes());
-        } else {
-          // The core from which a return value is returned depends on the
-          // device assignment of the input to the retval. Since we can't change
-          // the device assignment of "input" at this point, we must always
-          // introduce an operator here, even if the shape does not change.
-          // TODO(b/76097077): propagate device assignments onto arguments and
-          // return values of functions, and then reshape unconditionally.
-          output =
-              xla::GetTupleElement(xla::Tuple(ctx->builder(), {output}), 0);
-        }
-        tc.AddRetval(index_, dtype_, shape, output);
-      }
+      ctx->xla_context()->SetRetval(index_, ctx->InputExpression(0));
     }
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
index 8494864b33a44b03a07e3fea7766285f54074e7d..2ceadaf79c5cef35ad50aa84a0d66a46527a6458 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
@@ -51,14 +51,11 @@ class ReverseOp : public XlaOpKernel {
     }
     // XlaBuilder::Rev() requires concrete values for dimensions arg.
     xla::Literal lax;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {x_shape.dims()}, &lax));
-    std::vector<bool> revdims(x_shape.dims());
-    std::copy(lax.data<bool>().begin(), lax.data<bool>().end(),
-              revdims.begin());
-    std::vector<int64> dimensions;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &lax));
 
+    std::vector<int64> dimensions;
     for (int d = 0; d < x_shape.dims(); ++d) {
-      if (revdims[d]) {
+      if (lax.Get<bool>({d})) {
         dimensions.push_back(d);
       }
     }
@@ -67,7 +64,7 @@ class ReverseOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Reverse").CompileTimeConstInput("dims"), ReverseOp);
+REGISTER_XLA_OP(Name("Reverse").CompileTimeConstantInput("dims"), ReverseOp);
 
 class ReverseV2Op : public XlaOpKernel {
  public:
@@ -119,7 +116,8 @@ class ReverseV2Op : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("ReverseV2").CompileTimeConstInput("axis"), ReverseV2Op);
+REGISTER_XLA_OP(Name("ReverseV2").CompileTimeConstantInput("axis"),
+                ReverseV2Op);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
index 03a50ef8a059e5a005c4cc2e5e98acedfea8619a..d7b38e86cc985d608116488f9e76756a8e904f9c 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -17,8 +17,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
@@ -61,113 +62,79 @@ class ReverseSequenceOp : public XlaOpKernel {
     const auto seq_lens = context->Input(1);
 
     const int64 batch_size = input_shape.dim_size(batch_dim_);
+    if (batch_size == 0) {
+      context->SetOutput(0, input);
+      return;
+    }
 
-    const DataType input_type = context->input_type(0);
-    const DataType seq_lens_type = context->input_type(1);
+    // Given the input
+    //
+    // 012345
+    // 6789AB
+    //
+    // and sequence lens {2, 3} we:
+    //
+    // 1. Reverse and pad each row to get
+    //
+    //    543210XXXXXX
+    //    BA9876XXXXXX
+    //
+    // 2. Gather out the suffix from each row to get
+    //
+    //    10XXXX
+    //    876XXX
+    //
+    // 3. Select from the input and the array created by (2) to get the result.
+    //
+    //    102345
+    //    8769AB
+    const xla::PrimitiveType input_type = context->input_xla_type(0);
+    const xla::PrimitiveType seq_lens_type = context->input_xla_type(1);
     const int64 max_seq_len = input_shape.dim_size(seq_dim_);
 
-    xla::Shape input_xla_shape;
-    OP_REQUIRES_OK(context, TensorShapeToXLAShape(input_type, input_shape,
-                                                  &input_xla_shape));
-    xla::Shape seq_lens_xla_shape;
-    OP_REQUIRES_OK(context, TensorShapeToXLAShape(seq_lens_type, seq_lens_shape,
-                                                  &seq_lens_xla_shape));
-
-    const auto tuple_shape = xla::ShapeUtil::MakeTupleShape({
-        xla::ShapeUtil::MakeShape(seq_lens_xla_shape.element_type(), {}),
-        seq_lens_xla_shape,
-        input_xla_shape,
-    });
-
-    // For each entry in the batch, reverse the sequence.
-    // TODO(b/65689298): generalize the Map() operator to non-scalar cases and
-    // use it here, instead of a While loop.
-
-    // Condition: lambda (i, _, _): i < batch_size
-    auto condition_builder =
-        builder->CreateSubBuilder("reverse_sequence_condition");
-    {
-      auto param =
-          xla::Parameter(condition_builder.get(), 0, tuple_shape, "param");
-      auto i = xla::GetTupleElement(param, 0);
-      xla::Lt(i, XlaHelpers::IntegerLiteral(condition_builder.get(),
-                                            seq_lens_type, batch_size));
-    }
-    auto condition = condition_builder->Build();
-    OP_REQUIRES_OK(context, condition.status());
-
-    auto body_builder = builder->CreateSubBuilder("reverse_sequence_body");
-    {
-      auto param = xla::Parameter(body_builder.get(), 0, tuple_shape, "param");
-      auto i = xla::GetTupleElement(param, 0);
-      auto seq_lens = xla::GetTupleElement(param, 1);
-      auto output = xla::GetTupleElement(param, 2);
-
-      // seq_len is the sequence length of the current batch element (rank 1)
-      auto seq_len = xla::DynamicSlice(seq_lens, xla::Reshape(i, {1}), {1});
-
-      // Indices is the offset of the batch element in the input.
-      auto batch_element_indices =
-          xla::Broadcast(XlaHelpers::Zero(body_builder.get(), seq_lens_type),
-                         {input_shape.dims()});
-      batch_element_indices = xla::DynamicUpdateSlice(
-          batch_element_indices, xla::Reshape(i, {1}),
-          xla::Reshape(XlaHelpers::IntegerLiteral(body_builder.get(),
-                                                  seq_lens_type, batch_dim_),
-                       {1}));
-
-      // Slice out the current batch element and pad it out in the sequence
-      // dimension.
-      TensorShape slice_shape = input_shape;
-      slice_shape.set_dim(batch_dim_, 1);
-      slice_shape.set_dim(seq_dim_, max_seq_len);
-      auto slice = xla::DynamicSlice(output, batch_element_indices,
-                                     slice_shape.dim_sizes());
-      auto padding_config = xla::MakeNoPaddingConfig(slice_shape.dims());
-      padding_config.mutable_dimensions(seq_dim_)->set_edge_padding_high(
-          slice_shape.dim_size(seq_dim_));
-      slice = xla::Pad(slice, XlaHelpers::Zero(body_builder.get(), input_type),
-                       padding_config);
-
-      // Now slice out the reversed sequence from its actual start.
-      // sequence_start_indices is the offset of the start of the reversed
-      // sequence in the input. The slice will go into the padding, however, we
-      // will mask off these elements and replace them with elements from the
-      // original input so their values do not matter.
-      auto sequence_start_indices =
-          xla::Broadcast(XlaHelpers::Zero(body_builder.get(), seq_lens_type),
-                         {slice_shape.dims()});
-      sequence_start_indices = xla::DynamicUpdateSlice(
-          sequence_start_indices,
-          xla::Sub(XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type,
-                                              max_seq_len),
-                   seq_len),
-          xla::Reshape(XlaHelpers::IntegerLiteral(body_builder.get(),
-                                                  seq_lens_type, seq_dim_),
-                       {1}));
-      slice = xla::DynamicSlice(slice, sequence_start_indices,
-                                slice_shape.dim_sizes());
-
-      // Shift the reversed sequence to the left.
-      output = xla::DynamicUpdateSlice(output, slice, batch_element_indices);
-
-      xla::Tuple(
-          body_builder.get(),
-          {xla::Add(i, XlaHelpers::One(body_builder.get(), seq_lens_type)),
-           seq_lens, output});
+    xla::XlaOp rev = xla::Rev(input, {seq_dim_});
+
+    auto padding_config = xla::MakeNoPaddingConfig(input_shape.dims());
+    padding_config.mutable_dimensions(seq_dim_)->set_edge_padding_high(
+        max_seq_len);
+    xla::XlaOp padded =
+        xla::Pad(rev, xla::Zero(builder, input_type), padding_config);
+
+    // Form a start indices tensor with shape [2, batch_size]. For each batch
+    // entry we have a (batch offset, seq offset) pair.
+    xla::XlaOp start_indices = xla::ConcatInDim(
+        builder,
+        {
+            xla::Iota(builder,
+                      xla::ShapeUtil::MakeShape(seq_lens_type, {1, batch_size}),
+                      /*iota_dimension=*/1),
+            xla::Reshape(xla::ScalarLike(seq_lens, max_seq_len) - seq_lens,
+                         {1, batch_size}),
+        },
+        /*dimension=*/0);
+
+    xla::GatherDimensionNumbers dnums;
+    // The first dimension of start_indices contains the batch/seq dim choice.
+    dnums.set_index_vector_dim(0);
+    dnums.add_start_index_map(batch_dim_);
+    dnums.add_start_index_map(seq_dim_);
+
+    // All other dimensions other than the batch dim are offset dimensions.
+    for (int i = 0; i < input_shape.dims(); ++i) {
+      if (i != batch_dim_) {
+        dnums.add_offset_dims(i);
+      }
     }
-    auto body = body_builder->Build();
-    OP_REQUIRES_OK(context, body.status());
-
-    auto loop_output = xla::While(
-        condition.ValueOrDie(), body.ValueOrDie(),
-        xla::Tuple(builder, {XlaHelpers::Zero(builder, seq_lens_type), seq_lens,
-                             xla::Rev(input, {seq_dim_})}));
-    auto output = xla::GetTupleElement(loop_output, 2);
-
-    // Mask out elements after the sequence length.
-    xla::XlaOp iota =
-        xla::Iota(builder, seq_lens_xla_shape.element_type(), max_seq_len);
+    dnums.add_collapsed_slice_dims(batch_dim_);
+
+    auto slice_sizes = input_shape.dim_sizes();
+    slice_sizes[batch_dim_] = 1;
+
+    xla::XlaOp output = xla::Gather(padded, start_indices, dnums, slice_sizes);
+
+    // Mask out elements after the sequence length, and copy the corresponding
+    // elements from the input.
+    xla::XlaOp iota = xla::Iota(builder, seq_lens_type, max_seq_len);
     std::vector<int64> dims(input_shape.dims(), 1);
     dims[batch_dim_] = batch_size;
     auto mask = xla::Lt(iota, xla::Reshape(seq_lens, dims), {seq_dim_});
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index ab094d7dd1ce9856a3c2854fd2776827d6c4b76f..4b9e1a578be2445091228953df7e5c5e82b42c28 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -39,8 +39,8 @@ namespace {
 
 // TODO(phawkins): implement double-sized windowed reductions in XLA and remove
 // the type constraint.
-constexpr std::array<DataType, 3> kScanOpTypes = {
-    {DT_HALF, DT_BFLOAT16, DT_FLOAT}};
+constexpr std::array<DataType, 4> kScanOpTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_INT32}};
 
 class ScanOp : public XlaOpKernel {
  public:
@@ -103,10 +103,10 @@ class ScanOp : public XlaOpKernel {
       reducer = ctx->GetOrCreateMul(dtype);
     }
     auto output = xla::ReduceWindowWithGeneralPadding(
-        XlaHelpers::ConvertElementType(builder, ctx->Input(0), dtype), init,
-        *reducer, window_dims, window_strides, padding);
-    output =
-        XlaHelpers::ConvertElementType(builder, output, ctx->input_type(0));
+        XlaHelpers::ConvertElementType(ctx->Input(0), dtype), init, *reducer,
+        window_dims, window_strides,
+        /*base_dilations=*/{}, /*window_dilations=*/{}, padding);
+    output = XlaHelpers::ConvertElementType(output, ctx->input_type(0));
 
     // In exclusive mode, we have computed an extra element containing the sum
     // of all the input elements. Slice off this extra "last" element.
@@ -135,7 +135,7 @@ class CumsumOp : public ScanOp {
 };
 REGISTER_XLA_OP(Name("Cumsum")
                     .TypeConstraint("T", kScanOpTypes)
-                    .CompileTimeConstInput("axis"),
+                    .CompileTimeConstantInput("axis"),
                 CumsumOp);
 
 class CumprodOp : public ScanOp {
@@ -144,7 +144,7 @@ class CumprodOp : public ScanOp {
 };
 REGISTER_XLA_OP(Name("Cumprod")
                     .TypeConstraint("T", kScanOpTypes)
-                    .CompileTimeConstInput("axis"),
+                    .CompileTimeConstantInput("axis"),
                 CumprodOp);
 
 }  // anonymous namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
index f1f32699fee5f03f603f830722fe65622dee5d3e..a95e7adacf194ba6eb33cbeb56abe1a5a2479337 100644
--- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
@@ -116,7 +116,8 @@ class ScatterNdOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("ScatterNd").CompileTimeConstInput("shape"), ScatterNdOp);
+REGISTER_XLA_OP(Name("ScatterNd").CompileTimeConstantInput("shape"),
+                ScatterNdOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
index b22ecb7c6dbb42a33a4f4d90b18b20816df16a50..97359f81eee4aa0b46f03941ab6ca3ea3d468f1f 100644
--- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
@@ -105,7 +105,7 @@ class UnsortedSegmentSum : public UnsortedSegmentReduce {
 };
 
 REGISTER_XLA_OP(
-    Name("UnsortedSegmentSum").CompileTimeConstInput("num_segments"),
+    Name("UnsortedSegmentSum").CompileTimeConstantInput("num_segments"),
     UnsortedSegmentSum);
 
 class UnsortedSegmentProd : public UnsortedSegmentReduce {
@@ -120,7 +120,7 @@ class UnsortedSegmentProd : public UnsortedSegmentReduce {
 };
 
 REGISTER_XLA_OP(
-    Name("UnsortedSegmentProd").CompileTimeConstInput("num_segments"),
+    Name("UnsortedSegmentProd").CompileTimeConstantInput("num_segments"),
     UnsortedSegmentProd);
 
 class UnsortedSegmentMin : public UnsortedSegmentReduce {
@@ -137,7 +137,7 @@ class UnsortedSegmentMin : public UnsortedSegmentReduce {
 };
 
 REGISTER_XLA_OP(
-    Name("UnsortedSegmentMin").CompileTimeConstInput("num_segments"),
+    Name("UnsortedSegmentMin").CompileTimeConstantInput("num_segments"),
     UnsortedSegmentMin);
 
 class UnsortedSegmentMax : public UnsortedSegmentReduce {
@@ -154,7 +154,7 @@ class UnsortedSegmentMax : public UnsortedSegmentReduce {
 };
 
 REGISTER_XLA_OP(
-    Name("UnsortedSegmentMax").CompileTimeConstInput("num_segments"),
+    Name("UnsortedSegmentMax").CompileTimeConstantInput("num_segments"),
     UnsortedSegmentMax);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
index a7f5a8f1698b9d02560de427d356e9e6be5caa7c..84470b230d421658e0d79dcecb175a24155f49b7 100644
--- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
@@ -42,7 +42,7 @@ SendOp::SendOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
 }
 
 void SendOp::Compile(XlaOpKernelContext* ctx) {
-  XlaCompiler* compiler = XlaContext::Get(ctx).compiler();
+  XlaCompiler* compiler = ctx->compiler();
   xla::ChannelHandle channel;
   OP_REQUIRES_OK(ctx, compiler->GetChannelHandle(tensor_name_, &channel));
   xla::Send(ctx->Input(0), channel);
@@ -73,7 +73,7 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
 }
 
 void RecvOp::Compile(XlaOpKernelContext* ctx) {
-  XlaCompiler* compiler = XlaContext::Get(ctx).compiler();
+  XlaCompiler* compiler = ctx->compiler();
   xla::ChannelHandle channel;
   OP_REQUIRES_OK(ctx, compiler->GetChannelHandle(tensor_name_, &channel));
   ctx->SetOutput(0, xla::Recv(ctx->builder(), shape_, channel));
diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
index 0c32b8def0f7b741c93e803f8359b6504087e257..b1fa2915d59e4e5e2f2523e20e9a37898d087117 100644
--- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -30,31 +30,6 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-template <typename T>
-Status GetValue(int index, XlaOpKernelContext* ctx, T* value) {
-  xla::Literal literal;
-  TF_RETURN_IF_ERROR(ctx->ConstantInput(index, &literal));
-  *value = literal.Get<T>({});
-  return Status::OK();
-}
-
-Status GetIntValue(int index, XlaOpKernelContext* ctx, int64* value) {
-  xla::Literal literal;
-  TF_RETURN_IF_ERROR(ctx->ConstantInput(index, &literal));
-  switch (literal.shape().element_type()) {
-    case xla::S32:
-      *value = literal.Get<int32>({});
-      break;
-    case xla::S64:
-      *value = literal.Get<int64>({});
-      break;
-    default:
-      return errors::InvalidArgument("Invalid argument type for argument",
-                                     index);
-  }
-  return Status::OK();
-}
-
 // The type-specific part of the implementation of Range.
 template <typename T>
 xla::StatusOr<xla::XlaOp> CreateRangeTensor(
@@ -98,13 +73,13 @@ class RangeOp : public XlaOpKernel {
     const TensorShape start_in_shape = ctx->InputShape(0);
     const TensorShape limit_in_shape = ctx->InputShape(1);
     const TensorShape delta_in_shape = ctx->InputShape(2);
-    OP_REQUIRES(ctx, IsLegacyScalar(start_in_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(start_in_shape),
                 errors::InvalidArgument("start must be a scalar, not shape ",
                                         start_in_shape.DebugString()));
-    OP_REQUIRES(ctx, IsLegacyScalar(limit_in_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(limit_in_shape),
                 errors::InvalidArgument("limit must be a scalar, not shape ",
                                         limit_in_shape.DebugString()));
-    OP_REQUIRES(ctx, IsLegacyScalar(delta_in_shape),
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(delta_in_shape),
                 errors::InvalidArgument("delta must be a scalar, not shape ",
                                         delta_in_shape.DebugString()));
     xla::Literal start, limit, delta;
@@ -137,9 +112,9 @@ class RangeOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("Range")
-                    .CompileTimeConstInput("start")
-                    .CompileTimeConstInput("limit")
-                    .CompileTimeConstInput("delta"),
+                    .CompileTimeConstantInput("start")
+                    .CompileTimeConstantInput("limit")
+                    .CompileTimeConstantInput("delta"),
                 RangeOp);
 
 class LinSpaceOp : public XlaOpKernel {
@@ -147,9 +122,9 @@ class LinSpaceOp : public XlaOpKernel {
   explicit LinSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape start_in_shape = ctx->InputShape(0);
-    const TensorShape stop_in_shape = ctx->InputShape(1);
-    const TensorShape num_in_shape = ctx->InputShape(2);
+    const TensorShape start_in_shape = ctx->InputShape("start");
+    const TensorShape stop_in_shape = ctx->InputShape("stop");
+    const TensorShape num_in_shape = ctx->InputShape("num");
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(start_in_shape),
                 errors::InvalidArgument("start must be a scalar, not shape ",
                                         start_in_shape.DebugString()));
@@ -163,16 +138,20 @@ class LinSpaceOp : public XlaOpKernel {
     DataType type = ctx->input_type(0);
 
     int64 num;
-    OP_REQUIRES_OK(ctx, GetIntValue(2, ctx, &num));
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("num", &num));
     OP_REQUIRES(ctx, num > 0,
                 errors::InvalidArgument("Requires num > 0: ", num));
     Tensor out_constant(type, TensorShape({num}));
 
+    xla::Literal start_literal;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInput("start", &start_literal));
+    xla::Literal stop_literal;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInput("stop", &stop_literal));
+
     switch (type) {
       case DT_FLOAT: {
-        float start, stop;
-        OP_REQUIRES_OK(ctx, GetValue(0, ctx, &start));
-        OP_REQUIRES_OK(ctx, GetValue(1, ctx, &stop));
+        float start = start_literal.GetFirstElement<float>();
+        float stop = stop_literal.GetFirstElement<float>();
         auto flat = out_constant.flat<float>();
         if (num == 1) {
           flat(0) = start;
@@ -185,9 +164,8 @@ class LinSpaceOp : public XlaOpKernel {
         break;
       }
       case DT_DOUBLE: {
-        double start, stop;
-        OP_REQUIRES_OK(ctx, GetValue(0, ctx, &start));
-        OP_REQUIRES_OK(ctx, GetValue(1, ctx, &stop));
+        double start = start_literal.GetFirstElement<double>();
+        double stop = stop_literal.GetFirstElement<double>();
         auto flat = out_constant.flat<double>();
         if (num == 1) {
           flat(0) = start;
@@ -210,9 +188,9 @@ class LinSpaceOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("LinSpace")
-                    .CompileTimeConstInput("start")
-                    .CompileTimeConstInput("stop")
-                    .CompileTimeConstInput("num"),
+                    .CompileTimeConstantInput("start")
+                    .CompileTimeConstantInput("stop")
+                    .CompileTimeConstantInput("num"),
                 LinSpaceOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index c8a0f31a0375abacaca26688a23f4835e11c692e..12830816ec16c9797f0fe4d8f3f13f5a8176161d 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
@@ -108,21 +109,16 @@ class ExpandDimsOp : public XlaOpKernel {
   explicit ExpandDimsOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
-    const TensorShape dim_shape = ctx->InputShape(1);
+    const TensorShape input_shape = ctx->InputShape("input");
+    const TensorShape dim_shape = ctx->InputShape("dim");
 
-    // TODO(phawkins): the standard implementation of ExpandDimsOp seems to
-    // accept legacy scalars, even when they should be forbidden by the graphdef
-    // version.
-    OP_REQUIRES(ctx, dim_shape.num_elements() == 1,
+    std::vector<int64> dims;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshapedToIntVector("dim", &dims));
+    OP_REQUIRES(ctx, dims.size() == 1,
                 errors::InvalidArgument(absl::StrCat(
                     "dim input to ExpandDims must be a scalar; got ",
                     dim_shape.DebugString())));
-
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {1}, &literal));
-
-    int dim = literal.data<int32>()[0];
+    int dim = dims[0];
 
     OP_REQUIRES(ctx,
                 (dim >= -1 - input_shape.dims() && dim <= input_shape.dims()),
@@ -148,10 +144,11 @@ class ExpandDimsOp : public XlaOpKernel {
     dim = std::min<int32>(dim, existing_dims_size);
     new_shape.emplace(new_shape.begin() + dim, 1);
 
-    ctx->SetOutput(0, xla::Reshape(ctx->Input(0), new_shape));
+    ctx->SetOutput(0, xla::Reshape(ctx->Input("input"), new_shape));
   }
 };
-REGISTER_XLA_OP(Name("ExpandDims").CompileTimeConstInput("dim"), ExpandDimsOp);
+REGISTER_XLA_OP(Name("ExpandDims").CompileTimeConstantInput("dim"),
+                ExpandDimsOp);
 
 class SqueezeOp : public XlaOpKernel {
  public:
diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
index 537b71f3c0cf3622a8a45a717ac406da69f5c3c7..88da64e5a217a0c026106f03cb26958f6738446c 100644
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mem.h"
@@ -42,8 +43,8 @@ class SliceOp : public XlaOpKernel {
 
     OP_REQUIRES(
         ctx,
-        IsLegacyVector(begin_tensor_shape) &&
-            IsLegacyVector(size_tensor_shape) &&
+        TensorShapeUtils::IsVector(begin_tensor_shape) &&
+            TensorShapeUtils::IsVector(size_tensor_shape) &&
             begin_tensor_shape.num_elements() == input_shape.dims() &&
             size_tensor_shape.num_elements() == input_shape.dims(),
         errors::InvalidArgument(
@@ -111,9 +112,10 @@ class SliceOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(
-    Name("Slice").CompileTimeConstInput("begin").CompileTimeConstInput("size"),
-    SliceOp);
+REGISTER_XLA_OP(Name("Slice")
+                    .CompileTimeConstantInput("begin")
+                    .CompileTimeConstantInput("size"),
+                SliceOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
index d6bd927135c013ac1ec3f6547aef358dc2741896..20da8033536e3af3da0fcb216db45f808cacc1d5 100644
--- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
@@ -71,7 +71,7 @@ class SoftmaxOp : public XlaOpKernel {
     auto reduce =
         xla::Reduce(converted, xla::Zero(b, xla_accumulation_type),
                     *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
-    auto sum = XlaHelpers::ConvertElementType(b, reduce, type);
+    auto sum = XlaHelpers::ConvertElementType(reduce, type);
     auto softmax =
         log_
             // softmax = shifted_logits - log(sum(exp(shifted_logits)))
@@ -111,11 +111,11 @@ std::pair<xla::XlaOp, xla::XlaOp> CrossEntropyWithLogits(
   // sum_{class} (exp(logits - max_logits))
   const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
   auto converted =
-      XlaHelpers::ConvertElementType(b, exp_shifted_logits, accumulation_type);
+      XlaHelpers::ConvertElementType(exp_shifted_logits, accumulation_type);
   auto reduce =
       xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                   *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
-  auto sum_exp = XlaHelpers::ConvertElementType(b, reduce, type);
+  auto sum_exp = XlaHelpers::ConvertElementType(reduce, type);
 
   // log(sum(exp(logits - max_logits)))
   auto log_sum_exp = xla::Log(sum_exp);
@@ -126,11 +126,10 @@ std::pair<xla::XlaOp, xla::XlaOp> CrossEntropyWithLogits(
   // (The subtraction broadcasts along the batch dimension.)
   auto sub = xla::Sub(shifted_logits, log_sum_exp, {kBatchDim});
   auto mul = xla::Mul(xla::Neg(labels), sub);
-  auto sum =
-      xla::Reduce(XlaHelpers::ConvertElementType(b, mul, accumulation_type),
-                  XlaHelpers::Zero(b, accumulation_type),
-                  *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
-  auto loss = XlaHelpers::ConvertElementType(b, sum, type);
+  auto sum = xla::Reduce(XlaHelpers::ConvertElementType(mul, accumulation_type),
+                         XlaHelpers::Zero(b, accumulation_type),
+                         *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
+  auto loss = XlaHelpers::ConvertElementType(sum, type);
 
   // backprop: prob - labels, where
   //   prob = exp(logits - max_logits) / sum(exp(logits - max_logits))
diff --git a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
index aaeeae01ccb303091a6d37d1aeb4b2a3377dc638..6cfdf4a5ae479e9851454df97160754f122bc6ff 100644
--- a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
@@ -25,11 +25,26 @@ class XlaSortOp : public XlaOpKernel {
   explicit XlaSortOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
 
   void Compile(XlaOpKernelContext* context) override {
-    context->SetOutput(0, xla::Sort(context->Input(0)));
+    context->SetOutput(0, xla::Sort(context->Input("input")));
   }
 };
 
 REGISTER_XLA_OP(Name("XlaSort"), XlaSortOp);
 
+class XlaKeyValueSortOp : public XlaOpKernel {
+ public:
+  explicit XlaKeyValueSortOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {}
+
+  void Compile(XlaOpKernelContext* context) override {
+    xla::XlaOp result =
+        xla::Sort(context->Input("keys"), {context->Input("values")});
+    context->SetOutput(0, xla::GetTupleElement(result, 0));
+    context->SetOutput(1, xla::GetTupleElement(result, 1));
+  }
+};
+
+REGISTER_XLA_OP(Name("XlaKeyValueSort"), XlaKeyValueSortOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
index 76b79be6f6f6b5ecbe9edcffb81f2834fdac9a56..622efac81766fc3ddaf538b58170f34fce06927a 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
@@ -161,8 +161,8 @@ class SpaceToBatchNDOp : public XlaOpKernel {
   }
 };
 REGISTER_XLA_OP(Name("SpaceToBatchND")
-                    .CompileTimeConstInput("paddings")
-                    .CompileTimeConstInput("block_shape"),
+                    .CompileTimeConstantInput("paddings")
+                    .CompileTimeConstantInput("block_shape"),
                 SpaceToBatchNDOp);
 
 class SpaceToBatchOp : public XlaOpKernel {
@@ -185,7 +185,7 @@ class SpaceToBatchOp : public XlaOpKernel {
  private:
   int block_size_;
 };
-REGISTER_XLA_OP(Name("SpaceToBatch").CompileTimeConstInput("paddings"),
+REGISTER_XLA_OP(Name("SpaceToBatch").CompileTimeConstantInput("paddings"),
                 SpaceToBatchOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc b/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc
index e831dc30a9d3c27ec3b1494e7d8a6de836ff2a11..def3c147bf3fc619784044357e95bf32b404954b 100644
--- a/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc
@@ -80,7 +80,7 @@ class SparseToDenseOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("SparseToDense").CompileTimeConstInput("output_shape"),
+REGISTER_XLA_OP(Name("SparseToDense").CompileTimeConstantInput("output_shape"),
                 SparseToDenseOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc
index 93fc14e9efca868e84444dd0e07d7f0dfa84c042..7a0e240400b344ab25743997ce3baad81bd5f476 100644
--- a/tensorflow/compiler/tf2xla/kernels/split_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc
@@ -35,26 +35,16 @@ class SplitOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const int32 num_split = num_outputs();
-    const TensorShape index_shape = ctx->InputShape(0);
+    const TensorShape split_dim_shape = ctx->InputShape("split_dim");
     const TensorShape input_shape = ctx->InputShape(1);
 
-    xla::Literal literal_index;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal_index));
-
-    int32 split_dim_orig;
-    if (index_shape.dims() == 0) {
-      split_dim_orig = literal_index.Get<int>({});
-    } else {
-      OP_REQUIRES(
-          ctx, index_shape.dims() == 1,
-          errors::InvalidArgument("split_index input to Split Op must be a "
-                                  "scalar or a vector with 1 element"));
-      OP_REQUIRES(
-          ctx, index_shape.dim_size(0) == 1,
-          errors::InvalidArgument("split_index input to Split Op must be a "
-                                  "scalar or a vector with 1 element"));
-      split_dim_orig = literal_index.Get<int>({0});
-    }
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(split_dim_shape),
+        errors::InvalidArgument("split_dim must be a scalar but has rank ",
+                                split_dim_shape.dims()));
+    int64 split_dim_orig;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &split_dim_orig));
+
     int32 split_dim = split_dim_orig < 0 ? split_dim_orig + input_shape.dims()
                                          : split_dim_orig;
     OP_REQUIRES(ctx, 0 <= split_dim && split_dim < input_shape.dims(),
@@ -104,7 +94,7 @@ class SplitOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Split").CompileTimeConstInput("split_dim"), SplitOp);
+REGISTER_XLA_OP(Name("Split").CompileTimeConstantInput("split_dim"), SplitOp);
 
 class SplitVOp : public XlaOpKernel {
  public:
@@ -138,7 +128,6 @@ class SplitVOp : public XlaOpKernel {
     // Check that sizes are correct.
     int total_split_size = 0;
     int neg_one_dim = -1;
-    std::vector<int64> split_sizes_vec(num_split, -1);
     const TensorShape split_size_shape = ctx->InputShape(1);
     OP_REQUIRES(ctx,
                 split_size_shape.dims() == 1 &&
@@ -150,12 +139,11 @@ class SplitVOp : public XlaOpKernel {
                     split_size_shape.dims(), "-D and ",
                     split_size_shape.num_elements(), " elements"));
     // Get the dimension of this split.
-    xla::Literal split_size_literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &split_size_literal));
+    std::vector<int64> split_sizes;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &split_sizes));
 
     for (int i = 0; i < num_split; ++i) {
-      int slice_size;
-      slice_size = split_size_literal.Get<int>({i});
+      int64 slice_size = split_sizes[i];
       if (slice_size == -1) {
         OP_REQUIRES(
             ctx, neg_one_dim == -1,
@@ -164,7 +152,6 @@ class SplitVOp : public XlaOpKernel {
                                     i));
         neg_one_dim = i;
       } else {
-        split_sizes_vec[i] = slice_size;
         total_split_size += slice_size;
       }
     }
@@ -183,7 +170,7 @@ class SplitVOp : public XlaOpKernel {
                                 total_split_size));
 
     if (neg_one_dim >= 0) {
-      split_sizes_vec[neg_one_dim] =
+      split_sizes[neg_one_dim] =
           input_shape.dim_size(split_dim) - total_split_size;
     }
 
@@ -195,7 +182,7 @@ class SplitVOp : public XlaOpKernel {
     std::vector<int64> strides(input_shape.dims(), 1);
     for (int i = 0; i < num_split; ++i) {
       TensorShape output_shape(input_shape);
-      int slice_size = split_sizes_vec[i];
+      int slice_size = split_sizes[i];
       output_shape.set_dim(split_dim, slice_size);
 
       // Slice out the ith split from the split dimension.
@@ -207,8 +194,8 @@ class SplitVOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("SplitV")
-                    .CompileTimeConstInput("split_dim")
-                    .CompileTimeConstInput("size_splits"),
+                    .CompileTimeConstantInput("split_dim")
+                    .CompileTimeConstantInput("size_splits"),
                 SplitVOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index ee70f508a9586d5f47bd7bb7670506d4c92b369f..8e9e4daf99d3dd3b8e149e3f3e5f6c27665c0fcb 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -69,7 +69,7 @@ Status MaybeInitializeStack(xla::XlaBuilder* builder, XlaResource* resource,
   }
 
   TensorShape stack_shape;
-  stack_shape.AddDim(resource->tensor_array_size());
+  stack_shape.AddDim(resource->max_array_size());
   stack_shape.AppendShape(elem_shape);
 
   if (!resource->initialized()) {
@@ -97,10 +97,10 @@ class StackOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    int64 size;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &size));
+    int64 max_size;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &max_size));
     OP_REQUIRES(
-        ctx, size >= 0,
+        ctx, max_size >= 0,
         errors::InvalidArgument(
             "XLA compilation requires a fixed stack size upper bound. If "
             "you are using tf.while_loop, set the maximum_iterations parameter "
@@ -108,14 +108,9 @@ class StackOp : public XlaOpKernel {
 
     // We defer initializing the Stack resource until we see the first push.
     // Otherwise we do not know the shape of the stack elements.
-    xla::XlaOp value;
-    XlaContext& xc = XlaContext::Get(ctx);
-    XlaResource* resource;
-    string name = absl::StrCat("Stack: ", stack_name_);
-    OP_REQUIRES_OK(
-        ctx, xc.CreateResource(XlaResource::kStack, -1, std::move(name), dtype_,
-                               TensorShape(), value, /*tensor_array_size=*/size,
-                               /*tensor_array_gradients=*/{}, &resource));
+    XlaResource* resource =
+        ctx->xla_context()->AddResource(XlaResource::CreateStack(
+            /*name=*/absl::StrCat("Stack: ", stack_name_), dtype_, max_size));
     ctx->SetResourceOutput(0, resource);
   }
 
@@ -126,7 +121,9 @@ class StackOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
 };
 
-REGISTER_XLA_OP(Name("StackV2").CompileTimeConstInput("max_size"), StackOp);
+REGISTER_XLA_OP(
+    Name("StackV2").CompileTimeConstantInput("max_size").CompilationOnly(),
+    StackOp);
 
 class StackPushOp : public XlaOpKernel {
  public:
@@ -173,7 +170,7 @@ class StackPushOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(StackPushOp);
 };
 
-REGISTER_XLA_OP(Name("StackPushV2"), StackPushOp);
+REGISTER_XLA_OP(Name("StackPushV2").CompilationOnly(), StackPushOp);
 
 class StackPopOp : public XlaOpKernel {
  public:
@@ -227,7 +224,7 @@ class StackPopOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(StackPopOp);
 };
 
-REGISTER_XLA_OP(Name("StackPopV2"), StackPopOp);
+REGISTER_XLA_OP(Name("StackPopV2").CompilationOnly(), StackPopOp);
 
 class StackCloseOp : public XlaOpKernel {
  public:
@@ -241,7 +238,7 @@ class StackCloseOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(StackCloseOp);
 };
 
-REGISTER_XLA_OP(Name("StackCloseV2"), StackCloseOp);
+REGISTER_XLA_OP(Name("StackCloseV2").CompilationOnly(), StackCloseOp);
 
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index 5412e135478361d08965e4621ec52cfb4a792f1d..50653d7b3973b73d580cdeec5d71943b575d7cc9 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -17,27 +17,43 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/lib/random.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/lib/prng.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/math/math_util.h"
 
 namespace tensorflow {
 namespace {
 
+xla::XlaOp MaybeConvertF32ToBF16(xla::XlaOp input, DataType dtype) {
+  // Mask the last 16 bit. With normal rounding, values near "maxval" would be
+  // converted to "maxval" which is out of range ["minval", "maxval"). In
+  // addition, the distribution near the limit is not uniform.
+  if (dtype == DT_BFLOAT16) {
+    xla::XlaBuilder* builder = input.builder();
+    auto output = xla::BitcastConvertType(input, xla::U32) &
+                  xla::ConstantR0<uint32>(builder, 0xFFFF0000);
+    return xla::ConvertElementType(xla::BitcastConvertType(output, xla::F32),
+                                   xla::BF16);
+  } else {
+    return input;
+  }
+}
+
 class StatelessRandomUniformOp : public XlaOpKernel {
  public:
   explicit StatelessRandomUniformOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx) {}
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* builder = ctx->builder();
@@ -60,24 +76,81 @@ class StatelessRandomUniformOp : public XlaOpKernel {
     auto uniform = xla::StatelessRngUniform(
         {seed0, seed1}, xla_shape, xla::ConstantR0<float>(builder, 0.0),
         xla::ConstantR0<float>(builder, 1.0));
+    uniform = MaybeConvertF32ToBF16(uniform, dtype_);
     ctx->SetOutput(0, uniform);
   }
 
  private:
+  DataType dtype_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformOp);
 };
 
 // TODO(phawkins): generalize to non-float, non-int32 seed types.
 REGISTER_XLA_OP(Name("StatelessRandomUniform")
-                    .CompileTimeConstInput("shape")
-                    .TypeConstraint("dtype", DT_FLOAT)
+                    .CompileTimeConstantInput("shape")
+                    .TypeConstraint("dtype", {DT_FLOAT, DT_BFLOAT16})
                     .TypeConstraint("Tseed", DT_INT32),
                 StatelessRandomUniformOp);
 
+class StatelessRandomUniformIntOp : public XlaOpKernel {
+ public:
+  explicit StatelessRandomUniformIntOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape));
+
+    TensorShape seed_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, seed_shape.dims() == 1 && seed_shape.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_shape.DebugString()));
+    TensorShape minval_shape = ctx->InputShape(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(minval_shape),
+                errors::InvalidArgument("minval must be scalar, got shape ",
+                                        minval_shape.DebugString()));
+    TensorShape maxval_shape = ctx->InputShape(3);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(maxval_shape),
+                errors::InvalidArgument("minval must be scalar, got shape ",
+                                        maxval_shape.DebugString()));
+
+    xla::XlaOp seed = ctx->Input(1);
+    xla::XlaOp minval = ctx->Input(2);
+    xla::XlaOp maxval = ctx->Input(3);
+
+    xla::Shape xla_shape;
+    OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape, &xla_shape));
+
+    auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {});
+    auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {});
+
+    auto uniform =
+        xla::StatelessRngUniform({seed0, seed1}, xla_shape, minval, maxval);
+    ctx->SetOutput(0, uniform);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformIntOp);
+};
+
+// TODO(phawkins): generalize to non-int32 seed types.
+REGISTER_XLA_OP(Name("StatelessRandomUniformInt")
+                    .CompileTimeConstantInput("shape")
+                    .TypeConstraint("dtype", {DT_INT32, DT_INT64})
+                    .TypeConstraint("Tseed", DT_INT32),
+                StatelessRandomUniformIntOp);
+
 class StatelessRandomNormalOp : public XlaOpKernel {
  public:
   explicit StatelessRandomNormalOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx) {}
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     TensorShape shape;
@@ -103,24 +176,29 @@ class StatelessRandomNormalOp : public XlaOpKernel {
     // sqrt(2) * erfinv(x)
     auto normal =
         xla::ScalarLike(uniform, std::sqrt(2.0)) * xla::ErfInv(uniform);
+    normal = MaybeConvertF32ToBF16(normal, dtype_);
     ctx->SetOutput(0, normal);
   }
 
  private:
+  DataType dtype_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomNormalOp);
 };
 
 // TODO(phawkins): generalize to non-float, non-int32 seed types.
 REGISTER_XLA_OP(Name("StatelessRandomNormal")
-                    .CompileTimeConstInput("shape")
-                    .TypeConstraint("dtype", DT_FLOAT)
+                    .CompileTimeConstantInput("shape")
+                    .TypeConstraint("dtype", {DT_FLOAT, DT_BFLOAT16})
                     .TypeConstraint("Tseed", DT_INT32),
                 StatelessRandomNormalOp);
 
 class StatelessTruncatedNormalOp : public XlaOpKernel {
  public:
   explicit StatelessTruncatedNormalOp(OpKernelConstruction* ctx)
-      : XlaOpKernel(ctx) {}
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     TensorShape shape;
@@ -142,17 +220,20 @@ class StatelessTruncatedNormalOp : public XlaOpKernel {
         {seed0, seed1}, xla_shape,
         xla::ConstantR0<float>(builder, std::numeric_limits<float>::min()),
         xla::ConstantR0<float>(builder, 1.0));
-
-    ctx->SetOutput(0, TruncatedNormal(uniform));
+    auto output = TruncatedNormal(uniform);
+    output = MaybeConvertF32ToBF16(output, dtype_);
+    ctx->SetOutput(0, output);
   }
 
  private:
+  DataType dtype_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(StatelessTruncatedNormalOp);
 };
 
 REGISTER_XLA_OP(Name("StatelessTruncatedNormal")
-                    .CompileTimeConstInput("shape")
-                    .TypeConstraint("dtype", DT_FLOAT)
+                    .CompileTimeConstantInput("shape")
+                    .TypeConstraint("dtype", {DT_FLOAT, DT_BFLOAT16})
                     .TypeConstraint("Tseed", DT_INT32),
                 StatelessTruncatedNormalOp);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 2b2e3de64fd0db9d99efa46ecaf7a0fefbae6645..10d990b3213ab882cf44a4df20a977633de3fdab 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -109,9 +109,9 @@ class StridedSliceOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("StridedSlice")
-                    .CompileTimeConstInput("begin")
-                    .CompileTimeConstInput("end")
-                    .CompileTimeConstInput("strides"),
+                    .CompileTimeConstantInput("begin")
+                    .CompileTimeConstantInput("end")
+                    .CompileTimeConstantInput("strides"),
                 StridedSliceOp);
 
 class StridedSliceGradOp : public XlaOpKernel {
@@ -218,10 +218,10 @@ class StridedSliceGradOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("StridedSliceGrad")
-                    .CompileTimeConstInput("shape")
-                    .CompileTimeConstInput("begin")
-                    .CompileTimeConstInput("end")
-                    .CompileTimeConstInput("strides"),
+                    .CompileTimeConstantInput("shape")
+                    .CompileTimeConstantInput("begin")
+                    .CompileTimeConstantInput("end")
+                    .CompileTimeConstantInput("strides"),
                 StridedSliceGradOp);
 
 class StridedSliceAssignOp : public XlaOpKernel {
@@ -331,9 +331,9 @@ class StridedSliceAssignOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("ResourceStridedSliceAssign")
-                    .CompileTimeConstInput("begin")
-                    .CompileTimeConstInput("end")
-                    .CompileTimeConstInput("strides"),
+                    .CompileTimeConstantInput("begin")
+                    .CompileTimeConstantInput("end")
+                    .CompileTimeConstantInput("strides"),
                 StridedSliceAssignOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 94108b764fd32fc77520f9a8ea16065c27e6accf..939d7e19515a1cb41e3e23e9d1fa957ae09ecab7 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -61,8 +61,8 @@ Status MaybeInitializeTensorArray(xla::XlaBuilder* builder,
         " but op has dtype ", DataTypeString(dtype), ".");
   }
 
-  TF_RET_CHECK(resource->tensor_array_size() >= 0)
-      << resource->name() << " size " << resource->tensor_array_size();
+  TF_RET_CHECK(resource->max_array_size() >= 0)
+      << resource->name() << " size " << resource->max_array_size();
 
   if (!resource->initialized()) {
     TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape));
@@ -78,7 +78,7 @@ Status MaybeInitializeTensorArray(xla::XlaBuilder* builder,
         XLAShapeToTensorShape(shape_or_status.ValueOrDie(), &shape));
 
     TensorShape ta_shape;
-    ta_shape.AddDim(resource->tensor_array_size());
+    ta_shape.AddDim(resource->max_array_size());
     ta_shape.AppendShape(elem_shape);
     if (ta_shape != shape) {
       return errors::InvalidArgument(
@@ -114,7 +114,7 @@ Status CheckTensorArrayIsInitialized(const string& op_name,
 Status GetTensorArrayShape(const XlaResource* resource,
                            xla::XlaBuilder* builder, TensorShape* shape) {
   *shape = resource->shape();
-  shape->InsertDim(0, resource->tensor_array_size());
+  shape->InsertDim(0, resource->max_array_size());
   return Status::OK();
 }
 
@@ -123,9 +123,10 @@ Status GetTensorArrayShape(const XlaResource* resource,
 xla::XlaOp DynamicAddSlice(xla::XlaBuilder* builder, const xla::XlaOp& operand,
                            const xla::XlaOp& update,
                            absl::Span<const int64> update_dims,
-                           const xla::XlaOp& start_indices) {
+                           const xla::XlaOp& start_indices, DataType dtype) {
   xla::XlaOp current = xla::DynamicSlice(operand, start_indices, update_dims);
-  xla::XlaOp sum = xla::Add(current, update);
+  xla::XlaOp sum =
+      dtype == DT_BOOL ? xla::Or(current, update) : xla::Add(current, update);
   return xla::DynamicUpdateSlice(operand, sum, start_indices);
 }
 
@@ -165,13 +166,10 @@ class TensorArrayOp : public XlaOpKernel {
       value = xla::Broadcast(zero, ta_shape.dim_sizes());
     }
 
-    XlaContext& xc = XlaContext::Get(ctx);
-    XlaResource* var;
-    string name = absl::StrCat("TensorArray: ", tensor_array_name_);
-    OP_REQUIRES_OK(
-        ctx, xc.CreateResource(XlaResource::kTensorArray, -1, std::move(name),
-                               dtype_, shape, value, /*tensor_array_size=*/size,
-                               /*tensor_array_gradients=*/{}, &var));
+    XlaResource* var =
+        ctx->xla_context()->AddResource(XlaResource::CreateTensorArray(
+            /*name=*/absl::StrCat("TensorArray: ", tensor_array_name_), dtype_,
+            shape, /*initial_value=*/value, /*max_array_size=*/size));
     ctx->SetResourceOutput(0, var);
 
     Tensor flow(DT_FLOAT, TensorShape({}));
@@ -187,7 +185,7 @@ class TensorArrayOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp);
 };
 
-REGISTER_XLA_OP(Name("TensorArrayV3").CompileTimeConstInput("size"),
+REGISTER_XLA_OP(Name("TensorArrayV3").CompileTimeConstantInput("size"),
                 TensorArrayOp);
 
 class TensorArrayWriteOp : public XlaOpKernel {
@@ -222,9 +220,16 @@ class TensorArrayWriteOp : public XlaOpKernel {
     slice_shape.InsertDim(0, 1LL);
     auto update = xla::Reshape(value, slice_shape.dim_sizes());
 
-    xla::XlaOp written =
-        DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices);
-
+    xla::XlaOp written;
+    if (resource->tensor_array_multiple_writes_aggregate()) {
+      written = DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(),
+                                start_indices, dtype_);
+    } else {
+      // TODO(b/117569591): Ideally we would report an error in the case that we
+      // see multiple writes to the same offset. Unfortunately there is no way
+      // to report errors at the moment, so we silently overwrite.
+      written = xla::DynamicUpdateSlice(ta, update, start_indices);
+    }
     OP_REQUIRES_OK(ctx, resource->SetValue(written));
     ctx->SetOutput(0, flow);
   }
@@ -391,7 +396,11 @@ class TensorArrayScatterOp : public XlaOpKernel {
     }
 
     if (scatter_all_elements_in_order) {
-      ta = xla::Add(ta, value);
+      if (dtype_ == DT_BOOL) {
+        ta = xla::Or(ta, value);
+      } else {
+        ta = xla::Add(ta, value);
+      }
     } else {
       auto slice_dims = value_shape.dim_sizes();
       slice_dims[0] = 1LL;
@@ -414,7 +423,7 @@ class TensorArrayScatterOp : public XlaOpKernel {
         auto start_indices =
             xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
                      xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
-        ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
+        ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices, dtype_);
       }
     }
 
@@ -505,14 +514,13 @@ class TensorArraySplitOp : public XlaOpKernel {
     xla::XlaOp ta = resource->value();
 
     TensorShape ta_shape;
-    ta_shape.AddDim(resource->tensor_array_size());
+    ta_shape.AddDim(resource->max_array_size());
     ta_shape.AppendShape(elem_shape);
 
-    OP_REQUIRES(
-        ctx, lengths.size() == resource->tensor_array_size(),
-        errors::InvalidArgument(
-            "TensorArray's size is not equal to the size of lengths (",
-            lengths.size(), " vs. ", resource->tensor_array_size(), ")"));
+    OP_REQUIRES(ctx, lengths.size() == resource->max_array_size(),
+                errors::InvalidArgument(
+                    "TensorArray's size is not equal to the size of lengths (",
+                    lengths.size(), " vs. ", resource->max_array_size(), ")"));
 
     const xla::XlaOp value = ctx->Input(1);
     const xla::XlaOp flow = ctx->Input(3);
@@ -522,8 +530,13 @@ class TensorArraySplitOp : public XlaOpKernel {
                                         value_shape.DebugString(), " vs. ",
                                         ta_shape.DebugString()));
 
-    OP_REQUIRES_OK(ctx, resource->SetValue(xla::Add(
-                            ta, xla::Reshape(value, ta_shape.dim_sizes()))));
+    const xla::XlaOp reshape = xla::Reshape(value, ta_shape.dim_sizes());
+    if (dtype_ == DT_BOOL) {
+      ta = xla::Or(ta, reshape);
+    } else {
+      ta = xla::Add(ta, reshape);
+    }
+    OP_REQUIRES_OK(ctx, resource->SetValue(ta));
 
     ctx->SetOutput(0, flow);
   }
@@ -534,7 +547,7 @@ class TensorArraySplitOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(TensorArraySplitOp);
 };
 
-REGISTER_XLA_OP(Name("TensorArraySplitV3").CompileTimeConstInput("lengths"),
+REGISTER_XLA_OP(Name("TensorArraySplitV3").CompileTimeConstantInput("lengths"),
                 TensorArraySplitOp);
 
 class TensorArraySizeOp : public XlaOpKernel {
@@ -545,8 +558,7 @@ class TensorArraySizeOp : public XlaOpKernel {
     XlaResource* var;
     OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &var));
     Tensor size_tensor(DT_INT32, {});
-    size_tensor.scalar<int32>()() =
-        static_cast<int32>(var->tensor_array_size());
+    size_tensor.scalar<int32>()() = static_cast<int32>(var->max_array_size());
     ctx->SetConstantOutput(0, size_tensor);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
index 74d4fcc425bdadb70a7bedf2487deaf6c4a4f7b9..64a24703ae1460abfedb6d9298e1e164076a199a 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
@@ -79,8 +79,8 @@ class TensorListReserveOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("TensorListReserve")
-                    .CompileTimeConstInput("element_shape")
-                    .CompileTimeConstInput("num_elements"),
+                    .CompileTimeConstantInput("element_shape")
+                    .CompileTimeConstantInput("num_elements"),
                 TensorListReserveOp);
 
 class EmptyTensorListOp : public XlaOpKernel {
diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
index 93d5996b5eaf10221b1d7067e7650b78cd6b8fef..e1c764f3d5c28cf0d812519e4a16786e1f2d3a3a 100644
--- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
@@ -16,7 +16,9 @@ limitations under the License.
 // XLA-specific Tile Op.
 
 #include <vector>
+#include "absl/algorithm/container.h"
 #include "absl/types/span.h"
+#include "tensorflow/compiler/tf2xla/lib/broadcast.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -25,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/type_index.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/macros.h"
@@ -38,11 +41,11 @@ class TileOp : public XlaOpKernel {
   explicit TileOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
-    const TensorShape multiples_shape = ctx->InputShape(1);
+    const TensorShape input_shape = ctx->InputShape("input");
+    const TensorShape multiples_shape = ctx->InputShape("multiples");
 
     OP_REQUIRES(
-        ctx, IsLegacyVector(multiples_shape),
+        ctx, TensorShapeUtils::IsVector(multiples_shape),
         errors::InvalidArgument("Expected multiples to be 1-D, but got shape ",
                                 multiples_shape.DebugString()));
     OP_REQUIRES(ctx, input_shape.dims() == multiples_shape.num_elements(),
@@ -51,78 +54,64 @@ class TileOp : public XlaOpKernel {
                     input_shape.dims(), " but got length ",
                     multiples_shape.dim_size(0)));
     const int input_dims = input_shape.dims();
-
+    auto input = ctx->Input(0);
     // If input is a scalar then multiples has 0 elements and this is
     // a NoOp.
     if (input_dims == 0) {
-      ctx->SetOutput(0, ctx->Input(0));
+      ctx->SetOutput(0, input);
       return;
     }
 
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &literal));
-
-    // zero_element_result is true if the final shape has 0 elements,
-    // i.e. if any of the input dimensions or multiples is zero.
-    std::vector<int64> multiples_array(input_dims);
-    std::vector<int64> output_shape;
-    bool all_multiples_are_one = true;
-    bool one_dimension_is_broadcasted_without_multiple = true;
-    for (int i = 0; i < input_dims; ++i) {
-      int multiple = literal.Get<int>({i});
-      OP_REQUIRES(ctx, multiple >= 0,
+    std::vector<int64> multiples;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("multiples", &multiples));
+    std::vector<int64> output_dims(input_shape.dims());
+    for (int64 i = 0; i < input_shape.dims(); ++i) {
+      OP_REQUIRES(ctx, multiples[i] >= 0,
                   errors::InvalidArgument("Expected multiples[", i,
-                                          "] >= 0, but got ", multiple));
-      int64 new_dim = input_shape.dim_size(i) * multiple;
-      output_shape.push_back(new_dim);
-      multiples_array[i] = multiple;
-      all_multiples_are_one = all_multiples_are_one && multiple == 1;
-      // If the multiple of a non-one dimensions is not one, then binary
-      // operation broadcast semantics will not be sufficient to implement the
-      // tile operation.
-      one_dimension_is_broadcasted_without_multiple =
-          one_dimension_is_broadcasted_without_multiple &&
-          ((input_shape.dim_size(i) > 1 && multiple == 1) ||
-           input_shape.dim_size(i) == 1);
+                                          "] >= 0, but got ", output_dims[i]));
+      output_dims[i] = input_shape.dim_size(i) * multiples[i];
     }
-    auto input = ctx->Input(0);
+
     // If all multiples are 1, than the input is the same as the output.
-    if (all_multiples_are_one) {
+    if (absl::c_all_of(multiples,
+                       [](int64 multiple) { return multiple == 1; })) {
       ctx->SetOutput(0, input);
       return;
     }
-    if (one_dimension_is_broadcasted_without_multiple) {
+
+    bool can_tile_with_implicit_broadcast = true;
+    for (int i = 0; i < input_dims; ++i) {
+      int64 multiple = multiples[i];
+      // If the multiple and input dimension are not 1, then tile cannot be
+      // implemented with a single hlo broadcast.
+      if (multiple != 1 && input_shape.dim_size(i) != 1) {
+        can_tile_with_implicit_broadcast = false;
+      }
+    }
+
+    if (can_tile_with_implicit_broadcast) {
       // Create a constant Zero the size of the output shape to leverage binary
       // operation broadcast semantics.
       auto broadcasted_zero = xla::Broadcast(
-          XlaHelpers::Zero(ctx->builder(), ctx->input_type(0)), output_shape);
-      ctx->SetOutput(0, xla::Add(broadcasted_zero, input));
+          XlaHelpers::Zero(ctx->builder(), ctx->input_type(0)), output_dims);
+      if (ctx->input_type(0) == DT_BOOL) {
+        ctx->SetOutput(0, xla::Or(broadcasted_zero, input));
+      } else {
+        ctx->SetOutput(0, xla::Add(broadcasted_zero, input));
+      }
       return;
     }
 
-    // First broadcast the requisite number of multiples along each
-    // dimension. This prepends the broadcasted dimensions, so an
-    // input of shape [2,3,1] broadcast with multiples [5,4,3] will
-    // end up with shape [5,4,3,2,3,1].
-    auto broadcasted = xla::Broadcast(input, multiples_array);
-    // Now flatten and reshape. The broadcasted dimensions are
-    // paired with the original dimensions so in the above example
-    // we flatten [0,3,1,4,2,5] then reshape to [10,12,3].
-    std::vector<int64> flattened;
-    for (int i = 0; i < output_shape.size(); ++i) {
-      flattened.push_back(i);
-      flattened.push_back(i + output_shape.size());
-    }
-    xla::XlaOp output = xla::Reshape(broadcasted, flattened, output_shape);
-
-    ctx->SetOutput(0, output);
+    auto result = BroadcastTo(ctx->Input("input"), output_dims);
+    OP_REQUIRES_OK(ctx, result.status());
+    ctx->SetOutput(0, result.ValueOrDie());
   }
 
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(TileOp);
 };
 
-REGISTER_XLA_OP(Name("Tile").CompileTimeConstInput("multiples"), TileOp);
+REGISTER_XLA_OP(Name("Tile").CompileTimeConstantInput("multiples"), TileOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
index 183879c7602ccbbd74fca6cb9fa3fc94c066c37d..ee3bdf3394e37c757f31724e73e95417becaa534 100644
--- a/tensorflow/compiler/tf2xla/kernels/topk_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -59,7 +58,7 @@ class TopKOp : public XlaOpKernel {
   bool sorted_;
 };
 
-REGISTER_XLA_OP(Name("TopKV2").CompileTimeConstInput("k").TypeConstraint(
+REGISTER_XLA_OP(Name("TopKV2").CompileTimeConstantInput("k").TypeConstraint(
                     "T", {DT_UINT32, DT_INT32, DT_FLOAT, DT_BFLOAT16}),
                 TopKOp);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 7077c2e3a546e198bdb4ff944ea531f3158810f2..960c1462ceb8c00a2d6c96564f6c985fd1caef0f 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -320,9 +320,8 @@ class ResourceApplyAdagradDA : public XlaOpKernel {
     xla::XlaOp lr = ctx->Input(4);
     xla::XlaOp l1 = ctx->Input(5);
     xla::XlaOp l2 = ctx->Input(6);
-    xla::XlaBuilder* const b = ctx->builder();
     xla::XlaOp global_step =
-        XlaHelpers::ConvertElementType(b, ctx->Input(7), dtype_);
+        XlaHelpers::ConvertElementType(ctx->Input(7), dtype_);
 
     accum = accum + grad;
     squared_accum = squared_accum + xla::Square(grad);
diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
index 6b303b31d43ce2249a87f25723caf34f84c8387d..c9b324a243e4cc3ec64daa3ca0d285336a0d0154 100644
--- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
@@ -37,8 +37,8 @@ class TransposeOp : public XlaOpKernel {
       : XlaOpKernel(ctx), conjugate_(conjugate) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape input_shape = ctx->InputShape(0);
-    const TensorShape perm_tensor_shape = ctx->InputShape(1);
+    const TensorShape input_shape = ctx->InputShape("x");
+    const TensorShape perm_tensor_shape = ctx->InputShape("perm");
 
     // Preliminary validation of sizes.
     OP_REQUIRES(ctx, TensorShapeUtils::IsVector(perm_tensor_shape),
@@ -52,19 +52,15 @@ class TransposeOp : public XlaOpKernel {
                                         ". But input(1) is a vector of size ",
                                         perm_tensor_shape.num_elements()));
 
-    xla::Literal literal;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {dims}, &literal));
-
-    std::vector<int32> perm(dims);
-    std::copy(literal.data<int32>().begin(), literal.data<int32>().end(),
-              perm.begin());
+    std::vector<int64> perm;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("perm", &perm));
 
     std::vector<int64> transposed_order;
     // Check whether permutation is a permutation of integers of [0 .. dims).
     absl::InlinedVector<bool, 8> bits(dims);
     bool is_identity = true;
     for (int i = 0; i < dims; ++i) {
-      const int32 d = perm[i];
+      const int64 d = perm[i];
       OP_REQUIRES(
           ctx, 0 <= d && d < dims,
           errors::InvalidArgument(d, " is out of range [0 .. ", dims, ")"));
@@ -83,9 +79,9 @@ class TransposeOp : public XlaOpKernel {
     xla::XlaOp transposed;
     // 0-D, 1-D, and identity transposes do nothing.
     if (dims <= 1 || is_identity) {
-      transposed = ctx->Input(0);
+      transposed = ctx->Input("x");
     } else {
-      transposed = xla::Transpose(ctx->Input(0), transposed_order);
+      transposed = xla::Transpose(ctx->Input("x"), transposed_order);
     }
 
     // Conjugate the transposed result if this is ConjugateTransposeOp.
@@ -106,9 +102,10 @@ class ConjugateTransposeOp : public TransposeOp {
       : TransposeOp(ctx, /*conjugate=*/true) {}
 };
 
-REGISTER_XLA_OP(Name("Transpose").CompileTimeConstInput("perm"), TransposeOp);
+REGISTER_XLA_OP(Name("Transpose").CompileTimeConstantInput("perm"),
+                TransposeOp);
 
-REGISTER_XLA_OP(Name("ConjugateTranspose").CompileTimeConstInput("perm"),
+REGISTER_XLA_OP(Name("ConjugateTranspose").CompileTimeConstantInput("perm"),
                 ConjugateTransposeOp);
 
 // InvertPermutation frequently forms part of the gradient of Transpose.
@@ -153,7 +150,7 @@ class InvertPermutationOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("InvertPermutation")
                     .TypeConstraint("T", DT_INT32)
-                    .CompileTimeConstInput("x"),
+                    .CompileTimeConstantInput("x"),
                 InvertPermutationOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index 0bdfc05726105e2d18362a691cbe2aab00bf77f3..a0ea6422d732b00fc1b8cf855d9c9ad603b87c82 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -80,24 +80,8 @@ XLAJIT_MAKE_UNARY(Invert, xla::Not(x));
 XLAJIT_MAKE_UNARY(LogicalNot, xla::Not(x));
 XLAJIT_MAKE_UNARY(Neg, -x);
 
-// Implements Banker's rounding: numbers that are equidistant between two
-// integers are rounded towards even.
-xla::XlaOp RoundToEven(xla::XlaOp x) {
-  auto half = xla::ScalarLike(x, 0.5);
-  auto one = xla::ScalarLike(x, 1.0);
-  auto two = xla::ScalarLike(x, 2.0);
-
-  auto round_val = xla::Floor(x);
-  auto fraction = x - round_val;
-  auto nearest_even_int = round_val - two * xla::Floor(half * x);
-  auto is_odd = xla::Eq(nearest_even_int, one);
-  return xla::Select(xla::Or(xla::Gt(fraction, half),
-                             xla::And(xla::Eq(fraction, half), is_odd)),
-                     round_val + one, round_val);
-}
-
-XLAJIT_MAKE_UNARY(Rint, RoundToEven(x));
-XLAJIT_MAKE_UNARY(Round, RoundToEven(x));
+XLAJIT_MAKE_UNARY(Rint, xla::RoundToEven(x));
+XLAJIT_MAKE_UNARY(Round, xla::RoundToEven(x));
 
 XLAJIT_MAKE_UNARY(Rsqrt, xla::Rsqrt(x));
 
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index 559414eeaa5fec75e5a9d1866baaf738c024cd15..ce007fc04a818869686b9936a1607cee42665e87 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -64,7 +64,7 @@ Status MakeXlaCompilerArgumentsFromInputs(
       if (!arg.initialized) {
         *has_uninitialized_vars = true;
       }
-      arg.tensor_array_size = resource->tensor_array_size();
+      arg.max_array_size = resource->max_array_size();
       for (const auto& gradient : resource->tensor_array_gradients()) {
         arg.tensor_array_gradients.insert(gradient.first);
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
index 412afeaaad96842521fbd306f5b666e837e675fd..ad8e707e1116d01d492575986a7ab9586022f6b3 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
@@ -89,13 +89,10 @@ class XlaBroadcastHelperOp : public XlaOpKernel {
               lhs_shape.DebugString(), " and ", rhs_shape.DebugString()));
       broadcast_shape[dim] = min_rank_shape->dim_size(i);
     }
-    xla::PrimitiveType type = context->input_xla_type(0);
-    xla::Shape broadcast_xla_shape =
-        xla::ShapeUtil::MakeShape(type, broadcast_shape);
     if (broadcast_lhs) {
-      lhs = xla::BroadcastInDim(lhs, broadcast_xla_shape, broadcast_dims);
+      lhs = xla::BroadcastInDim(lhs, broadcast_shape, broadcast_dims);
     } else {
-      rhs = xla::BroadcastInDim(rhs, broadcast_xla_shape, broadcast_dims);
+      rhs = xla::BroadcastInDim(rhs, broadcast_shape, broadcast_dims);
     }
     context->SetOutput(0, lhs);
     context->SetOutput(1, rhs);
@@ -108,7 +105,7 @@ class XlaBroadcastHelperOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(
-    Name("XlaBroadcastHelper").CompileTimeConstInput("broadcast_dims"),
+    Name("XlaBroadcastHelper").CompileTimeConstantInput("broadcast_dims"),
     XlaBroadcastHelperOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
index fecc7c556eb4121b912796e5811632c46769b479..4612f19971a3ce6994aef303f751748b77ccda9a 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
@@ -90,11 +90,11 @@ class XlaConvOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("XlaConv")
-                    .CompileTimeConstInput("window_strides")
-                    .CompileTimeConstInput("lhs_dilation")
-                    .CompileTimeConstInput("rhs_dilation")
-                    .CompileTimeConstInput("feature_group_count")
-                    .CompileTimeConstInput("padding"),
+                    .CompileTimeConstantInput("window_strides")
+                    .CompileTimeConstantInput("lhs_dilation")
+                    .CompileTimeConstantInput("rhs_dilation")
+                    .CompileTimeConstantInput("feature_group_count")
+                    .CompileTimeConstantInput("padding"),
                 XlaConvOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc
index 59502d83c7338bd1b05b3323a97761fff2da186a..a3c2eef993c80e43e7cf9e1f6147e5b337c41cfe 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc
@@ -96,9 +96,9 @@ class XlaPadOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("XlaPad")
-                    .CompileTimeConstInput("padding_low")
-                    .CompileTimeConstInput("padding_high")
-                    .CompileTimeConstInput("padding_interior"),
+                    .CompileTimeConstantInput("padding_low")
+                    .CompileTimeConstantInput("padding_high")
+                    .CompileTimeConstantInput("padding_interior"),
                 XlaPadOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc
index 089776fcf74fcf6b363dfff5de8d86d7449eacd6..9043af995386a179f74d95bbc6c17a1cac7881cd 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_select_and_scatter_op.cc
@@ -138,9 +138,9 @@ class XlaSelectAndScatterOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("XlaSelectAndScatter")
-                    .CompileTimeConstInput("window_dimensions")
-                    .CompileTimeConstInput("window_strides")
-                    .CompileTimeConstInput("padding"),
+                    .CompileTimeConstantInput("window_dimensions")
+                    .CompileTimeConstantInput("window_strides")
+                    .CompileTimeConstantInput("padding"),
                 XlaSelectAndScatterOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 8597e7f139d8d32b7e08782e70a4ee44d02618f2..3e7a761120317ff85947559b7b2e52be9232afb7 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -18,16 +18,18 @@ filegroup(
 load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 
 cc_library(
-    name = "batch_dot",
-    srcs = ["batch_dot.cc"],
-    hdrs = ["batch_dot.h"],
+    name = "broadcast",
+    srcs = ["broadcast.cc"],
+    hdrs = ["broadcast.h"],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -36,8 +38,6 @@ cc_library(
     srcs = ["cholesky.cc"],
     hdrs = ["cholesky.h"],
     deps = [
-        ":batch_dot",
-        ":triangular_solve",
         ":util",
         ":while_loop",
         "//tensorflow/compiler/xla:literal",
@@ -47,6 +47,9 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/compiler/xla/client/lib:triangular_solve",
         "//tensorflow/core:lib",
     ],
 )
@@ -71,7 +74,6 @@ cc_library(
     srcs = ["qr.cc"],
     hdrs = ["qr.h"],
     deps = [
-        ":batch_dot",
         ":util",
         ":while_loop",
         "//tensorflow/compiler/xla:literal_util",
@@ -83,7 +85,8 @@ cc_library(
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:numeric",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
         "//tensorflow/core:lib",
     ],
 )
@@ -108,51 +111,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "triangular_solve",
-    srcs = ["triangular_solve.cc"],
-    hdrs = ["triangular_solve.h"],
-    deps = [
-        ":batch_dot",
-        ":util",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
-        "//tensorflow/core:lib",
-    ],
-)
-
-xla_test(
-    name = "triangular_solve_test",
-    srcs = ["triangular_solve_test.cc"],
-    tags = ["noasan"],  # sometimes times out, http://b/78650012
-    deps = [
-        ":triangular_solve",
-        "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-    ],
-)
-
 cc_library(
     name = "util",
     srcs = ["util.cc"],
@@ -171,29 +129,6 @@ cc_library(
     ],
 )
 
-xla_test(
-    name = "util_test",
-    srcs = ["util_test.cc"],
-    deps = [
-        ":batch_dot",
-        ":util",
-        "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-    ],
-)
-
 cc_library(
     name = "while_loop",
     srcs = ["while_loop.cc"],
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
deleted file mode 100644
index 5400e8834cb9807f6dd71abe7789b2672e29e905..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-
-#include <memory>
-#include <vector>
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace tensorflow {
-
-xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x,
-                    bool transpose_y, bool conjugate_x, bool conjugate_y,
-                    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x));
-    TF_ASSIGN_OR_RETURN(xla::Shape y_shape, builder->GetShape(y));
-
-    // Check that both tensors have the same number of dimensions. There must be
-    // at least two (the batch dimensions can be empty).
-    if (xla::ShapeUtil::Rank(x_shape) != xla::ShapeUtil::Rank(y_shape)) {
-      return errors::InvalidArgument(
-          "Arguments to BatchedDot have different ranks: ",
-          xla::ShapeUtil::HumanString(x_shape), " vs. ",
-          xla::ShapeUtil::HumanString(y_shape));
-    }
-    const int ndims = xla::ShapeUtil::Rank(x_shape);
-    if (ndims < 2) {
-      return errors::InvalidArgument(
-          "Arguments to BatchedDot must have rank >= 2: ", ndims);
-    }
-
-    // The batch dimensions must be equal and the matrix dimensions must be
-    // valid.
-    std::vector<int64> batch_dimension_numbers;
-    for (int i = 0; i < ndims - 2; ++i) {
-      if (x_shape.dimensions(i) != y_shape.dimensions(i)) {
-        return errors::InvalidArgument(
-            "Dimension ", i, " of inputs to BatchedDot must be equal: ",
-            xla::ShapeUtil::HumanString(x_shape), " vs ",
-            xla::ShapeUtil::HumanString(y_shape));
-      }
-      batch_dimension_numbers.push_back(i);
-    }
-
-    int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1);
-    int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2);
-    if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) {
-      return errors::InvalidArgument(
-          "Dimensions ", x_inner_dim, " and ", y_inner_dim,
-          " of arguments to BatchedDot must be equal: ",
-          xla::ShapeUtil::HumanString(x_shape), " transpose: ", transpose_x,
-          " vs. ", xla::ShapeUtil::HumanString(y_shape),
-          " transpose: ", transpose_y);
-    }
-
-    // Check for zero lhs/rhs dim size.
-    if (xla::ShapeUtil::IsZeroElementArray(x_shape) ||
-        xla::ShapeUtil::IsZeroElementArray(y_shape)) {
-      std::vector<int64> dimensions(batch_dimension_numbers.size());
-      for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
-        dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]);
-      }
-      int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
-      int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
-      dimensions.push_back(x_shape.dimensions(x_outer_dim));
-      dimensions.push_back(y_shape.dimensions(y_outer_dim));
-      return xla::Broadcast(
-          xla::ConstantLiteral(builder,
-                               xla::LiteralUtil::Zero(x_shape.element_type())),
-          dimensions);
-    }
-
-    if (x_shape.element_type() == xla::C64 && conjugate_x) {
-      x = xla::Conj(x);
-    }
-    if (y_shape.element_type() == xla::C64 && conjugate_y) {
-      y = xla::Conj(y);
-    }
-
-    xla::PrecisionConfig precision_proto;
-    precision_proto.add_operand_precision(precision);
-    precision_proto.add_operand_precision(precision);
-
-    xla::DotDimensionNumbers dot_dnums;
-    dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
-    dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
-    for (auto batch_dimension_number : batch_dimension_numbers) {
-      dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
-      dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
-    }
-
-    return xla::DotGeneral(x, y, dot_dnums, &precision_proto);
-  });
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
deleted file mode 100644
index 6edd63a4d3b66c21aa4cce8c9f36eef0dc363cd8..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace tensorflow {
-
-// Multiplies slices of two tensors in batches.
-
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be transposed before multiplication by
-// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each
-// can be elementwise-complex-conjugated by setting the `conjugate_x` or
-// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both
-// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if transpose_x else r_x
-//     c_o = r_y if transpose_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-xla::XlaOp BatchDot(
-    xla::XlaOp x, xla::XlaOp y, bool transpose_x = false,
-    bool transpose_y = false, bool conjugate_x = false,
-    bool conjugate_y = false,
-    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
diff --git a/tensorflow/compiler/tf2xla/lib/broadcast.cc b/tensorflow/compiler/tf2xla/lib/broadcast.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be31f116686a2e302ece730e9d03312a45888a61
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/broadcast.cc
@@ -0,0 +1,91 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/broadcast.h"
+
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace tensorflow {
+
+xla::StatusOr<xla::XlaOp> BroadcastTo(xla::XlaOp input,
+                                      absl::Span<int64 const> output_dims) {
+  xla::XlaBuilder* builder = input.builder();
+  TF_ASSIGN_OR_RETURN(xla::Shape input_shape, builder->GetShape(input));
+  absl::Span<int64 const> input_dims =
+      xla::AsInt64Slice(input_shape.dimensions());
+
+  if (input_dims == output_dims) {
+    return input;
+  }
+
+  if (input_dims.size() > output_dims.size()) {
+    return errors::InvalidArgument(
+        "Input shape (", xla::ShapeUtil::HumanString(input_shape),
+        ") must have rank less than or equal to the output shape [",
+        absl::StrJoin(output_dims, ","), "]");
+  }
+
+  std::vector<int64> broadcast_dims;
+  std::vector<int64> broadcast_shape;
+  auto input_it = input_dims.rbegin();
+  for (auto output_it = output_dims.rbegin(); output_it != output_dims.rend();
+       ++output_it) {
+    if (input_it != input_dims.rend()) {
+      if (!(*output_it == 0 && *input_it == 0) &&
+          !(*input_it != 0 && *output_it % *input_it == 0)) {
+        return errors::InvalidArgument("Invalid shape broadcast from ",
+                                       xla::ShapeUtil::HumanString(input_shape),
+                                       " to [", absl::StrJoin(output_dims, ","),
+                                       "]");
+      }
+
+      broadcast_dims.push_back(broadcast_shape.size());
+      if (*output_it == *input_it) {
+        broadcast_shape.push_back(*output_it);
+      } else if (*output_it != *input_it) {
+        // Add dimensions [I, O/I], which we will later flatten to just
+        // [O]. We must do this in two phases since XLA broadcasting does not
+        // support tiling.
+        broadcast_shape.push_back(*input_it);
+        broadcast_shape.push_back(*output_it / *input_it);
+      }
+      ++input_it;
+    } else {
+      broadcast_shape.push_back(*output_it);
+    }
+  }
+  TF_RET_CHECK(input_it == input_dims.rend());
+
+  absl::c_reverse(broadcast_dims);
+  int broadcast_shape_size = broadcast_shape.size();
+  for (int64& broadcast_dim : broadcast_dims) {
+    broadcast_dim = broadcast_shape_size - broadcast_dim - 1;
+  }
+  absl::c_reverse(broadcast_shape);
+  xla::XlaOp output =
+      xla::BroadcastInDim(input, broadcast_shape, broadcast_dims);
+  if (broadcast_shape != output_dims) {
+    output = xla::Reshape(output, output_dims);
+  }
+  return output;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/broadcast.h b/tensorflow/compiler/tf2xla/lib/broadcast.h
new file mode 100644
index 0000000000000000000000000000000000000000..591e696f06b994a7fdea58bc95ba785f683ce7d1
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/broadcast.h
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BROADCAST_H_
+#define TENSORFLOW_COMPILER_TF2XLA_LIB_BROADCAST_H_
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace tensorflow {
+
+// Broadcasts 'input' up to shape 'output_dims', using TensorFlow broadcasting
+// rules. Supports broadcasting a dimension of size x to size x*y, i.e., tiling.
+xla::StatusOr<xla::XlaOp> BroadcastTo(xla::XlaOp input,
+                                      absl::Span<int64 const> output_dims);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_BROADCAST_H_
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index ab3d0a566839343828d176d9a46672824e425613..550ab5b05693b79e60e49577309328ac6846d3f9 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -18,11 +18,12 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -101,10 +102,7 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
       // a[..., i, i]
       auto a_ii = DynamicSliceInMinorDims(body_a, {i, i}, {1, 1});
       // np.dot(row, np.swapaxes(row, -1, -2))
-      auto diag_dot = BatchDot(row, row,
-                               /*transpose_x=*/false,
-                               /*transpose_y=*/true, /*conjugate_x=*/false,
-                               /*conjugate_y=*/false, precision);
+      auto diag_dot = BatchDot(row, TransposeInMinorDims(row), precision);
       // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row,
       //                                              np.swapaxes(row, -1, -2)))
       auto l_ii =
@@ -122,10 +120,7 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
       // The columns in [i, n] are zeroed out in `row`, so we just have to
       // zero out rows above i+1 after the BatchDot. np.dot(l[..., :, :i],
       // r.T)
-      auto dot = BatchDot(body_l, row,
-                          /*transpose_x=*/false,
-                          /*transpose_y=*/true, /*conjugate_x=*/false,
-                          /*conjugate_y=*/false, precision);
+      auto dot = BatchDot(body_l, TransposeInMinorDims(row), precision);
       // np.dot(l[..., i+1:, :i], r.T)
       auto dot_ip1 =
           xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, dot);
@@ -185,9 +180,7 @@ xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size,
         // a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i]))
         auto lhs = SliceInMinorDims(l, {i, 0}, {n, i});
         auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i});
-        auto delta = BatchDot(lhs, rhs, /*transpose_x=*/false,
-                              /*transpose_y=*/true, /*conjugate_x=*/false,
-                              /*conjugate_y=*/false, precision);
+        auto delta = BatchDot(lhs, TransposeInMinorDims(rhs), precision);
         auto before = SliceInMinorDims(a, {i, i}, {n, i + k});
         a = UpdateSliceInMinorDims(a, before - delta, {i, i});
       }
diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/tf2xla/lib/qr.cc
index 6b3f2b6e065b5c99e2d0248237369ecc30188aa5..d6007748609fdd161cb89692a167eb7ed12fe00c 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.cc
+++ b/tensorflow/compiler/tf2xla/lib/qr.cc
@@ -18,13 +18,13 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -191,12 +191,8 @@ xla::StatusOr<QRBlockResult> QRBlock(
     auto v_broadcast = xla::Reshape(v, shape);
     // a[:, :] -= tau * np.dot(v[:, np.newaxis],
     //                          np.dot(v[np.newaxis, :], a[:, :]))
-    auto vva =
-        BatchDot(v_broadcast, a, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    vva =
-        BatchDot(v_broadcast, vva, /*transpose_x=*/true, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto vva = BatchDot(v_broadcast, a, precision);
+    vva = BatchDot(TransposeInMinorDims(v_broadcast), vva, precision);
     a = a - xla::Mul(tau, vva,
                      /*broadcast_dimensions=*/batch_dim_indices);
 
@@ -278,12 +274,9 @@ xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
     auto beta = DynamicSliceInMinorDims(taus, {j}, {1});
 
     // yv has shape [..., n, 1]
-    auto yv = BatchDot(y, v, /*transpose_x=*/true, /*transpose_y=*/false,
-                       /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto yv = BatchDot(TransposeInMinorDims(y), v, precision);
     // wyv has shape [..., m, 1]
-    auto wyv =
-        BatchDot(w, yv, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto wyv = BatchDot(w, yv, precision);
 
     auto z = xla::Mul(
         -beta, v + wyv,
@@ -375,23 +368,15 @@ xla::StatusOr<QRDecompositionResult> QRDecomposition(
 
     // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:]))
     auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n});
-    auto a_update =
-        BatchDot(w, a_panel, /*transpose_x=*/true, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    a_update =
-        BatchDot(y, a_update, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto a_update = BatchDot(TransposeInMinorDims(w), a_panel, precision);
+    a_update = BatchDot(y, a_update, precision);
     a_panel = a_panel + a_update;
     a = UpdateSliceInMinorDims(a, a_panel, {i, i + k});
 
     // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T))
     auto q_panel = SliceInMinorDims(q, {0, i}, {m, m});
-    auto q_update =
-        BatchDot(q_panel, w, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    q_update = BatchDot(q_update, y, /*transpose_x=*/false,
-                        /*transpose_y=*/true, /*conjugate_x=*/false,
-                        /*conjugate_y=*/false, precision);
+    auto q_update = BatchDot(q_panel, w, precision);
+    q_update = BatchDot(q_update, TransposeInMinorDims(y), precision);
     q_panel = q_panel + q_update;
     q = UpdateSliceInMinorDims(q, q_panel, {0, i});
   }
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
deleted file mode 100644
index 6524c2a9b1ada632d80edd234272760c2b545cc4..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ /dev/null
@@ -1,416 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
-
-#include <memory>
-#include <vector>
-
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/math/math_util.h"
-
-namespace tensorflow {
-
-// Get the diagonal blocks of the coefficient matrix
-xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(a));
-    int ndims = xla::ShapeUtil::Rank(shape);
-    int64 n = xla::ShapeUtil::GetDimension(shape, -1);
-    int64 num_blocks = n / block_size;
-
-    xla::XlaOp diag_blocks;
-
-    // If the coefficient matrix is exactly the block size, we just add a
-    // singleton dimension i.e. [..., n, n] -> [..., 1, n, n]
-    if (n == block_size) {
-      std::vector<int64> permutation(ndims);
-      std::iota(permutation.begin(), permutation.end(), 1);
-      permutation.insert(permutation.end() - 2, 0);
-      return Transpose(Broadcast(a, /*broadcast_sizes=*/{1}), permutation);
-    }
-
-    // We can grab entire blocks using gather
-    if (n > block_size) {
-      // Construct the starting indices of the diagonal blocks
-      auto start_indices =
-          Transpose(Broadcast(Mul(Iota(builder, xla::S32, num_blocks),
-                                  xla::ConstantR0<int32>(builder, block_size)),
-                              /*broadcast_sizes=*/{2}),
-                    /*permutation=*/{1, 0});
-
-      // Gather the diagonal blocks
-      xla::GatherDimensionNumbers dim_numbers;
-      dim_numbers.add_offset_dims(ndims - 1);
-      dim_numbers.add_offset_dims(ndims);
-      dim_numbers.add_start_index_map(ndims - 2);
-      dim_numbers.add_start_index_map(ndims - 1);
-      dim_numbers.set_index_vector_dim(1);
-      diag_blocks = Gather(a, start_indices, dim_numbers,
-                           /*slice_sizes=*/{block_size, block_size});
-    }
-
-    // The last block might be smaller than the block size,
-    // so we will need to pad it
-    if (n % block_size != 0) {
-      // Pad with zeros
-      auto last_blocks =
-          SliceInMinorDims(a, {n - n % block_size, n - n % block_size}, {n, n});
-      xla::PaddingConfig config = xla::MakeNoPaddingConfig(ndims);
-      int64 padding = block_size - n % block_size;
-      config.mutable_dimensions(ndims - 1)->set_edge_padding_high(padding);
-      config.mutable_dimensions(ndims - 2)->set_edge_padding_high(padding);
-      last_blocks =
-          Pad(last_blocks, Zero(builder, shape.element_type()), config);
-
-      // Add a singleton dimension
-      // i.e. [..., block_size, block_size] -> [..., 1, block_size, block_size]
-      TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
-                          builder->GetShape(last_blocks));
-      auto shape_dims = xla::AsInt64Slice(blocks_shape.dimensions());
-      auto last_blocks_dims = std::vector<int64>(ndims);
-      std::copy(shape_dims.begin(), shape_dims.end(), last_blocks_dims.begin());
-      last_blocks_dims.insert(last_blocks_dims.end() - 2, 1);
-      last_blocks = Reshape(last_blocks, last_blocks_dims);
-
-      // Concatenate with the other blocks if necessary
-      if (n > block_size) {
-        diag_blocks =
-            xla::ConcatInDim(builder, {diag_blocks, last_blocks}, ndims - 2);
-      } else {
-        diag_blocks = last_blocks;
-      }
-    }
-
-    return diag_blocks;
-  });
-}
-
-xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
-                                bool transpose_a, bool conjugate_a,
-                                xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = diag_blocks.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    // Input is a batch of square lower triangular square matrices. Its shape is
-    // (..., size, size). We resize this to (num_blocks, size, size).
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(diag_blocks));
-    int64 block_size = xla::ShapeUtil::GetDimension(shape, -1);
-    int64 num_blocks = xla::ShapeUtil::ElementsIn(shape) /
-                       tensorflow::MathUtil::IPow(block_size, 2);
-    diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size});
-
-    // The input must be triangular because we rely on that when doing
-    // multiplications later on
-    diag_blocks = Triangle(diag_blocks, /*lower=*/lower);
-
-    // Rescale blocks to be unit triangular, but avoid dividing by
-    // zero (which can happen if the last block was padded) otherwise it will
-    // introduce nans which will propagate
-    auto diags = GetMatrixDiagonal(diag_blocks);
-    TF_ASSIGN_OR_RETURN(xla::Shape diags_shape, builder->GetShape(diags));
-    auto one = ScalarLike(diags, 1);
-    auto ones = Broadcast(one, xla::AsInt64Slice(diags_shape.dimensions()));
-    diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags);
-    auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2});
-
-    // We can now use the fact that for an upper triangular matrix
-    // [[L11, 0], [L21, L22]], given the inverses L11' and L22', we have
-    // L22' = -L22' * L21 * L11'. In our case, L21 is a vector and our blocks
-    // have been rescaled to be unit triangular, so L22 = L22' = 1.
-
-    // Initialize the output matrix with -1s on the diagonal. We use -1 instead
-    // of 1 because we cannot do matrix-vector multiplies with variable shapes
-    // inside of a loop, or do irregularly shaped in-place updates. Hence,
-    // L21 <- -L22 * L21 * L11 cannot be done naively. Instead, we update the
-    // entire row i.e. we calculate
-    // [L21 L22 0] <- -[L21 L22 0] @ diag_blocks([L11', -I, -I])
-    // which means [L21 L22 0] <- [-L21 * L11', L22, 0].
-    auto identity =
-        IdentityMatrix(builder, shape.element_type(), block_size, block_size);
-    auto neg_identity = -identity;
-
-    // The first or last  diagonal element should be set to 1 instead of -1
-    // though, since we never update it
-    auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1});
-    auto start_index = (lower) ? 0 : block_size - 1;
-    auto output_block = DynamicUpdateSlice(
-        neg_identity, pos_one,
-        /*start_indices=*/xla::ConstantR1<int>(builder, 2, start_index));
-
-    // Broadcast diag([1, -1, -1, ...]) to every block
-    xla::XlaOp output = Broadcast(output_block,
-                                  /*broadcast_sizes=*/{num_blocks});
-
-    // Now we construct a loop that performs matrix-vector multiplications
-    // inverting the blocks one row at a time
-    std::vector<xla::Shape> tuple_shapes = {
-        // The loop iteration counter is a scalar, incremented each iteration.
-        xla::ShapeUtil::MakeShape(xla::S32, {}),
-        // The output has the shape of A, with one row updated each iteration.
-        xla::ShapeUtil::MakeShape(shape.element_type(),
-                                  {num_blocks, block_size, block_size}),
-        // The input is a loop invariant.
-        xla::ShapeUtil::MakeShape(shape.element_type(),
-                                  {num_blocks, block_size, block_size})};
-    xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
-
-    auto init_i = One(builder, xla::S32);
-    auto init = xla::Tuple(builder, {init_i, output, scaled_diag_blocks});
-
-    // Construct the loop condition function.
-    std::unique_ptr<xla::XlaBuilder> condb =
-        builder->CreateSubBuilder("InvertDiagCond");
-    {
-      auto i = GetTupleElement(
-          Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0);
-      Lt(i, xla::ConstantR0<int32>(condb.get(), block_size));
-    }
-    TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
-
-    // Construct the loop body function.
-    std::unique_ptr<xla::XlaBuilder> bodyb =
-        builder->CreateSubBuilder("InvertDiagBody");
-    {
-      auto input_tuple =
-          Parameter(bodyb.get(), 0, tuple_shape, "InvertDiagBodyTuple");
-
-      auto i = GetTupleElement(input_tuple, 0);
-      auto body_out = GetTupleElement(input_tuple, 1);
-      auto body_input = GetTupleElement(input_tuple, 2);
-
-      auto zero = xla::ConstantR1<int32>(bodyb.get(), 1, 0);
-      auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i;
-      auto start_indices =
-          xla::ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
-      auto input_row =
-          DynamicSlice(body_input, start_indices,
-                       /*slice_sizes=*/{num_blocks, 1, block_size});
-
-      // We want -L21 L11^{-1}
-      xla::DotDimensionNumbers dnums;
-      dnums.add_lhs_batch_dimensions(0);
-      dnums.add_rhs_batch_dimensions(0);
-      dnums.add_lhs_contracting_dimensions(2);
-      dnums.add_rhs_contracting_dimensions(1);
-      xla::PrecisionConfig precision_proto;
-      precision_proto.add_operand_precision(precision);
-      precision_proto.add_operand_precision(precision);
-      auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto);
-
-      body_out = DynamicUpdateSlice(body_out, update, start_indices);
-
-      auto next_i = i + ScalarLike(i, 1);
-      xla::Tuple(bodyb.get(), {next_i, body_out, body_input});
-    }
-    TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
-
-    // Construct the While loop and return the result,
-    // return while_loop(cond_fun, body_fun, init)[1]
-    auto invert_while = While(cond, body, init);
-    auto inv_diag_blocks = GetTupleElement(invert_while, 1);
-
-    // Undo the scaling
-    inv_diag_blocks = Div(inv_diag_blocks, diags,
-                          /*broadcast_dimensions=*/{0, 1});
-
-    // Reshape back to original batch major dimensions
-    return Reshape(inv_diag_blocks, xla::AsInt64Slice(shape.dimensions()));
-  });
-}
-
-xla::XlaOp SolveWithInvertedDiagonalBlocks(
-    xla::XlaOp a, xla::XlaOp b, xla::XlaOp inv_diag_blocks, bool left_side,
-    bool lower, bool transpose_a, bool conjugate_a,
-    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
-                        builder->GetShape(inv_diag_blocks));
-    TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
-    int64 block_size = xla::ShapeUtil::GetDimension(blocks_shape, -1);
-
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    int64 ndims = xla::ShapeUtil::Rank(a_shape);
-    int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
-    int64 num_blocks = n / block_size + (n % block_size != 0);
-    int64 m_dim = (left_side) ? -1 : -2;
-    int64 m = xla::ShapeUtil::GetDimension(b_shape, m_dim);
-
-    // Initialize the solution
-    auto x = ZerosLike(b);
-
-    // This loop is unrolled for performance reasons, but it could be expressed
-    // rolled as well since the matrices are of the same size each iteration
-    for (int i = 0; i < num_blocks; i++) {
-      // High-level intuition: We have B[i] = L[i] @ X. Since L is upper
-      // triangular this means B[i] = L[i, :i + 1] @ X[:i + 1]. We can split
-      // this into two parts: B[i] = L[i, :i] @ X[:i] + L[i, i] @ X[i] which
-      // can be solved for X[i] as X[i] = inv(L[i, i]) @ B[i] - L[i, :i] @ X[:i]
-
-      // Decide whether we go from first block to last or vice versa
-      auto j = (left_side ^ lower ^ transpose_a) ? num_blocks - 1 - i : i;
-
-      // Get the size of the inverse blocks (the last one might be smaller)
-      int64 block = (n % block_size != 0 && j + 1 == num_blocks)
-                        ? n % block_size
-                        : block_size;
-      auto inv_block =
-          MaybeConjugate(Collapse(SliceInMinorDims(inv_diag_blocks, {j, 0, 0},
-                                                   {j + 1, block, block}),
-                                  /*dimensions=*/{ndims - 2, ndims - 1}),
-                         conjugate_a);
-
-      // Get the corresponding row of B
-      int64 k = std::min((j + 1) * block_size, n);
-      std::vector<int64> start = {j * block_size, 0};
-      std::vector<int64> end = {k, m};
-      if (!left_side) {
-        std::swap(start[0], start[1]);
-        std::swap(end[0], end[1]);
-      }
-      auto b_row = SliceInMinorDims(b, start, end);
-
-      xla::XlaOp remainder;
-      if (i == 0) {
-        remainder = b_row;
-      } else {
-        // This matrix multiply involves a lot of multiplying with zero (namely,
-        // X[i * block_size:] = 0), but this is faster than slicing...
-        end = {k, n};
-        if (!left_side) {
-          std::swap(end[0], end[1]);
-        }
-        if (transpose_a) {
-          std::swap(start[0], start[1]);
-          std::swap(end[0], end[1]);
-        }
-        auto a_row =
-            MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a);
-        if (left_side) {
-          remainder = b_row - BatchDot(a_row, x, transpose_a, false,
-                                       /*conjugate_x=*/false,
-                                       /*conjugate_y=*/false, precision);
-        } else {
-          remainder = b_row - BatchDot(x, a_row, false, transpose_a,
-                                       /*conjugate_x=*/false,
-                                       /*conjugate_y=*/false, precision);
-        }
-      }
-
-      xla::XlaOp x_update;
-      auto zero = Zero(builder, xla::S32);
-      auto start_index =
-          xla::ConstantR0WithType(builder, xla::S32, j * block_size);
-      std::vector<xla::XlaOp> update_starts = {start_index, zero};
-      if (left_side) {
-        x_update =
-            BatchDot(inv_block, remainder, transpose_a, false,
-                     /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-      } else {
-        x_update =
-            BatchDot(remainder, inv_block, false, transpose_a,
-                     /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-        std::swap(update_starts[0], update_starts[1]);
-      }
-      x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts);
-    }
-
-    return x;
-  });
-}
-
-xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
-                           bool lower, bool transpose_a, bool conjugate_a,
-                           int64 block_size,
-                           xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
-    if (xla::ShapeUtil::Rank(a_shape) != xla::ShapeUtil::Rank(b_shape)) {
-      return errors::InvalidArgument(
-          "Arguments to TriangularSolve have different ranks: ",
-          xla::ShapeUtil::HumanString(a_shape), " vs. ",
-          xla::ShapeUtil::HumanString(b_shape));
-    }
-    const int64 ndims = xla::ShapeUtil::Rank(a_shape);
-    if (ndims < 2) {
-      return errors::InvalidArgument(
-          "Arguments to TriangularSolve must have rank >= 2: ", ndims);
-    }
-    // The batch dimensions must be equal.
-    std::vector<int64> batch_dimensions;
-    for (int i = 0; i < ndims - 2; ++i) {
-      int64 a_size = a_shape.dimensions(i);
-      int64 b_size = b_shape.dimensions(i);
-      if (a_size != b_size) {
-        return errors::InvalidArgument(
-            "Batch dimensions of arguments to TriangularSolve must be equal: ",
-            xla::ShapeUtil::HumanString(a_shape), " vs ",
-            xla::ShapeUtil::HumanString(b_shape));
-      }
-      batch_dimensions.push_back(a_size);
-    }
-
-    if (xla::ShapeUtil::GetDimension(a_shape, -1) !=
-        xla::ShapeUtil::GetDimension(a_shape, -2)) {
-      return errors::InvalidArgument(
-          "The 'a' arguments to TriangularSolve must be square matrices: ",
-          xla::ShapeUtil::HumanString(a_shape));
-    }
-    const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2);
-    const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1);
-    if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(a_shape, -1)) {
-      return errors::InvalidArgument(
-          "Arguments to TriangularSolve have incompatible matrix shapes: ",
-          xla::ShapeUtil::HumanString(a_shape), " vs ",
-          xla::ShapeUtil::HumanString(b_shape));
-    }
-
-    if (block_size < 1) {
-      return errors::InvalidArgument(
-          "block_size argument to TriangularSolve must be >= 1; got ",
-          block_size);
-    }
-
-    // We find the diagonal blocks of the coefficient matrix
-    auto diag_blocks = DiagonalBlocks(a, block_size);
-
-    // We invert these blocks in parallel using batched matrix-vector products
-    auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, transpose_a,
-                                                conjugate_a, precision);
-
-    // We now find the solution using GEMMs
-    auto x =
-        SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side, lower,
-                                        transpose_a, conjugate_a, precision);
-
-    return x;
-  });
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index 804671fbc75b0a5a6e04b204822b6f084013cd8b..c0bd172d17c192435ba8ee196f9def0491c0bf5c 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -113,36 +113,6 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
   return xla::ConstantLiteral(builder, literal);
 }
 
-xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span<const int64> start,
-                            absl::Span<const int64> end) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_RET_CHECK(start.size() == end.size());
-    int64 n_minor_dims = start.size();
-
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_minor_dims <= n_dims);
-    auto major_dims = xla::AsInt64Slice(shape.dimensions())
-                          .subspan(
-                              /*pos=*/0,
-                              /*len=*/n_dims - n_minor_dims);
-
-    // Prepends 0s in the major dim
-    std::vector<int64> padded_start(n_dims, 0);
-    std::copy(start.begin(), start.end(),
-              padded_start.begin() + major_dims.size());
-
-    // Prepends the shape of the major dims.
-    std::vector<int64> padded_end(n_dims);
-    std::copy(major_dims.begin(), major_dims.end(), padded_end.begin());
-    std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size());
-
-    std::vector<int64> strides(n_dims, 1);
-    return xla::Slice(x, padded_start, padded_end, strides);
-  });
-}
 
 std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
                                  absl::Span<const int64> ys) {
@@ -152,100 +122,4 @@ std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
   return output;
 }
 
-xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts,
-                                   absl::Span<const int64> sizes) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    int64 n_minor_dims = starts.size();
-    TF_RET_CHECK(n_minor_dims == sizes.size());
-    TF_RET_CHECK(n_minor_dims <= n_dims);
-    auto major_dims = xla::AsInt64Slice(shape.dimensions())
-                          .subspan(
-                              /*pos=*/0,
-                              /*len=*/n_dims - sizes.size());
-    auto padded_starts = PrependZerosInMajorDims(x, starts);
-    auto padded_sizes = ConcatVectors(major_dims, sizes);
-    return xla::DynamicSlice(x, padded_starts, padded_sizes);
-  });
-}
-
-xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update,
-                       absl::Span<const int64> start) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
-    std::vector<int32> start_as_int32(start.begin(), start.end());
-    auto start_constant = xla::ConstantR1<int32>(builder, start_as_int32);
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    TF_ASSIGN_OR_RETURN(xla::Shape start_constant_shape,
-                        builder->GetShape(start_constant));
-    const int64 start_length =
-        xla::ShapeUtil::GetDimension(start_constant_shape, -1);
-    TF_RET_CHECK(start_length == n_dims);
-    return xla::DynamicUpdateSlice(x, update, start_constant);
-  });
-}
-
-xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                  absl::Span<const int64> start) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    const int64 n_minor_dims = start.size();
-    TF_RET_CHECK(n_minor_dims <= n_dims);
-    std::vector<int64> padded_start(n_dims, 0);
-    std::copy(start.begin(), start.end(),
-              padded_start.begin() + (n_dims - n_minor_dims));
-    return UpdateSlice(x, update, padded_start);
-  });
-}
-
-xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                         absl::Span<const xla::XlaOp> starts) {
-  auto padded_starts = PrependZerosInMajorDims(x, starts);
-  return xla::DynamicUpdateSlice(x, update, padded_starts);
-}
-
-xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    auto zero = xla::Reshape(xla::ConstantR0<int32>(builder, 0), {1});
-    std::vector<xla::XlaOp> padded_starts(n_dims, zero);
-    for (int i = 0; i < starts.size(); ++i) {
-      padded_starts[n_dims - starts.size() + i] = xla::Reshape(starts[i], {1});
-    }
-    return xla::ConcatInDim(builder, padded_starts, 0);
-  });
-}
-
-xla::XlaOp TransposeInMinorDims(xla::XlaOp x) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_dims >= 2);
-    std::vector<int64> permutation(n_dims);
-    std::iota(permutation.begin(), permutation.end(), 0);
-    std::swap(permutation[n_dims - 1], permutation[n_dims - 2]);
-    return xla::Transpose(x, permutation);
-  });
-}
-
-xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    auto perform_conj = shape.element_type() == xla::C64 && conjugate;
-    return perform_conj ? xla::Conj(x) : x;
-  });
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 80e9e5b002d49581209e608b98606e02709c5876..aec8061cb4322b8d315b6cdc80c7fff1e0cb4cb1 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -38,44 +38,10 @@ xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x,
 xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
                           int64 value);
 
-// Builds a vector of zeros of length rank(x) with the last values being
-// those in `starts`.
-xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts);
-
-// Performs a slice in the minor dimensions of a Tensor.
-xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span<const int64> start,
-                            absl::Span<const int64> end);
-
 // Returns the concatenation of `xs` and `ys`.
 std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
                                  absl::Span<const int64> ys);
 
-// Performs a dynamic slice in the minor dimensions of a Tensor.
-xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts,
-                                   absl::Span<const int64> sizes);
-
-// Updates a slice of 'x', i.e.,
-// x[start[0], ..., start[n]] = update
-xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update,
-                       absl::Span<const int64> start);
-
-// Updates a slice of 'x', where 'start' contains a list of minor dimensions:
-// x[..., start[0], ..., start[n]] = update
-xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                  absl::Span<const int64> start);
-
-xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                         absl::Span<const xla::XlaOp> starts);
-
-// Transposes a stack of matrices `x` by swapping the last two dimensions.
-xla::XlaOp TransposeInMinorDims(xla::XlaOp x);
-
-// Applies a complex conjugation operation if `a` is complex and `conjugate_a`
-// is true, otherwise returns its argument.
-xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/tf2xla/lib/util_test.cc
deleted file mode 100644
index 442fe92c34ca26cb1a854cc90da8dc034bca79bb..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/lib/util_test.cc
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2xla/lib/util.h"
-
-#include <memory>
-#include <numeric>
-#include <vector>
-
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-
-namespace tensorflow {
-namespace {
-
-using UtilTest = xla::ClientLibraryTestBase;
-using UtilLeftLookingTest = xla::ClientLibraryTestBase;
-
-xla::Array2D<float> BValsRight() {
-  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
-}
-
-xla::Array2D<float> BValsLeft() {
-  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
-}
-
-xla::Array2D<float> AValsFull() {
-  return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}};
-}
-
-xla::Array3D<float> BatchedAValsFull() {
-  return {{
-              {2, 0, 1, 2},
-              {3, 6, 0, 1},
-              {4, 7, 9, 0},
-              {5, 8, 10, 11},
-          },
-          {
-              {16, 24, 8, 12},
-              {24, 61, 82, 48},
-              {8, 82, 456, 106},
-              {12, 48, 106, 62},
-          }};
-}
-
-XLA_TEST_F(UtilTest, Simple2dLookup) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, x, y;
-  auto a_data = CreateR2Parameter<float>(BValsRight(), 0, "a", &builder, &a);
-  auto x_data = CreateR0Parameter<int>(2, 1, "x", &builder, &x);
-  auto y_data = CreateR0Parameter<int>(1, 2, "y", &builder, &y);
-  DynamicSliceInMinorDims(a, {x, y}, {1, 1});
-
-  ComputeAndCompareR2<float>(&builder, {{10}},
-                             {a_data.get(), x_data.get(), y_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(UtilTest, Simple3dLookup) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, index;
-  auto a_data =
-      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
-  auto index_data = CreateR0Parameter<int>(1, 1, "index", &builder, &index);
-
-  DynamicSliceInMinorDims(a, {index, xla::ConstantR0<int32>(&builder, 0)},
-                          {1, 4});
-
-  ComputeAndCompareR3<float>(&builder, {{{3, 6, 0, 1}}, {{24, 61, 82, 48}}},
-                             {a_data.get(), index_data.get()});
-}
-
-XLA_TEST_F(UtilTest, SimpleSliceUpdate) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b, x, y;
-  auto a_data = CreateR2Parameter<float>(AValsFull(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>({{9, 1, -10}}, 1, "b", &builder, &b);
-  auto x_data = CreateR0Parameter<int>(2, 2, "x", &builder, &x);
-  auto y_data = CreateR0Parameter<int>(1, 3, "y", &builder, &y);
-
-  DynamicUpdateSliceInMinorDims(a, b, {x, y});
-
-  xla::Array2D<float> expected(
-      {{{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 9, 1, -10}, {5, 8, 10, 11}}});
-
-  ComputeAndCompareR2<float>(
-      &builder, expected,
-      {a_data.get(), b_data.get(), x_data.get(), y_data.get()});
-}
-
-XLA_TEST_F(UtilTest, RowBatchDot) {
-  xla::XlaBuilder builder(TestName());
-
-  int n = 4;
-
-  xla::XlaOp a, row, index;
-  auto a_data =
-      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
-  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
-                                           "row", &builder, &row);
-  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
-  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
-
-  auto l_index = DynamicSliceInMinorDims(
-      a, {index, xla::ConstantR0<int32>(&builder, 0)}, {1, n});
-  BatchDot(l_index, row, /*transpose_x=*/false, /*transpose_y=*/true);
-
-  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
-                             {a_data.get(), row_data.get(), index_data.get()});
-}
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc
index 20103ec3ae00b57723e05326dbbb1b0f6e1a671a..67d08290033361f16dfff42b06af9b253e84963a 100644
--- a/tensorflow/compiler/tf2xla/literal_util.cc
+++ b/tensorflow/compiler/tf2xla/literal_util.cc
@@ -32,6 +32,12 @@ Status HostTensorToBorrowingLiteral(const Tensor& host_tensor,
   return Status::OK();
 }
 
+xla::StatusOr<xla::Literal> HostTensorToLiteral(const Tensor& host_tensor) {
+  xla::BorrowingLiteral literal;
+  TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(host_tensor, &literal));
+  return literal.Clone();
+}
+
 Status HostTensorToMutableBorrowingLiteral(
     Tensor* host_tensor, xla::MutableBorrowingLiteral* literal) {
   xla::Shape xla_shape;
diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h
index 1db7470ee2a839099454b772d4833492e033bc92..a153dddee6127ff9c0858220f2d8a735ab3f0e19 100644
--- a/tensorflow/compiler/tf2xla/literal_util.h
+++ b/tensorflow/compiler/tf2xla/literal_util.h
@@ -30,6 +30,11 @@ namespace tensorflow {
 // 'host_tensor'.
 Status HostTensorToBorrowingLiteral(const Tensor& host_tensor,
                                     xla::BorrowingLiteral* literal);
+
+// Returns a Literal with the contents of 'host_tensor', backed by its own
+// storage (i.e., not reusing 'host_tensor's buffers.)
+xla::StatusOr<xla::Literal> HostTensorToLiteral(const Tensor& host_tensor);
+
 // Returns a MutableBorrowingLiteral that utilizes the same underlying buffer
 // owned by 'host_tensor', but is mutable via the xla::Literal methods.
 Status HostTensorToMutableBorrowingLiteral(
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index 733eeed3c661c9ed683f0fb7fd90f7f997b8dc2b..bd2c0a5ee88869ba60701c0a7ace05857452eed9 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -283,6 +283,8 @@ REGISTER_OP("XlaReduceWindow")
     .Input("init_value: T")
     .Input("window_dimensions: Tindices")
     .Input("window_strides: Tindices")
+    .Input("base_dilations: Tindices")
+    .Input("window_dilations: Tindices")
     .Input("padding: Tindices")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32, int64}")
@@ -354,12 +356,33 @@ Wraps the XLA Sort operator, documented at
  https://www.tensorflow.org/performance/xla/operation_semantics#sort
 .
 
-Sorts a tensor. Currently only rank 1 sorts in ascending order are supported.
+Sorts a tensor. Currently only sorts in ascending order are supported.
 
 input: A `Tensor` of type T.
 output: A `Tensor` of type T.
 )doc");
 
+REGISTER_OP("XlaKeyValueSort")
+    .Input("keys: K")
+    .Input("values: V")
+    .Output("sorted_keys: K")
+    .Output("sorted_values: V")
+    .Attr("K: realnumbertype")
+    .Attr("V: type")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Wraps the XLA Sort operator, documented at
+ https://www.tensorflow.org/performance/xla/operation_semantics#sort
+.
+
+Sorts a tensor. Currently only sorts in ascending order are supported.
+
+keys: A `Tensor` of type K.
+values: A `Tensor` of type V.
+sorted_keys: A `Tensor` of type K.
+sorted_values: A `Tensor` of type V.
+)doc");
+
 // TODO(b/37549631) setting the While Op to always be stateful is too
 // conservative.
 REGISTER_OP("XlaWhile")
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index 69ca39436013ec5cf09ba502a1540d5df322e213..fef97b98c376d9df8bbfd9cb6651216895e46bf4 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -1,9 +1,13 @@
 licenses(["notice"])  # Apache 2.0
 
+package_group(
+    name = "friends",
+    includes = ["//tensorflow:internal"],
+)
+
 package(
     default_visibility = [
-        "//learning/tfx:__subpackages__",
-        "//tensorflow:internal",
+        ":friends",
     ],
 )
 
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index 27dd18a9bbd5aceece41aaf61eb185acb537b3b6..147e562658bbfc445f99268812e2c3ae1ee61e30 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -212,9 +212,9 @@ bitcast_convert_type = array_ops.bitcast
 
 def broadcast(x, dims, name=None):
   x = ops.convert_to_tensor(x)
-  shape = array_ops.concat(
-      [constant_op.constant(dims),
-       array_ops.shape(x)], axis=0)
+  shape = array_ops.concat([constant_op.constant(dims),
+                            array_ops.shape(x)],
+                           axis=0)
   return array_ops.broadcast_to(x, shape, name=name)
 
 
@@ -250,7 +250,7 @@ def conv(lhs,
     rhs_dilation: dilation to apply between kernel elements
     dimension_numbers: a `ConvolutionDimensionNumbers` proto.
     feature_group_count: number of feature groups for grouped convolution.
-    precision_config: a `PrecisionConfigProto` proto.
+    precision_config: a `xla.PrecisionConfig` proto.
     name: an optional name for the operator
 
   Returns:
@@ -320,6 +320,8 @@ def reduce_window(operand,
                   reducer,
                   window_dimensions,
                   window_strides=None,
+                  base_dilations=None,
+                  window_dilations=None,
                   padding=None,
                   name=None):
   """Wraps the XLA ReduceWindow operator.
@@ -332,22 +334,27 @@ def reduce_window(operand,
     init: a scalar tensor representing the initial value for the reduction
     reducer: a reduction function that combines a pair of scalars.
     window_dimensions: shape of the window, as a list of integers
-    window_strides: inter-window strides, as a list of integers. Optional;
-      if omitted, defaults to strides of 1.
+    window_strides: inter-window strides, as a list of integers. Optional; if
+      omitted, defaults to strides of 1.
     padding: padding to apply to 'operand'. List of (low, high) pairs of
       integers that specify the padding to apply before and after each
       dimension. Optional; if omitted, defaults to no padding.
     name: the operator name, or None.
+
   Returns:
     A tensor that represents the output of the reduce_window operator.
   """
   window_strides = window_strides or [1] * len(window_dimensions)
+  base_dilations = base_dilations or [1] * len(window_dimensions)
+  window_dilations = window_dilations or [1] * len(window_dimensions)
   padding = padding or [(0, 0)] * len(window_dimensions)
   return gen_xla_ops.xla_reduce_window(
       input=operand,
       init_value=init,
       window_dimensions=window_dimensions,
       window_strides=window_strides,
+      base_dilations=base_dilations,
+      window_dilations=window_dilations,
       padding=padding,
       computation=reducer,
       name=name)
@@ -377,4 +384,5 @@ def slice(x, start_dims, limit_dims, strides):
 
 
 sort = gen_xla_ops.xla_sort
+key_value_sort = gen_xla_ops.xla_key_value_sort
 while_loop = gen_xla_ops.xla_while
diff --git a/tensorflow/compiler/tf2xla/shape_util.h b/tensorflow/compiler/tf2xla/shape_util.h
index f7e34a5b40c2f9244c029ed325a76322b8cf54dd..0b231ea8e7a2d8e303e91911e2e0a36fc83e78b4 100644
--- a/tensorflow/compiler/tf2xla/shape_util.h
+++ b/tensorflow/compiler/tf2xla/shape_util.h
@@ -18,6 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_
 #define TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_
 
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc
index 6cd7b24592f30d7202b985f3dfd082ea2d85e344..b233e6b2c28e1968bb74901fc684e808ae45ab60 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.cc
+++ b/tensorflow/compiler/tf2xla/side_effect_util.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
 
+#include "absl/strings/numbers.h"
 #include "tensorflow/core/graph/algorithm.h"
 
 namespace tensorflow {
@@ -64,4 +65,28 @@ bool HasSideEffectingNodes(const Graph& g) {
   return false;
 }
 
+Status ParseHostComputeCoreList(absl::Span<const string> list_from_attr,
+                                std::map<string, int>* host_compute_core) {
+  for (const auto& hc_core : list_from_attr) {
+    std::vector<string> parts = str_util::Split(hc_core, ":");
+    if (parts.size() != 2) {
+      return errors::InvalidArgument(
+          "Malformed host_compute_core entry ", hc_core,
+          " should be <cluster_name>:<core_number>.");
+    }
+    int core;
+    if (!absl::numbers_internal::safe_strto32_base(parts[1], &core, 10)) {
+      return errors::InvalidArgument("Malformed host_compute_core entry ",
+                                     hc_core,
+                                     " part after ':' should be an integer.");
+    }
+    if (host_compute_core->find(parts[0]) != host_compute_core->end()) {
+      return errors::InvalidArgument(
+          "Duplicate host_compute_core entry for cluster ", parts[0]);
+    }
+    (*host_compute_core)[parts[0]] = core;
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.h b/tensorflow/compiler/tf2xla/side_effect_util.h
index ad07624729f0b0d2443b2fc43d32dfa3377ce115..f22ddb2f58e1fa5c10ca0fdb956d9136942388b7 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.h
+++ b/tensorflow/compiler/tf2xla/side_effect_util.h
@@ -42,6 +42,12 @@ std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g);
 // Returns whether a graph contains side-effecting nodes.
 bool HasSideEffectingNodes(const Graph& g);
 
+// Parse the mapping from outside_compilation_subgraph name to core number,
+// which is specified in an attr as a list of strings
+// <subgraph_name>:<core_index>.
+Status ParseHostComputeCoreList(absl::Span<const string> list_from_attr,
+                                std::map<string, int>* host_compute_core);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_SIDE_EFFECT_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/test_util.cc b/tensorflow/compiler/tf2xla/test_util.cc
index f31bfb45a2f4db270446eb59259969dc0ab63a8e..3c6c9a91b6d2fb47f6dee1c347e9b852f1eea3ec 100644
--- a/tensorflow/compiler/tf2xla/test_util.cc
+++ b/tensorflow/compiler/tf2xla/test_util.cc
@@ -40,12 +40,4 @@ Status InstantiateFunctionForTest(const string& name,
   return Status::OK();
 }
 
-std::unordered_map<string, Node*> BuildNodeIndex(const Graph& graph) {
-  std::unordered_map<string, Node*> index;
-  for (Node* node : graph.nodes()) {
-    index[node->name()] = node;
-  }
-  return index;
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/test_util.h b/tensorflow/compiler/tf2xla/test_util.h
index 350a868568531c0d073e0cf600327d1ff9d62e3a..4ffc94ae3bc7c930720cd625a7856443c77be666 100644
--- a/tensorflow/compiler/tf2xla/test_util.h
+++ b/tensorflow/compiler/tf2xla/test_util.h
@@ -44,9 +44,6 @@ Status InstantiateFunctionForTest(const string& name,
                                   const FunctionLibraryDefinition& library,
                                   InstantiationResultForTest* result);
 
-// Builds a map from node name to Node* for `graph`.
-std::unordered_map<string, Node*> BuildNodeIndex(const Graph& graph);
-
 }  // namespace tensorflow
 
 // Variant of TF_EXPECT_GRAPH_EQ that also compares internal attributes for
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index b22d53805d83069052cc5e16020d6c540d618a82..9fac16a9700419b189bf5393c2b8bd7d76c6c1cc 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -218,7 +218,7 @@ Status CollectArgNodes(const Graph& graph, std::vector<Node*>* arg_nodes) {
         const Node* dup = insert_result.first->second;
         return errors::InvalidArgument(
             "Multiple ", kArgOp, " nodes with index ", index, ", ",
-            n->DebugString(), " and ", dup->DebugString());
+            FormatNodeForError(*n), " and ", FormatNodeForError(*dup));
       }
     }
   }
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 01dd3ba10fec85e6b1d411fbd32fbf9c58b5fe11..cc81772e8c5da710bc733f7e4f5fe820b2c2d110 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <unordered_map>
 
 #include "absl/strings/str_cat.h"
-#include "absl/types/optional.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -31,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
@@ -76,6 +76,222 @@ Status CheckFeedFetchNameConflicts(const string& kind,
   return Status::OK();
 }
 
+// For graph `g`, copy all function call nodes' FunctionDef from `lookup_fld` to
+// `fld`. This is to ensure that `fld` can instantiate FunctionDef of graph `g`.
+Status CopyAssociatedFunctions(Graph* g,
+                               const FunctionLibraryDefinition* lookup_fld,
+                               FunctionLibraryDefinition* fld) {
+  for (Node* n : g->op_nodes()) {
+    for (const auto& associated_function :
+         GetAssociatedFunctions(*n, lookup_fld)) {
+      switch (associated_function.type()) {
+        case AssociatedFunctionInfo::kFunctionCallNode: {
+          const FunctionDef* fdef =
+              lookup_fld->Find(associated_function.func_name());
+          if (!fdef) {
+            return errors::Internal(
+                "Cannot find function ", associated_function.func_name(),
+                " for function call node ", n->DebugString());
+          }
+          TF_RETURN_IF_ERROR(fld->AddFunctionDef(*fdef));
+          break;
+        }
+        case AssociatedFunctionInfo::kSymbolicGradient:
+        case AssociatedFunctionInfo::kFunctionAttr:
+          break;
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// For graph `g`, replaces _Arg nodes whose "index" attribute is in
+// `const_input_index_to_node` with Const nodes.
+Status ReplaceArgUsageWithConstNode(
+    Graph* g,
+    const std::unordered_map<int, const Node*>& const_input_index_to_node) {
+  // Collect all _Arg nodes.
+  std::unordered_map<int, Node*> arg_nodes;
+  for (Node* n : g->op_nodes()) {
+    if (n->type_string() == FunctionLibraryDefinition::kArgOp) {
+      int index;
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+      arg_nodes[index] = n;
+    }
+  }
+
+  for (const auto& iter : const_input_index_to_node) {
+    int arg_index = iter.first;
+    Node* const_node = g->CopyNode(iter.second);
+    Node* arg_node = arg_nodes[arg_index];
+
+    // Collect all usages of the _Arg node.
+    struct OutEdgeInfo {
+      int dst_node_id, dst_input;
+    };
+    std::vector<OutEdgeInfo> usages;
+    for (const Edge* e : arg_node->out_edges()) {
+      if (e->IsControlEdge()) {
+        continue;
+      }
+      usages.push_back({e->dst()->id(), e->dst_input()});
+    }
+
+    for (int i = 0; i < usages.size(); i++) {
+      // Make a copy of `usage_node`, and change its input to const node.
+      Node* usage_node = g->FindNodeId(usages[i].dst_node_id);
+      NodeDef replace_def = usage_node->def();
+      *replace_def.mutable_input(usages[i].dst_input) = const_node->name();
+      TF_ASSIGN_OR_RETURN(Node * replace_node,
+                          ReplaceNode(g, usage_node, replace_def));
+      const Edge* usage_edge;
+      TF_RETURN_IF_ERROR(
+          replace_node->input_edge(usages[i].dst_input, &usage_edge));
+      g->RemoveEdge(usage_edge);
+      g->AddEdge(const_node, 0, replace_node, usages[i].dst_input);
+
+      // Later entries in `usages` might have `usage_node` as dst node, but
+      // `usage_node` is removed. Replace such entries with `replace_node`.
+      for (int j = i + 1; j < usages.size(); j++) {
+        if (usages[j].dst_node_id == usages[i].dst_node_id) {
+          usages[j].dst_node_id = replace_node->id();
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// For a node's function attr (e.g. then/else branch for "If" nodes), rewrites
+// the function to replace _Arg nodes in `const_input_index_to_node` with Const
+// inputs.
+Status PropagateConstIntoFuncAttr(
+    Node* n, const string& attr_name,
+    const std::unordered_map<int, const Node*>& const_input_index_to_node,
+    const FunctionLibraryDefinition* lookup_fld,
+    FunctionLibraryDefinition* fld) {
+  // Instantiate the function.
+  NameAttrList func_attr;
+  TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), attr_name, &func_attr));
+  const FunctionDef* fdef = lookup_fld->Find(func_attr.name());
+  if (!fdef) {
+    return errors::Internal("Cannot find function ", func_attr.name(),
+                            " for node ", n->name());
+  }
+  FunctionBody* fbody;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fdef, AttrSlice(&func_attr.attr()), lookup_fld,
+      [lookup_fld](const string& op, const OpDef** sig) {
+        return lookup_fld->LookUpOpDef(op, sig);
+      },
+      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+
+  // Rewrite _Arg usages with Const node.
+  Graph* func_graph = fbody->graph;
+  TF_RETURN_IF_ERROR(
+      ReplaceArgUsageWithConstNode(func_graph, const_input_index_to_node));
+
+  // Save rewritten function.
+  FunctionDef replace_fdef;
+  string new_func_name =
+      fld->UniqueFunctionName(absl::StrCat(func_attr.name(), "_const_"));
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*func_graph, new_func_name, &replace_fdef));
+  TF_RETURN_IF_ERROR(fld->AddFunctionDef(replace_fdef));
+
+  // Change the node to use rewritten function.
+  func_attr.set_name(new_func_name);
+  n->ClearAttr(attr_name);
+  n->AddAttr(attr_name, func_attr);
+
+  // Copy associated functions.
+  TF_RETURN_IF_ERROR(CopyAssociatedFunctions(func_graph, lookup_fld, fld));
+
+  return Status::OK();
+}
+
+// For an "If" node in graph `g`, if it has Const node inputs, rewrite its
+// then/else branch function to replace _Arg nodes with those Const inputs.
+Status PropagateConstIntoIfNode(Graph* g, Node* if_node,
+                                const FunctionLibraryDefinition* lookup_fld,
+                                FunctionLibraryDefinition* fld) {
+  // Notice that first input for If node is predicate; other inputs are function
+  // inputs.
+  std::unordered_map<int, const Node*> const_input_index_to_node;
+  for (int i = 1; i < if_node->num_inputs(); i++) {
+    const Node* input_node;
+    TF_RETURN_IF_ERROR(if_node->input_node(i, &input_node));
+    if (input_node->type_string() == "Const") {
+      const_input_index_to_node[i - 1] = input_node;
+    }
+  }
+  if (const_input_index_to_node.empty()) {
+    return Status::OK();
+  }
+
+  // Rewrite "then_branch" and "else_branch" function, replace usage of those
+  // _Arg nodes with corresponding const node.
+  for (const auto& attr_name :
+       std::vector<string>{"then_branch", "else_branch"}) {
+    TF_RETURN_IF_ERROR(PropagateConstIntoFuncAttr(
+        if_node, attr_name, const_input_index_to_node, lookup_fld, fld));
+  }
+
+  return Status::OK();
+}
+
+// For a "While" node in graph `g`, if it has Const node inputs, rewrite its
+// cond/body function to replace _Arg nodes with those Const inputs.
+Status PropagateConstIntoWhileNode(Graph* g, Node* while_node,
+                                   const FunctionLibraryDefinition* lookup_fld,
+                                   FunctionLibraryDefinition* fld) {
+  // For "While" node, we should only replace _Arg nodes which are loop
+  // invariants. For such _Arg nodes, the return value's input will come
+  // directly from the corresponding arg.
+  std::unordered_map<int, const Node*> const_input_index_to_node;
+  NameAttrList body_attr;
+  TF_RETURN_IF_ERROR(GetNodeAttr(while_node->def(), "body", &body_attr));
+  const FunctionDef* body_func = lookup_fld->Find(body_attr.name());
+  if (!body_func) {
+    return errors::Internal("Cannot find body function ", body_attr.name(),
+                            " for While node ", while_node->name());
+  }
+  for (int i = 0; i < while_node->num_inputs(); i++) {
+    const Node* input_node;
+    TF_RETURN_IF_ERROR(while_node->input_node(i, &input_node));
+    if (input_node->type_string() != "Const") {
+      continue;
+    }
+
+    // Check if i-th retval's input comes from i-th arg directly.
+    const OpDef_ArgDef& output_arg = body_func->signature().output_arg(i);
+    auto output_arg_input = body_func->ret().find(output_arg.name());
+    if (output_arg_input == body_func->ret().end()) {
+      return errors::Internal("Cannot find input for output arg ",
+                              output_arg.name(), " in function ",
+                              body_attr.name());
+    }
+    const OpDef_ArgDef& input_arg = body_func->signature().input_arg(i);
+    if (output_arg_input->second != input_arg.name()) {
+      continue;
+    }
+
+    const_input_index_to_node[i] = input_node;
+  }
+  if (const_input_index_to_node.empty()) {
+    return Status::OK();
+  }
+
+  // Rewrite "cond" and "body" function, replace usage of those _Arg nodes with
+  // corresponding const node.
+  for (const auto& attr_name : std::vector<string>{"cond", "body"}) {
+    TF_RETURN_IF_ERROR(PropagateConstIntoFuncAttr(
+        while_node, attr_name, const_input_index_to_node, lookup_fld, fld));
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 const char kXlaOutsideCompilationAttrName[] = "_xla_outside_compilation";
@@ -294,7 +510,7 @@ Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) {
   return Status::OK();
 }
 
-void AddDtypeToKernalDefConstraint(absl::string_view name, DataType dtype,
+void AddDtypeToKernelDefConstraint(absl::string_view name, DataType dtype,
                                    KernelDef* kdef) {
   for (KernelDef::AttrConstraint& constraint : *kdef->mutable_constraint()) {
     if (constraint.name() == name) {
@@ -330,8 +546,8 @@ uint32 GetXLARandomSeed() {
 
 // TODO(b/77601805): add tests for associated function related stuff.
 bool HasAssociatedFunction(const NodeDef& node_def,
-                           FunctionLibraryRuntime* flr) {
-  if (flr->GetFunctionLibraryDefinition()->Contains(node_def.op())) {
+                           const FunctionLibraryDefinition* fld) {
+  if (fld->Contains(node_def.op())) {
     return true;
   }
 
@@ -351,10 +567,10 @@ bool HasAssociatedFunction(const NodeDef& node_def,
 }
 
 std::vector<AssociatedFunctionInfo> GetAssociatedFunctions(
-    const Node& node, FunctionLibraryRuntime* flr) {
+    const Node& node, const FunctionLibraryDefinition* fld) {
   std::vector<AssociatedFunctionInfo> results;
   const string& op = node.type_string();
-  if (flr->GetFunctionLibraryDefinition()->Contains(op)) {
+  if (fld->Contains(op)) {
     // This is a function call node.
     AttrValueMap attrs(node.attrs().begin(), node.attrs().end());
     results.emplace_back(AssociatedFunctionInfo::FunctionCall(op, attrs));
@@ -441,4 +657,97 @@ Status RewriteAssociatedFunction(
   return Status::OK();
 }
 
+Status CachedFunctionHandles::GetOrInstantiate(
+    const string& func_name, AttrSlice attrs,
+    FunctionLibraryRuntime::Handle* handle) {
+  string canonicalized_name = Canonicalize(func_name, attrs);
+  auto iter = handles_.find(canonicalized_name);
+  if (iter != handles_.end()) {
+    *handle = iter->second;
+    return Status::OK();
+  }
+
+  TF_RETURN_IF_ERROR(flr_->Instantiate(func_name, attrs, handle));
+  handles_[canonicalized_name] = *handle;
+  return Status::OK();
+}
+
+Status CachedFunctionHandles::ReleaseAllHandles() {
+  Status result;
+  for (auto iter : handles_) {
+    result.Update(flr_->ReleaseHandle(iter.second));
+  }
+  handles_.clear();
+  return result;
+}
+
+xla::StatusOr<Node*> ReplaceNode(Graph* g, Node* n, const NodeDef& node_def) {
+  // Create the replacement node.
+  Status s;
+  Node* new_node = g->AddNode(node_def, &s);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Record original node's output edges and remove them first. This is to avoid
+  // multiple producers for dst nodes' input.
+  std::vector<OutEdgeInfo> out_edge_info;
+  std::vector<const Edge*> out_edges;
+  for (const Edge* edge : n->out_edges()) {
+    out_edges.push_back(edge);
+    out_edge_info.push_back(
+        {edge->dst(), edge->src_output(), edge->dst_input()});
+  }
+  for (const Edge* edge : out_edges) {
+    g->RemoveEdge(edge);
+  }
+
+  // Add original node's input and output edges to the replacement node.
+  for (const Edge* in_edge : n->in_edges()) {
+    g->AddEdge(in_edge->src(), in_edge->src_output(), new_node,
+               in_edge->dst_input());
+  }
+  for (const OutEdgeInfo& out_edge : out_edge_info) {
+    g->AddEdge(new_node, out_edge.src_output, out_edge.dst, out_edge.dst_input);
+  }
+
+  // Remove the original node.
+  g->RemoveNode(n);
+
+  return new_node;
+}
+
+xla::StatusOr<Node*> BuildIdentityNode(
+    Graph* graph, const string& node_name, DataType dtype, const Node* input,
+    absl::optional<string> requested_device) {
+  // Create identity node.
+  NodeDef ndef;
+  ndef.set_name(node_name);
+  ndef.set_op("Identity");
+  if (input) {
+    ndef.add_input(input->name());
+  }
+  if (requested_device) {
+    ndef.set_device(*requested_device);
+  }
+  AddNodeAttr("T", dtype, &ndef);
+  Status s;
+  Node* id_node = graph->AddNode(ndef, &s);
+  TF_RETURN_IF_ERROR(s);
+  return id_node;
+}
+
+Status PropagateConstIntoFunctionalNodes(
+    Graph* g, const FunctionLibraryDefinition* lookup_fld,
+    FunctionLibraryDefinition* fld) {
+  for (Node* n : g->op_nodes()) {
+    if (n->type_string() == "If") {
+      TF_RETURN_IF_ERROR(PropagateConstIntoIfNode(g, n, lookup_fld, fld));
+    } else if (n->type_string() == "While") {
+      TF_RETURN_IF_ERROR(PropagateConstIntoWhileNode(g, n, lookup_fld, fld));
+    }
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h
index 53eab8b63e2fc8aa3dfb0bacfe065897ca775bd0..cf3aa2f847c5ada8897110c7735b207f388f88d4 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.h
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -54,7 +55,7 @@ string TensorIdToString(const tf2xla::TensorId& id);
 Status SetNodeShardingFromNeighbors(Node* n, bool out_edges);
 
 // Add an allowed data type to the AttrConstraint with the given name.
-void AddDtypeToKernalDefConstraint(absl::string_view name, DataType dtype,
+void AddDtypeToKernelDefConstraint(absl::string_view name, DataType dtype,
                                    KernelDef* kdef);
 
 // Returns the next random seed to use for seeding xla rng.
@@ -120,7 +121,7 @@ class AssociatedFunctionInfo {
 
 // Returns if the NodeDef has associated function.
 bool HasAssociatedFunction(const NodeDef& node_def,
-                           FunctionLibraryRuntime* flr);
+                           const FunctionLibraryDefinition* fld);
 
 // Gets functions associated with the node. Current cases:
 // 1. For function call node, its function name;
@@ -128,7 +129,7 @@ bool HasAssociatedFunction(const NodeDef& node_def,
 //    and returned attrs will be this node's attributes;
 // 3. For nodes like XlaWhile/XlaIf, all their function attributes.
 std::vector<AssociatedFunctionInfo> GetAssociatedFunctions(
-    const Node& node, FunctionLibraryRuntime* flr);
+    const Node& node, const FunctionLibraryDefinition* fld);
 
 // Changes associated functions for the node. Current cases:
 // 1. For function call node, creates a new node with the new function name and
@@ -144,6 +145,58 @@ Status RewriteAssociatedFunction(
 // Attribute to mark nodes to be executed on host.
 extern const char kXlaOutsideCompilationAttrName[];
 
+// Class to act as cache for FunctionLibraryRuntime::Handle objects.
+class CachedFunctionHandles {
+ public:
+  CachedFunctionHandles(FunctionLibraryRuntime* flr) : flr_(flr) {}
+
+  // Populates `handle` for requested function and attributes. If we have
+  // instantiated the function with the same attributes before, `handle` will be
+  // cached handle; otherwise instantiate the function and populate `handle`.
+  Status GetOrInstantiate(const string& func_name, AttrSlice attrs,
+                          FunctionLibraryRuntime::Handle* handle);
+
+  // Releases all handles in the cache. Returns first non-OK status if any;
+  // returns OK otherwise.
+  Status ReleaseAllHandles();
+
+  ~CachedFunctionHandles() { ReleaseAllHandles().IgnoreError(); }
+
+ private:
+  FunctionLibraryRuntime* flr_;
+  std::map<string, FunctionLibraryRuntime::Handle> handles_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CachedFunctionHandles);
+};
+
+// Struct for node's output edge info.
+struct OutEdgeInfo {
+  Node* dst;
+  int src_output, dst_input;
+};
+
+// Replaces node `n` with a new node whose NodeDef is `node_def`.
+xla::StatusOr<Node*> ReplaceNode(Graph* g, Node* n, const NodeDef& node_def);
+
+// Helper function that builds an Identity node.
+xla::StatusOr<Node*> BuildIdentityNode(Graph* graph, const string& node_name,
+                                       DataType dtype, const Node* input,
+                                       absl::optional<string> requested_device);
+
+// For "If"/"While" nodes, if some of their inputs are Const nodes, rewrite
+// body functions to use the Const nodes instead of original _Arg nodes.
+//
+// For example, say we have the following computation:
+//     shape = constant_op.constant([1])
+//     return tf.cond(pred, lambda: tf.ones(shape), lambda: tf.zeros(shape))
+// If we do not rewrite then/else function, they will use _Arg node as shape
+// input for tf.ones/tf.zeros. But XLA requires that shape input to be compile
+// time constant, so XLA compilation will fail. This rewriting process will
+// change the shape input to Const node.
+Status PropagateConstIntoFunctionalNodes(
+    Graph* g, const FunctionLibraryDefinition* lookup_fld,
+    FunctionLibraryDefinition* fld);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index 68441b3d4790b17bd06accff3fcdc8ccee79bbb7..202e929315cacd4d6cdfc69d50639d8a427ec6c2 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -23,11 +23,15 @@ limitations under the License.
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
+#include "tensorflow/core/common_runtime/graph_optimizer.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
 namespace {
@@ -255,5 +259,75 @@ TEST(SetNodeShardingFromNeighbors, Basic) {
   EXPECT_EQ(1, parse_status.ValueOrDie().value().tile_assignment_devices(0));
 }
 
+REGISTER_OP("One")
+    .Output("y: T")
+    .Attr("T: {float, double, int32, int64}")
+    .Doc(R"doc(
+Returns a tensor with a single element (1) of type T.
+
+y: A scalar in type T.
+
+)doc");
+
+// Tests that CachedFunctionHandles class works.
+TEST(CachedFunctionHandles, Basic) {
+  FunctionDef func = FunctionDefHelper::Define(
+      // Name
+      "TestFunc",
+      // Args
+      {},
+      // Return values
+      {"y:T"},
+      // Attr def
+      {"T:{float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"y"}, "One", {}, {{"T", "$T"}}},
+      });
+  FunctionDefLibrary proto;
+  *proto.add_function() = func;
+  FunctionLibraryDefinition fld(OpRegistry::Global(), proto);
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
+      new ProcessFunctionLibraryRuntime(
+          /*device_mgr=*/nullptr, Env::Default(), TF_GRAPH_DEF_VERSION, &fld,
+          OptimizerOptions()));
+  FunctionLibraryRuntime* flr =
+      pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
+
+  CachedFunctionHandles cached_function_handles(flr);
+
+  // Tests that GetOrInstantiate() works.
+  FunctionLibraryRuntime::Handle first_handle;
+  AttrValue attr;
+  attr.set_type(DT_FLOAT);
+  AttrValueMap attrs;
+  attrs["T"] = attr;
+  TF_ASSERT_OK(cached_function_handles.GetOrInstantiate(
+      "TestFunc", AttrSlice(&attrs), &first_handle));
+
+  // Tests that we can get FunctionBody.
+  const FunctionBody* body = flr->GetFunctionBody(first_handle);
+  EXPECT_NE(body, nullptr);
+
+  // Tests that GetOrInstantiate() returns cached handle when called with same
+  // function name and attributes.
+  FunctionLibraryRuntime::Handle second_handle;
+  TF_ASSERT_OK(cached_function_handles.GetOrInstantiate(
+      "TestFunc", AttrSlice(&attrs), &second_handle));
+  EXPECT_EQ(first_handle, second_handle);
+
+  // Tests that GetOrInstantiate() returns new handle when called with same
+  // function name but different attributes.
+  attr.set_type(DT_INT32);
+  attrs["T"] = attr;
+  FunctionLibraryRuntime::Handle third_handle;
+  TF_ASSERT_OK(cached_function_handles.GetOrInstantiate(
+      "TestFunc", AttrSlice(&attrs), &third_handle));
+  EXPECT_NE(first_handle, third_handle);
+
+  // Tests that ReleaseAllHandles() works.
+  TF_EXPECT_OK(cached_function_handles.ReleaseAllHandles());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index 7f860500c75667a920505dbf498e3da4b388fb90..ddb284966eeb97cc7c9d3ed77fb313e567975e59 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -92,7 +92,7 @@ Allocator* XlaCompilationDevice::GetAllocator(AllocatorAttributes attr) {
 void XlaCompilationDevice::Compute(OpKernel* op_kernel,
                                    OpKernelContext* context) {
   VLOG(4) << "XlaCompilationDevice::Compute "
-          << SummarizeNodeDef(op_kernel->def());
+          << FormatNodeDefForError(op_kernel->def());
   auto* b = XlaContext::Get(context).builder();
   xla::OpMetadata metadata;
   metadata.set_op_type(op_kernel->type_string());
@@ -124,13 +124,4 @@ Status XlaCompilationDevice::MakeTensorFromProto(
       "XLACompilationDevice::MakeTensorFromProto should not be called");
 }
 
-XlaExpression::XlaExpression() = default;
-
-void XlaExpression::set_handle(const xla::XlaOp& h) { handle_ = h; }
-
-void XlaExpression::set_constant_value(Tensor value) {
-  has_constant_value_ = true;
-  constant_value_ = std::move(value);
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h
index a6e78825334fec748be5fee80669649df699d2fb..de6a3356e05d8ab45c269d7c6c653853d2c63a79 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.h
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h
@@ -18,9 +18,6 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/tf2xla/xla_resource.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -38,8 +35,8 @@ class XlaCompilationAllocator;
 // This is a 'dummy' TensorFlow device that is only used to execute a
 // subgraph of XLA compilation Ops to construct a compiled version
 // of the subgraph's computation. It has a 'dummy' allocator that
-// backs each Tensor with metadata indicating the computation the
-// Tensor represents.
+// backs each Tensor with an XlaExpression. The shape of the Tensor
+// matches the shape of XlaExpression.
 //
 // We deliberately don't register a device factory because we *never*
 // want placement to put Ops on a compilation device. The device is created
@@ -67,40 +64,6 @@ class XlaCompilationDevice : public LocalDevice {
   std::unique_ptr<XlaCompilationAllocator> allocator_;
 };
 
-// A XlaExpression wraps an XLA computation. Each Tensor on an
-// XlaCompilationDevice contains an XlaExpression, and the shape of the Tensor
-// matches the shape of the subcomputation in the XlaOp. Each
-// expression is either a constant, or a function of previously-compiled
-// expressions.
-class XlaExpression {
- public:
-  XlaExpression();
-
-  // handle() stores the XLA handle of the computation that the
-  // expression represents.
-  void set_handle(const xla::XlaOp& h);
-  const xla::XlaOp& handle() const { return handle_; }
-
-  void set_constant_value(Tensor value);
-  bool has_constant_value() const { return has_constant_value_; }
-  const Tensor& constant_value() const { return constant_value_; }
-
-  void set_resource(XlaResource* resource) { resource_ = resource; }
-  XlaResource* resource() const { return resource_; }
-
- private:
-  // The XLA handle of the expression's computation.
-  xla::XlaOp handle_;
-
-  // If this expression is a constant with a known value, 'constant_value' is a
-  // host-memory Tensor containing the value. Used to avoid invoking XLA for
-  // expressions that are trivially constant.
-  bool has_constant_value_ = false;
-  Tensor constant_value_;
-
-  XlaResource* resource_ = nullptr;  // Not owned.
-};
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILATION_DEVICE_H_
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index 425e769346ffcbc548495d93cb7adc779f860110..c7341cf8b9e8d7a06fd304ae8766420d20f0c16e 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -26,7 +26,7 @@ limitations under the License.
 // Forward-declare, rather than include, to reduce code size for users that
 // never use this functionality.
 namespace xla {
-class ProgramShape;
+class ProgramShapeProto;
 class HloProfilePrinterData;
 }
 
@@ -84,7 +84,7 @@ class XlaCompiledCpuFunction {
     void set_result_names(const char** result_names) {
       result_names_ = result_names;
     }
-    void set_program_shape(const xla::ProgramShape* program_shape) {
+    void set_program_shape(const xla::ProgramShapeProto* program_shape) {
       program_shape_ = program_shape;
     }
     const xla::HloProfilePrinterData* hlo_profile_printer_data() const {
@@ -122,7 +122,7 @@ class XlaCompiledCpuFunction {
     const char** result_names_ = nullptr;
 
     // [Optional] Arg and result shapes.
-    const xla::ProgramShape* program_shape_ = nullptr;
+    const xla::ProgramShapeProto* program_shape_ = nullptr;
 
     // [Optional] Profile printer data.  Null if profiling is disabled.
     const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
@@ -206,8 +206,14 @@ class XlaCompiledCpuFunction {
   //
   // Aliasing of argument and result buffers is not allowed, and results in
   // undefined behavior.
-  void set_arg_data(size_t index, void* data) {
-    buffer_table_[arg_index_table_[index]] = data;
+  void set_arg_data(size_t index, const void* data) {
+    // The const_cast is safe because the generated code does not write to arg
+    // buffers.
+    //
+    // buffer_table_ contains pointers to buffers that _will_ be written to by
+    // generated code so it would be misleading to make buffer_table_ a `const
+    // void**`.
+    buffer_table_[arg_index_table_[index]] = const_cast<void*>(data);
   }
 
   // ------------------------------
@@ -264,7 +270,7 @@ class XlaCompiledCpuFunction {
 
   // Returns the shape of the args and results. May return nullptr if the
   // program shape isn't available.
-  const xla::ProgramShape* ProgramShape() const { return program_shape_; }
+  const xla::ProgramShapeProto* ProgramShape() const { return program_shape_; }
 
   bool hlo_profiling_enabled() const {
     return hlo_profile_printer_data_ != nullptr;
@@ -287,11 +293,6 @@ class XlaCompiledCpuFunction {
 
   // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]]
   // for XLA generated code to be able to find it.
-  //
-  // For now we need to keep around the args_ array because there is code that
-  // depends on args() returning a void**.  However, in the future we may remove
-  // args_ in favor of using buffer_table_ as the sole storage for the
-  // arguments.
   const int32* const arg_index_table_;
 
   // The number of incoming arguments.
@@ -310,7 +311,7 @@ class XlaCompiledCpuFunction {
   // Optional metadata.
   const char** arg_names_ = nullptr;
   const char** result_names_ = nullptr;
-  const xla::ProgramShape* program_shape_ = nullptr;
+  const xla::ProgramShapeProto* program_shape_ = nullptr;
   const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
 };
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index b2c57e88803e0661a9a514f844dff97ff9edf2ea..ee461a3c07d4db514c7697e005a9371be4b54dd0 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -36,10 +36,13 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -48,7 +51,7 @@ namespace {
 
 // Checks that arguments `args` match types `types`.
 Status CheckSignature(const DataTypeVector& types,
-                      const std::vector<XlaCompiler::Argument>& args) {
+                      absl::Span<const XlaCompiler::Argument> args) {
   if (args.size() != types.size()) {
     return errors::Internal("Compilation arguments have ", args.size(),
                             " elements while function has ", types.size());
@@ -63,14 +66,270 @@ Status CheckSignature(const DataTypeVector& types,
   return Status::OK();
 }
 
+// Uses the _Arg and _Retval nodes in the graph to determine a core assignment
+// for each argument and return value.
+xla::StatusOr<std::pair<std::map<int, int>, std::map<int, int>>>
+ComputeArgAndRetvalCores(const Graph& graph) {
+  auto get_sharding_for_node = [](const Node* n) -> xla::StatusOr<int> {
+    TF_ASSIGN_OR_RETURN(
+        auto sharding,
+        ParseShardingFromDevice(*n, std::numeric_limits<int32>::max()));
+    if (sharding.has_value()) {
+      TF_RET_CHECK(sharding.value().type() ==
+                   xla::OpSharding::Type::OpSharding_Type_MAXIMAL);
+      return sharding.value().tile_assignment_devices(0);
+    } else {
+      return -1;
+    }
+  };
+  std::map<int, int> arg_cores;
+  std::map<int, int> retval_cores;
+  for (const Node* n : graph.nodes()) {
+    if (n->type_string() == FunctionLibraryDefinition::kArgOp) {
+      TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n));
+      if (core < 0) continue;
+      int index;
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+      TF_RET_CHECK(index >= 0) << "Negative _Arg index";
+      arg_cores[index] = core;
+    } else if (n->type_string() == FunctionLibraryDefinition::kRetOp) {
+      TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n));
+      if (core < 0) continue;
+      int index;
+      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+      TF_RET_CHECK(index >= 0) << "Negative _Retval index";
+      TF_ASSIGN_OR_RETURN(retval_cores[index], get_sharding_for_node(n));
+      retval_cores[index] = core;
+    }
+  }
+  return std::make_pair(std::move(arg_cores), std::move(retval_cores));
+}
+
+Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
+                    XlaCompilationDevice* device, FunctionLibraryRuntime* flib,
+                    int64 step_id) {
+  // Resource cleanup is a bit messy. XlaContext is a ref-countd resource; the
+  // resource manager takes ownership via Create, and unrefs via Cleanup.  We
+  // explicitly add a reference to ensure the refcount at entry is maintained at
+  // all exit points; Create and Cleanup are always called in this function.
+  //
+  // The Executor requires us to use ScopedStepContainer. We wrap it in a
+  // unique_ptr so we can capture the cleanup status in the end.
+  xla_context->Ref();
+  Status status;
+  auto step_container = absl::make_unique<ScopedStepContainer>(
+      step_id, [&status, device](const string& name) {
+        status = device->resource_manager()->Cleanup(name);
+      });
+  TF_RETURN_IF_ERROR(device->resource_manager()->Create(
+      step_container->name(), XlaContext::kXlaContextResourceName,
+      xla_context));
+
+  GraphCompiler graph_compiler(device, graph.get(), flib, step_container.get());
+  TF_RETURN_IF_ERROR(graph_compiler.Compile());
+  // Explicitly clean up the step container, to capture the cleanup status.
+  step_container.reset();
+  return Status::OK();
+}
+
+// Builds the XLA computation.
+// - `args` is the list of input arguments
+// - `retvals` is the list of retvals produced by _Retval operators, in index
+//   order.
+// - `args_core` and `retval_cores` are mapping from arg/return indices to core
+//   assignments.
+// - If `return_updated_values_for_all_resources` is true, all resources will be
+//   included in `resource_updates`, regardless of whether their value changed.
+// - Sets `*num_nonconst_outputs` to the number of outputs of the `computation`.
+// - Sets `*resource_updates` to a description of resources whose values are
+//   written by the computation; the variable writes are the last
+// - `resource_updates.size()` return values from the computation. Each entry in
+//   `resource_updates` is a ResourceUpdate, whose `index` is the index of a
+//   resource variable argument to the computation to be updated, and `type` is
+//   the type of the final output.
+Status BuildComputation(
+    const std::vector<XlaCompiler::Argument>& args,
+    const std::vector<XlaExpression>& retvals,
+    const std::map<int, int>& arg_cores, const std::map<int, int>& retval_cores,
+    const std::vector<std::unique_ptr<XlaResource>>& resources,
+    std::unique_ptr<xla::XlaOp> token_output,
+    const XlaCompiler::ShapeRepresentationFn& shape_representation_fn,
+    bool return_updated_values_for_all_resources, bool always_return_tuple,
+    xla::XlaBuilder* builder, xla::XlaComputation* computation,
+    int* num_computation_outputs, int* num_nonconst_outputs,
+    std::vector<XlaCompiler::OutputDescription>* outputs,
+    std::vector<XlaCompiler::ResourceUpdate>* resource_updates,
+    xla::Shape* output_shape) {
+  // Attach a common operator name as metadata. This has no semantic effect — it
+  // merely makes the HLO graph more readable when visualized via TensorBoard,
+  // since TensorBoard forms groups out of operators with similar names.
+  xla::OpMetadata retval_metadata;
+  retval_metadata.set_op_name("XLA_Retvals");
+  builder->SetOpMetadata(retval_metadata);
+  auto cleanup = gtl::MakeCleanup([builder]() { builder->ClearOpMetadata(); });
+
+  // Builds a no-op XLA computation. We need to set the sharding of outputs, but
+  // cannot change the sharding of the existing output op. To do this, we build
+  // a new identity op to which shardings can be applied.
+  auto identity_op = [builder](xla::XlaOp op) {
+    return xla::GetTupleElement(xla::Tuple(builder, {op}), 0);
+  };
+
+  std::vector<xla::XlaOp> elems;
+  elems.reserve(retvals.size());
+
+  // Keeps track of which retvals have layout to update. The first element is
+  // the output index, second element is the new layout.
+  std::vector<std::pair<int64, xla::Layout>> retval_to_update_layout;
+  for (int i = 0; i < retvals.size(); ++i) {
+    XlaCompiler::OutputDescription& output = (*outputs)[i];
+    const XlaExpression& retval = retvals[i];
+    output.type = retval.dtype();
+    switch (retval.kind()) {
+      case XlaExpression::Kind::kConstant:
+        output.is_constant = true;
+        output.constant_value = retval.constant_value();
+        output.shape = output.constant_value.shape();
+        break;
+
+      case XlaExpression::Kind::kXlaOp: {
+        output.is_constant = false;
+        TF_ASSIGN_OR_RETURN(output.shape, retval.GetShape());
+        xla::XlaOp value = retval.handle();
+        auto it = retval_cores.find(i);
+        xla::XlaScopedShardingAssignment assign_sharding(
+            builder, it == retval_cores.end()
+                         ? absl::optional<xla::OpSharding>()
+                         : xla::sharding_builder::AssignDevice(it->second));
+        if (shape_representation_fn) {
+          // If there is a shape representation function, reshape the output
+          // tensor to the shape given by the representation shape function.
+          TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn(
+                                                    output.shape, output.type));
+          value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions()));
+          retval_to_update_layout.emplace_back(elems.size(), shape.layout());
+        } else if (it != retval_cores.end()) {
+          // Apply the sharding to the output, if there is a core assignment.
+          value = identity_op(value);
+        }
+
+        elems.push_back(value);
+        break;
+      }
+
+      case XlaExpression::Kind::kResource:
+        output.is_constant = false;
+        output.input_index = retval.resource()->arg_num();
+        output.shape = retval.resource()->shape();
+        break;
+
+      case XlaExpression::Kind::kInvalid:
+        return errors::InvalidArgument(
+            "Invalid expression returned by computation. "
+            "This probably means a return value was not set.");
+    }
+  }
+  *num_nonconst_outputs = elems.size();
+
+  // Add return values for resources whose values have changed.
+  std::vector<const XlaResource*> arg_resources;
+  arg_resources.reserve(resources.size());
+  for (const auto& resource : resources) {
+    if (resource->arg_num() >= 0) {
+      arg_resources.push_back(resource.get());
+    }
+  }
+  std::sort(arg_resources.begin(), arg_resources.end(),
+            [](const XlaResource* a, const XlaResource* b) {
+              return a->arg_num() < b->arg_num();
+            });
+
+  for (const XlaResource* resource : arg_resources) {
+    DCHECK_LT(resource->arg_num(), args.size());
+    const XlaCompiler::Argument& arg = args[resource->arg_num()];
+    auto it = arg_cores.find(resource->arg_num());
+    const int core = it == arg_cores.end() ? -1 : it->second;
+    bool modified = !resource->value().IsIdenticalTo(resource->initial_value());
+    // TensorArray gradients were modified if their values changed or there are
+    // any newly created gradients.
+    for (const auto& grad : resource->tensor_array_gradients()) {
+      modified =
+          modified ||
+          !grad.second->value().IsIdenticalTo(grad.second->initial_value()) ||
+          arg.tensor_array_gradients.count(grad.first) == 0;
+    }
+    if (return_updated_values_for_all_resources || modified) {
+      resource_updates->emplace_back();
+      XlaCompiler::ResourceUpdate& update = resource_updates->back();
+      update.input_index = resource->arg_num();
+      update.type = resource->type();
+      update.shape = resource->shape();
+      update.modified = modified;
+      for (const auto& grad : resource->tensor_array_gradients()) {
+        update.tensor_array_gradients_accessed.insert(grad.first);
+      }
+
+      // Request that the value be returned on a specific core.
+      xla::XlaScopedShardingAssignment assign_sharding(
+          builder, core == -1 ? absl::optional<xla::OpSharding>()
+                              : xla::sharding_builder::AssignDevice(core));
+
+      xla::XlaOp handle;
+      TF_RETURN_IF_ERROR(resource->Pack(&handle, builder));
+
+      // Ensures the correct sharding is applied to the output.
+      handle = identity_op(handle);
+
+      elems.push_back(handle);
+    }
+  }
+
+  // If we have token output, append it as the last one.
+  if (token_output) {
+    elems.push_back(*token_output);
+  }
+
+  *num_computation_outputs = elems.size();
+
+  // Builds the XLA computation. We *always* form a tuple here to ensure that
+  // the output value is the last thing added into the XLA computation, even
+  // if there is only one output value.
+  auto tuple = xla::Tuple(builder, elems);
+  if (!always_return_tuple && elems.size() == 1) {
+    xla::GetTupleElement(tuple, 0);
+  }
+
+  xla::StatusOr<xla::XlaComputation> computation_status = builder->Build();
+  if (!computation_status.ok()) {
+    return computation_status.status();
+  }
+  *computation = computation_status.ConsumeValueOrDie();
+
+  TF_ASSIGN_OR_RETURN(const auto& program_shape,
+                      computation->GetProgramShape());
+  *output_shape = program_shape.result();
+  // Update the output layout to the layout of retval.
+  for (auto& update : retval_to_update_layout) {
+    if (!always_return_tuple && elems.size() == 1) {
+      *output_shape->mutable_layout() = update.second;
+      continue;
+    }
+
+    xla::Shape* output_sub_shape =
+        xla::ShapeUtil::GetMutableSubshape(output_shape, {update.first});
+    *output_sub_shape->mutable_layout() = update.second;
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 bool XlaCompiler::Argument::operator==(
     const XlaCompiler::Argument& other) const {
-  if (std::tie(kind, resource_kind, type, name, initialized, tensor_array_size,
+  if (std::tie(kind, resource_kind, type, name, initialized, max_array_size,
                tensor_array_gradients) !=
       std::tie(other.kind, other.resource_kind, other.type, other.name,
-               other.initialized, other.tensor_array_size,
+               other.initialized, other.max_array_size,
                other.tensor_array_gradients)) {
     return false;
   }
@@ -83,12 +342,45 @@ bool XlaCompiler::Argument::operator==(
   return constant_value.tensor_data() == other.constant_value.tensor_data();
 }
 
+string XlaCompiler::Argument::HumanString() const {
+  string common;
+  if (!name.empty()) {
+    common = absl::StrCat(" name=", name);
+  }
+  absl::StrAppend(&common, " type=", DataTypeString(type),
+                  " shape=", shape.DebugString());
+  switch (kind) {
+    case kInvalid:
+      return "invalid";
+    case kConstant:
+      return absl::StrCat("kind=constant", common,
+                          " value=", constant_value.DebugString());
+    case kResource: {
+      string output = absl::StrCat("kind=resource", common, " resource_kind=",
+                                   XlaResource::KindToString(resource_kind),
+                                   " initialized=", initialized);
+      if (max_array_size >= 0) {
+        absl::StrAppend(&output, " max_array_size=", max_array_size);
+      }
+      if (!tensor_array_gradients.empty()) {
+        absl::StrAppend(&output, " tensor_array_gradients=",
+                        absl::StrJoin(tensor_array_gradients, ","));
+      }
+      return output;
+    }
+    case kParameter:
+      return absl::StrCat("kind=parameter", common);
+    case kToken:
+      return absl::StrCat("token", common);
+  }
+}
+
 XlaCompiler::XlaCompiler(XlaCompiler::Options options)
     : options_(options),
       initialization_status_(Status::OK()),
       next_step_id_(1),
       device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)),
-      device_mgr_({device_}) {
+      device_mgr_(absl::WrapUnique(device_)) {
   CHECK(!options_.device_type.type_string().empty());
   if (options_.populate_resource_manager) {
     initialization_status_ =
@@ -110,8 +402,13 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options)
 
   // The default shape representation function is the identity.
   if (!options_.shape_representation_fn) {
-    options_.shape_representation_fn = [](const TensorShape& shape,
-                                          DataType type) { return shape; };
+    options_.shape_representation_fn =
+        [](const TensorShape& shape,
+           DataType dtype) -> xla::StatusOr<xla::Shape> {
+      xla::Shape xla_shape;
+      TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &xla_shape));
+      return xla_shape;
+    };
   }
 }
 
@@ -171,15 +468,16 @@ std::unique_ptr<Graph> XlaCompiler::GetGraph(const FunctionBody* fbody) {
   return graph;
 }
 
-Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
-                                    const NameAttrList& function,
-                                    std::vector<XlaCompiler::Argument> args,
-                                    XlaCompiler::CompilationResult* result) {
+Status XlaCompiler::CompileFunction(
+    const XlaCompiler::CompileOptions& options, const NameAttrList& function,
+    absl::Span<const XlaCompiler::Argument> args,
+    XlaCompiler::CompilationResult* result) {
   const string function_id =
       Canonicalize(function.name(), AttrSlice(&function.attr()));
   VLOG(1) << "XlaCompiler::CompileFunction " << function_id;
 
-  auto it = cache_.find({function_id, args});
+  const std::vector<XlaCompiler::Argument> arg_vector(args.begin(), args.end());
+  auto it = cache_.find({function_id, arg_vector});
   if (it != cache_.end()) {
     *result = it->second;
     return Status::OK();
@@ -212,14 +510,16 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
   // lowest-numbered core that consumes the argument. We choose the
   // lowest-numbered core so the assignment is deterministic.
   for (Node* n : graph->nodes()) {
-    if (absl::string_view(n->type_string()) == "_Arg") {
+    if (absl::string_view(n->type_string()) ==
+        FunctionLibraryDefinition::kArgOp) {
       TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/true));
     }
   }
   // Do _Retval as a second loop, in case the retval's input is an _Arg (which
   // may have gotten a device assignment from the first loop).
   for (Node* n : graph->nodes()) {
-    if (absl::string_view(n->type_string()) == "_Retval") {
+    if (absl::string_view(n->type_string()) ==
+        FunctionLibraryDefinition::kRetOp) {
       TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/false));
     }
   }
@@ -235,7 +535,7 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
       CompileGraph(options, function_id, std::move(graph), args, result));
   VLOG(1) << "====================================================";
 
-  cache_[{function_id, args}] = *result;
+  cache_[{function_id, arg_vector}] = *result;
   return Status::OK();
 }
 
@@ -247,33 +547,32 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
     case XlaCompiler::Argument::kConstant:
       LOG(FATAL) << "Unreachable case";
     case XlaCompiler::Argument::kParameter: {
-      TensorShape shape;
       if (is_entry_computation) {
         TF_ASSIGN_OR_RETURN(
-            shape, options_.shape_representation_fn(arg.shape, arg.type));
+            *xla_shape, options_.shape_representation_fn(arg.shape, arg.type));
       } else {
-        shape = arg.shape;
+        TF_RETURN_IF_ERROR(
+            TensorShapeToXLAShape(arg.type, arg.shape, xla_shape));
       }
-      return TensorShapeToXLAShape(arg.type, shape, xla_shape);
+      return Status::OK();
     }
     case XlaCompiler::Argument::kResource: {
       TF_RET_CHECK(arg.initialized);
 
       switch (arg.resource_kind) {
         case XlaResource::kVariable: {
-          TF_ASSIGN_OR_RETURN(
-              TensorShape representation_shape,
-              options_.shape_representation_fn(arg.shape, arg.type));
-          return TensorShapeToXLAShape(arg.type, representation_shape,
-                                       xla_shape);
+          TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn(
+                                              arg.shape, arg.type));
+
+          return Status::OK();
         }
         case XlaResource::kTensorArray: {
-          if (arg.tensor_array_size < 0) {
+          if (arg.max_array_size < 0) {
             return errors::InvalidArgument(
-                "Negative tensor_array_size in XLAShapeForArgument");
+                "Negative max_array_size in XLAShapeForArgument");
           }
           TensorShape shape;
-          shape.AddDim(arg.tensor_array_size);
+          shape.AddDim(arg.max_array_size);
           shape.AppendShape(arg.shape);
           TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape));
 
@@ -285,12 +584,12 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
           return Status::OK();
         }
         case XlaResource::kStack: {
-          if (arg.tensor_array_size < 0) {
+          if (arg.max_array_size < 0) {
             return errors::InvalidArgument(
-                "Negative tensor_array_size in XLAShapeForArgument");
+                "Negative max_array_size in XLAShapeForArgument");
           }
           TensorShape shape;
-          shape.AddDim(arg.tensor_array_size);
+          shape.AddDim(arg.max_array_size);
           shape.AppendShape(arg.shape);
           xla::Shape buffer_shape;
           TF_RETURN_IF_ERROR(
@@ -314,169 +613,16 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
   }
 }
 
-namespace {
-
-Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
-                    XlaCompilationDevice* device, FunctionLibraryRuntime* flib,
-                    int64 step_id) {
-  // Resource cleanup is a bit messy. XlaContext is a ref-countd resource; the
-  // resource manager takes ownership via Create, and unrefs via Cleanup.  We
-  // explicitly add a reference to ensure the refcount at entry is maintained at
-  // all exit points; Create and Cleanup are always called in this function.
-  //
-  // The Executor requires us to use ScopedStepContainer. We wrap it in a
-  // unique_ptr so we can capture the cleanup status in the end.
-  xla_context->Ref();
-  Status status;
-  auto step_container = absl::make_unique<ScopedStepContainer>(
-      step_id, [&status, device](const string& name) {
-        status = device->resource_manager()->Cleanup(name);
-      });
-  TF_RETURN_IF_ERROR(device->resource_manager()->Create(
-      step_container->name(), XlaContext::kXlaContextResourceName,
-      xla_context));
-
-  GraphCompiler graph_compiler(device, graph.get(), flib, step_container.get());
-  TF_RETURN_IF_ERROR(graph_compiler.Compile());
-  // Explicitly clean up the step container, to capture the cleanup status.
-  step_container.reset();
-  return Status::OK();
-}
-
-// Builds the XLA computation.
-// `args` is the list of input arguments, `retvals` is the list of retvals
-// produced by _Retval operators, in index order.
-// If `return_updated_values_for_all_resources` is true, all resources will be
-// included in `resource_updates`, regardless of whether their value changed.
-// Sets `*num_nonconst_outputs` to the number of outputs of the `computation`.
-// Sets `*resource_updates` to a description of resources whose values are
-// written by the computation; the variable writes are the last
-// `resource_updates.size()` return values from the computation. Each entry in
-// `resource_updates` is a (input_index, type) pair, where `input_index` is the
-// index of a resource variable argument to the computation, and `type` is the
-// type of the final output.
-Status BuildComputation(
-    const std::vector<XlaCompiler::Argument>& args,
-    const std::vector<int>& arg_cores,
-    const std::vector<XlaContext::Retval>& retvals,
-    const std::vector<std::unique_ptr<XlaResource>>& resources,
-    bool return_updated_values_for_all_resources, bool always_return_tuple,
-    xla::XlaBuilder* builder, xla::XlaComputation* computation,
-    int* num_computation_outputs, int* num_nonconst_outputs,
-    std::vector<XlaCompiler::OutputDescription>* outputs,
-    std::vector<XlaCompiler::ResourceUpdate>* resource_updates) {
-  std::vector<xla::XlaOp> elems;
-  elems.reserve(retvals.size());
-  for (int i = 0; i < retvals.size(); ++i) {
-    XlaCompiler::OutputDescription& output = (*outputs)[i];
-    output.type = retvals[i].type;
-    output.shape = retvals[i].shape;
-    const XlaExpression& retval = retvals[i].expression;
-    if (retval.has_constant_value()) {
-      output.is_constant = true;
-      output.constant_value = retval.constant_value();
-    } else if (retval.resource() != nullptr) {
-      output.is_constant = false;
-      output.input_index = retval.resource()->arg_num();
-    } else {
-      output.is_constant = false;
-      elems.push_back(retval.handle());
-    }
-  }
-  *num_nonconst_outputs = elems.size();
-
-  // Add return values for resources whose values have changed.
-  std::vector<const XlaResource*> arg_resources;
-  arg_resources.reserve(resources.size());
-  for (const auto& resource : resources) {
-    if (resource->arg_num() >= 0) {
-      arg_resources.push_back(resource.get());
-    }
-  }
-  std::sort(arg_resources.begin(), arg_resources.end(),
-            [](const XlaResource* a, const XlaResource* b) {
-              return a->arg_num() < b->arg_num();
-            });
-
-  // Attach a common operator name as metadata. This has no semantic effect — it
-  // merely makes the HLO graph more readable when visualized via TensorBoard,
-  // since TensorBoard forms groups out of operators with similar names.
-  xla::OpMetadata retval_metadata;
-  retval_metadata.set_op_name("XLA_Retvals");
-  builder->SetOpMetadata(retval_metadata);
-
-  for (const XlaResource* resource : arg_resources) {
-    const XlaCompiler::Argument& arg = args[resource->arg_num()];
-    const int core = arg_cores[resource->arg_num()];
-    DCHECK_LT(resource->arg_num(), arg_cores.size());
-    bool modified = !resource->value().IsIdenticalTo(resource->initial_value());
-    // TensorArray gradients were modified if their values changed or there are
-    // any newly created gradients.
-    for (const auto& grad : resource->tensor_array_gradients()) {
-      modified =
-          modified ||
-          !grad.second->value().IsIdenticalTo(grad.second->initial_value()) ||
-          arg.tensor_array_gradients.count(grad.first) == 0;
-    }
-    if (return_updated_values_for_all_resources || modified) {
-      resource_updates->emplace_back();
-      XlaCompiler::ResourceUpdate& update = resource_updates->back();
-      update.input_index = resource->arg_num();
-      update.type = resource->type();
-      update.shape = resource->shape();
-      update.modified = modified;
-      for (const auto& grad : resource->tensor_array_gradients()) {
-        update.tensor_array_gradients_accessed.insert(grad.first);
-      }
-
-      // Request that the value be returned on a specific core.
-      xla::XlaScopedShardingAssignment assign_sharding(
-          builder, core == -1 ? absl::optional<xla::OpSharding>()
-                              : xla::sharding_builder::AssignDevice(core));
-
-      xla::XlaOp handle;
-      TF_RETURN_IF_ERROR(resource->Pack(&handle, builder));
-
-      // Since we can't change the sharding metadata of <value> as this point,
-      // create a tuple/get-tuple-element combination so that sharding
-      // assignment will be placed on this value, which will cause the resource
-      // update to be returned from the same device that provided the resource.
-      handle = xla::GetTupleElement(xla::Tuple(builder, {handle}), 0);
-      elems.push_back(handle);
-    }
-  }
-
-  *num_computation_outputs = elems.size();
-
-  // Builds the XLA computation. We *always* form a tuple here to ensure that
-  // the output value is the last thing added into the XLA computation, even
-  // if there is only one output value.
-  auto tuple = xla::Tuple(builder, elems);
-  if (!always_return_tuple && elems.size() == 1) {
-    xla::GetTupleElement(tuple, 0);
-  }
-  builder->ClearOpMetadata();
-
-  xla::StatusOr<xla::XlaComputation> computation_status = builder->Build();
-  if (!computation_status.ok()) {
-    return computation_status.status();
-  }
-  *computation = computation_status.ConsumeValueOrDie();
-  return Status::OK();
-}
-
-}  // namespace
-
 // Builds XLA computations for each of the arguments to the computation.
 // `args` are the arguments to the computation.
 Status XlaCompiler::BuildArguments(
     const Graph& graph, const std::vector<XlaCompiler::Argument>& args,
     bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context,
-    std::vector<int>* arg_cores, std::vector<XlaExpression>* arg_expressions,
+    const std::map<int, int>& arg_cores,
+    std::vector<XlaExpression>* arg_expressions,
     std::vector<int>* input_mapping, std::vector<xla::Shape>* input_shapes,
     bool is_entry_computation) {
   arg_expressions->resize(args.size());
-  *arg_cores = std::vector<int>(args.size(), -1);
 
   // Argument numbers of arguments and resources that are to be passed to the
   // XLA computation as runtime parameters.
@@ -489,28 +635,30 @@ Status XlaCompiler::BuildArguments(
     const XlaCompiler::Argument& arg = args[i];
     XlaExpression& arg_expression = (*arg_expressions)[i];
     switch (arg.kind) {
-      case XlaCompiler::Argument::kResource:
+      case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.resource_kind != XlaResource::kInvalid);
         // TODO(phawkins): this code assumes that resource arguments do not
         // alias.
-        XlaResource* resource;
-        TF_RETURN_IF_ERROR(context->CreateResource(
-            arg.resource_kind, i, arg.name, arg.type, arg.shape, xla::XlaOp(),
-            /*tensor_array_size=*/arg.tensor_array_size,
-            /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource));
-        arg_expression.set_resource(resource);
+        XlaResource* resource =
+            context->AddResource(absl::make_unique<XlaResource>(
+                arg.resource_kind, i, arg.name, arg.type, arg.shape,
+                xla::XlaOp(),
+                /*max_array_size=*/arg.max_array_size,
+                /*tensor_array_gradients=*/arg.tensor_array_gradients,
+                /*tensor_array_multiple_writes_aggregate=*/true));
+        arg_expression = XlaExpression::Resource(resource);
         if (arg.initialized) {
           input_mapping->push_back(i);
         }
-
         break;
+      }
       case XlaCompiler::Argument::kParameter:
       case XlaCompiler::Argument::kToken: {
         input_mapping->push_back(i);
         break;
       }
       case XlaCompiler::Argument::kConstant:
-        arg_expression.set_constant_value(arg.constant_value);
+        arg_expression = XlaExpression::Constant(arg.constant_value);
         break;
       case XlaCompiler::Argument::kInvalid:
         return errors::Internal(
@@ -535,26 +683,6 @@ Status XlaCompiler::BuildArguments(
     *input_shapes = arg_shapes;
   }
 
-  // Use the _Arg nodes in the graph to resolve core assignments.
-  for (const Node* n : graph.nodes()) {
-    if (absl::string_view(n->type_string()) != "_Arg") continue;
-    int index;
-    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
-    TF_RET_CHECK(index >= 0 && index < args.size())
-        << "_Arg out of bounds: " << index << " vs " << args.size();
-    TF_ASSIGN_OR_RETURN(
-        auto sharding,
-        ParseShardingFromDevice(*n, std::numeric_limits<int32>::max()));
-    if (sharding.has_value()) {
-      TF_RET_CHECK(sharding.value().type() ==
-                   xla::OpSharding::Type::OpSharding_Type_MAXIMAL);
-      const int core = sharding.value().tile_assignment_devices(0);
-      if ((*arg_cores)[index] == -1 || core < (*arg_cores)[index]) {
-        (*arg_cores)[index] = core;
-      }
-    }
-  }
-
   // Attach a common operator name as metadata. This has no semantic effect — it
   // merely makes the HLO graph more readable when visualized via TensorBoard,
   // since TensorBoard forms groups out of operators with similar names.
@@ -570,11 +698,10 @@ Status XlaCompiler::BuildArguments(
       xla::OpSharding tuple_sharding;
       tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE);
       for (int64 parameter : *input_mapping) {
-        const int core = (*arg_cores)[parameter];
-        const int root_device = 0;
+        auto it = arg_cores.find(parameter);
+        const int core = it == arg_cores.end() ? 0 : it->second;
         *tuple_sharding.add_tuple_shardings() =
-            core == -1 ? xla::sharding_builder::AssignDevice(root_device)
-                       : xla::sharding_builder::AssignDevice(core);
+            xla::sharding_builder::AssignDevice(core);
       }
       xla::XlaScopedShardingAssignment assign_tuple_sharding(builder,
                                                              tuple_sharding);
@@ -583,7 +710,8 @@ Status XlaCompiler::BuildArguments(
       tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple");
     }
     for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
-      const int core = (*arg_cores)[input_mapping->at(i)];
+      auto it = arg_cores.find(i);
+      const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
           builder, core == -1 ? absl::optional<xla::OpSharding>()
                               : xla::sharding_builder::AssignDevice(core));
@@ -591,7 +719,8 @@ Status XlaCompiler::BuildArguments(
     }
   } else {
     for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
-      const int core = (*arg_cores)[input_mapping->at(i)];
+      auto it = arg_cores.find(i);
+      const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
           builder, core == -1 ? absl::optional<xla::OpSharding>()
                               : xla::sharding_builder::AssignDevice(core));
@@ -626,14 +755,14 @@ Status XlaCompiler::BuildArguments(
         // TODO(b/76097077): propagate device assignments onto arguments and
         // return values of functions, and then reshape unconditionally.
         if (is_entry_computation) {
-          arg_expression.set_handle(
-              xla::Reshape(arg_handles[i], arg.shape.dim_sizes()));
+          arg_expression = XlaExpression::XlaOp(
+              xla::Reshape(arg_handles[i], arg.shape.dim_sizes()), arg.type);
         } else {
-          arg_expression.set_handle(arg_handles[i]);
+          arg_expression = XlaExpression::XlaOp(arg_handles[i], arg.type);
         }
         break;
       case XlaCompiler::Argument::kToken: {
-        arg_expression.set_handle(arg_handles[i]);
+        arg_expression = XlaExpression::XlaOp(arg_handles[i], arg.type);
         break;
       }
       case XlaCompiler::Argument::kConstant:
@@ -647,46 +776,48 @@ Status XlaCompiler::BuildArguments(
 }
 
 Status XlaCompiler::CompileSingleOp(
-    const XlaCompiler::CompileOptions& options, string const& name,
-    OpKernelContext* ctx, const std::vector<XlaCompiler::Argument>& args,
-    CompilationResult* result) {
+    const XlaCompiler::CompileOptions& options, const NodeDef& node_def,
+    absl::Span<const XlaCompiler::Argument> args,
+    absl::Span<const DataType> result_types, CompilationResult* result) {
   // TODO(b/74182462): We implement this by creating a new dummy Graph including
   // _Arg nodes, and let CompileGraph walk it. This could be optimized.
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
 
   Status status;
   // First create the actual node we care about computing.
-  Node* main_node = graph->AddNode(ctx->op_kernel().def(), &status);
+  Node* main_node = graph->AddNode(node_def, &status);
   TF_RETURN_IF_ERROR(status);
 
   // Create dummy _Arg nodes. Link these to `node` and also via a control
   // dependency edge to the _SOURCE node.
-  for (int64 i = 0; i < ctx->num_inputs(); ++i) {
+  for (int64 i = 0; i < args.size(); ++i) {
     Node* node;
-    string name = absl::StrCat(ctx->op_kernel().name(), "_", i, "_arg");
-    Status status = NodeBuilder(name, "_Arg")
-                        .ControlInput(graph->source_node())
-                        .Attr("T", ctx->input_dtype(i))
-                        .Attr("index", i)
-                        .Finalize(graph.get(), &node);
+    string arg_name = absl::StrCat("_arg", i);
+    Status status =
+        NodeBuilder(arg_name, FunctionLibraryDefinition::kArgOp)
+            .ControlInput(graph->source_node())
+            .Attr("T", args[i].kind == Argument::kResource ? DT_RESOURCE
+                                                           : args[i].type)
+            .Attr("index", i)
+            .Finalize(graph.get(), &node);
     TF_RETURN_IF_ERROR(status);
     graph->AddEdge(node, 0, main_node, i);
   }
 
   // Similarly with return values, create dummy _Retval nodes fed by `node`.
-  for (int64 i = 0; i < ctx->num_outputs(); ++i) {
+  for (int64 i = 0; i < result_types.size(); ++i) {
     Node* node;
-    string name = absl::StrCat(ctx->op_kernel().name(), "_", i, "_retval");
-    Status status = NodeBuilder(name, "_Retval")
+    string retval_name = absl::StrCat("_retval", i);
+    Status status = NodeBuilder(retval_name, FunctionLibraryDefinition::kRetOp)
                         .Input(main_node, i)
-                        .Attr("T", ctx->expected_output_dtype(i))
+                        .Attr("T", result_types[i])
                         .Attr("index", i)
                         .Finalize(graph.get(), &node);
     TF_RETURN_IF_ERROR(status);
   }
   FixupSourceAndSinkEdges(graph.get());
 
-  return CompileGraph(options, name, std::move(graph), args, result);
+  return CompileGraph(options, node_def.name(), std::move(graph), args, result);
 }
 
 namespace {
@@ -741,15 +872,43 @@ Status ValidateGraph(const Graph* graph,
   return Status::OK();
 }
 
+// Converts the value of any expressions whose values are known at compile-time
+// to constants.
+Status ResolveConstantExpressionsToConstants(
+    xla::Client* client, absl::Span<XlaExpression> expressions) {
+  for (XlaExpression& expression : expressions) {
+    if (expression.kind() == XlaExpression::Kind::kXlaOp) {
+      TF_ASSIGN_OR_RETURN(absl::optional<Tensor> constant,
+                          expression.ResolveConstant(client));
+      if (constant.has_value()) {
+        expression = XlaExpression::Constant(*constant);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+void ConvertConstantsToExpressions(xla::XlaBuilder* builder,
+                                   absl::Span<XlaExpression> expressions) {
+  for (XlaExpression& expression : expressions) {
+    if (expression.kind() == XlaExpression::Kind::kConstant) {
+      expression =
+          XlaExpression::XlaOp(expression.AsXlaOp(builder), expression.dtype());
+    }
+  }
+}
+
 }  // namespace
 
 Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                  string const& name,
                                  std::unique_ptr<Graph> graph,
-                                 const std::vector<XlaCompiler::Argument>& args,
+                                 absl::Span<const XlaCompiler::Argument> args,
                                  CompilationResult* result) {
   VLOG(1) << "Executing graph symbolically to populate XlaBuilder.";
 
+  TF_RETURN_IF_ERROR(PropagateConstIntoFunctionalNodes(
+      graph.get(), options_.flib_def, local_flib_def_.get()));
   if (VLOG_IS_ON(2)) {
     VLOG(2) << "XlaCompiler::CompileGraph: "
             << dump_graph::DumpGraphToFile(
@@ -766,14 +925,12 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                    options_.device_type, name));
 
   xla::XlaBuilder builder(name);
-  XlaContext* context = new XlaContext(
-      this, &builder, options_.allow_cpu_custom_calls,
-      options.resolve_compile_time_constants, options.is_entry_computation,
-      &options_.shape_representation_fn);
+  XlaContext* context = new XlaContext(this, &builder);
   core::ScopedUnref context_unref(context);
 
-  std::vector<XlaCompiler::Argument> real_args(args);
+  std::vector<XlaCompiler::Argument> real_args(args.begin(), args.end());
   int token_input_index = -1;
+  std::unique_ptr<xla::XlaOp> token_output;
   if (options.add_token_input_output) {
     // Add extra token input.
     token_input_index = real_args.size();
@@ -783,10 +940,14 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
     real_args.push_back(token_arg);
   }
 
+  std::map<int, int> arg_cores;
+  std::map<int, int> retval_cores;
+  TF_ASSIGN_OR_RETURN(std::tie(arg_cores, retval_cores),
+                      ComputeArgAndRetvalCores(*graph));
+
   std::vector<XlaExpression> arg_expressions;
-  std::vector<int> arg_cores;
   TF_RETURN_IF_ERROR(BuildArguments(
-      *graph, real_args, options.use_tuple_arg, &builder, context, &arg_cores,
+      *graph, real_args, options.use_tuple_arg, &builder, context, arg_cores,
       &arg_expressions, &result->input_mapping, &result->xla_input_shapes,
       options.is_entry_computation));
   context->set_args(std::move(arg_expressions));
@@ -826,8 +987,7 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       TF_RETURN_IF_ERROR(token_or.status());
       token_inputs.push_back(token_or.ValueOrDie());
     }
-    TF_RETURN_IF_ERROR(
-        context->AppendTokenRetval(xla::AfterAll(&builder, token_inputs)));
+    token_output.reset(new xla::XlaOp(xla::AfterAll(&builder, token_inputs)));
   }
   TF_RETURN_IF_ERROR(PopNodeTokenMapping());
 
@@ -835,28 +995,27 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   int num_computation_outputs;
   result->computation = std::make_shared<xla::XlaComputation>();
   result->outputs.resize(context->retvals().size());
+  std::vector<XlaExpression> retvals = context->retvals();
+  if (options.resolve_compile_time_constants) {
+    TF_RETURN_IF_ERROR(ResolveConstantExpressionsToConstants(
+        client(), absl::Span<XlaExpression>(retvals)));
+  } else {
+    ConvertConstantsToExpressions(&builder, absl::Span<XlaExpression>(retvals));
+  }
   TF_RETURN_IF_ERROR(BuildComputation(
-      real_args, arg_cores, context->retvals(), context->resources(),
+      real_args, retvals, arg_cores, retval_cores, context->resources(),
+      std::move(token_output),
+      options.is_entry_computation ? options_.shape_representation_fn
+                                   : ShapeRepresentationFn{},
       options.return_updated_values_for_all_resources,
       options.always_return_tuple, &builder, result->computation.get(),
       &num_computation_outputs, &num_nonconst_outputs, &result->outputs,
-      &result->resource_updates));
+      &result->resource_updates, &result->xla_output_shape));
 
   VLOG(2) << "Outputs: total: " << context->retvals().size()
           << " nonconstant: " << num_nonconst_outputs;
-
-  // Compute the XLA output shape, if there is a computation with non-constant
-  // outputs.
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::ProgramShape> computation_shape,
-                      client()->GetComputationShape(*result->computation));
-
-  result->xla_output_shape.Swap(computation_shape->mutable_result());
   VLOG(2) << "XLA output shape: "
-          << xla::ShapeUtil::HumanString(result->xla_output_shape);
-
-  // Tensorflow expects a major-to-minor order of results.
-  xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape);
-
+          << xla::ShapeUtil::HumanStringWithLayout(result->xla_output_shape);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 2cc603a58016a509fafdf6f95423dd6c0864cce3..0d801b73a8c2651305328384377751254ecaa41d 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -18,10 +18,13 @@ limitations under the License.
 
 #include <stack>
 
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/device.h"
@@ -118,7 +121,7 @@ class XlaCompiler {
 
     // The type of the argument. If the argument is a resource, this
     // is the type of the variable's value, not DT_RESOURCE.
-    DataType type;
+    DataType type = DT_INVALID;
 
     // The shape of the argument. For:
     // * a parameter: the shape of the parameter.
@@ -147,7 +150,7 @@ class XlaCompiler {
 
     // For a TensorArray or Stack resource, what is the array's declared size?
     // (Used for lazy initialization.)
-    int64 tensor_array_size = -1;
+    int64 max_array_size = -1;
 
     // TensorArray resource parameters are passed as (array, gradient array 0,
     // ..., gradient array k), where the gradient arrays are in the same order
@@ -155,6 +158,9 @@ class XlaCompiler {
     std::set<string> tensor_array_gradients;
 
     bool operator==(const Argument& other) const;
+
+    // Returns a human-readable summary of the argument.
+    string HumanString() const;
   };
 
   // Options pertaining to an individual call to CompileGraph() or
@@ -259,8 +265,7 @@ class XlaCompiler {
     std::shared_ptr<xla::XlaComputation> computation;
   };
 
-  typedef std::function<xla::StatusOr<TensorShape>(const TensorShape&,
-                                                   DataType)>
+  typedef std::function<xla::StatusOr<xla::Shape>(const TensorShape&, DataType)>
       ShapeRepresentationFn;
   struct Options {
     // Name of the compilation device to use. It must be set by the caller.
@@ -316,22 +321,23 @@ class XlaCompiler {
 
   Status CompileFunction(const CompileOptions& options,
                          const NameAttrList& fn_name_attrs,
-                         std::vector<Argument> args, CompilationResult* result);
+                         absl::Span<const Argument> args,
+                         CompilationResult* result);
 
   // Compiles a tensorflow::Graph into an xla::XlaComputation.
   // Similar to CompileFunction, but takes a Graph as input rather than a
   // function.
   Status CompileGraph(const CompileOptions& options, string const& name,
                       std::unique_ptr<Graph> graph,
-                      const std::vector<Argument>& args,
+                      absl::Span<const Argument> args,
                       CompilationResult* result);
 
-  // Compiles a single Op, given by an OpKernelContext, into an
+  // Compiles a single Op, given by `node_def`, into an
   // xla::XlaComputation. Similar to CompileFunction but takes a single Op as
   // input.
-  Status CompileSingleOp(const CompileOptions& options, string const& name,
-                         OpKernelContext* ctx,
-                         const std::vector<Argument>& args,
+  Status CompileSingleOp(const CompileOptions& options, const NodeDef& node_def,
+                         absl::Span<const Argument> args,
+                         absl::Span<const DataType> result_types,
                          CompilationResult* result);
 
   // Returns the shape of the XLA parameter for an argument 'arg'.
@@ -411,7 +417,8 @@ class XlaCompiler {
   Status BuildArguments(const Graph& graph,
                         const std::vector<XlaCompiler::Argument>& args,
                         bool use_tuple_arg, xla::XlaBuilder* builder,
-                        XlaContext* context, std::vector<int>* arg_cores,
+                        XlaContext* context,
+                        const std::map<int, int>& arg_cores,
                         std::vector<XlaExpression>* arg_expressions,
                         std::vector<int>* input_mapping,
                         std::vector<xla::Shape>* input_shapes,
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 72b17d04fc42eb00781e96b412465b73fb29a5c2..fe2a5f5b0c9ea6b5f2bb71df836fdcabf9a0cf23 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
@@ -354,8 +356,10 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
   EXPECT_TRUE(
       absl::StrContains(status.error_message(), "depends on a parameter"))
       << status.error_message();
-  EXPECT_TRUE(
-      absl::StrContains(status.error_message(), "[[{{node C}} = Reshape"))
+  EXPECT_TRUE(absl::StrContains(status.error_message(), "{{node C}}"))
+      << status.error_message();
+  EXPECT_TRUE(absl::StrContains(status.error_message(),
+                                "must be a compile-time constant"))
       << status.error_message();
 }
 
@@ -646,7 +650,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) {
   args[0].initialized = true;
   args[0].type = DT_INT32;
   args[0].shape = TensorShape({});
-  args[0].tensor_array_size = 2;
+  args[0].max_array_size = 2;
   args[0].tensor_array_gradients = {"grad2"};
 
   // Compiles the graph.
@@ -705,7 +709,7 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) {
   args[0].initialized = true;
   args[0].type = DT_INT32;
   args[0].shape = TensorShape({});
-  args[0].tensor_array_size = 2;
+  args[0].max_array_size = 2;
   args[0].tensor_array_gradients = {"grad1"};
 
   // Compiles the graph.
@@ -737,7 +741,7 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) {
   args[0].initialized = true;
   args[0].type = DT_INT32;
   args[0].shape = TensorShape({});
-  args[0].tensor_array_size = 2;
+  args[0].max_array_size = 2;
   args[0].tensor_array_gradients = {"grad1"};
 
   // Compiles the graph.
@@ -907,6 +911,82 @@ TEST_F(XlaCompilerTest, Variables) {
   RunAndCheckVariablesComputation(client_, result);
 }
 
+TEST_F(XlaCompilerTest, ResultLayoutSingle) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = ops::_Retval(scope.WithOpName("RET"), a, 0);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+
+  auto options = DefaultOptions();
+  // Sets the representation function to return a non-default layout.
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::Shape xla_shape;
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape));
+    *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1});
+    return xla_shape;
+  };
+
+  // Compiles the graph.
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompilationResult result;
+  auto compile_options = XlaCompiler::CompileOptions();
+  compile_options.always_return_tuple = false;
+  TF_ASSERT_OK(compiler.CompileGraph(compile_options, "id", std::move(graph),
+                                     args, &result));
+  EXPECT_TRUE(xla::ShapeUtil::Equal(
+      result.xla_output_shape,
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1})));
+}
+
+TEST_F(XlaCompilerTest, ResultLayoutMultiple) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = ops::_Retval(scope.WithOpName("RET1"), a, 0);
+  auto c = ops::_Retval(scope.WithOpName("RET2"), a, 1);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+
+  auto options = DefaultOptions();
+  // Sets the representation function to return a non-default layout.
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::Shape xla_shape;
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape));
+    *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1});
+    return xla_shape;
+  };
+
+  // Compiles the graph.
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "id",
+                                     std::move(graph), args, &result));
+  xla::Shape result_shape =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1});
+
+  EXPECT_TRUE(xla::ShapeUtil::Equal(
+      result.xla_output_shape,
+      xla::ShapeUtil::MakeTupleShape({result_shape, result_shape})));
+}
+
 // Tests a simple graph that reads and writes a variable.
 TEST_F(XlaCompilerTest, ReturnResourceHandleOnly) {
   Scope scope = Scope::NewRootScope().ExitOnError();
@@ -1016,9 +1096,11 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) {
 
   // Compiles the graph.
   XlaCompiler::Options options = DefaultOptions();
-  options.shape_representation_fn = [](const TensorShape& shape,
-                                       DataType type) {
-    return TensorShape({shape.num_elements()});
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::PrimitiveType ptype;
+    TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype));
+    return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()});
   };
   XlaCompiler compiler(options);
 
@@ -1084,9 +1166,11 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) {
 
   // Compiles the graph.
   XlaCompiler::Options options = DefaultOptions();
-  options.shape_representation_fn = [](const TensorShape& shape,
-                                       DataType type) {
-    return TensorShape({shape.num_elements()});
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::PrimitiveType ptype;
+    TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(type, &ptype));
+    return xla::ShapeUtil::MakeShape(ptype, {shape.num_elements()});
   };
   XlaCompiler compiler(options);
 
@@ -1256,23 +1340,30 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) {
   TF_ASSERT_OK(status);
   EXPECT_TRUE(FixupSourceAndSinkEdges(graph.get()));
 
-  const std::vector<XlaCompiler::Argument> empty_args;
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kResource;
+  args[0].resource_kind = XlaResource::kVariable;
+  args[0].initialized = true;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 2});
+
   {
     // The case for entry computation: we don't add token input/output. Instead,
     // we use CreateToken HLO to create the entry token.
     XlaCompiler::CompileOptions options;
     options.is_entry_computation = true;
     options.add_token_input_output = false;
+    options.return_updated_values_for_all_resources = true;
     XlaCompiler compiler(DefaultOptions());
 
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
     XlaCompiler::CompilationResult result;
     TF_ASSERT_OK(compiler.CompileGraph(options, "NoOp", std::move(graph_copy),
-                                       empty_args, &result));
-    EXPECT_EQ(result.xla_input_shapes.size(), 0);
+                                       args, &result));
+    EXPECT_EQ(result.xla_input_shapes.size(), 1);
     EXPECT_TRUE(xla::ShapeUtil::IsTuple(result.xla_output_shape));
-    EXPECT_EQ(xla::ShapeUtil::TupleElementCount(result.xla_output_shape), 0);
+    EXPECT_EQ(xla::ShapeUtil::TupleElementCount(result.xla_output_shape), 1);
   }
   {
     // The case for non-entry computation (e.g. while loop body). We add token
@@ -1280,19 +1371,20 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) {
     XlaCompiler::CompileOptions options;
     options.is_entry_computation = false;
     options.add_token_input_output = true;
+    options.return_updated_values_for_all_resources = true;
     XlaCompiler compiler(DefaultOptions());
 
     std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
     CopyGraph(*graph, graph_copy.get());
     XlaCompiler::CompilationResult result;
     TF_ASSERT_OK(compiler.CompileGraph(options, "NoOp", std::move(graph_copy),
-                                       empty_args, &result));
-    EXPECT_EQ(result.xla_input_shapes.size(), 1);
-    EXPECT_TRUE(xla::ShapeUtil::IsToken(result.xla_input_shapes[0]));
+                                       args, &result));
+    EXPECT_EQ(result.xla_input_shapes.size(), 2);
+    EXPECT_TRUE(xla::ShapeUtil::IsToken(result.xla_input_shapes[1]));
     EXPECT_TRUE(xla::ShapeUtil::IsTuple(result.xla_output_shape));
-    EXPECT_EQ(xla::ShapeUtil::TupleElementCount(result.xla_output_shape), 1);
+    EXPECT_EQ(xla::ShapeUtil::TupleElementCount(result.xla_output_shape), 2);
     EXPECT_TRUE(xla::ShapeUtil::IsToken(
-        xla::ShapeUtil::GetTupleElementShape(result.xla_output_shape, 0)));
+        xla::ShapeUtil::GetTupleElementShape(result.xla_output_shape, 1)));
   }
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index f247570d72c0287a33695de3d778cce2a2418921..a69af70503376b6c0905deb8980abdc3254a6e47 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -54,98 +54,25 @@ const char XlaContext::kXlaContextResourceName[] = "_xla_context";
   return *context;
 }
 
-/* static */ XlaContext& XlaContext::Get(const XlaOpKernelContext* ctx) {
-  return Get(ctx->op_kernel_context());
-}
-
 void XlaContext::set_args(std::vector<XlaExpression> args) {
   args_ = std::move(args);
 }
 
-XlaContext::XlaContext(
-    XlaCompiler* compiler, xla::XlaBuilder* builder,
-    bool allow_cpu_custom_calls, bool resolve_compile_time_constants,
-    bool is_entry_computation,
-    const std::function<xla::StatusOr<TensorShape>(
-        const TensorShape&, DataType)>* shape_representation_fn)
-    : compiler_(compiler),
-      builder_(builder),
-      allow_cpu_custom_calls_(allow_cpu_custom_calls),
-      resolve_compile_time_constants_(resolve_compile_time_constants),
-      is_entry_computation_(is_entry_computation),
-      shape_representation_fn_(shape_representation_fn) {}
-
-string XlaContext::DebugString() { return "TLA JIT context"; }
-
-// This is called by the Retval Op to associate a computed value
-// with a specific return value of the subgraph.
-void XlaContext::AddRetval(int retval_index, DataType type,
-                           const TensorShape& shape, const xla::XlaOp& handle) {
-  VLOG(1) << "Added retval index " << retval_index << " to XLA computation";
-  // Add the return value to the list being built up.
-  if (retvals_.size() <= retval_index) {
-    retvals_.resize(retval_index + 1);
-  }
-  XlaExpression e;
-  e.set_handle(handle);
-  retvals_[retval_index] = Retval{type, shape, e};
-}
+XlaContext::XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder)
+    : compiler_(compiler), builder_(builder) {}
 
-Status XlaContext::AddConstRetval(int retval_index, DataType dtype,
-                                  const xla::LiteralSlice& literal) {
-  VLOG(1) << "Adding retval index " << retval_index
-          << " with non-data-dependent tensor to XLA computation";
-  if (retvals_.size() <= retval_index) {
-    retvals_.resize(retval_index + 1);
-  }
-  Tensor value;
-  TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype, &value));
-  XlaExpression e;
-  e.set_constant_value(value);
-  retvals_[retval_index] = Retval{dtype, value.shape(), e};
-  return Status::OK();
-}
+string XlaContext::DebugString() { return "XLA JIT context"; }
 
-Status XlaContext::AddResourceRetval(int retval_index, XlaResource* resource) {
-  VLOG(1) << "Adding retval index " << retval_index << " with resource "
-          << resource->name() << ":" << resource->shape().DebugString()
-          << " to XLA computation";
-  if (retvals_.size() <= retval_index) {
-    retvals_.resize(retval_index + 1);
+void XlaContext::SetRetval(int index, const XlaExpression& expression) {
+  if (retvals_.size() <= index) {
+    retvals_.resize(index + 1);
   }
-  XlaExpression e;
-  e.set_resource(resource);
-  retvals_[retval_index] = Retval{DT_RESOURCE, resource->shape(), e};
-  return Status::OK();
-}
-
-Status XlaContext::AppendTokenRetval(const xla::XlaOp& token) {
-  VLOG(1) << "Adding retval index " << retvals_.size()
-          << " with token to XLA computation";
-  XlaExpression e;
-  e.set_handle(token);
-  // We use DT_INVALID because there is no TF DataType which corresponds to XLA
-  // token. XlaCompiler handles this case separately, so putting it here is OK.
-  retvals_.push_back(Retval{DT_INVALID, TensorShape(), e});
-  return Status::OK();
-}
-
-xla::XlaBuilder* XlaContext::builder() { return builder_; }
-
-Status XlaContext::CreateResource(
-    XlaResource::Kind kind, int arg_num, string name, DataType type,
-    TensorShape shape, const xla::XlaOp& handle, int64 tensor_array_size,
-    const std::set<string>& tensor_array_gradients, XlaResource** resource) {
-  resources_.emplace_back(
-      new XlaResource(kind, arg_num, std::move(name), type, std::move(shape),
-                      handle, tensor_array_size, tensor_array_gradients));
-  *resource = resources_.back().get();
-  return Status::OK();
+  retvals_[index] = expression;
 }
 
-xla::StatusOr<TensorShape> XlaContext::RepresentationShape(
-    const TensorShape& shape, DataType type) const {
-  return (*shape_representation_fn_)(shape, type);
+XlaResource* XlaContext::AddResource(std::unique_ptr<XlaResource> resource) {
+  resources_.push_back(std::move(resource));
+  return resources_.back().get();
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index d7dbdc957f0e7969db5098b815381866cdc71ab6..0767d1faac14cedb8666f6cc37175eb7b55f6158 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -20,8 +20,8 @@ limitations under the License.
 
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -41,15 +41,10 @@ class XlaContext : public ResourceBase {
  public:
   // Retrieves the XlaContext of the current compilation.
   static XlaContext& Get(const OpKernelContext* ctx);
-  static XlaContext& Get(const XlaOpKernelContext* ctx);
 
   // Creates a new XlaContext. See the documentation on the class data fields
   // for descriptions of the arguments.
-  XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder,
-             bool allow_cpu_custom_calls, bool resolve_compile_time_constants,
-             bool is_entry_computation,
-             const std::function<xla::StatusOr<TensorShape>(
-                 const TensorShape&, DataType)>* shape_representation_fn);
+  XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder);
 
   // Virtual method defined by ResourceBase.
   string DebugString() override;
@@ -57,60 +52,25 @@ class XlaContext : public ResourceBase {
   XlaCompiler* compiler() const { return compiler_; }
 
   // Returns the XlaBuilder that Ops use for compiling new expressions.
-  xla::XlaBuilder* builder();
-
-  bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; }
-
-  bool resolve_compile_time_constants() const {
-    return resolve_compile_time_constants_;
-  }
-  bool is_entry_computation() const { return is_entry_computation_; }
+  xla::XlaBuilder* builder() { return builder_; }
 
   const std::vector<XlaExpression>& args() const { return args_; }
   void set_args(std::vector<XlaExpression> args);
 
-  struct Retval {
-    DataType type;
-    TensorShape shape;
-    // An XlaExpression representing the Retval's value.
-    XlaExpression expression;
-  };
-  const std::vector<Retval>& retvals() { return retvals_; }
-
-  // This is called by the Retval Op to associate a computed value
-  // with a specific return value of the subgraph.
-  void AddRetval(int retval_index, DataType type, const TensorShape& shape,
-                 const xla::XlaOp& handle);
-
-  // As for Retval, but for return values that are compile-time constants.
-  Status AddConstRetval(int retval_index, DataType dtype,
-                        const xla::LiteralSlice& literal);
-
-  // As for Retval, but for return values that are resource handles.
-  Status AddResourceRetval(int retval_index, XlaResource* resource);
-
-  // As for Retval, but for return values that are XLA tokens.
-  Status AppendTokenRetval(const xla::XlaOp& token);
-
-  // Creates a resource with resource `kind` and initial value `handle`. `name`
-  // is a descriptive name for use in error messages. See the `XlaResource`
-  // constructor for a description of the remaining arguments.
-  // Fails if the resource already exists.
-  Status CreateResource(XlaResource::Kind kind, int arg_num, string name,
-                        DataType type, TensorShape shape,
-                        const xla::XlaOp& handle, int64 tensor_array_size,
-                        const std::set<string>& tensor_array_gradients,
-                        XlaResource** resource);
+  const std::vector<XlaExpression>& retvals() { return retvals_; }
+
+  // Sets a return value.
+  // Since we do not always know in advance how many return values there are,
+  // grows the return values vector to size index+1 if it is smaller.
+  void SetRetval(int index, const XlaExpression& expression);
+
+  // Adds 'resource' to the set of resources owned by the context.
+  XlaResource* AddResource(std::unique_ptr<XlaResource> resource);
 
   const std::vector<std::unique_ptr<XlaResource>>& resources() {
     return resources_;
   }
 
-  // Returns the XLA shape to be used to represent a variable of TF `shape`
-  // and `type`, or of an argument or return value of a top-level computation.
-  xla::StatusOr<TensorShape> RepresentationShape(const TensorShape& shape,
-                                                 DataType type) const;
-
   // Get an XLA lambda to compute Max. This is cached in the
   // XlaContext since it may be used by multiple Ops. There is a
   // separate specialization of the computation for each DataType.
@@ -140,36 +100,16 @@ class XlaContext : public ResourceBase {
   // The XlaBuilder used to construct the subgraph's compiled representation.
   xla::XlaBuilder* builder_;
 
-  // Allow ops to emit CustomCall operations for CPU.
-  const bool allow_cpu_custom_calls_;
-
-  // If true, constant return values are returned as Tensors instead of
-  // run-time computation outputs.
-  const bool resolve_compile_time_constants_;
-
   // Arguments to the Tensorflow graph, indexed by _Arg index.
   // Includes both compile-time constant arguments and runtime parameters.
   std::vector<XlaExpression> args_;
 
   // Return values of the Tensorflow graph, indexed by _Retval index.
-  std::vector<Retval> retvals_;
+  std::vector<XlaExpression> retvals_;
 
   // Holds ownership of resources. The resources are not ordered.
   std::vector<std::unique_ptr<XlaResource>> resources_;
 
-  // Is this a top-level computation, or an inner computation (e.g., a while
-  // body)?
-  const bool is_entry_computation_;
-
-  // A function that describes how the shapes of
-  // a) argument and return value, for entry computations
-  // b) variables, for all computations,
-  // should be represented in XLA. Parameters/return values will be shaped
-  // according to this function, and reshaped back to/from their declared shapes
-  // for computations. Must be non-null.
-  const std::function<xla::StatusOr<TensorShape>(const TensorShape&, DataType)>*
-      shape_representation_fn_;
-
   // Cache of prebuilt computations indexed by their type.
   using ComputationMap = std::map<DataType, xla::XlaComputation>;
 
diff --git a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
index bc44301d405102921de21da4bd9407032783838c..9bb785842d061e5892ba9da0a902eef50d21f55d 100644
--- a/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_cpu_backend.cc
@@ -21,10 +21,10 @@ namespace tensorflow {
 
 bool CpuOpFilter(KernelDef* kdef) {
   if (kdef->op() == "Const") {
-    AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef);
+    AddDtypeToKernelDefConstraint("dtype", DT_STRING, kdef);
   }
   if (kdef->op() == "Assert") {
-    AddDtypeToKernalDefConstraint("T", DT_STRING, kdef);
+    AddDtypeToKernelDefConstraint("T", DT_STRING, kdef);
   }
   return true;
 }
diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca0309166b7c73d1a5a818091e2a30fa112a4de4
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_expression.cc
@@ -0,0 +1,145 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
+
+#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+XlaExpression::XlaExpression() = default;
+
+XlaExpression XlaExpression::Invalid() {
+  XlaExpression e;
+  e.kind_ = Kind::kInvalid;
+  return e;
+}
+
+XlaExpression XlaExpression::Constant(Tensor value) {
+  XlaExpression e;
+  e.kind_ = Kind::kConstant;
+  e.dtype_ = value.dtype();
+  e.constant_value_ = value;
+  return e;
+}
+
+XlaExpression XlaExpression::XlaOp(xla::XlaOp value, DataType dtype) {
+  XlaExpression e;
+  e.kind_ = Kind::kXlaOp;
+  e.dtype_ = dtype;
+  e.handle_ = value;
+  return e;
+}
+
+XlaExpression XlaExpression::Resource(XlaResource* resource) {
+  XlaExpression e;
+  e.kind_ = Kind::kResource;
+  e.dtype_ = DT_RESOURCE;
+  e.resource_ = resource;
+  return e;
+}
+
+string XlaExpression::HumanString() const {
+  switch (kind_) {
+    case Kind::kInvalid:
+      return "invalid";
+    case Kind::kConstant:
+      return "constant";
+    case Kind::kXlaOp:
+      return "xla_op";
+    case Kind::kResource:
+      return "resource";
+  }
+}
+
+xla::XlaOp XlaExpression::AsXlaOp(xla::XlaBuilder* builder) const {
+  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+    switch (kind_) {
+      case Kind::kConstant: {
+        xla::BorrowingLiteral literal;
+        TF_RETURN_IF_ERROR(
+            HostTensorToBorrowingLiteral(constant_value_, &literal));
+        return xla::ConstantLiteral(builder, literal);
+      }
+      case Kind::kXlaOp:
+        if (builder != handle_.builder()) {
+          return errors::InvalidArgument(
+              "Mismatched builders in XlaExpression::AsXlaOp");
+        }
+        return handle_;
+      default:
+        return errors::InvalidArgument("AsXlaOp called on XlaExpression: ",
+                                       HumanString());
+    }
+  });
+}
+
+xla::StatusOr<absl::optional<Tensor>> XlaExpression::ResolveConstant(
+    xla::Client* client) const {
+  switch (kind()) {
+    case Kind::kConstant:
+      return {constant_value()};
+    case Kind::kXlaOp:
+      break;
+    case Kind::kResource:
+    case Kind::kInvalid:
+      return errors::InvalidArgument(
+          "ResolveConstant called on XlaExpression: ", HumanString());
+  }
+
+  TF_ASSIGN_OR_RETURN(bool is_constant,
+                      handle().builder()->IsConstant(handle()));
+  if (!is_constant) return {absl::nullopt};
+
+  TF_ASSIGN_OR_RETURN(xla::XlaComputation constant_graph,
+                      handle().builder()->BuildConstantSubGraph(handle()));
+
+  TF_ASSIGN_OR_RETURN(TensorShape shape, GetShape());
+
+  // The XLA layout is specified minor to major, and TensorFlow uses a major to
+  // minor order.
+  std::vector<int64> layout_indices(shape.dims());
+  std::iota(layout_indices.rbegin(), layout_indices.rend(), 0);
+  xla::Layout layout = xla::LayoutUtil::MakeLayout(layout_indices);
+  TF_ASSIGN_OR_RETURN(xla::Literal literal,
+                      client->ComputeConstant(constant_graph, &layout));
+  Tensor tensor;
+  TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype(), &tensor));
+  return {tensor};
+}
+
+xla::StatusOr<TensorShape> XlaExpression::GetShape() const {
+  switch (kind_) {
+    case Kind::kConstant:
+      return constant_value().shape();
+    case Kind::kXlaOp: {
+      TF_ASSIGN_OR_RETURN(xla::Shape xla_shape,
+                          handle().builder()->GetShape(handle()));
+      TensorShape shape;
+      TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, &shape));
+      return shape;
+    }
+    case Kind::kResource:
+      return TensorShape({});
+    case Kind::kInvalid:
+      return errors::InvalidArgument(
+          "GetShape() called on invalid XlaExpression");
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h
new file mode 100644
index 0000000000000000000000000000000000000000..bed6761d362a98d344003c1edea342e68c31ef07
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_expression.h
@@ -0,0 +1,115 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_
+#define TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/tf2xla/xla_resource.h"
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// A XlaExpression represents a symbolic TensorFlow value in a TF->XLA
+// compilation.
+// An expression is one of:
+// * a constant tensor.
+// * an xla::XlaOp, representing a symbolic XLA value.
+// * a resource, e.g., a variable, represented as an XlaResource pointer.
+//
+// Constant tensors are mostly an optimization to avoid passing large constants
+// to XLA, but are also sometimes used to represent tensors that have no XLA
+// representation, for example, DT_STRING tensors. A canonical use case might be
+// an error message string.
+class XlaExpression {
+ public:
+  enum class Kind {
+    kInvalid,
+    kConstant,
+    kXlaOp,
+    kResource,
+  };
+
+  XlaExpression();
+  XlaExpression(const XlaExpression&) = default;
+  XlaExpression& operator=(const XlaExpression&) = default;
+
+  // Builds an invalid expression. (Same as the default constructor, but makes
+  // the intent clearer.)
+  static XlaExpression Invalid();
+
+  // Builds a constant XLA expression.
+  static XlaExpression Constant(Tensor value);
+
+  // Builds a XlaOp expression. Since the mapping from TF data types to XLA
+  // types is not 1-1, the TF type must also be provided; in general it cannot
+  // be derived from the XLA type.
+  static XlaExpression XlaOp(xla::XlaOp value, DataType dtype);
+
+  // Builds a resource expression.
+  static XlaExpression Resource(XlaResource* resource);
+
+  Kind kind() const { return kind_; }
+
+  DataType dtype() const { return dtype_; }
+
+  // handle() returns the XlaOp that backs a kXlaOp expression.
+  const xla::XlaOp& handle() const { return handle_; }
+
+  const Tensor& constant_value() const { return constant_value_; }
+
+  XlaResource* resource() const { return resource_; }
+
+  // Returns a human-readable summary of the expression.
+  string HumanString() const;
+
+  // Returns the value of a kConstant or kXlaOp as an xla::XlaOp. Returns
+  // an erroneous XlaOp if the expression is not a constant or an expression.
+  xla::XlaOp AsXlaOp(xla::XlaBuilder* builder) const;
+
+  // If a kXlaOp or kConstant expression can be resolved to a compile-time
+  // constant, returns the value as a host-memory Tensor. Returns an empty
+  // optional if it cannot be resolved. Returns an error if passed a resource
+  // expression.
+  xla::StatusOr<absl::optional<Tensor>> ResolveConstant(
+      xla::Client* client) const;
+
+  // Returns the shape of the tensor.
+  // The shape of a resource is the shape of a resource handle (i.e., a scalar),
+  // not the shape of the resource's value.
+  xla::StatusOr<TensorShape> GetShape() const;
+
+ private:
+  Kind kind_ = Kind::kInvalid;
+
+  DataType dtype_ = DT_INVALID;
+
+  // The XLA handle of the expression's computation, if kind_ == kXlaOp.
+  xla::XlaOp handle_;
+
+  // The value of the constant, if kind_ == kConstant.
+  Tensor constant_value_;
+
+  // The resource, if kind_ == kResource. Not owned.
+  XlaResource* resource_ = nullptr;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_XLA_EXPRESSION_H_
diff --git a/tensorflow/compiler/tf2xla/xla_expression_test.cc b/tensorflow/compiler/tf2xla/xla_expression_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..84202c931390f2d68f6d381aef0752bfff00a53d
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/xla_expression_test.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
+#include "tensorflow/compiler/tf2xla/xla_resource.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaExpressionTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    client_ = xla::ClientLibrary::LocalClientOrDie();
+    builder_ = absl::make_unique<xla::XlaBuilder>("acomputation");
+    constant_ = test::AsScalar<int32>(42);
+    op_ = xla::ConstantR0<int32>(builder_.get(), 7);
+    non_constant_op_ = xla::Parameter(
+        builder_.get(), 0, xla::ShapeUtil::MakeShape(xla::F32, {}), "x");
+    resource_ = absl::make_unique<XlaResource>(
+        XlaResource::kVariable, /*arg_num=*/0, /*name=*/string("avariable"),
+        DT_INT32, TensorShape({17, 3}), op_, /*tensor_array_size=*/-1,
+        /*tensor_array_gradients=*/std::set<string>(),
+        /*tensor_array_multiple_writes_aggregate=*/false);
+  }
+
+  xla::Client* client_;
+  std::unique_ptr<xla::XlaBuilder> builder_;
+  Tensor constant_;
+  xla::XlaOp op_;
+  xla::XlaOp non_constant_op_;
+  std::unique_ptr<XlaResource> resource_;
+};
+
+TEST_F(XlaExpressionTest, Kind) {
+  EXPECT_TRUE(XlaExpression::Kind::kInvalid == XlaExpression().kind());
+  EXPECT_TRUE(XlaExpression::Kind::kInvalid == XlaExpression::Invalid().kind());
+  EXPECT_TRUE(XlaExpression::Kind::kConstant ==
+              XlaExpression::Constant(constant_).kind());
+  EXPECT_TRUE(XlaExpression::Kind::kXlaOp ==
+              XlaExpression::XlaOp(op_, DT_INT32).kind());
+  EXPECT_TRUE(XlaExpression::Kind::kResource ==
+              XlaExpression::Resource(resource_.get()).kind());
+}
+
+TEST_F(XlaExpressionTest, HumanString) {
+  EXPECT_EQ("invalid", XlaExpression().HumanString());
+  EXPECT_EQ("invalid", XlaExpression::Invalid().HumanString());
+  EXPECT_EQ("constant", XlaExpression::Constant(constant_).HumanString());
+  EXPECT_EQ("xla_op", XlaExpression::XlaOp(op_, DT_INT32).HumanString());
+  EXPECT_EQ("resource", XlaExpression::Resource(resource_.get()).HumanString());
+}
+
+TEST_F(XlaExpressionTest, AsXlaOp) {
+  xla::XlaOp op_as_op =
+      XlaExpression::XlaOp(op_, DT_INT32).AsXlaOp(builder_.get());
+  EXPECT_TRUE(op_.IsIdenticalTo(op_as_op));
+
+  xla::XlaOp const_as_op =
+      XlaExpression::Constant(constant_).AsXlaOp(builder_.get());
+  TF_ASSERT_OK_AND_ASSIGN(xla::XlaComputation computation,
+                          builder_->BuildConstantSubGraph(const_as_op));
+  TF_ASSERT_OK_AND_ASSIGN(xla::Literal value,
+                          client_->ComputeConstant(computation));
+  EXPECT_TRUE(xla::LiteralTestUtil::Equal(xla::LiteralUtil::CreateR0<int32>(42),
+                                          value));
+}
+
+TEST_F(XlaExpressionTest, GetShape) {
+  EXPECT_FALSE(XlaExpression().GetShape().ok());
+  EXPECT_FALSE(XlaExpression::Invalid().GetShape().ok());
+
+  TF_ASSERT_OK_AND_ASSIGN(TensorShape resource_shape,
+                          XlaExpression::Resource(resource_.get()).GetShape());
+  EXPECT_EQ(TensorShape({}), resource_shape);
+
+  TF_ASSERT_OK_AND_ASSIGN(TensorShape op_shape,
+                          XlaExpression::XlaOp(op_, DT_INT32).GetShape());
+  EXPECT_EQ(TensorShape({}), op_shape);
+
+  TF_ASSERT_OK_AND_ASSIGN(TensorShape constant_shape,
+                          XlaExpression::Constant(constant_).GetShape());
+  EXPECT_EQ(TensorShape({}), constant_shape);
+}
+
+TEST_F(XlaExpressionTest, ResolveConstant) {
+  EXPECT_FALSE(XlaExpression().ResolveConstant(client_).ok());
+  EXPECT_FALSE(XlaExpression::Invalid().ResolveConstant(client_).ok());
+  EXPECT_FALSE(
+      XlaExpression::Resource(resource_.get()).ResolveConstant(client_).ok());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      absl::optional<Tensor> op_constant,
+      XlaExpression::XlaOp(op_, DT_INT32).ResolveConstant(client_));
+  ASSERT_TRUE(op_constant.has_value());
+  test::ExpectTensorEqual<int32>(test::AsScalar<int32>(7), *op_constant);
+
+  TF_ASSERT_OK_AND_ASSIGN(absl::optional<Tensor> op_nonconstant,
+                          XlaExpression::XlaOp(non_constant_op_, DT_FLOAT)
+                              .ResolveConstant(client_));
+  EXPECT_FALSE(op_nonconstant.has_value());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      absl::optional<Tensor> constant_constant,
+      XlaExpression::Constant(constant_).ResolveConstant(client_));
+  ASSERT_TRUE(constant_constant.has_value());
+  test::ExpectTensorEqual<int32>(constant_, *constant_constant);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
index 1398e9ee536a9675e5b703ec3fabf4a8b9d89cbf..5e8006b8d8f63d67e8409cd89d182f8fe61a7441 100644
--- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
@@ -21,10 +21,10 @@ namespace tensorflow {
 
 bool GpuOpFilter(KernelDef* kdef) {
   if (kdef->op() == "Const") {
-    AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef);
+    AddDtypeToKernelDefConstraint("dtype", DT_STRING, kdef);
   }
   if (kdef->op() == "Assert") {
-    AddDtypeToKernalDefConstraint("T", DT_STRING, kdef);
+    AddDtypeToKernelDefConstraint("T", DT_STRING, kdef);
   }
   return true;
 }
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 9a34cd8c6ae2dc6d52a3cc69168df96f5322c6da..c2c0751211180c3715a19d6c78e34659fd18914e 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -216,8 +215,7 @@ DataType XlaHelpers::SumAccumulationType(const DataType& dtype) {
   return dtype;
 }
 
-xla::XlaOp XlaHelpers::ConvertElementType(xla::XlaBuilder* const builder,
-                                          const xla::XlaOp& operand,
+xla::XlaOp XlaHelpers::ConvertElementType(const xla::XlaOp& operand,
                                           const DataType new_element_type) {
   xla::PrimitiveType convert_to;
   TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to));
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index 39578144caaadf293d24ea91aa874e56e27ecc01..4858dfee55a393d04cd2af83916eeb40820ee368 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -80,8 +80,7 @@ class XlaHelpers {
 
   // A helper for creating a ConvertElementType xla op given a DataType rather
   // than the xla::PrimitiveType.
-  static xla::XlaOp ConvertElementType(xla::XlaBuilder* const builder,
-                                       const xla::XlaOp& operand,
+  static xla::XlaOp ConvertElementType(const xla::XlaOp& operand,
                                        const DataType new_element_type);
 };
 
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 86a78ee429e8913edb4a948727fa692083c472f4..fabbcd04fed96ad814d04c2df9394f43bfe0cf99 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -133,7 +133,8 @@ XlaJitCompiledCpuFunction::Compile(
   jit->executable_ = std::move(executable);
   jit->buffer_infos_ = std::move(buffer_infos);
   jit->arg_index_table_ = std::move(arg_index_table);
-  jit->program_shape_ = std::move(program_shape);
+  jit->program_shape_ =
+      absl::make_unique<xla::ProgramShapeProto>(program_shape->ToProto());
   jit->static_data_.set_raw_function(raw_function);
   jit->static_data_.set_buffer_infos(jit->buffer_infos_.data());
   jit->static_data_.set_num_buffers(jit->buffer_infos_.size());
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
index d3c8f22a8078d03d15447ed200c914390f40b04f..a5392057177e983e11787c31bb496a8947add1e6 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
@@ -80,8 +80,10 @@ class XlaJitCompiledCpuFunction {
   std::vector<const char*> arg_names_;
   std::vector<const char*> result_names_;
 
-  // The backing data for the program shape.
-  std::unique_ptr<const xla::ProgramShape> program_shape_;
+  // The backing data for the program shape. The proto form of program shape is
+  // used because the program shape is serialized and embedded in the object
+  // file.
+  std::unique_ptr<const xla::ProgramShapeProto> program_shape_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
index 6d49298a6f3e8a726695fafc42f3c5341fe98b5f..8846088678b53f6b9ecff0de732d6b5c82392b5a 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
@@ -116,13 +116,13 @@ TEST(XlaJitCompiledCpuFunction, Sum) {
   // Check program shape.
   using xla::ShapeUtil;
   const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {});
-  const xla::ProgramShape* program_shape = function.ProgramShape();
-  ASSERT_TRUE(program_shape != nullptr);
-  ASSERT_EQ(program_shape->parameters_size(), 2);
-  EXPECT_TRUE(ShapeUtil::Compatible(program_shape->parameters(0), s32));
-  EXPECT_TRUE(ShapeUtil::Compatible(program_shape->parameters(1), s32));
+  ASSERT_TRUE(function.ProgramShape() != nullptr);
+  const xla::ProgramShape program_shape(*function.ProgramShape());
+  ASSERT_EQ(program_shape.parameters_size(), 2);
+  EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(0), s32));
+  EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(1), s32));
 
-  const xla::Shape& result = program_shape->result();
+  const xla::Shape& result = program_shape.result();
   ASSERT_EQ(result.element_type(), xla::TUPLE);
   ASSERT_EQ(ShapeUtil::TupleElementCount(result), 1);
   const xla::Shape& result0 = ShapeUtil::GetTupleElementShape(result, 0);
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index dd3498ef7aa242d3ad946cae5f60bc2c8853a342..58808c76de6330a6b28e21dbdead03dea25847f6 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
@@ -35,40 +36,52 @@ bool XlaOpKernelContext::ValidateInputsAreSameShape(OpKernel* op) {
   return context_->ValidateInputsAreSameShape(op);
 }
 
+XlaContext* XlaOpKernelContext::xla_context() const {
+  return &XlaContext::Get(context_);
+}
+
 xla::XlaBuilder* XlaOpKernelContext::builder() const {
-  return XlaContext::Get(this).builder();
+  return xla_context()->builder();
+}
+
+XlaCompiler* XlaOpKernelContext::compiler() const {
+  return xla_context()->compiler();
 }
 
 // Retrieves an XlaExpression that was allocated by a previous Op.
 static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) {
   const XlaExpression* expression =
       reinterpret_cast<const XlaExpression*>(tensor.tensor_data().data());
-  CHECK(expression->handle().valid() || expression->resource() != nullptr);
-  VLOG(1) << "Fetched T" << expression->handle();
+  CHECK(expression->kind() != XlaExpression::Kind::kInvalid)
+      << expression->HumanString();
   return expression;
 }
 
-// Retrieves an uninitialized XlaExpression from a newly-allocated tensor.
-static XlaExpression* CastExpressionFromUninitializedTensor(Tensor* tensor) {
+// Assigns an XlaExpression to a tensor on an XLA compilation device.
+static void AssignExpressionToTensor(Tensor* tensor,
+                                     const XlaExpression& value) {
   const XlaExpression* expression =
       reinterpret_cast<const XlaExpression*>(tensor->tensor_data().data());
-  CHECK(!expression->handle().valid());
-  return const_cast<XlaExpression*>(expression);
+  CHECK(expression->kind() == XlaExpression::Kind::kInvalid)
+      << expression->HumanString();
+  *const_cast<XlaExpression*>(expression) = value;
+}
+
+const XlaExpression& XlaOpKernelContext::InputExpression(int index) {
+  return *CastExpressionFromTensor(context_->input(index));
 }
 
-// Retrieves the XlaOp from an input Tensor to an Op. This computation was
-// constructed by an Op that executed previously and created the output Tensor
-// using CreateOutputTensorFromComputation or CreateConstantOutputTensor.
-static const xla::XlaOp& GetComputationFromTensor(const Tensor& tensor) {
-  return CastExpressionFromTensor(tensor)->handle();
+const XlaExpression& XlaOpKernelContext::InputExpression(
+    absl::string_view name) {
+  return *CastExpressionFromTensor(GetInputTensorByName(name));
 }
 
-const xla::XlaOp& XlaOpKernelContext::Input(int index) {
-  return GetComputationFromTensor(context_->input(index));
+xla::XlaOp XlaOpKernelContext::Input(int index) {
+  return InputExpression(index).AsXlaOp(builder());
 }
 
-const xla::XlaOp& XlaOpKernelContext::Input(absl::string_view name) {
-  return GetComputationFromTensor(GetInputTensorByName(name));
+xla::XlaOp XlaOpKernelContext::Input(absl::string_view name) {
+  return InputExpression(name).AsXlaOp(builder());
 }
 
 TensorShape XlaOpKernelContext::InputShape(int index) {
@@ -125,77 +138,18 @@ Status XlaOpKernelContext::ConstantInput(absl::string_view name,
 Status XlaOpKernelContext::ConstantInputReshaped(
     int index, absl::Span<const int64> new_dims,
     xla::Literal* constant_literal) {
-  const Tensor& tensor = context_->input(index);
-  TensorShape new_shape(new_dims);
-  if (tensor.NumElements() != new_shape.num_elements()) {
-    return errors::InvalidArgument(
-        context_->op_kernel().name(), " input ", index, " has shape ",
-        tensor.shape().DebugString(),
-        " but was asked to be reshaped to incompatible shape ",
-        new_shape.DebugString());
-  }
-  const XlaExpression* expression = CastExpressionFromTensor(tensor);
-
-  auto copy_tensor_to_literal = [](const Tensor& tensor,
-                                   xla::Literal* literal) {
-    xla::Shape literal_shape;
-    TF_RETURN_IF_ERROR(
-        TensorShapeToXLAShape(tensor.dtype(), tensor.shape(), &literal_shape));
-
-    *literal = xla::Literal(literal_shape);
-
-    // memcpy over the payload ...
-    // TODO(phawkins): handle string types.
-    size_t total_bytes = tensor.TotalBytes();
-    if (total_bytes > 0) {
-      void* dst_ptr = literal->untyped_data();
-      const void* src_ptr = DMAHelper::base(&tensor);
-      memcpy(dst_ptr, src_ptr, total_bytes);
-    }
-    return Status::OK();
-  };
-
-  // If the tensor has a known constant value, there is no need to invoke XLA.
-  if (expression->has_constant_value()) {
-    Tensor temp(tensor.dtype());
-    if (!temp.CopyFrom(expression->constant_value(), new_shape)) {
-      // This should never happen. The constant should have a shape compatible
-      // with the enclosing Tensor.
-      return errors::Internal("Incompatible shapes in ConstantInputReshaped.");
-    }
-
-    return copy_tensor_to_literal(temp, constant_literal);
-  }
-
-  // Make sure we treat zero-element tensors as constant.
-  if (new_shape.num_elements() == 0) {
-    Tensor temp(tensor.dtype(), new_shape);
-
-    return copy_tensor_to_literal(temp, constant_literal);
-  }
-
-  xla::XlaOp handle = expression->handle();
-  if (new_shape != tensor.shape()) {
-    // Reshape the handle to the desired shape.
-    handle = xla::Reshape(handle, new_shape.dim_sizes());
-  }
-
-  // The XLA layout is specified minor to major, and TensorFlow's minor
-  // dimension is the last one.
-  std::vector<int64> layout_indices(new_shape.dims());
-  std::iota(layout_indices.rbegin(), layout_indices.rend(), 0);
-  xla::Layout layout = xla::LayoutUtil::MakeLayout(layout_indices);
-
-  xla::StatusOr<bool> is_constant = builder()->IsConstant(handle);
-  if (!is_constant.ok()) {
-    Status status = is_constant.status();
+  XlaExpression e = InputExpression(index);
+  xla::StatusOr<absl::optional<Tensor>> constant_or_status =
+      e.ResolveConstant(compiler()->client());
+  if (!constant_or_status.ok()) {
+    Status status = constant_or_status.status();
     errors::AppendToMessage(&status, "while evaluating input ", index, " of ",
                             context_->op_kernel().type_string(),
                             " operator as a compile-time constant.");
     return status;
   }
-
-  if (!is_constant.ValueOrDie()) {
+  absl::optional<Tensor> constant = constant_or_status.ValueOrDie();
+  if (!constant.has_value()) {
     return errors::InvalidArgument(
         "Input ", index, " to ", context_->op_kernel().type_string(),
         " operator must be a compile-time constant.\n"
@@ -208,25 +162,16 @@ Status XlaOpKernelContext::ConstantInputReshaped(
         "stateful operation such as a random number generator.");
   }
 
-  // Ask the XLA compiler to evaluate the data handle to a literal.
-  xla::StatusOr<xla::XlaComputation> constant_graph =
-      builder()->BuildConstantSubGraph(handle);
-  if (!constant_graph.ok()) {
-    return errors::Internal(
-        "Error getting a compile-time constant graph for ",
-        context_->op_kernel().name(), " input ", index,
-        ".\nError: ", constant_graph.status().error_message());
-  }
-  xla::StatusOr<xla::Literal> computed = compiler()->client()->ComputeConstant(
-      constant_graph.ValueOrDie(), &layout);
-  if (!computed.ok()) {
-    return errors::Internal("Error evaluating ", context_->op_kernel().name(),
-                            " input ", index,
-                            " as a compile-time constant.\nError: ",
-                            computed.status().error_message());
+  Tensor temp(constant->dtype());
+  if (!temp.CopyFrom(*constant, TensorShape(new_dims))) {
+    return errors::InvalidArgument(
+        context_->op_kernel().name(), " input ", index, " has shape ",
+        constant->shape().DebugString(),
+        " but was asked to be reshaped to incompatible shape ",
+        TensorShape(new_dims).DebugString());
   }
-  *constant_literal = std::move(computed).ValueOrDie();
 
+  TF_ASSIGN_OR_RETURN(*constant_literal, HostTensorToLiteral(temp));
   return Status::OK();
 }
 
@@ -322,6 +267,15 @@ Status XlaOpKernelContext::ConstantInputReshapedToIntVector(
   return LiteralToInt64Vector(literal, out);
 }
 
+Status XlaOpKernelContext::ConstantInputReshapedToIntVector(
+    absl::string_view name, std::vector<int64>* out) {
+  TF_ASSIGN_OR_RETURN(int index, InputIndex(this, name));
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(ConstantInputReshaped(
+      index, {InputShape(index).num_elements()}, &literal));
+  return LiteralToInt64Vector(literal, out);
+}
+
 Status XlaOpKernelContext::ConstantInputAsInt64Literal(int index,
                                                        xla::Literal* out) {
   xla::Literal literal;
@@ -372,7 +326,7 @@ Status XlaOpKernelContext::InputList(absl::string_view name,
   handles->clear();
   shapes->clear();
   for (const Tensor& input : inputs) {
-    handles->push_back(GetComputationFromTensor(input));
+    handles->push_back(CastExpressionFromTensor(input)->AsXlaOp(builder()));
     shapes->push_back(input.shape());
   }
   return Status::OK();
@@ -392,8 +346,8 @@ Status XlaOpKernelContext::ConstantInputList(
 namespace {
 
 Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
-                               const OpKernelContext* ctx, TensorShape* shape,
-                               xla::XlaOp* value) {
+                               const XlaOpKernelContext* ctx,
+                               TensorShape* shape, xla::XlaOp* value) {
   const XlaExpression* expression = CastExpressionFromTensor(tensor);
   XlaResource* variable = expression->resource();
   TF_RET_CHECK(variable != nullptr);
@@ -411,11 +365,13 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
     *shape = variable->shape();
   }
 
-  XlaContext& xla_context = XlaContext::Get(ctx);
-  TF_ASSIGN_OR_RETURN(
-      TensorShape representation_shape,
-      xla_context.RepresentationShape(variable->shape(), variable->type()));
-  if (representation_shape == variable->shape()) {
+  TF_ASSIGN_OR_RETURN(xla::Shape representation_shape,
+                      ctx->compiler()->options().shape_representation_fn(
+                          variable->shape(), variable->type()));
+  xla::Shape xla_shape;
+  TF_RETURN_IF_ERROR(
+      TensorShapeToXLAShape(variable->type(), variable->shape(), &xla_shape));
+  if (xla::ShapeUtil::Compatible(xla_shape, representation_shape)) {
     *value = variable->value();
   } else {
     *value = xla::Reshape(variable->value(), variable->shape().dim_sizes());
@@ -428,15 +384,15 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
 Status XlaOpKernelContext::ReadVariableInput(int index, DataType type,
                                              TensorShape* shape,
                                              xla::XlaOp* value) {
-  return ReadVariableInputTensor(context_->input(index), type, context_, shape,
+  return ReadVariableInputTensor(context_->input(index), type, this, shape,
                                  value);
 }
 
 Status XlaOpKernelContext::ReadVariableInput(absl::string_view name,
                                              DataType type, TensorShape* shape,
                                              xla::XlaOp* value) {
-  return ReadVariableInputTensor(GetInputTensorByName(name), type, context_,
-                                 shape, value);
+  return ReadVariableInputTensor(GetInputTensorByName(name), type, this, shape,
+                                 value);
 }
 
 Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type,
@@ -455,90 +411,53 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type,
   return Status::OK();
 }
 
-Status XlaOpKernelContext::allocate_output(int index, const xla::Shape& shape,
-                                           Tensor** output) {
-  // The step's default allocator is the dummy XlaCompilationAllocator which
-  // simply allocates a metadata buffer to hold the expression to which it
-  // corresponds.
-  if (expected_output_dtype(index) == DT_VARIANT) {
-    // tensor_data() is not supported for variant Tensor (i.e.,
-    // DataTypeCanUseMemcpy is false for DT_VARIANT), and so storing the
-    // XlaExpression inside the Tensor's tensor_data() does not work for
-    // variant. Instead construct a uint8 tensor and store the expression in its
-    // value.
-    // TODO(jpienaar): This should be refactored to stop masquerading
-    // XlaExpressions as Tensors.
-    *output = new Tensor();
-    TensorShape tensor_shape;
-    TF_RETURN_IF_ERROR(
-        context_->allocate_temp(DT_UINT8, tensor_shape, *output));
-    context_->set_output(index, **output);
-  } else {
-    TensorShape tensor_shape;
-    TF_RETURN_IF_ERROR(XLAShapeToTensorShape(shape, &tensor_shape));
-    TF_RETURN_IF_ERROR(context_->allocate_output(index, tensor_shape, output));
+void XlaOpKernelContext::SetOutputExpression(int index,
+                                             const XlaExpression& expression) {
+  Status status = [&] {
+    // The step's default allocator is the dummy XlaCompilationAllocator which
+    // simply allocates a metadata buffer to hold the expression to which it
+    // corresponds.
+    Tensor* output = nullptr;
+    // Provides a special behavior for DT_VARIANT: a variant is treated as
+    // DT_UINT8 scalar as the type to allow mapping for variant to more generic
+    // types.
+    if (expression.dtype() == DT_VARIANT) {
+      // tensor_data() is not supported for variant Tensor (i.e.,
+      // DataTypeCanUseMemcpy is false for DT_VARIANT), and so storing the
+      // XlaExpression inside the Tensor's tensor_data() does not work for
+      // variant. Instead construct a uint8 tensor and store the expression in
+      // its value.
+      // TODO(jpienaar): This should be refactored to stop masquerading
+      // XlaExpressions as Tensors.
+      output = new Tensor();
+      TensorShape tensor_shape;
+      TF_RETURN_IF_ERROR(
+          context_->allocate_temp(DT_UINT8, tensor_shape, output));
+      context_->set_output(index, *output);
+    } else {
+      TF_ASSIGN_OR_RETURN(TensorShape shape, expression.GetShape());
+      TF_RETURN_IF_ERROR(context_->allocate_output(index, shape, &output));
+    }
+    AssignExpressionToTensor(output, expression);
+    return Status::OK();
+  }();
+  if (!status.ok()) {
+    SetStatus(status);
   }
-  return Status::OK();
 }
 
 void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) {
-  // Makes the host Tensor that will refer to the expression.
-  Tensor* output = nullptr;
-  auto shape_or = builder()->GetShape(handle);
-  if (!shape_or.ok()) {
-    SetStatus(shape_or.status());
-    return;
-  }
-
-  OP_REQUIRES_OK(context_,
-                 allocate_output(index, shape_or.ValueOrDie(), &output));
-
-  // The expression is stored in the tensor's data buffer. Fill in the
-  // fields now.
-  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
-  expression->set_handle(handle);
+  SetOutputExpression(
+      index,
+      XlaExpression::XlaOp(handle, context_->expected_output_dtype(index)));
 }
 
 void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
-  const TensorShape& shape = constant.shape();
-
-  xla::BorrowingLiteral literal;
-  OP_REQUIRES_OK(context_, HostTensorToBorrowingLiteral(constant, &literal));
-
-  xla::XlaOp handle = xla::ConstantLiteral(builder(), literal);
-  CHECK(handle.valid());
-
-  // Make the Tensor that will refer to the expression.
-  Tensor* output = nullptr;
-  // The step's default allocator is the dummy XlaCompilationAllocator which
-  // simply allocates a metadata buffer to hold the expression to which it
-  // corresponds.
-  OP_REQUIRES_OK(context_, context_->allocate_output(index, shape, &output));
-
-  // The expression is stored in the tensor's data buffer. Fill in the
-  // fields now.
-  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
-  expression->set_handle(handle);
-  expression->set_constant_value(constant);
-}
-
-void XlaOpKernelContext::SetInvalidOutput(int index) {
-  Tensor* output = nullptr;
-  OP_REQUIRES_OK(context_,
-                 context_->allocate_output(index, TensorShape({}), &output));
-  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
-  xla::XlaOp handle;
-  expression->set_handle(handle);
+  SetOutputExpression(index, XlaExpression::Constant(constant));
 }
 
 void XlaOpKernelContext::SetResourceOutput(int index, XlaResource* resource) {
-  Tensor* output = nullptr;
-  // The shape of the output tensor is the shape of the resource itself
-  // (i.e., a scalar), not the shape of the resource's value.
-  OP_REQUIRES_OK(context_,
-                 context_->allocate_output(index, TensorShape(), &output));
-  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
-  expression->set_resource(resource);
+  SetOutputExpression(index, XlaExpression::Resource(resource));
 }
 
 Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) {
@@ -552,7 +471,7 @@ Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) {
 namespace {
 
 Status AssignVariableTensor(const Tensor& tensor, DataType type,
-                            const OpKernelContext* ctx, xla::XlaOp handle,
+                            const XlaOpKernelContext* ctx, xla::XlaOp handle,
                             xla::XlaBuilder* builder) {
   const XlaExpression* expression = CastExpressionFromTensor(tensor);
   XlaResource* variable = expression->resource();
@@ -569,11 +488,14 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type,
 
   TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape));
 
-  XlaContext& xla_context = XlaContext::Get(ctx);
-  TF_ASSIGN_OR_RETURN(TensorShape representation_shape,
-                      xla_context.RepresentationShape(shape, type));
-  if (shape != representation_shape) {
-    handle = xla::Reshape(handle, representation_shape.dim_sizes());
+  TF_ASSIGN_OR_RETURN(
+      xla::Shape representation_shape,
+      ctx->compiler()->options().shape_representation_fn(shape, type));
+  xla::Shape xla_shape;
+  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape));
+  if (!xla::ShapeUtil::Compatible(xla_shape, representation_shape)) {
+    handle = xla::Reshape(handle,
+                          xla::AsInt64Slice(representation_shape.dimensions()));
   }
   return variable->SetValue(handle);
 }
@@ -583,19 +505,15 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type,
 Status XlaOpKernelContext::AssignVariable(int input_index, DataType type,
                                           xla::XlaOp handle) {
   TF_RET_CHECK(handle.valid());
-  return AssignVariableTensor(context_->input(input_index), type, context_,
-                              handle, builder());
+  return AssignVariableTensor(context_->input(input_index), type, this, handle,
+                              builder());
 }
 
 Status XlaOpKernelContext::AssignVariable(absl::string_view name, DataType type,
                                           xla::XlaOp handle) {
   TF_RET_CHECK(handle.valid());
-  return AssignVariableTensor(GetInputTensorByName(name), type, context_,
-                              handle, builder());
-}
-
-XlaCompiler* XlaOpKernelContext::compiler() const {
-  return XlaContext::Get(context_).compiler();
+  return AssignVariableTensor(GetInputTensorByName(name), type, this, handle,
+                              builder());
 }
 
 void XlaOpKernelContext::CtxFailure(const Status& s) {
@@ -615,22 +533,22 @@ void XlaOpKernelContext::CtxFailureWithWarning(const char* file, int line,
 
 const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMax(
     const DataType type) {
-  return XlaContext::Get(context_).GetOrCreateMax(type);
+  return xla_context()->GetOrCreateMax(type);
 }
 
 const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMin(
     const DataType type) {
-  return XlaContext::Get(context_).GetOrCreateMin(type);
+  return xla_context()->GetOrCreateMin(type);
 }
 
 const xla::XlaComputation* XlaOpKernelContext::GetOrCreateAdd(
     const DataType type) {
-  return XlaContext::Get(context_).GetOrCreateAdd(type);
+  return xla_context()->GetOrCreateAdd(type);
 }
 
 const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMul(
     const DataType type) {
-  return XlaContext::Get(context_).GetOrCreateMul(type);
+  return xla_context()->GetOrCreateMul(type);
 }
 
 const Tensor& XlaOpKernelContext::GetInputTensorByName(absl::string_view name) {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index aa00a454968ad29495e34dc080e55b62bb0b5f7b..1858844bc05a6e12abbf07af83cad816590ddd03 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -60,6 +60,8 @@ class XlaOpKernelContext {
  public:
   explicit XlaOpKernelContext(OpKernelContext* context);
 
+  XlaContext* xla_context() const;
+
   // Returns the XLA XlaBuilder containing the output of compilation.
   xla::XlaBuilder* builder() const;
 
@@ -88,9 +90,9 @@ class XlaOpKernelContext {
   // Returns input `index` as a XlaOp. Unlike
   // OpKernelContext::Input returns a symbolic value rather than a concrete
   // Tensor.
-  const xla::XlaOp& Input(int index);
+  xla::XlaOp Input(int index);
   // Returns input `name` as a XlaOp.
-  const xla::XlaOp& Input(absl::string_view name);
+  xla::XlaOp Input(absl::string_view name);
 
   // Returns true if all inputs are the same shape, otherwise sets the
   // status to a non-OK value and returns false.
@@ -111,14 +113,6 @@ class XlaOpKernelContext {
   Status ConstantInput(int index, xla::Literal* constant_literal);
   Status ConstantInput(absl::string_view name, xla::Literal* constant_literal);
 
-  // Evaluates input `index`, reshapes it to `new_shape` if new_shape !=
-  // InputShape(index), and stores it in `*constant_literal`. If the input
-  // cannot be evaluated, e.g., because it depends on unbound parameters,
-  // returns a non-Ok status. If InputShape(index).num_elements() !=
-  // new_shape.num_elements(), returns an error status.
-  Status ConstantInputReshaped(int index, absl::Span<const int64> new_dims,
-                               xla::Literal* constant_literal);
-
   // Converts a constant scalar int32 or int64 tensor into an int64.
   Status ConstantInputAsIntScalar(int index, int64* out);
   Status ConstantInputAsIntScalar(absl::string_view name, int64* out);
@@ -134,6 +128,8 @@ class XlaOpKernelContext {
   // Reshapes and converts a constant int32 or int64 tensor into a vector of
   // int64s.
   Status ConstantInputReshapedToIntVector(int index, std::vector<int64>* out);
+  Status ConstantInputReshapedToIntVector(absl::string_view name,
+                                          std::vector<int64>* out);
 
   // Converts a constant int32 or int64 Tensor into an xla int64 Literal.
   Status ConstantInputAsInt64Literal(int index, xla::Literal* out);
@@ -148,6 +144,10 @@ class XlaOpKernelContext {
   Status ConstantInputList(absl::string_view name,
                            std::vector<xla::Literal>* literals);
 
+  // Returns an XlaExpression describing the value of 'index'.
+  const XlaExpression& InputExpression(int index);
+  const XlaExpression& InputExpression(absl::string_view name);
+
   // Outputs
 
   int num_outputs() const { return context_->num_outputs(); }
@@ -165,9 +165,8 @@ class XlaOpKernelContext {
   // SetConstantOutput where possible.
   void SetConstantOutput(int index, const Tensor& host_tensor);
 
-  // Sets output `index` to an invalid value.
-  // Any subsequent attempt to consume this output will cause an error.
-  void SetInvalidOutput(int index);
+  // Returns an XlaExpression describing the value of 'index'.
+  void SetOutputExpression(int index, const XlaExpression& expression);
 
   // Status handling.
   void SetStatus(const Status& status) { context_->SetStatus(status); }
@@ -255,10 +254,13 @@ class XlaOpKernelContext {
   // Returns the tensor of input `name`.
   const Tensor& GetInputTensorByName(absl::string_view name);
 
-  // Wraps OpKernelContext's allocate_output method while providing special
-  // behavior for DT_VARIANT: a variant is treated as DT_UINT8 scalar as the
-  // type to allow mapping for variant to more generic types.
-  Status allocate_output(int index, const xla::Shape& shape, Tensor** output);
+  // Evaluates input `index`, reshapes it to `new_shape` if new_shape !=
+  // InputShape(index), and stores it in `*constant_literal`. If the input
+  // cannot be evaluated, e.g., because it depends on unbound parameters,
+  // returns a non-Ok status. If InputShape(index).num_elements() !=
+  // new_shape.num_elements(), returns an error status.
+  Status ConstantInputReshaped(int index, absl::Span<const int64> new_dims,
+                               xla::Literal* constant_literal);
 
   OpKernelContext* const context_;
 };
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 91d48125f1d21092db7e5f9307e44af9c16e4e2b..14237df69081016817fbd1a5332f22996e7f264d 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <functional>
 #include <memory>
 
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
@@ -128,21 +130,26 @@ XlaOpRegistry::~XlaOpRegistry() = default;
   // Lazily register the CPU and GPU JIT devices the first time
   // GetCompilationDevice is called.
   static void* registration_init = [&registry]() {
+    MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
+    bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
+
     mutex_lock lock(registry.mutex_);
     if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_CPU)).ok()) {
       DeviceRegistration& registration =
           registry.compilation_devices_[DEVICE_CPU];
       registration.compilation_device_name = DEVICE_CPU_XLA_JIT;
-      registration.requires_compilation = false;
-      registration.enable_jit_by_default = false;
+      registration.autoclustering_policy =
+          cpu_global_jit
+              ? XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally
+              : XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested;
       registration.compile_resource_ops = false;
     }
     if (LaunchOpHasKernelForDevice(DeviceType(DEVICE_GPU)).ok()) {
       DeviceRegistration& registration =
           registry.compilation_devices_[DEVICE_GPU];
       registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
-      registration.requires_compilation = false;
-      registration.enable_jit_by_default = true;
+      registration.autoclustering_policy =
+          XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally;
       registration.compile_resource_ops = false;
     }
     return nullptr;
@@ -341,18 +348,69 @@ std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
   return ops;
 }
 
-/* static */ const std::unordered_set<string>*
-XlaOpRegistry::CompileTimeConstantInputs(const string& op) {
-  XlaOpRegistry& registry = Instance();
-  mutex_lock lock(registry.mutex_);
-  auto it = registry.ops_.find(op);
-  if (it == registry.ops_.end() || it->second.empty()) {
-    return nullptr;
+/* static */ Status XlaOpRegistry::CompileTimeConstantInputs(
+    const NodeDef& node_def, const OpKernel* op_kernel, const OpDef* op_def,
+    std::vector<int>* result) {
+  result->clear();
+
+  DCHECK(op_def != nullptr || op_kernel != nullptr);
+
+  std::unordered_set<string> compile_time_constant_inputs_from_attr;
+  std::vector<string> compile_time_constant_inputs_vect_from_attr;
+
+  const std::unordered_set<string>* compile_time_constant_inputs;
+
+  if (GetNodeAttr(node_def, kXlaCompileTimeConstantInputsAttr,
+                  &compile_time_constant_inputs_vect_from_attr)
+          .ok()) {
+    absl::c_copy(compile_time_constant_inputs_vect_from_attr,
+                 std::inserter(compile_time_constant_inputs_from_attr,
+                               compile_time_constant_inputs_from_attr.end()));
+    compile_time_constant_inputs = &compile_time_constant_inputs_from_attr;
+  } else {
+    const string& op = node_def.op();
+
+    XlaOpRegistry& registry = Instance();
+    mutex_lock lock(registry.mutex_);
+    auto it = registry.ops_.find(op);
+    if (it == registry.ops_.end() || it->second.empty()) {
+      return Status::OK();
+    } else {
+      // The test in IsCompatible ensures that if there are multiple matching
+      // registrations for this op name, they all have the same value of
+      // compile_time_constant_inputs, so only the first match is returned.
+      //
+      // TODO(sanjoy): This can probably be a std::vector<string>.
+      compile_time_constant_inputs =
+          &it->second.front()->compile_time_constant_inputs;
+    }
   }
-  // The test in IsCompatible ensures that if there are multiple matching
-  // registrations for this op name, they all have the same value of
-  // compile_time_constant_inputs, so only the first match is returned.
-  return &it->second.front()->compile_time_constant_inputs;
+
+  for (const string& input : *compile_time_constant_inputs) {
+    if (op_def) {
+      NameRangeMap input_name_ranges;
+      TF_RETURN_IF_ERROR(
+          NameRangesForNode(node_def, *op_def, &input_name_ranges, nullptr));
+      auto name_range = input_name_ranges.find(input);
+      if (name_range == input_name_ranges.end()) {
+        continue;
+      }
+
+      for (int i = name_range->second.first; i < name_range->second.second;
+           i++) {
+        result->push_back(i);
+      }
+    } else {
+      int start, stop;
+      TF_CHECK_OK(op_kernel->InputRange(input, &start, &stop));
+      for (int i = start; i < stop; ++i) {
+        result->push_back(i);
+      }
+    }
+  }
+
+  absl::c_sort(*result);
+  return Status::OK();
 }
 
 /*static*/ bool XlaOpRegistry::IsMetadataOp(const string& op) {
@@ -445,7 +503,7 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint(
   return *this;
 }
 
-XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::CompileTimeConstInput(
+XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::CompileTimeConstantInput(
     absl::string_view input_name) {
   registration_->compile_time_constant_inputs.emplace(input_name);
   return *this;
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 4b2c2bacd647b3e6fe500a942b116772550195ce..0bdd4a1085445420a5147756daac4a54f4725f11 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -66,19 +66,26 @@ class XlaOpRegistry {
  public:
   typedef OpKernel* (*Factory)(OpKernelConstruction*);
 
+  enum class AutoclusteringPolicy {
+    // Enable autoclustering if the user requests it, e.g., via
+    // experimental_jit_scope. Does not autocluster if the JIT is enabled
+    // globally (e.g., via the OptimizerOptions in the TF session
+    // configuration.)
+    kIfExplicitlyRequested,
+    // Enable autoclustering if explicitly requested, or if the JIT is enabled
+    // globally in the session options, or via TF_XLA_FLAGS=--tf_xla_auto_jit=N.
+    kIfEnabledGlobally,
+    // Always try to autocluster ops placed on this device.
+    kAlways,
+  };
+
   // Describes how to compile operators assigned to a device.
   struct DeviceRegistration {
     // The name of the an XLA compilation device to use to compile code.
     string compilation_device_name;
 
-    // Do operators assigned to this device require compilation?
-    bool requires_compilation;
-
-    // If !requires_compilation, should we try to JIT operators on this device
-    // when XLA JIT compilation is enabled globally via the SessionOptions?
-    // (It is still possible to explicitly mark operators to JIT compile, even
-    // if enable_jit_by_default is false.)
-    bool enable_jit_by_default;
+    // When should we autocluster operators assigned to this device?
+    AutoclusteringPolicy autoclustering_policy;
 
     // Enable compilation of operators that use DT_RESOURCE types?
     bool compile_resource_ops = false;
@@ -106,6 +113,7 @@ class XlaOpRegistry {
 
   // Registers `device_name` for XLA compilation, using information from
   // `registration`.
+  // Does nothing if a registration for `device_name` already exists.
   static void RegisterCompilationDevice(const string& device_name,
                                         const DeviceRegistration& registration);
 
@@ -132,10 +140,27 @@ class XlaOpRegistry {
   // Returns all operations for which there are XLA kernels on any device.
   static std::vector<string> GetAllRegisteredOps();
 
-  // Returns the set of compile-time constant inputs to 'op'. Returns nullptr
-  // if the op is not registered.
-  static const std::unordered_set<string>* CompileTimeConstantInputs(
-      const string& op);
+  // Returns (via `result`) the indices of inputs to `node_def` that must be
+  // compile-time constants. Returns an empty vector if the op is not
+  // registered.
+  //
+  // `result` is sorted.
+  static Status CompileTimeConstantInputs(const NodeDef& node_def,
+                                          const OpDef& op_def,
+                                          std::vector<int>* result) {
+    return CompileTimeConstantInputs(node_def, /*op_kernel=*/nullptr, &op_def,
+                                     result);
+  }
+
+  // Returns (via `result`) the indices of inputs to `op_kernel` that must be
+  // compile-time constants.
+  //
+  // `result` is sorted.
+  static Status CompileTimeConstantInputs(const OpKernel& op_kernel,
+                                          std::vector<int>* result) {
+    return CompileTimeConstantInputs(op_kernel.def(), /*op_kernel=*/&op_kernel,
+                                     /*op_def=*/nullptr, result);
+  }
 
   // Returns true if `op` is a "metadata" op, one that only looks at the shapes
   // of its operands and not their values.
@@ -212,6 +237,11 @@ class XlaOpRegistry {
   // whitelists must not intersect.
   static bool IsCompatible(const OpRegistration& x, const OpRegistration& y);
 
+  static Status CompileTimeConstantInputs(const NodeDef& node_def,
+                                          const OpKernel* op_kernel,
+                                          const OpDef* op_def,
+                                          std::vector<int>* result);
+
   // Map from operator name to OpRegistrations, populated by REGISTER_XLA_OP.
   // Registrations present under the same key must satisfy IsCompatible above,
   // and this is checked during registration.
@@ -263,7 +293,8 @@ class XlaOpRegistrationBuilder {
   XlaOpRegistrationBuilder& AllowResourceTypes();
 
   // Mark 'input_name' as an argument whose value must be known at compile-time.
-  XlaOpRegistrationBuilder& CompileTimeConstInput(absl::string_view input_name);
+  XlaOpRegistrationBuilder& CompileTimeConstantInput(
+      absl::string_view input_name);
 
   // Mark this op as a "metadata" op, one that only looks at the shapes of its
   // operands and not their values.
diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc
index 56c2e01055665954b99ea635e56666fbd8b96026..48a3c012727acd8472d3d5d4072ae700f5497d96 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.cc
+++ b/tensorflow/compiler/tf2xla/xla_resource.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <functional>
 #include <memory>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -26,10 +27,44 @@ limitations under the License.
 
 namespace tensorflow {
 
+/*static*/ absl::string_view XlaResource::KindToString(XlaResource::Kind kind) {
+  switch (kind) {
+    case XlaResource::kInvalid:
+      return "invalid";
+    case XlaResource::kVariable:
+      return "variable";
+    case XlaResource::kStack:
+      return "stack";
+    case XlaResource::kTensorArray:
+      return "tensorarray";
+  }
+}
+
+/*static*/ std::unique_ptr<XlaResource> XlaResource::CreateStack(
+    string name, DataType type, int64 max_size) {
+  return absl::make_unique<XlaResource>(
+      XlaResource::kStack, /*arg_num=*/-1, std::move(name), type, TensorShape(),
+      /*initial_value=*/xla::XlaOp(),
+      /*max_array_size=*/max_size,
+      /*tensor_array_gradients=*/std::set<string>{},
+      /*tensor_array_multiple_writes_aggregate=*/false);
+}
+
+/*static*/ std::unique_ptr<XlaResource> XlaResource::CreateTensorArray(
+    string name, DataType type, TensorShape shape, xla::XlaOp initial_value,
+    int64 max_array_size) {
+  return absl::make_unique<XlaResource>(
+      XlaResource::kTensorArray, /*arg_num=*/-1, std::move(name), type, shape,
+      initial_value, max_array_size,
+      /*tensor_array_gradients=*/std::set<string>{},
+      /*tensor_array_multiple_writes_aggregate=*/false);
+}
+
 XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type,
                          TensorShape shape, const xla::XlaOp& initial_value,
-                         int64 tensor_array_size,
-                         const std::set<string>& tensor_array_gradients)
+                         int64 max_array_size,
+                         const std::set<string>& tensor_array_gradients,
+                         bool tensor_array_multiple_writes_aggregate)
     : kind_(kind),
       arg_num_(arg_num),
       name_(std::move(name)),
@@ -37,14 +72,17 @@ XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type,
       shape_(std::move(shape)),
       value_(initial_value),
       initial_value_(initial_value),
-      tensor_array_size_(tensor_array_size) {
+      max_array_size_(max_array_size),
+      tensor_array_multiple_writes_aggregate_(
+          tensor_array_multiple_writes_aggregate) {
   CHECK(kind_ != kInvalid);
 
   for (const string& gradient : tensor_array_gradients) {
     tensor_array_gradients_[gradient].reset(new XlaResource(
         /*kind=*/kTensorArray, /*arg_num=*/-1,
         /*name=*/absl::StrCat("TensorArrayGrad: ", name_), type_, shape_,
-        xla::XlaOp(), tensor_array_size_, /*tensor_array_gradients=*/{}));
+        xla::XlaOp(), max_array_size_, /*tensor_array_gradients=*/{},
+        /*tensor_array_multiple_writes_aggregate=*/true));
   }
 }
 
@@ -96,7 +134,7 @@ Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) {
     }
     case kTensorArray: {
       TensorShape ta_shape;
-      ta_shape.AddDim(tensor_array_size_);
+      ta_shape.AddDim(max_array_size_);
       ta_shape.AppendShape(shape_);
       value_ = xla::Broadcast(XlaHelpers::Zero(builder, type_),
                               ta_shape.dim_sizes());
@@ -104,7 +142,7 @@ Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) {
     }
     case kStack: {
       TensorShape ta_shape;
-      ta_shape.AddDim(tensor_array_size_);
+      ta_shape.AddDim(max_array_size_);
       ta_shape.AppendShape(shape_);
       value_ =
           xla::Tuple(builder, {xla::Broadcast(XlaHelpers::Zero(builder, type_),
@@ -129,15 +167,16 @@ Status XlaResource::GetOrCreateTensorArrayGradient(const string& source,
   std::unique_ptr<XlaResource>& gradient = tensor_array_gradients_[source];
   if (!gradient) {
     TensorShape ta_shape;
-    ta_shape.AddDim(tensor_array_size_);
+    ta_shape.AddDim(max_array_size_);
     ta_shape.AppendShape(shape_);
     xla::XlaOp gradient_value =
         xla::Broadcast(XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes());
     gradient.reset(
         new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1,
                         /*name=*/absl::StrCat("TensorArrayGrad: ", name_),
-                        type_, shape_, gradient_value, tensor_array_size_,
-                        /*tensor_array_gradients=*/{}));
+                        type_, shape_, gradient_value, max_array_size_,
+                        /*tensor_array_gradients=*/{},
+                        /*tensor_array_multiple_writes_aggregate=*/true));
   }
   *gradient_out = gradient.get();
   return Status::OK();
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index 2438490be13809b9f3571a362900b44cb838e76b..736588bb8b89ba756cdce77eeebff8d1fcf4774c 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -35,11 +36,22 @@ class XlaResource {
     kTensorArray,
     kStack,
   };
+  static absl::string_view KindToString(Kind kind);
+
+  // Creates a new Stack resource.
+  static std::unique_ptr<XlaResource> CreateStack(string name, DataType type,
+                                                  int64 max_size);
+
+  // Creates a new TensorArray resource.
+  static std::unique_ptr<XlaResource> CreateTensorArray(
+      string name, DataType type, TensorShape shape, xla::XlaOp initial_value,
+      int64 max_array_size);
 
   XlaResource(Kind kind, int arg_num, string name, DataType type,
               TensorShape shape, const xla::XlaOp& initial_value,
-              int64 tensor_array_size,
-              const std::set<string>& tensor_array_gradients);
+              int64 max_array_size,
+              const std::set<string>& tensor_array_gradients,
+              bool tensor_array_multiple_writes_aggregate);
 
   XlaResource(const XlaResource&) = delete;
   XlaResource(XlaResource&&) = delete;
@@ -113,13 +125,19 @@ class XlaResource {
                      const xla::XlaOp& pack, xla::XlaBuilder* builder);
 
   // TensorArray and Stack specific fields
+  // TODO(phawkins): refactor this code to use subclasses, rather than putting
+  // kind-specific fields in XlaResource.
 
-  // 'tensor_array_size' stores the expected size of the TensorArray or Stack.
+  // 'max_array_size' stores the expected size of the TensorArray or Stack.
   // We need to store this since sometimes TensorArrays must be initialized
   // lazily since we do not know the element shape at construction time.
   // Used by both TensorArrays and Stacks.
-  int64 tensor_array_size() const { return tensor_array_size_; }
-  void set_tensor_array_size(int64 size) { tensor_array_size_ = size; }
+  int64 max_array_size() const { return max_array_size_; }
+  void set_max_array_size(int64 size) { max_array_size_ = size; }
+
+  bool tensor_array_multiple_writes_aggregate() const {
+    return tensor_array_multiple_writes_aggregate_;
+  }
 
   // 'tensor_array_gradient' is a map from TensorArrayGradV3 'source' attributes
   // to an XlaResource containing the gradient TensorArrays. We store a pointer
@@ -142,7 +160,8 @@ class XlaResource {
   xla::XlaOp value_;
   xla::XlaOp initial_value_;
 
-  int64 tensor_array_size_ = -1;
+  int64 max_array_size_ = -1;
+  bool tensor_array_multiple_writes_aggregate_ = false;
 
   std::map<string, std::unique_ptr<XlaResource>> tensor_array_gradients_;
 };
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index cc7390c6e60375b4c31c38f9f7dee25730f8f51e..4360e0857964b0ac63fc887e269b04a4b00d854a 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -7,6 +7,7 @@ package_group(
     packages = [
         "//tensorflow/compiler/...",
         "//tensorflow/contrib/tpu/...",
+        "//third_party/py/jax/...",
     ],
 )
 
@@ -67,7 +68,7 @@ cc_library(
     visibility = [":friends"],
     deps = [
         ":xla_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla:debug_options_flags",
     ],
 )
 
@@ -225,12 +226,14 @@ cc_library(
         "index_util.cc",
         "layout_util.cc",
         "primitive_util.cc",
+        "shape.cc",
         "shape_util.cc",
     ],
     hdrs = [
         "index_util.h",
         "layout_util.h",
         "primitive_util.h",
+        "shape.h",
         "shape_util.h",
     ],
     visibility = ["//visibility:public"],
@@ -253,6 +256,23 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "shape_test",
+    srcs = ["shape_test.cc"],
+    deps = [
+        ":shape_util",
+        ":status_macros",
+        ":test",
+        ":test_helpers",
+        ":types",
+        ":util",
+        ":xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 tf_cc_test(
     name = "shape_util_test",
     srcs = ["shape_util_test.cc"],
@@ -308,6 +328,7 @@ cc_library(
         ":util",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -330,6 +351,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -373,6 +395,7 @@ cc_library(
         ":literal_util",
         ":util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
     ],
@@ -731,6 +754,72 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "parse_flags_from_env",
+    srcs = ["parse_flags_from_env.cc"],
+    hdrs = ["parse_flags_from_env.h"],
+    deps =
+        [
+            "//tensorflow/compiler/xla:types",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "@com_google_absl//absl/strings",
+            "@com_google_absl//absl/strings:str_format",
+            "@com_google_absl//absl/types:span",
+        ],
+)
+
+tf_cc_test(
+    name = "parse_flags_from_env_test",
+    srcs = ["parse_flags_from_env_test.cc"],
+    deps =
+        [
+            ":parse_flags_from_env",
+            "//tensorflow/compiler/xla:types",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
+            "@com_google_absl//absl/strings:str_format",
+        ],
+)
+
+cc_library(
+    name = "debug_options_flags",
+    srcs = [
+        "debug_options_flags.cc",
+        "debug_options_parsers.h",
+    ],
+    hdrs = ["debug_options_flags.h"],
+    deps =
+        [
+            ":parse_flags_from_env",
+            "//tensorflow/compiler/xla:xla_proto",
+            "//tensorflow/compiler/xla/service:hlo",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "@com_google_absl//absl/strings",
+        ],
+)
+
+tf_cc_test(
+    name = "debug_options_parsers_test",
+    size = "small",
+    srcs = [
+        "debug_options_parsers.h",
+        "debug_options_parsers_test.cc",
+    ],
+    deps =
+        [
+            "//tensorflow/compiler/xla:xla_proto",
+            "//tensorflow/compiler/xla/service:hlo",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
+            "@com_google_absl//absl/strings",
+            "@com_google_absl//absl/strings:str_format",
+        ],
+)
+
 # -----------------------------------------------------------------------------
 
 # This is a headers target that extra XLA devices can use to prevent circular dependencies.  Devices that are compiled as separate shared objects can also use it to prevent linking of library code.
diff --git a/tensorflow/compiler/xla/README.md b/tensorflow/compiler/xla/README.md
index 39f8caaa961dc7b57d2b45f974fc6ecf89cf6748..f9c93707f7af30a0fa0c4224240dc40848a24f66 100644
--- a/tensorflow/compiler/xla/README.md
+++ b/tensorflow/compiler/xla/README.md
@@ -1,7 +1,6 @@
 <p align="center">
-  <img width="200" src="xlalogo.png"/>
+  <img width="200" src="./g3doc/images/xlalogo.png"/>
 </p>
 
 XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear
-algebra that optimizes TensorFlow computations. See the
-[documentation](https://www.tensorflow.org/performance/xla/) for more details.
+algebra that optimizes TensorFlow computations. See the [documentation](./g3doc/overview.md).
diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h
index 782c966b4c57672d137569a318fb20ace14d493b..e4aca98f67d50287a83afc6f41a59458f3df2da2 100644
--- a/tensorflow/compiler/xla/array2d.h
+++ b/tensorflow/compiler/xla/array2d.h
@@ -104,7 +104,7 @@ std::unique_ptr<Array2D<NativeT>> MakeLinspaceArray2D(double from, double to,
   int64 count = n1 * n2;
   NativeT step =
       static_cast<NativeT>((count > 1) ? (to - from) / (count - 1) : 0);
-  auto set = [&array, n1, n2](int64 index, NativeT value) {
+  auto set = [&array, n2](int64 index, NativeT value) {
     (*array)(index / n2, index % n2) = value;
   };
   for (int64 i = 0; i < count - 1; ++i) {
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index dc097f3696e22d75d7dc72ec4877a9c8b5dda059..fe99564d3c671cd7890e1fa26fcd2e3384972983 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -33,6 +33,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -66,6 +68,7 @@ cc_library(
     deps = [
         ":global_data",
         ":xla_computation",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:service_interface",
@@ -74,11 +77,11 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -88,11 +91,12 @@ cc_library(
     srcs = ["executable_build_options.cc"],
     hdrs = ["executable_build_options.h"],
     deps = [
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
@@ -189,6 +193,7 @@ cc_library(
     hdrs = ["xla_computation.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -234,13 +239,13 @@ tf_cc_test(
     deps = [
         ":xla_builder",
         ":xla_computation",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/core:test",
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 5dde5b432f136c16d4e3795569499ee5de709763..74b76f929949d3300a5d0ff45d5fa4cd9f162642 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -20,9 +20,10 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -42,7 +43,7 @@ StatusOr<Literal> Client::Transfer(const GlobalData& data,
   TransferToClientRequest request;
   *request.mutable_data() = data.handle();
   if (shape_with_layout != nullptr) {
-    *request.mutable_shape_with_layout() = *shape_with_layout;
+    *request.mutable_shape_with_layout() = shape_with_layout->ToProto();
   }
   TransferToClientResponse response;
 
@@ -123,7 +124,7 @@ StatusOr<Literal> Client::TransferFromOutfeed(
   }
   request.set_replica_id(replica_id);
   if (shape_with_layout != nullptr) {
-    *request.mutable_shape_with_layout() = *shape_with_layout;
+    *request.mutable_shape_with_layout() = shape_with_layout->ToProto();
   }
   TransferFromOutfeedResponse response;
 
@@ -170,11 +171,14 @@ StatusOr<Literal> Client::ExecuteAndTransfer(
       std::unique_ptr<GlobalData> data,
       Execute(computation, arguments, execution_options, execution_profile));
 
-  const Shape* shape_with_output_layout = nullptr;
+  absl::optional<Shape> shape_with_output_layout;
   if (execution_options && execution_options->has_shape_with_output_layout()) {
-    shape_with_output_layout = &execution_options->shape_with_output_layout();
+    shape_with_output_layout =
+        Shape(execution_options->shape_with_output_layout());
   }
-  return Transfer(*data, shape_with_output_layout);
+  return Transfer(*data, shape_with_output_layout.has_value()
+                             ? &(*shape_with_output_layout)
+                             : nullptr);
 }
 
 StatusOr<Literal> Client::ComputeConstant(const XlaComputation& computation,
@@ -210,11 +214,10 @@ StatusOr<XlaComputation> Client::LoadSnapshot(const HloSnapshot& module) {
   return XlaComputation(module.hlo().hlo_module());
 }
 
-StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
-    const XlaComputation& computation, absl::Span<GlobalData* const> arguments,
-    const ExecutionOptions* execution_options,
-    ExecutionProfile* execution_profile) {
-  ExecuteGraphRequest request;
+StatusOr<ExecutionHandle> Client::Compile(
+    const XlaComputation& computation, absl::Span<const Shape> argument_shapes,
+    const ExecutionOptions* execution_options) {
+  CompileRequest request;
   *request.mutable_computation() = computation.proto();
 
   if (execution_options == nullptr) {
@@ -222,6 +225,34 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
   } else {
     *request.mutable_execution_options() = *execution_options;
   }
+  if (request.execution_options().device_handles_size() > 1) {
+    return InvalidArgument(
+        "Compiling with multiple device handles is not supported. Use "
+        "'Execute' instead.");
+  }
+
+  // The argument shapes affect how the computation is compiled.
+  for (const auto& arg_shape : argument_shapes) {
+    *request.add_input_shape_with_layout() = arg_shape.ToProto();
+  }
+
+  CompileResponse response;
+  VLOG(1) << "making compile request: " << request.ShortDebugString();
+  Status s = stub_->Compile(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+  TF_RET_CHECK(response.has_handle());
+  return response.handle();
+}
+
+StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
+    const ExecutionHandle& handle, absl::Span<GlobalData* const> arguments,
+    ExecutionProfile* execution_profile) {
+  ExecuteRequest request;
+  *request.mutable_handle() = handle;
   for (GlobalData* argument : arguments) {
     CHECK(argument != nullptr) << "Argument pointers must not be null.";
     *request.add_arguments() = argument->handle();
@@ -229,7 +260,7 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
 
   ExecuteResponse response;
   VLOG(1) << "making execute request: " << request.ShortDebugString();
-  Status s = stub_->ExecuteGraph(&request, &response);
+  Status s = stub_->Execute(&request, &response);
   VLOG(1) << "done with request";
 
   if (!s.ok()) {
@@ -238,15 +269,62 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
 
   if (execution_profile != nullptr) {
     *execution_profile = response.profile();
+  }
+
+  return absl::make_unique<GlobalData>(stub_, response.output());
+}
+
+StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
+    const XlaComputation& computation, absl::Span<GlobalData* const> arguments,
+    const ExecutionOptions* execution_options,
+    ExecutionProfile* execution_profile) {
+  if (execution_options != nullptr &&
+      execution_options->device_handles_size() > 1) {
+    std::vector<XlaComputationInstance> computation_instances = {
+        XlaComputationInstance{
+            computation,
+            std::vector<GlobalData*>(arguments.begin(), arguments.end()),
+            *execution_options, execution_profile}};
+    TF_ASSIGN_OR_RETURN(auto results, ExecuteParallel(computation_instances));
+    // The result selection is a bit hacky, but better than assuming it is
+    // device 0.
+    //
+    // TODO(b/118493728): Allow Execute to return one result per computation.
+    for (int64 i = 0; i < results.size(); i++) {
+      TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(*results[i]));
+      if (!ShapeUtil::IsEmptyTuple(shape)) {
+        VLOG(3) << "Fetching result from device " << i << ": "
+                << ShapeUtil::HumanString(shape);
+        return std::move(results[i]);
+      }
+    }
+    TF_RET_CHECK(!results.empty());
+    VLOG(1) << "Defaulting to device 0 result";
+    return std::move(results[0]);
+  }
+
+  // The argument shapes affect how the computation is compiled.
+  std::vector<Shape> arg_shapes(arguments.size());
+  for (int i = 0; i < arguments.size(); i++) {
+    TF_ASSIGN_OR_RETURN(arg_shapes[i], GetShape(*arguments[i]));
+  }
+
+  TF_ASSIGN_OR_RETURN(auto handle,
+                      Compile(computation, arg_shapes, execution_options));
+
+  TF_ASSIGN_OR_RETURN(auto result,
+                      Execute(handle, arguments, execution_profile));
+
+  if (execution_profile != nullptr) {
     if (VLOG_IS_ON(1)) {
       TF_ASSIGN_OR_RETURN(
           auto execution_stats,
-          ExecutionStatsAsString(computation, response.profile()));
+          ExecutionStatsAsString(computation, *execution_profile));
       VLOG(1) << execution_stats;
     }
   }
 
-  return absl::make_unique<GlobalData>(stub_, response.output());
+  return std::move(result);
 }
 
 StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
@@ -274,10 +352,11 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
   }
 
   std::vector<std::unique_ptr<GlobalData>> outputs;
-  for (size_t i = 0; i < computations.size(); ++i) {
+  for (size_t i = 0; i < response.responses_size(); ++i) {
     outputs.push_back(
         absl::make_unique<GlobalData>(stub_, response.responses(i).output()));
-    if (computations[i].execution_profile != nullptr) {
+    if (i < computations.size() &&
+        computations[i].execution_profile != nullptr) {
       *computations[i].execution_profile = response.responses(i).profile();
     }
   }
@@ -312,7 +391,7 @@ StatusOr<std::vector<DeviceHandle>> Client::GetDeviceHandles(
 
 Status Client::Unregister(const GlobalData& data) {
   UnregisterRequest request;
-  *request.mutable_data() = data.handle();
+  *request.add_data() = data.handle();
   UnregisterResponse response;
 
   VLOG(1) << "making unregister request";
@@ -383,15 +462,14 @@ StatusOr<Shape> Client::GetShape(const GlobalData& data) {
     return s;
   }
 
-  return response.shape();
+  return Shape(response.shape());
 }
 
 StatusOr<string> Client::ExecutionStatsAsString(
     const XlaComputation& computation, const ExecutionProfile& profile) {
   TF_ASSIGN_OR_RETURN(
       auto computation_stats,
-      GetComputationStats(computation,
-                          legacy_flags::GetDebugOptionsFromFlags()));
+      GetComputationStats(computation, GetDebugOptionsFromFlags()));
   int64 total_flops =
       computation_stats.flop_count() + computation_stats.transcendental_count();
   if (profile.compute_time_ns() > 0) {
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index 6f4d33c469f1f885cfeef546e3981dc3417ef71f..d0ac4703c632e0e01d3c8911594b46fedf28930d 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -40,6 +40,31 @@ class Client {
   explicit Client(ServiceInterface* stub);
   virtual ~Client();
 
+  // Compile the computation with the given argument shapes and returns the
+  // handle to the compiled executable. The compiled executable is cached on the
+  // service, and the returned handle can be used for exection without
+  // re-compile.
+  // * The shape and layout of the arguments being executed with will affect how
+  //   the computation is compiled. If argument_shapes is empty, the parameters'
+  //   shape and layout will be used in the compilation.
+  // * If execution_options is not nullptr, these options are passed to the
+  //   service to affect how it compiles our computation.  (The pointer does not
+  //   need to live beyond this call.)
+  // * If execution_options.device_handles should be empty. If you need
+  //   non-empty device handles, call 'Execute' instead.
+  StatusOr<ExecutionHandle> Compile(
+      const XlaComputation& computation,
+      absl::Span<const Shape> argument_shapes,
+      const ExecutionOptions* execution_options = nullptr);
+
+  // Executes the compiled executable for the given handle with the given
+  // arguments and returns the global data that was produced from the execution.
+  // * If execution_profile is not nullptr then the pointed-to ExecutionProfile
+  //   will be filled with profile data from the execution.
+  StatusOr<std::unique_ptr<GlobalData>> Execute(
+      const ExecutionHandle& handle, absl::Span<GlobalData* const> arguments,
+      ExecutionProfile* execution_profile = nullptr);
+
   // Executes the computation with the given arguments and returns the global
   // data that was produced from the execution.
   // * If execution_options is not nullptr, these options are passed to the
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index 0f1745366b7c33e573aff2e66d85431b01488c49..1f594e551af381d7537e947892cbf7e0b5b3b861 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 
 #include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 
 namespace xla {
@@ -39,6 +40,13 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
 
 int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; }
 
+DebugOptions* ExecutableBuildOptions::mutable_debug_options() {
+  if (!has_debug_options()) {
+    debug_options_ = GetDebugOptionsFromFlags();
+  }
+  return &debug_options_.value();
+}
+
 ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout(
     const Shape& shape_with_layout) {
   result_layout_set_ = true;
@@ -55,68 +63,10 @@ string ExecutableBuildOptions::ToString() const {
   if (result_layout_set_) {
     result_layout = ShapeUtil::HumanStringWithLayout(result_layout_);
   }
-  string generate_hlo_graph = "nullopt";
-  if (generate_hlo_graph_.has_value()) {
-    generate_hlo_graph = generate_hlo_graph_.value();
-  }
   return absl::StrFormat(
       "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, "
       "generate_hlo_graph=%s}",
-      device_ordinal_, result_layout, generate_hlo_graph);
-}
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph(
-    string regex) {
-  generate_hlo_graph_ = std::move(regex);
-  return *this;
-}
-
-const absl::optional<string>& ExecutableBuildOptions::generate_hlo_graph()
-    const {
-  return generate_hlo_graph_;
-}
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_dump_optimized_hlo_proto_to(
-    absl::string_view dirpath) {
-  dump_optimized_hlo_proto_to_ = string(dirpath);
-  return *this;
-}
-
-const absl::optional<string>&
-ExecutableBuildOptions::dump_optimized_hlo_proto_to() const {
-  return dump_optimized_hlo_proto_to_;
-}
-
-ExecutableBuildOptions&
-ExecutableBuildOptions::set_dump_unoptimized_hlo_proto_to(
-    absl::string_view dirpath) {
-  dump_unoptimized_hlo_proto_to_ = string(dirpath);
-  return *this;
-}
-
-const absl::optional<string>&
-ExecutableBuildOptions::dump_unoptimized_hlo_proto_to() const {
-  return dump_unoptimized_hlo_proto_to_;
-}
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_dump_per_pass_hlo_proto_to(
-    absl::string_view dirpath) {
-  dump_per_pass_hlo_proto_to_ = string(dirpath);
-  return *this;
-}
-
-const absl::optional<string>&
-ExecutableBuildOptions::dump_per_pass_hlo_proto_to() const {
-  return dump_per_pass_hlo_proto_to_;
-}
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_hlo_profile(bool enabled) {
-  hlo_profile_ = enabled;
-  return *this;
-}
-
-absl::optional<bool> ExecutableBuildOptions::hlo_profile() const {
-  return hlo_profile_;
+      device_ordinal_, result_layout, debug_options().xla_generate_hlo_graph());
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index 93334db88bc24f2ffbf3c7a57ee45ef238286739..a58090253bfac7779e4b61bc7231a0f0d945cc00 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -19,7 +19,9 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -44,6 +46,12 @@ class ExecutableBuildOptions {
   ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
   const Shape* result_layout() const;
 
+  // Expose access to the XLA debug options which will be passed to the
+  // compilation process.
+  bool has_debug_options() const { return debug_options_.has_value(); }
+  const DebugOptions& debug_options() const { return *debug_options_; }
+  DebugOptions* mutable_debug_options();
+
   // If set, this specifies an allocator that can be used to allocate temporary
   // space on the device during compilation.  For example, the compiler might
   // want to run various algorithms on the device and pick the fastest one -- it
@@ -55,56 +63,16 @@ class ExecutableBuildOptions {
       DeviceMemoryAllocator* allocator);
   DeviceMemoryAllocator* device_allocator() const;
 
-  // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions).
-  ExecutableBuildOptions& set_generate_hlo_graph(string regex);
-  const absl::optional<string>& generate_hlo_graph() const;
-
-  // If set, specifies a dirpath to dump the end-of-optimization-pipeline HLO
-  // protobuf to (as in DebugOptions).
-  ExecutableBuildOptions& set_dump_optimized_hlo_proto_to(
-      absl::string_view dirpath);
-  const absl::optional<string>& dump_optimized_hlo_proto_to() const;
-
-  // If set, specifies a dirpath to dump the start-of-optimization-pipeline HLO
-  // protobuf to (as in DebugOptions).
-  ExecutableBuildOptions& set_dump_unoptimized_hlo_proto_to(
-      absl::string_view dirpath);
-  const absl::optional<string>& dump_unoptimized_hlo_proto_to() const;
-
-  // If set, specifies a dirpath to dump the per-pass-in-pipeline HLO protobufs
-  // to (as in DebugOptions).
-  ExecutableBuildOptions& set_dump_per_pass_hlo_proto_to(
-      absl::string_view dirpath);
-  const absl::optional<string>& dump_per_pass_hlo_proto_to() const;
-
-  // If true, specifies that we should record an HLO profile during execution
-  // and log it after execution (as in DebugOptions). If nullopt the default is
-  // used.
-  ExecutableBuildOptions& set_hlo_profile(bool enabled);
-  absl::optional<bool> hlo_profile() const;
-
-  void add_disabled_hlo_pass(absl::string_view pass_name) {
-    disabled_hlo_passes_.push_back(std::string(pass_name));
-  }
-  const absl::Span<const std::string> disabled_hlo_passes() const {
-    return disabled_hlo_passes_;
-  }
-
   // Returns a string representation of the build options, suitable for
   // debugging.
   string ToString() const;
 
  private:
-  absl::optional<bool> hlo_profile_;
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
-  absl::optional<string> generate_hlo_graph_;
-  absl::optional<string> dump_optimized_hlo_proto_to_;
-  absl::optional<string> dump_unoptimized_hlo_proto_to_;
-  absl::optional<string> dump_per_pass_hlo_proto_to_;
+  absl::optional<DebugOptions> debug_options_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
-  std::vector<std::string> disabled_hlo_passes_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/global_data.cc b/tensorflow/compiler/xla/client/global_data.cc
index 2986d4060013703873b2cffb6aacbb012606d16f..f1fa13d95c035d182746d3ce5400178890aa42b1 100644
--- a/tensorflow/compiler/xla/client/global_data.cc
+++ b/tensorflow/compiler/xla/client/global_data.cc
@@ -18,25 +18,53 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
+namespace {
+
+// Releases a set of global data handles owned by the parent service
+// interface.
+void ReleaseHandles(ServiceInterface* parent,
+                    const absl::Span<const GlobalDataHandle> handles) {
+  UnregisterRequest request;
+  for (auto& handle : handles) {
+    VLOG(1) << "Requesting to unregister " << handle.ShortDebugString();
+    *request.add_data() = handle;
+  }
+  UnregisterResponse response;
+  Status status = parent->Unregister(&request, &response);
+  VLOG(1) << "Done with request";
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to unregister handles: " << status
+                 << "; continuing anyway...";
+  }
+}
+
+}  // namespace
 
 GlobalData::GlobalData(ServiceInterface* parent, GlobalDataHandle handle)
     : handle_(std::move(handle)), parent_(parent) {}
 
 GlobalData::~GlobalData() {
-  UnregisterRequest request;
-  *request.mutable_data() = handle_;
-  UnregisterResponse response;
-  VLOG(1) << "requesting to unregister " << handle_.ShortDebugString();
-  Status s = parent_->Unregister(&request, &response);
-  VLOG(1) << "done with request";
+  if (parent_ != nullptr) {
+    ReleaseHandles(parent_, {handle_});
+  }
+}
 
-  if (!s.ok()) {
-    LOG(WARNING) << "failed to unregister " << handle_.ShortDebugString()
-                 << "; continuing anyway...";
+/* static */ void GlobalData::Release(
+    std::vector<std::unique_ptr<GlobalData>> instances) {
+  absl::flat_hash_map<ServiceInterface*, std::vector<GlobalDataHandle>>
+      parent_handles_map;
+  for (auto& instance : instances) {
+    if (instance->parent_ != nullptr) {
+      parent_handles_map[instance->parent_].push_back(instance->Release());
+    }
+  }
+  for (auto& parent_handles : parent_handles_map) {
+    ReleaseHandles(parent_handles.first, parent_handles.second);
   }
 }
 
diff --git a/tensorflow/compiler/xla/client/global_data.h b/tensorflow/compiler/xla/client/global_data.h
index b7929357d06032b55c04bf0391f7fa703ee15f17..4d48d2c53fc6171fe1940924598a4d48519c5adf 100644
--- a/tensorflow/compiler/xla/client/global_data.h
+++ b/tensorflow/compiler/xla/client/global_data.h
@@ -16,6 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_GLOBAL_DATA_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_GLOBAL_DATA_H_
 
+#include <memory>
+#include <vector>
+
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service_interface.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -36,7 +40,18 @@ class GlobalData {
 
   const GlobalDataHandle& handle() const { return handle_; }
 
+  // Releases a set of GlobalData handles. A single RPC will be issued
+  // per unique ServiceInterface of the given GlobalData objects.
+  static void Release(std::vector<std::unique_ptr<GlobalData>> instances);
+
  private:
+  // Detaches the global data handle from the object, such that the destructor
+  // will not try to release it.
+  GlobalDataHandle Release() {
+    parent_ = nullptr;
+    return handle_;
+  }
+
   GlobalDataHandle handle_;   // Handle being wrapped.
   ServiceInterface* parent_;  // Service used to unregister handle_.
 
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index a18c94c4e695a6cdcb9dcc60b64b617cecd276d8..41db8de29ff0085a30847ff41db4ffbfc774e2a1 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -104,13 +104,17 @@ xla_test(
 )
 
 cc_library(
-    name = "numeric",
-    srcs = ["numeric.cc"],
-    hdrs = ["numeric.h"],
+    name = "matrix",
+    srcs = ["matrix.cc"],
+    hdrs = ["matrix.h"],
     deps = [
         ":arithmetic",
         ":constants",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "@com_google_absl//absl/types:span",
@@ -118,11 +122,12 @@ cc_library(
 )
 
 xla_test(
-    name = "numeric_test",
-    srcs = ["numeric_test.cc"],
+    name = "matrix_test",
+    srcs = ["matrix_test.cc"],
     tags = ["enable_for_xla_interpreter"],
     deps = [
-        ":numeric",
+        ":matrix",
+        ":slicing",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -164,11 +169,43 @@ cc_library(
     deps = [
         ":constants",
         ":math",
-        ":numeric",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base",
+    ],
+)
+
+cc_library(
+    name = "slicing",
+    srcs = ["slicing.cc"],
+    hdrs = ["slicing.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+xla_test(
+    name = "slicing_test",
+    srcs = ["slicing_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
+    deps = [
+        ":slicing",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
 
@@ -177,8 +214,9 @@ cc_library(
     srcs = ["sorting.cc"],
     hdrs = ["sorting.h"],
     deps = [
-        ":numeric",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
     ],
@@ -187,10 +225,6 @@ cc_library(
 xla_test(
     name = "sorting_test",
     srcs = ["sorting_test.cc"],
-    blacklisted_backends = [
-        "cpu",
-        "gpu",
-    ],
     tags = ["enable_for_xla_interpreter"],
     deps = [
         ":sorting",
@@ -224,3 +258,48 @@ cc_library(
         "@com_google_absl//absl/strings",
     ],
 )
+
+cc_library(
+    name = "triangular_solve",
+    srcs = ["triangular_solve.cc"],
+    hdrs = ["triangular_solve.h"],
+    deps = [
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "triangular_solve_test",
+    srcs = ["triangular_solve_test.cc"],
+    tags = ["noasan"],  # sometimes times out, http://b/78650012
+    deps = [
+        ":triangular_solve",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index d3d7edb42a38595bbf9fdb36e0dd946ae5df51f9..36fdda39b4124b9100c6054160f9c17bdf787d6f 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -265,6 +265,21 @@ XlaOp Digamma(XlaOp input) {
   return result;
 }
 
+// Implements Banker's rounding: numbers that are equidistant between two
+// integers are rounded towards even.
+XlaOp RoundToEven(XlaOp x) {
+  auto half = ScalarLike(x, 0.5);
+  auto one = ScalarLike(x, 1.0);
+  auto two = ScalarLike(x, 2.0);
+
+  auto round_val = Floor(x);
+  auto fraction = x - round_val;
+  auto nearest_even_int = round_val - two * Floor(half * x);
+  auto is_odd = Eq(nearest_even_int, one);
+  return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)),
+                round_val + one, round_val);
+}
+
 // Trigonometric functions.
 
 // acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x))
@@ -304,4 +319,13 @@ XlaOp Cosh(XlaOp x) { return (Exp(x) + Exp(-x)) * ScalarLike(x, 0.5); }
 
 XlaOp Sinh(XlaOp x) { return (Exp(x) - Exp(-x)) * ScalarLike(x, 0.5); }
 
+XlaOp MaybeConjugate(XlaOp x, bool conjugate) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    auto perform_conj = shape.element_type() == C64 && conjugate;
+    return perform_conj ? Conj(x) : x;
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index a6cafd42077367bf23ffa1f45eab31c01dc31b16..17612bf9fdc0f1eabb338671c93c025c5b268872 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -51,6 +51,10 @@ XlaOp Lgamma(XlaOp input);
 // Computes an approximation of the digamma function.
 XlaOp Digamma(XlaOp input);
 
+// Rounds the given number to even when the number is equidistant between two
+// integers.
+XlaOp RoundToEven(XlaOp x);
+
 // Trigonometric functions
 
 // Computes the arc cosine of 'x'.
@@ -82,6 +86,10 @@ XlaOp Cosh(XlaOp x);
 // Computes the hyperbolic sine of 'x'.
 XlaOp Sinh(XlaOp x);
 
+// Applies a complex conjugation operation if `a` is complex and `conjugate`
+// is true, otherwise returns its argument.
+xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_
diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc
index 14c259a7fa2a47642663b65d2785e5bbdc040cfd..ae2ea225d1aadd7b3a794eabeca866c498f34760 100644
--- a/tensorflow/compiler/xla/client/lib/math_test.cc
+++ b/tensorflow/compiler/xla/client/lib/math_test.cc
@@ -136,5 +136,17 @@ XLA_TEST_F(MathTest, Digamma) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
+XLA_TEST_F(MathTest, RoundToEven) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR1<float>(
+      &builder, {-1.4, -1.5, -2.5, -0.5, 0, 0.5, 1.5, 2.5, 3.5, 4.5});
+  RoundToEven(x);
+
+  std::vector<float> expected = {-1.0, -2.0, -2.0, -0.0, 0,
+                                 0.0,  2.0,  2.0,  4.0,  4.0};
+
+  ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffd744d190885b8e3f4149a48a706498b3787618
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/matrix.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+
+#include <numeric>
+#include <vector>
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+
+XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
+                     int64 n) {
+  auto a = Iota(builder, type, m);
+  auto b = Iota(builder, type, n);
+  auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0});
+  return ConvertElementType(indicator, type);
+}
+
+XlaOp GetMatrixDiagonal(XlaOp x) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    const int64 m = shape.dimensions(n_dims - 2);
+    const int64 n = shape.dimensions(n_dims - 1);
+    absl::Span<const int64> major_dims =
+        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
+    auto a = Iota(builder, U32, n);
+    auto b = Iota(builder, U32, m);
+    auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    auto mask = Broadcast(indicator, major_dims);
+
+    // TPUs don't support S64 add reduction at the moment. But fortunately
+    // OR-reductions work just as well for integers.
+    XlaComputation reducer =
+        primitive_util::IsIntegralType(shape.element_type())
+            ? CreateScalarOrComputation(shape.element_type(), builder)
+            : CreateScalarAddComputation(shape.element_type(), builder);
+
+    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
+                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+  });
+}
+
+XlaOp Triangle(XlaOp x, bool lower) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    const int64 m = shape.dimensions(n_dims - 2);
+    const int64 n = shape.dimensions(n_dims - 1);
+    absl::Span<const int64> major_dims =
+        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
+    auto a = Iota(builder, U32, n);
+    auto b = Iota(builder, U32, m);
+    XlaOp indicator;
+    if (lower) {
+      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    } else {
+      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    }
+    auto mask = Broadcast(indicator, major_dims);
+
+    return Select(mask, x, Zeros(builder, shape));
+  });
+}
+
+XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
+
+XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
+
+XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+    TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y));
+
+    // Check that both tensors have the same number of dimensions. There must be
+    // at least two (the batch dimensions can be empty).
+    if (ShapeUtil::Rank(x_shape) != ShapeUtil::Rank(y_shape)) {
+      return InvalidArgument(
+          "Arguments to BatchDot have different ranks: %s vs. %s",
+          ShapeUtil::HumanString(x_shape), ShapeUtil::HumanString(y_shape));
+    }
+    const int ndims = ShapeUtil::Rank(x_shape);
+    if (ndims < 2) {
+      return InvalidArgument(
+          "Arguments to BatchDot must have rank >= 2: got %d", ndims);
+    }
+
+    // The batch dimensions must be equal and the matrix dimensions must be
+    // valid.
+    std::vector<int64> batch_dimension_numbers;
+    for (int i = 0; i < ndims - 2; ++i) {
+      if (x_shape.dimensions(i) != y_shape.dimensions(i)) {
+        return InvalidArgument(
+            "Dimension %d of inputs to BatchDot must be equal: shapes %s vs %s",
+            i, ShapeUtil::HumanString(x_shape),
+            ShapeUtil::HumanString(y_shape));
+      }
+      batch_dimension_numbers.push_back(i);
+    }
+
+    int x_inner_dim = ndims - 1;
+    int y_inner_dim = ndims - 2;
+    if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) {
+      return InvalidArgument(
+          "Dimensions %d and %d of arguments to BatchDot must be equal: "
+          "shapes %s vs %s",
+          x_inner_dim, y_inner_dim, ShapeUtil::HumanString(x_shape),
+          ShapeUtil::HumanString(y_shape));
+    }
+
+    // Check for zero lhs/rhs dim size.
+    if (ShapeUtil::IsZeroElementArray(x_shape) ||
+        ShapeUtil::IsZeroElementArray(y_shape)) {
+      std::vector<int64> dimensions(batch_dimension_numbers.size());
+      for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
+        dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]);
+      }
+      int x_outer_dim = ndims - 2;
+      int y_outer_dim = ndims - 1;
+      dimensions.push_back(x_shape.dimensions(x_outer_dim));
+      dimensions.push_back(y_shape.dimensions(y_outer_dim));
+      return Broadcast(
+          ConstantLiteral(builder, LiteralUtil::Zero(x_shape.element_type())),
+          dimensions);
+    }
+
+    PrecisionConfig precision_proto;
+    precision_proto.add_operand_precision(precision);
+    precision_proto.add_operand_precision(precision);
+
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
+    dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
+    for (auto batch_dimension_number : batch_dimension_numbers) {
+      dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
+      dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
+    }
+
+    return DotGeneral(x, y, dot_dnums, &precision_proto);
+  });
+}
+
+XlaOp TransposeInMinorDims(XlaOp x) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    std::vector<int64> permutation(n_dims);
+    std::iota(permutation.begin(), permutation.end(), 0);
+    std::swap(permutation[n_dims - 1], permutation[n_dims - 2]);
+    return Transpose(x, permutation);
+  });
+}
+
+XlaOp MaybeTransposeInMinorDims(XlaOp x, bool transpose) {
+  return transpose ? TransposeInMinorDims(x) : x;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..8856f99c7a0fee8f315aac11fab392cf5536f57b
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/matrix.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// Returns an m x n matrix with 1s on the diagonal elements, zeros everywhere
+// else.
+XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
+
+// Get the diagonals of the last two dimensions. If 'x' has shape
+// [..., M, N], then the output has shape [..., min(M, N)], containing the
+// diagonal elements (i.e., with indices [..., i, i]).
+XlaOp GetMatrixDiagonal(XlaOp x);
+
+// Get the upper or lower triangle part of the last two dimensions
+XlaOp Triangle(XlaOp x, bool lower);
+
+// Get the upper triangle part of the last two dimensions
+XlaOp UpperTriangle(XlaOp x);
+
+// Get the lower triangle part of the last two dimensions
+XlaOp LowerTriangle(XlaOp x);
+
+// Multiplies slices of two tensors in batches.
+
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if transpose_x else r_x
+//     c_o = r_y if transpose_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+xla::XlaOp BatchDot(
+    xla::XlaOp x, xla::XlaOp y,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+
+// Transposes a stack of matrices `x` by swapping the last two dimensions.
+xla::XlaOp TransposeInMinorDims(xla::XlaOp x);
+
+// Transposes `x` in its minor dimensions if `transpose` is true, otherwise
+// returns `x` unchanged.
+xla::XlaOp MaybeTransposeInMinorDims(xla::XlaOp x, bool transpose);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
diff --git a/tensorflow/compiler/xla/client/lib/matrix_test.cc b/tensorflow/compiler/xla/client/lib/matrix_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0593a7517ac125ca8dc5395cee76f6bc23232cd3
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/matrix_test.cc
@@ -0,0 +1,105 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+class MatrixTest : public ClientLibraryTestBase {
+ protected:
+  template <typename T>
+  void TestMatrixDiagonal();
+};
+
+XLA_TEST_F(MatrixTest, Triangle) {
+  XlaBuilder builder(TestName());
+  Array3D<int32> input(2, 3, 4);
+  input.FillIota(0);
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<int32>(input, 0, "a", &builder, &a);
+  LowerTriangle(a);
+  Array3D<int32> expected({{{0, 0, 0, 0}, {4, 5, 0, 0}, {8, 9, 10, 0}},
+                           {{12, 0, 0, 0}, {16, 17, 0, 0}, {20, 21, 22, 0}}});
+
+  ComputeAndCompareR3<int32>(&builder, expected, {a_data.get()});
+}
+
+template <typename T>
+void MatrixTest::TestMatrixDiagonal() {
+  XlaBuilder builder("GetMatrixDiagonal");
+  Array3D<T> input(2, 3, 4);
+  input.FillIota(0);
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<T>(input, 0, "a", &builder, &a);
+  GetMatrixDiagonal(a);
+  Array2D<T> expected({{0, 5, 10}, {12, 17, 22}});
+
+  ComputeAndCompareR2<T>(&builder, expected, {a_data.get()});
+}
+
+XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
+
+XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal<int64>(); }
+
+XLA_TEST_F(MatrixTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal<float>(); }
+
+Array3D<float> BatchedAValsFull() {
+  return {{
+              {2, 0, 1, 2},
+              {3, 6, 0, 1},
+              {4, 7, 9, 0},
+              {5, 8, 10, 11},
+          },
+          {
+              {16, 24, 8, 12},
+              {24, 61, 82, 48},
+              {8, 82, 456, 106},
+              {12, 48, 106, 62},
+          }};
+}
+
+XLA_TEST_F(MatrixTest, RowBatchDot) {
+  XlaBuilder builder(TestName());
+
+  int n = 4;
+
+  XlaOp a, row, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
+                                           "row", &builder, &row);
+  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
+  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
+
+  auto l_index = DynamicSliceInMinorDims(
+      a, {index, ConstantR0<int32>(&builder, 0)}, {1, n});
+  BatchDot(l_index, TransposeInMinorDims(row));
+
+  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
+                             {a_data.get(), row_data.get(), index_data.get()});
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/numeric.cc b/tensorflow/compiler/xla/client/lib/numeric.cc
deleted file mode 100644
index 377654220b5df4487e9e194361473d54ff46a54e..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/numeric.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <numeric>
-#include <vector>
-
-#include "absl/types/span.h"
-#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
-
-namespace xla {
-
-XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
-                     int64 n) {
-  auto a = Iota(builder, type, m);
-  auto b = Iota(builder, type, n);
-  auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0});
-  return ConvertElementType(indicator, type);
-}
-
-XlaOp GetMatrixDiagonal(XlaOp x) {
-  XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_dims >= 2);
-    const int64 m = shape.dimensions(n_dims - 2);
-    const int64 n = shape.dimensions(n_dims - 1);
-    absl::Span<const int64> major_dims =
-        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
-    auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    auto mask = Broadcast(indicator, major_dims);
-
-    // TPUs don't support S64 add reduction at the moment. But fortunately
-    // OR-reductions work just as well for integers.
-    XlaComputation reducer =
-        primitive_util::IsIntegralType(shape.element_type())
-            ? CreateScalarOrComputation(shape.element_type(), builder)
-            : CreateScalarAddComputation(shape.element_type(), builder);
-
-    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
-                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
-  });
-}
-
-XlaOp Triangle(XlaOp x, bool lower) {
-  XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_dims >= 2);
-    const int64 m = shape.dimensions(n_dims - 2);
-    const int64 n = shape.dimensions(n_dims - 1);
-    absl::Span<const int64> major_dims =
-        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
-    xla::XlaOp indicator;
-    if (lower) {
-      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    } else {
-      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    }
-    auto mask = Broadcast(indicator, major_dims);
-
-    return Select(mask, x, Zeros(builder, shape));
-  });
-}
-
-XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
-
-XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/numeric.h b/tensorflow/compiler/xla/client/lib/numeric.h
deleted file mode 100644
index efd8cdc25724198633e0bf1c48c4e7d9e4b4c9e1..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/numeric.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-
-// Returns a rank 1 tensor of `type` containing values [0, 1, 2, ...].
-XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size);
-
-// Returns an m x n matrix with 1s on the diagonal elements, zeros everywhere
-// else.
-XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
-
-// Get the diagonals of the last two dimensions. If 'x' has shape
-// [..., M, N], then the output has shape [..., min(M, N)], containing the
-// diagonal elements (i.e., with indices [..., i, i]).
-XlaOp GetMatrixDiagonal(XlaOp x);
-
-// Get the upper or lower triangle part of the last two dimensions
-XlaOp Triangle(XlaOp x, bool lower);
-
-// Get the upper triangle part of the last two dimensions
-XlaOp UpperTriangle(XlaOp x);
-
-// Get the lower triangle part of the last two dimensions
-XlaOp LowerTriangle(XlaOp x);
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
diff --git a/tensorflow/compiler/xla/client/lib/numeric_test.cc b/tensorflow/compiler/xla/client/lib/numeric_test.cc
deleted file mode 100644
index 7d6aedd49462bd4f075f90d0b0f85c40f1191aa1..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/numeric_test.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-namespace {
-
-class NumericTest : public ClientLibraryTestBase {
- protected:
-  template <typename T>
-  void TestMatrixDiagonal();
-};
-
-XLA_TEST_F(NumericTest, Triangle) {
-  XlaBuilder builder(TestName());
-  Array3D<int32> input(2, 3, 4);
-  input.FillIota(0);
-
-  XlaOp a;
-  auto a_data = CreateR3Parameter<int32>(input, 0, "a", &builder, &a);
-  LowerTriangle(a);
-  Array3D<int32> expected({{{0, 0, 0, 0}, {4, 5, 0, 0}, {8, 9, 10, 0}},
-                           {{12, 0, 0, 0}, {16, 17, 0, 0}, {20, 21, 22, 0}}});
-
-  ComputeAndCompareR3<int32>(&builder, expected, {a_data.get()});
-}
-
-template <typename T>
-void NumericTest::TestMatrixDiagonal() {
-  XlaBuilder builder("GetMatrixDiagonal");
-  Array3D<T> input(2, 3, 4);
-  input.FillIota(0);
-
-  XlaOp a;
-  auto a_data = CreateR3Parameter<T>(input, 0, "a", &builder, &a);
-  GetMatrixDiagonal(a);
-  Array2D<T> expected({{0, 5, 10}, {12, 17, 22}});
-
-  ComputeAndCompareR2<T>(&builder, expected, {a_data.get()});
-}
-
-XLA_TEST_F(NumericTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
-
-XLA_TEST_F(NumericTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal<int64>(); }
-
-XLA_TEST_F(NumericTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal<float>(); }
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc
index 6ef81689489d8117d5951bcb75693c2e3413e4d6..85b9e1827dcef5ed907d893277deb5a52f8f30e9 100644
--- a/tensorflow/compiler/xla/client/lib/prng.cc
+++ b/tensorflow/compiler/xla/client/lib/prng.cc
@@ -15,20 +15,19 @@ limitations under the License.
 
 #include <cmath>
 
+#include "absl/base/casts.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/casts.h"
 
 namespace xla {
 namespace {
 
 // Rotates a 32-bit integer 'v' left by 'distance' bits.
-XlaOp RotateLeftS32(XlaOp v, int distance) {
-  return (v << ConstantR0<int32>(v.builder(), distance)) |
-         ShiftRightLogical(v, ConstantR0<int32>(v.builder(), 32 - distance));
+XlaOp RotateLeftU32(XlaOp v, int distance) {
+  return (v << ConstantR0<uint32>(v.builder(), distance)) |
+         ShiftRightLogical(v, ConstantR0<uint32>(v.builder(), 32 - distance));
 }
 
 using ThreeFry2x32State = std::array<XlaOp, 2>;
@@ -38,13 +37,16 @@ using ThreeFry2x32State = std::array<XlaOp, 2>;
 // http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
 ThreeFry2x32State ThreeFry2x32(ThreeFry2x32State input, ThreeFry2x32State key) {
   XlaBuilder* builder = input[0].builder();
+  key[0] = BitcastConvertType(key[0], U32);
+  key[1] = BitcastConvertType(key[1], U32);
+
   // Rotation distances specified by the Threefry2x32 algorithm.
   constexpr std::array<int, 8> rotations = {13, 15, 26, 6, 17, 29, 16, 24};
   ThreeFry2x32State x;
 
   std::array<XlaOp, 3> ks;
   // 0x1BD11BDA is a parity constant specified by the ThreeFry2x32 algorithm.
-  ks[2] = ConstantR0<int32>(builder, 0x1BD11BDA);
+  ks[2] = ConstantR0<uint32>(builder, 0x1BD11BDA);
   for (int i = 0; i < 2; ++i) {
     ks[i] = key[i];
     x[i] = input[i];
@@ -58,7 +60,7 @@ ThreeFry2x32State ThreeFry2x32(ThreeFry2x32State input, ThreeFry2x32State key) {
   // amount 'rotation'.
   auto round = [](ThreeFry2x32State v, int rotation) {
     v[0] = v[0] + v[1];
-    v[1] = RotateLeftS32(v[1], rotation);
+    v[1] = RotateLeftU32(v[1], rotation);
     v[1] = v[0] ^ v[1];
     return v;
   };
@@ -70,74 +72,83 @@ ThreeFry2x32State ThreeFry2x32(ThreeFry2x32State input, ThreeFry2x32State key) {
   x = round(x, rotations[2]);
   x = round(x, rotations[3]);
   x[0] = x[0] + ks[1];
-  x[1] = x[1] + ks[2] + ConstantR0<int32>(builder, 1);
+  x[1] = x[1] + ks[2] + ConstantR0<uint32>(builder, 1);
 
   x = round(x, rotations[4]);
   x = round(x, rotations[5]);
   x = round(x, rotations[6]);
   x = round(x, rotations[7]);
   x[0] = x[0] + ks[2];
-  x[1] = x[1] + ks[0] + ConstantR0<int32>(builder, 2);
+  x[1] = x[1] + ks[0] + ConstantR0<uint32>(builder, 2);
 
   x = round(x, rotations[0]);
   x = round(x, rotations[1]);
   x = round(x, rotations[2]);
   x = round(x, rotations[3]);
   x[0] = x[0] + ks[0];
-  x[1] = x[1] + ks[1] + ConstantR0<int32>(builder, 3);
+  x[1] = x[1] + ks[1] + ConstantR0<uint32>(builder, 3);
 
   x = round(x, rotations[4]);
   x = round(x, rotations[5]);
   x = round(x, rotations[6]);
   x = round(x, rotations[7]);
   x[0] = x[0] + ks[1];
-  x[1] = x[1] + ks[2] + ConstantR0<int32>(builder, 4);
+  x[1] = x[1] + ks[2] + ConstantR0<uint32>(builder, 4);
 
   x = round(x, rotations[0]);
   x = round(x, rotations[1]);
   x = round(x, rotations[2]);
   x = round(x, rotations[3]);
   x[0] = x[0] + ks[2];
-  x[1] = x[1] + ks[0] + ConstantR0<int32>(builder, 5);
+  x[1] = x[1] + ks[0] + ConstantR0<uint32>(builder, 5);
 
   return x;
 }
 
-}  // namespace
+// Returns the inputs with unique counter values for ThreeFry2x32.
+ThreeFry2x32State GetInputs(const int64 size, XlaBuilder* builder) {
+  ThreeFry2x32State inputs;
+  inputs[0] = Iota(builder, U32, size);
+  inputs[1] = inputs[0] + ConstantR0<uint32>(builder, size);
+  return inputs;
+}
 
-XlaOp StatelessRngUniform(std::array<XlaOp, 2> seeds, const Shape& shape,
-                          XlaOp minval, XlaOp maxval) {
-  XlaBuilder* builder = seeds[0].builder();
-  if (shape.element_type() != F32) {
-    return builder->ReportError(Unimplemented(
-        "Types other than F32 are not implemented by StatelessRngUniform."));
-  }
-  ThreeFry2x32State key = seeds;
+XlaOp StatelessRngUniformU32(std::array<XlaOp, 2> key, const Shape& shape) {
+  XlaBuilder* builder = key[0].builder();
   const int64 size = ShapeUtil::ElementsIn(shape);
-
   const int64 half_size = CeilOfRatio<int64>(size, 2);
   const bool size_is_odd = (half_size * 2 != size);
-
-  // Fill the generator inputs with unique counter values.
-  ThreeFry2x32State inputs;
-  inputs[0] = Iota(builder, S32, half_size);
-  inputs[1] = inputs[0] + ConstantR0<int32>(builder, half_size);
+  ThreeFry2x32State inputs = GetInputs(half_size, builder);
   ThreeFry2x32State outputs = ThreeFry2x32(inputs, key);
-
   if (size_is_odd) {
     outputs[1] = Slice(outputs[1], {0}, {half_size - 1}, {1});
   }
+  auto result = ConcatInDim(builder, outputs, 0);
+  return Reshape(result, AsInt64Slice(shape.dimensions()));
+}
 
-  auto bits = Reshape(ConcatInDim(builder, outputs, 0),
-                      AsInt64Slice(shape.dimensions()));
+XlaOp StatelessRngUniformU64(std::array<XlaOp, 2> key, const Shape& shape) {
+  XlaBuilder* builder = key[0].builder();
+  const int64 size = ShapeUtil::ElementsIn(shape);
+  ThreeFry2x32State inputs = GetInputs(size, builder);
+  ThreeFry2x32State outputs = ThreeFry2x32(inputs, key);
+  // low 32 bit: outputs[0], high 32 bit: outputs[1]
+  auto result = ConvertElementType(outputs[0], U64) |
+                ShiftLeft(ConvertElementType(outputs[1], U64),
+                          ConstantR0WithType(builder, U64, 32));
+  return Reshape(result, AsInt64Slice(shape.dimensions()));
+}
+
+XlaOp StatelessRngUniformF32(XlaOp bits, XlaOp minval, XlaOp maxval) {
+  XlaBuilder* builder = bits.builder();
 
   // Form 23 random mantissa bits, with a leading 1 bit. The leading 1 bit
   // forces the random bits into the mantissa.
   constexpr int kFloatBits = 32;
   constexpr int kMantissaBits = 23;
   bits = ShiftRightLogical(
-             bits, ConstantR0<int32>(builder, kFloatBits - kMantissaBits)) |
-         ConstantR0<int32>(builder, tensorflow::bit_cast<int32>(1.0f));
+             bits, ConstantR0<uint32>(builder, kFloatBits - kMantissaBits)) |
+         ConstantR0<uint32>(builder, absl::bit_cast<uint32>(1.0f));
   auto floats = BitcastConvertType(bits, F32);
 
   // We have a floating point number in the range [1.0, 2.0).
@@ -147,4 +158,47 @@ XlaOp StatelessRngUniform(std::array<XlaOp, 2> seeds, const Shape& shape,
   return floats * (maxval - minval) + minval;
 }
 
+XlaOp StatelessRngUniformInt(XlaOp bits, XlaOp minval, XlaOp maxval,
+                             PrimitiveType type, PrimitiveType unsigned_type) {
+  XlaBuilder* builder = bits.builder();
+  // TODO(b/72573764): Generate real uniform integer distribution.
+  // The following algorithm is the same one that TF uses right now, but it's
+  // uniform only when maxval - minval is a divisor of the range that bits is
+  // generated from.
+  auto range = BitcastConvertType(maxval, unsigned_type) -
+               BitcastConvertType(minval, unsigned_type);
+  auto dist = Rem(bits, range);
+  auto dist_div_2 =
+      ShiftRightLogical(dist, ConstantR0WithType(builder, unsigned_type, 1));
+
+  return minval + BitcastConvertType(dist_div_2, type) +
+         BitcastConvertType(dist - dist_div_2, type);
+}
+
+}  // namespace
+
+XlaOp StatelessRngUniform(std::array<XlaOp, 2> seeds, const Shape& shape,
+                          XlaOp minval, XlaOp maxval) {
+  XlaBuilder* builder = seeds[0].builder();
+  PrimitiveType type = shape.element_type();
+  switch (type) {
+    case F32: {
+      auto bits = StatelessRngUniformU32(seeds, shape);
+      return StatelessRngUniformF32(bits, minval, maxval);
+    }
+    case S32: {
+      auto bits = StatelessRngUniformU32(seeds, shape);
+      return StatelessRngUniformInt(bits, minval, maxval, type, U32);
+    }
+    case S64: {
+      auto bits = StatelessRngUniformU64(seeds, shape);
+      return StatelessRngUniformInt(bits, minval, maxval, type, U64);
+    }
+    default:
+      return builder->ReportError(Unimplemented(
+          "Types other than F32, S32 and S64 are not implemented by "
+          "StatelessRngUniform."));
+  }
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/prng.h b/tensorflow/compiler/xla/client/lib/prng.h
index ad000b1fa1d0655c8fccc0bb33379f2499b77f26..2603818de26888566a533334e49b039b126db66e 100644
--- a/tensorflow/compiler/xla/client/lib/prng.h
+++ b/tensorflow/compiler/xla/client/lib/prng.h
@@ -25,7 +25,7 @@ namespace xla {
 
 // Returns a tensor containing 'shape' random values uniformly distributed in
 // the range [minval, maxval). Requires 2 32-bit integer seeds.
-// Currently only 'shape's of type F32 are implemented.
+// Currently only 'shape's of type F32, S32 and S64 are implemented.
 XlaOp StatelessRngUniform(std::array<XlaOp, 2> seeds, const Shape& shape,
                           XlaOp minval, XlaOp maxval);
 
diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8c7df3ff5189c817202eaf39adb572f7e232ec2
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/slicing.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+
+namespace xla {
+
+XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
+                       absl::Span<const int64> end) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RET_CHECK(start.size() == end.size());
+    int64 n_minor_dims = start.size();
+
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_minor_dims <= n_dims);
+    auto major_dims = AsInt64Slice(shape.dimensions())
+                          .subspan(
+                              /*pos=*/0,
+                              /*len=*/n_dims - n_minor_dims);
+
+    // Prepends 0s in the major dim
+    std::vector<int64> padded_start(n_dims, 0);
+    std::copy(start.begin(), start.end(),
+              padded_start.begin() + major_dims.size());
+
+    // Prepends the shape of the major dims.
+    std::vector<int64> padded_end(n_dims);
+    std::copy(major_dims.begin(), major_dims.end(), padded_end.begin());
+    std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size());
+
+    std::vector<int64> strides(n_dims, 1);
+    return Slice(x, padded_start, padded_end, strides);
+  });
+}
+
+XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span<const int64> start) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
+    std::vector<int32> start_as_int32(start.begin(), start.end());
+    auto start_constant = ConstantR1<int32>(builder, start_as_int32);
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_ASSIGN_OR_RETURN(Shape start_constant_shape,
+                        builder->GetShape(start_constant));
+    const int64 start_length =
+        ShapeUtil::GetDimension(start_constant_shape, -1);
+    TF_RET_CHECK(start_length == n_dims);
+    return DynamicUpdateSlice(x, update, start_constant);
+  });
+}
+
+XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                             absl::Span<const int64> start) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_minor_dims = start.size();
+    TF_RET_CHECK(n_minor_dims <= n_dims);
+    std::vector<int64> padded_start(n_dims, 0);
+    std::copy(start.begin(), start.end(),
+              padded_start.begin() + (n_dims - n_minor_dims));
+    return UpdateSlice(x, update, padded_start);
+  });
+}
+
+namespace {
+
+std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
+                                 absl::Span<const int64> ys) {
+  std::vector<int64> output(xs.size() + ys.size());
+  std::copy(xs.begin(), xs.end(), output.begin());
+  std::copy(ys.begin(), ys.end(), output.begin() + xs.size());
+  return output;
+}
+
+XlaOp PrependZerosInMajorDims(XlaOp x, absl::Span<const XlaOp> starts) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    auto zero = Reshape(ConstantR0<int32>(builder, 0), {1});
+    std::vector<XlaOp> padded_starts(n_dims, zero);
+    for (int i = 0; i < starts.size(); ++i) {
+      padded_starts[n_dims - starts.size() + i] = Reshape(starts[i], {1});
+    }
+    return ConcatInDim(builder, padded_starts, 0);
+  });
+}
+
+}  // namespace
+
+XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
+                              absl::Span<const int64> sizes) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    int64 n_minor_dims = starts.size();
+    TF_RET_CHECK(n_minor_dims == sizes.size());
+    TF_RET_CHECK(n_minor_dims <= n_dims);
+    auto major_dims = AsInt64Slice(shape.dimensions())
+                          .subspan(
+                              /*pos=*/0,
+                              /*len=*/n_dims - sizes.size());
+    auto padded_starts = PrependZerosInMajorDims(x, starts);
+    auto padded_sizes = ConcatVectors(major_dims, sizes);
+    return DynamicSlice(x, padded_starts, padded_sizes);
+  });
+}
+
+XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                                    absl::Span<const XlaOp> starts) {
+  auto padded_starts = PrependZerosInMajorDims(x, starts);
+  return DynamicUpdateSlice(x, update, padded_starts);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c482a38b5489c9fb17c3dca9ee3d2a1b8fd1890
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/slicing.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/types.h"
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
+
+namespace xla {
+
+// Updates a slice of 'x', i.e.,
+// x[start[0], ..., start[n]] = update
+XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span<const int64> start);
+
+// Performs a slice in the minor dimensions of a tensor.
+// x[..., start[0]:end[0], ..., start[n]:end[n]]
+XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
+                       absl::Span<const int64> end);
+
+// Updates a slice of 'x', where 'start' contains a list of minor dimensions:
+// x[..., start[0]:..., ..., start[n]:...] = update
+XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                             absl::Span<const int64> start);
+
+// Performs a dynamic slice in the minor dimensions of a tensor.
+XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
+                              absl::Span<const int64> sizes);
+
+XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                                    absl::Span<const XlaOp> starts);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
diff --git a/tensorflow/compiler/xla/client/lib/slicing_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d362119e01006555db0f82d02626175936e1d05
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc
@@ -0,0 +1,106 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+namespace {
+
+using SlicingTest = xla::ClientLibraryTestBase;
+
+xla::Array2D<float> BValsRight() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+xla::Array2D<float> BValsLeft() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+xla::Array2D<float> AValsFull() {
+  return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}};
+}
+
+xla::Array3D<float> BatchedAValsFull() {
+  return {{
+              {2, 0, 1, 2},
+              {3, 6, 0, 1},
+              {4, 7, 9, 0},
+              {5, 8, 10, 11},
+          },
+          {
+              {16, 24, 8, 12},
+              {24, 61, 82, 48},
+              {8, 82, 456, 106},
+              {12, 48, 106, 62},
+          }};
+}
+
+XLA_TEST_F(SlicingTest, Simple2dLookup) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::XlaOp a, x, y;
+  auto a_data = CreateR2Parameter<float>(BValsRight(), 0, "a", &builder, &a);
+  auto x_data = CreateR0Parameter<int>(2, 1, "x", &builder, &x);
+  auto y_data = CreateR0Parameter<int>(1, 2, "y", &builder, &y);
+  DynamicSliceInMinorDims(a, {x, y}, {1, 1});
+
+  ComputeAndCompareR2<float>(&builder, {{10}},
+                             {a_data.get(), x_data.get(), y_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(SlicingTest, Simple3dLookup) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::XlaOp a, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto index_data = CreateR0Parameter<int>(1, 1, "index", &builder, &index);
+
+  DynamicSliceInMinorDims(a, {index, xla::ConstantR0<int32>(&builder, 0)},
+                          {1, 4});
+
+  ComputeAndCompareR3<float>(&builder, {{{3, 6, 0, 1}}, {{24, 61, 82, 48}}},
+                             {a_data.get(), index_data.get()});
+}
+
+XLA_TEST_F(SlicingTest, SimpleSliceUpdate) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::XlaOp a, b, x, y;
+  auto a_data = CreateR2Parameter<float>(AValsFull(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>({{9, 1, -10}}, 1, "b", &builder, &b);
+  auto x_data = CreateR0Parameter<int>(2, 2, "x", &builder, &x);
+  auto y_data = CreateR0Parameter<int>(1, 3, "y", &builder, &y);
+
+  DynamicUpdateSliceInMinorDims(a, b, {x, y});
+
+  xla::Array2D<float> expected(
+      {{{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 9, 1, -10}, {5, 8, 10, 11}}});
+
+  ComputeAndCompareR2<float>(
+      &builder, expected,
+      {a_data.get(), b_data.get(), x_data.get(), y_data.get()});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc
index a904be259a3870a679b2c4699ec01e2a11b1ce46..e8553a08bb014e790822a14e128686b60b8d6b7c 100644
--- a/tensorflow/compiler/xla/client/lib/sorting.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
 
@@ -23,13 +25,12 @@ XlaOp TopK(XlaOp input, int64 k) {
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
     int last_dim = input_shape.dimensions_size() - 1;
-    int last_dim_size = input_shape.dimensions(last_dim);
 
-    XlaOp iota_s32 = Iota(builder, S32, last_dim_size);
+    Shape iota_shape =
+        ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions()));
+    XlaOp iota_s32 = Iota(builder, iota_shape, last_dim);
     auto input_dims = input_shape.dimensions();
-    std::vector<int64> broadcast_dims(input_dims.begin(), input_dims.end() - 1);
-    XlaOp broadcast_s32 = Broadcast(iota_s32, broadcast_dims);
-    XlaOp sort_result = Sort(Neg(input), broadcast_s32);
+    XlaOp sort_result = Sort(Neg(input), {iota_s32});
     std::vector<int64> start_indices(input_shape.dimensions_size(), 0);
     std::vector<int64> limit_indices(input_dims.begin(), input_dims.end());
     limit_indices[last_dim] = k;
diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc
index fef98c9923096e21a755c6d730de2c7c10852b2d..27ff36c7491ab8397d46f3a49493ff2b904deb2d 100644
--- a/tensorflow/compiler/xla/client/lib/sorting_test.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
+
+#include <limits>
+
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -41,6 +44,28 @@ XLA_TEST_F(SortingTest, TopK3From8Indices) {
   ComputeAndCompareR1<int>(&builder, {0, 1, 2}, {});
 }
 
+// TODO(b/119930279): enable this test.
+XLA_TEST_F(SortingTest, DISABLED_TopKFullSortMinInt) {
+  XlaBuilder builder(TestName());
+  auto x_rev = ConstantR1<int>(&builder, {std::numeric_limits<int>::min(),
+                                          std::numeric_limits<int>::min() + 1,
+                                          std::numeric_limits<int>::max()});
+  xla::GetTupleElement(xla::TopK(x_rev, 3), 1);
+  ComputeAndCompareR1<int>(&builder, {2, 1, 0}, {});
+}
+
+XLA_TEST_F(SortingTest, NOT_TopKFullSortMinInt) {
+  XlaBuilder builder(TestName());
+  auto x_rev = ConstantR1<int>(&builder, {std::numeric_limits<int>::min(),
+                                          std::numeric_limits<int>::min() + 1,
+                                          std::numeric_limits<int>::max()});
+  xla::GetTupleElement(xla::TopK(x_rev, 3), 1);
+  // TopK currently negates the keys, which doesn't work correctly for
+  // std::numeric_limits<int>::min(). Therefore, it will sort this key to the
+  // front instead of to the back.
+  ComputeAndCompareR1<int>(&builder, {0, 2, 1}, {});
+}
+
 XLA_TEST_F(SortingTest, TopKFullSort) {
   XlaBuilder builder(TestName());
   const int kSize = 16;
@@ -56,5 +81,13 @@ XLA_TEST_F(SortingTest, TopKFullSort) {
   ComputeAndCompareR1<float>(&builder, inputs, {});
 }
 
+XLA_TEST_F(SortingTest, TopKFullSortWithDuplicates) {
+  XlaBuilder builder(TestName());
+  XlaOp a;
+  auto a_data = CreateR1Parameter<int>({1, 1, 2, 2, 1}, 0, "a", &builder, &a);
+  xla::GetTupleElement(xla::TopK(a, 5), 1);
+  ComputeAndCompareR1<int>(&builder, {2, 3, 0, 1, 4}, {a_data.get()});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index ff0ec76a7f9b62fce0f14beae688cb0dd74847a1..a95bbf2c8c860914877d3195b97342097dafc725 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -66,7 +66,7 @@ std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
   XlaComputation computation = b.Build().ConsumeValueOrDie();
 
   auto execution_options = CreateDefaultExecutionOptions();
-  *execution_options.mutable_shape_with_output_layout() = shape;
+  *execution_options.mutable_shape_with_output_layout() = shape.ToProto();
   return client->Execute(computation, /*arguments=*/{}, &execution_options)
       .ConsumeValueOrDie();
 }
@@ -93,13 +93,13 @@ std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
 
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
     const XlaComputation& computation, Client* client) {
-  CHECK(computation.proto().has_program_shape())
+  CHECK(computation.proto().has_host_program_shape())
       << "Computation should have progran shape.";
-  auto program_shape = computation.proto().program_shape();
+  auto program_shape = computation.proto().host_program_shape();
 
   std::vector<std::unique_ptr<GlobalData>> results;
-  for (const Shape& shape : program_shape.parameters()) {
-    results.push_back(MakeFakeDataOrDie(shape, client));
+  for (const ShapeProto& shape : program_shape.parameters()) {
+    results.push_back(MakeFakeDataOrDie(Shape(shape), client));
   }
   return results;
 }
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve.cc b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5a1d34cc66e6f8c1a832f8a8437163b846a5431
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
@@ -0,0 +1,412 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/math/math_util.h"
+
+namespace xla {
+
+// Get the diagonal blocks of the coefficient matrix
+XlaOp DiagonalBlocks(XlaOp a, int64 block_size) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(a));
+    int ndims = ShapeUtil::Rank(shape);
+    int64 n = ShapeUtil::GetDimension(shape, -1);
+    int64 num_blocks = n / block_size;
+
+    XlaOp diag_blocks;
+
+    // If the coefficient matrix is exactly the block size, we just add a
+    // singleton dimension i.e. [..., n, n] -> [..., 1, n, n]
+    if (n == block_size) {
+      std::vector<int64> permutation(ndims);
+      std::iota(permutation.begin(), permutation.end(), 1);
+      permutation.insert(permutation.end() - 2, 0);
+      return Transpose(Broadcast(a, /*broadcast_sizes=*/{1}), permutation);
+    }
+
+    // We can grab entire blocks using gather
+    if (n > block_size) {
+      // Construct the starting indices of the diagonal blocks
+      auto start_indices =
+          Transpose(Broadcast(Mul(Iota(builder, S32, num_blocks),
+                                  ConstantR0<int32>(builder, block_size)),
+                              /*broadcast_sizes=*/{2}),
+                    /*permutation=*/{1, 0});
+
+      // Gather the diagonal blocks
+      GatherDimensionNumbers dim_numbers;
+      dim_numbers.add_offset_dims(ndims - 1);
+      dim_numbers.add_offset_dims(ndims);
+      dim_numbers.add_start_index_map(ndims - 2);
+      dim_numbers.add_start_index_map(ndims - 1);
+      dim_numbers.set_index_vector_dim(1);
+      diag_blocks = Gather(a, start_indices, dim_numbers,
+                           /*slice_sizes=*/{block_size, block_size});
+    }
+
+    // The last block might be smaller than the block size,
+    // so we will need to pad it
+    if (n % block_size != 0) {
+      // Pad with zeros
+      auto last_blocks =
+          SliceInMinorDims(a, {n - n % block_size, n - n % block_size}, {n, n});
+      PaddingConfig config = MakeNoPaddingConfig(ndims);
+      int64 padding = block_size - n % block_size;
+      config.mutable_dimensions(ndims - 1)->set_edge_padding_high(padding);
+      config.mutable_dimensions(ndims - 2)->set_edge_padding_high(padding);
+      last_blocks =
+          Pad(last_blocks, Zero(builder, shape.element_type()), config);
+
+      // Add a singleton dimension
+      // i.e. [..., block_size, block_size] -> [..., 1, block_size, block_size]
+      TF_ASSIGN_OR_RETURN(Shape blocks_shape, builder->GetShape(last_blocks));
+      auto shape_dims = AsInt64Slice(blocks_shape.dimensions());
+      auto last_blocks_dims = std::vector<int64>(ndims);
+      std::copy(shape_dims.begin(), shape_dims.end(), last_blocks_dims.begin());
+      last_blocks_dims.insert(last_blocks_dims.end() - 2, 1);
+      last_blocks = Reshape(last_blocks, last_blocks_dims);
+
+      // Concatenate with the other blocks if necessary
+      if (n > block_size) {
+        diag_blocks =
+            ConcatInDim(builder, {diag_blocks, last_blocks}, ndims - 2);
+      } else {
+        diag_blocks = last_blocks;
+      }
+    }
+
+    return diag_blocks;
+  });
+}
+
+XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
+                           bool conjugate_a,
+                           PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = diag_blocks.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // Input is a batch of square lower triangular square matrices. Its shape is
+    // (..., size, size). We resize this to (num_blocks, size, size).
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(diag_blocks));
+    int64 block_size = ShapeUtil::GetDimension(shape, -1);
+    int64 num_blocks = ShapeUtil::ElementsIn(shape) /
+                       tensorflow::MathUtil::IPow(block_size, 2);
+    diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size});
+
+    // The input must be triangular because we rely on that when doing
+    // multiplications later on
+    diag_blocks = Triangle(diag_blocks, /*lower=*/lower);
+
+    // Rescale blocks to be unit triangular, but avoid dividing by
+    // zero (which can happen if the last block was padded) otherwise it will
+    // introduce nans which will propagate
+    auto diags = GetMatrixDiagonal(diag_blocks);
+    TF_ASSIGN_OR_RETURN(Shape diags_shape, builder->GetShape(diags));
+    auto one = ScalarLike(diags, 1);
+    auto ones = Broadcast(one, AsInt64Slice(diags_shape.dimensions()));
+    diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags);
+    auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2});
+
+    // We can now use the fact that for an upper triangular matrix
+    // [[L11, 0], [L21, L22]], given the inverses L11' and L22', we have
+    // L22' = -L22' * L21 * L11'. In our case, L21 is a vector and our blocks
+    // have been rescaled to be unit triangular, so L22 = L22' = 1.
+
+    // Initialize the output matrix with -1s on the diagonal. We use -1 instead
+    // of 1 because we cannot do matrix-vector multiplies with variable shapes
+    // inside of a loop, or do irregularly shaped in-place updates. Hence,
+    // L21 <- -L22 * L21 * L11 cannot be done naively. Instead, we update the
+    // entire row i.e. we calculate
+    // [L21 L22 0] <- -[L21 L22 0] @ diag_blocks([L11', -I, -I])
+    // which means [L21 L22 0] <- [-L21 * L11', L22, 0].
+    auto identity =
+        IdentityMatrix(builder, shape.element_type(), block_size, block_size);
+    auto neg_identity = -identity;
+
+    // The first or last  diagonal element should be set to 1 instead of -1
+    // though, since we never update it
+    auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1});
+    auto start_index = (lower) ? 0 : block_size - 1;
+    auto output_block = DynamicUpdateSlice(
+        neg_identity, pos_one,
+        /*start_indices=*/ConstantR1<int>(builder, 2, start_index));
+
+    // Broadcast diag([1, -1, -1, ...]) to every block
+    XlaOp output = Broadcast(output_block,
+                             /*broadcast_sizes=*/{num_blocks});
+
+    // Now we construct a loop that performs matrix-vector multiplications
+    // inverting the blocks one row at a time
+    std::vector<Shape> tuple_shapes = {
+        // The loop iteration counter is a scalar, incremented each iteration.
+        ShapeUtil::MakeShape(S32, {}),
+        // The output has the shape of A, with one row updated each iteration.
+        ShapeUtil::MakeShape(shape.element_type(),
+                             {num_blocks, block_size, block_size}),
+        // The input is a loop invariant.
+        ShapeUtil::MakeShape(shape.element_type(),
+                             {num_blocks, block_size, block_size})};
+    Shape tuple_shape = ShapeUtil::MakeTupleShape(tuple_shapes);
+
+    auto init_i = One(builder, S32);
+    auto init = Tuple(builder, {init_i, output, scaled_diag_blocks});
+
+    // Construct the loop condition function.
+    std::unique_ptr<XlaBuilder> condb =
+        builder->CreateSubBuilder("InvertDiagCond");
+    {
+      auto i = GetTupleElement(
+          Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0);
+      Lt(i, ConstantR0<int32>(condb.get(), block_size));
+    }
+    TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
+
+    // Construct the loop body function.
+    std::unique_ptr<XlaBuilder> bodyb =
+        builder->CreateSubBuilder("InvertDiagBody");
+    {
+      auto input_tuple =
+          Parameter(bodyb.get(), 0, tuple_shape, "InvertDiagBodyTuple");
+
+      auto i = GetTupleElement(input_tuple, 0);
+      auto body_out = GetTupleElement(input_tuple, 1);
+      auto body_input = GetTupleElement(input_tuple, 2);
+
+      auto zero = ConstantR1<int32>(bodyb.get(), 1, 0);
+      auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i;
+      auto start_indices =
+          ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
+      auto input_row =
+          DynamicSlice(body_input, start_indices,
+                       /*slice_sizes=*/{num_blocks, 1, block_size});
+
+      // We want -L21 L11^{-1}
+      DotDimensionNumbers dnums;
+      dnums.add_lhs_batch_dimensions(0);
+      dnums.add_rhs_batch_dimensions(0);
+      dnums.add_lhs_contracting_dimensions(2);
+      dnums.add_rhs_contracting_dimensions(1);
+      PrecisionConfig precision_proto;
+      precision_proto.add_operand_precision(precision);
+      precision_proto.add_operand_precision(precision);
+      auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto);
+
+      body_out = DynamicUpdateSlice(body_out, update, start_indices);
+
+      auto next_i = i + ScalarLike(i, 1);
+      Tuple(bodyb.get(), {next_i, body_out, body_input});
+    }
+    TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
+
+    // Construct the While loop and return the result,
+    // return while_loop(cond_fun, body_fun, init)[1]
+    auto invert_while = While(cond, body, init);
+    auto inv_diag_blocks = GetTupleElement(invert_while, 1);
+
+    // Undo the scaling
+    inv_diag_blocks = Div(inv_diag_blocks, diags,
+                          /*broadcast_dimensions=*/{0, 1});
+
+    // Reshape back to original batch major dimensions
+    return Reshape(inv_diag_blocks, AsInt64Slice(shape.dimensions()));
+  });
+}
+
+XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks,
+                                      bool left_side, bool lower,
+                                      bool transpose_a, bool conjugate_a,
+                                      PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape blocks_shape, builder->GetShape(inv_diag_blocks));
+    TF_ASSIGN_OR_RETURN(Shape b_shape, builder->GetShape(b));
+    int64 block_size = ShapeUtil::GetDimension(blocks_shape, -1);
+
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    int64 ndims = ShapeUtil::Rank(a_shape);
+    int64 n = ShapeUtil::GetDimension(a_shape, -1);
+    int64 num_blocks = n / block_size + (n % block_size != 0);
+    int64 m_dim = (left_side) ? -1 : -2;
+    int64 m = ShapeUtil::GetDimension(b_shape, m_dim);
+
+    // Initialize the solution
+    auto x = ZerosLike(b);
+
+    // This loop is unrolled for performance reasons, but it could be expressed
+    // rolled as well since the matrices are of the same size each iteration
+    for (int i = 0; i < num_blocks; i++) {
+      // High-level intuition: We have B[i] = L[i] @ X. Since L is upper
+      // triangular this means B[i] = L[i, :i + 1] @ X[:i + 1]. We can split
+      // this into two parts: B[i] = L[i, :i] @ X[:i] + L[i, i] @ X[i] which
+      // can be solved for X[i] as X[i] = inv(L[i, i]) @ B[i] - L[i, :i] @ X[:i]
+
+      // Decide whether we go from first block to last or vice versa
+      auto j = (left_side ^ lower ^ transpose_a) ? num_blocks - 1 - i : i;
+
+      // Get the size of the inverse blocks (the last one might be smaller)
+      int64 block = (n % block_size != 0 && j + 1 == num_blocks)
+                        ? n % block_size
+                        : block_size;
+      auto inv_block =
+          MaybeConjugate(Collapse(SliceInMinorDims(inv_diag_blocks, {j, 0, 0},
+                                                   {j + 1, block, block}),
+                                  /*dimensions=*/{ndims - 2, ndims - 1}),
+                         conjugate_a);
+
+      // Get the corresponding row of B
+      int64 k = std::min((j + 1) * block_size, n);
+      std::vector<int64> start = {j * block_size, 0};
+      std::vector<int64> end = {k, m};
+      if (!left_side) {
+        std::swap(start[0], start[1]);
+        std::swap(end[0], end[1]);
+      }
+      auto b_row = SliceInMinorDims(b, start, end);
+
+      XlaOp remainder;
+      if (i == 0) {
+        remainder = b_row;
+      } else {
+        // This matrix multiply involves a lot of multiplying with zero (namely,
+        // X[i * block_size:] = 0), but this is faster than slicing...
+        end = {k, n};
+        if (!left_side) {
+          std::swap(end[0], end[1]);
+        }
+        if (transpose_a) {
+          std::swap(start[0], start[1]);
+          std::swap(end[0], end[1]);
+        }
+        auto a_row =
+            MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a);
+        if (left_side) {
+          remainder =
+              b_row - BatchDot(MaybeTransposeInMinorDims(a_row, transpose_a), x,
+                               precision);
+        } else {
+          remainder =
+              b_row - BatchDot(x, MaybeTransposeInMinorDims(a_row, transpose_a),
+                               precision);
+        }
+      }
+
+      XlaOp x_update;
+      auto zero = Zero(builder, S32);
+      auto start_index = ConstantR0WithType(builder, S32, j * block_size);
+      std::vector<XlaOp> update_starts = {start_index, zero};
+      if (left_side) {
+        x_update = BatchDot(MaybeTransposeInMinorDims(inv_block, transpose_a),
+                            remainder, precision);
+      } else {
+        x_update = BatchDot(remainder,
+                            MaybeTransposeInMinorDims(inv_block, transpose_a),
+                            precision);
+        std::swap(update_starts[0], update_starts[1]);
+      }
+      x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts);
+    }
+
+    return x;
+  });
+}
+
+XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                      bool transpose_a, bool conjugate_a, int64 block_size,
+                      PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    TF_ASSIGN_OR_RETURN(Shape b_shape, builder->GetShape(b));
+    if (ShapeUtil::Rank(a_shape) != ShapeUtil::Rank(b_shape)) {
+      return InvalidArgument(
+          "Arguments to TriangularSolve have shapes with different ranks: "
+          "%s vs. %s",
+          ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
+    }
+    const int64 ndims = ShapeUtil::Rank(a_shape);
+    if (ndims < 2) {
+      return InvalidArgument(
+          "Arguments to TriangularSolve was rank %d but must have rank >= 2.",
+          ndims);
+    }
+    // The batch dimensions must be equal.
+    std::vector<int64> batch_dimensions;
+    for (int i = 0; i < ndims - 2; ++i) {
+      int64 a_size = a_shape.dimensions(i);
+      int64 b_size = b_shape.dimensions(i);
+      if (a_size != b_size) {
+        return InvalidArgument(
+            "Batch dimensions of arguments to TriangularSolve must be equal; "
+            "shapes were %s and %s.",
+            ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
+      }
+      batch_dimensions.push_back(a_size);
+    }
+
+    if (ShapeUtil::GetDimension(a_shape, -1) !=
+        ShapeUtil::GetDimension(a_shape, -2)) {
+      return InvalidArgument(
+          "The 'a' argument to TriangularSolve must be a batched square matrix;"
+          " shape was: %s",
+          ShapeUtil::HumanString(a_shape));
+    }
+    const int64 m = ShapeUtil::GetDimension(b_shape, -2);
+    const int64 n = ShapeUtil::GetDimension(b_shape, -1);
+    if ((left_side ? m : n) != ShapeUtil::GetDimension(a_shape, -1)) {
+      return InvalidArgument(
+          "Arguments to TriangularSolve have incompatible matrix shapes %s and "
+          "%s",
+          ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
+    }
+
+    if (block_size < 1) {
+      return InvalidArgument(
+          "block_size argument to TriangularSolve must be >= 1; got %d",
+          block_size);
+    }
+
+    // We find the diagonal blocks of the coefficient matrix
+    auto diag_blocks = DiagonalBlocks(a, block_size);
+
+    // We invert these blocks in parallel using batched matrix-vector products
+    auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, transpose_a,
+                                                conjugate_a, precision);
+
+    // We now find the solution using GEMMs
+    auto x =
+        SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side, lower,
+                                        transpose_a, conjugate_a, precision);
+
+    return x;
+  });
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/xla/client/lib/triangular_solve.h
similarity index 88%
rename from tensorflow/compiler/tf2xla/lib/triangular_solve.h
rename to tensorflow/compiler/xla/client/lib/triangular_solve.h
index 2303234f361e54cd2a0ad495cb03b371bed76877..50a3b30ebd1c15eb6d2ace4e351cb41f21db7093 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Solves systems of linear equations with lower or upper triangular coefficient
 // matrices by forward- or back-substitution. Broadcasting along leading
@@ -57,11 +57,11 @@ namespace tensorflow {
 //
 // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
 // blocking is used.
-xla::XlaOp TriangularSolve(
-    xla::XlaOp a, xla::XlaOp b, bool left_side, bool lower, bool transpose_a,
+XlaOp TriangularSolve(
+    XlaOp a, XlaOp b, bool left_side, bool lower, bool transpose_a,
     bool conjugate_a, int64 block_size = 128,
-    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::HIGHEST);
+    PrecisionConfig::Precision precision = PrecisionConfig::HIGHEST);
 
-}  // namespace tensorflow
+}  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
similarity index 99%
rename from tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
rename to tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
index aeebf16028d40189203cdfd815f06a339ee72902..f6a70d64a788d95a456774ccbbcf67f2e5cac98b 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 
 #include <memory>
 #include <numeric>
@@ -30,7 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
-namespace tensorflow {
+namespace xla {
 namespace {
 
 using TriangularSolveTest = xla::ClientLibraryTestBase;
@@ -330,4 +330,4 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
 }
 
 }  // namespace
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index f96b6c9c261a9686fb647e3da0dcc933cd1f70df..049cd15738a619294b19d5cf74ca514d7b4a00ad 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -71,9 +71,9 @@ Status LocalExecutable::ValidateExecutionOptions(
           "parameter "
           "%d: want %s, got %s",
           i,
-          ShapeUtil::HumanString(
+          ShapeUtil::HumanStringWithLayout(
               computation_layout.parameter_layout(i).shape()),
-          ShapeUtil::HumanString(arguments[i]->on_host_shape()));
+          ShapeUtil::HumanStringWithLayout(arguments[i]->on_host_shape()));
     }
   }
 
@@ -310,4 +310,28 @@ StatusOr<int> LocalClient::ReplicaNumberToDeviceOrdinal(int replica_number) {
   return local_service_->ReplicaNumberToDeviceOrdinal(replica_number);
 }
 
+StatusOr<TransferToServerResponse> LocalClient::TransferToLocalServer(
+    const ::xla::BorrowingLiteral& literal, int device_oridinal) {
+  const ::xla::Shape& shape = literal.shape();
+
+  TF_ASSIGN_OR_RETURN(
+      ::xla::ScopedShapedBuffer shaped_buffer,
+      backend().transfer_manager()->AllocateScopedShapedBuffer(
+          shape, backend().memory_allocator(), device_oridinal));
+  TF_ASSIGN_OR_RETURN(auto stream,
+                      mutable_backend()->BorrowStream(device_oridinal));
+  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+      stream.get(), literal, shaped_buffer));
+  std::vector<::xla::ScopedShapedBuffer> replicated_buffer;
+  replicated_buffer.emplace_back(std::move(shaped_buffer));
+  ::xla::TransferToServerResponse result;
+  TF_ASSIGN_OR_RETURN(*result.mutable_data(),
+                      local_service_->RegisterReplicatedBuffers(
+                          std::move(replicated_buffer),
+                          absl::StrCat("TransferToServer literal of shape ",
+                                       ::xla::ShapeUtil::HumanString(shape))));
+
+  return result;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index feb2f8ec9dab5bf13afdc866d10ccbe74f8edcb9..ddb36680e8b185b053368baffa6f1d5cac50dc07 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -60,8 +60,8 @@ class LocalExecutable {
   // Validates that the given arguments and options satisfy various constraints
   // of the computation.
   //
-  // The given ExecutableRunOptions override any values from legacy_flags
-  // (TF_XLA_FLAGS environment variable).
+  // The given ExecutableRunOptions override any values from TF_XLA_FLAGS
+  // environment variable.
   Status ValidateExecutionOptions(
       const absl::Span<const ShapedBuffer* const> arguments,
       const ExecutableRunOptions& run_options, const Backend& backend);
@@ -69,8 +69,8 @@ class LocalExecutable {
   // Records the computation in a SessionModule proto with the arguments used to
   // invoke it, and the result. Enabled by flag: --tla_dump_executions_to.
   //
-  // The given ServiceExecutableRunOptions override any values from legacy_flags
-  // (TF_XLA_FLAGS environment variable).
+  // The given ServiceExecutableRunOptions override any values from TF_XLA_FLAGS
+  // environment variable.
   StatusOr<ScopedShapedBuffer> ExecuteAndDump(
       const ServiceExecutableRunOptions* run_options,
       const absl::Span<const ShapedBuffer* const> arguments);
@@ -114,8 +114,8 @@ class LocalClient : public Client {
   // Build and return a LocalExecutable object. The executable is compiled using
   // the given XlaComputation, argument layouts and options.
   //
-  // The given ExecutableBuildOptions override any values from legacy_flags
-  // (TF_XLA_FLAGS environment variable).
+  // The given ExecutableBuildOptions override any values from TF_XLA_FLAGS
+  // environment variable.
   StatusOr<std::unique_ptr<LocalExecutable>> Compile(
       const XlaComputation& computation,
       const absl::Span<const Shape* const> argument_layouts,
@@ -129,6 +129,10 @@ class LocalClient : public Client {
       const Literal& literal, int device_ordinal,
       DeviceMemoryAllocator* allocator = nullptr);
 
+  // Transfer the BorrowingLiteral to the device with the given ordinal.
+  StatusOr<TransferToServerResponse> TransferToLocalServer(
+      const ::xla::BorrowingLiteral& literal, int device_oridinal);
+
   // Copy the data from the device contained in the given ShapedBuffer and
   // return as a Literal.
   StatusOr<Literal> ShapedBufferToLiteral(const ShapedBuffer& shaped_buffer);
diff --git a/tensorflow/compiler/xla/client/sharding_builder.cc b/tensorflow/compiler/xla/client/sharding_builder.cc
index 176802b33ef824a1f898255a19e44def3c1fc982..fb9ea6ec3fc41d5e04ca125798a8199350470a44 100644
--- a/tensorflow/compiler/xla/client/sharding_builder.cc
+++ b/tensorflow/compiler/xla/client/sharding_builder.cc
@@ -36,7 +36,7 @@ OpSharding Tile(const Shape& tile_shape,
                 const TileAssignment& tile_assignment) {
   OpSharding result;
   result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
-  *result.mutable_tile_shape() = tile_shape;
+  *result.mutable_tile_shape() = tile_shape.ToProto();
   for (int64 dim : tile_assignment.dimensions()) {
     result.add_tile_assignment_dimensions(dim);
   }
@@ -52,7 +52,7 @@ OpSharding Tile1D(const Shape& tile_shape, int64 num_tiles) {
 
   CHECK_EQ(ShapeUtil::Rank(tile_shape), 1);
   std::vector<int64> dimensions(1, num_tiles);
-  *result.mutable_tile_shape() = tile_shape;
+  *result.mutable_tile_shape() = tile_shape.ToProto();
   auto& tile_dimension =
       (*result.mutable_tile_shape()->mutable_dimensions())[0];
   tile_dimension = CeilOfRatio(static_cast<int64>(tile_dimension), num_tiles);
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index d196252db16fe84d44824856a2202c1a5d3fce95..60df2ec3959216b0564846ad47c21c5bcc01ea57 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include <utility>
 
 #include "absl/algorithm/container.h"
-#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
@@ -34,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/platform/mutex.h"
 
 namespace xla {
 
@@ -42,12 +40,30 @@ using absl::StrCat;
 
 namespace {
 
-int64 GetUniqueId() {
-  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
-  static int64 built_counter = 0;
-  tensorflow::mutex_lock loc(mu);
-  const int64 id = built_counter++;
-  return id;
+static const char kNameSeparator = '.';
+
+// Retrieves the base name of an instruction or computation fully qualified
+// name, using separator as boundary between the initial base name part, and
+// the numeric identification.
+string GetBaseName(const string& name, char separator) {
+  auto pos = name.rfind(separator);
+  CHECK_NE(pos, string::npos) << name;
+  return name.substr(0, pos);
+}
+
+// Generates a fully qualified computation/instruction name.
+string GetFullName(const string& base_name, char separator, int64 id) {
+  const char separator_str[] = {separator, '\0'};
+  return StrCat(base_name, separator_str, id);
+}
+
+// Common function to standardize setting name and IDs on computation and
+// instruction proto entities.
+template <typename T>
+void SetProtoIdAndName(T* entry, const string& base_name, char separator,
+                       int64 id) {
+  entry->set_id(id);
+  entry->set_name(GetFullName(base_name, separator, id));
 }
 
 }  // namespace
@@ -86,7 +102,7 @@ StatusOr<Shape> XlaBuilder::GetShape(const XlaOp& op) const {
   TF_RETURN_IF_ERROR(first_error_);
 
   TF_ASSIGN_OR_RETURN(auto instr, LookUpInstruction(op));
-  return instr->shape();
+  return Shape(instr->shape());
 }
 
 StatusOr<std::vector<Shape>> XlaBuilder::GetOperandShapes(
@@ -139,7 +155,7 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64 root_id) const {
 
   ProgramShape program_shape;
 
-  *program_shape.mutable_result() = root_proto->shape();
+  *program_shape.mutable_result() = Shape(root_proto->shape());
 
   // Check that the parameter numbers are continuous from 0, and add parameter
   // shapes and names to the program shape.
@@ -156,7 +172,7 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64 root_id) const {
       const int64 index = instr.parameter_number();
       TF_RET_CHECK(index >= 0 && index < param_count)
           << "invalid parameter number: " << index;
-      *program_shape.mutable_parameters(index) = instr.shape();
+      *program_shape.mutable_parameters(index) = Shape(instr.shape());
       *program_shape.mutable_parameter_names(index) = instr.name();
     }
   }
@@ -223,6 +239,19 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle,
   visited->insert(op_handle);
 }
 
+Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num,
+                                     ShapeIndex dynamic_size_param_index,
+                                     int64 target_param_num,
+                                     ShapeIndex target_param_index,
+                                     int64 target_dim_num) {
+  TF_RETURN_IF_ERROR(dynamic_parameter_binding_.Bind(
+      DynamicParameterBinding::DynamicParameter{dynamic_size_param_num,
+                                                dynamic_size_param_index},
+      DynamicParameterBinding::DynamicDimension{
+          target_param_num, target_param_index, target_dim_num}));
+  return Status::OK();
+}
+
 XlaComputation XlaBuilder::BuildAndNoteError() {
   DCHECK(parent_builder_ != nullptr);
   auto build_status = Build();
@@ -258,17 +287,15 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
   }
 
   HloComputationProto entry;
-  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
-  entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
-
-  TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(), GetProgramShape(root_id));
+  SetProtoIdAndName(&entry, name_, kNameSeparator, GetNextId());
+  TF_ASSIGN_OR_RETURN(ProgramShape program_shape, GetProgramShape(root_id));
+  *entry.mutable_program_shape() = program_shape.ToProto();
   entry.set_root_id(root_id);
 
   for (auto& instruction : instructions_) {
     // Ensures that the instruction names are unique among the whole graph.
-    const string& new_name =
-        StrCat(instruction.name(), ".", entry.id(), ".", instruction.id());
-    instruction.set_name(new_name);
+    instruction.set_name(
+        GetFullName(instruction.name(), kNameSeparator, instruction.id()));
     entry.add_instructions()->Swap(&instruction);
   }
 
@@ -278,12 +305,15 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
   module->set_id(entry.id());
   module->set_entry_computation_name(entry.name());
   module->set_entry_computation_id(entry.id());
-  *module->mutable_program_shape() = entry.program_shape();
+  *module->mutable_host_program_shape() = entry.program_shape();
   for (auto& e : embedded_) {
     module->add_computations()->Swap(&e.second);
   }
   module->add_computations()->Swap(&entry);
 
+  *(module->mutable_dynamic_parameter_binding()) =
+      dynamic_parameter_binding_.ToProto();
+
   // Clear data held by this builder.
   this->instructions_.clear();
   this->handle_to_index_.clear();
@@ -299,7 +329,7 @@ StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
   TF_RETURN_IF_ERROR(first_error_);
 
   HloInstructionProto instr;
-  *instr.mutable_shape() = shape;
+  *instr.mutable_shape() = shape.ToProto();
   for (int64 dim : broadcast_dimensions) {
     instr.add_dimensions(dim);
   }
@@ -350,8 +380,9 @@ XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferUnaryOpShape(unop, operand_shape));
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), unop, {operand});
   });
 }
@@ -362,9 +393,10 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferBinaryOpShape(
                             binop, lhs_shape, rhs_shape, broadcast_dimensions));
+    *instr.mutable_shape() = shape.ToProto();
 
     const int64 lhs_rank = ShapeUtil::Rank(lhs_shape);
     const int64 rhs_rank = ShapeUtil::Rank(rhs_shape);
@@ -378,7 +410,7 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
       const Shape& from_shape = should_broadcast_lhs ? lhs_shape : rhs_shape;
 
       std::vector<int64> to_size;
-      for (int64 size : instr.shape().dimensions()) {
+      for (int64 size : shape.dimensions()) {
         to_size.push_back(size);
       }
       for (int64 from_dim = 0; from_dim < ShapeUtil::Rank(from_shape);
@@ -398,14 +430,14 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
     }
 
     TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, GetShape(updated_lhs));
-    if (!ShapeUtil::SameDimensions(instr.shape(), updated_lhs_shape)) {
+    if (!ShapeUtil::SameDimensions(shape, updated_lhs_shape)) {
       TF_ASSIGN_OR_RETURN(updated_lhs,
-                          AddBroadcastSequence(instr.shape(), updated_lhs));
+                          AddBroadcastSequence(shape, updated_lhs));
     }
     TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, GetShape(updated_rhs));
-    if (!ShapeUtil::SameDimensions(instr.shape(), updated_rhs_shape)) {
+    if (!ShapeUtil::SameDimensions(shape, updated_rhs_shape)) {
       TF_ASSIGN_OR_RETURN(updated_rhs,
-                          AddBroadcastSequence(instr.shape(), updated_rhs));
+                          AddBroadcastSequence(shape, updated_rhs));
     }
 
     return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs});
@@ -419,30 +451,28 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
     TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(ehs));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
-                        ShapeInference::InferTernaryOpShape(
-                            triop, lhs_shape, rhs_shape, ehs_shape));
+    TF_ASSIGN_OR_RETURN(
+        Shape shape, ShapeInference::InferTernaryOpShape(triop, lhs_shape,
+                                                         rhs_shape, ehs_shape));
+    *instr.mutable_shape() = shape.ToProto();
     XlaOp updated_lhs = lhs;
     XlaOp updated_rhs = rhs;
     XlaOp updated_ehs = ehs;
-    if (!ShapeUtil::IsTuple(instr.shape())) {
+    if (!ShapeUtil::IsTuple(shape)) {
       if (!ShapeUtil::IsTuple(lhs_shape) &&
-          !ShapeUtil::SameDimensions(instr.shape(), lhs_shape)) {
+          !ShapeUtil::SameDimensions(shape, lhs_shape)) {
         // lhs is being implicitly broadcasted. Change to explicit.
-        TF_ASSIGN_OR_RETURN(updated_lhs,
-                            AddBroadcastSequence(instr.shape(), lhs));
+        TF_ASSIGN_OR_RETURN(updated_lhs, AddBroadcastSequence(shape, lhs));
       }
       if (!ShapeUtil::IsTuple(rhs_shape) &&
-          !ShapeUtil::SameDimensions(instr.shape(), rhs_shape)) {
+          !ShapeUtil::SameDimensions(shape, rhs_shape)) {
         // rhs is being implicitly broadcasted. Change to explicit.
-        TF_ASSIGN_OR_RETURN(updated_rhs,
-                            AddBroadcastSequence(instr.shape(), rhs));
+        TF_ASSIGN_OR_RETURN(updated_rhs, AddBroadcastSequence(shape, rhs));
       }
       if (!ShapeUtil::IsTuple(ehs_shape) &&
-          !ShapeUtil::SameDimensions(instr.shape(), ehs_shape)) {
+          !ShapeUtil::SameDimensions(shape, ehs_shape)) {
         // ehs is being implicitly broadcasted. Change to explicit.
-        TF_ASSIGN_OR_RETURN(updated_ehs,
-                            AddBroadcastSequence(instr.shape(), ehs));
+        TF_ASSIGN_OR_RETURN(updated_ehs, AddBroadcastSequence(shape, ehs));
       }
     }
     return AddInstruction(std::move(instr), triop,
@@ -463,7 +493,7 @@ XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs,
 XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    *instr.mutable_shape() = literal.shape();
+    *instr.mutable_shape() = literal.shape().ToProto();
     *instr.mutable_literal() = literal.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kConstant);
   });
@@ -472,7 +502,7 @@ XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
 XlaOp XlaBuilder::Iota(const Shape& shape, int64 iota_dimension) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    *instr.mutable_shape() = shape;
+    *instr.mutable_shape() = shape.ToProto();
     instr.add_dimensions(iota_dimension);
     return AddInstruction(std::move(instr), HloOpcode::kIota);
   });
@@ -492,10 +522,10 @@ XlaOp XlaBuilder::Call(const XlaComputation& computation,
                       [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
                         computation.GetProgramShape());
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferCallShape(operand_shape_ptrs,
-                                       /*to_apply=*/called_program_shape));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCallShape(
+                                         operand_shape_ptrs,
+                                         /*to_apply=*/called_program_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     AddCalledComputation(computation, &instr);
 
@@ -513,7 +543,7 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
     }
     instr.set_parameter_number(parameter_number);
     instr.set_name(name);
-    *instr.mutable_shape() = shape;
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kParameter);
   });
 }
@@ -543,10 +573,35 @@ XlaOp XlaBuilder::Broadcast(const XlaOp& operand,
 }
 
 XlaOp XlaBuilder::BroadcastInDim(
-    const XlaOp& operand, const Shape& shape,
+    const XlaOp& operand, const absl::Span<const int64> out_dim_size,
     const absl::Span<const int64> broadcast_dimensions) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    return InDimBroadcast(shape, operand, broadcast_dimensions);
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    // Output shape, in the case of degenerate broadcast, the out_dim_size is
+    // not necessarily the same as the dimension sizes of the output shape.
+    const auto& output_shape =
+        ShapeUtil::MakeShape(operand_shape.element_type(), out_dim_size);
+
+    TF_RETURN_IF_ERROR(ShapeInference::InferBroadcastShape(
+                           operand_shape, output_shape, broadcast_dimensions)
+                           .status());
+    std::vector<int64> in_dim_size(out_dim_size.begin(), out_dim_size.end());
+    for (int i = 0; i < broadcast_dimensions.size(); i++) {
+      in_dim_size[broadcast_dimensions[i]] = operand_shape.dimensions(i);
+    }
+    const auto& in_dim_shape =
+        ShapeUtil::MakeShape(operand_shape.element_type(), in_dim_size);
+    TF_ASSIGN_OR_RETURN(
+        XlaOp in_dim_broadcast,
+        InDimBroadcast(in_dim_shape, operand, broadcast_dimensions));
+
+    // If broadcast is not degenerate, return broadcasted result.
+    if (ShapeUtil::Equal(in_dim_shape, output_shape)) {
+      return in_dim_broadcast;
+    }
+
+    // Otherwise handle degenerate broadcast case.
+    return AddBroadcastSequence(output_shape, in_dim_broadcast);
   });
 }
 
@@ -554,7 +609,7 @@ StatusOr<XlaOp> XlaBuilder::Reshape(const Shape& shape, const XlaOp& operand) {
   TF_RETURN_IF_ERROR(first_error_);
 
   HloInstructionProto instr;
-  *instr.mutable_shape() = shape;
+  *instr.mutable_shape() = shape.ToProto();
   return AddInstruction(std::move(instr), HloOpcode::kReshape, {operand});
 }
 
@@ -566,9 +621,9 @@ XlaOp XlaBuilder::Slice(const XlaOp& operand,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferSliceShape(operand_shape, start_indices,
-                                        limit_indices, strides));
+        Shape shape, ShapeInference::InferSliceShape(
+                         operand_shape, start_indices, limit_indices, strides));
+    *instr.mutable_shape() = shape.ToProto();
     for (int i = 0; i < start_indices.size(); i++) {
       auto* slice_config = instr.add_slice_dimensions();
       slice_config->set_start(start_indices[i]);
@@ -603,9 +658,10 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
                         GetShape(start_indices));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicSliceShape(
                             operand_shape, start_indices_shape, slice_sizes));
+    *instr.mutable_shape() = shape.ToProto();
 
     for (int64 size : slice_sizes) {
       instr.add_dynamic_slice_sizes(size);
@@ -625,9 +681,10 @@ XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
     TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
     TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
                         GetShape(start_indices));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicUpdateSliceShape(
                             operand_shape, update_shape, start_indices_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
                           {operand, update, start_indices});
@@ -643,9 +700,9 @@ XlaOp XlaBuilder::ConcatInDim(absl::Span<const XlaOp> operands,
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
     absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConcatOpShape(
+                                         operand_shape_ptrs, dimension));
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.add_dimensions(dimension);
 
@@ -662,10 +719,9 @@ XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value,
     TF_ASSIGN_OR_RETURN(const Shape& padding_value_shape,
                         GetShape(padding_value));
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferPadShape(operand_shape, padding_value_shape,
-                                      padding_config));
-
+        Shape shape, ShapeInference::InferPadShape(
+                         operand_shape, padding_value_shape, padding_config));
+    *instr.mutable_shape() = shape.ToProto();
     *instr.mutable_padding_config() = padding_config;
 
     return AddInstruction(std::move(instr), HloOpcode::kPad,
@@ -678,7 +734,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
                           absl::Span<const int64> new_sizes) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(const Shape& shape,
+    TF_ASSIGN_OR_RETURN(const Shape shape,
                         ShapeInference::InferReshapeShape(
                             operand_shape, dimensions, new_sizes));
     XlaOp transposed = IsIdentityPermutation(dimensions)
@@ -691,7 +747,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
 XlaOp XlaBuilder::Reshape(const XlaOp& operand,
                           absl::Span<const int64> new_sizes) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(auto shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(Shape shape, GetShape(operand));
     std::vector<int64> dimensions(shape.dimensions_size());
     std::iota(dimensions.begin(), dimensions.end(), 0);
     return Reshape(operand, dimensions, new_sizes);
@@ -741,7 +797,7 @@ XlaOp XlaBuilder::Collapse(const XlaOp& operand,
 void XlaBuilder::Trace(const string& tag, const XlaOp& operand) {
   ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    *instr.mutable_shape() = ShapeUtil::MakeNil();
+    *instr.mutable_shape() = ShapeUtil::MakeNil().ToProto();
     *instr.mutable_literal() = LiteralUtil::CreateR1U8(tag).ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kTrace, {operand});
   });
@@ -767,9 +823,10 @@ XlaOp XlaBuilder::Tuple(absl::Span<const XlaOp> elements) {
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements));
     absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(const Shape shape,
                         ShapeInference::InferVariadicOpShape(
                             HloOpcode::kTuple, operand_shape_ptrs));
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kTuple, elements);
   });
 }
@@ -784,7 +841,7 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
           ShapeUtil::HumanString(tuple_shape));
     }
     *instr.mutable_shape() =
-        ShapeUtil::GetTupleElementShape(tuple_shape, index);
+        ShapeUtil::GetTupleElementShape(tuple_shape, index).ToProto();
 
     instr.set_tuple_index(index);
 
@@ -843,9 +900,10 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
                                                         dimension_numbers));
+    *instr.mutable_shape() = shape.ToProto();
     *instr.mutable_dot_dimension_numbers() = dimension_numbers;
     if (precision_config != nullptr) {
       *instr.mutable_precision_config() = *precision_config;
@@ -987,10 +1045,11 @@ XlaOp XlaBuilder::ConvGeneralDilated(
                         MakeWindow(window_dimensions, window_strides, padding,
                                    lhs_dilation, rhs_dilation));
 
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferConvolveShape(
                             lhs_shape, rhs_shape, feature_group_count,
                             instr.window(), dimension_numbers));
+    *instr.mutable_shape() = shape.ToProto();
 
     *instr.mutable_convolution_dimension_numbers() = dimension_numbers;
     instr.set_feature_group_count(feature_group_count);
@@ -1063,10 +1122,9 @@ XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferFftShape(operand_shape, fft_type, fft_length));
-
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferFftShape(
+                                         operand_shape, fft_type, fft_length));
+    *instr.mutable_shape() = shape.ToProto();
     instr.set_fft_type(fft_type);
     for (int64 i : fft_length) {
       instr.add_fft_length(i);
@@ -1084,7 +1142,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
     }
     const Shape infeed_instruction_shape =
         ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()});
-    *instr.mutable_shape() = infeed_instruction_shape;
+    *instr.mutable_shape() = infeed_instruction_shape.ToProto();
     instr.set_infeed_config(config);
 
     if (ShapeUtil::IsArray(shape) && sharding() &&
@@ -1105,7 +1163,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
     XlaOp token;
     auto make_token = [&]() {
       HloInstructionProto token_instr;
-      *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+      *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
       return AddInstruction(std::move(token_instr), HloOpcode::kAfterAll, {});
     };
     if (sharding()) {
@@ -1144,7 +1202,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto infeed_data;
-    *infeed_data.mutable_shape() = shape;
+    *infeed_data.mutable_shape() = shape.ToProto();
     infeed_data.set_tuple_index(0);
     return AddInstruction(std::move(infeed_data), HloOpcode::kGetTupleElement,
                           {infeed});
@@ -1160,7 +1218,7 @@ XlaOp XlaBuilder::InfeedWithToken(const XlaOp& token, const Shape& shape,
     }
     const Shape infeed_instruction_shape =
         ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()});
-    *instr.mutable_shape() = infeed_instruction_shape;
+    *instr.mutable_shape() = infeed_instruction_shape.ToProto();
     instr.set_infeed_config(config);
 
     if (ShapeUtil::IsArray(shape) && sharding() &&
@@ -1185,7 +1243,7 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
   ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
 
-    *instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
 
     // Check and set outfeed shape.
     if (!LayoutUtil::HasLayout(shape_with_layout)) {
@@ -1198,14 +1256,14 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
           ShapeUtil::HumanStringWithLayout(shape_with_layout),
           ShapeUtil::HumanStringWithLayout(operand_shape));
     }
-    *instr.mutable_outfeed_shape() = shape_with_layout;
+    *instr.mutable_outfeed_shape() = shape_with_layout.ToProto();
 
     instr.set_outfeed_config(outfeed_config);
 
     // Outfeed takes a token as its second operand. Generate the token to pass
     // to the outfeed.
     HloInstructionProto token_instr;
-    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr),
                                                     HloOpcode::kAfterAll, {}));
 
@@ -1219,7 +1277,7 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto tuple_instr;
-    *tuple_instr.mutable_shape() = ShapeUtil::MakeNil();
+    *tuple_instr.mutable_shape() = ShapeUtil::MakeNil().ToProto();
 
     // The dummy tuple should have no sharding.
     {
@@ -1238,7 +1296,7 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
 
-    *instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
 
     // Check and set outfeed shape.
     if (!LayoutUtil::HasLayout(shape_with_layout)) {
@@ -1251,7 +1309,7 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
           ShapeUtil::HumanStringWithLayout(shape_with_layout),
           ShapeUtil::HumanStringWithLayout(operand_shape));
     }
-    *instr.mutable_outfeed_shape() = shape_with_layout;
+    *instr.mutable_outfeed_shape() = shape_with_layout.ToProto();
 
     instr.set_outfeed_config(outfeed_config);
 
@@ -1263,7 +1321,7 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
 XlaOp XlaBuilder::CreateToken() {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    *instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kAfterAll);
   });
 }
@@ -1273,15 +1331,25 @@ XlaOp XlaBuilder::AfterAll(absl::Span<const XlaOp> tokens) {
     if (tokens.empty()) {
       return InvalidArgument("AfterAll requires at least one operand");
     }
+    for (int i = 0; i < tokens.size(); ++i) {
+      const XlaOp& operand = tokens[i];
+      TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+      if (!ShapeUtil::IsToken(operand_shape)) {
+        return InvalidArgument(
+            "All operands to AfterAll must be tokens; operand %d has shape %s",
+            i, ShapeUtil::HumanString(operand_shape));
+      }
+    }
     HloInstructionProto instr;
-    *instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kAfterAll, tokens);
   });
 }
 
-XlaOp XlaBuilder::CustomCall(const string& call_target_name,
-                             absl::Span<const XlaOp> operands,
-                             const Shape& shape, const string& opaque) {
+XlaOp XlaBuilder::CustomCall(
+    const string& call_target_name, absl::Span<const XlaOp> operands,
+    const Shape& shape, const string& opaque,
+    absl::optional<absl::Span<const Shape>> operand_shapes_with_layout) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     if (absl::StartsWith(call_target_name, "$")) {
@@ -1290,9 +1358,34 @@ XlaOp XlaBuilder::CustomCall(const string& call_target_name,
           "are reserved for internal use.",
           call_target_name);
     }
-    *instr.mutable_shape() = shape;
+    *instr.mutable_shape() = shape.ToProto();
     instr.set_custom_call_target(call_target_name);
     instr.set_custom_call_opaque(opaque);
+    if (operand_shapes_with_layout.has_value()) {
+      if (!LayoutUtil::HasLayout(shape)) {
+        return InvalidArgument(
+            "Result shape must have layout for custom call with constrained "
+            "layout.");
+      }
+      if (operands.size() != operand_shapes_with_layout->size()) {
+        return InvalidArgument(
+            "Must specify a shape with layout for each operand for custom call "
+            "with constrained layout; given %d shapes, expected %d",
+            operand_shapes_with_layout->size(), operands.size());
+      }
+      instr.set_constrain_layout(true);
+      int64 operand_num = 0;
+      for (const Shape& operand_shape : *operand_shapes_with_layout) {
+        if (!LayoutUtil::HasLayout(operand_shape)) {
+          return InvalidArgument(
+              "No layout specified for operand %d for custom call with "
+              "constrained layout.",
+              operand_num);
+        }
+        *instr.add_operand_shapes_with_layout() = operand_shape.ToProto();
+        ++operand_num;
+      }
+    }
     return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
   });
 }
@@ -1443,9 +1536,9 @@ XlaOp XlaBuilder::Transpose(const XlaOp& operand,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferTransposeShape(operand_shape, permutation));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTransposeShape(
+                                         operand_shape, permutation));
+    *instr.mutable_shape() = shape.ToProto();
     for (int64 dim : permutation) {
       instr.add_dimensions(dim);
     }
@@ -1458,9 +1551,9 @@ XlaOp XlaBuilder::Rev(const XlaOp& operand,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferReverseShape(operand_shape, dimensions));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferReverseShape(
+                                         operand_shape, dimensions));
+    *instr.mutable_shape() = shape.ToProto();
     for (int64 dim : dimensions) {
       instr.add_dimensions(dim);
     }
@@ -1468,30 +1561,28 @@ XlaOp XlaBuilder::Rev(const XlaOp& operand,
   });
 }
 
-XlaOp XlaBuilder::Sort(XlaOp keys, absl::optional<XlaOp> values,
+XlaOp XlaBuilder::Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
                        int64 dimension) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     std::vector<const Shape*> operand_shape_ptrs;
     TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
     operand_shape_ptrs.push_back(&keys_shape);
-    Shape values_shape;
-    if (values.has_value()) {
-      TF_ASSIGN_OR_RETURN(values_shape, GetShape(*values));
-      operand_shape_ptrs.push_back(&values_shape);
-    }
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
-                        ShapeInference::InferVariadicOpShape(
-                            HloOpcode::kSort, operand_shape_ptrs));
+    TF_ASSIGN_OR_RETURN(std::vector<Shape> values_shapes,
+                        GetOperandShapes(values));
+    absl::c_transform(values_shapes, std::back_inserter(operand_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferVariadicOpShape(
+                                         HloOpcode::kSort, operand_shape_ptrs));
+    *instr.mutable_shape() = shape.ToProto();
     if (dimension == -1) {
       TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
       dimension = ShapeUtil::Rank(keys_shape) - 1;
     }
     instr.add_dimensions(dimension);
-    return values.has_value()
-               ? AddInstruction(std::move(instr), HloOpcode::kSort,
-                                {keys, *values})
-               : AddInstruction(std::move(instr), HloOpcode::kSort, {keys});
+    std::vector<XlaOp> operands{keys};
+    operands.insert(operands.end(), values.begin(), values.end());
+    return AddInstruction(std::move(instr), HloOpcode::kSort, operands);
   });
 }
 
@@ -1505,9 +1596,9 @@ XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvertShape(
+                                         operand_shape, new_element_type));
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand});
   });
 }
@@ -1517,9 +1608,9 @@ XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvertShape(
+                                         operand_shape, new_element_type));
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert,
                           {operand});
   });
@@ -1551,11 +1642,11 @@ XlaOp XlaBuilder::Map(absl::Span<const XlaOp> operands,
     TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
                         computation.GetProgramShape());
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferMapShape(operand_shape_ptrs, called_program_shape,
-                                      dimensions));
+        Shape shape, ShapeInference::InferMapShape(
+                         operand_shape_ptrs, called_program_shape, dimensions));
+    *instr.mutable_shape() = shape.ToProto();
 
-    const Shape& output_shape = instr.shape();
+    Shape output_shape(instr.shape());
     const int64 output_rank = ShapeUtil::Rank(output_shape);
     AddCalledComputation(computation, &instr);
     std::vector<XlaOp> new_operands(operands.begin(), operands.end());
@@ -1598,7 +1689,7 @@ XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
     }
 
     TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
-    *instr.mutable_shape() = shape;
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.set_distribution(distribution);
 
@@ -1626,10 +1717,10 @@ XlaOp XlaBuilder::While(const XlaComputation& condition,
     TF_ASSIGN_OR_RETURN(const auto& condition_program_shape,
                         condition.GetProgramShape());
     TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferWhileShape(condition_program_shape,
-                                        body_program_shape, init_shape));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferWhileShape(
+                                         condition_program_shape,
+                                         body_program_shape, init_shape));
+    *instr.mutable_shape() = shape.ToProto();
     // Body comes before condition computation in the vector.
     AddCalledComputation(body, &instr);
     AddCalledComputation(condition, &instr);
@@ -1646,10 +1737,10 @@ XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& start_indices,
     TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input));
     TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
                         GetShape(start_indices));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferGatherShape(input_shape, start_indices_shape,
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferGatherShape(
+                                         input_shape, start_indices_shape,
                                          dimension_numbers, slice_sizes));
+    *instr.mutable_shape() = shape.ToProto();
 
     *instr.mutable_gather_dimension_numbers() = dimension_numbers;
     for (int64 bound : slice_sizes) {
@@ -1674,10 +1765,11 @@ XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices,
     TF_ASSIGN_OR_RETURN(const Shape& updates_shape, GetShape(updates));
     TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape,
                         update_computation.GetProgramShape());
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferScatterShape(
                             input_shape, scatter_indices_shape, updates_shape,
                             to_apply_shape, dimension_numbers));
+    *instr.mutable_shape() = shape.ToProto();
 
     *instr.mutable_scatter_dimension_numbers() = dimension_numbers;
 
@@ -1704,10 +1796,11 @@ XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand,
     TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape,
                         false_computation.GetProgramShape());
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
+        Shape shape,
         ShapeInference::InferConditionalShape(
             predicate_shape, true_operand_shape, false_operand_shape,
             true_computation_shape, false_computation_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     // The index of true_computation must be 0 and that of false computation
     // must be 1.
@@ -1749,9 +1842,10 @@ XlaOp XlaBuilder::Reduce(absl::Span<const XlaOp> operands,
                       [](const Shape& shape) { return &shape; });
 
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
+        Shape shape,
         ShapeInference::InferReduceShape(
             operand_shape_ptrs, dimensions_to_reduce, called_program_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     for (int64 dim : dimensions_to_reduce) {
       instr.add_dimensions(dim);
@@ -1789,9 +1883,9 @@ XlaOp XlaBuilder::ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
     std::vector<std::pair<int64, int64>> padding_values =
         MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
                     window_strides, padding);
-    return ReduceWindowWithGeneralPadding(operand, init_value, computation,
-                                          window_dimensions, window_strides,
-                                          padding_values);
+    return ReduceWindowWithGeneralPadding(
+        operand, init_value, computation, window_dimensions, window_strides,
+        /*base_dilations=*/{}, /*window_dilations=*/{}, padding_values);
   });
 }
 
@@ -1800,6 +1894,8 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
     const XlaComputation& computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
+    absl::Span<const int64> base_dilations,
+    absl::Span<const int64> window_dilations,
     absl::Span<const std::pair<int64, int64>> padding) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
@@ -1810,11 +1906,12 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
                         computation.GetProgramShape());
     TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
                         MakeWindow(window_dimensions, window_strides, padding,
-                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferReduceWindowShape(operand_shape, init_shape,
-                                               instr.window(), to_apply_shape));
+                                   /*lhs_dilation=*/base_dilations,
+                                   /*rhs_dilation=*/window_dilations));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferReduceWindowShape(
+                                         operand_shape, init_shape,
+                                         instr.window(), to_apply_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     AddCalledComputation(computation, &instr);
     return AddInstruction(std::move(instr), HloOpcode::kReduceWindow,
@@ -1832,9 +1929,10 @@ XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
     TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
     TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
+        Shape shape,
         ShapeInference::InferBatchNormTrainingShape(
             operand_shape, scale_shape, offset_shape, feature_index));
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.set_epsilon(epsilon);
     instr.set_feature_index(feature_index);
@@ -1856,10 +1954,11 @@ XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale,
     TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
     TF_ASSIGN_OR_RETURN(const Shape& mean_shape, GetShape(mean));
     TF_ASSIGN_OR_RETURN(const Shape& variance_shape, GetShape(variance));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
-                        ShapeInference::InferBatchNormInferenceShape(
-                            operand_shape, scale_shape, offset_shape,
-                            mean_shape, variance_shape, feature_index));
+    TF_ASSIGN_OR_RETURN(
+        Shape shape, ShapeInference::InferBatchNormInferenceShape(
+                         operand_shape, scale_shape, offset_shape, mean_shape,
+                         variance_shape, feature_index));
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.set_epsilon(epsilon);
     instr.set_feature_index(feature_index);
@@ -1881,10 +1980,11 @@ XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
     TF_ASSIGN_OR_RETURN(const Shape& batch_mean_shape, GetShape(batch_mean));
     TF_ASSIGN_OR_RETURN(const Shape& batch_var_shape, GetShape(batch_var));
     TF_ASSIGN_OR_RETURN(const Shape& grad_output_shape, GetShape(grad_output));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferBatchNormGradShape(
                             operand_shape, scale_shape, batch_mean_shape,
                             batch_var_shape, grad_output_shape, feature_index));
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.set_epsilon(epsilon);
     instr.set_feature_index(feature_index);
@@ -1915,9 +2015,9 @@ XlaOp XlaBuilder::CrossReplicaSum(
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferCrossReplicaSumShape({&operand_shape}));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape(
+                                         {&operand_shape}));
+    *instr.mutable_shape() = shape.ToProto();
 
     for (const ReplicaGroup& group : replica_groups) {
       *instr.add_replica_groups() = group;
@@ -1970,8 +2070,8 @@ XlaOp XlaBuilder::AllToAll(const XlaOp& operand, int64 split_dimension,
     absl::c_transform(slice_shapes, std::back_inserter(slice_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs));
+        Shape shape, ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs));
+    *instr.mutable_shape() = shape.ToProto();
     for (const ReplicaGroup& group : replica_groups) {
       *instr.add_replica_groups() = group;
     }
@@ -1996,8 +2096,9 @@ XlaOp XlaBuilder::CollectivePermute(
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
+        Shape shape,
         ShapeInference::InferCollectivePermuteShape(operand_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     for (const auto& pair : source_target_pairs) {
       auto* proto_pair = instr.add_source_target_pairs();
@@ -2046,10 +2147,11 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
     TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
                         MakeWindow(window_dimensions, window_strides, padding,
                                    /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferSelectAndScatterShape(
                             operand_shape, select_shape, instr.window(),
                             source_shape, init_shape, scatter_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     AddCalledComputation(select, &instr);
     AddCalledComputation(scatter, &instr);
@@ -2064,9 +2166,10 @@ XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferReducePrecisionShape(
                             operand_shape, exponent_bits, mantissa_bits));
+    *instr.mutable_shape() = shape.ToProto();
     instr.set_exponent_bits(exponent_bits);
     instr.set_mantissa_bits(mantissa_bits);
     return AddInstruction(std::move(instr), HloOpcode::kReducePrecision,
@@ -2081,7 +2184,7 @@ void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) {
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto token_instr;
-    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr),
                                                     HloOpcode::kAfterAll, {}));
 
@@ -2100,15 +2203,17 @@ XlaOp XlaBuilder::SendWithToken(const XlaOp& operand, const XlaOp& token,
     // token}.
     HloInstructionProto send_instr;
     TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
-    *send_instr.mutable_shape() = ShapeUtil::MakeTupleShape(
-        {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()});
+    *send_instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape(
+            {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()})
+            .ToProto();
     send_instr.set_channel_id(handle.handle());
     TF_ASSIGN_OR_RETURN(XlaOp send,
                         AddInstruction(std::move(send_instr), HloOpcode::kSend,
                                        {operand, token}));
 
     HloInstructionProto send_done_instr;
-    *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     send_done_instr.set_channel_id(handle.handle());
     return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone,
                           {send});
@@ -2122,7 +2227,7 @@ XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto token_instr;
-    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr),
                                                     HloOpcode::kAfterAll, {}));
 
@@ -2133,7 +2238,7 @@ XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto recv_data;
-    *recv_data.mutable_shape() = shape;
+    *recv_data.mutable_shape() = shape.ToProto();
     recv_data.set_tuple_index(0);
     return AddInstruction(std::move(recv_data), HloOpcode::kGetTupleElement,
                           {recv});
@@ -2150,15 +2255,18 @@ XlaOp XlaBuilder::RecvWithToken(const XlaOp& token, const Shape& shape,
     // Recv instruction produces a tuple of {receive buffer, U32 context,
     // token}.
     HloInstructionProto recv_instr;
-    *recv_instr.mutable_shape() = ShapeUtil::MakeTupleShape(
-        {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()});
+    *recv_instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape(
+            {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()})
+            .ToProto();
     recv_instr.set_channel_id(handle.handle());
     TF_ASSIGN_OR_RETURN(XlaOp recv, AddInstruction(std::move(recv_instr),
                                                    HloOpcode::kRecv, {token}));
 
     HloInstructionProto recv_done_instr;
     *recv_done_instr.mutable_shape() =
-        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()});
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()})
+            .ToProto();
     recv_done_instr.set_channel_id(handle.handle());
     return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone,
                           {recv});
@@ -2192,9 +2300,11 @@ XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token,
     // Send instruction produces a tuple of {aliased operand, U32 context,
     // token}.
     HloInstructionProto send_instr;
-    *send_instr.mutable_shape() = ShapeUtil::MakeTupleShape(
-        {shape_with_layout, ShapeUtil::MakeShape(U32, {}),
-         ShapeUtil::MakeTokenShape()});
+    *send_instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape_with_layout,
+                                   ShapeUtil::MakeShape(U32, {}),
+                                   ShapeUtil::MakeTokenShape()})
+            .ToProto();
     send_instr.set_channel_id(handle.handle());
     send_instr.set_is_host_transfer(true);
     TF_ASSIGN_OR_RETURN(XlaOp send,
@@ -2202,7 +2312,7 @@ XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token,
                                        {operand, token}));
 
     HloInstructionProto send_done_instr;
-    *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     send_done_instr.set_channel_id(handle.handle());
     send_done_instr.set_is_host_transfer(true);
     return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone,
@@ -2231,8 +2341,10 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape,
     // Recv instruction produces a tuple of {receive buffer, U32 context,
     // token}.
     HloInstructionProto recv_instr;
-    *recv_instr.mutable_shape() = ShapeUtil::MakeTupleShape(
-        {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()});
+    *recv_instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape(
+            {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()})
+            .ToProto();
     recv_instr.set_channel_id(handle.handle());
     recv_instr.set_is_host_transfer(true);
     TF_ASSIGN_OR_RETURN(XlaOp recv, AddInstruction(std::move(recv_instr),
@@ -2240,7 +2352,8 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape,
 
     HloInstructionProto recv_done_instr;
     *recv_done_instr.mutable_shape() =
-        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()});
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()})
+            .ToProto();
     recv_done_instr.set_channel_id(handle.handle());
     recv_done_instr.set_is_host_transfer(true);
     return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone,
@@ -2248,6 +2361,19 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape,
   });
 }
 
+XlaOp XlaBuilder::GetDimensionSize(const XlaOp& operand, int64 dimension) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferGetDimensionSizeShape(
+                                         operand_shape, dimension));
+    *instr.mutable_shape() = shape.ToProto();
+    instr.add_dimensions(dimension);
+    return AddInstruction(std::move(instr), HloOpcode::kGetDimensionSize,
+                          {operand});
+  });
+}
+
 StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
   TF_RETURN_IF_ERROR(first_error_);
 
@@ -2261,7 +2387,7 @@ StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
 }
 
 StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
-    const XlaOp& root_op) const {
+    const XlaOp& root_op) {
   TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op));
   if (!is_constant) {
     auto op_status = LookUpInstruction(root_op);
@@ -2283,10 +2409,10 @@ StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
                       LookUpInstruction(root_op));
 
   HloComputationProto entry;
-  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
-  entry.set_name(StrCat(name_, entry.id(), "_compute_constant"));
+  SetProtoIdAndName(&entry, StrCat(name_, "_compute_constant"), kNameSeparator,
+                    GetNextId());
   entry.set_root_id(root->id());
-  ProgramShape* program_shape = entry.mutable_program_shape();
+  ProgramShapeProto* program_shape = entry.mutable_program_shape();
   *program_shape->mutable_result() = root->shape();
 
   // We use std::set to keep the instruction ids in ascending order (which is
@@ -2330,7 +2456,7 @@ StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
   module->set_id(entry.id());
   module->set_entry_computation_name(entry.name());
   module->set_entry_computation_id(entry.id());
-  *module->mutable_program_shape() = *program_shape;
+  *module->mutable_host_program_shape() = *program_shape;
   for (auto& e : embedded_) {
     if (related_calls.find(e.second.id()) != related_calls.end()) {
       *module->add_computations() = e.second;
@@ -2424,7 +2550,7 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(HloInstructionProto&& instr,
                                            absl::Span<const XlaOp> operands) {
   TF_RETURN_IF_ERROR(first_error_);
 
-  const int64 handle = GetUniqueId();
+  const int64 handle = GetNextId();
   instr.set_id(handle);
   instr.set_opcode(HloOpcodeString(opcode));
   if (instr.name().empty()) {
@@ -2455,9 +2581,50 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(HloInstructionProto&& instr,
 
 void XlaBuilder::AddCalledComputation(const XlaComputation& computation,
                                       HloInstructionProto* instr) {
-  instr->add_called_computation_ids(computation.proto().entry_computation_id());
+  absl::flat_hash_map<int64, int64> remapped_ids;
+  std::vector<HloComputationProto> imported_computations;
+  imported_computations.reserve(computation.proto().computations_size());
+  // Before we import the computations by remapping IDs, and capturing the
+  // old->new mappings in remapped_ids.
   for (const HloComputationProto& e : computation.proto().computations()) {
-    embedded_.insert({e.id(), e});
+    HloComputationProto new_computation(e);
+    int64 computation_id = GetNextId();
+    remapped_ids[new_computation.id()] = computation_id;
+    SetProtoIdAndName(&new_computation,
+                      GetBaseName(new_computation.name(), kNameSeparator),
+                      kNameSeparator, computation_id);
+    for (auto& instruction : *new_computation.mutable_instructions()) {
+      int64 instruction_id = GetNextId();
+      remapped_ids[instruction.id()] = instruction_id;
+      SetProtoIdAndName(&instruction,
+                        GetBaseName(instruction.name(), kNameSeparator),
+                        kNameSeparator, instruction_id);
+    }
+    new_computation.set_root_id(remapped_ids.at(new_computation.root_id()));
+
+    imported_computations.push_back(std::move(new_computation));
+  }
+  // Once we have imported all the computations, and captured all the ID
+  // mappings, we go back and fixup the IDs in the imported computations.
+  instr->add_called_computation_ids(
+      remapped_ids.at(computation.proto().entry_computation_id()));
+  for (auto& imported_computation : imported_computations) {
+    for (auto& instruction : *imported_computation.mutable_instructions()) {
+      for (auto& operand_id : *instruction.mutable_operand_ids()) {
+        operand_id = remapped_ids.at(operand_id);
+      }
+      for (auto& control_predecessor_id :
+           *instruction.mutable_control_predecessor_ids()) {
+        control_predecessor_id = remapped_ids.at(control_predecessor_id);
+      }
+      for (auto& called_computation_id :
+           *instruction.mutable_called_computation_ids()) {
+        called_computation_id = remapped_ids.at(called_computation_id);
+      }
+    }
+
+    int64 computation_id = imported_computation.id();
+    embedded_.insert({computation_id, std::move(imported_computation)});
   }
 }
 
@@ -2506,9 +2673,10 @@ XlaOp Broadcast(const XlaOp& operand, absl::Span<const int64> broadcast_sizes) {
   return operand.builder()->Broadcast(operand, broadcast_sizes);
 }
 
-XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape,
+XlaOp BroadcastInDim(const XlaOp& operand,
+                     const absl::Span<const int64> out_dim_size,
                      const absl::Span<const int64> broadcast_dimensions) {
-  return operand.builder()->BroadcastInDim(operand, shape,
+  return operand.builder()->BroadcastInDim(operand, out_dim_size,
                                            broadcast_dimensions);
 }
 
@@ -2687,7 +2855,16 @@ XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
 XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
                  absl::Span<const XlaOp> operands, const Shape& shape,
                  const string& opaque) {
-  return builder->CustomCall(call_target_name, operands, shape, opaque);
+  return builder->CustomCall(call_target_name, operands, shape, opaque,
+                             /*operand_shapes_with_layout=*/absl::nullopt);
+}
+
+XlaOp CustomCallWithLayout(XlaBuilder* builder, const string& call_target_name,
+                           absl::Span<const XlaOp> operands, const Shape& shape,
+                           absl::Span<const Shape> operand_shapes_with_layout,
+                           const string& opaque) {
+  return builder->CustomCall(call_target_name, operands, shape, opaque,
+                             operand_shapes_with_layout);
 }
 
 XlaOp Complex(const XlaOp& real, const XlaOp& imag,
@@ -2800,10 +2977,12 @@ XlaOp ReduceWindowWithGeneralPadding(
     const XlaComputation& computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
+    absl::Span<const int64> base_dilations,
+    absl::Span<const int64> window_dilations,
     absl::Span<const std::pair<int64, int64>> padding) {
   return operand.builder()->ReduceWindowWithGeneralPadding(
       operand, init_value, computation, window_dimensions, window_strides,
-      padding);
+      base_dilations, window_dilations, padding);
 }
 
 XlaOp CrossReplicaSum(const XlaOp& operand,
@@ -2914,8 +3093,8 @@ XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions) {
   return operand.builder()->Rev(operand, dimensions);
 }
 
-XlaOp Sort(XlaOp keys, absl::optional<XlaOp> values, int64 dimension) {
-  return keys.builder()->Sort(keys, std::move(values), dimension);
+XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values, int64 dimension) {
+  return keys.builder()->Sort(keys, values, dimension);
 }
 
 XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max) {
@@ -3049,4 +3228,8 @@ XlaOp Iota(XlaBuilder* builder, const Shape& shape, int64 iota_dimension) {
   return builder->Iota(shape, iota_dimension);
 }
 
+XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension) {
+  return operand.builder()->GetDimensionSize(operand, dimension);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index cd0d5ca5d3043ca13bbfda40eacc04b86659a85c..098efb60f9bdca8306ff771a505f4a225dea9f7d 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -216,7 +217,7 @@ class XlaBuilder {
   // compile-time constant (see `IsConstant`), returns an error.
   //
   // This will copy the needed ops/computations to the subgraph.
-  StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op) const;
+  StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op);
 
   // Returns the first error that was encountered while building the
   // computation. When an error is encountered, by default we return a vacuous
@@ -263,35 +264,30 @@ class XlaBuilder {
   // evaluating the computation.
   StatusOr<bool> IsConstant(const XlaOp& operand) const;
 
+  // Sets up binding which indicates that the `target_dim_num` in the subshape
+  // `target_param_index` of parameter `target_param_num` is a dynamic dimension
+  // and its real dynamic size is represented by `dynamic_param_index` in
+  // parameter `dynamic_param_num`.
+  //
+  // TODO(b/119520625): Remove this API once we have more dynamic shape infra
+  // ready.
+  Status SetDynamicBinding(int64 dynamic_size_param_num,
+                           ShapeIndex dynamic_size_param_index,
+                           int64 target_param_num,
+                           ShapeIndex target_param_index, int64 target_dim_num);
+
  private:
   // Build helper which takes the id of the root operation..
   StatusOr<XlaComputation> Build(int64 root_id);
 
-  // Enqueues a "retrieve parameter value" instruction for a parameter that was
-  // passed to the computation.
+  // Description for the methods below can be found in the corresponding public
+  // functions section in this file.
+
   XlaOp Parameter(int64 parameter_number, const Shape& shape,
                   const string& name);
 
-  // Enqueues a constant with the value of the given literal onto the
-  // computation.
   XlaOp ConstantLiteral(const LiteralSlice& literal);
 
-  // Enqueues a constant onto the computation. Methods are templated on the
-  // native host type (NativeT) which corresponds to a specific XLA
-  // PrimitiveType as given in the following table:
-  //
-  //  Native Type   PrimitiveType
-  // -----------------------------
-  //   bool           PRED
-  //   int32          S32
-  //   int64          S64
-  //   uint32         U32
-  //   uint64         U64
-  //   float          F32
-  //   double         F64
-  //
-  // Note: not all primitive types defined in xla_data.proto have a
-  // corresponding native type yet.
   template <typename NativeT>
   XlaOp ConstantR0(NativeT value);
   template <typename NativeT>
@@ -321,198 +317,79 @@ class XlaBuilder {
   template <typename NativeT>
   XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
 
-  // Enqueues a rank one constant (vector) onto the computation. The vector has
-  // size 'length' and every element has the value 'value'.
   template <typename NativeT>
   XlaOp ConstantR1(int64 length, NativeT value);
 
-  // Adds dimensions to an array by duplicating the data in the array.
-  //
-  // The new dimensions are inserted on the left, i.e. if
-  // broadcast_sizes has values {a0, ..., aN} and the operand shape
-  // has dimensions {b0, ..., bM} then the shape of the output has
-  // dimensions {a0, ..., aN, b0, ..., bM}.
-  //
-  // The new dimensions index into copies of the operand, i.e.
-  //
-  //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
   XlaOp Broadcast(const XlaOp& operand,
                   absl::Span<const int64> broadcast_sizes);
 
-  // Performs in-dimension-style broadcast.
-  //
-  // Operand specifies the input to be broadcast. "shape" is expected output
-  // shape. "broadcast_dimensions" are the dimensions to be broadcasting into.
-  // Dimension numbers in broadcast_dimensions map to individual dimensions
-  // of the operand, and specify what dimension of the output shape they
-  // should be broadcast.
-  // e.g.
-  // Say operand = [1, 2], i.e., a 1D tensor with 2 elements.
-  // and dimension of shape is [2,2].
-  // Specifying {1} as brodcast_dimension will generate output
-  // [1 , 2]
-  // [1 , 2]
-  // On the other hand, specifying {0} as broadcast_dimension
-  // will generate output
-  // [1 , 1]
-  // [2 , 2]
-  XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape,
+  XlaOp BroadcastInDim(const XlaOp& operand,
+                       const absl::Span<const int64> out_dim_size,
                        const absl::Span<const int64> broadcast_dimensions);
 
-  // Enqueues a pad operation onto the computation that pads the given value on
-  // the edges as well as between the elements of the input. padding_config
-  // specifies the padding amount for each dimension.
   XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
             const PaddingConfig& padding_config);
 
-  // Enqueues an operation onto the computation that flattens the operand based
-  // on the dimension order (major/slowest-varying to minor/fastest-varying)
-  // given, followed by reshaping it into the shape with the given dimension
-  // sizes (also major to minor). Conceptually, this is a limited form of
-  // "shape casting".
   XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> dimensions,
                 absl::Span<const int64> new_sizes);
 
-  // Enqueues an operation onto the computation that collapses the operand, from
-  // first to last dimension (C order), then reshapes it to the given dimension
-  // sizes. Conceptually, this is a limited form of "shape casting".
   XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> new_sizes);
 
-  // Wrapper for Reshape.
-  // Enqueues an operation to collapse the provided dimensions; e.g. an
-  // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
-  // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
-  // be a consecutive, in-order subsequence of the operand dimensions.
-  //
-  // Note that collapsing a single dimension does nothing:
-  //
-  //    {256} collapsing {0} => {256}
-  //    {1} collapsing {0} => {1}
-  //
-  // Collapsing multiple dimensions produces a single result dimension:
-  //
-  //    {256, 2} collapsing {0,1} => {512}
-  //    {256, 2, 3} collapsing {0,1} => {512, 3}
-  //
-  // This could potentially cause data to be moved -- it provides a more
-  // structured form of reshaping than an arbitrary Reshape operation.
   XlaOp Collapse(const XlaOp& operand, absl::Span<const int64> dimensions);
 
-  // Enqueues a slice operation onto the computation that slices the operand
-  // from the start indices to the limit indices; e.g.
-  //
-  //        x
-  //   [ 0 1 2 3 ]
-  // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
-  //   [ 8 9 a b ]
-  //
-  // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
-  // range notation.
-  // The strides parameter determines the stride over the slice
   XlaOp Slice(const XlaOp& operand, absl::Span<const int64> start_indices,
               absl::Span<const int64> limit_indices,
               absl::Span<const int64> strides);
 
-  // Enqueues a slice operation in a given dimension, taking all other
-  // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
-  // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
-  // for:
-  //
-  //  array[:, 2:4:1, :]
   XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
                    int64 stride, int64 dimno);
 
-  // Enqueues a slice operation onto the computation that slices the 'operand'
-  // from dynamic start indices which are passed in 'start_indices'.
-  // The size of the slice in each dimension is passed in 'slice_sizes',
-  // which specify the end point of exclusive slice intervals in each
-  // dimension [start, start + size).
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo input dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
   XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                      absl::Span<const int64> slice_sizes);
 
-  // Enqueues a dynamic update slice operation onto the computation, which
-  // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
-  // The shape of 'update' determines the shape of the slice of 'operand'
-  // which is updated.
-  // The indices specified in 'start_indices' specify the offset of the slice
-  // of 'operand' which is updated.
-  //
-  //               update = {10, 11} // calculated at runtime.
-  //   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
-  //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
-  //   [7 8 9]                                                  [7 8  9 ]
-  //
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo update dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
   XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                            const XlaOp& start_indices);
 
-  // Enqueues a concatenate instruction onto the computation. 'operands' must
-  // have >= 1 entry.
   XlaOp ConcatInDim(absl::Span<const XlaOp> operands, int64 dimension);
 
-  // Enqueue a tracing operation onto the computation; the computation will emit
-  // a logging message with the operand.
   void Trace(const string& tag, const XlaOp& operand);
 
-  // Enqueues a conditional-move-like select operation onto the computation;
-  // predicated on pred, selects between on_true and on_false.
   XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
 
-  // Enqueues a tuple-creation instruction onto the computation.
   XlaOp Tuple(absl::Span<const XlaOp> elements);
 
-  // Enqueues a tuple-element-get instruction onto the computation.
   XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
 
-  // Enqueues an equal-to comparison instruction onto the computation.
   XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a not-equal comparison instruction onto the computation.
   XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a greater-or-equal comparison instruction onto the computation.
   XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a greater-than comparison instruction onto the computation.
   XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a less-than comparison instruction onto the computation.
   XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a less-or-equal comparison instruction onto the computation.
   XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a dot instruction onto the computation.
   XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
             const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a general dot instruction onto the computation.
   XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
                    const DotDimensionNumbers& dimension_numbers,
                    const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, which uses the
-  // default convolution dimension numbers.
   XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
              absl::Span<const int64> window_strides, Padding padding,
              int64 feature_group_count = 1,
              const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration in the format returned by MakePadding().
   XlaOp ConvWithGeneralPadding(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides,
@@ -520,8 +397,6 @@ class XlaBuilder {
       int64 feature_group_count = 1,
       const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided dimension numbers configuration.
   XlaOp ConvWithGeneralDimensions(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides, Padding padding,
@@ -529,8 +404,6 @@ class XlaBuilder {
       int64 feature_group_count = 1,
       const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration as well as the dimension numbers.
   XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                     absl::Span<const int64> window_strides,
                     absl::Span<const std::pair<int64, int64>> padding,
@@ -538,8 +411,6 @@ class XlaBuilder {
                     int64 feature_group_count = 1,
                     const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration, dilation factors and dimension numbers.
   XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
                            absl::Span<const int64> window_strides,
                            absl::Span<const std::pair<int64, int64>> padding,
@@ -549,79 +420,53 @@ class XlaBuilder {
                            int64 feature_group_count = 1,
                            const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues an FFT instruction onto the computation, of the given type and
-  // with the given FFT length.
   XlaOp Fft(const XlaOp& operand, FftType fft_type,
             absl::Span<const int64> fft_length);
 
-  // Enqueues an infeed instruction onto the computation, which writes data of
-  // the given shape to the infeed buffer of the device.
   XlaOp Infeed(const Shape& shape, const string& config = "");
   XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
                         const string& config = "");
 
-  // Enqueues an outfeed instruction onto the computation. This instruction
-  // generates outgoing data transfers for the given data.
-  //
-  // shape_with_layout communicates the laid out shape that we want to outfeed
-  // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
-  // will occur.
   void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
                const string& outfeed_config);
   XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
                          const Shape& shape_with_layout,
                          const string& outfeed_config);
 
-  // Enqueues a call instruction onto the computation.
   XlaOp Call(const XlaComputation& computation,
              absl::Span<const XlaOp> operands);
 
-  // Enqueues a custom call instruction onto the computation.
-  XlaOp CustomCall(const string& call_target_name,
-                   absl::Span<const XlaOp> operands, const Shape& shape,
-                   const string& opaque);
+  XlaOp CustomCall(
+      const string& call_target_name, absl::Span<const XlaOp> operands,
+      const Shape& shape_with_layout, const string& opaque,
+      absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
 
-  // The following methods enqueue element-wise binary arithmetic operations
-  // onto the computation. The shapes of the operands have to match unless one
-  // of the operands is a scalar, or an explicit broadcast dimension is given
-  // (see g3doc for more details).
-
-  // Enqueues a complex compose instruction onto the computation.
   XlaOp Complex(const XlaOp& real, const XlaOp& imag,
                 absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a complex conjugate instruction onto the computation.
   XlaOp Conj(const XlaOp& operand);
 
-  // Enqueues an add instruction onto the computation.
   XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a subtract instruction onto the computation.
   XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a multiply instruction onto the computation.
   XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a divide instruction onto the computation.
   XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a remainder instruction onto the computation.
   XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a max instruction onto the computation.
   XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a min instruction onto the computation.
   XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Element-wise logical operators
   XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
@@ -640,81 +485,48 @@ class XlaBuilder {
   XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
                           absl::Span<const int64> broadcast_dimensions = {});
 
-  // Reduces an array among the provided dimensions, given "computation" as a
-  // reduction operator.
   XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
                const XlaComputation& computation,
                absl::Span<const int64> dimensions_to_reduce);
 
-  // Reduces several arrays simultaneously among the provided dimensions, given
-  // "computation" as a reduction operator.
   XlaOp Reduce(absl::Span<const XlaOp> operands,
                absl::Span<const XlaOp> init_values,
                const XlaComputation& computation,
                absl::Span<const int64> dimensions_to_reduce);
 
-  // Convenience wrapper around the above that reduces all the dimensions in the
-  // operand shape.
   XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
                   const XlaComputation& computation);
 
-  // Enqueues a windowed reduce instruction onto the computation.
   XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
                      const XlaComputation& computation,
                      absl::Span<const int64> window_dimensions,
                      absl::Span<const int64> window_strides, Padding padding);
 
-  // As ReduceWindow(), but the padding is given in the format
-  // returned by MakePadding().
   XlaOp ReduceWindowWithGeneralPadding(
       const XlaOp& operand, const XlaOp& init_value,
       const XlaComputation& computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
+      absl::Span<const int64> base_dilations,
+      absl::Span<const int64> window_dilations,
       absl::Span<const std::pair<int64, int64>> padding);
 
-  // Returns the sum of the operand value within each subgroup of replicas. All
-  // replicas supply one input to the sum and all replicas receive the resulting
-  // sum for each subgroup.
   XlaOp CrossReplicaSum(const XlaOp& operand,
                         absl::Span<const ReplicaGroup> replica_groups = {});
 
-  // Enqueues an operation that do an AllReduce of the operand cross cores. Here
-  // AllReduce means doing a reduction on the input operand cross cores and then
-  // broadcasting the reduction result to those cores. The reduction function is
-  // defined by `computation`, which should be a commutative computation on
-  // scalars, e.g., add, min, or max. The way that AllReduce is applied is
-  // configured by:
-  //
-  // - `replica_groups`: each ReplicaGroup contains a list of replica id. If
-  // empty, all replicas belong to one group. Allreduce will be applied within
-  // subgroups. For example, we have 4 replicas, then
-  // replica_groups={{0,2},{1,3}} means, replica 0 and 2 are in subgroup 0,
-  // replica 1 and 3 are in subgroup 1.
-  //
-  // - `channel_id`: for Allreduce nodes from different modules, if they have
-  // the same channel_id, they will be 'Allreduce'd. If empty, Allreduce will
-  // not be applied cross modules.
-  //
-  // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
   XlaOp CrossReplicaSum(
       const XlaOp& operand, const XlaComputation& computation,
       absl::Span<const ReplicaGroup> replica_groups = {},
       const absl::optional<ChannelHandle>& channel_id = absl::nullopt);
 
-  // Enqueues an operation that do an Alltoall of the operand cross cores.
   XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
                  int64 concat_dimension, int64 split_count,
                  const std::vector<ReplicaGroup>& replica_groups);
 
-  // Enqueues an operation that do an CollectivePermute of the operand cross
-  // cores.
   XlaOp CollectivePermute(
       const XlaOp& operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
-  // Enqueues an operation that scatters the `source` array to the selected
-  // indices of each window.
   XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
                          absl::Span<const int64> window_dimensions,
                          absl::Span<const int64> window_strides,
@@ -722,8 +534,6 @@ class XlaBuilder {
                          const XlaOp& init_value,
                          const XlaComputation& scatter);
 
-  // As SelectAndScatter(), but the padding is given in the format
-  // returned by MakePadding().
   XlaOp SelectAndScatterWithGeneralPadding(
       const XlaOp& operand, const XlaComputation& select,
       absl::Span<const int64> window_dimensions,
@@ -731,222 +541,126 @@ class XlaBuilder {
       absl::Span<const std::pair<int64, int64>> padding, const XlaOp& source,
       const XlaOp& init_value, const XlaComputation& scatter);
 
-  // Enqueues an abs instruction onto the computation.
   XlaOp Abs(const XlaOp& operand);
 
-  // Enqueues a atan2 instruction onto the computation.
   XlaOp Atan2(const XlaOp& y, const XlaOp& x,
               absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues an exp instruction onto the computation.
   XlaOp Exp(const XlaOp& operand);
 
-  // Enqueues an expm1 instruction onto the computation.
   XlaOp Expm1(const XlaOp& operand);
 
-  // Enqueues a floor instruction onto the computation.
   XlaOp Floor(const XlaOp& operand);
 
-  // Enqueues a ceil instruction onto the computation.
   XlaOp Ceil(const XlaOp& operand);
 
-  // Enqueues a round instruction onto the computation, rounding to nearest even
-  // with half-way cases rounding away from zero.
   XlaOp Round(const XlaOp& operand);
 
-  // Enqueues an log instruction (natural logarithm) onto the computation.
   XlaOp Log(const XlaOp& operand);
 
-  // Enqueues an log1p instruction (log(x+1)) onto the computation.
   XlaOp Log1p(const XlaOp& operand);
 
-  // Enqueues a sign instruction onto the computation.
   XlaOp Sign(const XlaOp& operand);
 
-  // Enqueues a count leading zeros instruction onto the computation.
   XlaOp Clz(const XlaOp& operand);
 
-  // Enqueues a cosine instruction onto the computation.
   XlaOp Cos(const XlaOp& operand);
 
-  // Enqueues a sine instruction onto the computation.
   XlaOp Sin(const XlaOp& operand);
 
-  // Enqueues a tanh instruction onto the computation.
   XlaOp Tanh(const XlaOp& operand);
 
-  // Enqueues a real-part instruction onto the computation.
   XlaOp Real(const XlaOp& operand);
 
-  // Enqueues an imaginary-part instruction onto the computation.
   XlaOp Imag(const XlaOp& operand);
 
-  // Enqueues a lhs^rhs computation onto the computation.
   XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues an operator that tests if the operand's values are finite, i.e.,
-  // not Inf or NaN. Defined only for floating-point types. Returns an array of
-  // booleans with the same shape where entries are true iff the corresponding
-  // entry was NaN.
   XlaOp IsFinite(const XlaOp& operand);
 
-  // Enqueues an iota operation onto the computation.
   XlaOp Iota(const Shape& shape, int64 iota_dimension);
 
-  // Enqueues a rank-1 iota operation onto the computation.
   XlaOp Iota(PrimitiveType type, int64 size);
 
-  // Enqueues a convert instruction onto the computation that changes the
-  // element type of the operand array to primitive_type.
   XlaOp ConvertElementType(const XlaOp& operand,
                            PrimitiveType new_element_type);
 
-  // Enqueues a no-op instruction onto the computation that changes
-  // the element type of the operand array to primitive_type. The
-  // bit-widths of the source and destination element types must be
-  // identical.
   XlaOp BitcastConvertType(const XlaOp& operand,
                            PrimitiveType new_element_type);
 
-  // Enqueues a negate instruction onto the computation.
   XlaOp Neg(const XlaOp& operand);
 
-  // Enqueues a transpose instruction onto the computation.
   XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation);
 
-  // Enqueues a reverse instruction onto the computation. The order of the
-  // elements in the given dimensions is reversed (i.e., the element at index i
-  // is moved to index dimension_size - 1 - i).
   XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 
-  // Enqueues a sort (as increasing order) instruction onto the computation.
-  // If only keys are provided:
-  // * If the keys are an rank-1 tensor (an array), the result is a sorted array
-  // of keys, in ascending order.
-  // * If the keys have higher rank, the keys are sorted along the provided
-  // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-  // value of 0 will indepenently sort every column, and a dimension value of 1
-  // will independently sort each row. If no dimension number is provided, then
-  // the last dimension is chosen by default.
-  //
-  // If both keys and values are provided:
-  // * The keys and the values must tensors with the same dimensions. The
-  // element types of the tensors may be different.
-  // * The result is a tuple that consists of a sorted tensor of keys (along the
-  // provided dimension, as above) as the first element, and a tensor with their
-  // corresponding values as the second element.
-  XlaOp Sort(XlaOp keys, absl::optional<XlaOp> values = absl::nullopt,
+  XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
              int64 dimension = -1);
 
-  // Enqueues a clamp instruction onto the computation.
   XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
 
-  // Enqueues a map instruction onto the computation.
   XlaOp Map(absl::Span<const XlaOp> operands, const XlaComputation& computation,
             absl::Span<const int64> dimensions,
             absl::Span<const XlaOp> static_operands = {});
 
-  // Enqueues a N(mu, sigma) random number generation instruction onto the
-  // computation.
   XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
 
-  // Enqueues a U(a, b) random number generation instruction onto the
-  // computation. Returns values in the semi-open interval [a, b).
   XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
 
-  // Enqueues a while node onto the computation.
   XlaOp While(const XlaComputation& condition, const XlaComputation& body,
               const XlaOp& init);
 
-  // Enqueues a conditional node onto the computation.
   XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
                     const XlaComputation& true_computation,
                     const XlaOp& false_operand,
                     const XlaComputation& false_computation);
 
-  // Enqueues a ReducePrecision node onto the computation.
   XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                         const int mantissa_bits);
 
-  // Enqueues a Gather node onto the computation.
   XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
                const GatherDimensionNumbers& dimension_numbers,
                absl::Span<const int64> slice_sizes);
 
-  // Enqueues a Scatter node onto the computation.
   XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
                 const XlaOp& updates, const XlaComputation& update_computation,
                 const ScatterDimensionNumbers& dimension_numbers);
 
-  // Enqueues a Send node onto the computation for device-to-device
-  // communication, to send the given operand to a Recv instruction that shares
-  // the same channel handle.
   void Send(const XlaOp& operand, const ChannelHandle& handle);
   XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
                       const ChannelHandle& handle);
 
-  // Enqueues a Send node which sends data to the host.
   XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
                    const Shape& shape_with_layout, const ChannelHandle& handle);
 
-  // Enqueues a Recv node which receives data from the host.
   XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
                      const ChannelHandle& handle);
 
-  // Enqueues an AfterAll operation with no operands producing a token-shaped
-  // value.
   XlaOp CreateToken();
 
-  // Enqueues an AfterAll operation with no operands producing a token-shaped
-  // value.
   XlaOp AfterAll(absl::Span<const XlaOp> tokens);
 
-  // Enqueues a Recv node onto the computation. The data comes from a Send
-  // instruction that shares the same channel handle and its shape must
-  // be the same as the given shape.
   XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
   XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
                       const ChannelHandle& handle);
 
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
-  // is the normalized result and batch_mean and batch_var are the mean and
-  // variance, respectively, across batch for the operand.
   XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
                           const XlaOp& offset, float epsilon,
                           int64 feature_index);
 
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
-  // computing `mean` and `variance` for each batch inside the operation. It
-  // uses the input `mean` and `variance` instead as estimated values. The
-  // purpose of this op is to reduce latency in inference, hence the name
-  // `BatchNormInference`.
-  //
-  // The output has the same shape as `operand`, and contains the normalized
-  // values for each batch.
   XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
                            const XlaOp& offset, const XlaOp& mean,
                            const XlaOp& variance, float epsilon,
                            int64 feature_index);
 
-  // Calculates the gradients of a batch norm op.
-  //
-  // The inputs `batch_mean` and `batch_var` represent the mean and variance
-  // across the batch.
-  //
-  // Returns a tuple of three elements:
-  //   - grad_operand: Gradient with respect to input `operand`
-  //   - grad_offset: Gradient with respect to input `offset`
-  //   - grad_scale: Gradient with respect to input `scale`
   XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
                       const XlaOp& batch_mean, const XlaOp& batch_var,
                       const XlaOp& grad_output, float epsilon,
                       int64 feature_index);
 
+  XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
+
   StatusOr<XlaOp> AddInstruction(HloInstructionProto&& instr, HloOpcode opcode,
                                  absl::Span<const XlaOp> operands = {});
 
@@ -1013,8 +727,14 @@ class XlaBuilder {
                               absl::Span<const int64> lhs_dilation,
                               absl::Span<const int64> rhs_dilation) const;
 
+  int64 GetNextId() { return ++next_id_; }
+
   string name_;  // Name to use for the built computation.
 
+  // The next sequential ID for every instruction/computation contained within
+  // this computation.
+  int64 next_id_ = 0;
+
   // The first error encountered while building the computation.
   // This is OK until the first error is encountered.
   Status first_error_;
@@ -1025,6 +745,9 @@ class XlaBuilder {
   // The instructions of this computation.
   std::vector<HloInstructionProto> instructions_;
 
+  // Dynamic parameter configuration of this computation.
+  DynamicParameterBinding dynamic_parameter_binding_;
+
   // A map from XlaOp::Handle to the index in the instructions_ vector where the
   // instruction is held.
   absl::flat_hash_map<int64, int64> handle_to_index_;
@@ -1102,7 +825,7 @@ class XlaBuilder {
                          absl::Span<const int64> broadcast_sizes);
 
   friend XlaOp BroadcastInDim(
-      const XlaOp& operand, const Shape& shape,
+      const XlaOp& operand, const absl::Span<const int64> out_dim_size,
       const absl::Span<const int64> broadcast_dimensions);
 
   friend XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
@@ -1195,6 +918,10 @@ class XlaBuilder {
   friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
                           absl::Span<const XlaOp> operands, const Shape& shape,
                           const string& opaque);
+  friend XlaOp CustomCallWithLayout(
+      XlaBuilder* builder, const string& call_target_name,
+      absl::Span<const XlaOp> operands, const Shape& shape_with_layout,
+      absl::Span<const Shape> operand_shapes_with_layout, const string& opaque);
   friend XlaOp Complex(const XlaOp& real, const XlaOp& imag,
                        absl::Span<const int64> broadcast_dimensions);
   friend XlaOp Conj(const XlaOp& operand);
@@ -1245,6 +972,8 @@ class XlaBuilder {
       const XlaComputation& computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
+      absl::Span<const int64> base_dilations,
+      absl::Span<const int64> window_dilations,
       absl::Span<const std::pair<int64, int64>> padding);
   friend XlaOp CrossReplicaSum(const XlaOp& operand,
                                absl::Span<const ReplicaGroup> replica_groups);
@@ -1302,7 +1031,8 @@ class XlaBuilder {
   friend XlaOp Transpose(const XlaOp& operand,
                          absl::Span<const int64> permutation);
   friend XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
-  friend XlaOp Sort(XlaOp keys, absl::optional<XlaOp> values, int64 dimension);
+  friend XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
+                    int64 dimension);
   friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
   friend XlaOp Map(XlaBuilder* builder, absl::Span<const XlaOp> operands,
                    const XlaComputation& computation,
@@ -1356,6 +1086,8 @@ class XlaBuilder {
                                 const string& outfeed_config);
   friend XlaOp CreateToken(XlaBuilder* builder);
   friend XlaOp AfterAll(XlaBuilder* builder, absl::Span<const XlaOp> tokens);
+
+  friend XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
 };
 
 // RAII-style object: sets the current sharding assignment in builder on
@@ -1390,6 +1122,7 @@ class XlaScopedShardingAssignment {
 // Free functions for building XlaOps. The intention is that these will
 // become the public API for building XlaOps rather than calling methods on
 // XlaBuilder directly.
+//
 
 // Enqueues a "retrieve parameter value" instruction for a parameter that was
 // passed to the computation.
@@ -1470,24 +1203,23 @@ XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
 //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
 XlaOp Broadcast(const XlaOp& operand, absl::Span<const int64> broadcast_sizes);
 
-// Performs in-dimension-style broadcast.
+// This op broadcasts the `operand` to an output with the given `shape`.
+// `broadcast_dimensions` are the dimensions to be broadcasting into, i.e., the
+// i'th dimension of the operand is mapped to the broadcast_dimensions[i]'th
+// dimension of the output. This also requires that the i'th input dimension is
+// either 1 or is the same as the output dimension it's broadcasting into.
 //
-// Operand specifies the input to be broadcast. "shape" is expected output
-// shape. "broadcast_dimensions" are the dimensions to be broadcasting into.
-// Dimension numbers in broadcast_dimensions map to individual dimensions
-// of the operand, and specify what dimension of the output shape they
-// should be broadcast.
-// e.g.
-// Say operand = [1, 2], i.e., a 1D tensor with 2 elements.
-// and dimension of shape is [2,2].
-// Specifying {1} as brodcast_dimension will generate output
-// [1 , 2]
-// [1 , 2]
-// On the other hand, specifying {0} as broadcast_dimension
-// will generate output
-// [1 , 1]
-// [2 , 2]
-XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape,
+// For example, say operand = {1, 2}, i.e., a 1D tensor in shape s32[2]; the
+// output shape is s32[2,2]:
+// - Specifying {1} as brodcast_dimension will generate output
+//   {{1, 2},
+//    {1, 2}}
+// - On the other hand, specifying {0} as broadcast_dimension
+//   will generate output
+//   {{1 , 1},
+//    {2 , 2}}
+XlaOp BroadcastInDim(const XlaOp& operand,
+                     const absl::Span<const int64> out_dim_size,
                      const absl::Span<const int64> broadcast_dimensions);
 
 // Enqueues a pad operation onto the computation that pads the given value on
@@ -1728,6 +1460,17 @@ XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
                  absl::Span<const XlaOp> operands, const Shape& shape,
                  const string& opaque = "");
 
+// Overload which constructs a custom call with fixed layouts. The operands will
+// have the layouts specified by |operand_shapes_with_layout| when provided to
+// external code, and the external code is expected to produce a result with the
+// layout specified by |shape_with_layout|. All shapes in |shape_with_layout|
+// and |operand_shapes_with_layout| must have layouts.
+XlaOp CustomCallWithLayout(XlaBuilder* builder, const string& call_target_name,
+                           absl::Span<const XlaOp> operands,
+                           const Shape& shape_with_layout,
+                           absl::Span<const Shape> operand_shapes_with_layout,
+                           const string& opaque = "");
+
 // The following methods enqueue element-wise binary arithmetic operations
 // onto the computation. The shapes of the operands have to match unless one
 // of the operands is a scalar, or an explicit broadcast dimension is given
@@ -1818,6 +1561,8 @@ XlaOp ReduceWindowWithGeneralPadding(
     const XlaComputation& computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
+    absl::Span<const int64> base_dilations,
+    absl::Span<const int64> window_dilations,
     absl::Span<const std::pair<int64, int64>> padding);
 
 // Returns the sum of the operand value within each subgroup of replicas. All
@@ -1842,7 +1587,7 @@ XlaOp CrossReplicaSum(const XlaOp& operand,
 // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
 // applied cross modules.
 //
-// TODO(b/79737069): Rename this to AllReduce when it's ready to use.
+// TODO(b/117564385): Rename this to AllReduce when it's ready to use.
 XlaOp CrossReplicaSum(
     const XlaOp& operand, const XlaComputation& computation,
     absl::Span<const ReplicaGroup> replica_groups = {},
@@ -1980,12 +1725,12 @@ XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 // the last dimension is chosen by default.
 //
 // If both keys and values are provided:
-// * The keys and the values must tensors with the same dimensions. The
+// * The keys and all values must be tensors with the same dimensions. The
 // element types of the tensors may be different.
 // * The result is a tuple that consists of a sorted tensor of keys (along the
-// provided dimension, as above) as the first element, and a tensor with their
-// corresponding values as the second element.
-XlaOp Sort(XlaOp keys, absl::optional<XlaOp> values = absl::nullopt,
+// provided dimension, as above) as the first element, and tensors with their
+// corresponding values as the other elements.
+XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
            int64 dimension = -1);
 
 // Enqueues a clamp instruction onto the computation.
@@ -2119,7 +1864,12 @@ XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
                     const XlaOp& grad_output, float epsilon,
                     int64 feature_index);
 
+// Returns the size of the given dimension of the operand. The operand must be
+// array shaped.
+XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
+
 // Implementation details below this point.
+//
 
 template <typename NativeT>
 XlaOp XlaBuilder::ConstantR0(NativeT value) {
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index 7c37ed00cd3dcc214fb0b36c0161d3c39a5bf8c8..b3f5be300d3f15397ad33858a6a9cab5f6029688 100644
--- a/tensorflow/compiler/xla/client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -43,7 +43,7 @@ class XlaBuilderTest : public ::testing::Test {
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
-                            proto, legacy_flags::GetDebugOptionsFromFlags()));
+                            proto, GetDebugOptionsFromFlags()));
     return HloModule::CreateFromProto(proto, config);
   }
 
@@ -54,7 +54,7 @@ class XlaBuilderTest : public ::testing::Test {
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
-                            proto, legacy_flags::GetDebugOptionsFromFlags()));
+                            proto, GetDebugOptionsFromFlags()));
     return HloModule::CreateFromProto(proto, config);
   }
 
@@ -264,6 +264,26 @@ TEST_F(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) {
                             op::Broadcast(op::Reshape(op::Parameter(1)))));
 }
 
+TEST_F(XlaBuilderTest, BroadcastInDim) {
+  XlaBuilder b(TestName());
+  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3}), "x");
+  BroadcastInDim(x, {2, 4, 3},
+                 /*broadcast_dimensions=*/{0, 2});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Broadcast());
+}
+
+TEST_F(XlaBuilderTest, BroadcastInDimWithDegeneratedDim) {
+  XlaBuilder b(TestName());
+  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 1, 4}), "x");
+  BroadcastInDim(x, {2, 3, 4},
+                 /*broadcast_dimensions=*/{0, 1, 2});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Broadcast(op::Reshape(op::Broadcast())));
+}
+
 TEST_F(XlaBuilderTest, OperandFromWrongBuilder) {
   XlaBuilder b1("b1");
   auto p0 = Parameter(&b1, 0, ShapeUtil::MakeShape(F32, {}), "p0");
@@ -329,6 +349,15 @@ TEST_F(XlaBuilderTest, CollectivePermute) {
   EXPECT_EQ(root->opcode(), HloOpcode::kCollectivePermute);
 }
 
+TEST_F(XlaBuilderTest, GetDimensionSize) {
+  XlaBuilder b(TestName());
+  auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
+  GetDimensionSize(x, 1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kGetDimensionSize);
+}
+
 TEST_F(XlaBuilderTest, ReportError) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}), "x");
@@ -396,5 +425,35 @@ TEST_F(XlaBuilderTest, BuildWithSpecificRootWithWrongBuilder) {
       ::testing::HasSubstr("root operation is not in this computation"));
 }
 
+TEST_F(XlaBuilderTest, ProtoMatches) {
+  std::vector<XlaComputation> computations;
+  for (int i = 0; i < 2; ++i) {
+    XlaBuilder b_call("the_only_to_apply");
+    auto p0 = Parameter(&b_call, 0, ShapeUtil::MakeShape(F32, {}), "p0");
+    auto p1 = Parameter(&b_call, 1, ShapeUtil::MakeShape(F32, {}), "p1");
+    Add(p0, Add(p1, p0));
+    TF_ASSERT_OK_AND_ASSIGN(auto call, b_call.Build());
+    XlaBuilder b(TestName());
+    auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x");
+    auto y = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {}), "y");
+    auto one = ConstantR0<float>(&b, 1);
+    auto two = ConstantR0<float>(&b, 2);
+    Add(Call(&b, call, {x, y}), Call(&b, call, {one, two}));
+    computations.push_back(b.Build().ValueOrDie());
+  }
+  auto c0_string = computations[0].proto().SerializeAsString();
+  auto c1_string = computations[1].proto().SerializeAsString();
+  EXPECT_EQ(c0_string, c1_string);
+}
+
+TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) {
+  XlaBuilder b(TestName());
+  AfterAll(&b, {CreateToken(&b), ConstantR0<float>(&b, 1.0)});
+  Status status = b.Build().status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("All operands to AfterAll must be tokens"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_computation.cc
index 22c9e83bb2ae9e3e205bdd480b64c703e31c6ffd..f317892c12529b2ee8a81788f6bbcae3b3d6489d 100644
--- a/tensorflow/compiler/xla/client/xla_computation.cc
+++ b/tensorflow/compiler/xla/client/xla_computation.cc
@@ -24,8 +24,8 @@ limitations under the License.
 namespace xla {
 
 StatusOr<ProgramShape> XlaComputation::GetProgramShape() const {
-  TF_RET_CHECK(proto_.has_program_shape());
-  return proto_.program_shape();
+  TF_RET_CHECK(proto_.has_host_program_shape());
+  return ProgramShape(proto_.host_program_shape());
 }
 
 StatusOr<std::unique_ptr<HloSnapshot>> XlaComputation::Snapshot() const {
diff --git a/tensorflow/compiler/xla/client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h
index 71598ef8b296a760b0ee818fce0a59aed5cfc6b4..3ccbfb28bd0c5939ee40878e9cc298688882ac62 100644
--- a/tensorflow/compiler/xla/client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_computation.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
similarity index 93%
rename from tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
rename to tensorflow/compiler/xla/debug_options_flags.cc
index 3ed3afcfcede20fbf5c7d4f004378817febeb4c7..20609cad58d920c0c272899c41efeb99d23cd490 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -13,17 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 
 #include <mutex>  // NOLINT(build/c++11): only using std::call_once, not mutex.
 #include <vector>
 #include "absl/strings/str_split.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
+#include "tensorflow/compiler/xla/debug_options_parsers.h"
+#include "tensorflow/compiler/xla/parse_flags_from_env.h"
 
 namespace xla {
-namespace legacy_flags {
-
 namespace {
 
 DebugOptions* flag_values;
@@ -56,7 +54,7 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   // TODO(jlebar): Disable fastmath once doing so is not a performance
   // regression.
   flags->set_xla_cpu_enable_fast_math(true);
-  flags->set_xla_gpu_enable_fast_math(true);
+  flags->set_xla_gpu_enable_fast_min_max(true);
 
   flags->set_xla_force_host_platform_device_count(1);
 }
@@ -101,8 +99,8 @@ void AllocateFlags() {
       [](string comma_separated_values) {
         auto* extra_options_map =
             flag_values->mutable_xla_backend_extra_options();
-        impl::parse_xla_backend_extra_options(extra_options_map,
-                                              comma_separated_values);
+        parse_xla_backend_extra_options(extra_options_map,
+                                        comma_separated_values);
         return true;
       };
 
@@ -111,8 +109,8 @@ void AllocateFlags() {
       [](string reduce_precision_option_value) {
         HloReducePrecisionOptions* option_proto =
             flag_values->add_hlo_reduce_precision_options();
-        return impl::parse_xla_reduce_precision_option(
-            option_proto, reduce_precision_option_value);
+        return parse_xla_reduce_precision_option(option_proto,
+                                                 reduce_precision_option_value);
       };
 
   flag_objects = new std::vector<tensorflow::Flag>({
@@ -162,11 +160,11 @@ void AllocateFlags() {
           "Enable unsafe fast-math optimizations in the CPU compiler; "
           "this may produce faster code at the expense of some accuracy."),
       tensorflow::Flag(
-          "xla_gpu_enable_fast_math",
-          bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math),
-          flag_values->xla_cpu_enable_fast_math(),
-          "Enable unsafe fast-math optimizations in the GPU compiler; "
-          "this may produce faster code at the expense of some accuracy."),
+          "xla_gpu_enable_fast_min_max",
+          bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max),
+          flag_values->xla_gpu_enable_fast_min_max(),
+          "Enable fast floating point min/max lowering that does not propagate "
+          "NaNs."),
       tensorflow::Flag(
           "xla_llvm_enable_alias_scope_metadata",
           bool_setter_for(
@@ -336,8 +334,14 @@ void AllocateFlags() {
           "overhead from context switching but we let the user override this "
           "behavior to help run tests on the host that run models in parallel "
           "across multiple devices."),
+      tensorflow::Flag(
+          "xla_gpu_disable_ptxas_optimizations",
+          bool_setter_for(
+              &DebugOptions::set_xla_gpu_disable_ptxas_optimizations),
+          flag_values->xla_gpu_disable_ptxas_optimizations(),
+          "In XLA:GPU run ptxas in -O0 (default is -O3)."),
   });
-  ParseFlagsFromEnv(*flag_objects);
+  ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }
 
 }  // namespace
@@ -353,5 +357,4 @@ xla::DebugOptions GetDebugOptionsFromFlags() {
   return *flag_values;
 }
 
-}  // namespace legacy_flags
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h b/tensorflow/compiler/xla/debug_options_flags.h
similarity index 81%
rename from tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
rename to tensorflow/compiler/xla/debug_options_flags.h
index b53157f59c61cf4e0850e006ad3656f4be63a936..60e59abc2a2e0f1cce3de1afc928f9fe36f75b33 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.h
+++ b/tensorflow/compiler/xla/debug_options_flags.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_FLAGS_H_
+#define TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_FLAGS_H_
 
 #include <vector>
 
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace xla {
-namespace legacy_flags {
 
 // Appends flag definitions for debug options to flag_list.
 void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list);
@@ -32,7 +31,6 @@ void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list);
 // first.
 xla::DebugOptions GetDebugOptionsFromFlags();
 
-}  // namespace legacy_flags
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_FLAGS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_FLAGS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h b/tensorflow/compiler/xla/debug_options_parsers.h
similarity index 94%
rename from tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
rename to tensorflow/compiler/xla/debug_options_parsers.h
index ee7eb019c07cf898e48886955b18710146644cac..80aadfd5ece0e768afaf1842d2b6c5b11c288b55 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h
+++ b/tensorflow/compiler/xla/debug_options_parsers.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#ifndef TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_PARSERS_H_
+#define TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_PARSERS_H_
 
 #include <vector>
 #include "absl/strings/numbers.h"
@@ -23,8 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla.pb.h"
 
 namespace xla {
-namespace legacy_flags {
-namespace impl {
 
 template <typename T>
 void parse_xla_backend_extra_options(T* extra_options_map,
@@ -140,8 +138,6 @@ inline bool parse_xla_reduce_precision_option(
   return true;
 }
 
-}  // namespace impl
-}  // namespace legacy_flags
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_DEBUG_OPTIONS_PARSERS_H_
+#endif  // TENSORFLOW_COMPILER_XLA_DEBUG_OPTIONS_PARSERS_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc b/tensorflow/compiler/xla/debug_options_parsers_test.cc
similarity index 88%
rename from tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc
rename to tensorflow/compiler/xla/debug_options_parsers_test.cc
index 6f197aec53c7596e84437a03affa9118f22f5a1d..8003c3496d5df9be2ff8a99bc171972c8e090c43 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_parsers_test.cc
+++ b/tensorflow/compiler/xla/debug_options_parsers_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // Test for parse_flags_from_env.cc
 
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_parsers.h"
+#include "tensorflow/compiler/xla/debug_options_parsers.h"
 
 #include <unordered_map>
 #include <vector>
@@ -23,13 +23,12 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
-namespace legacy_flags {
 
 // Test that the xla_backend_extra_options flag is parsed correctly.
 TEST(DebugOptionsFlags, ParseXlaBackendExtraOptions) {
   std::unordered_map<string, string> test_map;
   string test_string = "aa=bb,cc,dd=,ee=ff=gg";
-  impl::parse_xla_backend_extra_options(&test_map, test_string);
+  parse_xla_backend_extra_options(&test_map, test_string);
   EXPECT_EQ(test_map.size(), 4);
   EXPECT_EQ(test_map.at("aa"), "bb");
   EXPECT_EQ(test_map.at("cc"), "");
@@ -41,7 +40,7 @@ TEST(DebugOptionsFlags, ParseXlaBackendExtraOptions) {
 TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStrings) {
   HloReducePrecisionOptions proto;
   string test_string = "OP_OUTPUTS=5,10:add,dot";
-  EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string));
+  EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string));
   EXPECT_EQ(proto.location(), HloReducePrecisionOptions::OP_OUTPUTS);
   EXPECT_EQ(proto.exponent_bits(), 5);
   EXPECT_EQ(proto.mantissa_bits(), 10);
@@ -56,7 +55,7 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStrings) {
 TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStringsSemicolon) {
   HloReducePrecisionOptions proto;
   string test_string = "OP_OUTPUTS=5,10:add,dot;";
-  EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string));
+  EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string));
   EXPECT_EQ(proto.location(), HloReducePrecisionOptions::OP_OUTPUTS);
   EXPECT_EQ(proto.exponent_bits(), 5);
   EXPECT_EQ(proto.mantissa_bits(), 10);
@@ -71,7 +70,7 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoStringsSemicolon) {
 TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoOpcodes) {
   HloReducePrecisionOptions proto;
   string test_string = "UNFUSED_OP_OUTPUTS=5,10:;foo,bar/baz";
-  EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string));
+  EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string));
   EXPECT_EQ(proto.location(), HloReducePrecisionOptions::UNFUSED_OP_OUTPUTS);
   EXPECT_EQ(proto.exponent_bits(), 5);
   EXPECT_EQ(proto.mantissa_bits(), 10);
@@ -84,7 +83,7 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionNoOpcodes) {
 TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionBoth) {
   HloReducePrecisionOptions proto;
   string test_string = "UNFUSED_OP_OUTPUTS=5,10:subtract;foo,bar/baz";
-  EXPECT_TRUE(impl::parse_xla_reduce_precision_option(&proto, test_string));
+  EXPECT_TRUE(parse_xla_reduce_precision_option(&proto, test_string));
   EXPECT_EQ(proto.location(), HloReducePrecisionOptions::UNFUSED_OP_OUTPUTS);
   EXPECT_EQ(proto.exponent_bits(), 5);
   EXPECT_EQ(proto.mantissa_bits(), 10);
@@ -96,7 +95,6 @@ TEST(DebugOptionsFlags, ParseXlaReducePrecisionOptionBoth) {
   EXPECT_EQ(proto.opname_substrings_to_suffix(1), "bar/baz");
 }
 
-}  // namespace legacy_flags
 }  // namespace xla
 
 int main(int argc, char* argv[]) {
diff --git a/tensorflow/compiler/xla/execution_options_util.cc b/tensorflow/compiler/xla/execution_options_util.cc
index e83ff7cddd675197c7f6d7018257edb4c25b6228..cf569863bbe1c92bdcafb133d49dcf5ae8890ffe 100644
--- a/tensorflow/compiler/xla/execution_options_util.cc
+++ b/tensorflow/compiler/xla/execution_options_util.cc
@@ -13,14 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/execution_options_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 
 namespace xla {
 
 ExecutionOptions CreateDefaultExecutionOptions() {
   ExecutionOptions execution_options;
-  *(execution_options.mutable_debug_options()) =
-      legacy_flags::GetDebugOptionsFromFlags();
+  *(execution_options.mutable_debug_options()) = GetDebugOptionsFromFlags();
   return execution_options;
 }
 
diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
index fb135f5ceda67ce6c001de15b8f3f084ca164826..1fea816a803bfb75b9721393cef8c4dfc249268d 100644
--- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
+++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
@@ -18,12 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
 import numpy as _np  # Avoids becoming a part of public Tensorflow API.
 
 from tensorflow.compiler.xla import xla_data_pb2
-from tensorflow.compiler.xla.python_api import xla_shape
 from tensorflow.core.framework import attr_value_pb2
 
 
@@ -64,22 +61,18 @@ class Sharding(object):
             tile_assignment_devices=[core]))
 
   @classmethod
-  def tile(cls, tile_shape, tile_assignment):
+  def tile(cls, tile_assignment):
     """Returns a Tiled sharding attribute.
 
     This causes an op to be partially computed on multiple cores in the
     XLA device.
 
     Args:
-      tile_shape: A xla_shape.Shape describing the tile shape that each core
-        will compute.
-        The tile shape does not need to be divisible by the tile assignment.
       tile_assignment: An np.ndarray describing the topology of the tiling and
         which device will compute which part of the topology.
 
     Raises:
-      TypeError: tile_assignment was not of np.array type or tile_shape was
-         not of xla_shape.Shape type.
+      TypeError: tile_assignment was not of np.array type.
 
     TODO(jmolloy): This concept is nefarious and is not
     something we really want to expose to users (especially as the
@@ -87,14 +80,11 @@ class Sharding(object):
     """
     if not isinstance(tile_assignment, _np.ndarray):
       raise TypeError('Tile assignment must be of type np.ndarray')
-    if not isinstance(tile_shape, xla_shape.Shape):
-      raise TypeError('Tile shape must be of type xla_shape.Shape')
     dims = list(tile_assignment.shape)
     flattened_devices = tile_assignment.reshape(-1, order='C')
     return Sharding(
         proto=xla_data_pb2.OpSharding(
             type=xla_data_pb2.OpSharding.OTHER,
-            tile_shape=tile_shape.message,
             tile_assignment_dimensions=dims,
             tile_assignment_devices=list(flattened_devices)))
 
@@ -118,14 +108,8 @@ class Sharding(object):
     shape = tensor.shape.as_list()
     if shape[split_dimension] < num_devices:
       raise ValueError('Split dimension was smaller than the required number '
-                       'of splits: shape=%r, dimension=%r, num_devices=%r',
-                       shape, split_dimension, num_devices)
-
-    tile_shape = shape
-    tile_shape[split_dimension] = int(
-        math.ceil(tile_shape[split_dimension] / num_devices))
-    tile_shape_proto = xla_data_pb2.Shape(
-        element_type=xla_data_pb2.F32, dimensions=tile_shape)
+                       'of splits: shape=%r, dimension=%r, num_devices=%r' %
+                       (shape, split_dimension, num_devices))
 
     tile_assignment_dims = [1] * len(shape)
     tile_assignment_dims[split_dimension] = num_devices
@@ -133,7 +117,6 @@ class Sharding(object):
     return Sharding(
         proto=xla_data_pb2.OpSharding(
             type=xla_data_pb2.OpSharding.OTHER,
-            tile_shape=tile_shape_proto,
             tile_assignment_dimensions=tile_assignment_dims,
             tile_assignment_devices=range(num_devices)))
 
@@ -149,7 +132,6 @@ class Sharding(object):
           type=xla_data_pb2.OpSharding.TUPLE, tuple_shardings=tuple_shardings)
     else:
       proto = self._proto
-
     attr_value = attr_value_pb2.AttrValue(s=proto.SerializeToString())
     # TODO(jmolloy): This need to be seriously revisited before declaring this
     # API available for public use.
@@ -194,8 +176,8 @@ def assign_device(tensor, device):
   return tensor
 
 
-def tile(tensor, tile_shape, tile_assignment):
-  Sharding.tile(tile_shape, tile_assignment).apply_to_tensor(tensor)
+def tile(tensor, tile_assignment):
+  Sharding.tile(tile_assignment).apply_to_tensor(tensor)
   return tensor
 
 
diff --git a/tensorflow/compiler/xla/g3doc/README.md b/tensorflow/compiler/xla/g3doc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6643bf0aab3078ff24c86b81de69216355da69a1
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/README.md
@@ -0,0 +1,3 @@
+# XLA: Accelerated Linear Algebra
+
+These are the docs for: https://www.tensorflow.org/xla
diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..267701e9c0e42a21d2cda6238520f6a9692e7e76
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/_book.yaml
@@ -0,0 +1,35 @@
+upper_tabs:
+# Tabs left of dropdown menu
+- include: /_upper_tabs_left.yaml
+- include: /api_docs/_upper_tabs_api.yaml
+# Dropdown menu
+- name: Resources
+  path: /resources
+  is_default: true
+  menu:
+  - include: /resources/_menu_toc.yaml
+  lower_tabs:
+    # Subsite tabs
+    other:
+    - name: Guide & Tutorials
+      contents:
+      - title: XLA overview
+        path: /xla/overview
+      - title: Broadcasting semantics
+        path: /xla/broadcasting
+      - title: Developing a new backend for XLA
+        path: /xla/developing_new_backend
+      - title: Using JIT compilation
+        path: /xla/jit
+      - title: Operation semantics
+        path: /xla/operation_semantics
+      - title: Shapes and layout
+        path: /xla/shapes
+      - title: Using AOT compilation
+        path: /xla/tfcompile
+      - heading: Tutorials
+      - title: XLA compile API
+        path: /xla/tutorials/xla_compile
+        status: experimental
+
+- include: /_upper_tabs_right.yaml
diff --git a/tensorflow/compiler/xla/g3doc/_index.yaml b/tensorflow/compiler/xla/g3doc/_index.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..858de427119bfcfa82d0b1158776bf269129fd92
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/_index.yaml
@@ -0,0 +1,35 @@
+book_path: /xla/_book.yaml
+project_path: /xla/_project.yaml
+description: <!--no description-->
+landing_page:
+  custom_css_path: /site-assets/css/style.css
+  rows:
+  - heading: XLA is a compiler that optimizes TensorFlow computations.
+    items:
+    - classname: devsite-landing-row-50
+      description: >
+        XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear
+        algebra that optimizes TensorFlow computations. The results are
+        improvements in speed, memory usage, and portability on server and mobile
+        platforms. The XLA framework is experimental and in active development.
+        For details, read the <a href="./overview">XLA guide</a>.
+
+  - classname: devsite-landing-row-cards
+    items:
+    - heading: XLA - TensorFlow, compiled
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: https://developers.googleblog.com/2017/03/xla-tensorflow-compiled.html
+      buttons:
+      - label: Read on Google Developers blog
+        path: https://developers.googleblog.com/2017/03/xla-tensorflow-compiled.html
+    - heading: XLA at the Dev Summit
+      youtube_id: kAOanJczHA0
+      buttons:
+      - label: Watch the video
+        path: https://www.youtube.com/watch?v=kAOanJczHA0
+    - heading: XLA on GitHub
+      image_path: /resources/images/github-card-16x9.png
+      path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla
+      buttons:
+      - label: View on GitHub
+        path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla
diff --git a/tensorflow/compiler/xla/g3doc/_project.yaml b/tensorflow/compiler/xla/g3doc/_project.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..33d8bdb27a664d9e282d1d65c007ebf5838b196a
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/_project.yaml
@@ -0,0 +1,10 @@
+name: XLA
+breadcrumb_name: XLA
+home_url: /xla/
+parent_project_metadata_path: /_project.yaml
+description: >
+  XLA is a compiler-based linear algebra execution engine.
+use_site_branding: true
+hide_from_products_list: true
+content_license: cc3-apache2
+buganizer_id: 171704
diff --git a/tensorflow/compiler/xla/g3doc/broadcasting.md b/tensorflow/compiler/xla/g3doc/broadcasting.md
new file mode 100644
index 0000000000000000000000000000000000000000..2870869a2cef13a9105b9dc9fa4d657834288f86
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/broadcasting.md
@@ -0,0 +1,204 @@
+# Broadcasting semantics
+
+This document describes how the broadcasting semantics in XLA work.
+
+## What is broadcasting?
+
+Broadcasting is the process of making arrays with different shapes have
+compatible shapes for arithmetic operations. The terminology is borrowed from
+Numpy
+[broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+
+Broadcasting may be required for operations between multi-dimensional arrays of
+different ranks, or between multi-dimensional arrays with different but
+compatible shapes. Consider the addition `X+v` where `X` is a matrix (an array
+of rank 2) and `v` is a vector (an array of rank 1). To perform element-wise
+addition, XLA needs to "broadcast" the vector `v` to the same rank as the
+matrix `X`, by replicating `v` a certain number of times. The vector's length
+has to match at least one of the dimensions of the matrix.
+
+For example:
+
+    |1 2 3| + |7 8 9|
+    |4 5 6|
+
+The matrix's dimensions are (2,3), the vector's are (3). The vector is broadcast
+by replicating it over rows to get:
+
+    |1 2 3| + |7 8 9| = |8  10 12|
+    |4 5 6|   |7 8 9|   |11 13 15|
+
+In Numpy, this is called
+[broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+
+## Principles
+
+The XLA language is as strict and explicit as possible, avoiding implicit and
+"magical" features. Such features may make some computations slightly easier to
+define, at the cost of more assumptions baked into user code that will be
+difficult to change in the long term. If necessary, implicit and magical
+features can be added in client-level wrappers.
+
+In regards to broadcasting, explicit broadcasting specifications on operations
+between arrays of different ranks is required. This is different from Numpy,
+which infers the specification when possible.
+
+## Broadcasting a lower-rank array onto a higher-rank array
+
+*Scalars* can always be broadcast over arrays without an explicit specification
+of broadcasting dimensions. An element-wise binary operation between a scalar
+and an array means applying the operation with the scalar for each element in
+the array. For example, adding a scalar to a matrix means producing a matrix
+each element of which is a sum of the scalar with the corresponding input
+matrix's element.
+
+    |1 2 3| + 7 = |8  9  10|
+    |4 5 6|       |11 12 13|
+
+Most broadcasting needs can be captured by using a tuple of dimensions on a
+binary operation. When the inputs to the operation have different ranks, this
+broadcasting tuple specifies which dimension(s) in the **higher-rank** array to
+match with the **lower-rank** array.
+
+Consider the previous example, instead of adding a scalar to a (2,3) matrix, add
+a vector of dimension (3) to a matrix of dimensions (2,3). *Without specifying
+broadcasting, this operation is invalid.* To correctly request matrix-vector
+addition, specify the broadcasting dimension to be (1), meaning the vector's
+dimension is matched to dimension 1 of the matrix. In 2D, if dimension 0 is
+considered as rows and dimension 1 as columns, this means that each element of
+the vector becomes a column of a size matching the number of rows in the matrix:
+
+    |7 8 9| ==> |7 8 9|
+                |7 8 9|
+
+As a more complex example, consider adding a 3-element vector (dimension (3)) to
+a 3x3 matrix (dimensions (3,3)). There are two ways broadcasting can happen for
+this example:
+
+(1) A broadcasting dimension of 1 can be used. Each vector element becomes a
+column and the vector is duplicated for each row in the matrix.
+
+    |7 8 9| ==> |7 8 9|
+                |7 8 9|
+                |7 8 9|
+
+(2) A broadcasting dimension of 0 can be used. Each vector element becomes a row
+and the vector is duplicated for each column in the matrix.
+
+     |7| ==> |7 7 7|
+     |8|     |8 8 8|
+     |9|     |9 9 9|
+
+> Note: when adding a 2x3 matrix to a 3-element vector, a broadcasting dimension
+> of 0 is invalid.
+
+The broadcasting dimensions can be a tuple that describes how a smaller rank
+shape is broadcast into a larger rank shape. For example, given a 2x3x4 cuboid
+and a 3x4 matrix, a broadcasting tuple (1,2) means matching the matrix to
+dimensions 1 and 2 of the cuboid.
+
+This type of broadcast is used in the binary ops in `XlaBuilder`, if the
+`broadcast_dimensions` argument is given. For example, see
+[XlaBuilder::Add](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.cc).
+In the XLA source code, this type of broadcasting is sometimes called "InDim"
+broadcasting.
+
+### Formal definition
+
+The broadcasting attribute allows matching a lower-rank array to a higher-rank
+array, by specifying which dimensions of the higher-rank array to match. For
+example, for an array with dimensions MxNxPxQ, a vector with dimension T can be
+matched as follows:
+
+              MxNxPxQ
+
+    dim 3:          T
+    dim 2:        T
+    dim 1:      T
+    dim 0:    T
+
+In each case, T has to be equal to the matching dimension of the higher-rank
+array. The vector's values are then broadcast from the matched dimension to all
+the other dimensions.
+
+To match a TxV matrix onto the MxNxPxQ array, a pair of broadcasting dimensions
+are used:
+
+              MxNxPxQ
+    dim 2,3:      T V
+    dim 1,2:    T V
+    dim 0,3:  T     V
+    etc...
+
+The order of dimensions in the broadcasting tuple has to be the order in which
+the lower-rank array's dimensions are expected to match the higher-rank array's
+dimensions. The first element in the tuple says which dimension in the
+higher-rank array has to match dimension 0 in the lower-rank array. The second
+element for dimension 1, and so on. The order of broadcast dimensions has to be
+strictly increasing. For example, in the previous example it is illegal to match
+V to N and T to P; it is also illegal to match V to both P and N.
+
+## Broadcasting similar-rank arrays with degenerate dimensions
+
+A related broadcasting problem is broadcasting two arrays that have the same
+rank but different dimension sizes. Similarly to Numpy's rules, this is only
+possible when the arrays are *compatible*. Two arrays are compatible when all
+their dimensions are compatible. Two dimensions are compatible if:
+
+*   They are equal, or
+*   One of them is 1 (a "degenerate" dimension)
+
+When two compatible arrays are encountered, the result shape has the maximum
+among the two inputs at every dimension index.
+
+Examples:
+
+1.  (2,1) and (2,3) broadcast to (2,3).
+2.  (1,2,5) and (7,2,5) broadcast to (7,2,5)
+3.  (7,2,5) and (7,1,5) broadcast to (7,2,5)
+4.  (7,2,5) and (7,2,6) are incompatible and cannot be broadcast.
+
+A special case arises, and is also supported, where each of the input arrays has
+a degenerate dimension at a different index. In this case, the result is an
+"outer operation": (2,1) and (1,3) broadcast to (2,3). For more examples,
+consult the
+[Numpy documentation on broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+
+## Broadcast composition
+
+Broadcasting of a lower-rank array to a higher-rank array **and** broadcasting
+using degenerate dimensions can both be performed in the same binary operation.
+For example, a vector of size 4 and an matrix of size 1x2 can be added together
+using broadcast dimensions value of (0):
+
+    |1 2 3 4| + [5 6]    // [5 6] is a 1x2 matrix, not a vector.
+
+First the vector is broadcast up to rank 2 (matrix) using the broadcast
+dimensions. The single value (0) in the broadcast dimensions indicates that
+dimension zero of the vector matches to dimension zero of the matrix. This
+produces an matrix of size 4xM where the value M is chosen to match the
+corresponding dimension size in the 1x2 array. Therefore, a 4x2 matrix is
+produced:
+
+    |1 1| + [5 6]
+    |2 2|
+    |3 3|
+    |4 4|
+
+Then "degenerate dimension broadcasting" broadcasts dimension zero of the 1x2
+matrix to match the corresponding dimension size of the right hand side:
+
+    |1 1| + |5 6|     |6  7|
+    |2 2| + |5 6|  =  |7  8|
+    |3 3| + |5 6|     |8  9|
+    |4 4| + |5 6|     |9 10|
+
+A more complicated example is a matrix of size 1x2 added to an array of size
+4x3x1 using broadcast dimensions of (1, 2). First the 1x2 matrix is broadcast up
+to rank 3 using the broadcast dimensions to produces an intermediate Mx1x2 array
+where the dimension size M is determined by the size of the larger operand (the
+4x3x1 array) producing a 4x1x2 intermediate array. The M is at dimension 0
+(left-most dimension) because the dimensions 1 and 2 are mapped to the
+dimensions of the original 1x2 matrix as the broadcast dimension are (1, 2).
+This intermediate array can be added to the 4x3x1 matrix using broadcasting of
+degenerate dimensions to produce a 4x3x2 array result.
diff --git a/tensorflow/compiler/xla/g3doc/developing_new_backend.md b/tensorflow/compiler/xla/g3doc/developing_new_backend.md
new file mode 100644
index 0000000000000000000000000000000000000000..5ede7f523131cf715575074b8e27487be5ea77c6
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/developing_new_backend.md
@@ -0,0 +1,76 @@
+# Developing a new backend for XLA
+
+This preliminary guide is for early adopters that want to easily retarget
+TensorFlow to their hardware in an efficient manner. The guide is not
+step-by-step and assumes knowledge of [LLVM](http://llvm.org),
+[Bazel](https://bazel.build/), and TensorFlow.
+
+XLA provides an abstract interface that a new architecture or accelerator can
+implement to create a backend to run TensorFlow graphs. Retargeting XLA should
+be significantly simpler and scalable than implementing every existing
+TensorFlow Op for new hardware.
+
+Most implementations will fall into one of the following scenarios:
+
+1.  Existing CPU architecture not yet officially supported by XLA, with or
+    without an existing [LLVM](http://llvm.org) backend.
+2.  Non-CPU-like hardware with an existing LLVM backend.
+3.  Non-CPU-like hardware without an existing LLVM backend.
+
+> Note: An LLVM backend can mean either one of the officially released LLVM
+> backends or a custom LLVM backend developed in-house.
+
+## Scenario 1: Existing CPU architecture not yet officially supported by XLA
+
+In this scenario, start by looking at the existing
+[XLA CPU backend](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/cpu/).
+XLA makes it easy to retarget TensorFlow to different CPUs by using LLVM, since
+the main difference between XLA backends for CPUs is the code generated by LLVM.
+Google tests XLA for x64 and ARM64 architectures.
+
+If the hardware vendor has an LLVM backend for their hardware, it is simple to
+link the backend with the LLVM built with XLA. In JIT mode, the XLA CPU backend
+emits code for the host CPU. For ahead-of-time compilation,
+[`xla::AotCompilationOptions`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h)
+can provide an LLVM triple to configure the target architecture.
+
+If there is no existing LLVM backend but another kind of code generator exists,
+it should be possible to reuse most of the existing CPU backend.
+
+## Scenario 2: Non-CPU-like hardware with an existing LLVM backend
+
+It is possible to model a new
+[`xla::Compiler`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h)
+implementation on the existing
+[`xla::CPUCompiler`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc)
+and [`xla::GPUCompiler`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc)
+classes, since these already emit LLVM IR. Depending on the nature of the
+hardware, it is possible that many of the LLVM IR generation aspects will have
+to be changed, but a lot of code can be shared with the existing backends.
+
+A good example to follow is the
+[GPU backend](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/gpu/)
+of XLA. The GPU backend targets a non-CPU-like ISA, and therefore some aspects
+of its code generation are unique to the GPU domain. Other kinds of hardware,
+e.g. DSPs like Hexagon (which has an upstream LLVM backend), can reuse parts of
+the LLVM IR emission logic, but other parts will be unique.
+
+## Scenario 3: Non-CPU-like hardware without an existing LLVM backend
+
+If it is not possible to utilize LLVM, then the best option is to implement a
+new backend for XLA for the desired hardware. This option requires the most
+effort. The classes that need to be implemented are as follows:
+
+*   [`StreamExecutor`](https://www.tensorflow.org/code/tensorflow/stream_executor/stream_executor.h):
+    For many devices not all methods of `StreamExecutor` are needed. See
+    existing `StreamExecutor` implementations for details.
+*   [`xla::Compiler`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h):
+    This class encapsulates the compilation of an HLO computation into an
+    `xla::Executable`.
+*   [`xla::Executable`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/executable.h):
+    This class is used to launch a compiled computation on the platform.
+*   [`xla::TransferManager`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/transfer_manager.h):
+    This class enables backends to provide platform-specific mechanisms for
+    constructing XLA literal data from given device memory handles. In other
+    words, it helps encapsulate the transfer of data from the host to the device
+    and back.
diff --git a/tensorflow/compiler/xla/g3doc/images/how-does-xla-work.png b/tensorflow/compiler/xla/g3doc/images/how-does-xla-work.png
new file mode 100644
index 0000000000000000000000000000000000000000..15f86c3221d3637f2087a2db9f4cb008fe2690fa
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/how-does-xla-work.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/jit_cpu_xla_graph.png b/tensorflow/compiler/xla/g3doc/images/jit_cpu_xla_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e2dc091fee1d13ae659988b1a68505e9ff77b27
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/jit_cpu_xla_graph.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/jit_gpu_xla_graph.png b/tensorflow/compiler/xla/g3doc/images/jit_gpu_xla_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..39d7c90c4fc3d707df062562fcf9ebdc37344af0
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/jit_gpu_xla_graph.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu.png b/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..a38f636983b527b678f17d3b0c92646ac1485f86
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu_xla.png b/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu_xla.png
new file mode 100644
index 0000000000000000000000000000000000000000..285c3a96d5aa33605cab2486522a5e815901a2fc
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/jit_timeline_cpu_xla.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu.png b/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..488fc2c2f1009706b7e2c5ded154f47e2b7f4bcb
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu_xla.png b/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu_xla.png
new file mode 100644
index 0000000000000000000000000000000000000000..d0df38cf18197f89224cc0f5ff643dd537d03fcc
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/jit_timeline_gpu_xla.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_2d_matrix.png b/tensorflow/compiler/xla/g3doc/images/ops_2d_matrix.png
new file mode 100644
index 0000000000000000000000000000000000000000..4846d1700607ced60dd3b8038996894d4dd0f8af
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_2d_matrix.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_alltoall.png b/tensorflow/compiler/xla/g3doc/images/ops_alltoall.png
new file mode 100644
index 0000000000000000000000000000000000000000..c8150bda5bd6fb5723832a5e42e71c12cee3d399
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_alltoall.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_concatenate.png b/tensorflow/compiler/xla/g3doc/images/ops_concatenate.png
new file mode 100644
index 0000000000000000000000000000000000000000..26ded3d88c07205dd6eceef2d2ee151b4e390977
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_concatenate.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_pad.png b/tensorflow/compiler/xla/g3doc/images/ops_pad.png
new file mode 100644
index 0000000000000000000000000000000000000000..dc1948a627a88721d44bd22027ab75540f61feda
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_pad.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_reduce_from_2d_matrix.png b/tensorflow/compiler/xla/g3doc/images/ops_reduce_from_2d_matrix.png
new file mode 100644
index 0000000000000000000000000000000000000000..c2ff037ab5c6ad7b2b2157339f189cff3b16df09
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_reduce_from_2d_matrix.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_reduce_from_3d_matrix.png b/tensorflow/compiler/xla/g3doc/images/ops_reduce_from_3d_matrix.png
new file mode 100644
index 0000000000000000000000000000000000000000..ebeeca093b2dda7fc5871e53302bce0e73e670be
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_reduce_from_3d_matrix.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_reduce_window.png b/tensorflow/compiler/xla/g3doc/images/ops_reduce_window.png
new file mode 100644
index 0000000000000000000000000000000000000000..e9cdc3d148ab4ebb46bef8af84724134eae75d55
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_reduce_window.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_reduce_window_stride.png b/tensorflow/compiler/xla/g3doc/images/ops_reduce_window_stride.png
new file mode 100644
index 0000000000000000000000000000000000000000..f1ef5270dbac9f4ca1eb884e5cb27fd57a02ba8e
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_reduce_window_stride.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_scatter_to_selected_window_element.png b/tensorflow/compiler/xla/g3doc/images/ops_scatter_to_selected_window_element.png
new file mode 100644
index 0000000000000000000000000000000000000000..4a82afaefab42d837a46178ba9aef3a1b6ddc434
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_scatter_to_selected_window_element.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_while.png b/tensorflow/compiler/xla/g3doc/images/ops_while.png
new file mode 100644
index 0000000000000000000000000000000000000000..da32b553eb0226bfb1122c236dfefe151758b9fa
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/ops_while.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_xla_gather_0.svg b/tensorflow/compiler/xla/g3doc/images/ops_xla_gather_0.svg
new file mode 100644
index 0000000000000000000000000000000000000000..7d324aa35bd92aeef7bc2987eaf346f1c3aa0966
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/images/ops_xla_gather_0.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 526.0157480314961 464.99212598425197" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l526.01575 0l0 464.99213l-526.01575 0l0 -464.99213z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l526.01575 0l0 464.99213l-526.01575 0z" fill-rule="evenodd"/><g filter="url(#shadowFilter-p.1)"><use xlink:href="#p.1" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.1" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.1"><path fill="#fff2cc" d="m218.29134 20.871391l293.88977 0l0 427.33856l-293.88977 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m218.29134 20.871391l293.88977 0l0 427.33856l-293.88977 0z" fill-rule="evenodd"/></g><path fill="#cfe2f3" d="m31.918634 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m31.918634 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path fill="#000000" d="m61.48428 74.78799l0 -1.578125l5.656246 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.3749962 0.515625 -2.8437462 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.6249962 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.2656212 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.0937462 -0.453125 1.6562462 -0.953125l0 -2.484375l-3.9218712 0z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m148.45407 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m148.45407 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path fill="#000000" d="m167.77994 73.52236q0 -3.328125 1.78125 -5.203125q1.78125 -1.890625 4.609375 -1.890625q1.84375 0 3.328125 0.890625q1.484375 0.875 2.265625 2.46875q0.78125 1.578125 0.78125 3.578125q0 2.03125 -0.828125 3.640625q-0.8125 1.59375 -2.3125 2.421875q-1.5 0.828125 -3.25 0.828125q-1.875 0 -3.359375 -0.90625q-1.484375 -0.921875 -2.25 -2.5q-0.765625 -1.578125 -0.765625 -3.328125zm1.8125 0.015625q0 2.421875 1.296875 3.8125q1.296875 1.390625 3.265625 1.390625q2.0 0 3.28125 -1.40625q1.28125 -1.40625 1.28125 -3.984375q0 -1.625 -0.546875 -2.84375q-0.546875 -1.21875 -1.609375 -1.875q-1.0625 -0.671875 -2.375 -0.671875q-1.890625 0 -3.25 1.296875q-1.34375 1.28125 -1.34375 4.28125z" fill-rule="nonzero"/><path fill="#000000" d="m186.02574 87.02236l-1.09375 0l0 -6.96875q-0.40625 0.375 -1.046875 0.75q-0.640625 0.375 -1.140625 0.578125l0 -1.0625q0.90625 -0.4375 1.59375 -1.046875q0.6875 -0.609375 0.96875 -1.1875l0.71875 0l0 8.9375z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m90.18635 48.0l58.267723 0l0 50.204727l-58.267723 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m90.18635 48.0l58.267723 0l0 50.204727l-58.267723 0z" fill-rule="evenodd"/><path fill="#000000" d="m109.51222 73.52236q0 -3.328125 1.78125 -5.203125q1.78125 -1.890625 4.609375 -1.890625q1.84375 0 3.328125 0.890625q1.484375 0.875 2.265625 2.46875q0.78125 1.578125 0.78125 3.578125q0 2.03125 -0.828125 3.640625q-0.8125 1.59375 -2.3125 2.421875q-1.5 0.828125 -3.25 0.828125q-1.875 0 -3.359375 -0.90625q-1.484375 -0.921875 -2.25 -2.5q-0.765625 -1.578125 -0.765625 -3.328125zm1.8125 0.015625q0 2.421875 1.296875 3.8125q1.296875 1.390625 3.265625 1.390625q2.0 0 3.28125 -1.40625q1.28125 -1.40625 1.28125 -3.984375q0 -1.625 -0.546875 -2.84375q-0.546875 -1.21875 -1.609375 -1.875q-1.0625 -0.671875 -2.375 -0.671875q-1.890625 0 -3.25 1.296875q-1.34375 1.28125 -1.34375 4.28125z" fill-rule="nonzero"/><path fill="#000000" d="m123.633026 82.63174q0 -1.578125 0.328125 -2.53125q0.328125 -0.96875 0.96875 -1.484375q0.640625 -0.53125 1.609375 -0.53125q0.71875 0 1.25 0.296875q0.546875 0.28125 0.890625 0.828125q0.359375 0.53125 0.5625 1.3125q0.203125 0.78125 0.203125 2.109375q0 1.5625 -0.328125 2.53125q-0.328125 0.953125 -0.96875 1.484375q-0.640625 0.53125 -1.609375 0.53125q-1.296875 0 -2.03125 -0.921875q-0.875 -1.109375 -0.875 -3.625zm1.125 0q0 2.1875 0.515625 2.921875q0.515625 0.71875 1.265625 0.71875q0.75 0 1.265625 -0.71875q0.515625 -0.734375 0.515625 -2.921875q0 -2.203125 -0.515625 -2.921875q-0.515625 -0.71875 -1.28125 -0.71875q-0.75 0 -1.203125 0.640625q-0.5625 0.8125 -0.5625 3.0z" fill-rule="nonzero"/><path fill="#f4cccc" d="m276.83203 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m337.11548 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m357.20996 79.03937l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 79.03937l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m377.30447 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m397.39896 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m417.49344 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m437.58792 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 79.03937l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 79.03937l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m337.11548 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m357.20996 99.13386l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 99.13386l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m377.30447 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m397.39896 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m417.49344 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m437.58792 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 99.13386l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 99.13386l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m337.11548 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m357.20996 119.22835l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 119.22835l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m377.30447 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m397.39896 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m417.49344 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m437.58792 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 119.22835l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 119.22835l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m337.11548 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m357.20996 139.32283l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 139.32283l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m377.30447 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m397.39896 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m417.49344 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m437.58792 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 139.32283l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 139.32283l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m337.11548 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m357.20996 159.41733l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 159.41733l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m377.30447 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m397.39896 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m417.49344 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m437.58792 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 159.41733l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 159.41733l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m337.11548 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m357.20996 179.51181l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 179.51181l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m377.30447 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m397.39896 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m417.49344 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m437.58792 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 179.51181l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 179.51181l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m276.83203 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m296.9265 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m317.021 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m337.11548 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m357.20996 199.6063l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 199.6063l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m377.30447 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m397.39896 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m417.49344 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m437.58792 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 199.6063l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 199.6063l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m337.11548 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m357.20996 219.70079l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 219.70079l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m377.30447 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m397.39896 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m417.49344 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m437.58792 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 219.70079l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 219.70079l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m337.11548 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m357.20996 239.79527l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 239.79527l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m377.30447 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m397.39896 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m417.49344 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m437.58792 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 239.79527l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 239.79527l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m337.11548 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m357.20996 259.88977l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 259.88977l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m377.30447 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#cc0000" d="m397.39896 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m417.49344 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m437.58792 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 259.88977l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 259.88977l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m337.11548 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m357.20996 279.98425l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 279.98425l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m377.30447 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m397.39896 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m417.49344 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m437.58792 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 279.98425l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 279.98425l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m337.11548 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m357.20996 300.07874l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 300.07874l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m377.30447 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m397.39896 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m417.49344 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m437.58792 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 300.07874l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 300.07874l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m337.11548 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m357.20996 320.17322l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 320.17322l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m377.30447 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m397.39896 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m417.49344 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m437.58792 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 320.17322l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 320.17322l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m337.11548 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m357.20996 340.26773l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 340.26773l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m377.30447 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m397.39896 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m417.49344 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m437.58792 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 340.26773l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 340.26773l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m337.11548 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m357.20996 360.3622l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 360.3622l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m377.30447 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m397.39896 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m417.49344 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m437.58792 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 360.3622l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 360.3622l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m276.83203 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.83203 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m296.9265 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m296.9265 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m317.021 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m317.021 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m337.11548 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m337.11548 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m357.20996 380.4567l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m357.20996 380.4567l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m377.30447 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m377.30447 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m397.39896 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m397.39896 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m417.49344 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m417.49344 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m437.58792 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m437.58792 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m457.6824 380.4567l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m457.6824 380.4567l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m477.77692 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m477.77692 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m31.918634 3.9055119l129.98425 0l0 44.09449l-129.98425 0z" fill-rule="evenodd"/><path fill="#000000" d="m41.824886 24.32551q0 -3.328125 1.78125 -5.203125q1.78125 -1.890625 4.609375 -1.890625q1.84375 0 3.328125 0.890625q1.484375 0.875 2.265625 2.46875q0.78125 1.578125 0.78125 3.578125q0 2.03125 -0.828125 3.640625q-0.8125 1.59375 -2.3125 2.421875q-1.5 0.828125 -3.25 0.828125q-1.875 0 -3.359375 -0.90625q-1.484375 -0.921875 -2.25 -2.5q-0.765625 -1.578125 -0.765625 -3.328125zm1.8125 0.015625q0 2.421875 1.296875 3.8125q1.296875 1.390625 3.265625 1.390625q2.0 0 3.28125 -1.40625q1.28125 -1.40625 1.28125 -3.984375q0 -1.625 -0.546875 -2.84375q-0.546875 -1.21875 -1.609375 -1.875q-1.0625 -0.671875 -2.375 -0.671875q-1.890625 0 -3.25 1.296875q-1.34375 1.28125 -1.34375 4.28125zm19.355179 6.484375l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.6406212 0l0 9.671875l-1.4687462 0zm7.6257133 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 5.171873l0 -13.374998l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703123l-1.640625 0zm1.484375 -8.484373q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm15.219467 4.78125l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.640625 0l0 9.671875l-1.46875 0zm7.625717 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm7.2884827 1.46875l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683304 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm16.641342 0l0 -1.21875q-0.90625 1.4375 -2.703125 1.4375q-1.15625 0 -2.125 -0.640625q-0.96875 -0.640625 -1.5 -1.78125q-0.53125 -1.140625 -0.53125 -2.625q0 -1.453125 0.484375 -2.625q0.484375 -1.1875 1.4375 -1.8125q0.96875 -0.625 2.171875 -0.625q0.875 0 1.546875 0.375q0.6875 0.359375 1.109375 0.953125l0 -4.796875l1.640625 0l0 13.359375l-1.53125 0zm-5.171875 -4.828125q0 1.859375 0.78125 2.78125q0.78125 0.921875 1.84375 0.921875q1.078125 0 1.828125 -0.875q0.75 -0.890625 0.75 -2.6875q0 -1.984375 -0.765625 -2.90625q-0.765625 -0.9375 -1.890625 -0.9375q-1.078125 0 -1.8125 0.890625q-0.734375 0.890625 -0.734375 2.8125zm15.906975 1.71875l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm8.047592 5.765625l3.53125 -5.03125l-3.265625 -4.640625l2.046875 0l1.484375 2.265625q0.421875 0.640625 0.671875 1.078125q0.40625 -0.59375 0.734375 -1.0625l1.640625 -2.28125l1.953125 0l-3.34375 4.546875l3.59375 5.125l-2.015625 0l-1.984375 -3.0l-0.515625 -0.8125l-2.546875 3.8125l-1.984375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m277.17847 405.98425l220.34647 0l0 26.519684l-220.34647 0z" fill-rule="evenodd"/><path fill="#000000" d="m287.91284 432.90424l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683319 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm10.375702 3.703125l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm15.219482 4.78125l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.640625 0l0 9.671875l-1.46875 0zm7.625702 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm10.397858 1.46875l0 -11.78125l-4.40625 0l0 -1.578125l10.578125 0l0 1.578125l-4.40625 0l0 11.78125l-1.765625 0zm14.411621 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.141327 5.765625l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm9.719482 -2.890625l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm9.375 -1.953125q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm9.281952 4.84375l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m127.268135 168.35612l0.06298828 44.220474" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m127.268135 168.35612l0.06298828 44.220474" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m407.4462 79.03937l0 321.5118" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m407.4462 79.03937l0 321.5118" fill-rule="evenodd"/><g filter="url(#shadowFilter-p.2)"><use xlink:href="#p.2" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.2" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.2"><path fill="#000000" fill-opacity="0.0" d="m276.83203 269.937l221.03937 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m276.83203 269.937l221.03937 0" fill-rule="evenodd"/></g><path fill="#000000" fill-opacity="0.0" d="m337.11548 189.55905l60.283478 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m343.11548 189.55905l48.283478 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m343.11548 187.90732l-4.538086 1.6517334l4.538086 1.6517334z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m391.39896 191.21078l4.538086 -1.6517334l-4.538086 -1.6517334z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m327.06824 199.6063l0 60.283478" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m327.06824 205.6063l0 48.283478" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m328.71997 205.6063l-1.6517334 -4.538086l-1.6517334 4.538086z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m325.4165 253.88977l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m256.73752 79.03937l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m256.73752 99.13386l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m256.73752 119.22835l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m259.2362 138.94489l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m256.73752 179.13387l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m266.7848 79.03937l0 120.18898" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m266.7848 85.03937l0 108.18898" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m268.43652 85.03937l-1.6517334 -4.5380936l-1.6517334 4.5380936z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m265.13306 193.22835l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.83203 69.37008l60.283447 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.83203 69.37008l48.283447 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m282.83203 67.718346l-4.5381165 1.6517334l4.5381165 1.6517334z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m331.11548 71.02181l4.5381165 -1.6517334l-4.5381165 -1.6517334z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m347.79266 155.0105l38.929108 0l0 28.755905l-38.929108 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m347.79266 155.0105l38.929108 0l0 28.755905l-38.929108 0z" fill-rule="evenodd"/><path fill="#000000" d="m360.24643 169.54782q0 -2.375 1.28125 -3.71875q1.28125 -1.34375 3.296875 -1.34375q1.3125 0 2.375 0.625q1.0625 0.625 1.609375 1.765625q0.5625 1.125 0.5625 2.5625q0 1.4375 -0.59375 2.59375q-0.578125 1.140625 -1.65625 1.734375q-1.0625 0.578125 -2.3125 0.578125q-1.34375 0 -2.40625 -0.640625q-1.0625 -0.65625 -1.609375 -1.78125q-0.546875 -1.125 -0.546875 -2.375zm1.3125 0.015625q0 1.71875 0.921875 2.71875q0.921875 0.984375 2.328125 0.984375q1.421875 0 2.34375 -1.0q0.921875 -1.0 0.921875 -2.84375q0 -1.15625 -0.40625 -2.03125q-0.390625 -0.875 -1.15625 -1.34375q-0.75 -0.484375 -1.6875 -0.484375q-1.34375 0 -2.3125 0.921875q-0.953125 0.921875 -0.953125 3.078125z" fill-rule="nonzero"/><path fill="#000000" d="m373.28537 179.18845l-0.78125 0l0 -4.96875q-0.28125 0.265625 -0.75 0.53125q-0.453125 0.265625 -0.8125 0.40625l0 -0.75q0.65625 -0.3125 1.140625 -0.75q0.484375 -0.4375 0.6875 -0.84375l0.515625 0l0 6.375z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m279.33072 215.37007l38.92914 0l0 28.75592l-38.92914 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m279.33072 215.37007l38.92914 0l0 28.75592l-38.92914 0z" fill-rule="evenodd"/><path fill="#000000" d="m291.7845 229.90741q0 -2.375 1.28125 -3.71875q1.28125 -1.34375 3.296875 -1.34375q1.3125 0 2.375 0.625q1.0625 0.625 1.609375 1.765625q0.5625 1.125 0.5625 2.5625q0 1.4375 -0.59375 2.59375q-0.578125 1.140625 -1.65625 1.734375q-1.0625 0.578125 -2.3125 0.578125q-1.34375 0 -2.40625 -0.640625q-1.0625 -0.65625 -1.609375 -1.78125q-0.546875 -1.125 -0.546875 -2.375zm1.3125 0.015625q0 1.71875 0.921875 2.71875q0.921875 0.984375 2.328125 0.984375q1.421875 0 2.34375 -1.0q0.921875 -1.0 0.921875 -2.84375q0 -1.15625 -0.40625 -2.03125q-0.390625 -0.875 -1.15625 -1.34375q-0.75 -0.484375 -1.6875 -0.484375q-1.34375 0 -2.3125 0.921875q-0.953125 0.921875 -0.953125 3.078125z" fill-rule="nonzero"/><path fill="#000000" d="m301.88593 236.40741q0 -1.125 0.21875 -1.8125q0.234375 -0.6875 0.6875 -1.046875q0.46875 -0.375 1.15625 -0.375q0.515625 0 0.890625 0.203125q0.390625 0.203125 0.640625 0.59375q0.25 0.390625 0.390625 0.953125q0.15625 0.546875 0.15625 1.484375q0 1.125 -0.234375 1.8125q-0.234375 0.6875 -0.6875 1.0625q-0.453125 0.375 -1.15625 0.375q-0.921875 0 -1.4375 -0.65625q-0.625 -0.796875 -0.625 -2.59375zm0.796875 0q0 1.578125 0.359375 2.09375q0.375 0.515625 0.90625 0.515625q0.546875 0 0.90625 -0.515625q0.359375 -0.53125 0.359375 -2.09375q0 -1.5625 -0.359375 -2.078125q-0.359375 -0.515625 -0.90625 -0.515625q-0.546875 0 -0.859375 0.453125q-0.40625 0.578125 -0.40625 2.140625z" fill-rule="nonzero"/><path fill="#8e7cc3" d="m230.86093 123.25984l29.228363 0l0 26.519684l-29.228363 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m230.86093 123.25984l29.228363 0l0 26.519684l-29.228363 0z" fill-rule="evenodd"/><path fill="#000000" d="m241.09273 141.31969l3.6875 -4.96875l-3.25 -4.578125l1.5 0l1.734375 2.453125q0.53125 0.765625 0.765625 1.171875q0.3125 -0.515625 0.75 -1.09375l1.921875 -2.53125l1.375 0l-3.359375 4.5l3.625 5.046875l-1.5625 0l-2.40625 -3.40625q-0.203125 -0.296875 -0.421875 -0.640625q-0.3125 0.53125 -0.453125 0.71875l-2.390625 3.328125l-1.515625 0z" fill-rule="nonzero"/><path fill="#8e7cc3" d="m292.35962 35.527557l29.228363 0l0 26.519688l-29.228363 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m292.35962 35.527557l29.228363 0l0 26.519688l-29.228363 0z" fill-rule="evenodd"/><path fill="#000000" d="m306.24768 53.5874l0 -4.046875l-3.6875 -5.5l1.546875 0l1.875 2.875q0.515625 0.8125 0.96875 1.625q0.4375 -0.75 1.046875 -1.6875l1.84375 -2.8125l1.46875 0l-3.796875 5.5l0 4.046875l-1.265625 0z" fill-rule="nonzero"/><path fill="#d9d2e9" d="m89.2511 288.5451l76.09448 0l0 40.06299l-76.09448 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m89.2511 288.5451l76.09448 0l0 40.06299l-76.09448 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m89.219604 130.46635l76.09448 0l0 40.06299l-76.09448 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m89.219604 130.46635l76.09448 0l0 40.06299l-76.09448 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m56.479004 320.40945l141.63779 0l0 36.283447l-141.63779 0z" fill-rule="evenodd"/><path fill="#000000" d="m73.166504 342.09506l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0zm14.370804 4.046875q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm7.781967 3.390625l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 1.46875l0 -13.359375l1.640625 0l0 4.796875q1.140625 -1.328125 2.890625 -1.328125q1.078125 0 1.859375 0.421875q0.796875 0.421875 1.140625 1.171875q0.34375 0.75 0.34375 2.171875l0 6.125l-1.640625 0l0 -6.125q0 -1.234375 -0.53125 -1.796875q-0.53125 -0.5625 -1.515625 -0.5625q-0.71875 0 -1.359375 0.390625q-0.640625 0.375 -0.921875 1.015625q-0.265625 0.640625 -0.265625 1.78125l0 5.296875l-1.640625 0zm17.000717 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm11.911606 0l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683304 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm16.641342 0l0 -1.21875q-0.90625 1.4375 -2.703125 1.4375q-1.15625 0 -2.125 -0.640625q-0.96875 -0.640625 -1.5 -1.78125q-0.53125 -1.140625 -0.53125 -2.625q0 -1.453125 0.484375 -2.625q0.484375 -1.1875 1.4375 -1.8125q0.96875 -0.625 2.171875 -0.625q0.875 0 1.546875 0.375q0.6875 0.359375 1.109375 0.953125l0 -4.796875l1.640625 0l0 13.359375l-1.53125 0zm-5.171875 -4.828125q0 1.859375 0.78125 2.78125q0.78125 0.921875 1.84375 0.921875q1.078125 0 1.828125 -0.875q0.75 -0.890625 0.75 -2.6875q0 -1.984375 -0.765625 -2.90625q-0.765625 -0.9375 -1.890625 -0.9375q-1.078125 0 -1.8125 0.890625q-0.734375 0.890625 -0.734375 2.8125zm9.281967 -6.640625l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm10.457321 -3.546875l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.578125 0.515625 -2.75q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.8125q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125zm9.640625 0.4375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm8.485092 2.875l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125z" fill-rule="nonzero"/><path fill="#d9d2e9" d="m89.22223 168.35612l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m89.22223 168.35612l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m89.22223 248.48286l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m89.22223 248.48286l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path fill="#8e7cc3" d="m89.222626 208.41995l38.045906 0l0 40.06337l-38.045906 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m89.222626 208.41995l38.045906 0l0 40.06337l-38.045906 0z" fill-rule="evenodd"/><path fill="#000000" d="m102.10192 235.37163l5.171875 -6.953125l-4.5625 -6.40625l2.109375 0l2.421875 3.4375q0.75 1.0625 1.078125 1.625q0.4375 -0.71875 1.046875 -1.515625l2.6875 -3.546875l1.921875 0l-4.6875 6.296875l5.0625 7.0625l-2.1875 0l-3.359375 -4.765625q-0.28125 -0.40625 -0.59375 -0.890625q-0.4375 0.734375 -0.625 1.0l-3.359375 4.65625l-2.125 0z" fill-rule="nonzero"/><path fill="#8e7cc3" d="m127.25937 208.4357l38.04724 0l0 40.06299l-38.04724 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m127.25937 208.4357l38.04724 0l0 40.06299l-38.04724 0z" fill-rule="evenodd"/><path fill="#000000" d="m145.26433 235.38719l0 -5.65625l-5.15625 -7.703125l2.15625 0l2.640625 4.03125q0.71875 1.125 1.34375 2.265625q0.609375 -1.046875 1.46875 -2.359375l2.578125 -3.9375l2.0625 0l-5.328125 7.703125l0 5.65625l-1.765625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m127.266846 130.46635l0.03149414 198.14174" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m127.266846 130.46635l0.03149414 198.14174" fill-rule="evenodd"/><path fill="#ffffff" d="m399.69815 290.2896l7.748047 -7.748047l7.7480164 7.748047l-3.873993 0l0 12.346436l-7.748047 0l0 -12.346436z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m399.69815 290.2896l7.748047 -7.748047l7.7480164 7.748047l-3.873993 0l0 12.346436l-7.748047 0l0 -12.346436z" fill-rule="evenodd"/><path fill="#ffffff" d="m354.67453 305.63605l105.543304 0l0 20.094513l-105.543304 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m354.67453 305.63605l105.543304 0l0 20.094513l-105.543304 0z" fill-rule="evenodd"/><path fill="#000000" d="m364.92453 320.4833l0 -9.546875l1.25 0l0 9.546875l-1.25 0zm3.328003 0l0 -6.90625l1.0625 0l0 0.984375q0.75 -1.140625 2.1875 -1.140625q0.625 0 1.15625 0.21875q0.53125 0.21875 0.78125 0.59375q0.265625 0.359375 0.375 0.859375q0.0625 0.328125 0.0625 1.140625l0 4.25l-1.171875 0l0 -4.203125q0 -0.71875 -0.140625 -1.0625q-0.140625 -0.359375 -0.484375 -0.5625q-0.34375 -0.21875 -0.8125 -0.21875q-0.75 0 -1.296875 0.46875q-0.546875 0.46875 -0.546875 1.796875l0 3.78125l-1.171875 0zm7.4124756 2.65625l0 -9.5625l1.078125 0l0 0.890625q0.375 -0.53125 0.84375 -0.78125q0.484375 -0.265625 1.15625 -0.265625q0.875 0 1.546875 0.453125q0.6875 0.453125 1.03125 1.28125q0.34375 0.828125 0.34375 1.828125q0 1.046875 -0.375 1.90625q-0.375 0.84375 -1.109375 1.296875q-0.71875 0.453125 -1.53125 0.453125q-0.578125 0 -1.046875 -0.25q-0.46875 -0.25 -0.765625 -0.625l0 3.375l-1.171875 0zm1.0625 -6.078125q0 1.34375 0.53125 1.984375q0.546875 0.625 1.3125 0.625q0.78125 0 1.34375 -0.65625q0.5625 -0.65625 0.5625 -2.046875q0 -1.3125 -0.546875 -1.96875q-0.546875 -0.671875 -1.296875 -0.671875q-0.75 0 -1.328125 0.703125q-0.578125 0.703125 -0.578125 2.03125zm10.881226 3.421875l0 -1.015625q-0.8125 1.171875 -2.1875 1.171875q-0.609375 0 -1.140625 -0.234375q-0.53125 -0.234375 -0.796875 -0.578125q-0.25 -0.359375 -0.359375 -0.875q-0.0625 -0.34375 -0.0625 -1.09375l0 -4.28125l1.171875 0l0 3.828125q0 0.921875 0.0625 1.234375q0.109375 0.46875 0.46875 0.734375q0.359375 0.25 0.890625 0.25q0.515625 0 0.984375 -0.265625q0.46875 -0.265625 0.65625 -0.734375q0.1875 -0.46875 0.1875 -1.34375l0 -3.703125l1.171875 0l0 6.90625l-1.046875 0zm5.4437256 -1.046875l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.0153503 1.046875l0 -9.546875l6.90625 0l0 1.125l-5.640625 0l0 2.921875l5.28125 0l0 1.125l-5.28125 0l0 3.25l5.859375 0l0 1.125l-7.125 0zm8.70224 0l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm7.71109 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm15.8368225 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.0625 0l0 0.984375q0.75 -1.140625 2.1875 -1.140625q0.625 0 1.15625 0.21875q0.53125 0.21875 0.78125 0.59375q0.265625 0.359375 0.375 0.859375q0.0625 0.328125 0.0625 1.140625l0 4.25l-1.171875 0l0 -4.203125q0 -0.71875 -0.140625 -1.0625q-0.140625 -0.359375 -0.484375 -0.5625q-0.34375 -0.21875 -0.8125 -0.21875q-0.75 0 -1.296875 0.46875q-0.546875 0.46875 -0.546875 1.796875l0 3.78125l-1.171875 0zm9.974976 -1.046875l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m79.14611 130.41124l0 77.95276" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m79.14611 136.41124l0 65.95274" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m80.797844 136.41124l-1.6517334 -4.538101l-1.6517334 4.538101z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m77.49438 202.36398l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m42.311485 156.79002l29.228348 0l0 25.196854l-29.228348 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m42.311485 156.79002l29.228348 0l0 25.196854l-29.228348 0z" fill-rule="evenodd"/><path fill="#000000" d="m57.242134 170.45407l0 -1.125l4.03125 -0.015625l0 3.546875q-0.921875 0.75 -1.921875 1.125q-0.984375 0.359375 -2.03125 0.359375q-1.40625 0 -2.5625 -0.59375q-1.140625 -0.609375 -1.734375 -1.734375q-0.578125 -1.140625 -0.578125 -2.546875q0 -1.40625 0.578125 -2.609375q0.59375 -1.203125 1.6875 -1.78125q1.09375 -0.59375 2.515625 -0.59375q1.03125 0 1.859375 0.34375q0.84375 0.328125 1.3125 0.9375q0.484375 0.59375 0.734375 1.546875l-1.140625 0.3125q-0.21875 -0.71875 -0.53125 -1.140625q-0.3125 -0.421875 -0.90625 -0.671875q-0.59375 -0.25 -1.3125 -0.25q-0.875 0 -1.515625 0.265625q-0.625 0.265625 -1.015625 0.703125q-0.375 0.421875 -0.59375 0.9375q-0.359375 0.875 -0.359375 1.921875q0 1.265625 0.4375 2.125q0.4375 0.859375 1.265625 1.28125q0.84375 0.421875 1.796875 0.421875q0.8125 0 1.59375 -0.3125q0.78125 -0.328125 1.1875 -0.6875l0 -1.765625l-2.796875 0z" fill-rule="nonzero"/></g></svg>
\ No newline at end of file
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_xla_gather_1.svg b/tensorflow/compiler/xla/g3doc/images/ops_xla_gather_1.svg
new file mode 100644
index 0000000000000000000000000000000000000000..f460b923f0efa5594a25251ba308f0fe4b9bf786
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/images/ops_xla_gather_1.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 699.7979002624672 457.90288713910763" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l699.7979 0l0 457.9029l-699.7979 0l0 -457.9029z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l699.7979 0l0 457.9029l-699.7979 0z" fill-rule="evenodd"/><g filter="url(#shadowFilter-p.1)"><use xlink:href="#p.1" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.1" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.1"><path fill="#fff2cc" d="m385.85303 8.472441l300.063 0l0 434.3937l-300.063 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m385.85303 8.472441l300.063 0l0 434.3937l-300.063 0z" fill-rule="evenodd"/></g><path fill="#f4cccc" d="m444.83203 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m505.11548 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m525.20996 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m545.30444 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m565.3989 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m585.4934 79.03937l20.094543 0l0 20.09449l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 79.03937l20.094543 0l0 20.09449l-20.094543 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m605.58795 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m505.11548 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m525.20996 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m545.30444 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m565.3989 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m585.4934 99.13386l20.094543 0l0 20.09449l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 99.13386l20.094543 0l0 20.09449l-20.094543 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m605.58795 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 79.03937l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 99.13386l20.094482 0l0 20.09449l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m505.11548 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m525.20996 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m545.30444 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m565.3989 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m585.4934 119.22835l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 119.22835l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m605.58795 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m505.11548 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m525.20996 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m545.30444 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m565.3989 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m585.4934 139.32283l20.094543 0l0 20.094498l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 139.32283l20.094543 0l0 20.094498l-20.094543 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m605.58795 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 119.22835l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 139.32283l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m505.11548 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m525.20996 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m545.30444 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m565.3989 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m585.4934 159.41733l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 159.41733l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m605.58795 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m505.11548 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m525.20996 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m545.30444 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m565.3989 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m585.4934 179.51181l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 179.51181l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m605.58795 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 159.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 179.51181l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m444.83203 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m464.9265 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#e6b8af" d="m485.021 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m505.11548 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m525.20996 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m545.30444 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m565.3989 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m585.4934 199.6063l20.094543 0l0 20.094498l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 199.6063l20.094543 0l0 20.094498l-20.094543 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m605.58795 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m505.11548 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m525.20996 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m545.30444 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m565.3989 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m585.4934 219.70079l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 219.70079l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m605.58795 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 199.6063l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 219.70079l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m505.11548 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m525.20996 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m545.30444 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m565.3989 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m585.4934 239.79527l20.094543 0l0 20.094498l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 239.79527l20.094543 0l0 20.094498l-20.094543 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m605.58795 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m505.11548 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m525.20996 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m545.30444 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#cc0000" d="m565.3989 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m585.4934 259.88977l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 259.88977l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m605.58795 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 239.79527l20.094482 0l0 20.094498l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 259.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m505.11548 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m525.20996 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m545.30444 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m565.3989 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m585.4934 279.98425l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 279.98425l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m605.58795 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m505.11548 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m525.20996 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m545.30444 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m565.3989 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m585.4934 300.07874l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 300.07874l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m605.58795 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 279.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 300.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m505.11548 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m525.20996 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m545.30444 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m565.3989 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m585.4934 320.17322l20.094543 0l0 20.094513l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 320.17322l20.094543 0l0 20.094513l-20.094543 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m605.58795 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m505.11548 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m525.20996 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m545.30444 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m565.3989 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m585.4934 340.26773l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 340.26773l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m605.58795 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 320.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 340.26773l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m505.11548 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m525.20996 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m545.30444 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m565.3989 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m585.4934 360.3622l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 360.3622l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m605.58795 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m444.83203 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m444.83203 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m464.9265 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m464.9265 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m485.021 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m485.021 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m505.11548 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m505.11548 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m525.20996 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m525.20996 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m545.30444 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m545.30444 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m565.3989 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m565.3989 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m585.4934 380.4567l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m585.4934 380.4567l20.094543 0l0 20.094482l-20.094543 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m605.58795 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m605.58795 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m625.68243 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m625.68243 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 360.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m645.7769 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m645.7769 380.4567l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m445.17847 405.98425l220.3465 0l0 26.519684l-220.3465 0z" fill-rule="evenodd"/><path fill="#000000" d="m455.91284 432.90424l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683319 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm10.375702 3.703125l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm15.219482 4.78125l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.640625 0l0 9.671875l-1.46875 0zm7.625702 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm10.397858 1.46875l0 -11.78125l-4.40625 0l0 -1.578125l10.578125 0l0 1.578125l-4.40625 0l0 11.78125l-1.765625 0zm14.411621 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.141357 5.765625l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm9.719421 -2.890625l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm9.375 -1.953125q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm9.281982 4.84375l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m434.7848 79.03937l0 120.18898" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m434.7848 85.03937l0 108.18898" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m436.43652 85.03937l-1.6517334 -4.5380936l-1.6517334 4.5380936z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m433.13306 193.22835l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m444.83203 69.37008l60.283447 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m450.83203 69.37008l48.283447 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m450.83203 67.718346l-4.5381165 1.6517334l4.5381165 1.6517334z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m499.11548 71.02181l4.5381165 -1.6517334l-4.5381165 -1.6517334z" fill-rule="evenodd"/><path fill="#8e7cc3" d="m398.86093 123.25984l29.228363 0l0 26.519684l-29.228363 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m398.86093 123.25984l29.228363 0l0 26.519684l-29.228363 0z" fill-rule="evenodd"/><path fill="#000000" d="m409.09274 141.31969l3.6875 -4.96875l-3.25 -4.578125l1.5 0l1.734375 2.453125q0.53125 0.765625 0.765625 1.171875q0.3125 -0.515625 0.75 -1.09375l1.921875 -2.53125l1.375 0l-3.359375 4.5l3.625 5.046875l-1.5625 0l-2.40625 -3.40625q-0.203125 -0.296875 -0.421875 -0.640625q-0.3125 0.53125 -0.453125 0.71875l-2.390625 3.328125l-1.515625 0z" fill-rule="nonzero"/><path fill="#8e7cc3" d="m460.35962 35.527557l29.228363 0l0 26.519688l-29.228363 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m460.35962 35.527557l29.228363 0l0 26.519688l-29.228363 0z" fill-rule="evenodd"/><path fill="#000000" d="m474.24768 53.5874l0 -4.046875l-3.6875 -5.5l1.546875 0l1.875 2.875q0.515625 0.8125 0.96875 1.625q0.4375 -0.75 1.046875 -1.6875l1.84375 -2.8125l1.46875 0l-3.796875 5.5l0 4.046875l-1.265625 0z" fill-rule="nonzero"/><path fill="#d9d2e9" d="m69.21183 351.24402l76.09182 0l0 40.063354l-76.09182 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m69.21183 351.24402l76.09182 0l0 40.063354l-76.09182 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m145.30365 351.24402l76.09181 0l0 40.063354l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m145.30365 351.24402l76.09181 0l0 40.063354l-76.09181 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m297.48727 351.24402l76.0918 0l0 40.063354l-76.0918 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m297.48727 351.24402l76.0918 0l0 40.063354l-76.0918 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m221.39546 351.24402l76.09181 0l0 40.063354l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m221.39546 351.24402l76.09181 0l0 40.063354l-76.09181 0z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m69.2126 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m69.2126 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path fill="#000000" d="m95.319725 74.78799l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0z" fill-rule="nonzero"/><path fill="#000000" d="m102.65928 82.63174q0 -1.578125 0.328125 -2.53125q0.328125 -0.96875 0.96875 -1.484375q0.640625 -0.53125 1.609375 -0.53125q0.71875 0 1.25 0.296875q0.546875 0.28125 0.890625 0.828125q0.359375 0.53125 0.5625 1.3125q0.203125 0.78125 0.203125 2.109375q0 1.5625 -0.328125 2.53125q-0.328125 0.953125 -0.96875 1.484375q-0.640625 0.53125 -1.609375 0.53125q-1.296875 0 -2.03125 -0.921875q-0.875 -1.109375 -0.875 -3.625zm1.125 0q0 2.1875 0.515625 2.921875q0.515625 0.71875 1.265625 0.71875q0.75 0 1.265625 -0.71875q0.515625 -0.734375 0.515625 -2.921875q0 -2.203125 -0.515625 -2.921875q-0.515625 -0.71875 -1.28125 -0.71875q-0.75 0 -1.203125 0.640625q-0.5625 0.8125 -0.5625 3.0z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m127.480316 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m127.480316 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path fill="#000000" d="m153.58743 74.78799l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0z" fill-rule="nonzero"/><path fill="#000000" d="m165.05199 87.02236l-1.09375 0l0 -6.96875q-0.40625 0.375 -1.046875 0.75q-0.640625 0.375 -1.140625 0.578125l0 -1.0625q0.90625 -0.4375 1.59375 -1.046875q0.6875 -0.609375 0.96875 -1.1875l0.71875 0l0 8.9375z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m244.01575 48.0l58.26773 0l0 50.204727l-58.26773 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m244.01575 48.0l58.26773 0l0 50.204727l-58.26773 0z" fill-rule="evenodd"/><path fill="#000000" d="m263.3416 73.52236q0 -3.328125 1.78125 -5.203125q1.78125 -1.890625 4.609375 -1.890625q1.84375 0 3.328125 0.890625q1.484375 0.875 2.265625 2.46875q0.78125 1.578125 0.78125 3.578125q0 2.03125 -0.828125 3.640625q-0.8125 1.59375 -2.3125 2.421875q-1.5 0.828125 -3.25 0.828125q-1.875 0 -3.359375 -0.90625q-1.484375 -0.921875 -2.25 -2.5q-0.765625 -1.578125 -0.765625 -3.328125zm1.8125 0.015625q0 2.421875 1.296875 3.8125q1.296875 1.390625 3.265625 1.390625q2.0 0 3.28125 -1.40625q1.28125 -1.40625 1.28125 -3.984375q0 -1.625 -0.546875 -2.84375q-0.546875 -1.21875 -1.609375 -1.875q-1.0625 -0.671875 -2.375 -0.671875q-1.890625 0 -3.25 1.296875q-1.34375 1.28125 -1.34375 4.28125z" fill-rule="nonzero"/><path fill="#000000" d="m281.58743 87.02236l-1.09375 0l0 -6.96875q-0.40625 0.375 -1.046875 0.75q-0.640625 0.375 -1.140625 0.578125l0 -1.0625q0.90625 -0.4375 1.59375 -1.046875q0.6875 -0.609375 0.96875 -1.1875l0.71875 0l0 8.9375z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m185.74803 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m185.74803 48.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path fill="#000000" d="m205.0739 73.52236q0 -3.328125 1.78125 -5.203125q1.78125 -1.890625 4.609375 -1.890625q1.84375 0 3.328125 0.890625q1.484375 0.875 2.265625 2.46875q0.78125 1.578125 0.78125 3.578125q0 2.03125 -0.828125 3.640625q-0.8125 1.59375 -2.3125 2.421875q-1.5 0.828125 -3.25 0.828125q-1.875 0 -3.359375 -0.90625q-1.484375 -0.921875 -2.25 -2.5q-0.765625 -1.578125 -0.765625 -3.328125zm1.8125 0.015625q0 2.421875 1.296875 3.8125q1.296875 1.390625 3.265625 1.390625q2.0 0 3.28125 -1.40625q1.28125 -1.40625 1.28125 -3.984375q0 -1.625 -0.546875 -2.84375q-0.546875 -1.21875 -1.609375 -1.875q-1.0625 -0.671875 -2.375 -0.671875q-1.890625 0 -3.25 1.296875q-1.34375 1.28125 -1.34375 4.28125z" fill-rule="nonzero"/><path fill="#000000" d="m219.1947 82.63174q0 -1.578125 0.328125 -2.53125q0.328125 -0.96875 0.96875 -1.484375q0.640625 -0.53125 1.609375 -0.53125q0.71875 0 1.25 0.296875q0.546875 0.28125 0.890625 0.828125q0.359375 0.53125 0.5625 1.3125q0.203125 0.78125 0.203125 2.109375q0 1.5625 -0.328125 2.53125q-0.328125 0.953125 -0.96875 1.484375q-0.640625 0.53125 -1.609375 0.53125q-1.296875 0 -2.03125 -0.921875q-0.875 -1.109375 -0.875 -3.625zm1.125 0q0 2.1875 0.515625 2.921875q0.515625 0.71875 1.265625 0.71875q0.75 0 1.265625 -0.71875q0.515625 -0.734375 0.515625 -2.921875q0 -2.203125 -0.515625 -2.921875q-0.515625 -0.71875 -1.28125 -0.71875q-0.75 0 -1.203125 0.640625q-0.5625 0.8125 -0.5625 3.0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m68.267715 7.813648l260.66144 0l0 44.09449l-260.66144 0z" fill-rule="evenodd"/><path fill="#000000" d="m78.173965 28.233646q0 -3.328125 1.78125 -5.203125q1.78125 -1.890625 4.609375 -1.890625q1.84375 0 3.328125 0.890625q1.484375 0.875 2.265625 2.46875q0.78125 1.578125 0.78125 3.578125q0 2.03125 -0.828125 3.640625q-0.8125 1.59375 -2.3125 2.421875q-1.5 0.828125 -3.25 0.828125q-1.875 0 -3.359375 -0.90625q-1.484375 -0.921875 -2.25 -2.5q-0.765625 -1.578125 -0.765625 -3.328125zm1.8125 0.015625q0 2.421875 1.296875 3.8125q1.296875 1.390625 3.265625 1.390625q2.0 0 3.28125 -1.40625q1.28125 -1.40625 1.28125 -3.984375q0 -1.625 -0.546875 -2.84375q-0.546875 -1.21875 -1.609375 -1.875q-1.0625 -0.671875 -2.375 -0.671875q-1.890625 0 -3.25 1.296875q-1.34375 1.28125 -1.34375 4.28125zm19.355179 6.484375l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.640625 0l0 9.671875l-1.46875 0zm7.625717 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 5.171875l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm15.219467 4.78125l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.640625 0l0 9.671875l-1.46875 0zm7.625717 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm7.2884827 1.46875l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683304 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm16.641342 0l0 -1.21875q-0.90625 1.4375 -2.703125 1.4375q-1.15625 0 -2.125 -0.640625q-0.96875 -0.640625 -1.5 -1.78125q-0.53125 -1.140625 -0.53125 -2.625q0 -1.453125 0.484375 -2.625q0.484375 -1.1875 1.4375 -1.8125q0.96875 -0.625 2.171875 -0.625q0.875 0 1.546875 0.375q0.6875 0.359375 1.109375 0.953125l0 -4.796875l1.640625 0l0 13.359375l-1.53125 0zm-5.171875 -4.828125q0 1.859375 0.78125 2.78125q0.78125 0.921875 1.84375 0.921875q1.078125 0 1.828125 -0.875q0.75 -0.890625 0.75 -2.6875q0 -1.984375 -0.765625 -2.90625q-0.765625 -0.9375 -1.890625 -0.9375q-1.078125 0 -1.8125 0.890625q-0.734375 0.890625 -0.734375 2.8125zm15.906967 1.71875l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm8.047592 5.765625l3.53125 -5.03125l-3.265625 -4.640625l2.046875 0l1.484375 2.265625q0.421875 0.640625 0.671875 1.078125q0.40625 -0.59375 0.734375 -1.0625l1.640625 -2.28125l1.953125 0l-3.34375 4.546875l3.59375 5.125l-2.015625 0l-1.984375 -3.0l-0.515625 -0.8125l-2.546875 3.8125l-1.984375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m69.2126 404.88846l237.10237 0l0 36.283447l-237.10237 0z" fill-rule="evenodd"/><path fill="#000000" d="m85.9001 426.57407l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0zm14.370804 4.046875q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm7.781967 3.390625l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 1.46875l0 -13.359375l1.640625 0l0 4.796875q1.140625 -1.328125 2.890625 -1.328125q1.078125 0 1.859375 0.421875q0.796875 0.421875 1.140625 1.171875q0.34375 0.75 0.34375 2.171875l0 6.125l-1.640625 0l0 -6.125q0 -1.234375 -0.53125 -1.796875q-0.53125 -0.5625 -1.515625 -0.5625q-0.71875 0 -1.359375 0.390625q-0.640625 0.375 -0.921875 1.015625q-0.265625 0.640625 -0.265625 1.78125l0 5.296875l-1.640625 0zm17.000717 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm11.911606 0l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683304 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm16.641342 0l0 -1.21875q-0.90625 1.4375 -2.703125 1.4375q-1.15625 0 -2.125 -0.640625q-0.96875 -0.640625 -1.5 -1.78125q-0.53125 -1.140625 -0.53125 -2.625q0 -1.453125 0.484375 -2.625q0.484375 -1.1875 1.4375 -1.8125q0.96875 -0.625 2.171875 -0.625q0.875 0 1.546875 0.375q0.6875 0.359375 1.109375 0.953125l0 -4.796875l1.640625 0l0 13.359375l-1.53125 0zm-5.171875 -4.828125q0 1.859375 0.78125 2.78125q0.78125 0.921875 1.84375 0.921875q1.078125 0 1.828125 -0.875q0.75 -0.890625 0.75 -2.6875q0 -1.984375 -0.765625 -2.90625q-0.765625 -0.9375 -1.890625 -0.9375q-1.078125 0 -1.8125 0.890625q-0.734375 0.890625 -0.734375 2.8125zm9.281967 -6.640625l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm10.457321 -3.546875l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.578125 0.515625 -2.75q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.8125q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125zm9.640625 0.4375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm8.485092 2.875l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125z" fill-rule="nonzero"/><path fill="#d9d2e9" d="m69.21183 190.9598l76.09182 0l0 40.06337l-76.09182 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m69.21183 190.9598l76.09182 0l0 40.06337l-76.09182 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m145.30365 190.9598l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m145.30365 190.9598l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m297.48727 190.9598l76.0918 0l0 40.06337l-76.0918 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m297.48727 190.9598l76.0918 0l0 40.06337l-76.0918 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m221.39546 190.9598l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m221.39546 190.9598l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m69.21183 231.03853l76.09182 0l0 40.06337l-76.09182 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m69.21183 231.03853l76.09182 0l0 40.06337l-76.09182 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m145.30365 231.03853l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m145.30365 231.03853l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m297.48727 231.03853l76.0918 0l0 40.06337l-76.0918 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m297.48727 231.03853l76.0918 0l0 40.06337l-76.0918 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m221.39546 231.03853l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m221.39546 231.03853l76.09181 0l0 40.06337l-76.09181 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m69.21183 311.16528l76.09182 0l0 40.063354l-76.09182 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m69.21183 311.16528l76.09182 0l0 40.063354l-76.09182 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m145.30365 311.16528l76.09181 0l0 40.063354l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m145.30365 311.16528l76.09181 0l0 40.063354l-76.09181 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m297.48727 311.16528l76.0918 0l0 40.063354l-76.0918 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m297.48727 311.16528l76.0918 0l0 40.063354l-76.0918 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m221.39546 311.16528l76.09181 0l0 40.063354l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m221.39546 311.16528l76.09181 0l0 40.063354l-76.09181 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m69.21183 271.1019l76.09182 0l0 40.063385l-76.09182 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m69.21183 271.1019l76.09182 0l0 40.063385l-76.09182 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m145.30365 271.1019l76.09181 0l0 40.063385l-76.09181 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m145.30365 271.1019l76.09181 0l0 40.063385l-76.09181 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m297.48727 271.1019l76.0918 0l0 40.063385l-76.0918 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m297.48727 271.1019l76.0918 0l0 40.063385l-76.0918 0z" fill-rule="evenodd"/><path fill="#8e7cc3" d="m221.39586 271.10236l38.045914 0l0 40.063385l-38.045914 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m221.39586 271.10236l38.045914 0l0 40.063385l-38.045914 0z" fill-rule="evenodd"/><path fill="#000000" d="m234.27515 298.05405l5.171875 -6.953125l-4.5625 -6.40625l2.109375 0l2.421875 3.4375q0.75 1.0625 1.078125 1.625q0.4375 -0.71875 1.046875 -1.515625l2.6875 -3.546875l1.921875 0l-4.6875 6.296875l5.0625 7.0625l-2.1875 0l-3.359375 -4.765625q-0.28125 -0.40625 -0.59375 -0.890625q-0.4375 0.734375 -0.625 1.0l-3.359375 4.65625l-2.125 0z" fill-rule="nonzero"/><path fill="#8e7cc3" d="m259.4326 271.1181l38.04727 0l0 40.06299l-38.04727 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m259.4326 271.1181l38.04727 0l0 40.06299l-38.04727 0z" fill-rule="evenodd"/><path fill="#000000" d="m277.43756 298.0696l0 -5.65625l-5.15625 -7.703125l2.15625 0l2.640625 4.03125q0.71875 1.125 1.34375 2.265625q0.609375 -1.046875 1.46875 -2.359375l2.578125 -3.9375l2.0625 0l-5.328125 7.703125l0 5.65625l-1.765625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m259.44138 190.9598l0.06298828 84.28346" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m259.44138 190.9598l0.06298828 84.28346" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m259.44138 307.02393l0 84.28345" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m259.44138 307.02393l0 84.28345" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m183.34955 190.9598l0 200.34645" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m183.34955 190.9598l0 200.34645" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m335.53317 190.9598l0 200.34645" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m335.53317 190.9598l0 200.34645" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m107.25774 190.9598l0 200.34645" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m107.25774 190.9598l0 200.34645" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m575.44617 79.03937l0 321.5118" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m575.44617 79.03937l0 321.5118" fill-rule="evenodd"/><g filter="url(#shadowFilter-p.2)"><use xlink:href="#p.2" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.2" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.2"><path fill="#000000" fill-opacity="0.0" d="m444.83203 269.937l221.03937 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="1.0,3.0" d="m444.83203 269.937l221.03937 0" fill-rule="evenodd"/></g><path fill="#000000" fill-opacity="0.0" d="m505.11548 189.55905l60.283447 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m511.11548 189.55905l48.283447 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m511.11548 187.90732l-4.538086 1.6517334l4.538086 1.6517334z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m559.3989 191.21078l4.538147 -1.6517334l-4.538147 -1.6517334z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m495.06824 199.6063l0 60.283478" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m495.06824 205.6063l0 48.283478" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m496.71997 205.6063l-1.6517334 -4.538086l-1.6517334 4.538086z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m493.4165 253.88977l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m424.73752 119.03937l20.094513 0l0 20.094498l-20.094513 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m424.73752 115.13386l20.094513 0l0 20.09449l-20.094513 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m424.73752 135.22835l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m427.2362 154.94489l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m424.73752 195.13387l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m515.79266 155.32283l38.92914 0l0 28.755905l-38.92914 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m515.79266 155.32283l38.92914 0l0 28.755905l-38.92914 0z" fill-rule="evenodd"/><path fill="#000000" d="m528.24646 169.86017q0 -2.375 1.28125 -3.71875q1.28125 -1.34375 3.296875 -1.34375q1.3125 0 2.375 0.625q1.0625 0.625 1.609375 1.765625q0.5625 1.125 0.5625 2.5625q0 1.4375 -0.59375 2.59375q-0.578125 1.140625 -1.65625 1.734375q-1.0625 0.578125 -2.3125 0.578125q-1.34375 0 -2.40625 -0.640625q-1.0625 -0.65625 -1.609375 -1.78125q-0.546875 -1.125 -0.546875 -2.375zm1.3125 0.015625q0 1.71875 0.921875 2.71875q0.921875 0.984375 2.328125 0.984375q1.421875 0 2.34375 -1.0q0.921875 -1.0 0.921875 -2.84375q0 -1.15625 -0.40625 -2.03125q-0.390625 -0.875 -1.15625 -1.34375q-0.75 -0.484375 -1.6875 -0.484375q-1.34375 0 -2.3125 0.921875q-0.953125 0.921875 -0.953125 3.078125z" fill-rule="nonzero"/><path fill="#000000" d="m541.28534 179.5008l-0.78125 0l0 -4.96875q-0.28125 0.265625 -0.75 0.53125q-0.453125 0.265625 -0.8125 0.40625l0 -0.75q0.65625 -0.3125 1.140625 -0.75q0.484375 -0.4375 0.6875 -0.84375l0.515625 0l0 6.375z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m450.37833 216.79002l38.929108 0l0 28.755905l-38.929108 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m450.37833 216.79002l38.929108 0l0 28.755905l-38.929108 0z" fill-rule="evenodd"/><path fill="#000000" d="m462.8321 231.32735q0 -2.375 1.28125 -3.71875q1.28125 -1.34375 3.296875 -1.34375q1.3125 0 2.375 0.625q1.0625 0.625 1.609375 1.765625q0.5625 1.125 0.5625 2.5625q0 1.4375 -0.59375 2.59375q-0.578125 1.140625 -1.65625 1.734375q-1.0625 0.578125 -2.3125 0.578125q-1.34375 0 -2.40625 -0.640625q-1.0625 -0.65625 -1.609375 -1.78125q-0.546875 -1.125 -0.546875 -2.375zm1.3125 0.015625q0 1.71875 0.921875 2.71875q0.921875 0.984375 2.328125 0.984375q1.421875 0 2.34375 -1.0q0.921875 -1.0 0.921875 -2.84375q0 -1.15625 -0.40625 -2.03125q-0.390625 -0.875 -1.15625 -1.34375q-0.75 -0.484375 -1.6875 -0.484375q-1.34375 0 -2.3125 0.921875q-0.953125 0.921875 -0.953125 3.078125z" fill-rule="nonzero"/><path fill="#000000" d="m472.93353 237.82735q0 -1.125 0.21875 -1.8125q0.234375 -0.6875 0.6875 -1.046875q0.46875 -0.375 1.15625 -0.375q0.515625 0 0.890625 0.203125q0.390625 0.203125 0.640625 0.59375q0.25 0.390625 0.390625 0.953125q0.15625 0.546875 0.15625 1.484375q0 1.125 -0.234375 1.8125q-0.234375 0.6875 -0.6875 1.0625q-0.453125 0.375 -1.15625 0.375q-0.921875 0 -1.4375 -0.65625q-0.625 -0.796875 -0.625 -2.59375zm0.796875 0q0 1.578125 0.359375 2.09375q0.375 0.515625 0.90625 0.515625q0.546875 0 0.90625 -0.515625q0.359375 -0.53125 0.359375 -2.09375q0 -1.5625 -0.359375 -2.078125q-0.359375 -0.515625 -0.90625 -0.515625q-0.546875 0 -0.859375 0.453125q-0.40625 0.578125 -0.40625 2.140625z" fill-rule="nonzero"/><path fill="#ffffff" d="m567.6982 290.34647l7.747986 -7.748047l7.748047 7.748047l-3.8740234 0l0 12.346436l-7.747986 0l0 -12.346436z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m567.6982 290.34647l7.747986 -7.748047l7.748047 7.748047l-3.8740234 0l0 12.346436l-7.747986 0l0 -12.346436z" fill-rule="evenodd"/><path fill="#ffffff" d="m522.67456 305.6929l105.543274 0l0 20.094513l-105.543274 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m522.67456 305.6929l105.543274 0l0 20.094513l-105.543274 0z" fill-rule="evenodd"/><path fill="#000000" d="m532.92456 320.54016l0 -9.546875l1.25 0l0 9.546875l-1.25 0zm3.327942 0l0 -6.90625l1.0625 0l0 0.984375q0.75 -1.140625 2.1875 -1.140625q0.625 0 1.15625 0.21875q0.53125 0.21875 0.78125 0.59375q0.265625 0.359375 0.375 0.859375q0.0625 0.328125 0.0625 1.140625l0 4.25l-1.171875 0l0 -4.203125q0 -0.71875 -0.140625 -1.0625q-0.140625 -0.359375 -0.484375 -0.5625q-0.34375 -0.21875 -0.8125 -0.21875q-0.75 0 -1.296875 0.46875q-0.546875 0.46875 -0.546875 1.796875l0 3.78125l-1.171875 0zm7.4124756 2.65625l0 -9.5625l1.078125 0l0 0.890625q0.375 -0.53125 0.84375 -0.78125q0.484375 -0.265625 1.15625 -0.265625q0.875 0 1.546875 0.453125q0.6875 0.453125 1.03125 1.28125q0.34375 0.828125 0.34375 1.828125q0 1.046875 -0.375 1.90625q-0.375 0.84375 -1.109375 1.296875q-0.71875 0.453125 -1.53125 0.453125q-0.578125 0 -1.046875 -0.25q-0.46875 -0.25 -0.765625 -0.625l0 3.375l-1.171875 0zm1.0625 -6.078125q0 1.34375 0.53125 1.984375q0.546875 0.625 1.3125 0.625q0.78125 0 1.34375 -0.65625q0.5625 -0.65625 0.5625 -2.046875q0 -1.3125 -0.546875 -1.96875q-0.546875 -0.671875 -1.296875 -0.671875q-0.75 0 -1.328125 0.703125q-0.578125 0.703125 -0.578125 2.03125zm10.881226 3.421875l0 -1.015625q-0.8125 1.171875 -2.1875 1.171875q-0.609375 0 -1.140625 -0.234375q-0.53125 -0.234375 -0.796875 -0.578125q-0.25 -0.359375 -0.359375 -0.875q-0.0625 -0.34375 -0.0625 -1.09375l0 -4.28125l1.171875 0l0 3.828125q0 0.921875 0.0625 1.234375q0.109375 0.46875 0.46875 0.734375q0.359375 0.25 0.890625 0.25q0.515625 0 0.984375 -0.265625q0.46875 -0.265625 0.65625 -0.734375q0.1875 -0.46875 0.1875 -1.34375l0 -3.703125l1.171875 0l0 6.90625l-1.046875 0zm5.4437256 -1.046875l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.015381 1.046875l0 -9.546875l6.90625 0l0 1.125l-5.640625 0l0 2.921875l5.28125 0l0 1.125l-5.28125 0l0 3.25l5.859375 0l0 1.125l-7.125 0zm8.7022705 0l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm7.7110596 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm15.836792 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.0625 0l0 0.984375q0.75 -1.140625 2.1875 -1.140625q0.625 0 1.15625 0.21875q0.53125 0.21875 0.78125 0.59375q0.265625 0.359375 0.375 0.859375q0.0625 0.328125 0.0625 1.140625l0 4.25l-1.171875 0l0 -4.203125q0 -0.71875 -0.140625 -1.0625q-0.140625 -0.359375 -0.484375 -0.5625q-0.34375 -0.21875 -0.8125 -0.21875q-0.75 0 -1.296875 0.46875q-0.546875 0.46875 -0.546875 1.796875l0 3.78125l-1.171875 0zm9.974976 -1.046875l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m60.003254 191.0883l0 80.157486" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m60.003254 197.08832l0 68.15747" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m61.654987 197.08832l-1.6517334 -4.538101l-1.6517296 4.538101z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m58.351524 265.2458l1.6517296 4.5381165l1.6517334 -4.5381165z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m69.21183 182.38914l152.18896 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m75.21184 182.38914l140.18898 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m75.21184 180.73741l-4.538101 1.6517334l4.538101 1.6517334z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m215.40082 184.04088l4.538101 -1.6517334l-4.538101 -1.6517334z" fill-rule="evenodd"/><path fill="#cfe2f3" d="m14.017324 212.97638l38.92913 0l0 33.543304l-38.92913 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m14.017324 212.97638l38.92913 0l0 33.543304l-38.92913 0z" fill-rule="evenodd"/><path fill="#000000" d="m31.330479 230.81366l0 -1.125l4.03125 -0.015625l0 3.546875q-0.921875 0.75 -1.921875 1.125q-0.984375 0.359375 -2.03125 0.359375q-1.40625 0 -2.5625 -0.59375q-1.140625 -0.609375 -1.734375 -1.734375q-0.578125 -1.140625 -0.578125 -2.546875q0 -1.40625 0.578125 -2.609375q0.59375 -1.203125 1.6875 -1.78125q1.09375 -0.59375 2.515625 -0.59375q1.03125 0 1.859375 0.34375q0.84375 0.328125 1.3125 0.9375q0.484375 0.59375 0.734375 1.546875l-1.140625 0.3125q-0.21875 -0.71875 -0.53125 -1.140625q-0.3125 -0.421875 -0.90625 -0.671875q-0.59375 -0.25 -1.3125 -0.25q-0.875 0 -1.515625 0.265625q-0.625 0.265625 -1.015625 0.703125q-0.375 0.421875 -0.59375 0.9375q-0.359375 0.875 -0.359375 1.921875q0 1.265625 0.4375 2.125q0.4375 0.859375 1.265625 1.28125q0.84375 0.421875 1.796875 0.421875q0.8125 0 1.59375 -0.3125q0.78125 -0.328125 1.1875 -0.6875l0 -1.765625l-2.796875 0z" fill-rule="nonzero"/><path fill="#000000" d="m36.57253 236.40741q0 -1.125 0.21875 -1.8125q0.234375 -0.6875 0.6875 -1.046875q0.46875 -0.375 1.15625 -0.375q0.515625 0 0.890625 0.203125q0.390625 0.203125 0.640625 0.59375q0.25 0.390625 0.390625 0.953125q0.15625 0.546875 0.15625 1.484375q0 1.125 -0.234375 1.8125q-0.234375 0.6875 -0.6875 1.0625q-0.453125 0.375 -1.15625 0.375q-0.921875 0 -1.4375 -0.65625q-0.625 -0.796875 -0.625 -2.59375zm0.796875 0q0 1.578125 0.359375 2.09375q0.375 0.515625 0.90625 0.515625q0.546875 0 0.90625 -0.515625q0.359375 -0.53125 0.359375 -2.09375q0 -1.5625 -0.359375 -2.078125q-0.359375 -0.515625 -0.90625 -0.515625q-0.546875 0 -0.859375 0.453125q-0.40625 0.578125 -0.40625 2.140625z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m125.84252 141.49606l38.92913 0l0 33.543304l-38.92913 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m125.84252 141.49606l38.92913 0l0 33.543304l-38.92913 0z" fill-rule="evenodd"/><path fill="#000000" d="m143.15567 159.33334l0 -1.125l4.03125 -0.015625l0 3.546875q-0.921875 0.75 -1.921875 1.125q-0.984375 0.359375 -2.03125 0.359375q-1.40625 0 -2.5625 -0.59375q-1.140625 -0.609375 -1.734375 -1.734375q-0.578125 -1.140625 -0.578125 -2.546875q0 -1.40625 0.578125 -2.609375q0.59375 -1.203125 1.6875 -1.78125q1.09375 -0.59375 2.515625 -0.59375q1.03125 0 1.859375 0.34375q0.84375 0.328125 1.3125 0.9375q0.484375 0.59375 0.734375 1.546875l-1.140625 0.3125q-0.21875 -0.71875 -0.53125 -1.140625q-0.3125 -0.421875 -0.90625 -0.671875q-0.59375 -0.25 -1.3125 -0.25q-0.875 0 -1.515625 0.265625q-0.625 0.265625 -1.015625 0.703125q-0.375 0.421875 -0.59375 0.9375q-0.359375 0.875 -0.359375 1.921875q0 1.265625 0.4375 2.125q0.4375 0.859375 1.265625 1.28125q0.84375 0.421875 1.796875 0.421875q0.8125 0 1.59375 -0.3125q0.78125 -0.328125 1.1875 -0.6875l0 -1.765625l-2.796875 0z" fill-rule="nonzero"/><path fill="#000000" d="m151.33522 168.06772l-0.78125 0l0 -4.96875q-0.28125 0.265625 -0.75 0.53125q-0.453125 0.265625 -0.8125 0.40625l0 -0.75q0.65625 -0.3125 1.140625 -0.75q0.484375 -0.4375 0.6875 -0.84375l0.515625 0l0 6.375z" fill-rule="nonzero"/></g></svg>
\ No newline at end of file
diff --git a/tensorflow/compiler/xla/g3doc/images/ops_xla_gather_2.svg b/tensorflow/compiler/xla/g3doc/images/ops_xla_gather_2.svg
new file mode 100644
index 0000000000000000000000000000000000000000..d9c35e972d152c63df44d3c9be65ec3a840d5544
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/images/ops_xla_gather_2.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 550.9317585301837 635.6141732283464" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l550.93176 0l0 635.6142l-550.93176 0l0 -635.6142z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l550.93176 0l0 635.6142l-550.93176 0z" fill-rule="evenodd"/><g filter="url(#shadowFilter-p.1)"><use xlink:href="#p.1" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.1" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.1"><path fill="#fff2cc" d="m241.7559 188.68767l292.75592 0l0 424.5984l-292.75592 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m241.7559 188.68767l292.75592 0l0 424.5984l-292.75592 0z" fill-rule="evenodd"/></g><path fill="#cfe2f3" d="m12.267716 40.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m12.267716 40.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path fill="#000000" d="m38.37484 66.78799l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8124981 -1.59375 -0.8124981 -3.5625q0 -1.953125 0.8124981 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0z" fill-rule="nonzero"/><path fill="#000000" d="m45.714394 74.63174q0 -1.578125 0.328125 -2.53125q0.328125 -0.96875 0.96875 -1.484375q0.640625 -0.53125 1.609375 -0.53125q0.71875 0 1.25 0.296875q0.546875 0.28125 0.890625 0.828125q0.359375 0.53125 0.5625 1.3125q0.203125 0.78125 0.203125 2.109375q0 1.5625 -0.328125 2.53125q-0.328125 0.953125 -0.96875 1.484375q-0.640625 0.53125 -1.609375 0.53125q-1.296875 0 -2.03125 -0.921875q-0.875 -1.109375 -0.875 -3.625zm1.125 0q0 2.1875 0.515625 2.921875q0.515625 0.71875 1.265625 0.71875q0.75 0 1.265625 -0.71875q0.515625 -0.734375 0.515625 -2.921875q0 -2.203125 -0.515625 -2.921875q-0.515625 -0.71875 -1.28125 -0.71875q-0.75 0 -1.203125 0.640625q-0.5625 0.8125 -0.5625 3.0z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m70.53543 40.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m70.53543 40.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path fill="#000000" d="m96.642555 66.78799l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0z" fill-rule="nonzero"/><path fill="#000000" d="m108.10711 79.02236l-1.09375 0l0 -6.96875q-0.40625 0.375 -1.046875 0.75q-0.640625 0.375 -1.140625 0.578125l0 -1.0625q0.90625 -0.4375 1.59375 -1.046875q0.6875 -0.609375 0.96875 -1.1875l0.71875 0l0 8.9375z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m128.80315 40.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m128.80315 40.0l58.267715 0l0 50.204727l-58.267715 0z" fill-rule="evenodd"/><path fill="#000000" d="m148.12903 65.52236q0 -3.328125 1.78125 -5.203125q1.78125 -1.890625 4.609375 -1.890625q1.84375 0 3.328125 0.890625q1.484375 0.875 2.265625 2.46875q0.78125 1.578125 0.78125 3.578125q0 2.03125 -0.828125 3.640625q-0.8125 1.59375 -2.3125 2.421875q-1.5 0.828125 -3.25 0.828125q-1.875 0 -3.359375 -0.90625q-1.484375 -0.921875 -2.25 -2.5q-0.765625 -1.578125 -0.765625 -3.328125zm1.8125 0.015625q0 2.421875 1.296875 3.8125q1.296875 1.390625 3.265625 1.390625q2.0 0 3.28125 -1.40625q1.28125 -1.40625 1.28125 -3.984375q0 -1.625 -0.546875 -2.84375q-0.546875 -1.21875 -1.609375 -1.875q-1.0625 -0.671875 -2.375 -0.671875q-1.890625 0 -3.25 1.296875q-1.34375 1.28125 -1.34375 4.28125z" fill-rule="nonzero"/><path fill="#000000" d="m162.24983 74.63174q0 -1.578125 0.328125 -2.53125q0.328125 -0.96875 0.96875 -1.484375q0.640625 -0.53125 1.609375 -0.53125q0.71875 0 1.25 0.296875q0.546875 0.28125 0.890625 0.828125q0.359375 0.53125 0.5625 1.3125q0.203125 0.78125 0.203125 2.109375q0 1.5625 -0.328125 2.53125q-0.328125 0.953125 -0.96875 1.484375q-0.640625 0.53125 -1.609375 0.53125q-1.296875 0 -2.03125 -0.921875q-0.875 -1.109375 -0.875 -3.625zm1.125 0q0 2.1875 0.515625 2.921875q0.515625 0.71875 1.265625 0.71875q0.75 0 1.265625 -0.71875q0.515625 -0.734375 0.515625 -2.921875q0 -2.203125 -0.515625 -2.921875q-0.515625 -0.71875 -1.28125 -0.71875q-0.75 0 -1.203125 0.640625q-0.5625 0.8125 -0.5625 3.0z" fill-rule="nonzero"/><path fill="#f4cccc" d="m300.83203 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 247.03937l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 247.03937l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 247.03937l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 247.03937l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 267.13385l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 267.13385l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 267.13385l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 267.13385l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 247.03937l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 267.13385l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 287.22833l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 287.22833l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 287.22833l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 287.22833l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 307.32285l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 307.32285l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 307.32285l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 307.32285l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 287.22833l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 307.32285l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 327.41733l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 327.41733l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 327.41733l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 327.41733l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 347.5118l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 347.5118l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 347.5118l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 347.5118l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 327.41733l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 347.5118l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m300.83203 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m320.9265 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m341.021 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m361.11548 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m381.20996 367.6063l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 367.6063l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m401.30447 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#cc0000" d="m421.39896 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m441.49344 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m461.58792 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m481.6824 367.6063l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 367.6063l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 387.70078l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 387.70078l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 387.70078l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 387.70078l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path fill="#ea9999" d="m501.77692 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 367.6063l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 387.70078l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 407.7953l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 407.7953l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 407.7953l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 407.7953l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 427.88977l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 427.88977l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 427.88977l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 427.88977l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 407.7953l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 427.88977l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 447.98425l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 447.98425l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 447.98425l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 447.98425l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 468.07874l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 468.07874l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 468.07874l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 468.07874l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 447.98425l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 468.07874l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 488.17322l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 488.17322l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 488.17322l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 488.17322l20.094513 0l0 20.094513l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 508.26773l20.094513 0l0 20.094452l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 508.26773l20.094513 0l0 20.094452l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 508.26773l20.094513 0l0 20.094452l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 508.26773l20.094513 0l0 20.094452l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 488.17322l20.094482 0l0 20.094513l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 508.26773l20.094482 0l0 20.094452l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 528.3622l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 528.3622l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 528.3622l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 528.3622l20.094513 0l0 20.094482l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m300.83203 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.83203 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m320.9265 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m320.9265 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m341.021 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m341.021 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m361.11548 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m361.11548 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m381.20996 548.45667l20.094513 0l0 20.094543l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m381.20996 548.45667l20.094513 0l0 20.094543l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m401.30447 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m401.30447 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m421.39896 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m421.39896 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m441.49344 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m441.49344 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m461.58792 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m461.58792 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m481.6824 548.45667l20.094513 0l0 20.094543l-20.094513 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m481.6824 548.45667l20.094513 0l0 20.094543l-20.094513 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 528.3622l20.094482 0l0 20.094482l-20.094482 0z" fill-rule="evenodd"/><path fill="#f4cccc" d="m501.77692 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m501.77692 548.45667l20.094482 0l0 20.094543l-20.094482 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m12.267716 -0.1863517l260.6614 0l0 44.09449l-260.6614 0z" fill-rule="evenodd"/><path fill="#000000" d="m22.173967 20.233646q0 -3.328125 1.78125 -5.203125q1.78125 -1.890625 4.609375 -1.890625q1.84375 0 3.328125 0.890625q1.4843731 0.875 2.265623 2.46875q0.78125 1.578125 0.78125 3.578125q0 2.03125 -0.828125 3.640625q-0.8125 1.59375 -2.312498 2.421875q-1.5 0.828125 -3.25 0.828125q-1.875 0 -3.359375 -0.90625q-1.484375 -0.921875 -2.25 -2.5q-0.765625 -1.578125 -0.765625 -3.328125zm1.8125 0.015625q0 2.421875 1.296875 3.8125q1.296875 1.390625 3.265625 1.390625q2.0 0 3.28125 -1.40625q1.2812481 -1.40625 1.2812481 -3.984375q0 -1.625 -0.546875 -2.84375q-0.546875 -1.21875 -1.6093731 -1.875q-1.0625 -0.671875 -2.375 -0.671875q-1.890625 0 -3.25 1.296875q-1.34375 1.28125 -1.34375 4.28125zm19.355177 6.484375l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.640625 0l0 9.671875l-1.46875 0zm7.625717 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 5.171875l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm15.219467 4.78125l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.640625 0l0 9.671875l-1.46875 0zm7.625717 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm7.2884827 1.46875l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683304 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm16.641342 0l0 -1.21875q-0.90625 1.4375 -2.703125 1.4375q-1.15625 0 -2.125 -0.640625q-0.96875 -0.640625 -1.5 -1.78125q-0.53125 -1.140625 -0.53125 -2.625q0 -1.453125 0.484375 -2.625q0.484375 -1.1875 1.4375 -1.8125q0.96875 -0.625 2.171875 -0.625q0.875 0 1.546875 0.375q0.6875 0.359375 1.109375 0.953125l0 -4.796875l1.640625 0l0 13.359375l-1.53125 0zm-5.171875 -4.828125q0 1.859375 0.78125 2.78125q0.78125 0.921875 1.84375 0.921875q1.078125 0 1.828125 -0.875q0.75 -0.890625 0.75 -2.6875q0 -1.984375 -0.765625 -2.90625q-0.765625 -0.9375 -1.890625 -0.9375q-1.078125 0 -1.8125 0.890625q-0.734375 0.890625 -0.734375 2.8125zm15.906967 1.71875l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm8.047592 5.765625l3.53125 -5.03125l-3.265625 -4.640625l2.046875 0l1.484375 2.265625q0.421875 0.640625 0.671875 1.078125q0.40625 -0.59375 0.734375 -1.0625l1.640625 -2.28125l1.953125 0l-3.34375 4.546875l3.59375 5.125l-2.015625 0l-1.984375 -3.0l-0.515625 -0.8125l-2.546875 3.8125l-1.984375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m61.212597 420.88977l152.7874 0l0 36.283447l-152.7874 0z" fill-rule="evenodd"/><path fill="#000000" d="m77.9001 442.57538l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421875 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125 -1.6875 2.34375 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0zm14.370804 4.046875q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm7.781967 3.390625l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 1.46875l0 -13.359375l1.640625 0l0 4.796875q1.140625 -1.328125 2.890625 -1.328125q1.078125 0 1.859375 0.421875q0.796875 0.421875 1.140625 1.171875q0.34375 0.75 0.34375 2.171875l0 6.125l-1.640625 0l0 -6.125q0 -1.234375 -0.53125 -1.796875q-0.53125 -0.5625 -1.515625 -0.5625q-0.71875 0 -1.359375 0.390625q-0.640625 0.375 -0.921875 1.015625q-0.265625 0.640625 -0.265625 1.78125l0 5.296875l-1.640625 0zm17.000717 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm11.911606 0l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683304 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm16.641342 0l0 -1.21875q-0.90625 1.4375 -2.703125 1.4375q-1.15625 0 -2.125 -0.640625q-0.96875 -0.640625 -1.5 -1.78125q-0.53125 -1.140625 -0.53125 -2.625q0 -1.453125 0.484375 -2.625q0.484375 -1.1875 1.4375 -1.8125q0.96875 -0.625 2.171875 -0.625q0.875 0 1.546875 0.375q0.6875 0.359375 1.109375 0.953125l0 -4.796875l1.640625 0l0 13.359375l-1.53125 0zm-5.171875 -4.828125q0 1.859375 0.78125 2.78125q0.78125 0.921875 1.84375 0.921875q1.078125 0 1.828125 -0.875q0.75 -0.890625 0.75 -2.6875q0 -1.984375 -0.765625 -2.90625q-0.765625 -0.9375 -1.890625 -0.9375q-1.078125 0 -1.8125 0.890625q-0.734375 0.890625 -0.734375 2.8125zm9.281967 -6.640625l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm10.457321 -3.546875l1.609375 0.21875q-0.265625 1.65625 -1.359375 2.609375q-1.078125 0.9375 -2.671875 0.9375q-1.984375 0 -3.1875 -1.296875q-1.203125 -1.296875 -1.203125 -3.71875q0 -1.578125 0.515625 -2.75q0.515625 -1.171875 1.578125 -1.75q1.0625 -0.59375 2.3125 -0.59375q1.578125 0 2.578125 0.796875q1.0 0.796875 1.28125 2.265625l-1.59375 0.234375q-0.234375 -0.96875 -0.8125 -1.453125q-0.578125 -0.5 -1.390625 -0.5q-1.234375 0 -2.015625 0.890625q-0.78125 0.890625 -0.78125 2.8125q0 1.953125 0.75 2.84375q0.75 0.875 1.953125 0.875q0.96875 0 1.609375 -0.59375q0.65625 -0.59375 0.828125 -1.828125zm9.640625 0.4375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm8.485092 2.875l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m301.17847 573.98425l220.3465 0l0 26.519714l-220.3465 0z" fill-rule="evenodd"/><path fill="#000000" d="m311.91284 600.90424l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683319 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm10.375702 3.703125l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm15.219482 4.78125l0 -1.421875q-1.125 1.640625 -3.0625 1.640625q-0.859375 0 -1.609375 -0.328125q-0.734375 -0.328125 -1.09375 -0.828125q-0.359375 -0.5 -0.5 -1.21875q-0.109375 -0.46875 -0.109375 -1.53125l0 -5.984375l1.640625 0l0 5.359375q0 1.28125 0.109375 1.734375q0.15625 0.640625 0.65625 1.015625q0.5 0.375 1.234375 0.375q0.734375 0 1.375 -0.375q0.65625 -0.390625 0.921875 -1.03125q0.265625 -0.65625 0.265625 -1.890625l0 -5.1875l1.640625 0l0 9.671875l-1.46875 0zm7.625702 -1.46875l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm10.397858 1.46875l0 -11.78125l-4.40625 0l0 -1.578125l10.578125 0l0 1.578125l-4.40625 0l0 11.78125l-1.765625 0zm14.411621 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.141327 5.765625l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm9.719482 -2.890625l1.625 -0.25q0.125 0.96875 0.75 1.5q0.625 0.515625 1.75 0.515625q1.125 0 1.671875 -0.453125q0.546875 -0.46875 0.546875 -1.09375q0 -0.546875 -0.484375 -0.875q-0.328125 -0.21875 -1.671875 -0.546875q-1.8125 -0.46875 -2.515625 -0.796875q-0.6875 -0.328125 -1.046875 -0.90625q-0.359375 -0.59375 -0.359375 -1.3125q0 -0.640625 0.296875 -1.1875q0.296875 -0.5625 0.8125 -0.921875q0.375 -0.28125 1.03125 -0.46875q0.671875 -0.203125 1.421875 -0.203125q1.140625 0 2.0 0.328125q0.859375 0.328125 1.265625 0.890625q0.421875 0.5625 0.578125 1.5l-1.609375 0.21875q-0.109375 -0.75 -0.640625 -1.171875q-0.515625 -0.421875 -1.46875 -0.421875q-1.140625 0 -1.625 0.375q-0.46875 0.375 -0.46875 0.875q0 0.3125 0.1875 0.578125q0.203125 0.265625 0.640625 0.4375q0.234375 0.09375 1.4375 0.421875q1.75 0.453125 2.4375 0.75q0.6875 0.296875 1.078125 0.859375q0.390625 0.5625 0.390625 1.40625q0 0.828125 -0.484375 1.546875q-0.46875 0.71875 -1.375 1.125q-0.90625 0.390625 -2.046875 0.390625q-1.875 0 -2.875 -0.78125q-0.984375 -0.78125 -1.25 -2.328125zm9.375 -1.953125q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm9.281952 4.84375l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m300.83203 398.55905l120.566925 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m306.83203 398.55905l108.566925 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m306.83203 396.90732l-4.5381165 1.6517334l4.5381165 1.6517334z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m415.39896 400.2108l4.538086 -1.6517334l-4.538086 -1.6517334z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m290.7848 247.05511l0 120.566925" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m290.7848 253.05511l0 108.566925" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m292.43652 253.05511l-1.6517334 -4.538101l-1.6517334 4.538101z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m289.13306 361.62204l1.6517334 4.5381165l1.6517334 -4.5381165z" fill-rule="evenodd"/><path fill="#8e7cc3" d="m251.38309 291.9528l29.35434 0l0 30.771667l-29.35434 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m251.38309 291.9528l29.35434 0l0 30.771667l-29.35434 0z" fill-rule="evenodd"/><path fill="#000000" d="m261.6779 312.1386l3.6875 -4.96875l-3.25 -4.578125l1.5 0l1.734375 2.453125q0.53125 0.765625 0.765625 1.171875q0.3125 -0.515625 0.75 -1.09375l1.921875 -2.53125l1.375 0l-3.359375 4.5l3.625 5.046875l-1.5625 0l-2.40625 -3.40625q-0.203125 -0.296875 -0.421875 -0.640625q-0.3125 0.53125 -0.453125 0.71875l-2.390625 3.328125l-1.515625 0z" fill-rule="nonzero"/><path fill="#8e7cc3" d="m300.8319 210.83469l29.35434 0l0 30.771652l-29.35434 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m300.8319 210.83469l29.35434 0l0 30.771652l-29.35434 0z" fill-rule="evenodd"/><path fill="#000000" d="m312.3497 226.31738q0 -1.6875 0.34375 -2.71875q0.359375 -1.03125 1.046875 -1.59375q0.6875 -0.5625 1.71875 -0.5625q0.78125 0 1.359375 0.3125q0.578125 0.296875 0.953125 0.890625q0.375 0.578125 0.59375 1.421875q0.21875 0.828125 0.21875 2.25q0 1.671875 -0.359375 2.703125q-0.34375 1.03125 -1.03125 1.59375q-0.671875 0.5625 -1.734375 0.5625q-1.375 0 -2.15625 -0.984375q-0.953125 -1.1875 -0.953125 -3.875zm1.203125 0q0 2.34375 0.546875 3.125q0.5625 0.78125 1.359375 0.78125q0.8125 0 1.359375 -0.78125q0.5625 -0.78125 0.5625 -3.125q0 -2.359375 -0.5625 -3.125q-0.546875 -0.78125 -1.359375 -0.78125q-0.8125 0 -1.296875 0.6875q-0.609375 0.875 -0.609375 3.21875z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m248.67323 367.62204l42.110245 0l0 36.283478l-42.110245 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m248.67323 367.62204l42.110245 0l0 36.283478l-42.110245 0z" fill-rule="evenodd"/><path fill="#000000" d="m266.56897 385.86066q0 -1.6875 0.34375 -2.71875q0.359375 -1.03125 1.046875 -1.59375q0.6875 -0.5625 1.71875 -0.5625q0.78125 0 1.359375 0.3125q0.578125 0.296875 0.953125 0.890625q0.375 0.578125 0.59375 1.421875q0.21875 0.828125 0.21875 2.25q0 1.671875 -0.359375 2.703125q-0.34375 1.03125 -1.03125 1.59375q-0.671875 0.5625 -1.734375 0.5625q-1.375 0 -2.15625 -0.984375q-0.953125 -1.1875 -0.953125 -3.875zm1.203125 0q0 2.34375 0.546875 3.125q0.5625 0.78125 1.359375 0.78125q0.8125 0 1.359375 -0.78125q0.5625 -0.78125 0.5625 -3.125q0 -2.359375 -0.5625 -3.125q-0.546875 -0.78125 -1.359375 -0.78125q-0.8125 0 -1.296875 0.6875q-0.609375 0.875 -0.609375 3.21875z" fill-rule="nonzero"/><g filter="url(#shadowFilter-p.2)"><use xlink:href="#p.2" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.2" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.2"><path fill="#fce5cd" d="m203.28871 40.0l206.58269 0l0 74.92913l-206.58269 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m203.28871 40.0l206.58269 0l0 74.92913l-206.58269 0z" fill-rule="evenodd"/></g><g filter="url(#shadowFilter-p.3)"><use xlink:href="#p.3" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.3" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.3"><path fill="#cfe2f3" d="m216.0315 50.885826l58.26773 0l0 50.204727l-58.26773 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m216.0315 50.885826l58.26773 0l0 50.204727l-58.26773 0z" fill-rule="evenodd"/><path fill="#000000" d="m235.54968 86.61131l0 -17.0625l3.609375 0l0 1.359375l-1.96875 0l0 14.34375l1.96875 0l0 1.359375l-3.609375 0zm8.761429 -3.703125l0 -11.78125l-4.40625 0l0 -1.578125l10.578125 0l0 1.578125l-4.40625 0l0 11.78125l-1.765625 0zm10.520981 3.703125l-3.609375 0l0 -1.359375l1.96875 0l0 -14.34375l-1.96875 0l0 -1.359375l3.609375 0l0 17.0625z" fill-rule="nonzero"/></g><g filter="url(#shadowFilter-p.4)"><use xlink:href="#p.4" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.4" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.4"><path fill="#cfe2f3" d="m282.29922 68.29527l14.677155 0l0 -7.6929092l14.677155 15.385826l-14.677155 15.385826l0 -7.692917l-14.677155 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.29922 68.29527l14.677155 0l0 -7.6929092l14.677155 15.385826l-14.677155 15.385826l0 -7.692917l-14.677155 0z" fill-rule="evenodd"/></g><g filter="url(#shadowFilter-p.5)"><use xlink:href="#p.5" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.5" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.5"><path fill="#cfe2f3" d="m319.65353 50.88714l75.68506 0l0 50.204727l-75.68506 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m319.65353 50.88714l75.68506 0l0 50.204727l-75.68506 0z" fill-rule="evenodd"/><path fill="#000000" d="m340.1009 86.612625l0 -17.0625l3.609375 0l0 1.359375l-1.96875 0l0 14.34375l1.96875 0l0 1.359375l-3.609375 0zm4.6989136 -10.296875q0 -2.359375 0.484375 -3.796875q0.484375 -1.453125 1.4375 -2.234375q0.96875 -0.78125 2.421875 -0.78125q1.078125 0 1.890625 0.4375q0.8125 0.421875 1.328125 1.25q0.53125 0.8125 0.828125 1.984375q0.3125 1.15625 0.3125 3.140625q0 2.359375 -0.484375 3.8125q-0.484375 1.4375 -1.453125 2.234375q-0.953125 0.78125 -2.421875 0.78125q-1.921875 0 -3.03125 -1.390625q-1.3125 -1.671875 -1.3125 -5.4375zm1.671875 0q0 3.296875 0.765625 4.390625q0.78125 1.078125 1.90625 1.078125q1.140625 0 1.90625 -1.09375q0.765625 -1.09375 0.765625 -4.375q0 -3.296875 -0.765625 -4.375q-0.765625 -1.078125 -1.921875 -1.078125q-1.125 0 -1.796875 0.953125q-0.859375 1.21875 -0.859375 4.5zm9.578857 6.59375l0 -1.875l1.875 0l0 1.875q0 1.03125 -0.375 1.65625q-0.359375 0.640625 -1.15625 0.984375l-0.453125 -0.703125q0.515625 -0.21875 0.765625 -0.671875q0.25 -0.4375 0.28125 -1.265625l-0.9375 0zm8.370789 0l0 -11.78125l-4.40625 0l0 -1.578125l10.578125 0l0 1.578125l-4.40625 0l0 11.78125l-1.765625 0zm10.520996 3.703125l-3.609375 0l0 -1.359375l1.96875 0l0 -14.34375l-1.96875 0l0 -1.359375l3.609375 0l0 17.0625z" fill-rule="nonzero"/></g><g filter="url(#shadowFilter-p.6)"><use xlink:href="#p.6" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.6" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.6"><path fill="#fce5cd" d="m37.212597 504.51968l170.23622 0l0 64.37796l-170.23622 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m37.212597 504.51968l170.23622 0l0 64.37796l-170.23622 0z" fill-rule="evenodd"/></g><g filter="url(#shadowFilter-p.7)"><use xlink:href="#p.7" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.7" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.7"><path fill="#8e7cc3" d="m45.60635 514.08923l42.110233 0l0 40.06299l-42.110233 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m45.60635 514.08923l42.110233 0l0 40.06299l-42.110233 0z" fill-rule="evenodd"/><path fill="#000000" d="m57.0458 544.74384l0 -17.0625l3.609375 0l0 1.359375l-1.96875 0l0 14.34375l1.96875 0l0 1.359375l-3.609375 0zm8.761429 -3.703125l0 -11.78125l-4.40625 0l0 -1.578125l10.578125 0l0 1.578125l-4.40625 0l0 11.78125l-1.765625 0zm10.520981 3.703125l-3.609375 0l0 -1.359375l1.96875 0l0 -14.34375l-1.96875 0l0 -1.359375l3.609375 0l0 17.0625z" fill-rule="nonzero"/></g><g filter="url(#shadowFilter-p.8)"><use xlink:href="#p.8" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.8" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.8"><path fill="#8e7cc3" d="m95.49213 527.4908l16.09449 0l0 -6.629883l13.259842 13.259827l-13.259842 13.259827l0 -6.629883l-16.09449 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m95.49213 527.4908l16.09449 0l0 -6.629883l13.259842 13.259827l-13.259842 13.259827l0 -6.629883l-16.09449 0z" fill-rule="evenodd"/></g><g filter="url(#shadowFilter-p.9)"><use xlink:href="#p.9" transform="matrix(1.0 0.0 0.0 1.0 0.0 2.0)"/></g><defs><filter id="shadowFilter-p.9" filterUnits="userSpaceOnUse"><feGaussianBlur in="SourceAlpha" stdDeviation="2.0" result="blur"/><feComponentTransfer in="blur" color-interpolation-filters="sRGB"><feFuncR type="linear" slope="0" intercept="0.0"/><feFuncG type="linear" slope="0" intercept="0.0"/><feFuncB type="linear" slope="0" intercept="0.0"/><feFuncA type="linear" slope="0.5" intercept="0"/></feComponentTransfer></filter></defs><g id="p.9"><path fill="#8e7cc3" d="m132.62463 514.08923l65.07086 0l0 40.06299l-65.07086 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m132.62463 514.08923l65.07086 0l0 40.06299l-65.07086 0z" fill-rule="evenodd"/><path fill="#000000" d="m147.7649 544.74384l0 -17.0625l3.609375 0l0 1.359375l-1.96875 0l0 14.34375l1.96875 0l0 1.359375l-3.609375 0zm8.761429 -3.703125l0 -11.78125l-4.40625 0l0 -1.578125l10.578125 0l0 1.578125l-4.40625 0l0 11.78125l-1.765625 0zm8.208481 0l0 -1.875l1.875 0l0 1.875q0 1.03125 -0.375 1.65625q-0.359375 0.640625 -1.15625 0.984375l-0.453125 -0.703125q0.515625 -0.21875 0.765625 -0.671875q0.25 -0.4375 0.28125 -1.265625l-0.9375 0zm4.308304 -6.59375q0 -2.359375 0.484375 -3.796875q0.484375 -1.453125 1.4375 -2.234375q0.96875 -0.78125 2.421875 -0.78125q1.078125 0 1.890625 0.4375q0.8125 0.421875 1.328125 1.25q0.53125 0.8125 0.828125 1.984375q0.3125 1.15625 0.3125 3.140625q0 2.359375 -0.484375 3.8125q-0.484375 1.4375 -1.453125 2.234375q-0.953125 0.78125 -2.421875 0.78125q-1.921875 0 -3.03125 -1.390625q-1.3125 -1.671875 -1.3125 -5.4375zm1.671875 0q0 3.296875 0.765625 4.390625q0.78125 1.078125 1.90625 1.078125q1.140625 0 1.90625 -1.09375q0.765625 -1.09375 0.765625 -4.375q0 -3.296875 -0.765625 -4.375q-0.765625 -1.078125 -1.921875 -1.078125q-1.125 0 -1.796875 0.953125q-0.859375 1.21875 -0.859375 4.5zm11.891342 10.296875l-3.609375 0l0 -1.359375l1.96875 0l0 -14.34375l-1.96875 0l0 -1.359375l3.609375 0l0 17.0625z" fill-rule="nonzero"/></g><path fill="#000000" fill-opacity="0.0" d="m203.28871 129.22966l206.58269 0l0 36.283463l-206.58269 0z" fill-rule="evenodd"/><path fill="#000000" d="m238.79514 156.14966l-3.53125 -13.359375l1.8125 0l2.03125 8.765625q0.328125 1.375 0.5625 2.71875q0.5 -2.140625 0.59375 -2.46875l2.546875 -9.015625l2.125 0l1.921875 6.765625q0.71875 2.515625 1.03125 4.71875q0.265625 -1.265625 0.671875 -2.890625l2.09375 -8.59375l1.78125 0l-3.671875 13.359375l-1.703125 0l-2.8125 -10.171875q-0.359375 -1.28125 -0.421875 -1.5625q-0.203125 0.90625 -0.390625 1.5625l-2.828125 10.171875l-1.8125 0zm15.077408 -11.46875l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm4.144821 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm16.641327 0l0 -1.21875q-0.90625 1.4375 -2.703125 1.4375q-1.15625 0 -2.125 -0.640625q-0.96875 -0.640625 -1.5 -1.78125q-0.53125 -1.140625 -0.53125 -2.625q0 -1.453125 0.484375 -2.625q0.484375 -1.1875 1.4375 -1.8125q0.96875 -0.625 2.171875 -0.625q0.875 0 1.546875 0.375q0.6875 0.359375 1.109375 0.953125l0 -4.796875l1.640625 0l0 13.359375l-1.53125 0zm-5.171875 -4.828125q0 1.859375 0.78125 2.78125q0.78125 0.921875 1.84375 0.921875q1.078125 0 1.828125 -0.875q0.75 -0.890625 0.75 -2.6875q0 -1.984375 -0.765625 -2.90625q-0.765625 -0.9375 -1.890625 -0.9375q-1.078125 0 -1.8125 0.890625q-0.734375 0.890625 -0.734375 2.8125zm8.672607 -0.015625q0 -2.6875 1.484375 -3.96875q1.25 -1.078125 3.046875 -1.078125q2.0 0 3.265625 1.3125q1.265625 1.296875 1.265625 3.609375q0 1.859375 -0.5625 2.9375q-0.5625 1.0625 -1.640625 1.65625q-1.0625 0.59375 -2.328125 0.59375q-2.03125 0 -3.28125 -1.296875q-1.25 -1.3125 -1.25 -3.765625zm1.6875 0q0 1.859375 0.796875 2.796875q0.8125 0.921875 2.046875 0.921875q1.21875 0 2.03125 -0.921875q0.8125 -0.9375 0.8125 -2.84375q0 -1.796875 -0.8125 -2.71875q-0.8125 -0.921875 -2.03125 -0.921875q-1.234375 0 -2.046875 0.921875q-0.796875 0.90625 -0.796875 2.765625zm11.078827 4.84375l-2.96875 -9.671875l1.703125 0l1.53125 5.578125l0.578125 2.078125q0.046875 -0.15625 0.5 -2.0l1.546875 -5.65625l1.6875 0l1.4375 5.609375l0.484375 1.84375l0.5625 -1.859375l1.65625 -5.59375l1.59375 0l-3.03125 9.671875l-1.703125 0l-1.53125 -5.796875l-0.375 -1.640625l-1.953125 7.4375l-1.71875 0zm17.03125 0l0 -13.359375l2.65625 0l3.15625 9.453125q0.4375 1.328125 0.640625 1.984375q0.234375 -0.734375 0.703125 -2.140625l3.203125 -9.296875l2.375 0l0 13.359375l-1.703125 0l0 -11.171875l-3.875 11.171875l-1.59375 0l-3.859375 -11.375l0 11.375l-1.703125 0zm21.697052 -1.1875q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.10940552 0.59375 0.40628052 1.15625l-1.7031555 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm4.2038574 8.5625l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm8.891357 8.484375l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm8.891327 -6.6875l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm4.1448364 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm10.063202 0.796875l1.59375 0.234375q0.109375 0.75 0.5625 1.078125q0.609375 0.453125 1.671875 0.453125q1.140625 0 1.75 -0.453125q0.625 -0.453125 0.84375 -1.265625q0.125 -0.5 0.109375 -2.109375q-1.0625 1.265625 -2.671875 1.265625q-2.0 0 -3.09375 -1.4375q-1.09375 -1.4375 -1.09375 -3.453125q0 -1.390625 0.5 -2.5625q0.515625 -1.171875 1.453125 -1.796875q0.953125 -0.640625 2.25 -0.640625q1.703125 0 2.8125 1.375l0 -1.15625l1.515625 0l0 8.359375q0 2.265625 -0.46875 3.203125q-0.453125 0.9375 -1.453125 1.484375q-0.984375 0.546875 -2.453125 0.546875q-1.71875 0 -2.796875 -0.78125q-1.0625 -0.765625 -1.03125 -2.34375zm1.359375 -5.8125q0 1.90625 0.75 2.78125q0.765625 0.875 1.90625 0.875q1.125 0 1.890625 -0.859375q0.765625 -0.875 0.765625 -2.734375q0 -1.78125 -0.796875 -2.671875q-0.78125 -0.90625 -1.890625 -0.90625q-1.09375 0 -1.859375 0.890625q-0.765625 0.875 -0.765625 2.625z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m19.03937 583.0066l206.58267 0l0 36.283447l-206.58267 0z" fill-rule="evenodd"/><path fill="#000000" d="m37.71623 604.6922l0 -1.578125l5.65625 0l0 4.953125q-1.296875 1.046875 -2.6875 1.578125q-1.375 0.515625 -2.84375 0.515625q-1.96875 0 -3.578125 -0.84375q-1.609375 -0.84375 -2.421877 -2.4375q-0.8125 -1.59375 -0.8125 -3.5625q0 -1.953125 0.8125 -3.640625q0.8125019 -1.6875 2.343752 -2.5q1.53125 -0.828125 3.515625 -0.828125q1.453125 0 2.625 0.46875q1.171875 0.46875 1.828125 1.3125q0.671875 0.828125 1.015625 2.171875l-1.59375 0.4375q-0.296875 -1.015625 -0.75 -1.59375q-0.4375 -0.59375 -1.265625 -0.9375q-0.828125 -0.34375 -1.84375 -0.34375q-1.203125 0 -2.09375 0.375q-0.890625 0.359375 -1.4375 0.96875q-0.53125 0.59375 -0.828125 1.3125q-0.515625 1.234375 -0.515625 2.6875q0 1.78125 0.609375 2.984375q0.625 1.203125 1.796875 1.796875q1.171875 0.578125 2.5 0.578125q1.140625 0 2.234375 -0.4375q1.09375 -0.453125 1.65625 -0.953125l0 -2.484375l-3.921875 0zm14.370804 4.046875q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm7.781967 3.390625l0.234375 1.453125q-0.6875 0.140625 -1.234375 0.140625q-0.890625 0 -1.390625 -0.28125q-0.484375 -0.28125 -0.6875 -0.734375q-0.203125 -0.46875 -0.203125 -1.9375l0 -5.578125l-1.203125 0l0 -1.265625l1.203125 0l0 -2.390625l1.625 -0.984375l0 3.375l1.65625 0l0 1.265625l-1.65625 0l0 5.671875q0 0.6875 0.078125 0.890625q0.09375 0.203125 0.28125 0.328125q0.203125 0.109375 0.578125 0.109375q0.265625 0 0.71875 -0.0625zm1.6051788 1.46875l0 -13.359375l1.640625 0l0 4.796875q1.1406212 -1.328125 2.8906212 -1.328125q1.078125 0 1.859375 0.421875q0.796875 0.421875 1.140625 1.171875q0.34375 0.75 0.34375 2.171875l0 6.125l-1.640625 0l0 -6.125q0 -1.234375 -0.53125 -1.796875q-0.53125 -0.5625 -1.515625 -0.5625q-0.71875 0 -1.359375 0.390625q-0.6406212 0.375 -0.9218712 1.015625q-0.265625 0.640625 -0.265625 1.78125l0 5.296875l-1.640625 0zm17.000713 -3.109375l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm9.125717 5.765625l0 -9.671875l1.46875 0l0 1.46875q0.5625 -1.03125 1.03125 -1.359375q0.484375 -0.328125 1.0625 -0.328125q0.828125 0 1.6875 0.53125l-0.5625 1.515625q-0.609375 -0.359375 -1.203125 -0.359375q-0.546875 0 -0.96875 0.328125q-0.421875 0.328125 -0.609375 0.890625q-0.28125 0.875 -0.28125 1.921875l0 5.0625l-1.625 0zm11.911606 0l0 -13.359375l1.765625 0l0 13.359375l-1.765625 0zm4.683304 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm16.641342 0l0 -1.21875q-0.90625 1.4375 -2.703125 1.4375q-1.15625 0 -2.125 -0.640625q-0.96875 -0.640625 -1.5 -1.78125q-0.53125 -1.140625 -0.53125 -2.625q0 -1.453125 0.484375 -2.625q0.484375 -1.1875 1.4375 -1.8125q0.96875 -0.625 2.171875 -0.625q0.875 0 1.546875 0.375q0.6875 0.359375 1.109375 0.953125l0 -4.796875l1.640625 0l0 13.359375l-1.53125 0zm-5.171875 -4.828125q0 1.859375 0.78125 2.78125q0.78125 0.921875 1.84375 0.921875q1.078125 0 1.828125 -0.875q0.75 -0.890625 0.75 -2.6875q0 -1.984375 -0.765625 -2.90625q-0.765625 -0.9375 -1.890625 -0.9375q-1.078125 0 -1.8125 0.890625q-0.734375 0.890625 -0.734375 2.8125zm15.906967 1.71875l1.6875 0.203125q-0.40625 1.484375 -1.484375 2.3125q-1.078125 0.8125 -2.765625 0.8125q-2.125 0 -3.375 -1.296875q-1.234375 -1.3125 -1.234375 -3.671875q0 -2.453125 1.25 -3.796875q1.265625 -1.34375 3.265625 -1.34375q1.9375 0 3.15625 1.328125q1.234375 1.3125 1.234375 3.703125q0 0.15625 0 0.4375l-7.21875 0q0.09375 1.59375 0.90625 2.453125q0.8125 0.84375 2.015625 0.84375q0.90625 0 1.546875 -0.46875q0.640625 -0.484375 1.015625 -1.515625zm-5.390625 -2.65625l5.40625 0q-0.109375 -1.21875 -0.625 -1.828125q-0.78125 -0.953125 -2.03125 -0.953125q-1.125 0 -1.90625 0.765625q-0.765625 0.75 -0.84375 2.015625zm8.0476 5.765625l3.53125 -5.03125l-3.265625 -4.640625l2.046875 0l1.484375 2.265625q0.421875 0.640625 0.671875 1.078125q0.40625 -0.59375 0.734375 -1.0625l1.640625 -2.28125l1.953125 0l-3.34375 4.546875l3.59375 5.125l-2.015625 0l-1.984375 -3.0l-0.515625 -0.8125l-2.546875 3.8125l-1.984375 0zm15.761429 0l0 -13.359375l2.65625 0l3.15625 9.453125q0.4375 1.328125 0.640625 1.984375q0.234375 -0.734375 0.703125 -2.140625l3.203125 -9.296875l2.375 0l0 13.359375l-1.703125 0l0 -11.171875l-3.875 11.171875l-1.59375 0l-3.859375 -11.375l0 11.375l-1.703125 0zm21.697052 -1.1875q-0.921875 0.765625 -1.765625 1.09375q-0.828125 0.3125 -1.796875 0.3125q-1.59375 0 -2.453125 -0.78125q-0.859375 -0.78125 -0.859375 -1.984375q0 -0.71875 0.328125 -1.296875q0.328125 -0.59375 0.84375 -0.9375q0.53125 -0.359375 1.1875 -0.546875q0.46875 -0.125 1.453125 -0.25q1.984375 -0.234375 2.921875 -0.5625q0.015625 -0.34375 0.015625 -0.421875q0 -1.0 -0.46875 -1.421875q-0.625 -0.546875 -1.875 -0.546875q-1.15625 0 -1.703125 0.40625q-0.546875 0.40625 -0.8125 1.421875l-1.609375 -0.21875q0.21875 -1.015625 0.71875 -1.640625q0.5 -0.640625 1.453125 -0.984375q0.953125 -0.34375 2.1875 -0.34375q1.25 0 2.015625 0.296875q0.78125 0.28125 1.140625 0.734375q0.375 0.4375 0.515625 1.109375q0.078125 0.421875 0.078125 1.515625l0 2.1875q0 2.28125 0.109375 2.890625q0.109375 0.59375 0.40625 1.15625l-1.703125 0q-0.265625 -0.515625 -0.328125 -1.1875zm-0.140625 -3.671875q-0.890625 0.375 -2.671875 0.625q-1.015625 0.140625 -1.4375 0.328125q-0.421875 0.1875 -0.65625 0.53125q-0.21875 0.34375 -0.21875 0.78125q0 0.65625 0.5 1.09375q0.5 0.4375 1.453125 0.4375q0.9375 0 1.671875 -0.40625q0.75 -0.421875 1.09375 -1.140625q0.265625 -0.5625 0.265625 -1.640625l0 -0.609375zm4.203827 8.5625l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm8.891357 8.484375l0 -13.375l1.484375 0l0 1.25q0.53125 -0.734375 1.1875 -1.09375q0.671875 -0.375 1.625 -0.375q1.234375 0 2.171875 0.640625q0.953125 0.625 1.4375 1.796875q0.484375 1.15625 0.484375 2.546875q0 1.484375 -0.53125 2.671875q-0.53125 1.1875 -1.546875 1.828125q-1.015625 0.625 -2.140625 0.625q-0.8125 0 -1.46875 -0.34375q-0.65625 -0.34375 -1.0625 -0.875l0 4.703125l-1.640625 0zm1.484375 -8.484375q0 1.859375 0.75 2.765625q0.765625 0.890625 1.828125 0.890625q1.09375 0 1.875 -0.921875q0.78125 -0.9375 0.78125 -2.875q0 -1.84375 -0.765625 -2.765625q-0.75 -0.921875 -1.8125 -0.921875q-1.046875 0 -1.859375 0.984375q-0.796875 0.96875 -0.796875 2.84375zm8.891327 -6.6875l0 -1.890625l1.640625 0l0 1.890625l-1.640625 0zm0 11.46875l0 -9.671875l1.640625 0l0 9.671875l-1.640625 0zm4.1448364 0l0 -9.671875l1.46875 0l0 1.375q1.0625 -1.59375 3.078125 -1.59375q0.875 0 1.609375 0.3125q0.734375 0.3125 1.09375 0.828125q0.375 0.5 0.515625 1.203125q0.09375 0.453125 0.09375 1.59375l0 5.953125l-1.640625 0l0 -5.890625q0 -1.0 -0.203125 -1.484375q-0.1875 -0.5 -0.671875 -0.796875q-0.484375 -0.296875 -1.140625 -0.296875q-1.046875 0 -1.8125 0.671875q-0.75 0.65625 -0.75 2.515625l0 5.28125l-1.640625 0zm10.063202 0.796875l1.59375 0.234375q0.109375 0.75 0.5625 1.078125q0.609375 0.453125 1.671875 0.453125q1.140625 0 1.75 -0.453125q0.625 -0.453125 0.84375 -1.265625q0.125 -0.5 0.109375 -2.109375q-1.0625 1.265625 -2.671875 1.265625q-2.0 0 -3.09375 -1.4375q-1.09375 -1.4375 -1.09375 -3.453125q0 -1.390625 0.5 -2.5625q0.515625 -1.171875 1.453125 -1.796875q0.953125 -0.640625 2.25 -0.640625q1.703125 0 2.8125 1.375l0 -1.15625l1.515625 0l0 8.359375q0 2.265625 -0.46875 3.203125q-0.453125 0.9375 -1.453125 1.484375q-0.984375 0.546875 -2.453125 0.546875q-1.71875 0 -2.796875 -0.78125q-1.0625 -0.765625 -1.03125 -2.34375zm1.359375 -5.8125q0 1.90625 0.75 2.78125q0.765625 0.875 1.90625 0.875q1.125 0 1.890625 -0.859375q0.765625 -0.875 0.765625 -2.734375q0 -1.78125 -0.796875 -2.671875q-0.78125 -0.90625 -1.890625 -0.90625q-1.09375 0 -1.859375 0.890625q-0.765625 0.875 -0.765625 2.625z" fill-rule="nonzero"/><path fill="#ffffff" d="m423.90027 397.3491l7.7480164 -7.748047l7.748047 7.748047l-3.8740234 0l0 12.346436l-7.7480164 0l0 -12.346436z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m423.90027 397.3491l7.7480164 -7.748047l7.748047 7.748047l-3.8740234 0l0 12.346436l-7.7480164 0l0 -12.346436z" fill-rule="evenodd"/><path fill="#ffffff" d="m378.87665 412.69553l105.543304 0l0 20.094513l-105.543304 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m378.87665 412.69553l105.543304 0l0 20.094513l-105.543304 0z" fill-rule="evenodd"/><path fill="#000000" d="m389.12665 427.5428l0 -9.546875l1.25 0l0 9.546875l-1.25 0zm3.3279724 0l0 -6.90625l1.0625 0l0 0.984375q0.75 -1.140625 2.1875 -1.140625q0.625 0 1.15625 0.21875q0.53125 0.21875 0.78125 0.59375q0.265625 0.359375 0.375 0.859375q0.0625 0.328125 0.0625 1.140625l0 4.25l-1.171875 0l0 -4.203125q0 -0.71875 -0.140625 -1.0625q-0.140625 -0.359375 -0.484375 -0.5625q-0.34375 -0.21875 -0.8125 -0.21875q-0.75 0 -1.296875 0.46875q-0.546875 0.46875 -0.546875 1.796875l0 3.78125l-1.171875 0zm7.4124756 2.65625l0 -9.5625l1.078125 0l0 0.890625q0.375 -0.53125 0.84375 -0.78125q0.484375 -0.265625 1.15625 -0.265625q0.875 0 1.546875 0.453125q0.6875 0.453125 1.03125 1.28125q0.34375 0.828125 0.34375 1.828125q0 1.046875 -0.375 1.90625q-0.375 0.84375 -1.109375 1.296875q-0.71875 0.453125 -1.53125 0.453125q-0.578125 0 -1.046875 -0.25q-0.46875 -0.25 -0.765625 -0.625l0 3.375l-1.171875 0zm1.0625 -6.078125q0 1.34375 0.53125 1.984375q0.546875 0.625 1.3125 0.625q0.78125 0 1.34375 -0.65625q0.5625 -0.65625 0.5625 -2.046875q0 -1.3125 -0.546875 -1.96875q-0.546875 -0.671875 -1.296875 -0.671875q-0.75 0 -1.328125 0.703125q-0.578125 0.703125 -0.578125 2.03125zm10.881226 3.421875l0 -1.015625q-0.8125 1.171875 -2.1875 1.171875q-0.609375 0 -1.140625 -0.234375q-0.53125 -0.234375 -0.796875 -0.578125q-0.25 -0.359375 -0.359375 -0.875q-0.0625 -0.34375 -0.0625 -1.09375l0 -4.28125l1.171875 0l0 3.828125q0 0.921875 0.0625 1.234375q0.109375 0.46875 0.46875 0.734375q0.359375 0.25 0.890625 0.25q0.515625 0 0.984375 -0.265625q0.46875 -0.265625 0.65625 -0.734375q0.1875 -0.46875 0.1875 -1.34375l0 -3.703125l1.171875 0l0 6.90625l-1.046875 0zm5.4437256 -1.046875l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875zm5.0153503 1.046875l0 -9.546875l6.90625 0l0 1.125l-5.640625 0l0 2.921875l5.28125 0l0 1.125l-5.28125 0l0 3.25l5.859375 0l0 1.125l-7.125 0zm8.7022705 0l0 -9.546875l1.171875 0l0 9.546875l-1.171875 0zm7.71109 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.046875 0l0 0.96875q0.328125 -0.515625 0.859375 -0.8125q0.546875 -0.3125 1.234375 -0.3125q0.78125 0 1.265625 0.3125q0.484375 0.3125 0.6875 0.890625q0.828125 -1.203125 2.140625 -1.203125q1.03125 0 1.578125 0.578125q0.5625 0.5625 0.5625 1.734375l0 4.75l-1.171875 0l0 -4.359375q0 -0.703125 -0.125 -1.0q-0.109375 -0.3125 -0.40625 -0.5q-0.296875 -0.1875 -0.703125 -0.1875q-0.71875 0 -1.203125 0.484375q-0.484375 0.484375 -0.484375 1.546875l0 4.015625l-1.171875 0l0 -4.484375q0 -0.78125 -0.296875 -1.171875q-0.28125 -0.390625 -0.921875 -0.390625q-0.5 0 -0.921875 0.265625q-0.421875 0.25 -0.609375 0.75q-0.1875 0.5 -0.1875 1.453125l0 3.578125l-1.171875 0zm15.836792 -2.21875l1.203125 0.140625q-0.28125 1.0625 -1.0625 1.65625q-0.765625 0.578125 -1.96875 0.578125q-1.515625 0 -2.40625 -0.9375q-0.890625 -0.9375 -0.890625 -2.609375q0 -1.75 0.890625 -2.703125q0.90625 -0.96875 2.34375 -0.96875q1.390625 0 2.265625 0.9375q0.875 0.9375 0.875 2.65625q0 0.109375 0 0.3125l-5.15625 0q0.0625 1.140625 0.640625 1.75q0.578125 0.59375 1.4375 0.59375q0.65625 0 1.109375 -0.328125q0.453125 -0.34375 0.71875 -1.078125zm-3.84375 -1.90625l3.859375 0q-0.078125 -0.859375 -0.4375 -1.296875q-0.5625 -0.6875 -1.453125 -0.6875q-0.8125 0 -1.359375 0.546875q-0.546875 0.53125 -0.609375 1.4375zm6.5218506 4.125l0 -6.90625l1.0625 0l0 0.984375q0.75 -1.140625 2.1875 -1.140625q0.625 0 1.15625 0.21875q0.53125 0.21875 0.78125 0.59375q0.265625 0.359375 0.375 0.859375q0.0625 0.328125 0.0625 1.140625l0 4.25l-1.171875 0l0 -4.203125q0 -0.71875 -0.140625 -1.0625q-0.140625 -0.359375 -0.484375 -0.5625q-0.34375 -0.21875 -0.8125 -0.21875q-0.75 0 -1.296875 0.46875q-0.546875 0.46875 -0.546875 1.796875l0 3.78125l-1.171875 0zm9.974976 -1.046875l0.171875 1.03125q-0.5 0.109375 -0.890625 0.109375q-0.640625 0 -1.0 -0.203125q-0.34375 -0.203125 -0.484375 -0.53125q-0.140625 -0.328125 -0.140625 -1.390625l0 -3.96875l-0.859375 0l0 -0.90625l0.859375 0l0 -1.71875l1.171875 -0.703125l0 2.421875l1.171875 0l0 0.90625l-1.171875 0l0 4.046875q0 0.5 0.046875 0.640625q0.0625 0.140625 0.203125 0.234375q0.140625 0.078125 0.40625 0.078125q0.203125 0 0.515625 -0.046875z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m340.06036 409.17322l42.11023 0l0 36.283478l-42.11023 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m340.06036 409.17322l42.11023 0l0 36.283478l-42.11023 0z" fill-rule="evenodd"/><path fill="#000000" d="m354.1047 427.47433q0 -2.375 1.28125 -3.71875q1.28125 -1.34375 3.296875 -1.34375q1.3125 0 2.375 0.625q1.0625 0.625 1.609375 1.765625q0.5625 1.125 0.5625 2.5625q0 1.4375 -0.59375 2.59375q-0.578125 1.140625 -1.65625 1.734375q-1.0625 0.578125 -2.3125 0.578125q-1.34375 0 -2.40625 -0.640625q-1.0625 -0.65625 -1.609375 -1.78125q-0.546875 -1.125 -0.546875 -2.375zm1.3125 0.015625q0 1.71875 0.921875 2.71875q0.921875 0.984375 2.328125 0.984375q1.421875 0 2.34375 -1.0q0.921875 -1.0 0.921875 -2.84375q0 -1.15625 -0.40625 -2.03125q-0.390625 -0.875 -1.15625 -1.34375q-0.75 -0.484375 -1.6875 -0.484375q-1.34375 0 -2.3125 0.921875q-0.953125 0.921875 -0.953125 3.078125z" fill-rule="nonzero"/><path fill="#000000" d="m364.20612 433.97433q0 -1.125 0.21875 -1.8125q0.234375 -0.6875 0.6875 -1.046875q0.46875 -0.375 1.15625 -0.375q0.515625 0 0.890625 0.203125q0.390625 0.203125 0.640625 0.59375q0.25 0.390625 0.390625 0.953125q0.15625 0.546875 0.15625 1.484375q0 1.125 -0.234375 1.8125q-0.234375 0.6875 -0.6875 1.0625q-0.453125 0.375 -1.15625 0.375q-0.921875 0 -1.4375 -0.65625q-0.625 -0.796875 -0.625 -2.59375zm0.796875 0q0 1.578125 0.359375 2.09375q0.375 0.515625 0.90625 0.515625q0.546875 0 0.90625 -0.515625q0.359375 -0.53125 0.359375 -2.09375q0 -1.5625 -0.359375 -2.078125q-0.359375 -0.515625 -0.90625 -0.515625q-0.546875 0 -0.859375 0.453125q-0.40625 0.578125 -0.40625 2.140625z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m13.074861 221.6693l35.716537 0l0 30.771652l-35.716537 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m13.074861 221.6693l35.716537 0l0 30.771652l-35.716537 0z" fill-rule="evenodd"/><path fill="#000000" d="m28.781715 238.12074l0 -1.125l4.03125 -0.015625l0 3.546875q-0.921875 0.75 -1.921875 1.125q-0.984375 0.359375 -2.03125 0.359375q-1.40625 0 -2.5625 -0.59375q-1.140625 -0.609375 -1.734375 -1.734375q-0.578125 -1.140625 -0.578125 -2.546875q0 -1.40625 0.578125 -2.609375q0.59375 -1.203125 1.6875 -1.78125q1.09375 -0.59375 2.515625 -0.59375q1.03125 0 1.859375 0.34375q0.84375 0.328125 1.3125 0.9375q0.484375 0.59375 0.734375 1.546875l-1.140625 0.3125q-0.21875 -0.71875 -0.53125 -1.140625q-0.3125 -0.421875 -0.90625 -0.671875q-0.59375 -0.25 -1.3125 -0.25q-0.875 0 -1.515625 0.265625q-0.625 0.265625 -1.015625 0.703125q-0.375 0.421875 -0.59375 0.9375q-0.359375 0.875 -0.359375 1.921875q0 1.265625 0.4375 2.125q0.4375 0.859375 1.265625 1.28125q0.84375 0.421875 1.796875 0.421875q0.8125 0 1.59375 -0.3125q0.78125 -0.328125 1.1875 -0.6875l0 -1.765625l-2.796875 0z" fill-rule="nonzero"/><path fill="#000000" d="m34.023766 243.7145q0 -1.125 0.21875 -1.8125q0.234375 -0.6875 0.6875 -1.046875q0.46875 -0.375 1.15625 -0.375q0.515625 0 0.890625 0.203125q0.390625 0.203125 0.640625 0.59375q0.25 0.390625 0.390625 0.953125q0.15625 0.546875 0.15625 1.484375q0 1.125 -0.234375 1.8125q-0.234375 0.6875 -0.6875 1.0625q-0.453125 0.375 -1.15625 0.375q-0.921875 0 -1.4375 -0.65625q-0.625 -0.796875 -0.625 -2.59375zm0.796875 0q0 1.578125 0.359375 2.09375q0.375 0.515625 0.90625 0.515625q0.546875 0 0.90625 -0.515625q0.359375 -0.53125 0.359375 -2.09375q0 -1.5625 -0.359375 -2.078125q-0.359375 -0.515625 -0.90625 -0.515625q-0.546875 0 -0.859375 0.453125q-0.40625 0.578125 -0.40625 2.140625z" fill-rule="nonzero"/><path fill="#cfe2f3" d="m81.55118 158.1693l35.716537 0l0 30.771652l-35.716537 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m81.55118 158.1693l35.716537 0l0 30.771652l-35.716537 0z" fill-rule="evenodd"/><path fill="#000000" d="m97.25803 174.62074l0 -1.125l4.03125 -0.015625l0 3.546875q-0.921875 0.75 -1.921875 1.125q-0.984375 0.359375 -2.03125 0.359375q-1.40625 0 -2.5625 -0.59375q-1.140625 -0.609375 -1.734375 -1.734375q-0.578125 -1.140625 -0.578125 -2.546875q0 -1.40625 0.578125 -2.609375q0.59375 -1.203125 1.6875 -1.78125q1.09375 -0.59375 2.515625 -0.59375q1.03125 0 1.859375 0.34375q0.84375 0.328125 1.3125 0.9375q0.484375 0.59375 0.734375 1.546875l-1.140625 0.3125q-0.21875 -0.71875 -0.53125 -1.140625q-0.3125 -0.421875 -0.90625 -0.671875q-0.59375 -0.25 -1.3125 -0.25q-0.875 0 -1.515625 0.265625q-0.625 0.265625 -1.015625 0.703125q-0.375 0.421875 -0.59375 0.9375q-0.359375 0.875 -0.359375 1.921875q0 1.265625 0.4375 2.125q0.4375 0.859375 1.265625 1.28125q0.84375 0.421875 1.796875 0.421875q0.8125 0 1.59375 -0.3125q0.78125 -0.328125 1.1875 -0.6875l0 -1.765625l-2.796875 0z" fill-rule="nonzero"/><path fill="#000000" d="m105.437584 183.35512l-0.78125 0l0 -4.96875q-0.28125 0.265625 -0.75 0.53125q-0.453125 0.265625 -0.8125 0.40625l0 -0.75q0.65625 -0.3125 1.140625 -0.75q0.484375 -0.4375 0.6875 -0.84375l0.515625 0l0 6.375z" fill-rule="nonzero"/><path fill="#d9d2e9" d="m61.208965 239.81032l38.198093 0l0 40.06337l-38.198093 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m61.208965 239.81032l38.198093 0l0 40.06337l-38.198093 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m99.40706 239.81032l38.19809 0l0 40.06337l-38.19809 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m99.40706 239.81032l38.19809 0l0 40.06337l-38.19809 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m175.80324 239.81032l38.19809 0l0 40.06337l-38.19809 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m175.80324 239.81032l38.19809 0l0 40.06337l-38.19809 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m137.60515 239.81032l38.19809 0l0 40.06337l-38.19809 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m137.60515 239.81032l38.19809 0l0 40.06337l-38.19809 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m61.208965 319.93704l38.198093 0l0 40.063385l-38.198093 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m61.208965 319.93704l38.198093 0l0 40.063385l-38.198093 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m99.40706 319.93704l38.19809 0l0 40.063385l-38.19809 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m99.40706 319.93704l38.19809 0l0 40.063385l-38.19809 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m175.80324 319.93704l38.19809 0l0 40.063385l-38.19809 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m175.80324 319.93704l38.19809 0l0 40.063385l-38.19809 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m137.60515 319.93704l38.19809 0l0 40.063385l-38.19809 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m137.60515 319.93704l38.19809 0l0 40.063385l-38.19809 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m61.208965 279.8737l38.198093 0l0 40.063354l-38.198093 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m61.208965 279.8737l38.198093 0l0 40.063354l-38.198093 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m99.40706 279.8737l38.19809 0l0 40.063354l-38.19809 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m99.40706 279.8737l38.19809 0l0 40.063354l-38.19809 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m175.80324 279.8737l38.19809 0l0 40.063354l-38.19809 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m175.80324 279.8737l38.19809 0l0 40.063354l-38.19809 0z" fill-rule="evenodd"/><path fill="#8e7cc3" d="m137.60582 279.87415l38.19809 0l0 40.063354l-38.19809 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m137.60582 279.87415l38.19809 0l0 40.063354l-38.19809 0z" fill-rule="evenodd"/><path fill="#000000" d="m150.5612 306.82584l5.171875 -6.953125l-4.5625 -6.40625l2.109375 0l2.421875 3.4375q0.75 1.0625 1.078125 1.625q0.4375 -0.71875 1.046875 -1.515625l2.6875 -3.546875l1.921875 0l-4.6875 6.296875l5.0625 7.0625l-2.1875 0l-3.359375 -4.765625q-0.28125 -0.40625 -0.59375 -0.890625q-0.4375 0.734375 -0.625 1.0l-3.359375 4.65625l-2.125 0z" fill-rule="nonzero"/><path fill="#d9d2e9" d="m61.205215 360.00006l38.200317 0l0 40.063354l-38.200317 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m61.205215 360.00006l38.200317 0l0 40.063354l-38.200317 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m99.40553 360.00006l38.200317 0l0 40.063354l-38.200317 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m99.40553 360.00006l38.200317 0l0 40.063354l-38.200317 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m175.80615 360.00006l38.200317 0l0 40.063354l-38.200317 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m175.80615 360.00006l38.200317 0l0 40.063354l-38.200317 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m137.60585 360.00006l38.200302 0l0 40.063354l-38.200302 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m137.60585 360.00006l38.200302 0l0 40.063354l-38.200302 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m61.205215 199.74808l38.200317 0l0 40.06337l-38.200317 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m61.205215 199.74808l38.200317 0l0 40.06337l-38.200317 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m99.40553 199.74808l38.200317 0l0 40.06337l-38.200317 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m99.40553 199.74808l38.200317 0l0 40.06337l-38.200317 0z" fill-rule="evenodd"/><path fill="#d9d2e9" d="m175.80615 199.74808l38.200317 0l0 40.06337l-38.200317 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m175.80615 199.74808l38.200317 0l0 40.06337l-38.200317 0z" fill-rule="evenodd"/><path fill="#b4a7d6" d="m137.60585 199.74808l38.200302 0l0 40.06337l-38.200302 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m137.60585 199.74808l38.200302 0l0 40.06337l-38.200302 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m54.305374 199.74808l0 80.12598" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m54.305374 205.74808l0 68.12598" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m55.957108 205.74808l-1.6517334 -4.538101l-1.6517334 4.538101z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m52.65364 273.87405l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m61.205215 193.27977l76.409454 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m67.205215 193.27977l64.409454 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m67.205215 191.62804l-4.5380974 1.6517334l4.5380974 1.6517334z" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m131.61467 194.9315l4.538086 -1.6517334l-4.538086 -1.6517334z" fill-rule="evenodd"/></g></svg>
\ No newline at end of file
diff --git a/tensorflow/compiler/xla/g3doc/images/send_recv_order.png b/tensorflow/compiler/xla/g3doc/images/send_recv_order.png
new file mode 100644
index 0000000000000000000000000000000000000000..721200e3cb0af984f58cb8594607e5c0a39ddd18
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/send_recv_order.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/send_recv_schedule.png b/tensorflow/compiler/xla/g3doc/images/send_recv_schedule.png
new file mode 100644
index 0000000000000000000000000000000000000000..c830f987ab9b7e53730555d5734ce37bd1854211
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/send_recv_schedule.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure1.png b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure1.png
new file mode 100644
index 0000000000000000000000000000000000000000..00cefe4c7806c1c09dd51499375e720bfb0baac6
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure1.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure2.png b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure2.png
new file mode 100644
index 0000000000000000000000000000000000000000..6439c6e40272ae6b2954e9d7f3de2df470a2b36d
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure2.png differ
diff --git a/tensorflow/compiler/xla/xlalogo.png b/tensorflow/compiler/xla/g3doc/images/xlalogo.png
similarity index 100%
rename from tensorflow/compiler/xla/xlalogo.png
rename to tensorflow/compiler/xla/g3doc/images/xlalogo.png
diff --git a/tensorflow/compiler/xla/g3doc/jit.md b/tensorflow/compiler/xla/g3doc/jit.md
new file mode 100644
index 0000000000000000000000000000000000000000..85fa16ccc7f48a3dce840564e79097c9e136767f
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/jit.md
@@ -0,0 +1,180 @@
+# Using JIT Compilation
+
+> Note: TensorFlow must be compiled from source to include XLA.
+
+## Why use just-in-time (JIT) compilation?
+
+The TensorFlow/XLA JIT compiler compiles and runs parts of TensorFlow graphs via
+XLA. The benefit of this over the standard TensorFlow implementation is that XLA
+can fuse multiple operators (kernel fusion) into a small number of compiled
+kernels. Fusing operators can reduce memory bandwidth requirements and improve
+performance compared to executing operators one-at-a-time, as the TensorFlow
+executor does.
+
+## Running TensorFlow graphs via XLA
+
+There are two ways to run TensorFlow computations via XLA, either by
+JIT-compiling operators placed on a CPU or GPU device, or by placing operators
+on the `XLA_CPU` or `XLA_GPU` TensorFlow devices. Placing operators directly on
+a TensorFlow XLA device forces the operator to run on that device and is mainly
+used for testing.
+
+> Note: The XLA CPU backend supports intra-op parallelism (i.e. it can shard a
+> single operation across multiple cores) but it does not support inter-op
+> parallelism (i.e. it cannot execute independent operations concurrently across
+> multiple cores).  The XLA GPU backend is competitive with the standard
+> TensorFlow implementation, sometimes faster, sometimes slower.
+
+### Turning on JIT compilation
+
+JIT compilation can be turned on at the session level or manually for select
+operations. Both of these approaches are zero-copy --- data does not need to be
+copied when passing data between a compiled XLA kernel and a TensorFlow operator
+placed on the same device.
+
+#### Session
+
+Turning on JIT compilation at the session level will result in all possible
+operators being greedily compiled into XLA computations. Each XLA computation
+will be compiled into one or more kernels for the underlying device.
+
+Subject to a few constraints, if there are two adjacent operators in the graph
+that both have XLA implementations, then they will be compiled into a single XLA
+computation.
+
+JIT compilation is turned on at the session level by setting the
+`global_jit_level` config to `tf.OptimizerOptions.ON_1` and passing the config
+during session initialization.
+
+```python
+# Config to turn on JIT compilation
+config = tf.ConfigProto()
+config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
+
+sess = tf.Session(config=config)
+```
+
+> Note: Turning on JIT at the session level will not result in operations being
+> compiled for the CPU. JIT compilation for CPU operations must be done via
+> the manual method documented below.
+
+#### Manual with experimental_jit_scope()
+
+JIT compilation can also be turned on manually for one or more operators. This
+is done by tagging the operators to compile with the attribute
+`_XlaCompile=true`. The simplest way to do this is via the
+`tf.contrib.compiler.jit.experimental_jit_scope()` scope defined in
+[`tensorflow/contrib/compiler/jit.py`](https://www.tensorflow.org/code/tensorflow/contrib/compiler/jit.py).
+Example usage:
+
+```python
+    jit_scope = tf.contrib.compiler.jit.experimental_jit_scope
+
+    x = tf.placeholder(np.float32)
+    with jit_scope():
+      y = tf.add(x, x)  # The "add" will be compiled with XLA.
+```
+
+The `_XlaCompile` attribute is currently supported on a best-effort basis. If an
+operator cannot be compiled, TensorFlow will silently fall back to the normal
+implementation.
+
+#### Manual with xla.compile()
+
+Unlike experimental_jit_scope() which silently falls back to normal Tensorflow
+on uncompilable operator, xla.compile() returns an explicit error. This is
+useful if you want more predictable behaviors from XLA compilation.
+
+Please see
+[xla.compile() tutorial Colab](./tutorials/xla_compile.ipynb)
+for how to use it.
+
+### Placing operators on XLA devices
+
+Another way to run computations via XLA is to place an operator on a specific
+XLA device. This method is normally only used for testing. Valid targets are
+`XLA_CPU` or `XLA_GPU`.
+
+```python
+with tf.device("/job:localhost/replica:0/task:0/device:XLA_GPU:0"):
+  output = tf.add(input1, input2)
+```
+
+Unlike JIT compilation on the standard CPU and GPU devices, these devices make a
+copy of data when it is transferred on and off the device. The extra copy makes
+it expensive to mix XLA and TensorFlow operators in the same graph.
+
+## Tutorial
+
+This tutorial covers training a simple version of MNIST softmax with JIT turned
+on. Currently JIT at the session level, which is what is used for the tutorial,
+only supports GPU.
+
+Before starting the tutorial verify that the LD_LIBRARY environment variable or
+ldconfig contains `$CUDA_ROOT/extras/CUPTI/lib64`, which contains libraries for
+the CUDA Profiling Tools Interface
+[(CUPTI)](http://docs.nvidia.com/cuda/cupti/index.html). TensorFlow uses CUPTI
+to pull tracing information from the GPU.
+
+### Step #1: Prepare sample script
+
+Download or move
+[mnist_softmax_xla.py](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py)
+into a folder outside of the TensorFlow source tree.
+
+### Step #2: Run without XLA
+
+Execute the python script to train the model without XLA.
+
+```shell
+python mnist_softmax_xla.py --xla=''
+```
+
+Using the Chrome Trace Event Profiler (browse to chrome://tracing),
+open the timeline file created when the script finishes: `timeline.ctf.json`.
+The rendered timeline should look similar to the picture below with multiple
+green boxes labeled `MatMul`, possibly across multiple CPUs.
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="./images/jit_timeline_gpu.png">
+</div>
+
+### Step #3 Run with XLA
+
+Execute the python script to train the model with XLA and turn on a debugging
+feature of XLA via an environmental variable that outputs the XLA graph.
+
+```shell
+XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py
+```
+
+Open the timeline file created (`timeline.ctf.json`).  The rendered timeline
+should look similar to the picture below with one long bar labeled `XlaLaunch`.
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="./images/jit_timeline_gpu_xla.png">
+</div>
+
+To understand what is happening in `XlaLaunch`, look at the console output for
+statements similar to the following:
+
+```shell
+computation cluster_0[_XlaCompiledKernel=true,_XlaNumConstantArgs=1].v82 [CPU:
+pipeline start, before inline]: /tmp/hlo_graph_0.dot
+
+```
+
+The console statements point to the location of `hlo_graph_xx.dot` files that
+contain information about the graph created by XLA. The process that XLA takes
+to fuse Ops is visible by starting at `hlo_graph_0.dot` and viewing each diagram
+in succession.
+
+To Render the .dot file into a png, install
+[GraphViz](https://www.graphviz.org/download/) and run:
+
+```shell
+dot -Tpng hlo_graph_80.dot -o hlo_graph_80.png
+```
+
+The result will look like the following:
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="./images/jit_gpu_xla_graph.png">
+</div>
diff --git a/tensorflow/compiler/xla/g3doc/layout_with_tiling.md b/tensorflow/compiler/xla/g3doc/layout_with_tiling.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e990851af7495ebd4417e44f1d955fcc14dadf1
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/layout_with_tiling.md
@@ -0,0 +1,159 @@
+# Tiled layout
+
+*Note: This doc describes how tiled layout is intended to work. Tiling is being
+implemented, but this is an early effort and it is currently not even guaranteed
+to get an Unimplemented error if one tries to use tiling - it may be just
+silently ignored.*
+
+<center> ![](images/xla_array_layout_figure1.png)
+
+Figure 1 </center>
+
+Figure 1 shows how an array F32[3,5] is laid out in memory with 2x2 tiling. A
+shape with this layout is written as F32[3,5]{1,0:(2,2)}, where 1,0 relates to
+the physical order of dimensions (minor_to_major field in Layout) while (2,2)
+after the colon indicates tiling of the physical dimensions by a 2x2 tile.
+
+Intuitively tiles are laid out to cover the shape and then within each tile,
+elements are then laid out without tiling, as in the example above, where the
+right part of the example shows the layout in memory, including the white
+padding elements that are added in order to have complete 2x2 tiles even though
+the original array bounds are not even.
+
+The extra elements in the padding are not required to contain any particular
+value.
+
+## Linear index formulas for tiling given a shape and a tile
+
+Without tiling, an element e=(e<sub>n</sub>, e<sub>n-1</sub>, ... ,
+e<sub>1</sub>) in an array with array bounds d=(d<sub>n</sub>, d<sub>n-1</sub>,
+... , d<sub>1</sub>) (d1 is the most minor dimension) is laid out by major to
+minor order at position:
+
+&nbsp;&nbsp; linear_index(e, d) \
+= linear_index((e<sub>n</sub>, e<sub>n-1</sub>, ... , e<sub>1</sub>),
+(d<sub>n</sub>, d<sub>n-1</sub>, ... , d<sub>1</sub>)) \
+= e<sub>n</sub>d<sub>n-1</sub>...d<sub>1</sub> +
+e<sub>n-1</sub>d<sub>n-2</sub>...d<sub>1</sub> + ... + e<sub>1</sub>
+
+For simplicity of notation in this document we assume a tile has the same number
+of dimensions as the array. In XLA's implementation of tiling, this is
+generalized to tilings with fewer dimensions by leaving the initial most-major
+dimensions unchanged and applying the tiling only to the most minor dimensions,
+so that the tiling that is specified mentions a suffix of the physical
+dimensions of the shape being tiled.
+
+When tiling of size (t<sub>n</sub>, t<sub>n-1</sub>, ... , t<sub>1</sub>) is
+used, an element in the array with indices (e<sub>n</sub>, e<sub>n-1</sub>, ...
+, e<sub>1</sub>) is mapped to this position in the final layout:
+
+&nbsp;&nbsp; linear_index_with_tile(e, d, t) \
+= linear_index((⌊e/t⌋, e mod t), (⌈d/t⌉, t)) &nbsp; &nbsp; (arithmetic is
+elementwise, (a,b) is concatenation) \
+= linear_index((⌊e<sub>n</sub>/t<sub>n</sub>⌋, ... ,
+⌊e<sub>1</sub>/t<sub>1</sub>⌋, e<sub>n</sub> mod t<sub>n</sub>, ... ,
+e<sub>1</sub> mod t<sub>1</sub>), (⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... ,
+⌈d<sub>1</sub>/t<sub>1</sub>⌉, t<sub>n</sub>, t<sub>n-1</sub>, ... ,
+t<sub>1</sub>)) \
+= linear_index((⌊e<sub>n</sub>/t<sub>n</sub>⌋, ... ,
+⌊e<sub>1</sub>/t<sub>1</sub>⌋), (⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... ,
+⌈d<sub>1</sub>/t<sub>1</sub>⌉))∙t<sub>n</sub>t<sub>n-1</sub>...t<sub>1</sub> +
+linear_index((e<sub>n</sub> mod t<sub>n</sub>, ... , e<sub>1</sub> mod
+t<sub>1</sub>), (t<sub>n</sub>, t<sub>n-1</sub>, ... , t<sub>1</sub>))
+
+The layout can be thought of as having two parts:
+(⌊e<sub>n</sub>/t<sub>n</sub>⌋, ... , ⌊e<sub>1</sub>/t<sub>1</sub>⌋), which
+corresponds to a tile index in an array of tiles of size
+(⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... , ⌈d<sub>1</sub>/t<sub>1</sub>⌉), and
+(e<sub>n</sub> mod t<sub>n</sub>, ... , e<sub>1</sub> mod t<sub>1</sub>), which
+corresponds to a within-tile index. The ceil function appears in
+⌈d<sub>i</sub>/t<sub>i</sub>⌉ because if tiles overrun the bounds of the larger
+array, padding is inserted as in Figure 1. Both the tiles and elements within
+tiles are laid out recursively without tiling.
+
+For the example in Figure 1, element (2,3) has tile index (1,1), and within-tile
+index (0,1), for a combined coordinate vector of (1, 1, 0, 1). The tile indices
+have bounds (2, 3) and the tile itself is (2, 2) for a combined vector of (2, 3,
+2, 2). The linear index with tile for the element with index (2, 3) in the
+logical shape is then
+
+&nbsp;&nbsp; linear_index_with_tile((2,3), (3,5), (2,2)) \
+= linear_index((1,1,0,1), (2,3,2,2)) \
+= linear_index((1,1), (2,3)) ∙ 2 ∙ 2 + linear_index((0,1), (2,2)) \
+= (1 ∙ 3 + 1) ∙ 2 ∙ 2 + (0 ∙ 2 + 1) \
+= 17.
+
+# Tiling as pad-reshape-transpose
+
+Tiling-based layout operates as follows: \
+Consider an array of dimensions (d<sub>n</sub>, d<sub>n-1</sub>, ... , d1) (d1
+is the most minor dimension). When it’s laid out with tiling of size
+(t<sub>n</sub>, t<sub>n-1</sub>, ... , t<sub>1</sub>) (t<sub>1</sub> is the most
+minor dimension), that tiling can be described in terms of pad-reshape-transpose
+in the following way.
+
+1.  The array is padded to (⌈d<sub>n</sub>/t<sub>n</sub>⌉∙t<sub>n</sub>, ... ,
+    ⌈d<sub>1</sub>/t<sub>1</sub>⌉∙t<sub>1</sub>).
+2.  Each dimension i is broken into (⌈d<sub>i</sub>/t</sub>i</sub>⌉,
+    t<sub>i</sub>), i.e. the array is reshaped to \
+    &nbsp; &nbsp; (⌈d<sub>n</sub>/t<sub>n</sub>⌉, t<sub>n</sub>, ... ,
+    ⌈d<sub>1</sub>/t<sub>1</sub>⌉, t<sub>1</sub>). \
+    There is no physical layout change in this reshape by itself, so this
+    reshape is a bitcast. If one is not explicitly thinking of a tiling, this
+    reshape could express any shape with the same number of elements as the
+    padded shape - the example here is of how to express a tile in this way.
+3.  A transpose happens by moving t<sub>n</sub>, ... , t<sub>1</sub> to the most
+    minor dimensions while keeping their relative order, so that the order of
+    dimensions from most major to most minor becomes \
+    &nbsp; &nbsp; (⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... ,
+    ⌈d<sub>1</sub>/t<sub>1</sub>⌉, t<sub>n</sub>, ... , t<sub>1</sub>).
+
+The final shape has the prefix \
+&nbsp; &nbsp; (⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... ,
+⌈d<sub>1</sub>/t<sub>1</sub>⌉), which describes the number of tiles in each
+dimension. An element in the array (e<sub>n</sub>, ... , e<sub>1</sub>) is
+mapped to this element in the final shape: \
+&nbsp; &nbsp; (⌊e<sub>n</sub>/t<sub>n</sub>⌋, ... ,
+⌊e<sub>0</sub>/t<sub>0</sub>⌋, e<sub>n</sub> mod t<sub>n</sub>, ... ,
+e<sub>1</sub> mod t<sub>1</sub>). It is easy to see that the linear index of the
+element follows the formula above as expected.
+
+# Repeated tiling
+
+XLA's tiling becomes even more flexible by applying it repeatedly.
+
+<center> ![](images/xla_array_layout_figure2.png)
+
+Figure 2 </center>
+
+Figure 2 shows how an array of size 4x8 is tiled by two levels of tiling (first
+2x4 then 2x1). We represent this repeated tiling as (2,4)(2,1). Each color
+indicates a 2x4 tile and each red border box is a 2x1 tile. The numbers
+indicates the linear index in memory of that element in the tiled format. This
+format matches the format used for BF16 on TPU, except that the initial tile is
+bigger, namely the tiling is (8,128)(2,1), where the purpose of the second
+tiling by 2x1 is to collect together two 16 bit values to form one 32 bit value
+in a way that aligns with the architecture of a TPU.
+
+Note that a second or later tile can refer to both the minor within-tile
+dimensions, which just rearranges data within the tile, as in this example with
+(8,128)(2,1), but can also refer to the major cross-tile dimensions from the
+prior tiling.
+
+# Combining dimensions using tiles
+
+XLA's tiling also supports combining dimensions. For example, it can combine
+dimensions in F32[2,7,8,11,10]{4,3,2,1,0} into F32[112,110]{1,0} first before
+tiling it with (2,3). The tile used is (&lowast;,&lowast;,2,&lowast;,3). Here an
+asterisk in a tile implies taking that dimension and combining it with the next
+more minor dimension. Multiple adjacent dimensions can be subsumed together into
+one dimension. A subsumed dimension is represented by a tile value of -1 in that
+dimension of the tile, which is not otherwise valid in a tile as a dimension
+size.
+
+More precisely, if dimension i of the shape is eliminated via an asterisk in the
+tile, then before the prior definition of tiling is applied, that dimension is
+removed from both the shape being tiled and the tile vector, and what was
+dimension i-1 of the shape has its array bound increased from d<sub>i-1</sub> to
+d<sub>i</sub>d<sub>i-1</sub>. This step is repeated for each asterisk in the
+tile vector.
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
similarity index 88%
rename from tensorflow/docs_src/performance/xla/operation_semantics.md
rename to tensorflow/compiler/xla/g3doc/operation_semantics.md
index 96d269bec4d59bd7eb23e1964bf7d208996aabde..d888b1f23f36f33ef94ef0e22374e0c796e47a89 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -13,6 +13,22 @@ arbitrary-dimensional array. For convenience, special cases have more specific
 and familiar names; for example a *vector* is a 1-dimensional array and a
 *matrix* is a 2-dimensional array.
 
+## AfterAll
+
+See also
+[`XlaBuilder::AfterAll`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+AfterAll takes a variadic number of tokens and produces a single token. Tokens
+are primitive types which can be threaded between side-effecting operations to
+enforce ordering. `AfterAll` can be used as a join of tokens for ordering a
+operation after a set operations.
+
+<b> `AfterAll(operands)` </b>
+
+Arguments  | Type    | Semantics
+---------- | ------- | -------------------------
+`operands` | `XlaOp` | variadic number of tokens
+
 ## AllToAll
 
 See also
@@ -77,7 +93,7 @@ AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0, /*split_count=*/4);
 ```
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/xla/ops_alltoall.png">
+  <img style="width:100%" src="./images/ops_alltoall.png">
 </div>
 
 In this example, there are 4 cores participating the Alltoall. On each core, the
@@ -119,8 +135,8 @@ respect to `operand`, `offset` and `scale` across all the other dimensions. The
 `feature_index` must be a valid index for the feature dimension in `operand`.
 
 The three gradients are defined by the following formulas (assuming a
-4-dimensional tensor as `operand` and with feature dimension index \\(l\\),
-batch size `m` and spatial sizes `w` and `h`):
+4-dimensional array as `operand` and with feature dimension index $$l$$, batch
+size `m` and spatial sizes `w` and `h`):
 
 \\[ \begin{split} c_l&=
 \frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h
@@ -402,6 +418,33 @@ then v12 == f32[8x3] {{10, 11, 12},
 
 ```
 
+## CollectivePermute
+
+See also
+[`XlaBuilder::CollectivePermute`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+CollectivePermute is a collective operation that sends and receives data cross
+replicas.
+
+<b> `CollectivePermute(operand, source_target_pairs)` </b>
+
+| Arguments             | Type                    | Semantics                  |
+| --------------------- | ----------------------- | -------------------------- |
+| `operand`             | `XlaOp`                 | n dimensional input array  |
+| `source_target_pairs` | `<int64, int64>` vector | A list of                  |
+:                       :                         : (source_replica_id,        :
+:                       :                         : target_replica_id) pairs.  :
+:                       :                         : For each pair, the operand :
+:                       :                         : is sent from source        :
+:                       :                         : replica to target replica. :
+
+Note that there are the following restrictions on the `source_target_pair`:
+
+-   Any two pairs should not have the same target replica id, and they should
+    not have the same source replica id.
+-   If a replica id is not a target in any pair, then the output on that replica
+    is a tensor consists of 0(s) with the same shape as the input.
+
 ## Concatenate
 
 See also
@@ -455,7 +498,7 @@ Concat({a, b}, 0)
 
 Diagram:
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="https://www.tensorflow.org/images/ops_concatenate.png">
+  <img style="width:100%" src="./images/ops_concatenate.png">
 </div>
 
 ## Conditional
@@ -1028,7 +1071,7 @@ Arguments | Type    | Semantics
 `rhs`     | `XlaOp` | right-hand-side operand: array of type T
 
 The arguments' shapes have to be either similar or compatible. See the
-[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
+[broadcasting](broadcasting.md) documentation about what it means for shapes to
 be compatible. The result of an operation has a shape which is the result of
 broadcasting the two input arrays. In this variant, operations between arrays of
 different ranks are *not* supported, unless one of the operands is a scalar.
@@ -1056,7 +1099,7 @@ the dimensions of the higher-rank shape. The unmapped dimensions of the expanded
 shape are filled with dimensions of size one. Degenerate-dimension broadcasting
 then broadcasts the shapes along these degenerate dimensions to equalize the
 shapes of both operands. The semantics are described in detail on the
-[broadcasting page](../../performance/xla/broadcasting.md).
+[broadcasting page](broadcasting.md).
 
 ## Element-wise comparison operations
 
@@ -1079,7 +1122,7 @@ Arguments | Type    | Semantics
 `rhs`     | `XlaOp` | right-hand-side operand: array of type T
 
 The arguments' shapes have to be either similar or compatible. See the
-[broadcasting](../../performance/xla/broadcasting.md) documentation about what it means for shapes to
+[broadcasting](broadcasting.md) documentation about what it means for shapes to
 be compatible. The result of an operation has a shape which is the result of
 broadcasting the two input arrays with the element type `PRED`. In this variant,
 operations between arrays of different ranks are *not* supported, unless one of
@@ -1096,7 +1139,7 @@ matrix to a vector).
 
 The additional `broadcast_dimensions` operand is a slice of integers specifying
 the dimensions to use for broadcasting the operands. The semantics are described
-in detail on the [broadcasting page](../../performance/xla/broadcasting.md).
+in detail on the [broadcasting page](broadcasting.md).
 
 ## Element-wise unary functions
 
@@ -1152,29 +1195,32 @@ For a more intuitive description, see the "Informal Description" section below.
 
 <b> `gather(operand, start_indices, offset_dims, collapsed_slice_dims, slice_sizes, start_index_map)` </b>
 
-|Arguments         | Type                    | Semantics                       |
-|----------------- | ----------------------- | --------------------------------|
-|`operand`         | `XlaOp`                 | The array we’re gathering       |
-:                  :                         : from.                           :
-|`start_indices`   | `XlaOp`                 | Array containing the starting  |
-:                  :                         : indices of the slices we gather.:
-|`index_vector_dim` | `int64`                | The dimension in                |
-:                  :                         : `start_indices` that "contains" :
-:                  :                         : the starting indices.  See      :
-:                  :                         : below for a detailed            :
-:                  :                         : description.                    :
-|`offset_dims`     | `ArraySlice<int64>`     | The set of dimensions in  the   :
-:                  :                         : output shape that offset into a :
-:                  :                         : array sliced from operand.     :
-|`slice_sizes`     | `ArraySlice<int64>`      | `slice_sizes[i]` is the bounds |
-:                  :                          : for the slice on dimension `i`.:
-|`collapsed_slice_dims` | `ArraySlice<int64>` | The set of dimensions in each  :
-|                  :                          | slice that are collapsed away. :
-|                  :                          | These dimensions must have size:
-|                  :                          | 1.                             |
-|`start_index_map` | `ArraySlice<int64>`      | A map that describes how to map|
-:                  :                          : indices in `start_indices` to  :
-:                  :                          : to legal indices into operand. :
+| Arguments              | Type                | Semantics                     |
+| ---------------------- | ------------------- | ----------------------------- |
+| `operand`              | `XlaOp`             | The array we’re gathering     |
+:                        :                     : from.                         :
+| `start_indices`        | `XlaOp`             | Array containing the starting |
+:                        :                     : indices of the slices we      :
+:                        :                     : gather.                       :
+| `index_vector_dim`     | `int64`             | The dimension in              |
+:                        :                     : `start_indices` that          :
+:                        :                     : "contains" the starting       :
+:                        :                     : indices. See below for a      :
+:                        :                     : detailed description.         :
+| `offset_dims`          | `ArraySlice<int64>` | The set of dimensions in the  |
+:                        :                     : output shape that offset into :
+:                        :                     : a array sliced from operand.  :
+| `slice_sizes`          | `ArraySlice<int64>` | `slice_sizes[i]` is the       |
+:                        :                     : bounds for the slice on       :
+:                        :                     : dimension `i`.                :
+| `collapsed_slice_dims` | `ArraySlice<int64>` | The set of dimensions in each |
+:                        :                     : \: slice that are collapsed   :
+:                        :                     : away. These dimensions must   :
+:                        :                     : have size 1.                  :
+| `start_index_map`      | `ArraySlice<int64>` | A map that describes how to   |
+:                        :                     : map indices in                :
+:                        :                     : `start_indices` to legal      :
+:                        :                     : indices into operand.         :
 
 For convenience, we label dimensions in the output array not in `offset_dims`
 as `batch_dims`.
@@ -1269,7 +1315,7 @@ the output shape, and maps it to an element in the input array in the following
 way:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_xla_gather_0.svg">
+  <img style="width:100%" src="./images/ops_xla_gather_0.svg">
 </div>
 
 We first select an (`X`,`Y`) vector from the gather indices array using `G`.
@@ -1288,7 +1334,7 @@ version of the example above using a "gather indices" array of shape `[4,5,2]`
 would translate indices like this:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_xla_gather_1.svg">
+  <img style="width:100%" src="./images/ops_xla_gather_1.svg">
 </div>
 
 Again, this acts as a batch dynamic slice `G`<sub>`0`</sub> and
@@ -1317,7 +1363,7 @@ the following ways:
 As a final example, we use (2) and (3) to implement `tf.gather_nd`:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/ops_xla_gather_2.svg">
+  <img style="width:100%" src="./images/ops_xla_gather_2.svg">
 </div>
 
 `G`<sub>`0`</sub> and `G`<sub>`1`</sub> are used to slice out a starting index
@@ -1326,7 +1372,7 @@ element, `X`.  Similarly, there is only one output offset index with the value
 `O`<sub>`0`</sub>.  However, before being used as indices into the input array,
 these are expanded in accordance to "Gather Index Mapping" (`start_index_map` in
 the formal description) and "Offset Mapping" (`expand_offset_dims` in the formal
-description) into [`0`,`O`<sub>`0`</sub>] and [`X`,`0`] respectively, adding up
+description) into  [`X`,`0`] and [`0`,`O`<sub>`0`</sub>] respectively, adding up
 to [`X`,`O`<sub>`0`</sub>].  In other words, the output index
 [`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`O`<sub>`0`</sub>] maps to the input index
 [`GatherIndices`[`G`<sub>`0`</sub>,`G`<sub>`1`</sub>,`0`],`X`] which gives us
@@ -1336,6 +1382,22 @@ the semantics for `tf.gather_nd`.
 index `X` in the gather indices array picks an entire row and the result is the
 concatenation of all these rows.
 
+## GetDimensionSize
+
+See also
+[`XlaBuilder::GetDimensionSize`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Returns the size of the given dimension of the operand. The operand must be
+array shaped.
+
+<b> `GetDimensionSize(operand, dimension)` </b>
+
+| Arguments   | Type    | Semantics                                           |
+| ----------- | ------- | --------------------------------------------------- |
+| `operand`   | `XlaOp` | n dimensional input array                           |
+| `dimension` | `int64` | A value in the interval `[0, n)` that specifies the |
+:             :         : dimension                                           :
+
 ## GetTupleElement
 
 See also
@@ -1401,13 +1463,14 @@ Infeed of the device.
 <b> `Iota()` </b>
 
 Builds a constant literal on device rather than a potentially large host
-transfer.  Creates a rank 1 tensor of values starting at zero and incrementing
-by one.
+transfer. Creates a rank 1 array of values starting at zero and incrementing by
+one.
 
-Arguments          | Type            | Semantics
------------------- | --------------- | ---------------------------
-`type`             | `PrimitiveType` | type U
-`size`             | `int64`         | The number of elements in the tensor.
+Arguments        | Type            | Semantics
+---------------- | --------------- | ------------------------------------
+`type`           | `PrimitiveType` | type U
+`size`           | `int64`         | The number of elements in the array.
+`iota_dimension` | `int64`         | The dimension to increment along.
 
 ## Map
 
@@ -1461,20 +1524,25 @@ dimension.
 
 `PaddingConfig` is a repeated field of `PaddingConfigDimension`, which contains
 three fields for each dimension: `edge_padding_low`, `edge_padding_high`, and
-`interior_padding`. `edge_padding_low` and `edge_padding_high` specify the
-amount of padding added at the low-end (next to index 0) and the high-end (next
-to the highest index) of each dimension respectively. The amount of edge padding
-can be negative -- the absolute value of negative padding indicates the number
-of elements to remove from the specified dimension. `interior_padding` specifies
-the amount of padding added between any two elements in each dimension. Interior
-padding occurs logically before edge padding, so in the case of negative edge
-padding elements are removed from the interior-padded operand. This operation is
-a no-op if the edge padding pairs are all (0, 0) and the interior padding values
-are all 0. The figure below shows examples of different `edge_padding` and
-`interior_padding` values for a two-dimensional array.
+`interior_padding`.
+
+`edge_padding_low` and `edge_padding_high` specify the amount of padding added
+at the low-end (next to index 0) and the high-end (next to the highest index) of
+each dimension respectively. The amount of edge padding can be negative -- the
+absolute value of negative padding indicates the number of elements to remove
+from the specified dimension.
+
+`interior_padding` specifies the amount of padding added between any two
+elements in each dimension; it may not be negative.  Interior padding occurs
+logically before edge padding, so in the case of negative edge padding, elements
+are removed from the interior-padded operand.
+
+This operation is a no-op if the edge padding pairs are all (0, 0) and the
+interior padding values are all 0. The figure below shows examples of different
+`edge_padding` and `interior_padding` values for a two-dimensional array.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
+  <img style="width:100%" src="./images/ops_pad.png">
 </div>
 
 ## Recv
@@ -1590,13 +1658,13 @@ Here's an example of reducing a 2D array (matrix). The shape has rank 2,
 dimension 0 of size 2 and dimension 1 of size 3:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="https://www.tensorflow.org/images/ops_2d_matrix.png">
+  <img style="width:35%" src="./images/ops_2d_matrix.png">
 </div>
 
 Results of reducing dimensions 0 or 1 with an "add" function:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_2d_matrix.png">
+  <img style="width:35%" src="./images/ops_reduce_from_2d_matrix.png">
 </div>
 
 Note that both reduction results are 1D arrays. The diagram shows one as column
@@ -1607,7 +1675,7 @@ size 4, dimension 1 of size 2 and dimension 2 of size 3. For simplicity, the
 values 1 to 6 are replicated across dimension 0.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_from_3d_matrix.png">
+  <img style="width:35%" src="./images/ops_reduce_from_3d_matrix.png">
 </div>
 
 Similarly to the 2D example, we can reduce just one dimension. If we reduce
@@ -1640,8 +1708,8 @@ Reducing the 3D array over all its dimensions produces the scalar `84`.
 
 When `N > 1`, reduce function application is slightly more complex, as it is
 applied simultaneously to all inputs. For example, consider the following
-reduction function, which can be used to compute the max and the argmax of a
-a 1-D tensor in parallel:
+reduction function, which can be used to compute the max and the argmax of a a
+1-D array in parallel:
 
 ```
 f: (Float, Int, Float, Int) -> Float, Int
@@ -1728,6 +1796,10 @@ window_strides, padding)` </b>
 :                     :                     : dimension values                 :
 | `window_strides`    | `ArraySlice<int64>` | array of integers for window     |
 :                     :                     : stride values                    :
+| `base_dilations`    | `ArraySlice<int64>` | array of integers for base       |
+:                     :                     : dilation values                  :
+| `window_dilations`  | `ArraySlice<int64>` | array of integers for window     |
+:                     :                     : dilation values                  :
 | `padding`           | `Padding`           | padding type for window          |
 :                     :                     : (Padding\:\:kSame or             :
 :                     :                     : Padding\:\:kValid)               :
@@ -1752,15 +1824,16 @@ XlaBuilder builder(client_, "reduce_window_2x3");
 auto shape = ShapeUtil::MakeShape(F32, {4, 6});
 auto input = builder.Parameter(0, shape, "input");
 builder.ReduceWindow(
-    input, *max,
+    input,
     /*init_val=*/builder.ConstantLiteral(LiteralUtil::MinValue(F32)),
+    *max,
     /*window_dimensions=*/{2, 3},
     /*window_stride_dimensions=*/{2, 3},
     Padding::kValid);
 ```
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:35%" src="https://www.tensorflow.org/images/ops_reduce_window.png">
+  <img style="width:35%" src="./images/ops_reduce_window.png">
 </div>
 
 Stride of 1 in a dimension specifies that the position of a window in the
@@ -1772,7 +1845,7 @@ are the same as though the input came in with the dimensions it has after
 padding.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:75%" src="https://www.tensorflow.org/images/ops_reduce_window_stride.png">
+  <img style="width:75%" src="./images/ops_reduce_window_stride.png">
 </div>
 
 The evaluation order of the reduction function is arbitrary and may be
@@ -1929,44 +2002,24 @@ implementation-defined.
 ## Scatter
 
 The XLA scatter operation generates a result which is the value of the input
-tensor `operand`, with several slices (at indices specified by
-`scatter_indices`) updated with the values in `updates` using
-`update_computation`.
+array `operand`, with several slices (at indices specified by `scatter_indices`)
+updated with the values in `updates` using `update_computation`.
 
 See also
 [`XlaBuilder::Scatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `scatter(operand, scatter_indices, updates, update_computation, index_vector_dim, update_window_dims, inserted_window_dims, scatter_dims_to_operand_dims)` </b>
 
-|Arguments         | Type                   | Semantics                        |
-|------------------|------------------------|----------------------------------|
-|`operand`         | `XlaOp`                | Tensor to be scattered into.     |
-|`scatter_indices` | `XlaOp`                | Tensor containing the starting   |
-:                  :                        : indices of the slices that must  :
-:                  :                        : be scattered to.                 :
-|`updates`         | `XlaOp`                | Tensor containing the values that|
-:                  :                        : must be used for scattering.     :
-|`update_computation`| `XlaComputation`     | Computation to be used for       |
-:                  :                        : combining the existing values in :
-:                  :                        : the input tensor and the updates :
-:                  :                        : during scatter. This computation :
-:                  :                        : should be of type `T, T -> T`.   :
-|`index_vector_dim`| `int64`                | The dimension in                 |
-:                  :                        : `scatter_indices` that contains  :
-:                  :                        : the starting indices.            :
-|`update_window_dims`| `ArraySlice<int64>`  | The set of dimensions in         |
-:                  :                        : `updates` shape that are _window :
-:                  :                        : dimensions_.                     :
-|`inserted_window_dims`| `ArraySlice<int64>`| The set of _window dimensions_   |
-:                  :                        : that must be inserted into       :
-:                  :                        : `updates` shape.                 :
-|`scatter_dims_to_operand_dims`| `ArraySlice<int64>`  | A dimensions map from  |
-:                  :                        : the scatter indices to the       :
-:                  :                        : operand index space. This array  :
-:                  :                        : is interpreted as mapping `i` to :
-:                  :                        : `scatter_dims_to_operand_dims[i]`:
-:                  :                        : . It has to be one-to-one and    :
-:                  :                        : total.                           :
+Arguments                      | Type                | Semantics
+------------------------------ | ------------------- | ---------
+`operand`                      | `XlaOp`             | Array to be scattered into.
+`scatter_indices`              | `XlaOp`             | Array containing the starting indices of the slices that must be scattered to.
+`updates`                      | `XlaOp`             | Array containing the values that must be used for scattering.
+`update_computation`           | `XlaComputation`    | Computation to be used for combining the existing values in the input array and the updates during scatter. This computation should be of type `T, T -> T`.
+`index_vector_dim`             | `int64`             | The dimension in `scatter_indices` that contains the starting indices.
+`update_window_dims`           | `ArraySlice<int64>` | The set of dimensions in `updates` shape that are _window dimensions_.
+`inserted_window_dims`         | `ArraySlice<int64>` | The set of _window dimensions_ that must be inserted into `updates` shape.
+`scatter_dims_to_operand_dims` | `ArraySlice<int64>` | A dimensions map from the scatter indices to the operand index space. This array is interpreted as mapping `i` to `scatter_dims_to_operand_dims[i]` . It has to be one-to-one and total.
 
 If `index_vector_dim` is equal to `scatter_indices.rank` we implicitly consider
 `scatter_indices` to have a trailing `1` dimension.
@@ -1977,78 +2030,77 @@ order.
 
 The arguments of scatter should follow these constraints:
 
-  - `updates` tensor must be of rank `update_window_dims.size +
-  scatter_indices.rank - 1`.
+-   `updates` array must be of rank `update_window_dims.size +
+    scatter_indices.rank - 1`.
+
+-   Bounds of dimension `i` in `updates` must conform to the following:
 
-  - Bounds of dimension `i` in `updates` must conform to the following:
-      - If `i` is present in `update_window_dims` (i.e. equal to
-        `update_window_dims`[`k`] for some `k`), then the bound of dimension
-        `i` in `updates` must not exceed the corresponding bound of `operand`
-        after accounting for the `inserted_window_dims` (i.e.
+    -   If `i` is present in `update_window_dims` (i.e. equal to
+        `update_window_dims`[`k`] for some `k`), then the bound of dimension `i`
+        in `updates` must not exceed the corresponding bound of `operand` after
+        accounting for the `inserted_window_dims` (i.e.
         `adjusted_window_bounds`[`k`], where `adjusted_window_bounds` contains
         the bounds of `operand` with the bounds at indices
         `inserted_window_dims` removed).
-      - If `i` is present in `update_scatter_dims` (i.e. equal to
+    -   If `i` is present in `update_scatter_dims` (i.e. equal to
         `update_scatter_dims`[`k`] for some `k`), then the bound of dimension
         `i` in `updates` must be equal to the corresponding bound of
         `scatter_indices`, skipping `index_vector_dim` (i.e.
         `scatter_indices.shape.dims`[`k`], if `k` < `index_vector_dim` and
         `scatter_indices.shape.dims`[`k+1`] otherwise).
 
-  - `update_window_dims` must be in ascending order, not have any repeating
+-   `update_window_dims` must be in ascending order, not have any repeating
     dimension numbers, and be in the range `[0, updates.rank)`.
 
-  - `inserted_window_dims` must be in ascending order, not have any
-    repeating dimension numbers, and be in the range `[0, operand.rank)`.
+-   `inserted_window_dims` must be in ascending order, not have any repeating
+    dimension numbers, and be in the range `[0, operand.rank)`.
 
-  - `scatter_dims_to_operand_dims.size` must be equal to
+-   `scatter_dims_to_operand_dims.size` must be equal to
     `scatter_indices`[`index_vector_dim`], and its values must be in the range
     `[0, operand.rank)`.
 
-For a given index `U` in the `updates` tensor, the corresponding index `I` in
-the `operand` tensor into which this update has to be applied is computed as
-follows:
-
-  1. Let `G` = { `U`[`k`] for `k` in `update_scatter_dims` }. Use `G` to look up
-     an index vector `S` in the `scatter_indices` tensor such that `S`[`i`] =
-     `scatter_indices`[Combine(`G`, `i`)] where Combine(A, b) inserts b at
-     positions `index_vector_dim` into A.
-  2. Create an index `S`<sub>`in`</sub> into `operand` using `S` by scattering
-     `S` using the `scatter_dims_to_operand_dims` map. More formally:
-       1. `S`<sub>`in`</sub>[`scatter_dims_to_operand_dims`[`k`]] = `S`[`k`] if
-          `k` < `scatter_dims_to_operand_dims.size`.
-       2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
-  3. Create an index `W`<sub>`in`</sub> into `operand` by scattering the indices
-     at `update_window_dims` in `U` according to `inserted_window_dims`.
-     More formally:
-       1. `W`<sub>`in`</sub>[`window_dims_to_operand_dims`(`k`)] = `U`[`k`] if
-          `k` < `update_window_dims.size`, where `window_dims_to_operand_dims`
-          is the monotonic function with domain [`0`, `update_window_dims.size`)
-          and range [`0`, `operand.rank`) \\ `inserted_window_dims`. (For
-          example, if `update_window_dims.size` is `4`, `operand.rank` is `6`,
-          and `inserted_window_dims` is {`0`, `2`} then
-          `window_dims_to_operand_dims` is {`0`→`1`, `1`→`3`, `2`→`4`,
-          `3`→`5`}).
-       2. `W`<sub>`in`</sub>[`_`] = `0` otherwise.
-  4. `I` is `W`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
-     addition.
+For a given index `U` in the `updates` array, the corresponding index `I` in the
+`operand` array into which this update has to be applied is computed as follows:
+
+1.  Let `G` = { `U`[`k`] for `k` in `update_scatter_dims` }. Use `G` to look up
+    an index vector `S` in the `scatter_indices` array such that `S`[`i`] =
+    `scatter_indices`[Combine(`G`, `i`)] where Combine(A, b) inserts b at
+    positions `index_vector_dim` into A.
+2.  Create an index `S`<sub>`in`</sub> into `operand` using `S` by scattering
+    `S` using the `scatter_dims_to_operand_dims` map. More formally:
+    1.  `S`<sub>`in`</sub>[`scatter_dims_to_operand_dims`[`k`]] = `S`[`k`] if
+        `k` < `scatter_dims_to_operand_dims.size`.
+    2.  `S`<sub>`in`</sub>[`_`] = `0` otherwise.
+3.  Create an index `W`<sub>`in`</sub> into `operand` by scattering the indices
+    at `update_window_dims` in `U` according to `inserted_window_dims`. More
+    formally:
+    1.  `W`<sub>`in`</sub>[`window_dims_to_operand_dims`(`k`)] = `U`[`k`] if `k`
+        < `update_window_dims.size`, where `window_dims_to_operand_dims` is the
+        monotonic function with domain [`0`, `update_window_dims.size`) and
+        range [`0`, `operand.rank`) \\ `inserted_window_dims`. (For example, if
+        `update_window_dims.size` is `4`, `operand.rank` is `6`, and
+        `inserted_window_dims` is {`0`, `2`} then `window_dims_to_operand_dims`
+        is {`0`→`1`, `1`→`3`, `2`→`4`, `3`→`5`}).
+    2.  `W`<sub>`in`</sub>[`_`] = `0` otherwise.
+4.  `I` is `W`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
+    addition.
 
 In summary, the scatter operation can be defined as follows.
 
-   - Initialize `output` with `operand`, i.e. for all indices `O` in the
-     `operand` tensor:\
-       `output`[`O`] = `operand`[`O`]
-   - For every index `U` in the `updates` tensor and the corresponding index `O`
-     in the `operand` tensor:\
-       `output`[`O`] = `update_computation`(`output`[`O`], `updates`[`U`])
+-   Initialize `output` with `operand`, i.e. for all indices `O` in the
+    `operand` array: \
+    `output`[`O`] = `operand`[`O`]
+-   For every index `U` in the `updates` array and the corresponding index `O`
+    in the `operand` array: \
+    `output`[`O`] = `update_computation`(`output`[`O`], `updates`[`U`])
 
 The order in which updates are applied is non-deterministic. So, when multiple
 indices in `updates` refer to the same index in `operand`, the corresponding
 value in `output` will be non-deterministic.
 
 Note that the first parameter that is passed into the `update_computation` will
-always be the current value from the `output` tensor and the second parameter
-will always be the value from the `updates` tensor. This is important
+always be the current value from the `output` array and the second parameter
+will always be the value from the `updates` array. This is important
 specifically for cases when the `update_computation` is _not commutative_.
 
 Informally, the scatter op can be viewed as an _inverse_ of the gather op, i.e.
@@ -2080,10 +2132,9 @@ shape of the output array. The array `pred` must have the same dimensionality as
 
 For each element `P` of `pred`, the corresponding element of the output array is
 taken from `on_true` if the value of `P` is `true`, and from `on_false` if the
-value of `P` is `false`. As a restricted form of [broadcasting]
-(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the
-output array is taken wholly from `on_true` if `pred` is `true`, and from
-`on_false` if `pred` is `false`.
+value of `P` is `false`. As a restricted form of [broadcasting](broadcasting.md),
+`pred` can be a scalar of type `PRED`. In this case, the output array is taken
+wholly from `on_true` if `pred` is `true`, and from `on_false` if `pred` is `false`.
 
 Example with non-scalar `pred`:
 
@@ -2181,7 +2232,7 @@ addition `scatter` function produces the output element of value 8 (2 + 6).
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
   <img style="width:100%"
-    src="https://www.tensorflow.org/images/ops_scatter_to_selected_window_element.png">
+    src="./images/ops_scatter_to_selected_window_element.png">
 </div>
 
 The evaluation order of the `scatter` function is arbitrary and may be
@@ -2228,7 +2279,7 @@ The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`,
 `Send`, `SendDone`) is as below.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:70%" src="../../images/send_recv_order.png">
+  <img style="width:70%" src="./images/send_recv_order.png">
 </div>
 
 * `Recv` happens before `Send`
@@ -2241,7 +2292,7 @@ communicates via channel instructions, there must not be cycles across the
 computations. For example, below schedules lead to deadlocks.
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="../../images/send_recv_schedule.png">
+  <img style="width:100%" src="./images/send_recv_schedule.png">
 </div>
 
 ## Slice
@@ -2299,9 +2350,9 @@ See also
 [`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 There are two versions of the Sort instruction: a single-operand and a
-two-operand version.
+multi-operand version.
 
-<b>`Sort(operand)`</b>
+<b>`Sort(operand, dimension)`</b>
 
 Arguments   | Type    | Semantics
 ----------- | ------- | --------------------
@@ -2315,25 +2366,26 @@ row independently. If the operand's elements have floating point type, and the
 operand contains NaN elements, the order of elements in the output is
 implementation-defined.
 
-<b>`Sort(key, value)`</b>
+<b>`Sort(keys, values, ... values, dimension)`</b>
 
-Sorts both the key and the value operands. The keys are sorted as in the
-single-operand version. The values are sorted according to the order of their
-corresponding keys. For example, if the inputs are `keys = [3, 1]` and
-`values = [42, 50]`, then the output of the sort is the tuple 
-`{[1, 3], [50, 42]}`.
+Sorts both the key and one or more value operands. The keys are sorted as in the
+single-operand version. Each of the values inputs is sorted according to the
+order of the corresponding keys. For example, if the three inputs are `keys =
+[3, 1]`, `values0 = [42, 50]`, `values1 = [-3.0, 1.1]`, then the output of the
+sort is the tuple `{[1, 3], [50, 42], [1.1, -3.0]}`.
 
 The sort is not guaranteed to be stable, that is, if the keys array contains
-duplicates, the order of their corresponding values may not be preserved.
+duplicates, the order of values corresponding to these keys may not be
+preserved.
 
-Arguments   | Type    | Semantics
------------ | ------- | -------------------
-`keys`      | `XlaOp` | The sort keys.
-`values`    | `XlaOp` | The values to sort.
-`dimension` | `int64` | The dimension along which to sort.
+Arguments   | Type                   | Semantics
+----------- | ---------------------- | ----------------------------------
+`keys`      | `XlaOp`                | The sort keys.
+`values`    | Sequence of N `XlaOp`s | The values to sort.
+`dimension` | `int64`                | The dimension along which to sort.
 
-The `keys` and `values` must have the same dimensions, but may have different
-element types.
+The `keys` and each of the `values` inputs must have the same dimensions, but
+may have different element types.
 
 ## Transpose
 
@@ -2422,5 +2474,5 @@ while (result(0) < 1000) {
 ```
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="https://www.tensorflow.org/images/ops_while.png">
+  <img style="width:100%" src="./images/ops_while.png">
 </div>
diff --git a/tensorflow/compiler/xla/g3doc/overview.md b/tensorflow/compiler/xla/g3doc/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..d3428b7276131e8f406f60cfea9a9346c5478433
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/overview.md
@@ -0,0 +1,98 @@
+# XLA Overview
+
+<div style="width:50%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:50%" src="./images/xlalogo.png">
+</div>
+
+> Note: XLA is still under development.  Some use cases will not
+> see improvements in speed or decreased memory usage.
+
+XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear
+algebra that optimizes TensorFlow computations. The results are improvements in
+speed, memory usage, and portability on server and mobile platforms. Initially,
+most users will not see large benefits from XLA, but are welcome to experiment
+by using XLA via [just-in-time (JIT) compilation](./jit.md) or
+[ahead-of-time (AOT) compilation](./tfcompile.md). Developers targeting new
+hardware accelerators are especially encouraged to try out XLA.
+
+The XLA framework is experimental and in active development. In particular,
+while it is unlikely that the semantics of existing operations will change, it
+is expected that more operations will be added to cover important use cases. The
+team welcomes feedback from the community about missing functionality and
+community contributions via GitHub.
+
+## Why did we build XLA?
+
+We had several objectives for XLA to work with TensorFlow:
+
+*   *Improve execution speed.* Compile subgraphs to reduce the execution time of
+    short-lived Ops to eliminate overhead from the TensorFlow runtime, fuse
+    pipelined operations to reduce memory overhead, and specialize to known
+    tensor shapes to allow for more aggressive constant propagation.
+
+*   *Improve memory usage.* Analyze and schedule memory usage, in principle
+    eliminating many intermediate storage buffers.
+
+*   *Reduce reliance on custom Ops.* Remove the need for many custom Ops by
+    improving the performance of automatically fused low-level Ops to match the
+    performance of custom Ops that were fused by hand.
+
+*   *Reduce mobile footprint.* Eliminate the TensorFlow runtime by ahead-of-time
+    compiling the subgraph and emitting an object/header file pair that can be
+    linked directly into another application. The results can reduce the
+    footprint for mobile inference by several orders of magnitude.
+
+*   *Improve portability.* Make it relatively easy to write a new backend for
+    novel hardware, at which point a large fraction of TensorFlow programs will
+    run unmodified on that hardware. This is in contrast with the approach of
+    specializing individual monolithic Ops for new hardware, which requires
+    TensorFlow programs to be rewritten to make use of those Ops.
+
+## How does XLA work?
+
+The input language to XLA is called "HLO IR", or just HLO (High Level
+Optimizer). The semantics of HLO are described on the
+[Operation Semantics](./operation_semantics.md) page. It
+is most convenient to think of HLO as a
+[compiler IR](https://en.wikipedia.org/wiki/Intermediate_representation).
+
+XLA takes graphs ("computations") defined in HLO and compiles them into machine
+instructions for various architectures. XLA is modular in the sense that it is
+easy to slot in an alternative backend to
+[target some novel HW architecture](./developing_new_backend.md).
+The CPU backend for x64 and ARM64 as well as the NVIDIA GPU backend are in the
+TensorFlow source tree.
+
+The following diagram shows the compilation process in XLA:
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img src="./images/how-does-xla-work.png">
+</div>
+
+XLA comes with several optimizations and analysis passes that are
+target-independent, such as
+[CSE](https://en.wikipedia.org/wiki/Common_subexpression_elimination),
+target-independent operation fusion, and buffer analysis for allocating runtime
+memory for the computation.
+
+After the target-independent step, XLA sends the HLO computation to a backend.
+The backend can perform further HLO-level optimizations, this time with target
+specific information and needs in mind. For example, the XLA GPU backend may
+perform operation fusion beneficial specifically for the GPU programming model
+and determine how to partition the computation into streams. At this stage,
+backends may also pattern-match certain operations or combinations thereof to
+optimized library calls.
+
+The next step is target-specific code generation. The CPU and GPU backends
+included with XLA use [LLVM](http://llvm.org) for low-level IR, optimization,
+and code-generation. These backends emit the LLVM IR necessary to represent the
+XLA HLO computation in an efficient manner, and then invoke LLVM to emit native
+code from this LLVM IR.
+
+The GPU backend currently supports NVIDIA GPUs via the LLVM NVPTX backend; the
+CPU backend supports multiple CPU ISAs.
+
+## Supported Platforms
+
+XLA currently supports [JIT compilation](./jit.md) on x86-64 and NVIDIA GPUs; and
+[AOT compilation](./tfcompile.md) for x86-64 and ARM.
diff --git a/tensorflow/compiler/xla/g3doc/shapes.md b/tensorflow/compiler/xla/g3doc/shapes.md
new file mode 100644
index 0000000000000000000000000000000000000000..39e74ff307cde49ef378a1201cb074dce4ababf0
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/shapes.md
@@ -0,0 +1,150 @@
+# Shapes and Layout
+
+The XLA `Shape` proto
+([xla_data.proto](https://www.tensorflow.org/code/tensorflow/compiler/xla/xla_data.proto))
+describes the rank, size, and data type of an N-dimensional array (*array* in
+short).
+
+## Terminology, Notation, and Conventions
+
+*   The rank of an array is equal to the number of dimensions. The *true rank*
+    of an array is the number of dimensions which have a size greater than 1.
+
+*   Dimensions are numbered from `0` up to `N-1` for an `N` dimensional array.
+    The dimension numbers are arbitrary labels for convenience. The order of
+    these dimension numbers does not imply a particular minor/major ordering in
+    the layout of the shape. The layout is determined by the `Layout` proto.
+
+*   By convention, dimensions are listed in increasing order of dimension
+    number. For example, for a 3-dimensional array of size `[A x B x C]`,
+    dimension 0 has size `A`, dimension 1 has size `B` and dimension 2 has size
+    `C`.
+
+    Some utilities in XLA also support negative indexing, similarly to Python;
+    dimension -1 is the last dimension (equivalent to `N-1` for an `N`
+    dimensional array). For example, for the 3-dimensional array described
+    above, dimension -1 has size `C`, dimension -2 has size `B` and so on.
+
+*   Two, three, and four dimensional arrays often have specific letters
+    associated with dimensions. For example, for a 2D array:
+
+    *   dimension 0: `y`
+    *   dimension 1: `x`
+
+    For a 3D array:
+
+    *   dimension 0: `z`
+    *   dimension 1: `y`
+    *   dimension 2: `x`
+
+    For a 4D array:
+
+    *   dimension 0: `p`
+    *   dimension 1: `z`
+    *   dimension 2: `y`
+    *   dimension 3: `x`
+
+*   Functions in the XLA API which take dimensions do so in increasing order of
+    dimension number. This matches the ordering used when passing dimensions as
+    an `initializer_list`; e.g.
+
+    `ShapeUtil::MakeShape(F32, {A, B, C, D})`
+
+    Will create a shape whose dimension size array consists of the sequence
+    `[A, B, C, D]`.
+
+## Layout
+
+The `Layout` proto describes how an array is represented in memory. The `Layout`
+proto includes the following fields:
+
+```
+message Layout {
+  repeated int64 minor_to_major = 1;
+  repeated int64 padded_dimensions = 2;
+  optional PaddingValue padding_value = 3;
+}
+```
+
+### Minor-to-major dimension ordering
+
+The only required field is `minor_to_major`. This field describes the
+minor-to-major ordering of the dimensions within a shape. Values in
+`minor_to_major` are an ordering of the dimensions of the array (`0` to `N-1`
+for an `N` dimensional array) with the first value being the most-minor
+dimension up to the last value which is the most-major dimension. The most-minor
+dimension is the dimension which changes most rapidly when stepping through the
+elements of the array laid out in linear memory.
+
+For example, consider the following 2D array of size `[2 x 3]`:
+
+```
+a b c
+d e f
+```
+
+Here dimension `0` is size 2, and dimension `1` is size 3. If the
+`minor_to_major` field in the layout is `[0, 1]` then dimension `0` is the
+most-minor dimension and dimension `1` is the most-major dimension. This
+corresponds to the following layout in linear memory:
+
+```
+a d b e c f
+```
+
+This minor-to-major dimension order of `0` up to `N-1` is akin to *column-major*
+(at rank 2). Assuming a monotonic ordering of dimensions, another name we may
+use to refer to this layout in the code is simply "dim 0 is minor".
+
+On the other hand, if the `minor_to_major` field in the layout is `[1, 0]` then
+the layout in linear memory is:
+
+```
+a b c d e f
+```
+
+A minor-to-major dimension order of `N-1` down to `0` for an `N` dimensional
+array is akin to *row-major* (at rank 2). Assuming a monotonic ordering of
+dimensions, another name we may use to refer to this layout in the code is
+simply "dim 0 is major".
+
+#### Default minor-to-major ordering
+
+The default layout for newly created Shapes is "dimension order is
+major-to-minor" (akin to row-major at rank 2).
+
+### Padding
+
+Padding is defined in the optional `padded_dimensions` and `padding_value`
+fields. The field `padded_dimensions` describes the sizes (widths) to which each
+dimension is padded. If present, the number of elements in `padded_dimensions`
+must equal the rank of the shape.
+
+For example, given the `[2 x 3]` array defined above, if `padded_dimension` is
+`[3, 5]` then dimension 0 is padded to a width of 3 and dimension 1 is padded to
+a width of 5. The layout in linear memory (assuming a padding value of 0 and
+column-major layout) is:
+
+```
+a d 0 b e 0 c f 0 0 0 0 0 0 0
+```
+
+This is equivalent to the layout of the following array with the same
+minor-to-major dimension order:
+
+```
+a b c 0 0
+d e f 0 0
+0 0 0 0 0
+```
+
+### Indexing into arrays
+
+The class `IndexUtil` in
+[index_util.h](https://www.tensorflow.org/code/tensorflow/compiler/xla/index_util.h)
+provides utilities for converting between multidimensional indices and linear
+indices given a shape and layout. Multidimensional indices include a `int64`
+index for each dimension. Linear indices are a single `int64` value which
+indexes into the buffer holding the array. See `shape_util.h` and
+`layout_util.h` in the same directory for utilities that simplify creation and
+manipulation of shapes and layouts.
diff --git a/tensorflow/compiler/xla/g3doc/tfcompile.md b/tensorflow/compiler/xla/g3doc/tfcompile.md
new file mode 100644
index 0000000000000000000000000000000000000000..5ee09fd302ba0edf84a7c99bb369586067141bef
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/tfcompile.md
@@ -0,0 +1,281 @@
+# Using AOT compilation
+
+## What is tfcompile?
+
+`tfcompile` is a standalone tool that ahead-of-time (AOT) compiles TensorFlow
+graphs into executable code. It can reduce total binary size, and also avoid
+some runtime overheads. A typical use-case of `tfcompile` is to compile an
+inference graph into executable code for mobile devices.
+
+The TensorFlow graph is normally executed by the TensorFlow runtime. This incurs
+some runtime overhead for execution of each node in the graph. This also leads
+to a larger total binary size, since the code for the TensorFlow runtime needs
+to be available, in addition to the graph itself. The executable code produced
+by `tfcompile` does not use the TensorFlow runtime, and only has dependencies on
+kernels that are actually used in the computation.
+
+The compiler is built on top of the XLA framework. The code bridging TensorFlow
+to the XLA framework resides under
+[tensorflow/compiler](https://www.tensorflow.org/code/tensorflow/compiler/),
+which also includes support for [just-in-time (JIT) compilation](jit.md) of
+TensorFlow graphs.
+
+## What does tfcompile do?
+
+`tfcompile` takes a subgraph, identified by the TensorFlow concepts of
+feeds and fetches, and generates a function that implements that subgraph.
+The `feeds` are the input arguments for the function, and the `fetches` are the
+output arguments for the function. All inputs must be fully specified by the
+feeds; the resulting pruned subgraph cannot contain Placeholder or Variable
+nodes. It is common to specify all Placeholders and Variables as feeds, which
+ensures the resulting subgraph no longer contains these nodes. The generated
+function is packaged as a `cc_library`, with a header file exporting the
+function signature, and an object file containing the implementation. The user
+writes code to invoke the generated function as appropriate.
+
+## Using tfcompile
+
+This section details high level steps for generating an executable binary with
+`tfcompile` from a TensorFlow subgraph. The steps are:
+
+*   Step 1: Configure the subgraph to compile
+*   Step 2: Use the `tf_library` build macro to compile the subgraph
+*   Step 3: Write code to invoke the subgraph
+*   Step 4: Create the final binary
+
+### Step 1: Configure the subgraph to compile
+
+Identify the feeds and fetches that correspond to the input and output
+arguments for the generated function. Then configure the `feeds` and `fetches`
+in a [`tensorflow.tf2xla.Config`](https://www.tensorflow.org/code/tensorflow/compiler/tf2xla/tf2xla.proto)
+proto.
+
+```textproto
+# Each feed is a positional input argument for the generated function.  The order
+# of each entry matches the order of each input argument.  Here “x_hold” and “y_hold”
+# refer to the names of placeholder nodes defined in the graph.
+feed {
+  id { node_name: "x_hold" }
+  shape {
+    dim { size: 2 }
+    dim { size: 3 }
+  }
+}
+feed {
+  id { node_name: "y_hold" }
+  shape {
+    dim { size: 3 }
+    dim { size: 2 }
+  }
+}
+
+# Each fetch is a positional output argument for the generated function.  The order
+# of each entry matches the order of each output argument.  Here “x_y_prod”
+# refers to the name of a matmul node defined in the graph.
+fetch {
+  id { node_name: "x_y_prod" }
+}
+```
+
+### Step 2: Use tf_library build macro to compile the subgraph
+
+This step converts the graph into a `cc_library` using the `tf_library` build
+macro. The `cc_library` consists of an object file containing the code generated
+from the graph, along with a header file that gives access to the generated
+code. `tf_library` utilizes `tfcompile` to compile the TensorFlow graph into
+executable code.
+
+```build
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+
+# Use the tf_library macro to compile your graph into executable code.
+tf_library(
+    # name is used to generate the following underlying build rules:
+    # <name>           : cc_library packaging the generated header and object files
+    # <name>_test      : cc_test containing a simple test and benchmark
+    # <name>_benchmark : cc_binary containing a stand-alone benchmark with minimal deps;
+    #                    can be run on a mobile device
+    name = "test_graph_tfmatmul",
+    # cpp_class specifies the name of the generated C++ class, with namespaces allowed.
+    # The class will be generated in the given namespace(s), or if no namespaces are
+    # given, within the global namespace.
+    cpp_class = "foo::bar::MatMulComp",
+    # graph is the input GraphDef proto, by default expected in binary format.  To
+    # use the text format instead, just use the ‘.pbtxt’ suffix.  A subgraph will be
+    # created from this input graph, with feeds as inputs and fetches as outputs.
+    # No Placeholder or Variable ops may exist in this subgraph.
+    graph = "test_graph_tfmatmul.pb",
+    # config is the input Config proto, by default expected in binary format.  To
+    # use the text format instead, use the ‘.pbtxt’ suffix.  This is where the
+    # feeds and fetches were specified above, in the previous step.
+    config = "test_graph_tfmatmul.config.pbtxt",
+)
+```
+
+> To generate the GraphDef proto (test_graph_tfmatmul.pb) for this example, run
+> [make_test_graphs.py](https://www.tensorflow.org/code/tensorflow/compiler/aot/tests/make_test_graphs.py)
+> and specify the output location with the --out_dir flag.
+
+Typical graphs contain [`Variables`](https://www.tensorflow.org/guide/variables)
+representing the weights that are learned via training, but `tfcompile` cannot
+compile a subgraph that contain `Variables`. The
+[freeze_graph.py](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py)
+tool converts variables into constants, using values stored in a checkpoint
+file. As a convenience, the `tf_library` macro supports the `freeze_checkpoint`
+argument, which runs the tool. For more examples see
+[tensorflow/compiler/aot/tests/BUILD](https://www.tensorflow.org/code/tensorflow/compiler/aot/tests/BUILD).
+
+> Constants that show up in the compiled subgraph are compiled directly into the
+> generated code. To pass the constants into the generated function, rather than
+> having them compiled-in, simply pass them in as feeds.
+
+For details on the `tf_library` build macro, see
+[tfcompile.bzl](https://www.tensorflow.org/code/tensorflow/compiler/aot/tfcompile.bzl).
+
+For details on the underlying `tfcompile` tool, see
+[tfcompile_main.cc](https://www.tensorflow.org/code/tensorflow/compiler/aot/tfcompile_main.cc).
+
+### Step 3: Write code to invoke the subgraph
+
+This step uses the header file (`test_graph_tfmatmul.h`) generated by the
+`tf_library` build macro in the previous step to invoke the generated code. The
+header file is located in the `bazel-genfiles` directory corresponding to the
+build package, and is named based on the name attribute set on the `tf_library`
+build macro. For example, the header generated for `test_graph_tfmatmul` would
+be `test_graph_tfmatmul.h`. Below is an abbreviated version of what is
+generated. The generated file, in `bazel-genfiles`, contains additional useful
+comments.
+
+```c++
+namespace foo {
+namespace bar {
+
+// MatMulComp represents a computation previously specified in a
+// TensorFlow graph, now compiled into executable code.
+class MatMulComp {
+ public:
+  // AllocMode controls the buffer allocation mode.
+  enum class AllocMode {
+    ARGS_RESULTS_AND_TEMPS,  // Allocate arg, result and temp buffers
+    RESULTS_AND_TEMPS_ONLY,  // Only allocate result and temp buffers
+  };
+
+  MatMulComp(AllocMode mode = AllocMode::ARGS_RESULTS_AND_TEMPS);
+  ~MatMulComp();
+
+  // Runs the computation, with inputs read from arg buffers, and outputs
+  // written to result buffers. Returns true on success and false on failure.
+  bool Run();
+
+  // Arg methods for managing input buffers. Buffers are in row-major order.
+  // There is a set of methods for each positional argument.
+  void** args();
+
+  void set_arg0_data(float* data);
+  float* arg0_data();
+  float& arg0(size_t dim0, size_t dim1);
+
+  void set_arg1_data(float* data);
+  float* arg1_data();
+  float& arg1(size_t dim0, size_t dim1);
+
+  // Result methods for managing output buffers. Buffers are in row-major order.
+  // Must only be called after a successful Run call. There is a set of methods
+  // for each positional result.
+  void** results();
+
+
+  float* result0_data();
+  float& result0(size_t dim0, size_t dim1);
+};
+
+}  // end namespace bar
+}  // end namespace foo
+```
+
+The generated C++ class is called `MatMulComp` in the `foo::bar` namespace,
+because that was the `cpp_class` specified in the `tf_library` macro. All
+generated classes have a similar API, with the only difference being the methods
+to handle arg and result buffers. Those methods differ based on the number and
+types of the buffers, which were specified by the `feed` and `fetch` arguments
+to the `tf_library` macro.
+
+There are three types of buffers managed within the generated class: `args`
+representing the inputs, `results` representing the outputs, and `temps`
+representing temporary buffers used internally to perform the computation. By
+default, each instance of the generated class allocates and manages all of these
+buffers for you. The `AllocMode` constructor argument may be used to change this
+behavior. All buffers are aligned to 64-byte boundaries.
+
+The generated C++ class is just a wrapper around the low-level code generated by
+XLA.
+
+Example of invoking the generated function based on
+[`tfcompile_test.cc`](https://www.tensorflow.org/code/tensorflow/compiler/aot/tests/tfcompile_test.cc):
+
+```c++
+#define EIGEN_USE_THREADS
+#define EIGEN_USE_CUSTOM_THREAD_POOL
+
+#include <iostream>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h" // generated
+
+int main(int argc, char** argv) {
+  Eigen::ThreadPool tp(2);  // Size the thread pool as appropriate.
+  Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
+
+
+  foo::bar::MatMulComp matmul;
+  matmul.set_thread_pool(&device);
+
+  // Set up args and run the computation.
+  const float args[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  std::copy(args + 0, args + 6, matmul.arg0_data());
+  std::copy(args + 6, args + 12, matmul.arg1_data());
+  matmul.Run();
+
+  // Check result
+  if (matmul.result0(0, 0) == 58) {
+    std::cout << "Success" << std::endl;
+  } else {
+    std::cout << "Failed. Expected value 58 at 0,0. Got:"
+              << matmul.result0(0, 0) << std::endl;
+  }
+
+  return 0;
+}
+```
+
+### Step 4: Create the final binary
+
+This step combines the library generated by `tf_library` in step 2 and the code
+written in step 3 to create a final binary. Below is an example `bazel` BUILD
+file.
+
+```build
+# Example of linking your binary
+# Also see //tensorflow/compiler/aot/tests/BUILD
+load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
+
+# The same tf_library call from step 2 above.
+tf_library(
+    name = "test_graph_tfmatmul",
+    ...
+)
+
+# The executable code generated by tf_library can then be linked into your code.
+cc_binary(
+    name = "my_binary",
+    srcs = [
+        "my_code.cc",  # include test_graph_tfmatmul.h to access the generated header
+    ],
+    deps = [
+        ":test_graph_tfmatmul",  # link in the generated object file
+        "//third_party/eigen3",
+    ],
+    linkopts = [
+          "-lpthread",
+    ]
+)
+```
diff --git a/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..2a83092805be5efdd7b9ab54449b2bcc6a2ec481
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb
@@ -0,0 +1,373 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "The XLA compile API",
+      "version": "0.3.2",
+      "provenance": [],
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
+  "cells": [
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "f4TSNCvpENrW"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "##### Copyright 2018 The TensorFlow Authors."
+      ]
+    },
+    {
+      "metadata": {
+        "cellView": "form",
+        "colab_type": "code",
+        "id": "vamNSA0vEP-m",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "#\n",
+        "# https://www.apache.org/licenses/LICENSE-2.0\n",
+        "#\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "e1oSi4lHFt3z"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# The XLA compile API"
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "b7noD9NjFRL-"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/xla/tutorials/xla_compile\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+        "  </td>\n",
+        "</table>"
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "v9YbsuLZaBXy"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "\n",
+        "\n",
+        "Import TensorFlow and the XLA library. XLA contains `xla.compile()`, an experimental API that compiles part or all of a model with [XLA](https://www.tensorflow.org/extend/xla/)."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "45kUPj5ZFrRa",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "import tensorflow as tf\n",
+        "\n",
+        "from tensorflow.contrib.compiler import xla"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "GZVNiRmTDV-5"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Define some necessary constants and prepare the MNIST dataset."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "f37TSEGvGX4_",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Size of each input image, 28 x 28 pixels\n",
+        "IMAGE_SIZE = 28 * 28\n",
+        "# Number of distinct number labels, [0..9]\n",
+        "NUM_CLASSES = 10\n",
+        "# Number of examples in each training batch (step)\n",
+        "TRAIN_BATCH_SIZE = 100\n",
+        "# Number of training steps to run\n",
+        "TRAIN_STEPS = 1000"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "TiVXchblG5hK",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Loads MNIST dataset.\n",
+        "train, test = tf.keras.datasets.mnist.load_data()\n",
+        "train_ds = tf.data.Dataset.from_tensor_slices(train).batch(TRAIN_BATCH_SIZE).repeat()\n",
+        "test_ds = tf.data.Dataset.from_tensor_slices(test).batch(TRAIN_BATCH_SIZE)\n",
+        "\n",
+        "iterator = tf.data.Iterator.from_structure(train_ds.output_types, train_ds.output_shapes)\n",
+        "images, labels = iterator.get_next()\n",
+        "images = tf.reshape(images, [-1, IMAGE_SIZE])\n",
+        "images, labels = tf.cast(images, tf.float32), tf.cast(labels, tf.int64)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "x_ZehpZP-SfS"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Define the model constructing function\n",
+        "\n",
+        "Following code block contains a function that constructs a simple model with one dense layer, including both forward and backward propagation.\n",
+        "\n",
+        "When called, it returns two values. `y` is a `tf.Tensor` representing predicted probability of each target class, `train_step` is a `tf.Operation` that increments `global_step` and applies variable update."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "ZbhJl_WvGa3g",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "def build_mnist_model(x, y_):\n",
+        "  y = tf.keras.layers.Dense(NUM_CLASSES).apply(x)\n",
+        "\n",
+        "  cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)\n",
+        "  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)\n",
+        "\n",
+        "  return y, train_step"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "7Jh3lyQHDfM9"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Enable XLA\n",
+        "\n",
+        "Use `xla.compile` with the `build_mnist_model` function to enable XLA. Following code block wraps the model with `xla.compile()`, which allows the target function with provided inputs to be executed by XLA."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "kYpCXCdRHNuN",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "[y] = xla.compile(build_mnist_model, inputs=[images, labels])"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "4giQh62IrZGF"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "When compiling the graph, XLA replaces all the graph nodes constructed in the target function with a few XLA ops.\n",
+        "\n",
+        "xla.compile does not return any\n",
+        "`tf.Operation` nodes that can be executed independently from the generated XLA ops. Instead, returned `tf.Operation` nodes from the target function are added as control dependencies of all returned `tf.Tensor` values. This triggers execution of the `tf.Operation` nodes when the returned tensors are evaluated.\n",
+        "\n",
+        "In pseudo-code, xla.compile's implementation looks as follows:\n",
+        "\n",
+        "---\n",
+        "```\n",
+        "# Ask Tensorflow to execute code in XLA-friendly manner\n",
+        "\n",
+        "y, train_step = build_mnist_model(images, labels)\n",
+        "with tf.control_dependencies([train_step]):\n",
+        "  y = tf.identity(y)\n",
+        "\n",
+        "# Ask Tensorflow to STOP executing code in XLA-friendly manner\n",
+        "```\n",
+        "---\n",
+        "\n",
+        "xla.compile() always returns a list of `tf.Tensor`'s (even if there is only one-element)."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "TPGas4jjFLZl"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "If you were to print the constructed graph now, you will see that it is not much different from a normal Tensorflow graph and you won't be able to find XLA ops mentioned before. This is because the actual compilation happens later when you try to execute the graph with `sess.run()`.  At that time, Tensorflow triggers a series of graph rewrite passes that actually generate XLA ops, which compiles and executes computation when all inputs are ready."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "EZD1m_n1DxAF"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Train and test the model"
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "qe28bAHNHUG2",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Creates session and initialize all variables.\n",
+        "# xla.compile() doesn't work with Keras model.fit() API or TF eager mode yet.\n",
+        "sess = tf.Session()\n",
+        "sess.run(tf.global_variables_initializer())"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "qgsKmz3n2UiW"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Following code block trains model. Evaluating `y` also triggers its control dependency node `train_step`, which updates model variables."
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "_GxF6jTRHVuA",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "fbf299ca-02d5-4e95-f9fe-8f3c0432d132"
+      },
+      "cell_type": "code",
+      "source": [
+        "# Feeds training dataset\n",
+        "sess.run(iterator.make_initializer(train_ds))\n",
+        "\n",
+        "# Runs TRAIN_STEPS steps\n",
+        "for i in range(TRAIN_STEPS):\n",
+        "  sess.run(y)\n",
+        "\n",
+        "print(\"Model trained for %s steps.\" % TRAIN_STEPS)"
+      ],
+      "execution_count": 21,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Model trained for 1000 steps.\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "dHlQlRSRHXD1",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "9c3677a2-ec84-406f-9d2c-d722844f3093"
+      },
+      "cell_type": "code",
+      "source": [
+        "# Tests trained model\n",
+        "\n",
+        "# Feeds testing dataset\n",
+        "sess.run(iterator.make_initializer(test_ds))\n",
+        "\n",
+        "# Calculates accuracy\n",
+        "correct_prediction = tf.equal(tf.argmax(y, 1), labels)\n",
+        "accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))\n",
+        "print(\"Prediction accuracy after training: %s\" % sess.run(accuracy))"
+      ],
+      "execution_count": 22,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Prediction accuracy after training: 0.91\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "ynJQIuzjHYOb",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Cleans up session\n",
+        "sess.close()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index 3fadabcf5207097aa875d654320b930b1ed94ad3..2a0241af3ef359c4d1c6c1ab9319b5b293110f7a 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -29,8 +29,6 @@ namespace xla {
 /* static */ int64 IndexUtil::MultidimensionalIndexToLinearIndex(
     const Shape& shape, absl::Span<const int64> multi_index) {
   DCHECK_EQ(shape.dimensions_size(), multi_index.size());
-  // Padding and nested layouts not supported yet.
-  DCHECK_EQ(0, shape.layout().padded_dimensions_size());
 
   for (size_t i = 0; i < multi_index.size(); ++i) {
     DCHECK_GE(multi_index[i], 0);
@@ -94,8 +92,6 @@ namespace xla {
 
 /* static */ std::vector<int64> IndexUtil::LinearIndexToMultidimensionalIndex(
     const Shape& shape, int64 linear_index) {
-  // Padding and nested layouts not supported yet.
-  DCHECK_EQ(0, shape.layout().padded_dimensions_size());
   DCHECK_GE(linear_index, 0);
   DCHECK_LT(linear_index, ShapeUtil::ElementsIn(shape));
 
@@ -133,18 +129,12 @@ namespace xla {
 
 /* static */ int64 IndexUtil::GetDimensionStride(const Shape& shape,
                                                  int64 dimension) {
-  int64 pdim_size = LayoutUtil::PaddedDimensions(shape).size();
   int64 stride = 1;
-  DCHECK(pdim_size == 0 || pdim_size == shape.dimensions_size());
   for (auto dim : LayoutUtil::MinorToMajor(shape)) {
     if (dim == dimension) {
       break;
     }
-    if (pdim_size == 0) {
-      stride *= shape.dimensions(dim);
-    } else {
-      stride *= LayoutUtil::PaddedDimension(shape, dim);
-    }
+    stride *= shape.dimensions()[dim];
   }
   return stride;
 }
diff --git a/tensorflow/compiler/xla/index_util.h b/tensorflow/compiler/xla/index_util.h
index 2979cf87dde92893ce2151cb09b46c8db8473b31..d76f61eb62c0fc89d6bc3ca2033e8c7170f30e78 100644
--- a/tensorflow/compiler/xla/index_util.h
+++ b/tensorflow/compiler/xla/index_util.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/macros.h"
@@ -61,8 +62,7 @@ class IndexUtil {
   static bool BumpIndices(const Shape& shape, absl::Span<int64> indices);
 
   // Calculates the stride size (in number of elements, not byte size) of a
-  // given logical shape dimension (from 0 to rank-1). If available, padded
-  // dimensions are used.
+  // given logical shape dimension (from 0 to rank-1).
   // Example:
   //  GetDimensionStride(F32[5,8,10,4]{3,2,1,0}, 1) ==
   //    sizeof(dimension(3)) * sizeof(dimension(2)) == 4 * 10
diff --git a/tensorflow/compiler/xla/index_util_test.cc b/tensorflow/compiler/xla/index_util_test.cc
index 93522d2ca87a7eba8d3c7533785c54e63ce507b0..fa94d0afb4c9280b8f8fa9642c1b0ab7285ee6f3 100644
--- a/tensorflow/compiler/xla/index_util_test.cc
+++ b/tensorflow/compiler/xla/index_util_test.cc
@@ -24,8 +24,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
-void SetMinorToMajorLayout(Shape* shape,
-                           std::initializer_list<int64> dimensions) {
+void SetMinorToMajorLayout(Shape* shape, std::vector<int64> dimensions) {
   shape->mutable_layout()->clear_minor_to_major();
   for (auto dimension : dimensions) {
     shape->mutable_layout()->add_minor_to_major(dimension);
@@ -122,7 +121,7 @@ TEST(IndexUtilTest, LinearToMultiToLinear) {
   std::vector<int64> linear_indexes = {0,        1439999999, 1145567336,
                                        43883404, 617295214,  1117613654};
 
-  std::vector<std::initializer_list<int64>> minor_to_major_orders;
+  std::vector<std::vector<int64>> minor_to_major_orders;
   minor_to_major_orders.push_back({6, 5, 4, 3, 2, 1, 0});
   minor_to_major_orders.push_back({0, 1, 2, 3, 4, 5, 6});
   minor_to_major_orders.push_back({4, 5, 1, 2, 6, 0, 3});
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index d310335618ded7b581e6ed632223218585bb791f..dbb81381acde645f08639737b6e7b6f6ad971f9b 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -65,6 +65,12 @@ void SetDefaultLayoutToContainer(
   return layout;
 }
 
+/* static */ Layout LayoutUtil::MakeDescendingLayout(int64 rank) {
+  std::vector<int64> layout(rank);
+  std::iota(layout.rbegin(), layout.rend(), static_cast<int64>(0));
+  return MakeLayout(layout);
+}
+
 /* static */ Layout LayoutUtil::MakeLayoutFromMajorToMinor(
     absl::Span<const int64> major_to_minor) {
   Layout layout;
@@ -156,18 +162,23 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   LayoutUtil::SetToDefaultLayout(program_shape->mutable_result());
 }
 
-/* static */ Status LayoutUtil::ValidateLayoutInShape(const Shape& shape) {
+/* static */ Status LayoutUtil::ValidateLayoutInShape(
+    const Shape& shape, bool allow_missing_layouts) {
   if (ShapeUtil::IsTuple(shape)) {
     // Tuple shape.
     if (shape.has_layout()) {
       return InvalidArgument("tuple should not have a layout field");
     }
     for (auto& element_shape : shape.tuple_shapes()) {
-      TF_RETURN_IF_ERROR(ValidateLayoutInShape(element_shape));
+      TF_RETURN_IF_ERROR(
+          ValidateLayoutInShape(element_shape, allow_missing_layouts));
     }
     return Status::OK();
   } else if (ShapeUtil::IsArray(shape)) {
     if (!shape.has_layout()) {
+      if (allow_missing_layouts) {
+        return Status::OK();
+      }
       return InvalidArgument("shape %s does not have a layout",
                              ShapeUtil::HumanString(shape));
     }
@@ -190,8 +201,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   }
 
   if (!ShapeUtil::IsArray(shape)) {
-    if (layout.minor_to_major_size() != 0 ||
-        layout.padded_dimensions_size() != 0) {
+    if (layout.minor_to_major_size() != 0) {
       return InvalidArgument(
           "shape of primitive type %s should not have a non-trivial layout",
           PrimitiveType_Name(shape.element_type()));
@@ -199,10 +209,10 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     return Status::OK();
   }
 
-  if (layout.format() == INVALID_FORMAT) {
+  if (layout.format() == INVALID_FORMAT || !Format_IsValid(layout.format())) {
     return InvalidArgument(
-        "Layout does not have a valid format: layout {%s}, shape {%s}",
-        layout.ShortDebugString(), shape.ShortDebugString());
+        "Layout has an invalid format (%d) in layout {%s}, shape {%s}",
+        layout.format(), layout.ShortDebugString(), shape.ShortDebugString());
   }
 
   if (layout.format() == DENSE) {
@@ -230,28 +240,6 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       }
       dimensions_in_layout[dim] = true;
     }
-
-    if (layout.padded_dimensions_size() > 0) {
-      if (layout.padded_dimensions_size() != ShapeUtil::Rank(shape)) {
-        return InvalidArgument(
-            "layout has %d padded dimensions, but shape is rank %d",
-            layout.padded_dimensions_size(), ShapeUtil::Rank(shape));
-      }
-      for (int i = 0; i < layout.padded_dimensions_size(); ++i) {
-        if (layout.padded_dimensions(i) < shape.dimensions(i)) {
-          return InvalidArgument(
-              "for dimension %d, dimension padding (%d) is smaller than "
-              "the dimension size (%d) of the shape",
-              i, layout.padded_dimensions(i), shape.dimensions(i));
-        }
-      }
-    }
-  }
-
-  if (layout.format() == SPARSE) {
-    if (!layout.padded_dimensions().empty()) {
-      return InvalidArgument("Sparse layout has padded dimensions");
-    }
   }
 
   return Status::OK();
@@ -292,38 +280,6 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
                         layout.minor_to_major().end(), std::greater<int64>());
 }
 
-/* static */ bool LayoutUtil::IsPadded(const Shape& shape) {
-  if (!ShapeUtil::IsArray(shape) || !HasLayout(shape) ||
-      shape.layout().padded_dimensions_size() == 0) {
-    return false;
-  }
-  CHECK(IsDenseArray(shape)) << shape.ShortDebugString();
-  CHECK_EQ(shape.dimensions_size(), shape.layout().padded_dimensions_size());
-  for (int64 i = 0; i < shape.dimensions_size(); ++i) {
-    if (shape.layout().padded_dimensions(i) > shape.dimensions(i)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-/* static */ absl::Span<const int64> LayoutUtil::PaddedDimensions(
-    const Shape& shape) {
-  CHECK(IsDenseArray(shape));
-  return AsInt64Slice(shape.layout().padded_dimensions());
-}
-
-/* static */ int64 LayoutUtil::PaddedDimension(const Shape& shape,
-                                               int64 index) {
-  CHECK(IsDenseArray(shape));
-  return shape.layout().padded_dimensions(index);
-}
-
-/* static */ PaddingValue LayoutUtil::GetPaddingValue(const Shape& shape) {
-  CHECK(IsDenseArray(shape));
-  return shape.layout().padding_value();
-}
-
 /* static */ bool LayoutUtil::IsSparseArray(const Shape& shape) {
   return ShapeUtil::IsArray(shape) && shape.has_layout() &&
          IsSparse(shape.layout());
@@ -502,14 +458,14 @@ std::ostream& operator<<(std::ostream& out, const Layout& layout) {
   for (int64 minor_to_major : layout.minor_to_major()) {
     hash_value = Hash64Combine(hash_value, hash<int64>()(minor_to_major));
   }
+  hash_value = Hash64Combine(hash_value, layout.max_sparse_elements());
 
-  for (int64 padded_dim : layout.padded_dimensions()) {
-    hash_value = Hash64Combine(hash_value, hash<int64>()(padded_dim));
+  for (Tile tile : layout.tiles()) {
+    for (int64 tile_dim : tile.dimensions()) {
+      hash_value = Hash64Combine(hash_value, hash<int64>()(tile_dim));
+    }
   }
-
-  hash_value =
-      Hash64Combine(hash_value, hash<PaddingValue>()(layout.padding_value()));
-  hash_value = Hash64Combine(hash_value, layout.max_sparse_elements());
+  hash_value = Hash64Combine(hash_value, layout.element_size_in_bits());
 
   return hash_value;
 }
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index b78883c2d870043032306637730c4666665125a8..6c298e57252449ce3f1f9055436e918f2d9f17f1 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -40,6 +41,10 @@ class LayoutUtil {
   static Layout MakeLayoutFromMajorToMinor(
       absl::Span<const int64> major_to_minor);
 
+  // Returns a layout with descending ((i.e. {n, n-1, ..., 0}) minor-to-major
+  // dimensions.
+  static Layout MakeDescendingLayout(int64 rank);
+
   // Creates a sparse layout with the given maximum number of elements. (This is
   // a convenience function for protobuf construction.)
   static Layout MakeSparseLayout(int64 max_sparse_elements);
@@ -64,8 +69,11 @@ class LayoutUtil {
   // default.
   static void SetToDefaultLayout(ProgramShape* program_shape);
 
-  // Validates that the layout within the given shape is correct.
-  static Status ValidateLayoutInShape(const Shape& shape);
+  // Validates that the layout within the given shape is correct. The check
+  // is performed for all subshapes as well. If missing layouts are allowed
+  // the check does not fail on array shapes without layouts.
+  static Status ValidateLayoutInShape(const Shape& shape,
+                                      bool allow_missing_layouts = false);
 
   // Validates that the provided layout satisfies invariants for the given
   // shape.
@@ -97,23 +105,6 @@ class LayoutUtil {
   //        more minor, and so on until dimension N-1 which is the minor.
   static bool IsMonotonicWithDim0Major(const Layout& layout);
 
-  // Returns whether the layout of the given shape has padding (a
-  // padded_dimension value in Layout is greater than the corresponding
-  // dimension size).
-  static bool IsPadded(const Shape& shape);
-
-  // Returns the padded_dimensions array for the given Shape.  Requires that the
-  // shape is an array and has a dense layout.
-  static absl::Span<const int64> PaddedDimensions(const Shape& shape);
-
-  // Returns the given index of the padded_dimensions array for the given Shape.
-  // Requires that the shape is an array and has a dense layout.
-  static int64 PaddedDimension(const Shape& shape, int64 index);
-
-  // Returns the padding_value for the given Shape.  Requires that the shape is
-  // an array and has a dense layout.
-  static PaddingValue GetPaddingValue(const Shape& shape);
-
   // Returns whether the given Shape is an array (i.e. not a tuple) and has a
   // sparse format layout.
   static bool IsSparseArray(const Shape& shape);
diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc
index f25dae6ff411133c74502039f441060f1329ffd4..12ce2d2d7c6fa8c590035f9ff2af50001ccf80d8 100644
--- a/tensorflow/compiler/xla/layout_util_test.cc
+++ b/tensorflow/compiler/xla/layout_util_test.cc
@@ -304,30 +304,6 @@ TEST_F(LayoutUtilTest, SetToDefaultLayoutTuple) {
       shape.tuple_shapes(1).layout()));
 }
 
-TEST_F(LayoutUtilTest, IsPadded) {
-  Shape shape_without_layout = ShapeUtil::MakeShape(F32, {2, 3, 4});
-  LayoutUtil::ClearLayout(&shape_without_layout);
-  EXPECT_FALSE(LayoutUtil::IsPadded(shape_without_layout));
-
-  Shape shape_with_layout = ShapeUtil::MakeShape(F32, {2, 3, 4});
-  LayoutUtil::SetToDefaultLayout(&shape_with_layout);
-  EXPECT_FALSE(LayoutUtil::IsPadded(shape_with_layout));
-
-  // Add padding equal to the dimension sizes. In this case the padding is a
-  // nop.
-  Shape shape_with_degenerate_padding = ShapeUtil::MakeShape(F32, {2, 3, 4});
-  shape_with_degenerate_padding.mutable_layout()->add_padded_dimensions(2);
-  shape_with_degenerate_padding.mutable_layout()->add_padded_dimensions(3);
-  shape_with_degenerate_padding.mutable_layout()->add_padded_dimensions(4);
-  EXPECT_FALSE(LayoutUtil::IsPadded(shape_with_degenerate_padding));
-
-  Shape shape_with_padding = ShapeUtil::MakeShape(F32, {2, 3, 4});
-  shape_with_padding.mutable_layout()->add_padded_dimensions(2);
-  shape_with_padding.mutable_layout()->add_padded_dimensions(14);
-  shape_with_padding.mutable_layout()->add_padded_dimensions(42);
-  EXPECT_TRUE(LayoutUtil::IsPadded(shape_with_padding));
-}
-
 TEST_F(LayoutUtilTest, DefaultLayoutGettersMajorToMinor) {
   EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({1, 0}),
                                 LayoutUtil::GetDefaultLayoutForR2()));
@@ -352,5 +328,92 @@ TEST_F(LayoutUtilTest, StreamOut) {
   EXPECT_EQ(oss.str(), "{0,1,2}");
 }
 
+TEST_F(LayoutUtilTest, ValidateLayout_ValidArrayLayout) {
+  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {0, 1});
+  auto status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/false);
+  EXPECT_TRUE(status.ok());
+  status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/true);
+  EXPECT_TRUE(status.ok());
+}
+
+TEST_F(LayoutUtilTest, ValidateLayout_InvalidArrayLayout) {
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
+  *shape.mutable_layout() = LayoutUtil::MakeLayout({0, 1, 2});
+  auto status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/false);
+  EXPECT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("layout minor_to_major field "
+                                   "contains 3 elements, but shape is rank 2"));
+  status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/true);
+  EXPECT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("layout minor_to_major field "
+                                   "contains 3 elements, but shape is rank 2"));
+}
+
+TEST_F(LayoutUtilTest, ValidateLayout_MissingArrayLayout) {
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
+  LayoutUtil::ClearLayout(&shape);
+  auto status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/false);
+  EXPECT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("shape f32[2,3] does not have a layout"));
+  status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/true);
+  EXPECT_TRUE(status.ok());
+}
+
+TEST_F(LayoutUtilTest, ValidateLayout_TupleWithLayout) {
+  Shape shape = ShapeUtil::MakeTupleShape({});
+  *shape.mutable_layout() = LayoutUtil::MakeLayout({0});
+  auto status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/false);
+  EXPECT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("tuple should not have a layout field"));
+  status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/true);
+  EXPECT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("tuple should not have a layout field"));
+}
+
+TEST_F(LayoutUtilTest, ValidateLayout_TupleSubshapesWithMissingLayouts) {
+  Shape sub_1_1_1 = ShapeUtil::MakeShape(F32, {1, 2});
+  Shape sub_1_1 = ShapeUtil::MakeTupleShape({sub_1_1_1});
+  Shape sub_1_2 = ShapeUtil::MakeShape(F32, {1, 2});
+  LayoutUtil::ClearLayout(&sub_1_2);
+  Shape sub_1 = ShapeUtil::MakeTupleShape({sub_1_1, sub_1_2});
+  Shape sub_2_1 = ShapeUtil::MakeShape(F32, {9});
+  LayoutUtil::ClearLayout(&sub_2_1);
+  Shape sub_2 = ShapeUtil::MakeTupleShape({sub_2_1});
+  Shape shape = ShapeUtil::MakeTupleShape({sub_1, sub_2});
+
+  auto status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/false);
+  EXPECT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("shape f32[1,2] does not have a layout"));
+  status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/true);
+  EXPECT_TRUE(status.ok());
+
+  // Add invalid layout on one of sub-shapes.
+  *shape.mutable_tuple_shapes(1)->mutable_tuple_shapes(0)->mutable_layout() =
+      LayoutUtil::MakeLayout({0, 2, 3});
+
+  status =
+      LayoutUtil::ValidateLayoutInShape(shape, /*allow_missing_layouts=*/true);
+  EXPECT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("layout minor_to_major field "
+                                   "contains 3 elements, but shape is rank 1"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD
deleted file mode 100644
index 3e79129aafd234e5eab05d205f2017b54057795e..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/BUILD
+++ /dev/null
@@ -1,82 +0,0 @@
-# Legacy command-line flags for the XLA libraries.
-
-# Please do not add more flags to this package.
-
-# The XLA libraries were written in an environment that allowed command-line
-# flags to be scattered freely throughout the libraries.  This model, while
-# initially convenient, leads to a proliferation in unused command-line flags
-# in tests and binaries, and serious problems in servers, where one might wish
-# parameters to be different in independent RPC calls to the same routine.
-#
-# Please don't add more flags.  If you're a library author, pass options and
-# parameters explicitly through the library's interface.
-
-package(default_visibility = ["//tensorflow:internal"])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-cc_library(
-    name = "parse_flags_from_env",
-    srcs = ["parse_flags_from_env.cc"],
-    hdrs = ["parse_flags_from_env.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla:types",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "@com_google_absl//absl/strings",
-        ],
-)
-
-tf_cc_test(
-    name = "parse_flags_from_env_test",
-    srcs = ["parse_flags_from_env_test.cc"],
-    deps =
-        [
-            ":parse_flags_from_env",
-            "//tensorflow/compiler/xla:types",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "//tensorflow/core:test",
-            "@com_google_absl//absl/strings:str_format",
-        ],
-)
-
-cc_library(
-    name = "debug_options_flags",
-    srcs = [
-        "debug_options_flags.cc",
-        "debug_options_parsers.h",
-    ],
-    hdrs = ["debug_options_flags.h"],
-    deps =
-        [
-            ":parse_flags_from_env",
-            "//tensorflow/compiler/xla:xla_proto",
-            "//tensorflow/compiler/xla/service:hlo",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "@com_google_absl//absl/strings",
-        ],
-)
-
-tf_cc_test(
-    name = "debug_options_parsers_test",
-    size = "small",
-    srcs = [
-        "debug_options_parsers.h",
-        "debug_options_parsers_test.cc",
-    ],
-    deps =
-        [
-            "//tensorflow/compiler/xla:xla_proto",
-            "//tensorflow/compiler/xla/service:hlo",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "//tensorflow/core:test",
-            "@com_google_absl//absl/strings",
-            "@com_google_absl//absl/strings:str_format",
-        ],
-)
diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.cc b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.cc
deleted file mode 100644
index 2a4e49b05aa0d1eed2197095694cfc6aa8814983..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.cc
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This module exports ParseFlagsFromEnv(), which allows other modules to parse
-// flags from an environtment variable, or a file named by the environment
-// variable.
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <vector>
-
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-static const char kEnvVar[] = "TF_XLA_FLAGS";  // environment variable queried
-static const char kWS[] = " \t\r\n";           // whitespace
-
-// The following struct represents an argv[]-style array, parsed
-// from data gleaned from the environment.
-//
-// As usual, an anonymous namespace is advisable to avoid
-// constructor/destructor collisions with other "private" types
-// in the same named namespace.
-namespace {
-struct EnvArgv {
-  EnvArgv() : initialized(false), argc(0) {}
-  bool initialized;         // whether the other fields have been set.
-  int argc;                 // elements used in argv[]
-  std::vector<char*> argv;  // flag arguments parsed from environment string.
-  std::vector<char*> argv_save;  // saved values from argv[] to avoid leaks
-};
-}  // anonymous namespace
-
-// Append the string s0[0, .., s0len-1] concatenated with s1[0, .., s1len-1] as
-// a newly allocated nul-terminated string to the array *a.  If s0==nullptr, a
-// nullptr is appended without increasing a->argc.
-static void AppendToEnvArgv(const char* s0, size_t s0len, const char* s1,
-                            size_t s1len, EnvArgv* a) {
-  if (s0 == nullptr) {
-    a->argv.push_back(nullptr);
-    a->argv_save.push_back(nullptr);
-  } else {
-    string s = string(s0, s0len) + string(s1, s1len);
-    char* str = strdup(s.c_str());
-    a->argv.push_back(str);
-    a->argv_save.push_back(str);
-    a->argc++;
-  }
-}
-
-// Like s.find_first_of(x, pos), but return s.size() when find_first_of() would
-// return string::npos.  This avoids if-statements elsewhere.
-static size_t FindFirstOf(const string& s, const char* x, size_t pos) {
-  size_t result = s.find_first_of(x, pos);
-  return result == string::npos ? s.size() : result;
-}
-
-// Like s.find_first_not_of(x, pos), but return s.size() when
-// find_first_not_of() would return string::npos.  This avoids if-statements
-// elsewhere.
-static size_t FindFirstNotOf(const string& s, const char* x, size_t pos) {
-  size_t result = s.find_first_not_of(x, pos);
-  return result == string::npos ? s.size() : result;
-}
-
-// Given a string containing flags, parse them into the XLA command line flags.
-// The parse is best effort, and gives up on the first syntax error.
-static void ParseArgvFromString(const string& flag_str, EnvArgv* a) {
-  size_t b = FindFirstNotOf(flag_str, kWS, 0);
-  while (b != flag_str.size() && flag_str[b] == '-') {
-    // b is the index of the start of a flag.
-    // Set e to the index just past the end of the flag.
-    size_t e = b;
-    while (e != flag_str.size() && isascii(flag_str[e]) &&
-           (strchr("-_", flag_str[e]) != nullptr || isalnum(flag_str[e]))) {
-      e++;
-    }
-    if (e != flag_str.size() && flag_str[e] == '=' &&
-        e + 1 != flag_str.size() && strchr("'\"", flag_str[e + 1]) != nullptr) {
-      // A flag of the form  --flag="something in double or single quotes"
-      int c;
-      e++;  // point just past '='
-      size_t eflag = e;
-      char quote = flag_str[e];
-      e++;  // point just past quote
-      // Put in value the string with quotes removed.
-      string value;
-      for (; e != flag_str.size() && (c = flag_str[e]) != quote; e++) {
-        if (quote == '"' && c == '\\' && e + 1 != flag_str.size()) {
-          // Handle backslash in double quoted strings.  They are literal in
-          // single-quoted strings.
-          e++;
-          c = flag_str[e];
-        }
-        value += c;
-      }
-      if (e != flag_str.size()) {  // skip final " or '
-        e++;
-      }
-      AppendToEnvArgv(flag_str.data() + b, eflag - b, value.data(),
-                      value.size(), a);
-    } else {  // A flag without a quoted value.
-      e = FindFirstOf(flag_str, kWS, e);
-      AppendToEnvArgv(flag_str.data() + b, e - b, "", 0, a);
-    }
-    b = FindFirstNotOf(flag_str, kWS, e);
-  }
-}
-
-// Call ParseArgvFromString(..., a) on a string derived from the setting of an
-// environment variable kEnvVar, or a file it points to.
-static void SetArgvFromEnv(EnvArgv* a) {
-  if (!a->initialized) {
-    static const char kDummyArgv[] = "<argv[0]>";
-    AppendToEnvArgv(kDummyArgv, strlen(kDummyArgv), nullptr, 0,
-                    a);  // dummy argv[0]
-    const char* env = getenv(kEnvVar);
-    if (env == nullptr || env[0] == '\0') {
-      // nothing
-    } else if (env[strspn(env, kWS)] == '-') {  // flags in env var value
-      ParseArgvFromString(env, a);
-    } else {  // assume it's a file name
-      FILE* fp = fopen(env, "r");
-      if (fp != nullptr) {
-        string str;
-        char buf[512];
-        int n;
-        while ((n = fread(buf, 1, sizeof(buf), fp)) > 0) {
-          str.append(buf, n);
-        }
-        fclose(fp);
-        ParseArgvFromString(str, a);
-      }
-    }
-    AppendToEnvArgv(nullptr, 0, nullptr, 0, a);  // add trailing nullptr to *a.
-    a->initialized = true;
-  }
-}
-
-// The simulated argv[] parsed from the environment.
-static EnvArgv* env_argv;
-
-// Used to protect accesses to env_argv.
-static tensorflow::mutex env_argv_mu(tensorflow::LINKER_INITIALIZED);
-
-// Call Flags::Parse(argc, argv, flag_list) against any as yet unrecognized
-// flags passed in from the environment.
-bool ParseFlagsFromEnv(const std::vector<tensorflow::Flag>& flag_list) {
-  env_argv_mu.lock();
-  if (env_argv == nullptr) {
-    env_argv = new EnvArgv;
-  }
-  SetArgvFromEnv(env_argv);  // a no-op if already initialized
-  bool result =
-      tensorflow::Flags::Parse(&env_argv->argc, &env_argv->argv[0], flag_list);
-  env_argv_mu.unlock();
-  return result;
-}
-
-// Testing only.
-// Reset the env_argv struct so that subsequent calls to ParseFlagsFromEnv()
-// will parse the environment variable (or the file it points to) anew, and set
-// *pargc, and *pargv to point to the internal locations of the argc and argv
-// constructed from the environment.
-void ResetFlagsFromEnvForTesting(int** pargc, std::vector<char*>** pargv) {
-  env_argv_mu.lock();
-  if (env_argv == nullptr) {
-    env_argv = new EnvArgv;
-  }
-  if (!env_argv->argv_save.empty()) {
-    for (int i = 0; env_argv->argv_save[i] != nullptr; i++) {
-      free(env_argv->argv_save[i]);
-    }
-  }
-  env_argv->initialized = false;
-  env_argv->argc = 0;
-  env_argv->argv.clear();
-  env_argv->argv_save.clear();
-  env_argv_mu.unlock();
-  *pargc = &env_argv->argc;
-  *pargv = &env_argv->argv;
-}
-
-}  // namespace legacy_flags
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h b/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h
deleted file mode 100644
index b54482ad2ba2224c781861341a80ceb878ffd343..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_PARSE_FLAGS_FROM_ENV_H_
-#define TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_PARSE_FLAGS_FROM_ENV_H_
-
-// This module exports ParseFlagsFromEnv(), which allows other modules to parse
-// flags from the environtment variable TF_XLA_FLAGS, or (if the first
-// non-whitespace in the variable value is not '-'), a file named by that
-// environment variable.  The accepted syntax is that flags arguments are of
-// the form --flag=value or (for boolean flags) --flag, and are whitespace
-// separated.  The <value> may be one of:
-// - <non-whitespace, non-nul not starting with single-quote or double-quote>
-//   in which case the effective value is the string itself
-// - <single-quote><characters string not containing nul or
-//   single-quote><single_quote> in which case the effective value is the
-//   string with the single-quotes removed
-// - <double-quote><character string not containing nul or unesecaped
-//   double-quote><double_quote> in which case the effective value if the
-//   string with the double-quotes removed, and escaped sequences of
-//   <backslash><char> replaced by <char>.
-//
-// Flags values inconsistent with the type of the flag will be rejected by the
-// flag parser.
-//
-// Examples:
-//    TF_XLA_FLAGS="--foo=bar  --wombat='value with a space'"
-//
-//    TF_XLA_FLAGS=/tmp/flagfile
-// where /tmp/flagfile might contain
-//    --some_flag="This is a string containing a \" and a '."
-//    --another_flag=wombats
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace xla {
-namespace legacy_flags {
-
-// Call tensorflow::Flags::Parse(argc, argv, flag_list) against any as yet
-// unrecognized flags passed in from the environment, and return its
-// return value.
-bool ParseFlagsFromEnv(const std::vector<tensorflow::Flag>& flag_list);
-
-// Used only for testing.  Not to be used by clients.
-void ResetFlagsFromEnvForTesting(int** pargc, std::vector<char*>** pargv);
-
-}  // namespace legacy_flags
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_LEGACY_FLAGS_PARSE_FLAGS_FROM_ENV_H_
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 656ce720a13d5c9622e9dc05ae04ddcac8cbeee5..8f480c1f1079b4e1a5be53958ebdf6e004ad9ebe 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -22,16 +22,17 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
@@ -62,6 +63,14 @@ void ConvertEndianShort(char* bytes, int64 size) {
   }
 }
 
+// Since Eigen::half doesn't satisfy the absl::bit_cast contract, we need to be
+// able to transparently access the raw 16-bit value contained within.
+template <typename T>
+T GetRawValue(T val) {
+  return val;
+}
+uint16 GetRawValue(Eigen::half val) { return val.x; }
+
 }  // namespace
 
 LiteralBase::~LiteralBase() {}
@@ -283,13 +292,17 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal,
   if (!proto.has_shape()) {
     return InvalidArgument("LiteralProto has no shape");
   }
-  if (!LayoutUtil::HasLayout(proto.shape())) {
+  Shape shape(proto.shape());
+  if (ShapeUtil::HasPrimitiveType(shape, OPAQUE)) {
+    return InvalidArgument("Literal shape cannot include OPAQUE sub-shape");
+  }
+  if (!LayoutUtil::HasLayout(shape)) {
     return InvalidArgument("LiteralProto has no layout");
   }
 
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(proto.shape()));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
 
-  Literal literal(proto.shape());
+  Literal literal(shape);
 
   TF_RETURN_IF_ERROR(literal.root_piece_->ForEachMutableSubpieceWithStatus(
       [&](const ShapeIndex& index, Piece* piece) {
@@ -1009,167 +1022,143 @@ void LiteralBase::Piece::SortSparseElementsInternal() {
 
 namespace {
 
-void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
-                    bool print_layout, std::vector<string>* pieces) {
-  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
-  CHECK(LayoutUtil::HasLayout(literal.shape()));
-  CHECK(LayoutUtil::HasLayout(subshape));
+string ShapeToString(bool print_layout, const Shape& shape) {
+  return print_layout ? ShapeUtil::HumanStringWithLayout(shape)
+                      : ShapeUtil::HumanString(shape);
+}
 
-  auto shape_to_string = [print_layout](const Shape& shape) {
-    if (print_layout) {
-      return ShapeUtil::HumanStringWithLayout(shape);
-    } else {
-      return ShapeUtil::HumanString(shape);
-    }
-  };
+void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
+                    bool print_layout, std::vector<string>* pieces);
 
-  // TODO(b/32894291): refactor this code to reduce code duplication.
-  if (ShapeUtil::IsTuple(subshape)) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" (\n");
-    std::vector<string> tuple_pieces;
-    for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) {
-      ShapeIndex element_index = shape_index;
-      element_index.push_back(i);
-      std::vector<string> element_pieces;
-      ToStringHelper(literal, element_index, print_layout, &element_pieces);
-      tuple_pieces.push_back(absl::StrJoin(element_pieces, ""));
+void TupleToStringHelper(const LiteralBase& literal,
+                         const ShapeIndex& shape_index, bool print_layout,
+                         std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  pieces->push_back(ShapeToString(print_layout, subshape));
+  pieces->push_back(" (\n");
+  std::vector<string> tuple_pieces;
+  for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) {
+    ShapeIndex element_index = shape_index;
+    element_index.push_back(i);
+    std::vector<string> element_pieces;
+    ToStringHelper(literal, element_index, print_layout, &element_pieces);
+    tuple_pieces.push_back(absl::StrJoin(element_pieces, ""));
+  }
+  pieces->push_back(absl::StrJoin(tuple_pieces, ",\n"));
+  pieces->push_back("\n)");
+}
+
+void SparseArrayToStringHelper(const LiteralBase& literal,
+                               const Shape& subshape, bool print_layout,
+                               std::vector<string>* pieces) {
+  pieces->push_back(ShapeToString(print_layout, subshape));
+  pieces->push_back("{");
+  int64 rank = ShapeUtil::Rank(subshape);
+  int64 num_elements = literal.sparse_element_count();
+  for (int64 i = 0; i < num_elements; ++i) {
+    if (i > 0) {
+      pieces->push_back(", ");
     }
-    pieces->push_back(absl::StrJoin(tuple_pieces, ",\n"));
-    pieces->push_back("\n)");
-    return;
-  }
-
-  if (ShapeUtil::IsToken(subshape)) {
-    pieces->push_back("token");
-    return;
-  }
-
-  if (LayoutUtil::IsSparseArray(subshape)) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back("{");
-    int64 rank = ShapeUtil::Rank(subshape);
-    int64 num_elements = literal.sparse_element_count();
-    for (int64 i = 0; i < num_elements; ++i) {
-      if (i > 0) {
-        pieces->push_back(", ");
-      }
-      if (rank == 1) {
-        pieces->push_back(StrCat(literal.GetSparseIndex(i)[0]));
-        pieces->push_back(": ");
-      } else {
-        pieces->push_back("[");
-        pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", "));
-        pieces->push_back("]: ");
-      }
-      pieces->push_back(literal.GetSparseElementAsString(i));
+    if (rank == 1) {
+      pieces->push_back(StrCat(literal.GetSparseIndex(i)[0]));
+      pieces->push_back(": ");
+    } else {
+      pieces->push_back("[");
+      pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", "));
+      pieces->push_back("]: ");
     }
-    pieces->push_back("}");
-    return;
+    pieces->push_back(literal.GetSparseElementAsString(i));
   }
+  pieces->push_back("}");
+}
 
-  CHECK(LayoutUtil::IsDenseArray(subshape));
-
-  auto element_to_string = [&](absl::Span<const int64> indices) -> string {
-    PrimitiveType element_type = subshape.element_type();
-    if (element_type == PRED) {
-      // We display predicates in a densely packed form.
-      return literal.Get<bool>(indices, shape_index) ? "1" : "0";
-    }
-    return ((!indices.empty() && indices.back() > 0) ? ", " : "") +
-           literal.GetAsString(indices, shape_index);
-  };
+void DenseArrayToStringHelper(const LiteralBase& literal,
+                              const ShapeIndex& shape_index, bool print_layout,
+                              std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  int64 rank = ShapeUtil::Rank(subshape);
+
+  std::function<void(absl::Span<const int64> dimensions, std::vector<int64>*)>
+      to_string_recursive = [&](absl::Span<const int64> dimensions,
+                                std::vector<int64>* accum_indices) {
+        // dimensions.size() decreases by 1 at each recursive call,
+        // and accum_indices->size() increases by 1.
+        // Their sum is equal to the rank of the tensor.
+        CHECK_EQ(rank, dimensions.size() + accum_indices->size());
+
+        auto brace_to_string = [&](string brace) -> string {
+          // Handle 1D tensor
+          if (rank == 1) {
+            return brace;
+          }
+          // Handle the innermost tensor of a 2D+ tensor.
+          if (dimensions.size() == 1 && brace == "{") {
+            return StrCat("  ", brace, dimensions[0] <= 1 ? "" : " ");
+          }
+          if (dimensions.size() == 1 && brace == "}") {
+            return StrCat(dimensions[0] <= 1 ? "" : " ", brace);
+          }
+          // Handle the non-innermost tensors of a 2D+ tensor.
+          if (brace == "{") {
+            if (rank > 3 && !accum_indices->empty() &&
+                accum_indices->size() < rank) {
+              int index = accum_indices->size() - 1;
+              int value = accum_indices->back();
+              return StrCat(brace, " /*i", index, "=", value, "*/\n");
+            }
+            return StrCat(brace, "\n");
+          }
+          return StrCat("\n", brace);
+        };
 
-  if (ShapeUtil::Rank(subshape) == 0) {
-    pieces->push_back(literal.GetAsString({}, shape_index));
-  } else if (ShapeUtil::Rank(subshape) == 1) {
-    pieces->push_back("{");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(element_to_string({i0}));
-    }
-    pieces->push_back("}");
-  } else if (ShapeUtil::Rank(subshape) == 2) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back("  { ");
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(element_to_string({i0, i1}));
-      }
-      pieces->push_back(" ");
-      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "}\n" : "},\n");
-    }
-    pieces->push_back("}");
-  } else if (ShapeUtil::Rank(subshape) == 3) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(i0 > 0 ? ",\n{" : "{");
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(i1 > 0 ? ",\n  { " : " { ");
-        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
-          pieces->push_back(element_to_string({i0, i1, i2}));
-        }
-        pieces->push_back(" }");
-      }
-      pieces->push_back(" }");
-    }
-    pieces->push_back("\n}");
-  } else if (ShapeUtil::Rank(subshape) == 4) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(StrFormat("  {  /*i0=%d*/\n", i0));
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(StrFormat("    {  /*i1=%d*/\n", i1));
-        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
-          pieces->push_back("      {");
-          for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) {
-            pieces->push_back(element_to_string({i0, i1, i2, i3}));
+        if (dimensions.empty()) {
+          // Display predicates as 0s and 1s so that the string is more dense.
+          string elem;
+          if (subshape.element_type() == PRED && rank > 0) {
+            elem = literal.Get<bool>(*accum_indices, shape_index) ? "1" : "0";
+          } else {
+            elem = literal.GetAsString(*accum_indices, shape_index);
           }
-          pieces->push_back(i2 == subshape.dimensions(2) - 1 ? "}\n" : "},\n");
-        }
-        pieces->push_back(i1 == subshape.dimensions(1) - 1 ? "    }\n"
-                                                           : "    },\n");
-      }
-      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "  }\n" : "  },\n");
-    }
-    pieces->push_back("}");
-  } else if (ShapeUtil::Rank(subshape) == 5) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(StrFormat("  {  /*i0=%d*/\n", i0));
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(StrFormat("    {  /*i1=%d*/\n", i1));
-        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
-          pieces->push_back(StrFormat("      {  /*i2=%d*/\n", i2));
-          for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) {
-            pieces->push_back("        {");
-            for (int64 i4 = 0; i4 < subshape.dimensions(4); ++i4) {
-              pieces->push_back(element_to_string({i0, i1, i2, i3, i4}));
+          pieces->push_back(elem);
+        } else {
+          pieces->push_back(brace_to_string("{"));
+          for (int i = 0; i < dimensions[0]; ++i) {
+            std::vector<int64> cloned_indices(*accum_indices);
+            cloned_indices.push_back(i);
+            to_string_recursive(dimensions.subspan(1), &cloned_indices);
+            if (i < dimensions[0] - 1) {
+              pieces->push_back(",");
+              pieces->push_back(dimensions.size() > 1 ? "\n" : " ");
             }
-            pieces->push_back(i3 == subshape.dimensions(3) - 1 ? "}\n"
-                                                               : "},\n");
           }
-          pieces->push_back(i2 == subshape.dimensions(2) - 1 ? "      }\n"
-                                                             : "      },\n");
+          pieces->push_back(brace_to_string("}"));
         }
-        pieces->push_back(i1 == subshape.dimensions(1) - 1 ? "    }\n"
-                                                           : "    },\n");
-      }
-      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "  }\n" : "  },\n");
-    }
-    pieces->push_back("}");
+      };
+
+  if (rank > 1) {
+    pieces->push_back(ShapeToString(print_layout, subshape));
+    pieces->push_back(" ");
+  }
+  std::vector<int64> indices = {};
+  std::vector<int64> dimensions(subshape.dimensions().begin(),
+                                subshape.dimensions().end());
+  to_string_recursive(dimensions, &indices);
+}
+
+void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
+                    bool print_layout, std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  CHECK(LayoutUtil::HasLayout(literal.shape()));
+  CHECK(LayoutUtil::HasLayout(subshape));
+  if (ShapeUtil::IsTuple(subshape)) {
+    TupleToStringHelper(literal, shape_index, print_layout, pieces);
+  } else if (ShapeUtil::IsToken(subshape)) {
+    pieces->push_back("token");
+  } else if (LayoutUtil::IsSparseArray(subshape)) {
+    SparseArrayToStringHelper(literal, subshape, print_layout, pieces);
   } else {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {");
-    literal.EachCellAsString(
-        [&](absl::Span<const int64> indices, const string& value) {
-          pieces->push_back(" ");
-          pieces->push_back(value);
-        });
-    pieces->push_back("}");
+    CHECK(LayoutUtil::IsDenseArray(subshape));
+    DenseArrayToStringHelper(literal, shape_index, print_layout, pieces);
   }
 }
 
@@ -1226,16 +1215,32 @@ Literal ConvertBetweenNativeTypes(const LiteralBase& src_literal) {
 }
 
 template <typename NativeSrcT, typename NativeDestT>
-typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)),
+typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT) &&
+                         !std::is_same<NativeDestT, Eigen::half>::value),
                         Literal>::type
 BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
   auto converter = [](NativeSrcT src) {
-    return tensorflow::bit_cast<NativeDestT>(src);
+    return absl::bit_cast<NativeDestT>(GetRawValue(src));
   };
   return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
       src_literal, converter);
 }
 
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) == sizeof(Eigen::half) &&
+                         std::is_same<NativeDestT, Eigen::half>::value),
+                        Literal>::type
+BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
+  // Eigen::half doesn't satisfy the absl::bit_cast contract, so explicitly
+  // cast to unsigned short and then use raw_uint16_to_half.
+  auto converter = [](NativeSrcT src) {
+    return Eigen::half_impl::raw_uint16_to_half(
+        absl::bit_cast<uint16>(GetRawValue(src)));
+  };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, Eigen::half>(
+      src_literal, converter);
+}
+
 // This template specialization is here to make the compiler happy. bit_cast has
 // a static check that the types are the same size. This specialization should
 // never be used because the source and destination types are checked for
@@ -1432,10 +1437,14 @@ bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const {
       return EqualElementsInternal<bool>(other, &multi_index);
     case U8:
       return EqualElementsInternal<uint8>(other, &multi_index);
+    case S16:
+      return EqualElementsInternal<int16>(other, &multi_index);
     case S32:
       return EqualElementsInternal<int32>(other, &multi_index);
     case S64:
       return EqualElementsInternal<int64>(other, &multi_index);
+    case U16:
+      return EqualElementsInternal<uint16>(other, &multi_index);
     case U32:
       return EqualElementsInternal<uint32>(other, &multi_index);
     case U64:
@@ -1504,6 +1513,11 @@ bool LiteralBase::IsAll(int8 value) const {
             return AllElementsEqualValue<uint8>(piece.data<uint8>(), value);
           }
           return false;
+        case U16:
+          if (value >= 0) {
+            return AllElementsEqualValue<uint16>(piece.data<uint16>(), value);
+          }
+          return false;
         case U32:
           if (value >= 0) {
             return AllElementsEqualValue<uint32>(piece.data<uint32>(), value);
@@ -1516,6 +1530,8 @@ bool LiteralBase::IsAll(int8 value) const {
           return false;
         case S8:
           return AllElementsEqualValue<int8>(piece.data<int8>(), value);
+        case S16:
+          return AllElementsEqualValue<int16>(piece.data<int16>(), value);
         case S32:
           return AllElementsEqualValue<int32>(piece.data<int32>(), value);
         case S64:
@@ -1737,12 +1753,16 @@ bool LiteralBase::IsZero(absl::Span<const int64> indices) const {
   switch (shape().element_type()) {
     case U8:
       return Get<uint8>(indices) == 0;
+    case U16:
+      return Get<uint16>(indices) == 0;
     case U32:
       return Get<uint32>(indices) == 0;
     case U64:
       return Get<uint64>(indices) == 0;
     case S8:
       return Get<int8>(indices) == 0;
+    case S16:
+      return Get<int16>(indices) == 0;
     case S32:
       return Get<int32>(indices) == 0;
     case S64:
@@ -1775,7 +1795,7 @@ void CopyToRepeatedField(RepeatedFieldT* dest,
 }  // namespace
 
 void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
-  *proto->mutable_shape() = subshape();
+  *proto->mutable_shape() = subshape().ToProto();
   switch (subshape().element_type()) {
     case PRED:
       CopyToRepeatedField(proto->mutable_preds(), data<bool>());
@@ -1800,6 +1820,20 @@ void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
     case S64:
       CopyToRepeatedField(proto->mutable_s64s(), data<int64>());
       break;
+    case U16:
+      *proto->mutable_u16s() = string(
+          reinterpret_cast<const char*>(data<uint16_t>().data()), size_bytes());
+      if (!kLittleEndian) {
+        ConvertEndianShort(proto->mutable_u16s());
+      }
+      break;
+    case S16:
+      *proto->mutable_s16s() = string(
+          reinterpret_cast<const char*>(data<int16_t>().data()), size_bytes());
+      if (!kLittleEndian) {
+        ConvertEndianShort(proto->mutable_s16s());
+      }
+      break;
     case F16:
       *proto->mutable_f16s() = string(
           reinterpret_cast<const char*>(data<half>().data()), size_bytes());
@@ -1867,8 +1901,9 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
   // These conditions should have been checked in
   // MutableLiteralBase::CreateFromProto.
   TF_RET_CHECK(proto.has_shape());
-  TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape()));
-  TF_RET_CHECK(ShapeUtil::Equal(proto.shape(), subshape()));
+  Shape shape(proto.shape());
+  TF_RET_CHECK(LayoutUtil::HasLayout(shape));
+  TF_RET_CHECK(ShapeUtil::Equal(shape, subshape()));
 
   if (LayoutUtil::IsSparseArray(subshape())) {
     // Compute the number of elements (indices) in the sparse shape and reserve
@@ -1914,6 +1949,22 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
     case U64:
       TF_RETURN_IF_ERROR(CopyFromRepeatedField(data<uint64>(), proto.u64s()));
       break;
+    case S16: {
+      const string& s(proto.s16s());
+      TF_RET_CHECK(data<int16_t>().size() * sizeof(int16_t) == s.size());
+      memcpy(untyped_data(), s.data(), s.size());
+      if (!kLittleEndian) {
+        ConvertEndianShort(reinterpret_cast<char*>(untyped_data()), s.size());
+      }
+    } break;
+    case U16: {
+      const string& s(proto.u16s());
+      TF_RET_CHECK(data<uint16_t>().size() * sizeof(uint16_t) == s.size());
+      memcpy(untyped_data(), s.data(), s.size());
+      if (!kLittleEndian) {
+        ConvertEndianShort(reinterpret_cast<char*>(untyped_data()), s.size());
+      }
+    } break;
     case F16: {
       const string& s(proto.f16s());
       TF_RET_CHECK(data<half>().size() * sizeof(half) == s.size());
@@ -1992,7 +2043,7 @@ string LiteralBase::GetR1U8AsString() const {
   CHECK(ShapeUtil::IsArray(shape()));
   CHECK_EQ(ShapeUtil::Rank(shape()), 1);
   CHECK_EQ(shape().element_type(), U8);
-  return string(tensorflow::bit_cast<const char*>(data<uint8>().data()),
+  return string(absl::bit_cast<const char*>(data<uint8>().data()),
                 ShapeUtil::ElementsIn(shape()));
 }
 
diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h
index 3cd3541fe1596600b4f0b43e3011e1f0322ac8fe..fa9a71af4ceb998a7a289443cbef70eb52cb1a11 100644
--- a/tensorflow/compiler/xla/literal.h
+++ b/tensorflow/compiler/xla/literal.h
@@ -301,7 +301,7 @@ class LiteralBase {
   //
   // Note: It's an antipattern to use this method then immediately call
   // MutableLiteralBase::Populate on the result (since that results in zero
-  // initialization, then reinitialization. Conside if a call to
+  // initialization, then reinitialization. Consider if a call to
   // absl::make_unique<Literal>(shape), followed by the call to
   // MutableLiteralBase::Populate can be used instead.
   static Literal CreateFromShape(const Shape& shape);
@@ -979,9 +979,8 @@ inline void MutableLiteralBase::PopulateR1(absl::Span<const NativeT> values) {
   CHECK_EQ(ShapeUtil::ElementsIn(shape()), values.size());
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
-  for (int64 i = 0; i < values.size(); ++i) {
-    Set({i}, values[i]);
-  }
+  auto data_span = data<NativeT>();
+  std::copy(values.begin(), values.end(), data_span.begin());
 }
 
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index 3d8725ed7051cafc97987f25a96004fa876dfdd3..b044f0ad73f13a0599e77f1f43888bc974e31f73 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include <cmath>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/env.h"
 
 using absl::StrAppend;
@@ -34,14 +34,22 @@ namespace xla {
 namespace literal_comparison {
 namespace {
 
+// Since Eigen::half doesn't satisfy the absl::bit_cast contract, we need to be
+// able to transparently access the raw 16-bit value contained within.
+template <typename T>
+T GetRawValue(T val) {
+  return val;
+}
+uint16 GetRawValue(Eigen::half val) { return val.x; }
+
 // Helper function for comparing a floating point type, FloatT, bitwise equal
 // between the left-hand-side and right-hand-side, by bit-casting to UnsignedT
 // -- on miscompare, a nice error message is given in the AssertionFailure.
 template <typename FloatT, typename UnsignedT>
 Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs,
                                  absl::Span<const int64> multi_index) {
-  auto ulhs = tensorflow::bit_cast<UnsignedT>(lhs);
-  auto urhs = tensorflow::bit_cast<UnsignedT>(rhs);
+  auto ulhs = absl::bit_cast<UnsignedT>(GetRawValue(lhs));
+  auto urhs = absl::bit_cast<UnsignedT>(GetRawValue(rhs));
   auto lhs_double = static_cast<double>(lhs);
   auto rhs_double = static_cast<double>(rhs);
   if (ulhs != urhs) {
@@ -133,8 +141,10 @@ int64 RecursiveElementCount(const Shape& shape) {
       total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
     }
     return total;
-  } else {
+  } else if (ShapeUtil::IsArray(shape)) {
     return ShapeUtil::ElementsIn(shape);
+  } else {
+    return 0;
   }
 }
 
diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc
index dd5b54e4c99998f676419cf98a3da16593338829..49363ad802ddb9520f89b53257216bc7ddaf8ff5 100644
--- a/tensorflow/compiler/xla/literal_test.cc
+++ b/tensorflow/compiler/xla/literal_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -133,7 +133,7 @@ TEST_F(LiteralUtilTest, LiteralScalarToString) {
 
 TEST_F(LiteralUtilTest, LiteralVectorToString) {
   auto pred_vec = LiteralUtil::CreateR1<bool>({true, false, true});
-  EXPECT_EQ("{101}", pred_vec.ToString());
+  EXPECT_EQ("{1, 0, 1}", pred_vec.ToString());
 }
 
 TEST_F(LiteralUtilTest, R2ToString) {
@@ -150,12 +150,58 @@ TEST_F(LiteralUtilTest, R3ToString) {
   const auto literal =
       LiteralUtil::CreateR3({{{1}, {2}}, {{3}, {4}}, {{5}, {6}}});
   const string expected = R"(s32[3,2,1] {
-{ { 1 },
-  { 2 } },
-{ { 3 },
-  { 4 } },
-{ { 5 },
-  { 6 } }
+{
+  {1},
+  {2}
+},
+{
+  {3},
+  {4}
+},
+{
+  {5},
+  {6}
+}
+})";
+  EXPECT_EQ(expected, literal.ToString());
+}
+
+TEST_F(LiteralUtilTest, R6ToString) {
+  const auto literal =
+      LiteralUtil::CreateFromDimensions(S32, {2, 2, 1, 1, 1, 2});
+  const string expected = R"(s32[2,2,1,1,1,2] {
+{ /*i0=0*/
+{ /*i1=0*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+},
+{ /*i1=1*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+}
+},
+{ /*i0=1*/
+{ /*i1=0*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+},
+{ /*i1=1*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+}
+}
 })";
   EXPECT_EQ(expected, literal.ToString());
 }
@@ -190,12 +236,16 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) {
   EXPECT_THAT(literal.shape().dimensions(), ElementsAre(2, 3, 2));
   string result = literal.ToString();
   const string expected = R"(f32[2,3,2] {
-{ { 1, 2 },
+{
+  { 1, 2 },
   { 3, 4 },
-  { 5, 6 } },
-{ { 7, 8 },
+  { 5, 6 }
+},
+{
+  { 7, 8 },
   { 9, 10 },
-  { 11, 12 } }
+  { 11, 12 }
+}
 })";
   EXPECT_EQ(expected, result);
 }
@@ -247,18 +297,18 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
   EXPECT_THAT(literal.shape().dimensions(), ElementsAre(1, 2, 3, 2));
   string result = literal.ToString();
   const string expected = R"(f32[1,2,3,2] {
-  {  /*i0=0*/
-    {  /*i1=0*/
-      {1, 2},
-      {1001, 1002},
-      {2001, 2002}
-    },
-    {  /*i1=1*/
-      {1, 2},
-      {1001, 1002},
-      {2001, 2002}
-    }
-  }
+{ /*i0=0*/
+{ /*i1=0*/
+  { 1, 2 },
+  { 1001, 1002 },
+  { 2001, 2002 }
+},
+{ /*i1=1*/
+  { 1, 2 },
+  { 1001, 1002 },
+  { 2001, 2002 }
+}
+}
 })";
   EXPECT_EQ(expected, result);
 }
@@ -268,30 +318,30 @@ TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) {
               ElementsAre(2, 2, 3, 3));
   string result = literal_r4_2x2x3x3_dim0major_.ToString();
   const string expected = R"(f32[2,2,3,3] {
-  {  /*i0=0*/
-    {  /*i1=0*/
-      {1, 2, 3},
-      {4, 5, 6},
-      {7, 8, 9}
-    },
-    {  /*i1=1*/
-      {11, 12, 13},
-      {14, 15, 16},
-      {17, 18, 19}
-    }
-  },
-  {  /*i0=1*/
-    {  /*i1=0*/
-      {101, 102, 103},
-      {104, 105, 106},
-      {107, 108, 109}
-    },
-    {  /*i1=1*/
-      {201, 202, 203},
-      {204, 205, 206},
-      {207, 208, 209}
-    }
-  }
+{ /*i0=0*/
+{ /*i1=0*/
+  { 1, 2, 3 },
+  { 4, 5, 6 },
+  { 7, 8, 9 }
+},
+{ /*i1=1*/
+  { 11, 12, 13 },
+  { 14, 15, 16 },
+  { 17, 18, 19 }
+}
+},
+{ /*i0=1*/
+{ /*i1=0*/
+  { 101, 102, 103 },
+  { 104, 105, 106 },
+  { 107, 108, 109 }
+},
+{ /*i1=1*/
+  { 201, 202, 203 },
+  { 204, 205, 206 },
+  { 207, 208, 209 }
+}
+}
 })";
   EXPECT_EQ(expected, result);
 }
@@ -1312,11 +1362,10 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
 
 TEST_F(LiteralUtilTest, BitcastConvert) {
   auto original = LiteralUtil::CreateR1<uint32>(
-      {tensorflow::bit_cast<uint32>(2.5f),
-       tensorflow::bit_cast<uint32>(-42.25f),
-       tensorflow::bit_cast<uint32>(100.f), 0xbeef});
+      {absl::bit_cast<uint32>(2.5f), absl::bit_cast<uint32>(-42.25f),
+       absl::bit_cast<uint32>(100.f), 0xbeef});
   auto expected = LiteralUtil::CreateR1<float>(
-      {2.5f, -42.25f, 100.0f, tensorflow::bit_cast<float>(0xbeef)});
+      {2.5f, -42.25f, 100.0f, absl::bit_cast<float>(0xbeef)});
   TF_ASSERT_OK_AND_ASSIGN(Literal converted, original.BitcastConvert(F32));
 }
 
@@ -1328,13 +1377,26 @@ TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) {
       absl::StrContains(status.error_message(), "bit widths are different"));
 }
 
+// Sets the layout of the given ShapeProto to the default.
+void SetDefaultLayoutOnProto(ShapeProto* shape_proto) {
+  CHECK(ShapeUtil::IsArrayPrimitiveType(shape_proto->element_type()));
+  shape_proto->mutable_layout()->set_format(DENSE);
+  auto* minor_to_major =
+      shape_proto->mutable_layout()->mutable_minor_to_major();
+  minor_to_major->Resize(shape_proto->dimensions_size(), 0);
+  const int64 size = minor_to_major->size();
+  for (int64 i = 0; i < size; ++i) {
+    minor_to_major->Set(i, size - 1 - i);
+  }
+}
+
 TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
   LiteralProto p;
   p.mutable_shape()->set_element_type(PRED);
   for (int len = 0; len < 25; ++len) {
     p.mutable_shape()->clear_dimensions();
     p.mutable_shape()->add_dimensions(len);
-    LayoutUtil::SetToDefaultLayout(p.mutable_shape());
+    SetDefaultLayoutOnProto(p.mutable_shape());
     p.clear_preds();
     for (int i = 0; i < len; ++i) {
       p.add_preds((i % 2) == (len % 2));
@@ -1360,7 +1422,7 @@ TEST_F(LiteralUtilTest, ToProto_f16) {
   EXPECT_EQ(4, m.data<half>().size());
 
   LiteralProto p = m.ToProto();
-  EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape()));
+  EXPECT_EQ(4, ShapeUtil::ElementsIn(Shape(p.shape())));
   EXPECT_EQ(8, p.f16s().size());
   const char* d = p.f16s().data();
   EXPECT_EQ(d[0], 0);
@@ -1383,7 +1445,7 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   p.mutable_shape()->set_element_type(F16);
   p.mutable_shape()->clear_dimensions();
   p.mutable_shape()->add_dimensions(4);
-  LayoutUtil::SetToDefaultLayout(p.mutable_shape());
+  SetDefaultLayoutOnProto(p.mutable_shape());
   p.clear_f16s();
   p.set_f16s(half_vals, 8);
   TF_ASSERT_OK_AND_ASSIGN(Literal literal, Literal::CreateFromProto(p));
@@ -1395,6 +1457,28 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   EXPECT_EQ(h1, r[3]);
 }
 
+TEST_F(LiteralUtilTest, CopyFromProto_u16) {
+  uint16 u1(0xabcd);
+  uint16 u2(0x1234);
+
+  const unsigned char uint16_vals[8] = {0xcd, 0xab, 0x34, 0x12,
+                                        0x34, 0x12, 0xcd, 0xab};
+  LiteralProto p;
+  p.mutable_shape()->set_element_type(U16);
+  p.mutable_shape()->clear_dimensions();
+  p.mutable_shape()->add_dimensions(4);
+  SetDefaultLayoutOnProto(p.mutable_shape());
+  p.clear_u16s();
+  p.set_u16s(uint16_vals, 8);
+  TF_ASSERT_OK_AND_ASSIGN(Literal literal, Literal::CreateFromProto(p));
+  auto r = literal.data<uint16>();
+  ASSERT_EQ(4, r.size());
+  EXPECT_EQ(u1, r[0]);
+  EXPECT_EQ(u2, r[1]);
+  EXPECT_EQ(u2, r[2]);
+  EXPECT_EQ(u1, r[3]);
+}
+
 TEST_F(LiteralUtilTest, LiteralSliceTest) {
   auto scalar = LiteralUtil::CreateR0<float>(1.0);
   auto matrix = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
@@ -1516,9 +1600,9 @@ TEST_F(LiteralUtilTest, DecomposeTuple) {
   Literal nested_tuple = LiteralUtil::MakeTuple(
       {&tuple_elements[0], &tuple_elements[1], &nil_literal});
 
-  EXPECT_FALSE(ShapeUtil::IsNil(nested_tuple.shape()));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(nested_tuple.shape()));
   std::vector<Literal> elements = nested_tuple.DecomposeTuple();
-  EXPECT_TRUE(ShapeUtil::IsNil(nested_tuple.shape()));
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(nested_tuple.shape()));
 
   ASSERT_EQ(elements.size(), 3);
 
@@ -1569,7 +1653,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) {
   EXPECT_EQ(literal.Get<double>({1}, /*shape_index=*/{2, 1}), 44.0);
 
   for (const Literal& element : elements) {
-    EXPECT_TRUE(ShapeUtil::IsNil(element.shape()));
+    EXPECT_TRUE(ShapeUtil::IsEmptyTuple(element.shape()));
   }
 }
 
@@ -1685,7 +1769,7 @@ TEST_F(LiteralUtilTest, ProtoRoundTrip) {
 TEST_F(LiteralUtilTest, InvalidProtoNoValues) {
   // Proto contains a shape, but no values.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3});
+  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}).ToProto();
   Status status = Literal::CreateFromProto(proto).status();
   ASSERT_FALSE(status.ok());
   EXPECT_THAT(status.error_message(),
@@ -1706,7 +1790,7 @@ TEST_F(LiteralUtilTest, InvalidProtoNoShape) {
 TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) {
   // Proto contains values in wrong container.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3});
+  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}).ToProto();
   proto.add_preds(false);
   proto.add_preds(true);
   proto.add_preds(false);
@@ -1719,7 +1803,7 @@ TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) {
 TEST_F(LiteralUtilTest, InvalidProtoTooFewValues) {
   // Proto contains too few values.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {42, 2});
+  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {42, 2}).ToProto();
   proto.add_f32s(1.0);
   proto.add_f32s(2.0);
   proto.add_f32s(3.0);
@@ -1732,7 +1816,7 @@ TEST_F(LiteralUtilTest, InvalidProtoTooFewValues) {
 TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) {
   // Proto contains too many values.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(S32, {2});
+  *proto.mutable_shape() = ShapeUtil::MakeShape(S32, {2}).ToProto();
   proto.add_s32s(42);
   proto.add_s32s(-10);
   proto.add_s32s(100);
@@ -1745,8 +1829,8 @@ TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) {
 TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) {
   // Proto shape missing layout.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(PRED, {2, 2});
-  LayoutUtil::ClearLayout(proto.mutable_shape());
+  *proto.mutable_shape() = ShapeUtil::MakeShape(PRED, {2, 2}).ToProto();
+  proto.mutable_shape()->clear_layout();
   proto.add_preds(true);
   proto.add_preds(false);
   proto.add_preds(true);
@@ -1759,11 +1843,13 @@ TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) {
 TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) {
   // Proto has the too few tuple elements.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeTupleShape(
-      {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})});
+  *proto.mutable_shape() =
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})})
+          .ToProto();
   LiteralProto* element0 = proto.add_tuple_literals();
   *element0->mutable_shape() =
-      ShapeUtil::GetTupleElementShape(proto.shape(), 0);
+      ShapeUtil::GetTupleElementShape(Shape(proto.shape()), 0).ToProto();
   element0->add_preds(false);
   element0->add_preds(true);
 
@@ -1775,19 +1861,21 @@ TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) {
 TEST_F(LiteralUtilTest, InvalidProtoTooManyTupleElements) {
   // Proto has the too many tuple elements.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeTupleShape(
-      {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})});
+  *proto.mutable_shape() =
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})})
+          .ToProto();
   LiteralProto* element0 = proto.add_tuple_literals();
   *element0->mutable_shape() =
-      ShapeUtil::GetTupleElementShape(proto.shape(), 0);
+      ShapeUtil::GetTupleElementShape(Shape(proto.shape()), 0).ToProto();
   element0->add_preds(false);
   element0->add_preds(true);
   LiteralProto* element1 = proto.add_tuple_literals();
   *element1->mutable_shape() =
-      ShapeUtil::GetTupleElementShape(proto.shape(), 1);
+      ShapeUtil::GetTupleElementShape(Shape(proto.shape()), 1).ToProto();
   element1->add_f32s(42.0);
   LiteralProto* element2 = proto.add_tuple_literals();
-  *element2->mutable_shape() = ShapeUtil::MakeShape(F32, {});
+  *element2->mutable_shape() = ShapeUtil::MakeShape(F32, {}).ToProto();
   element2->add_f32s(123.0);
 
   Status status = Literal::CreateFromProto(proto).status();
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 0cb1ae35f4ad31f091063d78ed32c1463be8ee0a..bb5e5e61000d0aca6ab052ac87d2fbcd96e55f70 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/parse_flags_from_env.cc b/tensorflow/compiler/xla/parse_flags_from_env.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5b568888d14f21c1330556d017eafba6c8dd2228
--- /dev/null
+++ b/tensorflow/compiler/xla/parse_flags_from_env.cc
@@ -0,0 +1,234 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This module exports ParseFlagsFromEnvAndDieIfUnknown(), which allows other
+// modules to parse flags from an environtment variable, or a file named by the
+// environment variable.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/parse_flags_from_env.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace xla {
+
+static const char kWS[] = " \t\r\n";           // whitespace
+
+// The following struct represents an argv[]-style array, parsed
+// from data gleaned from the environment.
+//
+// As usual, an anonymous namespace is advisable to avoid
+// constructor/destructor collisions with other "private" types
+// in the same named namespace.
+namespace {
+
+// Functor which deletes objects by calling `free`.  Necessary to free strdup'ed
+// strings created by AppendToEnvArgv.
+struct FreeDeleter {
+  void operator()(char* ptr) { free(ptr); }
+};
+
+struct EnvArgv {
+  EnvArgv() : initialized(false), argc(0) {}
+  bool initialized;         // whether the other fields have been set.
+  int argc;                 // elements used in argv[]
+  std::vector<char*> argv;  // flag arguments parsed from environment string.
+  // saved values from argv[] to avoid leaks
+  std::vector<std::unique_ptr<char, FreeDeleter>> argv_save;
+};
+}  // anonymous namespace
+
+// Append the string s0[0, .., s0len-1] concatenated with s1[0, .., s1len-1] as
+// a newly allocated nul-terminated string to the array *a.  If s0==nullptr, a
+// nullptr is appended without increasing a->argc.
+static void AppendToEnvArgv(const char* s0, size_t s0len, const char* s1,
+                            size_t s1len, EnvArgv* a) {
+  if (s0 == nullptr) {
+    a->argv.push_back(nullptr);
+    a->argv_save.push_back(nullptr);
+  } else {
+    string s = string(s0, s0len) + string(s1, s1len);
+    char* str = strdup(s.c_str());
+    a->argv.push_back(str);
+    a->argv_save.emplace_back(str);
+    a->argc++;
+  }
+}
+
+// Like s.find_first_of(x, pos), but return s.size() when find_first_of() would
+// return string::npos.  This avoids if-statements elsewhere.
+static size_t FindFirstOf(const string& s, const char* x, size_t pos) {
+  size_t result = s.find_first_of(x, pos);
+  return result == string::npos ? s.size() : result;
+}
+
+// Like s.find_first_not_of(x, pos), but return s.size() when
+// find_first_not_of() would return string::npos.  This avoids if-statements
+// elsewhere.
+static size_t FindFirstNotOf(const string& s, const char* x, size_t pos) {
+  size_t result = s.find_first_not_of(x, pos);
+  return result == string::npos ? s.size() : result;
+}
+
+// Given a string containing flags, parse them into the XLA command line flags.
+// The parse is best effort, and gives up on the first syntax error.
+static void ParseArgvFromString(const string& flag_str, EnvArgv* a) {
+  size_t b = FindFirstNotOf(flag_str, kWS, 0);
+  while (b != flag_str.size() && flag_str[b] == '-') {
+    // b is the index of the start of a flag.
+    // Set e to the index just past the end of the flag.
+    size_t e = b;
+    while (e != flag_str.size() && isascii(flag_str[e]) &&
+           (strchr("-_", flag_str[e]) != nullptr || isalnum(flag_str[e]))) {
+      e++;
+    }
+    if (e != flag_str.size() && flag_str[e] == '=' &&
+        e + 1 != flag_str.size() && strchr("'\"", flag_str[e + 1]) != nullptr) {
+      // A flag of the form  --flag="something in double or single quotes"
+      int c;
+      e++;  // point just past '='
+      size_t eflag = e;
+      char quote = flag_str[e];
+      e++;  // point just past quote
+      // Put in value the string with quotes removed.
+      string value;
+      for (; e != flag_str.size() && (c = flag_str[e]) != quote; e++) {
+        if (quote == '"' && c == '\\' && e + 1 != flag_str.size()) {
+          // Handle backslash in double quoted strings.  They are literal in
+          // single-quoted strings.
+          e++;
+          c = flag_str[e];
+        }
+        value += c;
+      }
+      if (e != flag_str.size()) {  // skip final " or '
+        e++;
+      }
+      AppendToEnvArgv(flag_str.data() + b, eflag - b, value.data(),
+                      value.size(), a);
+    } else {  // A flag without a quoted value.
+      e = FindFirstOf(flag_str, kWS, e);
+      AppendToEnvArgv(flag_str.data() + b, e - b, "", 0, a);
+    }
+    b = FindFirstNotOf(flag_str, kWS, e);
+  }
+}
+
+// Call ParseArgvFromString(..., a) on a string derived from the setting of the
+// environment variable `envvar`, or a file it points to.
+static void SetArgvFromEnv(absl::string_view envvar, EnvArgv* a) {
+  if (!a->initialized) {
+    static const char kDummyArgv[] = "<argv[0]>";
+    AppendToEnvArgv(kDummyArgv, strlen(kDummyArgv), nullptr, 0,
+                    a);  // dummy argv[0]
+    const char* env = getenv(string(envvar).c_str());
+    if (env == nullptr || env[0] == '\0') {
+      // nothing
+    } else if (env[strspn(env, kWS)] == '-') {  // flags in env var value
+      ParseArgvFromString(env, a);
+    } else {  // assume it's a file name
+      FILE* fp = fopen(env, "r");
+      if (fp != nullptr) {
+        string str;
+        char buf[512];
+        int n;
+        while ((n = fread(buf, 1, sizeof(buf), fp)) > 0) {
+          str.append(buf, n);
+        }
+        fclose(fp);
+        ParseArgvFromString(str, a);
+      }
+    }
+    AppendToEnvArgv(nullptr, 0, nullptr, 0, a);  // add trailing nullptr to *a.
+    a->initialized = true;
+  }
+}
+
+// The simulated argv[] parsed from the environment, one for each different
+// environment variable we've seen.
+static std::unordered_map<string, EnvArgv>& EnvArgvs() {
+  static auto* env_argvs = new std::unordered_map<string, EnvArgv>();
+  return *env_argvs;
+}
+
+// Used to protect accesses to env_argvs.
+static tensorflow::mutex env_argv_mu(tensorflow::LINKER_INITIALIZED);
+
+bool ParseFlagsFromEnvAndDieIfUnknown(
+    absl::string_view envvar, const std::vector<tensorflow::Flag>& flag_list) {
+  tensorflow::mutex_lock lock(env_argv_mu);
+  auto* env_argv = &EnvArgvs()[string(envvar)];
+  SetArgvFromEnv(envvar, env_argv);  // a no-op if already initialized
+  bool result =
+      tensorflow::Flags::Parse(&env_argv->argc, &env_argv->argv[0], flag_list);
+
+  // There's always at least one unparsed argc, namely the fake argv[0].
+  if (result && env_argv->argc != 1) {
+    // Skip the first argv, which is the fake argv[0].
+    auto unknown_flags = absl::MakeSpan(env_argv->argv);
+    unknown_flags.remove_prefix(1);
+
+    // Some flags are set on XLA_FLAGS, others on TF_XLA_FLAGS.  If we find an
+    // unrecognized flag, suggest the alternative.
+    string alternate_envvar;
+    if (envvar == "TF_XLA_FLAGS") {
+      alternate_envvar = "XLA_FLAGS";
+    } else if (envvar == "XLA_FLAGS") {
+      alternate_envvar = "TF_XLA_FLAGS";
+    }
+    string did_you_mean;
+    if (!alternate_envvar.empty()) {
+      did_you_mean = absl::StrFormat(
+          "\nPerhaps you meant to specify these on the %s envvar?",
+          alternate_envvar);
+    }
+
+    LOG(FATAL) << "Unknown flag" << (unknown_flags.size() > 1 ? "s" : "")
+               << " in " << envvar << ": " << absl::StrJoin(unknown_flags, " ")
+               << did_you_mean;
+    return false;
+  }
+  return result;
+}
+
+// Testing only.
+//
+// Resets the env_argv struct so that subsequent calls to
+// ParseFlagsFromEnvAndDieIfUnknown() will parse the environment variable (or
+// the file it points to) anew, and set *pargc, and *pargv to point to the
+// internal locations of the argc and argv constructed from the environment.
+void ResetFlagsFromEnvForTesting(absl::string_view envvar, int** pargc,
+                                 std::vector<char*>** pargv) {
+  tensorflow::mutex_lock lock(env_argv_mu);
+  EnvArgvs().erase(string(envvar));
+  auto& env_argv = EnvArgvs()[string(envvar)];
+  *pargc = &env_argv.argc;
+  *pargv = &env_argv.argv;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/parse_flags_from_env.h b/tensorflow/compiler/xla/parse_flags_from_env.h
new file mode 100644
index 0000000000000000000000000000000000000000..76940a4299ac50138222333ff250a264cc941288
--- /dev/null
+++ b/tensorflow/compiler/xla/parse_flags_from_env.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_
+#define TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_
+
+// This module exports ParseFlagsFromEnvAndDieIfUnknown(), which allows other
+// modules to parse flags from an environtment variable, or (if the first
+// non-whitespace in the variable value is not '-'), a file named by that
+// environment variable.
+//
+// The accepted syntax is that flags arguments are of the form --flag=value or
+// (for boolean flags) --flag, and are whitespace separated.  The <value> may be
+// one of:
+//
+//  - <non-whitespace, non-nul not starting with single-quote or double-quote>
+//    in which case the effective value is the string itself
+//  - <single-quote><characters string not containing nul or
+//    single-quote><single_quote> in which case the effective value is the
+//    string with the single-quotes removed
+//  - <double-quote><character string not containing nul or unesecaped
+//    double-quote><double_quote> in which case the effective value if the
+//    string with the double-quotes removed, and escaped sequences of
+//    <backslash><char> replaced by <char>.
+//
+// Flags values inconsistent with the type of the flag will be rejected by the
+// flag parser.
+//
+// Examples:
+//
+//  - TF_XLA_FLAGS="--foo=bar  --wombat='value with a space'"
+//  - TF_XLA_FLAGS=/tmp/flagfile
+//
+// where /tmp/flagfile might contain
+//
+//  --some_flag="This is a string containing a \" and a '."
+//  --another_flag=wombats
+
+#include <vector>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace xla {
+
+// Calls tensorflow::Flags::Parse(argc, argv, flag_list) against any as yet
+// unrecognized flags passed in the environment variable `envvar`, and returns
+// its return value.
+//
+// Raises a fatal error if any flags in `envvar` were not recognized.
+bool ParseFlagsFromEnvAndDieIfUnknown(
+    absl::string_view envvar, const std::vector<tensorflow::Flag>& flag_list);
+
+// Used only for testing.  Not to be used by clients.
+void ResetFlagsFromEnvForTesting(absl::string_view envvar, int** pargc,
+                                 std::vector<char*>** pargv);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_
diff --git a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/parse_flags_from_env_test.cc
similarity index 89%
rename from tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
rename to tensorflow/compiler/xla/parse_flags_from_env_test.cc
index 138c0c852e2bb0527d171f25b4d96cedc5671516..3465552ebbf52140fb954b247d99d3c6afe7fcde 100644
--- a/tensorflow/compiler/xla/legacy_flags/parse_flags_from_env_test.cc
+++ b/tensorflow/compiler/xla/parse_flags_from_env_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // Test for parse_flags_from_env.cc
 
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
+#include "tensorflow/compiler/xla/parse_flags_from_env.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace xla {
-namespace legacy_flags {
 
 // Test that XLA flags can be set from the environment.
 // Failure messages are accompanied by the text in msg[].
@@ -38,20 +37,7 @@ static void TestParseFlagsFromEnv(const char* msg) {
   // Initialize module under test.
   int* pargc;
   std::vector<char*>* pargv;
-  ResetFlagsFromEnvForTesting(&pargc, &pargv);
-
-  // Ensure that environment variable can be parsed when
-  // no flags are expected.
-  std::vector<tensorflow::Flag> empty_flag_list;
-  bool parsed_ok = ParseFlagsFromEnv(empty_flag_list);
-  CHECK(parsed_ok) << msg;
-  const std::vector<char*>& argv_first = *pargv;
-  CHECK_NE(argv_first[0], nullptr) << msg;
-  int i = 0;
-  while (argv_first[i] != nullptr) {
-    i++;
-  }
-  CHECK_EQ(i, *pargc) << msg;
+  ResetFlagsFromEnvForTesting("TF_XLA_FLAGS", &pargc, &pargv);
 
   // Check that actual flags can be parsed.
   bool simple = false;
@@ -66,7 +52,7 @@ static void TestParseFlagsFromEnv(const char* msg) {
       tensorflow::Flag("single_quoted", &single_quoted, ""),
       tensorflow::Flag("double_quoted", &double_quoted, ""),
   };
-  parsed_ok = ParseFlagsFromEnv(flag_list);
+  bool parsed_ok = ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", flag_list);
   CHECK_EQ(*pargc, 1) << msg;
   const std::vector<char*>& argv_second = *pargv;
   CHECK_NE(argv_second[0], nullptr) << msg;
@@ -159,12 +145,11 @@ TEST(ParseFlagsFromEnv, EnvAndFlag) {
   }
 }
 
-}  // namespace legacy_flags
 }  // namespace xla
 
 int main(int argc, char* argv[]) {
   // Save name of binary so that it may invoke itself.
-  xla::legacy_flags::binary_name = argv[0];
+  xla::binary_name = argv[0];
   bool recursing = false;
   xla::int32 int_flag = 1;
   const std::vector<tensorflow::Flag> flag_list = {
@@ -173,7 +158,8 @@ int main(int argc, char* argv[]) {
       tensorflow::Flag("int_flag", &int_flag, "An integer flag to test with"),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  bool parse_ok = xla::legacy_flags::ParseFlagsFromEnv(flag_list);
+  bool parse_ok =
+      xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", flag_list);
   if (!parse_ok) {
     LOG(QFATAL) << "can't parse from environment\n" << usage;
   }
diff --git a/tensorflow/compiler/xla/protobuf_util.cc b/tensorflow/compiler/xla/protobuf_util.cc
index b507a2ef79f1d7e9ae632744675dddf574490805..ac342bf40fbc0052acbb09a346b9d062561ed06b 100644
--- a/tensorflow/compiler/xla/protobuf_util.cc
+++ b/tensorflow/compiler/xla/protobuf_util.cc
@@ -40,16 +40,6 @@ bool ProtobufEquals(const tensorflow::protobuf::Message& m1,
 
 namespace {
 
-string SanitizeFilename(const string& file_name) {
-  string safe_file_name = file_name;
-  for (char& c : safe_file_name) {
-    if (c == '/' || c == '\\') {
-      c = '_';
-    }
-  }
-  return safe_file_name;
-}
-
 std::pair<tensorflow::mutex*, std::vector<std::function<string(string)>>*>
 GetDirectoryExpanders() {
   static auto* mutex = new tensorflow::mutex;
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index f0d84646b9f01ad3ad209073f13b7b3ec21635d1..63ac1c6649210cbae9e238a74e0a45fb8ee4da63 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -3,6 +3,7 @@ licenses(["notice"])  # Apache 2.0
 package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 
 py_library(
     name = "xla_client",
@@ -50,7 +51,14 @@ cc_library(
     srcs = ["local_computation_builder.cc"],
     hdrs = ["local_computation_builder.h"],
     deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
         "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
@@ -59,8 +67,11 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
-        "//tensorflow/core:framework_lite",
+        "//tensorflow/compiler/xrt:xrt_proto",
+        "//tensorflow/compiler/xrt/cc:xrt_ops",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
@@ -72,6 +83,7 @@ tf_py_wrap_cc(
     srcs = ["xla.i"],
     swig_includes = [
         "local_computation_builder.i",
+        "//tensorflow/python:platform/base.i",
     ],
     deps = [
         ":local_computation_builder",
@@ -80,5 +92,7 @@ tf_py_wrap_cc(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:cpu_plugin",
-    ],
+    ] + if_cuda_is_configured([
+        "//tensorflow/compiler/xla/service:gpu_plugin",
+    ]),
 )
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index cd5fd330298fb0ff158e232dac121f8ffb271218..6e2ee866321a070d55a7221c7c68024ceaa93448 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -14,16 +14,42 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/python/local_computation_builder.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
 #include "absl/memory/memory.h"
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_compile_ops.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_execute_op.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_state_ops.h"
+#include "tensorflow/compiler/xrt/xrt.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace xla {
 namespace swig {
 
+// TODO(b/118641336): Factor out XRT parts into a small c++ library of their
+// own.
+
 // TODO(b/34473877) Ideally XLA would support AllReduce among arbitrary sets of
 // device handles instead of needing to set the number of replicas at XLA
 // service initialization time.
@@ -31,6 +57,12 @@ tensorflow::mutex g_local_client_mutex(tensorflow::LINKER_INITIALIZED);
 int g_replica_count GUARDED_BY(g_local_client_mutex) = 1;
 LocalClient* g_local_client GUARDED_BY(g_local_client_mutex) = nullptr;
 
+string* GetPlatformNameString() {
+  static string* platform_name_string PT_GUARDED_BY(g_local_client_mutex) =
+      new string("Host");
+  return platform_name_string;
+}
+
 Status InitializeReplicaCount(int replica_count) {
   if (replica_count < 1) {
     return InvalidArgument("Replica count must be >= 1; got %d.",
@@ -47,17 +79,33 @@ Status InitializeReplicaCount(int replica_count) {
   return Status::OK();
 }
 
+Status InitializePlatformName(const string& platform_name) {
+  string* g_platform_name = GetPlatformNameString();
+  tensorflow::mutex_lock lock(g_local_client_mutex);
+  if (g_local_client != nullptr) {
+    return FailedPrecondition(
+        "Attempted to set the platform name to %s, but a local XLA service was "
+        "previously created with a platform name of %s.",
+        platform_name, *g_platform_name);
+  }
+  TF_RETURN_IF_ERROR(PlatformUtil::GetPlatform(platform_name).status());
+  *g_platform_name = platform_name;
+  return Status::OK();
+}
+
 int GetReplicaCount() {
   tensorflow::mutex_lock lock(g_local_client_mutex);
   return g_replica_count;
 }
 
 LocalClient* GetOrCreateLocalClient() {
+  string* platform_name = GetPlatformNameString();
   tensorflow::mutex_lock lock(g_local_client_mutex);
   if (g_local_client != nullptr) {
     return g_local_client;
   }
   LocalClientOptions options;
+  options.set_platform(PlatformUtil::GetPlatform(*platform_name).ValueOrDie());
   options.set_number_of_replicas(g_replica_count);
   g_local_client = ClientLibrary::GetOrCreateLocalClient(options).ValueOrDie();
   CHECK(g_local_client != nullptr);
@@ -91,6 +139,33 @@ StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
   return client->TransferFromOutfeedLocal(shape, device_ordinal);
 }
 
+static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
+                                             int device_ordinal,
+                                             const Literal& arg) {
+  return client->LiteralToShapedBuffer(arg, device_ordinal,
+                                       client->backend().memory_allocator());
+}
+
+/* static */
+StatusOr<LocalShapedBuffer*> LocalShapedBuffer::FromLiteral(
+    const Literal& argument, const absl::optional<Shape>& shape_with_layout,
+    int replica_number) {
+  LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(int device_ordinal,
+                      client->ReplicaNumberToDeviceOrdinal(replica_number));
+  VLOG(1) << "Creating shaped buffer from literal on replica/ordinal: "
+          << replica_number << "/" << device_ordinal;
+  StatusOr<ScopedShapedBuffer> buf = [&] {
+    if (shape_with_layout) {
+      Literal relaid = argument.Relayout(shape_with_layout.value());
+      return ToBuffer(client, device_ordinal, relaid);
+    }
+    return ToBuffer(client, device_ordinal, argument);
+  }();
+  TF_RETURN_IF_ERROR(buf.status());
+  return new LocalShapedBuffer(std::move(buf).ValueOrDie());
+}
+
 LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer)
     : shaped_buffer_(std::move(shaped_buffer)) {}
 
@@ -100,11 +175,20 @@ const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const {
 
 ShapedBuffer LocalShapedBuffer::Release() { return shaped_buffer_.release(); }
 
+const Shape& LocalShapedBuffer::shape() const {
+  return shaped_buffer()->on_device_shape();
+}
+
+StatusOr<Literal> LocalShapedBuffer::ToLiteral() const {
+  LocalClient* client = GetOrCreateLocalClient();
+  return client->ShapedBufferToLiteral(*shaped_buffer());
+}
+
 LocalShapedBufferTuple::LocalShapedBufferTuple(
     std::vector<LocalShapedBuffer*> elements)
     : elements_(std::move(elements)) {
   for (auto* element : elements_) {
-    DCHECK(element != nullptr);
+    CHECK(element != nullptr);
   }
 }
 
@@ -126,157 +210,316 @@ StatusOr<LocalShapedBuffer*> LocalShapedBufferTuple::Release(int i) {
   return element;
 }
 
-int LocalShapedBufferTuple::size() const { return elements_.size(); }
+int64 LocalShapedBufferTuple::size() const { return elements_.size(); }
+
+XrtAllocation::XrtAllocation(int64 handle, Shape shape,
+                             const string& session_target)
+    : handle_(handle), shape_(shape), session_target_(session_target) {}
+
+XrtAllocation::~XrtAllocation() {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto allocation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto release =
+      tensorflow::ops::XRTReleaseAllocationHandle(root, allocation_handle);
+  if (!root.status().ok()) {
+    LOG(ERROR) << root.status();
+    return;
+  }
 
-static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
-                                             int device_ordinal,
-                                             const Literal& arg) {
-  return client->LiteralToShapedBuffer(arg, device_ordinal,
-                                       client->backend().memory_allocator());
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({allocation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  auto status = session.Run(inputs, {}, {release}, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    return;
+  }
 }
 
 /* static */
-StatusOr<LocalShapedBuffer*> LocalShapedBuffer::FromLiteral(
-    const Literal& argument, const absl::optional<Shape>& shape_with_layout) {
-  LocalClient* client = GetOrCreateLocalClient();
-  StatusOr<ScopedShapedBuffer> buf = [&] {
-    if (shape_with_layout) {
-      Literal relaid = argument.Relayout(shape_with_layout.value());
-      return ToBuffer(client, /*device_ordinal=*/0, relaid);
+StatusOr<XrtAllocation*> XrtAllocation::FromLiteral(
+    const Literal& argument, const string& session_target) {
+  xrt::XLAAllocation alloc;
+  alloc.set_device_ordinal(0);
+  *alloc.mutable_value() = argument.ToProto();
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto literal_string =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto literal_handle = tensorflow::ops::XRTAllocate(root, literal_string);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({literal_string, alloc.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {literal_handle}, &outputs));
+
+  int64 handle = outputs[0].scalar<int64>()();
+  return new XrtAllocation(handle, argument.shape(), session_target);
+}
+
+const int64 XrtAllocation::handle() const { return handle_; }
+
+const Shape& XrtAllocation::shape() const { return shape_; }
+
+StatusOr<Literal> XrtAllocation::ToLiteral() const {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto allocation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto read_literal = tensorflow::ops::XRTReadLiteral(root, allocation_handle);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({allocation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {read_literal}, &outputs));
+
+  xla::LiteralProto response;
+  TF_RET_CHECK(response.ParseFromString(outputs[0].scalar<string>()()));
+  return Literal::CreateFromProto(response);
+}
+
+XrtAllocationTuple::XrtAllocationTuple(std::vector<XrtAllocation*> elements)
+    : elements_(std::move(elements)) {
+  for (auto* element : elements_) {
+    CHECK(element != nullptr);
+  }
+}
+
+XrtAllocationTuple::~XrtAllocationTuple() {
+  for (XrtAllocation* element : elements_) {
+    if (element != nullptr) {
+      delete element;
     }
-    return ToBuffer(client, /*device_ordinal=*/0, argument);
-  }();
-  TF_RETURN_IF_ERROR(buf.status());
-  return new LocalShapedBuffer(std::move(buf).ValueOrDie());
+  }
 }
 
-StatusOr<Literal> LocalShapedBuffer::ToLiteral() const {
-  LocalClient* client = GetOrCreateLocalClient();
-  return client->ShapedBufferToLiteral(*shaped_buffer());
+StatusOr<XrtAllocation*> XrtAllocationTuple::Release(int i) {
+  XrtAllocation* element = elements_[i];
+  if (element == nullptr) {
+    return InvalidArgument("Attempted to release already-released element %d.",
+                           i);
+  }
+  elements_[i] = nullptr;
+  return element;
 }
 
+int64 XrtAllocationTuple::size() const { return elements_.size(); }
+
 CompiledLocalComputation::CompiledLocalComputation(
     std::unique_ptr<LocalExecutable> executable)
     : executable_(std::move(executable)) {}
 
-StatusOr<Literal> CompiledLocalComputation::Execute(
-    const std::vector<Literal>& arguments,
-    const std::vector<absl::optional<Shape>>& shapes_with_layout) {
+StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
+    absl::Span<LocalShapedBuffer* const> argument_handles) {
   LocalClient* client = GetOrCreateLocalClient();
+  StatusOr<int> device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(0);
+  StatusOr<ScopedShapedBuffer> result_buffer_status;
+  if (!device_ordinal_status.ok()) {
+    result_buffer_status = device_ordinal_status.status();
+  } else {
+    const int device_ordinal = device_ordinal_status.ValueOrDie();
+    VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
+            << device_ordinal;
+
+    std::vector<const ShapedBuffer*> argument_buffers;
+    argument_buffers.reserve(argument_handles.size());
+    for (auto& handle : argument_handles) {
+      argument_buffers.push_back(handle->shaped_buffer());
+    }
+
+    DeviceAssignment device_assignment =
+        client->backend()
+            .computation_placer()
+            ->AssignDevices(1, /*computation_count=*/1)
+            .ConsumeValueOrDie();
 
-  VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas.";
+    ExecutableRunOptions options;
+    options.set_device_ordinal(device_ordinal);
+    options.set_allocator(client->backend().memory_allocator());
+    options.set_intra_op_thread_pool(
+        client->backend().eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment);
 
-  // Each replica populates a StatusOr result, but only replica zero actually
-  // retrieves its literal value.
-  std::vector<StatusOr<Literal>> results(GetReplicaCount());
-  {
+    result_buffer_status = executable_->Run(argument_buffers, options);
+  }
+
+  if (!result_buffer_status.ok()) {
+    return InternalError(
+        "Failed running replica 0 (other replicas may have failed as well): "
+        "%s.",
+        result_buffer_status.status().ToString());
+  }
+  return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie());
+}
+
+StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
+    absl::Span<const std::vector<LocalShapedBuffer*>> argument_handles) {
+  LocalClient* client = GetOrCreateLocalClient();
+  const int num_replicas = GetReplicaCount();
+
+  if (argument_handles.size() != num_replicas) {
+    return InvalidArgument(
+        "Attempted to execute with %d replicas when replica count is %d",
+        argument_handles.size(), num_replicas);
+  }
+
+  VLOG(1) << "Executing with " << num_replicas << " replicas.";
+
+  // Each replica populates a StatusOr result, but only the output value of
+  // replica zero is returned.
+  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas);
+  auto execute = [this, client, num_replicas, &argument_handles,
+                  &results](int replica) {
+    StatusOr<int> device_ordinal_status =
+        client->ReplicaNumberToDeviceOrdinal(replica);
+    if (!device_ordinal_status.ok()) {
+      results[replica] = device_ordinal_status.status();
+      return;
+    }
+    const int device_ordinal = device_ordinal_status.ValueOrDie();
+    VLOG(3) << "Replica " << replica
+            << " mapped to device ordinal for execution: " << device_ordinal;
+
+    std::vector<const ShapedBuffer*> argument_buffers;
+    argument_buffers.reserve(argument_handles[replica].size());
+    for (auto& handle : argument_handles[replica]) {
+      argument_buffers.push_back(handle->shaped_buffer());
+    }
+
+    DeviceAssignment device_assignment =
+        client->backend()
+            .computation_placer()
+            ->AssignDevices(num_replicas, /*computation_count=*/1)
+            .ConsumeValueOrDie();
+
+    ExecutableRunOptions options;
+    options.set_device_ordinal(device_ordinal);
+    options.set_allocator(client->backend().memory_allocator());
+    options.set_intra_op_thread_pool(
+        client->backend().eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment);
+    StatusOr<ScopedShapedBuffer> result_buffer_status =
+        executable_->Run(argument_buffers, options);
+
+    results[replica] = std::move(result_buffer_status);
+  };
+
+  if (num_replicas == 1) {
+    // Fast-path if there is only one replica — run the computation on the
+    // current thread.
+    execute(0);
+  } else {
+    // TODO(phawkins): don't recreate the threadpool for each execution.
     tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "xlarun",
-                                        GetReplicaCount());
-
-    for (int replica = 0; replica < GetReplicaCount(); ++replica) {
-      pool.Schedule(
-          [this, client, replica, &arguments, &shapes_with_layout, &results] {
-            StatusOr<int> device_ordinal_status =
-                client->ReplicaNumberToDeviceOrdinal(replica);
-            if (!device_ordinal_status.ok()) {
-              results[replica] = device_ordinal_status.status();
-              return;
-            }
-            const int device_ordinal = device_ordinal_status.ValueOrDie();
-            VLOG(3) << "Replica " << replica
-                    << " mapped to device ordinal for execution: "
-                    << device_ordinal;
-
-            // Transfer arguments in
-            std::vector<ScopedShapedBuffer> scoped_buffers;
-            scoped_buffers.reserve(arguments.size());
-            for (int i = 0; i < arguments.size(); ++i) {
-              const Literal& argument = arguments[i];
-              const absl::optional<Shape>& shape_with_layout =
-                  shapes_with_layout[i];
-
-              StatusOr<ScopedShapedBuffer> pushed;
-              if (shape_with_layout) {
-                Literal relaid = argument.Relayout(shape_with_layout.value());
-                pushed = ToBuffer(client, device_ordinal, relaid);
-              } else {
-                pushed = ToBuffer(client, device_ordinal, argument);
-              }
-              if (!pushed.ok()) {
-                results[replica] = pushed.status();
-                return;
-              }
-
-              scoped_buffers.push_back(std::move(pushed).ValueOrDie());
-            }
-
-            // Execute
-            std::vector<const ShapedBuffer*> argument_buffers;
-            argument_buffers.reserve(scoped_buffers.size());
-            for (auto& buffer : scoped_buffers) {
-              argument_buffers.push_back(&buffer);
-            }
-
-            DeviceAssignment device_assignment =
-                client->backend()
-                    .computation_placer()
-                    ->AssignDevices(GetReplicaCount(), /*computation_count=*/1)
-                    .ConsumeValueOrDie();
-
-            ExecutableRunOptions options;
-            options.set_device_ordinal(device_ordinal);
-            options.set_allocator(client->backend().memory_allocator());
-            options.set_intra_op_thread_pool(
-                client->backend().eigen_intra_op_thread_pool_device());
-            options.set_device_assignment(&device_assignment);
-            StatusOr<ScopedShapedBuffer> result_buffer_status =
-                executable_->Run(argument_buffers, options);
-            if (!result_buffer_status.ok()) {
-              results[replica] = result_buffer_status.status();
-              return;
-            }
-
-            // Transfer result out
-            results[replica] = client->ShapedBufferToLiteral(
-                std::move(result_buffer_status).ValueOrDie());
-          });
+                                        num_replicas - 1);
+
+    for (int replica = 0; replica < num_replicas - 1; ++replica) {
+      pool.Schedule([&execute, replica] { execute(replica); });
     }
+    execute(num_replicas - 1);
   }
 
-  for (int replica = 0; replica < GetReplicaCount(); ++replica) {
-    const auto& statusor = results[replica];
+  std::vector<LocalShapedBuffer*> wrapped_results(num_replicas);
+  for (int replica = 0; replica < num_replicas; ++replica) {
+    auto& statusor = results[replica];
     if (!statusor.ok()) {
       return InternalError(
           "Failed running replica %d (other replicas may have failed as well): "
           "%s.",
           replica, statusor.status().ToString());
     }
+    wrapped_results[replica] =
+        new LocalShapedBuffer(std::move(statusor).ValueOrDie());
   }
 
-  return std::move(results[0]);
+  return new LocalShapedBufferTuple(std::move(wrapped_results));
 }
 
-LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers(
-    absl::Span<LocalShapedBuffer* const> argument_handles) {
-  LocalClient* client = GetOrCreateLocalClient();
+static StatusOr<Shape> GetReturnValueShape(const XlaComputation& computation) {
+  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
+                      computation.GetProgramShape());
+  return std::move(*program_shape.mutable_result());
+}
 
-  std::vector<const ShapedBuffer*> argument_buffers;
-  argument_buffers.reserve(argument_handles.size());
-  for (auto& handle : argument_handles) {
-    argument_buffers.push_back(handle->shaped_buffer());
+CompiledXrtComputation::CompiledXrtComputation(
+    const ProgramShape& program_shape, int64 handle,
+    const string& session_target)
+    : program_shape_(program_shape),
+      handle_(handle),
+      session_target_(session_target) {}
+
+CompiledXrtComputation::~CompiledXrtComputation() {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto computation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto release =
+      tensorflow::ops::XRTReleaseCompilationHandle(root, computation_handle);
+  if (!root.status().ok()) {
+    LOG(ERROR) << root.status();
+    return;
   }
 
-  // Execute
-  ExecutableRunOptions options;
-  options.set_allocator(client->backend().memory_allocator());
-  options.set_intra_op_thread_pool(
-      client->backend().eigen_intra_op_thread_pool_device());
-  ScopedShapedBuffer result_buffer =
-      executable_->Run(argument_buffers, options).ConsumeValueOrDie();
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({computation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  auto status = session.Run(inputs, {}, {release}, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    return;
+  }
+}
 
-  return new LocalShapedBuffer(std::move(result_buffer));
+StatusOr<XrtAllocation*> CompiledXrtComputation::Execute(
+    absl::Span<XrtAllocation* const> argument_handles) {
+  const int num_expected_arguments = program_shape().parameters().size();
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  std::vector<tensorflow::Output> arguments;
+  arguments.reserve(num_expected_arguments);
+  for (int i = 0; i < num_expected_arguments; ++i) {
+    arguments.push_back(
+        tensorflow::ops::Placeholder(root, tensorflow::DT_INT64));
+  }
+  auto computation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto execution_config =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto execute = tensorflow::ops::XRTExecute(root, computation_handle,
+                                             execution_config, arguments);
+  TF_RETURN_IF_ERROR(root.status());
+
+  TF_RET_CHECK(argument_handles.size() == arguments.size());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(false);
+  e.set_release_compilation_handle(false);
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  for (int i = 0; i < arguments.size(); ++i) {
+    inputs.insert({arguments[i], argument_handles[i]->handle()});
+  }
+  inputs.insert({computation_handle, handle()});
+  inputs.insert({execution_config, e.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {execute}, &outputs));
+
+  int64 output = outputs[0].scalar<int64>()();
+  return new XrtAllocation(output, program_shape().result(), session_target_);
 }
 
+const ProgramShape& CompiledXrtComputation::program_shape() const {
+  return program_shape_;
+}
+
+int64 CompiledXrtComputation::handle() const { return handle_; }
+
 LocalComputation::LocalComputation(XlaComputation computation)
     : computation_(std::move(computation)) {}
 
@@ -300,6 +543,37 @@ StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
   return new CompiledLocalComputation(std::move(local_executable));
 }
 
+StatusOr<CompiledXrtComputation*> LocalComputation::CompileForXrt(
+    const std::vector<Shape>& argument_shapes, const string& session_target) {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto program = tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto compile = tensorflow::ops::XRTCompile(root, program);
+  TF_RETURN_IF_ERROR(root.status());
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  ProgramShape shapes;
+  for (auto& shape : argument_shapes) {
+    *shapes.add_parameters() = shape;
+  }
+  TF_ASSIGN_OR_RETURN(*shapes.mutable_result(), GetReturnValueShape());
+  LayoutUtil::SetToDefaultLayout(&shapes);
+  *config->mutable_program_shape() = shapes.ToProto();
+  auto snapshot = computation().Snapshot().ValueOrDie();
+  *c.mutable_hlo_snapshot() = *snapshot;
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({program, c.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {compile.handle}, &outputs));
+
+  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
+                      computation().GetProgramShape());
+  int64 handle = outputs[0].scalar<int64>()();
+  return new CompiledXrtComputation(program_shape, handle, session_target);
+}
+
 const XlaComputation& LocalComputation::computation() const {
   return computation_;
 }
@@ -314,9 +588,7 @@ string LocalComputation::GetSerializedProto() const {
 }
 
 StatusOr<Shape> LocalComputation::GetReturnValueShape() const {
-  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
-                      computation_.GetProgramShape());
-  return std::move(*program_shape.mutable_result());
+  return swig::GetReturnValueShape(computation_);
 }
 
 LocalOp::LocalOp(const XlaOp& op) : op_(op) {}
@@ -343,6 +615,12 @@ LocalOp LocalComputationBuilder::Parameter(int64 parameter_number,
   return xla::Parameter(&builder_, parameter_number, shape, name);
 }
 
+StatusOr<LocalComputation*> LocalComputationBuilder::BuildWithRoot(
+    const LocalOp& root) {
+  TF_ASSIGN_OR_RETURN(XlaComputation computation, builder_.Build(root.op()));
+  return new LocalComputation(std::move(computation));
+}
+
 StatusOr<Shape> LocalComputationBuilder::GetShape(const LocalOp& operand) {
   return builder_.GetShape(operand.op());
 }
@@ -371,6 +649,12 @@ LocalOp LocalComputationBuilder::Broadcast(
   return xla::Broadcast(operand.op(), broadcast_sizes);
 }
 
+LocalOp LocalComputationBuilder::BroadcastInDim(
+    const LocalOp& operand, absl::Span<const int64> out_dim_sizes,
+    absl::Span<const int64> broadcast_dimensions) {
+  return xla::BroadcastInDim(operand.op(), out_dim_sizes, broadcast_dimensions);
+}
+
 LocalOp LocalComputationBuilder::Pad(const LocalOp& operand,
                                      const LocalOp& padding_value,
                                      const PaddingConfig& padding_config) {
@@ -532,10 +816,13 @@ LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding(
     const LocalComputation& local_computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
+    absl::Span<const int64> base_dilations,
+    absl::Span<const int64> window_dilations,
     absl::Span<const std::pair<int64, int64>> padding) {
   return xla::ReduceWindowWithGeneralPadding(
       operand.op(), init_value.op(), local_computation.computation(),
-      window_dimensions, window_strides, padding);
+      window_dimensions, window_strides, base_dilations, window_dilations,
+      padding);
 }
 
 LocalOp LocalComputationBuilder::RngNormal(const LocalOp& mu,
@@ -569,13 +856,13 @@ StatusOr<bool> LocalComputationBuilder::IsConstant(const LocalOp& operand) {
 }
 
 LocalOp LocalComputationBuilder::Sort(const LocalOp& operand, int64 dimension) {
-  return xla::Sort(operand.op(), absl::nullopt, dimension);
+  return xla::Sort(operand.op(), {}, dimension);
 }
 
 LocalOp LocalComputationBuilder::SortKeyVal(const LocalOp& keys,
                                             const LocalOp& values,
                                             int64 dimension) {
-  return xla::Sort(keys.op(), values.op(), dimension);
+  return xla::Sort(keys.op(), {values.op()}, dimension);
 }
 
 StatusOr<LocalComputation*> LocalComputationBuilder::BuildConstantSubGraph(
@@ -674,23 +961,29 @@ void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer) {
   delete local_shaped_buffer;
 }
 
+void DeleteXrtAllocation(XrtAllocation* allocation) { delete allocation; }
+
 void DeleteCompiledLocalComputation(CompiledLocalComputation* computation) {
   delete computation;
 }
 
+void DeleteCompiledXrtComputation(CompiledXrtComputation* computation) {
+  delete computation;
+}
+
 void DeleteLocalComputation(LocalComputation* computation) {
   delete computation;
 }
 
 StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
     LocalShapedBuffer* local_shaped_buffer) {
-  if (!ShapeUtil::IsTuple(
-          local_shaped_buffer->shaped_buffer()->on_device_shape())) {
+  const Shape tuple_shape = local_shaped_buffer->shape();
+
+  if (!ShapeUtil::IsTuple(tuple_shape)) {
     return InvalidArgument(
         "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
         "shape; shape: %s",
-        ShapeUtil::HumanString(
-            local_shaped_buffer->shaped_buffer()->on_device_shape()));
+        ShapeUtil::HumanString(tuple_shape));
   }
 
   DeviceMemoryAllocator* allocator =
@@ -702,7 +995,6 @@ StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
   int device_ordinal = tuple_buffer.device_ordinal();
 
   ShapeTree<se::DeviceMemoryBase>& shape_tree = tuple_buffer.buffers();
-  const Shape& tuple_shape = tuple_buffer.on_device_shape();
   std::vector<LocalShapedBuffer*> results;
   for (int64 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
     // Create a shaped buffer for this destructured tuple element.
@@ -730,5 +1022,47 @@ StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
   return new LocalShapedBufferTuple(std::move(results));
 }
 
+StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
+    XrtAllocation* allocation, const string& session_target) {
+  const Shape& tuple_shape = allocation->shape();
+
+  if (!ShapeUtil::IsTuple(tuple_shape)) {
+    return InvalidArgument(
+        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
+        "shape; shape: %s",
+        ShapeUtil::HumanString(tuple_shape));
+  }
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto base_handle = tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto shape_index = tensorflow::ops::Placeholder(root, tensorflow::DT_INT32);
+  auto subtuple = tensorflow::ops::XRTSubTuple(root, base_handle, shape_index);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  std::vector<XrtAllocation*> results;
+  for (int32 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
+    inputs.clear();
+    inputs.insert({base_handle, allocation->handle()});
+    inputs.insert({shape_index, {i}});
+    std::vector<tensorflow::Tensor> outputs;
+    auto status = session.Run(inputs, {subtuple}, &outputs);
+    if (!status.ok()) {
+      // Clean up before returning non-ok status.
+      for (int j = 0; j < results.size(); ++j) {
+        delete results[j];
+      }
+      return status;
+    }
+    const int64 subtuple_handle = outputs[0].scalar<int64>()();
+    const Shape& subtuple_shape =
+        ShapeUtil::GetTupleElementShape(tuple_shape, i);
+    results.push_back(
+        new XrtAllocation(subtuple_handle, subtuple_shape, session_target));
+  }
+  return new XrtAllocationTuple(std::move(results));
+}
+
 }  // namespace swig
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 2166bb6721ca380f3180a8802e4922f2e9e45945..149e44570df5c6a3df88bbe2ffa779be47842d82 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -16,7 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_
 #define TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_
 
+#include <string>
+#include <vector>
+
 #include "absl/types/span.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -34,6 +39,12 @@ namespace swig {
 // returned.
 Status InitializeReplicaCount(int replica_count);
 
+// Initializes the platform name that XLA will be initialized with (when
+// first obtaining a handle to the local XLA service). If this is called after
+// the handle to the local XLA service has been established, then an error is
+// returned.
+Status InitializePlatformName(const string& platform_name);
+
 // Returns the replica count that is currently set, regardless of whether the
 // local XLA service has been instantiated yet or not.
 int GetReplicaCount();
@@ -54,18 +65,19 @@ Status TransferToInfeedLocalReplica(const Literal& literal, int replica_number);
 StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
                                                   int replica_number);
 
-// Wraps a ScopedShapedBuffer produced by copying a literal "to
-// device," i.e. copying a literal to a scoped buffer via the local
-// client.
+// Represents a reference to literals that live in a device-allocated buffer via
+// XLA. Specifically, wraps a ScopedShapedBuffer produced by transferring a
+// literal to device via the local client.
 class LocalShapedBuffer {
  public:
   static StatusOr<LocalShapedBuffer*> FromLiteral(
-      const Literal& argument, const absl::optional<Shape>& shape_with_layout);
+      const Literal& argument, const absl::optional<Shape>& shape_with_layout,
+      int replica_number);
 
   LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
-  const ScopedShapedBuffer* shaped_buffer() const;
-
   StatusOr<Literal> ToLiteral() const;
+  const Shape& shape() const;
+  const ScopedShapedBuffer* shaped_buffer() const;
 
   // Transfers ownership of the encapsulated ShapedBuffer to the caller,
   // analogous to std::unique_ptr::release().
@@ -92,7 +104,7 @@ class LocalShapedBufferTuple {
   StatusOr<LocalShapedBuffer*> Release(int i);
 
   // Returns the number of elements in the destructured tuple.
-  int size() const;
+  int64 size() const;
 
  private:
   std::vector<LocalShapedBuffer*> elements_;
@@ -103,31 +115,99 @@ class LocalShapedBufferTuple {
 StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
     LocalShapedBuffer* local_shaped_buffer);
 
-// Wraps a LocalExecutable produced by compiling a
-// LocalComputation. The Execute method forwards to that of the
-// underlying LocalExecutable, and additionally handles tranferring
-// arguments and return values in and back out of the client library's
-// local client. This class is intended to be made available to Python
-// via SWIG.
+// Represents a reference to literals that live in a device-allocated buffer via
+// XRT. Specifically, wraps an int64 handle produced by running the allocation
+// graph, and an XLA shape to track the referent's shape.
+class XrtAllocation {
+ public:
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which allocation and deallocation
+  // graphs are run.
+  static StatusOr<XrtAllocation*> FromLiteral(const Literal& argument,
+                                              const string& session_target);
+
+  XrtAllocation(int64 handle, Shape shape, const string& session_target);
+  ~XrtAllocation();
+  StatusOr<Literal> ToLiteral() const;
+  const Shape& shape() const;
+  const int64 handle() const;
+
+ private:
+  const int64 handle_;
+  const Shape shape_;
+  const string session_target_;
+};
+
+// Result of a tuple destructuring operation on an XrtAllocation.
+class XrtAllocationTuple {
+ public:
+  // Note: any XrtAllocation elements that are not Release()'d will be
+  // deallocated in the destructor.
+  explicit XrtAllocationTuple(std::vector<XrtAllocation*> elements);
+
+  ~XrtAllocationTuple();
+
+  // Releases the ith element to the caller. Further attempts to release the ith
+  // element will return an invalid argument error.
+  StatusOr<XrtAllocation*> Release(int i);
+
+  // Returns the number of elements in the destructured tuple.
+  int64 size() const;
+
+ private:
+  std::vector<XrtAllocation*> elements_;
+};
+
+// Destructures a tuple-valued XrtAllocation into its constitutent elements
+// in XrtAllocationTuple form.
+//
+// Accepts a `session_target` argument, used in constructing the
+// `tensorflow::ClientSession` instance in which the sub-tupling graph is run,
+// and passed along in constructing each constituent XrtAllocation.
+StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
+    XrtAllocation* allocation, const string& session_target);
+
+// Represents a compiled computation that can be executed given handles to
+// device-allocated literals. Specifically, wraps an XLA LocalExecutable.
 class CompiledLocalComputation {
  public:
   CompiledLocalComputation(std::unique_ptr<LocalExecutable> executable);
 
-  // Execute the computation with the given argument literals, and
-  // with optionally-specified argument layouts. The literals will be
-  // re-laid out according to the corresponding elements of
-  // shapes_with_layout.
-  StatusOr<Literal> Execute(
-      const std::vector<Literal>& arguments,
-      const std::vector<absl::optional<Shape> >& shapes_with_layout);
-
-  LocalShapedBuffer* ExecuteWithShapedBuffers(
+  StatusOr<LocalShapedBuffer*> Execute(
       absl::Span<LocalShapedBuffer* const> argument_handles);
 
+  // Execute on many replicas. Takes a sequence of argument lists (one argument
+  // list per replica) and returns a tuple of results (one result per replica).
+  // The number of argument lists must be equal to the replica count.
+  StatusOr<LocalShapedBufferTuple*> ExecutePerReplica(
+      absl::Span<const std::vector<LocalShapedBuffer*> > argument_handles);
+
  private:
   std::unique_ptr<LocalExecutable> executable_;
 };
 
+// Represents a compiled computation that can be executed given handles to
+// device-allocated literals. Specifically, wraps an XRT computation handle.
+class CompiledXrtComputation {
+ public:
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which the execution graph is run.
+  CompiledXrtComputation(const ProgramShape& program_shape, int64 handle,
+                         const string& session_target);
+  ~CompiledXrtComputation();
+
+  StatusOr<XrtAllocation*> Execute(
+      absl::Span<XrtAllocation* const> argument_handles);
+
+  const ProgramShape& program_shape() const;
+  int64 handle() const;
+
+ private:
+  const ProgramShape program_shape_;
+  const int64 handle_;
+  const string session_target_;
+};
+
 // Wraps a XlaComputation produced by a LocalComputationBuilder. The
 // Compile method compiles the computation to a (local) executable via
 // the client library's local client. This class is intended to be
@@ -140,6 +220,11 @@ class LocalComputation {
       const std::vector<Shape>& argument_shapes,
       const ExecutableBuildOptions* build_options);
 
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which the compilation graph is run.
+  StatusOr<CompiledXrtComputation*> CompileForXrt(
+      const std::vector<Shape>& argument_shapes, const string& session_target);
+
   const XlaComputation& computation() const;
 
   // Returns the HloModuleProto contained in the XlaComputation in the
@@ -183,6 +268,9 @@ class LocalComputationBuilder {
   // Returns an owned LocalComputation to the caller on success.
   StatusOr<LocalComputation*> Build();
 
+  // Returns an owned LocalComputation to the caller on success with given root.
+  StatusOr<LocalComputation*> BuildWithRoot(const LocalOp& root);
+
   LocalOp Parameter(int64 parameter_number, const Shape& shape,
                     const string& name);
 
@@ -201,6 +289,10 @@ class LocalComputationBuilder {
   LocalOp Broadcast(const LocalOp& operand,
                     absl::Span<const int64> broadcast_sizes);
 
+  LocalOp BroadcastInDim(const LocalOp& operand,
+                         absl::Span<const int64> out_dim_sizes,
+                         absl::Span<const int64> broadcast_dimensions);
+
   LocalOp Pad(const LocalOp& operand, const LocalOp& padding_value,
               const PaddingConfig& padding_config);
 
@@ -278,6 +370,8 @@ class LocalComputationBuilder {
       const LocalComputation& local_computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
+      absl::Span<const int64> base_dilations,
+      absl::Span<const int64> window_dilations,
       absl::Span<const std::pair<int64, int64> > padding);
 
   LocalOp RngNormal(const LocalOp& mu, const LocalOp& sigma,
@@ -389,7 +483,9 @@ class LocalComputationBuilder {
 
 // Functions for freeing resources from the Python side.
 void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer);
+void DeleteXrtAllocation(XrtAllocation* allocation);
 void DeleteCompiledLocalComputation(CompiledLocalComputation* computation);
+void DeleteCompiledXrtComputation(CompiledXrtComputation* computation);
 void DeleteLocalComputation(LocalComputation* computation);
 
 }  // namespace swig
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 521490e76c138553c5cc6895412eadb35a939881..d23d693c1e5bde43b52959e4397aa311268411bb 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -176,6 +176,81 @@ bool HandleStringAttribute(PyObject* o,
 tensorflow::ImportNumpy();
 %}
 
+// Basic types
+
+%typemap(out) StatusOr<bool> {
+  if ($1.ok()) {
+    $result = PyBool_FromLong($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) Status {
+  if (!$1.ok()) {
+    PyErr_SetString(
+        PyExc_RuntimeError, $1.ToString().c_str());
+    SWIG_fail;
+  }
+  Py_INCREF(Py_None);
+  $result = Py_None;
+}
+
+%typemap(in) absl::Span<const int64>
+    (std::vector<int64> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.resize(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    PyObject* py_int = numpy::PyNumberToPyInt(o);
+    if (!py_int) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Argument sequence element cannot be converted to int");
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
+    if (temps[i] == -1 && PyErr_Occurred()) {
+      Py_DECREF(py_int);
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    Py_DECREF(py_int);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// Computation builder types
+
+%typemap(in) absl::Span<const xla::swig::LocalOp>(
+      std::vector<LocalOp> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    LocalOp* op;
+    if ((SWIG_ConvertPtr(o, (void**)&op, $descriptor(xla::swig::LocalOp*),
+                         SWIG_POINTER_EXCEPTION)) == -1) {
+      SWIG_fail;
+    }
+    temps.push_back(*op);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// Computation and buffer/allocation types
+
 %typemap(out) StatusOr<xla::swig::CompiledLocalComputation*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
@@ -189,12 +264,12 @@ tensorflow::ImportNumpy();
   }
 }
 
-%typemap(out) StatusOr<xla::swig::LocalShapedBuffer*> {
+%typemap(out) StatusOr<xla::swig::CompiledXrtComputation*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
     {
       auto* $1 = value;
-      $typemap(out, xla::swig::LocalShapedBuffer*)
+      $typemap(out, xla::swig::CompiledXrtComputation*)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -202,12 +277,12 @@ tensorflow::ImportNumpy();
   }
 }
 
-%typemap(out) StatusOr<xla::swig::LocalShapedBufferTuple*> {
+%typemap(out) StatusOr<xla::swig::LocalShapedBuffer*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
     {
       auto* $1 = value;
-      $typemap(out, xla::swig::LocalShapedBufferTuple*)
+      $typemap(out, xla::swig::LocalShapedBuffer*)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -215,23 +290,25 @@ tensorflow::ImportNumpy();
   }
 }
 
-
-%typemap(out) StatusOr<Literal> {
+%typemap(out) StatusOr<xla::swig::LocalShapedBufferTuple*> {
   if ($1.ok()) {
-    Literal value = $1.ConsumeValueOrDie();
-    $result = numpy::PyObjectFromXlaLiteral(*value);
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::LocalShapedBufferTuple*)
+    }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
     SWIG_fail;
   }
 }
 
-%typemap(out) StatusOr<xla::swig::LocalComputation*> {
+%typemap(out) StatusOr<xla::swig::XrtAllocation*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
     {
       auto* $1 = value;
-      $typemap(out, xla::swig::LocalComputation*)
+      $typemap(out, xla::swig::XrtAllocation*)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -239,92 +316,86 @@ tensorflow::ImportNumpy();
   }
 }
 
-%typemap(out) StatusOr<Shape> {
+%typemap(out) StatusOr<xla::swig::XrtAllocationTuple*> {
   if ($1.ok()) {
-    $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtAllocationTuple*)
+    }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
     SWIG_fail;
   }
 }
 
-%typemap(out) StatusOr<bool> {
+%typemap(out) StatusOr<xla::swig::LocalComputation*> {
   if ($1.ok()) {
-    $result = PyBool_FromLong($1.ConsumeValueOrDie());
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::LocalComputation*)
+    }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
     SWIG_fail;
   }
 }
 
-%typemap(out) Status {
-  if (!$1.ok()) {
-    PyErr_SetString(
-        PyExc_RuntimeError, $1.ToString().c_str());
-    SWIG_fail;
-  }
-  Py_INCREF(Py_None);
-  $result = Py_None;
-}
-
-// Span<int64>
-
-%typemap(in) absl::Span<const int64>
-    (std::vector<int64> temps) {
+%typemap(in) absl::Span<xla::swig::LocalShapedBuffer* const>
+    (std::vector<LocalShapedBuffer*> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
     SWIG_fail;
   }
   const int size = PySequence_Size($input);
-  temps.resize(size);
+  temps.reserve(size);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
-    PyObject* py_int = numpy::PyNumberToPyInt(o);
-    if (!py_int) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "Argument sequence element cannot be converted to int");
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
-    if (temps[i] == -1 && PyErr_Occurred()) {
-      Py_DECREF(py_int);
-      Py_DECREF(o);
+    LocalShapedBuffer* lsbp;
+    if ((SWIG_ConvertPtr(o, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
+                         SWIG_POINTER_EXCEPTION)) == -1) {
       SWIG_fail;
     }
-    Py_DECREF(py_int);
+    temps.push_back(lsbp);
     Py_DECREF(o);
   }
   $1 = temps;
 }
 
-// Span<LocalOp>
-
-%typemap(in) absl::Span<const xla::swig::LocalOp>(
-      std::vector<LocalOp> temps) {
+%typemap(in) absl::Span<const std::vector<xla::swig::LocalShapedBuffer*> >
+    (std::vector<std::vector<LocalShapedBuffer*> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
     SWIG_fail;
   }
   const int size = PySequence_Size($input);
+  temps.reserve(size);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
-    LocalOp* op;
-    if ((SWIG_ConvertPtr(o, (void**)&op, $descriptor(xla::swig::LocalOp*),
-                         SWIG_POINTER_EXCEPTION)) == -1) {
-      SWIG_fail;
+    std::vector<LocalShapedBuffer*> vec;
+    const int vec_size = PySequence_Size(o);
+    vec.reserve(vec_size);
+    for (int j = 0; j < vec_size; ++j) {
+      PyObject* vec_elt = PySequence_GetItem(o, j);
+      LocalShapedBuffer* lsbp;
+      if ((SWIG_ConvertPtr(vec_elt, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
+                           SWIG_POINTER_EXCEPTION)) == -1) {
+        Py_DECREF(vec_elt);
+        Py_DECREF(o);
+        SWIG_fail;
+      }
+      vec.push_back(lsbp);
+      Py_DECREF(vec_elt);
     }
-    temps.push_back(*op);
+    temps.push_back(vec);
     Py_DECREF(o);
   }
   $1 = temps;
 }
 
-// LocalShapedBuffer*
-
-%typemap(in) absl::Span<xla::swig::LocalShapedBuffer* const>
-    (std::vector<LocalShapedBuffer*> temps) {
+%typemap(in) absl::Span<xla::swig::XrtAllocation* const>
+    (std::vector<XrtAllocation*> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
     SWIG_fail;
@@ -333,12 +404,12 @@ tensorflow::ImportNumpy();
   temps.reserve(size);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
-    LocalShapedBuffer* lsbp;
-    if ((SWIG_ConvertPtr(o, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
+    XrtAllocation* xrta;
+    if ((SWIG_ConvertPtr(o, (void**) &xrta, $descriptor(xla::swig::XrtAllocation*),
                          SWIG_POINTER_EXCEPTION)) == -1) {
       SWIG_fail;
     }
-    temps.push_back(lsbp);
+    temps.push_back(xrta);
     Py_DECREF(o);
   }
   $1 = temps;
@@ -346,6 +417,16 @@ tensorflow::ImportNumpy();
 
 // Literal
 
+%typemap(out) StatusOr<Literal> {
+  if ($1.ok()) {
+    Literal value = $1.ConsumeValueOrDie();
+    $result = numpy::PyObjectFromXlaLiteral(*value);
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
 %typemap(in) const Literal& (StatusOr<Literal> literal_status) {
   literal_status = numpy::XlaLiteralFromPyObject($input);
   if (!literal_status.ok()) {
@@ -401,6 +482,19 @@ tensorflow::ImportNumpy();
 
 // Shape
 
+%typemap(out) const Shape& {
+  $result = numpy::PyShapeInfoFromXlaShape(*$1);
+}
+
+%typemap(out) StatusOr<Shape> {
+  if ($1.ok()) {
+    $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
 %typemap(in) const Shape& (Shape temp) {
   StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
   if (!statusor.ok()) {
@@ -858,22 +952,22 @@ tensorflow::ImportNumpy();
     $1 = NULL;
   } else {
     if (!HandleStringAttribute($input, "generate_hlo_graph", [&](string s) {
-      build_options.set_generate_hlo_graph(std::move(s));
+      build_options.mutable_debug_options()->set_xla_generate_hlo_graph(std::move(s));
     })) {
       return nullptr;
     }
     if (!HandleStringAttribute($input, "dump_optimized_hlo_proto_to", [&](string s) {
-      build_options.set_dump_optimized_hlo_proto_to(std::move(s));
+      build_options.mutable_debug_options()->set_xla_dump_optimized_hlo_proto_to(std::move(s));
     })) {
       return nullptr;
     }
     if (!HandleStringAttribute($input, "dump_unoptimized_hlo_proto_to", [&](string s) {
-      build_options.set_dump_unoptimized_hlo_proto_to(std::move(s));
+      build_options.mutable_debug_options()->set_xla_dump_unoptimized_hlo_proto_to(std::move(s));
     })) {
       return nullptr;
     }
     if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) {
-      build_options.set_dump_per_pass_hlo_proto_to(std::move(s));
+      build_options.mutable_debug_options()->set_xla_dump_per_pass_hlo_proto_to(std::move(s));
     })) {
       return nullptr;
     }
@@ -887,7 +981,7 @@ tensorflow::ImportNumpy();
         PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.hlo_profile must be a bool or None.");
         SWIG_fail;
       }
-      build_options.set_hlo_profile(o == Py_True);
+      build_options.mutable_debug_options()->set_xla_hlo_profile(o == Py_True);
     }
     Py_DECREF(o);
 
@@ -914,6 +1008,7 @@ tensorflow::ImportNumpy();
 %unignore xla;
 %unignore xla::swig;
 %unignore xla::swig::InitializeReplicaCount;
+%unignore xla::swig::InitializePlatformName;
 %unignore xla::swig::GetReplicaCount;
 %unignore xla::swig::TransferToInfeedLocal;
 %unignore xla::swig::TransferToInfeedLocalReplica;
@@ -921,20 +1016,32 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalShapedBuffer;
 %unignore xla::swig::LocalShapedBuffer::FromLiteral;
 %unignore xla::swig::LocalShapedBuffer::ToLiteral;
+%unignore xla::swig::LocalShapedBuffer::shape;
 %unignore xla::swig::LocalShapedBufferTuple;
 %unignore xla::swig::LocalShapedBufferTuple::Release;
 %unignore xla::swig::LocalShapedBufferTuple::size;
+%unignore xla::swig::XrtAllocation;
+%unignore xla::swig::XrtAllocation::FromLiteral;
+%unignore xla::swig::XrtAllocation::ToLiteral;
+%unignore xla::swig::XrtAllocation::shape;
+%unignore xla::swig::XrtAllocationTuple;
+%unignore xla::swig::XrtAllocationTuple::Release;
+%unignore xla::swig::XrtAllocationTuple::size;
 %unignore xla::swig::CompiledLocalComputation;
 %unignore xla::swig::CompiledLocalComputation::Execute;
-%unignore xla::swig::CompiledLocalComputation::ExecuteWithShapedBuffers;
+%unignore xla::swig::CompiledLocalComputation::ExecutePerReplica;
+%unignore xla::swig::CompiledXrtComputation;
+%unignore xla::swig::CompiledXrtComputation::Execute;
 %unignore xla::swig::LocalComputation;
 %unignore xla::swig::LocalComputation::Compile;
+%unignore xla::swig::LocalComputation::CompileForXrt;
 %unignore xla::swig::LocalComputation::GetReturnValueShape;
 %unignore xla::swig::LocalComputation::GetSerializedProto;
 %unignore xla::swig::LocalOp;
 %unignore xla::swig::LocalComputationBuilder;
 %unignore xla::swig::LocalComputationBuilder::LocalComputationBuilder;
 %unignore xla::swig::LocalComputationBuilder::Build;
+%unignore xla::swig::LocalComputationBuilder::BuildWithRoot;
 %unignore xla::swig::LocalComputationBuilder::SetOpMetadata;
 %unignore xla::swig::LocalComputationBuilder::ClearOpMetadata;
 %unignore xla::swig::LocalComputationBuilder::Parameter;
@@ -945,6 +1052,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::ConstantLiteral;
 %unignore xla::swig::LocalComputationBuilder::ConstantR0;
 %unignore xla::swig::LocalComputationBuilder::Broadcast;
+%unignore xla::swig::LocalComputationBuilder::BroadcastInDim;
 %unignore xla::swig::LocalComputationBuilder::Pad;
 %unignore xla::swig::LocalComputationBuilder::Reshape;
 %unignore xla::swig::LocalComputationBuilder::Collapse;
@@ -1036,10 +1144,13 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Imag;
 %unignore xla::swig::LocalComputationBuilder::Conj;
 %unignore xla::swig::LocalComputationBuilder::Complex;
+%unignore xla::swig::DeleteLocalComputation;
 %unignore xla::swig::DestructureLocalShapedBufferTuple;
+%unignore xla::swig::DestructureXrtAllocationTuple;
 %unignore xla::swig::DeleteLocalShapedBuffer;
-%unignore xla::swig::DeleteLocalComputation;
+%unignore xla::swig::DeleteXrtAllocation;
 %unignore xla::swig::DeleteCompiledLocalComputation;
+%unignore xla::swig::DeleteCompiledXrtComputation;
 
 %thread;
 %include "tensorflow/compiler/xla/python/local_computation_builder.h"
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index bb303c5678a2cac9a9e78925e857ab25c0c6d9be..c91a2aaf56dfe2127168628c78e0c4b868a28055 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -26,6 +26,9 @@ import os
 
 import numpy as np
 
+import six
+from six.moves import xrange
+
 from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.compiler.xla.python import pywrap_xla as c_api
 from tensorflow.compiler.xla.service import hlo_pb2
@@ -46,6 +49,15 @@ _OP_METADATA_FIELDS = [
 OpMetadata = collections.namedtuple('OpMetadata', _OP_METADATA_FIELDS)
 
 
+class BackendType(enum.Enum):
+  XLA_LOCAL = 1
+  XRT = 2
+
+
+BackendSpec = collections.namedtuple('Backend', ('backend_type', 'target'))
+XLA_LOCAL_BACKEND = BackendSpec(BackendType.XLA_LOCAL, 'local')
+
+
 def OpMetadataToProto(pyobj):
   proto = xla_data_pb2.OpMetadata()
   for field in _OP_METADATA_FIELDS:
@@ -66,6 +78,13 @@ def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1):
       source_line=lineno)
 
 
+def _maybe_encode_string(s):
+  if six.PY3:
+    return s.encode('utf-8')
+  else:
+    return s
+
+
 class PaddingType(enum.Enum):
   VALID = 1
   SAME = 2
@@ -73,16 +92,32 @@ class PaddingType(enum.Enum):
 
 def _convert_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims,
                                         window_strides):
-  """Maps PaddingType (VALID or SAME) to pad values (list of pairs of ints)."""
+  """Maps PaddingType or string to pad values (list of pairs of ints)."""
+  if not isinstance(padding_type, (str, PaddingType)):
+    msg = 'padding_type must be str or PaddingType, got {}.'
+    raise TypeError(msg.format(type(padding_type)))
+
+  if isinstance(padding_type, str):
+    if padding_type.upper() == 'VALID':
+      padding_type = PaddingType.VALID
+    elif padding_type.upper() == 'SAME':
+      padding_type = PaddingType.SAME
+    else:
+      msg = 'Unknown padding type string: expected "VALID" or "SAME", got {}.'
+      raise ValueError(msg.format(padding_type))
+
   if padding_type == PaddingType.VALID:
     return [(0, 0)] * len(window_strides)
-
-  out_shape = np.ceil(np.true_divide(lhs_dims, window_strides)).astype(int)
-  pad_sizes = [max((out_size - 1) * stride + filter_size - in_size, 0)
-               for out_size, stride, filter_size, in_size
-               in zip(out_shape, window_strides, rhs_dims, lhs_dims)]
-  return [(pad_size // 2, pad_size - pad_size // 2)
-          for pad_size in pad_sizes]
+  elif padding_type == PaddingType.SAME:
+    out_shape = np.ceil(np.true_divide(lhs_dims, window_strides)).astype(int)
+    pad_sizes = [max((out_size - 1) * stride + filter_size - in_size, 0)
+                 for out_size, stride, filter_size, in_size
+                 in zip(out_shape, window_strides, rhs_dims, lhs_dims)]
+    return [(pad_size // 2, pad_size - pad_size // 2)
+            for pad_size in pad_sizes]
+  else:
+    msg = 'Unexpected PaddingType value: {}'
+    raise ValueError(msg.format(padding_type))
 
 
 _UNARY_OPS = [
@@ -187,38 +222,66 @@ class LocalBuffer(object):
   means the referent is in device memory.
   """
 
-  def __init__(self, c_local_shaped_buffer):
-    self.c_local_shaped_buffer = c_local_shaped_buffer
-    self._delete = c_api.DeleteLocalShapedBuffer
+  def __init__(self, c_buffer, backend, replica):
+    self.c_buffer = c_buffer
+    self._backend = backend
+    self._replica = replica
+    if backend.backend_type == BackendType.XRT:
+      self._delete = c_api.DeleteXrtAllocation
+    else:
+      self._delete = c_api.DeleteLocalShapedBuffer
 
   @staticmethod
-  def from_pyval(pyval, layout_fn=None):
+  def from_pyval(pyval, replica=0, backend=XLA_LOCAL_BACKEND):
+    """Allocate and copy to XLA the given python value."""
     pyval = require_numpy_array_layout(pyval)
-    if layout_fn:
-      shape = Shape.from_pyval(pyval)
-      shape = shape.map_leaves(layout_fn)
+    num_replicas = get_replica_count()
+    if not 0 <= replica < num_replicas:
+      raise ValueError(
+          'Attempt to place buffer on replica {} when the replica count is {}'
+          .format(replica, num_replicas))
+    if backend.backend_type == BackendType.XRT:
+      if replica != 0:
+        raise NotImplementedError(
+            'Multi-replica execution is not yet supported via the XRT backend.')
+      cbuf = c_api.XrtAllocation.FromLiteral(
+          pyval, _maybe_encode_string(backend.target))
     else:
-      shape = None
-    return LocalBuffer(c_api.LocalShapedBuffer.FromLiteral(pyval, shape))
+      cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None, replica)
+    return LocalBuffer(cbuf, backend, replica)
 
   def to_py(self):
-    return self.c_local_shaped_buffer.ToLiteral()
+    return self.c_buffer.ToLiteral()
+
+  def shape(self):
+    return _wrap_shape(self.c_buffer.shape())
+
+  def replica(self):
+    return self._replica
 
   def delete(self):
-    if self.c_local_shaped_buffer is not None:
-      self._delete(self.c_local_shaped_buffer)
-      self.c_local_shaped_buffer = None
+    if self.c_buffer is not None:
+      self._delete(self.c_buffer)
+      self.c_buffer = None
 
   def destructure(self):
-    assert self.c_local_shaped_buffer is not None
-    result = c_api.DestructureLocalShapedBufferTuple(self.c_local_shaped_buffer)
-    self.c_local_shaped_buffer = None
+    """Assuming a tuple buffer, unpack it into constituent tuple elements."""
+    assert self.c_buffer is not None
+    if self._backend.backend_type == BackendType.XRT:
+      result = c_api.DestructureXrtAllocationTuple(
+          self.c_buffer, _maybe_encode_string(self._backend.target))
+    else:
+      result = c_api.DestructureLocalShapedBufferTuple(self.c_buffer)
+    self.delete()
     size = result.size()
-    destructured = tuple(LocalBuffer(result.Release(i)) for i in xrange(size))
+    destructured = tuple(
+        LocalBuffer(
+            result.Release(i), replica=self._replica, backend=self._backend)
+        for i in xrange(size))
     return destructured
 
   def is_deleted(self):
-    return self.c_local_shaped_buffer is None
+    return self.c_buffer is None
 
   def __del__(self):
     self.delete()
@@ -283,6 +346,9 @@ class Shape(object):
   def __ne__(self, other):
     return not self == other
 
+  def __hash__(self):
+    return hash((self._dtype, self._dimensions, self._minor_to_major))
+
   def __repr__(self):
     return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, '
             '_is_tuple={!r}, _minor_to_major={!r})').format(
@@ -436,26 +502,37 @@ class LocalComputation(object):
   ComputationBuilder methods.
   """
 
-  def __init__(self, c_local_computation, is_compiled):
-    self.c_local_computation = c_local_computation
-    self.is_compiled = is_compiled
+  def __init__(self, c_computation, is_compiled, backend=XLA_LOCAL_BACKEND):
+    self._c_computation = c_computation
+    self._backend = backend
+    self._is_compiled = is_compiled
 
     # Ensure a reference to C-based destructor for use in __del__.
     if is_compiled:
-      assert isinstance(c_local_computation, c_api.CompiledLocalComputation)
-      self._delete = c_api.DeleteCompiledLocalComputation
+      if backend.backend_type == BackendType.XRT:
+        assert isinstance(c_computation, c_api.CompiledXrtComputation)
+        self._delete = c_api.DeleteCompiledXrtComputation
+      else:
+        assert isinstance(c_computation, c_api.CompiledLocalComputation)
+        self._delete = c_api.DeleteCompiledLocalComputation
     else:
-      assert isinstance(c_local_computation, c_api.LocalComputation)
+      assert isinstance(c_computation, c_api.LocalComputation)
       self._delete = c_api.DeleteLocalComputation
 
+  @property
+  def computation(self):
+    if self._is_compiled:
+      raise ValueError(
+          'Attempt to read the XLA computation of a compiled LocalComputation.')
+    return self._c_computation
+
   def GetProto(self):
     """Get the HloModuleProto proto object in this local computation.
 
     Returns:
        An HloModuleProto proto object that has the whole-graph information.
     """
-
-    serialized = self.c_local_computation.GetSerializedProto()
+    serialized = self.computation.GetSerializedProto()
     proto = hlo_pb2.HloModuleProto.FromString(serialized)
     return proto
 
@@ -480,10 +557,10 @@ class LocalComputation(object):
     Returns:
       A newly *compiled* local computation instance.
     """
-    if self.is_compiled:
+    if self._is_compiled:
       raise ValueError('Attempt to compile a compiled local XLA computation.')
 
-    result_shape = _wrap_shape(self.c_local_computation.GetReturnValueShape())
+    result_shape = _wrap_shape(self.computation.GetReturnValueShape())
 
     if layout_fn:
       argument_shapes = [
@@ -491,11 +568,16 @@ class LocalComputation(object):
       ]
       result_shape = result_shape.map_leaves(layout_fn)
 
+    argument_shapes = list(argument_shapes)
+
     compile_options = compile_options or CompileOptions()
     compile_options.result_shape = result_shape
-    return LocalComputation(
-        self.c_local_computation.Compile(argument_shapes, compile_options),
-        is_compiled=True)
+    if self._backend.backend_type == BackendType.XRT:
+      c = self.computation.CompileForXrt(
+          argument_shapes, _maybe_encode_string(self._backend.target))
+    else:
+      c = self.computation.Compile(argument_shapes, compile_options)
+    return LocalComputation(c, is_compiled=True, backend=self._backend)
 
   def CompileWithExampleArguments(self,
                                   arguments=(),
@@ -506,33 +588,89 @@ class LocalComputation(object):
         compile_options=compile_options,
         layout_fn=layout_fn)
 
-  def Execute(self, arguments=(), layout_fn=None):
-    """Execute with Python values as arguments and return value."""
-    if not self.is_compiled:
+  def GetReturnValueShape(self):
+    return _wrap_shape(self._c_computation.GetReturnValueShape())
+
+  def Execute(self, arguments=(), check_for_deleted_args=True):
+    """Execute on one replica with LocalBuffer arguments and return value."""
+    if check_for_deleted_args and any(arg.is_deleted() for arg in arguments):
+      raise ValueError('Executing with deleted local buffer argument')
+    raw_args = [arg.c_buffer for arg in arguments]
+    output_buffer = self._c_computation.Execute(raw_args)
+    return LocalBuffer(output_buffer, backend=self._backend, replica=0)
+
+  def ExecutePerReplica(self, arguments=None):
+    """Execute on many replicas with LocalBuffer arguments and return value.
+
+    Args:
+      arguments: A sequence of sequences of LocalBuffers. The i'th inner
+        sequence comprises the arguments for execution on the i'th replica.
+
+    Returns:
+      A list of the computation's outputs on each replica, as a LocalBuffer. If
+      a shallow sequence of arguments was passed in for `arguments`, then the
+      sole, zero'th replica's output is returned instead, as a LocalBuffer.
+    """
+    if not self._is_compiled:
       raise ValueError('Cannot execute an uncompiled local XLA computation.')
-    argument_shapes = [Shape.from_pyval(arg) for arg in arguments]
-    if layout_fn:
-      argument_shapes = [
-          shape.map_leaves(layout_fn) for shape in argument_shapes
-      ]
+    if arguments is None:
+      arguments = ((),) * get_replica_count()
+    else:
+      arguments = [list(replica_args) for replica_args in arguments]
+
+    # Check arguments
+    for replica, replica_args in enumerate(arguments):
+      for arg in replica_args:
+        if arg.is_deleted():
+          raise ValueError('Executing with deleted local buffer argument')
+        if arg.replica() != replica:
+          raise ValueError(
+              'Executing on replica {} with argument from replica {}'.format(
+                  replica, arg.replica()))
+
+    # Pull out argument buffer handles
+    stripped_args = [
+        [arg.c_buffer for arg in replica_args] for replica_args in arguments
+    ]
+
+    # Execute
+    if self._backend.backend_type == BackendType.XRT:
+      if len(stripped_args) > 1:
+        raise NotImplementedError(
+            'Multi-replica execution is not yet supported via the XRT backend.')
+      output_buffers = [self._c_computation.Execute(stripped_args[0])]
     else:
-      argument_shapes = [None for shape in argument_shapes]
-    arguments = tuple(map(require_numpy_array_layout, arguments))
-    return self.c_local_computation.Execute(arguments, argument_shapes)
+      output_buffer_tup = self._c_computation.ExecutePerReplica(stripped_args)
+      size = output_buffer_tup.size()
+      output_buffers = [output_buffer_tup.Release(i) for i in xrange(size)]
 
-  def ExecuteWithLocalBuffers(self, arguments=()):
-    """Execute with LocalBuffer arguments and return value."""
-    if not self.is_compiled:
-      raise ValueError('Cannot execute an uncompiled local XLA computation.')
-    arguments = tuple(arguments)
-    if any(arg.is_deleted() for arg in arguments):
-      raise ValueError('Executing with deleted local buffer argument')
-    return LocalBuffer(
-        self.c_local_computation.ExecuteWithShapedBuffers(
-            [arg.c_local_shaped_buffer for arg in arguments]))
+    # Wrap output handles in LocalBuffer instances
+    return tuple(
+        LocalBuffer(output_buffer, backend=self._backend, replica=replica)
+        for replica, output_buffer in enumerate(output_buffers))
+
+  def ExecuteWithPythonValues(self, arguments=()):
+    """Execute on one replica with Python values as arguments and output."""
+
+    def put(arg):
+      return LocalBuffer.from_pyval(arg, backend=self._backend)
+
+    arguments = [put(arg) for arg in arguments]
+    return self.Execute(arguments).to_py()
+
+  def ExecuteWithPythonValuesPerReplica(self, arguments):
+    """Execute on many replicas with Python values as arguments and output."""
+
+    def put(arg, replica):
+      return LocalBuffer.from_pyval(arg, replica, backend=self._backend)
+
+    arguments = [[put(arg, replica)
+                  for arg in replica_args]
+                 for replica, replica_args in enumerate(arguments)]
+    return [out.to_py() for out in self.ExecutePerReplica(arguments)]
 
   def __del__(self):
-    self._delete(self.c_local_computation)
+    self._delete(self._c_computation)
 
 
 class ComputationBuilder(object):
@@ -554,8 +692,13 @@ class ComputationBuilder(object):
     self._client = c_api.LocalComputationBuilder(name.encode('utf8'))
     self._parameter_numbering = itertools.count()
 
-  def Build(self):
-    return LocalComputation(self._client.Build(), is_compiled=False)
+  def Build(self, root=None, backend=XLA_LOCAL_BACKEND):
+    if root is not None:
+      return LocalComputation(
+          self._client.BuildWithRoot(root), is_compiled=False, backend=backend)
+    else:
+      return LocalComputation(
+          self._client.Build(), is_compiled=False, backend=backend)
 
   def SetOpMetadata(self, op_metadata):
     """Set metadata for operations that are about to be enqueued."""
@@ -700,6 +843,20 @@ class ComputationBuilder(object):
     """
     return self._client.Broadcast(operand, sizes)
 
+  def BroadcastInDim(self, operand, shape, broadcast_dimensions):
+    """Enqueues a broadcast-in-dimensions operation onto the computation.
+
+    Args:
+      operand: the operand LocalOp to broadcast.
+      shape: tuple of integers, the expected output shape.
+      broadcast_dimensions: tuple of integers identifying which dimensions
+        of the output are to be broadcast into.
+
+    Returns:
+      A LocalOp representing the added broadcast-in-dimensions op.
+    """
+    return self._client.BroadcastInDim(operand, shape, broadcast_dimensions)
+
   def Concatenate(self, operands, dimension):
     """Enqueues a concatenate operation onto the computation.
 
@@ -834,8 +991,8 @@ class ComputationBuilder(object):
         padding, self.GetShape(operand).dimensions(),
         window_dimensions, window_strides)
     return self._client.SelectAndScatterWithGeneralPadding(
-        operand, select.c_local_computation, window_dimensions, window_strides,
-        pads, source, init_value, scatter.c_local_computation)
+        operand, select.computation, window_dimensions, window_strides, pads,
+        source, init_value, scatter.computation)
 
   def Select(self, pred, on_true, on_false):
     """Element-wise selection op.
@@ -943,7 +1100,7 @@ class ComputationBuilder(object):
     Returns:
       A LocalOp representing the added call op.
     """
-    return self._client.Call(computation_to_apply.c_local_computation, operands)
+    return self._client.Call(computation_to_apply.computation, operands)
 
   def Map(self, operands, computation_to_apply, dimensions):
     """Enqueues a map operation onto the computation.
@@ -956,7 +1113,7 @@ class ComputationBuilder(object):
     Returns:
       A LocalOp representing the added Map op.
     """
-    return self._client.Map(operands, computation_to_apply.c_local_computation,
+    return self._client.Map(operands, computation_to_apply.computation,
                             dimensions)
 
   def Reduce(self, operand, init_value, computation_to_apply, dimensions):
@@ -972,8 +1129,7 @@ class ComputationBuilder(object):
       A LocalOp representing the added Reduce op.
     """
     return self._client.Reduce(operand, init_value,
-                               computation_to_apply.c_local_computation,
-                               dimensions)
+                               computation_to_apply.computation, dimensions)
 
   def ReduceWindow(self, operand, init_value, computation_to_apply,
                    window_dimensions, window_strides, padding):
@@ -994,8 +1150,31 @@ class ComputationBuilder(object):
         padding, self.GetShape(operand).dimensions(), window_dimensions,
         window_strides)
     return self._client.ReduceWindowWithGeneralPadding(
-        operand, init_value, computation_to_apply.c_local_computation,
-        window_dimensions, window_strides, pads)
+        operand, init_value, computation_to_apply.computation,
+        window_dimensions, window_strides, (), (), pads)
+
+  def ReduceWindowWithGeneralPadding(
+      self, operand, init_value, computation_to_apply, window_dimensions,
+      window_strides, base_dilations, window_dilations, padding):
+    """Enqueues a windowed reduction operation onto the computation.
+
+    Args:
+      operand: reduction operand (LocalOp).
+      init_value: reduction initial value (LocalOp).
+      computation_to_apply: a binary reduction function (Computation).
+      window_dimensions: dimensions of window (sequence of integers).
+      window_strides: strides for window (sequence of integers).
+      base_dilations: dilations for the base (sequence of integers).
+      window_dilations: dilations for window (sequence of integers).
+      padding: length-N array-like of pairs of integers of (low, high) padding.
+
+    Returns:
+      A LocalOp representing the added ReduceWindow op.
+    """
+    return self._client.ReduceWindowWithGeneralPadding(
+        operand, init_value, computation_to_apply.computation,
+        window_dimensions, window_strides, base_dilations, window_dilations,
+        padding)
 
   def RngNormal(self, mu, sigma, dims):
     """Enqueues an RngNormal operation onto the computation.
@@ -1039,8 +1218,7 @@ class ComputationBuilder(object):
 
     Returns: a LocalOp representing the While operation.
     """
-    return self._client.While(cond.c_local_computation,
-                              body.c_local_computation, init)
+    return self._client.While(cond.computation, body.computation, init)
 
   def Conditional(self, pred, true_operand, true_computation, false_operand,
                   false_computation):
@@ -1056,8 +1234,8 @@ class ComputationBuilder(object):
     Returns: a LocalOp representing the Conditional operation.
     """
     return self._client.Conditional(
-        pred, true_operand, true_computation.c_local_computation, false_operand,
-        false_computation.c_local_computation)
+        pred, true_operand, true_computation.computation, false_operand,
+        false_computation.computation)
 
   def IsConstant(self, operand):
     """Checks whether the given operand is a compile-time constant.
@@ -1124,10 +1302,9 @@ class ComputationBuilder(object):
     pads = _convert_padding_type_to_pad_values(
         padding, self.GetShape(lhs).dimensions()[2:],
         self.GetShape(rhs).dimensions()[2:], window_strides)
-    dimension_numbers = self._GetConvDimensionNumbers(len(window_strides))
-    return self._client.ConvGeneralDilated(lhs, rhs, window_strides, pads, (),
-                                           (), dimension_numbers,
-                                           feature_group_count)
+    return self.ConvGeneralDilated(
+        lhs, rhs, window_strides, pads, (), (),
+        dimension_numbers=None, feature_group_count=feature_group_count)
 
   def ConvWithGeneralPadding(self, lhs, rhs, window_strides, padding,
                              lhs_dilation, rhs_dilation, feature_group_count=1):
@@ -1145,11 +1322,9 @@ class ComputationBuilder(object):
     Returns:
       A ComputationdataHandle representing the added ConvWithGeneralPadding op.
     """
-    dimension_numbers = self._GetConvDimensionNumbers(len(window_strides))
-    return self._client.ConvGeneralDilated(lhs, rhs, window_strides, padding,
-                                           lhs_dilation, rhs_dilation,
-                                           dimension_numbers,
-                                           feature_group_count)
+    return self.ConvGeneralDilated(
+        lhs, rhs, window_strides, padding, lhs_dilation, rhs_dilation,
+        dimension_numbers=None, feature_group_count=feature_group_count)
 
   def _GetConvDimensionNumbers(self, num_spatial_dims):
     """Create ConvolutionDimensionNumbers proto for convolutions."""
@@ -1167,7 +1342,7 @@ class ComputationBuilder(object):
     return dimension_numbers
 
   def ConvGeneralDilated(self, lhs, rhs, window_strides, padding, lhs_dilation,
-                         rhs_dilation, dimension_numbers,
+                         rhs_dilation, dimension_numbers=None,
                          feature_group_count=1):
     """Enqueues a ConvGeneralDilated operation onto the computation.
 
@@ -1178,10 +1353,11 @@ class ComputationBuilder(object):
       padding: length-N array-like of pairs of integers of (low, high) padding.
       lhs_dilation: length-N array-like of integer dilation factors.
       rhs_dilation: length-N array-like of integer dilation factors.
-      dimension_numbers: either an xla_data_pb2.ConvolutionDimensionNumbers or a
-        triple (lhs_spec, rhs_spec, out_spec) where each element is a string of
-        length N+2 identifying by position (1) batch dimensions in lhs, rhs, and
-        the output with the character 'N', (2) feature dimensions in lhs and the
+      dimension_numbers: optional, either an
+        xla_data_pb2.ConvolutionDimensionNumbers proto instance or a tuple
+        (lhs_spec, rhs_spec, out_spec) where each element is a string of length
+        N+2 identifying by position (1) batch dimensions in lhs, rhs, and the
+        output with the character 'N', (2) feature dimensions in lhs and the
         output with the character 'C', (3) input and output feature dimensions
         in rhs with the characters 'I' and 'O' respectively, and (4) spatial
         dimension correspondences between lhs, rhs, and the output using any
@@ -1194,13 +1370,16 @@ class ComputationBuilder(object):
         spatial dimension character labels according to the order in which the
         labels appear in the rhs_spec string, so that window_strides[0] is
         matched with the dimension corresponding to the first character
-        appearing in rhs_spec that is not 'I' or 'O'.
+        appearing in rhs_spec that is not 'I' or 'O'. By default, use the same
+        dimension numbering as Conv and ConvWithGeneralPadding.
       feature_group_count: number of feature groups for grouped convolution.
 
     Returns: a LocalOp representing the ConvGenralDilated operation.
     """
-    if not isinstance(dimension_numbers,
-                      xla_data_pb2.ConvolutionDimensionNumbers):
+    if dimension_numbers is None:
+      dimension_numbers = self._GetConvDimensionNumbers(len(window_strides))
+    elif not isinstance(dimension_numbers,
+                        xla_data_pb2.ConvolutionDimensionNumbers):
       lhs_spec, rhs_spec, out_spec = dimension_numbers
       dimension_numbers = xla_data_pb2.ConvolutionDimensionNumbers()
 
@@ -1285,6 +1464,19 @@ def initialize_replica_count(replica_count):
   c_api.InitializeReplicaCount(replica_count)
 
 
+def initialize_platform_name(platform_name):
+  """Initializes the desired platform name to use on XLA service init.
+
+  Args:
+    platform_name: string name of platform.
+
+  Raises:
+    A runtime exception if the XLA service has already been initialized.
+  """
+  platform_name = _maybe_encode_string(platform_name)
+  c_api.InitializePlatformName(platform_name)
+
+
 def get_replica_count():
   """Returns the current replica count used for the XLA service.
 
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 82103f03132e45ff822ce1ebcc2be47b24f5869f..21b5c93b615ec429a5da0b4ffe89e8f75f59ef1b 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -37,7 +37,7 @@ class LocalComputationTest(unittest.TestCase):
 
   def _Execute(self, c, arguments):
     compiled_c = c.Build().CompileWithExampleArguments(arguments)
-    return compiled_c.Execute(arguments)
+    return compiled_c.ExecuteWithPythonValues(arguments)
 
   def _ExecuteAndAssertWith(self, assert_func, c, arguments, expected):
     assert expected is not None
@@ -355,7 +355,7 @@ class LocalBufferTest(LocalComputationTest):
   def _Execute(self, c, arguments):
     compiled_c = c.Build().CompileWithExampleArguments(arguments)
     arg_buffers = [xla_client.LocalBuffer.from_pyval(arg) for arg in arguments]
-    result_buffer = compiled_c.ExecuteWithLocalBuffers(arg_buffers)
+    result_buffer = compiled_c.Execute(arg_buffers)
     return result_buffer.to_py()
 
   def testConstantSum(self):
@@ -388,7 +388,7 @@ class LocalBufferTest(LocalComputationTest):
     arg_buffer = xla_client.LocalBuffer.from_pyval(arg)
     arg_buffer.delete()
     with self.assertRaises(ValueError):
-      compiled_c.ExecuteWithLocalBuffers([arg_buffer])
+      compiled_c.Execute([arg_buffer])
 
   def testDestructureTupleEmpty(self):
     t = ()
@@ -439,6 +439,13 @@ class LocalBufferTest(LocalComputationTest):
     np.testing.assert_equal(NumpyArrayF32([1.0, 2.0]), got[0])
     np.testing.assert_equal(NumpyArrayS32([3, 4]), got[1])
 
+  def testShape(self):
+    pyval = np.array([[1., 2.]], np.float32)
+    local_buffer = xla_client.LocalBuffer.from_pyval(pyval)
+    xla_shape = local_buffer.shape()
+    self.assertEqual(xla_shape.dimensions(), (1, 2,))
+    self.assertEqual(np.dtype(xla_shape.element_type()), np.dtype(np.float32))
+
 
 class SingleOpTest(LocalComputationTest):
   """Tests for single ops.
@@ -478,7 +485,7 @@ class SingleOpTest(LocalComputationTest):
       x = c.Constant(np.array(template, dtype=src_dtype))
       c.ConvertElementType(x, xla_types[dst_dtype])
 
-      result = c.Build().Compile().Execute()
+      result = c.Build().Compile().ExecuteWithPythonValues()
       expected = np.array(template, dtype=dst_dtype)
 
       self.assertEqual(result.shape, expected.shape)
@@ -505,7 +512,7 @@ class SingleOpTest(LocalComputationTest):
       x = c.Constant(np.array(template, dtype=src_dtype))
       c.BitcastConvertType(x, dst_etype)
 
-      result = c.Build().Compile().Execute()
+      result = c.Build().Compile().ExecuteWithPythonValues()
       expected = np.array(template, src_dtype).view(dst_dtype)
 
       self.assertEqual(result.shape, expected.shape)
@@ -987,7 +994,7 @@ class SingleOpTest(LocalComputationTest):
     c.Tuple(
         c.ConstantS32Scalar(42), c.Constant(NumpyArrayF32([1.0, 2.0])),
         c.Constant(NumpyArrayBool([True, False, False, True])))
-    result = c.Build().Compile().Execute()
+    result = c.Build().Compile().ExecuteWithPythonValues()
     self.assertIsInstance(result, tuple)
     np.testing.assert_equal(result[0], 42)
     np.testing.assert_allclose(result[1], [1.0, 2.0])
@@ -1007,12 +1014,19 @@ class SingleOpTest(LocalComputationTest):
     self._ExecuteAndCompareExact(
         c, expected=[[10, 20, 30, 40], [10, 20, 30, 40], [10, 20, 30, 40]])
 
+  def testBroadcastInDim(self):
+    c = self._NewComputation()
+    c.BroadcastInDim(c.Constant(NumpyArrayS32([1, 2])), [2, 2], [0])
+    self._ExecuteAndCompareExact(c, expected=[[1, 1], [2, 2]])
+    c.BroadcastInDim(c.Constant(NumpyArrayS32([1, 2])), [2, 2], [1])
+    self._ExecuteAndCompareExact(c, expected=[[1, 2], [1, 2]])
+
   def testRngNormal(self):
     shape = (2, 3)
     c = self._NewComputation()
     c.RngNormal(c.Constant(NumpyArrayF32(0.)), c.Constant(NumpyArrayF32(1.)),
                 dims=shape)
-    result = c.Build().Compile().Execute()
+    result = c.Build().Compile().ExecuteWithPythonValues()
     # since the result is random, we just check shape and uniqueness
     self.assertEqual(result.shape, shape)
     self.assertEqual(len(np.unique(result)), np.prod(shape))
@@ -1023,7 +1037,7 @@ class SingleOpTest(LocalComputationTest):
     c = self._NewComputation()
     c.RngUniform(c.Constant(NumpyArrayF32(lo)), c.Constant(NumpyArrayF32(hi)),
                  dims=shape)
-    result = c.Build().Compile().Execute()
+    result = c.Build().Compile().ExecuteWithPythonValues()
     # since the result is random, we just check shape, uniqueness, and range
     self.assertEqual(result.shape, shape)
     self.assertEqual(len(np.unique(result)), np.prod(shape))
@@ -1036,7 +1050,7 @@ class SingleOpTest(LocalComputationTest):
     c = self._NewComputation()
     c.RngUniform(c.Constant(NumpyArrayS32(lo)), c.Constant(NumpyArrayS32(hi)),
                  dims=shape)
-    result = c.Build().Compile().Execute()
+    result = c.Build().Compile().ExecuteWithPythonValues()
     # since the result is random, we just check shape, integrality, and range
     self.assertEqual(result.shape, shape)
     self.assertEqual(result.dtype, np.int32)
@@ -1473,7 +1487,7 @@ class EmbeddedComputationsTest(LocalComputationTest):
       xla_client.transfer_to_infeed(item)
 
     for item in to_infeed:
-      result = compiled_c.Execute()
+      result = compiled_c.ExecuteWithPythonValues()
       self.assertEqual(result, item)
 
   def testInfeedThenOutfeedS32(self):
@@ -1511,5 +1525,20 @@ class ErrorTest(LocalComputationTest):
         lambda: c.Build().CompileWithExampleArguments([self.f32_scalar_2]))
 
 
+class ComputationRootTest(LocalComputationTest):
+  """Tests related to setting the root of the computation."""
+
+  def testComputationRootDifferentFromLastOp(self):
+    c = self._NewComputation()
+    x = c.ParameterFromNumpy(NumpyArrayF32(2.0))
+    result = c.Add(x, c.ConstantF32Scalar(3.14))
+    extra = c.Add(result, c.ConstantF32Scalar(1.618))  # pylint: disable=unused-variable
+
+    arg = NumpyArrayF32(1.0)
+    compiled_c = c.Build(result).CompileWithExampleArguments([arg])
+    ans = compiled_c.ExecuteWithPythonValues([arg])
+    np.testing.assert_allclose(ans, 4.14)
+
+
 if __name__ == "__main__":
   unittest.main()
diff --git a/tensorflow/compiler/xla/python_api/xla_shape.py b/tensorflow/compiler/xla/python_api/xla_shape.py
index f158f6b2410352432445f669155aff0af5526abf..95b2bf300ec67e9f034f77450416544cb088ae55 100644
--- a/tensorflow/compiler/xla/python_api/xla_shape.py
+++ b/tensorflow/compiler/xla/python_api/xla_shape.py
@@ -25,9 +25,10 @@ from tensorflow.compiler.xla.python_api import types
 
 
 class Shape(object):
-  """Wraps a xla_data_pb2.Shape message with a convenient Python type.
+  """Wraps a xla_data_pb2.ShapeProto message with a convenient Python type.
 
-  Provides direct access to the underlying xla_data_pb2.Shape message in the
+  Provides direct access to the underlying xla_data_pb2.ShapeProto message in
+  the
   message attribute, along with accessor wrappers to the message's fields.
   Avoid direct access to .message unless interacting directly with protobuf APIs
   like CopyFrom. In other words, prefer hauling the shape around in a Shape, and
@@ -48,7 +49,7 @@ class Shape(object):
     Raises:
       ValueError: if element_type is TUPLE but dimensions are not Shape objects.
     """
-    self.message = xla_data_pb2.Shape()
+    self.message = xla_data_pb2.ShapeProto()
     self.message.element_type = element_type
     if element_type == xla_data_pb2.TUPLE:
       if not all(isinstance(subshape, Shape) for subshape in dimensions):
diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
index 3abb3855a42b8b5222115262448d359da3a80e87..26affbcceb33110baf41d507173e56f8b1c8c9eb 100644
--- a/tensorflow/compiler/xla/rpc/BUILD
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -16,7 +16,6 @@ xla_proto_library(
     use_grpc_plugin = True,
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
     ],
 )
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
index 4e1435fa30a24c320ddbedb84d37b369a3158a54..d8123a6de28ca532819ece4a75cd0b725f8c1bbd 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -47,11 +47,18 @@ namespace xla {
   });
 }
 
-::grpc::Status GRPCService::ExecuteGraph(::grpc::ServerContext* /*context*/,
-                                         const ExecuteGraphRequest* arg,
-                                         ExecuteResponse* result) {
+::grpc::Status GRPCService::Compile(::grpc::ServerContext* /*context*/,
+                                    const CompileRequest* arg,
+                                    CompileResponse* result) {
   return DelegateRPC(
-      [this, arg, result]() { return service_->ExecuteGraph(arg, result); });
+      [this, arg, result]() { return service_->Compile(arg, result); });
+}
+
+::grpc::Status GRPCService::Execute(::grpc::ServerContext* /*context*/,
+                                    const ExecuteRequest* arg,
+                                    ExecuteResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Execute(arg, result); });
 }
 
 ::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context,
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
index ca1b09b648013ad45d806040c5ddcf11d9e5604e..3e586b288a56a22573d0c3b9ae7b2f25fdbf851a 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.h
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -39,9 +39,13 @@ class GRPCService : public grpc::XlaService::Service {
                                   const DeconstructTupleRequest* arg,
                                   DeconstructTupleResponse* result) override;
 
-  ::grpc::Status ExecuteGraph(::grpc::ServerContext* context,
-                              const ExecuteGraphRequest* arg,
-                              ExecuteResponse* result) override;
+  ::grpc::Status Compile(::grpc::ServerContext* context,
+                         const CompileRequest* arg,
+                         CompileResponse* result) override;
+
+  ::grpc::Status Execute(::grpc::ServerContext* context,
+                         const ExecuteRequest* arg,
+                         ExecuteResponse* result) override;
 
   ::grpc::Status WaitForExecution(::grpc::ServerContext* context,
                                   const WaitForExecutionRequest* arg,
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc
index 7b8ab158e1396d7087a407be180ab44d2e16e121..66abf66cfd6c2f753c5507aa373452ac880e9a29 100644
--- a/tensorflow/compiler/xla/rpc/grpc_stub.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc
@@ -62,10 +62,17 @@ Status GRPCStub::ResetDevice(const ResetDeviceRequest* request,
   });
 }
 
-Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
-                              ExecuteResponse* response) {
+Status GRPCStub::Compile(const CompileRequest* request,
+                         CompileResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->ExecuteGraph(context, *request, response);
+    return grpc_stub_->Compile(context, *request, response);
+  });
+}
+
+Status GRPCStub::Execute(const ExecuteRequest* request,
+                         ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Execute(context, *request, response);
   });
 }
 
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h
index 8dfcb761387d608abbb1f62974f49b976a7ff7ff..f02b401399f3e895153f0b08e325bc9c2c2336ec 100644
--- a/tensorflow/compiler/xla/rpc/grpc_stub.h
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.h
@@ -43,8 +43,11 @@ class GRPCStub : public ServiceInterface {
   Status ResetDevice(const ResetDeviceRequest* arg,
                      ResetDeviceResponse* result) override;
 
-  Status ExecuteGraph(const ExecuteGraphRequest* request,
-                      ExecuteResponse* response) override;
+  Status Compile(const CompileRequest* request,
+                 CompileResponse* response) override;
+
+  Status Execute(const ExecuteRequest* request,
+                 ExecuteResponse* response) override;
 
   Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* request,
                               ExecuteParallelResponse* response) override;
diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto
index 551ae895e05586daec0ffcd425f4950f76bdd50d..0ff8adc2acbe5fd21e85027dd63bfb14f5672a7d 100644
--- a/tensorflow/compiler/xla/rpc/xla_service.proto
+++ b/tensorflow/compiler/xla/rpc/xla_service.proto
@@ -43,7 +43,6 @@ limitations under the License.
 syntax = "proto3";
 
 import "tensorflow/compiler/xla/xla.proto";
-import "tensorflow/compiler/xla/xla_data.proto";
 
 package xla;
 
@@ -128,11 +127,14 @@ service XlaService {
       returns (CreateChannelHandleResponse) {
   }
 
-  // Invokes the provided computation with the provided global data passed as
-  // immutable arguments. The request contains the whole computation graph.
+  // Compiles the provided computation into executable. Returns the handle of
+  // the executable.
+  rpc Compile(CompileRequest) returns (CompileResponse) {}
+
+  // Invokes the provided executable with the provided global data passed as
+  // immutable arguments. The request contains the handle to the executable.
   // Returns global data output and execution timing.
-  rpc ExecuteGraph(ExecuteGraphRequest) returns (ExecuteResponse) {
-  }
+  rpc Execute(ExecuteRequest) returns (ExecuteResponse) {}
 
   // Invokes the provided list of computations in parallel with the provided
   // global data for each computation. Returns a list of global data output and
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 4797cf333070f2dd371e81c01ad659151cbc216d..4c21ae2a427477caa86fb4130616c38eb3bcf006 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -87,7 +87,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
     ],
@@ -124,7 +123,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
     ],
@@ -158,12 +156,12 @@ tf_cc_test(
         ":bfloat16_propagation",
         ":bfloat16_support",
         ":hlo",
+        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
     ],
@@ -253,6 +251,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/memory",
@@ -280,12 +279,14 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo_element_type_converter",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
 
@@ -293,7 +294,9 @@ cc_library(
     name = "hlo",
     srcs = [
         "dfs_hlo_visitor.cc",
+        "dynamic_parameter_binding.cc",
         "hlo_computation.cc",
+        "hlo_input_output_alias_config.cc",
         "hlo_instruction.cc",
         "hlo_instructions.cc",
         "hlo_module.cc",
@@ -305,9 +308,11 @@ cc_library(
     hdrs = [
         "dfs_hlo_visitor.h",
         "dfs_hlo_visitor_with_default.h",
+        "dynamic_parameter_binding.h",
         "hlo_clone_context.h",
         "hlo_computation.h",
         "hlo_domain_metadata.h",
+        "hlo_input_output_alias_config.h",
         "hlo_instruction.h",
         "hlo_instructions.h",
         "hlo_module.h",
@@ -320,7 +325,6 @@ cc_library(
         ":hlo_casting_utils",
         ":hlo_module_config",
         ":hlo_proto",
-        ":hlo_reachability",
         ":name_uniquer",
         "//tensorflow/compiler/xla:array",
         "//tensorflow/compiler/xla:literal",
@@ -350,6 +354,25 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "dynamic_parameter_binding_test",
+    srcs = ["dynamic_parameter_binding_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_memory_scheduler",
+        ":hlo_ordering",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
 tf_cc_test(
     name = "dfs_hlo_visitor_with_default_test",
     srcs = ["dfs_hlo_visitor_with_default_test.cc"],
@@ -362,7 +385,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -388,9 +410,36 @@ tf_cc_test(
         ":hlo",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "pattern_matcher_gmock",
+    testonly = 1,
+    hdrs = ["pattern_matcher_gmock.h"],
+    deps = [
+        ":pattern_matcher",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/core:test",
+    ],
+)
+
+tf_cc_test(
+    name = "pattern_matcher_gmock_test",
+    srcs = ["pattern_matcher_gmock_test.cc"],
+    deps = [
+        ":hlo",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
     ],
 )
 
@@ -399,10 +448,12 @@ cc_library(
     srcs = ["hlo_reachability.cc"],
     hdrs = ["hlo_reachability.h"],
     deps = [
+        ":hlo",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:span",
     ],
@@ -417,7 +468,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -463,7 +513,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -516,7 +565,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -565,7 +613,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -588,7 +635,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -600,11 +646,11 @@ cc_library(
     hdrs = ["platform_util.h"],
     deps = [
         ":compiler",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/strings",
@@ -644,6 +690,7 @@ cc_library(
         ":allocation_tracker",
         ":backend",
         ":channel_tracker",
+        ":compilation_cache",
         ":compiler",
         ":computation_layout",
         ":device_memory_allocator",
@@ -659,6 +706,7 @@ cc_library(
         ":source_map_util",
         ":stream_pool",
         ":transfer_manager",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:service_interface",
@@ -670,7 +718,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -727,12 +774,12 @@ cc_library(
         ":computation_layout",
         ":platform_util",
         ":service",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -808,6 +855,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:ptr_util",
+        "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
         "@com_google_absl//absl/memory",
     ],
@@ -830,6 +878,7 @@ cc_library(
         ":maybe_owning_device_memory",
         ":shaped_buffer",
         ":stream_pool",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:status",
@@ -837,7 +886,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -858,6 +906,7 @@ cc_library(
         ":executable",
         ":hlo",
         ":hlo_module_config",
+        ":hlo_module_group",
         ":logical_buffer",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -1082,7 +1131,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1099,6 +1147,7 @@ cc_library(
         ":hlo",
         ":hlo_dataflow_analysis",
         ":hlo_proto",
+        ":hlo_reachability",
         ":hlo_value",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -1164,7 +1213,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1268,6 +1316,25 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "hlo_input_output_alias_config_test",
+    srcs = ["hlo_input_output_alias_config_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_memory_scheduler",
+        ":hlo_ordering",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
 cc_library(
     name = "hlo_memory_scheduler",
     srcs = ["hlo_memory_scheduler.cc"],
@@ -1320,6 +1387,7 @@ cc_library(
         ":hlo",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -1339,6 +1407,7 @@ cc_library(
         ":fusion_queue",
         ":hlo",
         ":hlo_pass",
+        ":hlo_reachability",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
@@ -1364,6 +1433,7 @@ cc_library(
     srcs = ["multi_output_fusion.cc"],
     hdrs = ["multi_output_fusion.h"],
     deps = [
+        ":hlo_reachability",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/service:hlo",
@@ -1404,7 +1474,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
         "@com_google_absl//absl/memory",
@@ -1480,7 +1549,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
@@ -1523,7 +1591,10 @@ tf_cc_test(
         ":hlo",
         ":hlo_casting_utils",
         ":hlo_matchers",
+        ":hlo_parser",
         ":hlo_pass",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -1532,7 +1603,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1569,7 +1639,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -1619,7 +1688,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
@@ -1671,6 +1740,19 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "while_loop_analysis_test",
+    srcs = ["while_loop_analysis_test.cc"],
+    deps = [
+        ":while_loop_analysis",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "while_loop_simplifier",
     srcs = ["while_loop_simplifier.cc"],
@@ -1679,9 +1761,11 @@ cc_library(
         ":call_inliner",
         ":hlo",
         ":hlo_pass",
+        ":hlo_query",
+        ":pattern_matcher",
         ":while_loop_analysis",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
@@ -1693,10 +1777,17 @@ tf_cc_test(
     name = "while_loop_simplifier_test",
     srcs = ["while_loop_simplifier_test.cc"],
     deps = [
+        ":algebraic_simplifier",
+        ":hlo",
+        ":hlo_cse",
+        ":hlo_dce",
         ":hlo_matchers",
+        ":hlo_pass",
+        ":hlo_pass_pipeline",
+        ":tuple_simplifier",
         ":while_loop_simplifier",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/strings",
@@ -1727,7 +1818,7 @@ tf_cc_test(
         ":hlo_matchers",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
     ],
 )
 
@@ -1755,7 +1846,7 @@ tf_cc_test(
         ":implicit_broadcast_remover",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
     ],
 )
 
@@ -1800,7 +1891,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/core:test",
     ],
 )
@@ -1820,6 +1910,41 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dynamic_dimension_inference",
+    srcs = ["dynamic_dimension_inference.cc"],
+    hdrs = ["dynamic_dimension_inference.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "dynamic_dimension_inference_test",
+    srcs = ["dynamic_dimension_inference_test.cc"],
+    deps = [
+        ":dynamic_dimension_inference",
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "reshape_mover_test",
     srcs = ["reshape_mover_test.cc"],
@@ -1834,7 +1959,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
@@ -1977,7 +2102,8 @@ tf_cc_test(
     srcs = ["hlo_computation_test.cc"],
     deps = [
         ":hlo",
-        ":hlo_matchers",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -2240,7 +2366,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -2303,13 +2428,27 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
 )
 
+cc_library(
+    name = "compilation_cache",
+    srcs = ["compilation_cache.cc"],
+    hdrs = ["compilation_cache.h"],
+    deps = [
+        ":executable",
+        ":hlo_module_config",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
 cc_library(
     name = "layout_assignment",
     srcs = [
@@ -2379,14 +2518,13 @@ tf_cc_test(
         ":hlo_graph_dumper",
         ":hlo_matchers",
         ":hlo_runner",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/core:test",
     ],
 )
@@ -2450,6 +2588,7 @@ tf_cc_test(
         ":hlo",
         ":hlo_parser",
         ":hlo_verifier",
+        ":layout_assignment",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
@@ -2503,7 +2642,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -2559,8 +2697,9 @@ tf_cc_test(
         ":algebraic_simplifier",
         ":computation_layout",
         ":hlo",
-        ":hlo_matchers",
         ":layout_assignment",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
@@ -2570,8 +2709,8 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/types:span",
@@ -2632,7 +2771,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -2673,7 +2812,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -2707,12 +2845,13 @@ tf_cc_test(
         ":hlo_matchers",
         ":hlo_parser",
         ":hlo_pass",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -2784,10 +2923,9 @@ tf_cc_test(
         ":hlo_domain_isolator",
         ":hlo_domain_remover",
         ":hlo_parser",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
         "@com_google_absl//absl/memory",
@@ -2820,6 +2958,46 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_get_dimension_size_rewriter",
+    srcs = ["hlo_get_dimension_size_rewriter.cc"],
+    hdrs = ["hlo_get_dimension_size_rewriter.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        ":shape_inference",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_get_dimension_size_rewriter_test",
+    srcs = ["hlo_get_dimension_size_rewriter_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_get_dimension_size_rewriter",
+        ":hlo_matchers",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "device_memory_allocator",
     srcs = [
@@ -2878,6 +3056,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@llvm//:core",
         "@llvm//:transform_utils",
@@ -2975,7 +3154,6 @@ tf_cc_test(
     deps = [
         ":hlo_tfgraph_builder",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -2992,6 +3170,7 @@ cc_library(
         ":hlo_casting_utils",
         ":hlo_execution_profile",
         ":hlo_tfgraph_builder",
+        ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
@@ -3126,6 +3305,7 @@ cc_library(
         ":buffer_assignment",
         ":hlo",
         ":hlo_proto",
+        ":hlo_verifier",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:util",
     ],
@@ -3188,6 +3368,7 @@ cc_library(
         ":computation_placer",
         ":executable",
         ":hlo",
+        ":hlo_module_group",
         ":transfer_manager",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -3215,6 +3396,7 @@ cc_library(
         ":hlo_profile_printer_data",
         ":human_readable_profile_builder",
         "//tensorflow/compiler/xla:types",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -3251,6 +3433,8 @@ cc_library(
         ":tuple_util",
         "//tensorflow/compiler/xla:literal_util",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -3277,10 +3461,11 @@ cc_library(
         ":hlo",
         ":hlo_pass",
         ":tuple_util",
+        ":while_loop_analysis",
         ":while_util",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -3296,7 +3481,7 @@ tf_cc_test(
         ":while_loop_invariant_code_motion",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
 )
@@ -3326,7 +3511,7 @@ tf_cc_test(
         ":while_loop_constant_sinking",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
 )
@@ -3339,6 +3524,7 @@ cc_library(
         ":bfloat16_normalization",
         ":defuser",
         ":hlo",
+        ":hlo_memory_scheduler",
         ":hlo_pass",
         ":hlo_pass_pipeline",
         ":implicit_broadcast_remover",
@@ -3386,7 +3572,7 @@ tf_cc_test(
         ":indexed_array_analysis",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:test",
     ],
@@ -3422,6 +3608,9 @@ tf_cc_test(
         ":hlo_casting_utils",
         ":hlo_matchers",
         ":hlo_parser",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -3471,6 +3660,41 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "ar_crs_combiner",
+    srcs = ["ar_crs_combiner.cc"],
+    hdrs = ["ar_crs_combiner.h"],
+    deps = [
+        ":call_graph",
+        ":pattern_matcher",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "ar_crs_combiner_test",
+    srcs = ["ar_crs_combiner_test.cc"],
+    deps = [
+        ":ar_crs_combiner",
+        ":hlo",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "map_inliner_test",
     srcs = ["map_inliner_test.cc"],
@@ -3482,7 +3706,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "@com_google_absl//absl/memory",
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 75dae7a7141647d7b7b60b0e07e11c143621ea63..985c5af1c4d89425dd6693585e42e22510fe21f8 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 
 #include <algorithm>
+#include <cmath>
+#include <iterator>
 #include <memory>
 #include <numeric>
 #include <string>
@@ -67,6 +69,45 @@ bool IsAll(const HloInstruction* op, int8 value) {
   }
 }
 
+// Checks whether `op` is a floating-point constant or broadcast of a constant
+// of the form +/- 2^k for some integer k positive, negative, or zero.  Such
+// values are interesting because multiplying by a power of 2 just moves the
+// exponent.
+bool IsAllFpConstantPowerOf2(const HloInstruction* op) {
+  // Unwrap the broadcast if necessary.
+  const HloInstruction* c;
+  if (!Match(op, m::ConstantEffectiveScalar(&c)) &&
+      !Match(op, m::Broadcast(m::Constant(&c).WithShape(
+                     m::Shape().IsEffectiveScalar())))) {
+    return false;
+  }
+  auto val = [&]() -> absl::optional<double> {
+    switch (c->shape().element_type()) {
+      case BF16:
+        return static_cast<double>(c->literal().GetFirstElement<bfloat16>());
+      case F16:
+        return static_cast<double>(c->literal().GetFirstElement<Eigen::half>());
+      case F32:
+        return c->literal().GetFirstElement<float>();
+      case F64:
+        return c->literal().GetFirstElement<double>();
+      default:
+        // Cowardly refuse to consider complex types.
+        return absl::nullopt;
+    }
+  }();
+  if (!val) {
+    return false;
+  }
+
+  int exp;
+  double mantissa = std::frexp(*val, &exp);
+  // frexp returns a value in the range (-1; -0.5] U [0.5, 1).  A return value
+  // of +/-0.5 therefore indicates that the floating point value is a power of
+  // 2.
+  return mantissa == 0.5 || mantissa == -0.5;
+}
+
 // Returns whether the given transpose produces a result which is bit-wise
 // identical to its operand and thus may be replaced with a bitcast.
 bool TransposeIsBitcast(const HloInstruction* transpose) {
@@ -83,7 +124,8 @@ bool TransposeIsBitcast(const HloInstruction* transpose) {
 // reshape may still be a bitcast. For example, a reshape from [28x28] to [784].
 bool ReshapeOrCopyIsBitcast(
     const HloInstruction* instr,
-    const AlgebraicSimplifier::ValidBitcastCallback& valid_bitcast_callback) {
+    const AlgebraicSimplifierOptions::ValidBitcastCallback&
+        valid_bitcast_callback) {
   CHECK(HloOpcode::kReshape == instr->opcode() ||
         HloOpcode::kCopy == instr->opcode());
 
@@ -94,6 +136,11 @@ bool ReshapeOrCopyIsBitcast(
          valid_bitcast_callback(operand->shape(), instr->shape());
 }
 
+bool IsUnstridedSlice(const HloInstruction* hlo) {
+  return absl::c_all_of(hlo->slice_strides(),
+                        [](int64 stride) { return stride == 1; });
+}
+
 // AlgebraicSimplifierVisitor traverses the HLO computation and reduces certain
 // algebraic expressions to simplified forms. Note: This only supports
 // simplifications that simply look at the operands of an instruction. For the
@@ -107,6 +154,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleAdd(HloInstruction* add) override;
 
+  Status HandleAnd(HloInstruction* logical_and) override;
+
   Status HandleBitcast(HloInstruction* bitcast) override;
 
   Status HandleBitcastConvert(HloInstruction* bitcast) override;
@@ -141,6 +190,12 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleMultiply(HloInstruction* multiply) override;
 
+  Status HandleNegate(HloInstruction* negate) override;
+
+  Status HandleNot(HloInstruction* logical_not) override;
+
+  Status HandleOr(HloInstruction* logical_or) override;
+
   Status HandlePad(HloInstruction* pad) override;
 
   Status HandlePower(HloInstruction* power) override;
@@ -157,6 +212,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   Status HandleDynamicUpdateSlice(
       HloInstruction* dynamic_update_slice) override;
 
+  Status HandleSelect(HloInstruction* select) override;
+
   Status HandleSort(HloInstruction* sort) override;
 
   Status HandleTranspose(HloInstruction* transpose) override;
@@ -169,21 +226,13 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   const bool changed() const { return changed_; }
 
   // Runs the visitor on a computation.
-  static bool Run(
-      HloComputation* computation, bool is_layout_sensitive,
-      AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_strength_reduction, bool enable_conv_simplification);
+  static bool Run(HloComputation* computation,
+                  const AlgebraicSimplifierOptions& options);
 
  private:
-  explicit AlgebraicSimplifierVisitor(
-      HloComputation* computation, bool is_layout_sensitive,
-      AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_strength_reduction, bool enable_conv_simplification)
-      : computation_(computation),
-        is_layout_sensitive_(is_layout_sensitive),
-        valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_strength_reduction_(enable_dot_strength_reduction),
-        enable_conv_simplification_(enable_conv_simplification) {}
+  explicit AlgebraicSimplifierVisitor(HloComputation* computation,
+                                      const AlgebraicSimplifierOptions& options)
+      : computation_(computation), options_(options) {}
 
   // Transforms Dots where at least one input is a vector or has a degenerate
   // dimension and converts it into a multiply and reduce. This should enable
@@ -222,10 +271,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
                                      HloInstruction* new_instruction);
 
   // Returns whether the shape of the output of the given instructions are the
-  // same for the purposes of simplification. If is_layout_sensitive_ is true,
-  // then this tests shape equality including layout (ShapeUtil::Equal). If
-  // is_layout_sensitive_ is false, then the tests shape compatibility
-  // (ShapeUtil::Compatible).
+  // same for the purposes of simplification. If options_.is_layout_sensitive()
+  // is true, then this tests shape equality including layout
+  // (ShapeUtil::Equal). If options_.is_layout_sensitive() is false, then the
+  // tests shape compatibility (ShapeUtil::Compatible).
   bool SameShape(const HloInstruction* lhs, const HloInstruction* rhs) const;
 
   // Returns whether it was possible to transform `root` to a clamp instruction.
@@ -304,26 +353,22 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // Tries to use a kDot in place of the given convolution.
   StatusOr<bool> SimplifyConvToDot(HloInstruction* convolution);
 
+  // Tries to simplify a slice where the result of the slice is a scalar.
+  StatusOr<bool> TrySimplifyScalarSlice(HloInstruction* slice);
+
+  // Tries to convert slice(reshape(X)) into reshape(slice(X))
+  StatusOr<bool> TryToReorderSliceAndReshape(HloInstruction* slice);
+
   // Current HloComputation instance the AlgebraicSimplifierVisitor is
   // traversing.
   HloComputation* computation_;
 
+  // The backend-specific options selected for the algebraic simplifier.
+  const AlgebraicSimplifierOptions& options_;
+
   // Whether algebraic simplification has occurred.
   bool changed_ = false;
 
-  // Whether layout is considered during transformation.
-  bool is_layout_sensitive_;
-
-  // Callback used to determine if a bitcast is possible.
-  AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback_;
-
-  // Disable dot strength reduction on platforms where it causes a slowdown.
-  bool enable_dot_strength_reduction_;
-
-  // Disable convolution -> dot simplification on platforms where it causes a
-  // slowdown.
-  bool enable_conv_simplification_;
-
   // Cached computation for adding two scalar F32.
   HloComputation* scalar_add_computation_ = nullptr;
 };
@@ -331,19 +376,15 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 }  // namespace
 
 bool AlgebraicSimplifierVisitor::Run(
-    HloComputation* computation, bool is_layout_sensitive,
-    AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-    bool enable_dot_strength_reduction, bool enable_conv_simplification) {
-  AlgebraicSimplifierVisitor visitor(
-      computation, is_layout_sensitive, std::move(valid_bitcast_callback),
-      enable_dot_strength_reduction, enable_conv_simplification);
+    HloComputation* computation, const AlgebraicSimplifierOptions& options) {
+  AlgebraicSimplifierVisitor visitor(computation, options);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
 
 bool AlgebraicSimplifierVisitor::SameShape(const HloInstruction* lhs,
                                            const HloInstruction* rhs) const {
-  if (is_layout_sensitive_) {
+  if (options_.is_layout_sensitive()) {
     return ShapeUtil::Equal(lhs->shape(), rhs->shape());
   } else {
     return ShapeUtil::Compatible(lhs->shape(), rhs->shape());
@@ -414,6 +455,77 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
                                           sum_of_constants));
   }
 
+  // A*C + B*C => (A+B)*C
+  //
+  //  - If A, B, and C are integers, do this unconditionally. Proof of
+  //    correctness: https://rise4fun.com/Alive/u9X.
+  //
+  //  - If A, B, and C are floating point, do this if C is a scalar constant or
+  //    broadcast of scalar constant and is equal to +/- 2^k for some (possibly
+  //    negative) integer k.
+  //
+  //    Multiplying by a power of 2 just moves the exponent, so our answer is
+  //    exact modulo rounding of intermediate results so long as
+  //
+  //     - none of the three products has an exponent which underflows (so the
+  //       result is 0 or denormal), and
+  //     - none of the three products overflows to inf.
+  //
+  //    Proof: See algebraic_simplifier_proof_distributive_property.py.
+  //
+  //    We deem these differences in rounding, underflow, and overflow
+  //    acceptable in the ML context.
+  HloInstruction *b, *c;
+  if (((Match(lhs, m::Multiply(m::Op(&a), m::Op(&c))) &&
+        Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b)))) ||
+       (Match(lhs, m::Multiply(m::Op(&c), m::Op(&a))) &&
+        Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b))))) &&
+      (ShapeUtil::ElementIsIntegral(add->shape()) ||
+       IsAllFpConstantPowerOf2(c))) {
+    return ReplaceWithNewInstruction(
+        add, HloInstruction::CreateBinary(
+                 add->shape(), HloOpcode::kMultiply,
+                 computation_->AddInstruction(HloInstruction::CreateBinary(
+                     add->shape(), HloOpcode::kAdd, a, b)),
+                 c));
+  }
+  return Status::OK();
+}
+
+Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) {
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs))));
+  // Simplify logical and
+  if (ShapeUtil::HasPrimitiveType(lhs->shape(), xla::PRED) &&
+      ShapeUtil::HasPrimitiveType(rhs->shape(), xla::PRED)) {
+    // A && True => A
+    VLOG(10) << "trying transform [A && True => A]: "
+             << logical_and->ToString();
+    if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_and, lhs)) {
+      return Status::OK();
+    }
+    // True && A => A
+    VLOG(10) << "trying transform [True && A => A]: "
+             << logical_and->ToString();
+    if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_and, rhs)) {
+      return Status::OK();
+    }
+
+    // A && False => False
+    VLOG(10) << "trying transform [A && False => False]: "
+             << logical_and->ToString();
+    if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_and, rhs)) {
+      return Status::OK();
+    }
+
+    // False && A => False
+    VLOG(10) << "trying transform [False && A => False]: "
+             << logical_and->ToString();
+    if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_and, lhs)) {
+      return Status::OK();
+    }
+  }
+
   return Status::OK();
 }
 
@@ -450,8 +562,8 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) {
     return Status::OK();
   }
 
-  if (is_layout_sensitive_ &&
-      ReshapeOrCopyIsBitcast(copy, valid_bitcast_callback_)) {
+  if (options_.is_layout_sensitive() &&
+      ReshapeOrCopyIsBitcast(copy, options_.valid_bitcast_callback())) {
     ReplaceWithBitcast(copy);
   }
 
@@ -487,7 +599,74 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
     VLOG(10) << "trying to replace " << concatenate->ToString() << " with "
              << replacement->ToString();
     ReplaceInstructionIfSameShape(concatenate, replacement);
-  } else if (operands.size() == 2) {
+    return Status::OK();
+  }
+
+  // Check if we can merge "adjacent" slice operands which take slices from the
+  // same other op. For simplicity we only merge unstrided slices.
+  int64 concatenate_dimension = concatenate->concatenate_dimension();
+  for (int64 i = 0; i < operands.size(); ++i) {
+    if (operands[i]->opcode() != HloOpcode::kSlice ||
+        !IsUnstridedSlice(operands[i])) {
+      continue;
+    }
+    int64 slice_end = operands[i]->slice_limits(concatenate_dimension);
+    HloInstruction* slice_operand = operands[i]->mutable_operand(0);
+    int64 j = i + 1;
+    while (j < operands.size() && operands[j]->opcode() == HloOpcode::kSlice &&
+           IsUnstridedSlice(operands[j]) &&
+           operands[j]->operand(0) == slice_operand &&
+           operands[j]->slice_starts(concatenate_dimension) == slice_end) {
+      // Check that all the slice_start values are the same in all other
+      // dimensions. This implies that the slice_limit values are also the same,
+      // because operands of concatenate need to have the same shape, and we
+      // already checked that the slices are unstrided.
+      bool same_other_starts = true;
+      for (int64 k = 0; k < operands[j]->slice_starts().size(); ++k) {
+        if (k == concatenate_dimension) {
+          continue;
+        }
+        if (operands[i]->slice_starts(k) != operands[j]->slice_starts(k)) {
+          same_other_starts = false;
+          break;
+        }
+      }
+      if (!same_other_starts) {
+        break;
+      }
+      slice_end = operands[j]->slice_limits(concatenate_dimension);
+      ++j;
+    }
+    if (j - i > 1) {
+      Shape new_slice_shape = operands[i]->shape();
+      new_slice_shape.set_dimensions(
+          concatenate_dimension,
+          slice_end - operands[i]->slice_starts(concatenate_dimension));
+      auto new_limit_indices = operands[i]->slice_limits();
+      new_limit_indices[concatenate_dimension] = slice_end;
+      auto new_slice_op =
+          computation_->AddInstruction(HloInstruction::CreateSlice(
+              new_slice_shape, slice_operand,
+              /*start_indices=*/operands[i]->slice_starts(),
+              /*limit_indices=*/new_limit_indices,
+              /*strides=*/operands[i]->slice_strides()));
+      std::vector<HloInstruction*> new_operands;
+      for (int64 k = 0; k < i; ++k) {
+        new_operands.push_back(operands[k]);
+      }
+      new_operands.push_back(new_slice_op);
+      for (int64 k = j; k < operands.size(); ++k) {
+        new_operands.push_back(operands[k]);
+      }
+      auto replacement =
+          computation_->AddInstruction(concatenate->CloneWithNewOperands(
+              concatenate->shape(), new_operands));
+      ReplaceInstructionIfSameShape(concatenate, replacement);
+      return Status::OK();
+    }
+  }
+
+  if (operands.size() == 2) {
     // A binary concat with a broadcasted scalar as an operand can be converted
     // into a pad which is simpler to fold into other operations.
     bool is_effective_low_pad = Match(
@@ -503,7 +682,7 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
       padding_config_dim->set_edge_padding_high(0);
       padding_config_dim->set_edge_padding_low(0);
       padding_config_dim->set_interior_padding(0);
-      if (dim == concatenate->concatenate_dimension()) {
+      if (dim == concatenate_dimension) {
         if (is_effective_low_pad) {
           padding_config_dim->set_edge_padding_low(
               operands[0]->shape().dimensions(dim));
@@ -1161,7 +1340,8 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
     return ReplaceInstruction(dot, dot_of_gather_optimized);
   }
 
-  if (enable_dot_strength_reduction_ && !is_layout_sensitive_) {
+  if (options_.enable_dot_strength_reduction() &&
+      !options_.is_layout_sensitive()) {
     TF_ASSIGN_OR_RETURN(bool did_strength_reduction,
                         HandleDotStrengthReduction(dot));
     if (did_strength_reduction) {
@@ -1223,6 +1403,64 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) {
   return Status::OK();
 }
 
+Status AlgebraicSimplifierVisitor::HandleNegate(HloInstruction* negate) {
+  // negate(negate(x)) => x
+  HloInstruction* x;
+  if (Match(negate, m::Negate(m::Negate(m::Op(&x)))) &&
+      ReplaceInstructionIfSameShape(negate, x)) {
+    return Status::OK();
+  }
+  return Status::OK();
+}
+
+Status AlgebraicSimplifierVisitor::HandleNot(HloInstruction* logical_not) {
+  // not(not(x)) => x
+  HloInstruction* x;
+  if (Match(logical_not, m::Not(m::Not(m::Op(&x)))) &&
+      ReplaceInstructionIfSameShape(logical_not, x)) {
+    return Status::OK();
+  }
+  return Status::OK();
+}
+
+Status AlgebraicSimplifierVisitor::HandleOr(HloInstruction* logical_or) {
+  HloInstruction *lhs, *rhs;
+  CHECK(Match(logical_or, m::Or(m::Op(&lhs), m::Op(&rhs))));
+
+  // Simplify logical or
+  if (ShapeUtil::HasPrimitiveType(lhs->shape(), xla::PRED) &&
+      ShapeUtil::HasPrimitiveType(rhs->shape(), xla::PRED)) {
+    // A || True => True
+    VLOG(10) << "trying transform [A || True => True]: "
+             << logical_or->ToString();
+    if (IsAll(rhs, 1) && ReplaceInstructionIfSameShape(logical_or, rhs)) {
+      return Status::OK();
+    }
+    // True || A => True
+    VLOG(10) << "trying transform [True || A => True]: "
+             << logical_or->ToString();
+    if (IsAll(lhs, 1) && ReplaceInstructionIfSameShape(logical_or, lhs)) {
+      return Status::OK();
+    }
+
+    // A || False => A
+    VLOG(10) << "trying transform [A || False => A]: "
+             << logical_or->ToString();
+    if (IsAll(rhs, 0) && ReplaceInstructionIfSameShape(logical_or, lhs)) {
+      return Status::OK();
+    }
+
+    // False || A => A
+    VLOG(10) << "trying transform [False || A => A]: "
+             << logical_or->ToString();
+    if (IsAll(lhs, 0) && ReplaceInstructionIfSameShape(logical_or, rhs)) {
+      return Status::OK();
+    }
+  }
+
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleLog(HloInstruction* log) {
   // ln(exp(A)) => A
   VLOG(10) << "trying transform [ln(exp(A)) => A]: " << log->ToString();
@@ -1507,6 +1745,27 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
         pad, HloInstruction::CreateBroadcast(pad->shape(),
                                              pad->mutable_operand(1), {}));
   }
+
+  // Interior padding on one sized dimensions have no effect. As a result it
+  // makes other simplifications possible if there is no interior padding.
+  if (HasInteriorPadding(pad->padding_config())) {
+    PaddingConfig padding_config = pad->padding_config();
+    bool cleared_interior_padding = false;
+    for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+      if (padding_config.dimensions(i).interior_padding() > 0 &&
+          pad->operand(0)->shape().dimensions(i) == 1) {
+        cleared_interior_padding = true;
+        padding_config.mutable_dimensions(i)->set_interior_padding(0);
+      }
+    }
+    if (cleared_interior_padding) {
+      return ReplaceWithNewInstruction(
+          pad,
+          HloInstruction::CreatePad(pad->shape(), pad->mutable_operand(0),
+                                    pad->mutable_operand(1), padding_config));
+    }
+  }
+
   // Eliminate nop pads (padding all zero), and replace a pad with negative
   // padding with a pad with non-negative padding followed by a slice.
   bool all_zero = true;
@@ -1798,8 +2057,8 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   }
 
   // Make this a bitcast if possible.
-  if (is_layout_sensitive_ &&
-      ReshapeOrCopyIsBitcast(reshape, valid_bitcast_callback_)) {
+  if (options_.is_layout_sensitive() &&
+      ReshapeOrCopyIsBitcast(reshape, options_.valid_bitcast_callback())) {
     ReplaceWithBitcast(reshape);
     return Status::OK();
   }
@@ -1820,18 +2079,165 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse) {
   return Status::OK();
 }
 
+StatusOr<bool> AlgebraicSimplifierVisitor::TrySimplifyScalarSlice(
+    HloInstruction* slice) {
+  // Only try to do this for effective scalars. We could do the same for slicing
+  // out larger pieces of padding (replacing with a broadcast of the padding
+  // value), but this is probably not worth it.
+  if (!ShapeUtil::IsEffectiveScalar(slice->shape())) {
+    return false;
+  }
+
+  if (slice->operand(0)->opcode() == HloOpcode::kPad) {
+    VLOG(10) << "Trying to simplify scalar slice of pad";
+    // Check there's no internal padding. Again, we could handle that too, since
+    // everything is statically known, but it's not worth it.
+    auto pad = Cast<HloPadInstruction>(slice->mutable_operand(0));
+    auto padding_config = pad->padding_config();
+    int64 rank = padding_config.dimensions_size();
+    if (HasInteriorPadding(padding_config)) {
+      VLOG(10) << "Not folding scalar slice of pad, pad has interior padding";
+      return false;
+    }
+
+    // Check whether the scalar we're slicing out falls into the padding.
+    bool in_padding = [&]() {
+      for (int64 i = 0; i < rank; ++i) {
+        int64 start = slice->slice_starts(i);
+        int64 low = padding_config.dimensions(i).edge_padding_low();
+        int64 data = pad->operand(0)->shape().dimensions(i);
+        if (start >= low && start < low + data) {
+          return false;
+        }
+      }
+      return true;
+    }();
+
+    if (in_padding) {
+      VLOG(10) << "Folding scalar slice of pad into padding value";
+      TF_RETURN_IF_ERROR(ReplaceWithNewInstruction(
+          slice, HloInstruction::CreateReshape(slice->shape(),
+                                               pad->mutable_padding_value())));
+      return true;
+    } else {
+      // We already know the output of the slice is scalar. If the padded
+      // value is scalar, and it's not in the padding, then it's exactly the
+      // output value.
+      bool replaced =
+          ReplaceInstructionIfSameShape(slice, pad->mutable_operand(0));
+      if (replaced) {
+        VLOG(10) << "Folding scalar slice of pad into padded value";
+      } else {
+        VLOG(10) << "Not folding scalar slice of pad into padded value as they "
+                    "have different shapes.";
+      }
+      return replaced;
+    }
+  }
+
+  if (slice->operand(0)->opcode() == HloOpcode::kConcatenate) {
+    VLOG(10) << "Trying to simplify scalar slice of concat";
+    // Only do this for R1, there's no chance of this being useful otherwise.
+    if (ShapeUtil::Rank(slice->shape()) != 1) {
+      VLOG(10) << "Not folding, slice is not rank 1";
+      return false;
+    }
+    HloConcatenateInstruction* concat =
+        Cast<HloConcatenateInstruction>(slice->mutable_operand(0));
+    int64 operand_start = 0;
+    int64 operand_num = 0;
+    // Weird loop structure to avoid annoying off-by-one errors.
+    while (true) {
+      TF_RET_CHECK(operand_num < concat->operand_count());
+      const HloInstruction* operand = concat->operand(operand_num);
+      int64 next_operand_start = operand_start + operand->shape().dimensions(0);
+      if (next_operand_start > slice->slice_starts(0)) {
+        break;
+      }
+      operand_start = next_operand_start;
+      operand_num++;
+    }
+
+    bool replaced = ReplaceInstructionIfSameShape(
+        slice, concat->mutable_operand(operand_num));
+    if (replaced) {
+      VLOG(10) << "Folding scalar slice of concat into concat operand";
+    } else {
+      VLOG(10) << "Folding scalar slice of concat into slice of concat operand";
+      TF_RETURN_IF_ERROR(ReplaceWithNewInstruction(
+          slice, HloInstruction::CreateSlice(
+                     slice->shape(), concat->mutable_operand(operand_num),
+                     {slice->slice_starts(0) - operand_start},
+                     {slice->slice_starts(0) - operand_start + 1},
+                     slice->slice_strides())));
+    }
+    return true;
+  }
+
+  return false;
+}
+
+StatusOr<bool> AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape(
+    HloInstruction* slice) {
+  CHECK_EQ(slice->opcode(), HloOpcode::kSlice);
+  if (!IsUnstridedSlice(slice)) {
+    return false;
+  }
+  HloInstruction* reshape = slice->mutable_operand(0);
+  if (reshape->opcode() != HloOpcode::kReshape) {
+    return false;
+  }
+  HloInstruction* new_slice_operand = reshape->mutable_operand(0);
+  int64 slice_rank = ShapeUtil::Rank(slice->shape());
+  std::vector<int64> sliced_dims;
+  for (int64 i = 0; i < slice_rank; ++i) {
+    if (slice->slice_starts(i) != 0 ||
+        slice->slice_limits(i) != reshape->shape().dimensions(i)) {
+      sliced_dims.push_back(i);
+    }
+  }
+
+  if (sliced_dims.size() == 1 && sliced_dims[0] == 0 &&
+      slice->slice_starts(0) == 0) {
+    const Shape& new_slice_shape = new_slice_operand->shape();
+    const int64 rank = ShapeUtil::Rank(new_slice_shape);
+    std::vector<int64> new_slice_starts(rank, 0);
+    std::vector<int64> new_slice_stides(rank, 1);
+    std::vector<int64> new_slice_limits(new_slice_shape.dimensions().begin(),
+                                        new_slice_shape.dimensions().end());
+    int64 slice_elements = ShapeUtil::ElementsIn(slice->shape());
+    for (int64 i = rank - 1; i >= 0; --i) {
+      if (slice_elements >= new_slice_limits[i]) {
+        if (slice_elements % new_slice_limits[i] != 0) {
+          return false;
+        }
+        slice_elements /= new_slice_limits[i];
+      } else {
+        new_slice_limits[i] = slice_elements;
+        slice_elements = 1;
+      }
+    }
+    HloInstruction* new_slice =
+        computation_->AddInstruction(HloInstruction::CreateSlice(
+            ShapeUtil::MakeShape(new_slice_shape.element_type(),
+                                 new_slice_limits),
+            new_slice_operand, new_slice_starts, new_slice_limits,
+            new_slice_stides));
+    TF_RETURN_IF_ERROR(ReplaceWithNewInstruction(
+        slice, HloInstruction::CreateReshape(slice->shape(), new_slice)));
+    return true;
+  }
+  return false;
+}
+
 Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
   // Delete no-op slices, i.e. where shape = operand shape.
   if (ReplaceInstructionIfSameShape(slice, slice->mutable_operand(0))) {
     return Status::OK();
   }
 
-  auto is_unstrided_slice = [](const HloInstruction* hlo) {
-    return absl::c_all_of(hlo->slice_strides(),
-                          [](int64 stride) { return stride == 1; });
-  };
   if (slice->operand(0)->opcode() == HloOpcode::kSlice &&
-      is_unstrided_slice(slice) && is_unstrided_slice(slice->operand(0))) {
+      IsUnstridedSlice(slice) && IsUnstridedSlice(slice->operand(0))) {
     HloInstruction* operand_slice = slice->mutable_operand(0);
     std::vector<int64> new_slice_starts = slice->slice_starts();
     std::vector<int64> new_slice_limits = slice->slice_limits();
@@ -1844,6 +2250,16 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
                    slice->shape(), operand_slice->mutable_operand(0),
                    new_slice_starts, new_slice_limits, slice->slice_strides()));
   }
+
+  TF_ASSIGN_OR_RETURN(bool replaced, TrySimplifyScalarSlice(slice));
+  if (replaced) {
+    return Status::OK();
+  }
+
+  TF_ASSIGN_OR_RETURN(replaced, TryToReorderSliceAndReshape(slice));
+  if (replaced) {
+    return Status::OK();
+  }
   return Status::OK();
 }
 
@@ -2057,6 +2473,12 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     return Status::OK();
   }
 
+  // Bail on dilation.
+  if (window_util::HasDilation(window)) {
+    VLOG(10) << "Not folding pad into reduce-window as there is dilation.";
+    return Status::OK();
+  }
+
   VLOG(10) << "Considering folding Pad: " << pad->ToString()
            << "\ninto reduce-window: " << reduce_window->ToString()
            << (convert != nullptr
@@ -2193,6 +2615,22 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
                          /*reduce_computation=*/function));
 }
 
+Status AlgebraicSimplifierVisitor::HandleSelect(HloInstruction* select) {
+  // select(x, y, y) -> y.
+  if (select->operand(1) == select->operand(2)) {
+    return ReplaceInstruction(select, select->mutable_operand(1));
+  }
+  // select(true, x, y) -> x.
+  if (IsAll(select->operand(0), true)) {
+    return ReplaceInstruction(select, select->mutable_operand(1));
+  }
+  // select(false, x, y) -> y.
+  if (IsAll(select->operand(0), false)) {
+    return ReplaceInstruction(select, select->mutable_operand(2));
+  }
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
   auto operand = sort->mutable_operand(0);
   int64 dimension_to_sort = sort->dimensions(0);
@@ -2203,7 +2641,109 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
     }
     // If it is key/value sort, the output of sort is a tuple.
     return ReplaceWithNewInstruction(
-        sort, HloInstruction::CreateTuple({operand, sort->mutable_operand(1)}));
+        sort, HloInstruction::CreateTuple(sort->operands()));
+  }
+  if (!options_.enable_permutation_sort_replacement()) {
+    return Status::OK();
+  }
+  // Check if we are sorting a permutation. In that case, we know that the keys
+  // will be sorted to the identity permutation, and we can represent the
+  // changes to the 'values' parameter as a scatter.
+  if (sort->operand_count() == 2 &&
+      operand->opcode() == HloOpcode::kGetTupleElement) {
+    const HloInstruction* other_sort = operand->operand(0);
+    // Check whether the 'values' parameter is the result of another sort with
+    // the same sort dimension.
+    if (other_sort->opcode() == HloOpcode::kSort &&
+        other_sort->operand_count() >= 2 &&
+        other_sort->dimensions(0) == dimension_to_sort &&
+        other_sort->operand(operand->tuple_index())->opcode() ==
+            HloOpcode::kIota) {
+      auto* iota =
+          Cast<HloIotaInstruction>(other_sort->operand(operand->tuple_index()));
+      // The sort operand needs to be an integral iota, and the iota dimension
+      // needs to be the dimension that was sorted.
+      if (iota->iota_dimension() == dimension_to_sort &&
+          ShapeUtil::ElementIsIntegral(iota->shape())) {
+        // We use the following construction method for a Scatter that applies
+        // the permutation from 'keys' to the 'values' parameter.
+        // - Take the "keys" parameter of the second sort and reshape it to have
+        //   another "1" dimension at the end.
+        // - Concatenate it with iotas of the same extended shape with all
+        //   different iota_dimensions except the dimension_to_sort in the order
+        //   of iota_dimensions/dimension_to_sort, so e.g. with rank 3 and
+        //   dimension_to_sort = 1, we would have concatenate of (iota with
+        //   iota_dimension=0, keys, iota with iota_dimension = 2)
+        // - Use this as the indices parameter of scatter, and set updates
+        //   of the scatter to be a reshaped 'values' parameter of sort (adding
+        //   'rank' many 1 dimensions at the end).
+        int64 rank = ShapeUtil::Rank(operand->shape());
+        Shape extended_shape = operand->shape();
+        extended_shape.add_dimensions(1);
+        extended_shape.mutable_layout()->add_minor_to_major(rank);
+        auto reshaped_permutation = computation_->AddInstruction(
+            HloInstruction::CreateReshape(extended_shape, operand));
+        std::vector<HloInstruction*> concat_operands;
+        for (int64 i = 0; i < rank; ++i) {
+          if (i == dimension_to_sort) {
+            concat_operands.push_back(reshaped_permutation);
+          } else {
+            concat_operands.push_back(computation_->AddInstruction(
+                HloInstruction::CreateIota(extended_shape, i)));
+          }
+        }
+        Shape concat_shape = operand->shape();
+        concat_shape.add_dimensions(rank);
+        concat_shape.mutable_layout()->add_minor_to_major(rank);
+        auto scatter_indices =
+            rank > 1 ? computation_->AddInstruction(
+                           HloInstruction::CreateConcatenate(
+                               concat_shape, concat_operands, rank))
+                     : reshaped_permutation;
+
+        // We don't care about the operand, it will be completely overridden by
+        // the updates.
+        auto scatter_operand = computation_->AddInstruction(
+            HloInstruction::CreateIota(sort->operand(1)->shape(), 0));
+
+        // Construct the updates operand of scatter.
+        Shape update_shape = sort->operand(1)->shape();
+        for (int64 i = 0; i < rank; ++i) {
+          update_shape.add_dimensions(1);
+          update_shape.mutable_layout()->add_minor_to_major(rank + i);
+        }
+        auto scatter_updates =
+            computation_->AddInstruction(HloInstruction::CreateReshape(
+                update_shape, sort->mutable_operand(1)));
+
+        // Construct the updates computation, which simply replaces the operand
+        // values with the update values.
+        HloComputation::Builder b("update_replace_computation");
+        Shape scalar_shape = ShapeUtil::MakeShape(S32, {});
+        b.AddInstruction(
+            HloInstruction::CreateParameter(0, scalar_shape, "scalar_lhs"));
+        auto scalar_rhs = b.AddInstruction(
+            HloInstruction::CreateParameter(1, scalar_shape, "scalar_rhs"));
+        auto update_replace_computation =
+            computation_->parent()->AddEmbeddedComputation(b.Build(scalar_rhs));
+
+        ScatterDimensionNumbers dim_numbers;
+        dim_numbers.set_index_vector_dim(rank);
+        for (int64 i = 0; i < rank; ++i) {
+          dim_numbers.add_update_window_dims(rank + i);
+          dim_numbers.add_scatter_dims_to_operand_dims(i);
+        }
+        auto scatter =
+            computation_->AddInstruction(HloInstruction::CreateScatter(
+                sort->operand(1)->shape(), scatter_operand, scatter_indices,
+                scatter_updates, update_replace_computation, dim_numbers));
+        return ReplaceWithNewInstruction(
+            sort, HloInstruction::CreateTuple(
+                      {computation_->AddInstruction(HloInstruction::CreateIota(
+                           operand->shape(), dimension_to_sort)),
+                       scatter}));
+      }
+    }
   }
   return Status::OK();
 }
@@ -2229,7 +2769,7 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
     return ReplaceInstruction(transpose, operand);
   }
 
-  if (is_layout_sensitive_ && TransposeIsBitcast(transpose)) {
+  if (options_.is_layout_sensitive() && TransposeIsBitcast(transpose)) {
     ReplaceWithBitcast(transpose);
     return Status::OK();
   }
@@ -2378,13 +2918,13 @@ StatusOr<bool> AlgebraicSimplifierVisitor::SimplifyConvToDot(
   const ConvolutionDimensionNumbers& dnums =
       convolution->convolution_dimension_numbers();
 
-  if (!enable_conv_simplification_) {
+  if (!options_.enable_conv_simplification()) {
     return false;
   }
 
   // TODO(b/31337498): For now, we cowardly refuse to do this optimization in
   // layout-insensitive mode, for fear of adding nontrivial reshapes.
-  if (!is_layout_sensitive_) {
+  if (!options_.is_layout_sensitive()) {
     return false;
   }
 
@@ -2474,9 +3014,9 @@ StatusOr<bool> AlgebraicSimplifierVisitor::SimplifyConvToDot(
   // We cannot insert bitcasts if the layouts will not be compatible.
   // TODO(b/33178038): Consider inserting a transpose if a bitcast would be
   // invalid.
-  if (!valid_bitcast_callback_(input_shape, new_input_shape) ||
-      !valid_bitcast_callback_(filter_shape, new_filter_shape) ||
-      !valid_bitcast_callback_(dot_output_shape, convolution_shape)) {
+  if (!options_.valid_bitcast_callback()(input_shape, new_input_shape) ||
+      !options_.valid_bitcast_callback()(filter_shape, new_filter_shape) ||
+      !options_.valid_bitcast_callback()(dot_output_shape, convolution_shape)) {
     return false;
   }
 
@@ -2582,9 +3122,7 @@ StatusOr<bool> AlgebraicSimplifier::Run(HloModule* module) {
                  "AlgebraicSimplifier::Run(), before:\n" + module->ToString());
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
-    if (AlgebraicSimplifierVisitor::Run(
-            comp, is_layout_sensitive_, valid_bitcast_callback_,
-            enable_dot_strength_reduction_, enable_conv_simplification_)) {
+    if (AlgebraicSimplifierVisitor::Run(comp, options_)) {
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index 9f8d0ee88bdebcf17310cd0407b1b99e4b0a7b5f..d2775b9fafa7e4c625f5d181114e80e7369f9c78 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -23,8 +23,7 @@ limitations under the License.
 
 namespace xla {
 
-// A pass which performs algebraic simplifications.
-class AlgebraicSimplifier : public HloModulePass {
+class AlgebraicSimplifierOptions {
  public:
   // Given shapes 'from_shape' and 'to_shape', determines if it is valid to
   // bitcast from 'from_shape' to 'to_shape' after considering platform
@@ -34,18 +33,63 @@ class AlgebraicSimplifier : public HloModulePass {
   using ValidBitcastCallback =
       std::function<bool(const Shape& from_shape, const Shape& to_shape)>;
 
+  explicit AlgebraicSimplifierOptions(
+      ValidBitcastCallback valid_bitcast_callback)
+      : valid_bitcast_callback_(std::move(valid_bitcast_callback)) {}
+  // If valid_bitcast_callback returns true, then the pass will replace reshapes
+  // and transposes with bitcasts.
+  const ValidBitcastCallback& valid_bitcast_callback() const {
+    return valid_bitcast_callback_;
+  }
+
+  // If is_layout_sensitive is true, then the simplifier preserves layout during
+  // transformation. Otherwise, layout is ignored.
+  void set_is_layout_sensitive(bool is_layout_sensitive) {
+    is_layout_sensitive_ = is_layout_sensitive;
+  }
+  bool is_layout_sensitive() const { return is_layout_sensitive_; }
+
+  // Enable dot simplification on platforms where it is profitable.
+  void set_enable_dot_strength_reduction(bool enable_dot_strength_reduction) {
+    enable_dot_strength_reduction_ = enable_dot_strength_reduction;
+  }
+  bool enable_dot_strength_reduction() const {
+    return enable_dot_strength_reduction_;
+  }
+
+  // Enable convolution simplification on platforms where it is profitable.
+  void set_enable_conv_simplification(bool enable_conv_simplification) {
+    enable_conv_simplification_ = enable_conv_simplification;
+  }
+  bool enable_conv_simplification() const {
+    return enable_conv_simplification_;
+  }
+
+  // If enable_permutation_sort_replacement is true, a sort op that is known to
+  // sort a permutation will be replaced with a scatter op.
+  void set_enable_permutation_sort_replacement(
+      bool enable_permutation_sort_replacement) {
+    enable_permutation_sort_replacement_ = enable_permutation_sort_replacement;
+  }
+  bool enable_permutation_sort_replacement() const {
+    return enable_permutation_sort_replacement_;
+  }
+
+ private:
+  ValidBitcastCallback valid_bitcast_callback_;
+  bool is_layout_sensitive_{false};
+  bool enable_dot_strength_reduction_{true};
+  bool enable_conv_simplification_{true};
+  bool enable_permutation_sort_replacement_{false};
+};
+
+// A pass which performs algebraic simplifications.
+class AlgebraicSimplifier : public HloModulePass {
+ public:
   // If is_layout_sensitive is true, then the simplifier preserves layout during
-  // transformation. Otherwise, layout is ignored. If valid_bitcast_callback
-  // returns true, then the pass will replace reshapes and transposes with
-  // bitcasts.
-  AlgebraicSimplifier(bool is_layout_sensitive,
-                      ValidBitcastCallback valid_bitcast_callback,
-                      bool enable_dot_strength_reduction = true,
-                      bool enable_conv_simplification = true)
-      : is_layout_sensitive_(is_layout_sensitive),
-        valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_strength_reduction_(enable_dot_strength_reduction),
-        enable_conv_simplification_(enable_conv_simplification) {}
+  // transformation. Otherwise, layout is ignored.
+  explicit AlgebraicSimplifier(const AlgebraicSimplifierOptions& options)
+      : options_(options) {}
   ~AlgebraicSimplifier() override = default;
   absl::string_view name() const override { return "algsimp"; }
 
@@ -54,14 +98,7 @@ class AlgebraicSimplifier : public HloModulePass {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  bool is_layout_sensitive_;
-  ValidBitcastCallback valid_bitcast_callback_;
-
-  // Enable dot simplification on platforms where it is profitable.
-  bool enable_dot_strength_reduction_;
-
-  // Enable convolution simplification on platforms where it is profitable.
-  bool enable_conv_simplification_;
+  AlgebraicSimplifierOptions options_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py
new file mode 100644
index 0000000000000000000000000000000000000000..5da13da041b4ded813876af7ca379025187545ab
--- /dev/null
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Proof that transforming (A*C)+(B*C) <=> (A+B)*C is "safe" if C=2^k.
+
+Specifically, for all floating-point values A, B, and C, if
+
+ - C is equal to +/- 2^k for some (possibly negative) integer k, and
+ - A, B, C, A*C, B*C, and A+B are not subnormal, zero, or inf,
+
+then there exists a rounding mode rm in [RTZ, RNE] such that
+
+ (A*C) + (B*C) == (A+B) * C  (computed with rounding mode rm).
+
+Informally, this means that the equivalence holds for powers of 2 C, modulo
+flushing to zero or inf, and modulo rounding of intermediate results.
+
+Requires z3 python bindings; try `pip install z3-solver`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import z3
+
+# We do float16 because it lets the solver run much faster.  These results
+# should generalize to fp32 and fp64, and you can verify this by changing the
+# value of FLOAT_TY (and then waiting a while).
+FLOAT_TY = z3.Float16
+
+a = z3.FP("a", FLOAT_TY())
+b = z3.FP("b", FLOAT_TY())
+c = z3.FP("c", FLOAT_TY())
+
+s = z3.Solver()
+
+# C must be a power of 2, i.e. significand bits must all be 0.
+s.add(z3.Extract(FLOAT_TY().sbits() - 1, 0, z3.fpToIEEEBV(c)) == 0)
+
+for rm in [z3.RTZ(), z3.RNE()]:
+  z3.set_default_rounding_mode(rm)
+  before = a * c + b * c
+  after = (a + b) * c
+
+  # Check that before == after, allowing that 0 == -0.
+  s.add(
+      z3.Not(
+          z3.Or(
+              before == after,  #
+              z3.And(z3.fpIsZero(before), z3.fpIsZero(after)))))
+
+  for x in [
+      (a * c),
+      (b * c),
+      (a + b),
+  ]:
+    s.add(z3.Not(z3.fpIsSubnormal(x)))
+    s.add(z3.Not(z3.fpIsZero(x)))
+    s.add(z3.Not(z3.fpIsInf(x)))
+
+if s.check() == z3.sat:
+  m = s.model()
+  print("Counterexample found!")
+  print(m)
+  print("a*c:       ", z3.simplify(m[a] * m[c]))
+  print("b*c:       ", z3.simplify(m[b] * m[c]))
+  print("a+b:       ", z3.simplify(m[a] + m[b]))
+  print("a*c + b*c: ", z3.simplify(m[a] * m[c] + m[b] * m[c]))
+  print("(a+b) * c: ", z3.simplify((m[a] + m[b]) * m[c]))
+else:
+  print("Proved!")
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 2047f894b465816eb97ba205e79033bd52bf7a0c..14ce519b6a0fd221070006d336d23bddeb6cd621 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -27,13 +27,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -43,21 +44,24 @@ namespace xla {
 namespace {
 
 using ::testing::ElementsAre;
+namespace m = match;
 
-namespace op = xla::testing::opcode_matchers;
-
-AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() {
+AlgebraicSimplifierOptions::ValidBitcastCallback bitcasting_callback() {
   return [](const Shape&, const Shape&) { return true; };
 }
 
-AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() {
+AlgebraicSimplifierOptions::ValidBitcastCallback non_bitcasting_callback() {
   return [](const Shape&, const Shape&) { return false; };
 }
 
-class AlgebraicSimplifierTest : public HloVerifiedTestBase {};
+class AlgebraicSimplifierTest : public HloTestBase {
+ protected:
+  AlgebraicSimplifierOptions default_options_{non_bitcasting_callback()};
+};
 
 // Test that A + 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, AddZero) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -67,18 +71,140 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
+TEST_F(AlgebraicSimplifierTest, FactorIntegerAddition) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = s32[8] parameter(0)
+      p1 = s32[8] parameter(1)
+      p2 = s32[8] parameter(2)
+      x = s32[8] multiply(p0, p2)
+      y = s32[8] multiply(p1, p2)
+      ROOT sum = s32[8] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(m::MultiplyAnyOrder(
+          m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), m::Parameter(2))));
+}
+
+// A*C + B*C => (A+B)*C if C is a floating-point power of 2.
+TEST_F(AlgebraicSimplifierTest, FactorFpAddition) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      c = f32[] constant(0.125)
+      x = f32[] multiply(p0, c)
+      y = f32[] multiply(p1, c)
+      ROOT sum = f32[] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AddAnyOrder(m::Parameter(0), m::Parameter(1)),
+                  m::ConstantScalar(0.125))));
+}
+
+// A*C + B*C => (A+B)*C if C is a broadcast of a floating-point power of 2.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionWithBroadcast) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[4] parameter(0)
+      p1 = f32[4] parameter(1)
+      c = f32[] constant(0.125)
+      b = f32[4] broadcast(c), dimensions={}
+      x = f32[4] multiply(p0, b)
+      y = f32[4] multiply(p1, b)
+      ROOT sum = f32[4] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AddAnyOrder(m::Parameter(0), m::Parameter(1)),
+                  m::Broadcast(m::ConstantScalar(0.125)))));
+}
+
+// A*C + B*C => (A+B)*C simplification should not happen if C is not a
+// floating-point power of 2.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionNotPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      c = f32[] constant(0.3)
+      x = f32[] multiply(p0, c)
+      y = f32[] multiply(p1, c)
+      ROOT sum = f32[] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+// A*C + B*C => (A+B)*C simplification should not happen if A, B, and C are
+// complex numbers.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionComplex) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = c64[8] parameter(0)
+      p1 = c64[8] parameter(1)
+      p2 = c64[8] parameter(2)
+      x = c64[8] multiply(p0, p2)
+      y = c64[8] multiply(p1, p2)
+      ROOT sum = c64[8] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+// A*C + B*C => (A+B)*C simplification is OK if A, B, and C are complex.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionBfloat16) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = bf16[4] parameter(0)
+      p1 = bf16[4] parameter(1)
+      c = bf16[] constant(0.125)
+      b = bf16[4] broadcast(c), dimensions={}
+      x = bf16[4] multiply(p0, b)
+      y = bf16[4] multiply(p1, b)
+      ROOT sum = bf16[4] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AddAnyOrder(m::Parameter(0), m::Parameter(1)),
+                  m::Broadcast(m::ConstantScalar(0.125)))));
+}
+
 // Test that A * 0 is simplified to 0
 TEST_F(AlgebraicSimplifierTest, MulZero) {
+  auto m = CreateNewVerifiedModule();
   Shape r0s32 = ShapeUtil::MakeShape(S32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -88,17 +214,81 @@ TEST_F(AlgebraicSimplifierTest, MulZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0s32, HloOpcode::kMultiply, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kMultiply);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_EQ(computation->root_instruction(), zero);
 }
 
+// Test that select(true, a, b) is simplified to a
+TEST_F(AlgebraicSimplifierTest, SelectTrue) {
+  Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0s32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0s32, "param1"));
+  HloInstruction* one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  builder.AddInstruction(HloInstruction::CreateTernary(
+      r0s32, HloOpcode::kSelect, one, param0, param1));
+
+  auto module = CreateNewVerifiedModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kSelect);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(computation->root_instruction(), param0);
+}
+
+// Test that select(false, a, b) is simplified to b
+TEST_F(AlgebraicSimplifierTest, SelectFalse) {
+  Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0s32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0s32, "param1"));
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateTernary(
+      r0s32, HloOpcode::kSelect, zero, param0, param1));
+
+  auto module = CreateNewVerifiedModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kSelect);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(computation->root_instruction(), param1);
+}
+
+// Test that select(a, b, b) is simplified to b
+TEST_F(AlgebraicSimplifierTest, SelectIdentical) {
+  Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0s32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0s32, "param1"));
+  builder.AddInstruction(HloInstruction::CreateTernary(
+      r0s32, HloOpcode::kSelect, param0, param1, param1));
+
+  auto module = CreateNewVerifiedModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kSelect);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(computation->root_instruction(), param1);
+}
+
 // Test that Reduce(Reduce(A)) -> Reduce(A)
 TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   // Create add computation.
   HloInstruction* zero = builder.AddInstruction(
@@ -113,7 +303,7 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
   Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
   HloInstruction* param = builder.AddInstruction(
@@ -126,17 +316,17 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) {
   Shape r1f32 = ShapeUtil::MakeShape(F32, {5});
   builder.AddInstruction(HloInstruction::CreateReduce(r1f32, reduce0, zero,
                                                       dims1, add_computation));
-  module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  HloInstruction* root = module().entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Reduce(param, zero));
+  m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  HloInstruction* root = m->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Reduce(m::Parameter(0), m::Op().Is(zero))));
   EXPECT_EQ(root->dimensions(), std::vector<int64>({0, 2, 3}));
 }
 
 // Test that Const + A is canonicalized to A + Const.
 TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -146,18 +336,18 @@ TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, constant, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0), m::Constant())));
 }
 
 // Test that [(A + C1) + C2] => [A + (C1 + C2)] for constants C1 and C2.
 TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -172,17 +362,19 @@ TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, add1, constant2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Add(constant1, constant2)));
+  EXPECT_THAT(root, GmockMatch(m::Add(
+                        m::Op().Is(param0),
+                        m::Add(m::Op().Is(constant1), m::Op().Is(constant2)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -194,17 +386,17 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   // Create add computation.
   HloComputation* add_computation = nullptr;
@@ -217,7 +409,7 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
   Shape r2f32 = ShapeUtil::MakeShape(F32, {32, 1});
   HloInstruction* param0 = builder.AddInstruction(
@@ -230,17 +422,18 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) {
                    HloInstruction::CreateBroadcast(r2f32, zero, {}))},
       add_computation));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kMap);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Broadcast(zero)));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0),
+                                      m::Broadcast(m::Op().Is(zero)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -252,64 +445,64 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, bcast, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({3.14f, 3.14f, 3.14f})));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
   EXPECT_EQ(3.14f, root->operand(0)->literal().GetFirstElement<float>());
 }
 
 TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({3.14, 3.14, 4})));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({0.0f, 1.0f, 2.0f})));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Iota());
+  EXPECT_THAT(root, GmockMatch(m::Iota()));
 }
 
 // Test that A - 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, SubZero) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -319,18 +512,18 @@ TEST_F(AlgebraicSimplifierTest, SubZero) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kSubtract, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that A - Const is canonicalized to A + (-Const).
 TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -340,18 +533,19 @@ TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32, HloOpcode::kSubtract, param0, constant));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Negate(constant)));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0),
+                                      m::Negate(m::Op().Is(constant)))));
 }
 
 // Test that (A/B)/C is simplified to A/(B*C).
 TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -365,21 +559,24 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, div, param2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Divide(op::Divide(param0, param1), param2));
+              GmockMatch(m::Divide(m::Divide(m::Parameter(0), m::Parameter(1)),
+                                   m::Parameter(2))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Multiply(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Multiply(m::Parameter(1), m::Parameter(2)))));
 }
 
 // Test that A/(B/C) is simplified to (A*C)/B.
 TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -393,21 +590,25 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, div));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Divide(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Divide(m::Parameter(1), m::Parameter(2)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(op::Multiply(param0, param2), param1));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Multiply(m::Parameter(0), m::Parameter(2)),
+                           m::Parameter(1))));
 }
 
 // Test that (A/B)/(C/D) is simplified to (A*D)/(B*C).
 TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {42, 123});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -425,23 +626,25 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, div0, div1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Divide(op::Divide(param0, param1), op::Divide(param2, param3)));
+      GmockMatch(m::Divide(m::Divide(m::Parameter(0), m::Parameter(1)),
+                           m::Divide(m::Parameter(2), m::Parameter(3)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Divide(op::Multiply(param0, param3), op::Multiply(param1, param2)));
+      GmockMatch(m::Divide(m::Multiply(m::Parameter(0), m::Parameter(3)),
+                           m::Multiply(m::Parameter(1), m::Parameter(2)))));
 }
 
 // Test that A/exp(B) is simplified to A*exp(-B).
 TEST_F(AlgebraicSimplifierTest, DivOfExp) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -453,21 +656,22 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, exp));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Exp(param1)));
+              GmockMatch(m::Divide(m::Parameter(0), m::Exp(m::Parameter(1)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Exp(op::Negate(param1))));
+              GmockMatch(m::Multiply(m::Parameter(0),
+                                     m::Exp(m::Negate(m::Parameter(1))))));
 }
 
 // Test that A/pow(B,C) is simplified to A*pow(B,-C).
 TEST_F(AlgebraicSimplifierTest, DivOfPower) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -481,22 +685,26 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, power));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Power(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Power(m::Parameter(1), m::Parameter(2)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Power(param1, op::Negate(param2))));
+              GmockMatch(m::Multiply(
+                  m::Parameter(0),
+                  m::Power(m::Parameter(1), m::Negate(m::Parameter(2))))));
 }
 
 // Test that broadcasting is done on the right step when simplifying A/pow(B,C)
 // to A*pow(B,-C).
 TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {7});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -510,21 +718,25 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide, param0, power));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Power(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Power(m::Parameter(1), m::Parameter(2)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   ASSERT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Power(param1, op::Negate(param2))));
+              GmockMatch(m::Multiply(
+                  m::Parameter(0),
+                  m::Power(m::Parameter(1), m::Negate(m::Parameter(2))))));
 }
 
 // A / Const => A * InvertedConst
 TEST_F(AlgebraicSimplifierTest, DivideByConstant) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {3});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -535,18 +747,18 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kDivide,
                                                       param0, constant));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Constant()));
+              GmockMatch(m::Multiply(m::Parameter(0), m::Constant())));
 }
 
 // pow(pow(A, X), Y) => pow(A, X*Y)
 TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {7});
   HloComputation::Builder builder(TestName());
   HloInstruction* base = builder.AddInstruction(
@@ -560,17 +772,19 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kPower,
                                                       inner_power, exp2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(),
-              op::Power(base, op::Multiply(exp1, exp2)));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Power(m::Op().Is(base),
+                          m::Multiply(m::Op().Is(exp1), m::Op().Is(exp2)))));
 }
 
 // Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex
 // numbers.
 TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) {
+  auto m = CreateNewVerifiedModule();
   Shape r1c64 = ShapeUtil::MakeShape(C64, {7});
   HloComputation::Builder builder(TestName());
   HloInstruction* base = builder.AddInstruction(
@@ -584,14 +798,14 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1c64, HloOpcode::kPower,
                                                       inner_power, exp2));
 
-  module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 }
 
 // Test that A/1 is simplified to A for a scalar.
 TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -601,18 +815,18 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, param0, one));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that A/1 is simplified to A for an array.
 TEST_F(AlgebraicSimplifierTest, DivOneArray) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -622,18 +836,18 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) {
   HloInstruction* div = builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kDivide, param0, one));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that complex(real(c), imag(c)) is simplified to c.
 TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   Shape r2c64 = ShapeUtil::MakeShape(C64, {2, 2});
   HloComputation::Builder builder(TestName());
@@ -646,18 +860,18 @@ TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) {
   HloInstruction* cplx = builder.AddInstruction(
       HloInstruction::CreateBinary(r2c64, HloOpcode::kComplex, real, imag));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, cplx);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that real(complex(r,i)) is simplified to r.
 TEST_F(AlgebraicSimplifierTest, RealOfComplex) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -670,18 +884,18 @@ TEST_F(AlgebraicSimplifierTest, RealOfComplex) {
   HloInstruction* real = builder.AddInstruction(
       HloInstruction::CreateUnary(r2f32, HloOpcode::kReal, cplx));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, real);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
 // Test that imag(complex(r,i)) is simplified to i.
 TEST_F(AlgebraicSimplifierTest, ImagOfComplex) {
+  auto m = CreateNewVerifiedModule();
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -694,18 +908,18 @@ TEST_F(AlgebraicSimplifierTest, ImagOfComplex) {
   HloInstruction* imag = builder.AddInstruction(
       HloInstruction::CreateUnary(r2f32, HloOpcode::kImag, cplx));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, imag);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param1);
 }
 
 // Test that get_element(make_tuple({A,B}),1) is simplified to B
 TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -721,18 +935,18 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) {
   HloInstruction* add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, get, param2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, add);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param1, param2));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(1), m::Parameter(2))));
 }
 
 // Test that exp(A)/exp(B) is simplified to exp(A-B)
 TEST_F(AlgebraicSimplifierTest, ExpDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -746,21 +960,23 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kDivide, exp0, exp1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(op::Exp(param0), op::Exp(param1)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Exp(m::Parameter(0)), m::Exp(m::Parameter(1)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Exp(op::Subtract(param0, param1)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Exp(m::Subtract(m::Parameter(0), m::Parameter(1)))));
 }
 
 // Test that exp(A)*exp(B) is simplified to exp(A+B)
 TEST_F(AlgebraicSimplifierTest, ExpMul) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -774,21 +990,22 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, exp0, exp1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Exp(param0), op::Exp(param1)));
+              GmockMatch(m::Multiply(m::Exp(m::Parameter(0)),
+                                     m::Exp(m::Parameter(1)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Exp(op::Add(param0, param1)));
+              GmockMatch(m::Exp(m::Add(m::Parameter(0), m::Parameter(1)))));
 }
 
 // Test that pow(exp(A), B) is simplified to exp(A*B)
 TEST_F(AlgebraicSimplifierTest, PowExp) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -800,21 +1017,22 @@ TEST_F(AlgebraicSimplifierTest, PowExp) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, exp0, param1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Power(op::Exp(param0), param1));
+              GmockMatch(m::Power(m::Exp(m::Parameter(0)), m::Parameter(1))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Exp(op::Multiply(param0, param1)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Exp(m::Multiply(m::Parameter(0), m::Parameter(1)))));
 }
 
 // Test that ln(pow(A, B)) is simplified to ln(A)*B
 TEST_F(AlgebraicSimplifierTest, LnPow) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -826,21 +1044,22 @@ TEST_F(AlgebraicSimplifierTest, LnPow) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, pow));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Log(op::Power(param0, param1)));
+              GmockMatch(m::Log(m::Power(m::Parameter(0), m::Parameter(1)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Log(param0), param1));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Multiply(m::Log(m::Parameter(0)), m::Parameter(1))));
 }
 
 // Test that ln(exp(A)) is simplified to A
 TEST_F(AlgebraicSimplifierTest, LnExp) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -850,19 +1069,20 @@ TEST_F(AlgebraicSimplifierTest, LnExp) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, exp0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Log(m::Exp(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
 }
 
 // Test that ln(exp(A)/exp(B)) is simplified to A-B
 TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -878,21 +1098,23 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32, HloOpcode::kLog, div));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Log(op::Divide(op::Exp(param0), op::Exp(param1))));
+              GmockMatch(m::Log(m::Divide(m::Exp(m::Parameter(0)),
+                                          m::Exp(m::Parameter(1))))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Subtract(m::Parameter(0), m::Parameter(1))));
 }
 
 // Test that pow(A, 0) where A is a scalar is simplified to the scalar
 // constant 1.
 TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -902,21 +1124,22 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(zero))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   EXPECT_EQ(root->literal().GetFirstElement<float>(), 1);
 }
 
 // Test that pow(A, 0) where A is not a scalar is simplified to broadcast(1).
 TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {42});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -926,16 +1149,16 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kPower, param0, zero));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(zero))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast());
+  EXPECT_THAT(root, GmockMatch(m::Broadcast()));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), r1f32))
       << ShapeUtil::HumanString(root->shape());
   EXPECT_EQ(root->dimensions().size(), 0);
@@ -945,6 +1168,7 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
 
 // Test that pow(A, 1) is simplified to A.
 TEST_F(AlgebraicSimplifierTest, Pow1) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -954,19 +1178,20 @@ TEST_F(AlgebraicSimplifierTest, Pow1) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, one));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, one));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(one))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
 }
 
 // Test that pow(A, 2) is simplified to A*A.
 TEST_F(AlgebraicSimplifierTest, Pow2) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -976,19 +1201,21 @@ TEST_F(AlgebraicSimplifierTest, Pow2) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kPower, param0, two));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, two));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(two))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(0))));
 }
 
 // Test that pow(A, -1) is simplified to 1/A.
 TEST_F(AlgebraicSimplifierTest, PowNegative1) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -998,22 +1225,23 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32, HloOpcode::kPower,
                                                       param0, negative_one));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(negative_one))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Divide(op::Broadcast(), param0));
+  EXPECT_THAT(root, GmockMatch(m::Divide(m::Broadcast(), m::Parameter(0))));
   EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kBroadcast);
   EXPECT_EQ(root->operand(0)->operand(0)->literal().GetFirstElement<float>(),
             1);
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* lhs = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {3, 3, 0}), "lhs"));
@@ -1046,17 +1274,17 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {3, 3, 3}), lhs, rhs, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(builder.Build());
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
-              op::Convolution(lhs, rhs));
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
-              op::Broadcast(op::Constant()));
+  m->AddEntryComputation(builder.Build());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Convolution(m::Op().Is(lhs), m::Op().Is(rhs))));
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Broadcast(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1081,24 +1309,24 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
   builder.AddInstruction(HloInstruction::CreateReduceWindow(
       ShapeUtil::MakeShape(F32, {5, 2}), param,
       builder.AddInstruction(
           HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f))),
       window, add_computation));
-  module().AddEntryComputation(builder.Build());
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
-              op::ReduceWindow(param, op::Constant()));
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
-              op::Broadcast(op::Constant()));
+  m->AddEntryComputation(builder.Build());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::ReduceWindow(m::Parameter(0), m::Constant())));
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Broadcast(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1115,17 +1343,17 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
       builder.AddInstruction(
           HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f))),
       padding));
-  module().AddEntryComputation(builder.Build());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
-              op::Pad(param, op::Constant()));
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
-              op::Broadcast(op::Constant()));
+  m->AddEntryComputation(builder.Build());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Constant())));
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Broadcast(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
 
   auto builder = HloComputation::Builder(TestName());
@@ -1139,39 +1367,40 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
       ShapeUtil::MakeShape(F32, {3, 2}), broadcast));
 
   auto computation = builder.Build();
-  module().AddEntryComputation(std::move(computation));
+  m->AddEntryComputation(std::move(computation));
 
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
-              op::Reshape(op::Broadcast(op::Reshape(op))));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Reshape(m::Broadcast(m::Reshape(m::Op().Is(op))))));
 
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(module().entry_computation()->root_instruction(), op);
+  EXPECT_THAT(m->entry_computation()->root_instruction(), op);
 }
 
 // Test that convert(A, $TYPE) is simplified to A if A is of type $TYPE.
 TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* input = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Convert(m::Op().Is(input))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), input);
 }
 
 // Test that copies are removed.
 TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
+  auto m = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -1179,18 +1408,19 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
 }
 
 TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1201,24 +1431,30 @@ TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
       ShapeUtil::MakeShape(F32, {1, 14, 14, 64}), HloOpcode::kCopy, param));
   *copy->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({1, 2, 0, 3});
-  auto computation = module().AddEntryComputation(builder.Build());
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param));
+  auto computation = m->AddEntryComputation(builder.Build());
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier1(/*is_layout_sensitive=*/true,
-                                  non_bitcasting_callback());
-  ASSERT_FALSE(simplifier1.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier1(options);
+  ASSERT_FALSE(simplifier1.Run(m.get()).ValueOrDie());
   // Verify that the copy is not replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier2(/*is_layout_sensitive=*/true,
-                                  bitcasting_callback());
-  ASSERT_TRUE(simplifier2.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options2(bitcasting_callback());
+  options2.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier2(options2);
+  ASSERT_TRUE(simplifier2.Run(m.get()).ValueOrDie());
   // Verify that the copy is replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
 }
 
 // Test that unary concatenates are removed.
 TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {100});
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 = builder.AddInstruction(
@@ -1226,19 +1462,20 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
   builder.AddInstruction(
       HloInstruction::CreateConcatenate(param0->shape(), {param0}, 0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Concatenate(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
 }
 
 // Test that empty operands of concatenates are removed.
 TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
+  auto m = CreateNewVerifiedModule();
   const int kParamLength = 100;
   Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
   HloComputation::Builder builder(TestName());
@@ -1255,22 +1492,24 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, param0, param0, empty_slice, param1}, 0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(
-      computation->root_instruction(),
-      op::Concatenate(empty_literal, param0, param0, empty_slice, param1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Concatenate(
+                  m::Op().Is(empty_literal), m::Parameter(0), m::Parameter(0),
+                  m::Op().Is(empty_slice), m::Parameter(1))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Concatenate(param0, param0, param1));
+              GmockMatch(m::Concatenate(m::Parameter(0), m::Parameter(0),
+                                        m::Parameter(1))));
 }
 
 // Test that reduce of concat is simplified.
 TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
+  auto m = CreateNewVerifiedModule();
   const int kParamLength = 100;
   Shape r3f32 =
       ShapeUtil::MakeShape(F32, {kParamLength, kParamLength, kParamLength});
@@ -1296,7 +1535,7 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
   Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
   Shape reduce_shape = ShapeUtil::MakeShape(F32, {kParamLength});
@@ -1306,20 +1545,21 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
   builder.AddInstruction(HloInstruction::CreateReduce(
       reduce_shape, Concatenate, zero, {1, 2}, add_computation));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Map(op::Map(op::Reduce(param0, zero), op::Reduce(param1, zero)),
-              op::Reduce(param2, zero)));
+      GmockMatch(m::Map(m::Map(m::Reduce(m::Parameter(0), m::Op().Is(zero)),
+                               m::Reduce(m::Parameter(1), m::Op().Is(zero))),
+                        m::Reduce(m::Parameter(2), m::Op().Is(zero)))));
 }
 
 // Test a concatenate with only empty operands is removed.
 TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
+  auto m = CreateNewVerifiedModule();
   const int kParamLength = 100;
   Shape r1f32 = ShapeUtil::MakeShape(F32, {kParamLength});
   HloComputation::Builder builder(TestName());
@@ -1334,20 +1574,21 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       result_shape, {empty_literal, empty_slice}, 0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Concatenate(empty_literal, empty_slice));
+              GmockMatch(m::Concatenate(m::Op().Is(empty_literal),
+                                        m::Op().Is(empty_slice))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), empty_literal);
 }
 
 // Test that concat with a scalar broadcast becomes a pad.
 TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) {
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {100});
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   HloComputation::Builder builder(TestName());
@@ -1360,17 +1601,88 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) {
   builder.AddInstruction(HloInstruction::CreateConcatenate(
       ShapeUtil::MakeShape(F32, {200}), {broadcast, param0}, 0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Parameter(1))));
+}
+
+TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) {
+  auto m = CreateNewVerifiedModule();
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {100, 99});
+  Shape concat_shape = ShapeUtil::MakeShape(F32, {50, 80});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r2f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r2f32, "param1"));
+
+  HloInstruction* slice0 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{0, 0},
+      /*limit_indices=*/{50, 10}, /*strides=*/{1, 1}));
+
+  // Cannot merge 'slice0' and 'slice1' because of different start indices in
+  // dimension 0.
+  HloInstruction* slice1 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 10},
+      /*limit_indices=*/{100, 20}, /*strides=*/{1, 1}));
+
+  // Cannot merge 'slice1' and 'slice2' because of stride in dimension 2.
+  HloInstruction* slice2 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 20},
+      /*limit_indices=*/{100, 40}, /*strides=*/{1, 2}));
+
+  // Cannot merge 'slice2' and 'slice3' because of stride in dimension 2.
+  HloInstruction* slice3 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 40},
+      /*limit_indices=*/{100, 50}, /*strides=*/{1, 1}));
+
+  // Can merge 'slice3' and 'slice4'.
+  HloInstruction* slice4 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 50},
+      /*limit_indices=*/{100, 60}, /*strides=*/{1, 1}));
+
+  // Can merge 'slice4' and 'slice5'.
+  HloInstruction* slice5 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 60},
+      /*limit_indices=*/{100, 70}, /*strides=*/{1, 1}));
+
+  // Cannot merge 'slice5' and 'slice6' because of overlap.
+  HloInstruction* slice6 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 69},
+      /*limit_indices=*/{100, 79}, /*strides=*/{1, 1}));
+
+  // Cannot merge 'slice6' and 'slice7' because of slicing from a different
+  // parameter.
+  HloInstruction* slice7 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param1, /*start_indices=*/{50, 79},
+      /*limit_indices=*/{100, 89}, /*strides=*/{1, 1}));
+
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      concat_shape,
+      {slice0, slice1, slice2, slice3, slice4, slice5, slice6, slice7}, 1));
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1));
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  auto s = m::Slice(m::Parameter(0));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Concatenate(s, s, s, s, s, m::Slice(m::Parameter(1)))));
+  // The operand 3 should be a merge of 'slice3', 'slice4' and 'slice5', so its
+  // shape should have dimensions {50, 30}.
+  EXPECT_TRUE(
+      ShapeUtil::Equal(computation->root_instruction()->operand(3)->shape(),
+                       ShapeUtil::MakeShape(F32, {50, 30})));
+  EXPECT_EQ(computation->root_instruction()->operand(3)->slice_starts(1), 40);
 }
 
 // Test that a simplification which changes layouts is not performed if layout
 // sensitive is true.
 TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1378,25 +1690,29 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   // Set to different layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0});
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   // Copy has not been removed.
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 }
 
 // Test that a simplification which preserves layouts is performed if layout
 // sensitive is true.
 TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1404,17 +1720,19 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   HloInstruction* copy = builder.AddInstruction(
       HloInstruction::CreateUnary(param0->shape(), HloOpcode::kCopy, param0));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   // Set to same layouts.
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Copy has been removed.
   EXPECT_THAT(computation->root_instruction(), param0);
@@ -1423,6 +1741,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
 // Test that a reshape which could be replaced with a bitcast is not if
 // add_bitcasts is false.
 TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1435,20 +1754,24 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   *reshape->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3, 4, 5});
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   // Reshape is not replaced with a bitcast.
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 }
 
 // Test transforming reshapes and transposes of rng.
 TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* zero = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
@@ -1465,21 +1788,22 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
                                 ShapeUtil::MakeShape(F32, {4}), transpose))
                             ->shape();
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
-  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(
+      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  // Verify that that reshape(transpose(rng)) is replace by a single rng of the
+  // Verify that reshape(transpose(rng)) is replace by a single rng of the
   // same shape as the reshape.
-  EXPECT_THAT(computation->root_instruction(), op::Rng());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Rng()));
   EXPECT_TRUE(ShapeUtil::Equal(computation->root_instruction()->shape(),
                                reshape_shape));
 }
 
 // Test transforming reshapes to bitcasts under various conditions.
 TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1511,25 +1835,29 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   builder.AddInstruction(HloInstruction::CreateTuple(
       {transformable_reshape, dimensions_wrong_reshape, layout_wrong_reshape}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(transformable_reshape, dimensions_wrong_reshape,
-                        layout_wrong_reshape));
+              GmockMatch(m::Tuple(m::Op().Is(transformable_reshape),
+                                  m::Op().Is(dimensions_wrong_reshape),
+                                  m::Op().Is(layout_wrong_reshape))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 bitcasting_callback());
-  simplifier.Run(&module()).ValueOrDie();
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  simplifier.Run(m.get()).ValueOrDie();
 
   // Verify that only the first reshape is replaced.
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Tuple(op::Bitcast(), dimensions_wrong_reshape, layout_wrong_reshape));
+      GmockMatch(m::Tuple(m::Bitcast(), m::Op().Is(dimensions_wrong_reshape),
+                          m::Op().Is(layout_wrong_reshape))));
 }
 
 // Regression test for a bug where if we failed to sink a reshape, we'd set the
 // 'changed' bit in AlgebraicSimplifier to false.
 TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   // This add (param0 + 0) can be simplified.
@@ -1544,15 +1872,16 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {4}), add));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
-  module().AddEntryComputation(builder.Build());
-  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(
+      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  m->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
 
 // Regression test for a bug where if we failed to sink a reshape, we'd set the
 // 'changed' bit in AlgebraicSimplifier to false.
 TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   // This add (param0 + 0) can be simplified.
@@ -1568,13 +1897,14 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) {
       HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add,
                                       /*broadcast_dimensions=*/{0, 1}));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
-  module().AddEntryComputation(builder.Build());
-  EXPECT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(
+      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  m->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1588,19 +1918,23 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({0, 1, 2, 3});
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1614,19 +1948,23 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
   *transpose->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({3, 1, 2, 0});
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1639,19 +1977,20 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {1, 2, 1, 1, 2, 1}), reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Reshape(param0)));
+              GmockMatch(m::Reshape(m::Reshape(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1666,18 +2005,22 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 2, 1}),
       HloOpcode::kCopy, copy1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Copy(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1690,21 +2033,23 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
   builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(F32, {4, 3, 2}), transpose1, {1, 0, 2}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Op().Is(transpose1))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Parameter(0))));
   EXPECT_EQ(std::vector<int64>({2, 1, 0}),
             computation->root_instruction()->dimensions());
 }
 
 // Test merging reshape and broadcast.
 TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {5}), "param0"));
@@ -1713,20 +2058,21 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {1, 2, 3, 5, 1}), reshape1, {0, 3, 2}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Broadcast(op::Reshape(param0)));
+              GmockMatch(m::Broadcast(m::Reshape(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
 }
 
 // Test merging broadcast and reshape.
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {2, 3}), "param0"));
@@ -1735,19 +2081,20 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2}), broadcast1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param0)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {1}), "param"));
@@ -1756,20 +2103,20 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {3}), broadcast));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {4}), "param"));
@@ -1778,21 +2125,22 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), broadcast));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
   EXPECT_THAT(computation->root_instruction()->dimensions(),
               ::testing::ElementsAre(3));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {1}), "param"));
@@ -1801,16 +2149,16 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 1}), broadcast));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
   const std::vector<int64> broadcast_dims =
       computation->root_instruction()->dimensions();
   EXPECT_EQ(1, broadcast_dims.size());
@@ -1818,6 +2166,7 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto param = builder.AddInstruction(HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {4}), "param"));
@@ -1826,115 +2175,119 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 8}), broadcast));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(HloInstruction::CreateIota(
       ShapeUtil::MakeShape(F32, {1, 2, 3, 7, 12, 1}), 2));
   Shape result_shape = ShapeUtil::MakeShape(F32, {2, 3, 7, 2, 1, 3, 2});
   builder.AddInstruction(HloInstruction::CreateReshape(result_shape, iota));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {1, 1}), 0));
   auto result_shape = iota->shape();
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   auto root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
   EXPECT_EQ(0.0f, root->operand(0)->literal().GetFirstElement<float>());
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2_6) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2}), 1));
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {6}), iota));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 4}), 2));
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 4}), iota));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
   EXPECT_EQ(Cast<HloIotaInstruction>(computation->root_instruction())
                 ->iota_dimension(),
             3);
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 2}), 2));
   builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(F32, {6, 1, 1, 2}), iota));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
   const int64 iota_dim =
       Cast<HloIotaInstruction>(computation->root_instruction())
           ->iota_dimension();
@@ -1942,21 +2295,23 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) {
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4x2_6x8) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto iota = builder.AddInstruction(
       HloInstruction::CreateIota(ShapeUtil::MakeShape(F32, {3, 2, 4, 2}), 2));
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {6, 8}), iota));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 }
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
@@ -1976,14 +2331,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
   builder.AddInstruction(HloInstruction::CreatePad(
       ShapeUtil::MakeShape(F32, {2, 2}), param, zero, no_padding));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -2009,11 +2364,10 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
   HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
       ShapeUtil::MakeShape(F32, {11, 5}), param, zero, padding));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
 
   auto has_negative_padding = [](const HloInstruction* pad) {
     for (auto& padding_dimension : pad->padding_config().dimensions()) {
@@ -2025,16 +2379,54 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
     return false;
   };
 
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
   EXPECT_TRUE(has_negative_padding(pad));
 
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Pad(m::Parameter(0), m::Op().Is(zero)))));
   EXPECT_FALSE(
       has_negative_padding(computation->root_instruction()->operand(0)));
 }
 
+TEST_F(AlgebraicSimplifierTest, TrivialInteriorPadding) {
+  // Verify that a pad instruction with interior padding on one-sized
+  // dimensions, removes the interior padding.
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {2, 1}), "param"));
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+  PaddingConfig padding;
+  for (int i = 0; i < 2; ++i) {
+    auto dimension = padding.add_dimensions();
+    dimension->set_edge_padding_low(3);
+    dimension->set_edge_padding_high(3);
+    dimension->set_interior_padding(i * 3);
+  }
+  HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
+      ShapeUtil::MakeShape(F32, {8, 7}), param, zero, padding));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(default_options_);
+
+  ASSERT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
+  ASSERT_TRUE(HasInteriorPadding(pad->padding_config()));
+
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
+  EXPECT_FALSE(
+      HasInteriorPadding(computation->root_instruction()->padding_config()));
+}
+
 TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
@@ -2043,14 +2435,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {2, 3}), param));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -2066,14 +2458,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
       ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0},
       /*limit_indices=*/{dim0, dim1}, /*strides=*/{1, 1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -2095,22 +2487,75 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) {
       ShapeUtil::MakeShape(F32, {dim0 - 5, dim1 - 9}), original_slice,
       /*start_indices=*/{2, 3},
       /*limit_indices=*/{dim0 - 3, dim1 - 6}, /*strides=*/{1, 1}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Slice(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Parameter(0))));
   EXPECT_EQ(computation->root_instruction()->slice_starts(0), 3);
   EXPECT_EQ(computation->root_instruction()->slice_starts(1), 5);
   EXPECT_EQ(computation->root_instruction()->slice_limits(0), dim0 - 2);
   EXPECT_EQ(computation->root_instruction()->slice_limits(1), dim1 - 4);
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfReshapeToReshapeOfSlice) {
+  HloComputation::Builder builder(TestName());
+  const int64 dim0 = 11;
+  const int64 dim1 = 12;
+  const int64 dim2 = 13;
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {dim0 * dim1, dim2}), "param"));
+  HloInstruction* original_reshape =
+      builder.AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(F32, {dim0, dim1, dim2}), param));
+
+  builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {dim0 - 2, dim1, dim2}), original_reshape,
+      /*start_indices=*/{0, 0, 0},
+      /*limit_indices=*/{dim0 - 2, dim1, dim2}, /*strides=*/{1, 1, 1}));
+  auto module = CreateNewVerifiedModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Reshape(m::Parameter(0)))));
+
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Slice(m::Parameter(0)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 144, 25, 1, 512}), "param"));
+  HloInstruction* original_reshape =
+      builder.AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(F32, {3600, 512}), param));
+
+  builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {960, 512}), original_reshape,
+      /*start_indices=*/{0, 0},
+      /*limit_indices=*/{960, 512}, /*strides=*/{1, 1}));
+  auto module = CreateNewVerifiedModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Reshape(m::Parameter(0)))));
+
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
 TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -2118,14 +2563,86 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) {
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
   builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), keys);
 }
 
+TEST_F(AlgebraicSimplifierTest, ReplacePermutationSortWithScatter) {
+  const char* hlo_string = R"(
+    HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} iota(), iota_dimension=1
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={1}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_enable_permutation_sort_replacement(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root,
+              GmockMatch(m::Tuple(
+                  m::Iota(),
+                  m::Scatter(m::Iota(), m::Concatenate(m::Iota(), m::Reshape()),
+                             m::Reshape()))));
+}
+
+TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortIfNonIntegral) {
+  // Same as ReplacePermutationSortWithScatter except that the iota has F32
+  // type.
+  const char* hlo_string = R"(
+    HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = f32[64,8732]{1,0} iota(), iota_dimension=1
+      sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      gte = f32[64,8732]{1,0} get-tuple-element(sort), index=1
+      ROOT sort2 = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(gte, values), dimensions={1}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_enable_permutation_sort_replacement(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortWrongDimensions) {
+  // Same as ReplacePermutationSortWithScatter except that the sort dimensions
+  // don't match.
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} iota(), iota_dimension=1
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_enable_permutation_sort_replacement(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
 TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -2133,16 +2650,188 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   Shape values_shape = ShapeUtil::MakeShape(S32, {5, 0});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  auto values = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, values_shape, "values"));
+  auto values0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, values_shape, "values0"));
+  auto values1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, values_shape, "values1"));
   builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys, values));
-  auto module = CreateNewModule();
+      ShapeUtil::MakeTupleShape({keys_shape, values_shape, values_shape}), 0,
+      keys, {values0, values1}));
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(), op::Tuple(keys, values));
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Tuple(m::Op().Is(keys), m::Op().Is(values0),
+                                  m::Op().Is(values1))));
+}
+
+// Test that A && True is simplified to A
+TEST_F(AlgebraicSimplifierTest, AndTrue) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_true = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd,
+                                                      param0, const_true));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
+// Test that True && A is simplified to A
+TEST_F(AlgebraicSimplifierTest, AndTrue2) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_true = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd,
+                                                      const_true, param0));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
+// Test that A && False is simplified to False
+TEST_F(AlgebraicSimplifierTest, AndFalse) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_false = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd,
+                                                      param0, const_false));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, const_false);
+}
+
+// Test that False && A is simplified to False
+TEST_F(AlgebraicSimplifierTest, AndFalse2) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_false = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kAnd,
+                                                      const_false, param0));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, const_false);
+}
+
+// Test that A || True is simplified to True
+TEST_F(AlgebraicSimplifierTest, OrTrue) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_true = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, param0, const_true));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kOr);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, const_true);
+}
+
+// Test that True || A is simplified to True
+TEST_F(AlgebraicSimplifierTest, OrTrue2) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_true = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(r0pred, HloOpcode::kOr, const_true, param0));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kOr);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, const_true);
+}
+
+// Test that A || False is simplified to A
+TEST_F(AlgebraicSimplifierTest, OrFalse) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_false = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kOr,
+                                                      param0, const_false));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kOr);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
+// Test that False || A is simplified to A
+TEST_F(AlgebraicSimplifierTest, OrFalse2) {
+  auto m = CreateNewVerifiedModule();
+  Shape r0pred = ShapeUtil::MakeShape(PRED, {});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0pred, "param0"));
+  HloInstruction* const_false = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateBinary(r0pred, HloOpcode::kOr,
+                                                      const_false, param0));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kOr);
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
 }
 
 // Used for TEST_Ps that test merging (or not) of a kPad instruction into a
@@ -2266,18 +2955,18 @@ TEST_P(ConvInputPaddingTest, DoTest) {
           .ValueOrDie(),
       lhs_pad, filter, /*feature_group_count=*/1, window, dnums,
       DefaultPrecisionConfig(2)));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   if (testcase.expected_conv_window.empty()) {
-    ASSERT_FALSE(simplifier.Run(module).ValueOrDie());
+    ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
   } else {
-    ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+    ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
     auto* conv = module->entry_computation()->root_instruction();
     SCOPED_TRACE(module->ToString());
-    ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter()));
+    ASSERT_THAT(conv,
+                GmockMatch(m::Convolution(m::Parameter(), m::Parameter())));
     EXPECT_EQ(window_util::ToString(conv->window()),
               absl::StrCat("size=3x3 ", testcase.expected_conv_window));
   }
@@ -2384,18 +3073,18 @@ TEST_P(ConvFilterPaddingTest, DoIt) {
       input, rhs_pad, /*feature_group_count=*/1, window, dnums,
       precision_config));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   if (testcase.expected_conv_window.empty()) {
-    ASSERT_FALSE(simplifier.Run(module).ValueOrDie());
+    ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
   } else {
-    ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+    ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
     auto* conv = module->entry_computation()->root_instruction();
     SCOPED_TRACE(module->ToString());
-    ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter()));
+    ASSERT_THAT(conv,
+                GmockMatch(m::Convolution(m::Parameter(), m::Parameter())));
     EXPECT_EQ(window_util::ToString(conv->window()),
               absl::StrFormat("size=%dx%d %s",
                               conv->operand(1)->shape().dimensions(2),
@@ -2533,11 +3222,12 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
         /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
     // TODO(b/80488902): verify this module.
-    auto module = HloTestBase::CreateNewModule();
+    auto module = CreateNewUnverifiedModule();
     auto* computation = module->AddEntryComputation(b.Build());
 
-    AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                   bitcasting_callback());
+    AlgebraicSimplifierOptions simplifier_options(bitcasting_callback());
+    simplifier_options.set_is_layout_sensitive(true);
+    AlgebraicSimplifier simplifier(simplifier_options);
     if (!simplifier.Run(module.get()).ValueOrDie()) {
       return "NO_CHANGE";
     }
@@ -2653,24 +3343,22 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
       slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, slice);
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
 
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
-  ASSERT_FALSE(simplifier.Run(module).ValueOrDie());
-
-  root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(scalar_param));
-  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape));
+  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Op().Is(scalar_param))
+                             .WithShapeEqualTo(&slice_shape)));
 }
 
 // Test that reshape(transpose(broadcast(/*scalar value*/))) simplifies to a
@@ -2692,26 +3380,24 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
   HloInstruction* reshape = builder.AddInstruction(
       HloInstruction::CreateReshape(reshape_shape, transpose));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reshape);
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
-
-  root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(forty_two));
-  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape));
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Op().Is(forty_two))
+                             .WithShapeEqualTo(&reshape_shape)));
 }
 
 // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
 TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   // TODO(b/80488902): verify this module.
-  auto module = HloTestBase::CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2766,8 +3452,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reduce_window);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
@@ -2775,7 +3460,8 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 
   // Verify the result
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::ReduceWindow(operand, op::Constant()));
+  EXPECT_THAT(root,
+              GmockMatch(m::ReduceWindow(m::Op().Is(operand), m::Constant())));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
       << ShapeUtil::HumanString(root->shape()) << " vs "
       << ShapeUtil::HumanString(reduce_window_shape);
@@ -2793,7 +3479,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 // ReduceWindow(Convert(op), x).
 TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
   // TODO(b/80488902): verify this module.
-  auto module = HloTestBase::CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2852,8 +3538,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reduce_window);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
@@ -2861,7 +3546,8 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
 
   // Verify the result
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::ReduceWindow(op::Convert(parameter), op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::ReduceWindow(m::Convert(m::Parameter(0)),
+                                               m::Constant())));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
       << ShapeUtil::HumanString(root->shape()) << " vs "
       << ShapeUtil::HumanString(reduce_window_shape);
@@ -2883,12 +3569,11 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   builder.AddInstruction(
       HloInstruction::CreateReverse(shape, a, /*dimensions=*/{2, 3}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(a, root);
@@ -2899,6 +3584,7 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   // Dots add computations to the parent module. Test that, when the HloModule's
   // computations are updated, then iterator invalidation doesn't occur
   // when running on subsequent computations.
+  auto m = CreateNewVerifiedModule();
   Shape r1f32 = ShapeUtil::MakeShape(F32, {1});
   HloComputation::Builder builder(TestName() + ".Dot");
   HloInstruction* x =
@@ -2920,15 +3606,15 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   call_builder.AddInstruction(
       HloInstruction::CreateCall(r1f32, {zero, one}, dot_computation.get()));
 
-  module().AddEmbeddedComputation(std::move(dot_computation));
-  module().AddEntryComputation(call_builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  m->AddEmbeddedComputation(std::move(dot_computation));
+  m->AddEntryComputation(call_builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
 
 // Test that a constant with tuple shape becomes a tuple of constants.
 TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   const float constant_scalar = 7.3f;
   std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
@@ -2937,19 +3623,19 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
   Literal value = LiteralUtil::MakeTuple({&elements[0], &elements[1]});
   builder.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(op::Constant(), op::Constant()));
+              GmockMatch(m::Tuple(m::Constant(), m::Constant())));
 }
 
 // A dynamic-slice is trivial if its start indices are all zeroes and the size
 // of its input equals the size of its output.  In this case, the dynamic slice
 // is equal to its input.
 TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   Shape shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
@@ -2961,17 +3647,17 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
           1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
       /*slice_sizes=*/{10, 100, 1000}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(), op::Parameter());
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Parameter()));
 }
 
 // A dynamic-update-slice is trivial if its start indices are all zeroes and the
 // size of its "update" equals the size of its output.  In this case, the
 // dynamic-update-slice is equal to its update.
 TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
@@ -2994,16 +3680,16 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
       builder.AddInstruction(HloInstruction::CreateParameter(
           3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              op::DynamicSlice(op::Parameter(), op::Parameter()));
+              GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter())));
 }
 
 // Test that two consecutive broadcasts can be merged to one.
 TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloInstruction* input_array = builder.AddInstruction(
@@ -3014,19 +3700,19 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) {
   builder.AddInstruction(
       HloInstruction::CreateBroadcast(r3f32, inner_bcast, {0, 2}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
   EXPECT_THAT(root->dimensions(), ElementsAre(2));
 }
 
 // Test that two consecutive broadcasts can be merged to one.
 TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 3});
   Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 5, 3});
@@ -3040,19 +3726,19 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) {
   builder.AddInstruction(
       HloInstruction::CreateBroadcast(r4f32, inner_bcast, {1, 2, 3}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Parameter(0)));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Parameter(0))));
   EXPECT_THAT(root->dimensions(), ElementsAre(1, 3));
 }
 
 // Test that a broadcast of an iota can be merged to one iota.
 TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
   HloInstruction* iota =
@@ -3060,19 +3746,19 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) {
   Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 2, 2});
   builder.AddInstruction(HloInstruction::CreateBroadcast(r3f32, iota, {0, 2}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Iota());
+  EXPECT_THAT(root, GmockMatch(m::Iota()));
   EXPECT_EQ(Cast<HloIotaInstruction>(root)->iota_dimension(), 2);
 }
 
 // Test that a broadcast of an iota can be merged to one iota.
 TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   Shape r3f32 = ShapeUtil::MakeShape(F32, {2, 5, 3});
   HloInstruction* iota =
@@ -3081,17 +3767,184 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) {
   builder.AddInstruction(
       HloInstruction::CreateBroadcast(r4f32, iota, {1, 2, 3}));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Iota());
+  EXPECT_THAT(root, GmockMatch(m::Iota()));
   EXPECT_EQ(Cast<HloIotaInstruction>(root)->iota_dimension(), 2);
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param = f32[3,4] parameter(0)
+      constant = f32[] constant(0.0)
+      pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[2:3],[0:1]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant())));
+}
+
+TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param = f32[3,4] parameter(0)
+      constant = f32[] constant(0.0)
+      pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[6:7],[9:10]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant())));
+}
+
+TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param = f32[3,4] parameter(0)
+      constant = f32[] constant(0.0)
+      pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[9:10]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param = f32[1,1] parameter(0)
+      constant = f32[] constant(0.0)
+      pad = f32[8,10] pad(f32[1,1] param, f32[] constant), padding=3_4x4_5
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[3:4],[4:5]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Parameter()));
+}
+
+TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param.0 = f32[2] parameter(0)
+      param.1 = f32[1] parameter(1)
+      param.2 = f32[3] parameter(2)
+      concat = f32[6] concatenate(param.0, param.1, param.2), dimensions={0}
+      ROOT slice = f32[1] slice(concat), slice={[2:3]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Parameter(1)));
+}
+
+TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param.0 = f32[2] parameter(0)
+      param.1 = f32[1] parameter(1)
+      param.2 = f32[3] parameter(2)
+      concat = f32[6] concatenate(param.0, param.1, param.2), dimensions={0}
+      ROOT slice = f32[1] slice(concat), slice={[4:5]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Slice(m::Parameter(2))));
+  EXPECT_EQ(root->slice_starts(0), 1);
+  EXPECT_EQ(root->slice_limits(0), 2);
+}
+
+TEST_F(AlgebraicSimplifierTest, NegateNegate) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param.0 = f32[2] parameter(0)
+      neg.0 = f32[2] negate(param.0)
+      ROOT neg.1 = f32[2] negate(neg.0)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Parameter(0)));
+}
+
+TEST_F(AlgebraicSimplifierTest, NotNot) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param.0 = pred[2] parameter(0)
+      not.0 = pred[2] not(param.0)
+      ROOT not.1 = pred[2] not(not.0)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Parameter(0)));
+}
+
 struct PadReduceWindowEffectiveBroadcastCase {
   std::vector<int64> input_spatials;
   std::vector<int64> symmetric_pad_spatials;
@@ -3121,6 +3974,7 @@ class PadReduceWindowEffectiveBroadcastTest
           PadReduceWindowEffectiveBroadcastCase> {};
 
 TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
+  auto m = CreateNewVerifiedModule();
   const auto& param = GetParam();
 
   // a and b are parallel bounds we can either turn into a B F S0 S1 or
@@ -3169,7 +4023,7 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
         HloInstruction::CreateParameter(1, scalar_shape, "p1"));
     builder.AddInstruction(
         HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
-    add_computation = module().AddEmbeddedComputation(builder.Build());
+    add_computation = m->AddEmbeddedComputation(builder.Build());
   }
 
   Window window = window_util::MakeWindow(
@@ -3183,20 +4037,19 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
   builder.AddInstruction(HloInstruction::CreateReduceWindow(
       output_shape, pad, zero, window, add_computation));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
 
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), output_shape));
 
   if (param.should_become_broadcast) {
-    EXPECT_THAT(computation->root_instruction(), op::Broadcast(::testing::_));
+    EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Broadcast()));
   } else {
     EXPECT_THAT(computation->root_instruction(),
-                op::ReduceWindow(::testing::_, zero));
+                GmockMatch(m::ReduceWindow(m::Op(), m::Op().Is(zero))));
   }
 }
 
@@ -3235,6 +4088,7 @@ class DotStrengthReductionTest
       public ::testing::WithParamInterface<
           ::testing::tuple<int, int, int, bool, bool, PrimitiveType>> {};
 TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
+  auto module = CreateNewVerifiedModule();
   int m, k, n;
   bool transpose_lhs, transpose_rhs;
   PrimitiveType element_type;
@@ -3264,10 +4118,9 @@ TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
   dot_dnums.add_rhs_contracting_dimensions(0);
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(&module()));
+  auto computation = module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
   const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
   const bool computation_should_be_modified =
       dot_should_be_transformed || (transpose_lhs && transpose_rhs);
@@ -3295,7 +4148,7 @@ struct DotOfConcatTestSpec {
 };
 
 class DotOfConcatSimplificationTest
-    : public HloVerifiedTestBase,
+    : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<DotOfConcatTestSpec> {};
 
 // Test that we transform
@@ -3303,6 +4156,7 @@ class DotOfConcatSimplificationTest
 // to
 //  add(dot(const_0, A), dot(const_1, B),  dot(const_2, C))
 TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   DotOfConcatTestSpec spec = GetParam();
@@ -3341,20 +4195,20 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
 
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
 
-  auto match_dot_0 = op::Dot(op::Slice(op::Constant()), op::Parameter(0));
-  auto match_dot_1 = op::Dot(op::Slice(op::Constant()), op::Parameter(1));
-  auto match_dot_2 = op::Dot(op::Slice(op::Constant()), op::Parameter(2));
-  EXPECT_THAT(computation->root_instruction(),
-              op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2));
+  auto match_dot_0 = m::Dot(m::Slice(m::Constant()), m::Parameter(0));
+  auto match_dot_1 = m::Dot(m::Slice(m::Constant()), m::Parameter(1));
+  auto match_dot_2 = m::Dot(m::Slice(m::Constant()), m::Parameter(2));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Add(m::Add(match_dot_0, match_dot_1), match_dot_2)));
 }
 
 // Test that we transform
@@ -3362,6 +4216,7 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
 // to
 //  add(dot(A, const_0), dot(B, const_1),  dot(C, const_2))
 TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   DotOfConcatTestSpec spec = GetParam();
@@ -3405,21 +4260,21 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
 
-  auto match_dot_0 = op::Dot(op::Parameter(0), op::Slice(op::Constant()));
-  auto match_dot_1 = op::Dot(op::Parameter(1), op::Slice(op::Constant()));
-  auto match_dot_2 = op::Dot(op::Parameter(2), op::Slice(op::Constant()));
-  auto match_dot_3 = op::Dot(op::Parameter(3), op::Slice(op::Constant()));
-  EXPECT_THAT(computation->root_instruction(),
-              op::Add(op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2),
-                      match_dot_3));
+  auto match_dot_0 = m::Dot(m::Parameter(0), m::Slice(m::Constant()));
+  auto match_dot_1 = m::Dot(m::Parameter(1), m::Slice(m::Constant()));
+  auto match_dot_2 = m::Dot(m::Parameter(2), m::Slice(m::Constant()));
+  auto match_dot_3 = m::Dot(m::Parameter(3), m::Slice(m::Constant()));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Add(m::Add(m::Add(match_dot_0, match_dot_1), match_dot_2),
+                        match_dot_3)));
 }
 
 DotOfConcatTestSpec kDotOfConcatTestSpecs[] = {
@@ -3433,6 +4288,7 @@ DotOfConcatTestSpec kDotOfConcatTestSpecs[] = {
 // Test that DynamicUpdateSlice update param with any dimension equal to zero
 // gets removed.
 TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   const Shape dslice_shape = ShapeUtil::MakeShape(F32, {10});
   HloInstruction* const operand = builder.AddInstruction(
@@ -3445,11 +4301,10 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       dslice_shape, operand, update, start_indices));
   const HloComputation* const computation =
-      module().AddEntryComputation(builder.Build());
+      m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), operand);
 }
 
@@ -3468,7 +4323,7 @@ struct DotOfGatherTestSpec {
 };
 
 class DotOfGatherSimplificationTest
-    : public HloVerifiedTestBase,
+    : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<DotOfGatherTestSpec> {};
 
 // input: dot(DS(ctA), ctB))
@@ -3477,6 +4332,7 @@ class DotOfGatherSimplificationTest
 // output: DS(dot(ctA, ctB))
 // => output dimensions: DS ({M x N}, {s, 0}, {1, N}) => {1 x N}.
 TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   DotOfGatherTestSpec spec = GetParam();
@@ -3523,10 +4379,9 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, ds, rhs, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
@@ -3536,8 +4391,8 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
               HloOpcode::kDynamicSlice);
   } else {
     EXPECT_THAT(computation->root_instruction(),
-                op::DynamicSlice(op::Dot(op::Constant(), op::Constant()),
-                                 op::Concatenate()));
+                GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
+                                           m::Concatenate())));
   }
 }
 
@@ -3547,6 +4402,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
 // output: DS(dot(ctA, ctB))
 // => output dimensions: DS ({M x N}, {0, s}, {M, 1}) => {M x 1}.
 TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   DotOfGatherTestSpec spec = GetParam();
@@ -3593,10 +4449,9 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, ds, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto computation = module().AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module()));
+  auto computation = m->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
@@ -3606,8 +4461,8 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
               HloOpcode::kDynamicSlice);
   } else {
     EXPECT_THAT(computation->root_instruction(),
-                op::DynamicSlice(op::Dot(op::Constant(), op::Constant()),
-                                 op::Concatenate()));
+                GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
+                                           m::Concatenate())));
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index 1ed6142dcecdc830cb7b8386e0cc20a2ea54aa7f..ef5e211646e7b0b66b8e6c09948be58063422943 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -176,13 +176,13 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
 }
 
 StatusOr<std::vector<const ShapedBuffer*>> AllocationTracker::Resolve(
-    const GlobalDataHandle& data) {
+    const GlobalDataHandle& data) const {
   tensorflow::mutex_lock lock(mutex_);
   return AllocationTracker::ResolveInternal(data);
 }
 
 StatusOr<const ShapedBuffer*> AllocationTracker::ResolveForReplica(
-    const GlobalDataHandle& data, int replica_id) {
+    const GlobalDataHandle& data, int replica_id) const {
   tensorflow::mutex_lock lock(mutex_);
   TF_ASSIGN_OR_RETURN(std::vector<const ShapedBuffer*> replicated_buffers,
                       ResolveInternal(data));
@@ -196,7 +196,7 @@ StatusOr<const ShapedBuffer*> AllocationTracker::ResolveForReplica(
 }
 
 StatusOr<std::vector<const ShapedBuffer*>> AllocationTracker::ResolveInternal(
-    const GlobalDataHandle& data) {
+    const GlobalDataHandle& data) const {
   VLOG(2) << "resolve:" << data.handle();
   auto it = handle_to_shaped_buffers_.find(data.handle());
   if (it == handle_to_shaped_buffers_.end()) {
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h
index 43feccee3c67152c6f61098bb98d546379848b8c..98d1a302a9f66f4a00e05d62837a79133e222687 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.h
+++ b/tensorflow/compiler/xla/service/allocation_tracker.h
@@ -65,13 +65,13 @@ class AllocationTracker {
   // replica, or provide an error status to say whether any of those buffers
   // were not found (or found, but found deallocated).
   StatusOr<std::vector<const ShapedBuffer*>> Resolve(
-      const GlobalDataHandle& data);
+      const GlobalDataHandle& data) const;
 
   // Resolves a handle from an XLA client and replica id to a shaped buffer, or
   // provide an error status to say whether it was not found (or found, but
   // found deallocated).
   StatusOr<const ShapedBuffer*> ResolveForReplica(const GlobalDataHandle& data,
-                                                  int replica_id);
+                                                  int replica_id) const;
 
  private:
   // Data structure encapsulating single memory allocation on the device.
@@ -87,7 +87,7 @@ class AllocationTracker {
   // Internal helper which resolves the given GlobalDataHandle to a
   // list of ScopedShapedBuffers.
   StatusOr<std::vector<const ShapedBuffer*>> ResolveInternal(
-      const GlobalDataHandle& data) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+      const GlobalDataHandle& data) const EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   // Internal helper which registers a vector of shaped buffers, one per
   // replica.  ShapedBufferTy is either ScopedShapedBuffer or ShapedBuffer.  If
@@ -113,7 +113,7 @@ class AllocationTracker {
   // maintained per device ordinal.
   using AllocationMap = absl::flat_hash_map<const void*, Allocation>;
 
-  tensorflow::mutex mutex_;
+  mutable tensorflow::mutex mutex_;
 
   // Backend to use with this tracker. The backend supplies the memory allocator
   // to use when deallocating memory.
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..362bc44a1cf377b51c5519c6ab5e0d9628e80e58
--- /dev/null
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -0,0 +1,285 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/ar_crs_combiner.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+namespace {
+
+namespace m = match;
+
+// If the argument instruction is a CRS in the sequence
+// AR -> Convert -> Add -> CRS
+// then return the AR in the sequence.
+// TODO(b/117554291): Rewrite this to recognize more general patterns,
+// not just the specific one of AR -> Add -> Convert -> CRS.
+absl::optional<HloInstruction*> MatchesArCrsPattern(
+    HloInstruction* instruction) {
+  HloInstruction *ar, *convert, *add, *crs;
+  if (Match(instruction,
+            m::CrossReplicaSum(
+                &crs, m::Add(&add, m::Op(),
+                             m::Convert(&convert,
+                                        m::CrossReplicaSum(&ar, m::Op()))))) &&
+      ar->users().size() == 1 && ar->shape().element_type() == BF16 &&
+      convert->shape().element_type() == F32 && !crs->all_reduce_id()) {
+    return ar;
+  }
+  return absl::optional<HloInstruction*>();
+}
+
+}  // namespace
+
+absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
+    HloInstruction* instruction) {
+  CHECK_EQ(HloOpcode::kParameter, instruction->opcode());
+  HloComputation* computation = instruction->parent();
+  auto caller_instructions = call_graph_->GetComputationCallers(computation);
+  if (caller_instructions.size() == 1) {
+    auto caller_instruction = caller_instructions[0];
+    if (caller_instruction->opcode() == HloOpcode::kWhile) {
+      return caller_instruction;
+    }
+  }
+  return absl::optional<HloInstruction*>();
+}
+
+std::vector<HloInstruction*> ArCrsCombiner::GetAllTuples(
+    HloInstruction* instruction) {
+  if (instruction->opcode() == HloOpcode::kTuple) {
+    return {instruction};
+  }
+  if (instruction->opcode() == HloOpcode::kDomain) {
+    return GetAllTuples(instruction->operands()[0]);
+  }
+  if (instruction->opcode() == HloOpcode::kParameter) {
+    auto maybe_while = WhileFromBodyParameter(instruction);
+    if (!maybe_while) {
+      return {};
+    }
+    auto while_instr = *maybe_while;
+    auto init_tuples = GetAllTuples(while_instr->while_init());
+    auto body_tuples =
+        GetAllTuples(while_instr->while_body()->root_instruction());
+    if (init_tuples.empty() || body_tuples.empty()) {
+      return {};
+    }
+    init_tuples.insert(init_tuples.end(), body_tuples.begin(),
+                       body_tuples.end());
+    return init_tuples;
+  }
+  if (instruction->opcode() == HloOpcode::kGetTupleElement) {
+    std::vector<HloInstruction*> result_tuples;
+    for (auto tuple : GetAllTuples(instruction->operands()[0])) {
+      auto tmp_tuples =
+          GetAllTuples(tuple->mutable_operand(instruction->tuple_index()));
+      if (tmp_tuples.empty()) {
+        return {};
+      }
+      result_tuples.insert(result_tuples.end(), tmp_tuples.begin(),
+                           tmp_tuples.end());
+    }
+    return result_tuples;
+  }
+  return {};
+}
+
+bool ArCrsCombiner::TupleElementsComputeSameValue(
+    HloInstruction* tuple_shaped_instruction, int64 i1, int64 i2,
+    absl::flat_hash_map<int64, int64>* visited_pairs) {
+  auto tuples = GetAllTuples(tuple_shaped_instruction);
+  if (tuples.empty()) {
+    return false;
+  }
+  for (auto tuple : tuples) {
+    CHECK_EQ(tuple->opcode(), HloOpcode::kTuple);
+    if (!InstructionsComputeSameValue(tuple->mutable_operand(i1),
+                                      tuple->mutable_operand(i2),
+                                      visited_pairs)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/* static */
+bool ArCrsCombiner::TestInstructionsComputeSameValue(HloInstruction* i1,
+                                                     HloInstruction* i2) {
+  ArCrsCombiner combiner(/*num_spatial_partitions=*/2);
+  auto module = i1->parent()->parent();
+  CHECK_EQ(module, i2->parent()->parent());
+  combiner.call_graph_ = CallGraph::Build(module);
+  absl::flat_hash_map<int64, int64> visited_pairs;
+  return combiner.InstructionsComputeSameValue(i1, i2, &visited_pairs);
+}
+
+bool ArCrsCombiner::InstructionsComputeSameValue(
+    HloInstruction* i1, HloInstruction* i2,
+    absl::flat_hash_map<int64, int64>* visited_pairs) {
+  if (i1 == i2) {
+    return true;
+  }
+  auto uid1 = i1->unique_id();
+  auto uid2 = i2->unique_id();
+  auto min_uid = std::min(uid1, uid2);
+  auto max_uid = std::max(uid1, uid2);
+  auto it = visited_pairs->find(min_uid);
+  if (it != visited_pairs->end() && max_uid == it->second) {
+    return true;
+  }
+  auto opcode1 = i1->opcode();
+  auto operands1 = i1->operands();
+  if (opcode1 != i2->opcode() || operands1.size() != i2->operands().size()) {
+    return false;
+  }
+  visited_pairs->emplace(min_uid, max_uid);
+  for (int i = 0; i < operands1.size(); ++i) {
+    auto operand1 = operands1[i];
+    auto operand2 = i2->operands()[i];
+    if (!InstructionsComputeSameValue(operand1, operand2, visited_pairs)) {
+      return false;
+    }
+  }
+  if (opcode1 == HloOpcode::kParameter) {
+    // In the general case, we don't try to prove equality of parameters.
+    // We only try in the context of get-tuple-element
+    // (see TupleElementsComputeSameValue).
+    return false;
+  }
+  if (opcode1 == HloOpcode::kGetTupleElement) {
+    return i1->tuple_index() == i2->tuple_index() ||
+           TupleElementsComputeSameValue(operands1[0], i1->tuple_index(),
+                                         i2->tuple_index(), visited_pairs);
+  }
+  // Don't check that the operands are identical, because Identical can
+  // return false for instructions that compute the same value but are not
+  // identical, which we don't want. We have checked the arguments with
+  // InstructionsComputeSameValue earlier.
+  auto eq_instructions = [](const HloInstruction* i1,
+                            const HloInstruction* i2) -> bool { return true; };
+  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
+    return *a == *b;
+  };
+  return i1->Identical(*i2, eq_instructions, eq_computations,
+                       /*layout_sensitive=*/false);
+}
+
+void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      auto ar = MatchesArCrsPattern(instruction);
+      if (ar) {
+        all_reduce_map_[*((*ar)->all_reduce_id())].push_back(*ar);
+      }
+    }
+  }
+}
+
+void ArCrsCombiner::KeepProvablyEqualInstructionGroups() {
+  for (auto it : all_reduce_map_) {
+    auto instruction_vec = it.second;
+    CHECK_EQ(instruction_vec.size(), num_spatial_partitions_);
+
+    auto instr_0 = instruction_vec[0];
+    auto add_0 = instr_0->users()[0]->users()[0];
+    CHECK_EQ(HloOpcode::kAdd, add_0->opcode());
+
+    for (int i = 1; i < instruction_vec.size(); ++i) {
+      auto instr_i = instruction_vec[i];
+      auto add_i = instr_i->users()[0]->users()[0];
+      CHECK_EQ(HloOpcode::kAdd, add_i->opcode());
+      absl::flat_hash_map<int64, int64> visited_pairs;
+      if (!InstructionsComputeSameValue(add_0, add_i, &visited_pairs)) {
+        all_reduce_map_.erase(it.first);
+      }
+    }
+  }
+}
+
+StatusOr<bool> ArCrsCombiner::RewriteGraph() {
+  if (all_reduce_map_.empty()) {
+    return false;
+  }
+
+  auto computation_is_addition = [](HloComputation* c) {
+    return c->instruction_count() == 3 &&
+           Match(c->root_instruction(), m::Add(m::Parameter(), m::Parameter()));
+  };
+
+  for (auto it : all_reduce_map_) {
+    auto instruction_vec = it.second;
+    for (auto all_reduce : instruction_vec) {
+      auto parent_computation = all_reduce->parent();
+      auto convert = all_reduce->users()[0];
+      auto add = convert->users()[0];
+      auto crs = add->users()[0];
+
+      if (!computation_is_addition(all_reduce->called_computations()[0]) ||
+          !computation_is_addition(crs->called_computations()[0])) {
+        continue;
+      }
+      HloInstruction* other_summand = (add->operands()[0] == convert)
+                                          ? add->operands()[1]
+                                          : add->operands()[0];
+      // To move the AR past the addition, we need to divide other_summand by
+      // the number of spatial partitions.
+      CHECK_EQ(all_reduce->user_count(), 1);
+      TF_CHECK_OK(
+          all_reduce->ReplaceAllUsesWith(all_reduce->mutable_operand(0)));
+      auto shape = other_summand->shape();
+      Literal lit(shape);
+      lit.PopulateWithValue<float>(num_spatial_partitions_);
+      auto divisor = parent_computation->AddInstruction(
+          HloInstruction::CreateConstant(lit.Clone()));
+      auto division =
+          parent_computation->AddInstruction(HloInstruction::CreateBinary(
+              shape, HloOpcode::kDivide, other_summand, divisor));
+      TF_CHECK_OK(other_summand->ReplaceUseWith(add, division));
+      // The AllReduce and the CRS are combined to an all-core AllReduce.
+      crs->set_all_reduce_id(all_reduce->all_reduce_id());
+      TF_CHECK_OK(parent_computation->RemoveInstruction(all_reduce));
+    }
+  }
+
+  return true;
+}
+
+StatusOr<bool> ArCrsCombiner::Run(HloModule* module) {
+  call_graph_ = CallGraph::Build(module);
+
+  GroupAllReducesById(module);
+
+  KeepProvablyEqualInstructionGroups();
+
+  return RewriteGraph();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6a7ef76ec3b76972d1b2c7fb548cecfb9423160
--- /dev/null
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Combine an AllReduce and a CrossReplicaSum when they are close to each other
+// in the graph, to use an efficient CrossReplicaSum implementation that
+// fully utilizes the interconnect bandwidth.
+class ArCrsCombiner : public HloModulePass {
+ public:
+  ArCrsCombiner(int num_spatial_partitions)
+      : num_spatial_partitions_(num_spatial_partitions) {}
+  absl::string_view name() const override { return "ar-crs-combiner"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+  // Helper method to allow testing of InstructionsComputeSameValue.
+  static bool TestInstructionsComputeSameValue(HloInstruction* i1,
+                                               HloInstruction* i2);
+
+ private:
+  // If the passed instruction is a while parameter, and the while body is only
+  // called by a single while instruction, return the while instruction.
+  absl::optional<HloInstruction*> WhileFromBodyParameter(
+      HloInstruction* instruction);
+
+  // Returns a vector of tuple instructions.
+  // If all instructions that flow to "instruction" are tuples, return them.
+  // Otherwise, return an empty vector.
+  std::vector<HloInstruction*> GetAllTuples(HloInstruction* instruction);
+
+  // Checks whether two different elements in the same tuple compute the same
+  // value.
+  bool TupleElementsComputeSameValue(
+      HloInstruction* tuple_shaped_instruction, int64 i1, int64 i2,
+      absl::flat_hash_map<int64, int64>* visited_pairs);
+
+  // Returns whether the instructions i1 and i2 can be shown to evaluate to the
+  // same value. Handling WHILE requires recursion, which may cause us to visit
+  // the same instruction again. To avoid infinite loops, we pass a cache of
+  // visited instruction pairs.
+  bool InstructionsComputeSameValue(
+      HloInstruction* i1, HloInstruction* i2,
+      absl::flat_hash_map<int64, int64>* visited_pairs);
+
+  // Populates all_reduce_map_.
+  void GroupAllReducesById(HloModule* module);
+
+  // Looks at each AllReduce group in all_reduce_map_, and keeps only the
+  // groups for which it's safe to move the AllReduce later in the HLO graph.
+  void KeepProvablyEqualInstructionGroups();
+
+  // Performs the graph rewrite that eliminates the early AllReduce and turns
+  // the later CRS into an AllReduce.
+  StatusOr<bool> RewriteGraph();
+
+  int num_spatial_partitions_;
+
+  // Map from all-reduce ids to the all reduce instructions.
+  absl::flat_hash_map<int64, std::vector<HloInstruction*>> all_reduce_map_;
+
+  std::unique_ptr<CallGraph> call_graph_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10171835d83c75fef091a34b8fe102d263211307
--- /dev/null
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -0,0 +1,496 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/ar_crs_combiner.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class ArCrsCombinerTest : public HloTestBase {};
+
+TEST_F(ArCrsCombinerTest, SameValueTestBasecase) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(
+      i1, module->entry_computation()->parameter_instruction(0)));
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestBasecase2) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (x: f32[]) -> (f32[], f32[]) {
+  %x = f32[] parameter(0)
+  ROOT %tuple = (f32[], f32[]) tuple(%x, %x)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestBasecase3) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (x: f32[], y: f32[]) -> (f32[], f32[]) {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %tuple = (f32[], f32[]) tuple(%x, %y)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestNumOperands) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> ((f32[2,2]), (f32[2,2], f32[2,2])) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %tuple1 = (f32[2,2]) tuple(%constant.f32)
+  %tuple2 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  ROOT %tuple = ((f32[2,2]), (f32[2,2], f32[2,2])) tuple(%tuple1, %tuple2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestSliceIndicesMatch) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2]) -> (f32[1], f32[1]) {
+  %p = f32[2] parameter(0)
+  %slice.1 = f32[1] slice(f32[2] %p), slice={[0:1]}
+  %slice.2 = f32[1] slice(f32[2] %p), slice={[0:1]}
+  ROOT %tuple = (f32[1], f32[1]) tuple(%slice.1, %slice.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestSliceIndicesDontMatch) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2]) -> (f32[1], f32[1]) {
+  %p = f32[2] parameter(0)
+  %slice.1 = f32[1] slice(f32[2] %p), slice={[0:1]}
+  %slice.2 = f32[1] slice(f32[2] %p), slice={[1:2]}
+  ROOT %tuple = (f32[1], f32[1]) tuple(%slice.1, %slice.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestTupleElementSameIndex) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestTupleElementDifferentIndex1) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestTupleElementDifferentIndex2) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{2, 3}, {4, 5}})
+  %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestWhile1) {
+  const char* module_str = R"(
+HloModule foobar
+
+%condition (x: (f32[2,2], f32[2,2])) -> pred[] {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.0 = s32[] constant(0)
+  %constant.1 = s32[] constant(1)
+  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+}
+
+%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
+  %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
+  %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32)
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2)
+}
+
+ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_while = module->entry_computation()->root_instruction();
+  auto body_tuple = root_while->while_body()->root_instruction();
+  auto i1 = body_tuple->operands()[0];
+  auto i2 = body_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestWhile2) {
+  const char* module_str = R"(
+HloModule foobar
+
+%condition (x: (f32[2,2], f32[2,2])) -> pred[] {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.0 = s32[] constant(0)
+  %constant.1 = s32[] constant(1)
+  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+}
+
+%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
+  %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
+  %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32)
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2)
+}
+
+ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {7, 8}})
+  %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
+  ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_while = module->entry_computation()->root_instruction();
+  auto body_tuple = root_while->while_body()->root_instruction();
+  auto i1 = body_tuple->operands()[0];
+  auto i2 = body_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestWhile3) {
+  const char* module_str = R"(
+HloModule foobar
+
+%condition (x: (f32[2,2], f32[2,2])) -> pred[] {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.0 = s32[] constant(0)
+  %constant.1 = s32[] constant(1)
+  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+}
+
+%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {1, 2}})
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
+  %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32.1)
+  %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32.2)
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2)
+}
+
+ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_while = module->entry_computation()->root_instruction();
+  auto body_tuple = root_while->while_body()->root_instruction();
+  auto i1 = body_tuple->operands()[0]->operands()[0];  // %get-tuple-element.1
+  auto i2 = body_tuple->operands()[1]->operands()[0];  // %get-tuple-element.2
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, RewritePatternArConvertAddCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+
+  %cross-replica-sum.ar.1 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=0}
+  %convert.1 = f32[2,2]
+      convert(%cross-replica-sum.ar.1),
+      sharding={maximal device=0}
+  %add.1 = f32[2,2]
+      add(%constant.f32, %convert.1),
+      sharding={maximal device=0}
+  %cross-replica-sum.1 = f32[2,2]
+      cross-replica-sum(%add.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %cross-replica-sum.ar.2 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=1}
+  %convert.2 = f32[2,2]
+      convert(%cross-replica-sum.ar.2),
+      sharding={maximal device=1}
+  %add.2 = f32[2,2]
+      add(%constant.f32, %convert.2),
+      sharding={maximal device=1}
+  %cross-replica-sum.2 = f32[2,2]
+      cross-replica-sum(%add.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[2,2], f32[2,2])
+      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(
+          op::CrossReplicaSum(op::Add(
+              op::Divide(op::Constant(), op::Constant()), op::Convert())),
+          op::CrossReplicaSum(op::Add(
+              op::Divide(op::Constant(), op::Constant()), op::Convert()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  ASSERT_EQ(replica_groups_before.size(), replica_groups_after.size());
+  for (int i = 0; i < replica_groups_before.size(); ++i) {
+    // Somewhat verbose way to compare the replica_ids, because EqualsProto
+    // is not available in the open-source build.
+    auto group_before = replica_groups_before[i];
+    std::vector<int64> ids_before(group_before.replica_ids().begin(),
+                                  group_before.replica_ids().end());
+    auto group_after = replica_groups_after[i];
+    std::vector<int64> ids_after(group_after.replica_ids().begin(),
+                                 group_after.replica_ids().end());
+    EXPECT_EQ(ids_before, ids_after);
+  }
+}
+
+TEST_F(ArCrsCombinerTest, OtherSummandNotTheSameDontRewrite) {
+  const char* module_str = R"(
+HloModule foobar
+
+%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+
+  %cross-replica-sum.ar.1 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=0}
+  %convert.1 = f32[2,2]
+      convert(%cross-replica-sum.ar.1),
+      sharding={maximal device=0}
+  %add.1 = f32[2,2]
+      add(%constant.f32.1, %convert.1),
+      sharding={maximal device=0}
+  %cross-replica-sum.1 = f32[2,2]
+      cross-replica-sum(%add.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %cross-replica-sum.ar.2 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=1}
+  %convert.2 = f32[2,2]
+      convert(%cross-replica-sum.ar.2),
+      sharding={maximal device=1}
+  %add.2 = f32[2,2]
+      add(%constant.f32.2, %convert.2),
+      sharding={maximal device=1}
+  %cross-replica-sum.2 = f32[2,2]
+      cross-replica-sum(%add.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[2,2], f32[2,2])
+      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_FALSE(changed);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
index 38f1a5d3a645f98220ec445bb9bbdf2b9b842109..52ec1a794c5e9f4452a4bf2b648f453d8acfe976 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
@@ -17,14 +17,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 
 namespace xla {
 namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
-class BatchDotSimplificationTest : public HloVerifiedTestBase {};
+class BatchDotSimplificationTest : public HloTestBase {};
 
 TEST_F(BatchDotSimplificationTest,
        ElideSingleDegenerateBatchDotDim_VectorVector) {
@@ -38,11 +37,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -61,11 +61,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -84,11 +85,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -107,11 +109,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -130,11 +133,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
@@ -153,11 +157,12 @@ main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
   BatchDotSimplification pass;
-  ASSERT_TRUE(pass.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(pass.Run(m.get()).ValueOrDie());
 
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root,
               op::Reshape(op::Dot(
                   op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)),
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index f70f6ddfec69c0113a1afe2073a2392098f49456..0e6ca1871b379a2f55b92207133822fc6258b007 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -107,19 +107,37 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   }
 
   std::unique_ptr<HloInstruction> Mean(
-      int64 element_count, HloInstruction* operand,
+      HloInstruction* element_count, HloInstruction* operand,
       const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
           add_instruction) {
-    HloInstruction* elem_count_recip =
-        add_instruction(HloInstruction::CreateBroadcast(
-            operand->shape(),
-            add_instruction(HloInstruction::CreateConvert(
-                ShapeUtil::MakeShape(operand->shape().element_type(), {}),
-                add_instruction(HloInstruction::CreateConstant(
-                    LiteralUtil::CreateR0<float>(1.0 / element_count))))),
-            {}));
-    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kMultiply,
-                                        operand, elem_count_recip);
+    auto broadcast = add_instruction(
+        HloInstruction::CreateBroadcast(operand->shape(), element_count, {}));
+    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kDivide,
+                                        operand, broadcast);
+  }
+
+  std::unique_ptr<HloInstruction> DynamicElementCountPerFeature(
+      HloInstruction* operand, int64 feature_index,
+      const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
+          add_instruction) {
+    auto elements_per_feature_u32 = add_instruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(1)));
+
+    for (int64 i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+      if (i == feature_index) {
+        continue;
+      }
+      auto dynamic_dimension_size =
+          add_instruction(HloInstruction::CreateGetDimensionSize(
+              ShapeUtil::MakeShape(U32, {}), operand, i));
+      elements_per_feature_u32 = add_instruction(HloInstruction::CreateBinary(
+          ShapeUtil::MakeShape(U32, {}), HloOpcode::kMultiply,
+          dynamic_dimension_size, elements_per_feature_u32));
+    }
+
+    return HloInstruction::CreateConvert(
+        ShapeUtil::MakeShape(operand->shape().element_type(), {}),
+        elements_per_feature_u32);
   }
 
   // Replaces the existing HLO instruction old_instruction, with
@@ -195,9 +213,6 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
   const Shape operand_shape = operand->shape();
   PrimitiveType ptype = operand_shape.element_type();
   int64 feature_index = batch_norm->feature_index();
-  const int64 feature_count = operand_shape.dimensions(feature_index);
-  const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape);
-  int64 elements_per_feature_int64 = size_in_elements / feature_count;
 
   HloInstruction* scale = batch_norm->mutable_operand(1);
   HloInstruction* offset = batch_norm->mutable_operand(2);
@@ -220,6 +235,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
     }
   }
 
+  auto elements_per_feature =
+      add(DynamicElementCountPerFeature(operand, feature_index, add));
+
   auto scale_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
 
@@ -243,13 +261,13 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
       add_reduce_computation));
 
   // E[X].
-  auto mean = add(Mean(elements_per_feature_int64, sum, add));
+  auto mean = add(Mean(elements_per_feature, sum, add));
 
   auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
   // E[X^2].
-  auto square_mean = add(Mean(elements_per_feature_int64, squared_sum, add));
+  auto square_mean = add(Mean(elements_per_feature, squared_sum, add));
 
   // E^2[X].
   auto mean_square =
@@ -458,9 +476,8 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
 
   int64 feature_index = batch_norm->feature_index();
 
-  const int64 size_in_elements = ShapeUtil::ElementsIn(activation_shape);
-  const int64 feature_count = activation_shape.dimensions(feature_index);
-  const int64 elements_per_feature_int64 = size_in_elements / feature_count;
+  auto elements_per_feature =
+      add(DynamicElementCountPerFeature(activation, feature_index, add));
 
   auto zero_literal = LiteralUtil::CreateR0(0.0f);
   TF_ASSIGN_OR_RETURN(zero_literal, zero_literal.Convert(ptype));
@@ -553,15 +570,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
       add_binary(activation_shape, HloOpcode::kMultiply, scale_broadcasted,
                  rsqrt_var_add_epsilon_broadcasted);
 
-  scale_times_rsqrt_var_add_epsilon = add(
-      Mean(elements_per_feature_int64, scale_times_rsqrt_var_add_epsilon, add));
+  scale_times_rsqrt_var_add_epsilon =
+      add(Mean(elements_per_feature, scale_times_rsqrt_var_add_epsilon, add));
 
-  auto elements_per_feature_literal =
-      LiteralUtil::CreateR0<float>(elements_per_feature_int64);
-  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
-                      elements_per_feature_literal.Convert(ptype));
-  auto elements_per_feature = add(
-      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
   auto i1 = add_binary(activation_shape, HloOpcode::kMultiply, grad_output,
                        add(HloInstruction::CreateBroadcast(
                            activation_shape, elements_per_feature, {})));
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
index f7ac8f5482908af104554a1cf812370b9098cda7..8e8fbbd935b154e5a77d68e60d861601d740bf03 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
@@ -29,14 +29,28 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace {
 
-using BatchNormExpanderTest = HloVerifiedTestBase;
+class BatchNormExpanderTest : public HloTestBase {
+ protected:
+  // BatchNorm should have a dynamic sized dividor for mean operations.
+  int64 CountGetDimensionSize(const HloModule& module) {
+    int64 count = 0;
+    for (HloComputation* comp : module.computations()) {
+      for (HloInstruction* inst : comp->instructions()) {
+        if (inst->opcode() == HloOpcode::kGetDimensionSize) {
+          count++;
+        }
+      }
+    }
+    return count;
+  }
+};
 
 // Test that we expand BatchNormTraining.
 TEST_F(BatchNormExpanderTest, BatchNormTraining) {
@@ -59,15 +73,16 @@ TEST_F(BatchNormExpanderTest, BatchNormTraining) {
       param0, param1, param2,
       /*epsilon=*/0.001, /*feature_index=*/3));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormTraining);
   BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
-  ASSERT_TRUE(rewriter.Run(module).ValueOrDie());
+  ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
+  EXPECT_EQ(CountGetDimensionSize(*module), 3);
   // Make sure this operation is expanded.
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
 }
@@ -101,15 +116,16 @@ TEST_F(BatchNormExpanderTest, BatchNormGrad) {
       param1, param2, param3, param4,
       /*epsilon=*/0.001, /*feature_index=*/3));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBatchNormGrad);
   BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
-  ASSERT_TRUE(rewriter.Run(module).ValueOrDie());
+  ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
+  EXPECT_EQ(CountGetDimensionSize(*module), 3);
   // Make sure this operation is expanded.
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
 }
@@ -126,13 +142,13 @@ ENTRY entry {
     epsilon=0.001, feature_index=1, sharding={maximal device=1}
 })";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str));
   BatchNormExpander rewriter(/*rewrite_training_op=*/true,
                              /*rewrite_inference_op=*/true,
                              /*rewrite_grad_op=*/true);
-  ASSERT_TRUE(rewriter.Run(&module()).ValueOrDie());
+  ASSERT_TRUE(rewriter.Run(m.get()).ValueOrDie());
 
-  for (auto* instruction : module().entry_computation()->instructions()) {
+  for (auto* instruction : m->entry_computation()->instructions()) {
     if (instruction->opcode() == HloOpcode::kParameter) {
       continue;
     }
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
index d63287539dfde5bb4890ab8303ef2205133d8125..e9d30fc03c1c3194de577e6683b36a95641694d9 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
@@ -151,15 +151,10 @@ Status BFloat16ConversionFoldingVisitor::TryFoldBF16Conversions(
 
 Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) {
   // Do not fold BF16 conversions for instructions related to tuples, entry and
-  // exit of a computation, fusion, convert, and control flow.
+  // exit of a computation, fusion, convert, side-effecting instructions and
+  // control flow.
   if (hlo->opcode() == HloOpcode::kTuple ||            //
       hlo->opcode() == HloOpcode::kGetTupleElement ||  //
-      hlo->opcode() == HloOpcode::kInfeed ||           //
-      hlo->opcode() == HloOpcode::kOutfeed ||          //
-      hlo->opcode() == HloOpcode::kSend ||             //
-      hlo->opcode() == HloOpcode::kSendDone ||         //
-      hlo->opcode() == HloOpcode::kRecv ||             //
-      hlo->opcode() == HloOpcode::kRecvDone ||         //
       hlo->opcode() == HloOpcode::kConstant ||         //
       hlo->opcode() == HloOpcode::kParameter ||        //
       hlo->opcode() == HloOpcode::kFusion ||           //
@@ -167,7 +162,8 @@ Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) {
       hlo->opcode() == HloOpcode::kCall ||             //
       hlo->opcode() == HloOpcode::kCustomCall ||       //
       hlo->opcode() == HloOpcode::kWhile ||            //
-      hlo->opcode() == HloOpcode::kConditional) {
+      hlo->opcode() == HloOpcode::kConditional ||      //
+      hlo->HasSideEffectNoRecurse()) {
     return Status::OK();
   }
   if (hlo == computation_->root_instruction() &&
@@ -182,6 +178,10 @@ Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) {
 
 Status BFloat16ConversionFoldingVisitor::HandleCrossReplicaSum(
     HloInstruction* crs) {
+  if (crs->IsCrossModuleAllReduce()) {
+    // Cross-module all-reduce has side effect.
+    return Status::OK();
+  }
   // First use DefaultAction() to handle the operands. It can't handle
   // tuple-shaped output.
   TF_RETURN_IF_ERROR(DefaultAction(crs));
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
index 5f93740887aa7e61458990992fe0573883ff056d..4ce351acc2c359773e618da70360c96faf5ca379 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -65,11 +65,11 @@ class TestBFloat16Support : public BFloat16Support {
   }
 };
 
-class BFloat16ConversionFoldingTest : public HloVerifiedTestBase {
+class BFloat16ConversionFoldingTest : public HloTestBase {
  protected:
   BFloat16ConversionFoldingTest()
-      : HloVerifiedTestBase(/*layout_sensitive=*/false,
-                            /*allow_mixed_precision=*/true) {}
+      : HloTestBase(/*verifier_layout_sensitive=*/false,
+                    /*allow_mixed_precision_in_hlo_verifier=*/true) {}
 
   bool FoldConversions(HloModule* module) {
     TestBFloat16Support bfloat16_support_;
@@ -103,10 +103,10 @@ TEST_F(BFloat16ConversionFoldingTest, FoldIfSupported) {
       HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, convert1, c));
   builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, add1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(FoldConversions(module));
+  EXPECT_TRUE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), add1);
   EXPECT_EQ(add0->shape().element_type(), BF16);
@@ -138,10 +138,10 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldIfUnsupported) {
   HloInstruction* convert2 =
       builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, mul1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(FoldConversions(module));
+  EXPECT_FALSE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), convert2);
   EXPECT_EQ(mul0->shape().element_type(), F32);
@@ -173,10 +173,10 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldUnsupportedMixedPrecision) {
   HloInstruction* convert2 =
       builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, sub1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(FoldConversions(module));
+  EXPECT_FALSE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), convert2);
   EXPECT_EQ(sub0->shape().element_type(), F32);
@@ -203,10 +203,10 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) {
   HloInstruction* convert1 =
       builder.AddInstruction(HloInstruction::CreateConvert(bf16_shape, gte));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(FoldConversions(module));
+  EXPECT_FALSE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), convert1);
   EXPECT_EQ(gte->shape().element_type(), F32);
@@ -216,7 +216,7 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) {
 TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
   auto builder = HloComputation::Builder(TestName());
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder sum_builder("add");
   auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x"));
@@ -252,7 +252,7 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(FoldConversions(module));
+  EXPECT_TRUE(FoldConversions(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), tuple);
   EXPECT_EQ(tuple->operand(0), gte_a);
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.cc b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
index d5b1148058898596bfdb837826a590bbc74e202a..b8a8f844eff17a95d4073f53495e0027c481f558 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
@@ -231,6 +231,10 @@ Status BFloat16NormalizationVisitor::HandleMultipleOutputs(
   for (auto* user : materialized_users) {
     TF_RETURN_IF_ERROR(hlo->ReplaceUseWith(user, tuple));
   }
+  bool is_root = computation_->root_instruction() == hlo;
+  if (is_root) {
+    computation_->set_root_instruction(tuple);
+  }
   *tuple->mutable_shape() = original_shape;
   return Status::OK();
 }
@@ -342,11 +346,9 @@ Status BFloat16NormalizationVisitor::HandleInstruction(HloInstruction* hlo) {
 
 Status BFloat16NormalizationVisitor::DefaultAction(HloInstruction* hlo) {
   // Do not change instructions related to entry and exit of a computation,
-  // tuples, fusion, convert, and control flow.
+  // tuples, fusion, convert, side-effecting instructions, and control flow.
   if (hlo->opcode() == HloOpcode::kTuple ||            //
       hlo->opcode() == HloOpcode::kGetTupleElement ||  //
-      hlo->opcode() == HloOpcode::kInfeed ||           //
-      hlo->opcode() == HloOpcode::kOutfeed ||          //
       hlo->opcode() == HloOpcode::kConstant ||         //
       hlo->opcode() == HloOpcode::kParameter ||        //
       hlo->opcode() == HloOpcode::kFusion ||           //
@@ -354,7 +356,8 @@ Status BFloat16NormalizationVisitor::DefaultAction(HloInstruction* hlo) {
       hlo->opcode() == HloOpcode::kCall ||             //
       hlo->opcode() == HloOpcode::kCustomCall ||       //
       hlo->opcode() == HloOpcode::kWhile ||            //
-      hlo->opcode() == HloOpcode::kConditional) {
+      hlo->opcode() == HloOpcode::kConditional ||      //
+      hlo->HasSideEffectNoRecurse()) {
     return Status::OK();
   }
   // TODO(b/112040122): Correctly normalize variadic reduce.
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
index cef0eba14e9dd463d6c32b047211bf25a84478f6..9f97d18c565c7915b9f9346f0c6330cdc3c707e9 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -68,11 +68,11 @@ class TestBFloat16Support : public BFloat16Support {
   }
 };
 
-class BFloat16NormalizationTest : public HloVerifiedTestBase {
+class BFloat16NormalizationTest : public HloTestBase {
  protected:
   BFloat16NormalizationTest()
-      : HloVerifiedTestBase(/*layout_sensitive=*/false,
-                            /*allow_mixed_precision=*/true) {}
+      : HloTestBase(/*verifier_layout_sensitive=*/false,
+                    /*allow_mixed_precision_in_hlo_verifier=*/true) {}
 
   bool Normalize(HloModule* module) {
     TestBFloat16Support bfloat16_support_;
@@ -106,10 +106,10 @@ TEST_F(BFloat16NormalizationTest, NoopIfSupported) {
   HloInstruction* add1 = builder.AddInstruction(
       HloInstruction::CreateBinary(f32_shape, HloOpcode::kAdd, add0, c));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(Normalize(module));
+  EXPECT_FALSE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), add1);
   EXPECT_EQ(add0->shape().element_type(), BF16);
@@ -134,10 +134,10 @@ TEST_F(BFloat16NormalizationTest, ResolveIfUnsupportedBF16) {
   HloInstruction* mul1 = builder.AddInstruction(
       HloInstruction::CreateBinary(bf16_shape, HloOpcode::kMultiply, mul0, c));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert);
   EXPECT_EQ(computation->root_instruction()->operand(0), mul1);
@@ -164,10 +164,10 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionSubtraction) {
   HloInstruction* sub1 = builder.AddInstruction(
       HloInstruction::CreateBinary(bf16_shape, HloOpcode::kSubtract, sub0, c));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert);
   EXPECT_EQ(computation->root_instruction()->operand(0), sub1);
@@ -191,7 +191,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
       HloInstruction::CreateBinary(bf16_scalar_shape, HloOpcode::kAdd,
                                    reduce_comp_param0, reduce_comp_param1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto reduce_computation =
       module->AddEmbeddedComputation(reduce_comp_builder.Build());
 
@@ -205,7 +205,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), reduce);
   EXPECT_EQ(reduce->called_computations().size(), 1);
@@ -233,7 +233,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
 }
 
 TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder sum_builder("sum");
   auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x"));
@@ -263,7 +263,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), gte);
   EXPECT_EQ(gte->shape().element_type(), BF16);
@@ -272,7 +272,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
 }
 
 TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape f32_shape = ShapeUtil::MakeShape(F32, {1024});
   Shape bf16_shape = ShapeUtil::MakeShape(BF16, {1024});
@@ -284,13 +284,13 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) {
       HloInstruction::CreateParameter(1, s32_shape, "value"));
 
   HloInstruction* sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({bf16_shape, s32_shape}), 0, key, value));
+      ShapeUtil::MakeTupleShape({bf16_shape, s32_shape}), 0, key, {value}));
   HloInstruction* gte = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(bf16_shape, sort, 0));
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), gte);
   EXPECT_EQ(gte->shape().element_type(), BF16);
@@ -298,6 +298,30 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) {
   EXPECT_EQ(ShapeUtil::GetSubshape(sort->shape(), {0}).element_type(), F32);
 }
 
+TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSortRoot) {
+  auto module = CreateNewVerifiedModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape f32_shape = ShapeUtil::MakeShape(F32, {1024});
+  Shape bf16_shape = ShapeUtil::MakeShape(BF16, {1024});
+
+  HloInstruction* key = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32_shape, "key"));
+  HloInstruction* value = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, bf16_shape, "value"));
+
+  HloInstruction* sort = builder.AddInstruction(HloInstruction::CreateSort(
+      ShapeUtil::MakeTupleShape({bf16_shape, bf16_shape}), 0, key, {value}));
+
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(Normalize(module.get()));
+
+  EXPECT_EQ(sort->operand(0)->shape().element_type(), F32);
+  EXPECT_EQ(ShapeUtil::GetSubshape(sort->shape(), {0}).element_type(), F32);
+  EXPECT_NE(computation->root_instruction(), sort);
+  EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kTuple);
+}
+
 // Tests that the normalization should not cause unsupported mixed precision due
 // to resolving unsupported BF16 operand.
 TEST_F(BFloat16NormalizationTest, DoNotAddUnsupportedMixedPrecision) {
@@ -318,10 +342,10 @@ TEST_F(BFloat16NormalizationTest, DoNotAddUnsupportedMixedPrecision) {
   HloInstruction* dot = builder.AddInstruction(
       HloInstruction::CreateDot(bf16_shape, a, b, dot_dnums, precision_config));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(Normalize(module));
+  EXPECT_TRUE(Normalize(module.get()));
 
   EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kConvert);
   EXPECT_EQ(dot->shape().element_type(), F32);
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index 002be9c97098ef1f73446c458dae24bbc826a626..63d4572f2028c462df1cac9d5e4ee616e407f37b 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -236,6 +236,10 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo,
         // the end of the BFloat16Propagation pass.
         continue;
       }
+      if (use.instruction->HasSideEffectNoRecurse()) {
+        // Keep side-effecting instruction's operands unchanged.
+        return false;
+      }
       // Any visited user that can accept BF16 has already been updated if
       // necessary, e.g., the output has been changed to BF16 if it propagates
       // precision, or a called computation's parameters have been changed to
@@ -329,22 +333,6 @@ void BFloat16Propagation::DetermineInstructionPrecision(HloInstruction* hlo,
     return;
   }
 
-  // Do not change precision for instructions related to entry and exit of a
-  // computation, and control flow, because this pass might break the interfaces
-  // or assumptions for them.
-  if (hlo->opcode() == HloOpcode::kInfeed ||       //
-      hlo->opcode() == HloOpcode::kOutfeed ||      //
-      hlo->opcode() == HloOpcode::kSend ||         //
-      hlo->opcode() == HloOpcode::kSendDone ||     //
-      hlo->opcode() == HloOpcode::kRecv ||         //
-      hlo->opcode() == HloOpcode::kRecvDone ||     //
-      hlo->opcode() == HloOpcode::kCustomCall ||   //
-      hlo->opcode() == HloOpcode::kCall ||         //
-      hlo->opcode() == HloOpcode::kConditional ||  //
-      (hlo->opcode() == HloOpcode::kParameter && skip_parameters)) {
-    return;
-  }
-
   // Prevent root instructions from having their output modified by recording
   // all F32 output values as needing to stay as F32.
   CHECK(hlo->parent() != nullptr);
@@ -366,6 +354,17 @@ void BFloat16Propagation::DetermineInstructionPrecision(HloInstruction* hlo,
     return;
   }
 
+  // Do not change precision for instructions related to entry and exit of a
+  // computation, side-effecting instructions, and control flow, because this
+  // pass might break the interfaces or assumptions for them.
+  if (hlo->opcode() == HloOpcode::kCustomCall ||   //
+      hlo->opcode() == HloOpcode::kCall ||         //
+      hlo->opcode() == HloOpcode::kConditional ||  //
+      hlo->HasSideEffectNoRecurse() ||             //
+      (hlo->opcode() == HloOpcode::kParameter && skip_parameters)) {
+    return;
+  }
+
   if (!ContainsKey(consider_using_bfloat16_, hlo)) {
     return;
   }
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index e032b5c624c0151fd63c870e0f21ec97656d625f..5be7141aae423adb4fe2f39262e463ff25ae8234 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -55,11 +55,11 @@ class TestBFloat16Support : public BFloat16Support {
   }
 };
 
-class BFloat16PropagationTest : public HloVerifiedTestBase {
+class BFloat16PropagationTest : public HloTestBase {
  protected:
   BFloat16PropagationTest()
-      : HloVerifiedTestBase(/*layout_sensitive=*/false,
-                            /*allow_mixed_precision=*/true) {}
+      : HloTestBase(/*verifier_layout_sensitive=*/false,
+                    /*allow_mixed_precision_in_hlo_verifier=*/true) {}
 
   // Runs the propagation pass on the given module, and returns whether the
   // module is changed after this pass.
@@ -121,10 +121,10 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSelectButNotAdd) {
   HloInstruction* root = builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kAdd, dot, dot));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), root);
   EXPECT_TRUE(OutputsBF16(xpose));
@@ -136,6 +136,96 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSelectButNotAdd) {
   EXPECT_FALSE(OutputsBF16(c));
 }
 
+TEST_F(BFloat16PropagationTest, PropagateThroughMaxPoolReduceWindow) {
+  auto module = CreateNewVerifiedModule();
+
+  auto sub_builder = HloComputation::Builder("max");
+  HloInstruction* p0 = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "a"));
+  HloInstruction* p1 = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "b"));
+  sub_builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {}), HloOpcode::kMaximum, p0, p1));
+  auto max_computation = module->AddEmbeddedComputation(sub_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
+
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  HloInstruction* c =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, shape, "c"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_high(1);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(1);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+  HloInstruction* rw =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          shape, add,
+          builder.AddInstruction(
+              HloInstruction::CreateConstant(LiteralUtil::Zero(F32))),
+          window, max_computation));
+  HloInstruction* xpose =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {4, 2}), c, {1, 0}));
+  HloInstruction* dot = builder.AddInstruction(
+      CreateDot(ShapeUtil::MakeShape(F32, {4, 4}), xpose, rw));
+  HloInstruction* root = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kAdd, dot, dot));
+
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(PropagatePrecision(module.get()));
+
+  EXPECT_EQ(computation->root_instruction(), root);
+  EXPECT_TRUE(OutputsBF16(add));
+  EXPECT_TRUE(OutputsBF16(xpose));
+  EXPECT_TRUE(OutputsBF16(rw));
+}
+
+// Tests that side-effecting all-reduce should not be changed.
+TEST_F(BFloat16PropagationTest, DoNotChangeAllReduce) {
+  auto module = CreateNewVerifiedModule();
+
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "b"));
+  auto rb = HloComputation::Builder(TestName());
+  rb.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kAdd,
+      rb.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")),
+      rb.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"))));
+  auto reduction = module->AddEmbeddedComputation(rb.Build());
+  HloInstruction* all_reduce =
+      builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
+          ShapeUtil::MakeTupleShape({shape, shape}), {a, b}, reduction,
+          /*replica_groups=*/{}, /*barrier=*/"", /*all_reduce_id=*/1));
+  HloInstruction* gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, all_reduce, 0));
+  HloInstruction* gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(shape, all_reduce, 1));
+  HloInstruction* dot = builder.AddInstruction(CreateDot(shape, gte0, gte1));
+  HloInstruction* root = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot));
+
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(PropagatePrecision(module.get()));
+  EXPECT_EQ(computation->root_instruction(), root);
+}
+
 // Tests that if a constant is converted to BF16 then its literal must also be
 // converted.
 TEST_F(BFloat16PropagationTest, ConvertConstantLiteral) {
@@ -152,10 +242,10 @@ TEST_F(BFloat16PropagationTest, ConvertConstantLiteral) {
       HloInstruction::CreateConstant(LiteralUtil::CreateFromArray(array_b)));
   HloInstruction* dot = builder.AddInstruction(CreateDot(shape, a, b));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(OutputsBF16(dot->operand(0)));
@@ -208,10 +298,10 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTuples) {
   HloInstruction* output_tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({dot, add2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), output_tuple);
   EXPECT_TRUE(OutputsBF16(xpose));
@@ -247,10 +337,10 @@ TEST_F(BFloat16PropagationTest, SameValueReferencedTwice) {
   HloInstruction* dot = builder.AddInstruction(
       CreateDot(ShapeUtil::MakeShape(F32, {4, 4}), lhs, rhs));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(OutputsBF16(add1));
@@ -276,10 +366,10 @@ TEST_F(BFloat16PropagationTest, DoNotChangeComputationRoot) {
   HloInstruction* tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({add, dot}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(PropagatePrecision(module));
+  EXPECT_FALSE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), tuple);
   EXPECT_FALSE(OutputsBF16(add));
@@ -287,7 +377,7 @@ TEST_F(BFloat16PropagationTest, DoNotChangeComputationRoot) {
 
 // Tests that BF16 is propagated properly through fused computations.
 TEST_F(BFloat16PropagationTest, PropagateThroughFusion) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -322,7 +412,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughFusion) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), fusion1);
   EXPECT_TRUE(OutputsBF16(add));
@@ -335,7 +425,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughFusion) {
 // Tests that changes to BF16 that cannot be propagated outside a fusion are
 // discarded.
 TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -359,7 +449,7 @@ TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(PropagatePrecision(module));
+  EXPECT_FALSE(PropagatePrecision(module.get()));
   EXPECT_EQ(computation->root_instruction(), fusion);
 }
 
@@ -374,7 +464,7 @@ TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) {
 //   (BF16, BF16) fusion_computation(F32 a, F32 b)
 //     = tuple(BF16 convert(a), BF16 add(F32 a, F32 b))
 TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -405,7 +495,7 @@ TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(OutputsBF16(gte0));
@@ -424,7 +514,7 @@ TEST_F(BFloat16PropagationTest, ConvertTupleFusionElementIfUsedByAdd) {
 // on_true and on_false must match, so that as long as one of them is F32, the
 // other must be F32 as well.
 TEST_F(BFloat16PropagationTest, SelectOverTuples) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
 
@@ -455,7 +545,7 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) {
 
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_FALSE(OutputsBF16(add0));
@@ -468,7 +558,7 @@ TEST_F(BFloat16PropagationTest, SelectOverTuples) {
 // Tests that BF16 is propagated properly through a while computation with
 // non-tuple input/output.
 TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -511,7 +601,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
   auto dot = builder.AddInstruction(CreateDot(shape, while_hlo, while_hlo));
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(
@@ -527,7 +617,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
 // made to the while body and thus the fusion node inside it.
 TEST_F(BFloat16PropagationTest,
        ConditionPreventsPropagationForFusionInsideWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -576,7 +666,7 @@ TEST_F(BFloat16PropagationTest,
   auto dot = builder.AddInstruction(CreateDot(shape, while_hlo, while_hlo));
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(PropagatePrecision(module));
+  EXPECT_FALSE(PropagatePrecision(module.get()));
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_FALSE(OutputsBF16(add));
   EXPECT_FALSE(OutputsBF16(body_fusion));
@@ -588,7 +678,7 @@ TEST_F(BFloat16PropagationTest,
 // Tests that BF16 is propagated properly through while computations with
 // tuple-shaped input/output.
 TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -656,7 +746,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
   auto dot = builder.AddInstruction(CreateDot(shape, lhs, rhs));
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(OutputsBF16(lhs));
@@ -675,7 +765,7 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
 // Tests that BF16 is not propagated through multiple whiles that invoke the
 // same computation as long as one while prevents the propagation.
 TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
 
@@ -786,7 +876,7 @@ TEST_F(BFloat16PropagationTest, DoNotPropagateWhilesCallingSameComputation) {
   auto dot = builder.AddInstruction(CreateDot(shape, lhs, rhs));
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
   EXPECT_FALSE(OutputsBF16(body_dot));
   EXPECT_FALSE(OutputsBF16(body_rhs));
   EXPECT_FALSE(OutputsBF16(body_lhs));
@@ -825,10 +915,10 @@ TEST_F(BFloat16PropagationTest, NoopConversionRemoved) {
   HloInstruction* add2 = builder.AddInstruction(HloInstruction::CreateBinary(
       bf16_shape, HloOpcode::kAdd, convert0, convert1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), add2);
   EXPECT_EQ(add2->operand(0), add0);
@@ -861,10 +951,10 @@ TEST_F(BFloat16PropagationTest, TupleDomain) {
   HloInstruction* root = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
   EXPECT_EQ(computation->root_instruction(), root);
 
   // test BF16 propagated through domain
@@ -907,10 +997,10 @@ TEST_F(BFloat16PropagationTest, TupleDomainNoPropagation) {
   HloInstruction* root = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, dot, dot));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(PropagatePrecision(module));
+  EXPECT_TRUE(PropagatePrecision(module.get()));
 
   EXPECT_EQ(computation->root_instruction(), root);
   EXPECT_TRUE(OutputsBF16(a_trans));
diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc
index 5b48f10505e78c035608d4c575501e4623218987..2b9502f63a821f3675ddfb506f41bb2390cf4136 100644
--- a/tensorflow/compiler/xla/service/bfloat16_support.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_support.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/bfloat16_support.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 
@@ -107,6 +108,21 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision(
     case HloOpcode::kSelect:
     case HloOpcode::kTupleSelect:
       return operand_index == 1 || operand_index == 2;
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow: {
+      HloComputation* reduce_comp = hlo.called_computations()[0];
+      for (HloInstruction* inst : reduce_comp->instructions()) {
+        if (inst->opcode() == HloOpcode::kParameter) {
+          continue;
+        }
+        for (int64 i = 0; i < inst->operand_count(); ++i) {
+          if (!EffectiveOperandPrecisionIsOutputPrecision(*inst, i)) {
+            return false;
+          }
+        }
+      }
+      return true;
+    }
     default:
       break;
   }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 2c2d1626c2c0d5d4b13e401dad9fd6c51514fc13..8d7c62447852fd946440c41389300a92377c471f 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -239,7 +239,7 @@ BufferAllocation::Slice BufferAllocation::GetSlice(
 
 void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset,
                                      int64 size) {
-  VLOG(4) << "Trying to add " << buffer << " to " << this;
+  VLOG(4) << "Trying to add " << buffer << " to allocation #" << index();
   CHECK(assigned_buffers_.count(&buffer) == 0)
       << "LogicalBuffer " << buffer << " already assigned to allocation "
       << index_;
@@ -378,6 +378,20 @@ const BufferAllocation& BufferAssignment::GetAllocation(
   return allocations_[index];
 }
 
+const BufferAllocation* BufferAssignment::GetInstructionAllocation(
+    const HloInstruction* hlo, const ShapeIndex& shape_index) const {
+  const PointsToSet& points_to_set = points_to_analysis().GetPointsToSet(hlo);
+  const LogicalBuffer* buffer = points_to_set.element(shape_index)[0];
+
+  if (!HasAllocation(*buffer)) {
+    return nullptr;
+  }
+
+  const BufferAllocation& instruction_allocation =
+      GetAssignedAllocation(*buffer);
+  return &instruction_allocation;
+}
+
 BufferAllocation* BufferAssignment::GetMutableAllocation(
     BufferAllocation::Index index) {
   return const_cast<BufferAllocation*>(&GetAllocation(index));
@@ -514,6 +528,9 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation,
   TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer));
 
   allocation->AddAssignment(buffer, offset, size);
+  if (liveness().MaybeLiveOut(buffer)) {
+    allocation->set_maybe_live_out(true);
+  }
   allocation_index_for_buffer_[&buffer] = allocation->index();
 }
 
@@ -624,7 +641,7 @@ Status BufferAssignment::ComputeSummaryStats() {
   bool schedule_complete = true;
   for (const auto& computation : module_->computations()) {
     if (!computation->IsFusionComputation()) {
-      const std::vector<const HloInstruction*>* sequence =
+      const HloInstructionSequence* sequence =
           liveness_->hlo_ordering().SequentialOrder(*computation);
       if (sequence == nullptr) {
         schedule_complete = false;
@@ -728,14 +745,89 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
     LogicalBuffer::SizeFunction buffer_size,
     LogicalBuffer::AlignmentFunction color_alignment,
     bool allow_input_output_aliasing, bool allocate_buffers_for_constants,
-    BufferLiveness::Colorer colorer) {
-  BufferAssigner assigner(allow_input_output_aliasing,
-                          allocate_buffers_for_constants, std::move(colorer));
+    BufferLiveness::Colorer colorer, ReuseAllocationFunction reuse_checker) {
+  BufferAssigner assigner(allocate_buffers_for_constants, std::move(colorer),
+                          std::move(reuse_checker));
   return assigner.CreateAssignment(module, std::move(hlo_ordering),
                                    std::move(buffer_size),
                                    std::move(color_alignment));
 }
 
+namespace {
+
+// a and b are in different subcomputations. Check for the case
+// where a is inside the while body, and b is outside, part of the same while's
+// init-operand or while-result.
+bool MayInterfereAcrossSubcomputations(BufferAssignment* assignment,
+                                       const LogicalBuffer& a_buffer,
+                                       const LogicalBuffer& b_buffer) {
+  auto call_graph = assignment->liveness().hlo_ordering().call_graph();
+  const HloInstruction* a_ancestor;
+  const HloInstruction* b_ancestor;
+  std::tie(a_ancestor, b_ancestor) =
+      call_graph.NearestAncestorsInSameComputation(a_buffer.instruction(),
+                                                   b_buffer.instruction());
+  if (a_ancestor == nullptr) {
+    // No common ancestor.
+    return true;
+  }
+  if (a_ancestor->opcode() == HloOpcode::kWhile &&
+      call_graph.InstructionIsNestedIn(a_buffer.instruction(),
+                                       a_ancestor->while_body())) {
+    const PointsToSet& init_set =
+        assignment->liveness().points_to_analysis().GetPointsToSet(
+            a_ancestor->operand(0));
+    if (init_set.ContainsBuffer(b_buffer)) {
+      VLOG(4) << "Can't interfere: " << a_buffer << " and " << b_buffer
+              << " (part of while-operand)";
+      return false;
+    }
+    const PointsToSet& while_set =
+        assignment->liveness().points_to_analysis().GetPointsToSet(a_ancestor);
+    if (while_set.ContainsBuffer(b_buffer)) {
+      VLOG(4) << "Can't interfere: " << a_buffer << " and " << b_buffer
+              << " (part of while)";
+      return false;
+    }
+  }
+  return true;
+}
+
+// Return true, if a and b can't possibly interfere (and therefore further
+// checking for interference can be skipped). This function checks for special
+// cases where copy insertion guarantees no interference, but the regular buffer
+// liveness is too conservative:
+//
+// Operations inside a while-body can't interfere with operations outside the
+// while op if their last use is at the while-loop itself as part of the
+// while-init op, or the while-result.  For ops that are live across a
+// while-loop, copy insertion will already insert the necessary copies to avoid
+// such interference.
+//
+// This allows sharing buffers in cases like this:
+// init = {...}
+// while (init):
+//  p = param(0)
+//  gte = get-tuple-element(p), index=i
+//  t1 = op1 (gte)
+//  t2 = op2 (t1)
+//  ROOT tuple = {..., t2, ...}
+//
+// where t1 and t2 can share the same buffer.
+bool MaySkipInterferenceCheck(BufferAssignment* assignment,
+                              const LogicalBuffer& a_buffer,
+                              const LogicalBuffer& b_buffer) {
+  if (a_buffer.instruction()->parent() == b_buffer.instruction()->parent()) {
+    // Ops within the same computation are not handled here. Assume that they
+    // may interfere.
+    return false;
+  }
+  return !MayInterfereAcrossSubcomputations(assignment, a_buffer, b_buffer) ||
+         !MayInterfereAcrossSubcomputations(assignment, b_buffer, a_buffer);
+}
+
+}  // namespace
+
 bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
                                        const LogicalBuffer& buffer,
                                        BufferAssignment* assignment) {
@@ -763,6 +855,12 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
     return false;
   }
 
+  if (reuse_checker_ != nullptr &&
+      !reuse_checker_(*assignment, *allocation, buffer)) {
+    VLOG(4) << "Can't assign: reuse_checker_(allocation, buffer) == false";
+    return false;
+  }
+
   if (!allocation->is_reusable()) {
     VLOG(4) << "Can't assign: allocation is not reusable";
     return false;
@@ -770,6 +868,9 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
 
   for (const auto& buffer_offset_size : allocation->assigned_buffers()) {
     const LogicalBuffer& assigned_buffer = *buffer_offset_size.first;
+    if (MaySkipInterferenceCheck(assignment, buffer, assigned_buffer)) {
+      continue;
+    }
     if (assignment->liveness().MayInterfere(assigned_buffer, buffer)) {
       VLOG(4) << "Can't assign: assignee " << assigned_buffer
               << " may interfere with " << buffer;
@@ -784,21 +885,6 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
     }
   }
 
-  if (allow_input_output_aliasing_ && allocation->maybe_live_out()) {
-    const HloComputation* entry_computation =
-        assignment->module_->entry_computation();
-    for (auto param : entry_computation->parameter_instructions()) {
-      for (auto& param_buffer :
-           assignment->points_to_analysis().GetBuffersDefinedByInstruction(
-               param)) {
-        if (assignment->liveness().MayInterfere(*param_buffer, buffer)) {
-          VLOG(4) << "Can't assign: Parameter interference with result";
-          return false;
-        }
-      }
-    }
-  }
-
   // If the buffer is live out of the computation then it should only be
   // assigned a buffer which exactly fits the result to avoid wasting memory
   // (result buffers can have arbitrary lifetimes).
@@ -1093,7 +1179,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       const HloComputation* computation = pair.first;
       const flat_hash_set<const LogicalBuffer*>& buffers_to_assign =
           pair.second;
-      const std::vector<const HloInstruction*>* instruction_sequence =
+      const HloInstructionSequence* instruction_sequence =
           hlo_ordering.SequentialOrder(*computation);
       CHECK(instruction_sequence != nullptr) << computation->name();
       schedule.set_sequence(computation, *instruction_sequence);
@@ -1128,7 +1214,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       const HloComputation* computation = pair.first;
       const flat_hash_set<const LogicalBuffer*>& buffers_to_assign =
           pair.second;
-      const std::vector<const HloInstruction*>* instruction_sequence =
+      const HloInstructionSequence* instruction_sequence =
           hlo_ordering.SequentialOrder(*computation);
       CHECK(instruction_sequence != nullptr) << computation->name();
       auto color_map = SplitBuffersByColor(buffers_to_assign);
@@ -1143,7 +1229,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
         TF_ASSIGN_OR_RETURN(
             const HeapSimulator::Result result,
             HeapSimulator::Run(get_heap_algorithm(alignment), *computation,
-                               HloInstructionSequence(*instruction_sequence),
+                               *instruction_sequence,
                                assignment->points_to_analysis(),
                                assignment->buffer_size_, options));
         AssignBuffersFromHeapSimulator(result, assignment,
@@ -1347,33 +1433,40 @@ BufferAssigner::MergeColocatedBufferSets(
            computation == module->entry_computation();
   };
 
+  std::vector<bool> set_can_be_merged(colocated_buffer_sets.size(), true);
+
+  // Do not merge if one of the sets includes live outs, entry parameters or
+  // constants.
+  //
+  // Buffer liveness does not report the correct live range for entry
+  // parameter and live out buffers so we have to special case them here.  On
+  // backends that support constant buffer allocations, constant buffers are
+  // assigned globals in readonly storage so we can't merge colocated buffer
+  // sets containing constants with colocated buffer sets containing writing
+  // instructions or other constants.
+  //
+  // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to
+  // the caller of the executable so we can't write to entry parameters
+  // either, and the argument for not merging constants also applies to entry
+  // parameters.
+  for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
+    for (auto& buffer : colocated_buffer_sets[i]) {
+      if (buffer_liveness.MaybeLiveOut(*buffer) ||
+          is_entry_parameter(*buffer) ||
+          buffer->instruction()->opcode() == HloOpcode::kConstant) {
+        set_can_be_merged[i] = false;
+        break;
+      }
+    }
+  }
+
   // Returns true if the two colocated buffer sets (specified by their indices
   // into the colocated_buffer_sets) can be merged into a single set.
   auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness,
                                    &buffer_size,
-                                   &is_entry_parameter](int64 i, int64 j) {
-    // Do not merge if one of the sets includes live outs, entry parameters or
-    // constants.
-    //
-    // Buffer liveness does not report the correct live range for entry
-    // parameter and live out buffers so we have to special case them here.  On
-    // backends that support constant buffer allocations, constant buffers are
-    // assigned globals in readonly storage so we can't merge colocated buffer
-    // sets containing constants with colocated buffer sets containing writing
-    // instructions or other constants.
-    //
-    // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to
-    // the caller of the executable so we can't write to entry parameters
-    // either, and the argument for not merging constants also applies to entry
-    // parameters.
-    for (int64 key : {i, j}) {
-      for (auto& buffer : colocated_buffer_sets[key]) {
-        if (buffer_liveness.MaybeLiveOut(*buffer) ||
-            is_entry_parameter(*buffer) ||
-            buffer->instruction()->opcode() == HloOpcode::kConstant) {
-          return true;
-        }
-      }
+                                   &set_can_be_merged](int64 i, int64 j) {
+    if (!set_can_be_merged[i] || !set_can_be_merged[j]) {
+      return true;
     }
 
     // Colocated sets satisfy the invariant that all buffers within a set have
@@ -1434,13 +1527,30 @@ BufferAssigner::MergeColocatedBufferSets(
 
 // Builds sets of buffers in 'colocated_buffer_sets' which should be colocated
 // in the same allocation (currently just supports kWhile, kCall, and
-// kConditional).
+// kConditional and input output aliasing).
 void BufferAssigner::BuildColocatedBufferSets(
     const HloModule* module, const BufferLiveness& buffer_liveness,
     const LogicalBuffer::SizeFunction& buffer_size,
     std::vector<ColocatedBufferSet>* colocated_buffer_sets) {
   const TuplePointsToAnalysis& points_to_analysis =
       buffer_liveness.points_to_analysis();
+
+  // Set up colocated buffer set for input and output.
+  VLOG(4) << "Input/Output Alias Config: ";
+  VLOG(4) << module->input_output_alias_config();
+  module->input_output_alias_config().ForEachAlias(
+      [&](const ShapeIndex& output_index, int64 param_number,
+          const ShapeIndex& param_index) {
+        std::vector<const LogicalBuffer*> colocated_set;
+        AddBufferToColocatedSet(module->entry_computation()->root_instruction(),
+                                output_index, points_to_analysis,
+                                &colocated_set);
+        AddBufferToColocatedSet(
+            module->entry_computation()->parameter_instruction(param_number),
+            param_index, points_to_analysis, &colocated_set);
+        AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
+      });
+
   for (const HloComputation* computation : module->MakeComputationPostOrder()) {
     if (computation->IsFusionComputation()) {
       continue;
@@ -1574,6 +1684,13 @@ void BufferAssigner::BuildColocatedBufferSets(
     return;
   }
 
+  int64 i = 0;
+  for (const auto& colocated_set : *colocated_buffer_sets) {
+    VLOG(4) << "Colocated set " << i++ << ":";
+    for (const auto& buffer : colocated_set) {
+      VLOG(4) << "  " << buffer->ToString();
+    }
+  }
   // Try to find more coalescing opportunities among the colocated buffer sets.
   //
   // TODO(b/32491382): We should be able to remove this by using the
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 899cd36e1f98c9e7b8ba7e42c06ced5c3e8afcc8..0a9fdede803e84ca42472259084615c031b206eb 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -362,6 +362,11 @@ class BufferAssignment {
   // with the given index.
   const BufferAllocation& GetAllocation(BufferAllocation::Index index) const;
 
+  // Returns the allocation with the given instruction and shape index. nullptr
+  // if no allocation exists.
+  const BufferAllocation* GetInstructionAllocation(
+      const HloInstruction* hlo, const ShapeIndex& shape_index) const;
+
   // Builds and returns a vector containing the slices which might contain the
   // subvalue at the given index of given instruction.
   std::set<BufferAllocation::Slice> GetAllSlices(
@@ -520,6 +525,11 @@ class BufferAssignment {
 // A class which constructs a buffer assignment.
 class BufferAssigner {
  public:
+  // Returns false if a buffer cannot be assigned to given allocation.
+  using ReuseAllocationFunction = std::function<bool(
+      const BufferAssignment& assignment, const BufferAllocation& alloc,
+      const LogicalBuffer& buffer)>;
+
   // Build and return a BufferAssignment for the given module. The given
   // HloOrdering is used to determine buffer liveness. buffer_size and
   // color_alignment are functions which returns the size and alignment of a
@@ -531,15 +541,16 @@ class BufferAssigner {
       LogicalBuffer::AlignmentFunction color_alignment,
       bool allow_input_output_aliasing = false,
       bool allocate_buffers_for_constants = false,
-      BufferLiveness::Colorer colorer = BufferLiveness::DefaultColorer());
+      BufferLiveness::Colorer colorer = BufferLiveness::DefaultColorer(),
+      ReuseAllocationFunction reuse_checker = nullptr);
 
  private:
-  BufferAssigner(bool allow_input_output_aliasing,
-                 bool allocate_buffers_for_constants,
-                 BufferLiveness::Colorer colorer)
-      : allow_input_output_aliasing_(allow_input_output_aliasing),
-        allocate_buffers_for_constants_(allocate_buffers_for_constants),
-        colorer_(colorer) {}
+  BufferAssigner(bool allocate_buffers_for_constants,
+                 BufferLiveness::Colorer colorer,
+                 ReuseAllocationFunction reuse_checker)
+      : allocate_buffers_for_constants_(allocate_buffers_for_constants),
+        colorer_(colorer),
+        reuse_checker_(reuse_checker) {}
   virtual ~BufferAssigner() = default;
 
   // Create a buffer assignment.
@@ -627,16 +638,15 @@ class BufferAssigner {
                       LogicalBuffer::Color::Hasher>
   SplitBuffersByColor(const absl::flat_hash_set<const LogicalBuffer*>& buffers);
 
-  // If true, buffer assignments assumes that input parameter buffers and output
-  // buffers can be shared if their sizes match.
-  bool allow_input_output_aliasing_;
-
   // If true, allocate buffers for constant instructions.
   bool allocate_buffers_for_constants_;
 
   // Functor used to assign colors to newly allocated logical buffers.
   BufferLiveness::Colorer colorer_;
 
+  // Functor to check if a buffer can reuse an allocation.
+  ReuseAllocationFunction reuse_checker_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(BufferAssigner);
 };
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 795beb9ff5ceb2998a85fbd03d8bb1d3b2febc12..8f482e6ba8c3e71c9980be5e6947ea61f3b4ef29 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -38,7 +38,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -81,7 +81,7 @@ const std::vector<const HloInstruction*> GetInstructions(HloInstruction* root) {
   return main_list.GetInstructions();
 }
 
-class BufferAssignmentTest : public HloVerifiedTestBase {
+class BufferAssignmentTest : public HloTestBase {
  protected:
   ~BufferAssignmentTest() override {}
 
@@ -107,6 +107,24 @@ class BufferAssignmentTest : public HloVerifiedTestBase {
         .ConsumeValueOrDie();
   }
 
+  std::unique_ptr<BufferAssignment> RunBufferAssignmentNoBuffersReuseForAdd(
+      HloModule* module, int64 alignment = 1) {
+    auto reuse_checker = [](const BufferAssignment& assignment,
+                            const BufferAllocation& alloc,
+                            const LogicalBuffer& buffer) {
+      return (buffer.instruction()->opcode() != HloOpcode::kAdd);
+    };
+    return BufferAssigner::Run(
+               module, absl::make_unique<DependencyHloOrdering>(module),
+               backend().compiler()->BufferSizeBytesFunction(),
+               [alignment](LogicalBuffer::Color) { return alignment; },
+               /*allow_input_output_aliasing=*/false,
+               /*allocate_buffers_for_constants=*/false,
+               /*colorer=*/BufferLiveness::DefaultColorer(),
+               /*reuse_checker=*/reuse_checker)
+        .ConsumeValueOrDie();
+  }
+
   std::unique_ptr<BufferAssignment> RunColoredBufferAssignment(
       HloModule* module, BufferLiveness::Colorer colorer, int64 alignment = 1) {
     return BufferAssigner::Run(
@@ -119,8 +137,7 @@ class BufferAssignmentTest : public HloVerifiedTestBase {
   }
 
   std::unique_ptr<BufferAssignment> RunBufferAssignmentWithInstructionSequence(
-      HloModule* module,
-      absl::Span<const HloInstruction* const> instruction_sequence,
+      HloModule* module, absl::Span<HloInstruction* const> instruction_sequence,
       int64 alignment = 1) {
     HloSchedule schedule(module);
     schedule.set_sequence(module->entry_computation(), instruction_sequence);
@@ -316,16 +333,16 @@ TEST_F(BufferAssignmentTest, ScalarConstant) {
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   {
-    auto buffers = RunBufferAssignment(module);
+    auto buffers = RunBufferAssignment(module.get());
     EXPECT_TRUE(buffers->HasTopLevelAllocation(const0));
   }
 
   {
-    auto buffers = RunBufferAssignmentNoBuffersForConstants(module);
+    auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get());
     EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
   }
 }
@@ -340,17 +357,17 @@ TEST_F(BufferAssignmentTest, BufferForConst) {
       LiteralUtil::CreateR1<float>({4.1f, 4.2f, 4.3f, 4.4f})));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, const0, const1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   {
-    auto buffers = RunBufferAssignment(module);
+    auto buffers = RunBufferAssignment(module.get());
     EXPECT_TRUE(buffers->HasTopLevelAllocation(const0));
     EXPECT_TRUE(buffers->HasTopLevelAllocation(const1));
     GetAssignedOutputAllocation(*buffers, add);
   }
   {
-    auto buffers = RunBufferAssignmentNoBuffersForConstants(module);
+    auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get());
     EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
     EXPECT_FALSE(buffers->HasTopLevelAllocation(const1));
     GetAssignedOutputAllocation(*buffers, add);
@@ -369,10 +386,10 @@ TEST_F(BufferAssignmentTest, HasAllocationAt) {
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param0));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({negate, param0, constant}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   // Make sure that HasAllocationAt() agrees with what HasTopLevelAllocation()
   // reports for the instruction directly.
   EXPECT_EQ(buffers->HasTopLevelAllocation(tuple),
@@ -392,10 +409,10 @@ TEST_F(BufferAssignmentTest, BufferForOutputConst) {
       LiteralUtil::CreateR1<float>({1.1f, 2.2f, 3.3f, 4.4f})));
   auto copy = builder.AddInstruction(
       HloInstruction::CreateUnary(const0->shape(), HloOpcode::kCopy, const0));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   // The copy node now has an output buffer.
   GetAssignedOutputAllocation(*buffers, copy);
 }
@@ -421,10 +438,10 @@ TEST_F(BufferAssignmentTest, Basic) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kSubtract, add, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
 
   // Distinct input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -447,6 +464,56 @@ TEST_F(BufferAssignmentTest, Basic) {
   GetAssignedOutputAllocation(*buffers, sub);
 }
 
+TEST_F(BufferAssignmentTest, AddCannotReuse) {
+  // Pass in a special rule to indicate that "add" cannot reuse any buffer.
+  //
+  // paramscalar ------- (mul) -- (add) -- (sub)
+  //                     /        /        /
+  // param0[100] -------/        /        /
+  //                            /        /
+  // param1[100] --------------/--------/
+  auto builder = HloComputation::Builder(TestName());
+  auto paramscalar =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p"));
+  auto broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(f32vec100_, paramscalar, {}));
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, f32vec100_, "p1"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, f32vec100_, "p2"));
+  auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec100_, HloOpcode::kMultiply, broadcast, param0));
+  auto add = builder.AddInstruction(
+      HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
+  auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
+      f32vec100_, HloOpcode::kSubtract, add, param1));
+  auto module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
+
+  auto buffers = RunBufferAssignmentNoBuffersReuseForAdd(module.get());
+
+  // Distinct input buffers were assigned for parameters.
+  BufferAllocation paramscalar_buffer =
+      GetAssignedInputAllocation(*buffers, paramscalar);
+  BufferAllocation param0_buffer = GetAssignedInputAllocation(*buffers, param0);
+  BufferAllocation param1_buffer = GetAssignedInputAllocation(*buffers, param1);
+  EXPECT_NE(paramscalar_buffer.index(), param0_buffer.index());
+  EXPECT_NE(paramscalar_buffer.index(), param1_buffer.index());
+  EXPECT_NE(param0_buffer.index(), param1_buffer.index());
+
+  // The mul node has a valid buffer assigned, doesn't share with input.
+  const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul);
+  EXPECT_NE(mul_buffer.index(), param0_buffer.index());
+
+  // The add node cannot reuse the mul node's buffer since we told buffer
+  // assignment so.
+  const BufferAllocation& add_buffer = GetTopLevelAllocation(*buffers, add);
+  EXPECT_NE(add_buffer.index(), mul_buffer.index());
+
+  // The sub node has a valid output buffer assigned.
+  GetAssignedOutputAllocation(*buffers, sub);
+}
+
 TEST_F(BufferAssignmentTest, BasicUniquelyColored) {
   // paramscalar ------- (mul) -- (add) -- (sub)
   //                     /        /        /
@@ -470,7 +537,7 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kSubtract, add, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto colorer = [](const BufferLiveness& buffer_liveness) {
@@ -485,7 +552,7 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) {
     return Status::OK();
   };
 
-  auto buffers = RunColoredBufferAssignment(module, colorer);
+  auto buffers = RunColoredBufferAssignment(module.get(), colorer);
 
   // Distinct input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -531,7 +598,7 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   auto sub = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kSubtract, add, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto colorer = [](const BufferLiveness& buffer_liveness) {
@@ -554,7 +621,7 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) {
     return Status::OK();
   };
 
-  auto buffers = RunColoredBufferAssignment(module, colorer);
+  auto buffers = RunColoredBufferAssignment(module.get(), colorer);
 
   // Distinct input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -603,10 +670,10 @@ TEST_F(BufferAssignmentTest, MultipleUsersForNode) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   auto sub = builder.AddInstruction(
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kSubtract, add, mul));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
 
   // Input buffers were assigned for parameters.
   BufferAllocation paramscalar_buffer =
@@ -638,7 +705,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) {
   // param0[100x10] ---> (map x+1)
   //
   // Builds the map function.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto map_computation =
       module->AddEmbeddedComputation(BuildMapComputationPlus1("f32+1"));
   auto inner_last = map_computation->root_instruction();
@@ -657,7 +724,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) {
   EXPECT_EQ(3, level1.size()) << "Invalid nested add+1 size";
 
   // Assigns buffers and fetches sizes.
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   int64 size0 = ValidateBuffers(level0, *buffers);
   int64 size1 = ValidateBuffers(level1, *buffers);
 
@@ -693,7 +760,7 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) {
   // out-of-order reductions could overwrite an element before a use.)
   //
   // param0[100] --- (exp1) --- (exp2) --- (reduce x+y) --- (exp3)
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto reduce_computation =
       module->AddEmbeddedComputation(BuildReduceComputation("f32+f32"));
 
@@ -716,7 +783,7 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) {
 
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   const std::vector<const HloInstruction*> instrs = GetInstructions(exp3);
   ValidateBuffers(instrs, *buffers);
 
@@ -744,7 +811,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
   // const4[f32[4]] --- tuple --- while[condition, body]
   //
   // Builds the nested condition and body.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto condition_computation =
       module->AddEmbeddedComputation(BuildWhileConditionComputation("if<4"));
   auto body_computation =
@@ -772,7 +839,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
   EXPECT_EQ(8, levelb.size()) << "Invalid nested body size";
 
   // Assigns buffers and fetches sizes.
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   int64 size0 = ValidateBuffers(level0, *buffers);
   int64 sizec = ValidateBuffers(levelc, *buffers);
   int64 sizeb = ValidateBuffers(levelb, *buffers);
@@ -810,7 +877,7 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
 }
 
 TEST_F(BufferAssignmentTest, ExampleConditional) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto true_computation = module->AddEmbeddedComputation(
       BuildR0F32UnaryOpComputation(HloOpcode::kCeil, "Ceil"));
   auto false_computation = module->AddEmbeddedComputation(
@@ -837,7 +904,7 @@ TEST_F(BufferAssignmentTest, ExampleConditional) {
   EXPECT_EQ(2, true_instrs.size());
   EXPECT_EQ(2, false_instrs.size());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   ValidateBuffers(conditional_instrs, *buffers);
   ValidateBuffers(true_instrs, *buffers);
   ValidateBuffers(false_instrs, *buffers);
@@ -873,9 +940,9 @@ TEST_F(BufferAssignmentTest, UnaryOpReuseChain) {
   auto neg = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, exp2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // tanh and exp2 can reuse exp1's buffer
   EXPECT_TRUE(assignment->HasTopLevelAllocation(exp1));
@@ -902,9 +969,9 @@ TEST_F(BufferAssignmentTest, ReuseNonOperandBuffer) {
   auto broadcast = builder.AddInstruction(
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // negate and broadcast should share a buffer.
   EXPECT_TRUE(assignment->HasTopLevelAllocation(broadcast));
@@ -935,9 +1002,9 @@ TEST_F(BufferAssignmentTest, NoReuseLiveBuffer) {
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
   builder.AddInstruction(HloInstruction::CreateTuple({negate, broadcast}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // The instructions should not share buffers.
   EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast),
@@ -972,9 +1039,9 @@ TEST_F(BufferAssignmentTest, NoReuseAliasedBuffer) {
       HloInstruction::CreateBroadcast(f32a100x10_, slice, {1}));
   builder.AddInstruction(HloInstruction::CreateTuple({tuple, broadcast}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // The instructions should not share buffers.
   EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast),
@@ -1007,9 +1074,9 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBuffer) {
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // The broadcast output buffer cannot be shared.
   EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast),
@@ -1039,9 +1106,9 @@ TEST_F(BufferAssignmentTest, ReuseOutputBufferIfExactlySized) {
   auto broadcast = builder.AddInstruction(HloInstruction::CreateBroadcast(
       ShapeUtil::MakeShape(F32, {10, 10}), slice, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // negate and broadcast should share a buffer.
   EXPECT_TRUE(assignment->HasTopLevelAllocation(broadcast));
@@ -1077,9 +1144,9 @@ TEST_F(BufferAssignmentTest, DoNotReuseOversizedOutputBufferInTuple) {
       ShapeUtil::MakeShape(F32, {10, 4}), slice, {0}));
   builder.AddInstruction(HloInstruction::CreateTuple({broadcast}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // The broadcast output buffer cannot be shared.
   EXPECT_NE(GetTopLevelAllocation(*assignment, broadcast),
@@ -1092,7 +1159,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) {
   // Verify that buffers for embedded computations are properly marked as
   // thread-local and that embedded parameters are not marked as
   // is_entry_computation_parameter.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto vec_shape = ShapeUtil::MakeShape(F32, {42});
   auto scalar_shape = ShapeUtil::MakeShape(F32, {});
 
@@ -1123,7 +1190,7 @@ TEST_F(BufferAssignmentTest, EmbeddedComputationBuffers) {
       HloInstruction::CreateMap(vec_shape, {call}, map_computation));
   module->AddEntryComputation(builder.Build());
 
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Allocations for the map computation should be thread-local and not
   // live-out.
@@ -1170,9 +1237,9 @@ TEST_F(BufferAssignmentTest, TupleParameterAsOutput) {
                                  ShapeUtil::MakeShape(S32, {42})}),
       "param0"));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // There should be four allocations: one for vector of pointers, and one for
   // each tuple element.
@@ -1206,9 +1273,9 @@ TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) {
       builder.AddInstruction(HloInstruction::CreateGetTupleElement(
           ShapeUtil::GetSubshape(tuple_param->shape(), {1}), tuple_param, 1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Only some of the elements of the input param are liveout.
   EXPECT_FALSE(
@@ -1250,9 +1317,9 @@ TEST_F(BufferAssignmentTest, TupleConstantAsOutput) {
   builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::MakeTuple({&elements[0], &elements[1]})));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   EXPECT_EQ(3, assignment->Allocations().size());
 }
@@ -1264,9 +1331,9 @@ TEST_F(BufferAssignmentTest, TupleCustomCallAsOutput) {
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(PRED, {1, 2, 3, 4}),
                                  ShapeUtil::MakeShape(S32, {101})}),
       /*operands=*/{}, /*custom_call_target=*/"foo_function"));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   EXPECT_EQ(3, assignment->Allocations().size());
   EXPECT_TRUE(
@@ -1279,7 +1346,7 @@ TEST_F(BufferAssignmentTest, TupleCustomCallAsOutput) {
 
 TEST_F(BufferAssignmentTest, TupleCallAsOutput) {
   // Test a computation which returns a tuple call value.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto elem_shape = f32vec4_;
   auto tuple_shape = ShapeUtil::MakeTupleShape({elem_shape});
 
@@ -1297,7 +1364,7 @@ TEST_F(BufferAssignmentTest, TupleCallAsOutput) {
       HloInstruction::CreateCall(tuple_shape, {param}, sub_computation));
   module->AddEntryComputation(builder.Build());
 
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   EXPECT_EQ(2, assignment->Allocations().size());
   // Buffers for call are colocated with the sub-computation.
@@ -1320,7 +1387,7 @@ TEST_F(BufferAssignmentTest, TupleChainedCallAsOutput) {
   // B: call(C, param)
   // C: call(D, param)
   // D: param
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto elem_shape = f32vec4_;
   auto tuple_shape = ShapeUtil::MakeTupleShape({elem_shape});
 
@@ -1359,7 +1426,7 @@ TEST_F(BufferAssignmentTest, TupleChainedCallAsOutput) {
   module->AddEntryComputation(std::move(a_computation));
   module->AddEmbeddedComputation(std::move(b_computation));
 
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Buffers for call are colocated with the sub-computations.
   EXPECT_EQ(GetAllocation(*assignment, a_call, /*index=*/{}),
@@ -1393,9 +1460,9 @@ TEST_F(BufferAssignmentTest, BitcastAsOutput) {
   auto bitcast = builder.AddInstruction(
       HloInstruction::CreateUnary(param->shape(), HloOpcode::kBitcast, param));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Bitcast should get the same allocation as the param.
   EXPECT_EQ(1, assignment->Allocations().size());
@@ -1420,9 +1487,9 @@ TEST_F(BufferAssignmentTest, AmbiguousBufferAsOutput) {
       HloInstruction::CreateTernary(tuple_shape, HloOpcode::kTupleSelect,
                                     pred_param, tuple_param0, tuple_param1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // Select shallow copies one of its operands so it defines its own top-level
   // buffer and receives its own allocation.
@@ -1458,9 +1525,9 @@ TEST_F(BufferAssignmentTest, TupleBufferNotReused) {
   auto copy = builder.AddInstruction(HloInstruction::CreateUnary(
       scalar_shape, HloOpcode::kCopy, tuple_element));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module);
+  auto assignment = RunBufferAssignment(module.get());
 
   // There should be no buffer reuse. The copy should not reuse the tuple
   // buffer.
@@ -1500,9 +1567,9 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) {
       HloInstruction::CreateConcatenate(shape_5x4, {dot_ab, dot_bc}, 0));
 
   // Run buffer assignment with alignment=1.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
-  auto assignment = RunBufferAssignment(module, /*alignment=*/1);
+  auto assignment = RunBufferAssignment(module.get(), /*alignment=*/1);
 
   // There are 5 allocations: 3 parameters, 1 output, and 1 temp.
   EXPECT_EQ(5, assignment->Allocations().size());
@@ -1521,7 +1588,7 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) {
   EXPECT_EQ(80, slice_bc.allocation()->size());
 
   // Re-run buffer assignment with alignment=64.
-  assignment = RunBufferAssignment(module, /*alignment=*/64);
+  assignment = RunBufferAssignment(module.get(), /*alignment=*/64);
   EXPECT_EQ(5, assignment->Allocations().size());
   slice_ab = assignment->GetUniqueTopLevelSlice(dot_ab).ConsumeValueOrDie();
   slice_bc = assignment->GetUniqueTopLevelSlice(dot_bc).ConsumeValueOrDie();
@@ -1564,10 +1631,10 @@ TEST_F(BufferAssignmentTest, TrivialPeakBuffers) {
       HloInstruction::CreateBinary(f32vec100_, HloOpcode::kAdd, mul, param1));
   builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kSubtract, add, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
 
   const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul);
   const std::vector<const LogicalBuffer*>& peak_buffers =
@@ -1605,11 +1672,11 @@ TEST_F(BufferAssignmentTest, PeakBuffers) {
 
       ShapeUtil::MakeShape(F32, {1}), concat, {0}, {1}, {1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto buffers = RunBufferAssignmentWithInstructionSequence(
-      module, {param, log, rev, neg, concat, root});
+      module.get(), {param, log, rev, neg, concat, root});
 
   // The temporary buffer should hold the 4 interior instructions.
   const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, concat);
@@ -1630,7 +1697,7 @@ TEST_F(BufferAssignmentTest, PeakBuffers) {
 }
 
 TEST_F(BufferAssignmentTest, PeakBuffersWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape shape = ShapeUtil::MakeShape(F32, {123, 123});
   HloComputation* condition;
   {
@@ -1665,7 +1732,7 @@ TEST_F(BufferAssignmentTest, PeakBuffersWhile) {
       ShapeUtil::MakeShape(F32, {123, 123, 123}), bcast, {0}));
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module);
+  auto buffers = RunBufferAssignment(module.get());
   const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, bcast);
   const std::vector<const LogicalBuffer*>& peak_buffers =
       buffer.PeakMemoryLogicalBuffers();
@@ -1715,13 +1782,13 @@ ENTRY main {
 }
 )";
 
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_text));
   HloInstruction* constant_1 =
-      module().entry_computation()->GetInstructionWithName("constant.1.1");
+      m->entry_computation()->GetInstructionWithName("constant.1.1");
   HloInstruction* constant_2 =
-      module().entry_computation()->GetInstructionWithName("constant.1.2");
+      m->entry_computation()->GetInstructionWithName("constant.1.2");
 
-  auto buffers = RunBufferAssignment(&module());
+  auto buffers = RunBufferAssignment(m.get());
 
   {
     const BufferAllocation& allocation_for_const_1 =
@@ -1750,7 +1817,7 @@ ENTRY main {
   }
 }
 
-class WhileBufferAssignmentTest : public HloVerifiedTestBase {
+class WhileBufferAssignmentTest : public HloTestBase {
  protected:
   std::unique_ptr<HloComputation> BuildWhileConditionComputation(
       const string& name) {
@@ -1785,7 +1852,7 @@ class WhileBufferAssignmentTest : public HloVerifiedTestBase {
   std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module,
                                                         int64 alignment = 1) {
     HloSchedule schedule =
-        ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie();
+        ScheduleModule(module, ByteSizeOf).ConsumeValueOrDie();
     return BufferAssigner::Run(
                module, absl::make_unique<SequentialHloOrdering>(schedule),
                ByteSizeOf,
@@ -1810,7 +1877,7 @@ static void RunCopyInsertion(HloModule* module) {
 }
 
 TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -1849,8 +1916,8 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
       HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1));
 
   module->AddEntryComputation(builder.Build());
-  RunCopyInsertion(module);
-  auto assignment = RunBufferAssignment(module);
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
 
   // Verify 'input0' and read-only use while0{0} alias.
   EXPECT_EQ(assignment->GetUniqueSlice(input0, {}).ConsumeValueOrDie(),
@@ -1906,20 +1973,19 @@ ENTRY %test_module {
   ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={}
 })";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str));
 
   // Run CopyInsertion and check if the graph constructed above doesn't need
   // any copies inserted for BufferAssignment to run.
-  int64 instruction_count = module().instruction_count();
+  int64 instruction_count = m->instruction_count();
   CopyInsertion copy_insertion;
-  ASSERT_IS_OK(copy_insertion.Run(&module()).status());
-  ASSERT_EQ(instruction_count, module().instruction_count());
+  ASSERT_IS_OK(copy_insertion.Run(m.get()).status());
+  ASSERT_EQ(instruction_count, m->instruction_count());
 
   // Get the instructions in the module.
-  const HloInstruction* bcast =
-      module().entry_computation()->root_instruction();
+  const HloInstruction* bcast = m->entry_computation()->root_instruction();
   const HloInstruction* param =
-      module().entry_computation()->parameter_instruction(0);
+      m->entry_computation()->parameter_instruction(0);
   ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast);
   const HloInstruction* while1 = bcast->operand(0);
   ASSERT_EQ(while1->opcode(), HloOpcode::kWhile);
@@ -1927,7 +1993,7 @@ ENTRY %test_module {
   ASSERT_EQ(while0->opcode(), HloOpcode::kWhile);
 
   // Run buffer assignment.
-  auto assignment = RunBufferAssignment(&module());
+  auto assignment = RunBufferAssignment(m.get());
   TF_ASSERT_OK_AND_ASSIGN(auto slice_param,
                           assignment->GetUniqueSlice(param, {}));
   TF_ASSERT_OK_AND_ASSIGN(auto slice_while0,
@@ -1974,20 +2040,19 @@ ENTRY %test_module {
   ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={}
 })";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str));
 
   // Run CopyInsertion and check if the graph constructed above doesn't need
   // any copies inserted for BufferAssignment to run.
-  int64 instruction_count = module().instruction_count();
+  int64 instruction_count = m->instruction_count();
   CopyInsertion copy_insertion;
-  ASSERT_IS_OK(copy_insertion.Run(&module()).status());
-  ASSERT_EQ(instruction_count, module().instruction_count());
+  ASSERT_IS_OK(copy_insertion.Run(m.get()).status());
+  ASSERT_EQ(instruction_count, m->instruction_count());
 
   // Get the instructions in the module.
-  const HloInstruction* bcast =
-      module().entry_computation()->root_instruction();
+  const HloInstruction* bcast = m->entry_computation()->root_instruction();
   const HloInstruction* constant =
-      module().entry_computation()->GetInstructionWithName("constant.42");
+      m->entry_computation()->GetInstructionWithName("constant.42");
   ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast);
   const HloInstruction* while1 = bcast->operand(0);
   ASSERT_EQ(while1->opcode(), HloOpcode::kWhile);
@@ -1995,7 +2060,7 @@ ENTRY %test_module {
   ASSERT_EQ(while0->opcode(), HloOpcode::kWhile);
 
   // Run buffer assignment.
-  auto assignment = RunBufferAssignment(&module());
+  auto assignment = RunBufferAssignment(m.get());
   TF_ASSERT_OK_AND_ASSIGN(auto slice_constant,
                           assignment->GetUniqueSlice(constant, {}));
   TF_ASSERT_OK_AND_ASSIGN(auto slice_while0,
@@ -2053,7 +2118,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
   };
 
   // Build the entry computation as described in the comment above.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("entry");
 
   auto token = builder.AddInstruction(HloInstruction::CreateToken());
@@ -2088,7 +2153,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
   // any copies inserted for BufferAssignment to run.
   int64 instruction_count = module->instruction_count();
   CopyInsertion copy_insertion;
-  ASSERT_IS_OK(copy_insertion.Run(module).status());
+  ASSERT_IS_OK(copy_insertion.Run(module.get()).status());
   ASSERT_EQ(instruction_count, module->instruction_count());
 
   // Create a sequential order among all the instructions in the entry
@@ -2096,7 +2161,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
   // nodes are traversed during BufferAssignment.
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape(),
                                      /*pointer_size=*/sizeof(void*));
       }));
@@ -2107,12 +2172,12 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto assignment,
-      BufferAssigner::Run(module,
-                          absl::make_unique<SequentialHloOrdering>(schedule),
-                          backend().compiler()->BufferSizeBytesFunction(),
-                          [](LogicalBuffer::Color) { return 1; },
-                          /*allow_input_output_aliasing=*/false,
-                          /*allocate_buffers_for_constants=*/true));
+      BufferAssigner::Run(
+          module.get(), absl::make_unique<SequentialHloOrdering>(schedule),
+          backend().compiler()->BufferSizeBytesFunction(),
+          [](LogicalBuffer::Color) { return 1; },
+          /*allow_input_output_aliasing=*/false,
+          /*allocate_buffers_for_constants=*/true));
 
   // The result tuple elements must be assigned with different buffers.
   TF_ASSERT_OK_AND_ASSIGN(auto slice0, assignment->GetUniqueSlice(tuple, {0}));
@@ -2134,7 +2199,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
 }
 
 TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -2166,8 +2231,8 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
       HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, while0));
 
   module->AddEntryComputation(builder.Build());
-  RunCopyInsertion(module);
-  auto assignment = RunBufferAssignment(module);
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
 
   // while0 and while1 buffers should be completely aligned.
   EXPECT_EQ(assignment->GetUniqueSlice(while0, {0}).ConsumeValueOrDie(),
@@ -2179,7 +2244,7 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
 }
 
 TEST_F(BufferAssignmentTest, TwoCalls) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   Shape r0f32 = ShapeUtil::MakeShape(xla::F32, {});
   HloComputation* sub_computation;
   {
@@ -2209,13 +2274,13 @@ TEST_F(BufferAssignmentTest, TwoCalls) {
 
   {
     FlattenCallGraph flatten;
-    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get()));
     EXPECT_TRUE(result);
-    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   }
 
-  RunCopyInsertion(module);
-  auto assignment = RunBufferAssignment(module);
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
 
   EXPECT_TRUE(BuffersDistinct({call1}, {call2}, *assignment));
 }
@@ -2240,13 +2305,14 @@ ENTRY Main {
 )";
 
   HloModuleConfig config;
-  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-  ParseAndVerifyModule(hlo_text, config);
+  config.set_debug_options(GetDebugOptionsFromFlags());
+  TF_ASSERT_OK_AND_ASSIGN(auto m,
+                          ParseAndReturnVerifiedModule(hlo_text, config));
 
-  auto buffers = RunBufferAssignment(&module());
+  auto buffers = RunBufferAssignment(m.get());
 
-  HloComputation* main = module().entry_computation();
-  HloComputation* callee = module().GetComputationWithName("Callee");
+  HloComputation* main = m->entry_computation();
+  HloComputation* callee = m->GetComputationWithName("Callee");
   EXPECT_NE(callee, nullptr);
 
   HloInstruction* param0 = callee->parameter_instruction(0);
@@ -2270,7 +2336,7 @@ ENTRY Main {
 }
 
 TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto zero = builder.AddInstruction(
@@ -2317,40 +2383,41 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
 
   {
     FlattenCallGraph flatten;
-    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get()));
     EXPECT_TRUE(result);
   }
 
-  RunCopyInsertion(module);
+  RunCopyInsertion(module.get());
 
   HloSchedule schedule =
-      ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie();
+      ScheduleModule(module.get(), ByteSizeOf).ConsumeValueOrDie();
 
   // To trigger b/38494731, we want a specific Hlo schedule for the
   // root computation, so we overwrite that entry with a manually
   // crafted sequence.
-  schedule.set_sequence(module->entry_computation(),
-                        {input1, weights1, one, output1, while1->operand(0),
-                         while1, input0, weights0, zero, output0,
-                         while0->operand(0), while0, gte0, gte1, root_add});
+  schedule.set_sequence(
+      module->entry_computation(),
+      {input1, weights1, one, output1, while1->mutable_operand(0), while1,
+       input0, weights0, zero, output0, while0->mutable_operand(0), while0,
+       gte0, gte1, root_add});
 
   // If this ASSERT fails, we constructed a bogus sequence above and this test
   // itself is buggy.
   TF_ASSERT_OK(schedule.Verify());
 
   auto assignment =
-      BufferAssigner::Run(module,
-                          absl::make_unique<SequentialHloOrdering>(schedule),
-                          ByteSizeOf, [](LogicalBuffer::Color) { return 1; },
-                          /*allow_input_output_aliasing=*/false,
-                          /*allocate_buffers_for_constants=*/true)
+      BufferAssigner::Run(
+          module.get(), absl::make_unique<SequentialHloOrdering>(schedule),
+          ByteSizeOf, [](LogicalBuffer::Color) { return 1; },
+          /*allow_input_output_aliasing=*/false,
+          /*allocate_buffers_for_constants=*/true)
           .ConsumeValueOrDie();
 
   EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
 }
 
 TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -2394,8 +2461,8 @@ TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) {
       HloInstruction::CreateGetTupleElement(data_shape_, while1, 2));
 
   module->AddEntryComputation(builder.Build());
-  RunCopyInsertion(module);
-  auto assignment = RunBufferAssignment(module);
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
   // Get BufferAllocation for root instruction.
   auto* root_alloc = assignment->GetUniqueTopLevelSlice(while1_out)
                          .ConsumeValueOrDie()
@@ -2406,5 +2473,58 @@ TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) {
   EXPECT_FALSE(root_alloc->is_entry_computation_parameter());
 }
 
+TEST_F(WhileBufferAssignmentTest, WhileWithDynamicUpdateSliceShare) {
+  const char* const hlo_string = R"(
+HloModule test
+
+while_body {
+  state = (s32[], f32[1280,1,128]{2,1,0}) parameter(0)
+  constant.1 = f32[] constant(0)
+  broadcast.6 = f32[128,1,128]{2,1,0} broadcast(constant.1), dimensions={}
+  get-tuple-element.4 = f32[1280,1,128]{2,1,0} get-tuple-element(state), index=1
+  get-tuple-element.3 = s32[] get-tuple-element(state), index=0
+  constant.2 = s32[] constant(128)
+  add.5 = s32[] add(get-tuple-element.3, constant.2)
+  constant.3 = s32[3]{0} constant({0, 0, 0})
+  dynamic-update-slice.5 = f32[1280,1,128]{2,1,0} dynamic-update-slice(get-tuple-element.4, broadcast.6, constant.3)
+  dynamic-update-slice.9 = f32[1280,1,128]{2,1,0} dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3)
+  ROOT tuple.85 = (s32[], s32[], s32[2]{0}, f32[1280,1,128]{2,1,0}) tuple(add.5, dynamic-update-slice.9)
+}
+
+while_condition {
+  state = (s32[], f32[1280,1,128]{2,1,0}) parameter(0)
+  get-tuple-element = s32[] get-tuple-element(state), index=0
+  get-tuple-element.1 = s32[] constant(3)
+  ROOT less-than.339.338 = pred[] less-than(get-tuple-element, get-tuple-element.1)
+}
+
+ENTRY entry_computation {
+  constant.7 = s32[] constant(0)
+  copy.1 = s32[] copy(constant.7)
+  constant.6 = f32[] constant(0)
+  broadcast.6 = f32[1280,1,128]{2,1,0} broadcast(constant.6), dimensions={}
+  tuple.1 = (s32[], f32[1280,1,128]{2,1,0}) tuple(copy.1, broadcast.6)
+  while.0 = (s32[], f32[1280,1,128]{2,1,0}) while(tuple.1), condition=while_condition, body=while_body
+  ROOT get-tuple-element.2 = s32[] get-tuple-element(while.0), index=0
+}
+
+)";
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  auto module = module_or_status.ConsumeValueOrDie();
+
+  RunCopyInsertion(module.get());
+  auto assignment = RunBufferAssignment(module.get());
+  // Get BufferAllocation for root instruction.
+  auto dus9 = FindInstruction(module.get(), "dynamic-update-slice.9");
+  auto dus9_alloc_slice =
+      assignment->GetUniqueTopLevelSlice(dus9).ConsumeValueOrDie();
+  auto dus5 = FindInstruction(module.get(), "dynamic-update-slice.5");
+  auto dus5_alloc_slice =
+      assignment->GetUniqueTopLevelSlice(dus5).ConsumeValueOrDie();
+  // Test that the two dynamic-update-slice ops share the same allocation slice.
+  EXPECT_EQ(dus9_alloc_slice.allocation(), dus5_alloc_slice.allocation());
+  EXPECT_EQ(dus9_alloc_slice, dus5_alloc_slice);
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index 17e50905059ad2c92784d14132c1cb1f46f35ade..40825a78716b1c0b9fb0121787977d275891c0f8 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -117,7 +117,7 @@ TEST_F(BufferLivenessTest, ElementwiseChain) {
   auto log = builder.AddInstruction(
       HloInstruction::CreateUnary(vec_, HloOpcode::kLog, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -164,7 +164,7 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry = module->AddEntryComputation(builder.Build());
 
   HloSchedule schedule(module.get());
@@ -213,7 +213,7 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) {
   auto reverse =
       builder.AddInstruction(HloInstruction::CreateReverse(vec_, negate, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -247,7 +247,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -289,7 +289,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloSchedule schedule(module.get());
@@ -336,7 +336,7 @@ TEST_F(BufferLivenessTest, RootInstructionIsNotLastInSequentialOrder) {
       HloInstruction::CreateSend(recv_done, token, /*channel_id=*/1));
   auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build(add));
 
   HloSchedule schedule(module.get());
@@ -373,7 +373,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
   auto outer_tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({inner_tuple, exp}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -393,7 +393,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
 
 TEST_F(BufferLivenessTest, EmbeddedComputation) {
   // Test MaybeLiveOut and MayInterfere for embedded computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto embedded_builder = HloComputation::Builder(TestName() + "_embedded");
   auto embedded_param = embedded_builder.AddInstruction(
@@ -450,7 +450,7 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) {
   builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       inner_tuple0.shape(), tuple_constant, 0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -514,7 +514,7 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) {
   auto tuple_root =
       builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(BuildDummyComputation());
   module->AddEmbeddedComputation(builder.Build());
 
@@ -576,7 +576,7 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
   auto tuple_root =
       builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(BuildDummyComputation());
   module->AddEmbeddedComputation(builder.Build());
 
@@ -611,8 +611,8 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
 class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
  protected:
   // Builds and runs a computation (see test case computation graphs below).
-  std::unique_ptr<HloModule> BuildModule(const bool update_uses_tuple_element1,
-                                         const bool fuse_gte0) {
+  std::unique_ptr<VerifiedHloModule> BuildModule(
+      const bool update_uses_tuple_element1, const bool fuse_gte0) {
     auto builder = HloComputation::Builder(TestName());
     // Create param0 Tuple.
     Shape data_shape = ShapeUtil::MakeShape(F32, {8});
@@ -646,7 +646,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
     // Build module and get reference to entry computation.
-    auto module = CreateNewModule();
+    auto module = CreateNewVerifiedModule();
     module->AddEntryComputation(builder.Build());
     auto* computation = module->entry_computation();
     // Create fusion instruction based on number of tuple element 1 users.
@@ -802,7 +802,7 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     auto tuple_root = builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
     // Build module and get reference to entry computation.
-    auto module = CreateNewModule();
+    auto module = CreateNewVerifiedModule();
     module->AddEntryComputation(BuildDummyComputation());
     module->AddEmbeddedComputation(builder.Build());
     // Run BufferLiveness on 'module'.
diff --git a/tensorflow/compiler/xla/service/buffer_value.h b/tensorflow/compiler/xla/service/buffer_value.h
index 69b36463560a1fad4f62687e9014fb3fbe5bbd13..11d8abc5badf7b1a05239ed74a05be0c899e37a1 100644
--- a/tensorflow/compiler/xla/service/buffer_value.h
+++ b/tensorflow/compiler/xla/service/buffer_value.h
@@ -141,6 +141,9 @@ class BufferValue {
   // operator< is required for std::set.
   bool operator<(const BufferValue& other) const { return id_ < other.id_; }
 
+  bool operator==(const BufferValue& other) const { return id_ == other.id_; }
+  bool operator!=(const BufferValue& other) const { return id_ != other.id_; }
+
   virtual string ToString() const = 0;
 
   // TODO(lauj) rename LogicalBufferProto to BufferValueProto.
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index bdd5069632e84fe6c67ca129f726432479ac1b35..7987343bfaf1069fd550909d127e4b11f2124701 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -325,6 +325,15 @@ bool CallGraph::IsFlattened() const {
   return true;
 }
 
+std::vector<HloInstruction*> CallGraph::GetComputationCallers(
+    HloComputation* c) {
+  std::vector<HloInstruction*> callers;
+  for (auto callsite : GetNode(c).caller_callsites()) {
+    callers.push_back(callsite.instruction());
+  }
+  return callers;
+}
+
 std::pair<HloInstruction*, HloInstruction*>
 CallGraph::NearestAncestorsInSameComputation(HloInstruction* a,
                                              HloInstruction* b) const {
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index cb56f4789d06ac33acdaadc8b619b9e37f683d58..05c7c998738f861ee804d1ec87bfa5fb17ddfb74 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -236,6 +236,10 @@ class CallGraph {
   // FlattenCallGraph.
   bool IsFlattened() const;
 
+  // Returns a vector of instructions calling the passed computation.
+  // (Often a vector of size 1.)
+  std::vector<HloInstruction*> GetComputationCallers(HloComputation* c);
+
   string ToString() const;
 
  private:
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index 34f3f914d593bc603c4964663f9cafb70a136fd3..a3ac2568b0f3eec8556a42dbe3c2c64bd8564468 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -31,7 +31,7 @@ namespace {
 
 using ::testing::UnorderedElementsAre;
 
-class CallGraphTest : public HloVerifiedTestBase {
+class CallGraphTest : public HloTestBase {
  protected:
   // Build and return a trivial computation taking and returning a scalar.
   std::unique_ptr<HloComputation> MakeScalarComputation(
@@ -93,10 +93,10 @@ class CallGraphTest : public HloVerifiedTestBase {
 
 TEST_F(CallGraphTest, SingletonComputation) {
   // Test the call graph of a module with a single computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation =
       module->AddEntryComputation(MakeScalarComputation());
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(1, call_graph->nodes().size());
   EXPECT_TRUE(call_graph->IsFlattened());
 
@@ -112,13 +112,13 @@ TEST_F(CallGraphTest, SingletonComputation) {
 TEST_F(CallGraphTest, UnreachableComputation) {
   // Test the call graph of a module with an entry computation and an
   // unreachable computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(MakeScalarComputation());
   HloComputation* unreachable_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(2, call_graph->nodes().size());
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
@@ -134,13 +134,13 @@ TEST_F(CallGraphTest, UnreachableComputation) {
 TEST_F(CallGraphTest, ParallelComputation) {
   // Test a call graph of a module with an entry computation which calls another
   // computation in a parallel context via kMap.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* map_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
   HloComputation* entry_computation = module->AddEntryComputation(
       MakeMappingComputation(map_computation, /*callsites=*/5));
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(2, call_graph->nodes().size());
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
@@ -163,13 +163,13 @@ TEST_F(CallGraphTest, ParallelComputation) {
 TEST_F(CallGraphTest, SequentialComputations) {
   // Test a call graph of a module with an entry computation which calls another
   // computation in a sequential context via kCall.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* called_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
   HloComputation* entry_computation = module->AddEntryComputation(
       MakeCallingComputation(called_computation, /*callsites=*/3));
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(2, call_graph->nodes().size());
 
   // The called computation is only called from one other computation, but there
@@ -196,7 +196,7 @@ TEST_F(CallGraphTest, SequentialComputations) {
 TEST_F(CallGraphTest, ContextBothComputations) {
   // Test a call graph of a module with an entry computation which calls another
   // computation in both a parallel and sequential context.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* subcomputation =
       module->AddEmbeddedComputation(MakeScalarComputation());
 
@@ -210,7 +210,7 @@ TEST_F(CallGraphTest, ContextBothComputations) {
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(2, call_graph->nodes().size());
 
   EXPECT_FALSE(call_graph->IsFlattened());
@@ -239,7 +239,7 @@ TEST_F(CallGraphTest, ContextBothComputations) {
 
 TEST_F(CallGraphTest, ComputationWithConditional) {
   // Test a call graph of a module with a conditional.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* true_computation =
       module->AddEmbeddedComputation(MakeScalarComputation(HloOpcode::kCeil));
   HloComputation* false_computation =
@@ -259,7 +259,7 @@ TEST_F(CallGraphTest, ComputationWithConditional) {
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
 
   EXPECT_EQ(3, call_graph->nodes().size());
 
@@ -298,7 +298,7 @@ TEST_F(CallGraphTest, ComplexGraph) {
   //    c
   //
   // Calls are made via kCall, kWhile, and kMap instructions.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* cond_computation =
       module->AddEmbeddedComputation(MakeConditionComputation());
   HloComputation* c_computation =
@@ -328,7 +328,7 @@ TEST_F(CallGraphTest, ComplexGraph) {
     entry_computation = module->AddEntryComputation(builder.Build());
   }
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(5, call_graph->nodes().size());
   EXPECT_FALSE(call_graph->IsFlattened());
 
@@ -418,7 +418,7 @@ TEST_F(CallGraphTest, ComplexGraphNearestAncestors) {
   //    c
   //
   // Calls are made via kCall, kWhile, and kMap instructions.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* cond_computation =
       module->AddEmbeddedComputation(MakeConditionComputation());
   HloComputation* c_computation =
@@ -452,7 +452,7 @@ TEST_F(CallGraphTest, ComplexGraphNearestAncestors) {
     entry_computation = module->AddEntryComputation(builder.Build());
   }
 
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(5, call_graph->nodes().size());
 
   // Verify NearestAncestorsInSameComputation for various instructions in the
@@ -479,10 +479,10 @@ TEST_F(CallGraphTest, ComplexGraphNearestAncestors) {
 
 TEST_F(CallGraphTest, VisitSingletonComputation) {
   // Test the call graph visitor with a call graph with a single node.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation =
       module->AddEntryComputation(MakeScalarComputation());
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
 
   std::vector<HloComputation*> visited;
   TF_ASSERT_OK(call_graph->VisitNodes([&visited](const CallGraphNode& node) {
@@ -494,12 +494,12 @@ TEST_F(CallGraphTest, VisitSingletonComputation) {
 
 TEST_F(CallGraphTest, VisitUnreachableComputation) {
   // Test the call graph visitor with a call graph with an unreachable node.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(MakeScalarComputation());
   HloComputation* unreachable_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
 
   // Test visitation of only reachable nodes.
   {
@@ -531,9 +531,9 @@ TEST_F(CallGraphTest, VisitUnreachableComputation) {
 
 TEST_F(CallGraphTest, VisitWithError) {
   // Test that the call graph visitor properly propagates errors.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(MakeScalarComputation());
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
 
   Status status = call_graph->VisitNodes(
       [](const CallGraphNode&) { return InternalError("Visitation failed"); });
diff --git a/tensorflow/compiler/xla/service/call_inliner_test.cc b/tensorflow/compiler/xla/service/call_inliner_test.cc
index e6b566543594a86eb5369ee9b7440f62618f6c5a..0b6e323f75c7a5dae127e20d2a4b92a83a72df3b 100644
--- a/tensorflow/compiler/xla/service/call_inliner_test.cc
+++ b/tensorflow/compiler/xla/service/call_inliner_test.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -40,7 +40,7 @@ namespace {
 
 // Tests for call inlining that are most tractable at the HLO level (vs
 // ComputationBuilder API in call_test.cc).
-using CallInlinerTest = HloVerifiedTestBase;
+using CallInlinerTest = HloTestBase;
 
 TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) {
   // "inner" computation just has a control dependency from the "zero" value to
@@ -51,7 +51,7 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) {
   HloInstruction* one = inner.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
   TF_ASSERT_OK(zero->AddControlDependencyTo(one));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* inner_computation =
       module->AddEmbeddedComputation(inner.Build());
 
@@ -64,7 +64,7 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) {
   auto computation = module->AddEntryComputation(outer.Build());
 
   CallInliner call_inliner;
-  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module.get()));
   ASSERT_TRUE(mutated);
   EXPECT_THAT(computation->root_instruction(), op::Constant());
   EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement<float>(),
@@ -79,7 +79,7 @@ TEST_F(CallInlinerTest, ControlDependenciesAreCarriedToCaller) {
 // returns false should be identical to just returning false).
 TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) {
   const Shape pred = ShapeUtil::MakeShape(PRED, {});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   // Create a lambda that calls a function that returns the false predicate.
   // Note we also use this lambda twice by reference, just to make the test a
@@ -107,7 +107,7 @@ TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) {
   auto computation = module->AddEntryComputation(outer.Build());
 
   CallInliner call_inliner;
-  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module.get()));
   ASSERT_TRUE(mutated);
   EXPECT_THAT(
       computation->root_instruction()->while_condition()->root_instruction(),
@@ -120,7 +120,7 @@ TEST_F(CallInlinerTest, CallsWithinWhileBodiesAreInlined) {
 // whole pass.
 TEST_F(CallInlinerTest, InlineWithoutRunningPass) {
   const Shape pred = ShapeUtil::MakeShape(PRED, {});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder just_false(TestName() + ".false");
   auto* true_constant = just_false.AddInstruction(
@@ -144,7 +144,7 @@ TEST_F(CallInlinerTest, InlineWithoutRunningPass) {
 
 TEST_F(CallInlinerTest, CallToOutfeedComputationIsInlined) {
   const Shape f32 = ShapeUtil::MakeShape(F32, {});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder outfeeder(TestName() + ".outfeeder");
   auto value = outfeeder.AddInstruction(
@@ -163,7 +163,7 @@ TEST_F(CallInlinerTest, CallToOutfeedComputationIsInlined) {
   module->AddEntryComputation(outer.Build());
 
   CallInliner call_inliner;
-  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool mutated, call_inliner.Run(module.get()));
   ASSERT_TRUE(mutated);
 }
 
diff --git a/tensorflow/compiler/xla/service/compilation_cache.cc b/tensorflow/compiler/xla/service/compilation_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2662fe46705f4936ce0d654df0943e7d30890ebe
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compilation_cache.cc
@@ -0,0 +1,70 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/compilation_cache.h"
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace {
+
+int64 GetUniqueId() {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  static int64 counter = 0;
+  tensorflow::mutex_lock loc(mu);
+  const int64 id = counter++;
+  return id;
+}
+
+}  // namespace
+
+ExecutionHandle CompilationCache::Insert(
+    std::unique_ptr<Executable> executable) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  CacheKey key = GetUniqueId();
+  VLOG(2) << "inserting cache key: " << key;
+  CHECK_EQ(cache_.count(key), 0);
+  cache_.emplace(key, std::move(executable));
+
+  ExecutionHandle handle;
+  handle.set_handle(key);
+  return handle;
+}
+
+StatusOr<std::shared_ptr<Executable>> CompilationCache::LookUp(
+    const ExecutionHandle& handle) const {
+  tensorflow::mutex_lock lock(mutex_);
+
+  CacheKey key = handle.handle();
+  VLOG(2) << "looking up cache key: " << key;
+  if (cache_.count(key) == 0) {
+    VLOG(2) << "cache key not found: " << key;
+    return InvalidArgumentStrCat("can not find executable with handle ", key);
+  } else {
+    auto& result = cache_.at(key);
+    VLOG(2) << "hit executable: " << result->module().name();
+    return result;
+  }
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compilation_cache.h b/tensorflow/compiler/xla/service/compilation_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f94def509d4d4a8950272cb498af5056a698ce0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compilation_cache.h
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_
+
+#include <map>
+#include <memory>
+#include <string>
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/compiler/xla/service/executable.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+
+namespace xla {
+
+// A cache which stores Executables indexed by computation handle and version.
+//
+// TODO(b/119042872): Provide mechanism for removing computations from the
+// compilation cache.
+class CompilationCache {
+ public:
+  CompilationCache() {}
+
+  ExecutionHandle Insert(std::unique_ptr<Executable> executable);
+
+  // Lookup the Executable for the specified handle in the cache. Return a
+  // shared_ptr to the Executable if it exists in the cache.
+  StatusOr<std::shared_ptr<Executable>> LookUp(
+      const ExecutionHandle& handle) const;
+
+ protected:
+  mutable tensorflow::mutex mutex_;
+
+  using CacheKey = int64;
+
+  absl::flat_hash_map<CacheKey, std::shared_ptr<Executable>> cache_
+      GUARDED_BY(mutex_);
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CompilationCache);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index 96bd2616f5607de888a096f8392ceb68490276e3..1965925fa7f6d50b1d7af918bc3468d4b4d5d0a2 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
@@ -67,7 +67,7 @@ CompileOnlyService::CompileAheadOfTime(
     std::unique_ptr<AotCompilationMetadata>* metadata) {
   std::vector<std::unique_ptr<HloModule>> hlo_modules;
   for (const AotXlaComputationInstance& instance : computations) {
-    TF_RET_CHECK(instance.computation.has_program_shape());
+    TF_RET_CHECK(instance.computation.has_host_program_shape());
 
     const DebugOptions& debug_options = options.debug_options();
 
@@ -86,13 +86,15 @@ CompileOnlyService::CompileAheadOfTime(
           Executable::DumpToDirectory(per_host_path, filename, hlo_snapshot));
     }
 
-    const auto& program_shape = instance.computation.program_shape();
     ExecutionOptions execution_options;
     *execution_options.mutable_debug_options() = debug_options;
+    *execution_options.mutable_shape_with_output_layout() =
+        instance.result_layout->ToProto();
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
-        CreateModuleConfig(program_shape, instance.argument_layouts,
-                           &execution_options));
+        CreateModuleConfig(
+            ProgramShape(instance.computation.host_program_shape()),
+            instance.argument_layouts, &execution_options));
 
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModule> hlo_module,
@@ -101,8 +103,10 @@ CompileOnlyService::CompileAheadOfTime(
     hlo_modules.push_back(std::move(hlo_module));
   }
 
-  return compiler_->CompileAheadOfTime(std::move(hlo_modules), options,
-                                       metadata);
+  return compiler_->CompileAheadOfTime(
+      absl::make_unique<HloModuleGroup>(hlo_modules[0]->name(),
+                                        absl::MakeSpan(hlo_modules)),
+      options, metadata);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index 687ecafe0c308ecc22857fae650c6998677f605d..8f08c244908efb823b3870c19bdc3491fa87d44f 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -45,7 +45,7 @@ Compiler::ComputeDefaultBackendConfig(const HloInstruction& hlo,
 // Define a default version where metadata is not used.
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 Compiler::CompileAheadOfTime(
-    std::vector<std::unique_ptr<HloModule>> modules,
+    std::unique_ptr<HloModuleGroup> module_group,
     const AotCompilationOptions& options,
     std::unique_ptr<AotCompilationMetadata>* metadata) {
   if (metadata != nullptr) {
@@ -53,7 +53,7 @@ Compiler::CompileAheadOfTime(
         "Populating AotCompilationMetadata is not implemented on this "
         "compiler.");
   }
-  return CompileAheadOfTime(std::move(modules), options);
+  return CompileAheadOfTime(std::move(module_group), options);
 }
 
 /* static */ std::map<se::Platform::Id, Compiler::CompilerFactory>*
@@ -110,6 +110,6 @@ Compiler::GetPlatformCompilers() {
 }
 
 AotCompilationOptions::AotCompilationOptions()
-    : debug_options_(legacy_flags::GetDebugOptionsFromFlags()) {}
+    : debug_options_(GetDebugOptionsFromFlags()) {}
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 1fdda31c34a17a16f75e1efada542c2c2ea15038..d4db95da8eb901af8a6675f2991def73ccfe8ee6 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_module_group.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -135,22 +136,35 @@ class Compiler {
       std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator) = 0;
 
+  // Optimizes a HLO module group, a set of module which runs concurrently on
+  // multiple devices potentially communicating data between the modules.
+  virtual Status RunHloPassesOnModuleGroup(
+      HloModuleGroup* module_group,
+      absl::Span<se::StreamExecutor* const> executors,
+      DeviceMemoryAllocator* device_allocator) = 0;
+
   // Compiles the HLO module for execution on a device given by the executor,
   // and returns an executable object or an error status. No HLO passes are
   // applied to module. Generally a module should be passed through RunHloPasses
   // prior to calling this method because some HLO passes are required for
-  // correctness. Takes ownership of the HLO module and is free to transform it.
+  // correctness. Takes ownership of the HLO module.
   //
   // The compiler may optionally specialize to the individual device
   // (not just type of device) indicated by the executor.
   //
   // device_allocator is optional; see RunHloPasses.
-  //
-  // Use the overload below to compile computations that run in parallel.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
       DeviceMemoryAllocator* device_allocator) = 0;
 
+  // Compiles a set of HLO modules that can run in parallel, potentially
+  // communicating data between the modules.
+  virtual StatusOr<std::vector<std::unique_ptr<Executable>>>
+  RunBackendOnModuleGroup(
+      std::unique_ptr<HloModuleGroup> module_group,
+      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
+      DeviceMemoryAllocator* device_allocator) = 0;
+
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
   // sequence of executable objects.
@@ -160,7 +174,7 @@ class Compiler {
   // TODO(b/68666782): Remove this method after adding support for multiple
   // modules to RunHloPasses and RunBackends.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> modules,
+      std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
       DeviceMemoryAllocator* device_allocator) = 0;
 
@@ -184,16 +198,16 @@ class Compiler {
   ComputeDefaultBackendConfig(const HloInstruction& hlo,
                               se::StreamExecutor* executor) const;
 
-  // Compiles the HLO module for ahead-of-time execution.  This is intended for
-  // use in static compilation.
+  // Compiles the HLO module group for ahead-of-time execution.  This is
+  // intended for use in static compilation.
   virtual StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
+  CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                      const AotCompilationOptions& options) = 0;
 
   // Similar to CompileAheadOfTime above but AotCompilationMetadata
   // has an argument that can be populated during compilation.
   virtual StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
+  CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                      const AotCompilationOptions& options,
                      std::unique_ptr<AotCompilationMetadata>* metadata);
 
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index af8f7f1027a40703137d6880a9865449c560a47b..efc893818d03a20d6bd65b7dc1da72ea5da5ceb0 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -56,4 +56,14 @@ string ComputationLayout::ToString() const {
                       result_layout_.ToString());
 }
 
+ProgramShape ComputationLayout::ComputeProgramShape() const {
+  ProgramShape program_shape;
+  for (int64 i = 0; i < parameter_layouts_.size(); ++i) {
+    *program_shape.add_parameters() = parameter_layouts_[i].shape();
+    *program_shape.add_parameter_names() = absl::StrCat("p", i);
+  }
+  *program_shape.mutable_result() = result_layout_.shape();
+  return program_shape;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h
index 6975f387b4864bf28ea0ad23d7d4602b5b346e08..a2fb656677f354fbf85ff613d826cd6be86ba3bf 100644
--- a/tensorflow/compiler/xla/service/computation_layout.h
+++ b/tensorflow/compiler/xla/service/computation_layout.h
@@ -83,6 +83,10 @@ class ComputationLayout {
   // Returns a string representation of this object.
   string ToString() const;
 
+  // Create a ProgramShape proto based on the parameter and result shapes held
+  // within this object.
+  ProgramShape ComputeProgramShape() const;
+
  private:
   std::vector<ShapeLayout> parameter_layouts_;
   ShapeLayout result_layout_;
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index c899ffb9dc562426ef14c0d414469c04debeec70..844b42a38d7539cccd5c4e30071c0ea6693e3bba 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -105,8 +105,6 @@ class ComputationPlacer {
   // Map from platform kind to computation placer singleton.
   static std::map<se::Platform::Id, State>* GetPlatformComputationPlacers();
 
-  se::Platform::Id platform_id_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
 };
 
diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
index c43a31b167d47af3c92ed35fa52594fa5da1e4af..289eb6d90239a72ecc0f3312a7e0e8453f946858 100644
--- a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -37,7 +37,7 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
-class ConditionalSimplifierTest : public HloVerifiedTestBase {
+class ConditionalSimplifierTest : public HloTestBase {
  public:
   // Makes a computation that contains a conditional with constant predicate.
   HloComputation* MakeConditional(HloModule* module);
@@ -96,25 +96,28 @@ HloComputation* ConditionalSimplifierTest::MakeConditional(HloModule* module) {
 }
 
 TEST_F(ConditionalSimplifierTest, ConditionalGetsInlined) {
-  HloComputation* computation = MakeConditional(&module());
-  ASSERT_TRUE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
+  ASSERT_TRUE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Parameter(), op::Constant()));
 }
 
 TEST_F(ConditionalSimplifierTest, ConditionalWithControlDependency) {
-  HloComputation* computation = MakeConditional(&module());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
 
   auto* true_op = computation->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
   TF_ASSERT_OK(
       true_op->AddControlDependencyTo(computation->root_instruction()));
 
-  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsSend) {
-  HloComputation* computation = MakeConditional(&module());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
   auto* conditional = computation->root_instruction();
   ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional);
 
@@ -125,11 +128,12 @@ TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsSend) {
           HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true))),
       token, /*channel_id=*/0));
   true_computation->AddInstruction(HloInstruction::CreateSendDone(send));
-  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsRecv) {
-  HloComputation* computation = MakeConditional(&module());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
   auto* conditional = computation->root_instruction();
   ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional);
 
@@ -138,18 +142,19 @@ TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsRecv) {
   auto* recv = true_computation->AddInstruction(HloInstruction::CreateRecv(
       ShapeUtil::MakeShape(F32, {1}), token, /*channel_id=*/0));
   true_computation->AddInstruction(HloInstruction::CreateRecvDone(recv));
-  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(ConditionalSimplifierTest, NotRemovedIfContainsNonRemovableInstruction) {
-  HloComputation* computation = MakeConditional(&module());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = MakeConditional(m.get());
   auto* conditional = computation->root_instruction();
   ASSERT_EQ(conditional->opcode(), HloOpcode::kConditional);
   auto* false_computation = conditional->false_computation();
   auto token = false_computation->AddInstruction(HloInstruction::CreateToken());
   false_computation->AddInstruction(HloInstruction::CreateInfeed(
       ShapeUtil::MakeShape(F32, {1}), token, "config"));
-  EXPECT_FALSE(ConditionalSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ConditionalSimplifier().Run(m.get()).ValueOrDie());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
index 0ac4a65ec6ae55fabd2b48ea2982b94f9551c8d2..95c7724c3c93507ae61a984301ecfc0111bef192 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
@@ -51,7 +51,8 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
   Status HandleConvolution(HloInstruction* convolution) override;
 
   // Runs the visitor on a computation.
-  static bool Run(HloComputation* computation);
+  static bool Run(HloComputation* computation,
+                  bool canonicalize_depthwise_filter);
 
   // Returns whether any convolution ops were rewritten.
   const bool changed() const { return changed_; }
@@ -59,18 +60,24 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
   ~ConvolutionVisitor() override = default;
 
  private:
-  explicit ConvolutionVisitor(HloComputation* computation)
-      : computation_(computation) {}
+  explicit ConvolutionVisitor(HloComputation* computation,
+                              bool canonicalize_depthwise_filter = false)
+      : computation_(computation),
+        filter_expansion_(!canonicalize_depthwise_filter) {}
 
   // Current HloComputation instance the ConvolutionVisitor is traversing.
   HloComputation* computation_;
 
   // Whether rewrite has occurred.
   bool changed_ = false;
+
+  // Whether filter expansion is required.
+  bool filter_expansion_;
 };
 
-bool ConvolutionVisitor::Run(HloComputation* computation) {
-  ConvolutionVisitor visitor(computation);
+bool ConvolutionVisitor::Run(HloComputation* computation,
+                             bool canonicalize_depthwise_filter) {
+  ConvolutionVisitor visitor(computation, canonicalize_depthwise_filter);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
@@ -135,16 +142,16 @@ std::vector<int32> GetMaskIds(int64 group_size, int64 group_count) {
 // Finally we use the Eq op of these two broadcasted constants and get the
 // desired mask.
 HloInstruction* GetExpandedFilterMask(
-    const Shape& filter_shape, int64 input_feature_dim,
-    int64 output_feature_dim, int64 group_count,
+    const Shape& filter_shape, int64 kernel_input_feature_dim,
+    int64 kernel_output_feature_dim, int64 group_count,
     const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
         add_instruction) {
   Shape expanded_filter_shape =
-      ExpandedFilterShape(filter_shape, group_count, input_feature_dim);
+      ExpandedFilterShape(filter_shape, group_count, kernel_input_feature_dim);
   Shape mask_shape = ShapeUtil::MakeShape(
       S32, AsInt64Slice(expanded_filter_shape.dimensions()));
-  int64 output_feature = filter_shape.dimensions(output_feature_dim);
-  int64 group_size = filter_shape.dimensions(input_feature_dim);
+  int64 output_feature = filter_shape.dimensions(kernel_output_feature_dim);
+  int64 group_size = filter_shape.dimensions(kernel_input_feature_dim);
 
   // Create a 'input_feature' sized linspace and 'output_feature' sized linspace
   // that will be broadcasted into perpendicular dimensions and compared.
@@ -152,15 +159,14 @@ HloInstruction* GetExpandedFilterMask(
       GetMaskIds(group_size, group_count);
   const std::vector<int32> output_feature_filter_mask =
       GetMaskIds(output_feature / group_count, group_count);
-
   auto mask1 = add_instruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>(input_feature_filter_mask)));
-  auto broadcasted_mask1 = add_instruction(
-      HloInstruction::CreateBroadcast(mask_shape, mask1, {input_feature_dim}));
+  auto broadcasted_mask1 = add_instruction(HloInstruction::CreateBroadcast(
+      mask_shape, mask1, {kernel_input_feature_dim}));
   auto mask2 = add_instruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>(output_feature_filter_mask)));
-  auto broadcasted_mask2 = add_instruction(
-      HloInstruction::CreateBroadcast(mask_shape, mask2, {output_feature_dim}));
+  auto broadcasted_mask2 = add_instruction(HloInstruction::CreateBroadcast(
+      mask_shape, mask2, {kernel_output_feature_dim}));
 
   // Compare the broadcasted output feature linspace to the input feature
   // linspace to create a diagonal predicate.
@@ -182,51 +188,203 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
   };
 
   auto dim_numbers = convolution->convolution_dimension_numbers();
-  int64 input_feature_dim = dim_numbers.kernel_input_feature_dimension();
-  int64 group_size = filter->shape().dimensions(input_feature_dim);
-  int64 output_feature_dim = dim_numbers.kernel_output_feature_dimension();
-  auto expanded_filter_shape =
-      ExpandedFilterShape(filter->shape(), group_count, input_feature_dim);
-  HloInstruction* filter_mask = GetExpandedFilterMask(
-      filter->shape(), input_feature_dim, output_feature_dim, group_count, add);
+  int64 kernel_input_feature_dim = dim_numbers.kernel_input_feature_dimension();
+  int64 group_size = filter->shape().dimensions(kernel_input_feature_dim);
+  int64 kernel_output_feature_dim =
+      dim_numbers.kernel_output_feature_dimension();
+  auto expanded_filter_shape = ExpandedFilterShape(filter->shape(), group_count,
+                                                   kernel_input_feature_dim);
+  HloInstruction* filter_mask =
+      GetExpandedFilterMask(filter->shape(), kernel_input_feature_dim,
+                            kernel_output_feature_dim, group_count, add);
   HloInstruction* expanded_filter;
-  // We want to repeat 'filter' in the 'input_feature_dim' dimension
-  // 'group_count' times.
+
   if (group_size == 1) {
+    bool depthwise_separable =
+        (group_count == filter->shape().dimensions(kernel_output_feature_dim));
+    // If the code generator handles depthwise separable convolutions
+    // inherently, then no filter expansion is needed.
+    if (!filter_expansion_ && depthwise_separable) {
+      return Status::OK();
+    }
+    // We want to repeat 'filter' in the 'input_feature_dim' dimension
+    // 'group_count' times.
     Shape reshaped_filter_shape =
-        ShapeUtil::DeleteDimension(input_feature_dim, filter->shape());
+        ShapeUtil::DeleteDimension(kernel_input_feature_dim, filter->shape());
     auto reshaped_filter =
         add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
     std::vector<int64> broadcast_dims;
     for (int64 i = 0; i < filter->shape().dimensions_size(); ++i) {
-      if (i == input_feature_dim) {
+      if (i == kernel_input_feature_dim) {
         continue;
       }
       broadcast_dims.push_back(i);
     }
     expanded_filter = add(HloInstruction::CreateBroadcast(
         expanded_filter_shape, reshaped_filter, broadcast_dims));
+
+    auto zero = add(HloInstruction::CreateConstant(
+        LiteralUtil::Zero(expanded_filter_shape.element_type())));
+    auto zero_filter =
+        add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {}));
+    auto new_filter = add(HloInstruction::CreateTernary(
+        expanded_filter_shape, HloOpcode::kSelect, filter_mask, expanded_filter,
+        zero_filter));
+
+    auto new_convolution = HloInstruction::CreateConvolve(
+        convolution->shape(), convolution->mutable_operand(0), new_filter,
+        /*feature_group_count=*/1, convolution->window(), dim_numbers,
+        convolution->precision_config());
+    TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+        convolution, std::move(new_convolution)));
   } else {
-    // We could possibly also use reshape, broadcast, reshape instead of concat
-    // here, but it would require more complex code, and for depthwise
-    // convolution we would never end up in this branch.
-    std::vector<HloInstruction*> concat_operands(group_count, filter);
-    expanded_filter = add(HloInstruction::CreateConcatenate(
-        expanded_filter_shape, concat_operands, input_feature_dim));
+    int64 activation_input_feature_dim = dim_numbers.input_feature_dimension();
+
+    int64 output_feature =
+        filter->shape().dimensions(kernel_output_feature_dim);
+
+    // If group_count == output_feature, then we map those grouped convolutions
+    // onto depthwise convolution. This is done by adding an additional spatial
+    // dimension to the activations, kernel, and the output.
+    // E.g., we would turn
+    // [2, 12]{B, IF} conv [3, 4]{IF, OF} into
+    // [3, 2, 4]{S, B, IF} depth conv [3, 1, 4]{S, IF, OF}, where S is the
+    // additional spatial dimension. The generated convolution output will be
+    // [1, 2, 4]{S, B, OF} and then reshape the output back to [2, 4] {B, OF}.
+
+    if (group_count == output_feature && !filter_expansion_) {
+      auto filter = convolution->mutable_operand(1);
+      auto activation = convolution->mutable_operand(0);
+
+      // Add spatial dimension to the activation, and reshape.
+      Shape reshaped_activation_shape = activation->shape();
+      ShapeUtil::AppendMajorDimension(group_size, &reshaped_activation_shape);
+
+      int64 new_spatial_dim = reshaped_activation_shape.dimensions().size() - 1;
+
+      reshaped_activation_shape.set_dimensions(activation_input_feature_dim,
+                                               group_count);
+      activation = add(
+          HloInstruction::CreateReshape(reshaped_activation_shape, activation));
+
+      // Add spatial dimension to the filter, and reshape.
+      Shape reshaped_filter_shape = filter->shape();
+      ShapeUtil::AppendMajorDimension(1, &reshaped_filter_shape);
+
+      filter =
+          add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
+
+      Shape new_output_shape = convolution->shape();
+      ShapeUtil::AppendMajorDimension(1, &new_output_shape);
+
+      // Edit convolution dimension numbers. Note that kernel_input_feature_dim
+      // now becomes a spatial dimension, and the newly added dimension of size
+      // 1 is the new kernel_input_feature_dim.
+      dim_numbers.add_input_spatial_dimensions(new_spatial_dim);
+      dim_numbers.add_kernel_spatial_dimensions(kernel_input_feature_dim);
+      dim_numbers.set_kernel_input_feature_dimension(new_spatial_dim);
+      dim_numbers.add_output_spatial_dimensions(new_spatial_dim);
+
+      // Add window for the new spatial dimension.
+      Window new_window = convolution->window();
+      auto* dim = new_window.add_dimensions();
+      dim->set_window_dilation(1);
+      dim->set_base_dilation(1);
+      dim->set_stride(1);
+      dim->set_size(group_size);
+
+      auto new_convolution = add(HloInstruction::CreateConvolve(
+          new_output_shape, activation, filter, group_count, new_window,
+          dim_numbers, convolution->precision_config()));
+
+      // Delete the extra spatial dimension, and reshape.
+      Shape reshaped_convolution_shape =
+          ShapeUtil::DeleteDimension(new_spatial_dim, new_convolution->shape());
+      auto reshaped_convolution = HloInstruction::CreateReshape(
+          reshaped_convolution_shape, new_convolution);
+
+      TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+          convolution, std::move(reshaped_convolution)));
+
+    } else {
+      // The filter expansion mechanism adds zeroes in the kernel.
+      // For an OF = 12, IF = 6, and kernel IF = 2, the expanded filter mask
+      // would look like (IF on the Y-axis, OF on the X-axis)
+      // 1 1 1 1 0 0 0 0 0 0 0 0
+      // 1 1 1 1 0 0 0 0 0 0 0 0
+      // 0 0 0 0 1 1 1 1 0 0 0 0
+      // 0 0 0 0 1 1 1 1 0 0 0 0
+      // 0 0 0 0 0 0 0 0 1 1 1 1
+      // 0 0 0 0 0 0 0 0 1 1 1 1
+      //
+      // Instead of convolving the above with the input, we instead slice the
+      // kernel into three kernels, each containing islands of 1s from the
+      // filter above. We also slice the activations in the IF dimension with
+      // each slice of size = group_size. For each slice, we perform
+      // convolutions, and concatenate the generated outputs in the output OF
+      // dimension.
+
+      std::vector<HloInstruction*> sliced_convolutions;
+      auto activation = convolution->mutable_operand(0);
+      std::vector<int64> slice_strides(filter->shape().dimensions_size(), 1);
+      std::vector<int64> filter_slice_starts(filter->shape().dimensions_size(),
+                                             0);
+      std::vector<int64> filter_slice_limits(
+          filter->shape().dimensions().begin(),
+          filter->shape().dimensions().end());
+      std::vector<int64> activation_slice_starts(
+          activation->shape().dimensions_size(), 0);
+      std::vector<int64> activation_slice_limits(
+          activation->shape().dimensions().begin(),
+          activation->shape().dimensions().end());
+
+      int64 output_feature =
+          filter->shape().dimensions(kernel_output_feature_dim);
+      auto output_feature_dim = dim_numbers.output_feature_dimension();
+      int64 filter_slice_width = output_feature / group_count;
+
+      int64 activation_input_feature_dim =
+          dim_numbers.input_feature_dimension();
+
+      for (int64 i = 0; i < group_count; i++) {
+        filter_slice_starts[kernel_output_feature_dim] = i * filter_slice_width;
+        filter_slice_limits[kernel_output_feature_dim] =
+            (i + 1) * filter_slice_width;
+        auto filter_sliced_shape = filter->shape();
+        filter_sliced_shape.set_dimensions(kernel_output_feature_dim,
+                                           filter_slice_width);
+        auto filter_slice = add(HloInstruction::CreateSlice(
+            filter_sliced_shape, filter, filter_slice_starts,
+            filter_slice_limits, slice_strides));
+
+        activation_slice_starts[activation_input_feature_dim] = i * group_size;
+        activation_slice_limits[activation_input_feature_dim] =
+            (i + 1) * group_size;
+        auto activation_sliced_shape = activation->shape();
+        activation_sliced_shape.set_dimensions(activation_input_feature_dim,
+                                               group_size);
+        auto activation_slice = add(HloInstruction::CreateSlice(
+            activation_sliced_shape, activation, activation_slice_starts,
+            activation_slice_limits, slice_strides));
+
+        auto conv_slice_shape = convolution->shape();
+        conv_slice_shape.set_dimensions(output_feature_dim, filter_slice_width);
+
+        auto new_convolution = add(HloInstruction::CreateConvolve(
+            conv_slice_shape, activation_slice, filter_slice,
+            /*feature_group_count=*/1, convolution->window(), dim_numbers,
+            convolution->precision_config()));
+
+        sliced_convolutions.push_back(new_convolution);
+      }
+
+      auto new_conv = HloInstruction::CreateConcatenate(
+          convolution->shape(), sliced_convolutions, output_feature_dim);
+      TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+          convolution, std::move(new_conv)));
+    }
   }
-  auto zero = add(HloInstruction::CreateConstant(
-      LiteralUtil::Zero(expanded_filter_shape.element_type())));
-  auto zero_filter =
-      add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {}));
-  auto new_filter = add(
-      HloInstruction::CreateTernary(expanded_filter_shape, HloOpcode::kSelect,
-                                    filter_mask, expanded_filter, zero_filter));
-  auto new_convolution = HloInstruction::CreateConvolve(
-      convolution->shape(), convolution->mutable_operand(0), new_filter,
-      /*feature_group_count=*/1, convolution->window(), dim_numbers,
-      convolution->precision_config());
-  TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
-      convolution, std::move(new_convolution)));
+
   return Status::OK();
 }
 
@@ -237,7 +395,7 @@ StatusOr<bool> ConvolutionFeatureGroupConverter::Run(HloModule* module) {
                         module->ToString());
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
-    if (ConvolutionVisitor::Run(comp)) {
+    if (ConvolutionVisitor::Run(comp, filter_expansion_)) {
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
index ce0138e56fbd51daaf5d3ac329ccbe31a9fdbde7..cb6bc04c00a2ff10f970da2a07fb540a561dad5a 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
@@ -27,7 +27,8 @@ namespace xla {
 // convolutions with feature_group_count = 1.
 class ConvolutionFeatureGroupConverter : public HloModulePass {
  public:
-  ConvolutionFeatureGroupConverter() {}
+  ConvolutionFeatureGroupConverter(bool canonicalize_depthwise_filter = false)
+      : filter_expansion_(canonicalize_depthwise_filter) {}
 
   absl::string_view name() const override {
     return "convolution-feature-group-converter";
@@ -36,6 +37,9 @@ class ConvolutionFeatureGroupConverter : public HloModulePass {
   // Run convolution rewriting on the given computation. Returns whether the
   // computation was changed.
   StatusOr<bool> Run(HloModule* module) override;
+
+  // Tells whether filter expansion is required.
+  bool filter_expansion_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
index 28373ebf636c7b6b3059dcf6cd931901ebc87fc2..e6bf2143a21bd5001d3530fe8727c88504be1d43 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
@@ -82,18 +82,14 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2
   ConvolutionFeatureGroupConverter converter;
   ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
-  // Make sure the convolution is converted to one with feature_group_count = 1.
-  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
-  EXPECT_EQ(root->feature_group_count(), 1);
-  // Verify that the filter operand has been replaced.
-  EXPECT_THAT(root->operand(1),
-              op::Select(op::Eq(op::Broadcast(op::Constant()),
-                                op::Broadcast(op::Constant())),
-                         // We expect to see Concatenate here instead of
-                         // Broadcast, because feature_group_count < input
-                         // feature dimension.
-                         op::Concatenate(op::Parameter(), op::Parameter()),
-                         op::Broadcast(op::Constant())));
+  // Make sure the convolution is replaced with a concatenate.
+  EXPECT_EQ(root->opcode(), HloOpcode::kConcatenate);
+  // And the operands of the concatenate are convolutions, each with a feature
+  // group count = 1.
+  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kConvolution);
+  EXPECT_EQ(root->operand(1)->opcode(), HloOpcode::kConvolution);
+  EXPECT_EQ(root->operand(0)->feature_group_count(), 1);
+  EXPECT_EQ(root->operand(1)->feature_group_count(), 1);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index f35324aa35370b592871749cba9fc2f66bea9219..df6059663876dfde71f4c75d3931b3d2de72c1df 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -40,10 +40,12 @@ namespace {
 
 using absl::StrAppend;
 
-bool IsEntryParameterValue(const HloValue& value) {
+bool IsReadonlyEntryParameterValue(const HloValue& value) {
   const HloComputation* computation = value.defining_instruction()->parent();
   return value.defining_instruction()->opcode() == HloOpcode::kParameter &&
-         computation == computation->parent()->entry_computation();
+         computation == computation->parent()->entry_computation() &&
+         !computation->parent()->input_output_alias_config().ParameterHasAlias(
+             value.defining_instruction()->parameter_number(), value.index());
 }
 
 bool IsConstantValue(const HloValue& value) {
@@ -51,7 +53,7 @@ bool IsConstantValue(const HloValue& value) {
 }
 
 bool ValueIsReadOnly(const HloValue& value) {
-  return IsConstantValue(value) || IsEntryParameterValue(value);
+  return IsConstantValue(value) || IsReadonlyEntryParameterValue(value);
 }
 
 // Data structure describing the action which should be taken on parts of a
@@ -79,8 +81,7 @@ SpecialCaseCopyPolicy GetSpecialCaseCopyPolicy(const CallGraphNode& node,
 bool ShouldCopyRootValue(const HloValue& value,
                          const SpecialCaseCopyPolicy& policy) {
   if (policy.copy_parameters_and_constants) {
-    return IsConstantValue(value) ||
-           value.defining_instruction()->opcode() == HloOpcode::kParameter;
+    return ValueIsReadOnly(value);
   }
   return false;
 }
@@ -332,6 +333,88 @@ Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis,
   return Status::OK();
 }
 
+// Conservatively adds copies before root instruction of entry computation and
+// each aliased parameter to resolve interference of aliased input and output
+// buffer. We later rely on the CopyRemover to drop the unnecessary ones.
+Status AddCopiesForAliasedInputOutputs(HloModule* module) {
+  HloComputation* entry = module->entry_computation();
+  HloInstruction* root = entry->root_instruction();
+
+  ShapeTree<bool> output_indices_to_copy(root->shape());
+  std::vector<absl::optional<ShapeTree<HloInstruction*>>> copied_parameters(
+      entry->num_parameters());
+  bool has_alias = false;
+  for (auto* param : entry->parameter_instructions()) {
+    bool param_has_alias = false;
+    ShapeTree<bool> param_indices_to_copy(param->shape());
+
+    module->input_output_alias_config().ForEachAlias(
+        [&](const ShapeIndex& output_index, int64 param_number,
+            const ShapeIndex& param_index) {
+          if (param_number == param->parameter_number()) {
+            param_has_alias = true;
+            *(param_indices_to_copy.mutable_element(param_index)) = true;
+            *(output_indices_to_copy.mutable_element(output_index)) = true;
+          }
+        });
+
+    if (!param_has_alias) {
+      continue;
+    }
+
+    TF_RET_CHECK(param->parameter_number() < entry->num_parameters());
+    TF_RET_CHECK(!copied_parameters[param->parameter_number()]);
+
+    has_alias = true;
+    // Store a snapshot of users before DeepCopyInstruction, as
+    // DeepCopyInstruction introduces new users of the instruction.
+    std::vector<HloInstruction*> users = param->users();
+    ShapeTree<HloInstruction*> param_copy_tree(param->shape(),
+                                               /*init_value=*/nullptr);
+    TF_ASSIGN_OR_RETURN(HloInstruction * copied,
+                        entry->DeepCopyInstruction(
+                            param, &param_indices_to_copy, &param_copy_tree));
+    for (HloInstruction* user : users) {
+      TF_RETURN_IF_ERROR(param->ReplaceUseWith(user, copied));
+    }
+
+    copied_parameters[param->parameter_number()] = param_copy_tree;
+  }
+
+  if (!has_alias) {
+    return Status::OK();
+  }
+
+  // Add copies before root instruction.
+  ShapeTree<HloInstruction*> output_copy_tree(root->shape(),
+                                              /*init_value=*/nullptr);
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * root_copied,
+                      root->parent()->DeepCopyInstruction(
+                          root, &output_indices_to_copy, &output_copy_tree));
+
+  // Add control dependencies between the input/output copies.
+  TF_RETURN_IF_ERROR(module->input_output_alias_config().ForEachAliasWithStatus(
+      [&](const ShapeIndex& output_index, int64 param_number,
+          const ShapeIndex& input_index) -> Status {
+        if (!copied_parameters[param_number]) {
+          return Status::OK();
+        }
+        HloInstruction* from =
+            copied_parameters[param_number]->element(input_index);
+        HloInstruction* to = output_copy_tree.element(output_index);
+
+        TF_RET_CHECK(from != nullptr);
+        TF_RET_CHECK(to != nullptr);
+        TF_RETURN_IF_ERROR(from->AddControlDependencyTo(to));
+        return Status::OK();
+      }));
+
+  entry->set_root_instruction(root_copied);
+
+  return Status::OK();
+}
+
 // Removes any control dependencies to or from the given instruction.
 Status StripControlDependenciesFrom(HloInstruction* instruction) {
   while (!instruction->control_successors().empty()) {
@@ -359,7 +442,6 @@ class CopyRemover {
               const HloOrdering& ordering, HloModule* module)
       : module_(module),
         alias_analysis_(alias_analysis),
-        ordering_(ordering),
         buffer_value_tracker_(*module, alias_analysis, ordering) {}
 
   // Try to elide the given copy. The copy is elided if the instruction is not
@@ -920,7 +1002,6 @@ class CopyRemover {
 
   HloModule* module_;
   const HloAliasAnalysis& alias_analysis_;
-  const HloOrdering& ordering_;
 
   // Object tracking the HLO values contained in each HLO buffer.
   BufferValueTracker buffer_value_tracker_;
@@ -953,6 +1034,8 @@ Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) {
       }
     }
   }
+
+  TF_RETURN_IF_ERROR(AddCopiesForAliasedInputOutputs(module));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h
index c097089e30d59936a32f69c49123c398f1611ea3..8866b5050bf1e7419dda6496ea95d034178d25d8 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/copy_insertion.h
@@ -94,10 +94,12 @@ class CopyInsertion : public HloModulePass {
   Status VerifyNoLiveRangeInterference(const HloOrdering& ordering,
                                        HloModule* module);
 
- private:
+ protected:
   // Override which requires the caller to pass in a call graph.
-  Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module);
+  virtual Status AddSpecialCaseCopies(const CallGraph& call_graph,
+                                      HloModule* module);
 
+ private:
   Status AddCopiesToResolveInterference(HloModule* module);
 
   // Backend specific function that decides whether a fusion can share buffer
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 892d0d7b547aaf1e7f1c55e4163d1e1fd9518def..e4e9d7ba05c115be9dd0eb53ebd7de208d514efb 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <set>
 
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -94,7 +94,7 @@ TEST_F(CopyInsertionTest, SingleParameter) {
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(tuple));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   InsertCopies(module.get());
@@ -114,7 +114,7 @@ TEST_F(CopyInsertionTest, SingleConstant) {
 
   EXPECT_THAT(constant->users(), UnorderedElementsAre(tuple));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   InsertCopies(module.get());
@@ -127,7 +127,7 @@ TEST_F(CopyInsertionTest, SingleConstant) {
 TEST_F(CopyInsertionTest, ExistingCopiesNotRemoved) {
   // Verify that kCopy instructions which change layout and exist before
   // copy-insertion remain in the graph after copy-insertion.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* constant =
@@ -181,7 +181,7 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
 
   builder.AddInstruction(HloInstruction::CreateTuple({constant2, x, add}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   InsertCopies(module.get());
@@ -217,7 +217,7 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
   EXPECT_THAT(constant2->users(), UnorderedElementsAre(tuple1, tuple2));
   EXPECT_THAT(constant3->users(), UnorderedElementsAre(tuple2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
@@ -238,7 +238,7 @@ TEST_F(CopyInsertionTest, BitcastParameter) {
   HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
@@ -261,7 +261,7 @@ TEST_F(CopyInsertionTest, BitcastConstant) {
   HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(constant->users(), UnorderedElementsAre(bitcast));
@@ -283,7 +283,7 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) {
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x));
   builder.AddInstruction(HloInstruction::CreateTuple({bitcast}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
@@ -310,7 +310,7 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) {
            ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(HloOpcode::kParameter,
@@ -351,7 +351,7 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
   auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       ShapeUtil::GetSubshape(param->shape(), {0}), param, 0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gte, module->entry_computation()->root_instruction());
@@ -388,7 +388,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
       builder.AddInstruction(HloInstruction::CreateGetTupleElement(
           ShapeUtil::GetSubshape(select->shape(), {0}), select, 0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gte, module->entry_computation()->root_instruction());
@@ -403,7 +403,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
 
 class WhileCopyInsertionTest : public CopyInsertionTest {
  protected:
-  WhileCopyInsertionTest() : module_(CreateNewModule()) {}
+  WhileCopyInsertionTest() : module_(CreateNewUnverifiedModule()) {}
 
   // Builds a While condition computation which reads the induction variable
   // from the tuple parameter, and returns a predicate indicating whether this
@@ -1295,7 +1295,7 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
 TEST_F(CopyInsertionTest, SwizzlingWhile) {
   // Test a while instruction with a body which permutes its tuple parameter
   // elements.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape loop_state_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1351,13 +1351,225 @@ TEST_F(CopyInsertionTest, SwizzlingWhile) {
   EXPECT_THAT(xla_while->operand(0), op::Tuple(op::Copy(), op::Copy()));
 }
 
+TEST_F(CopyInsertionTest, CrossingParameters) {
+  // Test a case where two parameters' dataflow cross with each other while
+  // input and output are aliased with same index:
+  //
+  //  (p0 ,  p1)
+  //   | \   /|
+  //   |  \ / |
+  // alias X  alias
+  //   |  / \ |
+  //   | /   \|
+  //  (p1  ,  p0)
+  auto module = CreateNewVerifiedModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+  builder.AddInstruction(HloInstruction::CreateTuple({gte1, gte0}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 4);
+}
+
+TEST_F(CopyInsertionTest, ParametersAliasing) {
+  // Test a case where two parameters' dataflow don't interfere with each other
+  // while aliased.
+  //
+  //  (p0 ,  p1)
+  //   |      |
+  //   |      |
+  // alias   alias
+  //   |      |
+  //   |      |
+  //  (p0 ,  p1)
+  auto module = CreateNewVerifiedModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+  builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 0);
+}
+
+TEST_F(CopyInsertionTest, ParameterWithNoAliasing) {
+  // Test a case where no parameter is aliased with result. In this case, copy
+  // should be added
+  //
+  //  (p0 ,  p1)
+  //   |      |
+  //   |      |
+  //   |      |
+  //   |      |
+  //   |      |
+  //  (p0 ,  p1)
+  auto module = CreateNewVerifiedModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+  builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
+  module->AddEntryComputation(builder.Build());
+  InsertCopies(module.get());
+
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(op::GetTupleElement(param, 0)),
+                        op::Copy(op::GetTupleElement(param, 1))));
+
+  EXPECT_EQ(CountCopies(*module), 2);
+}
+
+TEST_F(CopyInsertionTest, ParameterWithPartialAliasing) {
+  // Test a case where one parameter is aliased with result while another one
+  // isn't.
+  //
+  //  (p0 ,  p1)
+  //   |      |
+  //   |      |
+  // alias    |
+  //   |      |
+  //   |      |
+  //  (p0 ,  p1)
+  auto module = CreateNewVerifiedModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+  builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  InsertCopies(module.get());
+
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::GetTupleElement(param, 0),
+                        op::Copy(op::GetTupleElement(param, 1))));
+
+  EXPECT_EQ(CountCopies(*module), 1);
+}
+
+TEST_F(CopyInsertionTest, ParameterAndParallelOpsWithPartialAliasing) {
+  // Test a case where one parameter is aliased with result while another one
+  // isn't.
+  //
+  //   +-- (p0 ,  p1)
+  //   |    |      |
+  //   |    |      |
+  // alias Negate  Negate
+  //   |    |      |
+  //   |    |      |
+  //   +-- (p0 ,  p1)
+  auto module = CreateNewVerifiedModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+
+  auto negate0 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte0));
+
+  auto negate1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte1));
+  builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 0);
+}
+
+TEST_F(CopyInsertionTest, ParameterAndOpsWithPartialAliasing) {
+  // Test a case where one parameter is aliased with result while another one
+  // isn't.
+  //
+  //   +-- (p0 ,  p1)
+  //   |    |      |
+  //   |    |      |
+  // alias Negate  Negate
+  //   |    |      |
+  //   |    Add----+
+  //   |    |      |
+  //   +-- (p0 ,  p1)
+  auto module = CreateNewVerifiedModule();
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+
+  auto negate0 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte0));
+
+  auto negate1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte1));
+
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, negate0, negate1));
+  builder.AddInstruction(HloInstruction::CreateTuple({add, negate1}));
+  module->AddEntryComputation(builder.Build());
+  ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 0);
+}
+
 TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) {
   // Test a while instruction with a body which permutes its tuple parameter
   // elements and applies one operation to one of the elements. The addition of
   // the operation (instruction) on the element makes the live range of the
   // respective input and output elements different than if the instruction were
   // not there (as in the SwizzlingWhile test above).
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape loop_state_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1420,7 +1632,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileSharedInput) {
   // the while body is a single constant (both loop state elements are the same
   // constant). This means no copies are necessary because both loop state
   // elements are the same so interchanging them is a no-op.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape loop_state_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1481,7 +1693,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) {
   const Shape loop_state_shape = ShapeUtil::MakeTupleShape(
       {element_shape, element_shape, element_shape, element_shape});
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, element_shape, "param_0"));
@@ -1571,7 +1783,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) {
 TEST_F(CopyInsertionTest, WhileBodyWithConstantRoot) {
   // Test a while body and condition which are each simply a constant (root of
   // computation is a constant). The body constant should be copied.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
@@ -1684,7 +1896,7 @@ void BM_SequentialWhiles(int num_iters, int num_whiles) {
   tensorflow::testing::StopTiming();
   for (int i = 0; i < num_iters; ++i) {
     HloModuleConfig config;
-    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
 
     auto builder = HloComputation::Builder("BM_SequentialWhiles");
@@ -1724,7 +1936,7 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) {
   tensorflow::testing::StopTiming();
   for (int i = 0; i < num_iters; ++i) {
     HloModuleConfig config;
-    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    config.set_debug_options(GetDebugOptionsFromFlags());
     HloModule module("BM_SequentialWhiles", config);
 
     auto builder = HloComputation::Builder("BM_ParallelWhiles");
@@ -1791,7 +2003,7 @@ std::unique_ptr<HloComputation> MakeBenchmarkWhileBody(
 void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
   tensorflow::testing::StopTiming();
   HloModuleConfig config;
-  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  config.set_debug_options(GetDebugOptionsFromFlags());
   CopyInsertion copy_insertion;
   const Shape element_shape = ShapeUtil::MakeShape(F32, {});
   std::vector<HloInstruction*> tuple_params(num_tuple_inputs);
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 58abb330a6e31e9b7a8081cd7964cf89a5b64a09..ce4c2a9cc69240b9565b35a3f2504d7fc9373917 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -51,6 +51,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -95,6 +96,7 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
         "//tensorflow/compiler/xla/service:map_inliner",
+        "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:scatter_expander",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
@@ -823,7 +825,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -845,7 +846,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -886,7 +886,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
@@ -960,17 +959,16 @@ tf_cc_test(
     srcs = ["cpu_copy_insertion_test.cc"],
     deps = [
         ":cpu_copy_insertion",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -996,7 +994,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 73b03440cbb936017257b8a92f16dcc25d41e21c..796a7cf94d02b0ad42366387a9d3f8d589b8840a 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -61,19 +61,6 @@ Disabling these as a starting point.
 // TODO(b/64227304) Creating a custom pass pipeline will replace this.
 
 namespace {
-class FilteredFunctionPassManager : public llvm::legacy::FunctionPassManager {
- public:
-  FilteredFunctionPassManager(llvm::Module* m, bool disable_expensive_passes)
-      : llvm::legacy::FunctionPassManager(m),
-        disable_expensive_passes_(disable_expensive_passes) {}
-  void add(llvm::Pass* p) override {
-    llvm::legacy::FunctionPassManager::add(p);
-  }
-
- private:
-  bool disable_expensive_passes_;
-};
-
 class FilteredPassManager : public llvm::legacy::PassManager {
  public:
   explicit FilteredPassManager(bool disable_expensive_passes)
@@ -96,8 +83,7 @@ class FilteredPassManager : public llvm::legacy::PassManager {
 std::unique_ptr<llvm::MemoryBuffer> CompilerFunctor::operator()(
     llvm::Module& module) const {
   FilteredPassManager module_passes(disable_expensive_passes_);
-  FilteredFunctionPassManager function_passes(&module,
-                                              disable_expensive_passes_);
+  llvm::legacy::FunctionPassManager function_passes(&module);
 
   VLOG(2) << "IR before optimizations";
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(module));
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
index 2083f440fdd971db1b675d005664d25e6de53dbe..c58175428fea6a2d38253c35de598b99a4281bf1 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 
 #include "tensorflow/compiler/xla/test_helpers.h"
@@ -32,7 +32,7 @@ namespace cpu {
 
 using ::testing::ElementsAre;
 
-class ConvCanonicalizationTest : public HloVerifiedTestBase {
+class ConvCanonicalizationTest : public HloTestBase {
  public:
   ConvCanonicalizationTest() {
     for (int i = 0; i < 2; ++i) {
@@ -87,7 +87,7 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
       input, kernel, /*feature_group_count=*/1, conv_window_, dnums,
       DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build());
 
@@ -96,7 +96,7 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
         return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
       });
   ConvCanonicalization conv_canonicalization(&target_machine_features);
-  EXPECT_TRUE(conv_canonicalization.Run(module).ValueOrDie());
+  EXPECT_TRUE(conv_canonicalization.Run(module.get()).ValueOrDie());
 
   const HloInstruction* output_reshape = entry_computation->root_instruction();
   EXPECT_EQ(HloOpcode::kTranspose, output_reshape->opcode());
@@ -150,7 +150,7 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
       input, kernel, /*feature_group_count=*/1, conv_window_, dnums,
       DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features(
@@ -158,7 +158,7 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
         return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
       });
   ConvCanonicalization conv_canonicalization(&target_machine_features);
-  EXPECT_FALSE(conv_canonicalization.Run(module).ValueOrDie());
+  EXPECT_FALSE(conv_canonicalization.Run(module.get()).ValueOrDie());
 }
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 5834f672851f5379c56f6479fd463464c6f3791c..6374822c81bf42fd12829f57cf93c19457128219 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -76,6 +76,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -268,10 +269,11 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true);
-    pass.AddPass<AlgebraicSimplifier>(
-        /*is_layout_sensitive=*/false,
-        [](const Shape&, const Shape&) { return false; },
-        /*enable_dot_strength_reduction=*/false);
+    pipeline.AddPass<HloGetDimensionSizeRewriter>();
+    AlgebraicSimplifierOptions options(
+        [](const Shape&, const Shape&) { return false; });
+    options.set_enable_dot_strength_reduction(false);
+    pass.AddPass<AlgebraicSimplifier>(options);
     pass.AddPass<HloDCE>();
 
     // BatchNormExpander can create zero-sized ops, so zero-sized HLO
@@ -327,12 +329,18 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
   {
     auto& pass = pipeline.AddPass<HloPassFix<HloPassPipeline>>(
         "simplification after layout assignement");
-    pass.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
-                                          /*allow_mixed_precision=*/false);
-    pass.AddPass<HloPassFix<AlgebraicSimplifier>>(
-        /*is_layout_sensitive=*/true,
-        [](const Shape&, const Shape&) { return true; },
-        /*enable_dot_strength_reduction=*/false);
+    // TODO(b/117156505): When the bug is fixed, the CPU backend should not
+    // produce layout changing elementwise operations. We will then pass
+    // LayoutAssignment::InstructionCanChangeLayout to the HLO verifier to
+    // enable stricter verification.
+    pass.AddInvariantChecker<HloVerifier>(
+        /*layout_sensitive=*/true,
+        /*allow_mixed_precision=*/false);
+    AlgebraicSimplifierOptions options(
+        [](const Shape&, const Shape&) { return true; });
+    options.set_is_layout_sensitive(true);
+    options.set_enable_dot_strength_reduction(false);
+    pass.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
     pass.AddPass<HloDCE>();
     pass.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
   }
@@ -497,8 +505,8 @@ Status CreateHloProfilingArtifacts(
 
   HloCostAnalysis cost_analysis(shape_size_bytes);
   TF_RETURN_IF_ERROR(entry_computation.Accept(&cost_analysis));
-  *hlo_profile_printer_data =
-      CreateHloProfilePrinterData(**hlo_profile_index_map, cost_analysis);
+  *hlo_profile_printer_data = CreateHloProfilePrinterData(
+      **hlo_profile_index_map, cost_analysis, entry_computation.name());
   *computation_to_profile_idx =
       (*hlo_profile_index_map)->computation_to_profile_idx();
 
@@ -582,9 +590,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // Select an order for emitting the HLO instructions for each
   // computation. Using this sequence enables tighter buffer liveness analysis
   // and reduced memory usage (as compared to using DependencyHloOrdering).
-  TF_ASSIGN_OR_RETURN(
-      HloSchedule schedule,
-      ScheduleModule(*module, BufferSizeBytesFunction(), DFSMemoryScheduler));
+  TF_ASSIGN_OR_RETURN(HloSchedule schedule,
+                      ScheduleModule(module.get(), BufferSizeBytesFunction(),
+                                     DFSMemoryScheduler));
 
   // Run buffer allocation on the HLO graph.
   TF_ASSIGN_OR_RETURN(
@@ -671,9 +679,12 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
 }
 
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
+CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                                 const AotCompilationOptions& aot_options) {
-  TF_RET_CHECK(!modules.empty());
+  TF_RET_CHECK(!module_group->empty());
+  std::vector<std::unique_ptr<HloModule>> modules =
+      module_group->ConsumeModules();
+
   std::call_once(llvm_command_line_options_initialized,
                  &llvm_ir::InitializeLLVMCommandLineOptions,
                  modules[0]->config());
@@ -771,7 +782,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     XLA_VLOG_LINES(2, module->ToString());
 
     TF_ASSIGN_OR_RETURN(HloSchedule schedule,
-                        ScheduleModule(*module, BufferSizeBytesFunction()));
+                        ScheduleModule(module, BufferSizeBytesFunction()));
 
     // Run buffer analysis on the HLO graph. This analysis figures out which
     // temporary buffers are required to run the computation.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index f2af923782df268e3e6da3895ec35579ab6aa51f..c67307548dda731f8fa56b8e6790e7e83f587113 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -142,7 +142,7 @@ class CpuCompiler : public LLVMCompiler {
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
+  CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                      const AotCompilationOptions& options) override;
 
   se::Platform::Id PlatformId() const override;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
index c9fb34be1cd582c71618c770c892058c233c571a..c085f85fb73e98e4c7ba15af8db8bb19c2499f5f 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
 
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -52,7 +52,7 @@ int64 CountCopies(const HloModule& module) {
   return count;
 }
 
-class CpuCopyInsertionTest : public HloVerifiedTestBase {
+class CpuCopyInsertionTest : public HloTestBase {
  protected:
   void InsertCopies(HloModule* module) {
     CpuCopyInsertion copy_insertion;
@@ -65,7 +65,7 @@ class CpuCopyInsertionTest : public HloVerifiedTestBase {
 TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) {
   // Test a while body and condition which are each simply a constant (root of
   // computation is a constant). Each constant should be copied.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
@@ -90,7 +90,7 @@ TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) {
 
   module->AddEntryComputation(builder.Build());
 
-  InsertCopies(module);
+  InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 3);
 
@@ -103,7 +103,7 @@ TEST_F(CpuCopyInsertionTest, TupleCall) {
   // Test a kCall instruction which calls a computation which produces a three
   // element tuple: one is a constant, one is a parameter, and one is produced
   // in the computation. The constant and parameter should be copied.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param = builder.AddInstruction(
       HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
@@ -127,7 +127,7 @@ TEST_F(CpuCopyInsertionTest, TupleCall) {
 
   module->AddEntryComputation(builder.Build());
 
-  InsertCopies(module);
+  InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*subcomputation), 2);
   EXPECT_THAT(subcomputation->root_instruction(),
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 29abf38e439d919ff93629ed992cb3ff93a929bd..818b2b0d0db2893e11fa46c7867e6c74bbbb6905 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -51,8 +51,7 @@ namespace cpu {
 CpuExecutable::CpuExecutable(
     std::unique_ptr<SimpleOrcJIT> jit,
     std::unique_ptr<const BufferAssignment> assignment,
-    std::unique_ptr<const HloModule> hlo_module,
-    const string& entry_function_name,
+    std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
     : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 3c3c047bfe8ee0d1ad90ede2432a86264f47870b..3b91b15ba9b5603b50f78f489e9a3fdad354c083 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -49,7 +49,7 @@ class CpuExecutable : public Executable {
  public:
   CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit,
                 std::unique_ptr<const BufferAssignment> assignment,
-                std::unique_ptr<const HloModule> hlo_module,
+                std::unique_ptr<HloModule> hlo_module,
                 const string& entry_function_name,
                 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc
index be1208fb2df2a1a11a093810b5f6c2a83f468062..9cbfb88834bf51f4df54e97efe6cd7bf88b12334 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
@@ -25,7 +25,7 @@ namespace {
 
 using ::testing::HasSubstr;
 
-class CpuHloSupportCheckerTest : public HloVerifiedTestBase {
+class CpuHloSupportCheckerTest : public HloTestBase {
  protected:
   CpuHloSupportChecker& checker() { return checker_; }
 
@@ -42,10 +42,10 @@ TEST_F(CpuHloSupportCheckerTest, Add) {
       HloInstruction::CreateParameter(1, scalar_shape, "param1"));
   builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param0, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  TF_ASSERT_OK(checker().Run(module).status());
+  TF_ASSERT_OK(checker().Run(module.get()).status());
 }
 
 TEST_F(CpuHloSupportCheckerTest, SparseUnimplemented) {
@@ -57,10 +57,13 @@ TEST_F(CpuHloSupportCheckerTest, SparseUnimplemented) {
       HloInstruction::CreateParameter(1, sparse_shape, "param1"));
   builder.AddInstruction(HloInstruction::CreateBinary(
       sparse_shape, HloOpcode::kAdd, param0, param1));
-  auto module = CreateNewModule();
+  // Since verifier is reporting sparse layouts as errors, we should
+  // use a regular HloModule instead of VerifiedHloModule to avoid
+  // verifier errors being triggered in the destructor.
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  Status status = checker().Run(module).status();
+  Status status = checker().Run(module.get()).status();
   ASSERT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
   EXPECT_THAT(status.error_message(),
               HasSubstr("CPU backend does not support"));
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
index f9cd61bea3dc86cadff99d4a90eca44c16520823..6f79ad7c1468f27c74d84770ec6358fbcd1c1f09 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
@@ -48,10 +48,15 @@ bool IsMatrixVectorDot(const HloInstruction* hlo) {
          (hlo_shape.dimensions(0) == 1 || hlo_shape.dimensions(1) == 1);
 }
 
+bool HasExactlyOneUse(const HloInstruction& hlo_instr) {
+  return hlo_instr.user_count() == 1 &&
+         absl::c_count(hlo_instr.users().front()->operands(), &hlo_instr) == 1;
+}
+
 bool CanBeOutputFused(const HloInstruction* producer,
                       const HloInstruction* consumer) {
   return consumer->opcode() == HloOpcode::kAdd && IsMatrixVectorDot(producer) &&
-         producer->user_count() == 1;
+         HasExactlyOneUse(*producer) == 1;
 }
 
 bool CanBeOutputFusedIntoSomeOperand(const HloInstruction* consumer) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index 7d99b914d4f5e5d27722bcd098d2ae0c54a36a23..527df0bd1c23bba74f32226e5622fed32f7dcf84 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -58,7 +58,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_0) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), exp0, arg1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -77,7 +77,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_1) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -98,7 +98,7 @@ TEST_F(InstructionFusionTest, DotOperationNoFusion_Bitcast) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), bitcast0, arg1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -119,7 +119,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Reshape) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), reshape0, arg1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_TRUE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -138,7 +138,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_TooLarge) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {1, 32 * 1024}), arg0, exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -157,7 +157,7 @@ TEST_F(InstructionFusionTest, DotOperationFusion_ElementReuse) {
   HloInstruction* dot = builder.AddInstruction(
       MakeDot(ShapeUtil::MakeShape(F32, {2, 1024}), arg0, exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(dot, computation->root_instruction());
   EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie());
@@ -321,7 +321,7 @@ TEST_F(OpcodeFusionTest, Exponential_Reshape_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -350,7 +350,7 @@ TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       dynamic_slice_shape, HloOpcode::kTanh, dynamic_slice4));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -370,7 +370,7 @@ TEST_F(OpcodeFusionTest, Broadcast_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, broadcast1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -392,7 +392,7 @@ TEST_F(OpcodeFusionTest, DynamicSlice_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, dynamic_slice2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -410,7 +410,7 @@ TEST_F(OpcodeFusionTest, Exponential_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -429,7 +429,7 @@ TEST_F(OpcodeFusionTest, Reshape_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -447,7 +447,7 @@ TEST_F(OpcodeFusionTest, Reverse_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, reverse1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -466,7 +466,7 @@ TEST_F(OpcodeFusionTest, Slice_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {2}), HloOpcode::kNegate, slice1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -489,7 +489,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, transpose2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -498,7 +498,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) {
 }
 
 TEST_F(OpcodeFusionTest, UnaryMapOfExp) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {3, 4});
@@ -517,7 +517,7 @@ TEST_F(OpcodeFusionTest, UnaryMapOfExp) {
 }
 
 TEST_F(OpcodeFusionTest, BinaryMapOfExps) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {3, 4});
@@ -542,7 +542,7 @@ TEST_F(OpcodeFusionTest, BinaryMapOfExps) {
 }
 
 TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder builder(TestName());
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
@@ -573,7 +573,7 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) {
 }
 
 TEST_F(OpcodeFusionTest, MessOfFusibleNodes) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   Shape full_shape = ShapeUtil::MakeShape(F32, {4, 100, 10, 100, 50});
@@ -641,7 +641,7 @@ TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastUnary) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(large_shape, HloOpcode::kExp, small_exp));
 
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto did_fusion = CpuInstructionFusion().Run(module.get());
@@ -670,7 +670,7 @@ TEST_F(OpcodeFusionTest, ReuseViaImplicitBroadcastBinary) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       large_shape, HloOpcode::kAdd, small_exp, large_param));
 
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto did_fusion = CpuInstructionFusion().Run(module.get());
@@ -712,7 +712,7 @@ void CreateComputationForDotAddOutputFusionTest(const string& test_name,
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/1,
                                              /*k=*/50, /*n=*/19,
                                              /*add_extra_use_for_dot=*/false);
@@ -725,7 +725,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) {
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
                                              /*k=*/50, /*n=*/1,
                                              /*add_extra_use_for_dot=*/false);
@@ -738,7 +738,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) {
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
                                              /*k=*/50, /*n=*/19,
                                              /*add_extra_use_for_dot=*/false);
@@ -751,7 +751,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) {
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
                                              /*k=*/50, /*n=*/1,
                                              /*add_extra_use_for_dot=*/true);
@@ -763,6 +763,28 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) {
               Not(op::Fusion()));
 }
 
+TEST_F(InstructionFusionTest,
+       DotOperationFusion_DontOutputFuseDuplicateOperands) {
+  absl::string_view module_string = R"(
+HloModule module
+
+ENTRY main {
+  a = f32[50,60]{1,0} parameter(0)
+  b = f32[60,1]{1,0} parameter(1)
+  c = f32[50,1]{1,0} dot(a, b), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT d = f32[50,1]{1,0} add(c, c)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool fused_something,
+                          CpuInstructionFusion().Run(module.get()));
+  EXPECT_FALSE(fused_something);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              Not(op::Fusion()));
+}
+
 struct GatherLoopFusionTestSpec {
   string test_name;
   string hlo_computation_text;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
index 97659b88a7974d7caf91ab0d4741f3635e4dae4a..6c61b64758ede160e2d50e4429590a789ec253c3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
@@ -73,7 +73,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensor) {
   auto result = builder.AddInstruction(
       CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -114,7 +114,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor0) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       result_shape, HloOpcode::kAdd, dot_a_result, dot_b_result));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -158,7 +158,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor1) {
   auto tuple_result = builder.AddInstruction(
       HloInstruction::CreateTuple({dot_a_result, dot_b_result}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -192,7 +192,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantLhsTensor) {
   auto dot_result = builder.AddInstruction(
       CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -232,7 +232,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensorThroughGTE) {
   auto dot_result = builder.AddInstruction(
       CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -353,7 +353,7 @@ static void AssertCorrectLayoutForDotOutputFusion(
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
@@ -365,7 +365,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
@@ -377,7 +377,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
@@ -389,7 +389,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
@@ -401,7 +401,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
@@ -413,7 +413,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_1) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index b8ace5702688096822573c7afae234cbcbe77b28..92debb83e33b1400a59e5eef0f90971392ab7b22 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -22,7 +22,6 @@ limitations under the License.
 namespace {
 
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
-const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
 const char* const kXlaEnableExperimentalLlvmIrGemm =
     "xla_enable_experimental_llvm_ir_gemm";
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index 1cc2844470376ceb61601f6d1361def84eac5b45..1457582ac19c27e5c3150b4667e6af505345a6bd 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -29,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
@@ -183,7 +183,7 @@ Status CpuTransferManager::TransferLiteralFromOutfeed(
     // Note: OSS build didn't like implicit conversion from
     // literal_shape.dimensions() to the array slice on 2017-07-10.
     absl::Span<const int64> dimensions(
-        tensorflow::bit_cast<const int64*>(literal_shape.dimensions().data()),
+        absl::bit_cast<const int64*>(literal_shape.dimensions().data()),
         literal_shape.dimensions().size());
     TF_ASSIGN_OR_RETURN(
         Shape received_shape,
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 99fa707c959854e50c6d954fe92b87e93e267dc6..97f9b85a606e140fd7f3b1e3ecfb0dd5ba289f03 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -1546,10 +1546,8 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
       LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0};
 }
 
-// Return whether the given shape is a matrix with no padding.
-static bool IsRank2WithNoPadding(const Shape& shape) {
-  return ShapeUtil::Rank(shape) == 2 && !LayoutUtil::IsPadded(shape);
-}
+// Return whether the given shape is rank 2.
+static bool IsRank2(const Shape& shape) { return ShapeUtil::Rank(shape) == 2; }
 
 // In a gemm operation where output = lhs * rhs, check whether the given shapes
 // are valid for the operation.
@@ -1565,8 +1563,7 @@ static bool AreValidGemmShapes(
     return false;
   }
 
-  if (!(IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) &&
-        IsRank2WithNoPadding(output_shape))) {
+  if (!(IsRank2(lhs_shape) && IsRank2(rhs_shape) && IsRank2(output_shape))) {
     return false;
   }
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index a70abb117acd2917e7273921e1919b0e03b6cd63..4032c2da2f33ee61da8771ae6225a14172cbe6e8 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -54,6 +54,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
@@ -110,7 +111,7 @@ IrEmitter::IrEmitter(
 StatusOr<llvm::Function*> IrEmitter::EmitComputation(
     HloComputation* computation, const string& function_name_prefix,
     bool is_top_level_computation,
-    const std::vector<const HloInstruction*>* instruction_order) {
+    const std::vector<HloInstruction*>* instruction_order) {
   string function_name = name_uniquer_.GetUniqueName(function_name_prefix);
   VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix
           << "]; ordered? " << (instruction_order != nullptr);
@@ -139,7 +140,7 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
   // readcyclecounter if it is unavailable.
   bool use_rdtscp = arch_type_ == llvm::Triple::ArchType::x86 ||
                     arch_type_ == llvm::Triple::ArchType::x86_64;
-  profiling_state_ = ProfilingState(use_rdtscp, GetProfileCountersArgument());
+  profiling_state_ = ProfilingState(use_rdtscp);
   if (instruction_order == nullptr) {
     TF_RETURN_IF_ERROR(computation->Accept(this));
   } else {
@@ -493,53 +494,44 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
   return Status::OK();
 }
 
-Status IrEmitter::HandleSort(HloInstruction* sort) {
+Status IrEmitter::HandleSort(HloInstruction* hlo) {
+  const HloSortInstruction* sort = Cast<HloSortInstruction>(hlo);
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(sort));
-  auto keys = sort->operand(0);
-  auto values = sort->operand_count() > 1 ? sort->operand(1) : nullptr;
-  ShapeIndex keys_shape_index({});
-  ShapeIndex values_shape_index({});
-  if (values != nullptr) {
-    keys_shape_index = ShapeIndex({0});
-    values_shape_index = ShapeIndex({1});
-  }
-  auto keys_destination = GetAllocationSlice(*sort, keys_shape_index);
-  auto keys_destination_address =
-      EmitBufferPointer(keys_destination, keys->shape());
-  auto values_destination = GetAllocationSlice(*sort, values_shape_index);
-  llvm::Value* values_destination_address = nullptr;
-
-  // The sort is implemented in-place, therefore we first copy the operand
-  // buffer to the output buffer if they are not the same.
-  if (keys_destination != GetAllocationSlice(*keys)) {
-    int64 primitive_type_size =
-        ShapeUtil::ByteSizeOfPrimitiveType(keys->shape().element_type());
-    auto source_buffer = GetEmittedValueFor(keys);
-    int64 keys_size = ByteSizeOf(keys->shape());
-    MemCpy(keys_destination_address, /*DstAlign=*/primitive_type_size,
-           source_buffer,
-           /*SrcAlign=*/primitive_type_size, keys_size);
-  }
-  if (values != nullptr) {
-    values_destination_address =
-        EmitBufferPointer(values_destination, values->shape());
-    if (values_destination != GetAllocationSlice(*values)) {
+  Shape keys_shape = sort->keys()->shape();
+  std::vector<llvm::Value*> destination_addresses(sort->operand_count());
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    ShapeIndex shape_index =
+        sort->values_count() > 0 ? ShapeIndex({i}) : ShapeIndex({});
+    const HloInstruction* operand = sort->operand(i);
+    // We assume that the layout of all involved operands and outputs is the
+    // same.
+    TF_RET_CHECK(
+        LayoutUtil::LayoutsInShapesEqual(keys_shape, operand->shape()));
+    TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(
+        keys_shape, ShapeUtil::GetSubshape(sort->shape(), shape_index)));
+
+    // The sort is implemented in-place, therefore we first copy the operand
+    // buffer to the output buffer if they are not the same.
+    auto destination_buffer = GetAllocationSlice(*sort, shape_index);
+    destination_addresses[i] =
+        EmitBufferPointer(destination_buffer, operand->shape());
+    auto source_address = GetAllocationSlice(*operand);
+    if (destination_buffer != source_address) {
       int64 primitive_type_size =
-          ShapeUtil::ByteSizeOfPrimitiveType(values->shape().element_type());
-      auto source_buffer = GetEmittedValueFor(values);
-      int64 values_size = ByteSizeOf(values->shape());
-      MemCpy(values_destination_address, /*DstAlign=*/primitive_type_size,
+          ShapeUtil::ByteSizeOfPrimitiveType(operand->shape().element_type());
+      auto source_buffer = GetEmittedValueFor(operand);
+      int64 size = ByteSizeOf(operand->shape());
+      MemCpy(destination_addresses[i], /*DstAlign=*/primitive_type_size,
              source_buffer,
-             /*SrcAlign=*/primitive_type_size, values_size);
+             /*SrcAlign=*/primitive_type_size, size);
     }
   }
 
   // Normalize the shape and the dimension to sort.
   Shape normalized_keys_shape =
-      ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
-          keys->shape());
+      ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(keys_shape);
   int64 physical_dimension_to_sort = LayoutUtil::MakeLogicalToPhysical(
-      keys->shape().layout())[sort->dimensions(0)];
+      keys_shape.layout())[sort->sort_dimension()];
 
   int64 sort_dimension_elements =
       normalized_keys_shape.dimensions(physical_dimension_to_sort);
@@ -553,7 +545,7 @@ Status IrEmitter::HandleSort(HloInstruction* sort) {
     lower_dimensions *= normalized_keys_shape.dimensions(i);
   }
 
-  PrimitiveType keys_type = keys->shape().element_type();
+  PrimitiveType keys_type = keys_shape.element_type();
   const char* fn_name = nullptr;
   llvm::Type* keys_native_type = nullptr;
   switch (keys_type) {
@@ -614,28 +606,48 @@ Status IrEmitter::HandleSort(HloInstruction* sort) {
   llvm::FunctionType* key_value_sort_type = llvm::FunctionType::get(
       b_.getVoidTy(),
       {keys_native_type, b_.getInt64Ty(), b_.getInt64Ty(), b_.getInt64Ty(),
-       b_.getInt8PtrTy(), b_.getInt32Ty()},
+       b_.getInt8PtrTy()->getPointerTo(), b_.getInt32Ty(),
+       b_.getInt32Ty()->getPointerTo()},
       /*isVarArg=*/false);
   auto* key_value_sort_func = llvm::cast<llvm::Function>(
       module_->getOrInsertFunction(fn_name, key_value_sort_type));
   key_value_sort_func->setCallingConv(llvm::CallingConv::C);
   key_value_sort_func->setDoesNotThrow();
-  key_value_sort_func->setOnlyAccessesArgMemory();
+  llvm::Value* values;
+  llvm::Value* sizes;
+  if (sort->values_count() == 0) {
+    values = llvm::Constant::getNullValue(b_.getInt8PtrTy()->getPointerTo());
+    sizes = llvm::Constant::getNullValue(b_.getInt32Ty()->getPointerTo());
+  } else {
+    values = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+        b_.getInt8PtrTy(), b_.getInt32(sort->values_count()),
+        "cc_values_alloca", &b_);
+    sizes = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+        b_.getInt32Ty(), b_.getInt32(sort->values_count()), "cc_sizes_alloca",
+        &b_);
+    for (int64 i = 0; i < sort->values_count(); ++i) {
+      llvm::Value* value_as_i8ptr =
+          PointerCast(destination_addresses[i + 1], b_.getInt8PtrTy());
+      llvm::Value* slot_in_values_alloca =
+          ConstInBoundsGEP1_32(b_.getInt8PtrTy(), values, i);
+      Store(value_as_i8ptr, slot_in_values_alloca);
+      llvm::Value* slot_in_sizes_alloca =
+          ConstInBoundsGEP1_32(b_.getInt32Ty(), sizes, i);
+      llvm::Value* size = b_.getInt32(ShapeUtil::ByteSizeOfPrimitiveType(
+          sort->operand(i + 1)->shape().element_type()));
+      Store(size, slot_in_sizes_alloca);
+    }
+  }
+
   Call(key_value_sort_func,
-       {PointerCast(keys_destination_address, keys_native_type),
+       {PointerCast(destination_addresses[0], keys_native_type),
         b_.getInt64(higher_dimensions), b_.getInt64(sort_dimension_elements),
-        b_.getInt64(lower_dimensions),
-        values != nullptr
-            ? PointerCast(values_destination_address, b_.getInt8PtrTy())
-            : llvm::Constant::getNullValue(b_.getInt8PtrTy()),
-        b_.getInt32(values != nullptr ? ShapeUtil::ByteSizeOfPrimitiveType(
-                                            values->shape().element_type())
-                                      : 0)});
-
-  if (values != nullptr) {
-    llvm_ir::EmitTuple(GetIrArrayFor(sort),
-                       {keys_destination_address, values_destination_address},
-                       &b_, module_);
+        b_.getInt64(lower_dimensions), values,
+        b_.getInt32(sort->values_count()), sizes});
+
+  if (sort->values_count() > 0) {
+    llvm_ir::EmitTuple(GetIrArrayFor(sort), destination_addresses, &b_,
+                       module_);
   }
   return Status::OK();
 }
@@ -688,8 +700,25 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
   for (size_t i = 0; i < index.size(); ++i) {
     llvm::Value* strided_index =
         NSWMul(index[i], b_.getInt64(window.dimensions(i).stride()));
-    input_index[i] = NSWSub(NSWAdd(strided_index, window_index[i]),
-                            b_.getInt64(window.dimensions(i).padding_low()));
+    input_index[i] = NSWSub(
+        NSWAdd(strided_index,
+               NSWMul(window_index[i],
+                      b_.getInt64(window.dimensions(i).window_dilation()))),
+        b_.getInt64(window.dimensions(i).padding_low()));
+
+    // We need to verify that we are not in the dilated base area.
+    llvm::Value* dilation_condition = ICmpEQ(
+        SRem(input_index[i], b_.getInt64(window.dimensions(i).base_dilation())),
+        b_.getInt64(0));
+    if (in_bounds_condition == nullptr) {
+      in_bounds_condition = dilation_condition;
+    } else {
+      in_bounds_condition = And(in_bounds_condition, dilation_condition);
+    }
+
+    // Apply base dilation to the index.
+    input_index[i] =
+        SDiv(input_index[i], b_.getInt64(window.dimensions(i).base_dilation()));
 
     // We need to check if 0 <= input_index[i] < bound, as otherwise we are in
     // the padding so that we can skip the computation. That is equivalent to
@@ -728,12 +757,6 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
       /*operands=*/{reduce_window->operand(0)},
       /*supported_types=*/{F32, BF16, S32, F16}));
 
-  // TODO(b/31410564): Implement dilation for reduce-window.
-  if (window_util::HasDilation(reduce_window->window())) {
-    return Unimplemented(
-        "Dilation for ReduceWindow is not implemented on CPU.");
-  }
-
   // Pseudo code for reduce window:
   //
   //   for (coordinates O in the output)
@@ -1356,33 +1379,6 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
   return Status::OK();
 }
 
-// Fills up the free variables in 'index_with_free_var' with values from
-// 'filler_index'. The size of free variables must be the same as the
-// size of 'filler_index'.
-//
-// This is often used after dimension reduction, where
-// 'index_with_free_var' has one or more dimensions reduced, which serves as
-// free variables (represented as nullptr). For example, if we have a 4
-// dimensional input and index for the dimension being reduced is
-// 2 (third dimension), we will have an index like [i, j, NULL, k]
-// after reduced dimension.
-//
-// Here we fill up that free variable by 'filler_index', which contains
-// the value in the reduced dimension.
-static llvm_ir::IrArray::Index FillReducedDimensionIndex(
-    llvm_ir::IrArray::Index index_with_free_var,
-    llvm_ir::IrArray::Index filler_index) {
-  llvm_ir::IrArray::Index::const_iterator it = filler_index.begin();
-
-  for (size_t i = 0; i < index_with_free_var.size(); ++i) {
-    if (index_with_free_var[i] == nullptr) {
-      index_with_free_var[i] = *it++;
-    }
-  }
-  CHECK(filler_index.end() == it);
-  return index_with_free_var;
-}
-
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   VLOG(2) << "HandleParameter: " << parameter->ToString();
   return EmitTargetAddressForOp(parameter);
@@ -1513,7 +1509,8 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
 
     case HloOpcode::kMaximum:
       return [root_is_floating_point, root_is_signed](
-                 llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+                 llvm::IRBuilder<>* b, llvm::Value* lhs,
+                 llvm::Value* rhs) -> llvm::Value* {
         if (root_is_floating_point) {
           return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::maxnum,
                                               {lhs, rhs}, {lhs->getType()}, b);
@@ -1528,7 +1525,8 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
 
     case HloOpcode::kMinimum:
       return [root_is_floating_point, root_is_signed](
-                 llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+                 llvm::IRBuilder<>* b, llvm::Value* lhs,
+                 llvm::Value* rhs) -> llvm::Value* {
         if (root_is_floating_point) {
           return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::minnum,
                                               {lhs, rhs}, {lhs->getType()}, b);
@@ -2169,30 +2167,22 @@ Status IrEmitter::HandlePad(HloInstruction* pad) {
   return Status::OK();
 }
 
-// If `hlo` is a Transpose, returns its operand; otherwise returns `hlo` itself.
-static const HloInstruction* StripTranspose(const HloInstruction& hlo) {
-  if (hlo.IsRank2Transpose()) {
-    return hlo.operand(0);
-  }
-  return &hlo;
-}
-
 Status IrEmitter::HandleFusion(HloInstruction* fusion) {
   auto* root = fusion->fused_expression_root();
   if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion, assignment_)) {
     VLOG(3) << "HandleFusion FusedDynamicUpdateSliceInPlace";
     CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_);
     TF_RETURN_IF_ERROR(EmitTargetAddressForOp(fusion));
-
     // Delegate to common implementation of fused in-place dynamic-update-slice.
-    auto operands = GetIrArraysForOperandsOf(fusion);
     return llvm_ir::EmitFusedDynamicUpdateSliceInPlace(
-        fusion, operands, GetIrArrayFor(fusion), &elemental_emitter, &b_);
+        fusion, GetGeneratorForOperandIrArrays(fusion), GetIrArrayFor(fusion),
+        &elemental_emitter, &b_);
   } else if (fusion->fusion_kind() == HloInstruction::FusionKind::kLoop) {
     VLOG(3) << "HandleFusion kLoop";
     CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_);
     auto operands = GetIrArraysForOperandsOf(fusion);
-    FusedIrEmitter fused_emitter(operands, &elemental_emitter);
+    FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(fusion),
+                                 &elemental_emitter);
     TF_RETURN_IF_ERROR(fusion->fused_expression_root()->Accept(&fused_emitter));
 
     return EmitTargetElementLoop(fusion, fused_emitter.GetRootGenerator());
@@ -2392,14 +2382,8 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
       *failure_reason = "operand has mismatching layouts";
       return false;
     }
-    if (LayoutUtil::IsPadded(op->shape())) {
-      *failure_reason = "operand has padded layout";
-      return false;
-    }
   }
 
-  CHECK(!LayoutUtil::IsPadded(concatenate->shape()));
-
   // We split the dimensions into three categories: the dimension over which we
   // are concatenating (concat_dim), the dimensions that are minor to it
   // (inner_dims) and the dimensions that are major to it (outer_dims).
@@ -2581,10 +2565,17 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) {
   return Status::OK();
 }
 
-Status IrEmitter::HandleAfterAll(HloInstruction* gen_token) {
-  TF_RET_CHECK(ByteSizeOf(gen_token->shape()) == 0);
+Status IrEmitter::HandleAfterAll(HloInstruction* after_all) {
+  TF_RET_CHECK(ByteSizeOf(after_all->shape()) == 0);
   // No code to generate, but we need to emit an address for book-keeping.
-  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(gen_token));
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(after_all));
+  return Status::OK();
+}
+
+Status IrEmitter::HandleAddDependency(HloInstruction* add_dependency) {
+  // AddDedendency just forwards its zero-th operand.
+  emitted_value_[add_dependency] =
+      GetEmittedValueFor(add_dependency->operand(0));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 586f27b104ed706a3b128903c6a90abbf3667e59..559a8162a2d53f28ea6817653503c216af90a610 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -59,6 +59,9 @@ namespace cpu {
 class IrEmitter : public DfsHloVisitorWithDefault,
                   public IrBuilderMixin<IrEmitter> {
  public:
+  using GeneratorForOperandIrArrays =
+      std::function<std::vector<llvm_ir::IrArray>()>;
+
   // Create a new LLVM IR emitter.
   //
   // hlo_module: the HLO module we are emitting IR for.
@@ -98,7 +101,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   StatusOr<llvm::Function*> EmitComputation(
       HloComputation* computation, const string& function_name_prefix,
       bool is_top_level_computation,
-      const std::vector<const HloInstruction*>* instruction_order);
+      const std::vector<HloInstruction*>* instruction_order);
 
   llvm::IRBuilder<>* b() { return &b_; }
 
@@ -156,7 +159,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   Status HandleConcatenate(HloInstruction* concatenate) override;
   Status HandleConditional(HloInstruction* conditional) override;
   Status HandleScatter(HloInstruction* scatter) override;
-  Status HandleAfterAll(HloInstruction* gen_token) override;
+  Status HandleAfterAll(HloInstruction* after_all) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
   Status HandleRng(HloInstruction* rng) override;
   Status FinishVisit(HloInstruction* root) override;
 
@@ -208,6 +212,11 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   std::vector<llvm_ir::IrArray> GetIrArraysForOperandsOf(
       const HloInstruction* hlo);
 
+  GeneratorForOperandIrArrays GetGeneratorForOperandIrArrays(
+      HloInstruction* unnested_hlo) {
+    return [=]() { return GetIrArraysForOperandsOf(unnested_hlo); };
+  }
+
   // Augments IrArray with aliasing information.
   void AddAliasingInformationToIrArray(const HloInstruction& hlo,
                                        llvm_ir::IrArray* array) {
@@ -459,9 +468,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   // profiling a computation.
   class ProfilingState {
    public:
-    ProfilingState() : use_rdtscp_(false), prof_counters_(nullptr) {}
-    ProfilingState(bool use_rdtscp, llvm::Value* prof_counters)
-        : use_rdtscp_(use_rdtscp), prof_counters_(prof_counters) {}
+    ProfilingState() : use_rdtscp_(false) {}
+    explicit ProfilingState(bool use_rdtscp) : use_rdtscp_(use_rdtscp) {}
 
     // Record the cycle counter before an HLO executes.
     void RecordCycleStart(llvm::IRBuilder<>* b, HloInstruction* hlo);
@@ -486,9 +494,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
     // intrinsic?
     bool use_rdtscp_;
 
-    // The argument which corresponds to the profile counter buffer.
-    llvm::Value* prof_counters_;
-
     // The first read cycle counter in the program.
     llvm::Value* first_read_cycle_start_ = nullptr;
 
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index cef5e57b0b12b7ae93af0d2508b2b9d6a592d390..f9722ffadac801521ddcbb568dd4435fd02e951b 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
index fad76338a57cd9eb21d9469ca8552efa8ea0129b..f0b65046c14ccec5336abf7c4d05d1d755f783bd 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
@@ -17,13 +17,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
-class ParallelTaskAssignmentTest : public HloVerifiedTestBase {
+class ParallelTaskAssignmentTest : public HloTestBase {
  protected:
   const HloCostAnalysis::ShapeSizeFunction shape_size_func_ =
       cpu::CpuExecutable::ShapeSizeBytes;
@@ -35,7 +35,7 @@ class ParallelTaskAssignmentTest : public HloVerifiedTestBase {
   cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features_;
 
   ParallelTaskAssignmentTest()
-      : HloVerifiedTestBase(), target_machine_features_([](int64 shape_size) {
+      : HloTestBase(), target_machine_features_([](int64 shape_size) {
           return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment;
         }) {}
 
@@ -57,8 +57,9 @@ TEST_F(ParallelTaskAssignmentTest, DotOperationNotParallelized) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get()));
   EXPECT_FALSE(changed);
 }
 
@@ -84,8 +85,9 @@ TEST_F(ParallelTaskAssignmentTest,
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get()));
   EXPECT_FALSE(changed);
 }
 
@@ -100,8 +102,9 @@ TEST_F(ParallelTaskAssignmentTest, RngOperationNotParallelized) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get()));
   EXPECT_FALSE(changed);
 }
 
@@ -116,8 +119,9 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get()));
   EXPECT_FALSE(changed);
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
index e0e7deb98e579c090c8fae320a3ba8a3ce8dbe5f..722aa3120ef4d8c957873ac58c361f19632dde1f 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <limits>
 #include <memory>
 #include <string>
 #include <utility>
@@ -41,66 +42,72 @@ void KeyValueSort(std::pair<KeyType, int64>* row_to_sort, int64 num_elements) {
   std::sort(row_to_sort, row_to_sort + num_elements);
 }
 
-// For floating point numbers, we want a total order comparator. -NaN and NaN
-// should appear at the beginning and end of the ordering, and -0.0 should
-// appear before 0.0. Also we want to have a stable sort, so if the keys are the
-// same, we compare the index values.
-template <typename KeyType>
-bool LessThan(KeyType lhs, int64 lhs_index, KeyType rhs, int64 rhs_index) {
-  bool lhs_is_negative = std::signbit(lhs);
-  bool rhs_is_negative = std::signbit(rhs);
-  // If the signs are different, we can just compare the signs.
-  if (lhs_is_negative != rhs_is_negative) {
-    return lhs_is_negative && !rhs_is_negative;
-  }
-  bool lhs_nan = std::isnan(lhs);
-  bool rhs_nan = std::isnan(rhs);
-  // Exactly one number is nan?
-  if (lhs_nan != rhs_nan) {
-    if (lhs_nan) {
-      return lhs_is_negative;
-    }
-    return !rhs_is_negative;
+// We would like a total order of floating point numbers so that the
+// sort has a predictable behavior in the presence of NaNs. Rather
+// than using floating point comparison, we use the following trick:
+// If f is a float, and
+// x = bit_cast<int32>(f);
+// y = x < 0 ? 0x7FFFFFFF - x : x;
+// then y is ordered as an int32 such that finite values have the
+// obvious order, -0 is ordered before 0, and -NaN and NaN appear at
+// the beginning and end of the ordering.
+template <typename CastType, typename UnsignedCastType, typename KeyType>
+CastType Convert(KeyType value) {
+  CastType casted_value;
+  memcpy(&casted_value, &value, sizeof(CastType));
+  if (casted_value < 0) {
+    return static_cast<UnsignedCastType>(std::numeric_limits<CastType>::max()) -
+           casted_value;
   }
-  if (lhs != rhs) {
-    return lhs < rhs;
-  }
-  return lhs_index < rhs_index;
+  return casted_value;
+}
+
+template <typename CastType, typename UnsignedCastType, typename KeyType>
+bool LessThan(KeyType lhs, KeyType rhs) {
+  return Convert<CastType, UnsignedCastType>(lhs) <
+         Convert<CastType, UnsignedCastType>(rhs);
 }
 
 template <>
 void KeyValueSort(std::pair<double, int64>* row_to_sort, int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements,
-            [](const std::pair<double, int64>& lhs,
-               const std::pair<double, int64>& rhs) -> bool {
-              return LessThan(lhs.first, lhs.second, rhs.first, rhs.second);
-            });
+  std::stable_sort(row_to_sort, row_to_sort + num_elements,
+                   [](const std::pair<double, int64>& lhs,
+                      const std::pair<double, int64>& rhs) -> bool {
+                     return LessThan<int64, uint64>(lhs.first, rhs.first);
+                   });
 }
 
 template <>
 void KeyValueSort(std::pair<float, int64>* row_to_sort, int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements,
-            [](const std::pair<float, int64>& lhs,
-               const std::pair<float, int64>& rhs) -> bool {
-              return LessThan(lhs.first, lhs.second, rhs.first, rhs.second);
-            });
+  std::stable_sort(row_to_sort, row_to_sort + num_elements,
+                   [](const std::pair<float, int64>& lhs,
+                      const std::pair<float, int64>& rhs) -> bool {
+                     return LessThan<int32, uint32>(lhs.first, rhs.first);
+                   });
 }
 
 template <>
 void KeyValueSort(std::pair<Eigen::half, int64>* row_to_sort,
                   int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements,
-            [](const std::pair<Eigen::half, int64>& lhs,
-               const std::pair<Eigen::half, int64>& rhs) -> bool {
-              return LessThan(
-                  Eigen::half_impl::half_to_float(lhs.first), lhs.second,
-                  Eigen::half_impl::half_to_float(rhs.first), rhs.second);
-            });
+  std::stable_sort(row_to_sort, row_to_sort + num_elements,
+                   [](const std::pair<Eigen::half, int64>& lhs,
+                      const std::pair<Eigen::half, int64>& rhs) -> bool {
+                     return LessThan<int32, uint32>(
+                         Eigen::half_impl::half_to_float(lhs.first),
+                         Eigen::half_impl::half_to_float(rhs.first));
+                   });
 }
 
 template <typename KeyType>
-void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char* values,
-                      int32 values_primitive_type_size_in_bytes) {
+void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
+                      int32 values_count,
+                      int32* values_primitive_type_size_in_bytes) {
+  // 'values' and 'values_primitive_type_size_in_bytes' are managed by the JIT
+  // code, so msan can't tell they are initialized.
+  TF_ANNOTATE_MEMORY_IS_INITIALIZED(values, values_count * sizeof(char*));
+  TF_ANNOTATE_MEMORY_IS_INITIALIZED(values_primitive_type_size_in_bytes,
+                                    values_count * sizeof(int32));
+
   // High-level idea of the iteration/sorting logic:
   // Conceptually we have a 3-dimensional shape [a, b, c]. b corresponds to the
   // dimension to sort, c is the product of the more minor dimensions (set to 1
@@ -129,7 +136,7 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char* values,
         index % sort_dimension_offset +
         (index - index % sort_dimension_offset) * sort_dimension_elements;
     // TODO(b/26783907): We could define a custom iterator class that references
-    // both arrays. Then we could avoid the intermediate copy. However this
+    // all arrays. Then we could avoid the intermediate copy. However this
     // would become more complicated, and it is not clear if the benefit is high
     // enough.
     for (int64 i = 0; i < sort_dimension_elements; ++i) {
@@ -140,97 +147,109 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char* values,
     for (int64 i = 0; i < sort_dimension_elements; ++i) {
       keys[base_offset + i * sort_dimension_offset] = row_to_sort[i].first;
     }
-    if (values == nullptr) {
-      continue;
-    }
 
     // Reorder the values according to the order defined by the keys.
-    for (int64 i = 0; i < sort_dimension_elements; ++i) {
-      int64 memory_index =
-          (base_offset + row_to_sort[i].second * sort_dimension_offset) *
-          values_primitive_type_size_in_bytes;
-
-      reordered_values[i] = std::string(values + memory_index,
-                                        values_primitive_type_size_in_bytes);
-    }
-    for (int64 i = 0; i < sort_dimension_elements; ++i) {
-      int64 memory_index = (base_offset + i * sort_dimension_offset) *
-                           values_primitive_type_size_in_bytes;
-      memcpy(values + memory_index, reordered_values[i].c_str(),
-             values_primitive_type_size_in_bytes);
+    for (int32 idx = 0; idx < values_count; ++idx) {
+      for (int64 i = 0; i < sort_dimension_elements; ++i) {
+        int64 memory_index =
+            (base_offset + row_to_sort[i].second * sort_dimension_offset) *
+            values_primitive_type_size_in_bytes[idx];
+
+        reordered_values[i] =
+            std::string(values[idx] + memory_index,
+                        values_primitive_type_size_in_bytes[idx]);
+      }
+      for (int64 i = 0; i < sort_dimension_elements; ++i) {
+        int64 memory_index = (base_offset + i * sort_dimension_offset) *
+                             values_primitive_type_size_in_bytes[idx];
+        memcpy(values[idx] + memory_index, reordered_values[i].c_str(),
+               values_primitive_type_size_in_bytes[idx]);
+      }
     }
   }
 }
 }  // namespace
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortPRED(
-    bool* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    bool* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS8(
-    int8* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    int8* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU8(
-    uint8* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    uint8* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS16(
-    int16* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    int16* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU16(
-    uint16* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    uint16* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF16(
-    Eigen::half* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    Eigen::half* keys, int64 a, int64 b, int64 c, char** values,
+    int32 values_count, int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS32(
-    int32* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    int32* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU32(
-    uint32* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    uint32* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF32(
-    float* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    float* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS64(
-    int64* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    int64* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU64(
-    uint64* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    uint64* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF64(
-    double* keys, int64 a, int64 b, int64 c, char* values,
-    int32 values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_primitive_type_size_in_bytes);
+    double* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes) {
+  KeyValueSortImpl(keys, a, b, c, values, values_count,
+                   values_primitive_type_size_in_bytes);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
index 28e35e82c18cbf078f8a1e7f5b818bf839d3d3df..7821099386969e855ea1737cf53ef49c15c6e93b 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
@@ -22,67 +22,75 @@ limitations under the License.
 extern "C" {
 
 // 'keys' represents a 3-dimensional shape with dimensions [a, b, c]. The 'b'
-// dimension of 'keys' is sorted into ascending order. 'values' can be nullptr.
-// If 'values' is not nullptr, the elements in 'values' are reordered in such a
-// way that if the element at index 'i' in 'keys' was moved to index 'j', the
-// element at index 'i' in 'values' is also moved to index 'j' (which means that
-// the same elements correspond to each other as before).
+// dimension of 'keys' is sorted into ascending order. If 'values_count' is <=
+// 0, 'values' and 'values_primitive_type_size_in_bytes' can be nullptr.
+// If 'values_count' > 0, they contain exactly 'values_count' many elements.
+// Each element of 'values' also represents a 3-dimensional shape with
+// dimensions [a, b, c], and the size of the primitive type of the i-th shape
+// has exactly 'values_primitive_type_size_in_bytes[i]' bytes. The elements in
+// each 'values' shape are reordered in such a way that if the element at index
+// 'i' in 'keys' was moved to index 'j', the element at index 'i' in a 'values'
+// shape is also moved to index 'j' (which means that the same elements
+// correspond to each other as before).
 extern void __xla_cpu_runtime_KeyValueSortPRED(
     bool* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
-    char* values, tensorflow::int32 values_primitive_type_size_in_bytes);
+    char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortS8(
     tensorflow::int8* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char* values,
-    tensorflow::int32 values_primitive_type_size_in_bytes);
+    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortU8(
     tensorflow::uint8* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char* values,
-    tensorflow::int32 values_primitive_type_size_in_bytes);
+    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortS16(
     tensorflow::int16* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char* values,
-    tensorflow::int32 values_primitive_type_size_in_bytes);
+    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortU16(
     tensorflow::uint16* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char* values,
-    tensorflow::int32 values_primitive_type_size_in_bytes);
+    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortF16(
     Eigen::half* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char* values,
-    tensorflow::int32 values_primitive_type_size_in_bytes);
+    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortS32(
     tensorflow::int32* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char* values,
-    tensorflow::int32 values_primitive_type_size_in_bytes);
+    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortU32(
     tensorflow::uint32* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char* values,
-    tensorflow::int32 values_primitive_type_size_in_bytes);
+    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortF32(
     float* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
-    char* values, tensorflow::int32 values_primitive_type_size_in_bytes);
+    char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortS64(
     tensorflow::int64* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char* values,
-    tensorflow::int32 values_primitive_type_size_in_bytes);
+    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortU64(
     tensorflow::uint64* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char* values,
-    tensorflow::int32 values_primitive_type_size_in_bytes);
+    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 
 extern void __xla_cpu_runtime_KeyValueSortF64(
     double* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
-    char* values, tensorflow::int32 values_primitive_type_size_in_bytes);
+    char** values, tensorflow::int32 values_count,
+    tensorflow::int32* values_primitive_type_size_in_bytes);
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_KEY_VALUE_SORT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
index 1a3d82de954318368d61e3feeb0345dc592dcd8b..7d8e51f909e3db699b745f94a6c625407bc4a6e3 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition_test.cc
@@ -19,14 +19,14 @@ limitations under the License.
 #include <random>
 
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
 namespace cpu {
 namespace {
 
-class ShapePartitionAssignerTest : public HloVerifiedTestBase {
+class ShapePartitionAssignerTest : public HloTestBase {
  protected:
   typedef std::vector<int64> Vec;
 
@@ -91,7 +91,7 @@ TEST_F(ShapePartitionAssignerTest, Shape532WithLayout201) {
             expected_partitions);
 }
 
-class ShapePartitionIteratorTest : public HloVerifiedTestBase {
+class ShapePartitionIteratorTest : public HloTestBase {
  protected:
   typedef std::vector<std::pair<int64, int64>> Partition;
 };
@@ -145,7 +145,7 @@ TEST_F(ShapePartitionIteratorTest, Shape532WithLayout210) {
   }
 }
 
-class RandomShapePartitionIteratorTest : public HloVerifiedTestBase {
+class RandomShapePartitionIteratorTest : public HloTestBase {
  protected:
   typedef std::vector<std::pair<int64, int64>> Partition;
   RandomShapePartitionIteratorTest()
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 9ec0c8f65705db335379649def746921e6b05bea..efccadedf27181a4cddf4f1dc3610f7c6db1d821 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -108,15 +108,15 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
           [](llvm::Error Err) {
             cantFail(std::move(Err), "lookupFlags failed");
           })),
-      object_layer_(execution_session_,
-                    [this](llvm::orc::VModuleKey) {
-                      llvm::orc::RTDyldObjectLinkingLayer::Resources result;
-                      result.MemMgr =
-                          std::make_shared<llvm::SectionMemoryManager>(
-                              orc_jit_memory_mapper::GetInstance());
-                      result.Resolver = symbol_resolver_;
-                      return result;
-                    }),
+      object_layer_(
+          execution_session_,
+          [this](llvm::orc::VModuleKey) {
+            llvm::orc::LegacyRTDyldObjectLinkingLayer::Resources result;
+            result.MemMgr = std::make_shared<llvm::SectionMemoryManager>(
+                orc_jit_memory_mapper::GetInstance());
+            result.Resolver = symbol_resolver_;
+            return result;
+          }),
       compile_layer_(object_layer_,
                      CompilerFunctor(target_machine_.get(), &disassembler_,
                                      opt_level, optimize_for_size,
@@ -128,8 +128,18 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
 }
 
 llvm::JITSymbol SimpleOrcJIT::ResolveRuntimeSymbol(const std::string& name) {
-  void* func_addr = CustomCallTargetRegistry::Global()->Lookup(name);
+  void* func_addr = nullptr;
+  if (name.size() > 1 && name.front() == data_layout_.getGlobalPrefix()) {
+    // On Mac OS X, 'name' may have a leading underscore prefix, even though the
+    // registered name may not.
+    std::string stripped_name(name.begin() + 1, name.end());
+    func_addr = CustomCallTargetRegistry::Global()->Lookup(stripped_name);
+  } else {
+    func_addr = CustomCallTargetRegistry::Global()->Lookup(name);
+  }
+
   if (func_addr == nullptr) {
+    VLOG(2) << "Unable to resolve runtime symbol: " << name;
     return nullptr;
   }
   llvm::JITEvaluatedSymbol symbol_info(reinterpret_cast<uint64_t>(func_addr),
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index d74b63fcf45bd70cd18ee41f1e9714ba6a222abd..78406ba143570183aea09d79db3f9b708c21bf70 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -44,9 +44,9 @@ namespace cpu {
 // it's added to the JIT.
 class SimpleOrcJIT {
  public:
-  using ObjLayerT = llvm::orc::RTDyldObjectLinkingLayer;
+  using ObjLayerT = llvm::orc::LegacyRTDyldObjectLinkingLayer;
   using CompileFtor = std::function<ObjLayerT::ObjectPtr(llvm::Module&)>;
-  using CompileLayerT = llvm::orc::IRCompileLayer<ObjLayerT, CompileFtor>;
+  using CompileLayerT = llvm::orc::LegacyIRCompileLayer<ObjLayerT, CompileFtor>;
   using VModuleKeyT = llvm::orc::VModuleKey;
 
   // Create a new JIT, targeting the host architecture.
diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD
index 4b129c95d46d8b5a119e5d23eef387daf7863cce..382dfd0d99df87bbadfe541ddaa32cd6da8e8068 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD
@@ -48,7 +48,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service/cpu:cpu_instruction_fusion",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
index 18ee25ba9158c28baaf01492c290638b9673f1ec..f8f5f392da8ab3348e63185aecf7b639daacaa42 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
@@ -50,7 +50,7 @@ class CpuEigenDotOperationTest
         /*entry_point_name=*/"entry",
         /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static};
 
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewVerifiedModule();
     hlo_module->AddEntryComputation(std::move(entry_computation));
 
     CompileAheadOfTimeAndVerifyIr(std::move(hlo_module), options,
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
index 00a7aa2ad2f6bac4877302296ccb76222557535c..e30f95311fce229f9c559d3bb40142151e8bf3e3 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
@@ -46,7 +46,7 @@ class CpuExternalConstantsTest : public CpuCodegenTest {
     builder.AddInstruction(
         HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, constant));
 
-    std::unique_ptr<HloModule> module = CreateNewModule();
+    std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
     module->AddEntryComputation(builder.Build());
 
     CompileAndVerifyIr(std::move(module), filecheck_pattern,
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
index 1deb412064b02988a8d4a6d726969c948d354d47..04a81dfd35f459ff1fdb3181dc8fc65c62a37d4f 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
@@ -34,7 +34,7 @@ namespace xla {
 namespace cpu {
 namespace {
 
-class CpuFusionTest : public HloVerifiedTestBase {
+class CpuFusionTest : public HloTestBase {
  protected:
   CpuFusionTest() {}
 
@@ -57,11 +57,11 @@ TEST_F(CpuFusionTest, FuseTwoElementwiseOps) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, add1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   // The computation root instruction was fused. Verify the fusion instruction
   // is now the root.
@@ -104,11 +104,11 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, two, floor));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   // The computation root instruction was fused. Verify the fusion instruction
   // is now the root.
@@ -131,7 +131,7 @@ TEST_F(CpuFusionTest, FuseElementwiseOpChain) {
 TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusibleInstruction) {
   // Test a chain of fusible ops with a non-fusible op (a reduce) thrown in the
   // middle.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto input_literal = LiteralUtil::CreateR1<float>({-1.5, -2.5, -3.0});
   Shape vshape = input_literal.shape();
@@ -183,7 +183,7 @@ TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusibleInstruction) {
   module->AddEntryComputation(builder.Build());
 
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   // The computation root instruction was fused. Verify the fusion instruction
   // is now the root.
@@ -250,12 +250,12 @@ TEST_F(CpuFusionTest, TestOperandOrderToAvoidDuplication) {
       builder.AddInstruction(HloInstruction::CreateTuple({add1, add2}));
 
   // Create computation and module.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   // Run fusion.
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   auto fusion1 = result->operand(0);
   auto fusion2 = result->operand(1);
@@ -310,11 +310,11 @@ TEST_F(CpuFusionTest, DoNotDuplicateExpensiveOps) {
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({negate1, negate2, exp2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   CpuInstructionFusion fusion;
-  EXPECT_TRUE(fusion.Run(module).ValueOrDie());
+  EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie());
 
   // The only fusion instruction should be operand 0 of the tuple (formerly
   // negate1).
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
index a434c04a980b9b3cd849792b97a0d9e965ba09f2..9b10c49f4f547edfb2164f98c49cceb031148bdc 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
@@ -91,7 +91,7 @@ TEST_P(CpuUnaryIntrinsicTest, DoIt) {
       /*entry_point_name=*/"entry",
       /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static};
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   string check_lines{spec.check_lines.data(), spec.check_lines.size()};
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index 3b87683ffffefd2aa24dd234cc072425bef00a24..fa0e09ff6b5694c0e97963b83c6e541b858a1376 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -63,7 +63,7 @@ CHECK-NOT: private constant [48 x i8]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_text));
+                          ParseAndReturnVerifiedModule(hlo_text));
 
   CpuAotCompilationOptions options{
       /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
@@ -104,14 +104,14 @@ ENTRY main {
 )";
 
   string filecheck_pattern = R"(
-CHECK: private constant [4 x i8]
-CHECK: private constant [8 x i8]
+CHECK-DAG: private constant [4 x i8]
+CHECK-DAG: private constant [8 x i8]
 CHECK-NOT: private constant [4 x i8]
 CHECK-NOT: private constant [8 x i8]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_text));
+                          ParseAndReturnVerifiedModule(hlo_text));
 
   CpuAotCompilationOptions options{
       /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
index b35fd9dad877c319c3d0110c96a00aeefa78769e..a7702c2aeeaff8a46a2c4f2785ccb873ea2c08e5 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
@@ -56,7 +56,7 @@ TEST_F(CpuNoAliasTest, Concat) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   // Now that we have an HLO module, build an llvm_ir::AliasAnalysis for it.
diff --git a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
index 990ff94ba2338cb663b655ca3106bda83ab718a3..70008947f371d25e95d02839c30ba822fce7a292 100644
--- a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <deque>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
diff --git a/tensorflow/compiler/xla/service/defuser_test.cc b/tensorflow/compiler/xla/service/defuser_test.cc
index e727ba49cb6321e499b5d50d5f45e7f7f6bb6fef..64fb50318394918b277fd717994f5366d762ac36 100644
--- a/tensorflow/compiler/xla/service/defuser_test.cc
+++ b/tensorflow/compiler/xla/service/defuser_test.cc
@@ -18,19 +18,19 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace op = xla::testing::opcode_matchers;
 
 namespace xla {
 namespace {
 
-class DefuserTest : public HloVerifiedTestBase {
+class DefuserTest : public HloTestBase {
  protected:
   // Returns the number of fusion instructions in the module.
-  int FusionCount() {
+  int FusionCount(const HloModule* m) {
     int count = 0;
-    for (HloComputation* computation : module().computations()) {
+    for (HloComputation* computation : m->computations()) {
       if (computation->IsFusionComputation()) {
         count++;
       }
@@ -43,6 +43,7 @@ class DefuserTest : public HloVerifiedTestBase {
 };
 
 TEST_F(DefuserTest, NoFusionInstruction) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -51,13 +52,14 @@ TEST_F(DefuserTest, NoFusionInstruction) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, param0, param1));
 
-  module().AddEntryComputation(builder.Build());
-  EXPECT_EQ(0, FusionCount());
+  m->AddEntryComputation(builder.Build());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
-  EXPECT_FALSE(defuser_.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(defuser_.Run(m.get()).ValueOrDie());
 }
 
 TEST_F(DefuserTest, TrivialFusionInstructionAsRoot) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -66,21 +68,22 @@ TEST_F(DefuserTest, TrivialFusionInstructionAsRoot) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, param0, param1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({add},
                                        HloInstruction::FusionKind::kLoop);
 
   EXPECT_THAT(computation->root_instruction(), op::Fusion());
 
-  EXPECT_EQ(1, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(1, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Parameter(), op::Parameter()));
 }
 
 TEST_F(DefuserTest, TrivialFusionInstructionNotAsRoot) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -91,21 +94,22 @@ TEST_F(DefuserTest, TrivialFusionInstructionNotAsRoot) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(shape_, HloOpcode::kNegate, add));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({add},
                                        HloInstruction::FusionKind::kLoop);
 
   EXPECT_THAT(computation->root_instruction(), op::Negate(op::Fusion()));
 
-  EXPECT_EQ(1, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(1, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(),
               op::Negate(op::Add(op::Parameter(), op::Parameter())));
 }
 
 TEST_F(DefuserTest, NonTrivialFusionInstruction) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -128,22 +132,23 @@ TEST_F(DefuserTest, NonTrivialFusionInstruction) {
   auto add2 = builder.AddInstruction(
       HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, constant, div));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction(
       {add2, constant, div, mul, sub, negate, add},
       HloInstruction::FusionKind::kLoop);
 
   EXPECT_THAT(computation->root_instruction(), op::Fusion());
 
-  EXPECT_EQ(1, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(1, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Constant(), op::Divide()));
 }
 
 TEST_F(DefuserTest, MultipleFusionInstructions) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -166,7 +171,7 @@ TEST_F(DefuserTest, MultipleFusionInstructions) {
   auto add2 = builder.AddInstruction(
       HloInstruction::CreateBinary(shape_, HloOpcode::kAdd, constant, div));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({add2, constant, div, mul},
                                        HloInstruction::FusionKind::kLoop);
   computation->CreateFusionInstruction({sub, negate, add},
@@ -174,15 +179,16 @@ TEST_F(DefuserTest, MultipleFusionInstructions) {
 
   EXPECT_THAT(computation->root_instruction(), op::Fusion());
 
-  EXPECT_EQ(2, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(2, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Constant(), op::Divide()));
 }
 
 TEST_F(DefuserTest, NestedFusionInstructions) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param0 =
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape_, "p0"));
@@ -193,7 +199,7 @@ TEST_F(DefuserTest, NestedFusionInstructions) {
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(shape_, HloOpcode::kNegate, add));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   auto outer_fusion = computation->CreateFusionInstruction(
       {negate, add}, HloInstruction::FusionKind::kLoop);
   HloInstruction* fused_negate = outer_fusion->fused_expression_root();
@@ -203,9 +209,9 @@ TEST_F(DefuserTest, NestedFusionInstructions) {
 
   EXPECT_THAT(computation->root_instruction(), op::Fusion());
 
-  EXPECT_EQ(2, FusionCount());
-  EXPECT_TRUE(defuser_.Run(&module()).ValueOrDie());
-  EXPECT_EQ(0, FusionCount());
+  EXPECT_EQ(2, FusionCount(m.get()));
+  EXPECT_TRUE(defuser_.Run(m.get()).ValueOrDie());
+  EXPECT_EQ(0, FusionCount(m.get()));
 
   EXPECT_THAT(computation->root_instruction(), op::Negate(op::Add()));
 }
diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc
index b3549acfc291a54b2345b006310613c3a45a4b47..ed37099a5428075928ec98b134632867d58bbfe7 100644
--- a/tensorflow/compiler/xla/service/despecializer.cc
+++ b/tensorflow/compiler/xla/service/despecializer.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
 #include "tensorflow/compiler/xla/service/defuser.h"
+#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
 #include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
 
 namespace xla {
@@ -45,6 +46,7 @@ class ControlDepRemover : public HloModulePass {
 
 Despecializer::Despecializer() : pipeline_("despecializer") {
   // TODO(b/70588125): Also deal with window reversal in a fast way.
+  pipeline_.AddPass<HloDescheduler>();
   pipeline_.AddPass<ControlDepRemover>();
   pipeline_.AddPass<Defuser>();
   pipeline_.AddPass<ImplicitBroadcastRemover>();
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc
index edbcb25247421cdb50a845df1ec8b1851970efe3..e1e3b156fb34fd128864ed34c6d9d055294672bf 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/strings/numbers.h"
 
 namespace xla {
 
@@ -39,6 +40,10 @@ StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
         "Failed to allocate request for %s (%uB) on device ordinal %d",
         tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal);
   }
+  VLOG(3) << absl::StreamFormat(
+      "Allocated %s (%uB) on device ordinal %d: %p",
+      tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal,
+      result.opaque());
   return OwningDeviceMemory(result, device_ordinal, this);
 }
 
@@ -47,6 +52,8 @@ Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
   if (!mem.is_null()) {
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
                         GetStreamExecutor(device_ordinal));
+    VLOG(3) << absl::StreamFormat("Freeing %p on device ordinal %d",
+                                  mem.opaque(), device_ordinal);
     stream_executor->Deallocate(&mem);
   }
   return Status::OK();
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc
index 3e7373adc5ab8a60fd18348ce2477175aaaa8fd4..c54f81e6915a286757e59821c2684a7271889816 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc
@@ -50,7 +50,7 @@ void DfsHloVisitorBase<HloInstructionPtr>::SetVisiting(
     const HloInstruction& instruction) {
   VLOG(3) << "marking HLO " << &instruction << " as visiting: ";
   DCHECK(NotVisited(instruction));
-  visit_state_.SetState(instruction.unique_id(), VisitState::kVisiting);
+  visit_state_[instruction.unique_id()] = VisitState::kVisiting;
 }
 
 template <typename HloInstructionPtr>
@@ -58,7 +58,7 @@ void DfsHloVisitorBase<HloInstructionPtr>::SetVisited(
     const HloInstruction& instruction) {
   VLOG(3) << "marking HLO " << &instruction << " as visited: ";
   DCHECK(NotVisited(instruction) || IsVisiting(instruction));
-  visit_state_.SetState(instruction.unique_id(), VisitState::kVisited);
+  visit_state_[instruction.unique_id()] = VisitState::kVisited;
 }
 
 template <typename HloInstructionPtr>
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 68d01d75a2ed3d7eaadb03a46ba3bd20f43a9ffc..e84bf00153aa28df29d8df486b92654feab4afbf 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -107,6 +108,7 @@ class DfsHloVisitorBase {
   virtual Status HandleCrossReplicaSum(HloInstructionPtr hlo) = 0;
   virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0;
   virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0;
+  virtual Status HandleGetDimensionSize(HloInstructionPtr hlo) = 0;
   virtual Status HandleCompare(HloInstructionPtr hlo) {
     return HandleElementwiseBinary(hlo);
   }
@@ -249,6 +251,7 @@ class DfsHloVisitorBase {
 
   virtual Status HandleBatchNormGrad(HloInstructionPtr hlo) = 0;
 
+  virtual Status HandleAddDependency(HloInstructionPtr add_dependency) = 0;
   virtual Status HandleAfterAll(HloInstructionPtr token) = 0;
 
   // Invoked to inform the visitor that the traversal has completed, and that
@@ -263,21 +266,25 @@ class DfsHloVisitorBase {
     kVisited = 2,
   };
 
-  VisitState GetVisitState(int id) { return visit_state_.GetState(id); }
+  VisitState GetVisitState(int id) {
+    auto iter = visit_state_.find(id);
+    if (iter == visit_state_.end()) {
+      return VisitState::kNotVisited;
+    }
+    return iter->second;
+  }
   VisitState GetVisitState(const HloInstruction& instruction);
 
   // Resize internal state if necessary to hold state for ids <= num.
   // This call is purely a performance hint and can be omitted without
   // affecting correctness.
-  void ReserveVisitStates(int num) { visit_state_.Reserve(num); }
+  void ReserveVisitStates(int num) { visit_state_.reserve(num); }
 
   // Useful when we want to visit the same computation more than once with the
   // same visitor.
-  void ResetVisitStates() { visit_state_.Reset(); }
+  void ResetVisitStates() { visit_state_.clear(); }
 
-  void SetVisitState(int id, VisitState state) {
-    visit_state_.SetState(id, state);
-  }
+  void SetVisitState(int id, VisitState state) { visit_state_[id] = state; }
 
   // Sets the visitation state of the given instruction as kVisiting.
   //
@@ -326,44 +333,7 @@ class DfsHloVisitorBase {
   virtual Status Postprocess(HloInstructionPtr hlo);
 
  private:
-  class DFSVisitStates {
-   public:
-    DFSVisitStates() {}
-    void Reserve(uint64 num) {
-      states_.reserve((num + kStatesPerWord - 1) / kStatesPerWord);
-    }
-    VisitState GetState(uint64 id) {
-      uint64 word_index = id / kStatesPerWord;
-      if (word_index >= states_.size()) {
-        return VisitState::kNotVisited;
-      }
-      static_assert(static_cast<int>(VisitState::kVisited) < 3,
-                    "VisitState must fit in two bits");
-      uint64 w = states_[word_index];
-      uint32 shift = 2 * (id % kStatesPerWord);  // 2 bits per state
-      return static_cast<VisitState>((w >> shift) & 0x3);
-    }
-    void SetState(uint64 id, VisitState state) {
-      uint64 word_index = id / kStatesPerWord;
-      if (word_index >= states_.size()) {
-        states_.resize(word_index + 1, 0);
-      }
-      uint64* w = &states_[word_index];
-      uint32 shift = 2 * (id % kStatesPerWord);  // 2 bits per state
-      uint64 mask = 0x3ull << shift;
-      *w = (*w & ~mask) | (static_cast<uint64>(state) << shift);
-      DCHECK_EQ(GetState(id), state);
-    }
-    void Reset() { states_.clear(); }
-
-   private:
-    static const uint32 kStatesPerWord = sizeof(uint64) / 2 /*bits per entry*/;
-    // Map from id to two-bit states.  We store 32 such states per 64-bit
-    // value
-    std::vector<uint64> states_;
-  };
-
-  DFSVisitStates visit_state_;
+  absl::flat_hash_map<int, VisitState> visit_state_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(DfsHloVisitorBase);
 };
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 4cd10ab06cd3b804406607212d3f3c316d6cff95..80ea5be298aea44a0f424398da74c4e478f10346 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -203,6 +203,12 @@ class DfsHloVisitorWithDefaultBase
   Status HandleAfterAll(HloInstructionPtr token) override {
     return DefaultAction(token);
   }
+  Status HandleGetDimensionSize(HloInstructionPtr get_size) override {
+    return DefaultAction(get_size);
+  }
+  Status HandleAddDependency(HloInstructionPtr add_dependency) override {
+    return DefaultAction(add_dependency);
+  }
 
   // Invoked to inform the visitor that the traversal has completed, and that
   // the root was "root".
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d0472689bf48092ceef2e9792c1358687d707ec
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
@@ -0,0 +1,459 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+
+namespace {
+bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
+  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
+         window_dimension.padding_low() == 0 &&
+         window_dimension.padding_high() == 0 &&
+         window_dimension.window_dilation() == 1 &&
+         window_dimension.base_dilation() == 1;
+}
+}  // namespace
+
+class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit DynamicDimensionInferenceVisitor(
+      const DynamicParameterBinding& param_bindings,
+      DynamicDimensionInference* parent)
+      : param_bindings_(param_bindings), parent_(parent) {}
+
+  Status DefaultAction(HloInstruction* hlo) override;
+
+  static Status Run(HloComputation* computation,
+                    const DynamicParameterBinding& param_bindings,
+                    DynamicDimensionInference* parent) {
+    DynamicDimensionInferenceVisitor visitor(param_bindings, parent);
+    return computation->Accept(&visitor);
+  }
+
+  Status HandleParameter(HloInstruction* hlo) override;
+
+  Status HandleReduce(HloInstruction* hlo) override;
+
+  Status HandleDot(HloInstruction* hlo) override;
+
+  Status HandleTranspose(HloInstruction* hlo) override;
+
+  Status HandleReshape(HloInstruction* hlo) override;
+
+  Status HandlePad(HloInstruction* hlo) override;
+
+  Status HandleBroadcast(HloInstruction* hlo) override;
+
+  Status HandleGetDimensionSize(HloInstruction* hlo) override;
+
+  Status HandleSelect(HloInstruction* hlo) override;
+
+  Status HandleConvolution(HloInstruction* hlo) override;
+
+  Status HandleReduceWindow(HloInstruction* hlo) override;
+
+  Status HandleSelectAndScatter(HloInstruction* hlo) override;
+
+  Status HandleGetTupleElement(HloInstruction* hlo) override;
+
+  Status HandleElementwiseUnary(HloInstruction* hlo) override;
+
+  Status HandleElementwiseBinary(HloInstruction* hlo) override;
+
+ private:
+  using OperandDynamicDimensionFn = std::function<Status(
+      HloInstruction* operand, ShapeIndex index, int64 dimension,
+      int64 operand_index, HloInstruction* dynamic_size)>;
+
+  Status ForEachOperandDynamicDimension(HloInstruction* inst,
+                                        const OperandDynamicDimensionFn&);
+
+  // Pass through a dynamic dimension from the input to the output with the same
+  // value and index in the shape. This is a helper function to handle trivial
+  // instructions like elementwise operations.
+  Status PassThroughDynamicDimension(HloInstruction*);
+
+  // The dynamic parameter bindings of this computation.
+  const DynamicParameterBinding& param_bindings_;
+
+  // A pointer to DynamicDimensionInference, used to update the dynamic mapping.
+  DynamicDimensionInference* parent_;
+};
+
+Status DynamicDimensionInferenceVisitor::DefaultAction(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        return UnimplementedStrCat(
+            "Asked to propagate a dynamic dimension from hlo ",
+            operand->ToString(), "@", index.ToString(), "@", dimension,
+            " to hlo ", hlo->ToString(), ", which is not implemented.");
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleGetTupleElement(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        if (hlo->tuple_index() == index[0]) {
+          ShapeIndex new_index =
+              ShapeIndexView(index).ConsumeFront().ToShapeIndex();
+          parent_->SetDynamicSize(hlo, new_index, dimension, dynamic_size);
+        }
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleBroadcast(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        int64 broadcast_dim = hlo->dimensions(dimension);
+        parent_->SetDynamicSize(hlo, index, broadcast_dim, dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandlePad(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        if (operand_index != 0) {
+          return Unimplemented(
+              "Dynamic dimension on padding value is not supported");
+        }
+        const PaddingConfig_PaddingConfigDimension& padding_config =
+            hlo->padding_config().dimensions(dimension);
+        if (padding_config.interior_padding() == 0 &&
+            padding_config.edge_padding_low() == 0 &&
+            padding_config.edge_padding_high() == 0) {
+          parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size);
+          return Status::OK();
+        } else {
+          return Unimplemented(
+              "Dynamic dimension propagation on padding dimension is not "
+              "supported.");
+        }
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleReduce(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* reduce = hlo;
+        int64 operand_count = reduce->operand_count();
+        CHECK_EQ(operand_count % 2, 0);
+        if (operand_index >= operand_count / 2) {
+          // Init values doesn't have dynamic size.
+          return Status::OK();
+        }
+        if ((absl::c_count(reduce->dimensions(), dimension) != 0)) {
+          // Dimension is to be reduce, stop tracing.
+          return Status::OK();
+        }
+
+        // Find out the new dynamic dimension after reduce.
+        int64 dimensions_not_reduced_count = 0;
+        for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+          if (dimension == i) {
+            parent_->SetDynamicSize(reduce, {}, dimensions_not_reduced_count,
+                                    dynamic_size);
+
+            return Status::OK();
+          }
+          if (absl::c_count(reduce->dimensions(), i) == 0) {
+            dimensions_not_reduced_count++;
+          }
+        }
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* dot = hlo;
+        const DotDimensionNumbers& dimension_numbers =
+            dot->dot_dimension_numbers();
+        // A map from the operand dimensions to result dimension.
+        absl::flat_hash_map<int64, int64> result_dim_mapping;
+        int64 current_result_dims = 0;
+        std::unordered_set<int64> batch_dims(
+            dimension_numbers.rhs_batch_dimensions().begin(),
+            dimension_numbers.rhs_batch_dimensions().end());
+
+        for (int64 i : dimension_numbers.rhs_batch_dimensions()) {
+          result_dim_mapping[i] = current_result_dims++;
+        }
+
+        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(0)->shape()); i++) {
+          if (!absl::c_linear_search(
+                  dimension_numbers.lhs_contracting_dimensions(), i)) {
+            if (operand_index == 0) {
+              result_dim_mapping[i] = current_result_dims;
+            }
+            current_result_dims++;
+          }
+        }
+
+        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(1)->shape()); i++) {
+          if (!absl::c_linear_search(
+                  dimension_numbers.rhs_contracting_dimensions(), i) &&
+              !absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(),
+                                     i)) {
+            if (operand_index == 1) {
+              result_dim_mapping[i] = current_result_dims;
+            }
+            current_result_dims++;
+          }
+        }
+
+        // Check if the operand dim is in the result shape. If so, add another
+        // work item to trace that dimension.
+        auto iter = result_dim_mapping.find(dimension);
+        if (iter != result_dim_mapping.end()) {
+          parent_->SetDynamicSize(dot, {}, iter->second, dynamic_size);
+        }
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleTranspose(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        parent_->SetDynamicSize(hlo, {}, hlo->dimensions()[dimension],
+                                dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleConvolution(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* conv = hlo;
+        const ConvolutionDimensionNumbers& dimension_numbers =
+            conv->convolution_dimension_numbers();
+
+        if (operand_index == 0) {
+          if (dimension == dimension_numbers.input_batch_dimension()) {
+            parent_->SetDynamicSize(conv, {},
+                                    dimension_numbers.output_batch_dimension(),
+                                    dynamic_size);
+            return Status::OK();
+          }
+
+          if (dimension == dimension_numbers.input_feature_dimension()) {
+            return Status::OK();
+          }
+        } else {
+          if (dimension == dimension_numbers.kernel_input_feature_dimension()) {
+            return Status::OK();
+          }
+        }
+
+        return Unimplemented("Dynamic Spatial Convolution is not supported: %s",
+                             conv->ToString());
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleGetDimensionSize(
+    HloInstruction*) {
+  // Dynamic dimension doesn't propagate through GetDimensionSize:
+  //
+  //   Input: F32[x, y, z]
+  //     |
+  //   GetDimensionSize(1): U32[]
+  //
+  // The returned value is a scalar, which doesn't have any dynamic dimension in
+  // the shape (although the value contains the real size of the dynamic
+  // dimension of the input).
+  return Status::OK();
+}
+
+Status DynamicDimensionInferenceVisitor::PassThroughDynamicDimension(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        parent_->SetDynamicSize(hlo, index, dimension, dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleElementwiseUnary(
+    HloInstruction* hlo) {
+  return PassThroughDynamicDimension(hlo);
+}
+
+Status DynamicDimensionInferenceVisitor::HandleSelect(HloInstruction* hlo) {
+  return PassThroughDynamicDimension(hlo);
+}
+
+Status DynamicDimensionInferenceVisitor::HandleElementwiseBinary(
+    HloInstruction* hlo) {
+  return PassThroughDynamicDimension(hlo);
+}
+
+Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* reshape = hlo;
+        std::vector<std::pair<int64, int64>> unmodified_dims =
+            ShapeUtil::DimensionsUnmodifiedByReshape(operand->shape(),
+                                                     reshape->shape());
+        for (auto& unmodified : unmodified_dims) {
+          if (unmodified.first == dimension) {
+            parent_->SetDynamicSize(reshape, {}, unmodified.second,
+                                    dynamic_size);
+            return Status::OK();
+          }
+        }
+        return Unimplemented(
+            "Dynamic Reshape on modified dimensions is yet not supported: %s",
+            reshape->ToString());
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleReduceWindow(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* reduce_window = hlo;
+        const WindowDimension& window_dimension =
+            reduce_window->window().dimensions(dimension);
+
+        if (!IsTrivialWindowDimension(window_dimension)) {
+          return Unimplemented(
+              "Dynamic Spatial reduce window is not supported: %s",
+              reduce_window->ToString());
+        }
+
+        parent_->SetDynamicSize(reduce_window, {}, dimension, dynamic_size);
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleSelectAndScatter(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* select_and_scatter = hlo;
+        const WindowDimension& window_dimension =
+            select_and_scatter->window().dimensions(dimension);
+
+        if (!IsTrivialWindowDimension(window_dimension)) {
+          return Unimplemented(
+              "Dynamic Spatial select and scatter is not supported: %s",
+              select_and_scatter->ToString());
+        }
+
+        parent_->SetDynamicSize(select_and_scatter, {}, dimension,
+                                dynamic_size);
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleParameter(HloInstruction* hlo) {
+  return param_bindings_.ForEachBinding(
+      [&](const DynamicParameterBinding::DynamicParameter& dynamic_parameter,
+          const DynamicParameterBinding::DynamicDimension& dynamic_dimension) {
+        if (dynamic_dimension.parameter_num != hlo->parameter_number()) {
+          return Status::OK();
+        }
+        HloComputation* computation = hlo->parent();
+        HloInstruction* target_parameter =
+            computation->parameter_instruction(dynamic_dimension.parameter_num);
+
+        HloInstruction* dynamic_size =
+            computation->parameter_instruction(dynamic_parameter.parameter_num);
+        for (int64 i : dynamic_parameter.parameter_index) {
+          dynamic_size =
+              computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+                  ShapeUtil::GetSubshape(dynamic_size->shape(), {i}),
+                  dynamic_size, i));
+        }
+
+        parent_->SetDynamicSize(target_parameter,
+                                dynamic_dimension.parameter_index,
+                                dynamic_dimension.dimension, dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::ForEachOperandDynamicDimension(
+    HloInstruction* inst, const OperandDynamicDimensionFn& fn) {
+  for (int64 operand_index = 0; operand_index < inst->operand_count();
+       ++operand_index) {
+    auto iter =
+        parent_->per_hlo_dynamic_dimensions_.find(inst->operand(operand_index));
+    if (iter != parent_->per_hlo_dynamic_dimensions_.end()) {
+      for (auto& dynamic_dimension : iter->second) {
+        HloInstruction* dynamic_size = parent_->GetDynamicSize(
+            dynamic_dimension.inst, dynamic_dimension.index,
+            dynamic_dimension.dim);
+        TF_RETURN_IF_ERROR(fn(dynamic_dimension.inst, dynamic_dimension.index,
+                              dynamic_dimension.dim, operand_index,
+                              dynamic_size));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+/* static */
+StatusOr<DynamicDimensionInference> DynamicDimensionInference::Run(
+    HloModule* module) {
+  VLOG(0) << "Param Config " << module->dynamic_parameter_binding().ToString();
+  DynamicDimensionInference inference(module);
+  TF_RETURN_IF_ERROR(inference.AnalyzeDynamicDimensions());
+  return inference;
+}
+
+DynamicDimensionInference::DynamicDimensionInference(HloModule* module)
+    : module_(module) {}
+
+Status DynamicDimensionInference::AnalyzeDynamicDimensions() {
+  return DynamicDimensionInferenceVisitor::Run(
+      module_->entry_computation(), module_->dynamic_parameter_binding(), this);
+}
+
+HloInstruction* DynamicDimensionInference::GetDynamicSize(
+    HloInstruction* inst, const ShapeIndex& index, int64 dim) const {
+  auto iter = dynamic_mapping_.find(DynamicDimension{inst, index, dim});
+  if (iter != dynamic_mapping_.end()) {
+    return iter->second;
+  }
+  return nullptr;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..164d15bf111a92e3da957f609b54ee0662ef18b1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_DIMENSION_INFERENCE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_DIMENSION_INFERENCE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+// DynamicDimensionInference analyzes each HLO instruction in a graph and
+// inferences which dimensions are dynamic and which scalar instructions
+// represent the runtime real size of those dynamic dimensions.
+class DynamicDimensionInference {
+ public:
+  static StatusOr<DynamicDimensionInference> Run(HloModule* module);
+
+  string ToString() const;
+
+  // If the dimension `dim` of instruction `inst` at `index` has a dynamic size,
+  // returns a scalar HloInstruction that represents the runtime size of that
+  // dimension. Otherwise returns nullptr.
+  HloInstruction* GetDynamicSize(HloInstruction* inst, const ShapeIndex& index,
+                                 int64 dim) const;
+
+  friend class DynamicDimensionInferenceVisitor;
+
+ private:
+  explicit DynamicDimensionInference(HloModule* module);
+
+  // DynamicDimension is used as a key in the dynamic key-value mapping. It
+  // unambiguously represents a dynamic dimension of a instruction at a given
+  // index.
+  struct DynamicDimension {
+    // HloInstruction that holds the dimension.
+    HloInstruction* inst;
+    // Subshape of the instruction that holds the dimension.
+    ShapeIndex index;
+    // The dimension number of the dynamic dimension at given index of a given
+    // instruction.
+    int64 dim;
+
+    // Artifacts needed to make this struct able to be used as a `key` in absl
+    // maps. "friend" keywords are added so these functions can be found through
+    // ADL.
+    template <typename H>
+    friend H AbslHashValue(H h, const DynamicDimension& m) {
+      return H::combine(std::move(h), m.inst, m.index, m.dim);
+    }
+
+    friend bool operator==(const DynamicDimension& lhs,
+                           const DynamicDimension& rhs) {
+      return lhs.inst == rhs.inst && lhs.index == rhs.index &&
+             lhs.dim == rhs.dim;
+    }
+  };
+
+  // Update the dynamic mapping so that we know dimension `dim` of instruction
+  // `inst` at `index` has a dynamic size, and its runtime size is represented
+  // by a scalar instruction `size`.
+  void SetDynamicSize(HloInstruction* inst, const ShapeIndex& index, int64 dim,
+                      HloInstruction* size) {
+    dynamic_mapping_.try_emplace(DynamicDimension{inst, index, dim}, size);
+    auto iter = per_hlo_dynamic_dimensions_.try_emplace(inst);
+    iter.first->second.emplace(DynamicDimension{inst, index, dim});
+  }
+
+  // AnalyzeDynamicDimensions starts the analysis of the dynamic dimensions in
+  // module_.
+  Status AnalyzeDynamicDimensions();
+
+  // HloModule being analyzed.
+  HloModule* module_;
+
+  // dynamic_mapping_ holds the result of the analysis. It maps a dynamic
+  // dimension to a scalar HloInstruction that represents the real dynamic size
+  // of the dynamic dimension.
+  using DynamicMapping = absl::flat_hash_map<DynamicDimension, HloInstruction*>;
+  DynamicMapping dynamic_mapping_;
+
+  using PerHloDynamicDimensions =
+      absl::flat_hash_map<HloInstruction*,
+                          absl::flat_hash_set<DynamicDimension>>;
+  PerHloDynamicDimensions per_hlo_dynamic_dimensions_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_DIMENSION_INFERENCE_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea9ebed45d99797ce4f80376ec3d0b758da3ca17
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
@@ -0,0 +1,535 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+class DynamicDimensionInferenceTest : public HloTestBase {
+ protected:
+  DynamicDimensionInferenceTest() : HloTestBase() {
+    module_ = CreateNewVerifiedModule();
+  }
+
+  Status RunInference() {
+    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
+    TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference,
+                        DynamicDimensionInference::Run(module_.get()));
+
+    inference_ = absl::make_unique<DynamicDimensionInference>(inference);
+    return Status::OK();
+  }
+
+  HloComputation* GetAdd() {
+    auto embedded_builder = HloComputation::Builder("add");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(
+        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+    return module_->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
+  std::unique_ptr<HloModule> module_;
+  std::unique_ptr<DynamicDimensionInference> inference_;
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(S32, {});
+};
+
+TEST_F(DynamicDimensionInferenceTest, ParamTest) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "param"));
+  auto param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "param"));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(param, {}, 1), param2);
+  EXPECT_EQ(inference_->GetDynamicSize(param, {}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(param2, {}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ParamTestTuple) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto param = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({input_shape, scalar_shape_}), "param"));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{0, {1}},
+      DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
+              op::GetTupleElement(param, 1));
+
+  EXPECT_EQ(inference_->GetDynamicSize(param, {0}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, GetTupleElement) {
+  // When data flows through GTE, the dynamic dimension size keeps the
+  // same, and the index has its front popped.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto param = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({input_shape, scalar_shape_}), "param"));
+
+  auto gte = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(input_shape, param, 0));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{0, {1}},
+      DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
+              op::GetTupleElement(param, 1));
+
+  EXPECT_THAT(inference_->GetDynamicSize(gte, {}, 1),
+              op::GetTupleElement(param, 1));
+
+  EXPECT_EQ(inference_->GetDynamicSize(param, {0}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ElementwiseTest) {
+  // When data flows through elementwise, the dynamic dimension size keeps the
+  // same.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto* negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(negate, {}, 1), size_param);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReduceTestI) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+      reduce_shape, negate, init, {0, 2}, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), size_param);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReduceTestII) {
+  // Same as ReduceTestI, but only reduce one dimension.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {1, 2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(
+      HloInstruction::CreateReduce(reduce_shape, negate, init, {1}, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, DotTest) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto xz_shape = ShapeUtil::MakeShape(F32, {xdim, zdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateDot(xz_shape, a_param, b_param, dot_dnums,
+                                HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding for non-contracting dimension.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{1, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 0), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 1), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ConvolutionTest) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto zx_shape = ShapeUtil::MakeShape(F32, {zdim, xdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(0);
+
+  dnums.set_kernel_input_feature_dimension(0);
+  dnums.set_kernel_output_feature_dimension(1);
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(1);
+  dnums.set_output_feature_dimension(0);
+
+  Window window;
+
+  auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      zx_shape, a_param, b_param, /*feature_group_count=*/1, window, dnums,
+      HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding for non-contracting dimension.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, TransposeTest) {
+  // Test the ability to trace unmodified dimensions
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 3});
+  auto output_shape = ShapeUtil::MakeShape(F32, {3, 2, 1});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param_1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+  auto* size_param_2 = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+  auto* size_param_3 = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/3, scalar_shape_, "size_param"));
+
+  auto* transpose = builder.AddInstruction(
+      HloInstruction::CreateTranspose(output_shape, a_param, {2, 1, 0}));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{3, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 0), size_param_3);
+  EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 1), size_param_2);
+  EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 2), size_param_1);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReshapeTest) {
+  // Test the ability to trace unmodified reshape dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6});
+  auto output_shape = ShapeUtil::MakeShape(F32, {6, 4, 1, 5, 2, 3});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto* reshape = builder.AddInstruction(
+      HloInstruction::CreateReshape(output_shape, a_param));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 3}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 2), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 3), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 4), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 5), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReshapeTestUnimplemented) {
+  // Test the ability to trace unmodified reshape dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6});
+  auto output_shape = ShapeUtil::MakeShape(F32, {6, 4, 1, 5, 2, 3});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  builder.AddInstruction(HloInstruction::CreateReshape(output_shape, a_param));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  Status status = RunInference();
+  EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
+}
+
+TEST_F(DynamicDimensionInferenceTest, BroadcastTest) {
+  // Test the ability to trace broadcast dimension.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2});
+  auto output_shape = ShapeUtil::MakeShape(F32, {3, 2, 4});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto* broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(output_shape, a_param, {1}));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 2), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReduceWindowBatchTest) {
+  // Test the ability to trace reduce window batch dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
+  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+
+  Window window;
+  // First dimension is unchanged.
+  WindowDimension* batch_dim = window.add_dimensions();
+  batch_dim->set_size(1);
+  batch_dim->set_stride(1);
+  batch_dim->set_padding_low(0);
+  batch_dim->set_padding_high(0);
+  batch_dim->set_window_dilation(1);
+  batch_dim->set_base_dilation(1);
+
+  // Second and third dimension are reduced.
+  for (int64 i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_size(2);
+    dim->set_stride(2);
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          output_shape, a_param, init, window, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
+}
+
+TEST_F(DynamicDimensionInferenceTest, SelectAndScatterTest) {
+  // Test the ability to trace select and scatter batch dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
+  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+
+  Window window;
+  // First dimension is unchanged.
+  WindowDimension* batch_dim = window.add_dimensions();
+  batch_dim->set_size(1);
+  batch_dim->set_stride(1);
+  batch_dim->set_padding_low(0);
+  batch_dim->set_padding_high(0);
+  batch_dim->set_window_dilation(1);
+  batch_dim->set_base_dilation(1);
+
+  // Second and third dimension are reduced.
+  for (int64 i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_size(2);
+    dim->set_stride(2);
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          output_shape, a_param, init, window, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8bfc8905064bcd7b68fe259fbcc1546ff083dbd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+
+Status DynamicParameterBinding::Bind(
+    const DynamicParameter& dynamic_parameter,
+    const DynamicDimension& dynamic_dimension) {
+  auto result = bindings_.emplace(dynamic_dimension, dynamic_parameter);
+  TF_RET_CHECK(result.second);
+  return Status::OK();
+}
+
+absl::optional<DynamicParameterBinding::DynamicParameter>
+DynamicParameterBinding::GetBinding(const DynamicDimension& dynamic_dimension) {
+  auto param_iter = bindings_.find(dynamic_dimension);
+  if (param_iter == bindings_.end()) {
+    return absl::nullopt;
+  }
+  return param_iter->second;
+}
+
+DynamicParameterBindingProto DynamicParameterBinding::ToProto() const {
+  DynamicParameterBindingProto result;
+  for (const auto& binding : bindings_) {
+    const DynamicDimension& dynamic_dimension = binding.first;
+    const DynamicParameter& dynamic_param = binding.second;
+    DynamicParameterBindingProto::Binding binding_proto;
+    binding_proto.set_dynamic_param_num(dynamic_param.parameter_num);
+    for (int64 i : dynamic_param.parameter_index) {
+      binding_proto.add_dynamic_param_index(i);
+    }
+
+    binding_proto.set_target_param_num(dynamic_dimension.parameter_num);
+
+    for (int64 i : dynamic_dimension.parameter_index) {
+      binding_proto.add_target_param_index(i);
+    }
+
+    binding_proto.set_target_param_dim_num(dynamic_dimension.dimension);
+    result.add_entries()->Swap(&binding_proto);
+  }
+  return result;
+}
+
+StatusOr<DynamicParameterBinding> DynamicParameterBinding::CreateFromProto(
+    const DynamicParameterBindingProto& proto) {
+  DynamicParameterBinding result;
+  for (const DynamicParameterBindingProto::Binding& binding : proto.entries()) {
+    int64 dynamic_param_num = binding.dynamic_param_num();
+    ShapeIndex dynamic_param_index(binding.dynamic_param_index().begin(),
+                                   binding.dynamic_param_index().end());
+    int64 target_param_num = binding.target_param_num();
+    ShapeIndex target_param_index(binding.target_param_index().begin(),
+                                  binding.target_param_index().end());
+    int64 target_dim_num = binding.target_param_num();
+
+    TF_RETURN_IF_ERROR(
+        result.Bind(DynamicParameter{dynamic_param_num, dynamic_param_index},
+                    DynamicDimension{target_param_num, target_param_index,
+                                     target_dim_num}));
+  }
+
+  return result;
+}
+
+string DynamicParameterBinding::ToString() const {
+  std::vector<string> pieces;
+  pieces.push_back("DynamicParameterBinding: ");
+  for (const auto& binding : bindings_) {
+    const DynamicDimension& dynamic_dimension = binding.first;
+    const DynamicParameter& dynamic_param = binding.second;
+    pieces.push_back(absl::StrFormat(
+        " -- Input param number %lld at %s has dim %lld as dynamic"
+        " dimension, which is represented by param number %lld at "
+        "%s",
+        dynamic_dimension.parameter_num,
+        dynamic_dimension.parameter_index.ToString(),
+        dynamic_dimension.dimension, dynamic_param.parameter_num,
+        dynamic_param.parameter_index.ToString()));
+  }
+  return absl::StrJoin(pieces, "\n");
+}
+
+Status DynamicParameterBinding::ForEachBinding(BindingFn fn) const {
+  for (const auto& binding : bindings_) {
+    TF_RETURN_IF_ERROR(fn(binding.second, binding.first));
+  }
+  return Status::OK();
+}
+
+Status DynamicParameterBinding::Verify(const HloModule& module) const {
+  const HloComputation* entry = module.entry_computation();
+  return ForEachBinding([&](const DynamicParameter& dynamic_parameter,
+                            const DynamicDimension& dynamic_dimension)
+                            -> Status {
+    TF_RET_CHECK(dynamic_parameter.parameter_num < entry->num_parameters());
+    TF_RET_CHECK(dynamic_dimension.parameter_num < entry->num_parameters());
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(
+        entry->parameter_instruction(dynamic_parameter.parameter_num)->shape(),
+        dynamic_parameter.parameter_index));
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(
+        entry->parameter_instruction(dynamic_dimension.parameter_num)->shape(),
+        dynamic_dimension.parameter_index));
+    TF_RET_CHECK(
+        dynamic_dimension.dimension <
+        ShapeUtil::Rank(ShapeUtil::GetSubshape(
+            entry->parameter_instruction(dynamic_dimension.parameter_num)
+                ->shape(),
+            dynamic_dimension.parameter_index)));
+    return Status::OK();
+  });
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const DynamicParameterBinding& binding) {
+  out << binding.ToString();
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd474d8eed1b2c30ddb8f624a864198c74eacaba
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_
+
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+class HloModule;
+// We currently use an explicit API that takes an extra parameter to indicate
+// the runtime size of a dynamic dimension. DynamicParameterBinding indicates
+// the relationship between parameter: We can have a dynamic parameter that
+// points to another target parameter to indicate that the target parameter is
+// dynamic.
+//
+//
+// TODO(b/119520625): Remove this API once we have more dynamic shape infra
+// ready.
+class DynamicParameterBinding {
+ public:
+  // DynamicParameter represents a special parameter that is used to represent
+  // the runtime size of a dimension of another parameter. A dynamic parameter
+  // has to be a scalar value.
+  struct DynamicParameter {
+    // The parameter number of dynamic parameter.
+    int64 parameter_num;
+    // The index of the parameter.
+    ShapeIndex parameter_index;
+  };
+
+  // DynamicDimension represents a dimension whose size is determined at
+  // runtime. A DynamicDimension's runtime size is determined by the binded
+  // DynamicParameter using `DynamicParameterBinding::Bind` method.
+  struct DynamicDimension {
+    // The parameter number of dynamic dimension.
+    int64 parameter_num;
+    // The subshape index of the parameter.
+    ShapeIndex parameter_index;
+    // The dimension number in the subshape.
+    int64 dimension;
+
+    // "friend" keyword are added so these functions can be found by ADL.
+    template <typename H>
+    friend H AbslHashValue(H h, const DynamicDimension& m) {
+      return H::combine(std::move(h), m.parameter_num, m.parameter_index,
+                        m.dimension);
+    }
+
+    friend bool operator==(const DynamicDimension& lhs,
+                           const DynamicDimension& rhs) {
+      return lhs.parameter_num == rhs.parameter_num &&
+             lhs.parameter_index == rhs.parameter_index &&
+             lhs.dimension == rhs.dimension;
+    }
+  };
+
+  DynamicParameterBinding() = default;
+
+  virtual ~DynamicParameterBinding() = default;
+
+  // Adds binding which indicates that the dimension indicated by
+  // `dynamic_dimension` is dynamic, and its runtime size is represented by
+  // `dynamic_parameter`.
+  Status Bind(const DynamicParameter& dynamic_parameter,
+              const DynamicDimension& dynamic_dimension);
+
+  // Returns the parameter and the index representing the runtime size of
+  // dimension `dim_num` of parameter `param_num` at `param_index`.
+  //
+  // Returns nullopt if the binding is not set.
+  absl::optional<DynamicParameter> GetBinding(
+      const DynamicDimension& dynamic_dimension);
+
+  using BindingFn =
+      std::function<Status(const DynamicParameter& dynamic_parameter,
+                           const DynamicDimension& dynamic_dimension)>;
+
+  // Iterate through each binding.
+  Status ForEachBinding(BindingFn fn) const;
+
+  DynamicParameterBindingProto ToProto() const;
+
+  static StatusOr<DynamicParameterBinding> CreateFromProto(
+      const DynamicParameterBindingProto& proto);
+
+  string ToString() const;
+
+  // Verifies that the given binding is valid for the given module.
+  // Specifically, the binding's parameter and parameter size should be valid.
+  Status Verify(const HloModule& module) const;
+
+ private:
+  // Keeps track of mappings from DynamicDimension to DynamicParameter. The
+  // direction of is chosen so that we can easily query if a dimension is
+  // dynamic and which dynamic parameter represents the real size of that
+  // dimension.
+  absl::flat_hash_map<DynamicDimension, DynamicParameter> bindings_;
+};
+
+std::ostream& operator<<(std::ostream& out,
+                         const DynamicParameterBinding& binding);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..83a6d83dffde7995bd8e43917d13c5fd2705ba6f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
@@ -0,0 +1,153 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
+
+#include <memory>
+#include <string>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+class DynamicParameterBindingTest : public HloTestBase {};
+
+TEST_F(DynamicParameterBindingTest, SimpleBinding) {
+  // 'b' is a dynamic shape; 'a' represents the real size of b's first
+  // dimension.
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[10] parameter(1)
+  ROOT root = (f32[], f32[10]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  DynamicParameterBinding binding;
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {}},
+                   DynamicParameterBinding::DynamicDimension{1, {}, 0}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1,
+                                                    /*parameter_index=*/{},
+                                                    /*dimension=*/0});
+  EXPECT_TRUE(param);
+  EXPECT_EQ(param->parameter_num, 0);
+  EXPECT_EQ(param->parameter_index, ShapeIndex({}));
+  TF_EXPECT_OK(binding.Verify(*module));
+}
+
+TEST_F(DynamicParameterBindingTest, TupleBinding) {
+  // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's first
+  // dimension.
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  param = (f32[], f32[10]) parameter(0)
+  gte1 = f32[] get-tuple-element(%param), index=0
+  gte2 = f32[10] get-tuple-element(%param), index=1
+  ROOT root = (f32[], f32[10]) tuple(%gte1, %gte2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  DynamicParameterBinding binding;
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
+                   DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                    /*parameter_index=*/{1},
+                                                    /*dimension=*/0});
+
+  EXPECT_TRUE(param);
+  EXPECT_EQ(param->parameter_num, 0);
+  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+  TF_EXPECT_OK(binding.Verify(*module));
+}
+
+TEST_F(DynamicParameterBindingTest, TupleBindingWithMultiDimension) {
+  // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's both
+  // dimensions.
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  param = (f32[], f32[10, 10]) parameter(0)
+  gte1 = f32[] get-tuple-element(%param), index=0
+  gte2 = f32[10, 10] get-tuple-element(%param), index=1
+  ROOT root = (f32[], f32[10, 10]) tuple(%gte1, %gte2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  DynamicParameterBinding binding;
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
+                   DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
+                   DynamicParameterBinding::DynamicDimension{0, {1}, 1}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                    /*parameter_index=*/{1},
+                                                    /*dimension=*/0});
+
+  EXPECT_TRUE(param);
+  EXPECT_EQ(param->parameter_num, 0);
+  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param2 =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                    /*parameter_index=*/{1},
+                                                    /*dimension=*/0});
+  EXPECT_TRUE(param2);
+  EXPECT_EQ(param2->parameter_num, 0);
+  EXPECT_EQ(param2->parameter_index, ShapeIndex({0}));
+
+  TF_EXPECT_OK(binding.Verify(*module));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 515267edd7caf42e04ebe638b99006db8967ea30..6f1f95f2e9082649b6ca9cc0da5c238e15b77c10 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Instructions.h"
@@ -1671,26 +1672,66 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalConcatenate(
 
   b_->SetInsertPoint(init_block);
 
+  // Assign a unique id for each *different* operand, and count how often each
+  // operand is used. If all operands are different, the usage count will be 1
+  // for each operand.
+  absl::flat_hash_map<const HloInstruction*, int64> to_unique_operand_id;
+  std::vector<int64> operand_usage_count;
+  for (const auto* operand : hlo->operands()) {
+    if (to_unique_operand_id.contains(operand)) {
+      ++operand_usage_count[to_unique_operand_id[operand]];
+    } else {
+      int64 unique_operand_id = to_unique_operand_id.size();
+      to_unique_operand_id[operand] = unique_operand_id;
+      operand_usage_count.push_back(1);
+    }
+  }
+
+  // To avoid that we emit the same operand more than once, we create one basic
+  // block for each *different* operand with a PHI node for the different source
+  // index inputs.
+  std::vector<llvm::BasicBlock*> emit_operand_blocks(
+      to_unique_operand_id.size(), nullptr);
+  std::vector<llvm::PHINode*> source_index_phis(to_unique_operand_id.size(),
+                                                nullptr);
+  for (const auto* operand : hlo->operands()) {
+    int64 operand_id = to_unique_operand_id[operand];
+    if (emit_operand_blocks[operand_id] != nullptr) {
+      continue;
+    }
+
+    emit_operand_blocks[operand_id] = llvm_ir::CreateBasicBlock(
+        exit_block, StrCat("concat_index_from_operand_id", operand_id), b_);
+    auto saved_insert_point = b_->GetInsertPoint();
+    llvm_ir::SetToFirstInsertPoint(emit_operand_blocks[operand_id], b_);
+    source_index_phis[operand_id] =
+        PHI(source_index.GetType(), operand_usage_count[operand_id]);
+    auto operand_index = source_index;
+    operand_index[concat_dim] = source_index_phis[operand_id];
+
+    // Create the terminator of the block before calling operand generators,
+    // because they require non-degenerate basic blocks.
+    b_->SetInsertPoint(llvm::BranchInst::Create(
+        exit_block, /*InsertAtEnd=*/emit_operand_blocks[operand_id]));
+    TF_ASSIGN_OR_RETURN(llvm::Value * value,
+                        operand_to_generator.at(operand)(operand_index));
+    output->addIncoming(value, b_->GetInsertBlock());
+    b_->SetInsertPoint(init_block, saved_insert_point);
+  }
+
   for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
        ++operand_idx) {
     const HloInstruction* operand = hlo->operand(operand_idx);
-    auto true_block = llvm_ir::CreateBasicBlock(
-        exit_block, StrCat("concat_index_from_operand", operand_idx), b_);
     auto false_block = llvm_ir::CreateBasicBlock(
         exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_);
     auto concat_dim_size =
         llvm::ConstantInt::get(source_index[concat_dim]->getType(),
                                operand->shape().dimensions(concat_dim));
-    CondBr(ICmpULT(source_index[concat_dim], concat_dim_size), true_block,
-           false_block);
-
-    // Create the terminator of the true block before calling operand
-    // generators, because they require non-degenerate basic blocks.
-    b_->SetInsertPoint(
-        llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block));
-    TF_ASSIGN_OR_RETURN(llvm::Value * value,
-                        operand_to_generator.at(operand)(source_index));
-    output->addIncoming(value, b_->GetInsertBlock());
+    int64 operand_id = to_unique_operand_id[operand];
+    source_index_phis[operand_id]->addIncoming(source_index[concat_dim],
+                                               b_->GetInsertBlock());
+    CondBr(ICmpULT(source_index[concat_dim], concat_dim_size),
+           emit_operand_blocks[operand_id], false_block);
 
     // Subtract the size of the concat dimension of the current operand
     // from the source index.
@@ -1815,8 +1856,6 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
     // Clamp the gather index so that the gather region fits in the operand.
     // gather_dim_component_extended_inbound =
     //     clamp(gather_dim_component_extended, 0, largest_valid_start_index);
-
-    // TODO(b/111078873): This is implementation defined behavior.
     bool is_signed = ShapeUtil::ElementIsSigned(indices_shape);
     auto gather_dim_component_extended_inbound = EmitIntegralMin(
         index.GetConstantWithIndexType(largest_valid_start_index),
@@ -2206,13 +2245,15 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
                 : iota->shape();
         PrimitiveType component_element_type = component_shape.element_type();
         llvm::Value* iota_result;
-        if (ShapeUtil::ElementIsIntegral(component_shape)) {
+        if (primitive_util::IsIntegralType(component_element_type) ||
+            component_element_type == PRED) {
           iota_result = b_->CreateIntCast(
               elem_index_linear,
               llvm_ir::PrimitiveTypeToIrType(component_element_type, module_),
               /*isSigned=*/false);
         } else {
-          TF_RET_CHECK(ShapeUtil::ElementIsFloating(component_shape))
+          TF_RET_CHECK(
+              primitive_util::IsFloatingPointType(component_element_type))
               << component_element_type;
           llvm::Type* float_ir_type;
           if (component_element_type == BF16) {
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 47c56e2f7fbd9f53be6a2b189c5c36cf4fdcdccb..10b8c01ff1383658fcfb2271c177ba54347f985a 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "absl/strings/str_format.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 3a6780f2a67f230cae626ea00cfbf93b4e60d968..b34bca55a48b113c325dbf28c03f7a0f5b71f658 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 #include "absl/types/span.h"
 #include "absl/types/variant.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
@@ -61,7 +61,7 @@ struct ExecutionOutput {
 class Executable {
  public:
   explicit Executable(
-      std::unique_ptr<const HloModule> hlo_module,
+      std::unique_ptr<HloModule> hlo_module,
       std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
       : hlo_module_(std::move(hlo_module)),
@@ -162,7 +162,7 @@ class Executable {
     return hlo_profile_printer_data_ != nullptr;
   }
 
-  const HloModule& module() const { return *hlo_module_; }
+  HloModule& module() const { return *hlo_module_; }
 
   const bool has_module() const { return hlo_module_ != nullptr; }
 
@@ -199,7 +199,7 @@ class Executable {
   // HloModule this was compiled from. BufferAssignment keeps pointers to
   // HloInstructions owned by the HloModule so we need to keep the HloModule
   // around.
-  const std::unique_ptr<const HloModule> hlo_module_;
+  const std::unique_ptr<HloModule> hlo_module_;
 
   // HloSnapshot this was compiled from. Null if not dumping executions.
   std::unique_ptr<HloSnapshot> hlo_snapshot_;
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
index 5fbd73a5363b4cdbcaafedbe6f4e7bd6bb2a92d8..8eeb930b48165a2e3c622581e05cb5f7063fa1fa 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -30,7 +30,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class FlattenCallGraphTest : public HloVerifiedTestBase {
+class FlattenCallGraphTest : public HloTestBase {
  protected:
   // Build and return a trivial computation taking and returning a scalar.
   std::unique_ptr<HloComputation> MakeScalarComputation() {
@@ -108,7 +108,7 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) {
   //    c
   //
   // Calls are made via kCall, kWhile, and kMap instructions.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* cond_computation =
       module->AddEmbeddedComputation(MakeConditionComputation());
   HloComputation* c_computation =
@@ -139,9 +139,9 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) {
   }
 
   {
-    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
     EXPECT_TRUE(result);
-    std::unique_ptr<CallGraph> flat_call_graph = CallGraph::Build(module);
+    std::unique_ptr<CallGraph> flat_call_graph = CallGraph::Build(module.get());
     const CallGraphNode& c_node = flat_call_graph->GetNode(c_computation);
     EXPECT_EQ(1, c_node.caller_callsites().size());
   }
@@ -149,7 +149,7 @@ TEST_F(FlattenCallGraphTest, ComplexGraph) {
 
 // Test corner case of a computation used as a body and a loop condition.
 TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* cond_computation;
   {
     HloComputation::Builder builder(TestName() + ".cond");
@@ -176,15 +176,15 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
   }
 
   {
-    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
     const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
     EXPECT_EQ(2, cond_node.caller_callsites().size());
   }
 
   {
-    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
     EXPECT_TRUE(result);
-    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+    std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
     const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
     EXPECT_EQ(1, cond_node.caller_callsites().size());
   }
@@ -201,7 +201,7 @@ TEST_F(FlattenCallGraphTest, SharedWhileConditionAndBody) {
 //     C
 //
 TEST_F(FlattenCallGraphTest, FlattenCalls) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* c_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
 
@@ -211,9 +211,9 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) {
   module->AddEntryComputation(
       MakeCallingComputation(b_computation, /*callsites=*/2, ".Entry"));
 
-  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
   EXPECT_TRUE(result);
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   EXPECT_EQ(7, module->computation_count());
 
   const CallGraphNode& c_node = call_graph->GetNode(c_computation);
@@ -224,7 +224,7 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) {
 }
 
 TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* sub_computation =
       module->AddEmbeddedComputation(MakeScalarComputation());
 
@@ -243,9 +243,9 @@ TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) {
   module->AddEntryComputation(builder.Build());
   EXPECT_EQ(2, module->computation_count());
 
-  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
   EXPECT_TRUE(result);
-  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
   // The true and false computations must now be different.
   EXPECT_EQ(3, module->computation_count());
 
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index cb86c9857936f21d9d2ac6bc22c725b89cca6482..01cef499665c050d4453382289168276028e1d26 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -346,7 +346,8 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
           [&](HloInstruction* indvar,
               const std::vector<HloInstruction*>& loop_state) {
             return GatherLoopBody(*gather_instr, indvar, loop_state);
-          });
+          },
+          gather_instr->metadata());
 
   TF_ASSIGN_OR_RETURN(std::vector<HloInstruction*> gather_loop_result,
                       gather_loop_result_or_error);
diff --git a/tensorflow/compiler/xla/service/gather_expander.h b/tensorflow/compiler/xla/service/gather_expander.h
index 2b39359aae9fc01f1a88a2594108b2772788e826..8af9c6b71fbc391bf7c0e9809e979b65135a6df3 100644
--- a/tensorflow/compiler/xla/service/gather_expander.h
+++ b/tensorflow/compiler/xla/service/gather_expander.h
@@ -28,7 +28,7 @@ class GatherExpander : public HloModulePass {
   absl::string_view name() const override { return "gather_expander"; }
   StatusOr<bool> Run(HloModule* module) override;
 
- private:
+ protected:
   StatusOr<HloInstruction*> ExpandGather(HloInstruction* gather_instr);
 };
 
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
index 141dd4d6f10272ce749edc4e91153c365ed322e6..a3102368cb1dba15da7422337666d278cef775ab 100644
--- a/tensorflow/compiler/xla/service/gather_expander_test.cc
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -104,5 +104,44 @@ ENTRY main {
       ShapeUtil::MakeShape(S32, {2, 3}),
       ShapeUtil::GetTupleElementShape(while_shape, 3)));
 }
+
+TEST(GatherExpanderTest, CheckOpMetadata) {
+  const string hlo_text = R"(
+HloModule TensorFlowGatherV2
+
+ENTRY main {
+  operand = s32[3,3] parameter(0)
+  indices = s32[2] parameter(1)
+  ROOT gather = s32[3,2] gather(operand, indices),
+      offset_dims={0},
+      collapsed_slice_dims={1},
+      start_index_map={1},
+      index_vector_dim=1,
+      slice_sizes={3, 1}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_text));
+  OpMetadata metadata;
+  metadata.set_op_name("Gather");
+  module->entry_computation()->root_instruction()->set_metadata(metadata);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  HloInstruction* while_instr = nullptr;
+  for (auto* instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) {
+      ASSERT_EQ(while_instr, nullptr)
+          << "Expected exactly one while instruction in the entry computation "
+             "after gather expansion";
+      while_instr = instr;
+    }
+  }
+
+  ASSERT_NE(while_instr, nullptr)
+      << "Expected exactly one while instruction in the entry computation "
+         "after gather expansion";
+  EXPECT_EQ(while_instr->metadata().op_name(), "Gather");
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 522e9f5948da2206f144ede4fdd95350474146d9..bfd1b6cb1492f5cb709e2ecefe73782094e26f5e 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -25,6 +25,10 @@ filegroup(
 )
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
 
 xla_proto_library(
     name = "backend_configs",
@@ -107,7 +111,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -154,7 +157,7 @@ cc_library(
     deps = [
         ":backend_configs",
         ":buffer_allocations",
-        ":cudnn_convolution_runner",
+        ":cudnn_conv_runner",
         ":elemental_ir_emitter",
         ":gpu_constants",
         ":gpu_executable",
@@ -323,7 +326,7 @@ cc_library(
     ],
     deps = [
         ":buffer_allocations",
-        ":cudnn_convolution_runner",
+        ":cudnn_conv_runner",
         ":hlo_execution_profiler",
         ":infeed_manager",
         ":ir_emission_utils",
@@ -385,13 +388,13 @@ cc_library(
 )
 
 cc_library(
-    name = "cudnn_convolution_algorithm_picker",
-    srcs = ["cudnn_convolution_algorithm_picker.cc"],
-    hdrs = ["cudnn_convolution_algorithm_picker.h"],
+    name = "cudnn_conv_algorithm_picker",
+    srcs = ["cudnn_conv_algorithm_picker.cc"],
+    hdrs = ["cudnn_conv_algorithm_picker.h"],
     deps = [
         ":backend_configs",
         ":buffer_comparator",
-        ":cudnn_convolution_runner",
+        ":cudnn_conv_runner",
         ":gpu_executable",
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:literal_util",
@@ -404,14 +407,15 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:optional",
     ],
 )
 
 cc_library(
-    name = "cudnn_convolution_runner",
-    srcs = ["cudnn_convolution_runner.cc"],
-    hdrs = ["cudnn_convolution_runner.h"],
+    name = "cudnn_conv_runner",
+    srcs = ["cudnn_conv_runner.cc"],
+    hdrs = ["cudnn_conv_runner.h"],
     deps = [
         ":backend_configs",
         ":ir_emission_utils",
@@ -431,9 +435,9 @@ cc_library(
 )
 
 cc_library(
-    name = "cudnn_convolution_rewriter",
-    srcs = ["cudnn_convolution_rewriter.cc"],
-    hdrs = ["cudnn_convolution_rewriter.h"],
+    name = "cudnn_conv_rewriter",
+    srcs = ["cudnn_conv_rewriter.cc"],
+    hdrs = ["cudnn_conv_rewriter.h"],
     deps = [
         ":backend_configs",
         ":ir_emission_utils",
@@ -448,17 +452,17 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "cudnn_convolution_rewriter_test",
-    srcs = ["cudnn_convolution_rewriter_test.cc"],
+    name = "cudnn_conv_rewriter_test",
+    srcs = ["cudnn_conv_rewriter_test.cc"],
     deps = [
-        ":cudnn_convolution_rewriter",
+        ":cudnn_conv_rewriter",
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/service:shape_inference",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:test",
     ],
@@ -580,9 +584,9 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "pad_insertion",
-    srcs = ["pad_insertion.cc"],
-    hdrs = ["pad_insertion.h"],
+    name = "cudnn_conv_padding_legalization",
+    srcs = ["cudnn_conv_padding_legalization.cc"],
+    hdrs = ["cudnn_conv_padding_legalization.h"],
     deps = [
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:literal",
@@ -590,6 +594,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_creation_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:shape_inference",
@@ -598,9 +603,9 @@ cc_library(
 )
 
 cc_library(
-    name = "pad_for_tensor_cores",
-    srcs = ["pad_for_tensor_cores.cc"],
-    hdrs = ["pad_for_tensor_cores.h"],
+    name = "cudnn_conv_pad_for_tensor_cores",
+    srcs = ["cudnn_conv_pad_for_tensor_cores.cc"],
+    hdrs = ["cudnn_conv_pad_for_tensor_cores.h"],
     deps = [
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:literal_util",
@@ -612,16 +617,16 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "pad_for_tensor_cores_test",
-    srcs = ["pad_for_tensor_cores_test.cc"],
+    name = "cudnn_conv_pad_for_tensor_cores_test",
+    srcs = ["cudnn_conv_pad_for_tensor_cores_test.cc"],
     deps = [
+        ":cudnn_conv_pad_for_tensor_cores",
         ":ir_emission_utils",
-        ":pad_for_tensor_cores",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # build_cleaner: keep
     ],
 )
@@ -658,9 +663,11 @@ cc_library(
     srcs = ["nvptx_compiler.cc"],
     hdrs = ["nvptx_compiler.h"],
     deps = [
-        ":cudnn_convolution_algorithm_picker",
-        ":cudnn_convolution_rewriter",
-        ":cudnn_fused_convolution_rewriter",
+        ":cudnn_conv_algorithm_picker",
+        ":cudnn_conv_pad_for_tensor_cores",
+        ":cudnn_conv_padding_legalization",
+        ":cudnn_conv_rewriter",
+        ":cudnn_fused_conv_rewriter",
         ":fusion_merger",
         ":gpu_constants",
         ":gpu_copy_insertion",
@@ -672,11 +679,10 @@ cc_library(
         ":ir_emission_utils",
         ":ir_emitter",
         ":multi_output_fusion",
-        ":pad_for_tensor_cores",
-        ":pad_insertion",
         ":partition_assignment",
         ":stream_assignment",
         ":stream_executor_util",
+        ":variadic_op_splitter",
         "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -695,6 +701,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_cse",
         "//tensorflow/compiler/xla/service:hlo_dce",
         "//tensorflow/compiler/xla/service:hlo_element_type_converter",
+        "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -704,7 +711,6 @@ cc_library(
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
-        "//tensorflow/compiler/xla/service:scatter_expander",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
@@ -780,7 +786,6 @@ cc_library(
     srcs = ["gpu_layout_assignment.cc"],
     hdrs = ["gpu_layout_assignment.h"],
     deps = [
-        ":gpu_options",
         ":ir_emission_utils",
         ":stream_executor_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -844,7 +849,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/memory",
@@ -881,16 +885,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "gpu_options",
-    srcs = ["gpu_options.cc"],
-    hdrs = ["gpu_options.h"],
-    deps = [
-        "//tensorflow/compiler/xla/service:hlo_module_config",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 cc_library(
     name = "stream_executor_util",
     srcs = ["stream_executor_util.cc"],
@@ -914,7 +908,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -976,9 +969,9 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "cudnn_fused_convolution_rewriter",
-    srcs = ["cudnn_fused_convolution_rewriter.cc"],
-    hdrs = ["cudnn_fused_convolution_rewriter.h"],
+    name = "cudnn_fused_conv_rewriter",
+    srcs = ["cudnn_fused_conv_rewriter.cc"],
+    hdrs = ["cudnn_fused_conv_rewriter.h"],
     deps = [
         ":backend_configs",
         ":ir_emission_utils",
@@ -990,3 +983,57 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
     ],
 )
+
+tf_cc_test(
+    name = "cudnn_fused_conv_rewriter_test",
+    srcs = ["cudnn_fused_conv_rewriter_test.cc"],
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
+        "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "variadic_op_splitter",
+    srcs = ["variadic_op_splitter.cc"],
+    hdrs = ["variadic_op_splitter.h"],
+    deps = [
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_casting_utils",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "variadic_op_splitter_test",
+    srcs = ["variadic_op_splitter_test.cc"],
+    deps = [
+        ":ir_emission_utils",
+        ":variadic_op_splitter",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/service:pattern_matcher",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 4effea637d01bf23b54d341b77306b20b1b133c8..e1dffad3045808c4f316ccafdda39a174e1560c8 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -56,9 +56,9 @@ Status ConvolutionThunk::ExecuteOnStream(
       buffer_allocations.GetDeviceAddress(scratch_buffer_);
 
   auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction());
-  TF_RETURN_IF_ERROR(RunCudnnConvolution(cudnn_call_,
-                                         absl::MakeSpan(operand_se_buffers),
-                                         result_buffer, scratch, stream));
+  TF_RETURN_IF_ERROR(RunCudnnConv(cudnn_call_,
+                                  absl::MakeSpan(operand_se_buffers),
+                                  result_buffer, scratch, stream));
 
   void* ptrs[] = {result_buffer.opaque(), scratch.opaque()};
   se::DeviceMemory<void*> tuple_addr(
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index f53bc541983378819dba36489dd69c348f50af32..c71515490c94ef54baad9005509d1813de630159 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d6780fa1c7b0c636eb771c40e74f074cd8c4c4b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -0,0 +1,407 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
+#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+using absl::optional;
+using se::DeviceMemoryBase;
+using se::dnn::AlgorithmConfig;
+using se::dnn::AlgorithmDesc;
+
+class ScratchAllocator : public se::ScratchAllocator {
+ public:
+  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
+      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
+
+  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
+    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
+  }
+  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
+
+  StatusOr<se::DeviceMemory<uint8>> AllocateBytes(se::Stream* stream,
+                                                  int64 byte_size) override;
+
+ private:
+  const int device_ordinal_;
+  DeviceMemoryAllocator* memory_allocator_;
+  std::vector<OwningDeviceMemory> allocated_buffers_;
+  int64 total_allocated_bytes_ = 0;
+};
+
+StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
+    se::Stream* stream, int64 byte_size) {
+  CHECK_GE(byte_size, 0) << "byte_size must be positive.";
+  if (byte_size > GetMemoryLimitInBytes(stream)) {
+    return se::port::Status(
+        se::port::error::RESOURCE_EXHAUSTED,
+        absl::StrFormat(
+            "Allocating %d bytes exceeds the memory limit of %d bytes.",
+            byte_size, GetMemoryLimitInBytes(stream)));
+  }
+
+  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
+                      memory_allocator_->Allocate(device_ordinal_, byte_size,
+                                                  /*retry_on_failure=*/false));
+  total_allocated_bytes_ += byte_size;
+
+  se::DeviceMemoryBase buffer_addr = allocated_buffer.AsDeviceMemoryBase();
+  allocated_buffers_.push_back(std::move(allocated_buffer));
+  return se::DeviceMemory<uint8>(buffer_addr);
+}
+
+std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
+                                         se::StreamExecutor* stream_exec) {
+  std::vector<AlgorithmDesc> algorithms;
+  bool succ = false;
+  switch (kind) {
+    case CudnnConvKind::kBackwardFilter:
+      succ =
+          stream_exec->GetConvolveBackwardFilterAlgorithms(true, &algorithms);
+      break;
+    case CudnnConvKind::kBackwardInput:
+      succ = stream_exec->GetConvolveBackwardDataAlgorithms(true, &algorithms);
+      break;
+    case CudnnConvKind::kForward:
+    case CudnnConvKind::kForwardActivation:
+      succ = stream_exec->GetConvolveAlgorithms(true, &algorithms);
+      break;
+  }
+  DCHECK(succ);
+
+  return algorithms;
+}
+
+string AlgorithmToString(const AlgorithmDesc& algo) {
+  if (algo.tensor_ops_enabled()) {
+    return absl::StrCat(algo.algo_id(), "+TC");
+  }
+  return absl::StrCat(algo.algo_id());
+}
+
+string NumBytesToString(int64 bytes) {
+  return absl::StrCat(tensorflow::strings::HumanReadableNumBytes(bytes), " (",
+                      bytes, "B)");
+}
+
+// Acquires a process-global lock on the device pointed to by the given
+// StreamExecutor.
+//
+// This is used to prevent other XLA instances from trying to autotune on this
+// device while we're using it.
+tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  // se::Platform*s are global singletons guaranteed to live forever.
+  static auto* mutexes =
+      new std::map<std::pair<const se::Platform*, /*device_ordinal*/ int64>,
+                   tensorflow::mutex>();
+
+  tensorflow::mutex_lock global_lock(mu);
+  auto it = mutexes
+                ->emplace(std::piecewise_construct,
+                          std::make_tuple(stream_exec->platform(),
+                                          stream_exec->device_ordinal()),
+                          std::make_tuple())
+                .first;
+  return tensorflow::mutex_lock{it->second};
+}
+
+}  // anonymous namespace
+
+// We could have caching here so that we don't redo this work for two identical
+// convolutions.  Unfortunately our cache key would have to be a tuple
+// containing the protos passed to this function, and we have no utility for
+// hashing protos.  We could write our own hash functions, but they'd silently
+// break if we ever added a field to one of the protos.  Perhaps we could hack
+// using the binary-encoded proto as the hash key, on the assumption that two
+// protos being binary-equal is a sufficient, if not necessary, condition for
+// proper equality.  But that would still leave us open to having unnecessary
+// cache misses and doing extra work.  Overall, caching doesn't seem worth the
+// trouble, but we may want to revisit this if we ever find a model where
+// caching would speed up compilation a lot.
+StatusOr<CudnnConvAlgorithmPicker::AutotuneResult>
+CudnnConvAlgorithmPicker::PickBestAlgorithm(HloCustomCallInstruction* instr) {
+  // TODO(timshen): for now only check fp16. It can be expanded to other types,
+  // with some work on the HLO routines.
+  const bool cross_check_enabled =
+      instr->shape().tuple_shapes(0).element_type() == xla::F16;
+
+  // Don't run this function concurrently on the same GPU.
+  //
+  // This is a bit of a hack and doesn't protect us against arbitrary concurrent
+  // use of a GPU, but it's sufficient to let us compile two HLO modules
+  // concurrently and then run them sequentially.
+  tensorflow::mutex_lock lock = LockGpu(stream_exec_);
+
+  // Make sure any previous activity on this executor is done. We don't want to
+  // interfere with programs that are still running on the GPU.
+  if (!stream_exec_->SynchronizeAllActivity()) {
+    return InternalError("Failed to synchronize GPU for autotuning.");
+  }
+
+  // Create a stream for us to do our work on.
+  se::Stream stream{stream_exec_};
+  stream.Init();
+  const auto device_ordinal = stream_exec_->device_ordinal();
+
+  // allocator either points to this->allocator_ or, if that's null, to a
+  // StreamExecutorMemoryAllocator for stream_exec_.
+  DeviceMemoryAllocator* allocator;
+  optional<StreamExecutorMemoryAllocator> se_allocator;
+  if (allocator_ != nullptr) {
+    allocator = allocator_;
+  } else {
+    se_allocator.emplace(stream_exec_->platform(),
+                         absl::Span<se::StreamExecutor* const>({stream_exec_}));
+    allocator = &*se_allocator;
+  }
+
+  const auto initialize_buffer = [&stream, cross_check_enabled](
+                                     DeviceMemoryBase buffer) {
+    if (cross_check_enabled) {
+      // Broadcast a constant to the buffer, instead of zeroing the buffer. A
+      // non-zero constant is useful for the cross checking, because zero-inputs
+      // may not always reveal the bugs.
+      CHECK_EQ(0, (uintptr_t)buffer.opaque() % 4);
+      size_t left_over_bytes = buffer.size() % 4;
+      CHECK_EQ(0, left_over_bytes % 2);
+
+      constexpr float kBroadcastedConstant = 0.1f;
+      static const Eigen::half halfs[2] = {Eigen::half(kBroadcastedConstant),
+                                           Eigen::half(kBroadcastedConstant)};
+      uint32 bits;
+      static_assert(sizeof(bits) == sizeof(halfs), "");
+      memcpy(&bits, halfs, sizeof(bits));
+
+      size_t aligned_size = buffer.size() / 4 * 4;
+      stream.ThenMemset32(&buffer, bits, aligned_size);
+
+      DeviceMemoryBase left_over(
+          static_cast<char*>(buffer.opaque()) + aligned_size, left_over_bytes);
+      stream.ThenMemcpy(&left_over, halfs, left_over_bytes);
+    } else {
+      // Although we don't have evidence this matters, zero out the buffers
+      // before autotuning.  It's conceivable that using uninitialized memory as
+      // the inputs might affect performance if e.g. the inputs contain
+      // denormals, and this is easy enough.
+      stream.ThenMemZero(&buffer, buffer.size());
+    }
+  };
+
+  // Allocate space for the input, filter, and output of the convolution.  We
+  // use a ScratchAllocator for this instead of calling allocator_ directly so
+  // that our allocations don't leak.
+  ScratchAllocator input_output_allocator(device_ordinal, allocator);
+  std::vector<se::DeviceMemoryBase> operand_buffers;
+  for (const auto* operand : instr->operands()) {
+    TF_ASSIGN_OR_RETURN(auto buffer,
+                        input_output_allocator.AllocateBytes(
+                            &stream, ShapeUtil::ByteSizeOf(operand->shape())));
+    initialize_buffer(buffer);
+    operand_buffers.push_back(buffer);
+  }
+  TF_ASSIGN_OR_RETURN(
+      auto result_buffer,
+      input_output_allocator.AllocateBytes(
+          &stream, ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
+  initialize_buffer(result_buffer);
+
+  se::dnn::ProfileResult best_result;
+  int64 best_result_bytes_used = 0;
+  TF_ASSIGN_OR_RETURN(auto backend_config,
+                      instr->backend_config<CudnnConvBackendConfig>());
+
+  optional<F16BufferComparator> comparator;
+  // Use the first algorithm that's supported as reference. There isn't a
+  // particular reason to use it, as any algorithm sufficies. It doesn't make
+  // this algorithm considered correct, though.
+  optional<AlgorithmDesc> first_algorithm;
+  TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
+  for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) {
+    ScratchAllocator scratch_allocator(device_ordinal, allocator);
+    se::dnn::ProfileResult profile_result;
+    VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
+            << instr->ToString();
+
+    backend_config.set_algorithm(alg.algo_id());
+    backend_config.set_tensor_ops_enabled(alg.tensor_ops_enabled());
+    TF_RETURN_IF_ERROR(instr->set_backend_config(backend_config));
+    bool launch_ok =
+        RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
+                     &scratch_allocator, &stream, &profile_result)
+            .ok();
+
+    if (launch_ok && profile_result.is_valid()) {
+      const bool crash_on_checking_failure =
+          instr->GetModule()
+              ->config()
+              .debug_options()
+              .xla_gpu_crash_on_verification_failures();
+      if (comparator.has_value()) {
+        StatusOr<bool> result = comparator->CompareEqual(
+            se::DeviceMemory<Eigen::half>(result_buffer));
+        if (!result.ok()) {
+          LOG(ERROR) << "Unable to compare "
+                     << AlgorithmToString(*first_algorithm) << " against "
+                     << AlgorithmToString(alg) << " for " << instr->ToString()
+                     << ": " << result.status();
+          CHECK(!crash_on_checking_failure);
+        } else if (!result.ValueOrDie()) {
+          LOG(ERROR) << "Results mismatch between different convolution "
+                        "algorithms. This is likely a bug in convolution, or "
+                        "an excessive loss of precision in convolution. "
+                     << instr->ToString() << " for "
+                     << AlgorithmToString(*first_algorithm) << " vs "
+                     << AlgorithmToString(alg);
+          CHECK(!crash_on_checking_failure);
+        }
+      } else if (cross_check_enabled) {
+        auto comp = F16BufferComparator::Create(
+            se::DeviceMemory<Eigen::half>(result_buffer), compiler_, allocator,
+            &stream);
+        if (comp.ok()) {
+          comparator.emplace(comp.ConsumeValueOrDie());
+          first_algorithm.emplace(alg);
+        } else {
+          LOG(ERROR) << "Fail to initialize buffer comparator: "
+                     << comp.status() << ", instruction: " << instr->ToString();
+          CHECK(!crash_on_checking_failure);
+        }
+      }
+      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
+      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
+              << " succeeded, taking " << profile_result.elapsed_time_in_ms()
+              << "ms and using " << NumBytesToString(scratch_bytes_used)
+              << " of scratch (Best result: "
+              << best_result.elapsed_time_in_ms() << "ms, "
+              << NumBytesToString(best_result_bytes_used) << " of scratch)";
+      if (profile_result.elapsed_time_in_ms() <
+          best_result.elapsed_time_in_ms()) {
+        best_result = profile_result;
+        best_result_bytes_used = scratch_bytes_used;
+      }
+    } else {
+      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed.";
+    }
+  }
+  if (best_result.is_valid()) {
+    VLOG(2) << "Best algorithm for " << instr->ToString() << ": "
+            << AlgorithmToString(best_result.algorithm()) << ", takes "
+            << best_result.elapsed_time_in_ms() << "ms, and uses "
+            << best_result_bytes_used << "B of scratch memory.";
+    return AutotuneResult{best_result.algorithm().algo_id(),
+                          best_result.algorithm().tensor_ops_enabled(),
+                          best_result_bytes_used,
+                          absl::Milliseconds(best_result.elapsed_time_in_ms())};
+  }
+
+  return InternalError(
+      "All algorithms tried for convolution %s failed.  Falling back to "
+      "default algorithm.",
+      instr->ToString());
+}
+
+StatusOr<bool> CudnnConvAlgorithmPicker::RunOnInstruction(
+    HloInstruction* instr) {
+  CHECK(IsCustomCallToDnnConvolution(*instr));
+
+  StatusOr<AutotuneResult> best_algo_or =
+      PickBestAlgorithm(Cast<HloCustomCallInstruction>(instr));
+  if (!best_algo_or.ok()) {
+    LOG(ERROR) << best_algo_or.status();
+    return false;
+  }
+
+  auto best_algo = std::move(best_algo_or).ValueOrDie();
+  VLOG(1) << "Setting cudnn conv to use algorithm " << best_algo.algorithm
+          << " and " << NumBytesToString(best_algo.scratch_bytes)
+          << " of scratch memory: " << instr->ToString()
+          << " tensor_ops_enabled: " << best_algo.tensor_ops_enabled;
+
+  // Replace instr with a new CustomCall which has the correct algorithm, and
+  // whose output shape has the appropriate amount of scratch memory.
+  HloComputation* computation = instr->parent();
+  Shape new_call_shape = ShapeUtil::MakeTupleShape(
+      {instr->shape().tuple_shapes(0),
+       ShapeUtil::MakeShape(U8, {best_algo.scratch_bytes})});
+
+  TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
+                      instr->backend_config<CudnnConvBackendConfig>());
+  backend_config.set_algorithm(best_algo.algorithm);
+  backend_config.set_tensor_ops_enabled(best_algo.tensor_ops_enabled);
+
+  HloInstruction* new_call = computation->AddInstruction(
+      instr->CloneWithNewOperands(new_call_shape, instr->operands()));
+
+  VLOG(1) << "Replacing convolution " << instr->ToString() << " with "
+          << new_call->ToString();
+
+  TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config));
+
+  // Repackage new_call so it has the same shape as the original call, namely
+  // (conv_result, u8[0]).
+  HloInstruction* new_tuple =
+      computation->AddInstruction(HloInstruction::CreateTuple(
+          {computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+               new_call_shape.tuple_shapes(0), new_call, 0)),
+           computation->AddInstruction(HloInstruction::CreateConstant(
+               LiteralUtil::CreateR1<uint8>({})))}));
+
+  TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple));
+  return true;
+}
+
+StatusOr<bool> CudnnConvAlgorithmPicker::RunOnComputation(
+    HloComputation* computation) {
+  std::vector<HloInstruction*> convs;
+  for (auto* instr : computation->instructions()) {
+    if (IsCustomCallToDnnConvolution(*instr)) {
+      convs.push_back(instr);
+    }
+  }
+
+  bool changed = false;
+  for (auto* instr : convs) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr));
+    changed |= result;
+  }
+  return changed;
+}
+
+StatusOr<bool> CudnnConvAlgorithmPicker::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
new file mode 100644
index 0000000000000000000000000000000000000000..642af787afc71586d722ecc7e529ed8b3fa64d33
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_ALGORITHM_PICKER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_ALGORITHM_PICKER_H_
+
+#include "absl/time/time.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// Modifies CustomCalls to cudnn convolutions, choosing the best algorithm for
+// each and adding explicit scratch space to the CustomCalls.
+class CudnnConvAlgorithmPicker : public HloModulePass {
+ public:
+  // If the `allocator` parameter is not null, we will use it to allocate temp
+  // memory while timing the various convolution algorithms.  If it's null,
+  // we'll use the default allocator on the StreamExecutor.
+  CudnnConvAlgorithmPicker(se::StreamExecutor* stream_exec,
+                           DeviceMemoryAllocator* allocator, Compiler* compiler)
+      : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {}
+
+  absl::string_view name() const override {
+    return "cudnn-conv-algorithm-picker";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  struct AutotuneResult {
+    int64 algorithm;
+    bool tensor_ops_enabled;
+    int64 scratch_bytes;
+    absl::Duration runtime;
+  };
+
+  StatusOr<bool> RunOnComputation(HloComputation* computation);
+  StatusOr<bool> RunOnInstruction(HloInstruction* instr);
+  StatusOr<AutotuneResult> PickBestAlgorithm(HloCustomCallInstruction* instr);
+
+  se::StreamExecutor* stream_exec_;                   // never null
+  DeviceMemoryAllocator* allocator_;                  // may be null
+  Compiler* compiler_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_ALGORITHM_PICKER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5aa4f839f4be5f1060480fea98775f8ffada0bdd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc
@@ -0,0 +1,243 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
+
+namespace xla {
+namespace gpu {
+
+// We won't pad a conv if doing so increases the total number of bytes in the
+// lhs, rhs, or result by more than this amount.
+//
+// TODO(jlebar): This number was tuned experimentally.  It represents a
+// compromise on our current benchmarks; it speeds some up significantly, and
+// doesn't slow any down.  But we can observe by changing this value that
+// there's additional room for speedups.  Achieving those speedups without
+// also slowing other things down will likely require a more sophisticated
+// heuristic, possibly some form of auto-tuning.
+static constexpr double kMaxBytesTouchedIncrease = 1.35;
+
+// Creates and returns an HLO that zero-pads one or more dimensions in the given
+// instruction so that its shape is equal to the given shape.
+//
+// Padding is added to the end of each relevant dimension.
+//
+// If the instruction already has the given shape, simply returns it without an
+// intervening pad.
+static HloInstruction* PadInstruction(HloInstruction* instr,
+                                      const Shape& new_shape) {
+  HloComputation* comp = instr->parent();
+
+  const Shape& shape = instr->shape();
+  auto* zero = comp->AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type())));
+
+  PaddingConfig pad_config = MakeNoPaddingConfig(ShapeUtil::Rank(shape));
+
+  bool added_padding = false;
+  for (int64 dim = 0; dim < ShapeUtil::Rank(shape); ++dim) {
+    if (shape.dimensions(dim) == new_shape.dimensions(dim)) {
+      continue;
+    }
+    CHECK_GT(new_shape.dimensions(dim), shape.dimensions(dim));
+    pad_config.mutable_dimensions(dim)->set_edge_padding_high(
+        new_shape.dimensions(dim) - shape.dimensions(dim));
+    added_padding = true;
+  }
+
+  if (!added_padding) {
+    return instr;
+  }
+  return comp->AddInstruction(
+      HloInstruction::CreatePad(new_shape, instr, zero, pad_config));
+}
+
+// Modifies the given convolution to have the given LHS/RHS/result shapes.
+static Status PadConv(HloCustomCallInstruction* conv,
+                      const Shape& new_lhs_shape, const Shape& new_rhs_shape,
+                      const Shape& new_result_shape) {
+  CHECK_EQ(0, conv->shape().tuple_shapes(1).dimensions(0))
+      << "conv must use 0 scratch bytes, i.e. this pass must be run "
+         "before CudnnConvAlgorithmPicker.";
+
+  auto* lhs = conv->mutable_operand(0);
+  auto* rhs = conv->mutable_operand(1);
+  auto* new_lhs = PadInstruction(lhs, new_lhs_shape);
+  auto* new_rhs = PadInstruction(rhs, new_rhs_shape);
+  const Shape& result_shape = conv->shape().tuple_shapes(0);
+  CHECK(new_lhs != lhs || new_rhs != rhs)
+      << "We should have had to pad either LHS or RHS.";
+
+  auto add = [&](std::unique_ptr<HloInstruction> new_instr) {
+    return conv->parent()->AddInstruction(std::move(new_instr));
+  };
+
+  Shape new_conv_shape = ShapeUtil::MakeTupleShape(
+      {new_result_shape, ShapeUtil::MakeShape(U8, {0})});
+  auto* new_conv =
+      add(conv->CloneWithNewOperands(new_conv_shape, {new_lhs, new_rhs}));
+
+  // Slice the new conv result if necessary, keeping in mind that new_conv has
+  // tuple shape (new_result_shape, u8[0]).
+  if (!ShapeUtil::Equal(result_shape, new_result_shape)) {
+    std::vector<int64> start_indices(result_shape.dimensions_size(), 0);
+    std::vector<int64> end_indices(result_shape.dimensions().begin(),
+                                   result_shape.dimensions().end());
+    std::vector<int64> strides(result_shape.dimensions_size(), 1);
+
+    auto* new_conv_result = add(
+        HloInstruction::CreateGetTupleElement(new_result_shape, new_conv, 0));
+    auto* empty_temp_buffer =
+        add(HloInstruction::CreateConstant(LiteralUtil::CreateR1<uint8>({})));
+    auto* sliced_result = add(HloInstruction::CreateSlice(
+        result_shape, new_conv_result, start_indices, end_indices, strides));
+    new_conv =
+        add(HloInstruction::CreateTuple({sliced_result, empty_temp_buffer}));
+  }
+
+  VLOG(2) << "Padded features of " << conv->ToString() << ", replaced with "
+          << new_conv->ToString();
+  return conv->parent()->ReplaceInstruction(conv, new_conv);
+}
+
+static StatusOr<bool> PadForTensorCores(HloCustomCallInstruction* conv) {
+  TF_ASSIGN_OR_RETURN(auto kind, GetCudnnConvKind(conv));
+  const auto& dnums = conv->convolution_dimension_numbers();
+  auto* lhs = conv->mutable_operand(0);
+  auto* rhs = conv->mutable_operand(1);
+  const Shape& result_shape = conv->shape().tuple_shapes(0);
+
+  // Nothing to do on non-f16 convolutions.
+  if (result_shape.element_type() != PrimitiveType::F16) {
+    return false;
+  }
+
+  // TODO(timshen): Don't skip forward-activation convs if we find a benchmark
+  // where there's a speedup.
+  if (kind == CudnnConvKind::kForwardActivation) {
+    return false;
+  }
+
+  Shape new_lhs_shape = lhs->shape();
+  Shape new_rhs_shape = rhs->shape();
+  Shape new_result_shape = conv->shape().tuple_shapes(0);
+
+  // new_{input,filter_output}_shape points to the appropriate one of
+  // new_{lhs,rhs,result}_shape.
+  Shape* new_input_shape;
+  Shape* new_filter_shape;
+  Shape* new_output_shape;
+  std::tie(new_input_shape, new_filter_shape, new_output_shape) = [&] {
+    switch (kind) {
+      case CudnnConvKind::kForward:
+      case CudnnConvKind::kForwardActivation:
+        return std::make_tuple(&new_lhs_shape, &new_rhs_shape,
+                               &new_result_shape);
+      case CudnnConvKind::kBackwardInput:
+        return std::make_tuple(&new_result_shape, &new_rhs_shape,
+                               &new_lhs_shape);
+      case CudnnConvKind::kBackwardFilter:
+        return std::make_tuple(&new_lhs_shape, &new_result_shape,
+                               &new_rhs_shape);
+    }
+  }();
+
+  // If there are 3 input features and 32 or 64 output features, pad the input
+  // features to 4.  Otherwise, try padding to multiples of 8 and check that
+  // this doesn't make any of the conv buffers too much larger.
+  auto input_features =
+      new_input_shape->dimensions(dnums.input_feature_dimension());
+  auto output_features =
+      new_output_shape->dimensions(dnums.output_feature_dimension());
+  if (input_features == 3 && (output_features == 32 || output_features == 64)) {
+    new_input_shape->set_dimensions(dnums.input_feature_dimension(), 4);
+    new_filter_shape->set_dimensions(dnums.kernel_input_feature_dimension(), 4);
+  } else {
+    auto pad_dim = [](Shape* s, int64 dim) {
+      s->set_dimensions(dim, RoundUpToNearest<int64>(s->dimensions(dim), 8));
+    };
+    pad_dim(new_input_shape, dnums.input_feature_dimension());
+    pad_dim(new_filter_shape, dnums.kernel_input_feature_dimension());
+    pad_dim(new_filter_shape, dnums.kernel_output_feature_dimension());
+    pad_dim(new_output_shape, dnums.output_feature_dimension());
+
+    // Check that padding wouldn't increase the total bytes read/written by this
+    // operation too much.
+    auto check_size_increase = [&](const Shape& old_shape,
+                                   const Shape& new_shape) {
+      int64 old_bytes = ShapeUtil::ByteSizeOf(old_shape);
+      int64 new_bytes = ShapeUtil::ByteSizeOf(new_shape);
+      if (new_bytes <= old_bytes * kMaxBytesTouchedIncrease) {
+        return true;
+      }
+      VLOG(3)
+          << "Not padding convolution; doing so would change input / result "
+             "shape from "
+          << ShapeUtil::HumanString(old_shape) << " to "
+          << ShapeUtil::HumanString(new_shape) << ", a size increase of "
+          << new_bytes / static_cast<double>(old_bytes) << "x > "
+          << kMaxBytesTouchedIncrease << "x: " << conv->ToString();
+      return false;
+    };
+
+    if (!check_size_increase(lhs->shape(), new_lhs_shape) ||
+        !check_size_increase(rhs->shape(), new_rhs_shape) ||
+        !check_size_increase(result_shape, new_result_shape)) {
+      return false;
+    }
+  }
+
+  if (ShapeUtil::Equal(lhs->shape(), new_lhs_shape) &&
+      ShapeUtil::Equal(rhs->shape(), new_rhs_shape)) {
+    VLOG(3) << "No need to pad features of " << conv->ToString();
+    return false;
+  }
+
+  // OK, let's do the transformation!
+  TF_RETURN_IF_ERROR(
+      PadConv(conv, new_lhs_shape, new_rhs_shape, new_result_shape));
+  return true;
+}
+
+static std::vector<HloCustomCallInstruction*> GetRelevantConvs(
+    HloComputation* comp) {
+  std::vector<HloCustomCallInstruction*> convs;
+  for (HloInstruction* instr : comp->instructions()) {
+    if (IsCustomCallToDnnConvolution(*instr)) {
+      convs.push_back(Cast<HloCustomCallInstruction>(instr));
+    }
+  }
+  return convs;
+}
+
+StatusOr<bool> CudnnConvPadForTensorCores::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* comp : module->MakeNonfusionComputations()) {
+    for (HloCustomCallInstruction* conv : GetRelevantConvs(comp)) {
+      TF_ASSIGN_OR_RETURN(bool result, PadForTensorCores(conv));
+      changed |= result;
+    }
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4e51e86c1bf2c1f9aef2eed642604092033a538
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_PAD_FOR_TENSOR_CORES_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_PAD_FOR_TENSOR_CORES_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+// Adds padding to cudnn convolutions to make them run faster on GPUs with
+// tensor cores.
+//
+//  - f16 convolutions are padded to have input/output channel dimensions that
+//    are multiples of 8, so that we can use tensor cores.
+//
+//  - f16 convolutions with 3 input channels and 32 or 64 output channels are
+//    padded to 4 input channels.  There's a special-cased cudnn algorithm just
+//    for this.
+//
+// Don't run this pass on GPUs without tensor cores -- it will make them slower!
+//
+// TODO(jlebar): Also pad dots.
+class CudnnConvPadForTensorCores : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "cudnn-conv-pad-for-speed"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_PAD_FOR_TENSOR_CORES_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af9303a5b761b99705945f1c02303156e3f874de
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores_test.cc
@@ -0,0 +1,195 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h"
+
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+using ::testing::_;
+
+class CudnnConvPadForTensorCoresTest : public HloTestBase {};
+
+TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvInputChannels) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule TestModule
+
+  ENTRY TestComputation {
+    input = f16[10,20,30,41] parameter(0)
+    filter = f16[2,2,41,40] parameter(1)
+    ROOT result = (f16[10,20,30,40], u8[0]) custom-call(input, filter),
+                  window={size=2x2}, dim_labels=b01f_01io->b01f,
+                  custom_call_target="__cudnn$convForward"
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie());
+  auto* root = module->entry_computation()->root_instruction();
+
+  SCOPED_TRACE(module->ToString());
+  EXPECT_THAT(root, op::CustomCall(kCudnnConvForwardCallTarget,
+                                   op::Pad(op::Parameter(0), _),
+                                   op::Pad(op::Parameter(1), _)));
+  EXPECT_TRUE(ShapeUtil::Equal(root->operand(0)->shape(),
+                               ShapeUtil::MakeShape(F16, {10, 20, 30, 48})));
+  EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->shape(),
+                               ShapeUtil::MakeShape(F16, {2, 2, 48, 40})));
+}
+
+TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardInputConvOutputChannels) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule TestModule
+
+  ENTRY TestComputation {
+    output = f16[10,20,30,41] parameter(0)
+    filter = f16[2,2,40,41] parameter(1)
+    ROOT result = (f16[10,20,30,40], u8[0]) custom-call(output, filter),
+                  window={size=2x2}, dim_labels=b01f_01io->b01f,
+                  custom_call_target="__cudnn$convBackwardInput"
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie());
+  auto* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::CustomCall(kCudnnConvBackwardInputCallTarget,
+                                   op::Pad(op::Parameter(0), _),
+                                   op::Pad(op::Parameter(1), _)));
+  EXPECT_TRUE(ShapeUtil::Equal(root->operand(0)->shape(),
+                               ShapeUtil::MakeShape(F16, {10, 20, 30, 48})));
+  EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->shape(),
+                               ShapeUtil::MakeShape(F16, {2, 2, 40, 48})));
+}
+
+TEST_F(CudnnConvPadForTensorCoresTest, PadF16ForwardConvOutputChannels) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule TestModule
+
+  ENTRY TestComputation {
+    input = f16[10,20,30,40] parameter(0)
+    filter = f16[2,2,40,41] parameter(1)
+    ROOT result = (f16[10,20,30,41], u8[0]) custom-call(input, filter),
+                  window={size=2x2}, dim_labels=b01f_01io->b01f,
+                  custom_call_target="__cudnn$convForward"
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie());
+  auto* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Tuple(op::Slice(op::GetTupleElement(op::CustomCall(
+                                  kCudnnConvForwardCallTarget, op::Parameter(0),
+                                  op::Pad(op::Parameter(1), _)))),
+                              _));
+}
+
+TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardInputConvInputChannels) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule TestModule
+
+  ENTRY TestComputation {
+    output = f16[10,20,30,40] parameter(0)
+    filter = f16[2,2,41,40] parameter(1)
+    result = (f16[10,20,30,41], u8[0]) custom-call(output, filter),
+              window={size=2x2}, dim_labels=b01f_01io->b01f,
+              custom_call_target="__cudnn$convBackwardInput"
+    ROOT gte = f16[10,20,30,41] get-tuple-element(result), index=0
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie());
+  auto* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::GetTupleElement(op::Tuple(
+                        op::Slice(op::GetTupleElement(op::CustomCall(
+                            kCudnnConvBackwardInputCallTarget, op::Parameter(0),
+                            op::Pad(op::Parameter(1), _)))),
+                        _)));
+}
+
+TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardFilterConvInputChannels) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule TestModule
+
+  ENTRY TestComputation {
+    input = f16[10,20,30,41] parameter(0)
+    output = f16[10,20,30,40] parameter(1)
+    result = (f16[2,2,41,40], u8[0]) custom-call(input, output),
+              window={size=2x2}, dim_labels=b01f_01io->b01f,
+              custom_call_target="__cudnn$convBackwardFilter"
+    ROOT gte = f16[2,2,41,40] get-tuple-element(result), index=0
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie());
+  auto* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::GetTupleElement(op::Tuple(
+                        op::Slice(op::GetTupleElement(op::CustomCall(
+                            kCudnnConvBackwardFilterCallTarget,
+                            op::Pad(op::Parameter(0), _), op::Parameter(1)))),
+                        _)));
+}
+
+TEST_F(CudnnConvPadForTensorCoresTest, PadF16BackwardFilterConvOutputChannels) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule TestModule
+
+  ENTRY TestComputation {
+    input = f16[10,20,30,40] parameter(0)
+    output = f16[10,20,30,41] parameter(1)
+    result = (f16[2,2,40,41], u8[0]) custom-call(input, output),
+              window={size=2x2}, dim_labels=b01f_01io->b01f,
+              custom_call_target="__cudnn$convBackwardFilter"
+    ROOT gte = f16[2,2,40,41] get-tuple-element(result), index=0
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie());
+  auto* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::GetTupleElement(op::Tuple(
+                        op::Slice(op::GetTupleElement(op::CustomCall(
+                            kCudnnConvBackwardFilterCallTarget,
+                            op::Parameter(0), op::Pad(op::Parameter(1), _)))),
+                        _)));
+}
+
+TEST_F(CudnnConvPadForTensorCoresTest, PadInputFeatures3To4) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule TestModule
+
+  ENTRY TestComputation {
+    input = f16[10,20,30,3] parameter(0)
+    filter = f16[2,2,3,32] parameter(1)
+    ROOT result = (f16[10,20,30,32], u8[0]) custom-call(input, filter),
+                  window={size=2x2}, dim_labels=b01f_01io->b01f,
+                  custom_call_target="__cudnn$convForward"
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(CudnnConvPadForTensorCores().Run(module.get()).ValueOrDie());
+  auto* root = module->entry_computation()->root_instruction();
+
+  SCOPED_TRACE(module->ToString());
+  EXPECT_THAT(root, op::CustomCall(kCudnnConvForwardCallTarget,
+                                   op::Pad(op::Parameter(0), _),
+                                   op::Pad(op::Parameter(1), _)));
+  EXPECT_TRUE(ShapeUtil::Equal(root->operand(0)->shape(),
+                               ShapeUtil::MakeShape(F16, {10, 20, 30, 4})));
+  EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->shape(),
+                               ShapeUtil::MakeShape(F16, {2, 2, 4, 32})));
+}
+
+}  // anonymous namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3a09d4d4716950a09d65dd093272482d55ac5c27
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
@@ -0,0 +1,428 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h"
+
+#include "absl/memory/memory.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+bool IsForwardConvolutionCanonical(const HloInstruction& conv) {
+  CHECK(conv.custom_call_target() == kCudnnConvForwardCallTarget ||
+        conv.custom_call_target() == kCudnnConvBiasActivationForwardCallTarget);
+  return window_util::HasSymmetricPadding(conv.window()) &&
+         !window_util::HasNegativePadding(conv.window()) &&
+         !window_util::HasDilation(conv.window());
+}
+
+// If the (positive and negative) padding on the input operand of a convolution
+// can't be folded into a cuDNN convolution libcall (e.g. uneven padding and
+// dilation), returns kPad and/or kSlice instructions that explicitly apply the
+// padding; otherwise returns the original input operand. When there is both
+// positive padding (including dilation) and negative padding, we insert both
+// kPad and kSlice. Modifies 'conv_window' accordingly if any padding was moved
+// into a kPad or kSlice op.
+HloInstruction* MaybePaddedAndSlicedInput(
+    Window* conv_window, const ConvolutionDimensionNumbers& conv_dnums,
+    HloInstruction* input) {
+  HloComputation* computation = input->parent();
+  if (!window_util::HasSymmetricPadding(*conv_window) ||
+      window_util::HasBaseDilation(*conv_window)) {
+    // If padding is uneven or has dilation, we insert a kPad instruction that
+    // applies positive padding and dilation.
+    //
+    // TODO(phawkins): If conv_window has asymmetric padding, perhaps instead of
+    // moving all the padding into an explicit pad op, we should keep as much
+    // padding inside of cudnn as possible, on the assumption that padding
+    // within cudnn is basically free, whereas a kPad's cost increases as the
+    // amount of padding increases.
+    PaddingConfig padding_config =
+        MakeNoPaddingConfig(input->shape().dimensions_size());
+    for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
+      int64 dim = conv_dnums.input_spatial_dimensions(i);
+      if (conv_window->dimensions(i).padding_low() > 0) {
+        padding_config.mutable_dimensions(dim)->set_edge_padding_low(
+            conv_window->dimensions(i).padding_low());
+        conv_window->mutable_dimensions(i)->set_padding_low(0);
+      }
+      if (conv_window->dimensions(i).padding_high() > 0) {
+        padding_config.mutable_dimensions(dim)->set_edge_padding_high(
+            conv_window->dimensions(i).padding_high());
+        conv_window->mutable_dimensions(i)->set_padding_high(0);
+      }
+      if (conv_window->dimensions(i).base_dilation() != 1) {
+        padding_config.mutable_dimensions(dim)->set_interior_padding(
+            conv_window->dimensions(i).base_dilation() - 1);
+        conv_window->mutable_dimensions(i)->set_base_dilation(1);
+      }
+    }
+    PrimitiveType element_type = input->shape().element_type();
+    HloInstruction* padding = computation->AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::Zero(element_type)));
+    input = MakePadHlo(input, padding, padding_config).ValueOrDie();
+  }
+
+  if (window_util::HasNegativePadding(*conv_window)) {
+    // If the window has negative padding, insert a kSlice that explicitly
+    // applies negative padding.
+    //
+    // For each dimension, initialize the start index to 0 and the limit index
+    // to the size of that dimension.
+    std::vector<int64> start_indices(input->shape().dimensions_size(), 0);
+    std::vector<int64> limit_indices(input->shape().dimensions().begin(),
+                                     input->shape().dimensions().end());
+    std::vector<int64> strides(input->shape().dimensions_size(), 1);
+    for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
+      int64 dim = conv_dnums.input_spatial_dimensions(i);
+      // If dimension "dim" has negative padding, increase the start index or
+      // decrement the limit index by the amount of negative padding.
+      if (conv_window->dimensions(i).padding_low() < 0) {
+        start_indices[dim] += -conv_window->dimensions(i).padding_low();
+        conv_window->mutable_dimensions(i)->set_padding_low(0);
+      }
+      if (conv_window->dimensions(i).padding_high() < 0) {
+        limit_indices[dim] -= -conv_window->dimensions(i).padding_high();
+        conv_window->mutable_dimensions(i)->set_padding_high(0);
+      }
+    }
+
+    input =
+        MakeSliceHlo(input, start_indices, limit_indices, strides).ValueOrDie();
+  }
+
+  return input;
+}
+
+// If the padding on the kernel operand of a convolution can't be folded into a
+// cuDNN convolution libcall (e.g. dilation), returns a kPad instruction that
+// explicitly applies the padding; otherwise returns the original kernel
+// operand.
+HloInstruction* MaybePaddedKernel(const Window& conv_window,
+                                  const ConvolutionDimensionNumbers& conv_dnums,
+                                  HloInstruction* kernel) {
+  if (!window_util::HasWindowDilation(conv_window)) {
+    return kernel;
+  }
+
+  // Compute the shape and padding config of the pad to be inserted.
+  PaddingConfig padding_config;
+  for (size_t i = 0; i < kernel->shape().dimensions_size(); ++i) {
+    padding_config.add_dimensions();
+  }
+  for (size_t i = 0; i < conv_dnums.kernel_spatial_dimensions().size(); ++i) {
+    int64 dim = conv_dnums.kernel_spatial_dimensions(i);
+    padding_config.mutable_dimensions(dim)->set_interior_padding(
+        conv_window.dimensions(i).window_dilation() - 1);
+  }
+
+  HloComputation* computation = kernel->parent();
+  PrimitiveType element_type = kernel->shape().element_type();
+  HloInstruction* padding = computation->AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::Zero(element_type)));
+  return MakePadHlo(kernel, padding, padding_config).ValueOrDie();
+}
+}  // namespace
+
+bool CudnnConvPaddingLegalization::CanonicalizeForwardConvolution(
+    HloInstruction* conv) {
+  if (IsForwardConvolutionCanonical(*conv)) {
+    return false;
+  }
+
+  // Insert slices and/or pads between the convolution and its input and/or
+  // kernel operand.
+  Window new_conv_window = conv->window();
+  HloInstruction* new_input = MaybePaddedAndSlicedInput(
+      &new_conv_window, conv->convolution_dimension_numbers(),
+      conv->mutable_operand(0));
+  HloInstruction* new_kernel =
+      MaybePaddedKernel(new_conv_window, conv->convolution_dimension_numbers(),
+                        conv->mutable_operand(1));
+
+  // Remove the window dilation from convolution's window field. These paddings
+  // are made explicit with the pads inserted by MaybePaddedKernel().
+  for (size_t i = 0; i < new_conv_window.dimensions_size(); ++i) {
+    WindowDimension* dim = new_conv_window.mutable_dimensions(i);
+
+    // The size of the kernel may have changed so update the Window to match.
+    dim->set_size(new_kernel->shape().dimensions(
+        conv->convolution_dimension_numbers().kernel_spatial_dimensions(i)));
+    dim->set_window_dilation(1);
+  }
+
+  // The conv CustomCall returns a tuple (conv_result, scratch_buffer).  Extract
+  // out the shape of conv_result.
+  VLOG(1) << "Canonicalizing forward conv";
+  std::vector<HloInstruction*> operands(conv->operands().begin(),
+                                        conv->operands().end());
+  operands[0] = new_input;
+  operands[1] = new_kernel;
+  auto new_conv = conv->parent()->AddInstruction(
+      conv->CloneWithNewOperands(conv->shape(), operands));
+  new_conv->set_window(new_conv_window);
+  VLOG(1) << "Replacing:\n  " << conv->ToString() << "\nwith:\n  "
+          << new_conv->ToString();
+  TF_CHECK_OK(conv->parent()->ReplaceInstruction(conv, new_conv));
+  return true;
+}
+
+namespace {
+void IncreasePaddingLowBy(int64 delta, WindowDimension* window_dim) {
+  window_dim->set_padding_low(window_dim->padding_low() + delta);
+}
+
+void IncreasePaddingHighBy(int64 delta, WindowDimension* window_dim) {
+  window_dim->set_padding_high(window_dim->padding_high() + delta);
+}
+}  // namespace
+
+bool CudnnConvPaddingLegalization::CanonicalizeBackwardFilterConvolution(
+    HloInstruction* backward_conv) {
+  CHECK_EQ(backward_conv->custom_call_target(),
+           kCudnnConvBackwardFilterCallTarget);
+  if (window_util::HasSymmetricPadding(backward_conv->window())) {
+    return false;
+  }
+
+  // A backward filter convolution with uneven padding can be canonicalized to
+  // one with even padding by padding the activations (input) beforehand. For
+  // example,
+  //   BackwardFilterConv(ABCD, xyz, padding_low=1, padding_high=2)
+  // is equivalent to
+  //   ABCD0 = Pad(ABCD, padding_high=1)
+  //   BackwardFilterConv(ABCD0, xyz, padding_low=pading_high=1)
+  // We choose the lesser of padding_low and padding_high as the new padding.
+  HloInstruction* input = backward_conv->mutable_operand(0);
+  Window new_backward_conv_window = backward_conv->window();
+  // input_padding_config is the config of the kPad to be inserted.
+  PaddingConfig input_padding_config =
+      MakeNoPaddingConfig(ShapeUtil::Rank(input->shape()));
+  ConvolutionDimensionNumbers backward_conv_dnums =
+      backward_conv->convolution_dimension_numbers();
+  for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
+    int64 padding_low = backward_conv->window().dimensions(i).padding_low();
+    int64 padding_high = backward_conv->window().dimensions(i).padding_high();
+    if (padding_low < 0 || padding_high < 0) {
+      // TODO(b/32744257): The following canonicalization wouldn't remove
+      // negative padding in a backward convolution, and would therefore cause
+      // cuDNN convolution (which doesn't support negative padding) to fail.
+      return false;
+    }
+    // Compute the new, even padding for the backward conv operation.
+    int64 new_conv_padding = std::min(padding_low, padding_high);
+    int64 dim = backward_conv_dnums.input_spatial_dimensions(i);
+    input_padding_config.mutable_dimensions(dim)->set_edge_padding_low(
+        padding_low - new_conv_padding);
+    input_padding_config.mutable_dimensions(dim)->set_edge_padding_high(
+        padding_high - new_conv_padding);
+
+    // Since we move some padding from the backward convolution to the kPad, we
+    // need to accordingly reduce the padding amount of the backward convolution
+    // and its inner forward convolution.
+    auto* new_dim = new_backward_conv_window.mutable_dimensions(i);
+    new_dim->set_padding_low(new_conv_padding);
+    new_dim->set_padding_high(new_conv_padding);
+  }
+
+  // Create a new backward convolution replacing the old one.
+  HloComputation* computation = backward_conv->parent();
+  HloInstruction* output = backward_conv->mutable_operand(1);
+  HloInstruction* padding =
+      computation->AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::Zero(input->shape().element_type())));
+  HloInstruction* padded_input =
+      MakePadHlo(input, padding, input_padding_config).ValueOrDie();
+
+  // The shape of the backward_conv CustomCall is a tuple (conv_result,
+  // scratch_buffer).  Extract out the shape of conv_result.
+  HloInstruction* new_backward_conv =
+      computation->AddInstruction(backward_conv->CloneWithNewOperands(
+          backward_conv->shape(), {padded_input, output}));
+  new_backward_conv->set_window(new_backward_conv_window);
+
+  VLOG(1) << "Canonicalizing backward filter conv";
+  VLOG(1) << "Replacing:\n  " << backward_conv->ToString() << "\nwith:\n  "
+          << new_backward_conv->ToString();
+
+  TF_CHECK_OK(
+      computation->ReplaceInstruction(backward_conv, new_backward_conv));
+  return true;
+}
+
+bool CudnnConvPaddingLegalization::CanonicalizeBackwardInputConvolution(
+    HloInstruction* backward_conv) {
+  if (window_util::HasSymmetricPadding(backward_conv->window())) {
+    return false;
+  }
+
+  Window new_backward_conv_window = backward_conv->window();
+  ConvolutionDimensionNumbers backward_conv_dnums =
+      backward_conv->convolution_dimension_numbers();
+
+  // The backward_conv CustomCall returns a tuple (conv_result, scratch_memory).
+  // Get the shape of conv_result.
+  Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0);
+
+  Shape new_backward_conv_shape = backward_conv_shape;
+  for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
+    int64 padding_low = backward_conv->window().dimensions(i).padding_low();
+    int64 padding_high = backward_conv->window().dimensions(i).padding_high();
+    if (padding_low < 0 || padding_high < 0) {
+      // TODO(b/32744257): The following canonicalization wouldn't remove
+      // negative padding in a backward convolution, and would therefore cause
+      // cuDNN convolution (which doesn't support negative padding) to fail.
+      return false;
+    }
+    // If the backward convolution has uneven padding on the activations, we
+    // move some padding on the larger end to "internal" padding, so that the
+    // backward convolution produces larger activations which get sliced later.
+    //
+    // For example, suppose we have a non-canonical HLO
+    //   [A] = BackwardInputConvolve([a b], [x y z], padding=(low=2,high=1))
+    // where the amount of padding low is larger, we can canonicalize it to
+    //   [B A] = BackwardInputConvolve([a b], [x y z], padding=(low=1,high=1))
+    //   [A] = Slice([B A])
+    if (padding_low > padding_high) {
+      IncreasePaddingLowBy(padding_high - padding_low,
+                           new_backward_conv_window.mutable_dimensions(i));
+    } else if (padding_low < padding_high) {
+      IncreasePaddingHighBy(padding_low - padding_high,
+                            new_backward_conv_window.mutable_dimensions(i));
+    }
+    // Decreasing the padding by X *increases* the size of our output by X.
+    int64 dim = backward_conv_dnums.output_spatial_dimensions(i);
+    new_backward_conv_shape.set_dimensions(
+        dim, new_backward_conv_shape.dimensions(dim) +
+                 std::abs(padding_low - padding_high));
+  }
+
+  // Create a new backward convolution replacing the old one.
+  HloComputation* computation = backward_conv->parent();
+  HloInstruction* output = backward_conv->mutable_operand(0);
+  HloInstruction* filter = backward_conv->mutable_operand(1);
+
+  HloInstruction* new_backward_conv_call =
+      computation->AddInstruction(backward_conv->CloneWithNewOperands(
+          ShapeUtil::MakeTupleShape(
+              {new_backward_conv_shape, ShapeUtil::MakeShape(U8, {0})}),
+          {output, filter}));
+  new_backward_conv_call->set_window(new_backward_conv_window);
+
+  // The CustomCall created above returns a tuple (conv_result, scratch_memory).
+  // Extract out the two elements.
+  HloInstruction* new_backward_conv =
+      computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+          new_backward_conv_shape, new_backward_conv_call, 0));
+  HloInstruction* new_backward_conv_scratch =
+      computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+          new_backward_conv_call->shape().tuple_shapes(1),
+          new_backward_conv_call, 1));
+
+  // Slice the new backward convolution.
+  //
+  // Initialize start_indices and limit_indices as no slicing.
+  std::vector<int64> start_indices(new_backward_conv->shape().dimensions_size(),
+                                   0LL);
+  std::vector<int64> limit_indices(
+      new_backward_conv->shape().dimensions().begin(),
+      new_backward_conv->shape().dimensions().end());
+  std::vector<int64> strides(new_backward_conv->shape().dimensions_size(), 1LL);
+  for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
+    int64 padding_low = backward_conv->window().dimensions(i).padding_low();
+    int64 padding_high = backward_conv->window().dimensions(i).padding_high();
+    int64 dim = backward_conv_dnums.output_spatial_dimensions(i);
+    if (padding_low > padding_high) {
+      // If the amount of low padding (of the old backward convolution) is
+      // larger, we internally pad the low end of the activations and slice
+      // internal padding out here.
+      start_indices[dim] += padding_low - padding_high;
+    } else if (padding_low < padding_high) {
+      // If the amount of high padding is larger, we slice out the internal
+      // padding on the high end.
+      limit_indices[dim] -= padding_high - padding_low;
+    }
+  }
+
+  // Replace the old backward convolution with the slice.
+  Shape slice_shape =
+      ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices,
+                                      limit_indices, strides)
+          .ConsumeValueOrDie();
+  CHECK(ShapeUtil::Compatible(slice_shape, backward_conv_shape))
+      << ShapeUtil::HumanString(slice_shape) << " vs "
+      << ShapeUtil::HumanString(backward_conv_shape);
+
+  HloInstruction* slice = computation->AddInstruction(
+      HloInstruction::CreateSlice(backward_conv_shape, new_backward_conv,
+                                  start_indices, limit_indices, strides));
+  HloInstruction* new_tuple = computation->AddInstruction(
+      HloInstruction::CreateTuple({slice, new_backward_conv_scratch}));
+
+  VLOG(1) << "Canonicalizing backward input conv";
+  VLOG(1) << "Replacing:\n  " << backward_conv->ToString() << "\nwith:\n  "
+          << new_tuple->ToString();
+
+  TF_CHECK_OK(computation->ReplaceInstruction(backward_conv, new_tuple));
+  return true;
+}
+
+StatusOr<bool> CudnnConvPaddingLegalization::RunOnComputation(
+    HloComputation* computation) {
+  bool changed = false;
+  std::vector<HloCustomCallInstruction*> convs;
+  for (auto* instr : computation->instructions()) {
+    if (IsCustomCallToDnnConvolution(*instr)) {
+      convs.push_back(Cast<HloCustomCallInstruction>(instr));
+    }
+  }
+  for (HloCustomCallInstruction* instruction : convs) {
+    TF_ASSIGN_OR_RETURN(auto kind, GetCudnnConvKind(instruction));
+    changed |= [&] {
+      switch (kind) {
+        case CudnnConvKind::kForward:
+        case CudnnConvKind::kForwardActivation:
+          return CanonicalizeForwardConvolution(instruction);
+        case CudnnConvKind::kBackwardInput:
+          return CanonicalizeBackwardInputConvolution(instruction);
+        case CudnnConvKind::kBackwardFilter:
+          return CanonicalizeBackwardFilterConvolution(instruction);
+      }
+    }();
+  }
+  return changed;
+}
+
+StatusOr<bool> CudnnConvPaddingLegalization::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d1b075517fb285222506e0420984906579e681f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_PADDING_LEGALIZATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_PADDING_LEGALIZATION_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+// An HLO pass that canonicalizes convolution instructions for GPU codegen. It
+// inserts Pad instructions before Convolution instructions with uncanonicalized
+// padding, so that they can be lowered to cuDNN convolution.
+class CudnnConvPaddingLegalization : public HloModulePass {
+ public:
+  absl::string_view name() const override {
+    return "cudnn-conv-padding-legalization";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  StatusOr<bool> RunOnComputation(HloComputation* computation);
+  // Returns if any changes are made to the parent computation.
+  bool CanonicalizeForwardConvolution(HloInstruction* conv);
+  bool CanonicalizeBackwardFilterConvolution(HloInstruction* backward_conv);
+  bool CanonicalizeBackwardInputConvolution(HloInstruction* backward_conv);
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_PADDING_LEGALIZATION_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e81850db69edced29ea31bb2a526b0503bf8a453
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -0,0 +1,581 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h"
+
+#include <cstdlib>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+
+HloInstruction* CreateCudnnConv(const char* call_target, const Shape& shape,
+                                HloInstruction* lhs, HloInstruction* rhs,
+                                const Window& window,
+                                const ConvolutionDimensionNumbers& dnums,
+                                int64 feature_group_count,
+                                const OpMetadata& metadata) {
+  HloComputation* computation = lhs->parent();
+
+  // This call returns a tuple of (conv_result, scratch_memory), where
+  // conv_result is the actual result of the convolution, and scratch_memory is
+  // temporary memory used by cudnn.
+  //
+  // At the moment, we don't know how much scratch memory this conv is going to
+  // use, so we put u8[0] in this place.  Later on another pass will choose
+  // which conv algorithm to use, and at that point we'll modify the shape of
+  // this second tuple element.
+  Shape call_shape =
+      ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})});
+
+  HloInstruction* custom_call = computation->AddInstruction(
+      HloInstruction::CreateCustomCall(call_shape, {lhs, rhs}, call_target));
+  custom_call->set_window(window);
+  custom_call->set_convolution_dimension_numbers(dnums);
+  custom_call->set_feature_group_count(feature_group_count);
+  custom_call->set_metadata(metadata);
+  return custom_call;
+}
+
+bool CanImplementAsCudnnForwardConv(HloInstruction* conv) {
+  const ConvolutionDimensionNumbers& dnums =
+      conv->convolution_dimension_numbers();
+  if (dnums.input_spatial_dimensions_size() > 3) {
+    return false;
+  }
+
+  // CuDNN does not accept zero-element arguments
+  if (ShapeUtil::IsZeroElementArray(conv->operand(0)->shape()) ||
+      ShapeUtil::IsZeroElementArray(conv->operand(1)->shape())) {
+    return false;
+  }
+
+  // CuDNN can perform either cross correlation (no reversal),
+  // or convolution (all dimensions reversed).
+  if (dnums.input_spatial_dimensions_size() == 2
+          ? !window_util::AllOrNoneReversed(conv->window())
+          : window_util::HasWindowReversal(conv->window())) {
+    return false;
+  }
+  return true;
+}
+
+// Try to match a backward filter pattern that contains "conv".
+// Precondition: "conv" is a kConvolution.
+std::tuple<bool, Window, ConvolutionDimensionNumbers> MatchBackwardFilter(
+    HloInstruction* conv) {
+  const auto no_match_result =
+      std::make_tuple(false, Window(), ConvolutionDimensionNumbers());
+  if (conv->feature_group_count() > 1) {
+    return no_match_result;
+  }
+  // Step 1: match the instruction pattern without considering the paddings and
+  // dimension numbers just yet. We may need some generic pattern matcher
+  // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h
+  //
+  // Backward filter convolution is implemented in XLA as the forward
+  // convolution of padded activations and dilated gradients. Padding on
+  // activations and dilation on gradients are specified in the "window" field
+  // of the forward convolution.
+  //
+  //        activations  gradients
+  //              \         /
+  //               v       v
+  //              Convolution
+  //                 conv
+  CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
+
+  // Step 2: match paddings and dimension numbers of the forward convolution.
+  const ConvolutionDimensionNumbers& conv_dnums =
+      conv->convolution_dimension_numbers();
+  auto input_batch_dim = conv_dnums.input_batch_dimension();
+  auto input_feature_dim = conv_dnums.input_feature_dimension();
+  auto input_spatial_dims = conv_dnums.input_spatial_dimensions();
+  auto kernel_input_feature_dim = conv_dnums.kernel_input_feature_dimension();
+  auto kernel_output_feature_dim = conv_dnums.kernel_output_feature_dimension();
+  auto kernel_spatial_dims = conv_dnums.kernel_spatial_dimensions();
+  auto output_batch_dim = conv_dnums.output_batch_dimension();
+  auto output_feature_dim = conv_dnums.output_feature_dimension();
+  auto output_spatial_dims = conv_dnums.output_spatial_dimensions();
+
+  for (const WindowDimension& window_dim : conv->window().dimensions()) {
+    if (window_dim.stride() != 1) {
+      VLOG(1) << "Forward convolution's window "
+              << conv->window().ShortDebugString()
+              << " should have stride of 1.";
+      return no_match_result;
+    }
+    if (window_dim.base_dilation() != 1) {
+      VLOG(1) << "Forward convolution's window "
+              << conv->window().ShortDebugString()
+              << " should have no base (LHS) dilation.";
+      return no_match_result;
+    }
+    if (window_dim.padding_low() < 0) {
+      VLOG(1) << "Padding low should be non-negative.";
+      return no_match_result;
+    }
+    if (window_dim.window_reversal()) {
+      VLOG(1) << "Window reversal field not supported";
+      return no_match_result;
+    }
+    // Padding high will be checked in Step 3.
+  }
+  if (input_batch_dim == output_batch_dim &&
+      !window_util::HasWindowDilation(conv->window())) {
+    VLOG(1) << conv->ToString()
+            << " is a regular forward convolution. No need "
+               "to fold it to a backward filter convolution.";
+    return no_match_result;
+  }
+
+  // Step 3: fuse the matched HLOs into a backward convolution instruction.
+  //
+  // Compute the window of the backward convolution.
+  Window backward_conv_window;
+  for (int i = 0; i < input_spatial_dims.size(); ++i) {
+    WindowDimension* dim = backward_conv_window.add_dimensions();
+    // The window size of the backward convolution equals the output size of the
+    // forward convolution.
+    int64 filter_size = conv->shape().dimensions(output_spatial_dims[i]);
+    dim->set_size(filter_size);
+    // The window stride equals the window dilation of the forward convolution.
+    dim->set_stride(conv->window().dimensions(i).window_dilation());
+    // The window's low padding is the same as the low padding of the
+    // activations.
+    dim->set_padding_low(conv->window().dimensions(i).padding_low());
+    dim->set_base_dilation(1);
+    dim->set_window_dilation(1);
+
+    int64 input_size =
+        conv->operand(0)->shape().dimensions(input_spatial_dims[i]);
+    int64 output_size = conv->window().dimensions(i).size();
+    // Compute the range of the amount of valid high padding. We first compute
+    // min_padding_high, the amount of padding on the right/bottom to ensure the
+    // last patch ends at the border, i.e.,
+    //
+    //   input_size + dim->padding_low() + min_padding_high
+    //     = (output_size - 1) * stride + filter_size
+    //
+    // Because convolution ignores trailing incomplete windows, any amount of
+    // padding high from min_padding_high to min_padding_high+stride-1
+    // (max_padding_high) has the same effect.
+    int64 padded_input_size = filter_size + (output_size - 1) * dim->stride();
+    int64 min_padding_high =
+        padded_input_size - input_size - dim->padding_low();
+    int64 max_padding_high = min_padding_high + dim->stride() - 1;
+    CHECK_GE(dim->padding_low(), 0);
+    // In practice, since cuDNN convolution only supports even padding, we make
+    // the amount of high padding the same as the amount of low padding as long
+    // as it is between min_padding_high and max_padding_high. If it is not in
+    // that range, we pick the one that's closest to dim->padding_low() and let
+    // CudnnConvPaddingLegalization canonicalize the resultant backward
+    // convolution later. Picking the closest one minimizes the cost of the kPad
+    // instruction to be inserted by CudnnConvPaddingLegalization.
+    if (dim->padding_low() >= min_padding_high &&
+        dim->padding_low() <= max_padding_high) {
+      dim->set_padding_high(dim->padding_low());
+    } else {
+      if (dim->padding_low() < min_padding_high) {
+        dim->set_padding_high(min_padding_high);
+      } else {
+        dim->set_padding_high(max_padding_high);
+      }
+    }
+    if (dim->padding_high() < 0) {
+      LOG(ERROR)
+          << "Fusing this pattern to backward filter convolution would cause "
+             "negative padding ("
+          << dim->padding_high()
+          << ") on right/bottom of the weight gradients, which is not "
+             "supported by CudnnConvPaddingLegalization (b/32744257). "
+             "Falling back to "
+             "unfused convolution for instruction: "
+          << conv->ToString();
+      return no_match_result;
+    }
+  }
+
+  // Restore the dimension numbers of the backward convolution from the forward
+  // convolution. The two activation dimensions are reversed (batch and
+  // feature).
+  ConvolutionDimensionNumbers backward_conv_dnums;
+  backward_conv_dnums.set_input_batch_dimension(input_feature_dim);
+  backward_conv_dnums.set_input_feature_dimension(input_batch_dim);
+  for (int i = 0; i < input_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_input_spatial_dimensions(input_spatial_dims[i]);
+  }
+  backward_conv_dnums.set_output_batch_dimension(kernel_input_feature_dim);
+  backward_conv_dnums.set_output_feature_dimension(kernel_output_feature_dim);
+  for (int i = 0; i < kernel_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_output_spatial_dimensions(kernel_spatial_dims[i]);
+  }
+  // The dimension numbering of the output of the forward convolution (before
+  // transposition) is the same as that of the activations (according to the
+  // semantics of kConvolution). The batch dimension of the activations should
+  // be treated as the input feature dimension, and the feature dimension should
+  // be treated as the output feature.
+  backward_conv_dnums.set_kernel_input_feature_dimension(output_batch_dim);
+  backward_conv_dnums.set_kernel_output_feature_dimension(output_feature_dim);
+  for (int i = 0; i < output_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]);
+  }
+
+  return std::make_tuple(true, backward_conv_window, backward_conv_dnums);
+}
+
+// Try to match a backward input pattern that contains "conv".
+// Precondition: "conv" is a kConvolution.
+std::tuple<bool, Window, ConvolutionDimensionNumbers, HloInstruction*>
+MatchBackwardInput(HloInstruction* conv) {
+  const auto no_match_result =
+      std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr);
+
+  // TODO(b/119479517): Theoretically cuDNN supports grouped convolutions also
+  // for the backward input convolution, but at least for now with version 7.1.4
+  // it is slower. This needs to be re-evaluated for future cuDNN versions.
+  // Note that we already have the necessary code down below, the only thing to
+  // enable it is to remove the following early return.
+  if (conv->feature_group_count() > 1) {
+    return no_match_result;
+  }
+
+  // Match instruction pattern.
+  CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
+  HloInstruction* reverse_filter = conv->mutable_operand(1);
+  ConvolutionDimensionNumbers dnums = conv->convolution_dimension_numbers();
+
+  // We pattern-match to a backwards input conv if:
+  //
+  //  - all spatial dims of the filter are reversed
+  //
+  // OR
+  //
+  //  - filter is 1x1 or a constant AND
+  //  - conv has base dilation (otherwise this is just a regular forward conv).
+  //
+  // The final criterion above is just for canonicalization; cudnn seems to run
+  // just as fast if we canonicalize 1x1/constant filters without base dilation
+  // to forward or backward convs.  We canonicalize to forward conv because (a)
+  // it's more natural (constant filters usually show up when doing inference,
+  // and having backwards convolutions in inference graphs would be weird), and
+  // (b) cudnn has special fusions for forward conv plus bias and activation,
+  // and we want to pattern-match to that after running this pass.
+  bool is_reversed_filter =
+      reverse_filter->opcode() == HloOpcode::kReverse &&
+      absl::c_is_permutation(dnums.kernel_spatial_dimensions(),
+                             reverse_filter->dimensions());
+  bool is_1x1_filter =
+      absl::c_all_of(conv->window().dimensions(),
+                     [](const WindowDimension& d) { return d.size() == 1; });
+  if (!is_reversed_filter &&
+      !(window_util::HasBaseDilation(conv->window()) &&
+        (reverse_filter->IsConstant() || is_1x1_filter))) {
+    VLOG(1) << "Can't match to backwards convolution. Either filter is not "
+               "kReverse, or it's not a base-dilated conv with a 1x1 or "
+               "constant filter.";
+    return no_match_result;
+  }
+
+  // Match padding and dilation of the forward convolution.
+  for (const WindowDimension& window_dim : conv->window().dimensions()) {
+    if (window_dim.stride() != 1) {
+      VLOG(1) << "Forward convolution's window "
+              << conv->window().ShortDebugString()
+              << " should have stride of 1.";
+      return no_match_result;
+    }
+    if (window_dim.window_dilation() != 1) {
+      VLOG(1) << "Forward convolution's window "
+              << conv->window().ShortDebugString()
+              << " should have no window dilation.";
+      return no_match_result;
+    }
+    if (window_dim.window_reversal()) {
+      VLOG(1) << "Window reversal field not supported";
+      return no_match_result;
+    }
+  }
+
+  const auto& input_spatial_dims = dnums.input_spatial_dimensions();
+  const auto& output_spatial_dims = dnums.output_spatial_dimensions();
+  CHECK_EQ(conv->window().dimensions().size(), input_spatial_dims.size());
+  CHECK_EQ(output_spatial_dims.size(), input_spatial_dims.size());
+
+  const Window& old_window = conv->window();
+  Window new_window = old_window;
+  for (size_t i = 0; i < input_spatial_dims.size(); ++i) {
+    // Restore backward convolution's padding config from the matched pattern.
+    // See the comment in tensorflow/core/kernels/conv_grad_tuple_ops.cc
+    // for how we convert backward input convolution to a variant of forward
+    // convolution.
+    //
+    // The stride of the backward convolution
+    // = the base dilation factor of the forward convolution
+    auto dim = new_window.mutable_dimensions(i);
+    dim->set_stride(old_window.dimensions(i).base_dilation());
+    dim->set_base_dilation(1);
+
+    // The low padding = kernel_size - 1 - low padding on the gradients
+    // Make sure the low padding is not negative.
+    auto kernel_size = old_window.dimensions(i).size();
+    auto backward_padding_low =
+        kernel_size - 1 - old_window.dimensions(i).padding_low();
+    if (backward_padding_low < 0) {
+      LOG(ERROR)
+          << "The low padding of the backward convolution would be negative ("
+          << backward_padding_low
+          << "), which isn't supported by CudnnConvPaddingLegalization "
+             "for now (b/32744257).";
+      return no_match_result;
+    }
+    dim->set_padding_low(backward_padding_low);
+
+    // Compute the range of the amount of padding on the right/bottom of the
+    // activations. XLA's convolution requires all patches to be within the
+    // padded base. This gives us flexiblity to choose the amount of high
+    // padding from a set of values without changing the result of the backward
+    // convolution. The minimum amount (min_padding_high) makes the last patch
+    // end at the border. The maximum amount (max_padding_high) equals
+    // min_padding_high+stride-1 -- max_padding_high+1 would cause the output
+    // size to change.
+    auto unpadded_input_size = conv->shape().dimensions(output_spatial_dims[i]);
+    auto output_size =
+        conv->operand(0)->shape().dimensions(input_spatial_dims[i]);
+    auto padded_input_size = kernel_size + dim->stride() * (output_size - 1);
+    auto total_pad_size = padded_input_size - unpadded_input_size;
+    auto min_padding_high = total_pad_size - backward_padding_low;
+    auto max_padding_high = min_padding_high + dim->stride() - 1;
+
+    if (backward_padding_low >= min_padding_high &&
+        backward_padding_low <= max_padding_high) {
+      // In the best case (most likely), if backward_padding_low is in the range
+      // of the amounts of valid high padding, we choose backward_padding_low
+      // because cuDNN supports even padding only.
+      dim->set_padding_high(backward_padding_low);
+    } else {
+      // Otherwise, we choose the amount that's closest to backward_padding_low,
+      // and CudnnConvPaddingLegalization will later insert kSlice
+      // instructions to enforce even padding.
+      //
+      // For example, consider the backward convolution pattern
+      //
+      //   ab     xy
+      //   | pad  | reverse
+      //  .a.b    yx
+      //     \   /
+      //      ABC
+      //
+      // The amount of low padding on activations (in backward convolution) is
+      //   backward_padding_low = kernel_size - 1 - forward_padding_low
+      //                        = 2 - 1 - 1 = 0
+      //
+      // The amount of padding high must be between 1 and 2, in order to make
+      // Conv(ABC, xy, stride=2) produce exactly 2 elements (ab). 0 is not in
+      // the range of [1,2], so we pick the closest valid amount of padding
+      // high, which is 1 in this case. Therefore, we fuse the above pattern to
+      //
+      //   ABC = BackwardInputConv(ab, xy, stride=2, padding_high=1)
+      if (backward_padding_low < min_padding_high) {
+        dim->set_padding_high(min_padding_high);
+      } else {
+        dim->set_padding_high(max_padding_high);
+      }
+    }
+    // CudnnConvPaddingLegalization doesn't handle backward input
+    // convolution with negative padding for now. So fall back to unfused
+    // convolution in case of negative padding. For example,
+    //   ABCD = Conv(abc, reverse(xy), padding_high=2)
+    // could be fused to
+    //   ABCD = BackwardInputConv(abc, xy, padding_low=1, padding_high=-1)
+    // with positive padding low but negative padding high.
+    if (dim->padding_high() < 0) {
+      LOG(ERROR) << "Fusing this pattern to backward convolution would cause "
+                    "negative padding ("
+                 << dim->padding_high()
+                 << ") on right/bottom of the activations, which is not "
+                    "supported by CudnnConvPaddingLegalization (b/32744257). "
+                    "Falling back to unfused convolution for instruction: "
+                 << conv->ToString();
+      return no_match_result;
+    }
+  }
+
+  // OK, it's a match! Switch the input feature dimension with the output
+  // feature dimension. This is the way cuDNN expects it to be.
+  dnums.set_kernel_input_feature_dimension(
+      conv->convolution_dimension_numbers().kernel_output_feature_dimension());
+  dnums.set_kernel_output_feature_dimension(
+      conv->convolution_dimension_numbers().kernel_input_feature_dimension());
+
+  // If we matched against a constant, we need to add a reverse op that can be
+  // subsumed by the cuDNN call. algebraic-simplifier will later remove any
+  // unnecessary reverses.
+  if (reverse_filter->opcode() != HloOpcode::kReverse &&
+      reverse_filter->IsConstant()) {
+    // Create a double-reverse, which is a nop.
+    HloComputation* c = conv->parent();
+    reverse_filter = c->AddInstruction(HloInstruction::CreateReverse(
+        reverse_filter->shape(), reverse_filter,
+        AsInt64Slice(dnums.kernel_spatial_dimensions())));
+    reverse_filter = c->AddInstruction(HloInstruction::CreateReverse(
+        reverse_filter->shape(), reverse_filter,
+        AsInt64Slice(dnums.kernel_spatial_dimensions())));
+    TF_CHECK_OK(conv->ReplaceOperandWith(/*operand_no=*/1, reverse_filter));
+  }
+
+  // Calculate the 'rhs' that goes into the backward input convolution.
+  HloInstruction* rhs = reverse_filter;
+  // One reverse is subsumed by the cuDNN call.
+  if (rhs->opcode() == HloOpcode::kReverse) {
+    rhs = rhs->mutable_operand(0);
+  }
+  if (conv->feature_group_count() == 1) {
+    return std::make_tuple(true, new_window, dnums, rhs);
+  }
+
+  // Handle grouped convolutions. Because we swapped the input feature dimension
+  // with the output feature dimension, we need to also reshape the kernel so
+  // that the 'feature_group_count' parameter still makes sense. The
+  // 'feature_group_count' parameter essentially specifies how often the
+  // 'kernel_input_feature_dimension' is repeated. So when we swap these
+  // dimensions, we need to divide the new 'kernel_input_feature_dimension' by
+  // 'feature_group_count' and multiply the new
+  // 'kernel_output_feature_dimension' by 'feature_group_count'.
+  Shape new_shape = rhs->shape();
+  int64 input_feature_dimension = dnums.kernel_input_feature_dimension();
+  int64 output_feature_dimension = dnums.kernel_output_feature_dimension();
+
+  // In the backward convolution case, the spatial dimensions become the
+  // feature dimensions, and we are guaranteed that the spatial dimensions are
+  // adjacent.
+  CHECK_EQ(std::abs(input_feature_dimension - output_feature_dimension), 1LL);
+  int64 input_features = new_shape.dimensions(input_feature_dimension);
+  int64 output_features = new_shape.dimensions(output_feature_dimension);
+  new_shape.set_dimensions(input_feature_dimension,
+                           input_features / conv->feature_group_count());
+  new_shape.set_dimensions(output_feature_dimension,
+                           output_features * conv->feature_group_count());
+  HloComputation* c = conv->parent();
+  rhs = c->AddInstruction(HloInstruction::CreateReshape(new_shape, rhs));
+  return std::make_tuple(true, new_window, dnums, rhs);
+}
+
+CudnnConvBackendConfig GetDefaultBackendConfig() {
+  CudnnConvBackendConfig config;
+  config.set_conv_result_scale(1);
+  return config;
+}
+
+// Tries to rewrite a single convolution into a call to cudnn.
+StatusOr<bool> RunOnInstruction(HloInstruction* conv) {
+  CHECK_EQ(conv->opcode(), HloOpcode::kConvolution);
+
+  HloInstruction* custom_call = [&]() -> HloInstruction* {
+    bool match;
+    Window window;
+    ConvolutionDimensionNumbers dnums;
+    HloInstruction* rhs;
+
+    std::tie(match, window, dnums) = MatchBackwardFilter(conv);
+    if (match) {
+      return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(),
+                             conv->mutable_operand(0), conv->mutable_operand(1),
+                             window, dnums, conv->feature_group_count(),
+                             conv->metadata());
+    }
+
+    std::tie(match, window, dnums, rhs) = MatchBackwardInput(conv);
+    if (match) {
+      return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, conv->shape(),
+                             conv->mutable_operand(0), rhs, window, dnums,
+                             conv->feature_group_count(), conv->metadata());
+    }
+
+    // If all else fails, try a forward convolution.
+    if (CanImplementAsCudnnForwardConv(conv)) {
+      return CreateCudnnConv(kCudnnConvForwardCallTarget, conv->shape(),
+                             conv->mutable_operand(0), conv->mutable_operand(1),
+                             conv->window(),
+                             conv->convolution_dimension_numbers(),
+                             conv->feature_group_count(), conv->metadata());
+    }
+
+    return nullptr;
+  }();
+
+  if (custom_call == nullptr) {
+    return false;
+  }
+
+  TF_RETURN_IF_ERROR(
+      custom_call->set_backend_config(GetDefaultBackendConfig()));
+
+  VLOG(1) << "Replacing convolution " << conv->ToString() << " with "
+          << custom_call->ToString();
+
+  // The CustomCall returns a tuple (conv_result, scratch_memory).  Extract out
+  // the conv result and replace `conv` with it.
+  TF_RETURN_IF_ERROR(conv->parent()->ReplaceWithNewInstruction(
+      conv,
+      HloInstruction::CreateGetTupleElement(conv->shape(), custom_call, 0)));
+  return true;
+}
+
+// Rewrites the convolutions in the given computation into calls to cudnn.
+// Returns true if it made any changes.
+StatusOr<bool> RunOnComputation(HloComputation* computation) {
+  std::vector<HloInstruction*> convs;
+  for (auto* hlo : computation->instructions()) {
+    if (hlo->opcode() == HloOpcode::kConvolution) {
+      convs.push_back(hlo);
+    }
+  }
+
+  bool changed = false;
+  for (HloInstruction* conv : convs) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(conv));
+    changed |= result;
+  }
+  return changed;
+}
+}  // namespace
+
+StatusOr<bool> CudnnConvRewriter::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8ec72c27bab8912d0dc2aeead114eb010b87b78
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_REWRITER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+// Rewrites plain convolutions, backwards-filter convolutions, and
+// backwards-input convolutions into CustomCall HLOs that call into cuDNN.
+class CudnnConvRewriter : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "cudnn-conv-rewriter"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..443883a89f66a747def1049bc5afb53fec3c2409
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
@@ -0,0 +1,627 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h"
+
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+using ::testing::_;
+
+class CudnnConvRewriterTest : public HloTestBase {
+ public:
+  CudnnConvRewriterTest()
+      : HloTestBase(/*layout_sensitive=*/true,
+                    /*allow_mixed_precision=*/false) {
+    for (int i = 0; i < 2; ++i) {
+      WindowDimension* window_dim = default_conv_window_.add_dimensions();
+      window_dim->set_size(1);
+      window_dim->set_stride(1);
+      window_dim->set_padding_low(0);
+      window_dim->set_padding_high(0);
+      window_dim->set_window_dilation(1);
+      window_dim->set_base_dilation(1);
+    }
+    // TF data shapes are by default in the NHWC order, and filter shape is by
+    // default in HWIO order. For backward filter convolution, we need to swap
+    // the batch and feature dimension in the activations, and treat the batch
+    // dimension in gradients as the input feature dimension in the filter.
+    //
+    // TODO(jingyue): Add more tests on NCHW input order, which TF also
+    // supports.
+    tf_default_dnums_for_backward_filter_.set_input_batch_dimension(3);
+    tf_default_dnums_for_backward_filter_.set_input_feature_dimension(0);
+    tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(1);
+    tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(2);
+    tf_default_dnums_for_backward_filter_.set_kernel_input_feature_dimension(0);
+    tf_default_dnums_for_backward_filter_.set_kernel_output_feature_dimension(
+        3);
+    tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(1);
+    tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(2);
+    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(0);
+    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(1);
+    tf_default_dnums_for_backward_filter_.set_output_batch_dimension(2);
+    tf_default_dnums_for_backward_filter_.set_output_feature_dimension(3);
+
+    tf_default_dnums_for_backward_input_.set_input_batch_dimension(0);
+    tf_default_dnums_for_backward_input_.set_output_batch_dimension(0);
+    tf_default_dnums_for_backward_input_.set_input_feature_dimension(3);
+    tf_default_dnums_for_backward_input_.set_output_feature_dimension(3);
+    tf_default_dnums_for_backward_input_.add_input_spatial_dimensions(1);
+    tf_default_dnums_for_backward_input_.add_output_spatial_dimensions(1);
+    tf_default_dnums_for_backward_input_.add_input_spatial_dimensions(2);
+    tf_default_dnums_for_backward_input_.add_output_spatial_dimensions(2);
+    tf_default_dnums_for_backward_input_.set_kernel_input_feature_dimension(3);
+    tf_default_dnums_for_backward_input_.set_kernel_output_feature_dimension(2);
+    tf_default_dnums_for_backward_input_.add_kernel_spatial_dimensions(0);
+    tf_default_dnums_for_backward_input_.add_kernel_spatial_dimensions(1);
+  }
+
+ protected:
+  bool RunPass(HloModule* module) {
+    return CudnnConvRewriter().Run(module).ValueOrDie();
+  }
+
+  // A convolution window with stride 1 and zero padding. The size fields are
+  // not set.
+  Window default_conv_window_;
+  ConvolutionDimensionNumbers tf_default_dnums_for_backward_filter_;
+  ConvolutionDimensionNumbers tf_default_dnums_for_backward_input_;
+};
+
+TEST_F(CudnnConvRewriterTest, BackwardFilterConvolve) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* activations =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "activations"));
+  HloInstruction* gradients =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {1, 1, 2, 1}), "gradients"));
+  Window conv_window = default_conv_window_;
+  conv_window.mutable_dimensions(1)->set_size(2);
+  conv_window.mutable_dimensions(1)->set_window_dilation(2);
+  auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeInference::InferConvolveShape(
+          activations->shape(), gradients->shape(), /*feature_group_count=*/1,
+          conv_window, tf_default_dnums_for_backward_filter_)
+          .ConsumeValueOrDie(),
+      activations, gradients, /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
+
+  OpMetadata metadata;
+  metadata.set_op_name("foo");
+  conv->set_metadata(metadata);
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
+
+  // Check that metadata was preserved.
+  const auto& md_after_opt =
+      entry_computation->root_instruction()->operand(0)->metadata();
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(md_after_opt, metadata))
+      << md_after_opt.DebugString() << " vs " << metadata.DebugString();
+}
+
+TEST_F(CudnnConvRewriterTest,
+       BackwardFilterConvolveEquivalentToForwardConvolution) {
+  HloComputation::Builder builder(TestName());
+  HloInstruction* activations =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "activations"));
+  HloInstruction* gradients =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "gradients"));
+  Window conv_window = default_conv_window_;
+  conv_window.mutable_dimensions(1)->set_size(3);
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeInference::InferConvolveShape(
+          activations->shape(), gradients->shape(), /*feature_group_count=*/1,
+          conv_window, tf_default_dnums_for_backward_filter_)
+          .ConsumeValueOrDie(),
+      activations, gradients, /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
+}
+
+// Extracted from block35 training.
+TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithPaddedActivations) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* activations =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {20, 35, 35, 32}), "activations"));
+  HloInstruction* gradients =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {20, 35, 35, 32}), "gradients"));
+
+  Window conv_window = default_conv_window_;
+  for (int i = 0; i < 2; ++i) {
+    conv_window.mutable_dimensions(i)->set_size(35);
+    conv_window.mutable_dimensions(i)->set_padding_low(1);
+    conv_window.mutable_dimensions(i)->set_padding_high(1);
+  }
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {32, 3, 3, 32}), activations, gradients,
+      /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
+}
+
+// Extracted from inception v3 training.
+TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithPaddedGradients) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* activations =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {20, 10, 10, 192}), "activations"));
+  HloInstruction* gradients =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {20, 4, 4, 320}), "gradients"));
+
+  Window conv_window = default_conv_window_;
+  for (int i = 0; i < 2; ++i) {
+    conv_window.mutable_dimensions(i)->set_size(4);
+    conv_window.mutable_dimensions(i)->set_padding_high(-1);
+    conv_window.mutable_dimensions(i)->set_window_dilation(2);
+  }
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {320, 3, 3, 192}), activations, gradients,
+      /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
+}
+
+TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithUnevenPadding) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* activations =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {20, 35, 35, 32}), "activations"));
+  HloInstruction* gradients =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {20, 35, 35, 32}), "gradients"));
+
+  Window conv_window = default_conv_window_;
+  for (int i = 0; i < 2; ++i) {
+    conv_window.mutable_dimensions(i)->set_size(35);
+    // Uneven padding: padding_low=0, padding_high=1
+    conv_window.mutable_dimensions(i)->set_padding_high(1);
+  }
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {32, 2, 2, 32}), activations, gradients,
+      /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
+}
+
+TEST_F(CudnnConvRewriterTest, BackwardInputConvolveEvenPadding) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* output =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {4, 5, 16, 16}), "output"));
+  HloInstruction* kernel =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {5, 3, 7, 7}), "kernel"));
+  HloInstruction* reverse_kernel = builder.AddInstruction(
+      HloInstruction::CreateReverse(kernel->shape(), kernel, {2, 3}));
+
+  Window conv_window = default_conv_window_;
+  for (int i = 0; i < 2; ++i) {
+    conv_window.mutable_dimensions(i)->set_size(7);
+    conv_window.mutable_dimensions(i)->set_padding_low(3);
+    conv_window.mutable_dimensions(i)->set_padding_high(3);
+  }
+  ConvolutionDimensionNumbers conv_dnums;
+  conv_dnums.set_input_batch_dimension(0);
+  conv_dnums.set_output_batch_dimension(0);
+  conv_dnums.set_input_feature_dimension(1);
+  conv_dnums.set_output_feature_dimension(1);
+  conv_dnums.add_input_spatial_dimensions(2);
+  conv_dnums.add_output_spatial_dimensions(2);
+  conv_dnums.add_input_spatial_dimensions(3);
+  conv_dnums.add_output_spatial_dimensions(3);
+  conv_dnums.set_kernel_input_feature_dimension(0);
+  conv_dnums.set_kernel_output_feature_dimension(1);
+  conv_dnums.add_kernel_spatial_dimensions(2);
+  conv_dnums.add_kernel_spatial_dimensions(3);
+
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {4, 3, 16, 16}), /*lhs=*/output,
+      /*rhs=*/reverse_kernel, /*feature_group_count=*/1, conv_window,
+      conv_dnums, DefaultPrecisionConfig(2)));
+  // Verify the convolution's shape is consistent with ShapeInference.
+  CHECK(ShapeUtil::Compatible(
+      conv->shape(), ShapeInference::InferConvolveShape(
+                         output->shape(), reverse_kernel->shape(),
+                         /*feature_group_count=*/1, conv_window, conv_dnums)
+                         .ValueOrDie()));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
+  const HloInstruction* custom_call =
+      entry_computation->root_instruction()->operand(0);
+  for (int i = 0; i < 2; ++i) {
+    const WindowDimension& window_dim = custom_call->window().dimensions(i);
+    // Low padding of the backward input convolution
+    //   = kernel_size - 1 - low padding on gradients.
+    EXPECT_EQ(3, window_dim.padding_low());
+    EXPECT_EQ(3, window_dim.padding_high());
+    EXPECT_EQ(1, window_dim.stride());
+    EXPECT_EQ(1, window_dim.base_dilation());
+  }
+}
+
+// Convolve([abc], [x], base_dilation=2)
+//   = Convolve([abc], Reverse([x]), base_dilation=2)
+//   = BackwardInputConvolve([abc], [x], stride=2)
+TEST_F(CudnnConvRewriterTest, BackwardInputConvolve1x1Filter) {
+  auto builder = HloComputation::Builder(TestName());
+  // NHWC dimension order.
+  HloInstruction* output =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "output"));
+  // HWOI dimension order.
+  HloInstruction* kernel =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {1, 1, 1, 1}), "kernel"));
+
+  Window conv_window = default_conv_window_;
+  conv_window.mutable_dimensions(1)->set_base_dilation(2);
+
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeInference::InferConvolveShape(output->shape(), kernel->shape(),
+                                         /*feature_group_count=*/1, conv_window,
+                                         tf_default_dnums_for_backward_input_)
+          .ConsumeValueOrDie(),
+      /*lhs=*/output, /*rhs=*/kernel, /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
+}
+
+// BackwardInputConvolve([abc], [x], stride=1) is equivalent to
+// ForwardConvolve([abc], [x], stride=1). No need to fold it into backward input
+// convolution.
+TEST_F(CudnnConvRewriterTest,
+       BackwardInputConvolve1x1FilterEquivalentToForwardConvolve) {
+  auto builder = HloComputation::Builder(TestName());
+  // NHWC dimension order.
+  HloInstruction* output =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "output"));
+  // HWOI dimension order.
+  HloInstruction* kernel =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {1, 1, 1, 1}), "kernel"));
+
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeInference::InferConvolveShape(
+          output->shape(), kernel->shape(), /*feature_group_count=*/1,
+          default_conv_window_, tf_default_dnums_for_backward_input_)
+          .ConsumeValueOrDie(),
+      /*lhs=*/output, /*rhs=*/kernel, /*feature_group_count=*/1,
+      default_conv_window_, tf_default_dnums_for_backward_input_,
+      DefaultPrecisionConfig(2)));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
+}
+
+// Extracted from Inception V3 training.
+//
+//                                  filter(HWIO)
+//                                  3x3x192x320
+//                                      |
+//                                      v
+//      gradients(NHWC)              reverse
+//        20x4x4x320               3x3x192x320
+//                    \            /
+//                     \          /
+//  conv (NHWC) with padding (low=2,high=3,interior=1)
+//                     20x10x10x192
+//
+// Gradients are padded unevenly.
+TEST_F(CudnnConvRewriterTest, BackwardInputConvolveUnevenPaddingOnGradients) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* output =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {20, 4, 4, 320}), "output"));
+  HloInstruction* kernel =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {3, 3, 192, 320}), "kernel"));
+  HloInstruction* reverse_kernel = builder.AddInstruction(
+      HloInstruction::CreateReverse(kernel->shape(), kernel, {0, 1}));
+
+  Window conv_window = default_conv_window_;
+  for (int i = 0; i < 2; ++i) {
+    conv_window.mutable_dimensions(i)->set_size(3);
+    conv_window.mutable_dimensions(i)->set_padding_low(2);
+    conv_window.mutable_dimensions(i)->set_padding_high(3);
+    // Interior padding = 1.
+    conv_window.mutable_dimensions(i)->set_base_dilation(2);
+  }
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {20, 10, 10, 192}), output, reverse_kernel,
+      /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
+  // Verify the convolution's shape is consistent with ShapeInference.
+  CHECK(ShapeUtil::Compatible(
+      conv->shape(),
+      ShapeInference::InferConvolveShape(
+          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
+          conv_window, tf_default_dnums_for_backward_input_)
+          .ValueOrDie()));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
+  const HloInstruction* custom_call =
+      entry_computation->root_instruction()->operand(0);
+  for (int i = 0; i < 2; ++i) {
+    const WindowDimension& window_dim = custom_call->window().dimensions(i);
+    EXPECT_EQ(0, window_dim.padding_low());
+    EXPECT_EQ(0, window_dim.padding_high());
+    EXPECT_EQ(2, window_dim.stride());
+    EXPECT_EQ(1, window_dim.base_dilation());
+  }
+}
+
+// Similar to BackwardInputConvolveUnevenPadding, but the low padding of the
+// gradients exceeds kernel_size - 1. Therefore, this pattern cannot be fused.
+TEST_F(CudnnConvRewriterTest, BackwardInputConvolveLowPaddingTooLarge) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* output =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {20, 4, 4, 320}), "output"));
+  HloInstruction* kernel =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {3, 3, 192, 320}), "kernel"));
+  HloInstruction* reverse_kernel = builder.AddInstruction(
+      HloInstruction::CreateReverse(kernel->shape(), kernel, {0, 1}));
+
+  Window conv_window = default_conv_window_;
+  for (int i = 0; i < 2; ++i) {
+    conv_window.mutable_dimensions(i)->set_size(3);
+    conv_window.mutable_dimensions(i)->set_padding_low(3);
+    conv_window.mutable_dimensions(i)->set_padding_high(2);
+    conv_window.mutable_dimensions(i)->set_base_dilation(2);
+  }
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {20, 10, 10, 192}), output, reverse_kernel,
+      /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
+  // Verify the convolution's shape is consistent with ShapeInference.
+  CHECK(ShapeUtil::Compatible(
+      conv->shape(),
+      ShapeInference::InferConvolveShape(
+          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
+          conv_window, tf_default_dnums_for_backward_input_)
+          .ValueOrDie()));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
+}
+
+// Extracted from Resnet-50.
+//
+// For simplicity, we focus on the column dimension and ignore other dimensions.
+// We use [?] to represent the shape instead of the content.
+//
+// Suppose operator FC does
+//   [4] = conv([14], [3], stride=2, padding_high=1)  // Padding::kSame
+//
+// BC = BackwardInput(FC) does:
+//   [14] = conv([7], reverse([3]),
+//               padding_low=2, padding_high=1, base_dilation=2)
+//
+// We should fuse BC even though padding on activations is uneven, because
+// CudnnConvPaddingLegalization will canonicalize the fusion HLO.
+TEST_F(CudnnConvRewriterTest, BackwardInputConvolveUnevenPaddingOnActivations) {
+  auto builder = HloComputation::Builder(TestName());
+  // The gradients are in NCHW layout.
+  HloInstruction* output =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 1, 7, 1}), "output"));
+  // The kernel is in HWIO layout.
+  HloInstruction* kernel =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {1, 3, 1, 1}), "kernel"));
+  HloInstruction* reverse_kernel = builder.AddInstruction(
+      HloInstruction::CreateReverse(kernel->shape(), kernel, {0, 1}));
+
+  Window conv_window = default_conv_window_;
+  WindowDimension* forward_conv_col_dim = conv_window.mutable_dimensions(1);
+  forward_conv_col_dim->set_size(3);
+  forward_conv_col_dim->set_padding_low(2);
+  forward_conv_col_dim->set_padding_high(1);
+  forward_conv_col_dim->set_base_dilation(2);
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {1, 1, 14, 1}), output, reverse_kernel,
+      /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
+  // Verify the convolution's shape is consistent with ShapeInference.
+  CHECK(ShapeUtil::Compatible(
+      conv->shape(),
+      ShapeInference::InferConvolveShape(
+          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
+          conv_window, tf_default_dnums_for_backward_input_)
+          .ValueOrDie()));
+
+  auto module = CreateNewVerifiedModule();
+  const HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  ASSERT_THAT(entry_computation->root_instruction(),
+              op::GetTupleElement(
+                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
+  const WindowDimension& backward_conv_col_dim =
+      entry_computation->root_instruction()->operand(0)->window().dimensions(1);
+  EXPECT_EQ(0, backward_conv_col_dim.padding_low());
+  EXPECT_EQ(1, backward_conv_col_dim.padding_high());
+}
+
+// For simplicity, we focus on the column dimension and ignore other dimensions.
+// We use [?] to represent the shape instead of the content.
+//
+// Suppose operator FC does
+//   [3] = conv([4], [2], padding_low=1, padding_high=-1)
+//
+// BC = BackwardInput(FC) does:
+//   [4] = conv([3], reverse([2]), padding_high=2)
+//
+// We currently don't fuse BC because CudnnConvPaddingLegalization
+// doesn't support negative padding on the gradients of backward convolution
+// (b/32744257).
+TEST_F(CudnnConvRewriterTest,
+       BackwardInputConvolveNegativePaddingHighOnActivations) {
+  auto builder = HloComputation::Builder(TestName());
+  // The gradients are in NCHW layout.
+  HloInstruction* output =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "output"));
+  // The kernel is in HWIO layout.
+  HloInstruction* kernel =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(F32, {1, 2, 1, 1}), "kernel"));
+  HloInstruction* reverse_kernel = builder.AddInstruction(
+      HloInstruction::CreateReverse(kernel->shape(), kernel, {0, 1}));
+
+  Window conv_window = default_conv_window_;
+  WindowDimension* forward_conv_col_dim = conv_window.mutable_dimensions(1);
+  forward_conv_col_dim->set_size(2);
+  forward_conv_col_dim->set_padding_high(2);
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {1, 1, 4, 1}), output, reverse_kernel,
+      /*feature_group_count=*/1, conv_window,
+      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
+  // Verify the convolution's shape is consistent with ShapeInference.
+  CHECK(ShapeUtil::Compatible(
+      conv->shape(),
+      ShapeInference::InferConvolveShape(
+          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
+          conv_window, tf_default_dnums_for_backward_input_)
+          .ValueOrDie()));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(RunPass(module.get()));
+  EXPECT_THAT(
+      entry_computation->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
+}
+
+// Check that we will materialize a reversed version of a constant in order to
+// pattern-match a backwards input convolution.
+TEST_F(CudnnConvRewriterTest, BackwardInputConvolveConstantFilter) {
+  Array4D<float> constant_arr(4, 4, 2, 2);
+  constant_arr.FillIota(0);
+  string constant_str =
+      LiteralUtil::CreateR4FromArray4D(constant_arr).ToString();
+
+  const string module_str = absl::StrFormat(R"(
+    HloModule test
+
+    ENTRY entry_computation {
+      param0 = f32[128,2,16,16]{3,2,1,0} parameter(0)
+      constant = f32[4,4,2,2]{3,2,1,0} constant(%s)
+      ROOT convolution = f32[128,2,32,32]{3,2,1,0} convolution(param0, constant),
+          window={size=4x4 pad=2_2x2_2 lhs_dilate=2x2},
+          dim_labels=bf01_01oi->bf01, feature_group_count=1
+    })",
+                                            constant_str);
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(module_str));
+
+  EXPECT_TRUE(RunPass(m.get()));
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      op::GetTupleElement(op::CustomCall(kCudnnConvBackwardInputCallTarget, _,
+                                         op::Reverse(op::Constant())),
+                          0));
+}
+
+}  // anonymous namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3425e1b4942aaf1011ba1bf1c50dd7e79c1f9807
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
@@ -0,0 +1,430 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+using se::DeviceMemory;
+using se::DeviceMemoryBase;
+using se::Stream;
+using se::dnn::AlgorithmConfig;
+using se::dnn::BatchDescriptor;
+using se::dnn::ConvolutionDescriptor;
+using se::dnn::DataLayout;
+using se::dnn::DimIndex;
+using se::dnn::FilterDescriptor;
+using se::dnn::FilterLayout;
+using se::dnn::ProfileResult;
+
+struct CudnnConvParams {
+  // Here are the fields related to cuDNN's fused convolution. The result thus
+  // is defined as:
+  //   activation(conv_result_scale * conv(x, w) +
+  //       side_input_scale * side_input + broadcast(bias))
+  //
+  // The most common fused conv is conv forward + relu/identity, for example.
+  //
+  // bias_buf is a single-dimensional array, with the length equal to the number
+  // of output features. It'll be broadcasted to the output shape in order to be
+  // added to the final results.
+  //
+  // side_input_buf, if valid, must have the same shape as the output buffer.
+  struct FusionParams {
+    se::dnn::ActivationMode mode;
+    double side_input_scale;
+    se::DeviceMemoryBase bias_buf;
+    se::DeviceMemoryBase side_input_buf;  // nullable
+  };
+
+  CudnnConvKind kind;
+  const Shape* input_shape;
+  const Shape* filter_shape;
+  const Shape* output_shape;
+  se::DeviceMemoryBase input_buf;
+  se::DeviceMemoryBase filter_buf;
+  se::DeviceMemoryBase output_buf;
+  const Window* window;
+  const ConvolutionDimensionNumbers* dnums;
+  int64 feature_group_count;
+  se::dnn::AlgorithmConfig algorithm;
+  double conv_result_scale;
+
+  absl::optional<FusionParams> fusion;
+};
+
+// A StreamExecutor ScratchAllocator that wraps a single XLA allocation,
+// returning it (in its entirety) the first time Allocate() is called.
+class ScratchBufAllocator : public se::ScratchAllocator {
+ public:
+  explicit ScratchBufAllocator(se::DeviceMemoryBase scratch)
+      : scratch_(scratch) {}
+
+  ~ScratchBufAllocator() override = default;
+
+  int64 GetMemoryLimitInBytes(se::Stream* /*stream*/) override {
+    return scratch_.size();
+  }
+
+  se::port::StatusOr<DeviceMemory<uint8>> AllocateBytes(
+      se::Stream* stream, int64 byte_size) override {
+    if (allocated_) {
+      return se::port::InternalError(
+          "Can't allocate twice from a ScratchBufAllocator.");
+    }
+    if (byte_size > scratch_.size()) {
+      return se::port::InternalError(absl::StrCat(
+          "Can't allocate ", byte_size,
+          " bytes from a ScratchBufAllocator of size ", scratch_.size()));
+    }
+
+    allocated_ = true;
+    return se::DeviceMemory<uint8>(scratch_);
+  }
+
+ private:
+  se::DeviceMemoryBase scratch_;
+  bool allocated_ = false;
+};
+
+template <typename T>
+Status RunCudnnConvImpl(CudnnConvParams params,
+                        se::ScratchAllocator* scratch_allocator,
+                        se::Stream* stream,
+                        se::dnn::ProfileResult* profile_result) {
+  CudnnConvKind kind = params.kind;
+  const Shape& input_shape = *params.input_shape;
+  const Shape& filter_shape = *params.filter_shape;
+  const Shape& output_shape = *params.output_shape;
+  DeviceMemory<T> input_buf(params.input_buf);
+  DeviceMemory<T> filter_buf(params.filter_buf);
+  DeviceMemory<T> output_buf(params.output_buf);
+  const Window& window = *params.window;
+  const ConvolutionDimensionNumbers& dnums = *params.dnums;
+  int64 feature_group_count = params.feature_group_count;
+  AlgorithmConfig algorithm = params.algorithm;
+
+  VLOG(3) << "Convolution Algorithm: " << algorithm.algorithm()->algo_id();
+  VLOG(3) << "tensor_ops_enabled: "
+          << algorithm.algorithm()->tensor_ops_enabled();
+  VLOG(3) << "Convolution kind: " << CudnnConvKindToString(kind);
+  VLOG(3) << "input shape: " << ShapeUtil::HumanStringWithLayout(input_shape);
+  VLOG(3) << "filter shape: " << ShapeUtil::HumanStringWithLayout(filter_shape);
+  VLOG(3) << "Output shape: " << ShapeUtil::HumanStringWithLayout(output_shape);
+  VLOG(3) << "Window: { " << window.ShortDebugString() << " }";
+  VLOG(3) << "Dim nums: { " << dnums.ShortDebugString() << " }";
+
+  const int num_dimensions = window.dimensions_size();
+  CHECK_LE(num_dimensions, 3);
+  CHECK_GE(num_dimensions, 1);
+  // cuDNN does not support 1D convolutions. We therefore express 1D
+  // convolutions as 2D convolutions where the first spatial dimension is 1.
+  // This matches the behavior of TF (see definition of conv1d in
+  // tensorflow/python/ops/nn_ops.py).
+  const int effective_num_dimensions = std::max(2, num_dimensions);
+
+  CHECK_EQ(primitive_util::NativeToPrimitiveType<T>(),
+           output_shape.element_type())
+      << ShapeUtil::HumanString(output_shape);
+
+  // If one dimension is reversed, we need to have all dimensions reversed (so
+  // we're doing convolution not cross correlation).
+  const bool dims_reversed = window.dimensions()[0].window_reversal();
+
+  CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size());
+  for (const WindowDimension& dim : window.dimensions()) {
+    CHECK_EQ(dims_reversed, dim.window_reversal());
+    CHECK_EQ(dim.padding_low(), dim.padding_high());
+    CHECK_EQ(dim.base_dilation(), 1)
+        << "cudnn does not support base dilation; it "
+           "must be made explicit with a kPad";
+    CHECK_EQ(dim.window_dilation(), 1)
+        << "XLA does not support window dilation (although cudnn does); it "
+           "must be made explicit with a kPad";
+  }
+
+  // cuDNN's convolution APIs support the BDYX layout for activations/output and
+  // the OIYX layout for weights.
+  DataLayout input_dl;
+  FilterLayout filter_dl;
+  DataLayout output_dl;
+
+  TF_ASSIGN_OR_RETURN(std::tie(input_dl, filter_dl, output_dl),
+                      XlaConvLayoutsToStreamExecutorLayouts(
+                          dnums, input_shape.layout(), filter_shape.layout(),
+                          output_shape.layout()));
+
+  BatchDescriptor input_descriptor(effective_num_dimensions);
+  input_descriptor.set_layout(input_dl)
+      .set_feature_map_count(
+          input_shape.dimensions(dnums.input_feature_dimension()))
+      .set_count(input_shape.dimensions(dnums.input_batch_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    // Note that the dimensions are reversed. The same holds below.
+    input_descriptor.set_spatial_dim(
+        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+        input_shape.dimensions(dnums.input_spatial_dimensions(dim)));
+  }
+
+  FilterDescriptor filter_descriptor(effective_num_dimensions);
+  filter_descriptor.set_layout(filter_dl)
+      .set_input_feature_map_count(
+          filter_shape.dimensions(dnums.kernel_input_feature_dimension()))
+      .set_output_feature_map_count(
+          filter_shape.dimensions(dnums.kernel_output_feature_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    filter_descriptor.set_spatial_dim(
+        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+        filter_shape.dimensions(dnums.kernel_spatial_dimensions(dim)));
+  }
+
+  ConvolutionDescriptor convolution_descriptor(effective_num_dimensions);
+  convolution_descriptor.set_group_count(feature_group_count);
+  convolution_descriptor.set_convolution_not_crosscorr(dims_reversed);
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    convolution_descriptor
+        .set_zero_padding(
+            static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+            window.dimensions(dim).padding_low())
+        .set_filter_stride(
+            static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+            window.dimensions(dim).stride());
+  }
+
+  BatchDescriptor output_descriptor(effective_num_dimensions);
+  output_descriptor.set_layout(output_dl)
+      .set_feature_map_count(
+          output_shape.dimensions(dnums.output_feature_dimension()))
+      .set_count(output_shape.dimensions(dnums.output_batch_dimension()));
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    output_descriptor.set_spatial_dim(
+        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
+        output_shape.dimensions(dnums.output_spatial_dimensions(dim)));
+  }
+
+  // Add a singleton dimension in the 1D convolution case.
+  if (num_dimensions == 1) {
+    input_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
+    output_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
+    filter_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
+    convolution_descriptor.set_zero_padding(static_cast<DimIndex>(0), 0)
+        .set_filter_stride(static_cast<DimIndex>(0), 1);
+  }
+
+  switch (kind) {
+    case CudnnConvKind::kForward:
+      if (params.conv_result_scale != 1) {
+        return InternalError(
+            "StreamExecutor doesn't support scaled convolution: %lf.",
+            params.conv_result_scale);
+      }
+      stream->ThenConvolveWithAlgorithm(
+          input_descriptor, input_buf, filter_descriptor, filter_buf,
+          convolution_descriptor, output_descriptor, &output_buf,
+          scratch_allocator, algorithm, profile_result);
+      break;
+    case CudnnConvKind::kBackwardInput:
+      if (params.conv_result_scale != 1) {
+        return InternalError(
+            "StreamExecutor doesn't support scaled convolution: %lf.",
+            params.conv_result_scale);
+      }
+      stream->ThenConvolveBackwardDataWithAlgorithm(
+          filter_descriptor, filter_buf, output_descriptor, output_buf,
+          convolution_descriptor, input_descriptor, &input_buf,
+          scratch_allocator, algorithm, profile_result);
+      break;
+    case CudnnConvKind::kBackwardFilter:
+      if (params.conv_result_scale != 1) {
+        return InternalError(
+            "StreamExecutor doesn't support scaled convolution: %lf.",
+            params.conv_result_scale);
+      }
+      stream->ThenConvolveBackwardFilterWithAlgorithm(
+          input_descriptor, input_buf, output_descriptor, output_buf,
+          convolution_descriptor, filter_descriptor, &filter_buf,
+          scratch_allocator, algorithm, profile_result);
+      break;
+    case CudnnConvKind::kForwardActivation: {
+      BatchDescriptor bias_desc;
+      bias_desc.set_count(1)
+          .set_height(1)
+          .set_width(1)
+          .set_feature_map_count(
+              output_shape.dimensions(dnums.output_feature_dimension()))
+          .set_layout(output_dl);
+
+      se::DeviceMemory<T> side_input(params.fusion->side_input_buf);
+      // If there is no side input, use output as the side input.
+      if (side_input.is_null()) {
+        if (params.fusion->side_input_scale != 0) {
+          return InternalError(
+              "Side input scale is not 0, yet no side input buffer is "
+              "provided");
+        }
+        // Since side-input scale is 0, the values in the side input don't
+        // matter.  The simplest thing to do would be to pass in a null buffer
+        // for the side input, but cudnn doesn't allow this.  cudnn does promise
+        // that if side-input-scale is 0 the side input won't be read, so we
+        // just pass in the output buffer, since it's handy and has the correct
+        // size.
+        side_input = output_buf;
+      }
+
+      stream->ThenFusedConvolveWithAlgorithm(
+          input_descriptor, input_buf, params.conv_result_scale,
+          filter_descriptor, filter_buf, convolution_descriptor, side_input,
+          params.fusion->side_input_scale, bias_desc,
+          DeviceMemory<T>(params.fusion->bias_buf), params.fusion->mode,
+          output_descriptor, &output_buf, scratch_allocator, algorithm,
+          profile_result);
+      break;
+    }
+  }
+
+  if (!stream->ok()) {
+    return InternalError(
+        "Unable to launch convolution with type %s and algorithm (%d, %d)",
+        CudnnConvKindToString(kind), algorithm.algorithm()->algo_id(),
+        algorithm.algorithm_no_scratch()->algo_id());
+  }
+  return Status::OK();
+}
+
+// Returns the cudnn convolution parameters generated from conv, which must be a
+// custom-call to a cudnn convolution.
+StatusOr<CudnnConvParams> GetCudnnConvParams(
+    const HloCustomCallInstruction* conv,
+    absl::Span<se::DeviceMemoryBase> operand_buffers,
+    se::DeviceMemoryBase result_buffer) {
+  CudnnConvParams params;
+
+  TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
+                      conv->backend_config<CudnnConvBackendConfig>());
+  TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(conv));
+  const auto& lhs_shape = conv->operand(0)->shape();
+  const auto& rhs_shape = conv->operand(1)->shape();
+  const auto& conv_result_shape = conv->shape().tuple_shapes(0);
+
+  params.kind = kind;
+  params.window = &conv->window();
+  params.dnums = &conv->convolution_dimension_numbers();
+  params.feature_group_count = conv->feature_group_count();
+  params.algorithm = se::dnn::AlgorithmConfig(se::dnn::AlgorithmDesc(
+      backend_config.algorithm(), backend_config.tensor_ops_enabled()));
+  params.conv_result_scale = backend_config.conv_result_scale();
+
+  switch (kind) {
+    case CudnnConvKind::kForward:
+      params.input_shape = &lhs_shape;
+      params.filter_shape = &rhs_shape;
+      params.output_shape = &conv_result_shape;
+      params.input_buf = operand_buffers[0];
+      params.filter_buf = operand_buffers[1];
+      params.output_buf = result_buffer;
+      break;
+    case CudnnConvKind::kBackwardInput:
+      params.input_shape = &conv_result_shape;
+      params.filter_shape = &rhs_shape;
+      params.output_shape = &lhs_shape;
+      params.input_buf = result_buffer;
+      params.filter_buf = operand_buffers[1];
+      params.output_buf = operand_buffers[0];
+      break;
+    case CudnnConvKind::kBackwardFilter:
+      params.input_shape = &lhs_shape;
+      params.filter_shape = &conv_result_shape;
+      params.output_shape = &rhs_shape;
+      params.input_buf = operand_buffers[0];
+      params.filter_buf = result_buffer;
+      params.output_buf = operand_buffers[1];
+      break;
+    case CudnnConvKind::kForwardActivation: {
+      params.kind = CudnnConvKind::kForwardActivation;
+      params.input_shape = &lhs_shape;
+      params.filter_shape = &rhs_shape;
+      params.output_shape = &conv_result_shape;
+      params.fusion.emplace();
+      auto& fusion = *params.fusion;
+      if (!se::dnn::ActivationMode_IsValid(backend_config.activation_mode())) {
+        return InternalError("Bad activation mode: %s",
+                             backend_config.ShortDebugString());
+      }
+      fusion.mode = static_cast<se::dnn::ActivationMode>(
+          backend_config.activation_mode());
+      fusion.side_input_scale = backend_config.side_input_scale();
+      params.input_buf = operand_buffers[0];
+      params.filter_buf = operand_buffers[1];
+      params.output_buf = result_buffer;
+      params.fusion->bias_buf = operand_buffers[2];
+      if (operand_buffers.size() >= 4) {
+        params.fusion->side_input_buf = operand_buffers[3];
+      }
+    }
+  }
+  return params;
+}
+
+}  // anonymous namespace
+
+Status RunCudnnConv(const HloCustomCallInstruction* conv,
+                    absl::Span<se::DeviceMemoryBase> operand_buffers,
+                    se::DeviceMemoryBase result_buffer,
+                    se::DeviceMemoryBase scratch_buf, se::Stream* stream,
+                    se::dnn::ProfileResult* profile_result) {
+  ScratchBufAllocator scratch_allocator(scratch_buf);
+  return RunCudnnConv(conv, operand_buffers, result_buffer, &scratch_allocator,
+                      stream, profile_result);
+}
+
+Status RunCudnnConv(const HloCustomCallInstruction* conv,
+                    absl::Span<se::DeviceMemoryBase> operand_buffers,
+                    se::DeviceMemoryBase result_buffer,
+                    se::ScratchAllocator* scratch_allocator, se::Stream* stream,
+                    se::dnn::ProfileResult* profile_result) {
+  TF_ASSIGN_OR_RETURN(CudnnConvParams params,
+                      GetCudnnConvParams(conv, operand_buffers, result_buffer));
+
+  PrimitiveType output_primitive_type =
+      conv->shape().tuple_shapes(0).element_type();
+  switch (output_primitive_type) {
+    case F16:
+      return RunCudnnConvImpl<Eigen::half>(params, scratch_allocator, stream,
+                                           profile_result);
+    case F32:
+      return RunCudnnConvImpl<float>(params, scratch_allocator, stream,
+                                     profile_result);
+    case F64:
+      return RunCudnnConvImpl<double>(params, scratch_allocator, stream,
+                                      profile_result);
+    default:
+      LOG(FATAL) << ShapeUtil::HumanString(*params.output_shape);
+  }
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..edbc75a94a1238540390b93f0fa5217852c7781f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_RUNNER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_RUNNER_H_
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// This file contains low-level routines for running cudnn convolutions.
+
+// Calls into cudnn to run the specified convolution.
+//
+// We provide one overload which takes a scratch buffer, and another which takes
+// an allocator which is responsible for allocating the scratch space.  In
+// theory the second one shouldn't be necessary -- users of this function could
+// just ask cudnn how much scratch space it needs for a particular convolution.
+// But in practice, StreamExecutor does not expose such an API, and in the name
+// of parsimony, perhaps it's better not to add it.  Instead, the first time you
+// call a convolution, you should call the version that takes a scratch
+// allocator and take note of how much memory is used.  The next time you call
+// the same conv, you can provide an explicitly preallocated scratch buffer of
+// that size, if you like.
+Status RunCudnnConv(const HloCustomCallInstruction* conv,
+                    absl::Span<se::DeviceMemoryBase> operand_buffers,
+                    se::DeviceMemoryBase result_buffer,
+                    se::DeviceMemoryBase scratch_buf, se::Stream* stream,
+                    se::dnn::ProfileResult* profile_result = nullptr);
+
+Status RunCudnnConv(const HloCustomCallInstruction* conv,
+                    absl::Span<se::DeviceMemoryBase> operand_buffers,
+                    se::DeviceMemoryBase result_buffer,
+                    se::ScratchAllocator* scratch_allocator, se::Stream* stream,
+                    se::dnn::ProfileResult* profile_result = nullptr);
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONV_RUNNER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
deleted file mode 100644
index 7125673887d28729287d67577bcfa06423f85611..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ /dev/null
@@ -1,411 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_format.h"
-#include "absl/types/optional.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
-#include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
-#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
-#include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-
-using absl::optional;
-using se::DeviceMemoryBase;
-using se::dnn::AlgorithmConfig;
-using se::dnn::AlgorithmDesc;
-
-class ScratchAllocator : public se::ScratchAllocator {
- public:
-  ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
-      : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
-
-  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
-    return 1LL << 32;  // 4GB.  TODO(jlebar): Tune this?
-  }
-  int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
-
-  StatusOr<se::DeviceMemory<uint8>> AllocateBytes(se::Stream* stream,
-                                                  int64 byte_size) override;
-
- private:
-  const int device_ordinal_;
-  DeviceMemoryAllocator* memory_allocator_;
-  std::vector<OwningDeviceMemory> allocated_buffers_;
-  int64 total_allocated_bytes_ = 0;
-};
-
-StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
-    se::Stream* stream, int64 byte_size) {
-  CHECK_GE(byte_size, 0) << "byte_size must be positive.";
-  if (byte_size > GetMemoryLimitInBytes(stream)) {
-    return se::port::Status(
-        se::port::error::RESOURCE_EXHAUSTED,
-        absl::StrFormat(
-            "Allocating %d bytes exceeds the memory limit of %d bytes.",
-            byte_size, GetMemoryLimitInBytes(stream)));
-  }
-
-  TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
-                      memory_allocator_->Allocate(device_ordinal_, byte_size,
-                                                  /*retry_on_failure=*/false));
-  total_allocated_bytes_ += byte_size;
-
-  se::DeviceMemoryBase buffer_addr = allocated_buffer.AsDeviceMemoryBase();
-  allocated_buffers_.push_back(std::move(allocated_buffer));
-  return se::DeviceMemory<uint8>(buffer_addr);
-}
-
-std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
-                                         se::StreamExecutor* stream_exec) {
-  std::vector<AlgorithmDesc> algorithms;
-  bool succ = false;
-  switch (kind) {
-    case CudnnConvKind::kBackwardFilter:
-      succ =
-          stream_exec->GetConvolveBackwardFilterAlgorithms(true, &algorithms);
-      break;
-    case CudnnConvKind::kBackwardInput:
-      succ = stream_exec->GetConvolveBackwardDataAlgorithms(true, &algorithms);
-      break;
-    case CudnnConvKind::kForward:
-    case CudnnConvKind::kForwardActivation:
-      succ = stream_exec->GetConvolveAlgorithms(true, &algorithms);
-      break;
-  }
-  DCHECK(succ);
-
-  return algorithms;
-}
-
-string AlgorithmToString(const AlgorithmDesc& algo) {
-  if (algo.tensor_ops_enabled()) {
-    return absl::StrCat(algo.algo_id(), "+TC");
-  }
-  return absl::StrCat(algo.algo_id());
-}
-
-string NumBytesToString(int64 bytes) {
-  return absl::StrCat(tensorflow::strings::HumanReadableNumBytes(bytes), " (",
-                      bytes, "B)");
-}
-
-// Acquires a process-global lock on the device pointed to by the given
-// StreamExecutor.
-//
-// This is used to prevent other XLA instances from trying to autotune on this
-// device while we're using it.
-tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
-  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
-  // se::Platform*s are global singletons guaranteed to live forever.
-  static auto* mutexes =
-      new std::map<std::pair<const se::Platform*, /*device_ordinal*/ int64>,
-                   tensorflow::mutex>();
-
-  tensorflow::mutex_lock global_lock(mu);
-  auto it = mutexes
-                ->emplace(std::piecewise_construct,
-                          std::make_tuple(stream_exec->platform(),
-                                          stream_exec->device_ordinal()),
-                          std::make_tuple())
-                .first;
-  return tensorflow::mutex_lock{it->second};
-}
-
-}  // anonymous namespace
-
-// We could have caching here so that we don't redo this work for two identical
-// convolutions.  Unfortunately our cache key would have to be a tuple
-// containing the protos passed to this function, and we have no utility for
-// hashing protos.  We could write our own hash functions, but they'd silently
-// break if we ever added a field to one of the protos.  Perhaps we could hack
-// using the binary-encoded proto as the hash key, on the assumption that two
-// protos being binary-equal is a sufficient, if not necessary, condition for
-// proper equality.  But that would still leave us open to having unnecessary
-// cache misses and doing extra work.  Overall, caching doesn't seem worth the
-// trouble, but we may want to revisit this if we ever find a model where
-// caching would speed up compilation a lot.
-StatusOr<std::tuple<int64, bool, int64>>
-CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
-    HloCustomCallInstruction* instr) {
-  // TODO(timshen): for now only check fp16. It can be expanded to other types,
-  // with some work on the HLO routines.
-  const bool cross_check_enabled =
-      instr->shape().tuple_shapes(0).element_type() == xla::F16;
-
-  // Don't run this function concurrently on the same GPU.
-  //
-  // This is a bit of a hack and doesn't protect us against arbitrary concurrent
-  // use of a GPU, but it's sufficient to let us compile two HLO modules
-  // concurrently and then run them sequentially.
-  tensorflow::mutex_lock lock = LockGpu(stream_exec_);
-
-  // Make sure any previous activity on this executor is done. We don't want to
-  // interfere with programs that are still running on the GPU.
-  if (!stream_exec_->SynchronizeAllActivity()) {
-    return InternalError("Failed to synchronize GPU for autotuning.");
-  }
-
-  // Create a stream for us to do our work on.
-  se::Stream stream{stream_exec_};
-  stream.Init();
-  const auto device_ordinal = stream_exec_->device_ordinal();
-
-  // allocator either points to this->allocator_ or, if that's null, to a
-  // StreamExecutorMemoryAllocator for stream_exec_.
-  DeviceMemoryAllocator* allocator;
-  optional<StreamExecutorMemoryAllocator> se_allocator;
-  if (allocator_ != nullptr) {
-    allocator = allocator_;
-  } else {
-    se_allocator.emplace(stream_exec_->platform(),
-                         absl::Span<se::StreamExecutor* const>({stream_exec_}));
-    allocator = &*se_allocator;
-  }
-
-  const auto initialize_buffer = [&stream, cross_check_enabled](
-                                     DeviceMemoryBase buffer) {
-    if (cross_check_enabled) {
-      // Broadcast a constant to the buffer, instead of zeroing the buffer. A
-      // non-zero constant is useful for the cross checking, because zero-inputs
-      // may not always reveal the bugs.
-      CHECK_EQ(0, (uintptr_t)buffer.opaque() % 4);
-      size_t left_over_bytes = buffer.size() % 4;
-      CHECK_EQ(0, left_over_bytes % 2);
-
-      constexpr float kBroadcastedConstant = 0.1f;
-      static const Eigen::half halfs[2] = {Eigen::half(kBroadcastedConstant),
-                                           Eigen::half(kBroadcastedConstant)};
-      uint32 bits;
-      static_assert(sizeof(bits) == sizeof(halfs), "");
-      memcpy(&bits, halfs, sizeof(bits));
-
-      size_t aligned_size = buffer.size() / 4 * 4;
-      stream.ThenMemset32(&buffer, bits, aligned_size);
-
-      DeviceMemoryBase left_over(
-          static_cast<char*>(buffer.opaque()) + aligned_size, left_over_bytes);
-      stream.ThenMemcpy(&left_over, halfs, left_over_bytes);
-    } else {
-      // Although we don't have evidence this matters, zero out the buffers
-      // before autotuning.  It's conceivable that using uninitialized memory as
-      // the inputs might affect performance if e.g. the inputs contain
-      // denormals, and this is easy enough.
-      stream.ThenMemZero(&buffer, buffer.size());
-    }
-  };
-
-  // Allocate space for the input, filter, and output of the convolution.  We
-  // use a ScratchAllocator for this instead of calling allocator_ directly so
-  // that our allocations don't leak.
-  ScratchAllocator input_output_allocator(device_ordinal, allocator);
-  std::vector<se::DeviceMemoryBase> operand_buffers;
-  for (const auto* operand : instr->operands()) {
-    TF_ASSIGN_OR_RETURN(auto buffer,
-                        input_output_allocator.AllocateBytes(
-                            &stream, ShapeUtil::ByteSizeOf(operand->shape())));
-    initialize_buffer(buffer);
-    operand_buffers.push_back(buffer);
-  }
-  TF_ASSIGN_OR_RETURN(
-      auto result_buffer,
-      input_output_allocator.AllocateBytes(
-          &stream, ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
-  initialize_buffer(result_buffer);
-
-  se::dnn::ProfileResult best_result;
-  int64 best_result_bytes_used = 0;
-  TF_ASSIGN_OR_RETURN(auto backend_config,
-                      instr->backend_config<CudnnConvBackendConfig>());
-
-  optional<F16BufferComparator> comparator;
-  // Use the first algorithm that's supported as reference. There isn't a
-  // particular reason to use it, as any algorithm sufficies. It doesn't make
-  // this algorithm considered correct, though.
-  optional<AlgorithmDesc> first_algorithm;
-  TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
-  for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) {
-    ScratchAllocator scratch_allocator(device_ordinal, allocator);
-    se::dnn::ProfileResult profile_result;
-    VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
-            << instr->ToString();
-
-    backend_config.set_algorithm(alg.algo_id());
-    backend_config.set_tensor_ops_enabled(alg.tensor_ops_enabled());
-    TF_RETURN_IF_ERROR(instr->set_backend_config(backend_config));
-    bool launch_ok = RunCudnnConvolution(instr, absl::MakeSpan(operand_buffers),
-                                         result_buffer, &scratch_allocator,
-                                         &stream, &profile_result)
-                         .ok();
-
-    if (launch_ok && profile_result.is_valid()) {
-      const bool crash_on_checking_failure =
-          instr->GetModule()
-              ->config()
-              .debug_options()
-              .xla_gpu_crash_on_verification_failures();
-      if (comparator.has_value()) {
-        StatusOr<bool> result = comparator->CompareEqual(
-            se::DeviceMemory<Eigen::half>(result_buffer));
-        if (!result.ok()) {
-          LOG(ERROR) << "Unable to compare "
-                     << AlgorithmToString(*first_algorithm) << " against "
-                     << AlgorithmToString(alg) << " for " << instr->ToString()
-                     << ": " << result.status();
-          CHECK(!crash_on_checking_failure);
-        } else if (!result.ValueOrDie()) {
-          LOG(ERROR) << "Results mismatch between different convolution "
-                        "algorithms. This is likely a bug in convolution, or "
-                        "an excessive loss of precision in convolution. "
-                     << instr->ToString() << " for "
-                     << AlgorithmToString(*first_algorithm) << " vs "
-                     << AlgorithmToString(alg);
-          CHECK(!crash_on_checking_failure);
-        }
-      } else if (cross_check_enabled) {
-        auto comp = F16BufferComparator::Create(
-            se::DeviceMemory<Eigen::half>(result_buffer), compiler_, allocator,
-            &stream);
-        if (comp.ok()) {
-          comparator.emplace(comp.ConsumeValueOrDie());
-          first_algorithm.emplace(alg);
-        } else {
-          LOG(ERROR) << "Fail to initialize buffer comparator: "
-                     << comp.status() << ", instruction: " << instr->ToString();
-          CHECK(!crash_on_checking_failure);
-        }
-      }
-      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
-      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
-              << " succeeded, taking " << profile_result.elapsed_time_in_ms()
-              << "ms and using " << NumBytesToString(scratch_bytes_used)
-              << " of scratch (Best result: "
-              << best_result.elapsed_time_in_ms() << "ms, "
-              << NumBytesToString(best_result_bytes_used) << " of scratch)";
-      if (profile_result.elapsed_time_in_ms() <
-          best_result.elapsed_time_in_ms()) {
-        best_result = profile_result;
-        best_result_bytes_used = scratch_bytes_used;
-      }
-    } else {
-      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed.";
-    }
-  }
-  if (best_result.is_valid()) {
-    VLOG(2) << "Best algorithm for " << instr->ToString() << ": "
-            << AlgorithmToString(best_result.algorithm()) << ", takes "
-            << best_result.elapsed_time_in_ms() << "ms, and uses "
-            << best_result_bytes_used << "B of scratch memory.";
-    return std::make_tuple(best_result.algorithm().algo_id(),
-                           best_result.algorithm().tensor_ops_enabled(),
-                           best_result_bytes_used);
-  }
-
-  return InternalError(
-      "All algorithms tried for convolution %s failed.  Falling back to "
-      "default algorithm.",
-      instr->ToString());
-}
-
-StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
-    HloInstruction* instr) {
-  CHECK(IsCustomCallToDnnConvolution(*instr));
-
-  StatusOr<std::tuple<int64, bool, int64>> alg_scratch_and_tc =
-      PickBestAlgorithm(Cast<HloCustomCallInstruction>(instr));
-
-  if (!alg_scratch_and_tc.ok()) {
-    LOG(ERROR) << alg_scratch_and_tc.status();
-    return false;
-  }
-
-  int64 algorithm;
-  bool tensor_ops_enabled;
-  int64 scratch_bytes;
-
-  std::tie(algorithm, tensor_ops_enabled, scratch_bytes) =
-      alg_scratch_and_tc.ConsumeValueOrDie();
-
-  VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and "
-          << NumBytesToString(scratch_bytes)
-          << " of scratch memory: " << instr->ToString()
-          << " tensor_ops_enabled: " << tensor_ops_enabled;
-
-  // Replace instr with a new CustomCall which has the correct algorithm, and
-  // whose output shape has the appropriate amount of scratch memory.
-  HloComputation* computation = instr->parent();
-  Shape new_call_shape =
-      ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0),
-                                 ShapeUtil::MakeShape(U8, {scratch_bytes})});
-
-  TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
-                      instr->backend_config<CudnnConvBackendConfig>());
-  backend_config.set_algorithm(algorithm);
-  backend_config.set_tensor_ops_enabled(tensor_ops_enabled);
-
-  HloInstruction* new_call = computation->AddInstruction(
-      instr->CloneWithNewOperands(new_call_shape, instr->operands()));
-
-  TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config));
-
-  // Repackage new_call so it has the same shape as the original call, namely
-  // (conv_result, u8[0]).
-  HloInstruction* new_tuple =
-      computation->AddInstruction(HloInstruction::CreateTuple(
-          {computation->AddInstruction(HloInstruction::CreateGetTupleElement(
-               new_call_shape.tuple_shapes(0), new_call, 0)),
-           computation->AddInstruction(HloInstruction::CreateConstant(
-               LiteralUtil::CreateR1<uint8>({})))}));
-
-  TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple));
-  return true;
-}
-
-StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnComputation(
-    HloComputation* computation) {
-  std::vector<HloInstruction*> convs;
-  for (auto* instr : computation->instructions()) {
-    if (IsCustomCallToDnnConvolution(*instr)) {
-      convs.push_back(instr);
-    }
-  }
-
-  bool changed = false;
-  for (auto* instr : convs) {
-    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr));
-    changed |= result;
-  }
-  return changed;
-}
-
-StatusOr<bool> CudnnConvolutionAlgorithmPicker::Run(HloModule* module) {
-  bool changed = false;
-  for (HloComputation* computation : module->MakeNonfusionComputations()) {
-    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
-    changed |= result;
-  }
-  return changed;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
deleted file mode 100644
index aeda2fc7f8b4d6169fc2baa8975119ba7bf68dd2..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
-
-#include "absl/types/optional.h"
-#include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
-#include "tensorflow/compiler/xla/service/hlo_instructions.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
-
-namespace xla {
-namespace gpu {
-
-// Modifies CustomCalls to cudnn convolutions, choosing the best algorithm for
-// each and adding explicit scratch space to the CustomCalls.
-class CudnnConvolutionAlgorithmPicker : public HloModulePass {
- public:
-  // If the `allocator` parameter is not null, we will use it to allocate temp
-  // memory while timing the various convolution algorithms.  If it's null,
-  // we'll use the default allocator on the StreamExecutor.
-  CudnnConvolutionAlgorithmPicker(se::StreamExecutor* stream_exec,
-                                  DeviceMemoryAllocator* allocator,
-                                  Compiler* compiler)
-      : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {}
-
-  absl::string_view name() const override {
-    return "cudnn-convolution-algorithm-picker";
-  }
-
-  StatusOr<bool> Run(HloModule* module) override;
-
- private:
-  StatusOr<bool> RunOnComputation(HloComputation* computation);
-  StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  StatusOr<std::tuple<int64, bool, int64>> PickBestAlgorithm(
-      HloCustomCallInstruction* instr);
-
-  se::StreamExecutor* stream_exec_;                   // never null
-  DeviceMemoryAllocator* allocator_;                  // may be null
-  Compiler* compiler_;
-};
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
deleted file mode 100644
index ef292373018295f5100b91c343df274b626c2fa1..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.cc
+++ /dev/null
@@ -1,565 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
-
-#include <cstdlib>
-#include <numeric>
-#include <vector>
-
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/window_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace xla {
-namespace gpu {
-
-namespace {
-
-HloInstruction* CreateCudnnConv(const char* call_target, const Shape& shape,
-                                HloInstruction* lhs, HloInstruction* rhs,
-                                const Window& window,
-                                const ConvolutionDimensionNumbers& dnums,
-                                int64 feature_group_count) {
-  HloComputation* computation = lhs->parent();
-
-  // This call returns a tuple of (conv_result, scratch_memory), where
-  // conv_result is the actual result of the convolution, and scratch_memory is
-  // temporary memory used by cudnn.
-  //
-  // At the moment, we don't know how much scratch memory this conv is going to
-  // use, so we put u8[0] in this place.  Later on another pass will choose
-  // which conv algorithm to use, and at that point we'll modify the shape of
-  // this second tuple element.
-  Shape call_shape =
-      ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})});
-
-  HloInstruction* custom_call = computation->AddInstruction(
-      HloInstruction::CreateCustomCall(call_shape, {lhs, rhs}, call_target));
-  custom_call->set_window(window);
-  custom_call->set_convolution_dimension_numbers(dnums);
-  custom_call->set_feature_group_count(feature_group_count);
-  return custom_call;
-}
-
-bool CanImplementAsCudnnForwardConv(HloInstruction* conv) {
-  const ConvolutionDimensionNumbers& dnums =
-      conv->convolution_dimension_numbers();
-  if (dnums.input_spatial_dimensions_size() > 3) {
-    return false;
-  }
-
-  // CuDNN does not accept zero-element arguments
-  if (ShapeUtil::IsZeroElementArray(conv->operand(0)->shape()) ||
-      ShapeUtil::IsZeroElementArray(conv->operand(1)->shape())) {
-    return false;
-  }
-
-  if (window_util::HasWindowReversal(conv->window())) {
-    return false;
-  }
-  return true;
-}
-
-// Try to match a backward filter pattern that contains "conv".
-// Precondition: "conv" is a kConvolution.
-std::tuple<bool, Window, ConvolutionDimensionNumbers> MatchBackwardFilter(
-    HloInstruction* conv) {
-  const auto no_match_result =
-      std::make_tuple(false, Window(), ConvolutionDimensionNumbers());
-  if (conv->feature_group_count() > 1) {
-    return no_match_result;
-  }
-  // Step 1: match the instruction pattern without considering the paddings and
-  // dimension numbers just yet. We may need some generic pattern matcher
-  // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h
-  //
-  // Backward filter convolution is implemented in XLA as the forward
-  // convolution of padded activations and dilated gradients. Padding on
-  // activations and dilation on gradients are specified in the "window" field
-  // of the forward convolution.
-  //
-  //        activations  gradients
-  //              \         /
-  //               v       v
-  //              Convolution
-  //                 conv
-  CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
-
-  // Step 2: match paddings and dimension numbers of the forward convolution.
-  const ConvolutionDimensionNumbers& conv_dnums =
-      conv->convolution_dimension_numbers();
-  auto input_batch_dim = conv_dnums.input_batch_dimension();
-  auto input_feature_dim = conv_dnums.input_feature_dimension();
-  auto input_spatial_dims = conv_dnums.input_spatial_dimensions();
-  auto kernel_input_feature_dim = conv_dnums.kernel_input_feature_dimension();
-  auto kernel_output_feature_dim = conv_dnums.kernel_output_feature_dimension();
-  auto kernel_spatial_dims = conv_dnums.kernel_spatial_dimensions();
-  auto output_batch_dim = conv_dnums.output_batch_dimension();
-  auto output_feature_dim = conv_dnums.output_feature_dimension();
-  auto output_spatial_dims = conv_dnums.output_spatial_dimensions();
-
-  for (const WindowDimension& window_dim : conv->window().dimensions()) {
-    if (window_dim.stride() != 1) {
-      VLOG(1) << "Forward convolution's window "
-              << conv->window().ShortDebugString()
-              << " should have stride of 1.";
-      return no_match_result;
-    }
-    if (window_dim.base_dilation() != 1) {
-      VLOG(1) << "Forward convolution's window "
-              << conv->window().ShortDebugString()
-              << " should have no base (LHS) dilation.";
-      return no_match_result;
-    }
-    if (window_dim.padding_low() < 0) {
-      VLOG(1) << "Padding low should be non-negative.";
-      return no_match_result;
-    }
-    if (window_dim.window_reversal()) {
-      VLOG(1) << "Window reversal field not supported";
-      return no_match_result;
-    }
-    // Padding high will be checked in Step 3.
-  }
-  if (input_batch_dim == output_batch_dim &&
-      !window_util::HasWindowDilation(conv->window())) {
-    VLOG(1) << conv->ToString()
-            << " is a regular forward convolution. No need "
-               "to fold it to a backward filter convolution.";
-    return no_match_result;
-  }
-
-  // Step 3: fuse the matched HLOs into a backward convolution instruction.
-  //
-  // Compute the window of the backward convolution.
-  Window backward_conv_window;
-  for (int i = 0; i < input_spatial_dims.size(); ++i) {
-    WindowDimension* dim = backward_conv_window.add_dimensions();
-    // The window size of the backward convolution equals the output size of the
-    // forward convolution.
-    int64 filter_size = conv->shape().dimensions(output_spatial_dims[i]);
-    dim->set_size(filter_size);
-    // The window stride equals the window dilation of the forward convolution.
-    dim->set_stride(conv->window().dimensions(i).window_dilation());
-    // The window's low padding is the same as the low padding of the
-    // activations.
-    dim->set_padding_low(conv->window().dimensions(i).padding_low());
-
-    int64 input_size =
-        conv->operand(0)->shape().dimensions(input_spatial_dims[i]);
-    int64 output_size = conv->window().dimensions(i).size();
-    // Compute the range of the amount of valid high padding. We first compute
-    // min_padding_high, the amount of padding on the right/bottom to ensure the
-    // last patch ends at the border, i.e.,
-    //
-    //   input_size + dim->padding_low() + min_padding_high
-    //     = (output_size - 1) * stride + filter_size
-    //
-    // Because convolution ignores trailing incomplete windows, any amount of
-    // padding high from min_padding_high to min_padding_high+stride-1
-    // (max_padding_high) has the same effect.
-    int64 padded_input_size = filter_size + (output_size - 1) * dim->stride();
-    int64 min_padding_high =
-        padded_input_size - input_size - dim->padding_low();
-    int64 max_padding_high = min_padding_high + dim->stride() - 1;
-    CHECK_GE(dim->padding_low(), 0);
-    // In practice, since cuDNN convolution only supports even padding, we make
-    // the amount of high padding the same as the amount of low padding as long
-    // as it is between min_padding_high and max_padding_high. If it is not in
-    // that range, we pick the one that's closest to dim->padding_low() and let
-    // PadInsertion canonicalize the resultant backward convolution later.
-    // Picking the closest one minimizes the cost of the kPad instruction to be
-    // inserted by PadInsertion.
-    if (dim->padding_low() >= min_padding_high &&
-        dim->padding_low() <= max_padding_high) {
-      dim->set_padding_high(dim->padding_low());
-    } else {
-      if (dim->padding_low() < min_padding_high) {
-        dim->set_padding_high(min_padding_high);
-      } else {
-        dim->set_padding_high(max_padding_high);
-      }
-    }
-    if (dim->padding_high() < 0) {
-      LOG(ERROR)
-          << "Fusing this pattern to backward filter convolution would cause "
-             "negative padding ("
-          << dim->padding_high()
-          << ") on right/bottom of the weight gradients, which is not "
-             "supported by PadInsertion (b/32744257). Falling back to "
-             "unfused convolution for instruction: "
-          << conv->ToString();
-      return no_match_result;
-    }
-  }
-
-  // Restore the dimension numbers of the backward convolution from the forward
-  // convolution. The two activation dimensions are reversed (batch and
-  // feature).
-  ConvolutionDimensionNumbers backward_conv_dnums;
-  backward_conv_dnums.set_input_batch_dimension(input_feature_dim);
-  backward_conv_dnums.set_input_feature_dimension(input_batch_dim);
-  for (int i = 0; i < input_spatial_dims.size(); ++i) {
-    backward_conv_dnums.add_input_spatial_dimensions(input_spatial_dims[i]);
-  }
-  backward_conv_dnums.set_output_batch_dimension(kernel_input_feature_dim);
-  backward_conv_dnums.set_output_feature_dimension(kernel_output_feature_dim);
-  for (int i = 0; i < kernel_spatial_dims.size(); ++i) {
-    backward_conv_dnums.add_output_spatial_dimensions(kernel_spatial_dims[i]);
-  }
-  // The dimension numbering of the output of the forward convolution (before
-  // transposition) is the same as that of the activations (according to the
-  // semantics of kConvolution). The batch dimension of the activations should
-  // be treated as the input feature dimension, and the feature dimension should
-  // be treated as the output feature.
-  backward_conv_dnums.set_kernel_input_feature_dimension(output_batch_dim);
-  backward_conv_dnums.set_kernel_output_feature_dimension(output_feature_dim);
-  for (int i = 0; i < output_spatial_dims.size(); ++i) {
-    backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]);
-  }
-
-  return std::make_tuple(true, backward_conv_window, backward_conv_dnums);
-}
-
-// Try to match a backward input pattern that contains "conv".
-// Precondition: "conv" is a kConvolution.
-std::tuple<bool, Window, ConvolutionDimensionNumbers, HloInstruction*>
-MatchBackwardInput(HloInstruction* conv) {
-  const auto no_match_result =
-      std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr);
-
-  // TODO(b/31709653): Theoretically cuDNN supports grouped convolutions also
-  // for the backward input convolution, but at least for now with version 7.1.4
-  // it is slower. This needs to be re-evaluated for future cuDNN versions.
-  // Note that we already have the necessary code down below, the only thing to
-  // enable it is to remove the following early return.
-  if (conv->feature_group_count() > 1) {
-    return no_match_result;
-  }
-
-  // Match instruction pattern.
-  CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
-  HloInstruction* reverse_filter = conv->mutable_operand(1);
-  ConvolutionDimensionNumbers dnums = conv->convolution_dimension_numbers();
-
-  // We pattern-match to a backwards input conv if:
-  //
-  //  - all spatial dims of the filter are reversed
-  //
-  // OR
-  //
-  //  - filter is 1x1 or a constant AND
-  //  - conv has base dilation (otherwise this is just a regular forward conv).
-  //
-  // The final criterion above is just for canonicalization; cudnn seems to run
-  // just as fast if we canonicalize 1x1/constant filters without base dilation
-  // to forward or backward convs.  We canonicalize to forward conv because (a)
-  // it's more natural (constant filters usually show up when doing inference,
-  // and having backwards convolutions in inference graphs would be weird), and
-  // (b) cudnn has special fusions for forward conv plus bias and activation,
-  // and we want to pattern-match to that after running this pass.
-  bool is_reversed_filter =
-      reverse_filter->opcode() == HloOpcode::kReverse &&
-      absl::c_is_permutation(dnums.kernel_spatial_dimensions(),
-                             reverse_filter->dimensions());
-  bool is_1x1_filter =
-      absl::c_all_of(conv->window().dimensions(),
-                     [](const WindowDimension& d) { return d.size() == 1; });
-  if (!is_reversed_filter &&
-      !(window_util::HasBaseDilation(conv->window()) &&
-        (reverse_filter->IsConstant() || is_1x1_filter))) {
-    VLOG(1) << "Can't match to backwards convolution. Either filter is not "
-               "kReverse, or it's not a base-dilated conv with a 1x1 or "
-               "constant filter.";
-    return no_match_result;
-  }
-
-  // Match padding and dilation of the forward convolution.
-  for (const WindowDimension& window_dim : conv->window().dimensions()) {
-    if (window_dim.stride() != 1) {
-      VLOG(1) << "Forward convolution's window "
-              << conv->window().ShortDebugString()
-              << " should have stride of 1.";
-      return no_match_result;
-    }
-    if (window_dim.window_dilation() != 1) {
-      VLOG(1) << "Forward convolution's window "
-              << conv->window().ShortDebugString()
-              << " should have no window dilation.";
-      return no_match_result;
-    }
-    if (window_dim.window_reversal()) {
-      VLOG(1) << "Window reversal field not supported";
-      return no_match_result;
-    }
-  }
-
-  const auto& input_spatial_dims = dnums.input_spatial_dimensions();
-  const auto& output_spatial_dims = dnums.output_spatial_dimensions();
-  CHECK_EQ(conv->window().dimensions().size(), input_spatial_dims.size());
-  CHECK_EQ(output_spatial_dims.size(), input_spatial_dims.size());
-
-  const Window& old_window = conv->window();
-  Window new_window = old_window;
-  for (size_t i = 0; i < input_spatial_dims.size(); ++i) {
-    // Restore backward convolution's padding config from the matched pattern.
-    // See the comment in tensorflow/core/kernels/conv_grad_tuple_ops.cc
-    // for how we convert backward input convolution to a variant of forward
-    // convolution.
-    //
-    // The stride of the backward convolution
-    // = the base dilation factor of the forward convolution
-    auto dim = new_window.mutable_dimensions(i);
-    dim->set_stride(old_window.dimensions(i).base_dilation());
-
-    // The low padding = kernel_size - 1 - low padding on the gradients
-    // Make sure the low padding is not negative.
-    auto kernel_size = old_window.dimensions(i).size();
-    auto backward_padding_low =
-        kernel_size - 1 - old_window.dimensions(i).padding_low();
-    if (backward_padding_low < 0) {
-      LOG(ERROR)
-          << "The low padding of the backward convolution would be negative ("
-          << backward_padding_low
-          << "), which isn't supported by PadInsertion for now (b/32744257).";
-      return no_match_result;
-    }
-    dim->set_padding_low(backward_padding_low);
-
-    // Compute the range of the amount of padding on the right/bottom of the
-    // activations. XLA's convolution requires all patches to be within the
-    // padded base. This gives us flexiblity to choose the amount of high
-    // padding from a set of values without changing the result of the backward
-    // convolution. The minimum amount (min_padding_high) makes the last patch
-    // end at the border. The maximum amount (max_padding_high) equals
-    // min_padding_high+stride-1 -- max_padding_high+1 would cause the output
-    // size to change.
-    auto unpadded_input_size = conv->shape().dimensions(output_spatial_dims[i]);
-    auto output_size =
-        conv->operand(0)->shape().dimensions(input_spatial_dims[i]);
-    auto padded_input_size = kernel_size + dim->stride() * (output_size - 1);
-    auto total_pad_size = padded_input_size - unpadded_input_size;
-    auto min_padding_high = total_pad_size - backward_padding_low;
-    auto max_padding_high = min_padding_high + dim->stride() - 1;
-
-    if (backward_padding_low >= min_padding_high &&
-        backward_padding_low <= max_padding_high) {
-      // In the best case (most likely), if backward_padding_low is in the range
-      // of the amounts of valid high padding, we choose backward_padding_low
-      // because cuDNN supports even padding only.
-      dim->set_padding_high(backward_padding_low);
-    } else {
-      // Otherwise, we choose the amount that's closest to backward_padding_low,
-      // and PadInsertion will later insert kSlice instructions to enforce even
-      // padding.
-      //
-      // For example, consider the backward convolution pattern
-      //
-      //   ab     xy
-      //   | pad  | reverse
-      //  .a.b    yx
-      //     \   /
-      //      ABC
-      //
-      // The amount of low padding on activations (in backward convolution) is
-      //   backward_padding_low = kernel_size - 1 - forward_padding_low
-      //                        = 2 - 1 - 1 = 0
-      //
-      // The amount of padding high must be between 1 and 2, in order to make
-      // Conv(ABC, xy, stride=2) produce exactly 2 elements (ab). 0 is not in
-      // the range of [1,2], so we pick the closest valid amount of padding
-      // high, which is 1 in this case. Therefore, we fuse the above pattern to
-      //
-      //   ABC = BackwardInputConv(ab, xy, stride=2, padding_high=1)
-      if (backward_padding_low < min_padding_high) {
-        dim->set_padding_high(min_padding_high);
-      } else {
-        dim->set_padding_high(max_padding_high);
-      }
-    }
-    // PadInsertion doesn't handle backward input convolution with negative
-    // padding for now. So fall back to unfused convolution in case of negative
-    // padding. For example,
-    //   ABCD = Conv(abc, reverse(xy), padding_high=2)
-    // could be fused to
-    //   ABCD = BackwardInputConv(abc, xy, padding_low=1, padding_high=-1)
-    // with positive padding low but negative padding high.
-    if (dim->padding_high() < 0) {
-      LOG(ERROR) << "Fusing this pattern to backward convolution would cause "
-                    "negative padding ("
-                 << dim->padding_high()
-                 << ") on right/bottom of the activations, which is not "
-                    "supported by PadInsertion (b/32744257). Falling back to "
-                    "unfused convolution for instruction: "
-                 << conv->ToString();
-      return no_match_result;
-    }
-  }
-
-  // OK, it's a match! Switch the input feature dimension with the output
-  // feature dimension. This is the way cuDNN expects it to be.
-  dnums.set_kernel_input_feature_dimension(
-      conv->convolution_dimension_numbers().kernel_output_feature_dimension());
-  dnums.set_kernel_output_feature_dimension(
-      conv->convolution_dimension_numbers().kernel_input_feature_dimension());
-
-  // If we matched against a constant, we need to add a reverse op that can be
-  // subsumed by the cuDNN call. algebraic-simplifier will later remove any
-  // unnecessary reverses.
-  if (reverse_filter->opcode() != HloOpcode::kReverse &&
-      reverse_filter->IsConstant()) {
-    // Create a double-reverse, which is a nop.
-    HloComputation* c = conv->parent();
-    reverse_filter = c->AddInstruction(HloInstruction::CreateReverse(
-        reverse_filter->shape(), reverse_filter,
-        AsInt64Slice(dnums.kernel_spatial_dimensions())));
-    reverse_filter = c->AddInstruction(HloInstruction::CreateReverse(
-        reverse_filter->shape(), reverse_filter,
-        AsInt64Slice(dnums.kernel_spatial_dimensions())));
-    TF_CHECK_OK(conv->ReplaceOperandWith(/*operand_no=*/1, reverse_filter));
-  }
-
-  // Calculate the 'rhs' that goes into the backward input convolution.
-  HloInstruction* rhs = reverse_filter;
-  // One reverse is subsumed by the cuDNN call.
-  if (rhs->opcode() == HloOpcode::kReverse) {
-    rhs = rhs->mutable_operand(0);
-  }
-  if (conv->feature_group_count() == 1) {
-    return std::make_tuple(true, new_window, dnums, rhs);
-  }
-
-  // Handle grouped convolutions. Because we swapped the input feature dimension
-  // with the output feature dimension, we need to also reshape the kernel so
-  // that the 'feature_group_count' parameter still makes sense. The
-  // 'feature_group_count' parameter essentially specifies how often the
-  // 'kernel_input_feature_dimension' is repeated. So when we swap these
-  // dimensions, we need to divide the new 'kernel_input_feature_dimension' by
-  // 'feature_group_count' and multiply the new
-  // 'kernel_output_feature_dimension' by 'feature_group_count'.
-  Shape new_shape = rhs->shape();
-  int64 input_feature_dimension = dnums.kernel_input_feature_dimension();
-  int64 output_feature_dimension = dnums.kernel_output_feature_dimension();
-
-  // In the backward convolution case, the spatial dimensions become the
-  // feature dimensions, and we are guaranteed that the spatial dimensions are
-  // adjacent.
-  CHECK_EQ(std::abs(input_feature_dimension - output_feature_dimension), 1LL);
-  int64 input_features = new_shape.dimensions(input_feature_dimension);
-  int64 output_features = new_shape.dimensions(output_feature_dimension);
-  new_shape.set_dimensions(input_feature_dimension,
-                           input_features / conv->feature_group_count());
-  new_shape.set_dimensions(output_feature_dimension,
-                           output_features * conv->feature_group_count());
-  HloComputation* c = conv->parent();
-  rhs = c->AddInstruction(HloInstruction::CreateReshape(new_shape, rhs));
-  return std::make_tuple(true, new_window, dnums, rhs);
-}
-
-CudnnConvBackendConfig GetDefaultBackendConfig() {
-  CudnnConvBackendConfig config;
-  config.set_conv_result_scale(1);
-  return config;
-}
-
-// Tries to rewrite a single convolution into a call to cudnn.
-StatusOr<bool> RunOnInstruction(HloInstruction* conv) {
-  CHECK_EQ(conv->opcode(), HloOpcode::kConvolution);
-
-  HloInstruction* custom_call = [&]() -> HloInstruction* {
-    bool match;
-    Window window;
-    ConvolutionDimensionNumbers dnums;
-    HloInstruction* rhs;
-
-    std::tie(match, window, dnums) = MatchBackwardFilter(conv);
-    if (match) {
-      return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(),
-                             conv->mutable_operand(0), conv->mutable_operand(1),
-                             window, dnums, conv->feature_group_count());
-    }
-
-    std::tie(match, window, dnums, rhs) = MatchBackwardInput(conv);
-    if (match) {
-      return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, conv->shape(),
-                             conv->mutable_operand(0), rhs, window, dnums,
-                             conv->feature_group_count());
-    }
-
-    // If all else fails, try a forward convolution.
-    if (CanImplementAsCudnnForwardConv(conv)) {
-      return CreateCudnnConv(
-          kCudnnConvForwardCallTarget, conv->shape(), conv->mutable_operand(0),
-          conv->mutable_operand(1), conv->window(),
-          conv->convolution_dimension_numbers(), conv->feature_group_count());
-    }
-
-    return nullptr;
-  }();
-
-  if (custom_call == nullptr) {
-    return false;
-  }
-
-  TF_RETURN_IF_ERROR(
-      custom_call->set_backend_config(GetDefaultBackendConfig()));
-
-  // The CustomCall returns a tuple (conv_result, scratch_memory).  Extract out
-  // the conv result and replace `conv` with it.
-  TF_RETURN_IF_ERROR(conv->parent()->ReplaceWithNewInstruction(
-      conv,
-      HloInstruction::CreateGetTupleElement(conv->shape(), custom_call, 0)));
-  return true;
-}
-
-// Rewrites the convolutions in the given computation into calls to cudnn.
-// Returns true if it made any changes.
-StatusOr<bool> RunOnComputation(HloComputation* computation) {
-  std::vector<HloInstruction*> convs;
-  for (auto* hlo : computation->instructions()) {
-    if (hlo->opcode() == HloOpcode::kConvolution) {
-      convs.push_back(hlo);
-    }
-  }
-
-  bool changed = false;
-  for (HloInstruction* conv : convs) {
-    TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(conv));
-    changed |= result;
-  }
-  return changed;
-}
-}  // namespace
-
-StatusOr<bool> CudnnConvolutionRewriter::Run(HloModule* module) {
-  bool changed = false;
-  for (HloComputation* computation : module->MakeNonfusionComputations()) {
-    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
-    changed |= result;
-  }
-  return changed;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
deleted file mode 100644
index 8d7c6fdab510407428a115579a90e8cf85e9fad2..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_
-
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
-
-namespace xla {
-namespace gpu {
-
-// Rewrites plain convolutions, backwards-filter convolutions, and
-// backwards-input convolutions into CustomCall HLOs that call into cuDNN.
-class CudnnConvolutionRewriter : public HloModulePass {
- public:
-  absl::string_view name() const override {
-    return "cudnn-convolution-rewriter";
-  }
-
-  StatusOr<bool> Run(HloModule* module) override;
-};
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc
deleted file mode 100644
index d237f8930b74d460ad3d4602670a5afb19b496a2..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter_test.cc
+++ /dev/null
@@ -1,615 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
-
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/shape_inference.h"
-#include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-
-namespace op = xla::testing::opcode_matchers;
-using ::testing::_;
-
-class CudnnConvolutionRewriterTest : public HloVerifiedTestBase {
- public:
-  CudnnConvolutionRewriterTest()
-      : HloVerifiedTestBase(/*layout_sensitive=*/true,
-                            /*allow_mixed_precision=*/false) {
-    for (int i = 0; i < 2; ++i) {
-      WindowDimension* window_dim = default_conv_window_.add_dimensions();
-      window_dim->set_size(1);
-      window_dim->set_stride(1);
-      window_dim->set_padding_low(0);
-      window_dim->set_padding_high(0);
-      window_dim->set_window_dilation(1);
-      window_dim->set_base_dilation(1);
-    }
-    // TF data shapes are by default in the NHWC order, and filter shape is by
-    // default in HWIO order. For backward filter convolution, we need to swap
-    // the batch and feature dimension in the activations, and treat the batch
-    // dimension in gradients as the input feature dimension in the filter.
-    //
-    // TODO(jingyue): Add more tests on NCHW input order, which TF also
-    // supports.
-    tf_default_dnums_for_backward_filter_.set_input_batch_dimension(3);
-    tf_default_dnums_for_backward_filter_.set_input_feature_dimension(0);
-    tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(1);
-    tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(2);
-    tf_default_dnums_for_backward_filter_.set_kernel_input_feature_dimension(0);
-    tf_default_dnums_for_backward_filter_.set_kernel_output_feature_dimension(
-        3);
-    tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(1);
-    tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(2);
-    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(0);
-    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(1);
-    tf_default_dnums_for_backward_filter_.set_output_batch_dimension(2);
-    tf_default_dnums_for_backward_filter_.set_output_feature_dimension(3);
-
-    tf_default_dnums_for_backward_input_.set_input_batch_dimension(0);
-    tf_default_dnums_for_backward_input_.set_output_batch_dimension(0);
-    tf_default_dnums_for_backward_input_.set_input_feature_dimension(3);
-    tf_default_dnums_for_backward_input_.set_output_feature_dimension(3);
-    tf_default_dnums_for_backward_input_.add_input_spatial_dimensions(1);
-    tf_default_dnums_for_backward_input_.add_output_spatial_dimensions(1);
-    tf_default_dnums_for_backward_input_.add_input_spatial_dimensions(2);
-    tf_default_dnums_for_backward_input_.add_output_spatial_dimensions(2);
-    tf_default_dnums_for_backward_input_.set_kernel_input_feature_dimension(3);
-    tf_default_dnums_for_backward_input_.set_kernel_output_feature_dimension(2);
-    tf_default_dnums_for_backward_input_.add_kernel_spatial_dimensions(0);
-    tf_default_dnums_for_backward_input_.add_kernel_spatial_dimensions(1);
-  }
-
- protected:
-  bool RunPass(HloModule* module) {
-    return CudnnConvolutionRewriter().Run(module).ValueOrDie();
-  }
-
-  // A convolution window with stride 1 and zero padding. The size fields are
-  // not set.
-  Window default_conv_window_;
-  ConvolutionDimensionNumbers tf_default_dnums_for_backward_filter_;
-  ConvolutionDimensionNumbers tf_default_dnums_for_backward_input_;
-};
-
-TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolve) {
-  HloComputation::Builder builder(TestName());
-  HloInstruction* activations =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "activations"));
-  HloInstruction* gradients =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {1, 1, 2, 1}), "gradients"));
-  Window conv_window = default_conv_window_;
-  conv_window.mutable_dimensions(1)->set_size(2);
-  conv_window.mutable_dimensions(1)->set_window_dilation(2);
-  builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeInference::InferConvolveShape(
-          activations->shape(), gradients->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_filter_)
-          .ConsumeValueOrDie(),
-      activations, gradients, /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  EXPECT_THAT(entry_computation->root_instruction(),
-              op::GetTupleElement(
-                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
-}
-
-TEST_F(CudnnConvolutionRewriterTest,
-       BackwardFilterConvolveEquivalentToForwardConvolution) {
-  HloComputation::Builder builder(TestName());
-  HloInstruction* activations =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "activations"));
-  HloInstruction* gradients =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "gradients"));
-  Window conv_window = default_conv_window_;
-  conv_window.mutable_dimensions(1)->set_size(3);
-  builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeInference::InferConvolveShape(
-          activations->shape(), gradients->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_filter_)
-          .ConsumeValueOrDie(),
-      activations, gradients, /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  EXPECT_THAT(entry_computation->root_instruction(),
-              op::GetTupleElement(
-                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
-}
-
-// Extracted from block35 training.
-TEST_F(CudnnConvolutionRewriterTest,
-       BackwardFilterConvolveWithPaddedActivations) {
-  auto builder = HloComputation::Builder(TestName());
-  HloInstruction* activations =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {20, 35, 35, 32}), "activations"));
-  HloInstruction* gradients =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {20, 35, 35, 32}), "gradients"));
-
-  Window conv_window = default_conv_window_;
-  for (int i = 0; i < 2; ++i) {
-    conv_window.mutable_dimensions(i)->set_size(35);
-    conv_window.mutable_dimensions(i)->set_padding_low(1);
-    conv_window.mutable_dimensions(i)->set_padding_high(1);
-  }
-  builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeUtil::MakeShape(F32, {32, 3, 3, 32}), activations, gradients,
-      /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  EXPECT_THAT(entry_computation->root_instruction(),
-              op::GetTupleElement(
-                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
-}
-
-// Extracted from inception v3 training.
-TEST_F(CudnnConvolutionRewriterTest,
-       BackwardFilterConvolveWithPaddedGradients) {
-  auto builder = HloComputation::Builder(TestName());
-  HloInstruction* activations =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {20, 10, 10, 192}), "activations"));
-  HloInstruction* gradients =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {20, 4, 4, 320}), "gradients"));
-
-  Window conv_window = default_conv_window_;
-  for (int i = 0; i < 2; ++i) {
-    conv_window.mutable_dimensions(i)->set_size(4);
-    conv_window.mutable_dimensions(i)->set_padding_high(-1);
-    conv_window.mutable_dimensions(i)->set_window_dilation(2);
-  }
-  builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeUtil::MakeShape(F32, {320, 3, 3, 192}), activations, gradients,
-      /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  EXPECT_THAT(entry_computation->root_instruction(),
-              op::GetTupleElement(
-                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
-}
-
-TEST_F(CudnnConvolutionRewriterTest, BackwardFilterConvolveWithUnevenPadding) {
-  auto builder = HloComputation::Builder(TestName());
-  HloInstruction* activations =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {20, 35, 35, 32}), "activations"));
-  HloInstruction* gradients =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {20, 35, 35, 32}), "gradients"));
-
-  Window conv_window = default_conv_window_;
-  for (int i = 0; i < 2; ++i) {
-    conv_window.mutable_dimensions(i)->set_size(35);
-    // Uneven padding: padding_low=0, padding_high=1
-    conv_window.mutable_dimensions(i)->set_padding_high(1);
-  }
-  builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeUtil::MakeShape(F32, {32, 2, 2, 32}), activations, gradients,
-      /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  EXPECT_THAT(entry_computation->root_instruction(),
-              op::GetTupleElement(
-                  op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0));
-}
-
-TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveEvenPadding) {
-  auto builder = HloComputation::Builder(TestName());
-  HloInstruction* output =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {4, 5, 16, 16}), "output"));
-  HloInstruction* kernel =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {5, 3, 7, 7}), "kernel"));
-  HloInstruction* reverse_kernel = builder.AddInstruction(
-      HloInstruction::CreateReverse(kernel->shape(), kernel, {2, 3}));
-
-  Window conv_window = default_conv_window_;
-  for (int i = 0; i < 2; ++i) {
-    conv_window.mutable_dimensions(i)->set_size(7);
-    conv_window.mutable_dimensions(i)->set_padding_low(3);
-    conv_window.mutable_dimensions(i)->set_padding_high(3);
-  }
-  ConvolutionDimensionNumbers conv_dnums;
-  conv_dnums.set_input_batch_dimension(0);
-  conv_dnums.set_output_batch_dimension(0);
-  conv_dnums.set_input_feature_dimension(1);
-  conv_dnums.set_output_feature_dimension(1);
-  conv_dnums.add_input_spatial_dimensions(2);
-  conv_dnums.add_output_spatial_dimensions(2);
-  conv_dnums.add_input_spatial_dimensions(3);
-  conv_dnums.add_output_spatial_dimensions(3);
-  conv_dnums.set_kernel_input_feature_dimension(0);
-  conv_dnums.set_kernel_output_feature_dimension(1);
-  conv_dnums.add_kernel_spatial_dimensions(2);
-  conv_dnums.add_kernel_spatial_dimensions(3);
-
-  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeUtil::MakeShape(F32, {4, 3, 16, 16}), /*lhs=*/output,
-      /*rhs=*/reverse_kernel, /*feature_group_count=*/1, conv_window,
-      conv_dnums, DefaultPrecisionConfig(2)));
-  // Verify the convolution's shape is consistent with ShapeInference.
-  CHECK(ShapeUtil::Compatible(
-      conv->shape(), ShapeInference::InferConvolveShape(
-                         output->shape(), reverse_kernel->shape(),
-                         /*feature_group_count=*/1, conv_window, conv_dnums)
-                         .ValueOrDie()));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-
-  ASSERT_THAT(entry_computation->root_instruction(),
-              op::GetTupleElement(
-                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
-  const HloInstruction* custom_call =
-      entry_computation->root_instruction()->operand(0);
-  for (int i = 0; i < 2; ++i) {
-    const WindowDimension& window_dim = custom_call->window().dimensions(i);
-    // Low padding of the backward input convolution
-    //   = kernel_size - 1 - low padding on gradients.
-    EXPECT_EQ(3, window_dim.padding_low());
-    EXPECT_EQ(3, window_dim.padding_high());
-    EXPECT_EQ(1, window_dim.stride());
-  }
-}
-
-// Convolve([abc], [x], base_dilation=2)
-//   = Convolve([abc], Reverse([x]), base_dilation=2)
-//   = BackwardInputConvolve([abc], [x], stride=2)
-TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolve1x1Filter) {
-  auto builder = HloComputation::Builder(TestName());
-  // NHWC dimension order.
-  HloInstruction* output =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "output"));
-  // HWOI dimension order.
-  HloInstruction* kernel =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {1, 1, 1, 1}), "kernel"));
-
-  Window conv_window = default_conv_window_;
-  conv_window.mutable_dimensions(1)->set_base_dilation(2);
-
-  builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeInference::InferConvolveShape(output->shape(), kernel->shape(),
-                                         /*feature_group_count=*/1, conv_window,
-                                         tf_default_dnums_for_backward_input_)
-          .ConsumeValueOrDie(),
-      /*lhs=*/output, /*rhs=*/kernel, /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  EXPECT_THAT(entry_computation->root_instruction(),
-              op::GetTupleElement(
-                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
-}
-
-// BackwardInputConvolve([abc], [x], stride=1) is equivalent to
-// ForwardConvolve([abc], [x], stride=1). No need to fold it into backward input
-// convolution.
-TEST_F(CudnnConvolutionRewriterTest,
-       BackwardInputConvolve1x1FilterEquivalentToForwardConvolve) {
-  auto builder = HloComputation::Builder(TestName());
-  // NHWC dimension order.
-  HloInstruction* output =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "output"));
-  // HWOI dimension order.
-  HloInstruction* kernel =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {1, 1, 1, 1}), "kernel"));
-
-  builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeInference::InferConvolveShape(
-          output->shape(), kernel->shape(), /*feature_group_count=*/1,
-          default_conv_window_, tf_default_dnums_for_backward_input_)
-          .ConsumeValueOrDie(),
-      /*lhs=*/output, /*rhs=*/kernel, /*feature_group_count=*/1,
-      default_conv_window_, tf_default_dnums_for_backward_input_,
-      DefaultPrecisionConfig(2)));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  EXPECT_THAT(
-      entry_computation->root_instruction(),
-      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
-}
-
-// Extracted from Inception V3 training.
-//
-//                                  filter(HWIO)
-//                                  3x3x192x320
-//                                      |
-//                                      v
-//      gradients(NHWC)              reverse
-//        20x4x4x320               3x3x192x320
-//                    \            /
-//                     \          /
-//  conv (NHWC) with padding (low=2,high=3,interior=1)
-//                     20x10x10x192
-//
-// Gradients are padded unevenly.
-TEST_F(CudnnConvolutionRewriterTest,
-       BackwardInputConvolveUnevenPaddingOnGradients) {
-  auto builder = HloComputation::Builder(TestName());
-  HloInstruction* output =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {20, 4, 4, 320}), "output"));
-  HloInstruction* kernel =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {3, 3, 192, 320}), "kernel"));
-  HloInstruction* reverse_kernel = builder.AddInstruction(
-      HloInstruction::CreateReverse(kernel->shape(), kernel, {0, 1}));
-
-  Window conv_window = default_conv_window_;
-  for (int i = 0; i < 2; ++i) {
-    conv_window.mutable_dimensions(i)->set_size(3);
-    conv_window.mutable_dimensions(i)->set_padding_low(2);
-    conv_window.mutable_dimensions(i)->set_padding_high(3);
-    // Interior padding = 1.
-    conv_window.mutable_dimensions(i)->set_base_dilation(2);
-  }
-  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeUtil::MakeShape(F32, {20, 10, 10, 192}), output, reverse_kernel,
-      /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
-  // Verify the convolution's shape is consistent with ShapeInference.
-  CHECK(ShapeUtil::Compatible(
-      conv->shape(),
-      ShapeInference::InferConvolveShape(
-          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_input_)
-          .ValueOrDie()));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  ASSERT_THAT(entry_computation->root_instruction(),
-              op::GetTupleElement(
-                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
-  const HloInstruction* custom_call =
-      entry_computation->root_instruction()->operand(0);
-  for (int i = 0; i < 2; ++i) {
-    const WindowDimension& window_dim = custom_call->window().dimensions(i);
-    EXPECT_EQ(0, window_dim.padding_low());
-    EXPECT_EQ(0, window_dim.padding_high());
-    EXPECT_EQ(2, window_dim.stride());
-  }
-}
-
-// Similar to BackwardInputConvolveUnevenPadding, but the low padding of the
-// gradients exceeds kernel_size - 1. Therefore, this pattern cannot be fused.
-TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveLowPaddingTooLarge) {
-  auto builder = HloComputation::Builder(TestName());
-  HloInstruction* output =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {20, 4, 4, 320}), "output"));
-  HloInstruction* kernel =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {3, 3, 192, 320}), "kernel"));
-  HloInstruction* reverse_kernel = builder.AddInstruction(
-      HloInstruction::CreateReverse(kernel->shape(), kernel, {0, 1}));
-
-  Window conv_window = default_conv_window_;
-  for (int i = 0; i < 2; ++i) {
-    conv_window.mutable_dimensions(i)->set_size(3);
-    conv_window.mutable_dimensions(i)->set_padding_low(3);
-    conv_window.mutable_dimensions(i)->set_padding_high(2);
-    conv_window.mutable_dimensions(i)->set_base_dilation(2);
-  }
-  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeUtil::MakeShape(F32, {20, 10, 10, 192}), output, reverse_kernel,
-      /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
-  // Verify the convolution's shape is consistent with ShapeInference.
-  CHECK(ShapeUtil::Compatible(
-      conv->shape(),
-      ShapeInference::InferConvolveShape(
-          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_input_)
-          .ValueOrDie()));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  EXPECT_THAT(
-      entry_computation->root_instruction(),
-      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
-}
-
-// Extracted from //learning/brain/google/xla/benchmarks/resnet.py
-//
-// For simplicity, we focus on the column dimension and ignore other dimensions.
-// We use [?] to represent the shape instead of the content.
-//
-// Suppose operator FC does
-//   [4] = conv([14], [3], stride=2, padding_high=1)  // Padding::kSame
-//
-// BC = BackwardInput(FC) does:
-//   [14] = conv([7], reverse([3]),
-//               padding_low=2, padding_high=1, base_dilation=2)
-//
-// We should fuse BC even though padding on activations is uneven, because
-// PadInsertion will canonicalize the fusion HLO.
-TEST_F(CudnnConvolutionRewriterTest,
-       BackwardInputConvolveUnevenPaddingOnActivations) {
-  auto builder = HloComputation::Builder(TestName());
-  // The gradients are in NCHW layout.
-  HloInstruction* output =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {1, 1, 7, 1}), "output"));
-  // The kernel is in HWIO layout.
-  HloInstruction* kernel =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {1, 3, 1, 1}), "kernel"));
-  HloInstruction* reverse_kernel = builder.AddInstruction(
-      HloInstruction::CreateReverse(kernel->shape(), kernel, {0, 1}));
-
-  Window conv_window = default_conv_window_;
-  WindowDimension* forward_conv_col_dim = conv_window.mutable_dimensions(1);
-  forward_conv_col_dim->set_size(3);
-  forward_conv_col_dim->set_padding_low(2);
-  forward_conv_col_dim->set_padding_high(1);
-  forward_conv_col_dim->set_base_dilation(2);
-  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeUtil::MakeShape(F32, {1, 1, 14, 1}), output, reverse_kernel,
-      /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
-  // Verify the convolution's shape is consistent with ShapeInference.
-  CHECK(ShapeUtil::Compatible(
-      conv->shape(),
-      ShapeInference::InferConvolveShape(
-          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_input_)
-          .ValueOrDie()));
-
-  auto module = CreateNewModule();
-  const HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  ASSERT_THAT(entry_computation->root_instruction(),
-              op::GetTupleElement(
-                  op::CustomCall(kCudnnConvBackwardInputCallTarget), 0));
-  const WindowDimension& backward_conv_col_dim =
-      entry_computation->root_instruction()->operand(0)->window().dimensions(1);
-  EXPECT_EQ(0, backward_conv_col_dim.padding_low());
-  EXPECT_EQ(1, backward_conv_col_dim.padding_high());
-}
-
-// For simplicity, we focus on the column dimension and ignore other dimensions.
-// We use [?] to represent the shape instead of the content.
-//
-// Suppose operator FC does
-//   [3] = conv([4], [2], padding_low=1, padding_high=-1)
-//
-// BC = BackwardInput(FC) does:
-//   [4] = conv([3], reverse([2]), padding_high=2)
-//
-// We currently don't fuse BC because PadInsertion doesn't support negative
-// padding on the gradients of backward convolution (b/32744257).
-TEST_F(CudnnConvolutionRewriterTest,
-       BackwardInputConvolveNegativePaddingHighOnActivations) {
-  auto builder = HloComputation::Builder(TestName());
-  // The gradients are in NCHW layout.
-  HloInstruction* output =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {1, 1, 3, 1}), "output"));
-  // The kernel is in HWIO layout.
-  HloInstruction* kernel =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(F32, {1, 2, 1, 1}), "kernel"));
-  HloInstruction* reverse_kernel = builder.AddInstruction(
-      HloInstruction::CreateReverse(kernel->shape(), kernel, {0, 1}));
-
-  Window conv_window = default_conv_window_;
-  WindowDimension* forward_conv_col_dim = conv_window.mutable_dimensions(1);
-  forward_conv_col_dim->set_size(2);
-  forward_conv_col_dim->set_padding_high(2);
-  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
-      ShapeUtil::MakeShape(F32, {1, 1, 4, 1}), output, reverse_kernel,
-      /*feature_group_count=*/1, conv_window,
-      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
-  // Verify the convolution's shape is consistent with ShapeInference.
-  CHECK(ShapeUtil::Compatible(
-      conv->shape(),
-      ShapeInference::InferConvolveShape(
-          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_input_)
-          .ValueOrDie()));
-
-  auto module = CreateNewModule();
-  HloComputation* entry_computation =
-      module->AddEntryComputation(builder.Build());
-  EXPECT_TRUE(RunPass(module));
-  EXPECT_THAT(
-      entry_computation->root_instruction(),
-      op::GetTupleElement(op::CustomCall(kCudnnConvForwardCallTarget), 0));
-}
-
-// Check that we will materialize a reversed version of a constant in order to
-// pattern-match a backwards input convolution.
-TEST_F(CudnnConvolutionRewriterTest, BackwardInputConvolveConstantFilter) {
-  Array4D<float> constant_arr(4, 4, 2, 2);
-  constant_arr.FillIota(0);
-  string constant_str =
-      LiteralUtil::CreateR4FromArray4D(constant_arr).ToString();
-  ParseAndVerifyModule(absl::StrFormat(R"(
-    HloModule test
-
-    ENTRY entry_computation {
-      param0 = f32[128,2,16,16]{3,2,1,0} parameter(0)
-      constant = f32[4,4,2,2]{3,2,1,0} constant(%s)
-      ROOT convolution = f32[128,2,32,32]{3,2,1,0} convolution(param0, constant),
-          window={size=4x4 pad=2_2x2_2 lhs_dilate=2x2},
-          dim_labels=bf01_01oi->bf01, feature_group_count=1
-    })",
-                                       constant_str));
-  EXPECT_TRUE(RunPass(&module()));
-  EXPECT_THAT(
-      module().entry_computation()->root_instruction(),
-      op::GetTupleElement(op::CustomCall(kCudnnConvBackwardInputCallTarget, _,
-                                         op::Reverse(op::Constant())),
-                          0));
-}
-
-}  // anonymous namespace
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
deleted file mode 100644
index 89dd1bb272663ac1f6eecbaae070d201d38e44c8..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc
+++ /dev/null
@@ -1,419 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
-#include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/util.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-
-using se::DeviceMemory;
-using se::DeviceMemoryBase;
-using se::Stream;
-using se::dnn::AlgorithmConfig;
-using se::dnn::BatchDescriptor;
-using se::dnn::ConvolutionDescriptor;
-using se::dnn::DataLayout;
-using se::dnn::DimIndex;
-using se::dnn::FilterDescriptor;
-using se::dnn::FilterLayout;
-using se::dnn::ProfileResult;
-
-struct CudnnConvParams {
-  // Here are the fields related to cuDNN's fused convolution. The result thus
-  // is defined as:
-  //   activation(conv_result_scale * conv(x, w) +
-  //       side_input_scale * side_input + broadcast(bias))
-  //
-  // The most common fused conv is conv forward + relu/identity, for example.
-  //
-  // bias_buf is a single-dimensional array, with the length equal to the number
-  // of output features. It'll be broadcasted to the output shape in order to be
-  // added to the final results.
-  //
-  // side_input_buf, if valid, must have the same shape as the output buffer.
-  struct FusionParams {
-    se::dnn::ActivationMode mode;
-    double side_input_scale;
-    se::DeviceMemoryBase bias_buf;
-    se::DeviceMemoryBase side_input_buf;  // nullable
-  };
-
-  CudnnConvKind kind;
-  const Shape* input_shape;
-  const Shape* filter_shape;
-  const Shape* output_shape;
-  se::DeviceMemoryBase input_buf;
-  se::DeviceMemoryBase filter_buf;
-  se::DeviceMemoryBase output_buf;
-  const Window* window;
-  const ConvolutionDimensionNumbers* dnums;
-  int64 feature_group_count;
-  se::dnn::AlgorithmConfig algorithm;
-  double conv_result_scale;
-
-  absl::optional<FusionParams> fusion;
-};
-
-// A StreamExecutor ScratchAllocator that wraps a single XLA allocation,
-// returning it (in its entirety) the first time Allocate() is called.
-class ScratchBufAllocator : public se::ScratchAllocator {
- public:
-  explicit ScratchBufAllocator(se::DeviceMemoryBase scratch)
-      : scratch_(scratch) {}
-
-  ~ScratchBufAllocator() override = default;
-
-  int64 GetMemoryLimitInBytes(se::Stream* /*stream*/) override {
-    return scratch_.size();
-  }
-
-  se::port::StatusOr<DeviceMemory<uint8>> AllocateBytes(
-      se::Stream* stream, int64 byte_size) override {
-    if (allocated_) {
-      return se::port::InternalError(
-          "Can't allocate twice from a ScratchBufAllocator.");
-    }
-    if (byte_size > scratch_.size()) {
-      return se::port::InternalError(absl::StrCat(
-          "Can't allocate ", byte_size,
-          " bytes from a ScratchBufAllocator of size ", scratch_.size()));
-    }
-
-    allocated_ = true;
-    return se::DeviceMemory<uint8>(scratch_);
-  }
-
- private:
-  se::DeviceMemoryBase scratch_;
-  bool allocated_ = false;
-};
-
-template <typename T>
-Status RunCudnnConvolutionImpl(CudnnConvParams params,
-                               se::ScratchAllocator* scratch_allocator,
-                               se::Stream* stream,
-                               se::dnn::ProfileResult* profile_result) {
-  CudnnConvKind kind = params.kind;
-  const Shape& input_shape = *params.input_shape;
-  const Shape& filter_shape = *params.filter_shape;
-  const Shape& output_shape = *params.output_shape;
-  DeviceMemory<T> input_buf(params.input_buf);
-  DeviceMemory<T> filter_buf(params.filter_buf);
-  DeviceMemory<T> output_buf(params.output_buf);
-  const Window& window = *params.window;
-  const ConvolutionDimensionNumbers& dnums = *params.dnums;
-  int64 feature_group_count = params.feature_group_count;
-  AlgorithmConfig algorithm = params.algorithm;
-
-  VLOG(3) << "Convolution Algorithm: " << algorithm.algorithm().algo_id();
-  VLOG(3) << "tensor_ops_enabled: "
-          << algorithm.algorithm().tensor_ops_enabled();
-  VLOG(3) << "Convolution kind: " << CudnnConvKindToString(kind);
-  VLOG(3) << "input shape: " << ShapeUtil::HumanStringWithLayout(input_shape);
-  VLOG(3) << "filter shape: " << ShapeUtil::HumanStringWithLayout(filter_shape);
-  VLOG(3) << "Output shape: " << ShapeUtil::HumanStringWithLayout(output_shape);
-  VLOG(3) << "Window: { " << window.ShortDebugString() << " }";
-  VLOG(3) << "Dim nums: { " << dnums.ShortDebugString() << " }";
-
-  const int num_dimensions = window.dimensions_size();
-  CHECK_LE(num_dimensions, 3);
-  // cuDNN does not support 1D convolutions. We therefore express 1D
-  // convolutions as 2D convolutions where the first spatial dimension is 1.
-  // This matches the behavior of TF (see definition of conv1d in
-  // tensorflow/python/ops/nn_ops.py).
-  const int effective_num_dimensions = std::max(2, num_dimensions);
-
-  CHECK_EQ(primitive_util::NativeToPrimitiveType<T>(),
-           output_shape.element_type())
-      << ShapeUtil::HumanString(output_shape);
-
-  CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size());
-  CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size());
-  CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size());
-  for (const WindowDimension& dim : window.dimensions()) {
-    CHECK_EQ(dim.padding_low(), dim.padding_high());
-  }
-
-  // cuDNN's convolution APIs support the BDYX layout for activations/output and
-  // the OIYX layout for weights.
-  DataLayout input_dl;
-  FilterLayout filter_dl;
-  DataLayout output_dl;
-
-  TF_ASSIGN_OR_RETURN(std::tie(input_dl, filter_dl, output_dl),
-                      XlaConvLayoutsToStreamExecutorLayouts(
-                          dnums, input_shape.layout(), filter_shape.layout(),
-                          output_shape.layout()));
-
-  BatchDescriptor input_descriptor(effective_num_dimensions);
-  input_descriptor.set_layout(input_dl)
-      .set_feature_map_count(
-          input_shape.dimensions(dnums.input_feature_dimension()))
-      .set_count(input_shape.dimensions(dnums.input_batch_dimension()));
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    // Note that the dimensions are reversed. The same holds below.
-    input_descriptor.set_spatial_dim(
-        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
-        input_shape.dimensions(dnums.input_spatial_dimensions(dim)));
-  }
-
-  FilterDescriptor filter_descriptor(effective_num_dimensions);
-  filter_descriptor.set_layout(filter_dl)
-      .set_input_feature_map_count(
-          filter_shape.dimensions(dnums.kernel_input_feature_dimension()))
-      .set_output_feature_map_count(
-          filter_shape.dimensions(dnums.kernel_output_feature_dimension()));
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    filter_descriptor.set_spatial_dim(
-        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
-        filter_shape.dimensions(dnums.kernel_spatial_dimensions(dim)));
-  }
-
-  ConvolutionDescriptor convolution_descriptor(effective_num_dimensions);
-  convolution_descriptor.set_group_count(feature_group_count);
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    convolution_descriptor
-        .set_zero_padding(
-            static_cast<DimIndex>(effective_num_dimensions - dim - 1),
-            window.dimensions(dim).padding_low())
-        .set_filter_stride(
-            static_cast<DimIndex>(effective_num_dimensions - dim - 1),
-            window.dimensions(dim).stride());
-  }
-
-  BatchDescriptor output_descriptor(effective_num_dimensions);
-  output_descriptor.set_layout(output_dl)
-      .set_feature_map_count(
-          output_shape.dimensions(dnums.output_feature_dimension()))
-      .set_count(output_shape.dimensions(dnums.output_batch_dimension()));
-  for (int dim = 0; dim < num_dimensions; ++dim) {
-    output_descriptor.set_spatial_dim(
-        static_cast<DimIndex>(effective_num_dimensions - dim - 1),
-        output_shape.dimensions(dnums.output_spatial_dimensions(dim)));
-  }
-
-  // Add a singleton dimension in the 1D convolution case.
-  if (num_dimensions == 1) {
-    input_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
-    output_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
-    filter_descriptor.set_spatial_dim(static_cast<DimIndex>(0), 1);
-    convolution_descriptor.set_zero_padding(static_cast<DimIndex>(0), 0)
-        .set_filter_stride(static_cast<DimIndex>(0), 1);
-  }
-
-  switch (kind) {
-    case CudnnConvKind::kForward:
-      if (params.conv_result_scale != 1) {
-        return InternalError(
-            "StreamExecutor doesn't support scaled convolution: %lf.",
-            params.conv_result_scale);
-      }
-      stream->ThenConvolveWithAlgorithm(
-          input_descriptor, input_buf, filter_descriptor, filter_buf,
-          convolution_descriptor, output_descriptor, &output_buf,
-          scratch_allocator, algorithm, profile_result);
-      break;
-    case CudnnConvKind::kBackwardInput:
-      if (params.conv_result_scale != 1) {
-        return InternalError(
-            "StreamExecutor doesn't support scaled convolution: %lf.",
-            params.conv_result_scale);
-      }
-      stream->ThenConvolveBackwardDataWithAlgorithm(
-          filter_descriptor, filter_buf, output_descriptor, output_buf,
-          convolution_descriptor, input_descriptor, &input_buf,
-          scratch_allocator, algorithm, profile_result);
-      break;
-    case CudnnConvKind::kBackwardFilter:
-      if (params.conv_result_scale != 1) {
-        return InternalError(
-            "StreamExecutor doesn't support scaled convolution: %lf.",
-            params.conv_result_scale);
-      }
-      stream->ThenConvolveBackwardFilterWithAlgorithm(
-          input_descriptor, input_buf, output_descriptor, output_buf,
-          convolution_descriptor, filter_descriptor, &filter_buf,
-          scratch_allocator, algorithm, profile_result);
-      break;
-    case CudnnConvKind::kForwardActivation: {
-      BatchDescriptor bias_desc;
-      bias_desc.set_count(1)
-          .set_height(1)
-          .set_width(1)
-          .set_feature_map_count(
-              output_shape.dimensions(dnums.output_feature_dimension()))
-          .set_layout(output_dl);
-
-      se::DeviceMemory<T> side_input(params.fusion->side_input_buf);
-      // If there is no side input, use output as the side input.
-      if (side_input.is_null()) {
-        if (params.fusion->side_input_scale != 0) {
-          return InternalError(
-              "Side input scale is not 0, yet no side input buffer is "
-              "provided");
-        }
-        // Since side-input scale is 0, the values in the side input don't
-        // matter.  The simplest thing to do would be to pass in a null buffer
-        // for the side input, but cudnn doesn't allow this.  cudnn does promise
-        // that if side-input-scale is 0 the side input won't be read, so we
-        // just pass in the output buffer, since it's handy and has the correct
-        // size.
-        side_input = output_buf;
-      }
-
-      stream->ThenFusedConvolveWithAlgorithm(
-          input_descriptor, input_buf, params.conv_result_scale,
-          filter_descriptor, filter_buf, convolution_descriptor, side_input,
-          params.fusion->side_input_scale, bias_desc,
-          DeviceMemory<T>(params.fusion->bias_buf), params.fusion->mode,
-          output_descriptor, &output_buf, scratch_allocator, algorithm,
-          profile_result);
-      break;
-    }
-  }
-
-  if (!stream->ok()) {
-    return InternalError(
-        "Unable to launch convolution with type %s and algorithm (%d, %d)",
-        CudnnConvKindToString(kind), algorithm.algorithm().algo_id(),
-        algorithm.algorithm_no_scratch().algo_id());
-  }
-  return Status::OK();
-}
-
-// Returns the cudnn convolution parameters generated from conv, which must be a
-// custom-call to a cudnn convolution.
-StatusOr<CudnnConvParams> GetCudnnConvParams(
-    const HloCustomCallInstruction* conv,
-    absl::Span<se::DeviceMemoryBase> operand_buffers,
-    se::DeviceMemoryBase result_buffer) {
-  CudnnConvParams params;
-
-  TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
-                      conv->backend_config<CudnnConvBackendConfig>());
-  const auto& target = conv->custom_call_target();
-  const auto& lhs_shape = conv->operand(0)->shape();
-  const auto& rhs_shape = conv->operand(1)->shape();
-  const auto& conv_result_shape = conv->shape().tuple_shapes(0);
-
-  params.window = &conv->window();
-  params.dnums = &conv->convolution_dimension_numbers();
-  params.feature_group_count = conv->feature_group_count();
-  params.algorithm = se::dnn::AlgorithmConfig(se::dnn::AlgorithmDesc(
-      backend_config.algorithm(), backend_config.tensor_ops_enabled()));
-  params.conv_result_scale = backend_config.conv_result_scale();
-
-  if (target == kCudnnConvForwardCallTarget) {
-    params.kind = CudnnConvKind::kForward;
-    params.input_shape = &lhs_shape;
-    params.filter_shape = &rhs_shape;
-    params.output_shape = &conv_result_shape;
-    params.input_buf = operand_buffers[0];
-    params.filter_buf = operand_buffers[1];
-    params.output_buf = result_buffer;
-  } else if (target == kCudnnConvBackwardInputCallTarget) {
-    params.kind = CudnnConvKind::kBackwardInput;
-    params.input_shape = &conv_result_shape;
-    params.filter_shape = &rhs_shape;
-    params.output_shape = &lhs_shape;
-    params.input_buf = result_buffer;
-    params.filter_buf = operand_buffers[1];
-    params.output_buf = operand_buffers[0];
-  } else if (target == kCudnnConvBackwardFilterCallTarget) {
-    params.kind = CudnnConvKind::kBackwardFilter;
-    params.input_shape = &lhs_shape;
-    params.filter_shape = &conv_result_shape;
-    params.output_shape = &rhs_shape;
-    params.input_buf = operand_buffers[0];
-    params.filter_buf = result_buffer;
-    params.output_buf = operand_buffers[1];
-  } else if (target == kCudnnConvBiasActivationForwardCallTarget) {
-    params.kind = CudnnConvKind::kForwardActivation;
-    params.input_shape = &lhs_shape;
-    params.filter_shape = &rhs_shape;
-    params.output_shape = &conv_result_shape;
-    params.fusion.emplace();
-    auto& fusion = *params.fusion;
-    if (backend_config.activation_mode() <
-        static_cast<int64>(se::dnn::ActivationMode::kNumActivationModes)) {
-      fusion.mode = static_cast<se::dnn::ActivationMode>(
-          backend_config.activation_mode());
-    } else {
-      return InternalError("Bad activation mode: %s",
-                           backend_config.ShortDebugString());
-    }
-    fusion.side_input_scale = backend_config.side_input_scale();
-    params.input_buf = operand_buffers[0];
-    params.filter_buf = operand_buffers[1];
-    params.output_buf = result_buffer;
-    params.fusion->bias_buf = operand_buffers[2];
-    if (operand_buffers.size() >= 4) {
-      params.fusion->side_input_buf = operand_buffers[3];
-    }
-  } else {
-    return InternalError("Unexpected custom call target: %s", target);
-  }
-  return params;
-}
-
-}  // anonymous namespace
-
-Status RunCudnnConvolution(const HloCustomCallInstruction* conv,
-                           absl::Span<se::DeviceMemoryBase> operand_buffers,
-                           se::DeviceMemoryBase result_buffer,
-                           se::DeviceMemoryBase scratch_buf, se::Stream* stream,
-                           se::dnn::ProfileResult* profile_result) {
-  ScratchBufAllocator scratch_allocator(scratch_buf);
-  return RunCudnnConvolution(conv, operand_buffers, result_buffer,
-                             &scratch_allocator, stream, profile_result);
-}
-
-Status RunCudnnConvolution(const HloCustomCallInstruction* conv,
-                           absl::Span<se::DeviceMemoryBase> operand_buffers,
-                           se::DeviceMemoryBase result_buffer,
-                           se::ScratchAllocator* scratch_allocator,
-                           se::Stream* stream,
-                           se::dnn::ProfileResult* profile_result) {
-  TF_ASSIGN_OR_RETURN(CudnnConvParams params,
-                      GetCudnnConvParams(conv, operand_buffers, result_buffer));
-
-  PrimitiveType output_primitive_type =
-      conv->shape().tuple_shapes(0).element_type();
-  switch (output_primitive_type) {
-    case F16:
-      return RunCudnnConvolutionImpl<Eigen::half>(params, scratch_allocator,
-                                                  stream, profile_result);
-    case F32:
-      return RunCudnnConvolutionImpl<float>(params, scratch_allocator, stream,
-                                            profile_result);
-    case F64:
-      return RunCudnnConvolutionImpl<double>(params, scratch_allocator, stream,
-                                             profile_result);
-    default:
-      LOG(FATAL) << ShapeUtil::HumanString(*params.output_shape);
-  }
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
deleted file mode 100644
index 61aec1ceccec0f253f9ddaa688d64cacea800cf3..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_
-
-#include "absl/types/optional.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_instructions.h"
-#include "tensorflow/compiler/xla/status.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
-
-namespace xla {
-namespace gpu {
-
-// This file contains low-level routines for running cudnn convolutions.
-
-// Calls into cudnn to run the specified convolution.
-//
-// We provide one overload which takes a scratch buffer, and another which takes
-// an allocator which is responsible for allocating the scratch space.  In
-// theory the second one shouldn't be necessary -- users of this function could
-// just ask cudnn how much scratch space it needs for a particular convolution.
-// But in practice, StreamExecutor does not expose such an API, and in the name
-// of parsimony, perhaps it's better not to add it.  Instead, the first time you
-// call a convolution, you should call the version that takes a scratch
-// allocator and take note of how much memory is used.  The next time you call
-// the same conv, you can provide an explicitly preallocated scratch buffer of
-// that size, if you like.
-Status RunCudnnConvolution(const HloCustomCallInstruction* conv,
-                           absl::Span<se::DeviceMemoryBase> operand_buffers,
-                           se::DeviceMemoryBase result_buffer,
-                           se::DeviceMemoryBase scratch_buf, se::Stream* stream,
-                           se::dnn::ProfileResult* profile_result = nullptr);
-
-Status RunCudnnConvolution(const HloCustomCallInstruction* conv,
-                           absl::Span<se::DeviceMemoryBase> operand_buffers,
-                           se::DeviceMemoryBase result_buffer,
-                           se::ScratchAllocator* scratch_allocator,
-                           se::Stream* stream,
-                           se::dnn::ProfileResult* profile_result = nullptr);
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_RUNNER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cde65ad5745a3c102d029907e0690dc8c34620fd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.cc
@@ -0,0 +1,280 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+// Describes a matched pattern:
+//   max(0, alpha1 * conv(x, w) + alpha2 * side_input + broadcast(bias));
+// Where side_input has the shape of output buffer, and bias is a 1D array with
+// the dimension of number of output features.
+struct ConvWithRelu {
+  HloInstruction* maximum;
+  HloCustomCallInstruction* conv;
+  HloInstruction* bias;
+  HloInstruction* side_input;
+  HloConstantInstruction* alpha_conv;
+  HloConstantInstruction* alpha_side_input;
+};
+
+absl::optional<ConvWithRelu> FindConvWithRelu(HloInstruction* instr) {
+  using match::Add;
+  using match::AddAnyOrder;
+  using match::AnyOf;
+  using match::Broadcast;
+  using match::Constant;
+  using match::GetTupleElement;
+  using match::Maximum;
+  using match::MultiplyAnyOrder;
+  using match::Op;
+
+  // The pattern we want to match:
+  //   max(0, alpha1 * conv(x, w) + alpha2 * side_input + broadcast(bias));
+  //
+  // With its variants involving commute/reassociation of adds, multiplies, and
+  // max, and omission of alpha1, side_input, alpha2, or bias.
+
+  HloInstruction* relu_input;
+
+  // Match max(0, relu_input).
+  auto zero_pattern = Broadcast(match::ConstantScalar(0));
+  if (!Match(instr, Maximum(zero_pattern, Op(&relu_input))) &&
+      !Match(instr, Maximum(Op(&relu_input), zero_pattern))) {
+    return absl::nullopt;
+  }
+  HloInstruction* conv_instr = nullptr;
+  HloInstruction* alpha_conv_instr = nullptr;
+  HloInstruction* alpha_side_input_instr = nullptr;
+  HloInstruction* bias_broadcast_instr = nullptr;
+  HloInstruction* bias = nullptr;
+  HloInstruction* side_input = nullptr;
+
+  // These nodes will not be in the returned value, but we need to check them
+  // for single use.
+  HloInstruction *gte = nullptr, *add1 = nullptr, *add2 = nullptr,
+                 *mul1 = nullptr, *mul2 = nullptr;
+
+  const auto bias_pattern = Broadcast(&bias_broadcast_instr, Op(&bias));
+  const auto conv_pattern = [&] {
+    auto alpha_pattern = Broadcast(Constant(&alpha_conv_instr));
+    auto conv_pattern = GetTupleElement(
+        &gte, Op(&conv_instr).WithOpcode(HloOpcode::kCustomCall), 0);
+    return AnyOf<HloInstruction>(
+        MultiplyAnyOrder(&mul1, alpha_pattern, conv_pattern), conv_pattern);
+  }();
+  const auto side_input_pattern = [&] {
+    auto alpha_pattern = Broadcast(Constant(&alpha_side_input_instr));
+    // If bias is already matched, match arbitrary additional input as side
+    // input. Note this may force a cheap operation (e.g. broadcast) to be
+    // materialized into a large buffer, as large as the output buffer.
+    //
+    // TODO(timshen): If in practice there are significant false positives, we
+    // should fix it.
+    auto side_input_pattern = Op(&side_input);
+    return AnyOf<HloInstruction>(
+        MultiplyAnyOrder(&mul2, alpha_pattern, side_input_pattern),
+        side_input_pattern);
+  }();
+
+  {
+    // Try to match any of the following form of add, in any association:
+    //   addends[0]
+    //   addends[0] + addends[1]
+    //   addends[0] + addends[1] + addends[2]
+    //
+    // Then try to match each addend with one of the three patterns: bias, conv,
+    // or side_input. Notice that side_input matching must go last, as it
+    // also matches a conv or a bias.
+    HloInstruction* addends[3] = {nullptr, nullptr, nullptr};
+    auto add3_pattern = [&] {
+      auto add2_pattern = Add(&add1, Op(&addends[0]), Op(&addends[1]));
+      return AnyOf<HloInstruction>(
+          AddAnyOrder(&add2, add2_pattern, Op(&addends[2])), add2_pattern,
+          Op(&addends[0]));
+    }();
+    CHECK(Match(relu_input, add3_pattern));
+    for (auto addend : addends) {
+      if (addend) {
+        if (bias == nullptr && Match(addend, bias_pattern)) {
+          CHECK(bias);
+        } else if (conv_instr == nullptr && Match(addend, conv_pattern)) {
+          CHECK(conv_instr);
+        } else if (side_input == nullptr && Match(addend, side_input_pattern)) {
+          CHECK(side_input);
+        } else {
+          return absl::nullopt;
+        }
+      }
+    }
+  }
+
+  if (conv_instr == nullptr) {
+    return absl::nullopt;
+  }
+
+  for (HloInstruction* instr :
+       {conv_instr, bias_broadcast_instr, gte, add1, add2, mul1, mul2}) {
+    if (instr && instr->user_count() > 1) {
+      return absl::nullopt;
+    }
+  }
+
+  auto conv = Cast<HloCustomCallInstruction>(conv_instr);
+  auto bias_broadcast =
+      CastOrNull<HloBroadcastInstruction>(bias_broadcast_instr);
+
+  if (conv->custom_call_target() != kCudnnConvForwardCallTarget) {
+    return absl::nullopt;
+  }
+
+  if (bias_broadcast) {
+    // TODO(timshen): handle bias_broadcast_instr->dimensions() == {}.
+    if (bias_broadcast_instr->dimensions().size() != 1) {
+      return absl::nullopt;
+    }
+    if (bias_broadcast_instr->dimensions(0) !=
+        conv->convolution_dimension_numbers().output_feature_dimension()) {
+      return absl::nullopt;
+    }
+  }
+
+  return ConvWithRelu{
+      instr,
+      conv,
+      bias,
+      side_input,
+      CastOrNull<HloConstantInstruction>(alpha_conv_instr),
+      CastOrNull<HloConstantInstruction>(alpha_side_input_instr)};
+}
+
+StatusOr<std::unique_ptr<HloInstruction>> TryRewriteToCudnnForwardRelu(
+    ConvWithRelu match) {
+  auto conv = match.conv;
+
+  HloComputation* computation = conv->parent();
+  PrimitiveType element_type = conv->operand(0)->shape().element_type();
+
+  const auto get_alpha_value =
+      [](HloConstantInstruction* instr) -> StatusOr<double> {
+    TF_ASSIGN_OR_RETURN(
+        auto alpha,
+        Cast<HloConstantInstruction>(instr)->literal().Convert(F64));
+    return alpha.GetFirstElement<double>();
+  };
+
+  double alpha_conv = 1;
+  if (match.alpha_conv) {
+    TF_ASSIGN_OR_RETURN(alpha_conv, get_alpha_value(match.alpha_conv));
+  }
+
+  double alpha_side_input;
+  if (match.side_input) {
+    if (match.alpha_side_input) {
+      TF_ASSIGN_OR_RETURN(alpha_side_input,
+                          get_alpha_value(match.alpha_side_input));
+    } else {
+      alpha_side_input = 1;
+    }
+  } else {
+    CHECK(match.alpha_side_input == nullptr);
+    alpha_side_input = 0;
+  }
+
+  auto bias = match.bias;
+  if (!bias) {
+    auto zero = computation->AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::Zero(element_type)));
+
+    int64 num_output_feature = conv->shape().tuple_shapes(0).dimensions(
+        conv->convolution_dimension_numbers().output_feature_dimension());
+    bias = computation->AddInstruction(HloInstruction::CreateBroadcast(
+        ShapeUtil::MakeShapeWithDescendingLayout(element_type,
+                                                 {num_output_feature}),
+        zero, {}));
+  }
+
+  CHECK(bias);
+  std::vector<HloInstruction*> args = {conv->mutable_operand(0),
+                                       conv->mutable_operand(1), bias};
+  if (match.side_input) {
+    args.push_back(match.side_input);
+  }
+  auto new_conv = computation->AddInstruction(HloInstruction::CreateCustomCall(
+      conv->shape(), args, kCudnnConvBiasActivationForwardCallTarget));
+  new_conv->set_window(conv->window());
+  new_conv->set_convolution_dimension_numbers(
+      conv->convolution_dimension_numbers());
+  new_conv->set_metadata(conv->metadata());
+  TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig config,
+                      conv->backend_config<CudnnConvBackendConfig>());
+  config.set_activation_mode(
+      static_cast<int64>(se::dnn::ActivationMode::kRelu));
+  config.set_conv_result_scale(alpha_conv);
+  config.set_side_input_scale(alpha_side_input);
+  TF_RETURN_IF_ERROR(new_conv->set_backend_config(config));
+
+  VLOG(1) << "Replacing convolution " << conv->ToString() << " with "
+          << new_conv->ToString();
+  return HloInstruction::CreateGetTupleElement(conv->shape().tuple_shapes(0),
+                                               new_conv, 0);
+}
+
+}  // namespace
+
+StatusOr<bool> CudnnFusedConvRewriter::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    std::vector<ConvWithRelu> matches;
+    int num_forward_convs = 0;
+    for (auto instr : computation->instructions()) {
+      auto match = FindConvWithRelu(instr);
+      if (match.has_value()) {
+        matches.push_back(*match);
+      }
+      if (auto call = DynCast<HloCustomCallInstruction>(instr)) {
+        if (call->custom_call_target() == kCudnnConvForwardCallTarget) {
+          num_forward_convs++;
+        }
+      }
+    }
+    VLOG(1) << "Identified cuDNN forward conv + relu: " << matches.size()
+            << " out of " << num_forward_convs << " forward convs.";
+    std::vector<std::pair<HloInstruction*, std::unique_ptr<HloInstruction>>>
+        replacements;
+    for (const ConvWithRelu& match : matches) {
+      TF_ASSIGN_OR_RETURN(auto new_instr, TryRewriteToCudnnForwardRelu(match));
+      replacements.push_back({match.maximum, std::move(new_instr)});
+      changed = true;
+    }
+    for (auto& replacement : replacements) {
+      TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+          replacement.first, std::move(replacement.second)));
+    }
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..613ed8dbdc33dfc3684deb5fd3ee8f5b9ea5fc50
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_FUSED_CONV_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_FUSED_CONV_REWRITER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+class CudnnFusedConvRewriter : public HloModulePass {
+ public:
+  absl::string_view name() const override {
+    return "cudnn-fused-convolution-rewriter";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_FUSED_CONV_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b7dd07a50c637d514439bb7a8ec799e4cabfee55
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter_test.cc
@@ -0,0 +1,310 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/strings/str_replace.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+using ::testing::HasSubstr;
+using ::testing::Not;
+
+class CudnnFusedConvRewriterTest : public HloTestBase {
+ protected:
+  string GetOptimizedHlo(absl::string_view hlo_string) {
+    return backend()
+        .compiler()
+        ->RunHloPasses(ParseHloString(hlo_string, GetModuleConfigForTest())
+                           .ConsumeValueOrDie(),
+                       backend().default_stream_executor(),
+                       backend().memory_allocator())
+        .ConsumeValueOrDie()
+        ->ToString();
+  }
+
+  void TestMatchWithAllTypes(absl::string_view hlo_string) {
+    for (absl::string_view type : {"f16", "f32", "f64"}) {
+      const string hlo_with_new_type =
+          absl::StrReplaceAll(hlo_string, {{"TYPE", type}});
+      string optimized_hlo_string = GetOptimizedHlo(hlo_with_new_type);
+      EXPECT_THAT(optimized_hlo_string,
+                  Not(HasSubstr(kCudnnConvForwardCallTarget)));
+      EXPECT_THAT(optimized_hlo_string,
+                  HasSubstr(kCudnnConvBiasActivationForwardCallTarget));
+      EXPECT_TRUE(RunAndCompare(hlo_with_new_type, ErrorSpec{0.01}))
+          << optimized_hlo_string;
+    }
+  }
+
+  void TestNotMatchWithAllTypes(absl::string_view hlo_string) {
+    for (absl::string_view type : {"f16", "f32", "f64"}) {
+      const string hlo_with_new_type =
+          absl::StrReplaceAll(hlo_string, {{"TYPE", type}});
+      string optimized_hlo_string = GetOptimizedHlo(hlo_with_new_type);
+      EXPECT_THAT(optimized_hlo_string, HasSubstr(kCudnnConvForwardCallTarget));
+      EXPECT_THAT(optimized_hlo_string,
+                  Not(HasSubstr(kCudnnConvBiasActivationForwardCallTarget)));
+    }
+  }
+};
+
+TEST_F(CudnnFusedConvRewriterTest, TestConvOnly) {
+  // max(0, conv(x, w));
+  TestMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = TYPE[] constant(0)
+      zeros = TYPE[1,32,9,9] broadcast(zero), dimensions={}
+
+      input = TYPE[1,17,9,9] parameter(0)
+      filter = TYPE[3,3,17,32] parameter(1)
+
+      conv = TYPE[1,32,9,9] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1
+      ROOT relu = TYPE[1,32,9,9] maximum(zeros, conv)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, TestBias) {
+  // max(0, conv(x, w) + bias);
+  TestMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = TYPE[] constant(0)
+      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
+
+      input = TYPE[1,3,3,64] parameter(0)
+      filter = TYPE[3,3,64,64] parameter(1)
+      bias = TYPE[64] parameter(2)
+
+      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
+      broadcasted_bias = TYPE[1,3,3,64] broadcast(bias), dimensions={3}
+      add1 = TYPE[1,3,3,64] add(conv, broadcasted_bias)
+      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add1)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, TestSideInputOnly) {
+  // max(0, conv(x, w) + side_input);
+  TestMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = TYPE[] constant(0)
+      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
+
+      input = TYPE[1,3,3,64] parameter(0)
+      filter = TYPE[3,3,64,64] parameter(1)
+      side_input = TYPE[1,3,3,64] parameter(2)
+
+      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
+      add1 = TYPE[1,3,3,64] add(conv, side_input)
+      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add1)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, TestBiasAndSideInput) {
+  // max(0, conv(x, w) + side_input + bias);
+  TestMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = TYPE[] constant(0)
+      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
+
+      input = TYPE[1,3,3,64] parameter(0)
+      filter = TYPE[3,3,64,64] parameter(1)
+      side_input = TYPE[1,3,3,64] parameter(2)
+      bias = TYPE[64] parameter(3)
+
+      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
+      broadcasted_bias = TYPE[1,3,3,64] broadcast(bias), dimensions={3}
+      add1 = TYPE[1,3,3,64] add(conv, broadcasted_bias)
+      add2 = TYPE[1,3,3,64] add(add1, side_input)
+      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add2)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, TestScaledConv) {
+  // max(0, 0.999994934 * conv(x, w));
+  TestMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = TYPE[] constant(0)
+      zeros = TYPE[1,32,9,9] broadcast(zero), dimensions={}
+      alpha_conv_scalar = TYPE[] constant(0.999994934)
+
+      input = TYPE[1,17,9,9] parameter(0)
+      filter = TYPE[3,3,17,32] parameter(1)
+
+      conv = TYPE[1,32,9,9] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1
+      alpha_conv = TYPE[1,32,9,9] broadcast(alpha_conv_scalar), dimensions={}
+      scaled_conv = TYPE[1,32,9,9] multiply(conv, alpha_conv)
+      ROOT relu = TYPE[1,32,9,9] maximum(zeros, scaled_conv)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, TestScaledConvAndSideInput) {
+  // max(0, conv(x, w) + 0.899994934 * side_input);
+  TestMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = TYPE[] constant(0)
+      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
+      alpha_side_input_scalar = TYPE[] constant(0.899994934)
+      alpha_side_input = TYPE[1,3,3,64] broadcast(alpha_side_input_scalar), dimensions={}
+
+      input = TYPE[1,3,3,64] parameter(0)
+      filter = TYPE[3,3,64,64] parameter(1)
+      side_input = TYPE[1,3,3,64] parameter(2)
+
+      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
+      scaled_side_input = TYPE[1,3,3,64] multiply(side_input, alpha_side_input)
+      add1 = TYPE[1,3,3,64] add(conv, scaled_side_input)
+      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add1)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, TestScaledConvAndScaledSideInput) {
+  // max(0, 0.999994934 * conv(x, w) + 0.899994934 * side_input);
+  TestMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = TYPE[] constant(0)
+      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
+      alpha_conv_scalar = TYPE[] constant(0.999994934)
+      alpha_conv = TYPE[1,3,3,64] broadcast(alpha_conv_scalar), dimensions={}
+      alpha_side_input_scalar = TYPE[] constant(0.899994934)
+      alpha_side_input = TYPE[1,3,3,64] broadcast(alpha_side_input_scalar), dimensions={}
+
+      input = TYPE[1,3,3,64] parameter(0)
+      filter = TYPE[3,3,64,64] parameter(1)
+      side_input = TYPE[1,3,3,64] parameter(2)
+
+      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
+      scaled_conv = TYPE[1,3,3,64] multiply(conv, alpha_conv)
+      scaled_side_input = TYPE[1,3,3,64] multiply(side_input, alpha_side_input)
+      add1 = TYPE[1,3,3,64] add(scaled_conv, scaled_side_input)
+      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add1)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, TestScaledConvAndScaledSideInputWithBias) {
+  // max(0, 0.999994934 * conv(x, w) + 0.899994934 * side_input + bias);
+  TestMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = TYPE[] constant(0)
+      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
+      alpha_conv_scalar = TYPE[] constant(0.999994934)
+      alpha_conv = TYPE[1,3,3,64] broadcast(alpha_conv_scalar), dimensions={}
+      alpha_side_input_scalar = TYPE[] constant(0.899994934)
+      alpha_side_input = TYPE[1,3,3,64] broadcast(alpha_side_input_scalar), dimensions={}
+
+      input = TYPE[1,3,3,64] parameter(0)
+      filter = TYPE[3,3,64,64] parameter(1)
+      side_input = TYPE[1,3,3,64] parameter(2)
+      bias = TYPE[64] parameter(3)
+
+      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
+      scaled_conv = TYPE[1,3,3,64] multiply(conv, alpha_conv)
+      scaled_side_input = TYPE[1,3,3,64] multiply(side_input, alpha_side_input)
+      broadcasted_bias = TYPE[1,3,3,64] broadcast(bias), dimensions={3}
+      add1 = TYPE[1,3,3,64] add(scaled_conv, broadcasted_bias)
+      add2 = TYPE[1,3,3,64] add(add1, scaled_side_input)
+      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add2)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, TestMatchMaxZeroOnly) {
+  // max(0.1, conv(x, w)) shouldn't match.
+  TestNotMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      point_one = TYPE[] constant(0.1)
+      point_ones = TYPE[1,32,9,9] broadcast(point_one), dimensions={}
+
+      input = TYPE[1,17,9,9] parameter(0)
+      filter = TYPE[3,3,17,32] parameter(1)
+
+      conv = TYPE[1,32,9,9] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1
+      ROOT relu = TYPE[1,32,9,9] maximum(point_ones, conv)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, TestMatchBroadcastedBiasOnly) {
+  // max(0, conv(x, w) + side_input1 + side_input2) shouldn't match.
+  TestNotMatchWithAllTypes(R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = TYPE[] constant(0)
+      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
+
+      input = TYPE[1,3,3,64] parameter(0)
+      filter = TYPE[3,3,64,64] parameter(1)
+      side_input1 = TYPE[1,3,3,64] parameter(2)
+      side_input2 = TYPE[1,3,3,64] parameter(3)
+
+      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
+      add1 = TYPE[1,3,3,64] add(conv, side_input2)
+      add2 = TYPE[1,3,3,64] add(add1, side_input1)
+      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add2)
+    })");
+}
+
+TEST_F(CudnnFusedConvRewriterTest, PreservesMetadata) {
+  const char* kHloString = R"(
+    HloModule Test
+
+    ENTRY Test {
+      zero = f32[] constant(0)
+      zeros = f32[1,32,9,9] broadcast(zero), dimensions={}
+
+      input = f32[1,17,9,9] parameter(0)
+      filter = f32[3,3,17,32] parameter(1)
+
+      conv = f32[1,32,9,9] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1, metadata={op_type="foo"}
+      ROOT relu = f32[1,32,9,9] maximum(zeros, conv)
+    })";
+
+  const string optimized_hlo_string =
+      backend()
+          .compiler()
+          ->RunHloPasses(ParseHloString(kHloString, GetModuleConfigForTest())
+                             .ConsumeValueOrDie(),
+                         backend().default_stream_executor(),
+                         backend().memory_allocator())
+          .ConsumeValueOrDie()
+          ->ToString();
+  EXPECT_THAT(
+      optimized_hlo_string,
+      ::testing::ContainsRegex(R"(custom-call.*metadata=\{op_type="foo"\})"));
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.cc
deleted file mode 100644
index 3761c19cfcab10e0c6faa17c2d1d535d706ff6c5..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.cc
+++ /dev/null
@@ -1,278 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.h"
-
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
-#include "tensorflow/compiler/xla/service/pattern_matcher.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-
-// Describes a matched pattern:
-//   max(0, alpha1 * conv(x, w) + alpha2 * side_input + broadcast(bias));
-// Where side_input has the shape of output buffer, and bias is a 1D array with
-// the dimension of number of output features.
-struct ConvWithRelu {
-  HloInstruction* maximum;
-  HloCustomCallInstruction* conv;
-  HloInstruction* bias;
-  HloInstruction* side_input;
-  HloConstantInstruction* alpha_conv;
-  HloConstantInstruction* alpha_side_input;
-};
-
-absl::optional<ConvWithRelu> FindConvWithRelu(HloInstruction* instr) {
-  using match::Add;
-  using match::AddAnyOrder;
-  using match::AnyOf;
-  using match::Broadcast;
-  using match::Constant;
-  using match::GetTupleElement;
-  using match::Maximum;
-  using match::MultiplyAnyOrder;
-  using match::Op;
-
-  // The pattern we want to match:
-  //   max(0, alpha1 * conv(x, w) + alpha2 * side_input + broadcast(bias));
-  //
-  // With its variants involving commute/reassociation of adds, multiplies, and
-  // max, and omission of alpha1, side_input, alpha2, or bias.
-
-  HloInstruction* relu_input;
-
-  // Match max(0, relu_input).
-  auto zero_pattern = Broadcast(match::ConstantScalar(0));
-  if (!Match(instr, Maximum(zero_pattern, Op(&relu_input))) &&
-      !Match(instr, Maximum(Op(&relu_input), zero_pattern))) {
-    return absl::nullopt;
-  }
-  HloInstruction* conv_instr = nullptr;
-  HloInstruction* alpha_conv_instr = nullptr;
-  HloInstruction* alpha_side_input_instr = nullptr;
-  HloInstruction* bias_broadcast_instr = nullptr;
-  HloInstruction* bias = nullptr;
-  HloInstruction* side_input = nullptr;
-
-  // These nodes will not be in the returned value, but we need to check them
-  // for single use.
-  HloInstruction *gte = nullptr, *add1 = nullptr, *add2 = nullptr,
-                 *mul1 = nullptr, *mul2 = nullptr;
-
-  const auto bias_pattern = Broadcast(&bias_broadcast_instr, Op(&bias));
-  const auto conv_pattern = [&] {
-    auto alpha_pattern = Broadcast(Constant(&alpha_conv_instr));
-    auto conv_pattern = GetTupleElement(
-        &gte, Op(&conv_instr).WithOpcode(HloOpcode::kCustomCall), 0);
-    return AnyOf<HloInstruction>(
-        MultiplyAnyOrder(&mul1, alpha_pattern, conv_pattern), conv_pattern);
-  }();
-  const auto side_input_pattern = [&] {
-    auto alpha_pattern = Broadcast(Constant(&alpha_side_input_instr));
-    // If bias is already matched, match arbitrary additional input as side
-    // input. Note this may force a cheap operation (e.g. broadcast) to be
-    // materialized into a large buffer, as large as the output buffer.
-    //
-    // TODO(timshen): If in practice there are significant false positives, we
-    // should fix it.
-    auto side_input_pattern = Op(&side_input);
-    return AnyOf<HloInstruction>(
-        MultiplyAnyOrder(&mul2, alpha_pattern, side_input_pattern),
-        side_input_pattern);
-  }();
-
-  {
-    // Try to match any of the following form of add, in any association:
-    //   addends[0]
-    //   addends[0] + addends[1]
-    //   addends[0] + addends[1] + addends[2]
-    //
-    // Then try to match each addend with one of the three patterns: bias, conv,
-    // or side_input. Notice that side_input matching must go last, as it
-    // also matches a conv or a bias.
-    HloInstruction* addends[3] = {nullptr, nullptr, nullptr};
-    auto add3_pattern = [&] {
-      auto add2_pattern = Add(&add1, Op(&addends[0]), Op(&addends[1]));
-      return AnyOf<HloInstruction>(
-          AddAnyOrder(&add2, add2_pattern, Op(&addends[2])), add2_pattern,
-          Op(&addends[0]));
-    }();
-    CHECK(Match(relu_input, add3_pattern));
-    for (auto addend : addends) {
-      if (addend) {
-        if (bias == nullptr && Match(addend, bias_pattern)) {
-          CHECK(bias);
-        } else if (conv_instr == nullptr && Match(addend, conv_pattern)) {
-          CHECK(conv_instr);
-        } else if (side_input == nullptr && Match(addend, side_input_pattern)) {
-          CHECK(side_input);
-        } else {
-          return absl::nullopt;
-        }
-      }
-    }
-  }
-
-  if (conv_instr == nullptr) {
-    return absl::nullopt;
-  }
-
-  for (HloInstruction* instr :
-       {conv_instr, bias_broadcast_instr, gte, add1, add2, mul1, mul2}) {
-    if (instr && instr->user_count() > 1) {
-      return absl::nullopt;
-    }
-  }
-
-  auto conv = Cast<HloCustomCallInstruction>(conv_instr);
-  auto bias_broadcast =
-      CastOrNull<HloBroadcastInstruction>(bias_broadcast_instr);
-
-  if (conv->custom_call_target() != kCudnnConvForwardCallTarget) {
-    return absl::nullopt;
-  }
-
-  if (bias_broadcast) {
-    // TODO(timshen): handle bias_broadcast_instr->dimensions() == {}.
-    if (bias_broadcast_instr->dimensions().size() != 1) {
-      return absl::nullopt;
-    }
-    if (bias_broadcast_instr->dimensions(0) !=
-        conv->convolution_dimension_numbers().output_feature_dimension()) {
-      return absl::nullopt;
-    }
-  }
-
-  return ConvWithRelu{
-      instr,
-      conv,
-      bias,
-      side_input,
-      CastOrNull<HloConstantInstruction>(alpha_conv_instr),
-      CastOrNull<HloConstantInstruction>(alpha_side_input_instr)};
-}
-
-StatusOr<std::unique_ptr<HloInstruction>> TryRewriteToCudnnForwardRelu(
-    ConvWithRelu match) {
-  auto conv = match.conv;
-
-  HloComputation* computation = conv->parent();
-  PrimitiveType element_type = conv->operand(0)->shape().element_type();
-
-  const auto get_alpha_value =
-      [](HloConstantInstruction* instr) -> StatusOr<double> {
-    TF_ASSIGN_OR_RETURN(
-        auto alpha,
-        Cast<HloConstantInstruction>(instr)->literal().Convert(F64));
-    return alpha.GetFirstElement<double>();
-  };
-
-  double alpha_conv = 1;
-  if (match.alpha_conv) {
-    TF_ASSIGN_OR_RETURN(alpha_conv, get_alpha_value(match.alpha_conv));
-  }
-
-  double alpha_side_input;
-  if (match.side_input) {
-    if (match.alpha_side_input) {
-      TF_ASSIGN_OR_RETURN(alpha_side_input,
-                          get_alpha_value(match.alpha_side_input));
-    } else {
-      alpha_side_input = 1;
-    }
-  } else {
-    CHECK(match.alpha_side_input == nullptr);
-    alpha_side_input = 0;
-  }
-
-  auto bias = match.bias;
-  if (!bias) {
-    auto zero = computation->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::Zero(element_type)));
-
-    int64 num_output_feature = conv->shape().tuple_shapes(0).dimensions(
-        conv->convolution_dimension_numbers().output_feature_dimension());
-    bias = computation->AddInstruction(HloInstruction::CreateBroadcast(
-        ShapeUtil::MakeShapeWithDescendingLayout(element_type,
-                                                 {num_output_feature}),
-        zero, {}));
-  }
-
-  CHECK(bias);
-  std::vector<HloInstruction*> args = {conv->mutable_operand(0),
-                                       conv->mutable_operand(1), bias};
-  if (match.side_input) {
-    args.push_back(match.side_input);
-  }
-  auto new_conv = computation->AddInstruction(HloInstruction::CreateCustomCall(
-      conv->shape(), args, kCudnnConvBiasActivationForwardCallTarget));
-  new_conv->set_window(conv->window());
-  new_conv->set_convolution_dimension_numbers(
-      conv->convolution_dimension_numbers());
-  TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig config,
-                      conv->backend_config<CudnnConvBackendConfig>());
-  config.set_activation_mode(
-      static_cast<int64>(se::dnn::ActivationMode::kRelu));
-  config.set_conv_result_scale(alpha_conv);
-  config.set_side_input_scale(alpha_side_input);
-  TF_RETURN_IF_ERROR(new_conv->set_backend_config(config));
-
-  VLOG(1) << "Rewriting " << conv->name() << " to " << new_conv->name();
-  return HloInstruction::CreateGetTupleElement(conv->shape().tuple_shapes(0),
-                                               new_conv, 0);
-}
-
-}  // namespace
-
-StatusOr<bool> CudnnFusedConvolutionRewriter::Run(HloModule* module) {
-  bool changed = false;
-  for (HloComputation* computation : module->MakeNonfusionComputations()) {
-    std::vector<ConvWithRelu> matches;
-    int num_forward_convs = 0;
-    for (auto instr : computation->instructions()) {
-      auto match = FindConvWithRelu(instr);
-      if (match.has_value()) {
-        matches.push_back(*match);
-      }
-      if (auto call = DynCast<HloCustomCallInstruction>(instr)) {
-        if (call->custom_call_target() == kCudnnConvForwardCallTarget) {
-          num_forward_convs++;
-        }
-      }
-    }
-    VLOG(1) << "Identified cuDNN forward conv + relu: " << matches.size()
-            << " out of " << num_forward_convs << " forward convs.";
-    std::vector<std::pair<HloInstruction*, std::unique_ptr<HloInstruction>>>
-        replacements;
-    for (const ConvWithRelu& match : matches) {
-      TF_ASSIGN_OR_RETURN(auto new_instr, TryRewriteToCudnnForwardRelu(match));
-      replacements.push_back({match.maximum, std::move(new_instr)});
-      changed = true;
-    }
-    for (auto& replacement : replacements) {
-      TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
-          replacement.first, std::move(replacement.second)));
-    }
-  }
-  return changed;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.h b/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.h
deleted file mode 100644
index bd12aadded9dd9e19bc695ddc11e5529931a306a..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_FUSED_CONVOLUTION_REWRITER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_FUSED_CONVOLUTION_REWRITER_H_
-
-#include "tensorflow/compiler/xla/service/hlo_instructions.h"
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
-
-namespace xla {
-namespace gpu {
-
-class CudnnFusedConvolutionRewriter : public HloModulePass {
- public:
-  absl::string_view name() const override {
-    return "cudnn-fused-convolution-rewriter";
-  }
-
-  StatusOr<bool> Run(HloModule* module) override;
-};
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_FUSED_CONVOLUTION_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index c1aaa4bf04ddc31edf723c056805ae5aad994e55..2ab754a471070d5f90a3eaebd0600ff180d2fe5d 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -161,6 +161,16 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatBinaryOp(
   PrimitiveType lhs_input_type = op->operand(0)->shape().element_type();
   PrimitiveType rhs_input_type = op->operand(1)->shape().element_type();
   PrimitiveType output_type = op->shape().element_type();
+  HloOpcode opcode = op->opcode();
+
+  if (hlo_module_config_.debug_options().xla_gpu_enable_fast_min_max() &&
+      (opcode == HloOpcode::kMaximum || opcode == HloOpcode::kMinimum)) {
+    return llvm_ir::EmitCallToIntrinsic(
+        opcode == HloOpcode::kMaximum ? llvm::Intrinsic::maxnum
+                                      : llvm::Intrinsic::minnum,
+        {lhs_value, rhs_value}, {lhs_value->getType()}, b_);
+  }
+
   switch (op->opcode()) {
     case HloOpcode::kRemainder: {
       return EmitLibdeviceMathCall("__nv_fmod", {lhs_value, rhs_value},
@@ -358,13 +368,6 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         const HloInstruction* operand = hlo->operand(0);
         const Window& window = hlo->window();
 
-        // TODO(b/31410564): Implement dilation for reduce-window.
-        if (window_util::HasDilation(window)) {
-          return Unimplemented(
-              "Dilation for reduce-window not implemented on GPU. "
-              "See b/31410564.");
-        }
-
         PrimitiveType operand_element_type = operand->shape().element_type();
         llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry(
             llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_),
@@ -397,9 +400,24 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         for (size_t i = 0; i < index.size(); ++i) {
           llvm::Value* stridden_index = NSWMul(
               index[i], index_typed_const(window.dimensions(i).stride()));
+          input_index[i] = NSWSub(
+              NSWAdd(stridden_index,
+                     NSWMul(window_index[i],
+                            index_typed_const(
+                                window.dimensions(i).window_dilation()))),
+              index_typed_const(window.dimensions(i).padding_low()));
+
+          // We need to verify that we are not in the dilated base area.
+          llvm::Value* dilation_condition = ICmpEQ(
+              SRem(input_index[i],
+                   index_typed_const(window.dimensions(i).base_dilation())),
+              index_typed_const(0));
+          in_bounds = And(in_bounds, dilation_condition);
+
+          // Apply base dilation to the index.
           input_index[i] =
-              NSWSub(NSWAdd(stridden_index, window_index[i]),
-                     index_typed_const(window.dimensions(i).padding_low()));
+              SDiv(input_index[i],
+                   index_typed_const(window.dimensions(i).base_dilation()));
 
           // We must check whether 0 ≤ input_index[i] < bound, as otherwise
           // we are in the pad and so can skip the computation. This
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index 30c1f9088968305ad0207164ecb07ba13cc89ee6..470457935acacb8940af241dadb393d770786939 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -229,7 +229,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) {
         return user->opcode() == HloOpcode::kFusion &&
                (user->fusion_kind() == HloInstruction::FusionKind::kLoop ||
-                (user->fusion_kind() == HloInstruction::FusionKind::kInput &&
+                (IsReduceInputFusion(*user) &&
                  LayoutsAreReduceInputFusionFriendly(*fusion, *user)));
       })) {
     VLOG(3) << "Not merging " << fusion->name()
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index 9c4a4903667ea1a6c99ce9e912c9d0497b8e389f..27f07b1d58125092c1ed6734b238e4ae0f11c4aa 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <functional>
 
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -51,7 +52,8 @@ struct MatrixDescriptor {
 // rhs_matrix, and stores the result to output_matrix.
 template <typename Element>
 bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix,
-            MatrixDescriptor output_matrix, double alpha, se::Stream* stream) {
+            MatrixDescriptor output_matrix, double alpha, double beta,
+            se::Stream* stream) {
   DCHECK(!output_matrix.transpose);
 
   const int64 batch_size = lhs_matrix.batch_size;
@@ -73,7 +75,7 @@ bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix,
             lhs_transpose, rhs_transpose, output_matrix.num_rows,
             output_matrix.num_cols, /*size of reduce dim=*/k, /*alpha=*/alpha,
             lhs_data, /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data,
-            /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/0.0,
+            /*leading dim of RHS=*/rhs_matrix.num_rows, /*beta=*/beta,
             &output_data, /*leading dim of output=*/output_matrix.num_rows)
         .ok();
   }
@@ -88,7 +90,7 @@ bool DoGemm(MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix,
           /*alpha=*/alpha, lhs_data,
           /*leading dim of LHS=*/lhs_matrix.num_rows, lhs_stride, rhs_data,
           /*leading dim of RHS=*/rhs_matrix.num_rows, rhs_stride,
-          /*beta=*/0.0, &output_data,
+          /*beta=*/beta, &output_data,
           /*leading dim of output=*/output_matrix.num_rows, output_stride,
           batch_size)
       .ok();
@@ -112,6 +114,7 @@ template <typename Element>
 bool DoGemmWithAlgorithm(MatrixDescriptor lhs_matrix,
                          MatrixDescriptor rhs_matrix,
                          MatrixDescriptor output_matrix, double alpha,
+                         double beta,
                          se::blas::ComputationType computation_type,
                          se::blas::AlgorithmType algorithm, se::Stream* stream,
                          se::blas::ProfileResult* output_profile_result) {
@@ -138,7 +141,7 @@ bool DoGemmWithAlgorithm(MatrixDescriptor lhs_matrix,
           /*alpha=*/static_cast<Element>(alpha), lhs_data,
           /*leading dim of LHS=*/lhs_matrix.num_rows, rhs_data,
           /*leading dim of RHS=*/rhs_matrix.num_rows,
-          /*beta=*/static_cast<Element>(0.0f), &output_data,
+          /*beta=*/static_cast<Element>(beta), &output_data,
           /*leading dim of output=*/output_matrix.num_rows, computation_type,
           algorithm, output_profile_result)
       .ok();
@@ -153,7 +156,7 @@ bool DoGemmWithAlgorithm(MatrixDescriptor lhs_matrix,
 template <typename Element>
 StatusOr<se::blas::AlgorithmType> DoGemmAutotune(
     MatrixDescriptor lhs_matrix, MatrixDescriptor rhs_matrix,
-    MatrixDescriptor output_matrix, double alpha,
+    MatrixDescriptor output_matrix, double alpha, double beta,
     se::blas::ComputationType computation_type, se::Stream* stream) {
   std::vector<se::blas::AlgorithmType> algorithms;
   CHECK(stream->parent()->GetBlasGemmAlgorithms(&algorithms));
@@ -166,7 +169,7 @@ StatusOr<se::blas::AlgorithmType> DoGemmAutotune(
     // non-null ProfileResult, DoGemmWithAlgorithm should always return true,
     // and the actual success-ness is returned in ProfileResult::is_valid.
     CHECK(DoGemmWithAlgorithm<Element>(lhs_matrix, rhs_matrix, output_matrix,
-                                       alpha, computation_type, algorithm,
+                                       alpha, beta, computation_type, algorithm,
                                        stream, &profile_result));
 
     if (profile_result.is_valid()) {
@@ -263,8 +266,9 @@ DotDimensionNumbers GetDimensionNumbers(const HloInstruction& hlo_instruction) {
   }
   CHECK_EQ(hlo_instruction.opcode(), HloOpcode::kFusion);
   CHECK_EQ(hlo_instruction.fusion_kind(), HloInstruction::FusionKind::kOutput);
-  CHECK_EQ(hlo_instruction.fused_expression_root()->opcode(),
-           HloOpcode::kMultiply);
+  CHECK(hlo_instruction.fused_expression_root()->opcode() == HloOpcode::kAdd ||
+        hlo_instruction.fused_expression_root()->opcode() ==
+            HloOpcode::kMultiply);
   // Try to find the dot inside the output fusion node.
   const HloInstruction* dot =
       hlo_instruction.fused_expression_root()->operand(0);
@@ -282,8 +286,9 @@ GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer,
                      const BufferAllocation::Slice& rhs_buffer,
                      const BufferAllocation::Slice& output_buffer,
                      const Shape& lhs_shape, const Shape& rhs_shape,
-                     const Shape& output_shape, double alpha,
-                     const HloInstruction* hlo_instruction)
+                     const Shape& output_shape, double alpha, double beta,
+                     const HloInstruction* hlo_instruction,
+                     bool implements_whole_instruction)
     : Thunk(Kind::kGemm, hlo_instruction),
       lhs_buffer_(lhs_buffer),
       rhs_buffer_(rhs_buffer),
@@ -291,7 +296,9 @@ GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer,
       lhs_shape_(lhs_shape),
       rhs_shape_(rhs_shape),
       output_shape_(output_shape),
-      alpha_(alpha) {}
+      alpha_(alpha),
+      beta_(beta),
+      implements_whole_instruction_(implements_whole_instruction) {}
 
 Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
                                   se::Stream* stream,
@@ -386,7 +393,7 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
     // TODO(b/112111608): Implement auto tune for batched gemm.
     if (batch_size != 1) {
       return GetGemmFn(element_type)(lhs_matrix, rhs_matrix, output_matrix,
-                                     alpha_, stream);
+                                     alpha_, beta_, stream);
     }
 
     auto thunk_name = [&] {
@@ -398,9 +405,27 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
     auto autotune_it = autotune_results_.find(device_name);
     if (autotune_it == autotune_results_.end()) {
       VLOG(3) << "Starting autotune of GemmThunk " << thunk_name();
-      StatusOr<se::blas::AlgorithmType> best_algorithm =
-          GetGemmAutotuneFn(element_type)(lhs_matrix, rhs_matrix, output_matrix,
-                                          alpha_, computation_type, stream);
+
+      // If the output buffer already contains a bias then autotune into a
+      // scratch buffer. This avoids overwriting the bias buffer. The scratch
+      // buffer may contain arbitrary garbage values.
+      se::DeviceMemoryBase scratch_data = output_data;
+      std::unique_ptr<se::TemporaryDeviceMemory<char>> scratch_mem;
+      if (beta_ != 0.0) {
+        auto temp_status = stream->AllocateTemporaryArray<char>(
+            ShapeUtil::ByteSizeOf(output_shape_));
+        if (!temp_status.ok()) {
+          return false;
+        }
+        scratch_mem = std::move(temp_status).ValueOrDie();
+        scratch_data = scratch_mem->device_memory();
+      }
+      const MatrixDescriptor scratch_descriptor(
+          scratch_data, false, output_num_cols, output_num_rows, batch_size);
+
+      StatusOr<se::blas::AlgorithmType> best_algorithm = GetGemmAutotuneFn(
+          element_type)(lhs_matrix, rhs_matrix, scratch_descriptor, alpha_,
+                        beta_, computation_type, stream);
       autotune_it =
           autotune_results_.insert({device_name, best_algorithm}).first;
 
@@ -421,18 +446,19 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
       VLOG(2) << "Using algorithm " << algorithm
               << " chosen by autotuning on GemmThunk " << thunk_name();
       return GetGemmWithAlgorithmFn(element_type)(
-          lhs_matrix, rhs_matrix, output_matrix, alpha_, computation_type,
-          algorithm, stream,
+          lhs_matrix, rhs_matrix, output_matrix, alpha_, beta_,
+          computation_type, algorithm, stream,
           /*output_profile_result=*/nullptr);
     }
 
     // Autotune will fail when CUDA 8 and GPU sm_50 or older are used.
     // Use the older Gemm API in this case.
     return GetGemmFn(element_type)(lhs_matrix, rhs_matrix, output_matrix,
-                                   alpha_, stream);
+                                   alpha_, beta_, stream);
   };
 
-  auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction());
+  auto op_profiler = profiler->MakeScopedInstructionProfiler(
+      implements_whole_instruction_ ? hlo_instruction() : nullptr);
   bool launch_ok;
   if (LayoutUtil::Minor(output_shape_.layout(), row_dim) == 0) {
     launch_ok = launch(lhs_descriptor, rhs_descriptor,
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index 12c81f9bfc6bfdac63edf9c826b835057107fa41..cc2d12a39c045fc081292dcf53053f6613d3d9ef 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -41,8 +41,9 @@ class GemmThunk : public Thunk {
             const BufferAllocation::Slice& rhs_buffer,
             const BufferAllocation::Slice& output_buffer,
             const Shape& lhs_shape, const Shape& rhs_shape,
-            const Shape& output_shape, double alpha,
-            const HloInstruction* hlo_instruction);
+            const Shape& output_shape, double alpha, double beta,
+            const HloInstruction* hlo_instruction,
+            bool implements_whole_instruction);
 
   GemmThunk(const GemmThunk&) = delete;
   GemmThunk& operator=(const GemmThunk&) = delete;
@@ -70,6 +71,9 @@ class GemmThunk : public Thunk {
   const Shape output_shape_;
 
   const double alpha_;
+  const double beta_;
+
+  const bool implements_whole_instruction_;
 
   // Maps device names (StreamExecutor::DeviceDescription::name()) to autotune
   // results.  The map's value is the best algorithm we've found for this thunk
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 57426327822d95a42f407ed7488f35acfd3623d2..ae2e718db29803a085401969a7d9b09abf690a6c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -51,7 +51,7 @@ GpuExecutable::GpuExecutable(
     const string& ptx, const std::vector<uint8>& cubin,
     std::pair<int, int> compute_capability,
     std::unique_ptr<const ThunkSchedule> thunk_schedule,
-    std::unique_ptr<const HloModule> hlo_module,
+    std::unique_ptr<HloModule> hlo_module,
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 0e276282e40fba0ae4881a51dad0c7c9e8d1c081..2b3c77f5b82aa94f44d8de56caf0f4d31c05e0cb 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -54,7 +54,7 @@ class GpuExecutable : public Executable {
   GpuExecutable(const string& ptx, const std::vector<uint8>& cubin,
                 std::pair<int, int> compute_capability,
                 std::unique_ptr<const ThunkSchedule> thunk_schedule,
-                std::unique_ptr<const HloModule> hlo_module,
+                std::unique_ptr<HloModule> hlo_module,
                 std::unique_ptr<const BufferAssignment> assignment,
                 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index 2d31fd5570c468b0c42fa308535fd335f3588a79..452e763a8eaadc805cd3a3859a68e2a31598fd36 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -55,7 +55,7 @@ bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
   });
 }
 
-bool IsInputFusibleReduction(const HloInstruction& instr) {
+bool IsReduceInputFusion(const HloInstruction& instr) {
   if (instr.IsMultiOutputFusion()) {
     for (const HloInstruction* operand :
          instr.fused_expression_root()->operands()) {
@@ -67,17 +67,70 @@ bool IsInputFusibleReduction(const HloInstruction& instr) {
         return true;
       }
     }
-    return false;
-  } else if (instr.opcode() == HloOpcode::kFusion) {
-    if (IsReductionToVector(*instr.fused_expression_root())) {
-      CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput)
-          << " Fusion rooted at reduction-to-vector op must be of kind kInput: "
-          << instr.ToString();
-      return true;
+  } else if (instr.opcode() == HloOpcode::kFusion &&
+             IsReductionToVector(*instr.fused_expression_root())) {
+    CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput)
+        << " Fusion rooted at reduction-to-vector op must be of kind kInput: "
+        << instr.ToString();
+    return true;
+  }
+  return false;
+}
+
+bool IsInputFusibleReduction(const HloInstruction& instr) {
+  return IsReduceInputFusion(instr) || IsReductionToVector(instr);
+}
+
+bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
+                                          const HloInstruction& instr2) {
+  // Returns the instructions that determines the emitter used for lowering,
+  // sometimes referred to as "the real hero".
+  auto get_real_hero =
+      [&](const HloInstruction* instr) -> const HloInstruction* {
+    if (instr->opcode() == HloOpcode::kFusion) {
+      auto fused_expression_root = instr->fused_expression_root();
+      if (instr->IsMultiOutputFusion()) {
+        // If possible, we want to pick a reduction-to-vector operand of the
+        // fusion root, because it has the most constraints.
+        for (const auto* inst : fused_expression_root->operands()) {
+          if (IsReductionToVector(*inst)) {
+            return inst;
+          }
+        }
+        return fused_expression_root->operands()[0];
+      }
+      return fused_expression_root;
     }
+    return instr;
+  };
+
+  // Multi-output fusion kernels share a common parallel loop. The loop
+  // dimenstions are determined by instruction shapes.
+  auto get_loop_shape = [&](const HloInstruction* element_instr) {
+    // Special-case reduction-to-vector ops: The loop dimensions are determined
+    // by the shape of the first operand.
+    if (IsReductionToVector(*element_instr)) {
+      return element_instr->operand(0)->shape();
+    }
+    return element_instr->shape();
+  };
+
+  // All shapes of the root tuple of multi-output fusions should agree, i.e. all
+  // root ops should have equal output shapes. An exception are
+  // reduction-to-vector ops. Here the input shapes of the reduction (first
+  // operand shape) and the reduction dimensions need to match.
+  auto* instr_1 = get_real_hero(&instr1);
+  auto* instr_2 = get_real_hero(&instr2);
+  // TODO(tjoerg): Relax the shape constraint. The datatype does not matter.
+  if (IsReductionToVector(*instr_1) && IsReductionToVector(*instr_2) &&
+      (!ShapeUtil::Equal(instr_1->shape(), instr_2->shape()) ||
+       instr_1->dimensions() != instr_2->dimensions())) {
     return false;
   }
-  return IsReductionToVector(instr);
+  // The elementwise output shapes must be the same (including layout).
+  // TODO(tjoerg): Further relax the constraint. The datatype does not matter.
+  return ShapeUtil::EqualIgnoringFpPrecision(get_loop_shape(instr_1),
+                                             get_loop_shape(instr_2));
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index f7c24a0d5bbfcc61389ea19ae7f769671e4e974d..e9d7ba1c4cfa865532a0d06c2ed883a2fea4e2cd 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -33,16 +33,29 @@ namespace gpu {
 bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
                                          const HloInstruction& reduce);
 
-// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr`
-// is either an unfused reduction-to-vector op, an input fusion rooted at a
-// reduction-to-vector op, or a multi-output input fusion with at least one
-// reduction-to-vector op root.
 // Note that reduction ops are lowered in different ways. Reduce input fusions
 // are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at
 // reduction-to-vector ops. Other reduction ops are lowered by
 // GpuElementalIrEmitter and fused like elementwise ops.
+
+// Whether `instr` is an input fusion rooted at a reduction-to-vector op or a
+// multi-output input fusion with at least one reduction-to-vector op root.
+bool IsReduceInputFusion(const HloInstruction& instr);
+
+// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr`
+// is either an unfused reduction-to-vector op or a reduce input fusion.
 bool IsInputFusibleReduction(const HloInstruction& instr);
 
+// Whether instruction shapes are compatible for multi-output fusion, i.e.
+// whether the emitters support lowering the resulting fusion.
+// This function works for both, sibling and producer-conumser multi-output
+// fusion.
+// So far, multi-output fusion is supported for loop fusions and reduce
+// input fusions only. It is up to the caller to ensure the instructions
+// themselves are fusible!
+bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
+                                          const HloInstruction& instr2);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
index d91b7bc61fda5a07c163a07ec0e1644d2ad9db49..15d4ee206ce8debcb8a5dbc6ec65d29ba257d302 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
@@ -178,7 +178,7 @@ TEST_F(GpuFusibleTest,
   EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_ReductionToVector) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     ENTRY entry {
       c0 = f32[] parameter(0)
@@ -191,10 +191,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_ElementalReduction) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     ENTRY entry {
       c0 = f32[] parameter(0)
@@ -207,10 +208,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputInputReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -225,10 +227,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_TRUE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputLoopReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -243,10 +246,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputInputReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -263,11 +267,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_TRUE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
 TEST_F(GpuFusibleTest,
-       IsInputFusibleReduction_MultiOutputInputReduceFusionWithExtraOutputs) {
+       IsReduceInputFusion_MultiOutputInputReduceFusionWithExtraOutputs) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -284,10 +289,11 @@ TEST_F(GpuFusibleTest,
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_TRUE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputLoopReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -304,11 +310,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
 TEST_F(GpuFusibleTest,
-       IsInputFusibleReduction_MultiOutputLoopFusionReduceAndElementwiseOp) {
+       IsReduceInputFusion_MultiOutputLoopFusionReduceAndElementwiseOp) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -325,8 +332,304 @@ TEST_F(GpuFusibleTest,
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_LoopFusions) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    fused_computation_2 {
+      p0.2 = f32[6400]{0} parameter(0)
+      const.2 = f32[] constant(1)
+      ROOT div = f32[6400]{0} divide(p0.2, const.2)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      fusion.2 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_2
+      ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_IgnoreFpPrecision) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    fused_computation_2 {
+      p0.2 = f32[6400]{0} parameter(0)
+      ROOT convert = f16[6400]{0} convert(p0.2)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      fusion.2 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_2
+      ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_Reduce) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      const.2 = f32[] constant(0)
+      reduce = f32[] reduce(p0, const.2), dimensions={0}, to_apply=scalar_add
+      ROOT root = (f32[6400]{0}, f32[]) tuple(fusion.1, reduce)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* reduce =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion, *reduce));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_Elementwise) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      const.2 = f32[] constant(1)
+      div = f32[6400]{0} divide(p0, const.2)
+      ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, div)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* div =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion, *div));
+}
+
+TEST_F(GpuFusibleTest,
+       ShapesCompatibleForMultiOutputFusion_MultiOutputLoopFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1)
+      exp = f32[8,1,5,16,1,1]{5,4,3,2,1,0} exponential(p0.1)
+      ROOT tuple = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(mul, exp)
+    }
+
+    fused_computation_2 {
+      p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      const.2 = f32[] constant(0)
+      ROOT add = f32[8,1,5,16,1,1]{5,4,3,2,1,0} add(p0.2, const.2)
+    }
+
+    ENTRY entry {
+      p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      fusion.1 = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) fusion(p0), kind=kLoop, calls=fused_computation_1
+      fusion.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2
+      gte0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=0
+      gte1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=1
+      ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(gte0, gte1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0)->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1)->operand(0);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_UnfusedOps) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      exp = f32[2,2,2]{2,1,0} exponential(p0)
+      reduce = f32[2,2]{1,0} reduce(exp, c0), dimensions={2}, to_apply=scalar_add
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce, exp)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* reduce =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* exp =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*reduce, *exp));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_DifferentLayouts) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{0,1,2} parameter(1)
+      c0 = f32[] constant(0)
+      exp = f32[2,2,2]{2,1,0} exponential(p0)
+      reduce = f32[2,2]{0,1} reduce(p1, c0), dimensions={2}, to_apply=scalar_add
+      ROOT root = (f32[2,2]{0,1}, f32[2,2,2]{2,1,0}) tuple(reduce, exp)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* reduce =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* exp =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*reduce, *exp));
+}
+
+TEST_F(GpuFusibleTest,
+       ShapesCompatibleForMultiOutputFusion_MultiOutputReduceFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_select {
+      p1.1 = f32[2,2,2]{2,1,0} parameter(1)
+      c0 = f32[] constant(0)
+      broadcast = f32[2,2,2]{2,1,0} broadcast(f32[] c0), dimensions={}
+      greater-than = pred[2,2,2]{2,1,0} greater-than(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast)
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      ROOT select = f32[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f32[2,2,2]{2,1,0} p0.1, f32[2,2,2]{2,1,0} broadcast)
+    }
+
+    fused_reduce {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      c1 = f32[] constant(0)
+      r1 = f32[2,2]{1,0} reduce(p0.2, c1), dimensions={2}, to_apply=scalar_add
+      mul = f32[2,2,2]{2,1,0} multiply(p0.2, p0.2)
+      r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=scalar_add
+      ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, r2)
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      select = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_select
+      fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(select), kind=kInput, calls=fused_reduce
+      gte0 = f32[2,2]{1,0} get-tuple-element(fusion), index=0
+      gte1 = f32[2,2]{1,0} get-tuple-element(fusion), index=1
+      ROOT root = (f32[2,2]{1,0}, f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(gte1, gte1, select)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0)->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1)->operand(0);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_ReduceFusions) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_reduce_1 {
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} p0.1, f32[] c0), dimensions={0}, to_apply=scalar_add
+    }
+
+    fused_reduce_2 {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2)
+      c1 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={0}, to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      reduce_1 = f32[2,2]{1,0} fusion(p0), kind=kLoop, calls=fused_reduce_1
+      reduce_2 = f32[2,2]{1,0} fusion(p1), kind=kLoop, calls=fused_reduce_2
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce_1, reduce_2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest,
+       ShapesCompatibleForMultiOutputFusion_DifferentReduceDimensions) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_reduce_1 {
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} p0.1, f32[] c0), dimensions={0}, to_apply=scalar_add
+    }
+
+    fused_reduce_2 {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2)
+      c1 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={2}, to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      reduce_1 = f32[2,2]{1,0} fusion(p0), kind=kLoop, calls=fused_reduce_1
+      reduce_2 = f32[2,2]{1,0} fusion(p1), kind=kLoop, calls=fused_reduce_2
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce_1, reduce_2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest,
+       ShapesCompatibleForMultiOutputFusion_NoReductionToVector) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_element_wise {
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      p1.1 = f32[2,2,2]{2,1,0} parameter(1)
+      ROOT add = f32[2,2,2]{2,1,0} add(p0.1, p1.1)
+    }
+
+    fused_reduce {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2)
+      c1 = f32[] constant(0)
+      // Note that reduce is not a reduction-to-vector.
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={1}, to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      element_wise = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_element_wise
+      fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(element_wise), kind=kLoop, calls=fused_reduce
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(fusion, element_wise)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
index 02a0d028c118aba23996f9b97d05443bb4a00c88..1126943624a3771433ecac591545d335c1890115 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
@@ -37,12 +37,12 @@ class GpuHloOrdering : public PredecessorHloOrdering {
  public:
   GpuHloOrdering(const HloModule* module,
                  const StreamAssignment& stream_assignment,
-                 const std::vector<const HloInstruction*>& thunk_launch_order);
+                 const std::vector<HloInstruction*>& thunk_launch_order);
   ~GpuHloOrdering() override = default;
 
   // Only the entry computation can possibly be sequentially ordered, and only
   // if we've assigned all instructions to a single stream.
-  const std::vector<const HloInstruction*>* SequentialOrder(
+  const HloInstructionSequence* SequentialOrder(
       const HloComputation& computation) const override {
     return &computation == module_->entry_computation() ? entry_sequence_.get()
                                                         : nullptr;
@@ -51,17 +51,17 @@ class GpuHloOrdering : public PredecessorHloOrdering {
   string ToString() const override { return ToStringHelper("GpuHloOrdering"); }
 
  private:
-  std::unique_ptr<std::vector<const HloInstruction*>> entry_sequence_;
+  std::unique_ptr<HloInstructionSequence> entry_sequence_;
 };
 
 GpuHloOrdering::GpuHloOrdering(
     const HloModule* module, const StreamAssignment& stream_assignment,
-    const std::vector<const HloInstruction*>& thunk_launch_order)
+    const std::vector<HloInstruction*>& thunk_launch_order)
     : PredecessorHloOrdering(module) {
   // The entry computation has a total order when there's only one stream.
   if (stream_assignment.StreamCount() == 1) {
-    entry_sequence_ = absl::make_unique<std::vector<const HloInstruction*>>(
-        thunk_launch_order);
+    entry_sequence_ =
+        absl::make_unique<HloInstructionSequence>(thunk_launch_order);
   }
 
   // The ordering of instructions for the entry computation is determined by the
@@ -124,7 +124,8 @@ GpuHloOrdering::GpuHloOrdering(
   for (auto* computation : module->computations()) {
     if (computation != module->entry_computation() &&
         !computation->IsFusionComputation()) {
-      predecessors_.emplace(computation, computation->ComputeReachability());
+      predecessors_.emplace(computation,
+                            HloReachabilityMap::Build(computation));
     }
   }
 }
@@ -149,7 +150,7 @@ GpuHloOrdering::GpuHloOrdering(
 // However, if the total order is A,B,D,C,E, then C and E can run
 // concurrently.
 void BFSLaunchOrder(const HloComputation* computation,
-                    std::vector<const HloInstruction*>* launch_order) {
+                    std::vector<HloInstruction*>* launch_order) {
   // This topological sort uses two data structures:
   // 1. `incoming_edge_count` which keeps track of the number of incoming
   // edges to each HLO;
@@ -157,9 +158,9 @@ void BFSLaunchOrder(const HloComputation* computation,
   //
   // The sorting algorithm repeatedly pops the top from the queue and deletes
   // that HLO from the graph, making more HLOs incoming-edge free.
-  std::deque<const HloInstruction*> queue;
+  std::deque<HloInstruction*> queue;
   std::unordered_map<const HloInstruction*, int64> incoming_edge_count;
-  for (const auto& hlo : computation->instructions()) {
+  for (auto* hlo : computation->instructions()) {
     if (hlo->operand_count() == 0) {
       queue.push_back(hlo);
     } else {
@@ -171,10 +172,10 @@ void BFSLaunchOrder(const HloComputation* computation,
   }
 
   while (!queue.empty()) {
-    const HloInstruction* x = queue.front();
+    HloInstruction* x = queue.front();
     queue.pop_front();
     launch_order->push_back(x);
-    for (const HloInstruction* y : x->users()) {
+    for (HloInstruction* y : x->users()) {
       --incoming_edge_count[y];
       if (incoming_edge_count[y] == 0) {
         queue.push_back(y);
@@ -194,14 +195,14 @@ StatusOr<std::unique_ptr<GpuHloSchedule>> GpuHloSchedule::Build(
   std::unique_ptr<GpuHloSchedule> schedule(new GpuHloSchedule);
 
   // Initialize thunk_launch_order_, the total order of thunk launches.
-  const HloComputation* entry_computation = module.entry_computation();
+  HloComputation* entry_computation = module.entry_computation();
   if (stream_assignment.StreamCount() == 1) {
     // All kernels are launched on a single stream, so there's no loss of
     // concurrency by optimizing for minimal memory usage.
     TF_ASSIGN_OR_RETURN(
         HloInstructionSequence sequence,
         ScheduleComputation(
-            *entry_computation, [pointer_size](const BufferValue& buffer) {
+            entry_computation, [pointer_size](const BufferValue& buffer) {
               return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size);
             }));
     schedule->thunk_launch_order_ = sequence.instructions();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
index 07a7fc67aa555845c3de57e574ab582403ec0490..7f224ffe4f03f8f05b0f1907628d99d9df387770 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
@@ -46,7 +46,7 @@ class GpuHloSchedule {
 
   // Returns the total order of thunk launches, represented in terms of HLO
   // instructions.
-  const std::vector<const HloInstruction*>& ThunkLaunchOrder() const {
+  const std::vector<HloInstruction*>& ThunkLaunchOrder() const {
     return thunk_launch_order_;
   }
 
@@ -60,7 +60,7 @@ class GpuHloSchedule {
  private:
   GpuHloSchedule();
 
-  std::vector<const HloInstruction*> thunk_launch_order_;
+  std::vector<HloInstruction*> thunk_launch_order_;
   std::unique_ptr<HloOrdering> hlo_ordering_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
index b857fa775a76ec999b505a2a64332cc0c54cf00b..91db7151f22fd75b20244878bee86d65acd1d304 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -24,16 +24,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
 namespace gpu {
 
-class GpuHloScheduleTest : public HloVerifiedTestBase {
+class GpuHloScheduleTest : public HloTestBase {
  protected:
-  using HloVec = std::vector<const HloInstruction*>;
+  using HloVec = std::vector<HloInstruction*>;
 
   // Pre-canned shapes.
   Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2});
@@ -44,7 +44,7 @@ class GpuHloScheduleTest : public HloVerifiedTestBase {
         .ConsumeValueOrDie();
   }
 
-  std::unique_ptr<HloModule> CreateNewModule() {
+  std::unique_ptr<HloModule> CreateNewVerifiedModule() {
     HloModuleConfig config;
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
@@ -79,7 +79,7 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) {
   HloInstruction* dot2 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(dot2));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
@@ -139,7 +139,7 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) {
   HloInstruction* add3 = builder.AddInstruction(
       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(add3));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
@@ -209,7 +209,7 @@ TEST_F(GpuHloScheduleTest, ConcurrentMatMul) {
   HloInstruction* add =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, dot2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(add));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
@@ -288,7 +288,7 @@ TEST_F(GpuHloScheduleTest, LatticeMatMul) {
   HloInstruction* d40 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(d40));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc
index 27a4d0b601f3807fe6b94dd6171a44f292921ede..b511155f85fb24adc1828cbef7f3fb60778ef7ab 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
@@ -25,7 +25,7 @@ namespace {
 
 using ::testing::HasSubstr;
 
-class GpuHloSupportCheckerTest : public HloVerifiedTestBase {
+class GpuHloSupportCheckerTest : public HloTestBase {
  protected:
   GpuHloSupportChecker& checker() { return checker_; }
 
@@ -42,10 +42,10 @@ TEST_F(GpuHloSupportCheckerTest, Add) {
       HloInstruction::CreateParameter(1, scalar_shape, "param1"));
   builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param0, param1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  TF_ASSERT_OK(checker().Run(module).status());
+  TF_ASSERT_OK(checker().Run(module.get()).status());
 }
 
 TEST_F(GpuHloSupportCheckerTest, SparseUnimplemented) {
@@ -57,10 +57,13 @@ TEST_F(GpuHloSupportCheckerTest, SparseUnimplemented) {
       HloInstruction::CreateParameter(1, sparse_shape, "param1"));
   builder.AddInstruction(HloInstruction::CreateBinary(
       sparse_shape, HloOpcode::kAdd, param0, param1));
-  auto module = CreateNewModule();
+  // Since verifier is reporting sparse layouts as errors, we should
+  // use a regular HloModule instead of VerifiedHloModule to avoid
+  // verifier errors being triggered in the destructor.
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  Status status = checker().Run(module).status();
+  Status status = checker().Run(module.get()).status();
   ASSERT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
   EXPECT_THAT(status.error_message(),
               HasSubstr("GPU backend does not support"));
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 74352f26aa9c3a2ca597da21735438df92f863ab..f59da2caa18646676297e66dd329c66fb5fddf1b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/service/gpu/gpu_options.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
@@ -66,8 +65,8 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
 
   VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString();
 
-  // Empirically we've found with Volta and cudnn 7 that backward-input convs
-  // with stride are significantly faster with NCHW layouts.
+  // Empirically we've found with Volta and cudnn <= 7.3 that backward-input
+  // convs with stride are significantly faster with NCHW layouts.
   //
   // We could have used a mixed layout combination, e.g. (NHWC, NCHW, NCHW),
   // which on paper gives good performance. However, there are two observations:
@@ -76,11 +75,17 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
   // * we've also observed that for mixed layouts, cuDNN transposes data back
   //   and forth from a different layout combination. If we end up with
   //   transposes anyway, we prefer to have them in XLA, as they can be fused.
-  // TODO(timshen): Figure out the exact condition. This may be achieved by
-  // auto-tuning layouts offline.
-  if (instr->custom_call_target() == kCudnnConvBackwardInputCallTarget &&
-      window_util::HasStride(instr->window())) {
-    return kAllNCHW;
+  if (auto* dnn = stream_executor->AsDnn()) {
+    auto version_status = dnn->GetVersion();
+    if (version_status.ok()) {
+      auto version = version_status.ConsumeValueOrDie();
+      if (std::make_tuple(version.major_version(), version.minor_version()) <=
+              std::make_tuple(7, 3) &&
+          instr->custom_call_target() == kCudnnConvBackwardInputCallTarget &&
+          window_util::HasStride(instr->window())) {
+        return kAllNCHW;
+      }
+    }
   }
 
   // For other Volta f16 convolutions, use NHWC.
@@ -125,14 +130,8 @@ Status GpuLayoutAssignment::AddBackendConstraintsToDnnConvCustomCall(
     DataLayout input;
     FilterLayout filter;
     DataLayout output;
-    if (ConvUseLayoutHeuristic(instr->GetModule()->config())) {
-      std::tie(input, filter, output) =
-          HeuristicLayoutAssignment(instr, stream_executor_);
-    } else {
-      input = DataLayout::kBatchDepthYX;
-      filter = FilterLayout::kOutputInputYX;
-      output = DataLayout::kBatchDepthYX;
-    }
+    std::tie(input, filter, output) =
+        HeuristicLayoutAssignment(instr, stream_executor_);
 
     TF_ASSIGN_OR_RETURN(
         std::tie(*input_shape->mutable_layout(),
@@ -215,21 +214,37 @@ Status GpuLayoutAssignment::AddBackendConstraints(
           constraints->SetOperandLayout(op1_shape, instruction, 1));
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(output_shape, instruction));
+    } else if (instruction->opcode() == HloOpcode::kSort &&
+               ShapeUtil::Rank(instruction->operand(0)->shape()) > 1) {
+      // Make sure that all the operands and the output(s) have the same layout.
+      Shape keys_shape = instruction->operand(0)->shape();
+      Layout keys_layout =
+          LayoutUtil::GetDefaultLayoutForRank(ShapeUtil::Rank(keys_shape));
+      for (int64 i = 0; i < instruction->operand_count(); ++i) {
+        Shape shape = instruction->operand(i)->shape();
+        *shape.mutable_layout() = keys_layout;
+        TF_RETURN_IF_ERROR(
+            constraints->SetOperandLayout(shape, instruction, i));
+        const LogicalBuffer* output_buffer;
+        if (ShapeUtil::IsArray(instruction->shape())) {
+          TF_ASSIGN_OR_RETURN(
+              output_buffer,
+              constraints->points_to_analysis().GetBufferDefinedAt(instruction,
+                                                                   {}));
+        } else {
+          TF_ASSIGN_OR_RETURN(
+              output_buffer,
+              constraints->points_to_analysis().GetBufferDefinedAt(instruction,
+                                                                   {i}));
+        }
+        TF_RETURN_IF_ERROR(
+            constraints->SetBufferLayout(keys_layout, *output_buffer));
+      }
     }
   }
   return Status::OK();
 }
 
-bool GpuLayoutAssignment::CustomCallRequiresMajorFirstLayout(
-    const HloInstruction* instruction) {
-  // - Inputs to cudnn batchnorm custom calls don't need the major-first layout
-  //   (i.e. {n, n-1, ...0}) -- we can handle any layout.
-  // - Inputs to cudnn convolution require custom layouts handled in
-  //   AddBackendConstraints.
-  return !IsCustomCallToDnnBatchNorm(*instruction) &&
-         !IsCustomCallToDnnConvolution(*instruction);
-}
-
 Status GpuLayoutAssignment::PropagateOperandConstraint(
     const OperandLayoutConstraint& layout_constraint,
     LayoutConstraints* constraints) {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
index 4ba7989e9cba9abe6cdc1fcabd5f011bd9cfb0ec..6a48e55fd2e784f80a50f4565107db177fb43bfc 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
@@ -46,8 +46,6 @@ class GpuLayoutAssignment : public LayoutAssignment {
   Status PropagateBufferConstraint(
       const BufferLayoutConstraint& buffer_constraint,
       LayoutConstraints* constraints) override;
-  bool CustomCallRequiresMajorFirstLayout(
-      const HloInstruction* instruction) override;
 
  private:
   Status AddBackendConstraintsToDnnConvCustomCall(
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index 04681cfcec792d86eed95585262691932b07b269..2ffc8bfb49b205dced0d540ba72426e72d95e596 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -61,7 +61,7 @@ TEST_F(LayoutAssignmentTest, Elementwise) {
             HloInstruction::CreateParameter(1, ashape, "y"));
         auto add = builder.AddInstruction(
             HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, x, y));
-        auto module = CreateNewModule();
+        auto module = CreateNewVerifiedModule();
         HloComputation* computation =
             module->AddEntryComputation(builder.Build(add));
 
@@ -148,7 +148,7 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) {
           {operand, scale, offset, mean, variance, epsilon, feature_index},
           kCudnnBatchNormForwardInferenceCallTarget));
 
-      auto module = CreateNewModule();
+      auto module = CreateNewVerifiedModule();
       HloComputation* computation =
           module->AddEntryComputation(builder.Build(batchnorm));
 
@@ -217,7 +217,7 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) {
           batchnorm_shape, {operand, scale, offset, epsilon, feature_index},
           kCudnnBatchNormForwardTrainingCallTarget));
 
-      auto module = CreateNewModule();
+      auto module = CreateNewVerifiedModule();
       HloComputation* computation =
           module->AddEntryComputation(builder.Build(batchnorm));
 
@@ -298,7 +298,7 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) {
                  feature_index},
                 kCudnnBatchNormBackwardCallTarget));
 
-        auto module = CreateNewModule();
+        auto module = CreateNewVerifiedModule();
         HloComputation* computation =
             module->AddEntryComputation(builder.Build(batchnorm));
 
@@ -351,7 +351,8 @@ TEST_F(LayoutAssignmentTest, DotLayout) {
                           ParseHloString(hlo_text));
 
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      module->entry_computation()->ComputeProgramShape(),
+      /*ignore_layouts=*/false);
   GpuLayoutAssignment layout_assignment(
       &computation_layout, LayoutAssignment::InstructionCanChangeLayout,
       backend().default_stream_executor());
@@ -364,6 +365,34 @@ TEST_F(LayoutAssignmentTest, DotLayout) {
                       op::ShapeWithLayout(expected_shape)));
 }
 
+TEST_F(LayoutAssignmentTest, SortLayout) {
+  const char* hlo_text = R"(
+  HloModule SortLayout
+  ENTRY sort {
+    keys = f32[3,2]{0,1} constant(f32[3,2]{0,1}{{0,1},{0,1},{0,1}})
+    values = f32[2,3]{1,0} parameter(0)
+    transpose = f32[3,2]{1,0} transpose(values), dimensions={1,0}
+    ROOT sort = (f32[3,2]{1,0}, f32[3,2]{1,0}) sort(keys, transpose),
+      dimensions={1}
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_text));
+
+  ComputationLayout computation_layout(
+      module->entry_computation()->ComputeProgramShape(),
+      /*ignore_layouts=*/false);
+  GpuLayoutAssignment layout_assignment(
+      &computation_layout, LayoutAssignment::InstructionCanChangeLayout,
+      backend().default_stream_executor());
+  EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie());
+
+  Shape expected_shape = ShapeUtil::MakeShapeWithLayout(F32, {3, 2}, {1, 0});
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Sort(op::ShapeWithLayout(expected_shape),
+                       op::ShapeWithLayout(expected_shape)));
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_options.cc b/tensorflow/compiler/xla/service/gpu/gpu_options.cc
deleted file mode 100644
index 35b4b4e20b633792de4251a4b0e89f4b579053ce..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/gpu_options.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/gpu_options.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-
-namespace xla {
-namespace gpu {
-
-bool ConvUseLayoutHeuristic(const HloModuleConfig& config) {
-  return !config.debug_options().xla_backend_extra_options().count(
-      "xla_gpu_experimental_conv_disable_layout_heuristic");
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_options.h b/tensorflow/compiler/xla/service/gpu/gpu_options.h
deleted file mode 100644
index 498d4a94955cb2c50e0b165f28ded44ac1c0bfff..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/gpu_options.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_OPTIONS_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_OPTIONS_H_
-
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
-
-// Helper functions for querying options that are specific to the GPU backend.
-
-namespace xla {
-namespace gpu {
-
-// Returns true if we should use heuristics to assign convolution layouts, as
-// opposed to always assigning NCHW.
-bool ConvUseLayoutHeuristic(const HloModuleConfig& config);
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_OPTIONS_H_
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index b61f0387392d2301109a484ca5c1f65f18882265..6151dd8ff4c92bb81bd756c68cc9377633c8c9d5 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -47,6 +47,8 @@ bool IsFusible(const HloInstruction& hlo) {
          hlo.opcode() == HloOpcode::kReduce ||
          hlo.opcode() == HloOpcode::kReduceWindow ||
          hlo.opcode() == HloOpcode::kReshape ||
+         hlo.opcode() == HloOpcode::kReverse ||
+         hlo.opcode() == HloOpcode::kScatter ||
          hlo.opcode() == HloOpcode::kSlice ||
          hlo.opcode() == HloOpcode::kTranspose;
 }
@@ -78,7 +80,7 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
 // This function limits the maximum number of operands to a fusion.
 //
 // There's a cap on how many parameters we can pass to a CUDA kernel, but
-// exactly what that limit is is hazy, as it depends on (among other things) how
+// exactly what that limit is hazy, as it depends on (among other things) how
 // much GPU constant memory is in use for other purposes.
 //
 // Moreover, we don't even know at the point that we're running fusion how many
@@ -178,6 +180,11 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
           IsIEEEFloatingPointScalarConstant(alpha->operand(0))) {
         return true;
       }
+    } else if (consumer->operand_count() == 2 &&
+               consumer->opcode() == HloOpcode::kAdd &&
+               consumer->operand(other_operand_index) != producer) {
+      // Fuse a bias add into the output of the dot.
+      return true;
     }
   }
 
@@ -223,6 +230,11 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
     return false;
   }
 
+  // Scatter is only supported at the root of a kInput fusion.
+  if (producer->opcode() == HloOpcode::kScatter) {
+    return false;
+  }
+
   // Do not fuse into reduce input fusions if the resulting kernel would suffer
   // from poor data locality (due to unfriendly input layouts).
   if (IsInputFusibleReduction(*consumer) &&
@@ -246,12 +258,17 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
     return false;
   }
 
-  // Fuse scalar constants into loop fusion nodes, this reduces the number of
+  // Fuse scalar constants into loop fusion nodes. This reduces the number of
   // parameters and makes matching scalar broadcasts easier.
-  if (ShapeUtil::IsEffectiveScalar(producer->shape()) &&
-      consumer->opcode() == HloOpcode::kFusion &&
-      producer->opcode() == HloOpcode::kConstant) {
-    return true;
+  //
+  // Don't fuse other constants: Unfused constants in GPU land can be
+  // represented as an external constant (i.e. not emitted in LLVM IR / PTX),
+  // but fused constants are handled by shrared CPU/GPU code and always emitted
+  // in the IR/PTX.  The external constant representation makes for faster
+  // compiles and significantly smaller assembly code.
+  if (producer->opcode() == HloOpcode::kConstant) {
+    return ShapeUtil::IsEffectiveScalar(producer->shape()) &&
+           consumer->opcode() == HloOpcode::kFusion;
   }
 
   if (!IsFusible(*producer) || !IsFusible(*consumer) ||
@@ -285,7 +302,8 @@ bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer,
 
 HloInstruction::FusionKind GpuInstructionFusion::ChooseKind(
     const HloInstruction* producer, const HloInstruction* consumer) {
-  if (IsReductionToVector(*consumer)) {
+  if (IsReductionToVector(*consumer) ||
+      consumer->opcode() == HloOpcode::kScatter) {
     return HloInstruction::FusionKind::kInput;
   }
   if (producer->opcode() == HloOpcode::kDot ||
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 96bfe0c12eb9cd6ef25804d6b34767471616f7e4..688604cd36e5a45debf855aacd29d05ecda92341 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -41,7 +41,7 @@ TEST_F(InstructionFusionTest,
       builder.AddInstruction(HloInstruction::CreateBroadcast(
           ShapeUtil::MakeShape(S32, {1}), exp1, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(broadcast2, computation->root_instruction());
   EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -61,7 +61,7 @@ TEST_F(InstructionFusionTest,
       builder.AddInstruction(HloInstruction::CreateBroadcast(
           ShapeUtil::MakeShape(S32, {1}), negate1, {0}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(broadcast2, computation->root_instruction());
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -80,7 +80,7 @@ TEST_F(InstructionFusionTest,
   HloInstruction* reshape2 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), exp1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape2, computation->root_instruction());
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -99,7 +99,7 @@ TEST_F(InstructionFusionTest,
   HloInstruction* transpose2 = builder.AddInstruction(
       HloInstruction::CreateTranspose(ShapeUtil::MakeShape(S32, {}), exp1, {}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(transpose2, computation->root_instruction());
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfDotUnfused) {
   auto reshape2 = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(S32, {1, 1, 1}), dot1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape2, computation->root_instruction());
   EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -134,7 +134,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) {
   auto transpose2 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(S32, {1, 1}), dot1, {0, 1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(transpose2, computation->root_instruction());
   EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -331,6 +331,56 @@ TEST_F(InstructionFusionTest, DotOutputFusion) {
                    op::Broadcast(op::Constant())));
 }
 
+TEST_F(InstructionFusionTest, DotOutputFusionBiasAdd) {
+  auto module = ParseHloString(R"(
+  HloModule test_module
+  ENTRY OutputFusion {
+    alpha = f32[] constant(3)
+    broadcast = f32[4,4]{1,0} broadcast(alpha), dimensions={}
+    p0 = f32[4,3]{1,0} parameter(0)
+    p1 = f32[4,3]{1,0} parameter(1)
+    p2 = f32[4,4]{1,0} parameter(2)
+    transpose = f32[3,4]{1,0} transpose(p1), dimensions={1, 0}
+    dot = f32[4,4]{1,0} dot(p0, transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    ROOT add = f32[4,4] add(dot, p2)
+  })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kOutput);
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Add(op::Dot(op::Parameter(), op::Transpose(op::Parameter())),
+                      op::Parameter()));
+}
+
+TEST_F(InstructionFusionTest,
+       DotOperationFusion_DontOutputFuseDuplicateOperands) {
+  absl::string_view module_string = R"(
+HloModule module
+
+ENTRY main {
+  a = f32[50,60]{1,0} parameter(0)
+  b = f32[60,1]{1,0} parameter(1)
+  c = f32[50,1]{1,0} dot(a, b), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT d = f32[50,1]{1,0} add(c, c)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_string));
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool fused_something,
+      GpuInstructionFusion(/*may_duplicate=*/false).Run(module.get()));
+  EXPECT_FALSE(fused_something);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              Not(op::Fusion()));
+}
+
 // Compute sum(1/p0), where p0 has type f32, twice.  Check that the division is
 // duplicated and fused into both reduces.
 TEST_F(InstructionFusionTest, FloatingPointDivIsCheap) {
@@ -696,7 +746,7 @@ TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
     sum = b.AddInstruction(
         HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sum, param));
   }
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(b.Build());
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
                   .Run(module.get())
@@ -709,5 +759,95 @@ TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
   }
 }
 
+TEST_F(InstructionFusionTest, FuseIntoScatter) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY FuseIntoScatter {
+      p0 = s32[3,3] parameter(0)
+      operand = s32[3,3] add(p0, p0)
+      p1 = s32[2] parameter(1)
+      indices = s32[2] add(p1, p1)
+      p2 = s32[2,3] parameter(2)
+      updates = s32[2,3] add(p2, p2)
+      scatter = s32[3,3] scatter(operand, indices, updates),
+          to_apply=add,
+          update_window_dims={1},
+          inserted_window_dims={0},
+          scatter_dims_to_operand_dims={0},
+          index_vector_dim=1
+      ROOT add = s32[3,3] add(scatter, scatter)
+    })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Add(op::Fusion(), op::Fusion()));
+  EXPECT_EQ(root->operand(0)->fusion_kind(),
+            HloInstruction::FusionKind::kInput);
+  EXPECT_THAT(root->operand(0)->fused_expression_root(),
+              op::Scatter(op::Add(), op::Add(), op::Add()));
+}
+
+TEST_F(InstructionFusionTest, NonscalarConstantsNotFused) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    add {
+      lhs = f32[] parameter(0)
+      rhs = f32[] parameter(1)
+      ROOT add = f32[] add(lhs, rhs)
+    }
+
+    ENTRY BroadcastIntoReduce {
+      constant = f32[16] constant({0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15})
+      broadcast = f32[16,16,16,16]{3,2,1,0} broadcast(constant), dimensions={0}
+      constant.1 = f32[] constant(0)
+      ROOT reduce = f32[] reduce(broadcast, constant.1), dimensions={0,1,2,3},
+                                                         to_apply=add
+    })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+  // The f32[16] constant should not be fused into the reduce, but the f32[]
+  // constant should be.
+  auto* root = module->entry_computation()->root_instruction();
+  ASSERT_THAT(root, op::Fusion());
+  EXPECT_THAT(root->fused_instructions_computation()->root_instruction(),
+              op::Reduce(op::Broadcast(op::Parameter()), op::Constant()));
+}
+
+TEST_F(InstructionFusionTest, FuseReverse) {
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    ENTRY Reverse {
+      p0 = f32[50,96,1024]{2,1,0} parameter(0)
+      add = f32[50,96,1024]{2,1,0} add(p0, p0)
+      ROOT reverse = f32[50,96,1024] reverse(add), dimensions={0}
+    })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Reverse(op::Add(op::Parameter(), op::Parameter())));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index ec3d8f9405840bb7be97ba5cd5725a4ac68a15a8..33e41a2782b5932430eea621d3cea2c6634f292f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -38,10 +38,9 @@ namespace gpu {
 
 namespace {
 
-// Return whether the given shape is a matrix with no padding.
-bool IsRank2WithNoPadding(const Shape& shape, int64 batch_dimensions_size) {
-  return ShapeUtil::Rank(shape) == batch_dimensions_size + 2 &&
-         !LayoutUtil::IsPadded(shape);
+// Return whether the given shape is rank 2 excluding the batch dimensions.
+bool IsRank2(const Shape& shape, int64 batch_dimensions_size) {
+  return ShapeUtil::Rank(shape) == batch_dimensions_size + 2;
 }
 
 // In a gemm operation where output = lhs * rhs, check whether the given shapes
@@ -56,10 +55,9 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
   bool type_is_allowed =
       (output_primitive_type == F16 || output_primitive_type == F32 ||
        output_primitive_type == F64 || output_primitive_type == C64);
-  return type_is_allowed &&
-         IsRank2WithNoPadding(lhs_shape, batch_dimensions_size) &&
-         IsRank2WithNoPadding(rhs_shape, batch_dimensions_size) &&
-         IsRank2WithNoPadding(output_shape, batch_dimensions_size) &&
+  return type_is_allowed && IsRank2(lhs_shape, batch_dimensions_size) &&
+         IsRank2(rhs_shape, batch_dimensions_size) &&
+         IsRank2(output_shape, batch_dimensions_size) &&
          !ShapeUtil::IsZeroElementArray(lhs_shape) &&
          !ShapeUtil::IsZeroElementArray(rhs_shape);
 }
@@ -93,7 +91,8 @@ bool ImplementedAsGemm(const HloInstruction& hlo) {
 
   if (hlo.opcode() == HloOpcode::kFusion &&
       hlo.fusion_kind() == HloInstruction::FusionKind::kOutput &&
-      hlo.fused_expression_root()->opcode() == HloOpcode::kMultiply) {
+      (hlo.fused_expression_root()->opcode() == HloOpcode::kMultiply ||
+       hlo.fused_expression_root()->opcode() == HloOpcode::kAdd)) {
     // Try to find the dot inside the output fusion node.
     const HloInstruction* dot = hlo.fused_expression_root()->operand(0);
     if (dot->opcode() != HloOpcode::kDot) {
@@ -269,5 +268,17 @@ string CudnnConvKindToString(CudnnConvKind kind) {
   }
 }
 
+llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b) {
+  return b->CreateAnd(
+      b->CreateICmpEQ(
+          b->getInt32(0),
+          llvm_ir::EmitCallToIntrinsic(
+              llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b)),
+      b->CreateICmpEQ(
+          b->getInt32(0),
+          llvm_ir::EmitCallToIntrinsic(
+              llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b)));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index a64a616ab1329422d0197f4a7f99ec557a95f8ed..ebf4d926b7a280e10b09a2532caba7ad6ab3ceb2 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -108,9 +108,9 @@ bool IsCustomCallToDnnBatchNorm(const HloInstruction& hlo);
 // memory used by cudnn.  Callers shouldn't inspect scratch_memory, as its value
 // is not well-defined.
 //
-// CudnnConvolutionRewriter lowers kConvolution HLOs to these custom calls.
+// CudnnConvRewriter lowers kConvolution HLOs to these custom calls.
 // When it does so, it chooses algorithm -1 and 0 bytes of scratch space.  Later
-// on in the pipeline, CudnnConvolutionAlgorithmChooser chooses an explicit
+// on in the pipeline, CudnnConvAlgorithmChooser chooses an explicit
 // algorithm for each conv and sets the amount of scratch space needed.
 //
 // (Representing the scratch memory as an output may seem strange at first, but
@@ -155,6 +155,10 @@ llvm::Value* EmitPrintf(absl::string_view fmt,
 llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset,
                                      llvm::IRBuilder<>* builder);
 
+// Emits code that determines whether the current thread is thread 0 within
+// block 0 of the kernel.
+llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index b7c37bcf3ca910f10d18339dfe7f1d29f2a55c9e..6693f66d62d8b04d1b78e001fdb515b34539c67f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -63,9 +63,6 @@ IrEmitter::IrEmitter(const HloModuleConfig& hlo_module_config,
                 &ir_emitter_context->buffer_assignment(), &b_, module_,
                 is_nested),
       hlo_module_config_(hlo_module_config) {
-  b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
-      /*fast_math_enabled=*/hlo_module_config.debug_options()
-          .xla_gpu_enable_fast_math()));
 }
 
 Status IrEmitter::DefaultAction(HloInstruction* hlo) {
@@ -97,6 +94,18 @@ Status IrEmitter::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
+Status IrEmitter::HandleAddDependency(HloInstruction* add_dependency) {
+  VLOG(2) << "HandleAddDependency: " << add_dependency->ToString();
+  const HloInstruction* operand = add_dependency->operand(0);
+  // Add_Dependency is a no-op, but we still want to bind it to an llvm::Value
+  // sometimes, e.g., when it's operand is a constant or a bitcast of a
+  // constant.
+  if (bindings_.BoundToIrValue(*operand)) {
+    bindings_.BindHloToIrValue(*add_dependency, GetBasePointer(*operand));
+  }
+  return Status::OK();
+}
+
 Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) {
   auto operand = get_tuple_element->operand(0);
   CHECK(bindings_.BoundToIrValue(*operand));
@@ -179,6 +188,21 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation(
   bool is_atomic_integral = element_type == S32 || element_type == U32 ||
                             element_type == S64 || element_type == U64;
   llvm::Value* source = Load(source_address, "source");
+
+  // kCopy of RHS -> atomic store.
+  if (root_opcode == HloOpcode::kCopy &&
+      (element_type == F32 || is_atomic_integral) &&
+      computation.root_instruction()->operand(0)->opcode() ==
+          HloOpcode::kParameter &&
+      computation.root_instruction()->operand(0)->parameter_number() == 1) {
+    llvm::StoreInst* store = Store(source, output_address);
+    store->setAtomic(llvm::AtomicOrdering::Unordered);
+    // Derive a minimum alignment from the type. The optimizer can increase it
+    // later.
+    store->setAlignment(ShapeUtil::ByteSizeOfPrimitiveType(element_type));
+    return true;
+  }
+
   if (root_opcode == HloOpcode::kAdd) {
     // NVPTX supports atomicAdd on F32 and integer types.
     if (element_type == F32) {
@@ -480,18 +504,10 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   TF_RET_CHECK(!ShapeUtil::IsScalar(lhs_shape) &&
                !ShapeUtil::IsScalar(rhs_shape));
 
-  // Reduce along the last dimension of the LHS and the second-to-last dimension
-  // of the RHS. Vectors are a special case where the reduction dimension is 0
-  // for both LHS and RHS. This results in a vector dot product producing a
-  // scalar.
-  const int64 lhs_reduction_dimension =
-      ShapeUtil::GetDimensionNumber(lhs_shape, -1);
-  const int64 rhs_reduction_dimension =
-      ShapeUtil::Rank(rhs_shape) >= 2 + dnums.lhs_batch_dimensions_size()
-          ? ShapeUtil::GetDimensionNumber(rhs_shape, -2)
-          : dnums.lhs_batch_dimensions_size();
-
-  // Check that the batch dims don't cover the last two dims.
+  const int64 lhs_reduction_dimension = dnums.lhs_contracting_dimensions(0);
+  const int64 rhs_reduction_dimension = dnums.rhs_contracting_dimensions(0);
+
+  // Check that the batch dims don't cover the reduction dimensions.
   for (int64 batch_dim : dnums.lhs_batch_dimensions()) {
     CHECK_NE(lhs_reduction_dimension, batch_dim);
     CHECK_NE(rhs_reduction_dimension, batch_dim);
@@ -499,7 +515,11 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
 
   // Verify the reduction dimension in the two operands are the same size.
   TF_RET_CHECK(lhs_shape.dimensions(lhs_reduction_dimension) ==
-               rhs_shape.dimensions(rhs_reduction_dimension));
+               rhs_shape.dimensions(rhs_reduction_dimension))
+      << "lhs_shape.dimensions(" << lhs_reduction_dimension
+      << ") = " << lhs_shape.dimensions(lhs_reduction_dimension)
+      << ", and rhs_shape.dimensions(" << rhs_reduction_dimension
+      << ") = " << rhs_shape.dimensions(rhs_reduction_dimension);
 
   // Create loop nests which loop through the LHS operand dimensions and the RHS
   // operand dimensions. The reduction dimension of the LHS and RHS are handled
@@ -686,15 +706,11 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
 Status IrEmitter::HandleFusion(HloInstruction* fusion) {
   // kFusion for library calls should be handled by
   // IrEmitterUnnested::HandleFusion.
-  CHECK(HloInstruction::FusionKind::kLoop == fusion->fusion_kind());
-
-  std::vector<llvm_ir::IrArray> parameter_arrays;
-  for (HloInstruction* operand : fusion->operands()) {
-    parameter_arrays.push_back(GetIrArray(*operand, *fusion));
-  }
+  CHECK_EQ(HloInstruction::FusionKind::kLoop, fusion->fusion_kind());
   GpuElementalIrEmitter elemental_emitter(hlo_module_config_, module_, &b_,
                                           GetNestedComputer());
-  FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter);
+  FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(fusion),
+                               &elemental_emitter);
   TF_RETURN_IF_ERROR(fusion->fused_expression_root()->Accept(&fused_emitter));
 
   return EmitTargetElementLoop(*fusion, fused_emitter.GetRootGenerator());
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 880520148005838cc25a5be9e26c8bc9028a70ce..2da46c016935d0e927879bbfb0d05cfc4899d818 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -68,6 +68,9 @@ namespace gpu {
 class IrEmitter : public DfsHloVisitorWithDefault,
                   public IrBuilderMixin<IrEmitter> {
  public:
+  using GeneratorForOperandIrArrays =
+      std::function<std::vector<llvm_ir::IrArray>()>;
+
   IrEmitter(const IrEmitter&) = delete;
   IrEmitter& operator=(const IrEmitter&) = delete;
 
@@ -97,6 +100,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   Status HandleBatchNormInference(HloInstruction* batch_norm) override;
   Status HandleBatchNormTraining(HloInstruction* batch_norm) override;
   Status HandleBatchNormGrad(HloInstruction* batch_norm) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   Status FinishVisit(HloInstruction* root) override { return Status::OK(); }
 
@@ -179,6 +183,20 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   // Hlo configuration data used during code generation.
   const HloModuleConfig& hlo_module_config_;
 
+ protected:
+  GeneratorForOperandIrArrays GetGeneratorForOperandIrArrays(
+      HloInstruction* fusion) {
+    return [=]() {
+      std::vector<llvm_ir::IrArray> ir_arrays;
+      ir_arrays.reserve(fusion->operand_count());
+      absl::c_transform(fusion->operands(), std::back_inserter(ir_arrays),
+                        [&](const HloInstruction* operand) {
+                          return GetIrArray(*operand, *fusion);
+                        });
+      return ir_arrays;
+    };
+  }
+
  private:
   // A helper method for EmitAtomicOperationForNestedComputation. Certain
   // computations, such as floating-point addition and integer maximization, can
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index c792dd2ddb0faeba076548ba104aa291e0814140..fb040aff30d48bf5817946ce53d37bc6685941e4 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
@@ -43,7 +44,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h"
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
 #include "tensorflow/compiler/xla/service/gpu/fft_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/for_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h"
@@ -64,11 +65,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
@@ -87,6 +88,8 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+using llvm_ir::KernelMappingScheme;
+
 namespace {
 
 using absl::InlinedVector;
@@ -336,34 +339,26 @@ llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
 }  // namespace
 
 Status IrEmitterUnnested::DefaultAction(HloInstruction* hlo) {
-  int unroll_factor = 1;
-  // Unfused elementwise operations are usually memory bound, unroll them.
-  if (hlo->IsElementwise()) {
-    unroll_factor = ComputeMaxUnrollFactor(hlo);
-  }
-
-  thunk_sequence_->emplace_back(BuildKernelThunk(
-      hlo, /*implements_whole_instruction=*/true, unroll_factor));
   return IrEmitter::DefaultAction(hlo);
 }
 
 Status IrEmitterUnnested::HandleDot(HloInstruction* dot) {
   if (ImplementedAsGemm(*dot)) {
-    thunk_sequence_->emplace_back(BuildGemmThunk(dot));
+    AddThunkToThunkSequence(BuildGemmThunk(dot));
     return Status::OK();
   }
-  thunk_sequence_->emplace_back(
+  AddThunkToThunkSequence(
       BuildKernelThunk(dot, /*implements_whole_instruction=*/true));
   return IrEmitter::HandleDot(dot);
 }
 
 Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) {
-  thunk_sequence_->emplace_back(BuildConditionalThunk(conditional));
+  AddThunkToThunkSequence(BuildConditionalThunk(conditional));
   return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) {
-  thunk_sequence_->emplace_back(
+  AddThunkToThunkSequence(
       BuildKernelThunk(convolution, /*implements_whole_instruction=*/true));
   return IrEmitter::HandleConvolution(convolution);
 }
@@ -385,7 +380,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     CHECK(feature_index->IsConstant());
     int64 feature_index_value = feature_index->literal().Get<int64>({});
 
-    thunk_sequence_->emplace_back(
+    AddThunkToThunkSequence(
         absl::make_unique<CudnnBatchNormForwardInferenceThunk>(
             /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
             /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
@@ -415,7 +410,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     auto output_data = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie();
     auto output_mean = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
     auto output_inv_stddev = assn.GetUniqueSlice(custom_call, {2}).ValueOrDie();
-    thunk_sequence_->emplace_back(
+    AddThunkToThunkSequence(
         absl::make_unique<CudnnBatchNormForwardTrainingThunk>(
             /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
             /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
@@ -446,20 +441,19 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     auto output_grad_scale = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
     auto output_grad_offset =
         assn.GetUniqueSlice(custom_call, {2}).ValueOrDie();
-    thunk_sequence_->emplace_back(
-        absl::make_unique<CudnnBatchNormBackwardThunk>(
-            /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
-            /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
-            /*mean=*/GetAllocationSlice(*custom_call->operand(2)),
-            /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)),
-            /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)),
-            /*epsilon=*/epsilon_value,
-            /*feature_index=*/feature_index_value,
-            /*output_grad_data=*/output_grad_data,
-            /*output_grad_scale=*/output_grad_scale,
-            /*output_grad_offset=*/output_grad_offset,
-            /*output_tuple=*/GetAllocationSlice(*custom_call),
-            /*hlo=*/custom_call));
+    AddThunkToThunkSequence(absl::make_unique<CudnnBatchNormBackwardThunk>(
+        /*operand=*/GetAllocationSlice(*custom_call->operand(0)),
+        /*scale=*/GetAllocationSlice(*custom_call->operand(1)),
+        /*mean=*/GetAllocationSlice(*custom_call->operand(2)),
+        /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)),
+        /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)),
+        /*epsilon=*/epsilon_value,
+        /*feature_index=*/feature_index_value,
+        /*output_grad_data=*/output_grad_data,
+        /*output_grad_scale=*/output_grad_scale,
+        /*output_grad_offset=*/output_grad_offset,
+        /*output_tuple=*/GetAllocationSlice(*custom_call),
+        /*hlo=*/custom_call));
     return Status::OK();
   }
 
@@ -474,7 +468,7 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     auto conv_result_slice = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie();
     auto scratch_slice = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
 
-    thunk_sequence_->emplace_back(absl::make_unique<ConvolutionThunk>(
+    AddThunkToThunkSequence(absl::make_unique<ConvolutionThunk>(
         Cast<HloCustomCallInstruction>(custom_call), std::move(operand_slices),
         conv_result_slice, scratch_slice, tuple_result_slice));
     return Status::OK();
@@ -487,19 +481,68 @@ Status IrEmitterUnnested::HandleFft(HloInstruction* fft) {
   TF_RET_CHECK(
       LayoutUtil::IsMonotonicWithDim0Major(fft->operand(0)->shape().layout()));
   TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(fft->shape().layout()));
-  thunk_sequence_->emplace_back(BuildFftThunk(fft));
+  AddThunkToThunkSequence(BuildFftThunk(fft));
   return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
   HloInstruction* root = fusion->fused_expression_root();
-  // HandleFusion specializes reduction from a multi-dimensional array to a 1D
-  // array. The specialized version requires a initializer thunk that
-  // initializes the output array to the initial value of the reduce.
   if (HloInstruction::FusionKind::kInput == fusion->fusion_kind()) {
     switch (root->opcode()) {
+      case HloOpcode::kScatter: {
+        std::vector<std::unique_ptr<Thunk>> thunks;
+        // The initialization from 'operand' is using different loop bounds, so
+        // emit it in a separate kernel. Treat it like a loop fusion, writing to
+        // the output buffer.
+        {
+          int unroll_factor = ComputeMaxUnrollFactor(fusion);
+          thunks.push_back(BuildKernelThunk(
+              fusion, /*implements_whole_instruction=*/false, unroll_factor));
+
+          GpuElementalIrEmitter operand_elemental_emitter(
+              hlo_module_config_, ir_emitter_context_->llvm_module(), &b_,
+              GetNestedComputer());
+          FusedIrEmitter operand_fused_emitter(
+              GetGeneratorForOperandIrArrays(fusion),
+              &operand_elemental_emitter);
+          TF_RETURN_IF_ERROR(
+              root->mutable_operand(0)->Accept(&operand_fused_emitter));
+
+          TF_RETURN_IF_ERROR(EmitTargetElementLoopInThunk(
+              *fusion, operand_fused_emitter.GetGenerator(root->operand(0)),
+              static_cast<KernelThunk*>(thunks.back().get())));
+        }
+
+        // Now build the actual scatter, reading and writing to the freshly
+        // filled output buffer.
+        {
+          thunks.push_back(
+              BuildKernelThunk(fusion,
+                               /*implements_whole_instruction=*/false));
+          // Spin up a new fused emitter for the scatter kernel and emit it.
+          GpuElementalIrEmitter scatter_elemental_emitter(
+              hlo_module_config_, ir_emitter_context_->llvm_module(), &b_,
+              GetNestedComputer());
+          FusedIrEmitter scatter_fused_emitter(
+              GetGeneratorForOperandIrArrays(fusion),
+              &scatter_elemental_emitter);
+          TF_RETURN_IF_ERROR(root->Accept(&scatter_fused_emitter));
+          TF_RETURN_IF_ERROR(EmitScatter(
+              thunks.back().get(), root,
+              /*scatter_indices_gen=*/
+              scatter_fused_emitter.GetGenerator(root->operand(1)),
+              /*updates_gen=*/
+              scatter_fused_emitter.GetGenerator(root->operand(2))));
+        }
+        AddThunkToThunkSequence(
+            absl::make_unique<SequentialThunk>(std::move(thunks), fusion));
+        return Status::OK();
+      }
       case HloOpcode::kTuple:
       case HloOpcode::kReduce: {
+        // HandleFusion specializes reduction from a multi-dimensional array to
+        // a 1D array. The specialized version requires a initializer thunk that
+        // initializes the output array to the initial value of the reduce.
         if (root->opcode() == HloOpcode::kReduce &&
             ShapeUtil::IsTuple(root->shape())) {
           // TODO(b/112040122): Support variadic reduce.
@@ -528,18 +571,13 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
           }
         }
         CHECK(first_reduce != nullptr);
-        thunks.push_back(
-            BuildKernelThunk(fusion, /*implements_whole_instruction=*/false));
-        thunk_sequence_->emplace_back(
-            absl::make_unique<SequentialThunk>(std::move(thunks), fusion));
-        std::vector<IrArray> parameter_arrays;
-        for (HloInstruction* operand : fusion->operands()) {
-          parameter_arrays.push_back(GetIrArray(*operand, *fusion));
-        }
+        std::unique_ptr<KernelThunk> kernel_thunk =
+            BuildKernelThunk(fusion, /*implements_whole_instruction=*/false);
         GpuElementalIrEmitter elemental_emitter(
             hlo_module_config_, ir_emitter_context_->llvm_module(), &b_,
             GetNestedComputer());
-        FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter);
+        FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(fusion),
+                                     &elemental_emitter);
         TF_RETURN_IF_ERROR(root->Accept(&fused_emitter));
 
         // For multi-output fusion CHECK the constraints and feed all the
@@ -586,10 +624,15 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
           }
         }
         const Shape& input_shape = first_reduce->operand(0)->shape();
-        return EmitReductionToVector(first_reduce, input_shape, input_gens,
-                                     init_value_gens,
-                                     first_reduce->dimensions(), reducers,
-                                     reduce_output_shapes, extra_output_gens);
+        TF_CHECK_OK(EmitReductionToVector(
+            kernel_thunk.get(), first_reduce, input_shape, input_gens,
+            init_value_gens, first_reduce->dimensions(), reducers,
+            reduce_output_shapes, extra_output_gens));
+        thunks.push_back(std::move(kernel_thunk));
+        std::unique_ptr<SequentialThunk> sequential_thunk =
+            absl::make_unique<SequentialThunk>(std::move(thunks), fusion);
+        AddThunkToThunkSequence(std::move(sequential_thunk));
+        return Status::OK();
       }
       default:
         LOG(FATAL) << "Bad opcode for input fusion: "
@@ -603,12 +646,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     // touching the un-updated elements.
 
     // Set up kernel thunk and fused ir emitter.
-    thunk_sequence_->emplace_back(
-        BuildKernelThunk(fusion, /*implements_whole_instruction=*/true));
-    std::vector<IrArray> operand_arrays;
-    for (HloInstruction* operand : fusion->operands()) {
-      operand_arrays.push_back(GetIrArray(*operand, *fusion));
-    }
+    std::unique_ptr<KernelThunk> fusion_thunk =
+        BuildKernelThunk(fusion, /*implements_whole_instruction=*/true);
     GpuElementalIrEmitter elemental_emitter(hlo_module_config_,
                                             ir_emitter_context_->llvm_module(),
                                             &b_, GetNestedComputer());
@@ -622,18 +661,17 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
 
     LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
         update_shape, ir_emitter_context_->device_description());
-    CHECK(Thunk::Kind::kKernel == LastThunk()->kind());
-    UpdateLaunchDimensions(launch_dimensions,
-                           static_cast<KernelThunk*>(LastThunk()),
+    UpdateLaunchDimensions(launch_dimensions, fusion_thunk.get(),
                            ir_emitter_context_->llvm_module());
+    AddThunkToThunkSequence(std::move(fusion_thunk));
 
     return llvm_ir::EmitParallelFusedDynamicUpdateSliceInPlace(
-        fusion, operand_arrays, output_array, &elemental_emitter,
-        launch_dimensions, &b_);
+        fusion, GetGeneratorForOperandIrArrays(fusion), output_array,
+        &elemental_emitter, launch_dimensions, &b_);
   }
 
   if (ImplementedAsGemm(*fusion)) {
-    thunk_sequence_->emplace_back(BuildGemmThunk(fusion));
+    AddThunkToThunkSequence(BuildGemmThunk(fusion));
     return Status::OK();
   }
 
@@ -643,10 +681,6 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     return Status::OK();
   }
 
-  int unroll_factor = ComputeMaxUnrollFactor(fusion);
-
-  thunk_sequence_->emplace_back(BuildKernelThunk(
-      fusion, /*implements_whole_instruction=*/true, unroll_factor));
   return IrEmitter::HandleFusion(fusion);
 }
 
@@ -657,7 +691,7 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
   if (LayoutUtil::Equal(copy->operand(0)->shape().layout(),
                         copy->shape().layout()) &&
       buffer_assignment.GetUniqueTopLevelSlice(copy->operand(0)).ok()) {
-    thunk_sequence_->emplace_back(BuildDeviceToDeviceCopyThunk(copy));
+    AddThunkToThunkSequence(BuildDeviceToDeviceCopyThunk(copy));
     return Status::OK();
   }
   if (CheckAndEmitHloWithTile021(copy)) {
@@ -685,7 +719,7 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce(
 }
 
 Status IrEmitterUnnested::EmitReductionToScalar(
-    HloInstruction* reduce, const Shape& input_shape,
+    KernelThunk* kernel_thunk, HloInstruction* reduce, const Shape& input_shape,
     absl::Span<const llvm_ir::ElementGenerator> input_gens,
     absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
     absl::Span<HloComputation* const> reducers,
@@ -888,18 +922,16 @@ Status IrEmitterUnnested::EmitReductionToScalar(
   };
 
   // Emit a parallel loop that iterates through all input tiles, one per thread.
-  CHECK(LastThunk()->kind() == Thunk::Kind::kSequential);
-  UpdateLaunchDimensions(
-      launch_dimensions,
-      static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
-      ir_emitter_context_->llvm_module());
+  UpdateLaunchDimensions(launch_dimensions, kernel_thunk,
+                         ir_emitter_context_->llvm_module());
   return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
                              launch_dimensions, &b_)
       .EmitLoop(IrName(reduce), index_ty);
 }
 
 Status IrEmitterUnnested::EmitColumnReduction(
-    int64 height, int64 width, HloInstruction* reduce, const Shape& input_shape,
+    KernelThunk* kernel_thunk, int64 height, int64 width,
+    HloInstruction* reduce, const Shape& input_shape,
     absl::Span<const llvm_ir::ElementGenerator> input_gens,
     absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
     absl::Span<HloComputation* const> reducers,
@@ -1151,17 +1183,14 @@ Status IrEmitterUnnested::EmitColumnReduction(
   };
 
   // Emit a parallel loop that iterate through all input tiles.
-  CHECK(LastThunk()->kind() == Thunk::Kind::kSequential);
-  UpdateLaunchDimensions(
-      launch_dimensions,
-      static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
-      ir_emitter_context_->llvm_module());
+  UpdateLaunchDimensions(launch_dimensions, kernel_thunk,
+                         ir_emitter_context_->llvm_module());
   return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
                              launch_dimensions, &b_)
       .EmitLoop(IrName(reduce), index_ty);
 }
 
-static std::pair<int64, int64> ComputeTilingSchemeForReduction(
+static std::pair<int64, int64> ComputeKernelMappingSchemeForReduction(
     int64 depth, int64 width, int64 kWarpSize) {
   constexpr int64 kTargetNumElementsPerThread = 64;
   int64 x_tile_size = kTargetNumElementsPerThread;
@@ -1186,8 +1215,8 @@ static std::pair<int64, int64> ComputeTilingSchemeForReduction(
 }
 
 Status IrEmitterUnnested::EmitRowReduction(
-    int64 depth, int64 height, int64 width, HloInstruction* reduce,
-    const Shape& input_shape,
+    KernelThunk* kernel_thunk, int64 depth, int64 height, int64 width,
+    HloInstruction* reduce, const Shape& input_shape,
     absl::Span<const llvm_ir::ElementGenerator> input_gens,
     absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
     absl::Span<HloComputation* const> reducers,
@@ -1295,7 +1324,7 @@ Status IrEmitterUnnested::EmitRowReduction(
   int64 x_tile_size;
   int64 z_tile_size;
   std::tie(x_tile_size, z_tile_size) =
-      ComputeTilingSchemeForReduction(depth, width, kWarpSize);
+      ComputeKernelMappingSchemeForReduction(depth, width, kWarpSize);
 
   // Round the width in tiles up to the nearest multiple of kWarpSize, so that
   // the use of shfl_down is valid.
@@ -1522,11 +1551,8 @@ Status IrEmitterUnnested::EmitRowReduction(
   };
 
   // Emit a parallel loop that iterates through every input tiles.
-  CHECK(LastThunk()->kind() == Thunk::Kind::kSequential);
-  UpdateLaunchDimensions(
-      launch_dimensions,
-      static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
-      ir_emitter_context_->llvm_module());
+  UpdateLaunchDimensions(launch_dimensions, kernel_thunk,
+                         ir_emitter_context_->llvm_module());
   return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
                              launch_dimensions, &b_)
       .EmitLoop(IrName(reduce), index_ty);
@@ -1539,7 +1565,7 @@ Status IrEmitterUnnested::EmitRowReduction(
 //               and, if `reduce` is fused, the fused subgraph is pure
 //               elementwise.
 Status IrEmitterUnnested::EmitReductionToVector(
-    HloInstruction* reduce, const Shape& input_shape,
+    KernelThunk* kernel_thunk, HloInstruction* reduce, const Shape& input_shape,
     absl::Span<const llvm_ir::ElementGenerator> input_gens,
     absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
     absl::Span<const int64> dimensions_to_reduce,
@@ -1580,8 +1606,8 @@ Status IrEmitterUnnested::EmitReductionToVector(
   // the dimensions to keep are contiguous, by prerequisite of
   // `EmitReductionToVector`, we only need to check whether the minormost
   // dimension of the input is to keep.
-  if (input_dims_to_keep.empty()) {
-    return EmitReductionToScalar(reduce, input_shape, input_gens,
+  if (ShapeUtil::IsEffectiveScalar(reduce->shape())) {
+    return EmitReductionToScalar(kernel_thunk, reduce, input_shape, input_gens,
                                  init_value_gens, reducers,
                                  reduce_output_shapes, extra_output_gens);
   } else if (input_dims_to_keep.front() ==
@@ -1600,9 +1626,9 @@ Status IrEmitterUnnested::EmitReductionToVector(
         height *= input_shape.dimensions(input_dim);
       }
     }
-    return EmitColumnReduction(height, width, reduce, input_shape, input_gens,
-                               init_value_gens, reducers, reduce_output_shapes,
-                               extra_output_gens);
+    return EmitColumnReduction(kernel_thunk, height, width, reduce, input_shape,
+                               input_gens, init_value_gens, reducers,
+                               reduce_output_shapes, extra_output_gens);
   } else {
     // Reduce the row dimension of a matrix or reduce dimension 0 and 2 in a
     // 3D tensor. The size of dimension 1 (the height) is the size of the
@@ -1627,8 +1653,8 @@ Status IrEmitterUnnested::EmitReductionToVector(
       }
     }
     const int64 height = ShapeUtil::ElementsIn(reduce->shape());
-    return EmitRowReduction(depth, height, width, reduce, input_shape,
-                            input_gens, init_value_gens, reducers,
+    return EmitRowReduction(kernel_thunk, depth, height, width, reduce,
+                            input_shape, input_gens, init_value_gens, reducers,
                             reduce_output_shapes, extra_output_gens);
   }
 }
@@ -1650,28 +1676,40 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
                         BuildInitializerThunk(reduce));
     std::vector<std::unique_ptr<Thunk>> thunks;
     thunks.push_back(std::move(initializer_thunk));
-    thunks.push_back(
-        BuildKernelThunk(reduce, /*implements_whole_instruction=*/false));
-    thunk_sequence_->emplace_back(
-        absl::make_unique<SequentialThunk>(std::move(thunks), reduce));
+    std::unique_ptr<KernelThunk> kernel_thunk =
+        BuildKernelThunk(reduce, /*implements_whole_instruction=*/false);
 
-    return EmitReductionToVector(
-        reduce, input->shape(), {[&](const IrArray::Index& index) {
+    TF_CHECK_OK(EmitReductionToVector(
+        kernel_thunk.get(), reduce, input->shape(),
+        {[&](const IrArray::Index& index) {
           return GetIrArray(*input, *reduce).EmitReadArrayElement(index, &b_);
         }},
         {[&](const IrArray::Index& index) {
           return GetIrArray(*init_value, *reduce)
               .EmitReadArrayElement(index, &b_);
         }},
-        dimensions_to_reduce, {reducer}, {{}}, {});
+        dimensions_to_reduce, {reducer}, {{}}, {}));
+
+    thunks.push_back(std::move(kernel_thunk));
+
+    std::unique_ptr<SequentialThunk> sequential_thunk =
+        absl::make_unique<SequentialThunk>(std::move(thunks), reduce);
+    AddThunkToThunkSequence(std::move(sequential_thunk));
+    return Status::OK();
   }
 
-  thunk_sequence_->emplace_back(
-      BuildKernelThunk(reduce, /*implements_whole_instruction=*/true));
   return IrEmitter::HandleReduce(reduce);
 }
 
 Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
+  // For the root node of the entry computation we can elide writing the tuple
+  // buffer. We can always figure out the contents of the tuples from buffer
+  // assignment because we insert copies to ensure non-ambiguous output buffers.
+  // GpuExecutable never reads the tuple buffer.
+  if (tuple ==
+      tuple->parent()->parent()->entry_computation()->root_instruction()) {
+    return Status::OK();
+  }
   bool all_tuple_elements_have_buffer =
       absl::c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) {
         return ir_emitter_context_->buffer_assignment()
@@ -1695,11 +1733,11 @@ Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
     for (const HloInstruction* tuple_element : tuple->operands()) {
       tuple_element_buffers.push_back(GetAllocationSlice(*tuple_element));
     }
-    thunk_sequence_->emplace_back(absl::make_unique<TupleThunk>(
+    AddThunkToThunkSequence(absl::make_unique<TupleThunk>(
         tuple_element_buffers, GetAllocationSlice(*tuple), tuple));
     return Status::OK();
   }
-  thunk_sequence_->emplace_back(
+  AddThunkToThunkSequence(
       BuildKernelThunk(tuple, /*implements_whole_instruction=*/true));
   return IrEmitter::HandleTuple(tuple);
 }
@@ -1727,8 +1765,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   thunks.push_back(std::move(initializer_thunk));
   thunks.push_back(BuildKernelThunk(select_and_scatter,
                                     /*implements_whole_instruction=*/false));
-  thunk_sequence_->emplace_back(absl::make_unique<SequentialThunk>(
-      std::move(thunks), select_and_scatter));
+  std::unique_ptr<SequentialThunk> select_and_scatter_thunk =
+      absl::make_unique<SequentialThunk>(std::move(thunks), select_and_scatter);
 
   // TODO(b/31410564): Implement dilation rate for select-and-scatter.
   if (window_util::HasDilation(window)) {
@@ -1894,8 +1932,9 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
       // consisting of two thunks, an initializer KernelThunk that initializes
       // the output and another KernelThunk that accumulates the scattered
       // elements.
-      static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
+      select_and_scatter_thunk->thunks().back().get(),
       ir_emitter_context_->llvm_module());
+  AddThunkToThunkSequence(std::move(select_and_scatter_thunk));
   return ParallelLoopEmitter(loop_body_emitter, source->shape(),
                              launch_dimensions, &b_)
       .EmitLoop(IrName(select_and_scatter), index_type);
@@ -1909,10 +1948,10 @@ Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) {
   // Build ForThunk for conformant while loops, otherwise build WhileThunk.
   // TODO(b/112163966): Move trip count computation earlier in the pipeline.
   if (auto loop_trip_count = ComputeWhileLoopTripCount(xla_while)) {
-    thunk_sequence_->emplace_back(BuildForThunk(xla_while, *loop_trip_count));
+    AddThunkToThunkSequence(BuildForThunk(xla_while, *loop_trip_count));
     VLOG(3) << "Built ForThunk for while: " << xla_while->name();
   } else {
-    thunk_sequence_->emplace_back(BuildWhileThunk(xla_while));
+    AddThunkToThunkSequence(BuildWhileThunk(xla_while));
     VLOG(3) << "Built WhileThunk for while: " << xla_while->name();
   }
   return Status::OK();
@@ -1923,79 +1962,257 @@ Status IrEmitterUnnested::HandleRng(HloInstruction* rng) {
   //
   // Unroll the kernel so that the duplicated computation that calculates the
   // 128 bit sample can be optimized away by LLVM.
-  thunk_sequence_->emplace_back(
-      BuildKernelThunk(rng, /*implements_whole_instruction=*/false,
-                       ComputeMaxUnrollFactor(rng)));
+  std::unique_ptr<KernelThunk> rng_thunk = BuildKernelThunk(
+      rng, /*implements_whole_instruction=*/false, ComputeMaxUnrollFactor(rng));
   ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
   for (const HloInstruction* operand : rng->operands()) {
     operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
       return GetIrArray(*operand, *rng).EmitReadArrayElement(index, &b_);
     };
   }
-  TF_RETURN_IF_ERROR(EmitTargetElementLoop(
-      *rng, GpuElementalIrEmitter(hlo_module_config_, module_, &b_,
-                                  GetNestedComputer())
-                .MakeElementGenerator(rng, operand_to_generator)));
-  std::unique_ptr<Thunk> rng_thunk = std::move(thunk_sequence_->back());
-  thunk_sequence_->pop_back();
+  TF_RETURN_IF_ERROR(EmitTargetElementLoopInThunk(
+      *rng,
+      GpuElementalIrEmitter(hlo_module_config_, module_, &b_,
+                            GetNestedComputer())
+          .MakeElementGenerator(rng, operand_to_generator),
+      rng_thunk.get()));
 
   // Emit a kernel to increment the global state for Philox RNG algorithm.
-  thunk_sequence_->emplace_back(
-      BuildKernelThunk(rng, /*implements_whole_instruction=*/false));
-  llvm_ir::IncrementVariableForPhiloxRngState(1, module_, &b_);
   std::unique_ptr<Thunk> increment_seed_thunk =
-      std::move(thunk_sequence_->back());
-  thunk_sequence_->pop_back();
+      BuildKernelThunk(rng, /*implements_whole_instruction=*/false);
+  llvm_ir::IncrementVariableForPhiloxRngState(1, module_, &b_);
 
   // Build the SequentialThunk for the RNG hlo.
   std::vector<std::unique_ptr<Thunk>> thunks;
   thunks.reserve(2);
   thunks.push_back(std::move(rng_thunk));
   thunks.push_back(std::move(increment_seed_thunk));
-  thunk_sequence_->emplace_back(
+  AddThunkToThunkSequence(
       absl::make_unique<SequentialThunk>(std::move(thunks), rng));
 
   return Status::OK();
 }
 
+Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
+  const HloInstruction* operand = scatter->operand(0);
+  const HloInstruction* scatter_indices = scatter->operand(1);
+  const HloInstruction* updates = scatter->operand(2);
+
+  std::vector<std::unique_ptr<Thunk>> thunks;
+
+  // Copy the operand into the output if it's not the same buffer already.
+  auto operand_buffer = GetAllocationSlice(*operand);
+  auto destination_buffer = GetAllocationSlice(*scatter);
+  if (operand_buffer != destination_buffer) {
+    thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
+        /*source_address=*/operand_buffer,
+        /*destination_buffer=*/destination_buffer,
+        /*mem_size=*/ShapeUtil::ByteSizeOf(operand->shape()), scatter));
+  }
+
+  thunks.push_back(
+      BuildKernelThunk(scatter,
+                       /*implements_whole_instruction=*/thunks.empty()));
+
+  TF_RETURN_IF_ERROR(
+      EmitScatter(thunks.back().get(), scatter,
+                  /*scatter_indices_gen=*/
+                  [=](const IrArray::Index& index) {
+                    return GetIrArray(*scatter_indices, *scatter)
+                        .EmitReadArrayElement(index, &b_, "scatter_index");
+                  },
+                  /*updates_gen=*/
+                  [=](const IrArray::Index& index) {
+                    return GetIrArray(*updates, *scatter)
+                        .EmitReadArrayElement(index, &b_, "update");
+                  }));
+
+  // Elide the sequential thunk if there's no copy.
+  if (thunks.size() == 1) {
+    AddThunkToThunkSequence(std::move(thunks[0]));
+  } else {
+    AddThunkToThunkSequence(
+        absl::make_unique<SequentialThunk>(std::move(thunks), scatter));
+  }
+
+  return Status::OK();
+}
+
+Status IrEmitterUnnested::EmitScatter(
+    Thunk* thunk, HloInstruction* scatter,
+    const llvm_ir::ElementGenerator& scatter_indices_gen,
+    const llvm_ir::ElementGenerator& updates_gen) {
+  const HloInstruction* operand = scatter->operand(0);
+  const HloInstruction* scatter_indices = scatter->operand(1);
+  const HloInstruction* updates = scatter->operand(2);
+  const ScatterDimensionNumbers& dim_numbers =
+      scatter->scatter_dimension_numbers();
+  CHECK(ShapeUtil::Equal(scatter->shape(), operand->shape()));
+
+  auto loop_body_emitter = [&](const IrArray::Index& index) -> Status {
+    std::vector<llvm::Value*> raw_window_multidim;
+    std::vector<llvm::Value*> input_scatter_multidim;
+    std::vector<int64> raw_window_bounds;
+
+    // Partition the index into window indices and scatter indices.
+    for (int64 i = 0, e = index.size(); i != e; ++i) {
+      // For window indices also remember the window size, this comes in handy
+      // later.
+      if (absl::c_binary_search(dim_numbers.update_window_dims(), i)) {
+        raw_window_multidim.push_back(index[i]);
+        raw_window_bounds.push_back(updates->shape().dimensions(i));
+      } else {
+        input_scatter_multidim.push_back(index[i]);
+      }
+    }
+    DCHECK_EQ(raw_window_multidim.size(),
+              dim_numbers.update_window_dims_size());
+
+    // Apply inserted_window_dims to the window dimensions.
+    int64 raw_window_multidim_idx = 0;
+    std::vector<llvm::Value*> input_window_multidim;
+    std::vector<int64> input_window_bounds;
+    for (int64 i = 0, e = ShapeUtil::Rank(operand->shape()); i != e; ++i) {
+      if (absl::c_binary_search(dim_numbers.inserted_window_dims(), i)) {
+        input_window_bounds.push_back(1);  // Trivial dimension.
+        input_window_multidim.push_back(index.GetConstantWithIndexType(0));
+      } else {
+        input_window_bounds.push_back(
+            raw_window_bounds[raw_window_multidim_idx]);
+        input_window_multidim.push_back(
+            raw_window_multidim[raw_window_multidim_idx]);
+        ++raw_window_multidim_idx;
+      }
+    }
+    DCHECK_EQ(input_window_multidim.size(), ShapeUtil::Rank(operand->shape()));
+
+    // Insert a 1 dimension at the end if index_vector_dim requests one.
+    Shape scatter_indices_shape = scatter_indices->shape();
+    if (dim_numbers.index_vector_dim() ==
+        ShapeUtil::Rank(scatter_indices_shape)) {
+      scatter_indices_shape.add_dimensions(1);
+      scatter_indices_shape.mutable_layout()->add_minor_to_major(
+          dim_numbers.index_vector_dim());
+    }
+
+    // Now load the indices corresponding to the current window from
+    // scatter_indices.
+    llvm_ir::IrArray::Index raw_scatter_index_index(input_scatter_multidim,
+                                                    index.GetType());
+    raw_scatter_index_index.InsertAt(dim_numbers.index_vector_dim(), nullptr);
+    llvm::Value* is_in_bounds = b_.getTrue();
+    for (int64 i = 0, e = dim_numbers.scatter_dims_to_operand_dims_size();
+         i != e; ++i) {
+      // Our index is stored along index_vector_dim, insert that into the lookup
+      // index into scatter_indices.
+      raw_scatter_index_index[dim_numbers.index_vector_dim()] =
+          raw_scatter_index_index.GetConstantWithIndexType(i);
+
+      int64 operand_dim = dim_numbers.scatter_dims_to_operand_dims(i);
+      TF_ASSIGN_OR_RETURN(
+          llvm::Value* const loaded_scatter_index,
+          scatter_indices_gen(raw_scatter_index_index.SourceIndexOfReshape(
+              scatter_indices_shape, scatter_indices->shape(), &b_)));
+      // And add the index to our window index. This yields the output index.
+      llvm::Value* casted_scatter_index =
+          IntCast(loaded_scatter_index, index.GetType(),
+                  /*isSigned=*/true);
+      llvm::Value* dim_offset =
+          Add(input_window_multidim[operand_dim], casted_scatter_index);
+      input_window_multidim[operand_dim] = dim_offset;
+
+      // Also do the bounds check now.
+      int64 max_index = operand->shape().dimensions(operand_dim) -
+                        input_window_bounds[operand_dim] + 1;
+      // is_in_bounds = index >= 0 && index < dim_size-window_size+1
+      //   --> index u< dim_size-window_size+1
+      is_in_bounds =
+          And(is_in_bounds, ICmpULT(casted_scatter_index,
+                                    index.GetConstantWithIndexType(max_index)));
+    }
+
+    llvm_ir::LlvmIfData if_window_in_bounds_data = llvm_ir::EmitIfThenElse(
+        is_in_bounds, "scatter.in_bounds", &b_, /*emit_else=*/false);
+    llvm_ir::SetToFirstInsertPoint(if_window_in_bounds_data.true_block, &b_);
+    // All done, now just read from the calculated input from the window, and do
+    // an atomic store to the calculated location in the output.
+    llvm_ir::IrArray::Index input_window_index(input_window_multidim,
+                                               index.GetType());
+    HloInstruction* output_hlo =
+        scatter->IsFused() ? scatter->parent()->FusionInstruction() : scatter;
+    llvm::Value* output_address =
+        GetIrArray(*output_hlo, *output_hlo)
+            .EmitArrayElementAddress(input_window_index, &b_);
+    llvm::Value* input_address = Alloca(llvm_ir::PrimitiveTypeToIrType(
+        updates->shape().element_type(), module_));
+    TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, updates_gen(index));
+    Store(input_ir_value, input_address);
+    return EmitAtomicOperationForNestedComputation(
+        *scatter->to_apply(), output_address, input_address);
+  };
+
+  // Launch a kernel that reads every element in the updates tensor. We could
+  // also do one kernel per window instead if bounds checks turn out to be a
+  // bottleneck.
+  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
+      updates->shape(), ir_emitter_context_->device_description());
+  UpdateLaunchDimensions(launch_dimensions, thunk,
+                         ir_emitter_context_->llvm_module());
+
+  return ParallelLoopEmitter(loop_body_emitter, updates->shape(),
+                             launch_dimensions, &b_)
+      .EmitLoop(IrName(scatter),
+                GetIndexTypeForKernel(scatter, launch_dimensions.launch_bound(),
+                                      &b_));
+}
+
 Status IrEmitterUnnested::HandleSelect(HloInstruction* select) {
-  thunk_sequence_->push_back(
-      BuildKernelThunk(select, /*implements_whole_instruction=*/true));
   return IrEmitter::HandleSelect(select);
 }
 
 Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   std::vector<std::unique_ptr<Thunk>> thunks;
-  auto keys = sort->operand(0);
-  auto values = sort->operand_count() > 1 ? sort->operand(1) : nullptr;
-  ShapeIndex keys_shape_index({});
-  ShapeIndex values_shape_index({});
-  if (values != nullptr) {
-    keys_shape_index = ShapeIndex({0});
-    values_shape_index = ShapeIndex({1});
-  }
-  auto keys_destination = GetAllocationSlice(*sort, keys_shape_index);
-  auto values_destination = GetAllocationSlice(*sort, values_shape_index);
-
-  if (keys_destination != GetAllocationSlice(*keys)) {
-    thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
-        /*source_address=*/GetAllocationSlice(*keys),
-        /*destination_buffer=*/keys_destination,
-        /*mem_size=*/ShapeUtil::ByteSizeOf(keys->shape()), nullptr));
-  }
-  if (values != nullptr && values_destination != GetAllocationSlice(*values)) {
-    // TODO(b/26783907): Figure out why we never seem to share buffers for
-    // key/value sort.
-    thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
-        /*source_address=*/GetAllocationSlice(*values),
-        /*destination_buffer=*/values_destination,
-        /*mem_size=*/ShapeUtil::ByteSizeOf(values->shape()), nullptr));
+  Shape keys_shape = sort->operand(0)->shape();
+  int64 dimension_to_sort = sort->dimensions(0);
+  // In case there is a 'values' parameter that is a iota, we take note and use
+  // it later to ensure a stable sort. Otherwise, we don't guarantee a stable
+  // sort.
+  int64 iota_values_parameter_index = -1;
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    if (i > 0 && sort->operand(i)->opcode() == HloOpcode::kIota &&
+        ShapeUtil::ElementIsIntegral(sort->operand(i)->shape()) &&
+        Cast<HloIotaInstruction>(sort->operand(i))->iota_dimension() ==
+            dimension_to_sort) {
+      iota_values_parameter_index = i;
+    }
+    ShapeIndex shape_index =
+        sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
+    // We assume that the layout of all involved operands and outputs is the
+    // same.
+    TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(keys_shape,
+                                                  sort->operand(i)->shape()));
+    TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(
+        keys_shape, ShapeUtil::GetSubshape(sort->shape(), shape_index)));
+
+    // If possible, we share buffers. If that is not possible, we need to copy
+    // the values, because the emitter does the sorting in-place.
+    auto destination_buffer = GetAllocationSlice(*sort, shape_index);
+    auto source_address = GetAllocationSlice(*sort->operand(i));
+    if (destination_buffer != source_address) {
+      // TODO(b/26783907): Figure out why we never seem to share buffers for
+      // key/value sort.
+      thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
+          /*source_address=*/source_address,
+          /*destination_buffer=*/destination_buffer,
+          /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(i)->shape()),
+          nullptr));
+    }
   }
 
-  int64 dimension_to_sort = sort->dimensions(0);
-  int64 dimension_to_sort_bound = keys->shape().dimensions(dimension_to_sort);
+  uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
   int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound);
-  auto index_type = b_.getInt64Ty();
+  CHECK_GE(1ULL << num_stages, dimension_to_sort_bound);
+  CHECK_LT(1ULL << (num_stages - 1), dimension_to_sort_bound);
 
   // Naive C++ code for the outer loops:
   //
@@ -2009,41 +2226,128 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   //   }
   // }
   //
-  // This follows the algorithm described on Wikipedia:
-  // https://en.wikipedia.org/wiki/Bitonic_sorter
-
+  // This follows the alternative representation of the algorithm described on
+  // Wikipedia: https://en.wikipedia.org/wiki/Bitonic_sorter
+  //
+  // Each mask specifies how to derive from one position in the array the
+  // position with which it should be compared (we calculate the xor of the
+  // position with the mask).
+  // As an optimization, we can move the 'mask' loop to inside the
+  // sorting/comparison loop if the comparisons happen within a small block of
+  // the array. To make this work, we collect all consecutive masks that are
+  // smaller than our chosen power of 2 tile size, and pass them to SortInPlace.
+  // Each thread then processes one tile of data.
+
+  const uint64 kTileSize = std::min(2048ULL, 1ULL << num_stages);
+
+  // If we cannot combine several xor masks together, we don't use tiling, so we
+  // calculate the standard launch dimensions for the shape. However we only
+  // need to iterate through ~half of the dimension to sort (rounded up to the
+  // next highest power of 2), because each iteration compares one pair of
+  // elements.
+  Shape standard_iteration_shape = keys_shape;
+  uint64 standard_num_iterations_in_sort_dim = 1ULL << (num_stages - 1);
+  standard_iteration_shape.set_dimensions(dimension_to_sort,
+                                          standard_num_iterations_in_sort_dim);
+  LaunchDimensions standard_launch_dimensions = CalculateLaunchDimensions(
+      standard_iteration_shape, ir_emitter_context_->device_description());
+
+  // Calculate the launch dimensions for the case where we use tiling. We split
+  // the dimension that should be sorted into tiles of size 'kTileSize'. This
+  // means we first need to round 'dimension_to_sort_bound' up to be a multiple
+  // of the tile size.
+  int64 rounded_bound = RoundUpToNearest(dimension_to_sort_bound, kTileSize);
+  Shape iteration_shape = keys_shape;
+
+  // We iterate through the element pairs that should be compared.
+  uint64 num_iterations_in_sort_dim = rounded_bound / 2;
+  iteration_shape.set_dimensions(dimension_to_sort, num_iterations_in_sort_dim);
+  uint64 num_iterations = ShapeUtil::ElementsIn(iteration_shape);
+
+  // For correctness reasons we need exactly 'kTileSize' / 2 many threads per
+  // block. Each thread is responsible for copying exactly two adjacent elements
+  // into shared memory, and then does a comparison of two possibly different
+  // elements taken from shared memory.
+  const uint64 kThreadsPerBlock = kTileSize / 2;
+
+  // Check whether we should use any tiling. We might not be able to use it if
+  // we have not enough threads, or not enough shared memory. Also it does not
+  // give a speedup if the tile size is < 128.
+  int64 total_shared_memory_needed = 0;
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    total_shared_memory_needed +=
+        kTileSize * ShapeUtil::ByteSizeOfPrimitiveType(
+                        sort->operand(i)->shape().element_type());
+  }
+  bool no_tiling =
+      kTileSize < 128 ||
+      kThreadsPerBlock >
+          ir_emitter_context_->device_description().threads_per_block_limit() ||
+      total_shared_memory_needed >
+          ir_emitter_context_->device_description().shared_memory_per_block();
+
+  uint64 num_blocks = CeilOfRatio(num_iterations, kThreadsPerBlock);
+  LaunchDimensions tiled_launch_dimensions(num_blocks, kThreadsPerBlock);
+
+  auto emit_kernel = [&](absl::Span<const int64> xor_masks) {
+    thunks.push_back(
+        BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
+    LaunchDimensions launch_dimensions = xor_masks.size() > 1
+                                             ? tiled_launch_dimensions
+                                             : standard_launch_dimensions;
+    UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
+                           ir_emitter_context_->llvm_module());
+    IrArray keys_array;
+    std::vector<IrArray> values_arrays;
+    values_arrays.reserve(sort->operand_count() - 1);
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      ShapeIndex shape_index =
+          sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
+      if (i == 0) {
+        keys_array = GetIrArray(*sort, *sort, shape_index);
+      } else {
+        values_arrays.push_back(GetIrArray(*sort, *sort, shape_index));
+      }
+    }
+    return llvm_ir::EmitSortInPlace(
+        dimension_to_sort, keys_array, values_arrays,
+        iota_values_parameter_index, IrName(sort), xor_masks, &b_,
+        launch_dimensions,
+        xor_masks.size() > 1 ? num_iterations_in_sort_dim
+                             : standard_num_iterations_in_sort_dim,
+        kTileSize);
+  };
+  std::vector<int64> xor_masks;
   for (int64 stage = 0; stage < num_stages; ++stage) {
     for (int64 mask = stage; mask >= 0; --mask) {
-      thunks.push_back(
-          BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
-      LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-          keys->shape(), ir_emitter_context_->device_description());
-      UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
-                             ir_emitter_context_->llvm_module());
-
-      llvm::Value* xor_mask;
+      int64 xor_mask;
       if (mask == stage) {
-        xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1);
+        xor_mask = (1LL << (stage + 1)) - 1;
       } else {
-        xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask);
+        xor_mask = 1LL << mask;
+      }
+      if (xor_mask >= kTileSize || no_tiling) {
+        if (!xor_masks.empty()) {
+          TF_RETURN_IF_ERROR(emit_kernel(xor_masks));
+          xor_masks.clear();
+        }
+        TF_RETURN_IF_ERROR(emit_kernel({xor_mask}));
+      } else {
+        xor_masks.push_back(xor_mask);
       }
-
-      TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace(
-          dimension_to_sort, GetIrArray(*sort, *sort, keys_shape_index),
-          values != nullptr ? absl::make_optional<IrArray>(
-                                  GetIrArray(*sort, *sort, values_shape_index))
-                            : absl::nullopt,
-          IrName(sort), xor_mask, &b_, &launch_dimensions));
     }
   }
+  if (!xor_masks.empty()) {
+    TF_RETURN_IF_ERROR(emit_kernel(xor_masks));
+  }
 
-  thunk_sequence_->emplace_back(
+  AddThunkToThunkSequence(
       absl::make_unique<SequentialThunk>(std::move(thunks), sort));
   return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) {
-  thunk_sequence_->push_back(
+  AddThunkToThunkSequence(
       BuildKernelThunk(tuple_select, /*implements_whole_instruction=*/true));
   return IrEmitter::HandleTupleSelect(tuple_select);
 }
@@ -2065,7 +2369,7 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) {
   if (crs->operand_count() == 1) {
     CHECK(ShapeUtil::IsArray(crs->operand(0)->shape()))
         << "Operands to cross-replica-sum must be arrays: " << crs->ToString();
-    thunk_sequence_->push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
+    AddThunkToThunkSequence(absl::make_unique<DeviceToDeviceCopyThunk>(
         /*source_address=*/GetAllocationSlice(*crs->operand(0)),
         /*destination_buffer=*/GetAllocationSlice(*crs),
         /*mem_size=*/ShapeUtil::ByteSizeOf(crs->shape()), crs));
@@ -2089,22 +2393,22 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) {
   // Output a tuple of the buffers above.
   thunks.push_back(absl::make_unique<TupleThunk>(
       tuple_element_buffers, GetAllocationSlice(*crs), nullptr));
-  thunk_sequence_->push_back(
+  AddThunkToThunkSequence(
       absl::make_unique<SequentialThunk>(std::move(thunks), crs));
   return Status::OK();
 }
 
-Status IrEmitterUnnested::HandleAfterAll(HloInstruction* gen_token) {
+Status IrEmitterUnnested::HandleAfterAll(HloInstruction* after_all) {
   return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleInfeed(HloInstruction* infeed) {
-  thunk_sequence_->emplace_back(BuildInfeedThunk(infeed));
+  AddThunkToThunkSequence(BuildInfeedThunk(infeed));
   return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleOutfeed(HloInstruction* outfeed) {
-  thunk_sequence_->emplace_back(BuildOutfeedThunk(outfeed));
+  AddThunkToThunkSequence(BuildOutfeedThunk(outfeed));
   return Status::OK();
 }
 
@@ -2413,28 +2717,43 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
         rhs->shape(),               // The shape of RHS.
         inst->shape(),              // The shape of the output.
         1.0,                        // alpha.
-        inst);
+        0.0,                        // beta.
+        inst, /*implements_whole_instruction=*/true);
   }
 
   if (inst->opcode() == HloOpcode::kFusion) {
     CHECK_EQ(inst->fusion_kind(), HloInstruction::FusionKind::kOutput);
-    const HloInstruction* mul = inst->fused_expression_root();
-    const HloInstruction* dot = mul->operand(0);
-    const HloInstruction* alpha = mul->operand(1);
-    if (dot->opcode() != HloOpcode::kDot) {
-      std::swap(dot, alpha);
-    }
-    if (alpha->opcode() == HloOpcode::kBroadcast) {
-      alpha = alpha->operand(0);
-    }
-    if (alpha->opcode() == HloOpcode::kParameter) {
-      alpha = inst->operand(alpha->parameter_number());
-    }
-    // TODO(b/74185543): Remove the following if block once we support fusion
-    // with a non-constant as well. Then we will just always use the constant
-    // on the device.
-    if (alpha->opcode() == HloOpcode::kCopy) {
-      alpha = alpha->operand(0);
+    const HloInstruction* output_fused_op = inst->fused_expression_root();
+
+    double alpha_value = 1.0;
+    const HloInstruction* bias = nullptr;
+    const HloInstruction* dot = output_fused_op->operand(0);
+    if (output_fused_op->opcode() == HloOpcode::kMultiply) {
+      const HloInstruction* alpha = output_fused_op->operand(1);
+      if (dot->opcode() != HloOpcode::kDot) {
+        std::swap(dot, alpha);
+      }
+      if (alpha->opcode() == HloOpcode::kBroadcast) {
+        alpha = alpha->operand(0);
+      }
+      if (alpha->opcode() == HloOpcode::kParameter) {
+        alpha = inst->operand(alpha->parameter_number());
+      }
+      // TODO(b/74185543): Remove the following if block once we support fusion
+      // with a non-constant as well. Then we will just always use the constant
+      // on the device.
+      if (alpha->opcode() == HloOpcode::kCopy) {
+        alpha = alpha->operand(0);
+      }
+      alpha_value = GetScalarConstantAsDouble(alpha->literal());
+    } else {
+      // Fused bias add.
+      CHECK_EQ(output_fused_op->opcode(), HloOpcode::kAdd);
+      bias = output_fused_op->operand(1);
+      if (dot->opcode() != HloOpcode::kDot) {
+        std::swap(dot, bias);
+      }
+      bias = inst->operand(bias->parameter_number());
     }
 
     DCHECK(dot->opcode() == HloOpcode::kDot);
@@ -2447,15 +2766,38 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
     const HloInstruction* rhs =
         inst->operand(rhs_parameter->parameter_number());
 
+    // The bias is passed inside the output buffer. If those buffers are shared
+    // we can just use it, otherwise copy the bias values into the output buffer
+    // first.
+    if (bias != nullptr &&
+        GetAllocationSlice(*bias) != GetAllocationSlice(*inst)) {
+      std::vector<std::unique_ptr<Thunk>> thunks;
+      thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
+          /*source_buffer=*/GetAllocationSlice(*bias),
+          /*destination_buffer=*/GetAllocationSlice(*inst),
+          /*mem_size=*/ShapeUtil::ByteSizeOf(inst->shape()), nullptr));
+      thunks.push_back(absl::make_unique<GemmThunk>(
+          GetAllocationSlice(*lhs),   // The buffer assigned to LHS.
+          GetAllocationSlice(*rhs),   // The buffer assigned to RHS.
+          GetAllocationSlice(*inst),  // The output buffer.
+          lhs->shape(),               // The shape of LHS.
+          rhs->shape(),               // The shape of RHS.
+          inst->shape(),              // The shape of the output.
+          alpha_value,                // alpha.
+          1.0,                        // beta.
+          inst, /*implements_whole_instruction=*/false));
+      return absl::make_unique<SequentialThunk>(std::move(thunks), inst);
+    }
     return absl::make_unique<GemmThunk>(
-        GetAllocationSlice(*lhs),   // The buffer assigned to LHS.
-        GetAllocationSlice(*rhs),   // The buffer assigned to RHS.
-        GetAllocationSlice(*inst),  // The output buffer.
-        lhs->shape(),               // The shape of LHS.
-        rhs->shape(),               // The shape of RHS.
-        inst->shape(),              // The shape of the output.
-        GetScalarConstantAsDouble(alpha->literal()),  // alpha.
-        inst);
+        GetAllocationSlice(*lhs),     // The buffer assigned to LHS.
+        GetAllocationSlice(*rhs),     // The buffer assigned to RHS.
+        GetAllocationSlice(*inst),    // The output buffer.
+        lhs->shape(),                 // The shape of LHS.
+        rhs->shape(),                 // The shape of RHS.
+        inst->shape(),                // The shape of the output.
+        alpha_value,                  // alpha.
+        bias != nullptr ? 1.0 : 0.0,  // beta.
+        inst, /*implements_whole_instruction=*/true);
   }
 
   LOG(FATAL) << "Cannot build a GemmThunk for " << inst->ToString();
@@ -2564,15 +2906,12 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
 
   if (fused) {
     // If init_value was fused into this reduce we have to generate it first.
-    std::vector<IrArray> parameter_arrays;
-    for (HloInstruction* operand : hlo->operands()) {
-      parameter_arrays.push_back(GetIrArray(*operand, *hlo));
-    }
     GpuElementalIrEmitter elemental_emitter(hlo_module_config_,
                                             ir_emitter_context_->llvm_module(),
                                             &b_, GetNestedComputer());
 
-    FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter);
+    FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(hlo),
+                                 &elemental_emitter);
     TF_RETURN_IF_ERROR(init_value_operand->Accept(&fused_emitter));
     TF_RETURN_IF_ERROR(
         ParallelLoopEmitter(fused_emitter.GetGenerator(init_value_operand),
@@ -2777,8 +3116,18 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
             GetIndexTypeForKernel(&hlo, launch_dimensions.launch_bound(), &b_));
   }
 
-  // For multioutput fusion, we need to emit each operand and the root.
+  // Emit the tuple pointers in one thread.  We could do this at any point in
+  // the kernel, but we do it at the beginning in the hopes of reducing register
+  // pressure, since we touch threadIdx.x and blockIdx.x at the beginning of the
+  // kernel *anyway*.
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(hlo);
+  TF_RETURN_IF_ERROR(
+      KernelSupportLibrary(&b_).If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+        llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_, module_);
+        return Status::OK();
+      }));
+
+  // For multioutput fusion, we need to emit each operand and the root.
   TF_RETURN_IF_ERROR(
       ParallelLoopEmitter(element_generator, output_arrays, launch_dimensions,
                           &b_, unroll_factor)
@@ -2787,17 +3136,25 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
                         &hlo, launch_dimensions.launch_bound(), &b_)));
 
   b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator());
-  llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_, module_);
-
   return Status::OK();
 }
 
 Status IrEmitterUnnested::EmitTargetElementLoop(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator) {
-  CHECK_EQ(Thunk::Kind::kKernel, LastThunk()->kind());
-  return EmitTargetElementLoopInThunk(hlo, element_generator,
-                                      static_cast<KernelThunk*>(LastThunk()));
+  int unroll_factor = 1;
+  // Unfused elementwise operations are usually memory bound, unroll them.
+  if (hlo.IsElementwise() || hlo.opcode() == HloOpcode::kFusion) {
+    unroll_factor = ComputeMaxUnrollFactor(&hlo);
+  }
+
+  std::unique_ptr<KernelThunk> kernel_thunk = BuildKernelThunk(
+      &hlo, /*implements_whole_instruction=*/true, unroll_factor);
+  Status emit_status =
+      EmitTargetElementLoopInThunk(hlo, element_generator, kernel_thunk.get());
+  thunk_sequence_->emplace_back(std::move(kernel_thunk));
+
+  return emit_status;
 }
 
 std::vector<IrArray> IrEmitterUnnested::ConstructIrArrayForInputs(
@@ -2810,31 +3167,6 @@ std::vector<IrArray> IrEmitterUnnested::ConstructIrArrayForInputs(
   return param_arrays;
 }
 
-int IrEmitterUnnested::ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
-    const HloInstruction& hlo, const std::vector<IrArray>& output_arrays,
-    absl::Span<const int64> reduced_output_dims,
-    std::vector<Shape>* output_reduced_shapes,
-    std::vector<IrArray>* output_in_reduced_shape_arrays) {
-  int64 num_outputs = 1;
-  if (hlo.IsMultiOutputFusion()) {
-    num_outputs = ShapeUtil::TupleElementCount(hlo.shape());
-    output_in_reduced_shape_arrays->reserve(num_outputs);
-    output_reduced_shapes->reserve(num_outputs);
-    for (int64 i = 0; i < num_outputs; ++i) {
-      output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
-          ShapeUtil::GetSubshape(hlo.shape(), {i}).element_type(),
-          reduced_output_dims));
-      output_in_reduced_shape_arrays->push_back(
-          output_arrays[i].CastToShape((*output_reduced_shapes)[i], &b_));
-    }
-  } else {
-    output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
-        hlo.shape().element_type(), reduced_output_dims));
-    output_in_reduced_shape_arrays->push_back(
-        output_arrays[0].CastToShape((*output_reduced_shapes)[0], &b_));
-  }
-  return num_outputs;
-}
 
 int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
     const HloInstruction& hlo, const std::vector<IrArray>& param_arrays,
@@ -2863,338 +3195,531 @@ int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
 
 namespace {
 
-// Reads thread_idx.x and converts it to a (y,x) coordinate, assuming that the
-// thread lives within a square tile of size tile_size (so thread blocks are of
-// size tile_size * tile_size).
-std::tuple<llvm::Value*, llvm::Value*> CalculateYXCoordinateWithinTile(
-    llvm::IRBuilder<>* builder, llvm::Value* tile_size,
-    int64 threads_per_tile) {
-  // Calculate the starting element coordinate within a tile for the current
-  // thread, (y, x) from thread_id.
-  llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
-      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, builder);
-  llvm_ir::AddRangeMetadata(0, threads_per_tile,
-                            llvm::cast<llvm::Instruction>(thread_id));
-  thread_id = builder->CreateIntCast(thread_id, tile_size->getType(),
-                                     /*isSigned=*/true, "thread.id.x");
-  auto x = builder->CreateURem(thread_id, tile_size);
-  auto y = builder->CreateUDiv(thread_id, tile_size);
-  return std::make_tuple(y, x);
-}
-
-// Reads block_idx.x, casts it to type index_ty, and adds the assumption that
-// it's in the range [0, num_blocks].
-llvm::Value* GetBlockIdx(llvm::IRBuilder<>* builder, llvm::Type* index_ty,
-                         int64 num_blocks) {
-  llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
-      llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, builder);
-  llvm_ir::AddRangeMetadata(0, num_blocks,
-                            llvm::cast<llvm::Instruction>(block_id));
-  return builder->CreateIntCast(block_id, index_ty, /*isSigned=*/true,
-                                "block.id.x");
-}
-
-// Emits code to process up to (tile_size/num_rows) elements in a tile, given
-// `emit_elem_function` is the function to emit code to process one element, `y`
-// and `x` are the coordinates for the first element to process, and `index` is
-// the index for the origin of the tile. Emits bounds check to ensure that each
-// processed element is within the boundary defined by `tile_width` and
-// `tile_height`.
+void EmitFullTile(const KernelMappingScheme* mapping_scheme,
+                  const IrArray::Index& tile_origin_index,
+                  llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x,
+                  llvm::Type* index_ty,
+                  const std::function<void(const IrArray::Index&, llvm::Value*,
+                                           llvm::Value*)>& emit_elem_function) {
+  int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
+  int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
+  int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
+  int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
+  for (int64 i = 0; i < tile_size_y; i += num_threads_y) {
+    IrArray::Index source_idx_y =
+        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, i),
+                                         KernelMappingScheme::DimY, builder);
+    llvm::Value* y_loc =
+        builder->CreateAdd(llvm::ConstantInt::get(index_ty, i), y);
+    for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
+      IrArray::Index source_idx =
+          source_idx_y.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
+                                      KernelMappingScheme::DimX, builder);
+      llvm::Value* x_loc =
+          builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
+      emit_elem_function(source_idx, y_loc, x_loc);
+    }
+  }
+}
+
+void EmitPartialTile(
+    const KernelMappingScheme* mapping_scheme,
+    const IrArray::Index& tile_origin_index, const string& loop_name,
+    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
+    llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
+    llvm::Type* index_ty,
+    const std::function<void(const IrArray::Index&, llvm::Value*,
+                             llvm::Value*)>& emit_elem_function) {
+  int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
+  int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
+  int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
+
+  for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
+    IrArray::Index source_idx =
+        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
+                                         KernelMappingScheme::DimX, builder);
+    llvm::Value* x_loc =
+        builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
+
+    ksl->IfReturnVoid(
+        "x_in_tile", builder->CreateICmpULT(x_loc, tile_width), [&] {
+          // tile_height_bound =
+          //   ceil(tile_height / num_threads_y) * num_threads_y
+          llvm::Value* ceiling_of_ratio = builder->CreateUDiv(
+              builder->CreateAdd(tile_height, llvm::ConstantInt::get(
+                                                  index_ty, num_threads_y - 1)),
+              llvm::ConstantInt::get(index_ty, num_threads_y));
+          llvm::Value* tile_height_bound = builder->CreateMul(
+              ceiling_of_ratio,
+              llvm::ConstantInt::get(index_ty, num_threads_y));
+          ksl->ForReturnVoid(
+              loop_name, /*start=*/llvm::ConstantInt::get(index_ty, 0),
+              /*end=*/tile_height_bound,
+              /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
+              [&](llvm::Value* y_indvar) {
+                llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
+                ksl->IfReturnVoid(
+                    "y_in_tile", builder->CreateICmpULT(y_loc, tile_height),
+                    [&] {
+                      emit_elem_function(
+                          source_idx.AddOffsetToDim(
+                              y_indvar, KernelMappingScheme::DimY, builder),
+                          y_loc, x_loc);
+                    });
+              });
+        });
+  }
+}
+
+// Emits code to process up to
+// (tile_size_x/num_threads_x * tile_size_y/num_threads_y) elements in a tile,
+// given `emit_elem_function` is the function to emit code to process one
+// element, `y` and `x` are the intra-tile coordinates for the first element
+// to process, and `index` is the index for the origin of the tile. Information
+// about tile_size_x/y and num_threads_x/y are stored in `mapping_scheme`. Emits
+// bounds check to ensure that each processed element is within the boundary
+// defined by `tile_width` and `tile_height`.
 void EmitTiledElementalCodeWithBoundsCheck(
-    int64 tile_size, int64 num_rows, const IrArray::Index& index,
-    const string& loop_name, KernelSupportLibrary* ksl,
-    llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x,
-    llvm::Value* tile_width, llvm::Value* tile_height,
-    const std::function<void(const IrArray::Index&, llvm::Value*)>&
-        emit_elem_function) {
+    const KernelMappingScheme* mapping_scheme,
+    const IrArray::Index& tile_origin_index, const string& loop_name,
+    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
+    llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
+    const std::function<void(const IrArray::Index&, llvm::Value*,
+                             llvm::Value*)>& emit_elem_function) {
+  int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
+  int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
   llvm::Type* index_ty = tile_width->getType();
-  // Emits a constant value with index type.
-  auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
-    return llvm::ConstantInt::get(index_ty, c);
-  };
-  // Adds `addend` to the given `dim` of `index`.
-  auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) {
-    index[dim] = builder->CreateAdd(index[dim], addend);
-    return index;
-  };
-
-  auto emit_full_tile = [&] {
-    for (int64 i = 0; i < tile_size; i += num_rows) {
-      auto source_idx = offset_dim(index, index_typed_constant(i), /*dim=*/1);
-      auto y_loc = builder->CreateAdd(index_typed_constant(i), y);
-      emit_elem_function(source_idx, y_loc);
-    }
-  };
 
-  auto emit_last_row = [&] {
-    ksl->IfReturnVoid("x_in_tile", builder->CreateICmpULT(x, tile_width), [&] {
-      // tile_height_upper_bound =
-      //   ceil(tile_height / num_rows) * num_rows
-      auto tile_height_upper_bound = builder->CreateMul(
-          builder->CreateUDiv(
-              builder->CreateAdd(tile_height,
-                                 index_typed_constant(num_rows - 1)),
-              index_typed_constant(num_rows)),
-          index_typed_constant(num_rows));
-      ksl->ForReturnVoid(
-          loop_name, /*start=*/index_typed_constant(0),
-          /*end=*/tile_height_upper_bound,
-          /*step=*/index_typed_constant(num_rows), [&](llvm::Value* y_indvar) {
-            auto y_loc = builder->CreateAdd(y_indvar, y);
-            ksl->IfReturnVoid(
-                "y_in_tile", builder->CreateICmpULT(y_loc, tile_height), [&] {
-                  emit_elem_function(offset_dim(index, y_indvar, /*dim=*/1),
-                                     y_loc);
-                });
-          });
-    });
-  };
   ksl->IfReturnVoid(
       "full_tile",
       builder->CreateAnd(
-          builder->CreateICmpEQ(index_typed_constant(tile_size), tile_width),
-          builder->CreateICmpEQ(index_typed_constant(tile_size), tile_height)),
-      emit_full_tile, emit_last_row);
+          builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_x),
+                                tile_width),
+          builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_y),
+                                tile_height)),
+      [&] {
+        EmitFullTile(mapping_scheme, tile_origin_index, builder, y, x, index_ty,
+                     emit_elem_function);
+      },
+      [&] {
+        EmitPartialTile(mapping_scheme, tile_origin_index, loop_name, ksl,
+                        builder, y, x, tile_height, tile_width, index_ty,
+                        emit_elem_function);
+      });
 }
 }  // namespace
 
-// Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
-// algorithm to improve the memory access patterns for the input parameters
-// which have a shape that is a 0-2-1 transpose of the output tensors.
-//
-// For the purpose of tiling, the output tensors have a logical shape of three
-// components 0-2-1 while the relevant input parameters have a logical shape of
-// three components 0-1-2 in the order major to minor. The x- and y- dimensions
-// of the tensors are tiled in square tiles of edge length `kTileSize`. Each
-// thread block of `kTileSize` x `kNumRows` threads transposes one tile: each
-// thread copies kTileSize/kNumRows elements from the input to a shared memory
-// tile, then the otherwise "regular hlo kernel" reads from the shared memory
-// instead of the original input.
+// Emits code to process a tensor element in a tile for the given kCopy HLO that
+// performs a 0-2-1 transpose.
 //
-// This is similar to the following CUDA algorithm in TensorFlow:
-// https://goo.gl/MStRV6.
-//
-// `kTileSize` should usually be same as warp size. We currently choose 32 for
-// `kTileSize` and 4 for `kNumRows`. The CUDA algorithm uses 8 for `kNumRows`.
-//
-// TODO(b/33320379): Here each block transposes 1 tile. It may be more efficient
-// to launch fewer blocks so each transposes many tiles.
-LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
-    HloInstruction* hlo, absl::Span<const int64> reduced_output_dims,
-    absl::Span<const int64> tiled_param_ids) {
-  // Parameters for the tiling algorithm.
-  constexpr int64 kTileSize = 32;
-  constexpr int64 kNumRows = 4;
-  constexpr int64 kThreadsPerTile = kTileSize * kNumRows;
+// index: The index for the first output element in the normalized tensor. The
+//   normalized tensor is the resulting tensor after collapsing contiguous
+//   dimensions that play the same role in the transpose.
+// y_loc: The y coordinate within a tile.
+// x_loc: The x coordinate within a tile.
+// kernel_info: Other information to support the kernel code generation.
+void IrEmitterUnnested::EmitTileElementForCopy(
+    HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+    const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+    llvm::Value* x_loc) {
+  llvm_ir::TiledParameterInfo* tiled_param_info =
+      kernel_info->GetTiledParameterInfo();
+  // TODO(jlebar): Add AA metadata to this load.
+  llvm::Instruction* load_from_shmem_buffer =
+      Load(GEP(tiled_param_info->GetBufferForParameter(0),
+               {b_.getInt64(0), x_loc, y_loc}),
+           "output_element");
+  llvm_ir::IrArray output_array = GetIrArray(*hlo, *hlo);
+  Shape output_reduced_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+      hlo->shape().element_type(),
+      kernel_info->GetKernelMappingScheme()->GetDimensionsInElements());
+  // When the output_reduced_shape is a 0-2-1 transpose of the input shape,
+  // the 0-2-1 transpose is achieved through EmitWriteArrayElement.
+  output_array.CastToShape(output_reduced_shape, &b_)
+      .EmitWriteArrayElement(index, load_from_shmem_buffer, &b_);
+}
 
-  // Construct IrArrays for the inputs and outputs.
+// Emits code to process a tensor element in a tile for the given kLoop fusion
+// HLO containing parameters that are 0-2-1 transpose of its outputs.
+//
+// index: The index for the first output element in the normalized tensor, that
+//   is the resulting tensor after collapsing contiguous dimensions that play
+//   the same role in the transpose.
+// kernel_info: Other information to support the kernel code generation.
+// y_loc: The y coordinate within a tile.
+// x_loc: The x coordinate within a tile.
+void IrEmitterUnnested::EmitTileElementForFusion(
+    HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+    const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+    llvm::Value* x_loc) {
+  llvm_ir::TiledParameterInfo* tiled_param_info =
+      kernel_info->GetTiledParameterInfo();
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(*hlo);
-  int64 num_outputs = output_arrays.size();
-  std::vector<IrArray> param_arrays = ConstructIrArrayForInputs(*hlo);
-  int64 num_params = param_arrays.size();
+  GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_,
+                                     GetNestedComputer());
+  FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(hlo),
+                               &elem_emitter);
+  tiled_param_info->set_y(y_loc);
+  tiled_param_info->set_x(x_loc);
+  fused_emitter.SetTiledParameterInfo(tiled_param_info);
+  TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter));
+  IrArray::Index untiled_index =
+      kernel_info->GetKernelMappingScheme()->GetUnnormalizedIndex(
+          index, output_arrays[0].GetShape());
+  const llvm_ir::ElementGenerator& output_generator =
+      fused_emitter.GetRootGenerator();
+  llvm::Value* output_value = output_generator(untiled_index).ValueOrDie();
+  if (hlo->IsMultiOutputFusion()) {
+    DCHECK(output_value->getType()->isStructTy());
+    DCHECK_EQ(output_value->getType()->getStructNumElements(),
+              output_arrays.size());
+    for (int64 i = 0; i < output_arrays.size(); ++i) {
+      output_arrays[i].EmitWriteArrayElement(
+          untiled_index, ExtractValue(output_value, i), &b_);
+    }
+  } else {
+    output_arrays[0].EmitWriteArrayElement(untiled_index, output_value, &b_);
+  }
+}
 
+// Emits a block of tiles, given a function object to emit one tile.
+void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
+                                  const KernelCodegenInfo* kernel_info,
+                                  KernelSupportLibrary& ksl,
+                                  llvm::Type* index_ty) {
+  KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme();
+  absl::Span<const int64> dims_in_tile = mapping_scheme->GetDimensionsInTiles();
+  absl::Span<const int64> dims_in_block =
+      mapping_scheme->GetDimensionsInBlocks();
+  absl::Span<const int64> block_sizes = mapping_scheme->GetBlockSizes();
+  auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
+    return llvm::ConstantInt::get(index_ty, c);
+  };
+
+  // Emit all the tiles for a given dimension in a tile block.
+  auto emit_tiles_for_block_dim =
+      [&](const string& loop_name, const IrArray::Index& starting_tile,
+          int dim_id,
+          const std::function<void(const IrArray::Index& tile_index)>
+              emit_next_block_dim) {
+        if (block_sizes[dim_id] == 1) {
+          emit_next_block_dim(starting_tile);
+        } else {
+          llvm::Value* starting_tile_index_for_dim = starting_tile[dim_id];
+          llvm::Value* block_size_for_dim =
+              index_typed_constant(block_sizes[dim_id]);
+          llvm::Value* block_id_for_dim =
+              b_.CreateUDiv(starting_tile_index_for_dim, block_size_for_dim);
+          llvm::Value* last_block_for_dim =
+              index_typed_constant(dims_in_block[dim_id] - 1);
+          llvm::Value* last_block_size_for_dim = index_typed_constant(
+              dims_in_tile[dim_id] -
+              (dims_in_block[dim_id] - 1) * block_sizes[dim_id]);
+          llvm::Value* num_tiles_in_block =
+              Select(ICmpEQ(last_block_for_dim, block_id_for_dim),
+                     last_block_size_for_dim, block_size_for_dim);
+
+          ksl.ForReturnVoid(
+              loop_name,
+              /*start=*/index_typed_constant(0),
+              /*end=*/num_tiles_in_block,
+              /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
+                IrArray::Index tile_index = starting_tile.AddOffsetToDim(
+                    block_dim_induction_var, dim_id, &b_);
+                emit_next_block_dim(tile_index);
+              });
+        }
+      };
+
+  absl::Span<const int64> reduced_dims =
+      mapping_scheme->GetDimensionsInElements();
+  const bool block_contains_multi_tiles =
+      mapping_scheme->GetNumberOfTilesInOneBlock() > 1;
+
+  // Emit the tile with a given tile_index, by calculating the tight bounds for
+  // each dimension of the tile and then calling emit_one_tile.
+  auto emit_one_tile_for_tile_index = [&](const IrArray::Index& tile_index) {
+    std::vector<llvm::Value*> output_tile_bounds(3);
+    for (int i = KernelMappingScheme::DimY; i < KernelMappingScheme::DimTot;
+         ++i) {
+      int64 tile_size_for_dim = mapping_scheme->GetTileSizeForDimension(i);
+      // Only last row or column may not have full size.
+      llvm::Value* is_last_row =
+          ICmpEQ(tile_index[i], index_typed_constant(dims_in_tile[i] - 1));
+      int64 partial_row_size =
+          reduced_dims[i] - (dims_in_tile[i] - 1) * tile_size_for_dim;
+      output_tile_bounds[i] =
+          Select(is_last_row, index_typed_constant(partial_row_size),
+                 index_typed_constant(tile_size_for_dim), "tile_bound");
+    }
+
+    IrArray::Index tile_origin =
+        mapping_scheme->GetElementIndexForTileOrigin(tile_index);
+    emit_one_tile(tile_origin, output_tile_bounds, block_contains_multi_tiles);
+  };
+
+  const IrArray::Index starting_block =
+      mapping_scheme->EmitBlockIndex(index_ty);
+  const IrArray::Index starting_tile_for_dim_z =
+      mapping_scheme->GetTileIndexForBlockOrigin(starting_block);
+
+  // Emit the three dimensional block of tiles.
+  emit_tiles_for_block_dim(
+      "block_dim_z", starting_tile_for_dim_z, KernelMappingScheme::DimZ,
+      [&](const IrArray::Index& starting_tile_for_dim_y) {
+        emit_tiles_for_block_dim(
+            "block_dim_y", starting_tile_for_dim_y, KernelMappingScheme::DimY,
+            [&](const IrArray::Index& starting_tile_for_dim_x) {
+              emit_tiles_for_block_dim("block_dim_x", starting_tile_for_dim_x,
+                                       KernelMappingScheme::DimX,
+                                       emit_one_tile_for_tile_index);
+            });
+      });
+}
+
+// Emits a kernel for the hlo instruction using the given kernel mapping scheme.
+//
+// unnested_hlo: The unnested hlo instruction for which the kernel is generated.
+//   Currently, these hlo instructions are supported: kLoop fusion, kCopy.
+// tiled_param_ids: The IDs for the parameters that are 0-2-1 transpose of
+//   other tensors with the same dimensions and need to be tiled and tranposed.
+// mapping_scheme: The tiling scheme to use.
+// kernel_generator: Contains function objects for code generation, such as
+//   element generator, block prologue and epilogue generators.
+// kernel_info: Represent other information to support the code generation
+//   of the tiled kernel for the hlo.
+LaunchDimensions IrEmitterUnnested::EmitKernel(
+    HloInstruction* unnested_hlo, absl::Span<const int64> tiled_param_ids,
+    const KernelCodeGenerator& kernel_generator,
+    KernelCodegenInfo* kernel_info) {
+  KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme();
+
+  std::vector<IrArray> param_arrays = ConstructIrArrayForInputs(*unnested_hlo);
+  int64 num_params = param_arrays.size();
   // Allocate shared memory buffers to store the tiled inputs.
   std::vector<llvm::Value*> param_shmem_buffers(num_params, nullptr);
   for (int64 id : tiled_param_ids) {
-    const HloInstruction* param = hlo->operand(id);
-    // Add 1 to the minor dimension to reduce shared memory bank conflicts.
-    llvm::Type* tile_type = llvm::ArrayType::get(
-        llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType(
-                                 param->shape().element_type(), module_),
-                             kTileSize + 1),
-        kTileSize);
-    const int kNVPTXSharedMemoryAddrSpace = 3;
-    auto* tile_base_ptr = new llvm::GlobalVariable(
-        *b_.GetInsertBlock()->getParent()->getParent(), tile_type,
-        /*isConstant=*/false, llvm::GlobalValue::PrivateLinkage,
-        llvm::UndefValue::get(tile_type),
-        llvm_ir::AsStringRef(IrName(hlo, StrCat("tile", id))), nullptr,
-        llvm::GlobalValue::NotThreadLocal, kNVPTXSharedMemoryAddrSpace);
-    param_shmem_buffers[id] = tile_base_ptr;
+    const HloInstruction* param = unnested_hlo->operand(id);
+    param_shmem_buffers[id] =
+        mapping_scheme->GetSharedMemoryBufferForElementType(
+            llvm_ir::PrimitiveTypeToIrType(param->shape().element_type(),
+                                           module_),
+            IrName(unnested_hlo, StrCat("tile", id)));
     VLOG(3) << "Added shmem buffer for parameter " << id << ": "
-            << llvm_ir::DumpToString(*tile_base_ptr);
-  }
-
-  // The 0-2-1 shape of the tiling scheme is the reduced shape of the HLO result
-  // for the purpose of tiling. Calculate the logical output dimensions in the
-  // tile from the reduced output dimensions.
-  std::vector<int64> output_dims_in_tiles = std::vector<int64>(
-      reduced_output_dims.begin(), reduced_output_dims.end());
-  CHECK_EQ(output_dims_in_tiles.size(), 3);
-  for (int i = 1; i < 3; ++i) {
-    output_dims_in_tiles[i] =
-        CeilOfRatio<int64>(output_dims_in_tiles[i], kTileSize);
+            << llvm_ir::DumpToString(*param_shmem_buffers[id]);
   }
-  const int64 num_tiles =
-      absl::c_accumulate(output_dims_in_tiles, 1, std::multiplies<int64>());
-  LaunchDimensions launch_dimensions(num_tiles, kThreadsPerTile);
 
-  llvm::Type* index_ty =
-      GetIndexTypeForKernel(hlo, launch_dimensions.launch_bound(), &b_);
+  CHECK_EQ(mapping_scheme->GetThreadsPerTile() % kWarpSize, 0);
+  LaunchDimensions launch_dimensions = LaunchDimensions(
+      mapping_scheme->GetNumberOfBlocks(), mapping_scheme->GetThreadsPerTile());
+  llvm::Type* index_ty = GetIndexTypeForKernel(
+      unnested_hlo, launch_dimensions.launch_bound(), &b_);
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
 
-  // Cast each output IrArray to its corresponding reduced shape and keep the
-  // reduced shape live during IR emission.
-  std::vector<IrArray> output_in_reduced_shape_arrays;
-  std::vector<Shape> output_reduced_shapes;
-  CHECK_EQ(ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
-               *hlo, output_arrays, reduced_output_dims, &output_reduced_shapes,
-               &output_in_reduced_shape_arrays),
-           num_outputs);
+  // For multioutput fusion, one thread needs to output a tuple with pointers to
+  // all the individual outputs.  We could do this at any point in the kernel,
+  // but we do it at the beginning in the hopes of reducing register pressure,
+  // since we touch threadIdx.x and blockIdx.x at the beginning of the kernel
+  // *anyway*.
+  if (unnested_hlo->IsMultiOutputFusion()) {
+    TF_CHECK_OK(KernelSupportLibrary(&b_).If(
+        "emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+          llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo),
+                             ConstructIrArrayForOutputs(*unnested_hlo), &b_,
+                             module_);
+          return Status::OK();
+        }));
+  }
 
   // For each tiled parameter, cast its input IrArray to the corresponding
   // reduced shape and keep the reduced shape live during IR emission.
   std::vector<IrArray> param_in_reduced_shape_arrays;
   std::vector<Shape> param_reduced_shapes;
-  CHECK_EQ(ConstructInputReducedShapeAndCastInputIrArrayToShape(
-               *hlo, param_arrays, param_shmem_buffers, reduced_output_dims,
-               &param_reduced_shapes, &param_in_reduced_shape_arrays),
-           num_params);
+  absl::Span<const int64> reduced_dims =
+      mapping_scheme->GetDimensionsInElements();
+  int num_shapes = ConstructInputReducedShapeAndCastInputIrArrayToShape(
+      *unnested_hlo, param_arrays, param_shmem_buffers, reduced_dims,
+      &param_reduced_shapes, &param_in_reduced_shape_arrays);
+  DCHECK_EQ(num_shapes, num_params);
 
   // Calculate the starting element coordinate within a tile for the current
   // thread, (y, x) from thread_id.
   llvm::Value* x;
   llvm::Value* y;
-  std::tie(y, x) = CalculateYXCoordinateWithinTile(
-      &b_, index_typed_constant(kTileSize), kThreadsPerTile);
-
-  // Calculate the index for the current output tile from block_id.
-  const IrArray::Index output_tile_index(
-      GetBlockIdx(&b_, index_ty, num_tiles),
-      ShapeUtil::MakeShapeWithDescendingLayout(PRED /*arbitrary*/,
-                                               output_dims_in_tiles),
-      &b_);
-
-  // Output tile origin is the index for the first element of the current output
-  // tile.
-  const IrArray::Index output_tile_origin = [&] {
-    IrArray::Index index = output_tile_index;
-    for (int i = 1; i < 3; ++i) {
-      index[i] = Mul(output_tile_index[i], index_typed_constant(kTileSize),
-                     "tile_origin." + std::to_string(i));
-    }
-    return index;
-  }();
+  std::tie(y, x) = mapping_scheme->EmitThreadYXCoordinate(index_ty);
 
-  // Calculate the input tile origin from the output tile origin.
-  const IrArray::Index input_tile_origin(
-      Permute({0, 2, 1}, output_tile_origin.multidim()));
-
-  // Calculate the current output tile bounds in each of the logical dimensions.
-  std::vector<llvm::Value*> output_tile_bounds(3);
-  for (int i = 1; i < 3; ++i) {
-    // Only last row or column may not have full size.
-    output_tile_bounds[i] =
-        Select(ICmpEQ(output_tile_index[i],
-                      index_typed_constant(output_dims_in_tiles[i] - 1)),
-               index_typed_constant(reduced_output_dims[i] -
-                                    (output_dims_in_tiles[i] - 1) * kTileSize),
-               index_typed_constant(kTileSize), "kTileSize");
-  }
+  kernel_info->SetLaneId(
+      mapping_scheme->GetNumberOfThreadsForDimensionX() == kWarpSize ? x
+                                                                     : nullptr);
 
   KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll);
-
   // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck.
   auto emit_tiled_elemental_code_with_bounds_check =
       [&](const IrArray::Index& index, const string& loop_name,
-          llvm::Value* tile_width, llvm::Value* tile_height,
-          const std::function<void(const IrArray::Index&, llvm::Value*)>&
-              emit_elem_function) {
-        EmitTiledElementalCodeWithBoundsCheck(
-            kTileSize, kNumRows, index, loop_name, &ksl, &b_, y, x, tile_width,
-            tile_height, emit_elem_function);
+          llvm::Value* tile_height, llvm::Value* tile_width,
+          const std::function<void(const IrArray::Index&, llvm::Value*,
+                                   llvm::Value*)>& emit_elem_function) {
+        EmitTiledElementalCodeWithBoundsCheck(mapping_scheme, index, loop_name,
+                                              &ksl, &b_, y, x, tile_height,
+                                              tile_width, emit_elem_function);
       };
 
-  // Adds `addend` to the given `dim` of `index`.
-  auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) {
-    index[dim] = Add(index[dim], addend);
-    return index;
-  };
-  const IrArray::Index input_index =
-      offset_dim(offset_dim(input_tile_origin, x, /*dim=*/2), y, /*dim=*/1);
-
-  // Copy input parameter values to shared memory buffers:
-  // tile[y, x] = input[index]
-  emit_tiled_elemental_code_with_bounds_check(
-      input_index, "input", output_tile_bounds[1], output_tile_bounds[2],
-      [&](const IrArray::Index& index, llvm::Value* y_loc) {
-        for (int64 id : tiled_param_ids) {
-          IrArray& input_in_logical_shape = param_in_reduced_shape_arrays[id];
-          llvm::Value* shmem_buffer = param_shmem_buffers[id];
-          // TODO(jlebar): Add AA metadata to this store.  Tile buffers are
-          // global variables, so LLVM can't infer much about it.
-          Store(input_in_logical_shape.EmitReadArrayElement(index, &b_,
-                                                            "input_element"),
-                GEP(shmem_buffer, {index_typed_constant(0), y_loc, x}));
-        }
-      });
+  auto emit_one_tile = [&](const IrArray::Index& output_tile_origin,
+                           absl::Span<llvm::Value* const> output_tile_bounds,
+                           bool block_contains_multi_tiles) {
+    // Calculate the input tile origin from the output tile origin.
+    const IrArray::Index input_tile_origin(
+        Permute({0, 2, 1}, output_tile_origin.multidim()));
+
+    const IrArray::Index input_index =
+        input_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
+            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
+
+    // Copy input parameter values to shared memory buffers:
+    // tile[y, x] = input[index]
+    // Note that tile_width and tile_height are flipped here because we are
+    // reading a transposed tile.
+    emit_tiled_elemental_code_with_bounds_check(
+        input_index, "input", output_tile_bounds[2], output_tile_bounds[1],
+        [&](const IrArray::Index& index, llvm::Value* y_loc,
+            llvm::Value* x_loc) {
+          for (int64 id : tiled_param_ids) {
+            IrArray& input_in_logical_shape = param_in_reduced_shape_arrays[id];
+            llvm::Value* shmem_buffer = param_shmem_buffers[id];
+            // TODO(jlebar): Add AA metadata to this store.  Tile buffers are
+            // global variables, so LLVM can't infer much about it.
+            Store(input_in_logical_shape.EmitReadArrayElement(index, &b_,
+                                                              "input_element"),
+                  GEP(shmem_buffer, {index_typed_constant(0), y_loc, x_loc}));
+          }
+        });
 
-  // Wait for all threads to reach this point, lest we copy a value from tile to
-  // output before the other thread copies it from input to tile.
-  // This is `__syncthreads` in CUDA.
-  llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
+    // If shared memory transpose is needed, wait for all threads to reach this
+    // point, lest we copy a value from tile to output before the other thread
+    // copies it from input to tile. This is `__syncthreads` in CUDA.
+    if (!tiled_param_ids.empty()) {
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
+    }
 
-  llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
+    llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
+    kernel_info->SetTiledParamInfo(&tiled_param_info);
 
-  const IrArray::Index output_index =
-      offset_dim(offset_dim(output_tile_origin, x, /*dim=*/2), y, /*dim=*/1);
+    const IrArray::Index output_index =
+        output_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
+            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
 
-  // Write to output[index] by emitting code like normal, except that values for
-  // the tiled parameters are read from the shmem buffers.
-  if (hlo->opcode() == HloOpcode::kCopy) {
-    emit_tiled_elemental_code_with_bounds_check(
-        output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
-        [&](const IrArray::Index& index, llvm::Value* y_loc) {
-          // TODO(jlebar): Add AA metadata to this load.
-          llvm::Instruction* load_from_shmem_buffer =
-              Load(GEP(param_shmem_buffers[0], {b_.getInt64(0), x, y_loc}),
-                   "output_element");
-          output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
-              index, load_from_shmem_buffer, &b_);
-        });
-  } else {
-    CHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
+    // Write to output[index] by emitting code like normal, except that values
+    // for the tiled parameters are read from the shmem buffers.
     emit_tiled_elemental_code_with_bounds_check(
-        output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
-        [&](const IrArray::Index& index, llvm::Value* y_loc) {
-          GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_,
-                                             GetNestedComputer());
-          FusedIrEmitter fused_emitter(param_arrays, &elem_emitter);
-          tiled_param_info.set_y(y_loc);
-          fused_emitter.SetTiledParameterInfo(&tiled_param_info);
-          TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter));
-          IrArray::Index untiled_index = llvm_ir::GetUnreducedOutputIndex(
-              index, output_reduced_shapes[0], output_arrays[0].GetShape(),
-              &b_);
-          const llvm_ir::ElementGenerator& output_generator =
-              fused_emitter.GetRootGenerator();
-          llvm::Value* output_value =
-              output_generator(untiled_index).ValueOrDie();
-          if (hlo->IsMultiOutputFusion()) {
-            CHECK(output_value->getType()->isStructTy());
-            CHECK_EQ(output_value->getType()->getStructNumElements(),
-                     output_in_reduced_shape_arrays.size());
-            for (int64 i = 0; i < output_in_reduced_shape_arrays.size(); ++i) {
-              output_in_reduced_shape_arrays[i].EmitWriteArrayElement(
-                  index, ExtractValue(output_value, i), &b_);
-            }
-          } else {
-            output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
-                index, output_value, &b_);
-          }
+        output_index, "output", output_tile_bounds[1], output_tile_bounds[2],
+        [&](const IrArray::Index& index, llvm::Value* y_loc,
+            llvm::Value* x_loc) {
+          kernel_generator.GetTileElementGenerator()(unnested_hlo, index,
+                                                     kernel_info, y_loc, x_loc);
         });
+    // If a tile block contains multiple tiles and shared memory buffers are
+    // used, we need to wait for all threads to finish using the shared memory
+    // buffer for the current tile before we move on to process the next tile
+    // and overwrite the shared memory buffers.
+    if (block_contains_multi_tiles && !tiled_param_ids.empty()) {
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
+    }
+  };
+
+  const BlockPrologueGenerator& block_prologue_generator =
+      kernel_generator.GetBlockPrologueGenerator();
+  if (block_prologue_generator) {
+    block_prologue_generator(unnested_hlo, kernel_info);
   }
 
-  // For multioutput fusion, emit a tuple with all the individual outputs.
-  if (hlo->IsMultiOutputFusion()) {
-    llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), output_arrays, &b_, module_);
+  EmitBlock(std::move(emit_one_tile), kernel_info, ksl, index_ty);
+
+  const BlockEpilogueGenerator& block_epilogue_generator =
+      kernel_generator.GetBlockEpilogueGenerator();
+  if (block_epilogue_generator) {
+    block_epilogue_generator(unnested_hlo, kernel_info);
   }
 
   return launch_dimensions;
 }
 
+// Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
+// algorithm to improve the memory access patterns for the input parameters
+// with a shape that is a 0-2-1 transpose of the output tensor shape.
+//
+// For the purpose of tiling, the output tensors have a logical shape of three
+// components 0-2-1 while the relevant input parameters have a logical shape
+// of three components 0-1-2 in the order major to minor. The x- and y-
+// dimensions of the tensors are tiled in square tiles with an edge length
+// `kTileSize`. Each thread block of `kTileSize` x `kNumRows` threads
+// transposes one tile: each thread copies kTileSize/kNumRows elements from
+// the input to a shared memory tile, then the otherwise "regular HLO kernel"
+// reads from the shared memory instead of the original input.
+//
+// This is similar to the following CUDA algorithm in TensorFlow:
+// https://goo.gl/MStRV6.
+//
+// `kTileSize` should usually be same as warp size. We currently choose 32 for
+// `kTileSize` and 4 for `kNumRows`. The CUDA algorithm uses 8 for `kNumRows`.
+//
+// TODO(b/33320379): Here each block transposes 1 tile. It may be more
+// efficient to launch fewer blocks so each transposes many tiles.
+LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
+    HloInstruction* hlo, absl::Span<const int64> reduced_output_dims,
+    absl::Span<const int64> tiled_param_ids) {
+  constexpr int kNumRows = 4;
+  KernelMappingScheme mapping_scheme(
+      reduced_output_dims, /*tile_size_y=*/kWarpSize,
+      /*tile_size_x=*/kWarpSize, /*req_block_sizes=*/{1, 1, 1},
+      /*num_threads_y=*/kNumRows,
+      /*num_threads_x=*/kWarpSize, &b_);
+  TileElementGenerator element_generator;
+  if (hlo->opcode() == HloOpcode::kCopy) {
+    element_generator = [&](HloInstruction* hlo,
+                            const llvm_ir::IrArray::Index& index,
+                            const KernelCodegenInfo* kernel_info,
+                            llvm::Value* y_loc, llvm::Value* x_loc) {
+      EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc);
+    };
+  } else {
+    DCHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
+    element_generator = [&](HloInstruction* hlo,
+                            const llvm_ir::IrArray::Index& index,
+                            const KernelCodegenInfo* kernel_info,
+                            llvm::Value* y_loc, llvm::Value* x_loc) {
+      EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc);
+    };
+  }
+  KernelCodegenInfo kernel_info(&mapping_scheme);
+  KernelCodeGenerator kernel_generator(std::move(element_generator));
+  return EmitKernel(hlo, tiled_param_ids, kernel_generator, &kernel_info);
+}
+
+namespace {
+// Returns true to indicate it is safe to use the tile based shared memory
+// transpose implementation to implement the kernel for the instruction.
+//
+// An instruction is not safe for such an implementation if it can change the
+// element order of a tensor without changing the dimension of the tensor, and
+// the instruction has a corresponding elemental_ir_emitter.
+bool IsInstructionSafeForTileBasedTranspose(const HloInstruction* hlo) {
+  auto is_safe_for_tile_based_transpose = [&](const HloInstruction* instr) {
+    HloOpcode opcode = instr->opcode();
+    CHECK_NE(opcode, HloOpcode::kFusion);
+    return (opcode != HloOpcode::kReverse && opcode != HloOpcode::kGather);
+  };
+
+  if (hlo->opcode() == HloOpcode::kFusion) {
+    return absl::c_all_of(hlo->fused_instructions_computation()->instructions(),
+                          is_safe_for_tile_based_transpose);
+  }
+
+  return is_safe_for_tile_based_transpose(hlo);
+}
+}  // namespace
+
 bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
   HloOpcode opcode = hlo->opcode();
   CHECK(opcode == HloOpcode::kFusion || opcode == HloOpcode::kCopy);
@@ -3206,8 +3731,8 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
                                   ? ShapeUtil::GetSubshape(hlo->shape(), {0})
                                   : hlo->shape();
 
-  // If the output_shape is reduced to 021 shape, find all the parameters of the
-  // hlo that are in the corresponding 012 shape.
+  // If the output_shape is reduced to 021 shape, find all the parameters of
+  // the HLO that are in the corresponding 012 shape.
   std::vector<int64> params_012;
   optional<std::vector<int64>> reduced_dims_021;
   for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
@@ -3239,10 +3764,14 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
     return false;
   }
 
+  if (!IsInstructionSafeForTileBasedTranspose(hlo)) {
+    return false;
+  }
+
   // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the
-  // elements are of size 4 bytes), and CUDA has an architectural limit of 48kb
-  // shared memory per SM.  (This is increased to 96kb in Volta, but we don't
-  // use this, in part because it eats into our L1 cache space.)
+  // elements are of size 4 bytes), and CUDA has an architectural limit of
+  // 48kb shared memory per SM.  (This is increased to 96kb in Volta, but we
+  // don't use this, in part because it eats into our L1 cache space.)
   //
   // For correctness we need to ensure that we don't make more than 48kb worth
   // of shmem tiles per block.  And for performance, we'd probably like to use
@@ -3250,9 +3779,9 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
   // gpu core.
   //
   // We say without benchmarks that we want at least 3 threads/block,
-  // corresponding to 3 shmem tiles if the elements are 32 bits wide.  We choose
-  // which params get the shmem transpose treatment arbitrarily; it's not clear
-  // if there's a Right Choice.
+  // corresponding to 3 shmem tiles if the elements are 32 bits wide.  We
+  // choose which params get the shmem transpose treatment arbitrarily; it's
+  // not clear if there's a Right Choice.
   //
   // This is only sound if tiled transposes are the only place where we use
   // shared memory in fusions.  If in the future other fusible ops use shared
@@ -3274,12 +3803,13 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
   }
 
   VLOG(3) << "EmitHlo021Tile Emitting hlo tile 0-2-1" << hlo->ToString();
-  thunk_sequence_->emplace_back(
-      BuildKernelThunk(hlo, /*implements_whole_instruction=*/true));
+  std::unique_ptr<KernelThunk> kernel_thunk =
+      BuildKernelThunk(hlo, /*implements_whole_instruction=*/true);
   const LaunchDimensions launch_dimensions =
       EmitHlo021Tile(hlo, *reduced_dims_021, params_012);
-  UpdateLaunchDimensions(launch_dimensions, LastThunk(),
+  UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(),
                          ir_emitter_context_->llvm_module());
+  AddThunkToThunkSequence(std::move(kernel_thunk));
 
   return true;
 }
@@ -3305,10 +3835,10 @@ Status IrEmitterUnnested::EmitConstantGlobals() {
     }
 
     // These globals will be looked up by name by GpuExecutable so we need to
-    // give them an external linkage.  Not all of their uses are visible in the
-    // LLVM IR (e.g. TupleThunk) so we can't give then a linkage that merely
-    // preserves their names (like available_externally), we also need to ensure
-    // that they stick around even if they're "unused".
+    // give them an external linkage.  Not all of their uses are visible in
+    // the LLVM IR (e.g. TupleThunk) so we can't give then a linkage that
+    // merely preserves their names (like available_externally), we also need
+    // to ensure that they stick around even if they're "unused".
     //
     // We may have to be more more clever here in the future if we notice that
     // we're keeping around too many globals because of their linkage.
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index bd5db7205155dc6b15ddea069e172bbd8f419996..e09ed657a812be6ab4859a0e365a51c45a37bfed 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -17,7 +17,9 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_
 
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
+#include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h"
 
 namespace xla {
@@ -46,6 +48,94 @@ namespace gpu {
 //
 class IrEmitterUnnested : public IrEmitter {
  public:
+  // Parameter block_contains_multi_tiles indicates whether a tile block
+  // consists of multiple tiles or not. If the tile block contains only one
+  // tile, there is no need to use atomic operation to accumulate a local result
+  // to a global result to implement reduction.
+  using TileGenerator =
+      std::function<void(const llvm_ir::IrArray::Index& output_tile_origin,
+                         absl::Span<llvm::Value* const> output_tile_bounds,
+                         bool block_contains_multi_tiles)>;
+  // KernelCodegenInfo records the common information to support the code
+  // generation for a kernel to process tensor elements by blocks. A block of
+  // tensor elements may contain one or multiple tiles. The code generators that
+  // generate code for tile elements or block prologue/epilogue refer to this
+  // class in their prototypes. If the implementations of such code generators
+  // require other information that are specific to the HLO instructions, the
+  // implementations need to define and use derived classes of this class.
+  class KernelCodegenInfo {
+   public:
+    explicit KernelCodegenInfo(llvm_ir::KernelMappingScheme* mapping_scheme)
+        : mapping_scheme_(mapping_scheme),
+          tiled_param_info_(nullptr),
+          lane_id_(nullptr) {}
+
+    void SetLaneId(llvm::Value* v) { lane_id_ = v; }
+    void SetTiledParamInfo(llvm_ir::TiledParameterInfo* tiled_param_info) {
+      CHECK_EQ(tiled_param_info_, nullptr);
+      tiled_param_info_ = tiled_param_info;
+    }
+
+    llvm::Value* GetLaneId() const { return lane_id_; }
+    llvm_ir::KernelMappingScheme* GetKernelMappingScheme() const {
+      return mapping_scheme_;
+    }
+    llvm_ir::TiledParameterInfo* GetTiledParameterInfo() const {
+      return tiled_param_info_;
+    }
+
+   private:
+    llvm_ir::KernelMappingScheme* mapping_scheme_;
+    llvm_ir::TiledParameterInfo* tiled_param_info_;
+    llvm::Value* lane_id_;
+  };
+
+  // A function object to prepare for the code generation for a tile block.
+  using BlockPrologueGenerator =
+      std::function<void(HloInstruction* hlo, KernelCodegenInfo* kernel_info)>;
+  // A function object to finalize the code generation for a tile block.
+  using BlockEpilogueGenerator =
+      std::function<void(HloInstruction* hlo, KernelCodegenInfo* kernel_info)>;
+  // A function object to generate code to process one element in a tile.
+  //
+  // hlo: the instruction for which the code is generated for.
+  // index: the index for the first output element of the current thread.
+  // y_loc: The y coordinate within a tile.
+  // x_loc: The x coordinate within a tile.
+  // kernel_info: Other information to support the kernel code generation.
+  using TileElementGenerator = std::function<void(
+      HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+      const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+      llvm::Value* x_loc)>;
+
+  // KernelCodeGenerator records the code generator objects that generate code
+  // for tile elements or tile block prologue/epilogue.
+  class KernelCodeGenerator {
+   public:
+    explicit KernelCodeGenerator(
+        TileElementGenerator tile_element_generator,
+        BlockPrologueGenerator block_prologue_generator = {},
+        BlockEpilogueGenerator block_epilogue_generator = {})
+        : tile_element_generator_(std::move(tile_element_generator)),
+          block_prologue_generator_(std::move(block_prologue_generator)),
+          block_epilogue_generator_(std::move(block_epilogue_generator)) {}
+
+    const TileElementGenerator& GetTileElementGenerator() const {
+      return tile_element_generator_;
+    }
+    const BlockPrologueGenerator& GetBlockPrologueGenerator() const {
+      return block_prologue_generator_;
+    }
+    const BlockEpilogueGenerator& GetBlockEpilogueGenerator() const {
+      return block_epilogue_generator_;
+    }
+
+   private:
+    TileElementGenerator tile_element_generator_;
+    BlockPrologueGenerator block_prologue_generator_;
+    BlockEpilogueGenerator block_epilogue_generator_;
+  };
+
   IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
                     const HloComputation* hlo_computation,
                     IrEmitterContext* ir_emitter_context);
@@ -76,11 +166,12 @@ class IrEmitterUnnested : public IrEmitter {
   Status HandleInfeed(HloInstruction* xla_infeed) override;
   Status HandleOutfeed(HloInstruction* outfeed) override;
   Status HandleRng(HloInstruction* random) override;
+  Status HandleScatter(HloInstruction* scatter) override;
   Status HandleSelect(HloInstruction* select) override;
   Status HandleSort(HloInstruction* sort) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
   Status HandleCrossReplicaSum(HloInstruction* crs) override;
-  Status HandleAfterAll(HloInstruction* gen_token) override;
+  Status HandleAfterAll(HloInstruction* after_all) override;
 
   Status EmitTargetElementLoop(
       const HloInstruction& hlo,
@@ -96,10 +187,10 @@ class IrEmitterUnnested : public IrEmitter {
   Status EmitConstantGlobals();
 
  private:
-  // Builds the appropriate thunk for the instruction hlo and returns the owning
-  // pointer to it. The caller needs to make sure `inst` outlives the lifetime
-  // of the returned Thunk object.
-  std::unique_ptr<Thunk> BuildThunk(const HloInstruction* hlo);
+  // Add a owning Thunk object to the thunk sequence.
+  void AddThunkToThunkSequence(std::unique_ptr<Thunk> thunk) {
+    thunk_sequence_->emplace_back(std::move(thunk));
+  }
 
   // Builds the prototype of the IR kernel for `inst` and adds it to the module.
   // This kernel takes as arguments pointers to the given buffer allocations.
@@ -124,8 +215,8 @@ class IrEmitterUnnested : public IrEmitter {
   // [height x width], but can be bitcast to [height x width] with "height"
   // being the major dimension.
   Status EmitColumnReduction(
-      int64 height, int64 width, HloInstruction* reduce,
-      const Shape& input_shape,
+      KernelThunk* kernel_thunk, int64 height, int64 width,
+      HloInstruction* reduce, const Shape& input_shape,
       absl::Span<const llvm_ir::ElementGenerator> input_gens,
       absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
       absl::Span<HloComputation* const> reducers,
@@ -139,8 +230,8 @@ class IrEmitterUnnested : public IrEmitter {
   // [depth x height x width], but can be bitcast to [depth x height x width]
   // with "depth" being the most major dimension.
   Status EmitRowReduction(
-      int64 depth, int64 height, int64 width, HloInstruction* reduce,
-      const Shape& input_shape,
+      KernelThunk* kernel_thunk, int64 depth, int64 height, int64 width,
+      HloInstruction* reduce, const Shape& input_shape,
       absl::Span<const llvm_ir::ElementGenerator> input_gens,
       absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
       absl::Span<HloComputation* const> reducers,
@@ -150,7 +241,8 @@ class IrEmitterUnnested : public IrEmitter {
 
   // Emits code that reduces a tensor of arbitrary rank to a scalar.
   Status EmitReductionToScalar(
-      HloInstruction* reduce, const Shape& input_shape,
+      KernelThunk* kernel_thunk, HloInstruction* reduce,
+      const Shape& input_shape,
       absl::Span<const llvm_ir::ElementGenerator> input_gens,
       absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
       absl::Span<HloComputation* const> reducers,
@@ -175,7 +267,8 @@ class IrEmitterUnnested : public IrEmitter {
   //
   // Prerequisite: `IsReductionToVector(*reduce)`
   Status EmitReductionToVector(
-      HloInstruction* reduce, const Shape& input_shape,
+      KernelThunk* kernel_thunk, HloInstruction* reduce,
+      const Shape& input_shape,
       absl::Span<const llvm_ir::ElementGenerator> input_gens,
       absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
       absl::Span<const int64> dimensions_to_reduce,
@@ -184,6 +277,14 @@ class IrEmitterUnnested : public IrEmitter {
       absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
           extra_output_gens);
 
+  // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in
+  // the process. `scatter` may be fused, scatter indices are taken from
+  // `scatter_indices_gen`, updates from`updates_gen`. The output buffer is
+  // expected to have the operand values in it already.
+  Status EmitScatter(Thunk* thunk, HloInstruction* scatter,
+                     const llvm_ir::ElementGenerator& scatter_indices_gen,
+                     const llvm_ir::ElementGenerator& updates_gen);
+
   // Returns true if a 0-2-1 tiling algorithm is already used to emit the kernel
   // for the hlo instruction.
   bool CheckAndEmitHloWithTile021(HloInstruction* hlo);
@@ -193,22 +294,32 @@ class IrEmitterUnnested : public IrEmitter {
   LaunchDimensions EmitHlo021Tile(HloInstruction* hlo,
                                   absl::Span<const int64> reduced_output_dims,
                                   absl::Span<const int64> tiled_param_ids);
+  // Emits a kernel for an unnested HLO instruction.
+  LaunchDimensions EmitKernel(HloInstruction* unnested_hlo,
+                              absl::Span<const int64> param_ids,
+                              const KernelCodeGenerator& kernel_generator,
+                              KernelCodegenInfo* kernel_info);
+  void EmitBlock(const TileGenerator& emit_one_tile,
+                 const KernelCodegenInfo* kernel_info,
+                 KernelSupportLibrary& ksl, llvm::Type* index_ty);
+  // Emits code to process a tensor element in a tile for the given kCopy HLO
+  // that performs a 0-2-1 transpose.
+  void EmitTileElementForCopy(HloInstruction* hlo,
+                              const llvm_ir::IrArray::Index& index,
+                              const KernelCodegenInfo* kernel_info,
+                              llvm::Value* y_loc, llvm::Value* x_loc);
+  // Emits code to process a tensor element in a tile for the given kLoop fusion
+  // HLO containing parameters that are 0-2-1 transpose of its outputs.
+  void EmitTileElementForFusion(HloInstruction* hlo,
+                                const llvm_ir::IrArray::Index& index,
+                                const KernelCodegenInfo* kernel_info,
+                                llvm::Value* y_loc, llvm::Value* x_loc);
 
   // Generates the IrArray for each input of an hlo and returns a vector that
   // constains such IrArrays.
   std::vector<llvm_ir::IrArray> ConstructIrArrayForInputs(
       const HloInstruction& hlo);
 
-  // For each output of the `hlo` instruction, constructs the reduced shape for
-  // the output with the given `reduced_output_dims` and cast the original
-  // output IrArray element in `output_arrays` to the reduced shape. Returns
-  // the number of outputs.
-  int ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
-      const HloInstruction& hlo,
-      const std::vector<llvm_ir::IrArray>& output_arrays,
-      absl::Span<const int64> reduced_output_dims,
-      std::vector<Shape>* output_reduced_shapes,
-      std::vector<llvm_ir::IrArray>* output_in_reduced_shape_arrays);
   // For each input of the `hlo` instruction, checks its value in
   // `param_buffers` to find out whether the input has a reduced shape. If the
   // input has a reduced shape, constructs the reduced shape for the input and
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index 8751e3a9c2a4c8da46d3ecd8437629450d4a2ba2..bd53b90b42d8e657a3ee58e7ca03fb60522aae28 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -177,13 +177,6 @@ std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
   }
 
   TargetOptions target_options = InitTargetOptionsFromCodeGenFlags();
-  llvm_ir::SetTargetOptions(
-      /*fast_math_enabled=*/hlo_module_config.debug_options()
-          .xla_gpu_enable_fast_math(),
-      &target_options);
-
-  // Enable FMA synthesis.
-  target_options.AllowFPOpFusion = FPOpFusion::Fast;
 
   // Set the verbose assembly options.
   target_options.MCOptions.AsmVerbose = false;
@@ -206,8 +199,7 @@ std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
   }
   return absl::WrapUnique(target->createTargetMachine(
       triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx60", target_options,
-      Optional<Reloc::Model>(RelocModel), Optional<CodeModel::Model>(CMModel),
-      codegen_opt_level));
+      getRelocModel(), getCodeModel(), codegen_opt_level));
 }
 
 // Adds the standard LLVM optimization passes, based on the speed optimization
@@ -401,8 +393,16 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
   int32 opt_level =
       hlo_module_config.debug_options().xla_backend_optimization_level();
 
-  CHECK_GE(opt_level, 2)
-      << "The XLA GPU backend doesn't support unoptimized code generation";
+  if (opt_level < 2) {
+    LOG(ERROR) << std::string(80, '*');
+    LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
+                  "generation but ";
+    LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level
+               << "!";
+    LOG(ERROR) << "(Supported configuration is "
+                  "--xla_backend_optimization_level >= 2.)";
+    LOG(ERROR) << std::string(80, '*');
+  }
 
   AddOptimizationPasses(opt_level,
                         /*size_level=*/0, target_machine.get(), &module_passes,
@@ -453,18 +453,21 @@ void GPUBackendInit(const HloModuleConfig& hlo_module_config) {
   // * 3-6 gives similar results as 2;
   // * >6 start hurting the performance of at least dot product kernels.
   //
-  // TODO(jingyue): The current threshold only considers the numbr of IR
+  // TODO(jingyue): The current threshold only considers the number of IR
   // instructions which do not accurately reflect the true cost. We need a
   // better cost model.
   FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
-  // TODO(b/22073864): Increase limit when scan memory dependency.
-  // This helps to reduce more redundant load instructions.
+  // Increase limit when scanning memory dependencies.  This helps to reduce
+  // more redundant load instructions.
   //
   // The specific value is currently large enough for s3d in shoc benchmark,
   // which contains a lot of load instructions and many arithmetic instructions
   // between those loads.
   FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});
 
+  // Use div.approx -- it matters for some float-division heavy benchmarks.
+  FeedLLVMWithFlags({"-nvptx-prec-divf32=0"});
+
   llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);
 
   // Initialize the NVPTX target; it's the only target we link with, so call its
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index 835924024b7b7de79624a369a69b07d72ac751ab..01fddcede64d1bb02ab89db5fc9524893c2d47a4 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -41,50 +41,7 @@ GpuMultiOutputFusion::GpuMultiOutputFusion() : MultiOutputFusion(INT64_MAX) {}
 
 bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
                                                      HloInstruction* instr2) {
-  auto get_element_instr =
-      [&](const HloInstruction* instr) -> const HloInstruction* {
-    const HloInstruction* element_instr = instr;
-    if (instr->opcode() == HloOpcode::kFusion) {
-      auto fused_expression_root = instr->fused_expression_root();
-      if (instr->IsMultiOutputFusion()) {
-        // If possible, we want to pick a reduce operand of the fusion root,
-        // because it has the most constraints.
-        for (const auto* inst : fused_expression_root->operands()) {
-          if (IsReductionToVector(*inst)) {
-            return inst;
-          }
-        }
-        return fused_expression_root->operands()[0];
-      } else {
-        element_instr = fused_expression_root;
-      }
-    }
-    return element_instr;
-  };
-
-  auto get_element_shape = [&](const HloInstruction* element_instr) {
-    // Special handling of kReduce instructions -- the fusion
-    // applies to the first operand.
-    if (IsReductionToVector(*element_instr)) {
-      return element_instr->operand(0)->shape();
-    }
-    return element_instr->shape();
-  };
-
-  // The shapes in all tuple operands should agree, unless it is a reduce.
-  // In that case, the operand of the reduce needs to have the same shape
-  // as the other tuple operands, but also we need to compare the output
-  // shapes of the reduces.
-  auto* element_instr_1 = get_element_instr(instr1);
-  auto* element_instr_2 = get_element_instr(instr2);
-  if (element_instr_1->opcode() == HloOpcode::kReduce &&
-      element_instr_2->opcode() == HloOpcode::kReduce &&
-      !ShapeUtil::Equal(element_instr_1->shape(), element_instr_2->shape())) {
-    return false;
-  }
-  // The elementwise output shapes must be the same (including layout).
-  return ShapeUtil::EqualIgnoringFpPrecision(
-      get_element_shape(element_instr_1), get_element_shape(element_instr_2));
+  return ShapesCompatibleForMultiOutputFusion(*instr1, *instr2);
 }
 
 bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) {
@@ -140,6 +97,18 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1,
     return false;
   }
 
+  // The emitter only supports in-place DUS for fusions with a single DUS at the
+  // root. Don't sibling fuse DUS for now.
+  // TODO(b/119178699): Multi-output fusing DUS can improve performance if we
+  // share the input and output buffers and add support to the emitter.
+  if (instr1->fused_expression_root()->opcode() ==
+          HloOpcode::kDynamicUpdateSlice ||
+      (instr2->opcode() == HloOpcode::kFusion &&
+       instr2->fused_expression_root()->opcode() ==
+           HloOpcode::kDynamicUpdateSlice)) {
+    return false;
+  }
+
   // Do this check last, as it may be expensive.
   return !GpuInstructionFusion::FusionWouldBeTooLarge(instr1, instr2);
 }
@@ -180,6 +149,12 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
         VLOG(3) << producer->name() << " is not fusible.";
         continue;
       }
+      // Never multi-output fuse constants.  To the extent that we want to fuse
+      // constants, that should be handled by the regular fusion pass.
+      if (producer->opcode() == HloOpcode::kConstant) {
+        VLOG(3) << producer->name() << " is a constant.";
+        continue;
+      }
       const bool is_loop_fusion =
           producer->opcode() == HloOpcode::kFusion &&
           producer->fusion_kind() == HloInstruction::FusionKind::kLoop;
@@ -187,7 +162,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
         VLOG(3) << producer->name() << " is not a loop fusion.";
         continue;
       }
-      if (!ShapesCompatibleForFusion(producer, consumer)) {
+      if (!ShapesCompatibleForMultiOutputFusion(*producer, *consumer)) {
         VLOG(3) << producer->name() << " has an incompatible shape.";
         continue;
       }
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index 8a6e5327e082791ff857a89e840c6a4f045f0edb..d16c87ba5c63aa582753fe949e9e39ee2d8b81e5 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -505,7 +505,7 @@ TEST_F(MultiOutputFusionTest,
       p1.1 = f16[2,2,2]{2,1,0} parameter(1)
       c0 = f16[] constant(0)
       broadcast = f16[2,2,2]{2,1,0} broadcast(f16[] c0), dimensions={}
-      greater-than = pred[2,2,2]{2,1,0} greater-than(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast)
+      greater-than = pred[2,2,2]{2,1,0} greater-than(f16[2,2,2]{2,1,0} p1.1, f16[2,2,2]{2,1,0} broadcast)
       p0.1 = f16[2,2,2]{2,1,0} parameter(0)
       ROOT select = f16[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f16[2,2,2]{2,1,0} p0.1, f16[2,2,2]{2,1,0} broadcast)
     }
@@ -580,7 +580,7 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
   //   ...
   // where each of the (pi * pj)'s is represented as a fusion node so that
   // multi-output fusion will pay attention to it.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {10, 100});
 
@@ -621,5 +621,39 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
   }
 }
 
+TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) {
+  auto module = ParseHloString(R"(HloModule dus_mof
+    fusion.1 {
+      p.0 = f16[50,96,1024]{2,1,0} parameter(0)
+      p.1 = s32[1]{0} parameter(1)
+      p.2 = f16[1,96,1024]{2,1,0} parameter(2)
+      c.0 = s32[] constant(0)
+      pad = s32[3]{0} pad(p.1, c.0), padding=0_2
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+    }
+
+    fusion.2 {
+      p.0 = f16[50,96,1024]{2,1,0} parameter(0)
+      p.1 = s32[1]{0} parameter(1)
+      p.2 = f16[1,96,1024]{2,1,0} parameter(2)
+      c.0 = s32[] constant(0)
+      pad = s32[3]{0} pad(p.1, c.0), padding=0_2
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+    }
+
+    ENTRY entry {
+      p.00 = f16[50,96,1024]{2,1,0} parameter(0)
+      p.01 = f16[50,96,1024]{2,1,0} parameter(1)
+      p.1 = s32[1]{0} parameter(2)
+      p.2 = f16[1,96,1024]{2,1,0} parameter(3)
+
+      f1 = f16[50,96,1024] fusion(p.00, p.1, p.2), kind=kLoop, calls=fusion.1
+      f2 = f16[50,96,1024] fusion(p.01, p.1, p.2), kind=kLoop, calls=fusion.2
+      ROOT tuple = (f16[50,96,1024],f16[50,96,1024]) tuple(f1, f2)
+    })")
+                    .ValueOrDie();
+  ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index b4ae2e42c7c34774b86d0bf69eef4dba390c0cc5..f3e17d888242a36c268dcbfa0d6530f80cedceb0 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -38,9 +38,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
-#include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_rewriter.h"
-#include "tensorflow/compiler/xla/service/gpu/cudnn_fused_convolution_rewriter.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.h"
+#include "tensorflow/compiler/xla/service/gpu/cudnn_fused_conv_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h"
@@ -54,18 +56,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h"
 #include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
-#include "tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h"
-#include "tensorflow/compiler/xla/service/gpu/pad_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
+#include "tensorflow/compiler/xla/service/gpu/variadic_op_splitter.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
@@ -75,7 +77,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
-#include "tensorflow/compiler/xla/service/scatter_expander.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -128,6 +129,7 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) {
             << potential_libdevice_dir;
   }
 
+  LOG(WARNING) << "Unable to find libdevice dir. Using '.'";
   // Last resort: maybe in the current folder.
   return ".";
 }
@@ -172,15 +174,16 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
           /*rewrite_inference_op=*/true,
           /*rewrite_grad_op=*/true);
 
+      pipeline.AddPass<HloGetDimensionSizeRewriter>();
+
       // BatchNormExpander can create zero-sized ops, so zero-sized HLO
       // elimination has to come after that pass.
       pipeline.AddPass<ZeroSizedHloElimination>();
 
-      pipeline.AddPass<ScatterExpander>();
-
-      pass.AddPass<AlgebraicSimplifier>(
-          /*is_layout_sensitive=*/false,
+      AlgebraicSimplifierOptions options(
           [](const Shape&, const Shape&) { return false; });
+      options.set_enable_permutation_sort_replacement(true);
+      pass.AddPass<AlgebraicSimplifier>(options);
       pass.AddPass<TupleSimplifier>();
       pass.AddPass<WhileLoopConstantSinking>();
       pass.AddPass<WhileLoopSimplifier>();
@@ -204,21 +207,22 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
   {
     // Convert convolutions into CustomCalls to cudnn, then canonicalize them
-    // (PadInsertion).
+    // (CudnnConvPaddingLegalization).
     HloPassPipeline pipeline("conv_canonicalization");
     pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                               /*allow_mixed_precision=*/false);
-    pipeline.AddPass<CudnnConvolutionRewriter>();
-    pipeline.AddPass<CudnnFusedConvolutionRewriter>();
-    pipeline.AddPass<PadInsertion>();
+    pipeline.AddPass<CudnnConvRewriter>();
+    pipeline.AddPass<CudnnFusedConvRewriter>();
+    pipeline.AddPass<CudnnConvPaddingLegalization>();
     if (IsVoltaOrLater(*stream_exec)) {
-      pipeline.AddPass<PadForTensorCores>();
-      // PadForTensorCores leaves behind unnecessary tuple/get-tuple-element
-      // pairs that TupleSimplifier fixes.
+      pipeline.AddPass<CudnnConvPadForTensorCores>();
+      // CudnnConvPadForTensorCores leaves behind unnecessary
+      // tuple/get-tuple-element pairs that TupleSimplifier fixes.
       pipeline.AddPass<TupleSimplifier>();
     }
-    // CudnnConvolutionRewriter, PadInsertion and PadForTensorCores may add
-    // instructions which can be simplified by constant folding.
+    // CudnnConvRewriter, CudnnConvPaddingLegalization and
+    // CudnnConvPadForTensorCores may add instructions which can be simplified
+    // by constant folding.
     pipeline.AddPass<HloConstantFolding>();
     TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
@@ -239,21 +243,27 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
   {
     HloPassPipeline pipeline("post-layout_assignment");
-    pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
-                                              /*allow_mixed_precision=*/false);
+    /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after
+     * fixing the ticket. */
+    pipeline.AddInvariantChecker<HloVerifier>(
+        /*layout_sensitive=*/true,
+        /*allow_mixed_precision=*/false,
+        LayoutAssignment::InstructionCanChangeLayout);
 
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
-        /*is_layout_sensitive=*/true,
+    AlgebraicSimplifierOptions options(
         /*valid_bitcast_callback=*/[](const Shape&, const Shape&) {
           return true;
         });
+    options.set_is_layout_sensitive(true);
+    options.set_enable_permutation_sort_replacement(true);
+    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
 
     // Choose the fastest algorithm for each conv.
     //
     // We pick the algorithm before fusion so we can generate better HLO. After
-    // CudnnConvolutionRewriter, our convolutions are CustomCalls which return a
+    // CudnnConvRewriter, our convolutions are CustomCalls which return a
     // tuple (conv_result, scratch_memory), and the each conv uses 0 bytes of
     // scratch:
     //
@@ -271,12 +281,12 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     // The new tuple and gte instructions then be simplified away, because
     // nobody is expected to use the scratch value.
     //
-    // However, if we were to run CudnnConvolutionAlgorithmPicker after fusion
+    // However, if we were to run CudnnConvAlgorithmPicker after fusion
     // the gte(customcall, 0) would probably already be into a fusion node.  We
     // can't simplify across HloComputation boundaries, so in this case we
     // wouldn't be able to simplify away the new_tuple bits.
-    pipeline.AddPass<CudnnConvolutionAlgorithmPicker>(
-        stream_exec, device_allocator, compiler);
+    pipeline.AddPass<CudnnConvAlgorithmPicker>(stream_exec, device_allocator,
+                                               compiler);
     // Clean up new_tuple described above.
     pipeline.AddPass<TupleSimplifier>();
 
@@ -286,8 +296,15 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
   {
     HloPassFix<HloPassPipeline> fusion("fusion");
-    fusion.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
-                                            /*allow_mixed_precision=*/false);
+    // We try to split variadic ops with many parameters into several such ops
+    // to avoid exceeding the parameter space.
+    fusion.AddPass<VariadicOpSplitter>();
+    /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after
+     * fixing the ticket. */
+    fusion.AddInvariantChecker<HloVerifier>(
+        /*layout_sensitive=*/true,
+        /*allow_mixed_precision=*/false,
+        LayoutAssignment::InstructionCanChangeLayout);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true);
     fusion.AddPass<FusionMerger>();
@@ -298,8 +315,11 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
 
     HloPassPipeline reduce_pipeline("reduce-precision");
+    /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after
+     * fixing the ticket. */
     reduce_pipeline.AddInvariantChecker<HloVerifier>(
-        /*is_layout_sensitive=*/true, /*allow_mixed_precision=*/false);
+        /*is_layout_sensitive=*/true, /*allow_mixed_precision=*/false,
+        LayoutAssignment::InstructionCanChangeLayout);
     ReducePrecisionInsertion::AddPasses(
         &reduce_pipeline, hlo_module->config().debug_options(),
         ReducePrecisionInsertion::PassTiming::AFTER_FUSION);
@@ -325,8 +345,12 @@ Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   // (b/27180329). Therefore, in that case, we set the output to be a copy of
   // the parameter.
   HloPassPipeline pipeline("GPU-ir-emit-prepare");
-  pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/true,
-                                            /*allow_mixed_precision=*/false);
+  /* TODO(b/117531509): Use LayoutAssignment::InstructionCanChangeLayout after
+   * fixing the ticket. */
+  pipeline.AddInvariantChecker<HloVerifier>(
+      /*layout_sensitive=*/true,
+      /*allow_mixed_precision=*/false,
+      LayoutAssignment::InstructionCanChangeLayout);
 
   // Copy insertion should be performed immediately before IR emission to avoid
   // inserting unnecessary copies (later pass adds an instruction which
@@ -401,7 +425,7 @@ void WarnIfBadPtxasVersion(const string& ptxas_path) {
            "prefers >= 9.2.88).  Compilation of XLA kernels below will likely "
            "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas "
            "binary is sufficient.";
-  } else if ((vmaj < 9 || vmin < 2 || vdot < 88)) {
+  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
     LOG(WARNING)
         << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
         << vdot
@@ -455,13 +479,15 @@ void WarnIfBadDriverJITVersion() {
 // Compiles the given PTX string using ptxas and returns the resulting machine
 // code (i.e. a cubin) as a byte array.
 StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
-                                        int cc_minor) {
+                                        int cc_minor,
+                                        bool disable_ptx_optimizations) {
   tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true);
   const string ptxas_path =
       tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas");
-  VLOG(2) << "Using ptxas at " << ptxas_path;
+  VLOG(2) << "Checking ptxas at " << ptxas_path;
   auto env = tensorflow::Env::Default();
   TF_RETURN_IF_ERROR(env->FileExists(ptxas_path));
+  VLOG(2) << "Using ptxas at " << ptxas_path;
 
   WarnIfBadPtxasVersion(ptxas_path);
 
@@ -494,6 +520,9 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
   if (VLOG_IS_ON(2)) {
     ptxas_args.push_back("-v");
   }
+  if (disable_ptx_optimizations) {
+    ptxas_args.push_back("-O0");
+  }
   ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
   ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                      tensorflow::ACTION_PIPE);
@@ -527,14 +556,17 @@ StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* device_allocator) {
   // We dump the post-optimization HLO in RunBackend so no need to dump it here.
-  VLOG(2) << "*** HLO Before Optimization";
-  XLA_VLOG_LINES(2, module->ToString());
+  VLOG(3) << "*** HLO Before Optimization";
+  XLA_VLOG_LINES(3, module->ToString());
 
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses");
   tracing::ScopedActivity activity("HLO Transforms", module->name(),
                                    /*is_expensive=*/true);
   TF_RETURN_IF_ERROR(
       OptimizeHloModule(module.get(), stream_exec, device_allocator, this));
+
+  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get()));
+
   return std::move(module);
 }
 
@@ -545,8 +577,6 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
 
   TF_RET_CHECK(stream_exec != nullptr);
 
-  TF_RETURN_IF_ERROR(PrepareHloModuleForIrEmitting(module.get()));
-
   llvm::LLVMContext llvm_context;
   std::string buffer;
   llvm::raw_string_ostream error(buffer);
@@ -586,8 +616,8 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   // include headers, so no need for us to print them ourselves.
   XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
   XLA_VLOG_LINES(2, buffer_assignment->ToString());
-  VLOG(2) << "*** HLO After Optimization";
-  XLA_VLOG_LINES(2, module->ToString());
+  VLOG(3) << "*** HLO After Optimization";
+  XLA_VLOG_LINES(3, module->ToString());
   const string xla_dump_optimized_hlo_proto_to =
       module->config().debug_options().xla_dump_optimized_hlo_proto_to();
   if (!xla_dump_optimized_hlo_proto_to.empty()) {
@@ -617,10 +647,10 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   string ir_module_string_before_opt;
   const bool embed_ir_in_executable =
       module->config().debug_options().xla_embed_ir_in_executable();
-  if (VLOG_IS_ON(2) || embed_ir_in_executable) {
+  if (VLOG_IS_ON(3) || embed_ir_in_executable) {
     ir_module_string_before_opt = llvm_ir::DumpModuleToString(llvm_module);
-    VLOG(2) << "LLVM module before optimizations:";
-    XLA_VLOG_LINES(2, ir_module_string_before_opt);
+    VLOG(3) << "LLVM module before optimizations:";
+    XLA_VLOG_LINES(3, ir_module_string_before_opt);
   }
 
   const string& ir_dump_directory =
@@ -664,6 +694,8 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     }
     libdevice_dir = cached_libdevice_dir_;
   }
+  VLOG(2) << "Libdevice dir = " << libdevice_dir << "\n";
+
   int cc_major, cc_minor;
   if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
                                                                    &cc_minor)) {
@@ -690,10 +722,10 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   if (user_post_optimization_hook_) {
     TF_CHECK_OK(user_post_optimization_hook_(llvm_module));
   }
-  VLOG(2) << "LLVM module after optimizations:";
-  XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
-  VLOG(2) << "PTX:";
-  XLA_VLOG_LINES(2, ptx);
+  VLOG(3) << "LLVM module after optimizations:";
+  XLA_VLOG_LINES(3, llvm_ir::DumpModuleToString(llvm_module));
+  VLOG(3) << "PTX:";
+  XLA_VLOG_LINES(3, ptx);
 
   // Write PTX to IR dump directory, if IR dumping was requested.
   if (!ir_dump_directory.empty()) {
@@ -711,14 +743,15 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     }
   }
 
-  const std::vector<uint8> cubin =
-      CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor);
+  const std::vector<uint8> cubin = CompilePtxOrGetCachedResult(
+      ptx, cc_major, cc_minor,
+      module->config().debug_options().xla_gpu_disable_ptxas_optimizations());
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
       hlo_schedule->ThunkLaunchOrder());
-  VLOG(2) << "Printing the thunk schedule...";
-  XLA_VLOG_LINES(2, thunk_schedule->ToString());
+  VLOG(3) << "Printing the thunk schedule...";
+  XLA_VLOG_LINES(3, thunk_schedule->ToString());
 
   std::unique_ptr<HloProfileIndexMap> profile_index_map;
   std::unique_ptr<HloProfilePrinterData> profile_printer;
@@ -729,8 +762,8 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
         stream_exec->GetDeviceDescription().memory_bandwidth());
     TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     profile_index_map = absl::make_unique<HloProfileIndexMap>(*module);
-    profile_printer =
-        CreateHloProfilePrinterData(*profile_index_map, cost_analysis);
+    profile_printer = CreateHloProfilePrinterData(
+        *profile_index_map, cost_analysis, entry_computation->name());
   }
 
   auto* gpu_executable = new GpuExecutable(
@@ -744,9 +777,9 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   return std::unique_ptr<Executable>(gpu_executable);
 }
 
-std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
-                                                              int cc_major,
-                                                              int cc_minor) {
+std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
+    const string& ptx, int cc_major, int cc_minor,
+    bool disable_ptx_optimizations) {
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompilePtxOrGetCachedResult");
   tracing::ScopedActivity activity("PTX->CUBIN", /*is_expensive=*/true);
   bool inserted;
@@ -774,8 +807,8 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
     if (inserted) {
       CHECK(!cache_value->compilation_done);
       if (!ptx.empty()) {
-        StatusOr<std::vector<uint8>> maybe_cubin =
-            CompilePtx(*cache_ptx, cc_major, cc_minor);
+        StatusOr<std::vector<uint8>> maybe_cubin = CompilePtx(
+            *cache_ptx, cc_major, cc_minor, disable_ptx_optimizations);
         if (maybe_cubin.ok()) {
           cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie();
           VLOG(2) << "Compiled PTX size:" << ptx.size()
@@ -788,7 +821,7 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
             // binaries are not available. We don't want to spam logs with
             // identical warnings in this case.
 
-            // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N
+            // TODO(jlebar): we should implement a LOG_FIRST_N and LOG_EVERY_N
             // for more general usage.
             static std::atomic<bool> warning_done(false);
             log_warning = !warning_done.exchange(true);
@@ -820,9 +853,8 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
 }
 
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-NVPTXCompiler::CompileAheadOfTime(
-    std::vector<std::unique_ptr<HloModule>> module,
-    const AotCompilationOptions& options) {
+NVPTXCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
+                                  const AotCompilationOptions& options) {
   return Unimplemented(
       "not yet implemented: NVPTXCompiler::CompileAheadOfTime");
 }
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
index c4a0b727cd3d9ae0af61c1752c1608cd4fb65d2d..be5e31a50112686841e6f18b76f382a56e61bafc 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
@@ -59,7 +59,7 @@ class NVPTXCompiler : public LLVMCompiler {
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
+  CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                      AotCompilationOptions const& options) override;
 
   se::Platform::Id PlatformId() const override;
@@ -97,8 +97,9 @@ class NVPTXCompiler : public LLVMCompiler {
 
   // Tries to compile the given ptx string to cubin.  Returns a vector with the
   // compiled cubin.  If compilation was unsuccessful, returns an empty vector.
-  std::vector<uint8> CompilePtxOrGetCachedResult(const string& ptx,
-                                                 int cc_major, int cc_minor);
+  std::vector<uint8> CompilePtxOrGetCachedResult(
+      const string& ptx, int cc_major, int cc_minor,
+      bool disable_ptx_optimizations);
 
   // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor}
   // -> cubin so we don't recompile the same ptx twice.  This is important for
diff --git a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc
deleted file mode 100644
index e3869b5c368957571219a39600214140022a7318..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.cc
+++ /dev/null
@@ -1,253 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h"
-
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/window_util.h"
-
-namespace xla {
-namespace gpu {
-
-// We want the input/output feature counts of an f16 conv to be factors of 8,
-// because without this cudnn can't use tensor cores on the conv.
-static constexpr int64 kDesiredNumFeaturesFactor = 8;
-
-// We won't pad a conv if doing so increases the total number of bytes in the
-// lhs, rhs, or result by more than this amount.
-//
-// TODO(jlebar): This number was tuned experimentally.  It represents a
-// compromise on our current benchmarks; it speeds some up significantly, and
-// doesn't slow any down.  But we can observe by changing this value that
-// there's additional room for speedups.  Achieving those speedups without also
-// slowing other things down will likely require a more sophisticated heuristic,
-// possibly some form of auto-tuning.
-//
-// This value should be >= 4/3, otherwise the "dims of size 3 padded up to 4"
-// special case inside PadShape won't fire.
-static constexpr double kMaxBytesTouchedIncrease = 1.35;
-
-// Pads the given dimensions in the given shape up to a multiple of
-// kDesiredNumFeaturesFactor.
-static Shape PadShape(Shape s, absl::Span<const int64> dims) {
-  for (int64 dim : dims) {
-    int64 dim_to_pad_size = s.dimensions(dim);
-
-    // Round dim_to_pad_size up to the next multiple of
-    // kDesiredNumFeaturesFactor.
-    //
-    // Special case: dims of size 3 are rounded up to 4, not
-    // kDesiredNumFeaturesFactor.  Empirically (and on the advice of nvidia),
-    // this helps, but as of writing, it's not supported by anything in the
-    // cudnn docs.
-    int64 new_dim_to_pad_size;
-    if (dim_to_pad_size == 3) {
-      new_dim_to_pad_size = 4;
-    } else {
-      new_dim_to_pad_size =
-          RoundUpToNearest(dim_to_pad_size, kDesiredNumFeaturesFactor);
-    }
-
-    s.set_dimensions(dim, new_dim_to_pad_size);
-  }
-  return s;
-}
-
-// Creates and returns an HLO that zero-pads one or more dimensions in the given
-// instruction so that its shape is equal to the given shape.
-//
-// Padding is added to the end of each relevant dimension.
-//
-// If the instruction already has the given shape, simply returns it without an
-// intervening pad.
-static HloInstruction* PadInstruction(HloInstruction* instr,
-                                      const Shape& new_shape) {
-  HloComputation* comp = instr->parent();
-
-  const Shape& shape = instr->shape();
-  auto* zero = comp->AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type())));
-
-  PaddingConfig pad_config = MakeNoPaddingConfig(ShapeUtil::Rank(shape));
-
-  bool added_padding = false;
-  for (int64 dim = 0; dim < ShapeUtil::Rank(shape); ++dim) {
-    if (shape.dimensions(dim) == new_shape.dimensions(dim)) {
-      continue;
-    }
-    CHECK_GT(new_shape.dimensions(dim), shape.dimensions(dim));
-    pad_config.mutable_dimensions(dim)->set_edge_padding_high(
-        new_shape.dimensions(dim) - shape.dimensions(dim));
-    added_padding = true;
-  }
-
-  if (!added_padding) {
-    return instr;
-  }
-  return comp->AddInstruction(
-      HloInstruction::CreatePad(new_shape, instr, zero, pad_config));
-}
-
-// Pads the input/output feature dimensions of the given cudnn convolution
-// custom-call to be multiples of kDesiredNumFeaturesFactor.
-static StatusOr<bool> PadFeaturesDims(HloInstruction* conv) {
-  CHECK_EQ(0, conv->shape().tuple_shapes(1).dimensions(0))
-      << "conv must use 0 scratch bytes, i.e. this pass must be run "
-         "before CudnnConvolutionAlgorithmPicker.";
-
-  const auto& target = conv->custom_call_target();
-  const auto& dnums = conv->convolution_dimension_numbers();
-  auto* lhs = conv->mutable_operand(0);
-  auto* rhs = conv->mutable_operand(1);
-  const Shape& result_shape = conv->shape().tuple_shapes(0);
-
-  Shape new_lhs_shape = [&] {
-    if (target == kCudnnConvForwardCallTarget ||
-        target == kCudnnConvBackwardFilterCallTarget) {
-      // LHS is "input".
-      return PadShape(lhs->shape(), {dnums.input_feature_dimension()});
-    }
-    CHECK_EQ(target, kCudnnConvBackwardInputCallTarget);
-    // LHS is "output".
-    return PadShape(lhs->shape(), {dnums.output_feature_dimension()});
-  }();
-
-  Shape new_rhs_shape = [&] {
-    if (target == kCudnnConvForwardCallTarget ||
-        target == kCudnnConvBackwardInputCallTarget) {
-      // RHS is "filter".
-      return PadShape(rhs->shape(), {dnums.kernel_input_feature_dimension(),
-                                     dnums.kernel_output_feature_dimension()});
-    }
-    CHECK_EQ(target, kCudnnConvBackwardFilterCallTarget);
-    // RHS is "output".
-    return PadShape(rhs->shape(), {dnums.output_feature_dimension()});
-  }();
-
-  if (ShapeUtil::Equal(lhs->shape(), new_lhs_shape) &&
-      ShapeUtil::Equal(rhs->shape(), new_rhs_shape)) {
-    VLOG(3) << "No need to pad features of " << conv->ToString();
-    return false;
-  }
-
-  Shape new_result_shape = [&] {
-    if (target == kCudnnConvForwardCallTarget) {
-      // Result is "output".
-      return PadShape(result_shape, {dnums.output_feature_dimension()});
-    }
-    if (target == kCudnnConvBackwardInputCallTarget) {
-      // Result is "input".
-      return PadShape(result_shape, {dnums.input_feature_dimension()});
-    }
-    CHECK_EQ(target, kCudnnConvBackwardFilterCallTarget);
-    // Result is "filter".
-    return PadShape(result_shape, {dnums.kernel_input_feature_dimension(),
-                                   dnums.kernel_output_feature_dimension()});
-  }();
-
-  // Check that padding wouldn't increase the total bytes read/written by this
-  // operation too much.
-  auto check_size_increase = [&](const Shape& old_shape,
-                                 const Shape& new_shape) {
-    int64 old_bytes = ShapeUtil::ByteSizeOf(old_shape);
-    int64 new_bytes = ShapeUtil::ByteSizeOf(new_shape);
-    if (new_bytes <= old_bytes * kMaxBytesTouchedIncrease) {
-      return true;
-    }
-    VLOG(3) << "Not padding convolution; doing so would change input / result "
-               "shape from "
-            << ShapeUtil::HumanString(old_shape) << " to "
-            << ShapeUtil::HumanString(new_shape) << ", a size increase of "
-            << new_bytes / static_cast<double>(old_bytes) << "x > "
-            << kMaxBytesTouchedIncrease << "x: " << conv->ToString();
-    return false;
-  };
-  if (!check_size_increase(lhs->shape(), new_lhs_shape) ||
-      !check_size_increase(rhs->shape(), new_rhs_shape) ||
-      !check_size_increase(result_shape, new_result_shape)) {
-    return false;
-  }
-
-  // OK, let's do the transformation!
-
-  auto* new_lhs = PadInstruction(lhs, new_lhs_shape);
-  auto* new_rhs = PadInstruction(rhs, new_rhs_shape);
-  CHECK(new_lhs != lhs || new_rhs != rhs)
-      << "We should have had to pad either LHS or RHS.";
-
-  auto add = [&](std::unique_ptr<HloInstruction> new_instr) {
-    return conv->parent()->AddInstruction(std::move(new_instr));
-  };
-
-  Shape new_conv_shape = ShapeUtil::MakeTupleShape(
-      {new_result_shape, ShapeUtil::MakeShape(U8, {0})});
-  auto* new_conv =
-      add(conv->CloneWithNewOperands(new_conv_shape, {new_lhs, new_rhs}));
-
-  // Slice the new conv result if necessary, keeping in mind that new_conv has
-  // tuple shape (new_result_shape, u8[0]).
-  if (!ShapeUtil::Equal(result_shape, new_result_shape)) {
-    std::vector<int64> start_indices(result_shape.dimensions_size(), 0);
-    std::vector<int64> end_indices(result_shape.dimensions().begin(),
-                                   result_shape.dimensions().end());
-    std::vector<int64> strides(result_shape.dimensions_size(), 1);
-
-    auto* new_conv_result = add(
-        HloInstruction::CreateGetTupleElement(new_result_shape, new_conv, 0));
-    auto* empty_temp_buffer =
-        add(HloInstruction::CreateConstant(LiteralUtil::CreateR1<uint8>({})));
-    auto* sliced_result = add(HloInstruction::CreateSlice(
-        result_shape, new_conv_result, start_indices, end_indices, strides));
-    new_conv =
-        add(HloInstruction::CreateTuple({sliced_result, empty_temp_buffer}));
-  }
-
-  VLOG(2) << "Padded features of " << conv->ToString() << ", replaced with "
-          << new_conv->ToString();
-  TF_RETURN_IF_ERROR(conv->parent()->ReplaceInstruction(conv, new_conv));
-  return true;
-}
-
-static std::vector<HloInstruction*> GetRelevantConvs(HloComputation* comp) {
-  std::vector<HloInstruction*> convs;
-  for (HloInstruction* instr : comp->instructions()) {
-    if (IsCustomCallToDnnConvolution(*instr) &&
-        instr->operand(0)->shape().element_type() == F16 &&
-        // TODO(timshen): Disable for fused conv for now. Implement it if it's
-        // needed.
-        Cast<HloCustomCallInstruction>(instr)->custom_call_target() !=
-            kCudnnConvBiasActivationForwardCallTarget) {
-      convs.push_back(instr);
-    }
-  }
-  return convs;
-}
-
-StatusOr<bool> PadForTensorCores::Run(HloModule* module) {
-  bool changed = false;
-  for (HloComputation* comp : module->MakeNonfusionComputations()) {
-    for (HloInstruction* conv : GetRelevantConvs(comp)) {
-      TF_ASSIGN_OR_RETURN(bool result, PadFeaturesDims(conv));
-      changed |= result;
-    }
-  }
-  return changed;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h
deleted file mode 100644
index e592a3774ec28605fda912298c74ca7976ff99ac..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PAD_FOR_TENSOR_CORES_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PAD_FOR_TENSOR_CORES_H_
-
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
-
-namespace xla {
-namespace gpu {
-
-// Ensures that f16 cudnn convolutions have input/output channel dimensions that
-// are multiples of 8, inserting pads/slices as necessary.
-//
-// This is useful primarily for Volta and newer GPUs, where tensor cores can
-// only be used if the channel dims are multiples of 8.  It's probably the
-// opposite of useful on other GPUs, so you should check what GPU you're
-// targeting before running this pass.
-//
-// TODO(jlebar): Also pad dots.
-class PadForTensorCores : public HloModulePass {
- public:
-  absl::string_view name() const override { return "pad for tensor cores"; }
-
-  StatusOr<bool> Run(HloModule* module) override;
-};
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PAD_FOR_TENSOR_CORES_H_
diff --git a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc b/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc
deleted file mode 100644
index 5c92b0dcb873b873074704dca8f27d4067b070df..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores_test.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/pad_for_tensor_cores.h"
-
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
-#include "tensorflow/compiler/xla/service/hlo_parser.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
-#include "tensorflow/compiler/xla/util.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-
-namespace op = xla::testing::opcode_matchers;
-using ::testing::_;
-
-class PadForTensorCoresTest : public HloVerifiedTestBase {};
-
-TEST_F(PadForTensorCoresTest, PadF16ForwardConvInputChannels) {
-  ParseAndVerifyModule(R"(
-  HloModule TestModule
-
-  ENTRY TestComputation {
-    input = f16[10,20,30,41] parameter(0)
-    filter = f16[2,2,41,40] parameter(1)
-    ROOT result = (f16[10,20,30,40], u8[0]) custom-call(input, filter),
-                  window={size=2x2}, dim_labels=b01f_01io->b01f,
-                  custom_call_target="__cudnn$convForward"
-  })");
-  EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie());
-  auto* root = module().entry_computation()->root_instruction();
-
-  SCOPED_TRACE(module().ToString());
-  EXPECT_THAT(root, op::CustomCall(kCudnnConvForwardCallTarget,
-                                   op::Pad(op::Parameter(0), _),
-                                   op::Pad(op::Parameter(1), _)));
-  EXPECT_TRUE(ShapeUtil::Equal(root->operand(0)->shape(),
-                               ShapeUtil::MakeShape(F16, {10, 20, 30, 48})));
-  EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->shape(),
-                               ShapeUtil::MakeShape(F16, {2, 2, 48, 40})));
-}
-
-TEST_F(PadForTensorCoresTest, PadF16BackwardInputConvOutputChannels) {
-  ParseAndVerifyModule(R"(
-  HloModule TestModule
-
-  ENTRY TestComputation {
-    output = f16[10,20,30,41] parameter(0)
-    filter = f16[2,2,40,41] parameter(1)
-    ROOT result = (f16[10,20,30,40], u8[0]) custom-call(output, filter),
-                  window={size=2x2}, dim_labels=b01f_01io->b01f,
-                  custom_call_target="__cudnn$convBackwardInput"
-  })");
-  EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie());
-  auto* root = module().entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::CustomCall(kCudnnConvBackwardInputCallTarget,
-                                   op::Pad(op::Parameter(0), _),
-                                   op::Pad(op::Parameter(1), _)));
-  EXPECT_TRUE(ShapeUtil::Equal(root->operand(0)->shape(),
-                               ShapeUtil::MakeShape(F16, {10, 20, 30, 48})));
-  EXPECT_TRUE(ShapeUtil::Equal(root->operand(1)->shape(),
-                               ShapeUtil::MakeShape(F16, {2, 2, 40, 48})));
-}
-
-TEST_F(PadForTensorCoresTest, PadF16ForwardConvOutputChannels) {
-  ParseAndVerifyModule(R"(
-  HloModule TestModule
-
-  ENTRY TestComputation {
-    input = f16[10,20,30,40] parameter(0)
-    filter = f16[2,2,40,41] parameter(1)
-    ROOT result = (f16[10,20,30,41], u8[0]) custom-call(input, filter),
-                  window={size=2x2}, dim_labels=b01f_01io->b01f,
-                  custom_call_target="__cudnn$convForward"
-  })");
-  EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie());
-  auto* root = module().entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Tuple(op::Slice(op::GetTupleElement(op::CustomCall(
-                                  kCudnnConvForwardCallTarget, op::Parameter(0),
-                                  op::Pad(op::Parameter(1), _)))),
-                              _));
-}
-
-TEST_F(PadForTensorCoresTest, PadF16BackwardInputConvInputChannels) {
-  ParseAndVerifyModule(R"(
-  HloModule TestModule
-
-  ENTRY TestComputation {
-    output = f16[10,20,30,40] parameter(0)
-    filter = f16[2,2,41,40] parameter(1)
-    result = (f16[10,20,30,41], u8[0]) custom-call(output, filter),
-              window={size=2x2}, dim_labels=b01f_01io->b01f,
-              custom_call_target="__cudnn$convBackwardInput"
-    ROOT gte = f16[10,20,30,41] get-tuple-element(result), index=0
-  })");
-  EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie());
-  auto* root = module().entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::GetTupleElement(op::Tuple(
-                        op::Slice(op::GetTupleElement(op::CustomCall(
-                            kCudnnConvBackwardInputCallTarget, op::Parameter(0),
-                            op::Pad(op::Parameter(1), _)))),
-                        _)));
-}
-
-TEST_F(PadForTensorCoresTest, PadF16BackwardFilterConvInputChannels) {
-  ParseAndVerifyModule(R"(
-  HloModule TestModule
-
-  ENTRY TestComputation {
-    input = f16[10,20,30,41] parameter(0)
-    output = f16[10,20,30,40] parameter(1)
-    result = (f16[2,2,41,40], u8[0]) custom-call(input, output),
-              window={size=2x2}, dim_labels=b01f_01io->b01f,
-              custom_call_target="__cudnn$convBackwardFilter"
-    ROOT gte = f16[2,2,41,40] get-tuple-element(result), index=0
-  })");
-  EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie());
-  auto* root = module().entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::GetTupleElement(op::Tuple(
-                        op::Slice(op::GetTupleElement(op::CustomCall(
-                            kCudnnConvBackwardFilterCallTarget,
-                            op::Pad(op::Parameter(0), _), op::Parameter(1)))),
-                        _)));
-}
-
-TEST_F(PadForTensorCoresTest, PadF16BackwardFilterConvOutputChannels) {
-  ParseAndVerifyModule(R"(
-  HloModule TestModule
-
-  ENTRY TestComputation {
-    input = f16[10,20,30,40] parameter(0)
-    output = f16[10,20,30,41] parameter(1)
-    result = (f16[2,2,40,41], u8[0]) custom-call(input, output),
-              window={size=2x2}, dim_labels=b01f_01io->b01f,
-              custom_call_target="__cudnn$convBackwardFilter"
-    ROOT gte = f16[2,2,40,41] get-tuple-element(result), index=0
-  })");
-  EXPECT_TRUE(PadForTensorCores().Run(&module()).ValueOrDie());
-  auto* root = module().entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::GetTupleElement(op::Tuple(
-                        op::Slice(op::GetTupleElement(op::CustomCall(
-                            kCudnnConvBackwardFilterCallTarget,
-                            op::Parameter(0), op::Pad(op::Parameter(1), _)))),
-                        _)));
-}
-
-}  // anonymous namespace
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
deleted file mode 100644
index b42a19e3a2200e917f8040be183b8d79c9e4e161..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ /dev/null
@@ -1,414 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/pad_insertion.h"
-
-#include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
-#include "tensorflow/compiler/xla/service/shape_inference.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/window_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-namespace gpu {
-
-namespace {
-bool IsForwardConvolutionCanonical(const HloInstruction& conv) {
-  CHECK(conv.custom_call_target() == kCudnnConvForwardCallTarget ||
-        conv.custom_call_target() == kCudnnConvBiasActivationForwardCallTarget);
-  return window_util::HasSymmetricPadding(conv.window()) &&
-         !window_util::HasNegativePadding(conv.window()) &&
-         !window_util::HasDilation(conv.window());
-}
-
-// If the (positive and negative) padding on the input operand of a convolution
-// can't be folded into a cuDNN convolution libcall (e.g. uneven padding and
-// dilation), returns kPad and/or kSlice instructions that explicitly apply the
-// padding; otherwise returns the original input operand. When there is both
-// positive padding (including dilation) and negative padding, we insert both
-// kPad and kSlice.
-HloInstruction* MaybePaddedAndSlicedInput(
-    const Window& conv_window, const ConvolutionDimensionNumbers& conv_dnums,
-    HloInstruction* input) {
-  HloComputation* computation = input->parent();
-  if (!window_util::HasSymmetricPadding(conv_window) ||
-      window_util::HasBaseDilation(conv_window)) {
-    // If padding is uneven or has dilation, we insert a kPad instruction that
-    // applies positive padding and dilation.
-    //
-    // TODO(phawkins): If conv_window has asymmetric padding, perhaps instead of
-    // moving all the padding into an explicit pad op, we should keep as much
-    // padding inside of cudnn as possible, on the assumption that padding
-    // within cudnn is basically free, whereas a kPad's cost increases as the
-    // amount of padding increases.
-    PaddingConfig padding_config =
-        MakeNoPaddingConfig(input->shape().dimensions_size());
-    for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
-      int64 dim = conv_dnums.input_spatial_dimensions(i);
-      padding_config.mutable_dimensions(dim)->set_edge_padding_low(
-          std::max<int64>(0LL, conv_window.dimensions(i).padding_low()));
-      padding_config.mutable_dimensions(dim)->set_edge_padding_high(
-          std::max<int64>(0LL, conv_window.dimensions(i).padding_high()));
-      padding_config.mutable_dimensions(dim)->set_interior_padding(
-          conv_window.dimensions(i).base_dilation() - 1);
-    }
-    PrimitiveType element_type = input->shape().element_type();
-    HloInstruction* padding = computation->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::Zero(element_type)));
-    input = MakePadHlo(input, padding, padding_config).ValueOrDie();
-  }
-
-  if (window_util::HasNegativePadding(conv_window)) {
-    // If the window has negative padding, insert a kSlice that explicitly
-    // applies negative padding.
-    //
-    // For each dimension, initialize the start index to 0 and the limit index
-    // to the size of that dimension.
-    std::vector<int64> start_indices(input->shape().dimensions_size(), 0);
-    std::vector<int64> limit_indices(input->shape().dimensions().begin(),
-                                     input->shape().dimensions().end());
-    std::vector<int64> strides(input->shape().dimensions_size(), 1);
-    for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
-      int64 dim = conv_dnums.input_spatial_dimensions(i);
-      // If dimension "dim" has negative padding, increase the start index or
-      // decrement the limit index by the amount of negative padding.
-      start_indices[dim] +=
-          std::max<int64>(0LL, -conv_window.dimensions(i).padding_low());
-      limit_indices[dim] -=
-          std::max<int64>(0LL, -conv_window.dimensions(i).padding_high());
-    }
-
-    input =
-        MakeSliceHlo(input, start_indices, limit_indices, strides).ValueOrDie();
-  }
-
-  return input;
-}
-
-// If the padding on the kernel operand of a convolution can't be folded into a
-// cuDNN convolution libcall (e.g. dilation), returns a kPad instruction that
-// explicitly applies the padding; otherwise returns the original kernel
-// operand.
-HloInstruction* MaybePaddedKernel(const Window& conv_window,
-                                  const ConvolutionDimensionNumbers& conv_dnums,
-                                  HloInstruction* kernel) {
-  if (!window_util::HasWindowDilation(conv_window)) {
-    return kernel;
-  }
-
-  // Compute the shape and padding config of the pad to be inserted.
-  PaddingConfig padding_config;
-  for (size_t i = 0; i < kernel->shape().dimensions_size(); ++i) {
-    padding_config.add_dimensions();
-  }
-  for (size_t i = 0; i < conv_dnums.kernel_spatial_dimensions().size(); ++i) {
-    int64 dim = conv_dnums.kernel_spatial_dimensions(i);
-    padding_config.mutable_dimensions(dim)->set_interior_padding(
-        conv_window.dimensions(i).window_dilation() - 1);
-  }
-
-  HloComputation* computation = kernel->parent();
-  PrimitiveType element_type = kernel->shape().element_type();
-  HloInstruction* padding = computation->AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::Zero(element_type)));
-  return MakePadHlo(kernel, padding, padding_config).ValueOrDie();
-}
-}  // namespace
-
-bool PadInsertion::CanonicalizeForwardConvolution(HloInstruction* conv) {
-  if (IsForwardConvolutionCanonical(*conv)) {
-    return false;
-  }
-
-  // Insert slices and/or pads between the convolution and its input and/or
-  // kernel operand.
-  HloInstruction* new_input = MaybePaddedAndSlicedInput(
-      conv->window(), conv->convolution_dimension_numbers(),
-      conv->mutable_operand(0));
-  HloInstruction* new_kernel =
-      MaybePaddedKernel(conv->window(), conv->convolution_dimension_numbers(),
-                        conv->mutable_operand(1));
-
-  // Remove the padding from convolution's window field. These paddings are
-  // made explicit with the inserted pads.
-  Window new_conv_window = conv->window();
-  for (size_t i = 0; i < new_conv_window.dimensions_size(); ++i) {
-    WindowDimension* dim = new_conv_window.mutable_dimensions(i);
-
-    // The size of the kernel may have changed so update the Window to match.
-    dim->set_size(new_kernel->shape().dimensions(
-        conv->convolution_dimension_numbers().kernel_spatial_dimensions(i)));
-    dim->set_padding_low(0);
-    dim->set_padding_high(0);
-    dim->set_base_dilation(1);
-    dim->set_window_dilation(1);
-  }
-
-  // The conv CustomCall returns a tuple (conv_result, scratch_buffer).  Extract
-  // out the shape of conv_result.
-  VLOG(1) << "Canonicalizing forward conv";
-  std::vector<HloInstruction*> operands(conv->operands().begin(),
-                                        conv->operands().end());
-  operands[0] = new_input;
-  operands[1] = new_kernel;
-  auto new_conv = conv->parent()->AddInstruction(
-      conv->CloneWithNewOperands(conv->shape(), operands));
-  new_conv->set_window(new_conv_window);
-  VLOG(1) << "Replacing:\n  " << conv->ToString() << "\nwith:\n  "
-          << new_conv->ToString();
-  TF_CHECK_OK(conv->parent()->ReplaceInstruction(conv, new_conv));
-  return true;
-}
-
-namespace {
-void IncreasePaddingLowBy(int64 delta, WindowDimension* window_dim) {
-  window_dim->set_padding_low(window_dim->padding_low() + delta);
-}
-
-void IncreasePaddingHighBy(int64 delta, WindowDimension* window_dim) {
-  window_dim->set_padding_high(window_dim->padding_high() + delta);
-}
-}  // namespace
-
-bool PadInsertion::CanonicalizeBackwardFilterConvolution(
-    HloInstruction* backward_conv) {
-  CHECK_EQ(backward_conv->custom_call_target(),
-           kCudnnConvBackwardFilterCallTarget);
-  if (window_util::HasSymmetricPadding(backward_conv->window())) {
-    return false;
-  }
-
-  // A backward filter convolution with uneven padding can be canonicalized to
-  // one with even padding by padding the activations (input) beforehand. For
-  // example,
-  //   BackwardFilterConv(ABCD, xyz, padding_low=1, padding_high=2)
-  // is equivalent to
-  //   ABCD0 = Pad(ABCD, padding_high=1)
-  //   BackwardFilterConv(ABCD0, xyz, padding_low=pading_high=1)
-  // We choose the lesser of padding_low and padding_high as the new padding.
-  HloInstruction* input = backward_conv->mutable_operand(0);
-  Window new_backward_conv_window = backward_conv->window();
-  // input_padding_config is the config of the kPad to be inserted.
-  PaddingConfig input_padding_config =
-      MakeNoPaddingConfig(ShapeUtil::Rank(input->shape()));
-  ConvolutionDimensionNumbers backward_conv_dnums =
-      backward_conv->convolution_dimension_numbers();
-  for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
-    int64 padding_low = backward_conv->window().dimensions(i).padding_low();
-    int64 padding_high = backward_conv->window().dimensions(i).padding_high();
-    if (padding_low < 0 || padding_high < 0) {
-      // TODO(b/32744257): The following canonicalization wouldn't remove
-      // negative padding in a backward convolution, and would therefore cause
-      // cuDNN convolution (which doesn't support negative padding) to fail.
-      return false;
-    }
-    // Compute the new, even padding for the backward conv operation.
-    int64 new_conv_padding = std::min(padding_low, padding_high);
-    int64 dim = backward_conv_dnums.input_spatial_dimensions(i);
-    input_padding_config.mutable_dimensions(dim)->set_edge_padding_low(
-        padding_low - new_conv_padding);
-    input_padding_config.mutable_dimensions(dim)->set_edge_padding_high(
-        padding_high - new_conv_padding);
-
-    // Since we move some padding from the backward convolution to the kPad, we
-    // need to accordingly reduce the padding amount of the backward convolution
-    // and its inner forward convolution.
-    auto* new_dim = new_backward_conv_window.mutable_dimensions(i);
-    new_dim->set_padding_low(new_conv_padding);
-    new_dim->set_padding_high(new_conv_padding);
-  }
-
-  // Create a new backward convolution replacing the old one.
-  HloComputation* computation = backward_conv->parent();
-  HloInstruction* output = backward_conv->mutable_operand(1);
-  HloInstruction* padding =
-      computation->AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::Zero(input->shape().element_type())));
-  HloInstruction* padded_input =
-      MakePadHlo(input, padding, input_padding_config).ValueOrDie();
-
-  // The shape of the backward_conv CustomCall is a tuple (conv_result,
-  // scratch_buffer).  Extract out the shape of conv_result.
-  HloInstruction* new_backward_conv =
-      computation->AddInstruction(backward_conv->CloneWithNewOperands(
-          backward_conv->shape(), {padded_input, output}));
-  new_backward_conv->set_window(new_backward_conv_window);
-
-  VLOG(1) << "Canonicalizing backward filter conv";
-  VLOG(1) << "Replacing:\n  " << backward_conv->ToString() << "\nwith:\n  "
-          << new_backward_conv->ToString();
-
-  TF_CHECK_OK(
-      computation->ReplaceInstruction(backward_conv, new_backward_conv));
-  return true;
-}
-
-bool PadInsertion::CanonicalizeBackwardInputConvolution(
-    HloInstruction* backward_conv) {
-  if (window_util::HasSymmetricPadding(backward_conv->window())) {
-    return false;
-  }
-
-  Window new_backward_conv_window = backward_conv->window();
-  ConvolutionDimensionNumbers backward_conv_dnums =
-      backward_conv->convolution_dimension_numbers();
-
-  // The backward_conv CustomCall returns a tuple (conv_result, scratch_memory).
-  // Get the shape of conv_result.
-  Shape backward_conv_shape = backward_conv->shape().tuple_shapes(0);
-
-  Shape new_backward_conv_shape = backward_conv_shape;
-  for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
-    int64 padding_low = backward_conv->window().dimensions(i).padding_low();
-    int64 padding_high = backward_conv->window().dimensions(i).padding_high();
-    if (padding_low < 0 || padding_high < 0) {
-      // TODO(b/32744257): The following canonicalization wouldn't remove
-      // negative padding in a backward convolution, and would therefore cause
-      // cuDNN convolution (which doesn't support negative padding) to fail.
-      return false;
-    }
-    // If the backward convolution has uneven padding on the activations, we
-    // move some padding on the larger end to "internal" padding, so that the
-    // backward convolution produces larger activations which get sliced later.
-    //
-    // For example, suppose we have a non-canonical HLO
-    //   [A] = BackwardInputConvolve([a b], [x y z], padding=(low=2,high=1))
-    // where the amount of padding low is larger, we can canonicalize it to
-    //   [B A] = BackwardInputConvolve([a b], [x y z], padding=(low=1,high=1))
-    //   [A] = Slice([B A])
-    if (padding_low > padding_high) {
-      IncreasePaddingLowBy(padding_high - padding_low,
-                           new_backward_conv_window.mutable_dimensions(i));
-    } else if (padding_low < padding_high) {
-      IncreasePaddingHighBy(padding_low - padding_high,
-                            new_backward_conv_window.mutable_dimensions(i));
-    }
-    // Decreasing the padding by X *increases* the size of our output by X.
-    int64 dim = backward_conv_dnums.output_spatial_dimensions(i);
-    new_backward_conv_shape.set_dimensions(
-        dim, new_backward_conv_shape.dimensions(dim) +
-                 std::abs(padding_low - padding_high));
-  }
-
-  // Create a new backward convolution replacing the old one.
-  HloComputation* computation = backward_conv->parent();
-  HloInstruction* output = backward_conv->mutable_operand(0);
-  HloInstruction* filter = backward_conv->mutable_operand(1);
-
-  HloInstruction* new_backward_conv_call =
-      computation->AddInstruction(backward_conv->CloneWithNewOperands(
-          ShapeUtil::MakeTupleShape(
-              {new_backward_conv_shape, ShapeUtil::MakeShape(U8, {0})}),
-          {output, filter}));
-  new_backward_conv_call->set_window(new_backward_conv_window);
-
-  // The CustomCall created above returns a tuple (conv_result, scratch_memory).
-  // Extract out the two elements.
-  HloInstruction* new_backward_conv =
-      computation->AddInstruction(HloInstruction::CreateGetTupleElement(
-          new_backward_conv_shape, new_backward_conv_call, 0));
-  HloInstruction* new_backward_conv_scratch =
-      computation->AddInstruction(HloInstruction::CreateGetTupleElement(
-          new_backward_conv_call->shape().tuple_shapes(1),
-          new_backward_conv_call, 1));
-
-  // Slice the new backward convolution.
-  //
-  // Initialize start_indices and limit_indices as no slicing.
-  std::vector<int64> start_indices(new_backward_conv->shape().dimensions_size(),
-                                   0LL);
-  std::vector<int64> limit_indices(
-      new_backward_conv->shape().dimensions().begin(),
-      new_backward_conv->shape().dimensions().end());
-  std::vector<int64> strides(new_backward_conv->shape().dimensions_size(), 1LL);
-  for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
-    int64 padding_low = backward_conv->window().dimensions(i).padding_low();
-    int64 padding_high = backward_conv->window().dimensions(i).padding_high();
-    int64 dim = backward_conv_dnums.output_spatial_dimensions(i);
-    if (padding_low > padding_high) {
-      // If the amount of low padding (of the old backward convolution) is
-      // larger, we internally pad the low end of the activations and slice
-      // internal padding out here.
-      start_indices[dim] += padding_low - padding_high;
-    } else if (padding_low < padding_high) {
-      // If the amount of high padding is larger, we slice out the internal
-      // padding on the high end.
-      limit_indices[dim] -= padding_high - padding_low;
-    }
-  }
-
-  // Replace the old backward convolution with the slice.
-  Shape slice_shape =
-      ShapeInference::InferSliceShape(new_backward_conv->shape(), start_indices,
-                                      limit_indices, strides)
-          .ConsumeValueOrDie();
-  CHECK(ShapeUtil::Compatible(slice_shape, backward_conv_shape))
-      << ShapeUtil::HumanString(slice_shape) << " vs "
-      << ShapeUtil::HumanString(backward_conv_shape);
-
-  HloInstruction* slice = computation->AddInstruction(
-      HloInstruction::CreateSlice(backward_conv_shape, new_backward_conv,
-                                  start_indices, limit_indices, strides));
-  HloInstruction* new_tuple = computation->AddInstruction(
-      HloInstruction::CreateTuple({slice, new_backward_conv_scratch}));
-
-  VLOG(1) << "Canonicalizing backward input conv";
-  VLOG(1) << "Replacing:\n  " << backward_conv->ToString() << "\nwith:\n  "
-          << new_tuple->ToString();
-
-  TF_CHECK_OK(computation->ReplaceInstruction(backward_conv, new_tuple));
-  return true;
-}
-
-StatusOr<bool> PadInsertion::RunOnComputation(HloComputation* computation) {
-  bool changed = false;
-  std::vector<HloInstruction*> convs;
-  for (auto* instr : computation->instructions()) {
-    if (IsCustomCallToDnnConvolution(*instr)) {
-      convs.push_back(instr);
-    }
-  }
-  for (HloInstruction* instruction : convs) {
-    const auto& target = instruction->custom_call_target();
-    if (target == kCudnnConvForwardCallTarget ||
-        target == kCudnnConvBiasActivationForwardCallTarget) {
-      changed |= CanonicalizeForwardConvolution(instruction);
-    } else if (target == kCudnnConvBackwardFilterCallTarget) {
-      changed |= CanonicalizeBackwardFilterConvolution(instruction);
-    } else if (target == kCudnnConvBackwardInputCallTarget) {
-      changed |= CanonicalizeBackwardInputConvolution(instruction);
-    } else {
-      LOG(FATAL) << "Unknown custom call target for cudnn conv: "
-                 << instruction->ToString();
-    }
-  }
-  return changed;
-}
-
-StatusOr<bool> PadInsertion::Run(HloModule* module) {
-  bool changed = false;
-  for (HloComputation* computation : module->MakeNonfusionComputations()) {
-    TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
-    changed |= result;
-  }
-  return changed;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.h b/tensorflow/compiler/xla/service/gpu/pad_insertion.h
deleted file mode 100644
index 25cdf64c4cf01300869044d3e4d7c34c85626a5a..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PAD_INSERTION_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PAD_INSERTION_H_
-
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
-
-namespace xla {
-namespace gpu {
-
-// An HLO pass that canonicalizes convolution instructions for GPU codegen. It
-// inserts Pad instructions before Convolution instructions with uncanonicalized
-// padding, so that they can be lowered to cuDNN convolution.
-class PadInsertion : public HloModulePass {
- public:
-  absl::string_view name() const override { return "pad insertion"; }
-
-  StatusOr<bool> Run(HloModule* module) override;
-
- private:
-  StatusOr<bool> RunOnComputation(HloComputation* computation);
-  // Returns if any changes are made to the parent computation.
-  bool CanonicalizeForwardConvolution(HloInstruction* conv);
-  bool CanonicalizeBackwardFilterConvolution(HloInstruction* backward_conv);
-  bool CanonicalizeBackwardInputConvolution(HloInstruction* backward_conv);
-};
-
-}  // namespace gpu
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_PAD_INSERTION_H_
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
index 5b6cf2c04d05378a363232e33a6df6432cd6848e..4775baf44aecfe6adaf2bf0d2791595436635b16 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
@@ -122,7 +122,7 @@ std::unique_ptr<StreamAssignment> AssignStreams(const HloModule& module) {
   auto stream_assignment = absl::make_unique<StreamAssignment>();
   const HloComputation& computation = *module.entry_computation();
   std::unique_ptr<HloReachabilityMap> reachability =
-      computation.ComputeReachability();
+      HloReachabilityMap::Build(&computation);
   std::vector<const HloInstruction*> seen_gemms;
   // The execution of different RNG Hlo instructions in the same module updates
   // a common global variable. To avoid a race condition, we simply assign all
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
index c4f43cc9a614283acb376b5f98e4976615b590ad..31a5d7a8c04e9863830e2026fc73cd7ded8c322e 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
@@ -21,16 +21,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 
 namespace xla {
 namespace gpu {
 
-class StreamAssignmentTest : public HloVerifiedTestBase {
+class StreamAssignmentTest : public HloTestBase {
  protected:
-  std::unique_ptr<HloModule> CreateNewModule() {
+  std::unique_ptr<HloModule> CreateNewVerifiedModule() {
     HloModuleConfig config;
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
@@ -55,7 +55,7 @@ TEST_F(StreamAssignmentTest, SequentialMatMul) {
   HloInstruction* dot2 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(dot2));
 
   std::unique_ptr<StreamAssignment> assignment = AssignStreams(*module);
@@ -76,7 +76,7 @@ TEST_F(StreamAssignmentTest, ConcurrentMatMul) {
   HloInstruction* add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(add));
 
   std::unique_ptr<StreamAssignment> assignment = AssignStreams(*module);
@@ -120,7 +120,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) {
   HloInstruction* d40 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(d40));
 
   std::unique_ptr<StreamAssignment> assignment = AssignStreams(*module);
diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD
index a7255335672a3622d122e9fc5ebfab236a5ba895..d798b31643782eb25bba08227e29903ec0e7a597 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD
@@ -37,7 +37,7 @@ cc_library(
     hdrs = ["gpu_codegen_test.h"],
     tags = tf_cuda_tests_tags(),
     deps = [
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla/service:gpu_plugin",
         "//tensorflow/compiler/xla/service/gpu:gpu_executable",
         "//tensorflow/compiler/xla/tests:filecheck",
@@ -211,15 +211,13 @@ tf_cc_test(
 )
 
 tf_cc_test(
-    name = "cudnn_fused_convolution_rewriter_test",
-    srcs = ["cudnn_fused_convolution_rewriter_test.cc"],
+    name = "gpu_atomic_test",
+    srcs = ["gpu_atomic_test.cc"],
     tags = tf_cuda_tests_tags(),
     deps = [
         ":gpu_codegen_test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:filecheck",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/gpu/tests/cudnn_fused_convolution_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/tests/cudnn_fused_convolution_rewriter_test.cc
deleted file mode 100644
index 5632cac1862e21825888d94ab1eee5e1c9fd6800..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/tests/cudnn_fused_convolution_rewriter_test.cc
+++ /dev/null
@@ -1,283 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "absl/strings/str_replace.h"
-#include "tensorflow/compiler/xla/service/hlo_parser.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace xla {
-namespace gpu {
-namespace {
-
-class CudnnFusedConvolutionRewriterTest : public HloTestBase {
- protected:
-  string GetOptimizedHlo(absl::string_view hlo_string) {
-    return backend()
-        .compiler()
-        ->RunHloPasses(ParseHloString(hlo_string, GetModuleConfigForTest())
-                           .ConsumeValueOrDie(),
-                       backend().default_stream_executor(),
-                       backend().memory_allocator())
-        .ConsumeValueOrDie()
-        ->ToString();
-  }
-
-  void TestMatchWithAllTypes(absl::string_view hlo_string) {
-    for (absl::string_view type : {"f16", "f32", "f64"}) {
-      const string hlo_with_new_type =
-          absl::StrReplaceAll(hlo_string, {{"TYPE", type}});
-      const string optimized_hlo_string = GetOptimizedHlo(hlo_with_new_type);
-      EXPECT_EQ(absl::string_view::npos,
-                optimized_hlo_string.find("__cudnn$convForward"))
-          << optimized_hlo_string;
-      EXPECT_NE(absl::string_view::npos,
-                optimized_hlo_string.find("__cudnn$convBiasActivationForward"))
-          << optimized_hlo_string;
-      EXPECT_TRUE(RunAndCompare(hlo_with_new_type, ErrorSpec{0.01}))
-          << optimized_hlo_string;
-    }
-  }
-
-  void TestNotMatchWithAllTypes(absl::string_view hlo_string) {
-    for (absl::string_view type : {"f16", "f32", "f64"}) {
-      const string hlo_with_new_type =
-          absl::StrReplaceAll(hlo_string, {{"TYPE", type}});
-      string optimized_hlo = GetOptimizedHlo(hlo_with_new_type);
-      EXPECT_NE(absl::string_view::npos,
-                optimized_hlo.find("__cudnn$convForward"))
-          << optimized_hlo;
-      EXPECT_EQ(absl::string_view::npos,
-                optimized_hlo.find("__cudnn$convBiasActivationForward"))
-          << optimized_hlo;
-    }
-  }
-};
-
-TEST_F(CudnnFusedConvolutionRewriterTest, TestConvOnly) {
-  // max(0, conv(x, w));
-  TestMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      zero = TYPE[] constant(0)
-      zeros = TYPE[1,32,9,9] broadcast(zero), dimensions={}
-
-      input = TYPE[1,17,9,9] parameter(0)
-      filter = TYPE[3,3,17,32] parameter(1)
-
-      conv = TYPE[1,32,9,9] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1
-      ROOT relu = TYPE[1,32,9,9] maximum(zeros, conv)
-    })");
-}
-
-TEST_F(CudnnFusedConvolutionRewriterTest, TestBias) {
-  // max(0, conv(x, w) + bias);
-  TestMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      zero = TYPE[] constant(0)
-      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
-
-      input = TYPE[1,3,3,64] parameter(0)
-      filter = TYPE[3,3,64,64] parameter(1)
-      bias = TYPE[64] parameter(2)
-
-      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
-      broadcasted_bias = TYPE[1,3,3,64] broadcast(bias), dimensions={3}
-      add1 = TYPE[1,3,3,64] add(conv, broadcasted_bias)
-      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add1)
-    })");
-}
-
-TEST_F(CudnnFusedConvolutionRewriterTest, TestSideInputOnly) {
-  // max(0, conv(x, w) + side_input);
-  TestMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      zero = TYPE[] constant(0)
-      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
-
-      input = TYPE[1,3,3,64] parameter(0)
-      filter = TYPE[3,3,64,64] parameter(1)
-      side_input = TYPE[1,3,3,64] parameter(2)
-
-      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
-      add1 = TYPE[1,3,3,64] add(conv, side_input)
-      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add1)
-    })");
-}
-
-TEST_F(CudnnFusedConvolutionRewriterTest, TestBiasAndSideInput) {
-  // max(0, conv(x, w) + side_input + bias);
-  TestMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      zero = TYPE[] constant(0)
-      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
-
-      input = TYPE[1,3,3,64] parameter(0)
-      filter = TYPE[3,3,64,64] parameter(1)
-      side_input = TYPE[1,3,3,64] parameter(2)
-      bias = TYPE[64] parameter(3)
-
-      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
-      broadcasted_bias = TYPE[1,3,3,64] broadcast(bias), dimensions={3}
-      add1 = TYPE[1,3,3,64] add(conv, broadcasted_bias)
-      add2 = TYPE[1,3,3,64] add(add1, side_input)
-      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add2)
-    })");
-}
-
-TEST_F(CudnnFusedConvolutionRewriterTest, TestScaledConv) {
-  // max(0, 0.999994934 * conv(x, w));
-  TestMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      zero = TYPE[] constant(0)
-      zeros = TYPE[1,32,9,9] broadcast(zero), dimensions={}
-      alpha_conv_scalar = TYPE[] constant(0.999994934)
-
-      input = TYPE[1,17,9,9] parameter(0)
-      filter = TYPE[3,3,17,32] parameter(1)
-
-      conv = TYPE[1,32,9,9] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1
-      alpha_conv = TYPE[1,32,9,9] broadcast(alpha_conv_scalar), dimensions={}
-      scaled_conv = TYPE[1,32,9,9] multiply(conv, alpha_conv)
-      ROOT relu = TYPE[1,32,9,9] maximum(zeros, scaled_conv)
-    })");
-}
-
-TEST_F(CudnnFusedConvolutionRewriterTest, TestScaledConvAndSideInput) {
-  // max(0, conv(x, w) + 0.899994934 * side_input);
-  TestMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      zero = TYPE[] constant(0)
-      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
-      alpha_side_input_scalar = TYPE[] constant(0.899994934)
-      alpha_side_input = TYPE[1,3,3,64] broadcast(alpha_side_input_scalar), dimensions={}
-
-      input = TYPE[1,3,3,64] parameter(0)
-      filter = TYPE[3,3,64,64] parameter(1)
-      side_input = TYPE[1,3,3,64] parameter(2)
-
-      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
-      scaled_side_input = TYPE[1,3,3,64] multiply(side_input, alpha_side_input)
-      add1 = TYPE[1,3,3,64] add(conv, scaled_side_input)
-      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add1)
-    })");
-}
-
-TEST_F(CudnnFusedConvolutionRewriterTest, TestScaledConvAndScaledSideInput) {
-  // max(0, 0.999994934 * conv(x, w) + 0.899994934 * side_input);
-  TestMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      zero = TYPE[] constant(0)
-      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
-      alpha_conv_scalar = TYPE[] constant(0.999994934)
-      alpha_conv = TYPE[1,3,3,64] broadcast(alpha_conv_scalar), dimensions={}
-      alpha_side_input_scalar = TYPE[] constant(0.899994934)
-      alpha_side_input = TYPE[1,3,3,64] broadcast(alpha_side_input_scalar), dimensions={}
-
-      input = TYPE[1,3,3,64] parameter(0)
-      filter = TYPE[3,3,64,64] parameter(1)
-      side_input = TYPE[1,3,3,64] parameter(2)
-
-      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
-      scaled_conv = TYPE[1,3,3,64] multiply(conv, alpha_conv)
-      scaled_side_input = TYPE[1,3,3,64] multiply(side_input, alpha_side_input)
-      add1 = TYPE[1,3,3,64] add(scaled_conv, scaled_side_input)
-      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add1)
-    })");
-}
-
-TEST_F(CudnnFusedConvolutionRewriterTest,
-       TestScaledConvAndScaledSideInputWithBias) {
-  // max(0, 0.999994934 * conv(x, w) + 0.899994934 * side_input + bias);
-  TestMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      zero = TYPE[] constant(0)
-      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
-      alpha_conv_scalar = TYPE[] constant(0.999994934)
-      alpha_conv = TYPE[1,3,3,64] broadcast(alpha_conv_scalar), dimensions={}
-      alpha_side_input_scalar = TYPE[] constant(0.899994934)
-      alpha_side_input = TYPE[1,3,3,64] broadcast(alpha_side_input_scalar), dimensions={}
-
-      input = TYPE[1,3,3,64] parameter(0)
-      filter = TYPE[3,3,64,64] parameter(1)
-      side_input = TYPE[1,3,3,64] parameter(2)
-      bias = TYPE[64] parameter(3)
-
-      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
-      scaled_conv = TYPE[1,3,3,64] multiply(conv, alpha_conv)
-      scaled_side_input = TYPE[1,3,3,64] multiply(side_input, alpha_side_input)
-      broadcasted_bias = TYPE[1,3,3,64] broadcast(bias), dimensions={3}
-      add1 = TYPE[1,3,3,64] add(scaled_conv, broadcasted_bias)
-      add2 = TYPE[1,3,3,64] add(add1, scaled_side_input)
-      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add2)
-    })");
-}
-
-TEST_F(CudnnFusedConvolutionRewriterTest, TestMatchMaxZeroOnly) {
-  // max(0.1, conv(x, w)) shouldn't match.
-  TestNotMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      point_one = TYPE[] constant(0.1)
-      point_ones = TYPE[1,32,9,9] broadcast(point_one), dimensions={}
-
-      input = TYPE[1,17,9,9] parameter(0)
-      filter = TYPE[3,3,17,32] parameter(1)
-
-      conv = TYPE[1,32,9,9] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=bf01_01io->bf01, feature_group_count=1
-      ROOT relu = TYPE[1,32,9,9] maximum(point_ones, conv)
-    })");
-}
-
-TEST_F(CudnnFusedConvolutionRewriterTest, TestMatchBroadcastedBiasOnly) {
-  // max(0, conv(x, w) + side_input1 + side_input2) shouldn't match.
-  TestNotMatchWithAllTypes(R"(
-    HloModule Test
-
-    ENTRY Test {
-      zero = TYPE[] constant(0)
-      zeros = TYPE[1,3,3,64] broadcast(zero), dimensions={}
-
-      input = TYPE[1,3,3,64] parameter(0)
-      filter = TYPE[3,3,64,64] parameter(1)
-      side_input1 = TYPE[1,3,3,64] parameter(2)
-      side_input2 = TYPE[1,3,3,64] parameter(3)
-
-      conv = TYPE[1,3,3,64] convolution(input, filter), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f, feature_group_count=1
-      add1 = TYPE[1,3,3,64] add(conv, side_input2)
-      add2 = TYPE[1,3,3,64] add(add1, side_input1)
-      ROOT relu = TYPE[1,3,3,64] maximum(zeros, add2)
-    })");
-}
-
-}  // namespace
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6b18c4c63714b4b3c06d7fa85f4a7a75b8e9ae12
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_atomic_test.cc
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuAtomicTest : public GpuCodegenTest {};
+
+TEST_F(GpuAtomicTest, TestStore) {
+  const char* hlo_string = R"(
+    HloModule TensorFlowScatterV1
+
+    update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+      lhs = s32[] parameter(0)
+      ROOT rhs = s32[] parameter(1)
+    }
+
+    ENTRY main {
+      operand = s32[3,3] parameter(0)
+      indices = s32[2] parameter(1)
+      updates = s32[2,3] parameter(2)
+      ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+          to_apply=update_s32,
+          update_window_dims={1},
+          inserted_window_dims={0},
+          scatter_dims_to_operand_dims={0},
+          index_vector_dim=1
+    }
+)";
+
+  CompileAndVerifyIr(hlo_string, R"(
+CHECK: store atomic{{.*}}unordered, align 4
+)");
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
index 79e77d4c4d649020cf52ac25c220c3f90e8469b9..9e3ff8750b88d08bcbc1aae3faead5aecfa19848 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
 #include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
 #include "tensorflow/compiler/xla/tests/filecheck.h"
 #include "tensorflow/core/platform/logging.h"
@@ -23,9 +23,10 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-std::unique_ptr<HloModule> GpuCodegenTest::CreateNewModuleWithFTZ(bool ftz) {
+std::unique_ptr<HloModule> GpuCodegenTest::CreateNewUnverifiedModuleWithFTZ(
+    bool ftz) {
   HloModuleConfig config;
-  auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
+  auto debug_options = GetDebugOptionsFromFlags();
   debug_options.set_xla_gpu_ftz(ftz);
   debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
index e4a3573babb7ed746504c1466f85b582aa4d044f..d917320e36363c4fa7e4c0055e8f3345cbc610a2 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@@ -26,9 +26,9 @@ namespace gpu {
 // Tests that verify IR or PTX emitted by the GPU backend is as expected.
 class GpuCodegenTest : public LlvmIrGenTestBase {
  protected:
-  // Like HloTestBase::CreateNewModule(), with a flag for configuring the ftz
-  // option.
-  std::unique_ptr<HloModule> CreateNewModuleWithFTZ(bool ftz);
+  // Like HloTestBase::CreateNewVerifiedModule(), with a flag for configuring
+  // the ftz option.
+  std::unique_ptr<HloModule> CreateNewUnverifiedModuleWithFTZ(bool ftz);
 
   // Compiles the given HLO module to PTX and verifies the PTX matches the given
   // FileCheck pattern.  (See http://llvm.org/docs/CommandGuide/FileCheck.html).
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
index 780539c164277f14c2bd964024f7c3ca179f4ada..a1ed8499040359fe7265a7317b0577a990a2234c 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -46,7 +46,7 @@ TEST_F(GpuCopyTest, UseMemcpy) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   // There should not be any kernel prefixed "copy".
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
index 177b94934c7f519172508b5cc6e088f908401193..5e524faab18947f5793dc2ae34e9329a446d4235 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -39,7 +39,7 @@ class GpuFtzTest : public GpuCodegenTest {
         /* parameter_number=*/1, param_shape, "y"));
     builder.AddInstruction(HloInstruction::CreateBinary(param_shape, op, x, y));
 
-    auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+    auto hlo_module = CreateNewUnverifiedModuleWithFTZ(ftz_);
     hlo_module->AddEntryComputation(builder.Build());
     return hlo_module;
   }
@@ -54,7 +54,7 @@ class GpuFtzTest : public GpuCodegenTest {
         /* parameter_number=*/0, param_shape, "x"));
     builder.AddInstruction(HloInstruction::CreateUnary(param_shape, op, x));
 
-    auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+    auto hlo_module = CreateNewUnverifiedModuleWithFTZ(ftz_);
     hlo_module->AddEntryComputation(builder.Build());
     return hlo_module;
   }
@@ -75,16 +75,16 @@ class GpuFtzDisabledTest : public GpuFtzTest {
 // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
 TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
-    CHECK-NOT: mul.f32
-    CHECK: mul.ftz.f32
-    CHECK-NOT: mul.f32
+    CHECK-NOT: mul.rn.f32
+    CHECK: mul.rn.ftz.f32
+    CHECK-NOT: mul.rn.f32
   )");
 }
 TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
-    CHECK-NOT: mul.ftz.f32
-    CHECK: mul.f32
-    CHECK-NOT: mul.ftz.f32
+    CHECK-NOT: mul.rn.ftz.f32
+    CHECK: mul.rn.f32
+    CHECK-NOT: mul.rn.ftz.f32
   )");
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
index a06576df7b874745236a8d9075355a01ec42e777..6814be779e0b02c38e3bc7008f036b845d88cb6f 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -51,7 +51,7 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndex) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y));
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
 
   // Check the optimized IR as the unoptimized IR contains dead udiv and urem.
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
index 15d1e269cc22b88f5269175084f20600f165011c..a302b582ede3723acd118d2e4a4bb3efdf7a4d0b 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -193,6 +193,33 @@ TEST_F(GpuKernelTilingTest,
                      /*match_optimized_ir=*/true);
 }
 
+TEST_F(GpuKernelTilingTest, FusionTransposeWithReverseNotTiled) {
+  const char *const kHloString = R"(
+    HloModule FusionTransposeWithReverseNotTiled
+    fused_computation.1 {
+      arg0 = f32[128,64]{1,0} parameter(0)
+      copy0 = f32[128,64]{0,1} copy(arg0)
+      ROOT reverse0 = f32[128,64]{0,1} reverse(copy0), dimensions={0}
+    }
+
+    ENTRY reverse_break_assumption {
+      param0 = f32[128,64]{1,0} parameter(0)
+      ROOT fusion0 = f32[128,64]{0,1} fusion(param0), kind=kLoop,
+        calls=fused_computation.1
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is not generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
index 6a9ecd9dae7c9ddde0b56d8615e4a39fb3df0af9..3019215c015a4e0aa094a62424d650ced0de2a0e 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -48,7 +48,7 @@ TEST_F(GpuLdgTest, LdgForParamRead) {
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   CompileAndVerifyPtx(std::move(hlo_module), R"(
@@ -73,7 +73,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
   builder.AddInstruction(HloInstruction::CreateTuple({add, square}));
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   CompileAndVerifyPtx(std::move(hlo_module), R"(
@@ -95,7 +95,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
 // reduce in the foreseeable future.  But if that turns out to be wrong, I give
 // you, future reader, permission to delete this test.
 TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   HloComputation* reduce_computation;
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
index 15198865bda98f9718342d5a444a20305f923b48..ca0a78034d7dc83d17ad72202914d95f37ac122b 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
@@ -47,7 +47,7 @@ TEST_F(GpuNoAliasTest, Concat) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   CompileAndVerifyIr(std::move(hlo_module),
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
index 0f2d5568cafc9db0f5f067437fdd5e2e775ad2c8..4636f1d9d20b8c213ffadec427b3820a89c68a7f 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
@@ -85,7 +85,7 @@ TEST_F(GpuUnrollingTest, UnrollFourTimes) {
 TEST_F(GpuUnrollingTest, UnrollDefaultTimes) {
   // The default unrolling factor is 4.
   HloModuleConfig config;
-  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  config.set_debug_options(GetDebugOptionsFromFlags());
   auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
 
   CompileAndVerifyIr(std::move(hlo_module),
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index 141f3219387940a08ef22cbcc0be0971a14c2cd6..6b2d76764a077dc6cfa3f9ddc6e525ab330323be 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -45,7 +45,7 @@ void ThunkSchedule::AddDependenciesOnTransitiveOperands(
 ThunkSchedule::ThunkSchedule(
     std::unique_ptr<ThunkSequence> thunks,
     std::unique_ptr<StreamAssignment> stream_assignment,
-    const std::vector<const HloInstruction*>& hlo_total_order)
+    const std::vector<HloInstruction*>& hlo_total_order)
     : thunks_(std::move(thunks)),
       stream_assignment_(std::move(stream_assignment)) {
   std::unordered_map<const HloInstruction*, Thunk*> hlo_to_thunk;
@@ -53,7 +53,7 @@ ThunkSchedule::ThunkSchedule(
     InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get());
   }
 
-  for (const HloInstruction* hlo : hlo_total_order) {
+  for (HloInstruction* hlo : hlo_total_order) {
     if (hlo_to_thunk.count(hlo)) {
       thunk_total_order_.push_back(FindOrDie(hlo_to_thunk, hlo));
     }
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
index d3352994f845a535233612a17e19107511ce0622..43b628a1baf0e79a3197f3cfad3547991642eaed 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
@@ -46,7 +46,7 @@ class ThunkSchedule {
  public:
   ThunkSchedule(std::unique_ptr<ThunkSequence> thunks,
                 std::unique_ptr<StreamAssignment> stream_assignment,
-                const std::vector<const HloInstruction*>& hlo_total_order);
+                const std::vector<HloInstruction*>& hlo_total_order);
 
   // Returns the total order of executing all the thunks.
   const std::vector<Thunk*>& TotalOrder() const { return thunk_total_order_; }
diff --git a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c552c2925497f1c4808d74a615d35cdbeeba1858
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
@@ -0,0 +1,106 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/variadic_op_splitter.h"
+
+#include <vector>
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace gpu {
+
+namespace {
+// The parameter space on the GPU device is limited. We pick an arbitrary low
+// constant here to try to prevent exceeding this parameter space. For a proper
+// fix, we would have to take into account which parameters share a buffer, and
+// how big these buffers are.
+constexpr int32 kMaxParameters = 128;
+
+StatusOr<bool> SplitConcatenate(HloInstruction* concat, HloComputation* comp) {
+  auto operands = concat->operands();
+  std::vector<HloInstruction*> operands_to_split(operands.begin(),
+                                                 operands.end());
+  while (operands_to_split.size() > 1) {
+    std::vector<HloInstruction*> new_operands;
+    absl::Span<HloInstruction*> operands_span(operands_to_split);
+    for (int64 offset = 0; offset < operands_to_split.size();
+         offset += kMaxParameters) {
+      // Check if there is a remainder of operands that does not completely fill
+      // one "batch" of exactly 'kMaxParameters' operands. If there are only
+      // less than 'kMaxParameters' operands left, then we still put them into a
+      // concat together. Otherwise, we spare them for another round so that
+      // they can be put together into a concat with some of the newly created
+      // concats.
+      if (offset > 0 && offset + kMaxParameters > operands_to_split.size()) {
+        new_operands.insert(new_operands.end(),
+                            operands_to_split.begin() + offset,
+                            operands_to_split.end());
+      } else {
+        Shape new_shape = concat->shape();
+        int64 concat_dimension_size = 0;
+        for (int64 i = 0;
+             i < kMaxParameters && offset + i < operands_to_split.size(); ++i) {
+          concat_dimension_size +=
+              operands_to_split[i + offset]->shape().dimensions(
+                  concat->concatenate_dimension());
+        }
+        new_shape.set_dimensions(concat->concatenate_dimension(),
+                                 concat_dimension_size);
+        auto new_concat = comp->AddInstruction(concat->CloneWithNewOperands(
+            new_shape, operands_span.subspan(offset, kMaxParameters)));
+        new_operands.push_back(new_concat);
+      }
+    }
+    operands_to_split = new_operands;
+  }
+  TF_RETURN_IF_ERROR(comp->ReplaceInstruction(concat, operands_to_split[0]));
+  return true;
+}
+
+std::vector<HloInstruction*> GetRelevantVariadicOps(HloComputation* comp) {
+  std::vector<HloInstruction*> ops;
+  for (HloInstruction* instr : comp->instructions()) {
+    if (instr->opcode() == HloOpcode::kConcatenate &&
+        instr->operand_count() > kMaxParameters) {
+      ops.push_back(instr);
+    }
+  }
+  return ops;
+}
+
+}  // namespace
+
+StatusOr<bool> VariadicOpSplitter::Run(HloModule* module) {
+  bool changed = false;
+  for (HloComputation* comp : module->MakeNonfusionComputations()) {
+    for (HloInstruction* op : GetRelevantVariadicOps(comp)) {
+      // TODO(b/112613927): Handle also other ops than concatenate.
+      TF_ASSIGN_OR_RETURN(bool result, SplitConcatenate(op, comp));
+      changed |= result;
+    }
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.h b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..7673ad0d48a04508987025dac84b60e396e3d7dc
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.h
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_VARIADIC_OP_SPLITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_VARIADIC_OP_SPLITTER_H_
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+namespace gpu {
+
+// Splits variadic ops with many operands into pieces such that we don't exceed
+// the parameter space on the GPU. Currently only concatenate ops are split up.
+class VariadicOpSplitter : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "variadic-op-splitter"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_VARIADIC_OP_SPLITTER_H_
diff --git a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d00ac4dc7b57664a317157c093d7ffaa01b4fd6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter_test.cc
@@ -0,0 +1,82 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/variadic_op_splitter.h"
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+using match::Concatenate;
+
+class VariadicOpSplitterTest : public HloTestBase {};
+
+TEST_F(VariadicOpSplitterTest, DontSplit) {
+  auto module = ParseAndReturnVerifiedModule(R"(
+  HloModule TestModule
+
+  ENTRY TestComputation {
+    p0 = f16[30,41] parameter(0)
+    p1 = f16[30,41] parameter(1)
+    ROOT result = f16[60, 41] concatenate(p0, p1), dimensions={0}
+  })")
+                    .ValueOrDie();
+  EXPECT_FALSE(VariadicOpSplitter().Run(module.get()).ValueOrDie());
+}
+
+TEST_F(VariadicOpSplitterTest, SplitInto2) {
+  auto builder = HloComputation::Builder(TestName());
+  auto operand = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({42})));
+  std::vector<HloInstruction*> concat_operands(255, operand);
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      ShapeUtil::MakeShape(S32, {255}), concat_operands, 0));
+  auto module = CreateNewVerifiedModule();
+  auto entry_computation = module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(VariadicOpSplitter().Run(module.get()).ValueOrDie());
+  EXPECT_TRUE(Match(entry_computation->root_instruction(),
+                    Concatenate().WithNumOperands(128).WithOperand(
+                        0, Concatenate().WithNumOperands(128))));
+}
+
+TEST_F(VariadicOpSplitterTest, SplitInto3) {
+  auto builder = HloComputation::Builder(TestName());
+  auto operand = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({42})));
+  std::vector<HloInstruction*> concat_operands(256, operand);
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      ShapeUtil::MakeShape(S32, {256}), concat_operands, 0));
+  auto module = CreateNewVerifiedModule();
+  auto entry_computation = module->AddEntryComputation(builder.Build());
+  EXPECT_TRUE(VariadicOpSplitter().Run(module.get()).ValueOrDie());
+  EXPECT_TRUE(Match(entry_computation->root_instruction(),
+                    Concatenate(Concatenate().WithNumOperands(128),
+                                Concatenate().WithNumOperands(128))));
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index 9a61f8ac5a62e38e687a93890eb33481a01d51c8..2dce7749bbd8da2673ae607eee3d731d9917e8fe 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -29,7 +29,7 @@ namespace {
 class WhileTransformerTest : public HloTestBase {
  protected:
   WhileTransformerTest()
-      : module_(CreateNewModule()),
+      : module_(CreateNewVerifiedModule()),
         induction_variable_shape_(ShapeUtil::MakeShape(S32, {})),
         data_shape_(ShapeUtil::MakeShape(F32, {8})),
         condition_result_shape_(ShapeUtil::MakeShape(PRED, {})) {}
@@ -69,8 +69,10 @@ class WhileTransformerTest : public HloTestBase {
     auto data = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
         data_shape_, loop_state, data_tuple_index));
     // Use 'induction_variable' in computation with no path to output tuple.
+    auto cast = builder.AddInstruction(HloInstruction::CreateBitcastConvert(
+        ShapeUtil::MakeShape(F32, {}), induction_variable));
     auto update = builder.AddInstruction(
-        HloInstruction::CreateBroadcast(data_shape_, induction_variable, {}));
+        HloInstruction::CreateBroadcast(data_shape_, cast, {}));
     auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
         data_shape_, HloOpcode::kAdd, data, update));
     // Create output Tuple.
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index e30e7667f3015bc7bfe67c65147a5016332780f7..dc40b9446ad1bffcb757543e52fc9ab20de6d52e 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -30,16 +30,16 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_value.h"
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
-class MinimumMemoryForSequenceTest : public HloVerifiedTestBase {};
+class MinimumMemoryForSequenceTest : public HloTestBase {};
 
 TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape, scalar_shape});
@@ -86,7 +86,7 @@ TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
     return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
   };
 
-  HloSchedule schedule(module);
+  HloSchedule schedule(module.get());
   schedule.set_sequence(cond_computation,
                         {cond_param, cond_iter, cond_data, cond_lt});
   schedule.set_sequence(body_computation, {body_param});
@@ -258,7 +258,7 @@ class HeapSimulatorTracker {
   // Constructor for testing a single entry computation.
   HeapSimulatorTracker(
       const string& name, std::unique_ptr<HloComputation> computation,
-      const std::vector<const HloInstruction*>& instruction_sequence) {
+      const std::vector<HloInstruction*>& instruction_sequence) {
     HloModuleConfig config;
     module_ = absl::make_unique<HloModule>(name, config);
     module_->AddEntryComputation(std::move(computation));
@@ -286,7 +286,7 @@ class HeapSimulatorTracker {
   // Similar to the single entry computation constructor above, but runs the
   // simulation over the entire module.
   void RunWholeModule(
-      const std::vector<const HloInstruction*>& full_module_sequence) {
+      const std::vector<HloInstruction*>& full_module_sequence) {
     points_to_analysis_ =
         TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
 
@@ -294,7 +294,7 @@ class HeapSimulatorTracker {
     HloSchedule schedule(module_.get());
     absl::flat_hash_map<const HloInstruction*, int> reverse_position;
     for (int i = 0; i < full_module_sequence.size(); ++i) {
-      const HloInstruction* instruction = full_module_sequence[i];
+      HloInstruction* instruction = full_module_sequence[i];
       schedule.GetOrCreateSequence(instruction->parent())
           .push_back(instruction);
       reverse_position[instruction] = full_module_sequence.size() - i;
@@ -351,7 +351,7 @@ class HeapSimulatorTracker {
   HeapSimulator::Result result_;
 };
 
-class HeapSimulatorTest : public HloVerifiedTestBase {
+class HeapSimulatorTest : public HloTestBase {
  protected:
   HeapSimulatorTest() {}
   ~HeapSimulatorTest() override {}
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 1ea26ddd5b9ee01eaeb812b32539c7820d3d5dda..414c63271245315f037d04924c9291a9cd5b7a77 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -34,7 +34,7 @@ import "tensorflow/compiler/xla/xla_data.proto";
 option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
-// Next ID: 56
+// Next ID: 58
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -51,7 +51,7 @@ message HloInstructionProto {
 
   string name = 1;
   string opcode = 2;
-  xla.Shape shape = 3;
+  xla.ShapeProto shape = 3;
 
   xla.OpMetadata metadata = 7;
 
@@ -132,7 +132,7 @@ message HloInstructionProto {
   string custom_call_opaque = 53;
 
   // Shape of outfeed request.
-  xla.Shape outfeed_shape = 29;
+  xla.ShapeProto outfeed_shape = 29;
 
   // Describes the dimension numbers used for a dot operation
   xla.DotDimensionNumbers dot_dimension_numbers = 30;
@@ -184,6 +184,13 @@ message HloInstructionProto {
   // Sharding for kDomain instructions.
   xla.OpSharding domain_entry_sharding = 54;
   xla.OpSharding domain_exit_sharding = 55;
+
+  // For custom call this indicates that the layouts are constrained. If
+  // constrain_layout is true then the 'shape' field must contain a layout, and
+  // 'operand_shapes_with_layout' must contain a shape with layout for each
+  // operand.
+  bool constrain_layout = 56;
+  repeated xla.ShapeProto operand_shapes_with_layout = 57;
 }
 
 // Serialization of HloComputation.
@@ -198,7 +205,8 @@ message HloComputationProto {
   repeated HloInstructionProto instructions = 2;
 
   // The program shape (with layout) of this computation.
-  xla.ProgramShape program_shape = 4;
+
+  xla.ProgramShapeProto program_shape = 4;
 
   // The id of this computation.
   int64 id = 5;
@@ -218,6 +226,67 @@ message HloScheduleProto {
   map<int64, InstructionSequence> sequences = 1;
 }
 
+message HloInputOutputAliasProto {
+  // The following proto describes a pair of aliased an input
+  // (described by parameter number and a ShapeIndex of the parameter)
+  // and an output (described by a ShapeIndex of the root
+  // instruction). For example:
+  //
+  // entry = {
+  //  output_shape_index={1},
+  //  parameter_number=0,
+  //  parameter_shape_index={1, 2},
+  // }
+  //
+  // This entry indicates that the first paremter's {1, 2} element is
+  // aliased with the {1} element of the root instruction.
+  message AliasEntryProto {
+    // ShapeIndex of the root hlo.
+    repeated int64 output_shape_index = 1;
+    // Number of the parameter in entry computation.
+    int64 parameter_number = 2;
+    // ShapeIndex of the parameter instruction.
+    repeated int64 parameter_shape_index = 3;
+  }
+
+  repeated AliasEntryProto entries = 1;
+}
+
+message DynamicParameterBindingProto {
+  // A list of bindings which indicates that the `target_dim_num` in
+  // the subshape `target_param_index` of parameter `target_param_num`
+  // is a dynamic dimension and its real dynamic size is represented
+  // by `dynamic_param_index` in parameter `dynamic_param_num`.
+  //
+  // As an example, imagine we have a program:
+  //
+  // ENTRY main {
+  //   a = f32[] parameter(0)
+  //   b = f32[10] parameter(1)
+  //   ROOT root = (f32[], f32[10]) tuple(%a, %b)
+  // }
+  //
+  // Let's say 'b' (param index 1) is a dynamic shape whose input has
+  // an upperbound of 10 and real size is determined at runtime.'a'
+  // represents the real size of b's first dimension.
+  //
+  // In this case, the fields are set in the following way:
+  // dynamic_param_num = 1
+  // dynamic_param_index = {}
+  // target_param_num = 0
+  // target_param_index = {}
+  // target_param_dim = 0
+  message Binding {
+    int64 dynamic_param_num = 1;
+    repeated int64 dynamic_param_index = 2;
+    int64 target_param_num = 3;
+    repeated int64 target_param_index = 4;
+    int64 target_param_dim_num = 5;
+  }
+
+  repeated Binding entries = 1;
+}
+
 // Serialization of HloModule.
 message HloModuleProto {
   string name = 1;
@@ -228,14 +297,19 @@ message HloModuleProto {
   // callees appear before their callers.
   repeated HloComputationProto computations = 3;
 
-  // The program shape (with layout) of the entry computation.
-  xla.ProgramShape program_shape = 4;
+  // The host program shape (with layout) of the entry computation.
+  xla.ProgramShapeProto host_program_shape = 4;
 
   // The id of this module.
   int64 id = 5;
 
   // The schedule for this module.
   HloScheduleProto schedule = 7;
+
+  // Describes alias information between inputs and outputs.
+  HloInputOutputAliasProto input_output_alias = 8;
+
+  DynamicParameterBindingProto dynamic_parameter_binding = 9;
 }
 
 // Serialization of LogicalBuffer.
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index c3da12e273c77793647981f8653649155aac9483..cf8e6594cbe5ffd28ca75dd5006e8817f1e8581c 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -59,8 +59,9 @@ class BufferValueMap {
   // construction process.
   using BufferNumber = int64;
 
-  explicit BufferValueMap(const HloDataflowAnalysis& dataflow)
-      : dataflow_(dataflow) {
+  explicit BufferValueMap(HloModule* module,
+                          const HloDataflowAnalysis& dataflow)
+      : module_(module), dataflow_(dataflow) {
     buffers_.reserve(dataflow_.values().size());
     value_to_buffer_number_.reserve(dataflow_.values().size());
     for (const HloValue* value : dataflow_.values()) {
@@ -171,6 +172,42 @@ class BufferValueMap {
     return value_to_buffer_number_.at(&value);
   }
 
+  void ComputeInputOutputAliasedBuffers(
+      const HloValue& value, std::vector<BufferNumber>* aliased_buffers) {
+    // Get parameter value from an aliased_input object.
+    const auto get_parameter_value =
+        [this](const std::pair<int64, ShapeIndex>& aliased_input)
+        -> const HloValue& {
+      int64 param_number = aliased_input.first;
+      const ShapeIndex& param_index = aliased_input.second;
+      return dataflow_.GetUniqueValueAt(
+          module_->entry_computation()->parameter_instruction(param_number),
+          param_index);
+    };
+
+    // If the value shows up in a root instruction, alias it with parameter
+    // intruction.
+    for (const HloPosition& pos : value.positions()) {
+      if (pos.instruction == module_->entry_computation()->root_instruction()) {
+        ShapeIndex output_index = pos.index;
+
+        auto aliased_input =
+            module_->input_output_alias_config().GetAliasedParameter(
+                output_index);
+        if (aliased_input) {
+          aliased_buffers->push_back(
+              GetBufferForValue(get_parameter_value(*aliased_input)));
+        }
+      }
+    }
+
+    // If the value is parameter instruction itself, alias it with itself.
+    if (value.instruction()->opcode() == HloOpcode::kParameter &&
+        value.instruction()->parent() == module_->entry_computation()) {
+      aliased_buffers->push_back(GetBufferForValue(value));
+    }
+  }
+
   void ComputeWhileAliasedBuffers(const HloValue& value,
                                   std::vector<BufferNumber>* aliased_buffers) {
     VLOG(3) << "Compute kWhile aliases";
@@ -278,6 +315,7 @@ class BufferValueMap {
       VLOG(2) << "Use of value " << value.ToShortString() << ": " << use;
     }
     std::vector<BufferNumber> aliased_buffers;
+    ComputeInputOutputAliasedBuffers(value, &aliased_buffers);
     ComputeWhileAliasedBuffers(value, &aliased_buffers);
     ComputeConditionalAliasedBuffers(value, &aliased_buffers);
     // Uniquify aliased buffers.
@@ -288,6 +326,8 @@ class BufferValueMap {
     return aliased_buffers;
   }
 
+  HloModule* module_;
+
   // Dataflow analysis used to construct the buffer map.
   const HloDataflowAnalysis& dataflow_;
 
@@ -461,7 +501,7 @@ StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
                                                /*bitcast_defines_value=*/false,
                                                fusion_can_share_buffer));
 
-  BufferValueMap buffer_map(alias_analysis->dataflow_analysis());
+  BufferValueMap buffer_map(module, alias_analysis->dataflow_analysis());
   buffer_map.MergeAliasedBuffers();
 
   // Create a vector of HloBuffers, one for each set of values in the
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index 0cd0ab36fcf832af9a71ab5837c94f9b39bc4bf3..7e6150e94153cd15463725e862ce1b8593f2c991 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -28,7 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/logging.h"
@@ -39,17 +39,17 @@ namespace {
 
 using ::testing::UnorderedElementsAre;
 
-class HloAliasAnalysisTest : public HloVerifiedTestBase {
+class HloAliasAnalysisTest : public HloTestBase {
  protected:
-  HloAliasAnalysisTest() : HloVerifiedTestBase() {
-    module_ = CreateNewModule();
+  HloAliasAnalysisTest() : HloTestBase() {
+    module_ = CreateNewVerifiedModule();
   }
 
   // Run alias analysis on the member module. For convenience returns a
   // reference to the generated analysis stored in analysis_.
   HloAliasAnalysis& RunAnalysis() {
     hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
-    analysis_ = HloAliasAnalysis::Run(module_,
+    analysis_ = HloAliasAnalysis::Run(module_.get(),
                                       /*fusion_can_share_buffer=*/nullptr)
                     .ConsumeValueOrDie();
     return *analysis_;
@@ -93,7 +93,7 @@ class HloAliasAnalysisTest : public HloVerifiedTestBase {
   // never occurs, but HLO graphs with interference can be explicitly
   // constructed.
   bool AnyValuesInSameBufferInterfere() {
-    DependencyHloOrdering ordering(module_);
+    DependencyHloOrdering ordering(module_.get());
     for (const HloBuffer& buffer : analysis_->buffers()) {
       for (const HloValue* value_a : buffer.values()) {
         for (const HloValue* value_b : buffer.values()) {
@@ -110,7 +110,7 @@ class HloAliasAnalysisTest : public HloVerifiedTestBase {
     return false;
   }
 
-  HloModule* module_;
+  std::unique_ptr<HloModule> module_;
   std::unique_ptr<HloAliasAnalysis> analysis_;
 
   const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
@@ -217,6 +217,181 @@ TEST_F(HloAliasAnalysisTest, NondistinctTuple) {
   EXPECT_FALSE(AnyValuesInSameBufferInterfere());
 }
 
+TEST_F(HloAliasAnalysisTest, ParametersWithAliasing) {
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+
+  auto negate0 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte0));
+  auto negate1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, gte1));
+
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
+  module_->AddEntryComputation(builder.Build());
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+
+  // Cannot alias an output twice.
+  ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte0),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{0}));
+
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte1),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{1}));
+}
+
+TEST_F(HloAliasAnalysisTest, ParametersWithCrossAliasing) {
+  // parameter 0 aliased with output 1 and parameter 1 aliased with output 0.
+  //
+  //  (p0 ,  p1)
+  //     \   /
+  //      \ /
+  // alias X
+  //      / \
+  //     /   \
+  //  (p0  ,  p1)
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
+  module_->AddEntryComputation(builder.Build());
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1}));
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+
+  // Cannot alias an output twice.
+  ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  // Every Ops in this graph are aliased with each other.
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte0),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{0}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte0),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{1}));
+
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte1),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{0}));
+  EXPECT_EQ(analysis.GetUniqueBufferAt(gte1),
+            analysis.GetUniqueBufferAt(tuple, /*index=*/{1}));
+}
+
+TEST_F(HloAliasAnalysisTest, InputOutputAliasingWithWhile) {
+  // Test a simple single while instruction can be aliased with input and output
+  // of the computation.
+  //
+  // body((F32[], F32[]) %tuple_param):
+  //   %add = Add(%tuple_param{0}, %tuple_param{1})
+  //   return Tuple(%tuple_param{0}, %add)
+  //
+  // condition((F32[], F32[]) %tuple_param):
+  //   return Constant(false)
+  //
+  // entry:
+  //   %param1 = param1
+  //   %while = While(%param1, body, condition)
+  //   %while_1 = GTE(%while, 0)
+  //   %while_2 = GTE(%while, 1)
+  //   %negate_1 = Negate(%while_1)
+  //   %negate_2 = Negate(%while_2)
+  //   return Tuple(negate_1, negate_2)
+  //
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Element 0 passes transparently through the body.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  auto add = body_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
+  auto body_tuple = body_builder.AddInstruction(
+      HloInstruction::CreateTuple({body_element_0, add}));
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
+
+  // Condition computation trivially returns a constant "false".
+  auto cond_builder = HloComputation::Builder("condition");
+  auto cond_param = cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module_->AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "p0"));
+
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, body, param));
+  auto while_element_1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, xla_while, 0));
+  auto while_element_2 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, xla_while, 1));
+  auto negate_1 = builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape_, HloOpcode::kNegate, while_element_1));
+  auto negate_2 = builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape_, HloOpcode::kNegate, while_element_2));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({negate_1, negate_2}));
+  module_->AddEntryComputation(builder.Build());
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+  TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+
+  const HloAliasAnalysis& analysis = RunAnalysis();
+
+  EXPECT_THAT(
+      GetValuesInBuffer(analysis.GetUniqueBufferAt(xla_while, /*index=*/{1})),
+      UnorderedElementsAre(GetValueDefinedAt(param, {1}),
+                           GetValueDefinedAt(xla_while, /*index=*/{1}),
+                           GetValueDefinedAt(body_param, {1}),
+                           GetValueDefinedAt(cond_param, {1}),
+                           GetValueDefinedAt(add),
+                           GetValueDefinedAt(negate_2)));
+
+  EXPECT_THAT(
+      analysis.GetUniqueBufferAt(xla_while, /*index=*/{1}).ComputePositions(),
+      UnorderedElementsAre(
+          HloPosition{param, {1}}, HloPosition{xla_while, {1}},
+          HloPosition{while_element_2, {}}, HloPosition{body_param, {1}},
+          HloPosition{body_element_1, {}}, HloPosition{add, {}},
+          HloPosition{body_tuple, {1}}, HloPosition{tuple, {1}},
+          HloPosition{cond_param, {1}}, HloPosition{negate_2, {}}));
+
+  EXPECT_FALSE(AnyValuesInSameBufferInterfere());
+}
+
 TEST_F(HloAliasAnalysisTest, SingleCall) {
   // Test a single call of a subcomputation. The subcomputation adds its two
   // array-shaped parameters.
@@ -463,7 +638,7 @@ TEST_F(HloAliasAnalysisTest, SequentialWhiles) {
   module_->AddEntryComputation(builder.Build());
 
   FlattenCallGraph flattener;
-  TF_ASSERT_OK(flattener.Run(module_).status());
+  TF_ASSERT_OK(flattener.Run(module_.get()).status());
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -837,7 +1012,7 @@ TEST_F(HloAliasAnalysisTest, BitcastInterference) {
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
-  DependencyHloOrdering ordering(module_);
+  DependencyHloOrdering ordering(module_.get());
   EXPECT_FALSE(analysis.HasLiveRangeInterference(ordering));
 }
 
@@ -879,13 +1054,13 @@ TEST_F(HloAliasAnalysisTest, WhileInterference) {
   {
     // Dependency ordering should interfere because the negate and while are
     // unordered.
-    DependencyHloOrdering ordering(module_);
+    DependencyHloOrdering ordering(module_.get());
     EXPECT_TRUE(analysis.HasLiveRangeInterference(ordering));
   }
 
   // For a sequential order, if there is interference iff the negate is after
   // the while.
-  HloSchedule schedule(module_);
+  HloSchedule schedule(module_.get());
   schedule.set_sequence(body, {body_param, body_root});
   schedule.set_sequence(condition, {cond_param, cond_root});
   {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index c2041c466708fd8c88d34f14fbc0064905f594a9..ff122b529bdcdcc69d2245136e19101902dbf957 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -215,7 +216,7 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
 
     if (removed.count(item) != 0 || item->user_count() != 0 ||
         item == root_instruction() || !IsRemovable(item) ||
-        item->HasSideEffect()) {
+        (item->HasSideEffect() && item != instruction)) {
       continue;
     }
     for (int i = 0; i < item->operand_count(); ++i) {
@@ -321,7 +322,7 @@ void HloComputation::ComputeInstructionPostOrder(
 
     // Add the operands to the stack in reverse order so the first operand is
     // processed first. This will produce a more natural ordering and a nicer
-    // result for thigns like HLO stringification.
+    // result for things like HLO stringification.
     const auto& operands = current->operands();
     for (int64 i = operands.size() - 1; i >= 0; --i) {
       dfs_stack.emplace_back(operands[i]);
@@ -498,7 +499,7 @@ HloComputationProto HloComputation::ToProto() const {
     proto.add_instructions()->Swap(&instruction_proto);
   }
   proto.set_root_id(root_instruction()->unique_id());
-  *proto.mutable_program_shape() = ComputeProgramShape();
+  *proto.mutable_program_shape() = ComputeProgramShape().ToProto();
   return proto;
 }
 
@@ -710,6 +711,8 @@ bool HloComputation::operator==(const HloComputation& other) const {
   return eq(root_instruction(), other.root_instruction());
 }
 
+uint64 HloComputation::Hash() const { return root_instruction()->Hash(); }
+
 Status HloComputation::ReplaceWithNewInstruction(
     HloInstruction* old_instruction,
     std::unique_ptr<HloInstruction> new_instruction) {
@@ -739,72 +742,6 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction,
   return RemoveInstructionAndUnusedOperands(old_instruction);
 }
 
-std::unique_ptr<HloReachabilityMap> HloComputation::ComputeReachability()
-    const {
-  const auto& all = MakeInstructionPostOrder();
-  auto result = absl::make_unique<HloReachabilityMap>(all);
-  auto channel_dependency_map = ComputeChannelDependencies();
-
-  std::vector<HloInstruction*> inputs;
-  for (const HloInstruction* hlo : all) {
-    inputs.assign(hlo->operands().begin(), hlo->operands().end());
-    inputs.insert(inputs.end(), hlo->control_predecessors().begin(),
-                  hlo->control_predecessors().end());
-
-    switch (hlo->opcode()) {
-      case HloOpcode::kRecvDone: {
-        auto it = channel_dependency_map.find(hlo->channel_id());
-        if (it != channel_dependency_map.end()) {
-          absl::c_copy(it->second, std::back_inserter(inputs));
-        }
-        break;
-      }
-      case HloOpcode::kCrossReplicaSum: {
-        auto all_reduce_id = hlo->all_reduce_id();
-        if (all_reduce_id) {
-          auto it = channel_dependency_map.find(all_reduce_id.value());
-          if (it != channel_dependency_map.end()) {
-            absl::c_copy(it->second, std::back_inserter(inputs));
-          }
-        }
-        break;
-      }
-      default:
-        break;
-    }
-
-    result->FastSetReachabilityToUnion(inputs, hlo);
-  }
-  return result;
-}
-
-void HloComputation::UpdateReachabilityThroughInstruction(
-    const HloInstruction* instruction, HloReachabilityMap* reachability_map) {
-  std::queue<const HloInstruction*> worklist;
-  worklist.push(instruction);
-
-  std::vector<HloInstruction*> inputs;
-
-  while (!worklist.empty()) {
-    const HloInstruction* item = worklist.front();
-    worklist.pop();
-
-    inputs.assign(item->operands().begin(), item->operands().end());
-    inputs.insert(inputs.end(), item->control_predecessors().begin(),
-                  item->control_predecessors().end());
-
-    if (reachability_map->SetReachabilityToUnion(inputs, item)) {
-      // Add immediate successors to worklist.
-      for (const HloInstruction* user : item->users()) {
-        worklist.push(user);
-      }
-      for (const HloInstruction* succ : item->control_successors()) {
-        worklist.push(succ);
-      }
-    }
-  }
-}
-
 std::vector<HloInstruction*> HloComputation::CollectUnreachableRoots() const {
   std::vector<HloInstruction*> unreachable_roots;
   for (auto* instruction : instructions()) {
@@ -860,7 +797,7 @@ Status HloComputation::AcceptWithOperandOrder(
 template <typename HloInstructionPtr>
 Status HloComputation::AcceptOrdered(
     DfsHloVisitorBase<HloInstructionPtr>* visitor,
-    const std::vector<const HloInstruction*>& order) const {
+    const std::vector<HloInstruction*>& order) const {
   VLOG(3) << "Accepting visitor with order.";
   for (HloInstruction* root : CollectUnreachableRoots()) {
     TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end())
@@ -890,9 +827,9 @@ Status HloComputation::AcceptOrdered(
 
 // Explicit instantiations.
 template Status HloComputation::AcceptOrdered(
-    DfsHloVisitor*, const std::vector<const HloInstruction*>&) const;
+    DfsHloVisitor*, const std::vector<HloInstruction*>&) const;
 template Status HloComputation::AcceptOrdered(
-    ConstDfsHloVisitor*, const std::vector<const HloInstruction*>&) const;
+    ConstDfsHloVisitor*, const std::vector<HloInstruction*>&) const;
 
 Status HloComputation::Accept(
     const std::function<Status(HloInstruction*)>& visitor_func) {
@@ -911,14 +848,46 @@ std::unique_ptr<HloComputation> HloComputation::Clone(
   return CloneWithReplacements(
       /*replacements=*/std::unordered_map<const HloInstruction*,
                                           std::unique_ptr<HloInstruction>>(),
-      /*extras=*/{}, context, suffix);
+      context, suffix);
+}
+
+std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+    HloCloneContext* context, const string& suffix) {
+  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  replacements.emplace(std::move(r1));
+  return CloneWithReplacements(std::move(replacements), context, suffix);
+}
+
+std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
+    HloCloneContext* context, const string& suffix) {
+  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  replacements.emplace(std::move(r1));
+  replacements.emplace(std::move(r2));
+  return CloneWithReplacements(std::move(replacements), context, suffix);
+}
+
+std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
+    std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r3,
+    HloCloneContext* context, const string& suffix) {
+  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  replacements.emplace(std::move(r1));
+  replacements.emplace(std::move(r2));
+  replacements.emplace(std::move(r3));
+  return CloneWithReplacements(std::move(replacements), context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
     std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements,
-    absl::Span<HloInstruction*> extras, HloCloneContext* context,
-    const string& suffix) {
+    HloCloneContext* context, const string& suffix) {
   std::unique_ptr<HloCloneContext> context_ptr;
   if (context == nullptr) {
     context_ptr = absl::make_unique<HloCloneContext>(parent(), suffix);
@@ -939,18 +908,50 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
   };
 
   VLOG(1) << "Cloning " << name() << " --> " << suffix << "\n";
+
+  // We want to do a postorder walk over [replace(i) for i in instructions_].
+  // We can't reuse MakeInstructionPostOrder() for this, because that will
+  // generate a postorder of plain instructions_, and our replacements may
+  // change the postorder!
+  //
+  // The postorder we want here is simpler than what MakeInstructionPostOrder()
+  // does -- we only care about operand dependencies -- so let's just do it
+  // ourselves.
   std::vector<HloInstruction*> postorder;
-  for (HloInstruction* instr : extras) {
-    postorder.push_back(instr);
-  }
-  for (HloInstruction* instr : MakeInstructionPostOrder()) {
-    if (HloInstruction* replacement = replace(instr)) {
-      postorder.push_back(replacement);
+  absl::flat_hash_map<HloInstruction*, VisitState> visited;
+  for (const auto& instr : instructions_) {
+    std::vector<HloInstruction*> dfs_stack;
+    HloInstruction* new_instr = replace(instr.get());
+    if (!new_instr) {
+      continue;
+    }
+    dfs_stack.push_back(new_instr);
+
+    while (!dfs_stack.empty()) {
+      auto* cur = dfs_stack.back();
+      auto it = visited.find(cur);
+      if (it != visited.end()) {
+        dfs_stack.pop_back();
+        if (it->second == kVisited) {
+          continue;
+        }
+        CHECK_EQ(it->second, kVisiting);
+        postorder.push_back(cur);
+        it->second = kVisited;
+        continue;
+      }
+
+      visited.insert({cur, kVisiting});
+      for (HloInstruction* operand : cur->operands()) {
+        HloInstruction* new_operand = replace(operand);
+        if (new_operand) {
+          dfs_stack.emplace_back(new_operand);
+        }
+      }
     }
   }
 
   std::vector<std::unique_ptr<HloInstruction>> instructions;
-  std::unique_ptr<HloInstruction> new_instr;
   for (auto instr : postorder) {
     std::vector<HloInstruction*> new_operands;
     for (auto operand : instr->operands()) {
@@ -960,9 +961,8 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
           << operand->ToString() << ", used by " << instr->ToString();
       new_operands.push_back(context->GetInstruction(replaced_operand));
     }
-    new_instr =
-        instr->CloneWithNewOperands(instr->shape(), new_operands, context);
-    instructions.push_back(std::move(new_instr));
+    instructions.push_back(
+        instr->CloneWithNewOperands(instr->shape(), new_operands, context));
   }
   Builder builder(name() + "." + suffix);
   for (auto& instr : instructions) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index d87ab4bda162a74421e8906e07cfcb97e2128fe4..c584e4c7ca5770533f28352b0df9dadd9dbe1860 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -128,9 +127,10 @@ class HloComputation {
   // users. Instruction is deallocated with this call.
   Status RemoveInstruction(HloInstruction* instruction);
 
-  // Remove an instruction from the computation and also transitively any
-  // operand that has no users post removing an instruction. The instruction
-  // must have no users. Instruction is deallocated with this call.
+  // Remove an instruction (including side effecting ones) from the computation
+  // and also transitively any operand that has no side effect and no users post
+  // removing an instruction. The instruction must have no users. Instruction is
+  // deallocated with this call.
   Status RemoveInstructionAndUnusedOperands(HloInstruction* instruction);
 
   // Set the root of the computation to the given instruction. The instruction
@@ -214,19 +214,6 @@ class HloComputation {
   // this order, definitions of values always appear before their uses.
   std::vector<HloInstruction*> MakeInstructionPostOrder() const;
 
-  // Computes and returns the reachability between HLO instructions in the
-  // computation. The returned HloReachabilityMap is constructed such that
-  // HloReachabilityMap::IsReachable(a, b) returns true iff there exists a
-  // directed path (from producer to consumer) from 'a' to 'b'. Both data
-  // dependencies (operands) and control dependencies are considered for
-  // reachability. Trivially an instruction is reachable from itself.
-  std::unique_ptr<HloReachabilityMap> ComputeReachability() const;
-
-  // Updates the given reachability map after the immediate predecessor set
-  // (operands and control predecessors) of 'instruction' has changed.
-  void UpdateReachabilityThroughInstruction(
-      const HloInstruction* instruction, HloReachabilityMap* reachability_map);
-
   int64 instruction_count() const { return instruction_iterators_.size(); }
 
   // Creates and returns a list of the embedded computations called by this
@@ -277,6 +264,12 @@ class HloComputation {
   // Return whether `*this` and `other` are functionally equivalent.
   bool operator==(const HloComputation& other) const;
 
+  // Generates a hash value of an HLO computation. Hash considers
+  // information on opcode, shape, operands, and typically a root instruction.
+  // This function returns the same hash value for equivalent HLO computations,
+  // with respect to HloInstruction::Identical() method.
+  uint64 Hash() const;
+
   // Replaces old instruction with newly created instruction. Removes old
   // instruction from computation. Updates uses and root instruction.
   Status ReplaceWithNewInstruction(
@@ -314,7 +307,7 @@ class HloComputation {
   // be a topological sort of all instructions in the computation.
   template <typename HloInstructionPtr>
   Status AcceptOrdered(DfsHloVisitorBase<HloInstructionPtr>* visitor,
-                       const std::vector<const HloInstruction*>& order) const;
+                       const std::vector<HloInstruction*>& order) const;
 
   // Same as Accept() above, but the visitor is given as a function.
   Status Accept(const std::function<Status(HloInstruction*)>& visitor_func);
@@ -332,14 +325,38 @@ class HloComputation {
   // the map's value to replace that instruction in the cloned computation.
   //
   // If replacements maps a key to nullptr, we remove that instruction from the
-  // new computation.
-  // If additional instructions are used by instructions in replacement map,
-  // they must be passed in post-order in the extras span.
+  // new computation.  If an element of `replacements` references an instruction
+  // that's not already in the computation, it's cloned and added to the new
+  // computation.
+  //
+  // All relevant instructions are cloned, *including* unique_ptr in the
+  // `replacements` map.
   std::unique_ptr<HloComputation> CloneWithReplacements(
       std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
           replacements,
-      absl::Span<HloInstruction*> extras, HloCloneContext* context = nullptr,
-      const string& suffix = "clone");
+      HloCloneContext* context = nullptr, const string& suffix = "clone");
+
+  // Convenience overloads for CloneWithReplacements.  You want to do
+  //
+  //   CloneWithReplacements({{a, std::move(b)}, {c, std::move(d)}})  // ERROR
+  //
+  // but that doesn't work because std::initializer_list is not movable.  These
+  // overloads let you do
+  //
+  //   CloneWithReplacementPairs({a, std::move(b)}, {c, std::move(d)});   // OK
+  //
+  std::unique_ptr<HloComputation> CloneWithReplacementPairs(
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+      HloCloneContext* context = nullptr, const string& suffix = "clone");
+  std::unique_ptr<HloComputation> CloneWithReplacementPairs(
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
+      HloCloneContext* context = nullptr, const string& suffix = "clone");
+  std::unique_ptr<HloComputation> CloneWithReplacementPairs(
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
+      std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r3,
+      HloCloneContext* context = nullptr, const string& suffix = "clone");
 
   // Returns true if the given instruction can be removed from the computation.
   // Parameter instructions cannot be removed without violating invariants of
@@ -354,6 +371,14 @@ class HloComputation {
   // channel complete).
   bool IsRemovable(const HloInstruction* instruction);
 
+  // Returns a map from channel-id to directed dependencies of the channel
+  // instructions. For send&recv pairs it means the send instruction and for
+  // cross-replica-sum the union of the dependencies for all participating
+  // instructions.
+  using ChannelDependencyMap =
+      absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>;
+  ChannelDependencyMap ComputeChannelDependencies() const;
+
   // Returns true if this computation has a side effect. A computation has a
   // side effect if it contains one or more instructions with a side effect.
   bool HasSideEffect() const;
@@ -409,14 +434,6 @@ class HloComputation {
   // Internal helper to collect unreachable roots.
   std::vector<HloInstruction*> CollectUnreachableRoots() const;
 
-  // Returns a map from channel-id to directed dependencies of the channel
-  // instructions. For send&recv pairs it means the send instruction and for
-  // cross-replica-sum the union of the dependencies for all participating
-  // instructions.
-  using ChannelDependencyMap =
-      absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>;
-  ChannelDependencyMap ComputeChannelDependencies() const;
-
   enum VisitState { kVisiting, kVisited };
   void ComputeInstructionPostOrder(
       const HloComputation::ChannelDependencyMap& channel_dependency_map,
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 2aaaef1d36d58bcce18db4aa37ff05ea352e484b..0361c87428f6e4c031d95492a5bc782ad388e5b5 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -20,19 +20,19 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
-namespace op = xla::testing::opcode_matchers;
-
 namespace xla {
 
 namespace {
 
+namespace m = match;
 using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
@@ -65,7 +65,7 @@ class HloComputationTest : public HloTestBase {
 };
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto negate_computation =
       module->AddEntryComputation(CreateNegateComputation());
   EXPECT_TRUE(negate_computation->MakeEmbeddedComputationsList().empty());
@@ -73,7 +73,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) {
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) {
   // Create computation which calls one other computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto negate_computation =
       module->AddEmbeddedComputation(CreateNegateComputation());
   auto map_computation =
@@ -85,7 +85,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) {
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsDiamond) {
   // Create computations with a diamond-shaped callgraph.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto negate_computation =
       module->AddEmbeddedComputation(CreateNegateComputation());
   auto map1_computation =
@@ -119,7 +119,7 @@ TEST_F(HloComputationTest, PostOrderSingleton) {
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant));
 }
@@ -134,7 +134,7 @@ TEST_F(HloComputationTest, PostOrderSimple) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
   auto negate2 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->MakeInstructionPostOrder(),
               ElementsAre(constant, negate1, negate2));
@@ -151,7 +151,7 @@ TEST_F(HloComputationTest, PostOrderTrace) {
       builder.AddInstruction(HloInstruction::CreateTrace("foobar", negate1));
   auto negate2 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1));
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   // Trace instructions should be at the end of the sort.
   EXPECT_THAT(computation->MakeInstructionPostOrder(),
@@ -170,7 +170,7 @@ TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
   auto constant4 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->MakeInstructionPostOrder(),
               UnorderedElementsAre(constant1, constant2, constant3, constant4));
@@ -192,7 +192,7 @@ TEST_F(HloComputationTest, PostOrderWithMultipleRoots) {
       r0f32_, HloOpcode::kAdd, constant2, constant3));
   auto add3 = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32_, HloOpcode::kAdd, constant1, constant3));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto post_order = computation->MakeInstructionPostOrder();
   EXPECT_EQ(6, post_order.size());
@@ -217,7 +217,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
                                                       constant2, constant3));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       constant1, constant3));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   // Visitor which keeps track of which instructions have been visited.
   class TestVisitor : public DfsHloVisitorWithDefault {
@@ -257,11 +257,11 @@ TEST_F(HloComputationTest, DeepCopyArray) {
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(constant).ValueOrDie();
 
-  EXPECT_THAT(copy, op::Copy(constant));
+  EXPECT_THAT(copy, GmockMatch(m::Copy(m::Op().Is(constant))));
 }
 
 TEST_F(HloComputationTest, DeepCopyTuple) {
@@ -274,12 +274,13 @@ TEST_F(HloComputationTest, DeepCopyTuple) {
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto tuple_copy = computation->DeepCopyInstruction(tuple).ValueOrDie();
 
-  EXPECT_THAT(tuple_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
-                                    op::Copy(op::GetTupleElement(tuple))));
+  EXPECT_THAT(tuple_copy, GmockMatch(m::Tuple(
+                              m::Copy(m::GetTupleElement(m::Op().Is(tuple))),
+                              m::Copy(m::GetTupleElement(m::Op().Is(tuple))))));
   EXPECT_EQ(0, tuple_copy->operand(0)->operand(0)->tuple_index());
   EXPECT_EQ(1, tuple_copy->operand(1)->operand(0)->tuple_index());
 }
@@ -297,7 +298,7 @@ TEST_F(HloComputationTest, DeepCopyArrayAtIndices) {
     ShapeTree<bool> indices_to_copy(constant->shape(), /*init_value=*/true);
     EXPECT_THAT(computation->DeepCopyInstruction(constant, &indices_to_copy)
                     .ValueOrDie(),
-                op::Copy(constant));
+                GmockMatch(m::Copy(m::Op().Is(constant))));
   }
 
   {
@@ -330,10 +331,11 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) {
         computation->DeepCopyInstruction(tuple, &indices_to_copy, &copies_added)
             .ValueOrDie();
 
-    EXPECT_THAT(deep_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
-                                     op::Copy(op::GetTupleElement(tuple))));
-    EXPECT_THAT(deep_copy, op::Tuple(copies_added.element({0}),
-                                     copies_added.element({1})));
+    EXPECT_THAT(deep_copy, GmockMatch(m::Tuple(
+                               m::Copy(m::GetTupleElement(m::Op().Is(tuple)))
+                                   .Is(copies_added.element({0})),
+                               m::Copy(m::GetTupleElement(m::Op().Is(tuple)))
+                                   .Is(copies_added.element({1})))));
   }
 
   {
@@ -346,8 +348,9 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) {
         computation->DeepCopyInstruction(tuple, &indices_to_copy, &copies_added)
             .ValueOrDie();
 
-    EXPECT_THAT(deep_copy, op::Tuple(op::GetTupleElement(tuple),
-                                     op::GetTupleElement(tuple)));
+    EXPECT_THAT(deep_copy,
+                GmockMatch(m::Tuple(m::GetTupleElement(m::Op().Is(tuple)),
+                                    m::GetTupleElement(m::Op().Is(tuple)))));
     EXPECT_TRUE(copies_added.element({}) == nullptr);
     EXPECT_TRUE(copies_added.element({0}) == nullptr);
     EXPECT_TRUE(copies_added.element({1}) == nullptr);
@@ -363,8 +366,9 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) {
         computation->DeepCopyInstruction(tuple, &indices_to_copy, &copies_added)
             .ValueOrDie();
 
-    EXPECT_THAT(deep_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
-                                     op::GetTupleElement(tuple)));
+    EXPECT_THAT(deep_copy, GmockMatch(m::Tuple(
+                               m::Copy(m::GetTupleElement(m::Op().Is(tuple))),
+                               m::GetTupleElement(m::Op().Is(tuple)))));
     EXPECT_TRUE(copies_added.element({}) == nullptr);
     EXPECT_TRUE(copies_added.element({0}) != nullptr);
     EXPECT_TRUE(copies_added.element({1}) == nullptr);
@@ -376,12 +380,12 @@ TEST_F(HloComputationTest, DeepCopyToken) {
   // copied.
   auto builder = HloComputation::Builder(TestName());
   auto token = builder.AddInstruction(HloInstruction::CreateToken());
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(token).ValueOrDie();
 
   // No copy should be added.
-  EXPECT_THAT(copy, op::AfterAll());
+  EXPECT_THAT(copy, GmockMatch(m::AfterAll()));
 }
 
 TEST_F(HloComputationTest, DeepCopyTokenTuple) {
@@ -393,14 +397,15 @@ TEST_F(HloComputationTest, DeepCopyTokenTuple) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({token, constant}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(tuple).ValueOrDie();
 
   // Only the array (second tuple element) should be copied. The token is passed
   // through transparently.
-  EXPECT_THAT(copy, op::Tuple(op::GetTupleElement(tuple),
-                              op::Copy(op::GetTupleElement(tuple))));
+  EXPECT_THAT(copy, GmockMatch(m::Tuple(
+                        m::GetTupleElement(m::Op().Is(tuple)),
+                        m::Copy(m::GetTupleElement(m::Op().Is(tuple))))));
 }
 
 TEST_F(HloComputationTest, CycleDetection) {
@@ -412,7 +417,7 @@ TEST_F(HloComputationTest, CycleDetection) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, negate, negate));
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   // Add a control dependency to create a cycle.
   ASSERT_IS_OK(add->AddControlDependencyTo(negate));
@@ -440,16 +445,18 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) {
       r0f32_, HloOpcode::kAdd, dead_negate, dead_negate));
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_THAT(computation->root_instruction(), op::Negate(constant));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Negate(m::Op().Is(constant))));
   EXPECT_EQ(negate, computation->root_instruction());
 
   ASSERT_IS_OK(computation->RemoveInstructionAndUnusedOperands(dead_add));
 
   EXPECT_EQ(2, computation->instruction_count());
-  EXPECT_THAT(computation->root_instruction(), op::Negate(constant));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Negate(m::Op().Is(constant))));
   EXPECT_EQ(negate, computation->root_instruction());
 }
 
@@ -466,7 +473,7 @@ TEST_F(HloComputationTest, CloneWithControlDependency) {
       HloInstruction::CreateParameter(0, r0f32_, "param0"));
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation =
       module->AddEntryComputation(builder.Build(/*root_instruction=*/add));
 
@@ -484,107 +491,6 @@ TEST_F(HloComputationTest, CloneWithControlDependency) {
   EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add));
 }
 
-TEST_F(HloComputationTest, Reachability) {
-  // Test reachability of a non-trivial computation:
-  //
-  // const1    const2
-  //    |         |
-  //    | +-------+
-  //    | |       |
-  //    add ..   negate
-  //     |   .     |
-  //     |   .... exp
-  //     |         |
-  //     +---+   +-+---+
-  //         |   |     |
-  //       multiply   copy
-  //
-  // There is a control dependency from 'add' to 'exp'.
-  auto builder = HloComputation::Builder(TestName());
-  auto constant1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
-  auto constant2 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0f)));
-  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
-      r0f32_, HloOpcode::kAdd, constant1, constant2));
-  auto negate = builder.AddInstruction(
-      HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant2));
-  auto exp = builder.AddInstruction(
-      HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, negate));
-  auto mul = builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kMultiply, add, exp));
-  auto copy = builder.AddInstruction(
-      HloInstruction::CreateUnary(r0f32_, HloOpcode::kCopy, exp));
-
-  auto module = CreateNewModule();
-  auto computation =
-      module->AddEntryComputation(builder.Build(/*root_instruction=*/mul));
-
-  TF_CHECK_OK(add->AddControlDependencyTo(exp));
-  auto reachability = computation->ComputeReachability();
-
-  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
-  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
-  EXPECT_TRUE(reachability->IsReachable(constant1, add));
-  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
-  EXPECT_TRUE(reachability->IsReachable(constant1, exp));
-  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
-  EXPECT_TRUE(reachability->IsReachable(constant1, copy));
-
-  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
-  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
-  EXPECT_TRUE(reachability->IsReachable(constant2, add));
-  EXPECT_TRUE(reachability->IsReachable(constant2, negate));
-  EXPECT_TRUE(reachability->IsReachable(constant2, exp));
-  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
-  EXPECT_TRUE(reachability->IsReachable(constant2, copy));
-
-  EXPECT_FALSE(reachability->IsReachable(exp, constant1));
-  EXPECT_FALSE(reachability->IsReachable(exp, constant2));
-  EXPECT_FALSE(reachability->IsReachable(exp, add));
-  EXPECT_FALSE(reachability->IsReachable(exp, negate));
-  EXPECT_TRUE(reachability->IsReachable(exp, exp));
-  EXPECT_TRUE(reachability->IsReachable(exp, mul));
-  EXPECT_TRUE(reachability->IsReachable(exp, copy));
-
-  EXPECT_FALSE(reachability->IsReachable(mul, constant1));
-  EXPECT_FALSE(reachability->IsReachable(mul, constant2));
-  EXPECT_FALSE(reachability->IsReachable(mul, add));
-  EXPECT_FALSE(reachability->IsReachable(mul, negate));
-  EXPECT_FALSE(reachability->IsReachable(mul, exp));
-  EXPECT_TRUE(reachability->IsReachable(mul, mul));
-  EXPECT_FALSE(reachability->IsReachable(mul, copy));
-
-  EXPECT_TRUE(reachability->IsConnected(constant1, copy));
-  EXPECT_TRUE(reachability->IsConnected(copy, constant1));
-  EXPECT_FALSE(reachability->IsConnected(negate, add));
-  EXPECT_FALSE(reachability->IsConnected(add, negate));
-
-  // Remove the control dependency then update and verify the reachability map
-  ASSERT_IS_OK(add->RemoveControlDependencyTo(exp));
-  computation->UpdateReachabilityThroughInstruction(exp, reachability.get());
-
-  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
-  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
-  EXPECT_TRUE(reachability->IsReachable(constant1, add));
-  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
-  EXPECT_FALSE(reachability->IsReachable(constant1, exp));
-  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
-  EXPECT_FALSE(reachability->IsReachable(constant1, copy));
-
-  // Change a use within the graph then update and verify the reachability map
-  ASSERT_IS_OK(constant2->ReplaceUseWith(negate, constant1));
-  computation->UpdateReachabilityThroughInstruction(negate, reachability.get());
-
-  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
-  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
-  EXPECT_TRUE(reachability->IsReachable(constant2, add));
-  EXPECT_FALSE(reachability->IsReachable(constant2, negate));
-  EXPECT_FALSE(reachability->IsReachable(constant2, exp));
-  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
-  EXPECT_FALSE(reachability->IsReachable(constant2, copy));
-}
-
 TEST_F(HloComputationTest, Stringification) {
   const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
   const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
@@ -606,7 +512,7 @@ TEST_F(HloComputationTest, Stringification) {
       2, PrecisionConfig::DEFAULT);
   builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto options = HloPrintOptions().set_print_metadata(false);
@@ -641,7 +547,7 @@ TEST_F(HloComputationTest, StringificationIndent) {
       2, PrecisionConfig::DEFAULT);
   builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto options =
@@ -677,7 +583,7 @@ TEST_F(HloComputationTest, StringificationCanonical) {
       2, PrecisionConfig::DEFAULT);
   builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto options = HloPrintOptions().set_print_metadata(false);
@@ -700,27 +606,5 @@ TEST_F(HloComputationTest, StringificationCanonical) {
   EXPECT_EQ(computation->ToString(options), expected_computation2);
 }
 
-TEST_F(HloComputationTest, ChannelReachability) {
-  const Shape shape = ShapeUtil::MakeShape(F32, {5, 7});
-  HloComputation::Builder builder("ChannelReachability");
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, shape, "param"));
-  auto token0 = builder.AddInstruction(HloInstruction::CreateToken());
-  auto send =
-      builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1));
-  auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
-  auto token1 = builder.AddInstruction(HloInstruction::CreateToken());
-  auto recv =
-      builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1));
-  auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
-
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build(recv_done));
-  auto reachability = computation->ComputeReachability();
-  EXPECT_TRUE(reachability->IsReachable(param, recv_done));
-  EXPECT_FALSE(reachability->IsReachable(send, recv));
-  EXPECT_FALSE(reachability->IsReachable(send_done, recv));
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 4f898ce61c3f36e83e4b13130a404dbb4a2c36c6..5e37883d3d8d5067bab873ac6b5f732e7360c5fa 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -52,8 +52,10 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
           computation->root_instruction() != instruction) {
         continue;
       }
-      // Skip Constant, Parameter, and AfterAll operation.
-      // TODO(b/64407269): Enable Tuple once the timeout issue is resolved.
+      // Skip Constant, Parameter, Tuple, AfterAll operation.
+      // Tuple constants are not directly supported by any backends, hence
+      // folding Tuple is not useful and would in fact be expanded back into
+      // kTuple by Algebraic Simplifier.
       // TODO(b/110532604): Enable AfterAll once AfterAll requires at least one
       // operand in which case constant folding will be impossible and this
       // special case is not necessary.
@@ -63,6 +65,7 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
           instruction->opcode() == HloOpcode::kAfterAll) {
         continue;
       }
+
       // Skip instructions with non-constant operands.
       if (!hlo_query::AllOperandsAreConstants(*instruction)) {
         continue;
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index e45f905f7152c37a9ab2b41d407310671310c2a3..4f81dc94e577a63c09ae4019e5e8158252c712ce 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -22,22 +22,23 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/types.h"
 
-namespace op = xla::testing::opcode_matchers;
-
 namespace xla {
 namespace {
 
-using HloConstantFoldingTest = HloVerifiedTestBase;
+namespace m = xla::match;
+
+using HloConstantFoldingTest = HloTestBase;
 
 TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
   HloComputation::Builder builder(TestName());
@@ -46,16 +47,17 @@ TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {}), input));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Convert().WithOperand(0, m::Op().Is(input))));
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
-  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Constant()));
   EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement<int64>(),
             42);
 }
@@ -67,16 +69,17 @@ TEST_F(HloConstantFoldingTest, ConvertS64ToF32) {
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(F32, {}), input));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Convert().WithOperand(0, m::Op().Is(input))));
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
-  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Constant()));
   EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement<float>(),
             42.0f);
 }
@@ -88,16 +91,17 @@ TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) {
   builder.AddInstruction(
       HloInstruction::CreateConvert(ShapeUtil::MakeShape(S64, {2}), input));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Convert().WithOperand(0, m::Op().Is(input))));
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
-  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Constant()));
   EXPECT_EQ(computation->root_instruction()->literal().Get<int64>({0}), 42);
   EXPECT_EQ(computation->root_instruction()->literal().Get<int64>({1}), 19);
 }
@@ -130,15 +134,15 @@ TEST_F(HloConstantFoldingTest, Concatenate) {
     Shape shape = ShapeUtil::MakeShape(F32, dimensions);
     builder.AddInstruction(HloInstruction::CreateConcatenate(
         shape, operands, test_config.concat_dimension));
-    auto module = CreateNewModule();
+    auto module = CreateNewVerifiedModule();
     auto computation = module->AddEntryComputation(builder.Build());
 
     HloConstantFolding const_folder;
-    TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+    TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
     EXPECT_TRUE(result);
 
     HloInstruction* root = computation->root_instruction();
-    EXPECT_THAT(root, op::Constant());
+    EXPECT_THAT(root, GmockMatch(m::Constant()));
     EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
   }
 }
@@ -157,15 +161,15 @@ TEST_F(HloConstantFoldingTest, Slice) {
   Shape shape = ShapeUtil::MakeShape(F32, {6, 6, 3, 4, 4});
   builder.AddInstruction(HloInstruction::CreateSlice(
       shape, literal_instruction, slice_start, slice_limits, slice_strides));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
 }
 
@@ -182,15 +186,15 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
   const int64 permutation[] = {1, 2, 0, 4, 3};
   builder.AddInstruction(
       HloInstruction::CreateTranspose(shape, literal_instruction, permutation));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), shape));
 
   using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
@@ -219,27 +223,29 @@ const char* const kConstantFoldReduce = R"(
   })";
 
 TEST_F(HloConstantFoldingTest, ConstantFoldReduce) {
-  ParseAndVerifyModule(kConstantFoldReduce);
+  TF_ASSERT_OK_AND_ASSIGN(auto m,
+                          ParseAndReturnVerifiedModule(kConstantFoldReduce));
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(m.get()));
   EXPECT_TRUE(result);
 
-  EXPECT_EQ(6, module()
-                   .entry_computation()
+  EXPECT_EQ(6, m->entry_computation()
                    ->root_instruction()
                    ->literal()
                    .GetFirstElement<int32>());
 }
 
 TEST_F(HloConstantFoldingTest, ConstantFoldReduceNoLayout) {
-  ParseAndVerifyModule(kConstantFoldReduce);
-  HloInstruction* add = module().computations().begin()->root_instruction();
+  TF_ASSERT_OK_AND_ASSIGN(auto m,
+                          ParseAndReturnVerifiedModule(kConstantFoldReduce));
+  HloInstruction* add = m->computations().begin()->root_instruction();
   LayoutUtil::ClearLayout(add->mutable_shape());
   HloConstantFolding const_folder;
-  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(m.get()));
   EXPECT_FALSE(result);
 
-  EXPECT_THAT(module().entry_computation()->root_instruction(), op::Reduce());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Reduce()));
 }
 
 const char* const kConstantFoldLargePad = R"(
@@ -259,7 +265,7 @@ TEST_F(HloConstantFoldingTest, DoesNotFoldLargePad) {
   EXPECT_FALSE(result);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Pad(op::Constant(), op::Constant()));
+              GmockMatch(m::Pad(m::Constant(), m::Constant())));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index a502fff9a0f1e40065746f2193bf76b1adefdb31..df7d3826dbad1f264a5dc53312c062900155b0f6 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -269,7 +269,7 @@ Status HloCostAnalysis::HandleOutfeed(const HloInstruction*) {
 Status HloCostAnalysis::HandleMap(const HloInstruction* map) {
   // Compute properties of the mapped function.
   TF_ASSIGN_OR_RETURN(const Properties sub_properties,
-                      ProcessSubcomputation(map->to_apply()));
+                      ProcessNestedSubcomputation(map->to_apply()));
 
   // Compute the cost of all elements for this Map operation.
   const int64 element_count = ShapeUtil::ElementsIn(map->shape());
@@ -285,7 +285,7 @@ Status HloCostAnalysis::HandleReduce(const HloInstruction* reduce) {
   HloComputation* function = reduce->to_apply();
   // Compute the cost of the user function.
   TF_ASSIGN_OR_RETURN(const Properties sub_properties,
-                      ProcessSubcomputation(function));
+                      ProcessNestedSubcomputation(function));
 
   // Compute the cost of all elements for this Reduce operation.
   // This counts the number of times the reduction function is applied, so it
@@ -311,7 +311,7 @@ Status HloCostAnalysis::HandleReduceWindow(
   auto function = reduce_window->to_apply();
   // Compute the properties of the reduction function.
   TF_ASSIGN_OR_RETURN(const Properties sub_properties,
-                      ProcessSubcomputation(function));
+                      ProcessNestedSubcomputation(function));
 
   // Compute the cost of all elements for this ReduceWindow operation. For each
   // output element there are window_size - 1 reductions to perform.
@@ -336,9 +336,9 @@ Status HloCostAnalysis::HandleSelectAndScatter(
   // Compute the properties of the select and scatter function.
   // Compute the properties of the reduction function.
   TF_ASSIGN_OR_RETURN(const Properties select_properties,
-                      ProcessSubcomputation(instruction->select()));
+                      ProcessNestedSubcomputation(instruction->select()));
   TF_ASSIGN_OR_RETURN(const Properties scatter_properties,
-                      ProcessSubcomputation(instruction->scatter()));
+                      ProcessNestedSubcomputation(instruction->scatter()));
 
   // Compute the cost of all elements for this operation. For each scatter
   // source element there are window_size - 1 select computations to perform and
@@ -419,6 +419,21 @@ Status HloCostAnalysis::HandleTranspose(const HloInstruction*) {
 }
 
 Status HloCostAnalysis::HandleAfterAll(const HloInstruction*) {
+  // This instruction is used to enforce ordering at compile time. No code is
+  // emitted.
+  current_should_compute_bottleneck_time_ = false;
+  current_properties_[kBytesAccessedKey] = 0;
+  current_properties_[kOptimalSecondsKey] = 0;
+  return Status::OK();
+}
+
+Status HloCostAnalysis::HandleAddDependency(
+    const HloInstruction* add_dependency) {
+  // This instruction is used to enforce ordering at compile time. No code is
+  // emitted.
+  current_should_compute_bottleneck_time_ = false;
+  current_properties_[kBytesAccessedKey] = 0;
+  current_properties_[kOptimalSecondsKey] = 0;
   return Status::OK();
 }
 
@@ -574,7 +589,7 @@ Status HloCostAnalysis::HandleRng(const HloInstruction* random) {
 Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) {
   TF_ASSIGN_OR_RETURN(
       current_properties_,
-      ProcessSubcomputation(fusion->fused_instructions_computation()));
+      ProcessNestedSubcomputation(fusion->fused_instructions_computation()));
 
   // Fusion nodes that produce a tuple also produce the entries in the tuple.
   // Ignore the memory accessed inside fused ops, since fusion is supposed to
@@ -595,7 +610,7 @@ Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) {
 
 Status HloCostAnalysis::HandleCall(const HloInstruction* call) {
   TF_ASSIGN_OR_RETURN(current_properties_,
-                      ProcessSubcomputation(call->to_apply()));
+                      ProcessUnnestedSubcomputation(call->to_apply()));
   current_should_compute_bottleneck_time_ = false;
   return Status::OK();
 }
@@ -624,13 +639,12 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) {
   // Since the number of iterations of the while node will not always be
   // something that we can statically analyze, we cannot precisely compute the
   // cost of a while node. For now compute the cost of a single iteration.
-  //
-  // TODO(b/26346211): Improve the cost analysis for while nodes.
   TF_ASSIGN_OR_RETURN(const Properties body_properties,
-                      ProcessSubcomputation(xla_while->while_body()));
+                      ProcessUnnestedSubcomputation(xla_while->while_body()));
 
-  TF_ASSIGN_OR_RETURN(const Properties condition_properties,
-                      ProcessSubcomputation(xla_while->while_condition()));
+  TF_ASSIGN_OR_RETURN(
+      const Properties condition_properties,
+      ProcessUnnestedSubcomputation(xla_while->while_condition()));
 
   current_properties_.clear();
   for (const auto& property : body_properties) {
@@ -647,10 +661,12 @@ Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) {
 Status HloCostAnalysis::HandleConditional(const HloInstruction* conditional) {
   // Compute the cost of the true and false computations and take the maximum
   // from those for each property.
-  TF_ASSIGN_OR_RETURN(const Properties true_computation_properties,
-                      ProcessSubcomputation(conditional->true_computation()));
-  TF_ASSIGN_OR_RETURN(const Properties false_computation_properties,
-                      ProcessSubcomputation(conditional->false_computation()));
+  TF_ASSIGN_OR_RETURN(
+      const Properties true_computation_properties,
+      ProcessUnnestedSubcomputation(conditional->true_computation()));
+  TF_ASSIGN_OR_RETURN(
+      const Properties false_computation_properties,
+      ProcessUnnestedSubcomputation(conditional->false_computation()));
   current_properties_ = true_computation_properties;
   for (const auto& property : false_computation_properties) {
     if (!tensorflow::gtl::InsertIfNotPresent(&current_properties_, property)) {
@@ -664,12 +680,33 @@ Status HloCostAnalysis::HandleConditional(const HloInstruction* conditional) {
 }
 
 Status HloCostAnalysis::HandleGather(const HloInstruction* gather) {
+  // Gather doesn't read the whole input buffer, it's equivalent to a copy the
+  // size of the output shape and a read of the gather indices.
+  current_properties_[kBytesAccessedKey] =
+      GetShapeSize(gather->shape()) * 2 +
+      GetShapeSize(gather->operand(1)->shape());
   // Gather does not issue any flops.
   return Status::OK();
 }
 
 Status HloCostAnalysis::HandleScatter(const HloInstruction* scatter) {
-  // TODO(b/32945756): Compute the properties of the sub-computation.
+  current_properties_[kBytesAccessedKey] =
+      GetShapeSize(scatter->operand(2)->shape()) * 2 +
+      GetShapeSize(scatter->operand(1)->shape());
+  const int64 element_count =
+      ShapeUtil::ElementsIn(scatter->operand(2)->shape());
+  TF_ASSIGN_OR_RETURN(const Properties sub_properties,
+                      ProcessNestedSubcomputation(scatter->to_apply()));
+  for (const auto& property : sub_properties) {
+    if (property.first != kBytesAccessedKey) {
+      current_properties_[property.first] = property.second * element_count;
+    }
+  }
+  return Status::OK();
+}
+
+Status HloCostAnalysis::HandleGetDimensionSize(
+    const HloInstruction* /*get_size*/) {
   return Status::OK();
 }
 
@@ -709,10 +746,19 @@ float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const {
   return GetPropertyForHlo(hlo, kOptimalSecondsKey, hlo_properties_);
 }
 
-StatusOr<HloCostAnalysis::Properties> HloCostAnalysis::ProcessSubcomputation(
-    HloComputation* computation) {
+StatusOr<HloCostAnalysis::Properties>
+HloCostAnalysis::ProcessNestedSubcomputation(HloComputation* computation) {
+  HloCostAnalysis visitor(shape_size_, per_second_rates_);
+  TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  return visitor.properties();
+}
+
+StatusOr<HloCostAnalysis::Properties>
+HloCostAnalysis::ProcessUnnestedSubcomputation(HloComputation* computation) {
   HloCostAnalysis visitor(shape_size_, per_second_rates_);
   TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+  hlo_properties_.insert(visitor.hlo_properties_.begin(),
+                         visitor.hlo_properties_.end());
   return visitor.properties();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 46b4bbeef222e6de581360fc01b293e812f1dedd..33983119c9b00a248c0e8dcc5815c6367192dca3 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -101,12 +101,14 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleBroadcast(const HloInstruction* broadcast) override;
   Status HandlePad(const HloInstruction* pad) override;
   Status HandleReshape(const HloInstruction* reshape) override;
+  Status HandleAddDependency(const HloInstruction* add_dependency) override;
   Status HandleAfterAll(const HloInstruction* token) override;
   Status HandleTranspose(const HloInstruction* transpose) override;
   Status HandleWhile(const HloInstruction* xla_while) override;
   Status HandleConditional(const HloInstruction* conditional) override;
   Status HandleGather(const HloInstruction* gather) override;
   Status HandleScatter(const HloInstruction* scatter) override;
+  Status HandleGetDimensionSize(const HloInstruction* get_size) override;
   Status FinishVisit(const HloInstruction* root) override;
 
   Status Preprocess(const HloInstruction* hlo) override;
@@ -153,7 +155,24 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
 
   // Returns the properties computed from visiting the computation rooted at the
   // given hlo.
-  StatusOr<Properties> ProcessSubcomputation(HloComputation* computation);
+  //
+  // The difference between ProcessNestedSubcomputation and
+  // ProcessUnnestedSubcomputation is that we expect to get profile results for
+  // an unnested subcomputation's individual instructions, while we expect that
+  // a nested subcomputation is completely subsumed by its parent.
+  //
+  // For example, subcomputations inside kFusion and kMap are considered nested,
+  // while subcomputations inside kWhile and kConditional are considered
+  // unnested.
+  //
+  // Another way of thinking of this is, kFusion is implemented on the GPU
+  // backend using just one GPU kernel, while kWhile's body is implemented as a
+  // sequence of kernels, one for each HLO therein.  Backends don't necessarily
+  // need to follow this same implementation strategy, but we assume they do for
+  // the purposes of this platform-generic cost analysis.
+  StatusOr<Properties> ProcessNestedSubcomputation(HloComputation* computation);
+  StatusOr<Properties> ProcessUnnestedSubcomputation(
+      HloComputation* computation);
 
   // Utility function to handle all element-wise operations.
   Status HandleElementwiseOp(const HloInstruction* hlo_instruction);
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index d76ce9ecbca67ae3bc3db4ee2452f30ccec5b88b..ff32faf298dd1f04c5b769f2a88f76a7a1e18ae7 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -387,7 +387,7 @@ TEST_F(FusionCostAnalysis, LoopFusion) {
         HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, mul, clamp));
     auto tuple = HloInstruction::CreateTuple({sub, sub, mul, c1});
 
-    auto module = CreateNewModule();
+    auto module = CreateNewVerifiedModule();
     auto* computation = module->AddEntryComputation(builder.Build());
     auto* fusion = computation->CreateFusionInstruction(
         {sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop);
@@ -429,7 +429,7 @@ TEST_F(FusionCostAnalysis, NoLayout) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       shape_with_layout, HloOpcode::kAdd, c1, broadcast));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {add, broadcast}, HloInstruction::FusionKind::kLoop);
@@ -472,7 +472,7 @@ TEST_F(DomainCostAnalysis, DomainCost) {
   auto domain = builder.AddInstruction(
       HloInstruction::CreateDomain(tuple->shape(), tuple, nullptr, nullptr));
 
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(hlo_module->entry_computation()->root_instruction(), domain);
@@ -556,5 +556,56 @@ TEST_F(HloCostAnalysisTest, DynamicUpdateSlice) {
   EXPECT_EQ(analysis.bytes_accessed(), 8);
 }
 
+TEST_F(HloCostAnalysisTest, Gather) {
+  // Test the analysis on a gather.
+  XlaBuilder builder("gather");
+  Shape operand_shape = ShapeUtil::MakeShape(S32, {3, 3});
+  Shape indices_shape = ShapeUtil::MakeShape(S32, {2});
+
+  auto operand = Parameter(&builder, 0, operand_shape, "operand");
+  auto indices = Parameter(&builder, 1, indices_shape, "indices");
+  GatherDimensionNumbers dim_numbers;
+  dim_numbers.add_offset_dims(1);
+  dim_numbers.add_collapsed_slice_dims(0);
+  dim_numbers.add_start_index_map(0);
+  dim_numbers.set_index_vector_dim(1);
+  Gather(operand, indices, dim_numbers, {1, 3});
+
+  auto hlo_module = BuildHloGraph(&builder);
+
+  // Run HLO cost analysis.
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  EXPECT_EQ(analysis.bytes_accessed(), 56);
+}
+
+TEST_F(HloCostAnalysisTest, Scatter) {
+  // Test the analysis on a scatter.
+  XlaBuilder builder("scatter");
+  Shape operand_shape = ShapeUtil::MakeShape(F32, {3, 3});
+  Shape indices_shape = ShapeUtil::MakeShape(S32, {2});
+  Shape values_shape = ShapeUtil::MakeShape(F32, {2, 3});
+
+  auto operand = Parameter(&builder, 0, operand_shape, "operand");
+  auto indices = Parameter(&builder, 1, indices_shape, "indices");
+  auto values = Parameter(&builder, 2, values_shape, "values");
+  ScatterDimensionNumbers dim_numbers;
+  dim_numbers.set_index_vector_dim(1);
+  dim_numbers.add_update_window_dims(1);
+  dim_numbers.add_inserted_window_dims(0);
+  dim_numbers.add_scatter_dims_to_operand_dims(0);
+  Scatter(operand, indices, values, add_, dim_numbers);
+
+  auto hlo_module = BuildHloGraph(&builder);
+
+  // Run HLO cost analysis.
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  EXPECT_EQ(analysis.bytes_accessed(), 4 * (2 + 2 * (2 * 3)));
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
index e07a196d1154dc0ea45ccd2f15b0b9b56f7c41f8..aaa9ec60eb3c4e0159ed40b37d772e0973d306ec 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -19,22 +19,22 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
 
-class HloCreationUtilsTest : public HloVerifiedTestBase {
+class HloCreationUtilsTest : public HloTestBase {
  protected:
-  HloModule* CreateModuleWithProgramShape(
+  std::unique_ptr<VerifiedHloModule> CreateModuleWithProgramShape(
       PrimitiveType primitive_type, absl::Span<const int64> input_shape_dims,
       absl::Span<const int64> output_shape_dims, HloInstruction** param,
       HloComputation** entry_computation) {
     Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims);
     Shape output_shape =
         ShapeUtil::MakeShape(primitive_type, output_shape_dims);
-    auto module = CreateNewModule("test");
+    auto module = CreateNewVerifiedModule("test");
     *entry_computation = module->AddEntryComputation(
         CreateComputationWithSignature({&input_shape}, output_shape, "entry")
             .ValueOrDie());
@@ -47,10 +47,9 @@ TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{2},
-                                                   /*output_shape_dims=*/{2},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2},
+                                             /*output_shape_dims=*/{2}, &param,
+                                             &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_1_dims_collapsed,
                           CollapseFirstNDims(param, 1));
@@ -67,9 +66,8 @@ TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(
-      S32,
-      /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, &param,
+  auto module = CreateModuleWithProgramShape(
+      S32, /*input_shape_dims=*/{2, 3, 2}, /*output_shape_dims=*/{6, 2}, &param,
       &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_2_dims_collapsed,
@@ -92,10 +90,9 @@ TEST_F(HloCreationUtilsTest, Prepend1DegenerateDim) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{2},
-                                                   /*output_shape_dims=*/{1, 2},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2},
+                                             /*output_shape_dims=*/{1, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_1_degenerate_dim_prepended,
                           PrependDegenerateDims(param, 1));
@@ -113,10 +110,9 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(
-      S32,
-      /*input_shape_dims=*/{2}, /*output_shape_dims=*/{1, 1, 2}, &param,
-      &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2},
+                                             /*output_shape_dims=*/{1, 1, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
                           PrependDegenerateDims(param, 2));
@@ -134,10 +130,9 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{},
-                                                   /*output_shape_dims=*/{1, 1},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{},
+                                             /*output_shape_dims=*/{1, 1},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * with_2_degenerate_dims_prepended,
                           PrependDegenerateDims(param, 2));
@@ -154,10 +149,9 @@ TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(
-      S32,
-      /*input_shape_dims=*/{6}, /*output_shape_dims=*/{3, 1, 2}, &param,
-      &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{6},
+                                             /*output_shape_dims=*/{3, 1, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(HloInstruction * first_dim_expanded,
                           ExpandFirstDimIntoNDims(param, {3, 1, 2}));
@@ -176,10 +170,9 @@ TEST_F(HloCreationUtilsTest, PadVectorWithZeros) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{2},
-                                                   /*output_shape_dims=*/{6},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{2},
+                                             /*output_shape_dims=*/{6}, &param,
+                                             &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(
       HloInstruction * zero_padded_param,
@@ -197,10 +190,9 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(S32,
-                                                   /*input_shape_dims=*/{},
-                                                   /*output_shape_dims=*/{2, 2},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(S32, /*input_shape_dims=*/{},
+                                             /*output_shape_dims=*/{2, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(
       HloInstruction * zeros,
@@ -218,10 +210,9 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) {
   HloInstruction* param;
   HloComputation* entry_computation;
 
-  HloModule* module = CreateModuleWithProgramShape(F32,
-                                                   /*input_shape_dims=*/{},
-                                                   /*output_shape_dims=*/{2, 2},
-                                                   &param, &entry_computation);
+  auto module = CreateModuleWithProgramShape(F32, /*input_shape_dims=*/{},
+                                             /*output_shape_dims=*/{2, 2},
+                                             &param, &entry_computation);
 
   TF_ASSERT_OK_AND_ASSIGN(
       HloInstruction * zeros,
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 9b18b0284f63c25934c1b7118dc8973caa62cadc..1eb0260468c4560985027947e89c62cc21139e7e 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -44,7 +44,7 @@ namespace op = xla::testing::opcode_matchers;
 namespace xla {
 namespace {
 
-class HloCseTest : public HloVerifiedTestBase {
+class HloCseTest : public HloTestBase {
  protected:
   HloCseTest() {}
 };
@@ -59,13 +59,13 @@ TEST_F(HloCseTest, CombineTwoConstants) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(2, computation->instruction_count());
   HloInstruction* constant = *computation->instructions().begin();
@@ -89,14 +89,14 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
   EXPECT_THAT(add, op::Add(constant1, constant2));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(2, computation->instruction_count());
   auto first_operand = add->operand(0);
@@ -121,14 +121,14 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
   EXPECT_THAT(add, op::Add(constant1, constant2));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
   EXPECT_THAT(add, op::Add(constant1, constant2));
@@ -171,13 +171,13 @@ TEST_F(HloCseTest, ConstantsSameValueDifferentType) {
         shape_r0, HloOpcode::kAdd, root, constants[i]));
   }
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(20, computation->instruction_count());
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   // CSE will remove both the second float(42.0f) and the corresponding
   // convert/cast.
@@ -201,7 +201,7 @@ TEST_F(HloCseTest, NonscalarConstants) {
   auto tuple = builder.AddInstruction(HloInstruction::CreateTuple(
       {common_constant1, common_constant2, uncommon_constant}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
@@ -209,7 +209,7 @@ TEST_F(HloCseTest, NonscalarConstants) {
               op::Tuple(common_constant1, common_constant2, uncommon_constant));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
   auto first_operand = tuple->operand(0);
@@ -233,14 +233,14 @@ TEST_F(HloCseTest, IdenticalInstructions) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2, exp3}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(5, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(exp1, exp2, exp3));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
   auto first_operand = tuple->operand(0);
@@ -250,7 +250,7 @@ TEST_F(HloCseTest, IdenticalInstructions) {
 
 // Test two identical while loops with same inputs
 TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesSameInput) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule WhileLoopsIdenticalConditionsAndBodiesSameInput
 
     %body (param: (f32[], f32[])) -> (f32[], f32[]) {
@@ -277,21 +277,21 @@ index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
 f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body ROOT
 %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
 condition=%condition.1, body=%body
-    }
-    )");
+    })";
 
-  auto computation = module().entry_computation();
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  auto computation = m->entry_computation();
 
   EXPECT_EQ(5, computation->instruction_count());
   HloCSE cse(true);
-  EXPECT_TRUE(cse.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(cse.Run(m.get()).ValueOrDie());
   EXPECT_EQ(4, computation->instruction_count());
 }
 
 // Test two while loops with same conditions, same inputs, but different
 // bodies
 TEST_F(HloCseTest, WhileLoopsIdenticalConditionsSameInputAndDifferentBodies) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule WhileLoopsIdenticalConditionsSameInputAndDifferentBodies
 
     %body (param: (f32[], f32[])) -> (f32[], f32[]) {
@@ -327,20 +327,20 @@ index=1 %sub = f32[] subtract(f32[] %get-tuple-element.2, f32[]
       %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
 condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[],
 f32[]) %tuple.1), condition=%condition.1, body=%body2
-    }
-    )");
+    })";
 
-  auto computation = module().entry_computation();
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  auto computation = m->entry_computation();
 
   EXPECT_EQ(5, computation->instruction_count());
   HloCSE cse(true);
-  EXPECT_FALSE(cse.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(cse.Run(m.get()).ValueOrDie());
   EXPECT_EQ(5, computation->instruction_count());
 }
 
 // Test two identical while loops with different inputs
 TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesDifferentInput) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule WhileLoopsIdenticalConditionsAndBodiesDifferentInput
 
     %body (param: (f32[], f32[])) -> (f32[], f32[]) {
@@ -369,22 +369,21 @@ condition=%condition, body=%body %constant.4 = f32[] constant(1) %constant.5 =
 f32[] constant(2) %tuple.2 = (f32[], f32[]) tuple(f32[] %constant.4, f32[]
 %constant.5) ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.2),
 condition=%condition.1, body=%body
-    }
-
-    )");
+    })";
 
-  auto computation = module().entry_computation();
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  auto computation = m->entry_computation();
 
   EXPECT_EQ(8, computation->instruction_count());
   HloCSE cse(true);
-  EXPECT_FALSE(cse.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(cse.Run(m.get()).ValueOrDie());
   EXPECT_EQ(8, computation->instruction_count());
 }
 
 // Test two while loops with identical bodies and same inputs, but different
 // conditions
 TEST_F(HloCseTest, WhileLoopsIdenticalBodiesAndInputDifferntConditions) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule WhileLoopsIdenticalBodiesAndInputDifferntConditions
 
     %body (param: (f32[], f32[])) -> (f32[], f32[]) {
@@ -411,13 +410,14 @@ f32[]) { %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2)
       %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
 condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[],
 f32[]) %tuple.1), condition=%condition.1, body=%body
-    })");
+    })";
 
-  auto computation = module().entry_computation();
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  auto computation = m->entry_computation();
 
   EXPECT_EQ(5, computation->instruction_count());
   HloCSE cse(true);
-  EXPECT_FALSE(cse.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(cse.Run(m.get()).ValueOrDie());
   EXPECT_EQ(5, computation->instruction_count());
 }
 
@@ -439,14 +439,14 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 
   HloCSE cse(/*is_layout_sensitive=*/true);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(4, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
@@ -470,14 +470,14 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({exp1, exp2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(exp1, exp2));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
   auto first_operand = tuple->operand(0);
@@ -488,7 +488,7 @@ TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsInsensitive) {
 TEST_F(HloCseTest, FusionInternalCSE) {
   // Test that we can CSE expressions that live within a fusion node
   // computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape_r0 = ShapeUtil::MakeShape(F32, {});
@@ -512,7 +512,7 @@ TEST_F(HloCseTest, FusionInternalCSE) {
 
   EXPECT_EQ(5, fused_computation->instruction_count());
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
   EXPECT_EQ(4, fused_computation->instruction_count());
 
   auto root = fused_computation->root_instruction();
@@ -554,14 +554,14 @@ TEST_F(HloCseTest, IdenticalExpressions) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({add1, add2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(8, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(op::Add(negate1, exp1), op::Add(negate2, exp2)));
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(module).ValueOrDie());
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(5, computation->instruction_count());
   auto operand = tuple->operand(0);
@@ -586,7 +586,7 @@ TEST_F(HloCseTest, DoNotCombineRng) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, rng1, rng2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* root = computation->root_instruction();
@@ -595,7 +595,7 @@ TEST_F(HloCseTest, DoNotCombineRng) {
   uint32 count_before = computation->instruction_count();
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   uint32 count_after = computation->instruction_count();
   EXPECT_EQ(count_before, count_after);
@@ -607,7 +607,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
   // Test that two calls to an impure function are not commoned. RNG
   // is the source of the impurity.
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   // rng_function is an impure function because it does RNG.
   HloComputation* rng_function = nullptr;
@@ -649,7 +649,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
   VLOG(3) << "before: " << module->ToString();
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   VLOG(3) << "after: " << module->ToString();
 
@@ -659,7 +659,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
 }
 
 TEST_F(HloCseTest, CompareComputations) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
     HloModule m
 
     add_computation {
@@ -680,11 +680,12 @@ TEST_F(HloCseTest, CompareComputations) {
       r1 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation
       r2 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation2
       ROOT f2 = (f32[],f32[]) tuple(r1, r2)
-    })");
+    })";
 
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(&module()).ValueOrDie());
-  HloInstruction* root = module().entry_computation()->root_instruction();
+  EXPECT_TRUE(cse.Run(m.get()).ValueOrDie());
+  HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_EQ(root->operand(0), root->operand(1));
 }
 
@@ -697,19 +698,19 @@ TEST_F(HloCseTest, ConstantsSameValueInDifferentDomains) {
   builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(42)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, computation->instruction_count());
 
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_FALSE(cse.Run(module).ValueOrDie());
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(2, computation->instruction_count());
 }
 
 TEST_F(HloCseTest, Domain) {
-  ParseAndVerifyModule(R"(
+  const char* const hlo_string = R"(
 HloModule module
 ENTRY %entry {
   %param = f32[] parameter(0), sharding={maximal device=0}
@@ -730,11 +731,12 @@ ENTRY %entry {
     domain={kind="sharding", entry={maximal device=2}, exit={maximal device=0}}
   %add = f32[] add(%domain.3, %domain.4)
   ROOT %sub = f32[] subtract(%add, %domain.5)
-})");
+})";
 
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
   HloCSE cse(/*is_layout_sensitive=*/false);
-  EXPECT_TRUE(cse.Run(&module()).ValueOrDie());
-  const HloInstruction* sub = module().entry_computation()->root_instruction();
+  EXPECT_TRUE(cse.Run(m.get()).ValueOrDie());
+  const HloInstruction* sub = m->entry_computation()->root_instruction();
   const HloInstruction* add = sub->operand(0);
   EXPECT_EQ(add->operand(0), add->operand(1));
   EXPECT_NE(add->operand(0), sub->operand(1));
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index c22adcdd8dd936eebca3a8f0d85b1254401b5ef4..3ed3d3c11c71dc534f193ba3ffb556b0eb0c80e4 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -126,7 +126,7 @@ bool HloDataflowAnalysis::ValueIsDefinedAt(const HloInstruction* instruction,
 
 const HloValue& HloDataflowAnalysis::GetValueDefinedAt(
     const HloInstruction* instruction, const ShapeIndex& index) const {
-  CHECK(ValueIsDefinedAt(instruction, index));
+  CHECK(ValueIsDefinedAt(instruction, index)) << instruction->ToString();
   return GetUniqueValueAt(instruction, index);
 }
 
@@ -466,6 +466,21 @@ bool HloDataflowAnalysis::UpdateDomainValueSet(HloInstruction* domain) {
   return changed;
 }
 
+bool HloDataflowAnalysis::UpdateAddDependencyValueSet(
+    HloInstruction* add_dependency) {
+  // AddDependency just forwards the value of its zero-th operand.
+  CHECK_EQ(add_dependency->opcode(), HloOpcode::kAddDependency);
+  const InstructionValueSet& operand_set =
+      GetInstructionValueSet(add_dependency->operand(0));
+  InstructionValueSet& add_dependency_set =
+      GetInstructionValueSet(add_dependency);
+  if (operand_set != add_dependency_set) {
+    add_dependency_set = operand_set;
+    return true;
+  }
+  return false;
+}
+
 bool HloDataflowAnalysis::UpdateGetTupleElementValueSet(HloInstruction* gte) {
   CHECK_EQ(gte->opcode(), HloOpcode::kGetTupleElement);
   bool changed = false;
@@ -622,6 +637,8 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
     HloInstruction* instruction) {
   // Recompute from operands.
   switch (instruction->opcode()) {
+    case HloOpcode::kAddDependency:
+      return UpdateAddDependencyValueSet(instruction);
     case HloOpcode::kBitcast:
       return UpdateBitcastValueSet(instruction);
     case HloOpcode::kDomain:
@@ -795,6 +812,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
             define_all_values();
           }
           break;
+        case HloOpcode::kAddDependency:
         case HloOpcode::kWhile:
         case HloOpcode::kCall:
         case HloOpcode::kConditional:
@@ -1048,6 +1066,7 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
   }
 
   if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
+      user->opcode() == HloOpcode::kScatter ||
       user->opcode() == HloOpcode::kWhile) {
     // We eliminated other users in BufferLiveness::live_range_strictly_before,
     // so here we just need to check that the use is at operand index 0.
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index abac398c04fc4c418d8814a0097db4434bc1cd9c..ece17fc4c3ea0261474df5d53c088dd05016e1e4 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -193,6 +193,7 @@ class HloDataflowAnalysis {
   bool UpdateSendValueSet(HloInstruction* send);
   bool UpdateTupleValueSet(HloInstruction* tuple);
   bool UpdateWhileValueSet(HloInstruction* xla_while);
+  bool UpdateAddDependencyValueSet(HloInstruction* add_dependency);
 
   // Propagate the dataflow through the module.
   void Propagate();
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 510d6360a1cf94ef06d2ed919a57c7a825886834..f7a1f19a6f52befd58a405d0e406d7d0d37a8e57 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -43,7 +43,7 @@ using ::testing::UnorderedElementsAre;
 class HloDataflowAnalysisTest : public HloTestBase,
                                 public ::testing::WithParamInterface<bool> {
  protected:
-  HloDataflowAnalysisTest() : module_(CreateNewModule()) {}
+  HloDataflowAnalysisTest() : module_(CreateNewVerifiedModule()) {}
 
   // Run dataflow analysis on the member module. For convenience returns a
   // reference to the generated analysis stored in analysis_.
@@ -1877,6 +1877,30 @@ TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
   }
 }
 
+TEST_P(HloDataflowAnalysisTest, AddDependency) {
+  string module_string = R"(
+HloModule AddDependency
+ENTRY %AddDependency (p: f32[3]) -> f32[3] {
+  %p = f32[3] parameter(0)
+  %token = token[] after-all()
+  ROOT %add_dep = f32[3] add-dependency(f32[3] %p, token[] %token)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloDataflowAnalysis> analysis,
+                          HloDataflowAnalysis::Run(*module));
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAddDependency);
+
+  // The after-all and parameter should define a value. Add-dependency should
+  // not.
+  EXPECT_EQ(analysis->values().size(), 2);
+  EXPECT_FALSE(analysis->ValueIsDefinedAt(root));
+}
+
 INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
                         HloDataflowAnalysisTest,
                         ::testing::Values(false, true));
@@ -1884,7 +1908,7 @@ INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
 class HloDataflowAnalysisTestBase : public HloTestBase {
  protected:
   void BuildModule(std::unique_ptr<HloComputation> computation) {
-    module_ = CreateNewModule();
+    module_ = CreateNewUnverifiedModule();
     computation_ = module_->AddEntryComputation(std::move(computation));
   }
 
@@ -2283,6 +2307,44 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
       dataflow_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {}));
 }
 
+TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
+  const char* hlo_text = R"(
+    HloModule TensorFlowScatterV1
+
+    update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+      lhs = s32[] parameter(0)
+      ROOT rhs = s32[] parameter(1)
+    }
+
+    ENTRY main {
+      operand = s32[3,3] parameter(0)
+      indices = s32[2] parameter(1)
+      updates = s32[2,3] parameter(2)
+      ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+          to_apply=update_s32,
+          update_window_dims={1},
+          inserted_window_dims={0},
+          scatter_dims_to_operand_dims={0},
+          index_vector_dim=1
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(module_, ParseHloString(hlo_text));
+  computation_ = module_->entry_computation();
+  RunAnalysis();
+
+  HloInstruction* operand_param = computation_->parameter_instruction(0);
+  HloInstruction* indices_param = computation_->parameter_instruction(1);
+  HloInstruction* updates_param = computation_->parameter_instruction(2);
+  HloInstruction* scatter = computation_->root_instruction();
+
+  EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(
+      operand_param, {}, scatter, {}));
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(
+      indices_param, {}, scatter, {}));
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(
+      updates_param, {}, scatter, {}));
+}
+
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -2308,7 +2370,8 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
   auto values = builder.AddInstruction(
       HloInstruction::CreateParameter(1, values_shape, "values"));
   auto sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys, values));
+      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys,
+      {values}));
 
   BuildModuleAndRunAnalysis(builder.Build());
 
@@ -2437,7 +2500,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
     return builder.Build();
   };
 
-  module_ = CreateNewModule();
+  module_ = CreateNewUnverifiedModule();
   HloComputation* cond_computation =
       module_->AddEmbeddedComputation(make_cond());
   HloComputation* body_computation =
@@ -2472,7 +2535,7 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
   auto add = sub_builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones));
 
-  module_ = CreateNewModule();
+  module_ = CreateNewUnverifiedModule();
   auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build());
   sub_computation->CreateFusionInstruction({add, ones},
                                            HloInstruction::FusionKind::kLoop);
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index 3b5cde2996c4195ef458662cd21de85a832d8d55..1fa4259a3e42286cbc911907eea563e6ca6f8611 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -59,7 +59,7 @@ TEST_F(HloDceTest, NoDeadCode) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
@@ -80,7 +80,7 @@ TEST_F(HloDceTest, InstructionsWithSideEffect) {
       HloInstruction::CreateSend(constant, token, /*channel_id=*/0));
   builder.AddInstruction(HloInstruction::CreateTuple({}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(4, computation->instruction_count());
@@ -110,7 +110,7 @@ TEST_F(HloDceTest, DeadParameters) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       live_param->shape(), HloOpcode::kNegate, live_param));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(5, computation->instruction_count());
@@ -150,7 +150,7 @@ TEST_F(HloDceTest, ControlDependencies) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Add a control dependency between two instructions.
@@ -175,7 +175,7 @@ TEST_F(HloDceTest, ControlDependencies) {
 
 // Tests that a dead call instruction is removed.
 TEST_F(HloDceTest, DeadInstructionWithCalledComputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   Shape shape = ShapeUtil::MakeShape(F32, {});
 
   // Called computation for the call instruction.
@@ -215,7 +215,7 @@ TEST_F(HloDceTest, DeadInstructionWithCalledComputation) {
 // Tests that a while instruction with an infeed (effectul instruction) in its
 // body is not removed, even its user count is 0.
 TEST_F(HloDceTest, CalledComputationWithSideEffect) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   Shape shape = ShapeUtil::MakeShape(F32, {});
 
   // Condition computation of a while instruction.
@@ -270,7 +270,7 @@ TEST_F(HloDceTest, CalledComputationWithSideEffect) {
 
 // Tests that a nested call instruction with a side effect is not removed.
 TEST_F(HloDceTest, CalledComputationWithNestedSideEffect) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   Shape shape = ShapeUtil::MakeShape(F32, {});
 
   // Nested called computation with a side effect.
@@ -323,7 +323,7 @@ TEST_F(HloDceTest, CalledComputationWithNestedSideEffect) {
 }
 
 TEST_F(HloDceTest, RemoveDeadSubcomputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   HloComputation::Builder subcomp_builder("reduction_subcomp");
@@ -364,7 +364,7 @@ TEST_F(HloDceTest, RemoveDeadSubcomputation) {
 }
 
 TEST_F(HloDceTest, KeepUsedSubcomputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   HloComputation::Builder builder(TestName());
 
   HloComputation::Builder subcomp_builder("reduction_subcomp");
diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
index 72185698c9bdcbf2bebed7ee82bc4ed082ce6a14..19b5734825df833fd34d634e4c1630dd75e96c4c 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
@@ -23,23 +23,14 @@ limitations under the License.
 
 namespace xla {
 
-class HloDomainIsolator::RunContext {
- public:
-  RunContext(HloModule* module, HloDomainIsolator* isolator)
-      : module_(module), isolator_(isolator) {}
+namespace {
 
-  StatusOr<bool> Run();
-
- private:
-  HloModule* module_;
-  HloDomainIsolator* isolator_;
-};
-
-StatusOr<bool> HloDomainIsolator::RunContext::Run() {
-  hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before Domain Isolator");
+StatusOr<bool> RunInternal(HloModule* module,
+                           HloDomainIsolator::DomainCreator* creator) {
+  hlo_graph_dumper::MaybeDumpHloModule(*module, "Before Domain Isolator");
 
   int64 added_domains = 0;
-  for (HloComputation* computation : module_->computations()) {
+  for (HloComputation* computation : module->computations()) {
     // Walk in post order and place all the required kDomain instructions.
     for (HloInstruction* instruction :
          computation->MakeInstructionPostOrder()) {
@@ -55,8 +46,7 @@ StatusOr<bool> HloDomainIsolator::RunContext::Run() {
           root = root->mutable_operand(0);
         }
         // Check whether a kDomain is necessary between instruction and operand.
-        HloInstruction* domain =
-            isolator_->creator_(instruction, root, operand);
+        HloInstruction* domain = (*creator)(instruction, root, operand);
         if (domain != nullptr) {
           VLOG(4) << "New domain: " << domain->ToString();
           TF_RETURN_IF_ERROR(operand->ReplaceUseWith(instruction, domain));
@@ -67,17 +57,19 @@ StatusOr<bool> HloDomainIsolator::RunContext::Run() {
   }
   VLOG(3) << "Added " << added_domains << " kDomain instructions";
   if (added_domains > 0) {
-    hlo_graph_dumper::MaybeDumpHloModule(*module_, "After Domain Isolator");
+    hlo_graph_dumper::MaybeDumpHloModule(*module, "After Domain Isolator");
   }
   return added_domains > 0;
 }
 
-HloDomainIsolator::HloDomainIsolator(DomainCreator creator)
-    : creator_(std::move(creator)) {}
+}  // namespace
+
+HloDomainIsolator::HloDomainIsolator(DomainCreatorFactory creator_factory)
+    : creator_factory_(std::move(creator_factory)) {}
 
 StatusOr<bool> HloDomainIsolator::Run(HloModule* module) {
-  RunContext run_context(module, this);
-  return run_context.Run();
+  DomainCreator creator = creator_factory_();
+  return RunInternal(module, &creator);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.h b/tensorflow/compiler/xla/service/hlo_domain_isolator.h
index c0bf1b9e16b52d81365db277abeb06defeb12d44..2274c3a96c2bdd1f4dbd454782699ccb0404529d 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_isolator.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.h
@@ -40,17 +40,15 @@ class HloDomainIsolator : public HloModulePass {
   // Returns nullptr in case no domain separation is necessary.
   using DomainCreator = std::function<HloInstruction*(
       HloInstruction*, HloInstruction*, HloInstruction*)>;
-
-  explicit HloDomainIsolator(DomainCreator creator);
+  using DomainCreatorFactory = std::function<DomainCreator()>;
+  explicit HloDomainIsolator(DomainCreatorFactory creator_factory_);
 
   absl::string_view name() const override { return "domain_isolator"; }
 
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  class RunContext;
-
-  DomainCreator creator_;
+  DomainCreatorFactory creator_factory_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
index 6ca1255edec377cf0738a1ad2596cb06aa1c2c6f..c6d02f9f67bb599e496d20fc2acf2e627ed54438 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -42,18 +42,19 @@ namespace xla {
   return std::move(domain_map);
 }
 
-bool HloDomainMap::InSameDomain(HloInstruction* instruction1,
-                                HloInstruction* instruction2) const {
+bool HloDomainMap::InSameDomain(const HloInstruction* instruction1,
+                                const HloInstruction* instruction2) const {
   int64 domain_id1 = GetDomainId(instruction1);
   int64 domain_id2 = GetDomainId(instruction2);
   return domain_id1 >= 0 && domain_id1 == domain_id2;
 }
 
-int64 HloDomainMap::GetDomainId(HloInstruction* instruction) const {
+int64 HloDomainMap::GetDomainId(const HloInstruction* instruction) const {
   return FindOrDefault(instruction_to_domain_, instruction, -1);
 }
 
-int64 HloDomainMap::GetDomainMetadataId(HloInstruction* instruction) const {
+int64 HloDomainMap::GetDomainMetadataId(
+    const HloInstruction* instruction) const {
   return FindOrDie(domain_metadata_id_, instruction);
 }
 
@@ -200,7 +201,8 @@ StatusOr<std::unique_ptr<DomainMetadata::Domain>> HloDomainMap::CreateDomain(
   return std::move(domain);
 }
 
-bool HloDomainMap::IsDomainInstruction(HloInstruction* instruction) const {
+bool HloDomainMap::IsDomainInstruction(
+    const HloInstruction* instruction) const {
   if (instruction->opcode() != HloOpcode::kDomain) {
     return false;
   }
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.h b/tensorflow/compiler/xla/service/hlo_domain_map.h
index c8d581b74677674ed8682ecc1fa022cea890a649..bce7d1aa7cf1822ef1608674e7bf9483c628e4b5 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.h
@@ -58,21 +58,21 @@ class HloDomainMap {
   }
 
   // Checks whether two instructions are within the same domain.
-  bool InSameDomain(HloInstruction* instruction1,
-                    HloInstruction* instruction2) const;
+  bool InSameDomain(const HloInstruction* instruction1,
+                    const HloInstruction* instruction2) const;
 
   // Checks whether instruction is a kDomain instruction of the kind we are
   // currently processing.
-  bool IsDomainInstruction(HloInstruction* instruction) const;
+  bool IsDomainInstruction(const HloInstruction* instruction) const;
 
   // Retrieves the domain identifier of the instruction, or -1 in case
   // instruction is not found within any domain.
-  int64 GetDomainId(HloInstruction* instruction) const;
+  int64 GetDomainId(const HloInstruction* instruction) const;
 
   // Returns the unique id of the domain metadata for the domain the given
   // instruction belongs to. The given instruction must not be a kDomain
   // instruction since each domain instruction is associated with 2 domains.
-  int64 GetDomainMetadataId(HloInstruction* instruction) const;
+  int64 GetDomainMetadataId(const HloInstruction* instruction) const;
 
  private:
   // Map used for representing instruction ordering, i.e.
@@ -119,8 +119,8 @@ class HloDomainMap {
 
   string domain_kind_;
   std::vector<std::unique_ptr<DomainMetadata::Domain>> instruction_domains_;
-  absl::flat_hash_map<HloInstruction*, int64> instruction_to_domain_;
-  absl::flat_hash_map<HloInstruction*, int64> domain_metadata_id_;
+  absl::flat_hash_map<const HloInstruction*, int64> instruction_to_domain_;
+  absl::flat_hash_map<const HloInstruction*, int64> domain_metadata_id_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc
index 43e74d2f6f07bd685ad8683401138a4f06cd2ad2..acdb42128e3d9a1fb912a466c9c2c3cbbe3d3f83 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_isolator.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_remover.h"
@@ -22,13 +22,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
-class HloDomainTest : public HloVerifiedTestBase {
+class HloDomainTest : public HloTestBase {
  protected:
   bool FindUserViaDomainPath(HloInstruction* instruction,
                              HloInstruction* operand) const {
@@ -64,13 +63,6 @@ class HloDomainTest : public HloVerifiedTestBase {
     }
     return false;
   }
-
-  StatusOr<HloModule*> ParseModule(absl::string_view hlo_string) {
-    HloModuleConfig config;
-    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-    ParseAndVerifyModule(hlo_string, config);
-    return &module();
-  }
 };
 
 // Dummy DomainMetadata implementation which create kDomain boundaries around
@@ -106,20 +98,22 @@ class OpNameMetadata : public DomainMetadata {
 };
 
 // Creator function for OpNameMetadata domains.
-HloInstruction* OpNameDomainCreator(HloInstruction* instruction,
-                                    HloInstruction* root,
-                                    HloInstruction* operand) {
-  if (instruction->metadata().op_name() == root->metadata().op_name()) {
-    return nullptr;
+class OpNameDomainCreator {
+ public:
+  HloInstruction* operator()(HloInstruction* instruction, HloInstruction* root,
+                             HloInstruction* operand) {
+    if (instruction->metadata().op_name() == root->metadata().op_name()) {
+      return nullptr;
+    }
+    std::unique_ptr<DomainMetadata> operand_side_metadata =
+        absl::make_unique<OpNameMetadata>(root->metadata().op_name());
+    std::unique_ptr<DomainMetadata> user_side_metadata =
+        absl::make_unique<OpNameMetadata>(instruction->metadata().op_name());
+    return operand->parent()->AddInstruction(HloInstruction::CreateDomain(
+        operand->shape(), operand, std::move(operand_side_metadata),
+        std::move(user_side_metadata)));
   }
-  std::unique_ptr<DomainMetadata> operand_side_metadata =
-      absl::make_unique<OpNameMetadata>(root->metadata().op_name());
-  std::unique_ptr<DomainMetadata> user_side_metadata =
-      absl::make_unique<OpNameMetadata>(instruction->metadata().op_name());
-  return operand->parent()->AddInstruction(HloInstruction::CreateDomain(
-      operand->shape(), operand, std::move(operand_side_metadata),
-      std::move(user_side_metadata)));
-}
+};
 
 Status OpNameDomainNormalizer(const DomainMetadata::Domain& domain,
                               const DomainMetadata* metadata) {
@@ -142,31 +136,32 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(ShardingDomainCreator{});
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "c", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "c", "b"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "c"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
-  EXPECT_FALSE(HasDomainEdge(module, "c", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "c", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "c"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 }
 
 TEST_F(HloDomainTest, CheckNoDomainAddedIfNoSharding) {
@@ -184,11 +179,12 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(ShardingDomainCreator{});
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(!isolator_changed);
 }
 
@@ -211,26 +207,27 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(ShardingDomainCreator{});
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "b", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "f", "e_element"));
-  EXPECT_FALSE(HasDomainEdge(module, "a", "p0"));
-  EXPECT_FALSE(HasDomainEdge(module, "c", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "b", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "f", "e_element"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "a", "p0"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
-  EXPECT_FALSE(HasDomainEdge(module, "b", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "f", "e_element"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "b", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "f", "e_element"));
 }
 
 TEST_F(HloDomainTest, CheckNoDomainAddedOnPureIOComputation) {
@@ -248,11 +245,12 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(ShardingDomainCreator{});
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_FALSE(isolator_changed);
 }
 
@@ -271,15 +269,16 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_FALSE(remover_changed);
 
-  HloInstruction* add = FindInstruction(module, "c");
+  HloInstruction* add = FindInstruction(module.get(), "c");
   ASSERT_NE(add, nullptr);
   auto device = add->sharding_unique_device();
   EXPECT_TRUE(device.has_value());
@@ -302,41 +301,42 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator sharding_isolator(ShardingDomainCreator{});
+  HloDomainIsolator sharding_isolator([]() { return ShardingDomainCreator{}; });
   TF_ASSERT_OK_AND_ASSIGN(bool sharding_isolator_changed,
-                          sharding_isolator.Run(module));
+                          sharding_isolator.Run(module.get()));
   EXPECT_TRUE(sharding_isolator_changed);
 
-  HloDomainIsolator opname_isolator(OpNameDomainCreator);
+  HloDomainIsolator opname_isolator([]() { return OpNameDomainCreator{}; });
   TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed,
-                          opname_isolator.Run(module));
+                          opname_isolator.Run(module.get()));
   EXPECT_TRUE(opname_isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "c", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "c", "b"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "c"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 
   HloDomainRemover sharding_remover(ShardingMetadata::KindName(),
                                     ShardingMetadata::NormalizeShardingDomain);
   TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed,
-                          sharding_remover.Run(module));
+                          sharding_remover.Run(module.get()));
   EXPECT_TRUE(sharding_remover_changed);
 
   HloDomainRemover opname_remover(OpNameMetadata::KindName(),
                                   OpNameDomainNormalizer);
   TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed,
-                          opname_remover.Run(module));
+                          opname_remover.Run(module.get()));
   EXPECT_TRUE(opname_remover_changed);
 
-  EXPECT_FALSE(HasDomainEdge(module, "c", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "c", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "c"));
 }
 
 TEST_F(HloDomainTest, CheckNormalizationOnInfeedTuple) {
@@ -357,16 +357,17 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator isolator(ShardingDomainCreator{});
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "infeed.data", "infeed"));
-  EXPECT_FALSE(HasDomainEdge(module, "copy0", "gte0"));
-  EXPECT_FALSE(HasDomainEdge(module, "copy1", "gte1"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "infeed.data", "infeed"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "copy0", "gte0"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "copy1", "gte1"));
 
   // Inject unassigned tuple/gte within the infeed domain, to simulate the
   // HLO passes adding unexpected instructions.
@@ -382,7 +383,7 @@ ENTRY entry {
   //           \       /
   //             TUPLE
   //               |
-  HloInstruction* infeed_data = FindInstruction(module, "infeed.data");
+  HloInstruction* infeed_data = FindInstruction(module.get(), "infeed.data");
   ASSERT_NE(infeed_data, nullptr);
 
   auto infeed_data_users = infeed_data->users();
@@ -408,7 +409,7 @@ ENTRY entry {
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
   struct Assignment {
@@ -444,25 +445,26 @@ ENTRY entry {
     sharding={maximal device=1}
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
-  HloDomainIsolator isolator(ShardingDomainCreator{});
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "tuple", "param"));
-  EXPECT_FALSE(HasDomainEdge(module, "gte", "tuple"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "tuple", "param"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "gte", "tuple"));
 
   // Remove %tuple and %gte (tuple simplification)
-  HloInstruction* gte = FindInstruction(module, "gte");
-  HloInstruction* tuple = FindInstruction(module, "tuple");
+  HloInstruction* gte = FindInstruction(module.get(), "gte");
+  HloInstruction* tuple = FindInstruction(module.get(), "tuple");
   module->entry_computation()->set_root_instruction(tuple->mutable_operand(0));
   TF_EXPECT_OK(module->entry_computation()->RemoveInstruction(gte));
   TF_EXPECT_OK(module->entry_computation()->RemoveInstruction(tuple));
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
   const HloInstruction* root = module->entry_computation()->root_instruction();
@@ -484,11 +486,11 @@ TEST_F(HloDomainTest, DumpParseNullSharding) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, domain, domain));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto hlo_string = module->ToString();
-  ASSERT_TRUE(ParseModule(hlo_string).status().ok());
+  ASSERT_TRUE(ParseAndReturnVerifiedModule(hlo_string).status().ok());
 }
 
 // Tuple inputs are domain instructions.
@@ -505,20 +507,21 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
-  HloDomainIsolator isolator(ShardingDomainCreator{});
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
   // Clear sharding of tpl instruction, in order to test domain sharding
   // application.
-  auto tpl = FindInstruction(module, "tpl");
+  auto tpl = FindInstruction(module.get(), "tpl");
   tpl->clear_sharding();
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
   EXPECT_EQ(HloSharding::Tuple(tpl->shape(), {HloSharding::AssignDevice(1),
@@ -553,36 +556,37 @@ ENTRY %entry (p0: (f32[4], f32[4])) -> (f32[4], f32[4], f32[4]) {
   ROOT %g = (f32[4], f32[4], f32[4]) tuple(%domain.2, %domain.3, %domain.4)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   LOG(INFO) << "Original module:\n" << module->ToString();
 
-  HloDomainIsolator opname_isolator(OpNameDomainCreator);
+  HloDomainIsolator opname_isolator([]() { return OpNameDomainCreator{}; });
   TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed,
-                          opname_isolator.Run(module));
+                          opname_isolator.Run(module.get()));
   EXPECT_TRUE(opname_isolator_changed);
 
-  EXPECT_TRUE(HasDomainEdge(module, "c", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "c", "b"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "a"));
-  EXPECT_TRUE(HasDomainEdge(module, "d", "c"));
-  EXPECT_FALSE(HasDomainEdge(module, "e", "d"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
 
   HloDomainRemover sharding_remover(ShardingMetadata::KindName(),
                                     ShardingMetadata::NormalizeShardingDomain);
   TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed,
-                          sharding_remover.Run(module));
+                          sharding_remover.Run(module.get()));
   EXPECT_TRUE(sharding_remover_changed);
 
   HloDomainRemover opname_remover(OpNameMetadata::KindName(),
                                   OpNameDomainNormalizer);
   TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed,
-                          opname_remover.Run(module));
+                          opname_remover.Run(module.get()));
   EXPECT_TRUE(opname_remover_changed);
 
-  EXPECT_FALSE(HasDomainEdge(module, "c", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "c", "b"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "a"));
-  EXPECT_FALSE(HasDomainEdge(module, "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "c"));
 }
 
 // Emulate instructions inserted at top and bottom within nested tuple domain.
@@ -601,15 +605,16 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(HloModule * module, ParseModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
 
-  HloDomainIsolator isolator(ShardingDomainCreator{});
-  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module));
+  HloDomainIsolator isolator([]() { return ShardingDomainCreator{}; });
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
   EXPECT_TRUE(isolator_changed);
 
   // Clear sharding of tuple.0 instruction, in order to test domain sharding
   // application.
-  auto tuple0 = FindInstruction(module, "tuple.0");
+  auto tuple0 = FindInstruction(module.get(), "tuple.0");
   tuple0->clear_sharding();
 
   // Insert the following instructons above and below tuple.0, to emulate other
@@ -653,7 +658,7 @@ ENTRY entry {
 
   HloDomainRemover remover(ShardingMetadata::KindName(),
                            ShardingMetadata::NormalizeShardingDomain);
-  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module));
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
   EXPECT_TRUE(remover_changed);
 
   EXPECT_TRUE(tuple0->has_sharding());
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index eec8d242faaa70e84ab5b46904b0a0ea41d5b009..3a7652a8dc856b23c8988c4676916c8199e78860 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/index_util.h"
@@ -38,11 +39,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_query.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bitmap.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
@@ -189,6 +190,11 @@ HloEvaluator::HloEvaluator(int64 max_loop_iterations)
         return Unimplemented(
             "HloEvaluatorTypedVisitor: unhandled primitive type: OPAQUE.");
       });
+  typed_visitors_[TOKEN] =
+      absl::make_unique<FunctionVisitor>([](HloInstruction*) {
+        return Unimplemented(
+            "HloEvaluatorTypedVisitor: unhandled primitive type: TOKEN.");
+      });
 }
 
 template <typename LiteralPtr>
@@ -391,6 +397,16 @@ StatusOr<Literal> HloEvaluator::EvaluateDotOp(
   return Evaluate(cloned_instruction.get());
 }
 
+Status HloEvaluator::HandleBitcast(HloInstruction* bitcast) {
+  const Literal& operand_literal = GetEvaluatedLiteralFor(bitcast->operand(0));
+  Literal result(bitcast->shape());
+  TF_RET_CHECK(operand_literal.size_bytes() == result.size_bytes());
+  memcpy(result.untyped_data(), operand_literal.untyped_data(),
+         operand_literal.size_bytes());
+  evaluated_[bitcast] = std::move(result);
+  return Status::OK();
+}
+
 Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
   CHECK_LT(parameter->parameter_number(), arg_literals_.size());
   const Literal* input_literal = arg_literals_[parameter->parameter_number()];
@@ -1041,8 +1057,15 @@ Status HloEvaluator::HandleBroadcast(HloInstruction* broadcast) {
   return Status::OK();
 }
 
-Status HloEvaluator::HandleAfterAll(HloInstruction* token) {
-  evaluated_[token] = LiteralUtil::CreateToken();
+Status HloEvaluator::HandleAfterAll(HloInstruction* after_all) {
+  evaluated_[after_all] = LiteralUtil::CreateToken();
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleAddDependency(HloInstruction* add_dependency) {
+  // AddDedendency just forwards its zero-th operand.
+  evaluated_[add_dependency] =
+      GetEvaluatedLiteralFor(add_dependency->operand(0)).Clone();
   return Status::OK();
 }
 
@@ -1228,7 +1251,7 @@ StatusOr<Literal> EvaluateSortInternal(HloInstruction* sort,
   TF_RET_CHECK(
       ShapeUtil::SameDimensions(keys_literal.shape(), values_literal.shape()))
       << "Sort keys and values must have the same dimensions";
-  TF_RET_CHECK(sort->operand_count() == 2) << "Expected key-value sort";
+  TF_RET_CHECK(sort->operand_count() >= 2) << "Expected key-value sort";
   // We need to sort an array of keys and an array of values, where the
   // sorted order of the values is determined by the keys. The simplest(?)
   // way to do this is to go to an array-of-pairs representation, sort the
@@ -1274,12 +1297,14 @@ StatusOr<Literal> EvaluateSortInternal(HloInstruction* sort,
           key_value_vector.push_back(
               std::make_pair(keys_data[i], values_data[i]));
         }
-        std::sort(key_value_vector.begin(), key_value_vector.end(),
-                  [](const kv_pair& a, const kv_pair& b) {
-                    return SafeLess<KeyType>(a.first, b.first);
-                  });
+        std::stable_sort(key_value_vector.begin(), key_value_vector.end(),
+                         [](const kv_pair& a, const kv_pair& b) {
+                           return SafeLess<KeyType>(a.first, b.first);
+                         });
         std::vector<KeyType> result_keys;
-        std::vector<ValueType> result_values;
+        // We use a InlinedVector here because we need to convert it to an
+        // absl::Span later, and this would not work with std::vector<bool>.
+        absl::InlinedVector<ValueType, 10> result_values;
         for (const auto& key_value : key_value_vector) {
           result_keys.push_back(key_value.first);
           result_values.push_back(key_value.second);
@@ -1315,7 +1340,10 @@ template <typename KeyType>
 StatusOr<Literal> EvaluateSortCurried(HloInstruction* sort,
                                       const Literal& keys_literal,
                                       const Literal& values_literal) {
-  switch (sort->operand(1)->shape().element_type()) {
+  switch (values_literal.shape().element_type()) {
+    case PRED:
+      return EvaluateSortInternal<KeyType, bool>(sort, keys_literal,
+                                                 values_literal);
     case F32:
       return EvaluateSortInternal<KeyType, float>(sort, keys_literal,
                                                   values_literal);
@@ -1355,14 +1383,24 @@ Status HloEvaluator::HandleSort(HloInstruction* sort) {
   if (!ShapeUtil::IsTuple(sort->shape())) {
     return DefaultAction(sort);
   } else {
-    auto result = EvaluateSort(sort, GetEvaluatedLiteralFor(sort->operand(0)),
-                               GetEvaluatedLiteralFor(sort->operand(1)));
-    if (result.ok()) {
-      evaluated_[sort] = std::move(result.ValueOrDie());
-      return Status::OK();
-    } else {
-      return result.status();
+    // This is a really stupid work-around for the fact it's hard to support a
+    // multi-value sort directly, due to the fact we need to template the
+    // evaluation function on all of the value types.
+    std::vector<Literal> sort_results_backing;
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      auto result = EvaluateSort(sort, GetEvaluatedLiteralFor(sort->operand(0)),
+                                 GetEvaluatedLiteralFor(sort->operand(i)));
+      if (!result.ok()) {
+        return result.status();
+      }
+      sort_results_backing.push_back(
+          std::move(result.ValueOrDie().DecomposeTuple()[1]));
     }
+    std::vector<const Literal*> sort_results;
+    absl::c_transform(sort_results_backing, std::back_inserter(sort_results),
+                      [](const Literal& literal) { return &literal; });
+    evaluated_[sort] = LiteralUtil::MakeTuple(sort_results);
+    return Status::OK();
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 07f8d0aad4af0b07303b4e485b3630cc75bcb519..45ed8131dc6b71f706fce45d65b206363dd79ac3 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -144,6 +144,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Operations that are type-agnostic or always return a specific type, such as
   // HandleIsFinite where boolean is always returned.
   //
+  Status HandleBitcast(HloInstruction* bitcast) override;
+
   Status HandleParameter(HloInstruction* parameter) override;
 
   Status HandleConstant(HloInstruction* constant) override;
@@ -180,7 +182,9 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
   Status HandleBroadcast(HloInstruction* broadcast) override;
 
-  Status HandleAfterAll(HloInstruction* token) override;
+  Status HandleAfterAll(HloInstruction* after_all) override;
+
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   Status HandleSort(HloInstruction* sort) override;
 
@@ -221,16 +225,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
       const Literal& operand_literal) {
     const auto shape = instruction->shape();
     const auto* operand = instruction->operand(0);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
-    // removed.
-    if (!ShapeUtil::SameDimensions(shape, operand->shape())) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s",
-          ShapeUtil::HumanString(shape),
-          ShapeUtil::HumanString(operand->shape()));
-    }
+    TF_RET_CHECK(ShapeUtil::SameDimensions(shape, operand->shape()));
 
     Literal result(shape);
     TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index cee11a8a2166f96ae801095b6364921ed05d0000..4eaaab20ea0add17d9b49b1b2b97991af0438dcc 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/memory/memory.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -33,8 +34,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -50,9 +52,9 @@ namespace {
 static std::array<bool, 2> use_bf16_params{true, false};
 
 class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
-                         public HloVerifiedTestBase {
+                         public HloTestBase {
  protected:
-  HloEvaluatorTest() : HloVerifiedTestBase(), use_bfloat16_(GetParam()) {
+  HloEvaluatorTest() : HloTestBase(), use_bfloat16_(GetParam()) {
     evaluator_ = absl::make_unique<HloEvaluator>();
   }
 
@@ -60,14 +62,14 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     if (use_bfloat16_) {
       // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
       auto type_converter = HloElementTypeConverter(F32, BF16);
-      type_converter.Run(&module()).ValueOrDie();
+      type_converter.Run(m_.get()).ValueOrDie();
     }
-    return evaluator_->Evaluate(*module().entry_computation(), arg_literals)
+    return evaluator_->Evaluate(*m_->entry_computation(), arg_literals)
         .ConsumeValueOrDie();
   }
 
-  // Evaluate function that takes in a local module instead of using module_
-  // that is in HloVerifiedTestBase. Once module_ in HloVerifiedTestBase is
+  // Evaluate function that takes in a local module instead of using m_
+  // that is in HloTestBase. Once m_ in HloTestBase is
   // removed, this should be the default Evaluate function.
   Literal EvaluateWithModule(
       HloModule* module, absl::Span<const Literal* const> arg_literals = {}) {
@@ -88,7 +90,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     auto c1 =
         b.AddInstruction(HloInstruction::CreateConstant(std::move(input)));
     b.AddInstruction(HloInstruction::CreateUnary(expected.shape(), opcode, c1));
-    module().AddEntryComputation(b.Build());
+    m_->AddEntryComputation(b.Build());
 
     Literal result = Evaluate();
 
@@ -108,7 +110,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs)));
     b.AddInstruction(
         HloInstruction::CreateBinary(expected.shape(), opcode, c1, c2));
-    module().AddEntryComputation(b.Build());
+    m_->AddEntryComputation(b.Build());
 
     Literal result = Evaluate();
 
@@ -116,6 +118,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
   }
 
   bool use_bfloat16_;
+  std::unique_ptr<HloModule> m_ = CreateNewVerifiedModule();
 };
 
 #define XLA_TYPED_TEST_P(test_case_name, test_name, test_type1) \
@@ -135,7 +138,7 @@ TEST_P(HloEvaluatorTest, DoesClamp) {
   auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
   b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -156,7 +159,7 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
   auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
   b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -181,7 +184,7 @@ TEST_P(HloEvaluatorTest, DoesSelect) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(on_false)));
   b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kSelect, c1, c2, c3));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate({});
 
@@ -322,7 +325,7 @@ TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
       b.AddInstruction(HloInstruction::CreateParameter(2, shape, "rhs2"));
   b.AddInstruction(HloInstruction::CreateBinary(shape, HloOpcode::kAdd,
                                                 lhs_instruction, param_rhs2));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate(args);
 
@@ -346,7 +349,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) {
   const int64 permutation[] = {1, 2, 0, 4, 3};
   b.AddInstruction(
       HloInstruction::CreateTranspose(shape, literal_instruction, permutation));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate({});
 
@@ -367,7 +370,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcast) {
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateBroadcast(
       output_literal.shape(), literal_instruction, {1, 2}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate({});
 
@@ -386,7 +389,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
   b.AddInstruction(HloInstruction::CreateBroadcast(
       output_literal.shape(), literal_instruction,
       /*broadcast_dimensions=*/{}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate({});
 
@@ -406,7 +409,7 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
   Shape shape = ShapeUtil::MakeShape(S64, {4, 2});
   b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -428,7 +431,7 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   Shape shape = ShapeUtil::MakeShape(S64, {2});
   b.AddInstruction(HloInstruction::CreateConcatenate(shape, operands, 0));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -448,7 +451,7 @@ TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
   HloInstruction* constant = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateConvert(expected.shape(), constant));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -468,7 +471,7 @@ TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) {
   HloInstruction* constant = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(input_literal)));
   b.AddInstruction(HloInstruction::CreateConvert(expected.shape(), constant));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -503,7 +506,7 @@ TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   Shape shape = ShapeUtil::MakeShape(S32, {5, 2});
   b.AddInstruction(HloInstruction::CreatePad(
       shape, operand_instruction, padding_value_instruction, padding_config));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -530,7 +533,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
       CreatePaddingConfig({{{1, 0, 2}}, {{0, 2, 1}}, {{0, 0, 0}}, {{0, 0, 0}}});
   b.AddInstruction(HloInstruction::CreatePad(
       shape, input_instruction, pad_instruction, r4_padding_on_dim0_dim1));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -574,7 +577,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) {
                                              pad_value_instruction,
                                              r2_padding_on_dim0_dim1));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -619,7 +622,7 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
                                              pad_value_instruction,
                                              r2_padding_on_dim0_dim1));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -658,7 +661,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
   b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
                                              rhs_instruction, dot_dnums,
                                              DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -704,7 +707,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
   b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
                                              rhs_instruction, dot_dnums,
                                              DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -748,7 +751,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
                                              rhs_instruction, dot_dnums,
                                              DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -802,7 +805,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -857,7 +860,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -941,7 +944,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1019,7 +1022,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1079,7 +1082,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1143,7 +1146,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1215,7 +1218,7 @@ TEST_P(HloEvaluatorTest,
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1286,7 +1289,7 @@ TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) {
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction,
       /*feature_group_count=*/2, window, dnums, DefaultPrecisionConfig(2)));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1297,11 +1300,12 @@ TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {};
+class HloEvaluatorPreciseReduceTest : public HloTestBase {};
 
 // Tests that Reduce doesn't lose precision when adding many numbers (because
 // it accumulates its result in a double).
 TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder b(TestName());
 
   constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
@@ -1319,12 +1323,12 @@ TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   add_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+  auto add_func = m->AddEmbeddedComputation(add_computation.Build());
 
   HloInstruction* reduce_instruction = b.AddInstruction(
       HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
                                    /*dimensions_to_reduce=*/{0}, add_func));
-  module().AddEntryComputation(b.Build());
+  m->AddEntryComputation(b.Build());
 
   HloEvaluator hlo_eval;
   Literal result = hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
@@ -1337,7 +1341,7 @@ void BM_ReducePrecisely(int num_iters) {
   tensorflow::testing::StopTiming();
   HloComputation::Builder b("BM_ReducePrecisely");
   HloModuleConfig config;
-  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  config.set_debug_options(GetDebugOptionsFromFlags());
   HloModule module("BM_ReducePrecisely", config);
 
   constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
@@ -1396,14 +1400,14 @@ TEST_P(HloEvaluatorTest, ReduceAdd) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   add_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+  auto add_func = m_->AddEmbeddedComputation(add_computation.Build());
 
   Shape shape = ShapeUtil::MakeShape(F32, {2});
   b.AddInstruction(
       HloInstruction::CreateReduce(shape, arg_instruction, init_value,
                                    /*dimensions_to_reduce=*/{1}, add_func));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1438,7 +1442,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   max_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs));
-  auto max_func = module().AddEmbeddedComputation(max_computation.Build());
+  auto max_func = m_->AddEmbeddedComputation(max_computation.Build());
 
   Window window;
   WindowDimension dim;
@@ -1455,7 +1459,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, max_func));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1463,6 +1467,58 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
+TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
+  HloComputation::Builder b(TestName());
+
+  // arg:
+  // f32[3,3] {
+  //  { 1, 2, 3 },
+  //  { 5, 6, 7 },
+  //  { 9, 10, 11 },
+  // }
+  auto arg_array = absl::make_unique<Array2D<float>>(3, 3);
+  arg_array->FillUnique(1.0f);
+  auto arg_literal = LiteralUtil::CreateR2FromArray2D<float>(*arg_array);
+
+  HloInstruction* arg_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
+
+  auto init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.f)));
+
+  HloComputation::Builder max_computation("max");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = max_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = max_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  max_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kMaximum, param_lhs, param_rhs));
+  auto max_func = m_->AddEmbeddedComputation(max_computation.Build());
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(2);
+  dim.set_stride(1);
+  dim.set_padding_low(0);
+  dim.set_padding_high(0);
+  dim.set_window_dilation(2);
+  dim.set_base_dilation(1);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+
+  Shape shape = ShapeUtil::MakeShape(F32, {1, 1});
+  b.AddInstruction(HloInstruction::CreateReduceWindow(
+      shape, arg_instruction, init_value, window, max_func));
+
+  m_->AddEntryComputation(b.Build());
+
+  Literal result = Evaluate();
+
+  auto expected = LiteralUtil::CreateR2<float>({{11}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+}
+
 TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   HloComputation::Builder b(TestName());
 
@@ -1489,7 +1545,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   add_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+  auto add_func = m_->AddEmbeddedComputation(add_computation.Build());
 
   Window window;
   WindowDimension dim;
@@ -1512,7 +1568,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, add_func));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1542,7 +1598,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
       HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
   add_computation.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+  auto add_func = m_->AddEmbeddedComputation(add_computation.Build());
 
   Window window;
 
@@ -1573,7 +1629,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
   b.AddInstruction(HloInstruction::CreateReduceWindow(
       shape, arg_instruction, init_value, window, add_func));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1605,7 +1661,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) {
                                                /*start_indices=*/{0, 2},
                                                /*limit_indices=*/{3, 5},
                                                /*strides=*/{2, 3}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1639,7 +1695,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
                                                       start_indices, {2, 3}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1675,7 +1731,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
                                                       start_indices, {2, 3}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1712,7 +1768,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       shape, operand, update, start_indices));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1748,7 +1804,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) {
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateGetTupleElement(shape, tuple, 1));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1787,7 +1843,7 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
   b.AddInstruction(
       HloInstruction::CreateGetTupleElement(tuple2->shape(), outer_tuple, 1));
 
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1825,7 +1881,7 @@ TEST_P(HloEvaluatorTest, Reverse) {
 
   const Shape shape = ShapeUtil::MakeShape(F32, {4, 3, 2, 1});
   b.AddInstruction(HloInstruction::CreateReverse(shape, operand, {0, 1}));
-  module().AddEntryComputation(b.Build());
+  m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
 
@@ -1914,7 +1970,7 @@ ENTRY main {
       slice_sizes={1, 3}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -1938,7 +1994,7 @@ ENTRY main {
       slice_sizes={3, 1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -1962,7 +2018,7 @@ ENTRY main {
       slice_sizes={3, 1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
@@ -1987,7 +2043,7 @@ ENTRY main {
       slice_sizes={1,1,2}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
@@ -2014,7 +2070,7 @@ ENTRY main {
       slice_sizes={1,1,2}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
@@ -2040,7 +2096,7 @@ ENTRY main {
       slice_sizes={1,1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({1, 1});
@@ -2063,7 +2119,7 @@ ENTRY main {
       slice_sizes={1,1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal start_indices = LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
@@ -2087,7 +2143,7 @@ ENTRY main {
       slice_sizes={1, 0}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
   Literal start_indices = LiteralUtil::CreateR1<int32>({0, 2});
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR2<int32>({{}, {}}),
@@ -2109,7 +2165,7 @@ ENTRY main {
       slice_sizes={1}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
 
   Literal operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
   Literal start_indices =
@@ -2140,7 +2196,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -2171,7 +2227,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -2204,7 +2260,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -2236,7 +2292,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
@@ -2268,7 +2324,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand = LiteralUtil::CreateR2<float>(
       {{1.1, 2.2, 3.3}, {4.4, 5.5, 6.6}, {7.7, 8.8, 9.9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({2, 1});
@@ -2302,7 +2358,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({1, 1});
@@ -2334,7 +2390,7 @@ ENTRY main {
       index_vector_dim=2
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR2<int32>({{0, 2}, {2, 1}});
@@ -2366,7 +2422,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
@@ -2403,7 +2459,7 @@ ENTRY main {
       index_vector_dim=0
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR3<int32>({{{-1, 1}, {-2, 2}, {-3, 3}},  //
                                     {{-4, 4}, {-5, 5}, {-6, 6}},  //
@@ -2439,7 +2495,7 @@ ENTRY main {
       index_vector_dim=0
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({1, 1});
@@ -2471,7 +2527,7 @@ ENTRY main {
       index_vector_dim=0
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand =
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
   Literal scatter_indices = LiteralUtil::CreateR2<int32>({{2, 1}, {1, 1}});
@@ -2503,7 +2559,7 @@ ENTRY main {
       index_vector_dim=1
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
   Literal operand = LiteralUtil::CreateR2<int32>({{}, {}, {}});
   Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 2});
   Literal updates = LiteralUtil::CreateR2<int32>({{}, {}});
@@ -2533,7 +2589,7 @@ ENTRY main {
       index_vector_dim=2
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
 
   Literal operand = LiteralUtil::CreateR1<int32>({0, 1, 2});
   Literal scatter_indices =
@@ -2684,7 +2740,7 @@ ENTRY main {
   ROOT %reduce = bf16[] reduce(arg0, init), dimensions={0}, to_apply=add_bf16
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
 
   Literal arg = LiteralUtil::CreateR1<bfloat16>(
       {bfloat16(1.0f), bfloat16(3.0f), bfloat16(-2.0f), bfloat16(42.0f)});
@@ -2702,7 +2758,7 @@ ENTRY main {
   ROOT %slice = f32[2,2,2]{1,0,2} slice(f32[2,2,2]{0,1,2} %arg), slice={[0:2], [0:2], [0:2]}
 }
 )";
-  ParseAndVerifyModule(hlo_text);
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
 
   Literal arg = LiteralUtil::CreateR3WithLayout<float>(
       {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}},
@@ -2711,6 +2767,33 @@ ENTRY main {
   EXPECT_TRUE(LiteralTestUtil::Equal(arg, actual));
 }
 
+TEST_P(HloEvaluatorTest, Bitcast) {
+  // Regression test for b/114735354.
+  constexpr absl::string_view hlo_text_base = R"(
+HloModule Bitcast
+
+ENTRY main {
+  param = %s[32,121]{1,0} parameter(0)
+  ROOT bitcast = %s[121,32,1]{0,1,2} bitcast(%s[32,121]{1,0} param)
+}
+)";
+  string hlo_text;
+  if (use_bfloat16_) {
+    hlo_text = absl::StrFormat(hlo_text_base, "bf16", "bf16", "bf16");
+  } else {
+    hlo_text = absl::StrFormat(hlo_text_base, "f32", "f32", "f32");
+  }
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual = Evaluate({&args[0]});
+  if (use_bfloat16_) {
+    EXPECT_TRUE(
+        absl::c_equal(args[0].data<bfloat16>(), actual.data<bfloat16>()));
+  } else {
+    EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
                         ::testing::ValuesIn(use_bf16_params));
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index b2d12c94b848e4fd8ae473fdc0e4a9f5fecf6286..b87fc3e34012e75ee07bff6c1e113dce404f83cb 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cmath>
 
 #include "absl/algorithm/container.h"
+#include "absl/base/casts.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/types/optional.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
-#include "tensorflow/core/lib/core/casts.h"
 
 namespace xla {
 
@@ -161,9 +161,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                          HloOpcodeString(hlo_instruction->opcode()));
   }
 
-  // TODO(b/35950897): many of the stl functions used in the handlers are not
-  // overloaded for every XLA primitive type.
-
   template <typename NativeT,
             typename std::enable_if<std::is_unsigned<NativeT>::value>::type* =
                 nullptr>
@@ -596,7 +593,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  Status HandleDivide(HloInstruction* divide) {
+  Status HandleDivide(HloInstruction* divide) override {
     return HandleDivide<ElementwiseT>(divide);
   }
 
@@ -1072,66 +1069,66 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
       // Convolve input feature with kernel.
       do {
+        // Find corresponding spatial dimension index for input (lhs).
+        int64 lhs_linear_spatial_index = 0;
+        int64 rhs_linear_spatial_index = 0;
+        for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
+          // Spatial dimension number for input (lhs) and output.
+          const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki);
+          const int64 output_spatial_dim = dnums.output_spatial_dimensions(ki);
+
+          // Calculate lhs (input) index without taking base dilation into
+          // account.
+          const auto& window_dim = window.dimensions(ki);
+          const int64 undilated_index =
+              out_index[output_spatial_dim] * window_dim.stride() -
+              window_dim.padding_low() +
+              rhs_spatial_index[ki] * window_dim.window_dilation();
+          // Skip if the lhs (input) index is to be dilated.  As an
+          // optimization, skip this mod if there's no dilation.
+          if (window_dim.base_dilation() > 1 &&
+              undilated_index % window_dim.base_dilation() != 0) {
+            goto cnt;
+          }
+
+          // Calculate the actual lhs (input) index after dilation.  As an
+          // optimization, skip this integer divide if there's no dilation.
+          int64 lhs_spatial_index;
+          if (window_dim.base_dilation() > 1) {
+            lhs_spatial_index = undilated_index / window_dim.base_dilation();
+          } else {
+            lhs_spatial_index = undilated_index;
+          }
+
+          // Skip if input index is not in bounds.
+          if (!(lhs_spatial_index >= 0 &&
+                lhs_spatial_index < lhs_shape.dimensions(input_spatial_dim))) {
+            goto cnt;
+          }
+
+          lhs_linear_spatial_index +=
+              lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
+          rhs_linear_spatial_index +=
+              (window_dim.window_reversal()
+                   ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
+                   : rhs_spatial_index[ki]) *
+              rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
+        }
+
         for (int64 rhs_iz = 0; rhs_iz < input_feature_group_size; ++rhs_iz) {
           const int64 iz =
               feature_group_index * input_feature_group_size + rhs_iz;
 
-          int64 lhs_linear_index = 0;
+          int64 lhs_linear_index = lhs_linear_spatial_index;
           lhs_linear_index += out_index[output_batch_dim] *
                               lhs_dim_multipliers[input_batch_dim];
           lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
 
-          int64 rhs_linear_index = 0;
+          int64 rhs_linear_index = rhs_linear_spatial_index;
           rhs_linear_index += out_index[output_z_dim] *
                               rhs_dim_multipliers[kernel_output_z_dim];
           rhs_linear_index += rhs_iz * rhs_dim_multipliers[kernel_input_z_dim];
 
-          // Find corresponding spatial dimension index for input (lhs).
-          for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
-            // Spatial dimension number for input (lhs) and output.
-            const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki);
-            const int64 output_spatial_dim =
-                dnums.output_spatial_dimensions(ki);
-
-            // Calculate lhs (input) index without taking base dilation into
-            // account.
-            const auto& window_dim = window.dimensions(ki);
-            const int64 undilated_index =
-                out_index[output_spatial_dim] * window_dim.stride() -
-                window_dim.padding_low() +
-                rhs_spatial_index[ki] * window_dim.window_dilation();
-            // Skip if the lhs (input) index is to be dilated.  As an
-            // optimization, skip this mod if there's no dilation.
-            if (window_dim.base_dilation() > 1 &&
-                undilated_index % window_dim.base_dilation() != 0) {
-              goto cnt;
-            }
-
-            // Calculate the actual lhs (input) index after dilation.  As an
-            // optimization, skip this integer divide if there's no dilation.
-            int64 lhs_spatial_index;
-            if (window_dim.base_dilation() > 1) {
-              lhs_spatial_index = undilated_index / window_dim.base_dilation();
-            } else {
-              lhs_spatial_index = undilated_index;
-            }
-            lhs_linear_index +=
-                lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
-
-            // Skip if input index is not in bounds.
-            if (!(lhs_spatial_index >= 0 &&
-                  lhs_spatial_index <
-                      lhs_shape.dimensions(input_spatial_dim))) {
-              goto cnt;
-            }
-
-            rhs_linear_index +=
-                (window_dim.window_reversal()
-                     ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
-                     : rhs_spatial_index[ki]) *
-                rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
-          }
-
           result_val +=
               static_cast<ElementwiseT>(lhs_literal_data[lhs_linear_index]) *
               static_cast<ElementwiseT>(rhs_literal_data[rhs_linear_index]);
@@ -1556,10 +1553,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           const auto& row_data = row_to_sort.data<NativeT>();
 
           std::vector<NativeT> result_data(row_data.begin(), row_data.end());
-          std::sort(result_data.begin(), result_data.end(),
-                    [](const NativeT& a, const NativeT& b) {
-                      return SafeLess<NativeT>(a, b);
-                    });
+          std::stable_sort(result_data.begin(), result_data.end(),
+                           [](const NativeT& a, const NativeT& b) {
+                             return SafeLess<NativeT>(a, b);
+                           });
           Literal sorted_row(ShapeUtil::MakeShape(keys->shape().element_type(),
                                                   {sort_dim_elements}));
           sorted_row.PopulateR1(absl::Span<const NativeT>(result_data));
@@ -2442,7 +2439,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         parent_->evaluated_[reduce_precision],
         ElementWiseUnaryOp(reduce_precision, [reduce_precision](
                                                  ElementwiseT elem) {
-          uint32_t value_as_int = tensorflow::bit_cast<uint32_t>(elem);
+          uint32_t value_as_int = absl::bit_cast<uint32_t>(elem);
           const uint32_t mantissa_bits = reduce_precision->mantissa_bits();
           const uint32_t exponent_bits = reduce_precision->exponent_bits();
 
@@ -2515,7 +2512,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             value_as_int = x_underflows ? x_signed_zero : value_as_int;
           }
 
-          float reduced_result = tensorflow::bit_cast<float>(value_as_int);
+          float reduced_result = absl::bit_cast<float>(value_as_int);
           if (std::isnan(elem)) {
             reduced_result = mantissa_bits > 0
                                  ? elem
@@ -2546,12 +2543,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
   template <typename NativeT,
             typename std::enable_if<
-                std::is_same<NativeT, float>::value ||
-                std::is_same<NativeT, int32>::value ||
-                std::is_same<NativeT, uint32>::value>::type* = nullptr>
+                std::is_integral<NativeT>::value ||
+                std::is_floating_point<NativeT>::value>::type* = nullptr>
   Status HandleIota(HloInstruction* instruction) {
     auto* iota = Cast<HloIotaInstruction>(instruction);
-    std::vector<NativeT> data(iota->shape().dimensions(iota->iota_dimension()));
+    // Avoid using std::vector since std::vector<bool> does not convert to
+    // absl::Span<bool>.
+    absl::InlinedVector<NativeT, 1> data(
+        iota->shape().dimensions(iota->iota_dimension()));
     std::iota(data.begin(), data.end(), 0);
     auto result = LiteralUtil::CreateR1<NativeT>(data);
 
@@ -2568,9 +2567,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
   template <typename NativeT,
             typename std::enable_if<
-                !(std::is_same<NativeT, float>::value ||
-                  std::is_same<NativeT, int32>::value ||
-                  std::is_same<NativeT, uint32>::value)>::type* = nullptr>
+                !(std::is_integral<NativeT>::value ||
+                  std::is_floating_point<NativeT>::value)>::type* = nullptr>
   Status HandleIota(HloInstruction* iota) {
     return InvalidArgument("Unsupported type for iota");
   }
@@ -2613,8 +2611,17 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       std::vector<int64> base_index(rank);
       bool out_of_bound = false;
       for (int64 i = 0; i < rank; ++i) {
-        base_index[i] = window_count_index[i] * window.dimensions(i).stride() +
-                        window_index[i] - window.dimensions(i).padding_low();
+        base_index[i] =
+            window_count_index[i] * window.dimensions(i).stride() +
+            window_index[i] * window.dimensions(i).window_dilation() -
+            window.dimensions(i).padding_low();
+        // We are not in the base area if the dilation placed us out of bounds.
+        if (base_index[i] % window.dimensions(i).base_dilation() != 0) {
+          out_of_bound = true;
+          break;
+        }
+        // Apply the dilation to the base area.
+        base_index[i] /= window.dimensions(i).base_dilation();
         if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) {
           out_of_bound = true;
           break;
@@ -2713,17 +2720,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const auto shape = instruction->shape();
     const auto* lhs = instruction->operand(0);
     const auto* rhs = instruction->operand(1);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast
-    // is removed.
-    if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
-          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s vs %s: ",
-          ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()),
-          ShapeUtil::HumanString(rhs->shape()));
-    }
+    TF_RET_CHECK(ShapeUtil::SameDimensions(shape, rhs->shape()));
+    TF_RET_CHECK(ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()));
 
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
@@ -2747,19 +2745,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const auto* lhs = instruction->operand(0);
     const auto* rhs = instruction->operand(1);
     const auto* ehs = instruction->operand(2);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit
-    // broadcast is removed.
-    if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
-          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
-          ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s vs %s vs %s: ",
-          ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()),
-          ShapeUtil::HumanString(rhs->shape()),
-          ShapeUtil::HumanString(ehs->shape()));
-    }
+    TF_RET_CHECK(ShapeUtil::SameDimensions(shape, lhs->shape()));
+    TF_RET_CHECK(ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()));
+    TF_RET_CHECK(ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()));
 
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index ce4cad42355ec5881f2ae14f4dd52a0588d51cf7..2df8eb962ae54eb5b9492fdeb274eec309a8288f 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -28,7 +28,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
-HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
+HloProfileIndexMap::HloProfileIndexMap(const HloModule& module,
+                                       absl::Span<const string> extra_metrics) {
   size_t current_profile_index = 0;
   for (xla::HloComputation* computation : module.MakeComputationPostOrder()) {
     InsertOrDie(&computation_to_profile_idx_, computation,
@@ -40,11 +41,15 @@ HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
                   current_profile_index++);
     }
   }
+  for (const string& key : extra_metrics) {
+    InsertOrDie(&extra_metric_to_profile_idx_, key, current_profile_index++);
+  }
 }
 
 std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
     const HloProfileIndexMap& hlo_profile_index_map,
-    const HloCostAnalysis& cost_analysis) {
+    const HloCostAnalysis& cost_analysis,
+    const string& entry_computation_name) {
   using HloComputationInfo = HloProfilePrinterData::HloComputationInfo;
   using HloInstructionInfo = HloProfilePrinterData::HloInstructionInfo;
 
@@ -105,6 +110,14 @@ std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
     }
   }
 
+  // Add extra metrics if any.
+  for (const auto& pair : hlo_profile_index_map.extra_metric_to_profile_idx()) {
+    profile_printer_data->mutable_extra_metrics()->insert(
+        {pair.first, pair.second});
+  }
+
+  profile_printer_data->set_entry_computation(entry_computation_name);
+
   return profile_printer_data;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index be989846ef5cd2645da88ac9bbfea9534dd47821..da30e15908328f9aa7fe282656a6d44ab7348195 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EXECUTION_PROFILE_H_
 
 #include <unordered_map>
+#include <vector>
 
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
@@ -34,7 +35,10 @@ class HloInstruction;
 class HloProfileIndexMap {
  public:
   // Scans `module` to populate this instance of HloProfileIndexMap.
-  explicit HloProfileIndexMap(const HloModule& module);
+  explicit HloProfileIndexMap(const HloModule& module)
+      : HloProfileIndexMap(module, {}) {}
+  explicit HloProfileIndexMap(const HloModule& module,
+                              absl::Span<const string> extra_metrics);
 
   HloProfileIndexMap(const HloProfileIndexMap&) = default;
   HloProfileIndexMap(HloProfileIndexMap&&) = default;
@@ -50,6 +54,10 @@ class HloProfileIndexMap {
     return FindOrDie(computation_to_profile_idx(), &computation);
   }
 
+  size_t GetProfileIndexFor(const string& key) const {
+    return xla::FindOrDie(extra_metric_to_profile_idx(), key);
+  }
+
   size_t instruction_count() const {
     return instruction_to_profile_idx().size();
   }
@@ -58,8 +66,12 @@ class HloProfileIndexMap {
     return computation_to_profile_idx().size();
   }
 
+  size_t extra_metrics_count() const {
+    return extra_metric_to_profile_idx().size();
+  }
+
   size_t total_count() const {
-    return instruction_count() + computation_count();
+    return instruction_count() + computation_count() + extra_metrics_count();
   }
 
   const std::unordered_map<const HloInstruction*, int64>&
@@ -72,15 +84,20 @@ class HloProfileIndexMap {
     return computation_to_profile_idx_;
   }
 
+  const std::unordered_map<string, int64>& extra_metric_to_profile_idx() const {
+    return extra_metric_to_profile_idx_;
+  }
+
  private:
   std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx_;
   std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
+  std::unordered_map<string, int64> extra_metric_to_profile_idx_;
 };
 
 // Create an instance of `HloProfilePrinterData`.
 std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData(
     const HloProfileIndexMap& hlo_profile_index_map,
-    const HloCostAnalysis& cost_analysis);
+    const HloCostAnalysis& cost_analysis, const string& entry_computation_name);
 
 // Describes how much time each HLO operation took.
 //
@@ -113,6 +130,12 @@ class HloExecutionProfile {
         total_cycles_executed;
   }
 
+  // Record extra metric.
+  void set_extra_metrics(const string& metric, uint64 value) {
+    profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(metric)] =
+        value;
+  }
+
   // Returns a version of the execution profile suitable for performance
   // debugging; e.g. emits cycle counts, execution time at the nominal device
   // frequency, and the effective throughput given the provided cost_analysis
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index 460ae2b5eca78659f86df1227e6a0a4e57508611..5be9dba3aa49d63c580cd6f5800f608667826b6a 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -54,7 +54,8 @@ TEST_F(HloExecutionProfileTest, Basic) {
   HloCostAnalysis cost_analysis(shape_size_function);
   HloProfileIndexMap profile_index_map(*hlo_module);
   std::unique_ptr<HloProfilePrinterData> profile_printer =
-      CreateHloProfilePrinterData(profile_index_map, cost_analysis);
+      CreateHloProfilePrinterData(profile_index_map, cost_analysis,
+                                  hlo_module->entry_computation()->name());
   HloExecutionProfile execution_profile(profile_printer.get(),
                                         &profile_index_map);
 
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c919dbd82d3668c477bf37074f1d56f8cb7d9506
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+
+namespace xla {
+
+namespace {
+
+StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
+  if (instr->opcode() != HloOpcode::kGetDimensionSize) {
+    return false;
+  }
+  HloComputation* computation = instr->parent();
+
+  TF_ASSIGN_OR_RETURN(auto legal_shape,
+                      ShapeInference::InferGetDimensionSizeShape(
+                          instr->operand(0)->shape(), instr->dimension()));
+  TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape));
+  TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), U32));
+  uint32 size = instr->operand(0)->shape().dimensions(instr->dimension());
+  HloInstruction* new_instr = computation->AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
+  TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr));
+  return true;
+}
+
+}  // namespace
+
+StatusOr<bool> HloGetDimensionSizeRewriter::Run(HloModule* module) {
+  bool changed = false;
+  HloProto proto;
+  *proto.mutable_hlo_module() = module->ToProto();
+  for (auto* computation : module->computations()) {
+    for (auto instruction : computation->instructions()) {
+      TF_ASSIGN_OR_RETURN(bool replaced, ReplaceGetSize(instruction));
+      changed = changed || replaced;
+    }
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..30f44c23a835b3bcc935caaa917e040e07c4e703
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Pass to replace a kGetDimensionSize instruction with a constant instruction.
+class HloGetDimensionSizeRewriter : public HloModulePass {
+ public:
+  absl::string_view name() const override {
+    return "hlo-get-dimension-size-rewriter";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a86aebdd5b64240e6e07d8e8050c0c8681cce765
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class HloGetDimensionSizeRewriterTest : public HloTestBase {
+ protected:
+  HloGetDimensionSizeRewriterTest() {}
+};
+
+TEST_F(HloGetDimensionSizeRewriterTest, Ok) {
+  auto module = ParseHloString(R"(
+HloModule _
+ENTRY gds {
+  p = s32[3,4] parameter(0)
+  size0 = u32[] get-dimension-size(p), dimensions={0}
+  size1 = u32[] get-dimension-size(p), dimensions={1}
+  ROOT mul = u32[] multiply(size0, size1)
+})")
+                    .ValueOrDie();
+  HloGetDimensionSizeRewriter pass;
+  EXPECT_TRUE(pass.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Multiply(op::Constant(), op::Constant()));
+}
+
+TEST_F(HloGetDimensionSizeRewriterTest, IllegalType) {
+  auto module = ParseHloString(R"(
+HloModule _
+ENTRY gds {
+  p = s32[3]{0} parameter(0)
+  ROOT gds = s64[] get-dimension-size(p), dimensions={0}
+})")
+                    .ValueOrDie();
+  HloGetDimensionSizeRewriter pass;
+  EXPECT_FALSE(pass.Run(module.get()).ok());
+}
+
+TEST_F(HloGetDimensionSizeRewriterTest, IllegalDimension) {
+  auto module = ParseHloString(R"(
+HloModule _
+ENTRY gds {
+  p = f32[2,5] parameter(0)
+  ROOT gds = u32[] get-dimension-size(p), dimensions={2}
+})")
+                    .ValueOrDie();
+  HloGetDimensionSizeRewriter pass;
+  EXPECT_FALSE(pass.Run(module.get()).ok());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 13a74fd8a115c5dc9a9518b226dfee4445cc7180..302eca656be53a3cec86ddbf05a7fa3925c5185b 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <deque>
 #include <map>
 #include <memory>
+#include <queue>
 #include <string>
 #include <tuple>
 #include <unordered_map>
@@ -38,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/window_util.h"
@@ -111,11 +113,6 @@ class NodeFilter {
            result == kSomeUsersOmitted;
   }
 
-  bool ShowFusionSubcomputation(const HloInstruction* instr) const {
-    CHECK_EQ(instr->opcode(), HloOpcode::kFusion);
-    return Show(instr) && !SomeOrAllOperandsOmitted(instr);
-  }
-
  private:
   std::function<NodeFilterResult(const HloInstruction* instr)> filter_;
 };
@@ -240,34 +237,28 @@ string HtmlLikeStringSanitize(absl::string_view s) {
 // it to a short string lets us tell the user what the subcomputation is without
 // drawing it as a graph.
 optional<string> MatchTrivialComputation(const HloComputation* computation) {
+  namespace m = match;
+
   if (computation->instruction_count() != 3) {
     return nullopt;
   }
-
   HloInstruction* root = computation->root_instruction();
-  if (root->operand_count() != 2) {
-    return nullopt;
-  }
-
-  // Check that both of the operands to the root are parameters.
-  const HloInstruction* operand0 = root->operand(0);
-  const HloInstruction* operand1 = root->operand(1);
-  if (operand0->opcode() != HloOpcode::kParameter ||
-      operand1->opcode() != HloOpcode::kParameter) {
-    return nullopt;
-  }
-
-  // Check that the two operands of root are param0 and param1.  All of the
-  // opcodes we recognize are commutative, so we're OK with either order.
-  auto n0 = operand0->parameter_number();
-  auto n1 = operand1->parameter_number();
-  if (!(n0 == 0 && n1 == 1) && !(n1 == 0 && n0 == 1)) {
+  const HloInstruction *param0, *param1;
+  if (!Match(root, m::Op()
+                       .WithNumOperands(2)
+                       .WithShape(m::Shape().IsEffectiveScalar())
+                       .WithBinaryOperandsAnyOrder(
+                           m::Parameter(&param0, 0)
+                               .WithShape(m::Shape().IsEffectiveScalar()),
+                           m::Parameter(&param1, 1)
+                               .WithShape(m::Shape().IsEffectiveScalar())))) {
     return nullopt;
   }
 
-  // If the params are reversed, check that the operation being performed is
-  // commutative.
-  if (n0 == 1) {
+  // If the params are reversed (i.e. operand0 is param1 and operand1 is
+  // param0), check that the operation being performed is commutative.
+  if (root->operand(0) == param1) {
+    CHECK_EQ(root->operand(1), param0);
     switch (root->opcode()) {
       case HloOpcode::kLe:
       case HloOpcode::kGe:
@@ -279,13 +270,6 @@ optional<string> MatchTrivialComputation(const HloComputation* computation) {
     }
   }
 
-  // Check that the root and params are all effective scalars.
-  if (!ShapeUtil::IsEffectiveScalar(root->shape()) ||
-      !ShapeUtil::IsEffectiveScalar(operand0->shape()) ||
-      !ShapeUtil::IsEffectiveScalar(operand1->shape())) {
-    return nullopt;
-  }
-
   // If we recognize the root's opcode, we've successfully pattern-matched!
   switch (root->opcode()) {
     case HloOpcode::kAdd:
@@ -578,7 +562,7 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) {
 
   // Show the subcomputation if we're showing any of its members.
   return std::any_of(
-      computation_->instructions().begin(), computation_->instructions().end(),
+      subcomp->instructions().begin(), subcomp->instructions().end(),
       [&](const HloInstruction* instr) { return filter_.Show(instr); });
 }
 
@@ -987,6 +971,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kGetTupleElement:
     case HloOpcode::kTrace:
     case HloOpcode::kAfterAll:
+    case HloOpcode::kAddDependency:
     case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
@@ -1043,6 +1028,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kDomain:
     case HloOpcode::kFusion:
     case HloOpcode::kMap:
+    case HloOpcode::kGetDimensionSize:
       return kGray;
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kAllToAll:
@@ -1266,12 +1252,12 @@ const HloInstruction* HloDotDumper::GetNodeForEdge(
 
 class GraphRendererRegistry {
  public:
-  void AddRenderer(GraphRendererInterface* graph_renderer) {
+  void SetRenderer(std::shared_ptr<GraphRendererInterface> graph_renderer) {
     tensorflow::mutex_lock lock(mu_);
     graph_renderer_ = graph_renderer;
   }
 
-  GraphRendererInterface* GetDefaultRenderer() {
+  std::shared_ptr<GraphRendererInterface> GetDefaultRenderer() {
     tensorflow::mutex_lock lock(mu_);
     return graph_renderer_;
   }
@@ -1283,20 +1269,21 @@ class GraphRendererRegistry {
 
  private:
   tensorflow::mutex mu_;
-  GraphRendererInterface* graph_renderer_ = nullptr;
+  std::shared_ptr<GraphRendererInterface> graph_renderer_ GUARDED_BY(mu_);
 };
 
 }  // namespace
 
-Registrar::Registrar(GraphRendererInterface* dumper) {
-  GraphRendererRegistry::Default()->AddRenderer(dumper);
+Registrar::Registrar(std::shared_ptr<GraphRendererInterface> dumper) {
+  GraphRendererRegistry::Default()->SetRenderer(dumper);
 }
 
 namespace {
 
 // Gets a NodeFilter that includes roughly all instructions whose distance from
 // root is <= radius.
-NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
+NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
+                                      int64 radius) {
   // First, find the neighborhood of nodes with distance from root <= radius.
   // These nodes are our initial set of "normal" nodes.
   std::unordered_map<const HloInstruction*, NodeFilterResult> nodes;
@@ -1403,6 +1390,56 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
   });
 }
 
+// Gets a node filter that includes nodes on all paths from `from` to `to`.  If
+// the all-paths set contains more than max_nodes elements, includes the nodes
+// on the shortest paths and sets hit_limit to true.
+NodeFilter MakeNodeFromToFilter(const HloInstruction* from,
+                                const HloInstruction* to, int64 max_nodes,
+                                bool* hit_limit) {
+  *hit_limit = false;
+
+  // Elements in the queue are paths through the graph.
+  std::deque<std::vector<const HloInstruction*>> queue;
+  queue.push_front({from});
+
+  // Compute the set of nodes we want to show using a slightly-modified
+  // Djikstra's algorithm.  The only real difference is, rather than stopping
+  // when we find a (shortest) path, we continue until we've found max_nodes
+  // nodes on some path.
+  std::unordered_set<const HloInstruction*> visited;
+  std::unordered_set<const HloInstruction*> to_display = {from, to};
+  while (!queue.empty() && to_display.size() < max_nodes) {
+    std::vector<const HloInstruction*> path = std::move(queue.front());
+    queue.pop_front();
+    if (!visited.insert(path.back()).second) {
+      continue;
+    }
+
+    for (const auto* user : path.back()->users()) {
+      if (user == to) {
+        auto it = path.begin();
+        for (; it != path.end() && to_display.size() < max_nodes; ++it) {
+          to_display.insert(*it);
+        }
+        if (it != path.end()) {
+          *hit_limit = true;
+        }
+      } else if (!visited.count(user)) {
+        auto new_path = path;
+        new_path.push_back(user);
+        queue.push_back(std::move(new_path));
+      }
+    }
+  }
+
+  return NodeFilter([=](const HloInstruction* instr) {
+    if (instr == from || instr == to) {
+      return kHighlightNode;
+    }
+    return to_display.count(instr) ? kNormalNode : kHideNode;
+  });
+}
+
 string SaveGraph(const string& graph,
                  GraphRendererInterface::GraphKind graph_kind,
                  const string& dest_path) {
@@ -1482,7 +1519,7 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius,
   auto debug_options = node.GetModule()->config().debug_options();
   string label =
       StrCat("Neighborhood of ", radius, " nodes around ", node.name());
-  NodeFilter filter = MakeNodeFilter(&node, radius);
+  NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius);
   string graph =
       HloDotDumper(node.parent(), label, debug_options, show_backend_config,
                    /*profile=*/nullptr, filter)
@@ -1490,6 +1527,29 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius,
   return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
 }
 
+string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
+                          int64 max_nodes, bool show_backend_config) {
+  CHECK_EQ(from.parent(), to.parent()) << "Nodes must be in same computation!";
+  auto debug_options = from.GetModule()->config().debug_options();
+
+  bool hit_limit = false;
+  NodeFilter filter = MakeNodeFromToFilter(&from, &to, max_nodes, &hit_limit);
+  string label;
+  if (!hit_limit) {
+    label = StrCat("All paths from ", from.name(), " to ", to.name());
+  } else {
+    label = StrCat(max_nodes, " nodes on the shortest paths from ", from.name(),
+                   " to ", to.name(),
+                   "<br/><br/>***SHOWING ONLY A SUBSET OF ALL PATHS BETWEEN "
+                   "NODES***<br/><br/>");
+  }
+  string graph =
+      HloDotDumper(from.parent(), label, debug_options, show_backend_config,
+                   /*profile=*/nullptr, filter)
+          .Dump();
+  return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
+}
+
 void DumpText(const HloModule& module, const string& label,
               const string& directory_path, bool do_prefix) {
   Env* env = Env::Default();
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index 0b11f34abb7f0d937a24d11f4dc5d2d6a0aae6e7..de1eefab776f9c3d2c73959a5cd267e938a78a32 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -66,6 +66,12 @@ string DumpGraph(const HloComputation& computation, const string& label,
 string DumpNeighborhoodAround(const HloInstruction& node, int radius,
                               bool show_backend_config = false);
 
+// Dumps nodes on any of the paths from `from` to `to`.  If there are more than
+// max_nodes on all paths, restricts to the max_nodes nodes on the shortest
+// paths.
+string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
+                          int64 max_nodes, bool show_backend_config = false);
+
 // Dumps the HloModule::ToString() as a file into the provided directory path
 // suffixed with the provided label.
 //
@@ -87,13 +93,13 @@ void DumpText(const HloModule& module, const string& label,
 // Class that registers a graph renderer.
 class Registrar {
  public:
-  Registrar(GraphRendererInterface* dumper);
+  Registrar(std::shared_ptr<GraphRendererInterface> dumper);
 };
 
-#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, ctr, ...)   \
-  static ::xla::hlo_graph_dumper::Registrar                       \
-      XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr)(new factory, \
-                                                     ##__VA_ARGS__)
+#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, ctr, ...) \
+  static ::xla::hlo_graph_dumper::Registrar                     \
+      XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr)(           \
+          std::make_shared<factory>(), ##__VA_ARGS__)
 
 // __COUNTER__ must go through another macro to be properly expanded
 #define XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr) ___##ctr##__object_
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e1597fd03db0a78aa560340b7b9b64fe500df0c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
@@ -0,0 +1,207 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index,
+                                             int64 param_number,
+                                             const ShapeIndex& param_index) {
+  TF_RET_CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index))
+      << absl::StrCat("Tring to set up alias at ", output_index.ToString(),
+                      " which is an invalid index for shape ",
+                      ShapeUtil::HumanString(alias_.shape()));
+  // Output can't be aliased with multiple parameters.
+  TF_RET_CHECK(!alias_.element(output_index)) << absl::StrFormat(
+      "Trying to set up output alias for param %lld at %s but failed: output "
+      "index %s is already aliased with param %lld at %s",
+      param_number, param_index.ToString(), output_index.ToString(),
+      alias_.element(output_index)->first,
+      alias_.element(output_index)->second.ToString());
+  (*alias_.mutable_element(output_index)) =
+      std::make_pair(param_number, param_index);
+  VLOG(4) << "Set up alias between output index " << output_index.ToString()
+          << " and parameter " << param_index << " at index "
+          << param_index.ToString();
+  return Status::OK();
+}
+
+HloInputOutputAliasProto HloInputOutputAliasConfig::ToProto() const {
+  HloInputOutputAliasProto result;
+  alias_.ForEachElement(
+      [&](const ShapeIndex& index,
+          const absl::optional<std::pair<int64, ShapeIndex>>& data) {
+        if (data) {
+          HloInputOutputAliasProto::AliasEntryProto entry;
+          for (int64 i : index) {
+            entry.add_output_shape_index(i);
+          }
+          entry.set_parameter_number(data->first);
+          for (int64 i : data->second) {
+            entry.add_parameter_shape_index(i);
+          }
+          result.add_entries()->Swap(&entry);
+        }
+      });
+  return result;
+}
+
+StatusOr<HloInputOutputAliasConfig> HloInputOutputAliasConfig::CreateFromProto(
+    const Shape& output_shape, const HloInputOutputAliasProto& proto) {
+  HloInputOutputAliasConfig result(output_shape);
+  for (const HloInputOutputAliasProto::AliasEntryProto& entry :
+       proto.entries()) {
+    ShapeIndex output_index(entry.output_shape_index().begin(),
+                            entry.output_shape_index().end());
+
+    int64 param_number = entry.parameter_number();
+    ShapeIndex param_index(entry.parameter_shape_index().begin(),
+                           entry.parameter_shape_index().end());
+    TF_RETURN_IF_ERROR(
+        result.SetUpAlias(output_index, param_number, param_index));
+  }
+
+  return result;
+}
+
+string HloInputOutputAliasConfig::ToString() const {
+  std::vector<string> pieces;
+  pieces.push_back("HloInputOutputAliasConfig");
+
+  ForEachAlias([&](const ShapeIndex& output_index, int64 param_number,
+                   const ShapeIndex& param_index) {
+    pieces.push_back(absl::StrFormat(
+        "  OutputIndex %s is aliased with parameter %lld at %s:",
+        output_index.ToString(), param_number, param_index.ToString()));
+  });
+
+  return absl::StrJoin(pieces, "\n");
+}
+
+bool HloInputOutputAliasConfig::ParameterHasAlias(
+    int64 param_number, const ShapeIndex& param_index) const {
+  bool output = false;
+  alias_.ForEachElement(
+      [&](const xla::ShapeIndex&,
+          absl::optional<std::pair<int64, ShapeIndex>> alias) {
+        if (alias && alias->first == param_number &&
+            alias->second == param_index) {
+          output = true;
+        }
+      });
+  return output;
+}
+
+absl::optional<ShapeIndex> HloInputOutputAliasConfig::GetAliasedOutput(
+    int64 param_number, const ShapeIndex& param_index) const {
+  absl::optional<ShapeIndex> output;
+  alias_.ForEachElement(
+      [&](const xla::ShapeIndex& output_index,
+          absl::optional<std::pair<int64, ShapeIndex>> alias) {
+        if (alias && alias->first == param_number &&
+            alias->second == param_index) {
+          output = output_index;
+        }
+      });
+  return output;
+}
+
+absl::optional<std::pair<int64, ShapeIndex>>
+HloInputOutputAliasConfig::GetAliasedParameter(
+    const ShapeIndex& output_index) const {
+  CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index));
+  return alias_.element(output_index);
+}
+
+void HloInputOutputAliasConfig::ForEachAlias(AliasFn fn) const {
+  alias_.ForEachElement(
+      [&](const ShapeIndex& output_index,
+          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+        if (aliased) {
+          fn(output_index, aliased->first, aliased->second);
+        }
+      });
+}
+
+Status HloInputOutputAliasConfig::ForEachAliasWithStatus(
+    AliasFnWithStatus fn) const {
+  return alias_.ForEachElementWithStatus(
+      [&](const ShapeIndex& output_index,
+          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+        if (aliased) {
+          TF_RETURN_IF_ERROR(fn(output_index, aliased->first, aliased->second));
+        }
+        return Status::OK();
+      });
+}
+
+Status HloInputOutputAliasConfig::Verify(
+    const HloModule& module,
+    std::function<int64(const Shape&)> size_func) const {
+  std::vector<ShapeTree<bool>> param_has_seen;
+  const HloComputation* entry = module.entry_computation();
+  for (int64 i = 0; i < entry->num_parameters(); ++i) {
+    HloInstruction* param = entry->parameter_instruction(i);
+    param_has_seen.emplace_back(param->shape());
+  }
+  return ForEachAliasWithStatus([&](const ShapeIndex& output_index,
+                                    int64 param_number,
+                                    const ShapeIndex& param_index) -> Status {
+    const HloInstruction* root = entry->root_instruction();
+
+    TF_RET_CHECK(0 <= param_number);
+    TF_RET_CHECK(entry->num_parameters() > param_number);
+    const Shape& param_shape =
+        entry->parameter_instruction(param_number)->shape();
+    const Shape& output_shape = root->shape();
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(param_shape, param_index));
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(output_shape, output_index));
+
+    const Shape& param_subshape =
+        ShapeUtil::GetSubshape(param_shape, param_index);
+    const Shape& output_subshape =
+        ShapeUtil::GetSubshape(output_shape, output_index);
+    TF_RET_CHECK(LayoutUtil::IsDenseArray(param_subshape));
+    TF_RET_CHECK(LayoutUtil::IsDenseArray(output_subshape));
+
+    if (size_func(param_subshape) != size_func(output_subshape)) {
+      return InternalError(
+          "Expected aliased input %lld at index %s and output at index %s to "
+          "have the same size. Input sub-shape is %s with size %lld, output "
+          "sub-shape is %s with size %lld",
+          param_number, param_index.ToString(), output_index.ToString(),
+          ShapeUtil::HumanStringWithLayout(param_subshape),
+          size_func(param_subshape),
+          ShapeUtil::HumanStringWithLayout(output_subshape),
+          size_func(output_subshape));
+    }
+
+    // Check each param_number and param_index pair only show up once. No
+    // input can be aliased with output buffers.
+    TF_RET_CHECK(param_has_seen[param_number].element(param_index) == false);
+
+    *(param_has_seen[param_number].mutable_element(param_index)) = true;
+
+    return Status::OK();
+  });
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const HloInputOutputAliasConfig& config) {
+  out << config.ToString();
+  return out;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..439676b1546c4af7f781fb80bccffd5248309b0f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
@@ -0,0 +1,103 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INPUT_OUTPUT_ALIAS_CONFIG_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INPUT_OUTPUT_ALIAS_CONFIG_H_
+
+#include <utility>
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+class HloModule;
+
+// This class specifies the alias map from output index to parameter number and
+// parameter index in the entry computation.
+class HloInputOutputAliasConfig {
+ public:
+  HloInputOutputAliasConfig() = default;
+
+  explicit HloInputOutputAliasConfig(Shape shape) : alias_(shape) {}
+
+  virtual ~HloInputOutputAliasConfig() = default;
+
+  // Sets up alias config from `output_index` to `param_index` at
+  // `param_number`.
+  Status SetUpAlias(const ShapeIndex& output_index, int64 param_number,
+                    const ShapeIndex& param_index);
+
+  // Returns true if the given parameter is aliased with one of the output
+  // buffers.
+  bool ParameterHasAlias(int64 param_number,
+                         const ShapeIndex& param_index) const;
+
+  // (De)Serializes an HloInputOutoutAliasConfig to/from an
+  // HloInputOutoutAliasProto.
+  HloInputOutputAliasProto ToProto() const;
+
+  static StatusOr<HloInputOutputAliasConfig> CreateFromProto(
+      const Shape& output_shape, const HloInputOutputAliasProto& proto);
+
+  // Returns the output index that the given parameter and parameter index is
+  // aliased with. A nullopt is returned if there is no output that is aliased
+  // with the parameter number and index.
+  absl::optional<ShapeIndex> GetAliasedOutput(
+      int64 param_number, const ShapeIndex& param_index) const;
+
+  // Returns the number of parameter and index of the parameter buffer that the
+  // given output buffer index is aliased with. A nullopt is returned if there
+  // is no parameter is aliased with the specific output.
+  absl::optional<std::pair<int64, ShapeIndex>> GetAliasedParameter(
+      const ShapeIndex& output_index) const;
+
+  using AliasFn =
+      std::function<void(const ShapeIndex& output_index, int64 param_number,
+                         const ShapeIndex& param_index)>;
+
+  // Iterates through each aliased output and input.
+  void ForEachAlias(AliasFn fn) const;
+
+  using AliasFnWithStatus =
+      std::function<Status(const ShapeIndex& output_index, int64 param_number,
+                           const ShapeIndex& param_index)>;
+
+  // Verifies that the given config is valid for the given module.
+  // Specifically, the config's input and output should be in-bound and size of
+  // the aliased buffers should match.
+  Status Verify(const HloModule& module,
+                std::function<int64(const Shape&)> size_func_) const;
+
+  Status ForEachAliasWithStatus(AliasFnWithStatus fn) const;
+
+  string ToString() const;
+
+ private:
+  // A ShapeTree which indicates the list of buffers that's expected to be
+  // aliased. The key on this shape tree represents the output index. The value
+  // is a pair of parameter number and index into the buffer. If the value is
+  // nullopt, it means there is no parameter aliasing for this output.
+  ShapeTree<absl::optional<std::pair<int64, ShapeIndex>>> alias_;
+};
+
+std::ostream& operator<<(std::ostream& out,
+                         const HloInputOutputAliasConfig& config);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INPUT_OUTPUT_ALIAS_CONFIG_H_
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aeb9b0fdc8b6cca87731a2d4aae25120af6c3215
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
@@ -0,0 +1,210 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
+
+#include <memory>
+#include <string>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+class HloInputOutputAliasConfigTest : public HloTestBase {
+ protected:
+  void expect_aliased(const ShapeIndex& output_index, int64 param_number,
+                      const ShapeIndex& param_index,
+                      const HloInputOutputAliasConfig& config) {
+    absl::optional<ShapeIndex> aliased_output =
+        config.GetAliasedOutput(param_number, param_index);
+
+    EXPECT_TRUE(aliased_output);
+    EXPECT_EQ(aliased_output.value(), output_index);
+
+    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+        config.GetAliasedParameter(output_index);
+
+    EXPECT_TRUE(aliased_param);
+    EXPECT_EQ(aliased_param.value(), std::make_pair(param_number, param_index));
+  }
+
+  void expect_not_aliased(const ShapeIndex& output_index, int64 param_number,
+                          const ShapeIndex& param_index,
+                          const HloInputOutputAliasConfig& config) {
+    absl::optional<ShapeIndex> aliased_output =
+        config.GetAliasedOutput(param_number, param_index);
+
+    EXPECT_FALSE(aliased_output && aliased_output == output_index);
+
+    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+        config.GetAliasedParameter(output_index);
+
+    EXPECT_FALSE(aliased_param && aliased_param->first == param_number &&
+                 aliased_param->second == param_index);
+  }
+};
+
+TEST_F(HloInputOutputAliasConfigTest, SimpleAliasing) {
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  ROOT root = (f32[], f32[]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  HloInputOutputAliasConfig config(
+      module->entry_computation()->root_instruction()->shape());
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
+                                 /*param_index=*/{}));
+
+  expect_aliased(/*output_index=*/{0}, /*param_number=*/1,
+                 /*param_index=*/{}, config);
+
+  expect_not_aliased(/*output_index=*/{1}, /*param_number=*/1,
+                     /*param_index=*/{}, config);
+
+  expect_not_aliased(/*output_index=*/{0}, /*param_number=*/0,
+                     /*param_index=*/{}, config);
+}
+
+TEST_F(HloInputOutputAliasConfigTest, SimpleAliasingWithTupleInput) {
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  param = (f32[], f32[]) parameter(0)
+  gte1 = f32[] get-tuple-element(%param), index=0
+  gte2 = f32[] get-tuple-element(%param), index=1
+  ROOT root = (f32[], f32[]) tuple(%gte1, %gte2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  HloInputOutputAliasConfig config(
+      module->entry_computation()->root_instruction()->shape());
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
+                                 /*param_index=*/{0}));
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
+                                 /*param_index=*/{1}));
+
+  expect_aliased(/*output_index=*/{0}, /*param_number=*/0,
+                 /*param_index=*/{0}, config);
+
+  expect_aliased(/*output_index=*/{1}, /*param_number=*/0,
+                 /*param_index=*/{1}, config);
+
+  expect_not_aliased(/*output_index=*/{1}, /*param_number=*/1,
+                     /*param_index=*/{}, config);
+
+  expect_not_aliased(/*output_index=*/{0}, /*param_number=*/0,
+                     /*param_index=*/{}, config);
+}
+
+TEST_F(HloInputOutputAliasConfigTest, InputDoNotAliasTwice) {
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  ROOT root = (f32[], f32[]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  HloInputOutputAliasConfig config(
+      module->entry_computation()->root_instruction()->shape());
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
+                                 /*param_index=*/{}));
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
+                                 /*param_index=*/{}));
+
+  ASSERT_IS_NOT_OK(config.Verify(*module, [](const Shape& shape) {
+    return ShapeUtil::ByteSizeOf(shape);
+  }));
+}
+
+TEST_F(HloInputOutputAliasConfigTest, SizesMustMatch) {
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[4096] parameter(1)
+  ROOT root = (f32[], f32[4096]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  HloInputOutputAliasConfig config(
+      module->entry_computation()->root_instruction()->shape());
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
+                                 /*param_index=*/{}));
+
+  ASSERT_IS_NOT_OK(config.Verify(*module, [](const Shape& shape) {
+    return ShapeUtil::ByteSizeOf(shape);
+  }));
+}
+
+TEST_F(HloInputOutputAliasConfigTest, OutputDoNotAliasTwice) {
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[] parameter(1)
+  ROOT root = (f32[], f32[]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  HloInputOutputAliasConfig config(
+      module->entry_computation()->root_instruction()->shape());
+
+  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
+                                 /*param_index=*/{}));
+
+  ASSERT_IS_NOT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
+                                     /*param_index=*/{}));
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 2f6db7cd7c0ada166dc81f75c4a9989eb9d70638..21b1dbc1676cccd2fe5b331a1f9d6ff5e3a73fcd 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -93,7 +93,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       [&computation_map](int64 id) { return computation_map.contains(id); }))
       << proto.name() << " instruction references invalid computation id(s)";
 
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(proto.shape()));
+  Shape shape(proto.shape());
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
 
   switch (opcode) {
     // Ops migrated to subclasses.
@@ -101,23 +102,23 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.operand_ids_size() == 3)
           << "BatchNormTraining instruction should have 3 operands but sees "
           << proto.operand_ids_size();
-      instruction = CreateBatchNormTraining(
-          proto.shape(), operands(0), operands(1), operands(2), proto.epsilon(),
-          proto.feature_index());
+      instruction =
+          CreateBatchNormTraining(shape, operands(0), operands(1), operands(2),
+                                  proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kBatchNormInference:
       TF_RET_CHECK(proto.operand_ids_size() == 5)
           << "BatchNormInference instruction should have 5 operands but sees "
           << proto.operand_ids_size();
       instruction = CreateBatchNormInference(
-          proto.shape(), operands(0), operands(1), operands(2), operands(3),
+          shape, operands(0), operands(1), operands(2), operands(3),
           operands(4), proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kBatchNormGrad:
       TF_RET_CHECK(proto.operand_ids_size() == 5)
           << "BatchNormGrad instruction should have 5 operands but sees "
           << proto.operand_ids_size();
-      instruction = CreateBatchNormGrad(proto.shape(), operands(0), operands(1),
+      instruction = CreateBatchNormGrad(shape, operands(0), operands(1),
                                         operands(2), operands(3), operands(4),
                                         proto.epsilon(), proto.feature_index());
       break;
@@ -127,7 +128,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << proto.operand_ids_size();
       std::vector<int64> fft_length(proto.fft_length().begin(),
                                     proto.fft_length().end());
-      instruction = CreateFft(proto.shape(), operands(0), proto.fft_type(),
+      instruction = CreateFft(shape, operands(0), proto.fft_type(),
                               absl::Span<const int64>(fft_length));
       break;
     }
@@ -148,7 +149,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "Recv instruction should have 1 operand but sees "
           << proto.operand_ids_size();
-      instruction = CreateRecv(proto.shape().tuple_shapes(0), operands(0),
+      instruction = CreateRecv(shape.tuple_shapes(0), operands(0),
                                proto.channel_id(), proto.is_host_transfer());
       break;
     case HloOpcode::kRecvDone:
@@ -161,7 +162,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "Reverse instruction should have 1 operand but sees "
           << proto.operand_ids_size();
-      instruction = CreateReverse(proto.shape(), operands(0),
+      instruction = CreateReverse(shape, operands(0),
                                   std::vector<int64>(proto.dimensions().begin(),
                                                      proto.dimensions().end()));
       break;
@@ -170,7 +171,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Concatenate instruction should have 1 dimension but sees "
           << proto.dimensions_size();
       instruction =
-          CreateConcatenate(proto.shape(), all_operands(), proto.dimensions(0));
+          CreateConcatenate(shape, all_operands(), proto.dimensions(0));
       break;
     case HloOpcode::kReduce:
       TF_RET_CHECK(proto.operand_ids_size() % 2 == 0)
@@ -188,24 +189,23 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
             absl::MakeSpan(reduce_operands)
                 .subspan(reduce_operands.size() / 2, reduce_operands.size());
         instruction =
-            CreateReduce(proto.shape(), inputs, init_values,
+            CreateReduce(shape, inputs, init_values,
                          std::vector<int64>(proto.dimensions().begin(),
                                             proto.dimensions().end()),
                          computations(0));
       }
       break;
     case HloOpcode::kSort: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1 ||
-                   proto.operand_ids_size() == 2)
-          << "Sort instruction should have 1 or 2 operands but has "
+      TF_RET_CHECK(proto.operand_ids_size() >= 1)
+          << "Sort instruction should have at least 1 operand but has "
           << proto.operand_ids_size();
       TF_RET_CHECK(proto.dimensions().size() == 1)
           << "Sort instruction should have 1 dimension";
-      HloInstruction* keys = operands(0);
-      HloInstruction* values =
-          proto.operand_ids_size() == 2 ? operands(1) : nullptr;
-      instruction =
-          CreateSort(proto.shape(), proto.dimensions(0), keys, values);
+      auto sort_operands = all_operands();
+      HloInstruction* keys = sort_operands[0];
+      instruction = CreateSort(
+          shape, proto.dimensions(0), keys,
+          absl::Span<HloInstruction* const>(sort_operands).subspan(1));
       break;
     }
     case HloOpcode::kTranspose:
@@ -213,7 +213,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Transpose instruction should have 1 operand but sees "
           << proto.operand_ids_size();
       instruction =
-          CreateTranspose(proto.shape(), operands(0),
+          CreateTranspose(shape, operands(0),
                           std::vector<int64>(proto.dimensions().begin(),
                                              proto.dimensions().end()));
       break;
@@ -222,7 +222,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Broadcast instruction should have 1 operand but sees "
           << proto.operand_ids_size();
       instruction =
-          CreateBroadcast(proto.shape(), operands(0),
+          CreateBroadcast(shape, operands(0),
                           std::vector<int64>(proto.dimensions().begin(),
                                              proto.dimensions().end()));
       break;
@@ -230,7 +230,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "Map instruction should have 1 called computation but sees "
           << proto.called_computation_ids_size();
-      instruction = CreateMap(proto.shape(), all_operands(), computations(0));
+      instruction = CreateMap(shape, all_operands(), computations(0));
       break;
     case HloOpcode::kSlice: {
       TF_RET_CHECK(proto.operand_ids_size() == 1)
@@ -243,8 +243,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         slice_limits.push_back(slice_dimensions.limit());
         slice_strides.push_back(slice_dimensions.stride());
       }
-      instruction = CreateSlice(proto.shape(), operands(0), slice_starts,
-                                slice_limits, slice_strides);
+      instruction = CreateSlice(shape, operands(0), slice_starts, slice_limits,
+                                slice_strides);
       break;
     }
     case HloOpcode::kConstant: {
@@ -254,7 +254,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                             Literal::CreateFromProto(proto.literal()));
         instruction = CreateConstant(std::move(literal));
       } else {
-        instruction = absl::make_unique<HloConstantInstruction>(proto.shape());
+        instruction = absl::make_unique<HloConstantInstruction>(shape);
       }
       break;
     }
@@ -285,44 +285,54 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           tensorflow::gtl::FindPtrOrNull(computation_map, fusion_id);
       TF_RET_CHECK(fused_computation != nullptr)
           << "No fusion computation with id " << fusion_id;
-      instruction = CreateFusion(proto.shape(), fusion_kind, all_operands(),
-                                 fused_computation);
+      instruction =
+          CreateFusion(shape, fusion_kind, all_operands(), fused_computation);
       break;
     }
     case HloOpcode::kRng:
-      instruction =
-          CreateRng(proto.shape(), proto.distribution(), all_operands());
+      instruction = CreateRng(shape, proto.distribution(), all_operands());
       break;
     case HloOpcode::kParameter:
-      instruction = CreateParameter(proto.parameter_number(), proto.shape(),
-                                    proto.name());
+      instruction =
+          CreateParameter(proto.parameter_number(), shape, proto.name());
       break;
     case HloOpcode::kGetTupleElement:
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "GetTupleElement instruction should have 1 operand but sees "
           << proto.operand_ids_size();
-      instruction = CreateGetTupleElement(proto.shape(), operands(0),
-                                          proto.tuple_index());
+      instruction =
+          CreateGetTupleElement(shape, operands(0), proto.tuple_index());
       break;
     case HloOpcode::kReducePrecision:
-      instruction =
-          CreateReducePrecision(proto.shape(), operands(0),
-                                proto.exponent_bits(), proto.mantissa_bits());
+      TF_RET_CHECK(proto.operand_ids_size() == 1)
+          << "ReducePrecision instruction should have 1 operand but sees "
+          << proto.operand_ids_size();
+      instruction = CreateReducePrecision(
+          shape, operands(0), proto.exponent_bits(), proto.mantissa_bits());
       break;
     case HloOpcode::kInfeed: {
-      const Shape& data_shape =
-          ShapeUtil::GetTupleElementShape(proto.shape(), 0);
-      TF_RET_CHECK(proto.operand_ids_size() == 1);
+      TF_RET_CHECK(ShapeUtil::IsTuple(shape) &&
+                   (ShapeUtil::TupleElementCount(shape) == 2))
+          << "Infeed should have a tuple shape with 2 operands, but has: "
+          << shape;
+      const Shape& data_shape = ShapeUtil::GetTupleElementShape(shape, 0);
+      TF_RET_CHECK(proto.operand_ids_size() == 1)
+          << "Infeed instruction should have 1 operand but sees "
+          << proto.operand_ids_size();
       instruction =
           CreateInfeed(data_shape, operands(0), proto.infeed_config());
     } break;
-    case HloOpcode::kOutfeed:
-      TF_RET_CHECK(proto.operand_ids_size() == 2);
+    case HloOpcode::kOutfeed: {
+      TF_RET_CHECK(proto.operand_ids_size() == 2)
+          << "Outfeed instruction should have 2 operands but sees "
+          << proto.operand_ids_size();
+      Shape outfeed_shape(proto.outfeed_shape());
       TF_RETURN_IF_ERROR(
-          ShapeUtil::ValidateShapeWithOptionalLayout(proto.outfeed_shape()));
-      instruction = CreateOutfeed(proto.outfeed_shape(), operands(0),
-                                  operands(1), proto.outfeed_config());
+          ShapeUtil::ValidateShapeWithOptionalLayout(outfeed_shape));
+      instruction = CreateOutfeed(outfeed_shape, operands(0), operands(1),
+                                  proto.outfeed_config());
       break;
+    }
     case HloOpcode::kCrossReplicaSum: {
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "CrossReplicaSum should have 1 called computation but sees "
@@ -332,7 +342,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         all_reduce_id = proto.all_reduce_id();
       }
       instruction = CreateCrossReplicaSum(
-          proto.shape(), all_operands(), computations(0),
+          shape, all_operands(), computations(0),
           /*replica_groups=*/
           std::vector<ReplicaGroup>(proto.replica_groups().begin(),
                                     proto.replica_groups().end()),
@@ -342,21 +352,24 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     }
     case HloOpcode::kAllToAll: {
       instruction = CreateAllToAll(
-          proto.shape(), all_operands(),
+          shape, all_operands(),
           /*replica_groups=*/
           std::vector<ReplicaGroup>(proto.replica_groups().begin(),
                                     proto.replica_groups().end()));
       break;
     }
     case HloOpcode::kCollectivePermute: {
+      TF_RET_CHECK(proto.operand_ids_size() == 1)
+          << "CollectivePermute instruction should have 1 operand but sees "
+          << proto.operand_ids_size();
       std::vector<std::pair<int64, int64>> source_target_pairs(
           proto.source_target_pairs_size());
       for (int i = 0; i < source_target_pairs.size(); i++) {
         source_target_pairs[i].first = proto.source_target_pairs(i).source();
         source_target_pairs[i].second = proto.source_target_pairs(i).target();
       }
-      instruction = CreateCollectivePermute(proto.shape(), operands(0),
-                                            source_target_pairs);
+      instruction =
+          CreateCollectivePermute(shape, operands(0), source_target_pairs);
       break;
     }
     case HloOpcode::kConvolution: {
@@ -369,7 +382,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       precision_config.mutable_operand_precision()->Resize(
           proto.operand_ids_size(), PrecisionConfig::DEFAULT);
       instruction = CreateConvolve(
-          proto.shape(), operands(0), operands(1),
+          shape, operands(0), operands(1),
           std::max<int64>(proto.feature_group_count(), 1), proto.window(),
           proto.convolution_dimension_numbers(), precision_config);
       break;
@@ -381,7 +394,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "ReduceWindow should have 1 called computation but sees "
           << proto.called_computation_ids_size();
-      instruction = CreateReduceWindow(proto.shape(), operands(0), operands(1),
+      instruction = CreateReduceWindow(shape, operands(0), operands(1),
                                        proto.window(), computations(0));
       break;
     case HloOpcode::kSelectAndScatter:
@@ -391,14 +404,28 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.called_computation_ids_size() == 2)
           << "SelectAndScatter should have 2 called computations but sees "
           << proto.called_computation_ids_size();
-      instruction = CreateSelectAndScatter(
-          proto.shape(), operands(0), computations(0), proto.window(),
-          operands(1), operands(2), computations(1));
+      instruction = CreateSelectAndScatter(shape, operands(0), computations(0),
+                                           proto.window(), operands(1),
+                                           operands(2), computations(1));
       break;
     case HloOpcode::kCustomCall:
-      instruction = CreateCustomCall(proto.shape(), all_operands(),
-                                     proto.custom_call_target(),
-                                     proto.custom_call_opaque());
+      if (proto.constrain_layout()) {
+        // A proto RepeatedPtrField cannot be converted to a Span (it is a
+        // vector of pointers essentially) so create a vector of shapes to pass
+        // in.
+        std::vector<Shape> operand_shapes;
+        for (const ShapeProto& shape_proto :
+             proto.operand_shapes_with_layout()) {
+          operand_shapes.emplace_back(shape_proto);
+        }
+        instruction =
+            CreateCustomCall(shape, all_operands(), proto.custom_call_target(),
+                             operand_shapes, proto.custom_call_opaque());
+      } else {
+        instruction =
+            CreateCustomCall(shape, all_operands(), proto.custom_call_target(),
+                             proto.custom_call_opaque());
+      }
       if (proto.has_window()) {
         static_cast<HloCustomCallInstruction*>(instruction.get())
             ->set_window(proto.window());
@@ -417,8 +444,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Pad instruction should have 2 operands but sees "
           << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_padding_config());
-      instruction = CreatePad(proto.shape(), operands(0), operands(1),
-                              proto.padding_config());
+      instruction =
+          CreatePad(shape, operands(0), operands(1), proto.padding_config());
       break;
     case HloOpcode::kDynamicSlice: {
       TF_RET_CHECK(proto.operand_ids_size() == 2)
@@ -426,8 +453,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << proto.operand_ids_size();
       std::vector<int64> slice_sizes(proto.dynamic_slice_sizes_size());
       absl::c_copy(proto.dynamic_slice_sizes(), slice_sizes.begin());
-      instruction = CreateDynamicSlice(proto.shape(), operands(0), operands(1),
-                                       slice_sizes);
+      instruction =
+          CreateDynamicSlice(shape, operands(0), operands(1), slice_sizes);
       break;
     }
     case HloOpcode::kGather: {
@@ -443,7 +470,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       for (int64 bound : proto.gather_slice_sizes()) {
         gather_slice_sizes.push_back(bound);
       }
-      instruction = CreateGather(proto.shape(), operands(0), operands(1),
+      instruction = CreateGather(shape, operands(0), operands(1),
                                  *gather_dimension_numbers, gather_slice_sizes);
       break;
     }
@@ -459,16 +486,15 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       auto scatter_dimension_numbers =
           absl::make_unique<ScatterDimensionNumbers>(
               proto.scatter_dimension_numbers());
-      instruction =
-          CreateScatter(proto.shape(), operands(0), operands(1), operands(2),
-                        computations(0), *scatter_dimension_numbers);
+      instruction = CreateScatter(shape, operands(0), operands(1), operands(2),
+                                  computations(0), *scatter_dimension_numbers);
       break;
     }
     case HloOpcode::kIota:
       TF_RET_CHECK(proto.dimensions_size() == 1)
           << "Iota instruction should have 1 dimension but sees "
           << proto.dimensions_size();
-      instruction = CreateIota(proto.shape(), proto.dimensions(0));
+      instruction = CreateIota(shape, proto.dimensions(0));
       break;
     case HloOpcode::kDot: {
       TF_RET_CHECK(proto.has_dot_dimension_numbers())
@@ -480,33 +506,42 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       precision_config.mutable_operand_precision()->Resize(
           proto.operand_ids_size(), PrecisionConfig::DEFAULT);
       instruction = absl::make_unique<HloDotInstruction>(
-          proto.shape(), operands(0), operands(1),
-          proto.dot_dimension_numbers(), precision_config);
+          shape, operands(0), operands(1), proto.dot_dimension_numbers(),
+          precision_config);
       break;
     }
     case HloOpcode::kDomain: {
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "Domain instruction should have 1 operands but sees "
           << proto.operand_ids_size();
-      TF_RET_CHECK(proto.has_domain_entry_sharding())
-          << "Domain instruction must domain_entry_sharding";
-      TF_RET_CHECK(proto.has_domain_exit_sharding())
-          << "Domain instruction must domain_exit_sharding";
-      TF_ASSIGN_OR_RETURN(
-          HloSharding entry_hlo_sharding,
-          HloSharding::FromProto(proto.domain_entry_sharding()));
-      TF_ASSIGN_OR_RETURN(HloSharding exit_hlo_sharding,
-                          HloSharding::FromProto(proto.domain_exit_sharding()));
+      std::shared_ptr<const HloSharding> entry_hlo_sharding;
+      std::shared_ptr<const HloSharding> exit_hlo_sharding;
+      if (proto.has_domain_entry_sharding()) {
+        TF_ASSIGN_OR_RETURN(
+            HloSharding sharding,
+            HloSharding::FromProto(proto.domain_entry_sharding()));
+        entry_hlo_sharding = std::make_shared<const HloSharding>(sharding);
+      }
+      if (proto.has_domain_exit_sharding()) {
+        TF_ASSIGN_OR_RETURN(
+            HloSharding sharding,
+            HloSharding::FromProto(proto.domain_exit_sharding()));
+        exit_hlo_sharding = std::make_shared<const HloSharding>(sharding);
+      }
       instruction = absl::make_unique<HloDomainInstruction>(
-          proto.shape(), operands(0),
-          absl::make_unique<ShardingMetadata>(
-              std::make_shared<const HloSharding>(entry_hlo_sharding)),
-          absl::make_unique<ShardingMetadata>(
-              std::make_shared<const HloSharding>(exit_hlo_sharding)));
+          shape, operands(0),
+          absl::make_unique<ShardingMetadata>(entry_hlo_sharding),
+          absl::make_unique<ShardingMetadata>(exit_hlo_sharding));
       break;
     }
+    case HloOpcode::kGetDimensionSize:
+      TF_RET_CHECK(proto.operand_ids_size() == 1);
+      TF_RET_CHECK(proto.dimensions_size() == 1);
+      instruction =
+          CreateGetDimensionSize(shape, operands(0), proto.dimensions(0));
+      break;
     default: {
-      instruction = absl::WrapUnique(new HloInstruction(opcode, proto.shape()));
+      instruction = absl::WrapUnique(new HloInstruction(opcode, shape));
       for (const int64 operand_id : proto.operand_ids()) {
         instruction->AppendOperand(instruction_map.at(operand_id));
       }
@@ -820,6 +855,16 @@ HloInstruction::CreateCollectivePermute(
       new HloInstruction(HloOpcode::kAfterAll, ShapeUtil::MakeTokenShape()));
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateAddDependency(HloInstruction* data_operand,
+                                    HloInstruction* token_operand) {
+  auto instruction = absl::WrapUnique(
+      new HloInstruction(HloOpcode::kAddDependency, data_operand->shape()));
+  instruction->AppendOperand(data_operand);
+  instruction->AppendOperand(token_operand);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateWhile(
     const Shape& shape, HloComputation* condition, HloComputation* body,
     HloInstruction* init) {
@@ -976,6 +1021,14 @@ HloInstruction::CreateSelectAndScatter(
                                                     broadcast_dimensions);
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateGetDimensionSize(const Shape& shape,
+                                       HloInstruction* operand,
+                                       int64 dimension) {
+  return absl::make_unique<HloGetDimensionSizeInstruction>(shape, operand,
+                                                           dimension);
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateBroadcastSequence(
     const Shape& output_shape, HloInstruction* operand,
@@ -1055,7 +1108,7 @@ HloInstruction::CreateBroadcastSequence(
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSort(
     const Shape& shape, int64 dimension, HloInstruction* keys,
-    HloInstruction* values) {
+    absl::Span<HloInstruction* const> values) {
   return absl::make_unique<HloSortInstruction>(shape, dimension, keys, values);
 }
 
@@ -1084,7 +1137,11 @@ void HloInstruction::set_single_sharding(const HloSharding& sharding) {
 
 void HloInstruction::SetupDerivedInstruction(
     HloInstruction* derived_instruction) const {
-  if (sharding_ != nullptr) {
+  if (sharding_ != nullptr && ShapeUtil::CompatibleIgnoringElementType(
+                                  shape_, derived_instruction->shape())) {
+    // Only copy sharding if the shape of the two instruction is compatible
+    // because copying it between differently shaped instructions can produce
+    // invalid shardings.
     derived_instruction->set_sharding(*sharding_);
   } else {
     derived_instruction->clear_sharding();
@@ -1142,6 +1199,15 @@ bool HloInstruction::HasSideEffect() const {
       shape, operands, custom_call_target, opaque);
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateCustomCall(
+    const Shape& shape, absl::Span<HloInstruction* const> operands,
+    absl::string_view custom_call_target,
+    absl::Span<const Shape> operand_shapes_with_layout,
+    absl::string_view opaque) {
+  return absl::make_unique<HloCustomCallInstruction>(
+      shape, operands, custom_call_target, opaque, operand_shapes_with_layout);
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateTuple(
     absl::Span<HloInstruction* const> elements) {
   std::vector<Shape> element_shapes;
@@ -1234,6 +1300,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kIota:
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
+    case HloOpcode::kGetDimensionSize:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1337,6 +1404,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
         clone = CreateAfterAll(new_operands);
       }
       break;
+    case HloOpcode::kAddDependency:
+      CHECK_EQ(new_operands.size(), 2);
+      clone = CreateAddDependency(new_operands[0], new_operands[1]);
+      break;
   }
   // SetupDerivedInstruction will setup the precision_config_ field.
   SetupDerivedInstruction(clone.get());
@@ -1623,6 +1694,7 @@ bool HloInstruction::IdenticalSlowPath(
 
     // This opcode has complex or special behavior so just return false.
     case HloOpcode::kAfterAll:
+    case HloOpcode::kAddDependency:
       return false;
 
     // Remaining instructions with special values.
@@ -1681,12 +1753,33 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kScatter:
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
+    case HloOpcode::kGetDimensionSize:
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
   return false;
 }
 
+uint64 HloInstruction::Hash() const {
+  using tensorflow::Hash64Combine;
+
+  uint64 hash_value = Hash64Combine(0, static_cast<uint64>(opcode()));
+  hash_value = Hash64Combine(hash_value, ShapeUtil::Hash(shape()));
+
+  if (!IsCrossModuleAllReduce()) {
+    if (!operands().empty()) {
+      for (size_t i = 0; i < operands().size(); ++i) {
+        hash_value = Hash64Combine(hash_value, operand(i)->Hash());
+      }
+    }
+  }
+
+  hash_value = Hash64Combine(hash_value, InnerHash());
+  return hash_value;
+}
+
+uint64 HloInstruction::InnerHash() const { return 13; }
+
 void HloInstruction::RemoveUser(HloInstruction* user) {
   auto set_it = user_set_.find(user);
   CHECK(set_it != user_set_.end());
@@ -1842,6 +1935,11 @@ void HloInstruction::set_while_body(HloComputation* computation) {
   called_computations_[kBodyComputationIndex] = computation;
 }
 
+HloInstruction* HloInstruction::while_init() const {
+  CHECK_EQ(HloOpcode::kWhile, opcode_);
+  return operands_[0];
+}
+
 HloComputation* HloInstruction::true_computation() const {
   CHECK_EQ(HloOpcode::kConditional, opcode_);
   return called_computations_[kTrueComputationIndex];
@@ -2156,7 +2254,7 @@ HloInstructionProto HloInstruction::ToProto() const {
   proto.set_id(unique_id_);
   proto.set_name(name_);
   proto.set_opcode(HloOpcodeString(opcode_));
-  *proto.mutable_shape() = shape_;
+  *proto.mutable_shape() = shape_.ToProto();
   for (const HloInstruction* operand : operands_) {
     proto.add_operand_ids(operand->unique_id());
   }
@@ -2404,8 +2502,12 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleDomain(this);
     case HloOpcode::kAfterAll:
       return visitor->HandleAfterAll(this);
+    case HloOpcode::kAddDependency:
+      return visitor->HandleAddDependency(this);
     case HloOpcode::kIota:
       return visitor->HandleIota(this);
+    case HloOpcode::kGetDimensionSize:
+      return visitor->HandleGetDimensionSize(this);
 
     // These opcodes are not handled here.
     case HloOpcode::kTrace:
@@ -2563,36 +2665,6 @@ Status HloInstruction::AcceptWithOperandOrder(
   return Status::OK();
 }
 
-namespace {
-
-// Returns true if the given order is a topological sort of the instructions
-// it contains.
-bool OrderIsTopologicalSort(const std::vector<const HloInstruction*>& order) {
-  // Create a map from instruction to its position in 'order'.
-  std::unordered_map<const HloInstruction*, int> order_position;
-  for (int i = 0; i < order.size(); i++) {
-    if (!order_position.insert({order[i], i}).second) {
-      // Instruction order[i] is duplicated in the order.
-      return false;
-    }
-  }
-  // Verify that the operand of each instruction in the order is also in the
-  // order *and* the operand's position is earlier (defs are before uses for
-  // all ops).
-  for (auto* instruction : order) {
-    for (auto* operand : instruction->operands()) {
-      if (!ContainsKey(order_position, operand) ||
-          order_position.at(operand) >= order_position.at(instruction)) {
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-}  // namespace
-
 Status HloInstruction::Accept(
     const std::function<Status(HloInstruction*)>& visitor_func) {
   FunctionVisitor visitor(visitor_func);
@@ -2605,50 +2677,7 @@ Status HloInstruction::Accept(
   return this->Accept(&visitor);
 }
 
-Status HloInstruction::AcceptOrdered(
-    DfsHloVisitor* visitor, const std::vector<const HloInstruction*>& order) {
-  VLOG(2) << "HloInstruction::AcceptOrdered(%" << name() << ")";
-  TF_RET_CHECK(OrderIsTopologicalSort(order));
-
-  // Compute the predecessors of this instruction.
-  std::unordered_set<const HloInstruction*> predecessors;
-  TF_RETURN_IF_ERROR(this->Accept([&predecessors](HloInstruction* instruction) {
-    predecessors.insert(instruction);
-    return Status::OK();
-  }));
-
-  for (auto* const_instruction : order) {
-    if (!ContainsKey(predecessors, const_instruction)) {
-      // Instruction is not a predecessors of 'this'.
-      continue;
-    }
-
-    // The visitor can mark instructions as visited to skip particular
-    // instructions.
-    if (visitor->DidVisit(*const_instruction)) {
-      VLOG(3) << "Not visiting HLO %" << const_instruction->name()
-              << " as it was already visited.";
-      continue;
-    }
-
-    // TODO(b/78350259): Eliminate const laundering.
-    HloInstruction* instruction =
-        const_cast<HloInstruction*>(const_instruction);
-
-    TF_RETURN_IF_ERROR(visitor->Preprocess(instruction));
-    VLOG(2) << "Visiting HLO %" << instruction->name();
-    TF_RETURN_IF_ERROR(instruction->Visit(visitor));
-    visitor->SetVisited(*instruction);
-    TF_RETURN_IF_ERROR(visitor->Postprocess(instruction));
-  }
-
-  return visitor->FinishVisit(this);
-}
-
-const Shape& HloInstruction::shape() const {
-  TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape_));
-  return shape_;
-}
+const Shape& HloInstruction::shape() const { return shape_; }
 
 std::vector<int64> HloInstruction::OperandIndices(
     const HloInstruction* operand) const {
@@ -3005,6 +3034,16 @@ const PrecisionConfig& HloInstruction::precision_config() const {
   LOG(FATAL) << "Unimplemented method.";
 }
 
+PrecisionConfig* HloInstruction::mutable_precision_config() {
+  if (auto* convolution = DynCast<HloConvolutionInstruction>(this)) {
+    return convolution->mutable_precision_config();
+  }
+  if (auto* dot = DynCast<HloDotInstruction>(this)) {
+    return dot->mutable_precision_config();
+  }
+  LOG(FATAL) << "Unimplemented method.";
+}
+
 HloModule* HloInstruction::GetModule() const {
   if (parent_) {
     return parent_->parent();
@@ -3047,6 +3086,10 @@ int64 HloInstruction::concatenate_dimension() const {
   return Cast<HloConcatenateInstruction>(this)->concatenate_dimension();
 }
 
+int64 HloInstruction::dimension() const {
+  return Cast<HloGetDimensionSizeInstruction>(this)->dimension();
+}
+
 bool HloInstruction::IsRank2Transpose() const {
   auto transpose = DynCast<HloTransposeInstruction>(this);
   return transpose != nullptr && transpose->IsRank2Transpose();
@@ -3226,6 +3269,11 @@ absl::optional<int64> HloInstruction::all_reduce_id() const {
   return Cast<HloAllReduceInstruction>(this)->all_reduce_id();
 }
 
+void HloInstruction::set_all_reduce_id(
+    const absl::optional<int64>& all_reduce_id) {
+  return Cast<HloAllReduceInstruction>(this)->set_all_reduce_id(all_reduce_id);
+}
+
 const ConvolutionDimensionNumbers&
 HloInstruction::convolution_dimension_numbers() const {
   if (auto convolution = DynCast<HloConvolutionInstruction>(this)) {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 374862c4b672bf4cb7c6e3dbc60392a1018520b7..a54716217d6bbc5c0601f5d9ff7bf4072a6b30f5 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -28,11 +28,10 @@ limitations under the License.
 #include <set>
 #include <string>
 #include <tuple>
-#include <unordered_map>
-#include <unordered_set>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
@@ -464,7 +463,7 @@ class HloInstruction {
   // the same all_reduce_id, they will be 'Allreduce'd. If empty, Allreduce will
   // not be applied cross modules.
   //
-  // TODO(b/79737069): Rename this to AllReduce.
+  // TODO(b/117564385): Rename this to AllReduce.
   static std::unique_ptr<HloInstruction> CreateCrossReplicaSum(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       HloComputation* reduce_computation,
@@ -670,10 +669,10 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       absl::Span<const int64> dimensions);
 
-  // Creates a sort op, with a keys operand, and an optional values operand.
+  // Creates a sort op, with a keys operand, and optional values operands.
   static std::unique_ptr<HloInstruction> CreateSort(
       const Shape& shape, int64 dimension, HloInstruction* keys,
-      HloInstruction* values = nullptr);
+      absl::Span<HloInstruction* const> values = {});
 
   // Creates a while instruction, given a condition computation, a body
   // computation, and the initial value for the input of the computations. For
@@ -734,6 +733,16 @@ class HloInstruction {
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       absl::string_view custom_call_target, absl::string_view opaque = "");
 
+  // Overload which constrains the layouts of the operand and result. 'shape'
+  // and 'operand_shapes_with_layout' must have layouts.
+  // 'operand_shapes_with_layout' must have a compatible element for each
+  // operand.
+  static std::unique_ptr<HloInstruction> CreateCustomCall(
+      const Shape& shape, absl::Span<HloInstruction* const> operands,
+      absl::string_view custom_call_target,
+      absl::Span<const Shape> operand_shapes_with_layout,
+      absl::string_view opaque = "");
+
   // Creates a tuple instruction with the given elements. This is a convenience
   // wrapper around CreateVariadic.
   static std::unique_ptr<HloInstruction> CreateTuple(
@@ -758,6 +767,12 @@ class HloInstruction {
   // when we plumb a primordial token from the entry computation.
   static std::unique_ptr<HloInstruction> CreateToken();
 
+  static std::unique_ptr<HloInstruction> CreateGetDimensionSize(
+      const Shape& shape, HloInstruction* operand, int64 dimension);
+
+  static std::unique_ptr<HloInstruction> CreateAddDependency(
+      HloInstruction* data_operand, HloInstruction* token_operand);
+
   // Returns the opcode for this instruction.
   HloOpcode opcode() const { return opcode_; }
 
@@ -871,11 +886,15 @@ class HloInstruction {
       return false;
     }
 
-    // Use an explicit loop rather than ContainerEquals, because copying around
-    // std::functions may be too expensive in some cases.
-    for (size_t i = 0; i < operands().size(); ++i) {
-      if (!eq_operands(operand(i), other.operand(i))) {
-        return false;
+    // Two AllReduces are Identical if they have the same all_reduce_id.
+    // Their operands don't have to be Identical.
+    if (!IsCrossModuleAllReduce()) {
+      // Use an explicit loop rather than ContainerEquals, because copying
+      // around std::functions may be too expensive in some cases.
+      for (size_t i = 0; i < operands().size(); ++i) {
+        if (!eq_operands(operand(i), other.operand(i))) {
+          return false;
+        }
       }
     }
 
@@ -886,6 +905,12 @@ class HloInstruction {
     return IdenticalSlowPath(other, eq_computations);
   }
 
+  // Generates a hash value of an HLO instruction. Hash considers
+  // information on opcode, shape, operands, and typically a root instruction.
+  // This function returns the same hash value for equivalent HLO instructions,
+  // with respect to HloInstruction::Identical() method.
+  uint64 Hash() const;
+
   // Returns whether the instruction has a constant operand.
   bool HasConstantOperand() const;
 
@@ -945,16 +970,6 @@ class HloInstruction {
   Status Accept(
       const std::function<Status(const HloInstruction*)>& visitor_func) const;
 
-  // Visits all instructions rooted at this instruction using the given visitor
-  // in the given order. 'order' must contain at least the set of instructions
-  // rooted at this node (ie, those accessible from a DFS traversal from this
-  // instruction). Instructions contained in 'order' which are not in the set of
-  // instructions rooted at this node are ignored. 'order' must also be a valid
-  // topological sort of these instructions (defs appear before uses) though
-  // need not be a DFS post-order.
-  Status AcceptOrdered(DfsHloVisitor* visitor,
-                       const std::vector<const HloInstruction*>& order);
-
   // Visit this instruction and only this instruction with the given visitor.
   template <typename HloInstructionPtr>
   Status Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor);
@@ -995,6 +1010,8 @@ class HloInstruction {
   void set_while_condition(HloComputation* while_condition);
   void set_while_body(HloComputation* while_body);
 
+  HloInstruction* while_init() const;
+
   // Gets/sets the true and false HloComputation for Conditional. The setters
   // should only be called by HloModule or HloComputation methods.
   //
@@ -1255,6 +1272,7 @@ class HloInstruction {
   // superior.
   // Precondition: opcode must be kConvolution or kDot.
   const PrecisionConfig& precision_config() const;
+  PrecisionConfig* mutable_precision_config();
 
   // Sets the debug metadata for this instruction.
   void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; }
@@ -1315,6 +1333,9 @@ class HloInstruction {
   // Delegates to HloConcatenateInstruction::concatenate_dimension.
   int64 concatenate_dimension() const;
 
+  // Delegates to HloGetDimensionSizeInstruction::dimension.
+  int64 dimension() const;
+
   // Returns whether this instruction does a rank-2 transposition.
   bool IsRank2Transpose() const;
 
@@ -1433,6 +1454,7 @@ class HloInstruction {
 
   // Delegates to HloAllReduceInstruction::all_reduce_id.
   absl::optional<int64> all_reduce_id() const;
+  void set_all_reduce_id(const absl::optional<int64>& all_reduce_id);
 
   // Returns data on the window in a windowed operation such as
   // convolution.
@@ -1597,6 +1619,10 @@ class HloInstruction {
       const std::function<bool(const HloComputation*, const HloComputation*)>&
           eq_computations) const;
 
+  // Generates a hash value specific to a particular type of an instruction.
+  // This function typically considers the inner root instruction.
+  virtual uint64 InnerHash() const;
+
   // Creates an n-ary elementwise operation.
   static std::unique_ptr<HloInstruction> CreateNary(
       const Shape& shape, HloOpcode opcode,
@@ -1635,7 +1661,7 @@ class HloInstruction {
   // members. The set enables fast membership testing and the vector enables
   // fast, stable iteration.
   std::vector<HloInstruction*> users_;
-  std::unordered_set<const HloInstruction*> user_set_;
+  absl::flat_hash_set<const HloInstruction*> user_set_;
 
   // The set of control successors of this instruction.
   std::vector<HloInstruction*> control_successors_;
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index c1b7c3832b44b5d65b715dffa5211a5c92e17953..8048e332cb57747286758b75773b29ba154aa888 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 
@@ -39,7 +39,7 @@ namespace {
 using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
-class HloInstructionTest : public HloVerifiedTestBase {
+class HloInstructionTest : public HloTestBase {
  protected:
   Shape r0f32_ = ShapeUtil::MakeShape(F32, {});
 };
@@ -135,7 +135,8 @@ TEST_F(HloInstructionTest, BasicProperties) {
   auto parameter = HloInstruction::CreateParameter(1, r0f32_, "foo");
 
   EXPECT_EQ(HloOpcode::kParameter, parameter->opcode());
-  EXPECT_TRUE(ShapeUtil::IsScalarF32(parameter->shape()));
+  EXPECT_TRUE(ShapeUtil::IsScalarWithElementType(parameter->shape(), F32));
+  EXPECT_FALSE(ShapeUtil::IsScalarWithElementType(parameter->shape(), S32));
   EXPECT_EQ(0, parameter->operand_count());
 }
 
@@ -150,7 +151,7 @@ TEST_F(HloInstructionTest, UserWithTwoOperands) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, r0f32_, "bar"));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(add->operands(), UnorderedElementsAre(foo, bar));
@@ -187,7 +188,7 @@ TEST_F(HloInstructionTest, MultipleUsers) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, foo->user_count());
@@ -220,7 +221,7 @@ TEST_F(HloInstructionTest, RepeatedUser) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "foo"));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(1, foo->user_count());
@@ -255,7 +256,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperands) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c0, param1));
   auto addtotal = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   OpAndUserCollectingVisitor visitor;
@@ -304,7 +305,7 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperandsWithUnaryOps) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright));
   auto neg2 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, addtotal));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   OpAndUserCollectingVisitor visitor;
@@ -326,7 +327,7 @@ TEST_F(HloInstructionTest, TrivialMap) {
   //
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
   Shape f32a100x10 = ShapeUtil::MakeShape(F32, {100, 10});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   // Builds an x+1.0 computation to use in a Map.
   auto embedded_builder = HloComputation::Builder("f32+1");
@@ -374,7 +375,7 @@ TEST_F(HloInstructionTest, TrivialReduce) {
       HloInstruction::CreateParameter(1, r0f32, "y"));
   embedded_builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, paramx, paramy));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build());
 
   // Builds a parameter and an initial value and feeds them to the reduce.
@@ -415,7 +416,7 @@ TEST_F(HloInstructionTest, ReplaceUseInBinaryOps) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       add_foobar, add_foofoo));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
@@ -450,7 +451,7 @@ TEST_F(HloInstructionTest, ReplaceUseInVariadicOp) {
       builder.AddInstruction(HloInstruction::CreateTuple({foo, bar, baz, foo}));
   auto add_foobar = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
@@ -478,7 +479,7 @@ TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo));
   auto log = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
@@ -515,7 +516,7 @@ TEST_F(HloInstructionTest, ReplaceAllUsesWithInBinaryOps) {
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       add_foobar, add_foofoo));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(2, foo->user_count());
@@ -545,7 +546,7 @@ TEST_F(HloInstructionTest, ReplaceAllUsesInMultipleOps) {
   auto exp = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo));
   auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({foo, bar}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, foo->user_count());
@@ -610,7 +611,7 @@ TEST_F(HloInstructionTest, PostProcessAllVisitedNodes) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, exp, log));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   NodeCollectorAndPostProcessor visitor;
@@ -628,7 +629,7 @@ TEST_F(HloInstructionTest, SingletonFusionOp) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.1f)));
   auto exp = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp}, HloInstruction::FusionKind::kLoop);
@@ -646,7 +647,7 @@ TEST_F(HloInstructionTest, BinaryFusionOp) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.1f)));
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32_, HloOpcode::kAdd, constant1, constant2));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {add}, HloInstruction::FusionKind::kLoop);
@@ -668,7 +669,7 @@ TEST_F(HloInstructionTest, ChainFusionOp) {
   auto exp3 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp3, exp2, exp1}, HloInstruction::FusionKind::kLoop);
@@ -691,7 +692,7 @@ TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) {
   exp1->set_metadata(metadata);
   exp2->set_metadata(metadata);
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {exp2, exp1}, HloInstruction::FusionKind::kLoop);
@@ -748,7 +749,7 @@ TEST_F(HloInstructionTest, PreserveTupleShapeThroughClone) {
 TEST_F(HloInstructionTest, FusionOpWithCalledComputations) {
   // Create a fusion instruction containing a single unary operation.
   const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto make_map_computation = [&]() {
     auto builder = HloComputation::Builder("FusionMap");
@@ -816,7 +817,7 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({sub, sub, mul, c1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {tuple, sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop);
@@ -976,7 +977,7 @@ TEST_F(HloInstructionTest, FunctionVisitor) {
       HloInstruction::CreateUnary(f32, HloOpcode::kExp, param));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32, HloOpcode::kAdd, negate, exp));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   int visit_num = 0;
@@ -1005,7 +1006,7 @@ TEST_F(HloInstructionTest, FullyElementwise) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, x, y));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_TRUE(add->IsElementwise());
@@ -1015,7 +1016,7 @@ TEST_F(HloInstructionTest, FullyElementwise) {
 }
 
 TEST_F(HloInstructionTest, MapIsElementwise) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape r2f32 = ShapeUtil::MakeShapeWithLayout(F32, {10, 10}, {1, 0});
   HloComputation::Builder builder(TestName());
   HloComputation::Builder map_builder("id");
@@ -1066,7 +1067,7 @@ TEST_F(HloInstructionTest, PartiallyElementwise) {
   HloInstruction* max = builder.AddInstruction(
       HloInstruction::CreateBinary(r2f32, HloOpcode::kMaximum, div, broadcast));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {max, broadcast, div, mul}, HloInstruction::FusionKind::kLoop);
@@ -1107,7 +1108,7 @@ TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) {
   HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary(
       r1f32, HloOpcode::kSubtract, min, broadcast));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {sub, broadcast, min}, HloInstruction::FusionKind::kLoop);
@@ -1150,7 +1151,7 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
   HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
       sout, x, reshape, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kLoop);
@@ -1191,7 +1192,7 @@ TEST_F(HloInstructionTest, NoRedundantFusionOperandsAfterReplacingUse) {
   HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
       s, x, reshape, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kLoop);
@@ -1203,7 +1204,7 @@ TEST_F(HloInstructionTest, NoRedundantFusionOperandsAfterReplacingUse) {
 }
 
 TEST_F(HloInstructionTest, FusionEquality) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   // Create two fusion instructions containing a single unary operation.
@@ -1225,7 +1226,7 @@ TEST_F(HloInstructionTest, FusionEquality) {
 }
 
 TEST_F(HloInstructionTest, NestedFusionEquality) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   // Build a nested fusion computation.
@@ -1329,7 +1330,7 @@ TEST_F(HloInstructionTest, Stringification) {
             "%dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} "
             "%transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}");
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   HloInstruction* loop = builder.AddInstruction(
@@ -1372,7 +1373,7 @@ TEST_F(HloInstructionTest, StringifyGather_0) {
                                        /*index_vector_dim=*/4),
                                    /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gather_instruction->ToString(),
@@ -1407,7 +1408,7 @@ TEST_F(HloInstructionTest, StringifyGather_1) {
                                        /*index_vector_dim=*/2),
                                    /*slice_sizes=*/{30, 29, 28, 27, 26}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gather_instruction->ToString(),
@@ -1442,7 +1443,7 @@ TEST_F(HloInstructionTest, StringifyScatter) {
   update_builder.AddInstruction(
       HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "p2"));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* update_computation =
       module->AddEmbeddedComputation(update_builder.Build());
 
@@ -1494,7 +1495,7 @@ TEST_F(HloInstructionTest, CanonnicalStringificationFusion) {
             "f32[5,20]{1,0} dot(f32[5,10]{1,0}, f32[10,20]{1,0}), "
             "lhs_contracting_dims={1}, rhs_contracting_dims={0}");
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kLoop);
@@ -1530,7 +1531,7 @@ TEST_F(HloInstructionTest, CanonnicalStringificationWhile) {
   HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
       sout, x, reshape, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({dot, reshape},
                                        HloInstruction::FusionKind::kLoop);
@@ -1586,7 +1587,7 @@ TEST_F(HloInstructionTest, CanonnicalStringificationConditional) {
   HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
       sout, x, reshape, dot_dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({dot, reshape},
                                        HloInstruction::FusionKind::kLoop);
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 152d8eacdb591a31afcbbf7f9f01d51864c929f0..1ea02cf9c03866a598bec0e5356f0eb31ad27755 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -370,6 +370,11 @@ HloAllReduceInstruction::HloAllReduceInstruction(
   AppendComputation(reduce_computation);
 }
 
+void HloAllReduceInstruction::set_all_reduce_id(
+    const absl::optional<int64>& all_reduce_id) {
+  all_reduce_id_ = all_reduce_id;
+}
+
 HloInstructionProto HloAllReduceInstruction::ToProto() const {
   HloInstructionProto proto = HloCollectiveInstruction::ToProto();
   // Proto3 is so sad.
@@ -600,11 +605,11 @@ std::unique_ptr<HloInstruction> HloReduceInstruction::CloneWithNewOperandsImpl(
 
 HloSortInstruction::HloSortInstruction(const Shape& shape, int64 dimension,
                                        HloInstruction* keys,
-                                       HloInstruction* values)
+                                       absl::Span<HloInstruction* const> values)
     : HloInstruction(HloOpcode::kSort, shape), dimensions_({dimension}) {
   AppendOperand(keys);
-  if (values) {
-    AppendOperand(values);
+  for (auto* value : values) {
+    AppendOperand(value);
   }
 }
 
@@ -633,9 +638,8 @@ std::unique_ptr<HloInstruction> HloSortInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
   HloInstruction* keys = new_operands[0];
-  HloInstruction* values = new_operands.size() == 2 ? new_operands[1] : nullptr;
   return absl::make_unique<HloSortInstruction>(shape, dimensions(0), keys,
-                                               values);
+                                               new_operands.subspan(1));
 }
 
 HloTransposeInstruction::HloTransposeInstruction(
@@ -1368,6 +1372,10 @@ bool HloFusionInstruction::IdenticalSlowPath(
                          other.fused_instructions_computation());
 }
 
+uint64 HloFusionInstruction::InnerHash() const {
+  return fused_instructions_computation()->Hash();
+}
+
 std::unique_ptr<HloInstruction> HloFusionInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
@@ -1611,7 +1619,7 @@ HloOutfeedInstruction::HloOutfeedInstruction(const Shape& outfeed_shape,
 HloInstructionProto HloOutfeedInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_outfeed_config(outfeed_config());
-  *proto.mutable_outfeed_shape() = outfeed_shape();
+  *proto.mutable_outfeed_shape() = outfeed_shape().ToProto();
   return proto;
 }
 
@@ -1825,7 +1833,24 @@ HloCustomCallInstruction::HloCustomCallInstruction(
     : HloInstruction(HloOpcode::kCustomCall, shape),
       custom_call_target_(custom_call_target.begin(), custom_call_target.end()),
       opaque_(opaque.begin(), opaque.end()),
-      feature_group_count_(1) {
+      feature_group_count_(1),
+      layout_constrained_(false) {
+  for (auto operand : operands) {
+    AppendOperand(operand);
+  }
+}
+
+HloCustomCallInstruction::HloCustomCallInstruction(
+    const Shape& shape, absl::Span<HloInstruction* const> operands,
+    absl::string_view custom_call_target, absl::string_view opaque,
+    absl::Span<const Shape> operand_shapes_with_layout)
+    : HloInstruction(HloOpcode::kCustomCall, shape),
+      custom_call_target_(custom_call_target.begin(), custom_call_target.end()),
+      opaque_(opaque.begin(), opaque.end()),
+      feature_group_count_(1),
+      layout_constrained_(true),
+      operand_shapes_with_layout_(operand_shapes_with_layout.begin(),
+                                  operand_shapes_with_layout.end()) {
   for (auto operand : operands) {
     AppendOperand(operand);
   }
@@ -1843,6 +1868,12 @@ HloInstructionProto HloCustomCallInstruction::ToProto() const {
   proto.set_custom_call_target(custom_call_target_);
   proto.set_custom_call_opaque(opaque_);
   proto.set_feature_group_count(feature_group_count_);
+  if (layout_constrained()) {
+    proto.set_constrain_layout(true);
+    for (const Shape& shape : operand_shapes_with_layout_) {
+      *proto.add_operand_shapes_with_layout() = shape.ToProto();
+    }
+  }
   return proto;
 }
 
@@ -1870,6 +1901,14 @@ std::vector<string> HloCustomCallInstruction::ExtraAttributesToStringImpl(
   if (!opaque_.empty()) {
     extra.push_back(StrCat("opaque=\"", CEscape(opaque_), "\""));
   }
+  if (layout_constrained()) {
+    std::vector<string> shape_strings;
+    for (const Shape& shape : operand_shapes_with_layout_) {
+      shape_strings.push_back(ShapeUtil::HumanStringWithLayout(shape));
+    }
+    extra.push_back(StrCat("operand_layout_constraints={",
+                           StrJoin(shape_strings, ", "), "}"));
+  }
   return extra;
 }
 
@@ -2305,18 +2344,57 @@ HloInstructionProto HloDomainInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   auto operand_side_sharding =
       dynamic_cast<const ShardingMetadata*>(operand_side_metadata_.get());
-  if (operand_side_sharding) {
+  if (operand_side_sharding && operand_side_sharding->sharding() != nullptr) {
     *proto.mutable_domain_entry_sharding() =
         operand_side_sharding->sharding()->ToProto();
   }
 
   auto user_side_sharding =
       dynamic_cast<const ShardingMetadata*>(user_side_metadata_.get());
-  if (user_side_sharding) {
+  if (user_side_sharding && user_side_sharding->sharding() != nullptr) {
     *proto.mutable_domain_exit_sharding() =
         user_side_sharding->sharding()->ToProto();
   }
 
   return proto;
 }
+
+HloGetDimensionSizeInstruction::HloGetDimensionSizeInstruction(
+    const Shape& shape, HloInstruction* operand, int64 dimension)
+    : HloInstruction(HloOpcode::kGetDimensionSize, shape),
+      dimension_(dimension) {
+  AppendOperand(operand);
+}
+
+HloInstructionProto HloGetDimensionSizeInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  proto.add_dimensions(dimension());
+  return proto;
+}
+
+std::vector<string> HloGetDimensionSizeInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& /*options*/) const {
+  return {StrCat("dimensions={", dimension(), "}")};
+}
+
+bool HloGetDimensionSizeInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+    /*eq_computations*/) const {
+  const auto& casted_other =
+      static_cast<const HloGetDimensionSizeInstruction&>(other);
+  return dimension() == casted_other.dimension();
+}
+
+std::unique_ptr<HloInstruction>
+HloGetDimensionSizeInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+    HloCloneContext* /*context*/) const {
+  if (new_operands.size() != 1) {
+    LOG(FATAL) << "expects 1 operand";
+  }
+  return absl::make_unique<HloGetDimensionSizeInstruction>(
+      shape, new_operands[0], dimension());
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index e169604072a6d23c5e601fcbe00b7a7bf37a933d..b5c28137a145667a977d39c9d3c40c6d36a8436e 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -252,6 +252,7 @@ class HloAllReduceInstruction : public HloCollectiveInstruction {
   }
 
   absl::optional<int64> all_reduce_id() const { return all_reduce_id_; }
+  void set_all_reduce_id(const absl::optional<int64>& all_reduce_id);
 
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
@@ -418,14 +419,19 @@ class HloSortInstruction : public HloInstruction {
  public:
   explicit HloSortInstruction(const Shape& shape, int64 dimension,
                               HloInstruction* keys,
-                              HloInstruction* values = nullptr);
+                              absl::Span<HloInstruction* const> values = {});
   // Returns the dimension sizes or numbers associated with this instruction.
   const std::vector<int64>& dimensions() const override { return dimensions_; }
   int64 dimensions(int64 index) const override { return dimensions()[index]; }
   // Returns the sort dimension for this instruction
-  int64 sort_dimension() { return dimensions(0); }
+  int64 sort_dimension() const { return dimensions(0); }
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
+  // Returns the key operand to this instruction.
+  const HloInstruction* keys() const { return operand(0); }
+  HloInstruction* mutable_keys() { return mutable_operand(0); }
+  // Returns the number of value operands.
+  int64 values_count() const { return operand_count() - 1; }
 
  private:
   std::vector<string> ExtraAttributesToStringImpl(
@@ -737,6 +743,8 @@ class HloFusionInstruction : public HloInstruction {
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
           eq_computations) const override;
+  uint64 InnerHash() const override;
+
   // Implementation for non-common logic of CloneWithNewOperands.
   std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
       const Shape& shape, absl::Span<HloInstruction* const> new_operands,
@@ -949,6 +957,7 @@ class HloConvolutionInstruction : public HloInstruction {
   // information but it is presumed that the alternate lowering is strictly
   // superior.
   const PrecisionConfig& precision_config() const { return precision_config_; }
+  PrecisionConfig* mutable_precision_config() { return &precision_config_; }
 
   string ToCategory() const override;
   // Returns a serialized representation of this instruction.
@@ -1053,10 +1062,19 @@ class HloSelectAndScatterInstruction : public HloInstruction {
 
 class HloCustomCallInstruction : public HloInstruction {
  public:
-  explicit HloCustomCallInstruction(const Shape& shape,
-                                    absl::Span<HloInstruction* const> operands,
-                                    absl::string_view custom_call_target,
-                                    absl::string_view opaque);
+  HloCustomCallInstruction(const Shape& shape,
+                           absl::Span<HloInstruction* const> operands,
+                           absl::string_view custom_call_target,
+                           absl::string_view opaque);
+
+  // Constructor for a custom call with constrained layout. 'shape' and
+  // 'operands_with_layout' must all have layouts.
+  HloCustomCallInstruction(const Shape& shape,
+                           absl::Span<HloInstruction* const> operands,
+                           absl::string_view custom_call_target,
+                           absl::string_view opaque,
+                           absl::Span<const Shape> operand_shapes_with_layout);
+
   const Window& window() const override {
     CHECK(window_ != nullptr);
     return *window_;
@@ -1085,6 +1103,16 @@ class HloCustomCallInstruction : public HloInstruction {
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
+  // Returns whether the result and operand layouts are constrained.
+  bool layout_constrained() const { return layout_constrained_; }
+
+  // Returns the shapes (with layout) of the operands. CHECKs if this custom
+  // call does not have constrained layouts.
+  const std::vector<Shape>& operand_shapes_with_layout() const {
+    CHECK(layout_constrained());
+    return operand_shapes_with_layout_;
+  }
+
  private:
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
@@ -1106,6 +1134,11 @@ class HloCustomCallInstruction : public HloInstruction {
   std::unique_ptr<ConvolutionDimensionNumbers> convolution_dimension_numbers_;
   // The number of feature groups. This is used for grouped convolutions.
   int64 feature_group_count_;
+  // Whether the result and operand layouts are constrained.
+  bool layout_constrained_;
+  // For layout-constrained custom calls, this vector holds the shape with
+  // layout for each operand.
+  std::vector<Shape> operand_shapes_with_layout_;
 };
 
 class HloPadInstruction : public HloInstruction {
@@ -1115,6 +1148,9 @@ class HloPadInstruction : public HloInstruction {
                              const PaddingConfig& padding_config);
   // Returns the padding configuration for a pad node.
   const PaddingConfig& padding_config() const { return padding_config_; }
+  // Returns the padding value.
+  const HloInstruction* padding_value() const { return operand(1); }
+  HloInstruction* mutable_padding_value() { return mutable_operand(1); }
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
@@ -1293,6 +1329,7 @@ class HloDotInstruction : public HloInstruction {
   // information but it is presumed that the alternate lowering is strictly
   // superior.
   const PrecisionConfig& precision_config() const { return precision_config_; }
+  PrecisionConfig* mutable_precision_config() { return &precision_config_; }
 
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
@@ -1353,6 +1390,33 @@ class HloDomainInstruction : public HloInstruction {
   std::unique_ptr<DomainMetadata> operand_side_metadata_;
   std::unique_ptr<DomainMetadata> user_side_metadata_;
 };
+
+class HloGetDimensionSizeInstruction : public HloInstruction {
+ public:
+  explicit HloGetDimensionSizeInstruction(const Shape& shape,
+                                          HloInstruction* operand,
+                                          int64 dimension);
+
+  // Returns the dimension sizes or numbers associated with this instruction.
+  int64 dimension() const { return dimension_; }
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+      HloCloneContext* context) const override;
+
+  int64 dimension_;
+};
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
index d9be841dd751651ba029998fd062fcaec3691945..1390537101e95a08e4ba4eef7ae8d6059a40e916 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -163,6 +163,9 @@ TokKind HloLexer::LexToken() {
               current_ptr_ = comment_start;
               return TokKind::kError;
             }
+            if (current == kError) {
+              return TokKind::kError;
+            }
           }
           // Return no token for the comment. Keep lexing.
           continue;
@@ -177,6 +180,9 @@ TokKind HloLexer::LexToken() {
             if (current == kEOF || current == '\n' || current == '\r') {
               break;
             }
+            if (current == kError) {
+              return TokKind::kError;
+            }
             current_ptr_++;
           }
           continue;
@@ -204,7 +210,7 @@ TokKind HloLexer::LexIdentifier() {
     auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
     // 'consumable' will be advanced iff its prefix matches the pattern.
     static LazyRE2 shape_pattern = {
-        R"(^(\w*\d*)\[([\d,]*)\](?:(dense|sparse)?{([\d,]+)})?)"};
+        R"(^(\w*\d*)\[([\d,\s]*)\](?:(dense|sparse)?{([\d,\s]+)})?)"};
     if (RE2::Consume(&consumable, *shape_pattern)) {
       auto status_or_shape = ShapeUtil::ParseShapeString(
           StringPieceFromPointers(token_start_, consumable.begin()));
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
index 3e2f8bcd52f9043f161197756a2060b28dded1d9..d6a2b292a3916b2ff85f278cf5cb9f1567df88fa 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_token.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index 5269cad94d35be3dd1c009588bbe422ff1533364..d28e79d41ad5d58a8881cfb80d488684af26564f 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -237,8 +237,4 @@ void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
   *os << (inst ? inst->ToString() : "nullptr");
 }
 
-void PrintTo(HloInstruction* inst, ::std::ostream* os) {
-  PrintTo(const_cast<const HloInstruction*>(inst), os);
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 5502e565b6dfbaca6cfa2101950fb0a68c89771f..235efb19ce4ed28a5cd9fe5ca52ae5d8e9e5ba3d 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -165,6 +165,7 @@ namespace opcode_matchers {
   }
 HLO_MATCHER(Abs);
 HLO_MATCHER(Add);
+HLO_MATCHER(AllToAll);
 HLO_MATCHER(Bitcast);
 HLO_MATCHER(Broadcast);
 HLO_MATCHER(BatchNormGrad);
@@ -178,7 +179,9 @@ HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
 HLO_MATCHER(Copy);
 HLO_MATCHER(CrossReplicaSum);
+HLO_MATCHER(CollectivePermute);
 HLO_MATCHER(Divide);
+HLO_MATCHER(Domain);
 HLO_MATCHER(DynamicSlice);
 HLO_MATCHER(DynamicUpdateSlice);
 HLO_MATCHER(Eq);
@@ -216,6 +219,7 @@ HLO_MATCHER(Remainder);
 HLO_MATCHER(Reshape);
 HLO_MATCHER(Reverse);
 HLO_MATCHER(Rng);
+HLO_MATCHER(Scatter);
 HLO_MATCHER(Select);
 HLO_MATCHER(SelectAndScatter);
 HLO_MATCHER(Send);
@@ -381,7 +385,6 @@ std::vector<const HloInstruction*> Pointers(const Container& container) {
 // Tell GMock to print HloInstruction* by value, so error messages are nice.
 // Has to be in the same namespace as 'HloInstruction'.
 void PrintTo(const HloInstruction* inst, ::std::ostream* os);
-void PrintTo(HloInstruction* inst, ::std::ostream* os);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
index 5cee865b7ad34eded1743d9d5455bb40febf6182..d2740bcce26f04c5d7c8b64cfdaea53e3c697855 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
@@ -73,7 +73,7 @@ class ListScheduler {
   // Construct and return a memory-minimizing sequence of HLO instructions
   // containing the given HLO computation.
   static StatusOr<HloInstructionSequence> Run(
-      const HloComputation& computation,
+      HloComputation* computation,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_function,
       const absl::flat_hash_map<const HloComputation*, int64>&
@@ -98,7 +98,7 @@ class ListScheduler {
   // comparison operators.
   using Priority = std::pair<int64, int64>;
 
-  ListScheduler(const HloComputation& computation,
+  ListScheduler(HloComputation* computation,
                 const TuplePointsToAnalysis& points_to_analysis,
                 const LogicalBuffer::SizeFunction& size_function,
                 const absl::flat_hash_map<const HloComputation*, int64>&
@@ -111,7 +111,7 @@ class ListScheduler {
     // instruction. An HLO instruction "uses" a LogicalBuffer if the
     // LogicalBuffer is in an operand of the instruction as indicated by
     // points-to analysis.
-    for (auto* instruction : computation.instructions()) {
+    for (auto* instruction : computation->instructions()) {
       absl::flat_hash_set<const LogicalBuffer*> instr_uses;
       for (auto* operand : instruction->operands()) {
         points_to_analysis.GetPointsToSet(operand).ForEachElement(
@@ -126,13 +126,13 @@ class ListScheduler {
 
     // Create map containing the number of unscheduled uses (hlo instructions)
     // of each logical buffer.
-    for (auto* instruction : computation.instructions()) {
+    for (auto* instruction : computation->instructions()) {
       for (auto* buffer :
            points_to_analysis.GetBuffersDefinedByInstruction(instruction)) {
         unscheduled_use_count_[buffer] = 0;
       }
     }
-    for (auto* instruction : computation.instructions()) {
+    for (auto* instruction : computation->instructions()) {
       for (const LogicalBuffer* buffer : buffer_uses_.at(instruction)) {
         ++unscheduled_use_count_[buffer];
       }
@@ -141,7 +141,7 @@ class ListScheduler {
     // Buffers live out of the computation have an implicit use at the end of
     // the computation.
     for (const LogicalBuffer* live_out_buffer :
-         points_to_analysis.GetPointsToSet(computation.root_instruction())
+         points_to_analysis.GetPointsToSet(computation->root_instruction())
              .CreateFlattenedSet()) {
       ++unscheduled_use_count_[live_out_buffer];
     }
@@ -157,7 +157,7 @@ class ListScheduler {
   // HloInstruction, plus some cached metadata, saved for the purposes of making
   // BytesFreedIfScheduled fast.
   struct ReadyListEntry {
-    const HloInstruction* instruction;
+    HloInstruction* instruction;
 
     // The total size of all buffers defined by this instruction.
     int64 bytes_defined;
@@ -171,7 +171,7 @@ class ListScheduler {
   };
 
   // Creates a ReadyListEntry for the given instruction.
-  ReadyListEntry MakeReadyListEntry(const HloInstruction* instruction) {
+  ReadyListEntry MakeReadyListEntry(HloInstruction* instruction) {
     ReadyListEntry entry;
     entry.instruction = instruction;
 
@@ -250,13 +250,13 @@ class ListScheduler {
     // Populate the ready list with instructions which have no operands or
     // control predecessors.
     absl::flat_hash_map<const HloInstruction*, int64> unscheduled_pred_count;
-    for (auto* instruction : computation_.instructions()) {
+    for (auto* instruction : computation_->instructions()) {
       // TODO(b/34466113): Replace this and above with successors() or
       // predecessors() when these methods are added to HloInstruction.
-      for (const HloInstruction* user : instruction->users()) {
+      for (HloInstruction* user : instruction->users()) {
         unscheduled_pred_count[user]++;
       }
-      for (const HloInstruction* succ : instruction->control_successors()) {
+      for (HloInstruction* succ : instruction->control_successors()) {
         unscheduled_pred_count[succ]++;
       }
     }
@@ -275,7 +275,7 @@ class ListScheduler {
       ready_instructions[inst] = it;
     };
 
-    for (auto* instruction : computation_.instructions()) {
+    for (auto* instruction : computation_->instructions()) {
       if (instruction->operands().empty() &&
           instruction->control_predecessors().empty()) {
         add_to_ready_queue(instruction);
@@ -287,7 +287,7 @@ class ListScheduler {
       // schedule.
       auto best_it = ready_queue.end();
       --best_it;
-      const HloInstruction* best = best_it->second.instruction;
+      HloInstruction* best = best_it->second.instruction;
       VLOG(2) << "Schedule instruction: " << best->ToShortString()
               << " Bytes freed: " << best_it->first.first;
       ready_queue.erase(best_it);
@@ -348,13 +348,13 @@ class ListScheduler {
         }
       }
     }
-    CHECK_EQ(schedule.size(), computation_.instruction_count());
-    CHECK_EQ(scheduled_instructions_.size(), computation_.instruction_count());
+    CHECK_EQ(schedule.size(), computation_->instruction_count());
+    CHECK_EQ(scheduled_instructions_.size(), computation_->instruction_count());
 
     return schedule;
   }
 
-  const HloComputation& computation_;
+  HloComputation* computation_;
   const TuplePointsToAnalysis& points_to_analysis_;
   const LogicalBuffer::SizeFunction& size_function_;
   // Computations are analyzed in post-order. When scheduling an instruction
@@ -386,13 +386,13 @@ int64 SumLogicalBufferSizes(
 }
 
 StatusOr<HloInstructionSequence> ScheduleComputationHelper(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
-  VLOG(2) << "Computation: " << computation.name();
+  VLOG(2) << "Computation: " << computation->name();
   if (algorithm) {
     return algorithm(computation, points_to_analysis, size_function,
                      memory_by_computation);
@@ -404,17 +404,17 @@ StatusOr<HloInstructionSequence> ScheduleComputationHelper(
 }  // namespace
 
 StatusOr<HloInstructionSequence> DFSMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   // These variables are a hack to prevent overflows.
   int64 cumulative_total_size = 0;
-  int64 total_hlos = computation.parent()->instruction_count();
+  int64 total_hlos = computation->parent()->instruction_count();
   absl::flat_hash_map<const HloInstruction*, int64> extra_users;
   absl::flat_hash_map<const HloInstruction*, int64> total_sizes;
-  for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) {
+  for (const HloInstruction* hlo : computation->MakeInstructionPostOrder()) {
     if (ListScheduler::IgnoreInstruction(*hlo)) {
       extra_users[hlo] = 0;
       total_sizes[hlo] = 0;
@@ -448,8 +448,8 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
     total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size);
     extra_users[hlo] = std::min(extra_users[hlo], total_hlos);
   }
-  CHECK_EQ(extra_users.size(), computation.instruction_count());
-  CHECK_EQ(total_sizes.size(), computation.instruction_count());
+  CHECK_EQ(extra_users.size(), computation->instruction_count());
+  CHECK_EQ(total_sizes.size(), computation->instruction_count());
 
   // Construct a total order based on DFS post-order, visiting operands in
   // decreasing cumulative extra user order, and next by cumulative size, with a
@@ -459,7 +459,7 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
     sequence.push_back(hlo);
     return Status::OK();
   });
-  TF_RETURN_IF_ERROR(computation.AcceptWithOperandOrder(
+  TF_RETURN_IF_ERROR(computation->AcceptWithOperandOrder(
       &visitor, [&extra_users, &total_sizes](const HloInstruction* a,
                                              const HloInstruction* b) {
         if (extra_users[a] != extra_users[b]) {
@@ -470,12 +470,12 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
         }
         return a->name() < b->name();
       }));
-  CHECK_EQ(sequence.size(), computation.instruction_count());
+  CHECK_EQ(sequence.size(), computation->instruction_count());
   return sequence;
 }  // namespace xla
 
 StatusOr<HloInstructionSequence> ListMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -485,16 +485,16 @@ StatusOr<HloInstructionSequence> ListMemoryScheduler(
 }
 
 StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
-  return HloInstructionSequence(computation.MakeInstructionPostOrder());
+  return HloInstructionSequence(computation->MakeInstructionPostOrder());
 }
 
 StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -513,7 +513,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
                           memory_by_computation));
   TF_ASSIGN_OR_RETURN(const int64 list_memory,
                       HeapSimulator::MinimumMemoryForComputation(
-                          computation, list_sequence, points_to_analysis,
+                          *computation, list_sequence, points_to_analysis,
                           size_function, &memory_by_computation));
   VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory);
 
@@ -522,7 +522,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
                                          size_function, memory_by_computation));
   TF_ASSIGN_OR_RETURN(const int64 dfs_memory,
                       HeapSimulator::MinimumMemoryForComputation(
-                          computation, dfs_sequence, points_to_analysis,
+                          *computation, dfs_sequence, points_to_analysis,
                           size_function, &memory_by_computation));
   VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory);
 
@@ -532,7 +532,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
                                memory_by_computation));
   TF_ASSIGN_OR_RETURN(const int64 post_order_memory,
                       HeapSimulator::MinimumMemoryForComputation(
-                          computation, post_order_sequence, points_to_analysis,
+                          *computation, post_order_sequence, points_to_analysis,
                           size_function, &memory_by_computation));
   VLOG(2) << "Min-memory post order sequence: "
           << HumanReadableNumBytes(post_order_memory);
@@ -555,17 +555,17 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
 }
 
 StatusOr<HloSchedule> ScheduleModule(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
+    HloModule* module, const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm) {
-  HloSchedule schedule(&module);
+  HloSchedule schedule(module);
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
-                      TuplePointsToAnalysis::Run(&module));
+                      TuplePointsToAnalysis::Run(module));
   absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
-  for (const auto* computation : module.MakeComputationPostOrder()) {
+  for (auto* computation : module->MakeComputationPostOrder()) {
     if (!computation->IsFusionComputation()) {
       TF_ASSIGN_OR_RETURN(HloInstructionSequence computation_sequence,
                           ScheduleComputationHelper(
-                              *computation, *points_to_analysis, size_function,
+                              computation, *points_to_analysis, size_function,
                               algorithm, memory_by_computation));
       memory_by_computation[computation] =
           HeapSimulator::MinimumMemoryForComputation(
@@ -583,11 +583,11 @@ StatusOr<HloSchedule> ScheduleModule(
 }
 
 StatusOr<HloInstructionSequence> ScheduleComputation(
-    const HloComputation& computation,
+    HloComputation* computation,
     const LogicalBuffer::SizeFunction& size_function) {
-  CHECK(!computation.IsFusionComputation());
+  CHECK(!computation->IsFusionComputation());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
-                      TuplePointsToAnalysis::Run(computation.parent()));
+                      TuplePointsToAnalysis::Run(computation->parent()));
   absl::flat_hash_map<const HloComputation*, int64> empty_map;
   return ScheduleComputationHelper(computation, *points_to_analysis,
                                    size_function, nullptr, empty_map);
@@ -600,7 +600,24 @@ HloMemoryScheduler::HloMemoryScheduler(
 
 StatusOr<bool> HloMemoryScheduler::Run(HloModule* module) {
   TF_ASSIGN_OR_RETURN(HloSchedule schedule,
-                      ScheduleModule(*module, size_function_, algorithm_));
+                      ScheduleModule(module, size_function_, algorithm_));
+  TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule)));
+  return true;
+}
+
+StatusOr<bool> HloTrivialScheduler::Run(HloModule* module) {
+  HloSchedule schedule(module);
+  for (HloComputation* computation : module->MakeComputationPostOrder()) {
+    if (!computation->IsFusionComputation()) {
+      HloInstructionSequence& computation_sequence =
+          schedule.GetOrCreateSequence(computation);
+      TF_RETURN_IF_ERROR(computation->Accept(
+          [&computation_sequence](HloInstruction* instruction) {
+            computation_sequence.push_back(instruction);
+            return Status::OK();
+          }));
+    }
+  }
   TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule)));
   return true;
 }
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
index a4c1d3db8170a1725043def576f913e09b352e5d..7227bfb27c74758d2b79e404afc9eb97a1ca894d 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
@@ -36,14 +36,14 @@ namespace xla {
 // that describes buffer aliasing, together with a target-specific size function
 // that maps a tensor's logical size to its padded size.
 typedef std::function<StatusOr<HloInstructionSequence>(
-    const HloComputation&, const TuplePointsToAnalysis&,
+    HloComputation*, const TuplePointsToAnalysis&,
     const LogicalBuffer::SizeFunction&,
     const absl::flat_hash_map<const HloComputation*, int64>&)>
     MemorySchedulerAlgorithm;
 
 // List scheduler
 StatusOr<HloInstructionSequence> ListMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -51,7 +51,7 @@ StatusOr<HloInstructionSequence> ListMemoryScheduler(
 
 // DFS-order scheduler
 StatusOr<HloInstructionSequence> DFSMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -59,7 +59,7 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
 
 // Naive Post Order scheduler
 StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -69,7 +69,7 @@ StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
 // and the DFS scheduler, and chooses whichever returns a lower min-memory,
 // not accounting for fragmentation.
 StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -79,13 +79,13 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
 // the computation. size_function is the function returning the number of bytes
 // required for a LogicalBuffer.
 StatusOr<HloSchedule> ScheduleModule(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
+    HloModule* module, const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm = {});
 
 // Computes the schedule for a single computation.
 // Currently only used by the GPU backend.
 StatusOr<HloInstructionSequence> ScheduleComputation(
-    const HloComputation& computation,
+    HloComputation* computation,
     const LogicalBuffer::SizeFunction& size_function);
 
 // A pass which schedules the HLO instructions in a module. The HloModule's
@@ -108,6 +108,15 @@ class HloMemoryScheduler : public HloModulePass {
   MemorySchedulerAlgorithm algorithm_;
 };
 
+// A pass which produces a naive, but correct schedule. The schedule is produced
+// using a DFS traversal of the graph with no attempt to minimize memory use.
+class HloTrivialScheduler : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "hlo-trivial-scheduler"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
 // A trivial pass which clears the schedule currently set on the
 // HloModule. After this pass runs HloModudle::has_schedule will return false.
 class HloDescheduler : public HloModulePass {
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
index 214119fba881c4411a262cd4227b5cc49cef0d14..bc0d7e2bc00eab014f2660c95a51b966642eaee9 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
@@ -65,7 +65,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) {
   auto sub = builder.AddInstruction(
       HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   HloMemoryScheduler scheduler([](const BufferValue& buffer) {
@@ -78,7 +78,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) {
   TF_ASSERT_OK(module->schedule().Verify());
 
   // Verify that all instructions are in the sequence.
-  const std::vector<const HloInstruction*>& sequence =
+  const std::vector<HloInstruction*>& sequence =
       module->schedule().sequence(module->entry_computation()).instructions();
   EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size());
 
@@ -124,9 +124,9 @@ ENTRY root {
   };
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, size_fn, ListMemoryScheduler));
+      ScheduleModule(module.get(), size_fn, ListMemoryScheduler));
   // Verify that all instructions are in the sequence.
-  const std::vector<const HloInstruction*>& sequence =
+  const std::vector<HloInstruction*>& sequence =
       schedule.sequence(module->entry_computation()).instructions();
   EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size());
 
@@ -172,15 +172,16 @@ TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd,
                                                       tuple_elm, abs_abs2));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule,
-                          ScheduleModule(*module,
-                                         [](const BufferValue& buffer) {
-                                           return ShapeUtil::ByteSizeOf(
-                                               buffer.shape(), TUPLE_SIZE);
-                                         },
-                                         ListMemoryScheduler));
+                          ScheduleModule(
+                              module.get(),
+                              [](const BufferValue& buffer) {
+                                return ShapeUtil::ByteSizeOf(buffer.shape(),
+                                                             TUPLE_SIZE);
+                              },
+                              ListMemoryScheduler));
 
   // Verify that all instructions are in the sequence.
   EXPECT_EQ(module->entry_computation()->instruction_count(),
@@ -218,19 +219,19 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, tuple_elm, exp));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto fusion = computation->CreateFusionInstruction(
       {tuple, mul, add}, HloInstruction::FusionKind::kLoop);
 
   TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule,
-                          ScheduleModule(*module,
-                                         [](const BufferValue& buffer) {
-                                           return ShapeUtil::ByteSizeOf(
-                                               buffer.shape(), 2);
-                                         },
-                                         ListMemoryScheduler));
+                          ScheduleModule(
+                              module.get(),
+                              [](const BufferValue& buffer) {
+                                return ShapeUtil::ByteSizeOf(buffer.shape(), 2);
+                              },
+                              ListMemoryScheduler));
 
   // Verify that all instructions are in the sequence.
   EXPECT_EQ(module->entry_computation()->instruction_count(),
@@ -242,7 +243,7 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) {
 }
 
 TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   const Shape r1f32 = ShapeUtil::MakeShape(F32, {4});
 
   // param != 0
@@ -252,7 +253,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
       HloInstruction::CreateParameter(0, r1f32, "cond_param"));
   HloInstruction* zero_vector =
       cond_builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR2<float>({{0, 0, 0, 0}})));
+          LiteralUtil::CreateR1<float>({0, 0, 0, 0})));
   cond_builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector));
   auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
@@ -284,7 +285,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
   };
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, size_fn, ListMemoryScheduler));
+      ScheduleModule(module.get(), size_fn, ListMemoryScheduler));
   // Verify that all instructions are in the sequence.
   auto entry_computation = module->entry_computation();
   EXPECT_EQ(module->entry_computation()->instruction_count(),
@@ -309,5 +310,40 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
                     .ValueOrDie());
 }
 
+TEST_F(HloSchedulingTest, TrivialScheduler) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  param.b = (s32[], s32[]) parameter(0)
+  gte.0 = s32[] get-tuple-element(param.b), index=0
+  gte.1 = s32[] get-tuple-element(param.b), index=1
+  add = s32[] add(gte.0, gte.1)
+  ROOT tuple = (s32[], s32[]) tuple(gte.0, add)
+}
+
+cond {
+  param.c = (s32[], s32[]) parameter(0)
+  ROOT constant = pred[] constant(true)
+}
+
+ENTRY main {
+  init = (s32[], s32[]) parameter(0)
+  ROOT while = (s32[], s32[]) while(init), condition=cond, body=body
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+  EXPECT_FALSE(module->has_schedule());
+  TF_ASSERT_OK(HloTrivialScheduler().Run(module.get()).status());
+  ASSERT_TRUE(module->has_schedule());
+  TF_ASSERT_OK(module->schedule().Verify());
+
+  // Verify that a clone of the module also has a schedule.
+  std::unique_ptr<HloModule> clone = module->Clone();
+  ASSERT_TRUE(clone->has_schedule());
+  TF_ASSERT_OK(clone->schedule().Verify());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 93e04eb3db47ba3dadfbd412733997b92c07da92..fe8371384c0fa3900a9022f101ff0b296439cf16 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -41,18 +41,6 @@ HloModule::HloModule(const string& name, const HloModuleConfig& config)
       config_(config),
       unique_id_(next_unique_module_id_++) {}
 
-StatusOr<HloInstruction*> HloModule::LaunderConstInstructionFromModule(
-    const HloInstruction* hlo) {
-  if (hlo == nullptr) {
-    return nullptr;
-  }
-
-  TF_RET_CHECK(hlo->GetModule() == this);
-
-  // TODO(b/78350259): Eliminate const laundering.
-  return const_cast<HloInstruction*>(hlo);
-}
-
 Status HloModule::set_schedule(HloSchedule schedule) {
   TF_RET_CHECK(schedule.module() == this);
   TF_RETURN_IF_ERROR(schedule.Verify());
@@ -73,6 +61,8 @@ HloComputation* HloModule::AddComputationInternal(
       config_.SetDefaultComputationLayout(
           entry_computation_->ComputeProgramShape());
     }
+    input_output_alias_config_ = HloInputOutputAliasConfig(
+        entry_computation_->root_instruction()->shape());
   }
 
   if (uniquify_identifiers) {
@@ -244,14 +234,16 @@ HloModuleProto HloModule::ToProto() const {
   proto.set_entry_computation_id(entry_computation_->unique_id());
   for (const HloComputation* computation : MakeComputationPostOrder()) {
     HloComputationProto computation_proto = computation->ToProto();
-    if (computation->name() == entry_computation_->name()) {
-      *proto.mutable_program_shape() = computation_proto.program_shape();
-    }
     proto.add_computations()->Swap(&computation_proto);
   }
   if (has_schedule()) {
     *proto.mutable_schedule() = schedule().ToProto().ValueOrDie();
   }
+  *proto.mutable_host_program_shape() =
+      entry_computation_layout().ComputeProgramShape().ToProto();
+  *proto.mutable_input_output_alias() = input_output_alias_config().ToProto();
+  *proto.mutable_dynamic_parameter_binding() =
+      dynamic_parameter_binding().ToProto();
   return proto;
 }
 
@@ -263,9 +255,9 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
   // The ProgramShape in the passed in module config must match the shapes of
   // the entry parameters and root.
-  TF_RET_CHECK(proto.has_program_shape())
+  TF_RET_CHECK(proto.has_host_program_shape())
       << "No program shape found in the proto";
-  const auto& expected_program_shape = proto.program_shape();
+  ProgramShape expected_program_shape(proto.host_program_shape());
   TF_RET_CHECK(expected_program_shape.parameters_size() ==
                module_config.entry_computation_layout().parameter_count());
   for (int i = 0; i < expected_program_shape.parameters_size(); ++i) {
@@ -328,8 +320,17 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   }
   TF_RET_CHECK(module->entry_computation_ != nullptr);
 
+  TF_ASSIGN_OR_RETURN(
+      module->input_output_alias_config_,
+      HloInputOutputAliasConfig::CreateFromProto(
+          entry->ComputeProgramShape().result(), proto.input_output_alias()));
+
   // Because we didn't uniquify the names or the ids, double-check that the
   // instruction and computation names and ids are unique from the proto.
+  TF_ASSIGN_OR_RETURN(module->dynamic_parameter_binding_,
+                      DynamicParameterBinding::CreateFromProto(
+                          proto.dynamic_parameter_binding()));
+
   absl::flat_hash_set<string> computation_names;
   absl::flat_hash_set<string> instruction_names;
   absl::flat_hash_set<int> computation_ids;
@@ -366,11 +367,11 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 /* static */
 StatusOr<HloModuleConfig> HloModule::CreateModuleConfigFromProto(
     const HloModuleProto& module, const DebugOptions& debug_options) {
-  TF_RET_CHECK(module.has_program_shape())
+  TF_RET_CHECK(module.has_host_program_shape())
       << "No program shape found in the proto";
-  const auto& program_shape = module.program_shape();
+  ProgramShape program_shape(module.host_program_shape());
 
-  HloModuleConfig module_config(program_shape);
+  HloModuleConfig module_config(ProgramShape{program_shape});
   module_config.set_debug_options(debug_options);
 
   // The module config is constructed with default layouts regardless of what is
@@ -558,12 +559,34 @@ std::vector<HloComputation*> HloModule::MakeNonfusionComputations() const {
 }
 
 std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
+  return Clone(config(), suffix);
+}
+
+std::unique_ptr<HloModule> HloModule::Clone(const HloModuleConfig& config,
+                                            const string& suffix) const {
   VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n";
-  auto module = absl::make_unique<HloModule>(name_ + "-" + suffix, config_);
+  auto module = absl::make_unique<HloModule>(
+      absl::StrCat(name_, suffix.empty() ? "" : "-", suffix), config);
 
   HloCloneContext context(module.get(), suffix);
   auto cloned_computation = entry_computation_->Clone(suffix, &context);
   module->AddEntryComputation(std::move(cloned_computation));
+
+  if (has_schedule() && schedule().Verify().ok()) {
+    HloSchedule clone_schedule(module.get());
+    for (HloComputation* computation : computations()) {
+      if (schedule().is_computation_scheduled(computation)) {
+        HloInstructionSequence& clone_sequence =
+            clone_schedule.GetOrCreateSequence(
+                context.GetComputation(computation));
+        for (const HloInstruction* instruction :
+             schedule().sequence(computation).instructions()) {
+          clone_sequence.push_back(context.GetInstruction(instruction));
+        }
+      }
+    }
+    TF_CHECK_OK(module->set_schedule(std::move(clone_schedule)));
+  }
   return module;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 735804e827afd77e2b7f2a4a7d490ee6f5ee7b4f..7b9cbf9a53a2201b1312405bbd7ed2b88f65c9be 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -28,9 +28,11 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/iterator_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
@@ -92,6 +94,8 @@ class HloModule {
 
   // Returns a deep copy of this module including all computations.
   std::unique_ptr<HloModule> Clone(const string& suffix = "clone") const;
+  std::unique_ptr<HloModule> Clone(const HloModuleConfig& config,
+                                   const string& suffix = "clone") const;
 
   // Performs a deep clone of the computation, by recursively cloning all
   // the called computations as well. If the clone context is specified, it
@@ -99,14 +103,18 @@ class HloModule {
   HloComputation* DeepCloneComputation(HloComputation* computation,
                                        HloCloneContext* context = nullptr);
 
-  // Return a pointer to the entry computation of the module..
-  const HloComputation* entry_computation() const {
+  // Return a pointer to the entry computation of the module.
+  HloComputation* entry_computation() const {
     CHECK_NE(nullptr, entry_computation_);
     return entry_computation_;
   }
-  HloComputation* entry_computation() {
+
+  // Returns the root instruction shape of entry computation.
+  //
+  // Precondition: entry_computation_ is not nullptr.
+  const Shape& result_shape() const {
     CHECK_NE(nullptr, entry_computation_);
-    return entry_computation_;
+    return entry_computation()->root_instruction()->shape();
   }
 
   // Creates the ComputationLayout which describes the current status of the HLO
@@ -124,6 +132,12 @@ class HloModule {
     return config_.entry_computation_layout();
   }
 
+  // Generates a hash value of an HLO module. Hash considers
+  // information on opcode, shape, operands, and typically a root instruction.
+  // This function returns the same hash value for equivalent HLO modules,
+  // with respect to HloInstruction::Identical() method.
+  uint64 Hash() const { return entry_computation()->Hash(); }
+
   // Gets the computations in this module.
   //
   // Returns a view of HloComputation*s, so you can iterate over this in the
@@ -212,33 +226,29 @@ class HloModule {
     return result;
   }
 
-  // Returns the number of unique intruction ids given out.  All ids up to
-  // this point are guaranteed to be in the range [0..NumUniqueInstructionIds())
-  int NumUniqueInstructionIds() const { return next_unique_id_; }
+  // input_output_alias_config indicates the list of aliased buffers that are
+  // expected from the module.
+  HloInputOutputAliasConfig& input_output_alias_config() {
+    return input_output_alias_config_;
+  }
+  const HloInputOutputAliasConfig& input_output_alias_config() const {
+    return input_output_alias_config_;
+  }
+
+  // DynamicParameterBinding holds the list of bindings that indicates which
+  // parameter dimensions are dynamic and which parameters represent their
+  // runtime value.
+  DynamicParameterBinding& dynamic_parameter_binding() {
+    return dynamic_parameter_binding_;
+  }
+  const DynamicParameterBinding& dynamic_parameter_binding() const {
+    return dynamic_parameter_binding_;
+  }
 
   // Returns an id that is unique to this module across all modules created over
   // the lifetime of this process.
   int unique_id() const { return unique_id_; }
 
-  // Returns a non-const version of the passed-in const HloInstruction*. This is
-  // safe on the argument that if you have a non-const module, then you can
-  // access all instructions in the module as non-const.
-  //
-  // Returns an error if the passed-in instruction is not from this module,
-  // except that it is allowed to pass in a null pointer.
-  //
-  // TODO(b/78350259): Eliminate const laundering. The argument above is not
-  // reliable since at any time someone could add or discover a way for a
-  // non-const module to transitively contain a const HloInstruction. The
-  // reliable way to do this would be to create a const laundering map from a
-  // module, mapping each encountered HloInstruction to its non-const version
-  // and then look up each instruction in need of laundering in that map, but
-  // this is much more expensive and complicated. This returns a Status instead
-  // of doing a CHECK-failure in part to make it strongly apparent that this is
-  // something that can fail.
-  StatusOr<HloInstruction*> LaunderConstInstructionFromModule(
-      const HloInstruction* hlo);
-
   // Sets the schedule of the module to the given schedule.
   Status set_schedule(HloSchedule schedule);
 
@@ -284,6 +294,13 @@ class HloModule {
   // sequential order of instructions for each non-fusion computation in the
   // module.
   absl::optional<HloSchedule> schedule_;
+
+  // alias_config indicates the alias information of input/output buffers that
+  // are expected from the module.
+  HloInputOutputAliasConfig input_output_alias_config_;
+
+  // Bindings for dynamic parameter mapping.
+  DynamicParameterBinding dynamic_parameter_binding_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_group.cc b/tensorflow/compiler/xla/service/hlo_module_group.cc
index f9b56ef4643f2ca88e56456ae6c990161adb5085..69d57c3f146f17ebbddef1ed972b92a587d67be7 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group.cc
@@ -17,9 +17,8 @@ limitations under the License.
 
 namespace xla {
 
-HloModuleGroup::HloModuleGroup(absl::string_view name,
-                               std::unique_ptr<HloModule> module)
-    : name_(name) {
+HloModuleGroup::HloModuleGroup(std::unique_ptr<HloModule> module)
+    : name_(module->name()) {
   push_back(std::move(module));
 }
 
@@ -31,6 +30,14 @@ HloModuleGroup::HloModuleGroup(absl::string_view name,
   }
 }
 
+HloModuleGroup::HloModuleGroup(
+    absl::string_view name, std::vector<std::unique_ptr<HloModule>>&& modules)
+    : name_(name) {
+  for (auto& module : modules) {
+    push_back(std::move(module));
+  }
+}
+
 std::vector<std::unique_ptr<HloModule>> HloModuleGroup::ConsumeModules() {
   std::vector<std::unique_ptr<HloModule>> ret_modules = std::move(modules_);
 
@@ -83,6 +90,12 @@ void HloModuleGroup::push_back(std::unique_ptr<HloModule> module) {
   module_ptrs_.push_back(modules_.back().get());
 }
 
+void HloModuleGroup::ReplaceModule(int index,
+                                   std::unique_ptr<HloModule> module) {
+  modules_.at(index) = std::move(module);
+  module_ptrs_.at(index) = modules_.at(index).get();
+}
+
 std::ostream& operator<<(std::ostream& out, const HloModuleGroup& group) {
   out << group.ToString();
   return out;
diff --git a/tensorflow/compiler/xla/service/hlo_module_group.h b/tensorflow/compiler/xla/service/hlo_module_group.h
index 7338be8b9c5ed47f0ba5829cc1d603b21f00b6e0..c4b10f3b22ab2aa0a346cae4e2d0d87496722368 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group.h
@@ -35,11 +35,13 @@ class HloModuleGroup {
   explicit HloModuleGroup(absl::string_view name) : name_(name) {}
 
   // Construct a module group containing a single module.
-  HloModuleGroup(absl::string_view name, std::unique_ptr<HloModule> module);
+  explicit HloModuleGroup(std::unique_ptr<HloModule> module);
 
   // Construct a module group containing any number of modules.
   HloModuleGroup(absl::string_view name,
                  absl::Span<std::unique_ptr<HloModule>> modules);
+  HloModuleGroup(absl::string_view name,
+                 std::vector<std::unique_ptr<HloModule>>&& modules);
 
   // Returns the modules contained in the group.
   const std::vector<HloModule*>& modules() const { return module_ptrs_; }
@@ -50,11 +52,16 @@ class HloModuleGroup {
   // Add a module to the back of vector of modules in the group.
   void push_back(std::unique_ptr<HloModule> module);
 
+  // Replaces the existing module at the given index with the given module. The
+  // existing module is discarded.
+  void ReplaceModule(int index, std::unique_ptr<HloModule> module);
+
   // Moves all modules from the group into the returned vector. After this
   // method runs, the module group will be empty.
   std::vector<std::unique_ptr<HloModule>> ConsumeModules();
 
   string name() const { return name_; }
+
   string ToString() const;
 
   // Serialize the module group to/from a proto.
@@ -63,6 +70,12 @@ class HloModuleGroup {
       const HloModuleGroupProto& proto,
       absl::Span<const HloModuleConfig> module_configs);
 
+  // Returns the number of modules in the module group.
+  int size() const { return modules_.size(); }
+
+  // Returns true if there are no modules in the module group.
+  bool empty() const { return modules_.empty(); }
+
  private:
   string name_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index 83352ef91b35b61ee2560b1488ee2ecdff6bea0a..b4aac4c8076cb69647d42c6243bc969d06d0709e 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -59,7 +59,7 @@ string HloModuleGroupMetadata::TrackedInstruction::ToString() const {
 }
 
 /* static */ StatusOr<std::unique_ptr<HloModuleGroupMetadata>>
-HloModuleGroupMetadata::Build(const std::vector<HloModule*>& modules) {
+HloModuleGroupMetadata::Build(absl::Span<HloModule* const> modules) {
   auto metadata = absl::make_unique<HloModuleGroupMetadata>(modules);
   TF_RETURN_IF_ERROR(metadata->Build());
   return std::move(metadata);
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 0311b7320721e98ab80ff0a28adb2e8fe53cee9b..928df0f5a7444ad877961a5de970c752e1d024da 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -102,14 +102,14 @@ class HloModuleGroupMetadata {
     HloInstruction* recv_done = nullptr;
   };
 
-  explicit HloModuleGroupMetadata(const std::vector<HloModule*>& modules)
-      : modules_(modules) {}
+  explicit HloModuleGroupMetadata(absl::Span<HloModule* const> modules)
+      : modules_(modules.begin(), modules.end()) {}
 
   ~HloModuleGroupMetadata() = default;
 
   // Build and return the metadata for the given modules.
   static StatusOr<std::unique_ptr<HloModuleGroupMetadata>> Build(
-      const std::vector<HloModule*>& modules);
+      absl::Span<HloModule* const> modules);
 
   // Returns true if the instruction is one of the 4 channel instructions (Send,
   // Recv, SendDone, RecvDone).
@@ -274,7 +274,7 @@ class HloModuleGroupMetadata {
   int64 max_channel_id_ = -1;
 
   // The modules that this metadata was built from.
-  const std::vector<HloModule*>& modules_;
+  const std::vector<HloModule*> modules_;
 
   absl::flat_hash_map<HloModule*, std::unique_ptr<TuplePointsToAnalysis>>
       points_to_analyses_;
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_test.cc b/tensorflow/compiler/xla/service/hlo_module_group_test.cc
index b7b12cb72b8df4610b964fb842da78e160d22d9f..5a9a86af5649bf240bb5de6d30fc80b0f6a58eba 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_test.cc
@@ -46,7 +46,7 @@ ENTRY %entry (x: f32[], y: f32[]) -> f32[] {
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseHloString(text));
-  HloModuleGroup group(TestName(), std::move(module));
+  HloModuleGroup group(std::move(module));
 
   EXPECT_EQ(group.modules().size(), 1);
   EXPECT_THAT(
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index 39f38b417ab0e8b54864176d8d1e0ad1a422eca6..620cb7e01ad1a060915f5b73474f6950ab18122a 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -63,7 +63,7 @@ class HloModuleTest : public HloTestBase {
 
 TEST_F(HloModuleTest, OneComputationPostOrder) {
   // Create a module with a single computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(CreateConstantComputation());
 
   EXPECT_THAT(module->MakeComputationPostOrder(),
@@ -72,7 +72,7 @@ TEST_F(HloModuleTest, OneComputationPostOrder) {
 
 TEST_F(HloModuleTest, TwoComputationsPostOrder) {
   // Create a module with two unconnected computations.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation1 = module->AddEntryComputation(CreateConstantComputation());
   auto computation2 =
       module->AddEmbeddedComputation(CreateConstantComputation());
@@ -88,7 +88,7 @@ TEST_F(HloModuleTest, TwoComputationsPostOrder) {
 
 TEST_F(HloModuleTest, CloneTest) {
   // Create and copy a module with a diamond call graph of computations.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation1 =
       module->AddEmbeddedComputation(CreateConstantComputation());
   auto computation2 =
@@ -111,7 +111,7 @@ TEST_F(HloModuleTest, CloneTest) {
 }
 
 TEST_F(HloModuleTest, CloneHasFusion) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   // Create the fused computation.
   HloComputation* fused_computation;
@@ -154,7 +154,7 @@ TEST_F(HloModuleTest, CloneHasFusion) {
 
 TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
   // Create a module with a diamond call graph of computations.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation1 =
       module->AddEmbeddedComputation(CreateConstantComputation());
   auto computation2 =
@@ -174,7 +174,7 @@ TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
 
 TEST_F(HloModuleTest, LargeConstantToString) {
   // Create a module with a single computation.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("Constant");
   std::vector<float> values(16, 42.0);
   builder.AddInstruction(
@@ -194,8 +194,8 @@ TEST_F(HloModuleTest, LargeConstantToString) {
 }
 
 TEST_F(HloModuleTest, UniqueModuleId) {
-  auto module_a = CreateNewModule();
-  auto module_b = CreateNewModule();
+  auto module_a = CreateNewVerifiedModule();
+  auto module_b = CreateNewVerifiedModule();
   EXPECT_NE(module_a->unique_id(), module_b->unique_id());
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index e6bfb8025d4bfeba1d334d1f946e33841a2da092..127cfd165a5d8229cac3035f56a66f1bcfa734f3 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -47,6 +47,8 @@ namespace xla {
 #define HLO_OPCODE_LIST(V)                                   \
   V(kAbs, "abs")                                             \
   V(kAdd, "add")                                             \
+  V(kAddDependency, "add-dependency")                        \
+  V(kAfterAll, "after-all", kHloOpcodeIsVariadic)            \
   V(kAllToAll, "all-to-all")                                 \
   V(kAtan2, "atan2")                                         \
   V(kBatchNormGrad, "batch-norm-grad")                       \
@@ -83,7 +85,7 @@ namespace xla {
   V(kFusion, "fusion", kHloOpcodeIsVariadic)                 \
   V(kGather, "gather")                                       \
   V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \
-  V(kAfterAll, "after-all", kHloOpcodeIsVariadic)            \
+  V(kGetDimensionSize, "get-dimension-size")                 \
   V(kGetTupleElement, "get-tuple-element")                   \
   V(kGt, "greater-than", kHloOpcodeIsComparison)             \
   V(kImag, "imag")                                           \
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 23d41d91d6969ddf9062507e926ae39c1e1315d4..ca6a154809be46d6a0305c29e2b89219de408019 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -334,7 +334,7 @@ DependencyHloOrdering::DependencyHloOrdering(const HloModule* module)
   // ordering based on dependencies. ExecutesBefore will return true iff there
   // exists a path in the HLO computation graph from 'a' to 'b'.
   for (auto* computation : module->MakeNonfusionComputations()) {
-    predecessors_.emplace(computation, computation->ComputeReachability());
+    predecessors_.emplace(computation, HloReachabilityMap::Build(computation));
   }
 }
 
@@ -356,8 +356,7 @@ void SequentialHloOrdering::Initialize() {
   // Create a map from instruction to its order position.
   TF_DCHECK_OK(schedule_.Verify());
   for (const auto& computation_sequence : schedule_.sequences()) {
-    const std::vector<const HloInstruction*>& order =
-        computation_sequence.second.instructions();
+    const auto& order = computation_sequence.second.instructions();
     for (int i = 0; i < order.size(); ++i) {
       InsertOrDie(&order_position_, order[i], i);
     }
@@ -374,11 +373,10 @@ bool SequentialHloOrdering::ExecutesBeforeInSameComputation(
   return order_position_.at(a) < order_position_.at(b);
 }
 
-const std::vector<const HloInstruction*>*
-SequentialHloOrdering::SequentialOrder(
+const HloInstructionSequence* SequentialHloOrdering::SequentialOrder(
     const HloComputation& computation) const {
   return schedule_.is_computation_scheduled(&computation)
-             ? &schedule_.sequence(&computation).instructions()
+             ? &schedule_.sequence(&computation)
              : nullptr;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.h b/tensorflow/compiler/xla/service/hlo_ordering.h
index 66313492eb2dd10ac9a6000639ddb8991b367c0f..a07214c22c0989a438f12219e136a7e76ee0dcce 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.h
+++ b/tensorflow/compiler/xla/service/hlo_ordering.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/hlo_value.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -64,7 +65,7 @@ class HloOrdering {
 
   // Returns the sequential instruction order for the given computation, or
   // nullptr if the computation does not have a sequential ordering.
-  virtual const std::vector<const HloInstruction*>* SequentialOrder(
+  virtual const HloInstructionSequence* SequentialOrder(
       const HloComputation& computation) const = 0;
 
   // Return the call graph of the module used to compute ordering.
@@ -96,7 +97,7 @@ class PredecessorHloOrdering : public HloOrdering {
 
   // Returns nullptr indicating the computation does not have a sequential
   // ordering.
-  const std::vector<const HloInstruction*>* SequentialOrder(
+  const HloInstructionSequence* SequentialOrder(
       const HloComputation& computation) const override {
     return nullptr;
   }
@@ -185,7 +186,7 @@ class SequentialHloOrdering : public HloOrdering {
   ~SequentialHloOrdering() override = default;
 
   // Returns the sequential instruction order for the given computation.
-  const std::vector<const HloInstruction*>* SequentialOrder(
+  const HloInstructionSequence* SequentialOrder(
       const HloComputation& computation) const override;
 
   string ToString() const override;
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index b045adc9640ac0ca8cf4a127fea2fbfcbb1aaf3f..3ca77e60cd5275c22eb0e338cd5437fc44b49958 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -53,7 +53,7 @@ TEST_F(HloOrderingTest, InstructionsInDifferentComputations) {
   //   %c = Constant(42.0f)
   //
   // This results in a diamond-shaped callgraph.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto builder_c = HloComputation::Builder("C");
@@ -126,7 +126,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) {
   //   %constant = Constant(1.0)
   //   return While(%constant, body, condition)
   //
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto body_builder = HloComputation::Builder("body");
@@ -176,7 +176,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) {
 
 TEST_F(HloOrderingTest, ParametersDefinedBeforeOthers) {
   // Entry parameter should always be defined before other instruction.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
@@ -209,7 +209,7 @@ TEST_F(HloOrderingTest, ValuesInWhileComputations) {
   //   %while = While(%constant, body, condition)
   //   %add = Add(%constant, %while)
   //
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto body_builder = HloComputation::Builder("body");
@@ -407,7 +407,7 @@ TEST_F(HloOrderingTest,
   //   %dead = Constant(123.0)
   //
   // %root should interfere with %dead.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto builder = HloComputation::Builder(TestName());
@@ -455,7 +455,7 @@ TEST_F(HloOrderingTest,
   //   ROOT %call = call({%c}), subcomputation
   //
   // %root should interfere with %dead.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto subbuilder = HloComputation::Builder(TestName() + ".sub");
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index dd62988bccf7a0b2daa0bd39fc642452c768fceb..9b5bb5d0bd6af104ef62eaa5d3e53cedbe0213d3 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -47,11 +47,11 @@ const double kF16max = 65504;
 
 // Creates and returns a schedule created using the order of the instructions in
 // the HloComputation::instructions() vectors in the module.
-HloSchedule ScheduleFromInstructionOrder(const HloModule* module) {
+HloSchedule ScheduleFromInstructionOrder(HloModule* module) {
   HloSchedule schedule(module);
-  for (const HloComputation* computation : module->computations()) {
+  for (HloComputation* computation : module->computations()) {
     if (!computation->IsFusionComputation()) {
-      for (const HloInstruction* instruction : computation->instructions()) {
+      for (HloInstruction* instruction : computation->instructions()) {
         schedule.GetOrCreateSequence(computation).push_back(instruction);
       }
     }
@@ -108,7 +108,7 @@ class HloParser {
   bool ParseInstructionList(HloComputation** computation,
                             const string& computation_name);
   bool ParseInstruction(HloComputation::Builder* builder, string* root_name);
-  bool ParseInstruciontRhs(HloComputation::Builder* builder, const string& name,
+  bool ParseInstructionRhs(HloComputation::Builder* builder, const string& name,
                            LocTy name_loc);
   bool ParseControlPredecessors(HloInstruction* instruction);
   bool ParseLiteral(Literal* literal, const Shape& shape);
@@ -174,6 +174,7 @@ class HloParser {
     kDistribution,
     kDomain,
     kPrecisionList,
+    kShapeList
   };
 
   struct AttrConfig {
@@ -240,6 +241,7 @@ class HloParser {
 
   bool ParseSliceRanges(SliceRanges* result);
   bool ParsePrecisionList(std::vector<PrecisionConfig::Precision>* result);
+  bool ParseShapeList(std::vector<Shape>* result);
   bool ParseInt64List(const TokKind start, const TokKind end,
                       const TokKind delim,
                       std::vector<tensorflow::int64>* result);
@@ -416,6 +418,18 @@ std::pair<HloInstruction*, HloParser::LocTy>* HloParser::FindInstruction(
     }
     return create_missing_instruction_(name, *shape);
   }
+
+  if (instr != nullptr && shape.has_value() &&
+      !ShapeUtil::Compatible(instr->first->shape(), shape.value())) {
+    Error(
+        lexer_.GetLoc(),
+        StrCat("The declared operand shape ",
+               ShapeUtil::HumanStringWithLayout(shape.value()),
+               " is not compatible with the shape of the operand instruction ",
+               ShapeUtil::HumanStringWithLayout(instr->first->shape()), "."));
+    return nullptr;
+  }
+
   return instr;
 }
 
@@ -594,10 +608,10 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     *root_name = name;
   }
 
-  return ParseInstruciontRhs(builder, name, name_loc);
+  return ParseInstructionRhs(builder, name, name_loc);
 }
 
-bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
+bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
                                     const string& name, LocTy name_loc) {
   Shape shape;
   HloOpcode opcode;
@@ -836,9 +850,16 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
       }
       break;
     }
+    case HloOpcode::kAddDependency: {
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateAddDependency(operands[0], operands[1]));
+      break;
+    }
     case HloOpcode::kSort: {
-      auto loc = lexer_.GetLoc();
-
       optional<std::vector<tensorflow::int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
@@ -846,20 +867,10 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
           dimensions->size() != 1) {
         return false;
       }
-      switch (operands.size()) {
-        case 1:
-          instruction = builder->AddInstruction(HloInstruction::CreateSort(
-              shape, dimensions->at(0), /*keys=*/operands[0]));
-          break;
-        case 2:
-          instruction = builder->AddInstruction(HloInstruction::CreateSort(
-              shape, dimensions->at(0),
-              /*keys=*/operands[0], /*values=*/operands[1]));
-          break;
-        default:
-          return Error(loc, StrCat("expects either 1 or 2 operands, but has ",
-                                   operands.size(), " operands"));
-      }
+      instruction = builder->AddInstruction(HloInstruction::CreateSort(
+          shape, dimensions->at(0),
+          /*keys=*/operands[0],
+          /*values=*/absl::Span<HloInstruction* const>(operands).subspan(1)));
       break;
     }
     case HloOpcode::kTuple: {
@@ -1099,8 +1110,8 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
           absl::Span<HloInstruction* const>(operands).subspan(
               0, operands.size() / 2),
           /*init_values=*/
-          absl::Span<HloInstruction* const>(operands).subspan(
-              operands.size() / 2, operands.size()),
+          absl::Span<HloInstruction* const>(operands).subspan(operands.size() /
+                                                              2),
           *dimensions_to_reduce, *reduce_computation));
       break;
     }
@@ -1341,6 +1352,7 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
       optional<Window> window;
       optional<ConvolutionDimensionNumbers> dnums;
       optional<int64> feature_group_count;
+      optional<std::vector<Shape>> operand_layout_constraints;
       attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString,
                                      &custom_call_target};
       attrs["opaque"] = {/*required=*/false, AttrTy::kString, &opaque};
@@ -1349,12 +1361,52 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
                              AttrTy::kConvolutionDimensionNumbers, &dnums};
       attrs["feature_group_count"] = {/*required=*/false, AttrTy::kInt64,
                                       &feature_group_count};
+      attrs["operand_layout_constraints"] = {
+          /*required=*/false, AttrTy::kShapeList, &operand_layout_constraints};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
-      instruction = builder->AddInstruction(
-          HloInstruction::CreateCustomCall(shape, operands, *custom_call_target,
-                                           opaque.has_value() ? *opaque : ""));
+      if (operand_layout_constraints.has_value()) {
+        if (!LayoutUtil::HasLayout(shape)) {
+          return Error(lexer_.GetLoc(),
+                       "Layout must be set on layout-constrained custom call");
+        }
+        if (operands.size() != operand_layout_constraints->size()) {
+          return Error(lexer_.GetLoc(),
+                       StrCat("Expected ", operands.size(),
+                              " operand layout constraints, ",
+                              operand_layout_constraints->size(), " given"));
+        }
+        for (int64 i = 0; i < operands.size(); ++i) {
+          const Shape& operand_shape_with_layout =
+              (*operand_layout_constraints)[i];
+          if (!LayoutUtil::HasLayout(operand_shape_with_layout)) {
+            return Error(lexer_.GetLoc(),
+                         StrCat("Operand layout constraint shape ",
+                                ShapeUtil::HumanStringWithLayout(
+                                    operand_shape_with_layout),
+                                " for operand ", i, " does not have a layout"));
+          }
+          if (!ShapeUtil::Compatible(operand_shape_with_layout,
+                                     operands[i]->shape())) {
+            return Error(
+                lexer_.GetLoc(),
+                StrCat(
+                    "Operand layout constraint shape ",
+                    ShapeUtil::HumanStringWithLayout(operand_shape_with_layout),
+                    " for operand ", i,
+                    " is not compatible with operand shape ",
+                    ShapeUtil::HumanStringWithLayout(operands[i]->shape())));
+          }
+        }
+        instruction = builder->AddInstruction(HloInstruction::CreateCustomCall(
+            shape, operands, *custom_call_target, *operand_layout_constraints,
+            opaque.has_value() ? *opaque : ""));
+      } else {
+        instruction = builder->AddInstruction(HloInstruction::CreateCustomCall(
+            shape, operands, *custom_call_target,
+            opaque.has_value() ? *opaque : ""));
+      }
       if (window.has_value()) {
         instruction->set_window(*window);
       }
@@ -1504,6 +1556,18 @@ bool HloParser::ParseInstruciontRhs(HloComputation::Builder* builder,
     case HloOpcode::kTrace:
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
+    case HloOpcode::kGetDimensionSize:
+      optional<std::vector<tensorflow::int64>> dimensions;
+      attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                             &dimensions};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateGetDimensionSize(
+              shape, operands[0], (*dimensions)[0]));
+      break;
   }
 
   instruction->SetAndSanitizeName(name);
@@ -1763,6 +1827,10 @@ bool HloParser::SetValueInLiteral(tensorflow::int64 value,
     case U64:
       return SetValueInLiteralHelper<tensorflow::uint64>(value, linear_index,
                                                          literal);
+    case PRED:
+      // Bool type literals with rank >= 1 are printed in 0s and 1s.
+      return SetValueInLiteralHelper<bool>(static_cast<bool>(value),
+                                           linear_index, literal);
     default:
       LOG(FATAL) << "unknown integral primitive type "
                  << PrimitiveType_Name(shape.element_type());
@@ -2017,14 +2085,13 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
         }
         if (lexer_.GetKind() == TokKind::kw_true ||
             lexer_.GetKind() == TokKind::kw_false) {
-          // TODO(congliu): bool type literals with rank >= 1 are actually
-          // printed in a compact form instead of "true" or "false". Fix that.
           if (!SetValueInLiteral(lexer_.GetKind() == TokKind::kw_true,
                                  linear_index++, literal)) {
             return false;
           }
           lexer_.Lex();
-        } else if (primitive_util::IsIntegralType(shape.element_type())) {
+        } else if (primitive_util::IsIntegralType(shape.element_type()) ||
+                   shape.element_type() == PRED) {
           LocTy loc = lexer_.GetLoc();
           tensorflow::int64 value;
           if (!ParseInt64(&value)) {
@@ -2533,6 +2600,15 @@ bool HloParser::ParseAttributeHelper(
             ->emplace(result);
         return true;
       }
+      case AttrTy::kShapeList: {
+        std::vector<Shape> result;
+        if (!ParseShapeList(&result)) {
+          return false;
+        }
+        static_cast<optional<std::vector<Shape>>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
     }
   }();
   if (!success) {
@@ -2653,7 +2729,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
 
   // The str is expected to have 3 items, lhs, rhs, out, and it must look like
   // lhs_rhs->out, that is, the first separator is "_" and the second is "->".
-  std::vector<string> split1 = absl::StrSplit(str, "_");
+  std::vector<string> split1 = absl::StrSplit(str, '_');
   if (split1.size() != 2) {
     LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees "
                << str;
@@ -2825,6 +2901,23 @@ bool HloParser::ParsePrecisionList(
                    parse_and_add_item);
 }
 
+// shapelist ::= '{' shapes '}'
+// precision_elements
+//   ::= /*empty*/
+//   ::= shape (',' shape)*
+bool HloParser::ParseShapeList(std::vector<Shape>* result) {
+  auto parse_and_add_item = [&]() {
+    Shape shape;
+    if (!ParseShape(&shape)) {
+      return false;
+    }
+    result->push_back(std::move(shape));
+    return true;
+  };
+  return ParseList(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
+                   parse_and_add_item);
+}
+
 // int64list ::= start int64_elements end
 // int64_elements
 //   ::= /*empty*/
@@ -2832,23 +2925,15 @@ bool HloParser::ParsePrecisionList(
 bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
                                const TokKind delim,
                                std::vector<tensorflow::int64>* result) {
-  if (!ParseToken(start, StrCat("expects an int64 list starting with ",
-                                TokKindToString(start)))) {
-    return false;
-  }
-  if (lexer_.GetKind() == end) {
-    // empty
-  } else {
-    do {
-      tensorflow::int64 i;
-      if (!ParseInt64(&i)) {
-        return false;
-      }
-      result->push_back(i);
-    } while (EatIfPresent(delim));
-  }
-  return ParseToken(
-      end, StrCat("expects an int64 list to end with ", TokKindToString(end)));
+  auto parse_and_add_item = [&]() {
+    tensorflow::int64 i;
+    if (!ParseInt64(&i)) {
+      return false;
+    }
+    result->push_back(i);
+    return true;
+  };
+  return ParseList(start, end, delim, parse_and_add_item);
 }
 
 bool HloParser::ParseList(const TokKind start, const TokKind end,
@@ -2933,7 +3018,8 @@ bool HloParser::ParseShape(Shape* result) {
   }
 
   if (lexer_.GetKind() != TokKind::kShape) {
-    return TokenError("expects shape");
+    return TokenError(absl::StrCat("expected shape, saw ",
+                                   TokKindToString(lexer_.GetKind())));
   }
   *result = lexer_.GetShapeVal();
   lexer_.Lex();
@@ -3324,7 +3410,7 @@ bool HloParser::ParseSingleInstruction(HloModule* module) {
     // e.g.
     //
     //  f32[10] fusion(...), calls={...}
-    if (!ParseInstruciontRhs(&builder, module->name(), lexer_.GetLoc())) {
+    if (!ParseInstructionRhs(&builder, module->name(), lexer_.GetLoc())) {
       return false;
     }
   } else {
diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
index 81eeb9f13bf7f06123c0b35e9f3352c197866a7a..d830fa61438239005875f785f85cf2486123ebc9 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -44,7 +44,9 @@ Status ParseHloString(absl::string_view str, HloModule* module);
 // creates a HloModule with default config.
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str);
 
-// Parses the result of HloSharding::ToString(), e.g. "{replicated}".
+// ParseHloString sharding from str. str is supposed to contain the body of the
+// sharding, i.e. just the rhs of the "sharding={...}" attribute string,
+// e.g., "{replicated}".
 StatusOr<HloSharding> ParseSharding(absl::string_view str);
 
 // Parses the result of window_util::ToString(const Window&).
@@ -55,10 +57,6 @@ StatusOr<Window> ParseWindow(absl::string_view str);
 StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
     absl::string_view str);
 
-// ParseHloString sharding from str. str is supposed to contain the body of the
-// sharding, i.e. just the rhs of the "sharding={...}" attribute string.
-StatusOr<HloSharding> ParseSharding(absl::string_view str);
-
 // Parses the result of PaddingConfigToString(), e.g. "0_0x1_1".
 StatusOr<PaddingConfig> ParsePaddingConfig(absl::string_view str);
 
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 255123d331c91b1c862980b9248afe9a03d564c8..ab71f011ac9d77d00ddfb41aca7a224d26d416b7 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -21,7 +21,8 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -29,7 +30,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
-namespace op = ::xla::testing::opcode_matchers;
+namespace m = ::xla::match;
 using absl::string_view;
 
 struct TestData {
@@ -75,6 +76,18 @@ ENTRY %constant_pred () -> pred[] {
 
 )"
 },
+// pred array constant
+{
+"ConstantPredArray",
+R"(HloModule module
+
+ENTRY %constant_pred_array () -> pred[2,3] {
+  ROOT %constant = pred[2,3]{1,0} constant(pred[2,3] { { 0, 1, 0 }, { 1, 0, 1 } })
+}
+
+)"
+},
+
 // s32 constant
 {
 "ConstantS32",
@@ -183,7 +196,7 @@ ENTRY %add_constants () -> f32[] {
 R"(HloModule TupleConstant_module
 
 ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
-  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} ))
+  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { {1}, {2} }, {2, 42} ))
 }
 
 )"
@@ -575,7 +588,7 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
 R"(HloModule BasicTraining_module
 
 ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
-  %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} }, { /*i1=1*/ {7, 8} } } })
+  %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ { 1, 2 } }, { /*i1=1*/ { 3, 4 } } }, { /*i0=1*/ { /*i1=0*/ { 5, 6 } }, { /*i1=1*/ { 7, 8 } } } })
   %constant.1 = f32[2]{0} constant({2, 3})
   %constant.2 = f32[2]{0} constant({1, 2})
   ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3
@@ -802,6 +815,43 @@ ENTRY %ConstantUnsignedNoOverflow () -> u64[] {
   ROOT %constant = u64[] constant(9223372036854775807)
 }
 
+)"
+},
+// CustomCallWithLayoutConstraints
+{
+"CustomCallWithLayoutConstraints",
+R"(HloModule CustomCallWithLayoutConstraints
+
+ENTRY %CustomCallWithLayoutConstraints (p0: f32[42,2,3], p1: f32[123,4]) -> f32[1,2,3] {
+  %p0 = f32[42,2,3]{0,1,2} parameter(0)
+  %p1 = f32[123,4]{0,1} parameter(1)
+  ROOT %custom-call = f32[1,2,3]{0,2,1} custom-call(f32[42,2,3]{0,1,2} %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={f32[42,2,3]{0,1,2}, f32[123,4]{1,0}}
+}
+
+)"
+},
+// CustomCallWithLayoutConstraintsNoOperands
+{
+"CustomCallWithLayoutConstraintsNoOperands",
+R"(HloModule CustomCallWithLayoutConstraintsNoOperands
+
+ENTRY %CustomCallWithLayoutConstraints () -> f32[1,2,3] {
+  ROOT %custom-call = f32[1,2,3]{0,2,1} custom-call(), custom_call_target="baz", operand_layout_constraints={}
+}
+
+)"
+},
+// CustomCallWithLayoutConstraintsTupleShapes
+{
+"CustomCallWithLayoutConstraintsTupleShapes",
+R"(HloModule CustomCallWithLayoutConstraintsTupleShapes
+
+ENTRY %CustomCallWithLayoutConstraints (p0: (f32[2,2], f32[42,2,3]), p1: f32[123,4]) -> (f32[1,2,3], f32[1,2,3]) {
+  %p0 = (f32[2,2]{0,1}, f32[42,2,3]{0,1,2}) parameter(0)
+  %p1 = f32[123,4]{0,1} parameter(1)
+  ROOT %custom-call = (f32[1,2,3]{0,2,1}, f32[1,2,3]{1,2,0}) custom-call((f32[2,2]{0,1}, f32[42,2,3]{0,1,2}) %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={(f32[2,2]{1,0}, f32[42,2,3]{2,0,1}), f32[123,4]{1,0}}
+}
+
 )"
 },
   });
@@ -966,6 +1016,21 @@ ENTRY Sort {
   ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}) sort(keys, values), dimensions={0}
 }
 
+)"
+},
+// Sort (Key, Value, Value, Value)
+{
+"SortManyValues",
+R"(HloModule sort
+
+ENTRY Sort {
+  keys = f32[1024,16]{0,1} parameter(0)
+  values.0 = s32[1024,16]{0,1} parameter(1)
+  values.1 = u32[1024,16]{0,1} parameter(2)
+  values.2 = f32[1024,16]{0,1} parameter(3)
+  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}, u32[1024,16]{0,1}, f32[1024,16]{0,1}) sort(keys, values.0, values.1, values.2), dimensions={0}
+}
+
 )"
 },
 // Conditional
@@ -1086,6 +1151,25 @@ ENTRY CrossReplicaSumWithSubgroups {
   ROOT cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_groups={{0,1},{2,3}}, barrier="abc", to_apply=add
 }
 
+)"
+},
+// cross-replica-sum with all-reduce-id
+{
+"CrossReplicaSumAllReduce",
+R"(HloModule CRS
+
+add {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY CRS {
+  input = f32[8]{0} parameter(0)
+  crs.1 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add
+  ROOT crs.0 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add
+}
+
 )"
 },
 // all-to-all
@@ -1158,7 +1242,38 @@ ENTRY Sort {
 }
 
 )"
+    },
+// AfterAll with multiple operands
+{
+"AfterAllWithMultipleOperands",
+R"(HloModule AfterAllWithMultipleOperands
+
+ENTRY AfterAllWithMultipleOperands {
+  p0 = f32[] parameter(0)
+  token0 = token[] after-all()
+  token1 = token[] after-all()
+  ROOT after-all = token[] after-all(p0, token0, token1)
+}
+
+)"
+},
+// AddDependency
+// A dependency chain is created from 'neg' to 'exp' using tokens.
+{
+"AddDependency",
+R"(HloModule AddDependency
+
+ENTRY AddDependency {
+  p = f32[] parameter(0)
+  neg = f32[] negate(p)
+  token = token[] after-all(neg)
+  p_after_token = f32[] add-dependency(p, token)
+  exp = f32[] exponential(p_after_token)
+  ROOT sum = f32[] add(neg, exp)
 }
+
+)"
+},
 });
   // clang-format on
 }
@@ -1779,7 +1894,8 @@ ENTRY ReduceR3ToR2 {
 )";
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(original));
   ASSERT_NE(module->entry_computation(), nullptr);
-  EXPECT_THAT(module->entry_computation()->root_instruction(), op::Reduce());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Reduce()));
 }
 
 TEST_F(HloParserTest, ParseSharding) {
@@ -1839,7 +1955,7 @@ TEST(HloParserSingleOpTest, SingleOp) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1))));
 }
 
 TEST(HloParserSingleOpTest, SingleOpNoShapeProducesError) {
@@ -1867,7 +1983,7 @@ TEST(HloParserSingleOpTest, SingleOpNoNames) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1))));
 }
 
 TEST(HloParserSingleOpTest, CanonicalOp) {
@@ -1876,7 +1992,7 @@ TEST(HloParserSingleOpTest, CanonicalOp) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1))));
   EXPECT_EQ(
       computation->root_instruction()->ToString(HloPrintOptions::Canonical()),
       text);
@@ -1930,7 +2046,11 @@ TEST(HloParserSingleOpTest, SingleOpWithNested) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Fusion(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Op()
+                             .WithOpcode(HloOpcode::kFusion)
+                             .WithNumOperands(2)
+                             .WithOperand(0, m::Parameter(0))
+                             .WithOperand(1, m::Parameter(1))));
 }
 
 TEST(HloParserSingleOpTest, SingleOpWithNested_DoesNotExist) {
@@ -1974,7 +2094,7 @@ TEST(HloParserSingleOpTest, ConvolutionTrivialFeatureGroupCount) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Convolution(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Convolution(m::Parameter(0), m::Parameter(1))));
   auto* convolution =
       Cast<HloConvolutionInstruction>(computation->root_instruction());
   EXPECT_EQ(convolution->feature_group_count(), 1);
@@ -2038,8 +2158,10 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
       module->schedule().is_computation_scheduled(module->entry_computation()));
   EXPECT_THAT(
       module->schedule().sequence(module->entry_computation()).instructions(),
-      ::testing::ElementsAre(op::Parameter(), op::Broadcast(), op::Parameter(),
-                             op::Multiply(), op::Parameter(), op::Add()));
+      ::testing::ElementsAre(
+          GmockMatch(m::Parameter()), GmockMatch(m::Broadcast()),
+          GmockMatch(m::Parameter()), GmockMatch(m::Multiply()),
+          GmockMatch(m::Parameter()), GmockMatch(m::Add())));
 }
 
 TEST_F(HloParserTest, IsScheduledIsTrueDifferentOrder) {
@@ -2065,9 +2187,69 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
       module->schedule().is_computation_scheduled(module->entry_computation()));
   EXPECT_THAT(
       module->schedule().sequence(module->entry_computation()).instructions(),
-      ::testing::ElementsAre(op::Parameter(), op::Parameter(), op::Parameter(),
-                             op::Broadcast(), op::Multiply(), op::Add()));
+      ::testing::ElementsAre(
+          GmockMatch(m::Parameter()), GmockMatch(m::Parameter()),
+          GmockMatch(m::Parameter()), GmockMatch(m::Broadcast()),
+          GmockMatch(m::Multiply()), GmockMatch(m::Add())));
+}
+
+TEST_F(HloParserTest, CustomCallWrongNumberofOperandConstraints) {
+  const string original = R"(HloModule CustomCallWrongNumberofOperandConstraints
+
+ENTRY %CustomCallWrongNumberofOperandConstraints (p0: f32[42,2,3], p1: f32[123,4]) -> f32[1,2,3] {
+  %p0 = f32[42,2,3]{0,1,2} parameter(0)
+  %p1 = f32[123,4]{0,1} parameter(1)
+  ROOT %custom-call = f32[1,2,3]{0,1,2} custom-call(f32[42,2,3]{0,1,2} %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={f32[42,2,3]{0,1,2}}
+}
+
+)";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Expected 2 operand layout constraints, 1 given");
+}
+
+TEST_F(HloParserTest, CustomCallIncompatibleOperandConstraints) {
+  const string original = R"(HloModule CustomCallIncompatibleOperandConstraints
+
+ENTRY %CustomCallIncompatibleOperandConstraints (p0: f32[42,2,3], p1: f32[123,4]) -> f32[1,2,3] {
+  %p0 = f32[42,2,3]{0,1,2} parameter(0)
+  %p1 = f32[123,4]{0,1} parameter(1)
+  ROOT %custom-call = f32[1,2,3]{0,1,2} custom-call(f32[42,2,3]{0,1,2} %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={f32[42,2,3]{0,1,2}, f32[555,5]{1,0}}
+}
+
+)";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "operand 1 is not compatible with operand shape");
 }
 
+TEST_F(HloParserTest, AllowShapeWhitespace) {
+  const string text = R"(
+HloModule module
+
+ENTRY entry {
+  ROOT root = f32[ 1, 2,3, 4, 5]{0, 1, 2,3, 4 } parameter(0)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(text));
+}
+
+TEST_F(HloParserTest, ShapeMismatchInOperand) {
+  const string text = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> f32[2,2] {
+  %p = f32[2,2] parameter(0)
+  %constant.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  ROOT %add.1 = f32[2,2] add(f32[2,2] %p, f32[2,5] %constant.1)
+}
+)";
+
+  ExpectHasSubstr(ParseHloString(text).status().error_message(),
+                  "The declared operand shape f32[2,5]{1,0} is not compatible"
+                  " with the shape of the operand instruction f32[2,2]{1,0}.");
+}
+
+// custom call incompatible shape.
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 5e004ce78ac1fd6da18ab2a54d23ef27e9586cf6..51177f24f5ee702be96fc8b4530ed38a5798109f 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -113,9 +113,10 @@ void HloPassPipeline::MaybeDumpHlo(const HloModule& module,
   }
 
   const string message =
-      StrCat("after ", after_pass_name, ", before ", before_pass_name);
+      absl::StrCat("after ", after_pass_name, ", before ", before_pass_name);
   hlo_graph_dumper::MaybeDumpHloModule(module, message);
   VLOG(3) << "HLO " << message << ":";
+  VLOG(3) << module.entry_computation_layout().ToString();
   XLA_VLOG_LINES(3, module.ToString());
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
index 09e7033ea4ed88849d2f3665d04f74f3f388b3f5..60d72b9d296d71f7bc2f1637bcbec1675513e5df 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.h
@@ -105,8 +105,6 @@ class HloPassPipeline : public HloPassInterface {
   std::vector<std::unique_ptr<HloPassInterface>> passes_;
   std::vector<std::unique_ptr<HloPassInterface>> invariant_checkers_;
   bool run_called_ = false;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(HloPassPipeline);
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc
index ee8cb12b231718e09f6ac0d05d7a6887f4c4d746..20384b9da6be4bab447b474f0e2240bcb277a620 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline_test.cc
@@ -19,14 +19,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
-class HloPassPipelineTest : public HloVerifiedTestBase {
+class HloPassPipelineTest : public HloTestBase {
  protected:
   StatusOr<HloModuleGroup> ParseModuleGroup(
       absl::Span<const string> hlo_strings) {
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
index dcc22793015147aaf3229875078b2989e4ef7559..5eb707a957e49d86cdb2f72b72ce750bf29b8fd2 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.cc
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
 
 namespace xla {
@@ -25,6 +26,11 @@ string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
 
   string result;
 
+  for (const auto& item : hlo_profile_printer_data.extra_metrics()) {
+    absl::StrAppend(&result, "Extra metric ", item.first, ": ",
+                    counters[item.second], "\n");
+  }
+
   for (const HloComputationInfo& computation_info :
        hlo_profile_printer_data.computation_infos()) {
     const auto& instruction_infos = computation_info.instruction_infos();
@@ -41,8 +47,9 @@ string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
     // Once we start using this in AOT for real, we will probably need a more
     // minimal version of HumanReadableProfileBuilder.
     HumanReadableProfileBuilder builder(
-        computation_info.name(), counters[computation_info.profile_index()],
-        clock_rate_ghz);
+        computation_info.name(),
+        hlo_profile_printer_data.entry_computation() == computation_info.name(),
+        counters[computation_info.profile_index()], clock_rate_ghz);
 
     for (const auto& instruction_info : instruction_infos) {
       builder.AddOp(
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto b/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto
index 9f22b733fe1d676b177039a9d7a3064b8638d7bc..ee66c86ffcb4fb74a24033e05f588a2f4d27dfe4 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer_data.proto
@@ -57,4 +57,10 @@ message HloProfilePrinterData {
 
   // The size of the profile counters array we will pretty-print.
   int64 profile_counters_size = 2;
+
+  // Maps extra metric name to the index into the profile counters array.
+  map<string, int64> extra_metrics = 3;
+
+  // Name of the entry computation.
+  string entry_computation = 4;
 }
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.cc b/tensorflow/compiler/xla/service/hlo_proto_util.cc
index b9c0b0c4ee1957fce48641230cef6391bcc9180e..981d06ce101644ecce587c4bd2f7a12c8edf6548 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
 
 #include <string>
 
@@ -36,35 +37,47 @@ HloProto MakeHloProto(const HloModule& module) {
   return proto;
 }
 
-StatusOr<std::vector<const Shape*>> EntryComputationParameterShapes(
+StatusOr<std::unique_ptr<HloModule>> CreateModuleFromProto(
+    const HloModuleProto& proto, const HloModuleConfig& module_config) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(proto, module_config));
+  TF_RETURN_IF_ERROR(
+      HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false)
+          .Run(module.get())
+          .status());
+  return std::move(module);
+}
+
+StatusOr<std::vector<const ShapeProto*>> EntryComputationParameterShapes(
     const HloProto& hlo_proto) {
   if (!hlo_proto.has_hlo_module()) {
     return NotFound("HloProto missing HloModuleProto.");
   }
-  if (!hlo_proto.hlo_module().has_program_shape()) {
+  if (!hlo_proto.hlo_module().has_host_program_shape()) {
     return NotFound("HloProto missing program shape.");
   }
 
-  std::vector<const Shape*> parameter_shapes;
-  const auto& program_shape = hlo_proto.hlo_module().program_shape();
-  for (const Shape& shape : program_shape.parameters()) {
+  std::vector<const ShapeProto*> parameter_shapes;
+  const auto& program_shape = hlo_proto.hlo_module().host_program_shape();
+  for (const ShapeProto& shape : program_shape.parameters()) {
     parameter_shapes.push_back(&shape);
   }
   return parameter_shapes;
 }
 
-StatusOr<const Shape*> EntryComputationOutputShape(const HloProto& hlo_proto) {
+StatusOr<const ShapeProto*> EntryComputationOutputShape(
+    const HloProto& hlo_proto) {
   if (!hlo_proto.has_hlo_module()) {
     return NotFound("HloProto missing HloModuleProto.");
   }
-  if (!hlo_proto.hlo_module().has_program_shape()) {
+  if (!hlo_proto.hlo_module().has_host_program_shape()) {
     return NotFound("HloProto missing program shape.");
   }
-  if (!hlo_proto.hlo_module().program_shape().has_result()) {
+  if (!hlo_proto.hlo_module().host_program_shape().has_result()) {
     return NotFound("HloProto missing result in its program shape");
   }
 
-  return &hlo_proto.hlo_module().program_shape().result();
+  return &hlo_proto.hlo_module().host_program_shape().result();
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.h b/tensorflow/compiler/xla/service/hlo_proto_util.h
index 3d9c375cd5d26f92cf8316f78789daf4fc08c927..31ea2aaffd9cdb76d21edbd0d4a03aa5f865f4f0 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.h
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.h
@@ -35,14 +35,21 @@ HloProto MakeHloProto(const HloModule& module,
 // will not be included in the output.
 HloProto MakeHloProto(const HloModule& module);
 
+// Create an HLO state from serialized representation. In addition to
+// creating the proto with HloModule::CreateFromProto(...) it also
+// uses HloVerifier to ensure basic invariants are held.
+StatusOr<std::unique_ptr<HloModule>> CreateModuleFromProto(
+    const HloModuleProto& proto, const HloModuleConfig& module_config);
+
 // Returns the shapes of the parameters of the entry computation. Shape pointers
 // refer to shapes inside of the given HloProto.
-StatusOr<std::vector<const Shape*>> EntryComputationParameterShapes(
+StatusOr<std::vector<const ShapeProto*>> EntryComputationParameterShapes(
     const HloProto& hlo_proto);
 
 // Returns the shape of the output of the entry computation. The shape pointer
 // refers to the output shape inside of the given HloProto.
-StatusOr<const Shape*> EntryComputationOutputShape(const HloProto& hlo_proto);
+StatusOr<const ShapeProto*> EntryComputationOutputShape(
+    const HloProto& hlo_proto);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc
index 2a07b6fcbc243d955e136ccdf097c8155a115845..f968a4a94453f678f5c17e0b8d1df4aea70c93ea 100644
--- a/tensorflow/compiler/xla/service/hlo_query.cc
+++ b/tensorflow/compiler/xla/service/hlo_query.cc
@@ -24,7 +24,7 @@ namespace hlo_query {
 
 bool IsConstantR0F32(HloInstruction* instruction, float* out) {
   if (instruction->opcode() == HloOpcode::kConstant &&
-      ShapeUtil::IsScalarF32(instruction->shape())) {
+      ShapeUtil::IsScalarWithElementType(instruction->shape(), F32)) {
     *out = instruction->literal().Get<float>({});
     return true;
   }
@@ -104,5 +104,20 @@ bool IsScalarConstant(const HloInstruction* instruction) {
   return instruction->IsConstant() && ShapeUtil::IsScalar(instruction->shape());
 }
 
+bool ContainsInstrWithOpcode(const HloComputation* comp,
+                             const absl::flat_hash_set<HloOpcode>& opcodes) {
+  for (const auto* instr : comp->instructions()) {
+    if (opcodes.count(instr->opcode())) {
+      return true;
+    }
+    for (const HloComputation* subcomp : instr->called_computations()) {
+      if (ContainsInstrWithOpcode(subcomp, opcodes)) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 }  // namespace hlo_query
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_query.h b/tensorflow/compiler/xla/service/hlo_query.h
index c0826a6aee1f693484207a86ec258c6604d92318..215051f8834fc94eb9e32b508f34b13626ac9349 100644
--- a/tensorflow/compiler/xla/service/hlo_query.h
+++ b/tensorflow/compiler/xla/service/hlo_query.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_QUERY_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_QUERY_H_
 
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
 namespace xla {
@@ -41,6 +43,12 @@ bool AllOperandsAreConstants(const HloInstruction& instruction);
 // Returns whether the instruction is a scalar constant.
 bool IsScalarConstant(const HloInstruction* instruction);
 
+// Determines whether the given computation contains an instruction with one of
+// the given opcodes.  Checks both comp's instructions and the instructions of
+// any computations nested within it.
+bool ContainsInstrWithOpcode(const HloComputation* comp,
+                             const absl::flat_hash_set<HloOpcode>& opcodes);
+
 // Returns an operand of an instruction with the given opcode. If there are
 // multiple matching operands, then the first matching operand is returned. If
 // there are no matching operands then nullptr is returned.
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc
index 961930f0a888e90f86e4354fa1373a303af8ec2f..4aa8067752481ffab29e1a573ffa49d4aa046f1f 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.cc
+++ b/tensorflow/compiler/xla/service/hlo_reachability.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <queue>
+
 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
 
 namespace xla {
@@ -22,7 +24,7 @@ HloReachabilityMap::HloReachabilityMap(
     : size_(instructions.size()) {
   bit_vectors_.reserve(size_);
   for (const HloInstruction* hlo : instructions) {
-    indices_[hlo] = bit_vectors_.size();
+    indices_[GetKey(hlo)] = bit_vectors_.size();
     bit_vectors_.emplace_back(size_);
   }
   CHECK_EQ(size_, indices_.size());  // instructions should be unique
@@ -71,4 +73,70 @@ bool HloReachabilityMap::IsConnected(const HloInstruction* a,
   return IsReachable(a, b) || IsReachable(b, a);
 }
 
+std::unique_ptr<HloReachabilityMap> HloReachabilityMap::Build(
+    const HloComputation* computation) {
+  const auto& all = computation->MakeInstructionPostOrder();
+  auto result = absl::make_unique<HloReachabilityMap>(all);
+  auto channel_dependency_map = computation->ComputeChannelDependencies();
+
+  std::vector<HloInstruction*> inputs;
+  for (const HloInstruction* hlo : all) {
+    inputs.assign(hlo->operands().begin(), hlo->operands().end());
+    inputs.insert(inputs.end(), hlo->control_predecessors().begin(),
+                  hlo->control_predecessors().end());
+
+    switch (hlo->opcode()) {
+      case HloOpcode::kRecvDone: {
+        auto it = channel_dependency_map.find(hlo->channel_id());
+        if (it != channel_dependency_map.end()) {
+          absl::c_copy(it->second, std::back_inserter(inputs));
+        }
+        break;
+      }
+      case HloOpcode::kCrossReplicaSum: {
+        auto all_reduce_id = hlo->all_reduce_id();
+        if (all_reduce_id) {
+          auto it = channel_dependency_map.find(all_reduce_id.value());
+          if (it != channel_dependency_map.end()) {
+            absl::c_copy(it->second, std::back_inserter(inputs));
+          }
+        }
+        break;
+      }
+      default:
+        break;
+    }
+
+    result->FastSetReachabilityToUnion(inputs, hlo);
+  }
+  return result;
+}
+
+void HloReachabilityMap::UpdateReachabilityThroughInstruction(
+    const HloInstruction* instruction) {
+  std::queue<const HloInstruction*> worklist;
+  worklist.push(instruction);
+
+  std::vector<HloInstruction*> inputs;
+
+  while (!worklist.empty()) {
+    const HloInstruction* item = worklist.front();
+    worklist.pop();
+
+    inputs.assign(item->operands().begin(), item->operands().end());
+    inputs.insert(inputs.end(), item->control_predecessors().begin(),
+                  item->control_predecessors().end());
+
+    if (SetReachabilityToUnion(inputs, item)) {
+      // Add immediate successors to worklist.
+      for (const HloInstruction* user : item->users()) {
+        worklist.push(user);
+      }
+      for (const HloInstruction* succ : item->control_successors()) {
+        worklist.push(succ);
+      }
+    }
+  }
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h
index 5a5f01f8fd647c74217c80ce4a7633b8957e335f..7823b06a41b3052f6f50f7ffa358de5b23ba679f 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.h
+++ b/tensorflow/compiler/xla/service/hlo_reachability.h
@@ -16,27 +16,30 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_REACHABILITY_H_
 
+#include <cstdio>
 #include <list>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
-class HloInstruction;
-
 // A class for representing reachability between HloInstructions.
 //
-// !!! THIS CLASS DOES NOT COMPUTE REACHABILITY !!! It has an adjacency matrix
-// and it is up to the user of the class to set the adjacency matrix such that
-// it represents reachability, i.e. such that it is transitive. That the graph
-// be transitive is thus not an invariant of this class, but it is required for
-// the name of the class and its methods to make sense.
+// It has an adjacency matrix and it is up to the user of the class to set the
+// adjacency matrix such that it represents reachability, i.e. such that it is
+// transitive. That the graph be transitive is thus not an invariant of this
+// class, but it is required for the name of the class and its methods to make
+// sense.
 class HloReachabilityMap {
  public:
   // Sets up a graph with no edges and where the nodes correspond to the given
@@ -44,6 +47,15 @@ class HloReachabilityMap {
   explicit HloReachabilityMap(
       absl::Span<const HloInstruction* const> instructions);
 
+  // Computes and returns the reachability between HLO instructions in the
+  // computation. The returned HloReachabilityMap is constructed such that
+  // HloReachabilityMap::IsReachable(a, b) returns true iff there exists a
+  // directed path (from producer to consumer) from 'a' to 'b'. Both data
+  // dependencies (operands) and control dependencies are considered for
+  // reachability. Trivially an instruction is reachable from itself.
+  static std::unique_ptr<HloReachabilityMap> Build(
+      const HloComputation* computation);
+
   // Set the reachability set of 'instruction' to the union of the reachability
   // sets of 'inputs'. Upon return, IsReachable(x, instruction) where
   // 'x' is not 'instruction' will return true iff IsReachable(x, input) is true
@@ -70,6 +82,10 @@ class HloReachabilityMap {
   // adjacency matrix.
   void SetReachable(const HloInstruction* a, const HloInstruction* b);
 
+  // Updates the given reachability map after the immediate predecessor set
+  // (operands and control predecessors) of 'instruction' has changed.
+  void UpdateReachabilityThroughInstruction(const HloInstruction* instruction);
+
   // Returns true if "b" is reachable from "a"
   //
   // Note that this function only correctly answers queries about reachability
@@ -82,6 +98,11 @@ class HloReachabilityMap {
   // if the set of edges that have been provided to this class are transitive.
   bool IsConnected(const HloInstruction* a, const HloInstruction* b) const;
 
+  // Checks if an instruction is in the Reachability map.
+  bool IsPresent(const HloInstruction* a) const {
+    return indices_.contains(GetKey(a));
+  }
+
  private:
   // A bit-vector implementation specialized for this use case which provides a
   // fast bitwise OR operation not available in tensorflow::gtl::BitMap.
@@ -143,18 +164,24 @@ class HloReachabilityMap {
       absl::Span<const HloInstruction* const> inputs,
       const HloInstruction* instruction, BitVector* bit_vector);
 
+  uint64 GetKey(const HloInstruction* instruction) const {
+    uint64 unique_id = absl::bit_cast<uint32>(instruction->unique_id());
+    uint64 module_id =
+        absl::bit_cast<uint32>(instruction->parent()->parent()->unique_id());
+    return (module_id << 32) | unique_id;
+  }
   // Return the index of the given instruction. The value is used to index into
   // the vector of BitVectors and the BitVectors themselves.
   int GetIndex(const HloInstruction* instruction) const {
-    return FindOrDie(indices_, instruction);
+    return FindOrDie(indices_, GetKey(instruction));
   }
 
   // The number of instructions in the reachability map.
   const size_t size_;
 
-  // Dense assignment from HloInstruction* to number. These numbers index
-  // into the bit_vectors_ vector and into the bits within a BitVector.
-  absl::flat_hash_map<const HloInstruction*, int> indices_;
+  // Dense assignment from HloInstruction::unique_id to number. These numbers
+  // index into the bit_vectors_ vector and into the bits within a BitVector.
+  absl::flat_hash_map<uint64, int> indices_;
 
   // Bitvectors holding the reachability to each instruction. The bit vector for
   // instruction X includes ones for each instruction which X is reachable from.
diff --git a/tensorflow/compiler/xla/service/hlo_reachability_test.cc b/tensorflow/compiler/xla/service/hlo_reachability_test.cc
index d9848cee0bfa904a90aea4626c3ee62c2cbb45b6..595176709806d54fc7c7c5ea301654717096b2d6 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_reachability_test.cc
@@ -20,13 +20,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace xla {
 
 namespace {
 
-class HloReachabilityTest : public HloVerifiedTestBase {};
+class HloReachabilityTest : public HloTestBase {};
 
 TEST_F(HloReachabilityTest, Reachability) {
   // Construct and test a reachability graph of the following form:
@@ -48,7 +48,8 @@ TEST_F(HloReachabilityTest, Reachability) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
   auto e = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
-  builder.Build();
+  auto module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
 
   HloReachabilityMap reachability({a, b, c, d, e});
   reachability.SetReachable(a, a);
@@ -81,6 +82,130 @@ TEST_F(HloReachabilityTest, Reachability) {
   EXPECT_FALSE(reachability.SetReachabilityToUnion({b, c}, d));
 }
 
+TEST_F(HloReachabilityTest, NonTrivialReachability) {
+  // Test reachability of a non-trivial computation:
+  //
+  // const1    const2
+  //    |         |
+  //    | +-------+
+  //    | |       |
+  //    add ..   negate
+  //     |   .     |
+  //     |   .... exp
+  //     |         |
+  //     +---+   +-+---+
+  //         |   |     |
+  //       multiply   copy
+  //
+  // There is a control dependency from 'add' to 'exp'.
+  Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0f)));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      r0f32, HloOpcode::kAdd, constant1, constant2));
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kNegate, constant2));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kExp, negate));
+  auto mul = builder.AddInstruction(
+      HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, add, exp));
+  auto copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(r0f32, HloOpcode::kCopy, exp));
+
+  auto module = CreateNewVerifiedModule();
+  auto computation =
+      module->AddEntryComputation(builder.Build(/*root_instruction=*/mul));
+
+  TF_CHECK_OK(add->AddControlDependencyTo(exp));
+  auto reachability = HloReachabilityMap::Build(computation);
+
+  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
+  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant1, add));
+  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
+  EXPECT_TRUE(reachability->IsReachable(constant1, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
+  EXPECT_TRUE(reachability->IsReachable(constant1, copy));
+
+  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
+  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant2, add));
+  EXPECT_TRUE(reachability->IsReachable(constant2, negate));
+  EXPECT_TRUE(reachability->IsReachable(constant2, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
+  EXPECT_TRUE(reachability->IsReachable(constant2, copy));
+
+  EXPECT_FALSE(reachability->IsReachable(exp, constant1));
+  EXPECT_FALSE(reachability->IsReachable(exp, constant2));
+  EXPECT_FALSE(reachability->IsReachable(exp, add));
+  EXPECT_FALSE(reachability->IsReachable(exp, negate));
+  EXPECT_TRUE(reachability->IsReachable(exp, exp));
+  EXPECT_TRUE(reachability->IsReachable(exp, mul));
+  EXPECT_TRUE(reachability->IsReachable(exp, copy));
+
+  EXPECT_FALSE(reachability->IsReachable(mul, constant1));
+  EXPECT_FALSE(reachability->IsReachable(mul, constant2));
+  EXPECT_FALSE(reachability->IsReachable(mul, add));
+  EXPECT_FALSE(reachability->IsReachable(mul, negate));
+  EXPECT_FALSE(reachability->IsReachable(mul, exp));
+  EXPECT_TRUE(reachability->IsReachable(mul, mul));
+  EXPECT_FALSE(reachability->IsReachable(mul, copy));
+
+  EXPECT_TRUE(reachability->IsConnected(constant1, copy));
+  EXPECT_TRUE(reachability->IsConnected(copy, constant1));
+  EXPECT_FALSE(reachability->IsConnected(negate, add));
+  EXPECT_FALSE(reachability->IsConnected(add, negate));
+
+  // Remove the control dependency then update and verify the reachability map
+  ASSERT_IS_OK(add->RemoveControlDependencyTo(exp));
+  reachability->UpdateReachabilityThroughInstruction(exp);
+
+  EXPECT_TRUE(reachability->IsReachable(constant1, constant1));
+  EXPECT_FALSE(reachability->IsReachable(constant1, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant1, add));
+  EXPECT_FALSE(reachability->IsReachable(constant1, negate));
+  EXPECT_FALSE(reachability->IsReachable(constant1, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant1, mul));
+  EXPECT_FALSE(reachability->IsReachable(constant1, copy));
+
+  // Change a use within the graph then update and verify the reachability map
+  ASSERT_IS_OK(constant2->ReplaceUseWith(negate, constant1));
+  reachability->UpdateReachabilityThroughInstruction(negate);
+
+  EXPECT_FALSE(reachability->IsReachable(constant2, constant1));
+  EXPECT_TRUE(reachability->IsReachable(constant2, constant2));
+  EXPECT_TRUE(reachability->IsReachable(constant2, add));
+  EXPECT_FALSE(reachability->IsReachable(constant2, negate));
+  EXPECT_FALSE(reachability->IsReachable(constant2, exp));
+  EXPECT_TRUE(reachability->IsReachable(constant2, mul));
+  EXPECT_FALSE(reachability->IsReachable(constant2, copy));
+}
+
+TEST_F(HloReachabilityTest, ChannelReachability) {
+  const Shape shape = ShapeUtil::MakeShape(F32, {5, 7});
+  HloComputation::Builder builder("ChannelReachability");
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto token0 = builder.AddInstruction(HloInstruction::CreateToken());
+  auto send =
+      builder.AddInstruction(HloInstruction::CreateSend(param, token0, 1));
+  auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
+  auto token1 = builder.AddInstruction(HloInstruction::CreateToken());
+  auto recv =
+      builder.AddInstruction(HloInstruction::CreateRecv(shape, token1, 1));
+  auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
+
+  auto module = CreateNewVerifiedModule();
+  auto computation = module->AddEntryComputation(builder.Build(recv_done));
+  auto reachability = HloReachabilityMap::Build(computation);
+  EXPECT_TRUE(reachability->IsReachable(param, recv_done));
+  EXPECT_FALSE(reachability->IsReachable(send, recv));
+  EXPECT_FALSE(reachability->IsReachable(send_done, recv));
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 5ac43808ee2945eaa5003baad24d5d331419db83..48add75523f02005c70bc6baf69a6b7d5aa4f7ef 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -130,10 +130,10 @@ using ItemList = absl::InlinedVector<Item*, 3>;
 // before arbitrary elements.
 class InstructionList {
  public:
-  explicit InstructionList(const std::vector<const HloInstruction*>& order) {
+  explicit InstructionList(const HloInstructionSequence& order) {
     int64 position = 0;
     Item* last = nullptr;
-    for (const HloInstruction* inst : order) {
+    for (HloInstruction* inst : order.instructions()) {
       // Add a new item to the linked list.
       Item* item = new Item;
       item->next = nullptr;
@@ -151,7 +151,7 @@ class InstructionList {
       // to be monotonically increasing through the list, and so is still useful
       // for quickly(-ish) determining the order of arbitrary instructions in
       // the list.
-      item->instruction = const_cast<HloInstruction*>(inst);
+      item->instruction = inst;
       item->position = position;
       position++;
 
@@ -927,7 +927,7 @@ Item* PickRematerializationCandidate(
 
 StatusOr<int64> HloRematerialization::ComputePeakMemory(
     const HloComputation* computation,
-    const std::vector<const HloInstruction*>& order) const {
+    const HloInstructionSequence& order) const {
   InstructionList instruction_list(order);
   MemoryUsageTracker tracker(computation, size_function_, *points_to_analysis_,
                              instruction_list);
@@ -971,8 +971,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
           << HumanReadableNumBytes(computation_peak_memory_.at(computation));
   CHECK(!ContainsKey(rematerialized_computations_, computation));
 
-  InstructionList instruction_list(
-      schedule->sequence(computation).instructions());
+  InstructionList instruction_list(schedule->sequence(computation));
   MemoryUsageTracker memory_tracker(computation, size_function_,
                                     *points_to_analysis_, instruction_list);
   bool changed = false;
@@ -1184,7 +1183,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   sequence.clear();
   for (auto* item = instruction_list.first(); item != nullptr;
        item = instruction_list.next(item)) {
-    const HloInstruction* instruction = item->instruction;
+    HloInstruction* instruction = item->instruction;
     sequence.push_back(instruction);
   }
   rematerialized_computations_.insert(computation);
@@ -1215,7 +1214,7 @@ StatusOr<bool> HloRematerialization::Run(HloModule* module) {
   // by the caller.
   int64 module_output_size = 0;
   ShapeUtil::ForEachSubshape(
-      module->entry_computation()->root_instruction()->shape(),
+      module->result_shape(),
       [&module_output_size, this](const Shape& subshape,
                                   const ShapeIndex& /*index*/) {
         module_output_size += size_function_(subshape);
@@ -1235,10 +1234,8 @@ StatusOr<bool> HloRematerialization::Run(HloModule* module) {
         if (node.context() == CallContext::kSequential) {
           TF_ASSIGN_OR_RETURN(
               computation_peak_memory_[node.computation()],
-              ComputePeakMemory(node.computation(),
-                                module->schedule()
-                                    .sequence(node.computation())
-                                    .instructions()));
+              ComputePeakMemory(node.computation(), module->schedule().sequence(
+                                                        node.computation())));
         }
         return Status::OK();
       },
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 70d83c04f07ca7fd0139f586869e8fe688f958f4..a07d348041b72bba45c6fd1f726f2a0065d01e53 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -87,9 +87,8 @@ class HloRematerialization : public HloModulePass {
   // peak memory is the maximum total size of all live HLO instruction values at
   // any program point. 'order' is the order in which the HLO instructions will
   // be emitted which is used to determine lifespans of HLO values.
-  StatusOr<int64> ComputePeakMemory(
-      const HloComputation* computation,
-      const std::vector<const HloInstruction*>& order) const;
+  StatusOr<int64> ComputePeakMemory(const HloComputation* computation,
+                                    const HloInstructionSequence& order) const;
 
   // Returns the peak memory usage of the called computations for the given
   // instruction. Zero is returned if the instruction calls no computations.
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index f7e82fb1f88e856305f6f481a451d4cd64ba4acf..22c3c40a93a1ddcd36659483fcc79fede32dd2c3 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -36,7 +36,7 @@ namespace op = xla::testing::opcode_matchers;
 
 using ::testing::_;
 
-class HloRematerializationTest : public HloVerifiedTestBase {
+class HloRematerializationTest : public HloTestBase {
  protected:
   // Creates and returns a computation which can benefit from
   // rematerialization. The computation looks like:
@@ -162,7 +162,7 @@ class HloRematerializationTest : public HloVerifiedTestBase {
 // Test rematerialization of a single computation produced by
 // MakeRematerializableComputation.
 TEST_F(HloRematerializationTest, SingleComputation) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation =
       module->AddEntryComputation(MakeRematerializableComputation());
 
@@ -177,7 +177,7 @@ TEST_F(HloRematerializationTest, SingleComputation) {
   // with rematerialization so pick a memory limit between these values (14KB).
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/14 * 1024, module));
+                              /*memory_limit_bytes=*/14 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // Root should not have changed.
@@ -203,7 +203,7 @@ TEST_F(HloRematerializationTest, SingleComputation) {
 // MakeRematerializableComputation but with a sufficiently high memory limit
 // such that no instructions are rematerialized.
 TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation =
       module->AddEntryComputation(MakeRematerializableComputation());
 
@@ -211,7 +211,7 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
 
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/20 * 1024, module));
+                              /*memory_limit_bytes=*/20 * 1024, module.get()));
 
   // No instructions should have been materialized.
   EXPECT_FALSE(changed);
@@ -225,7 +225,7 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
 // computation should be the one chosen because rematerialization in the while
 // will presumably be more expensive.
 TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
   cond_builder.AddInstruction(
@@ -249,7 +249,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
   // bit lower (17KB) to force rematerialization of the entry computation.
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/17 * 1024, module));
+                              /*memory_limit_bytes=*/17 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // Only the entry computation should have a rematerialized instruction added.
@@ -261,7 +261,7 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
 // while. Both the entry computation and while body computation should have
 // computations rematerialized.
 TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
   cond_builder.AddInstruction(
@@ -282,7 +282,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
 
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/15 * 1024, module));
+                              /*memory_limit_bytes=*/15 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // Both computations should have rematerialized instructions added.
@@ -293,7 +293,7 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
 // Test rematerialization of a doubly nested computation. All computations
 // should have an instruction rematerialized.
 TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto cond_builder = HloComputation::Builder(TestName() + ".cond");
   cond_builder.AddInstruction(
@@ -321,7 +321,7 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   // ~12K so pick something slightly larger.
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/13 * 1024, module));
+                              /*memory_limit_bytes=*/13 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // All computations should have rematerialized instructions added.
@@ -346,7 +346,7 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) {
   //
   //   F32[1024] add_2 = add(rng, add(tanh, add_1))  // LIVE: add_2 + add_1 +
   //                                                 //       rng + tanh + exp
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto param = builder.AddInstruction(
@@ -390,7 +390,7 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) {
   TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
       RunHloRematerialization(
-          /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_), module));
+          /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_), module.get()));
   EXPECT_TRUE(changed);
   // The rng should not have been rematerialized.
   EXPECT_EQ(count_rngs(entry_computation), 1);
@@ -420,7 +420,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // The value %bcast is live across each call of Subcomputation (which requires
   // 8KB) though the value is not used in the calls. Rematerializing %bcast
   // across these calls reduces peak memory use from ~20KB down to ~16KB.
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation* subcomputation = nullptr;
   {
@@ -482,7 +482,7 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // rematerialization).
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/22 * 1024, module));
+                              /*memory_limit_bytes=*/22 * 1024, module.get()));
   EXPECT_TRUE(changed);
 
   // The broadcast should have been rematerialized 3 times.
@@ -533,7 +533,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   // (ie %bcast is used indirectly by %negate), otherwise the %negate operand
   // aliases %add_2.
   const bool indirectly_used = GetParam();
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation* subcomputation = nullptr;
   {
@@ -576,7 +576,7 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   // rematerialization).
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           RunHloRematerialization(
-                              /*memory_limit_bytes=*/22 * 1024, module));
+                              /*memory_limit_bytes=*/22 * 1024, module.get()));
   // Rematerialization should only occur if the rematerializable instruction has
   // no indirect uses.
   if (indirectly_used) {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index fa7f216321988137dcf9104a324f5f7789869aa5..5a9b820a9d7f58695383b21c9e2126cf98970c83 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/hlo_module_group.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -204,6 +205,40 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
       /*profile=*/profile);
 }
 
+StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
+    std::unique_ptr<Executable> executable,
+    const absl::Span<const ShapedBuffer* const> arguments,
+    ExecutionProfile* profile) {
+  // Get service run options.
+  se::Stream stream(backend().default_stream_executor());
+  stream.Init();
+  ServiceExecutableRunOptions service_run_options =
+      GetServiceRunOptionsForDevice(backend().default_device_ordinal(), &stream,
+                                    nullptr);
+
+  TF_ASSIGN_OR_RETURN(
+      ScopedShapedBuffer retval,
+      executable->ExecuteOnStreamWrapper(&service_run_options,
+                                         /*profile=*/profile, arguments));
+  TF_RETURN_IF_ERROR(stream.BlockHostUntilDone());
+  return std::move(retval);
+}
+
+StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
+    std::unique_ptr<Executable> executable,
+    const absl::Span<const ScopedShapedBuffer> arguments,
+    ExecutionProfile* profile) {
+  std::vector<const ShapedBuffer*> argument_pointers;
+  argument_pointers.reserve(arguments.size());
+  for (const auto& argument : arguments) {
+    argument_pointers.push_back(&argument);
+  }
+  return ExecuteWithDeviceBuffers(
+      /*executable=*/std::move(executable),
+      /*arguments=*/argument_pointers,
+      /*profile=*/profile);
+}
+
 StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
     std::unique_ptr<HloModule> module,
     const ReplicatedExecuteOptions& options) {
@@ -324,10 +359,13 @@ StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
 StatusOr<std::unique_ptr<Executable>> HloRunner::CreateExecutable(
     std::unique_ptr<HloModule> module, bool run_hlo_passes) {
   if (run_hlo_passes) {
+    auto module_group = absl::make_unique<HloModuleGroup>(std::move(module));
     TF_ASSIGN_OR_RETURN(
-        module, backend().compiler()->RunHloPasses(
-                    std::move(module), backend().default_stream_executor(),
-                    backend().memory_allocator()));
+        auto executables,
+        backend().compiler()->Compile(std::move(module_group),
+                                      {{backend().default_stream_executor()}},
+                                      backend().memory_allocator()));
+    return std::move(executables[0]);
   }
   return backend().compiler()->RunBackend(std::move(module),
                                           backend().default_stream_executor(),
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index 2e934bf66ae43ea412f242030b874dddb6d3722d..bb792cf8c9825ff67ca33bbcf2c3c32b1a0ecb85 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -136,6 +136,21 @@ class HloRunner {
       const absl::Span<const ScopedShapedBuffer> arguments,
       bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
 
+  StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
+      std::unique_ptr<Executable> executable,
+      const absl::Span<const ShapedBuffer* const> arguments,
+      ExecutionProfile* profile = nullptr);
+
+  StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
+      std::unique_ptr<Executable> executable,
+      const absl::Span<const ScopedShapedBuffer> arguments,
+      ExecutionProfile* profile = nullptr);
+
+  // Creates an executable object given an HLO module. If run_hlo_passes is
+  // true, the HLO passes will be run as part of compilation.
+  StatusOr<std::unique_ptr<Executable>> CreateExecutable(
+      std::unique_ptr<HloModule> module, bool run_hlo_passes);
+
   // Executes a given HLO module into a set of replicas, and returns a map
   // with the replica number as key, and the corresponding returned literal as
   // value.
@@ -152,11 +167,6 @@ class HloRunner {
   const Backend& backend() const;
 
  private:
-  // Creates an executable object given an HLO module. If run_hlo_passes is
-  // true, the HLO passes will be run before.
-  StatusOr<std::unique_ptr<Executable>> CreateExecutable(
-      std::unique_ptr<HloModule> module, bool run_hlo_passes);
-
   // Creates a ServiceExecutableRunOptions object to configure a run on device,
   // using the provided stream object. If device_assignment is not nullptr, it
   // will be used to configure the replication parameters. Replicated executions
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc
index 9972eb20774550817143cb27dd94667364cf68ec..8f6eb974c5179b420c8f961393ca923e0a3b3530 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule.cc
@@ -46,8 +46,8 @@ namespace xla {
         << "No computation exists in HLO module with id " << computation_id;
     const HloComputation* computation = comp_it->second;
 
-    absl::flat_hash_map<int64, const HloInstruction*> id_to_instruction;
-    for (const HloInstruction* instruction : computation->instructions()) {
+    absl::flat_hash_map<int64, HloInstruction*> id_to_instruction;
+    for (HloInstruction* instruction : computation->instructions()) {
       id_to_instruction[instruction->unique_id()] = instruction;
     }
 
@@ -81,9 +81,8 @@ StatusOr<HloScheduleProto> HloSchedule::ToProto() const {
   return std::move(proto);
 }
 
-void HloSchedule::set_sequence(
-    const HloComputation* computation,
-    absl::Span<const HloInstruction* const> sequence) {
+void HloSchedule::set_sequence(const HloComputation* computation,
+                               absl::Span<HloInstruction* const> sequence) {
   set_sequence(computation, HloInstructionSequence(sequence));
 }
 
@@ -114,8 +113,8 @@ Status HloSchedule::UpdateComputationSchedule(
     const HloComputation* computation) {
   // Map from unique ID to HloInstruction pointer for instructions in the
   // computation.
-  absl::flat_hash_map<int, const HloInstruction*> id_to_instruction;
-  for (const HloInstruction* instruction : computation->instructions()) {
+  absl::flat_hash_map<int, HloInstruction*> id_to_instruction;
+  for (HloInstruction* instruction : computation->instructions()) {
     InsertOrDie(&id_to_instruction, instruction->unique_id(), instruction);
   }
 
@@ -128,7 +127,7 @@ Status HloSchedule::UpdateComputationSchedule(
   // Map from HloInstruction X to newly added instructions (instruction is in
   // computation, but not in schedule) which use X. If an instruction is not in
   // the map, then it has no users which are newly added instructions.
-  absl::flat_hash_map<const HloInstruction*, std::vector<const HloInstruction*>>
+  absl::flat_hash_map<const HloInstruction*, std::vector<HloInstruction*>>
       new_instruction_uses;
 
   // For each newly added instruction, this is the count of the instruction's
@@ -138,9 +137,9 @@ Status HloSchedule::UpdateComputationSchedule(
 
   // Create a worklist of newly added instructions which are ready to be added
   // to the schedule. Initialize worklist with those that have zero operands.
-  std::queue<const HloInstruction*> worklist;
+  std::queue<HloInstruction*> worklist;
 
-  for (const HloInstruction* instruction : computation->instructions()) {
+  for (HloInstruction* instruction : computation->instructions()) {
     if (ids_in_schedule.count(instruction->unique_id()) == 0) {
       // This is a newly added instruction which is not in the schedule.
       if (instruction->operands().empty()) {
@@ -161,17 +160,17 @@ Status HloSchedule::UpdateComputationSchedule(
   // Lambda which schedules all instructions on the worklist.
   auto schedule_worklist = [&]() {
     while (!worklist.empty()) {
-      const HloInstruction* instruction = worklist.front();
+      HloInstruction* instruction = worklist.front();
       worklist.pop();
       new_sequence.push_back(instruction);
-      std::vector<const HloInstruction*>* new_users =
+      std::vector<HloInstruction*>* new_users =
           tensorflow::gtl::FindOrNull(new_instruction_uses, instruction);
       if (new_users != nullptr) {
         // This just-scheduled instruction has users which are newly added to
         // the module. Update the number of unscheduled operands and push the
         // newly added instruction to the worklist if it is ready to
         // schedule.
-        for (const HloInstruction* new_user : *new_users) {
+        for (HloInstruction* new_user : *new_users) {
           unscheduled_operand_count.at(new_user)--;
           CHECK_GE(unscheduled_operand_count.at(new_user), 0);
           if (unscheduled_operand_count.at(new_user) == 0) {
@@ -235,7 +234,6 @@ Status HloSchedule::Update() {
 
 Status HloSchedule::Verify() const {
   VLOG(2) << "VerifySchedule()";
-  XLA_VLOG_LINES(3, module_->ToString());
   XLA_VLOG_LINES(2, ToString());
 
   // Verify schedule contains exactly the same set of non-fusion computations as
@@ -265,7 +263,10 @@ Status HloSchedule::Verify() const {
     }
 
     TF_RET_CHECK(instruction_position.size() ==
-                 computation->instruction_count());
+                 computation->instruction_count())
+        << "Schedule for computation " << computation->name() << " has "
+        << instruction_position.size() << " instructions, expected "
+        << computation->instruction_count();
     for (const HloInstruction* instruction : computation->instructions()) {
       TF_RET_CHECK(instruction_position.count(instruction) == 1)
           << "Instruction " << instruction->name() << " is not in schedule";
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.h b/tensorflow/compiler/xla/service/hlo_schedule.h
index 0a714101ee587aa847fa674bbde5586287c51f33..486ddbf499de80c634bc497158cd79ca066cc866 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/hlo_schedule.h
@@ -35,14 +35,14 @@ class HloInstructionSequence {
  public:
   HloInstructionSequence() = default;
   explicit HloInstructionSequence(
-      absl::Span<const HloInstruction* const> instructions) {
-    for (const HloInstruction* instruction : instructions) {
+      absl::Span<HloInstruction* const> instructions) {
+    for (HloInstruction* instruction : instructions) {
       push_back(instruction);
     }
   }
 
   // Adds the instruction to the end of the sequence.
-  void push_back(const HloInstruction* instruction) {
+  void push_back(HloInstruction* instruction) {
     instruction_sequence_.push_back(instruction);
     id_sequence_.push_back(instruction->unique_id());
   }
@@ -56,7 +56,7 @@ class HloInstructionSequence {
   int64 size() const { return instruction_sequence_.size(); }
 
   // Returns the sequence of HLO instructions.
-  const std::vector<const HloInstruction*>& instructions() const {
+  const std::vector<HloInstruction*>& instructions() const {
     return instruction_sequence_;
   }
 
@@ -65,7 +65,7 @@ class HloInstructionSequence {
 
  private:
   // The sequence as HloInstructions.
-  std::vector<const HloInstruction*> instruction_sequence_;
+  std::vector<HloInstruction*> instruction_sequence_;
 
   // The sequence of HLO instructions, represented by their unique IDs. The
   // sequence is stored as both HloInstructions and unique IDs because the
@@ -98,7 +98,7 @@ class HloSchedule {
 
   // Sets the sequence for the given computation to the given sequence.
   void set_sequence(const HloComputation* computation,
-                    absl::Span<const HloInstruction* const> sequence);
+                    absl::Span<HloInstruction* const> sequence);
   void set_sequence(const HloComputation* computation,
                     HloInstructionSequence sequence);
 
diff --git a/tensorflow/compiler/xla/service/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/hlo_schedule_test.cc
index 1424569ac1f62e4b965876141f1eb40be4f15bea..0e56e6f760e35ddcb45c6f58771d78405a09acfe 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule_test.cc
@@ -56,10 +56,10 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
-  const std::vector<const HloInstruction*>& entry_schedule =
+  const auto& entry_schedule =
       schedule.sequence(module->entry_computation()).instructions();
 
   EXPECT_EQ(entry_schedule.size(), 6);
@@ -90,7 +90,7 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
 
@@ -139,7 +139,7 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
 
@@ -183,7 +183,7 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
 
@@ -244,7 +244,7 @@ ENTRY %WhileLoop () -> s32[] {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape(),
                                      /*pointer_size=*/sizeof(void*));
       }));
@@ -313,7 +313,7 @@ ENTRY %WhileLoop () -> s32[] {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape(),
                                      /*pointer_size=*/sizeof(void*));
       }));
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 188f4acc7945f3ec98065eae5a87a41c39730432..70a860c356ca2fb1c4c973ea3d96c50fabc2c7c2 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -469,6 +469,9 @@ absl::optional<HloSharding> HloSharding::ExtractSingleSharding() const {
   if (!IsTuple()) {
     return *this;
   }
+  if (tuple_elements_.empty()) {
+    return absl::nullopt;
+  }
   for (int64 i = 1; i < tuple_elements_.size(); ++i) {
     if (tuple_elements_[0] != tuple_elements_[i]) {
       return absl::nullopt;
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
index e3f4a9852ace86c20610362aa6ad3c3d9c78de30..f5061304456e04ab40448861343ef201c9450dcf 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
@@ -169,14 +169,14 @@ Status ApplyDomainSingleSharding(const DomainMetadata::Domain& domain,
 // If user is a tuple instruction, return the tuple subsharding corresponding to
 // the operand matching the instruction argument, because that is the
 // subsharding corresponding to instruction.
-ShapeTree<HloSharding> GetShardingTreeFromUser(
+StatusOr<ShapeTree<HloSharding>> GetShardingTreeFromUser(
     const HloInstruction& instruction, const HloInstruction& user) {
   if (user.opcode() == HloOpcode::kTuple) {
     return user.sharding()
         .GetSubSharding(user.shape(), {user.operand_index(&instruction)})
-        .GetAsShapeTree(instruction.shape());
+        .AsShapeTree(instruction.shape());
   }
-  return user.sharding().GetAsShapeTree(user.shape());
+  return user.sharding().AsShapeTree(user.shape());
 }
 
 // Assign rhs to lhs. If rhs is unassigned (assigned to kUnassignedDevice)
@@ -253,7 +253,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
       instruction->shape(), HloSharding::AssignDevice(kUnassignedDevice));
   for (HloInstruction* user : instruction->users()) {
     if (user->opcode() == HloOpcode::kDomain &&
-        domain.exit_domains.count(const_cast<HloInstruction*>(user)) > 0) {
+        domain.exit_domains.count(user) > 0) {
       // If a user is a domain and it is registered in the domain exits, then
       // the instruction sharding is taken directly from the domain, and no
       // further users need to be visited.
@@ -264,8 +264,8 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
       continue;
     }
     AssignmentKind sub_assigned = AssignmentKind::kUnassigned;
-    ShapeTree<HloSharding> user_sharding_tree =
-        GetShardingTreeFromUser(*instruction, *user);
+    TF_ASSIGN_OR_RETURN(ShapeTree<HloSharding> user_sharding_tree,
+                        GetShardingTreeFromUser(*instruction, *user));
     if (ShapeUtil::IsTuple(instruction->shape())) {
       // For tuple-shaped instructions collect individual tuple subshardings
       // from the uses, and then combine them into the tuple sharding.
diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
index 45c684d66752862eec301b8943d350804f070309..c1073911ea9dc3811c195e27bcbae9b00929ad17 100644
--- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
@@ -66,7 +66,7 @@ class HloSubcomputationUnificationTest : public HloTestBase {
 };
 
 TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
@@ -103,7 +103,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
 }
 
 TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
@@ -143,7 +143,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
 
 // Do not unify subcomputations with different parameter shapes.
 TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
@@ -184,7 +184,7 @@ TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) {
 // Regression test for b/31466798. Checks that entry_computation is still valid
 // after unification.
 TEST_F(HloSubcomputationUnificationTest, TwoIdenticalComputations) {
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   for (int i = 0; i < 2; ++i) {
     HloComputation::Builder builder("pow");
     auto x =
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
index 6fd734a2b9e6c8c9fca76a944ca3df4c3b8a212f..1e2b31a1f2bb4865faafc3d14e2b194e3aa171a1 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 
@@ -24,7 +24,7 @@ namespace {
 
 using ::tensorflow::GraphDef;
 
-class HloTfGraphBuilderTest : public HloVerifiedTestBase {
+class HloTfGraphBuilderTest : public HloTestBase {
  protected:
   HloTfGraphBuilderTest() {}
   HloTfGraphBuilder generator_;
diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h
index b6670d409b92e8be42f5cdb40fba8d662ae83958..1f01b0bb365450a933da9cc443db5223c06903f0 100644
--- a/tensorflow/compiler/xla/service/hlo_value.h
+++ b/tensorflow/compiler/xla/service/hlo_value.h
@@ -166,9 +166,6 @@ class HloValue : public BufferValue {
 
   // Whether this value is live out of the HLO module.
   bool live_out_of_module_ = false;
-
-  // Whether this value is live out of its computation.
-  bool live_out_of_computation_ = false;
 };
 
 std::ostream& operator<<(std::ostream& out, const HloValue& hlo_value);
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index b5498bb936cb422d7a9dfa3d647266fa8b024b97..77db7b098a38ff4efdcc7447935fae61561c9ff4 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
@@ -27,6 +28,68 @@ limitations under the License.
 
 namespace xla {
 
+Status VerifyNotSparse(const Shape& shape) {
+  return ShapeUtil::ForEachSubshapeWithStatus(
+      shape, [](const Shape& subshape, const ShapeIndex&) -> Status {
+        if (LayoutUtil::IsSparseArray(subshape)) {
+          return InternalError("Sparse arrays are not yet fully supported: %s",
+                               ShapeUtil::HumanStringWithLayout(subshape));
+        }
+        return Status::OK();
+      });
+}
+
+bool IsCallerInstruction(HloInstruction* hlo) {
+  switch (hlo->opcode()) {
+    case HloOpcode::kCall:
+    case HloOpcode::kConditional:
+    case HloOpcode::kWhile:
+    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kMap:
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow:
+    case HloOpcode::kScatter:
+    case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kFusion:
+      return true;
+    default:
+      return false;
+  }
+}
+
+Status ShapeVerifier::Preprocess(HloInstruction* hlo) {
+  if (!hlo->called_computations().empty() && !IsCallerInstruction(hlo)) {
+    return InternalError(
+        "Called computations specified for non-caller instruction  %s",
+        hlo->ToString());
+  }
+  return VerifyNotSparse(hlo->shape());
+}
+
+namespace {
+
+Status CheckOperandCount(const HloInstruction* hlo, int expected) {
+  if (hlo->operand_count() != expected) {
+    return InternalError("Expected %d operands for %s instruction: %s",
+                         expected, HloOpcodeString(hlo->opcode()),
+                         hlo->ToString());
+  }
+  return Status::OK();
+}
+
+Status CheckParameterCount(const HloInstruction* calling_instruction,
+                           const HloComputation* computation, int expected) {
+  if (computation->num_parameters() != expected) {
+    return InternalError(
+        "Expected computation %s called from %s to have %d parameters, has %d",
+        computation->name(), calling_instruction->name(), expected,
+        computation->num_parameters());
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 Status ShapeVerifier::HandleElementwiseUnary(HloInstruction* hlo) {
   return CheckUnaryShape(hlo);
 }
@@ -58,12 +121,14 @@ Status ShapeVerifier::HandleConcatenate(HloInstruction* concatenate) {
 }
 
 Status ShapeVerifier::HandleConvert(HloInstruction* convert) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(convert, 1));
   return CheckShape(convert, ShapeInference::InferConvertShape(
                                  convert->operand(0)->shape(),
                                  convert->shape().element_type()));
 }
 
 Status ShapeVerifier::HandleBitcastConvert(HloInstruction* convert) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(convert, 1));
   return CheckShape(convert, ShapeInference::InferBitcastConvertShape(
                                  convert->operand(0)->shape(),
                                  convert->shape().element_type()));
@@ -74,6 +139,7 @@ Status ShapeVerifier::HandleCopy(HloInstruction* copy) {
 }
 
 Status ShapeVerifier::HandleDot(HloInstruction* dot) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(dot, 2));
   TF_ASSIGN_OR_RETURN(const Shape expected,
                       ShapeInference::InferDotOpShape(
                           dot->operand(0)->shape(), dot->operand(1)->shape(),
@@ -82,6 +148,7 @@ Status ShapeVerifier::HandleDot(HloInstruction* dot) {
 }
 
 Status ShapeVerifier::HandleConvolution(HloInstruction* convolution) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(convolution, 2));
   TF_ASSIGN_OR_RETURN(
       const Shape expected,
       ShapeInference::InferConvolveShape(
@@ -92,6 +159,7 @@ Status ShapeVerifier::HandleConvolution(HloInstruction* convolution) {
 }
 
 Status ShapeVerifier::HandleFft(HloInstruction* fft) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(fft, 1));
   TF_ASSIGN_OR_RETURN(
       const Shape expected,
       ShapeInference::InferFftShape(fft->operand(0)->shape(), fft->fft_type(),
@@ -118,11 +186,13 @@ Status ShapeVerifier::HandleAllToAll(HloInstruction* hlo) {
 }
 
 Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(hlo, 1));
   return CheckShape(hlo, ShapeInference::InferCollectivePermuteShape(
                              hlo->operand(0)->shape()));
 }
 
 Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(reduce_precision, 1));
   return CheckShape(reduce_precision, ShapeInference::InferReducePrecisionShape(
                                           reduce_precision->operand(0)->shape(),
                                           reduce_precision->exponent_bits(),
@@ -156,6 +226,7 @@ Status ShapeVerifier::CheckOperandAndParameter(
 }
 
 Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 1));
   HloInfeedInstruction* infeed = Cast<HloInfeedInstruction>(instruction);
   TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 0));
 
@@ -166,6 +237,7 @@ Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleOutfeed(HloInstruction* instruction) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 2));
   HloOutfeedInstruction* outfeed = Cast<HloOutfeedInstruction>(instruction);
   TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 1));
 
@@ -192,10 +264,7 @@ bool ShapeVerifier::HasCompatibleElementTypes(const Shape& shape_0,
 }
 
 Status ShapeVerifier::HandleRng(HloInstruction* instruction) {
-  if (instruction->operand_count() != 2) {
-    return InternalError("Expected two operands for Rng instruction: %s",
-                         instruction->ToString());
-  }
+  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 2));
 
   const Shape& shape_0 = instruction->operand(0)->shape();
   const Shape& shape_1 = instruction->operand(1)->shape();
@@ -244,29 +313,42 @@ Status ShapeVerifier::HandleRng(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleReverse(HloInstruction* reverse) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(reverse, 1));
   return CheckShape(
       reverse, ShapeInference::InferReverseShape(reverse->operand(0)->shape(),
                                                  reverse->dimensions()));
 }
 
 Status ShapeVerifier::HandleSort(HloInstruction* sort) {
-  if (sort->operand_count() == 2 &&
-      !ShapeUtil::SameDimensions(sort->operand(0)->shape(),
-                                 sort->operand(1)->shape())) {
-    return InternalError(
-        "Expected sort to have to have the same dimensions for the keys and "
-        "the values. Keys shape is: %s\n, Values shape is: %s",
-        StringifyShape(sort->operand(0)->shape()),
-        StringifyShape(sort->operand(1)->shape()));
+  if (sort->operand_count() < 1) {
+    return InternalError("Expected at least 1 operand for %s instruction: %s",
+                         HloOpcodeString(sort->opcode()), sort->ToString());
+  }
+  for (int64 operand = 1; operand < sort->operand_count(); ++operand) {
+    if (!ShapeUtil::SameDimensions(sort->operand(0)->shape(),
+                                   sort->operand(operand)->shape())) {
+      return InternalError(
+          "Expected sort to have to have the same dimensions for the keys "
+          "and the values. Keys shape is: %s\n, Values shape (operand index "
+          "%lld) is: %s",
+          StringifyShape(sort->operand(0)->shape()), operand,
+          StringifyShape(sort->operand(operand)->shape()));
+    }
   }
   return CheckVariadicShape(sort);
 }
 
 Status ShapeVerifier::HandleConstant(HloInstruction* constant) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(constant, 0));
+  if (!Cast<HloConstantInstruction>(constant)->HasLiteral()) {
+    return InternalError("Constant is required to have a valid literal: %s",
+                         constant->ToString());
+  }
   return CheckShape(constant, constant->literal().shape());
 }
 
 Status ShapeVerifier::HandleIota(HloInstruction* instruction) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 0));
   auto* iota = Cast<HloIotaInstruction>(instruction);
   const int64 rank = ShapeUtil::Rank(iota->shape());
   if (rank == 0) {
@@ -281,6 +363,7 @@ Status ShapeVerifier::HandleIota(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(get_tuple_element, 1));
   return CheckShape(get_tuple_element,
                     ShapeInference::InferGetTupleElementShape(
                         get_tuple_element->operand(0)->shape(),
@@ -288,6 +371,12 @@ Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) {
 }
 
 Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
+  if (reduce->operand_count() % 2 != 0) {
+    return InternalError(
+        "Expected an even number of operands for %s instruction: %s",
+        HloOpcodeString(reduce->opcode()), reduce->ToString());
+  }
+
   std::vector<const Shape*> operand_shapes;
   for (const HloInstruction* operand : reduce->operands()) {
     operand_shapes.push_back(&operand->shape());
@@ -298,48 +387,64 @@ Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
 }
 
 Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(bitcast, 1));
   return Status::OK();
 }
 
 Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(broadcast, 1));
   // HLO broadcast has no exact analog at the proto level so there is no
   // ShapeInference method. Check the output shape explicitly.
   const Shape& operand_shape = broadcast->operand(0)->shape();
   // Check for mixed precision.
-  TF_RETURN_IF_ERROR(CheckShape(broadcast, broadcast->shape()));
+  TF_RET_CHECK(SameElementType(broadcast->shape(), operand_shape));
   TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
                broadcast->dimensions().size());
   for (int64 operand_dimension = 0;
        operand_dimension < ShapeUtil::Rank(operand_shape);
        ++operand_dimension) {
     int64 output_dimension = broadcast->dimensions()[operand_dimension];
-    TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) ==
-                 operand_shape.dimensions(operand_dimension))
+    TF_RET_CHECK((output_dimension < ShapeUtil::Rank(broadcast->shape())) &&
+                 output_dimension >= 0 &&
+                 (broadcast->shape().dimensions(output_dimension) ==
+                  operand_shape.dimensions(operand_dimension)))
         << broadcast->ToString() << " operand shape " << operand_shape;
   }
   return Status::OK();
 }
 
 Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(reshape, 1));
   // Check for mixed precision.
-  TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape()));
+  const Shape& operand_shape = reshape->operand(0)->shape();
+  TF_RET_CHECK(SameElementType(reshape->shape(), operand_shape));
   TF_RET_CHECK(ShapeUtil::ElementsIn(reshape->shape()) ==
-               ShapeUtil::ElementsIn(reshape->operand(0)->shape()));
+               ShapeUtil::ElementsIn(operand_shape));
   return Status::OK();
 }
 
 Status ShapeVerifier::HandleTranspose(HloInstruction* transpose) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(transpose, 1));
   return CheckShape(
       transpose, ShapeInference::InferTransposeShape(
                      transpose->operand(0)->shape(), transpose->dimensions()));
 }
 
 Status ShapeVerifier::HandleParameter(HloInstruction* hlo) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(hlo, 0));
   return Status::OK();
 }
 
 Status ShapeVerifier::HandleFusion(HloInstruction* fusion) {
-  for (HloInstruction* fused_param : fusion->fused_parameters()) {
+  auto& fused_parameters = fusion->fused_parameters();
+  if (fused_parameters.size() != fusion->operand_count()) {
+    return InternalError(
+        "Fused parameter count (%d) does not match the number of operands (%d)"
+        " passed to the fusion instruction in: %s.",
+        fused_parameters.size(), fusion->operand_count(),
+        fusion->ToString().c_str());
+  }
+  for (HloInstruction* fused_param : fused_parameters) {
     int64 param_no = fused_param->parameter_number();
     if (!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape())) {
       return InternalError(
@@ -352,6 +457,8 @@ Status ShapeVerifier::HandleFusion(HloInstruction* fusion) {
 }
 
 Status ShapeVerifier::HandleCall(HloInstruction* call) {
+  TF_RETURN_IF_ERROR(
+      CheckParameterCount(call, call->to_apply(), call->operand_count()));
   for (int64 i = 0; i < call->to_apply()->num_parameters(); ++i) {
     TF_RETURN_IF_ERROR(CheckOperandAndParameter(call, i, call->to_apply(), i));
   }
@@ -359,9 +466,30 @@ Status ShapeVerifier::HandleCall(HloInstruction* call) {
   return CheckShape(call, call->to_apply()->root_instruction()->shape());
 }
 
-Status ShapeVerifier::HandleCustomCall(HloInstruction*) { return Status::OK(); }
+Status ShapeVerifier::HandleCustomCall(HloInstruction* instruction) {
+  const HloCustomCallInstruction* custom_call =
+      DynCast<const HloCustomCallInstruction>(instruction);
+  TF_RET_CHECK(custom_call != nullptr);
+  if (custom_call->layout_constrained()) {
+    // If the layout is constrained, verify all the respective shapes have
+    // layouts and that the constrained operand shapes match the shapes of the
+    // operands.
+    TF_RET_CHECK(LayoutUtil::HasLayout(custom_call->shape()));
+    TF_RET_CHECK(custom_call->operand_count() ==
+                 custom_call->operand_shapes_with_layout().size());
+    for (int64 i = 0; i < custom_call->operand_count(); ++i) {
+      const Shape& operand_shape_with_layout =
+          custom_call->operand_shapes_with_layout()[i];
+      TF_RET_CHECK(ShapeUtil::Compatible(custom_call->operand(i)->shape(),
+                                         operand_shape_with_layout));
+      TF_RET_CHECK(LayoutUtil::HasLayout(operand_shape_with_layout));
+    }
+  }
+  return Status::OK();
+}
 
 Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(slice, 1));
   return CheckShape(slice,
                     ShapeInference::InferSliceShape(
                         slice->operand(0)->shape(), slice->slice_starts(),
@@ -369,6 +497,7 @@ Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
 }
 
 Status ShapeVerifier::HandleDynamicSlice(HloInstruction* dynamic_slice) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(dynamic_slice, 2));
   return CheckShape(dynamic_slice, ShapeInference::InferDynamicSliceShape(
                                        dynamic_slice->operand(0)->shape(),
                                        dynamic_slice->operand(1)->shape(),
@@ -377,6 +506,7 @@ Status ShapeVerifier::HandleDynamicSlice(HloInstruction* dynamic_slice) {
 
 Status ShapeVerifier::HandleDynamicUpdateSlice(
     HloInstruction* dynamic_update_slice) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(dynamic_update_slice, 3));
   return CheckShape(dynamic_update_slice,
                     ShapeInference::InferDynamicUpdateSliceShape(
                         dynamic_update_slice->operand(0)->shape(),
@@ -406,6 +536,7 @@ Status ShapeVerifier::HandleMap(HloInstruction* map) {
 }
 
 Status ShapeVerifier::HandleReduceWindow(HloInstruction* reduce_window) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(reduce_window, 2));
   return CheckShape(
       reduce_window,
       ShapeInference::InferReduceWindowShape(
@@ -415,6 +546,7 @@ Status ShapeVerifier::HandleReduceWindow(HloInstruction* reduce_window) {
 }
 
 Status ShapeVerifier::HandleSelectAndScatter(HloInstruction* instruction) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 3));
   return CheckShape(
       instruction,
       ShapeInference::InferSelectAndScatterShape(
@@ -425,6 +557,11 @@ Status ShapeVerifier::HandleSelectAndScatter(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(xla_while, 1));
+  TF_RETURN_IF_ERROR(
+      CheckParameterCount(xla_while, xla_while->while_body(), 1));
+  TF_RETURN_IF_ERROR(
+      CheckParameterCount(xla_while, xla_while->while_condition(), 1));
   TF_RETURN_IF_ERROR(
       CheckOperandAndParameter(xla_while, 0, xla_while->while_body(), 0));
   TF_RETURN_IF_ERROR(
@@ -444,6 +581,11 @@ Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) {
 }
 
 Status ShapeVerifier::HandleConditional(HloInstruction* conditional) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(conditional, 3));
+  TF_RETURN_IF_ERROR(
+      CheckParameterCount(conditional, conditional->true_computation(), 1));
+  TF_RETURN_IF_ERROR(
+      CheckParameterCount(conditional, conditional->false_computation(), 1));
   TF_RETURN_IF_ERROR(CheckOperandAndParameter(
       conditional, 1, conditional->true_computation(), 0));
   TF_RETURN_IF_ERROR(CheckOperandAndParameter(
@@ -458,12 +600,14 @@ Status ShapeVerifier::HandleConditional(HloInstruction* conditional) {
 }
 
 Status ShapeVerifier::HandlePad(HloInstruction* pad) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(pad, 2));
   return CheckShape(pad, ShapeInference::InferPadShape(pad->operand(0)->shape(),
                                                        pad->operand(1)->shape(),
                                                        pad->padding_config()));
 }
 
 Status ShapeVerifier::HandleSend(HloInstruction* send) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(send, 2));
   return CheckShape(send,
                     ShapeUtil::MakeTupleShape({send->operand(0)->shape(),
                                                ShapeUtil::MakeShape(U32, {}),
@@ -471,10 +615,12 @@ Status ShapeVerifier::HandleSend(HloInstruction* send) {
 }
 
 Status ShapeVerifier::HandleSendDone(HloInstruction* send_done) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(send_done, 1));
   return CheckShape(send_done, ShapeUtil::MakeTokenShape());
 }
 
 Status ShapeVerifier::HandleRecv(HloInstruction* recv) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(recv, 1));
   return CheckShape(
       recv, ShapeUtil::MakeTupleShape(
                 {ShapeUtil::GetTupleElementShape(recv->shape(), 0),
@@ -482,6 +628,7 @@ Status ShapeVerifier::HandleRecv(HloInstruction* recv) {
 }
 
 Status ShapeVerifier::HandleRecvDone(HloInstruction* recv_done) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(recv_done, 1));
   return CheckShape(
       recv_done,
       ShapeUtil::MakeTupleShape(
@@ -491,6 +638,7 @@ Status ShapeVerifier::HandleRecvDone(HloInstruction* recv_done) {
 
 Status ShapeVerifier::HandleBatchNormTraining(
     HloInstruction* batch_norm_training) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(batch_norm_training, 3));
   return CheckShape(batch_norm_training,
                     ShapeInference::InferBatchNormTrainingShape(
                         batch_norm_training->operand(0)->shape(),
@@ -501,6 +649,7 @@ Status ShapeVerifier::HandleBatchNormTraining(
 
 Status ShapeVerifier::HandleBatchNormInference(
     HloInstruction* batch_norm_inference) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(batch_norm_inference, 5));
   return CheckShape(batch_norm_inference,
                     ShapeInference::InferBatchNormInferenceShape(
                         batch_norm_inference->operand(0)->shape(),
@@ -512,6 +661,7 @@ Status ShapeVerifier::HandleBatchNormInference(
 }
 
 Status ShapeVerifier::HandleBatchNormGrad(HloInstruction* batch_norm_grad) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(batch_norm_grad, 5));
   return CheckShape(batch_norm_grad, ShapeInference::InferBatchNormGradShape(
                                          batch_norm_grad->operand(0)->shape(),
                                          batch_norm_grad->operand(1)->shape(),
@@ -548,6 +698,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
     case HloOpcode::kTupleSelect:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
+    case HloOpcode::kSort:
     case HloOpcode::kTuple:
     case HloOpcode::kWhile:
       break;
@@ -579,6 +730,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
 }  // namespace
 
 Status ShapeVerifier::HandleGather(HloInstruction* gather) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(gather, 2));
   return CheckShape(
       gather,
       ShapeInference::InferGatherShape(
@@ -587,6 +739,7 @@ Status ShapeVerifier::HandleGather(HloInstruction* gather) {
 }
 
 Status ShapeVerifier::HandleScatter(HloInstruction* scatter) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(scatter, 3));
   return CheckShape(
       scatter, ShapeInference::InferScatterShape(
                    scatter->operand(0)->shape(), scatter->operand(1)->shape(),
@@ -600,7 +753,19 @@ Status ShapeVerifier::HandleAfterAll(HloInstruction* token) {
   for (const HloInstruction* operand : token->operands()) {
     operand_shapes.push_back(&operand->shape());
   }
-  return CheckShape(token, ShapeInference::InferAfterAllShape(operand_shapes));
+  return CheckShape(token, ShapeUtil::MakeTokenShape());
+}
+
+Status ShapeVerifier::HandleAddDependency(HloInstruction* add_dependency) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(add_dependency, 2));
+  TF_RETURN_IF_ERROR(CheckIsTokenOperand(add_dependency, 1));
+  return CheckShape(add_dependency, add_dependency->operand(0)->shape());
+}
+
+Status ShapeVerifier::HandleGetDimensionSize(HloInstruction* get_size) {
+  return CheckShape(get_size,
+                    ShapeInference::InferGetDimensionSizeShape(
+                        get_size->operand(0)->shape(), get_size->dimension()));
 }
 
 Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
@@ -674,12 +839,14 @@ Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
 }
 
 Status ShapeVerifier::CheckUnaryShape(const HloInstruction* instruction) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 1));
   return CheckShape(instruction,
                     ShapeInference::InferUnaryOpShape(instruction->opcode(),
                                                       instruction->operand(0)));
 }
 
 Status ShapeVerifier::CheckBinaryShape(const HloInstruction* instruction) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 2));
   return CheckShape(
       instruction, ShapeInference::InferBinaryOpShape(instruction->opcode(),
                                                       instruction->operand(0),
@@ -687,6 +854,7 @@ Status ShapeVerifier::CheckBinaryShape(const HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::CheckTernaryShape(const HloInstruction* instruction) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 3));
   return CheckShape(instruction,
                     ShapeInference::InferTernaryOpShape(
                         instruction->opcode(), instruction->operand(0),
@@ -699,6 +867,50 @@ Status ShapeVerifier::CheckVariadicShape(const HloInstruction* instruction) {
                         instruction->opcode(), instruction->operands()));
 }
 
+Status ShapeVerifier::VerifyEntryComputationLayout(const HloModule& module) {
+  const HloComputation* computation = module.entry_computation();
+  const auto& layout = module.entry_computation_layout();
+  const ShapeLayout& result_layout = layout.result_layout();
+
+  TF_RETURN_IF_ERROR(
+      ShapeUtil::ValidateShapeWithOptionalLayout(result_layout.shape()));
+
+  TF_RETURN_IF_ERROR(VerifyNotSparse(result_layout.shape()));
+
+  if (!ShapeUtil::Compatible(computation->root_instruction()->shape(),
+                             result_layout.shape())) {
+    return InternalError(
+        "Shape of the root instruction of entry computation (%s) should be "
+        "compatible to one specified in module's entry computation layout (%s)",
+        ShapeUtil::HumanString(computation->root_instruction()->shape()),
+        ShapeUtil::HumanString(result_layout.shape()));
+  }
+
+  if (computation->num_parameters() != layout.parameter_count()) {
+    return InternalError(
+        "Number of parameters in entry computation layout (%d) must be same "
+        "as number of parameters of entry computation computation (%d)",
+        layout.parameter_count(), computation->num_parameters());
+  }
+
+  for (int i = 0; i < computation->num_parameters(); ++i) {
+    const HloInstruction* parameter = computation->parameter_instruction(i);
+    TF_RETURN_IF_ERROR(
+        ShapeUtil::ValidateShapeWithOptionalLayout(layout.parameter_shape(i)));
+    TF_RETURN_IF_ERROR(VerifyNotSparse(layout.parameter_shape(i)));
+    if (!ShapeUtil::Compatible(parameter->shape(), layout.parameter_shape(i))) {
+      return InternalError(
+          "Shape of the entry computation parameter %d is %s should be "
+          "compatible to the one specified in module's entry computation "
+          "layout %s",
+          i, ShapeUtil::HumanString(parameter->shape()),
+          ShapeUtil::HumanString(layout.parameter_shape(i)));
+    }
+  }
+
+  return Status::OK();
+}
+
 string ComputationsToString(absl::Span<HloComputation* const> computations) {
   return absl::StrJoin(computations, ",",
                        [](string* s, const HloComputation* computation) {
@@ -1041,7 +1253,10 @@ Status CheckElementwiseInstruction(HloInstruction* instruction) {
 // not check result shape as that is checked in the ShapeVerifier.
 class InstructionVerifier : public DfsHloVisitorWithDefault {
  public:
-  InstructionVerifier() {}
+  explicit InstructionVerifier(std::function<bool(const HloInstruction*)>
+                                   instruction_can_change_layout_func)
+      : instruction_can_change_layout_func_(
+            instruction_can_change_layout_func) {}
 
   Status DefaultAction(HloInstruction*) override { return Status::OK(); }
 
@@ -1129,6 +1344,15 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleCrossReplicaSum(HloInstruction* crs) override {
+    if (crs->all_reduce_id().has_value()) {
+      TF_RET_CHECK(crs->all_reduce_id().value() > 0)
+          << "All reduce id must be greater than 0 for "
+          << crs->ToShortString();
+    }
+    return Status::OK();
+  }
+
   Status Preprocess(HloInstruction* instruction) override {
     auto previous = instructions_by_name_.find(instruction->name());
     TF_RET_CHECK(previous == instructions_by_name_.end())
@@ -1142,26 +1366,59 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status Postprocess(HloInstruction* instruction) override {
+    if (instruction_can_change_layout_func_ &&
+        LayoutUtil::IsDenseArray(instruction->shape()) &&
+        !instruction_can_change_layout_func_(instruction)) {
+      const Shape& result_shape = instruction->shape();
+      const Layout& result_layout = result_shape.layout();
+      for (HloInstruction* operand : instruction->operands()) {
+        const Shape& operand_shape = operand->shape();
+        if (LayoutUtil::IsDenseArray(operand_shape) &&
+            ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(result_shape)) {
+          const Layout& operand_layout = operand_shape.layout();
+          TF_RET_CHECK(LayoutUtil::Equal(result_layout, operand_layout))
+              << "Instruction shouldn't change layouts "
+              << instruction->ToString() << " From " << result_shape << " To "
+              << operand_shape;
+        }
+      }
+    }
+
+    return Status::OK();
+  }
+
  private:
   absl::flat_hash_map<string, const HloInstruction*> instructions_by_name_;
+  // Determines whether an instruction can change layouts.
+  std::function<bool(const HloInstruction*)>
+      instruction_can_change_layout_func_;
 };
 
 }  // namespace
 
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RET_CHECK(!module->name().empty());
+
+  if (module->entry_computation()->IsFusionComputation()) {
+    return InvalidArgument(
+        "Module entry computation cannot be a fusion computation");
+  }
+
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
   TF_RETURN_IF_ERROR(VerifySendsAndRecvs(*module));
 
-
+  std::unique_ptr<ShapeVerifier> shape_verifier =
+      target_metadata_->GetVerifier();
   for (auto* computation : module->computations()) {
-    std::unique_ptr<ShapeVerifier> shape_verifier = shape_verifier_factory_();
     TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get()));
 
-    InstructionVerifier instruction_verifier;
+    InstructionVerifier instruction_verifier(
+        instruction_can_change_layout_func_);
     TF_RETURN_IF_ERROR(computation->Accept(&instruction_verifier));
   }
 
+  TF_RETURN_IF_ERROR(shape_verifier->VerifyEntryComputationLayout(*module));
   TF_RETURN_IF_ERROR(VerifyEntryAndExitShapes(*module));
 
   // If the module has a schedule, it must be valid.
@@ -1169,6 +1426,13 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
     TF_RETURN_IF_ERROR(module->schedule().Verify());
   }
 
+  TF_RETURN_IF_ERROR(module->input_output_alias_config().Verify(
+      *module, [this](const Shape& shape) {
+        return target_metadata_->ShapeSize(shape);
+      }));
+
+  TF_RETURN_IF_ERROR(module->dynamic_parameter_binding().Verify(*module));
+
   return false;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 6d16586c2c062d407e37392e3fe50be4fd29120b..e4d0c3d6957885f1d719fedb5a900de601e397f8 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VERIFIER_H_
 
+#include <memory>
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
 #include "absl/memory/memory.h"
@@ -28,10 +29,16 @@ namespace xla {
 // TODO(b/26024837): Check output shape for all instruction types.
 class ShapeVerifier : public DfsHloVisitor {
  public:
-  explicit ShapeVerifier(bool layout_sensitive, bool allow_mixed_precision)
+  ShapeVerifier(bool layout_sensitive, bool allow_mixed_precision)
       : layout_sensitive_(layout_sensitive),
         allow_mixed_precision_(allow_mixed_precision) {}
 
+  // Verifies that entry computation layout matches parameters and root shape of
+  // the module's entry computation.
+  virtual Status VerifyEntryComputationLayout(const HloModule& module);
+
+  Status Preprocess(HloInstruction* hlo) override;
+
   Status HandleElementwiseUnary(HloInstruction* hlo) override;
   Status HandleElementwiseBinary(HloInstruction* hlo) override;
   Status HandleClamp(HloInstruction* clamp) override;
@@ -87,6 +94,8 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleGather(HloInstruction* gather) override;
   Status HandleScatter(HloInstruction* scatter) override;
   Status HandleAfterAll(HloInstruction* token) override;
+  Status HandleGetDimensionSize(HloInstruction* get_size) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   Status FinishVisit(HloInstruction*) override { return Status::OK(); }
 
@@ -121,6 +130,13 @@ class ShapeVerifier : public DfsHloVisitor {
                              : ShapeUtil::HumanString(s);
   }
 
+  // Helpers that switch on allow_mixed_precision_.
+  bool SameElementType(const Shape& a, const Shape& b) {
+    return allow_mixed_precision_
+               ? ShapeUtil::SameElementTypeIgnoringFpPrecision(a, b)
+               : ShapeUtil::SameElementType(a, b);
+  }
+
   // Checks that the given operand of the given instruction is of type TOKEN.
   Status CheckIsTokenOperand(const HloInstruction* instruction,
                              int64 operand_no);
@@ -149,21 +165,64 @@ class ShapeVerifier : public DfsHloVisitor {
   bool allow_mixed_precision_;
 };
 
+// An interface used to encapsulate target-specific verification quirks.
+class TargetVerifierMetadata {
+ public:
+  // Returns a target-specific shape size.
+  virtual int64 ShapeSize(const Shape& shape) const = 0;
+
+  virtual std::unique_ptr<ShapeVerifier> GetVerifier() const = 0;
+
+  TargetVerifierMetadata() {}
+  virtual ~TargetVerifierMetadata() {}
+
+  TargetVerifierMetadata(const TargetVerifierMetadata&) = delete;
+  TargetVerifierMetadata& operator=(const TargetVerifierMetadata&) = delete;
+};
+
+// The default implementation of TargetVerifierMetadata, used unless the target
+// needs to override it.
+class DefaultVerifierMetadata : public TargetVerifierMetadata {
+ public:
+  DefaultVerifierMetadata(bool layout_sensitive, bool allow_mixed_precision)
+      : layout_sensitive_(layout_sensitive),
+        allow_mixed_precision_(allow_mixed_precision) {}
+
+  int64 ShapeSize(const Shape& shape) const override {
+    return ShapeUtil::ByteSizeOf(shape);
+  }
+
+  // Creates a ShapeVerifier that checks that shapes match inferred
+  // expectations. This creates a new verifier every time because ShapeVerifier,
+  // being a DfsHloVisitor, is stateful. We want a clean object for each run of
+  // the verifier.
+  std::unique_ptr<ShapeVerifier> GetVerifier() const override {
+    return absl::make_unique<ShapeVerifier>(layout_sensitive_,
+                                            allow_mixed_precision_);
+  }
+
+ private:
+  bool layout_sensitive_;
+  bool allow_mixed_precision_;
+};
+
 // HLO pass that verifies invariants of HLO instructions for each computation in
 // the module.
 class HloVerifier : public HloModulePass {
  public:
-  using ShapeVerifierFactory = std::function<std::unique_ptr<ShapeVerifier>()>;
-
-  explicit HloVerifier(bool layout_sensitive, bool allow_mixed_precision)
-      : shape_verifier_factory_([layout_sensitive, allow_mixed_precision] {
-          return absl::make_unique<ShapeVerifier>(layout_sensitive,
-                                                  allow_mixed_precision);
-        }) {}
+  explicit HloVerifier(bool layout_sensitive, bool allow_mixed_precision,
+                       std::function<bool(const HloInstruction*)>
+                           instruction_can_change_layout_func = {})
+      : target_metadata_(absl::make_unique<DefaultVerifierMetadata>(
+            layout_sensitive, allow_mixed_precision)),
+        instruction_can_change_layout_func_(
+            std::move(instruction_can_change_layout_func)) {
+    CHECK(instruction_can_change_layout_func_ == nullptr || layout_sensitive);
+  }
 
-  // Uses custom shape verification.
-  explicit HloVerifier(ShapeVerifierFactory shape_verifier_factory)
-      : shape_verifier_factory_(std::move(shape_verifier_factory)) {}
+  // Uses custom target metadata
+  explicit HloVerifier(std::unique_ptr<TargetVerifierMetadata> target_metadata)
+      : target_metadata_(std::move(target_metadata)) {}
 
   ~HloVerifier() override = default;
   absl::string_view name() const override { return "verifier"; }
@@ -172,11 +231,11 @@ class HloVerifier : public HloModulePass {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  // Creates a ShapeVerifier that checks that shapes match inferred
-  // expectations. This is a factory function because ShapeVerifier,
-  // being a DfsHloVisitor, is stateful. We want a clean object
-  // for each run of the verifier.
-  ShapeVerifierFactory shape_verifier_factory_;
+  std::unique_ptr<TargetVerifierMetadata> target_metadata_;
+
+  // Determines whether an instruction can change layouts.
+  std::function<bool(const HloInstruction*)>
+      instruction_can_change_layout_func_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 8f0423bb1c72ceb209437116a898d027f4d2c657..4bc557e4e62e7df4e25fda86fe417e84129b464c 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/layout_assignment.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -34,7 +35,11 @@ namespace {
 
 using ::testing::HasSubstr;
 
-// This class cannot be converted to use HloVerifiedTestBase. It explicitly
+std::unique_ptr<HloModule> CreateUnverifiedModule() {
+  return absl::make_unique<HloModule>("module", HloModuleConfig());
+}
+
+// This class cannot be converted to use HloTestBase. It explicitly
 // uses HloTestBase to create and test malformed HLOs.
 class HloVerifierTest : public HloTestBase {
  public:
@@ -50,6 +55,14 @@ class HloVerifierTestAllowMixedPrecision : public HloTestBase {
                     /*allow_mixed_precision_in_hlo_verifier=*/true) {}
 };
 
+class HloVerifierTestLayoutSensitive : public HloTestBase {
+ public:
+  HloVerifierTestLayoutSensitive()
+      : HloTestBase(/*verifier_layout_sensitive=*/true,
+                    /*allow_mixed_precision_in_hlo_verifier=*/false,
+                    LayoutAssignment::InstructionCanChangeLayout) {}
+};
+
 TEST_F(HloVerifierTest, NullInstructionParent) {
   HloComputation::Builder builder(TestName());
   const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
@@ -57,7 +70,7 @@ TEST_F(HloVerifierTest, NullInstructionParent) {
       HloInstruction::CreateParameter(0, scalar_shape, "param"));
   HloInstruction* negate = builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK(verifier().Run(module.get()).status());
@@ -76,7 +89,7 @@ TEST_F(HloVerifierTest, NullComputationParent) {
       HloInstruction::CreateParameter(0, scalar_shape, "param"));
   builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK(verifier().Run(module.get()).status());
@@ -95,7 +108,7 @@ TEST_F(HloVerifierTest, DifferentOperandParents) {
       HloInstruction::CreateParameter(0, scalar_shape, "param"));
   HloInstruction* negate = builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   HloComputation::Builder emb_builder(TestName());
@@ -129,7 +142,7 @@ TEST_F(HloVerifierTest, ResetsShapeVerifierState) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add));
 
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   // Run the verifier twice.  It should fail both times, because it shouldn't
@@ -294,7 +307,7 @@ TEST_F(HloVerifierTest, NegativeInteriorPaddingNotAllowed) {
           HloInstruction::CreateConstant(LiteralUtil::Zero(F32))),
       padding_config));
 
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto status = verifier().Run(module.get()).status();
@@ -318,7 +331,7 @@ TEST_F(HloVerifierTest, PadNegativeInteriorDilationNotAllowed) {
           HloInstruction::CreateConstant(LiteralUtil::Zero(F32).Clone())),
       padding_config));
 
-  auto module = CreateNewModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(verifier().Run(module.get()).status().error_message(),
@@ -358,5 +371,63 @@ TEST_F(HloVerifierTest, ConvNegativeBaseDilationNotAllowed) {
               HasSubstr("non-positive base area dilation factor"));
 }
 
+static const char* const kAddWithLayoutChangeHlo = R"(
+   HloModule AddWithLayoutChange
+    ENTRY AddWithLayoutChange {
+      par0 = f32[3,4]{1,0} parameter(0)
+      par1 = f32[3,4]{0,1} parameter(1)
+      ROOT add0 = f32[3,4]{1,0} add(par0,par1)
+    }
+  )";
+
+TEST_F(HloVerifierTest, AddWithLayoutChange) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kAddWithLayoutChangeHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTestLayoutSensitive, AddWithLayoutChangeNotAllowed) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kAddWithLayoutChangeHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Instruction shouldn't change layouts"));
+}
+
+TEST_F(HloVerifierTestLayoutSensitive, SliceWithLayoutChangeNotAllowed) {
+  const char* const kSliceWithLayoutChangeHlo = R"(
+   HloModule SliceWithLayoutChange
+    ENTRY SliceWithLayoutChange {
+      par0 = f32[4,5]{0,1} parameter(0)
+      par1 = s32[2] parameter(1)
+      ROOT dslice0 = f32[3,4]{1,0} dynamic-slice(par0, par1),
+        dynamic_slice_sizes={3,4}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kSliceWithLayoutChangeHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Instruction shouldn't change layouts"));
+}
+
+TEST_F(HloVerifierTestLayoutSensitive, ConcatWithLayoutChangeNotAllowed) {
+  const char* const kConcatWithLayoutChangeHlo = R"(
+   HloModule ConcatWithLayoutChange
+   ENTRY ConcatWithLayoutChange {
+      par0 = f32[3,5]{0,1} parameter(0)
+      par1 = f32[3,3]{1,0} parameter(1)
+      ROOT concat0 = f32[3,8]{1,0} concatenate(f32[3,5] par0, f32[3,3] par1),
+        dimensions={1}
+   }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kConcatWithLayoutChangeHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Instruction shouldn't change layouts"));
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index e76b93107c923b41666f6b0a388dda143a8cb50a..90904ac00110457bcc3b8974816a7080c4ab89fc 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -90,20 +90,29 @@ string HumanReadableProfileBuilder::ToString() const {
         op.optimal_seconds < 0
             ? ""
             : StrFormat("(%12.1f optimal)", op.optimal_seconds * 1e6),
-        op.flop_count <= 0 ? "" : HumanReadableNumFlops(op.flop_count, nsecs),
-        op.transcendental_count <= 0
-            ? ""
-            : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs),
+        op.flop_count > 0 && nsecs > 0
+            ? HumanReadableNumFlops(op.flop_count, nsecs)
+            : "",
+        op.transcendental_count > 0 && nsecs > 0
+            ? HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs)
+            : "",
         bytes_per_sec, bytes_per_cycle, op.name);
   };
 
-  float optimal_seconds_sum = 0.0;
+  double optimal_seconds_sum = 0;
   int64 total_flops = 0.;
   int64 total_transcendentals = 0.;
   int64 total_bytes = 0;
   for (const auto& op : op_infos_) {
     if (op.optimal_seconds > 0) {
-      optimal_seconds_sum += op.optimal_seconds;
+      // An op can run faster than the estimated optimum. For example, we might
+      // estimate a fusion's speed by looking at the size of its operands and
+      // result, but perhaps the fusion doesn't read the entirety of all of its
+      // inputs.  For the purposes of summing the instructions' optimal speeds,
+      // we treat the "optimum" as the smallest of either the estimated optimum
+      // and the actual speed.
+      optimal_seconds_sum +=
+          std::min(double{op.optimal_seconds}, CyclesToSeconds(op.cycles));
     }
     total_flops += std::max(op.flop_count, int64{0});
     total_transcendentals += std::max(op.transcendental_count, int64{0});
@@ -112,8 +121,9 @@ string HumanReadableProfileBuilder::ToString() const {
 
   VLOG(1) << "Total floating point ops: " << total_flops;
 
-  print_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops,
-            total_transcendentals, total_bytes, optimal_seconds_sum},
+  print_op({is_entry_computation_ ? "[total] [entry]" : "[total]", "[total]",
+            /*category=*/"", total_cycles_, total_flops, total_transcendentals,
+            total_bytes, static_cast<float>(optimal_seconds_sum)},
            /*is_total=*/true);
 
   // Sort ops in decreasing order of cycles, and print them.
@@ -154,8 +164,10 @@ string HumanReadableProfileBuilder::ToString() const {
         entry.text = op.name;
         entry.short_text = op.short_name;
         entry.category_text = op.category;
-        entry.metric =
-            CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6;
+        // Ignore ops that run faster than the estimated optimal here, as we do
+        // when calculating optimal_seconds_sum.
+        entry.metric = std::max(
+            0., CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6);
         total_discrepancy_in_microseconds += entry.metric;
         table.AddEntry(std::move(entry));
       }
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.h b/tensorflow/compiler/xla/service/human_readable_profile_builder.h
index 925111fa1f1e48650b0089f402d92e431043eabe..d4e5cbbe27418ddf3c81ebe00bc8aa979d3c2d5e 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.h
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.h
@@ -30,9 +30,11 @@ namespace xla {
 class HumanReadableProfileBuilder {
  public:
   explicit HumanReadableProfileBuilder(absl::string_view computation_name,
+                                       bool is_entry_computation,
                                        int64 total_cycles,
                                        double clock_rate_ghz)
       : computation_name_(computation_name),
+        is_entry_computation_(is_entry_computation),
         total_cycles_(total_cycles),
         clock_rate_ghz_(clock_rate_ghz) {
     CHECK_GE(clock_rate_ghz, 1e-9);
@@ -75,6 +77,7 @@ class HumanReadableProfileBuilder {
   }
 
   string computation_name_;
+  bool is_entry_computation_;
   int64 total_cycles_;
   double clock_rate_ghz_;
   std::vector<OpInfo> op_infos_;
diff --git a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
index f85d31d5225b8012b68f851b2bfec219d736ba0d..cf6cf897fe11eda01ba6b22119bba34ac2bef8fe 100644
--- a/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
+++ b/tensorflow/compiler/xla/service/implicit_broadcast_remover_test.cc
@@ -18,19 +18,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace op = xla::testing::opcode_matchers;
 
 namespace xla {
 namespace {
 
-class ImplicitBroadcastRemoverTest : public HloVerifiedTestBase {
+class ImplicitBroadcastRemoverTest : public HloTestBase {
  protected:
   ImplicitBroadcastRemover remover_;
 };
 
 TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
@@ -41,15 +42,16 @@ TEST_F(ImplicitBroadcastRemoverTest, NoImplicitBroadcast) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_FALSE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_FALSE(remover_.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Parameter(), op::Parameter()));
 }
 
 TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4});
@@ -60,13 +62,13 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kPower, param0, param1));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
 
   EXPECT_FALSE(ShapeUtil::Compatible(root->shape(), root->operand(0)->shape()));
   EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), root->operand(1)->shape()));
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
 
   EXPECT_THAT(root, op::Power(op::Broadcast(op::Parameter()), op::Parameter()));
@@ -76,6 +78,7 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcast) {
 }
 
 TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
@@ -86,9 +89,9 @@ TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       shape, HloOpcode::kSubtract, param0, param1));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Subtract(op::Parameter(),
@@ -98,6 +101,7 @@ TEST_F(ImplicitBroadcastRemoverTest, DegenerateDimensionBroadcast) {
 }
 
 TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {1, 4, 1});
@@ -108,9 +112,9 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       shape, HloOpcode::kSubtract, param0, param1));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root,
@@ -120,6 +124,7 @@ TEST_F(ImplicitBroadcastRemoverTest, ScalarBroadcastToDegenerateDimensions) {
 }
 
 TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6, 8});
@@ -132,9 +137,9 @@ TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
   builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
                                                        param0, param1, param2));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Reshape(op::Parameter())),
@@ -147,6 +152,7 @@ TEST_F(ImplicitBroadcastRemoverTest, TernaryDegenerateDimensionBroadcast) {
 
 TEST_F(ImplicitBroadcastRemoverTest,
        TernaryScalarAndDegenerateDimensionBroadcast) {
+  auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 4, 6});
@@ -159,9 +165,9 @@ TEST_F(ImplicitBroadcastRemoverTest,
   builder.AddInstruction(HloInstruction::CreateTernary(shape, HloOpcode::kClamp,
                                                        param0, param1, param2));
 
-  HloComputation* computation = module().AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_TRUE(remover_.Run(&module()).ValueOrDie());
+  EXPECT_TRUE(remover_.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_THAT(root, op::Clamp(op::Broadcast(op::Parameter()),
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
index 2d03aebc1aca4c55cca588072233b7a18e70a306..98246d5403e4aebc2f4d81e52145706355ddd9a9 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -16,12 +16,12 @@ limitations under the License.
 #include <ctype.h>
 
 #include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 
 namespace xla {
 namespace {
-class IndexedArrayAnalysisTest : public HloVerifiedTestBase {
+class IndexedArrayAnalysisTest : public HloTestBase {
  protected:
   void AssertArrayForRootExpressionIs(const string& hlo_text,
                                       const string& root_expression) {
@@ -61,12 +61,12 @@ class IndexedArrayAnalysisTest : public HloVerifiedTestBase {
                                           const string& root_expression,
                                           bool print_constants) {
     IndexedArrayAnalysis indexed_tensor_analysis;
-    ParseAndVerifyModule(hlo_text);
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                            ParseAndReturnVerifiedModule(hlo_text));
 
-    TF_ASSERT_OK_AND_ASSIGN(
-        IndexedArrayAnalysis::Array* const array_result,
-        indexed_tensor_analysis.GetArrayFor(
-            module().entry_computation()->root_instruction()));
+    TF_ASSERT_OK_AND_ASSIGN(IndexedArrayAnalysis::Array* const array_result,
+                            indexed_tensor_analysis.GetArrayFor(
+                                m->entry_computation()->root_instruction()));
     string string_result = CanonicalizeWhitespace(
         indexed_tensor_analysis.ToString(array_result, print_constants));
     LOG(INFO) << string_result;
@@ -481,8 +481,8 @@ ENTRY main {
   const char* expected_root_expression = R"(
 (scalar-indexed-const
   (constant s32[2,1,1,1,6] s32[2,1,1,1,6] {
-    { /*i0=0*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } },
-    { /*i0=1*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } } })
+    { /*i0=0*/ { /*i1=0*/ { /*i2=0*/ { 1, 2, 3, 4, 5, 6 } } } },
+    { /*i0=1*/ { /*i1=0*/ { /*i2=0*/ { 1, 2, 3, 4, 5, 6 } } } } })
   (reshape %indices to s32[])
   0->[])
 )";
@@ -512,8 +512,8 @@ ENTRY main {
   const char* expected_root_expression = R"(
 (scalar-indexed-const
   (constant s32[2,1,1,6] s32[2,1,1,6] {
-    { /*i0=0*/ { /*i1=0*/ {1, 2, 3, 4, 5, 6} } },
-    { /*i0=1*/ { /*i1=0*/ {1, 2, 3, 4, 5, 6} } } })
+    { /*i0=0*/ { /*i1=0*/ { 1, 2, 3, 4, 5, 6 } } },
+    { /*i0=1*/ { /*i1=0*/ { 1, 2, 3, 4, 5, 6 } } } })
   (reshape %indices to s32[5])
   0->[2])
 )";
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 69a4c160ee5c4539272c3085338dc6de1b9347ff..7559ed1bab84b21a4d51bc38db999900befcfad7 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -26,7 +26,9 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/fusion_queue.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -101,7 +103,6 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kShiftRightLogical:
     case HloOpcode::kSlice:
     case HloOpcode::kSubtract:
-    case HloOpcode::kAfterAll:
     case HloOpcode::kTranspose:
     case HloOpcode::kTuple:
     case HloOpcode::kTupleSelect:
@@ -114,7 +115,10 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kSin:
       return ShapeUtil::ElementIsComplex(instruction.shape());
 
-    // Expensive instructions.
+    // Expensive instructions or unusual instructions for which fusion is
+    // nonsensical.
+    case HloOpcode::kAddDependency:
+    case HloOpcode::kAfterAll:
     case HloOpcode::kAtan2:
     case HloOpcode::kBatchNormGrad:
     case HloOpcode::kBatchNormInference:
@@ -153,6 +157,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kTanh:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
+    case HloOpcode::kGetDimensionSize:
       return true;
   }
 
@@ -437,8 +442,7 @@ class ReversePostOrderFusionQueue : public FusionQueue {
 }  // namespace
 
 std::unique_ptr<FusionQueue> InstructionFusion::GetFusionQueue(
-    HloComputation* computation,
-    const std::function<bool(HloInstruction*)>& skip_producer) {
+    HloComputation* computation) {
   return absl::make_unique<ReversePostOrderFusionQueue>(computation);
 }
 
@@ -451,14 +455,16 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
   for (auto* computation : module->MakeNonfusionComputations()) {
     CHECK(!computation->IsFusionComputation());
     computation_ = computation;
-    reachability_ = computation_->ComputeReachability();
-
-    HloInstructionSet do_not_duplicate =
-        ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder());
-    auto fusion_queue =
-        GetFusionQueue(computation_, [&](HloInstruction* producer) {
-          return do_not_duplicate.count(producer) > 0;
-        });
+    reachability_ = HloReachabilityMap::Build(computation_);
+
+    HloInstructionSet do_not_duplicate;
+    // If we allow duplications, we need to compute which instructions we do not
+    // want to duplicate based on a global analysis of the graph.
+    if (may_duplicate_) {
+      do_not_duplicate =
+          ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder());
+    }
+    auto fusion_queue = GetFusionQueue(computation_);
 
     // Instruction fusion effectively fuses edges in the computation graph
     // (producer instruction -> consumer instruction) so we iterate over all
@@ -489,9 +495,8 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
         HloInstruction* fusion_instruction;
         // Try "regular" fusion if the operand may be duplicated. Otherwise,
         // perform multi-output fusion, unless this creates a cycle.
-        // TODO(tjoerg): Consider making multi-output fusion the default.
-        if (ShouldFuse(instruction, i) &&
-            do_not_duplicate.count(operand) == 0) {
+        if (do_not_duplicate.count(operand) == 0 &&
+            ShouldFuse(instruction, i)) {
           fusion_queue->PreFusion(operand, instruction);
           fusion_instruction = Fuse(operand, instruction);
         } else if (ShouldFuseIntoMultiOutput(instruction, i) &&
@@ -565,15 +570,19 @@ HloInstruction* InstructionFusion::FuseIntoMultiOutput(
 
 bool InstructionFusion::MultiOutputFusionCreatesCycle(
     HloInstruction* producer, HloInstruction* consumer) {
-  return absl::c_any_of(
-      consumer->operands(), [&](const HloInstruction* consumer_operand) {
-        // The fusion algorithm traverses the HLO graph in reverse post order.
-        // Thus `cosumers` is visited before its operands (including
-        // `producer`). Therefore, consumer operands cannot have been fused yet.
-        // It is thus safe to use the pre-computed reachability map.
-        return consumer_operand != producer &&
-               reachability_->IsReachable(producer, consumer_operand);
-      });
+  auto is_reachable = [&](const HloInstruction* a, const HloInstruction* b) {
+    // A consumer operand may have been multi-output fused into a parallel
+    // consumer and thus be missing from the original reachability map.
+    if (!reachability_->IsPresent(a) || !reachability_->IsPresent(b)) {
+      reachability_ = HloReachabilityMap::Build(consumer->parent());
+    }
+    return reachability_->IsReachable(a, b);
+  };
+  return absl::c_any_of(consumer->operands(),
+                        [&](const HloInstruction* consumer_operand) {
+                          return consumer_operand != producer &&
+                                 is_reachable(producer, consumer_operand);
+                        });
 }
 
 bool InstructionFusion::ShouldFuse(HloInstruction* consumer,
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h
index f14c6675208c72112aea0179c238b58709d625b5..198bd7fce5f392e5e895b959523d4fe9cf208ba2 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/instruction_fusion.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace xla {
@@ -54,8 +55,7 @@ class InstructionFusion : public HloModulePass {
   // fused. The default implementation processes consumers in reverse post
   // order.
   virtual std::unique_ptr<FusionQueue> GetFusionQueue(
-      HloComputation* computation,
-      const std::function<bool(HloInstruction*)>& skip_producer);
+      HloComputation* computation);
 
   // Returns whether the given producer instruction should be fused into the
   // given consumer instruction. producer is necessarily an operand of consumer.
@@ -111,6 +111,10 @@ class InstructionFusion : public HloModulePass {
     return is_expensive_(instruction);
   }
 
+  // Whether multi-output fusion would introduce a cycle into the HLO graph.
+  bool MultiOutputFusionCreatesCycle(HloInstruction* producer,
+                                     HloInstruction* consumer);
+
   // Current HloComputation instance the loop fuser is traversing.
   HloComputation* computation_;
   HloModule* module_;
@@ -145,10 +149,6 @@ class InstructionFusion : public HloModulePass {
   // duplicated.
   std::function<bool(const HloInstruction& instruction)> is_expensive_;
 
-  // Whether multi-output fusion would introduce a cycle into the HLO graph.
-  bool MultiOutputFusionCreatesCycle(HloInstruction* producer,
-                                     HloInstruction* consumer);
-
   // Returns whether we may duplicate an instruction if we want to fuse it.
   bool may_duplicate_;
 
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index da1ad90959dc0ab1a840b3390281ce9d4999651e..58b7135cea7419f13d60ed510ecf7a88126aee48 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) {
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape1, computation->root_instruction());
   EXPECT_FALSE(
@@ -133,7 +133,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastSimpleReshapeOfParameterUnfused) {
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape1, computation->root_instruction());
   EXPECT_FALSE(
@@ -149,7 +149,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) {
   auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(S32, {}), param0, {}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(transpose1, computation->root_instruction());
   EXPECT_FALSE(
@@ -172,7 +172,7 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusible) {
   HloInstruction* unary = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(unary, computation->root_instruction());
   EXPECT_FALSE(
@@ -361,7 +361,7 @@ TEST_F(InstructionFusionTest, AllowUnaryDuplication) {
   HloInstruction* unary2 = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kAbs, unary1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(unary2, computation->root_instruction());
   EXPECT_TRUE(
@@ -385,7 +385,7 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) {
   HloInstruction* unary = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kAbs, binary1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(unary, computation->root_instruction());
   EXPECT_TRUE(
@@ -394,6 +394,56 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) {
           .ValueOrDie());
 }
 
+TEST_F(InstructionFusionTest, FuseDiamondGraphsNoDuplication) {
+  auto module = ParseHloString(R"(
+  HloModule test_module
+  ENTRY Test {
+    p0 = f32[100] parameter(0)
+    p1 = f32[100] parameter(1)
+    add = f32[100] add(p0, p1)
+    slice1 = f32[99] slice(add), slice={[0:99:1]}
+    slice2 = f32[99] slice(add), slice={[1:100:1]}
+    ROOT add2 = f32[99] add(slice1, slice2)
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/false)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  // 'add' would originally need to be duplicated if fused. However after its
+  // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one
+  // user and can now be also fused.
+  EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter()));
+}
+
+TEST_F(InstructionFusionTest, FuseDiamondGraphsAllowDuplication) {
+  auto module = ParseHloString(R"(
+  HloModule test_module
+  ENTRY Test {
+    p0 = f32[100] parameter(0)
+    p1 = f32[100] parameter(1)
+    add = f32[100] add(p0, p1)
+    slice1 = f32[99] slice(add), slice={[0:99:1]}
+    slice2 = f32[99] slice(add), slice={[1:100:1]}
+    ROOT add2 = f32[99] add(slice1, slice2)
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  // 'add' would originally need to be duplicated if fused. However after its
+  // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one
+  // user and can now be also fused.
+  EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter()));
+}
+
 TEST_F(InstructionFusionTest,
        WideningConvertsAreAlwaysDuplicableIntoConsumers) {
   auto module = ParseHloString(R"(
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 1484e14df10d94841c5a2e849761779f5800392d..a981d94a999e3d322986bc2bfd56a5b0b5d175fc 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -1,4 +1,4 @@
-licenses(["restricted"])
+licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//visibility:public"])
 
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 7c79eb7d791bc9a0743605d3171ff69c6ef41d58..3a5177c418e3af8253df228a51f2fc0901d10041 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -57,6 +57,13 @@ StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
   return std::move(hlo_module);
 }
 
+Status InterpreterCompiler::RunHloPassesOnModuleGroup(
+    HloModuleGroup* module_group,
+    absl::Span<se::StreamExecutor* const> executors,
+    DeviceMemoryAllocator* device_allocator) {
+  return Unimplemented("Module group compilation not supported on Interpreter");
+}
+
 StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
     std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
     DeviceMemoryAllocator* /*device_allocator*/) {
@@ -76,17 +83,45 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
   return std::move(executable);
 }
 
+StatusOr<std::vector<std::unique_ptr<Executable>>>
+InterpreterCompiler::RunBackendOnModuleGroup(
+    std::unique_ptr<HloModuleGroup> module_group,
+    std::vector<std::vector<se::StreamExecutor*>> stream_exec,
+    DeviceMemoryAllocator* device_allocator) {
+  return Unimplemented(
+      "Module group compilation is not supported on Interpreter.");
+}
+
 StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile(
-    std::vector<std::unique_ptr<HloModule>> /*hlo_modules*/,
-    std::vector<std::vector<se::StreamExecutor*>> /*stream_execs*/,
-    DeviceMemoryAllocator* /*device_allocator*/) {
-  return tensorflow::errors::Unimplemented(
-      "Compilation of multiple HLO modules is not supported on Interpreter.");
+    std::unique_ptr<HloModuleGroup> module_group,
+    std::vector<std::vector<se::StreamExecutor*>> stream_exec,
+    DeviceMemoryAllocator* device_allocator) {
+  if (module_group->empty()) {
+    return std::vector<std::unique_ptr<Executable>>();
+  }
+  if (module_group->size() > 1) {
+    return tensorflow::errors::Unimplemented(
+        "Compilation of multiple HLO modules is not supported on Interpreter.");
+  }
+  if (stream_exec.size() != 1 || stream_exec[0].size() != 1) {
+    return tensorflow::errors::Unimplemented(
+        "Unexpected number of StreamExecutor's.");
+  }
+  auto hlo_modules = module_group->ConsumeModules();
+  TF_ASSIGN_OR_RETURN(auto module,
+                      RunHloPasses(std::move(hlo_modules[0]), stream_exec[0][0],
+                                   device_allocator));
+  TF_ASSIGN_OR_RETURN(
+      auto executable,
+      RunBackend(std::move(module), stream_exec[0][0], device_allocator));
+  std::vector<std::unique_ptr<Executable>> ret;
+  ret.push_back(std::move(executable));
+  return std::move(ret);
 }
 
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 InterpreterCompiler::CompileAheadOfTime(
-    std::vector<std::unique_ptr<HloModule>> hlo_modules,
+    std::unique_ptr<HloModuleGroup> module_group,
     const AotCompilationOptions& aot_options) {
   return tensorflow::errors::InvalidArgument(
       "AOT compilation not supported on Interpreter");
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index e90ae3e818522e6e4fd9d9f5acb846800bc899ca..591272951a01a3e2aa3b615673dceced8e94f674 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -46,18 +46,26 @@ class InterpreterCompiler : public Compiler {
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
+  Status RunHloPassesOnModuleGroup(
+      HloModuleGroup* module_group,
+      absl::Span<se::StreamExecutor* const> executors,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
+  StatusOr<std::vector<std::unique_ptr<Executable>>> RunBackendOnModuleGroup(
+      std::unique_ptr<HloModuleGroup> module_group,
+      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> hlo_modules,
+      std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
       DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> hlo_modules,
+  CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                      const AotCompilationOptions& aot_options) override;
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index a06d6113e84630df14ff68280c248cccb9afaf06..de9204011ce5ba8a9fc2871c6bd7120b6ed371b5 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -37,7 +37,7 @@ namespace xla {
 namespace interpreter {
 
 InterpreterExecutable::InterpreterExecutable(
-    std::unique_ptr<const HloModule> hlo_module,
+    std::unique_ptr<HloModule> hlo_module,
     std::unique_ptr<HloEvaluator> evaluator)
     : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
                  /*hlo_profile_index_map=*/nullptr),
@@ -85,6 +85,7 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
   Literal result_literal;
   {
     tensorflow::mutex_lock lock(evaluator_lock_);
+    evaluator_->ResetVisitStates();
     TF_ASSIGN_OR_RETURN(result_literal, evaluator_->Evaluate<Literal>(
                                             *computation, arg_literals));
   }
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index 3b1ebce0c75457d65e6834c809fe488a9c4a159a..bda13d376360306c81230e41b01cefc6caff230d 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -42,7 +42,7 @@ namespace interpreter {
 // buffer allocation. Refer to interpreter/README.md for more.
 class InterpreterExecutable : public Executable {
  public:
-  InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module,
+  InterpreterExecutable(std::unique_ptr<HloModule> hlo_module,
                         std::unique_ptr<HloEvaluator> evaluator);
   ~InterpreterExecutable() override;
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 4fb67bd0b72fc591c1ffa76ebb0513bf14ed3737..e3e5fa71543baa309b3a68888b1b9bdfd43cfbd5 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -78,9 +78,14 @@ port::Status XlaInterpreterExecutor::SynchronousMemcpy(
   return port::Status::OK();
 }
 
-bool XlaInterpreterExecutor::HostCallback(Stream *stream,
-                                          std::function<void()> callback) {
-  AsExecutorStream(stream)->EnqueueTask(callback);
+bool XlaInterpreterExecutor::HostCallback(
+    Stream *stream, std::function<port::Status()> callback) {
+  AsExecutorStream(stream)->EnqueueTask([callback]() {
+    port::Status s = callback();
+    if (!s.ok()) {
+      LOG(WARNING) << "Host callback failed: " << s;
+    }
+  });
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index fbb99457847dca69a1901006d5d8ff713882f918..400c30515464ed5b00251fba303fef303a26b97b 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -125,7 +125,8 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
     return port::Status{port::error::UNIMPLEMENTED, ""};
   }
 
-  bool HostCallback(Stream *stream, std::function<void()> callback) override;
+  bool HostCallback(Stream *stream,
+                    std::function<port::Status()> callback) override;
 
   port::Status AllocateEvent(Event *event) override {
     return port::Status{port::error::UNIMPLEMENTED, ""};
diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc
index c9b40d3c6195f80a19272a0d98890049d02315b9..b0fc1af8b89d7327a00f77f471e90d143a92de7c 100644
--- a/tensorflow/compiler/xla/service/interpreter/platform.cc
+++ b/tensorflow/compiler/xla/service/interpreter/platform.cc
@@ -110,3 +110,5 @@ REGISTER_MODULE_INITIALIZER(
 // open-source project, so this will be a no-op there.
 REGISTER_MODULE_INITIALIZER_SEQUENCE(interpreter_platform,
                                      multi_platform_manager);
+REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+                                     interpreter_platform);
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index cc4a342e9d38415599256a5eaf3f5cf757652659..eddef850cf5250b85b564c1e6c92d1cc8ecd1a43 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -419,6 +419,16 @@ Status LayoutAssignment::BuildHostChannelConstraints(
   return Status::OK();
 }
 
+namespace {
+
+bool IsLayoutConstrainedCustomCall(HloInstruction* instruction) {
+  const HloCustomCallInstruction* custom_call =
+      DynCast<HloCustomCallInstruction>(instruction);
+  return custom_call != nullptr && custom_call->layout_constrained();
+}
+
+}  // namespace
+
 Status LayoutAssignment::AddMandatoryConstraints(
     const ComputationLayout* computation_layout,
     ChannelLayoutConstraints* channel_constraints, HloComputation* computation,
@@ -434,13 +444,11 @@ Status LayoutAssignment::AddMandatoryConstraints(
   // Constrain layouts of instructions which define values with pre-existing
   // layouts.
   for (auto* instruction : computation->instructions()) {
-    Shape const* shape_with_layout = nullptr;
     if (instruction->opcode() == HloOpcode::kInfeed) {
       // Infeed layouts must match the layout of the original inserted
       // instruction.
       // TODO(b/31425034): Change infeeds to be more like parameters, with
       // shapes in the ComputationLayout.
-      DCHECK(!LayoutUtil::IsPadded(instruction->shape()));
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(instruction->shape(), instruction));
     } else if (instruction->opcode() == HloOpcode::kOutfeed) {
@@ -456,17 +464,21 @@ Status LayoutAssignment::AddMandatoryConstraints(
         if (parameter_layout.LayoutIsSet()) {
           // Parameter layouts must match the respective layout in
           // ComputationLayout, if there is one.
-          shape_with_layout = &parameter_layout.shape();
+          TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
+              parameter_layout.shape(), instruction));
         }
       }
-    }
-    if (shape_with_layout != nullptr) {
+    } else if (IsLayoutConstrainedCustomCall(instruction)) {
+      const HloCustomCallInstruction* custom_call =
+          DynCast<HloCustomCallInstruction>(instruction);
       TF_RETURN_IF_ERROR(
-          constraints->SetInstructionLayout(*shape_with_layout, instruction));
-    }
-
-    if (instruction->opcode() == HloOpcode::kSend ||
-        instruction->opcode() == HloOpcode::kRecv) {
+          constraints->SetInstructionLayout(custom_call->shape(), custom_call));
+      for (int64 i = 0; i < custom_call->operand_count(); ++i) {
+        TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+            custom_call->operand_shapes_with_layout()[i], custom_call, i));
+      }
+    } else if (instruction->opcode() == HloOpcode::kSend ||
+               instruction->opcode() == HloOpcode::kRecv) {
       CHECK(get_channel_constraints(instruction))
           << "Multi-module layout assignment requires ChannelLayoutConstraints";
       int64 channel_id = instruction->channel_id();
@@ -621,31 +633,6 @@ Status LayoutAssignment::AddMandatoryConstraints(
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           false_computation_layout.parameter_shape(0), instruction, 2,
           /*mandatory=*/true));
-    } else if (instruction->opcode() == HloOpcode::kCustomCall) {
-      if (!CustomCallRequiresMajorFirstLayout(instruction)) {
-        continue;
-      }
-      // Add constraints for kCustomCall instruction operands and instructions.
-      // For now we only support major-first layouts for all inputs and outputs.
-      Shape result_shape = ShapeUtil::MakeShapeWithDescendingLayout(
-          instruction->shape().element_type(),
-          AsInt64Slice(instruction->shape().dimensions()));
-      TF_RETURN_IF_ERROR(
-          constraints->SetInstructionLayout(result_shape, instruction));
-      for (int64 i = 0; i < instruction->operand_count(); ++i) {
-        const Shape& operand_shape = instruction->operand(i)->shape();
-        // Opaque operands don't get a layout constraint.
-        if (ShapeUtil::IsOpaque(operand_shape)) {
-          continue;
-        }
-
-        Shape row_major_operand_shape =
-            ShapeUtil::MakeShapeWithDescendingLayout(
-                operand_shape.element_type(),
-                AsInt64Slice(operand_shape.dimensions()));
-        TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
-            row_major_operand_shape, instruction, i));
-      }
     }
   }
   // Finally set the result layout to match ComputationLayout, if there is one.
@@ -676,16 +663,18 @@ Status CheckCallLayout(HloInstruction* call,
   return Status::OK();
 }
 
-// Custom calls have fixed input and output layouts.
-Status CheckCustomCallLayout(HloInstruction* custom_call) {
-  for (const HloInstruction* operand : custom_call->operands()) {
-    TF_RET_CHECK(
-        ShapeUtil::IsOpaque(operand->shape()) ||
-        LayoutUtil::IsMonotonicWithDim0Major(operand->shape().layout()));
+// Operands of layout-constrained custom calls must match the expected
+// constrained layouts.
+Status CheckCustomCallLayout(HloInstruction* instruction) {
+  if (IsLayoutConstrainedCustomCall(instruction)) {
+    const HloCustomCallInstruction* custom_call =
+        DynCast<HloCustomCallInstruction>(instruction);
+    for (int64 i = 0; i < custom_call->operand_count(); ++i) {
+      TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(
+          custom_call->operand(i)->shape(),
+          custom_call->operand_shapes_with_layout()[i]));
+    }
   }
-  TF_RET_CHECK(
-      ShapeUtil::IsOpaque(custom_call->shape()) ||
-      LayoutUtil::IsMonotonicWithDim0Major(custom_call->shape().layout()));
   return Status::OK();
 }
 
@@ -932,9 +921,7 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
               FindOrDie(computation_layouts_, instruction->to_apply())));
           break;
         case HloOpcode::kCustomCall:
-          if (CustomCallRequiresMajorFirstLayout(instruction)) {
-            TF_RETURN_IF_ERROR(CheckCustomCallLayout(instruction));
-          }
+          TF_RETURN_IF_ERROR(CheckCustomCallLayout(instruction));
           break;
         case HloOpcode::kFusion:
           TF_RETURN_IF_ERROR(CheckFusionLayout(instruction));
@@ -971,9 +958,8 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) {
       FindOrDie(computation_layouts_, module->entry_computation())
           .result_layout();
   if (result_layout.LayoutIsSet()) {
-    TF_RET_CHECK(ShapeUtil::Equal(
-        module->entry_computation()->root_instruction()->shape(),
-        result_layout.shape()));
+    TF_RET_CHECK(
+        ShapeUtil::Equal(module->result_shape(), result_layout.shape()));
   }
   return Status::OK();
 }
@@ -1002,10 +988,8 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     const Layout& output_layout, const HloInstruction* instruction,
     int64 operand_no) {
   const HloInstruction* operand = instruction->operand(operand_no);
-
   CHECK(ShapeUtil::IsArray(instruction->shape()));
   CHECK(ShapeUtil::IsArray(operand->shape()));
-
   if (!ShapeUtil::IsScalar(operand->shape()) &&
       ShapeUtil::Rank(operand->shape()) ==
           ShapeUtil::Rank(instruction->shape()) &&
@@ -1264,12 +1248,20 @@ Status LayoutAssignment::PropagateOperandConstraint(
                                    operand_constraint.operand(), constraints));
 
   // For array-shaped operands and user instructions try to pick a minimum cost
-  // layout. For example, if the operand of a elementwise instruction is
-  // constained to a certain layout we want the output of the instruction to
+  // layout. For example, if the operand of an elementwise instruction is
+  // constrained to a certain layout we want the output of the instruction to
   // have the same layout.
+  //
+  // If the user is not array-shaped, we still want to propagate the layout
+  // to siblings if the instruction can't change layout. This is to represent
+  // the information that non-layout-changing instructions should have the same
+  // layout for the operands with the same ranks.
   const HloInstruction* operand = operand_constraint.operand();
   const HloInstruction* user = operand_constraint.instruction();
-  if (!ShapeUtil::IsArray(operand->shape()) ||
+  if (!ShapeUtil::IsArray(operand->shape())) {
+    return Status::OK();
+  }
+  if (instruction_can_change_layout_func_(user) &&
       !ShapeUtil::IsArray(user->shape())) {
     return Status::OK();
   }
@@ -1280,52 +1272,183 @@ Status LayoutAssignment::PropagateOperandConstraint(
                                           operand_constraint.operand_no())) {
     return Status::OK();
   }
-  TF_ASSIGN_OR_RETURN(
-      const LogicalBuffer* buffer,
-      constraints->points_to_analysis().GetBufferDefinedAt(user, /*index=*/{}));
 
-  if (constraints->BufferLayout(*buffer) == nullptr) {
-    std::unique_ptr<Layout> layout = ChooseOutputLayoutFromOperandLayout(
-        operand_constraint.shape_layout().layout(), user,
-        operand_constraint.operand_no());
-    if (layout != nullptr) {
-      TF_RETURN_IF_ERROR(
-          constraints->SetBufferLayout(*layout, *buffer, /*mandatory=*/false));
+  int64 operand_rank = ShapeUtil::Rank(operand->shape());
+  if (operand_rank <= 1) {
+    return Status::OK();
+  }
+
+  // Propagate layouts between operands of the same instruction. This is a
+  // constraint on non-layout-changing instructions.
+  if (!instruction_can_change_layout_func_(user)) {
+    // Make sure all siblings have the same layout as the operand.
+    for (int64 operand_no = 0; operand_no < user->operand_count();
+         ++operand_no) {
+      if (user->operand(operand_no) == operand) {
+        continue;
+      }
+      const HloInstruction* sibling = user->operand(operand_no);
+      const int64 sibling_rank = ShapeUtil::Rank(sibling->shape());
+      if (sibling_rank <= 1) {
+        continue;
+      }
+      if (operand_rank != sibling_rank) {
+        continue;
+      }
+      const OperandLayoutConstraint* constraint =
+          constraints->GetOperandLayoutConstraint(user, operand_no);
+      if (constraint != nullptr) {
+        // Due to the DFS of the propagation we can end up here when operand_no
+        // has a layout set that hasn't been propagated yet (is still on the
+        // stack of layouts to propagate).
+        // We can continue here and leave the operands with different layouts,
+        // as we will either:
+        // - overwrite the current operand when the DFS gets back to propagating
+        //   operand(operand_no) to its siblings
+        // - overwrite operand(operand_no)'s layout with a mandatory layout if
+        //   we continue to propagate our layout to the result, and then
+        //   backwards into all operands (if the result is an array of rank > 1)
+        continue;
+      }
+      TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
+          operand_constraint.shape_layout().layout(), user, operand_no,
+          /*mandatory=*/false));
     }
+    TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+        user->shape(),
+        [&](const Shape& subshape, const ShapeIndex& shape_index) {
+          if (ShapeUtil::IsTuple(subshape)) {
+            return Status::OK();
+          }
+          if (ShapeUtil::Rank(subshape) <= 1) {
+            return Status::OK();
+          }
+
+          // Assign the right layout to input fusion of higher rank reduce
+          // operations.
+          if (ShapeUtil::Rank(subshape) != ShapeUtil::Rank(operand->shape())) {
+            return Status::OK();
+          }
+          // TODO(b/67641796): Are there cases except fusion that use this code
+          // path?
+          TF_ASSIGN_OR_RETURN(
+              const LogicalBuffer* buffer,
+              constraints->points_to_analysis().GetBufferDefinedAt(
+                  user, shape_index));
+          // Make sure the output has the same layout as the operand.
+          const BufferLayoutConstraint* constraint =
+              constraints->GetBufferLayoutConstraint(*buffer);
+          // If we already have a constraint for the buffer it was assigned but
+          // hasn't propagated yet. This can happen with diamond-shaped graphs
+          // where one path is first evaluated in depth-first order (we're here)
+          // and the other path is propagated later. We don't set the layout
+          // here as it will always be overwritten later.
+          if (constraint == nullptr) {
+            TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
+                operand_constraint.shape_layout().layout(), *buffer,
+                /*mandatory=*/false));
+          }
+          return Status::OK();
+        }));
+    return Status::OK();
   }
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      user->shape(), [&](const Shape& subshape, const ShapeIndex& shape_index) {
+        if (ShapeUtil::IsTuple(subshape)) {
+          return Status::OK();
+        }
+        if (ShapeUtil::Rank(subshape) <= 1) {
+          return Status::OK();
+        }
+        TF_ASSIGN_OR_RETURN(
+            const LogicalBuffer* buffer,
+            constraints->points_to_analysis().GetBufferDefinedAt(user,
+                                                                 shape_index));
+        if (constraints->BufferLayout(*buffer) == nullptr ||
+            !constraints->GetBufferLayoutConstraint(*buffer)->mandatory()) {
+          std::unique_ptr<Layout> layout = ChooseOutputLayoutFromOperandLayout(
+              operand_constraint.shape_layout().layout(), user,
+              operand_constraint.operand_no());
+          if (layout != nullptr) {
+            TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
+                *layout, *buffer,
+                /*mandatory=*/user->opcode() == HloOpcode::kReduce,
+                /*dfs=*/false));
+          }
+        }
+        return Status::OK();
+      }));
   return Status::OK();
 }
 
-Status LayoutAssignment::PropagateBufferConstraint(
+Status LayoutAssignment::PropagateBufferConstraintToOperands(
     const BufferLayoutConstraint& buffer_constraint,
     LayoutConstraints* constraints) {
-  // Only propagate array layouts.
+  VLOG(5) << "PropagateBufferConstraintToOperands: "
+          << buffer_constraint.ToString();
   const LogicalBuffer& buffer = buffer_constraint.buffer();
-  if (!buffer.IsArray()) {
+
+  const HloInstruction* instruction = buffer.instruction();
+  if (IsAtMostRank1(instruction->shape())) {
     return Status::OK();
   }
 
-  // If this buffer is the result of an array-shaped op (as opposed to an array
-  // element in a tuple) try to propagate the layout to its operands.
-  if (buffer.IsTopLevel()) {
-    const HloInstruction* instruction = buffer.instruction();
-    // Propagate the def-constraint on an instruction to the use-constraints on
-    // its operands (use-def propagation).
-    for (int64 operand_no = 0; operand_no < instruction->operand_count();
-         ++operand_no) {
-      if (constraints->OperandLayout(instruction, operand_no) == nullptr &&
-          ShapeUtil::IsArray(instruction->operand(operand_no)->shape())) {
+  for (int64 operand_no = 0; operand_no < instruction->operand_count();
+       ++operand_no) {
+    const HloInstruction* operand = instruction->operand(operand_no);
+    if (IsAtMostRank1(operand->shape())) {
+      continue;
+    }
+    if (!instruction_can_change_layout_func_(instruction)) {
+      // Copy the layout to the operand.
+      if (buffer.IsArray() && ShapeUtil::IsArray(operand->shape()) &&
+          ShapeUtil::Rank(operand->shape()) ==
+              LayoutUtil::MinorToMajor(buffer_constraint.layout()).size()) {
+        TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
+            buffer_constraint.layout(), instruction, operand_no,
+            /*mandatory=*/true));
+      }
+    } else {
+      if (!buffer.IsTopLevel() ||
+          !ShapeUtil::IsArray(instruction->operand(operand_no)->shape())) {
+        continue;  // Don't touch buffers that are internal to a tuple.
+      }
+      VLOG(6) << "Propagating constraint to operand " << operand_no << " of "
+              << instruction->ToShortString();
+      // Assign a layout if there is no constraint already.
+      const OperandLayoutConstraint* constraint =
+          constraints->GetOperandLayoutConstraint(instruction, operand_no);
+      if (constraint == nullptr || !constraint->mandatory()) {
         std::unique_ptr<Layout> operand_layout =
             ChooseOperandLayoutFromOutputLayout(buffer_constraint.layout(),
                                                 instruction, operand_no);
         if (operand_layout != nullptr) {
+          // Do not propagate operand constraints of transposes and reshapes, it
+          // tends to create really bad layouts.
           TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
-              *operand_layout, instruction, operand_no, /*mandatory=*/true));
+              *operand_layout, instruction, operand_no, /*mandatory=*/false,
+              /*dfs=*/false));
         }
+      } else {
+        VLOG(6) << "Operand already has a constraint "
+                << constraint->ToString();
       }
     }
   }
-  return PropagateBufferConstraintToUses(buffer_constraint, constraints);
+  return Status::OK();
+}
+
+Status LayoutAssignment::PropagateBufferConstraint(
+    const BufferLayoutConstraint& buffer_constraint,
+    LayoutConstraints* constraints) {
+  // Only propagate array layouts.
+  const LogicalBuffer& buffer = buffer_constraint.buffer();
+  if (!buffer.IsArray()) {
+    return Status::OK();
+  }
+  TF_RETURN_IF_ERROR(
+      PropagateBufferConstraintToUses(buffer_constraint, constraints));
+  return PropagateBufferConstraintToOperands(buffer_constraint, constraints);
 }
 
 Status LayoutAssignment::PropagateBufferConstraintToUses(
@@ -1353,12 +1476,12 @@ Status LayoutAssignment::PropagateBufferConstraintToUses(
 }
 
 Status LayoutAssignment::PropagateResultConstraint(
-    const ResultLayoutConstraint& result_constraint,
+    const ResultLayoutConstraint& layout_constraint,
     LayoutConstraints* constraints) {
   // Propagate the use constraint of the root instruction up to the logical
   // buffers which make up the result.
   return PropagateUseConstraintToDefs(
-      result_constraint.shape_layout(),
+      layout_constraint.shape_layout(),
       constraints->computation()->root_instruction(), constraints);
 }
 
@@ -1536,6 +1659,10 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
     // Execute extra verification step once the layout has been finalized.
     TF_RETURN_IF_ERROR(Verify(instruction));
 
+    // Shape must be valid.
+    TF_RETURN_IF_ERROR(
+        ShapeUtil::ValidateShapeWithOptionalLayout(instruction->shape()));
+
     // Verify all layouts in the shape have been set.
     TF_RET_CHECK(LayoutUtil::HasLayout(instruction->shape()));
   }
@@ -1554,11 +1681,11 @@ Status LayoutAssignment::CalculateComputationLayout(
 
 Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) {
   // Clear existing layouts of the instructions.  All layouts must be assigned
-  // by the LayoutAssignment pass, except for those on infeeds, parameters,
-  // and the computation result. The latter two are specified in
-  // computation_layout, so we only need to keep the existing layouts for
-  // infeeds.  Clearing the layouts here avoids hiding potential bugs in the
-  // layout assignment pass that may accidentally use the existing layout.
+  // by the LayoutAssignment pass, except for those on parameters, the
+  // computation result, and a couple special cases. The former two are
+  // specified in computation_layout.  Clearing the layouts here avoids hiding
+  // potential bugs in the layout assignment pass that may accidentally use the
+  // existing layout.
   for (HloInstruction* instruction : computation->instructions()) {
     if (instruction->opcode() == HloOpcode::kBitcast) {
       // bitcasts are inherently layout sensitive and so a bitcast instruction
@@ -1567,7 +1694,9 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) {
           "Unexpected bitcast operation seen during layout assignment: %s.",
           instruction->ToString());
     }
-    if (instruction->opcode() != HloOpcode::kInfeed) {
+    // Some instructions carry mandatory layouts in their shape.
+    if (instruction->opcode() != HloOpcode::kInfeed &&
+        !IsLayoutConstrainedCustomCall(instruction)) {
       LayoutUtil::ClearLayout(instruction->mutable_shape());
     }
   }
@@ -1802,6 +1931,18 @@ StatusOr<bool> LayoutAssignment::Run(HloModule* module) {
   }
   TF_RETURN_IF_ERROR(Init());
 
+  // Verify computation layout is sane.
+  const HloComputation* entry = module->entry_computation();
+  TF_RET_CHECK(entry_computation_layout_->parameter_count() ==
+               entry->num_parameters());
+  for (int64 i = 0; i < entry->num_parameters(); ++i) {
+    TF_RET_CHECK(
+        ShapeUtil::Compatible(entry_computation_layout_->parameter_shape(i),
+                              entry->parameter_instruction(i)->shape()));
+  }
+  TF_RET_CHECK(ShapeUtil::Compatible(entry_computation_layout_->result_shape(),
+                                     entry->root_instruction()->shape()));
+
   // We do two passes. The first one we pass a nullptr ComputationLayout to
   // the RunOnComputation() calls (for non entry computations), and we register
   // the ComputationLayout which are naturally flowing in DFS fashion to the
@@ -1859,6 +2000,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
   switch (instruction->opcode()) {
     case HloOpcode::kAbs:
     case HloOpcode::kAdd:
+    case HloOpcode::kAddDependency:
     case HloOpcode::kAnd:
     case HloOpcode::kAtan2:
     case HloOpcode::kBitcastConvert:
@@ -1873,7 +2015,6 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kAllToAll:
     case HloOpcode::kCollectivePermute:
-    case HloOpcode::kCustomCall:
     case HloOpcode::kDivide:
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kDynamicUpdateSlice:
@@ -1907,6 +2048,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kRemainder:
     case HloOpcode::kReverse:
     case HloOpcode::kRoundNearestAfz:
+    case HloOpcode::kScatter:
     case HloOpcode::kSelect:
     case HloOpcode::kSelectAndScatter:
     case HloOpcode::kShiftLeft:
@@ -1930,6 +2072,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kConstant:
     case HloOpcode::kConvolution:
     case HloOpcode::kCopy:
+    case HloOpcode::kCustomCall:
     case HloOpcode::kDomain:
     case HloOpcode::kDot:
     case HloOpcode::kFusion:
@@ -1944,17 +2087,27 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kReduce:
     case HloOpcode::kReshape:
     case HloOpcode::kRng:
-    case HloOpcode::kScatter:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
     case HloOpcode::kAfterAll:
     case HloOpcode::kTrace:
     case HloOpcode::kTranspose:
     case HloOpcode::kTuple:
+    case HloOpcode::kGetDimensionSize:
       return true;
   }
 }
 
+/* static */
+bool LayoutAssignment::IsAtMostRank1(const Shape& shape) {
+  if (ShapeUtil::IsArray(shape)) {
+    return ShapeUtil::Rank(shape) <= 1;
+  }
+  return absl::c_all_of(shape.tuple_shapes(), [](const Shape& subshape) {
+    return IsAtMostRank1(subshape);
+  });
+}
+
 Status LayoutAssignment::Init() {
   computation_layouts_.clear();
   *entry_computation_layout_ = saved_entry_computation_layout_;
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 2d48e122637c080fc2bcf7bce1c2a2521f51e41f..3b081de3c7826c3c11a7d87d542835d0ecce1b7e 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -315,6 +315,10 @@ class LayoutAssignment : public HloModulePass {
   // rank as the output to have the same layout as the output.
   static bool InstructionCanChangeLayout(const HloInstruction* instruction);
 
+  // In case of an array shape returns true iff it is at most rank 1. In case of
+  // a tuple shape returns true iff all leaf shapes are at most rank 1.
+  static bool IsAtMostRank1(const Shape& shape);
+
  protected:
   // These methods, invoked by PropagateConstraints, propagate a layout
   // constraint to its neighbors (i.e. operands and users) in order to minimize
@@ -333,19 +337,6 @@ class LayoutAssignment : public HloModulePass {
       const ResultLayoutConstraint& layout_constraint,
       LayoutConstraints* constraints);
 
-  // By default LayoutAssignment ensures that inputs and outputs of CustomCalls
-  // have the "major-first" layout (i.e.  {n, n-1, ..., 0}).
-  //
-  // If this function returns true, LayoutAssignment does not set a layout for
-  // the given CustomCall.  It's up to the backend to set one in
-  // AddBackendConstraints, if necessary.
-  //
-  // Precondition: instruction->opcode() == HloOpcode::kCustomCall.
-  virtual bool CustomCallRequiresMajorFirstLayout(
-      const HloInstruction* /*instruction*/) {
-    return true;
-  }
-
   // Called after layouts of an instruction have been finalized to allow
   // subclasses to check for platform specific assumptions.
   virtual Status Verify(const HloInstruction* instruction) {
@@ -375,7 +366,7 @@ class LayoutAssignment : public HloModulePass {
   // `user` that minimizes its cost on that operand.  Returns null if it can't
   // decide the best layout.
   // Precondition: `user` and the operand are array-shaped.
-  std::unique_ptr<Layout> ChooseOutputLayoutFromOperandLayout(
+  virtual std::unique_ptr<Layout> ChooseOutputLayoutFromOperandLayout(
       const Layout& operand_layout, const HloInstruction* user,
       int64 operand_no);
 
@@ -421,6 +412,10 @@ class LayoutAssignment : public HloModulePass {
   // required for correctness.
   Status PropagateConstraints(LayoutConstraints* constraints);
 
+  Status PropagateBufferConstraintToOperands(
+      const BufferLayoutConstraint& buffer_constraint,
+      LayoutConstraints* constraints);
+
   // Check that all layouts in the module have been set and satisfy all
   // necessary conditions.
   Status CheckLayouts(HloModule* module);
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 2c549cd872b35e55cc00527b6579f79d8516b66c..5c661bfacb08fe27f3cbdc1fb9db083315166008 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -27,50 +27,70 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
-namespace op = xla::testing::opcode_matchers;
-
 namespace xla {
 namespace {
 
+namespace m = xla::match;
 using ::testing::ElementsAre;
 
-class LayoutAssignmentTest : public HloVerifiedTestBase {
+class LayoutAssignmentTest : public HloTestBase {
  protected:
-  void AssignLayouts(HloModule* module,
-                     ComputationLayout* entry_computation_layout,
+  void AssignLayouts(HloModule* m, ComputationLayout* entry_computation_layout,
                      ChannelLayoutConstraints* channel_constraints = nullptr) {
     LayoutAssignment layout_assignment(
         entry_computation_layout, LayoutAssignment::InstructionCanChangeLayout,
         /*channel_constraints=*/channel_constraints);
-    EXPECT_IS_OK(layout_assignment.Run(module).status());
+    EXPECT_IS_OK(layout_assignment.Run(m).status());
   }
 
-  std::vector<int64> LayoutOf(HloModule* module, absl::string_view name) {
+  std::vector<int64> LayoutOf(HloModule* m, absl::string_view name) {
     auto minor_to_major =
-        FindInstruction(module, name)->shape().layout().minor_to_major();
+        FindInstruction(m, name)->shape().layout().minor_to_major();
     return std::vector<int64>(minor_to_major.begin(), minor_to_major.end());
   }
+
+  void ExpectLayoutIs(const Shape& shape,
+                      absl::Span<const int64> minor_to_major) {
+    const Layout expected = LayoutUtil::MakeLayout(minor_to_major);
+    EXPECT_TRUE(LayoutUtil::Equal(shape.layout(), expected))
+        << "Expected layout " << expected << ", actual " << shape.layout();
+  }
+
+  void ExpectTupleLayoutIs(
+      const Shape& shape,
+      std::initializer_list<absl::Span<const int64>> minor_to_majors) {
+    int i = 0;
+    for (const absl::Span<const int64> minor_to_major : minor_to_majors) {
+      const Layout expected = LayoutUtil::MakeLayout(minor_to_major);
+      const Layout& actual = ShapeUtil::GetTupleElementShape(shape, i).layout();
+      EXPECT_TRUE(LayoutUtil::Equal(actual, expected))
+          << "Expected tuple element " << i << " layout " << expected
+          << ", actual " << actual;
+      ++i;
+    }
+  }
 };
 
 TEST_F(LayoutAssignmentTest, ComputationLayout) {
   // Verify the layouts of the root and parameter instructions of a computation
   // match the ComputationLayout for two different layouts.
-  std::vector<std::initializer_list<int64>> minor_to_majors = {{0, 1}, {1, 0}};
+  std::vector<std::vector<int64>> minor_to_majors = {{0, 1}, {1, 0}};
   for (auto& minor_to_major : minor_to_majors) {
     auto builder = HloComputation::Builder(TestName());
     Shape ashape = ShapeUtil::MakeShape(F32, {42, 12});
@@ -80,8 +100,8 @@ TEST_F(LayoutAssignmentTest, ComputationLayout) {
         HloInstruction::CreateParameter(1, ashape, "param1"));
     auto add = builder.AddInstruction(
         HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, param0, param1));
-    auto module = CreateNewModule();
-    HloComputation* computation = module->AddEntryComputation(builder.Build());
+    auto m = CreateNewVerifiedModule();
+    HloComputation* computation = m->AddEntryComputation(builder.Build());
 
     Layout layout = LayoutUtil::MakeLayout(minor_to_major);
     Shape shape(ashape);
@@ -92,7 +112,7 @@ TEST_F(LayoutAssignmentTest, ComputationLayout) {
     *computation_layout.mutable_parameter_layout(0) = shape_layout;
     *computation_layout.mutable_parameter_layout(1) = shape_layout;
     *computation_layout.mutable_result_layout() = shape_layout;
-    AssignLayouts(module, &computation_layout);
+    AssignLayouts(m.get(), &computation_layout);
     EXPECT_TRUE(LayoutUtil::Equal(layout, param0->shape().layout()));
     EXPECT_TRUE(LayoutUtil::Equal(layout, param1->shape().layout()));
     EXPECT_TRUE(LayoutUtil::Equal(layout, add->shape().layout()));
@@ -110,8 +130,8 @@ TEST_F(LayoutAssignmentTest, ComputationLayoutMixedLayout) {
       HloInstruction::CreateParameter(1, ashape, "param1"));
   builder.AddInstruction(
       HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, param0, param1));
-  auto module = CreateNewModule();
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   Layout col_major_layout = LayoutUtil::MakeLayout({1, 0});
   Shape col_major_shape(ashape);
@@ -128,7 +148,7 @@ TEST_F(LayoutAssignmentTest, ComputationLayoutMixedLayout) {
   *computation_layout.mutable_parameter_layout(1) = row_major;
   *computation_layout.mutable_result_layout() = col_major;
 
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
   EXPECT_TRUE(LayoutUtil::Equal(col_major_layout, param0->shape().layout()));
   EXPECT_TRUE(LayoutUtil::Equal(row_major_layout, param1->shape().layout()));
   EXPECT_TRUE(LayoutUtil::Equal(
@@ -139,7 +159,7 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) {
   // Verify that the layout of the fused parameters in a fusion instruction
   // match that of the fusion operands. Other fused instructions should have no
   // layout.
-  std::vector<std::initializer_list<int64>> minor_to_majors = {{0, 1}, {1, 0}};
+  std::vector<std::vector<int64>> minor_to_majors = {{0, 1}, {1, 0}};
   for (auto& minor_to_major : minor_to_majors) {
     auto builder = HloComputation::Builder(TestName());
     auto constant_literal1 = LiteralUtil::CreateR2WithLayout<float>(
@@ -159,8 +179,8 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) {
     auto negate2 = builder.AddInstruction(
         HloInstruction::CreateUnary(ashape, HloOpcode::kNegate, negate1));
 
-    auto module = CreateNewModule();
-    HloComputation* computation = module->AddEntryComputation(builder.Build());
+    auto m = CreateNewVerifiedModule();
+    HloComputation* computation = m->AddEntryComputation(builder.Build());
 
     auto fusion = computation->CreateFusionInstruction(
         {negate2, negate1, add}, HloInstruction::FusionKind::kLoop);
@@ -173,7 +193,7 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) {
     ComputationLayout computation_layout(computation->ComputeProgramShape());
     *computation_layout.mutable_result_layout() = shape_layout;
 
-    AssignLayouts(module, &computation_layout);
+    AssignLayouts(m.get(), &computation_layout);
 
     EXPECT_TRUE(LayoutUtil::Equal(
         layout, fusion->fused_parameter(0)->shape().layout()));
@@ -208,13 +228,13 @@ TEST_F(LayoutAssignmentTest, TupleLayout) {
   auto negate = builder.AddInstruction(HloInstruction::CreateUnary(
       constant0->shape(), HloOpcode::kNegate, get_element0));
 
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
 
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_TRUE(
       LayoutUtil::LayoutsInShapesEqual(constant0->shape(), constant1->shape()));
@@ -246,17 +266,17 @@ TEST_F(LayoutAssignmentTest, TupleSelect) {
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       tuple0->shape(), HloOpcode::kTupleSelect, pred, tuple0, tuple1));
 
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape result_shape =
       ShapeUtil::MakeTupleShape({constant0->shape(), constant1->shape()});
   TF_CHECK_OK(computation_layout.mutable_result_layout()->CopyLayoutFromShape(
       result_shape));
 
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(result_shape, select->shape()));
 }
@@ -281,11 +301,11 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
   auto nested_tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({inner_tuple, inner_tuple}));
 
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape result_shape = nested_tuple->shape();
   *ShapeUtil::GetMutableSubshape(&result_shape, /*index=*/{0, 0}) =
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0});
@@ -295,7 +315,7 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
       result_shape));
 
   LayoutAssignment layout_assignment(&computation_layout);
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   // Layout assignment should have deep copied the result of the computation to
   // address the layout conflict. This results in several Tuple() and
@@ -308,12 +328,11 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
   //  %tuple.1 = Tuple(%copy) layout=({0,1})
   //  %tuple.2 = Tuple(%tuple.0, %tuple.1) layout=(({1,0}), ({0,1}))
   //
-  EXPECT_TRUE(
-      AlgebraicSimplifier(/*is_layout_sensitive=*/true,
-                          [](const Shape&, const Shape&) { return false; })
-          .Run(module)
-          .ValueOrDie());
-  HloInstruction* root = module->entry_computation()->root_instruction();
+  AlgebraicSimplifierOptions options(
+      [](const Shape&, const Shape&) { return false; });
+  options.set_is_layout_sensitive(true);
+  EXPECT_TRUE(AlgebraicSimplifier(options).Run(m.get()).ValueOrDie());
+  HloInstruction* root = m->entry_computation()->root_instruction();
   // Verify layout of the root and the root's operands.
   EXPECT_TRUE(ShapeUtil::Equal(result_shape, root->shape()));
   EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::GetSubshape(result_shape, {0}),
@@ -323,7 +342,8 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
 
   // Verify the structure of the HLO graph.
   EXPECT_THAT(root,
-              op::Tuple(op::Tuple(constant), op::Tuple(op::Copy(constant))));
+              GmockMatch(m::Tuple(m::Tuple(m::Op().Is(constant)),
+                                  m::Tuple(m::Copy(m::Op().Is(constant))))));
 }
 
 TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
@@ -340,9 +360,8 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
   auto tanh = builder.AddInstruction(
       HloInstruction::CreateUnary(bshape, HloOpcode::kTanh, reshape));
 
-  auto module = CreateNewModule();
-  HloComputation* computation =
-      module->AddEntryComputation(builder.Build(tanh));
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = m->AddEntryComputation(builder.Build(tanh));
 
   Shape ashape_with_layout(ashape);
   Shape bshape_with_layout(bshape);
@@ -353,7 +372,7 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
   *computation_layout.mutable_parameter_layout(0) =
       ShapeLayout(ashape_with_layout);
   *computation_layout.mutable_result_layout() = ShapeLayout(bshape_with_layout);
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   auto log_minor_to_major =
       AsInt64Slice(log->shape().layout().minor_to_major());
@@ -382,8 +401,8 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndTranspose) {
       HloInstruction::CreateTranspose(bshape, log, {1, 0}));
   auto tanh = builder.AddInstruction(
       HloInstruction::CreateUnary(bshape, HloOpcode::kTanh, transpose));
-  auto module = CreateNewModule();
-  auto computation = module->AddEntryComputation(builder.Build(tanh));
+  auto m = CreateNewVerifiedModule();
+  auto computation = m->AddEntryComputation(builder.Build(tanh));
 
   Shape ashape_with_layout(ashape);
   Shape bshape_with_layout(bshape);
@@ -394,7 +413,7 @@ TEST_F(LayoutAssignmentTest, ElementwiseAndTranspose) {
   *computation_layout.mutable_parameter_layout(0) =
       ShapeLayout(ashape_with_layout);
   *computation_layout.mutable_result_layout() = ShapeLayout(bshape_with_layout);
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_TRUE(
       LayoutUtil::Equal(ashape_with_layout.layout(), log->shape().layout()));
@@ -418,9 +437,9 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) {
       HloInstruction::CreateBroadcast(bshape, param, {1, 2}));
   auto transpose = builder.AddInstruction(
       HloInstruction::CreateTranspose(cshape, broadcast, {2, 1, 0}));
-  auto module = CreateNewModule();
+  auto m = CreateNewVerifiedModule();
   HloComputation* computation =
-      module->AddEntryComputation(builder.Build(transpose));
+      m->AddEntryComputation(builder.Build(transpose));
 
   Shape input_shape_with_layout(ashape);
   Shape output_shape_with_layout(cshape);
@@ -433,7 +452,7 @@ TEST_F(LayoutAssignmentTest, BroadcastAndTranspose) {
       ShapeLayout(input_shape_with_layout);
   *computation_layout.mutable_result_layout() =
       ShapeLayout(output_shape_with_layout);
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_THAT(broadcast->shape().layout().minor_to_major(),
               ElementsAre(0, 1, 2));
@@ -467,9 +486,8 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) {
       HloInstruction::CreateBroadcast(f32_234, tanh, {1, 2}));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({transpose, broadcast2}));
-  auto module = CreateNewModule();
-  HloComputation* computation =
-      module->AddEntryComputation(builder.Build(tuple));
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = m->AddEntryComputation(builder.Build(tuple));
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
   Shape param_shape_with_layout(f32_4);
@@ -486,7 +504,7 @@ TEST_F(LayoutAssignmentTest, ReshapeOperandHasMultipleUsers) {
   *computation_layout.mutable_result_layout() =
       ShapeLayout(ShapeUtil::MakeTupleShape(
           {transpose_shape_with_layout, broadcast2_shape_with_layout}));
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   EXPECT_THAT(broadcast->shape().layout().minor_to_major(), ElementsAre(0, 1));
   EXPECT_THAT(transpose->shape().layout().minor_to_major(), ElementsAre(1, 0));
@@ -537,9 +555,8 @@ TEST_F(LayoutAssignmentTest, MakeOperandsTheSame) {
       HloInstruction::CreateConcatenate(bshape, {param0, param1}, 1));
   auto reshape = builder.AddInstruction(
       HloInstruction::CreateReshape(cshape, concatenate));
-  auto module = CreateNewModule();
-  HloComputation* computation =
-      module->AddEntryComputation(builder.Build(reshape));
+  auto m = CreateNewVerifiedModule();
+  HloComputation* computation = m->AddEntryComputation(builder.Build(reshape));
 
   Shape param0_shape_with_layout(ashape);
   Shape param1_shape_with_layout(ashape);
@@ -552,7 +569,7 @@ TEST_F(LayoutAssignmentTest, MakeOperandsTheSame) {
   *computation_layout.mutable_parameter_layout(1) =
       ShapeLayout(param1_shape_with_layout);
   OperandsMustBeTheSameLayoutAssignment layout_assignment(&computation_layout);
-  EXPECT_IS_OK(layout_assignment.Run(module).status());
+  EXPECT_IS_OK(layout_assignment.Run(m.get()).status());
 
   EXPECT_EQ(HloOpcode::kCopy, concatenate->operand(0)->opcode());
   EXPECT_THAT(concatenate->operand(0)->shape().layout().minor_to_major(),
@@ -572,11 +589,11 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastFromOperand) {
       HloInstruction::CreateParameter(0, input_shape_with_layout, "param"));
   auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(F32, {6, 7, 3, 5}), param, {2, 3, 0, 1}));
-  auto module = CreateNewModule();
+  auto m = CreateNewVerifiedModule();
   HloComputation* computation =
-      module->AddEntryComputation(builder.Build(transpose));
+      m->AddEntryComputation(builder.Build(transpose));
   ComputationLayout computation_layout(computation->ComputeProgramShape());
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
   EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(),
                                             transpose->shape(), {2, 3, 0, 1}));
 }
@@ -590,11 +607,11 @@ TEST_F(LayoutAssignmentTest, TransposeToBitcastToUser) {
       HloInstruction::CreateBroadcast(input_shape, constant, {}));
   auto transpose = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(F32, {6, 7, 3, 5}), broadcast, {2, 3, 0, 1}));
-  auto module = CreateNewModule();
+  auto m = CreateNewVerifiedModule();
   HloComputation* computation =
-      module->AddEntryComputation(builder.Build(transpose));
+      m->AddEntryComputation(builder.Build(transpose));
   ComputationLayout computation_layout(computation->ComputeProgramShape());
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
   EXPECT_TRUE(ShapeUtil::TransposeIsBitcast(transpose->operand(0)->shape(),
                                             transpose->shape(), {2, 3, 0, 1}));
 }
@@ -660,12 +677,12 @@ TEST_F(LayoutAssignmentTest, TransposeWithinFusionDoesNotCrash) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
-
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   std::unique_ptr<HloModule> compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
 
@@ -700,9 +717,10 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   ComputationLayout computation_layout(
-      module().entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShapeWithLayout(F32, {2, 2, 2}, {0, 1, 2}),
        ShapeUtil::MakeTupleShape({
@@ -714,19 +732,19 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
           param_shape));
   computation_layout.mutable_result_layout()->ResetLayout(
       LayoutUtil::MakeLayout({2, 1, 0}));
-  AssignLayouts(&module(), &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
-  EXPECT_THAT(LayoutOf(&module(), "gte0"), ElementsAre(0, 1, 2));
-  EXPECT_THAT(LayoutOf(&module(), "gte1a"), ElementsAre(1, 2, 0));
-  EXPECT_THAT(LayoutOf(&module(), "gte1b"), ElementsAre(2, 0, 1));
-  EXPECT_THAT(LayoutOf(&module(), "fresult"), ElementsAre(2, 1, 0));
-  EXPECT_THAT(FindInstruction(&module(), "gte1")
+  EXPECT_THAT(LayoutOf(m.get(), "gte0"), ElementsAre(0, 1, 2));
+  EXPECT_THAT(LayoutOf(m.get(), "gte1a"), ElementsAre(1, 2, 0));
+  EXPECT_THAT(LayoutOf(m.get(), "gte1b"), ElementsAre(2, 0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "fresult"), ElementsAre(2, 1, 0));
+  EXPECT_THAT(FindInstruction(m.get(), "gte1")
                   ->shape()
                   .tuple_shapes(0)
                   .layout()
                   .minor_to_major(),
               ElementsAre(1, 2, 0));
-  EXPECT_THAT(FindInstruction(&module(), "gte1")
+  EXPECT_THAT(FindInstruction(m.get(), "gte1")
                   ->shape()
                   .tuple_shapes(1)
                   .layout()
@@ -736,7 +754,7 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
 
 TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) {
   auto builder = HloComputation::Builder(TestName());
-  auto module = CreateNewModule();
+  auto m = CreateNewVerifiedModule();
   Shape shape = ShapeUtil::MakeShape(F32, {128, 8});
   Shape tshape = ShapeUtil::MakeTupleShape({shape, shape});
   Shape result_tshape = ShapeUtil::MakeTupleShape({shape});
@@ -763,7 +781,7 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) {
     true_builder.AddInstruction(HloInstruction::CreateTuple({add}));
   }
   HloComputation* true_computation =
-      module->AddEmbeddedComputation(true_builder.Build());
+      m->AddEmbeddedComputation(true_builder.Build());
 
   auto false_builder = HloComputation::Builder(TestName() + "_FalseBranch");
   {
@@ -779,14 +797,14 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) {
     false_builder.AddInstruction(HloInstruction::CreateTuple({infeed_data}));
   }
   HloComputation* false_computation =
-      module->AddEmbeddedComputation(false_builder.Build());
+      m->AddEmbeddedComputation(false_builder.Build());
   builder.AddInstruction(HloInstruction::CreateConditional(
       result_tshape, pred, tuple, true_computation, tuple, false_computation));
 
-  HloComputation* computation = module->AddEntryComputation(builder.Build());
+  HloComputation* computation = m->AddEntryComputation(builder.Build());
   ComputationLayout computation_layout(computation->ComputeProgramShape());
 
-  AssignLayouts(module, &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   const HloInstruction* true_root = true_computation->root_instruction();
   const HloInstruction* false_root = false_computation->root_instruction();
@@ -807,13 +825,13 @@ TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) {
           {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
   builder.AddInstruction(HloInstruction::CreateUnary(
       constant0->shape(), HloOpcode::kBitcast, constant0));
-  auto module = CreateNewModule();
-  module->AddEntryComputation(builder.Build());
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   LayoutAssignment layout_assignment(&computation_layout);
-  Status error_status = layout_assignment.Run(module).status();
+  Status error_status = layout_assignment.Run(m.get()).status();
   EXPECT_FALSE(error_status.ok());
   EXPECT_THAT(
       error_status.error_message(),
@@ -840,9 +858,10 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   ComputationLayout computation_layout(
-      module().entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})});
   TF_ASSERT_OK(
@@ -852,12 +871,12 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) {
       LayoutUtil::MakeLayout({1, 0}));
 
   ChannelLayoutConstraints channel_constraints;
-  AssignLayouts(&module(), &computation_layout, &channel_constraints);
+  AssignLayouts(m.get(), &computation_layout, &channel_constraints);
 
-  EXPECT_THAT(LayoutOf(&module(), "gte"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(&module(), "root"), ElementsAre(1, 0));
+  EXPECT_THAT(LayoutOf(m.get(), "gte"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "root"), ElementsAre(1, 0));
   EXPECT_TRUE(ShapeUtil::Equal(
-      ShapeUtil::GetSubshape(FindInstruction(&module(), "send")->shape(), {0}),
+      ShapeUtil::GetSubshape(FindInstruction(m.get(), "send")->shape(), {0}),
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0})));
 }
 
@@ -876,17 +895,17 @@ TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) {
       param = (f32[2,2]) parameter(0)
       gte = f32[2,2] get-tuple-element(param), index=0
       ar.0 = f32[2,2] cross-replica-sum(gte),
-        all_reduce_id=0, replica_groups={{0}}, to_apply=add,
+        all_reduce_id=1, replica_groups={{0}}, to_apply=add,
         sharding={maximal device=0}
       const = f32[2,2] constant(f32[2,2]{{0,1},{2,3}})
       ROOT ar.1 = f32[2,2] cross-replica-sum(const),
-        all_reduce_id=0, replica_groups={{0}}, to_apply=add,
+        all_reduce_id=1, replica_groups={{0}}, to_apply=add,
         sharding={maximal device=1}
     })";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
                           ParseAndReturnVerifiedModule(module_str));
   ComputationLayout computation_layout(
-      module->entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
   Shape param_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {0, 1})});
   TF_ASSERT_OK(
@@ -896,12 +915,12 @@ TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) {
       LayoutUtil::MakeLayout({1, 0}));
 
   ChannelLayoutConstraints channel_constraints;
-  AssignLayouts(module.get(), &computation_layout, &channel_constraints);
+  AssignLayouts(m.get(), &computation_layout, &channel_constraints);
 
-  EXPECT_THAT(LayoutOf(module.get(), "gte"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(module.get(), "ar.0"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(module.get(), "ar.1"), ElementsAre(0, 1));
-  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(LayoutOf(m.get(), "gte"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "ar.0"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "ar.1"), ElementsAre(0, 1));
+  const HloInstruction* root = m->entry_computation()->root_instruction();
   EXPECT_THAT(root->shape().layout().minor_to_major(), ElementsAre(1, 0));
 }
 
@@ -917,19 +936,22 @@ TEST_F(LayoutAssignmentTest, CopySliceOperandToAvoidImplicitLayoutChange) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
       compiled_module->entry_computation()->root_instruction();
   Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {4, 5}, {1, 0});
-  EXPECT_THAT(root, op::Add(op::Parameter(),
-                            op::Slice(AllOf(op::Copy(op::Parameter(1)),
-                                            op::ShapeWithLayout(shape_copy)))));
+  EXPECT_THAT(
+      root,
+      GmockMatch(m::Add(
+          m::Parameter(),
+          m::Slice(m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy)))));
 }
 
 TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
@@ -945,21 +967,23 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
       compiled_module->entry_computation()->root_instruction();
   Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {4, 5}, {1, 0});
   EXPECT_THAT(root,
-              op::Add(op::Parameter(),
-                      op::DynamicSlice(AllOf(op::Copy(op::Parameter(1)),
-                                             op::ShapeWithLayout(shape_copy)),
-                                       op::Parameter(2))));
+              GmockMatch(m::Add(
+                  m::Parameter(),
+                  m::DynamicSlice(
+                      m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy),
+                      m::Parameter(2)))));
 }
 
 TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) {
@@ -976,21 +1000,23 @@ TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
       compiled_module->entry_computation()->root_instruction();
   Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {3, 5}, {1, 0});
-  EXPECT_THAT(root,
-              op::Add(op::Parameter(),
-                      op::Concatenate(AllOf(op::Copy(op::Parameter(1)),
-                                            op::ShapeWithLayout(shape_copy)),
-                                      op::Parameter(2))));
+  EXPECT_THAT(
+      root,
+      GmockMatch(m::Add(
+          m::Parameter(),
+          m::Concatenate(m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy),
+                         m::Parameter(2)))));
 }
 
 TEST_F(LayoutAssignmentTest,
@@ -1007,16 +1033,18 @@ TEST_F(LayoutAssignmentTest,
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
       compiled_module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Convolution(op::Parameter(0), op::Parameter(1)));
+  EXPECT_THAT(root,
+              GmockMatch(m::Convolution(m::Parameter(0), m::Parameter(1))));
 }
 
 TEST_F(LayoutAssignmentTest, PropagatingLayoutFromResultToOperand) {
@@ -1029,18 +1057,20 @@ TEST_F(LayoutAssignmentTest, PropagatingLayoutFromResultToOperand) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   auto compiled_module =
       backend()
           .compiler()
-          ->RunHloPasses(module().Clone(), backend().default_stream_executor(),
+          ->RunHloPasses(m->Clone(), backend().default_stream_executor(),
                          /*device_allocator=*/nullptr)
           .ConsumeValueOrDie();
   HloInstruction* root =
       compiled_module->entry_computation()->root_instruction();
   Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {4, 5}, {0, 1});
-  EXPECT_THAT(root, op::Slice(AllOf(op::Copy(op::Parameter(0)),
-                                    op::ShapeWithLayout(shape_copy))));
+  EXPECT_THAT(root,
+              GmockMatch(m::Slice(
+                  m::Copy(m::Parameter(0)).WithShapeEqualTo(&shape_copy))));
 }
 
 TEST_F(LayoutAssignmentTest, TupleCopyOnLayoutMismatch) {
@@ -1086,20 +1116,241 @@ TEST_F(LayoutAssignmentTest, TupleCopyOnLayoutMismatch) {
     }
   )";
 
-  ParseAndVerifyModule(module_str);
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(module_str));
   ComputationLayout computation_layout(
-      module().entry_computation()->ComputeProgramShape());
+      m->entry_computation()->ComputeProgramShape());
 
   // Sanity check to verify that there's a layout mismatch.
-  EXPECT_THAT(LayoutOf(&module(), "ibuf"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(&module(), "next_buf"), ElementsAre(1, 0));
+  EXPECT_THAT(LayoutOf(m.get(), "ibuf"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "next_buf"), ElementsAre(1, 0));
 
-  AssignLayouts(&module(), &computation_layout);
+  AssignLayouts(m.get(), &computation_layout);
 
   // Make sure that layout assignment did not magically eliminate the mismatch,
   // in which case the test didn't prove anything.
-  EXPECT_THAT(LayoutOf(&module(), "ibuf"), ElementsAre(0, 1));
-  EXPECT_THAT(LayoutOf(&module(), "next_buf"), ElementsAre(1, 0));
+  EXPECT_THAT(LayoutOf(m.get(), "ibuf"), ElementsAre(0, 1));
+  EXPECT_THAT(LayoutOf(m.get(), "next_buf"), ElementsAre(1, 0));
+}
+
+TEST_F(LayoutAssignmentTest, CustomCallNotLayoutConstrained) {
+  const char* module_str = R"(
+HloModule CustomCallNotLayoutConstrained
+
+ENTRY %CustomCallWithNotLayoutConstrained (p: f32[42,2,3]) -> f32[1,2,3,4] {
+  %p = f32[42,2,3] parameter(0)
+  ROOT %custom-call = f32[1,2,3,4] custom-call(f32[42,2,3] %p), custom_call_target="baz"
+}
+)";
+  // Try with a couple different layouts. In each case the custom calls operand
+  // and result layout should match that of the computation.
+  {
+    TF_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<VerifiedHloModule> m,
+        ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+    ComputationLayout computation_layout = m->entry_computation_layout();
+    *computation_layout.mutable_parameter_layout(0) =
+        ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {42, 2, 3}, {0, 2, 1}));
+    *computation_layout.mutable_result_layout() = ShapeLayout(
+        ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {3, 2, 0, 1}));
+    AssignLayouts(m.get(), &computation_layout);
+
+    HloInstruction* root = m->entry_computation()->root_instruction();
+    ASSERT_THAT(root, GmockMatch(m::CustomCall(m::Parameter())));
+    ExpectLayoutIs(root->shape(), {3, 2, 0, 1});
+    ExpectLayoutIs(root->operand(0)->shape(), {0, 2, 1});
+  }
+  {
+    TF_ASSERT_OK_AND_ASSIGN(
+        std::unique_ptr<VerifiedHloModule> m,
+        ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+    ComputationLayout computation_layout = m->entry_computation_layout();
+    *computation_layout.mutable_parameter_layout(0) =
+        ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {42, 2, 3}, {0, 1, 2}));
+    *computation_layout.mutable_result_layout() = ShapeLayout(
+        ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {0, 2, 3, 1}));
+    AssignLayouts(m.get(), &computation_layout);
+
+    HloInstruction* root = m->entry_computation()->root_instruction();
+    ASSERT_THAT(root, GmockMatch(m::CustomCall(m::Parameter())));
+    ExpectLayoutIs(root->shape(), {0, 2, 3, 1});
+    ExpectLayoutIs(root->operand(0)->shape(), {0, 1, 2});
+  }
+}
+
+TEST_F(LayoutAssignmentTest, CustomCallLayoutConstrained) {
+  const char* module_str = R"(
+HloModule CustomCallLayoutConstrained
+
+ENTRY %CustomCallWithLayoutConstraints (p0: f32[4,4], p1: f32[2,3]) -> f32[1,2,3,4] {
+  %p0 = f32[4,4] parameter(0)
+  %p1 = f32[2,3] parameter(1)
+  ROOT %custom-call = f32[1,2,3,4]{3,2,0,1} custom-call(f32[4,4] %p0, f32[2,3] %p1), custom_call_target="baz", operand_layout_constraints={f32[4,4]{0,1}, f32[2,3]{1,0}}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<VerifiedHloModule> m,
+      ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+  ComputationLayout computation_layout = m->entry_computation_layout();
+  *computation_layout.mutable_parameter_layout(0) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}));
+  *computation_layout.mutable_parameter_layout(1) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0}));
+  *computation_layout.mutable_result_layout() = ShapeLayout(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3}));
+  AssignLayouts(m.get(), &computation_layout);
+
+  // The custom call should be partially encapsulated in kCopy instructions
+  // because of the layout mismatches.
+  ASSERT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Copy(m::CustomCall(m::Copy(), m::Parameter()))));
+
+  const HloInstruction* custom_call =
+      m->entry_computation()->root_instruction()->operand(0);
+  ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1});
+  ExpectLayoutIs(custom_call->operand(0)->shape(), {0, 1});
+  ExpectLayoutIs(custom_call->operand(1)->shape(), {1, 0});
+}
+
+TEST_F(LayoutAssignmentTest, CustomCallLayoutConstrainedZeroOperands) {
+  const char* module_str = R"(
+HloModule CustomCallLayoutConstrainedZeroOperands
+
+ENTRY %CustomCallLayoutConstrainedZeroOperands () -> f32[1,2,3,4] {
+  ROOT %custom-call = f32[1,2,3,4]{3,2,0,1} custom-call(), custom_call_target="baz", operand_layout_constraints={}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<VerifiedHloModule> m,
+      ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+  ComputationLayout computation_layout = m->entry_computation_layout();
+  *computation_layout.mutable_result_layout() = ShapeLayout(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3}));
+  AssignLayouts(m.get(), &computation_layout);
+
+  ASSERT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Copy(m::CustomCall())));
+
+  const HloInstruction* custom_call =
+      m->entry_computation()->root_instruction()->operand(0);
+  ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1});
+}
+
+TEST_F(LayoutAssignmentTest, CustomCallLayoutConstrainedTupleOperand) {
+  const char* module_str = R"(
+HloModule CustomCallLayoutConstrainedTupleOperand
+
+ENTRY %CustomCallLayoutConstrainedTupleOperand (p0: f32[4,4], p1: f32[2,3]) -> f32[1,2,3,4] {
+  %p0 = f32[4,4] parameter(0)
+  %p1 = f32[2,3] parameter(1)
+  %tuple = (f32[4,4], f32[2,3]) tuple(%p0, %p1)
+  ROOT %custom-call = f32[1,2,3,4]{3,2,0,1} custom-call(%tuple), custom_call_target="baz", operand_layout_constraints={(f32[4,4]{1,0}, f32[2,3]{0,1})}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<VerifiedHloModule> m,
+      ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+  ComputationLayout computation_layout = m->entry_computation_layout();
+  *computation_layout.mutable_parameter_layout(0) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}));
+  *computation_layout.mutable_parameter_layout(1) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0}));
+  *computation_layout.mutable_result_layout() = ShapeLayout(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2, 3, 4}, {2, 1, 0, 3}));
+  AssignLayouts(m.get(), &computation_layout);
+
+  HloInstruction* root = m->entry_computation()->root_instruction();
+  ExpectLayoutIs(root->shape(), {2, 1, 0, 3});
+
+  ASSERT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Copy(m::CustomCall(m::Tuple()))));
+
+  const HloInstruction* custom_call =
+      m->entry_computation()->root_instruction()->operand(0);
+  ExpectLayoutIs(custom_call->shape(), {3, 2, 0, 1});
+  ExpectTupleLayoutIs(custom_call->operand(0)->shape(), {{1, 0}, {0, 1}});
+}
+
+TEST_F(LayoutAssignmentTest, CustomCallLayoutConstrainedTupleResult) {
+  const char* module_str = R"(
+HloModule CustomCallLayoutConstrainedTupleResult
+
+ENTRY %CustomCallLayoutConstrainedTupleResult (p0: f32[4,4]) -> (f32[4,4]{1,0}, f32[2,3]{0,1}) {
+  %p0 = f32[4,4] parameter(0)
+  ROOT %custom-call = (f32[4,4]{1,0}, f32[2,3]{0,1}) custom-call(%p0), custom_call_target="baz", operand_layout_constraints={f32[4,4]{1,0}}
+}
+)";
+  // Try with a couple different layouts. In each case the custom calls operand
+  // and result layout should match that of the computation.
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<VerifiedHloModule> m,
+      ParseAndReturnVerifiedModule(module_str, GetModuleConfigForTest()));
+  ComputationLayout computation_layout = m->entry_computation_layout();
+  *computation_layout.mutable_parameter_layout(0) =
+      ShapeLayout(ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}));
+  *computation_layout.mutable_result_layout() =
+      ShapeLayout(ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShapeWithLayout(F32, {4, 4}, {1, 0}),
+           ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0})}));
+  AssignLayouts(m.get(), &computation_layout);
+
+  ExpectTupleLayoutIs(m->result_shape(), {{1, 0}, {1, 0}});
+
+  const HloInstruction* custom_call = FindInstruction(m.get(), "custom-call");
+  ExpectTupleLayoutIs(custom_call->shape(), {{1, 0}, {0, 1}});
+}
+
+Status AssignLayoutsToComputation(
+    HloModule* m, ChannelLayoutConstraints* channel_constraints = nullptr) {
+  if (!m->entry_computation_layout().result_layout().LayoutIsSet()) {
+    m->mutable_entry_computation_layout()
+        ->mutable_result_layout()
+        ->SetToDefaultLayout();
+  }
+  LayoutAssignment layout_assignment(
+      m->mutable_entry_computation_layout(),
+      LayoutAssignment::InstructionCanChangeLayout, channel_constraints);
+  return layout_assignment.Run(m).status();
+}
+
+TEST_F(LayoutAssignmentTest, OverwriteDiamondShapedConstraintsX) {
+  // Check that we handle a diamond-shaped graph correctly.
+  //      transpose
+  //       /    \
+  //     add    |
+  //       \    /
+  //        tuple
+
+  auto b = HloComputation::Builder(TestName());
+  Shape ashape = ShapeUtil::MakeShape(F32, {12, 8});
+  Shape bshape = ShapeUtil::MakeShape(F32, {8, 12});
+  auto param0 =
+      b.AddInstruction(HloInstruction::CreateParameter(0, bshape, "input"));
+  auto param1 =
+      b.AddInstruction(HloInstruction::CreateParameter(1, ashape, "input"));
+  auto transpose =
+      b.AddInstruction(HloInstruction::CreateTranspose(ashape, param0, {1, 0}));
+  auto add = b.AddInstruction(
+      HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, transpose, param1));
+  b.AddInstruction(HloInstruction::CreateTuple({add, transpose}));
+  auto m = CreateNewVerifiedModule();
+  m->AddEntryComputation(b.Build());
+  Shape ashape_major = ShapeUtil::MakeShapeWithLayout(F32, {12, 8}, {1, 0});
+  Shape ashape_minor = ShapeUtil::MakeShapeWithLayout(F32, {12, 8}, {0, 1});
+  *m->mutable_entry_computation_layout()->mutable_result_layout() =
+      ShapeLayout(ShapeUtil::MakeTupleShape({ashape_major, ashape_minor}));
+  const Layout r2_dim0major = LayoutUtil::MakeLayout({1, 0});
+  ForceParameterLayout(m.get(), 0, r2_dim0major);
+  ForceParameterLayout(m.get(), 1, r2_dim0major);
+  TF_ASSERT_OK(AssignLayoutsToComputation(m.get()));
+
+  EXPECT_THAT(add->shape().layout().minor_to_major(), ElementsAre(1, 0));
+  EXPECT_THAT(add->operand(0)->shape().layout().minor_to_major(),
+              ElementsAre(1, 0));
+  EXPECT_THAT(add->operand(1)->shape().layout().minor_to_major(),
+              ElementsAre(1, 0));
+
+  EXPECT_THAT(transpose->shape().layout().minor_to_major(), ElementsAre(0, 1));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
index b17c9d504501a907e27d5152e0082799e87443c7..382b575120277ffb0e63e693757591681a78479e 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.cc
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -21,8 +21,25 @@ limitations under the License.
 #endif
 
 namespace xla {
+Status LLVMCompiler::RunHloPassesOnModuleGroup(
+    HloModuleGroup* module_group,
+    absl::Span<se::StreamExecutor* const> executors,
+    DeviceMemoryAllocator* device_allocator) {
+  return Unimplemented(
+      "Model partitioning not implemented for the CPU/GPU compilers!");
+}
+
+StatusOr<std::vector<std::unique_ptr<Executable>>>
+LLVMCompiler::RunBackendOnModuleGroup(
+    std::unique_ptr<HloModuleGroup> module_group,
+    std::vector<std::vector<se::StreamExecutor*>> stream_exec,
+    DeviceMemoryAllocator* device_allocator) {
+  return Unimplemented(
+      "Model partitioning not implemented for the CPU/GPU compilers!");
+}
+
 StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
-    std::vector<std::unique_ptr<HloModule>> modules,
+    std::unique_ptr<HloModuleGroup> module_group,
     std::vector<std::vector<se::StreamExecutor*>> stream_execs,
     DeviceMemoryAllocator* device_allocator) {
   // Tensorflow tries to enable the following behaviors in all its threads:
@@ -38,6 +55,8 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
   tensorflow::port::ScopedDontFlushDenormal dont_flush_denormals;
 
   std::vector<std::unique_ptr<Executable>> result;
+  std::vector<std::unique_ptr<HloModule>> modules =
+      module_group->ConsumeModules();
   for (size_t i = 0; i < modules.size(); i++) {
     if (stream_execs[i].size() != 1) {
       return Unimplemented(
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index f1c623508c5307f2b1c036d3ec6823b75c7eda13..182d8edbe30da292f28aeab53be646ce6651839f 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -69,8 +69,18 @@ class LLVMCompiler : public Compiler {
   using Compiler::RunBackend;
   using Compiler::RunHloPasses;
 
+  Status RunHloPassesOnModuleGroup(
+      HloModuleGroup* module_group,
+      absl::Span<se::StreamExecutor* const> executors,
+      DeviceMemoryAllocator* device_allocator) override;
+
+  StatusOr<std::vector<std::unique_ptr<Executable>>> RunBackendOnModuleGroup(
+      std::unique_ptr<HloModuleGroup> module_group,
+      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
+      DeviceMemoryAllocator* device_allocator) override;
+
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> modules,
+      std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_execs,
       DeviceMemoryAllocator* device_allocator) override;
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 6223a34b1258961944a3ac64cd10876d1272c94e..728a66b388f0f9af480ff88b5e96990a26e36af5 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -72,6 +72,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:name_uniquer",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
@@ -168,6 +169,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:elemental_ir_emitter",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
     ],
@@ -196,15 +198,17 @@ cc_library(
     hdrs = ["sort_util.h"],
     deps = [
         ":ir_array",
+        ":kernel_support_library",
         ":llvm_loop",
         ":llvm_util",
         ":loop_emitter",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service/gpu:parallel_loop_emitter",
         "//tensorflow/compiler/xla/service/gpu:partition_assignment",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
         "@llvm//:core",
         "@llvm//:support",
     ],
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
index cc2e862f2eb9a49099c5f90efe1b29fb77c8f106..4d7f36d9f8b565a819edf0631efc5c7a58c4f87f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -130,7 +130,8 @@ Status EmitDynamicUpdateSliceInPlace(absl::Span<const IrArray> operand_arrays,
 //
 // Emits a sequential loop if launch_dimensions is null.
 static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
-    HloInstruction* fusion, absl::Span<const IrArray> fusion_operand_arrays,
+    HloInstruction* fusion,
+    GeneratorForOperandIrArrays operand_arrays_generator,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
     const gpu::LaunchDimensions* launch_dimensions, llvm::IRBuilder<>* b) {
   CHECK_EQ(fusion->opcode(), HloOpcode::kFusion);
@@ -160,7 +161,8 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
       LayoutUtil::CopyLayoutBetweenShapes(fusion->shape(), &update_shape));
 
   // Create element generators for update and start_indices.
-  FusedIrEmitter fused_emitter(fusion_operand_arrays, elemental_emitter);
+  FusedIrEmitter fused_emitter(std::move(operand_arrays_generator),
+                               elemental_emitter);
   TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter));
   ElementGenerator update_array_generator = fused_emitter.GetGenerator(update);
   ElementGenerator start_indices_generator =
@@ -173,21 +175,24 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
 }
 
 Status EmitFusedDynamicUpdateSliceInPlace(
-    HloInstruction* fusion, absl::Span<const IrArray> fusion_operand_arrays,
+    HloInstruction* fusion,
+    GeneratorForOperandIrArrays operand_arrays_generator,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
     llvm::IRBuilder<>* b) {
   return EmitFusedDynamicUpdateSliceInPlaceImpl(
-      fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
+      fusion, std::move(operand_arrays_generator), fusion_output_array,
+      elemental_emitter,
       /*launch_dimensions=*/nullptr, b);
 }
 
 Status EmitParallelFusedDynamicUpdateSliceInPlace(
-    HloInstruction* fusion, absl::Span<const IrArray> fusion_operand_arrays,
+    HloInstruction* fusion,
+    GeneratorForOperandIrArrays operand_arrays_generator,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
     const gpu::LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b) {
   return EmitFusedDynamicUpdateSliceInPlaceImpl(
-      fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
-      &launch_dimensions, b);
+      fusion, std::move(operand_arrays_generator), fusion_output_array,
+      elemental_emitter, &launch_dimensions, b);
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
index fb3e4eb97cae06f2a0c87dd7118b8332048df56e..7fe803d1f8da5251c99f0a8fd97f99e9ca031175 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
@@ -27,6 +27,9 @@ limitations under the License.
 namespace xla {
 namespace llvm_ir {
 
+using GeneratorForOperandIrArrays =
+    std::function<std::vector<llvm_ir::IrArray>()>;
+
 // Checks if we can emit code for the given DynamicUpdateSlice node that updates
 // its input in place.  Returns true if the dynamic-update-slice's
 // array-to-be-updated and output share the same BufferAllocation::Slice.
@@ -73,14 +76,16 @@ Status EmitDynamicUpdateSliceInPlace(absl::Span<const IrArray> operand_arrays,
 // (sequential) code for a fusion node that does the dynamic-update-slice in
 // place.
 Status EmitFusedDynamicUpdateSliceInPlace(
-    HloInstruction* fusion, absl::Span<const IrArray> fusion_operand_arrays,
+    HloInstruction* fusion,
+    GeneratorForOperandIrArrays operand_arrays_generator,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
     llvm::IRBuilder<>* b);
 
 // Same as EmitFusedDynamicUpdateSliceInPlace, except emits a parallel loop with
 // the given launch dimensions.
 Status EmitParallelFusedDynamicUpdateSliceInPlace(
-    HloInstruction* fusion, absl::Span<const IrArray> fusion_operand_arrays,
+    HloInstruction* fusion,
+    GeneratorForOperandIrArrays operand_arrays_generator,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
     const gpu::LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b);
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index b606c993a2d58a6d177af10de7b214de130c2279..38f2b5da23a7b92e4547dceaba011ce654977da3 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -33,7 +33,7 @@ namespace xla {
 using llvm_ir::IrArray;
 
 Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
-  generators_[hlo] =
+  indexed_generators_[hlo] =
       [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
     if (generated_value_cache_[hlo].count(index.multidim()) > 0) {
       llvm::Value* generated_value =
@@ -63,25 +63,26 @@ Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
               << llvm_ir::AsString(b_->GetInsertBlock()->getName()) << ").";
     }
 
-    TF_ASSIGN_OR_RETURN(
-        generated_value_cache_[hlo][index.multidim()],
-        elemental_emitter_->MakeElementGenerator(hlo, generators_)(index));
+    TF_ASSIGN_OR_RETURN(generated_value_cache_[hlo][index.multidim()],
+                        elemental_emitter_->MakeElementGenerator(
+                            hlo, indexed_generators_)(index));
     return generated_value_cache_[hlo][index.multidim()];
   };
   return Status::OK();
 }
 
 Status FusedIrEmitter::HandleConstant(HloInstruction* constant) {
-  const Literal& literal = constant->literal();
-  llvm::Constant* initializer =
-      llvm_ir::ConvertLiteralToIrConstant(literal, module_);
-  llvm::GlobalVariable* global = new llvm::GlobalVariable(
-      *b_->GetInsertBlock()->getModule(), initializer->getType(),
-      /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, initializer,
-      /*Name=*/"");
-  llvm::Constant* shape_constant = llvm::ConstantExpr::getBitCast(
-      global, llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo());
-  generators_[constant] = [=](const IrArray::Index& index) {
+  indexed_generators_[constant] = [=](const IrArray::Index& index) {
+    const Literal& literal = constant->literal();
+    llvm::Constant* initializer =
+        llvm_ir::ConvertLiteralToIrConstant(literal, module_);
+    llvm::GlobalVariable* global = new llvm::GlobalVariable(
+        *b_->GetInsertBlock()->getModule(), initializer->getType(),
+        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, initializer,
+        /*Name=*/"");
+    llvm::Constant* shape_constant = llvm::ConstantExpr::getBitCast(
+        global,
+        llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo());
     return IrArray(shape_constant, constant->shape())
         .EmitReadArrayElement(index, b_);
   };
@@ -91,34 +92,47 @@ Status FusedIrEmitter::HandleConstant(HloInstruction* constant) {
 
 Status FusedIrEmitter::HandleGetTupleElement(
     HloInstruction* get_tuple_element) {
-  // Lookup ir value for 'operand'.
-  auto operand = get_tuple_element->operand(0);
-  auto it = gte_values_.find(operand);
-  if (it == gte_values_.end()) {
-    return Unimplemented(
-        "GetTupleElement fusion currently only supports"
-        " parameter operands, but found operand: %s",
-        operand->name());
-  }
-  // Emit code to lookup tuple element pointer, and store it in 'gte_values_'.
-  llvm::Value* tuple_element_ptr = llvm_ir::EmitGetTupleElement(
-      get_tuple_element->shape(), get_tuple_element->tuple_index(),
-      /*alignment=*/1, it->second, b_, module_);
-  gte_values_.insert(std::make_pair(get_tuple_element, tuple_element_ptr));
-  // Emit code to read base tuple element array (if non-tuple shaped).
+  auto emit_tuple_element_ptr = [=]() -> StatusOr<llvm::Value*> {
+    const HloInstruction* tuple_operand = get_tuple_element->operand(0);
+    llvm::Value* tuple_ptr;
+    if (tuple_operand->opcode() == HloOpcode::kGetTupleElement) {
+      TF_ASSIGN_OR_RETURN(tuple_ptr, non_indexed_generators_[tuple_operand]());
+    } else {
+      if (tuple_operand->opcode() != HloOpcode::kParameter) {
+        return Unimplemented(
+            "GetTupleElement fusion currently only supports parameter or "
+            "nested"
+            "GetTupleElement as tuple operand, found an exception: %s",
+            tuple_operand->name());
+      }
+      tuple_ptr =
+          GetBasePointerForFusedParameter(tuple_operand->parameter_number());
+    }
+
+    // Lookup tuple element pointer.
+    return llvm_ir::EmitGetTupleElement(
+        get_tuple_element->shape(), get_tuple_element->tuple_index(),
+        /*alignment=*/1, tuple_ptr, b_, module_);
+  };
+
   if (!ShapeUtil::IsTuple(get_tuple_element->shape())) {
-    generators_[get_tuple_element] =
+    indexed_generators_[get_tuple_element] =
         [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
       // TODO(b/34080002) Add aliasing information to tuple element IrArray.
+      TF_ASSIGN_OR_RETURN(llvm::Value * tuple_element_ptr,
+                          emit_tuple_element_ptr());
       return IrArray(tuple_element_ptr, get_tuple_element->shape())
           .EmitReadArrayElement(index, b_);
     };
+  } else {
+    non_indexed_generators_[get_tuple_element] = emit_tuple_element_ptr;
   }
   return Status::OK();
 }
 
 Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) {
-  generators_[parameter] = [=](const IrArray::Index& index) -> llvm::Value* {
+  indexed_generators_[parameter] =
+      [=](const IrArray::Index& index) -> llvm::Value* {
     if (tiled_parameter_info_) {
       if (llvm::Value* param_tile_buffer =
               tiled_parameter_info_->GetBufferForParameter(
@@ -135,14 +149,9 @@ Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) {
             "tiled_buffer");
       }
     }
-    return parameter_arrays_[parameter->parameter_number()]
+    return GetIrArrayForFusedParameter(parameter->parameter_number())
         .EmitReadArrayElement(index, b_);
   };
-  // Store ir value for fusion operand associated with fusion parameter to be
-  // accessed by subsequent fused GetTupleElement instructions.
-  gte_values_.insert(std::make_pair(
-      parameter,
-      parameter_arrays_[parameter->parameter_number()].GetBasePointer()));
   return Status::OK();
 }
 
@@ -153,12 +162,13 @@ Status FusedIrEmitter::HandleTuple(HloInstruction* tuple) {
     operand_elemental_ir_types.push_back(llvm_ir::PrimitiveTypeToIrType(
         operand->shape().element_type(), module_));
   }
-  generators_[tuple] =
+  indexed_generators_[tuple] =
       [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
     llvm::Value* ret = llvm::UndefValue::get(
         llvm::StructType::get(b_->getContext(), operand_elemental_ir_types));
     for (size_t i = 0; i < ShapeUtil::TupleElementCount(tuple->shape()); ++i) {
-      TF_ASSIGN_OR_RETURN(llvm::Value * val_i, generators_[operands[i]](index));
+      TF_ASSIGN_OR_RETURN(llvm::Value * val_i,
+                          indexed_generators_[operands[i]](index));
       ret = b_->CreateInsertValue(ret, val_i, i);
     }
     return ret;
@@ -171,15 +181,15 @@ Status FusedIrEmitter::FinishVisit(HloInstruction* root) {
   return Status::OK();
 }
 
-FusedIrEmitter::Generator FusedIrEmitter::GetRootGenerator() const {
+FusedIrEmitter::IndexedGenerator FusedIrEmitter::GetRootGenerator() const {
   CHECK_NE(nullptr, fused_root_)
       << "GetRootGenerator should be called after Accept.";
-  return generators_.at(fused_root_);
+  return indexed_generators_.at(fused_root_);
 }
 
-FusedIrEmitter::Generator FusedIrEmitter::GetGenerator(
+FusedIrEmitter::IndexedGenerator FusedIrEmitter::GetGenerator(
     const HloInstruction* instruction) const {
-  return generators_.at(instruction);
+  return indexed_generators_.at(instruction);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index 44d21fa750a532633f46614002d59c90fc0b5d40..1b9c61f6700e2a1309b21e499f4a9e2439ed3702 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <map>
 #include <unordered_map>
 
+#include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
@@ -52,11 +53,15 @@ namespace xla {
 // same length.
 class FusedIrEmitter : public DfsHloVisitorWithDefault {
  public:
-  using Generator = llvm_ir::ElementGenerator;
+  using IndexedGenerator = llvm_ir::ElementGenerator;
+  using NonIndexedGenerator = std::function<StatusOr<llvm::Value*>()>;
+  using GeneratorForOperandIrArrays =
+      std::function<std::vector<llvm_ir::IrArray>()>;
 
-  FusedIrEmitter(absl::Span<const llvm_ir::IrArray> parameter_arrays,
+  FusedIrEmitter(GeneratorForOperandIrArrays operand_arrays_generator,
                  ElementalIrEmitter* elemental_emitter)
-      : parameter_arrays_(parameter_arrays),
+      : operand_arrays_(),
+        operand_arrays_generator_(std::move(operand_arrays_generator)),
         tiled_parameter_info_(nullptr),
         elemental_emitter_(elemental_emitter),
         b_(elemental_emitter->b()),
@@ -76,25 +81,34 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
   Status FinishVisit(HloInstruction* root) override;
 
   // Returns the generator function for the root of the fused computation.
-  Generator GetRootGenerator() const;
+  IndexedGenerator GetRootGenerator() const;
 
   // Returns the generator function for the given instruction.
-  Generator GetGenerator(const HloInstruction* instruction) const;
-
-  // Returns the ir value for instruction 'hlo'.
-  llvm::Value* GetIrValueForGTE(const HloInstruction* hlo) const {
-    auto it = gte_values_.find(hlo);
-    CHECK(it != gte_values_.end());
-    return it->second;
-  }
+  IndexedGenerator GetGenerator(const HloInstruction* instruction) const;
 
   void SetTiledParameterInfo(const llvm_ir::TiledParameterInfo* info) {
     tiled_parameter_info_ = info;
   }
 
+ protected:
+  // Returns the IrArrays for the fusion instruction operands.
+  llvm_ir::IrArray& GetIrArrayForFusedParameter(int64 parameter_number) {
+    if (!operand_arrays_.has_value()) {
+      operand_arrays_ = operand_arrays_generator_();
+    }
+    return operand_arrays_.value()[parameter_number];
+  }
+
+  llvm::Value* GetBasePointerForFusedParameter(int64 parameter_number) {
+    return GetIrArrayForFusedParameter(parameter_number).GetBasePointer();
+  }
+
  private:
-  // Arrays of parameters of fusion instruction
-  absl::Span<const llvm_ir::IrArray> parameter_arrays_;
+  // IrArrays for the fusion instruction operands, whose base addresses are the
+  // base address of the corresponding parameters in the fused computation.
+  absl::optional<std::vector<llvm_ir::IrArray>> operand_arrays_;
+  GeneratorForOperandIrArrays operand_arrays_generator_;
+
   const llvm_ir::TiledParameterInfo* tiled_parameter_info_;
 
   ElementalIrEmitter* elemental_emitter_;
@@ -106,19 +120,23 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
   llvm::IRBuilder<>* b_;
   llvm::Module* module_;
 
-  // Map from instruction pointers to functions to generate elements of their
-  // outputs
-  std::unordered_map<const HloInstruction*, Generator> generators_;
+  // Map from instructions to functions that generate code for the output
+  // elements. If an instruction is a GetTupleElement instruction, the
+  // instruction produces non-tuple result.
+  std::unordered_map<const HloInstruction*, IndexedGenerator>
+      indexed_generators_;
+
+  // Map from tuple-result-producing GetTupleELement instructions to functions
+  // that generate the base pointers for the output elements. This is used to
+  // support the translation of nested GetTupleElement instructions.
+  std::unordered_map<const HloInstruction*, NonIndexedGenerator>
+      non_indexed_generators_;
 
   // Cache of generated values, lest we regenerate an element of a node with
   // multiple outgoing edges
   std::unordered_map<const HloInstruction*,
                      std::map<std::vector<llvm::Value*>, llvm::Value*>>
       generated_value_cache_;
-
-  // Stores ir values required to emit fused (and possibly nested)
-  // GetTupleElement instructions.
-  std::unordered_map<const HloInstruction*, llvm::Value*> gte_values_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index f4b05f29c38529b3cce81b4c8ee6fae5c00cafcc..d6d84994ee147f4b8c1a333b0eaccdf6e0a2219b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
@@ -108,6 +109,14 @@ class IrArray {
     Index(absl::Span<llvm::Value* const> multidim, llvm::Value* linear,
           const Shape& shape);
 
+    // Returns an index that adds `addend` to the given `dim` of the object.
+    Index AddOffsetToDim(llvm::Value* addend, int64 dim,
+                         llvm::IRBuilder<>* b) const {
+      IrArray::Index index = *this;
+      index[dim] = b->CreateAdd(index[dim], addend);
+      return index;
+    }
+
     const std::vector<llvm::Value*>& multidim() const { return multidim_; }
     llvm::Value* linear() const { return linear_; }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index e5fbdbd51b8a9aa14decadedd1eeb3bdbf831738..c26711e526c9b89cdedcb6aed9f93d41dd25dc83 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -52,6 +52,29 @@ Shape MergeDimensions(absl::Span<const size_t> segs, const Shape& shape) {
   return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
                                                   dimensions);
 }
+
+// Given an index for a shape, return the equivalent new index if the shape is
+// reshaped to another shape.
+IrArray::Index GetReshapedIndex(const IrArray::Index& index, const Shape& shape,
+                                const Shape& reshaped_shape,
+                                llvm::IRBuilder<>* b) {
+  auto bounds = shape.dimensions();
+  auto minor_to_major = shape.layout().minor_to_major();
+  llvm::Value* linear_index = index.GetConstantWithIndexType(0);
+  int64 multiplier = 1;
+  for (int i = 0; i < index.size(); ++i) {
+    int64 dim = minor_to_major[i];
+    llvm::Value* addend = b->CreateMul(
+        index[dim], index.GetConstantWithIndexType(multiplier), "linearizing",
+        /*HasNUW=*/true, /*HasNSW=*/true);
+    linear_index = b->CreateAdd(linear_index, addend, "",
+                                /*HasNUW=*/true, /*HasNSW=*/true);
+    multiplier *= bounds[dim];
+  }
+
+  return IrArray::Index(linear_index, reshaped_shape, b);
+}
+
 }  // namespace
 
 absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
@@ -60,28 +83,30 @@ absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
     return absl::nullopt;
   }
 
-  std::vector<int64> perm(a.dimensions().size());
-  {
-    auto layout_a_orig = LayoutUtil::MinorToMajor(a);
-    std::vector<int64> layout_a(layout_a_orig.rbegin(), layout_a_orig.rend());
-    auto layout_b_orig = LayoutUtil::MinorToMajor(b);
-    std::vector<int64> layout_b(layout_b_orig.rbegin(), layout_b_orig.rend());
-    for (size_t i = 0; i < perm.size(); ++i) {
-      perm[i] = PositionInContainer(layout_b, layout_a[i]);
-    }
+  std::vector<int64> permutation(a.dimensions().size());
+  absl::Span<const int64> minor_to_major_a = LayoutUtil::MinorToMajor(a);
+  std::vector<int64> major_to_minor_a(minor_to_major_a.rbegin(),
+                                      minor_to_major_a.rend());
+  absl::Span<const int64> minor_to_major_b = LayoutUtil::MinorToMajor(b);
+  std::vector<int64> major_to_minor_b(minor_to_major_b.rbegin(),
+                                      minor_to_major_b.rend());
+  for (size_t i = 0; i < permutation.size(); ++i) {
+    permutation[i] = PositionInContainer(major_to_minor_b, major_to_minor_a[i]);
   }
-  auto segs = ConsecutiveSegments(perm);
-  if ((3 == segs.size() && 0 == perm[0]) || 2 == segs.size()) {
-    Shape norm_a =
+
+  std::vector<size_t> segments = ConsecutiveSegments(permutation);
+  if ((3 == segments.size() && 0 == permutation[0]) || 2 == segments.size()) {
+    Shape descending_layout_shape =
         ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a);
-    Shape reduced_a = MergeDimensions(segs, norm_a);
-    auto reduced_a_dims = reduced_a.dimensions();
+    Shape normalized_shape = MergeDimensions(segments, descending_layout_shape);
+    absl::Span<const int64> normalized_dims =
+        AsInt64Slice(normalized_shape.dimensions());
     std::vector<int64> dims_021;
-    if (2 == segs.size()) {
+    if (2 == segments.size()) {
       // The logical component-0 is of size one.
-      dims_021 = {1, reduced_a_dims[1], reduced_a_dims[0]};
+      dims_021 = {1, normalized_dims[1], normalized_dims[0]};
     } else {
-      dims_021 = {reduced_a_dims[0], reduced_a_dims[2], reduced_a_dims[1]};
+      dims_021 = {normalized_dims[0], normalized_dims[2], normalized_dims[1]};
     }
 
     return dims_021;
@@ -90,27 +115,117 @@ absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
   return absl::nullopt;
 }
 
-IrArray::Index GetUnreducedOutputIndex(
-    const IrArray::Index& reduced_output_index,
-    const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
-    llvm::IRBuilder<>* b) {
-  auto bounds = reduced_output_shape.dimensions();
-  auto minor_to_major = reduced_output_shape.layout().minor_to_major();
-  llvm::Value* linear_index = reduced_output_index.GetConstantWithIndexType(0);
-  int64 multiplier = 1;
-  for (int i = 0; i < reduced_output_index.size(); ++i) {
-    int64 dim = minor_to_major[i];
-    llvm::Value* addend =
-        b->CreateMul(reduced_output_index[dim],
-                     reduced_output_index.GetConstantWithIndexType(multiplier),
-                     "linearizing",
-                     /*HasNUW=*/true, /*HasNSW=*/true);
-    linear_index = b->CreateAdd(linear_index, addend, "",
-                                /*HasNUW=*/true, /*HasNSW=*/true);
-    multiplier *= bounds[dim];
+KernelMappingScheme::KernelMappingScheme(
+    absl::Span<const int64> dims_in_elems, int64 tile_size_y, int64 tile_size_x,
+    absl::Span<const int64> req_block_sizes, int64 num_threads_y,
+    int64 num_threads_x, llvm::IRBuilder<>* b)
+    : b_(b),
+      dims_in_elems_(dims_in_elems),
+      tile_sizes_{1, tile_size_y, tile_size_x},
+      num_threads_x_(num_threads_x),
+      num_threads_y_(num_threads_y) {
+  DCHECK_EQ(dims_in_elems_.size(), 3);
+  DCHECK_EQ(req_block_sizes.size(), 3);
+
+  DCHECK_EQ(tile_size_y % num_threads_y_, 0);
+  DCHECK_EQ(tile_size_x % num_threads_x_, 0);
+
+  dims_in_tiles_ = ElementWiseCeilOfRatio<int64>(dims_in_elems_, tile_sizes_);
+  block_sizes_.reserve(req_block_sizes.size());
+  absl::c_transform(req_block_sizes, dims_in_tiles_,
+                    std::back_inserter(block_sizes_),
+                    [](const int64 requested_size, const int64 max_size) {
+                      return std::min(requested_size, max_size);
+                    });
+  dims_in_blocks_ = ElementWiseCeilOfRatio<int64>(dims_in_tiles_, block_sizes_);
+
+  VLOG(10) << "dims_in_elems_ = [" << absl::StrJoin(dims_in_elems_, ",") << "]";
+  VLOG(10) << "dims_in_tiles_ = [" << absl::StrJoin(dims_in_tiles_, ",") << "]";
+  VLOG(10) << "dims_in_blocks_ = [" << absl::StrJoin(dims_in_blocks_, ",")
+           << "]";
+}
+
+IrArray::Index KernelMappingScheme::GetUnnormalizedIndex(
+    const IrArray::Index& normalized_shape_index,
+    const Shape& unnormalized_shape) {
+  DCHECK_EQ(normalized_shape_index.size(), dims_in_elems_.size());
+  Shape output_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+      unnormalized_shape.element_type(), GetDimensionsInElements());
+  return GetReshapedIndex(normalized_shape_index, output_shape,
+                          unnormalized_shape, b_);
+}
+
+IrArray::Index KernelMappingScheme::EmitBlockIndex(llvm::Type* index_ty) {
+  llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
+      llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_);
+  llvm_ir::AddRangeMetadata(0, GetNumberOfBlocks(),
+                            llvm::cast<llvm::Instruction>(block_id));
+  llvm::Value* linear_block_id =
+      b_->CreateIntCast(block_id, index_ty, /*isSigned=*/true, "block.id.x");
+  return IrArray::Index(linear_block_id,
+                        ShapeUtil::MakeShapeWithDescendingLayout(
+                            PRED /*arbitrary*/, dims_in_blocks_),
+                        b_);
+}
+
+IrArray::Index KernelMappingScheme::GetTileIndexForBlockOrigin(
+    const IrArray::Index& block_index) {
+  IrArray::Index tile_index = block_index;
+  for (int i = 0; i < block_sizes_.size(); ++i) {
+    tile_index[i] = b_->CreateMul(
+        block_index[i],
+        llvm::ConstantInt::get(block_index[i]->getType(), block_sizes_[i]),
+        "block_origin." + std::to_string(i));
+  }
+  return tile_index;
+}
+
+IrArray::Index KernelMappingScheme::GetElementIndexForTileOrigin(
+    const IrArray::Index& tile_index) {
+  IrArray::Index elem_index = tile_index;
+  for (int i = DimY; i < DimTot; ++i) {
+    elem_index[i] =
+        b_->CreateMul(tile_index[i],
+                      llvm::ConstantInt::get(tile_index[i]->getType(),
+                                             GetTileSizeForDimension(i)),
+                      "tile_origin." + std::to_string(i));
   }
+  return elem_index;
+}
+
+llvm::GlobalVariable* KernelMappingScheme::GetSharedMemoryBufferForElementType(
+    llvm::Type* elem_ty, absl::string_view buffer_name) {
+  // If shared memory tranpose is needed, we use square tiles.
+  CHECK_EQ(GetTileSizeForDimensionX(), GetTileSizeForDimensionY());
+
+  // For Nvidia GPUs, the warp size is 32 threads and the shared memory bank is
+  // organized into 32-way. We usually use the warp size or a multiplier or a
+  // the warp size as the size for tiling. This may cause all elements in the
+  // same column of a tile use the same memory bank and therefore shared memory
+  // bank conflicts. Adding 1 to the minor dimension of the shared memory buffer
+  // can reduce such shared memory bank conflicts.
+  llvm::Type* buffer_type = llvm::ArrayType::get(
+      llvm::ArrayType::get(elem_ty, GetTileSizeForDimension(DimX) + 1),
+      GetTileSizeForDimension(DimY));
+  return llvm_ir::AllocateSharedMemoryTile(b_->GetInsertBlock()->getModule(),
+                                           buffer_type, buffer_name);
+}
 
-  return IrArray::Index(linear_index, unreduced_output_shape, b);
+std::tuple<llvm::Value*, llvm::Value*>
+KernelMappingScheme::EmitThreadYXCoordinate(llvm::Type* index_ty) {
+  // Calculate (y, x) coordinate of the thread in the 2D view of thread block
+  // defined by (num_thread_y, num_thread_x) from thread_id.
+  llvm::CallInst* thread_id_raw = llvm_ir::EmitCallToIntrinsic(
+      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_);
+  llvm_ir::AddRangeMetadata(0, GetThreadsPerTile(), thread_id_raw);
+  llvm::Value* thread_id_int =
+      b_->CreateIntCast(thread_id_raw, index_ty,
+                        /*isSigned=*/true, "thread.id.x");
+  llvm::Value* num_thread_x =
+      llvm::ConstantInt::get(index_ty, GetNumberOfThreadsForDimensionX());
+  llvm::Value* x = b_->CreateURem(thread_id_int, num_thread_x);
+  llvm::Value* y = b_->CreateUDiv(thread_id_int, num_thread_x);
+  return std::make_tuple(y, x);
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index 5ea05b3188a1c0881e4c0c41625d530aff1b1205..06002d57b0d7daa07f903feebe67a60a083c0e7c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -28,23 +28,160 @@ namespace llvm_ir {
 // If a shape can be viewed as three logical components 0-1-2 in the order of
 // major to minor, a 0-2-1-transpose changes the order of such logical
 // components to 0-2-1. We call the shape being transposed the input shape and
-// the transposed shape the output shape. The logical view of the input and
-// output shapes for the transpose are called the 0-1-2 shape or reduced input
-// shape and the 0-2-1 shape or the reduced output shape respectively. The
-// original input and output shapes are called the unreduced input and output
-// shapes.
-
+// the transposed shape the output shape. The logical view of the input/output
+// shapes for the transpose are called the 0-1-2/0-2-1 shapes or the normalized
+// shapes. The original input/output shapes are called unnormalized shapes.
+//
 // If `b` is a 0-2-1 transpose of `a` in 0-1-2, return the dimensions for the
-// reduced shape of `b` or the 0-2-1 shape.
+// normalized shape of `b` or the 0-2-1 shape.
 absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
                                                      const Shape& b);
 
-// Return the unreduced output index corresponding to the given reduced output
-// index.
-IrArray::Index GetUnreducedOutputIndex(
-    const IrArray::Index& reduced_output_index,
-    const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
-    llvm::IRBuilder<>* b);
+// A tile is a spatial subdivision of a tensor. We group tensor elements into
+// tiles so that we can launch kernels to process the tensor elements in blocks
+// of tiles.
+//
+// A kernel mapping scheme describes a method to partition the tensors accessed
+// by an unnested HLO instruction into tiles and blocks of tiles, and the
+// associated information to use hardware threads to process the tensor elements
+// in blocks of tiles.
+//
+// Currently, there are two main use cases for a tiling scheme. First, we
+// implement kernels with 0-2-1 memory transpose using shared memory to improve
+// memory access pattern. Second, we implement reduction to contiguous
+// dimensions in layout, with or without memory tranpsose, to achieve better
+// memory access pattern as well as to reduce the need numbers of executed
+// expensive instructions, such as thread synchronization related instructions
+// and atomic operations. For both use cases, we can apply a normalization to
+// the original tensors, to collapse contiguous dimensions for the same purpose
+// and produce normlized three dimensional tensors. For this reason, the tiling
+// scheme class only needs to handle normalized three dimensional tensors and
+// two dimensional tiles.
+//
+// The current implementation of the class is somewhat NVIDIA GPU oriented. This
+// situation can be improved when there is a need though. The idea of 0-2-1
+// transpose using shared memory can be found in the following CUDA algorithm in
+// TensorFlow: https://goo.gl/MStRV6.
+//
+// We use a thread block to process a tile because we want to use the HW thread
+// block synchronization primitives to synchronize the processing of all the
+// elements in the same tile. A thread block can be viewed as a two dimensional
+// array of threads, described by the number of threads for the Y and X
+// dimensions. A thread block (num_threads_y, num_threads_x) processes a tile of
+// (tile_size_y, tile_size_x) as follows: each thread in the thread block
+// processes one element in the tile so that all the threads in the thread block
+// together process a subdivision of the tile that has the same dimension as the
+// thread block array. Then the thread block moves on to process the next
+// subdivision of the tile until the whole tile is processed. Therefore, each
+// thread in the thread block processes
+// tile_size_x/num_threads_x * tile_size_y/num_threads_y elements in a tile.
+//
+// There are situations where we want a thread block to process multiple
+// tiles. We can't group those tiles into a bigger tiles because we limit a tile
+// to a two dimensional spatial subdivision of a tensor. For example, when we
+// use tiling to implement reduction with tranpose, we want the partial sum
+// produced by each thread to accumulate values for more elements before using
+// shlf_down and atomic_add instructions for further reduction, to amortize the
+// cost of such expensive instructions. The concept of tile block is introduced
+// for this purpose. A tile block is a three dimensional array of tiles, of
+// which some dimensions may be degenerated to only one tile.
+class KernelMappingScheme {
+ public:
+  enum { DimZ = 0, DimY, DimX, DimTot };
+
+ public:
+  // dims_in_elems: the normalized tensor dimensions.
+  // req_block_sizes: the requested block size in number of tiles for each
+  //   dimension. The actual block size is set to min(req_block_size,
+  //   dims_in_number_of_blocks).
+  explicit KernelMappingScheme(absl::Span<const int64> dims_in_elems,
+                               int64 tile_size_y, int64 tile_size_x,
+                               absl::Span<const int64> req_block_sizes,
+                               int64 num_threads_y, int64 num_threads_x,
+                               llvm::IRBuilder<>* b);
+
+  absl::Span<const int64> GetDimensionsInElements() const {
+    return dims_in_elems_;
+  }
+  absl::Span<const int64> GetDimensionsInTiles() const {
+    return dims_in_tiles_;
+  }
+  absl::Span<const int64> GetDimensionsInBlocks() const {
+    return dims_in_blocks_;
+  }
+
+  int64 GetNumberOfTilesInTotal() const {
+    return absl::c_accumulate(dims_in_tiles_, 1LL, std::multiplies<int64>());
+  }
+  int64 GetNumberOfTilesInOneBlock() const {
+    return absl::c_accumulate(block_sizes_, 1, std::multiplies<int64>());
+  }
+
+  int64 GetNumberOfBlocks() const {
+    return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies<int64>());
+  }
+
+  int64 GetTileSizeForDimension(int d) const {
+    DCHECK(d >= DimZ && d <= DimX);
+    return tile_sizes_[d];
+  }
+  int64 GetTileSizeForDimensionX() const {
+    return GetTileSizeForDimension(DimX);
+  }
+  int64 GetTileSizeForDimensionY() const {
+    return GetTileSizeForDimension(DimY);
+  }
+
+  absl::Span<const int64> GetBlockSizes() const { return block_sizes_; }
+
+  int64 GetNumberOfThreadsForDimensionX() const { return num_threads_x_; }
+  int64 GetNumberOfThreadsForDimensionY() const { return num_threads_y_; }
+
+  int64 GetThreadsPerTile() const {
+    return GetNumberOfThreadsForDimensionX() *
+           GetNumberOfThreadsForDimensionY();
+  }
+
+  IrArray::Index EmitBlockIndex(llvm::Type* index_ty);
+  // Returns the index for the first tile in the block with the given block
+  // index.
+  IrArray::Index GetTileIndexForBlockOrigin(const IrArray::Index& block_index);
+  // Returns the index for the first element in the tile with the given tile
+  // index.
+  IrArray::Index GetElementIndexForTileOrigin(const IrArray::Index& tile_index);
+
+  std::tuple<llvm::Value*, llvm::Value*> EmitThreadYXCoordinate(
+      llvm::Type* index_ty);
+
+  IrArray::Index GetUnnormalizedIndex(
+      const IrArray::Index& normalized_shape_index,
+      const Shape& unnormalized_shape);
+
+  llvm::GlobalVariable* GetSharedMemoryBufferForElementType(
+      llvm::Type* elem_ty, absl::string_view buffer_name);
+
+ private:
+  llvm::IRBuilder<>* b_;
+  // The number of elements in each dimension.
+  absl::Span<const int64> dims_in_elems_;
+
+  // The number of elements for each dimension of a tile.
+  std::vector<int64> tile_sizes_;
+  // The number of tiles in each dimension. It is computed from dims_in_elem_
+  // and tile_sizes_.
+  std::vector<int64> dims_in_tiles_;
+
+  // The number of tiles for each dimension of a tile block.
+  std::vector<int64> block_sizes_;
+  // The number of blocks in each dimension of a tile block. It is computed from
+  // dims_in_tile_ and block_sizes_.
+  std::vector<int64> dims_in_blocks_;
+
+  // Number of threads used to process elements in the X direction of a tile.
+  int64 num_threads_x_;
+  // Number of threads used to process elements in the Y direction of a tile.
+  int64 num_threads_y_;
+};
 
 // A class to represent information for tiled parameters to support IR emission
 // for 021 transpose.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 1a53c026be340ca3bec3a49b11666d6124728130..ceea24685af566e02340664f0a40c398c62b5ab0 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -19,10 +19,12 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Target/TargetOptions.h"
@@ -33,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/byte_order.h"
@@ -83,10 +84,9 @@ string DumpModuleToString(const llvm::Module& module) {
   return AsString(buffer_string);
 }
 
-llvm::Value* EmitCallToIntrinsic(llvm::Intrinsic::ID intrinsic_id,
-                                 absl::Span<llvm::Value* const> operands,
-                                 absl::Span<llvm::Type* const> overloaded_types,
-                                 llvm::IRBuilder<>* b) {
+llvm::CallInst* EmitCallToIntrinsic(
+    llvm::Intrinsic::ID intrinsic_id, absl::Span<llvm::Value* const> operands,
+    absl::Span<llvm::Type* const> overloaded_types, llvm::IRBuilder<>* b) {
   llvm::Module* module = ModuleFromIRBuilder(b);
   llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(
       module, intrinsic_id, AsArrayRef(overloaded_types));
@@ -244,10 +244,11 @@ StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(const Shape& shape,
 
 StatusOr<Shape> DecodeSelfDescribingShapeConstant(const void* shape_ptr,
                                                   int32 size_bytes) {
-  Shape shape;
-  TF_RET_CHECK(shape.ParseFromArray(shape_ptr, size_bytes));
+  ShapeProto shape_proto;
+  TF_RET_CHECK(shape_proto.ParseFromArray(shape_ptr, size_bytes));
+  Shape shape(shape_proto);
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
-  return shape;
+  return std::move(shape);
 }
 
 llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
@@ -260,6 +261,17 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
       /*AddNull=*/false);
 }
 
+llvm::GlobalVariable* AllocateSharedMemoryTile(llvm::Module* module,
+                                               llvm::Type* tile_type,
+                                               absl::string_view name) {
+  const int kNVPTXSharedMemoryAddrSpace = 3;
+  return new llvm::GlobalVariable(
+      *module, tile_type,
+      /*isConstant=*/false, llvm::GlobalValue::PrivateLinkage,
+      llvm::UndefValue::get(tile_type), AsStringRef(name), nullptr,
+      llvm::GlobalValue::NotThreadLocal, kNVPTXSharedMemoryAddrSpace);
+}
+
 llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type,
                                             absl::string_view name,
                                             llvm::IRBuilder<>* b,
@@ -362,11 +374,10 @@ static void LogS64(const char* tag, int64 value) {
 void EmitLogging(const char* tag, llvm::Value* value, llvm::IRBuilder<>* b) {
   llvm::FunctionType* log_function_type = llvm::FunctionType::get(
       b->getVoidTy(), {b->getInt64Ty(), b->getInt64Ty()}, /*isVarArg=*/false);
-  b->CreateCall(
-      log_function_type,
-      b->CreateIntToPtr(b->getInt64(tensorflow::bit_cast<int64>(&LogS64)),
-                        log_function_type->getPointerTo()),
-      {b->getInt64(tensorflow::bit_cast<int64>(tag)), value});
+  b->CreateCall(log_function_type,
+                b->CreateIntToPtr(b->getInt64(absl::bit_cast<int64>(&LogS64)),
+                                  log_function_type->getPointerTo()),
+                {b->getInt64(absl::bit_cast<int64>(tag)), value});
 }
 
 void SetAlignmentMetadataForLoad(llvm::LoadInst* load, uint64_t alignment) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index f59baff263fe7184c6b0821c9dbd9eee205586a6..c604c7c870adf734a29017e6accbd159317a9548 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -101,10 +102,9 @@ string SanitizeFunctionName(string function_name);
 // intrinsics (for example, "minnum") must include a type in overloaded_types
 // for each overloaded type. Typically, overloaded intrinsics have only a single
 // overloaded type.
-llvm::Value* EmitCallToIntrinsic(llvm::Intrinsic::ID intrinsic_id,
-                                 absl::Span<llvm::Value* const> operands,
-                                 absl::Span<llvm::Type* const> overloaded_types,
-                                 llvm::IRBuilder<>* b);
+llvm::CallInst* EmitCallToIntrinsic(
+    llvm::Intrinsic::ID intrinsic_id, absl::Span<llvm::Value* const> operands,
+    absl::Span<llvm::Type* const> overloaded_types, llvm::IRBuilder<>* b);
 
 // Emit float max. Emit maxnum intrinsic is fast math is disabled, or
 // fcmp+select otherwise
@@ -155,6 +155,11 @@ StatusOr<Shape> DecodeSelfDescribingShapeConstant(const void* shape_ptr,
 llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
                                            llvm::Module* module);
 
+// Allocates a tile of shared memory.
+llvm::GlobalVariable* AllocateSharedMemoryTile(llvm::Module* module,
+                                               llvm::Type* tile_type,
+                                               absl::string_view name);
+
 // Inserts an allocate of the requested type at the entry point of the
 // function that the builder is currently building. The insert point
 // of the builder is set to the same place after calling this function
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index 944c79580c133906cd431722fd6b29e6aee5f918..e22c2173c271fc9571be1ddb0759d2b31562dc98 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -15,9 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
 
+#include <vector>
+
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
-#include "absl/types/optional.h"
+#include "absl/types/span.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -27,10 +30,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -38,148 +43,365 @@ namespace xla {
 namespace llvm_ir {
 
 namespace {
-// Adds the inner comparison loop where we compare elements pointed to by
-// 'keys_index' and 'compare_keys_index'.
-void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index,
-                     const IrArray::Index& compare_keys_index,
-                     const IrArray& keys_array,
-                     const absl::optional<IrArray>& values_array,
-                     llvm::IRBuilder<>* b) {
-  // if (is_smaller_index &&
-  //     compare_keys[dimension_to_sort] < dimension_to_sort_bound)
-  llvm::Value* is_smaller_index = b->CreateICmpSLT(
-      keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]);
-  int64 dimension_to_sort_bound =
-      keys_array.GetShape().dimensions(dimension_to_sort);
-  auto if_data = EmitIfThenElse(
-      b->CreateAnd(is_smaller_index,
-                   b->CreateICmpSLT(compare_keys_index[dimension_to_sort],
-                                    keys_index.GetConstantWithIndexType(
-                                        dimension_to_sort_bound))),
-      "smaller_comparison_index", b, /*emit_else=*/false);
-  SetToFirstInsertPoint(if_data.true_block, b);
-  auto key1 = keys_array.EmitReadArrayElement(keys_index, b);
-  auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, b);
-  auto compare_key1 = key1;
-  auto compare_key2 = key2;
-  auto key_type = keys_array.GetShape().element_type();
-  bool is_signed_comparison = true;
-  if (primitive_util::IsFloatingPointType(key_type)) {
-    // We would like a total order of floating point numbers so that the sort
-    // has a predictable behavior in the presence of NaNs. Rather than using
-    // floating point comparison, we use the following trick:
-    // If f is a float, and
-    // x = bit_cast<int32>(f);
-    // y = x < 0 ? 0x7FFFFFFF - x : x;
-    // then y is ordered as an int32 such that finite values have the obvious
-    // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning
-    // and end of the ordering.
-    auto k = b->getInt(llvm::APInt::getSignedMaxValue(
-        key1->getType()->getPrimitiveSizeInBits()));
-    auto comparison_type = k->getType();
-    auto zero = llvm::ConstantInt::get(comparison_type, 0);
-    auto maybe_flip = [&](llvm::Value* v) {
-      return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero),
-                             b->CreateSub(k, v), v);
-    };
-    compare_key1 = b->CreateBitCast(key1, comparison_type);
-    compare_key2 = b->CreateBitCast(key2, comparison_type);
-    compare_key1 = maybe_flip(compare_key1);
-    compare_key2 = maybe_flip(compare_key2);
-  } else if (!primitive_util::IsSignedIntegralType(key_type)) {
-    is_signed_comparison = false;
+
+// Adds the inner comparison loop body where we compare elements.
+void EmitCompareLoopBody(
+    int64 iteration_bound, PrimitiveType key_type, int64 num_values,
+    int64 iota_values_parameter_index, llvm::Value* element_pair_index,
+    int64 xor_mask, llvm::Type* index_type,
+    std::function<llvm::Value*(int64 operand, llvm::Value* index)> read_element,
+    std::function<void(int64 operand, llvm::Value* index, llvm::Value* value)>
+        write_element,
+    llvm::IRBuilder<>* b, bool needs_bounds_checks = true) {
+  auto index_typed_constant = [&](int64 value) {
+    return llvm::ConstantInt::get(index_type, value);
+  };
+  // The 'xor_mask' determines which elements are compared against each other.
+  // Index 'current_keys_index' will be compared with 'current_keys_index' xor
+  // 'xor_mask'. This means that we will always compare a block of consecutive
+  // elements against elements from the adjacent block of the same size. When
+  // 'xor_mask' is a power of 2, it immediately identifies the size of such a
+  // block. We can also have 'xor_mask' being 2^k - 1 (for some value of k). In
+  // that case, we essentially flip the last 'k' - 1 bits when computing the
+  // position of the element to compare to, so the block size is 2^(k - 1).
+  int64 block_size = xor_mask;
+  // Check if it is a value 2^k - 1.
+  if (xor_mask > 1 && (xor_mask & (xor_mask + 1)) == 0) {
+    block_size = (xor_mask + 1) / 2;
+  }
+  auto current_keys_index = element_pair_index;
+  if (block_size == 1) {
+    // If the block size is 1, we take every second element and compare it to
+    // the next one.
+    current_keys_index =
+        b->CreateMul(current_keys_index, index_typed_constant(2));
+  } else if (block_size * 2 < iteration_bound) {
+    // current_keys_index iterates through the 'left' elements of the element
+    // pairs to be compared. We first need to compute the comparison block to
+    // which the element belongs. The block id of that block is index /
+    // block_size.
+    auto block_id =
+        b->CreateUDiv(current_keys_index, index_typed_constant(block_size));
+    // The index of the 'left' element within its block is simply the remainder
+    // when dividing by 'block_size'.
+    auto index_within_block =
+        b->CreateURem(current_keys_index, index_typed_constant(block_size));
+    // The first element of the 'left' block of elements that is compared
+    // against elements from the adjacent 'right' block of elements is
+    // 'block_id' * (2 * 'block_size').
+    auto first_element_in_block =
+        b->CreateMul(block_id, index_typed_constant(2 * block_size));
+    current_keys_index =
+        b->CreateAdd(first_element_in_block, index_within_block);
+  }
+  auto compare_keys_index =
+      b->CreateXor(current_keys_index, index_typed_constant(xor_mask));
+  // current_keys_index < compare_keys_index
+  llvm::Value* is_smaller_index =
+      b->CreateICmpSLT(current_keys_index, compare_keys_index);
+  // compare_keys_index < iteration_bound
+  llvm::Value* index_is_inbounds = b->CreateICmpSLT(
+      compare_keys_index, index_typed_constant(iteration_bound));
+  llvm::Value* do_comparison =
+      needs_bounds_checks ? b->CreateAnd(is_smaller_index, index_is_inbounds)
+                          : b->getInt1(true);
+
+  // if (is_smaller_index && index_is_inbounds)
+  KernelSupportLibrary ksl(b);
+  ksl.IfReturnVoid("smaller_comparison_index", do_comparison, [&]() {
+    auto key1 = read_element(0, current_keys_index);
+    auto key2 = read_element(0, compare_keys_index);
+    auto compare_key1 = key1;
+    auto compare_key2 = key2;
+    bool is_signed_comparison = true;
+    if (primitive_util::IsFloatingPointType(key_type)) {
+      // We would like a total order of floating point numbers so that the
+      // sort has a predictable behavior in the presence of NaNs. Rather
+      // than using floating point comparison, we use the following trick:
+      // If f is a float, and
+      // x = bit_cast<int32>(f);
+      // y = x < 0 ? 0x7FFFFFFF - x : x;
+      // then y is ordered as an int32 such that finite values have the
+      // obvious order, -0 is ordered before 0, and -NaN and NaN appear at
+      // the beginning and end of the ordering.
+      auto k = b->getInt(llvm::APInt::getSignedMaxValue(
+          key1->getType()->getPrimitiveSizeInBits()));
+      auto comparison_type = k->getType();
+      auto zero = llvm::ConstantInt::get(comparison_type, 0);
+      auto maybe_flip = [&](llvm::Value* v) {
+        return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero),
+                               b->CreateSub(k, v), v);
+      };
+      compare_key1 = b->CreateBitCast(key1, comparison_type);
+      compare_key2 = b->CreateBitCast(key2, comparison_type);
+      compare_key1 = maybe_flip(compare_key1);
+      compare_key2 = maybe_flip(compare_key2);
+    } else if (!primitive_util::IsSignedIntegralType(key_type)) {
+      is_signed_comparison = false;
+    }
+    // If key2 < key1
+    auto is_smaller_than =
+        b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
+                                           : llvm::ICmpInst::ICMP_ULT,
+                      compare_key2, compare_key1);
+    if (iota_values_parameter_index >= 0) {
+      auto keys_equal = b->CreateICmpEQ(compare_key1, compare_key2);
+      auto key_index1 =
+          read_element(iota_values_parameter_index, current_keys_index);
+      auto key_index2 =
+          read_element(iota_values_parameter_index, compare_keys_index);
+      auto index_is_smaller_than =
+          b->CreateICmp(llvm::ICmpInst::ICMP_ULT, key_index2, key_index1);
+      is_smaller_than = b->CreateOr(
+          is_smaller_than, b->CreateAnd(keys_equal, index_is_smaller_than));
+    }
+    ksl.IfReturnVoid("is_smaller_than", is_smaller_than, [&]() {
+      // Swap key1 with key2.
+      write_element(0, current_keys_index, key2);
+      write_element(0, compare_keys_index, key1);
+      for (int64 i = 1; i <= num_values; ++i) {
+        // Also swap the values.
+        auto value1 = read_element(i, current_keys_index);
+        auto value2 = read_element(i, compare_keys_index);
+        write_element(i, current_keys_index, value2);
+        write_element(i, compare_keys_index, value1);
+      }
+    });
+  });
+}
+
+void EmitTiledCompareLoop(
+    const IrArray::Index& tiled_keys_index, int64 dimension_to_sort,
+    int64 dimension_to_sort_bound, PrimitiveType keys_type,
+    absl::Span<const int64> xor_masks, const std::vector<IrArray>& params,
+    const std::vector<llvm::Value*>& param_shmem_buffers,
+    int64 iota_values_parameter_index, int64 tile_size, llvm::IRBuilder<>* b) {
+  KernelSupportLibrary ksl(b);
+  llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
+      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b);
+  llvm_ir::AddRangeMetadata(0, tile_size / 2,
+                            llvm::cast<llvm::Instruction>(thread_id));
+  thread_id = b->CreateIntCast(thread_id, tiled_keys_index.GetType(),
+                               /*isSigned=*/true, "thread.id.x");
+
+  auto copy_loop_body =
+      [&](std::function<void(llvm::Value * cache_index, llvm::Value * index)>
+              read_or_write) {
+        auto value_one = tiled_keys_index.GetConstantWithIndexType(1);
+        auto current_keys_index =
+            b->CreateShl(tiled_keys_index[dimension_to_sort], value_one);
+        // We want to copy two adjacent elements. We first check whether the
+        // first index position is within bounds.
+        ksl.IfReturnVoid(
+            "smaller_keys_index",
+            b->CreateICmpSLT(current_keys_index,
+                             tiled_keys_index.GetConstantWithIndexType(
+                                 dimension_to_sort_bound)),
+            [&]() {
+              auto cache_index = b->CreateShl(thread_id, value_one);
+              read_or_write(cache_index, current_keys_index);
+              // Increment to go the next index position.
+              current_keys_index = b->CreateAdd(current_keys_index, value_one);
+              // Here we check whether the next index position is within bounds.
+              ksl.IfReturnVoid(
+                  "inner_smaller_keys_index",
+                  b->CreateICmpSLT(current_keys_index,
+                                   tiled_keys_index.GetConstantWithIndexType(
+                                       dimension_to_sort_bound)),
+                  [&]() {
+                    cache_index = b->CreateAdd(cache_index, value_one);
+                    read_or_write(cache_index, current_keys_index);
+                  });
+            });
+      };
+
+  // Copy operand tiles from the operand buffers to shared memory.
+  IrArray::Index keys_index = tiled_keys_index;
+  for (int64 i = 0; i < params.size(); ++i) {
+    copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) {
+      keys_index[dimension_to_sort] = index;
+      auto value = params[i].EmitReadArrayElement(keys_index, b);
+      b->CreateStore(value,
+                     b->CreateGEP(param_shmem_buffers[i],
+                                  {tiled_keys_index.GetConstantWithIndexType(0),
+                                   cache_index}));
+    });
+  }
+  // Wait until all reads have happened.
+  llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
+
+  // Now emit the bodies of the comparison loops.
+  auto read_element = [&](int64 operand, llvm::Value* index) {
+    return b->CreateLoad(
+        b->CreateGEP(param_shmem_buffers[operand],
+                     {tiled_keys_index.GetConstantWithIndexType(0), index}));
+  };
+  auto write_element = [&](int64 operand, llvm::Value* index,
+                           llvm::Value* value) {
+    b->CreateStore(
+        value,
+        b->CreateGEP(param_shmem_buffers[operand],
+                     {tiled_keys_index.GetConstantWithIndexType(0), index}));
+  };
+  for (int64 xor_mask : xor_masks) {
+    // The index of the element pair to be compared within the tile stored in
+    // shared memory. We order the element pairs by the element with the smaller
+    // index.
+    auto element_pair_index = thread_id;
+    // If 'dimension_to_sort_bound' is evenly divisible by 'tile_size', we don't
+    // need any bounds checks.
+    if (dimension_to_sort_bound % tile_size) {
+      // Otherwise we need a bounds check for the last tile. The last tile has
+      // size 'dimension_to_sort_bound' % 'tile_size'.
+      ksl.IfReturnVoid(
+          "is_last_tile",
+          b->CreateICmpUGE(
+              b->CreateMul(tiled_keys_index[dimension_to_sort],
+                           tiled_keys_index.GetConstantWithIndexType(2)),
+              tiled_keys_index.GetConstantWithIndexType(
+                  RoundDownToNearest(dimension_to_sort_bound, tile_size))),
+          [&]() {
+            EmitCompareLoopBody(dimension_to_sort_bound % tile_size, keys_type,
+                                params.size() - 1, iota_values_parameter_index,
+                                element_pair_index, xor_mask,
+                                tiled_keys_index.GetType(), read_element,
+                                write_element, b);
+          },
+          [&]() {
+            EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
+                                iota_values_parameter_index, element_pair_index,
+                                xor_mask, tiled_keys_index.GetType(),
+                                read_element, write_element, b,
+                                /*needs_bounds_checks=*/false);
+          });
+    } else {
+      EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
+                          iota_values_parameter_index, element_pair_index,
+                          xor_mask, tiled_keys_index.GetType(), read_element,
+                          write_element, b, /*needs_bounds_checks=*/false);
+    }
+    // Wait until all comparisons have happened.
+    llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
   }
-  auto comparison =
-      b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
-                                         : llvm::ICmpInst::ICMP_ULT,
-                    compare_key2, compare_key1);
-  // If key2 < key1
-  auto if_smaller_data =
-      EmitIfThenElse(comparison, "is_smaller_than", b, /*emit_else=*/false);
-  SetToFirstInsertPoint(if_smaller_data.true_block, b);
-  // Swap key1 with key2.
-  keys_array.EmitWriteArrayElement(keys_index, key2, b);
-  keys_array.EmitWriteArrayElement(compare_keys_index, key1, b);
-  if (values_array.has_value()) {
-    // Also swap the values.
-    auto value1 = values_array.value().EmitReadArrayElement(keys_index, b);
-    auto value2 =
-        values_array.value().EmitReadArrayElement(compare_keys_index, b);
-    values_array.value().EmitWriteArrayElement(keys_index, value2, b);
-    values_array.value().EmitWriteArrayElement(compare_keys_index, value1, b);
+
+  // Copy the operand tiles back from shared memory to the operand buffers.
+  for (int64 i = 0; i < params.size(); ++i) {
+    copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) {
+      keys_index[dimension_to_sort] = index;
+      auto value = b->CreateLoad(b->CreateGEP(
+          param_shmem_buffers[i],
+          {tiled_keys_index.GetConstantWithIndexType(0), cache_index}));
+      params[i].EmitWriteArrayElement(keys_index, value, b);
+    });
   }
+  // We should normally synchronize here to make sure all writes have happened.
+  // However the very next thing each thread does is reading 2 elements from the
+  // operand buffer and writing it into the same location in shared memory from
+  // which it previously copied it to the operand buffer, and we synchronize
+  // after this has happened. We can be sure that a thread always writes to the
+  // same location in shared memory because we have exactly tile_size / 2 many
+  // threads, and the linear index calculated by ParallelLoopEmitter uses
+  // linear_index = blockIdx.x * blockDim.x + threadIdx.x;
 }
 }  // namespace
 
 Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const absl::optional<IrArray>& values_array,
-                       absl::string_view name, llvm::Value* xor_mask,
-                       llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions* launch_dimensions) {
-  const Shape& keys_shape = keys_array.GetShape();
+                       const std::vector<IrArray>& values_arrays,
+                       int64 iota_values_parameter_index,
+                       absl::string_view name,
+                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
+                       const gpu::LaunchDimensions& launch_dimensions,
+                       int64 num_iterations_in_sort_dim,
+                       const int64 tile_size) {
+  // Iterate through the keys shape in physical order, but skip the dimension to
+  // sort and make it the innermost loop which is the loop where the comparisons
+  // happen. In the dimension to sort, if we use tiling, we iterate through it
+  // in tiles of 64 elements each, so we use another loop that happens within
+  // one thread to process this tile worth of data (thereby combining several
+  // comparison stages of the bitonic sort algorithm because they all happen
+  // within those 64 elements and are therefore independent of the other
+  // comparisons).
 
-  // Create loop nests which loop through the operand dimensions. The sort
-  // dimension is handled in the innermost loop which performs the sorting.
-  ForLoopNest loop_nest(name, b);
-  IrArray::Index keys_index =
-      loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys");
-  if (loop_nest.GetInnerLoopBodyBasicBlock() != nullptr) {
-    SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b);
+  const Shape& keys_shape = keys_array.GetShape();
+  int64 rank = ShapeUtil::Rank(keys_shape);
+  int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
+  std::vector<int64> dimensions_in_iteration_order(rank);
+  std::vector<int64> iteration_order_to_logical_order(rank);
+  int64 dim = 0;
+  for (int64 dimension : LayoutUtil::MinorToMajor(keys_shape)) {
+    if (dimension != dimension_to_sort) {
+      dimensions_in_iteration_order[dim] = keys_shape.dimensions(dimension);
+      iteration_order_to_logical_order[dim++] = dimension;
+    }
   }
+  dimensions_in_iteration_order[dim] = num_iterations_in_sort_dim;
+  iteration_order_to_logical_order[dim] = dimension_to_sort;
 
-  // 'compare_keys_index' is the index of the element that 'keys_index' should
-  // be compared to.
-  IrArray::Index compare_keys_index(keys_index.GetType());
-  for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) {
-    if (dimension != dimension_to_sort) {
-      compare_keys_index.push_back(keys_index[dimension]);
-    } else {
-      compare_keys_index.push_back(nullptr);
+  Shape iteration_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
+                                               dimensions_in_iteration_order);
+  std::vector<IrArray> params(1, keys_array);
+  params.insert(params.end(), values_arrays.begin(), values_arrays.end());
+
+  // Allocate shared memory for the tiled compare loop.
+  std::vector<llvm::Value*> param_shmem_buffers(params.size(), nullptr);
+  if (xor_masks.size() > 1) {
+    llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+    for (int64 i = 0; i < params.size(); ++i) {
+      llvm::Type* tile_type =
+          llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType(
+                                   params[i].GetShape().element_type(), module),
+                               tile_size);
+      param_shmem_buffers[i] = llvm_ir::AllocateSharedMemoryTile(
+          module, tile_type, absl::StrCat(name, "_tile_param_", i));
     }
   }
 
-  // Naive C++ code for the inner compare loop:
-  //
-  // for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
-  //   int64 j = i ^ xor_mask;
-  //   if (i < j && j < dimension_to_sort_bound) {
-  //     int64 min_key = std::min(keys[i], keys[j]);
-  //     keys[j] = std::max(keys[i], keys[j]);
-  //     keys[i] = min_key;
-  //   }
-  // }
-  //
-  // This follows the algorithm described on Wikipedia:
-  // https://en.wikipedia.org/wiki/Bitonic_sorter
-
-  int64 dimension_to_sort_bound =
-      keys_array.GetShape().dimensions(dimension_to_sort);
-  Shape compare_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
-                                             {dimension_to_sort_bound});
   auto compare_loop_body_emitter =
-      [&](const IrArray::Index& compare_index) -> Status {
-    keys_index[dimension_to_sort] = compare_index[0];
-    compare_keys_index[dimension_to_sort] =
-        b->CreateXor(compare_index[0], xor_mask);
-    EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index,
-                    keys_array, values_array, b);
+      [&](const IrArray::Index& tiles_index) -> Status {
+    // Naive C++ code for the inner compare loop:
+    //
+    // for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
+    //   int64 j = i ^ xor_mask;
+    //   /* emitted in EmitCompareLoopBody() */
+    //   if (i < j && j < dimension_to_sort_bound) {
+    //     int64 min_key = std::min(keys[i], keys[j]);
+    //     keys[j] = std::max(keys[i], keys[j]);
+    //     keys[i] = min_key;
+    //   }
+    // }
+    //
+    // This follows the algorithm described on Wikipedia:
+    // https://en.wikipedia.org/wiki/Bitonic_sorter
+    IrArray::Index keys_index(tiles_index.GetType(), rank);
+    for (int64 i = 0; i < rank; ++i) {
+      keys_index[iteration_order_to_logical_order[i]] = tiles_index[i];
+    }
+    if (xor_masks.size() > 1) {
+      EmitTiledCompareLoop(keys_index, dimension_to_sort,
+                           dimension_to_sort_bound, keys_shape.element_type(),
+                           xor_masks, params, param_shmem_buffers,
+                           iota_values_parameter_index, tile_size, b);
+    } else {
+      auto read_element = [&](int64 operand, llvm::Value* index) {
+        keys_index[dimension_to_sort] = index;
+        return params[operand].EmitReadArrayElement(keys_index, b);
+      };
+      auto write_element = [&](int64 operand, llvm::Value* index,
+                               llvm::Value* value) {
+        keys_index[dimension_to_sort] = index;
+        params[operand].EmitWriteArrayElement(keys_index, value, b);
+      };
+      EmitCompareLoopBody(dimension_to_sort_bound, keys_shape.element_type(),
+                          values_arrays.size(), iota_values_parameter_index,
+                          tiles_index[rank - 1], xor_masks[0],
+                          tiles_index.GetType(), read_element, write_element,
+                          b);
+    }
     return Status::OK();
   };
-  if (launch_dimensions != nullptr) {
-    TF_RETURN_IF_ERROR(gpu::ParallelLoopEmitter(compare_loop_body_emitter,
-                                                compare_shape,
-                                                *launch_dimensions, b)
-                           .EmitLoop(name));
-  } else {
-    TF_RETURN_IF_ERROR(LoopEmitter(compare_loop_body_emitter, compare_shape, b)
-                           .EmitLoop(name));
-  }
-
-  // Set the IR builder insert point to the exit basic block of the outer most
-  // loop. This ensures later instructions are inserted after this loop nest.
-  b->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
-
-  return Status::OK();
+  return gpu::ParallelLoopEmitter(compare_loop_body_emitter, iteration_shape,
+                                  launch_dimensions, b)
+      .EmitLoop(name);
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
index 527ed10374ce9482045a8459e38fd041e0e83001..685f9383acba416f51681270e4037d56abb4b6ea 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -16,8 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
 
+#include <vector>
+
 #include "absl/strings/string_view.h"
-#include "absl/types/optional.h"
+#include "absl/types/span.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
@@ -28,13 +30,17 @@ namespace xla {
 namespace llvm_ir {
 // Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort'
 // dimension of 'keys_array'. All other dimensions are kept as-is. This
-// implements the inner loop of BitonicSort. If 'launch_dimensions' is nullptr,
-// the inner compare loop will not be parallelized.
+// implements the inner loop of BitonicSort. It is assumed that 'xor_masks'
+// contains only powers of 2, or values 2^k - 1 (k > 0). If
+// 'iota_values_parameter_index' is >= 0, it points at a 'values_arrays' operand
+// that is a iota and can be used to make the sorting stable.
 Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const absl::optional<IrArray>& values_array,
-                       absl::string_view name, llvm::Value* xor_mask,
-                       llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions* launch_dimensions);
+                       const std::vector<IrArray>& values_arrays,
+                       int64 iota_values_parameter_index,
+                       absl::string_view name,
+                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
+                       const gpu::LaunchDimensions& launch_dimensions,
+                       int64 num_iterations_in_sort_dim, int64 tile_size);
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 0d0fb7946ae6815905491ca55652d7d0ab278a3c..6c89700983363fec46c41b5430c6eab6b366a1b6 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -96,44 +96,18 @@ ExecutionOptions CreateExecutionOptions(
     const ExecutableBuildOptions& build_options,
     const ProgramShape* program_shape) {
   ExecutionOptions execution_options = CreateDefaultExecutionOptions();
-  if (build_options.hlo_profile().has_value()) {
-    execution_options.mutable_debug_options()->set_xla_hlo_profile(
-        *build_options.hlo_profile());
-  }
-  if (build_options.generate_hlo_graph().has_value()) {
-    execution_options.mutable_debug_options()->set_xla_generate_hlo_graph(
-        build_options.generate_hlo_graph().value());
-  }
-  if (build_options.dump_optimized_hlo_proto_to().has_value()) {
-    execution_options.mutable_debug_options()
-        ->set_xla_dump_optimized_hlo_proto_to(
-            build_options.dump_optimized_hlo_proto_to().value());
-  }
-  if (build_options.dump_unoptimized_hlo_proto_to().has_value()) {
-    execution_options.mutable_debug_options()
-        ->set_xla_dump_unoptimized_hlo_proto_to(
-            build_options.dump_unoptimized_hlo_proto_to().value());
-  }
-  if (build_options.dump_per_pass_hlo_proto_to().has_value()) {
-    execution_options.mutable_debug_options()
-        ->set_xla_dump_per_pass_hlo_proto_to(
-            build_options.dump_per_pass_hlo_proto_to().value());
+  if (build_options.has_debug_options()) {
+    *execution_options.mutable_debug_options() = build_options.debug_options();
   }
   if (build_options.result_layout() != nullptr) {
     *execution_options.mutable_shape_with_output_layout() =
-        *build_options.result_layout();
+        build_options.result_layout()->ToProto();
   } else {
+    Shape result_shape(program_shape->result());
+    LayoutUtil::SetToDefaultLayout(&result_shape);
     *execution_options.mutable_shape_with_output_layout() =
-        program_shape->result();
-    LayoutUtil::SetToDefaultLayout(
-        execution_options.mutable_shape_with_output_layout());
+        result_shape.ToProto();
   }
-
-  for (const std::string& disabled_pass : build_options.disabled_hlo_passes()) {
-    execution_options.mutable_debug_options()->add_xla_disable_hlo_passes(
-        disabled_pass);
-  }
-
   return execution_options;
 }
 
@@ -144,8 +118,8 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const absl::Span<const Shape* const> argument_layouts,
     const ExecutableBuildOptions& build_options) {
   const HloModuleProto& proto = computation.proto();
-  TF_RET_CHECK(proto.has_program_shape());
-  const ProgramShape& program_shape = proto.program_shape();
+  TF_RET_CHECK(proto.has_host_program_shape());
+  ProgramShape program_shape(proto.host_program_shape());
 
   // Validate incoming layouts.
   if (argument_layouts.size() != program_shape.parameters_size()) {
@@ -220,4 +194,10 @@ StatusOr<const ShapedBuffer*> LocalService::GlobalDataToShapedBuffer(
   return buffers[replica_number];
 }
 
+StatusOr<GlobalDataHandle> LocalService::RegisterReplicatedBuffers(
+    std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag) {
+  return allocation_tracker_.RegisterReplicatedBuffers(
+      std::move(replicated_buffers), tag);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 3b4f0b50832d6d2b64528ffb63eb5c7375396aec..f56ba32b04b9bf3aba75654bdb98887ad22e6791 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -63,6 +63,11 @@ class LocalService : public Service {
   StatusOr<const ShapedBuffer*> GlobalDataToShapedBuffer(
       const GlobalDataHandle& data, int replica_number);
 
+  // Registers a vector of shaped buffers of device memory, one per replica, and
+  // returns a corresponding handle that can be used for talking to XLA clients.
+  StatusOr<GlobalDataHandle> RegisterReplicatedBuffers(
+      std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag);
+
  private:
   explicit LocalService(const ServiceOptions& options,
                         std::unique_ptr<Backend> backend);
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
index ec52a24d782a44fda961feab3230886072e755c7..972a5b9ced0d84387ef8308efe2a7aff7317d047 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
@@ -113,6 +113,13 @@ Status LogicalBufferAnalysis::HandleGetTupleElement(HloInstruction*) {
   return Status::OK();
 }
 
+Status LogicalBufferAnalysis::HandleAddDependency(
+    HloInstruction* add_dependency) {
+  // AddDependency just forwards the value of its zero-th operand and does not
+  // create buffers.
+  return Status::OK();
+}
+
 Status LogicalBufferAnalysis::HandleCopy(HloInstruction* copy) {
   // The top-level buffer (index={}) for kCopy is newly created, but all other
   // buffers (in the case of a tuple shape) come from the operand
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
index 81f524d84a8091e1fff13dc7c55b401143a02753..7ffca943d0f7805ad4420343fcdbf860415c4c40 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
@@ -64,6 +64,7 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault {
   Status HandleRecvDone(HloInstruction* recv_done) override;
   Status HandleSend(HloInstruction* send) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   // A map from the buffer ID to the logical buffer
   std::vector<std::unique_ptr<LogicalBuffer>> logical_buffers_;
diff --git a/tensorflow/compiler/xla/service/map_inliner_test.cc b/tensorflow/compiler/xla/service/map_inliner_test.cc
index 84059dd0f71ee8fc0a25703cbab2268d7dc149a8..fd18bfdc3e7f4b5f94237c554c3e6ca8bd065a35 100644
--- a/tensorflow/compiler/xla/service/map_inliner_test.cc
+++ b/tensorflow/compiler/xla/service/map_inliner_test.cc
@@ -26,7 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -35,7 +35,7 @@ namespace op = xla::testing::opcode_matchers;
 namespace xla {
 namespace {
 
-using MapInlinerTest = HloVerifiedTestBase;
+using MapInlinerTest = HloTestBase;
 
 // Test that `map` with `max` is transformed to `max`
 TEST_F(MapInlinerTest, MapMax) {
@@ -59,12 +59,12 @@ TEST_F(MapInlinerTest, MapMax) {
       HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get()));
 
   auto computation = builder.Build();
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEmbeddedComputation(std::move(max_f32));
   hlo_module->AddEntryComputation(std::move(computation));
 
   MapInliner inliner;
-  EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie());
+  EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
   EXPECT_THAT(hlo_module->entry_computation()->root_instruction(),
               op::Maximum(lhs, rhs));
 
@@ -93,12 +93,12 @@ TEST_F(MapInlinerTest, MapConstant) {
       HloInstruction::CreateMap(lhs->shape(), {lhs}, const2_f32.get()));
 
   auto computation = builder.Build();
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEmbeddedComputation(std::move(const2_f32));
   hlo_module->AddEntryComputation(std::move(computation));
   HloInstruction* root = hlo_module->entry_computation()->root_instruction();
   MapInliner inliner;
-  EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie());
+  EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
   root = hlo_module->entry_computation()->root_instruction();
   EXPECT_THAT(root, op::Broadcast(op::Constant()));
 
@@ -131,12 +131,12 @@ TEST_F(MapInlinerTest, MapSubtractOppositeOrder) {
     HloInstruction::CreateMap(lhs->shape(), {lhs, rhs}, max_f32.get()));
 
   auto computation = builder.Build();
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEmbeddedComputation(std::move(max_f32));
   hlo_module->AddEntryComputation(std::move(computation));
 
   MapInliner inliner;
-  EXPECT_TRUE(inliner.Run(hlo_module).ValueOrDie());
+  EXPECT_TRUE(inliner.Run(hlo_module.get()).ValueOrDie());
   EXPECT_THAT(hlo_module->entry_computation()->root_instruction(),
           op::Subtract(rhs, lhs));
 
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc
index 2ca527bc4cb8f66a085c1e6a7cbb8ddaedbfc07e..9ccdd7d8d818b9fa3aa77cdd10d37ca18928b448 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -257,7 +258,7 @@ bool MultiOutputFusion::LegalToFuse(HloInstruction* instr1,
 }
 
 void MultiOutputFusion::RecomputeReachability() {
-  reachability_ = computation_->ComputeReachability();
+  reachability_ = HloReachabilityMap::Build(computation_);
 }
 
 void MultiOutputFusion::UpdateReachability(
@@ -317,9 +318,9 @@ bool MultiOutputFusion::Perform() {
                 << instr2->fused_instructions_computation()->ToString(
                        HloPrintOptions().set_indent_amount(1));
       }
+      Update(instr1, instr2);
       HloInstruction* ret = Fuse(instr1, instr2);
       set_is_fused(ret == instr1 ? instr2 : instr1);
-      Update(instr1, instr2);
       changed = true;
       VLOG(2) << "After fusion, \t this: " << ret->name() << "\n"
               << ret->fused_instructions_computation()->ToString(
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h
index 9508ab2ed1d38ec40983d8892ec8875b848fb21b..1c7583ece720f9e4d4b71a6279b976fed40e10cb 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.h
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/statusor.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index 380cde0e6a858c7800445be94bb08dc22f3e776a..c35f72699bfe90f7b8021916c0f81d5e1926ff4c 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
 
+#include "absl/strings/str_replace.h"
 #include "absl/strings/string_view.h"
 #include "absl/utility/utility.h"
 #include "tensorflow/compiler/xla/layout_util.h"
@@ -44,32 +45,48 @@ namespace xla {
 //
 // This pattern will match Add instructions whose first operand is a constant.
 //
-// Each pattern type has the following modifiers:
+// Each pattern type has the following modifiers, which are described where
+// nontrivial.
 //
 //   Op():
-//     - WithName: match operations with the given name
-//     - WithOpcode: match operations with the given opcode
-//     - WithShape: match operations whose shape matches the given pattern
-//     - WithOperand: match operations whose operand matches the given pattern
+//     - Is: is the given HloInstruction* (i.e. pointer equality)
+//     - WithName
+//     - WithOpcode
+//     - WithoutOpcode: anything other than the given opcode
+//     - WithShape: instr's shape matches the given pattern
+//     - WithShapeEqualTo: instr's shape is equal to the given Shape
+//     - WithShapeCompatibleTo: instr's shape is compatible with the given Shape
+//     - WithNumOperands
+//     - WithOperand: operand at the given index matches the given pattern
+//     - IsConstant
+//     - IsNonConstant
+//     - IsConstantScalar/IsEffectiveConstantScalar: Optionally accepts a value,
+//       e.g. IsConstantScalar() or IsConstantScalar(42).
+//     - WithFusionKind
+//     - WithTupleIndex: get-tuple-element operations with the given tuple index
+//     - WithOneUse: Instruction is used as an operand exactly once.
+//     - WithOneUser: Instruction is used by exactly one other instruction, but
+//       is possibly used more than once as an operand (e.g. multiply(x,x)).
 //
 //   Shape():
-//     - EqualTo: matches shapes that are equal to the argument
-//     - CompatibleTo: matches shapes that are compatible to the argument
-//     - IsScalar/IsArray/IsTuple: matches scalar/array/tuple shapes
-//     - IsDenseArray/IsSparseArray: matches arrays with dense/sparse format
-//     - WithLayout: match shapes whose layout matches the given pattern
-//     - WithLayoutEqualTo: matches shapes whose layouts equal the argument
-//     - WithSubshape: matches tuple shapes whose subshape matches the given
-//       pattern
-//     - WithSubshapeEqualTo: matches shapes with a subshape equal the argument
-//     - WithElementType: matches array/scalar shapes with the given element
-//       type
-//     - WithRank: matches array/scalar types with the given rank
+//     - EqualTo
+//     - CompatibleTo
+//     - IsScalar/IsEffectiveScalar/IsArray/IsTuple
+//     - IsDenseArray/IsSparseArray
+//     - WithLayout: layout shape's layout matches the given pattern (e.g.
+//       Layout().WithDenseFormat())
+//     - WithLayoutEqualTo: shape's layout equals the argument (i.e. another
+//       Layout, but not the result of Layout().foo())
+//     - WithSubshape: shape is a tuple whose subshape matches the given pattern
+//       (e.g. Shape().IsScalar()).
+//     - WithSubshapeEqualTo: shape is a tuple with a subshape equal to the arg
+//       (i.e. another Shape, but not the result of Shape().foo())
+//     - WithElementType: shape is an array/scalar with the given elem type
+//     - WithRank: shape is an array/scalar with the given rank
 //
 //  Layout():
-//     - EqualTo: matches layouts that are equal to the argument
-//     - WithDenseFormat/WithSparseFormat: matches layouts with dense/sparse
-//       format
+//     - EqualTo
+//     - WithDenseFormat/WithSparseFormat
 //
 // Op(), Shape(), and Layout() may be passed an argument of type
 // HloInstruction**, Shape**, or Layout**, respectively, or const versions of
@@ -82,53 +99,55 @@ namespace xla {
 //   CHECK(Match(foo,
 //               match::Op().WithOperand(0, match::Op(&matched_operand))));
 //
-// Helpers are provided for common nullary, unary, binary, and ternary
-// instructions. These helpers can be called with no arguments, in which case
-// they will match any instruction matching the opcode. They may also be called
-// with matches for the operands and with an optional capture. (The capture must
-// be the first argument.) Some examples of these helpers and their equivalents
-// are provided below.
-//
+// Helpers are provided for most HLO instructions. These helpers can be called
+// with no arguments, in which case they will match any instruction matching the
+// opcode. They may also be called with matches for the operands and with an
+// optional capture. (The capture must be the first argument.) Some examples of
+// these helpers and their equivalents are provided below.
+
 // Example nullary instruction:
-//   Param()                        == Op().WithOpcode(HloOpcode::kParam)
-//   Param(&a)                      == Op(&a).WithOpcode(HloOpcode::kParam)
+//   Parameter()                    == Op().WithOpcode(HloOpcode::kParameter)
+//   Parameter(&a)                  == Op(&a).WithOpcode(HloOpcode::kParameter)
 //
 // Example unary instruction:
-//   Abs()                             == Op().WithOpcode(HloOpcode::kAbs)
-//   Abs(Op(&a))                       == Op().WithOpcode(HloOpcode::kAbs)
-//                                            .WithOperand(0, Op(&a)))
-//   Abs(&a, Op(&b))                   == Op(&a).WithOpcode(HloOpcode::kAbs)
-//                                              .WithOperand(0, Op(&b))
+//   Abs()                          == Op().WithOpcode(HloOpcode::kAbs)
+//   Abs(Op(&a))                    == Op().WithOpcode(HloOpcode::kAbs)
+//                                         .WithOperand(0, Op(&a)))
+//   Abs(&a, Op(&b))                == Op(&a).WithOpcode(HloOpcode::kAbs)
+//                                           .WithOperand(0, Op(&b))
+//
+// Commutative binary instructions have a special form that accepts either order
+// of args, e.g.:
+//
+//   AddAnyOrder(Parameter(1), Abs()) ==
+//     Op().WithOpcode(HloOpcode::kAdd)
+//         .WithBinaryOperandsAnyOrder(Op().WithParameterNum(1), Abs());
 //
-// Example binary instruction:
-//   Add()                             == Op().WithOpcode(HloOpcode::kAdd)
-//   Add(Op(&a), Op(&b))               == Op().WithOpcode(HloOpcode::kAdd)
-//                                            .WithOperand(0, Op(&a))
-//                                            .WithOperand(1, Op(&b))
-//   Add(&a, Op(&b), Op(&c))           == Op(&a).WithOpcode(HloOpcode::kAdd)
-//                                              .WithOperand(0, Op(&b))
-//                                              .WithOperand(1, Op(&c))
+//   MultiplyAnyOrder(&a, Parameter(), Abs())  // Captures the mul in `a`.
 //
-// Example ternary instruction:
-//   Clamp()                           == Op().WithOpcode(HloOpcode::kClamp)
-//   Clamp(Op(&a), Op(&b), Op(&c))     == Op().WithOpcode(HloOpcode::kClamp)
-//                                            .WithOperand(0, Op(&a))
-//                                            .WithOperand(1, Op(&b))
-//                                            .WithOperand(2, Op(&c))
-//   Clamp(&a, Op(&b), Op(&c), Op(&d)) == Op(&a).WithOpcode(HloOpcode::kClamp)
-//                                              .WithOperand(0, Op(&b))
-//                                              .WithOperand(1, Op(&c))
-//                                              .WithOperand(2, Op(&d))
+// The following additional helpers are provided.  In all cases, `&a` is
+// optional.
 //
+//   ConstantScalar(&a)               == Op(&a).IsConstantScalar();
+//   ConstantScalar(&a, v)            == Op(&a).IsConstantScalar(v);
+//   ConstantEffectiveScalar(&a)      == Op(&a).IsConstantEffectiveScalar();
+//   ConstantEffectiveScalar(&a, v)   == Op(&a).IsConstantEffectiveScalar(&a, v)
+//   NonConstant(&a)                  == Op(&a).IsNonConstant()
+//   GetTupleElement(&a, b, index)    == Op(&a).WithTupleIndex(index)
+//                                             .WithOperand(0, b);
+//   Parameter(&a, n)                 == Op(&a).WithParameterNum(n);
 
 struct MatchOption {
   // If true, actually capture matched item into the user pointer.
   bool capture;
+
+  // An explanation for why we failed to match is streamed here, if not-null.
+  std::ostream* explain_os;
 };
 
 template <typename Value, typename Pattern>
 bool Match(Value* value, const Pattern& pattern,
-           MatchOption option = {/*.capture=*/true}) {
+           MatchOption option = {/*.capture=*/true, /*.explain_os=*/nullptr}) {
   if (option.capture) {
     auto new_option = option;
     new_option.capture = false;
@@ -143,6 +162,77 @@ namespace match {
 
 namespace detail {
 
+// Macro for streaming to option.explain_os if it's not null.
+//
+//   EXPLAIN << "value of foo(): " << foo()
+//
+#pragma push_macro("EXPLAIN")
+#define EXPLAIN \
+  if (option.explain_os) *option.explain_os
+
+// kIndentInc is the additional number of spaces that we indent by when we
+// increase the indent "by one".
+enum {
+  kIndentInc = 2,
+};
+
+// Writes a newline and then `indent` spaces.
+//
+// We follow an unintuitive convention in this file's pretty-printers: Indents
+// are performed by the caller, not the callee.  For example, if you want to
+// print
+//
+//   foo:
+//    - bar
+//
+// you'd do:
+//
+//  Foo::DescribeTo(std::ostream* os, int64 indent) {
+//    *os << "foo:";
+//    Indent(os, indent)  // Create a newline at the *current* indent level.
+//    *os << " - ";
+//    bar.DescribeTo(os, indent + 3);  // + 3 because strlen(" * ") == 3.
+//  }
+//
+//  Bar::DescribeTo(std::ostream* os, int64 indent) { *os << "bar"; }
+//
+// Notice that Bar::DescribeTo() does not call Indent; the indenting is
+// performed by Foo.  This convention allows the caller to decide whether a
+// matcher is preceded by a newline, which is important e.g. for the AllOf
+// matcher.
+//
+// (Incidentally, indenting in Match's explanations is handled differently.
+// Indents are a common case in DescribeTo [we're printing a whole tree], but
+// they're a special case in Match [we're printing only a path through the tree
+// that encounters a failing node]. Indents in Match only appear when we
+// encounter a failing disjunction, so we just handle them as a special case
+// there.)
+inline void Indent(std::ostream* os, int64 indent) {
+  *os << "\n";
+  for (int64 i = 0; i < indent; ++i) {
+    *os << " ";
+  }
+}
+
+// SFINAE template that determines whether T declares a static member
+// kIsTrivialMatcher.
+//
+// Trivial matchers get special treatment.  For example, when printing
+// a conjunction of matchers, we don't print "and" after a trivial matcher. This
+// yields e.g.
+//    "a shape compatible with f32[1,2]"
+// rather than
+//    "a shape AND compatible with f32[1,2]"
+template <typename T, typename Dummy = void>
+struct IsTrivialMatcher {
+  static constexpr bool value = false;
+};
+template <typename T>
+struct IsTrivialMatcher<T,
+                        typename std::enable_if<T::kIsTrivialMatcher>::type> {
+  static constexpr bool value = true;
+};
+
 template <typename Item, typename... Patterns>
 class AllOfPattern {
  public:
@@ -162,10 +252,19 @@ class AllOfPattern {
     return matched;
   }
 
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    DescribeToImpl(os, std::integral_constant<size_t, 0>(), indent);
+  }
+
+  // Accessor for patterns_.  Please don't use this outside of this file.
+  const std::tuple<Patterns...>& patterns() const { return patterns_; }
+
  private:
   template <typename ItemType, size_t index>
   bool MatchImpl(ItemType* item, MatchOption option,
                  std::integral_constant<size_t, index>) const {
+    // We don't need to do any EXPLAINing here; it's all correctly handled by
+    // our sub-matchers (if any fail).
     return std::get<index>(patterns_).Match(item, option) &&
            MatchImpl(item, option, std::integral_constant<size_t, index + 1>());
   }
@@ -176,6 +275,73 @@ class AllOfPattern {
     return true;
   }
 
+  // Pretty-printing a conjunction has some special cases to make it easy to
+  // read in the simple (common) case.
+  //
+  // If sizeof...(Patterns) == 1, prints as e.g.
+  //
+  //   a shape
+  //
+  // If sizeof...(Patterns) == 2 and patterns_[0] is a trivial matcher (e.g. "a
+  // shape") prints as
+  //
+  //   a shape compatible with f32[1,2]
+  //
+  // If sizeof...(Patterns) > 2 and patterns_[0] is a trivial matcher, prints as
+  //
+  //   a shape:
+  //    * compatible with f32[1,2] AND
+  //    * that represents a scalar
+  //
+  // Otherwise prints as:
+  //
+  //   all of:
+  //    * foo AND
+  //    * bar
+  //
+  template <size_t index>
+  void DescribeToImpl(std::ostream* os, std::integral_constant<size_t, index>,
+                      int64 indent) const {
+    constexpr bool first_is_trivial =
+        IsTrivialMatcher<typename std::remove_reference<decltype(
+            std::get<0>(patterns_))>::type>::value;
+    constexpr bool is_last = index == sizeof...(Patterns) - 1;
+    const auto& submatcher = std::get<index>(patterns_);
+
+    auto print_bulleted_item = [&] {
+      *os << " * ";
+      submatcher.DescribeTo(os, indent + 3);
+      if (!is_last) {
+        *os << " AND";
+        Indent(os, indent);
+      }
+    };
+
+    if (index == 0) {
+      if (first_is_trivial || is_last) {
+        submatcher.DescribeTo(os, indent + kIndentInc);
+        if (sizeof...(Patterns) > 2) {
+          *os << ":";
+          Indent(os, indent);
+        }
+      } else {
+        *os << "all of:";
+        Indent(os, indent);
+        print_bulleted_item();
+      }
+    } else if (first_is_trivial && index == 1 && sizeof...(Patterns) == 2) {
+      *os << " ";
+      submatcher.DescribeTo(os, indent);
+    } else {
+      print_bulleted_item();
+    }
+    DescribeToImpl(os, std::integral_constant<size_t, index + 1>(), indent);
+  }
+
+  void DescribeToImpl(std::ostream* os,
+                      std::integral_constant<size_t, sizeof...(Patterns)>,
+                      int64 indent) const {}
+
   std::tuple<Patterns...> patterns_;
 };
 
@@ -183,10 +349,6 @@ class AllOfPattern {
 
 // Returns a pattern that represents the conjunction of all input patterns. All
 // patterns need to match in order to have the AllOf pattern match.
-//
-// TODO(timshen): Currently AllOf is still nested, e.g. AllOf<AllOf<A>, B> is
-// not AllOf<A, B>. We might want to flatten the AllOf type structure if the
-// C++ compile error message gets annoying.
 template <typename Item, typename... Patterns>
 detail::AllOfPattern<typename std::remove_const<Item>::type, Patterns...> AllOf(
     const Patterns&... patterns) {
@@ -194,6 +356,25 @@ detail::AllOfPattern<typename std::remove_const<Item>::type, Patterns...> AllOf(
                               Patterns...>(patterns...);
 }
 
+// AllOf<AllOf<A, B...>, X, Y, ...> => AllOf<A, B, ..., X, Y, ...>.
+//
+// This transformation is necessary for good pretty-printing.
+template <typename Item, typename... InnerPs, typename... OuterPs>
+detail::AllOfPattern<typename std::remove_const<Item>::type, InnerPs...,
+                     OuterPs...>
+AllOf(const detail::AllOfPattern<Item, InnerPs...>& inner_p,
+      const OuterPs&... outer_ps) {
+  // Invoke constructor of AllOfPattern<Item, InnerPs..., OuterPs...>.
+  auto make_all_of = [](const InnerPs&... inner_ps,
+                        const OuterPs&... outer_ps) {
+    return detail::AllOfPattern<typename std::remove_const<Item>::type,
+                                InnerPs..., OuterPs...>(inner_ps...,
+                                                        outer_ps...);
+  };
+  return absl::apply(make_all_of, std::tuple_cat(inner_p.patterns(),
+                                                 std::make_tuple(outer_ps...)));
+}
+
 namespace detail {
 
 template <typename LayoutType, typename Impl>
@@ -204,8 +385,18 @@ class LayoutPattern;
 class LayoutPatternBaseImpl {
  public:
   bool Match(const ::xla::Layout* layout, MatchOption option) const {
-    return layout != nullptr;
+    if (layout == nullptr) {
+      EXPLAIN << "Layout is null";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "a layout";
   }
+
+  static constexpr bool kIsTrivialMatcher = true;
 };
 
 // A LayoutPattern implementation that matches only if the layout equals a
@@ -216,7 +407,17 @@ class LayoutPatternEqualImpl {
       : layout_(layout) {}
 
   bool Match(const ::xla::Layout* layout, MatchOption option) const {
-    return LayoutUtil::Equal(*layout_, *layout);
+    if (!LayoutUtil::Equal(*layout_, *layout)) {
+      EXPLAIN << "Layout " << LayoutUtil::HumanString(*layout)
+              << " is not equal to expected "
+              << LayoutUtil::HumanString(*layout_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "equal to " << LayoutUtil::HumanString(*layout_);
   }
 
  private:
@@ -230,7 +431,16 @@ class LayoutPatternFormatImpl {
   explicit constexpr LayoutPatternFormatImpl(Format format) : format_(format) {}
 
   bool Match(const ::xla::Layout* layout, MatchOption option) const {
-    return layout->format() == format_;
+    if (layout->format() != format_) {
+      EXPLAIN << "Layout has format " << Format_Name(layout->format())
+              << " but expected " << Format_Name(format_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with format " << Format_Name(format_);
   }
 
  private:
@@ -242,11 +452,13 @@ template <typename LayoutType, typename Impl>
 class LayoutPattern {
  private:
   template <typename NewImpl>
-  LayoutPattern<LayoutType, AllOfPattern<::xla::Layout, Impl, NewImpl>>
-  AppendImpl(NewImpl new_impl) const {
-    return LayoutPattern<LayoutType,
-                         AllOfPattern<::xla::Layout, Impl, NewImpl>>(
-        AllOf<Layout>(impl_, std::move(new_impl)), matched_layout_);
+  auto AppendImpl(NewImpl new_impl) const
+      -> LayoutPattern<LayoutType,
+                       decltype(AllOf<Layout>(std::declval<Impl>(),
+                                              std::move(new_impl)))> {
+    auto new_allof = AllOf<Layout>(impl_, std::move(new_impl));
+    return LayoutPattern<LayoutType, decltype(new_allof)>(std::move(new_allof),
+                                                          matched_layout_);
   }
 
  public:
@@ -276,6 +488,10 @@ class LayoutPattern {
     return false;
   }
 
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    impl_.DescribeTo(os, indent);
+  }
+
   // Modifies the pattern to match only if the layout equals the given proto.
   // The layout must outlive the returned pattern.
   constexpr auto EqualTo(const ::xla::Layout* layout) const
@@ -306,19 +522,48 @@ class AnyOfPattern {
   explicit AnyOfPattern(const Patterns&... patterns) : patterns_(patterns...) {}
 
   bool Match(const Item* item, MatchOption option) const {
-    return MatchImpl(item, option, std::integral_constant<size_t, 0>());
+    return MatchImpl(item, option);
   }
 
   bool Match(Item* item, MatchOption option) const {
-    return MatchImpl(item, option, std::integral_constant<size_t, 0>());
+    return MatchImpl(item, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "any of:";
+    Indent(os, indent);
+    DescribeToImpl(os, std::integral_constant<size_t, 0>(), indent);
   }
 
  private:
+  template <typename ItemType>
+  bool MatchImpl(ItemType* item, MatchOption option) const {
+    // If we're generating an explanation, buffer it until we know we failed.
+    absl::optional<std::stringstream> explanation;
+    MatchOption new_option = option;
+    if (option.explain_os) {
+      new_option.explain_os = &explanation.emplace();
+    }
+    bool rv = MatchRecursiveImpl(item, new_option,
+                                 std::integral_constant<size_t, 0>());
+    if (!rv && option.explain_os) {
+      EXPLAIN << "None of the following matchers succeeded:";
+      EXPLAIN << explanation->str();
+    }
+    return rv;
+  }
+
   template <typename ItemType, size_t index>
-  bool MatchImpl(ItemType* item, MatchOption option,
-                 std::integral_constant<size_t, index>) const {
+  bool MatchRecursiveImpl(ItemType* item, MatchOption option,
+                          std::integral_constant<size_t, index>) const {
     auto new_option = option;
     new_option.capture = false;
+
+    absl::optional<std::stringstream> explanation;
+    if (option.explain_os) {
+      new_option.explain_os = &explanation.emplace();
+    }
+
     // Try to match the sub-pattern without capturing behavior.
     if (std::get<index>(patterns_).Match(item, new_option)) {
       // Capture the branch.
@@ -337,20 +582,46 @@ class AnyOfPattern {
         // AnyOf will be a runtime number indicate which sub-pattern is matched.
         // Then we run another pass to do captures only with the help of the
         // trace.
-        bool ret = std::get<index>(patterns_).Match(item, option);
-        DCHECK(ret);
+        bool matched = std::get<index>(patterns_).Match(item, option);
+        DCHECK(matched);
       }
       return true;
     }
-    return MatchImpl(item, option, std::integral_constant<size_t, index + 1>());
+    if (option.explain_os) {
+      EXPLAIN << "\nMatcher #" << index + 1;
+      EXPLAIN << "\n - ";
+      std::get<index>(patterns_).DescribeTo(option.explain_os, /*indent=*/3);
+      EXPLAIN << "\nfailed with";
+      EXPLAIN << "\n - ";
+      EXPLAIN << absl::StrReplaceAll(explanation->str(), {{"\n", "\n   "}});
+    }
+    return MatchRecursiveImpl(item, option,
+                              std::integral_constant<size_t, index + 1>());
   }
 
   template <typename ItemType>
-  bool MatchImpl(ItemType* item, MatchOption option,
-                 std::integral_constant<size_t, sizeof...(Patterns)>) const {
+  bool MatchRecursiveImpl(
+      ItemType* item, MatchOption option,
+      std::integral_constant<size_t, sizeof...(Patterns)>) const {
     return false;
   }
 
+  template <size_t index>
+  void DescribeToImpl(std::ostream* os, std::integral_constant<size_t, index>,
+                      int64 indent) const {
+    *os << " - ";
+    std::get<index>(patterns_).DescribeTo(os, indent + 3);
+    if (index != sizeof...(Patterns) - 1) {
+      *os << " OR";
+      Indent(os, indent);
+    }
+    DescribeToImpl(os, std::integral_constant<size_t, index + 1>(), indent);
+  }
+
+  void DescribeToImpl(std::ostream* os,
+                      std::integral_constant<size_t, sizeof...(Patterns)>,
+                      int64 indent) const {}
+
   std::tuple<Patterns...> patterns_;
 };
 
@@ -395,8 +666,17 @@ class ShapePattern;
 class ShapePatternBaseImpl {
  public:
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
+    if (shape == nullptr) {
+      EXPLAIN << "Shape is null";
+    }
     return shape != nullptr;
   }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "a shape";
+  }
+
+  static constexpr bool kIsTrivialMatcher = true;
 };
 
 // A ShapePattern implementation that matches only if the shape equals a Shape
@@ -407,7 +687,16 @@ class ShapePatternEqualImpl {
       : shape_(shape) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::Equal(*shape_, *shape);
+    if (!ShapeUtil::Equal(*shape_, *shape)) {
+      EXPLAIN << "Shape not equal to "
+              << ShapeUtil::HumanStringWithLayout(*shape_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "equal to " << ShapeUtil::HumanStringWithLayout(*shape_);
   }
 
  private:
@@ -422,7 +711,16 @@ class ShapePatternCompatibleImpl {
       : shape_(shape) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::Compatible(*shape_, *shape);
+    if (!ShapeUtil::Compatible(*shape_, *shape)) {
+      EXPLAIN << "Shape not compatible with "
+              << ShapeUtil::HumanString(*shape_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "compatible with " << ShapeUtil::HumanString(*shape_);
   }
 
  private:
@@ -437,7 +735,16 @@ class ShapePatternElementTypeImpl {
       : element_type_(element_type) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return shape->element_type() == element_type_;
+    if (shape->element_type() != element_type_) {
+      EXPLAIN << "Shape does not have element type "
+              << PrimitiveType_Name(element_type_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with element type " << PrimitiveType_Name(element_type_);
   }
 
  private:
@@ -450,7 +757,15 @@ class ShapePatternIsScalarImpl {
   explicit constexpr ShapePatternIsScalarImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IsScalar(*shape);
+    if (!ShapeUtil::IsScalar(*shape)) {
+      EXPLAIN << "Shape is not a scalar";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "that represents a scalar";
   }
 };
 
@@ -460,7 +775,15 @@ class ShapePatternIsArrayImpl {
   explicit constexpr ShapePatternIsArrayImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IsArray(*shape);
+    if (!ShapeUtil::IsArray(*shape)) {
+      EXPLAIN << "Shape is not an array";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "that represents an array";
   }
 };
 
@@ -470,7 +793,34 @@ class ShapePatternIsTupleImpl {
   explicit constexpr ShapePatternIsTupleImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IsTuple(*shape);
+    if (!ShapeUtil::IsTuple(*shape)) {
+      EXPLAIN << "Shape is not a tuple";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "that represents a tuple";
+  }
+};
+
+// A ShapePattern implementation that matches only if the shape is an effective
+// scalar.
+class ShapePatternEffectiveScalarImpl {
+ public:
+  explicit constexpr ShapePatternEffectiveScalarImpl() {}
+
+  bool Match(const ::xla::Shape* shape, MatchOption option) const {
+    if (!ShapeUtil::IsEffectiveScalar(*shape)) {
+      EXPLAIN << "Shape is not an effective scalar";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "that is an effective scalar";
   }
 };
 
@@ -481,7 +831,23 @@ class ShapePatternRankImpl {
   explicit constexpr ShapePatternRankImpl(int64 rank) : rank_(rank) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::Rank(*shape) == rank_;
+    if (ShapeUtil::Rank(*shape) != rank_) {
+      if (rank_ == 0) {
+        EXPLAIN << "Shape is not a scalar";
+      } else {
+        EXPLAIN << "Shape does not have rank " << rank_;
+      }
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    if (rank_ == 0) {
+      *os << "that is a scalar";
+    } else {
+      *os << "that has " << rank_ << " dimension" << (rank_ != 1 ? "s" : "");
+    }
   }
 
  private:
@@ -503,8 +869,21 @@ class ShapePatternLayoutImpl {
   }
 
   bool Match(Shape* shape, MatchOption option) const {
-    return LayoutUtil::HasLayout(*shape) &&
-           layout_.Match(shape->mutable_layout(), option);
+    if (!LayoutUtil::HasLayout(*shape)) {
+      EXPLAIN << "Shape does not have a layout";
+      return false;
+    }
+    if (!layout_.Match(shape->mutable_layout(), option)) {
+      EXPLAIN << "\nin layout";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with";
+    Indent(os, indent + kIndentInc);
+    layout_.DescribeTo(os, indent + kIndentInc);
   }
 
  private:
@@ -522,17 +901,40 @@ class ShapePatternSubshapeImpl {
       : index_(index), subshape_(subshape) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IndexIsValid(*shape, index_) &&
-           subshape_.Match(&ShapeUtil::GetSubshape(*shape, index_), option);
+    return MatchImpl(shape, option);
   }
 
   bool Match(::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IndexIsValid(*shape, index_) &&
-           subshape_.Match(ShapeUtil::GetMutableSubshape(shape, index_),
-                           option);
+    return MatchImpl(shape, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with subshape at index " << index_.ToString() << " which is";
+    Indent(os, indent + kIndentInc);
+    subshape_.DescribeTo(os, indent + kIndentInc);
   }
 
  private:
+  Shape* GetSubshape(Shape* shape) const {
+    return ShapeUtil::GetMutableSubshape(shape, index_);
+  }
+  const Shape* GetSubshape(const Shape* shape) const {
+    return &ShapeUtil::GetSubshape(*shape, index_);
+  }
+
+  template <typename ShapeType>
+  bool MatchImpl(ShapeType* shape, MatchOption option) const {
+    if (!ShapeUtil::IndexIsValid(*shape, index_)) {
+      EXPLAIN << "No subshape at " << index_.ToString();
+      return false;
+    }
+    if (!subshape_.Match(GetSubshape(shape), option)) {
+      EXPLAIN << "\nin subshape at " << index_.ToString();
+      return false;
+    }
+    return true;
+  }
+
   ShapeIndexView index_;
   ShapePattern<SubshapeType, SubshapeImpl> subshape_;
 };
@@ -542,10 +944,12 @@ template <typename ShapeType, typename Impl>
 class ShapePattern {
  private:
   template <typename NewImpl>
-  ShapePattern<ShapeType, AllOfPattern<::xla::Shape, Impl, NewImpl>> AppendImpl(
-      NewImpl new_impl) const {
-    return ShapePattern<ShapeType, AllOfPattern<::xla::Shape, Impl, NewImpl>>(
-        AllOf<Shape>(impl_, std::move(new_impl)), matched_shape_);
+  auto AppendImpl(NewImpl new_impl) const
+      -> ShapePattern<ShapeType, decltype(AllOf<Shape>(std::declval<Impl>(),
+                                                       std::move(new_impl)))> {
+    auto new_all_of = AllOf<Shape>(impl_, std::move(new_impl));
+    return ShapePattern<ShapeType, decltype(new_all_of)>(std::move(new_all_of),
+                                                         matched_shape_);
   }
 
  public:
@@ -560,6 +964,11 @@ class ShapePattern {
       }
       return true;
     }
+    if (shape) {
+      EXPLAIN << "\nin "
+              << (shape->has_layout() ? ShapeUtil::HumanStringWithLayout(*shape)
+                                      : ShapeUtil::HumanString(*shape));
+    }
     return false;
   }
 
@@ -571,9 +980,16 @@ class ShapePattern {
       }
       return true;
     }
+    EXPLAIN << "\nin "
+            << (shape->has_layout() ? ShapeUtil::HumanStringWithLayout(*shape)
+                                    : ShapeUtil::HumanString(*shape));
     return false;
   }
 
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    return impl_.DescribeTo(os, indent);
+  }
+
   // Modifies the pattern to match only if the shape equals the given proto.
   // The layout must outlive the returned pattern.
   constexpr auto EqualTo(const ::xla::Shape* shape) const
@@ -612,6 +1028,11 @@ class ShapePattern {
     return AppendImpl(ShapePatternIsTupleImpl());
   }
 
+  constexpr auto IsEffectiveScalar() const
+      -> decltype(this->AppendImpl(ShapePatternEffectiveScalarImpl())) {
+    return AppendImpl(ShapePatternEffectiveScalarImpl());
+  }
+
   // Modifies the pattern to match only if the shape has the given rank.
   constexpr auto WithRank(int64 rank) const
       -> decltype(this->AppendImpl(ShapePatternRankImpl(rank))) {
@@ -706,6 +1127,22 @@ Shape(::xla::Shape** matched_shape) {
 
 namespace detail {
 
+// Overloads to get a const or non-const operand out of an instruction.
+inline HloInstruction* HloOperand(HloInstruction* instr, int64 idx) {
+  return instr->mutable_operand(idx);
+}
+inline const HloInstruction* HloOperand(const HloInstruction* instr,
+                                        int64 idx) {
+  return instr->operand(idx);
+}
+
+// Pretty-printer for HloInstruction.  Sort of like ToShortString, but with
+// fewer %s and more shapes.
+inline string InstToString(const HloInstruction* inst) {
+  return inst->ToString(
+      HloPrintOptions().set_print_metadata(false).set_print_percent(false));
+}
+
 template <typename HloInstructionType, typename Impl>
 class HloInstructionPattern;
 
@@ -714,8 +1151,18 @@ class HloInstructionPattern;
 class HloInstructionPatternBaseImpl {
  public:
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return inst != nullptr;
+    if (inst == nullptr) {
+      EXPLAIN << "HloInstruction* is null";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "an HloInstruction";
   }
+
+  static constexpr bool kIsTrivialMatcher = true;
 };
 
 // An HloInstructionPattern implementation that matches only if the instruction
@@ -726,13 +1173,44 @@ class HloInstructionPatternNameImpl {
       : name_(name) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->name() == name_;
+    if (inst->name() != name_) {
+      EXPLAIN << "HloInstruction not named \"" << name_ << "\"";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "named \"" << name_ << "\"";
   }
 
  private:
   absl::string_view name_;
 };
 
+// An HloInstructionPattern implementation that matches only if the instruction
+// equals a particular pointer.
+class HloInstructionIsImpl {
+ public:
+  explicit HloInstructionIsImpl(const HloInstruction* inst) : inst_(inst) {}
+
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    if (inst != inst_) {
+      EXPLAIN << "HloInstruction " << inst << " is not " << inst_ << " ("
+              << InstToString(inst_) << ")";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which is " << inst_ << " (" << InstToString(inst_) << ")";
+  }
+
+ private:
+  const HloInstruction* inst_;
+};
+
 // An HloInstructionPattern implementation that matches only if the instruction
 // has a given opcode.
 class HloInstructionPatternOpcodeImpl {
@@ -742,7 +1220,25 @@ class HloInstructionPatternOpcodeImpl {
       : opcode_(opcode), invert_(invert) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return (invert_ ^ (inst->opcode() == opcode_));
+    if (invert_ && inst->opcode() == opcode_) {
+      EXPLAIN << "HloInstruction has opcode " << HloOpcodeString(opcode_)
+              << ", expected anything else";
+      return false;
+    }
+    if (!invert_ && inst->opcode() != opcode_) {
+      EXPLAIN << "HloInstruction doesn't have opcode "
+              << HloOpcodeString(opcode_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    if (!invert_) {
+      *os << "with opcode " << HloOpcodeString(opcode_);
+    } else {
+      *os << "with any opcode other than " << HloOpcodeString(opcode_);
+    }
   }
 
  private:
@@ -750,6 +1246,30 @@ class HloInstructionPatternOpcodeImpl {
   bool invert_;
 };
 
+// An HloInstructionPattern implementation that matches only if the instruction
+// has the given number of operands.
+class HloInstructionPatternNumOperandsImpl {
+ public:
+  explicit constexpr HloInstructionPatternNumOperandsImpl(int64 num_operands)
+      : num_operands_(num_operands) {}
+
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    if (inst->operand_count() != num_operands_) {
+      EXPLAIN << "HloInstruction doesn't have " << num_operands_ << " operands";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with " << num_operands_ << " operand"
+        << (num_operands_ != 1 ? "s" : "");
+  }
+
+ private:
+  int64 num_operands_;
+};
+
 // An HloInstructionPattern implementation that matches only if the instruction
 // has a shape that matches a given pattern.
 template <typename ShapeType, typename ShapeImpl>
@@ -760,11 +1280,25 @@ class HloInstructionPatternShapeImpl {
       : shape_(shape) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return shape_.Match(&inst->shape(), option);
+    if (!shape_.Match(&inst->shape(), option)) {
+      EXPLAIN << "\nin output shape";
+      return false;
+    }
+    return true;
   }
 
   bool Match(::xla::HloInstruction* inst, MatchOption option) const {
-    return shape_.Match(inst->mutable_shape(), option);
+    if (!shape_.Match(inst->mutable_shape(), option)) {
+      EXPLAIN << "\nin output shape";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "outputting";
+    Indent(os, indent + kIndentInc);
+    shape_.DescribeTo(os, indent + kIndentInc);
   }
 
  private:
@@ -782,20 +1316,197 @@ class HloInstructionPatternOperandImpl {
       : operand_index_(operand_index), operand_(operand) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return operand_index_ < inst->operand_count() &&
-           operand_.Match(inst->operand(operand_index_), option);
+    return MatchImpl(inst, option);
   }
 
   bool Match(::xla::HloInstruction* inst, MatchOption option) const {
-    return operand_index_ < inst->operand_count() &&
-           operand_.Match(inst->mutable_operand(operand_index_), option);
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with operand " << operand_index_ << " which is:";
+    Indent(os, indent + kIndentInc);
+    operand_.DescribeTo(os, indent + kIndentInc);
   }
 
  private:
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (operand_index_ >= inst->operand_count()) {
+      EXPLAIN << "desired operand index " << operand_index_
+              << " is out of bounds";
+      return false;
+    }
+    if (!operand_.Match(HloOperand(inst, operand_index_), option)) {
+      EXPLAIN << "\nin operand " << operand_index_;
+      return false;
+    }
+    return true;
+  }
+
   int64 operand_index_;
   HloInstructionPattern<OperandType, OperandImpl> operand_;
 };
 
+// Matches a binary instruction whose operands come in any order.
+template <typename OperandType1, typename OperandImpl1, typename OperandType2,
+          typename OperandImpl2>
+class HloInstructionPatternBinaryOperandsAnyOrderImpl {
+ public:
+  explicit constexpr HloInstructionPatternBinaryOperandsAnyOrderImpl(
+      const HloInstructionPattern<OperandType1, OperandImpl1>& op1,
+      const HloInstructionPattern<OperandType2, OperandImpl2>& op2)
+      : op1_(op1), op2_(op2) {}
+
+  bool Match(HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  bool Match(const HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with two operands in either order:";
+    Indent(os, indent);
+    *os << " - ";
+    op1_.DescribeTo(os, indent + 3);
+    Indent(os, indent);
+    *os << " - ";
+    op2_.DescribeTo(os, indent + 3);
+  }
+
+ private:
+  HloInstruction* operand(HloInstruction* inst, int64 idx) const {
+    return inst->mutable_operand(idx);
+  }
+  const HloInstruction* operand(const HloInstruction* inst, int64 idx) const {
+    return inst->operand(idx);
+  }
+
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    // We could implement this using AnyOf and AllOf matchers, but the templates
+    // get pretty difficult to debug, since any compile error herein becomes
+    // not-an-error via SFINAE.  Also this way lets us give better messages on
+    // failure.
+    if (inst->operand_count() != 2) {
+      EXPLAIN << "HloInstruction did not have two operands";
+      return false;
+    }
+
+    // If we're not generating explanations, this is pretty simple.
+    if (!option.explain_os) {
+      auto try_match = [&](int64 idx1, int64 idx2) {
+        MatchOption new_option = option;
+        new_option.capture = false;
+        if (op1_.Match(operand(inst, idx1), new_option) &&
+            op2_.Match(operand(inst, idx2), new_option)) {
+          if (option.capture) {
+            bool matched = op1_.Match(operand(inst, idx1), option) &&
+                           op2_.Match(operand(inst, idx2), option);
+            DCHECK(matched);
+          }
+          return true;
+        }
+        return false;
+      };
+      return try_match(0, 1) || try_match(1, 0);
+    }
+
+    // If we are generating explanations, we have some work to do in order to
+    // generate a helpful error.
+    //
+    // First, try all four operand/matcher combinations, recording the
+    // failure explanations separately from option.explain_os. matches[i][j]
+    // tells us if matcher_i matches operand j.
+    bool matches[/*matcher*/ 2][/*operand*/ 2];
+    std::stringstream explanations[/*matcher*/ 2][/*operand*/ 2];
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        MatchOption new_option = option;
+        new_option.capture = false;
+        new_option.explain_os = &explanations[i][j];
+        matches[i][j] = i == 0 ? op1_.Match(operand(inst, j), new_option)
+                               : op2_.Match(operand(inst, j), new_option);
+      }
+    }
+
+    // Check if the match succeeded.
+    for (int i = 0; i < 2; ++i) {
+      if (matches[0][i] && matches[1][(i + 1) % 2]) {
+        // Rerun the matches with capture enabled if necessary.
+        if (option.capture) {
+          auto* operand1 = operand(inst, i);
+          auto* operand2 = operand(inst, (i + 1) % 2);
+          bool matched =
+              op1_.Match(operand1, option) && op2_.Match(operand2, option);
+          DCHECK(matched);
+        }
+        return true;
+      }
+    }
+
+    auto describe_matcher = [&](int matcher_idx) {
+      EXPLAIN << "\n - ";
+      if (matcher_idx == 0) {
+        op1_.DescribeTo(option.explain_os, /*indent=*/3);
+      } else {
+        CHECK_EQ(matcher_idx, 1);
+        op2_.DescribeTo(option.explain_os, /*indent=*/3);
+      }
+      for (int i = 0; i < 2; ++i) {
+        if (matches[matcher_idx][/*operand*/ i]) {
+          continue;
+        }
+        EXPLAIN << "\ndoes not match " << (i == 0 ? "LHS" : "RHS") << ":\n";
+        EXPLAIN << " - ";
+        EXPLAIN << absl::StrReplaceAll(
+            explanations[matcher_idx][/*operand*/ i].str(), {{"\n", "\n   "}});
+      }
+    };
+
+    // If we failed to match, one of the following is true:
+    //  1. op1 (op2) matches neither LHS nor RHS, or
+    //  2. op1 and op2 both match LHS (RHS), but neither matches RHS (LHS).
+    // We print different explanations depending on which case we're in.
+
+    // Case 1.
+    bool wrote_explanation = false;
+    for (int i = 0; !wrote_explanation && i < 2; ++i) {
+      if (!matches[i][0] && !matches[i][1]) {
+        EXPLAIN << "HloInstruction's operands (ignoring order) did not match "
+                << (i == 0 ? "first" : "second") << " matcher.  Specifically,";
+        describe_matcher(i);
+        wrote_explanation = true;
+      }
+    }
+
+    // Case 2.
+    for (int i = 0; !wrote_explanation && i < 2; ++i) {
+      if (matches[/*matcher*/ 0][/*operand*/ i] &&
+          matches[/*matcher*/ 1][/*operand*/ i]) {
+        CHECK(!matches[0][(i + 1) % 2]);
+        CHECK(!matches[1][(i + 1) % 2]);
+        CHECK(!wrote_explanation);
+        EXPLAIN << "HloInstruction's " << (i == 1 ? "LHS" : "RHS")
+                << " operand did not match either of the two matchers.  "
+                   "Specifically,";
+        describe_matcher(0);
+        EXPLAIN << "\nand";
+        describe_matcher(1);
+        wrote_explanation = true;
+      }
+    }
+
+    CHECK(wrote_explanation);
+    return false;
+  }
+
+  HloInstructionPattern<OperandType1, OperandImpl1> op1_;
+  HloInstructionPattern<OperandType2, OperandImpl2> op2_;
+};
+
 // An HloInstructionPattern implementation that matches only if the instruction
 // is a fusion node with a particular kind.
 class HloInstructionPatternFusionKindImpl {
@@ -805,14 +1516,32 @@ class HloInstructionPatternFusionKindImpl {
       : kind_(kind) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->opcode() == HloOpcode::kFusion && inst->fusion_kind() == kind_;
+    return MatchImpl(inst, option);
   }
 
   bool Match(::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->opcode() == HloOpcode::kFusion && inst->fusion_kind() == kind_;
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with fusion kind " << ToString(kind_);
   }
 
  private:
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (inst->opcode() != HloOpcode::kFusion) {
+      EXPLAIN << "HloInstruction does not have fusion kind " << ToString(kind_)
+              << "; it's not a fusion";
+      return false;
+    }
+    if (inst->fusion_kind() != kind_) {
+      EXPLAIN << "HloInstruction does not have fusion kind " << ToString(kind_);
+      return false;
+    }
+    return true;
+  }
+
   ::xla::HloInstruction::FusionKind kind_;
 };
 
@@ -824,47 +1553,211 @@ class HloInstructionPatternTupleIndexImpl {
       : tuple_index_(tuple_index) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->opcode() == HloOpcode::kGetTupleElement &&
-           inst->tuple_index() == tuple_index_;
+    return MatchImpl(inst, option);
   }
 
   bool Match(::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->opcode() == HloOpcode::kGetTupleElement &&
-           inst->tuple_index() == tuple_index_;
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which is a GTE with index " << tuple_index_;
   }
 
  private:
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (inst->opcode() != HloOpcode::kGetTupleElement) {
+      EXPLAIN << "HloInstruction is not a GTE with index " << tuple_index_
+              << "; it's not a GTE at all";
+      return false;
+    }
+    if (inst->tuple_index() != tuple_index_) {
+      EXPLAIN << "HloInstruction is not a GTE with index " << tuple_index_;
+      return false;
+    }
+    return true;
+  }
+
   int64 tuple_index_;
 };
 
-template <typename ItemType, typename Predicate>
-class HloPredicatePatternImpl {
+class HloInstructionPatternParameterNumImpl {
  public:
-  explicit HloPredicatePatternImpl(Predicate pred) : pred_(std::move(pred)) {}
+  explicit constexpr HloInstructionPatternParameterNumImpl(int64 parameter_num)
+      : parameter_num_(parameter_num) {}
 
-  bool Match(const ItemType* item, MatchOption option) const {
-    return pred_(item);
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
   }
 
-  bool Match(ItemType* item, MatchOption option) const { return pred_(item); }
+  bool Match(::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which is parameter " << parameter_num_;
+  }
 
  private:
-  Predicate pred_;
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (inst->opcode() != HloOpcode::kParameter ||
+        inst->parameter_number() != parameter_num_) {
+      EXPLAIN << "HloInstruction is not parameter " << parameter_num_;
+      return false;
+    }
+    return true;
+  }
+
+  int64 parameter_num_;
 };
 
-struct PatternFriend;
+// Superclass that contains common code used by Op::WithOneUse() and
+// Op::WithOneUser().
+class HloInstructionPatternOneUseOrUserImpl {
+ protected:
+  bool MatchOneUser(const HloInstruction* inst, MatchOption option) const {
+    if (inst->user_count() != 1) {
+      EXPLAIN << "HloInstruction has " << inst->user_count()
+              << " users, but expected exactly one.";
+      if (inst->user_count() > 1) {
+        EXPLAIN << "\nAll users:";
+        for (const HloInstruction* user : inst->users()) {
+          EXPLAIN << "\n - " << InstToString(user);
+        }
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+class HloInstructionPatternOneUseImpl
+    : public HloInstructionPatternOneUseOrUserImpl {
+ public:
+  bool Match(const HloInstruction* inst, MatchOption option) const {
+    if (!MatchOneUser(inst, option)) {
+      return false;
+    }
+
+    int64 use_count = absl::c_count_if(
+        inst->users()[0]->operands(),
+        [&](const HloInstruction* operand) { return operand == inst; });
+    if (use_count != 1) {
+      EXPLAIN << "HloInstruction is used " << use_count
+              << " times by its user, but is expected to be used just once: "
+              << InstToString(inst->users()[0]);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which has exactly one use";
+  }
+};
+
+class HloInstructionPatternOneUserImpl
+    : public HloInstructionPatternOneUseOrUserImpl {
+ public:
+  bool Match(const HloInstruction* inst, MatchOption option) const {
+    return MatchOneUser(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which has exactly one user (but possibly is used multiple times by "
+           "that instruction)";
+  }
+};
+
+// Matches a constant scalar or effective scalar, optionally with a given value.
+template <typename ScalarTy>
+class HloConstantScalarImpl {
+ public:
+  explicit constexpr HloConstantScalarImpl(bool match_effective_scalar)
+      : val_(absl::nullopt), match_effective_scalar_(match_effective_scalar) {}
+
+  constexpr HloConstantScalarImpl(ScalarTy val, bool match_effective_scalar)
+      : val_(val), match_effective_scalar_(match_effective_scalar) {}
+
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  bool Match(::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which is a constant "
+        << (match_effective_scalar_ ? "effective " : "") << "scalar";
+    if (val_.has_value()) {
+      *os << " with value " << *val_;
+    }
+  }
+
+ private:
+  template <typename InstTy>
+  bool MatchImpl(InstTy* inst, MatchOption option) const {
+    const auto* const_inst = DynCast<HloConstantInstruction>(inst);
+    if (!const_inst) {
+      EXPLAIN << "HloInstruction is not a constant";
+      return false;
+    }
+    if (match_effective_scalar_ &&
+        !ShapeUtil::IsEffectiveScalar(inst->shape())) {
+      EXPLAIN << "HloInstruction is not an effective scalar";
+      return false;
+    }
+    if (!match_effective_scalar_ && !ShapeUtil::IsScalar(inst->shape())) {
+      EXPLAIN << "HloInstruction is not a scalar";
+      return false;
+    }
+    if (!val_.has_value()) {
+      return true;
+    }
+
+    // Check that literal == static_cast<LitearlTy>(val) and
+    // val == static_cast<ValTy>(literal).  This is sufficient to ensure that
+    // the two constant scalars are actually "equal".
+    auto val_literal = LiteralUtil::CreateR0(*val_);
+    auto literal_r0_or = const_inst->literal().Reshape({});
+    auto val_as_literal_ty_or =
+        val_literal.Convert(const_inst->shape().element_type());
+    if (!literal_r0_or.ok() || !val_as_literal_ty_or.ok()) {
+      EXPLAIN << "could not construct relevant Literals (how did this happen?)";
+      return false;
+    }
+    auto literal_r0 = std::move(literal_r0_or).ValueOrDie();
+    auto val_as_literal_ty = std::move(val_as_literal_ty_or).ValueOrDie();
+    auto literal_r0_as_val_ty_or =
+        literal_r0.Convert(val_literal.shape().element_type());
+    bool rv = literal_r0_as_val_ty_or.ok() &&  //
+              literal_r0_as_val_ty_or.ValueOrDie() == val_literal &&
+              literal_r0 == val_as_literal_ty;
+    if (!rv) {
+      EXPLAIN << "HloInstruction's constant value " << literal_r0.ToString()
+              << " did not match expected value " << *val_;
+    }
+    return rv;
+  }
+
+  absl::optional<ScalarTy> val_;
+  bool match_effective_scalar_;
+};
 
 // A pattern that matches HloInstructions.
 template <typename HloInstructionType, typename Impl>
 class HloInstructionPattern {
  private:
   template <typename NewImpl>
-  HloInstructionPattern<HloInstructionType,
-                        AllOfPattern<::xla::HloInstruction, Impl, NewImpl>>
-  AppendImpl(NewImpl new_impl) const {
-    return HloInstructionPattern<
-        HloInstructionType, AllOfPattern<::xla::HloInstruction, Impl, NewImpl>>(
-        AllOf<HloInstruction>(impl_, std::move(new_impl)), matched_inst_);
+  auto AppendImpl(NewImpl new_impl) const -> HloInstructionPattern<
+      HloInstructionType, decltype(AllOf<HloInstruction>(
+                              std::declval<Impl>(), std::move(new_impl)))> {
+    auto new_allof = AllOf<HloInstruction>(impl_, std::move(new_impl));
+    return HloInstructionPattern<HloInstructionType, decltype(new_allof)>(
+        std::move(new_allof), matched_inst_);
   }
 
  public:
@@ -880,6 +1773,9 @@ class HloInstructionPattern {
       }
       return true;
     }
+    if (inst != nullptr) {
+      EXPLAIN << "\nin " << InstToString(inst);
+    }
     return false;
   }
 
@@ -891,6 +1787,7 @@ class HloInstructionPattern {
       }
       return true;
     }
+    EXPLAIN << "\nin " << InstToString(inst);
     return false;
   }
 
@@ -907,6 +1804,11 @@ class HloInstructionPattern {
     return AppendImpl(HloInstructionPatternOpcodeImpl(opcode, false));
   }
 
+  auto WithNumOperands(int64 num_operands) const -> decltype(
+      this->AppendImpl(HloInstructionPatternNumOperandsImpl(num_operands))) {
+    return AppendImpl(HloInstructionPatternNumOperandsImpl(num_operands));
+  }
+
   // Modifies the pattern to match only if the instruction does not have the
   // given opcode.
   auto WithoutOpcode(HloOpcode opcode) const
@@ -915,12 +1817,47 @@ class HloInstructionPattern {
     return AppendImpl(HloInstructionPatternOpcodeImpl(opcode, true));
   }
 
+  constexpr auto Is(const HloInstruction* instr) const
+      -> decltype(this->AppendImpl(HloInstructionIsImpl(instr))) {
+    return AppendImpl(HloInstructionIsImpl(instr));
+  }
+
   // Modifies the pattern to match only if the instruction is a constant.
   constexpr auto IsConstant() const
       -> decltype(this->WithOpcode(HloOpcode::kConstant)) {
     return WithOpcode(HloOpcode::kConstant);
   }
 
+  constexpr auto IsConstantScalar() const -> decltype(this->AppendImpl(
+      HloConstantScalarImpl</*Dummy*/ int>(/*match_effective_scalar=*/false))) {
+    return AppendImpl(
+        HloConstantScalarImpl</*Dummy*/ int>(/*match_effective_scalar=*/false));
+  }
+
+  // This does not check that T has the same type as the instruction, so e.g.
+  // IsConstantScalar(1.0) may match a constant of shape int32[].
+  template <typename ScalarTy>
+  constexpr auto IsConstantScalar(const ScalarTy& val) const
+      -> decltype(this->AppendImpl(HloConstantScalarImpl<ScalarTy>(
+          val, /*match_effective_scalar=*/false))) {
+    return AppendImpl(
+        HloConstantScalarImpl<ScalarTy>(val, /*match_effective_scalar=*/false));
+  }
+
+  constexpr auto IsConstantEffectiveScalar() const -> decltype(this->AppendImpl(
+      HloConstantScalarImpl</*Dummy*/ int>(/*match_effective_scalar=*/true))) {
+    return AppendImpl(
+        HloConstantScalarImpl</*Dummy*/ int>(/*match_effective_scalar=*/true));
+  }
+
+  template <typename ScalarTy>
+  constexpr auto IsConstantEffectiveScalar(const ScalarTy& val) const
+      -> decltype(this->AppendImpl(HloConstantScalarImpl<ScalarTy>(
+          val, /*match_effective_scalar=*/true))) {
+    return AppendImpl(
+        HloConstantScalarImpl<ScalarTy>(val, /*match_effective_scalar=*/true));
+  }
+
   // Modifies the pattern to match only if the instruction is not a constant.
   constexpr auto IsNonConstant() const
       -> decltype(this->WithoutOpcode(HloOpcode::kConstant)) {
@@ -937,6 +1874,22 @@ class HloInstructionPattern {
         HloInstructionPatternShapeImpl<ShapeType, ShapeImpl>(shape));
   }
 
+  // Make this a templated function to work around gcc 4.9.4 template infinite
+  // recursion bug.
+  template <typename Dummy = void>
+  constexpr auto WithShapeEqualTo(const ::xla::Shape* shape)
+      -> decltype(this->WithShape(Shape().EqualTo(shape))) {
+    return WithShape(Shape().EqualTo(shape));
+  }
+
+  // Make this a templated function to work around gcc 4.9.4 template infinite
+  // recursion bug.
+  template <typename Dummy = void>
+  constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape)
+      -> decltype(this->WithShape(Shape().CompatibleTo(shape))) {
+    return WithShape(Shape().CompatibleTo(shape));
+  }
+
   // Modifies the pattern to match only if the instruction has an operand that
   // matches the given pattern.
   template <typename OperandType, typename OperandImpl>
@@ -951,6 +1904,20 @@ class HloInstructionPattern {
             operand_index, operand));
   }
 
+  template <typename OperandType1, typename OperandImpl1, typename OperandType2,
+            typename OperandImpl2>
+  constexpr auto WithBinaryOperandsAnyOrder(
+      const HloInstructionPattern<OperandType1, OperandImpl1>& op1,
+      const HloInstructionPattern<OperandType2, OperandImpl2>& op2) const
+      -> decltype(this->AppendImpl(
+          HloInstructionPatternBinaryOperandsAnyOrderImpl<
+              OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1,
+                                                                      op2))) {
+    return AppendImpl(
+        HloInstructionPatternBinaryOperandsAnyOrderImpl<
+            OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1, op2));
+  }
+
   // Modifies the pattern to match only if the instruction is a fusion node with
   // the given kind.
   constexpr auto WithFusionKind(HloInstruction::FusionKind kind) const
@@ -965,17 +1932,34 @@ class HloInstructionPattern {
     return AppendImpl(HloInstructionPatternTupleIndexImpl(tuple_index));
   }
 
- private:
-  template <typename Predicate>
-  constexpr auto WithPredicate(Predicate pred) const -> decltype(
-      this->AppendImpl(HloPredicatePatternImpl<HloInstruction, Predicate>(
-          std::move(pred)))) {
-    return AppendImpl(
-        HloPredicatePatternImpl<HloInstruction, Predicate>(std::move(pred)));
+  // Modifies the pattern to match only if the instruction is a parameter
+  // with the given parameter number.
+  constexpr auto WithParameterNum(int64 parameter_num) const -> decltype(
+      this->AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num))) {
+    return AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num));
   }
 
-  friend struct PatternFriend;
+  // Modifies the pattern to match if the instruction is used exactly once.
+  // Does not match if the instruction is used twice by the same user (e.g.
+  // multiply(x,x)).
+  constexpr auto WithOneUse() const
+      -> decltype(this->AppendImpl(HloInstructionPatternOneUseImpl())) {
+    return AppendImpl(HloInstructionPatternOneUseImpl());
+  }
 
+  // Modifies the pattern to match if the instruction is used by exactly one
+  // other instruction.  Will match if the instruction is used twice, so long as
+  // it's by the same user (e.g.  multiply(x,x)).
+  constexpr auto WithOneUser() const
+      -> decltype(this->AppendImpl(HloInstructionPatternOneUserImpl())) {
+    return AppendImpl(HloInstructionPatternOneUserImpl());
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    impl_.DescribeTo(os, indent);
+  }
+
+ private:
   Impl impl_;
   HloInstructionType** matched_inst_;
 };
@@ -1016,6 +2000,7 @@ Op(::xla::HloInstruction** matched_inst) {
 XLA_NULLOP_PATTERN(Constant)
 XLA_NULLOP_PATTERN(Parameter)
 XLA_NULLOP_PATTERN(Iota)
+XLA_NULLOP_PATTERN(Rng)
 #undef XLA_NULLOP_PATTERN
 
 // Helpers for unary instructions.
@@ -1047,8 +2032,10 @@ XLA_UNOP_PATTERN(RoundNearestAfz)
 XLA_UNOP_PATTERN(Bitcast)
 XLA_UNOP_PATTERN(Broadcast)
 XLA_UNOP_PATTERN(Ceil)
+XLA_UNOP_PATTERN(Convert)
 XLA_UNOP_PATTERN(Copy)
 XLA_UNOP_PATTERN(Cos)
+XLA_UNOP_PATTERN(CrossReplicaSum)
 XLA_UNOP_PATTERN(Exp)
 XLA_UNOP_PATTERN(Fft)
 XLA_UNOP_PATTERN(Floor)
@@ -1062,13 +2049,13 @@ XLA_UNOP_PATTERN(Negate)
 XLA_UNOP_PATTERN(Real)
 XLA_UNOP_PATTERN(Recv)
 XLA_UNOP_PATTERN(RecvDone)
-XLA_UNOP_PATTERN(Reduce)
 XLA_UNOP_PATTERN(ReducePrecision)
 XLA_UNOP_PATTERN(Reshape)
 XLA_UNOP_PATTERN(Reverse)
 XLA_UNOP_PATTERN(SendDone)
 XLA_UNOP_PATTERN(Sign)
 XLA_UNOP_PATTERN(Sin)
+XLA_UNOP_PATTERN(Slice)
 XLA_UNOP_PATTERN(Sort)
 XLA_UNOP_PATTERN(Tanh)
 XLA_UNOP_PATTERN(Transpose)
@@ -1106,25 +2093,32 @@ XLA_UNOP_PATTERN(Transpose)
 #define XLA_COMMUTATIVE_BINOP_PATTERN(NAME)                                 \
   XLA_BINOP_PATTERN(NAME)                                                   \
                                                                             \
-  template <typename Lhs, typename Rhs>                                     \
-  inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs)                          \
-      ->decltype(AnyOf<HloInstruction>(NAME(lhs, rhs), NAME(rhs, lhs))) {   \
-    return AnyOf<HloInstruction>(NAME(lhs, rhs), NAME(rhs, lhs));           \
-  }                                                                         \
-                                                                            \
   template <typename HloInstructionType, typename Lhs, typename Rhs>        \
   inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs,  \
                              Rhs&& rhs)                                     \
-      ->decltype(AnyOf<HloInstructionType>(NAME(matched_inst, lhs, rhs),    \
-                                           NAME(matched_inst, rhs, lhs))) { \
-    return AnyOf<HloInstructionType>(NAME(matched_inst, lhs, rhs),          \
-                                     NAME(matched_inst, rhs, lhs));         \
+      ->decltype(Op(matched_inst)                                           \
+                     .WithOpcode(HloOpcode::k##NAME)                        \
+                     .WithBinaryOperandsAnyOrder(std::forward<Lhs>(lhs),    \
+                                                 std::forward<Rhs>(rhs))) { \
+    return Op(matched_inst)                                                 \
+        .WithOpcode(HloOpcode::k##NAME)                                     \
+        .WithBinaryOperandsAnyOrder(std::forward<Lhs>(lhs),                 \
+                                    std::forward<Rhs>(rhs));                \
+  }                                                                         \
+  template <typename Lhs, typename Rhs>                                     \
+  inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs)                          \
+      ->decltype(NAME##AnyOrder<const HloInstruction>(                      \
+          nullptr, std::forward<Lhs>(lhs), std::forward<Rhs>(rhs))) {       \
+    return NAME##AnyOrder<const HloInstruction>(                            \
+        nullptr, std::forward<Lhs>(lhs), std::forward<Rhs>(rhs));           \
   }
 XLA_COMMUTATIVE_BINOP_PATTERN(Add)
 XLA_BINOP_PATTERN(Atan2)
 XLA_BINOP_PATTERN(Divide)
 XLA_BINOP_PATTERN(Complex)
+XLA_BINOP_PATTERN(Convolution)
 XLA_BINOP_PATTERN(Dot)
+XLA_BINOP_PATTERN(DynamicSlice)
 XLA_COMMUTATIVE_BINOP_PATTERN(Eq)
 XLA_BINOP_PATTERN(Gather)
 XLA_BINOP_PATTERN(Ge)
@@ -1136,7 +2130,9 @@ XLA_COMMUTATIVE_BINOP_PATTERN(Minimum)
 XLA_COMMUTATIVE_BINOP_PATTERN(Multiply)
 XLA_COMMUTATIVE_BINOP_PATTERN(Ne)
 XLA_BINOP_PATTERN(Outfeed)
+XLA_BINOP_PATTERN(Pad)
 XLA_BINOP_PATTERN(Power)
+XLA_BINOP_PATTERN(ReduceWindow)
 XLA_BINOP_PATTERN(Remainder)
 XLA_BINOP_PATTERN(Send)
 XLA_BINOP_PATTERN(Subtract)
@@ -1183,33 +2179,66 @@ XLA_BINOP_PATTERN(ShiftRightLogical)
         .WithOperand(2, std::forward<Arg2>(arg2));                     \
   }
 XLA_TERNOP_PATTERN(Clamp);
+XLA_TERNOP_PATTERN(Scatter);
 XLA_TERNOP_PATTERN(Select);
 #undef XLA_TERNOP_PATTERN
 
 namespace detail {
-struct PatternFriend {
-  template <typename T>
-  static auto ConstantScalar(T constant) -> decltype(
-      Constant()
-          .WithShape(match::Shape().IsScalar())
-          .WithPredicate(
-              std::declval<std::function<bool(const HloInstruction*)>>())) {
-    std::function<bool(const HloInstruction*)> pred =
-        [constant](const HloInstruction* instr) {
-          const auto& literal = Cast<HloConstantInstruction>(instr)->literal();
-          auto status_or_const = LiteralUtil::CreateR0(constant).Convert(
-              literal.shape().element_type());
-          return status_or_const.ok() &&
-                 literal == status_or_const.ConsumeValueOrDie();
-        };
-
-    return Constant()
-        .WithShape(match::Shape().IsScalar())
-        .WithPredicate(std::move(pred));
-  }
-};
+template <typename Matcher, typename FirstArg>
+inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg)
+    -> decltype(m.WithOperand(operand_num, std::forward<FirstArg>(first_arg))) {
+  return m.WithOperand(operand_num, std::forward<FirstArg>(first_arg));
+}
+
+template <typename Matcher, typename FirstArg, typename... Args>
+inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg,
+                         Args&&... args)
+    -> decltype(WithOperands(m.WithOperand(operand_num,
+                                           std::forward<FirstArg>(first_arg)),
+                             operand_num + 1, std::forward<Args>(args)...)) {
+  return WithOperands(
+      m.WithOperand(operand_num, std::forward<FirstArg>(first_arg)),
+      operand_num + 1, std::forward<Args>(args)...);
+}
 }  // namespace detail
 
+#define XLA_VARIADIC_OP_PATTERN(NAME)                                         \
+  inline auto NAME()->decltype(Op().WithOpcode(HloOpcode::k##NAME)) {         \
+    return Op().WithOpcode(HloOpcode::k##NAME);                               \
+  }                                                                           \
+                                                                              \
+  template <typename... Args>                                                 \
+  inline auto NAME(Args&&... args)                                            \
+      ->decltype(detail::WithOperands(Op().WithOpcode(HloOpcode::k##NAME)     \
+                                          .WithNumOperands(sizeof...(Args)),  \
+                                      0, std::forward<Args>(args)...)) {      \
+    return detail::WithOperands(                                              \
+        Op().WithOpcode(HloOpcode::k##NAME).WithNumOperands(sizeof...(Args)), \
+        /*operand_num=*/0, std::forward<Args>(args)...);                      \
+  }                                                                           \
+                                                                              \
+  template <typename HloInstructionType, typename... Args>                    \
+  inline auto NAME(HloInstructionType** matched_inst, Args&&... args)         \
+      ->decltype(detail::WithOperands(Op(matched_inst)                        \
+                                          .WithOpcode(HloOpcode::k##NAME)     \
+                                          .WithNumOperands(sizeof...(Args)),  \
+                                      0, std::forward<Args>(args)...)) {      \
+    return detail::WithOperands(Op(matched_inst)                              \
+                                    .WithOpcode(HloOpcode::k##NAME)           \
+                                    .WithNumOperands(sizeof...(Args)),        \
+                                /*operand_num=*/0,                            \
+                                std::forward<Args>(args)...);                 \
+  }
+
+// We could implement all ops as "variadic" ops, but it would make the
+// already-bad compile errors even worse.
+XLA_VARIADIC_OP_PATTERN(AfterAll);
+XLA_VARIADIC_OP_PATTERN(Concatenate);
+XLA_VARIADIC_OP_PATTERN(CustomCall);
+XLA_VARIADIC_OP_PATTERN(Map)
+XLA_VARIADIC_OP_PATTERN(Reduce);
+XLA_VARIADIC_OP_PATTERN(Tuple);
+
 // Helpers for matching non-constant instructions.
 inline auto NonConstant() -> decltype(Op().IsNonConstant()) {
   return Op().IsNonConstant();
@@ -1247,14 +2276,71 @@ inline auto GetTupleElement(HloInstructionType** matched_inst, Arg&& arg,
       .WithTupleIndex(tuple_index);
 }
 
-template <typename T>
-inline auto ConstantScalar(T constant)
-    -> decltype(detail::PatternFriend::ConstantScalar(constant)) {
-  return detail::PatternFriend::ConstantScalar(constant);
+// Add overloads for Parameter which take an int64 specifying the parameter
+// number.
+inline auto Parameter(int64 parameter_num) -> decltype(
+    Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num)) {
+  return Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num);
+}
+template <typename HloInstructionType>
+inline auto Parameter(HloInstructionType** matched_inst, int64 parameter_num)
+    -> decltype(Op(matched_inst)
+                    .WithOpcode(HloOpcode::kParameter)
+                    .WithParameterNum(parameter_num)) {
+  return Op(matched_inst)
+      .WithOpcode(HloOpcode::kParameter)
+      .WithParameterNum(parameter_num);
+}
+
+inline auto ConstantScalar() -> decltype(Op().IsConstantScalar()) {
+  return Op().IsConstantScalar();
+}
+
+template <typename HloInstructionType>
+inline auto ConstantScalar(HloInstructionType** matched_inst)
+    -> decltype(Op(matched_inst).IsConstantScalar()) {
+  return Op(matched_inst).IsConstantScalar();
+}
+
+template <typename ScalarTy>
+inline auto ConstantScalar(ScalarTy val)
+    -> decltype(Op().IsConstantScalar(val)) {
+  return Op().IsConstantScalar(val);
+}
+
+template <typename HloInstructionType, typename ScalarTy>
+inline auto ConstantScalar(HloInstructionType** matched_inst, ScalarTy val)
+    -> decltype(Op(matched_inst).IsConstantScalar(val)) {
+  return Op(matched_inst).IsConstantScalar(val);
+}
+
+inline auto ConstantEffectiveScalar() -> decltype(Op().IsConstantScalar()) {
+  return Op().IsConstantEffectiveScalar();
+}
+
+template <typename HloInstructionType>
+inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst)
+    -> decltype(Op(matched_inst).IsConstantScalar()) {
+  return Op(matched_inst).IsConstantEffectiveScalar();
+}
+
+template <typename ScalarTy>
+inline auto ConstantEffectiveScalar(ScalarTy val)
+    -> decltype(Op().IsConstantEffectiveScalar(val)) {
+  return Op().IsConstantEffectiveScalar(val);
+}
+
+template <typename HloInstructionType, typename ScalarTy>
+inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst,
+                                    ScalarTy val)
+    -> decltype(Op(matched_inst).IsConstantEffectiveScalar(val)) {
+  return Op(matched_inst).IsConstantEffectiveScalar(val);
 }
 
 }  // namespace match
 
 }  // namespace xla
 
+#undef EXPLAIN
+#pragma pop_macro("EXPLAIN")
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_gmock.h b/tensorflow/compiler/xla/service/pattern_matcher_gmock.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fe2d10a11b5b2d26ee222c63e0db2d55e361d12
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher_gmock.h
@@ -0,0 +1,92 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_GMOCK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_GMOCK_H_
+
+#include <ostream>
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+
+namespace pattern_matcher_gmock_detail {
+template <typename Pattern>
+class GmockMatcher {
+ public:
+  explicit GmockMatcher(Pattern p) : pattern_(std::move(p)) {}
+
+  // In service of better error messages, list out the overloads explicitly
+  // rather than just using a template.  gMock's polymorphism plus
+  // pattern_matcher yields some pretty gnarly stuff.
+  bool MatchAndExplain(const Layout& l,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(&l, listener);
+  }
+  bool MatchAndExplain(const Layout* l,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(l, listener);
+  }
+
+  bool MatchAndExplain(const Shape& s,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(&s, listener);
+  }
+  bool MatchAndExplain(const Shape* s,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(s, listener);
+  }
+
+  bool MatchAndExplain(const HloInstruction& instr,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(&instr, listener);
+  }
+  bool MatchAndExplain(const HloInstruction* instr,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(instr, listener);
+  }
+
+  void DescribeTo(std::ostream* os) const { pattern_.DescribeTo(os); }
+
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << "is NOT: ";
+    DescribeTo(os);
+  }
+
+ private:
+  template <typename T>
+  bool MatchAndExplainImpl(const T* t,
+                           ::testing::MatchResultListener* listener) const {
+    MatchOption options{/*.capture=*/true, /*.explain_os=*/listener->stream()};
+    return Match(t, pattern_, options);
+  }
+
+  Pattern pattern_;
+};
+}  // namespace pattern_matcher_gmock_detail
+
+template <typename Pattern>
+::testing::PolymorphicMatcher<
+    pattern_matcher_gmock_detail::GmockMatcher<Pattern>>
+GmockMatch(Pattern&& p) {
+  return ::testing::MakePolymorphicMatcher(
+      pattern_matcher_gmock_detail::GmockMatcher<Pattern>(
+          std::forward<Pattern>(p)));
+}
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_GMOCK_H_
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ca2fb05c1f7ef093c58237cf21fbc7c813a592a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+namespace m = ::xla::match;
+using ::testing::Eq;
+using ::testing::Not;
+
+template <typename MatchedTy>
+string Describe(const ::testing::Matcher<MatchedTy>& m) {
+  std::stringstream ss;
+  m.DescribeTo(&ss);
+  return ss.str();
+}
+
+template <typename MatchedTy>
+string Explain(
+    const MatchedTy& val,
+    const ::testing::Matcher<typename std::remove_cv<MatchedTy>::type>& m) {
+  ::testing::StringMatchResultListener listener;
+  EXPECT_THAT(val, ::testing::Not(m));  // For the error message.
+  EXPECT_FALSE(m.MatchAndExplain(val, &listener));
+  return listener.str();
+}
+
+// This file tests the GmockMatch function.  The actual explanation and
+// description returned by matchers is tested in pattern_matchers_test.
+TEST(PatternMatcherGmock, MatchShape) {
+  Shape s = ShapeUtil::MakeShape(F32, {10, 100});
+  // You can pass const Shape& or a const Shape*.
+  EXPECT_THAT(s, GmockMatch(m::Shape()));
+  EXPECT_THAT(&s, Not(GmockMatch(m::Shape().WithElementType(F16))));
+  EXPECT_THAT(Describe<Shape>(GmockMatch(m::Shape().IsArray())),
+              "a shape that represents an array");
+}
+
+TEST(PatternMatcherGmock, MatchLayout) {
+  Layout l = LayoutUtil::MakeLayout({0, 1});
+  EXPECT_THAT(l, GmockMatch(m::Layout()));
+  EXPECT_THAT(&l, Not(GmockMatch(m::Layout().WithSparseFormat())));
+  EXPECT_THAT(Describe<Layout>(GmockMatch(m::Layout().WithSparseFormat())),
+              "a layout with format SPARSE");
+}
+
+TEST(PatternMatchGmock, MatchInstruction) {
+  auto instr =
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {42}), "p");
+  EXPECT_THAT(instr.get(), GmockMatch(m::Parameter()));
+  EXPECT_THAT(*instr, GmockMatch(m::Parameter(0)));
+  EXPECT_THAT(*instr, Not(GmockMatch(m::Parameter(1))));
+  EXPECT_THAT(Describe<HloInstruction*>(GmockMatch(m::Parameter())),
+              "an HloInstruction with opcode parameter");
+}
+
+}  // anonymous namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
index 3ab7b7fd7168d7ddd1470fdb03a04ba7b171fddb..186ef0c7911a2724df810780e018f52586e3e6a8 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -14,14 +14,18 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
 
+namespace m = match;
+
 TEST(PatternMatcherTest, AddOp) {
   constexpr char kModuleStr[] = R"(HloModule two_plus_two_module
     ENTRY %two_plus_two_computation () -> f32[] {
@@ -229,23 +233,74 @@ TEST(PatternMatcherTest, AnyOf) {
 }
 
 TEST(PatternMatcherTest, ConstantScalar) {
-  constexpr char kModuleStr[] = R"(
-    HloModule test_module ENTRY test { ROOT constant = f16[] constant(42) })";
-  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr));
-  auto* root = hlo_module->entry_computation()->root_instruction();
-
-  EXPECT_TRUE(Match(root, match::ConstantScalar(42)));
-  EXPECT_FALSE(Match(root, match::ConstantScalar(41)));
-  EXPECT_FALSE(Match(root, match::ConstantScalar(0)));
-}
+  using match::ConstantEffectiveScalar;
+  using match::ConstantScalar;
+  using match::Op;
+  using match::Tuple;
 
-TEST(PatternMatcherTest, NoMatchConstantScalar) {
   constexpr char kModuleStr[] = R"(
-    HloModule test_module ENTRY test { ROOT v = f16[] parameter(0) })";
+    HloModule test_module
+    ENTRY test {
+      a = s32[] constant(1)
+      b = s32[1,1] constant(s32[1,1]{{2}})
+      c = s32[1,2] constant(s32[1,2]{{2,2}})
+      d = f32[] constant(1)
+      e = f32[] constant(1.25)
+      ROOT tuple = (s32[], s32[1,1], s32[1,2], f32[], f32[]) tuple(a,b,c,d,e)
+    })";
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr));
   auto* root = hlo_module->entry_computation()->root_instruction();
 
-  EXPECT_FALSE(Match(root, match::ConstantScalar(42)));
+  const HloInstruction* a = root->operand(0);
+  const HloInstruction* b = root->operand(1);
+  const HloInstruction* c = root->operand(2);
+  const HloInstruction* d = root->operand(3);
+  const HloInstruction* e = root->operand(4);
+  EXPECT_TRUE(Match(a, ConstantScalar()));
+  EXPECT_TRUE(Match(a, ConstantScalar(1)));
+  EXPECT_TRUE(Match(a, ConstantEffectiveScalar()));
+  EXPECT_TRUE(Match(a, ConstantEffectiveScalar(1)));
+  EXPECT_FALSE(Match(a, ConstantScalar(2)));
+  EXPECT_FALSE(Match(a, ConstantScalar(2.01)));
+  EXPECT_FALSE(Match(a, ConstantEffectiveScalar(2)));
+  EXPECT_FALSE(Match(a, ConstantEffectiveScalar(1.01)));
+
+  EXPECT_FALSE(Match(b, ConstantScalar()));
+  EXPECT_FALSE(Match(b, ConstantScalar(2)));
+  EXPECT_TRUE(Match(b, ConstantEffectiveScalar()));
+  EXPECT_TRUE(Match(b, ConstantEffectiveScalar(2)));
+
+  EXPECT_FALSE(Match(c, ConstantScalar()));
+  EXPECT_FALSE(Match(c, ConstantScalar(2)));
+  EXPECT_FALSE(Match(c, ConstantEffectiveScalar()));
+  EXPECT_FALSE(Match(c, ConstantEffectiveScalar(2)));
+
+  EXPECT_TRUE(Match(d, ConstantScalar(1)));
+  EXPECT_TRUE(Match(d, ConstantEffectiveScalar(1)));
+  EXPECT_TRUE(Match(d, ConstantScalar(1.0)));
+  EXPECT_TRUE(Match(d, ConstantEffectiveScalar(1.0)));
+
+  EXPECT_TRUE(Match(e, ConstantScalar(1.25f)));
+  EXPECT_TRUE(Match(e, ConstantScalar(1.25)));
+  EXPECT_TRUE(Match(e, ConstantEffectiveScalar(1.25)));
+  EXPECT_FALSE(Match(e, ConstantScalar(1)));
+  EXPECT_FALSE(Match(e, ConstantEffectiveScalar(1)));
+
+  const HloInstruction* instr = nullptr;
+  EXPECT_TRUE(Match(a, ConstantScalar(&instr)));
+  EXPECT_EQ(instr, a);
+
+  instr = nullptr;
+  EXPECT_TRUE(Match(a, ConstantScalar(&instr, 1)));
+  EXPECT_EQ(instr, a);
+
+  instr = nullptr;
+  EXPECT_TRUE(Match(a, ConstantEffectiveScalar(&instr)));
+  EXPECT_EQ(instr, a);
+
+  instr = nullptr;
+  EXPECT_TRUE(Match(a, ConstantEffectiveScalar(&instr, 1)));
+  EXPECT_EQ(instr, a);
 }
 
 TEST(PatternMatcherTest, MultiplyAnyOrder) {
@@ -267,6 +322,15 @@ TEST(PatternMatcherTest, MultiplyAnyOrder) {
       root, MultiplyAnyOrder(&instr, ConstantScalar(42), ConstantScalar(52))));
   EXPECT_TRUE(Match(
       root, MultiplyAnyOrder(&instr, ConstantScalar(52), ConstantScalar(42))));
+
+  // Check that MultiplyAnyOrder exposes the same API as Op(), so we can call
+  // e.g. IsNonConstant() on it.
+  EXPECT_TRUE(Match(
+      root, MultiplyAnyOrder(&instr, ConstantScalar(42), ConstantScalar(52))
+                .IsNonConstant()));
+  EXPECT_TRUE(
+      Match(root, MultiplyAnyOrder(ConstantScalar(42), ConstantScalar(52))
+                      .IsNonConstant()));
 }
 
 TEST(PatternMatcherTest, AnyOfShortCircuit) {
@@ -315,14 +379,22 @@ TEST(PatternMatcherTest, AllOf) {
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr));
   auto* root = hlo_module->entry_computation()->root_instruction();
 
+  auto f16_scalar = ShapeUtil::MakeShape(F16, {});
+  auto f16_pattern = Constant().WithShapeEqualTo(&f16_scalar);
+  auto f16_compatible_pattern = Constant().WithShapeCompatibleTo(&f16_scalar);
   auto scalar_pattern = Constant().WithShape(match::Shape().IsScalar());
-  auto f16_pattern = Constant().WithShape(match::Shape().WithElementType(F16));
   ASSERT_TRUE(Match(root, scalar_pattern));
   ASSERT_TRUE(Match(root, f16_pattern));
-  EXPECT_TRUE(Match(root, AllOf<HloInstruction>(scalar_pattern, f16_pattern)));
-  EXPECT_TRUE(Match(root, AllOf<HloInstruction>(f16_pattern, scalar_pattern)));
+  ASSERT_TRUE(Match(root, f16_compatible_pattern));
+  EXPECT_TRUE(Match(root, AllOf<HloInstruction>(scalar_pattern, f16_pattern,
+                                                f16_compatible_pattern)));
+  EXPECT_TRUE(
+      Match(root, AllOf<HloInstruction>(f16_pattern, f16_compatible_pattern,
+                                        scalar_pattern)));
   EXPECT_FALSE(
       Match(root, AllOf<HloInstruction>(Broadcast(Op()), f16_pattern)));
+  EXPECT_FALSE(Match(
+      root, AllOf<HloInstruction>(Broadcast(Op()), f16_compatible_pattern)));
   EXPECT_FALSE(
       Match(root, AllOf<HloInstruction>(Broadcast(Op()), scalar_pattern)));
 }
@@ -394,5 +466,470 @@ TEST(PatternMatcherTest, TestCaptureMatchedSubPatternForAnyOf) {
   EXPECT_EQ(nullptr, addend2);
 }
 
+TEST(PatternMatcherTest, TestConcat) {
+  using match::Concatenate;
+  using match::ConstantScalar;
+  using match::Op;
+  using match::Reshape;
+
+  constexpr char kModuleStr[] = R"(
+    HloModule test_module
+    ENTRY test {
+      c1 = u32[] constant(1)
+      c2 = u32[] constant(2)
+      c3 = u32[] constant(3)
+      c4 = u32[] constant(4)
+      r1 = u32[1] reshape(c1)
+      r2 = u32[1] reshape(c2)
+      r3 = u32[1] reshape(c3)
+      r4 = u32[1] reshape(c4)
+      ROOT concat = u32[4] concatenate(r1, r2, r3, r4), dimensions={0}
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr));
+  auto* root = hlo_module->entry_computation()->root_instruction();
+  ASSERT_TRUE(Match(
+      root,
+      Concatenate(Reshape(ConstantScalar(1)), Reshape(ConstantScalar(2)),
+                  Reshape(ConstantScalar(3)), Reshape(ConstantScalar(4)))));
+  ASSERT_FALSE(Match(
+      root,
+      Concatenate(Reshape(ConstantScalar(2)), Reshape(ConstantScalar(1)),
+                  Reshape(ConstantScalar(3)), Reshape(ConstantScalar(4)))));
+  ASSERT_FALSE(Match(
+      root, Concatenate(Reshape(ConstantScalar(1)), Reshape(ConstantScalar(2)),
+                        Reshape(ConstantScalar(3)))));
+  ASSERT_FALSE(Match(
+      root, Concatenate(Reshape(ConstantScalar(2)), Reshape(ConstantScalar(3)),
+                        Reshape(ConstantScalar(4)))));
+}
+
+template <typename Pattern>
+string Description(const Pattern& pattern) {
+  std::stringstream ss;
+  pattern.DescribeTo(&ss);
+  return ss.str();
+}
+
+template <typename Elem, typename Pattern>
+string Explanation(Elem* elem, const Pattern& pattern) {
+  std::stringstream ss;
+  MatchOption options{/*.capture=*/true, /*.explain_os=*/&ss};
+  Match(elem, pattern, options);
+  return ss.str();
+}
+template <typename Elem, typename Pattern>
+string Explanation(const std::unique_ptr<Elem>& elem, const Pattern& pattern) {
+  return Explanation(elem.get(), pattern);
+}
+template <typename Elem, typename Pattern>
+string Explanation(const Elem& elem, const Pattern& pattern) {
+  return Explanation(&elem, pattern);
+}
+
+// Helper macro for checking a pattern's description and the explanation printed
+// when attempting to match (and presumably failing) on a given object.
+//
+// We use a macro rather than a function because we want good line numbers in
+// errors.  We use this rather than writing a helper that returns a pair of
+// (description, explanation) and doing something like
+//
+//   EXPECT_THAT(DescAndExplanation(...), ::testing::Pair(..., ...));
+//
+// because EXPECT_EQ prints a unified diff if multiline string comparison fails,
+// while EXPECT_THAT does not.  This unified diff makes the errors much easier
+// to read.
+#define EXPECT_DESC_AND_EXPLANATION(elem, pattern, expected_desc,    \
+                                    expected_explanation)            \
+  do {                                                               \
+    EXPECT_EQ(Description(pattern), (expected_desc));                \
+    EXPECT_EQ(Explanation((elem), (pattern)), expected_explanation); \
+  } while (0)
+
+TEST(PatternMatcherTest, LayoutDescribeToAndExplain) {
+  auto layout = LayoutUtil::MakeLayout({1, 2});
+  auto layout2 = LayoutUtil::MakeLayout({2, 2});
+
+  EXPECT_DESC_AND_EXPLANATION(static_cast<const Layout*>(nullptr), m::Layout(),
+                              "a layout", "Layout is null");
+  EXPECT_DESC_AND_EXPLANATION(layout2, m::Layout().EqualTo(&layout),
+                              "a layout equal to {1,2}",
+                              "Layout {2,2} is not equal to expected {1,2}");
+  EXPECT_DESC_AND_EXPLANATION(layout2, m::Layout().WithSparseFormat(),
+                              "a layout with format SPARSE",
+                              "Layout has format DENSE but expected SPARSE");
+  EXPECT_DESC_AND_EXPLANATION(layout,
+                              m::Layout().EqualTo(&layout).WithSparseFormat(),
+                              "a layout:\n"
+                              " * equal to {1,2} AND\n"
+                              " * with format SPARSE",
+                              "Layout has format DENSE but expected SPARSE");
+}
+
+TEST(PatternMatcherTest, ShapeDescribeToAndExplain) {
+  auto shape = ShapeUtil::MakeShapeWithLayout(F32, {1, 2}, {0, 1});
+  auto layout = shape.layout();
+
+  EXPECT_DESC_AND_EXPLANATION(static_cast<const Shape*>(nullptr), m::Shape(),
+                              "a shape", "Shape is null");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2}, {1, 0}),
+      m::Shape().EqualTo(&shape), "a shape equal to f32[1,2]{0,1}",
+      "Shape not equal to f32[1,2]{0,1}\n"
+      "in f32[1,2]{1,0}");
+  EXPECT_DESC_AND_EXPLANATION(ShapeUtil::MakeShape(F32, {2, 2}),
+                              m::Shape().CompatibleTo(&shape),
+                              "a shape compatible with f32[1,2]",
+                              "Shape not compatible with f32[1,2]\n"
+                              "in f32[2,2]{1,0}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithElementType(F16),
+                              "a shape with element type F16",
+                              "Shape does not have element type F16\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().IsScalar(),
+                              "a shape that represents a scalar",
+                              "Shape is not a scalar\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(ShapeUtil::MakeNil(), m::Shape().IsArray(),
+                              "a shape that represents an array",
+                              "Shape is not an array\n"
+                              "in ()");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().IsTuple(),
+                              "a shape that represents a tuple",
+                              "Shape is not a tuple\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().IsEffectiveScalar(),
+                              "a shape that is an effective scalar",
+                              "Shape is not an effective scalar\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithRank(42),
+                              "a shape that has 42 dimensions",
+                              "Shape does not have rank 42\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithRank(0),
+                              "a shape that is a scalar",
+                              "Shape is not a scalar\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithRank(1).IsArray(),
+                              "a shape:\n"
+                              " * that has 1 dimension AND\n"
+                              " * that represents an array",
+                              "Shape does not have rank 1\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(ShapeUtil::MakeNil(),
+                              m::Shape().IsArray().WithRank(1),
+                              "a shape:\n"
+                              " * that represents an array AND\n"
+                              " * that has 1 dimension",
+                              "Shape is not an array\n"
+                              "in ()");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2}, {1, 0}),
+      m::Shape().WithLayoutEqualTo(&layout),
+      "a shape with\n  a layout equal to {0,1}",
+      "Layout {1,0} is not equal to expected {0,1}\n"
+      "in f32[1,2]{1,0}");
+  EXPECT_DESC_AND_EXPLANATION(
+      shape, m::Shape().WithLayout(m::Layout().WithSparseFormat()),
+      "a shape with\n  a layout with format SPARSE",
+      "Layout has format DENSE but expected SPARSE\n"
+      "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape,
+                              m::Shape().WithSubshapeEqualTo({10}, &shape),
+                              "a shape with subshape at index {10} which is\n"
+                              "  a shape equal to f32[1,2]{0,1}",
+                              "No subshape at {10}\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2, 2})}),
+      m::Shape().WithSubshapeEqualTo({0}, &shape),
+      "a shape with subshape at index {0} which is\n"
+      "  a shape equal to f32[1,2]{0,1}",
+      "Shape not equal to f32[1,2]{0,1}\n"
+      "in f32[2,2]{1,0}\n"
+      "in subshape at {0}\n"
+      "in (f32[2,2])");
+  EXPECT_DESC_AND_EXPLANATION(shape,
+                              m::Shape().WithSubshapeCompatibleTo({10}, &shape),
+                              "a shape with subshape at index {10} which is\n"
+                              "  a shape compatible with f32[1,2]",
+                              "No subshape at {10}\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2, 2})}),
+      m::Shape().WithSubshapeCompatibleTo({0}, &shape),
+      "a shape with subshape at index {0} which is\n"
+      "  a shape compatible with f32[1,2]",
+      "Shape not compatible with f32[1,2]\n"
+      "in f32[2,2]{1,0}\n"
+      "in subshape at {0}\n"
+      "in (f32[2,2])");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeTupleShape({shape})}),
+      m::Shape().WithSubshape({0, 0}, m::Shape().IsScalar()),
+      "a shape with subshape at index {0,0} which is\n"
+      "  a shape that represents a scalar",
+      "Shape is not a scalar\n"
+      "in f32[1,2]{0,1}\n"
+      "in subshape at {0,0}\n"
+      "in ((f32[1,2]))");
+}
+
+std::unique_ptr<HloInstruction> SetName(absl::string_view name,
+                                        std::unique_ptr<HloInstruction> instr) {
+  instr->SetAndSanitizeName(string(name));
+  return instr;
+}
+
+TEST(PatternMatcherTest, HloInstructionDescribeToAndExplain) {
+  std::unique_ptr<HloInstruction> iota =
+      SetName("i", HloInstruction::CreateIota(ShapeUtil::MakeShape(S32, {42}),
+                                              /*iota_dimension=*/0));
+  std::unique_ptr<HloInstruction> constant =
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(0)));
+
+  EXPECT_DESC_AND_EXPLANATION(static_cast<const HloInstruction*>(nullptr),
+                              m::Op(), "an HloInstruction",
+                              "HloInstruction* is null");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithName("foo"),
+                              "an HloInstruction named \"foo\"",
+                              "HloInstruction not named \"foo\"\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithOpcode(HloOpcode::kAdd),
+                              "an HloInstruction with opcode add",
+                              "HloInstruction doesn't have opcode add\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(
+      constant, m::Op().IsNonConstant(),
+      "an HloInstruction with any opcode other than constant",
+      "HloInstruction has opcode constant, expected anything else\n"
+      "in c = s32[] constant(0)");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithNumOperands(42),
+                              "an HloInstruction with 42 operands",
+                              "HloInstruction doesn't have 42 operands\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithShape(m::Shape().IsTuple()),
+                              "an HloInstruction outputting\n"
+                              "  a shape that represents a tuple",
+                              "Shape is not a tuple\n"
+                              "in s32[42]{0}\n"
+                              "in output shape\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(
+      iota, m::Op().WithOperand(2, m::Op().WithOpcode(HloOpcode::kAdd)),
+      "an HloInstruction with operand 2 which is:\n"
+      "  an HloInstruction with opcode add",
+      "desired operand index 2 is out of bounds\n"
+      "in i = s32[42]{0} iota(), iota_dimension=0");
+
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("a", HloInstruction::CreateBinary(ShapeUtil::MakeShape(S32, {}),
+                                                HloOpcode::kAdd, constant.get(),
+                                                constant.get())),
+      m::Op().WithOperand(1, m::Op().IsNonConstant()),
+      "an HloInstruction with operand 1 which is:\n"
+      "  an HloInstruction with any opcode other than constant",
+      "HloInstruction has opcode constant, expected anything else\n"
+      "in c = s32[] constant(0)\n"
+      "in operand 1\n"
+      "in a = s32[] add(s32[] c, s32[] c)");
+  EXPECT_DESC_AND_EXPLANATION(
+      iota, m::Op().WithFusionKind(HloInstruction::FusionKind::kLoop),
+      "an HloInstruction with fusion kind kLoop",
+      "HloInstruction does not have fusion kind kLoop; it's not a fusion\n"
+      "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(
+      iota, m::Op().WithTupleIndex(42),
+      "an HloInstruction which is a GTE with index 42",
+      "HloInstruction is not a GTE with index 42; it's not a GTE at all\n"
+      "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().IsConstantScalar(),
+                              "an HloInstruction which is a constant scalar",
+                              "HloInstruction is not a constant\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("c", HloInstruction::CreateConstant(
+                       LiteralUtil::CreateR1<int>({1, 2}))),
+      m::Op().IsConstantEffectiveScalar(),
+      "an HloInstruction which is a constant effective scalar",
+      "HloInstruction is not an effective scalar\n"
+      "in c = s32[2]{0} constant({1, 2})");
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(10))),
+      m::Op().IsConstantScalar(42),
+      "an HloInstruction which is a constant scalar with value 42",
+      "HloInstruction's constant value 10 did not match expected value 42\n"
+      "in c = s32[] constant(10)");
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.25))),
+      m::Op().IsConstantEffectiveScalar(1.25),
+      "an HloInstruction which is a constant effective scalar with value 1.25",
+      "HloInstruction's constant value 2.25 did not match expected value 1.25\n"
+      "in c = f64[] constant(2.25)");
+  EXPECT_DESC_AND_EXPLANATION(
+      constant, m::Op().Is(iota.get()),
+      absl::StrCat("an HloInstruction which is 0x", absl::Hex(iota.get()),
+                   " (i = s32[42]{0} iota(), iota_dimension=0)"),
+      absl::StrCat("HloInstruction 0x", absl::Hex(constant.get()), " is not 0x",
+                   absl::Hex(iota.get()),
+                   " (i = s32[42]{0} iota(), iota_dimension=0)\n"
+                   "in c = s32[] constant(0)"));
+}
+
+TEST(PatternMatcherTest, HloInstructionMatcherAnyOrderDescribeTo) {
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("a", HloInstruction::CreateBinary(
+                       scalar_s32, HloOpcode::kAdd,
+                       SetName("b", HloInstruction::CreateConstant(
+                                        LiteralUtil::CreateR0(0)))
+                           .get(),
+                       SetName("c", HloInstruction::CreateConstant(
+                                        LiteralUtil::CreateR0(0)))
+                           .get())),
+      m::AddAnyOrder(m::Op().WithName("b"), m::Op().WithName("bar")),
+      "an HloInstruction:\n"
+      " * with opcode add AND\n"
+      " * with two operands in either order:\n"
+      "    - an HloInstruction named \"b\"\n"
+      "    - an HloInstruction named \"bar\"",
+      "HloInstruction's operands (ignoring order) did not match second "
+      "matcher.  Specifically,\n"
+      " - an HloInstruction named \"bar\"\n"
+      "does not match LHS:\n"
+      " - HloInstruction not named \"bar\"\n"
+      "   in b = s32[] constant(0)\n"
+      "does not match RHS:\n"
+      " - HloInstruction not named \"bar\"\n"
+      "   in c = s32[] constant(0)\n"
+      "in a = s32[] add(s32[] b, s32[] c)");
+
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("a",
+              HloInstruction::CreateBinary(
+                  scalar_s32, HloOpcode::kAdd,
+                  HloInstruction::CreateParameter(0, scalar_s32, "p").get(),
+                  SetName("c", HloInstruction::CreateConstant(
+                                   LiteralUtil::CreateR0(0)))
+                      .get())),
+      m::AddAnyOrder(m::Op().IsConstantScalar(), m::Op().IsConstant()),
+      "an HloInstruction:\n"
+      " * with opcode add AND\n"
+      " * with two operands in either order:\n"
+      "    - an HloInstruction which is a constant scalar\n"
+      "    - an HloInstruction with opcode constant",
+      "HloInstruction's LHS operand did not match either of the two matchers.  "
+      "Specifically,\n"
+      " - an HloInstruction which is a constant scalar\n"
+      "does not match LHS:\n"
+      " - HloInstruction is not a constant\n"
+      "   in p = s32[] parameter(0)\n"
+      "and\n"
+      " - an HloInstruction with opcode constant\n"
+      "does not match LHS:\n"
+      " - HloInstruction doesn't have opcode constant\n"
+      "   in p = s32[] parameter(0)\n"
+      "in a = s32[] add(s32[] p, s32[] c)");
+}
+
+TEST(PatternMatcherTest, AnyOfMatcherDescribeToAndExplain) {
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))),
+      m::AnyOf<HloInstruction>(m::Op().WithName("foo"),
+                               m::Op().WithName("bar")),
+      "any of:\n"
+      " - an HloInstruction named \"foo\" OR\n"
+      " - an HloInstruction named \"bar\"",
+      "None of the following matchers succeeded:\n"
+      "Matcher #1\n"
+      " - an HloInstruction named \"foo\"\n"
+      "failed with\n"
+      " - HloInstruction not named \"foo\"\n"
+      "   in c = s32[] constant(0)\n"
+      "Matcher #2\n"
+      " - an HloInstruction named \"bar\"\n"
+      "failed with\n"
+      " - HloInstruction not named \"bar\"\n"
+      "   in c = s32[] constant(0)");
+}
+
+TEST(PatternMatcherTest, Parameter) {
+  auto param =
+      HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "p1");
+  auto non_param =
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(0)));
+  EXPECT_FALSE(Match(param.get(), m::Parameter(0)));
+  EXPECT_TRUE(Match(param.get(), m::Parameter()));
+  EXPECT_TRUE(Match(param.get(), m::Parameter(1)));
+  EXPECT_FALSE(Match(non_param.get(), m::Parameter()));
+  EXPECT_FALSE(Match(non_param.get(), m::Parameter(1)));
+
+  EXPECT_DESC_AND_EXPLANATION(non_param, m::Parameter(1),
+                              "an HloInstruction:\n"
+                              " * with opcode parameter AND\n"
+                              " * which is parameter 1",
+                              "HloInstruction doesn't have opcode parameter\n"
+                              "in c = s32[] constant(0)");
+  EXPECT_EQ(Explanation(HloInstruction::CreateParameter(
+                            0, ShapeUtil::MakeShape(F32, {}), "p0"),
+                        m::Parameter(1)),
+            "HloInstruction is not parameter 1\n"
+            "in p0 = f32[] parameter(0)");
+}
+
+TEST(PatternMatcherTest, OneUseAndOneUser) {
+  auto param =
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0");
+
+  EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse()));
+  EXPECT_DESC_AND_EXPLANATION(
+      param, m::Op().WithOneUse(),
+      "an HloInstruction which has exactly one use",
+      "HloInstruction has 0 users, but expected exactly one.\n"
+      "in p0 = f32[] parameter(0)");
+
+  EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser()));
+  EXPECT_DESC_AND_EXPLANATION(
+      param, m::Op().WithOneUser(),
+      "an HloInstruction which has exactly one user (but possibly is used "
+      "multiple times by that instruction)",
+      "HloInstruction has 0 users, but expected exactly one.\n"
+      "in p0 = f32[] parameter(0)");
+
+  {
+    auto reshape =
+        SetName("r", HloInstruction::CreateReshape(
+                         ShapeUtil::MakeShape(F32, {1}), param.get()));
+    EXPECT_TRUE(Match(param.get(), m::Op().WithOneUse()));
+    EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser()));
+
+    auto reshape1 =
+        SetName("r1", HloInstruction::CreateReshape(
+                          ShapeUtil::MakeShape(F32, {1}), param.get()));
+    EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse()));
+    EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser()));
+
+    const char* kMultipleUserExplanation =
+        "HloInstruction has 2 users, but expected exactly one.\n"
+        "All users:\n"
+        " - r = f32[1]{0} reshape(f32[] p0)\n"
+        " - r1 = f32[1]{0} reshape(f32[] p0)\n"
+        "in p0 = f32[] parameter(0)";
+    EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()),
+              kMultipleUserExplanation);
+    EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUser()),
+              kMultipleUserExplanation);
+  }
+
+  auto add = SetName("add", HloInstruction::CreateBinary(
+                                ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd,
+                                param.get(), param.get()));
+  EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser()));
+  EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse()));
+  EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()),
+            "HloInstruction is used 2 times by its user, but is expected to be "
+            "used just once: add = f32[] add(f32[] p0, f32[] p0)\n"
+            "in p0 = f32[] parameter(0)");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index c522e7ae23b734090f85d241bf365fccc37f0adb..c227106511c2c17b44569d3b696cd7d764226e81 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -21,7 +21,7 @@ limitations under the License.
 
 #include "absl/strings/ascii.h"
 #include "absl/strings/str_join.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -59,20 +59,15 @@ string CanonicalPlatformName(const string& name) {
 
 /* static */ StatusOr<std::vector<se::Platform*>>
 PlatformUtil::GetSupportedPlatforms() {
-  se::MultiPlatformManager::PlatformMap platform_map;
-  se::port::Status platforms_status = se::MultiPlatformManager::WithPlatforms(
-      [&platform_map](se::MultiPlatformManager::PlatformMap* map) {
-        platform_map = *map;
-        return se::port::Status::OK();
-      });
-  if (platform_map.empty()) {
+  std::vector<se::Platform*> all_platforms =
+      se::MultiPlatformManager::AllPlatforms();
+  if (all_platforms.empty()) {
     LOG(WARNING) << "no executor platforms available: platform map is empty";
   }
 
   // Gather all platforms which have an XLA compiler.
   std::vector<se::Platform*> platforms;
-  for (auto& platform_pair : platform_map) {
-    auto* platform = platform_pair.second;
+  for (se::Platform* platform : all_platforms) {
     auto compiler_status = Compiler::GetForPlatform(platform);
     if (compiler_status.ok()) {
       platforms.push_back(platform);
@@ -222,8 +217,8 @@ PlatformUtil::GetStreamExecutors(se::Platform* platform) {
     // fix the number of devices to one.  However we do let the user override
     // this behavior to help run tests on the host that run models in parallel
     // across multiple devices.
-    device_count = legacy_flags::GetDebugOptionsFromFlags()
-                       .xla_force_host_platform_device_count();
+    device_count =
+        GetDebugOptionsFromFlags().xla_force_host_platform_device_count();
   }
   std::vector<se::StreamExecutor*> stream_executors(device_count, nullptr);
   VLOG(1) << "Initializing devices";
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
index 688cceff0cd10df62a4093f00ad3331ca77652e0..b70cb7057477a338bfb36ebab76237b30d018e41 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc
@@ -111,7 +111,7 @@ StatusOr<bool> ReducePrecisionInsertion::insert_on_inputs(
       VLOG(2) << "Adding to operand " << i << ": " << operand;
 
       if (!is_valid_shape(operand->shape())) {
-        VLOG(2) << "Skipped: value is not an F32 vector";
+        VLOG(2) << "Skipped: value is not of type F32";
         continue;
       }
 
@@ -168,7 +168,7 @@ StatusOr<bool> ReducePrecisionInsertion::insert_on_outputs(
             << instruction->ToString();
 
     if (!is_valid_shape(instruction->shape())) {
-      VLOG(2) << "Skipped: value is not an F32 nonscalar array";
+      VLOG(2) << "Skipped: value is not of type F32";
       continue;
     }
 
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.h b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
index 0b4e82e8d606cf2cacfab42d07c2201939d5e10b..76c6a87f176ec9c6f8e49c25278c6dad703e3c7c 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion.h
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.h
@@ -118,13 +118,7 @@ class ReducePrecisionInsertion : public HloModulePass {
     // equivalent behavior can be obtained by adding ReducePrecision
     // instructions after the instructions that pull the F32 arrays out of
     // the tuples.
-    //
-    // TODO(b/64093391): Remove the IsScalar check once this won't cause
-    // failures on the GPU backend if the ReducePrecision instruction ends up
-    // inserted between a scalar constant and the init_value argument of a
-    // Reduce operation.
-    return shape.element_type() == PrimitiveType::F32 &&
-           !ShapeUtil::IsScalar(shape);
+    return shape.element_type() == PrimitiveType::F32;
   }
 
   // Is this instruction one such that following or preceding it with a new
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
index 69e4b534bd8e3aeab8b729f3e594a10b4368f15f..efeec96571455d8a9e4b7837dd7286392c12f1a3 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
@@ -54,7 +54,34 @@ TEST_F(ReducePrecisionInsertionTest, BeforeUnaryInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  // Confirm expected state before adding ops.
+  EXPECT_EQ(computation->root_instruction(), b);
+  EXPECT_EQ(b->operand(0), a);
+
+  EXPECT_TRUE(InsertOps(module.get(), HloReducePrecisionOptions::OP_INPUTS,
+                        [](const HloInstruction* instruction) {
+                          return instruction->opcode() == HloOpcode::kCos;
+                        }));
+
+  // Confirm expected graph after adding ops.
+  EXPECT_EQ(computation->root_instruction(), b);
+  EXPECT_THAT(b->operand(0), op::ReducePrecision(a));
+}
+
+TEST_F(ReducePrecisionInsertionTest, BeforeUnaryScalarInstruction) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {});
+
+  // Create a simple graph with a parameter feeding a unary cosine function.
+  HloInstruction* a =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  HloInstruction* b = builder.AddInstruction(
+      HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
+
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -84,7 +111,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeBinaryInstruction) {
   HloInstruction* c = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -113,7 +140,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeZeroInputInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -146,7 +173,7 @@ TEST_F(ReducePrecisionInsertionTest, AvoidAddingDuplicateInstructions) {
   HloInstruction* d = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, b, c));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -178,7 +205,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterRootInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -215,7 +242,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterNonRootInstruction) {
   HloInstruction* c = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a_cos, b_cos));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -242,7 +269,7 @@ TEST_F(ReducePrecisionInsertionTest, OutputIsNotFloat) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -268,7 +295,7 @@ TEST_F(ReducePrecisionInsertionTest, ShouldReduceOutputPrecisionIsFalse) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -294,7 +321,7 @@ TEST_F(ReducePrecisionInsertionTest, InsertionIsNotRecursive) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateReducePrecision(shape, a, 8, 23));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -321,7 +348,7 @@ TEST_F(ReducePrecisionInsertionTest, SkipRedundantReducePrecisionAfter) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateReducePrecision(shape, x, 5, 10));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -349,7 +376,7 @@ TEST_F(ReducePrecisionInsertionTest, AddNonRedundantReducePrecision) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateReducePrecision(shape, x, 8, 23));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -375,7 +402,7 @@ TEST_F(ReducePrecisionInsertionTest, IgnoreOpsInsideFusionNode) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Manually fuse the kCos operation into a fusion operation.
@@ -411,7 +438,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInHeadOfFusionNode) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Manually fuse the kCos operation into a fusion operation.
@@ -458,7 +485,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInTailOfFusionNode) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Manually fuse the kCos operation into a fusion operation.
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index fcf269eee925c2ddb7511d70e71bd815e4b8c24a..341659b15c4c7355d39739ee171a4a749d87e929 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -34,9 +34,10 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
-class ReshapeMoverTest : public HloVerifiedTestBase {};
+class ReshapeMoverTest : public HloTestBase {};
 
 TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -50,12 +51,12 @@ TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(param1)));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(param1)));
@@ -74,6 +75,7 @@ TEST_F(ReshapeMoverTest, ReshapesWithDifferentInputShapesNotMoved) {
 // Verifies that the reshape is not moved, since rng0 is trivially reshapable
 // and therefore there is no nontrivial reshapes to move.
 TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto rng0 = builder.AddInstruction(HloInstruction::CreateRng(
@@ -92,18 +94,19 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd1ReshapesOnRngNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, const1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(rng0), const1));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(rng0), const1));
 }
 
 TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -117,12 +120,12 @@ TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(param1)));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -130,6 +133,7 @@ TEST_F(ReshapeMoverTest, ScalarReshapesNotMoved) {
 }
 
 TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -143,11 +147,11 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(param1)));
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Add(param0, param1)));
@@ -177,6 +181,7 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMoved) {
 // |
 // reshape4
 TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {2, 3});
   auto const0 = builder.AddInstruction(
@@ -196,12 +201,12 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) {
   builder.AddInstruction(HloInstruction::CreateTernary(
       root_shape, HloOpcode::kSelect, const0, reshape1, reshape2));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Select(const0, reshape1, reshape2));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Select(op::Reshape(const0), param1, param2)));
@@ -221,6 +226,7 @@ TEST_F(ReshapeMoverTest, 1ConstantAnd2ReshapesMoved) {
 // Verifies that the reshape0 does not sink below add, because param1 is not
 // trivially reshapable nor is a Reshape/Transpose.
 TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -232,11 +238,11 @@ TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, param1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), param1));
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), param1));
@@ -257,6 +263,7 @@ TEST_F(ReshapeMoverTest, 1ParameterAnd1ReshapeNotMoved) {
 // Verifies that we don't unnecessarily sink reshapes, which are in fact
 // trivial reshapes.
 TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {3, 2});
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -275,12 +282,12 @@ TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) {
   builder.AddInstruction(HloInstruction::CreateTernary(
       root_shape, HloOpcode::kSelect, pred, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Select(pred, op::Reshape(const0), op::Reshape(const1)));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Select(pred, op::Reshape(const0), op::Reshape(const1)));
@@ -309,6 +316,7 @@ TEST_F(ReshapeMoverTest, 2TrivialConstantReshapeNotMoved) {
 //
 // (note that reshape1 here is trivial).
 TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {2, 3});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -320,12 +328,12 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, const1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), const1));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Add(param0, op::Reshape(const1))));
@@ -348,6 +356,7 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeMoved) {
 // For now we treat it as non-trivial, so we verify that we don't sink the
 // reshapes in this case.
 TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {1, 1, 3});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -362,12 +371,12 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(const1)));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Add(op::Reshape(param0), op::Reshape(const1)));
@@ -376,6 +385,7 @@ TEST_F(ReshapeMoverTest, 1NonTrivialReshapeWith1ReshapedConstNotMoved) {
 }
 
 TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto param0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -389,14 +399,14 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       root_shape, HloOpcode::kAdd, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   computation->CreateFusionInstruction({add},
                                        HloInstruction::FusionKind::kLoop);
 
   EXPECT_THAT(computation->root_instruction(),
               op::Fusion(op::Reshape(param0), op::Reshape(param1)));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Fusion(param0, param1)));
@@ -405,6 +415,7 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossFusion) {
 }
 
 TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {8, 7});
   auto pred_shape = ShapeUtil::MakeShape(PRED, {8, 7});
@@ -423,13 +434,13 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) {
   builder.AddInstruction(HloInstruction::CreateTernary(
       root_shape, HloOpcode::kSelect, reshape_pred, reshape0, reshape1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
       op::Select(op::Reshape(pred), op::Reshape(param0), op::Reshape(param1)));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Reshape(op::Select(pred, param0, param1)));
@@ -438,6 +449,7 @@ TEST_F(ReshapeMoverTest, EquivalentReshapesMovedAcrossSelect) {
 }
 
 TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
+  auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   auto root_shape = ShapeUtil::MakeShape(F32, {});
   auto pred_shape = ShapeUtil::MakeShape(PRED, {});
@@ -452,11 +464,11 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
   auto select = builder.AddInstruction(HloInstruction::CreateTernary(
       root_shape, HloOpcode::kSelect, reshape_pred, param0, param1));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->root_instruction(),
               op::Select(op::Reshape(pred), param0, param1));
 
-  EXPECT_FALSE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_FALSE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Select(op::Reshape(pred), param0, param1));
@@ -477,6 +489,7 @@ TEST_F(ReshapeMoverTest, ScalarReshapeNotMovedAcrossSelect) {
 //
 // We expect reshape{0,1} AND reshape{2,3} to be lifted.
 TEST_F(ReshapeMoverTest, MultiplePasses) {
+  auto m = CreateNewVerifiedModule();
   auto shape1 = ShapeUtil::MakeShape(F32, {1, 8, 1, 7});
   auto shape2 = ShapeUtil::MakeShape(F32, {8, 7, 1});
   auto shape3 = ShapeUtil::MakeShape(F32, {8, 7});
@@ -500,14 +513,14 @@ TEST_F(ReshapeMoverTest, MultiplePasses) {
   builder.AddInstruction(HloInstruction::CreateBinary(shape3, HloOpcode::kAdd,
                                                       reshape2, reshape3));
 
-  auto computation = module().AddEntryComputation(builder.Build());
+  auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(
       computation->root_instruction(),
       op::Add(op::Reshape(param2),
               op::Reshape(op::Add(op::Reshape(param0), op::Reshape(param1)))));
 
-  EXPECT_TRUE(ReshapeMover().Run(&module()).ValueOrDie());
+  EXPECT_TRUE(ReshapeMover().Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
@@ -526,11 +539,11 @@ TEST_F(ReshapeMoverTest, SinkTransposeAcrossBroadcastScalar) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get()));
   EXPECT_TRUE(changed);
 
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Transpose(op::Multiply()));
 }
 
@@ -555,8 +568,8 @@ TEST_F(ReshapeMoverTest, ReshapeWithUsersOutsideCandidatesNotSink) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get()));
   EXPECT_FALSE(changed);
 }
 
@@ -580,10 +593,10 @@ TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink1) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get()));
   EXPECT_TRUE(changed);
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Reshape(), op::Reshape(), op::Reshape()));
 }
 
@@ -597,10 +610,10 @@ TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink2) {
     }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(m.get()));
   EXPECT_TRUE(changed);
-  EXPECT_THAT(module().entry_computation()->root_instruction(),
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Reshape(op::Add()));
 }
 
diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc
index de7aee262e61195b37099fc661a95508d0539e18..11c2f8392d285095816dd5d61f7029c1bfd158d4 100644
--- a/tensorflow/compiler/xla/service/scatter_expander.cc
+++ b/tensorflow/compiler/xla/service/scatter_expander.cc
@@ -392,7 +392,8 @@ StatusOr<HloInstruction*> ScatterExpander::ExpandScatter(
           [&](HloInstruction* induction_var,
               const std::vector<HloInstruction*>& loop_state) {
             return ScatterLoopBody(scatter, induction_var, loop_state);
-          });
+          },
+          scatter->metadata());
   TF_ASSIGN_OR_RETURN(std::vector<HloInstruction*> scatter_loop_result,
                       scatter_loop_result_status);
   return scatter_loop_result.front();
diff --git a/tensorflow/compiler/xla/service/scatter_expander.h b/tensorflow/compiler/xla/service/scatter_expander.h
index 559a85dccfef27816e7dbf746fd71c44bbf46f60..533af060bc9f943e5bc2882db626e25c77484029 100644
--- a/tensorflow/compiler/xla/service/scatter_expander.h
+++ b/tensorflow/compiler/xla/service/scatter_expander.h
@@ -25,7 +25,7 @@ class ScatterExpander : public HloModulePass {
   absl::string_view name() const override { return "scatter_expander"; }
   StatusOr<bool> Run(HloModule* module) override;
 
- private:
+ protected:
   StatusOr<HloInstruction*> ExpandScatter(HloInstruction* scatter);
 };
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index b27a92f2a0761a2bccd97eb2c0467ead27565c37..5ec7fe2adedac2fc3d8a7588e853dba90e99006f 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -23,9 +23,9 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -175,7 +176,14 @@ Status Service::CreateChannelHandle(const CreateChannelHandleRequest* arg,
 
 Status Service::Unregister(const UnregisterRequest* arg,
                            UnregisterResponse* result) {
-  return allocation_tracker_.Unregister(arg->data());
+  Status status;
+  for (auto& data : arg->data()) {
+    Status unregister_status = allocation_tracker_.Unregister(data);
+    if (!unregister_status.ok() && status.ok()) {
+      status = unregister_status;
+    }
+  }
+  return status;
 }
 
 // Deconstructs a previously-allocated global handle.
@@ -207,7 +215,7 @@ Status Service::ValidateResultShape(const Shape& client_shape,
 StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
 Service::ResolveAndValidateArguments(
     absl::Span<const GlobalDataHandle* const> arguments,
-    absl::Span<se::StreamExecutor* const> stream_executors) {
+    absl::Span<se::StreamExecutor* const> stream_executors) const {
   CHECK_EQ(options_.number_of_replicas(), stream_executors.size());
   std::vector<std::vector<const ShapedBuffer*>> replicated_arguments;
   replicated_arguments.resize(options_.number_of_replicas());
@@ -268,8 +276,8 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
   }
   if (execution_options != nullptr &&
       execution_options->has_shape_with_output_layout()) {
-    const auto& shape_with_output_layout =
-        execution_options->shape_with_output_layout();
+    const Shape shape_with_output_layout(
+        execution_options->shape_with_output_layout());
     TF_RETURN_IF_ERROR(
         ValidateResultShape(shape_with_output_layout, program_shape.result()));
     TF_RETURN_IF_ERROR(
@@ -285,7 +293,7 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     config->set_seed(execution_options->seed());
     config->set_debug_options(execution_options->debug_options());
   } else {
-    config->set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    config->set_debug_options(GetDebugOptionsFromFlags());
   }
 
   if (execute_backend_ != nullptr &&
@@ -341,19 +349,19 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
   }
 
   CHECK_EQ(module_protos.size(), module_configs.size());
-  std::vector<std::unique_ptr<HloModule>> modules;
+  auto module_group =
+      absl::make_unique<HloModuleGroup>(module_protos[0]->name());
   for (int64 i = 0; i < module_protos.size(); ++i) {
     const HloModuleProto* proto = module_protos[i];
     const HloModuleConfig& config = *module_configs[i];
-    TF_ASSIGN_OR_RETURN(auto module,
-                        HloModule::CreateFromProto(*proto, config));
-    modules.push_back(std::move(module));
+    TF_ASSIGN_OR_RETURN(auto module, CreateModuleFromProto(*proto, config));
+    module_group->push_back(std::move(module));
   }
 
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<Executable>> executables,
-      backend->compiler()->Compile(std::move(modules), std::move(executors),
-                                   device_allocator));
+      backend->compiler()->Compile(std::move(module_group),
+                                   std::move(executors), device_allocator));
 
   for (size_t i = 0; i < module_protos.size(); ++i) {
     if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) {
@@ -590,7 +598,7 @@ StatusOr<std::vector<se::StreamExecutor*>> Service::GetExecutors(
 
 StatusOr<std::vector<std::vector<const ShapedBuffer*>>> Service::GetArguments(
     const ExecutionOptions& execution_options,
-    absl::Span<const GlobalDataHandle* const> arguments) {
+    absl::Span<const GlobalDataHandle* const> arguments) const {
   // Resolve the allocations for the arguments of the computation, and create
   // a vector of device memory offsets for the arguments from the allocations.
   // In the case of partitioned computations, assume all arguments go on the
@@ -634,7 +642,7 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
         arg->requests(i).execution_options();
     const ExecuteGraphRequest& request = arg->requests(i);
     TF_RET_CHECK(request.has_computation()) << "computations may not be empty";
-    TF_RET_CHECK(request.computation().has_program_shape())
+    TF_RET_CHECK(request.computation().has_host_program_shape())
         << "programe shape may not be empty";
 
     // Get the executors.
@@ -651,9 +659,9 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
     // replica 0.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
-        CreateModuleConfig(request.computation().program_shape(),
-                           replicated_arguments.front(),
-                           request.execution_options()));
+        CreateModuleConfig(
+            ProgramShape{request.computation().host_program_shape()},
+            replicated_arguments.front(), request.execution_options()));
     VLOG(3)
         << "ExecuteGraphParallel created HloModuleConfig computation layout: "
         << module_config->entry_computation_layout().ToString();
@@ -738,9 +746,9 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
   }
   if (available_device_count < arg->device_count() * replica_count) {
     return ResourceExhausted(
-        "Requested device count (%d) exceeds the number of available devices "
-        "on the target (%d)",
-        arg->device_count(), available_device_count);
+        "Requested logical device count (%d) with replica count (%d) exceeds "
+        "the number of available physical devices on the target (%d)",
+        arg->device_count(), replica_count, available_device_count);
   }
 
   for (int64 i = 0; i < arg->device_count(); ++i) {
@@ -753,38 +761,6 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
   return Status::OK();
 }
 
-Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
-                              ExecuteResponse* result) {
-  ExecuteGraphParallelRequest parallel_arg;
-  *parallel_arg.add_requests() = *arg;
-  ExecuteParallelResponse parallel_result;
-  TF_RETURN_IF_ERROR(ExecuteGraphParallel(&parallel_arg, &parallel_result));
-  return PickParallelResponse(parallel_result, result);
-}
-
-Status Service::PickParallelResponse(
-    const ExecuteParallelResponse& parallel_result, ExecuteResponse* result) {
-  // The "result device" selection is a bit hacky, but better than assuming it
-  // is device 0. We have b/76035356 for restructuring the client API to clean
-  // up the current asymmetries and support more functionalities.
-  for (int64 i = 0; i < parallel_result.responses_size(); ++i) {
-    TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
-                        allocation_tracker_.ResolveForReplica(
-                            parallel_result.responses(i).output(), 0));
-    const Shape& shape = buffer->on_host_shape();
-    if (!ShapeUtil::IsEmptyTuple(shape)) {
-      *result = parallel_result.responses(i);
-      VLOG(3) << "Fetching result from device " << i << ": "
-              << ShapeUtil::HumanString(shape);
-      return Status::OK();
-    }
-  }
-  TF_RET_CHECK(parallel_result.responses_size() > 0);
-  *result = parallel_result.responses(0);
-  VLOG(1) << "Defaulting to device 0 result";
-  return Status::OK();
-}
-
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const HloModuleProto& module_proto,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
@@ -810,7 +786,7 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
   }
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      HloModule::CreateFromProto(module_proto, *module_config));
+                      CreateModuleFromProto(module_proto, *module_config));
 
   TF_RETURN_IF_ERROR(MaybeDumpUnoptimizedHloModule(*module));
 
@@ -829,32 +805,33 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
   return std::move(executable);
 }
 
-Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
-                             ExecuteResponse* result) {
-  VLOG(1) << "running execute-graph request";
-
+Status Service::Compile(const CompileRequest* arg, CompileResponse* result) {
+  VLOG(1) << "running compile request";
   if (!arg->has_computation()) {
     return InvalidArgument("computations may not be empty");
   }
-  if (!arg->computation().has_program_shape()) {
+  if (!arg->computation().has_host_program_shape()) {
     return InvalidArgument("programe shape may not be empty");
   }
 
-  // If we received multiple device handles, we must partition the module.
   if (arg->execution_options().device_handles_size() > 1) {
-    return ExecuteOneToN(arg, result);
+    return InvalidArgument(
+        "The compile request does not support multiple device handles.");
   }
 
-  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
-                                              SingleComputationDeviceHandle()));
+  std::vector<Shape> argument_shapes;
+  argument_shapes.reserve(arg->input_shape_with_layout_size());
+  std::vector<const Shape*> argument_shape_ptrs;
+  for (const ShapeProto& shape_proto : arg->input_shape_with_layout()) {
+    argument_shapes.push_back(Shape(shape_proto));
+    argument_shape_ptrs.push_back(&argument_shapes.back());
+  }
   TF_ASSIGN_OR_RETURN(
-      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
-      ResolveAndValidateArguments(arg->arguments(), replicas));
-
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(arg->computation().program_shape(),
-                                         replicated_arguments.front(),
-                                         arg->execution_options()));
+      std::unique_ptr<HloModuleConfig> module_config,
+      CreateModuleConfig(ProgramShape{arg->computation().host_program_shape()},
+                         argument_shape_ptrs, &arg->execution_options()));
+  VLOG(3) << "Compile created HloModuleConfig computation layout: "
+          << module_config->entry_computation_layout().ToString();
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
@@ -863,6 +840,48 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
                       execute_backend_->default_stream_executor(),
                       /*device_allocator=*/nullptr));
 
+  *result->mutable_handle() = compilation_cache_.Insert(std::move(executable));
+
+  VLOG(1) << "successfully completed 'compile' request";
+  return Status::OK();
+}
+
+Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) {
+  VLOG(1) << "running execute request";
+  if (!arg->has_handle()) {
+    return InvalidArgument("execution handle should not be empty");
+  }
+  TF_ASSIGN_OR_RETURN(auto executable,
+                      compilation_cache_.LookUp(arg->handle()));
+
+  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
+                                              SingleComputationDeviceHandle()));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arg->arguments(), replicas));
+
+  // Check that the replicated_arguments has the same shape and layout as the
+  // module config used when creating the exectuable.
+  const int64 num_module_args =
+      executable->module_config().entry_computation_layout().parameter_count();
+  if (num_module_args != arg->arguments_size()) {
+    return InvalidArgument(
+        "The executable expects %lld arguments, but sees %lld.",
+        num_module_args, arg->arguments_size());
+  }
+  for (int64 i = 0; i < num_module_args; i++) {
+    const Shape& shape_module =
+        executable->module_config().entry_computation_layout().parameter_shape(
+            i);
+    const Shape& shape_arg = replicated_arguments.front()[i]->on_host_shape();
+    if (!ShapeUtil::Equal(shape_module, shape_arg)) {
+      return InvalidArgumentStrCat(
+          "The executable exepcts the ", i, "th argument in shape ",
+          ShapeUtil::HumanStringWithLayout(shape_module), " but sees ",
+          ShapeUtil::HumanStringWithLayout(shape_arg));
+    }
+  }
+
   TF_ASSIGN_OR_RETURN(auto stream,
                       execute_backend_->BorrowStream(
                           execute_backend_->default_stream_executor()));
@@ -876,9 +895,10 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
 
   TF_ASSIGN_OR_RETURN(
       *result->mutable_output(),
-      ExecuteAndRegisterResult(
-          executable.get(), replicated_arguments, execute_backend_.get(),
-          "result of " + arg->computation().name(), result->mutable_profile()));
+      ExecuteAndRegisterResult(executable.get(), replicated_arguments,
+                               execute_backend_.get(),
+                               "result of " + executable->module().name(),
+                               result->mutable_profile()));
 
   if (executable->dumping_snapshot()) {
     TF_ASSIGN_OR_RETURN(
@@ -890,7 +910,7 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
     TF_RETURN_IF_ERROR(executable->DumpHloSnapshot());
   }
 
-  VLOG(1) << "successfully completed 'execute-graph' request";
+  VLOG(1) << "successfully completed 'execute' request";
   return Status::OK();
 }
 
@@ -914,14 +934,14 @@ Status Service::TransferToClient(const TransferToClientRequest* arg,
   TF_ASSIGN_OR_RETURN(const ShapedBuffer* shaped_buffer,
                       allocation_tracker_.ResolveForReplica(arg->data(), 0));
 
-  const Shape* return_shape;
+  Shape return_shape;
   if (arg->has_shape_with_layout()) {
-    if (!LayoutUtil::HasLayout(arg->shape_with_layout())) {
+    return_shape = Shape(arg->shape_with_layout());
+    if (!LayoutUtil::HasLayout(return_shape)) {
       return InvalidArgument("shape_with_layout must have layout if present.");
     }
-    return_shape = &arg->shape_with_layout();
   } else {
-    return_shape = &shaped_buffer->on_host_shape();
+    return_shape = Shape(shaped_buffer->on_host_shape());
   }
 
   TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream(
@@ -932,30 +952,15 @@ Status Service::TransferToClient(const TransferToClientRequest* arg,
       execute_backend_->transfer_manager()->TransferLiteralFromDevice(
           stream.get(), *shaped_buffer));
 
-  if (LayoutUtil::LayoutsInShapesEqual(*return_shape, result_literal.shape())) {
+  if (LayoutUtil::LayoutsInShapesEqual(return_shape, result_literal.shape())) {
     *result->mutable_literal() = result_literal.ToProto();
   } else {
     *result->mutable_literal() =
-        result_literal.Relayout(*return_shape).ToProto();
+        result_literal.Relayout(return_shape).ToProto();
   }
   return Status::OK();
 }
 
-namespace {
-
-// Creates a clone of the given shaped buffer with the given device ordinal. The
-// shape and DeviceMemoryBase values of the clone are identical to the original.
-std::unique_ptr<ShapedBuffer> CloneShapedBufferOnDevice(
-    const ShapedBuffer& shaped_buffer, int device_ordinal) {
-  auto clone = absl::make_unique<ShapedBuffer>(
-      shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape(),
-      shaped_buffer.platform(), device_ordinal);
-  clone->buffers() = shaped_buffer.buffers();
-  return clone;
-}
-
-}  // namespace
-
 Status Service::TransferToServer(const TransferToServerRequest* arg,
                                  TransferToServerResponse* result) {
   TF_ASSIGN_OR_RETURN(Literal literal,
@@ -1044,11 +1049,11 @@ Status Service::TransferFromOutfeed(const TransferFromOutfeedRequest* arg,
     executor = replicas[arg->replica_id()];
   }
 
-  auto literal = Literal::CreateFromShape(arg->shape_with_layout());
+  auto literal = Literal::CreateFromShape(Shape(arg->shape_with_layout()));
 
   TF_RETURN_IF_ERROR(
       execute_backend_->transfer_manager()->TransferLiteralFromOutfeed(
-          executor, arg->shape_with_layout(), literal));
+          executor, Shape(arg->shape_with_layout()), literal));
   *result->mutable_literal() = literal.ToProto();
   return Status::OK();
 }
@@ -1063,15 +1068,15 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
   if (!arg->has_computation()) {
     return InvalidArgument("computations may not be empty");
   }
-  if (!arg->computation().has_program_shape()) {
+  if (!arg->computation().has_host_program_shape()) {
     return InvalidArgument("program shape may not be empty");
   }
-  if (arg->computation().program_shape().parameters_size() != 0) {
+  if (arg->computation().host_program_shape().parameters_size() != 0) {
     return InvalidArgument(
         "constant computation may not depend on any parameters.");
   }
 
-  ProgramShape program_shape = arg->computation().program_shape();
+  ProgramShape program_shape(arg->computation().host_program_shape());
   TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
   if (arg->has_output_layout()) {
     TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(
@@ -1081,7 +1086,7 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
   HloModuleConfig config(program_shape);
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      HloModule::CreateFromProto(arg->computation(), config));
+                      CreateModuleFromProto(arg->computation(), config));
 
   HloEvaluator evaluator;
   TF_ASSIGN_OR_RETURN(auto result_literal, evaluator.Evaluate<Literal>(
@@ -1102,7 +1107,7 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
 Status Service::GetShape(const GetShapeRequest* arg, GetShapeResponse* result) {
   TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
                       allocation_tracker_.ResolveForReplica(arg->data(), 0));
-  *result->mutable_shape() = buffer->on_host_shape();
+  *result->mutable_shape() = buffer->on_host_shape().ToProto();
   return Status::OK();
 }
 
@@ -1111,14 +1116,14 @@ Status Service::GetComputationGraphStats(
   if (!arg->has_computation()) {
     return InvalidArgument("Computations may not be empty.");
   }
-  if (!arg->computation().has_program_shape()) {
+  if (!arg->computation().has_host_program_shape()) {
     return InvalidArgument("Program shape may not be empty.");
   }
 
-  HloModuleConfig config(arg->computation().program_shape());
+  HloModuleConfig config(ProgramShape{arg->computation().host_program_shape()});
   config.set_debug_options(arg->debug_options());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      HloModule::CreateFromProto(arg->computation(), config));
+                      CreateModuleFromProto(arg->computation(), config));
 
   hlo_graph_dumper::MaybeDumpHloModule(*module,
                                        "computation statistics subject");
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 1f62fad4c8079eba7013b3f647fe19bbc031fc77..11e1a79552fbd944ab28da129b08cfe676fb08e9 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -22,11 +22,12 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/allocation_tracker.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/channel_tracker.h"
+#include "tensorflow/compiler/xla/service/compilation_cache.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/execution_tracker.h"
@@ -90,11 +91,14 @@ class Service : public ServiceInterface {
   Status DeconstructTuple(const DeconstructTupleRequest* arg,
                           DeconstructTupleResponse* result) override;
 
-  // Executes a computation with the provided global data passed as
-  // immutable arguments. The request contains the whole computation graph.
-  // Returns global data output and execution timing.
-  Status ExecuteGraph(const ExecuteGraphRequest* arg,
-                      ExecuteResponse* result) override;
+  // Compiles a computation into an executable. The request contains the whole
+  // computation graph. Returns the handle to the executable.
+  Status Compile(const CompileRequest* arg, CompileResponse* result) override;
+
+  // Executes an executable with the provided global data passes as immutable
+  // arguments. The request contains the handle to the executable. Returns
+  // global data output and execution timing.
+  Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override;
 
   // Executes one or more computations in parallel with the provided global data
   // passed as immutable arguments. Returns global data output for each
@@ -179,10 +183,6 @@ class Service : public ServiceInterface {
       absl::Span<const ShapedBuffer* const> arguments,
       const ExecutionOptions& execution_options);
 
-  // Picks a parallel response and fills the result.
-  Status PickParallelResponse(const ExecuteParallelResponse& parallel_result,
-                              ExecuteResponse* result);
-
   // Prepare the executors for executing parallel.
   StatusOr<std::vector<se::StreamExecutor*>> GetExecutors(
       const ExecutionOptions& execution_options, int64 requests_size,
@@ -191,7 +191,7 @@ class Service : public ServiceInterface {
   // Prepare the arguments for executing parallel.
   StatusOr<std::vector<std::vector<const ShapedBuffer*>>> GetArguments(
       const ExecutionOptions& execution_options,
-      absl::Span<const GlobalDataHandle* const> arguments);
+      absl::Span<const GlobalDataHandle* const> arguments) const;
 
  protected:
   friend class LocalExecutable;
@@ -208,7 +208,7 @@ class Service : public ServiceInterface {
   StatusOr<std::vector<std::vector<const ShapedBuffer*>>>
   ResolveAndValidateArguments(
       absl::Span<const GlobalDataHandle* const> arguments,
-      absl::Span<se::StreamExecutor* const> stream_executors);
+      absl::Span<se::StreamExecutor* const> stream_executors) const;
 
   // Create a Hlo module config for the given program shape and arguments.
   // execution_options is optional; if not given a default is used.
@@ -254,11 +254,6 @@ class Service : public ServiceInterface {
       Backend* backend, absl::Span<const DeviceHandle> device_handles,
       absl::Span<const string> result_tags, ExecutionProfile* profile);
 
-  // Executes a single computation which has more than one target device.
-  // The N devices are expected to all return an empty tuple, but one, which
-  // will be the result of this computation.
-  Status ExecuteOneToN(const ExecuteGraphRequest* arg, ExecuteResponse* result);
-
   // Convenience function which checks whether the given client_shape
   // (presumably passed by the client to set the result layout) is valid for the
   // given computation result shape.
@@ -281,6 +276,9 @@ class Service : public ServiceInterface {
 
   ServiceOptions options_;
 
+  // Cache containing previously built Executables.
+  CompilationCache compilation_cache_;
+
   // Tracks channels created via the API.
   ChannelTracker channel_tracker_;
 
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index e379911462f1d2caa53f708a6ebf8b7363dc2fc3..7e7282a737041458aed39b0054f901c23aa87d7a 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -207,7 +207,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
         padded_dilated_base, dilated_window, dim.stride());
   }
 
-  return ShapeUtil::MakeShape(element_type, output_dimensions);
+  return ShapeUtil::MakeValidatedShape(element_type, output_dimensions);
 }
 
 }  // namespace
@@ -391,17 +391,6 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   return ShapeUtil::MakeShape(element_type, new_dimensions);
 }
 
-/* static */ StatusOr<Shape> ShapeInference::InferAfterAllShape(
-    absl::Span<const Shape* const> arg_shapes) {
-  for (const Shape* arg_shape : arg_shapes) {
-    if (arg_shape->element_type() != TOKEN) {
-      return InvalidArgument(
-          "Operands of token instructions must be TOKEN types.");
-    }
-  }
-  return ShapeUtil::MakeTokenShape();
-}
-
 /* static */ StatusOr<Shape> ShapeInference::InferConvertShape(
     const Shape& operand_shape, PrimitiveType new_element_type) {
   auto old_element_type = operand_shape.element_type();
@@ -919,6 +908,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   switch (opcode) {
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
+      return InferElementwiseBinaryOpShape(opcode, lhs, rhs,
+                                           broadcast_dimensions);
+
     case HloOpcode::kSubtract:
     case HloOpcode::kAdd:
     case HloOpcode::kAtan2:
@@ -929,6 +921,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical:
+      if (lhs.element_type() == PRED || rhs.element_type() == PRED) {
+        return InvalidArgument(
+            "Expected element type in shape to be arithmetic type for "
+            "operation %s; got PRED.",
+            HloOpcodeString(opcode));
+      }
       return InferElementwiseBinaryOpShape(opcode, lhs, rhs,
                                            broadcast_dimensions);
 
@@ -1020,7 +1018,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   switch (opcode) {
     case HloOpcode::kTuple: {
       Shape result = ShapeUtil::MakeTupleShape({});
-      result.mutable_tuple_shapes()->Reserve(operand_shapes.size());
+      result.mutable_tuple_shapes()->reserve(operand_shapes.size());
       for (const Shape* shape : operand_shapes) {
         ShapeUtil::AppendShapeToTuple(*shape, &result);
       }
@@ -1029,17 +1027,22 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     case HloOpcode::kSort: {
       if (operand_shapes.size() == 1) {
         return *operand_shapes[0];
-      } else if (operand_shapes.size() == 2) {
-        if (!ShapeUtil::SameDimensions(*operand_shapes[0],
-                                       *operand_shapes[1])) {
-          return InvalidArgument(
-              "Sort keys and values dimensions must match. "
-              "Keys shape is: %s\n, Values shape is: %s",
-              ShapeUtil::HumanString(*operand_shapes[0]),
-              ShapeUtil::HumanString(*operand_shapes[1]));
+      } else {
+        for (int64 operand = 1; operand < operand_shapes.size(); ++operand) {
+          if (!ShapeUtil::SameDimensions(*operand_shapes[0],
+                                         *operand_shapes[operand])) {
+            return InvalidArgument(
+                "Sort keys and values dimensions must match. "
+                "Keys shape is: %s\n, Values shape (operand index %lld) is: %s",
+                ShapeUtil::HumanString(*operand_shapes[0]), operand,
+                ShapeUtil::HumanString(*operand_shapes[operand]));
+          }
         }
-        return ShapeUtil::MakeTupleShape(
-            {*operand_shapes[0], *operand_shapes[1]});
+        std::vector<Shape> operand_shape_values;
+        for (const Shape* operand_shape : operand_shapes) {
+          operand_shape_values.push_back(*operand_shape);
+        }
+        return ShapeUtil::MakeTupleShape(operand_shape_values);
       }
       return InvalidArgument("Unexpected number of operands for sort");
     }
@@ -1557,6 +1560,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   TF_RETURN_IF_ERROR(ExpectArray(lhs, "lhs of convolution"));
   TF_RETURN_IF_ERROR(ExpectArray(rhs, "rhs of convolution"));
 
+  if (feature_group_count <= 0) {
+    return InvalidArgument(
+        "feature_group_count must be a positive number, got %d",
+        feature_group_count);
+  }
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return InvalidArgument(
         "Convolution with different element types: %s and %s.",
@@ -1566,8 +1574,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
       dnums.kernel_spatial_dimensions_size()) {
     return InvalidArgument(
         "Both arguments to convolution must have same number of dimensions.\n"
-        "Window: %s",
-        window.DebugString());
+        "Numbers: %s",
+        dnums.DebugString());
+  }
+
+  if (dnums.input_spatial_dimensions_size() !=
+      dnums.output_spatial_dimensions_size()) {
+    return InvalidArgument(
+        "Both input and output of convolution must have same number of "
+        "dimensions.\nNumbers: %s",
+        dnums.DebugString());
   }
 
   const int num_spatial_dims = dnums.input_spatial_dimensions_size();
@@ -1586,8 +1602,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   }
   if (ShapeUtil::Rank(rhs) != num_dims) {
     return InvalidArgument(
-        "The RHS argument to a convolution should have rank %d; lhs: %s.",
-        num_dims, ShapeUtil::HumanString(lhs));
+        "The RHS argument to a convolution should have rank %d; rhs: %s.",
+        num_dims, ShapeUtil::HumanString(rhs));
   }
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
@@ -1662,14 +1678,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   const int64 kernel_output_features =
       rhs.dimensions(dnums.kernel_output_feature_dimension());
 
-  if (input_features != kernel_input_features * feature_group_count) {
+  if (input_features % feature_group_count != 0 ||
+      input_features / feature_group_count != kernel_input_features) {
     return InvalidArgument(
-        "Expected LHS feature dimension (value %d) to match RHS "
-        "input feature dimension * feature_group_count (value %d * %d = %d); "
+        "Expected LHS feature dimension (value %d) to be a multiple of "
+        "feature_group_count (value %d), and LHS feature dimension / "
+        "feature_group_count = RHS feature dimension (value %d); "
         "got <conv>(%s, %s)\n"
         "Dimension numbers: {%s}.",
-        input_features, kernel_input_features, feature_group_count,
-        kernel_input_features * feature_group_count,
+        input_features, feature_group_count, kernel_input_features,
         ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs),
         dnums.DebugString());
   }
@@ -2003,6 +2020,25 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   return operand_shape;
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferGetDimensionSizeShape(
+    const Shape& shape, int64 dimension) {
+  if (dimension < 0 || dimension >= ShapeUtil::Rank(shape)) {
+    return InvalidArgument("GetDimensionSize dimension out of bounds: %d.",
+                           dimension);
+  }
+
+  // TODO(b/119580730): Remove this restriction when very large dimension size
+  // is needed.
+  if (shape.dimensions(dimension) > std::numeric_limits<uint32>::max()) {
+    return InvalidArgument(
+        "GetDimensionSize's input shape is %s, the %dth dimension exceeds the "
+        "UINT_MAX limit.",
+        ShapeUtil::HumanString(shape), dimension);
+  }
+
+  return ShapeUtil::MakeShape(U32, {});
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferSliceShape(
     const Shape& arg, absl::Span<const int64> starts,
     absl::Span<const int64> limits, absl::Span<const int64> strides) {
@@ -2337,6 +2373,52 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   return ShapeUtil::MakeShape(operand.element_type(), dimensions);
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferBroadcastShape(
+    const Shape& operand_shape, const Shape& output_shape,
+    absl::Span<const int64> broadcast_dimensions) {
+  TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of broadcast"));
+  TF_RETURN_IF_ERROR(ExpectArray(output_shape, "operand of broadcast"));
+  const int64 operand_rank = ShapeUtil::Rank(operand_shape);
+  const int64 output_rank = ShapeUtil::Rank(output_shape);
+  if (operand_rank > output_rank) {
+    return InvalidArgument(
+        "InDim style broadcast must be to an equal or higher ranked shape; "
+        "operand rank: %lld; output rank: %lld",
+        operand_rank, output_rank);
+  }
+  if (operand_rank != broadcast_dimensions.size()) {
+    return InvalidArgument(
+        "Size of broadcast_dimensions has to match operand's rank; operand "
+        "rank: %lld, size of broadcast_dimensions %u.",
+        operand_rank, broadcast_dimensions.size());
+  }
+  for (int64 i = 0; i < operand_rank; i++) {
+    if (broadcast_dimensions[i] < 0 || broadcast_dimensions[i] >= output_rank) {
+      return InvalidArgument("Broadcast dimension %lld is out of bound",
+                             broadcast_dimensions[i]);
+    }
+    if (operand_shape.dimensions(i) !=
+            output_shape.dimensions(broadcast_dimensions[i]) &&
+        operand_shape.dimensions(i) != 1) {
+      return InvalidArgument(
+          "Input dimension should be either 1 or equal to the output dimension "
+          "it's broadcasting into; the %lldth operand dimension is %lld, the "
+          "%lldth output dimension is %lld.",
+          i, operand_shape.dimensions(i), broadcast_dimensions[i],
+          output_shape.dimensions(broadcast_dimensions[i]));
+    }
+    // Make sure the broadcast dimensions are listed in a strictly increasing
+    // order.
+    if (i > 0 && broadcast_dimensions[i - 1] >= broadcast_dimensions[i]) {
+      return InvalidArgument(
+          "Broadcast dimensions order is wrong: %d comes after %d.",
+          broadcast_dimensions[i], broadcast_dimensions.at(i - 1));
+    }
+  }
+
+  return output_shape;
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferReshapeShape(
     const Shape& operand, absl::Span<const int64> dimensions,
     absl::Span<const int64> new_sizes) {
@@ -2759,6 +2841,15 @@ Status ValidateScatterDimensionNumbers(
     }
   }
 
+  // Validate window size.
+  auto window_size = dim_numbers.update_window_dims_size() +
+                     dim_numbers.inserted_window_dims_size();
+  if (window_size != ShapeUtil::Rank(operand_shape)) {
+    return InvalidArgument(
+        "Scatter op has window of size %d; doesn't match operand of rank %d.",
+        window_size, ShapeUtil::Rank(operand_shape));
+  }
+
   // Validate scatter_dims_to_operand_dims in ScatterDimensionNumbers.
   if (dim_numbers.scatter_dims_to_operand_dims_size() !=
       scatter_indices_shape[dim_numbers.index_vector_dim()]) {
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 96a0ee165d46753da4fef119e7072f66637bf2c4..d94385a04d50baff8156570a09620fd458547936 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -210,6 +210,12 @@ class ShapeInference {
   static StatusOr<Shape> InferBroadcastShape(
       const Shape& operand, absl::Span<const int64> broadcast_sizes);
 
+  // Checks whether the given parameters can form a broadcast. Returns the same
+  // output_shape if it's legal.
+  static StatusOr<Shape> InferBroadcastShape(
+      const Shape& operand_shape, const Shape& output_shape,
+      absl::Span<const int64> broadcast_dimensions);
+
   // Infers the shape produced by a reshape operation from the element type of
   // its operand and the new dimension sizes specified.
   static StatusOr<Shape> InferReshapeShape(const Shape& operand,
@@ -226,13 +232,6 @@ class ShapeInference {
   static StatusOr<Shape> InferConcatOpShape(
       absl::Span<const Shape* const> arg_shapes, int64 dimension);
 
-  // Infers the shape produced by a kAfterAll. Trivially this shape is always a
-  // TOKEN shape. However, ShapeInference serves two purposes: inferring shapes
-  // and checking operand shapes. This method verifies that the operand shapes
-  // are all TOKENs.
-  static StatusOr<Shape> InferAfterAllShape(
-      absl::Span<const Shape* const> arg_shapes);
-
   // Helper that validates the given operand shape can be converted to the
   // target output_shape via a convert instruction -- the requirement is that
   // the shape is identical except for the element type.
@@ -285,6 +284,9 @@ class ShapeInference {
       const Shape& updates_shape, const ProgramShape& to_apply_shape,
       const ScatterDimensionNumbers& scatter_dim_numbers);
 
+  static StatusOr<Shape> InferGetDimensionSizeShape(const Shape& shape,
+                                                    int64 dimension);
+
  private:
   // Helper that infers the shape produced by performing an element-wise binary
   // operation with the given LHS and RHS shapes.
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 864ed43118cd066f6ce14cd808b873f137b8414a..4639e32db4d59080a9e85e46983fac61d9e76be9 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -1618,13 +1618,37 @@ TEST_F(ShapeInferenceTest, BadSort) {
   auto values = ShapeUtil::MakeShape(F32, {5});
   StatusOr<Shape> statusor =
       ShapeInference::InferVariadicOpShape(HloOpcode::kSort, {&keys, &values});
-  ASSERT_FALSE(statusor.ok());
+  EXPECT_FALSE(statusor.ok());
+  EXPECT_THAT(statusor.status().error_message(),
+              HasSubstr("dimensions must match"))
+      << statusor.status();
+}
 
+TEST_F(ShapeInferenceTest, BadSortValuesMismatch) {
+  auto keys = ShapeUtil::MakeShape(F32, {4});
+  auto values_good = ShapeUtil::MakeShape(F32, {4});
+  auto values_bad = ShapeUtil::MakeShape(F32, {5});
+  StatusOr<Shape> statusor = ShapeInference::InferVariadicOpShape(
+      HloOpcode::kSort, {&keys, &values_good, &values_bad});
+  EXPECT_FALSE(statusor.ok());
   EXPECT_THAT(statusor.status().error_message(),
               HasSubstr("dimensions must match"))
       << statusor.status();
 }
 
+TEST_F(ShapeInferenceTest, SortManyValues) {
+  auto keys = ShapeUtil::MakeShape(F32, {4});
+  auto values_s32 = ShapeUtil::MakeShape(S32, {4});
+  auto values_u32 = ShapeUtil::MakeShape(U32, {4});
+  StatusOr<Shape> statusor = ShapeInference::InferVariadicOpShape(
+      HloOpcode::kSort, {&keys, &values_s32, &values_u32});
+  EXPECT_IS_OK(statusor);
+  Shape inferred_shape = statusor.ValueOrDie();
+  EXPECT_TRUE(ShapeUtil::Compatible(
+      inferred_shape,
+      ShapeUtil::MakeTupleShape({keys, values_s32, values_u32})));
+}
+
 class ScatterGatherShapeInferenceTest : public ShapeInferenceTest {
  protected:
   const Shape s64_scalar_ = ShapeUtil::MakeShape(S64, {});
@@ -2649,5 +2673,23 @@ TEST_F(ScatterGatherShapeInferenceTest,
       << statusor.status();
 }
 
+TEST_F(ScatterGatherShapeInferenceTest,
+       InvalidScatterDimNumbers_InsufficientWindowDims) {
+  StatusOr<Shape> statusor = ShapeInference::InferScatterShape(
+      f32_5d_tensor_50_49_48_47_46_, s64_scalar_,
+      ShapeUtil::MakeShape(F32, {30, 29, 28, 27}), to_apply_,
+      HloScatterInstruction::MakeScatterDimNumbers(
+          /*update_window_dims=*/{0, 1, 2, 3},
+          /*inserted_window_dims=*/{},
+          /*scatter_dims_to_operand_dims=*/{0},
+          /*index_vector_dim=*/0));
+  ASSERT_FALSE(statusor.ok());
+  EXPECT_THAT(
+      statusor.status().error_message(),
+      HasSubstr(
+          "Scatter op has window of size 4; doesn't match operand of rank 5."))
+      << statusor.status();
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 56952e3adae59656605a12fd499162504a2a3379..28a30b5ee2dbcb5012804578d4d037c241045309 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -157,4 +157,23 @@ void ScopedShapedBuffer::Deallocate() {
   }
 }
 
+ScopedShapedBuffer ScopedShapedBuffer::TakeSubTree(ShapeIndexView index) {
+  const xla::Shape& sub_on_host_shape =
+      xla::ShapeUtil::GetSubshape(on_host_shape(), {index});
+  const xla::Shape& sub_on_device_shape =
+      xla::ShapeUtil::GetSubshape(on_device_shape(), {index});
+
+  ScopedShapedBuffer output(sub_on_host_shape, sub_on_device_shape,
+                            memory_allocator(), device_ordinal());
+  auto src_it = buffers().find(index);
+  auto dst_it = output.buffers().begin();
+  while (dst_it != output.buffers().end()) {
+    dst_it->second = src_it->second;
+    src_it->second = tensorflow::se::DeviceMemoryBase(nullptr, 0);
+    ++src_it;
+    ++dst_it;
+  }
+  return output;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index e1d26da4a20c0105be304b1a34c81515fcdc6b7f..f5210c9cfa6b29853bcd0f5bfd581ee3e116a509 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -176,6 +176,11 @@ class ScopedShapedBuffer : public ShapedBuffer {
   // It's the caller's job to ensure that the memory contained therein is freed.
   TF_MUST_USE_RESULT ShapedBuffer release();
 
+  // Extracts the sub-tree rooted at 'index' and returns a ScopedShapedBuffer
+  // that holds ownership of the subtree. Sets the buffers corresponding to the
+  // subtree to null in 'this'.
+  ScopedShapedBuffer TakeSubTree(ShapeIndexView index);
+
  protected:
   void Deallocate();
 
diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
index d69e6362e91e4696dab3c46d99a981c67b593a1c..ca64bd3c8dd2baa686db2b85c937a034b37ab22b 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace xla {
@@ -107,5 +109,79 @@ TEST(ScopedShapedBufferTest, TestMoveAssignmentOperator) {
   // TestAllocator's destructor checks that all memory was freed.
 }
 
+TEST(ScopedShapedBufferTest, TestTakeSubTree) {
+  TestAllocator allocator;
+
+  Shape s = ShapeUtil::MakeShape(F32, {1});
+  s = xla::ShapeUtil::MakeTupleShape(std::vector<xla::Shape>(2, s));
+  s = xla::ShapeUtil::MakeTupleShape(std::vector<xla::Shape>(3, s));
+
+  ScopedShapedBuffer sb(s, s, &allocator, /*device_ordinal=*/0);
+  sb.buffers().ForEachMutableElement(
+      [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
+        TF_ASSERT_OK_AND_ASSIGN(
+            OwningDeviceMemory m,
+            allocator.Allocate(/*device_ordinal=*/0, /*size=*/77));
+        *buffer = m.Forget();
+      });
+  ShapeTree<se::DeviceMemoryBase> buffers = sb.buffers();
+
+  // Takes a subtree out of 'sb', and verifies the buffers are as expected.
+  xla::ShapeIndex subtree_index = {1};
+  ScopedShapedBuffer output = sb.TakeSubTree(subtree_index);
+
+  output.buffers().ForEachElement([&](const xla::ShapeIndex& sub_index,
+                                      const se::DeviceMemoryBase& buffer) {
+    xla::ShapeIndex orig_index = subtree_index;
+    for (int i : sub_index) {
+      orig_index.push_back(i);
+    }
+    EXPECT_TRUE(buffers.find(orig_index)->second.IsSameAs(buffer));
+  });
+  sb.buffers().ForEachElement(
+      [&](const xla::ShapeIndex& index, const se::DeviceMemoryBase& buffer) {
+        if (ShapeIndexView(index).StartsWith(subtree_index)) {
+          EXPECT_TRUE(buffer.is_null());
+        } else {
+          EXPECT_TRUE(buffers.find(index)->second.IsSameAs(buffer));
+        }
+      });
+}
+
+// Test TakeSubTree with different depths (depth of ShapeTree) and fan-outs
+// (cardinality of each non-leaf node's children).
+void BM_TakeSubTree(int iters, int depth, int fan_out) {
+  tensorflow::testing::StopTiming();
+  TestAllocator allocator;
+  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128});
+  for (int i = 0; i < depth; ++i) {
+    std::vector<xla::Shape> shapes(fan_out, shape);
+    shape = xla::ShapeUtil::MakeTupleShape(shapes);
+  }
+  xla::ScopedShapedBuffer shaped_buffer(shape, shape, /*allocator=*/&allocator,
+                                        /*device_ordinal=*/0);
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    // Extract a buffer from approximately the middle of the first level of the
+    // tree.
+    (void)shaped_buffer.TakeSubTree(/*index=*/{fan_out / 2}).release();
+  }
+  tensorflow::testing::StopTiming();
+}
+
+BENCHMARK(BM_TakeSubTree)
+    ->ArgPair(1, 4)
+    ->ArgPair(1, 8)
+    ->ArgPair(1, 32)
+    ->ArgPair(1, 64)
+    ->ArgPair(1, 128)
+    ->ArgPair(1, 256)
+    ->ArgPair(1, 512)
+    ->ArgPair(2, 4)
+    ->ArgPair(2, 8)
+    ->ArgPair(2, 32)
+    ->ArgPair(2, 64)
+    ->ArgPair(2, 128);
+
 }  // anonymous namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index f952e64af2b675b9c0f8a30e9a2bc3c855e34efa..49f0b8f8b72001f07200d3e94828f60fcb0fa8fb 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -95,7 +95,13 @@ class TransferManager {
   // but need not have the same layout.
   //
   // This operation is performed asynchronously on the given stream. It returns
-  // once the transfer is enqueued.
+  // once the transfer is enqueued, and may return before the transfer has
+  // completed.
+  //
+  // The caller may free the data structures 'literal' and 'device_buffer'
+  // immediately after this function returns, however their constituent buffers
+  // on both host and device must remain valid until the enqueued transfer has
+  // completed on 'stream'.
   virtual Status TransferLiteralToDeviceAsync(
       se::Stream* stream, const LiteralSlice& literal,
       const ShapedBuffer& device_buffer) = 0;
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 79b5c09abb355cd067a4891af558c8c44d80d88e..17cdaa74fc328d156292f5af828d4222a9a01f1f 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -172,7 +172,7 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
   HloInstruction* mul = builder.AddInstruction(HloInstruction::CreateBinary(
       add->shape(), HloOpcode::kMultiply, add, sub));
 
-  auto module = CreateNewModule("fuse_with_constant_operands");
+  auto module = CreateNewVerifiedModule("fuse_with_constant_operands");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(mul));
   HloInstruction* call = module->OutlineExpressionFromComputation(
@@ -247,7 +247,7 @@ TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) {
       conv_shape.ValueOrDie(), x, transpose_y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
@@ -302,7 +302,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) {
       conv_shape.ValueOrDie(), x, transpose_y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
@@ -362,7 +362,7 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) {
       conv_shape.ValueOrDie(), transpose_x, y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
@@ -428,7 +428,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) {
       conv_shape.ValueOrDie(), transpose_x, y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 811ac55e2dc2939293e62f1ebcd2bce266a12133..50d51eaeb762e208004c1dae3dcc27503f3f94e9 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -148,7 +148,7 @@ TuplePointsToAnalysis::Run(const HloModule* module) {
 
 Status TuplePointsToAnalysis::Analyze() {
   per_instruction_.clear();
-  per_instruction_.resize(module_->NumUniqueInstructionIds());
+  per_instruction_.reserve(module_->instruction_count());
 
   logical_buffer_aliases_.clear();
   logical_buffer_aliases_.resize(
@@ -280,6 +280,13 @@ Status TuplePointsToAnalysis::HandleDomain(HloInstruction* domain) {
   return Status::OK();
 }
 
+Status TuplePointsToAnalysis::HandleAddDependency(
+    HloInstruction* add_dependency) {
+  // AddDependency just forwards the value of its zero-th operand.
+  CreateCopiedPointsToSet(add_dependency, add_dependency->operand(0));
+  return Status::OK();
+}
+
 Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) {
   // RecvDone aliases its input (Recv) tuple element {0} to element {0} of its
   // output. The other indices ({} and {1}) define their own buffers.
@@ -756,6 +763,7 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
     }
   }
   if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
+      user->opcode() == HloOpcode::kScatter ||
       user->opcode() == HloOpcode::kWhile) {
     // We eliminated other users in BufferLiveness::live_range_strictly_before,
     // so here we just need to check that the use is at operand index 0.
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index 30c365053c5dac5af3c559f7c92b11d389d7fff8..0a1d5649d6d69fea12263e6986ce76af62615ec7 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
@@ -251,6 +252,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   Status HandleRecvDone(HloInstruction* recv_done) override;
   Status HandleSend(HloInstruction* send) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   string ToString() const;
 
@@ -315,14 +317,23 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   const PerInstruction* PerInst(const HloInstruction* inst) const {
     int id = inst->unique_id();
     DCHECK_GE(id, 0);
-    DCHECK_LT(id, per_instruction_.size());
-    return &per_instruction_[id];
+    auto iter = per_instruction_.find(id);
+    if (iter == per_instruction_.end()) {
+      LOG(FATAL) << "Expected per-instruction information to already exist";
+    } else {
+      return iter->second.get();
+    }
   }
   PerInstruction* PerInst(const HloInstruction* inst) {
     int id = inst->unique_id();
     DCHECK_GE(id, 0);
-    DCHECK_LT(id, per_instruction_.size());
-    return &per_instruction_[id];
+    auto iter = per_instruction_.find(id);
+    if (iter == per_instruction_.end()) {
+      return per_instruction_.emplace(id, absl::make_unique<PerInstruction>())
+          .first->second.get();
+    } else {
+      return iter->second.get();
+    }
   }
 
   std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex(
@@ -339,7 +350,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   const std::unique_ptr<LogicalBufferAnalysis> logical_buffer_analysis_;
 
   // A map from instruction->unique_id() to
-  std::vector<PerInstruction> per_instruction_;
+  absl::flat_hash_map<int, std::unique_ptr<PerInstruction>> per_instruction_;
 
   // A map from LogicalBuffer->id() to alias information about that logical
   // buffer
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index e9a07b14ed685fa4388aca583395370a60176cca..561762b5d424ed5f537665be9d67a81dc8bdd56e 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -48,7 +48,7 @@ class TuplePointsToAnalysisTest : public HloTestBase {
   }
 
   void BuildModule(std::unique_ptr<HloComputation> computation) {
-    module_ = CreateNewModule();
+    module_ = CreateNewUnverifiedModule();
     module_->AddEntryComputation(std::move(computation));
   }
 
@@ -264,6 +264,22 @@ TEST_F(TuplePointsToAnalysisTest, GetTupleElement) {
               UnorderedElementsAre(inner_tuple));
 }
 
+TEST_F(TuplePointsToAnalysisTest, AddDependency) {
+  auto builder = HloComputation::Builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto token = builder.AddInstruction(HloInstruction::CreateToken());
+  auto add_dependency = builder.AddInstruction(
+      HloInstruction::CreateAddDependency(constant, token));
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  auto& points_to_set = points_to_analysis_->GetPointsToSet(add_dependency);
+  EXPECT_EQ(1, points_to_set.size());
+  EXPECT_FALSE(points_to_set.IsAmbiguous());
+  EXPECT_TRUE(points_to_set.IsDistinct());
+  ExpectHasTopLevelBuffers(points_to_set.CreateFlattenedSet(), {constant});
+}
+
 TEST_F(TuplePointsToAnalysisTest, DuplicatedElement) {
   // Create a tuple which contains duplicate elements.
   auto builder = HloComputation::Builder(TestName());
@@ -809,7 +825,7 @@ TEST_F(FusionPointsToAnalysisTest, FusionParam0TwoUsers) {
 class PointsToAnalysisTestBase : public HloTestBase {
  protected:
   void BuildModule(std::unique_ptr<HloComputation> computation) {
-    module_ = CreateNewModule();
+    module_ = CreateNewUnverifiedModule();
     computation_ = module_->AddEntryComputation(std::move(computation));
   }
 
@@ -1010,6 +1026,44 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
       points_to_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {}));
 }
 
+TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
+  const char* hlo_text = R"(
+    HloModule TensorFlowScatterV1
+
+    update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+      lhs = s32[] parameter(0)
+      ROOT rhs = s32[] parameter(1)
+    }
+
+    ENTRY main {
+      operand = s32[3,3] parameter(0)
+      indices = s32[2] parameter(1)
+      updates = s32[2,3] parameter(2)
+      ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+          to_apply=update_s32,
+          update_window_dims={1},
+          inserted_window_dims={0},
+          scatter_dims_to_operand_dims={0},
+          index_vector_dim=1
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(module_, ParseHloString(hlo_text));
+  computation_ = module_->entry_computation();
+  RunAnalysis();
+
+  HloInstruction* operand_param = computation_->parameter_instruction(0);
+  HloInstruction* indices_param = computation_->parameter_instruction(1);
+  HloInstruction* updates_param = computation_->parameter_instruction(2);
+  HloInstruction* scatter = computation_->root_instruction();
+
+  EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(
+      operand_param, {}, scatter, {}));
+  EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(
+      indices_param, {}, scatter, {}));
+  EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(
+      updates_param, {}, scatter, {}));
+}
+
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -1035,7 +1089,8 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
   auto values = builder.AddInstruction(
       HloInstruction::CreateParameter(1, values_shape, "values"));
   auto sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys, values));
+      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys,
+      {values}));
 
   BuildModuleAndRunAnalysis(builder.Build());
 
@@ -1137,7 +1192,7 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
     return builder.Build();
   };
 
-  module_ = CreateNewModule();
+  module_ = CreateNewUnverifiedModule();
   HloComputation* cond_computation =
       module_->AddEmbeddedComputation(make_cond());
   HloComputation* body_computation =
@@ -1172,7 +1227,7 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
   auto add = sub_builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones));
 
-  module_ = CreateNewModule();
+  module_ = CreateNewUnverifiedModule();
   auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build());
   sub_computation->CreateFusionInstruction({add, ones},
                                            HloInstruction::FusionKind::kLoop);
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
index 516754e2110ee50a597818c4a8bcfbfbb76c5cec..65b0f8c804475d8f22fff9798e79c9881a51f1f1 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
@@ -34,7 +34,7 @@ namespace op = xla::testing::opcode_matchers;
 namespace xla {
 namespace {
 
-class TupleSimplifierTest : public HloVerifiedTestBase {
+class TupleSimplifierTest : public HloTestBase {
  protected:
   void Run(HloModule* module, bool change_expected) {
     TupleSimplifier simplifier;
@@ -65,10 +65,10 @@ TEST_F(TupleSimplifierTest, TupleOfParameters) {
   HloInstruction* param2 = builder.AddInstruction(
       HloInstruction::CreateParameter(2, scalar_shape_, "param2"));
   builder.AddInstruction(HloInstruction::CreateTuple({param0, param1, param2}));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  Run(module, /*change_expected=*/false);
+  Run(module.get(), /*change_expected=*/false);
 }
 
 TEST_F(TupleSimplifierTest, GteOfTupleOfParameter) {
@@ -78,10 +78,10 @@ TEST_F(TupleSimplifierTest, GteOfTupleOfParameter) {
       HloInstruction::CreateParameter(0, tuple_shape_, "param"));
   builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_shape_, param, 1));
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  Run(module, /*change_expected=*/false);
+  Run(module.get(), /*change_expected=*/false);
 }
 
 TEST_F(TupleSimplifierTest, GteOfTuple) {
@@ -98,12 +98,12 @@ TEST_F(TupleSimplifierTest, GteOfTuple) {
   HloInstruction* gte = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_shape_, tuple, 1));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), gte);
 
-  Run(module, /*change_expected=*/true);
+  Run(module.get(), /*change_expected=*/true);
 
   EXPECT_THAT(computation->root_instruction(), param1);
 }
@@ -125,13 +125,13 @@ TEST_F(TupleSimplifierTest, GteOfTupleChain) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape_, HloOpcode::kNegate, element));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Negate(op::GetTupleElement(op::Tuple())));
 
-  Run(module, /*change_expected=*/true);
+  Run(module.get(), /*change_expected=*/true);
 
   EXPECT_THAT(computation->root_instruction(), op::Negate(op::Parameter()));
 }
@@ -157,12 +157,12 @@ TEST_F(TupleSimplifierTest, NestedGteOfTuples) {
         ShapeUtil::GetTupleElementShape(element->shape(), 0), element, 0));
   }
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), element);
 
-  Run(module, /*change_expected=*/true);
+  Run(module.get(), /*change_expected=*/true);
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -182,12 +182,12 @@ TEST_F(TupleSimplifierTest, TupleOfGteInstructions) {
   HloInstruction* tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), tuple);
 
-  Run(module, /*change_expected=*/true);
+  Run(module.get(), /*change_expected=*/true);
 
   EXPECT_THAT(computation->root_instruction(), tuple_param);
 }
@@ -207,19 +207,19 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) {
   HloInstruction* tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
 
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(), tuple);
 
-  Run(module, /*change_expected=*/false);
+  Run(module.get(), /*change_expected=*/false);
 
   EXPECT_THAT(computation->root_instruction(), tuple);
 }
 
 TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) {
   //  Verify that the root computation can be excluded
-  auto module = CreateNewModule();
+  auto module = CreateNewVerifiedModule();
 
   HloInstruction* p0;
   HloInstruction* p1;
@@ -281,7 +281,7 @@ TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) {
     entry = module->AddEntryComputation(builder.Build());
   }
 
-  Run(module, /*change_expected=*/true, /*exclude_entry=*/true);
+  Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/true);
 
   EXPECT_THAT(c0->root_instruction(), p0);
   EXPECT_THAT(c1->root_instruction(), p1);
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc
index 541b117e0299c94de330604ec5c16e20f07c425f..68e2569f66bea9ec1223e454d1ead0efc7b9498e 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.cc
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 
 namespace xla {
 
@@ -229,4 +232,96 @@ optional<int64> ComputeWhileLoopTripCount(HloInstruction* while_op,
   return nullopt;
 }
 
+// If the only user of this instruction is a get-tuple-element, return that
+// get-tuple-element, otherwise return null. If this runs before CSE/DCE, we may
+// get a false negative if there are several copies of the same GTE, or there
+// are unused GTEs, but we can live with this.
+static HloInstruction* GetOnlyGTE(HloInstruction* inst) {
+  if (inst->user_count() != 1) {
+    return nullptr;
+  }
+
+  HloInstruction* user = inst->users().back();
+  if (user->opcode() != HloOpcode::kGetTupleElement) {
+    return nullptr;
+  }
+  return user;
+}
+
+optional<int64> ComputeWhileLoopTripCountUpperBound(HloInstruction* while_op) {
+  // If we know the exact trip count, it's also the upper bound.
+  auto exact_trip_count = ComputeWhileLoopTripCount(while_op);
+  if (exact_trip_count) {
+    VLOG(2) << "Loop has exact trip count.";
+    return exact_trip_count;
+  }
+
+  // There is one more case we know how to handle. If the loop condition only
+  // looks at one element of the tuple, and the loop body sets this element to a
+  // constant, there are two options:
+  // 1) Evaluating the condition on this constant returns true. In this case,
+  // the loop either executes 0 times, or is an infinite loop, depending on the
+  // init value.
+  // 2) Evaluating the condition on this constant returns false. In this case,
+  // the loop executes 0 or 1 times, depending on the init value. This means
+  // that, regardless of the init value, the upper bound on the trip count is 1.
+
+  // Check whether the condition depends on a single parameter, and find out
+  // which.
+  auto* while_cond = while_op->while_condition();
+  auto* while_cond_param = while_cond->parameter_instruction(0);
+  auto* cond_gte = GetOnlyGTE(while_cond_param);
+  if (!cond_gte) {
+    VLOG(2) << "Induction variable not found in loop condition: "
+            << while_cond->root_instruction()->ToString();
+    return nullopt;
+  }
+
+  // Now check whether this gets set to a constant by the while body.
+  auto* while_body = while_op->while_body();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_body_root->opcode() != HloOpcode::kTuple) {
+    VLOG(3) << "While body's root is not a tuple instruction: "
+            << while_body_root->ToString();
+    return nullopt;
+  }
+
+  int64 indvar_index = cond_gte->tuple_index();
+  auto* while_body_indvar = while_body_root->operand(indvar_index);
+  if (while_body_indvar->opcode() != HloOpcode::kConstant) {
+    VLOG(3) << "While body does not set the IV to a constant: "
+            << while_body_indvar->ToString();
+    return nullopt;
+  }
+
+  // We have a constant. Evaluate the condition on this constant.
+  HloEvaluator evaluator(/*max_loop_iterations=*/0);
+  Literal fake_input = Literal::CreateFromShape(while_cond_param->shape());
+  TF_CHECK_OK(fake_input.CopyFrom(while_body_indvar->literal(),
+                                  /*dest_shape_index=*/{indvar_index},
+                                  /*src_shape_index=*/{}));
+  StatusOr<Literal> eval_result =
+      evaluator.Evaluate<Literal>(*while_cond, {std::move(fake_input)});
+
+  if (!eval_result.ok()) {
+    VLOG(2) << "Couldn't evaluate while loop condition.";
+    return nullopt;
+  }
+
+  Literal cond_result_pred = std::move(eval_result.ValueOrDie());
+  CHECK(ShapeUtil::Equal(cond_result_pred.shape(),
+                         ShapeUtil::MakeShape(PRED, {})));
+
+  // Per the explanation above, if the evaluated condition returns false, the
+  // loop executes at most once.
+  bool cond_returns_true = cond_result_pred.GetFirstElement<bool>();
+  if (!cond_returns_true) {
+    VLOG(2) << "Upper bound on the trip count is 1";
+    return 1;
+  }
+
+  VLOG(2) << "Loop has no known upper bound on the trip count.";
+  return nullopt;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.h b/tensorflow/compiler/xla/service/while_loop_analysis.h
index bf497f4892b95c927379411468a66d8961465413..ac69a727bd6b403672a676400993fb7d8afc0a55 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.h
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.h
@@ -28,6 +28,10 @@ namespace xla {
 absl::optional<int64> ComputeWhileLoopTripCount(HloInstruction *while_op,
                                                 int64 max_value_returned = 128);
 
+// Returns an upper bound on the trip count of the loop if it's statically
+// known, nullopt otherwise.
+absl::optional<int64> ComputeWhileLoopTripCountUpperBound(
+    HloInstruction *while_op);
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_ANALYSIS_H_
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis_test.cc b/tensorflow/compiler/xla/service/while_loop_analysis_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1da0fbeac89a93eaaef893e5f25dd3b87cc1d5d5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_analysis_test.cc
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_loop_analysis.h"
+
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class WhileLoopAnalysisTest : public HloTestBase {};
+
+TEST_F(WhileLoopAnalysisTest, SingleIterationUpperBound) {
+  const char* const kHloModule = R"(
+    HloModule ModuleWithWhile
+
+    body {
+      p_body = (f32[2], s32[]) parameter(0)
+      val = f32[2] get-tuple-element(p_body), index=0
+      const = s32[] constant(-1)
+      ROOT root = (f32[2], s32[]) tuple(val, const)
+    }
+
+    condition {
+      p_cond = (f32[2], s32[]) parameter(0)
+      gte = s32[] get-tuple-element(p_cond), index=1
+      const = s32[] constant(42)
+      ROOT result = pred[] equal-to(gte, const)
+    }
+
+    ENTRY entry {
+      param.0 = f32[2] parameter(0)
+      param.1 = s32[] parameter(1)
+      while_init = (f32[2], s32[]) tuple(param.0, param.1)
+      ROOT while = (f32[2], s32[]) while(while_init), condition=condition, body=body
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHloModule));
+
+  HloInstruction* while_op = module->entry_computation()->root_instruction();
+  EXPECT_EQ(*ComputeWhileLoopTripCountUpperBound(while_op), 1);
+}
+
+TEST_F(WhileLoopAnalysisTest, NoUpperBound) {
+  const char* const kHloModule = R"(
+    HloModule ModuleWithWhile
+
+    body {
+      p_body = (f32[2], s32[]) parameter(0)
+      val = f32[2] get-tuple-element(p_body), index=0
+      const = s32[] constant(42)
+      ROOT root = (f32[2], s32[]) tuple(val, const)
+    }
+
+    condition {
+      p_cond = (f32[2], s32[]) parameter(0)
+      gte = s32[] get-tuple-element(p_cond), index=1
+      const = s32[] constant(42)
+      ROOT result = pred[] equal-to(gte, const)
+    }
+
+    ENTRY entry {
+      param.0 = f32[2] parameter(0)
+      param.1 = s32[] parameter(1)
+      while_init = (f32[2], s32[]) tuple(param.0, param.1)
+      ROOT while = (f32[2], s32[]) while(while_init), condition=condition, body=body
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHloModule));
+
+  HloInstruction* while_op = module->entry_computation()->root_instruction();
+  EXPECT_EQ(ComputeWhileLoopTripCountUpperBound(while_op), absl::nullopt);
+}
+
+TEST_F(WhileLoopAnalysisTest, ExactBound) {
+  const char* const kHloModule = R"(
+    HloModule ModuleWithWhile
+
+    body {
+      p_body = (f32[2], s32[]) parameter(0)
+      val = f32[2] get-tuple-element(p_body), index=0
+      index = s32[] get-tuple-element(p_body), index=1
+      one = s32[] constant(1)
+      inc = s32[] add(index, one)
+      ROOT root = (f32[2], s32[]) tuple(val, inc)
+    }
+
+    condition {
+      p_cond = (f32[2], s32[]) parameter(0)
+      gte = s32[] get-tuple-element(p_cond), index=1
+      const = s32[] constant(42)
+      ROOT result = pred[] less-than(gte, const)
+    }
+
+    ENTRY entry {
+      param.0 = f32[2] parameter(0)
+      param.1 = s32[] constant(0)
+      while_init = (f32[2], s32[]) tuple(param.0, param.1)
+      ROOT while = (f32[2], s32[]) while(while_init), condition=condition, body=body
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHloModule));
+
+  HloInstruction* while_op = module->entry_computation()->root_instruction();
+  EXPECT_EQ(*ComputeWhileLoopTripCountUpperBound(while_op), 42);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
index 067cfcc17d65860a249de4d9e31703df12091d3a..8b381dec07397c1427e98bc30511ac21dc577610 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
@@ -46,8 +46,9 @@ static Status ReplaceUsesWhileKeepingLoopInvariance(
   return Status::OK();
 }
 
-StatusOr<bool> WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody(
+StatusOr<bool> WhileLoopConstantSinking::TrySinkingConstantsIntoWhileLoop(
     HloInstruction* while_instr) {
+  HloComputation* while_cond = while_instr->while_condition();
   HloComputation* while_body = while_instr->while_body();
 
   const HloInstruction& init_value = *while_instr->operand(0);
@@ -57,24 +58,48 @@ StatusOr<bool> WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody(
 
   bool changed = false;
 
-  for (HloInstruction* invariant_gte :
-       WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) {
-    int64 index = invariant_gte->tuple_index();
+  absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>
+      conditional_gte_index_to_insts =
+          WhileUtil::GetGTEsMapForWhileConditional(*while_cond);
+  std::vector<HloInstruction*> invariant_body_gtes =
+      WhileUtil::GetInvariantGTEsForWhileBody(*while_body);
+
+  for (HloInstruction* invariant_body_gte : invariant_body_gtes) {
+    int64 index = invariant_body_gte->tuple_index();
     const HloInstruction& invariant_value = *init_value.operand(index);
 
-    // Should have at least one user that's not while_body_root.
-    if (invariant_gte->user_count() <= 1) {
+    // Original value should be a constant.
+    if (invariant_value.opcode() != HloOpcode::kConstant) {
       continue;
     }
 
-    if (invariant_value.opcode() == HloOpcode::kConstant) {
-      auto* constant_instr =
+    // Sink into the while_body.
+    // Should have at least one user that's not while_body_root.
+    if (invariant_body_gte->user_count() > 1) {
+      HloInstruction* constant_instr =
           while_body->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk"));
       TF_RETURN_IF_ERROR(ReplaceUsesWhileKeepingLoopInvariance(
-          invariant_gte, constant_instr, while_body->root_instruction(),
+          invariant_body_gte, constant_instr, while_body->root_instruction(),
           index));
       changed = true;
     }
+
+    // Check if there is a corresponding GTE in while_conditional.
+    auto it = conditional_gte_index_to_insts.find(index);
+    if (it == conditional_gte_index_to_insts.end()) {
+      continue;
+    }
+
+    for (HloInstruction* invariant_cond_gte : it->second) {
+      // Should have at least one user.
+      if (invariant_cond_gte->user_count() > 0) {
+        HloInstruction* constant_instr = while_cond->AddInstruction(
+            invariant_value.Clone(/*suffix=*/".sunk"));
+        TF_RETURN_IF_ERROR(
+            invariant_cond_gte->ReplaceAllUsesWith(constant_instr));
+        changed = true;
+      }
+    }
   }
 
   return changed;
@@ -115,10 +140,8 @@ StatusOr<bool> WhileLoopConstantSinking::Run(HloModule* module) {
   }
 
   for (HloInstruction* while_instr : while_instrs) {
-    // We only sink into while loop bodies, but this can be extended to
-    // transform conditions as well.
     TF_ASSIGN_OR_RETURN(bool result,
-                        TrySinkingConstantsIntoWhileBody(while_instr));
+                        TrySinkingConstantsIntoWhileLoop(while_instr));
     changed |= result;
   }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h
index 577bad6c7062d2ee40271e407e8eed7655fa13bf..a866bc1264b4013bb7530b5e02b546e6f78d676b 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h
@@ -23,8 +23,8 @@ limitations under the License.
 namespace xla {
 
 // Sinks while loop invariant values that happen to be constants into the while
-// loop body.  This is probably not a win in isolation but may unlock further
-// optimizations like constant folding.
+// loop body and conditional. This is probably not a win in isolation but may
+// unlock further optimizations like constant folding.
 //
 //   state = (..., const, ...)
 //   while (pred(state)) {
@@ -46,22 +46,19 @@ namespace xla {
 // tuple trivially loop invariant.  WhileLoopSimplifier will later get rid of
 // `v`.
 //
-// We only sink into while loop bodies, but this can be extended to transform
-// conditions as well.
-//
 // TODO(b/79121449):  We should also sink broadcasts of constants.
 class WhileLoopConstantSinking : public HloModulePass {
  public:
   ~WhileLoopConstantSinking() override = default;
 
   absl::string_view name() const override {
-    return "while-loop-invariant-code-motion";
+    return "while-loop-constant-sinking";
   }
 
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  StatusOr<bool> TrySinkingConstantsIntoWhileBody(HloInstruction* while_instr);
+  StatusOr<bool> TrySinkingConstantsIntoWhileLoop(HloInstruction* while_instr);
 };
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
index 0e7667de832c54f647d071e3c9563091d0f994aa..75d406435b6f58faecc86b82c33e9e2dd6bccbea 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
@@ -114,7 +114,7 @@ HloModule ModuleWithWhile
 
 body {
   p_b = (f32[2],(f32[2],f32[2])) parameter(0)
-  p_b.0 = f32[2] get-tuple-element((f32[2],f32[2],f32[2]) p_b), index=0
+  p_b.0 = f32[2] get-tuple-element((f32[2],(f32[2],f32[2])) p_b), index=0
   p_b.1 = (f32[2],f32[2]) get-tuple-element((f32[2],(f32[2],f32[2])) p_b), index=1
 
   p_b.1.1 = f32[2] get-tuple-element(p_b.1), index=0
@@ -242,5 +242,178 @@ ENTRY entry {
     }
   }
 }
+
+TEST_F(WhileLoopConstantSinkingTest, ConditionalSinkConstant) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  p_body = (f32[],f32[]) parameter(0)
+  p_body.0 = f32[] get-tuple-element((f32[],f32[]) p_body), index=0
+  const = f32[] constant(1)
+  add = f32[] add(p_body.0, const)
+  p_body.1 = f32[] get-tuple-element((f32[],f32[]) p_body), index=1
+  ROOT root = (f32[],f32[]) tuple(add, p_body.1)
+}
+
+condition {
+  p_cond = (f32[],f32[]) parameter(0)
+  p_cond.0 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=0
+  p_cond.1 = f32[] get-tuple-element((f32[],f32[]) p_cond), index=1
+  ROOT result = pred[] less-than(p_cond.0, p_cond.1)
+}
+
+ENTRY entry {
+  const_0 = f32[] constant(0)
+  const_1 = f32[] constant(10)
+  while_init = (f32[],f32[]) tuple(const_0, const_1)
+  ROOT while = (f32[],f32[]) while(while_init), condition=condition, body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          WhileLoopConstantSinking{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  auto* while_condition = module->GetComputationWithName("condition");
+  EXPECT_THAT(while_condition->root_instruction(), op::Lt(_, op::Constant()));
+}
+
+TEST_F(WhileLoopConstantSinkingTest, ConditionalTupleShapedConstants) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  p_b = (f32[],(f32[],f32[])) parameter(0)
+  p_b.0 = f32[] get-tuple-element((f32[],(f32[],f32[])) p_b), index=0
+  p_b.1 = (f32[],f32[]) get-tuple-element((f32[],(f32[],f32[])) p_b), index=1
+  p_b.1.0 = f32[] get-tuple-element((f32[],f32[]) p_b.1), index=0
+  add = f32[] add(p_b.0, p_b.1.0)
+  ROOT root = (f32[],(f32[],f32[])) tuple(add, p_b.1)
+}
+
+condition {
+  p_c = (f32[],(f32[],f32[])) parameter(0)
+  p_c.0 = f32[] get-tuple-element((f32[],(f32[],f32[])) p_c), index=0
+  p_c.1 = (f32[],f32[]) get-tuple-element((f32[],(f32[],f32[])) p_c), index=1
+  p_c.1.1 = f32[] get-tuple-element((f32[],f32[]) p_c.1), index=1
+  ROOT result = pred[] less-than(p_c.0, p_c.1.1)
+}
+
+ENTRY entry {
+  const_0 = f32[] constant(0)
+  const_1 = (f32[], f32[]) constant((f32[], f32[]) (1, 10))
+  while_init = (f32[],(f32[],f32[])) tuple(const_0, const_1)
+  ROOT while = (f32[],(f32[],f32[])) while(while_init), condition=condition, body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          WhileLoopConstantSinking{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  auto* while_condition = module->GetComputationWithName("condition");
+  EXPECT_THAT(while_condition->root_instruction(),
+              op::Lt(_, op::GetTupleElement(op::Constant())));
+}
+
+TEST_F(WhileLoopConstantSinkingTest, ConditionalDontCreateDeadConstant) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  p_body = (f32[],f32[],f32[]) parameter(0)
+  p_body.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=0
+  const = f32[] constant(1)
+  add = f32[] add(p_body.0, const)
+  p_body.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=1
+  p_body.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=2
+  ROOT root = (f32[],f32[],f32[]) tuple(add, p_body.1, p_body.2)
+}
+
+condition {
+  p_cond = (f32[],f32[],f32[]) parameter(0)
+  p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0
+  p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1
+  p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2
+  ROOT result = pred[] less-than(p_cond.0, p_cond.1)
+}
+
+ENTRY entry {
+  const_0 = f32[] constant(0)
+  const_1 = f32[] constant(10)
+  const_2 = f32[] constant(12)
+  while_init = (f32[],f32[],f32[]) tuple(const_0, const_1, const_2)
+  ROOT while = (f32[],f32[],f32[]) while(while_init), condition=condition, body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          WhileLoopConstantSinking{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  auto* while_condition = module->GetComputationWithName("condition");
+  EXPECT_THAT(while_condition->root_instruction(), op::Lt(_, op::Constant()));
+  for (const HloInstruction* inst : while_condition->instructions()) {
+    if (inst->opcode() == HloOpcode::kConstant) {
+      EXPECT_GT(inst->user_count(), 0);
+    }
+  }
+}
+
+TEST_F(WhileLoopConstantSinkingTest, ConditionalMultipleSameIndexGTEs) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  p_body = (f32[],f32[],f32[]) parameter(0)
+  p_body.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=0
+  const = f32[] constant(1)
+  add.0 = f32[] add(p_body.0, const)
+  p_body.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=1
+  add.1 = f32[] add(p_body.1, const)
+  p_body.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_body), index=2
+  ROOT root = (f32[],f32[],f32[]) tuple(add.0, add.1, p_body.2)
+}
+
+condition {
+  p_cond = (f32[],f32[],f32[]) parameter(0)
+  p_cond.0 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=0
+  p_cond.2 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2
+  lt.0 = pred[] less-than(p_cond.0, p_cond.2)
+  p_cond.1 = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=1
+  p_cond.2.c = f32[] get-tuple-element((f32[],f32[],f32[]) p_cond), index=2
+  lt.1 = pred[] less-than(p_cond.1, p_cond.2.c)
+  ROOT result = pred[] and(lt.0, lt.1)
+}
+
+ENTRY entry {
+  const_0 = f32[] constant(0)
+  const_1 = f32[] constant(0)
+  const_2 = f32[] constant(12)
+  while_init = (f32[],f32[],f32[]) tuple(const_0, const_1, const_2)
+  ROOT while = (f32[],f32[],f32[]) while(while_init), condition=condition, body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          WhileLoopConstantSinking{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  auto* while_condition = module->GetComputationWithName("condition");
+  EXPECT_THAT(while_condition->root_instruction(),
+              op::And(op::Lt(_, op::Constant()), op::Lt(_, op::Constant())));
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index 9795b2830b6d9add82b89ac76b5438ddc3d2bfe8..41011176ffa91e885bc58364d1fb19617d3518ad 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -19,7 +19,9 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/tuple_util.h"
+#include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
@@ -143,6 +145,12 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
   string while_instr_name = while_instr->ToString(print_no_metadata);
   VLOG(2) << "Trying to hoist from " << while_instr_name;
 
+  auto maybe_upper_bound = ComputeWhileLoopTripCountUpperBound(while_instr);
+  if (maybe_upper_bound && *maybe_upper_bound <= 1) {
+    VLOG(2) << "Loop has a trip count of at most 1, skipping.";
+    return false;
+  }
+
   HloComputation* while_body = while_instr->while_body();
 
   // Maps instructions in the while body to instructions hoisted outside the
@@ -180,6 +188,13 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
     return false;
   }
 
+  // LICM in the presence of domain instructions is complex, bail.
+  for (auto* instruction : while_body->MakeInstructionPostOrder()) {
+    if (instruction->opcode() == HloOpcode::kDomain) {
+      return false;
+    }
+  }
+
   // instructions_to_replace[i] is hoisted into a loop invariant instruction
   // replacement_instructions[i].
   std::vector<HloInstruction*> instructions_to_replace;
@@ -193,6 +208,37 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
       continue;
     }
 
+    if (!hoist_size_inflating_ops_) {
+      // Check that hoisting the instruction doesn't cause a significant memory
+      // blow-up. LICM extends the live-range of the output of the hoisted
+      // instruction to be the entire while loop, which may be problematic on
+      // platforms where memory is limited. This can be especially harmful if
+      // the instruction has a significantly larger output than its input, e.g.
+      // kIota, kBroadcast or kConstant.
+      int64 input_size = 0, output_size = 0;
+
+      for (auto* operand : instruction->operands()) {
+        ShapeUtil::ForEachSubshape(
+            operand->shape(),
+            [&input_size](const Shape& subshape, const ShapeIndex& /*index*/) {
+              if (ShapeUtil::IsArray(subshape)) {
+                input_size += ShapeUtil::ByteSizeOfElements(subshape);
+              }
+            });
+      }
+      ShapeUtil::ForEachSubshape(
+          instruction->shape(),
+          [&output_size](const Shape& subshape, const ShapeIndex& /*index*/) {
+            if (ShapeUtil::IsArray(subshape)) {
+              output_size += ShapeUtil::ByteSizeOfElements(subshape);
+            }
+          });
+
+      if (output_size > input_size) {
+        continue;
+      }
+    }
+
     auto is_invariant = [&](HloInstruction* op) {
       return hoisted_instructions.find(op) != hoisted_instructions.end() ||
              unhoisted_invariant_instructions.count(op) ||
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
index 3031899f71e0fd77f20448d9d7489798af01615c..bd6232dc0a988775a0490abbf6125daad8476295 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
@@ -34,8 +34,14 @@ class WhileLoopInvariantCodeMotion : public HloModulePass {
   // Setting `hoist_constants` to false can be help if LICM is run in the mid
   // level HLO pipeline because hoisting constants out of while loop bodies can
   // break optimizations like constant folding.
-  explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false)
-      : hoist_constants_(hoist_constants) {}
+  // Setting `hoist_size_inflating_ops` to false will forbid hoisting
+  // instructions where the size of the output(s) is larger than the size of the
+  // input(s). This is useful on platforms on which it's important to prevent
+  // blow-ups in memory size.
+  explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false,
+                                        bool hoist_size_inflating_ops = true)
+      : hoist_constants_(hoist_constants),
+        hoist_size_inflating_ops_(hoist_size_inflating_ops) {}
   ~WhileLoopInvariantCodeMotion() override = default;
 
   absl::string_view name() const override {
@@ -49,6 +55,7 @@ class WhileLoopInvariantCodeMotion : public HloModulePass {
       HloInstruction* while_instr);
 
   bool hoist_constants_;
+  bool hoist_size_inflating_ops_;
 };
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
index 32e69c335b713c438bd7fcb2053709b0624f58ed..8e7c4bc8828552e197b41f874c070d496b85a382 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
@@ -26,7 +26,7 @@ namespace {
 
 namespace op = xla::testing::opcode_matchers;
 
-class WhileLoopInvariantCodeMotionTest : public HloVerifiedTestBase {
+class WhileLoopInvariantCodeMotionTest : public HloTestBase {
  public:
   // Makes a computation which has one parameter, of the given shape, and always
   // returns PRED[]{true}.  This is useful as a dummy loop condition.
@@ -58,6 +58,7 @@ HloComputation* WhileLoopInvariantCodeMotionTest::MakeAlwaysTrueComputation(
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
@@ -76,19 +77,18 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, add_result}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
-  HloComputation* entry_computation =
-      module().AddEntryComputation(builder.Build());
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
+  HloComputation* entry_computation = m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_TRUE(simplified_loop);
 
   HloInstruction* transformed_while;
@@ -100,6 +100,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistOneInvariantOperation) {
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
@@ -135,19 +136,18 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, divide_result}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
-  HloComputation* entry_computation =
-      module().AddEntryComputation(builder.Build());
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
+  HloComputation* entry_computation = m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_TRUE(simplified_loop);
 
   HloInstruction* transformed_while;
@@ -173,6 +173,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistInvariantOperationTree) {
 TEST_F(WhileLoopInvariantCodeMotionTest,
        DontHoistTriviallyLoopVaryingComputation) {
   // Basic negative test: the add expression is not loop invariant.
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
 
@@ -189,20 +190,20 @@ TEST_F(WhileLoopInvariantCodeMotionTest,
             scalar_s32, HloOpcode::kAdd, gte_0, gte_1));
     builder.AddInstruction(HloInstruction::CreateTuple({gte_0, add_result}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
 
-  module().AddEntryComputation(builder.Build());
+  m->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 
   EXPECT_THAT(while_inst->while_body()->instructions(), Contains(op::Add()));
@@ -210,6 +211,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest,
 
 TEST_F(WhileLoopInvariantCodeMotionTest,
        DontHoistLoopVaryingComputationWithAlternatingTuples) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
@@ -228,25 +230,26 @@ TEST_F(WhileLoopInvariantCodeMotionTest,
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_1, gte_0, add_result}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
 
-  module().AddEntryComputation(builder.Build());
+  m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 
   EXPECT_THAT(while_inst->while_body()->instructions(), Contains(op::Add()));
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   auto token_shape = ShapeUtil::MakeTokenShape();
   Shape while_shape =
@@ -267,7 +270,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, out_token}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
@@ -277,14 +280,14 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) {
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateTuple({scalar_param, scalar_param, token}));
   auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
   builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_s32, while_inst, 0));
-  module().AddEntryComputation(builder.Build());
+  m->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   ASSERT_FALSE(simplified_loop);
 
   EXPECT_THAT(while_inst->while_body()->instructions(),
@@ -294,6 +297,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistInstructionWithSideEffects) {
 TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
   // The bitcast's user, an outfeed, can't be hoisted, so don't hoist the
   // bitcast either.
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
   auto token_shape = ShapeUtil::MakeTokenShape();
@@ -317,7 +321,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, out_token}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
@@ -327,15 +331,15 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateTuple({scalar_param, scalar_param, token}));
   auto* while_inst = builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
   builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(scalar_s32, while_inst, 0));
 
-  module().AddEntryComputation(builder.Build());
+  m->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 
   EXPECT_THAT(while_inst->while_body()->instructions(),
@@ -346,6 +350,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
 
 TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
   // The bitcast's user can be hoisted, so hoist the bitcast too.
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
   Shape while_shape =
@@ -367,21 +372,20 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, add_inst}));
 
-    return module().AddEmbeddedComputation(builder.Build());
+    return m->AddEmbeddedComputation(builder.Build());
   }();
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
 
-  HloComputation* entry_computation =
-      module().AddEntryComputation(builder.Build());
+  HloComputation* entry_computation = m->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_TRUE(simplified_loop);
 
   HloInstruction* transformed_while;
@@ -396,6 +400,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistControlDependencies) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, scalar_s32});
@@ -416,22 +421,23 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistControlDependencies) {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, add_result}));
 
-    while_body = module().AddEmbeddedComputation(builder.Build());
+    while_body = m->AddEmbeddedComputation(builder.Build());
   }
 
   HloComputation::Builder builder(TestName());
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
-  module().AddEntryComputation(builder.Build());
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
+  m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) {
+  auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   Shape while_shape = ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32});
 
@@ -439,7 +445,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) {
     HloComputation::Builder builder(TestName() + ".passthrough");
     HloInstruction* param = builder.AddInstruction(
         HloInstruction::CreateParameter(0, while_shape, "param"));
-    HloComputation* result = module().AddEmbeddedComputation(builder.Build());
+    HloComputation* result = m->AddEmbeddedComputation(builder.Build());
 
     result->AddInstruction(
         HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
@@ -450,11 +456,11 @@ TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) {
   auto* init_value = builder.AddInstruction(
       HloInstruction::CreateParameter(0, while_shape, "init_value"));
   builder.AddInstruction(HloInstruction::CreateWhile(
-      while_shape, MakeAlwaysTrueComputation(while_shape, &module()),
-      while_body, init_value));
-  module().AddEntryComputation(builder.Build());
+      while_shape, MakeAlwaysTrueComputation(while_shape, m.get()), while_body,
+      init_value));
+  m->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 }
 
@@ -482,14 +488,14 @@ ENTRY entry {
 )";
 
 TEST_F(WhileLoopInvariantCodeMotionTest, HoistsConstantWhenAsked) {
-  ParseAndVerifyModule(kConstantHoistingTestCase);
+  auto m = ParseAndReturnVerifiedModule(kConstantHoistingTestCase).ValueOrDie();
 
   TF_ASSERT_OK_AND_ASSIGN(
       bool simplified_loop,
-      WhileLoopInvariantCodeMotion{/*hoist_constants=*/true}.Run(&module()));
+      WhileLoopInvariantCodeMotion{/*hoist_constants=*/true}.Run(m.get()));
   EXPECT_TRUE(simplified_loop);
 
-  HloComputation* while_body = module().GetComputationWithName("wide.body");
+  HloComputation* while_body = m->GetComputationWithName("wide.body");
   ASSERT_NE(while_body, nullptr);
 
   // We expect the while body to be the equivalent of:
@@ -523,10 +529,98 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistsConstantWhenAsked) {
 }
 
 TEST_F(WhileLoopInvariantCodeMotionTest, DoesNotHoistConstantByDefault) {
-  ParseAndVerifyModule(kConstantHoistingTestCase);
+  auto m = ParseAndReturnVerifiedModule(kConstantHoistingTestCase).ValueOrDie();
 
   TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
-                          WhileLoopInvariantCodeMotion{}.Run(&module()));
+                          WhileLoopInvariantCodeMotion{}.Run(m.get()));
+  EXPECT_FALSE(simplified_loop);
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, DoNotHoistOutOfSingleIteration) {
+  const char* const kHloModule = R"(
+    HloModule ModuleWithWhile
+
+    body {
+      p_body = (f32[2], f32[2], f32[2], s32[]) parameter(0)
+      val.0 = f32[2] get-tuple-element(p_body), index=0
+      val.1 = f32[2] get-tuple-element(p_body), index=1
+      add = f32[2] add(val.0, val.1)
+      const = s32[] constant(-1)
+      ROOT root = (f32[2], f32[2], f32[2], s32[]) tuple(val.0, val.1, add, const)
+    }
+
+    condition {
+      p_cond = (f32[2], f32[2], f32[2], s32[]) parameter(0)
+      gte = s32[] get-tuple-element(p_cond), index=3
+      const = s32[] constant(42)
+      ROOT result = pred[] equal-to(gte, const)
+    }
+
+    ENTRY entry {
+      param.0 = f32[2] parameter(0)
+      param.1 = s32[] parameter(1)
+      while_init = (f32[2], f32[2], f32[2], s32[]) tuple(param.0, param.0, param.0, param.1)
+      ROOT while = (f32[2], f32[2], f32[2], s32[]) while(while_init), condition=condition, body=body
+    })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHloModule));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop,
+                          WhileLoopInvariantCodeMotion{}.Run(module.get()));
+  EXPECT_FALSE(simplified_loop);
+}
+
+const char* const kInflatingTestCase = R"(
+HloModule ModuleWithWhile
+
+mul {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT mul = f32[] multiply(lhs, rhs)
+}
+
+body {
+  p_body = (f32[]) parameter(0)
+  iota = f32[1024, 1024] iota(), iota_dimension=0
+  add = f32[1024, 1024] add(iota, iota)
+  constant = f32[] constant(1.0)
+  reduce = f32[] reduce(f32[1024, 1024] add, f32[] constant), dimensions={0,1}, to_apply=mul
+  ROOT root = (f32[]) tuple(reduce)
+}
+
+condition {
+  p_cond = (f32[]) parameter(0)
+  ROOT result = pred[] constant(true)
+}
+
+ENTRY entry {
+  param = f32[] parameter(0)
+  while_init = (f32[]) tuple(param)
+  ROOT while = (f32[]) while(while_init), condition=condition, body=body
+}
+)";
+
+TEST_F(WhileLoopInvariantCodeMotionTest, HoistsInflatingByDefault) {
+  auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool simplified_loop,
+      WhileLoopInvariantCodeMotion(/*hoist_constants=*/true).Run(m.get()));
+  EXPECT_TRUE(simplified_loop);
+
+  HloComputation* while_body = m->GetComputationWithName("wide.body");
+  ASSERT_NE(while_body, nullptr);
+  EXPECT_THAT(while_body->instructions(), Not(Contains(op::Iota())));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, NoHoistInflating) {
+  auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool simplified_loop,
+      WhileLoopInvariantCodeMotion(/*hoist_constants=*/true,
+                                   /*hoist_size_inflating_ops=*/false)
+          .Run(m.get()));
   EXPECT_FALSE(simplified_loop);
 }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 630d71e5ca25e9d282ce6283284a32d6f725a193..d30f67dd8110b88166fe807762fb653190ec00bc 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -19,41 +19,19 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 
 namespace xla {
 
+namespace m = match;
 using absl::optional;
-
-// Determines whether the given instruction is a send/recv node, or has a
-// subcomputation which contains a send/recv node.
-static bool IsOrContainsSendOrRecv(const HloInstruction* instr);
-
-// Determines whether the given computation contains a send or recv node.
-static bool ContainsSendOrRecv(const HloComputation* comp) {
-  for (const auto* instr : comp->instructions()) {
-    if (IsOrContainsSendOrRecv(instr)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-static bool IsOrContainsSendOrRecv(const HloInstruction* instr) {
-  if (instr->opcode() == HloOpcode::kSend ||
-      instr->opcode() == HloOpcode::kSendDone ||
-      instr->opcode() == HloOpcode::kRecv ||
-      instr->opcode() == HloOpcode::kRecvDone) {
-    return true;
-  }
-  for (const auto& subcomp : instr->called_computations()) {
-    if (ContainsSendOrRecv(subcomp)) {
-      return true;
-    }
-  }
-  return false;
-}
+using hlo_query::ContainsInstrWithOpcode;
 
 // Tries to remove elements in a while loop's tuple that aren't used within the
 // loop.
@@ -253,7 +231,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // Create the new while condition, body, and init value.
   std::unique_ptr<HloComputation> new_while_cond =
       while_cond->CloneWithReplacements(
-          make_while_computation_replacements(while_cond), /*extras=*/{});
+          make_while_computation_replacements(while_cond));
 
   std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       while_body_replacements = make_while_computation_replacements(while_body);
@@ -266,8 +244,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   while_body_replacements.emplace(
       while_body_root, HloInstruction::CreateTuple(new_while_body_root_elems));
   std::unique_ptr<HloComputation> new_while_body =
-      while_body->CloneWithReplacements(std::move(while_body_replacements),
-                                        /*extras=*/{});
+      while_body->CloneWithReplacements(std::move(while_body_replacements));
 
   // Add a new while_init instruction that repackages the old while_init
   // instruction's elements.  We rely on the AlgebraicSimplifier and DCE to
@@ -329,6 +306,147 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   return true;
 }
 
+// Removes each loop parameter (i.e. member of the while loop tuple) that is a
+// constant and is the same in the while loop body and the while loop init.
+static StatusOr<bool> TryRemoveConstantParams(HloInstruction* while_op) {
+  HloModule* module = while_op->GetModule();
+  HloComputation* computation = while_op->parent();
+  auto* while_init = while_op->mutable_operand(0);
+  auto* while_body = while_op->while_body();
+  auto* while_cond = while_op->while_condition();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_init->opcode() != HloOpcode::kTuple ||
+      while_body_root->opcode() != HloOpcode::kTuple) {
+    return false;
+  }
+
+  TF_RET_CHECK(while_cond->num_parameters() == 1);
+  TF_RET_CHECK(while_body->num_parameters() == 1);
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(while_init->shape(), while_body_root->shape()));
+
+  absl::flat_hash_set<int64> constant_tuple_indices;
+  const auto& while_shape = while_init->shape();
+  for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+    auto* init_elem = while_init->operand(i);
+    auto* body_elem = while_body_root->operand(i);
+    if (init_elem->opcode() == HloOpcode::kConstant &&
+        body_elem->opcode() == HloOpcode::kConstant &&
+        init_elem->literal() == body_elem->literal()) {
+      constant_tuple_indices.insert(i);
+    }
+  }
+
+  if (constant_tuple_indices.empty()) {
+    return false;
+  }
+
+  // OK, we found some constant elements of the while parameter!  Eliminate
+  // them.
+  std::vector<Shape> new_while_shape_elems;
+  for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+    if (!constant_tuple_indices.count(i)) {
+      new_while_shape_elems.push_back(while_shape.tuple_shapes(i));
+    }
+  }
+  Shape new_while_shape = ShapeUtil::MakeTupleShape(new_while_shape_elems);
+
+  // `new_instrs` holds instructions created outside of a computation for
+  // cloning.  Elements added here just need to live until the end of the
+  // relevant CloneWithReplacement call.
+  std::vector<std::unique_ptr<HloInstruction>> new_instrs;
+  auto add_new_instr = [&](std::unique_ptr<HloInstruction> instr) {
+    new_instrs.push_back(std::move(instr));
+    return new_instrs.back().get();
+  };
+
+  // Returns a new tuple without the elements of constant_tuple_indices.
+  auto remove_constant_elems = [&](HloInstruction* instr) {
+    CHECK(ShapeUtil::Compatible(instr->shape(), while_shape));
+
+    std::vector<HloInstruction*> tuple_elems;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      if (!constant_tuple_indices.count(i)) {
+        tuple_elems.push_back(
+            add_new_instr(HloInstruction::CreateGetTupleElement(
+                while_shape.tuple_shapes(i), instr, i)));
+      }
+    }
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  auto add_constant_elems = [&](HloInstruction* instr) {
+    CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape));
+
+    std::vector<HloInstruction*> tuple_elems;
+    int64 j = 0;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      if (constant_tuple_indices.count(i)) {
+        tuple_elems.push_back(while_init->mutable_operand(i));
+      } else {
+        tuple_elems.push_back(
+            add_new_instr(HloInstruction::CreateGetTupleElement(
+                while_shape.tuple_shapes(i), instr, j)));
+        ++j;
+      }
+    }
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  // Special case: constant_tuple_indices covers the whole while parameter, so
+  // the new while shape is the empty tuple.  In this case, the value of the
+  // while loop is simply equal to the value of `init`.
+  //
+  // It's unfortunate to special-case this, but it's simpler than the
+  // alternative.  The problem is that if our while parameter has no
+  // non-constant elems, the tuple returned by `add_constant_elems` won't depend
+  // on instr (the loop body/cond parameter), and therefore
+  // CloneWithReplacementPairs will *leave the parameter out entirely*, creating
+  // invalid HLO.
+  if (ShapeUtil::IsEmptyTuple(new_while_shape)) {
+    TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, while_init));
+    return true;
+  }
+
+  std::unique_ptr<HloComputation> new_while_cond =
+      while_cond->CloneWithReplacementPairs({
+          while_cond->parameter_instruction(0),
+          add_constant_elems(add_new_instr(HloInstruction::CreateParameter(
+              0, new_while_shape,
+              while_cond->parameter_instruction(0)->name()))),
+      });
+
+  std::unique_ptr<HloComputation> new_while_body =
+      while_body->CloneWithReplacementPairs(
+          {
+              while_body->parameter_instruction(0),
+              add_constant_elems(add_new_instr(HloInstruction::CreateParameter(
+                  0, new_while_shape,
+                  while_cond->parameter_instruction(0)->name()))),
+          },
+          {
+              while_body->root_instruction(),
+              remove_constant_elems(
+                  add_new_instr(while_body->root_instruction()->Clone())),
+          });
+
+  // Create the final while loop, and add any new instructions created to
+  // `computation`.
+  new_instrs.clear();
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      while_op,
+      add_constant_elems(
+          computation->AddInstruction(HloInstruction::CreateWhile(
+              new_while_shape,
+              module->AddEmbeddedComputation(std::move(new_while_cond)),
+              module->AddEmbeddedComputation(std::move(new_while_body)),
+              add_new_instr(remove_constant_elems(while_init)))))));
+  for (auto& instr : new_instrs) {
+    computation->AddInstruction(std::move(instr));
+  }
+  return true;
+}
+
 // Tries to remove a while loop from the graph.
 //
 //  - Loops with trip count of 0 can be replaced by the loop's "init" value.
@@ -408,16 +526,14 @@ static StatusOr<bool> TryPropagateConstant(HloInstruction* while_op) {
   // performance by forcing us to copy constants.
   absl::flat_hash_map<int, const HloInstruction*> index_to_constant;
   for (int i = 0; i < root_operands.size(); i++) {
-    HloInstruction* instr = root_operands[i];
-    if (instr->opcode() == HloOpcode::kGetTupleElement &&
-        instr->tuple_index() == i && instr->operand(0) == while_body_param &&
-        ShapeUtil::IsScalar(instr->shape())) {
-      auto tuple_element = while_init->operand(i);
-      if (tuple_element->IsConstant()) {
-        VLOG(3) << "Found loop invariant tuple element " << i << " "
-                << tuple_element->ToString();
-        index_to_constant[i] = tuple_element;
-      }
+    const HloInstruction* init_tuple_elem = nullptr;
+    if (Match(root_operands[i],
+              m::GetTupleElement(m::Op().Is(while_body_param), i)
+                  .WithShape(m::Shape().IsScalar())) &&
+        Match(while_init->operand(i), m::Constant(&init_tuple_elem))) {
+      VLOG(3) << "Found loop invariant tuple element " << i << " "
+              << init_tuple_elem->ToString();
+      index_to_constant[i] = init_tuple_elem;
     }
   }
 
@@ -458,6 +574,409 @@ static StatusOr<bool> TryPropagateConstant(HloInstruction* while_op) {
   return changed_cond || changed_body;
 }
 
+// Converts a flat list of instructions into a tuple of the desired shape.  For
+// example, given a tuple shape ((x, x), x) and instructions {A, B, C}, returns
+// a tuple of value ((A, B), C).
+//
+// desired_shape must be a tuple.  (This precondition allows us to return a
+// unique_ptr rather than a raw ptr.)
+static std::unique_ptr<HloInstruction> UnflattenTupleInstr(
+    absl::Span<HloInstruction*> instrs, const Shape& desired_shape,
+    std::vector<std::unique_ptr<HloInstruction>>* new_instrs) {
+  CHECK(ShapeUtil::IsTuple(desired_shape))
+      << ShapeUtil::HumanString(desired_shape);
+
+  // For each child shape in `desired_shape`, slice out the correct number of
+  // `instrs` and call UnflattenTupleInstr recursively.  At each step we remove
+  // elements from `instrs` so that it only contains instructions we have not
+  // yet processed.
+  std::vector<HloInstruction*> elems;
+  for (int64 i = 0; i < desired_shape.tuple_shapes_size(); ++i) {
+    const Shape& subshape = desired_shape.tuple_shapes(i);
+    if (!ShapeUtil::IsTuple(subshape)) {
+      elems.push_back(instrs[0]);
+      instrs.remove_prefix(1);
+      continue;
+    }
+
+    // Count the number of leaf nodes underneath desired_shape[i].
+    int64 num_leaves = 0;
+    ShapeUtil::ForEachSubshape(
+        subshape, [&](const Shape& s, const ShapeIndex& /*index*/) {
+          if (!ShapeUtil::IsTuple(s)) {
+            ++num_leaves;
+          }
+        });
+
+    std::unique_ptr<HloInstruction> subinstr =
+        UnflattenTupleInstr(instrs.subspan(0, num_leaves),
+                            desired_shape.tuple_shapes(i), new_instrs);
+    elems.push_back(subinstr.get());
+    new_instrs->push_back(std::move(subinstr));
+    instrs.remove_prefix(num_leaves);
+  }
+  return HloInstruction::CreateTuple(elems);
+}
+
+// Builds a vector whose elements are the values in the flattened tuple for
+// `instr`.  For example, if `instr` is a tuple of form ((A, B), C), returns the
+// vector {A, B, C} (or kGetTupleElement ops which point to A, B, and C).
+static std::vector<HloInstruction*> GetFlatTupleElems(
+    HloInstruction* instr,
+    std::vector<std::unique_ptr<HloInstruction>>* new_instrs) {
+  const auto& shape = instr->shape();
+  if (!ShapeUtil::IsTuple(shape)) {
+    return {instr};
+  }
+  std::vector<HloInstruction*> elems;
+  for (int64 i = 0; i < shape.tuple_shapes_size(); ++i) {
+    const Shape& subshape = shape.tuple_shapes(i);
+    new_instrs->push_back(
+        HloInstruction::CreateGetTupleElement(subshape, instr, i));
+    auto* gte = new_instrs->back().get();
+    auto flattened_subshape = GetFlatTupleElems(gte, new_instrs);
+    elems.insert(elems.end(), flattened_subshape.begin(),
+                 flattened_subshape.end());
+  }
+  return elems;
+}
+
+static StatusOr<bool> TryFlattenNestedTuples(HloInstruction* while_op) {
+  HloModule* module = while_op->GetModule();
+  HloComputation* computation = while_op->parent();
+  auto* while_init = while_op->mutable_operand(0);
+  auto* while_body = while_op->while_body();
+  auto* while_cond = while_op->while_condition();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_init->opcode() != HloOpcode::kTuple ||
+      while_body_root->opcode() != HloOpcode::kTuple) {
+    return false;
+  }
+
+  TF_RET_CHECK(while_cond->num_parameters() == 1);
+  TF_RET_CHECK(while_body->num_parameters() == 1);
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(while_init->shape(), while_body_root->shape()));
+  Shape while_shape = while_init->shape();
+  if (!ShapeUtil::IsNestedTuple(while_shape)) {
+    return false;
+  }
+
+  std::vector<Shape> flattened_shape_elems;
+  ShapeUtil::ForEachSubshape(while_shape,
+                             [&](const Shape& s, const ShapeIndex& /*index*/) {
+                               if (!ShapeUtil::IsTuple(s)) {
+                                 flattened_shape_elems.push_back(s);
+                               }
+                             });
+  Shape flattened_shape = ShapeUtil::MakeTupleShape(flattened_shape_elems);
+
+  // `new_instrs` holds instructions created outside of a computation for
+  // cloning.  Elements added here just need to live until the end of the
+  // relevant CloneWithReplacement call.
+  std::vector<std::unique_ptr<HloInstruction>> new_instrs;
+  auto add_new_instr = [&](std::unique_ptr<HloInstruction> instr) {
+    new_instrs.push_back(std::move(instr));
+    return new_instrs.back().get();
+  };
+
+  auto nested = [&](HloInstruction* instr) {
+    std::vector<HloInstruction*> gtes;
+    const Shape& flat_shape = instr->shape();
+    for (int64 i = 0; i < flat_shape.tuple_shapes_size(); ++i) {
+      gtes.push_back(add_new_instr(HloInstruction::CreateGetTupleElement(
+          flat_shape.tuple_shapes(i), instr, i)));
+    }
+    auto nested_instr =
+        UnflattenTupleInstr(absl::MakeSpan(gtes), while_shape, &new_instrs);
+    CHECK(ShapeUtil::Compatible(nested_instr->shape(), while_shape))
+        << ShapeUtil::HumanString(nested_instr->shape()) << " vs "
+        << ShapeUtil::HumanString(while_shape);
+    return nested_instr;
+  };
+
+  auto flattened = [&](HloInstruction* instr) {
+    return HloInstruction::CreateTuple(GetFlatTupleElems(instr, &new_instrs));
+  };
+
+  // Create a new while-condition computation, where parameter 0 has flat shape
+  // but all uses of it go through the nested shape.
+  std::unique_ptr<HloComputation> new_while_cond =
+      while_cond->CloneWithReplacementPairs({
+          while_cond->parameter_instruction(0),
+          nested(add_new_instr(HloInstruction::CreateParameter(
+              0, flattened_shape,
+              while_cond->parameter_instruction(0)->name()))),
+      });
+
+  // Create a new while-body computation, where parameter 0 has a flat shape and
+  // all uses of it go through the nested shape, and where the root has a flat
+  // shape constructed from the old nested root.
+  std::unique_ptr<HloComputation> new_while_body =
+      while_body->CloneWithReplacementPairs(
+          {
+              while_body->parameter_instruction(0),
+              nested(add_new_instr(HloInstruction::CreateParameter(
+                  0, flattened_shape,
+                  while_body->parameter_instruction(0)->name()))),
+          },
+          {
+              while_body->root_instruction(),
+              flattened(add_new_instr(while_body->root_instruction()->Clone())),
+          });
+
+  // Create the final while loop, and add any new instructions created to
+  // `computation`.
+  new_instrs.clear();
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      while_op, nested(computation->AddInstruction(HloInstruction::CreateWhile(
+                    flattened_shape,
+                    module->AddEmbeddedComputation(std::move(new_while_cond)),
+                    module->AddEmbeddedComputation(std::move(new_while_body)),
+                    computation->AddInstruction(flattened(while_init)))))));
+  for (auto& instr : new_instrs) {
+    computation->AddInstruction(std::move(instr));
+  }
+  return true;
+}
+
+// Tries to merge loop induction variables of a given type.
+//
+// In this pass we're only concerned with elements of the loop's tuple that
+// are effective-scalars of type `elem_ty`.  Some terminology:
+//
+//  - The trip counter is the first element of the loop's tuple that starts at
+//    0 and does x++ on each iteration.
+//
+//  - An induction variable is an element of the loop's tuple that is not the
+//    trip counter and does `x += <constant>` on each iteration of the loop.
+//    Negative constants are OK.
+//
+// This pass adds a trip counter if one isn't already present, then replaces
+// each induction variable with
+//
+//   <initial_value> + <trip_count> * <constant>.
+//
+// This reduces the number of scalar operations in the loop, which is important
+// e.g. on GPUs, where each scalar operation is nontrivially expensive because
+// it's a separate kernel launch.
+//
+// Returns the new loop if a change was made, or null if no change was made.
+// Note that the new loop is not a valid replacement for the old loop; it may
+// need to be wrapped in a tuple that changes its shape.  We return the loop
+// itself so that you can call TryMergeInductionVariables in a loop, once for
+// each integral type elem_ty.
+static StatusOr<HloInstruction*> TryMergeInductionVariables(
+    HloInstruction* while_op, PrimitiveType elem_ty) {
+  CHECK(primitive_util::IsIntegralType(elem_ty)) << PrimitiveType_Name(elem_ty);
+  HloModule* module = while_op->GetModule();
+  HloComputation* computation = while_op->parent();
+  auto* while_init = while_op->mutable_operand(0);
+  auto* while_body = while_op->while_body();
+  auto* while_cond = while_op->while_condition();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_init->opcode() != HloOpcode::kTuple ||
+      while_body_root->opcode() != HloOpcode::kTuple) {
+    return nullptr;
+  }
+
+  TF_RET_CHECK(while_cond->num_parameters() == 1);
+  TF_RET_CHECK(while_body->num_parameters() == 1);
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(while_init->shape(), while_body_root->shape()));
+  Shape while_shape = while_init->shape();
+
+  // The tuple index of the trip counter, if one is present.
+  absl::optional<int64> trip_counter;
+  // Maps the tuple index of each induction variable to its constant increment.
+  absl::flat_hash_map<int64, const HloConstantInstruction*> induction_vars;
+  for (int64 i = 0; i < while_body_root->operand_count(); ++i) {
+    HloInstruction* constant;
+    if (!Match(while_body_root->mutable_operand(i),
+               m::AddAnyOrder(m::GetTupleElement(m::Parameter(), i),
+                              m::ConstantScalar(&constant))
+                   .WithShape(m::Shape().WithElementType(elem_ty)))) {
+      continue;
+    }
+    if (!trip_counter && constant->literal().IsAll(1) &&
+        while_init->operand(i)->IsConstant() &&
+        while_init->operand(i)->literal().IsAll(0)) {
+      VLOG(10) << "Found existing trip counter at index " << i;
+      trip_counter = i;
+    } else {
+      VLOG(10) << "Found induction variable at index " << i;
+      induction_vars.emplace(i, Cast<HloConstantInstruction>(constant));
+    }
+  }
+
+  // There's only something to simplify if we can either:
+  //
+  //  - combine one or more induction vars with an existing trip counter, or
+  //  - replace two or more induction variables with a new trip counter.
+  //
+  // Put another way, there's only something to simplify if the number of
+  // induction vars plus the number of existing trip counters (0 or 1) is >= 2.
+  if (induction_vars.size() + (trip_counter.has_value() ? 1 : 0) < 2) {
+    return nullptr;
+  }
+
+  // OK, we're going to do the transformation!  Set up some helpers.
+
+  // `new_instrs` holds instructions created outside of a computation for
+  // cloning.  Elements added here just need to live until the end of the
+  // relevant CloneWithReplacement call.
+  std::vector<std::unique_ptr<HloInstruction>> new_instrs;
+  auto add_new_instr = [&](std::unique_ptr<HloInstruction> instr) {
+    new_instrs.push_back(std::move(instr));
+    return new_instrs.back().get();
+  };
+
+  auto add_binary_op = [&](const Shape& shape, HloOpcode opcode,
+                           HloInstruction* lhs, HloInstruction* rhs) {
+    // Reshape lhs/rhs to the output shape if necessary.  This deals with the
+    // fact that induction variables need only be effective scalars, not true
+    // scalars.
+    if (!ShapeUtil::Compatible(shape, lhs->shape())) {
+      lhs = add_new_instr(HloInstruction::CreateReshape(shape, lhs));
+    }
+    if (!ShapeUtil::Compatible(shape, rhs->shape())) {
+      rhs = add_new_instr(HloInstruction::CreateReshape(shape, rhs));
+    }
+    return add_new_instr(HloInstruction::CreateBinary(shape, opcode, lhs, rhs));
+  };
+
+  auto add_gte = [&](HloInstruction* src, int64 idx) {
+    return add_new_instr(HloInstruction::CreateGetTupleElement(
+        src->shape().tuple_shapes(idx), src, idx));
+  };
+
+  // Our new while loop will have the same shape as the old while loop, except
+  // we'll add a trip counter to the end if it wasn't originally present.
+  Shape new_while_shape = while_shape;
+  bool added_trip_counter = false;
+  if (!trip_counter) {
+    VLOG(10) << "Adding new trip counter to end of loop's tuple.";
+    trip_counter = new_while_shape.tuple_shapes_size();
+    *new_while_shape.add_tuple_shapes() =
+        ShapeUtil::MakeShape(elem_ty, /*dimensions=*/{});
+    added_trip_counter = true;
+  }
+
+  // Converts `instr` into a tuple of the "old" form -- that is, to a tuple with
+  // shape `while_body->shape()` and where the induction variables are "reified"
+  // (i.e. they have value <init> + <counter> * <constant>).
+  auto convert_to_old_form = [&](HloInstruction* instr) {
+    CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape));
+    std::vector<HloInstruction*> tuple_elems;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      const auto& elem_shape = while_shape.tuple_shapes(i);
+      if (!induction_vars.count(i)) {
+        tuple_elems.push_back(add_gte(instr, i));
+        continue;
+      }
+      tuple_elems.push_back(add_binary_op(
+          elem_shape, HloOpcode::kAdd, add_gte(instr, i),
+          add_binary_op(elem_shape, HloOpcode::kMultiply,
+                        add_gte(instr, *trip_counter),
+                        add_new_instr(induction_vars.at(i)->Clone()))));
+    }
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  // Converts `root` into a tuple of the "new" form -- that is, to a tuple with
+  // shape `new_while_shape` and where the induction variables (but not trip
+  // counters) are replaced with their unchanging <loop_body_param> values.
+  auto convert_to_new_form = [&](HloInstruction* old_root,
+                                 HloParameterInstruction* loop_body_param) {
+    CHECK(ShapeUtil::Compatible(old_root->shape(), while_shape));
+    std::vector<HloInstruction*> tuple_elems;
+
+    // In the new form, induction variables come from `init`, everything else
+    // (including the trip counter if it's not one we created ourselves) comes
+    // from the `root` tuple unmodified.
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      tuple_elems.push_back(
+          add_gte((induction_vars.count(i) ? loop_body_param : old_root), i));
+    }
+    // If we created a trip counter ourselves, add 1 to it in the next
+    // iteration.
+    if (added_trip_counter) {
+      tuple_elems.push_back(add_binary_op(
+          new_while_shape.tuple_shapes(*trip_counter), HloOpcode::kAdd,
+          add_gte(loop_body_param, *trip_counter),
+          add_new_instr(
+              HloInstruction::CreateConstant(LiteralUtil::One(elem_ty)))));
+    }
+
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  // Creates a new init tuple, which is the same as the old init tuple except if
+  // we added a trip counter, it's set to 0.
+  auto get_new_while_init = [&](HloInstruction* init) {
+    CHECK(ShapeUtil::Compatible(init->shape(), while_shape));
+    if (!added_trip_counter) {
+      return init;
+    }
+    std::vector<HloInstruction*> tuple_elems;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      tuple_elems.push_back(add_gte(init, i));
+    }
+    tuple_elems.push_back(add_new_instr(
+        HloInstruction::CreateConstant(LiteralUtil::Zero(elem_ty))));
+    return add_new_instr(HloInstruction::CreateTuple(tuple_elems));
+  };
+
+  std::unique_ptr<HloComputation> new_while_cond =
+      while_cond->CloneWithReplacementPairs({
+          while_cond->parameter_instruction(0),
+          convert_to_old_form(add_new_instr(HloInstruction::CreateParameter(
+              0, new_while_shape,
+              while_cond->parameter_instruction(0)->name()))),
+      });
+
+  // Creating the new while body proceeds in two steps.  First we convert the
+  // users of the parameter to the old form.  Then as a second
+  // CloneWithReplacement operation we convert the root to the new form.  We
+  // have to do this in two steps because the new root needs to use the new
+  // param0, and during the first clone operation, only the *old-form* param0 is
+  // accessible.
+  //
+  // We have to add temp_new_while_body to the module because cloning a
+  // computation touches the module (to get its NameUniquer).
+  HloComputation* temp_new_while_body =
+      module->AddEmbeddedComputation(while_body->CloneWithReplacementPairs({
+          while_body->parameter_instruction(0),
+          convert_to_old_form(add_new_instr(HloInstruction::CreateParameter(
+              0, new_while_shape,
+              while_body->parameter_instruction(0)->name()))),
+      }));
+  std::unique_ptr<HloComputation> new_while_body =
+      temp_new_while_body->CloneWithReplacementPairs({
+          temp_new_while_body->root_instruction(),
+          convert_to_new_form(
+              add_new_instr(temp_new_while_body->root_instruction()->Clone()),
+              Cast<HloParameterInstruction>(
+                  temp_new_while_body->parameter_instruction(0))),
+      });
+  TF_RETURN_IF_ERROR(module->RemoveEmbeddedComputation(temp_new_while_body));
+
+  // Create the final while loop, and add any new instructions created to
+  // `computation`.
+  new_instrs.clear();
+  auto* new_while = computation->AddInstruction(HloInstruction::CreateWhile(
+      new_while_shape,
+      module->AddEmbeddedComputation(std::move(new_while_cond)),
+      module->AddEmbeddedComputation(std::move(new_while_body)),
+      get_new_while_init(while_init)));
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      while_op, convert_to_old_form(new_while)));
+  for (auto& instr : new_instrs) {
+    computation->AddInstruction(std::move(instr));
+  }
+  return new_while;
+}
+
 StatusOr<bool> WhileLoopSimplifier::Run(HloModule* module) {
   XLA_VLOG_LINES(3,
                  "WhileLoopSimplifier::Run(), before:\n" + module->ToString());
@@ -478,32 +997,77 @@ StatusOr<bool> WhileLoopSimplifier::Run(HloModule* module) {
   for (HloInstruction* while_op : while_ops) {
     // We can't remove while loops that contain send/recv nodes, because we rely
     // on the particular loop structure around the node matching on the send and
-    // recv sides.  Removing dead while params requires us to remove the loop
+    // recv sides.  Other while simplifications require us to remove the loop
     // and replace it with a new one, so we can't do that either.
-    if (ContainsSendOrRecv(while_op->while_body()) ||
-        ContainsSendOrRecv(while_op->while_condition())) {
+    if (ContainsInstrWithOpcode(while_op->while_body(),
+                                {HloOpcode::kSend, HloOpcode::kSendDone,
+                                 HloOpcode::kRecv, HloOpcode::kRecvDone}) ||
+        ContainsInstrWithOpcode(while_op->while_condition(),
+                                {HloOpcode::kSend, HloOpcode::kSendDone,
+                                 HloOpcode::kRecv, HloOpcode::kRecvDone})) {
       VLOG(2) << "Not attempting to simplify while loop because it contains a "
                  "send/recv node: "
               << while_op->ToShortString();
       continue;
     }
 
-    StatusOr<bool> result = TryPropagateConstant(while_op);
-    TF_RETURN_IF_ERROR(result.status());
-    changed |= result.ValueOrDie();
+    TF_ASSIGN_OR_RETURN(bool result, TryPropagateConstant(while_op));
+    changed |= result;
+
+    TF_ASSIGN_OR_RETURN(result, TryRemoveWhileLoop(while_op));
+    changed |= result;
+    if (result) {
+      // Don't continue simplifying after successfully removing the while loop
+      // -- that would result in use-after-free nastiness.
+      continue;
+    }
+
+    // TODO(b/119281462): Cowardly refuse to perform any of the following
+    // optimizations in the presence of kDomain instructions.  It seems that
+    // modifying a while loop's tuple doesn't work when kDomain is present.
+    if (ContainsInstrWithOpcode(while_op->while_body(), {HloOpcode::kDomain}) ||
+        ContainsInstrWithOpcode(while_op->while_condition(),
+                                {HloOpcode::kDomain})) {
+      continue;
+    }
+
+    // Each of the optimizations below modifies the while loop itself if it's
+    // successful, meaning that `while_op` is no longer valid after one of these
+    // transformations returns true.
 
-    result = TryRemoveWhileLoop(while_op);
-    TF_RETURN_IF_ERROR(result.status());
-    if (result.ValueOrDie()) {
-      changed = true;
-      // Don't try to remove dead while params after successfully removing the
-      // while loop -- that would result in use-after-free nastiness.
+    TF_ASSIGN_OR_RETURN(result, TryFlattenNestedTuples(while_op));
+    changed |= result;
+    if (result) {
       continue;
     }
 
-    result = TryRemoveDeadWhileParams(while_op);
-    TF_RETURN_IF_ERROR(result.status());
-    changed |= result.ValueOrDie();
+    TF_ASSIGN_OR_RETURN(result, TryRemoveDeadWhileParams(while_op));
+    changed |= result;
+    if (result) {
+      continue;
+    }
+
+    TF_ASSIGN_OR_RETURN(result, TryRemoveConstantParams(while_op));
+    changed |= result;
+    if (result) {
+      continue;
+    }
+
+    bool merged_induction_vars = false;
+    // Notably missing from this list are S16 and U16.  These don't currently
+    // work because S/U16 literals are not implemented.
+    for (auto elem_ty : {S8, U8, S32, U32, S64, U64}) {
+      TF_ASSIGN_OR_RETURN(auto* new_while_op,
+                          TryMergeInductionVariables(while_op, elem_ty));
+      if (new_while_op) {
+        while_op = new_while_op;
+        changed = true;
+        merged_induction_vars = true;
+      }
+    }
+    if (merged_induction_vars) {
+      continue;
+    }
   }
 
   XLA_VLOG_LINES(3,
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h
index 0bc5a0107bbcfb3b29a01d593fb79b89a863e49b..a378f179c63c788cd205ddbb784dee0e6b2106d7 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.h
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h
@@ -25,11 +25,22 @@ namespace xla {
 // HLO pass that makes the following transformations on while loops:
 //
 //  - A while loop with static trip count of 0 is deleted.
+//
 //  - A while loop with static trip count of 1 is replaced by its body (sans
 //    loop).
+//
 //  - Elements of a while loop's tuple that the loop doesn't use are removed
 //    from the tuple.
 //
+//  - If the while loop's parameter is a nested tuple, it's flattened to a
+//    single-level tuple.  This is good because it usually reduces the number of
+//    kTuple instructions, but also because it unlocks additional optimizations
+//    (e.g. removing unused loop parameters).
+//
+// Flattening nested while loop tuples adds a whole mess of likely unnecessary
+// kGetTupleElement and kTuple operations to the graph.  We expect that tuple
+// simplifier will be run afterwards.
+//
 class WhileLoopSimplifier : public HloModulePass {
  public:
   ~WhileLoopSimplifier() override {}
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index 1c892ba179ec67ccc9dbfe93d925551d6977ba15..4950e8269e9cf0723d717bd1734518d104c0c9f2 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -17,28 +17,45 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_replace.h"
+#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
+#include "tensorflow/compiler/xla/service/hlo_cse.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
 
+using ::testing::_;
 namespace op = xla::testing::opcode_matchers;
 
-class WhileLoopSimplifierTest : public HloVerifiedTestBase {
+// Returns the first kWhile instruction within m's entry computation.
+HloInstruction* FindFirstWhile(HloModule* m) {
+  const auto& instrs = m->entry_computation()->instructions();
+  return *absl::c_find_if(instrs, [](const HloInstruction* instr) {
+    return instr->opcode() == HloOpcode::kWhile;
+  });
+}
+
+class WhileLoopSimplifierTest : public HloTestBase {
  protected:
   // Makes an HloModule that contains a loop with `num_iters` iteration.
-  void MakeModuleWithSimpleLoop(int num_iters);
+  TF_MUST_USE_RESULT std::unique_ptr<VerifiedHloModule>
+  MakeModuleWithSimpleLoop(int num_iters);
 
   // Similar to MakeModuleWithSimpleLoop except that the loop bound is passed to
   // the loop-condition through an element of a tuple which is the
   // loop-condition parameter.
-  void MakeModuleWithSimpleLoopTupleElementLoopBound(int num_iters);
+  TF_MUST_USE_RESULT std::unique_ptr<VerifiedHloModule>
+  MakeModuleWithSimpleLoopTupleElementLoopBound(int num_iters);
 };
 
-void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
+std::unique_ptr<VerifiedHloModule>
+WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
   string hlo_string_template = R"(
   HloModule SimpleLoop
   SimpleLoop.body {
@@ -67,10 +84,11 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoop(int num_iters) {
 
   string hlo_string = absl::StrReplaceAll(
       hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}});
-  ParseAndVerifyModule(hlo_string);
+  return ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
 }
 
-void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
+std::unique_ptr<VerifiedHloModule>
+WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
     int num_iters) {
   string hlo_string_template = R"(
   HloModule SimpleLoopWithIndirectLoopBound
@@ -104,60 +122,55 @@ void WhileLoopSimplifierTest::MakeModuleWithSimpleLoopTupleElementLoopBound(
 
   string hlo_string = absl::StrReplaceAll(
       hlo_string_template, {{"{{LOOP_BOUND}}", absl::StrCat(42 + num_iters)}});
-  ParseAndVerifyModule(hlo_string);
+  return ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithZeroIterationSimiplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/0);
-  HloModule* the_module = &module();
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
-  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/0);
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Constant(), op::Constant()));
 }
 
 TEST_F(WhileLoopSimplifierTest,
        LoopWithZeroIterationTupleElementLoopBoundSimplified) {
-  MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/0);
-  HloModule* the_module = &module();
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
-  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+  auto m = MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/0);
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Constant(), op::Constant(), op::Constant()));
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithOneIterationSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
-  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Add(), op::Multiply()));
 }
 
 TEST_F(WhileLoopSimplifierTest,
        LoopWithOneIterationTupleELementLoopBoundSimplified) {
-  MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
-  EXPECT_THAT(the_module->entry_computation()->root_instruction(),
+  auto m = MakeModuleWithSimpleLoopTupleElementLoopBound(/*num_iters=*/1);
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(op::Add(), op::Multiply(), op::Constant()));
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithTwoIterationsNotSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/2);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/2);
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(WhileLoopSimplifierTest,
        LoopWithControlDependencySimplifiedDependencyPreserved) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  HloComputation* computation = the_module->entry_computation();
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloComputation* computation = m->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* true_op = while_op->while_body()->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(true)));
   TF_ASSERT_OK(true_op->AddControlDependencyTo(
       while_op->while_body()->root_instruction()));
-  ASSERT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  ASSERT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction()->control_predecessors(),
               ElementsAre(op::Constant()))
       << computation->ToString();
@@ -166,9 +179,8 @@ TEST_F(WhileLoopSimplifierTest,
 // Loops that contain send/recv nodes can't be simplified; the loop structure
 // around send/recv nodes must be preserved.
 TEST_F(WhileLoopSimplifierTest, LoopWithSendNotSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  HloComputation* computation = the_module->entry_computation();
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloComputation* computation = m->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* while_body = while_op->while_body();
@@ -179,13 +191,12 @@ TEST_F(WhileLoopSimplifierTest, LoopWithSendNotSimplified) {
       token,
       /*channel_id=*/0));
   while_body->AddInstruction(HloInstruction::CreateSendDone(send));
-  EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  HloComputation* computation = the_module->entry_computation();
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloComputation* computation = m->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* while_body = while_op->while_body();
@@ -194,7 +205,7 @@ TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) {
       HloInstruction::CreateRecv(ShapeUtil::MakeShape(F32, {1}), token,
                                  /*channel_id=*/0));
   while_body->AddInstruction(HloInstruction::CreateRecvDone(recv));
-  EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // The limitation on not being able to simplify loops that contain infeeds (and
@@ -202,16 +213,15 @@ TEST_F(WhileLoopSimplifierTest, LoopWithRecvNotSimplified) {
 // fact that our infrastructure sees simplifying such a loop as tantamount to
 // removing the non-removable instruction.
 TEST_F(WhileLoopSimplifierTest, LoopWithInfeedNotSimplified) {
-  MakeModuleWithSimpleLoop(/*num_iters=*/1);
-  HloModule* the_module = &module();
-  HloComputation* computation = the_module->entry_computation();
+  auto m = MakeModuleWithSimpleLoop(/*num_iters=*/1);
+  HloComputation* computation = m->entry_computation();
   auto* while_op = computation->root_instruction();
   ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
   auto* while_body = while_op->while_body();
   auto token = while_body->AddInstruction(HloInstruction::CreateToken());
   while_body->AddInstruction(HloInstruction::CreateInfeed(
       ShapeUtil::MakeShape(F32, {1}), token, "config"));
-  EXPECT_FALSE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // A non-tuple shaped loop shouldn't be simplified or crash the compiler.
@@ -236,8 +246,8 @@ TEST_F(WhileLoopSimplifierTest, NonTupleShapedLoopNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // A while loop that does nothing else besides swapping tuple elements
@@ -268,8 +278,8 @@ TEST_F(WhileLoopSimplifierTest, LoopSwappingTupleElementsNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // Construct a loop where we assign a constant to tuple element 0 in each
@@ -297,8 +307,8 @@ TEST_F(WhileLoopSimplifierTest,
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // Nothing to simplify in a while loop whose tuple has 0 elements.
@@ -320,8 +330,8 @@ TEST_F(WhileLoopSimplifierTest, LoopWithEmptyTupleNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // While loop where one tuple element is used twice in the body, and thus can't
@@ -348,8 +358,8 @@ TEST_F(WhileLoopSimplifierTest, LoopWithElemUsedTwiceNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 // This while loop has three tuple elements.  Element 0 is unused and should be
@@ -390,16 +400,15 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  HloModule* the_module = &module();
-  EXPECT_TRUE(WhileLoopSimplifier().Run(the_module).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 
   // The original while instruction is still left in the module as a dead
   // instruction, find a while instruction with a different name as the new
   // while instruction.
   HloInstruction* new_while_op =
-      *std::find_if(the_module->entry_computation()->instructions().begin(),
-                    the_module->entry_computation()->instructions().end(),
+      *std::find_if(m->entry_computation()->instructions().begin(),
+                    m->entry_computation()->instructions().end(),
                     [&](const HloInstruction* instr) {
                       return (instr->opcode() == HloOpcode::kWhile &&
                               instr->name() != "while");
@@ -440,8 +449,8 @@ TEST_F(WhileLoopSimplifierTest, LoopWithNonTupleBodyShapeNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(WhileLoopSimplifierTest,
@@ -473,8 +482,8 @@ TEST_F(WhileLoopSimplifierTest,
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
 }
 
 TEST_F(WhileLoopSimplifierTest, LoopWithArrayConstantNotSimplified) {
@@ -505,8 +514,233 @@ TEST_F(WhileLoopSimplifierTest, LoopWithArrayConstantNotSimplified) {
   }
   )";
 
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_FALSE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+}
+
+TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) {
+  const string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = ((s32[1]), (s32[2], s32[3], (s32[4]))) parameter(0)
+    ta = (s32[1]) get-tuple-element(param), index=0
+    a = s32[1] get-tuple-element(ta), index=0
+    a.1 = s32[1] add(a, a)
+    tbcd = (s32[2], s32[3], (s32[4])) get-tuple-element(param), index=1
+    ROOT tuple = ((s32[1]), (s32[2], s32[3], (s32[4]))) tuple(ta, tbcd)
+  }
+  Cond {
+    param = ((s32[1]), (s32[2], s32[3], (s32[4]))) parameter(0)
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    a = s32[1] constant({0})
+    b = s32[2] constant({0,1})
+    c = s32[3] constant({0,1,2})
+    d = s32[4] constant({0,1,2,3})
+    ta = (s32[1]) tuple(a)
+    td = (s32[4]) tuple(d)
+    tbcd = (s32[2], s32[3], (s32[4])) tuple(b, c, td)
+    init = ((s32[1]), (s32[2], s32[3], (s32[4]))) tuple(ta, tbcd)
+    ROOT while = ((s32[1]), (s32[2], s32[3], (s32[4]))) while(init),
+      condition=Cond, body=Body
+  })";
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  // DCE away the old loop so there's just one while loop in the module, making
+  // it easy to find.
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+
+  HloInstruction* new_while = FindFirstWhile(m.get());
+  Shape flat_tuple =
+      ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3], s32[4])")
+          .ValueOrDie();
+  SCOPED_TRACE(m->ToString());
+  EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), flat_tuple));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->root_instruction()->shape(), flat_tuple));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->parameter_instruction(0)->shape(), flat_tuple));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_condition()->parameter_instruction(0)->shape(),
+      flat_tuple));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      m->entry_computation()->root_instruction()->shape(),
+      ShapeUtil::ParseShapeString("((s32[1]), (s32[2], s32[3], (s32[4])))")
+          .ValueOrDie()));
+}
+
+// Edge-case: All elements of the loop carry are constants which can be removed,
+// leaving us with a nullary loop.  This is a special case, we just replace the
+// loop with its init.
+TEST_F(WhileLoopSimplifierTest, OnlyConstantsInLoopCarry) {
+  const string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = (s32[1]) parameter(0)
+    a = s32[1] constant({0})
+    ROOT tuple = (s32[1]) tuple(a)
+  }
+  Cond {
+    param = (s32[1]) parameter(0)
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    a = s32[1] constant({0})
+    init = (s32[1]) tuple(a)
+    ROOT while = (s32[1]) while(init), condition=Cond, body=Body
+  })";
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              op::Tuple(op::Constant()));
+}
+
+TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) {
+  const string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    a = s32[1] get-tuple-element(param), index=0
+    a.1 = s32[1] add(a, a)
+    b = s32[2] constant({1,1})
+    c = s32[3] constant({10,10,10})
+    ROOT tuple = (s32[1], s32[2], s32[3]) tuple(a.1, b, c)
+  }
+  Cond {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    /* Use each tuple element.  The verifier will then ensure that if any of
+     * these get modified, they're replaced with values of the correct shape. */
+    a = s32[1] get-tuple-element(param), index=0
+    b = s32[2] get-tuple-element(param), index=1
+    c = s32[3] get-tuple-element(param), index=2
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    /* Only `b` should be simplified away.  `a` is not a constant within the
+     * loop, and `c`'s value changes depending on whether we run 0 or 1
+     * iterations of the loop. */
+    a = s32[1] constant({0})
+    b = s32[2] constant({1,1})
+    c = s32[3] constant({2,2,2})
+    init = (s32[1], s32[2], s32[3]) tuple(a,b,c)
+    ROOT while = (s32[1], s32[2], s32[3]) while(init),
+      condition=Cond, body=Body
+  })";
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  // DCE away the old loop so there's just one while loop in the module, making
+  // it easy to find.
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  // Run the tuple simplifier to make the resulting HLO a bit easier to check.
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+
+  HloInstruction* new_while = FindFirstWhile(m.get());
+  Shape new_while_shape =
+      ShapeUtil::ParseShapeString("(s32[1], s32[3])").ValueOrDie();
+  EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->root_instruction()->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->parameter_instruction(0)->shape(),
+      new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_condition()->parameter_instruction(0)->shape(),
+      new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      m->entry_computation()->root_instruction()->shape(),
+      ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3])").ValueOrDie()));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              op::Tuple(_, op::Constant(), _));
+}
+
+const char* const kSimpleMergeInductionVariablesModule = R"(
+  HloModule Test
+  Body {
+    param = (TYPE[], TYPE[], TYPE[]) parameter(0)
+
+    a = TYPE[] get-tuple-element(param), index=0
+    one = TYPE[] constant(1)
+    a1 = TYPE[] add(a, one)
+
+    b = TYPE[] get-tuple-element(param), index=1
+    negone = TYPE[] constant(-1)
+    b1 = TYPE[] add(b, negone)
+
+    c = TYPE[] add(a, b)
+
+    ROOT tuple = (TYPE[], TYPE[], TYPE[]) tuple(a1,b1,c)
+  }
+  Cond {
+    param = (TYPE[], TYPE[], TYPE[]) parameter(0)
+    a = TYPE[] get-tuple-element(param), index=0
+    b = TYPE[] get-tuple-element(param), index=1
+    sum = TYPE[] power(a, b)
+    ten = TYPE[] constant(10)
+    ROOT cond = pred[] less-than(sum, ten)
+  }
+  ENTRY Loop {
+    a = TYPE[] constant(10)
+    b = TYPE[] constant(100)
+    c = TYPE[] constant(0)
+    init = (TYPE[], TYPE[], TYPE[]) tuple(a,b,c)
+    while = (TYPE[], TYPE[], TYPE[]) while(init), condition=Cond, body=Body
+
+    a1 = TYPE[] get-tuple-element(while), index=0
+    b1 = TYPE[] get-tuple-element(while), index=1
+    ROOT sum = TYPE[] add(a1, b1)
+  })";
+
+TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_Simple) {
+  string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule,
+                                          {{"TYPE", "s32"}});
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  // DCE away the old loop so there's just one while loop in the module, making
+  // it easy to find, and run the tuple simplifier to make the resulting HLO
+  // easier to check.
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+
+  HloInstruction* new_while = FindFirstWhile(m.get());
+  // We should have added a new loop counter for s32[] to the end of the tuple.
+  SCOPED_TRACE(m->ToString());
+  Shape new_while_shape =
+      ShapeUtil::ParseShapeString("(s32[], s32[], s32[], s32[])").ValueOrDie();
+  EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->root_instruction()->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->parameter_instruction(0)->shape(),
+      new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_condition()->parameter_instruction(0)->shape(),
+      new_while_shape));
+
+  EXPECT_THAT(new_while->while_body()->root_instruction(),
+              op::Tuple(op::GetTupleElement(op::Parameter(), 0),
+                        op::GetTupleElement(op::Parameter(), 1), op::Add(),
+                        op::Add(op::GetTupleElement(op::Parameter(), 3),
+                                op::Constant())));
+  EXPECT_THAT(new_while->while_condition()->root_instruction(),
+              op::Lt(op::Power(op::Add(), op::Add()), op::Constant()));
+}
+
+// We shouldn't merge S16 induction variables; we can't create constants of this
+// type because S16 literals are not implemented.
+TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_SkipS16) {
+  string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule,
+                                          {{"TYPE", "s16"}});
+  EXPECT_FALSE(
+      WhileLoopSimplifier()
+          .Run(ParseAndReturnVerifiedModule(hlo_string).ValueOrDie().get())
+          .ValueOrDie());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
index f90ac91f9d07aded8cafccf82dae894c9a149bd1..039ccda7322f5efda6a827efbeda1225c3596cc0 100644
--- a/tensorflow/compiler/xla/service/while_util.cc
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/while_util.h"
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -225,7 +227,8 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) {
 /*static*/ StatusOr<WhileUtil::LoopStateTy> WhileUtil::MakeCountedLoop(
     HloComputation* computation, int32 trip_count,
     const WhileUtil::LoopStateTy& init_values,
-    const WhileUtil::LoopBodyGeneratorTy& loop_body_generator) {
+    const WhileUtil::LoopBodyGeneratorTy& loop_body_generator,
+    const OpMetadata& metadata) {
   CHECK_GE(trip_count, 0);
 
   Shape loop_state_shape = MakeLoopStateShape(init_values);
@@ -242,6 +245,7 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) {
       computation->AddInstruction(HloInstruction::CreateWhile(
           loop_state_shape, module->AddEmbeddedComputation(std::move(cond)),
           module->AddEmbeddedComputation(std::move(body)), init_tuple));
+  while_instr->set_metadata(metadata);
 
   std::vector<HloInstruction*> result;
   for (int64 i = 0, e = init_values.size(); i < e; i++) {
@@ -268,4 +272,17 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) {
   return result;
 }
 
+/*static*/ absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>
+WhileUtil::GetGTEsMapForWhileConditional(
+    const HloComputation& while_conditional) {
+  absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>> result;
+  for (HloInstruction* user :
+       while_conditional.parameter_instruction(0)->users()) {
+    if (user->opcode() == HloOpcode::kGetTupleElement) {
+      result[user->tuple_index()].push_back(user);
+    }
+  }
+  return result;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h
index b1c4486887ae0ddbe2ba4e79f45a265689111017..cba41ccd8b184ba3d867bc170724aee71e777788 100644
--- a/tensorflow/compiler/xla/service/while_util.h
+++ b/tensorflow/compiler/xla/service/while_util.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_UTIL_H_
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
@@ -77,13 +79,21 @@ class WhileUtil {
   static StatusOr<LoopStateTy> MakeCountedLoop(
       HloComputation* computation, int32 trip_count,
       const LoopStateTy& init_values,
-      const LoopBodyGeneratorTy& loop_body_generator);
+      const LoopBodyGeneratorTy& loop_body_generator,
+      const OpMetadata& metadata);
 
   // Returns the GetTupleElement instructions in `while_body` that access
   // elements in the parameter tuple that don't change across iterations.
   // Assumes `while_body` is the body computation of the while loop in question.
   static std::vector<HloInstruction*> GetInvariantGTEsForWhileBody(
       const HloComputation& while_body);
+
+  // Returns a map of index to GetTupleElement instructions in
+  // `while_conditional` that access elements in the parameter tuple. Assumes
+  // `while_conditional` is the conditional computation of the while loop in
+  // question.
+  static absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>
+  GetGTEsMapForWhileConditional(const HloComputation& while_conditional);
 };
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
index b9ef18892d7aa859f6b0b505db4c004e4f5c5066..a546a6d39cc55d1f327b8449c7d26cd4c95dbf98 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
@@ -45,7 +45,8 @@ class ZeroSizedHloEliminationTest : public HloTestBase {
                 0, ShapeUtil::MakeShape(F32, {3, 0}), "zero sized param"))) {}
 
   StatusOr<bool> RunZeroSizedElimination() {
-    auto module = CreateNewModule("zero_sized_elimination_test_module");
+    auto module =
+        CreateNewUnverifiedModule("zero_sized_elimination_test_module");
     module->AddEntryComputation(builder_.Build());
     return ZeroSizedHloElimination{}.Run(module.get());
   }
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index 14c35e7b84f07bebac33a9753ac26a8ee1418f1e..33edbd1b20d01bf132f2a152625d5f49a45f26f9 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -47,8 +47,11 @@ class ServiceInterface {
   virtual Status ResetDevice(const ResetDeviceRequest* arg,
                              ResetDeviceResponse* result) = 0;
 
-  virtual Status ExecuteGraph(const ExecuteGraphRequest* arg,
-                              ExecuteResponse* result) = 0;
+  virtual Status Compile(const CompileRequest* arg,
+                         CompileResponse* result) = 0;
+
+  virtual Status Execute(const ExecuteRequest* arg,
+                         ExecuteResponse* result) = 0;
 
   virtual Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
                                       ExecuteParallelResponse* result) = 0;
diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..746ab9e9977b1b10cdb0cb57197027d65bd50f55
--- /dev/null
+++ b/tensorflow/compiler/xla/shape.cc
@@ -0,0 +1,107 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/shape.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+Shape::Shape(const ShapeProto& shape_proto) {
+  set_element_type(shape_proto.element_type());
+  dimensions_.reserve(shape_proto.dimensions_size());
+  for (const int64 dimension : shape_proto.dimensions()) {
+    add_dimensions(dimension);
+  }
+  tuple_shapes_.reserve(shape_proto.tuple_shapes_size());
+  for (const ShapeProto& element_shape : shape_proto.tuple_shapes()) {
+    *add_tuple_shapes() = Shape(element_shape);
+  }
+  if (shape_proto.has_layout()) {
+    *mutable_layout() = shape_proto.layout();
+  }
+}
+
+ShapeProto Shape::ToProto() const {
+  ShapeProto proto;
+  proto.set_element_type(element_type_);
+  proto.mutable_dimensions()->Reserve(dimensions_size());
+  for (const int64 dimension : dimensions()) {
+    proto.add_dimensions(dimension);
+  }
+  proto.mutable_tuple_shapes()->Reserve(tuple_shapes_size());
+  for (const Shape& shape : tuple_shapes()) {
+    *proto.add_tuple_shapes() = shape.ToProto();
+  }
+  if (has_layout()) {
+    *proto.mutable_layout() = layout();
+  }
+  return proto;
+}
+
+string Shape::ToString(bool print_layout) const {
+  if (print_layout) {
+    return ShapeUtil::HumanStringWithLayout(*this);
+  } else {
+    return ShapeUtil::HumanString(*this);
+  }
+}
+
+std::ostream& operator<<(std::ostream& out, const Shape& shape) {
+  out << shape.ToString(/*print_layout=*/true);
+  return out;
+}
+
+ProgramShape::ProgramShape(const ProgramShapeProto& program_shape_proto) {
+  for (const ShapeProto& shape_proto : program_shape_proto.parameters()) {
+    *add_parameters() = Shape(shape_proto);
+  }
+  *mutable_result() = Shape(program_shape_proto.result());
+  for (const string& name : program_shape_proto.parameter_names()) {
+    add_parameter_names(name);
+  }
+}
+
+ProgramShapeProto ProgramShape::ToProto() const {
+  ProgramShapeProto proto;
+  for (const Shape& shape : parameters()) {
+    *proto.add_parameters() = shape.ToProto();
+  }
+  *proto.mutable_result() = result().ToProto();
+  for (const string& name : parameter_names()) {
+    proto.add_parameter_names(name);
+  }
+  return proto;
+}
+
+string ProgramShape::ToString() const {
+  std::vector<string> parameter_strings(parameters_size());
+  for (int i = 0; i < parameters_size(); ++i) {
+    parameter_strings[i] = absl::StrCat(
+        i < parameter_names_size() ? parameter_names(i) : "(unknown)", ": ",
+        ShapeUtil::HumanString(parameters(i)));
+  }
+  return absl::StrCat("(", absl::StrJoin(parameter_strings, ", "), ") -> ",
+                      ShapeUtil::HumanString(result()));
+}
+
+std::ostream& operator<<(std::ostream& out, const ProgramShape& program_shape) {
+  out << program_shape.ToString() << "\n";
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f6b14ab4286c696dce64d2250a3fe8a57e4865b
--- /dev/null
+++ b/tensorflow/compiler/xla/shape.h
@@ -0,0 +1,204 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SHAPE_H_
+#define TENSORFLOW_COMPILER_XLA_SHAPE_H_
+
+#include <string>
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// A shape describes the number of dimensions in a array, the bounds of each
+// dimension, and the primitive component type. For tuples, shape describes the
+// structure (number of elements and nesting).
+class Shape {
+ public:
+  Shape() = default;
+
+  // Construct a shape from a ShapeProto.
+  explicit Shape(const ShapeProto& shape_proto);
+
+  // Returns a ShapeProto representation of the Shape.
+  ShapeProto ToProto() const;
+
+  // Returns a human-readable string that represents the given shape, with or
+  // without layout. e.g. "F32[42,12] {0, 1}" or "F32[64]".
+  string ToString(bool print_layout = false) const;
+
+  // The following methods mirror the protobuf generated code interface for the
+  // message ShapeProto. This enabled easy migration of this data structure
+  // from a proto to a proper C++ class.
+  // TODO(b/29771030): Replace or augment these methods with a more ergonomic
+  // interface.
+
+  // Methods for accessing the primitive type.
+  PrimitiveType element_type() const { return element_type_; }
+  void set_element_type(PrimitiveType value) { element_type_ = value; }
+
+  // Methods for accessing the dimensions array.
+  int dimensions_size() const { return dimensions_.size(); }
+  int64 dimensions(int index) const { return dimensions_.at(index); }
+  void set_dimensions(int index, int64 value) { dimensions_.at(index) = value; }
+  void add_dimensions(int64 value) { dimensions_.push_back(value); }
+  void clear_dimensions() { dimensions_.clear(); }
+  const std::vector<int64>& dimensions() const { return dimensions_; }
+  std::vector<int64>* mutable_dimensions() { return &dimensions_; }
+
+  // Methods for accessing the tuple subshapes. This field only non-empty for
+  // tuple shapes.
+  int tuple_shapes_size() const { return tuple_shapes_.size(); }
+  const Shape& tuple_shapes(int index) const { return tuple_shapes_.at(index); }
+  Shape* mutable_tuple_shapes(int index) { return &tuple_shapes_.at(index); }
+  Shape* add_tuple_shapes() {
+    tuple_shapes_.push_back(Shape());
+    return &tuple_shapes_.back();
+  }
+  void clear_tuple_shapes() { tuple_shapes_.clear(); }
+  const std::vector<Shape>& tuple_shapes() const { return tuple_shapes_; }
+  std::vector<Shape>* mutable_tuple_shapes() { return &tuple_shapes_; }
+
+  // Methods for accessing the layout field.
+  bool has_layout() const { return layout_.has_value(); }
+  const Layout& layout() const {
+    if (layout_.has_value()) {
+      return *layout_;
+    } else {
+      return Layout::default_instance();
+    }
+  }
+  Layout* mutable_layout() {
+    if (!layout_.has_value()) {
+      layout_ = Layout();
+    }
+    return &layout_.value();
+  }
+  void clear_layout() { layout_.reset(); }
+
+  void Swap(Shape* other) {
+    using std::swap;
+    swap(*this, *other);
+  }
+
+  void Clear() {
+    element_type_ = PRIMITIVE_TYPE_INVALID;
+    dimensions_.clear();
+    tuple_shapes_.clear();
+    layout_.reset();
+  }
+
+  string SerializeAsString() const { return ToProto().SerializeAsString(); }
+  string ShortDebugString() const { return ToProto().ShortDebugString(); }
+  string DebugString() const { return ToProto().DebugString(); }
+
+ public:
+  // The element type of this shape (tuple, array, etc).
+  PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID;
+
+  // The array bounds of the dimensions. This is nonempty only for array shapes.
+  std::vector<int64> dimensions_;
+
+  // The tuple element subshapes. This is nonempty only for tuple shapes.
+  std::vector<Shape> tuple_shapes_;
+
+  // The array layout of the shape. This is present only for array shapes.
+  absl::optional<Layout> layout_;
+};
+
+// Shape of the parameters and output of an XLA computation. This is analogous
+// to a traditional function signature.
+class ProgramShape {
+ public:
+  ProgramShape() = default;
+
+  // Creates a ProgramShape from a ProgramShapeProto protobuf.
+  explicit ProgramShape(const ProgramShapeProto& program_shape_proto);
+
+  // Returns a proto representation of the object.
+  ProgramShapeProto ToProto() const;
+
+  string ToString() const;
+
+  // The following methods mirror the protobuf generated code interface for the
+  // message ProgramShapeProto. This enabled easy migration of this data
+  // structure from a proto to a proper C++ class.
+  // TODO(b/29771030): Replace or augment these methods with a more ergonomic
+  // interface.
+
+  // Methods for accessing and manipulating the Shape of the parameters.
+  int parameters_size() const { return parameters_.size(); }
+  const Shape& parameters(int index) const { return parameters_.at(index); }
+  Shape* mutable_parameters(int index) { return &parameters_.at(index); }
+  Shape* add_parameters() {
+    parameters_.emplace_back();
+    return &parameters_.back();
+  }
+  void clear_parameters() { parameters_.clear(); }
+  const std::vector<Shape>& parameters() const { return parameters_; }
+  std::vector<Shape>* mutable_parameters() { return &parameters_; }
+
+  // Methods for accessing and manipulating the Shape of the result.
+  const Shape& result() const { return result_; }
+  Shape* mutable_result() { return &result_; }
+
+  // Methods for accessing and manipulating the names of the parameters.
+  int parameter_names_size() const { return parameter_names_.size(); }
+  const string& parameter_names(int index) const {
+    return parameter_names_.at(index);
+  }
+  void set_parameter_names(int index, const string& value) {
+    parameter_names_.at(index) = value;
+  }
+  string* mutable_parameter_names(int index) {
+    return &parameter_names_.at(index);
+  }
+  void add_parameter_names(const string& value) {
+    parameter_names_.push_back(value);
+  }
+  string* add_parameter_names() {
+    parameter_names_.push_back("");
+    return &parameter_names_.back();
+  }
+  void clear_parameter_names() { parameter_names_.clear(); }
+  const std::vector<string>& parameter_names() const {
+    return parameter_names_;
+  }
+  std::vector<string>* mutable_parameter_names() { return &parameter_names_; }
+
+  string ShortDebugString() const { return ToProto().ShortDebugString(); }
+  string DebugString() const { return ToProto().DebugString(); }
+
+ private:
+  // The shapes of the parameters of the computation represented by this object.
+  std::vector<Shape> parameters_;
+
+  // The names of the parameters of the computation represented by this object.
+  std::vector<string> parameter_names_;
+
+  // The shape of the result of the computation represented by this object.
+  Shape result_;
+};
+
+std::ostream& operator<<(std::ostream& out, const Shape& shape);
+std::ostream& operator<<(std::ostream& out, const ProgramShape& program_shape);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SHAPE_H_
diff --git a/tensorflow/compiler/xla/shape_test.cc b/tensorflow/compiler/xla/shape_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e396897eeebc2e7bdc2dc49300c8906710608b05
--- /dev/null
+++ b/tensorflow/compiler/xla/shape_test.cc
@@ -0,0 +1,149 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/shape.h"
+
+#include <numeric>
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+class ShapeTest : public ::testing::Test {
+ protected:
+  const Shape opaque_ = ShapeUtil::MakeOpaqueShape();
+  const Shape token_ = ShapeUtil::MakeTokenShape();
+  const Shape scalar_ = ShapeUtil::MakeShape(F32, {});
+  const Shape matrix_ = ShapeUtil::MakeShape(U32, {1, 2});
+  const Shape matrix2_ = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1});
+  const Shape tuple_ =
+      ShapeUtil::MakeTupleShape({opaque_, scalar_, matrix_, matrix2_});
+  const Shape nested_tuple_ =
+      ShapeUtil::MakeTupleShape({tuple_, matrix_, token_});
+};
+
+TEST_F(ShapeTest, ShapeToFromProto) {
+  for (const Shape& shape :
+       {opaque_, token_, scalar_, matrix_, matrix2_, tuple_, nested_tuple_}) {
+    Shape shape_copy(shape.ToProto());
+    EXPECT_TRUE(ShapeUtil::Equal(shape, shape_copy))
+        << shape << " != " << shape_copy;
+  }
+}
+
+TEST_F(ShapeTest, ShapeToString) {
+  EXPECT_EQ("opaque[]", opaque_.ToString());
+  EXPECT_EQ("token[]", token_.ToString());
+  EXPECT_EQ("f32[]", scalar_.ToString());
+  EXPECT_EQ("u32[1,2]", matrix_.ToString());
+  EXPECT_EQ("s32[3,4]", matrix2_.ToString());
+  EXPECT_EQ("(opaque[], f32[], u32[1,2], s32[3,4])", tuple_.ToString());
+  EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
+            nested_tuple_.ToString());
+
+  EXPECT_EQ("opaque[]", opaque_.ToString(/*print_layout=*/true));
+  EXPECT_EQ("f32[]", scalar_.ToString(/*print_layout=*/true));
+  EXPECT_EQ("u32[1,2]{1,0}", matrix_.ToString(/*print_layout=*/true));
+  EXPECT_EQ("s32[3,4]{0,1}", matrix2_.ToString(/*print_layout=*/true));
+  EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})",
+            tuple_.ToString(/*print_layout=*/true));
+  EXPECT_EQ(
+      "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, "
+      "token[])",
+      nested_tuple_.ToString(/*print_layout=*/true));
+}
+
+TEST_F(ShapeTest, ProgramShapeToFromProto) {
+  ProgramShape program_shape;
+  *program_shape.add_parameters() = ShapeUtil::MakeShape(F32, {1, 2, 3});
+  *program_shape.add_parameters() = ShapeUtil::MakeTokenShape();
+  *program_shape.add_parameters() = ShapeUtil::MakeShape(S64, {});
+  *program_shape.add_parameters() = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(S32, {}),
+       ShapeUtil::MakeTupleShape({ShapeUtil::MakeTokenShape()}),
+       ShapeUtil::MakeShape(F32, {42, 42})});
+
+  *program_shape.mutable_result() = ShapeUtil::MakeShape(F32, {7});
+
+  program_shape.add_parameter_names("foo");
+  program_shape.add_parameter_names("bar");
+  program_shape.add_parameter_names("baz");
+  program_shape.add_parameter_names("qux qux");
+
+  // Create a copy of the program shape by round-tripping through a proto.
+  ProgramShape program_shape_copy(program_shape.ToProto());
+  ASSERT_EQ(program_shape.parameters_size(),
+            program_shape_copy.parameters_size());
+  for (int i = 0; i < program_shape.parameters_size(); ++i) {
+    EXPECT_TRUE(ShapeUtil::Equal(program_shape.parameters(i),
+                                 program_shape_copy.parameters(i)));
+  }
+
+  EXPECT_TRUE(
+      ShapeUtil::Equal(program_shape.result(), program_shape_copy.result()));
+
+  ASSERT_EQ(program_shape.parameter_names_size(),
+            program_shape_copy.parameter_names_size());
+  for (int i = 0; i < program_shape.parameter_names_size(); ++i) {
+    EXPECT_EQ(program_shape.parameter_names(i),
+              program_shape_copy.parameter_names(i));
+  }
+}
+
+TEST_F(ShapeTest, ProgramShapeToString) {
+  ProgramShape prog = ShapeUtil::MakeProgramShape(
+      {opaque_, scalar_, matrix_, matrix2_, tuple_, nested_tuple_},
+      nested_tuple_);
+  EXPECT_EQ(
+      "((unknown): opaque[], "
+      "(unknown): f32[], "
+      "(unknown): u32[1,2], "
+      "(unknown): s32[3,4], "
+      "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), "
+      "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) "
+      "-> "
+      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
+      prog.ToString());
+
+  prog.add_parameter_names("arg0");
+  prog.add_parameter_names("scalar");
+  prog.add_parameter_names("matrix");
+  prog.add_parameter_names("matrix2");
+  prog.add_parameter_names("tuple");
+  prog.add_parameter_names("nested_tuple");
+  EXPECT_EQ(
+      "(arg0: opaque[], "
+      "scalar: f32[], "
+      "matrix: u32[1,2], "
+      "matrix2: s32[3,4], "
+      "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), "
+      "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], "
+      "token[])) "
+      "-> "
+      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
+      prog.ToString());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index df610102b4c7fa08c0b7030124939009130f89f4..7bf97729165bef98fabc29040e02203eee68a53c 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -667,12 +667,11 @@ void ShapeTree<T>::CopySubtreeFrom(const ShapeTree<T>& other,
 template <typename T>
 bool ShapeTree<T>::operator==(const ShapeTree<T>& other) const {
   bool equal = true;
-  ForEachElement(
-      [this, &other, &equal](const ShapeIndex& index, const T& data) {
-        if (data != other.element(index)) {
-          equal = false;
-        }
-      });
+  ForEachElement([&other, &equal](const ShapeIndex& index, const T& data) {
+    if (data != other.element(index)) {
+      equal = false;
+    }
+  });
   return equal;
 }
 
diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc
index c8ff55e7845785d9292516b823fb591cc28cbfad..2b6c484bc4f205be0180403eeac2dd391029b110 100644
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@@ -52,10 +52,10 @@ class ShapeTreeTest : public ::testing::Test {
 
 TEST_F(ShapeTreeTest, DefaultConstructor) {
   ShapeTree<int> int_tree;
-  EXPECT_TRUE(ShapeUtil::IsNil(int_tree.shape()));
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(int_tree.shape()));
 
   ShapeTree<bool> bool_tree;
-  EXPECT_TRUE(ShapeUtil::IsNil(bool_tree.shape()));
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(bool_tree.shape()));
 }
 
 void ShapeTreeTest::TestShapeConstructor(const Shape& shape,
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index d244923532d8963dcc4a7433b8d353ff5dc483f2..a4d4e1e53e727bdf7822cacaa4559fcae59d4eae 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -74,14 +74,19 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index) {
   return out;
 }
 
-namespace {
+bool ShapeIndexView::StartsWith(ShapeIndexView prefix) const {
+  return size() >= prefix.size() &&
+         indices_.subspan(0, prefix.size()) == prefix.indices_;
+}
 
-// Returns whether the given primitive type corresponds to an array shape.
-bool IsArrayPrimitiveType(PrimitiveType primitive_type) {
+/* static */ bool ShapeUtil::IsArrayPrimitiveType(
+    PrimitiveType primitive_type) {
   return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE &&
          primitive_type != OPAQUE && primitive_type != TOKEN;
 }
 
+namespace {
+
 // Recursive helper for comparing the equality of two shapes. Returns true if
 // the shapes are the same. If compare_layouts is true, then layouts must also
 // match.
@@ -116,14 +121,21 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts,
         VLOG(3) << "CompareShapes: lhs layout != rhs layout";
         return false;
       }
-      if (!absl::c_equal(lhs.layout().padded_dimensions(),
-                         rhs.layout().padded_dimensions())) {
-        VLOG(3)
-            << "CompareShapes: lhs padded_dimensions != rhs padded_dimensions";
+
+      const auto& lhs_tiles = lhs.layout().tiles();
+      const auto& rhs_tiles = rhs.layout().tiles();
+      if (lhs_tiles.size() != rhs_tiles.size()) {
         return false;
       }
-      if (lhs.layout().padding_value() != rhs.layout().padding_value()) {
-        VLOG(3) << "CompareShapes: lhs padding value != rhs padding_value";
+      for (int64 i = 0; i < lhs_tiles.size(); i++) {
+        if (!absl::c_equal(lhs_tiles[i].dimensions(),
+                           rhs_tiles[i].dimensions())) {
+          return false;
+        }
+      }
+
+      if (lhs.layout().element_size_in_bits() !=
+          rhs.layout().element_size_in_bits()) {
         return false;
       }
     }
@@ -149,7 +161,8 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
     return InvalidArgument("Unsupported element type: %s",
                            PrimitiveType_Name(element_type));
   }
-  Shape shape = ShapeUtil::MakeShape(element_type, dimensions);
+  TF_ASSIGN_OR_RETURN(Shape shape,
+                      ShapeUtil::MakeValidatedShape(element_type, dimensions));
   auto min2maj = shape.mutable_layout()->mutable_minor_to_major();
   min2maj->Clear();
   for (int64 value : minor_to_major) {
@@ -207,7 +220,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 /* static */ ProgramShape ShapeUtil::MakeProgramShape(
     std::initializer_list<Shape> parameters, Shape result) {
   ProgramShape program_shape;
-  for (const auto& shape : parameters) {
+  for (const Shape& shape : parameters) {
     *program_shape.add_parameters() = shape;
   }
   *program_shape.mutable_result() = std::move(result);
@@ -216,9 +229,14 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 
 /* static */ Shape ShapeUtil::MakeShape(PrimitiveType element_type,
                                         absl::Span<const int64> dimensions) {
+  return MakeValidatedShape(element_type, dimensions).ValueOrDie();
+}
+
+/* static */ StatusOr<Shape> ShapeUtil::MakeValidatedShape(
+    PrimitiveType element_type, absl::Span<const int64> dimensions) {
   CHECK(IsArrayPrimitiveType(element_type));
   Shape result;
-  PopulateShape(element_type, dimensions, &result);
+  TF_RETURN_IF_ERROR(PopulateShape(element_type, dimensions, &result));
   return result;
 }
 
@@ -256,22 +274,22 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return MakeShapeWithDescendingLayout(shape.element_type(), dims);
 }
 
-/* static */ void ShapeUtil::PopulateShape(PrimitiveType element_type,
-                                           absl::Span<const int64> dimensions,
-                                           Shape* shape) {
+/* static */ Status ShapeUtil::PopulateShape(PrimitiveType element_type,
+                                             absl::Span<const int64> dimensions,
+                                             Shape* shape) {
   shape->Clear();
   shape->set_element_type(element_type);
   for (int64 dimension : dimensions) {
     shape->add_dimensions(dimension);
   }
   LayoutUtil::SetToDefaultLayout(shape);
-  TF_DCHECK_OK(ValidateShape(*shape));
+  return ValidateShape(*shape);
 }
 
 /* static */ Shape ShapeUtil::MakeTupleShape(absl::Span<const Shape> shapes) {
   Shape result;
   result.set_element_type(TUPLE);
-  result.mutable_tuple_shapes()->Reserve(shapes.size());
+  result.mutable_tuple_shapes()->reserve(shapes.size());
   for (const auto& shape : shapes) {
     AppendShapeToTuple(shape, &result);
   }
@@ -371,10 +389,6 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return IsTuple(shape) && TupleElementCount(shape) == 0;
 }
 
-/* static */ bool ShapeUtil::IsNil(const Shape& shape) {
-  return IsEmptyTuple(shape);
-}
-
 /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) {
   CHECK(IsTuple(shape)) << HumanString(shape);
   return shape.tuple_shapes_size();
@@ -461,8 +475,9 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return ShapeUtil::IsArray(shape) && ElementsIn(shape) == 0;
 }
 
-/* static */ bool ShapeUtil::IsScalarF32(const Shape& shape) {
-  return shape.element_type() == F32 && Rank(shape) == 0;
+/* static */ bool ShapeUtil::IsScalarWithElementType(
+    const Shape& shape, PrimitiveType element_type) {
+  return IsScalar(shape) && shape.element_type() == element_type;
 }
 
 namespace {
@@ -569,7 +584,7 @@ namespace {
 // Parses shapes with simple recursive descent structure -- consumes from the
 // front of s and passes that view recursively as required.
 StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
-  *s = StripLeadingAsciiWhitespace(*s);
+  *s = absl::StripLeadingAsciiWhitespace(*s);
 
   if (absl::ConsumePrefix(s, "(")) {  // Tuple.
     std::vector<Shape> shapes;
@@ -582,7 +597,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
       }
       shapes.emplace_back();
       TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s));
-      *s = StripLeadingAsciiWhitespace(*s);
+      *s = absl::StripLeadingAsciiWhitespace(*s);
       must_end = !absl::ConsumePrefix(s, ",");
     }
     return ShapeUtil::MakeTupleShape(shapes);
@@ -596,7 +611,8 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
   // we convert in to the RE2-consumable type and then consume the corresponding
   // amount from our string_view type.
   static LazyRE2 shape_pattern = {
-      "^(\\w*\\d*)\\[([\\d,]*)\\](?:\\s*(dense|sparse)?\\s*{([\\d,]+)})?"};
+      "^(\\w*\\d*)\\[([\\d,\\s]*)\\](?:\\s*(dense|sparse)?\\s*{([\\d,\\s]+)})"
+      "?"};
   tensorflow::RegexpStringPiece s_consumable(s->data(), s->size());
   if (RE2::Consume(&s_consumable, *shape_pattern, &element_type_string,
                    &dimensions_string, &format_string, &layout_string)) {
@@ -641,7 +657,8 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
       result = ShapeUtil::MakeTokenShape();
     } else if (format_string.empty() && layout_string.empty()) {
       // Create a shape without a layout set.
-      result = ShapeUtil::MakeShape(primitive_type, dimensions);
+      TF_ASSIGN_OR_RETURN(
+          result, ShapeUtil::MakeValidatedShape(primitive_type, dimensions));
     } else if (format_string == "sparse") {
       TF_ASSIGN_OR_RETURN(int64 max_elements, string_to_int64(layout_string));
       result = ShapeUtil::MakeShapeWithSparseLayout(primitive_type, dimensions,
@@ -784,6 +801,9 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
     return byte_size;
   } else if (shape.element_type() == TOKEN) {
     return 0;
+  } else if (shape.element_type() == OPAQUE) {
+    CHECK_GT(pointer_size, 0);
+    return pointer_size;
   }
   LOG(FATAL) << PrimitiveType_Name(shape.element_type())
              << " primitive type has no definitive size";
@@ -806,17 +826,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
     allocated_element_count = LayoutUtil::MaxSparseElements(shape.layout());
   } else {
     CHECK(LayoutUtil::IsDenseArray(shape)) << shape.ShortDebugString();
-    absl::Span<const int64> padded_dimensions =
-        LayoutUtil::PaddedDimensions(shape);
-    if (!padded_dimensions.empty()) {
-      CHECK_EQ(Rank(shape), padded_dimensions.size());
-      allocated_element_count = 1;
-      for (int64 dimension_size : padded_dimensions) {
-        allocated_element_count *= dimension_size;
-      }
-    } else {
-      allocated_element_count = ElementsIn(shape);
-    }
+    allocated_element_count = ElementsIn(shape);
   }
   return allocated_element_count *
          ByteSizeOfPrimitiveType(shape.element_type());
@@ -892,8 +902,13 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
     return Status::OK();
   }
 
-  int64 shape_size = [&shape]() {
-    if (LayoutUtil::IsSparseArray(shape)) {
+  // We can only reason about some aspects of array's shape if it has a valid
+  // layout, these aspects will be ignored otherwise.
+  bool shape_has_valid_layout = LayoutUtil::HasLayout(shape) &&
+                                LayoutUtil::ValidateLayoutInShape(shape).ok();
+
+  int64 shape_size = [&]() {
+    if (shape_has_valid_layout && LayoutUtil::IsSparseArray(shape)) {
       int64 max_sparse_elements = LayoutUtil::MaxSparseElements(shape.layout());
       if (max_sparse_elements < 0) {
         return max_sparse_elements;
@@ -929,7 +944,9 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
       return dense_shape_size;
     }
 
-    for (int64 dim : shape.dimensions()) {
+    absl::Span<const int64> shape_max_dimensions =
+        AsInt64Slice(shape.dimensions());
+    for (int64 dim : shape_max_dimensions) {
       dense_shape_size = MultiplyWithoutOverflow(dense_shape_size, dim);
       if (dense_shape_size < 0) {
         return dense_shape_size;
@@ -951,11 +968,10 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
 
 /* static */ Status ShapeUtil::ValidateShapeWithOptionalLayout(
     const Shape& shape) {
-  if (LayoutUtil::HasLayout(shape)) {
-    // Since a layout is present, upgrade to the full set of invariant checks.
-    return ValidateShape(shape);
-  }
-  return ValidateShapeWithOptionalLayoutInternal(shape);
+  TF_RETURN_IF_ERROR(ValidateShapeWithOptionalLayoutInternal(shape));
+
+  return LayoutUtil::ValidateLayoutInShape(shape,
+                                           /*allow_missing_layouts=*/true);
 }
 
 /* static */ Status ShapeUtil::ValidateShape(const Shape& shape) {
@@ -975,7 +991,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
                                           ShapeIndexView index) {
   const Shape* subshape = &shape;
   for (auto i : index) {
-    if (!IsTuple(*subshape) || i >= subshape->tuple_shapes_size()) {
+    if (!IsTuple(*subshape) || i >= subshape->tuple_shapes_size() || i < 0) {
       return false;
     }
     subshape = &subshape->tuple_shapes(i);
@@ -1152,7 +1168,7 @@ Status ForEachMutableSubshapeHelper(
   // Let the argument `permutation` be P.  This is a permutation over `shape`'s
   // dimensions, so our return value will be a shape with dims P.I = P.  Our
   // goal is to construct a layout permutation L* that we can apply to P such
-  // that that the physical dimension ordering of the returned shape is the same
+  // that the physical dimension ordering of the returned shape is the same
   // as that of the original shape, namely L'.
   //
   // Our returned shape has dims P and layout L*, so its in-memory layout is
@@ -1171,13 +1187,6 @@ Status ForEachMutableSubshapeHelper(
              permutation, AsInt64Slice(shape.layout().minor_to_major()))) {
       new_layout->add_minor_to_major(index);
     }
-    if (shape.layout().padded_dimensions_size() > 0) {
-      new_layout->clear_padded_dimensions();
-      for (auto dim :
-           Permute(permutation, shape.layout().padded_dimensions())) {
-        new_layout->add_padded_dimensions(dim);
-      }
-    }
     // The permutation accepted by TransposeIsBitcast is the inverse of the
     // permutation here.
     CHECK(TransposeIsBitcast(shape, new_shape, InversePermutation(permutation)))
@@ -1280,11 +1289,6 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     return false;
   }
 
-  // Padding is not handled.
-  if (LayoutUtil::IsPadded(input_shape) && LayoutUtil::IsPadded(output_shape)) {
-    return false;
-  }
-
   // Check the reshape permutes the positions of each dimension in the
   // minor-to-major order. positions[i]=k means dimension `i` is k-th minor.
   //   input_positions = apply(dimension_mapping, output_positions)
@@ -1316,11 +1320,6 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     return false;
   }
 
-  // Padding is not handled.
-  if (LayoutUtil::IsPadded(input_shape) || LayoutUtil::IsPadded(output_shape)) {
-    return false;
-  }
-
   CHECK_EQ(ElementsIn(input_shape), ElementsIn(output_shape));
   if (ElementsIn(input_shape) == 0) {
     return true;
@@ -1603,14 +1602,19 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   Shape output_shape_with_layout = MakeShapeWithLayout(
       output_shape.element_type(), AsInt64Slice(output_shape.dimensions()),
       output_layout);
-  CHECK(ReshapeIsBitcast(input_shape, output_shape_with_layout));
+  CHECK(ReshapeIsBitcast(input_shape, output_shape_with_layout))
+      << "reshape is not a bitcast for input_shape: "
+      << ShapeUtil::HumanStringWithLayout(input_shape)
+      << " and output_shape_with_layout: "
+      << ShapeUtil::HumanStringWithLayout(output_shape_with_layout);
   return output_shape_with_layout;
 }
 
 /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete,
                                               Shape shape) {
   CHECK(IsArray(shape));
-  shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete);
+  shape.mutable_dimensions()->erase(shape.mutable_dimensions()->begin() +
+                                    dim_to_delete);
   if (LayoutUtil::HasLayout(shape)) {
     Layout* layout = shape.mutable_layout();
     layout->set_format(DENSE);
@@ -1644,11 +1648,6 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   return shape;
 }
 
-std::ostream& operator<<(std::ostream& out, const Shape& shape) {
-  out << ShapeUtil::HumanString(shape);
-  return out;
-}
-
 /*static*/ size_t ShapeUtil::Hash(const Shape& shape) {
   using tensorflow::hash;
   using tensorflow::Hash64Combine;
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index d8bb27beae64bb665c79c2cd7134f613495529cc..84a27f662a57ba274562e2e9be57b7e971c9b477 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -37,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -72,7 +74,7 @@ class ShapeIndex {
   void push_back(int64 value) { indices_.push_back(value); }
   void pop_back() { indices_.pop_back(); }
 
-  // push_front is O(n^2), but shapes don't usually have a ton of dimensions.
+  // push_front is O(n), but shapes don't usually have a ton of dimensions.
   void push_front(int64 value) { indices_.insert(indices_.begin(), value); }
 
   using container_type = absl::InlinedVector<int64, 2>;
@@ -100,6 +102,11 @@ class ShapeIndex {
 
   string ToString() const;
 
+  template <typename H>
+  friend H AbslHashValue(H h, const ShapeIndex& index) {
+    return H::combine(std::move(h), index.indices_);
+  }
+
  private:
   container_type indices_;
 };
@@ -147,6 +154,9 @@ class ShapeIndexView {
 
   string ToString() const;
 
+  // Returns true if this shape index starts with 'prefix'.
+  bool StartsWith(ShapeIndexView prefix) const;
+
  private:
   absl::Span<const int64> indices_;
 };
@@ -312,7 +322,10 @@ class ShapeUtil {
   static bool IsEffectiveScalar(const Shape& shape) {
     return IsArray(shape) && TrueRank(shape) == 0;
   }
-  static bool IsScalarF32(const Shape& shape);
+
+  // Returns whether "shape" is a scalar (array) with the given element_type.
+  static bool IsScalarWithElementType(const Shape& shape,
+                                      PrimitiveType element_type);
 
   // Extracts the size of the shape's dimension at dimension number
   // GetDimensionNumber(dimension_number).
@@ -362,6 +375,12 @@ class ShapeUtil {
   static Shape MakeShape(PrimitiveType element_type,
                          absl::Span<const int64> dimensions);
 
+  // Constructs a new shape with the given element type and sequence of
+  // dimensions. Method checks if the element type is valid and the shape's
+  // size fits in std::numeric_limits<int64>::max().
+  static StatusOr<Shape> MakeValidatedShape(PrimitiveType element_type,
+                                            absl::Span<const int64> dimensions);
+
   // Creates a Shape with element type corresponding to T and the given
   // dimensions
   template <typename T>
@@ -393,8 +412,8 @@ class ShapeUtil {
       const Shape& shape);
 
   // As MakeShape, but the object to write to is passed in.
-  static void PopulateShape(PrimitiveType element_type,
-                            absl::Span<const int64> dimensions, Shape* shape);
+  static Status PopulateShape(PrimitiveType element_type,
+                              absl::Span<const int64> dimensions, Shape* shape);
 
   // Validates that the provided shape satisfies invariants.
   static Status ValidateShape(const Shape& shape);
@@ -449,6 +468,9 @@ class ShapeUtil {
   // arrays.
   static bool IsArray(const Shape& shape);
 
+  // Returns whether the given primitive type corresponds to an array shape.
+  static bool IsArrayPrimitiveType(PrimitiveType primitive_type);
+
   // Returns whether the shape is a tuple with at least one element which is
   // also a tuple.
   static bool IsNestedTuple(const Shape& shape);
@@ -456,9 +478,6 @@ class ShapeUtil {
   // Returns true if shape is an empty tuple.
   static bool IsEmptyTuple(const Shape& shape);
 
-  // Returns true if shape is the nil shape (an empty tuple).
-  static bool IsNil(const Shape& shape);
-
   // Returns the number of elements in the given tuple shape.
   // Precondition: IsTuple(shape)
   static int64 TupleElementCount(const Shape& shape);
@@ -742,10 +761,18 @@ class ShapeUtil {
       pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads);
     }
 
+    tensorflow::mutex mu;
+    Status status;  // Guarded by mu
+
     while (n < rank) {
       if (pool != absl::nullopt) {
-        pool->Schedule(
-            [indexes, &visitor_function] { visitor_function(indexes); });
+        pool->Schedule([indexes, &visitor_function, &mu, &status] {
+          StatusOr<bool> result = visitor_function(indexes);
+          if (!result.ok()) {
+            tensorflow::mutex_lock lock(mu);
+            status = status.ok() ? result.status() : status;
+          }
+        });
       } else {
         TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
         if (!should_continue) {
@@ -763,14 +790,14 @@ class ShapeUtil {
       }
     }
 
-    return Status::OK();
+    // Waits for the scheduled work to complete.
+    pool.reset();
+    return status;
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil);
 };
 
-std::ostream& operator<<(std::ostream& out, const Shape& shape);
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SHAPE_UTIL_H_
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index c622ecdca1fd66604d1a6ceaf705f2e70edaee55..60bdbe302045e6f3b4bae500c50bc68fb217525d 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -345,26 +345,6 @@ TEST(ShapeUtilTest, OpaqueVsArray) {
   EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(shape2, shape1));
 }
 
-TEST(ShapeUtilTest, CompareShapesWithPaddedDimensionsMismatch) {
-  Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30});
-  shape1.mutable_layout()->add_padded_dimensions(10);
-
-  Shape shape2 = ShapeUtil::MakeShape(F32, {20, 30});
-  shape2.mutable_layout()->add_padded_dimensions(11);
-
-  EXPECT_FALSE(ShapeUtil::Equal(shape1, shape2));
-}
-
-TEST(ShapeUtilTest, CompareShapesWithPaddingValueMismatch) {
-  Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30});
-  shape1.mutable_layout()->set_padding_value(ZERO_PAD);
-
-  Shape shape2 = ShapeUtil::MakeShape(F32, {20, 30});
-  shape2.mutable_layout()->set_padding_value(LOWEST_PAD);
-
-  EXPECT_FALSE(ShapeUtil::Equal(shape1, shape2));
-}
-
 TEST(ShapeUtilTest, ScalarDefaultLayoutEqualsScalarEmptyMin2Maj) {
   Shape scalar_default_layout = ShapeUtil::MakeShape(F32, {});
   ASSERT_TRUE(scalar_default_layout.has_layout())
@@ -395,23 +375,13 @@ TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) {
   EXPECT_EQ(0, ShapeUtil::ByteSizeOf(ShapeUtil::MakeTokenShape()));
 }
 
-TEST(ShapeUtilTest, ByteSizeOfWithPadding) {
-  EXPECT_EQ(4, ShapeUtil::ByteSizeOfPrimitiveType(F32));
-  Shape shape = ShapeUtil::MakeShape(F32, {10, 20});
-  EXPECT_EQ(800, ShapeUtil::ByteSizeOf(shape));
-
-  shape.mutable_layout()->add_padded_dimensions(15);
-  shape.mutable_layout()->add_padded_dimensions(21);
-  EXPECT_EQ(15 * 21 * 4, ShapeUtil::ByteSizeOf(shape));
-}
-
 TEST(ShapeUtilTest, NilShape) {
-  EXPECT_TRUE(ShapeUtil::IsNil(ShapeUtil::MakeNil()));
-  EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {1, 2, 3})));
-  EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {0, 1})));
-  EXPECT_FALSE(ShapeUtil::IsNil(
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeNil()));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {1, 2, 3})));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {0, 1})));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {})})));
-  EXPECT_FALSE(ShapeUtil::IsNil(
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {0})})));
 }
 
@@ -576,68 +546,6 @@ TEST(ShapeUtilTest, IsLeafIndex) {
   EXPECT_TRUE(ShapeUtil::IsLeafIndex(nested_tuple_shape, {1, 1}));
 }
 
-TEST(ShapeUtilTest, HumanString) {
-  Shape opaque = ShapeUtil::MakeOpaqueShape();
-  Shape token = ShapeUtil::MakeTokenShape();
-  Shape scalar = ShapeUtil::MakeShape(F32, {});
-  Shape matrix = ShapeUtil::MakeShape(U32, {1, 2});
-  Shape matrix2 = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1});
-  Shape tuple = ShapeUtil::MakeTupleShape({opaque, scalar, matrix, matrix2});
-  Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix, token});
-
-  EXPECT_EQ("opaque[]", ShapeUtil::HumanString(opaque));
-  EXPECT_EQ("token[]", ShapeUtil::HumanString(token));
-  EXPECT_EQ("f32[]", ShapeUtil::HumanString(scalar));
-  EXPECT_EQ("u32[1,2]", ShapeUtil::HumanString(matrix));
-  EXPECT_EQ("s32[3,4]", ShapeUtil::HumanString(matrix2));
-  EXPECT_EQ("(opaque[], f32[], u32[1,2], s32[3,4])",
-            ShapeUtil::HumanString(tuple));
-  EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
-            ShapeUtil::HumanString(nested_tuple));
-
-  EXPECT_EQ("opaque[]", ShapeUtil::HumanStringWithLayout(opaque));
-  EXPECT_EQ("f32[]", ShapeUtil::HumanStringWithLayout(scalar));
-  EXPECT_EQ("u32[1,2]{1,0}", ShapeUtil::HumanStringWithLayout(matrix));
-  EXPECT_EQ("s32[3,4]{0,1}", ShapeUtil::HumanStringWithLayout(matrix2));
-  EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})",
-            ShapeUtil::HumanStringWithLayout(tuple));
-  EXPECT_EQ(
-      "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, "
-      "token[])",
-      ShapeUtil::HumanStringWithLayout(nested_tuple));
-
-  ProgramShape prog = ShapeUtil::MakeProgramShape(
-      {opaque, scalar, matrix, matrix2, tuple, nested_tuple}, nested_tuple);
-  EXPECT_EQ(
-      "((unknown): opaque[], "
-      "(unknown): f32[], "
-      "(unknown): u32[1,2], "
-      "(unknown): s32[3,4], "
-      "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), "
-      "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) "
-      "-> "
-      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
-      ShapeUtil::HumanString(prog));
-
-  prog.add_parameter_names("arg0");
-  prog.add_parameter_names("scalar");
-  prog.add_parameter_names("matrix");
-  prog.add_parameter_names("matrix2");
-  prog.add_parameter_names("tuple");
-  prog.add_parameter_names("nested_tuple");
-  EXPECT_EQ(
-      "(arg0: opaque[], "
-      "scalar: f32[], "
-      "matrix: u32[1,2], "
-      "matrix2: s32[3,4], "
-      "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), "
-      "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], "
-      "token[])) "
-      "-> "
-      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
-      ShapeUtil::HumanString(prog));
-}
-
 TEST(ShapeUtilTest, ForEachSubshapeArray) {
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   int calls = 0;
diff --git a/tensorflow/compiler/xla/sparse_index_array.cc b/tensorflow/compiler/xla/sparse_index_array.cc
index 1c135dda864b3060b8bdc6369f18268d7c5c7f9e..a40bb7875e7ea53a8959a9a67ec09ec260ba9c37 100644
--- a/tensorflow/compiler/xla/sparse_index_array.cc
+++ b/tensorflow/compiler/xla/sparse_index_array.cc
@@ -29,7 +29,7 @@ SparseIndexArray::SparseIndexArray(int64 max_indices, int64 rank,
   CHECK_GT(rank_, 0);
   CHECK_EQ(indices_.size() % rank_, 0)
       << "indices_.size(): " << indices_.size() << ", rank_: " << rank_;
-  CHECK_LT(index_count(), max_indices_);
+  CHECK_LE(index_count(), max_indices_);
 }
 
 SparseIndexArray::SparseIndexArray(int64 max_indices, int64 rank,
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 8a0ae330420531b833ed670118e6b6b1056bd358..5a7a4faa7e89b27fb537f20d94c21cb4a76e000d 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -44,7 +44,7 @@ cc_library(
     testonly = True,
     srcs = ["xla_internal_test_main.cc"],
     deps = [
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/strings",
@@ -79,6 +79,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -117,12 +118,12 @@ cc_library(
     deps = [
         ":literal_test_util",
         ":test_utils",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:computation_layout",
         "//tensorflow/compiler/xla/service:hlo",
@@ -135,50 +136,13 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
 )
 
-cc_library(
-    name = "hlo_verified_test_base",
-    testonly = True,
-    srcs = ["hlo_verified_test_base.cc"],
-    hdrs = ["hlo_verified_test_base.h"],
-    deps = [
-        ":hlo_test_base",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/service:hlo_verifier",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/memory",
-    ],
-)
-
-tf_cc_test(
-    name = "hlo_verified_test_base_test",
-    srcs = ["hlo_verified_test_base_test.cc"],
-    deps = [
-        ":hlo_test_base",
-        ":hlo_verified_test_base",
-        ":test_macros_cpu",
-        ":test_utils",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_parser",
-        "//tensorflow/compiler/xla/service:hlo_verifier",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-    ],
-)
-
 tf_cc_binary(
     name = "local_client_aot_test_helper",
     srcs = ["local_client_aot_test_helper.cc"],
@@ -335,6 +299,52 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "conv_depthwise_test",
+    timeout = "long",
+    srcs = ["conv_depthwise_test.cc"],
+    shard_count = 50,
+    deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/service:bfloat16_normalization",
+        "//tensorflow/compiler/xla/service:despecializer",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+xla_test(
+    name = "grouped_convolution_test",
+    timeout = "long",
+    srcs = ["grouped_convolution_test.cc"],
+    blacklisted_backends = [
+        # disabled because of a break b/119590850.
+        "gpu",
+        # disabled because it times out.
+        "cpu",
+    ],
+    shard_count = 50,
+    deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/service:bfloat16_normalization",
+        "//tensorflow/compiler/xla/service:despecializer",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 xla_test(
     name = "check_execution_arity_test",
     srcs = ["check_execution_arity_test.cc"],
@@ -476,7 +486,9 @@ xla_test(
     name = "params_test",
     srcs = ["params_test.cc"],
     shard_count = 30,
-    tags = ["optonly"],
+    tags = [
+        "optonly",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -658,6 +670,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -682,6 +695,7 @@ xla_test(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base",
     ],
 )
 
@@ -705,6 +719,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -863,7 +878,8 @@ xla_test(
     name = "convolution_test",
     timeout = "long",
     srcs = ["convolution_test.cc"],
-    shard_count = 25,
+    shard_count = 40,
+    tags = ["optonly"],
     deps = CONVOLUTION_TEST_DEPS + [
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1172,6 +1188,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -1296,6 +1313,7 @@ xla_test(
         "enable_for_xla_interpreter",
     ],
     deps = [
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1614,6 +1632,7 @@ xla_test(
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base",
     ],
 )
 
@@ -1858,6 +1877,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -1894,6 +1914,7 @@ xla_test(
 xla_test(
     name = "multioutput_fusion_test",
     srcs = ["multioutput_fusion_test.cc"],
+    backends = ["gpu"],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -2150,6 +2171,7 @@ xla_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index c257566fb218d4769aec0c793efb9256b023b7ea..915b456b52215f8d6a9eb6c5b933f3502f1d3d2c 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
@@ -35,7 +36,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -139,7 +139,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteZeroElementF32s) {
 }
 
 // A non-canonical quiet NaN value.
-static const float kNonCanonicalNaN = tensorflow::bit_cast<float>(0x7FD01234);
+static const float kNonCanonicalNaN = absl::bit_cast<float>(0x7FD01234);
 
 XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteScalarF32) {
   XlaBuilder builder(TestName());
@@ -329,13 +329,13 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
   Literal b_literal = LiteralUtil::CreateR1<float>({b_values});
   std::unique_ptr<GlobalData> b_data =
       client_->TransferToServer(b_literal).ConsumeValueOrDie();
-  auto b_constant = Parameter(&builder, 1, a_literal.shape(), "b_param");
-  auto b_param = ConstantR1<float>(&builder, b_values);
+  auto b_param = Parameter(&builder, 1, a_literal.shape(), "b_param");
+  auto b_constant = ConstantR1<float>(&builder, b_values);
 
-  auto sum1 = Add(a_constant, b_constant);
-  auto sum2 = Add(a_constant, b_param);
-  auto sum3 = Add(a_param, b_constant);
-  auto sum4 = Add(a_param, b_param);
+  auto sum1 = Add(a_constant, b_param);
+  auto sum2 = Add(a_constant, b_constant);
+  auto sum3 = Add(a_param, b_param);
+  auto sum4 = Add(a_param, b_constant);
 
   auto sum = Add(sum1, sum2);
   sum = Add(sum, sum3);
@@ -350,6 +350,44 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
                              error_spec_);
 }
 
+// TODO(b/119692968): This test runs OOM on the GPU and CPU backend.
+XLA_TEST_F(ArrayElementwiseOpTest,
+           DISABLED_ON_GPU(DISABLED_ON_CPU(DeeplyNestedAddWithSlices))) {
+  XlaBuilder builder(TestName());
+  std::vector<float> values(30, 0.0);
+  auto a_literal = LiteralUtil::CreateR1<float>(values);
+  auto a = Parameter(&builder, 0, a_literal.shape(), "x");
+  auto b_literal = LiteralUtil::CreateR1<float>(values);
+  auto b = Parameter(&builder, 1, b_literal.shape(), "x");
+
+  // Construct a sequence of diamond-shaped gadgets like this:
+  //
+  //      add
+  //    /    \
+  //  slice  slice
+  //     \   /
+  //      add
+  //
+  // Each 'left' slice removes the last element, each 'right' slice removes the
+  // first element. In this way, we index into the add with different
+  // multi-dimensional index arrays, which defeats the caching we use to avoid
+  // exponential compile time.
+  std::function<XlaOp(int64)> generate_recursive =
+      [&](int64 slice_size) -> XlaOp {
+    if (slice_size == values.size()) {
+      return Add(a, b);
+    }
+    XlaOp param = generate_recursive(slice_size + 1);
+    auto slice1 = Slice(param, {0}, {slice_size}, {1});
+    auto slice2 = Slice(param, {1}, {slice_size + 1}, {1});
+    return Add(slice1, slice2);
+  };
+  generate_recursive(1);
+  auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie();
+  auto b_data = client_->TransferToServer(b_literal).ConsumeValueOrDie();
+  ComputeAndCompareR1<float>(&builder, {0.0}, {a_data.get(), b_data.get()});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) {
   XlaBuilder builder(TestName());
   auto a = ConstantR1<float>(&builder, {-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
@@ -2478,8 +2516,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ne) {
   Ne(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,2] {
-  { 00 },
-  { 01 }
+  { 0, 0 },
+  { 0, 1 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2492,8 +2530,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ge) {
   Ge(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
-  { 1100 },
-  { 0001 }
+  { 1, 1, 0, 0 },
+  { 0, 0, 0, 1 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2506,8 +2544,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Gt) {
   Gt(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
-  { 0100 },
-  { 0000 }
+  { 0, 1, 0, 0 },
+  { 0, 0, 0, 0 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2520,8 +2558,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Le) {
   Le(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
-  { 1011 },
-  { 1111 }
+  { 1, 0, 1, 1 },
+  { 1, 1, 1, 1 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2534,8 +2572,8 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Lt) {
   Lt(v, m, /*broadcast_dimensions=*/{1});
 
   const string expected = R"(pred[2,4] {
-  { 0011 },
-  { 1110 }
+  { 0, 0, 1, 1 },
+  { 1, 1, 1, 0 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2744,12 +2782,16 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
   Array3D<int> expected_3d(
       {{{0, 1}, {0, 0}, {0, 0}}, {{0, 1}, {1, 0}, {0, 1}}});
   const string expected = R"(pred[2,3,2] {
-{ { 01 },
-  { 00 },
-  { 00 } },
-{ { 01 },
-  { 10 },
-  { 01 } }
+{
+  { 0, 1 },
+  { 0, 0 },
+  { 0, 0 }
+},
+{
+  { 0, 1 },
+  { 1, 0 },
+  { 0, 1 }
+}
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index dde19fb65d65064c9452a6ac49c70e20cf113336..702fb32adfc8a0ded26845c92245776a79777c34 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -161,8 +161,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) {
 
 XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsUsual) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR1<float>(&b, {1, 2}),
-                 ShapeUtil::MakeShape(F32, {2, 2}), {1});
+  BroadcastInDim(ConstantR1<float>(&b, {1, 2}), {2, 2}, {1});
 
   Array2D<float> expected(2, 2);
   expected(0, 0) = 1;
@@ -175,8 +174,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsUsual) {
 
 XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsTranspose) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR1<float>(&b, {1, 2}),
-                 ShapeUtil::MakeShape(F32, {2, 2}), {0});
+  BroadcastInDim(ConstantR1<float>(&b, {1, 2}), {2, 2}, {0});
 
   Array2D<float> expected(2, 2);
   expected(0, 0) = 1;
@@ -189,8 +187,8 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsTranspose) {
 
 XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDims) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR2<float>(&b, {{1.0, 5.0}, {2.0, 6.0}}),
-                 ShapeUtil::MakeShape(F32, {2, 2, 2}), {0, 1});
+  BroadcastInDim(ConstantR2<float>(&b, {{1.0, 5.0}, {2.0, 6.0}}), {2, 2, 2},
+                 {0, 1});
 
   Array3D<float> expected(2, 2, 2);
   expected(0, 0, 0) = 1.0;
@@ -207,8 +205,8 @@ XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDims) {
 
 XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDimsNotPossibleWithBroadCast) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR2<float>(&b, {{1.0, 5.0}, {2.0, 6.0}}),
-                 ShapeUtil::MakeShape(F32, {2, 2, 2}), {0, 2});
+  BroadcastInDim(ConstantR2<float>(&b, {{1.0, 5.0}, {2.0, 6.0}}), {2, 2, 2},
+                 {0, 2});
 
   Array3D<float> expected(2, 2, 2);
   expected(0, 0, 0) = 1.0;
@@ -225,8 +223,7 @@ XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDimsNotPossibleWithBroadCast) {
 
 XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsNotPossibleWithBroadCast) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR1<float>(&b, {1, 2}),
-                 ShapeUtil::MakeShape(F32, {3, 2}), {1});
+  BroadcastInDim(ConstantR1<float>(&b, {1, 2}), {3, 2}, {1});
 
   Array2D<float> expected(3, 2);
   expected(0, 0) = 1;
diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc
index 9966e4606ef7f104487182e0240e64e4c9e4d834..9930bfc95c297093584d427397cac042c296050f 100644
--- a/tensorflow/compiler/xla/tests/broadcast_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_test.cc
@@ -42,7 +42,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) {
       ShapeUtil::MakeShape(F32, {}), input, {}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -58,7 +58,7 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) {
       ShapeUtil::MakeShape(F32, {2, 2}), input, {}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -81,7 +81,7 @@ XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) {
   builder.AddInstruction(HloInstruction::CreateTuple({element1, element2}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -102,7 +102,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) {
       ShapeUtil::MakeShape(F32, {2, 2}), input, {0, 1}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -121,7 +121,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) {
       ShapeUtil::MakeShape(F32, {2, 2}), input, {1, 0}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -138,7 +138,7 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) {
       ShapeUtil::MakeShape(F32, {2, 3, 2}), input, {0, 2}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -158,7 +158,7 @@ TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) {
       ShapeUtil::MakeShape(F32, {2, 2, 3, 3}), input, {1}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -183,7 +183,7 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) {
       ShapeUtil::MakeShape(F32, {3, 3, 3, r1_size}), input, {3}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -214,7 +214,7 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) {
       ShapeUtil::MakeShape(F32, {32, 64, 7, 7}), input, {1}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -230,7 +230,7 @@ TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) {
       ShapeUtil::MakeShape(F32, {64, 64, 3, 3}), input, {}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   LOG(INFO) << hlo_module->ToString();
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
@@ -253,7 +253,7 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) {
       ShapeUtil::MakeShape(F32, {3, 3, 2, 2}), input, {2, 3}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
@@ -287,7 +287,7 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) {
       ShapeUtil::MakeShape(F32, {2, 3, 4, 5}), input, {0, 1, 2}));
 
   // Create HLO module, compile, and execute.
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
   auto result = ExecuteAndTransfer(std::move(hlo_module), {});
 
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index fbdf0fcb6543f09dedefef55cfe0f8a5d9067d5a..12c029983336cc9aed0fde4ce6881c9a00a9869e 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -107,7 +107,7 @@ StatusOr<Literal> ClientLibraryTestBase::ExecuteAndTransfer(
   ExecutionOptions execution_options = execution_options_;
   if (shape_with_output_layout != nullptr) {
     *execution_options.mutable_shape_with_output_layout() =
-        *shape_with_output_layout;
+        shape_with_output_layout->ToProto();
   }
   return client_->ExecuteAndTransfer(computation, arguments,
                                      &execution_options);
@@ -127,7 +127,7 @@ StatusOr<Literal> ClientLibraryTestBase::ExecuteAndTransferReference(
   ExecutionOptions execution_options = execution_options_;
   if (shape_with_output_layout != nullptr) {
     *execution_options.mutable_shape_with_output_layout() =
-        *shape_with_output_layout;
+        shape_with_output_layout->ToProto();
   }
   execution_options.clear_device_handles();
   return ref_client_->ExecuteAndTransfer(computation, arguments,
@@ -262,6 +262,28 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
   return choose(0);
 }
 
+StatusOr<Literal> ClientLibraryTestBase::ComputeAndTransfer(
+    XlaBuilder* builder, absl::Span<GlobalData* const> arguments_passed_in,
+    const Shape* shape_with_layout) {
+  std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
+                                     arguments_passed_in.end());
+
+  // Transfer and use elements of arguments_, if the AddParam() API was used.
+  std::vector<std::unique_ptr<GlobalData>> owning_arguments;
+  if (!arguments_.empty()) {
+    CHECK(arguments.empty());
+    for (const auto& argument : arguments_) {
+      owning_arguments.push_back(
+          client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument))
+              .ValueOrDie());
+      arguments.push_back(owning_arguments.back().get());
+    }
+  }
+
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+  return ExecuteAndTransfer(computation, arguments, shape_with_layout);
+}
+
 Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     XlaBuilder* builder, const Literal& expected,
     absl::Span<GlobalData* const> arguments_passed_in,
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 9d32f4f5174a57a53a9d3e6477b46fa4de852f7f..65a23dd883594b9bf9c37494a37e9be39b197788 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -76,7 +76,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   void SetFastMathDisabled(bool disabled) {
     auto* opts = execution_options_.mutable_debug_options();
     opts->set_xla_cpu_enable_fast_math(!disabled);
-    opts->set_xla_gpu_enable_fast_math(!disabled);
+    opts->set_xla_gpu_enable_fast_min_max(!disabled);
   }
 
   void SetSeed(uint64 seed) { execution_options_.set_seed(seed); }
@@ -188,6 +188,13 @@ class ClientLibraryTestBase : public ::testing::Test {
                                 ErrorSpec error,
                                 const Shape* shape_with_layout = nullptr);
 
+  // Build and run the computation and return the result as a literal.
+  // shape_with_layout indicates the result layout to request when calling
+  // Execute.
+  StatusOr<Literal> ComputeAndTransfer(
+      XlaBuilder* builder, absl::Span<GlobalData* const> arguments,
+      const Shape* shape_with_layout = nullptr);
+
   // ComputeAndCompare variant which returns an error status.
   Status ComputeAndCompareLiteralWithStatus(
       XlaBuilder* builder, const Literal& expected,
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 6f2ca84bb646e88af221ab80b727911ff7d990eb..363dee74b2755a6bdc3c5a5164a85378581c21d2 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -50,7 +50,8 @@ XLA_TEST_F(ClientTest, ExecuteWithLayout) {
       ExecutionOptions execution_options = execution_options_;
       *execution_options.mutable_shape_with_output_layout() =
           ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
-                                         execute_layout);
+                                         execute_layout)
+              .ToProto();
       TF_ASSERT_OK_AND_ASSIGN(
           std::unique_ptr<GlobalData> data,
           client_->Execute(computation, {}, &execution_options));
@@ -84,7 +85,8 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
           {ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
                                           /*minor_to_major=*/{0, 1}),
            ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
-                                          /*minor_to_major=*/{1, 0})});
+                                          /*minor_to_major=*/{1, 0})})
+          .ToProto();
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto result,
diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc
index 022641394f113ef28e7c53058385d77572822213..fbebe0408730f2fb37aa57a0f19291bbaa3826f9 100644
--- a/tensorflow/compiler/xla/tests/codegen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc
@@ -32,11 +32,10 @@ StatusOr<std::unique_ptr<AotCompilationResult>>
 CodegenTestBase::CompileToAotCompilationResult(
     std::unique_ptr<HloModule> hlo_module,
     const AotCompilationOptions& options) {
-  std::vector<std::unique_ptr<HloModule>> hlo_modules;
-  hlo_modules.push_back(std::move(hlo_module));
+  auto module_group = absl::make_unique<HloModuleGroup>(std::move(hlo_module));
   TF_ASSIGN_OR_RETURN(
       std::vector<std::unique_ptr<AotCompilationResult>> results,
-      backend().compiler()->CompileAheadOfTime(std::move(hlo_modules),
+      backend().compiler()->CompileAheadOfTime(std::move(module_group),
                                                options));
   return std::move(results.front());
 }
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index 9811a015e91d866d6f4de6ebb6dac536ed6c7e06..4f5b525a34252db9e967a55af0d1bf39a2dd830e 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -492,6 +492,32 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
   ComputeAndCompareR3<float>(&builder, expected, {p0.get(), p1.get()});
 }
 
+XLA_TEST_F(ConcatTest, ConcatDeeplyNested) {
+  XlaBuilder builder(TestName());
+  auto a_literal = LiteralUtil::CreateR1<float>({256.0});
+  auto a = Parameter(&builder, 0, a_literal.shape(), "x");
+  auto b = ConcatInDim(&builder, {a, a}, 0);
+  auto c = ConcatInDim(&builder, {b, b}, 0);
+  auto d = ConcatInDim(&builder, {c, c}, 0);
+  auto e = ConcatInDim(&builder, {d, d}, 0);
+  auto f = ConcatInDim(&builder, {e, e}, 0);
+  auto g = ConcatInDim(&builder, {f, f}, 0);
+  auto h = ConcatInDim(&builder, {g, g}, 0);
+  auto i = ConcatInDim(&builder, {h, h}, 0);
+  auto j = ConcatInDim(&builder, {i, i}, 0);
+  auto k = ConcatInDim(&builder, {j, j}, 0);
+  auto l = ConcatInDim(&builder, {k, k}, 0);
+  auto m = ConcatInDim(&builder, {l, l}, 0);
+  auto n = ConcatInDim(&builder, {m, m}, 0);
+  auto o = ConcatInDim(&builder, {n, n}, 0);
+  auto p = ConcatInDim(&builder, {o, o}, 0);
+  auto q = ConcatInDim(&builder, {p, p}, 0);
+  ConcatInDim(&builder, {q, q}, 0);
+  std::vector<float> expected(131072, 256.0);
+  auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie();
+  ComputeAndCompareR1<float>(&builder, expected, {a_data.get()});
+}
+
 // Describes a binary rank-2 concatenation test.
 struct R2BinarySpec {
   int64 lhs_dim0;
diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..627a17a0ca114085240dbaf28211bb3511cf0cab
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc
@@ -0,0 +1,234 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/despecializer.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+string GetFloatDataType(bool use_bfloat16) {
+  return use_bfloat16 ? "bf16" : "f32";
+}
+
+struct DepthwiseConvolution2DSpec {
+  int64 output_feature, window, stride, pad, lhs_dilate;
+  std::vector<int64> activation_dims;
+  std::vector<int64> activation_layout;
+  std::vector<int64> kernel_dims;
+  std::vector<int64> kernel_layout;
+  std::vector<int64> output_dims;
+  std::vector<int64> output_layout;
+};
+
+class DepthwiseConvolution2DTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<DepthwiseConvolution2DSpec, bool>> {};
+
+static std::vector<DepthwiseConvolution2DSpec> GetConv2DTestCases() {
+  std::vector<DepthwiseConvolution2DSpec> config_set;
+  std::vector<std::vector<int64>> config_options = {
+      {128, 6, 3, 64},  {256, 5, 3, 256}, {256, 5, 2, 144}, {144, 5, 3, 64},
+      {144, 5, 2, 256}, {8, 48, 17, 8},   {128, 20, 6, 64}, {64, 14, 12, 172},
+      {16, 9, 4, 16},   {128, 1, 2, 144}, {256, 1, 2, 64}};
+
+  for (auto option : config_options) {
+    int64 feature = option[0];
+    int64 activation_size = option[1];
+    int64 kernel_size = option[2];
+    int64 batch = option[3];
+
+    std::vector<int64> kernel_layout = {3, 2, 1, 0};
+    DepthwiseConvolution2DSpec config;
+    config.output_feature = feature;
+    config.window = kernel_size;
+
+    config.activation_dims = {batch, activation_size, activation_size, feature};
+    config.activation_layout = {3, 0, 2, 1};
+
+    config.kernel_dims = {kernel_size, kernel_size, 1, feature};
+    config.kernel_layout = {3, 2, 1, 0};
+
+    if (activation_size == 1 && kernel_size == 2) {
+      // Test for outer dim.
+      config.output_dims = {batch, activation_size + kernel_size - 1,
+                            activation_size + kernel_size, feature};
+    } else if (feature == 256) {
+      // Restrict dilation-based tests only to one feature configuration.
+      config.stride = activation_size - 1;
+      config.pad = 0;
+      config.lhs_dilate = feature / 32;
+      config.output_dims = {batch, feature / 32,
+                            activation_size - kernel_size + 1, feature};
+    } else {
+      config.stride = config.pad = config.lhs_dilate = -1;
+      config.output_dims = {batch, activation_size - kernel_size + 1,
+                            activation_size - kernel_size + 1, feature};
+    }
+
+    // Try this layout for all kernel shapes.
+    config.output_layout = {3, 0, 2, 1};
+    config_set.push_back(config);
+
+    // Try other layouts only for certain kernel shapes.
+    if (kernel_size % 2 == 0) {
+      config.activation_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.output_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.activation_layout = {3, 0, 2, 1};
+      config_set.push_back(config);
+    }
+  }
+
+  return config_set;
+}
+
+string DepthwiseConvolution2DTestDataToString(
+    const ::testing::TestParamInfo<
+        ::testing::tuple<DepthwiseConvolution2DSpec, bool>>& data) {
+  const auto& spec = ::testing::get<0>(data.param);
+  const string data_type = GetFloatDataType(::testing::get<1>(data.param));
+  string str = absl::StrCat(
+      "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
+      "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"),
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_",
+      absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), "_output_layout_",
+      absl::StrJoin(spec.output_layout, "_"), data_type);
+  // -1 indicates non-existence.
+  if (spec.stride != -1) {
+    absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1");
+  }
+
+  // Test names are not allowed to contain the '-' character.
+  absl::c_replace(str, '-', 'n');
+  return str;
+}
+
+string BuildHloTextDepthwiseConvolution2D(
+    const DepthwiseConvolution2DSpec& spec, bool use_bfloat16) {
+  const string data_type = GetFloatDataType(use_bfloat16);
+  if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) {
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d  pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f,
+          feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.window, spec.window, spec.window, spec.output_feature);
+
+  } else if (spec.stride == -1) {
+    return absl::StrFormat(
+        R"(
+      HloModule TensorFlowDepthwiseConv
+
+      ENTRY main {
+        activation = %s[%s]{%s} parameter(0)
+        kernel = %s[%s]{%s} parameter(1)
+        ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+            window={size=%dx%d}, dim_labels=b01f_01io->b01f,
+            feature_group_count=%d
+      }
+      )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.output_feature);
+  } else {
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, 
+          dim_labels=b01f_01io->b01f, feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.stride, 0, 0, spec.lhs_dilate, spec.output_feature);
+  }
+}
+
+XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) {
+  const DepthwiseConvolution2DSpec& spec = ::testing::get<0>(GetParam());
+  bool use_bfloat16 = ::testing::get<1>(GetParam());
+  const string hlo_text =
+      BuildHloTextDepthwiseConvolution2D(spec, use_bfloat16);
+
+  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01},
+                            [](HloModule* module) -> Status {
+                              BFloat16MixedPrecisionRemoval remover;
+                              TF_RETURN_IF_ERROR(remover.Run(module).status());
+                              Despecializer despecializer;
+                              return despecializer.Run(module).status();
+                            }));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DepthwiseConvolution2DTestWithRandomIndices, DepthwiseConvolution2DTest,
+    ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()),
+                       ::testing::Bool()),
+    DepthwiseConvolution2DTestDataToString);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index 5f063e67847487f1d18bf4ee80b1634ebdf4183a..20bf3c317986c30c12dca7dca14dbf80c70b42f6 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/base/casts.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
@@ -429,11 +429,9 @@ TEST_F(ConvertTest, ConvertReshape) {
 
 std::vector<float> GetInterestingF16ConversionTestCases() {
   float infinity = std::numeric_limits<float>::infinity();
-  float half_min_positive_normal =
-      tensorflow::bit_cast<float, uint32>(0x38800000);
-  float half_max_subnormal = tensorflow::bit_cast<float, uint32>(0x387fc000);
-  float half_min_positive_subnormal =
-      tensorflow::bit_cast<float, uint32>(0x33800000);
+  float half_min_positive_normal = absl::bit_cast<float, uint32>(0x38800000);
+  float half_max_subnormal = absl::bit_cast<float, uint32>(0x387fc000);
+  float half_min_positive_subnormal = absl::bit_cast<float, uint32>(0x33800000);
   float half_max = 65504.0f;
 
   std::vector<float> test_cases(
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 070b092d18930027e215cb43ff917e36cac99f12..4a58a1ed66c438d1dd9561f4eb029b38d8c6cbdd 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -91,7 +91,14 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest {
     XlaBuilder builder(TestName());
     auto lhs = ConstantR4FromArray4D<T>(&builder, *alhs);
     auto rhs = ConstantR4FromArray4D<T>(&builder, *arhs);
-    Conv(lhs, rhs, {1, 1}, Padding::kValid);
+    PrecisionConfig precision;
+    // The left hand side of the convolution is numbers between 0 and 2304 which
+    // requires at least 11 mantissa bits and the DEFAULT precision config is
+    // allowed to round to bfloat16 which only has 7 mantissa bits.
+    precision.add_operand_precision(PrecisionConfig::HIGHEST);
+    precision.add_operand_precision(PrecisionConfig::DEFAULT);
+    Conv(lhs, rhs, {1, 1}, Padding::kValid, /*feature_group_count=*/1,
+         &precision);
 
     ComputeAndCompare(&builder, {}, error_spec_);
   }
@@ -590,7 +597,692 @@ TYPED_TEST(Convolve2D_1x3x3x5_3x3x1x15_Depthwise_Valid, Types) {
 }
 
 template <typename T>
-class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
+class Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 5};
+    std::vector<int64> filter_dims = {3, 3, 1, 5};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/5);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(6864),  static_cast<T>(7296),  static_cast<T>(7746),
+         static_cast<T>(8214),  static_cast<T>(8700),  static_cast<T>(7809),
+         static_cast<T>(8286),  static_cast<T>(8781),  static_cast<T>(9294),
+         static_cast<T>(9825),  static_cast<T>(10644), static_cast<T>(11256),
+         static_cast<T>(11886), static_cast<T>(12534), static_cast<T>(13200),
+         static_cast<T>(11589), static_cast<T>(12246), static_cast<T>(12921),
+         static_cast<T>(13614), static_cast<T>(14325)});
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 5}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+
+    auto filter_r = filter_r1.Reshape(filter_dims);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 512}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 512}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(
+    Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes,
+    TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {256, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048 * 256, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 =
+        expected_r1.Reshape({256, 2, 2, 512}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {256, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048 * 256, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 =
+        expected_r1.Reshape({256, 2, 2, 512}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 5};
+    std::vector<int64> filter_dims = {3, 3, 1, 5};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/5);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(6864),  static_cast<T>(7296),  static_cast<T>(7746),
+         static_cast<T>(8214),  static_cast<T>(8700),  static_cast<T>(7809),
+         static_cast<T>(8286),  static_cast<T>(8781),  static_cast<T>(9294),
+         static_cast<T>(9825),  static_cast<T>(10644), static_cast<T>(11256),
+         static_cast<T>(11886), static_cast<T>(12534), static_cast<T>(13200),
+         static_cast<T>(11589), static_cast<T>(12246), static_cast<T>(12921),
+         static_cast<T>(13614), static_cast<T>(14325)});
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 5}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(
+    Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes,
+    TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 160};
+    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/160);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(640, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 160};
+    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/160);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(640, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({3, 0, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 160};
+    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/160);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(640, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 1024};
+    std::vector<int64> filter_dims = {3, 3, 1, 1024};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/1024);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(4096, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 1024}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid : public ConvolutionTest {
  public:
   void RunTest() {
     XlaBuilder builder(TestName());
@@ -618,7 +1310,200 @@ class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
       dnums.set_kernel_output_feature_dimension(3);
 
       ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
-                                /*feature_group_count=*/3);
+                                /*feature_group_count=*/3);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(5076), static_cast<T>(5160), static_cast<T>(5244),
+         static_cast<T>(5328), static_cast<T>(6164), static_cast<T>(6264),
+         static_cast<T>(6364), static_cast<T>(6464), static_cast<T>(7380),
+         static_cast<T>(7496), static_cast<T>(7612), static_cast<T>(7728)});
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 12}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x1024_2x2x128x512_Grouped_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 2, 2, 1024};
+    std::vector<int64> filter_dims = {2, 2, 128, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/8);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(512, static_cast<T>(1024));
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 512}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x1024_2x2x128x512_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x1024_2x2x128x512_Grouped_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x1024_2x2x128x8_Grouped_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 2, 2, 1024};
+    std::vector<int64> filter_dims = {2, 2, 128, 8};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/8);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(8, static_cast<T>(1024));
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 8}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x1024_2x2x128x8_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x1024_2x2x128x8_Grouped_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 2, 2, 12};
+    std::vector<int64> filter_dims = {2, 2, 3, 4};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/4);
     }
 
     std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
@@ -631,12 +1516,140 @@ class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
     auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
     auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
 
+    auto expected_r1 =
+        LiteralUtil::CreateR1<T>({static_cast<T>(7712), static_cast<T>(8816),
+                                  static_cast<T>(9992), static_cast<T>(11240)});
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 4}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid_Filter_OF_In_Sublanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 2, 2, 12};
+    std::vector<int64> filter_dims = {2, 2, 4, 3};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(3);
+      dnums.set_kernel_output_feature_dimension(2);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/4);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+    auto filter_r4_relaid =
+        filter_r4.Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
     auto expected_r1 = LiteralUtil::CreateR1<T>(
-        {static_cast<T>(5076), static_cast<T>(5160), static_cast<T>(5244),
-         static_cast<T>(5328), static_cast<T>(6164), static_cast<T>(6264),
-         static_cast<T>(6364), static_cast<T>(6464), static_cast<T>(7380),
-         static_cast<T>(7496), static_cast<T>(7612), static_cast<T>(7728)});
-    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 12}).ConsumeValueOrDie();
+        {static_cast<T>(6968), static_cast<T>(8516), static_cast<T>(10280),
+         static_cast<T>(12260)});
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 4}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4_relaid).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid_Filter_OF_In_Sublanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid_Filter_OF_In_Sublanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x1x1x12_1x1x3x4_Grouped_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 1, 1, 12};
+    std::vector<int64> filter_dims = {1, 1, 3, 4};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/4);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 =
+        LiteralUtil::CreateR1<T>({static_cast<T>(38), static_cast<T>(98),
+                                  static_cast<T>(176), static_cast<T>(272)});
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 4}).ConsumeValueOrDie();
 
     auto input_literal =
         client_->TransferToServer(input_r4).ConsumeValueOrDie();
@@ -649,8 +1662,8 @@ class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
   }
 };
 
-TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, TestTypes);
-TYPED_TEST(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, Types) {
+TYPED_TEST_CASE(Convolve2D_1x1x1x12_1x1x3x4_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x1x1x12_1x1x3x4_Grouped_Valid, Types) {
   this->RunTest();
 }
 
@@ -876,7 +1889,7 @@ XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
 // (We run this test on all platforms, because, what the heck.)
 XLA_TEST_F(ConvolutionTest, NoCudnnAlgorithmPicker) {
   execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
-      "cudnn-convolution-algorithm-picker");
+      "cudnn-conv-algorithm-picker");
 
   XlaBuilder builder(TestName());
   Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
@@ -944,6 +1957,18 @@ ENTRY Test {
   EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
 }
 
+XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF32ForwardReversed)) {
+  constexpr char kHlo[] = R"(
+HloModule TestModule
+
+ENTRY Test {
+  %arg0 = f32[3,56,56,16] parameter(0)
+  %arg1 = f32[3,3,3,32] parameter(1)
+  ROOT %conv = f32[54,54,16,32] convolution(%arg0, %arg1), window={size=3x3 rhs_reversal=1x1}, dim_labels=f01b_i01o->01bf
+})";
+  EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
+}
+
 XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardFilter)) {
   constexpr char kHlo[] = R"(
 HloModule TestModule
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 1407e68d9a336b6bb1c960711015430f872aa912..3622f2c1e84639baed13059b21b20609d1347da6 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -45,7 +45,7 @@ class CopyOpTest : public HloTestBase {
     builder.AddInstruction(HloInstruction::CreateUnary(
         constant->shape(), HloOpcode::kCopy, constant));
     auto computation = builder.Build();
-    auto module = CreateNewModule();
+    auto module = CreateNewUnverifiedModule();
     module->AddEntryComputation(std::move(computation));
 
     Literal result = ExecuteAndTransfer(std::move(module), {});
@@ -98,7 +98,7 @@ XLA_TEST_F(CopyOpTest, CopyParameterScalar) {
 
   auto computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
 
   Literal result = ExecuteAndTransfer(std::move(module), {&literal});
@@ -119,7 +119,7 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2Twice) {
 
   auto computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
   Literal result = ExecuteAndTransfer(std::move(module), {});
   LiteralTestUtil::ExpectR2Near<float>({{1.0, 2.0}, {3.0, 4.0}}, result,
@@ -143,7 +143,7 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2DifferentLayouts) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
   Literal result = ExecuteAndTransfer(std::move(module), {});
 
@@ -175,7 +175,7 @@ void CopyOpTest::TestCopyConstantLayout021(size_t n1, size_t n2, size_t n3) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
   ForceResultLayout(module.get(), LayoutUtil::MakeLayout({1, 2, 0}));
   Literal result = ExecuteAndTransfer(std::move(module), {});
@@ -209,7 +209,7 @@ void CopyOpTest::TestCopyConstantLayoutR4(size_t n1, size_t n2, size_t n3,
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   module->AddEntryComputation(std::move(computation));
   ForceResultLayout(module.get(), LayoutUtil::MakeLayout(permutation));
   Literal result = ExecuteAndTransfer(std::move(module), {});
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index a693fa35954bcb2d95074c94d0aa3eabc1d5fd62..738b6442354b01364278e3e3c713aa2cdb5cf47d 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -70,7 +70,7 @@ class CustomCallTest : public HloTestBase {
 };
 
 XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto constant = builder.AddInstruction(
@@ -85,7 +85,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) {
 }
 
 XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
-  auto module = CreateNewModule();
+  auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   Array2D<float> array(2, 2);
@@ -105,9 +105,8 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
   LiteralTestUtil::ExpectR0Near<float>(10.0f, result, error_spec_);
 }
 
-XLA_TEST_F(CustomCallTest,
-           DISABLED_ON_GPU(CustomCall_UsedInOtherComputations)) {
-  auto module = CreateNewModule();
+XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) {
+  auto module = CreateNewUnverifiedModule();
   auto b = HloComputation::Builder(TestName());
 
   auto input = b.AddInstruction(
@@ -130,6 +129,53 @@ XLA_TEST_F(CustomCallTest,
       Array3D<float>{{{2, 3}, {4, 5}}, {{3, 4}, {5, 6}}}, result);
 }
 
+XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(InputAndOutputLayoutDiffer)) {
+  auto module = CreateNewUnverifiedModule();
+  auto b = HloComputation::Builder(TestName());
+
+  auto input =
+      b.AddInstruction(HloInstruction::CreateParameter(0, r2f32_, "p"));
+  b.AddInstruction(
+      HloInstruction::CreateCustomCall(r2f32_, {input}, "Add1ToValues"));
+
+  module->AddEntryComputation(b.Build());
+  ForceParameterLayout(module.get(), 0, LayoutUtil::MakeLayout({1, 0}));
+  ForceResultLayout(module.get(), LayoutUtil::MakeLayout({0, 1}));
+
+  Literal argument = LiteralUtil::CreateR2<float>({{1.f, 2.f}, {3.f, 4.f}});
+
+  // Note, the expected result is transposed! This is because the input and
+  // output layouts of the custom call differ and the called function just
+  // blindly adds one to each element.
+  Literal result = ExecuteAndTransfer(std::move(module), {&argument});
+  LiteralTestUtil::ExpectR2Equal<float>({{2.f, 4.f}, {3.f, 5.f}}, result);
+}
+
+XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(LayoutConstrained)) {
+  // The argument and result of the computation are set to different layouts,
+  // but the custom call is layout constrained to a fixed operand and result
+  // layout, so the correct result should be produced.
+  auto module = CreateNewUnverifiedModule();
+  auto b = HloComputation::Builder(TestName());
+
+  auto input =
+      b.AddInstruction(HloInstruction::CreateParameter(0, r2f32_, "p"));
+
+  const Shape& r2f32_dim0_major =
+      ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0});
+  b.AddInstruction(HloInstruction::CreateCustomCall(
+      r2f32_dim0_major, {input}, "Add1ToValues", {r2f32_dim0_major}));
+
+  module->AddEntryComputation(b.Build());
+  ForceParameterLayout(module.get(), 0, LayoutUtil::MakeLayout({1, 0}));
+  ForceResultLayout(module.get(), LayoutUtil::MakeLayout({0, 1}));
+
+  Literal argument = LiteralUtil::CreateR2<float>({{1.f, 2.f}, {3.f, 4.f}});
+
+  Literal result = ExecuteAndTransfer(std::move(module), {&argument});
+  LiteralTestUtil::ExpectR2Equal<float>({{2.f, 3.f}, {4.f, 5.f}}, result);
+}
+
 class CustomCallClientAPITest : public ClientLibraryTestBase {};
 
 // When using the client API, CustomCall targets can't begin with '$' -- these
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 6c0847a875798870b4362a99ac2ab65d99f9f3e6..c5d8b663f4abe77e05ec213d2e4e075c260a8655 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace xla {
 namespace {
@@ -637,6 +636,76 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMul) {
       {x_data.get(), y_data.get()}, this->error_spec_);
 }
 
+#ifndef XLA_TEST_BACKEND_CPU
+// TODO(b/74459949): failed on CPU on 2018-10-29.
+XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulR3LhsR2Rhs) {
+  using T = TypeParam;
+
+  XlaBuilder builder(this->TestName());
+  auto x =
+      Parameter(&builder, 0, ShapeUtil::MakeShapeWithType<T>({2, 2, 2}), "x");
+  auto y = Parameter(&builder, 1, ShapeUtil::MakeShapeWithType<T>({2, 2}), "y");
+
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_rhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(0);
+  dnums.add_rhs_batch_dimensions(0);
+
+  DotGeneral(x, y, dnums);
+
+  auto x_data =
+      this->client_
+          ->TransferToServer(LiteralUtil::CreateR3FromArray3D<T>(
+              {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}))
+          .ConsumeValueOrDie();
+
+  auto y_data = this->client_
+                    ->TransferToServer(LiteralUtil::CreateR2FromArray2D<T>(
+                        {{1.0f, 0.0f}, {0.0f, 1.0f}}))
+                    .ConsumeValueOrDie();
+
+  this->template ComputeAndCompareR2<T>(
+      &builder,
+      /*expected=*/{{1.0f, 2.0f}, {7.0f, 8.0f}}, {x_data.get(), y_data.get()},
+      this->error_spec_);
+}
+
+// TODO(b/74459949): failed on CPU on 2018-10-29.
+XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulR2LhsR3Rhs) {
+  using T = TypeParam;
+
+  XlaBuilder builder(this->TestName());
+  auto x = Parameter(&builder, 0, ShapeUtil::MakeShapeWithType<T>({2, 2}), "x");
+  auto y =
+      Parameter(&builder, 1, ShapeUtil::MakeShapeWithType<T>({2, 2, 2}), "y");
+
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_rhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(0);
+  dnums.add_rhs_batch_dimensions(0);
+
+  DotGeneral(x, y, dnums);
+
+  auto x_data = this->client_
+                    ->TransferToServer(LiteralUtil::CreateR2FromArray2D<T>(
+                        {{1.0f, 0.0f}, {0.0f, 1.0f}}))
+                    .ConsumeValueOrDie();
+
+  auto y_data =
+      this->client_
+          ->TransferToServer(LiteralUtil::CreateR3FromArray3D<T>(
+              {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}))
+          .ConsumeValueOrDie();
+
+  this->template ComputeAndCompareR2<T>(
+      &builder,
+      /*expected=*/{{1.0f, 2.0f}, {7.0f, 8.0f}}, {x_data.get(), y_data.get()},
+      this->error_spec_);
+}
+#endif  // XLA_TEST_BACKEND_CPU
+
 XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulMultipleBatch) {
   using T = TypeParam;
 
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
index 51b50d456e496c9c01c38fb8539bb3737de16937..c84973e17b234c24c84f02a369ce0185f5772cca 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/base/casts.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/core/lib/core/casts.h"
 
 namespace xla {
 namespace {
@@ -47,7 +47,7 @@ class ExhaustiveF32ElementwiseOpTest
         // input to 0 under the assumption that the op is at least correct on 0.
         input_literal.Set({i - begin}, 0.0f);
       } else {
-        input_literal.Set({i - begin}, tensorflow::bit_cast<float, int>(i));
+        input_literal.Set({i - begin}, absl::bit_cast<float, int>(i));
       }
     }
 
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index 4d4b676a538947c8dd92a7e34db72e45766cae2c..d1fddf9d6b494a822610e41307fa103dc90bdef3 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -81,7 +81,7 @@ class FusionTest : public HloTestBase {
     }
 
     auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewUnverifiedModule();
 
     auto prim_type = primitive_util::NativeToPrimitiveType<T>();
 
@@ -183,7 +183,7 @@ XLA_TEST_F(FusionTest, Test) {
   //                     (-{{1.0, 1.0, 1.0}, {0.0, 0.0, 0.0}}),
   //              {{0.5, 0.5, 0.5}, {0.5, 0.5, 0.5}})) = {{0.5}, {2.72}}
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<float>({{1.0}, {2.0}, {3.0}})));
   auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -231,7 +231,7 @@ XLA_TEST_F(FusionTest, Parameter) {
   // Build a computation and fuse part of it so the fusion instruction has an
   // operand parameter.
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<float>({{1.0, 2.0, 3.0}})));
   auto copy1 = builder.AddInstruction(HloInstruction::CreateUnary(
@@ -266,7 +266,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) {
       ShapeUtil::MakeShapeWithLayout(F32, {rand_dim0_size, dim1_size}, {1, 0});
   // Build simple fusion computation: y = x^2 (elementwise).
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
 
   auto two = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(2.0)));
@@ -290,7 +290,7 @@ XLA_TEST_F(FusionTest, RandomizedParallelPartition) {
 
 XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const_vector = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
   auto const_array = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -314,7 +314,7 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) {
 
 XLA_TEST_F(FusionTest, ReshapeToScalar) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto single_element_array = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR2<int32>({{5}})));
   auto reshape = builder.AddInstruction(HloInstruction::CreateReshape(
@@ -329,7 +329,7 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) {
 
 XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
@@ -344,7 +344,7 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) {
 
 XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR3<int32>({{{1, 2, 3}, {4, 5, 6}}})));
   auto reshape1 = builder.AddInstruction(
@@ -359,7 +359,7 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) {
 
 XLA_TEST_F(FusionTest, Reshape_1by1by1_) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR3<int32>({{{7}}})));
   auto reshape1 = builder.AddInstruction(
@@ -374,7 +374,7 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) {
 
 XLA_TEST_F(FusionTest, Reshape__1by1by1) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(7)));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateReshape(
@@ -389,7 +389,7 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) {
 
 XLA_TEST_F(FusionTest, Reshape__) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(7)));
   auto reshape1 = builder.AddInstruction(
@@ -404,7 +404,7 @@ XLA_TEST_F(FusionTest, Reshape__) {
 
 XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
   auto reshape1 = builder.AddInstruction(
@@ -419,7 +419,7 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) {
 
 XLA_TEST_F(FusionTest, Transpose_2by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -434,7 +434,7 @@ XLA_TEST_F(FusionTest, Transpose_2by3) {
 
 XLA_TEST_F(FusionTest, Transpose_3by3) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}})));
   auto reshape1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -449,7 +449,7 @@ XLA_TEST_F(FusionTest, Transpose_3by3) {
 
 XLA_TEST_F(FusionTest, Reverse) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1, 2, 3})));
   auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
@@ -465,7 +465,7 @@ XLA_TEST_F(FusionTest, Reverse) {
 
 XLA_TEST_F(FusionTest, ReverseNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1, 2, 3})));
   auto reverse1 = builder.AddInstruction(HloInstruction::CreateReverse(
@@ -483,7 +483,7 @@ XLA_TEST_F(FusionTest, ReverseNegate) {
 
 XLA_TEST_F(FusionTest, BroadcastNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
   auto broadcast1 = builder.AddInstruction(HloInstruction::CreateBroadcast(
@@ -501,7 +501,7 @@ XLA_TEST_F(FusionTest, BroadcastNegate) {
 
 XLA_TEST_F(FusionTest, SliceNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto slice1 = builder.AddInstruction(HloInstruction::CreateSlice(
@@ -519,7 +519,7 @@ XLA_TEST_F(FusionTest, SliceNegate) {
 
 XLA_TEST_F(FusionTest, DynamicSliceNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto const1 = builder.AddInstruction(
@@ -541,7 +541,7 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) {
 
 XLA_TEST_F(FusionTest, ReshapeNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto reshape1 = builder.AddInstruction(
@@ -559,7 +559,7 @@ XLA_TEST_F(FusionTest, ReshapeNegate) {
 
 XLA_TEST_F(FusionTest, TransposeNegate) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}})));
   auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -587,7 +587,7 @@ std::unique_ptr<HloComputation> MakeReduceTestComputation() {
 }
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -607,7 +607,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) {
 }
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
@@ -630,7 +630,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) {
 
 XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
   auto builder = HloComputation::Builder(TestName());
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<int32>({{2, 3, 5}, {7, 11, 13}, {17, 19, 23}})));
   auto const1 = builder.AddInstruction(
@@ -682,7 +682,7 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) {
 // into a fusion, it should remain shared, rather than being duplicated
 // within the fusion.
 XLA_TEST_F(FusionTest, SharedConstant) {
-  auto hlo_module = CreateNewModule();
+  auto hlo_module = CreateNewUnverifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   auto const0 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/tests/grouped_convolution_test.cc b/tensorflow/compiler/xla/tests/grouped_convolution_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f7049910e70c4e591636a47c1b6ba72cf2c234f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/grouped_convolution_test.cc
@@ -0,0 +1,245 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/despecializer.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+string GetFloatDataType(bool use_bfloat16) {
+  return use_bfloat16 ? "bf16" : "f32";
+}
+
+struct GroupedConvolution2DSpec {
+  int64 input_feature, output_feature, window, stride, pad, lhs_dilate;
+  int64 group_size, group_count;
+  std::vector<int64> activation_dims;
+  std::vector<int64> activation_layout;
+  std::vector<int64> kernel_dims;
+  std::vector<int64> kernel_layout;
+  std::vector<int64> output_dims;
+  std::vector<int64> output_layout;
+};
+
+class GroupedConvolution2DTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<GroupedConvolution2DSpec, bool>> {};
+
+static std::vector<GroupedConvolution2DSpec> GetConv2DTestCases() {
+  std::vector<GroupedConvolution2DSpec> config_set;
+  // Add to this set if you want a new test configuration.
+  // Rule : the penultimate number must be divisible by the last number.
+  std::vector<std::vector<int64>> config_options = {{8, 2, 2, 1, 1024, 128},
+                                                    {512, 3, 3, 144, 1024, 16},
+                                                    {256, 3, 3, 129, 512, 64},
+                                                    {64, 1, 2, 127, 32, 8},
+                                                    {256, 3, 3, 256, 1024, 4}};
+
+  for (auto option : config_options) {
+    int64 output_feature = option[0];
+    int64 activation_size = option[1];
+    int64 kernel_size = option[2];
+    int64 batch = option[3];
+    int64 input_feature = option[4];
+    int64 group_size = option[5];
+
+    std::vector<int64> kernel_layout = {3, 2, 1, 0};
+    GroupedConvolution2DSpec config;
+    config.group_size = group_size;
+    config.group_count = input_feature / group_size;
+    config.output_feature = output_feature;
+    config.window = kernel_size;
+
+    config.activation_dims = {batch, activation_size, activation_size,
+                              input_feature};
+    config.activation_layout = {3, 0, 2, 1};
+
+    config.kernel_dims = {kernel_size, kernel_size, group_size, output_feature};
+    config.kernel_layout = {3, 2, 1, 0};
+
+    if (activation_size == 1 && kernel_size == 2) {
+      // Test for outer dim.
+      config.output_dims = {batch, activation_size + kernel_size - 1,
+                            activation_size + kernel_size, output_feature};
+    } else if (output_feature == 256) {
+      // Restrict dilation-based tests only to one feature configuration.
+      config.stride = activation_size - 1;
+      config.pad = 0;
+      config.lhs_dilate = output_feature / 32;
+      config.output_dims = {batch, output_feature / 32,
+                            activation_size - kernel_size + 1, output_feature};
+    } else {
+      config.stride = config.pad = config.lhs_dilate = -1;
+      config.output_dims = {batch, activation_size - kernel_size + 1,
+                            activation_size - kernel_size + 1, output_feature};
+    }
+
+    // Try this layout for all kernel shapes.
+    config.output_layout = {3, 0, 2, 1};
+    config_set.push_back(config);
+
+    // Try other layouts only for certain kernel shapes.
+    if (kernel_size % 2 == 0) {
+      config.activation_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.output_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.activation_layout = {3, 0, 2, 1};
+      config_set.push_back(config);
+    }
+  }
+
+  return config_set;
+}
+
+string GroupedConvolution2DTestDataToString(
+    const ::testing::TestParamInfo<
+        ::testing::tuple<GroupedConvolution2DSpec, bool>>& data) {
+  const auto& spec = ::testing::get<0>(data.param);
+  const string data_type = GetFloatDataType(::testing::get<1>(data.param));
+  string str = absl::StrCat(
+      "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
+      "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"),
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_",
+      absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), "_output_layout_",
+      absl::StrJoin(spec.output_layout, "_"), data_type);
+  // -1 indicates non-existence.
+  if (spec.stride != -1) {
+    absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1");
+  }
+
+  // Test names are not allowed to contain the '-' character.
+  absl::c_replace(str, '-', 'n');
+  return str;
+}
+
+string BuildHloTextGroupedConvolution2D(const GroupedConvolution2DSpec& spec,
+                                        bool use_bfloat16) {
+  const string data_type = GetFloatDataType(use_bfloat16);
+  if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) {
+    // Check for outer dim.
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d  pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f,
+          feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.window, spec.window, spec.window, spec.group_count);
+
+  } else if (spec.stride == -1) {
+    // Check for basic, non-dilated cases.
+    return absl::StrFormat(
+        R"(
+      HloModule TensorFlowDepthwiseConv
+
+      ENTRY main {
+        activation = %s[%s]{%s} parameter(0)
+        kernel = %s[%s]{%s} parameter(1)
+        ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+            window={size=%dx%d}, dim_labels=b01f_01io->b01f,
+            feature_group_count=%d
+      }
+      )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.group_count);
+  } else {
+    // Check for base dilations.
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, 
+          dim_labels=b01f_01io->b01f, feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.stride, 0, 0, spec.lhs_dilate, spec.group_count);
+  }
+}
+
+XLA_TEST_P(GroupedConvolution2DTest, DoIt) {
+  const GroupedConvolution2DSpec& spec = ::testing::get<0>(GetParam());
+  bool use_bfloat16 = ::testing::get<1>(GetParam());
+  const string hlo_text = BuildHloTextGroupedConvolution2D(spec, use_bfloat16);
+
+  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01},
+                            [](HloModule* module) -> Status {
+                              BFloat16MixedPrecisionRemoval remover;
+                              TF_RETURN_IF_ERROR(remover.Run(module).status());
+                              Despecializer despecializer;
+                              return despecializer.Run(module).status();
+                            }));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    GroupedConvolution2DTestWithRandomIndices, GroupedConvolution2DTest,
+    ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()),
+                       ::testing::Bool()),
+    GroupedConvolution2DTestDataToString);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index bdd4fd7e3d0f585d81e94a3326e6d24bb5c42f39..989a7c705a8254f99e5cc0e97dfde5942f146964 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -23,8 +23,8 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
@@ -85,26 +85,74 @@ ProgramShape GetProgramShapeWithLayout(const HloModule& module) {
 
 }  // namespace
 
+Status VerifiedHloModule::Verify() {
+  if (computation_count() == 0) {
+    // The computation was never built. Nothing to verify.
+    return Status::OK();
+  }
+  return verifier_.Run(this).status();
+}
+
+void VerifiedHloModule::VerifyOrAddFailure(const string& message) {
+  Status status = Verify();
+  if (!status.ok()) {
+    ADD_FAILURE() << "HloVerifier failed on module " << name()
+                  << (message.empty() ? "" : absl::StrCat(" (", message, ")"))
+                  << ": " << status;
+    LOG(ERROR) << "Contents of bad module:";
+    XLA_LOG_LINES(tensorflow::ERROR, ToString());
+  }
+}
+
 HloTestBase::HloTestBase(bool verifier_layout_sensitive,
-                         bool allow_mixed_precision_in_hlo_verifier)
+                         bool allow_mixed_precision_in_hlo_verifier,
+                         std::function<bool(const HloInstruction*)>
+                             instruction_can_change_layout_func)
     : HloTestBase(GetTestPlatform(), GetReferencePlatform(),
                   verifier_layout_sensitive,
-                  allow_mixed_precision_in_hlo_verifier) {}
+                  allow_mixed_precision_in_hlo_verifier,
+                  instruction_can_change_layout_func) {}
 
 HloTestBase::HloTestBase(se::Platform* test_platform,
                          se::Platform* reference_platform,
                          bool verifier_layout_sensitive,
-                         bool allow_mixed_precision_in_hlo_verifier)
-    : test_runner_(test_platform), reference_runner_(reference_platform) {
+                         bool allow_mixed_precision_in_hlo_verifier,
+                         std::function<bool(const HloInstruction*)>
+                             instruction_can_change_layout_func)
+    : test_runner_(test_platform),
+      reference_runner_(reference_platform),
+      verifier_layout_sensitive_(verifier_layout_sensitive),
+      allow_mixed_precision_in_hlo_verifier_(
+          allow_mixed_precision_in_hlo_verifier) {
   hlo_verifier_ = absl::make_unique<HloVerifier>(
       /*layout_sensitive=*/verifier_layout_sensitive,
-      /*allow_mixed_precision=*/allow_mixed_precision_in_hlo_verifier);
+      /*allow_mixed_precision=*/allow_mixed_precision_in_hlo_verifier,
+      instruction_can_change_layout_func);
 }
 
-std::unique_ptr<HloModule> HloTestBase::CreateNewModule(const string& name) {
+std::unique_ptr<HloModule> HloTestBase::CreateNewUnverifiedModule(
+    const string& name) {
   return absl::make_unique<HloModule>(name, GetModuleConfigForTest());
 }
 
+std::unique_ptr<VerifiedHloModule> HloTestBase::CreateNewVerifiedModule(
+    const string& name) {
+  return absl::make_unique<VerifiedHloModule>(
+      name, GetModuleConfigForTest(), verifier_layout_sensitive_,
+      allow_mixed_precision_in_hlo_verifier_);
+}
+
+StatusOr<std::unique_ptr<VerifiedHloModule>>
+HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text,
+                                          const HloModuleConfig& config) {
+  auto module = absl::make_unique<VerifiedHloModule>(
+      TestName(), config, verifier_layout_sensitive_,
+      allow_mixed_precision_in_hlo_verifier_);
+  TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get()));
+  TF_RETURN_IF_ERROR(module->Verify());
+  return std::move(module);
+}
+
 /* static */
 StatusOr<bool> HloTestBase::RunHloPass(HloPassInterface* hlo_pass,
                                        HloModule* module) {
@@ -129,7 +177,7 @@ PrecisionConfig HloTestBase::DefaultPrecisionConfig(int operands) {
 }
 
 DebugOptions HloTestBase::GetDebugOptionsForTest() {
-  auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
+  auto debug_options = GetDebugOptionsFromFlags();
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
   debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 0ae4bdc104d656946d45008adec9ea3960984545..1d1e7f437296a7493ef7da07039fcf6d273f35bc 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/base/macros.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/backend.h"
@@ -38,6 +39,31 @@ limitations under the License.
 
 namespace xla {
 
+// An HLO module derived class which verifies itself on destruction. This class
+// is intended to be used in unit tests. Any verification errors are raised via
+// ADD_FAILURE.
+class VerifiedHloModule : public HloModule {
+ public:
+  VerifiedHloModule(const string& name, const HloModuleConfig& config,
+                    bool verifier_layout_sensitive,
+                    bool allow_mixed_precision_in_hlo_verifier)
+      : HloModule(name, config),
+        verifier_(verifier_layout_sensitive,
+                  allow_mixed_precision_in_hlo_verifier) {}
+
+  ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); }
+
+  // Verifies the module using HloVerifier and returns the status.
+  Status Verify();
+
+  // Verifies the module and flags any error with ADD_FAILURE. 'message' is
+  // included in the failure message.
+  void VerifyOrAddFailure(const string& message);
+
+ private:
+  HloVerifier verifier_;
+};
+
 // A base class for tests which build and/or run HLO code. The class includes
 // support for running an HLO module on two platforms and compare the results.
 // This is a lower level of abstraction than using the client interface and
@@ -72,7 +98,22 @@ class HloTestBase : public ::testing::Test {
   // options from command-line flags. If you want a fresh HloModule object and
   // then add HloComputations to it, it's recommended to use this method in your
   // tests.
-  std::unique_ptr<HloModule> CreateNewModule(const string& name = TestName());
+  //
+  // This returns a vanilla HloModule that doesn't run the HLO verifier on
+  // destruction.
+  ABSL_DEPRECATED("Use CreateNewVerifiedModule instead.")
+  std::unique_ptr<HloModule> CreateNewUnverifiedModule(
+      const string& name = TestName());
+
+  // Like CreateNewUnverifiedModule, except the HloModule returned here runs the
+  // HLO verifier on destruction.
+  std::unique_ptr<VerifiedHloModule> CreateNewVerifiedModule(
+      const string& name = TestName());
+
+  // Parses the given string and returns module as a VerifiedHloModule.
+  StatusOr<std::unique_ptr<VerifiedHloModule>> ParseAndReturnVerifiedModule(
+      absl::string_view hlo_text,
+      const HloModuleConfig& config = HloModuleConfig());
 
   // Runs the hlo_pass with the provided module and returns the result. This
   // function also verifies that the module remains unchanged when hlo_pass
@@ -88,14 +129,18 @@ class HloTestBase : public ::testing::Test {
   // interpreter is the only supported backend, it will be both the test backend
   // and the reference backend.
   HloTestBase(bool verifier_layout_sensitive = false,
-              bool allow_mixed_precision_in_hlo_verifier = true);
+              bool allow_mixed_precision_in_hlo_verifier = true,
+              std::function<bool(const HloInstruction*)>
+                  instruction_can_change_layout_func = {});
 
   // If your test doesn't use interpreter as the reference backend, you can use
   // this constructor. Note that your test target is responsible for linking in
   // both needed backends.
   HloTestBase(se::Platform* test_platform, se::Platform* reference_platform,
               bool verifier_layout_sensitive = false,
-              bool allow_mixed_precision_in_hlo_verifier = true);
+              bool allow_mixed_precision_in_hlo_verifier = true,
+              std::function<bool(const HloInstruction*)>
+                  instruction_can_change_layout_func = {});
 
   ~HloTestBase() override {}
 
@@ -243,6 +288,8 @@ class HloTestBase : public ::testing::Test {
   HloRunner test_runner_;
   HloRunner reference_runner_;
 
+  bool verifier_layout_sensitive_;
+  bool allow_mixed_precision_in_hlo_verifier_;
   std::unique_ptr<HloVerifier> hlo_verifier_;
 
   ErrorSpec error_spec_{0.0001};
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
deleted file mode 100644
index 8bd0a729b77f3ec14204952cb0062103c823883e..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
-
-#include "absl/memory/memory.h"
-#include "tensorflow/compiler/xla/service/hlo_parser.h"
-#include "tensorflow/compiler/xla/service/hlo_verifier.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace xla {
-
-Status VerifiedHloModule::Verify() {
-  if (computation_count() == 0) {
-    // The computation was never built. Nothing to verify.
-    return Status::OK();
-  }
-  return verifier_.Run(this).status();
-}
-
-void VerifiedHloModule::VerifyOrAddFailure(const string& message) {
-  Status status = Verify();
-  if (!status.ok()) {
-    ADD_FAILURE() << "HloVerifier failed on module " << name()
-                  << (message.empty() ? "" : absl::StrCat(" (", message, ")"))
-                  << ": " << status;
-  }
-}
-
-HloVerifiedTestBase::HloVerifiedTestBase(bool layout_sensitive,
-                                         bool allow_mixed_precision)
-    : HloTestBase(
-          /*verifier_layout_sensitive=*/layout_sensitive,
-          /*allow_mixed_precision_in_hlo_verifier=*/allow_mixed_precision),
-      verifier_layout_sensitive_(layout_sensitive),
-      allow_mixed_precision_in_hlo_verifier_(allow_mixed_precision) {}
-
-HloModule& HloVerifiedTestBase::module() {
-  if (!module_) {
-    module_ = CreateNewVerifiedModule(TestName());
-  }
-  return *module_;
-}
-
-HloModule* HloVerifiedTestBase::CreateNewModule(const string& name) {
-  modules_.emplace_back(CreateNewVerifiedModule(name));
-  return modules_.back().get();
-}
-
-void HloVerifiedTestBase::ParseAndVerifyModule(absl::string_view hlo_text,
-                                               const HloModuleConfig& config) {
-  CHECK(!module_) << "Called ParseModule when test already has a module.";
-  module_ = CreateNewVerifiedModule(TestName());
-  TF_CHECK_OK(ParseHloString(hlo_text, module_.get()));
-  module_->VerifyOrAddFailure("after parsing");
-}
-
-StatusOr<std::unique_ptr<VerifiedHloModule>>
-HloVerifiedTestBase::ParseAndReturnVerifiedModule(
-    absl::string_view hlo_text, const HloModuleConfig& config) {
-  auto module = CreateNewVerifiedModule(TestName());
-  TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get()));
-  TF_RETURN_IF_ERROR(module->Verify());
-  return std::move(module);
-}
-
-std::unique_ptr<VerifiedHloModule> HloVerifiedTestBase::CreateNewVerifiedModule(
-    const string& name) {
-  return absl::make_unique<VerifiedHloModule>(
-      name, GetModuleConfigForTest(), verifier_layout_sensitive_,
-      allow_mixed_precision_in_hlo_verifier_);
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
deleted file mode 100644
index 388a99bb36408665edbc20ade6c6a733d64db88d..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_TESTS_HLO_VERIFIED_TEST_BASE_H_
-#define TENSORFLOW_COMPILER_XLA_TESTS_HLO_VERIFIED_TEST_BASE_H_
-
-#include <functional>
-#include <memory>
-#include <utility>
-
-#include "absl/base/macros.h"
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-
-namespace xla {
-
-// An HLO module derived class which verifies itself on destruction. This class
-// is intended to be used in unit tests. Any verification errors are raised via
-// ADD_FAILURE.
-class VerifiedHloModule : public HloModule {
- public:
-  VerifiedHloModule(const string& name, const HloModuleConfig& config,
-                    bool verifier_layout_sensitive,
-                    bool allow_mixed_precision_in_hlo_verifier)
-      : HloModule(name, config),
-        verifier_(verifier_layout_sensitive,
-                  allow_mixed_precision_in_hlo_verifier) {}
-
-  ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); }
-
-  // Verifies the module using HloVerifier and returns the status.
-  Status Verify();
-
-  // Verifies the module and flags any error with ADD_FAILURE. 'message' is
-  // included in the failure message.
-  void VerifyOrAddFailure(const string& message);
-
- private:
-  HloVerifier verifier_;
-};
-
-// A base class for HLO tests that stores a default VerifiedHloModule.
-class HloVerifiedTestBase : public HloTestBase {
- protected:
-  HloVerifiedTestBase(bool layout_sensitive = false,
-                      bool allow_mixed_precision = false);
-
-  // Constructs a default shape verifier.
-  std::unique_ptr<ShapeVerifier> MakeShapeVerifier();
-
-  // Returns the default HloModule, lazily creating it if necessary via
-  // HloTestBase::CreateNewModule().
-  ABSL_DEPRECATED("Use CreateNewVerifiedModule() instead.")
-  HloModule& module();
-
-  ABSL_DEPRECATED("Use ParseAndReturnVerifiedModule() instead.")
-  void ParseAndVerifyModule(absl::string_view hlo_text,
-                            const HloModuleConfig& config = HloModuleConfig());
-
-  // Parses the given string and returns module as a VerifiedHloModule.
-  StatusOr<std::unique_ptr<VerifiedHloModule>> ParseAndReturnVerifiedModule(
-      absl::string_view hlo_text,
-      const HloModuleConfig& config = HloModuleConfig());
-
-  // Creates a new module for a test, and stores it in modules_ so it can be
-  // verified. Intentionally hides HloTestBase::CreateNewModule, to prevent
-  // creation of unverified modules.
-  ABSL_DEPRECATED("Use CreateNewVerifiedModule() instead.")
-  HloModule* CreateNewModule(const string& name = TestName());
-
-  // Creates and returns a verified HLO module with the given name.
-  std::unique_ptr<VerifiedHloModule> CreateNewVerifiedModule(
-      const string& name = TestName());
-
- private:
-  // It is confusing to store modules created by module() and CreateNewModule()
-  // in different fields, but it allows us to migrate tests to
-  // HloVerifiedTestBase more easily, so it's a win because we can verify more
-  // modules. See b/80488902.
-  //
-  // Lazily populated. Access via module().
-  std::unique_ptr<VerifiedHloModule> module_;
-
-  // Populated by calls to CreateNewModule.
-  std::vector<std::unique_ptr<VerifiedHloModule>> modules_;
-
-  bool verifier_layout_sensitive_;
-  bool allow_mixed_precision_in_hlo_verifier_;
-};
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_TESTS_HLO_VERIFIED_TEST_BASE_H_
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base_test.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base_test.cc
deleted file mode 100644
index 5c0263e811f94c90a69a460525ffa0c65127ebb5..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base_test.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
-
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/hlo_verifier.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-namespace {
-
-// This class includes unit tests which are expected to fail because invalid HLO
-// modules are intentionally built. Unfortunately, Tensorflow doesn't appear to
-// include the necessary gunit parts to test this test machinery (needs the
-// macro EXPECT_NONFATAL_FAILURE). The disabled tests can be run with the
-// disabled tests enabled and failures can be manually compared against
-// expectations.
-class HloVerifiedTestBaseTest : public HloVerifiedTestBase {};
-
-XLA_TEST_F(HloVerifiedTestBaseTest, NoModule) {
-  // Test shouldn't fail if no module is created at all.
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, GoodLazilyCreatedModule) {
-  // Use module() to lazily create an empty module, build it up, and verify no
-  // failures.
-  HloModule& hlo_module = module();
-  auto builder = HloComputation::Builder(TestName());
-  auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input));
-  hlo_module.AddEntryComputation(builder.Build());
-}
-
-// This test is expected to fail. See test class comment.
-XLA_TEST_F(HloVerifiedTestBaseTest, DISABLED_BadLazilyCreatedModule) {
-  // Use module() to lazily create an empty module and build up an invalid
-  // module.
-  HloModule& hlo_module = module();
-  auto builder = HloComputation::Builder(TestName());
-  auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input));
-  hlo_module.AddEntryComputation(builder.Build());
-
-  *hlo_module.entry_computation()->root_instruction()->mutable_shape() =
-      ShapeUtil::MakeShape(PRED, {1, 2, 3});
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, GoodCreateNewModule) {
-  // Call CreateNewModule and build up a valid module.
-  HloModule* module = CreateNewModule();
-  auto builder = HloComputation::Builder(TestName());
-  auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input));
-  module->AddEntryComputation(builder.Build());
-}
-
-// This test is expected to fail. See test class comment.
-XLA_TEST_F(HloVerifiedTestBaseTest, DISABLED_BadCreateNewModule) {
-  // Call CreateNewModule and build up a invalid module.
-  HloModule* module = CreateNewModule();
-  auto builder = HloComputation::Builder(TestName());
-  auto input = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateUnary(input->shape(), HloOpcode::kNegate, input));
-  module->AddEntryComputation(builder.Build());
-
-  *module->entry_computation()->root_instruction()->mutable_shape() =
-      ShapeUtil::MakeShape(PRED, {1, 2, 3});
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, ParseAndVerifyModuleGood) {
-  const char* const hlo_string = R"(
-HloModule ParseAndVerifyModuleGood
-
-ENTRY entry {
-  x = f32[] parameter(0)
-  y = f32[] parameter(1)
-  ROOT add = f32[] add(x,y)
-}
-)";
-
-  ParseAndVerifyModule(hlo_string);
-  EXPECT_EQ(module().entry_computation()->instruction_count(), 3);
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, ParseAndReturnVerifiedModuleGood) {
-  const char* const hlo_string = R"(
-HloModule ParseAndReturnVerifiedModuleGood
-
-ENTRY entry {
-  x = f32[] parameter(0)
-  y = f32[] parameter(1)
-  ROOT add = f32[] add(x,y)
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  EXPECT_EQ(module->entry_computation()->instruction_count(), 3);
-}
-
-XLA_TEST_F(HloVerifiedTestBaseTest, ParseAndReturnVerifiedModuleInvalidText) {
-  const char* const hlo_string = R"(
-HloModule ParseAndReturnVerifiedModuleGood
-
-ENTRY entry {
-  x = f32[] parameter(0)
-  y = f32[] parameter(1)
-  ROOT add = f32[] add(x,y)
-}
-
-RANDOM GARBAGE
-)";
-
-  ASSERT_IS_NOT_OK(ParseAndReturnVerifiedModule(hlo_string).status());
-}
-
-// This test is expected to fail. See test class comment.
-XLA_TEST_F(HloVerifiedTestBaseTest, DISABLED_ParseAndReturnVerifiedModuleBad) {
-  const char* const hlo_string = R"(
-HloModule ParseAndReturnVerifiedModuleBad
-
-ENTRY entry {
-  x = f32[] parameter(0)
-  y = f32[] parameter(1)
-  ROOT add = f32[1234] add(x,y)
-}
-)";
-
-  ASSERT_IS_NOT_OK(ParseAndReturnVerifiedModule(hlo_string).status());
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/iota_test.cc b/tensorflow/compiler/xla/tests/iota_test.cc
index 310f3495922250d68aa463fcbb24ef0b04603d09..65205f53ddc582ae477d67705f161fef1e31b857 100644
--- a/tensorflow/compiler/xla/tests/iota_test.cc
+++ b/tensorflow/compiler/xla/tests/iota_test.cc
@@ -113,5 +113,26 @@ INSTANTIATE_TEST_CASE_P(IotaR3TestInstantiation, IotaR3Test,
                                                             /*step=*/10),
                                            ::testing::Values(0, 1, 2)));
 
+class IotaR3PredTest : public ClientLibraryTestBase,
+                       public ::testing::WithParamInterface<int> {};
+
+TEST_P(IotaR3PredTest, DoIt) {
+  const auto element_type = PRED;
+  const int64 num_elements = 2;
+  const int64 iota_dim = GetParam();
+  XlaBuilder builder(TestName() + "_" + PrimitiveType_Name(element_type));
+  std::vector<int64> dimensions = {42, 19};
+  dimensions.insert(dimensions.begin() + iota_dim, num_elements);
+  Iota(&builder, ShapeUtil::MakeShape(element_type, dimensions), iota_dim);
+  if (primitive_util::IsFloatingPointType(element_type)) {
+    ComputeAndCompare(&builder, {}, ErrorSpec{0.0001});
+  } else {
+    ComputeAndCompare(&builder, {});
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(IotaR3PredTestInstantiation, IotaR3PredTest,
+                        ::testing::Values(0, 1, 2));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index 8d658695576035cdc34a213847460dd80de5f67e..a78ccacec114858740bf1b9c04e9b688bca5818d 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -68,7 +68,7 @@ class LLVMCompilerTest : public ::testing::Test {
     builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
 
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewUnverifiedModule();
     hlo_module->AddEntryComputation(builder.Build());
 
     compiler->SetPreOptimizationHook(pre_opt_hook);
@@ -90,18 +90,19 @@ class LLVMCompilerTest : public ::testing::Test {
     builder.AddInstruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
 
-    std::unique_ptr<HloModule> hlo_module = CreateNewModule();
+    std::unique_ptr<HloModule> hlo_module = CreateNewUnverifiedModule();
     hlo_module->AddEntryComputation(builder.Build());
 
-    std::vector<std::unique_ptr<HloModule>> modules;
-    modules.push_back(hlo_module->Clone());
-    modules.push_back(std::move(hlo_module));
+    auto module_group = absl::make_unique<HloModuleGroup>("test_module_group");
+    module_group->push_back(hlo_module->Clone());
+    module_group->push_back(std::move(hlo_module));
 
     std::vector<std::vector<se::StreamExecutor *>> executors;
     executors.push_back({backend_->default_stream_executor()});
     executors.push_back({backend_->default_stream_executor()});
 
-    EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors),
+    EXPECT_IS_OK(compiler->Compile(std::move(module_group),
+                                   std::move(executors),
                                    /*device_allocator=*/nullptr));
   }
 
@@ -123,9 +124,9 @@ class LLVMCompilerTest : public ::testing::Test {
     return ::testing::UnitTest::GetInstance()->current_test_info()->name();
   }
 
-  static std::unique_ptr<HloModule> CreateNewModule() {
+  static std::unique_ptr<HloModule> CreateNewUnverifiedModule() {
     HloModuleConfig config;
-    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    config.set_debug_options(GetDebugOptionsFromFlags());
     return absl::make_unique<HloModule>(TestName(), config);
   }
 };
@@ -150,12 +151,12 @@ TEST_F(GpuCompilerTest, HooksTest) {
   TestCompilerHooks(&compiler);
 }
 
-TEST_F(CpuCompilerTest, MultiModuleCompilation) {
+TEST_F(CpuCompilerTest, CpuMultiModuleCompilation) {
   cpu::CpuCompiler compiler;
   TestMultiModuleCompilation(&compiler);
 }
 
-TEST_F(GpuCompilerTest, MultModuleCompilation) {
+TEST_F(GpuCompilerTest, NVPTXMultiModuleCompilation) {
   gpu::NVPTXCompiler compiler;
   TestMultiModuleCompilation(&compiler);
 }
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 56aaeb0e6878737e6c689e8065d8f1e1871b3472..3f5135438fc59bea98527b1be30ee49339edd455 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -62,7 +62,7 @@ class MultiOutputFusionTest : public HloTestBase {
 
   void RunTest2D(bool manual_fusion, int64 size) {
     auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewUnverifiedModule();
 
     const Shape elem_shape0 = ShapeUtil::MakeShapeWithLayout(F32, {}, {});
     const Shape elem_shape2 =
@@ -122,7 +122,7 @@ class MultiOutputFusionTest : public HloTestBase {
 
   void RunTest1D(bool manual_fusion, int size) {
     auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
+    auto hlo_module = CreateNewUnverifiedModule();
 
     const Shape elem_shape_F32 =
         ShapeUtil::MakeShapeWithDescendingLayout(F32, {size});
@@ -192,7 +192,7 @@ XLA_TEST_F(MultiOutputFusionTest, DiffentTypesFusion) { RunTest1D(true, 8); }
 
 XLA_TEST_F(MultiOutputFusionTest, FusionNodeIsRoot) {
   const char* testcase = R"(
-    HloModule m
+    HloModule m, is_scheduled=true
 
     fused_computation {
       x.param_0 = (((s32[]), f32[]), (f32[], s32[])) parameter(0)
@@ -224,7 +224,7 @@ XLA_TEST_F(MultiOutputFusionTest, FusionNodeIsRoot) {
 
 XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) {
   const char* testcase = R"(
-    HloModule m
+    HloModule m, is_scheduled=true
 
     fused_computation {
       p = f32[4] parameter(0)
@@ -251,7 +251,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) {
 
 XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) {
   const char* testcase = R"(
-    HloModule m
+    HloModule m, is_scheduled=true
 
     fused_computation {
       p = f32[] parameter(0)
@@ -282,7 +282,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) {
 }
 
 const char* const kScalarOps = R"(
-    HloModule m
+    HloModule m, is_scheduled=true
 
     Add {
       lhsadd = f32[] parameter(0)
diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc
index 58539e6b061b0cec1cc660b52e78894e5deeea56..774eb8d2a85914c52597144e70838ee117ee1134 100644
--- a/tensorflow/compiler/xla/tests/pred_test.cc
+++ b/tensorflow/compiler/xla/tests/pred_test.cc
@@ -87,8 +87,8 @@ TEST_F(PredTest, ConstantR2Pred) {
   XlaBuilder builder(TestName());
   ConstantR2<bool>(&builder, {{false, true, true}, {true, false, false}});
   const string expected = R"(pred[2,3] {
-  { 011 },
-  { 100 }
+  { 0, 1, 1 },
+  { 1, 0, 0 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index 26e2bfde5cdc19657640f24f31bc008d09ad7106..f80d29b9de440b11c36e8c9bc65d4a93353a6267 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
@@ -34,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -216,14 +216,13 @@ XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) {
   const uint32_t sign_bit = 1u << 31;
   for (const auto& test_value : test_values) {
     // Add positive values.
-    input_values.push_back(tensorflow::bit_cast<float>(test_value[0]));
-    expected_values.push_back(tensorflow::bit_cast<float>(test_value[index]));
+    input_values.push_back(absl::bit_cast<float>(test_value[0]));
+    expected_values.push_back(absl::bit_cast<float>(test_value[index]));
     // Add negative values.  We do this in the bitwise representation so as to
     // avoid problems with NaN handling.
-    input_values.push_back(
-        tensorflow::bit_cast<float>(test_value[0] ^ sign_bit));
+    input_values.push_back(absl::bit_cast<float>(test_value[0] ^ sign_bit));
     expected_values.push_back(
-        tensorflow::bit_cast<float>(test_value[index] ^ sign_bit));
+        absl::bit_cast<float>(test_value[index] ^ sign_bit));
   }
 
   // This is required for proper handling of NaN values.
@@ -283,7 +282,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) {
   XlaBuilder builder(TestName());
 
-  Literal a_literal = LiteralUtil::CreateR1<float>({1.00001});
+  Literal a_literal = LiteralUtil::CreateR1<float>({1.00001, 1.00001});
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(a_literal).ConsumeValueOrDie();
   auto a = Parameter(&builder, 0, a_literal.shape(), "a");
@@ -301,7 +300,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
       HloReducePrecisionOptions::UNFUSED_OP_OUTPUTS, 5, 10,
       [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; });
 
-  ComputeAndCompareR1<float>(&builder, {-1.00001f}, {a_data.get()});
+  ComputeAndCompareR1<float>(&builder, {-1.00001f, -1.00001f}, {a_data.get()});
 }
 
 // The interpreter has no fusion pass, so skip this test.
@@ -309,7 +308,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) {
   XlaBuilder builder(TestName());
 
-  Literal a_literal = LiteralUtil::CreateR1<float>({1.00001});
+  Literal a_literal = LiteralUtil::CreateR1<float>({1.00001, 1.00001});
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(a_literal).ConsumeValueOrDie();
   auto a = Parameter(&builder, 0, a_literal.shape(), "a");
@@ -325,7 +324,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
       HloReducePrecisionOptions::UNFUSED_OP_OUTPUTS, 5, 10,
       [](const HloOpcode opcode) { return opcode == HloOpcode::kFusion; });
 
-  ComputeAndCompareR1<float>(&builder, {-1.0f}, {a_data.get()});
+  ComputeAndCompareR1<float>(&builder, {-1.0f, -1.0f}, {a_data.get()});
 }
 
 // The interpreter has no fusion pass, so skip this test.
@@ -358,7 +357,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) {
   XlaBuilder builder(TestName());
 
-  Literal a_literal = LiteralUtil::CreateR1<float>({1.00001});
+  Literal a_literal = LiteralUtil::CreateR1<float>({1.00001, 1.00001});
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(a_literal).ConsumeValueOrDie();
   auto a = Parameter(&builder, 0, a_literal.shape(), "a");
@@ -375,7 +374,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
       HloReducePrecisionOptions::FUSION_OUTPUTS_BY_CONTENT, 5, 10,
       [](const HloOpcode opcode) { return opcode == HloOpcode::kAbs; });
 
-  ComputeAndCompareR1<float>(&builder, {-1.0f}, {a_data.get()});
+  ComputeAndCompareR1<float>(&builder, {-1.0f, -1.0f}, {a_data.get()});
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 83997cdac21c437d460dabdbdfdb31100b1359af..18c99490a387923aaf68e06041cd11ed3b954aa5 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/span.h"
@@ -980,5 +981,25 @@ XLA_TEST_F(ReduceTest, OrReduceU64) {
   ComputeAndCompareR1<uint64>(&builder, expected, {});
 }
 
+XLA_TEST_F(ReduceTest, R0ReduceInDisguise) {
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
+  constexpr int element_count = 127;
+  const Shape input_shape = ShapeUtil::MakeShape(F32, {element_count, 1});
+  auto input = Parameter(&builder, 0, input_shape, "input");
+  auto zero = ConstantR0<float>(&builder, 0.0);
+  Reduce(input, zero, add_f32, /*dimensions_to_reduce=*/{0});
+
+  Array2D<float> input_data(element_count, 1);
+  input_data.FillRandom(3.0f);
+  Literal input_literal = LiteralUtil::CreateR2FromArray2D(input_data);
+  std::unique_ptr<GlobalData> input_global_data =
+      client_->TransferToServer(input_literal).ConsumeValueOrDie();
+
+  float expected = absl::c_accumulate(input_data, 0.0f);
+  ComputeAndCompareR1<float>(&builder, {expected}, {input_global_data.get()},
+                             ErrorSpec(0.001));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index c25ccafaf83cf1b29095a77eefa357d9af08dc60..22fe4a2670e2e0e1fedc45036a1ceec19f44e42e 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -638,6 +638,8 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
         /*computation=*/computation,
         /*window_dimensions=*/param.window_bounds,
         /*window_strides=*/param.strides,
+        /*base_dilations=*/{},
+        /*window_dilations=*/{},
         /*padding=*/padding);
 
     CHECK(reducer == kAdd || reducer == kMax);
@@ -1158,7 +1160,10 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
         /*init_value=*/init_value,
         /*computation=*/computation,
         /*window_dimensions=*/param.window_bounds,
-        /*window_strides=*/param.strides, /*padding=*/padding);
+        /*window_strides=*/param.strides,
+        /*base_dilations=*/{},
+        /*window_dilations=*/{},
+        /*padding=*/padding);
 
     auto reduce_func = param.reducer == kAdd
                            ? +[](float a, float b) { return a + b; }
@@ -1369,7 +1374,10 @@ TEST_P(R1ReduceWindowTest, DoIt) {
       /*init_value=*/init_value,
       /*computation=*/computation,
       /*window_dimensions=*/param.window_bounds,
-      /*window_strides=*/param.strides, /*padding=*/padding);
+      /*window_strides=*/param.strides,
+      /*base_dilations=*/{},
+      /*window_dilations=*/{},
+      /*padding=*/padding);
 
   auto reduce_func = param.reducer == kAdd
                          ? +[](float a, float b) { return a + b; }
diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc
index 5cf87e565bf493167f5173588e7afa3b96282488..34c7dc7c46427b2d18ea21fc286ee03175f70800 100644
--- a/tensorflow/compiler/xla/tests/replay_test.cc
+++ b/tensorflow/compiler/xla/tests/replay_test.cc
@@ -55,7 +55,8 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) {
       client_->GetComputationShape(computation).ConsumeValueOrDie();
   std::unique_ptr<ProgramShape> replayed_shape =
       client_->GetComputationShape(replayed).ConsumeValueOrDie();
-  ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape));
+  ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(),
+                                            replayed_shape->ToProto()));
 
   // Run it.
   Literal literal =
@@ -87,7 +88,8 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) {
       client_->GetComputationShape(computation).ConsumeValueOrDie();
   std::unique_ptr<ProgramShape> replayed_shape =
       client_->GetComputationShape(replayed).ConsumeValueOrDie();
-  ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape));
+  ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(),
+                                            replayed_shape->ToProto()));
 
   // Run it.
   std::unique_ptr<GlobalData> x_data =
@@ -133,7 +135,8 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) {
       client_->GetComputationShape(computation).ConsumeValueOrDie();
   std::unique_ptr<ProgramShape> replayed_shape =
       client_->GetComputationShape(replayed).ConsumeValueOrDie();
-  ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape));
+  ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(),
+                                            replayed_shape->ToProto()));
 
   // Run it.
   Literal literal =
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index dedc95b5ae8315185a35f786af42aad53bd7ad96..298136002e9ef47188e0bae95af3f596596e6062 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -618,7 +618,8 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8},
-                                     {1, 0});
+                                     {1, 0})
+          .ToProto();
   Literal actual =
       client_
           ->ExecuteAndTransfer(computation, {input.get()}, &execution_options)
@@ -767,7 +768,8 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {7, 2, 3, 5},
-                                     {2, 3, 0, 1});
+                                     {2, 3, 0, 1})
+          .ToProto();
   Literal output_literal =
       client_
           ->ExecuteAndTransfer(computation, {input_data.get()},
diff --git a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc
index 091a5d2cacce6ac5bf986776e5ec96612d08cc75..606a099ecbc4e5677034c6d57e7ba5c398c69ab9 100644
--- a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc
+++ b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/base/casts.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -47,7 +47,7 @@ class RoundTripPackedLiteralTest : public ClientLibraryTestBase {
 
 TEST_F(RoundTripPackedLiteralTest, RoundTripsR1F32Length2) {
   string data(sizeof(float) * 2, 0);
-  absl::Span<float> floats(tensorflow::bit_cast<float*>(data.data()), 2);
+  absl::Span<float> floats(absl::bit_cast<float*>(data.data()), 2);
   floats[0] = 42.0;
   floats[1] = 24.0;
 
@@ -69,7 +69,7 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR1F32Length2) {
 
 TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim0Minor) {
   string data(sizeof(float) * 4, 0);
-  absl::Span<float> floats(tensorflow::bit_cast<float*>(data.data()), 4);
+  absl::Span<float> floats(absl::bit_cast<float*>(data.data()), 4);
   // With x as the minor dimension, these will become:
   floats[0] = 42.0;  // y=0,x=0
   floats[1] = 24.0;  // y=0,x=1
@@ -102,7 +102,7 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim0Minor) {
 
 TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim1Minor) {
   string data(sizeof(float) * 4, 0);
-  absl::Span<float> floats(tensorflow::bit_cast<float*>(data.data()), 4);
+  absl::Span<float> floats(absl::bit_cast<float*>(data.data()), 4);
   // With y as the minor dimension, these will become:
   floats[0] = 42.0;  // y=0,x=0
   floats[1] = 24.0;  // y=1,x=0
diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc
index b21dd56045e1dc11847e213852dea60cd033be7b..32de0fdf78f9c442e17c55e1b951e39122dac5ef 100644
--- a/tensorflow/compiler/xla/tests/scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/scatter_test.cc
@@ -69,6 +69,37 @@ ENTRY main {
   RunTest(hlo_text, &operand, &scatter_indices, &updates);
 }
 
+XLA_TEST_F(ScatterTest, TensorFlowScatterV1_WithFusedAdds) {
+  const string hlo_text = R"(
+HloModule TensorFlowScatterV1
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  p0 = s32[3,3] parameter(0)
+  operand = s32[3,3] add(p0, p0)
+  p1 = s32[2] parameter(1)
+  indices = s32[2] add(p1, p1)
+  p2 = s32[2,3] parameter(2)
+  updates = s32[2,3] add(p2, p2)
+  ROOT scatter = s32[3,3] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={1},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0},
+      index_vector_dim=1
+}
+)";
+  Literal operand =
+      LiteralUtil::CreateR2<int32>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
+  Literal scatter_indices = LiteralUtil::CreateR1<int32>({0, 1});
+  Literal updates = LiteralUtil::CreateR2<int32>({{10, 20, 30}, {70, 80, 90}});
+  RunTest(hlo_text, &operand, &scatter_indices, &updates);
+}
+
 XLA_TEST_F(ScatterTest, TensorFlowScatterV2_Update) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterV2
@@ -98,6 +129,73 @@ ENTRY main {
   RunTest(hlo_text, &operand, &scatter_indices, &updates);
 }
 
+XLA_TEST_F(ScatterTest, TensorFlowScatterV2_InversePermutation) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterV2
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  permutation = s32[3,4] parameter(0)
+  reshape = s32[3,4,1] reshape(permutation)
+  operand = s32[3,4] iota(), iota_dimension=1
+  updates = s32[3,4,1,1] iota(), iota_dimension=1
+  iota = s32[3,4,1] iota(), iota_dimension=0
+  indices = s32[3,4,2] concatenate(iota, reshape), dimensions={2}
+  ROOT scatter = s32[3,4] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={2,3},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=2
+}
+)";
+  Literal permutation =
+      LiteralUtil::CreateR2<int32>({{1, 3, 2, 0}, {3, 0, 2, 1}, {2, 3, 1, 0}});
+  HloModuleConfig config;
+  config.set_debug_options(GetDebugOptionsForTest());
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_text, config));
+  auto actual = ExecuteAndTransfer(std::move(module), {&permutation});
+  Literal expected =
+      LiteralUtil::CreateR2<int32>({{3, 0, 2, 1}, {1, 3, 2, 0}, {3, 2, 0, 1}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, actual));
+}
+
+XLA_TEST_F(ScatterTest, SimpleR4) {
+  const char* hlo_text = R"(
+HloModule SimpleR4
+
+add_f32 (lhs: f32[], rhs: f32[]) -> f32[] {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(f32[] lhs, f32[] rhs)
+}
+
+ENTRY main {
+  operand = f32[1,2,2,1] parameter(0)
+  indices = s32[1,3] parameter(1)
+  updates = f32[1,2,2,1] parameter(2)
+  ROOT scatter = f32[1,2,2,1] scatter(operand, indices, updates),
+      to_apply=add_f32,
+      update_window_dims={1,2,3},
+      inserted_window_dims={0},
+      scatter_dims_to_operand_dims={0, 2, 1},
+      index_vector_dim=1
+}
+)";
+
+  Literal operand =
+      LiteralUtil::CreateR4<float>({{{{0.f}, {0.f}}, {{0.f}, {0.f}}}});
+  Literal updates =
+      LiteralUtil::CreateR4<float>({{{{0.12}, {0.28}}, {{0.018}, {0.42}}}});
+  Literal scatter_indices = LiteralUtil::CreateR2<int32>({{0, 0, 0}});
+  RunTest(hlo_text, &operand, &scatter_indices, &updates);
+}
+
 XLA_TEST_F(ScatterTest, TensorFlowScatter_Add) {
   const string hlo_text = R"(
 HloModule TensorFlowScatter_Add
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index 2cc33ab0963afe8ba2d8e9a6972dcf0622e27c48..3fb69419e735bfd9c5054673e0687f5139a410cb 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -166,6 +166,26 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
   ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
 }
 
+TEST_F(SliceTest, SliceOfReshape) {
+  Array2D<int> values(2 * 3 * 24, 7);
+  values.FillIota(1);
+  XlaBuilder builder(TestName());
+  auto original = ConstantR2FromArray2D(&builder, values);
+  auto reshape = Reshape(original, {24, 3, 2, 7});
+  Slice(reshape, {0, 0, 0, 0}, {11, 3, 2, 7}, {1, 1, 1, 1});
+  ComputeAndCompare(&builder, {});
+}
+
+TEST_F(SliceTest, SliceOfCollapsingReshape) {
+  Array4D<int> values(2, 3, 5, 7);
+  values.FillIota(1);
+  XlaBuilder builder(TestName());
+  auto original = ConstantR4FromArray4D(&builder, values);
+  auto reshape = Reshape(original, {2 * 3 * 5, 7});
+  Slice(reshape, {0, 0}, {4, 7}, {1, 1});
+  ComputeAndCompare(&builder, {});
+}
+
 XLA_TEST_F(SliceTest, StridedSliceR4WithOutputLayout) {
   Array4D<float> values(2, 4, 6, 8);
   values.FillRandom(3.14f);
@@ -253,7 +273,6 @@ XLA_TEST_P(SliceR1LargeTest, DoIt_S64) { Run<int64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_PRED) { Run<bool>(GetParam()); }
 
-
 // Tests for R1 slice ops.
 // The format for each testcase is {input size, start, limit, stride}.
 // clang-format off
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 5155f0c652c7c6dbba60c421159494fa28072090..eafa48ed7b8cf2bd67fe767ad36082661dbbd66e 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <cmath>
 
+#include "absl/base/casts.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -28,65 +29,113 @@ namespace xla {
 namespace {
 
 template <typename FloatT, typename GeneratorT>
-void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
-                                             std::minstd_rand0* engine,
-                                             bool no_duplicates) {
+void PopulateWithRandomFloatingPointData(Literal* literal,
+                                         std::minstd_rand0* engine) {
+  std::uniform_real_distribution<GeneratorT> generator(-0.1f, 0.2f);
+  for (FloatT& value : literal->data<FloatT>()) {
+    value = static_cast<FloatT>(generator(*engine));
+  }
+}
+
+template <typename FloatT>
+void PopulateWithIntNext(Literal* literal);
+
+template <>
+void PopulateWithIntNext<half>(Literal* literal) {
+  // Duplicates may be generated if we don't have enough bits.
+  uint16 next_value = 0;
+  for (half& value : literal->data<half>()) {
+    // Zero-out the MSB of the exponent to avoid Infs and NaNs, and put it into
+    // the sign bit. We could be less wasteful, but this is best-effort anyway.
+    uint16 exponent_msb = next_value & 0x4000;
+    value.x = (next_value & 0xBFFF) | (exponent_msb << 1);
+    next_value++;
+  }
+}
+
+template <>
+void PopulateWithIntNext<bfloat16>(Literal* literal) {
+  // Duplicates may be generated if we don't have enough bits.
+  // Start at 0x80 rather than 0 to avoid denormals.
+  uint16 next_value = 0x80;
+  for (bfloat16& value : literal->data<bfloat16>()) {
+    // Zero-out the MSB of the exponent to avoid Infs and NaNs, and put it into
+    // the sign bit. We could be less wasteful, but this is best-effort anyway.
+    uint16 exponent_msb = next_value & 0x4000;
+    value.value = (next_value & 0xBFFF) | (exponent_msb << 1);
+    next_value++;
+  }
+}
+
+template <typename FloatT>
+void PopulateWithNextAfter(Literal* literal) {
+  // Duplicates may be generated if the number of elements in the literal
+  // exceeds the number of positive values supported by the type.
+  float next_value = std::numeric_limits<float>::min();
+  for (float& value : literal->data<float>()) {
+    value = next_value;
+    next_value = std::nextafter(next_value, std::numeric_limits<float>::max());
+  }
+}
+
+template <typename FloatT,
+          typename std::enable_if<std::is_same<bfloat16, FloatT>::value ||
+                                      std::is_same<half, FloatT>::value,
+                                  int>::type = 0>
+void PopulateWithNoDuplicateData(Literal* literal, std::minstd_rand0* engine) {
+  PopulateWithIntNext<FloatT>(literal);
+  std::shuffle(literal->data<FloatT>().begin(), literal->data<FloatT>().end(),
+               *engine);
+}
+
+template <typename FloatT,
+          typename std::enable_if<!std::is_same<bfloat16, FloatT>::value &&
+                                      !std::is_same<half, FloatT>::value,
+                                  int>::type = 0>
+void PopulateWithNoDuplicateData(Literal* literal, std::minstd_rand0* engine) {
+  PopulateWithNextAfter<FloatT>(literal);
+  std::shuffle(literal->data<FloatT>().begin(), literal->data<FloatT>().end(),
+               *engine);
+}
+
+template <typename FloatT>
+void PopulateWithFloatingPointData(Literal* literal, std::minstd_rand0* engine,
+                                   bool no_duplicates) {
   CHECK(engine != nullptr);
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
   if (no_duplicates) {
-    // Duplicates may be generated if the number of elements in the literal
-    // exceeds the number of positive values supported by the type.
-    FloatT next_value = std::numeric_limits<FloatT>::min();
-    for (FloatT& value : literal->data<FloatT>()) {
-      value = next_value;
-      next_value =
-          std::nextafter(next_value, std::numeric_limits<FloatT>::max());
-    }
-    std::shuffle(literal->data<FloatT>().begin(), literal->data<FloatT>().end(),
-                 *engine);
+    PopulateWithNoDuplicateData<FloatT>(literal, engine);
   } else {
-    std::uniform_real_distribution<GeneratorT> generator(-0.1f, 0.2f);
-    for (FloatT& value : literal->data<FloatT>()) {
-      value = static_cast<FloatT>(generator(*engine));
-    }
+    PopulateWithRandomFloatingPointData<FloatT, FloatT>(literal, engine);
   }
 }
 
-template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal,
+template <>
+void PopulateWithFloatingPointData<half>(Literal* literal,
                                          std::minstd_rand0* engine,
                                          bool no_duplicates) {
   CHECK(engine != nullptr);
-  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine,
-                                                          no_duplicates);
-}
-
-template <>
-void PopulateWithRandomFloatingPointData<half>(Literal* literal,
-                                               std::minstd_rand0* engine,
-                                               bool no_duplicates) {
-  // no_duplicates is ignored for half types. Unique values can only be
-  // generated for arrays with fewer than ~2**16 elements and no_duplicates is
-  // best-effort anyway.
-  CHECK(engine != nullptr);
-  std::uniform_real_distribution<float> generator(-0.1f, 0.2f);
-  for (half& value : literal->data<half>()) {
-    value = static_cast<half>(generator(*engine));
+  CHECK_EQ(literal->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<half>());
+  if (no_duplicates) {
+    PopulateWithNoDuplicateData<half>(literal, engine);
+  } else {
+    PopulateWithRandomFloatingPointData<half, float>(literal, engine);
   }
 }
 
 template <>
-void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal,
-                                                   std::minstd_rand0* engine,
-                                                   bool no_duplicates) {
-  // no_duplicates is ignored for bfloat types. Unique values can only be
-  // generated for arrays with fewer than ~2**16 elements and no_duplicates is
-  // best-effort anyway.
+void PopulateWithFloatingPointData<bfloat16>(Literal* literal,
+                                             std::minstd_rand0* engine,
+                                             bool no_duplicates) {
   CHECK(engine != nullptr);
-  std::uniform_real_distribution<float> generator(-0.1f, 0.2f);
-  for (bfloat16& value : literal->data<bfloat16>()) {
-    value = static_cast<bfloat16>(generator(*engine));
+  CHECK_EQ(literal->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<bfloat16>());
+  if (no_duplicates) {
+    PopulateWithNoDuplicateData<bfloat16>(literal, engine);
+  } else {
+    PopulateWithRandomFloatingPointData<bfloat16, float>(literal, engine);
   }
 }
 
@@ -135,20 +184,16 @@ StatusOr<Literal> MakeFakeLiteralInternal(const Shape& shape,
   Literal literal(shape);
   switch (shape.element_type()) {
     case BF16:
-      PopulateWithRandomFloatingPointData<bfloat16>(&literal, engine,
-                                                    no_duplicates);
+      PopulateWithFloatingPointData<bfloat16>(&literal, engine, no_duplicates);
       break;
     case F16:
-      PopulateWithRandomFloatingPointData<half>(&literal, engine,
-                                                no_duplicates);
+      PopulateWithFloatingPointData<half>(&literal, engine, no_duplicates);
       break;
     case F32:
-      PopulateWithRandomFloatingPointData<float>(&literal, engine,
-                                                 no_duplicates);
+      PopulateWithFloatingPointData<float>(&literal, engine, no_duplicates);
       break;
     case F64:
-      PopulateWithRandomFloatingPointData<double>(&literal, engine,
-                                                  no_duplicates);
+      PopulateWithFloatingPointData<double>(&literal, engine, no_duplicates);
       break;
     case S8:
       PopulateWithRandomIntegralData<int8>(&literal, engine, no_duplicates);
@@ -272,9 +317,11 @@ std::vector<HloInstruction*> FindConstrainedUses(
         constrained_uses.insert(constrained_uses.end(), converted_uses.begin(),
                                 converted_uses.end());
       } else if (opcode == HloOpcode::kSort &&
-                 instruction->operand_count() == 2 && op_num == 0) {
+                 instruction->operand_count() >= 2 && op_num == 0) {
         // Operand 0 of sort is the array of keys used for key/value
-        // (two-operand) kSort instructions.
+        // (two-operand) kSort instructions. Since sort stability is not
+        // guaranteed, constrain keys of key-value sort not to have duplicates,
+        // since otherwise the value order may legitimately differ.
         constrained_uses.push_back(instruction);
       }
     }
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index bc433eac8fcb02087d8e4eb10f638c85dc141b22..e8f5d7a9a79ebddea3cb989dbe8eab90b630d5e7 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -15,13 +15,13 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 
+#include "absl/base/casts.h"
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
@@ -148,7 +148,7 @@ ENTRY %sort.148.1589 (parameter.0: f32[1048576], parameter.1: s32[1048576]) -> (
 
   absl::flat_hash_set<uint32> key_set;
   for (const float& value : key_arg.data<float>()) {
-    EXPECT_TRUE(key_set.insert(tensorflow::bit_cast<uint32>(value)).second);
+    EXPECT_TRUE(key_set.insert(absl::bit_cast<uint32>(value)).second);
   }
 }
 
@@ -171,7 +171,30 @@ ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> (
 
   absl::flat_hash_set<int32> key_set;
   for (const int32& value : key_arg.data<int32>()) {
-    EXPECT_TRUE(key_set.insert(tensorflow::bit_cast<uint32>(value)).second);
+    EXPECT_TRUE(key_set.insert(absl::bit_cast<uint32>(value)).second);
+  }
+}
+
+XLA_TEST_F(TestUtilsTest, NoDuplicatesBfloat16) {
+  // Inputs which are sort keys in key/value sorts should have no duplicates.
+  auto module = ParseHloString(R"(
+HloModule sort, is_scheduled=true
+
+ENTRY %sort. (parameter.0: bf16[2,1452], parameter.1: s32[2,1452]) -> (bf16[2,1452], s32[2,1452]) {
+  %parameter.0 = bf16[2,1452]{1,0} parameter(0)
+  %parameter.1 = s32[2,1452]{1,0} parameter(1)
+  ROOT %sort = (bf16[2,1452]{1,0}, s32[2,1452]{1,0}) sort(bf16[2,1452]{1,0} %parameter.0, s32[2,1452]{1,0} %parameter.1), dimensions={1}
+}
+)")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+  const Literal& key_arg = args[0];
+
+  absl::flat_hash_set<uint16> key_set;
+  for (const bfloat16& value : key_arg.data<bfloat16>()) {
+    EXPECT_TRUE(key_set.insert(absl::bit_cast<uint16>(value)).second);
   }
 }
 
diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc
index b34fd0f2e873214c509533f29553af914ddc984d..601c6b06938fef1f1ae809b33209ae59b24c70a2 100644
--- a/tensorflow/compiler/xla/tests/token_hlo_test.cc
+++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <array>
 
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -28,7 +29,7 @@ namespace {
 class TokenHloTest : public HloTestBase {};
 
 XLA_TEST_F(TokenHloTest, SingleTokenInstruction) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
   builder.AddInstruction(HloInstruction::CreateToken());
 
@@ -38,8 +39,22 @@ XLA_TEST_F(TokenHloTest, SingleTokenInstruction) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, LiteralUtil::CreateToken()));
 }
 
+XLA_TEST_F(TokenHloTest, TokenInTuple) {
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto token = builder.AddInstruction(HloInstruction::CreateToken());
+  builder.AddInstruction(HloInstruction::CreateTuple({token}));
+
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(Literal result, Execute(std::move(module), {}));
+  Literal token_literal = LiteralUtil::CreateToken();
+  EXPECT_TRUE(
+      LiteralTestUtil::Equal(result, LiteralUtil::MakeTuple({&token_literal})));
+}
+
 XLA_TEST_F(TokenHloTest, TokenTree) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto token0 = builder.AddInstruction(HloInstruction::CreateToken());
   auto token1 = builder.AddInstruction(HloInstruction::CreateToken());
@@ -54,7 +69,7 @@ XLA_TEST_F(TokenHloTest, TokenTree) {
 }
 
 XLA_TEST_F(TokenHloTest, InvalidTokenShapedEntryParameter) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
   builder.AddInstruction(
       HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
@@ -75,7 +90,7 @@ XLA_TEST_F(TokenHloTest, InvalidTokenShapedEntryParameter) {
 }
 
 XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
+  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
   builder.AddInstruction(HloInstruction::CreateParameter(
       0,
@@ -94,26 +109,6 @@ XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) {
       ::testing::HasSubstr("Entry parameter 0 is or contains a token shape"));
 }
 
-XLA_TEST_F(TokenHloTest, InvalidOperandToTokenInstruction) {
-  std::unique_ptr<HloModule> module = CreateNewModule();
-  auto builder = HloComputation::Builder(TestName());
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
-  builder.AddInstruction(HloInstruction::CreateAfterAll({param}));
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(123)));
-  module->AddEntryComputation(builder.Build());
-
-  Status status =
-      HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false)
-          .Run(module.get())
-          .status();
-  ASSERT_IS_NOT_OK(status);
-  EXPECT_THAT(status.error_message(),
-              ::testing::HasSubstr(
-                  "Operands of token instructions must be TOKEN types"));
-}
-
 XLA_TEST_F(TokenHloTest, TokenInWhileLoop) {
   // Thread a token around a while loop. Token is created and consumed by a
   // AfterAll instruction in the while body.
@@ -206,5 +201,95 @@ ENTRY %TokenInConditional (param.3: pred[]) -> s32[] {
   }
 }
 
+XLA_TEST_F(TokenHloTest, AddDependency) {
+  string module_string = R"(
+HloModule AddDependency, is_scheduled=true
+
+// Computes (p0 + 42) * (-p1)
+// where there is a dependency from the add to the negation using a token
+// with after-all and add-dependency instructions.
+ENTRY %AddDependency (p0: f32[], p1: f32[]) -> f32[] {
+  %p0 = f32[] parameter(0)
+  %p1 = f32[] parameter(1)
+
+  %forty_two = f32[] constant(42.0)
+  %add = f32[] add(f32[] %p0, f32[] %forty_two)
+  %token = token[] after-all(f32[] %add)
+  %p1_after_token = f32[] add-dependency(f32[] %p1, token[] %token)
+  %neg = f32[] negate(f32[] %p1_after_token)
+  ROOT %product = f32[] multiply(f32[] %add, f32[] %neg)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+  auto p0 = LiteralUtil::CreateR0<float>(10.0);
+  auto p1 = LiteralUtil::CreateR0<float>(3.0);
+  auto expected = LiteralUtil::CreateR0<float>(-156.0);
+  EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0, &p1}));
+}
+
+XLA_TEST_F(TokenHloTest, AddDependencyOfConstant) {
+  string module_string = R"(
+HloModule AddDependencyOfConstant, is_scheduled=true
+
+ENTRY %AddDependency (p0: f32[]) -> f32[] {
+  %p0 = f32[] parameter(0)
+  %forty_two = f32[] constant(42.0)
+  %token = token[] after-all(f32[] %p0)
+  %forty_two_after_token = f32[] add-dependency(f32[] %forty_two, token[] %token)
+  ROOT %product = f32[] multiply(f32[] %p0, f32[] %forty_two_after_token)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+  auto p0 = LiteralUtil::CreateR0<float>(10.0);
+  auto expected = LiteralUtil::CreateR0<float>(420.0);
+  EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0}));
+}
+
+XLA_TEST_F(TokenHloTest, AddDependencyAsRoot) {
+  string module_string = R"(
+HloModule AddDependencyAsRoot, is_scheduled=true
+ENTRY %AddDependency (p: f32[3]) -> f32[3] {
+  %p = f32[3] parameter(0)
+  %neg = f32[3] negate(f32[3] %p)
+  %token = token[] after-all()
+  ROOT %add_dep = f32[3] add-dependency(f32[3] %neg, token[] %token)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+  auto input = LiteralUtil::CreateR1<float>({1.0, 3.0, 7.0});
+  auto expected = LiteralUtil::CreateR1<float>({-1.0, -3.0, -7.0});
+  EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&input}));
+}
+
+XLA_TEST_F(TokenHloTest, TupleShapedAddDependency) {
+  string module_string = R"(
+HloModule TupleShapedAddDependency, is_scheduled=true
+ENTRY %TupleShapedAddDependency (p0: f32[3], p1: f32[3]) -> f32[3] {
+  %p0 = f32[3] parameter(0)
+  %p1 = f32[3] parameter(1)
+  %forty_two = f32[] constant(42.0)
+  %token = token[] after-all()
+  %tuple = (f32[3], token[], f32[3], f32[]) tuple(f32[3] %p0, token[] %token, f32[3] %p1, f32[] %forty_two)
+  %add_dep = (f32[3], token[], f32[3], f32[]) add-dependency((f32[3], token[], f32[3], f32[]) %tuple, token[] %token)
+  %elem0 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=0
+  %elem2 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=2
+  ROOT %diff = f32[3] subtract(f32[3] %elem0, f32[3] %elem2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+  auto p0 = LiteralUtil::CreateR1<float>({3.0, 3.0, 47.0});
+  auto p1 = LiteralUtil::CreateR1<float>({1.0, -2.0, 2.0});
+  auto expected = LiteralUtil::CreateR1<float>({2.0, 5.0, 45.0});
+  EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0, &p1}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 619d2a388b5646c31f0a61f709a2ab3067e39c03..27ce243e9bd4afbdcc1fdc5b6873d4968086e459 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -515,7 +515,7 @@ class TupleHloTest : public HloTestBase {};
 // Disabled on the interpreter because bitcast doesn't exist on the interpreter.
 XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
   const char* testcase = R"(
-    HloModule m
+    HloModule m, is_scheduled=true
 
     ENTRY test {
       name.1 = (f32[3]{0}) parameter(0)
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 8b1b9e151992296b9d022ae1d9d974eadd2074a8..6d5f276e82087cedc356691b0ff08df24cec8d20 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -48,7 +48,7 @@ class WhileTest : public ClientLibraryTestBase {};
 // while (result < 5) {
 //   result = result + 1;
 // }
-TEST_F(WhileTest, WhileWithScalarS32Result) {
+XLA_TEST_F(WhileTest, WhileWithScalarS32Result) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
@@ -84,7 +84,7 @@ TEST_F(WhileTest, WhileWithScalarS32Result) {
 // while (result < 5) {
 //   result = result + 1;
 // }
-TEST_F(WhileTest, WhileWithScalarS64Result) {
+XLA_TEST_F(WhileTest, WhileWithScalarS64Result) {
   auto result_shape = ShapeUtil::MakeShape(S64, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
@@ -114,7 +114,7 @@ TEST_F(WhileTest, WhileWithScalarS64Result) {
   ComputeAndCompareR0<int64>(&builder, 5, {});
 }
 
-TEST_F(WhileTest, WhileWithScalarResultNonConstInit) {
+XLA_TEST_F(WhileTest, WhileWithScalarResultNonConstInit) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
   auto orig_shape = ShapeUtil::MakeShape(S32, {2});
 
@@ -147,7 +147,7 @@ TEST_F(WhileTest, WhileWithScalarResultNonConstInit) {
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
 
-TEST_F(WhileTest, WhileWithPredicateResult) {
+XLA_TEST_F(WhileTest, WhileWithPredicateResult) {
   auto result_shape = ShapeUtil::MakeShape(PRED, {});
 
   // Create a computation for the condition: run until condition is true.
@@ -184,7 +184,7 @@ TEST_F(WhileTest, WhileWithPredicateResult) {
 // while (result.sum() < 15.5f) {
 //   result = result + vector<float>(0);
 // }
-TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithEmptyVectorResult)) {
+XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithEmptyVectorResult)) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {0});
 
   // Create a computation for the reduction.
@@ -238,7 +238,7 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithEmptyVectorResult)) {
 // while (result.sum() < 15.5f) {
 //   result = result + vector<float>(8, 0.125f);
 // }
-TEST_F(WhileTest, WhileWithVectorResult) {
+XLA_TEST_F(WhileTest, WhileWithVectorResult) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
@@ -298,7 +298,7 @@ TEST_F(WhileTest, WhileWithVectorResult) {
 //   result = result + vector<float>(8, 0.125f);
 // }
 // tuple = tuple { while }
-TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
+XLA_TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
@@ -353,7 +353,7 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
   ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
-TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
+XLA_TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
   std::vector<Shape> shape_elements = {
       ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {3}),
       ShapeUtil::MakeShape(F32, {3}), ShapeUtil::MakeShape(F32, {3})};
@@ -407,7 +407,7 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
   ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
-TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
+XLA_TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   std::vector<Shape> shape_elements = {
       ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {3}),
       ShapeUtil::MakeShape(F32, {3}), ShapeUtil::MakeShape(F32, {3})};
@@ -465,7 +465,7 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
 //   get<0>(result) = get<0>(result) + 1;
 //   get<1>(result) = get<1>(result) + vector<float>(10, 1.0f);
 // }
-TEST_F(WhileTest, WhileWithTupleResult) {
+XLA_TEST_F(WhileTest, WhileWithTupleResult) {
   std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
                                        ShapeUtil::MakeShape(F32, {10})};
   Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
@@ -514,7 +514,7 @@ TEST_F(WhileTest, WhileWithTupleResult) {
   ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
-TEST_F(WhileTest, WhileWithPredicateTupleResult) {
+XLA_TEST_F(WhileTest, WhileWithPredicateTupleResult) {
   std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
                                        ShapeUtil::MakeShape(PRED, {})};
   Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
@@ -560,7 +560,7 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
   ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0));
 }
 
-TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
+XLA_TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
   std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
                                        ShapeUtil::MakeShape(S32, {})};
   Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
@@ -619,7 +619,7 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
 //        get<1>(w1) = get<1>(w1) + vector<float>(10, 1.0f);
 //      }
 // result = get<1>(w0) + get<1>(w1)
-TEST_F(WhileTest, TwoWhileWithTupleResult) {
+XLA_TEST_F(WhileTest, TwoWhileWithTupleResult) {
   std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
                                        ShapeUtil::MakeShape(F32, {10})};
   Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
@@ -698,7 +698,7 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
 }
 
 // Test while nodes that share the while body computation.
-TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
+XLA_TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
                                        ShapeUtil::MakeShape(F32, {10})};
   Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
@@ -763,7 +763,7 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
-TEST_F(WhileTest, WhileLoopsWithSharedBodyAndInit) {
+XLA_TEST_F(WhileTest, WhileLoopsWithSharedBodyAndInit) {
   std::vector<Shape> shape_elements = {ShapeUtil::MakeShape(S32, {}),
                                        ShapeUtil::MakeShape(F32, {10})};
   Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
@@ -901,7 +901,7 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
 // Per backend the values generated can be different as the different backends
 // use different random number generators.
 // TODO(b/32240857): Extend test to verify outputs.
-TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
+XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
   auto v6s32 = ShapeUtil::MakeShape(S32, {6});
 
   // Create a computation for the condition: repeat for count iterations.
@@ -947,7 +947,7 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
   }
 }
 
-TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) {
+XLA_TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
   XlaBuilder outer("outer");
@@ -979,7 +979,7 @@ TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) {
                          ErrorSpec(1e-6));
 }
 
-TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) {
+XLA_TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
   XlaBuilder outer("outer");
@@ -1004,7 +1004,7 @@ TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) {
                              ErrorSpec(1e-6));
 }
 
-TEST_F(WhileTest, WhileThatTurnsScalarParameterToTupleElement) {
+XLA_TEST_F(WhileTest, WhileThatTurnsScalarParameterToTupleElement) {
   auto element_shape = ShapeUtil::MakeShape(F32, {});
 
   XlaBuilder outer("outer");
@@ -1038,7 +1038,7 @@ TEST_F(WhileTest, WhileThatTurnsScalarParameterToTupleElement) {
 //   result[0] = result[0] + 1;
 //   result[1] = result[1] + 1;
 // }
-TEST_F(WhileTest, WhileWithMixedTupleElements) {
+XLA_TEST_F(WhileTest, WhileWithMixedTupleElements) {
   auto result_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})});
 
@@ -1146,7 +1146,7 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 // while (f(result).get<0>()) {
 //   result = result + 1;
 // }
-TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
+XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
@@ -1186,7 +1186,7 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
 
-TEST_F(WhileTest, WhileWithLoopInvariantOperation) {
+XLA_TEST_F(WhileTest, WhileWithLoopInvariantOperation) {
   auto matrix_shape = ShapeUtil::MakeShape(F32, {2, 2});
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   auto while_shape = ShapeUtil::MakeTupleShape(
@@ -1230,7 +1230,7 @@ TEST_F(WhileTest, WhileWithLoopInvariantOperation) {
       {param_value.get()}, ErrorSpec(4e-5));
 }
 
-TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileInfeedCondition)) {
+XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileInfeedCondition)) {
   auto while_shape = ShapeUtil::MakeShape(S32, {});
 
   XlaComputation condition;
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index a6e70eb6ca25ffac24a8ebaf0420238e109e4fad..e57d072a0632b492b8b6e34439f4e80332b843b6 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -91,16 +91,16 @@ Status ParseOneProfileOutputLine(
   string match_usecs = "([0-9.]+) usec";
   string match_flops = "([^ ]*)";
   string match_trops = "([^ ]*)";
-  string match_bytes_per_sec = "([0-9.TGMKi]+)B/s";
-  string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle";
+  string match_bytes_per_sec = "([0-9.TGMKi]*)(?:B/s)?";
+  string match_bytes_per_cycle = "([0-9.TGMKi]*)(?:B/cycle)?";
 
   // The underlined part is what we're trying to match with match_opcode:
   //
   //   %dot33 = f32[256,256]{1,0} dot(...)
   //                              ^^^
 
-  string match_opcode =
-      expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*" : "(\\[total\\])";
+  string match_opcode = expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*"
+                                   : "(\\[total\\])( \\[entry\\])?";
   string regexp_pattern = absl::StrCat(
       " +", match_cycles, separator, match_usecs, separator, match_flops,
       separator, match_trops, separator, match_bytes_per_sec, separator,
@@ -125,6 +125,10 @@ Status ParseOneProfileOutputLine(
   return Status::OK();
 }
 
+bool IsExtraMetricProfileOutputLine(const string& line) {
+  return RE2::FullMatch(line, "Extra metric \\S+: \\d+");
+}
+
 // Returns void so that we can ASSERT.
 void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
                             const XlaComputation& computation,
@@ -153,10 +157,12 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
       stream_ptr.get(), Literal::CreateFromShape(rhs_arg_shape), rhs_arg));
 
+  ExecutableBuildOptions build_options;
+  build_options.mutable_debug_options()->set_xla_hlo_profile(true);
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LocalExecutable> local_executable,
       client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape},
-                      ExecutableBuildOptions().set_hlo_profile(true)));
+                      build_options));
 
   Executable* executable = local_executable->executable();
   HloExecutionProfile hlo_execution_profile(
@@ -204,20 +210,32 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) {
   string profile_output;
   ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape,
                          rhs_shape);
-
+  VLOG(4) << "Profile Output:\n" << profile_output;
   std::vector<string> profile_output_lines =
       absl::StrSplit(profile_output, '\n');
 
   absl::flat_hash_map<string, ParsedProfileOutputLine> parsed_profile_lines;
 
-  TF_ASSERT_OK(ParseOneProfileOutputLine(
-      profile_output_lines[1], /*expect_hlo=*/false, &parsed_profile_lines));
+  int line_no = 0;
+
+  // Skip extra metrics.
+  while (IsExtraMetricProfileOutputLine(profile_output_lines[line_no])) {
+    line_no++;
+  }
+
+  line_no++;  // Skip 'Execution profile for ....'
 
-  TF_ASSERT_OK(ParseOneProfileOutputLine(
-      profile_output_lines[2], /*expect_hlo=*/true, &parsed_profile_lines));
+  TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
+                                         /*expect_hlo=*/false,
+                                         &parsed_profile_lines));
 
-  TF_ASSERT_OK(ParseOneProfileOutputLine(
-      profile_output_lines[3], /*expect_hlo=*/true, &parsed_profile_lines));
+  TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
+                                         /*expect_hlo=*/true,
+                                         &parsed_profile_lines));
+
+  TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
+                                         /*expect_hlo=*/true,
+                                         &parsed_profile_lines));
 
   TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_profile,
                           MaybeFind(parsed_profile_lines, "[total]"));
@@ -291,6 +309,7 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) {
   string profile_output;
   ExecuteAndFetchProfile(&profile_output, client, computation, matrix_shape,
                          matrix_shape);
+  SCOPED_TRACE(profile_output);
 
   std::vector<string> profile_output_lines =
       absl::StrSplit(profile_output, '\n');
@@ -302,14 +321,13 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) {
 
   ASSERT_NE(while_body_profile_start, profile_output_lines.cend());
 
-  auto while_body_profile_end = std::find_if(
-      while_body_profile_start, profile_output_lines.end(),
-      [](absl::string_view s) {
-        return absl::StartsWith(s, "********** microseconds report **********");
-      });
+  auto while_body_profile_end =
+      std::find_if(while_body_profile_start, profile_output_lines.end(),
+                   [](absl::string_view s) {
+                     return absl::StartsWith(s, "********** microseconds ");
+                   });
 
-  // We emit a blank line before the "********** microseconds report **********"
-  // line.
+  // We emit a blank line before the "microseconds report" line.
   while_body_profile_end--;
 
   ASSERT_NE(while_body_profile_end, profile_output_lines.end());
@@ -364,7 +382,7 @@ static std::pair<int, char**> AddXlaHloProfileFlag(int argc, char** argv) {
 
 GTEST_API_ int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
   std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv);
 
   auto usage = tensorflow::Flags::Usage(argv[0], flag_list);
diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
index 15603619b62d8f45cdce97ac7d83924a78f88cf3..dca0aa52a533130372759156a3238f1a3b10ca42 100644
--- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
+++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
@@ -15,14 +15,14 @@ limitations under the License.
 
 #include "absl/strings/match.h"
 #include "absl/strings/string_view.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 GTEST_API_ int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
   auto usage = tensorflow::Flags::Usage(argv[0], flag_list);
   if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) {
     LOG(ERROR) << "\n" << usage;
@@ -49,7 +49,7 @@ GTEST_API_ int main(int argc, char** argv) {
       // different API than Tensorflow's.
       testing::InitGoogleTest(&argc, argv);
 #if defined(PLATFORM_GOOGLE)
-      base::SetFlag(&FLAGS_benchmarks, pattern);
+      absl::SetFlag(&FLAGS_benchmarks, pattern);
       RunSpecifiedBenchmarks();
 #else
       tensorflow::testing::Benchmark::Run(pattern);
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 3a086c66bbb37965b1ad7c83a93f0054ae723e87..8926bbed2b54fceaaf0e6e991f0e881d35731ef4 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -33,6 +33,7 @@ cc_library(
     name = "dumped_computation_to_graphviz_library",
     srcs = ["dumped_computation_to_graphviz.cc"],
     deps = [
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -40,7 +41,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/core:lib",
@@ -78,6 +78,7 @@ cc_library(
     name = "replay_computation_library",
     srcs = ["replay_computation.cc"],
     deps = [
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -91,7 +92,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:testing",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service/gpu:infeed_manager",
@@ -207,13 +207,13 @@ tf_cc_binary(
     name = "dumped_computation_to_tf_graphdef",
     srcs = ["dumped_computation_to_tf_graphdef.cc"],
     deps = [
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:hlo_proto",
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
index c866a13de7543fc948311f94708bc6b904717b62..b623556468fb4a5d96be614b6c067d5a1df51a6f 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
@@ -33,7 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -54,7 +54,7 @@ void RealMain(absl::Span<char* const> args) {
         tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
     XlaComputation computation =
         client->LoadSnapshot(module).ConsumeValueOrDie();
-    DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
+    DebugOptions debug_options = GetDebugOptionsFromFlags();
     debug_options.set_xla_generate_hlo_graph(".*");
     ComputationStats stats =
         client->GetComputationStats(computation, debug_options)
@@ -68,7 +68,7 @@ void RealMain(absl::Span<char* const> args) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
index 07ef5ff656bb48519a700a1d7d6c60b655a40ed6..f8bb9a6b1e217fc4e6e15c8a3302be61ed339c82 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -53,7 +53,7 @@ void RealMain(absl::Span<char* const> args) {
         tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
     XlaComputation computation =
         client->LoadSnapshot(module).ConsumeValueOrDie();
-    DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags();
+    DebugOptions debug_options = GetDebugOptionsFromFlags();
     debug_options.set_xla_generate_hlo_graph(".*");
     debug_options.set_xla_hlo_dump_as_graphdef(true);
     ComputationStats stats =
@@ -68,7 +68,7 @@ void RealMain(absl::Span<char* const> args) {
 
 int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
-  xla::legacy_flags::AppendDebugOptionsFlags(&flag_list);
+  xla::AppendDebugOptionsFlags(&flag_list);
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
   if (!parse_result) {
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 0c41f227b31ebe1f01073785ea2a666093aefdb3..ff2c3399928c0e6339304323c4f93e212933a340 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -47,8 +47,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/testing.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
-#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
@@ -82,12 +82,17 @@ struct Options {
 std::unique_ptr<LocalExecutable> CompileExecutable(const HloSnapshot& module,
                                                    LocalClient* client) {
   XlaComputation computation(module.hlo().hlo_module());
-  std::vector<const Shape*> argument_layouts;
-  for (const auto& param : computation.proto().program_shape().parameters()) {
-    argument_layouts.push_back(&param);
+  std::vector<Shape> argument_layouts;
+  argument_layouts.reserve(
+      computation.proto().host_program_shape().parameters_size());
+  std::vector<const Shape*> argument_layout_ptrs;
+  for (const ShapeProto& param :
+       computation.proto().host_program_shape().parameters()) {
+    argument_layouts.push_back(Shape(param));
+    argument_layout_ptrs.push_back(&argument_layouts.back());
   }
   return client
-      ->Compile(computation, argument_layouts, ExecutableBuildOptions())
+      ->Compile(computation, argument_layout_ptrs, ExecutableBuildOptions())
       .ValueOrDie();
 }
 
@@ -148,7 +153,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
               << "--generate_fake_infeed only works if the model has 0 or 1 "
                  "infeed ops, but this one has >= 2.";
           provide_infeed = true;
-          infeed_shape = instruction.shape();
+          infeed_shape = Shape(instruction.shape());
           LOG(INFO) << "Generating fake infeed shape for inferred shape: "
                     << ShapeUtil::HumanString(infeed_shape);
         }
@@ -190,16 +195,16 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
 
   // Run the computation num_runs times, and return the result from the last
   // execution.
-  const bool xla_hlo_profile =
-      legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile();
+  const bool xla_hlo_profile = GetDebugOptionsFromFlags().xla_hlo_profile();
   StreamExecutorMemoryAllocator allocator(
       client->platform(),
       {client->platform()->ExecutorForDevice(0).ValueOrDie()});
-  absl::optional<ScopedShapedBuffer> result;
+  absl::optional<ScopedShapedBuffer> final_result;
   for (int i = 0; i < opts.num_runs; ++i) {
     // If xla_hlo_profile is enabled, print a noisy message before the last run,
     // making it easier to separate this profile from the others in the logspam.
-    if (xla_hlo_profile && i == opts.num_runs - 1) {
+    bool is_final_result = i == opts.num_runs - 1;
+    if (xla_hlo_profile && is_final_result) {
       LOG(INFO) << "\n\n***** Final run below ******";
     }
     ExecutionProfile profile;
@@ -207,14 +212,22 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
     run_options.set_execution_profile(&profile);
     run_options.set_allocator(&allocator);
 
-    TF_ASSIGN_OR_RETURN(result, executable->Run(argument_ptrs, run_options));
+    TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                        executable->Run(argument_ptrs, run_options));
     LOG(INFO) << "Done executing in "
               << static_cast<double>(profile.compute_time_ns()) / 1e9
               << "s: " << module.hlo().hlo_module().name();
+
+    // Save the result if this is for the final iteration.  Otherwise discard
+    // the result before rerunning the computation, so as to free up the
+    // relevant memory.
+    if (is_final_result) {
+      final_result = std::move(result);
+    }
   }
 
   TF_ASSIGN_OR_RETURN(Literal result_literal,
-                      client->ShapedBufferToLiteral(*result));
+                      client->ShapedBufferToLiteral(*final_result));
   return result_literal;
 }
 
@@ -306,9 +319,10 @@ int RealMain(absl::Span<char* const> args, const Options& opts) {
       if (snapshot.has_result()) {
         Literal literal =
             Literal::CreateFromProto(snapshot.result()).ConsumeValueOrDie();
-        fprintf(stdout, "was %s:%s\n",
-                ShapeUtil::HumanString(snapshot.result().shape()).c_str(),
-                literal.ToString().c_str());
+        fprintf(
+            stdout, "was %s:%s\n",
+            ShapeUtil::HumanString(Shape(snapshot.result().shape())).c_str(),
+            literal.ToString().c_str());
       }
     }
   }
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 8ce741647414a1fa75e6d706ec1e719ace7b7cc8..6722641e9d2c177440361e6f0d1f6c0804eb7cda 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -152,6 +152,13 @@ static inline absl::Span<const int64> AsInt64Slice(
                                  slice.size());
 }
 
+// TODO(b/29771030): This nop overload was added to simplify the migration of
+// Shape from a proto to a C++ class. Remove after class has been migrated.
+static inline absl::Span<const int64> AsInt64Slice(
+    absl::Span<const int64> slice) {
+  return slice;
+}
+
 // As above, but for uint64 types.
 static inline absl::Span<const uint64> AsUInt64Slice(
     const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>& v) {
@@ -387,6 +394,19 @@ T CeilOfRatio(T dividend, T divisor) {
   return tensorflow::MathUtil::CeilOfRatio<T>(dividend, divisor);
 }
 
+template <typename T>
+std::vector<T> ElementWiseCeilOfRatio(absl::Span<const T> dividends,
+                                      absl::Span<const T> divisors) {
+  std::vector<T> ceil_of_ratios;
+  CHECK_EQ(dividends.size(), divisors.size());
+  ceil_of_ratios.reserve(dividends.size());
+  absl::c_transform(dividends, divisors, std::back_inserter(ceil_of_ratios),
+                    [](const T dividend, const T divisor) {
+                      return CeilOfRatio<T>(dividend, divisor);
+                    });
+  return ceil_of_ratios;
+}
+
 // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio
 // then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16
 template <typename T>
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 8ea8dbab2574ca1e24271e7c1c7762d4a6b6a8de..51c73b3d17e4c32d9a8a14d3055ab56f02922af3 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -185,6 +185,17 @@ bool HasWindowReversal(const Window& window) {
   return false;
 }
 
+bool AllOrNoneReversed(const Window& window) {
+  if (window.dimensions().empty()) {
+    return true;
+  }
+  bool reversed = window.dimensions()[0].window_reversal();
+  return std::all_of(window.dimensions().begin(), window.dimensions().end(),
+                     [&](const WindowDimension& dim) {
+                       return dim.window_reversal() == reversed;
+                     });
+}
+
 bool HasDilation(const Window& window) {
   return HasBaseDilation(window) || HasWindowDilation(window);
 }
diff --git a/tensorflow/compiler/xla/window_util.h b/tensorflow/compiler/xla/window_util.h
index 1fb9e855fc16f334eb0e83dfd27b307b2149628f..099d7ecdd5c732ffc8c6ff6370288a2fc4144fa2 100644
--- a/tensorflow/compiler/xla/window_util.h
+++ b/tensorflow/compiler/xla/window_util.h
@@ -56,6 +56,7 @@ bool HasWindowDilation(const Window& window);
 bool HasDilation(const Window& window);
 
 bool HasWindowReversal(const Window& window);
+bool AllOrNoneReversed(const Window& window);
 
 // Returns true if the given logical dimension is inactive in the sense that it
 // has window bound 1, no striding and no padding.
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 60d25a6407476cddba77aadd1df2e3939f5e40ac..a37eac7fe441d91aa71e1b6fd7b84099fee2215b 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -193,7 +193,11 @@ message DebugOptions {
   //  - Assuming that operations never produce or consume NaN or +/- Inf.
   //  - Assuming that +0 and -0 are indistinguishable.
   bool xla_cpu_enable_fast_math = 99;
-  bool xla_gpu_enable_fast_math = 100;
+
+  // When true we lower the Minimum and Maximum hlos in the GPU backend such
+  // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN.  In other words, if flag
+  // this is true we don't propagate NaNs through Min and Max.
+  bool xla_gpu_enable_fast_min_max = 100;
 
   // Crashes the program when any kind of verification fails, instead of just
   // logging the failures. One example is cross checking of convolution results
@@ -209,6 +213,9 @@ message DebugOptions {
   // the host that run models in parallel across multiple devices.
   int32 xla_force_host_platform_device_count = 102;
 
+  // If set to true XLA:GPU invokes `ptxas` with -O0 (default is -O3).
+  bool xla_gpu_disable_ptxas_optimizations = 103;
+
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
@@ -224,7 +231,7 @@ message ExecutionOptions {
   // may be faster when using this layout.
   //
   // We use a Shape here to accommodate computations that return a tuple.
-  Shape shape_with_output_layout = 2;
+  ShapeProto shape_with_output_layout = 2;
 
   // Used to seed random-number generators used in this computation.  If this is
   // 0, we generate a seed ourselves.
@@ -253,7 +260,7 @@ message TransferToClientRequest {
 
   // This optional field directs the service to return the literal in this
   // layout. A shape is used to hold the layout to accommodate tuples.
-  Shape shape_with_layout = 2;
+  ShapeProto shape_with_layout = 2;
 }
 
 message TransferToClientResponse {
@@ -281,7 +288,7 @@ message TransferToInfeedResponse {
 message TransferFromOutfeedRequest {
   // This optional field directs the service to return the literal in this
   // layout. A shape is used to hold the layout to accommodate tuples.
-  Shape shape_with_layout = 1;
+  ShapeProto shape_with_layout = 1;
 
   int64 replica_id = 2;
   DeviceHandle device_handle = 3;
@@ -316,12 +323,40 @@ message CreateChannelHandleResponse {
 }
 
 message UnregisterRequest {
-  GlobalDataHandle data = 1;
+  repeated GlobalDataHandle data = 1;
 }
 
 message UnregisterResponse {
 }
 
+message CompileRequest {
+  // The graph to be compiled.
+  HloModuleProto computation = 1;
+
+  // Options that affect how XLA compiles code to service this request.
+  ExecutionOptions execution_options = 2;
+
+  // The layouts of the input arguments. If not set, the default layout will be
+  // used. Although the real arguments are not needed in compilation, the
+  // layouts of the arguments can affect the compilation.
+  repeated ShapeProto input_shape_with_layout = 3;
+}
+
+message CompileResponse {
+  // The handle to the executable.
+  ExecutionHandle handle = 1;
+}
+
+message ExecuteRequest {
+  ExecutionHandle handle = 1;
+
+  // The shape and layout of the arguments must be the same as the those of the
+  // executable's parameters.
+  repeated GlobalDataHandle arguments = 2;
+}
+
+// TODO(b/118493728): Remove this and ExecuteGraphParallelRequest and replace
+// the uses with calls to Compile and Execute.
 message ExecuteGraphRequest {
   HloModuleProto computation = 1;
   repeated GlobalDataHandle arguments = 2;
@@ -378,7 +413,7 @@ message LoadDataRequest {
   string columnio_field = 2;
 
   // Individual element shape, excluding rows.
-  Shape element_shape = 3;
+  ShapeProto element_shape = 3;
 
   // Warning: ColumnIO does not support random-access, so use offset with
   // caution in performance-critical scenarios.
@@ -394,7 +429,7 @@ message LoadDataRequest {
 
 message LoadDataResponse {
   GlobalDataHandle data = 1;
-  Shape data_shape = 2;
+  ShapeProto data_shape = 2;
   int64 available_rows = 3;
   int64 rows_loaded = 4;
   int64 nanoseconds = 5;
@@ -405,7 +440,7 @@ message GetShapeRequest {
 }
 
 message GetShapeResponse {
-  Shape shape = 1;
+  ShapeProto shape = 1;
 }
 
 message UnpackRequest {
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 73b3589dbf12341ddb3f3e819a550467a7b4d166..85ec83437a10d973687a7fb84285c2e2541a53c7 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -78,28 +78,6 @@ enum PrimitiveType {
   // Next = 18
 }
 
-// Describes the value held inside padding elements.
-enum PaddingValue {
-  INVALID_PAD = 0;
-
-  // Zero padding must be 0-values that correspond to the shape's element type.
-  ZERO_PAD = 1;
-
-  // One padding must be 1-values that correspond to the shape's element type.
-  ONE_PAD = 2;
-
-  // "Lowest" padding must be the lowest values in the shape's element type,
-  // used as padding for operations like max-accumulation.
-  LOWEST_PAD = 3;
-
-  // "Highest" padding must be the largest values in the shape's element type,
-  // used as padding for operations like min-accumulation.
-  HIGHEST_PAD = 4;
-
-  // Unknown padding could be anything; e.g. floating NaNs!
-  UNKNOWN_PAD = 5;
-}
-
 // Describes the padding configuration for Pad operation. The padding amount on
 // both edges as well as between the elements are specified for each dimension.
 message PaddingConfig {
@@ -123,17 +101,25 @@ message PaddingConfig {
 // A format specifies the method used by a layout to store an array in memory.
 enum Format {
   INVALID_FORMAT = 0;
-  // The default layout, with exactly one storage location per element (ignoring
-  // padding).
+  // The default layout, with exactly one storage location per element.
   DENSE = 1;
   // A sparsely encoded layout, providing only the index/value pairs of non-zero
   // elements.
   SPARSE = 2;
 }
 
+// Describes a tile used in tiling-based layout. Refer to
+// g3doc/layout_with_tiling.md for details about tiling-based layout.
+message Tile {
+  // Number of elements in each dimension of the tile. It's ordered from the
+  // most major dimension of the tile to the most minor dimension of the tile.
+  // The dimensions correspond to a suffix of the dimensions of the shape being
+  // tiled.
+  repeated int64 dimensions = 1;
+}
+
 // A layout describes how the array is placed in (1D) memory space.  This
-// includes the minor-to-major ordering of dimensions within a shape, as well as
-// any padding present in those dimensions.
+// includes the minor-to-major ordering of dimensions within a shape.
 //
 // Clients must specify the layouts of input Literals to the
 // computation. Layouts specified in interior operations which take Shapes (for
@@ -151,22 +137,31 @@ message Layout {
   // (slowest varying index). This field is required.
   repeated int64 minor_to_major = 1;
 
-  // The width to which the layout of each dimension is padded up to. If
-  // present, the size of the padded_dimensions must equal the rank of the
-  // shape. The padding appears at the end of a dimension, not at the
-  // beginning. This kind of padding, unlike padding in e.g. convolution, is not
-  // part of the shape. This field must be unset unless the format is DENSE.
-  repeated int64 padded_dimensions = 2;
+  reserved 2;
+  reserved "padded_dimensions";
 
-  // Describes the values in the padding specified by padded_dimensions. This
-  // field must be unset unless the format is DENSE.
-  PaddingValue padding_value = 3;
+  reserved 3;
+  reserved "padding_value";
 
   // The maximum number of elements that can be stored for SPARSE formats.  This
   // can be used to determine the maximum size in bytes of arrays stored in
   // memory.  This field must be unset unless the format is SPARSE.
   int64 max_sparse_elements = 5;
 
+  // A sequence of tiles, starting from the tile that's applied first to the
+  // Shape.
+  //
+  // TODO(b/119839262): implement tiling in each backend or add Unimplemented
+  // error.
+  repeated Tile tiles = 6;
+
+  // Bit size of each element. If the size is bigger than what the element
+  // type requires, the value is stored in the least significant
+  // bits and the additional most significant bits are filled with 0's.
+  //
+  // TODO(b/119839262): implement in each backend or add Unimplemented error.
+  int64 element_size_in_bits = 7;
+
   // Important: if any field is added, be sure to modify ShapeUtil::Equal() and
   // LayoutUtil::Hash appropriately to account for the new field.
 }
@@ -183,7 +178,7 @@ message Layout {
 // See the XLA documentation for more information on shapes and layouts.
 //
 // LINT.IfChange
-message Shape {
+message ShapeProto {
   reserved 1;
   reserved "rank";
 
@@ -198,7 +193,7 @@ message Shape {
   repeated int64 dimensions = 3;
 
   // For tuples only, the shapes of constitutent shapes in the tuple sequence.
-  repeated Shape tuple_shapes = 4;
+  repeated ShapeProto tuple_shapes = 4;
 
   // The layout used to back this shape.
   Layout layout = 5;
@@ -212,9 +207,9 @@ message Shape {
 
 // Shape of the parameters and output of a computation (like a traditional
 // function signature).
-message ProgramShape {
-  repeated Shape parameters = 1;
-  Shape result = 2;
+message ProgramShapeProto {
+  repeated ShapeProto parameters = 1;
+  ShapeProto result = 2;
   repeated string parameter_names = 3;
 }
 
@@ -349,7 +344,7 @@ message DeviceAssignmentProto {
 // Transfers to/from the client are encoded in literal form, and the structure
 // of the repeated fields is implied by the shape.
 message LiteralProto {
-  Shape shape = 1;
+  ShapeProto shape = 1;
   repeated bool preds = 2;
   bytes s8s = 15;
   bytes u8s = 3;
@@ -361,11 +356,13 @@ message LiteralProto {
   repeated double f64s = 9;
   repeated float c64s = 12;  // Stored as interleaved real, imag floats.
   repeated LiteralProto tuple_literals = 10;
-  // The F16s and BF16s are encoded in little endian byte order
+  // The F16s, BF16s, U16s and S16s are encoded in little endian byte order
   bytes f16s = 11;
   bytes bf16s = 13;
+  bytes u16s = 16;
+  bytes s16s = 17;
   repeated int64 sparse_indices = 14;
-  // Next = 16
+  // Next = 18
 }
 
 message WindowDimension {
@@ -548,7 +545,7 @@ message OpSharding {
   }
   Type type = 1;
   // The shape of the sharded tile.
-  Shape tile_shape = 2;
+  ShapeProto tile_shape = 2;
   // The shape of the tile assignment tensor - this must be the same rank as
   // tile_shape and the product of its dimensions must equal
   // tile_assignment_devices.size().
diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD
index 2ff97914f862e0ec30fc54602ec5fee2a0a5ebca..2dae746d034a1bf52e84de74dfb0c6e23aaed4d1 100644
--- a/tensorflow/compiler/xrt/BUILD
+++ b/tensorflow/compiler/xrt/BUILD
@@ -22,6 +22,7 @@ xla_proto_library(
     deps = [
         "//tensorflow/compiler/tf2xla:host_compute_metadata_proto",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/service:hlo_proto",
     ],
 )
@@ -32,20 +33,25 @@ cc_library(
         "xrt_compilation_cache.cc",
         "xrt_device.cc",
         "xrt_state.cc",
+        "xrt_util.cc",
     ],
     hdrs = [
         "xrt_compilation_cache.h",
         "xrt_device.h",
         "xrt_state.h",
+        "xrt_util.h",
     ],
     deps = [
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD
index 9e3d2454d16730c1d1f93cb384db88544380f77e..67f475846e5f16060c1080759b0acb4216c4e72b 100644
--- a/tensorflow/compiler/xrt/kernels/BUILD
+++ b/tensorflow/compiler/xrt/kernels/BUILD
@@ -12,6 +12,7 @@ cc_library(
     hdrs = ["xrt_state_ops.h"],
     deps = [
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -21,7 +22,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:compile_only_client",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:hlo_proto",
diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
index 1d4f8d97f2ed8b263878b94b365b7fb5b949b1a2..2ccdf0f02d840600d5e0649c4805e3672d4a1286 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt.pb.h"
 #include "tensorflow/compiler/xrt/xrt_compilation_cache.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
+#include "tensorflow/compiler/xrt/xrt_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -108,19 +109,26 @@ Status XRTCompileOp::Compile(OpKernelContext* ctx,
   TF_ASSIGN_OR_RETURN(xla::XlaComputation computation,
                       client->LoadSnapshot(computation_proto.hlo_snapshot()));
 
-  std::vector<const xla::Shape*> argument_layouts(
+  std::vector<xla::Shape> argument_layouts(
+      config.program_shape().parameters_size());
+  std::vector<const xla::Shape*> argument_layout_ptrs(
       config.program_shape().parameters_size());
   for (int i = 0; i < config.program_shape().parameters_size(); ++i) {
-    argument_layouts[i] = &config.program_shape().parameters(i);
+    argument_layouts[i] = xla::Shape(config.program_shape().parameters(i));
+    argument_layout_ptrs[i] = &argument_layouts[i];
   }
   xla::ExecutableBuildOptions build_options;
   build_options.set_device_ordinal(client->default_device_ordinal());
-  build_options.set_result_layout(config.program_shape().result());
+  build_options.set_result_layout(xla::Shape(config.program_shape().result()));
   build_options.set_device_allocator(device_ref.backend()->memory_allocator());
+  if (config.has_debug_options()) {
+    *build_options.mutable_debug_options() =
+        BuildXlaDebugOptions(config.debug_options());
+  }
 
   VLOG(1) << "Building executable";
   auto compile_result =
-      client->Compile(computation, argument_layouts, build_options);
+      client->Compile(computation, argument_layout_ptrs, build_options);
   if (!compile_result.ok()) {
     return compile_result.status();
   }
@@ -166,10 +174,23 @@ void XRTCompileOp::Compute(OpKernelContext* ctx) {
                  VLOG(1) << "Compiling XLA executable";
                  return Compile(ctx, computation_proto, program);
                }));
-
-  Tensor output(DT_INT64, TensorShape({}));
-  output.scalar<int64>()() = uid;
-  ctx->set_output(0, output);
+  std::unique_ptr<XRTCompilationCacheEntryRef> entry;
+  OP_REQUIRES_OK(ctx, cache->Lookup(uid, &entry));
+
+  Tensor handle_output(DT_INT64, TensorShape({}));
+  handle_output.scalar<int64>()() = uid;
+  ctx->set_output(0, handle_output);
+
+  xla::LocalExecutable* executable = entry->get().get_executable();
+  xla::ProgramShapeProto program_shape = executable->executable()
+                                             ->module()
+                                             .config()
+                                             .entry_computation_layout()
+                                             .ComputeProgramShape()
+                                             .ToProto();
+  Tensor program_shape_output(DT_STRING, TensorShape({1}));
+  program_shape_output.vec<string>()(0) = program_shape.SerializeAsString();
+  ctx->set_output(1, program_shape_output);
 }
 
 XRTCompileOp::~XRTCompileOp() = default;
diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
index 257b054f16a49f3e14e1d76746c9fe0ba7fa8658..751329eefc33f3372335c805233dafabbf42bf36 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
@@ -64,28 +64,36 @@ uint32 GetXLARandomSeed() {
   return counter.fetch_add(2);
 }
 
-// Looks up the input `key` in the compilation cache.
-Status GetComputationCacheEntry(
-    XRTCompilationCache* cache, int64 key,
-    std::unique_ptr<XRTCompilationCacheEntryRef>* entry) {
-  TF_RETURN_IF_ERROR(cache->Lookup(key, entry));
-  return Status::OK();
-}
-
 // Populates `inputs` with the input tensors to the computation.
 Status GetComputationInputs(OpKernelContext* context, ResourceMgr* rm,
                             bool release_inputs,
                             std::vector<XRTTupleAllocation*>* input_tuples,
                             std::vector<xla::ShapedBuffer>* input_allocations,
                             std::vector<xla::ShapedBuffer*>* input_pointers) {
+  std::vector<int64> input_uids;
   OpInputList arg_list;
   TF_RETURN_IF_ERROR(context->input_list("input_handles", &arg_list));
 
-  input_tuples->resize(arg_list.size());
-  input_pointers->resize(arg_list.size());
+  // Concatenate all input uids from list of scalars-or-vectors carrying them.
   for (int i = 0; i < arg_list.size(); ++i) {
-    TF_RET_CHECK(TensorShapeUtils::IsScalar(arg_list[i].shape()));
-    int64 input_uid = arg_list[i].scalar<int64>()();
+    const Tensor& arg = arg_list[i];
+    if (TensorShapeUtils::IsScalar(arg.shape())) {
+      input_uids.push_back(arg.scalar<int64>()());
+    } else {
+      TF_RET_CHECK(TensorShapeUtils::IsVector(arg.shape()));
+      auto arg_vec = arg.vec<int64>();
+      const int64 num_elts = arg.shape().dim_size(0);
+      for (int i = 0; i < num_elts; ++i) {
+        input_uids.push_back(arg_vec(i));
+      }
+    }
+  }
+
+  // Retrieve allocations for the uids.
+  input_tuples->resize(input_uids.size());
+  input_pointers->resize(input_uids.size());
+  for (int i = 0; i < input_uids.size(); ++i) {
+    const int64 input_uid = input_uids[i];
     TF_RETURN_IF_ERROR(
         XRTTupleAllocation::Lookup(rm, input_uid, &(*input_tuples)[i]));
     if (release_inputs) {
@@ -98,7 +106,7 @@ Status GetComputationInputs(OpKernelContext* context, ResourceMgr* rm,
     XRTTupleAllocation* tuple = (*input_tuples)[i];
     input_allocations->emplace_back(tuple->ToShapedBuffer());
   }
-  for (int i = 0; i < arg_list.size(); ++i) {
+  for (int i = 0; i < input_uids.size(); ++i) {
     (*input_pointers)[i] = &(*input_allocations)[i];
   }
   return Status::OK();
@@ -220,14 +228,35 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) {
   TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer(
       shaped_buffer, device_ref.backend(), device_ref.device_ordinal(),
       &output_tuple));
-
-  Tensor* output_tensor;
-  TF_RETURN_IF_ERROR(
-      context->allocate_output(0, TensorShape({}), &output_tensor));
-  int64 key;
-  TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key));
-  output_tensor->scalar<int64>()() = key;
-
+  if (config_proto.return_exploded_tuple() &&
+      xla::ShapeUtil::IsTuple(output_tuple->on_device_shape())) {
+    int64 tuple_element_count =
+        xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape());
+    Tensor* output_tensor;
+    TF_RETURN_IF_ERROR(context->allocate_output(
+        0, TensorShape({tuple_element_count}), &output_tensor));
+
+    for (int64 i = 0; i < tuple_element_count; ++i) {
+      xla::ShapeIndex shape_index;
+      shape_index.push_back(i);
+
+      XRTTupleAllocation* suballocation;
+      TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
+          output_tuple, shape_index, &suballocation,
+          /*alias_parent_allocation=*/false));
+      int64 key;
+      TF_RETURN_IF_ERROR(suballocation->Intern(rm, &key));
+      output_tensor->vec<int64>()(i) = key;
+    }
+    output_tuple->Unref();
+  } else {
+    Tensor* output_tensor;
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(0, TensorShape({}), &output_tensor));
+    int64 key;
+    TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key));
+    output_tensor->scalar<int64>()() = key;
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
index ffea592491d43788b876a51866dc8a6611e8c734..3258286c10665225aab917107ffa614459c53f3d 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
@@ -87,6 +87,19 @@ REGISTER_KERNEL_BUILDER(Name("XRTReadLiteral")
                             .HostMemory("literal"),
                         XRTReadLiteralOp<false, XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral")
+                            .Device(DEVICE_XLA_GPU)
+                            .HostMemory("handle")
+                            .HostMemory("literal")
+                            .HostMemory("output_handle"),
+                        XRTWriteLiteralOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral")
+                            .Device(DEVICE_XLA_CPU)
+                            .HostMemory("handle")
+                            .HostMemory("literal")
+                            .HostMemory("output_handle"),
+                        XRTWriteLiteralOp<XRTGenericDeviceAccessor>);
+
 REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease")
                             .Device(DEVICE_XLA_GPU)
                             .HostMemory("handle")
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
index 54b06558adcd8ef1f8f1bee52d210d558801afea..26a58fa42d8b730b365b11d2e5608e9945497763 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@@ -393,6 +393,56 @@ class XRTReadLiteralOp : public OpKernel {
   }
 };
 
+// Op that writes a new literal value into device-resident memory.
+template <class DeviceAccessor>
+class XRTWriteLiteralOp : public OpKernel {
+ public:
+  explicit XRTWriteLiteralOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  ~XRTWriteLiteralOp() override = default;
+  XRTWriteLiteralOp(const XRTWriteLiteralOp&) = delete;
+  XRTWriteLiteralOp& operator=(const XRTWriteLiteralOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTWriteLiteralOp::Compute";
+
+    const Tensor& handle_tensor = ctx->input(0);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()),
+        errors::Internal("computation input should be an int64 scalar"));
+    int64 allocation_handle = handle_tensor.scalar<int64>()();
+
+    const Tensor& literal_info = ctx->input(1);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(literal_info.shape()),
+                errors::Internal("literal input should be a string scalar"));
+    xla::LiteralProto literal_proto;
+    OP_REQUIRES(ctx,
+                literal_proto.ParseFromString(literal_info.scalar<string>()()),
+                errors::InvalidArgument(
+                    "Unable to parse allocation input to LiteralProto"));
+    xla::Literal literal;
+    OP_REQUIRES_OK(ctx, XRTStateHelpers::MakeLiteral(literal_proto, &literal));
+
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+
+    XRTTupleAllocation* allocation;
+    OP_REQUIRES_OK(
+        ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation));
+    core::ScopedUnref allocation_unref(allocation);
+    // We are guaranteed that the underlying device object won't be deleted out
+    // from under us, while the ScopedRef is live.
+    typename DeviceAccessor::ScopedRef device_ref;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
+                            ctx, allocation->device_ordinal(), &device_ref));
+    OP_REQUIRES_OK(ctx,
+                   allocation->WriteLiteral(device_ref.backend(), literal));
+
+    Tensor output(DT_INT64, TensorShape({}));
+    output.scalar<int64>()() = allocation_handle;
+    ctx->set_output(0, output);
+  }
+};
+
 // Op that discards a handle to device memory.
 template <class DeviceAccessor>
 class XRTReleaseAllocationOp : public OpKernel {
diff --git a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc b/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
index 5cfc8711f9f4b4d54016156dd53471cadb34b581..7b3b50c69559f6003a108fdf6a1325dbdbaa80a6 100644
--- a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
@@ -23,7 +23,12 @@ namespace tensorflow {
 REGISTER_OP("XRTCompile")
     .Input("computation: string")
     .Output("handle: int64")
-    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Output("program_shape: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->UnknownShapeOfRank(1));
+      return Status::OK();
+    })
     .Doc(
         R"(
 Reads a computation proto, compiles it, and places it in the global compilation
diff --git a/tensorflow/compiler/xrt/ops/xrt_execute_op.cc b/tensorflow/compiler/xrt/ops/xrt_execute_op.cc
index 40ec1b0ba9b336f5b6407c79c8d63e31219f9b84..4f59fccaf120e2358fa49518b030f0b0f42c322e 100644
--- a/tensorflow/compiler/xrt/ops/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_execute_op.cc
@@ -26,7 +26,16 @@ REGISTER_OP("XRTExecute")
     .Input("execution_config: string")
     .Input("input_handles: Ninputs * int64")
     .Output("output_handle: int64")
-    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<shape_inference::ShapeHandle> input_handle_shapes;
+      TF_RETURN_IF_ERROR(c->input("input_handles", &input_handle_shapes));
+      for (size_t i = 0; i < input_handle_shapes.size(); ++i) {
+        shape_inference::ShapeHandle unused;
+        TF_RETURN_IF_ERROR(
+            c->WithRankAtMost(input_handle_shapes[i], 1, &unused));
+      }
+      return tensorflow::shape_inference::ScalarShape(c);
+    })
     .Doc(
         R"(
 Runs a previously-compiled computation on a core. If
diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
index 07d025ce343f229097b557d33ad41bf9612b0696..a3d63106fa14674a9f5887ccfd908ce17dbc6384 100644
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@@ -95,6 +95,20 @@ Copies an allocated tuple from device memory and returns it as a literal.
 'literal' is a serialized xla::LiteralProto proto.
 )");
 
+REGISTER_OP("XRTWriteLiteral")
+    .Input("handle: int64")
+    .Input("literal: string")
+    .Output("output_handle: int64")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(
+        R"(
+Copies the input literal into the device memory pointed to by handle.
+Returns the handle itself.
+
+'handle' is the id returned from the Op that produced the on-device allocation.
+'literal' is a serialized xla::LiteralProto proto to be written to device memory.
+)");
+
 REGISTER_OP("XRTReadLiteralAndRelease")
     .Input("handle: int64")
     .Output("literal: string")
diff --git a/tensorflow/compiler/xrt/tests/BUILD b/tensorflow/compiler/xrt/tests/BUILD
index b6dcfc4eb96316b5dad95a65b04d0ae69e4485f6..be44a3474acdeb9905c1d21b932fa0dd10b5a212 100644
--- a/tensorflow/compiler/xrt/tests/BUILD
+++ b/tensorflow/compiler/xrt/tests/BUILD
@@ -29,8 +29,11 @@ cc_library(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xrt:xrt_proto",
         "//tensorflow/compiler/xrt:xrt_server",
         "//tensorflow/compiler/xrt/cc:xrt_ops",
@@ -49,7 +52,10 @@ tf_cc_test(
     name = "raw_api_test_cpu",
     size = "medium",
     srcs = [],
-    args = ["--xla_test_device=XLA_CPU"],
+    args = [
+        "--xla_test_device=XLA_CPU",
+        "--xla_platform=CPU",
+    ],
     deps = [
         ":raw_api_test_lib",
         "//tensorflow/compiler/jit:xla_cpu_device",
@@ -60,7 +66,10 @@ tf_cuda_cc_test(
     name = "raw_api_test_gpu",
     size = "medium",
     srcs = [],
-    args = ["--xla_test_device=XLA_GPU"],
+    args = [
+        "--xla_test_device=XLA_GPU",
+        "--xla_platform=GPU",
+    ],
     tags = tf_cuda_tests_tags(),
     deps = [
         ":raw_api_test_lib",
diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc
index f590fbf0d9d85e6e8b041f6719ab6a14ec9e2191..abaa17e50e3f5e47a45f5a8a45fa2090d3efee39 100644
--- a/tensorflow/compiler/xrt/tests/raw_api_test.cc
+++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc
@@ -22,10 +22,13 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/compiler/xrt/cc/ops/xrt_compile_ops.h"
@@ -43,6 +46,7 @@ namespace tensorflow {
 namespace {
 
 string* xla_test_device_ptr;  // initial value set in main()
+string* xla_platform_ptr;     // initial value set in main()
 
 string DeviceFromFlag() {
   string xla_test_device = *xla_test_device_ptr;
@@ -85,13 +89,20 @@ xla::LiteralProto FloatVector(absl::Span<const float> v) {
   return array.ToProto();
 }
 
+xla::LiteralProto FloatMatrix(
+    std::initializer_list<std::initializer_list<float>> v,
+    const xla::Layout& layout) {
+  auto array = xla::LiteralUtil::CreateR2WithLayout<float>(v, layout);
+  return array.ToProto();
+}
+
 bool CompareLiteralProtos(const xla::LiteralProto& a,
                           const xla::LiteralProto& b) {
   auto l_a = xla::Literal::CreateFromProto(a).ValueOrDie();
   auto l_b = xla::Literal::CreateFromProto(b).ValueOrDie();
   bool equal = l_a == l_b;
   if (!equal) {
-    LOG(INFO) << "LiteralProtos don't match " << a.DebugString()
+    LOG(INFO) << "LiteralProtos don't match: " << a.DebugString()
               << " != " << b.DebugString();
   }
   return equal;
@@ -128,6 +139,31 @@ xla::XlaComputation AddAndScale() {
   return builder.Build().ValueOrDie();
 }
 
+xla::XlaComputation Dot() {
+  xla::XlaBuilder builder("Dot");
+  auto p0 = xla::Parameter(
+      &builder, 0,
+      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 2}, {0, 1}), "P0");
+  auto p1 = xla::Parameter(
+      &builder, 1,
+      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1}), "P1");
+  xla::DotDimensionNumbers ddn;
+  ddn.add_lhs_contracting_dimensions(1);
+  ddn.add_rhs_contracting_dimensions(0);
+  xla::DotGeneral(p0, p1, ddn);
+  return builder.Build().ValueOrDie();
+}
+
+xla::XlaComputation AddS64() {
+  xla::XlaBuilder builder("AddS64");
+  auto p0 = xla::Parameter(&builder, 0, xla::ShapeUtil::MakeShape(xla::S64, {}),
+                           "P0");
+  auto p1 = xla::Parameter(&builder, 1, xla::ShapeUtil::MakeShape(xla::S64, {}),
+                           "P1");
+  xla::Add(p0, p1);
+  return builder.Build().ValueOrDie();
+}
+
 xla::XlaComputation AddAndTuple() {
   xla::XlaBuilder builder("AddAndTuple");
   auto p0 = xla::Parameter(&builder, 0,
@@ -139,12 +175,96 @@ xla::XlaComputation AddAndTuple() {
   return builder.Build().ValueOrDie();
 }
 
+xla::XlaComputation AddAndSubTuple() {
+  xla::XlaBuilder builder("AddAndSubTuple");
+  auto p0 = xla::Parameter(&builder, 0, xla::ShapeUtil::MakeShape(xla::F32, {}),
+                           "P0");
+  auto p1 = xla::Parameter(&builder, 1, xla::ShapeUtil::MakeShape(xla::F32, {}),
+                           "P1");
+  auto sum = xla::Add(p0, p1);
+  auto sub = xla::Sub(p0, p1);
+  xla::Tuple(&builder, {sum, sub});
+  return builder.Build().ValueOrDie();
+}
+
 void StoreComputationSnapshot(const xla::XlaComputation& computation,
                               xla::HloSnapshot* dst) {
   auto snapshot = computation.Snapshot().ValueOrDie();
   *dst = *snapshot;
 }
 
+xla::ProgramShape XlaCompiledProgramShape(
+    const xla::XlaComputation& computation,
+    const xla::ProgramShape& input_program_shape) {
+  se::Platform* platform =
+      xla::PlatformUtil::GetPlatform(*xla_platform_ptr).ValueOrDie();
+  xla::LocalClient* client =
+      xla::ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
+  xla::ExecutableBuildOptions exec_options;
+  exec_options.set_result_layout(input_program_shape.result());
+  std::vector<const xla::Shape*> parameters_shapes;
+  for (int64 i = 0; i < input_program_shape.parameters_size(); ++i) {
+    parameters_shapes.push_back(&input_program_shape.parameters(i));
+  }
+  auto local_executable =
+      client->Compile(computation, parameters_shapes, exec_options)
+          .ValueOrDie();
+  return local_executable->executable()
+      ->module()
+      .entry_computation()
+      ->ComputeProgramShape();
+}
+
+TEST(RawApiTest, AllocAndRewrite) {
+  xrt::XLAAllocation alloc;
+  alloc.set_device_ordinal(0);
+  *alloc.mutable_value() =
+      xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto();
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto value =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString());
+  auto handle = ops::XRTAllocate(root, value);
+  auto read_back = ops::XRTReadLiteral(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  tensorflow::ClientSession session(root);
+  std::vector<tensorflow::Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back, handle}, &outputs));
+  EXPECT_EQ(outputs.size(), 2);
+
+  int64 allocation_handle = outputs[1].scalar<int64>()();
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response));
+  outputs.clear();
+
+  xla::LiteralProto new_literal =
+      xla::LiteralUtil::CreateR2({{9, 2}, {4, 1}}).ToProto();
+  auto new_value = ops::Const(root.WithDevice("/device:CPU:0"),
+                              new_literal.SerializeAsString());
+  auto write_op =
+      ops::XRTWriteLiteral(root, Input(allocation_handle), new_value);
+  TF_ASSERT_OK(root.status());
+  TF_EXPECT_OK(session.Run({write_op}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+  EXPECT_EQ(allocation_handle, outputs[0].scalar<int64>()());
+  outputs.clear();
+
+  auto read_after_write = ops::XRTReadLiteral(root, Input(allocation_handle));
+  TF_EXPECT_OK(session.Run({read_after_write}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto new_response;
+  EXPECT_TRUE(new_response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralProtos(new_literal, new_response));
+
+  auto release =
+      ops::XRTReleaseAllocationHandle(root, Input(allocation_handle));
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
+                           &outputs));
+}
+
 TEST(RawApiTest, ReadAndWriteState) {
   xrt::XLAAllocation alloc;
   alloc.set_device_ordinal(0);
@@ -317,9 +437,12 @@ TEST(RawApiTest, CompileAndExecute) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {2});
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
   StoreComputationSnapshot(AddAndScale(), c.mutable_hlo_snapshot());
 
   xrt::XRTExecutionConfig e;
@@ -338,27 +461,207 @@ TEST(RawApiTest, CompileAndExecute) {
   auto p1_value =
       ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString());
   auto p1_handle = ops::XRTAllocate(root, p1_value);
-  auto result = ops::XRTExecute(root, c_handle, e_config,
+  auto result = ops::XRTExecute(root, c_handle.handle, e_config,
                                 {Output(p0_handle), Output(p1_handle)});
   auto read_back = ops::XRTReadLiteralAndRelease(root, result);
   TF_ASSERT_OK(root.status());
 
   ClientSession session(root);
   std::vector<Tensor> outputs;
-  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+  TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs));
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+
+  auto expected = xla::LiteralUtil::CreateR1<float>({27.0f, 21.0f});
+  EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
+
+  xla::ProgramShapeProto program_shape;
+  EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec<string>()(0)));
+  EXPECT_EQ(program_shape.parameters_size(), 2);
+}
+
+TEST(RawApiTest, CompileAndExecuteWithArgumentVector) {
+  xrt::XLAAllocation p0;
+  p0.set_device_ordinal(0);
+  *p0.mutable_value() = FloatVector({1.0f, 2.0f});
+  xrt::XLAAllocation p1;
+  p1.set_device_ordinal(0);
+  *p1.mutable_value() = FloatVector({8.0f, 5.0f});
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  StoreComputationSnapshot(AddAndScale(), c.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(true);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, computation);
+  auto p0_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString());
+  auto p0_handle = ops::XRTAllocate(root, p0_value);
+  auto p1_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString());
+  auto p1_handle = ops::XRTAllocate(root, p1_value);
+  auto packed_args = ops::Stack(root.WithDevice("/device:CPU:0"),
+                                {Output(p0_handle), Output(p1_handle)});
+  auto result =
+      ops::XRTExecute(root, c_handle.handle, e_config, {Output(packed_args)});
+  auto read_back = ops::XRTReadLiteralAndRelease(root, result);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs));
 
   xla::LiteralProto response;
   EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
 
   auto expected = xla::LiteralUtil::CreateR1<float>({27.0f, 21.0f});
   EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
+
+  xla::ProgramShapeProto program_shape;
+  EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec<string>()(0)));
+  EXPECT_EQ(program_shape.parameters_size(), 2);
+}
+
+TEST(RawApiTest, CompileWithXlaReturnShapes) {
+  xla::XlaBuilder builder("XrtXlaShapes");
+  auto input_shape = xla::ShapeUtil::MakeShape(xla::BF16, {32, 3, 128, 128});
+  auto kernel_shape = xla::ShapeUtil::MakeShape(xla::BF16, {3, 3, 5, 5});
+  // Clear layouts to signal XLA we are ready to get whatever are coming out of
+  // the compilation process.
+  xla::LayoutUtil::ClearLayout(&input_shape);
+  xla::LayoutUtil::ClearLayout(&kernel_shape);
+  auto param_shape =
+      xla::ShapeUtil::MakeTupleShape({input_shape, kernel_shape});
+  auto param = xla::Parameter(&builder, 0, param_shape, "param");
+  auto input = xla::GetTupleElement(param, 0);
+  auto kernel = xla::GetTupleElement(param, 1);
+  xla::Conv(input, kernel, {1, 1}, xla::Padding::kSame);
+  TF_ASSERT_OK_AND_ASSIGN(xla::XlaComputation xla_computation, builder.Build());
+
+  auto result_shape = xla_computation.GetProgramShape().ValueOrDie().result();
+  // Clear the result shape layout to tell XLA we are accepting whatever are
+  // coming out of the compilation process.
+  xla::LayoutUtil::ClearLayout(&result_shape);
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() = param_shape.ToProto();
+  *shapes->mutable_result() = result_shape.ToProto();
+  StoreComputationSnapshot(xla_computation, c.mutable_hlo_snapshot());
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto computation =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, computation);
+  auto release = ops::XRTReleaseCompilationHandle(root, c_handle.handle);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(),
+                           {c_handle.program_shape}, {release}, &outputs));
+
+  xla::ProgramShapeProto program_shape_proto;
+  EXPECT_TRUE(program_shape_proto.ParseFromString(outputs[0].vec<string>()(0)));
+  xla::ProgramShape program_shape(program_shape_proto);
+  EXPECT_EQ(program_shape.parameters_size(), 1);
+
+  VLOG(2) << "Param: "
+          << xla::ShapeUtil::HumanStringWithLayout(program_shape.parameters(0));
+  VLOG(2) << "Result: "
+          << xla::ShapeUtil::HumanStringWithLayout(program_shape.result());
+
+  xla::ProgramShape xla_program_shape =
+      XlaCompiledProgramShape(xla_computation, xla::ProgramShape(*shapes));
+  EXPECT_TRUE(xla::LayoutUtil::Equal(
+      xla::ShapeUtil::GetSubshape(program_shape.parameters(0), {0}).layout(),
+      xla::ShapeUtil::GetSubshape(xla_program_shape.parameters(0), {0})
+          .layout()));
+  EXPECT_TRUE(xla::LayoutUtil::Equal(
+      xla::ShapeUtil::GetSubshape(program_shape.parameters(0), {1}).layout(),
+      xla::ShapeUtil::GetSubshape(xla_program_shape.parameters(0), {1})
+          .layout()));
+  EXPECT_TRUE(xla::LayoutUtil::Equal(program_shape.result().layout(),
+                                     xla_program_shape.result().layout()));
+}
+
+TEST(RawApiTest, DotGeneralWithLayoutTest) {
+  auto layout = xla::LayoutUtil::MakeLayout({0, 1});
+
+  xrt::XLAAllocation p0;
+  p0.set_device_ordinal(0);
+  *p0.mutable_value() = FloatMatrix({{1.0f, 2.0f}, {3.0f, 4.0f}}, layout);
+  xrt::XLAAllocation p1;
+  p1.set_device_ordinal(0);
+  *p1.mutable_value() = FloatMatrix({{8.0f}, {5.0f}}, layout);
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 2}, {0, 1}).ToProto();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1}).ToProto();
+  StoreComputationSnapshot(Dot(), c.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(true);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, computation);
+  auto p0_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString());
+  auto p0_handle = ops::XRTAllocate(root, p0_value);
+  auto p1_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString());
+  auto p1_handle = ops::XRTAllocate(root, p1_value);
+  auto result = ops::XRTExecute(root, c_handle.handle, e_config,
+                                {Output(p0_handle), Output(p1_handle)});
+  auto read_back = ops::XRTReadLiteralAndRelease(root, result);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+
+  auto expected =
+      xla::LiteralUtil::CreateR2WithLayout<float>({{18.0f}, {44.0f}}, layout);
+
+  EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
 }
 
 TEST(RawApiTest, CompileAndExecuteZeroArg) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {});
+  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto();
 
   xrt::XRTExecutionConfig e;
   e.set_release_input_handles(true);
@@ -371,7 +674,7 @@ TEST(RawApiTest, CompileAndExecuteZeroArg) {
   auto computation =
       ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
   auto c_handle = ops::XRTCompile(root, computation);
-  auto result = ops::XRTExecute(root, c_handle, e_config,
+  auto result = ops::XRTExecute(root, c_handle.handle, e_config,
                                 std::initializer_list<Input>({}));
   auto read_back = ops::XRTReadLiteralAndRelease(root, result);
   TF_ASSERT_OK(root.status());
@@ -398,10 +701,13 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->mutable_result() = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::F32, {2})});
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})})
+          .ToProto();
   StoreComputationSnapshot(AddAndTuple(), c.mutable_hlo_snapshot());
 
   xrt::XRTExecutionConfig e;
@@ -420,7 +726,7 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) {
   auto p1_value =
       ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString());
   auto p1_handle = ops::XRTAllocate(root, p1_value);
-  auto result = ops::XRTExecute(root, c_handle, e_config,
+  auto result = ops::XRTExecute(root, c_handle.handle, e_config,
                                 {Output(p0_handle), Output(p1_handle)});
   auto read_back = ops::XRTReadLiteralAndRelease(root, result);
   TF_ASSERT_OK(root.status());
@@ -437,15 +743,160 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) {
   EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
 }
 
+TEST(RawApiTest, CompileAndExecuteReturnExplodedTuple) {
+  xrt::XLAAllocation p0;
+  p0.set_device_ordinal(0);
+  *p0.mutable_value() = xla::LiteralUtil::CreateR0<float>(12.0f).ToProto();
+
+  xrt::XLAAllocation p1;
+  p1.set_device_ordinal(0);
+  *p1.mutable_value() = xla::LiteralUtil::CreateR0<float>(3.0f).ToProto();
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto();
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {}),
+                                      xla::ShapeUtil::MakeShape(xla::F32, {})})
+          .ToProto();
+  StoreComputationSnapshot(AddAndSubTuple(), c.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(true);
+  e.set_return_exploded_tuple(true);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, computation);
+  auto p0_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString());
+  auto p0_handle = ops::XRTAllocate(root, p0_value);
+  auto p1_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString());
+  auto p1_handle = ops::XRTAllocate(root, p1_value);
+  auto result = ops::XRTExecute(root, c_handle.handle, e_config,
+                                {Output(p0_handle), Output(p1_handle)});
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({result}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  auto handles_vec = outputs.front().vec<int64>();
+  EXPECT_EQ(handles_vec.size(), 2);
+
+  const float kResults[2] = {15.0f, 9.0f};
+  for (int64 i = 0; i < handles_vec.size(); ++i) {
+    auto read_back = ops::XRTReadLiteralAndRelease(root, Input(handles_vec(i)));
+    std::vector<Tensor> voutputs;
+    TF_EXPECT_OK(session.Run({read_back}, &voutputs));
+    EXPECT_EQ(voutputs.size(), 1);
+
+    xla::LiteralProto response;
+    EXPECT_TRUE(response.ParseFromString(voutputs[0].scalar<string>()()));
+
+    auto expected = xla::LiteralUtil::CreateR0<float>(kResults[i]);
+    EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
+  }
+}
+
+TEST(RawApiTest, LeakCompilationReference) {
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})})
+          .ToProto();
+  StoreComputationSnapshot(AddAndTuple(), c.mutable_hlo_snapshot());
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto computation =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, computation);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({c_handle.handle}, &outputs));
+}
+
+TEST(RawApiTest, CompileAndExecuteWithS64Argument) {
+  xrt::XLAAllocation p0;
+  p0.set_device_ordinal(0);
+  *p0.mutable_value() = xla::LiteralUtil::CreateR0<int64>(11031965).ToProto();
+  xrt::XLAAllocation p1;
+  p1.set_device_ordinal(0);
+  *p1.mutable_value() = xla::LiteralUtil::CreateR0<int64>(4091934).ToProto();
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto();
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto();
+  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto();
+  StoreComputationSnapshot(AddS64(), c.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(true);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, computation);
+  auto p0_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString());
+  auto p0_handle = ops::XRTAllocate(root, p0_value);
+  auto p1_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString());
+  auto p1_handle = ops::XRTAllocate(root, p1_value);
+  auto result = ops::XRTExecute(root, c_handle.handle, e_config,
+                                {Output(p0_handle), Output(p1_handle)});
+  auto read_back = ops::XRTReadLiteralAndRelease(root, result);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs));
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+
+  auto expected = xla::LiteralUtil::CreateR0<int64>(15123899);
+  EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
+
+  xla::ProgramShapeProto program_shape;
+  EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec<string>()(0)));
+  EXPECT_EQ(program_shape.parameters_size(), 2);
+  EXPECT_TRUE(xla::ShapeUtil::HasPrimitiveType(
+      xla::Shape(program_shape.result()), xla::S64));
+}
+
 }  // namespace
 
 }  // namespace tensorflow
 
 int main(int argc, char** argv) {
   tensorflow::xla_test_device_ptr = new tensorflow::string("XLA_CPU");
+  tensorflow::xla_platform_ptr = new tensorflow::string("CPU");
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("xla_test_device", tensorflow::xla_test_device_ptr,
                        "Tensorflow device type to use for test, e.g., XLA_CPU"),
+      tensorflow::Flag("xla_platform", tensorflow::xla_platform_ptr,
+                       "The XLA platform to select for the device"),
   };
   tensorflow::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto
index 5678f0905ff5b8956e0811026e7450acba8815e9..378bb9246f27b8106310d565435404d7ac260a87 100644
--- a/tensorflow/compiler/xrt/xrt.proto
+++ b/tensorflow/compiler/xrt/xrt.proto
@@ -3,9 +3,28 @@ syntax = "proto3";
 package xrt;
 
 import "tensorflow/compiler/tf2xla/host_compute_metadata.proto";
+import "tensorflow/compiler/xla/xla.proto";
 import "tensorflow/compiler/xla/xla_data.proto";
 import "tensorflow/compiler/xla/service/hlo.proto";
 
+message DeviceAssignment {
+  message ComputationDevice {
+    message DeviceMeshCoordinates {
+      // The mesh coordinates for the device. Usually (X, Y, Core), in the order
+      // in which they are returned in the TopologyProto.
+      //  X    = value(0)
+      //  Y    = value(1)
+      //  Core = value(2)
+      repeated int32 value = 1;
+    }
+    // As many replicas as there are in the replicated computation.
+    repeated DeviceMeshCoordinates replica_devices = 1;
+  }
+  // As many ComputationDevice as many there are computations (number
+  // of cores per replica).
+  repeated ComputationDevice computation_devices = 1;
+}
+
 // Options for an XLA compilation.
 message XLAComputationConfig {
   // The number of replicas the computation will be run on. If this is
@@ -18,11 +37,18 @@ message XLAComputationConfig {
   tensorflow.tf2xla.HostComputeMetadata host_compute_metadata = 3;
 
   // The arg/result shapes for the whole computation.
-  xla.ProgramShape program_shape = 4;
+  xla.ProgramShapeProto program_shape = 4;
   // The arg/result shapes for each core of a model-parallel
   // computation. per_core_args_and_result_shapes is optional for a
   // single-core computation.
-  repeated xla.ProgramShape per_core_program_shape = 5;
+  repeated xla.ProgramShapeProto per_core_program_shape = 5;
+  // Describes how replicated computation instances should be assigned to
+  // devices. There are num_cores_per_replica computations, and each one will be
+  // sent and executed to the set of replica device numbers described in the
+  // DeviceAssignment proto.
+  DeviceAssignment device_assignment = 6;
+  // The debugging options to be passed to the XLA compilation process.
+  xla.DebugOptions debug_options = 7;
 }
 
 // Options and XLA computation for a compilation.
@@ -75,4 +101,8 @@ message XRTExecutionConfig {
   bool release_input_handles = 5;
   // If true, release the handle to the computation after running.
   bool release_compilation_handle = 6;
+  // If set to true, and the result shape is a tuple, then instead of returning
+  // a single tuple allocation the execution will return a vector of
+  // allocations, one for each of the first-level elements of the result tuple.
+  bool return_exploded_tuple = 7;
 }
diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.cc b/tensorflow/compiler/xrt/xrt_compilation_cache.cc
index 4844c7fb7106862dd42b3b3d07245350c9d2383c..d1405eae468492748ae88d842334a922dce272c6 100644
--- a/tensorflow/compiler/xrt/xrt_compilation_cache.cc
+++ b/tensorflow/compiler/xrt/xrt_compilation_cache.cc
@@ -18,9 +18,19 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 
+namespace {
+
+int64 get_uid() {
+  uint64 unsigned_rand = random::New64() & INT64_MAX;
+  return static_cast<int64>(unsigned_rand);
+}
+
+}  // namespace
+
 const char* kXRTCompilationCacheResourceName = "xrt_compilation_cache";
 
 XRTCompilationCache::EntryRefImpl::EntryRefImpl(XRTCompilationCache* parent,
@@ -46,12 +56,17 @@ XRTCompilationCache::XRTCompilationCache(int max_number_of_entries)
 
 XRTCompilationCache::~XRTCompilationCache() {
   VLOG(1) << "XRTCompilationCache::~XRTCompilationCache()";
+  // A buggy client may be holding onto a reference, or a client might have
+  // crashed while holding onto a reference. In either case, discard all
+  // outstanding client references to avoid leaking storage.
+  for (const auto& entry : entries_by_uid_) {
+    while (!entry.second->RefCountIsOne()) {
+      entry.second->Unref();
+    }
+  }
   while (!entries_by_last_use_.empty()) {
     MarkOldestEntryForEviction();
   }
-  // By the time the cache is deleted all reference holders should have already
-  // been deleted, since they were holding references to the cache. So all
-  // entries should be gone at this point.
   CHECK_EQ(cache_.size(), 0);
   CHECK_EQ(entries_by_uid_.size(), 0);
   CHECK_EQ(cache_entries_, 0);
@@ -148,7 +163,7 @@ XRTCompilationCache::CompiledSubgraph* XRTCompilationCache::InitializeEntry(
   CompiledSubgraph* entry = new CompiledSubgraph();
   entry->parent = this;
   entry->key = key;
-  entry->uid = next_uid_++;
+  entry->uid = get_uid();
   // Add the entry to the cache. Once the computation has been compiled,
   // UpdateEntryAfterCompilation will be called to potentially mark old entries
   // that don't fit any more for eviction.
diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.h b/tensorflow/compiler/xrt/xrt_compilation_cache.h
index c505299a454506e2136e36fb26833c28ed0d47bc..c43d0fc47873abdc82ee937c155bebc346a05f17 100644
--- a/tensorflow/compiler/xrt/xrt_compilation_cache.h
+++ b/tensorflow/compiler/xrt/xrt_compilation_cache.h
@@ -211,8 +211,6 @@ class XRTCompilationCache : public ResourceBase {
   const int max_cache_entries_;
 
   mutable absl::Mutex mu_;
-  // The uid to assign to the next new entry created.
-  int64 next_uid_ GUARDED_BY(mu_) = 0;
   // The total number of entries that are stored and not marked for eviction.
   int cache_entries_ GUARDED_BY(mu_) = 0;
   // The total number of entries that are marked for eviction.
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc
index d05a1e7dcbff440e0daf03bd25535c26d82b6a0b..31603e044d17baa3ae0ae583f61837811bb12495 100644
--- a/tensorflow/compiler/xrt/xrt_state.cc
+++ b/tensorflow/compiler/xrt/xrt_state.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt_state.h"
 
 #include <stdint.h>
+#include <map>
 #include <memory>
 #include <string>
 #include <utility>
@@ -33,6 +34,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
@@ -40,14 +43,44 @@ namespace tensorflow {
 
 namespace {
 
+class BufferAllocStats {
+ public:
+  struct Stats {
+    int64 count = 0;
+    int64 size = 0;
+  };
+
+  Stats ReportAlloc(int64 device, int64 msize) {
+    mutex_lock lock(lock_);
+    Stats* device_stats = &stats_[device];
+    device_stats->count += 1;
+    device_stats->size += msize;
+    return *device_stats;
+  }
+
+  Stats ReportFree(int64 device, int64 msize) {
+    mutex_lock lock(lock_);
+    Stats* device_stats = &stats_[device];
+    device_stats->count -= 1;
+    device_stats->size -= msize;
+    return *device_stats;
+  }
+
+ private:
+  mutable mutex lock_;
+  std::map<int64, Stats> stats_;
+};
+
 const char* kTupleContainer = "tuples";
 
-// Counter used to assign unique handles.
-mutex _uid_mutex(tensorflow::LINKER_INITIALIZED);
-int64 _uid GUARDED_BY(_uid_mutex) = 0;
 int64 get_uid() {
-  mutex_lock l(_uid_mutex);
-  return _uid++;
+  uint64 unsigned_rand = random::New64() & INT64_MAX;
+  return static_cast<int64>(unsigned_rand);
+}
+
+BufferAllocStats* GetAllocStats() {
+  static BufferAllocStats* stats = new BufferAllocStats();
+  return stats;
 }
 
 Status AllocateScopedShapedBuffer(
@@ -67,6 +100,9 @@ Status AllocateScopedShapedBuffer(
   // requests the host-shape sub-buffer at index i, that will correspond to the
   // right device-shape sub-buffer at the same index.
   xla::Shape on_device_shape = transfer_manager->HostShapeToDeviceShape(shape);
+  VLOG(3) << "Allocating literal buffer: host_shape="
+          << xla::ShapeUtil::HumanStringWithLayout(shape) << " device_shape="
+          << xla::ShapeUtil::HumanStringWithLayout(on_device_shape);
 
   // The ScopedShapedBuffer frees the buffers that have so far been allocated if
   // it goes out of scope. That's useful if we return early as the result of an
@@ -99,9 +135,19 @@ XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
                                          xla::DeviceMemoryAllocator* allocator)
     : allocation_(allocation),
       device_ordinal_(device_ordinal),
-      allocator_(allocator) {}
+      allocator_(allocator) {
+  if (VLOG_IS_ON(2)) {
+    auto stats =
+        GetAllocStats()->ReportAlloc(device_ordinal_, allocation_.size());
+    LOG(INFO) << "XRT Allocation Stats: device=" << device_ordinal_
+              << " count=" << stats.count << " size=" << stats.size;
+  }
+}
 
 XRTBufferAllocation::~XRTBufferAllocation() {
+  if (VLOG_IS_ON(2)) {
+    GetAllocStats()->ReportFree(device_ordinal_, allocation_.size());
+  }
   // Deallocate explicitly allows allocation_ to be null.
   Status s = allocator_->Deallocate(device_ordinal_, allocation_);
   // Nothing to do but check fail here if memory datastructures are corrupted.
@@ -182,6 +228,20 @@ Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal,
   return Status::OK();
 }
 
+Status XRTTupleAllocation::WriteLiteral(xla::Backend* backend,
+                                        const xla::Literal& literal) {
+  if (!xla::ShapeUtil::Equal(literal.shape(), on_host_shape())) {
+    return errors::InvalidArgument(
+        "New literal shape not matching the existing one: literal=",
+        xla::ShapeUtil::HumanStringWithLayout(literal.shape()),
+        " device=", xla::ShapeUtil::HumanStringWithLayout(on_host_shape()));
+  }
+  auto transfer_manager = backend->transfer_manager();
+  TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal()));
+  return transfer_manager->TransferLiteralToDevice(stream.get(), literal,
+                                                   ToShapedBuffer());
+}
+
 void XRTTupleAllocation::DiscardAllocation(
     const xla::ShapeIndex& buffer_index) {
   buffers_.element(buffer_index)->DiscardAllocation();
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index 73b5584e38f781343fe6793af7ad28232fbfc184..3664c0cd4e6ad26945ae1012208fdb006164a066 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -137,6 +137,9 @@ class XRTTupleAllocation : public ResourceBase {
   Status ToLiteral(xla::Backend* backend, int device_ordinal,
                    xla::Literal* literal);
 
+  // Write a new literal value to the allocation.
+  Status WriteLiteral(xla::Backend* backend, const xla::Literal& literal);
+
   // True if none of the buffers in the allocation are aliased by any other live
   // handle.
   bool IsExclusiveOwner();
diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ef8bedc7324696cd255c72a851f0f2410e03848
--- /dev/null
+++ b/tensorflow/compiler/xrt/xrt_util.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xrt/xrt_util.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "tensorflow/compiler/xla/debug_options_flags.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace {
+
+bool DebugOptionsPassThroughEnabled() {
+  const char* env = getenv("TF_XLA_DEBUG_OPTIONS_PASSTHROUGH");
+  bool enabled =
+      env != nullptr && (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
+  if (enabled) {
+    LOG(WARNING) << "Passing through XLA debug options!";
+  } else {
+    LOG(WARNING) << "TF_XLA_DEBUG_OPTIONS_PASSTHROUGH not set, not all options "
+                    "will be retained";
+  }
+  return enabled;
+}
+
+string SafeDebugPath(const string& path) {
+  if (path.empty() || path.compare(0, 5, "gs://") == 0 ||
+      path.compare(0, 11, "bigstore://") == 0) {
+    return path;
+  }
+  LOG(WARNING) << "Invalid config path (will be dropped): " << path;
+  return string();
+}
+
+}  // namespace
+
+xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options) {
+  static const bool options_passthrough = DebugOptionsPassThroughEnabled();
+  if (options_passthrough) {
+    return ref_options;
+  }
+  xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+  options.set_xla_generate_hlo_text_to(
+      SafeDebugPath(ref_options.xla_generate_hlo_text_to()));
+  options.set_xla_dump_optimized_hlo_proto_to(
+      SafeDebugPath(ref_options.xla_dump_optimized_hlo_proto_to()));
+  options.set_xla_dump_computations_to(
+      SafeDebugPath(ref_options.xla_dump_computations_to()));
+  options.set_xla_dump_executions_to(
+      SafeDebugPath(ref_options.xla_dump_executions_to()));
+  for (auto& pass : ref_options.xla_disable_hlo_passes()) {
+    options.add_xla_disable_hlo_passes(pass);
+  }
+  options.set_xla_dump_unoptimized_hlo_proto_to(
+      SafeDebugPath(ref_options.xla_dump_unoptimized_hlo_proto_to()));
+  options.set_xla_dump_per_pass_hlo_proto_to(
+      SafeDebugPath(ref_options.xla_dump_per_pass_hlo_proto_to()));
+  return options;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/xrt_util.h b/tensorflow/compiler/xrt/xrt_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9c05a7f3406313f99ae214d67b34e8e7de8be3e
--- /dev/null
+++ b/tensorflow/compiler/xrt/xrt_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Utility functions in support of the XRT API.
+
+#ifndef TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_
+#define TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_
+
+#include "tensorflow/compiler/xla/xla.pb.h"
+
+namespace tensorflow {
+
+// Filters the debug options provided as argument according to the value of the
+// TF_XLA_DEBUG_OPTIONS_PASSTHROUGH environment variable. If such variable is
+// set to "1" or "true", the debug options will be returned as is. Otherwise
+// only a subset of them will be set in the returned ones, and all the paths
+// contained in it, will be limited to gs:// and bigstore:// ones.
+xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index fa06d351d4e64bfc2fc5e64c81c810185600000a..832db0f4ab46911e067d17b4a125706c276cf798 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -72,7 +72,6 @@ py_library(
         "//tensorflow/contrib/metrics:metrics_py",
         "//tensorflow/contrib/mixed_precision:mixed_precision",
         "//tensorflow/contrib/model_pruning",
-        "//tensorflow/contrib/nccl:nccl_py",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
@@ -113,22 +112,52 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python/estimator:estimator_py",
     ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
+        "//tensorflow:no_kafka_support": [],
         "//conditions:default": [
-            "//tensorflow/contrib/bigtable",
-            "//tensorflow/contrib/cloud:cloud_py",
-            "//tensorflow/contrib/fused_conv:fused_conv_py",  # unresolved symbols, need to export more symbols
             "//tensorflow/contrib/kafka",
+        ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_aws_support": [],
+        "//conditions:default": [
             "//tensorflow/contrib/kinesis",
+        ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "//tensorflow/contrib/fused_conv:fused_conv_py",
             "//tensorflow/contrib/tensorrt:init_py",
             "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         ],
     }) + select({
-        "//tensorflow:with_ignite_support": [
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_gcp_support": [],
+        "//conditions:default": [
+            "//tensorflow/contrib/bigtable",
+            "//tensorflow/contrib/cloud:cloud_py",
+        ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_ignite_support": [],
+        "//conditions:default": [
             "//tensorflow/contrib/ignite",
         ],
-        "//conditions:default": [],
     }),
 )
 
@@ -149,17 +178,27 @@ cc_library(
         "//tensorflow/contrib/tensor_forest:stats_ops_kernels",
         "//tensorflow/contrib/tensor_forest:tensor_forest_kernels",
         "//tensorflow/contrib/text:all_kernels",
-    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + if_cuda([
-        "//tensorflow/contrib/nccl:nccl_kernels",
-    ]) + select({
+    ] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_collectives_py"]) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
+        "//tensorflow:no_kafka_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/kafka:dataset_kernels",
+        ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_aws_support": [],
+        "//conditions:default": [
             "//tensorflow/contrib/kinesis:dataset_kernels",
-            "//tensorflow/contrib/tensorrt:trt_engine_op_kernel",
         ],
-    }),
+    }) + if_not_windows([
+        "//tensorflow/contrib/tensorrt:trt_engine_op_kernel",
+    ]),
 )
 
 cc_library(
@@ -173,7 +212,6 @@ cc_library(
         "//tensorflow/contrib/hadoop:dataset_ops_op_lib",
         "//tensorflow/contrib/input_pipeline:input_pipeline_ops_op_lib",
         "//tensorflow/contrib/layers:sparse_feature_cross_op_op_lib",
-        "//tensorflow/contrib/nccl:nccl_ops_op_lib",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_ops_op_lib",
         "//tensorflow/contrib/rnn:all_ops",
         "//tensorflow/contrib/seq2seq:beam_search_ops_op_lib",
@@ -183,17 +221,33 @@ cc_library(
         "//tensorflow/contrib/text:all_ops",
         "//tensorflow/contrib/tpu:all_ops",
     ] + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
+        "//tensorflow:no_kafka_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/kafka:dataset_ops_op_lib",
-            "//tensorflow/contrib/kinesis:dataset_ops_op_lib",
-            "//tensorflow/contrib/tensorrt:trt_engine_op_op_lib",
         ],
     }) + select({
-        "//tensorflow:with_ignite_support": [
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_aws_support": [],
+        "//conditions:default": [
+            "//tensorflow/contrib/kinesis:dataset_ops_op_lib",
+        ],
+    }) + if_not_windows([
+        "//tensorflow/contrib/tensorrt:trt_engine_op_op_lib",
+    ]) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_ignite_support": [],
+        "//conditions:default": [
             "//tensorflow/contrib/ignite:dataset_ops_op_lib",
         ],
-        "//conditions:default": [],
     }),
 )
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index e71b0e0ae33f9c2dd48643e557447372bc67b3e3..4f1a2a5693235183c8f486817b82c8c81fa389ec 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -21,14 +21,6 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.python.tools import component_api_helper
-component_api_helper.package_hook(
-    parent_package_str=(
-        "tensorflow.contrib"),
-    child_package_str=(
-        "tensorflow_estimator.contrib.estimator"))
-del component_api_helper
-
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import autograph
 from tensorflow.contrib import batching
@@ -70,7 +62,6 @@ from tensorflow.contrib import memory_stats
 from tensorflow.contrib import metrics
 from tensorflow.contrib import mixed_precision
 from tensorflow.contrib import model_pruning
-from tensorflow.contrib import nccl
 from tensorflow.contrib import nn
 from tensorflow.contrib import opt
 from tensorflow.contrib import periodic_resample
diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD
index 881808a98bfd688c2efaa8beb5b8f11a2527fee8..f6c6560c1c354ed8a36b98b1f564835eb9958e55 100644
--- a/tensorflow/contrib/all_reduce/BUILD
+++ b/tensorflow/contrib/all_reduce/BUILD
@@ -9,8 +9,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-
 py_library(
     name = "all_reduce_py",
     srcs = ["__init__.py"],
@@ -29,29 +27,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/nccl:nccl_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-    ],
-)
-
-tf_py_test(
-    name = "all_reduce_test",
-    srcs = ["python/all_reduce_test.py"],
-    additional_deps = [
-        ":all_reduce",
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:state_ops",
+        "//tensorflow/python/distribute:all_reduce",
     ],
 )
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 3b539734a236804026826a8117d9c668c0dd089a..238cdaf8a79812df3f043d9d070bbcfd443f6e1e 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -18,842 +18,5 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import math
-
-from tensorflow.contrib import nccl
-from tensorflow.python.framework import device as device_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-
-
-def _flatten_tensors(tensors):
-  """Check tensors for isomorphism and flatten.
-
-  Args:
-    tensors: list of T `tf.Tensor` which must all have the same shape.
-
-  Returns:
-    tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors
-    shape: the original shape of each element of input tensors
-
-  Raises:
-    ValueError: tensors are empty or non-isomorphic or have unknown shape.
-  """
-  if not tensors:
-    raise ValueError("tensors cannot be empty")
-  shape = tensors[0].shape
-  for tensor in tensors:
-    shape = shape.merge_with(tensor.shape)
-  if not shape.is_fully_defined():
-    raise ValueError("Tensors must have statically known shape.")
-  if len(shape) != 1:
-    reshaped = []
-    for t in tensors:
-      with ops.colocate_with(t):
-        reshaped.append(array_ops.reshape(t, [-1]))
-    tensors = reshaped
-  return tensors, shape
-
-
-def _reshape_tensors(tensors, shape):
-  """Reshape tensors flattened by _flatten_tensors.
-
-  Args:
-    tensors: list of T `tf.Tensor` of identical length 1D tensors.
-    shape: list of integers describing the desired shape.  Product of
-      the elements must equal the length of each tensor.
-
-  Returns:
-    list of T `tf.Tensor` which are the reshaped inputs.
-  """
-  reshaped = []
-  for t in tensors:
-    with ops.colocate_with(t):
-      reshaped.append(array_ops.reshape(t, shape))
-  return reshaped
-
-
-def _padded_split(tensor, pieces):
-  """Like split for 1D tensors but pads-out case where len % pieces != 0.
-
-  Args:
-    tensor: T `tf.Tensor` that must be 1D.
-    pieces: a positive integer specifying the number of pieces into which
-      tensor should be split.
-
-  Returns:
-    list of T `tf.Tensor` of length pieces, which hold the values of
-      thin input tensor, in order.  The final tensor may
-      be zero-padded on the end to make its size equal to those of all
-      of the other tensors.
-
-  Raises:
-    ValueError: The input tensor is not 1D.
-  """
-  shape = tensor.shape
-  if 1 != len(shape):
-    raise ValueError("input tensor must be 1D")
-  tensor_len = shape[0].value
-  with ops.colocate_with(tensor):
-    if tensor_len % pieces != 0:
-      # pad to an even length
-      chunk_size = 1 + tensor_len // pieces
-      if pieces > tensor_len:
-        # This is an edge case that should not come up in practice,
-        # i.e. a different reduction algorithm would be better,
-        # but we'll make it work just for completeness.
-        pad_len = pieces - tensor_len
-        extended_whole = array_ops.concat(
-            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
-        parts = array_ops.split(extended_whole, pieces)
-        return parts, pad_len
-      elif (pieces - 1) * chunk_size >= tensor_len:
-        # Another edge case of limited real interest.
-        pad_len = (pieces * chunk_size) % tensor_len
-        extended_whole = array_ops.concat(
-            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
-        parts = array_ops.split(extended_whole, pieces)
-        return parts, pad_len
-      else:
-        last_chunk_size = tensor_len - (pieces - 1) * chunk_size
-        pad_len = chunk_size - last_chunk_size
-        piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
-        parts = array_ops.split(tensor, piece_lens)
-        parts[-1] = array_ops.concat(
-            [parts[-1], array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
-        return parts, pad_len
-    else:
-      return array_ops.split(tensor, pieces), 0
-
-
-def _strip_padding(tensors, pad_len):
-  """Strip the suffix padding added by _padded_split.
-
-  Args:
-    tensors: list of T `tf.Tensor` of identical length 1D tensors.
-    pad_len: number of elements to be stripped from the end of each tensor.
-
-  Returns:
-    list of T `tf.Tensor` which are the stripped inputs.
-
-  Raises:
-    ValueError: tensors must be a non-empty list of 1D tensors, and
-      each must be longer than pad_len.
-  """
-  if not tensors:
-    raise ValueError("tensors cannot be empty")
-  shape = tensors[0].shape
-  if len(shape) > 1:
-    raise ValueError("tensors must be 1D")
-  prefix_len = int(shape[0] - pad_len)
-  if prefix_len < 0:
-    raise ValueError("pad_len longer than tensor")
-  stripped = []
-  for t in tensors:
-    with ops.colocate_with(t):
-      stripped.append(array_ops.slice(t, [0], [prefix_len]))
-  return stripped
-
-
-def _ragged_split(tensor, pieces):
-  """Like split for 1D tensors but allows case where len % pieces != 0.
-
-  Args:
-    tensor: T `tf.Tensor` that must be 1D.
-    pieces: a positive integer specifying the number of pieces into which
-      tensor should be split.
-
-  Returns:
-    list of T `tf.Tensor` of length pieces, which hold the values of
-      the input tensor, in order.  The final tensor may be shorter
-      than the others, which will all be of equal length.
-
-  Raises:
-    ValueError: input tensor must be 1D.
-  """
-  shape = tensor.shape
-  if 1 != len(shape):
-    raise ValueError("input tensor must be 1D")
-  tensor_len = shape[0].value
-  chunk_size = tensor_len // pieces
-  with ops.colocate_with(tensor):
-    if tensor_len != (pieces * chunk_size):
-      # last piece will be short
-      assert pieces > 1
-      last_chunk_size = tensor_len - ((pieces - 1) * chunk_size)
-      assert last_chunk_size > 0
-      piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
-      return array_ops.split(tensor, piece_lens)
-    else:
-      return array_ops.split(tensor, pieces)
-
-
-def _ring_permutations(num_workers, num_subchunks, gpu_perm):
-  """"Generate an array of device index arrays, one for each subchunk.
-
-  In the basic ring reduction algorithm there are size(T)/num_devices
-  data chunks and each device process one chunk per tick, i.e. sending
-  one chunk and receiving one chunk.  The idea of subchunking is that
-  each device processes num_subchunks smaller data regions per tick,
-  and the ring rank permutation is different for each subchunk index
-  so that a device is potentially sending to and receiving from
-  num_subchunks different other devices at each tick.  Where multiple
-  independent data channels exist between devices, this strategy
-  supplies a method of using them in parallel.
-
-  Args:
-    num_workers: number of worker tasks
-    num_subchunks: number of subchunks into which to divide each per-GPU chunk.
-    gpu_perm: an array of integers in [0, num_gpus-1] giving the default
-      ring order of GPUs at each worker.  Other permutations will be generated
-      by rotating this array and splicing together per-worker instances.
-
-  Raises:
-    ValueError: the number of subchunks may not exceed the number of GPUs.
-
-  Returns:
-    pred_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
-        preceding device in the permutation for that subchunk.  The
-        device index of GPU i at worker j is i + (j * num_gpus).
-    rank_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
-       local rank of device d in the permutation for that subchunk.
-  """
-  num_gpus = len(gpu_perm)
-  devices = num_workers * num_gpus
-  if devices == 0:
-    return [], []
-  if num_subchunks > num_gpus:
-    raise ValueError(
-        "num_subchunks %d must be <= num_gpus %d" % (num_subchunks, num_gpus))
-  rotation_interval = max(1, int(num_gpus / num_subchunks))
-  perms_by_s = []
-  for s in range(0, num_subchunks):
-    full_order = []
-    offset = s * rotation_interval
-    for w in range(0, num_workers):
-      default_order = [(w * num_gpus) + i for i in gpu_perm]
-      dev_order = default_order[offset:] + default_order[:offset]
-      full_order += dev_order
-    perms_by_s.append(full_order)
-  pred_by_s_d = [[-1 for d in range(0, devices)]
-                 for s in range(0, num_subchunks)]
-  rank_by_s_d = [[-1 for d in range(0, devices)]
-                 for s in range(0, num_subchunks)]
-  for s in range(0, num_subchunks):
-    for d in range(0, devices):
-      for t in range(0, devices):
-        if d == perms_by_s[s][t]:
-          rank_by_s_d[s][d] = t
-          pred_by_s_d[s][d] = perms_by_s[s][(t + devices - 1) % devices]
-          break
-  return (pred_by_s_d, rank_by_s_d)
-
-
-def build_ring_all_reduce(input_tensors, num_workers, num_subchunks,
-                          gpu_perm, red_op, un_op=None):
-  """Construct a subgraph performing a ring-style all-reduce of input_tensors.
-
-  Args:
-    input_tensors: a list of T `tf.Tensor` objects, which must all
-      have the same shape and type.
-    num_workers: number of worker tasks spanned by input_tensors.
-    num_subchunks: number of subchunks each device should process in one tick.
-    gpu_perm: a list of ints giving a ring-wise rank ordering of GPUs at
-      each worker.  All workers must have the same number of
-      GPUs with the same rank ordering.  If NVLINK is available, this should
-      be a ring order supported by NVLINK edges.
-    red_op: a binary operator for elementwise reduction.
-    un_op: an optional unary operator to apply to fully reduced values.
-
-  Raises:
-    ValueError: empty input_tensors or they don't all have same
-    size.
-
-  Returns:
-    a list of T `tf.Tensor` identical sum-reductions of input_tensors.
-  """
-  if len(input_tensors) < 2:
-    raise ValueError("input_tensors must be length 2 or longer")
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  devices = [t.device for t in input_tensors]
-  (pred_by_s_d, rank_by_s_d) = _ring_permutations(
-      num_workers, num_subchunks, gpu_perm)
-  chunks_by_dev, pad_len = _build_ring_gather(
-      input_tensors, devices,
-      num_subchunks, pred_by_s_d, rank_by_s_d, red_op)
-  if un_op:
-    chunks_by_dev = _apply_unary_to_chunks(un_op, chunks_by_dev)
-  output_tensors = _build_ring_scatter(pred_by_s_d, rank_by_s_d,
-                                       chunks_by_dev)
-  if pad_len > 0:
-    output_tensors = _strip_padding(output_tensors, pad_len)
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _build_ring_gather(input_tensors, devices, num_subchunks,
-                       pred_by_s_d, rank_by_s_d, red_op):
-  """Construct a subgraph for the first (reduction) pass of ring all-reduce.
-
-  Args:
-    input_tensors: a list of T `tf.Tensor` 1D input tensors of same
-      shape and type.
-    devices: array of device name strings
-    num_subchunks: number of subchunks each device should process in one tick.
-    pred_by_s_d: as produced by _ring_permutations
-    rank_by_s_d: as produced by _ring_permutations
-    red_op: a binary operator for elementwise reduction
-
-  Raises:
-    ValueError: tensors must all be one dimensional.
-
-  Returns:
-    list of list of T `tf.Tensor` of (partially) reduced values where
-    exactly num_subchunks chunks at each device are fully reduced.
-  """
-  num_devices = len(input_tensors)
-  if num_devices == 0:
-    return []
-  if num_devices == 1:
-    return input_tensors
-  shape = input_tensors[0].shape
-  if 1 != len(shape):
-    raise ValueError("input tensors must be 1D")
-  num_chunks = num_devices * num_subchunks
-  num_ticks = num_devices - 1
-  # Initialize chunks_by_dev with splits of the input tensors.
-  chunks_by_dev = []
-  split_pad_len = 0
-  for d in range(0, num_devices):
-    with ops.device(devices[d]):
-      splits, split_pad_len = _padded_split(input_tensors[d], num_chunks)
-      chunks_by_dev.append(splits)
-  # Reduction phase
-  for tick in range(0, num_ticks):
-    # One new partial reduction for every chunk
-    new_partial_reductions = [None for _ in range(0, num_chunks)]
-    # Compute reductions with respect to last tick's values
-    for d in range(0, num_devices):
-      with ops.device(devices[d]):
-        for s in range(0, num_subchunks):
-          rank = rank_by_s_d[s][d]
-          seg_index = (rank + num_devices - (2 + tick)) % num_devices
-          pred_dev = pred_by_s_d[s][d]
-          chunk_index = (seg_index * num_subchunks) + s
-          new_partial_reductions[chunk_index] = red_op(
-              chunks_by_dev[pred_dev][chunk_index],
-              chunks_by_dev[d][chunk_index])
-    # Update chunks_by_dev with the new values at the end of the tick.
-    for d in range(0, num_devices):
-      for s in range(0, num_subchunks):
-        rank = rank_by_s_d[s][d]
-        seg_index = (rank + num_devices - (2 + tick)) % num_devices
-        chunk_index = (seg_index * num_subchunks) + s
-        chunks_by_dev[d][chunk_index] = new_partial_reductions[chunk_index]
-  return chunks_by_dev, split_pad_len
-
-
-def _apply_unary_to_chunks(f, chunks_by_dev):
-  """Apply a unary op to each tensor in chunks_by_dev, on same device.
-
-  Args:
-    f: a unary function over T `tf.Tensor`.
-    chunks_by_dev: list of lists of T `tf.Tensor`.
-
-  Returns:
-    new list of lists of T `tf.Tensor` with the same structure as
-    chunks_by_dev containing the derived tensors.
-  """
-  output = []
-  for x in chunks_by_dev:
-    with ops.colocate_with(x[0]):
-      output.append([f(t) for t in x])
-  return output
-
-
-def _build_ring_scatter(pred_by_s_d, rank_by_s_d,
-                        chunks_by_dev):
-  """Construct subgraph for second (scatter) pass of ring all-reduce.
-
-  Args:
-    pred_by_s_d: as produced by _ring_permutations
-    rank_by_s_d: as produced by _ring_permutations
-    chunks_by_dev: list of list of T `tf.Tensor` indexed by ints
-      (device, chunk)
-
-  Raises:
-    ValueError: chunks_by_dev is not well-formed
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors, one
-    at each device corresponding to the outer dimension of chunks_by_dev.
-  """
-  num_devices = len(chunks_by_dev)
-  num_chunks = len(chunks_by_dev[0])
-  if 0 != num_chunks % num_devices:
-    raise ValueError(
-        "Expect number of chunks per device to be divisible by num_devices")
-  num_subchunks = int(num_chunks / num_devices)
-  num_ticks = num_devices - 1
-  for tick in range(0, num_ticks):
-    passed_values = [None for _ in range(0, num_chunks)]
-    for d in range(0, num_devices):
-      with ops.colocate_with(chunks_by_dev[d][0]):
-        for s in range(0, num_subchunks):
-          rank = rank_by_s_d[s][d]
-          seg_index = (rank + num_devices - (1 + tick)) % num_devices
-          pred_dev = pred_by_s_d[s][d]
-          chunk_index = (seg_index * num_subchunks) + s
-          passed_values[chunk_index] = array_ops.identity(
-              chunks_by_dev[pred_dev][chunk_index])
-    for d in range(0, num_devices):
-      for s in range(0, num_subchunks):
-        rank = rank_by_s_d[s][d]
-        seg_index = (rank + num_devices - (1 + tick)) % num_devices
-        chunk_index = (seg_index * num_subchunks) + s
-        chunks_by_dev[d][chunk_index] = passed_values[chunk_index]
-  # Join chunks at each device.
-  output = []
-  for x in chunks_by_dev:
-    with ops.colocate_with(x[0]):
-      output.append(array_ops.concat(x, 0))
-  return output
-
-
-def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None):
-  """Construct a subgraph for recursive halving-doubling all-reduce.
-
-  The recursive halving-doubling algorithm is described in
-  http://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf
-
-  The concept is to arrange the participating n devices in
-  a linear sequence where devices exchange data pairwise
-  with one other device in each round.  During the gather
-  phase there are lg(n) rounds where devices exchange
-  increasingly smaller sub-tensors with another device
-  at increasingly greater distances, until at the top
-  each device has 1/n of the fully reduced values.  During the
-  scatter phase each device exchanges its fully reduced
-  sub-tensor (which doubles in length at each round)
-  with one other device at increasingly smaller distances
-  until each device has all of the fully reduced values.
-
-  Note: this preliminary version requires that len(input_tensors) be a
-    power of 2.  TODO(tucker): relax this restriction.  Also, the
-    number of elements in each tensor must be divisible by 2^h where h
-    is the number of hops in each phase.  This will also be relaxed in
-    the future with edge-case specific logic.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
-    red_op: a binary elementwise reduction Op.
-    un_op: an optional unary elementwise Op to apply to reduced values.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors, one
-    at each device of input_tensors.
-
-  Raises:
-    ValueError: num_devices not a power of 2, or tensor len not divisible
-    by 2 the proper number of times.
-  """
-  devices = [t.device for t in input_tensors]
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  reduced_shards = _build_recursive_hd_gather(input_tensors, devices, red_op)
-  if un_op:
-    reduced_shards = [un_op(t) for t in reduced_shards]
-  output_tensors = _build_recursive_hd_scatter(reduced_shards, devices)
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _build_recursive_hd_gather(input_tensors, devices, red_op):
-  """Construct the gather phase of recursive halving-doubling all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
-    devices: a list of strings naming the devices hosting input_tensors,
-      which will also be used to host the (partial) reduction values.
-    red_op: a binary elementwise reduction Op.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensor shards.
-
-  Raises:
-    ValueError: num_devices not a power of 2, or tensor len not divisible
-    by 2 the proper number of times.
-  """
-  num_devices = len(devices)
-  num_hops = int(math.log(num_devices, 2))
-  if num_devices != (2 ** num_hops):
-    raise ValueError("num_devices must be a power of 2")
-  chunks = input_tensors
-  for h in range(0, num_hops):
-    span = 2 ** h
-    group_size = span * 2
-    new_chunks = [[] for _ in devices]
-    for d in range(0, num_devices):
-      if (d % group_size) >= (group_size / 2):
-        # skip right half of a pair
-        continue
-      left_dev = devices[d]
-      right_dev = devices[d + span]
-      left_split = array_ops.split(chunks[d], 2)
-      right_split = array_ops.split(chunks[d+span], 2)
-      with ops.device(left_dev):
-        new_chunks[d] = red_op(left_split[0], right_split[0])
-      with ops.device(right_dev):
-        new_chunks[d + span] = red_op(left_split[1], right_split[1])
-    chunks = new_chunks
-  return chunks
-
-
-def _build_recursive_hd_scatter(input_tensors, devices):
-  """Construct the scatter phase of recursive halving-doublng all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` that are fully-reduced shards.
-    devices: a list of strings naming the devices on which the reconstituted
-      full tensors should be placed.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors.
-  """
-  num_devices = len(devices)
-  num_hops = int(math.log(num_devices, 2))
-  assert num_devices == (2 ** num_hops), "num_devices must be a power of 2"
-  chunks = input_tensors
-  for h in reversed(range(0, num_hops)):
-    span = 2 ** h
-    group_size = span * 2
-    new_chunks = [[] for _ in devices]
-    for d in range(0, num_devices):
-      if (d % group_size) >= (group_size / 2):
-        # skip right half of a pair
-        continue
-      left_idx = d
-      right_idx = d + span
-      left_dev = devices[left_idx]
-      right_dev = devices[right_idx]
-      with ops.device(left_dev):
-        new_chunks[left_idx] = array_ops.concat([chunks[left_idx],
-                                                 chunks[right_idx]], 0)
-      with ops.device(right_dev):
-        new_chunks[right_idx] = array_ops.concat([chunks[left_idx],
-                                                  chunks[right_idx]], 0)
-    chunks = new_chunks
-  return chunks
-
-
-def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None):
-  """Construct a subgraph for shuffle all-reduce.
-
-  Shuffle reduce is essentially the algorithm implemented when using
-  parameter servers.  Suppose tensor length is n, there are d devices
-  and g gather shards.  Each device sends a n/g length sub-tensor to
-  each gather shard.  The gather shards perform a reduction across d
-  fragments, then broadcast the result back to each device.  The
-  devices then join the g fully reduced fragments they receive from
-  the shards.  The gather shards could perform d-1 pairwise
-  reductions, or one d-way reduction.  The first is better where
-  reduction Op time is low compared to transmission time, the second
-  better in the other case.
-
-  Args:
-    input_tensors: list of T @(tf.Tensor} values to be reduced.
-    gather_devices: list of names of devices on which reduction shards
-      should be placed.
-    red_op: an n-array elementwise reduction Op
-    un_op: optional elementwise unary Op to be applied to fully-reduced values.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors.
-  """
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  dst_devices = [t.device for t in input_tensors]
-  reduced_shards = _build_shuffle_gather(input_tensors, gather_devices,
-                                         red_op, un_op)
-  output_tensors = _build_shuffle_scatter(reduced_shards, dst_devices)
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None):
-  """Construct the gather (concentrate and reduce) phase of shuffle all-reduce.
-
-  Args:
-    input_tensors: list of T @(tf.Tensor} values to be reduced.
-    gather_devices: list of names of devices on which reduction shards
-      should be placed.
-    red_op: the binary reduction Op
-    un_op: optional elementwise unary Op to be applied to fully-reduced values.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced shards.
-
-  Raises:
-    ValueError: inputs not well-formed.
-  """
-  num_source_devices = len(input_tensors)
-  num_gather_devices = len(gather_devices)
-  shape = input_tensors[0].shape
-  if len(shape) != 1:
-    raise ValueError("input_tensors must be 1D")
-  shards_by_source = []
-  for d in range(0, num_source_devices):
-    with ops.colocate_with(input_tensors[d]):
-      shards_by_source.append(
-          _ragged_split(input_tensors[d], num_gather_devices))
-  reduced_shards = []
-  for d in range(0, num_gather_devices):
-    with ops.device(gather_devices[d]):
-      values = [s[d] for s in shards_by_source]
-      red_shard = red_op(values)
-      if un_op:
-        red_shard = un_op(red_shard)
-      reduced_shards.append(red_shard)
-  return reduced_shards
-
-
-def _build_shuffle_scatter(reduced_shards, dst_devices):
-  """Build the scatter phase of shuffle all-reduce.
-
-  Args:
-    reduced_shards:  list of T @(tf.Tensor} fully reduced shards
-    dst_devices: list of names of devices at which the fully-reduced value
-      should be reconstituted.
-
-  Returns:
-    list of T `tf.Tensor` scattered tensors.
-  """
-  num_devices = len(dst_devices)
-  out_tensors = []
-  for d in range(0, num_devices):
-    with ops.device(dst_devices[d]):
-      out_tensors.append(array_ops.concat(reduced_shards, 0))
-  return out_tensors
-
-
-def _split_by_task(devices, values):
-  """Partition devices and values by common task.
-
-  Args:
-    devices: list of device name strings
-    values: list of T `tf.tensor` of same length as devices.
-
-  Returns:
-    (per_task_devices, per_task_values) where both values are
-    lists of lists with isomorphic structure: the outer list is
-    indexed by task, and the inner list has length of the number
-    of values belonging to that task.  per_task_devices contains
-    the specific devices to which the values are local, and
-    per_task_values contains the corresponding values.
-
-  Raises:
-    ValueError: devices must be same length as values.
-  """
-  num_devices = len(devices)
-  if num_devices != len(values):
-    raise ValueError("len(devices) must equal len(values)")
-  per_task_devices = collections.OrderedDict()
-  per_task_values = collections.OrderedDict()
-  for d in range(num_devices):
-    d_spec = device_lib.DeviceSpec.from_string(devices[d])
-    if not hasattr(d_spec, "task") or d_spec.task is None:
-      assert False, "failed to parse device %s" % devices[d]
-    index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task)
-    if index not in per_task_devices:
-      per_task_devices[index] = []
-      per_task_values[index] = []
-    per_task_devices[index].append(devices[d])
-    per_task_values[index].append(values[d])
-
-  return (list(per_task_devices.values()), list(per_task_values.values()))
-
-
-def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
-  """Build a subgraph that does one full all-reduce, using NCCL.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` of same-shape and type values to
-      be reduced.
-    red_op: binary elementwise reduction operator.  Must be one of
-      {tf.add}
-    un_op: optional unary elementwise Op to apply to fully-reduce values.
-
-  Returns:
-    list of T `tf.Tensor` of reduced values.
-
-  Raises:
-    ValueError: red_op not supported.
-  """
-  if red_op == math_ops.add:
-    output_tensors = nccl.all_sum(input_tensors)
-  else:
-    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
-  if un_op:
-    un_op_wrapped = []
-    for t in output_tensors:
-      with ops.colocate_with(t):
-        un_op_wrapped.append(un_op(t))
-    output_tensors = un_op_wrapped
-  return output_tensors
-
-
-def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
-  """Construct a subgraph for NCCL hybrid all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` of same-shape and type values to
-      be reduced.
-    red_op: binary elementwise reduction operator.
-    upper_level_f: function for reducing one value per worker, across
-      workers.
-
-  Returns:
-    list of T `tf.Tensor` of reduced values.
-
-  Raises:
-    ValueError: inputs not well-formed.
-  """
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  devices = [t.device for t in input_tensors]
-  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
-  num_workers = len(per_worker_devices)
-  up_values = [None for w in range(0, num_workers)]
-  up_devices = up_values[:]
-  down_values = up_values[:]
-  # First stage: reduce within each worker using NCCL
-  for w in range(0, num_workers):
-    worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
-    # NOTE: these reductions will not run to completion unless
-    # every output value is used.  Since we only need one, we
-    # need to put control dependencies on the rest.
-    with ops.control_dependencies(worker_values):
-      with ops.device(worker_values[0].device):
-        up_values[w] = array_ops.identity(worker_values[0])
-      up_devices[w] = per_worker_devices[w][0]
-  # Second stage: Apply upper_level_f to reduce across first device at
-  # each worker
-  level_2_output = upper_level_f(up_values)
-  # Third stage: propagate within each worker using NCCL Broadcast
-  for w in range(0, num_workers):
-    dst_tensors = []
-    with ops.device(per_worker_devices[w][0]):
-      broadcast_src = nccl.broadcast(array_ops.identity(level_2_output[w]))
-    for d in per_worker_devices[w]:
-      with ops.device(d):
-        dst_tensors.append(array_ops.identity(broadcast_src))
-    down_values[w] = dst_tensors
-  output_tensors = [v for sublist in down_values for v in sublist]
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _reduce_non_singleton(input_tensors, red_f, un_op):
-  """If input_tensors has more than one element apply red_f, else apply un_op."""
-  if len(input_tensors) > 1:
-    return red_f(input_tensors)
-  else:
-    if not un_op:
-      return input_tensors
-    output_tensors = []
-    for t in input_tensors:
-      with ops.colocate_with(t):
-        output_tensors.append(un_op(t))
-    return output_tensors
-
-
-def build_nccl_then_ring(input_tensors, subdiv, red_op, un_op=None):
-  """Construct hybrid of NCCL within workers, Ring across workers."""
-  def upper_builder(y):
-    return build_ring_all_reduce(y, len(y), subdiv, [0], red_op, un_op)
-  def upper_level_f(x):
-    return _reduce_non_singleton(x, upper_builder, un_op)
-  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
-
-
-def build_nccl_then_recursive_hd(input_tensors, red_op, un_op=None):
-  """Construct hybrid of NCCL within workers, Recursive-HD across workers."""
-  upper_level_f = lambda x: build_recursive_hd_all_reduce(x, red_op, un_op)
-  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
-
-
-def build_nccl_then_shuffle(input_tensors, gather_devices, nccl_red_op,
-                            shuffle_red_op, un_op=None):
-  """Construct hybrid of NCCL within workers, Shuffle across workers."""
-  upper_level_f = lambda x: build_shuffle_all_reduce(x, gather_devices,
-                                                     shuffle_red_op, un_op)
-  return _build_nccl_hybrid(input_tensors, nccl_red_op, upper_level_f)
-
-
-def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f):
-  """Construct a subgraph for Shuffle hybrid all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` of same-shape and type values to
-      be reduced.
-    gather_devices: list of device names on which to host gather shards.
-    red_op: binary elementwise reduction operator.
-    upper_level_f: function for reducing one value per worker, across
-      workers.
-
-  Returns:
-    list of T `tf.Tensor` of reduced values.
-
-  Raises:
-    ValueError: inputs not well-formed.
-  """
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  # First stage, reduce across each worker using gather_devices.
-  devices = [t.device for t in input_tensors]
-  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
-  num_workers = len(per_worker_devices)
-  up_values = []
-  if len(gather_devices) != num_workers:
-    raise ValueError("For shuffle hybrid, gather_devices must contain one "
-                     "device per worker. ")
-  for w in range(0, num_workers):
-    reduced_shards = _build_shuffle_gather(
-        per_worker_values[w], [gather_devices[w]], red_op)
-    up_values.append(reduced_shards[0])
-  # Second stage, apply upper_level_f.
-  level_2_output = upper_level_f(up_values)
-  # Third stage, apply shuffle scatter at each worker.
-  output_tensors = []
-  for w in range(0, num_workers):
-    output_tensors += _build_shuffle_scatter(
-        [level_2_output[w]], per_worker_devices[w])
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def build_shuffle_then_ring(input_tensors, gather_devices, subdiv,
-                            red_n_op, red_op, un_op=None):
-  """Construct hybrid of Shuffle within workers, Ring across workers."""
-  def upper_builder(tensors):
-    return build_ring_all_reduce(tensors, len(tensors), subdiv, [0],
-                                 red_op, un_op)
-  def upper_level_f(tensors):
-    return _reduce_non_singleton(tensors, upper_builder, un_op)
-  return _build_shuffle_hybrid(
-      input_tensors, gather_devices, red_n_op, upper_level_f)
-
-
-def build_shuffle_then_shuffle(input_tensors, first_gather_devices,
-                               second_gather_devices, red_op, un_op=None):
-  """Construct hybrid of Shuffle within workers, Shuffle across workers."""
-  def upper_builder(tensors):
-    return build_shuffle_all_reduce(tensors, second_gather_devices,
-                                    red_op, un_op)
-  def upper_level_f(tensors):
-    return _reduce_non_singleton(tensors, upper_builder, un_op)
-  return _build_shuffle_hybrid(
-      input_tensors, first_gather_devices, red_op, upper_level_f)
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.python.distribute.all_reduce import *
diff --git a/tensorflow/contrib/android/README.md b/tensorflow/contrib/android/README.md
index db37bcf73d144eb81c32a461a276d10be7e2d193..27f8ac21323e6eb21a80dfab4d2239738c2fcf1e 100644
--- a/tensorflow/contrib/android/README.md
+++ b/tensorflow/contrib/android/README.md
@@ -52,6 +52,7 @@ Then, to build the native TF library:
 bazel build -c opt //tensorflow/contrib/android:libtensorflow_inference.so \
    --crosstool_top=//external:android/crosstool \
    --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+   --cxxopt=-std=c++11 \
    --cpu=armeabi-v7a
 ```
 
diff --git a/tensorflow/contrib/android/cmake/build.gradle b/tensorflow/contrib/android/cmake/build.gradle
index 17a57b99fd6c9efc09bda0ce1249b1f51bd5af5c..ddec08894f34f96b080610f1d27a6a436f7ffa91 100644
--- a/tensorflow/contrib/android/cmake/build.gradle
+++ b/tensorflow/contrib/android/cmake/build.gradle
@@ -22,8 +22,8 @@ android {
         }
         externalNativeBuild {
             cmake {
-                arguments '-DANDROID_TOOLCHAIN=gcc',
-                          '-DANDROID_STL=gnustl_static'
+                arguments '-DANDROID_TOOLCHAIN=clang',
+                          '-DANDROID_STL=c++_static'
             }
         }
     }
@@ -70,7 +70,7 @@ if (ndkDir == null || ndkDir == "") {
     ndkDir = System.getenv('ANDROID_NDK_HOME')
 }
 
-if(! Os.isFamily(Os.FAMILY_WINDOWS)) {
+if (!Os.isFamily(Os.FAMILY_WINDOWS)) {
     // This script is for non-Windows OS. For Windows OS, MANUALLY build
     // (or copy the built) libs/headers to the
     //    ${TENSORFLOW_ROOT_DIR}/tensorflow/contrib/makefile/gen
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/BUILD b/tensorflow/contrib/autograph/examples/benchmarks/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6d2d70c99b4cc804f2c8bf57afdc8c11f1f73516
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/benchmarks/BUILD
@@ -0,0 +1,36 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark")
+
+py_library(
+    name = "benchmark_base",
+    srcs = [
+        "benchmark_base.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_test(
+    name = "cartpole_benchmark",
+    size = "enormous",
+    srcs = ["cartpole_benchmark.py"],
+    tags = [
+        "local",
+        "manual",
+        "no_oss",
+        "notap",
+        "nozapfhahn",
+    ],
+    deps = [
+        ":benchmark_base",
+        # Note: required gym dependency may need to be added here.
+    ],
+)
+
+tf_py_logged_benchmark(
+    name = "cartpole_logged_benchmark",
+    target = "//tensorflow/contrib/autograph/examples/benchmarks:cartpole_benchmark",
+)
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..93c694849c4dc3faca71e7f9d8614649a7784f99
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common benchmarking code.
+
+See https://www.tensorflow.org/community/benchmarks for usage.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+import tensorflow as tf
+
+
+class ReportingBenchmark(tf.test.Benchmark):
+  """Base class for a benchmark that reports general performance metrics.
+
+  Subclasses only need to call one of the _profile methods, and optionally
+  report_results.
+  """
+
+  def time_execution(self, name, target, iters, warm_up_iters=5):
+    for _ in range(warm_up_iters):
+      target()
+
+    all_times = []
+    for _ in range(iters):
+      iter_time = time.time()
+      target()
+      all_times.append(time.time() - iter_time)
+
+    avg_time = np.average(all_times)
+
+    extras = dict()
+    extras['all_times'] = all_times
+
+    if isinstance(name, tuple):
+      extras['name'] = name
+      name = '_'.join(str(piece) for piece in name)
+
+    self.report_benchmark(
+        iters=iters, wall_time=avg_time, name=name, extras=extras)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f553be58e94f11e45f0697558348fbbd26bfb91
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py
@@ -0,0 +1,492 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A basic RL cartpole benchmark.
+
+The RL model uses the OpenAI Gym environment to train a simple network using
+the policy gradients method. The training scales the gradients for each step
+by the episode's cumulative discounted reward and averages these gradients over
+a fixed number of games before applying the optimization step.
+
+For benchmarking purposes, we replace the OpenAI Gym environment to a fake
+that returns random actions and rewards and never ends the episode. This way
+the benchmarks compare the same amount of computation at each step.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gym
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.contrib import eager
+from tensorflow.contrib.autograph.examples.benchmarks import benchmark_base
+from tensorflow.python import autograph as ag
+from tensorflow.python.eager import context
+
+#
+# AutoGraph implementation
+#
+
+
+@ag.convert()
+def graph_append_discounted_rewards(destination, rewards, discount_rate):
+  """Discounts episode rewards and appends them to destination."""
+  ag.set_element_type(rewards, tf.float32)
+
+  cdr = 0.0
+  reverse_discounted = []
+  ag.set_element_type(reverse_discounted, tf.float32)
+
+  for i in range(len(rewards) - 1, -1, -1):
+    cdr = cdr * discount_rate + rewards[i]
+    cdr.set_shape(())
+    reverse_discounted.append(cdr)
+
+  retval = destination
+  # Note: AutoGraph doesn't yet support reversed() so we use a loop instead.
+  for i in range(len(reverse_discounted) - 1, -1, -1):
+    retval.append(reverse_discounted[i])
+
+  return retval
+
+
+class GraphPolicyNetwork(tf.keras.Model):
+  """Policy network for the cart-pole reinforcement learning problem.
+
+  The forward path of the network takes an observation from the cart-pole
+  environment (length-4 vector) and outputs an action.
+  """
+
+  def __init__(self, hidden_size):
+    super(GraphPolicyNetwork, self).__init__()
+    self._hidden_layer = tf.keras.layers.Dense(
+        hidden_size, activation=tf.nn.elu)
+    self._output_layer = tf.keras.layers.Dense(1)
+
+  def call(self, inputs):
+    """Calculates logits and action.
+
+    Args:
+      inputs: Observations from a step in the cart-pole environment, of shape
+        `(batch_size, input_size)`
+
+    Returns:
+      logits: the logits output by the output layer. This can be viewed as the
+        likelihood vales of choosing the left (0) action. Shape:
+        `(batch_size, 1)`.
+      actions: randomly selected actions ({0, 1}) based on the logits. Shape:
+        `(batch_size, 1)`.
+    """
+    hidden = self._hidden_layer(inputs)
+    logits = self._output_layer(hidden)
+
+    left_prob = tf.nn.sigmoid(logits)
+    action_probs = tf.concat([left_prob, 1.0 - left_prob], 1)
+
+    actions = tf.multinomial(tf.log(action_probs), 1)
+    return logits, actions
+
+  # TODO(mdan): Move this method out of the class.
+  @ag.convert()
+  def train(self, cart_pole_env, optimizer, discount_rate, num_games,
+            max_steps_per_game):
+    var_list = tf.trainable_variables()
+    grad_list = [
+        tf.TensorArray(tf.float32, 0, dynamic_size=True) for _ in var_list
+    ]
+
+    step_counts = []
+    discounted_rewards = []
+    ag.set_element_type(discounted_rewards, tf.float32)
+    ag.set_element_type(step_counts, tf.int32)
+
+    # Note: we use a shared object, cart_pole_env here. Because calls to the
+    # object's method are made through py_func, TensorFlow cannot detect its
+    # data dependencies. Hence we must manually synchronize access to it
+    # and ensure the control dependencies are set in such a way that
+    # calls to reset(), take_one_step, etc. are made in the correct order.
+    sync_counter = tf.constant(0)
+
+    for _ in tf.range(num_games):
+      with tf.control_dependencies([sync_counter]):
+        obs = cart_pole_env.reset()
+        with tf.control_dependencies([obs]):
+          sync_counter += 1
+
+        game_rewards = []
+        ag.set_element_type(game_rewards, tf.float32)
+
+        for step in tf.range(max_steps_per_game):
+          logits, actions = self(obs)  # pylint:disable=not-callable
+          logits = tf.reshape(logits, ())
+          actions = tf.reshape(actions, ())
+
+          labels = 1.0 - tf.cast(actions, tf.float32)
+          loss = tf.nn.sigmoid_cross_entropy_with_logits(
+              labels=labels, logits=logits)
+          grads = tf.gradients(loss, var_list)
+
+          for i in range(len(grads)):
+            grad_list[i].append(grads[i])
+
+          with tf.control_dependencies([sync_counter]):
+            obs, reward, done = cart_pole_env.step(actions)
+            with tf.control_dependencies([obs]):
+              sync_counter += 1
+            obs = tf.reshape(obs, (1, 4))
+
+          game_rewards.append(reward)
+          if reward < 0.1 or done:
+            step_counts.append(step + 1)
+            break
+
+        discounted_rewards = graph_append_discounted_rewards(
+            discounted_rewards, game_rewards, discount_rate)
+
+    discounted_rewards = ag.stack(discounted_rewards)
+    discounted_rewards.set_shape((None,))
+    mean, variance = tf.nn.moments(discounted_rewards, [0])
+    normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance)
+
+    for i in range(len(grad_list)):
+      g = ag.stack(grad_list[i])
+
+      # This block just adjusts the shapes to match for multiplication.
+      r = normalized_rewards
+      if r.shape.ndims < g.shape.ndims:
+        r = tf.expand_dims(r, -1)
+      if r.shape.ndims < g.shape.ndims:
+        r = tf.expand_dims(r, -1)
+
+      grad_list[i] = tf.reduce_mean(g * r, axis=0)
+
+    optimizer.apply_gradients(
+        zip(grad_list, var_list), global_step=tf.train.get_global_step())
+
+    return ag.stack(step_counts)
+
+
+@ag.convert()
+def graph_train_model(policy_network, cart_pole_env, optimizer, iterations):
+  """Trains the policy network for a given number of iterations."""
+  i = tf.constant(0)
+  mean_steps_per_iteration = []
+  ag.set_element_type(mean_steps_per_iteration, tf.int32)
+
+  while i < iterations:
+    steps_per_game = policy_network.train(
+        cart_pole_env,
+        optimizer,
+        discount_rate=0.95,
+        num_games=20,
+        max_steps_per_game=200)
+    mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game))
+    i += 1
+
+  return ag.stack(mean_steps_per_iteration)
+
+
+class GraphGymCartpoleEnv(object):
+  """An env backed by OpenAI Gym's CartPole environment.
+
+  Used to confirm a functional model only.
+  """
+
+  def __init__(self):
+    cart_pole_env = gym.make('CartPole-v1')
+    cart_pole_env.seed(0)
+    cart_pole_env.reset()
+    self.env = cart_pole_env
+
+  def reset(self):
+    obs = ag.utils.wrap_py_func(self.env.reset, tf.float64, ())
+    obs = tf.reshape(obs, (1, 4))
+    obs = tf.cast(obs, tf.float32)
+    return obs
+
+  def step(self, actions):
+
+    def take_one_step(actions):
+      obs, reward, done, _ = self.env.step(actions)
+      obs = obs.astype(np.float32)
+      reward = np.float32(reward)
+      return obs, reward, done
+
+    return ag.utils.wrap_py_func(take_one_step,
+                                 (tf.float32, tf.float32, tf.bool), (actions,))
+
+
+class GraphRandomCartpoleEnv(object):
+  """An environment that returns random actions and never finishes.
+
+  Used during benchmarking, it will cause training to run a constant number of
+  steps.
+  """
+
+  def reset(self):
+    return tf.random.normal((1, 4))
+
+  def step(self, actions):
+    with tf.control_dependencies([actions]):
+      random_obs = tf.random.normal((1, 4))
+      fixed_reward = tf.constant(0.001)
+      done = tf.constant(False)
+      return random_obs, fixed_reward, done
+
+
+#
+# Eager implementation
+#
+
+
+def eager_append_discounted_rewards(discounted_rewards, rewards, discount_rate):
+  cdr = 0.0
+  reverse_discounted = []
+
+  for i in range(len(rewards) - 1, -1, -1):
+    cdr = cdr * discount_rate + rewards[i]
+    reverse_discounted.append(cdr)
+
+  discounted_rewards.extend(reversed(reverse_discounted))
+  return discounted_rewards
+
+
+class EagerPolicyNetwork(tf.keras.Model):
+  """Policy network for the cart-pole reinforcement learning problem.
+
+  The forward path of the network takes an observation from the cart-pole
+  environment (length-4 vector) and outputs an action.
+  """
+
+  def __init__(self, hidden_size):
+    super(EagerPolicyNetwork, self).__init__()
+    self._hidden_layer = tf.keras.layers.Dense(
+        hidden_size, activation=tf.nn.elu)
+    self._output_layer = tf.keras.layers.Dense(1)
+
+  def call(self, inputs):
+    """Calculates logits and action.
+
+    Args:
+      inputs: Observations from a step in the cart-pole environment, of shape
+        `(batch_size, input_size)`
+
+    Returns:
+      logits: the logits output by the output layer. This can be viewed as the
+        likelihood vales of choosing the left (0) action. Shape:
+        `(batch_size, 1)`.
+      actions: randomly selected actions ({0, 1}) based on the logits. Shape:
+        `(batch_size, 1)`.
+    """
+    hidden = self._hidden_layer(inputs)
+    logits = self._output_layer(hidden)
+
+    left_prob = tf.nn.sigmoid(logits)
+    action_probs = tf.concat([left_prob, 1.0 - left_prob], 1)
+
+    self._grad_fn = eager.implicit_gradients(
+        self._get_cross_entropy_and_save_actions)
+
+    actions = tf.multinomial(tf.log(action_probs), 1)
+    return logits, actions
+
+  def _get_cross_entropy_and_save_actions(self, inputs):
+    logits, actions = self(inputs)  # pylint:disable=not-callable
+    self._current_actions = actions
+    labels = 1.0 - tf.cast(actions, tf.float32)
+    return tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
+
+  def train(self, cart_pole_env, optimizer, discount_rate, num_games,
+            max_steps_per_game):
+    grad_list = None
+
+    step_counts = []
+    discounted_rewards = []
+
+    for _ in range(num_games):
+      obs = cart_pole_env.reset()
+
+      game_rewards = []
+
+      for step in range(max_steps_per_game):
+        grads_and_vars = self._grad_fn(tf.constant([obs], dtype=tf.float32))
+        grads, var_list = zip(*grads_and_vars)
+        actions = self._current_actions.numpy()[0][0]
+
+        if grad_list is None:
+          grad_list = [[g] for g in grads]
+        else:
+          for i in range(len(grads)):
+            grad_list[i].append(grads[i])
+
+        obs, reward, done = cart_pole_env.step(actions)
+
+        game_rewards.append(reward)
+        if reward < 0.1 or done:
+          step_counts.append(step + 1)
+          break
+
+      discounted_rewards = eager_append_discounted_rewards(
+          discounted_rewards, game_rewards, discount_rate)
+
+    discounted_rewards = tf.stack(discounted_rewards)
+    mean, variance = tf.nn.moments(discounted_rewards, [0])
+    normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance)
+
+    for i in range(len(grad_list)):
+      g = tf.stack(grad_list[i])
+
+      r = normalized_rewards
+      while r.shape.ndims < g.shape.ndims:
+        r = tf.expand_dims(r, -1)
+
+      grad_list[i] = tf.reduce_mean(g * r, axis=0)
+
+    optimizer.apply_gradients(
+        zip(grad_list, var_list), global_step=tf.train.get_global_step())
+
+    return tf.stack(step_counts)
+
+
+def eager_train_model(policy_network, cart_pole_env, optimizer, iterations):
+  """Trains the policy network for a given number of iterations."""
+  mean_steps_per_iteration = []
+
+  for _ in range(iterations):
+    steps_per_game = policy_network.train(
+        cart_pole_env,
+        optimizer,
+        discount_rate=0.95,
+        num_games=20,
+        max_steps_per_game=200)
+    mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game))
+
+  return mean_steps_per_iteration
+
+
+class EagerGymCartpoleEnv(object):
+  """An env backed by OpenAI Gym's CartPole environment.
+
+  Used to confirm a functional model only.
+  """
+
+  def __init__(self):
+    cart_pole_env = gym.make('CartPole-v1')
+    cart_pole_env.seed(0)
+    cart_pole_env.reset()
+    self.env = cart_pole_env
+
+  def reset(self):
+    return self.env.reset()
+
+  def step(self, actions):
+    obs, reward, done, _ = self.env.step(actions)
+    return obs, reward, done
+
+
+class EagerRandomCartpoleEnv(object):
+  """An environment that returns random actions and never finishes.
+
+  Used during benchmarking, it will cause training to run a constant number of
+  steps.
+  """
+
+  def reset(self):
+    return np.random.normal(size=(4,))
+
+  def step(self, actions):
+    with tf.control_dependencies([actions]):
+      random_obs = np.random.normal(size=(4,))
+      fixed_reward = 0.001
+      done = False
+      return random_obs, fixed_reward, done
+
+
+def graph_demo_training():
+  """Not used in the benchmark. Used to confirm a functional model."""
+  with tf.Graph().as_default():
+    tf.set_random_seed(0)
+
+    network = GraphPolicyNetwork(hidden_size=5)
+    network.build((1, 4))
+    env = GraphGymCartpoleEnv()
+    opt = tf.train.AdamOptimizer(0.05)
+
+    train_ops = graph_train_model(network, env, opt, iterations=5)
+
+    with tf.Session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(tf.local_variables_initializer())
+      steps_per_iteration = sess.run(train_ops)
+      for i, steps in enumerate(steps_per_iteration):
+        print('Step {} iterations: {}'.format(i, steps))
+
+
+def eager_demo_training():
+  with context.eager_mode():
+    network = EagerPolicyNetwork(hidden_size=5)
+    network.build((1, 4))
+    env = EagerGymCartpoleEnv()
+    opt = tf.train.AdamOptimizer(0.05)
+
+    steps_per_iteration = eager_train_model(network, env, opt, iterations=5)
+    for i, steps in enumerate(steps_per_iteration):
+      print('Step {} iterations: {}'.format(i, steps))
+
+
+class RLCartPoleBenchmark(benchmark_base.ReportingBenchmark):
+  """Actual benchmark.
+
+  Trains the RL agent a fixed number of times, on random environments that
+  result in constant number of steps.
+  """
+
+  def benchmark_cartpole(self):
+
+    def train_session(sess, ops):
+      return lambda: sess.run(ops)
+
+    def train_eager(network, env, opt):
+      return lambda: eager_train_model(network, env, opt, iterations=10)
+
+    for model_size in (10, 100, 1000):
+      with tf.Graph().as_default():
+        network = GraphPolicyNetwork(hidden_size=model_size)
+        network.build((1, 4))
+        env = GraphRandomCartpoleEnv()
+        opt = tf.train.AdamOptimizer(0.05)
+        train_ops = graph_train_model(network, env, opt, iterations=10)
+
+        with tf.Session() as sess:
+          sess.run(tf.global_variables_initializer())
+          sess.run(tf.local_variables_initializer())
+
+          self.time_execution(('cartpole', 'autograph', model_size),
+                              train_session(sess, train_ops), 20)
+
+      with context.eager_mode():
+        network = EagerPolicyNetwork(hidden_size=model_size)
+        network.build((1, 4))
+        env = EagerRandomCartpoleEnv()
+        opt = tf.train.AdamOptimizer(0.05)
+
+        self.time_execution(('cartpole', 'eager', model_size),
+                            train_eager(network, env, opt), 20)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
index 55faad983f2bcf2f3fa633669bd371608e2e925b..3e4d0dc1cec76b068c1c846eb476eec615e4f613 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -18,8 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import function
+from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import gen_batch_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
@@ -101,12 +102,15 @@ def batch_function(num_batch_threads,
   def decorator(fn):  # pylint: disable=missing-docstring
 
     def decorated(*args):  # pylint: disable=missing-docstring
-      types = [arg.dtype for arg in args]
 
-      @function.Defun(*types)
+      @function.defun()
       def computation(*computation_args):
         return fn(*computation_args)
 
+      computation = computation.get_concrete_function(
+          *[tensor_spec.TensorSpec(dtype=x.dtype, shape=x.shape, name=str(i))
+            for i, x in enumerate(args)])
+
       with ops.name_scope("batch") as name:
         for a in args:
           if not isinstance(a, ops.Tensor):
@@ -123,7 +127,7 @@ def batch_function(num_batch_threads,
             f=computation,
             in_tensors=list(args),
             captured_tensors=computation.captured_inputs,
-            Tout=[o.type for o in computation.definition.signature.output_arg])
+            Tout=[o.dtype for o in computation.outputs])
 
     return decorated
 
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
index 01ee8703a93836d607ee9b765c51c79fe3bb974f..9109b9c1c91cefa4c52bad49de23336a6e05e1ef 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops_test.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
@@ -219,6 +219,7 @@ class BatchOpsTest(test.TestCase):
 
       @batch_ops.batch_function(1, 10, 100000)
       def computation(in_t):
+        self.assertTrue(in_t.shape is not None)
         return in_t + 1
 
       inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py
index 41a8c920fc4e81af90f4c94a149d8c404c58b747..493046b39907971e92f91ecc60d375ea273ff1d2 100644
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -14,6 +14,8 @@
 # ==============================================================================
 """Ops for representing Bayesian computation.
 
+Use [tfp](/probability/api_docs/python/tfp) instead.
+
 ## This package provides classes for Bayesian computation with TensorFlow.
 """
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py
index 13215ffabf3a956d3f83697f867457b2fa72e7c9..8b6ed9f041b89a0da02a505ec261bca82b094f74 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py
@@ -81,7 +81,7 @@ class ExpectationImportanceSampleTest(test.TestCase):
       # Compute E_p[X_1 * X_2 > 0], with X_i the ith component of X ~ p(x).
       # Should equal 1/2 because p is a spherical Gaussian centered at (0, 0).
       def indicator(x):
-        x1_times_x2 = math_ops.reduce_prod(x, reduction_indices=[-1])
+        x1_times_x2 = math_ops.reduce_prod(x, axis=[-1])
         return 0.5 * (math_ops.sign(x1_times_x2) + 1.0)
 
       prob = mc.expectation_importance_sampler(
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py
index 68fa415eeaf1d1ae7c2ecf1be1c300eddbfa4e69..28a829d87ddecc4a147c588b5b0536b44db8393f 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Monte Carlo integration and helpers."""
+"""Monte Carlo integration and helpers.
+
+Use [tfp.monte_carlo](/probability/api_docs/python/tfp/monte_carlo) instead.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 18d40fc1dff8e7c9aefffbe3ceba770598a42096..e83a54851195708eb7e6412b7400236f4bc06e6b 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -353,12 +353,12 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
 
 def _sample_mean(values):
   """Mean over sample indices.  In this module this is always [0]."""
-  return math_ops.reduce_mean(values, reduction_indices=[0])
+  return math_ops.reduce_mean(values, axis=[0])
 
 
 def _sample_max(values):
   """Max over sample indices.  In this module this is always [0]."""
-  return math_ops.reduce_max(values, reduction_indices=[0])
+  return math_ops.reduce_max(values, axis=[0])
 
 
 def _get_samples(dist, z, n, seed):
diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md
index 2c44abed5e1955cc666273e97e6b2378766f13d2..79052bee35c7895cb4048b10c1f73acb036d1587 100644
--- a/tensorflow/contrib/bigtable/README.md
+++ b/tensorflow/contrib/bigtable/README.md
@@ -51,25 +51,18 @@ BIGTABLE_TABLE_NAME = '<FILL_ME_IN>'
 PREFIX = 'train-'
 
 def main():
+  tf.enable_eager_execution()
+
   client = tf.contrib.cloud.BigtableClient(GCP_PROJECT_ID, BIGTABLE_INSTANCE_ID)
   table = client.table(BIGTABLE_TABLE_NAME)
   dataset = table.keys_by_prefix_dataset(PREFIX)
-  iterator = dataset.make_initializable_iterator()
-  get_next_op = iterator.get_next()
 
-  with tf.Session() as sess:
-    print('Initializing the iterator.')
-    sess.run(iterator.initializer)
-    print('Retrieving rows:')
-    row_index = 0
-    while True:
-      try:
-        row_key = sess.run(get_next_op)
-        print('Row key %d: %s' % (row_index, row_key))
-        row_index += 1
-      except tf.errors.OutOfRangeError:
-        print('Finished reading data!')
-        break
+  print('Retrieving rows:')
+  row_index = 0
+  for row_key in dataset:
+    print('Row key %d: %s' % (row_index, row_key))
+    row_index += 1
+  print('Finished reading data!')
 
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
index 67bf14c17646cff81af707405b66c9fba2ded0bd..98f906408c230a4382ffafe412ee9990d4384930 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
@@ -29,8 +29,7 @@ Status GrpcStatusToTfStatus(const ::grpc::Status& status) {
   }
   return Status(static_cast<::tensorflow::error::Code>(status.error_code()),
                 strings::StrCat("Error reading from Cloud Bigtable: ",
-                                status.error_message(),
-                                " (Details: ", status.error_details(), ")"));
+                                status.error_message()));
 }
 
 string RegexFromStringSet(const std::vector<string>& strs) {
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc
index 01608dc6bc07890c3a59577ef31c90c2694e6a87..f0c3ef4e2ecbf5f4d21e421be4fb527d4769200c 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_sample_key_pairs_dataset_op.cc
@@ -167,7 +167,7 @@ class BigtableSampleKeyPairsDatasetOp : public DatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
-        if (index_ > keys_.size() - 2) {
+        if (index_ + 2 > keys_.size()) {
           *end_of_sequence = true;
           return Status::OK();
         }
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
index f083ce6f44b3c2a83d9b5d3235056eb94c4be4a8..e95dc577184f7e81d942755b41065f52131ce9f6 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
@@ -366,6 +366,39 @@ BigtableTestClient::MutateRows(
   return MakeUnique<MutateRowsResponse>(request.entries_size());
 }
 
+std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+    google::bigtable::v2::MutateRowResponse>>
+BigtableTestClient::AsyncMutateRow(
+    grpc::ClientContext* context,
+    google::bigtable::v2::MutateRowRequest const& request,
+    grpc::CompletionQueue* cq) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
+std::unique_ptr<::grpc::ClientAsyncReaderInterface<
+    ::google::bigtable::v2::SampleRowKeysResponse>>
+BigtableTestClient::AsyncSampleRowKeys(
+    ::grpc::ClientContext* context,
+    const ::google::bigtable::v2::SampleRowKeysRequest& request,
+    ::grpc::CompletionQueue* cq, void* tag) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
+std::unique_ptr<::grpc::ClientAsyncReaderInterface<
+    ::google::bigtable::v2::MutateRowsResponse>>
+BigtableTestClient::AsyncMutateRows(
+    ::grpc::ClientContext* context,
+    const ::google::bigtable::v2::MutateRowsRequest& request,
+    ::grpc::CompletionQueue* cq, void* tag) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
 std::shared_ptr<grpc::Channel> BigtableTestClient::Channel() {
   LOG(WARNING) << "Call to InMemoryDataClient::Channel(); this will likely "
                   "cause a crash!";
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
index dac2b16a216d26f02684c7401ed2ddaa4b7baddb..c4a1f06bc504c3565c7bb09b42e48e7fbddb9cc6 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
@@ -61,6 +61,25 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient {
   MutateRows(grpc::ClientContext* context,
              google::bigtable::v2::MutateRowsRequest const& request) override;
 
+  std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+      google::bigtable::v2::MutateRowResponse>>
+  AsyncMutateRow(grpc::ClientContext* context,
+                 google::bigtable::v2::MutateRowRequest const& request,
+                 grpc::CompletionQueue* cq) override;
+
+  std::unique_ptr<::grpc::ClientAsyncReaderInterface<
+      ::google::bigtable::v2::SampleRowKeysResponse>>
+  AsyncSampleRowKeys(
+      ::grpc::ClientContext* context,
+      const ::google::bigtable::v2::SampleRowKeysRequest& request,
+      ::grpc::CompletionQueue* cq, void* tag) override;
+
+  std::unique_ptr<::grpc::ClientAsyncReaderInterface<
+      ::google::bigtable::v2::MutateRowsResponse>>
+  AsyncMutateRows(::grpc::ClientContext* context,
+                  const ::google::bigtable::v2::MutateRowsRequest& request,
+                  ::grpc::CompletionQueue* cq, void* tag) override;
+
   std::shared_ptr<grpc::Channel> Channel() override;
 
  private:
diff --git a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
index 316da9ebe152ef52c7e7f846cf8c3eb1555ee8a6..197f5578eb010bee5a3aad7c05446393193f99e2 100644
--- a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
+++ b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
@@ -57,7 +57,7 @@ class BigtableOpsTest(test.TestCase):
     sess.run(write_op)
 
   def runReadKeyTest(self, read_ds):
-    itr = read_ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(read_ds)
     n = itr.get_next()
     expected = list(self.COMMON_ROW_KEYS)
     expected.reverse()
@@ -78,7 +78,7 @@ class BigtableOpsTest(test.TestCase):
     self.runReadKeyTest(self._table.keys_by_range_dataset("r1", "r4"))
 
   def runScanTest(self, read_ds):
-    itr = read_ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(read_ds)
     n = itr.get_next()
     expected_keys = list(self.COMMON_ROW_KEYS)
     expected_keys.reverse()
@@ -120,7 +120,7 @@ class BigtableOpsTest(test.TestCase):
   def testLookup(self):
     ds = self._table.keys_by_prefix_dataset("r")
     ds = ds.apply(self._table.lookup_columns(cf1="c1"))
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     expected_keys = list(self.COMMON_ROW_KEYS)
     expected_values = list(self.COMMON_VALUES)
@@ -141,7 +141,7 @@ class BigtableOpsTest(test.TestCase):
 
   def testSampleKeys(self):
     ds = self._table.sample_keys()
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     expected_key = self.COMMON_ROW_KEYS[0]
     with self.cached_session() as sess:
@@ -161,7 +161,7 @@ class BigtableOpsTest(test.TestCase):
         sess.run(n)
 
   def runSampleKeyPairsTest(self, ds, expected_key_pairs):
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     with self.cached_session() as sess:
       self._writeCommonValues(sess)
@@ -218,7 +218,7 @@ class BigtableOpsTest(test.TestCase):
   def testSampleKeyPairsPrefixAndStartKey(self):
     ds = bigtable_api._BigtableSampleKeyPairsDataset(
         self._table, prefix="r", start="r1", end="")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     with self.cached_session() as sess:
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(itr.initializer)
@@ -226,14 +226,14 @@ class BigtableOpsTest(test.TestCase):
   def testSampleKeyPairsPrefixAndEndKey(self):
     ds = bigtable_api._BigtableSampleKeyPairsDataset(
         self._table, prefix="r", start="", end="r3")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     with self.cached_session() as sess:
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(itr.initializer)
 
   def testParallelScanPrefix(self):
     ds = self._table.parallel_scan_prefix(prefix="r", cf1="c1")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     with self.cached_session() as sess:
       self._writeCommonValues(sess)
@@ -251,7 +251,7 @@ class BigtableOpsTest(test.TestCase):
 
   def testParallelScanRange(self):
     ds = self._table.parallel_scan_range(start="r1", end="r4", cf1="c1")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     with self.cached_session() as sess:
       self._writeCommonValues(sess)
diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
index 7c87b0daeb09950cc44c51f49c16534d413f0376..b6cdc7aab0320fe5f457288ada03a46e18a694cc 100644
--- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
+++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
@@ -35,8 +35,8 @@ from tensorflow.contrib.util import loader
 from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import resource_loader
 
@@ -111,8 +111,7 @@ class BigtableClient(object):
 
 
 class BigtableTable(object):
-  """BigtableTable is the entrypoint for reading and writing data in Cloud
-  Bigtable.
+  """Entry point for reading and writing data in Cloud Bigtable.
 
   This BigtableTable class is the Python representation of the Cloud Bigtable
   table within TensorFlow. Methods on this class allow data to be read from and
@@ -222,7 +221,7 @@ class BigtableTable(object):
       A `tf.data.Dataset`. containing `tf.string` Tensors corresponding to all
       of the row keys matching that prefix.
     """
-    return _BigtablePrefixKeyDataset(self, prefix)
+    return dataset_ops.DatasetV1Adapter(_BigtablePrefixKeyDataset(self, prefix))
 
   def sample_keys(self):
     """Retrieves a sampling of row keys from the Bigtable table.
@@ -234,7 +233,7 @@ class BigtableTable(object):
     Returns:
       A `tf.data.Dataset` returning string row keys.
     """
-    return _BigtableSampleKeysDataset(self)
+    return dataset_ops.DatasetV1Adapter(_BigtableSampleKeysDataset(self))
 
   def scan_prefix(self, prefix, probability=None, columns=None, **kwargs):
     """Retrieves row (including values) from the Bigtable service.
@@ -279,7 +278,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    return _BigtableScanDataset(self, prefix, "", "", normalized, probability)
+    return dataset_ops.DatasetV1Adapter(
+        _BigtableScanDataset(self, prefix, "", "", normalized, probability))
 
   def scan_range(self, start, end, probability=None, columns=None, **kwargs):
     """Retrieves rows (including values) from the Bigtable service.
@@ -324,7 +324,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    return _BigtableScanDataset(self, "", start, end, normalized, probability)
+    return dataset_ops.DatasetV1Adapter(
+        _BigtableScanDataset(self, "", start, end, normalized, probability))
 
   def parallel_scan_prefix(self,
                            prefix,
@@ -380,7 +381,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    ds = _BigtableSampleKeyPairsDataset(self, prefix, "", "")
+    ds = dataset_ops.DatasetV1Adapter(
+        _BigtableSampleKeyPairsDataset(self, prefix, "", ""))
     return self._make_parallel_scan_dataset(ds, num_parallel_scans, probability,
                                             normalized)
 
@@ -442,7 +444,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    ds = _BigtableSampleKeyPairsDataset(self, "", start, end)
+    ds = dataset_ops.DatasetV1Adapter(
+        _BigtableSampleKeyPairsDataset(self, "", start, end))
     return self._make_parallel_scan_dataset(ds, num_parallel_scans, probability,
                                             normalized)
 
@@ -589,16 +592,8 @@ class _BigtableKeyDataset(dataset_ops.DatasetSource):
     self._table = table
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.TensorShape([])
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
 
 class _BigtablePrefixKeyDataset(_BigtableKeyDataset):
@@ -658,16 +653,9 @@ class _BigtableLookupDataset(dataset_ops.DatasetSource):
     self._columns = [i[1] for i in normalized]
 
   @property
-  def output_classes(self):
-    return tuple([ops.Tensor] * self._num_outputs)
-
-  @property
-  def output_shapes(self):
-    return tuple([tensor_shape.TensorShape([])] * self._num_outputs)
-
-  @property
-  def output_types(self):
-    return tuple([dtypes.string] * self._num_outputs)
+  def _element_structure(self):
+    return structure.NestedStructure(tuple(
+        [structure.TensorStructure(dtypes.string, [])] * self._num_outputs))
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
@@ -693,16 +681,9 @@ class _BigtableScanDataset(dataset_ops.DatasetSource):
     self._num_outputs = len(normalized) + 1  # 1 for row key
 
   @property
-  def output_classes(self):
-    return tuple([ops.Tensor] * self._num_outputs)
-
-  @property
-  def output_shapes(self):
-    return tuple([tensor_shape.TensorShape([])] * self._num_outputs)
-
-  @property
-  def output_types(self):
-    return tuple([dtypes.string] * self._num_outputs)
+  def _element_structure(self):
+    return structure.NestedStructure(tuple(
+        [structure.TensorStructure(dtypes.string, [])] * self._num_outputs))
 
   def _as_variant_tensor(self):
     return gen_bigtable_ops.bigtable_scan_dataset(
@@ -726,16 +707,10 @@ class _BigtableSampleKeyPairsDataset(dataset_ops.DatasetSource):
     self._end = end
 
   @property
-  def output_classes(self):
-    return (ops.Tensor, ops.Tensor)
-
-  @property
-  def output_shapes(self):
-    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
-
-  @property
-  def output_types(self):
-    return (dtypes.string, dtypes.string)
+  def _element_structure(self):
+    return structure.NestedStructure(
+        (structure.TensorStructure(dtypes.string, []),
+         structure.TensorStructure(dtypes.string, [])))
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index f03eab510c2f9010fc92eb1934ac77dc0626a44b..f7f15a302a00ee4187d57fc4d40727b84e6c587c 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -98,7 +98,6 @@ py_library(
         "//tensorflow/contrib/boosted_trees/proto:learner_proto_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
-        "//tensorflow/contrib/stateless",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -108,6 +107,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:stateless_random_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 14b6fc4ac26f74f54628ae37ad6437c7d3e8caba..d3b23d949ee2c7674c3918d39e8b71d76eefcfec 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -132,6 +132,7 @@ py_library(
     srcs = ["estimator.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":custom_loss_head",
         ":estimator_utils",
         ":model",
         "//tensorflow/contrib/boosted_trees:losses",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index 48f12a64f94c7bd0531488ef537b199558e17e3e..b314b4d74df882a421d9a2ecce2629a63d5c5248 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -41,7 +41,8 @@ def make_custom_export_strategy(name,
                                 convert_fn,
                                 feature_columns,
                                 export_input_fn,
-                                use_core_columns=False):
+                                use_core_columns=False,
+                                feature_engineering_fn=None):
   """Makes custom exporter of GTFlow tree format.
 
   Args:
@@ -52,6 +53,7 @@ def make_custom_export_strategy(name,
     export_input_fn: A function that takes no arguments and returns an
       `InputFnOps`.
     use_core_columns: A boolean, whether core feature columns were used.
+    feature_engineering_fn: Feature eng function to be called on the input.
 
   Returns:
     An `ExportStrategy`.
@@ -59,9 +61,12 @@ def make_custom_export_strategy(name,
   base_strategy = saved_model_export_utils.make_export_strategy(
       serving_input_fn=export_input_fn, strip_default_attrs=True)
   input_fn = export_input_fn()
+  features = input_fn.features
+  if feature_engineering_fn is not None:
+    features, _ = feature_engineering_fn(features, labels=None)
   (sorted_feature_names, dense_floats, sparse_float_indices, _, _,
    sparse_int_indices, _, _) = gbdt_batch.extract_features(
-       input_fn.features, feature_columns, use_core_columns)
+       features, feature_columns, use_core_columns)
 
   def export_fn(estimator, export_dir, checkpoint_path=None, eval_result=None):
     """A wrapper to export to SavedModel, and convert it to other formats."""
@@ -196,6 +201,10 @@ def convert_to_universal_format(dtec, sorted_feature_names,
           matching_id = categorical_test.value.add()
           matching_id.int64_value = split.feature_id
           node.custom_left_child_test.Pack(categorical_test)
+        elif (node_type == "oblivious_dense_float_binary_split" or
+              node_type == "oblivious_categorical_id_binary_split"):
+          raise ValueError("Universal tree format doesn't support oblivious "
+                           "trees")
         else:
           raise ValueError("Unexpected node type %s" % node_type)
         node.left_child_id.value = split.left_id
@@ -229,6 +238,13 @@ def _get_feature_importances(dtec, feature_names, num_dense_floats,
         split = tree_node.categorical_id_binary_split
         split_column = feature_names[split.feature_column + num_dense_floats +
                                      num_sparse_float]
+      elif node_type == "oblivious_dense_float_binary_split":
+        split = tree_node.oblivious_dense_float_binary_split
+        split_column = feature_names[split.feature_column]
+      elif node_type == "oblivious_categorical_id_binary_split":
+        split = tree_node.oblivious_categorical_id_binary_split
+        split_column = feature_names[split.feature_column + num_dense_floats +
+                                     num_sparse_float]
       elif node_type == "categorical_id_set_membership_binary_split":
         split = tree_node.categorical_id_set_membership_binary_split
         split_column = feature_names[split.feature_column + num_dense_floats +
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index 194a5c8754cb0ab2db299e3fb5c998c0f27f8435..358404cd946bbc56d2f7228be8fe4223749c850b 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -28,7 +28,6 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.boosted_trees.estimator_batch import model
 from tensorflow.contrib.boosted_trees.estimator_batch import distillation_loss
-from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
@@ -37,7 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
-from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.feature_column import feature_column_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -170,6 +169,7 @@ def _dnn_tree_combined_model_fn(
   if (output_type == model.ModelBuilderOutputType.ESTIMATOR_SPEC and
       not use_core_versions):
     raise ValueError("You must use core versions with Estimator Spec")
+  global_step = training_util.get_global_step()
 
   with variable_scope.variable_scope(
       dnn_parent_scope,
@@ -191,46 +191,58 @@ def _dnn_tree_combined_model_fn(
             feature_columns=dnn_feature_columns,
             weight_collections=[dnn_parent_scope],
             scope=input_layer_scope)
-    previous_layer = input_layer
-    for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
+    def dnn_logits_fn():
+      """Builds the logits from the input layer."""
+      previous_layer = input_layer
+      for layer_id, num_hidden_units in enumerate(dnn_hidden_units):
+        with variable_scope.variable_scope(
+            "hiddenlayer_%d" % layer_id,
+            values=(previous_layer,)) as hidden_layer_scope:
+          net = layers.fully_connected(
+              previous_layer,
+              num_hidden_units,
+              activation_fn=dnn_activation_fn,
+              variables_collections=[dnn_parent_scope],
+              scope=hidden_layer_scope)
+          if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN:
+            net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout))
+        _add_hidden_layer_summary(net, hidden_layer_scope.name)
+        previous_layer = net
       with variable_scope.variable_scope(
-          "hiddenlayer_%d" % layer_id,
-          values=(previous_layer,)) as hidden_layer_scope:
-        net = layers.fully_connected(
+          "logits", values=(previous_layer,)) as logits_scope:
+        dnn_logits = layers.fully_connected(
             previous_layer,
-            num_hidden_units,
-            activation_fn=dnn_activation_fn,
+            head.logits_dimension,
+            activation_fn=None,
             variables_collections=[dnn_parent_scope],
-            scope=hidden_layer_scope)
-        if dnn_dropout is not None and mode == model_fn.ModeKeys.TRAIN:
-          net = layers.dropout(net, keep_prob=(1.0 - dnn_dropout))
-      _add_hidden_layer_summary(net, hidden_layer_scope.name)
-      previous_layer = net
-    with variable_scope.variable_scope(
-        "logits", values=(previous_layer,)) as logits_scope:
-      dnn_logits = layers.fully_connected(
-          previous_layer,
-          head.logits_dimension,
-          activation_fn=None,
-          variables_collections=[dnn_parent_scope],
-          scope=logits_scope)
-    _add_hidden_layer_summary(dnn_logits, logits_scope.name)
-
-    def _dnn_train_op_fn(loss):
-      """Returns the op to optimize the loss."""
-      return optimizers.optimize_loss(
-          loss=loss,
-          global_step=training_util.get_global_step(),
-          learning_rate=_DNN_LEARNING_RATE,
-          optimizer=_get_optimizer(dnn_optimizer),
-          name=dnn_parent_scope,
-          variables=ops.get_collection(
-              ops.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope),
-          # Empty summaries to prevent optimizers from logging training_loss.
-          summaries=[])
+            scope=logits_scope)
+      _add_hidden_layer_summary(dnn_logits, logits_scope.name)
+      return dnn_logits
+    if predict_with_tree_only and mode == model_fn.ModeKeys.INFER:
+      dnn_logits = array_ops.constant(0.0)
+      dnn_train_op_fn = control_flow_ops.no_op
+    elif predict_with_tree_only and mode == model_fn.ModeKeys.EVAL:
+      dnn_logits = control_flow_ops.cond(
+          global_step > dnn_steps_to_train,
+          lambda: array_ops.constant(0.0),
+          dnn_logits_fn)
+      dnn_train_op_fn = control_flow_ops.no_op
+    else:
+      dnn_logits = dnn_logits_fn()
+      def dnn_train_op_fn(loss):
+        """Returns the op to optimize the loss."""
+        return optimizers.optimize_loss(
+            loss=loss,
+            global_step=training_util.get_global_step(),
+            learning_rate=_DNN_LEARNING_RATE,
+            optimizer=_get_optimizer(dnn_optimizer),
+            name=dnn_parent_scope,
+            variables=ops.get_collection(
+                ops.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope),
+            # Empty summaries to prevent optimizers from logging training_loss.
+            summaries=[])
 
   # Build Tree Logits.
-  global_step = training_util.get_global_step()
   with ops.device(global_step.device):
     ensemble_handle = model_ops.tree_ensemble_variable(
         stamp_token=0,
@@ -261,8 +273,13 @@ def _dnn_tree_combined_model_fn(
       """Returns the op to optimize the loss."""
       if dnn_to_tree_distillation_param:
         loss_weight, loss_fn = dnn_to_tree_distillation_param
-        weight_tensor = head_lib._weight_tensor(  # pylint: disable=protected-access
-            features, head.weight_column_name)
+        # pylint: disable=protected-access
+        if use_core_versions:
+          weight_tensor = head_lib._weight_tensor(features, head._weight_column)
+        else:
+          weight_tensor = head_lib._weight_tensor(
+              features, head.weight_column_name)
+        # pylint: enable=protected-access
         dnn_logits_fixed = array_ops.stop_gradient(dnn_logits)
 
         if loss_fn is None:
@@ -305,52 +322,26 @@ def _dnn_tree_combined_model_fn(
   finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
 
   if output_type == model.ModelBuilderOutputType.MODEL_FN_OPS:
-    if use_core_versions:
-      model_fn_ops = head.create_estimator_spec(
-          features=features,
-          mode=mode,
-          labels=labels,
-          train_op_fn=_no_train_op_fn,
-          logits=tree_train_logits)
-      dnn_train_op = head.create_estimator_spec(
-          features=features,
-          mode=mode,
-          labels=labels,
-          train_op_fn=_dnn_train_op_fn,
-          logits=dnn_logits)
-      dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
-          dnn_train_op).train_op
-
-      tree_train_op = head.create_estimator_spec(
-          features=tree_features,
-          mode=mode,
-          labels=labels,
-          train_op_fn=_tree_train_op_fn,
-          logits=tree_train_logits)
-      tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
-          tree_train_op).train_op
-
-      model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(
-          model_fn_ops)
-    else:
-      model_fn_ops = head.create_model_fn_ops(
-          features=features,
-          mode=mode,
-          labels=labels,
-          train_op_fn=_no_train_op_fn,
-          logits=tree_train_logits)
-      dnn_train_op = head.create_model_fn_ops(
-          features=features,
-          mode=mode,
-          labels=labels,
-          train_op_fn=_dnn_train_op_fn,
-          logits=dnn_logits).train_op
-      tree_train_op = head.create_model_fn_ops(
-          features=tree_features,
-          mode=mode,
-          labels=labels,
-          train_op_fn=_tree_train_op_fn,
-          logits=tree_train_logits).train_op
+    model_fn_ops = head.create_model_fn_ops(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_no_train_op_fn,
+        logits=tree_train_logits)
+    if mode != model_fn.ModeKeys.TRAIN:
+      return model_fn_ops
+    dnn_train_op = head.create_model_fn_ops(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=dnn_train_op_fn,
+        logits=dnn_logits).train_op
+    tree_train_op = head.create_model_fn_ops(
+        features=tree_features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_tree_train_op_fn,
+        logits=tree_train_logits).train_op
 
     # Add the hooks
     model_fn_ops.training_hooks.extend([
@@ -369,11 +360,13 @@ def _dnn_tree_combined_model_fn(
         labels=labels,
         train_op_fn=_no_train_op_fn,
         logits=tree_train_logits)
+    if mode != model_fn.ModeKeys.TRAIN:
+      return fusion_spec
     dnn_spec = head.create_estimator_spec(
         features=features,
         mode=mode,
         labels=labels,
-        train_op_fn=_dnn_train_op_fn,
+        train_op_fn=dnn_train_op_fn,
         logits=dnn_logits)
     tree_spec = head.create_estimator_spec(
         features=tree_features,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
index 839eedd3a87ccaa1faecd1966fe5907d682cac02..dea19b7c62649679f944809b44c51ba0cd361904 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -18,13 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
 from tensorflow.contrib.boosted_trees.estimator_batch import dnn_tree_combined_estimator as estimator
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.estimators import estimator_test_utils
 from tensorflow.contrib.learn.python.learn.estimators import run_config
+from tensorflow.python.estimator import exporter
 from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.export import export
+from tensorflow.python.ops import parsing_ops
 from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -34,6 +38,7 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import googletest
 from tensorflow.python.training import checkpoint_utils
 
+
 def _train_input_fn():
   features = {
       "x": constant_op.constant([[2.], [1.], [1.]])
@@ -103,35 +108,6 @@ class DNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
     classifier.fit(input_fn=_train_input_fn, steps=15)
     classifier.evaluate(input_fn=_eval_input_fn, steps=1)
 
-  def testFitAndEvaluateDontThrowExceptionWithCore(self):
-    learner_config = learner_pb2.LearnerConfig()
-    learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 1
-    model_dir = tempfile.mkdtemp()
-    config = run_config.RunConfig()
-
-    # Use core head
-    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-
-    classifier = estimator.DNNBoostedTreeCombinedEstimator(
-        head=head_fn,
-        dnn_hidden_units=[1],
-        # Use core feature columns
-        dnn_feature_columns=[core_feature_column.numeric_column("x")],
-        tree_learner_config=learner_config,
-        num_trees=1,
-        tree_examples_per_layer=3,
-        model_dir=model_dir,
-        config=config,
-        dnn_steps_to_train=10,
-        dnn_input_layer_to_tree=True,
-        tree_feature_columns=[],
-        use_core_versions=True)
-
-    classifier.fit(input_fn=_train_input_fn, steps=15)
-    classifier.evaluate(input_fn=_eval_input_fn, steps=1)
-
   def testFitAndEvaluateWithDistillation(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
@@ -223,6 +199,51 @@ class CoreDNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
     self.assertLess(0.5, res["auc"])
     est.predict(input_fn=_eval_input_fn)
 
+  def testTrainEvaluateWithDnnForInputAndTreeForPredict(self):
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    est = estimator.CoreDNNBoostedTreeCombinedEstimator(
+        head=head_fn,
+        dnn_hidden_units=[1],
+        dnn_feature_columns=[core_feature_column.numeric_column("x")],
+        tree_learner_config=learner_config,
+        num_trees=1,
+        tree_examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        dnn_steps_to_train=10,
+        dnn_input_layer_to_tree=True,
+        predict_with_tree_only=True,
+        dnn_to_tree_distillation_param=(0.5, None),
+        tree_feature_columns=[])
+
+    # Train for a few steps.
+    est.train(input_fn=_train_input_fn, steps=1000)
+    res = est.evaluate(input_fn=_eval_input_fn, steps=1)
+    self.assertLess(0.5, res["auc"])
+    est.predict(input_fn=_eval_input_fn)
+    serving_input_fn = (
+        export.build_parsing_serving_input_receiver_fn(
+            feature_spec={"x": parsing_ops.FixedLenFeature(
+                [1], dtype=dtypes.float32)}))
+    base_exporter = exporter.FinalExporter(
+        name="Servo",
+        serving_input_receiver_fn=serving_input_fn,
+        assets_extra=None)
+    export_path = os.path.join(model_dir, "export")
+    base_exporter.export(
+        est,
+        export_path=export_path,
+        checkpoint_path=None,
+        eval_result={},
+        is_the_final_export=True)
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 4c7a538b385ec19f520bff79bab20a121221c60f..a178820841c4c8bcb7f5742babdb6d0f4825de31 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.contrib.boosted_trees.estimator_batch import model
 from tensorflow.contrib.boosted_trees.python.utils import losses
 from tensorflow.contrib.learn.python.learn.estimators import estimator
@@ -26,7 +28,8 @@ from tensorflow.python.estimator.canned import head as core_head_lib
 from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.losses import losses as core_losses
-
+from tensorflow.contrib.boosted_trees.estimator_batch import custom_loss_head
+from tensorflow.python.ops import array_ops
 
 # ================== Old estimator interface===================================
 # The estimators below were designed for old feature columns and old estimator
@@ -414,30 +417,167 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+# When using this estimator, make sure to regularize the hessian (at least l2,
+# min_node_weight)!
+# TODO(nponomareva): extend to take multiple quantiles in one go.
+class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
+  """An estimator that does quantile regression and returns quantile estimates.
+  """
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               quantiles,
+               label_dimension=1,
+               num_trees=None,
+               feature_columns=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               feature_engineering_fn=None,
+               logits_modifier_function=None,
+               center_bias=True,
+               use_core_libs=False,
+               output_leaf_index=False,
+               override_global_step_value=None,
+               num_quantiles=100):
+    """Initializes a GradientBoostedDecisionTreeQuantileRegressor instance.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      quantiles: a list of quantiles for the loss, each between 0 and 1.
+      label_dimension: Dimension of regression label. This is the size
+        of the last dimension of the labels `Tensor` (typically, this has shape
+        `[batch_size, label_dimension]`). When label_dimension>1, it is
+        recommended to use multiclass strategy diagonal hessian or full hessian.
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      logits_modifier_function: A modifier function for the logits.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. For example,
+        result_dict = classifier.predict(...)
+        for example_prediction_result in result_dict:
+          # access leaf index list by example_prediction_result["leaf_index"]
+          # which contains one leaf index per tree
+      override_global_step_value: If after the training is done, global step
+        value must be reset to this value. This should be used to reset global
+        step to a number > number of steps used to train the current ensemble.
+        For example, the usual way is to train a number of trees and set a very
+        large number of training steps. When the training is done (number of
+        trees were trained), this parameter can be used to set the global step
+        to a large value, making it look like that number of training steps ran.
+        If None, no override of global step will happen.
+      num_quantiles: Number of quantiles to build for numeric feature values.
+    """
+
+    if len(quantiles) > 1:
+      raise ValueError('For now, just one quantile per estimator is supported')
+
+    def _quantile_regression_head(quantile):
+      # Use quantile regression.
+      head = custom_loss_head.CustomLossHead(
+          loss_fn=functools.partial(
+              losses.per_example_quantile_regression_loss, quantile=quantile),
+          link_fn=array_ops.identity,
+          logit_dimension=label_dimension)
+      return head
+
+    learner_config.num_classes = max(2, label_dimension)
+
+    super(GradientBoostedDecisionTreeQuantileRegressor, self).__init__(
+        model_fn=model.model_builder,
+        params={
+            'head': _quantile_regression_head(quantiles[0]),
+            'feature_columns': feature_columns,
+            'learner_config': learner_config,
+            'num_trees': num_trees,
+            'weight_column_name': weight_column_name,
+            'examples_per_layer': examples_per_layer,
+            'logits_modifier_function': logits_modifier_function,
+            'center_bias': center_bias,
+            'use_core_libs': use_core_libs,
+            'output_leaf_index': False,
+            'override_global_step_value': override_global_step_value,
+            'num_quantiles': num_quantiles,
+        },
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
+
 # ================== New Estimator interface===================================
 # The estimators below use new core Estimator interface and must be used with
 # new feature columns and heads.
 
+
 # For multiclass classification, use the following head since it uses loss
 # that is twice differentiable.
-def core_multiclass_head(n_classes):
+def core_multiclass_head(
+    n_classes,
+    weight_column=None,
+    loss_reduction=core_losses.Reduction.SUM_OVER_NONZERO_WEIGHTS):
   """Core head for multiclass problems."""
 
   def loss_fn(labels, logits):
     result = losses.per_example_maxent_loss(
-        labels=labels, logits=logits, weights=None, num_classes=n_classes)
+        labels=labels,
+        logits=logits,
+        weights=weight_column,
+        num_classes=n_classes)
     return result[0]
 
   # pylint:disable=protected-access
   head_fn = core_head_lib._multi_class_head_with_softmax_cross_entropy_loss(
       n_classes=n_classes,
       loss_fn=loss_fn,
-      loss_reduction=core_losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+      loss_reduction=loss_reduction,
+      weight_column=weight_column)
   # pylint:enable=protected-access
 
   return head_fn
 
 
+# For quantile regression, use this head with Core..Estimator, or use
+# Core..QuantileRegressor directly,
+def core_quantile_regression_head(
+    quantiles,
+    label_dimension=1,
+    weight_column=None,
+    loss_reduction=core_losses.Reduction.SUM_OVER_NONZERO_WEIGHTS):
+  """Core head for quantile regression problems."""
+
+  def loss_fn(labels, logits):
+    result = losses.per_example_quantile_regression_loss(
+        labels=labels,
+        predictions=logits,
+        weights=weight_column,
+        quantile=quantiles)
+    return result[0]
+
+  # pylint:disable=protected-access
+  head_fn = core_head_lib._regression_head(
+      label_dimension=label_dimension,
+      loss_fn=loss_fn,
+      loss_reduction=loss_reduction,
+      weight_column=weight_column)
+  # pylint:enable=protected-access
+  return head_fn
+
+
 class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator):
   """An estimator using gradient boosted decision trees.
 
@@ -601,3 +741,104 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
 
     super(CoreGradientBoostedDecisionTreeRanker, self).__init__(
         model_fn=_model_fn, model_dir=model_dir, config=config)
+
+
+# When using this estimator, make sure to regularize the hessian (at least l2,
+# min_node_weight)!
+# TODO(nponomareva): extend to take multiple quantiles in one go.
+class CoreGradientBoostedDecisionTreeQuantileRegressor(
+    core_estimator.Estimator):
+  """An estimator that does quantile regression and returns quantile estimates.
+  """
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               quantiles,
+               label_dimension=1,
+               num_trees=None,
+               feature_columns=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               label_keys=None,
+               feature_engineering_fn=None,
+               logits_modifier_function=None,
+               center_bias=True,
+               output_leaf_index=False,
+               num_quantiles=100):
+    """Initializes a core version of GradientBoostedDecisionTreeEstimator.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      quantiles: a list of quantiles for the loss, each between 0 and 1.
+      label_dimension: Dimension of regression label. This is the size
+        of the last dimension of the labels `Tensor` (typically, this has shape
+        `[batch_size, label_dimension]`). When label_dimension>1, it is
+        recommended to use multiclass strategy diagonal hessian or full hessian.
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      logits_modifier_function: A modifier function for the logits.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. For example,
+        result_dict = classifier.predict(...)
+        for example_prediction_result in result_dict:
+          # access leaf index list by example_prediction_result["leaf_index"]
+          # which contains one leaf index per tree
+      num_quantiles: Number of quantiles to build for numeric feature values.
+    """
+    if len(quantiles) > 1:
+      raise ValueError('For now, just one quantile per estimator is supported')
+
+    def _model_fn(features, labels, mode, config):
+      return model.model_builder(
+          features=features,
+          labels=labels,
+          mode=mode,
+          config=config,
+          params={
+              'head':
+                  core_quantile_regression_head(
+                      quantiles[0], label_dimension=label_dimension),
+              'feature_columns':
+                  feature_columns,
+              'learner_config':
+                  learner_config,
+              'num_trees':
+                  num_trees,
+              'weight_column_name':
+                  weight_column_name,
+              'examples_per_layer':
+                  examples_per_layer,
+              'center_bias':
+                  center_bias,
+              'logits_modifier_function':
+                  logits_modifier_function,
+              'use_core_libs':
+                  True,
+              'output_leaf_index':
+                  output_leaf_index,
+              'override_global_step_value':
+                  None,
+              'num_quantiles':
+                  num_quantiles,
+          },
+          output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC)
+
+    super(CoreGradientBoostedDecisionTreeQuantileRegressor, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index c155128c0e4ccf928349ee6453baff4384222096..ee052ac60387d8f993e4942dd7dff39e191dd3a4 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column
 from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -47,8 +48,8 @@ def _multiclass_train_input_fn():
   features = {
       "x": constant_op.constant([[2.], [1.], [1.], [5.], [3.5], [4.6], [3.5]])
   }
-  label = constant_op.constant(
-      [[1], [0], [0], [2], [2], [0], [1]], dtype=dtypes.int32)
+  label = constant_op.constant([[1], [0], [0], [2], [2], [0], [1]],
+                               dtype=dtypes.int32)
   return features, label
 
 
@@ -77,6 +78,59 @@ def _infer_ranking_train_input_fn():
   return features, None
 
 
+_QUANTILE_REGRESSION_SIZE = 1000
+
+
+def _quantile_regression_input_fns(two_dimension=False):
+  # The data generation is taken from
+  # http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html
+  np.random.seed(1)
+
+  def f(x):
+    """The function to predict."""
+    return x * np.sin(x)
+
+  def g(x):
+    """The function to predict."""
+    return x * np.cos(x)
+
+  #  Training data.
+  x = np.atleast_2d(np.random.uniform(0, 10.0,
+                                      size=_QUANTILE_REGRESSION_SIZE)).T
+  x = x.astype(np.float32)
+
+  # Labels.
+  if not two_dimension:
+    y = f(x).ravel()
+  else:
+    y = np.column_stack((f(x).ravel(), g(x).ravel()))
+
+  # Add random noise.
+  dy = 1.5 + 1.0 * np.random.random(y.shape)
+  noise = np.random.normal(0, dy)
+  y += noise
+  y_original = y.astype(np.float32)
+  if not two_dimension:
+    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1)
+
+  train_input_fn = numpy_io.numpy_input_fn(
+      x=x,
+      y=y,
+      batch_size=_QUANTILE_REGRESSION_SIZE,
+      num_epochs=None,
+      shuffle=True)
+
+  # Test on the training data to make sure the predictions are calibrated.
+  test_input_fn = numpy_io.numpy_input_fn(
+      x=x,
+      y=y,
+      batch_size=_QUANTILE_REGRESSION_SIZE,
+      num_epochs=1,
+      shuffle=False)
+
+  return train_input_fn, test_input_fn, y_original
+
+
 class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -341,6 +395,130 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     for prediction_dict in result_iter:
       self.assertTrue("classes" in prediction_dict)
 
+  # One dimensional quantile regression.
+  def testQuantileRegression(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns()
+
+    # 95% percentile.
+    model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["scores"])
+
+    frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper >= 0.92)
+    self.assertTrue(frac_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
+    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["scores"])
+
+    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower >= 0.92)
+    self.assertTrue(frac_above_lower <= 0.98)
+
+  # Multi-dimensional quantile regression.
+  def testQuantileRegressionMultiDimLabel(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns(
+        two_dimension=True)
+
+    # 95% percentile.
+    model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        label_dimension=2,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["scores"])
+
+    count_below_upper = np.count_nonzero(upper > y, axis=0)
+    count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1))
+    frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3)
+    frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3)
+    frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper_0 >= 0.92)
+    self.assertTrue(frac_below_upper_0 <= 0.98)
+    self.assertTrue(frac_below_upper_1 >= 0.92)
+    self.assertTrue(frac_below_upper_1 <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.92)
+    self.assertTrue(frac_both_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
+        two_dimension=True)
+    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        label_dimension=2,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["scores"])
+
+    count_above_lower = np.count_nonzero(lower < y, axis=0)
+    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
+    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
+    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
+    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower_0 >= 0.92)
+    self.assertTrue(frac_above_lower_0 <= 0.98)
+    self.assertTrue(frac_above_lower_1 >= 0.92)
+    self.assertTrue(frac_above_lower_1 <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.92)
+    self.assertTrue(frac_both_above_lower <= 0.98)
+
 
 class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
 
@@ -489,8 +667,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
 
     feature_columns = [
         core_feature_column.weighted_categorical_column(
-            categorical_column=core_feature_column.
-            categorical_column_with_vocabulary_list(
+            categorical_column=core_feature_column
+            .categorical_column_with_vocabulary_list(
                 key="word", vocabulary_list=["the", "cat", "dog"]),
             weight_feature_key="weight")
     ]
@@ -509,8 +687,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
         # Weights for the words are 5 - cat, 6- dog and 1 -the.
         features_dict["word"] = sparse_tensor.SparseTensor(
             indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
-            values=constant_op.constant(
-                ["the", "cat", "dog", "the"], dtype=dtypes.string),
+            values=constant_op.constant(["the", "cat", "dog", "the"],
+                                        dtype=dtypes.string),
             dense_shape=[4, 3])
         features_dict["weight"] = sparse_tensor.SparseTensor(
             indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
@@ -534,6 +712,132 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     est.evaluate(input_fn=input_fn, steps=1)
     est.predict(input_fn=input_fn)
 
+  # One dimensional quantile regression.
+  def testQuantileRegression(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns()
+    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1)
+
+    # 95% percentile.
+    model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["predictions"])
+
+    frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper >= 0.92)
+    self.assertTrue(frac_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
+    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["predictions"])
+
+    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower >= 0.92)
+    self.assertTrue(frac_above_lower <= 0.98)
+
+  # Multi-dimensional quantile regression.
+  def testQuantileRegressionMultiDimLabel(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns(
+        two_dimension=True)
+    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 2)
+
+    # 95% percentile.
+    model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        num_trees=100,
+        label_dimension=2,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["predictions"])
+
+    count_below_upper = np.count_nonzero(upper > y, axis=0)
+    count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1))
+    frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3)
+    frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3)
+    frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper_0 >= 0.92)
+    self.assertTrue(frac_below_upper_0 <= 0.98)
+    self.assertTrue(frac_below_upper_1 >= 0.92)
+    self.assertTrue(frac_below_upper_1 <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.92)
+    self.assertTrue(frac_both_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
+        two_dimension=True)
+    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        num_trees=100,
+        label_dimension=2,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["predictions"])
+
+    count_above_lower = np.count_nonzero(lower < y, axis=0)
+    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
+    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
+    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
+    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower_0 >= 0.92)
+    self.assertTrue(frac_above_lower_0 <= 0.98)
+    self.assertTrue(frac_above_lower_1 >= 0.92)
+    self.assertTrue(frac_above_lower_1 <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.92)
+    self.assertTrue(frac_both_above_lower <= 0.98)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/examples/boston.py b/tensorflow/contrib/boosted_trees/examples/boston.py
index 54c4ff059e3408d2cb8fc689a9ae877f57485f58..09b240a7006a8ef53eb95108b3adbfae728cf8fc 100644
--- a/tensorflow/contrib/boosted_trees/examples/boston.py
+++ b/tensorflow/contrib/boosted_trees/examples/boston.py
@@ -90,13 +90,13 @@ def _make_experiment_fn(output_dir):
   (x_train, y_train), (x_test,
                        y_test) = tf.keras.datasets.boston_housing.load_data()
 
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": x_train},
       y=y_train,
       batch_size=FLAGS.batch_size,
       num_epochs=None,
       shuffle=True)
-  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+  eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False)
 
   feature_columns = [
diff --git a/tensorflow/contrib/boosted_trees/examples/boston_combined.py b/tensorflow/contrib/boosted_trees/examples/boston_combined.py
index e04b56afbfd266dc13a5b0d78d171ea273415ee3..d640af354f55423b7c9706900359f5e64c459f39 100644
--- a/tensorflow/contrib/boosted_trees/examples/boston_combined.py
+++ b/tensorflow/contrib/boosted_trees/examples/boston_combined.py
@@ -80,13 +80,13 @@ def _make_experiment_fn(output_dir):
   (x_train, y_train), (x_test,
                        y_test) = tf.keras.datasets.boston_housing.load_data()
 
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": x_train},
       y=y_train,
       batch_size=FLAGS.batch_size,
       num_epochs=None,
       shuffle=True)
-  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+  eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False)
 
   feature_columns = [
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 8edb5d6c640611bbb90d7731b2fea4354e125563..6d78e27e8f69ea289b686af8402bd91967f997f4 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -834,8 +834,13 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel {
       root_gradient_stats *= normalizer_ratio;
       NodeStats root_stats = state->ComputeNodeStats(root_gradient_stats);
       int32 best_feature_idx = 0;
+      bool best_feature_updated = false;
       NodeStats best_right_node_stats(0);
       NodeStats best_left_node_stats(0);
+      CHECK(end_index - start_index >= 2)
+          << "Partition should have a non bias feature. Start index "
+          << start_index << " and end index " << end_index;
+
       for (int64 feature_idx = start_index + 1; feature_idx < end_index;
            ++feature_idx) {
         GradientStats left_gradient_stats(*gradients_t, *hessians_t,
@@ -845,11 +850,13 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel {
             root_gradient_stats - left_gradient_stats;
         NodeStats left_stats = state->ComputeNodeStats(left_gradient_stats);
         NodeStats right_stats = state->ComputeNodeStats(right_gradient_stats);
-        if (left_stats.gain + right_stats.gain > best_gain) {
+        if (!best_feature_updated ||
+            left_stats.gain + right_stats.gain > best_gain) {
           best_gain = left_stats.gain + right_stats.gain;
           best_left_node_stats = left_stats;
           best_right_node_stats = right_stats;
           best_feature_idx = feature_idx;
+          best_feature_updated = true;
         }
       }
       SplitInfo split_info;
@@ -864,7 +871,7 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel {
           << feature_ids(best_feature_idx, 0) << ", "
           << feature_ids(best_feature_idx, 1)
           << "\nPartition IDS: " << partition_ids(start_index) << "  "
-          << partition_ids(best_feature_idx);
+          << partition_ids(best_feature_idx) << " and best gain " << best_gain;
       equality_split->set_feature_id(feature_ids(best_feature_idx, 0));
       auto* left_child = split_info.mutable_left_child();
       auto* right_child = split_info.mutable_right_child();
diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
index ab2853352a70073648f47e9835f8a66852ff584f..a30cfa663f4a4954f83224a7fd6448b369ad93b4 100644
--- a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
@@ -382,8 +382,7 @@ class GrowTreeEnsembleOp : public OpKernel {
         break;
       }
       case LearnerConfig::OBLIVIOUS_DECISION_TREE: {
-        FindBestSplitsPerPartitionOblivious(context, gains_list, splits_list,
-                                            &best_splits);
+        FindBestSplitOblivious(context, gains_list, splits_list, &best_splits);
         break;
       }
     }
@@ -475,10 +474,10 @@ class GrowTreeEnsembleOp : public OpKernel {
     }
   }
 
-  void FindBestSplitsPerPartitionOblivious(
-      OpKernelContext* const context, const OpInputList& gains_list,
-      const OpInputList& splits_list,
-      std::map<int32, SplitCandidate>* best_splits) {
+  void FindBestSplitOblivious(OpKernelContext* const context,
+                              const OpInputList& gains_list,
+                              const OpInputList& splits_list,
+                              std::map<int32, SplitCandidate>* best_splits) {
     // Find best split per partition going through every feature candidate.
     for (int64 handler_id = 0; handler_id < num_handlers_; ++handler_id) {
       const auto& gains = gains_list[handler_id].vec<float>();
@@ -654,6 +653,12 @@ class GrowTreeEnsembleOp : public OpKernel {
       return dest;
     }
 
+    if (dest->leaf_case() == boosted_trees::trees::Leaf::LEAF_NOT_SET) {
+      // No merging is required. Just copy the source weights;
+      *dest = source;
+      return dest;
+    }
+
     // Handle leaf merging based on type.
     switch (source.leaf_case()) {
       case boosted_trees::trees::Leaf::kVector: {
diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD
index 3028c2281705bd7e34b212332160d25386559d4e..fd832de982a4a7a2bd39e450ad495e60c284ace7 100644
--- a/tensorflow/contrib/boosted_trees/lib/BUILD
+++ b/tensorflow/contrib/boosted_trees/lib/BUILD
@@ -67,6 +67,7 @@ tf_cc_test(
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py
index 5d4819b0f1cb598cfbe146f569aecd7883186339..efa2ab1dad8df9815c983afaa2e43982a49c5787 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/base_split_handler.py
@@ -19,15 +19,17 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+
+import six
+
 from tensorflow.contrib.boosted_trees.python.ops import batch_ops_utils
 from tensorflow.python.ops import control_flow_ops
 
 
+@six.add_metaclass(abc.ABCMeta)
 class BaseSplitHandler(object):
   """Abstract Base class defining split handlers interface."""
 
-  __metaclass__ = abc.ABCMeta
-
   def __init__(self,
                l1_regularization,
                l2_regularization,
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
index 4da25298cb82093ac501997cc21c48265df06860..d26af58419752170bbc58bba757ac43349fc2cff 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
@@ -119,7 +119,7 @@ class EqualitySplitHandler(base_split_handler.BaseSplitHandler):
 
     def not_active_inputs():
       return (constant_op.constant([], dtype=dtypes.int32),
-              constant_op.constant([], dtype=dtypes.int64, shape=[1, 2]),
+              constant_op.constant_v1([], dtype=dtypes.int64, shape=[1, 2]),
               empty_gradients, empty_hessians)
 
     def active_inputs():
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py
index a2f708081a4b484d649b5d09b172c2c60db69aeb..386dc19fc7b9529993a9625fb1298f6eb9a70d87 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py
@@ -36,9 +36,9 @@ def get_empty_tensors(gradient_shape, hessian_shape):
   empty_hess_shape = [1] + hessian_shape.as_list()
   empty_grad_shape = [1] + gradient_shape.as_list()
 
-  empty_gradients = constant_op.constant(
+  empty_gradients = constant_op.constant_v1(
       [], dtype=dtypes.float32, shape=empty_grad_shape)
-  empty_hessians = constant_op.constant(
+  empty_hessians = constant_op.constant_v1(
       [], dtype=dtypes.float32, shape=empty_hess_shape)
 
   return empty_gradients, empty_hessians
@@ -486,8 +486,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase):
       gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
       hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
       partition_ids = [0, 0, 0, 1]
-      indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2])
-      values = array_ops.constant([], dtype=dtypes.int64)
+      indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2])
+      values = constant_op.constant_v1([], dtype=dtypes.int64)
 
       gradient_shape = tensor_shape.scalar()
       hessian_shape = tensor_shape.scalar()
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index f45010ec26ed25127ca78b97f4d6fd7ebd6467ae..0476bed2cd3f3ea5b47b10c51a819f17d6e37c74 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -142,7 +142,7 @@ class InequalitySplitHandler(base_split_handler.BaseSplitHandler):
         name="StatsAccumulator/{}".format(self._name))
     # Allocate both stats accumulator and quantile accumulator on the same
     # device so that we can build splits with fewer RPCs.
-    with ops.colocate_with(self._stats_accumulator.resource()):
+    with ops.colocate_with(self._stats_accumulator.resource_handle):
       self._quantile_accumulator = quantile_ops.QuantileAccumulator(
           init_stamp_token,
           epsilon=epsilon,
@@ -268,8 +268,8 @@ class DenseSplitHandler(InequalitySplitHandler):
       handler = make_dense_split_tensor
 
     are_splits_ready, partition_ids, gains, split_infos = (
-        handler(self._quantile_accumulator.resource(),
-                self._stats_accumulator.resource(), stamp_token,
+        handler(self._quantile_accumulator.resource_handle,
+                self._stats_accumulator.resource_handle, stamp_token,
                 next_stamp_token, self._multiclass_strategy, class_id,
                 self._feature_column_group_id, self._l1_regularization,
                 self._l2_regularization, self._tree_complexity_regularization,
@@ -447,8 +447,8 @@ class SparseSplitHandler(InequalitySplitHandler):
       handler = make_sparse_split_tensor
 
     are_splits_ready, partition_ids, gains, split_infos = (
-        handler(self._quantile_accumulator.resource(),
-                self._stats_accumulator.resource(), stamp_token,
+        handler(self._quantile_accumulator.resource_handle,
+                self._stats_accumulator.resource_handle, stamp_token,
                 next_stamp_token, self._multiclass_strategy, class_id,
                 self._feature_column_group_id, self._l1_regularization,
                 self._l2_regularization, self._tree_complexity_regularization,
@@ -605,7 +605,7 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
                             quantile_buckets, example_partition_ids, gradients,
                             hessians, weights, empty_gradients, empty_hessians):
   """Updates the state for dense split handler."""
-  empty_float = constant_op.constant([], dtype=dtypes.float32)
+  empty_float = constant_op.constant_v1([], dtype=dtypes.float32)
 
   quantile_values, quantile_weights = control_flow_ops.cond(
       is_active[1],  # For the next layer, this handler is inactive.
@@ -621,8 +621,8 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
     return (example_partition_ids, quantized_feature, gradients, hessians)
 
   def not_ready_inputs_fn():
-    return (constant_op.constant([], dtype=dtypes.int32),
-            constant_op.constant([[]], dtype=dtypes.int64, shape=[1, 2]),
+    return (constant_op.constant_v1([], dtype=dtypes.int32),
+            constant_op.constant_v1([[]], dtype=dtypes.int64, shape=[1, 2]),
             empty_gradients, empty_hessians)
 
   example_partition_ids, feature_ids, gradients, hessians = (
@@ -708,11 +708,11 @@ def sparse_make_stats_update(
 
   def quantiles_not_ready():
     """The subgraph for when the quantiles are not ready."""
-    return (constant_op.constant([], dtype=dtypes.int32),
-            constant_op.constant([], dtype=dtypes.int64, shape=[1, 2]),
+    return (constant_op.constant_v1([], dtype=dtypes.int32),
+            constant_op.constant_v1([], dtype=dtypes.int64, shape=[1, 2]),
             empty_gradients, empty_hessians)
 
-  empty_float = constant_op.constant([], dtype=dtypes.float32)
+  empty_float = constant_op.constant_v1([], dtype=dtypes.float32)
   handler_not_active = (constant_op.constant(
       [], dtype=dtypes.int64, shape=[0, 2]), empty_float,
                         constant_op.constant([0, 1], dtype=dtypes.int64),
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
index 74b0ea6989c65e83e7a466107d624712a0e72d1b..4a1b528646e7d2139d7eabb0264b8d280f8da133 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
@@ -39,9 +39,9 @@ def get_empty_tensors(gradient_shape, hessian_shape):
   empty_hess_shape = [1] + hessian_shape.as_list()
   empty_grad_shape = [1] + gradient_shape.as_list()
 
-  empty_gradients = constant_op.constant(
+  empty_gradients = constant_op.constant_v1(
       [], dtype=dtypes.float32, shape=empty_grad_shape)
-  empty_hessians = constant_op.constant(
+  empty_hessians = constant_op.constant_v1(
       [], dtype=dtypes.float32, shape=empty_hess_shape)
 
   return empty_gradients, empty_hessians
@@ -1476,9 +1476,9 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase):
 
   def testEmpty(self):
     with self.cached_session() as sess:
-      indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2])
+      indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2])
       # No values in this feature column in this mini-batch.
-      values = array_ops.constant([], dtype=dtypes.float32)
+      values = constant_op.constant_v1([], dtype=dtypes.float32)
       sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1])
 
       gradient_shape = tensor_shape.scalar()
@@ -1549,8 +1549,9 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase):
       sparse_column = array_ops.sparse_placeholder(dtypes.float32)
 
       # We have two batches - at first, a sparse feature is empty.
-      empty_indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2])
-      empty_values = array_ops.constant([], dtype=dtypes.float32)
+      empty_indices = constant_op.constant_v1([], dtype=dtypes.int64,
+                                              shape=[0, 2])
+      empty_values = constant_op.constant_v1([], dtype=dtypes.float32)
       empty_sparse_column = sparse_tensor.SparseTensor(empty_indices,
                                                        empty_values, [4, 2])
       empty_sparse_column = empty_sparse_column.eval(session=sess)
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
index 64921faf81c0ea8ae7fb1bbec71396ef3408e6ca..de30a7bde792e727ceab7798458566d4527f5867 100644
--- a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
@@ -81,9 +81,10 @@ int DecisionTree::Traverse(const DecisionTreeConfig& config,
         const auto& split = current_node.categorical_id_binary_split();
         const auto& features =
             example.sparse_int_features[split.feature_column()];
-        node_id = features.find(split.feature_id()) != features.end()
-                      ? split.left_id()
-                      : split.right_id();
+        node_id = (std::find(features.begin(), features.end(),
+                             split.feature_id()) == features.end())
+                      ? split.right_id()
+                      : split.left_id();
         break;
       }
       case TreeNode::kCategoricalIdSetMembershipBinarySplit: {
@@ -117,7 +118,8 @@ int DecisionTree::Traverse(const DecisionTreeConfig& config,
         oblivious_leaf_idx <<= 1;
         const auto& features =
             example.sparse_int_features[split.feature_column()];
-        if (features.find(split.feature_id()) == features.end()) {
+        if (std::find(features.begin(), features.end(), split.feature_id()) ==
+            features.end()) {
           oblivious_leaf_idx++;
         }
         node_id++;
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/example.h b/tensorflow/contrib/boosted_trees/lib/utils/example.h
index 1371ff337f78dd1c38f2bd0ba86911642f3aeb3e..445ffaaa714c4a69710f9a21d5f2775b8b0f6e22 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/example.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/example.h
@@ -20,6 +20,7 @@
 #include <unordered_set>
 #include <vector>
 #include "tensorflow/contrib/boosted_trees/lib/utils/optional_value.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace tensorflow {
 namespace boosted_trees {
@@ -124,7 +125,9 @@ struct Example {
   // Sparse integer features indexed by feature column.
   // Note that all integer features are assumed to be categorical, i.e. will
   // never be compared by order. Also these features can be multivalent.
-  std::vector<std::unordered_set<int64>> sparse_int_features;
+  // By default we allocate a InlinedVector of length 1 though since that is
+  // the most common case.
+  std::vector<gtl::InlinedVector<int64, 1>> sparse_int_features;
 };
 
 }  // namespace utils
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
index 1b654e1c44e545fb97216ad950f3cd2d3240ffd0..3c5e0fbbb40a916e6a3c4197007fb2b562682aae 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h
@@ -148,7 +148,7 @@ class ExamplesIterable {
                                                       row_range.start);
           for (int64 row_idx = row_range.start; row_idx < row_range.end;
                ++row_idx) {
-            sparse_int_features[sparse_int_idx].insert(
+            sparse_int_features[sparse_int_idx].push_back(
                 iter_->sparse_int_column_values_[sparse_int_idx](row_idx));
           }
         }
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc
index 30c37435fe16ef29a9e29202850501098e9ac7f8..2f4f2495eaf799a35fb78e183e545f6a1e2d7790 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 // =============================================================================
 #include "tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h"
+#include "absl/algorithm/container.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -90,8 +91,8 @@ TEST_F(ExamplesIterableTest, Iterate) {
         EXPECT_EQ(1.0f, example.sparse_float_features[1][1].get_value());
 
         EXPECT_EQ(2, example.sparse_int_features[0].size());
-        EXPECT_EQ(1, example.sparse_int_features[0].count(1));
-        EXPECT_EQ(1, example.sparse_int_features[0].count(8));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[0], 1));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[0], 8));
         EXPECT_EQ(0, example.sparse_int_features[1].size());
       } break;
       case 1: {
@@ -105,9 +106,9 @@ TEST_F(ExamplesIterableTest, Iterate) {
         EXPECT_FALSE(example.sparse_float_features[1][1].has_value());
 
         EXPECT_EQ(1, example.sparse_int_features[0].size());
-        EXPECT_EQ(1, example.sparse_int_features[0].count(0));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[0], 0));
         EXPECT_EQ(1, example.sparse_int_features[1].size());
-        EXPECT_EQ(1, example.sparse_int_features[1].count(7));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[1], 7));
       } break;
       case 2: {
         EXPECT_EQ(2, example.example_idx);
@@ -122,7 +123,7 @@ TEST_F(ExamplesIterableTest, Iterate) {
 
         EXPECT_EQ(0, example.sparse_int_features[0].size());
         EXPECT_EQ(1, example.sparse_int_features[1].size());
-        EXPECT_EQ(1, example.sparse_int_features[1].count(13));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[1], 13));
       } break;
       case 3: {
         EXPECT_EQ(3, example.example_idx);
@@ -136,10 +137,10 @@ TEST_F(ExamplesIterableTest, Iterate) {
         EXPECT_FALSE(example.sparse_float_features[1][1].has_value());
 
         EXPECT_EQ(2, example.sparse_int_features[0].size());
-        EXPECT_EQ(1, example.sparse_int_features[0].count(2));
-        EXPECT_EQ(1, example.sparse_int_features[0].count(0));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[0], 2));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[0], 0));
         EXPECT_EQ(1, example.sparse_int_features[1].size());
-        EXPECT_EQ(1, example.sparse_int_features[1].count(4));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[1], 4));
       } break;
       case 4: {
         EXPECT_EQ(4, example.example_idx);
@@ -154,7 +155,7 @@ TEST_F(ExamplesIterableTest, Iterate) {
 
         EXPECT_EQ(0, example.sparse_int_features[0].size());
         EXPECT_EQ(1, example.sparse_int_features[1].size());
-        EXPECT_EQ(1, example.sparse_int_features[1].count(0));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[1], 0));
       } break;
       case 5: {
         EXPECT_EQ(5, example.example_idx);
@@ -191,7 +192,7 @@ TEST_F(ExamplesIterableTest, Iterate) {
         EXPECT_FALSE(example.sparse_float_features[1][1].has_value());
 
         EXPECT_EQ(1, example.sparse_int_features[0].size());
-        EXPECT_EQ(1, example.sparse_int_features[0].count(5));
+        EXPECT_EQ(1, absl::c_count(example.sparse_int_features[0], 5));
       } break;
       default: { LOG(QFATAL) << "Invalid example index."; } break;
     }
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
index 05ce0884ccfff53484fdc0c26e596e7fb6fcdfd6..356ae337685d580319da16a20bbab27ccaa73255 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
@@ -34,7 +34,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -62,7 +62,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2, 1],
@@ -91,7 +91,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -123,7 +123,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -133,7 +133,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
 
       with ops.control_dependencies([op1]):
         (stamp_token, num_updates, partition_1, feature_1, grads_1,
-         hessians_1) = accumulator.serialize()
+         hessians_1) = accumulator.saveable.serialize()
       # Make sure that the accumulator hasn't changed during serialization.
       with ops.control_dependencies([stamp_token]):
         num_updates_2, partition_2, feature_2, grads_2, hessians_2 = (
@@ -164,7 +164,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.scalar(),
           hessian_shape=tensor_shape.scalar())
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         # These will be deleted due to deserialize call.
         op1 = accumulator.add(
             stamp_token=0,
@@ -175,7 +175,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
 
       with ops.control_dependencies([op1]):
         deserialize = (
-            accumulator.deserialize(
+            accumulator.saveable.deserialize(
                 stamp_token=2,
                 num_updates=3,
                 partition_ids=[3, 4],
@@ -223,7 +223,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -261,7 +261,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -299,7 +299,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -336,7 +336,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
@@ -349,7 +349,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
 
       with ops.control_dependencies([op1]):
         (stamp_token, num_updates_1, partition_1, feature_1, grads_1,
-         hessians_1) = accumulator.serialize()
+         hessians_1) = accumulator.saveable.serialize()
       # Make sure that the accumulator hasn't changed during serialization.
       with ops.control_dependencies([stamp_token]):
         num_updates_2, partition_2, feature_2, grads_2, hessians_2 = (
@@ -386,7 +386,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           stamp_token=0,
           gradient_shape=tensor_shape.TensorShape([2]),
           hessian_shape=tensor_shape.TensorShape([2, 2]))
-      with ops.control_dependencies([accumulator._create_op]):
+      with ops.control_dependencies([accumulator.initializer]):
         # These will be deleted due to deserialize call.
         op1 = accumulator.add(
             stamp_token=0,
@@ -399,7 +399,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
                                                                     0.08]]])
 
       with ops.control_dependencies([op1]):
-        deserialize = accumulator.deserialize(
+        deserialize = accumulator.saveable.deserialize(
             stamp_token=2,
             num_updates=3,
             partition_ids=[3, 4],
diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
index 843420968ac6a6716fdf6b4967146e131139f67c..4dc764f95713ab788c282c2f3e7fb278a24f4822 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 import abc
 import collections
 
+import six
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -27,11 +29,10 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 
 
+@six.add_metaclass(abc.ABCMeta)
 class ScheduledOp(object):
   """Represents a scheduled remote operation."""
 
-  __metaclass__ = abc.ABCMeta
-
   @abc.abstractmethod
   def batching_key(self):
     """Returns the key for batching operations."""
diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
index 25b2c9e2fd72bd018717e8a87fce726f26bad968..fca22c71a83459cb290eaebcf107cf1c14c222b7 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 # pylint: disable=unused-import
 from tensorflow.contrib.boosted_trees.python.ops import boosted_trees_ops_loader
 # pylint: enable=unused-import
@@ -31,6 +33,7 @@ from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensem
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 ops.NotDifferentiable("TreeEnsembleVariable")
 ops.NotDifferentiable("TreeEnsembleSerialize")
@@ -82,6 +85,44 @@ class TreeEnsembleVariableSavable(saver.BaseSaverBuilder.SaveableObject):
           tree_ensemble_config=restored_tensors[1])
 
 
+class TreeEnsembleVariable(tracking.TrackableResource):
+  """A Tree ensemble model."""
+
+  def __init__(self, stamp_token, tree_ensemble_config, name, container=None):
+    self._stamp_token = stamp_token
+    self._tree_ensemble_config = tree_ensemble_config
+    self._name = name
+    self._container = container
+    self._init_op = None
+    super(TreeEnsembleVariable, self).__init__()
+
+  def create_resource(self):
+    return gen_model_ops.decision_tree_ensemble_resource_handle_op(
+        self._container, shared_name=self._name, name=self._name)
+
+  def initialize(self):
+    return gen_model_ops.create_tree_ensemble_variable(
+        self.resource_handle, self._stamp_token, self._tree_ensemble_config)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_model_ops.tree_ensemble_is_initialized_op(self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    return {
+        "tree_ensemble_variable":
+            functools.partial(
+                TreeEnsembleVariableSavable,
+                tree_ensemble_handle=self.resource_handle,
+                create_op=self.initializer)
+    }
+
+
 def tree_ensemble_variable(stamp_token,
                            tree_ensemble_config,
                            name,
@@ -99,12 +140,11 @@ def tree_ensemble_variable(stamp_token,
     A `Tensor` of type mutable `string`. The handle to the tree ensemble.
   """
   with ops.name_scope(name, "TreeEnsembleVariable") as name:
-    resource_handle = gen_model_ops.decision_tree_ensemble_resource_handle_op(
-        container, shared_name=name, name=name)
-    create_op = gen_model_ops.create_tree_ensemble_variable(
-        resource_handle, stamp_token, tree_ensemble_config)
-    is_initialized_op = gen_model_ops.tree_ensemble_is_initialized_op(
-        resource_handle)
+    tree_ensemble_var = TreeEnsembleVariable(stamp_token, tree_ensemble_config,
+                                             name, container)
+    resource_handle = tree_ensemble_var.resource_handle
+    create_op = tree_ensemble_var.initializer
+    is_initialized_op = tree_ensemble_var.is_initialized()
     # Adds the variable to the savable list.
     saveable = TreeEnsembleVariableSavable(resource_handle, create_op,
                                            resource_handle.name)
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 19b6b3296db394b07f57a25dbde187eb9195af38..0c319cc9bd1f720eb404a9da05227c5807ec874f 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -33,59 +33,20 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
 
 
-class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
-  """A resource that allows distributed quantile computation."""
-
-  def __init__(self,
-               init_stamp_token,
-               epsilon,
-               num_quantiles,
-               max_elements=None,
-               name=None,
-               container=None,
-               generate_quantiles=False):
-    """Creates a QuantileAccumulator object.
-
-    Args:
-      init_stamp_token: The initial value for the stamp token.
-      epsilon: Error bound on the quantile computation.
-      num_quantiles: Number of quantiles to produce from the final summary.
-      max_elements: Maximum number of elements added to the accumulator.
-      name: the name to save the accumulator under.
-      container: An optional `string`. Defaults to `""`
-      generate_quantiles: Generate quantiles instead of approximate boundaries.
-        If true, exactly `num_quantiles` will be produced in the final summary.
-    """
-    self._epsilon = epsilon
-    self._generate_quantiles = generate_quantiles
+class QuantileAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for QuantileAccumulator."""
 
-    name = _PATTERN.sub("", name)
-    with ops.name_scope(name, "QuantileAccumulator") as name:
-      self._quantile_accumulator_handle = (
-          gen_quantile_ops.quantile_stream_resource_handle_op(
-              container=container, shared_name=name, name=name))
-      self._create_op = gen_quantile_ops.create_quantile_accumulator(
-          self._quantile_accumulator_handle,
-          init_stamp_token,
-          epsilon=epsilon,
-          max_elements=max_elements,
-          num_quantiles=num_quantiles,
-          generate_quantiles=generate_quantiles)
-      is_initialized_op = gen_quantile_ops.quantile_accumulator_is_initialized(
-          self._quantile_accumulator_handle)
-    resources.register_resource(self._quantile_accumulator_handle,
-                                self._create_op, is_initialized_op)
-    self._make_savable(name)
-
-  def _make_savable(self, name):
+  def __init__(self, resource_handle, create_op, name):
+    self._resource_handle = resource_handle
+    self._create_op = create_op
     stamp_token, state, are_buckets_ready, buckets = (
-        gen_quantile_ops.quantile_accumulator_serialize(
-            self._quantile_accumulator_handle))
+        gen_quantile_ops.quantile_accumulator_serialize(resource_handle))
     # slice_spec is useful for saving a slice from a variable.
     # It's not meaningful in quantile accumulator.
     slice_spec = ""
@@ -96,9 +57,8 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     specs += [make_save_spec(state, "_state")]
     specs += [make_save_spec(are_buckets_ready, "_are_buckets_ready")]
     specs += [make_save_spec(buckets, "buckets")]
-    super(QuantileAccumulator,
-          self).__init__(self._quantile_accumulator_handle, specs, name)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+    super(QuantileAccumulatorSaveable, self).__init__(self._resource_handle,
+                                                      specs, name)
 
   def restore(self, restored_tensors, unused_restored_shapes):
     """Restores the associated quantile accumulator from 'restored_tensors'.
@@ -119,24 +79,94 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     buckets = restored_tensors[3]
     with ops.control_dependencies([self._create_op]):
       return gen_quantile_ops.quantile_accumulator_deserialize(
-          self._quantile_accumulator_handle,
+          self._resource_handle,
           stamp_token=stamp_token,
           stream_state=state,
           are_buckets_ready=are_buckets_ready,
           buckets=buckets)
 
+
+class QuantileAccumulator(tracking.TrackableResource):
+  """A resource that allows distributed quantile computation."""
+
+  def __init__(self,
+               init_stamp_token,
+               epsilon,
+               num_quantiles,
+               max_elements=None,
+               name=None,
+               container=None,
+               generate_quantiles=False):
+    """Creates a QuantileAccumulator object.
+
+    Args:
+      init_stamp_token: The initial value for the stamp token.
+      epsilon: Error bound on the quantile computation.
+      num_quantiles: Number of quantiles to produce from the final summary.
+      max_elements: Maximum number of elements added to the accumulator.
+      name: the name to save the accumulator under.
+      container: An optional `string`. Defaults to `""`
+      generate_quantiles: Generate quantiles instead of approximate boundaries.
+        If true, exactly `num_quantiles` will be produced in the final summary.
+    """
+    self._init_stamp_token = init_stamp_token
+    self._epsilon = epsilon
+    self._num_quantiles = num_quantiles
+    self._max_elements = max_elements
+    self._container = container
+    self._generate_quantiles = generate_quantiles
+    super(QuantileAccumulator, self).__init__()
+
+    name = _PATTERN.sub("", name)
+    with ops.name_scope(name, "QuantileAccumulator") as name:
+      self._name = name
+      self._resource_handle = self.create_resource()
+      self._init_op = self.initialize()
+      is_initialized_op = self.is_initialized()
+    resources.register_resource(self.resource_handle, self._init_op,
+                                is_initialized_op)
+    self._saveable = QuantileAccumulatorSaveable(self.resource_handle,
+                                                 self._init_op, name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
+
+  def create_resource(self):
+    return gen_quantile_ops.quantile_stream_resource_handle_op(
+        container=self._container, shared_name=self._name, name=self._name)
+
+  def initialize(self):
+    return gen_quantile_ops.create_quantile_accumulator(
+        self.resource_handle,
+        self._init_stamp_token,
+        epsilon=self._epsilon,
+        max_elements=self._max_elements,
+        num_quantiles=self._num_quantiles,
+        generate_quantiles=self._generate_quantiles)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_quantile_ops.quantile_accumulator_is_initialized(
+        self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    return {"quantile_accumulator", self.saveable}
+
   def get_buckets(self, stamp_token):
     """Returns quantile buckets created during previous flush."""
     are_buckets_ready, buckets = (
         gen_quantile_ops.quantile_accumulator_get_buckets(
-            quantile_accumulator_handles=[self._quantile_accumulator_handle],
+            quantile_accumulator_handles=[self.resource_handle],
             stamp_token=stamp_token))
     return are_buckets_ready[0], buckets[0]
 
   def schedule_get_buckets(self):
     """Returns a scheduled read of buckets created during previous flush."""
     return batch_ops_utils.ScheduledStampedResourceOp(
-        resource_handle=self._quantile_accumulator_handle,
+        resource_handle=self.resource_handle,
         op=gen_quantile_ops.quantile_accumulator_get_buckets)
 
   def _make_summary(self, column, example_weights):
@@ -161,14 +191,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     """Adds quantile summary to its stream in resource."""
     summary = self._make_summary(column, example_weights)
     return gen_quantile_ops.quantile_accumulator_add_summaries(
-        quantile_accumulator_handles=[self._quantile_accumulator_handle],
+        quantile_accumulator_handles=[self.resource_handle],
         stamp_token=stamp_token,
         summaries=[summary])
 
   def add_prebuilt_summary(self, stamp_token, summary):
     """Adds quantile summary to its stream in resource."""
     return gen_quantile_ops.quantile_accumulator_add_summaries(
-        quantile_accumulator_handles=[self._quantile_accumulator_handle],
+        quantile_accumulator_handles=[self.resource_handle],
         stamp_token=stamp_token,
         summaries=[summary])
 
@@ -177,7 +207,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
     summary = self._make_summary(column, example_weights)
     return batch_ops_utils.ScheduledStampedResourceOp(
         op=gen_quantile_ops.quantile_accumulator_add_summaries,
-        resource_handle=self._quantile_accumulator_handle,
+        resource_handle=self.resource_handle,
         summaries=summary)
 
   def flush(self, stamp_token, next_stamp_token):
@@ -190,17 +220,14 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
       The flush operation.
     """
     return gen_quantile_ops.quantile_accumulator_flush(
-        quantile_accumulator_handle=self._quantile_accumulator_handle,
+        quantile_accumulator_handle=self.resource_handle,
         stamp_token=stamp_token,
         next_stamp_token=next_stamp_token)
 
   def flush_summary(self, stamp_token, next_stamp_token):
     """Finalizes quantile summary stream and resets it for next iteration."""
     result = gen_quantile_ops.quantile_accumulator_flush_summary(
-        quantile_accumulator_handle=self._quantile_accumulator_handle,
+        quantile_accumulator_handle=self.resource_handle,
         stamp_token=stamp_token,
         next_stamp_token=next_stamp_token)
     return result
-
-  def resource(self):
-    return self._quantile_accumulator_handle
diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
index 2e94e353f325f06eed2d290d3a7a461861820c39..ad1191d41236e71008bff8c8a7fbd42c16e3f9c5 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
@@ -26,12 +26,83 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
 
 
-class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
+class StatsAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for StatsAccumulator."""
+
+  def __init__(self, resource_handle, create_op, is_scalar, name):
+    self._create_op = create_op
+    self._resource_handle = resource_handle
+    self._is_scalar = is_scalar
+    slice_spec = ""
+    saver_name = self._resource_handle.name
+    (stamp_token, num_updates, partition_ids, feature_ids, gradients,
+     hessians) = self.serialize()
+    specs = [
+        saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec,
+                                        saver_name + "_stamp"),
+        saver.BaseSaverBuilder.SaveSpec(num_updates, slice_spec,
+                                        saver_name + "_num_updates"),
+        saver.BaseSaverBuilder.SaveSpec(partition_ids, slice_spec,
+                                        saver_name + "_partition_ids"),
+        saver.BaseSaverBuilder.SaveSpec(feature_ids, slice_spec,
+                                        saver_name + "_feature_ids"),
+        saver.BaseSaverBuilder.SaveSpec(gradients, slice_spec,
+                                        saver_name + "_gradients"),
+        saver.BaseSaverBuilder.SaveSpec(hessians, slice_spec,
+                                        saver_name + "hessians"),
+    ]
+    super(StatsAccumulatorSaveable, self).__init__(self._resource_handle, specs,
+                                                   name)
+
+  def serialize(self):
+    """Serializes the stats accumulator state."""
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_serialize(
+          self._resource_handle)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_serialize(
+          self._resource_handle)
+
+  def deserialize(self, stamp_token, num_updates, partition_ids, feature_ids,
+                  gradients, hessians):
+    """Resets the stats accumulator with the serialized state."""
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_deserialize(
+          self._resource_handle, stamp_token, num_updates, partition_ids,
+          feature_ids, gradients, hessians)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_deserialize(
+          self._resource_handle, stamp_token, num_updates, partition_ids,
+          feature_ids, gradients, hessians)
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree ensemble from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree ensemble variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return self.deserialize(
+          stamp_token=restored_tensors[0],
+          num_updates=restored_tensors[1],
+          partition_ids=restored_tensors[2],
+          feature_ids=restored_tensors[3],
+          gradients=restored_tensors[4],
+          hessians=restored_tensors[5])
+
+
+class StatsAccumulator(tracking.TrackableResource):
   """A resource that allows to accumulate gradients and hessians.
 
   For consistency guarantees, we use read and write stamp tokens.
@@ -58,58 +129,69 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
     Returns:
       A `Tensor` of type mutable `string`. The handle to the stats accumulator.
     """
+    self._stamp_token = stamp_token
+    self._gradient_shape = gradient_shape
+    self._hessian_shape = hessian_shape
+    self._container = container
+
+    if (gradient_shape == tensor_shape.scalar() and
+        hessian_shape == tensor_shape.scalar()):
+      self._is_scalar = True
+    else:
+      self._is_scalar = False
+
     if name is not None:
       name = _PATTERN.sub("", name)
     with ops.name_scope(name, "StatsAccumulator") as name:
-      # Both values are scalars.
-      if (gradient_shape == tensor_shape.scalar() and
-          hessian_shape == tensor_shape.scalar()):
-        self._is_scalar = True
-        self._resource_handle = (gen_stats_accumulator_ops.
-                                 stats_accumulator_scalar_resource_handle_op(
-                                     container, name, name=name))
-
-        create_op = gen_stats_accumulator_ops.create_stats_accumulator_scalar(
-            self._resource_handle, stamp_token)
-        is_initialized_op = (
-            gen_stats_accumulator_ops.stats_accumulator_scalar_is_initialized(
-                self._resource_handle))
-      else:
-        self._is_scalar = False
-        self._resource_handle = (gen_stats_accumulator_ops.
-                                 stats_accumulator_tensor_resource_handle_op(
-                                     container, name, name=name))
-        create_op = gen_stats_accumulator_ops.create_stats_accumulator_tensor(
-            self._resource_handle, stamp_token, gradient_shape.as_list(),
-            hessian_shape.as_list())
-        is_initialized_op = (
-            gen_stats_accumulator_ops.stats_accumulator_tensor_is_initialized(
-                self._resource_handle))
+      self._name = name
+      self._resource_handle = self.create_resource()
+      self._init_op = self.initialize()
+      is_initialized_op = self.is_initialized()
+    resources.register_resource(self.resource_handle, self.initializer,
+                                is_initialized_op)
+    self._saveable = StatsAccumulatorSaveable(
+        self.resource_handle, self.initializer, self._is_scalar, name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
-    self._create_op = create_op
-    slice_spec = ""
-    saver_name = self._resource_handle.name
-    (stamp_token, num_updates, partition_ids, feature_ids, gradients,
-     hessians) = self.serialize()
-    specs = [
-        saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec,
-                                        saver_name + "_stamp"),
-        saver.BaseSaverBuilder.SaveSpec(num_updates, slice_spec,
-                                        saver_name + "_num_updates"),
-        saver.BaseSaverBuilder.SaveSpec(partition_ids, slice_spec,
-                                        saver_name + "_partition_ids"),
-        saver.BaseSaverBuilder.SaveSpec(feature_ids, slice_spec,
-                                        saver_name + "_feature_ids"),
-        saver.BaseSaverBuilder.SaveSpec(gradients, slice_spec,
-                                        saver_name + "_gradients"),
-        saver.BaseSaverBuilder.SaveSpec(hessians, slice_spec,
-                                        saver_name + "hessians"),
-    ]
+  def create_resource(self):
+    if self._is_scalar:
+      return (
+          gen_stats_accumulator_ops.stats_accumulator_scalar_resource_handle_op(
+              self._container, self._name, name=self._name))
+    else:
+      return (
+          gen_stats_accumulator_ops.stats_accumulator_tensor_resource_handle_op(
+              self._container, self._name, name=self._name))
 
-    super(StatsAccumulator, self).__init__(self._resource_handle, specs, name)
-    resources.register_resource(self._resource_handle, create_op,
-                                is_initialized_op)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+  def initialize(self):
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.create_stats_accumulator_scalar(
+          self.resource_handle, self._stamp_token)
+    else:
+      return gen_stats_accumulator_ops.create_stats_accumulator_tensor(
+          self.resource_handle, self._stamp_token,
+          self._gradient_shape.as_list(), self._hessian_shape.as_list())
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    if self._is_scalar:
+      return gen_stats_accumulator_ops.stats_accumulator_scalar_is_initialized(
+          self.resource_handle)
+    else:
+      return gen_stats_accumulator_ops.stats_accumulator_tensor_is_initialized(
+          self.resource_handle)
+
+  @property
+  def saveable(self):
+    return self._saveable
+
+  def _gather_saveables_for_checkpoint(self):
+    return {"stats_accumulator", self.saveable}
 
   def add(self, stamp_token, partition_ids, feature_ids, gradients, hessians):
     """Updates the stats accumulator."""
@@ -117,11 +199,11 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
         partition_ids, feature_ids, gradients, hessians))
     if self._is_scalar:
       return gen_stats_accumulator_ops.stats_accumulator_scalar_add(
-          [self._resource_handle], stamp_token, [partition_ids], [feature_ids],
+          [self.resource_handle], stamp_token, [partition_ids], [feature_ids],
           [gradients], [hessians])
     else:
       return gen_stats_accumulator_ops.stats_accumulator_tensor_add(
-          [self._resource_handle], stamp_token, [partition_ids], [feature_ids],
+          [self.resource_handle], stamp_token, [partition_ids], [feature_ids],
           [gradients], [hessians])
 
   def schedule_add(self, partition_ids, feature_ids, gradients, hessians):
@@ -131,7 +213,7 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
     if self._is_scalar:
       return batch_ops_utils.ScheduledStampedResourceOp(
           op=gen_stats_accumulator_ops.stats_accumulator_scalar_add,
-          resource_handle=self._resource_handle,
+          resource_handle=self.resource_handle,
           partition_ids=partition_ids,
           feature_ids=feature_ids,
           gradients=gradients,
@@ -139,7 +221,7 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
     else:
       return batch_ops_utils.ScheduledStampedResourceOp(
           op=gen_stats_accumulator_ops.stats_accumulator_tensor_add,
-          resource_handle=self._resource_handle,
+          resource_handle=self.resource_handle,
           partition_ids=partition_ids,
           feature_ids=feature_ids,
           gradients=gradients,
@@ -153,55 +235,11 @@ class StatsAccumulator(saver.BaseSaverBuilder.SaveableObject):
       return gen_stats_accumulator_ops.stats_accumulator_tensor_make_summary(
           partition_ids, feature_ids, gradients, hessians)
 
-  def deserialize(self, stamp_token, num_updates, partition_ids, feature_ids,
-                  gradients, hessians):
-    """Resets the stats accumulator with the serialized state."""
-    if self._is_scalar:
-      return gen_stats_accumulator_ops.stats_accumulator_scalar_deserialize(
-          self._resource_handle, stamp_token, num_updates, partition_ids,
-          feature_ids, gradients, hessians)
-    else:
-      return gen_stats_accumulator_ops.stats_accumulator_tensor_deserialize(
-          self._resource_handle, stamp_token, num_updates, partition_ids,
-          feature_ids, gradients, hessians)
-
   def flush(self, stamp_token, next_stamp_token):
     """Flushes the stats accumulator."""
     if self._is_scalar:
       return gen_stats_accumulator_ops.stats_accumulator_scalar_flush(
-          self._resource_handle, stamp_token, next_stamp_token)
+          self.resource_handle, stamp_token, next_stamp_token)
     else:
       return gen_stats_accumulator_ops.stats_accumulator_tensor_flush(
-          self._resource_handle, stamp_token, next_stamp_token)
-
-  def serialize(self):
-    """Serializes the stats accumulator state."""
-    if self._is_scalar:
-      return gen_stats_accumulator_ops.stats_accumulator_scalar_serialize(
-          self._resource_handle)
-    else:
-      return gen_stats_accumulator_ops.stats_accumulator_tensor_serialize(
-          self._resource_handle)
-
-  def restore(self, restored_tensors, unused_restored_shapes):
-    """Restores the associated tree ensemble from 'restored_tensors'.
-
-    Args:
-      restored_tensors: the tensors that were loaded from a checkpoint.
-      unused_restored_shapes: the shapes this object should conform to after
-        restore. Not meaningful for trees.
-
-    Returns:
-      The operation that restores the state of the tree ensemble variable.
-    """
-    with ops.control_dependencies([self._create_op]):
-      return self.deserialize(
-          stamp_token=restored_tensors[0],
-          num_updates=restored_tensors[1],
-          partition_ids=restored_tensors[2],
-          feature_ids=restored_tensors[3],
-          gradients=restored_tensors[4],
-          hessians=restored_tensors[5])
-
-  def resource(self):
-    return self._resource_handle
+          self.resource_handle, stamp_token, next_stamp_token)
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 8531e97f90236b8e5eb64bc0f4c9bb3b674f35cd..9fdc2fc0c2c7b85502f7a3f9ae7c85cf05d5916c 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -22,7 +22,6 @@ import collections
 import copy
 
 from tensorflow.contrib import learn
-from tensorflow.contrib import stateless
 from tensorflow.contrib.boosted_trees.lib.learner.batch import categorical_split_handler
 from tensorflow.contrib.boosted_trees.lib.learner.batch import ordinal_split_handler
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
@@ -44,6 +43,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import stateless_random_ops as stateless
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
@@ -386,10 +386,21 @@ class GradientBoostedDecisionTreeModel(object):
         learner_pb2.LearnerConfig.GROWING_MODE_UNSPECIFIED):
       learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
 
+    if (learner_config.weak_learner_type == learner_pb2.LearnerConfig
+        .OBLIVIOUS_DECISION_TREE and learner_config.pruning_mode == learner_pb2
+        .LearnerConfig.PRUNING_MODE_UNSPECIFIED):
+      learner_config.pruning_mode = learner_pb2.LearnerConfig.PRE_PRUNE
+
     if (learner_config.pruning_mode ==
         learner_pb2.LearnerConfig.PRUNING_MODE_UNSPECIFIED):
       learner_config.pruning_mode = learner_pb2.LearnerConfig.POST_PRUNE
 
+    if (learner_config.weak_learner_type == learner_pb2.LearnerConfig
+        .OBLIVIOUS_DECISION_TREE and
+        learner_config.pruning_mode == learner_pb2.LearnerConfig.POST_PRUNE):
+      raise ValueError(
+          "Post pruning is not implmented for oblivious decision trees.")
+
     if learner_config.constraints.max_tree_depth == 0:
       # Use 6 as the default maximum depth.
       learner_config.constraints.max_tree_depth = 6
@@ -418,6 +429,11 @@ class GradientBoostedDecisionTreeModel(object):
      sparse_float_shapes, sparse_int_indices,
      sparse_int_values, sparse_int_shapes) = extract_features(
          features, self._feature_columns, use_core_columns)
+    if (learner_config.weak_learner_type == learner_pb2.LearnerConfig
+        .OBLIVIOUS_DECISION_TREE and sparse_float_indices):
+      raise ValueError("Oblivious trees don't handle sparse float features yet."
+                      )
+
     logging.info("Active Feature Columns: " + str(fc_names))
     logging.info("Learner config: " + str(learner_config))
     self._fc_names = fc_names
@@ -881,9 +897,9 @@ class GradientBoostedDecisionTreeModel(object):
     empty_hess_shape = [1] + self._hessian_shape.as_list()
     empty_grad_shape = [1] + self._gradient_shape.as_list()
 
-    empty_gradients = constant_op.constant(
+    empty_gradients = constant_op.constant_v1(
         [], dtype=dtypes.float32, shape=empty_grad_shape)
-    empty_hessians = constant_op.constant(
+    empty_hessians = constant_op.constant_v1(
         [], dtype=dtypes.float32, shape=empty_hess_shape)
 
     active_handlers = array_ops.unstack(active_handlers, axis=0)
@@ -976,7 +992,7 @@ class GradientBoostedDecisionTreeModel(object):
 
         # Get accumulated steps and examples for the current layer.
         _, _, _, _, acc_examples, acc_steps = (
-            steps_accumulator.serialize())
+            steps_accumulator.saveable.serialize())
         acc_examples = math_ops.cast(acc_examples[0], dtypes.int64)
         acc_steps = math_ops.cast(acc_steps[0], dtypes.int64)
         ensemble_update_ops.append(
@@ -1241,13 +1257,12 @@ class GradientBoostedDecisionTreeModel(object):
   def _get_replica_device_setter(self, worker_device):
     """Creates a replica device setter."""
     ps_tasks = self._num_ps_replicas
-    ps_ops = [
-        "Variable",
-        "VariableV2",
+    ps_ops = list(device_setter.STANDARD_PS_OPS)
+    ps_ops.extend([
         "DecisionTreeEnsembleResourceHandleOp",
         "StatsAccumulatorScalarResourceHandleOp",
         "StatsAccumulatorTensorResourceHandleOp",
-    ]
+    ])
     ps_strategy = _OpRoundRobinStrategy(ps_ops, ps_tasks)
     return device_setter.replica_device_setter(
         worker_device=worker_device,
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 6d20a2e7f482953481fb1effe4c6e2e5a300786f..92068e88a76cb8bfdd394c1093347a8fb8a63449 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -1257,6 +1257,96 @@ class GbdtTest(test_util.TensorFlowTestCase):
       self.assertArrayNear(expected_leaf_2,
                            output.trees[0].nodes[2].leaf.vector.value, 1e-3)
 
+  def testTrainFnMulticlassDiagonalHessianOblivious(self):
+    """Tests the GBDT train for multiclass diagonal hessian."""
+    with self.cached_session():
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
+
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 1
+      # Use full hessian multiclass strategy.
+      learner_config.multi_class_strategy = (
+          learner_pb2.LearnerConfig.DIAGONAL_HESSIAN)
+      learner_config.num_classes = 5
+      learner_config.regularization.l1 = 0
+      # To make matrix inversible.
+      learner_config.regularization.l2 = 1e-5
+      learner_config.weak_learner_type = (
+          learner_pb2.LearnerConfig.OBLIVIOUS_DECISION_TREE)
+      learner_config.pruning_mode = learner_pb2.LearnerConfig.PRE_PRUNE
+      learner_config.constraints.max_tree_depth = 5
+      learner_config.constraints.min_node_weight = 0
+      batch_size = 3
+      features = {}
+      features["sparse_int"] = sparse_tensor.SparseTensor(
+          array_ops.constant([[0, 0], [1, 0]], dtypes.int64),
+          array_ops.constant([1, 2], dtypes.int64),
+          array_ops.constant([3, 1], dtypes.int64))
+
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=True,
+          num_ps_replicas=0,
+          center_bias=False,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          logits_dimension=5,
+          features=features)
+
+      labels = array_ops.constant([[2], [2], [3]], dtype=dtypes.float32)
+      weights = array_ops.ones([batch_size, 1], dtypes.float32)
+
+      predictions_dict = gbdt_model.predict(learn.ModeKeys.TRAIN)
+      predictions = predictions_dict["predictions"]
+
+      # Create train op.
+      train_op = gbdt_model.train(
+          loss=math_ops.reduce_mean(
+              losses.per_example_maxent_loss(
+                  labels,
+                  weights,
+                  predictions,
+                  num_classes=learner_config.num_classes)[0]),
+          predictions_dict=predictions_dict,
+          labels=labels)
+      variables.global_variables_initializer().run()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+
+      # Grow 2 layers.
+      train_op.run()
+      train_op.run()
+
+      output = tree_config_pb2.DecisionTreeEnsembleConfig()
+      output.ParseFromString(serialized.eval())
+
+      stamp_token, serialized = model_ops.tree_ensemble_serialize(
+          ensemble_handle)
+      output.ParseFromString(serialized.eval())
+      self.assertEqual(len(output.trees), 1)
+      # We got 6 nodes: one parent and 4 leafs.
+      self.assertEqual(len(output.trees[0].nodes), 6)
+      self.assertAllClose(output.tree_weights, [1])
+      self.assertEqual(stamp_token.eval(), 2)
+
+      print(output.trees[0])
+      # Leafs should have a dense vector of size 5.
+      expected_leaf_1 = [-1.2497, -1.24976, 4.999, -1.24976, -1.2497]
+      expected_leaf_2 = [-2.2362, -2.2362, 6.0028, -2.2362, -2.2362]
+      expected_leaf_3 = [-2.2694, -2.2694, 4.0064, -0.0084, -2.2694]
+      expected_leaf_4 = [-2.2694, -2.2694, -0.0084, 4.0064, -2.2694]
+      self.assertArrayNear(expected_leaf_1,
+                           output.trees[0].nodes[2].leaf.vector.value, 1e-3)
+      self.assertArrayNear(expected_leaf_2,
+                           output.trees[0].nodes[3].leaf.vector.value, 1e-3)
+      self.assertArrayNear(expected_leaf_3,
+                           output.trees[0].nodes[4].leaf.vector.value, 1e-3)
+      self.assertArrayNear(expected_leaf_4,
+                           output.trees[0].nodes[5].leaf.vector.value, 1e-3)
+
   def testTrainFnMulticlassTreePerClass(self):
     """Tests the GBDT train for multiclass tree per class strategy."""
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py
index b5ebaf1999519f65110e8164fa20bace5ecc3ef6..220e981618b7c0bfb1e4e98c087d83b451b9b3cf 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py
@@ -48,6 +48,47 @@ def per_example_logistic_loss(labels, weights, predictions):
       labels=labels, logits=predictions)
   return unweighted_loss * weights, control_flow_ops.no_op()
 
+# MUST USE WITH HESSIAN REGULARIZATION,
+# This loss can have zero hessian, so it must be used with l2 or min_node_weight
+# regularization.
+# An example config is
+# learner_config.constraints.min_node_weight = 1 / num_examples_per_layer
+# learner_config.regularization.l2 = 1.0 / num_examples_per_layer
+# TODO(nponomareva): make it multidimensional so we can estimate several
+# quantiles at once.
+def per_example_quantile_regression_loss(labels, weights, predictions,
+                                         quantile):
+  """Smoothed loss for quantile regression.
+
+  The standard quantile regression loss is quantile*(y-y') when y>y' and
+  (quantile-1)*(y-y') otherwise, y' is a prediction, y is a label. The impl
+  below is this loss but squared in the region where the loss value < 1.
+
+  Args:
+    labels: Rank 2 (N, D) tensor of per-example labels.
+    weights: Rank 2 (N, 1) tensor of per-example weights.
+    predictions: Rank 2 (N, D) tensor of per-example predictions.
+    quantile: The quantile to use.
+
+  Returns:
+    loss: A Rank 2 (N, 1) tensor of per-example quantile loss.
+    update_op: An update operation to update the loss's internal state.
+  """
+  labels = math_ops.to_float(labels)
+  error = labels - predictions
+  square_loss_right = array_ops.where(error * quantile < 1.0,
+                                      math_ops.square(quantile * error),
+                                      quantile * error)
+  square_loss_left = array_ops.where(error * (quantile - 1) < 1,
+                                     math_ops.square((quantile - 1) * error),
+                                     (quantile - 1) * error)
+
+  unweighted_loss = array_ops.where(error > 0, square_loss_right,
+                                    square_loss_left)
+  if weights is None:
+    return unweighted_loss, control_flow_ops.no_op()
+  else:
+    return unweighted_loss * weights, control_flow_ops.no_op()
 
 # This is classical form of Maximum entropy loss, that is twice differentiable
 # (sparse_softmax_cross_entropy which is what we go for is not twice
@@ -78,8 +119,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
     labels = array_ops.expand_dims(labels, 1)
   # Labels are indices of classes, convert them to one hot encodings.
   target_one_hot = array_ops.one_hot(indices=labels, depth=num_classes)
-  labels = math_ops.reduce_sum(
-      input_tensor=target_one_hot, reduction_indices=[1])
+  labels = math_ops.reduce_sum(input_tensor=target_one_hot, axis=[1])
   labels = math_ops.to_float(labels)
 
   # Calculate softmax probabilities for each class.
diff --git a/tensorflow/contrib/checkpoint/python/containers.py b/tensorflow/contrib/checkpoint/python/containers.py
index 242c1e8ba45e0b2f6f9a1a51695b824546382666..5418e2605b724edb60878e250d2c50fcc6ff5633 100644
--- a/tensorflow/contrib/checkpoint/python/containers.py
+++ b/tensorflow/contrib/checkpoint/python/containers.py
@@ -46,6 +46,10 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure):
     self._maybe_initialize_checkpointable()
     self._name_counts = {}
 
+  @property
+  def _values(self):
+    return [dep.ref for dep in self._checkpoint_dependencies]
+
   def track(self, checkpointable, base_name):
     """Add a dependency on `checkpointable`.
 
diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD
index 707f6211846ca0310bde297603928e9ec5bb471c..f944b7f88438ff257a44581170ead16640540e69 100644
--- a/tensorflow/contrib/cluster_resolver/BUILD
+++ b/tensorflow/contrib/cluster_resolver/BUILD
@@ -21,91 +21,25 @@ py_library(
 
 py_library(
     name = "cluster_resolver_py",
-    srcs = [
+    srcs = glob([
         "__init__.py",
-        "python/training/__init__.py",
-    ],
+        "python/training/*.py",
+    ]),
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = [
-        ":base_cluster_resolver_py",
-        ":gce_cluster_resolver_py",
-        ":tpu_cluster_resolver_py",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "base_cluster_resolver_py",
-    srcs = ["python/training/cluster_resolver.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:training",
-    ],
-)
-
-py_library(
-    name = "gce_cluster_resolver_py",
-    srcs = ["python/training/gce_cluster_resolver.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":base_cluster_resolver_py",
-        "//tensorflow/python:training",
-    ],
-)
-
-py_library(
-    name = "tpu_cluster_resolver_py",
-    srcs = ["python/training/tpu_cluster_resolver.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":base_cluster_resolver_py",
-        "//tensorflow/python:training",
-    ],
-)
-
-tf_py_test(
-    name = "base_cluster_resolver_py_test",
-    srcs = ["python/training/cluster_resolver_test.py"],
-    additional_deps = [
-        ":cluster_resolver_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
-    ],
-    main = "python/training/cluster_resolver_test.py",
+    deps = ["//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib"],
 )
 
 tf_py_test(
-    name = "gce_cluster_resolver_py_test",
-    size = "small",
-    srcs = ["python/training/gce_cluster_resolver_test.py"],
+    name = "cluster_resolver_initialization_test",
+    srcs = ["cluster_resolver_initialization_test.py"],
     additional_deps = [
         ":cluster_resolver_py",
-        ":gce_cluster_resolver_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
-    ],
-    main = "python/training/gce_cluster_resolver_test.py",
-)
-
-tf_py_test(
-    name = "tpu_cluster_resolver_py_test",
-    size = "small",
-    srcs = ["python/training/tpu_cluster_resolver_test.py"],
-    additional_deps = [
-        ":tpu_cluster_resolver_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
     ],
-    grpc_enabled = True,
-    main = "python/training/tpu_cluster_resolver_test.py",
+    main = "cluster_resolver_initialization_test.py",
 )
diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py
index b4d8cd4a7cf42e910e7506dbeec8656a2cef62eb..390b3e7550b3d991269bb84707c3500f2fa33290 100644
--- a/tensorflow/contrib/cluster_resolver/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/__init__.py
@@ -20,11 +20,14 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import SimpleClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
+from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
 # pylint: enable=wildcard-import,unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -34,7 +37,10 @@ _allowed_symbols = [
     'SimpleClusterResolver',
     'UnionClusterResolver',
     'GceClusterResolver',
+    'KubernetesClusterResolver',
+    'TFConfigClusterResolver',
     'TPUClusterResolver',
+    'SlurmClusterResolver',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cluster_resolver/cluster_resolver_initialization_test.py b/tensorflow/contrib/cluster_resolver/cluster_resolver_initialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..01ff1478c694cf0901aeed48b6e0f873d8abe65e
--- /dev/null
+++ b/tensorflow/contrib/cluster_resolver/cluster_resolver_initialization_test.py
@@ -0,0 +1,53 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests to ensure ClusterResolvers are usable via the old contrib path."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.cluster_resolver import SimpleClusterResolver
+from tensorflow.contrib.cluster_resolver.python.training import cluster_resolver
+from tensorflow.contrib.cluster_resolver.python.training import UnionClusterResolver
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+class ClusterResolverInitializationTest(test.TestCase):
+
+  def testCreateSimpleClusterResolverFromLib(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    cluster_resolver.SimpleClusterResolver(base_cluster_spec)
+
+  def testCreateSimpleClusterResolver(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    SimpleClusterResolver(base_cluster_spec)
+
+  def testCreateUnionClusterResolver(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    simple_cr = SimpleClusterResolver(base_cluster_spec)
+    UnionClusterResolver(simple_cr)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/cluster_resolver/python/training/__init__.py b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
index 0b0464b7d2ddbd26b588bafc9624d412de326f6a..10d93549ebbd4f7e900796d0516b0af1744224af 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
@@ -18,8 +18,36 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import SimpleClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
+from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'cluster_resolver',
+    'gce_cluster_resolver',
+    'kubernetes_cluster_resolver',
+    'slurm_cluster_resolver',
+    'tfconfig_cluster_resolver',
+    'tpu_cluster_resolver',
+    'ClusterResolver',
+    'SimpleClusterResolver',
+    'UnionClusterResolver',
+    'GceClusterResolver',
+    'KubernetesClusterResolver',
+    'TFConfigClusterResolver',
+    'TPUClusterResolver',
+    'SlurmClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
index 1c480b25134b1e54200e0ddb780bd7bb0f122341..99840fb5166dd739b3bee06a926e06b534011d1f 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,181 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Cluster Resolvers are used for dynamic cluster IP/hostname resolution."""
+"""Stub file for ClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
 
-from tensorflow.python.training.server_lib import ClusterSpec
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
+# pylint: enable=unused-import
 
+from tensorflow.python.util.all_util import remove_undocumented
 
-class ClusterResolver(object):
-  """Abstract class for all implementations of ClusterResolvers.
+_allowed_symbols = [
+    'ClusterResolver',
+    'SimpleClusterResolver',
+    'UnionClusterResolver',
+]
 
-  This defines the skeleton for all implementations of ClusterResolvers.
-  ClusterResolvers are a way for TensorFlow to communicate with various cluster
-  management systems (e.g. GCE, AWS, etc...).
+remove_undocumented(__name__, _allowed_symbols)
 
-  By letting TensorFlow communicate with these systems, we will be able to
-  automatically discover and resolve IP addresses for various TensorFlow
-  workers. This will eventually allow us to automatically recover from
-  underlying machine failures and scale TensorFlow worker clusters up and down.
-  """
-
-  @abc.abstractmethod
-  def cluster_spec(self):
-    """Retrieve the current state of the cluster and returns a ClusterSpec.
-
-    Returns:
-      A ClusterSpec representing the state of the cluster at the moment this
-      function is called.
-
-    Implementors of this function must take care in ensuring that the
-    ClusterSpec returned is up-to-date at the time of calling this function.
-    This usually means retrieving the information from the underlying cluster
-    management system every time this function is invoked and reconstructing
-    a cluster_spec, rather than attempting to cache anything.
-    """
-    raise NotImplementedError(
-        'cluster_spec is not implemented for {}.'.format(self))
-
-  @abc.abstractmethod
-  def master(self):
-    """..."""
-    raise NotImplementedError('master is not implemented for {}.'.format(self))
-
-
-class SimpleClusterResolver(ClusterResolver):
-  """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
-
-  def __init__(self, cluster_spec, master=''):
-    """Creates a SimpleClusterResolver from a ClusterSpec."""
-    super(SimpleClusterResolver, self).__init__()
-
-    if not isinstance(cluster_spec, ClusterSpec):
-      raise TypeError('cluster_spec must be a ClusterSpec.')
-    self._cluster_spec = cluster_spec
-
-    if not isinstance(master, str):
-      raise TypeError('master must be a string.')
-    self._master = master
-
-  def cluster_spec(self):
-    """Returns the ClusterSpec passed into the constructor."""
-    return self._cluster_spec
-
-  def master(self):
-    """Returns the master address to use when creating a session."""
-    return self._master
-
-
-class UnionClusterResolver(ClusterResolver):
-  """Performs a union on underlying ClusterResolvers.
-
-  This class performs a union given two or more existing ClusterResolvers. It
-  merges the underlying ClusterResolvers, and returns one unified ClusterSpec
-  when cluster_spec is called. The details of the merge function is
-  documented in the cluster_spec function.
-  """
-
-  def __init__(self, *args):
-    """Initializes a UnionClusterResolver with other ClusterResolvers.
-
-    Args:
-      *args: `ClusterResolver` objects to be unionized.
-
-    Raises:
-      TypeError: If any argument is not a subclass of `ClusterResolvers`.
-      ValueError: If there are no arguments passed.
-    """
-    super(UnionClusterResolver, self).__init__()
-
-    if not args:
-      raise ValueError('At least one ClusterResolver is required.')
-
-    for cluster_resolver in args:
-      if not isinstance(cluster_resolver, ClusterResolver):
-        raise TypeError('All arguments must be a sub-class of '
-                        '`ClusterResolver.`')
-    self._cluster_resolvers = args
-
-  def cluster_spec(self):
-    """Returns a union of all the ClusterSpecs from the ClusterResolvers.
-
-    Returns:
-      A ClusterSpec containing host information merged from all the underlying
-      ClusterResolvers.
-
-    Raises:
-      KeyError: If there are conflicting keys detected when merging two or
-      more dictionaries, this exception is raised.
-
-    Note: If there are multiple ClusterResolvers exposing ClusterSpecs with the
-    same job name, we will merge the list/dict of workers.
-
-    If *all* underlying ClusterSpecs expose the set of workers as lists, we will
-    concatenate the lists of workers, starting with the list of workers from
-    the first ClusterResolver passed into the constructor.
-
-    If *any* of the ClusterSpecs expose the set of workers as a dict, we will
-    treat all the sets of workers as dicts (even if they are returned as lists)
-    and will only merge them into a dict if there is no conflicting keys. If
-    there is a conflicting key, we will raise a `KeyError`.
-    """
-
-    merged_cluster = {}
-
-    # We figure out whether it is all lists for a particular job, or whether
-    # there are dicts inside.
-    for cluster_resolver in self._cluster_resolvers:
-      cluster_spec = cluster_resolver.cluster_spec()
-      cluster_dict = cluster_spec.as_dict()
-
-      for job_name, tasks in cluster_dict.items():
-        if job_name in merged_cluster:
-          # If we see a dict, then we write a dict out regardless.
-          if isinstance(tasks, dict):
-            merged_cluster[job_name] = {}
-        else:
-          # We take whichever type is present.
-          if isinstance(tasks, list):
-            merged_cluster[job_name] = []
-          else:
-            merged_cluster[job_name] = {}
-
-    # We then do the merge as appropriate in merged_cluster[job].
-    for cluster_resolver in self._cluster_resolvers:
-      cluster_spec = cluster_resolver.cluster_spec()
-      cluster_dict = cluster_spec.as_dict()
-
-      for job_name, tasks in cluster_dict.items():
-        if isinstance(merged_cluster[job_name], list):
-          # We all have lists, we can just concatenate and be done.
-          merged_cluster[job_name].extend(tasks)
-        else:
-          if isinstance(tasks, list):
-            # We convert to a dictionary if the type is a list.
-            task_dict = dict(zip(range(0, len(tasks)), tasks))
-          else:
-            # We can simply make a copy (for update) and be done.
-            task_dict = tasks.copy()
-
-          # We detect if there are duplicates, and raise an error if so.
-          task_keys = set(task_dict)
-          merged_keys = set(merged_cluster[job_name].keys())
-          intersected_keys = task_keys.intersection(merged_keys)
-          if intersected_keys:
-            raise KeyError('Duplicate keys detected when merging two '
-                           'ClusterSpecs: %s' % repr(intersected_keys))
-
-          # We do the merge after all the processing.
-          merged_cluster[job_name].update(task_dict)
-
-    return ClusterSpec(merged_cluster)
-
-  def master(self):
-    """master returns the master address from the first cluster resolver."""
-    return self._cluster_resolvers[0].master()
diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
deleted file mode 100644
index d9c97d53eb3663f6ab2f7b40395592dc7638b896..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
+++ /dev/null
@@ -1,240 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Cluster Resolvers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import SimpleClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
-from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
-
-
-class UnionClusterResolverTest(test.TestCase):
-  # TODO(frankchn): Transform to parameterized test after it is included in the
-  # TF open source codebase.
-
-  def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
-    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
-    self.assertProtoEquals(
-        expected_proto, server_lib.ClusterSpec(cluster_spec).as_cluster_def())
-    self.assertProtoEquals(
-        expected_proto,
-        server_lib.ClusterSpec(cluster_spec.as_cluster_def()).as_cluster_def())
-    self.assertProtoEquals(
-        expected_proto,
-        server_lib.ClusterSpec(cluster_spec.as_dict()).as_cluster_def())
-
-  def testSingleClusterResolver(self):
-    base_cluster_spec = server_lib.ClusterSpec({
-        "ps": ["ps0:2222", "ps1:2222"],
-        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
-    })
-    simple_resolver = SimpleClusterResolver(base_cluster_spec)
-    union_resolver = UnionClusterResolver(simple_resolver)
-
-    expected_proto = """
-    job { name: 'ps' tasks { key: 0 value: 'ps0:2222' }
-                     tasks { key: 1 value: 'ps1:2222' } }
-    job { name: 'worker' tasks { key: 0 value: 'worker0:2222' }
-                         tasks { key: 1 value: 'worker1:2222' }
-                         tasks { key: 2 value: 'worker2:2222' } }
-    """
-    actual_cluster_spec = union_resolver.cluster_spec()
-    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-
-  def testTwoNonOverlappingJobMergedClusterResolver(self):
-    cluster_spec_1 = server_lib.ClusterSpec({
-        "ps": [
-            "ps0:2222",
-            "ps1:2222"
-        ]
-    })
-    cluster_spec_2 = server_lib.ClusterSpec({
-        "worker": [
-            "worker0:2222",
-            "worker1:2222",
-            "worker2:2222"
-        ]
-    })
-    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
-    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
-
-    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
-    cluster_spec = union_cluster.cluster_spec()
-
-    expected_proto = """
-    job { name: 'ps' tasks { key: 0 value: 'ps0:2222' }
-                     tasks { key: 1 value: 'ps1:2222' } }
-    job { name: 'worker' tasks { key: 0 value: 'worker0:2222' }
-                         tasks { key: 1 value: 'worker1:2222' }
-                         tasks { key: 2 value: 'worker2:2222' } }
-    """
-    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
-
-  def testOverlappingJobMergedClusterResolver(self):
-    cluster_spec_1 = server_lib.ClusterSpec({
-        "worker": [
-            "worker4:2222",
-            "worker5:2222"
-        ]
-    })
-    cluster_spec_2 = server_lib.ClusterSpec({
-        "worker": [
-            "worker0:2222",
-            "worker1:2222",
-            "worker2:2222"
-        ]
-    })
-    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
-    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
-
-    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
-    cluster_spec = union_cluster.cluster_spec()
-
-    expected_proto = """
-    job { name: 'worker' tasks { key: 0 value: 'worker4:2222' }
-                         tasks { key: 1 value: 'worker5:2222' }
-                         tasks { key: 2 value: 'worker0:2222' }
-                         tasks { key: 3 value: 'worker1:2222' }
-                         tasks { key: 4 value: 'worker2:2222' } }
-    """
-    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
-
-  def testOverlappingSparseJobMergedClusterResolverThrowError(self):
-    cluster_spec_1 = server_lib.ClusterSpec({
-        "worker": {
-            7: "worker4:2222",
-            9: "worker5:2222"
-        }
-    })
-    cluster_spec_2 = server_lib.ClusterSpec({
-        "worker": {
-            3: "worker0:2222",
-            6: "worker1:2222",
-            7: "worker2:2222"
-        }
-    })
-    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
-    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
-
-    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
-    self.assertRaises(KeyError, union_cluster.cluster_spec)
-
-  def testOverlappingDictAndListThrowError(self):
-    cluster_spec_1 = server_lib.ClusterSpec({
-        "worker": [
-            "worker4:2222",
-            "worker5:2222"
-        ]
-    })
-    cluster_spec_2 = server_lib.ClusterSpec({
-        "worker": {
-            1: "worker0:2222",
-            2: "worker1:2222",
-            3: "worker2:2222"
-        }
-    })
-    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
-    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
-
-    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
-    self.assertRaises(KeyError, union_cluster.cluster_spec)
-
-  def testOverlappingJobNonOverlappingKey(self):
-    cluster_spec_1 = server_lib.ClusterSpec({
-        "worker": {
-            5: "worker4:2222",
-            9: "worker5:2222"
-        }
-    })
-    cluster_spec_2 = server_lib.ClusterSpec({
-        "worker": {
-            3: "worker0:2222",
-            6: "worker1:2222",
-            7: "worker2:2222"
-        }
-    })
-    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
-    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
-
-    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
-    cluster_spec = union_cluster.cluster_spec()
-
-    expected_proto = """
-    job { name: 'worker' tasks { key: 3 value: 'worker0:2222' }
-                         tasks { key: 5 value: 'worker4:2222' }
-                         tasks { key: 6 value: 'worker1:2222' }
-                         tasks { key: 7 value: 'worker2:2222' }
-                         tasks { key: 9 value: 'worker5:2222' }}
-    """
-    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
-
-  def testMixedModeNonOverlappingKey(self):
-    cluster_spec_1 = server_lib.ClusterSpec({
-        "worker": [
-            "worker4:2222",
-            "worker5:2222"
-        ]
-    })
-    cluster_spec_2 = server_lib.ClusterSpec({
-        "worker": {
-            3: "worker0:2222",
-            6: "worker1:2222",
-            7: "worker2:2222"
-        }
-    })
-    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
-    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
-
-    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
-    cluster_spec = union_cluster.cluster_spec()
-
-    expected_proto = """
-    job { name: 'worker' tasks { key: 0 value: 'worker4:2222' }
-                         tasks { key: 1 value: 'worker5:2222' }
-                         tasks { key: 3 value: 'worker0:2222' }
-                         tasks { key: 6 value: 'worker1:2222' }
-                         tasks { key: 7 value: 'worker2:2222' }}
-    """
-    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
-
-  def testRetainSparseJobWithNoMerging(self):
-    base_cluster_spec = server_lib.ClusterSpec({
-        "worker": {
-            1: "worker0:2222",
-            3: "worker1:2222",
-            5: "worker2:2222"
-        }
-    })
-
-    base_cluster_resolver = SimpleClusterResolver(base_cluster_spec)
-    union_cluster = UnionClusterResolver(base_cluster_resolver)
-    cluster_spec = union_cluster.cluster_spec()
-
-    expected_proto = """
-    job { name: 'worker' tasks { key: 1 value: 'worker0:2222' }
-                         tasks { key: 3 value: 'worker1:2222' }
-                         tasks { key: 5 value: 'worker2:2222' } }
-    """
-    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
-
-
-# TODO(saeta): Include tests for master resolution
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
index 3f5824128948453634bc5e5a7d6fdeedae60f5bd..55e61155c683c928efab9bb018868faec3e3df8c 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,128 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementation of Cluster Resolvers for GCE Instance Groups."""
+"""Stub file for GceClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.python.training.server_lib import ClusterSpec
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+# pylint: enable=unused-import
 
-_GOOGLE_API_CLIENT_INSTALLED = True
-try:
-  from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
-  from oauth2client.client import GoogleCredentials  # pylint: disable=g-import-not-at-top
-except ImportError:
-  _GOOGLE_API_CLIENT_INSTALLED = False
+from tensorflow.python.util.all_util import remove_undocumented
 
+_allowed_symbols = [
+    'GceClusterResolver',
+]
 
-class GceClusterResolver(ClusterResolver):
-  """Cluster Resolver for Google Compute Engine.
-
-  This is an implementation of cluster resolvers for the Google Compute Engine
-  instance group platform. By specifying a project, zone, and instance group,
-  this will retrieve the IP address of all the instances within the instance
-  group and return a Cluster Resolver object suitable for use for distributed
-  TensorFlow.
-  """
-
-  def __init__(self,
-               project,
-               zone,
-               instance_group,
-               port,
-               job_name='worker',
-               credentials='default',
-               service=None):
-    """Creates a new GceClusterResolver object.
-
-    This takes in a few parameters and creates a GceClusterResolver project. It
-    will then use these parameters to query the GCE API for the IP addresses of
-    each instance in the instance group.
-
-    Args:
-      project: Name of the GCE project
-      zone: Zone of the GCE instance group
-      instance_group: Name of the GCE instance group
-      port: Port of the listening TensorFlow server (default: 8470)
-      job_name: Name of the TensorFlow job this set of instances belongs to
-      credentials: GCE Credentials. If nothing is specified, this defaults to
-        GoogleCredentials.get_application_default()
-      service: The GCE API object returned by the googleapiclient.discovery
-        function. (Default: discovery.build('compute', 'v1')). If you specify a
-        custom service object, then the credentials parameter will be ignored.
-
-    Raises:
-      ImportError: If the googleapiclient is not installed.
-    """
-    self._project = project
-    self._zone = zone
-    self._instance_group = instance_group
-    self._job_name = job_name
-    self._port = port
-    self._credentials = credentials
-
-    if credentials == 'default':
-      if _GOOGLE_API_CLIENT_INSTALLED:
-        self._credentials = GoogleCredentials.get_application_default()
-
-    if service is None:
-      if not _GOOGLE_API_CLIENT_INSTALLED:
-        raise ImportError('googleapiclient must be installed before using the '
-                          'GCE cluster resolver')
-      self._service = discovery.build(
-          'compute', 'v1',
-          credentials=self._credentials)
-    else:
-      self._service = service
-
-  def cluster_spec(self):
-    """Returns a ClusterSpec object based on the latest instance group info.
-
-    This returns a ClusterSpec object for use based on information from the
-    specified instance group. We will retrieve the information from the GCE APIs
-    every time this method is called.
-
-    Returns:
-      A ClusterSpec containing host information retrieved from GCE.
-    """
-    request_body = {'instanceState': 'RUNNING'}
-    request = self._service.instanceGroups().listInstances(
-        project=self._project,
-        zone=self._zone,
-        instanceGroups=self._instance_group,
-        body=request_body,
-        orderBy='name')
-
-    worker_list = []
-
-    while request is not None:
-      response = request.execute()
-
-      items = response['items']
-      for instance in items:
-        instance_name = instance['instance'].split('/')[-1]
-
-        instance_request = self._service.instances().get(
-            project=self._project,
-            zone=self._zone,
-            instance=instance_name)
-
-        if instance_request is not None:
-          instance_details = instance_request.execute()
-          ip_address = instance_details['networkInterfaces'][0]['networkIP']
-          instance_url = '%s:%s' % (ip_address, self._port)
-          worker_list.append(instance_url)
-
-      request = self._service.instanceGroups().listInstances_next(
-          previous_request=request,
-          previous_response=response)
-
-    worker_list.sort()
-    return ClusterSpec({self._job_name: worker_list})
-
-  def master(self):
-    return ''
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8eaf33629a6299d5da5f8a930e0cad7d07044e8
--- /dev/null
+++ b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py
@@ -0,0 +1,36 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file for KubernetesClusterResolver for backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
+
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
+# pylint: enable=unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'KubernetesClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
+
diff --git a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcd2a846eeb1be7ad4b5a98b067a125afbbebc7d
--- /dev/null
+++ b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py
@@ -0,0 +1,35 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file for SlurmClusterResolver to maintain backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
+
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
+# pylint: enable=unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'SlurmClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..9db7f47dcb49c499719b9002b1d2d6c4837a7bd2
--- /dev/null
+++ b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py
@@ -0,0 +1,36 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file for TFConfigClusterResolver to maintain backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
+
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
+# pylint: enable=unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'TFConfigClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
+
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index f4a8e16c99f464b813a98e981579bd0ff53bd464..3a1eaccd06e574babbe9a3232dacd1d66f3a4648 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,311 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementation of Cluster Resolvers for Cloud TPUs."""
+"""Stub file for TPUClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
 
-from six.moves.urllib.request import Request
-from six.moves.urllib.request import urlopen
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
+# pylint: enable=unused-import
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.python.training import server_lib
-from tensorflow.python.util import compat
+from tensorflow.python.util.all_util import remove_undocumented
 
-_GOOGLE_API_CLIENT_INSTALLED = True
-try:
-  from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
-  from oauth2client.client import GoogleCredentials  # pylint: disable=g-import-not-at-top
-except ImportError:
-  _GOOGLE_API_CLIENT_INSTALLED = False
+_allowed_symbols = [
+    'TPUClusterResolver',
+]
 
-
-_GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
-_ENDPOINTS_SEPARATOR = ','
-_DEFAULT_ENV_VARIABLE = 'TPU_NAME'
-_DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL'
-
-
-class TPUClusterResolver(ClusterResolver):
-  """Cluster Resolver for Google Cloud TPUs.
-
-  This is an implementation of cluster resolvers for the Google Cloud TPU
-  service. As Cloud TPUs are in alpha, you will need to specify a API definition
-  file for this to consume, in addition to a list of Cloud TPUs in your Google
-  Cloud Platform project.
-  """
-
-  def _requestComputeMetadata(self, path):
-    req = Request('http://metadata/computeMetadata/v1/%s' % path,
-                  headers={'Metadata-Flavor': 'Google'})
-    resp = urlopen(req)
-    return compat.as_bytes(resp.read())
-
-  def _shouldResolve(self):
-    if (self._tpu == compat.as_bytes('') or
-        self._tpu == compat.as_bytes('local') or
-        self._tpu.startswith(compat.as_bytes('/bns')) or
-        self._tpu.startswith(compat.as_bytes('localhost:')) or
-        self._tpu.startswith(compat.as_bytes('grpc://'))):
-      return False
-    return True
-
-  @staticmethod
-  def _inGke():
-    """When running in GKE, the environment variable will be set."""
-    return _GKE_ENV_VARIABLE in os.environ
-
-  @staticmethod
-  def _gkeEndpoints():
-    return os.environ[_GKE_ENV_VARIABLE]
-
-  @staticmethod
-  def _envVarFallback():
-    if _DEFAULT_ENV_VARIABLE in os.environ:
-      return os.environ[_DEFAULT_ENV_VARIABLE]
-    return None
-
-  @staticmethod
-  def _discoveryUrl():
-    return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE)
-
-  def __init__(self,
-               tpu=None,
-               zone=None,
-               project=None,
-               job_name='worker',
-               coordinator_name=None,
-               coordinator_address=None,
-               credentials='default',
-               service=None,
-               discovery_url=None):
-    """Creates a new TPUClusterResolver object.
-
-    The ClusterResolver will then use the parameters to query the Cloud TPU APIs
-    for the IP addresses and ports of each Cloud TPU listed.
-
-    Args:
-      tpu: Either a string, or a list of strings corresponding to the TPUs to
-        use. If the single string is the empty string, the string 'local', or a
-        string that begins with 'grpc://' or '/bns', then it is assumed to not
-        correspond with a Cloud TPU and will instead be passed as the session
-        master and no ClusterSpec propagation will be done.
-      zone: Zone where the TPUs are located. If omitted or empty, we will assume
-        that the zone of the TPU is the same as the zone of the GCE VM, which we
-        will try to discover from the GCE metadata service.
-      project: Name of the GCP project containing Cloud TPUs. If omitted or
-        empty, we will try to discover the project name of the GCE VM from the
-        GCE metadata service.
-      job_name: Name of the TensorFlow job the TPUs belong to.
-      coordinator_name: The name to use for the coordinator. Set to None if the
-        coordinator should not be included in the computed ClusterSpec.
-      coordinator_address: The address of the coordinator (typically an ip:port
-        pair). If set to None, a TF server will be started. If coordinator_name
-        is None, a TF server will not be started even if coordinator_address is
-        None.
-      credentials: GCE Credentials. If None, then we use default credentials
-        from the oauth2client
-      service: The GCE API object returned by the googleapiclient.discovery
-        function. If you specify a custom service object, then the credentials
-        parameter will be ignored.
-      discovery_url: A URL template that points to the location of
-        the discovery service. It should have two parameters {api} and
-        {apiVersion} that when filled in produce an absolute URL to the
-        discovery document for that service. The environment variable
-        'TPU_API_DISCOVERY_URL' will override this.
-
-    Raises:
-      ImportError: If the googleapiclient is not installed.
-      ValueError: If no TPUs are specified.
-    """
-    if isinstance(tpu, list):
-      if not tpu:
-        raise ValueError('At least one TPU must be specified.')
-      if len(tpu) != 1:
-        raise NotImplementedError(
-            'Using multiple TPUs in a single session is not yet implemented')
-      tpu = tpu[0]
-
-    in_gke = self._inGke()
-    # When using GKE with Cloud TPUs, the env variable will be set.
-    if tpu is None:
-      if in_gke:
-        tpu = self._gkeEndpoints()
-      else:
-        tpu = self._envVarFallback()
-
-    if tpu is None:
-      raise ValueError('Please provide a TPU Name to connect to.')
-
-    self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
-    self._job_name = job_name
-    self._credentials = credentials
-
-    should_resolve = self._shouldResolve()
-
-    if not project and should_resolve:
-      project = compat.as_str(
-          self._requestComputeMetadata('project/project-id'))
-
-    if not zone and should_resolve:
-      zone_path = compat.as_str(self._requestComputeMetadata('instance/zone'))
-      zone = zone_path.split('/')[-1]
-
-    self._project = project
-    self._zone = zone
-
-    if credentials == 'default' and should_resolve:
-      if _GOOGLE_API_CLIENT_INSTALLED:
-        self._credentials = GoogleCredentials.get_application_default()
-
-    if service is None and should_resolve:
-      if not _GOOGLE_API_CLIENT_INSTALLED:
-        raise ImportError('googleapiclient and oauth2client must be installed '
-                          'before using the TPU cluster resolver. Execute: '
-                          '`pip install --upgrade google-api-python-client` '
-                          'and `pip install --upgrade oauth2client` to '
-                          'install with pip.')
-
-      final_discovery_url = self._discoveryUrl() or discovery_url
-      if final_discovery_url:
-        self._service = discovery.build(
-            'tpu', 'v1alpha1',
-            credentials=self._credentials,
-            discoveryServiceUrl=final_discovery_url)
-      else:
-        self._service = discovery.build(
-            'tpu', 'v1alpha1',
-            credentials=self._credentials)
-    else:
-      self._service = service
-
-    self._coordinator_name = coordinator_name
-    if coordinator_name and not coordinator_address and (should_resolve or
-                                                         in_gke):
-      self._start_local_server()
-    else:
-      self._coordinator_address = coordinator_address
-
-  def master(self):
-    """Get the Master string to be used for the session.
-
-    In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of
-    first instance in the ClusterSpec returned by the cluster_spec function.
-
-    If a non-TPU name is used when constructing a TPUClusterResolver, that will
-    be returned instead (e.g. If the tpus argument's value when constructing
-    this TPUClusterResolver was 'grpc://10.240.1.2:8470',
-    'grpc://10.240.1.2:8470' will be returned).
-
-    Returns:
-      string, the connection string to use when creating a session.
-
-    Raises:
-      ValueError: If none of the TPUs specified exists.
-    """
-    if not self._shouldResolve():
-      return self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
-
-    job_tasks = self.cluster_spec().job_tasks(self._job_name)
-    if not job_tasks:
-      raise ValueError('No TPUs exists with the specified names exist.')
-
-    return 'grpc://' + job_tasks[0]
-
-  def get_master(self):
-    return self.master()
-
-  def get_job_name(self):
-    if self._shouldResolve():
-      return self._job_name
-
-  def cluster_spec(self):
-    """Returns a ClusterSpec object based on the latest TPU information.
-
-    We retrieve the information from the GCE APIs every time this method is
-    called.
-
-    Returns:
-      A ClusterSpec containing host information returned from Cloud TPUs.
-
-    Raises:
-      RuntimeError: If the provided TPU is not healthy.
-    """
-    ############################################################################
-    # There are 5 potential cases this code must handle:
-    #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
-    #      a. Create a ClusterSpec that includes the coordinator job
-    #      b. Create a ClusterSpec without the coordinator job.
-    #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
-    #     tasks and
-    #      a. Create a ClusterSpec with the coordinator
-    #      b. Create a ClusterSpec without the coordinator
-    #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
-    ############################################################################
-
-    if self._shouldResolve():
-      # Case 1.
-      full_name = 'projects/%s/locations/%s/nodes/%s' % (
-          self._project, self._zone, compat.as_text(self._tpu))
-      request = self._service.projects().locations().nodes().get(name=full_name)
-      response = request.execute()
-
-      if 'state' in response and response['state'] != 'READY':
-        raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' %
-                           (compat.as_text(self._tpu), response['state']))
-
-      if 'health' in response and response['health'] != 'HEALTHY':
-        raise RuntimeError('TPU "%s" is unhealthy: "%s"' %
-                           (compat.as_text(self._tpu), response['health']))
-
-      if 'networkEndpoints' in response:
-        worker_list = [
-            '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
-            for endpoint in response['networkEndpoints']
-        ]
-      else:
-        # Fall back to the deprecated response format
-        instance_url = '%s:%s' % (response['ipAddress'], response['port'])
-        worker_list = [instance_url]
-
-      cluster_spec = {self._job_name: worker_list}
-    else:
-      if not self._tpu.startswith(compat.as_bytes('grpc://')):
-        # Case 3.
-        return None
-      # Case 2.
-      cluster_spec = {
-          self._job_name: [
-              x[len(compat.as_bytes('grpc://')):]
-              for x in self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))
-          ]
-      }
-
-    if self._coordinator_address:
-      # {1, 2}.a
-      cluster_spec[self._coordinator_name] = [self._coordinator_address]
-
-    return server_lib.ClusterSpec(cluster_spec)
-
-  def _start_local_server(self):
-    address = self._requestComputeMetadata('instance/network-interfaces/0/ip')
-    self._server = server_lib.Server(
-        {
-            'local': ['0.0.0.0:0']
-        }, protocol='grpc', config=None, start=True)
-    # self._server.target is of the form: grpc://ipaddress:port
-    target = compat.as_bytes(self._server.target)
-    splits = target.split(compat.as_bytes(':'))
-    assert len(splits) == 3, self._server.target
-    assert splits[0] == compat.as_bytes('grpc'), self._server.target
-    self._coordinator_port = compat.as_text(splits[2])
-    self._coordinator_address = '%s:%s' % (
-        address, compat.as_text(self._coordinator_port))
-
-  def __deepcopy__(self, memo):
-    # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy.
-    return self
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
deleted file mode 100644
index ad4f6432630be44a7de6e778f55f1fb7fd66f307..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ /dev/null
@@ -1,468 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for TPUClusterResolver."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
-from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
-from tensorflow.python.util import compat
-
-mock = test.mock
-
-
-class MockRequestClass(object):
-
-  def __init__(self, name, tpu_map):
-    self._name = name
-    self._tpu_map = tpu_map
-
-  def execute(self):
-    if self._name in self._tpu_map:
-      return self._tpu_map[self._name]
-    else:
-      raise KeyError('Resource %s was not found' % self._name)
-
-
-class MockNodeClass(object):
-
-  def __init__(self, tpu_map):
-    self._tpu_map = tpu_map
-
-  def get(self, name):
-    return MockRequestClass(name, self._tpu_map)
-
-
-def mock_request_compute_metadata(cls, *args, **kwargs):
-  del cls, kwargs  # Unused.
-  if args[0] == 'project/project-id':
-    return 'test-project'
-  elif args[0] == 'instance/zone':
-    return 'projects/test-project/locations/us-central1-c'
-  elif args[0] == 'instance/network-interfaces/0/ip':
-    return '10.128.1.2'
-  return ''
-
-
-class TPUClusterResolverTest(test.TestCase):
-
-  def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
-    """Verifies that the ClusterSpec generates the correct proto.
-
-    We are testing this four different ways to ensure that the ClusterSpec
-    returned by the TPUClusterResolver behaves identically to a normal
-    ClusterSpec when passed into the generic ClusterSpec libraries.
-
-    Args:
-      cluster_spec: ClusterSpec returned by the TPUClusterResolver
-      expected_proto: Expected protobuf
-    """
-    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
-    self.assertProtoEquals(
-        expected_proto,
-        server_lib.ClusterSpec(cluster_spec).as_cluster_def())
-    self.assertProtoEquals(expected_proto,
-                           server_lib.ClusterSpec(
-                               cluster_spec.as_cluster_def()).as_cluster_def())
-    self.assertProtoEquals(expected_proto,
-                           server_lib.ClusterSpec(
-                               cluster_spec.as_dict()).as_cluster_def())
-
-  def mock_service_client(self, tpu_map=None):
-
-    if tpu_map is None:
-      tpu_map = {}
-
-    mock_locations = mock.MagicMock()
-    mock_locations.nodes.return_value = MockNodeClass(tpu_map)
-
-    mock_project = mock.MagicMock()
-    mock_project.locations.return_value = mock_locations
-
-    mock_client = mock.MagicMock()
-    mock_client.projects.return_value = mock_project
-
-    return mock_client
-
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
-                     mock_request_compute_metadata)
-  def testRetrieveProjectAndZoneFromMetadata(self):
-    tpu_map = {
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'ipAddress': '10.1.2.3',
-            'port': '8470',
-            'health': 'HEALTHY'
-        }
-    }
-
-    tpu_cluster_resolver = TPUClusterResolver(
-        project=None,
-        zone=None,
-        tpu=['test-tpu-1'],
-        credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map),
-        coordinator_name='coordinator')
-
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
-    expected_proto = """
-    job {
-      name: 'coordinator'
-      tasks { key: 0 value: '10.128.1.2:%s' }
-    }
-    job {
-      name: 'worker'
-      tasks { key: 0 value: '10.1.2.3:8470' }
-    }
-    """ % tpu_cluster_resolver._coordinator_port
-    self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
-
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
-                     mock_request_compute_metadata)
-  def testRetrieveProjectAndZoneFromMetadataNoCoordinator(self):
-    tpu_map = {
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'ipAddress': '10.1.2.3',
-            'port': '8470',
-            'health': 'HEALTHY'
-        }
-    }
-
-    tpu_cluster_resolver = TPUClusterResolver(
-        project=None,
-        zone=None,
-        tpu=['test-tpu-1'],
-        coordinator_name=None,
-        credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
-
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
-    expected_proto = """
-    job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
-    """
-    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
-                     mock_request_compute_metadata)
-  def testUnhealthyCloudTpu(self):
-    tpu_map = {
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'ipAddress': '10.1.2.3',
-            'port': '8470',
-            'health': 'UNHEALTHY'
-        }
-    }
-
-    tpu_cluster_resolver = TPUClusterResolver(
-        project=None,
-        zone=None,
-        tpu='test-tpu-1',
-        coordinator_name=None,
-        credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
-
-    with self.assertRaises(RuntimeError):
-      tpu_cluster_resolver.cluster_spec()
-
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
-                     mock_request_compute_metadata)
-  def testNotReadyCloudTpu(self):
-    tpu_map = {
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'ipAddress': '10.1.2.3',
-            'port': '8470',
-            'state': 'CREATING'
-        }
-    }
-
-    tpu_cluster_resolver = TPUClusterResolver(
-        project=None,
-        zone=None,
-        tpu='test-tpu-1',
-        coordinator_name=None,
-        credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
-
-    with self.assertRaises(RuntimeError):
-      tpu_cluster_resolver.cluster_spec()
-
-  def testSimpleSuccessfulRetrieval(self):
-    tpu_map = {
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'ipAddress': '10.1.2.3',
-            'port': '8470',
-            'health': 'HEALTHY'
-        }
-    }
-
-    tpu_cluster_resolver = TPUClusterResolver(
-        project='test-project',
-        zone='us-central1-c',
-        tpu=['test-tpu-1'],
-        coordinator_name='coordinator',
-        coordinator_address='10.128.1.5:10203',
-        credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
-
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
-    expected_proto = """
-    job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
-    job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
-    """
-    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-
-  def testNewNetworkEndpointFormat(self):
-    tpu_map = {
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'health': 'HEALTHY',
-            'networkEndpoints': [{
-                'ipAddress': '10.2.3.4',
-                'port': 8470,
-            }]
-        }
-    }
-
-    tpu_cluster_resolver = TPUClusterResolver(
-        project='test-project',
-        zone='us-central1-c',
-        tpu='test-tpu-1',
-        coordinator_name='coordinator',
-        coordinator_address='10.128.1.5:10203',
-        credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
-
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
-    expected_proto = """
-    job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
-    job { name: 'worker' tasks { key: 0 value: '10.2.3.4:8470' } }
-    """
-    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-    self.assertEqual('grpc://10.2.3.4:8470', tpu_cluster_resolver.master())
-
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
-                     mock_request_compute_metadata)
-  def testPodResolution(self):
-    tpu_map = {
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'health':
-                'HEALTHY',
-            'networkEndpoints': [
-                {
-                    'ipAddress': '10.2.3.4',
-                    'port': 8470,
-                },
-                {
-                    'ipAddress': '10.2.3.5',
-                    'port': 8470,
-                },
-                {
-                    'ipAddress': '10.2.3.6',
-                    'port': 8470,
-                },
-                {
-                    'ipAddress': '10.2.3.7',
-                    'port': 8470,
-                },
-            ]
-        }
-    }
-
-    tpu_cluster_resolver = TPUClusterResolver(
-        tpu='test-tpu-1',
-        credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map),
-        coordinator_name='coordinator')
-
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
-    expected_proto = """
-    job {
-      name: 'coordinator',
-      tasks { key: 0 value: '10.128.1.2:%s'}
-    }
-    job {
-      name: 'worker'
-      tasks { key: 0 value: '10.2.3.4:8470' }
-      tasks { key: 1 value: '10.2.3.5:8470' }
-      tasks { key: 2 value: '10.2.3.6:8470' }
-      tasks { key: 3 value: '10.2.3.7:8470' }
-    }
-    """ % tpu_cluster_resolver._coordinator_port
-    self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
-
-  def testPodResolutionNoCoordinator(self):
-    tpu_map = {
-        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-            'health':
-                'HEALTHY',
-            'networkEndpoints': [
-                {
-                    'ipAddress': '10.2.3.4',
-                    'port': 8470,
-                },
-                {
-                    'ipAddress': '10.2.3.5',
-                    'port': 8470,
-                },
-                {
-                    'ipAddress': '10.2.3.6',
-                    'port': 8470,
-                },
-                {
-                    'ipAddress': '10.2.3.7',
-                    'port': 8470,
-                },
-            ]
-        }
-    }
-
-    tpu_cluster_resolver = TPUClusterResolver(
-        project='test-project',
-        zone='us-central1-c',
-        tpu='test-tpu-1',
-        coordinator_name=None,
-        credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
-
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
-    expected_proto = """
-    job {
-      name: 'worker'
-      tasks { key: 0 value: '10.2.3.4:8470' }
-      tasks { key: 1 value: '10.2.3.5:8470' }
-      tasks { key: 2 value: '10.2.3.6:8470' }
-      tasks { key: 3 value: '10.2.3.7:8470' }
-    }
-    """
-    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-
-  def testGetMasterNoEntries(self):
-    tpu_map = {}
-
-    with self.assertRaises(ValueError):
-      TPUClusterResolver(
-          project='test-project',
-          zone='us-central1-c',
-          tpu=[],
-          coordinator_name=None,
-          credentials=None,
-          service=self.mock_service_client(tpu_map=tpu_map))
-
-  # TODO(saeta): Convert to parameterized test when included in OSS TF.
-  def verifyShouldResolve(self, tpu, should_resolve):
-    tpu_cluster_resolver = TPUClusterResolver(
-        project='test-project',
-        zone='us-central1-c',
-        tpu=tpu,
-        coordinator_name=None,
-        credentials=None,
-        service=self.mock_service_client(tpu_map={}))
-    self.assertEqual(should_resolve, tpu_cluster_resolver._shouldResolve(),
-                     "TPU: '%s'" % tpu)
-
-  def testShouldResolveNoName(self):
-    self.verifyShouldResolve('', False)
-
-  def testShouldResolveLocal(self):
-    self.verifyShouldResolve('local', False)
-
-  def testShouldResolveGrpc(self):
-    self.verifyShouldResolve('grpc://10.1.2.3:8470', False)
-
-  def testShouldResolveBns(self):
-    self.verifyShouldResolve('/bns/foo/bar', False)
-
-  def testShouldResolveName(self):
-    self.verifyShouldResolve('mytpu', True)
-
-  def testShouldResolveList(self):
-    self.verifyShouldResolve(['myothertpu'], True)
-
-  def testShouldResolveGrpcPrefix(self):
-    self.verifyShouldResolve('grpctpu', True)
-
-  def testNoCallComputeMetadata(self):
-    tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar')
-    self.assertEqual(
-        compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
-    self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
-
-  def testGkeEnvironmentForDonut(self):
-    os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
-
-    self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(TPUClusterResolver._inGke())
-    self.assertEqual(
-        compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
-
-    tpu_cluster_resolver = TPUClusterResolver()
-    self.assertEqual(
-        compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver.master()))
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
-    expected_proto = """
-    job {
-      name: 'worker'
-      tasks { key: 0 value: '10.120.27.5:8470' }
-    }
-    """
-    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-
-    del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
-
-  def testGkeEnvironmentForPod(self):
-    os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = ('grpc://10.120.27.5:8470,'
-                                                     'grpc://10.120.27.6:8470,'
-                                                     'grpc://10.120.27.7:8470,'
-                                                     'grpc://10.120.27.8:8470')
-
-    self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(TPUClusterResolver._inGke())
-    self.assertEqual(
-        compat.as_bytes('grpc://10.120.27.5:8470,'
-                        'grpc://10.120.27.6:8470,'
-                        'grpc://10.120.27.7:8470,'
-                        'grpc://10.120.27.8:8470'),
-        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
-
-    tpu_cluster_resolver = TPUClusterResolver()
-    self.assertEqual(
-        compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver.master()))
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
-    expected_proto = """
-    job {
-      name: 'worker'
-      tasks { key: 0 value: '10.120.27.5:8470' }
-      tasks { key: 1 value: '10.120.27.6:8470' }
-      tasks { key: 2 value: '10.120.27.7:8470' }
-      tasks { key: 3 value: '10.120.27.8:8470' }
-    }
-    """
-    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-
-    del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
-
-  def testDiscoveryUrl(self):
-    os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
-    self.assertEqual('https://{api}.internal/{apiVersion}',
-                     TPUClusterResolver._discoveryUrl())
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index f675c135f4fc362ea620ea5b04d6b7fd536fceaf..2ad9ae42a16f690d38b8e2652e853012ec1dd267 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -1,8 +1,18 @@
 # Minimum CMake required
 cmake_minimum_required(VERSION 3.5)
 
+if(WIN32)
+	if(${CMAKE_VERSION} VERSION_LESS "3.8")
+		message(WARNING "Your current cmake version is ${CMAKE_VERSION} which does not support setting the toolset architecture to x64. This may cause \"compiler out of heap space\" errors when building. Consider upgrading your cmake to > 3.8 and using the flag -Thost=x64 when running cmake. Ignore this if you are on CMake GUI.")
+	else()
+		if(NOT CMAKE_VS_PLATFORM_TOOLSET_HOST_ARCHITECTURE OR NOT "${CMAKE_VS_PLATFORM_TOOLSET_HOST_ARCHITECTURE}" STREQUAL "x64")
+			message(WARNING "Your current cmake generator is set to use 32 bit toolset architecture. This may cause \"compiler out of heap space\" errors when building. Consider using the flag -Thost=x64 when running cmake. Ignore this if you are on CMake GUI.")
+		endif()
+	endif()
+endif()
+
 # Project
-project(tensorflow C CXX)
+project(tensorflow VERSION 1.12.0 LANGUAGES C CXX)
 
 # Set C++14 as standard for the whole project
 set(CMAKE_CXX_STANDARD 14)
@@ -42,15 +52,19 @@ option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for th
 option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
 option(tensorflow_DISABLE_EIGEN_FORCEINLINE "Disable forceinline, to speed up build on windows." OFF)
 
+if (WIN32)
+SET(tensorflow_WIN_CPU_SIMD_OPTIONS "/arch:AVX" CACHE STRING "Enables CPU SIMD instructions")
+SET_PROPERTY(CACHE tensorflow_WIN_CPU_SIMD_OPTIONS PROPERTY STRINGS /arch:AVX) 
+endif()
+
 # SIMD, MKL and MKLDNN options
 option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions" OFF)
 option(tensorflow_ENABLE_MKL_SUPPORT "Enable Intel MKL support" OFF)
 option(tensorflow_ENABLE_MKLDNN_SUPPORT "Enable Intel MKLDNN support, requires MKL enabled" OFF)
 
+
 # GPU, CUDA and cuDNN options
 option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
-set(tensorflow_CUDA_VERSION "9.0" CACHE STRING "CUDA version to build against")
-set(tensorflow_CUDNN_VERSION "7" CACHE STRING "cuDNN version to build against")
 
 if(HAIKU)
 	option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF)
@@ -62,25 +76,30 @@ endif()
 if (NOT WIN32)
   # Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
   # for targets that link ${CMAKE_THREAD_LIBS_INIT}.
-  find_package (Threads)
+  find_package (Threads REQUIRED)
 
   # Options for linking CUDA/CUDNN libraries
-  option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
+  option(tensorflow_PATH_CUDA_LIB "Additional library search path for cudnn, nccl, culibos" /usr/local/cuda/lib64/)
   option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
   if (NOT tensorflow_CUDNN_INCLUDE)
     # option's default value is OFF. Fill it with real default values
     set(tensorflow_CUDNN_INCLUDE /usr/include)
   endif (NOT tensorflow_CUDNN_INCLUDE)
-  option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB})
-  if (NOT tensorflow_PATH_CUDNN_STATIC_LIB)
+  option(tensorflow_NCCL_INCLUDE "nccl.h header install path" /usr/include/)
+  if (NOT tensorflow_NCCL_INCLUDE)
+    # option's default value is OFF. Fill it with real default values
+    set(tensorflow_NCCL_INCLUDE /usr/include)
+  endif (NOT tensorflow_NCCL_INCLUDE)
+  option(tensorflow_PATH_CUDNN_LIB "Override PATH_CUDA_LIB for cudnn" ${tensorflow_PATH_CUDA_LIB})
+  if (NOT tensorflow_PATH_CUDNN_LIB)
     # option's default value is OFF. Fill it with real default values
-    set (tensorflow_PATH_CUDNN_STATIC_LIB ${tensorflow_PATH_STATIC_LIB})
-  endif (NOT tensorflow_PATH_CUDNN_STATIC_LIB)
-  option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB})
-  if (NOT tensorflow_PATH_NCCL_STATIC_LIB)
+    set (tensorflow_PATH_CUDNN_LIB ${tensorflow_PATH_CUDA_LIB})
+  endif (NOT tensorflow_PATH_CUDNN_LIB)
+  option(tensorflow_PATH_NCCL_LIB "Override PATH_CUDA_LIB for nccl" ${tensorflow_PATH_CUDA_LIB})
+  if (NOT tensorflow_PATH_NCCL_LIB)
     # option's default value is OFF. Fill it with real default values
-    set (tensorflow_PATH_NCCL_STATIC_LIB ${tensorflow_PATH_STATIC_LIB})
-  endif (NOT tensorflow_PATH_NCCL_STATIC_LIB)
+    set (tensorflow_PATH_NCCL_LIB ${tensorflow_PATH_CUDA_LIB})
+  endif (NOT tensorflow_PATH_NCCL_LIB)
   option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64)
   if (NOT tensorflow_CUDA_LIBRARY_PATH)
     # option's default value is OFF. Fill it with real default values
@@ -89,10 +108,12 @@ if (NOT WIN32)
 
   # Options for linking other libraries
   option(systemlib_ZLIB "Use the system installed library as shared objects instead of downloading ZLIB and statically linking to it: ZLIB" OFF)
+  option(systemlib_ABSEIL_CPP "Use the system installed library as shared objects instead of downloading ABSEIL_CPP and statically linking to it: ABSEIL_CPP" OFF)
 
   option(systemlib_ALL "Turn on every possible systemlib_* options" OFF)
   if (systemlib_ALL)
     set (systemlib_ZLIB ON)
+    set (systemlib_ABSEIL_CPP ON)
   endif (systemlib_ALL)
 endif()
 
@@ -114,7 +135,7 @@ function(SHOW_VARIABLES)
 endfunction()
 
 # External dependencies
-set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external)
+set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external ${PROJECT_SOURCE_DIR}/modules)
 
 # Location where external projects will be downloaded
 set (DOWNLOAD_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/downloads"
@@ -183,6 +204,7 @@ if(WIN32)
   set(CMAKE_SUPPRESS_REGENERATION ON)
 endif()
 
+
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -std=c++11")
 endif()
@@ -198,14 +220,17 @@ endif()
 include(CheckCXXCompilerFlag)
 
 # OpenMP Support
-CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT)
-if (GCC_OPENMP_SUPPORT)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
-endif()
-CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT)
-if (MSVC_OPENMP_SUPPORT)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
-endif()
+if (WIN32)
+  CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT)
+  if (MSVC_OPENMP_SUPPORT)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp")
+  endif()
+else (WIN32)
+  CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT)
+  if (GCC_OPENMP_SUPPORT)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
+  endif()
+endif (WIN32)
 
 # MSVC SIMD instructions
 if (tensorflow_WIN_CPU_SIMD_OPTIONS)
@@ -235,6 +260,7 @@ include(re2)
 include(cub)
 include(sqlite)
 include(double_conversion)
+include(abseil_cpp)
 if (tensorflow_BUILD_CC_TESTS)
   include(googletest)
 endif()
@@ -243,6 +269,7 @@ add_definitions(${ADD_CFLAGS})
 link_directories(${ADD_LINK_DIRECTORY})
 
 set(tensorflow_EXTERNAL_LIBRARIES
+    ${tensorflow_EXTERNAL_LIBRARIES}
     ${gif_STATIC_LIBRARIES}
     ${png_STATIC_LIBRARIES}
     ${jpeg_STATIC_LIBRARIES}
@@ -266,6 +293,14 @@ else (systemlib_ZLIB)
     ${zlib_STATIC_LIBRARIES})
 endif (systemlib_ZLIB)
 
+if (systemlib_ABSEIL_CPP)
+  set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
+      ${abseil_cpp_LIBRARIES})
+else (systemlib_ABSEIL_CPP)
+  set(tensorflow_EXTERNAL_LIBRARIES ${tensorflow_EXTERNAL_LIBRARIES}
+    ${abseil_cpp_STATIC_LIBRARIES})
+endif (systemlib_ABSEIL_CPP)
+
 set(tensorflow_EXTERNAL_DEPENDENCIES
     zlib_copy_headers_to_destination
     gif_copy_headers_to_destination
@@ -352,9 +387,7 @@ if (tensorflow_ENABLE_MKL_SUPPORT)
     list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES})
     list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn_copy_shared_to_destination)
     include_directories(${mkldnn_INCLUDE_DIRS})
-  else (tensorflow_ENABLE_MKLDNN_SUPPORT)
-    add_definitions(-DINTEL_MKL_ML_ONLY)
-  endif()
+  endif(tensorflow_ENABLE_MKLDNN_SUPPORT)
 endif (tensorflow_ENABLE_MKL_SUPPORT)
 
 if (tensorflow_ENABLE_GPU)
@@ -365,32 +398,23 @@ if (tensorflow_ENABLE_GPU)
     list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
   endif (NOT WIN32)
 
-  # later command will make use of the value in tensorflow_CUDA_VERSION
-  find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED EXACT)
-
-  # Test compatibility of compiler on CUDA
-  try_compile(CUDA_TEST_COMPILE_C
-    ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda
-    ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.c
-    CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS})
-  try_compile(CUDA_TEST_COMPILE_CXX
-    ${CMAKE_CURRENT_BINARY_DIR}/tests/cuda
-    ${CMAKE_CURRENT_SOURCE_DIR}/tests/cuda/compatibility_test.cc
-    CMAKE_FLAGS -DINCLUDE_DIRECTORIES=${CUDA_INCLUDE_DIRS})
-  if(NOT (CUDA_TEST_COMPILE_C AND CUDA_TEST_COMPILE_CXX))
-    message(FATAL_ERROR "Selected compiler (or version) is not supported for CUDA")
+  # minimum 9.0 in cuda version
+  find_package(CUDA 9.0 REQUIRED)
+  if(NOT CUDA_FOUND)
+    message(FATAL_ERROR "CUDA not found.")
   endif()
 
-  # by default we assume compute cabability 3.5 and 5.2. If you change this change it in
-  # CUDA_NVCC_FLAGS and cuda_config.h below
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_37,code=\"sm_37,compute_37\")
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_52,code=\"sm_52,compute_52\")
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_60,code=\"sm_60,compute_60\")
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_61,code=\"sm_61,compute_61\")
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_70,code=\"sm_70,compute_70\")
+  # use cmake internal CUDA_ARCH_NAME switch
+  # e.g. CUDA_ARCH_NAME="Auto" will autodetect
+  #      CUDA_ARCH_NAME="All"  will use all arches
+  cuda_select_nvcc_arch_flags(NVCC_ARCH_FLAGS ${CUDA_ARCH_NAME})
+  list(APPEND CUDA_NVCC_FLAGS ${NVCC_ARCH_FLAGS})
+  message(STATUS "Using CUDA arch flags: ${NVCC_ARCH_FLAGS_readable}")
+
   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true)  # Flush denormals to zero
   set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
+
   include_directories(${CUDA_INCLUDE})
   if (WIN32)
     add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.7,5.2,6.0,6.1,7.0)
@@ -411,43 +435,94 @@ if (tensorflow_ENABLE_GPU)
   else (WIN32)
     set(CUDNN_INCLUDE "${tensorflow_CUDNN_INCLUDE}")
 
-    find_library(nccl_STATIC_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
-    if (NOT nccl_STATIC_LIBRARY)
+    if (tensorflow_BUILD_SHARED_LIB)
+      find_library(nccl_LIBRARY NAMES libnccl.so PATHS ${tensorflow_PATH_NCCL_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    else (tensorflow_BUILD_SHARED_LIB)
+      find_library(nccl_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    endif (tensorflow_BUILD_SHARED_LIB)
+    if (NOT nccl_LIBRARY)
       message(FATAL_ERROR "NCCL is required for GPU-build")
-    else (NOT nccl_STATIC_LIBRARY)
-      message("nccl-static: ${nccl_STATIC_LIBRARY}")
+    else (NOT nccl_LIBRARY)
+      message("nccl: ${nccl_LIBRARY}")
       # something like /usr/lib64/libnccl_static.a
-    endif (NOT nccl_STATIC_LIBRARY)
-
-    find_library(cudnn_STATIC_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
-    if (NOT cudnn_STATIC_LIBRARY)
+    endif (NOT nccl_LIBRARY)
+
+    if (tensorflow_BUILD_SHARED_LIB)
+      find_library(cudnn_LIBRARY NAMES libcudnn.so PATHS ${tensorflow_PATH_CUDNN_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    else (tensorflow_BUILD_SHARED_LIB)
+      find_library(cudnn_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    endif (tensorflow_BUILD_SHARED_LIB)
+    if (NOT cudnn_LIBRARY)
       message(FATAL_ERROR "CUDNN is required for GPU-build")
-    else (NOT cudnn_STATIC_LIBRARY)
-      message("cudnn-static: ${cudnn_STATIC_LIBRARY}")
-    endif (NOT cudnn_STATIC_LIBRARY)
-
-    find_library(culibos_STATIC_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
-    if (NOT culibos_STATIC_LIBRARY)
+    else (NOT cudnn_LIBRARY)
+      file(READ ${CUDNN_INCLUDE}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
+      # fetch cudnn version
+      string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
+             CUDNN_VERSION_MAJOR "${CUDNN_VERSION_FILE_CONTENTS}")
+      string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
+             CUDNN_VERSION_MAJOR "${CUDNN_VERSION_MAJOR}")
+      string(REGEX MATCH "define CUDNN_MINOR * +([0-9]+)"
+             CUDNN_VERSION_MINOR "${CUDNN_VERSION_FILE_CONTENTS}")
+      string(REGEX REPLACE "define CUDNN_MINOR * +([0-9]+)" "\\1"
+             CUDNN_VERSION_MINOR "${CUDNN_VERSION_MINOR}")
+      string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
+             CUDNN_VERSION_PATCH "${CUDNN_VERSION_FILE_CONTENTS}")
+      string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
+             CUDNN_VERSION_PATCH "${CUDNN_VERSION_PATCH}")
+      if(NOT CUDNN_VERSION_MAJOR)
+        set(CUDNN_VERSION "???")
+      else()
+        set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}")
+      endif()
+      message(STATUS "cudnn library: ${cudnn_LIBRARY} (found version: \"${CUDNN_VERSION}\")")
+    endif (NOT cudnn_LIBRARY)
+
+    if (tensorflow_BUILD_SHARED_LIB)
+      # shared first (if exists) else static one
+      find_library(culibos_LIBRARY NAMES libculibos.so libculibos.a PATHS ${tensorflow_PATH_CUDA_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    else (tensorflow_BUILD_SHARED_LIB)
+      # only static version
+      find_library(culibos_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_CUDA_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    endif (tensorflow_BUILD_SHARED_LIB)
+    if (NOT culibos_LIBRARY)
       message(FATAL_ERROR "CULIBOS is required for GPU-build")
-    else (NOT culibos_STATIC_LIBRARY)
-      message("culibos-static: ${culibos_STATIC_LIBRARY}")
-    endif (NOT culibos_STATIC_LIBRARY)
+    else (NOT culibos_LIBRARY)
+      message("culibos: ${culibos_LIBRARY}")
+    endif (NOT culibos_LIBRARY)
 
     set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
-      ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY})
+      ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_LIBRARY} ${culibos_LIBRARY} ${nccl_LIBRARY})
   endif (WIN32)
   include_directories(${CUDNN_INCLUDE})
 
   # Remove "." from CUDA version variable.
-  string(REPLACE "." "" short_CUDA_VER ${tensorflow_CUDA_VERSION})
+  string(REPLACE "." "" short_CUDA_VER ${CUDA_VERSION})
+
+  # List of enumerated CUDA caps
+  string(REPLACE " " ";" NVCC_ARCH_LIST "${NVCC_ARCH_FLAGS_readable}")
+  set(list ${NVCC_ARCH_LIST})
+
+  # Construct capability string
+  foreach(NVCC_ARCH ${NVCC_ARCH_LIST})
+    if (NVCC_ARCH MATCHES "sm_")
+      string(REGEX REPLACE "^.sm*" "" NVCC_ARCH ${NVCC_ARCH})
+      math(EXPR NVCC_ARCH_MAJOR "${NVCC_ARCH} / 10")
+      math(EXPR NVCC_ARCH_MINOR "(${NVCC_ARCH} - (${NVCC_ARCH_MAJOR}*10))")
+      if (TF_CUDA_CAP)
+        set(TF_CUDA_CAP "${TF_CUDA_CAP},CudaVersion(\"${NVCC_ARCH_MAJOR}.${NVCC_ARCH_MINOR}\")")
+      else (TF_CUDA_CAP)
+        set(TF_CUDA_CAP "CudaVersion(\"${NVCC_ARCH_MAJOR}.${NVCC_ARCH_MINOR}\")")
+      endif (TF_CUDA_CAP)
+    endif()
+  endforeach()
 
   # create cuda_config.h
   FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
     "#ifndef CUDA_CUDA_CONFIG_H_\n"
     "#define CUDA_CUDA_CONFIG_H_\n"
-    "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.7\"),CudaVersion(\"5.2\"),CudaVersion(\"6.0\"),CudaVersion(\"6.1\"),CudaVersion(\"7.0\")\n"
+    "#define TF_CUDA_CAPABILITIES ${TF_CUDA_CAP}\n"
     "#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n"
-    "#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n"
+    "#define TF_CUDNN_VERSION \"64_${CUDNN_VERSION}\"\n"
     "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
     "#endif  // CUDA_CUDA_CONFIG_H_\n"
   )
@@ -482,24 +557,30 @@ if (tensorflow_ENABLE_GPU)
     set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
       msvcp_dll_name=msvcp140.dll
       cudart_dll_name=cudart64_${short_CUDA_VER}.dll
-      cuda_version_number=${tensorflow_CUDA_VERSION}
+      cuda_version_number=${CUDA_VERSION}
       nvcuda_dll_name=nvcuda.dll
       cudnn_dll_name=cudnn64_${tensorflow_CUDNN_VERSION}.dll
       cudnn_version_number=${tensorflow_CUDNN_VERSION})
   else(WIN32)
     set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
-	    cuda_version_number=${tensorflow_CUDA_VERSION}
-	    cudnn_version_number=${tensorflow_CUDNN_VERSION})
+      cuda_version_number=${CUDA_VERSION}
+      cudnn_version_number=${tensorflow_CUDNN_VERSION})
   endif(WIN32)
 else(tensorflow_ENABLE_GPU)
-  set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
-    msvcp_dll_name=msvcp140.dll)
+  if(WIN32)
+    set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
+      msvcp_dll_name=msvcp140.dll)
+  else()
+    set(tensorflow_BUILD_INFO_FLAGS --build_config cpu)
+  endif()
 endif(tensorflow_ENABLE_GPU)
 
-# Find python executable
-include(FindPythonInterp)
-if(NOT ${PYTHONINTERP_FOUND})
-    message(FATAL_ERROR "CMake was unable to find a python interpreter.")
+if(tensorflow_BUILD_PYTHON_BINDINGS)
+  # Find python executable
+  include(FindPythonInterp)
+  if(NOT ${PYTHONINTERP_FOUND})
+      message(FATAL_ERROR "CMake was unable to find a python interpreter.")
+  endif()
 endif()
 
 # Let's get to work!
@@ -520,6 +601,7 @@ include(tf_cc_ops.cmake)
 include(tf_c.cmake)
 include(tf_grappler.cmake)
 include(tf_core_profiler.cmake)
+include(tf_core_eager_runtime.cmake)
 if(tensorflow_BUILD_CC_EXAMPLE)
   include(tf_tutorials.cmake)
   include(tf_label_image_example.cmake)
@@ -533,4 +615,4 @@ if(tensorflow_BUILD_SHARED_LIB)
 endif()
 if(tensorflow_BUILD_CC_TESTS OR tensorflow_BUILD_PYTHON_TESTS)
   include(tf_tests.cmake)
-endif()
+endif()
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 77242b34fd8302cb9104c50a83d4141607911e7f..df8b48dfc46124d3b9454d92ffb70dbcf1bc4217 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -5,10 +5,10 @@ CMAKE build is deprecated for TensorFlow. Please use `bazel` to build TF for all
 platforms. For details, see the
 [TensorFlow install guide](https://www.tensorflow.org/install/).
 
-This directory contains CMake files for building TensorFlow on Microsoft
-Windows. [CMake](https://cmake.org) is a cross-platform tool that can
-generate build scripts for multiple build systems, including Microsoft
-Visual Studio.
+This directory contains CMake files for building TensorFlow on Microsoft Windows
+and Linux. [CMake](https://cmake.org) is a cross-platform tool that can generate
+build scripts for multiple build systems, including Microsoft Visual Studio and
+GCC. "The method has not been tested on Mac OS X.
 
 **N.B.** We provide Linux build instructions primarily for the purpose of
 testing the build. We recommend using the standard Bazel-based build on
@@ -17,12 +17,17 @@ Linux.
 Current Status
 --------------
 
-CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/install/source_windows)
-for instructions on how to install a pre-built TensorFlow package on Windows.
+CMake can be used to build TensorFlow on all platforms. See the
+[getting started documentation](https://www.tensorflow.org/install/install_windows)
+for instructions on how to install a pre-built TensorFlow package on Windows and
+Linux. The procedure in MacOS is similar to the Linux build.
 
 ### Current known limitations
-* It is not possible to load a custom Op library.
-* GCS file system is not supported.
+
+*   It is not possible to load a custom Op library.
+*   GCS file system is not supported.
+*   Debug build is not available since Python for Windows is no longer
+    distributed with a debug library.
 
 ## Building with CMake
 
@@ -32,70 +37,88 @@ bindings.
 
 ### Prerequisites
 
-* CMake version 3.5 or later.
+*   CMake version 3.5 or later.
+
+*   [Git](https://git-scm.com)
+
+*   [SWIG](http://www.swig.org/download.html)
+
+*   [Perl](https://www.perl.org/get.html) (optional, for SSL support build)
+
+*   [Go](https://golang.org/) (optional, for SSL support build)
 
-* [Git](https://git-scm.com)
+*   [NASM](http://www.nasm.us/)/[YASM](http://yasm.tortall.net/) (optional, for
+    SSL support build)
 
-* [SWIG](http://www.swig.org/download.html)
+*   Additional pre-requisites for Microsoft Windows:
 
-* Additional prerequisites for Microsoft Windows:
-  - Visual Studio 2015
-  - Python 3.5
+    -   Visual Studio 2015 (latest version of MSVC 2017 is not supported by CUDA
+        yet, try it on your own risk)
 
-* Additional prerequisites for Linux:
-  - Python 2.7 or later
-  - [Docker](https://www.docker.com/) (for automated testing)
+    -   Python 3.5
 
-* Python dependencies:
-  - wheel
-  - NumPy 1.11.0 or later
+*   Additional prerequisites for Linux:
+
+    -   Python 2.7 or later
+    -   [Docker](https://www.docker.com/) (for automated testing)
+
+*   Python dependencies:
+
+    -   wheel
+    -   NumPy 1.11.0 or later
 
 ### Known-good configurations
 
-* Microsoft Windows 10
-  - Microsoft Visual Studio Enterprise 2015 with Visual C++ 2015
-  - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.anaconda.com/download/)
-  - [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
-  - [swigwin-3.0.10](http://www.swig.org/download.html)
-  - [NVidia CUDA Toolkit 8.0](https://developer.nvidia.com/cuda-downloads)
-  - [NVidia CUDNN 5.1](https://developer.nvidia.com/cudnn)
-  - [CMake 3.6](https://cmake.org/files/v3.6/cmake-3.6.3-win64-x64.msi)
+*   Microsoft Windows 10
+
+    -   Microsoft Visual Studio Enterprise/ Community 2015 with Visual C++ 2015
+    -   [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.anaconda.com/download/)
+    -   [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
+    -   [swigwin-3.0.10](http://www.swig.org/download.html)
+    -   [NVidia CUDA Toolkit 9.0](https://developer.nvidia.com/cuda-downloads)
+    -   [NVidia CUDNN 7](https://developer.nvidia.com/cudnn)
+    -   [CMake 3.6](https://cmake.org/files/v3.6/cmake-3.6.3-win64-x64.msi)
 
-* Ubuntu 14.04
-  - Makefile generator
-  - Docker 1.9.1 (for automated testing)
+*   Ubuntu 14.04
+
+    -   Makefile generator
+    -   Docker 1.9.1 (for automated testing)
 
 ### Current known limitations
-  - The Python package supports **Python 3.5 only**, because that is the only
-    version for which standard Python binaries exist and those binaries are
-    compatible with the TensorFlow runtime. (On Windows, the standard Python
+
+-   The Python package supports **Python 3.5/3.6 only**, because these are the
+    only versions for which standard Python binaries exist and those binaries
+    are compatible with the TensorFlow runtime. (On Windows, the standard Python
     binaries for versions earlier than 3.5 were compiled with older compilers
     that do not have all of the features (e.g. C++11 support) needed to compile
-    TensorFlow. We welcome patches for making TensorFlow work with Python 2.7
-    on Windows, but have not yet committed to supporting that configuration.)
-
-  - The following Python APIs are not currently implemented:
-    * Loading custom op libraries via `tf.load_op_library()`. In order to use your
-      custom op, please put the source code under the tensorflow/core/user_ops
-      directory, and a shape function is required (not optional) for each op.
-    * Path manipulation functions (such as `tf.gfile.ListDirectory()`) are not
-      functional.
-
-  - The `tf.contrib` libraries are not currently included in the PIP package.
-
-  - The following operations are not currently implemented:
-    * `DepthwiseConv2dNative`
-    * `Digamma`
-    * `Erf`
-    * `Erfc`
-    * `Igamma`
-    * `Igammac`
-    * `ImmutableConst`
-    * `Lgamma`
-    * `Polygamma`
-    * `Zeta`
-
-  - Google Cloud Storage support is not currently implemented. The GCS library
+    TensorFlow. We welcome patches for making TensorFlow work with Python 2.7 on
+    Windows, but have not yet committed to supporting that configuration.)
+
+-   The following Python APIs are not currently implemented:
+
+    *   Loading custom op libraries via `tf.load_op_library()`. In order to use
+        your custom op, please put the source code under the
+        tensorflow/core/user_ops directory, and a shape function is required
+        (not optional) for each op.
+    *   Path manipulation functions (such as `tf.gfile.ListDirectory()`) are not
+        functional.
+
+-   The `tf.contrib` libraries are not currently included in the PIP package.
+
+-   The following operations are not currently implemented:
+
+    *   `DepthwiseConv2dNative`
+    *   `Digamma`
+    *   `Erf`
+    *   `Erfc`
+    *   `Igamma`
+    *   `Igammac`
+    *   `ImmutableConst`
+    *   `Lgamma`
+    *   `Polygamma`
+    *   `Zeta`
+
+-   Google Cloud Storage support is not currently implemented. The GCS library
     currently depends on `libcurl` and `boringssl`, and the Windows version
     could use standard Windows APIs for making HTTP requests and cryptography
     (for OAuth). Contributions are welcome for this feature.
@@ -104,184 +127,383 @@ We are actively working on improving CMake and Windows support, and addressing
 these limitations. We would appreciate pull requests that implement missing
 ops or APIs.
 
+# CMake GUI build (all platforms)
+
+Install from CMake GUI would be a convenient way to generate C++ build projects.
+The software supports Windows, MacOS and Linux, while the posix platform
+provides an extra ccmake binary to run command line GUI. Both working principal
+of cmake, ccmake and cmake-gui are the same, the only difference is by providing
+suitable interface for project configuration and dependency setting.
+
+1.  Pre-buid checklist: The following binary/libraries should be setted in
+    system path, otherwise you need to set manualy via cmake.
+    *   Compiler (GCC for Linux, MSVC for Windows)
+    *   Make sure compiler directory has been set to system path
+    *   CUDA 9.0 (GPU build)
+    *   CUDNN (GPU build)
+    *   NCCL (GPU build on Linux)
+    *   SWIG (python binding)
+    *   Perl (required if you need ssl support, optional)
+    *   Go (required if you need ssl support, optional)
+    *   NASM/YASM (required by grpc for ssl support, optional)
+2.  Start CMake GUI
+3.  Click on `Browse Source` and direct to the the folder
+    `<tensorflow-source>/tensorflow/contrib/cmake`
+4.  Click on `Browse Build` and spectify a location that you want tensorflow to
+    be build
+5.  Click on `Configure`, a new window will be prompted out, specify the
+    generator mode for the project generation. For Windows, choose `Visual
+    Studio <version> <year> Win64`, for Linux, choose `Unix Makefiles`, then
+    press `Finish`. Wait for a moment, the default project dependecy would
+    automatically generate.
+6.  There are a few options that you can customize your own build. **The setting
+    here is crucial for a sucessful build, please check all items carefully.**
+
+    *   `tensorflow_BUILD_ALL_KERNELS` should alway be `on`
+    *   `tensorflow_BUILD_CC_EXAMPLE` is default to be `on`. This can help you
+        to test build (optional)
+    *   `tensorflow_BUILD_CONTRIB_KERNELS` is default to be `on`, but it won't
+        affect tensorflow function, turn it to `off` if you want a slim build.
+        (optional)
+    *   `tensorflow_BUILD_PYTHON_BINDING` is default to be `on`. Set to `off` if
+        you don't need python interaface. If SWIG is not in system path, you
+        need set it manually. (optional)
+    *   `tensorflow_BUILD_SHARED_LIB` is default to be `off`. Set to `on` if you
+        want the c++ interface. (optional)
+    *   `tensorflow_ENABLE_GPU` is default to be `off`. Set to `on` if you want
+        GPU support. It will search CUDA and CUDNN dependecies if you have set
+        them to system path, otherwise CMake would prompt error and request you
+        to set it manually. (optional)
+    *   `tensorflow_ENABLE_GRPC_SUPPORT` is default to be `on`. For Linux build,
+        this option must always be `on`. This need to be `on` for a gpu build.
+        Reminded that Perl, Go and NASM/YASM are required for this option if you
+        want to build grpc with offical SSL support.
+    *   `tensorflow_ENABLE_POSITION_INDEPENDENT_CODE` should always be `on`
+    *   `tensorflow_ENABLE_SNAPPY_SUPPORT` should always be `on`
+    *   `tensorflow_OPTIMIZE_FOR_NATIVE_ARCH` should always be `on`
+    *   `CMAKE_INSTALL_PREFIX` is the location where the final package will be
+        installed. You may change it to your own preferred path (optional)
+
+7.  After changing the configuration in step 5, press `Configure` again
+
+8.  If not error is found, press `Generate`
+
+#### Windows
+
+1.  Open `tensorflow.sln` in the build folder (Windows). Change build type from
+    `Debug` to `Release`. Choose `Build`->`Build Solution`. This may take more
+    than hours of compilation. If everything is alright, the output window would
+    show no error.
+
+    ##### Python
+
+    In solution explorer, right click on `tf_python_build_pip_package` ->
+    `build`. It will generate the wheel file in
+    `<tensorflow-build>/tf_python/dist`. Install with following command:
+
+    `pip install --upgrade tensorflow-<config>.whl`
+
+    ***The wheel name varies depends on you config. Change to your own wheel
+    filename.***
+
+    Reminded that some pip installation requires administrator right command
+    prompt.
+
+    ##### C++
+
+    You can directly use the build folder tree for C++ interface with cmake. If
+    you want to do installation for api releasing, right click on `Install` ->
+    `build`. The headers and library will be installed in the directory specify
+    by `CMAKE_INSTALL_PREFIX` during configuration.
+
+1.  For smaller RAM computer, it is noticed that out of heap space error
+    appears. Change to command prompt build is an alternative to do step 1.
+
+    Open `VS2015 x64 Native Tools Command Prompt`. You can open it by press
+    `Start`, then type the binary name. Use `VS2017 x64 Native Tools Command
+    Prompt` if you are using MSVC 2017.
+
+    ##### Python
+
+    Directly build python wheel package by following command:
+
+    `MSBuild /p:Configuration=Release
+    <path-to-tf_python_build_pip_package.vcxproj>`
+
+    Remember to change `<path-to-tf_python_build_pip_package.vcxproj>` to the
+    actual path of the file, it can be found at the root of build directory
+
+    Install the wheel file generated as instructed by step 1.
+
+    ##### C++ interface
+
+    Build from VS native toolchain with following command: `MSBuild
+    /p:Configuration=Release <path-to-ALL_BUILD.vcxproj>`
+
+    Headers are discretely located in the build folders. Tensorflow library can
+    be found at `<path-to-build>/Release`, namely `tensorflow.dll` and
+    `tensorflow.lib`.
+
+    *   Build to install for api release (optional): `MSBuild
+        /p:Configuration=Release <path-to-INSTALL.vcxproj>`
+
+    Remember to change `<path-to-ALL_BUILD.vcxproj>` and
+    `<path-to-INSTALL.vcxproj>` to the actual path of the file, it can be found
+    at the root of build directory.
+
+#### Linux/MacOS (command line GNU build)
+
+1.  Open the terminal, change working directory to the one specified in step 3.
+
+2.  Type the following command:
+
+    `make -sj<number-of-threads> all`
+
+    ##### Python
+
+    **Important Note** CMake generated python wheel for Linux/MacOs is currently
+    under development. Please use bazel build.
+
+    Follow code is an expected Linux/MacOS python package build after
+    development work is completed.
+
+    ```
+    make -sj<number-of-threads> tf_python_build_pip_package
+    cd tf_python
+    pip install --upgrade tensorflow-<config>.whl
+    ```
+
+    ##### C++ interface
+
+    `make -sj<number-of-threads> install`
+
+    Where `<number-of-threads>` is the threads used for the compilation, change
+    to any integer less or equal to your computer's maxiumum thread number.
+
+    Headers are discretely located in the build folders. Tensorflow library can
+    be found at `<path-to-build>`, namely `tensorflow.so` (Linux) or
+    `tensorflow.dylib` (MacOS).
+
+#### Start a Tensorflow C++ project with CMake
+
+Here we assume that you have basic knowledge on gathering dependency with
+`CMakeLists.txt`. Here we introduce how the C++ api works with
+[official hello world tutorial](https://www.tensorflow.org/api_guides/cc/guide).
+
+1.  Create a new working directory and create a new text file named
+    `CMakeLists.txt` and the c++ file `main.cxx`
+2.  Fill in the `main.cxx` with the code provided in
+    [official c++ api basic](https://www.tensorflow.org/api_guides/cc/guide).
+3.  Fill in the `CMakeLists.txt` with following code: ``` cmake
+    cmake_minimum_required (VERSION 2.6) project (tf_hello)
+
+    # Tensorflow
+
+    find_package(Tensorflow REQUIRED)
+    include_directories(${TENSORFLOW_INCLUDE_DIRS})
+
+    # compiler setting required by tensorflow, to be tested on all compilers
+
+    # currently only tested on MSVC and GCC
+
+    if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) add_definitions(-DCOMPILER_MSVC)
+    elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) if
+    (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS "3")
+    add_definitions(-DCOMPILER_GCC3) else() add_definitions(-D__GNUC__) endif()
+    else() message(ERROR " compiler ${CMAKE_CXX_COMPILER_ID} not supported by
+    this CMakeList.txt, under development") endif()
+
+    add_executable(tf_hello main.cxx) target_link_libraries(tf_hello
+    ${TENSORFLOW_LIBRARIES}) ```
+
+4.  Configure the folder with cmake-gui, an error should be prompted out,
+    requesting you to locate the folder containing `TensorflowConfig.cmake`.
+    This file can be found at `<tensorflow-build>` or `<tensorflow-intall>` (for
+    those have build install in previous steps).
+
+5.  Configure again, generate the project.
+
+6.  Compile the project with `Release` config (Windows). For Linux users, just
+    compile the project.
+
+7.  Copy the `tensorflow.dll`(Windows)/`tensorflow.so`(Linux) from build
+    directory to the build folder containing `tf_hello` binary.
+
+8.  Run `tf_hello` binary
+
+# Step-by-step Windows build (command prompt)
+
+1.  Install the prerequisites detailed above, and set up your environment.
+
+    *   When building with GPU support after installing the CUDNN zip file from
+        NVidia, append its bin directory to your PATH environment variable. In
+        case TensorFlow fails to find the CUDA dll's during initialization,
+        check your PATH environment variable. It should contain the directory of
+        the CUDA dlls and the directory of the CUDNN dll. For example:
+
+        ```
+        D:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin
+        D:\local\cuda\bin
+        ```
+
+    *   When building with MKL support after installing
+        [MKL](https://software.intel.com/en-us/mkl) from INTEL, append its bin
+        directories to your PATH environment variable.
+
+        In case TensorFlow fails to find the MKL dll's during initialization,
+        check your PATH environment variable. It should contain the directory of
+        the MKL dlls. For example:
 
-Step-by-step Windows build
-==========================
-
-1. Install the prerequisites detailed above, and set up your environment.
-
-   * The following commands assume that you are using the Windows Command
-     Prompt (`cmd.exe`). You will need to set up your environment to use the
-     appropriate toolchain, i.e. the 64-bit tools. (Some of the binary targets
-     we will build are too large for the 32-bit tools, and they will fail with
-     out-of-memory errors.) The typical command to do set up your
-     environment is:
-
-     ```
-     D:\temp> "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvarsall.bat"
-     ```
-
-   * When building with GPU support after installing the CUDNN zip file from NVidia, append its
-     bin directory to your PATH environment variable.
-     In case TensorFlow fails to find the CUDA dll's during initialization, check your PATH environment variable.
-     It should contain the directory of the CUDA dlls and the directory of the CUDNN dll.
-     For example:
-
-     ```
-     D:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin
-     D:\local\cuda\bin
-     ```
-
-   * When building with MKL support after installing [MKL](https://software.intel.com/en-us/mkl) from INTEL, append its bin directories to your PATH environment variable.
-
-     In case TensorFlow fails to find the MKL dll's during initialization, check your PATH environment variable.
-     It should contain the directory of the MKL dlls. For example:
-
-     ```
-     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\mkl
-     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\compiler
-     D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\tbb\vc_mt
-     ```
-
-
-   * We assume that `cmake` and `git` are installed and in your `%PATH%`. If
-     for example `cmake` is not in your path and it is installed in
-     `C:\Program Files (x86)\CMake\bin\cmake.exe`, you can add this directory
-     to your `%PATH%` as follows:
-
-     ```
-     D:\temp> set PATH="%PATH%;C:\Program Files (x86)\CMake\bin\cmake.exe"
-     ```
-
-2. Clone the TensorFlow repository and create a working directory for your
-   build:
-
-   ```
-   D:\temp> git clone https://github.com/tensorflow/tensorflow.git
-   D:\temp> cd tensorflow\tensorflow\contrib\cmake
-   D:\temp\tensorflow\tensorflow\contrib\cmake> mkdir build
-   D:\temp\tensorflow\tensorflow\contrib\cmake> cd build
-   D:\temp\tensorflow\tensorflow\contrib\cmake\build>
-   ```
-
-3. Invoke CMake to create Visual Studio solution and project files.
-
-   **N.B.** This assumes that `cmake.exe` is in your `%PATH%` environment
-   variable. The other paths are for illustrative purposes only, and may
-   be different on your platform. The `^` character is a line continuation
-   and must be the last character on each line.
-
-   ```
-   D:\...\build> cmake .. -A x64 -DCMAKE_BUILD_TYPE=Release ^
-   More? -DSWIG_EXECUTABLE=C:/tools/swigwin-3.0.10/swig.exe ^
-   More? -DPYTHON_EXECUTABLE=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/python.exe ^
-   More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib
-   ```
-   To build with GPU support add "^" at the end of the last line above following with:
-   ```
-   More? -Dtensorflow_ENABLE_GPU=ON ^
-   More? -DCUDNN_HOME="D:\...\cudnn"
-   ```
-   To build with MKL support add "^" at the end of the last line above following with:
-
-   ```
-   More? -Dtensorflow_ENABLE_MKL_SUPPORT=ON ^
-   More? -DMKL_HOME="D:\...\compilers_and_libraries"
-   ```
-
-   To enable SIMD instructions with MSVC, as AVX and SSE, define it as follows:
-
-   ```
-   More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
-   ```
-
-   Note that the `-DCMAKE_BUILD_TYPE=Release` flag must match the build
-   configuration that you choose when invoking `msbuild`. The known-good
-   values are `Release` and `RelWithDebInfo`. The `Debug` build type is
-   not currently supported, because it relies on a `Debug` library for
-   Python (`python35d.lib`) that is not distributed by default.
-
-   There are various options that can be specified when generating the
-   solution and project files:
-
-   * `-DCMAKE_BUILD_TYPE=(Release|RelWithDebInfo)`: Note that the
-     `CMAKE_BUILD_TYPE` option must match the build configuration that you
-     choose when invoking MSBuild in step 4. The known-good values are
-     `Release` and `RelWithDebInfo`. The `Debug` build type is not currently
-     supported, because it relies on a `Debug` library for Python
-     (`python35d.lib`) that is not distributed by default.
-
-   * `-Dtensorflow_BUILD_ALL_KERNELS=(ON|OFF)`. Defaults to `ON`. You can
-     build a small subset of the kernels for a faster build by setting this
-     option to `OFF`.
-
-   * `-Dtensorflow_BUILD_CC_EXAMPLE=(ON|OFF)`. Defaults to `ON`. Generate
-     project files for a simple C++
-     [example training program](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/tutorials/example_trainer.cc).
-
-   * `-Dtensorflow_BUILD_PYTHON_BINDINGS=(ON|OFF)`. Defaults to `ON`. Generate
-     project files for building a PIP package containing the TensorFlow runtime
-     and its Python bindings.
-
-   * `-Dtensorflow_ENABLE_GRPC_SUPPORT=(ON|OFF)`. Defaults to `ON`. Include
-     gRPC support and the distributed client and server code in the TensorFlow
-     runtime.
-
-   * `-Dtensorflow_ENABLE_SSL_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include
-     SSL support (for making secure HTTP requests) in the TensorFlow runtime.
-     This support is incomplete, and will be used for Google Cloud Storage
-     support.
-
-   * `-Dtensorflow_ENABLE_GPU=(ON|OFF)`. Defaults to `OFF`. Include
-     GPU support. If GPU is enabled you need to install the CUDA 8.0 Toolkit and CUDNN 5.1.
-     CMake will expect the location of CUDNN in -DCUDNN_HOME=path_you_unzipped_cudnn.
-
-   * `-Dtensorflow_BUILD_CC_TESTS=(ON|OFF)`. Defaults to `OFF`. This builds cc unit tests.
-     There are many of them and building will take a few hours.
-     After cmake, build and execute the tests with
-     ```
-     MSBuild /p:Configuration=RelWithDebInfo ALL_BUILD.vcxproj
-     ctest -C RelWithDebInfo
-     ```
-
-   * `-Dtensorflow_BUILD_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This enables python kernel tests.
-     After building the python wheel, you need to install the new wheel before running the tests.
-     To execute the tests, use
-     ```
-     ctest -C RelWithDebInfo
-     ```
-
-   * `-Dtensorflow_BUILD_MORE_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This enables python tests on
-     serveral major packages. This option is only valid if this and tensorflow_BUILD_PYTHON_TESTS are both set as `ON`.
-     After building the python wheel, you need to install the new wheel before running the tests.
-     To execute the tests, use
-     ```
-     ctest -C RelWithDebInfo
-     ```
-
-   * `-Dtensorflow_ENABLE_MKL_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL support. If MKL is enabled you need to install the [Intel Math Kernal Library](https://software.intel.com/en-us/mkl).
-     CMake will expect the location of MKL in -MKL_HOME=path_you_install_mkl.
-
-   * `-Dtensorflow_ENABLE_MKLDNN_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include MKL DNN support. MKL DNN is [Intel(R) Math Kernel Library for Deep Neural Networks (Intel(R) MKL-DNN)](https://github.com/intel/mkl-dnn). You have to add `-Dtensorflow_ENABLE_MKL_SUPPORT=ON` before including MKL DNN support.
-
-
-4. Invoke MSBuild to build TensorFlow.
-
-   To build the C++ example program, which will be created as a `.exe`
-   executable in the subdirectory `.\Release`:
-
-   ```
-   D:\...\build> MSBuild /p:Configuration=Release tf_tutorials_example_trainer.vcxproj
-   D:\...\build> Release\tf_tutorials_example_trainer.exe
-   ```
-
-   To build the PIP package, which will be created as a `.whl` file in the
-   subdirectory `.\tf_python\dist`:
-
-   ```
-   D:\...\build> MSBuild /p:Configuration=Release tf_python_build_pip_package.vcxproj
-   ```
+        ```
+        D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\mkl
+        D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\compiler
+        D:\Tools\IntelSWTools\compilers_and_libraries\windows\redist\intel64\tbb\vc_mt
+        ```
 
+    *   We assume that `cmake` and `git` are installed and in your `%PATH%`. If
+        for example `cmake` is not in your path and it is installed in
+        `C:\Program Files (x86)\CMake\bin\cmake.exe`, you can add this directory
+        to your `%PATH%` as follows:
+
+        ```
+        D:\temp> set PATH="%PATH%;C:\Program Files (x86)\CMake\bin\cmake.exe"
+        ```
+
+2.  Clone the TensorFlow repository and create a working directory for your
+    build:
+
+    ```
+    D:\temp> git clone https://github.com/tensorflow/tensorflow.git
+    D:\temp> cd tensorflow\tensorflow\contrib\cmake
+    D:\temp\tensorflow\tensorflow\contrib\cmake> mkdir build
+    D:\temp\tensorflow\tensorflow\contrib\cmake> cd build
+    D:\temp\tensorflow\tensorflow\contrib\cmake\build>
+    ```
+
+3.  Invoke CMake to create Visual Studio solution and project files.
+
+    **N.B.** This assumes that `cmake.exe` is in your `%PATH%` environment
+    variable. The other paths are for illustrative purposes only, and may be
+    different on your platform. The `^` character is a line continuation and
+    must be the last character on each line.
+
+    ```
+    D:\...\build> cmake .. -A x64 -Thost=x64 -DCMAKE_BUILD_TYPE=Release ^
+    More? -DSWIG_EXECUTABLE=C:/tools/swigwin-3.0.10/swig.exe ^
+    More? -DPYTHON_EXECUTABLE=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/python.exe ^
+    More? -DPYTHON_LIBRARIES=C:/Users/%USERNAME%/AppData/Local/Continuum/Anaconda3/libs/python35.lib
+    ```
+
+    To build with GPU support add "^" at the end of the last line above
+    following with: `More? -Dtensorflow_ENABLE_GPU=ON ^ More?
+    -DCUDNN_HOME="D:\...\cudnn"` To build with MKL support add "^" at the end of
+    the last line above following with:
+
+    ```
+    More? -Dtensorflow_ENABLE_MKL_SUPPORT=ON ^
+    More? -DMKL_HOME="D:\...\compilers_and_libraries"
+    ```
+
+    To enable SIMD instructions with MSVC, as AVX and SSE, define it as follows:
+
+    ```
+    More? -Dtensorflow_WIN_CPU_SIMD_OPTIONS=/arch:AVX
+    ```
+
+    Note that the `-DCMAKE_BUILD_TYPE=Release` flag must match the build
+    configuration that you choose when invoking `msbuild`. The known-good values
+    are `Release` and `RelWithDebInfo`. The `Debug` build type is not currently
+    supported, because it relies on a `Debug` library for Python
+    (`python35d.lib`) that is not distributed by default.
+
+    The `-Thost=x64` flag will ensure that the 64 bit compiler and linker is
+    used when building. Without this flag, MSBuild will use the 32 bit toolchain
+    which is prone to compile errors such as "compiler out of heap space".
+
+    There are various options that can be specified when generating the solution
+    and project files:
+
+    *   `-DCMAKE_BUILD_TYPE=(Release|RelWithDebInfo)`: Note that the
+        `CMAKE_BUILD_TYPE` option must match the build configuration that you
+        choose when invoking MSBuild in step 4. The known-good values are
+        `Release` and `RelWithDebInfo`. The `Debug` build type is not currently
+        supported, because it relies on a `Debug` library for Python
+        (`python35d.lib`) that is not distributed by default.
+
+    *   `-Dtensorflow_BUILD_ALL_KERNELS=(ON|OFF)`. Defaults to `ON`. You can
+        build a small subset of the kernels for a faster build by setting this
+        option to `OFF`.
+
+    *   `-Dtensorflow_BUILD_CC_EXAMPLE=(ON|OFF)`. Defaults to `ON`. Generate
+        project files for a simple C++
+        [example training program](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/tutorials/example_trainer.cc).
+
+    *   `-Dtensorflow_BUILD_PYTHON_BINDINGS=(ON|OFF)`. Defaults to `ON`.
+        Generate project files for building a PIP package containing the
+        TensorFlow runtime and its Python bindings.
+
+    *   `-Dtensorflow_ENABLE_GRPC_SUPPORT=(ON|OFF)`. Defaults to `ON`. Include
+        gRPC support and the distributed client and server code in the
+        TensorFlow runtime.
+
+    *   `-Dtensorflow_ENABLE_SSL_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include
+        SSL support (for making secure HTTP requests) in the TensorFlow runtime.
+        This support is incomplete, and will be used for Google Cloud Storage
+        support.
+
+    *   `-Dtensorflow_ENABLE_GPU=(ON|OFF)`. Defaults to `OFF`. Include GPU
+        support. If GPU is enabled you need to install the CUDA 8.0 Toolkit and
+        CUDNN 5.1. CMake will expect the location of CUDNN in
+        -DCUDNN_HOME=path_you_unzipped_cudnn.
+
+    *   `-Dtensorflow_BUILD_CC_TESTS=(ON|OFF)`. Defaults to `OFF`. This builds
+        cc unit tests. There are many of them and building will take a few
+        hours. After cmake, build and execute the tests with `MSBuild
+        /p:Configuration=RelWithDebInfo ALL_BUILD.vcxproj ctest -C
+        RelWithDebInfo`
+
+    *   `-Dtensorflow_BUILD_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This
+        enables python kernel tests. After building the python wheel, you need
+        to install the new wheel before running the tests. To execute the tests,
+        use `ctest -C RelWithDebInfo`
+
+    *   `-Dtensorflow_BUILD_MORE_PYTHON_TESTS=(ON|OFF)`. Defaults to `OFF`. This
+        enables python tests on serveral major packages. This option is only
+        valid if this and tensorflow_BUILD_PYTHON_TESTS are both set as `ON`.
+        After building the python wheel, you need to install the new wheel
+        before running the tests. To execute the tests, use `ctest -C
+        RelWithDebInfo`
+
+    *   `-Dtensorflow_ENABLE_MKL_SUPPORT=(ON|OFF)`. Defaults to `OFF`. Include
+        MKL support. If MKL is enabled you need to install the
+        [Intel Math Kernal Library](https://software.intel.com/en-us/mkl). CMake
+        will expect the location of MKL in -MKL_HOME=path_you_install_mkl.
+
+    *   `-Dtensorflow_ENABLE_MKLDNN_SUPPORT=(ON|OFF)`. Defaults to `OFF`.
+        Include MKL DNN support. MKL DNN is [Intel(R) Math Kernel Library for
+        Deep Neural Networks (Intel(R)
+        MKL-DNN)](https://github.com/intel/mkl-dnn). You have to add
+        `-Dtensorflow_ENABLE_MKL_SUPPORT=ON` before including MKL DNN support.
+
+4.  Invoke MSBuild to build TensorFlow.
+
+    Set up the path to find MSbuild: `D:\temp> "C:\Program Files (x86)\Microsoft
+    Visual Studio 14.0\VC\bin\amd64\vcvarsall.bat"`
+
+    To build the C++ example program, which will be created as a `.exe`
+    executable in the subdirectory `.\Release`:
+
+    ```
+    D:\...\build> MSBuild /p:Configuration=Release tf_tutorials_example_trainer.vcxproj
+    D:\...\build> Release\tf_tutorials_example_trainer.exe
+    ```
+
+    To build the PIP package, which will be created as a `.whl` file in the
+    subdirectory `.\tf_python\dist`:
+
+    ```
+    D:\...\build> MSBuild /p:Configuration=Release tf_python_build_pip_package.vcxproj
+    ```
 
 Linux Continuous Integration build
 ==================================
diff --git a/tensorflow/contrib/cmake/TensorflowConfig.cmake.in b/tensorflow/contrib/cmake/TensorflowConfig.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..cc04db6e952f53b8bb5416dde60b8173e60bf60e
--- /dev/null
+++ b/tensorflow/contrib/cmake/TensorflowConfig.cmake.in
@@ -0,0 +1,16 @@
+# - Config file for the Tensorflow package
+# It defines the following variables
+#  TENSORFLOW_INCLUDE_DIRS - include directories for FooBar
+#  TENSORFLOW_LIBRARIES    - libraries to link against
+ 
+# Compute paths
+get_filename_component(TENSORFLOW_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+set(TENSORFLOW_INCLUDE_DIRS "@CONF_INCLUDE_DIRS@")
+ 
+# Our library dependencies (contains definitions for IMPORTED targets)
+if(NOT TENSORFLOW_BINARY_DIR)
+  include("${TENSORFLOW_CMAKE_DIR}/TensorflowTargets.cmake")
+endif()
+ 
+# These are IMPORTED targets created by TensorflowTargets.cmake
+set(TENSORFLOW_LIBRARIES tensorflow)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/TensorflowConfigVersion.cmake.in b/tensorflow/contrib/cmake/TensorflowConfigVersion.cmake.in
new file mode 100644
index 0000000000000000000000000000000000000000..2a9609ddb9c4ca864651818bdfae0f8fe290de31
--- /dev/null
+++ b/tensorflow/contrib/cmake/TensorflowConfigVersion.cmake.in
@@ -0,0 +1,11 @@
+set(PACKAGE_VERSION "@TENSORFLOW_VERSION@")
+ 
+# Check whether the requested PACKAGE_FIND_VERSION is compatible
+if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}")
+  set(PACKAGE_VERSION_COMPATIBLE FALSE)
+else()
+  set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}")
+    set(PACKAGE_VERSION_EXACT TRUE)
+  endif()
+endif()
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..46a193971c5084523d432065f265fa7a9909f595
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
@@ -0,0 +1,98 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+if (systemlib_ABSEIL_CPP)
+
+  find_package(AbseilCpp REQUIRED
+               absl_base
+               absl_spinlock_wait
+               absl_dynamic_annotations
+               absl_malloc_internal
+               absl_throw_delegate
+               absl_int128
+               absl_strings
+               str_format_internal
+               absl_bad_optional_access)
+
+  include_directories(${ABSEIL_CPP_INCLUDE_DIR})
+  list(APPEND tensorflow_EXTERNAL_LIBRARIES ${ABSEIL_CPP_LIBRARIES})
+
+  message(STATUS "  abseil_cpp includes: ${ABSEIL_CPP_INCLUDE_DIR}")
+  message(STATUS "  abseil_cpp libraries: ${ABSEIL_CPP_LIBRARIES}")
+
+  add_custom_target(abseil_cpp)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
+
+else (systemlib_ABSEIL_CPP)
+
+  include (ExternalProject)
+
+  set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp)
+  set(abseil_cpp_URL https://github.com/abseil/abseil-cpp/archive/e01d95528ea2137a4a27a88d1f57c6cb260aafed.tar.gz)
+  set(abseil_cpp_HASH SHA256=84043ed402d2a2a6ba4cdddb7e85118b1158fd81fe4ac3a14adc343d054c1e2e)
+  set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp-build)
+
+  if(WIN32)
+    if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+      set(abseil_cpp_STATIC_LIBRARIES
+          ${abseil_cpp_BUILD}/absl/base/Release/absl_base.lib
+          ${abseil_cpp_BUILD}/absl/base/Release/absl_dynamic_annotations.lib
+          ${abseil_cpp_BUILD}/absl/base/Release/absl_internal_malloc_internal.lib
+          ${abseil_cpp_BUILD}/absl/strings/Release/absl_strings.lib
+          ${abseil_cpp_BUILD}/absl/strings/Release/str_format_internal.lib
+          ${abseil_cpp_BUILD}/absl/types/Release/absl_bad_optional_access.lib)
+    else()
+      set(abseil_cpp_STATIC_LIBRARIES
+          ${abseil_cpp_BUILD}/absl/base/absl_base.lib
+          ${abseil_cpp_BUILD}/absl/base/absl_spinlock_wait.lib
+          ${abseil_cpp_BUILD}/absl/base/absl_dynamic_annotations.lib
+          ${abseil_cpp_BUILD}/absl/base/absl_malloc_internal.lib
+          ${abseil_cpp_BUILD}/absl/base/absl_throw_delegate.lib
+          ${abseil_cpp_BUILD}/absl/numeric/absl_int128.lib
+          ${abseil_cpp_BUILD}/absl/strings/absl_strings.lib
+          ${abseil_cpp_BUILD}/absl/strings/str_format_internal.lib
+          ${abseil_cpp_BUILD}/absl/types/absl_bad_optional_access.lib)
+    endif()
+  else()
+    set(abseil_cpp_STATIC_LIBRARIES
+        ${abseil_cpp_BUILD}/absl/base/libabsl_base.a
+        ${abseil_cpp_BUILD}/absl/base/libabsl_spinlock_wait.a
+        ${abseil_cpp_BUILD}/absl/base/libabsl_dynamic_annotations.a
+        ${abseil_cpp_BUILD}/absl/base/libabsl_malloc_internal.a
+        ${abseil_cpp_BUILD}/absl/base/libabsl_throw_delegate.a
+        ${abseil_cpp_BUILD}/absl/numeric/libabsl_int128.a
+        ${abseil_cpp_BUILD}/absl/strings/libabsl_strings.a
+        ${abseil_cpp_BUILD}/absl/strings/libstr_format_internal.a
+        ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a)
+  endif()
+
+  ExternalProject_Add(abseil_cpp
+      PREFIX abseil_cpp
+      URL ${abseil_cpp_URL}
+      URL_HASH ${abseil_cpp_HASH}
+      DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      BUILD_BYPRODUCTS ${abseil_cpp_STATIC_LIBRARIES}
+      INSTALL_COMMAND ""
+      CMAKE_CACHE_ARGS
+          -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
+          -DCMAKE_BUILD_TYPE:STRING=Release
+          -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+  )
+
+  include_directories(${abseil_cpp_INCLUDE_DIR})
+  list(APPEND tensorflow_EXTERNAL_LIBRARIES ${abseil_cpp_STATIC_LIBRARIES})
+
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
+
+endif (systemlib_ABSEIL_CPP)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index b1e64aa55c80ad59cfdc0f4767c0282b4f73367f..e570c09ecb5e64130ed6f3375a51d74850cc3989 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG d184fa229d75d336aedea0041bd59cb93e7e267f)
+set(GRPC_TAG 69b6c047bc767b4d80e7af4d00ccb7c45b683dae)
 
 if(WIN32)
   # We use unsecure gRPC because boringssl does not build on windows
@@ -26,9 +26,9 @@ if(WIN32)
   set(grpc_SSL_PROVIDER NONE)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
     set(grpc_STATIC_LIBRARIES
-        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc++_unsecure.lib
-        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc_unsecure.lib
-        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/gpr.lib)
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc++_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/grpc_unsecure.lib
+        ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/$(Configuration)/gpr.lib)
   else()
     set(grpc_STATIC_LIBRARIES
         ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/grpc++_unsecure.lib
@@ -43,8 +43,9 @@ else()
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/zlib/libz.a)
 endif()
 
 add_definitions(-DGRPC_ARES=0)
@@ -66,7 +67,7 @@ ExternalProject_Add(grpc
         -DPROTOBUF_INCLUDE_DIRS:STRING=${PROTOBUF_INCLUDE_DIRS}
         -DPROTOBUF_LIBRARIES:STRING=${protobuf_STATIC_LIBRARIES}
         -DZLIB_ROOT:STRING=${ZLIB_INSTALL}
-	-DgRPC_SSL_PROVIDER:STRING=${grpc_SSL_PROVIDER}
+        -DgRPC_SSL_PROVIDER:STRING=${grpc_SSL_PROVIDER}
 )
 
 # grpc/src/core/ext/census/tracing.c depends on the existence of openssl/rand.h.
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 1a147e9c8e5a9fee17a81e37c9babe3c9ec0290b..32e6d78e508e25f76bd263e9d52b6574ca315f6c 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -59,6 +59,7 @@ ExternalProject_Add(png
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL}
 	-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
+  -DPNG_TESTS:BOOL=OFF
 )
 
 ## put png includes in the directory where they are expected
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index 56a57a2340ddc7f923c611c222a0399e279ad58a..773c37b309b1dff4ed28d24cd7d6140a63ec5bc6 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -16,7 +16,18 @@ include (ExternalProject)
 
 set(PROTOBUF_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src)
 set(PROTOBUF_URL https://github.com/google/protobuf.git)
-set(PROTOBUF_TAG v3.6.1)
+
+# enable choose protobuf versions
+SET(PROTOBUF_VERSION "3.6.1" CACHE STRING "Protobuf version")
+SET_PROPERTY(CACHE PROTOBUF_VERSION PROPERTY STRINGS "3.4.0" "3.5.0" "3.6.1") 
+
+if(${PROTOBUF_VERSION} STREQUAL "3.5.1")
+    set(PROTOBUF_TAG v3.6.1)
+elseif(${PROTOBUF_VERSION} STREQUAL "3.5.0")
+    set(PROTOBUF_TAG 2761122b810fe8861004ae785cc3ab39f384d342)
+elseif(${PROTOBUF_VERSION} STREQUAL "3.4.0")
+    set(PROTOBUF_TAG b04e5cba356212e4e8c66c61bbe0c3a20537c5b9)
+endif()
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
diff --git a/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake b/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..944ae3997a9489c13f65f93d9a7e61c21dd975c1
--- /dev/null
+++ b/tensorflow/contrib/cmake/modules/FindAbseilCpp.cmake
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+find_path(ABSEIL_CPP_INCLUDE_DIR absl/base/config.h
+  HINTS "${ABSEIL_CPP_INCLUDE_DIR_HINTS}"
+  PATHS "$ENV{PROGRAMFILES}"
+        "$ENV{PROGRAMW6432}"
+  PATH_SUFFIXES "")
+
+if(EXISTS "${ABSEIL_CPP_INCLUDE_DIR}" AND NOT "${ABSEIL_CPP_INCLUDE_DIR}" STREQUAL "")
+
+  if(NOT AbseilCpp_FIND_COMPONENTS)
+    # search all libraries if no COMPONENTS was requested
+    set(AbseilCpp_FIND_COMPONENTS
+        "absl_algorithm;absl_any;absl_bad_any_cast"
+        "absl_bad_optional_access;absl_base;absl_container;absl_debugging"
+        "absl_dynamic_annotations;absl_examine_stack;absl_failure_signal_handler"
+        "absl_int128;absl_leak_check;absl_internal_malloc_internal;absl_memory;absl_meta"
+        "absl_numeric;absl_optional;absl_span;absl_internal_spinlock_wait;absl_stack_consumption"
+        "absl_stacktrace;absl_str_format;absl_strings;absl_symbolize;absl_synchronization"
+        "absl_throw_delegate;absl_time;absl_utility;str_format_extension_internal"
+        "str_format_internal;test_instance_tracker_lib")
+  endif()
+
+  foreach(LIBNAME ${AbseilCpp_FIND_COMPONENTS})
+
+    unset(ABSEIL_CPP_LIBRARY CACHE)
+
+    find_library(ABSEIL_CPP_LIBRARY
+                 NAMES ${LIBNAME}
+                 HINTS ${ABSEIL_CPP_LIBRARIES_DIR_HINTS})
+
+    if(ABSEIL_CPP_LIBRARY)
+      list(APPEND ABSEIL_CPP_LIBRARIES ${ABSEIL_CPP_LIBRARY})
+    else()
+      message(FATAL_ERROR "\n"
+        "abseil_cpp library \"${LIBNAME}\" not found in system path.\n"
+        "Please provide locations using: -DABSEIL_CPP_LIBRARIES_DIR_HINTS:STRING=\"PATH\"\n")
+    endif()
+
+  endforeach()
+
+  unset(LIBNAME CACHE)
+  unset(ABSEIL_CPP_LIBRARY CACHE)
+
+  set(ABSEIL_CPP_FOUND TRUE)
+  message(STATUS "Found abseil_cpp libraries")
+
+  set(ABSEIL_CPP_INCLUDE_DIR "${ABSEIL_CPP_INCLUDE_DIR}" CACHE PATH "" FORCE)
+  mark_as_advanced(ABSEIL_CPP_INCLUDE_DIR)
+
+  set(ABSEIL_CPP_LIBRARIES "${ABSEIL_CPP_LIBRARIES}" CACHE PATH "" FORCE)
+  mark_as_advanced(ABSEIL_CPP_LIBRARIES)
+
+else()
+
+  message(FATAL_ERROR "\n"
+    "abseil_cpp headers not found in system path.\n"
+    "Please provide locations using: -DABSEIL_CPP_INCLUDE_DIR_HINTS:STRING=\"PATH\"\n")
+
+endif()
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 6e72670142d560a364350bb4769f1153f884b0f6..96160568fa79291a7b391761373e1eaf0f70974e 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -57,6 +57,7 @@ tensorflow/python/ops
 tensorflow/python/ops/distributions
 tensorflow/python/ops/linalg
 tensorflow/python/ops/losses
+tensorflow/python/ops/signal
 tensorflow/python/platform
 tensorflow/python/profiler
 tensorflow/python/profiler/internal
@@ -279,10 +280,10 @@ tensorflow/contrib/linear_optimizer/kernels/g3doc
 tensorflow/contrib/linear_optimizer/python
 tensorflow/contrib/linear_optimizer/python/ops
 # TODO(drpngx): Fix failing imports
-# tensorflow/contrib/lite
-# tensorflow/contrib/lite/python
-# tensorflow/contrib/lite/toco
-# tensorflow/contrib/lite/toco/python
+# tensorflow/lite
+# tensorflow/lite/python
+# tensorflow/lite/toco
+# tensorflow/lite/toco/python
 tensorflow/contrib/lookup
 tensorflow/contrib/losses
 tensorflow/contrib/losses/python
@@ -308,11 +309,6 @@ tensorflow/contrib/model_pruning/examples
 tensorflow/contrib/model_pruning/examples/cifar10
 tensorflow/contrib/model_pruning/python
 tensorflow/contrib/model_pruning/python/layers
-tensorflow/contrib/nccl
-tensorflow/contrib/nccl/kernels
-tensorflow/contrib/nccl/ops
-tensorflow/contrib/nccl/python
-tensorflow/contrib/nccl/python/ops
 tensorflow/contrib/nearest_neighbor
 tensorflow/contrib/nearest_neighbor/kernels
 tensorflow/contrib/nearest_neighbor/ops
@@ -382,8 +378,6 @@ tensorflow/contrib/seq2seq/python/ops
 tensorflow/contrib/session_bundle
 tensorflow/contrib/session_bundle/example
 tensorflow/contrib/signal
-tensorflow/contrib/signal/python
-tensorflow/contrib/signal/python/ops
 tensorflow/contrib/slim
 tensorflow/contrib/slim/python
 tensorflow/contrib/slim/python/slim
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index 42afbd9105ef3789430606d909979ca308e2eaa8..013180c89083748b240ad061b342300e886d3568 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -6,7 +6,7 @@ tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/cloud/kernels
 tensorflow/contrib/decision_trees/proto
 tensorflow/contrib/gdr
-tensorflow/contrib/lite/toco
+tensorflow/lite/toco
 tensorflow/contrib/mpi
 tensorflow/contrib/mpi_collectives
 tensorflow/contrib/session_bundle
diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake
index 7a30eb94f54b18a2a517615a315e23e09e1170d0..a04142bd249ed5e16beba11057d0efc1e191e31b 100644
--- a/tensorflow/contrib/cmake/tf_c.cmake
+++ b/tensorflow/contrib/cmake/tf_c.cmake
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+
 ########################################################
 # tf_c_framework library
 ########################################################
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index 6c90cf398c69c8c1b22ea75e0c407f258e2535f9..6514ae50a4a35b35ba100af6997079294c22f9b8 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -149,11 +149,7 @@ add_library(tf_cc OBJECT ${tf_cc_srcs})
 add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
 
 if (WIN32)
-  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-    set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
-  else()
-    set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib")
-  endif()
+  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib")
 else (WIN32)
   set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}")
 endif (WIN32)
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index a54cbff33b66d63d7229fa2f50b8a4ca962111ed..d8884d464fb5974d77506561a9ed36110a3804c0 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -39,6 +39,8 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/*test*.h"
     "${tensorflow_source_dir}/tensorflow/core/*test*.cc"
     "${tensorflow_source_dir}/tensorflow/core/*main.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.h"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_eager_runtime.cmake b/tensorflow/contrib/cmake/tf_core_eager_runtime.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..78e4c0d3035cdaefa1d0950f4270d60152c805af
--- /dev/null
+++ b/tensorflow/contrib/cmake/tf_core_eager_runtime.cmake
@@ -0,0 +1,57 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+########################################################
+# tf_core_eager_runtime library
+########################################################
+file(GLOB_RECURSE tf_core_eager_runtime_srcs
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*.h"
+)
+
+file(GLOB_RECURSE tf_core_eager_runtime_exclude_srcs
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*test*.h"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/eager/*test*.cc"
+)
+
+list(REMOVE_ITEM tf_core_eager_runtime_srcs ${tf_core_eager_runtime_exclude_srcs})
+
+add_library(tf_core_eager_runtime OBJECT ${tf_core_eager_runtime_srcs})
+add_dependencies(
+	tf_core_eager_runtime 
+	tf_c 
+	tf_core_lib)
+
+
+file(GLOB_RECURSE tf_c_eager_srcs
+    "${tensorflow_source_dir}/tensorflow/c/eager/*.cc"
+    "${tensorflow_source_dir}/tensorflow/c/eager/*.h"
+)
+
+file(GLOB_RECURSE tf_c_eager_exlclude_srcs
+    "${tensorflow_source_dir}/tensorflow/c/eager/*test*.h"
+    "${tensorflow_source_dir}/tensorflow/c/eager/*test*.cc"
+)
+
+list(REMOVE_ITEM tf_c_eager_srcs ${tf_c_eager_exlclude_srcs})
+
+add_library(tf_c_eager OBJECT ${tf_c_eager_srcs})
+add_dependencies(
+  tf_c_eager
+  tf_core_eager_runtime
+  tf_c
+  tf_cc_framework
+  tf_cc_while_loop
+  tf_core_lib
+  tf_protos_cc)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 7e806685b8448cbd629985cdc00ed1193857abe6..d7b2a1339e047aba0a9424a53a63726805e89721 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -140,16 +140,19 @@ set(tf_proto_text_srcs
     "tensorflow/core/example/example.proto"
     "tensorflow/core/example/feature.proto"
     "tensorflow/core/framework/allocation_description.proto"
+    "tensorflow/core/framework/api_def.proto"
     "tensorflow/core/framework/attr_value.proto"
     "tensorflow/core/framework/cost_graph.proto"
     "tensorflow/core/framework/device_attributes.proto"
     "tensorflow/core/framework/function.proto"
     "tensorflow/core/framework/graph.proto"
     "tensorflow/core/framework/graph_transfer_info.proto"
+    "tensorflow/core/framework/iterator.proto"
     "tensorflow/core/framework/kernel_def.proto"
     "tensorflow/core/framework/log_memory.proto"
     "tensorflow/core/framework/node_def.proto"
     "tensorflow/core/framework/op_def.proto"
+    "tensorflow/core/framework/reader_base.proto"
     "tensorflow/core/framework/remote_fused_graph_execute_info.proto"
     "tensorflow/core/framework/resource_handle.proto"
     "tensorflow/core/framework/step_stats.proto"
@@ -159,6 +162,7 @@ set(tf_proto_text_srcs
     "tensorflow/core/framework/tensor_shape.proto"
     "tensorflow/core/framework/tensor_slice.proto"
     "tensorflow/core/framework/types.proto"
+    "tensorflow/core/framework/variable.proto"
     "tensorflow/core/framework/versions.proto"
     "tensorflow/core/lib/core/error_codes.proto"
     "tensorflow/core/protobuf/cluster.proto"
@@ -204,10 +208,10 @@ file(GLOB tf_core_platform_srcs
     "${tensorflow_source_dir}/tensorflow/core/framework/resource_handle.h"
     "${tensorflow_source_dir}/tensorflow/core/framework/resource_handle.cc")
 if (NOT tensorflow_ENABLE_GPU)
-  file(GLOB tf_core_platform_gpu_srcs
+  file(GLOB tf_core_platform_gpu_srcs_exclude
       "${tensorflow_source_dir}/tensorflow/core/platform/cuda_libdevice_path.*"
       "${tensorflow_source_dir}/tensorflow/core/platform/default/cuda_libdevice_path.*")
-  list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_gpu_srcs})
+  list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_gpu_srcs_exclude})
 else()
   file(GLOB tf_core_platform_srcs_exclude
       "${tensorflow_source_dir}/tensorflow/core/platform/default/device_tracer.cc")
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 7b892ba248bc43cd885f295288c677ac97efaa06..d66e39ac07c7b7c9423fa7e878a9cefd94b867bd 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -68,14 +68,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/assert_next_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/csv_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/unique_dataset_op.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/clustering_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/kernels/wals_solver_ops.cc"
@@ -97,9 +89,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/libsvm/kernels/decode_libsvm_op.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/libsvm/ops/libsvm_ops.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_manager.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/nccl/kernels/nccl_ops.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/ops/nearest_neighbor_ops.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/resampler/kernels/resampler_ops.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index bc753333dba4f67eee0114c4022743dd59a05982..310eed4ecbfdd30a3b3bdd4728c030fe70930797 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -13,13 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 set(tf_op_lib_names
-    "audio_ops"
     "array_ops"
+    "audio_ops"
     "batch_ops"
     "bitwise_ops"
     "boosted_trees_ops"
     "candidate_sampling_ops"
     "checkpoint_ops"
+    "collective_ops"
     "control_flow_ops"
     "ctc_ops"
     "cudnn_rnn_ops"
@@ -27,13 +28,14 @@ set(tf_op_lib_names
     "dataset_ops"
     "decode_proto_ops"
     "encode_proto_ops"
+    "function_ops"
     "functional_ops"
     "image_ops"
     "io_ops"
     "linalg_ops"
     "list_ops"
-    "lookup_ops"
     "logging_ops"
+    "lookup_ops"
     "manip_ops"
     "math_ops"
     "nn_ops"
@@ -43,10 +45,11 @@ set(tf_op_lib_names
     "remote_fused_graph_ops"
     "resource_variable_ops"
     "rpc_ops"
+    "scoped_allocator_ops"
     "script_ops"
     "sdca_ops"
-    "set_ops"
     "sendrecv_ops"
+    "set_ops"
     "sparse_ops"
     "spectral_ops"
     "state_ops"
@@ -54,6 +57,7 @@ set(tf_op_lib_names
     "string_ops"
     "summary_ops"
     "training_ops"
+    "word2vec_ops"
 )
 
 foreach(tf_op_lib_name ${tf_op_lib_names})
@@ -89,7 +93,6 @@ GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_prediction "${tensorflow_source_dir}/t
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_quantiles "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(boosted_trees_stats_accumulator "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(coder "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(data_dataset "${tensorflow_source_dir}/tensorflow/contrib/data/ops/dataset_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_clustering "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/clustering_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(factorization_factorization "${tensorflow_source_dir}/tensorflow/contrib/factorization/ops/factorization_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(framework_variable "${tensorflow_source_dir}/tensorflow/contrib/framework/ops/variable_ops.cc")
@@ -99,7 +102,6 @@ GENERATE_CONTRIB_OP_LIBRARY(image_distort_image "${tensorflow_source_dir}/tensor
 GENERATE_CONTRIB_OP_LIBRARY(image_sirds "${tensorflow_source_dir}/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(layers_sparse_feature_cross "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(memory_stats "${tensorflow_source_dir}/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc")
-GENERATE_CONTRIB_OP_LIBRARY(nccl "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(periodic_resample "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/ops/array_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nearest_neighbor "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/ops/nearest_neighbor_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(resampler "${tensorflow_source_dir}/tensorflow/contrib/resampler/ops/resampler_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 6d86daf5f174a3238ab92e5bba6085c904766766..8faccf8d55902e6701ebb4ce534b84705304fd5f 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -222,17 +222,17 @@ endforeach(python_module)
 
 add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
     COMMAND ${CMAKE_COMMAND} -E make_directory
-    "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite")
+    "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/lite")
 add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
     COMMAND ${CMAKE_COMMAND} -E make_directory
-    "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite/python")
+    "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/lite/python")
 add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
     COMMAND ${CMAKE_COMMAND} -E touch
-    "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite/python/__init__.py")
+    "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/lite/python/__init__.py")
 add_custom_command(
     TARGET tf_python_copy_scripts_to_destination PRE_BUILD
     COMMAND ${CMAKE_COMMAND} -E touch
-    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite/python/lite.py)
+    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/lite/python/lite.py)
 
 # Generate the tensorflow.python.platform.build_info module.
 set(BUILD_INFO_PY "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/platform/build_info.py")
@@ -313,15 +313,14 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
         ${GENERATE_PYTHON_OP_LIB_DESTINATION} PARENT_SCOPE)
 endfunction()
 
-GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("array_ops")
+GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("batch_ops")
 GENERATE_PYTHON_OP_LIB("bitwise_ops")
 GENERATE_PYTHON_OP_LIB("boosted_trees_ops")
-GENERATE_PYTHON_OP_LIB("math_ops")
-GENERATE_PYTHON_OP_LIB("functional_ops")
 GENERATE_PYTHON_OP_LIB("candidate_sampling_ops")
 GENERATE_PYTHON_OP_LIB("checkpoint_ops")
+GENERATE_PYTHON_OP_LIB("collective_ops")
 GENERATE_PYTHON_OP_LIB("control_flow_ops"
   ADDITIONAL_LIBRARIES $<TARGET_OBJECTS:tf_no_op>)
 GENERATE_PYTHON_OP_LIB("ctc_ops")
@@ -332,14 +331,18 @@ GENERATE_PYTHON_OP_LIB("decode_proto_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_decode_proto_op.py)
 GENERATE_PYTHON_OP_LIB("encode_proto_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/proto/python/ops/gen_encode_proto_op.py)
+GENERATE_PYTHON_OP_LIB("function_ops")
+GENERATE_PYTHON_OP_LIB("functional_ops")
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
 GENERATE_PYTHON_OP_LIB("list_ops")
 GENERATE_PYTHON_OP_LIB("logging_ops")
 GENERATE_PYTHON_OP_LIB("lookup_ops")
-GENERATE_PYTHON_OP_LIB("nn_ops")
 GENERATE_PYTHON_OP_LIB("manip_ops")
+GENERATE_PYTHON_OP_LIB("math_ops")
+GENERATE_PYTHON_OP_LIB("nn_ops")
+GENERATE_PYTHON_OP_LIB("no_op")
 GENERATE_PYTHON_OP_LIB("parsing_ops")
 GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
@@ -347,17 +350,21 @@ GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
 GENERATE_PYTHON_OP_LIB("rpc_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rpc/python/ops/gen_rpc_op.py)
+GENERATE_PYTHON_OP_LIB("scoped_allocator_ops")
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
+GENERATE_PYTHON_OP_LIB("sendrecv_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
-GENERATE_PYTHON_OP_LIB("state_ops")
 GENERATE_PYTHON_OP_LIB("sparse_ops")
 GENERATE_PYTHON_OP_LIB("spectral_ops")
+GENERATE_PYTHON_OP_LIB("state_ops")
+GENERATE_PYTHON_OP_LIB("stateless_random_ops")
 GENERATE_PYTHON_OP_LIB("string_ops")
 GENERATE_PYTHON_OP_LIB("summary_ops")
 GENERATE_PYTHON_OP_LIB("user_ops")
 GENERATE_PYTHON_OP_LIB("training_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/training/gen_training_ops.py)
+GENERATE_PYTHON_OP_LIB("word2vec_ops")
 
 GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_model_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_model_ops.py)
@@ -373,8 +380,6 @@ GENERATE_PYTHON_OP_LIB("contrib_boosted_trees_stats_accumulator_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/boosted_trees/python/ops/gen_stats_accumulator_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_coder_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/coder/python/ops/gen_coder_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_data_dataset_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/data/python/ops/gen_dataset_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_clustering_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/factorization/python/ops/gen_clustering_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_factorization_factorization_ops"
@@ -393,11 +398,8 @@ GENERATE_PYTHON_OP_LIB("contrib_layers_sparse_feature_cross_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/layers/ops/gen_sparse_feature_cross_op.py)
 GENERATE_PYTHON_OP_LIB("contrib_memory_stats_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/memory_stats/ops/gen_memory_stats_ops.py)
-GENERATE_PYTHON_OP_LIB("contrib_nccl_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nccl/ops/gen_nccl_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_periodic_resample_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/periodic_resample/python/ops/gen_periodic_resample_op.py)
-
 GENERATE_PYTHON_OP_LIB("contrib_nearest_neighbor_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nearest_neighbor/ops/gen_nearest_neighbor_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_resampler_ops"
@@ -422,8 +424,6 @@ GENERATE_PYTHON_OP_LIB("contrib_bigquery_reader_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_bigquery_reader_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_gcs_config_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_gcs_config_ops.py)
-GENERATE_PYTHON_OP_LIB("stateless_random_ops"
-  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/stateless/gen_stateless_random_ops.py)
 GENERATE_PYTHON_OP_LIB("debug_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/debug/ops/gen_debug_ops.py)
 
@@ -526,11 +526,13 @@ if(WIN32)
     add_library(pywrap_tensorflow_internal_static STATIC
         ${pywrap_tensorflow_internal_src}
         $<TARGET_OBJECTS:tf_c>
+        $<TARGET_OBJECTS:tf_c_eager>
         $<TARGET_OBJECTS:tf_c_python_api>
         $<TARGET_OBJECTS:tf_core_lib>
         $<TARGET_OBJECTS:tf_core_cpu>
         $<TARGET_OBJECTS:tf_core_framework>
         $<TARGET_OBJECTS:tf_core_profiler>
+        $<TARGET_OBJECTS:tf_core_eager_runtime>
         $<TARGET_OBJECTS:tf_cc>
         $<TARGET_OBJECTS:tf_cc_ops>
         $<TARGET_OBJECTS:tf_cc_while_loop>
@@ -583,11 +585,13 @@ endif(WIN32)
 add_library(pywrap_tensorflow_internal SHARED
     ${pywrap_tensorflow_internal_src}
     $<TARGET_OBJECTS:tf_c>
+    $<TARGET_OBJECTS:tf_c_eager>
     $<TARGET_OBJECTS:tf_c_python_api>
     $<TARGET_OBJECTS:tf_core_lib>
     $<TARGET_OBJECTS:tf_core_cpu>
     $<TARGET_OBJECTS:tf_core_framework>
     $<TARGET_OBJECTS:tf_core_profiler>
+    $<TARGET_OBJECTS:tf_core_eager_runtime>
     $<TARGET_OBJECTS:tf_cc>
     $<TARGET_OBJECTS:tf_cc_ops>
     $<TARGET_OBJECTS:tf_cc_while_loop>
@@ -617,13 +621,28 @@ target_include_directories(pywrap_tensorflow_internal PUBLIC
     ${NUMPY_INCLUDE_DIR}
 )
 
-target_link_libraries(pywrap_tensorflow_internal PRIVATE
+if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+	# There is a bug in GCC 5 resulting in undefined reference to a __cpu_model function when
+	# linking to the tensorflow library. Adding the following libraries fixes it.
+	# See issue on github: https://github.com/tensorflow/tensorflow/issues/9593
+	target_link_libraries(pywrap_tensorflow_internal PRIVATE
     ${tf_core_gpu_kernels_lib}
     ${tensorflow_EXTERNAL_LIBRARIES}
     tf_protos_cc
     tf_python_protos_cc
     ${PYTHON_LIBRARIES}
+    gcc_s
+    gcc
 )
+else()
+	target_link_libraries(pywrap_tensorflow_internal PRIVATE
+    ${tf_core_gpu_kernels_lib}
+    ${tensorflow_EXTERNAL_LIBRARIES}
+    tf_protos_cc
+    tf_python_protos_cc
+    ${PYTHON_LIBRARIES}
+)
+endif()
 
 if(WIN32)
 
@@ -808,10 +827,10 @@ add_dependencies(tf_python_api tf_python_ops)
 ########################################################
 
 # Parse tensorflow/python/tools/api/generator/BUILD to get list of generated files.
-FILE(READ ${tensorflow_source_dir}/tensorflow/python/tools/api/generator/api_gen.bzl api_generator_BUILD_text)
-STRING(REGEX MATCH "# BEGIN GENERATED ESTIMATOR FILES.*# END GENERATED ESTIMATOR FILES" api_init_files_text ${api_generator_BUILD_text})
-string(REPLACE "# BEGIN GENERATED ESTIMATOR FILES" "" api_init_files_text ${api_init_files_text})
-string(REPLACE "# END GENERATED ESTIMATOR FILES" "" api_init_files_text ${api_init_files_text})
+FILE(READ ${tensorflow_source_dir}/tensorflow/python/tools/api/generator/api_init_files.bzl api_generator_BUILD_text)
+STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text})
+string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text})
 string(REPLACE "," ";" api_init_files_list ${api_init_files_text})
 
 set(api_init_files "")
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index fdf522f1fd90ffc64acbe82381ef57a389645d61..62005dd113bfb80fbdf23afb6d4aa5f90a1e32de 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -23,6 +23,8 @@ if(WIN32)
   # we need.
   #
   add_library(tensorflow_static STATIC
+      $<TARGET_OBJECTS:tf_c_eager>
+      $<TARGET_OBJECTS:tf_core_eager_runtime>
       $<TARGET_OBJECTS:tf_c>
       $<TARGET_OBJECTS:tf_cc>
       $<TARGET_OBJECTS:tf_cc_framework>
@@ -65,6 +67,8 @@ endif(WIN32)
 # tensorflow is a shared library containing all of the
 # TensorFlow runtime and the standard ops and kernels.
 add_library(tensorflow SHARED
+    $<TARGET_OBJECTS:tf_c_eager>
+    $<TARGET_OBJECTS:tf_core_eager_runtime>
     $<TARGET_OBJECTS:tf_c>
     $<TARGET_OBJECTS:tf_cc>
     $<TARGET_OBJECTS:tf_cc_framework>
@@ -96,6 +100,27 @@ if(CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
     target_link_libraries(tensorflow PRIVATE gcc_s gcc)
 endif()
 
+# Offer the user the choice of overriding the installation directories
+set(INSTALL_LIB_DIR lib CACHE PATH "Installation directory for libraries")
+set(INSTALL_BIN_DIR bin CACHE PATH "Installation directory for executables")
+set(INSTALL_INCLUDE_DIR include CACHE PATH
+  "Installation directory for header files")
+if(WIN32 AND NOT CYGWIN)
+  set(DEF_INSTALL_CMAKE_DIR cmake)
+else()
+  set(DEF_INSTALL_CMAKE_DIR lib/cmake)
+endif()
+set(INSTALL_CMAKE_DIR ${DEF_INSTALL_CMAKE_DIR} CACHE PATH
+  "Installation directory for CMake files")
+
+# Make relative paths absolute (needed later on)
+foreach(p LIB BIN INCLUDE CMAKE)
+  set(var INSTALL_${p}_DIR)
+  if(NOT IS_ABSOLUTE "${${var}}")
+    set(${var} "${CMAKE_INSTALL_PREFIX}/${${var}}")
+  endif()
+endforeach()
+
 if(WIN32)
   add_dependencies(tensorflow tensorflow_static)
 endif(WIN32)
@@ -103,14 +128,57 @@ endif(WIN32)
 target_include_directories(tensorflow PUBLIC 
     $<INSTALL_INTERFACE:include/>)
 
-install(TARGETS tensorflow EXPORT tensorflow_export
-        RUNTIME DESTINATION bin
-        LIBRARY DESTINATION lib
-        ARCHIVE DESTINATION lib)
+# Add all targets to build-tree export set
+export(TARGETS tensorflow
+  FILE ${PROJECT_BINARY_DIR}/TensorflowTargets.cmake)
+
+# Export the package for use from the build-tree
+export(PACKAGE Tensorflow)
+
+# Create the TensorflowConfig.cmake and TensorflowConfigVersion files
+file(RELATIVE_PATH REL_INCLUDE_DIR "${INSTALL_CMAKE_DIR}"
+   "${INSTALL_INCLUDE_DIR}")
+# for the build tree
+set(CONF_INCLUDE_DIRS "${tensorflow_source_dir}" 
+                      "${PROJECT_BINARY_DIR}"
+                      "${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src"
+                      "${CMAKE_CURRENT_BINARY_DIR}/nsync/install/include" # Please if there is a better directory
+                      "${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/Eigen/"
+                      "${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/"
+                      "${tensorflow_source_dir}/third_party/eigen3/"
+                      "${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/")
+configure_file(TensorflowConfig.cmake.in
+  "${PROJECT_BINARY_DIR}/TensorflowConfig.cmake" @ONLY)
+# for the install tree, yet to be complete
+set(CONF_INCLUDE_DIRS "\${TENSORFLOW_CMAKE_DIR}/${REL_INCLUDE_DIR}")
+configure_file(TensorflowConfig.cmake.in
+  "${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/TensorflowConfig.cmake" @ONLY)
+# for both
+configure_file(TensorflowConfigVersion.cmake.in
+  "${PROJECT_BINARY_DIR}/TensorflowConfigVersion.cmake" @ONLY)
+
+# install(TARGETS tensorflow EXPORT tensorflow_export
+#         RUNTIME DESTINATION ${INSTALL_BIN_DIR}
+#         LIBRARY DESTINATION ${INSTALL_LIB_DIR}
+#         ARCHIVE DESTINATION ${INSTALL_LIB_DIR})
+
+# install(EXPORT tensorflow_export
+#         FILE TensorflowConfig.cmake
+#         DESTINATION ${INSTALL_CMAKE_DIR})
         
-install(EXPORT tensorflow_export
-        FILE TensorflowConfig.cmake
-        DESTINATION lib/cmake)
+install(FILES
+  "${PROJECT_BINARY_DIR}/${CMAKE_FILES_DIRECTORY}/TensorflowConfig.cmake"
+  "${PROJECT_BINARY_DIR}/TensorflowConfigVersion.cmake"
+  DESTINATION "${INSTALL_CMAKE_DIR}" COMPONENT dev)
+
+# install the export set for use with the install-tree
+install(EXPORT TensorflowTargets 
+  DESTINATION ${INSTALL_CMAKE_DIR})
+
+install(TARGETS tensorflow EXPORT TensorflowTargets
+        RUNTIME DESTINATION ${INSTALL_BIN_DIR}
+        LIBRARY DESTINATION ${INSTALL_LIB_DIR}
+        ARCHIVE DESTINATION ${INSTALL_LIB_DIR})
 
 # install necessary headers
 # tensorflow headers
@@ -145,6 +213,10 @@ install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/
 # unsupported Eigen directory
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/
         DESTINATION include/unsupported/Eigen)
+# absl directory
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/abseil_cpp/src/abseil_cpp/absl/
+        DESTINATION include/absl
+        FILES_MATCHING PATTERN "*.h")
 # mkl
 if (tensorflow_ENABLE_MKL_SUPPORT)
     install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include/
diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
index 4bfd753bb1d1fc254c66a4f7eb1d6ac83a40cb70..7f96a103d4cd797bc733a41a673eac492419b4c6 100644
--- a/tensorflow/contrib/coder/BUILD
+++ b/tensorflow/contrib/coder/BUILD
@@ -13,12 +13,12 @@ load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
     "tf_custom_op_library",
-    "tf_custom_op_py_library",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
     "tf_kernel_library",
     "tf_py_test",
 )
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 cc_library(
     name = "range_coder",
diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD
index f83386b8a4246ff2d7acdd2190804296582ee945..e4566437c60ebb2da039e61c171fbe954a7355c9 100644
--- a/tensorflow/contrib/compiler/BUILD
+++ b/tensorflow/contrib/compiler/BUILD
@@ -7,6 +7,7 @@ package_group(
     includes = ["//tensorflow/compiler/jit:friends"],
     packages = [
         "//tensorflow/...",
+        "//tensorflow_models/...",
         "//third_party/py/tensor2tensor/...",
     ],
 )
@@ -57,7 +58,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/compiler/jit:xla_ops_py",
-        "//tensorflow/contrib/tpu:tpu_lib",
+        "//tensorflow/compiler/jit/ops:xla_ops_grad",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
@@ -80,6 +81,7 @@ tf_py_test(
         "//tensorflow/python:control_flow_util",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/contrib/tpu:tpu_lib",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
diff --git a/tensorflow/contrib/compiler/xla.py b/tensorflow/contrib/compiler/xla.py
index 873b03580d6f1d9cb25c79cb31989d43cdb8c9a7..f867cd15b67dbd43650d8012b4299845af7200a8 100644
--- a/tensorflow/contrib/compiler/xla.py
+++ b/tensorflow/contrib/compiler/xla.py
@@ -23,7 +23,7 @@ import contextlib
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.compiler.jit.ops import xla_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_function
+from tensorflow.compiler.jit.ops import xla_ops_grad  # pylint: disable=unused-import
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import ops
@@ -35,6 +35,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 _XLA_COMPILE_ATTR = '_xla_compile_id'
 _MAX_WARNING_LINES = 5
@@ -179,14 +180,11 @@ class XLACompileContext(control_flow_ops.XLAControlFlowContext):
     if external_control_inputs:
       # Use an identity to pull control inputs as data inputs. Note that we
       # ignore ops which don't have outputs. TODO(phawkins): fix that.
-      with ops.control_dependencies(None):
-        self.Enter()
-        external_control_inputs = [
-            array_ops.identity(x.outputs[0]).op
-            for x in external_control_inputs
-            if x.outputs
-        ]
-        self.Exit()
+      external_control_inputs = [
+          array_ops.identity(x.outputs[0]).op
+          for x in external_control_inputs
+          if x.outputs
+      ]
       # pylint: disable=protected-access
       op._add_control_inputs(external_control_inputs)
       # pylint: enable=protected-access
@@ -266,13 +264,13 @@ def _compile_internal(computation, inputs=None):
   inputs = [ops.convert_to_tensor(x) for x in inputs]
   input_arity = len(inputs)
 
-  arg_error = tpu_function.check_function_argument_count(
+  arg_error = check_function_argument_count(
       computation, input_arity, infeed_queue=None)
   if arg_error is not None:
     raise TypeError(
         'Supplied computation cannot be called with the specified inputs. You '
         'specified %d inputs: %s, but the computation needs %s' %
-        (input_arity, str([i.name for i in inputs[0]]), arg_error))
+        (input_arity, str([i.name for i in inputs]), arg_error))
 
   cluster_name = ops.get_default_graph().unique_name('cluster')
   pivot = control_flow_ops.no_op(name=cluster_name + '/pivot')
@@ -606,8 +604,8 @@ class _ModelFnWrapper(object):
 def estimator_model_fn(target_model_fn=None):
   """estimator_model_fn decorates a model_fn to be compiled for execution.
 
-  Currently only it only works with `TPUEstimator`. If you need to use it with
-  base `Estimator`, please add `tf.enable_resource_variables()` at beginning of
+  Currently it only works with `TPUEstimator`. If you need to use it with base
+  `Estimator`, please add `tf.enable_resource_variables()` at the beginning of
   your program.
 
   Example 1, decorating model_fn:
@@ -645,3 +643,51 @@ def estimator_model_fn(target_model_fn=None):
     return tf_decorator.make_decorator(function, _ModelFnWrapper(function))
 
   return decorated(target_model_fn) if target_model_fn else decorated
+
+
+def check_function_argument_count(func, input_arity, infeed_queue):
+  """Validate the number of input arguments to an XLA function.
+
+  Args:
+    func: the Python function that will be called to generate the body of an XLA
+      computation graph.
+    input_arity: the number of explicit arguments supplied by the caller.
+    infeed_queue: if not None, the infeed queue that will supply
+      additional arguments to the function.
+
+  Returns:
+    None if function can be called with the supplied number of
+      arguments, or an error string if it cannot.
+  """
+  def format_error(complaint, quantity):
+    return '%s %d argument%s' % (complaint, quantity, ''
+                                 if quantity == 1 else 's')
+
+  num_args_supplied = input_arity
+  if infeed_queue is not None:
+    num_args_supplied += infeed_queue.number_of_tuple_elements
+  arg_spec = tf_inspect.getargspec(func)
+  num_func_args = len(arg_spec.args)
+  if arg_spec.defaults is None:
+    num_func_defaults = 0
+  else:
+    num_func_defaults = len(arg_spec.defaults)
+  min_func_args = num_func_args - num_func_defaults
+  if num_args_supplied < min_func_args:
+    # The required number of arguments is not enough to call the function.
+    if num_func_defaults == 0 and arg_spec.varargs is None:
+      return format_error('exactly', num_func_args)
+    else:
+      return format_error('at least', min_func_args)
+  if arg_spec.varargs is None and num_args_supplied > num_func_args:
+    # The required number of arguments is too many to call the function.
+    if num_func_defaults == 0:
+      return format_error('exactly', num_func_args)
+    else:
+      return format_error('at most', num_func_args)
+  # Reaching here means either
+  # 1) There are varargs, func can accept any number of arguments greater than
+  # the minimum.
+  # 2) Number of supplied arguments falls in range of acceptable argument count
+  # of func.
+  return None
diff --git a/tensorflow/contrib/compiler/xla_test.py b/tensorflow/contrib/compiler/xla_test.py
index a306b56f63bd3b135b0231da89fb2e3445570740..3b49755afcf0753d31c0ce506dce42709b1ee8bc 100644
--- a/tensorflow/contrib/compiler/xla_test.py
+++ b/tensorflow/contrib/compiler/xla_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.compiler import xla
+from tensorflow.contrib.tpu.python.tpu import tpu_feed
 from tensorflow.python import summary
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
@@ -27,7 +28,6 @@ from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
@@ -48,7 +48,7 @@ class XLACompileContextTest(test.TestCase):
     histogram_summary = summary.histogram('histogram_summary', dummy_tensor)
     image_summary = summary.image('image_summary', dummy_tensor)
     scalar_summary = summary.scalar('scalar_summary', dummy_tensor)
-    tensor_summary = summary_ops.tensor_summary('tensor_summary', dummy_tensor)
+    tensor_summary = summary.tensor_summary('tensor_summary', dummy_tensor)
     summary.merge(
         [
             audio_summary, histogram_summary, image_summary, scalar_summary,
@@ -176,5 +176,81 @@ class XLACompileContextTest(test.TestCase):
     self.assertFalse(op.graph.is_fetchable(op.op))
 
 
+class CheckFunctionArgumentCountTest(test.TestCase):
+
+  def testSimple(self):
+    """Tests that arg checker works for functions with no varargs or defaults.
+    """
+
+    def func(x, y, z):
+      return x + y + z
+
+    self.assertEqual(None, xla.check_function_argument_count(func, 3, None))
+    self.assertEqual('exactly 3 arguments',
+                     xla.check_function_argument_count(func, 2, None))
+    queue = tpu_feed.InfeedQueue(2)
+    self.assertEqual(None, xla.check_function_argument_count(func, 1, queue))
+    self.assertEqual('exactly 3 arguments',
+                     xla.check_function_argument_count(func, 2, queue))
+
+  def testDefaultArgs(self):
+    """Tests that arg checker works for a function with no varargs."""
+
+    def func(x, y, z=17):
+      return x + y + z
+
+    self.assertEqual(None, xla.check_function_argument_count(func, 3, None))
+    self.assertEqual(None, xla.check_function_argument_count(func, 2, None))
+    self.assertEqual('at least 2 arguments',
+                     xla.check_function_argument_count(func, 1, None))
+    self.assertEqual('at most 3 arguments',
+                     xla.check_function_argument_count(func, 4, None))
+    queue = tpu_feed.InfeedQueue(1)
+    self.assertEqual(None, xla.check_function_argument_count(func, 2, queue))
+    self.assertEqual(None, xla.check_function_argument_count(func, 1, queue))
+    self.assertEqual('at least 2 arguments',
+                     xla.check_function_argument_count(func, 0, queue))
+    self.assertEqual('at most 3 arguments',
+                     xla.check_function_argument_count(func, 4, queue))
+
+  def testVarArgs(self):
+    """Tests that arg checker works for a function with varargs."""
+
+    def func(x, y, *z):
+      return x + y + len(z)
+
+    self.assertEqual(None, xla.check_function_argument_count(func, 2, None))
+    self.assertEqual(None, xla.check_function_argument_count(func, 3, None))
+    self.assertEqual(None, xla.check_function_argument_count(func, 4, None))
+    self.assertEqual('at least 2 arguments',
+                     xla.check_function_argument_count(func, 1, None))
+    queue = tpu_feed.InfeedQueue(1)
+    self.assertEqual(None, xla.check_function_argument_count(func, 1, queue))
+    self.assertEqual(None, xla.check_function_argument_count(func, 2, queue))
+    self.assertEqual(None, xla.check_function_argument_count(func, 3, queue))
+    self.assertEqual('at least 2 arguments',
+                     xla.check_function_argument_count(func, 0, queue))
+
+  def testVarArgsAndDefaults(self):
+    """Tests that arg checker works for a function with varargs and defaults."""
+
+    def func(x, y, z=17, *q):  # pylint: disable=keyword-arg-before-vararg
+      return x + y + z + len(q)
+
+    self.assertEqual(None, xla.check_function_argument_count(func, 2, None))
+    self.assertEqual(None, xla.check_function_argument_count(func, 3, None))
+    self.assertEqual(None, xla.check_function_argument_count(func, 4, None))
+    self.assertEqual(None, xla.check_function_argument_count(func, 5, None))
+    self.assertEqual('at least 2 arguments',
+                     xla.check_function_argument_count(func, 1, None))
+    queue = tpu_feed.InfeedQueue(1)
+    self.assertEqual(None, xla.check_function_argument_count(func, 1, queue))
+    self.assertEqual(None, xla.check_function_argument_count(func, 2, queue))
+    self.assertEqual(None, xla.check_function_argument_count(func, 3, queue))
+    self.assertEqual(None, xla.check_function_argument_count(func, 4, queue))
+    self.assertEqual('at least 2 arguments',
+                     xla.check_function_argument_count(func, 0, queue))
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
index 41258edd90866ae9f644a02c42dfe2dc589da998..6926c0d03fe38ab2d62cc588950c7f5a49b2aba1 100644
--- a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
@@ -74,8 +74,8 @@ class ConstrainedMinimizationProblem(object):
 
     if (constraints_shape.ndims is None or
         proxy_constraints_shape.ndims is None or
-        any([ii is None for ii in constraints_shape.as_list()]) or
-        any([ii is None for ii in proxy_constraints_shape.as_list()])):
+        any(ii is None for ii in constraints_shape.as_list()) or
+        any(ii is None for ii in proxy_constraints_shape.as_list())):
       raise ValueError(
           "constraints and proxy_constraints must have fully-known shapes")
     if constraints_shape != proxy_constraints_shape:
diff --git a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
index 67f8ac2b9322f39b02c521f8b9cde3831c7889b8..fb0f849b33b0c5d28fff09eb5aac7f2c0d1adc0b 100644
--- a/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/external_regret_optimizer.py
@@ -82,7 +82,7 @@ def _project_multipliers_wrt_euclidean_norm(multipliers, radius):
     raise ValueError(
         "multipliers must be one dimensional (instead is %d-dimensional)" %
         multipliers_shape.ndims)
-  dimension = multipliers_shape[0].value
+  dimension = multipliers_shape.dims[0].value
   if dimension is None:
     raise ValueError("multipliers must have fully-known shape")
 
diff --git a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
index a6cb1f62f059770c90bd1aeea391d841aed9aacf..14e6d8701124ba67cdff8140250b5078f6194693 100644
--- a/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
+++ b/tensorflow/contrib/constrained_optimization/python/swap_regret_optimizer.py
@@ -156,7 +156,7 @@ def _project_stochastic_matrix_wrt_euclidean_norm(matrix):
   if matrix_shape[0] != matrix_shape[1]:
     raise ValueError("matrix must be square (instead has shape (%d,%d))" %
                      (matrix_shape[0], matrix_shape[1]))
-  dimension = matrix_shape[0].value
+  dimension = matrix_shape.dims[0].value
   if dimension is None:
     raise ValueError("matrix must have fully-known shape")
 
@@ -601,7 +601,7 @@ class MultiplicativeSwapRegretOptimizer(_SwapRegretOptimizer):
       assert state_shape is not None
       assert state_shape.ndims == 2
       assert state_shape[0] == state_shape[1]
-      dimension = state_shape[0].value
+      dimension = state_shape.dims[0].value
       assert dimension is not None
 
       minimum_log_multiplier = standard_ops.log(
diff --git a/tensorflow/contrib/copy_graph/python/__init__.py b/tensorflow/contrib/copy_graph/python/__init__.py
index b9ff28eb0d7115ff5919c2f758f70ba388f5d4d2..5c1048e02a3104c958f7710ba97980d3353adbad 100644
--- a/tensorflow/contrib/copy_graph/python/__init__.py
+++ b/tensorflow/contrib/copy_graph/python/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/copy_graph/python/util/__init__.py b/tensorflow/contrib/copy_graph/python/util/__init__.py
index b9ff28eb0d7115ff5919c2f758f70ba388f5d4d2..5c1048e02a3104c958f7710ba97980d3353adbad 100644
--- a/tensorflow/contrib/copy_graph/python/util/__init__.py
+++ b/tensorflow/contrib/copy_graph/python/util/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_test.py b/tensorflow/contrib/copy_graph/python/util/copy_test.py
index ba97c7845635596c3f4f849044b6707ec43f5bbf..4d8651a79fde9b876d4fdd9b050e71d2eb7c893d 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_test.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_test.py
@@ -26,15 +26,16 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
-graph1 = ops.Graph()
-graph2 = ops.Graph()
-
 
 class CopyVariablesTest(test.TestCase):
 
+  def setUp(self):
+    self.graph1 = ops.Graph()
+    self.graph2 = ops.Graph()
+
   def testVariableCopy(self):
 
-    with graph1.as_default():
+    with self.graph1.as_default():
       #Define a Variable in graph1
       some_var = variables.VariableV1(2)
       #Initialize session
@@ -43,13 +44,15 @@ class CopyVariablesTest(test.TestCase):
       variables.global_variables_initializer().run(session=sess1)
 
     #Make a copy of some_var in the defsult scope in graph2
-    copy1 = copy_elements.copy_variable_to_graph(some_var, graph2)
+    copy1 = copy_elements.copy_variable_to_graph(some_var, self.graph2)
 
     #Make another copy with different scope
-    copy2 = copy_elements.copy_variable_to_graph(some_var, graph2, "test_scope")
+    copy2 = copy_elements.copy_variable_to_graph(some_var,
+                                                 self.graph2,
+                                                 "test_scope")
 
     #Initialize both the copies
-    with graph2.as_default():
+    with self.graph2.as_default():
       #Initialize Session
       sess2 = session_lib.Session()
       #Initialize the Variables
@@ -67,9 +70,13 @@ class CopyVariablesTest(test.TestCase):
 
 class CopyOpsTest(test.TestCase):
 
+  def setUp(self):
+    self.graph1 = ops.Graph()
+    self.graph2 = ops.Graph()
+
   def testOpsCopy(self):
 
-    with graph1.as_default():
+    with self.graph1.as_default():
       #Initialize a basic expression y = ax + b
       x = array_ops.placeholder("float")
       a = variables.VariableV1(3.0)
@@ -82,21 +89,21 @@ class CopyOpsTest(test.TestCase):
       variables.global_variables_initializer().run(session=sess1)
 
     #First, initialize a as a Variable in graph2
-    a1 = copy_elements.copy_variable_to_graph(a, graph2)
+    a1 = copy_elements.copy_variable_to_graph(a, self.graph2)
 
     #Initialize a1 in graph2
-    with graph2.as_default():
+    with self.graph2.as_default():
       #Initialize session
       sess2 = session_lib.Session()
       #Initialize the Variable
       variables.global_variables_initializer().run(session=sess2)
 
     #Initialize a copy of y in graph2
-    y1 = copy_elements.copy_op_to_graph(y, graph2, [a1])
+    y1 = copy_elements.copy_op_to_graph(y, self.graph2, [a1])
 
     #Now that y has been copied, x must be copied too.
     #Get that instance
-    x1 = copy_elements.get_copied_op(x, graph2)
+    x1 = copy_elements.get_copied_op(x, self.graph2)
 
     #Compare values of y & y1 for a sample input
     #and check if they match
diff --git a/tensorflow/contrib/crf/__init__.py b/tensorflow/contrib/crf/__init__.py
index fe5e34d258fbc1508a0a85655f29c2c9bc8fa8b1..d53549048f33162ec89dfe957ca58a4bbb4e95c6 100644
--- a/tensorflow/contrib/crf/__init__.py
+++ b/tensorflow/contrib/crf/__init__.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """Linear-chain CRF layer.
 
-See the [CRF](https://tensorflow.org/api_guides/python/contrib.crf) guide.
-
 @@crf_binary_score
 @@crf_decode
 @@crf_log_likelihood
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 43bb43129bfe1cb1c66f4965476f9b7f849658ad..40e159b8fcbd1864284e208cb15d9ed96119f840 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -38,12 +38,12 @@ tf_unary_scores, tf_sequence_lengths, tf_transition_params, _ = session.run(
     [unary_scores, sequence_lengths, transition_params, train_op])
 for tf_unary_scores_, tf_sequence_length_ in zip(tf_unary_scores,
                                                  tf_sequence_lengths):
-# Remove padding.
-tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_]
+    # Remove padding.
+    tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_]
 
-# Compute the highest score and its tag sequence.
-tf_viterbi_sequence, tf_viterbi_score = tf.contrib.crf.viterbi_decode(
-    tf_unary_scores_, tf_transition_params)
+    # Compute the highest score and its tag sequence.
+    tf_viterbi_sequence, tf_viterbi_score = tf.contrib.crf.viterbi_decode(
+        tf_unary_scores_, tf_transition_params)
 """
 
 from __future__ import absolute_import
@@ -54,6 +54,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
@@ -107,8 +108,10 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths,
     return sequence_scores
 
   return utils.smart_cond(
-      pred=math_ops.equal(inputs.shape[1].value or array_ops.shape(inputs)[1],
-                          1),
+      pred=math_ops.equal(
+          tensor_shape.dimension_value(
+              inputs.shape[1]) or array_ops.shape(inputs)[1],
+          1),
       true_fn=_single_seq_fn,
       false_fn=_multi_seq_fn)
 
@@ -157,8 +160,10 @@ def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths,
         transition_params=transition_params)
 
   return utils.smart_cond(
-      pred=math_ops.equal(inputs.shape[1].value or array_ops.shape(inputs)[1],
-                          1),
+      pred=math_ops.equal(
+          tensor_shape.dimension_value(
+              inputs.shape[1]) or array_ops.shape(inputs)[1],
+          1),
       true_fn=_single_seq_fn,
       false_fn=_multi_seq_fn)
 
@@ -214,8 +219,10 @@ def crf_log_norm(inputs, sequence_lengths, transition_params):
     return log_norm
 
   return utils.smart_cond(
-      pred=math_ops.equal(inputs.shape[1].value or
-                          array_ops.shape(inputs)[1], 1),
+      pred=math_ops.equal(
+          tensor_shape.dimension_value(
+              inputs.shape[1]) or array_ops.shape(inputs)[1],
+          1),
       true_fn=_single_seq_fn,
       false_fn=_multi_seq_fn)
 
@@ -240,7 +247,7 @@ def crf_log_likelihood(inputs,
         provided by the caller or created in this function.
   """
   # Get shape information.
-  num_tags = inputs.get_shape()[2].value
+  num_tags = tensor_shape.dimension_value(inputs.shape[2])
 
   # Get the transition matrix if not provided.
   if transition_params is None:
@@ -342,7 +349,7 @@ class CrfForwardRnnCell(rnn_cell.RNNCell):
           for the broadcast summation occurring within the cell.
     """
     self._transition_params = array_ops.expand_dims(transition_params, 0)
-    self._num_tags = transition_params.get_shape()[0].value
+    self._num_tags = tensor_shape.dimension_value(transition_params.shape[0])
 
   @property
   def state_size(self):
@@ -428,7 +435,7 @@ class CrfDecodeForwardRnnCell(rnn_cell.RNNCell):
         summation occurring within the cell.
     """
     self._transition_params = array_ops.expand_dims(transition_params, 0)
-    self._num_tags = transition_params.get_shape()[0].value
+    self._num_tags = tensor_shape.dimension_value(transition_params.shape[0])
 
   @property
   def state_size(self):
@@ -540,7 +547,7 @@ def crf_decode(potentials, transition_params, sequence_length):
 
     # For simplicity, in shape comments, denote:
     # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
-    num_tags = potentials.get_shape()[2].value
+    num_tags = tensor_shape.dimension_value(potentials.shape[2])
 
     # Computes forward decoding. Get last score and backpointers.
     crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
@@ -583,7 +590,7 @@ def crf_decode(potentials, transition_params, sequence_length):
     return decode_tags, best_score
 
   return utils.smart_cond(
-      pred=math_ops.equal(potentials.shape[1].value or
+      pred=math_ops.equal(tensor_shape.dimension_value(potentials.shape[1]) or
                           array_ops.shape(potentials)[1], 1),
       true_fn=_single_seq_fn,
       false_fn=_multi_seq_fn)
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index aeefa3cee62281c74388765ea5e2cbc7f16ff927..8d35622e393e15a2f2dfea7c75ad2c9f48aa7150 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -9,8 +9,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
@@ -44,10 +42,11 @@ tf_custom_op_py_library(
 
 cuda_py_test(
     name = "cudnn_rnn_ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"],
     additional_deps = [
         ":cudnn_rnn_py",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python/ops/losses:losses",
@@ -63,10 +62,10 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
-    shard_count = 6,
+    shard_count = 2,
     tags = [
-        "manual",
-        "requires_cudnn5",
+        "noasan",  # http://b/62067814
+        "requires-gpu-sm35",
     ],
 )
 
@@ -93,8 +92,7 @@ cuda_py_test(
     ],
     shard_count = 6,
     tags = [
-        "manual",
-        "requires_cudnn5",
+        "noasan",  # http://b/62067814
     ],
 )
 
@@ -121,6 +119,5 @@ cuda_py_test(
         "noasan",  # http://b/62067814
         "nomsan",
         "notsan",
-        "requires_cudnn5",
     ],
 )
diff --git a/tensorflow/contrib/cudnn_rnn/__init__.py b/tensorflow/contrib/cudnn_rnn/__init__.py
index 5d8c6191f8db9f96532aa78e4790a4665d3b4877..5320232268657fa73bcd3e86da49d6525e9b8db5 100644
--- a/tensorflow/contrib/cudnn_rnn/__init__.py
+++ b/tensorflow/contrib/cudnn_rnn/__init__.py
@@ -24,6 +24,10 @@
 @@CudnnGRUSaveable
 @@CudnnRNNReluSaveable
 @@CudnnRNNTanhSaveable
+@@CudnnParamsFormatConverterLSTM
+@@CudnnParamsFormatConverterGRU
+@@CudnnParamsFormatConverterTanh
+@@CudnnParamsFormatConverterRelu
 """
 
 from __future__ import absolute_import
@@ -48,6 +52,10 @@ _allowed_symbols = [
     "CudnnGRUSaveable",
     "CudnnRNNReluSaveable",
     "CudnnRNNTanhSaveable",
+    "CudnnParamsFormatConverterLSTM",
+    "CudnnParamsFormatConverterGRU",
+    "CudnnParamsFormatConverterTanh",
+    "CudnnParamsFormatConverterRelu",
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index c59d3682d404e032d9f4bf81ef54ab456341cefa..a268415f0e65206294431a537be18cadbe1a1e84 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -18,24 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import itertools
 import os
 import unittest
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework.test_util import TensorFlowTestCase
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
@@ -56,710 +62,989 @@ CUDNN_RNN_TANH_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_TANH_PARAMS_PER_LAYER
 CUDNN_RNN_RELU_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_RELU_PARAMS_PER_LAYER
 
 
-def _CreateModel(rnn_mode,
-                 num_layers,
-                 num_units,
-                 input_size,
-                 input_mode="linear_input",
-                 direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
-                 dtype=dtypes.float32,
-                 dropout=0.):
-  del input_mode
-  if rnn_mode == cudnn_rnn_ops.CUDNN_LSTM:
-    model_fn = cudnn_rnn_ops.CudnnLSTM
-  elif rnn_mode == cudnn_rnn_ops.CUDNN_GRU:
-    model_fn = cudnn_rnn_ops.CudnnGRU
-  elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_TANH:
-    model_fn = cudnn_rnn_ops.CudnnRNNTanh
-  elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_RELU:
-    model_fn = cudnn_rnn_ops.CudnnRNNRelu
+def RunLSTM(sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers=1,
+            is_training=True,
+            dropout=0.,
+            num_dirs=True,
+            dtype=dtypes.float32):
+  # TODO(jamesqin): add multi-layer tests.
+  # TODO(jamesqin): add multi-dir tests
+  assert num_layers == 1
+  assert num_dirs == 1
+  if is_training and not np.isclose(dropout, 0):
+    raise ValueError("dropout can not be 0. when test training.")
+
+  # set graph level random seed and numpy random seed.
+  random_seed.set_random_seed(0)
+  np.random.seed(0)
+
+  inputs = variable_scope.get_variable(
+      "inputs",
+      initializer=np.random.rand(time, batch_size,
+                                 input_size).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+  initial_h_op = variable_scope.get_variable(
+      "initial_h_op",
+      initializer=np.random.rand(batch_size,
+                                 num_units).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+  initial_c_op = variable_scope.get_variable(
+      "initial_c_op",
+      initializer=np.random.rand(batch_size,
+                                 num_units).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+
+  initializer = init_ops.random_uniform_initializer(
+      -0.01, 0.01, dtype=dtype, seed=19980904)
+
+  with variable_scope.variable_scope("test", initializer=initializer):
+    w = variable_scope.get_variable(
+        "rnn/lstm_cell/kernel",
+        shape=[input_size + num_units, num_units * 4],
+        dtype=dtype)
+    b = variable_scope.get_variable(
+        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)
+
+    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
+    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
+    outputs_op, state_tuple_op = rnn.dynamic_rnn(
+        cell,
+        inputs,
+        initial_state=rnn_cell_impl.LSTMStateTuple(
+            h=initial_h_op, c=initial_c_op),
+        dtype=dtype,
+        time_major=True,
+        scope=None)
+
+  # Convert to cudnn opaque param.
+  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
+      num_layers, num_units, input_size)
+  opaque_params = format_converter.tf_canonical_to_opaque([w, b])
+
+  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
+  cu_initial_c_op = array_ops.expand_dims(initial_c_op, axis=0)
+  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
+      inputs,
+      cu_initial_h_op,
+      cu_initial_c_op,
+      opaque_params,
+      dropout=dropout,
+      is_training=is_training,
+      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
+  # Remove the trivial 1st dimension.
+  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
+      c=array_ops.squeeze(cu_c_op, axis=0),
+      h=array_ops.squeeze(cu_h_op, axis=0))
+
+  if is_training:
+    (inp_grad_op, hgrad_op,
+     cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients(
+         outputs_op, [inputs, initial_h_op, initial_c_op, w, b])
+
+    (cu_inp_grad_op, cu_hgrad_op,
+     cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
+         cu_outputs_op,
+         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
+    # Remove the trivial 1st dimension
+    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
+    # Remove the trivial 1st dimension
+    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0)
+
+    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
+        opaque_grad_op)
+    cu_wgrad_op = cu_wgrad_op[0]
+    cu_bgrad_op = cu_bgrad_op[0]
+    # cudnn lstm has 2 biases each gate. When converting to tf canonical format,
+    # the two biases are summed into one. Thus here bias gradient should be
+    # halved when comparing with tf lstm.
+    cu_bgrad_op *= 0.5
+
+  init_op = variables.global_variables_initializer()
+  sess.run(init_op)
+
+  if is_training:
+    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
+        outputs_op, state_tuple_op, inp_grad_op,
+        (hgrad_op, cgrad_op), wgrad_op, bgrad_op
+    ])
+    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
+     cu_bgrad) = sess.run([
+         cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
+         (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
+     ])
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
+    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
+    logging.vlog(1, "inp_grad: %s" % inp_grad)
+    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
+    logging.vlog(1, "state_grad: %s" % str(state_grad))
+    logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
+    logging.vlog(1, "wgrad: %s" % str(wgrad))
+    logging.vlog(1, "bgrad: %s" % str(bgrad))
+    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
+    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
+    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
+            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
+            cu_bgrad)
   else:
-    raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
-  return model_fn(
-      num_layers,
-      num_units,
-      input_size,
-      direction=direction,
-      dtype=dtype,
-      dropout=dropout)
-
-
-def _CreateParamsSavable(params,
-                         model,
-                         base_variable_scope=None,
-                         name="params_canonical"):
-  """Create a RNNParamsSaveable for the weight and bias parameters.
+    outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
+    cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op])
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
+    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
+  return outputs, cu_outputs, state_tuple, cu_state_tuple
+
+
+# Basic set of RNN configs to test. They can be further extended in relevant
+# test (e.g. adding num_dirs).
+NAMED_RNN_TESTCASES = ({
+    "testcase_name": "xsmall",
+    "num_units": 1,
+    "input_size": 1,
+    "batch_size": 1,
+    "time": 1,
+    "num_layers": 1,
+}, {
+    "testcase_name": "small",
+    "num_units": 4,
+    "input_size": 4,
+    "batch_size": 4,
+    "time": 4,
+    "num_layers": 1,
+}, {
+    "testcase_name": "medium",
+    "num_units": 128,
+    "input_size": 64,
+    "batch_size": 8,
+    "time": 16,
+    "num_layers": 1,
+}, {
+    "testcase_name": "large",
+    "num_units": 128,
+    "input_size": 128,
+    "batch_size": 16,
+    "time": 32,
+    "num_layers": 1,
+})
+
+
+def ExpandNamedTestCases(inputs, *remove_keys, **extra_configs):
+  """Expands testcase with new config dimensions.
+
+  Example:
+    inputs = (
+      {'testcase_name': 'test1', 'gender': 'male'}
+      {'testcase_name': 'test2', 'gender': 'female'}
+    )
+    remove_keys:  empty
+    extra_configs = {
+      'age': [40, 80]
+      'height': [5, 6]
+    }
+
+    Returns:
+      (
+        {'testcase_name': 'test1_age_40_height_5','gender': 'male', 'age':
+        40,'height': 5}
+        {'testcase_name': 'test1_age_40_height_6', 'gender': 'male', 'age': 40,
+        'height': 6}
+        {'testcase_name': 'test1_age_80_height_5', 'gender': 'male', 'age': 80,
+        'height': 5}
+        {'testcase_name': 'test1_age_80_height_6', 'gender': 'male', 'age': 80,
+        'height': 6}
+
+        {'testcase_name': 'test2_age_40_height_5', 'gender': 'female', 'age':
+        40,
+        'height': 5}
+        {'testcase_name': 'test2_age_40_height_6', 'gender': 'female', 'age':
+        40,
+        'height': 6}
+        {'testcase_name': 'test2_age_80_height_5', 'gender': 'female', 'age':
+        80,
+        'height': 5}
+        {'testcase_name': 'test2_age_80_height_6', 'gender': 'female', 'age':
+        80,
+        'height': 6}
+      )
 
   Args:
-    params: a Variable for weight and bias parameters.
-    model: a CudnnRNN model.
-    base_variable_scope: a string, prefix of names of saved variables.
-    name: a string, name of the RNNParamsSaveable object.
+    inputs: A list of dictionary, each being a testcase.
+    *remove_keys: A list of keys into testcase which are not needed in new
+      testcases.
+    **extra_configs: A dict of new test dimension and applicable values in that
+      dimension.
+
   Returns:
-    a RNNParamsSaveable object.
+    A list of dictionary with expanded test cases.
   """
-  if model._rnn_mode == CUDNN_LSTM:
-    fn = cudnn_rnn_ops.CudnnLSTMSaveable
-  elif model._rnn_mode == CUDNN_GRU:
-    fn = cudnn_rnn_ops.CudnnGRUSaveable
-  elif model._rnn_mode == CUDNN_RNN_TANH:
-    fn = cudnn_rnn_ops.CudnnRNNTanhSaveable
-  elif model._rnn_mode == CUDNN_RNN_RELU:
-    fn = cudnn_rnn_ops.CudnnRNNReluSaveable
-  params_saveable = fn(
-      params,
-      model.num_layers,
-      model.num_units,
-      model.input_size,
-      model.input_mode,
-      model.direction,
-      scope=base_variable_scope,
-      name=name)
-  ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable)
-  return params_saveable
-
-
-def _MinLSTMParamSize(num_layers,
-                      num_units,
-                      input_size,
-                      direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION):
-  if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION:
-    first_layer_weights = 4 * num_units * (num_units + input_size)
-    higher_layer_weights = 8 * (num_layers - 1) * num_units * num_units
-    all_biases = 8 * num_layers * num_units
-    return first_layer_weights + higher_layer_weights + all_biases
-  elif direction == cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION:
-    first_layer_weights = 4 * num_units * (num_units + input_size)
-    higher_layer_weights = (num_layers - 1) * (
-        4 * 2 * num_units * num_units + 4 * num_units**2)
-    all_biases = 8 * num_layers * num_units
-    return 2 * (first_layer_weights + higher_layer_weights + all_biases)
-  else:
-    raise ValueError("%s direction is not supported.")
+  res = []
+  ordered_extra_configs = collections.OrderedDict(extra_configs)
+  keys = ordered_extra_configs.keys()
+  # A list of list of configs.
+  # The outer loop is iterating keys, the innner is values of one key.
+  combined_kv = [[(k, v) for v in ordered_extra_configs[k]] for k in keys]
+  logging.info("combined_kv: %s", combined_kv)
 
+  for inp in inputs:
+    # Each inp is a dict
+    for config in itertools.product(*combined_kv):
+      new_inp = dict(inp)
+      # config is a list in the form of [(k_i, v_j), (k_p, v_q), ...]
+      suffix = ["%s_%s" % (p[0], str(p[1])) for p in config]
+      suffix = "_".join(suffix)
+      new_inp["testcase_name"] += "_" + suffix
+      for k, v in config:
+        new_inp[k] = v
+      # Remove not used keys from the new test case.
+      if remove_keys:
+        if not isinstance(remove_keys, (list, tuple)):
+          remove_keys = [remove_keys]
+        for k in remove_keys:
+          new_inp.pop(k, None)
+      logging.info("new_inp: %s", new_inp)
+      res.append(new_inp)
+  # Dedup, necessary if `remove_keys` is set.
+  return [dict(t) for t in {tuple(d.items()) for d in res}]
 
-class CudnnRNNTestSaveRestore(TensorFlowTestCase):
 
-  def _CompareWeights(self, lhs, rhs):
-    self.assertEqual(len(lhs), len(rhs))
-    for lw, rw in zip(lhs, rhs):
-      self.assertAllEqual(lw, rw)
+class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
 
-  def _CompareBiases(self, lhs, rhs, rnn_mode, num_layers, direction):
-    self.assertEqual(len(lhs), len(rhs))
-    if rnn_mode == CUDNN_LSTM:
-      num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
-    elif rnn_mode == CUDNN_GRU:
-      num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
-    elif rnn_mode == CUDNN_RNN_TANH:
-      num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER
-    else:
-      num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER
-    num_dirs = 1 if direction == CUDNN_RNN_UNIDIRECTION else 2
-    num_params_per_layer *= num_dirs
-    self.assertEqual(num_params_per_layer * num_layers, len(lhs))
-
-    for i in range(num_layers):
-      layer_lhs = lhs[i * num_params_per_layer: (i+1) * num_params_per_layer]
-      layer_rhs = rhs[i * num_params_per_layer: (i+1) * num_params_per_layer]
-      if direction == CUDNN_RNN_UNIDIRECTION:
-        self._CompareSingleLayerBiases(layer_lhs, layer_rhs)
-      else:
-        size = len(layer_lhs)
-        fw_lhs, bw_lhs = layer_lhs[:size//2], layer_lhs[size//2:]
-        fw_rhs, bw_rhs = layer_rhs[:size//2], layer_rhs[size//2:]
-        self._CompareSingleLayerBiases(fw_lhs, fw_rhs)
-        self._CompareSingleLayerBiases(bw_lhs, bw_rhs)
-
-  def _CompareSingleLayerBiases(self, lhs, rhs):
-    self.assertEqual(len(lhs), len(rhs))
-
-    lf_lhs, rt_lhs = lhs[:len(lhs)//2], lhs[len(lhs)//2:]
-    lf_rhs, rt_rhs = rhs[:len(rhs)//2], rhs[len(rhs)//2:]
-    self.assertEqual(len(lf_lhs), len(rt_lhs))
-    self.assertEqual(len(lf_rhs), len(rt_rhs))
-
-    sum_lhs, sum_rhs = [], []
-    for lf, rt in zip(lf_lhs, rt_lhs):
-      sum_lhs.append(lf + rt)
-    for lf, rt in zip(lf_rhs, rt_rhs):
-      sum_rhs.append(lf + rt)
-    self.assertEqual(len(sum_lhs), len(sum_rhs))
-    for lf, rt in zip(sum_lhs, sum_rhs):
-      self.assertAllEqual(lf, rt)
+  def _test_training_helper(self,
+                            num_units,
+                            input_size,
+                            batch_size,
+                            time,
+                            num_layers,
+                            dtype,
+                            rtol=2e-6,
+                            atol=2e-6):
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, cu_inp_grad,
+       state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunLSTM(
+           sess, num_units, input_size, batch_size, time, num_layers)
 
-  def _testSaveRestoreVariable(self, rnn_mode, direction, dtype):
-    num_layers = 2
-    num_units = 7
-    input_size = 3
-    with ops.Graph().as_default():
-      model = _CreateModel(
-          rnn_mode,
-          num_layers=num_layers,
-          num_units=num_units,
-          input_size=input_size,
-          direction=direction,
-          dtype=dtype)
-      random_seed.set_random_seed(1234)
-      params_size_t = model.params_size()
-      params = variables.Variable(
-          random_ops.random_uniform([params_size_t], dtype=dtype),
-          dtype=dtype,
-          validate_shape=False)
-      saveable = _CreateParamsSavable(params, model)
-      weights, biases = saveable._OpaqueParamsToCanonical()
-      reset_params = state_ops.assign(
-          params,
-          array_ops.zeros([params_size_t], dtype=dtype),
-          validate_shape=False)
-      save_path = os.path.join(self.get_temp_dir(),
-                               "save-restore-variable-test")
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        val = saver.save(sess, save_path)
-        self.assertEqual(save_path, val)
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      for s, cu_s in zip(state_tuple, cu_state_tuple):
+        self.assertAllClose(s, cu_s, rtol=rtol, atol=atol)
+      for sg, cu_sg in zip(state_grad, cu_state_grad):
+        self.assertAllClose(sg, cu_sg, rtol=rtol, atol=atol)
+      self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol)
+      self.assertAllClose(bgrad, cu_bgrad, rtol=rtol, atol=atol)
+      self.assertAllClose(wgrad, cu_wgrad, rtol=rtol, atol=atol)
 
-        weights_v, biases_v = sess.run([weights, biases])
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(num_units, input_size, batch_size, time,
+                               num_layers, dtypes.float32)
 
-        sess.run(reset_params)
-        saver.restore(sess, save_path)
-        weights_v_restored, biases_v_restored = sess.run([weights, biases])
-
-        self._CompareWeights(weights_v, weights_v_restored)
-        self._CompareBiases(biases_v, biases_v_restored, rnn_mode, num_layers,
-                            direction)
-
-  def _testSaveRestoreTwoVariables(self, rnn_mode, direction, dtype):
-    num_layers = 2
-    num_units = 7
-    input_size = 3
-    with ops.Graph().as_default():
-      model = _CreateModel(
-          rnn_mode,
-          num_layers=num_layers,
-          num_units=num_units,
-          input_size=input_size,
-          direction=direction,
-          dtype=dtype)
-      random_seed.set_random_seed(1234)
-      params_size_t = model.params_size()
-      names = ["rnn_1", "rnn_2"]
-      param_vars = [
-          variables.Variable(
-              random_ops.random_uniform([params_size_t], dtype=dtype),
-              dtype=dtype,
-              validate_shape=False) for name in names
-      ]
-      saveables = []
-      for name, params in zip(names, param_vars):
-        saveables.append(_CreateParamsSavable(params, model, name, name))
-      weights1, biases1 = saveables[0]._OpaqueParamsToCanonical()
-      weights2, biases2 = saveables[1]._OpaqueParamsToCanonical()
-      reset_params = [
-          state_ops.assign(
-              params,
-              array_ops.zeros([params_size_t], dtype=dtype),
-              validate_shape=False) for params in param_vars
-      ]
-      save_path = os.path.join(self.get_temp_dir(),
-                               "save-restore-variable-test")
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(use_gpu=True,
-                             graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        val = saver.save(sess, save_path)
-        self.assertEqual(save_path, val)
-        weights1_v, biases1_v = sess.run([weights1, biases1])
-        weights2_v, biases2_v = sess.run([weights2, biases2])
-
-        sess.run(reset_params)
-        saver.restore(sess, save_path)
-        weights1_v_restored, biases1_v_restored = sess.run([weights1, biases1])
-        weights2_v_restored, biases2_v_restored = sess.run([weights2, biases2])
-
-        self._CompareWeights(weights1_v, weights1_v_restored)
-        self._CompareWeights(weights2_v, weights2_v_restored)
-        self._CompareBiases(biases1_v, biases1_v_restored, rnn_mode, num_layers,
-                            direction)
-        self._CompareBiases(biases2_v, biases2_v_restored, rnn_mode, num_layers,
-                            direction)
-
-  def _testSaveRestoreOutput(self, rnn_mode, direction, dtype):
-    with ops.Graph().as_default():
-      num_layers = 2
-      num_units = 7
-      input_size = 7
-      seq_length = 10
-      batch_size = 5
-      dir_count = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
-      model = _CreateModel(
-          rnn_mode,
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training_fp16(self, num_units, input_size, batch_size, time,
+                         num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(
+        num_units,
+        input_size,
+        batch_size,
+        time,
+        num_layers,
+        dtypes.float16,
+        rtol=5e-3,
+        atol=5e-4)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_inference(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM(
+          sess,
+          num_units,
+          input_size,
+          batch_size,
+          time,
           num_layers,
+          is_training=False)
+
+      self.assertAllClose(outputs, cu_outputs)
+      # h
+      self.assertAllClose(state_tuple.h, cu_state_tuple.h)
+      # c
+      self.assertAllClose(state_tuple.c, cu_state_tuple.c)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_inference_fp16(self, num_units, input_size, batch_size, time,
+                          num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM(
+          sess,
           num_units,
           input_size,
-          direction=direction,
-          dtype=dtype)
-      params_size_t = model.params_size()
-      params = variables.Variable(
-          array_ops.ones([params_size_t], dtype=dtype),
-          validate_shape=False,
-          dtype=dtype)
-      _CreateParamsSavable(params, model)
-      save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test")
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+          batch_size,
+          time,
+          num_layers,
+          is_training=False,
+          dtype=dtypes.float16)
 
-      np.random.seed(1234)
-      has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-      input_data = constant_op.constant(
-          np.random.randn(seq_length, batch_size, input_size), dtype=dtype)
-      input_h = constant_op.constant(
-          np.random.randn(num_layers * dir_count, batch_size, num_units),
-          dtype=dtype)
-      if has_input_c:
-        input_c = constant_op.constant(
-            np.random.randn(num_layers * dir_count, batch_size, num_units),
-            dtype=dtype)
-        outputs = model(
-            input_data=input_data,
-            input_h=input_h,
-            input_c=input_c,
-            params=params,
-            is_training=False)
-      else:
-        outputs = model(
-            input_data=input_data,
-            input_h=input_h,
-            params=params,
-            is_training=False)
-      total_sum = sum(map(math_ops.reduce_sum, outputs))
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        total_sum_v = sess.run(total_sum)
-        val = saver.save(sess, save_path)
-        self.assertEqual(save_path, val)
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        reset_params = state_ops.assign(
-            params,
-            array_ops.zeros([params_size_t], dtype=dtype),
-            validate_shape=False)
-        sess.run(reset_params)
-        saver.restore(sess, save_path)
-        total_sum_v_restored = sess.run(total_sum)
-        self.assertAllClose(total_sum_v, total_sum_v_restored, atol=1e-5)
+      rtol, atol = 5e-3, 5e-4
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      # h
+      self.assertAllClose(
+          state_tuple.h, cu_state_tuple.h, rtol=rtol, atol=atol)
+      # c
+      self.assertAllClose(
+          state_tuple.c, cu_state_tuple.c, rtol=rtol, atol=atol)
 
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSaveRestore(self):
-    rnn_modes = [
-        cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU,
-        cudnn_rnn_ops.CUDNN_RNN_TANH, cudnn_rnn_ops.CUDNN_RNN_RELU
-    ]
-    directions = [
-        cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
-        cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION
-    ]
-    dtype_list = [dtypes.float32, dtypes.float64]
-    for rnn_mode, direction, dtype in itertools.product(rnn_modes, directions,
-                                                        dtype_list):
-      self._testSaveRestoreVariable(rnn_mode, direction, dtype)
-      self._testSaveRestoreTwoVariables(rnn_mode, direction, dtype)
-      self._testSaveRestoreOutput(rnn_mode, direction, dtype)
-
-
-class CudnnRNNTestParamsSize(TensorFlowTestCase):
-
-  def _testOneLSTMParamsSize(self, num_layers, num_units, input_size,
-                             direction):
-    logging.info("Testing one lstm param size with config: %s", locals())
-    min_params_size = _MinLSTMParamSize(num_layers, num_units, input_size,
-                                        direction)
-    model = _CreateModel(
-        cudnn_rnn_ops.CUDNN_LSTM,
-        num_layers,
+  def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
+                                  num_layers):
+    """Validates that dropout does not affect Cudnn Rnn inference."""
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    # Hand-picked dropouts are used below (0. and 1.)
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        # 1st time w/o dropout.
+        (_, cu_outputs, _, cu_state_tuple) = RunLSTM(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=0.)
+
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        (_, cu_outputs2, _, cu_state_tuple2) = RunLSTM(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=1.)
+
+    self.assertAllClose(cu_outputs, cu_outputs2)
+    # h
+    self.assertAllClose(cu_state_tuple.h, cu_state_tuple2.h)
+    # c
+    self.assertAllClose(cu_state_tuple.c, cu_state_tuple2.c)
+
+
+def RunGRU(sess,
+           num_units,
+           input_size,
+           batch_size,
+           time,
+           num_layers=1,
+           is_training=True,
+           dropout=0.,
+           num_dirs=True,
+           dtype=dtypes.float32):
+  # TODO(jamesqin): add multi-layer tests.
+  # TODO(jamesqin): add multi-dir tests
+  assert num_layers == 1
+  assert num_dirs == 1
+  if is_training and not np.isclose(dropout, 0):
+    raise ValueError("dropout can not be 0. when test training.")
+
+  # set graph level random seed and numpy random seed.
+  random_seed.set_random_seed(0)
+  np.random.seed(0)
+
+  inputs = variable_scope.get_variable(
+      "inputs",
+      initializer=np.random.rand(time, batch_size,
+                                 input_size).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+  initial_h_op = variable_scope.get_variable(
+      "initial_h_op",
+      initializer=np.random.rand(batch_size,
+                                 num_units).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+
+  initializer = init_ops.random_uniform_initializer(
+      -0.01, 0.01, dtype=dtype, seed=19980904)
+  with variable_scope.variable_scope("test", initializer=initializer):
+    gate_kernel = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/gates/kernel",
+        shape=[input_size + num_units, num_units * 2],
+        dtype=dtype)
+    gate_bias = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/gates/bias",
+        shape=[num_units * 2],
+        dtype=dtype)
+    candidate_inp_kernel = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
+        shape=[input_size, num_units],
+        dtype=dtype)
+    candidate_inp_bias = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
+        shape=[num_units],
+        dtype=dtype)
+    candidate_hid_kernel = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
+        shape=[num_units, num_units],
+        dtype=dtype)
+    candidate_hid_bias = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
+        shape=[num_units],
+        dtype=dtype)
+
+    cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
+    outputs_op, h_op = rnn.dynamic_rnn(
+        cell,
+        inputs,
+        initial_state=initial_h_op,
+        dtype=dtype,
+        time_major=True,
+        scope=None)
+
+  ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
+  bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
+  # Convert to cudnn opaque param.
+  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
+      num_layers, num_units, input_size)
+  opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
+
+  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
+  cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
+      inputs,
+      cu_initial_h_op,
+      array_ops.zeros_like(cu_initial_h_op),  # not used
+      opaque_params,
+      dropout=dropout,
+      is_training=is_training,
+      rnn_mode=cudnn_rnn_ops.CUDNN_GRU)
+
+  if is_training:
+    (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
+     cib_grad_op, chb_grad_op) = gradients_impl.gradients(
+         outputs_op, [inputs, initial_h_op] + ws + bs)
+
+    (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
+        cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
+    # Remove the trivial 1st dimension
+    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
+
+    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
+        opaque_grad_op)
+    (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
+    (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
+    # cudnn gru has 2 biases for reset and update gates. When converting to tf
+    # canonical format, the two biases are summed into one.  Thus here relevant
+    # bias gradient should be halved before comparing with tf gru.
+    cu_gb_grad_op *= 0.5
+
+  init_op = variables.global_variables_initializer()
+  sess.run(init_op)
+
+  if is_training:
+    outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
+        outputs_op, h_op, inp_grad_op, hgrad_op,
+        (gk_grad_op, cik_grad_op, chk_grad_op),
+        (gb_grad_op, cib_grad_op, chb_grad_op)
+    ])
+    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run([
+        cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
+        (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
+        (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
+    ])
+    # Remove the trivial 1st dimension
+    cu_h = np.squeeze(cu_h, axis=0)
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "h: %s" % h)
+    logging.vlog(1, "cu_h: %s" % h)
+    logging.vlog(1, "inp_grad: %s" % inp_grad)
+    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
+    logging.vlog(1, "hgrad: %s" % hgrad)
+    logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
+    logging.vlog(1, "wgrad: %s" % str(wgrad))
+    logging.vlog(1, "bgrad: %s" % str(bgrad))
+    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
+    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
+    return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
+            cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
+  else:
+    outputs, h = sess.run([outputs_op, h_op])
+    cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op])
+    # Remove the trivial 1st dimension.
+    cu_h = np.squeeze(cu_h, axis=0)
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "h: %s" % h)
+    logging.vlog(1, "cu_h: %s" % h)
+  return outputs, cu_outputs, h, cu_h
+
+
+class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
+
+  def _test_training_helper(self,
+                            num_units,
+                            input_size,
+                            batch_size,
+                            time,
+                            num_layers,
+                            dtype,
+                            rtol=2e-6,
+                            atol=2e-6):
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
+       cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunGRU(
+           sess, num_units, input_size, batch_size, time, num_layers)
+
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
+      self.assertAllClose(hgrad, cu_hgrad, rtol=rtol, atol=atol)
+      self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol)
+      for bg, cu_bg in zip(bgrad, cu_bgrad):
+        self.assertAllClose(bg, cu_bg, rtol=rtol, atol=atol)
+      for wg, cu_wg in zip(wgrad, cu_wgrad):
+        self.assertAllClose(wg, cu_wg, rtol=rtol, atol=atol)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(num_units, input_size, batch_size, time,
+                               num_layers, dtypes.float32)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training_fp16(self, num_units, input_size, batch_size, time,
+                         num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(
         num_units,
         input_size,
-        direction=direction)
-    params_size = model.params_size()
-    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
-      params_size_v = sess.run(params_size)
-      self.assertLessEqual(min_params_size, params_size_v)
+        batch_size,
+        time,
+        num_layers,
+        dtypes.float16,
+        rtol=5e-3,
+        atol=5e-4)
 
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testLSTMParamsSize(self):
-    test_configs = [
-        [4, 200, 200],
-        [4, 200, 300],
-        [4, 200, 100],
-        [1, 100, 200],
-        [2, 200, 100],
-        [3, 200, 400],
-    ]
-    directions = [
-        cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
-        cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION
-    ]
-    for (config, direction) in itertools.product(test_configs, directions):
-      num_layers, num_units, input_size = config
-      with ops.Graph().as_default():
-        self._testOneLSTMParamsSize(num_layers, num_units, input_size,
-                                    direction)
+  def test_inference(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, h, cu_h) = RunGRU(
+          sess,
+          num_units,
+          input_size,
+          batch_size,
+          time,
+          num_layers,
+          is_training=False)
+      self.assertAllClose(outputs, cu_outputs)
+      self.assertAllClose(h, cu_h)
 
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testLSTMParamsSizeShape(self):
-    with self.assertRaisesRegexp(
-        ValueError, "Shape must be rank 0 but is rank 1"):
-      model = _CreateModel(
-          cudnn_rnn_ops.CUDNN_LSTM,
-          constant_op.constant([4]), 200, 200,
-          direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
-      params_size = model.params_size()
-    with self.assertRaisesRegexp(
-        ValueError, "Shape must be rank 0 but is rank 1"):
-      model = _CreateModel(
-          cudnn_rnn_ops.CUDNN_LSTM,
-          4, constant_op.constant([200]), 200,
-          direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
-      params_size = model.params_size()
-    with self.assertRaisesRegexp(
-        ValueError, "Shape must be rank 0 but is rank 1"):
-      model = _CreateModel(
+  def test_inference_fp16(self, num_units, input_size, batch_size, time,
+                          num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, h, cu_h) = RunGRU(
+          sess,
+          num_units,
+          input_size,
+          batch_size,
+          time,
+          num_layers,
+          is_training=False,
+          dtype=dtypes.float16)
+
+      rtol, atol = 5e-3, 5e-4
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
+                                  num_layers):
+    """Validates that dropout does not affect Cudnn Rnn inference."""
+    # Hand-picked dropouts are used below (0. and 1.)
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        # 1st time w/o dropout.
+        (_, cu_outputs, _, cu_h) = RunGRU(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=0.)
+
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        (_, cu_outputs2, _, cu_h2) = RunGRU(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=1.)
+
+    self.assertAllClose(cu_outputs, cu_outputs2)
+    self.assertAllClose(cu_h[0], cu_h2[0])
+
+
+class CudnnParamsFormatConverterTest(TensorFlowTestCase,
+                                     parameterized.TestCase):
+  """Class for testing various format converters."""
+
+  def _test_lstm_helper(self, num_units, input_size, num_layers, direction):
+    with self.session(use_gpu=True) as sess:
+      random_seed.set_random_seed(0)
+      np.random.seed(0)
+
+      num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
+      format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
+          num_layers, num_units, input_size, direction=direction)
+
+      ws, bs = [], []
+      for _ in range(num_layers * num_dirs):
+        w = constant_op.constant(
+            np.random.rand(input_size + num_units, 4 * num_units),
+            dtype=dtypes.float32)
+        b = constant_op.constant(
+            np.random.rand(4 * num_units), dtype=dtypes.float32)
+        ws.append(w)
+        bs.append(b)
+
+      opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
+      opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
           cudnn_rnn_ops.CUDNN_LSTM,
-          4, 200, constant_op.constant([200]),
-          direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
-      params_size = model.params_size()
+          num_layers,
+          num_units,
+          input_size,
+          direction=direction)
 
+      ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params)
 
-class CudnnRNNTestInference(TensorFlowTestCase):
+      # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical()
+      # returns the original input.
+      ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r])
+      for w, w_r in zip(ws, ws_r):
+        self.assertAllClose(w, w_r)
+      for b, b_r in zip(bs, bs_r):
+        self.assertAllClose(b, b_r)
 
-  def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size,
-                              batch_size, seq_length, dir_count, dropout,
-                              expected, tolerance):
-    random_seed.set_random_seed(5678)
-    model = _CreateModel(
-        rnn_mode,
-        num_layers,
-        num_units,
-        input_size,
-        input_mode="auto_select",
-        direction=(cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION if dir_count == 1
-                   else cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION),
-        dropout=dropout)
-    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-    params_size_t = model.params_size()
-    input_data = array_ops.ones([seq_length, batch_size, input_size])
-    input_h = array_ops.ones([num_layers * dir_count, batch_size, num_units])
-    params = variables.Variable(
-        array_ops.ones([params_size_t]), validate_shape=False)
-    if has_input_c:
-      input_c = array_ops.ones([num_layers * dir_count, batch_size, num_units])
-      output, output_h, output_c = model(
-          input_data=input_data,
-          input_h=input_h,
-          input_c=input_c,
-          params=params,
-          is_training=False)
-    else:
-      output, output_h = model(
-          input_data=input_data,
-          input_h=input_h,
-          params=params,
-          is_training=False)
-    output_sum = math_ops.reduce_sum(output)
-    output_h_sum = math_ops.reduce_sum(output_h)
-    total_sum = output_sum + output_h_sum
-    if has_input_c:
-      output_c_sum = math_ops.reduce_sum(output_c)
-      total_sum += output_c_sum
-    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
-      sess.run(variables.global_variables_initializer())
-      total_sum_v = sess.run([total_sum])
+      # Test opaque_params size lower bound
+      opaque_params_size_v = sess.run(opaque_params_size)
+      min_params_size = sum(x.size for x in ws) + np.sum(x.size for x in bs)
+      logging.info("min_parm_size: %d vs actual_opaque_param_size: %d",
+                   min_params_size, opaque_params_size_v)
+      self.assertLessEqual(min_params_size, opaque_params_size_v)
 
-      self.assertAllClose(
-          total_sum_v[0], expected, atol=tolerance, rtol=tolerance)
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_lstm(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_lstm_helper(num_units, input_size, num_layers,
+                           cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
 
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleInference(self):
-    test_configs = [
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
-            "expected": 231833.22,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 4,
-                "num_units": 200,
-                "input_size": 200,
-                "batch_size": 20,
-                "seq_length": 10,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
-            "expected": 56000,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 4,
-                "num_units": 200,
-                "input_size": 200,
-                "batch_size": 20,
-                "seq_length": 10,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
-            "expected": 56000,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 4,
-                "num_units": 200,
-                "input_size": 200,
-                "batch_size": 20,
-                "seq_length": 10,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
-            "expected": 130688,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 8,
-                "input_size": 4,
-                "batch_size": 4,
-                "seq_length": 2,
-                "dir_count": 1,
-            },
-        },
-    ]
-    # Cudnn scales result for dropout during training, therefore dropout has no
-    # impact for inference results.
-    # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most
-    # demonstrative of the dropout-invariant nature of CudnnRnn.)
-    dropouts = [0., 0.5, 1.]
-    for (config, dropout) in itertools.product(test_configs, dropouts):
-      rnn_mode = config["rnn_mode"]
-      expected = config["expected"]
-      tolerance = config["tolerance"]
-      shape = config["shape"]
-      with ops.Graph().as_default():
-        self._testOneSimpleInference(
-            rnn_mode, shape["num_layers"], shape["num_units"],
-            shape["input_size"], shape["batch_size"], shape["seq_length"],
-            shape["dir_count"], dropout, expected, tolerance)
-
-
-class CudnnRNNTestTraining(TensorFlowTestCase):
-
-  def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
-                             batch_size, seq_length, dir_count, dropout, dtype,
-                             delta, tolerance):
-    # Gradient checking runs two forward ops with almost the same input. Need to
-    # make sure the drop patterns across the two runs are the same.
-    logging.info("Training test with config: %s", locals())
-    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
-    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
-    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-    random_seed.set_random_seed(5678)
-    direction = (cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION if dir_count == 1
-                 else cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
-    model = _CreateModel(
-        rnn_mode,
-        num_layers,
-        num_units,
-        input_size,
-        direction=direction,
-        dtype=dtype,
-        dropout=dropout)
-    params_size_t = model.params_size()
-    input_data = variables.Variable(
-        random_ops.random_uniform(
-            [seq_length, batch_size, input_size], dtype=dtype),
-        dtype=dtype)
-    input_h = variables.Variable(
-        random_ops.random_uniform(
-            [num_layers * dir_count, batch_size, num_units], dtype=dtype),
-        dtype=dtype)
-    params = variables.Variable(
-        random_ops.random_uniform([params_size_t], dtype=dtype),
-        validate_shape=False,
-        dtype=dtype)
-    if has_input_c:
-      input_c = variables.Variable(
-          random_ops.random_uniform(
-              [num_layers * dir_count, batch_size, num_units], dtype=dtype),
-          dtype=dtype)
-
-      output, output_h, output_c = model(
-          input_data=input_data,
-          input_h=input_h,
-          input_c=input_c,
-          params=params)
-    else:
-      output, output_h = model(
-          input_data=input_data, input_h=input_h, params=params)
-    output_sum = math_ops.reduce_sum(output)
-    output_h_sum = math_ops.reduce_sum(output_h)
-    total_sum = output_sum + output_h_sum
-    if has_input_c:
-      output_c_sum = math_ops.reduce_sum(output_c)
-      total_sum += output_c_sum
-
-    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
-      params_size_v = sess.run(params_size_t)
-      inputs_and_shapes = [
-          (input_data, [seq_length, batch_size, input_size]),
-          (input_h, [num_layers * dir_count, batch_size, num_units]),
-          (params, [params_size_v]),
-      ]
-      if has_input_c:
-        inputs_and_shapes.append(
-            (input_c, [num_layers * dir_count, batch_size, num_units]),)
-      sess.run(variables.global_variables_initializer())
-      all_inputs = [entry[0] for entry in inputs_and_shapes]
-      all_shapes = [entry[1] for entry in inputs_and_shapes]
-
-      err = gradient_checker.compute_gradient_error(
-          all_inputs, all_shapes, total_sum, [1], delta=delta)
-
-      self.assertLess(err, tolerance)
-      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
+  def test_lstm_bidi(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_lstm_helper(num_units, input_size, num_layers,
+                           cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
+
+  def _test_gru_helper(self, num_units, input_size, num_layers, direction):
+    with self.session(use_gpu=True) as sess:
+      random_seed.set_random_seed(0)
+      np.random.seed(0)
+
+      num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
+      format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
+          num_layers, num_units, input_size, direction=direction)
 
+      ws, bs = [], []
+      for _ in range(num_layers * num_dirs):
+        gate_kernel = constant_op.constant(
+            np.random.rand(input_size + num_units, num_units * 2),
+            dtype=dtypes.float32)
+        gate_bias = constant_op.constant(
+            np.random.rand(num_units * 2), dtype=dtypes.float32)
+        candidate_inp_kernel = constant_op.constant(
+            np.random.rand(input_size, num_units), dtype=dtypes.float32)
+        candidate_inp_bias = constant_op.constant(
+            np.random.rand(num_units), dtype=dtypes.float32)
+        candidate_hid_kernel = constant_op.constant(
+            np.random.rand(num_units, num_units), dtype=dtypes.float32)
+        candidate_hid_bias = constant_op.constant(
+            np.random.rand(num_units), dtype=dtypes.float32)
+        ws.extend([gate_kernel, candidate_inp_kernel, candidate_hid_kernel])
+        bs.extend([gate_bias, candidate_inp_bias, candidate_hid_bias])
+
+      opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
+      opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
+          cudnn_rnn_ops.CUDNN_GRU,
+          num_layers,
+          num_units,
+          input_size,
+          direction=direction)
+
+      ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params)
+
+      # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical()
+      # returns the original input.
+      ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r])
+      for w, w_r in zip(ws, ws_r):
+        self.assertAllClose(w, w_r)
+      for b, b_r in zip(bs, bs_r):
+        self.assertAllClose(b, b_r)
+
+      # Test opaque_params size lower bound
+      opaque_params_size_v = sess.run(opaque_params_size)
+      min_params_size = sum(x.size for x in ws) + sum(x.size for x in bs)
+      logging.info("min_parm_size: %d vs actual_opaque_param_size: %d",
+                   min_params_size, opaque_params_size_v)
+      self.assertLessEqual(min_params_size, opaque_params_size_v)
+
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_gru(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_gru_helper(num_units, input_size, num_layers,
+                          cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
+
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTraining(self):
-    test_configs = [
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
-            "dtype": dtypes.float32,
-            "tolerance": 1.5e-2,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
-            "dtype": dtypes.float32,
-            "tolerance": 4e-3,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
-            "dtype": dtypes.float32,
-            "tolerance": 5e-3,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
-            "dtype": dtypes.float32,
-            "tolerance": 5e-1,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-    ]
-    dropouts = [0., 0.5, 1.]
-    dir_counts = [1]
-    for config, dropout, dir_count in itertools.product(test_configs, dropouts,
-                                                        dir_counts):
-      rnn_mode = config["rnn_mode"]
-      dtype = config.get("dtype", dtypes.float32)
-      delta = config.get("delta", 1e-3)
-      tolerance = config["tolerance"]
-      shape = config["shape"]
-      with ops.Graph().as_default():
-        self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
-                                    shape["num_units"], shape["input_size"],
-                                    shape["batch_size"], shape["seq_length"],
-                                    dir_count, dropout, dtype, delta, tolerance)
+  def test_gru_bidi(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_gru_helper(num_units, input_size, num_layers,
+                          cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
+
+
+class CudnnRnnSaveRestoreTest(TensorFlowTestCase, parameterized.TestCase):
+  """Class for testing various Cudnn Rnn SaveableObjects."""
+
+  def _create_opaque_param(self,
+                           rnn_mode,
+                           num_units,
+                           input_size,
+                           num_layers,
+                           direction,
+                           name=None):
+    param_size_t = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
+        rnn_mode, num_layers, num_units, input_size, direction=direction)
+    init_val = random_ops.random_uniform([param_size_t])
+    return variable_scope.get_variable(
+        name or "opaque_param", initializer=init_val, validate_shape=False)
+
+  def _create_saveable(self, opaque_param, rnn_mode, num_units, input_size,
+                       num_layers, direction):
+    if rnn_mode == CUDNN_LSTM:
+      fn = cudnn_rnn_ops.CudnnLSTMSaveable
+    elif rnn_mode == CUDNN_GRU:
+      fn = cudnn_rnn_ops.CudnnGRUSaveable
+    elif rnn_mode == CUDNN_RNN_TANH:
+      fn = cudnn_rnn_ops.CudnnRNNTanhSaveable
+    elif rnn_mode == CUDNN_RNN_RELU:
+      fn = cudnn_rnn_ops.CudnnRNNReluSaveable
+    saveable = fn(
+        opaque_param, num_layers, num_units, input_size, direction=direction)
+    return saveable
+
+  def _compare_weights(self, lhs, rhs):
+    self.assertLen(rhs, len(lhs))
+    for lw, rw in zip(lhs, rhs):
+      self.assertAllEqual(lw, rw)
+
+  def _compare_biases(self, lhs, rhs):
+    self.assertLen(rhs, len(lhs))
+    for lf, rt in zip(lhs, rhs):
+      self.assertAllEqual(lf, rt)
+
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, "time", "batch_size", **{
+              "rnn_mode": [
+                  CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH
+              ],
+              "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION]
+          }))
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_save_restore_variable(self, rnn_mode, num_units, input_size,
+                                 num_layers, direction):
+    # Verify the restored opaque param, once converted to tf_canonical format,
+    # is the same as the tf canonicals of the pre-restored param.
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      opaque_param = self._create_opaque_param(rnn_mode, num_units, input_size,
+                                               num_layers, direction)
+      saveable = self._create_saveable(opaque_param, rnn_mode, num_units,
+                                       input_size, num_layers, direction)
+      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+      weights_op, biases_op = saveable.format_converter.opaque_to_tf_canonical(
+          saveable._variables)
+
+      save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test")
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      init_op = variables.global_variables_initializer()
+      reset_op = state_ops.assign(opaque_param,
+                                  array_ops.zeros_like(opaque_param))
+      sess.run(init_op)
+      self.assertEqual(save_path, saver.save(sess, save_path))
+
+      # Get the tf canonical vals before reset-restore
+      weights, biases = sess.run([weights_op, biases_op])
+
+      # Reset the opaque param value
+      sess.run(reset_op)
+      # Assert reset happened.
+      weights_z, biases_z = sess.run([weights_op, biases_op])
+      for w in weights_z:
+        self.assertAllClose(w, np.zeros_like(w))
+      for b in biases_z:
+        self.assertAllClose(b, np.zeros_like(b))
+
+      # Restore opaque param value from checkpoint.
+      saver.restore(sess, save_path)
+      weights_r, biases_r = sess.run([weights_op, biases_op])
+      self._compare_weights(weights, weights_r)
+      self._compare_biases(biases, biases_r)
+
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, "time", "batch_size", **{
+              "rnn_mode": [
+                  CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH
+              ],
+              "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION]
+          }))
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_save_restore_multi_variables(self, rnn_mode, num_units, input_size,
+                                        num_layers, direction):
+    # Verify the restored opaque param, once converted to tf_canonical format,
+    # is the same as the tf canonicals of the pre-restored param.
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      opaque_params = []
+      saveables = []
+      num_opaque_params = 2
+      for i in range(num_opaque_params):
+        opaque_params.append(
+            self._create_opaque_param(
+                rnn_mode,
+                num_units,
+                input_size,
+                num_layers,
+                direction,
+                name="opaque_param_%d" % i))
+        saveable = self._create_saveable(opaque_params[i], rnn_mode, num_units,
+                                         input_size, num_layers, direction)
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+        saveables.append(saveable)
+
+      weights_ops, biases_ops = [], []
+      for i in range(num_opaque_params):
+        weights_op, biases_op = (
+            saveables[i].format_converter.opaque_to_tf_canonical(
+                saveables[i]._variables))
+        weights_ops.append(weights_op)
+        biases_ops.append(biases_op)
+
+      save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test")
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      init_op = variables.global_variables_initializer()
+      reset_ops = []
+      for i in range(num_opaque_params):
+        reset_ops.append(
+            state_ops.assign(opaque_params[i],
+                             array_ops.zeros_like(opaque_params[i])))
+      sess.run(init_op)
+      self.assertEqual(save_path, saver.save(sess, save_path))
+
+      # Get the tf canonical vals before reset-restore
+      for i in range(num_opaque_params):
+        weights, biases = sess.run([weights_ops[i], biases_ops[i]])
+
+        # Reset the opaque param value
+        sess.run(reset_ops[i])
+
+        # Assert reset happened.
+        weights_z, biases_z = sess.run([weights_ops[i], biases_ops[i]])
+        for w in weights_z:
+          self.assertAllClose(w, np.zeros_like(w))
+        for b in biases_z:
+          self.assertAllClose(b, np.zeros_like(b))
+
+        # Restore opaque param value from checkpoint.
+        saver.restore(sess, save_path)
+        weights_r, biases_r = sess.run([weights_ops[i], biases_ops[i]])
+        self._compare_weights(weights, weights_r)
+        self._compare_biases(biases, biases_r)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 57793a8ff5e2ec49dfea42c08eb9456cb2875eab..7e1b4062ce435f3ab4216e90b4f5fcbab984c1dc 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -536,7 +536,9 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
       save_path = os.path.join(self.get_temp_dir(),
                                "save-restore-variable-test")
       saver = saver_lib.Saver()
-      weights, biases = model.rnn.saveable._OpaqueParamsToCanonical()
+      weights, biases = (
+          model.rnn.saveable.format_converter._opaque_to_cu_canonical(
+              model.rnn.saveable._variables))
       opaque_params = rnn.trainable_variables[0]
       # CudnnTestModel() creates CudnnOpaqueParamsSaveable that helps saver save
       # Cudnn vars in canonical format.
@@ -583,8 +585,12 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
             dtype=dtype)
       opaque_params = (model1.rnn.trainable_variables[0],
                        model2.rnn.trainable_variables[0])
-      weights1, biases1 = model1.rnn.saveable._OpaqueParamsToCanonical()
-      weights2, biases2 = model2.rnn.saveable._OpaqueParamsToCanonical()
+      saveable1 = model1.rnn.saveable
+      weights1, biases1 = saveable1.format_converter._opaque_to_cu_canonical(
+          saveable1._variables)
+      saveable2 = model1.rnn.saveable
+      weights2, biases2 = saveable2.format_converter._opaque_to_cu_canonical(
+          saveable2._variables)
       reset_params = [
           state_ops.assign(params,
                            array_ops.zeros_like(params, dtype=dtype))
@@ -1039,8 +1045,8 @@ class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase):
 
     # Min param size estimate = sum(weights.size) + sum(biases.size)
     min_params_size = (
-        np.sum(map(np.prod, rnn.canonical_weight_shapes)) +
-        np.sum([sp[0] for sp in rnn.canonical_bias_shapes]))
+        sum(map(np.prod, rnn.canonical_weight_shapes)) +
+        sum(sp[0] for sp in rnn.canonical_bias_shapes))
 
     opaque_params = rnn.trainable_variables[0]
     with self.test_session(use_gpu=True, graph=ops.get_default_graph()):
@@ -1184,7 +1190,8 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
 
     num_grads = [self._ComputeNumericGrad(sess, y, x, delta) for x in xs]
     self.assertEqual(len(sym_grads), len(num_grads))
-    for sym, num in zip(sym_grads, num_grads):
+    for x, sym, num in zip(xs, sym_grads, num_grads):
+      logging.info("Comparing gradients for input: %s", x.name)
       self.assertFalse(np.any(np.isnan(sym)))
       self.assertFalse(np.any(np.isnan(num)))
       self.assertAllClose(sym, num, atol=tolerance, rtol=tolerance)
@@ -1225,18 +1232,18 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
     params = rnn.trainable_variables[0]
 
     inputs = variables.Variable(
-        random_ops.random_uniform(
-            [seq_length, batch_size, input_size], dtype=dtype),
-        dtype=dtype)
+        random_ops.random_uniform([seq_length, batch_size, input_size],
+                                  dtype=dtype),
+        dtype=dtype).read_value()
     input_h = variables.Variable(
         random_ops.random_uniform(
             [num_layers * dir_count, batch_size, num_units], dtype=dtype),
-        dtype=dtype)
+        dtype=dtype).read_value()
     if has_input_c:
       input_c = variables.Variable(
           random_ops.random_uniform(
               [num_layers * dir_count, batch_size, num_units], dtype=dtype),
-          dtype=dtype)
+          dtype=dtype).read_value()
       initial_state = (input_h, input_c)
     else:
       initial_state = (input_h,)
@@ -1262,7 +1269,7 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
 
   def _TestSimpleTrainingHelper(self, rnn_mode, test_configs):
     dropouts = [0, 0.5, 1.]
-    v2_options = [str(False), str(True)]
+    v2_options = [False, True]
     for config, dropout, use_v2 in itertools.product(test_configs, dropouts,
                                                      v2_options):
       dtype = config.get("dtype", dtypes.float32)
@@ -1270,6 +1277,9 @@ class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
       tolerance = config.get("tolerance", 1e-6)
       dir_count = config.get("dir_count", 1)
       shape = config["shape"]
+      if dtype == dtypes.float64:
+        # TODO(jamesqin): b/117848763
+        use_v2 = False
       with ops.Graph().as_default():
         self._TestOneSimpleTraining(
             rnn_mode, shape["num_layers"], shape["num_units"],
@@ -1519,7 +1529,7 @@ if __name__ == "__main__":
   parser.add_argument(
       "--grad_check_num_samples",
       type=int,
-      default=5,
+      default=1,
       help="Number of samples to run for gradient check.")
   FLAGS, unparsed = parser.parse_known_args()
   sys.argv = [argv0] + unparsed
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py b/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py
index f09466b631f69d6234573dd5eafada650421c117..60229af374be869005139921483793156e5e7a05 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py
@@ -27,5 +27,10 @@ from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibl
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibleLSTMCell
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnGRUSaveable
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTMSaveable
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterGRU
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterLSTM
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterRelu
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnParamsFormatConverterTanh
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNReluSaveable
 from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNTanhSaveable
+
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index e26d56c8579e110d61c73c6154b82f47f0093687..8e25637ed91a1559b321ea96efbfaa2910f67158 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -21,6 +21,7 @@ from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -322,7 +323,7 @@ class _CudnnRNN(base_layer.Layer):
       raise ValueError("The last dimension of the inputs to `CudnnRNN` "
                        "should be defined. Found `None`.")
     self._input_size = input_shape[-1].value
-    self.input_spec = base_layer.InputSpec(ndim=3, axes={-1: self._input_size})
+    self.input_spec = input_spec.InputSpec(ndim=3, axes={-1: self._input_size})
 
     self._set_scope(None)
 
@@ -356,7 +357,8 @@ class _CudnnRNN(base_layer.Layer):
             "Partitioner is not supported for Cudnn RNN layer variables, using "
             "it will create forward-compatibility issues with future "
             "CUDA/CuDNN generations.")
-      # Initialize opaque params with a tensor.
+      # Initialize opaque params with a tensor with unknown shape, thus couldn't
+      # use self.add_variable(name, shape, initializer, ...)
       self.kernel = vs.get_variable(
           "opaque_kernel", dtype=self._plain_dtype,
           initializer=opaque_params_t, validate_shape=False)
@@ -387,11 +389,11 @@ class _CudnnRNN(base_layer.Layer):
       output_states: a tuple of tensor(s) of the same shape and structure as
         `initial_state`.
     Raises:
-      ValueError: initial_state is not a tuple.
+      TypeError: initial_state is not a tuple.
     """
     if initial_state is not None and not isinstance(initial_state, tuple):
-      raise ValueError("Invalid initial_state type: %s, expecting tuple.",
-                       type(initial_state))
+      raise TypeError("Invalid initial_state type: %s, expecting tuple." %
+                      initial_state)
     dtype = self.dtype
     inputs = ops.convert_to_tensor(inputs, dtype=dtype)
 
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 2c92f31788378c2a9f01183bc04b035668b59b59..1ce29b42d52ff67477161278ed11016c2e73041d 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -74,7 +74,7 @@ class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell):
 
 
 class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
-  """Cudnn Compatible GRUCell.
+  r"""Cudnn Compatible GRUCell.
 
   A GRU impl akin to `tf.nn.rnn_cell.GRUCell` to use along with
   `tf.contrib.cudnn_rnn.CudnnGRU`. The latter's params can be used by
@@ -177,172 +177,60 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
     return new_h, new_h
 
 
-# TODO(yaozhang): make sure we only save the canonical version of params and
-# don't save the platform-specific version to avoid potential race
-# conditions where params is updated by both versions when being restored.
-# Currently, checkpointing will function properly, despite that we save both
-# versions, because Saver restores customized savables after Variables.
-# However, it is good to not rely on this restoring order of Saver and to
-# avoid unnecessary storage. Add a test to check only the canonical version is
-# saved.
-class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
-  """Abstract SaveableObject implementation handling Cudnn opaque params."""
+class CudnnParamsFormatConverter(object):
+  """Abstract class that converts between params of Cudnn Rnn and TF Rnn."""
 
   def __init__(self,
-               opaque_params,
                num_layers,
                num_units,
                input_size,
                input_mode=CUDNN_INPUT_LINEAR_MODE,
-               direction=CUDNN_RNN_UNIDIRECTION,
-               scope=None,
-               name="cudnn_rnn_saveable"):
-    """Creates a CudnnOpaqueParamsSaveable object.
-
-       CudnnOpaqueParamsSaveable is saveable/restorable in a checkpoint file
-       and is used to save/restore the weights and biases parameters in a
-       canonical format which is directly consumable by platform-independent tf
-       RNN cells. Parameters are saved as tensors layer by layer with weight
-       tensors followed by bias tensors, and forward direction followed by
-       backward direction (if applicable). When restoring, a user could name
-       param_variables as desired, and restore weight and bias tensors to these
-       variables.
-
-       For CudnnRNNRelu or CudnnRNNTanh, there are 2 tensors per weight and per
-       bias for each layer: tensor 0 is applied to the input from the previous
-       layer and tensor 1 to the recurrent input.
-
-       For CudnnLSTM, there are 8 tensors per weight and per bias for each
-       layer: tensor 0-3 are applied to the input from the previous layer and
-       tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate;
-       tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate;
-       tensor 3 and 7 the output gate.
-
-       For CudnnGRU, there are 6 tensors per weight and per bias for each layer:
-       tensor 0-2 are applied to the input from the previous layer and
-       tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate;
-       tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate.
+               direction=CUDNN_RNN_UNIDIRECTION):
+    """Constructor.
 
     Args:
-      opaque_params: a variable, Cudnn RNN opaque params.
       num_layers: the number of layers for the RNN model.
       num_units: the number of units within the RNN model.
       input_size: the size of the input, it could be different from the
-          num_units.
+        num_units.
       input_mode: indicate whether there is a linear projection between the
-          input and the actual computation before the first layer. It could be
-          'linear_input', 'skip_input' or 'auto_select'.
-          'linear_input' (default) always applies a linear projection of input
-          onto RNN hidden state. (standard RNN behavior).
-          'skip_input' is only allowed when input_size == num_units;
-          'auto_select' implies 'skip_input' when input_size == num_units;
-          otherwise, it implies 'linear_input'.
+        input and the actual computation before the first layer. It could be one
+        of 'linear_input', 'skip_input' or 'auto_select'. * 'linear_input'
+        (default) always applies a linear projection of input onto RNN hidden
+        state. (standard RNN behavior). * 'skip_input' is only allowed when
+        input_size == num_units; * 'auto_select' implies 'skip_input' when
+        input_size == num_units; otherwise, it implies 'linear_input'.
       direction: the direction model that the model operates. Could be either
-          'unidirectional' or 'bidirectional'
-      scope: string of VariableScope, the scope of equivalent subgraph
-          consisting only platform-independent tf RNN cells.
-      name: the name of the CudnnOpaqueParamsSaveable object.
+        'unidirectional' or 'bidirectional'
     """
-    # Define in subclasses.
     self._num_layers = num_layers
     self._input_size = input_size
     self._num_units = num_units
     self._input_mode = input_mode
     self._direction = direction
-    if scope is not None:
-      scope_name = scope.name if isinstance(scope, vs.VariableScope) else scope
-      self._scope = scope_name or None
-    else:
-      self._scope = None
-
-    self._variables = opaque_params
     self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2
     self._num_params = (
         self._num_params_per_layer * self._num_layers * self._num_dirs)
 
-    weights, biases = self._OpaqueParamsToCanonical()
-    (weights, weight_names), (biases, bias_names) = self._TransformCanonical(
-        weights, biases)
-    # We currently don't use slice_spec. It might be useful in a distributed
-    # setting where each parameter server node stores a slice of variable,
-    # instead of having the master pull all slices and then save them.
-    slice_spec = ""
-    params = weights + biases
-    self._weight_names = weight_names
-    self._bias_names = bias_names
-    self._param_names = weight_names + bias_names
-    prefixed_param_names = weight_names + bias_names
-    if self._scope:
-      prefixed_param_names = [
-          "%s/%s" % (self._scope, pn) for pn in prefixed_param_names]
-    specs = [
-        saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
-        for param, param_name in zip(params, prefixed_param_names)
-    ]
-    super(CudnnOpaqueParamsSaveable, self).__init__(
-        array_ops.identity(self._variables), specs, name)
-
-  def restore(self, restored_tensors, restored_shapes):
-    weights, biases = self._ReverseTransformCanonical(restored_tensors)
-    weights = [array_ops.reshape(w, [-1]) for w in weights]
-    opaque_params = self._CanonicalToOpaqueParams(weights, biases)
-
-    return state_ops.assign(
-        self._variables, opaque_params, validate_shape=False)
+  def tf_canonical_to_opaque(self, tf_canonicals):
+    r"""Converts tf canonical weights to cudnn opaque param."""
+    cu_weights, cu_biases = self._tf_canonical_to_cu_canonical(tf_canonicals)
+    cu_weights = [array_ops.reshape(w, [-1]) for w in cu_weights]
+    opaque_params = self._cu_canonical_to_opaque(cu_weights, cu_biases)
+    return opaque_params
 
-  def _checkpointable_save(self, save_buffer):
-    weights, biases = self._OpaqueParamsToCanonical()
-    with ops.device("gpu:0"):
-      (weights, _), (biases, _) = self._TransformCanonical(
-          weights, biases)
-    for name, tensor in zip(self._param_names, weights + biases):
-      save_buffer[name] = array_ops.identity(tensor)
+  def opaque_to_tf_canonical(self, opaque_param):
+    r"""Converts cudnn opaque param to tf canonical weights."""
+    cu_weights, cu_biases = self._opaque_to_cu_canonical(opaque_param)
+    weights, biases = self._cu_canonical_to_tf_canonical(cu_weights, cu_biases)
+    return weights, biases
 
-  def _checkpointable_restore(self, restore_buffer):
-    tensors = [array_ops.identity(restore_buffer[name])
-               for name in self._param_names]
-    return self.restore(
-        restored_tensors=tensors,
-        restored_shapes=None  # Unused
-    )
-
-  def _add_checkpointable_dependencies(self, checkpointable, dtype):
-    """Add canonical weight dependencies to `checkpointable`.
-
-    When saving or restoring, converts to or from the opaque buffer
-    format. Weights are saved and loaded in the configuration expected by
-    cuDNN-compatible cells.
-
-    Args:
-      checkpointable: An object inheriting from `CheckpointableBase` to add
-        dependencies too (typically the cuDNN `Layer`).
-      dtype: The dtype for the canonical parameter Tensors.
-    """
-    split_dependencies = split_dependency.split_dependency(
-        component_names=self._param_names,
-        component_dtypes=(dtype,) * len(self._param_names),
-        fill_save_buffer_fn=self._checkpointable_save,
-        consume_restore_buffer_fn=self._checkpointable_restore)
-    self._checkpointable_track_params(checkpointable, split_dependencies)
-
-  def _checkpointable_track_params(self, checkpointable, params):
-    """Tracks parameters in a canonical configuration."""
-    return  # NotImplementedError raised by the Layer.
-
-  def _TFCanonicalNamePrefix(self, layer, is_fwd=True):
-    if self._direction == CUDNN_RNN_UNIDIRECTION:
-      return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
-    else:
-      if is_fwd:
-        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/fw/%s" %
-                (layer, self._rnn_cell_name))
-      else:
-        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/bw/%s" %
-                (layer, self._rnn_cell_name))
-
-  def _OpaqueParamsToCanonical(self):
+  def _opaque_to_cu_canonical(self, opaque_param):
     """Converts opaque params to Cudnn canonical format.
 
+    Args:
+      opaque_param: An opaque tensor storing cudnn rnn params (weights and
+        biases).
     Returns:
       2 list for weights and biases respectively.
     """
@@ -351,14 +239,14 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
           num_layers=self._num_layers,
           num_units=self._num_units,
           input_size=self._input_size,
-          params=self._variables,
+          params=opaque_param,
           num_params=self._num_params,
           rnn_mode=self._rnn_mode,
           input_mode=self._input_mode,
           direction=self._direction)
       return (weights, biases)
 
-  def _CanonicalToOpaqueParams(self, cu_weights, cu_biases):
+  def _cu_canonical_to_opaque(self, cu_weights, cu_biases):
     """Converts from Cudnn canonical format to opaque params.
 
     Args:
@@ -378,7 +266,7 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
           input_mode=self._input_mode,
           direction=self._direction)
 
-  def _TransformCanonical(self, cu_weights, cu_biases):
+  def _cu_canonical_to_tf_canonical(self, cu_weights, cu_biases):
     r"""Transform from Cudnn canonical to tf canonical.
 
     The elements of argument lists are laid out in the following format:
@@ -398,46 +286,43 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
       cu_weights: a list of tensors of Cudnn canonical weights.
       cu_biases: a list of tensors of Cudnn canonical biases.
     Returns:
-      2 tuples, one for weights and the other for bias.
-      Each tuple has two lists: the 1st for transformed tf canonical tensors
-      and the 2nd for the names of the tensors under which they are saved.
+      1 tuple, tf canonical weights and biases.
     """
     tf_weights, tf_biases = [], []
-    tf_weights_names, tf_bias_names = [], []
 
     layer_weights_num = self._num_params_per_layer * self._num_dirs
     layer_biases_num = layer_weights_num
 
     for i in range(self._num_layers):
-      layer_weights = cu_weights[i * layer_weights_num:
-                                 (i + 1) * layer_weights_num]
+      layer_weights = cu_weights[i * layer_weights_num:(i + 1) *
+                                 layer_weights_num]
       layer_biases = cu_biases[i * layer_biases_num:(i + 1) * layer_biases_num]
       if self._direction == CUDNN_RNN_UNIDIRECTION:
-        prefix = self._TFCanonicalNamePrefix(i)
-        self._TransformSingleLayerCanonical(layer_weights, layer_biases, prefix,
-                                            tf_weights, tf_weights_names,
-                                            tf_biases, tf_bias_names)
+        self._cu_canonical_to_tf_canonical_single_layer(
+            layer_weights, layer_biases, tf_weights, tf_biases)
       else:
-        fw_prefix = self._TFCanonicalNamePrefix(i, is_fwd=True)
-        bw_prefix = self._TFCanonicalNamePrefix(i, is_fwd=False)
-
         fw_weights = layer_weights[:len(layer_weights) // 2]
         bw_weights = layer_weights[len(layer_weights) // 2:]
         fw_biases = layer_biases[:len(layer_biases) // 2]
         bw_biases = layer_biases[len(layer_biases) // 2:]
 
-        self._TransformSingleLayerCanonical(fw_weights, fw_biases, fw_prefix,
-                                            tf_weights, tf_weights_names,
-                                            tf_biases, tf_bias_names)
-
-        self._TransformSingleLayerCanonical(bw_weights, bw_biases, bw_prefix,
-                                            tf_weights, tf_weights_names,
-                                            tf_biases, tf_bias_names)
-    return (tf_weights, tf_weights_names), (tf_biases, tf_bias_names)
-
-  def _TransformSingleLayerCanonical(self, cu_weights, cu_biases, prefix,
-                                     tf_weights, tf_weights_names, tf_biases,
-                                     tf_bias_names):
+        self._cu_canonical_to_tf_canonical_single_layer(
+            fw_weights,
+            fw_biases,
+            tf_weights,
+            tf_biases,
+        )
+
+        self._cu_canonical_to_tf_canonical_single_layer(
+            bw_weights,
+            bw_biases,
+            tf_weights,
+            tf_biases,
+        )
+    return (tf_weights, tf_biases)
+
+  def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
+                                                 tf_weights, tf_biases):
     r"""Transform single layer Cudnn canonicals to tf canonicals.
 
     The elements of cu_weights, cu_biases are laid out in the following format:
@@ -447,15 +332,12 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     Args:
       cu_weights: a list of tensors, single layer weights.
       cu_biases: a list of tensors, single layer biases.
-      prefix: the shared prefix of all tensor names.
       tf_weights: a list where transformed weights are stored.
-      tf_weights_names: a list where names of transformed weights are stored.
       tf_biases: a list where transformed biases are stored.
-      tf_bias_names: a list where names of transformed biases are stored.
     """
     raise NotImplementedError("Abstract method")
 
-  def _ReverseTransformCanonical(self, tf_canonicals):
+  def _tf_canonical_to_cu_canonical(self, tf_canonicals):
     r"""Transform from tf canonical to Cudnn canonical.
 
     This is the reverse routine of _TransformCanonical().
@@ -502,30 +384,27 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     return cu_weights, cu_biases
 
   def _cudnn_to_tf_weights(self, *cu_weights):
-    r"""Stitching cudnn canonical weights to generate tf canonical weights."""
+    r"""Stitches cudnn canonical weights to generate tf canonical weights."""
     raise NotImplementedError("Abstract method")
 
   def _tf_to_cudnn_weights(self, layer, *tf_weights):
-    r"""Reverse the operations in StitchWeights()."""
+    r"""Reverses the operations in StitchWeights()."""
     raise NotImplementedError("Abstract method")
 
   def _cudnn_to_tf_biases(self, *biases):
-    r"""Stitching cudnn canonical biases to generate tf canonical biases."""
+    r"""Stitches cudnn canonical biases to generate tf canonical biases."""
     raise NotImplementedError("Abstract method")
 
   def _tf_to_cudnn_biases(self, *tf_biases):
-    r"""Reverse the operations in StitchBiases()."""
+    r"""Reverses the operations in StitchBiases()."""
     raise NotImplementedError("Abstract method")
 
 
-class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
-  """SaveableObject implementation handling Cudnn LSTM opaque params."""
-
+class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter):
+  """Helper class that converts between params of Cudnn and TF LSTM."""
   _rnn_mode = CUDNN_LSTM
   _num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
 
-  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)
-
   def _cudnn_to_tf_gate_params(self, *cu_gate_order):
     i_g, f_g, c_g, o_g = cu_gate_order
     return [i_g, c_g, f_g, o_g]
@@ -603,44 +482,16 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
     # Return ifco order for Cudnn LSTM.
     return b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro
 
-  def _TransformSingleLayerCanonical(self, weights, biases, prefix, tf_weights,
-                                     tf_weights_names, tf_biases,
-                                     tf_bias_names):
-    (w,) = self._cudnn_to_tf_weights(*weights)
-    (b,) = self._cudnn_to_tf_biases(*biases)
-
+  def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
+                                                 tf_weights, tf_biases):
+    (w,) = self._cudnn_to_tf_weights(*cu_weights)
+    (b,) = self._cudnn_to_tf_biases(*cu_biases)
     tf_weights.append(w)
-    tf_weights_names.append(prefix + "/kernel")
-
     tf_biases.append(b)
-    tf_bias_names.append(prefix + "/bias")
-
-  def _checkpointable_track_params(self, checkpointable, params):
-    """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
-    biases = []
-    weights = []
-    for name in self._weight_names:
-      weights.append(params[name])
-    for name in self._bias_names:
-      biases.append(params[name])
-    assert len(params) == len(weights) + len(biases)
-    if len(weights) == 1 and len(biases) == 1:
-      # For single-layer cells, allow substituting a cell with no MultiRNNCell
-      # wrapping.
-      kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
-      bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
-      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
-      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
-    assert len(biases) == len(weights)
-    for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
-      cell = checkpointable_lib.Checkpointable()
-      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
-      cell.bias = bias
-      cell.kernel = kernel
 
 
-class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
-  """SaveableObject implementation handling Cudnn GRU opaque params."""
+class CudnnParamsFormatConverterGRU(CudnnParamsFormatConverter):
+  """Helper class that converts between params of Cudnn and TF GRU."""
 
   _rnn_mode = CUDNN_GRU
   _num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
@@ -702,29 +553,18 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
     b_ri, b_rr = array_ops.split(br, 2, axis=0)
     return b_wi, b_wr, b_wh, b_ri, b_rr, b_rh
 
-  def _TransformSingleLayerCanonical(self, weights, biases, prefix, tf_weights,
-                                     tf_weights_names, tf_biases,
-                                     tf_bias_names):
+  def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
+                                                 tf_weights, tf_biases):
     # pylint: disable=invalid-name
-    W_ir, w_h, r_h = self._cudnn_to_tf_weights(*weights)
-    b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*biases)
+    W_ir, w_h, r_h = self._cudnn_to_tf_weights(*cu_weights)
+    b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*cu_biases)
     # pylint: enable=invalid-name
-
     tf_weights.extend([W_ir, w_h, r_h])
-    tf_weights_names.append(prefix + "/gates/kernel")
-    tf_weights_names.append(prefix + "/candidate/input_projection/kernel")
-    tf_weights_names.append(prefix + "/candidate/hidden_projection/kernel")
-
     tf_biases.extend([b_ir, b_wh, b_rh])
-    tf_bias_names.append(prefix + "/gates/bias")
-    tf_bias_names.append(prefix + "/candidate/input_projection/bias")
-    tf_bias_names.append(prefix + "/candidate/hidden_projection/bias")
-
 
-class CudnnRNNSimpleSaveable(CudnnLSTMSaveable):
-  """SaveableObject implementation handling Cudnn RNN Tanh opaque params."""
 
-  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
+class CudnnParamsFormatConverterBasic(CudnnParamsFormatConverterLSTM):
+  """Helper class that converts between params of Cudnn and TF Relu/Tanh RNN."""
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
@@ -766,18 +606,270 @@ class CudnnRNNSimpleSaveable(CudnnLSTMSaveable):
     return b_i, b_h
 
 
-class CudnnRNNTanhSaveable(CudnnRNNSimpleSaveable):
-  """SaveableObject implementation handling Cudnn RNN Tanh opaque params."""
+class CudnnParamsFormatConverterTanh(CudnnParamsFormatConverterBasic):
+  """Helper class that converts between params of Cudnn and TF Tanh RNN."""
   _rnn_mode = CUDNN_RNN_TANH
   _num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER
 
 
-class CudnnRNNReluSaveable(CudnnRNNSimpleSaveable):
-  """SaveableObject implementation handling Cudnn RNN Relu opaque params."""
+class CudnnParamsFormatConverterRelu(CudnnParamsFormatConverterBasic):
+  """Helper class that converts between params of Cudnn and TF Relu RNN."""
   _rnn_mode = CUDNN_RNN_RELU
   _num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER
 
 
+# TODO(yaozhang): make sure we only save the canonical version of params and
+# don't save the platform-specific version to avoid potential race
+# conditions where params is updated by both versions when being restored.
+# Currently, checkpointing will function properly, despite that we save both
+# versions, because Saver restores customized savables after Variables.
+# However, it is good to not rely on this restoring order of Saver and to
+# avoid unnecessary storage. Add a test to check only the canonical version is
+# saved.
+class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Abstract SaveableObject implementation handling Cudnn opaque params."""
+
+  def __init__(self,
+               opaque_params,
+               num_layers,
+               num_units,
+               input_size,
+               input_mode=CUDNN_INPUT_LINEAR_MODE,
+               direction=CUDNN_RNN_UNIDIRECTION,
+               scope=None,
+               name="cudnn_rnn_saveable"):
+    """Creates a CudnnOpaqueParamsSaveable object.
+
+       CudnnOpaqueParamsSaveable is saveable/restorable in a checkpoint file
+       and is used to save/restore the weights and biases parameters in a
+       canonical format which is directly consumable by platform-independent tf
+       RNN cells. Parameters are saved as tensors layer by layer with weight
+       tensors followed by bias tensors, and forward direction followed by
+       backward direction (if applicable). When restoring, a user could name
+       param_variables as desired, and restore weight and bias tensors to these
+       variables.
+
+       For CudnnRNNRelu or CudnnRNNTanh, there are 2 tensors per weight and per
+       bias for each layer: tensor 0 is applied to the input from the previous
+       layer and tensor 1 to the recurrent input.
+
+       For CudnnLSTM, there are 8 tensors per weight and per bias for each
+       layer: tensor 0-3 are applied to the input from the previous layer and
+       tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate;
+       tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate;
+       tensor 3 and 7 the output gate.
+
+       For CudnnGRU, there are 6 tensors per weight and per bias for each layer:
+       tensor 0-2 are applied to the input from the previous layer and
+       tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate;
+       tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate.
+
+    Args:
+      opaque_params: a variable, Cudnn RNN opaque params.
+      num_layers: the number of layers for the RNN model.
+      num_units: the number of units within the RNN model.
+      input_size: the size of the input, it could be different from the
+        num_units.
+      input_mode: indicate whether there is a linear projection between the
+        input and the actual computation before the first layer. It could be
+        'linear_input', 'skip_input' or 'auto_select'. 'linear_input' (default)
+        always applies a linear projection of input onto RNN hidden state.
+        (standard RNN behavior). 'skip_input' is only allowed when input_size ==
+        num_units; 'auto_select' implies 'skip_input' when input_size ==
+        num_units; otherwise, it implies 'linear_input'.
+      direction: the direction model that the model operates. Could be either
+        'unidirectional' or 'bidirectional'
+      scope: string of VariableScope, the scope of equivalent subgraph
+        consisting only platform-independent tf RNN cells.
+      name: the name of the CudnnOpaqueParamsSaveable object.
+    """
+    # Define in subclasses.
+    self._num_layers = num_layers
+    self._input_size = input_size
+    self._num_units = num_units
+    self._input_mode = input_mode
+    self._direction = direction
+    if scope is not None:
+      scope_name = scope.name if isinstance(scope, vs.VariableScope) else scope
+      self._scope = scope_name or None
+    else:
+      self._scope = None
+
+    self._variables = opaque_params
+    self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2
+    # Defined in subclasses.
+    self._format_converter = None
+
+    tf_weights, tf_biases = (
+        self.format_converter.opaque_to_tf_canonical(self._variables))
+    tf_weight_names, tf_bias_names = self._tf_canonical_names()
+    # We currently don't use slice_spec. It might be useful in a distributed
+    # setting where each parameter server node stores a slice of variable,
+    # instead of having the master pull all slices and then save them.
+    slice_spec = ""
+    params = tf_weights + tf_biases
+    self._weight_names = tf_weight_names
+    self._bias_names = tf_bias_names
+    self._param_names = tf_weight_names + tf_bias_names
+    prefixed_param_names = tf_weight_names + tf_bias_names
+    if self._scope:
+      prefixed_param_names = [
+          "%s/%s" % (self._scope, pn) for pn in prefixed_param_names
+      ]
+    specs = [
+        saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
+        for param, param_name in zip(params, prefixed_param_names)
+    ]
+    super(CudnnOpaqueParamsSaveable, self).__init__(
+        array_ops.identity(self._variables), specs, name)
+
+  @property
+  def format_converter(self):
+    if self._format_converter is None:
+      self._format_converter = self._format_converter_cls(
+          self._num_layers, self._num_units, self._input_size, self._input_mode,
+          self._direction)
+    return self._format_converter
+
+  def restore(self, restored_tensors, restored_shapes):
+    opaque_params = self.format_converter.tf_canonical_to_opaque(
+        restored_tensors)
+    return state_ops.assign(
+        self._variables, opaque_params, validate_shape=False)
+
+  def _checkpointable_save(self, save_buffer):
+    weights, biases = self.format_converter.opaque_to_tf_canonical(
+        self._variables)
+    for name, tensor in zip(self._param_names, weights + biases):
+      save_buffer[name] = array_ops.identity(tensor)
+
+  def _checkpointable_restore(self, restore_buffer):
+    tensors = [
+        array_ops.identity(restore_buffer[name]) for name in self._param_names
+    ]
+    return self.restore(
+        restored_tensors=tensors,
+        restored_shapes=None  # Unused
+    )
+
+  def _add_checkpointable_dependencies(self, checkpointable, dtype):
+    """Add canonical weight dependencies to `checkpointable`.
+
+    When saving or restoring, converts to or from the opaque buffer
+    format. Weights are saved and loaded in the configuration expected by
+    cuDNN-compatible cells.
+
+    Args:
+      checkpointable: An object inheriting from `CheckpointableBase` to add
+        dependencies too (typically the cuDNN `Layer`).
+      dtype: The dtype for the canonical parameter Tensors.
+    """
+    split_dependencies = split_dependency.split_dependency(
+        component_names=self._param_names,
+        component_dtypes=(dtype,) * len(self._param_names),
+        fill_save_buffer_fn=self._checkpointable_save,
+        consume_restore_buffer_fn=self._checkpointable_restore)
+    self._checkpointable_track_params(checkpointable, split_dependencies)
+
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Tracks parameters in a canonical configuration."""
+    return  # NotImplementedError raised by the Layer.
+
+  def _tf_canonical_names(self):
+    tf_weights_names, tf_biases_names = [], []
+    for i in range(self._num_layers):
+      if self._direction == CUDNN_RNN_UNIDIRECTION:
+        prefix = self._tf_canonical_name_prefix(i)
+        self._tf_canonical_names_single_layer(prefix, tf_weights_names,
+                                              tf_biases_names)
+      else:
+        fwd_prefix = self._tf_canonical_name_prefix(i, is_fwd=True)
+        bak_prefix = self._tf_canonical_name_prefix(i, is_fwd=False)
+
+        self._tf_canonical_names_single_layer(fwd_prefix, tf_weights_names,
+                                              tf_biases_names)
+        self._tf_canonical_names_single_layer(bak_prefix, tf_weights_names,
+                                              tf_biases_names)
+    return tf_weights_names, tf_biases_names
+
+  def _tf_canonical_name_prefix(self, layer, is_fwd=True):
+    if self._direction == CUDNN_RNN_UNIDIRECTION:
+      return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
+    else:
+      if is_fwd:
+        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/fw/%s" %
+                (layer, self._rnn_cell_name))
+      else:
+        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/bw/%s" %
+                (layer, self._rnn_cell_name))
+
+  def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
+                                       tf_biases_names):
+    raise NotImplementedError("Abstract method")
+
+
+class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
+  """SaveableObject implementation handling Cudnn LSTM opaque params."""
+
+  _format_converter_cls = CudnnParamsFormatConverterLSTM
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)
+
+  def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
+                                       tf_bias_names):
+    tf_weights_names.append(prefix + "/kernel")
+    tf_bias_names.append(prefix + "/bias")
+
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
+    biases = []
+    weights = []
+    for name in self._weight_names:
+      weights.append(params[name])
+    for name in self._bias_names:
+      biases.append(params[name])
+    assert len(params) == len(weights) + len(biases)
+    if len(weights) == 1 and len(biases) == 1:
+      # For single-layer cells, allow substituting a cell with no MultiRNNCell
+      # wrapping.
+      kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
+      bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
+      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
+      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
+    assert len(biases) == len(weights)
+    for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
+      cell = checkpointable_lib.Checkpointable()
+      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
+      cell.bias = bias
+      cell.kernel = kernel
+
+
+class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
+  """SaveableObject implementation handling Cudnn GRU opaque params."""
+
+  _format_converter_cls = CudnnParamsFormatConverterGRU
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)
+
+  def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
+                                       tf_bias_names):
+    tf_weights_names.append(prefix + "/gates/kernel")
+    tf_weights_names.append(prefix + "/candidate/input_projection/kernel")
+    tf_weights_names.append(prefix + "/candidate/hidden_projection/kernel")
+
+    tf_bias_names.append(prefix + "/gates/bias")
+    tf_bias_names.append(prefix + "/candidate/input_projection/bias")
+    tf_bias_names.append(prefix + "/candidate/hidden_projection/bias")
+
+
+class CudnnRNNTanhSaveable(CudnnLSTMSaveable):
+  _format_converter_cls = CudnnParamsFormatConverterTanh
+  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
+
+
+class CudnnRNNReluSaveable(CudnnLSTMSaveable):
+  _format_converter_cls = CudnnParamsFormatConverterRelu
+  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
+
+
 _cudnn_rnn_common_doc_string = """
   Cudnn RNN has an opaque parameter buffer that can be used for inference and
   training. But it is possible that the layout of the parameter buffers
@@ -850,7 +942,7 @@ def _get_num_params(rnn_mode, num_layers, direction):
   elif rnn_mode == CUDNN_RNN_TANH:
     num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER
   else:
-    raise ValueError("Invalid \'rnn_mode\': %s", rnn_mode)
+    raise ValueError("Invalid \'rnn_mode\': %s" % rnn_mode)
   num_params = num_layers * num_params_per_layer
   if direction != CUDNN_RNN_UNIDIRECTION:
     num_params *= 2
@@ -918,7 +1010,7 @@ def _cudnn_rnn(inputs,
       "seed2": seed2,
       "name": name
   }
-  if use_cudnn_v2 is not "1":
+  if use_cudnn_v2 != "1":
     outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
   else:
     outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args)
@@ -1582,7 +1674,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
     """
 
     if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION):
-      raise ValueError("Invalid direction: %s", direction)
+      raise ValueError("Invalid direction: %s" % direction)
 
     super(_CudnnRNNNoInputC, self).__init__(
         self._rnn_mode,
diff --git a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
index 0456463a1928cf226010670b90a5d574579e0411..6c5f8c6b00975b3fba041271309a93cecd9f5057 100644
--- a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
@@ -46,7 +46,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
     self.assertEqual(expected_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -88,7 +88,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
     self.assertEqual(expected_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -115,9 +115,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
 
     wrong_shapes = (tensor_shape.TensorShape(2),
                     tensor_shape.TensorShape((3, 10)))
-    iterator = (
-        dataset.apply(batching.assert_element_shape(wrong_shapes))
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset.apply(batching.assert_element_shape(wrong_shapes)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -142,7 +141,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
                      tensor_shape.TensorShape((3, 4)))
     self.assertEqual(actual_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -184,7 +183,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
     self.assertEqual(expected_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -211,9 +210,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
 
     wrong_shapes = (tensor_shape.TensorShape(2),
                     tensor_shape.TensorShape((None, 10)))
-    iterator = (
-        dataset.apply(batching.assert_element_shape(wrong_shapes))
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset.apply(batching.assert_element_shape(wrong_shapes)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
index d2a72272db159755ac2d741bcdbce9ec646d928e..b9840b1ff1a3df5a05db0e64f436637220f49f80 100644
--- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
@@ -23,6 +23,7 @@ import shutil
 
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -48,7 +49,7 @@ class LMDBDatasetTest(test_base.DatasetTestBase):
     num_repeats = 2
 
     dataset = readers.LMDBDataset(filenames).repeat(num_repeats)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
index c5a786232252432481566e3cde23e9310df172cc..2527706709fae8e459aca3489324d4db3c784be6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
@@ -63,13 +63,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
     # RepeatDataset(count) ->
     # _SlideDataset(window_size, window_shift, window_stride).
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
         .repeat(count).apply(
             sliding.sliding_window_batch(
                 window_size=window_size_t,
                 window_shift=window_shift_t,
-                window_stride=window_stride_t)).make_initializable_iterator())
+                window_stride=window_stride_t)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -127,13 +127,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
     # RepeatDataset(count) -> _SlideDataset(window_size, stride, window_stride).
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
         .repeat(count).apply(
             sliding.sliding_window_batch(
                 window_size=window_size_t,
                 stride=stride_t,
-                window_stride=window_stride_t)).make_initializable_iterator())
+                window_stride=window_stride_t)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -173,12 +173,12 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     window_shift_t = array_ops.placeholder(dtypes.int64, shape=[])
     window_stride_t = array_ops.placeholder(dtypes.int64, shape=[])
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(10).map(lambda x: x).repeat(count_t).apply(
             sliding.sliding_window_batch(
                 window_size=window_size_t,
                 window_shift=window_shift_t,
-                window_stride=window_stride_t)).make_initializable_iterator())
+                window_stride=window_stride_t)))
     init_op = iterator.initializer
 
     with self.cached_session() as sess:
@@ -204,9 +204,9 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return sparse_tensor.SparseTensorValue(
           indices=[[0]], values=(i * [1]), dense_shape=[1])
 
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
-        sliding.sliding_window_batch(
-            window_size=5, window_shift=3)).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse).apply(
+            sliding.sliding_window_batch(window_size=5, window_shift=3)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -233,9 +233,9 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           values=array_ops.fill([math_ops.to_int32(i)], i),
           dense_shape=[i])
 
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
-        sliding.sliding_window_batch(
-            window_size=5, window_shift=3)).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse).apply(
+            sliding.sliding_window_batch(window_size=5, window_shift=3)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -265,11 +265,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return sparse_tensor.SparseTensorValue(
           indices=[[0]], values=(i * [1]), dense_shape=[1])
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(10).map(_sparse).apply(
             sliding.sliding_window_batch(window_size=4, window_shift=2)).apply(
-                sliding.sliding_window_batch(window_size=3, window_shift=1))
-        .make_initializable_iterator())
+                sliding.sliding_window_batch(window_size=3, window_shift=1)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -305,11 +304,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       yield [4.0, 5.0, 6.0]
       yield [7.0, 8.0, 9.0, 10.0]
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_generator(
             generator, dtypes.float32, output_shapes=[None]).apply(
-                sliding.sliding_window_batch(window_size=3, window_shift=1))
-        .make_initializable_iterator())
+                sliding.sliding_window_batch(window_size=3, window_shift=1)))
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 34dc2379d0cb38f8f6962fa42efe21b793bc8d65..0fb406f1167053a128646c5c692986b0ce016f1e 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -188,8 +188,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 4601376dff47e161962e92678883039c4b88bab7..c0152156a1ba70297adb7054622b15ca04f859cd 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -21,10 +21,9 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util import deprecation
 
@@ -355,7 +354,7 @@ def read_batch_features(file_pattern,
       shuffle=randomize_input,
       num_epochs=num_epochs,
       shuffle_buffer_size=capacity)
-  iterator = dataset.make_one_shot_iterator()
+  iterator = dataset_ops.make_one_shot_iterator(dataset)
   outputs = iterator.get_next()
   return outputs
 
@@ -379,15 +378,13 @@ class LMDBDataset(dataset_ops.DatasetSource):
     (key value) pairs sequentially.
     For example:
     ```python
+    tf.enable_eager_execution()
+
     dataset = tf.contrib.lmdb.LMDBDataset("/foo/bar.mdb")
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
+
     # Prints the (key, value) pairs inside a lmdb file.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
+    for key, value in dataset:
+      print(key, value)
     ```
     Args:
       filenames: A `tf.string` tensor containing one or more filenames.
@@ -398,18 +395,10 @@ class LMDBDataset(dataset_ops.DatasetSource):
 
   def _as_variant_tensor(self):
     return gen_experimental_dataset_ops.experimental_lmdb_dataset(
-        self._filenames,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
-
-  @property
-  def output_classes(self):
-    return ops.Tensor, ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
+        self._filenames, **dataset_ops.flat_structure(self))
 
   @property
-  def output_types(self):
-    return dtypes.string, dtypes.string
+  def _element_structure(self):
+    return structure.NestedStructure(
+        (structure.TensorStructure(dtypes.string, []),
+         structure.TensorStructure(dtypes.string, [])))
diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py
index bcc383587c54bd89502313f9328bc06c49046a87..5c6ee6bfdc7167d14b292f8f763adafca4e3a72c 100644
--- a/tensorflow/contrib/data/python/ops/sliding.py
+++ b/tensorflow/contrib/data/python/ops/sliding.py
@@ -18,11 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.util import deprecation
 
 
@@ -40,8 +39,13 @@ class _SlideDataset(dataset_ops.UnaryDataset):
     self._window_shift = ops.convert_to_tensor(
         window_shift, dtype=dtypes.int64, name="window_shift")
 
+    input_structure = structure.convert_legacy_structure(
+        input_dataset.output_types, input_dataset.output_shapes,
+        input_dataset.output_classes)
+    self._structure = input_structure._batch(None)  # pylint: disable=protected-access
+
   def _as_variant_tensor(self):
-    return gen_dataset_ops.slide_dataset(
+    return ged_ops.experimental_sliding_window_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         window_size=self._window_size,
         window_shift=self._window_shift,
@@ -49,20 +53,8 @@ class _SlideDataset(dataset_ops.UnaryDataset):
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    input_shapes = self._input_dataset.output_shapes
-    return nest.pack_sequence_as(input_shapes, [
-        tensor_shape.vector(None).concatenate(s)
-        for s in nest.flatten(self._input_dataset.output_shapes)
-    ])
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 @deprecation.deprecated_args(
diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD
index a87a5624c88d1d0af10055261dad55937ed6aeb0..3ecd755d86f6be47910aebbdb46d335d165427d8 100644
--- a/tensorflow/contrib/distribute/BUILD
+++ b/tensorflow/contrib/distribute/BUILD
@@ -26,7 +26,6 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/contrib/distribute/python:collective_all_reduce_strategy",
-        "//tensorflow/contrib/distribute/python:cross_tower_ops",
         "//tensorflow/contrib/distribute/python:mirrored_strategy",
         "//tensorflow/contrib/distribute/python:monitor",
         "//tensorflow/contrib/distribute/python:one_device_strategy",
@@ -35,6 +34,7 @@ py_library(
         "//tensorflow/contrib/distribute/python:tpu_strategy",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:cross_device_ops",
         "//tensorflow/python/distribute:distribute_config",
         "//tensorflow/python/distribute:distribute_coordinator",
     ],
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 2e025765e4aaab7114aa6e3e79336e48a71b5b55..8a8dc159ade6f2a4a9b5ec29055ea4848492b29f 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -20,7 +20,7 @@ on many GPUs on one machine. Essentially, we create copies of all variables in
 the model's layers on each device. We then use all-reduce to combine gradients
 across the devices before applying them to the variables to keep them in sync.
 * [`CollectiveAllReduceStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/CollectiveAllReduceStrategy):
-This is a version of `MirroredStrategy` for multi-working training. It uses
+This is a version of `MirroredStrategy` for multi-worker training. It uses
 a collective op to do all-reduce. This supports between-graph communication and
 synchronization, and delegates the specifics of the all-reduce implementation to
 the runtime (as opposed to encoding it in the graph). This allows it to perform
@@ -31,8 +31,8 @@ fault-tolerance to allow training to continue when there is worker failure.
 * [`ParameterServerStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/ParameterServerStrategy):
 This strategy supports using parameter servers either for multi-GPU local
 training or asynchronous multi-machine training. When used to train locally,
-variables are not mirrored, instead they placed on the CPU and operations are
-replicated across all local GPUs. In a multi-machine setting, some are
+variables are not mirrored, instead they are placed on the CPU and operations
+are replicated across all local GPUs. In a multi-machine setting, some are
 designated as workers and some as parameter servers. Each variable is placed on
 one parameter server. Computation operations are replicated across all GPUs of
 the workers.
@@ -46,6 +46,9 @@ Let's see how to scale to multiple GPUs on one machine using `MirroredStrategy`
 Take a very simple model consisting of a single layer:
 
 ```python
+import tensorflow as tf
+from tensorflow import keras
+
 inputs = tf.keras.layers.Input(shape=(1,))
 predictions = tf.keras.layers.Dense(1)(inputs)
 model = tf.keras.models.Model(inputs=inputs, outputs=predictions)
@@ -90,8 +93,8 @@ Similarly, we can also call `evaluate` and `predict` as before using appropriate
 datasets.
 
 ```python
-model.evaluate(eval_dataset)
-model.predict(predict_dataset)
+model.evaluate(eval_dataset, steps=1)
+model.predict(predict_dataset, steps=1)
 ```
 
 That's all you need to train your model with Keras on multiple GPUs with
@@ -131,7 +134,7 @@ def model_fn(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode, loss=loss)
 
   if mode == tf.estimator.ModeKeys.TRAIN:
-    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss_fn())
+    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
     return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
 ```
 
@@ -190,7 +193,7 @@ in the input function gives a solid boost in performance. When using
 For multi-worker training, no code change is required to the `Estimator` code.
 You can run the same model code for all tasks in your cluster including
 parameter servers and the evaluator. But you need to use
-`tf.estimator.train_and_evaluator`, explicitly specify `num_gpus_per_workers`
+`tf.estimator.train_and_evaluate`, explicitly specify `num_gpus_per_workers`
 for your strategy object, and set "TF\_CONFIG" environment variables for each
 binary running in your cluster. We'll provide a Kubernetes template in the
 [tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) repo which sets
@@ -245,19 +248,17 @@ Let's use the same example for multi-worker. We'll start a cluster with 3
 workers doing synchronous all-reduce training. In the following code snippet, we
 start multi-worker training using `tf.estimator.train_and_evaluate`:
 
-
 ```python
 def model_main():
-  estimator = ...
   distribution = tf.contrib.distribute.CollectiveAllReduceStrategy(
       num_gpus_per_worker=2)
   config = tf.estimator.RunConfig(train_distribute=distribution)
+  estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
   train_spec = tf.estimator.TrainSpec(input_fn=input_fn)
   eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
   tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
 ```
 
-
 **Note**: You don't have to set "TF\_CONFIG" manually if you use our provided
 Kubernetes template.
 
@@ -324,13 +325,13 @@ start training.
 On your laptop, you can run
 
 ```python
-estimator = ...
 distribution = tf.contrib.distribute.CollectiveAllReduceStrategy(
     num_gpus_per_worker=2)
 config = tf.estimator.RunConfig(
     experimental_distribute=tf.contrib.distribute.DistributeConfig(
         train_distribute=distribution,
         remote_cluster={"worker": ["host1:port", "host2:port", "host3:port"]}))
+estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
 train_spec = tf.estimator.TrainSpec(input_fn=input_fn)
 eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
 tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
index 823fe6a917f4f31ab6822e4bb1130d62ff45f0c9..8ec73654e30e4967f318c558ba94301e84a206e4 100644
--- a/tensorflow/contrib/distribute/__init__.py
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -25,13 +25,13 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.distribute.python.collective_all_reduce_strategy import CollectiveAllReduceStrategy
-from tensorflow.contrib.distribute.python.cross_tower_ops import *
 from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
 from tensorflow.contrib.distribute.python.monitor import Monitor
 from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
 from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy
 from tensorflow.contrib.distribute.python.step_fn import *
 from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy
+from tensorflow.python.distribute.cross_device_ops import *
 from tensorflow.python.distribute.distribute_config import DistributeConfig
 from tensorflow.python.distribute.distribute_coordinator import run_standard_tensorflow_server
 from tensorflow.python.training.distribute import *
@@ -41,27 +41,30 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
-    'AllReduceCrossTowerOps',
+    'AllReduceCrossDeviceOps',
     'CollectiveAllReduceStrategy',
-    'CrossTowerOps',
+    'CrossDeviceOps',
     'DistributeConfig',
     'DistributionStrategy',
+    'DistributionStrategyExtended',
     'MirroredStrategy',
     'Monitor',
+    'MultiWorkerAllReduce',
     'OneDeviceStrategy',
     'ParameterServerStrategy',
-    'ReductionToOneDeviceCrossTowerOps',
+    'ReductionToOneDeviceCrossDeviceOps',
     'Step',
     'StandardInputStep',
     'StandardSingleLossStep',
-    'TowerContext',
+    'ReplicaContext',
     'TPUStrategy',
-    'get_cross_tower_context',
+    'get_cross_replica_context',
     'get_distribution_strategy',
     'get_loss_reduction',
-    'get_tower_context',
+    'get_replica_context',
     'has_distribution_strategy',
-    'require_tower_context',
+    'in_cross_replica_context',
+    'require_replica_context',
     'run_standard_tensorflow_server',
     'UpdateContext',
 ]
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 8267612236bcf2946c033d3e5071eee935d2c03a..4c9c35da5a36aa8149d15c8d1c25e4dfaa6a07c1 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -16,45 +16,26 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 # TODO(priyag): Figure out testonly issues that are preventing us from
 # including our tests in pip for now.
 
-py_library(
-    name = "values",
-    srcs = ["values.py"],
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        ":input_ops",
-        ":prefetching_ops_v2",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:device_util",
-        "//tensorflow/python:distribute",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
-        "@six_archive//:six",
-    ],
-)
-
 cuda_py_test(
     name = "values_test",
     srcs = ["values_test.py"],
     additional_deps = [
+        ":combinations",
         ":mirrored_strategy",
         ":multi_worker_test_base",
-        ":values",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python:errors",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:device_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python:device_util",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
     ],
@@ -68,25 +49,9 @@ py_library(
     srcs = ["mirrored_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":cross_tower_ops",
-        ":shared_variable_creator",
-        ":values",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:device",
-        "//tensorflow/python:device_util",
-        "//tensorflow/python:distribute",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/distribute:multi_worker_util",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:tape",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -95,16 +60,17 @@ py_library(
     srcs = ["parameter_server_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":cross_tower_ops",
         ":mirrored_strategy",
-        ":values",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:cross_device_ops",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -116,7 +82,7 @@ cuda_py_test(
         ":combinations",
         ":multi_worker_test_base",
         ":parameter_server_strategy",
-        ":values",
+        ":strategy_test_lib",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -127,10 +93,12 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:layers",
         "//tensorflow/python:session",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
     ],
@@ -145,12 +113,13 @@ py_library(
     srcs = ["one_device_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":values",
-        "//tensorflow/contrib/eager/python:datasets",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:distribute",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
@@ -161,16 +130,16 @@ py_library(
     srcs = ["collective_all_reduce_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":cross_tower_ops",
-        ":cross_tower_utils",
         ":mirrored_strategy",
-        ":values",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:collective_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:cross_device_utils",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -187,11 +156,11 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:layers",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -212,10 +181,10 @@ py_library(
         ":tpu_strategy",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/optimizer_v2:training",
-        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/eager:context",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -233,28 +202,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "mirrored_strategy_test",
-    srcs = ["mirrored_strategy_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
-    deps = [
-        ":mirrored_strategy",
-        ":multi_worker_test_base",
-        ":strategy_test_lib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:distribute",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
 py_test(
     name = "one_device_strategy_test",
     srcs = ["one_device_strategy_test.py"],
@@ -270,35 +217,32 @@ py_test(
     ],
 )
 
+# TODO(priyag): Rename this test to mirrored_strategy_test
 cuda_py_test(
     name = "mirrored_strategy_multigpu_test",
     srcs = ["mirrored_strategy_multigpu_test.py"],
     additional_deps = [
+        ":combinations",
         ":mirrored_strategy",
         ":multi_worker_test_base",
-        ":values",
         ":strategy_test_lib",
-        "//tensorflow/python:distribute",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:layers",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
+    shard_count = 5,
     tags = [
         "guitar",
-        "no_pip",
         "multi_and_single_gpu",
-        # Do not perform the extra analysis on this test, because it is already
-        # performed for the `:mirrored_strategy_test` target.
-        "no_oss",
-        "noasan",
-        "notap",
-        "notsan",
+        "no_pip",
     ],
 )
 
@@ -315,6 +259,7 @@ py_library(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:distributed_framework_test_lib",
         "//tensorflow/python:session",
+        "//tensorflow/python:util",
         "//tensorflow/python/estimator:estimator_py",
         "//third_party/py/numpy",
     ],
@@ -336,12 +281,15 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":one_device_strategy",
-        ":values",
         "//tensorflow/contrib/tpu:tpu_lib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -351,7 +299,6 @@ cuda_py_test(
     additional_deps = [
         ":collective_all_reduce_strategy",
         ":combinations",
-        ":cross_tower_utils",
         ":multi_worker_test_base",
         ":strategy_test_lib",
         "@absl_py//absl/testing:parameterized",
@@ -367,6 +314,7 @@ cuda_py_test(
         "//tensorflow/python:layers",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:cross_device_utils",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
     ],
@@ -411,6 +359,24 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "moving_averages_test",
+    srcs = ["moving_averages_test.py"],
+    additional_deps = [
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+    tags = [
+        "no_pip",
+    ],
+)
+
 cuda_py_test(
     name = "optimizer_v2_test",
     srcs = ["optimizer_v2_test.py"],
@@ -448,15 +414,31 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
+        "no_oss",  # http://b/119349471
+        "no_pip",
+        "tf_integration_test",
+    ],
+)
+
+cuda_py_test(
+    name = "keras_optimizer_v2_test",
+    srcs = ["keras_optimizer_v2_test.py"],
+    additional_deps = [
+        ":keras_test_lib",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # http://b/119349471
         "no_pip",
+        "tf_integration_test",
     ],
 )
 
 cuda_py_test(
     name = "estimator_training_test",
-    size = "enormous",
     srcs = ["estimator_training_test.py"],
     additional_deps = [
+        ":collective_all_reduce_strategy",
         ":combinations",
         ":mirrored_strategy",
         ":multi_worker_test_base",
@@ -464,7 +446,9 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/contrib/optimizer_v2:training",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/distribute",
+        "//tensorflow/python/distribute:distribute_config",
+        "//tensorflow/python/distribute:distribute_coordinator",
+        "//tensorflow/python/distribute:distribute_coordinator_context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/feature_column",
@@ -472,9 +456,15 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:summary",
     ],
+    shard_count = 48,
     tags = [
         "multi_and_single_gpu",
         "no_pip",
+        # TODO(b/118768923): Re-enable {a,m,t}san test.
+        "noasan",
+        "nomsan",
+        "notsan",
+        "no_oss",  # http://b/119349471
     ],
 )
 
@@ -550,52 +540,16 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "shared_variable_creator",
-    srcs = ["shared_variable_creator.py"],
-    visibility = ["//tensorflow:internal"],
-)
-
-py_test(
-    name = "shared_variable_creator_test",
-    srcs = ["shared_variable_creator_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":shared_variable_creator",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
-py_library(
-    name = "cross_tower_utils",
-    srcs = ["cross_tower_utils.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":values",
-        "//tensorflow/contrib/all_reduce:all_reduce_py",
-        "//tensorflow/contrib/nccl:nccl_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:collective_ops",
-        "//tensorflow/python:device",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-    ],
-)
-
 cuda_py_test(
-    name = "cross_tower_utils_test",
-    srcs = ["cross_tower_utils_test.py"],
+    name = "cross_device_utils_test",
+    srcs = ["cross_device_utils_test.py"],
     additional_deps = [
         ":combinations",
-        ":cross_tower_utils",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/distribute:cross_device_utils",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
@@ -604,41 +558,20 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "cross_tower_ops",
-    srcs = ["cross_tower_ops.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":cross_tower_utils",
-        ":values",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:device_lib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-        "@six_archive//:six",
-    ],
-)
-
 cuda_py_test(
-    name = "cross_tower_ops_test",
-    size = "large",
-    srcs = ["cross_tower_ops_test.py"],
+    name = "cross_device_ops_test",
+    srcs = ["cross_device_ops_test.py"],
     additional_deps = [
         ":combinations",
-        ":cross_tower_ops",
         ":multi_worker_test_base",
         ":mirrored_strategy",
-        ":values",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
@@ -648,63 +581,6 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "prefetching_ops_v2",
-    srcs = ["prefetching_ops_v2.py"],
-    deps = [
-        "//tensorflow/contrib/data/python/ops:prefetching_ops",
-        "//tensorflow/python:experimental_dataset_ops_gen",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-    ],
-)
-
-cuda_py_test(
-    name = "prefetching_ops_v2_test",
-    srcs = ["prefetching_ops_v2_test.py"],
-    additional_deps = [
-        ":prefetching_ops_v2",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-py_library(
-    name = "input_ops",
-    srcs = ["input_ops.py"],
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-cuda_py_test(
-    name = "input_ops_test",
-    srcs = ["input_ops_test.py"],
-    additional_deps = [
-        ":input_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python:util",
-    ],
-    tags = [
-        "no_pip",
-    ],
-)
-
 py_library(
     name = "keras_test_lib",
     testonly = 1,
@@ -715,6 +591,7 @@ py_library(
         "//tensorflow/contrib/distribute/python:tpu_strategy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/keras",
         "//third_party/py/numpy",
@@ -731,6 +608,7 @@ cuda_py_test(
     shard_count = 16,
     tags = [
         "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
         "no_pip",
         "no_windows_gpu",
         "notsan",
@@ -743,7 +621,6 @@ py_library(
     srcs = ["metrics_v1_test.py"],
     deps = [
         ":combinations",
-        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
index 865dba803f562e0ab98341dd8343e3c72b03d39b..31bd0e996a247a2fc01405fb3b8172a40853d698 100644
--- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
+++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
@@ -43,10 +43,12 @@ class CheckpointUtilsWithDistributionStrategyTest(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
-      in_tower_mode=[True, False],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
+      in_replica_mode=[True, False],
       mode=["graph"]))
-  def testInitFromCheckpoint(self, distribution, in_tower_mode):
+  def testInitFromCheckpoint(self, distribution, in_replica_mode):
     checkpoint_dir = self.get_temp_dir()
     with self.cached_session() as session:
       v1_value, v2_value, _, _ = checkpoint_utils_test._create_checkpoints(
@@ -68,8 +70,8 @@ class CheckpointUtilsWithDistributionStrategyTest(
         self.assertAllEqual(v2_value, self.evaluate(v2))
 
     with ops.Graph().as_default() as g, distribution.scope():
-      if in_tower_mode:
-        distribution.call_for_each_tower(init_and_verify, g)
+      if in_replica_mode:
+        distribution.call_for_each_replica(init_and_verify, args=[g])
       else:
         init_and_verify(g)
 
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index 9809204f8f107270b5a7b51e65e06afdae7d96b8..5c50a20490482856becedf7b1379d2a0583d9a11 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -18,12 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import cross_tower_utils
+import copy
+
 from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import values
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -32,7 +36,7 @@ from tensorflow.python.platform import tf_logging as logging
 
 
 # TODO(yuefengz): support in-graph replication.
-class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
+class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
   """Distribution strategy that uses collective ops for all-reduce.
 
   It is similar to the MirroredStrategy but it uses collective ops for
@@ -53,6 +57,17 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
       num_gpus_per_worker: number of local GPUs or GPUs per worker, the default
         is 0 meaning CPU only.
     """
+    super(CollectiveAllReduceStrategy, self).__init__(
+        CollectiveAllReduceExtended(self, num_gpus_per_worker))
+
+
+class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
+  """Implementation of CollectiveAllReduceStrategy."""
+
+  def __init__(self, container_strategy, num_gpus_per_worker):
+    distribute_lib.DistributionStrategyExtended.__init__(
+        self, container_strategy)
+    self._cross_device_ops = None
     self._num_gpus_per_worker = num_gpus_per_worker
     self._initialize_local_worker(num_gpus_per_worker)
 
@@ -62,19 +77,19 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     self._num_workers = 1
 
     if num_gpus_per_worker:
-      local_devices = [
+      local_devices = tuple(
           "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
-      ]
+      )
     else:
-      local_devices = ["/device:CPU:0"]
+      local_devices = ("/device:CPU:0",)
+    self._worker_device = device_util.canonicalize("/device:CPU:0")
 
-    self._collective_keys = cross_tower_utils.CollectiveKeys()
-    super(CollectiveAllReduceStrategy, self).__init__(
-        devices=local_devices,
-        cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
-            num_workers=1,
-            num_gpus_per_worker=num_gpus_per_worker,
-            collective_keys=self._collective_keys))
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    self._initialize_local(local_devices)
+    self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus_per_worker,
+        collective_keys=self._collective_keys)
 
     self._cluster_spec = None
     self._task_type = None
@@ -89,13 +104,12 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     if task_type is None or task_id is None:
       raise ValueError("When `cluster_spec` is given, you must also specify "
                        "`task_type` and `task_id`")
-    if task_type not in ["chief", "worker"]:
+    if task_type not in ("chief", "worker"):
       raise ValueError(
           "Unrecognized task_type: %r, valid task types are: \"chief\", "
           "\"worker\"." % task_type)
     cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._num_workers = len(cluster_spec.as_dict().get("worker", [])) + len(
-        cluster_spec.as_dict().get("chief", []))
+    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
     if not self._num_workers:
       raise ValueError("No `worker` or `chief` tasks can be found in "
                        "`cluster_spec`.")
@@ -103,22 +117,21 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                 task_id)
 
-    worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
     if num_gpus_per_worker:
-      local_devices = [
-          "%s/device:GPU:%d" % (worker_device, i)
+      local_devices = tuple(
+          "%s/device:GPU:%d" % (self._worker_device, i)
           for i in range(num_gpus_per_worker)
-      ]
+      )
     else:
-      local_devices = [worker_device]
+      local_devices = (self._worker_device,)
 
-    self._collective_keys = cross_tower_utils.CollectiveKeys()
-    super(CollectiveAllReduceStrategy, self).__init__(
-        devices=local_devices,
-        cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
-            num_workers=self._num_workers,
-            num_gpus_per_worker=num_gpus_per_worker,
-            collective_keys=self._collective_keys))
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    self._initialize_local(local_devices)
+    self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus_per_worker,
+        collective_keys=self._collective_keys)
 
     # Add a default device so that ops without specified devices will not end up
     # on other workers.
@@ -160,7 +173,7 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
           if i > 0:
             # Give replicas meaningful distinct names:
             var0name = index[devices[0]].name.split(":")[0]
-            # We append a / to variable names created on towers with id > 0 to
+            # We append a / to variable names created on replicas with id > 0 to
             # ensure that we ignore the name scope and instead use the given
             # name as the absolute name of the variable.
             kwargs["name"] = "%s/replica_%d/" % (var0name, i)
@@ -202,17 +215,40 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     return mirrored_strategy._create_mirrored_variable(
         devices, _real_mirrored_creator, *args, **kwargs)
 
-  def distribute_dataset(self, dataset_fn):
+  def _distribute_dataset(self, dataset_fn):
     """Distributes the dataset to each local GPU."""
     # TODO(yuefengz): shard the dataset.
-    return values.PerDeviceDataset(
+    return values.PerReplicaDataset(
         self._call_dataset_fn(dataset_fn), self._devices, True)
 
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+  def _make_dataset_iterator(self, dataset):
+    worker_device_pairs = [(self._worker_device, self._devices)]
+    return values.DatasetIterator(dataset, worker_device_pairs,
+                                  self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec is None:
+      input_pipeline_id = 0
+    else:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=self._num_workers,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+
+    return values.InputFunctionIterator(
+        input_fn, [(self._worker_device, self._devices)], [input_context])
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     """Configures the object.
 
     Args:
@@ -232,8 +268,25 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
       self._initialize_multi_worker(self._num_gpus_per_worker, cluster_spec,
                                     task_type, task_id)
 
-    if not session_config or not self._cluster_spec:
-      return
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    # Enable the scoped allocator optimization for CollectiveOps.  This
+    # optimization converts many small all-reduces into fewer larger
+    # all-reduces.
+    rewrite_options = updated_config.graph_options.rewrite_options
+    rewrite_options.scoped_allocator_optimization = (
+        rewriter_config_pb2.RewriterConfig.ON)
+    # We turn on ScopedAllocator only for CollectiveReduce op, i.e. enable_op =
+    # ["CollectiveReduce"].  Since we can't assign to a repeated proto field, we
+    # clear and then append.
+    del rewrite_options.scoped_allocator_opts.enable_op[:]
+    rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")
+
+    if not self._cluster_spec:
+      return updated_config
 
     assert self._task_type
     assert self._task_id is not None
@@ -241,34 +294,28 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     # Collective group leader is needed for collective ops to coordinate
     # workers.
     if "chief" in self._cluster_spec.jobs:
-      session_config.experimental.collective_group_leader = (
+      updated_config.experimental.collective_group_leader = (
           "/job:chief/replica:0/task:0")
     else:
       if "worker" not in self._cluster_spec.jobs:
         raise ValueError(
             "You must have `chief` or `worker` jobs in the `cluster_spec`.")
-      session_config.experimental.collective_group_leader = (
+      updated_config.experimental.collective_group_leader = (
           "/job:worker/replica:0/task:0")
 
     # The device filters prevent communication between workers.
-    del session_config.device_filters[:]
-    session_config.device_filters.append(
+    del updated_config.device_filters[:]
+    updated_config.device_filters.append(
         "/job:%s/task:%d" % (self._task_type, self._task_id))
 
-    # The scoped_allocator_optimization is to optimize graphs for collective
-    # ops.
-    rewrite_options = session_config.graph_options.rewrite_options
-    rewrite_options.scoped_allocator_optimization = (
-        rewriter_config_pb2.RewriterConfig.ON)
-    del rewrite_options.scoped_allocator_opts.enable_op[:]
-    rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")
+    return updated_config
 
   @property
-  def between_graph(self):
+  def experimental_between_graph(self):
     return True
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     return True
 
   @property
@@ -278,3 +325,12 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
   @property
   def should_save_summary(self):
     return self._is_chief
+
+  @property
+  def _num_replicas_in_sync(self):
+    return len(self._devices) * self._num_workers
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return False
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index 6796a23d464d344554ae9654e0992e30df5ad213..8a9e583f0afaac37a2057bae9b1ed79de43d68bc 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -23,13 +23,19 @@ import numpy as np
 
 from tensorflow.contrib.distribute.python import collective_all_reduce_strategy
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import cross_tower_utils
 from tensorflow.contrib.distribute.python import multi_worker_test_base
+from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
@@ -51,11 +57,6 @@ class CollectiveAllReduceStrategyTestBase(
   collective_key_base = 0
 
   def setUp(self):
-    self._run_options = config_pb2.RunOptions()
-    self._run_options.experimental.collective_graph_key = 6
-
-    self._sess_config = config_pb2.ConfigProto()
-
     # We use a different key_base for each test so that collective keys won't be
     # reused.
     # TODO(yuefengz, tucker): enable it to reuse collective keys in different
@@ -66,33 +67,38 @@ class CollectiveAllReduceStrategyTestBase(
   def _get_test_object(self, task_type, task_id, num_gpus=0):
     distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
         num_gpus_per_worker=num_gpus)
+    session_config = config_pb2.ConfigProto()
     if task_type and task_id is not None:
       distribution.configure(
-          session_config=self._sess_config,
+          session_config=session_config,
           cluster_spec=self._cluster_spec,
           task_type=task_type,
           task_id=task_id)
-    collective_keys = cross_tower_utils.CollectiveKeys(
+    collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_start=num_gpus * 100 +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_with_id_start=num_gpus * 10000 +
         CollectiveAllReduceStrategyTestBase.collective_key_base)
-    distribution._collective_keys = collective_keys
-    distribution._cross_tower_ops._collective_keys = collective_keys
+    distribution.extended._collective_keys = collective_keys
+    distribution.extended._inferred_cross_device_ops._collective_keys = (
+        collective_keys)
     if task_type and task_id is not None:
-      return distribution, 'grpc://' + self._cluster_spec[task_type][task_id]
+      return distribution, 'grpc://' + self._cluster_spec[task_type][
+          task_id], session_config
     else:
-      return distribution, ''
+      return distribution, '', session_config
 
   def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
-    d, master_target = self._get_test_object(task_type, task_id, num_gpus)
+    d, master_target, config = self._get_test_object(task_type, task_id,
+                                                     num_gpus)
     with ops.Graph().as_default(), \
-         self.test_session(config=self._sess_config,
-                           target=master_target) as sess, \
+         self.cached_session(config=config,
+                             target=master_target) as sess, \
          d.scope():
-      l = core.Dense(1, use_bias=False, name='gpu_%d' % d._num_gpus_per_worker)
+      l = core.Dense(1, use_bias=False,
+                     name='gpu_%d' % d.extended._num_gpus_per_worker)
 
       def loss_fn(x):
         y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
@@ -117,7 +123,7 @@ class CollectiveAllReduceStrategyTestBase(
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_tower(grad_fn, one)
+        g_v = d.call_for_each_replica(grad_fn, args=[one])
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
@@ -126,8 +132,8 @@ class CollectiveAllReduceStrategyTestBase(
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
             # TODO(yuefengz): support non-Mirrored variable as destinations.
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.extended.reduce_to(
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
                 d.update(v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -135,14 +141,13 @@ class CollectiveAllReduceStrategyTestBase(
 
       before_out, after_out = step()
 
-      if context.num_gpus() < d._num_gpus_per_worker:
+      if context.num_gpus() < d.extended._num_gpus_per_worker:
         return True
 
-      sess.run(
-          variables.global_variables_initializer(), options=self._run_options)
+      sess.run(variables.global_variables_initializer())
 
       for i in range(10):
-        b, a = sess.run((before_out, after_out), options=self._run_options)
+        b, a = sess.run((before_out, after_out))
         if i == 0:
           before, = b
         after, = a
@@ -154,7 +159,8 @@ class CollectiveAllReduceStrategyTestBase(
       return error_after < error_before
 
   def _test_complex_model(self, task_type, task_id, num_gpus):
-    d, master_target = self._get_test_object(task_type, task_id, num_gpus)
+    d, master_target, config = self._get_test_object(task_type, task_id,
+                                                     num_gpus)
 
     def model_fn():
       """Mnist model with synthetic input."""
@@ -193,10 +199,10 @@ class CollectiveAllReduceStrategyTestBase(
       return train_op
 
     with ops.Graph().as_default(), \
-         self.test_session(config=self._sess_config,
-                           target=master_target) as sess:
+         self.cached_session(config=config,
+                             target=master_target) as sess:
       with d.scope():
-        train_op = d.call_for_each_tower(model_fn)
+        train_op = d.call_for_each_replica(model_fn)
         train_op = d.group(d.unwrap(train_op))
 
       sess.run(variables.global_variables_initializer())
@@ -204,11 +210,11 @@ class CollectiveAllReduceStrategyTestBase(
       return True
 
   def _test_variable_initialization(self, task_type, task_id, num_gpus):
-    distribution, master_target = self._get_test_object(task_type, task_id,
-                                                        num_gpus)
+    distribution, master_target, config = self._get_test_object(
+        task_type, task_id, num_gpus)
     with ops.Graph().as_default(), \
-         self.test_session(config=self._sess_config,
-                           target=master_target) as sess, \
+         self.cached_session(config=config,
+                             target=master_target) as sess, \
          distribution.scope():
 
       def model_fn():
@@ -219,27 +225,55 @@ class CollectiveAllReduceStrategyTestBase(
                 1.0, 10.0, dtype=dtypes.float32))
         return array_ops.identity(x)
 
-      x = distribution.call_for_each_tower(model_fn)
-      reduced_x = distribution.unwrap(
-          distribution.reduce(
-              variable_scope.VariableAggregation.MEAN, x,
-              destinations='/cpu:0'))[0]
+      x = distribution.call_for_each_replica(model_fn)
+      reduced_x = distribution.reduce(reduce_util.ReduceOp.MEAN, x)
       x = distribution.unwrap(x)[0]
 
-      sess.run(
-          variables.global_variables_initializer(), options=self._run_options)
+      sess.run(variables.global_variables_initializer())
 
-      x_value, reduced_x_value = sess.run(
-          [x, reduced_x], options=self._run_options)
+      x_value, reduced_x_value = sess.run([x, reduced_x])
       self.assertTrue(
           np.allclose(x_value, reduced_x_value, atol=1e-5),
           msg=('x_value = %r, reduced_x_value = %r' % (x_value,
                                                        reduced_x_value)))
     return np.allclose(x_value, reduced_x_value, atol=1e-5)
 
+  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
+                              expected_values):
+    distribution, master_target, config = self._get_test_object(
+        task_type, task_id, num_gpus)
+    devices = distribution.extended.worker_devices
+
+    with ops.Graph().as_default(), \
+         self.cached_session(config=config,
+                             target=master_target) as sess:
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        next_element = iterator.get_next()
+        sess.run([values.select_device(d, next_element) for d in devices])
+
+      # After re-initializing the iterator, should be able to iterate again.
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
 
 class DistributedCollectiveAllReduceStrategyTest(
-    CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
+    CollectiveAllReduceStrategyTestBase,
+    strategy_test_lib.DistributionTestBase,
+    parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
@@ -247,6 +281,16 @@ class DistributedCollectiveAllReduceStrategyTest(
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=0)
 
+  def test_num_replicas_in_sync(self):
+    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+        num_gpus_per_worker=2)
+    distribution.configure(cluster_spec=self._cluster_spec, task_type='worker',
+                           task_id=0)
+    num_workers = len(self._cluster_spec.get('chief', []) +
+                      self._cluster_spec.get('worker', []))
+    self.assertEqual(2 * num_workers,
+                     distribution.num_replicas_in_sync)
+
   @combinations.generate(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testMinimizeLossGraph(self, num_gpus):
@@ -257,7 +301,7 @@ class DistributedCollectiveAllReduceStrategyTest(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testVariableInitialization(self, num_gpus):
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
         self._test_variable_initialization,
         self._cluster_spec,
@@ -267,10 +311,56 @@ class DistributedCollectiveAllReduceStrategyTest(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testComplexModel(self, num_gpus):
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
         self._test_complex_model, self._cluster_spec, num_gpus=num_gpus)
 
+  # TODO(yuefengz): Update how we use num_gpus and required_gpus
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
+  def testMakeInputFnIterator(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    # We use CPU as the device when num_gpus = 0
+    devices_per_worker = max(1, num_gpus)
+    expected_values = [[i+j for j in range(devices_per_worker)]
+                       for i in range(0, 100, devices_per_worker)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=3*devices_per_worker,
+        expected_num_input_pipelines=3,
+        expected_input_pipeline_id=1)  # because task_id = 1
+    self._test_input_fn_iterator('worker', 1, num_gpus,
+                                 input_fn, expected_values)
+
+  def testUpdateConfigProto(self):
+    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+        num_gpus_per_worker=2)
+    distribution.configure(
+        cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
+
+    config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
+    rewrite_options = config_proto.graph_options.rewrite_options
+    rewrite_options.scoped_allocator_opts.enable_op.append('to_be_removed')
+
+    new_config = distribution.update_config_proto(config_proto)
+
+    # Verify group leader
+    self.assertEqual('/job:worker/replica:0/task:0',
+                     new_config.experimental.collective_group_leader)
+
+    # Verify device filters.
+    self.assertEqual(['/job:worker/task:1'], new_config.device_filters)
+
+    # Verify rewrite options.
+    new_rewrite_options = new_config.graph_options.rewrite_options
+    self.assertEqual(rewriter_config_pb2.RewriterConfig.ON,
+                     new_rewrite_options.scoped_allocator_optimization)
+    self.assertEqual(['CollectiveReduce'],
+                     new_rewrite_options.scoped_allocator_opts.enable_op)
+
 
 class DistributedCollectiveAllReduceStrategyTestWithChief(
     CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
@@ -281,10 +371,6 @@ class DistributedCollectiveAllReduceStrategyTestWithChief(
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=0, has_chief=True)
 
-  def setUp(self):
-    super(DistributedCollectiveAllReduceStrategyTestWithChief, self).setUp()
-    self._run_options.experimental.collective_graph_key = 7
-
   @combinations.generate(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testMinimizeLossGraph(self, num_gpus):
@@ -310,21 +396,37 @@ class DistributedCollectiveAllReduceStrategyTestWithChief(
         self._test_complex_model, self._cluster_spec, num_gpus=num_gpus)
 
 
-class LocalCollectiveAllReduceStrategy(
-    CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
+class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase,
+                                       strategy_test_lib.DistributionTestBase,
+                                       parameterized.TestCase):
 
   def testMinimizeLossGraph(self, num_gpus=2):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._test_minimize_loss_graph(None, None, num_gpus)
 
   def testComplexModel(self, num_gpus=2):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._test_complex_model(None, None, num_gpus)
 
+  def testMakeInputFnIterator(self, num_gpus=2):
+    # Collective ops doesn't support strategy with one device.
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=num_gpus,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    self._test_input_fn_iterator(None, None, num_gpus,
+                                 input_fn, expected_values)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index cff4b0a463e43b1e63fa9e9e96a9df6ee193b506..365ce5cdec79f1914f0c9ccdf59a7dc59e6f819e 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -53,11 +53,11 @@ from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
 from tensorflow.contrib.optimizer_v2 import adagrad as adagrad_v2
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
 from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import rmsprop
 from tensorflow.python.util import tf_inspect
@@ -168,6 +168,8 @@ def _augment_with_special_arguments(test_method):
       if GPU_TEST:
         self.skipTest("Test that doesn't require GPUs.")
     elif context.num_gpus() < required_gpus:
+      # TODO(priyag): Consider allowing tests in graph mode using soft
+      # placement.
       self.skipTest(
           "{} GPUs are not available for this test. {} GPUs are available".
           format(required_gpus, context.num_gpus()))
@@ -190,7 +192,7 @@ def _augment_with_special_arguments(test_method):
         kwargs_to_pass[arg] = kwargs[arg]
 
     if mode == "eager":
-      with ops.Graph().as_default(), context.eager_mode():
+      with context.eager_mode():
         if distribution:
           kwargs_to_pass["distribution"] = distribution.strategy
         test_method(**kwargs_to_pass)
@@ -335,40 +337,58 @@ tpu_strategy_one_step = NamedDistribution(
     "TPUOneStep", lambda: tpu_lib.TPUStrategy(
         TPUClusterResolver(""), steps_per_run=1),
     required_tpu=True)
-# Note that we disable prefetching for testing since prefetching makes
-# the input non-deterministic.
+mirrored_strategy_with_one_cpu = NamedDistribution(
+    "Mirrored1CPU",
+    lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
+mirrored_strategy_with_one_gpu = NamedDistribution(
+    "Mirrored1GPU",
+    lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]),
+    required_gpus=1)
 mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
     "MirroredCPUAndGPU",
-    lambda: mirrored_lib.MirroredStrategy(
-        ["/gpu:0", "/cpu:0"], prefetch_on_device=False),
+    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]),
     required_gpus=1)
 mirrored_strategy_with_two_gpus = NamedDistribution(
     "Mirrored2GPUs",
-    lambda: mirrored_lib.MirroredStrategy(
-        ["/gpu:0", "/gpu:1"], prefetch_on_device=False),
+    lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]),
+    required_gpus=2)
+core_mirrored_strategy_with_one_cpu = NamedDistribution(
+    "CoreMirrored1CPU",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/cpu:0"]))
+core_mirrored_strategy_with_one_gpu = NamedDistribution(
+    "CoreMirrored1GPU",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0"]),
+    required_gpus=1)
+core_mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
+    "CoreMirroredCPUAndGPU",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/cpu:0"]),
+    required_gpus=1)
+core_mirrored_strategy_with_two_gpus = NamedDistribution(
+    "CoreMirrored2GPUs",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/gpu:1"]),
     required_gpus=2)
 
 
-adam_optimizer_v1_fn = NamedObject(
-    "AdamV1", lambda: adam.AdamOptimizer(0.001, epsilon=1))
 gradient_descent_optimizer_v1_fn = NamedObject(
     "GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2))
 adagrad_optimizer_v1_fn = NamedObject(
     "AdagradV1", lambda: adagrad.AdagradOptimizer(0.001))
+adam_optimizer_v1_fn = NamedObject("AdamV1",
+                                   lambda: adam.AdamOptimizer(0.001, epsilon=1))
 rmsprop_optimizer_v1_fn = NamedObject(
     "RmsPropV1", lambda: rmsprop.RMSPropOptimizer(0.001))
-optimizers_v1 = [adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn,
-                 adagrad_optimizer_v1_fn]
 
-adam_optimizer_v2_fn = NamedObject(
-    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1))
+optimizers_v1 = [gradient_descent_optimizer_v1_fn, adagrad_optimizer_v1_fn]
+
 gradient_descent_optimizer_v2_fn = NamedObject(
     "GradientDescentV2",
     lambda: gradient_descent_v2.GradientDescentOptimizer(0.2))
 adagrad_optimizer_v2_fn = NamedObject(
     "AdagradV2", lambda: adagrad_v2.AdagradOptimizer(0.001))
-optimizers_v2 = [adam_optimizer_v2_fn, gradient_descent_optimizer_v2_fn,
-                 adagrad_optimizer_v2_fn]
+adam_optimizer_v2_fn = NamedObject(
+    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1))
+
+optimizers_v2 = [gradient_descent_optimizer_v2_fn, adagrad_optimizer_v2_fn]
 
 graph_and_eager_modes = ["graph", "eager"]
 
@@ -377,8 +397,11 @@ def distributions_and_v1_optimizers():
   """A common set of combination with DistributionStrategies and Optimizers."""
   return combine(
       distribution=[
-          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
-          mirrored_strategy_with_two_gpus
+          one_device_strategy,
+          mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus,
+          core_mirrored_strategy_with_gpu_and_cpu,
+          core_mirrored_strategy_with_two_gpus,
       ],
       optimizer_fn=optimizers_v1)
 
@@ -387,7 +410,10 @@ def distributions_and_v2_optimizers():
   """DistributionStrategies and V2 Optimizers."""
   return combine(
       distribution=[
-          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
-          mirrored_strategy_with_two_gpus
+          one_device_strategy,
+          mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus,
+          core_mirrored_strategy_with_gpu_and_cpu,
+          core_mirrored_strategy_with_two_gpus,
       ],
       optimizer_fn=optimizers_v2)
diff --git a/tensorflow/contrib/distribute/python/cross_device_ops_test.py b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6e9521c1c1115ffdbdcf375ad4017bacb962832
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
@@ -0,0 +1,580 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for CrossDeviceOps."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import multi_worker_test_base
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values as value_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _make_per_replica(values, devices, regroup=False):
+  devices = cross_device_ops_lib.get_devices_from(devices)
+  assert len(values) == len(devices)
+
+  # We simulate the result of regroup called on PerReplica which strips the
+  # PerReplica wrapper if it has only one value.
+  if len(values) == 1 and regroup:
+    with ops.device(devices[0]):
+      placed_v = array_ops.identity(values[0])
+    return placed_v
+
+  index = {}
+  for d, v in zip(devices, values):
+    with ops.device(d):
+      placed_v = array_ops.identity(v)
+    index[d] = placed_v
+  return value_lib.PerReplica(index)
+
+
+# pylint: disable=g-doc-args,g-doc-return-or-yield
+def _fake_mirrored(value, devices):
+  """Create a faked Mirrored object for testing.
+
+  All components of the returned Mirrored have the same objects, which is not
+  true in reality.
+  """
+  devices = cross_device_ops_lib.get_devices_from(devices)
+  return value_lib.Mirrored(
+      {d: v for d, v in zip(devices, [value] * len(devices))})
+
+
+def _make_indexed_slices(values, indices, dense_shape, device):
+  with ops.device(device):
+    tensor = ops.IndexedSlices(
+        values=constant_op.constant(values),
+        indices=constant_op.constant(indices),
+        dense_shape=constant_op.constant(dense_shape))
+  return tensor
+
+
+def _make_mirrored_indexed_slices(devices, values, indices, dense_shape):
+  return value_lib.Mirrored({
+      d: _make_indexed_slices(values, indices, dense_shape, d) for d in devices
+  })
+
+
+_cpu_device = "/device:CPU:0"
+
+
+class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase):
+
+  def _assert_indexed_slices_equal(self, left, right):
+    self.assertIsInstance(left, ops.IndexedSlices)
+    self.assertIsInstance(right, ops.IndexedSlices)
+    self.assertEqual(device_util.resolve(left.device),
+                     device_util.resolve(right.device))
+    self.assertAllEqual(
+        self.evaluate(ops.convert_to_tensor(left)),
+        self.evaluate(ops.convert_to_tensor(right)))
+
+  def _assert_values_equal(self, left, right):
+    if isinstance(left, list):
+      for l, r in zip(left, right):
+        self._assert_values_equal(l, r)
+    else:
+      self.assertEqual(type(left), type(right))
+      self.assertEqual(set(left.devices), set(right.devices))
+      if isinstance(list(left._index.values())[0], ops.IndexedSlices):
+        for (d, v) in left._index.items():
+          self._assert_indexed_slices_equal(v, right._index[d])
+      elif context.executing_eagerly():
+        self.assertEqual([v.numpy() for v in left._index.values()],
+                         list(right._index.values()))
+      else:
+        with self.cached_session() as sess:
+          self.assertEqual(
+              sess.run(list(left._index.values())), list(right._index.values()))
+
+  def _testReductionAndBroadcast(self, cross_device_ops, distribution):
+    devices = distribution.extended.worker_devices
+
+    values = [constant_op.constant(float(d)) for d in range(len(devices))]
+    per_replica = _make_per_replica(values, devices)
+    mean = (len(devices) - 1.) / 2.
+
+    values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))]
+    per_replica_2 = _make_per_replica(values_2, devices)
+    mean_2 = mean + 1.
+
+    destination_mirrored = _fake_mirrored(1., devices)
+    destination_different = _fake_mirrored(1., _cpu_device)
+    destination_str = _cpu_device
+
+    all_destinations = [
+        destination_mirrored, destination_different, destination_str,
+    ]
+
+    # test reduce()
+    for destinations in all_destinations:
+      self._assert_values_equal(
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.MEAN,
+              per_replica,
+              destinations=destinations),
+          _fake_mirrored(mean, destinations))
+      self._assert_values_equal(
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.MEAN,
+              per_replica_2,
+              destinations=destinations),
+          _fake_mirrored(mean_2, destinations))
+      self._assert_values_equal(
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.SUM, per_replica,
+              destinations=destinations),
+          _fake_mirrored(mean * len(devices), destinations))
+      self._assert_values_equal(
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.SUM,
+              per_replica_2,
+              destinations=destinations),
+          _fake_mirrored(mean_2 * len(devices), destinations))
+
+    # test batch_reduce()
+    for d1, d2 in itertools.product(all_destinations, all_destinations):
+      self._assert_values_equal(
+          cross_device_ops.batch_reduce(
+              reduce_util.ReduceOp.MEAN,
+              [(per_replica, d1), (per_replica_2, d2)]),
+          [
+              _fake_mirrored(mean, d1),
+              _fake_mirrored(mean_2, d2)
+          ])
+      self._assert_values_equal(
+          cross_device_ops.batch_reduce(
+              reduce_util.ReduceOp.SUM,
+              [(per_replica, d1), (per_replica_2, d2)]),
+          [
+              _fake_mirrored(mean * len(devices), d1),
+              _fake_mirrored(mean_2 * len(devices), d2)
+          ])
+
+    # test broadcast()
+    for destinations in all_destinations:
+      self._assert_values_equal(
+          cross_device_ops.broadcast(constant_op.constant(1.), destinations),
+          _fake_mirrored(1., destinations))
+
+
+class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
+  # TODO(yuefengz): decouple the num_gpus check from distribution in
+  # combinations module so that we can pass in devices instead of a distribution
+  # strategy.
+  reduction_to_one_combinations = combinations.combine(
+      cross_device_ops=[
+          combinations.NamedObject(
+              "DefaultReductionToOneDeviceCrossDeviceOps",
+              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+          combinations.NamedObject(
+              "ReductionToCPUDeviceCrossDeviceOps",
+              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+                  reduce_to_device=_cpu_device)),
+          combinations.NamedObject(
+              "AccumulateNCrossDeviceOp",
+              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+                  accumulation_fn=math_ops.accumulate_n)),
+      ],
+      distribution=[
+          combinations.one_device_strategy,
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_two_gpus
+      ],
+      mode=["graph", "eager"])
+  allreduce_combinations = combinations.combine(
+      cross_device_ops=[
+          combinations.NamedObject(
+              "AllReduce",
+              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopy",
+              cross_device_ops_lib.AllReduceCrossDeviceOps(
+                  "hierarchical_copy", 8, 0, 0)),
+          combinations.NamedObject(
+              "AllReduceNoGradientRepacking",
+              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopyAggregateSmallTensors",
+              cross_device_ops_lib.AllReduceCrossDeviceOps(
+                  "hierarchical_copy", 0, 100, 10))
+      ],
+      distribution=[combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_two_gpus],
+      mode=["graph", "eager"])
+
+  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
+  def testReductionAndBroadcast(self, cross_device_ops, distribution):
+    with distribution.scope():
+      self._testReductionAndBroadcast(cross_device_ops, distribution)
+
+  def testChooseAlgorithm(self):
+    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
+                    [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
+    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result._num_packs, 8)
+
+    # if there are only 4 devices
+    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
+    self.assertEqual(result._all_reduce_alg, "nccl")
+    self.assertEqual(result._num_packs, 1)
+
+    # if devices links contain each device itself
+    device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
+                    [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
+                    [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
+    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result._num_packs, 8)
+
+    # if not dgx1-like links
+    device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
+                    [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
+    self.assertEqual(result._all_reduce_alg, "nccl")
+    self.assertEqual(result._num_packs, 1)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      required_gpus=1))
+  def testSimpleReduceWithIndexedSlices(self):
+    devices = ["/cpu:0", "/gpu:0"]
+    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
+    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
+    per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
+    result = cross_device_ops_lib._simple_reduce(
+        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)
+
+    # Test that the result is semantically equal to both the concatenated
+    # IndexedSlices with and without duplicate indices.
+    total_with_dups = _make_indexed_slices(
+        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
+    total_without_dups = _make_indexed_slices(
+        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
+    self._assert_indexed_slices_equal(total_with_dups, result)
+    self._assert_indexed_slices_equal(total_without_dups, result)
+
+  @combinations.generate(
+      combinations.combine(
+          cross_device_ops_instance=[
+              combinations.NamedObject(
+                  "ReductionToOneDeviceCrossDeviceOps",
+                  cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+              combinations.NamedObject(
+                  "AllReduceCrossDeviceOps",
+                  cross_device_ops_lib.AllReduceCrossDeviceOps())
+          ],
+          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
+          batch_reduce=[True, False],
+          mode=["graph", "eager"],
+          required_gpus=1))
+  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
+                                 batch_reduce):
+    devices = ["/cpu:0", "/gpu:0"]
+    dense_shape = [5, 2]
+    t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0])
+    t1 = _make_indexed_slices(
+        [[3., 4.], [5., 6.]], [1, 3], dense_shape, devices[1])
+    per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
+
+    if batch_reduce:
+      result = cross_device_ops_instance.batch_reduce(
+          reduce_op, [(per_replica, per_replica)])
+    else:
+      result = cross_device_ops_instance.reduce(
+          reduce_op, per_replica, per_replica)
+
+    total_indices_with_dups = [1, 1, 3]
+    total_indices_without_dups = [1, 3]
+
+    if reduce_op == reduce_util.ReduceOp.SUM:
+      total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]]
+      total_values_without_dups = [[4., 6.], [5., 6.]]
+    else:
+      assert reduce_op == reduce_util.ReduceOp.MEAN
+      total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]]
+      total_values_without_dups = [[2., 3.], [2.5, 3.]]
+
+    total_mirrored_with_dups = _make_mirrored_indexed_slices(
+        devices, total_values_with_dups, total_indices_with_dups, dense_shape)
+    total_mirrored_without_dups = _make_mirrored_indexed_slices(
+        devices, total_values_without_dups, total_indices_without_dups,
+        dense_shape)
+
+    # Test that the result is semantically equal to both the concatenated
+    # IndexedSlices, as well as when the duplicate indices are summed up.
+    if batch_reduce:
+      total_mirrored_with_dups = [total_mirrored_with_dups]
+      total_mirrored_without_dups = [total_mirrored_without_dups]
+
+    self._assert_values_equal(total_mirrored_with_dups, result)
+    self._assert_values_equal(total_mirrored_without_dups, result)
+
+
+class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
+                                    CrossDeviceOpsTestBase):
+
+  worker_devices = [
+      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
+  ]
+  multi_worker_allreduce_combinations = combinations.combine(
+      cross_device_ops=[
+          combinations.NamedObject(
+              "MultiWorkerAllReduce",
+              cross_device_ops_lib.MultiWorkerAllReduce(
+                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
+          combinations.NamedObject(
+              "MultiWorkerAllReducePack",
+              cross_device_ops_lib.MultiWorkerAllReduce(
+                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
+          combinations.NamedObject(
+              "MultiWorkerAllReduceAggregation",
+              cross_device_ops_lib.MultiWorkerAllReduce(
+                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
+          combinations.NamedObject(
+              "MultiWorkerAllReduceMultipleSpecs",
+              cross_device_ops_lib.MultiWorkerAllReduce(
+                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
+                                      ("xring", 2, -1)], 0, 0, 0)),
+      ],
+      distribution=[
+          combinations.NamedDistribution(
+              "MirroredCPU",
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=0),
+              required_gpus=0),
+          combinations.NamedDistribution(
+              "Mirrored1GPU",
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=1),
+              required_gpus=1),
+          combinations.NamedDistribution(
+              "Mirrored2GPUs",
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=2),
+              required_gpus=2),
+          # pylint: disable=g-long-lambda
+          combinations.NamedDistribution(
+              "CoreMirroredCPU",
+              lambda: mirrored_strategy.CoreMirroredStrategy(["/device:CPU:0"]),
+              required_gpus=0),
+          combinations.NamedDistribution(
+              "CoreMirrored1GPU",
+              lambda: mirrored_strategy.CoreMirroredStrategy(["/device:GPU:0"]),
+              required_gpus=1),
+          combinations.NamedDistribution(
+              "CoreMirrored2GPUs",
+              lambda: mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"]),
+              required_gpus=2),
+      ],
+      mode=["graph"])
+
+  @combinations.generate(multi_worker_allreduce_combinations)
+  def testReductionAndBroadcast(self, cross_device_ops, distribution):
+    distribution.configure(cluster_spec={
+        "worker":
+            ["/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"]
+    })
+    with distribution.scope():
+      self._testReductionAndBroadcast(cross_device_ops, distribution)
+
+
+class MultiWorkerCollectiveAllReduceTest(
+    multi_worker_test_base.MultiWorkerTestBase, parameterized.TestCase):
+
+  collective_key_base = 100000
+
+  @classmethod
+  def setUpClass(cls):
+    """Create a local cluster with 2 workers."""
+    cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
+        num_workers=3, num_ps=0)
+
+  def setUp(self):
+    super(MultiWorkerCollectiveAllReduceTest, self).setUp()
+    # Reusing keys are not supported well. So we have to give a different
+    # collective key base for different tests.
+    MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000
+
+  def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
+    collective_keys = cross_device_utils.CollectiveKeys(
+        group_key_start=10 * num_gpus +
+        MultiWorkerCollectiveAllReduceTest.collective_key_base,
+        instance_key_start=num_gpus * 100 +
+        MultiWorkerCollectiveAllReduceTest.collective_key_base,
+        instance_key_with_id_start=num_gpus * 10000 +
+        MultiWorkerCollectiveAllReduceTest.collective_key_base)
+    if local_mode:
+      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
+          1, num_gpus, collective_keys=collective_keys)
+      if num_gpus:
+        devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
+      else:
+        devices = ["/device:CPU:0"]
+      return collective_all_reduce_ops, devices, ""
+    else:
+      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
+          3, num_gpus, collective_keys=collective_keys)
+      if num_gpus:
+        devices = [
+            "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i)
+            for i in range(num_gpus)
+        ]
+      else:
+        devices = ["/job:%s/task:%d" % (task_type, task_id)]
+      return (collective_all_reduce_ops, devices,
+              "grpc://" + self._cluster_spec[task_type][task_id])
+
+  def _assert_values_equal(self, left, right, sess):
+    if isinstance(left, list):
+      for l, r in zip(left, right):
+        self._assert_values_equal(l, r, sess)
+    else:
+      self.assertEqual(type(left), type(right))
+      self.assertEqual(set(left.devices), set(right.devices))
+
+      run_options = config_pb2.RunOptions()
+      run_options.experimental.collective_graph_key = 6
+
+      left_values = np.array(
+          sess.run(list(left._index.values()), options=run_options)).flatten()
+      right_values = np.array(list(right._index.values())).flatten()
+      self.assertEqual(len(left_values), len(right_values))
+      for l, r in zip(left_values, right_values):
+        self.assertEqual(l, r)
+
+  def _test_reduction(self, task_type, task_id, num_gpus, local_mode=False):
+    collective_all_reduce, devices, master_target = self._get_test_objects(
+        task_type, task_id, num_gpus, local_mode=local_mode)
+    if local_mode:
+      num_workers = 1
+      worker_device = None
+    else:
+      num_workers = len(self._cluster_spec.get("chief", [])) + len(
+          self._cluster_spec.get("worker", []))
+      worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    with ops.Graph().as_default(), \
+         ops.device(worker_device), \
+         self.cached_session(target=master_target) as sess:
+      # Collective ops doesn't support scalar tensors, so we have to construct
+      # 1-d tensors.
+      values = [constant_op.constant([float(d)]) for d in range(len(devices))]
+      per_replica = _make_per_replica(values, devices, regroup=True)
+      mean = np.array([(len(devices) - 1.) / 2.])
+
+      values_2 = [constant_op.constant([d + 1.0]) for d in range(len(devices))]
+      per_replica_2 = _make_per_replica(values_2, devices)
+      mean_2 = np.array([mean[0] + 1.])
+
+      destination_mirrored = _fake_mirrored(1., devices)
+      destination_different = _fake_mirrored(1., _cpu_device)
+      destination_str = _cpu_device
+
+      all_destinations = [
+          destination_different, destination_mirrored, destination_str
+      ]
+
+      # test reduce()
+      for destinations in all_destinations:
+        self._assert_values_equal(
+            collective_all_reduce.reduce(
+                reduce_util.ReduceOp.MEAN,
+                per_replica,
+                destinations=destinations),
+            _fake_mirrored(mean, destinations), sess)
+        self._assert_values_equal(
+            collective_all_reduce.reduce(
+                reduce_util.ReduceOp.MEAN,
+                per_replica_2,
+                destinations=destinations),
+            _fake_mirrored(mean_2, destinations), sess)
+        self._assert_values_equal(
+            collective_all_reduce.reduce(
+                reduce_util.ReduceOp.SUM,
+                per_replica,
+                destinations=destinations),
+            _fake_mirrored(mean * len(devices) * num_workers, destinations),
+            sess)
+        self._assert_values_equal(
+            collective_all_reduce.reduce(
+                reduce_util.ReduceOp.SUM,
+                per_replica_2,
+                destinations=destinations),
+            _fake_mirrored(mean_2 * len(devices) * num_workers, destinations),
+            sess)
+
+      # test batch_reduce()
+      for d1, d2 in itertools.product(all_destinations, all_destinations):
+        self._assert_values_equal(
+            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.MEAN,
+                                               [(per_replica, d1),
+                                                (per_replica_2, d2)]),
+            [
+                _fake_mirrored(mean, d1),
+                _fake_mirrored(mean_2, d2)
+            ], sess)
+        self._assert_values_equal(
+            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.SUM,
+                                               [(per_replica, d1),
+                                                (per_replica_2, d2)]),
+            [
+                _fake_mirrored(mean * len(devices) * num_workers, d1),
+                _fake_mirrored(mean_2 * len(devices) * num_workers, d2)
+            ], sess)
+
+    return True
+
+  @combinations.generate(
+      combinations.combine(mode=["graph"], num_gpus=[0, 1, 2], required_gpus=1))
+  def testReductionDistributed(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      return
+    self._run_between_graph_clients(self._test_reduction, self._cluster_spec,
+                                    num_gpus)
+
+  # Collective ops doesn't support strategy with one device.
+  def testReductionLocal(self, num_gpus=2):
+    if context.num_gpus() < num_gpus:
+      return
+    self._test_reduction(None, None, num_gpus, local_mode=True)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_device_utils_test.py b/tensorflow/contrib/distribute/python/cross_device_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2303a31677afbd12a0b8e7eea3ecf7c7736c46ad
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_device_utils_test.py
@@ -0,0 +1,141 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for cross_device_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import values as value_lib
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+
+
+class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
+
+  def _assert_values_equal(self, left, right):
+    self.assertAllEqual(
+        self.evaluate(ops.convert_to_tensor(left)),
+        self.evaluate(ops.convert_to_tensor(right)))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAggregateTensors(self):
+    t0 = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
+    t1 = constant_op.constant([[0., 0.], [5, 6], [7., 8.]])
+    total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
+    result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1])
+    self._assert_values_equal(total, result)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAggregateIndexedSlices(self):
+    t0 = math_ops._as_indexed_slices(
+        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
+    t1 = math_ops._as_indexed_slices(
+        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
+    total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
+    result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1])
+    self.assertIsInstance(result, ops.IndexedSlices)
+    self._assert_values_equal(total, result)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDivideTensor(self):
+    t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
+    n = 2
+    expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]])
+    result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n)
+    self._assert_values_equal(expected, result)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDivideIndexedSlices(self):
+    t = math_ops._as_indexed_slices(
+        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
+    n = 2
+    expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]])
+    result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n)
+    self.assertIsInstance(result, ops.IndexedSlices)
+    self._assert_values_equal(expected, result)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testIsIndexedSlices(self):
+    t = math_ops._as_indexed_slices(
+        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
+    self.assertTrue(cross_device_utils.contains_indexed_slices(t))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testContainsIndexedSlices_List(self):
+    t0 = math_ops._as_indexed_slices(
+        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
+    t1 = math_ops._as_indexed_slices(
+        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
+    self.assertTrue(cross_device_utils.contains_indexed_slices([t0, t1]))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testContainsIndexedSlices_Tuple(self):
+    t0 = math_ops._as_indexed_slices(
+        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
+    t1 = math_ops._as_indexed_slices(
+        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
+    self.assertTrue(cross_device_utils.contains_indexed_slices((t0, t1)))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testContainsIndexedSlices_PerReplica(self):
+    t0 = math_ops._as_indexed_slices(
+        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
+    t1 = math_ops._as_indexed_slices(
+        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
+    per_replica = value_lib.PerReplica({"/gpu:0": t0, "/cpu:0": t1})
+    self.assertTrue(cross_device_utils.contains_indexed_slices(per_replica))
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      required_gpus=1))
+  def testCopyTensor(self):
+    with ops.device("/cpu:0"):
+      t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
+    destination = "/gpu:0"
+    result = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
+        t, destination)
+
+    self._assert_values_equal(t, result)
+    self.assertEqual(device_util.resolve(destination),
+                     device_util.resolve(result.device))
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      required_gpus=1))
+  def testCopyIndexedSlices(self):
+    with ops.device("/cpu:0"):
+      t = math_ops._as_indexed_slices(
+          constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
+    destination = "/gpu:0"
+    result = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
+        t, destination)
+
+    self.assertIsInstance(result, ops.IndexedSlices)
+    self._assert_values_equal(t, result)
+    self.assertEqual(device_util.resolve(destination),
+                     device_util.resolve(result.device))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
deleted file mode 100644
index e08ba9c2a668cd675defb025d7ad060e1338506b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ /dev/null
@@ -1,959 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Classes for different algorithms of reduction and broadcasting."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import six
-
-from tensorflow.contrib.distribute.python import cross_tower_utils
-from tensorflow.contrib.distribute.python import values as value_lib
-from tensorflow.python.client import device_lib
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import device_util
-
-
-def check_destinations(destinations):
-  """Checks whether `destinations` is not empty.
-
-  Args:
-    destinations: a DistributedValues, Variable, string or a list of strings.
-
-  Returns:
-    Boolean which is True if `destinations` is not empty.
-  """
-  # Calling bool() on a ResourceVariable is not allowed.
-  if isinstance(destinations, resource_variable_ops.ResourceVariable):
-    return bool(destinations.device)
-  return bool(destinations)
-
-
-def validate_destinations(destinations):
-  if not isinstance(
-      destinations,
-      (value_lib.DistributedValues, resource_variable_ops.ResourceVariable,
-       value_lib.AggregatingVariable, six.string_types, list)):
-    raise ValueError("destinations must be one of a `DistributedValues` object,"
-                     " a tf.Variable object, a device string, a list of device "
-                     "strings")
-
-  if not check_destinations(destinations):
-    raise ValueError("destinations can not be empty")
-
-
-def _make_tensor_into_per_device(input_tensor):
-  """Converts a single tensor into a PerDevice object."""
-  if isinstance(input_tensor, (tuple, list)):
-    raise ValueError("Cannot convert `input_tensor` to a `PerDevice` object, "
-                     "got %r but expected a object that is not a tuple or list."
-                     % (input_tensor,))
-  if isinstance(input_tensor, value_lib.PerDevice):
-    return input_tensor
-
-  try:
-    device = input_tensor.device
-  except AttributeError:
-    raise ValueError("Cannot convert `input_tensor` to a `PerDevice` object "
-                     "because it doesn't have device set.")
-
-  return value_lib.PerDevice({device: input_tensor})
-
-
-def _normalize_value_destination_pairs(value_destination_pairs):
-  """Converts each tensor into a PerDevice object in the input list."""
-  result = []
-  if not isinstance(value_destination_pairs, (list, tuple)):
-    raise ValueError("`value_destination_pairs` should be a list or tuple")
-  for pair in value_destination_pairs:
-    if not isinstance(pair, tuple):
-      raise ValueError(
-          "Each element of `value_destination_pairs` should be a tuple.")
-    if len(pair) != 2:
-      raise ValueError("Each element of `value_destination_pairs` should be a "
-                       "tuple of size 2.")
-
-    per_device = _make_tensor_into_per_device(pair[0])
-    result.append((per_device, pair[1]))
-  return result
-
-
-def _validate_value_destination_pairs(value_destination_pairs):
-  # TODO(yuefengz): raise exceptions instead of returning False.
-  # pylint: disable=g-missing-docstring
-  if not value_destination_pairs: return False
-  if not isinstance(value_destination_pairs, (list, tuple)): return False
-  if not all([isinstance(pair, tuple) for pair in value_destination_pairs]):
-    return False
-  if not all([isinstance(v[0], value_lib.PerDevice)
-              for v in value_destination_pairs]):
-    return False
-  return True
-
-
-# TODO(yuefengz): consider calling this function in the caller of CrossTowerOps.
-def get_devices_from(destinations):
-  if isinstance(destinations, value_lib.DistributedValues):
-    return list(destinations.devices)
-  elif isinstance(destinations, (resource_variable_ops.ResourceVariable,
-                                 value_lib.AggregatingVariable)):
-    return [destinations.device]
-  elif isinstance(destinations, six.string_types):
-    return [device_util.resolve(destinations)]
-  elif isinstance(destinations, (list, tuple)):
-    return [device_util.resolve(destination) for destination in destinations]
-  else:
-    return [destinations.device]
-
-
-def _devices_match(left, right):
-  return set(get_devices_from(left)) == set(get_devices_from(right))
-
-
-def _all_devices_match(value_destination_pairs):
-  if not all([_devices_match(v, d) for v, d in value_destination_pairs]):
-    return False
-  if not all([_devices_match(v, value_destination_pairs[0][0])
-              for v, _ in value_destination_pairs[1:]]):
-    return False
-  return True
-
-
-def _simple_broadcast(value, destinations):
-  index = {}
-  devices = get_devices_from(destinations)
-  for d in devices:
-    index[d] = cross_tower_utils.copy_tensor_or_indexed_slices_to_device(
-        value, d)
-  return value_lib.Mirrored(index)
-
-
-def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn,
-                   aggregation):
-  # pylint: disable=g-missing-docstring
-  all_values = []
-  count = 0
-  for v in per_device_value._index.values():  # pylint: disable=protected-access
-    if isinstance(v, value_lib.MapOutput):
-      v_list = v.get()
-      if not v_list:
-        continue
-      count += len(v_list)
-      # Sum within each device before aggregating across devices.
-      # TODO(yuefengz): Check whether it helps to use accumulation_fn here.
-      v = cross_tower_utils.aggregate_tensors_or_indexed_slices(
-          v_list, math_ops.add_n)
-    else:
-      count += 1
-    all_values.append(v)
-  if not all_values:
-    raise ValueError("`per_device_value` must be non-empty")
-
-  with ops.device(reduce_to_device):
-    with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-      reduced = cross_tower_utils.aggregate_tensors_or_indexed_slices(
-          all_values, accumulation_fn)
-      if aggregation == vs.VariableAggregation.MEAN:
-        reduced = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(
-            reduced, count)
-      elif aggregation != vs.VariableAggregation.SUM:
-        raise ValueError("`aggregation` must be VariableAggregation.SUM "
-                         "or VariableAggregation.MEAN.")
-  return reduced
-
-
-class CrossTowerOps(object):
-  """Base class for cross-tower reduction and broadcasting algorithms."""
-
-  def __init__(self):
-    pass
-
-  def reduce(self, aggregation, per_device_value, destinations):
-    """Reduce `per_device_value` to `destinations`.
-
-    It runs the reduction operation defined by `aggregation` and put the
-    result on `destinations`.
-
-    Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
-      per_device_value: a PerDevice object or a tensor with device set.
-      destinations: the reduction destinations.
-
-    Returns:
-      a Mirrored object.
-
-    Raises:
-      ValueError: if per_device_value is not a PerDevice object.
-    """
-    if not isinstance(per_device_value, value_lib.PerDevice):
-      per_device_value = _make_tensor_into_per_device(per_device_value)
-
-    validate_destinations(destinations)
-    return self._reduce(aggregation, per_device_value, destinations)
-
-  def batch_reduce(self, aggregation, value_destination_pairs):
-    """Reduce PerDevice objects in a batch.
-
-    Reduce each first element in `value_destination_pairs` to each second
-    element which indicates the destinations.
-
-    Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
-      value_destination_pairs: a list or a tuple of tuples of PerDevice objects
-        (or tensors with device set if there is one tower) and destinations.
-
-    Returns:
-      a list of Mirrored objects.
-
-    Raises:
-      ValueError: if `value_destination_pairs` is not a list or a tuple of
-        tuples of PerDevice objects and destinations
-    """
-    if not _validate_value_destination_pairs(value_destination_pairs):
-      # If the first element of each pair is a tensor, we try to turn it into a
-      # PerDevice object.
-      value_destination_pairs = _normalize_value_destination_pairs(
-          value_destination_pairs)
-
-    for _, d in value_destination_pairs:
-      validate_destinations(d)
-
-    return self._batch_reduce(aggregation, value_destination_pairs)
-
-  def broadcast(self, tensor, destinations):
-    """Broadcast the `tensor` to destinations.
-
-    Args:
-      tensor: the tensor to broadcast.
-      destinations: the broadcast destinations.
-
-    Returns:
-      a Mirrored object.
-    """
-    validate_destinations(destinations)
-    return self._broadcast(tensor, destinations)
-
-  def _reduce(self, aggregation, per_device_value, destinations):
-    raise NotImplementedError(
-        "_reduce method must be implemented in descendants.")
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    raise NotImplementedError(
-        "_batch_reduce method must be implemented in descendants.")
-
-  def _broadcast(self, tensor, destinations):
-    return _simple_broadcast(tensor, destinations)
-
-
-class ReductionToOneDeviceCrossTowerOps(CrossTowerOps):
-  """Always do reduction to one device first and then do broadcasting.
-
-    Batch reduction is done by reduction on each element one by one.
-  """
-
-  def __init__(self, reduce_to_device=None, accumulation_fn=math_ops.add_n):
-    """Constructor.
-
-    Args:
-      reduce_to_device: the intermediate device to reduce to. If None, reduce
-        to the first device in `destinations` of the reduce() method.
-      accumulation_fn: a function that does accumulation.
-    """
-    self.reduce_to_device = reduce_to_device
-    self.accumulation_fn = accumulation_fn
-    super(ReductionToOneDeviceCrossTowerOps, self).__init__()
-
-  def _reduce(self, aggregation, per_device_value, destinations):
-    if check_destinations(destinations):
-      devices = get_devices_from(destinations)
-    else:
-      devices = get_devices_from(per_device_value)
-    reduce_to_device = self.reduce_to_device or devices[0]
-    reduced = _simple_reduce(per_device_value, reduce_to_device,
-                             self.accumulation_fn, aggregation)
-    return self.broadcast(reduced, devices)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    return [
-        self._reduce(aggregation, t, destinations=v)
-        for t, v in value_destination_pairs
-    ]
-
-
-def _group_value_by_device(per_device_values):
-  """Group values into sublists by their devices.
-
-  This grouping is needed to call the all-reduce library because it expects a
-  list of the following form:
-    [[(grad0_gpu0, v0_gpu0), (grad1_gpu0, v1_gpu0), (grad2_gpu0, v2_gpu0) ...],
-     [(grad0_gpu1, v0_gpu1), (grad1_gpu1, v1_gpu1), (grad2_gpu1, v2_gpu1) ...],
-     [(grad0_gpu2, v0_gpu2), (grad1_gpu0, v1_gpu2), (grad2_gpu0, v2_gpu2) ...],
-     ...
-    ]
-
-  Args:
-    per_device_values: a list of PerDevice obejcts.
-
-  Returns:
-    a list of lists, each sublist has components for its corresponding device of
-      PerDevice objects, paired with a None.
-  """
-  destinations = per_device_values[0].devices
-  grouped = [[] for _ in range(len(destinations))]
-  for per_device_value in per_device_values:
-    # pylint: disable=protected-access
-    for i, v in enumerate(per_device_value._index.values()):
-      assert per_device_value.devices == destinations
-      grouped[i].append((v, None))
-  return grouped
-
-
-def _ungroup_and_make_mirrored(grouped_reduced,
-                               destinations,
-                               aggregation,
-                               num_between_graph_workers=1):
-  """Ungroup results from all-reduce and make Mirrored objects.
-
-  Each all-reduce result will be divided by the number of destinations before
-  Mirrored objects are created if aggregation is "mean".
-
-  Args:
-    grouped_reduced: a list of lists, each sublist has components for each
-      device, paired with a None. It is the result from
-      cross_tower_utils.aggregate_gradients_using*.
-    destinations: a list of device strings for returned Mirrored objects.
-    aggregation: Indicates how a variable will be aggregated. Accepted values
-      are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
-    num_between_graph_workers: number of workers in the between-graph
-      replication.
-
-  Returns:
-    a list of Mirrored objects.
-  """
-  index = [{} for _ in range(len(grouped_reduced[0]))]
-  for d, per_device_reduced in enumerate(grouped_reduced):
-    for i, (v, _) in enumerate(per_device_reduced):
-      if aggregation == vs.VariableAggregation.MEAN:
-        index[i][destinations[d]] = v / (
-            len(destinations) * num_between_graph_workers)
-      else:
-        index[i][destinations[d]] = v
-  return [value_lib.Mirrored(v) for v in index]
-
-
-class ConcatAndSplitPacker(object):
-  """Concatenate and split tensors for reduction."""
-
-  def __init__(self, num_packs=1):
-    """Initialize the ConcatAndSplitPacker object.
-
-    Args:
-      num_packs: specifies the number of split packs that will be
-        formed.
-
-    Raises:
-      ValueError: if num_packs is not greater than 0.
-    """
-    if num_packs <= 0:
-      raise ValueError("num_packs must be greater than zero.")
-    self.num_packs = num_packs
-
-  def pack(self, grouped_grads_and_vars):
-    """Pack tensors."""
-    self.grouped_grads_and_vars = grouped_grads_and_vars
-    self.all_tower_shapes = []
-    self.all_tower_sizes = []
-
-    device_grad_packs = []
-    for tower_grads_and_vars in grouped_grads_and_vars:
-      with ops.colocate_with(tower_grads_and_vars[0][0]):
-        # Flatten all the grads.
-        flat_grads = [
-            array_ops.reshape(g, [-1]) for g, _ in tower_grads_and_vars
-        ]
-        # Remember the original shape of all the grads.
-        tower_shapes = [array_ops.shape(g) for g, _ in tower_grads_and_vars]
-        # Remember the original sizes of all the grads.
-        tower_sizes = [array_ops.size(g) for g, _ in tower_grads_and_vars]
-        # Concat all the flat grads into a big flat tensor.
-        concat_grads = array_ops.concat(flat_grads, 0)
-
-        # Split the big tensor into num_splits packs. In cases where the
-        # total size is not divisible num_splits, the last pack gets
-        # more elements.
-        # TODO(zhengxq): it is also possible to optimize away all the concat
-        # as well.
-        num_splits = self.num_packs
-
-        # The array_ops.size function will sometimes remove static shapes. So if
-        # all gradient shapes are defined, we use another method to get the
-        # total size.
-        # TODO(yuefengz): move this logic to array_ops.size.
-        if all([g.shape.is_fully_defined() for g, _ in tower_grads_and_vars]):
-          total_grad_size = sum(
-              [g.shape.num_elements() for g, _ in tower_grads_and_vars])
-        else:
-          total_grad_size = array_ops.size(concat_grads)
-
-        split_size = total_grad_size // num_splits
-        split_size_last = total_grad_size - split_size * (num_splits - 1)
-        split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
-        grad_packs = array_ops.split(concat_grads, split_sizes)
-
-        # Ready to aggregate the repacked gradients, with fake variables.
-        # TODO(zhengxq): It is hacky to have to use fake variables.
-        # We should remove the need for variables in
-        # aggregate_gradients_using*.
-        device_grad_packs.append(zip(grad_packs, [None] * num_splits))
-        self.all_tower_shapes.append(tower_shapes)
-        self.all_tower_sizes.append(tower_sizes)
-
-    return device_grad_packs
-
-  def unpack(self, summed_device_grad_packs):
-    """Reverse the pack."""
-    aggregated_device_grads = []
-    for (summed_tower_grad_packs,
-         tower_grads_and_vars, tower_shapes, tower_sizes) in zip(
-             summed_device_grad_packs, self.grouped_grads_and_vars,
-             self.all_tower_shapes, self.all_tower_sizes):
-      # pylint: enable=line-too-long
-      # Reverse the packing operations in the previous steps. Form the
-      # summed gradients back into their original shapes.
-      with ops.colocate_with(summed_tower_grad_packs[0][0]):
-        # Form a list of the summed grad packs.
-        device_grad_packs = [g for g, _ in summed_tower_grad_packs]
-
-        # Concat them back into a big flat tensor.
-        device_grads_concat = array_ops.concat(device_grad_packs, 0)
-
-        # Split the tensors back into their original sizes.
-        grads_with_sizes = array_ops.split(device_grads_concat, tower_sizes)
-
-        # Reshape the tensors back into their original shapes.
-        grads_with_shapes = [
-            array_ops.reshape(grad, shape)
-            for shape, grad in zip(tower_shapes, grads_with_sizes)
-        ]
-
-        # Form the list with the original list of variables.
-        summed_tower_grads = [
-            (g, v) for g, (_, v) in zip(grads_with_shapes, tower_grads_and_vars)
-        ]
-        aggregated_device_grads.append(summed_tower_grads)
-    return aggregated_device_grads
-
-
-class AggregateSmallTensorPacker(object):
-  """Concatenate small gradient tensors together for reduction."""
-
-  def __init__(self,
-               agg_small_grads_max_bytes=1048576,
-               agg_small_grads_max_group=16):
-    """Initialize the AggregateSmallTensorPacker object.
-
-    Args:
-      agg_small_grads_max_bytes: largest tensor eligible for aggregation,
-        in number of bytes.
-      agg_small_grads_max_group: largest permitted aggregation of small
-        tensors.
-
-    Raises:
-      ValueError: if `agg_small_grads_max_bytes` or `agg_small_grads_max_group`
-        is not greater than 0.
-    """
-    if agg_small_grads_max_bytes <= 0 or agg_small_grads_max_group <= 0:
-      raise ValueError("agg_small_grads_max_bytes and agg_small_grads_max_group"
-                       " should both be greater than zero.")
-    self.agg_small_grads_max_bytes = agg_small_grads_max_bytes
-    self.agg_small_grads_max_group = agg_small_grads_max_group
-
-  def pack(self, grouped_grads_and_vars):
-    """Aggregate small tensors."""
-    if (self.agg_small_grads_max_bytes > 0 and
-        self.agg_small_grads_max_group > 0):
-      tower_grads, self.packing = cross_tower_utils.pack_small_tensors(
-          grouped_grads_and_vars,
-          max_bytes=self.agg_small_grads_max_bytes,
-          max_group=self.agg_small_grads_max_group)
-    return tower_grads
-
-  def unpack(self, summed_device_grad_packs):
-    """Reverse the aggregation process."""
-    return cross_tower_utils.unpack_small_tensors(summed_device_grad_packs,
-                                                  self.packing)
-
-
-def _pack_tensors(device_grads,
-                  num_packs=0,
-                  agg_small_grads_max_bytes=0,
-                  agg_small_grads_max_group=0):
-  """Pack tensors if specified."""
-  if num_packs > 0:
-    tensor_packer = ConcatAndSplitPacker(num_packs)
-    device_grad_packs = tensor_packer.pack(device_grads)
-  elif agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
-    tensor_packer = AggregateSmallTensorPacker(agg_small_grads_max_bytes,
-                                               agg_small_grads_max_group)
-    device_grad_packs = tensor_packer.pack(device_grads)
-  else:
-    tensor_packer = None
-    device_grad_packs = device_grads
-  return device_grad_packs, tensor_packer
-
-
-def _unpack_tensors(reduced, tensor_packer=None):
-  """Unpack tensors if they are packed before all-reduce."""
-  if tensor_packer:
-    return tensor_packer.unpack(reduced)
-  return reduced
-
-
-class AllReduceCrossTowerOps(CrossTowerOps):
-  """Reduction using all reduce."""
-
-  def __init__(self,
-               all_reduce_alg="nccl",
-               num_packs=1,
-               agg_small_grads_max_bytes=0,
-               agg_small_grads_max_group=10):
-    """All-reduce implementation of CrossTowerOps.
-
-    Before performing all-reduce, tensors will be repacked or aggregated for
-    more efficient cross-device transportation:
-      1) If `num_packs` is non-zero, pack values into
-        `num_packs` splits.
-      2) Otherwise, if `agg_small_grads_max_bytes` > 0 and
-        `agg_small_grads_max_group` > 0, aggregate values smaller than
-        `agg_small_grads_max_bytes` into groups with at most
-        `agg_small_grads_max_group` values.
-      3) Otherwise, no repacking or grouping will happen.
-
-    Args:
-      all_reduce_alg: the all-reduce algorithm to use, currently only "nccl" or
-        "hierarchical_copy" are supported.
-      num_packs: see above.
-      agg_small_grads_max_bytes: see above.
-      agg_small_grads_max_group: see above.
-        tensors.
-    """
-    self._all_reduce_alg = all_reduce_alg
-    self._num_packs = num_packs
-    self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
-    self._agg_small_grads_max_group = agg_small_grads_max_group
-    super(AllReduceCrossTowerOps, self).__init__()
-
-  def _reduce(self, aggregation, per_device_value, destinations):
-    contains_indexed_slices = cross_tower_utils.contains_indexed_slices(
-        per_device_value)
-    if (_devices_match(per_device_value, destinations)
-        and not context.executing_eagerly()
-        and not contains_indexed_slices):
-      return self._batch_all_reduce(aggregation, [per_device_value])[0]
-    else:
-      if contains_indexed_slices:
-        logging.log_first_n(
-            logging.WARN,
-            "Efficient allreduce is not supported for IndexedSlices.", 10)
-
-      if check_destinations(destinations):
-        devices = get_devices_from(destinations)
-      else:
-        devices = get_devices_from(per_device_value)
-      reduce_to_device = devices[0]
-      reduced = _simple_reduce(per_device_value, reduce_to_device,
-                               math_ops.add_n, aggregation)
-      return self.broadcast(reduced, devices)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    all_devices_match = _all_devices_match(value_destination_pairs)
-    contains_indexed_slices = cross_tower_utils.contains_indexed_slices(
-        value_destination_pairs)
-    if (all_devices_match and not context.executing_eagerly()
-        and not contains_indexed_slices):
-      return self._batch_all_reduce(aggregation,
-                                    [v[0] for v in value_destination_pairs])
-    else:
-      if not all_devices_match:
-        logging.log_first_n(logging.WARN,
-                            "Efficient batch_reduce is not supported if "
-                            "destinations are different.",
-                            10)
-
-      return [
-          self._reduce(aggregation, t, destinations=v)
-          for t, v in value_destination_pairs
-      ]
-
-  def _batch_all_reduce(self, aggregation, per_device_values):
-    """All reduce algorithm in a batch."""
-    logging.log_first_n(
-        logging.INFO, "batch_all_reduce invoked for batches size = %d with "
-        "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
-        "agg_small_grads_max_group = %d" %
-        (len(per_device_values), self._all_reduce_alg, self._num_packs,
-         self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
-    destinations = per_device_values[0].devices
-    grouped = _group_value_by_device(per_device_values)
-
-    device_grad_packs, tensor_packer = _pack_tensors(
-        grouped, self._num_packs, self._agg_small_grads_max_bytes,
-        self._agg_small_grads_max_group)
-
-    # The actual aggregation of the repacked gradients. Note that they are
-    # sharded among different aggregation trees. So it is important to strike
-    # the balance on num_splits.
-    if self._all_reduce_alg == "nccl":
-      # TODO(yuefengz): merge this into the all-reduce library.
-      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
-          device_grad_packs)
-    else:
-      # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
-      # order.
-      reduced = (
-          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
-              destinations, device_grad_packs))
-
-    reduced = _unpack_tensors(reduced, tensor_packer)
-    return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
-                                      aggregation)
-
-
-AllReduceSpecTuple = collections.namedtuple("AllReduceSpecTuple",
-                                            "alg shards limit")
-
-
-class MultiWorkerAllReduce(AllReduceCrossTowerOps):
-  """All-reduce algorithms for distributed TensorFlow."""
-
-  def __init__(self,
-               worker_devices,
-               num_gpus_per_worker,
-               all_reduce_spec=("pscpu/pscpu", 2, -1),
-               num_packs=0,
-               agg_small_grads_max_bytes=0,
-               agg_small_grads_max_group=10):
-    """Initialize the all-reduce algorithm.
-
-    Args:
-      worker_devices: a list of device strings for workers participating in
-        all-reduce.
-      num_gpus_per_worker: number of GPU devices per worker.
-      all_reduce_spec: a tuple or a named tuple or a list of tuples specifying
-        the all-reduce algorithm.
-        1. The first element of a tuple is the name of the all-reduce algorithm.
-        Valid algorithm names are: "nccl", "nccl/xring", "nccl/rechd",
-        "nccl/pscpu", "xring", "pscpu", "psgpu", "pscpu/pscpu". Algorithms with
-        a "/" are hierarchical, so two all-reduces are executed, the first one
-        aggregates tensors within a worker and the second aggregates across
-        workers.
-        2. The second element of a tuple is the number of shards when doing
-        all-reduce. Let's say its values is M, each tensor after packing will be
-        split into M shards and then M parallel all-reduces would be performed
-        before finally they are concatenated backed into a complete tensor.
-        3. The third element is the maximum size of tensors that will be
-        applicable for the algorithm specified by the first element. For
-        example, if all_reduce_spec=[("nccl", 2, 1024), ("pscpu/pscpu", 2, -1)],
-        tensors with size not larger than 1024 bytes will be applied a 2-shard
-        "nccl" all-reduce and other tensors will be applied a 2-shard
-        "pscpu/pscpu" algorithm. The third elements should be in increasing
-        order across tuples and end with -1 which indicates infinity.
-      num_packs: see AllReduceCrossTowerOps.
-      agg_small_grads_max_bytes: see AllReduceCrossTowerOps.
-      agg_small_grads_max_group: see AllReduceCrossTowerOps.
-    """
-    self._worker_devices = worker_devices
-    self._num_gpus_per_worker = num_gpus_per_worker
-    super(MultiWorkerAllReduce, self).__init__(
-        num_packs=num_packs,
-        agg_small_grads_max_bytes=agg_small_grads_max_bytes,
-        agg_small_grads_max_group=agg_small_grads_max_group)
-
-    def validate_and_complete_spec(spec):
-      """Validate and complete the all-reduce spec."""
-      # TODO(yuefengz): support namedtuple.
-      if not isinstance(spec, tuple):
-        raise ValueError(
-            "A tuple is expected for all-reduce spec: %r" % all_reduce_spec)
-      if not spec or len(spec) > 3:
-        raise ValueError(
-            "Too many elements in the all-reduce spec tuple: %r" % spec)
-      if len(spec) == 1:
-        return AllReduceSpecTuple(spec[0], 1, -1)
-      elif len(spec) == 2:
-        return AllReduceSpecTuple(spec[0], spec[1], -1)
-      else:
-        return AllReduceSpecTuple(*spec)
-
-    self._all_reduce_spec = []
-    if isinstance(all_reduce_spec, six.string_types):
-      self._all_reduce_spec.append(AllReduceSpecTuple(all_reduce_spec, 1, -1))
-    elif isinstance(all_reduce_spec, tuple):
-      self._all_reduce_spec.append(validate_and_complete_spec(all_reduce_spec))
-    elif isinstance(all_reduce_spec, list):
-      self._all_reduce_spec = [
-          validate_and_complete_spec(spec) for spec in all_reduce_spec
-      ]
-
-  def _batch_all_reduce(self, aggregation, per_device_values):
-    """All reduce algorithm in a batch."""
-    logging.log_first_n(
-        logging.INFO,
-        "distributed batch_all_reduce invoked for batches size = %d with "
-        "allreduce_spec = %r, num_packs = %d, agg_small_grads_max_bytes = %d "
-        "and agg_small_grads_max_group = %d" %
-        (len(per_device_values), self._all_reduce_spec, self._num_packs,
-         self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
-
-    destinations = sorted(per_device_values[0].devices)
-    device_grads = _group_value_by_device(per_device_values)
-
-    # The all reduce library requires fully defined shapes.
-    # TODO(yuefengz): when tensor sharding is not needed, static shapes are not
-    # required as well.
-    for device_grad in device_grads:
-      for grad, _ in device_grad:
-        if not grad.shape.is_fully_defined():
-          raise ValueError("Shape is unknown for node %r" % grad)
-
-    remaining_grads = device_grads
-    aggregated_grads = []
-    for spec_tuple in self._all_reduce_spec:
-      if spec_tuple.limit < 0:
-        this_grads = remaining_grads
-        remaining_grads = []
-      else:
-        (this_grads, remaining_grads) = cross_tower_utils.split_grads_by_size(
-            spec_tuple.limit, remaining_grads)
-      if this_grads:
-        device_grad_packs, tensor_packer = _pack_tensors(
-            this_grads, self._num_packs, self._agg_small_grads_max_bytes,
-            self._agg_small_grads_max_group)
-        range_agg_grads = cross_tower_utils.sum_gradients_all_reduce(
-            self._worker_devices, device_grad_packs, len(self._worker_devices),
-            spec_tuple.alg, spec_tuple.shards, range(self._num_gpus_per_worker))
-        range_agg_grads = _unpack_tensors(range_agg_grads, tensor_packer)
-
-        if not aggregated_grads:
-          aggregated_grads = range_agg_grads
-        else:
-          assert len(aggregated_grads) == len(range_agg_grads)
-          for i in range(len(aggregated_grads)):
-            aggregated_grads[i] += range_agg_grads[i]
-    assert not remaining_grads
-
-    return _ungroup_and_make_mirrored(aggregated_grads, destinations,
-                                      aggregation)
-
-
-# TODO(yuefengz): support in-graph collective all-reduce.
-class CollectiveAllReduce(CrossTowerOps):
-  """All-reduce cross tower ops using collective ops.
-
-  In the between-graph replicated training, it will still do all-reduces across
-  all workers and then put results on the right destinations.
-  """
-
-  def __init__(self,
-               num_workers=1,
-               num_gpus_per_worker=0,
-               all_reduce_merge_scope=32,
-               collective_keys=None):
-    """Initializes the object.
-
-    Args:
-      num_workers: number of workers in the between-graph replicated training.
-      num_gpus_per_worker: number of GPUs per worker.
-      all_reduce_merge_scope: size of groups into which to partition consecutive
-        gradients grouped under a common 'allreduce' name scope. This is useful
-        for some optimization of collective ops.
-      collective_keys: an optional CollectiveKey object.
-    """
-    self._num_workers = num_workers
-    self._num_gpus_per_worker = num_gpus_per_worker
-    self._all_reduce_merge_scope = all_reduce_merge_scope
-    self._collective_keys = collective_keys or cross_tower_utils.CollectiveKeys(
-    )
-    super(CollectiveAllReduce, self).__init__()
-
-  # TODO(yuefengz, tucker): is indexed slices supported by collective ops?
-  def _reduce(self, aggregation, per_device_value, destinations):
-    if cross_tower_utils.contains_indexed_slices(per_device_value):
-      raise ValueError(
-          "`IndexSlices` is not supported for Collective All-Reduce.")
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution is not supported for Collective All-Reduce")
-
-    all_reduced = self._batch_all_reduce(aggregation, [per_device_value])[0]
-    if _devices_match(per_device_value, destinations):
-      return all_reduced
-    else:
-      index = {}
-      for d in get_devices_from(destinations):
-        # pylint: disable=protected-access
-        if d in all_reduced._index:
-          index[d] = all_reduced._index[d]
-        else:
-          with ops.control_dependencies(list(
-              all_reduced._index.values())), ops.device(d):
-            index[d] = array_ops.identity(list(all_reduced._index.values())[0])
-
-      return value_lib.Mirrored(index)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    if cross_tower_utils.contains_indexed_slices(value_destination_pairs):
-      raise ValueError(
-          "`IndexSlices` is not supported for Collective All-Reduce.")
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution is not supported for Collective All-Reduce")
-
-    all_devices_match = _all_devices_match(value_destination_pairs)
-    if all_devices_match:
-      return self._batch_all_reduce(aggregation,
-                                    [v[0] for v in value_destination_pairs])
-    else:
-      if not all_devices_match:
-        logging.log_first_n(
-            logging.WARN, "Efficient batch_reduce is not supported if "
-            "destinations are different.", 10)
-
-      return [
-          self._reduce(aggregation, t, destinations=v)
-          for t, v in value_destination_pairs
-      ]
-
-  def _batch_all_reduce(self, aggregation, per_device_values):
-    """All-reduce across all workers in a batch."""
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution with collective ops is not supported yet.")
-
-    logging.log_first_n(
-        logging.INFO, "Collective All-reduce invoked with batches size = %d, "
-        "num_workers = %d" % (len(per_device_values), self._num_workers), 10)
-
-    grouped_by_tower = _group_value_by_device(per_device_values)
-
-    grouped_by_var = list(zip(*grouped_by_tower))
-    # grouped_by_var is grouped by variables and takes the following format:
-    # [((grad0_gpu0, v0_gpu0), (grad0_gpu1, v0_gpu1), (grad0_gpu2, v0_gpu2) ..),
-    #  ((grad1_gpu0, v1_gpu0), (grad1_gpu1, v1_gpu1), (grad1_gpu0, v1_gpu2) ..),
-    #  ((grad2_gpu0, v2_gpu0), (grad2_gpu1, v2_gpu1), (grad2_gpu0, v2_gpu2) ..),
-    #  ...
-    # ]
-    chunked_gv = [
-        grouped_by_var[x:x + self._all_reduce_merge_scope]
-        for x in range(0, len(grouped_by_var), self._all_reduce_merge_scope)
-    ]
-
-    reduced_gv_list = []
-    for chunk in chunked_gv:
-      with ops.name_scope("allreduce"):
-        for grad_and_vars in chunk:
-          scaled_grads = [g for g, _ in grad_and_vars]
-          collective_reduced = cross_tower_utils.build_collective_reduce(
-              scaled_grads, self._num_workers, self._collective_keys, "Add",
-              "Id")
-          result = []
-          for (_, v), g in zip(grad_and_vars, collective_reduced):
-            result.append([g, v])
-          reduced_gv_list.append(result)
-
-    new_tower_grads = [list(x) for x in zip(*reduced_gv_list)]
-    return _ungroup_and_make_mirrored(
-        new_tower_grads,
-        per_device_values[0].devices,
-        aggregation,
-        num_between_graph_workers=self._num_workers)
-
-
-_dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
-               [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
-
-
-def _has_dgx1_like_links(gpu_links):
-  if not gpu_links:
-    return False
-  # TODO(yuefengz): figure out the right topology for hierarchial copy if
-  # number of gpus are less than 8.
-  if len(gpu_links) < 8:
-    return False
-  for i, (gpu_link, dgx1_link) in enumerate(zip(gpu_links, _dgx1_links)):
-    if (set(gpu_link) != set(dgx1_link) and
-        set(gpu_link) != set(dgx1_link + [i])):
-      return False
-  return True
-
-
-def _choose_all_reduce_algorithm(device_links):
-  if _has_dgx1_like_links(device_links):
-    logging.info("Configured hierarchical_copy with num_packs=%d",
-                 len(device_links))
-    return AllReduceCrossTowerOps(
-        "hierarchical_copy", num_packs=len(device_links))
-  else:
-    logging.info("Configured nccl all-reduce.")
-    return AllReduceCrossTowerOps("nccl", num_packs=1)
-
-
-def choose_the_best(devices, session_config=None):
-  """Find the best subclass of CrossTowerOps given a tensorflow session.
-
-  Args:
-    devices: a list of devices passed for distribute strategy.
-    session_config: a tensorflow session config or None. If None, it will make
-      deciesion based on all local devices.
-
-  Returns:
-    a subclass of CrossTowerOps.
-  """
-  requested_devices = set([device_util.canonicalize(d) for d in devices])
-  machine_devices = device_lib.list_local_devices(session_config=session_config)
-  using_devices = []
-  for d in machine_devices:
-    if device_util.canonicalize(d.name) in requested_devices:
-      using_devices.append(d)
-    else:
-      logging.info(
-          "Device is available but not used by distribute strategy: %s", d.name)
-
-  if len(using_devices) != len(requested_devices):
-    logging.warning("Not all devices in distribute strategy are visible by "
-                    "TensorFlow sessions.")
-    return ReductionToOneDeviceCrossTowerOps()
-
-  if any([d.device_type.lower() != "gpu" for d in using_devices]):
-    logging.warning("Not all devices in DistributionStrategy are visible to "
-                    "TensorFlow session.")
-    return ReductionToOneDeviceCrossTowerOps()
-
-  device_links = [[] for _ in range(len(using_devices))]
-  for i, device in enumerate(using_devices):
-    for link in device.locality.links.link:
-      device_links[i].append(link.device_id)
-
-  return _choose_all_reduce_algorithm(device_links)
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
deleted file mode 100644
index 490371477a1b43551c4b4d8768c96d60e5f2c6d8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
+++ /dev/null
@@ -1,564 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for CrossTowerOps."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import itertools
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import cross_tower_utils
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.contrib.distribute.python import values as value_lib
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.training import device_util
-
-
-def _make_per_device(values, devices, regroup=False):
-  devices = cross_tower_ops_lib.get_devices_from(devices)
-  assert len(values) == len(devices)
-
-  # We simulate the result of regroup called on PerDevice which strips the
-  # PerDevice wrapper if it has only one value.
-  if len(values) == 1 and regroup:
-    with ops.device(devices[0]):
-      placed_v = array_ops.identity(values[0])
-    return placed_v
-
-  index = {}
-  for d, v in zip(devices, values):
-    with ops.device(d):
-      placed_v = array_ops.identity(v)
-    index[d] = placed_v
-  return value_lib.PerDevice(index)
-
-
-# pylint: disable=g-doc-args,g-doc-return-or-yield
-def _fake_mirrored(value, devices):
-  """Create a faked Mirrored object for testing.
-
-  All components of the returned Mirrored have the same objects, which is not
-  true in reality.
-  """
-  devices = cross_tower_ops_lib.get_devices_from(devices)
-  return value_lib.Mirrored(
-      {d: v for d, v in zip(devices, [value] * len(devices))})
-
-
-def _make_indexed_slices(values, indices, dense_shape, device):
-  with ops.device(device):
-    tensor = ops.IndexedSlices(
-        values=constant_op.constant(values),
-        indices=constant_op.constant(indices),
-        dense_shape=constant_op.constant(dense_shape))
-  return tensor
-
-
-def _make_mirrored_indexed_slices(devices, values, indices, dense_shape):
-  return value_lib.Mirrored({
-      d: _make_indexed_slices(values, indices, dense_shape, d) for d in devices
-  })
-
-
-_cpu_device = "/device:CPU:0"
-
-
-class CrossTowerOpsTestBase(test.TestCase, parameterized.TestCase):
-
-  def _assert_indexed_slices_equal(self, left, right):
-    self.assertIsInstance(left, ops.IndexedSlices)
-    self.assertIsInstance(right, ops.IndexedSlices)
-    self.assertEqual(device_util.resolve(left.device),
-                     device_util.resolve(right.device))
-    self.assertAllEqual(
-        self.evaluate(ops.convert_to_tensor(left)),
-        self.evaluate(ops.convert_to_tensor(right)))
-
-  def _assert_values_equal(self, left, right):
-    if isinstance(left, list):
-      for l, r in zip(left, right):
-        self._assert_values_equal(l, r)
-    else:
-      self.assertEqual(type(left), type(right))
-      self.assertEqual(set(left.devices), set(right.devices))
-      if isinstance(list(left._index.values())[0], ops.IndexedSlices):
-        for (d, v) in left._index.items():
-          self._assert_indexed_slices_equal(v, right._index[d])
-      elif context.executing_eagerly():
-        self.assertEqual([v.numpy() for v in left._index.values()],
-                         list(right._index.values()))
-      else:
-        with self.test_session() as sess:
-          self.assertEqual(
-              sess.run(list(left._index.values())), list(right._index.values()))
-
-  def _testReductionAndBroadcast(self, cross_tower_ops, distribution):
-    devices = distribution.worker_devices
-
-    values = [constant_op.constant(float(d)) for d in range(len(devices))]
-    per_device = _make_per_device(values, devices)
-    mean = (len(devices) - 1.) / 2.
-
-    values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))]
-    per_device_2 = _make_per_device(values_2, devices)
-    mean_2 = mean + 1.
-
-    destination_mirrored = _fake_mirrored(1., devices)
-    destination_different = _fake_mirrored(1., _cpu_device)
-    destination_str = _cpu_device
-    destination_list = devices
-
-    all_destinations = [
-        destination_mirrored, destination_different, destination_str,
-        destination_list
-    ]
-
-    # test reduce()
-    for destinations in all_destinations:
-      self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.MEAN,
-              per_device,
-              destinations=destinations),
-          _fake_mirrored(mean, destinations))
-      self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.MEAN,
-              per_device_2,
-              destinations=destinations),
-          _fake_mirrored(mean_2, destinations))
-      self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.SUM, per_device,
-              destinations=destinations),
-          _fake_mirrored(mean * len(devices), destinations))
-      self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.SUM,
-              per_device_2,
-              destinations=destinations),
-          _fake_mirrored(mean_2 * len(devices), destinations))
-
-    # test batch_reduce()
-    for d1, d2 in itertools.product(all_destinations, all_destinations):
-      self._assert_values_equal(
-          cross_tower_ops.batch_reduce(vs.VariableAggregation.MEAN,
-                                       [(per_device, d1), (per_device_2, d2)]),
-          [
-              _fake_mirrored(mean, d1),
-              _fake_mirrored(mean_2, d2)
-          ])
-      self._assert_values_equal(
-          cross_tower_ops.batch_reduce(vs.VariableAggregation.SUM,
-                                       [(per_device, d1), (per_device_2, d2)]),
-          [
-              _fake_mirrored(mean * len(devices), d1),
-              _fake_mirrored(mean_2 * len(devices), d2)
-          ])
-
-    # test broadcast()
-    for destinations in all_destinations:
-      self._assert_values_equal(
-          cross_tower_ops.broadcast(constant_op.constant(1.), destinations),
-          _fake_mirrored(1., destinations))
-
-
-class SingleWorkerCrossTowerOpsTest(CrossTowerOpsTestBase):
-  # TODO(yuefengz): decouple the num_gpus check from distribution in
-  # combinations module so that we can pass in devices instead of a distribution
-  # strategy.
-  reduction_to_one_combinations = combinations.combine(
-      cross_tower_ops=[
-          combinations.NamedObject(
-              "DefaultReductionToOneDeviceCrossTowerOps",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps()),
-          combinations.NamedObject(
-              "ReductionToCPUDeviceCrossTowerOps",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
-                  reduce_to_device=_cpu_device)),
-          combinations.NamedObject(
-              "AccumulateNCrossTowerOp",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
-                  accumulation_fn=math_ops.accumulate_n)),
-      ],
-      distribution=[
-          combinations.one_device_strategy,
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.mirrored_strategy_with_two_gpus
-      ],
-      mode=["graph", "eager"])
-  allreduce_combinations = combinations.combine(
-      cross_tower_ops=[
-          combinations.NamedObject(
-              "AllReduce",
-              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 1, 0, 0)),
-          combinations.NamedObject(
-              "HierarchicalCopy",
-              cross_tower_ops_lib.AllReduceCrossTowerOps(
-                  "hierarchical_copy", 8, 0, 0)),
-          combinations.NamedObject(
-              "AllReduceNoGradientRepacking",
-              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 0, 0, 0)),
-          combinations.NamedObject(
-              "HierarchicalCopyAggregateSmallTensors",
-              cross_tower_ops_lib.AllReduceCrossTowerOps(
-                  "hierarchical_copy", 0, 100, 10))
-      ],
-      distribution=[combinations.mirrored_strategy_with_two_gpus],
-      mode=["graph", "eager"])
-
-  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
-  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
-    with distribution.scope():
-      self._testReductionAndBroadcast(cross_tower_ops, distribution)
-
-  def testChooseAlgorithm(self):
-    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
-                    [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)
-    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
-    self.assertEqual(result._num_packs, 8)
-
-    # if there are only 4 devices
-    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)
-    self.assertEqual(result._all_reduce_alg, "nccl")
-    self.assertEqual(result._num_packs, 1)
-
-    # if devices links contain each device itself
-    device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
-                    [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
-                    [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)
-    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
-    self.assertEqual(result._num_packs, 8)
-
-    # if not dgx1-like links
-    device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
-                    [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)
-    self.assertEqual(result._all_reduce_alg, "nccl")
-    self.assertEqual(result._num_packs, 1)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      required_gpus=1))
-  def testSimpleReduceWithIndexedSlices(self):
-    devices = ["/cpu:0", "/gpu:0"]
-    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
-    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
-    per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1})
-    result = cross_tower_ops_lib._simple_reduce(
-        per_device, devices[0], math_ops.add_n, vs.VariableAggregation.SUM)
-
-    # Test that the result is semantically equal to both the concatenated
-    # IndexedSlices with and without duplicate indices.
-    total_with_dups = _make_indexed_slices(
-        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
-    total_without_dups = _make_indexed_slices(
-        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
-    self._assert_indexed_slices_equal(total_with_dups, result)
-    self._assert_indexed_slices_equal(total_without_dups, result)
-
-  @combinations.generate(
-      combinations.combine(
-          cross_tower_ops_instance=[
-              combinations.NamedObject(
-                  "ReductionToOneDeviceCrossTowerOps",
-                  cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps()),
-              combinations.NamedObject(
-                  "AllReduceCrossTowerOps",
-                  cross_tower_ops_lib.AllReduceCrossTowerOps())
-          ],
-          aggregation=[vs.VariableAggregation.SUM, vs.VariableAggregation.MEAN],
-          batch_reduce=[True, False],
-          mode=["graph", "eager"],
-          required_gpus=1))
-  def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, aggregation,
-                                 batch_reduce):
-    devices = ["/cpu:0", "/gpu:0"]
-    dense_shape = [5, 2]
-    t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0])
-    t1 = _make_indexed_slices(
-        [[3., 4.], [5., 6.]], [1, 3], dense_shape, devices[1])
-    per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1})
-
-    if batch_reduce:
-      result = cross_tower_ops_instance.batch_reduce(aggregation,
-                                                     [(per_device, devices)])
-    else:
-      result = cross_tower_ops_instance.reduce(aggregation, per_device, devices)
-
-    total_indices_with_dups = [1, 1, 3]
-    total_indices_without_dups = [1, 3]
-
-    if aggregation == vs.VariableAggregation.SUM:
-      total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]]
-      total_values_without_dups = [[4., 6.], [5., 6.]]
-    else:
-      assert aggregation == vs.VariableAggregation.MEAN
-      total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]]
-      total_values_without_dups = [[2., 3.], [2.5, 3.]]
-
-    total_mirrored_with_dups = _make_mirrored_indexed_slices(
-        devices, total_values_with_dups, total_indices_with_dups, dense_shape)
-    total_mirrored_without_dups = _make_mirrored_indexed_slices(
-        devices, total_values_without_dups, total_indices_without_dups,
-        dense_shape)
-
-    # Test that the result is semantically equal to both the concatenated
-    # IndexedSlices, as well as when the duplicate indices are summed up.
-    if batch_reduce:
-      total_mirrored_with_dups = [total_mirrored_with_dups]
-      total_mirrored_without_dups = [total_mirrored_without_dups]
-
-    self._assert_values_equal(total_mirrored_with_dups, result)
-    self._assert_values_equal(total_mirrored_without_dups, result)
-
-
-class MultiWorkerCrossTowerOpsTest(multi_worker_test_base.MultiWorkerTestBase,
-                                   CrossTowerOpsTestBase):
-
-  worker_devices = [
-      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
-  ]
-  multi_worker_allreduce_combinations = combinations.combine(
-      cross_tower_ops=[
-          combinations.NamedObject(
-              "MultiWorkerAllReduce",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
-                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
-          combinations.NamedObject(
-              "MultiWorkerAllReducePack",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
-                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
-          combinations.NamedObject(
-              "MultiWorkerAllReduceAggregation",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
-                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
-          combinations.NamedObject(
-              "MultiWorkerAllReduceMultipleSpecs",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
-                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
-                                      ("xring", 2, -1)], 0, 0, 0)),
-      ],
-      distribution=[
-          combinations.NamedDistribution(
-              "MirroredCPU",
-              lambda: mirrored_strategy.MirroredStrategy(num_gpus=0),
-              required_gpus=0),
-          combinations.NamedDistribution(
-              "Mirrored1GPU",
-              lambda: mirrored_strategy.MirroredStrategy(num_gpus=1),
-              required_gpus=1),
-          combinations.NamedDistribution(
-              "Mirrored2GPUs",
-              lambda: mirrored_strategy.MirroredStrategy(num_gpus=2),
-              required_gpus=2),
-      ],
-      mode=["graph"])
-
-  @combinations.generate(multi_worker_allreduce_combinations)
-  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
-    distribution.configure(cluster_spec={
-        "worker":
-            ["/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"]
-    })
-    with distribution.scope():
-      self._testReductionAndBroadcast(cross_tower_ops, distribution)
-
-
-class MultiWorkerCollectiveAllReduceTest(
-    multi_worker_test_base.MultiWorkerTestBase, parameterized.TestCase):
-
-  collective_key_base = 100000
-
-  @classmethod
-  def setUpClass(cls):
-    """Create a local cluster with 2 workers."""
-    cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
-        num_workers=3, num_ps=0)
-
-  def setUp(self):
-    super(MultiWorkerCollectiveAllReduceTest, self).setUp()
-    # Reusing keys are not supported well. So we have to give a different
-    # collective key base for different tests.
-    MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000
-
-  def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
-    collective_keys = cross_tower_utils.CollectiveKeys(
-        group_key_start=10 * num_gpus +
-        MultiWorkerCollectiveAllReduceTest.collective_key_base,
-        instance_key_start=num_gpus * 100 +
-        MultiWorkerCollectiveAllReduceTest.collective_key_base,
-        instance_key_with_id_start=num_gpus * 10000 +
-        MultiWorkerCollectiveAllReduceTest.collective_key_base)
-    if local_mode:
-      collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
-          1, num_gpus, collective_keys=collective_keys)
-      if num_gpus:
-        devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
-      else:
-        devices = ["/device:CPU:0"]
-      return collective_all_reduce_ops, devices, ""
-    else:
-      collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
-          3, num_gpus, collective_keys=collective_keys)
-      if num_gpus:
-        devices = [
-            "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i)
-            for i in range(num_gpus)
-        ]
-      else:
-        devices = ["/job:%s/task:%d" % (task_type, task_id)]
-      return (collective_all_reduce_ops, devices,
-              "grpc://" + self._cluster_spec[task_type][task_id])
-
-  def _assert_values_equal(self, left, right, sess):
-    if isinstance(left, list):
-      for l, r in zip(left, right):
-        self._assert_values_equal(l, r, sess)
-    else:
-      self.assertEqual(type(left), type(right))
-      self.assertEqual(set(left.devices), set(right.devices))
-
-      run_options = config_pb2.RunOptions()
-      run_options.experimental.collective_graph_key = 6
-
-      left_values = np.array(
-          sess.run(list(left._index.values()), options=run_options)).flatten()
-      right_values = np.array(list(right._index.values())).flatten()
-      self.assertEqual(len(left_values), len(right_values))
-      for l, r in zip(left_values, right_values):
-        self.assertEqual(l, r)
-
-  def _test_reduction(self, task_type, task_id, num_gpus, local_mode=False):
-    collective_all_reduce, devices, master_target = self._get_test_objects(
-        task_type, task_id, num_gpus, local_mode=local_mode)
-    if local_mode:
-      num_workers = 1
-      worker_device = None
-    else:
-      num_workers = len(self._cluster_spec.get("chief", [])) + len(
-          self._cluster_spec.get("worker", []))
-      worker_device = "/job:%s/task:%d" % (task_type, task_id)
-    with ops.Graph().as_default(), \
-         ops.device(worker_device), \
-         self.test_session(target=master_target) as sess:
-      # Collective ops doesn't support scalar tensors, so we have to construct
-      # 1-d tensors.
-      values = [constant_op.constant([float(d)]) for d in range(len(devices))]
-      per_device = _make_per_device(values, devices, regroup=True)
-      mean = np.array([(len(devices) - 1.) / 2.])
-
-      values_2 = [constant_op.constant([d + 1.0]) for d in range(len(devices))]
-      per_device_2 = _make_per_device(values_2, devices)
-      mean_2 = np.array([mean[0] + 1.])
-
-      destination_mirrored = _fake_mirrored(1., devices)
-      destination_different = _fake_mirrored(1., _cpu_device)
-      destination_str = _cpu_device
-      destination_list = devices
-
-      all_destinations = [
-          destination_different, destination_mirrored, destination_str,
-          destination_list
-      ]
-
-      # test reduce()
-      for destinations in all_destinations:
-        self._assert_values_equal(
-            collective_all_reduce.reduce(
-                vs.VariableAggregation.MEAN,
-                per_device,
-                destinations=destinations),
-            _fake_mirrored(mean, destinations), sess)
-        self._assert_values_equal(
-            collective_all_reduce.reduce(
-                vs.VariableAggregation.MEAN,
-                per_device_2,
-                destinations=destinations),
-            _fake_mirrored(mean_2, destinations), sess)
-        self._assert_values_equal(
-            collective_all_reduce.reduce(
-                vs.VariableAggregation.SUM,
-                per_device,
-                destinations=destinations),
-            _fake_mirrored(mean * len(devices) * num_workers, destinations),
-            sess)
-        self._assert_values_equal(
-            collective_all_reduce.reduce(
-                vs.VariableAggregation.SUM,
-                per_device_2,
-                destinations=destinations),
-            _fake_mirrored(mean_2 * len(devices) * num_workers, destinations),
-            sess)
-
-      # test batch_reduce()
-      for d1, d2 in itertools.product(all_destinations, all_destinations):
-        self._assert_values_equal(
-            collective_all_reduce.batch_reduce(vs.VariableAggregation.MEAN,
-                                               [(per_device, d1),
-                                                (per_device_2, d2)]),
-            [
-                _fake_mirrored(mean, d1),
-                _fake_mirrored(mean_2, d2)
-            ], sess)
-        self._assert_values_equal(
-            collective_all_reduce.batch_reduce(vs.VariableAggregation.SUM,
-                                               [(per_device, d1),
-                                                (per_device_2, d2)]),
-            [
-                _fake_mirrored(mean * len(devices) * num_workers, d1),
-                _fake_mirrored(mean_2 * len(devices) * num_workers, d2)
-            ], sess)
-
-    return True
-
-  @combinations.generate(
-      combinations.combine(mode=["graph"], num_gpus=[0, 1, 2], required_gpus=1))
-  def testReductionDistributed(self, num_gpus):
-    if context.num_gpus() < num_gpus:
-      return
-    self._run_between_graph_clients(self._test_reduction, self._cluster_spec,
-                                    num_gpus)
-
-  # Collective ops doesn't support strategy with one device.
-  def testReductionLocal(self, num_gpus=2):
-    if context.num_gpus() < num_gpus:
-      return
-    self._test_reduction(None, None, num_gpus, local_mode=True)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/contrib/distribute/python/cross_tower_utils.py
deleted file mode 100644
index 9fc1b8895516f64a956accd9290e7bf42ccef330..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/cross_tower_utils.py
+++ /dev/null
@@ -1,671 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for cross_tower_ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections as pycoll
-import threading
-
-from tensorflow.contrib import nccl
-from tensorflow.contrib.all_reduce.python import all_reduce
-from tensorflow.contrib.distribute.python import values as value_lib
-from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import collective_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import math_ops
-
-
-def aggregate_gradients_using_nccl(tower_grads):
-  """Aggregate gradients using nccl allreduce."""
-  agg_all_g_and_v = []
-  for single_g_and_v in zip(*tower_grads):
-    single_grads = [g for g, _ in single_g_and_v]
-    agg_grads = nccl.all_sum(single_grads)
-    agg_all_g_and_v.append(
-        [(g, v) for g, (_, v) in zip(agg_grads, single_g_and_v)])
-
-  agg_all_g_and_v = list(zip(*agg_all_g_and_v))
-
-  return agg_all_g_and_v
-
-
-def aggregate_gradients_using_hierarchical_copy(avail_devices, tower_grads):
-  """Aggregate gradients using hierarchical copies.
-
-  Args:
-    avail_devices: available GPU devices.
-    tower_grads: List of lists of (gradient, variable) tuples. The outer list
-      is over towers. The inner list is over individual gradients.
-
-  Returns:
-    The list of (aggregated_gradient, variable), where the gradient has been
-      summed across all towers and the variable is chosen from the first tower.
-  """
-  # This only works for DGX-1 type of machine topology
-  # Device peer to peer matrix
-  # DMA: 0 1 2 3 4 5 6 7
-  # 0:   Y Y Y Y Y N N N
-  # 1:   Y Y Y Y N Y N N
-  # 2:   Y Y Y Y N N Y N
-  # 3:   Y Y Y Y N N N Y
-  # 4:   Y N N N Y Y Y Y
-  # 5:   N Y N N Y Y Y Y
-  # 6:   N N Y N Y Y Y Y
-  # 7:   N N N Y Y Y Y Y
-  agg_grads = []
-  num_devices = len(avail_devices)
-  # In the special case of DGX-1 machine topology, the two groups have equal
-  # size.
-  group_size = num_devices // 2
-  for i, single_grads in enumerate(zip(*tower_grads)):
-    group_0_main_device = i % num_devices
-    group_1_main_device = (group_0_main_device + group_size) % num_devices
-    if group_0_main_device < group_size:
-      group_0_begin = 0
-      group_1_begin = group_size
-    else:
-      group_0_begin = group_size
-      group_1_begin = 0
-
-    # Aggregate the first group.
-    group_0_device_grads = single_grads[group_0_begin:
-                                        group_0_begin + group_size]
-    with ops.device(avail_devices[group_0_main_device]):
-      group_0_agg_grads, _ = aggregate_single_gradient_using_copy(
-          group_0_device_grads, False, False)
-
-    # Aggregate the second group.
-    group_1_device_grads = single_grads[group_1_begin:
-                                        group_1_begin + group_size]
-    with ops.device(avail_devices[group_1_main_device]):
-      group_1_agg_grads, _ = aggregate_single_gradient_using_copy(
-          group_1_device_grads, False, False)
-
-    # Aggregate between the groups.
-    with ops.device(avail_devices[group_0_main_device]):
-      (agg_total_grads, _), _ = aggregate_single_gradient_using_copy(
-          [group_0_agg_grads, group_1_agg_grads], False, False)
-
-    # Broadcast the result back into the root of each group.
-    with ops.device(avail_devices[group_0_main_device]):
-      group_0_agg_grads_bcast = array_ops.identity(agg_total_grads)
-    with ops.device(avail_devices[group_1_main_device]):
-      group_1_agg_grads_bcast = array_ops.identity(agg_total_grads)
-
-    agg_grads_bcast = []
-    for j in range(len(single_grads)):
-      with ops.device(avail_devices[j]):
-        # Broadcast the result back to each member in the group from the root.
-        if (group_0_main_device < group_size) == (j < group_size):
-          src_device_grad = group_0_agg_grads_bcast
-        else:
-          src_device_grad = group_1_agg_grads_bcast
-        agg_grads_bcast.append(array_ops.identity(src_device_grad))
-
-    agg_grads.append(
-        [(g, v) for g, (_, v) in zip(agg_grads_bcast, single_grads)])
-
-  agg_grads = list(zip(*agg_grads))
-
-  return agg_grads
-
-
-def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
-                                         check_inf_nan):
-  """Calculate the average gradient for a shared variable across all towers.
-
-  Note that this function provides a synchronization point across all towers.
-
-  Args:
-    grad_and_vars: A list or tuple of (gradient, variable) tuples. Each
-      (gradient, variable) pair within the outer list represents the gradient
-      of the variable calculated for a single tower, and the number of pairs
-      equals the number of towers.
-    use_mean: if True, mean is taken, else sum of gradients is taken.
-    check_inf_nan: check grads for nans and infs.
-
-  Returns:
-    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
-      gradient has been averaged across all towers. The variable is chosen from
-      the first tower. The has_nan_or_inf indicates the grads has nan or inf.
-  """
-  grads = [g for g, _ in grad_and_vars]
-  grad = math_ops.add_n(grads)
-
-  if use_mean and len(grads) > 1:
-    grad = array_ops.multiply(grad, 1.0 / len(grads))
-
-  v = grad_and_vars[0][1]
-  if check_inf_nan:
-    has_nan_or_inf = array_ops.logical_not(
-        array_ops.reduce_all(array_ops.is_finite(grads)))
-    return (grad, v), has_nan_or_inf
-  else:
-    return (grad, v), None
-
-
-def group_device_names(devices, group_size):
-  """Group device names into groups of group_size.
-
-  Args:
-    devices: a list of canonical device strings.
-    group_size: integer which is equal to or greater than 1.
-
-  Returns:
-    list of lists of devices, where each inner list is group_size long,
-      and each device appears at least once in an inner list.  If
-      len(devices) % group_size == 0 then each device will appear exactly once.
-
-  Raises:
-    ValueError: if group_size > len(devices)
-  """
-  num_devices = len(devices)
-  if group_size > num_devices:
-    raise ValueError(
-        'only %d devices, but group_size=%d' % (num_devices, group_size))
-  num_groups = (
-      num_devices // group_size + (1 if (num_devices % group_size != 0) else 0))
-  groups = [[] for i in range(num_groups)]
-  for i in range(num_groups * group_size):
-    groups[i % num_groups].append(devices[i % num_devices])
-  return groups
-
-
-def split_grads_by_size(threshold_size, device_grads):
-  """Break gradients into two sets according to tensor size.
-
-  Args:
-    threshold_size: int size cutoff for small vs large tensor.
-    device_grads: List of lists of (gradient, variable) tuples.  The outer
-        list is over devices. The inner list is over individual gradients.
-
-  Returns:
-    small_grads: Subset of device_grads where shape is <= threshold_size
-       elements.
-    large_grads: Subset of device_grads where shape is > threshold_size
-       elements.
-  """
-  small_grads = []
-  large_grads = []
-  for dl in device_grads:
-    small_dl = []
-    large_dl = []
-    for (g, v) in dl:
-      tensor_size = g.get_shape().num_elements()
-      if tensor_size <= threshold_size:
-        small_dl.append([g, v])
-      else:
-        large_dl.append([g, v])
-    if small_dl:
-      small_grads.append(small_dl)
-    if large_dl:
-      large_grads.append(large_dl)
-  return small_grads, large_grads
-
-
-# threading.Lock() and threading.local() cannot be pickled and therefore cannot
-# be a field of CollectiveKeys. Right now _thread_local is not necessary to be
-# an instance member of CollectiveKeys since we always create a new thread for
-# each tower.
-_lock = threading.Lock()
-_thread_local = threading.local()
-
-
-# TODO(yuefengz): use random key starts to avoid reusing keys?
-class CollectiveKeys(object):
-  """Class that manages collective keys.
-
-  We need to manage three different keys for collective:
-
-  *Group key*: an integer key to identify the set of cooperative devices.
-  Collective ops work under the same set of devices must using the same group
-  key.
-
-  *Instance key*: an integer key to identify the set of same counterpart of
-  tensors on different devices in a device group that need to be all-reduced.
-
-  "Graph key": an integer key that is unique key graph. This is used to support
-  multiple graphs per client session. It must be non-zero and set in the
-  `config` argument of each call to `session.run`.
-  """
-
-  def __init__(self,
-               group_key_start=1,
-               instance_key_start=100,
-               instance_key_with_id_start=10000):
-    """Initializes the object.
-
-    Args:
-      group_key_start: the starting integer of group key.
-      instance_key_start: the starting integer of instance key.
-      instance_key_with_id_start: the starting integer of instance key that is
-        recorded with an id.
-    """
-    self._group_key = group_key_start
-    self._group_key_table = dict()
-
-    # For instance keys with ids
-    self._instance_key_id_to_key_table = dict()
-    self._instance_key_with_id_counter = instance_key_with_id_start
-
-    # For instance keys without ids
-    self._instance_key_start = instance_key_start
-
-  def _get_thread_local_object(self):
-    # We make instance key without key ids thread local so that it will work
-    # with MirroredStrategy and distribute coordinator.
-    if not hasattr(_thread_local, 'instance_key'):
-      _thread_local.instance_key = self._instance_key_start
-    return _thread_local
-
-  def get_group_key(self, devices):
-    """Returns a group key for the set of devices.
-
-    Args:
-      devices: list of strings naming devices in a collective group.
-
-    Returns:
-      int key uniquely identifying the set of device names.
-    """
-    parsed = [pydev.DeviceSpec.from_string(d) for d in devices]
-    # In the between-graph replicated training, different workers need to get
-    # the same device key. So we remove the task_type and task_id from the
-    # devices.
-    # TODO(yuefengz): in the in-graph replicated training, we need to include
-    # task_type and task_id.
-    names = sorted(['%s:%d' % (d.device_type, d.device_index) for d in parsed])
-    key_id = ','.join(names)
-    with _lock:
-      if key_id not in self._group_key_table:
-        new_key = self._group_key
-        self._group_key += 1
-        self._group_key_table[key_id] = new_key
-    return self._group_key_table[key_id]
-
-  def get_instance_key(self, key_id=None):
-    """Returns a new instance key for use in defining a collective op.
-
-    Args:
-      key_id: optional string. If set, key will be recorded and the same key
-        will be returned when the same key_id is provided. If not, an increasing
-        instance key will be returned.
-    """
-    if key_id:
-      with _lock:
-        if key_id not in self._instance_key_id_to_key_table:
-          self._instance_key_with_id_counter += 1
-          self._instance_key_id_to_key_table[key_id] = (
-              self._instance_key_with_id_counter)
-      return self._instance_key_id_to_key_table[key_id]
-    else:
-      v = self._get_thread_local_object().instance_key
-      self._get_thread_local_object().instance_key += 1
-      return v
-
-
-def build_collective_reduce(input_tensors,
-                            num_workers,
-                            collective_keys,
-                            reduction_op='Add',
-                            unary_op='Id'):
-  """Build a subgraph that does one full all-reduce, using the collective Op.
-
-  Args:
-    input_tensors: tensors within a single worker graph that are to be reduced
-      together; must be one per device.
-    num_workers: total number of workers with identical independent graphs that
-      will be doing this same reduction.  The reduction will actually include
-      the corresponding tensors at all these workers.
-    collective_keys: a CollectiveKeys object.
-    reduction_op: string naming the reduction op.
-    unary_op: string naming the unary final op.
-
-  Returns:
-    An array of final tensors, one per device, computed by the full reduction.
-
-  Raises:
-    ValueError: There must be at least two tensors over all the workers.
-  """
-  group_size = len(input_tensors) * num_workers
-  if group_size < 2:
-    raise ValueError('num_workers * len(input_tensors) must be 2 or greater')
-  devices = [t.device for t in input_tensors]
-  num_devices = len(devices)
-  group_key = collective_keys.get_group_key(devices)
-  instance_key = collective_keys.get_instance_key()
-  out_tensors = []
-  subdiv_offsets = [0]  # TODO(tucker): maybe support non-default subdiv spec
-  for d in range(num_devices):
-    with ops.device(devices[d]):
-      reduce_op = collective_ops.all_reduce(
-          input_tensors[d], group_size, group_key, instance_key, reduction_op,
-          unary_op, subdiv_offsets)
-      out_tensors.append(reduce_op)
-  return out_tensors
-
-
-def sum_grad_and_var_all_reduce(grad_and_vars,
-                                num_workers,
-                                alg,
-                                gpu_indices,
-                                aux_devices=None,
-                                num_shards=1):
-  """Apply all-reduce algorithm over specified gradient tensors."""
-  with ops.name_scope('allreduce'):
-    # Note that each grad_and_vars looks like the following:
-    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
-    scaled_grads = [g for g, _ in grad_and_vars]
-    if alg == 'nccl':
-      summed_grads = nccl.all_sum(scaled_grads)
-    elif alg == 'xring':
-      summed_grads = all_reduce.build_ring_all_reduce(
-          scaled_grads, num_workers, num_shards, gpu_indices, math_ops.add)
-    elif alg == 'nccl/xring':
-      summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards,
-                                                     math_ops.add)
-    elif alg == 'nccl/rechd':
-      summed_grads = all_reduce.build_nccl_then_recursive_hd(
-          scaled_grads, math_ops.add)
-    elif alg == 'nccl/pscpu':
-      summed_grads = all_reduce.build_nccl_then_shuffle(
-          scaled_grads, aux_devices, math_ops.add, math_ops.add_n)
-    elif alg == 'pscpu/pscpu':
-      second_gather_devices = aux_devices[:num_shards]
-      summed_grads = all_reduce.build_shuffle_then_shuffle(
-          scaled_grads, aux_devices, second_gather_devices, math_ops.add_n)
-    elif alg in ['pscpu', 'psgpu']:
-      summed_grads = all_reduce.build_shuffle_all_reduce(
-          scaled_grads, aux_devices, math_ops.add_n)
-    else:
-      raise ValueError('unsupported all_reduce alg: ', alg)
-
-  result = []
-  for (_, v), g in zip(grad_and_vars, summed_grads):
-    result.append([g, v])
-  return result
-
-
-def sum_gradients_all_reduce(dev_prefixes, tower_grads, num_workers, alg,
-                             num_shards, gpu_indices):
-  """Apply all-reduce algorithm over specified gradient tensors.
-
-  Args:
-    dev_prefixes: list of prefix strings to use to generate PS device names.
-    tower_grads: the gradients to reduce.
-    num_workers: number of worker processes across entire job.
-    alg: the all-reduce algorithm to apply.
-    num_shards: alg-specific sharding factor.
-    gpu_indices: indices of local GPUs in order usable for ring-reduce.
-
-  Returns:
-    list of reduced tensors
-  """
-  alg_contains_shuffle = any([n in alg for n in ['pscpu', 'psgpu']])
-  is_hierarchical = '/' in alg
-  if 'pscpu' in alg:
-    aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes]
-  elif 'psgpu' in alg:
-    aux_devices = [
-        prefix + '/gpu:%d' % i
-        for i in range(len(gpu_indices))
-        for prefix in dev_prefixes
-    ]
-  else:
-    aux_devices = ['/job:localhost/cpu:0']
-  # Auxiliary devices for hierarchical all-reduces.
-  aux_device_groups = group_device_names(
-      aux_devices, num_shards if alg_contains_shuffle else 1)
-  group_index = 0
-  reduced_gv_list = []
-  for grad_and_vars in zip(*tower_grads):
-    reduced_gv_list.append(
-        sum_grad_and_var_all_reduce(
-            grad_and_vars, num_workers, alg, gpu_indices, aux_devices
-            if is_hierarchical else aux_device_groups[group_index], num_shards))
-    group_index = (group_index + 1) % len(aux_device_groups)
-  new_tower_grads = [list(x) for x in zip(*reduced_gv_list)]
-  return new_tower_grads
-
-
-def extract_ranges(index_list, range_size_limit=32):
-  """Extract consecutive ranges and singles from index_list.
-
-  Args:
-    index_list: List of monotone increasing non-negative integers.
-    range_size_limit: Largest size range to return.  If a larger
-      consecutive range exists, it will be returned as multiple
-      ranges.
-
-  Returns:
-    (ranges, singles) where ranges is a list of [first, last] pairs of
-      consecutive elements in index_list, and singles is all of the
-      other elements, in original order.
-  """
-  if not index_list:
-    return [], []
-  first = index_list[0]
-  last = first
-  ranges = []
-  singles = []
-  for i in index_list[1:]:
-    if i == last + 1 and (last - first) <= range_size_limit:
-      last = i
-    else:
-      if last > first:
-        ranges.append([first, last])
-      else:
-        singles.append(first)
-      first = i
-      last = i
-  if last > first:
-    ranges.append([first, last])
-  else:
-    singles.append(first)
-  return ranges, singles
-
-
-GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
-
-
-def pack_range(key, packing, grad_vars, rng):
-  """Form the concatenation of a specified range of gradient tensors.
-
-  Args:
-    key: Value under which to store meta-data in packing that will be used
-      later to restore the grad_var list structure.
-    packing: Dict holding data describing packed ranges of small tensors.
-    grad_vars: List of (grad, var) pairs for one tower.
-    rng: A pair of integers giving the first, last indices of a consecutive
-      range of tensors to be packed.
-
-  Returns:
-    A tensor that is the concatenation of all the specified small tensors.
-  """
-  to_pack = grad_vars[rng[0]:rng[1] + 1]
-  members = []
-  variables = []
-  restore_shapes = []
-  with ops.name_scope('pack'):
-    for g, v in to_pack:
-      variables.append(v)
-      restore_shapes.append(g.shape)
-      with ops.device(g.device):
-        members.append(array_ops.reshape(g, [-1]))
-    packing[key] = GradPackTuple(
-        indices=range(rng[0], rng[1] + 1),
-        vars=variables,
-        shapes=restore_shapes)
-    with ops.device(members[0].device):
-      return array_ops.concat(members, 0)
-
-
-def unpack_grad_tuple(gv, gpt):
-  """Unpack a previously packed collection of gradient tensors.
-
-  Args:
-    gv: A (grad, var) pair to be unpacked.
-    gpt: A GradPackTuple describing the packing operation that produced gv.
-
-  Returns:
-    A list of (grad, var) pairs corresponding to the values that were
-     originally packed into gv, maybe following subsequent operations like
-     reduction.
-  """
-  elt_widths = [x.num_elements() for x in gpt.shapes]
-  with ops.device(gv[0][0].device):
-    with ops.name_scope('unpack'):
-      splits = array_ops.split(gv[0], elt_widths)
-      unpacked_gv = []
-      for idx, s in enumerate(splits):
-        unpacked_gv.append((array_ops.reshape(s, gpt.shapes[idx]),
-                            gpt.vars[idx]))
-  return unpacked_gv
-
-
-def pack_small_tensors(tower_grads, max_bytes=0, max_group=0):
-  """Concatenate small gradient tensors together for reduction.
-
-  Args:
-    tower_grads: List of lists of (gradient, variable) tuples.
-    max_bytes: Int giving max number of bytes in a tensor that
-      may be considered small.
-    max_group: Int giving max number of small tensors that may be
-      concatenated into one new tensor.
-
-  Returns:
-    new_tower_grads, packing where new_tower_grads is identical to
-      tower_grads except that all feasible small_tensors have been removed
-      from their places and concatenated into larger tensors that are
-      now in the front of the list for each tower, and packing contains
-      the data necessary to restore the tower_grads structure.
-
-  Look through the first tower for gradients of the same type (float),
-  and small size, that are all sequential.  For each such group,
-  replace by a new tensor that is a flattened concatenation.  Note
-  that the corresponding variable will be absent, which doesn't matter
-  because it isn't used during all-reduce.
-
-  Requires:
-    Every gv_list in towers must have isomorphic structure including identical
-      tensor sizes and types.
-  """
-  small_indices = []
-  large_indices = []
-  for idx, (g, _) in enumerate(tower_grads[0]):
-    if g.dtype == dtypes.float32 and (4 * g.shape.num_elements()) <= max_bytes:
-      small_indices.append(idx)
-    else:
-      large_indices.append(idx)
-  small_ranges, small_singles = extract_ranges(
-      small_indices, range_size_limit=max_group)
-  large_indices = sorted(large_indices + small_singles)
-  num_gv = len(tower_grads[0])
-  packing = {}
-  if small_ranges:
-    new_tower_grads = []
-    for dev_idx, gv_list in enumerate(tower_grads):
-      assert len(gv_list) == num_gv
-      new_gv_list = []
-      for r in small_ranges:
-        key = '%d:%d' % (dev_idx, len(new_gv_list))
-        new_gv_list.append((pack_range(key, packing, gv_list, r),
-                            'packing_var_placeholder'))
-      for i in large_indices:
-        new_gv_list.append(gv_list[i])
-      new_tower_grads.append(new_gv_list)
-    return new_tower_grads, packing
-  else:
-    return tower_grads, None
-
-
-def unpack_small_tensors(tower_grads, packing):
-  """Undo the structure alterations to tower_grads done by pack_small_tensors.
-
-  Args:
-    tower_grads: List of List of (grad, var) tuples.
-    packing: A dict generated by pack_small_tensors describing the changes
-      it made to tower_grads.
-
-  Returns:
-    new_tower_grads: identical to tower_grads except that concatenations
-      of small tensors have been split apart and returned to their original
-      positions, paired with their original variables.
-  """
-  if not packing:
-    return tower_grads
-  new_tower_grads = []
-  num_devices = len(tower_grads)
-  num_packed = len(packing.keys()) // num_devices
-  for dev_idx, gv_list in enumerate(tower_grads):
-    gv_list = list(gv_list)
-    new_gv_list = gv_list[num_packed:]
-    for i in range(num_packed):
-      k = '%d:%d' % (dev_idx, i)
-      gpt = packing[k]
-      gv = unpack_grad_tuple(gv_list[i], gpt)
-      for gi, idx in enumerate(gpt.indices):
-        assert idx == gpt.indices[gi]
-        new_gv_list.insert(idx, gv[gi])
-    new_tower_grads.append(new_gv_list)
-  return new_tower_grads
-
-
-def aggregate_tensors_or_indexed_slices(values, accumulation_fn=math_ops.add_n):
-  """Aggregate tensors using `accumulation_fn` and IndexedSlices via concat."""
-  if any(isinstance(v, ops.IndexedSlices) for v in values):
-    return gradients_impl._AggregateIndexedSlicesGradients(values)  # pylint: disable=protected-access
-  else:
-    return accumulation_fn(values)
-
-
-def divide_by_n_tensors_or_indexed_slices(value, n):
-  if isinstance(value, ops.IndexedSlices):
-    value = gradients_impl._HandleNestedIndexedSlices(value)  # pylint: disable=protected-access
-    return ops.IndexedSlices(
-        value.values / n, value.indices, value.dense_shape)
-  else:
-    return value / n
-
-
-def copy_tensor_or_indexed_slices_to_device(value, device):
-  with ops.device(device):
-    if isinstance(value, ops.IndexedSlices):
-      copied_values = array_ops.identity(value.values)
-      copied_indices = array_ops.identity(value.indices)
-      copied_shape = array_ops.identity(value.dense_shape)
-      result = ops.IndexedSlices(copied_values, copied_indices, copied_shape)
-    else:
-      result = array_ops.identity(value)
-  return result
-
-
-def contains_indexed_slices(value):
-  """Check whether the value is `IndexedSlices` or contains `IndexedSlices`."""
-  if isinstance(value, ops.IndexedSlices):
-    return True
-  elif isinstance(value, (list, tuple)) and value:
-    return any(contains_indexed_slices(v) for v in value)
-  elif isinstance(value, value_lib.DistributedValues):
-    return contains_indexed_slices(list(value._index.values()))  # pylint: disable=protected-access
-  elif isinstance(value, value_lib.MapOutput):
-    return contains_indexed_slices(value.get())
-  else:
-    return False
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py b/tensorflow/contrib/distribute/python/cross_tower_utils_test.py
deleted file mode 100644
index d25964fa41adc7b1c9164a4ffe49c4c5532f76ac..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for cross_tower_utils."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-
-from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import cross_tower_utils
-from tensorflow.contrib.distribute.python import values as value_lib
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import math_ops
-from tensorflow.python.training import device_util
-
-
-class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
-
-  def _assert_values_equal(self, left, right):
-    self.assertAllEqual(
-        self.evaluate(ops.convert_to_tensor(left)),
-        self.evaluate(ops.convert_to_tensor(right)))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testAggregateTensors(self):
-    t0 = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
-    t1 = constant_op.constant([[0., 0.], [5, 6], [7., 8.]])
-    total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
-    result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1])
-    self._assert_values_equal(total, result)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testAggregateIndexedSlices(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
-    result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1])
-    self.assertIsInstance(result, ops.IndexedSlices)
-    self._assert_values_equal(total, result)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDivideTensor(self):
-    t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
-    n = 2
-    expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]])
-    result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n)
-    self._assert_values_equal(expected, result)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDivideIndexedSlices(self):
-    t = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    n = 2
-    expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]])
-    result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n)
-    self.assertIsInstance(result, ops.IndexedSlices)
-    self._assert_values_equal(expected, result)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testIsIndexedSlices(self):
-    t = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    self.assertTrue(cross_tower_utils.contains_indexed_slices(t))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_List(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    self.assertTrue(cross_tower_utils.contains_indexed_slices([t0, t1]))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_Tuple(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    self.assertTrue(cross_tower_utils.contains_indexed_slices((t0, t1)))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_PerDevice(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    per_device = value_lib.PerDevice({"/gpu:0": t0, "/cpu:0": t1})
-    self.assertTrue(cross_tower_utils.contains_indexed_slices(per_device))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_PerDeviceMapOutput(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    per_device = value_lib.PerDevice({
-        "/gpu:0": value_lib.MapOutput([t0]),
-        "/cpu:0": value_lib.MapOutput([t1])})
-    self.assertTrue(cross_tower_utils.contains_indexed_slices(per_device))
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      required_gpus=1))
-  def testCopyTensor(self):
-    with ops.device("/cpu:0"):
-      t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
-    destination = "/gpu:0"
-    result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device(
-        t, destination)
-
-    self._assert_values_equal(t, result)
-    self.assertEqual(device_util.resolve(destination),
-                     device_util.resolve(result.device))
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      required_gpus=1))
-  def testCopyIndexedSlices(self):
-    with ops.device("/cpu:0"):
-      t = math_ops._as_indexed_slices(
-          constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    destination = "/gpu:0"
-    result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device(
-        t, destination)
-
-    self.assertIsInstance(result, ops.IndexedSlices)
-    self._assert_values_equal(t, result)
-    self.assertEqual(device_util.resolve(destination),
-                     device_util.resolve(result.device))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index cc626c33bf8e282736f8e6e0c151e5a3d3f3244b..e17085628ba6d1dfc79839fd824801723f07a518 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -34,7 +34,7 @@ from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.summary.writer import writer_cache
@@ -63,7 +63,9 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
           distribution=[
               combinations.one_device_strategy,
               combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus
+              combinations.mirrored_strategy_with_two_gpus,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus
           ],
           use_train_and_evaluate=[True, False]))
   def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
@@ -75,12 +77,12 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
     train_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices),
+        batch_size=batch_size // distribution.num_replicas_in_sync,
         shuffle=True)
     eval_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices),
+        batch_size=batch_size // distribution.num_replicas_in_sync,
         shuffle=False)
     predict_input_fn = numpy_io.numpy_input_fn(
         x={'x': data}, batch_size=batch_size, shuffle=False)
@@ -126,8 +128,8 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
     feature_spec = feature_column.make_parse_example_spec(feature_columns)
     serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
         feature_spec)
-    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
-                                             serving_input_receiver_fn)
+    export_dir = estimator.export_saved_model(tempfile.mkdtemp(),
+                                              serving_input_receiver_fn)
     self.assertTrue(gfile.Exists(export_dir))
 
   def tearDown(self):
diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py
index 157618f72ff2ea6dde171e7edb62ccaf7e1de516..b369a7fefe6f35cf5a9b64451419cf4f72a99471 100644
--- a/tensorflow/contrib/distribute/python/estimator_training_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_training_test.py
@@ -18,15 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
 import glob
 import json
 import os
 import sys
 import tempfile
-import threading
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.contrib.distribute.python import collective_all_reduce_strategy
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
@@ -43,11 +44,13 @@ from tensorflow.python.estimator import training as estimator_training
 from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export as export_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary_iterator
 from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import session_manager
+
 
 BATCH_SIZE = 10
 LABEL_DIMENSION = 2
@@ -66,57 +69,19 @@ PS = dc._TaskType.PS
 original_run_std_server = dc._run_std_server
 
 
-class MockOsEnv(dict):
-
-  def __init__(self, *args):
-    self._thread_local = threading.local()
-    super(MockOsEnv, self).__init__(*args)
-
-  def get(self, key, default):
-    if not hasattr(self._thread_local, "dict"):
-      self._thread_local.dict = dict()
-    if key == "TF_CONFIG":
-      return dict.get(self._thread_local.dict, key, default)
-    else:
-      return dict.get(self, key, default)
-
-  def __getitem__(self, key):
-    if not hasattr(self._thread_local, "dict"):
-      self._thread_local.dict = dict()
-    if key == "TF_CONFIG":
-      return dict.__getitem__(self._thread_local.dict, key)
-    else:
-      return dict.__getitem__(self, key)
-
-  def __setitem__(self, key, val):
-    if not hasattr(self._thread_local, "dict"):
-      self._thread_local.dict = dict()
-    if key == "TF_CONFIG":
-      return dict.__setitem__(self._thread_local.dict, key, val)
-    else:
-      return dict.__setitem__(self, key, val)
-
-
-class DistributeCoordinatorIntegrationTest(test.TestCase,
-                                           parameterized.TestCase):
+class DistributeCoordinatorIntegrationTest(
+    multi_worker_test_base.IndependentWorkerTestBase, parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
     """Create a local cluster with 2 workers."""
+    super(DistributeCoordinatorIntegrationTest, cls).setUpClass()
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=2, has_eval=True)
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
-    self._mock_os_env = MockOsEnv()
-    self._mock_context = test.mock.patch.object(os, "environ",
-                                                self._mock_os_env)
     super(DistributeCoordinatorIntegrationTest, self).setUp()
-    self._mock_context.__enter__()
-
-  def tearDown(self):
-    self._mock_context.__exit__(None, None, None)
-    super(DistributeCoordinatorIntegrationTest, self).tearDown()
 
   def dataset_input_fn(self, x, y, batch_size, shuffle):
 
@@ -139,6 +104,8 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
   def _extract_loss_and_global_step(self, event_folder):
     """Returns the loss and global step in last event."""
     event_paths = glob.glob(os.path.join(event_folder, "events*"))
+    self.assertNotEmpty(
+        event_paths, msg="Event file not found in dir %s" % event_folder)
 
     loss = None
     global_step_count = None
@@ -189,7 +156,8 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
   def _complete_flow(self,
                      train_distribute,
                      eval_distribute,
-                     remote_cluster=None):
+                     remote_cluster=None,
+                     use_train_and_evaluate=True):
     estimator = self._get_estimator(train_distribute, eval_distribute,
                                     remote_cluster)
 
@@ -197,10 +165,10 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     train_input_fn = self.dataset_input_fn(
         x={"x": DATA},
         y=DATA,
-        batch_size=BATCH_SIZE // len(train_distribute.worker_devices),
+        batch_size=BATCH_SIZE // train_distribute.num_replicas_in_sync,
         shuffle=True)
     if eval_distribute:
-      eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices)
+      eval_batch_size = BATCH_SIZE // eval_distribute.num_replicas_in_sync
     else:
       eval_batch_size = BATCH_SIZE
     eval_input_fn = self.dataset_input_fn(
@@ -214,16 +182,37 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     ]
     feature_columns = linear_feature_columns + dnn_feature_columns
 
-    estimator_training.train_and_evaluate(
-        estimator,
-        estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS),
-        estimator_training.EvalSpec(
-            name=EVAL_NAME,
-            input_fn=eval_input_fn,
-            steps=None,
-            exporters=self._get_exporter(EXPORTER_NAME, feature_columns),
-            start_delay_secs=0,
-            throttle_secs=1))
+    eval_spec = estimator_training.EvalSpec(
+        name=EVAL_NAME,
+        input_fn=eval_input_fn,
+        steps=None,
+        exporters=self._get_exporter(EXPORTER_NAME, feature_columns),
+        start_delay_secs=0,
+        throttle_secs=1)
+
+    if use_train_and_evaluate:
+      estimator_training.train_and_evaluate(
+          estimator,
+          estimator_training.TrainSpec(train_input_fn, max_steps=MAX_STEPS),
+          eval_spec)
+    else:
+      estimator.train(train_input_fn, max_steps=MAX_STEPS)
+
+      latest_ckpt_path = estimator.latest_checkpoint()
+      metrics = estimator.evaluate(eval_input_fn,
+                                   checkpoint_path=latest_ckpt_path,
+                                   name=EVAL_NAME)
+
+      # Export the eval result to files.
+      eval_result = estimator_training._EvalResult(
+          status=estimator_training._EvalStatus.EVALUATED,
+          metrics=metrics,
+          checkpoint_path=latest_ckpt_path)
+      evaluator = estimator_training._TrainingExecutor._Evaluator(estimator,
+                                                                  eval_spec,
+                                                                  None)
+      evaluator._export_eval_result(eval_result, True)
+
     return estimator
 
   def _inspect_train_and_eval_events(self, estimator):
@@ -259,32 +248,74 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     ])
     self.assertAllEqual((BATCH_SIZE, LABEL_DIMENSION), predicted_proba.shape)
 
+  def _get_strategy_object(self, strategy_cls):
+    if strategy_cls == mirrored_strategy.CoreMirroredStrategy:
+      return strategy_cls(mirrored_strategy.all_local_devices())
+    else:
+      return strategy_cls(num_gpus_per_worker=context.num_gpus())
+
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
           train_distribute_cls=[
+              collective_all_reduce_strategy.CollectiveAllReduceStrategy,
               mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
               parameter_server_strategy.ParameterServerStrategy
           ],
           eval_distribute_cls=[
-              None, mirrored_strategy.MirroredStrategy,
-              parameter_server_strategy.ParameterServerStrategy
+              None,
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
+              parameter_server_strategy.ParameterServerStrategy,
           ],
-          required_gpus=1))
+          required_gpus=[0, 1]))
   def test_complete_flow_standalone_client(self, train_distribute_cls,
                                            eval_distribute_cls):
-    try:
-      train_distribute = train_distribute_cls(num_gpus=context.num_gpus())
-    except TypeError:
-      train_distribute = train_distribute_cls(num_gpus_per_worker=2)
+    train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls()
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
+    cluster_spec = copy.deepcopy(self._cluster_spec)
+    if (train_distribute_cls !=
+        parameter_server_strategy.ParameterServerStrategy):
+      cluster_spec.pop("ps", None)
+    estimator = self._complete_flow(train_distribute, eval_distribute,
+                                    cluster_spec)
+    self._inspect_train_and_eval_events(estimator)
+
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"],
+          train_distribute_cls=[
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
+          ],
+          eval_distribute_cls=[
+              None,
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
+          ],
+          required_gpus=[0, 1]))
+  def test_estimator_standalone_client(self, train_distribute_cls,
+                                       eval_distribute_cls):
+    train_distribute = self._get_strategy_object(train_distribute_cls)
+
+    if eval_distribute_cls:
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
+    else:
+      eval_distribute = None
+
+    # We use the whole cluster for evaluation.
+    cluster = copy.deepcopy(self._cluster_spec)
+    cluster.pop("evaluator", None)
+
     estimator = self._complete_flow(
-        train_distribute, eval_distribute, remote_cluster=self._cluster_spec)
+        train_distribute, eval_distribute, remote_cluster=cluster,
+        use_train_and_evaluate=False)
     self._inspect_train_and_eval_events(estimator)
 
   def _mock_run_std_server(self, *args, **kwargs):
@@ -294,75 +325,56 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     self._barrier.wait()
     return ret
 
-  def _task_thread(self, train_distribute, eval_distribute, tf_config):
-    os.environ["TF_CONFIG"] = json.dumps(tf_config)
+  def _independent_worker_fn(
+      self,
+      train_distribute,
+      eval_distribute,
+  ):
     with test.mock.patch.object(dc, "_run_std_server",
                                 self._mock_run_std_server):
       self._complete_flow(train_distribute, eval_distribute)
 
-  def _run_task_in_thread(self, cluster_spec, task_type, task_id,
-                          train_distribute, eval_distribute):
-    if task_type:
-      tf_config = {
-          "cluster": cluster_spec,
-          "task": {
-              "type": task_type,
-              "index": task_id
-          }
-      }
-    else:
-      tf_config = {
-          "cluster": cluster_spec,
-          "task": {
-              "type": task_type,
-              "index": task_id
-          }
-      }
-    t = threading.Thread(
-        target=self._task_thread,
-        args=(train_distribute, eval_distribute, tf_config))
-    t.start()
-    return t
-
-  def _run_multiple_tasks_in_threads(self, cluster_spec, train_distribute,
-                                     eval_distribute):
-    threads = {}
-    for task_type in cluster_spec.keys():
-      threads[task_type] = []
-      for task_id in range(len(cluster_spec[task_type])):
-        t = self._run_task_in_thread(cluster_spec, task_type, task_id,
-                                     train_distribute, eval_distribute)
-        threads[task_type].append(t)
-    return threads
-
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
           train_distribute_cls=[
+              collective_all_reduce_strategy.CollectiveAllReduceStrategy,
               parameter_server_strategy.ParameterServerStrategy,
           ],
           eval_distribute_cls=[
               None, mirrored_strategy.MirroredStrategy,
-              parameter_server_strategy.ParameterServerStrategy
+              mirrored_strategy.CoreMirroredStrategy,
+              parameter_server_strategy.ParameterServerStrategy,
           ],
-          required_gpus=1))
+          required_gpus=[0, 1]))
   def test_complete_flow_indepedent_worker_between_graph(
       self, train_distribute_cls, eval_distribute_cls):
-    train_distribute = train_distribute_cls(
-        num_gpus_per_worker=context.num_gpus())
+    if (context.num_gpus() < 2 and eval_distribute_cls ==
+        collective_all_reduce_strategy.CollectiveAllReduceStrategy):
+      self.skipTest("`CollectiveAllReduceStrategy` needs at least two towers.")
+
+    train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls()
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
-    cluster_spec = multi_worker_test_base.create_cluster_spec(
-        num_workers=3, num_ps=2, has_eval=True)
-    # 3 workers, 2 ps and 1 evaluator.
-    self._barrier = dc._Barrier(6)
-
-    threads = self._run_multiple_tasks_in_threads(
-        cluster_spec, train_distribute, eval_distribute)
+    if (train_distribute_cls == parameter_server_strategy
+        .ParameterServerStrategy):
+      cluster_spec = multi_worker_test_base.create_cluster_spec(
+          num_workers=3, num_ps=2, has_eval=True)
+      # 3 workers, 2 ps and 1 evaluator.
+      self._barrier = dc._Barrier(6)
+    else:
+      cluster_spec = multi_worker_test_base.create_cluster_spec(
+          num_workers=3, num_ps=0, has_eval=True)
+      # 3 workers and 1 evaluator.
+      self._barrier = dc._Barrier(4)
+
+    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
+                                                 cluster_spec, train_distribute,
+                                                 eval_distribute)
     for task_type, ts in threads.items():
       if task_type == PS:
         continue
@@ -375,15 +387,22 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
-          train_distribute_cls=[mirrored_strategy.MirroredStrategy],
-          eval_distribute_cls=[None, mirrored_strategy.MirroredStrategy],
-          required_gpus=1))
+          train_distribute_cls=[
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy
+          ],
+          eval_distribute_cls=[
+              None,
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy
+          ],
+          required_gpus=[0, 1]))
   def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls,
                                                     eval_distribute_cls):
-    train_distribute = train_distribute_cls(num_gpus=context.num_gpus())
+    train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls()
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
@@ -391,8 +410,9 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
         num_workers=3, num_ps=0, has_eval=True)
     # 3 workers and 1 evaluator.
     self._barrier = dc._Barrier(4)
-    threads = self._run_multiple_tasks_in_threads(
-        cluster_spec, train_distribute, eval_distribute)
+    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
+                                                 cluster_spec, train_distribute,
+                                                 eval_distribute)
     threads[WORKER][0].join()
     threads[EVALUATOR][0].join()
 
@@ -430,7 +450,8 @@ class RunConfigTest(test.TestCase):
         "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}):
       run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"])))
 
   def test_should_run_distribute_coordinator(self):
     """Tests that should_run_distribute_coordinator return a correct value."""
@@ -453,10 +474,12 @@ class RunConfigTest(test.TestCase):
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
       config_with_train_distribute = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"])))
       config_with_eval_distribute = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              eval_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              eval_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"])))
     self.assertTrue(
         dc_training.should_run_distribute_coordinator(
             config_with_train_distribute))
@@ -469,26 +492,27 @@ class RunConfigTest(test.TestCase):
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
       config = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"])))
     self.assertFalse(dc_training.should_run_distribute_coordinator(config))
 
   def test_init_run_config_duplicate_distribute(self):
     with self.assertRaises(ValueError):
       run_config_lib.RunConfig(
-          train_distribute=mirrored_strategy.MirroredStrategy(),
+          train_distribute=mirrored_strategy.CoreMirroredStrategy(),
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy()))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy()))
 
     with self.assertRaises(ValueError):
       run_config_lib.RunConfig(
-          eval_distribute=mirrored_strategy.MirroredStrategy(),
+          eval_distribute=mirrored_strategy.CoreMirroredStrategy(),
           experimental_distribute=DistributeConfig(
-              eval_distribute=mirrored_strategy.MirroredStrategy()))
+              eval_distribute=mirrored_strategy.CoreMirroredStrategy()))
 
   def test_init_run_config_none_distribute_coordinator_mode(self):
     # We don't use distribute coordinator for local training.
     config = run_config_lib.RunConfig(
-        train_distribute=mirrored_strategy.MirroredStrategy())
+        train_distribute=mirrored_strategy.CoreMirroredStrategy())
     dc_training.init_run_config(config, {})
     self.assertIsNone(config._distribute_coordinator_mode)
 
@@ -496,7 +520,7 @@ class RunConfigTest(test.TestCase):
     with test.mock.patch.dict("os.environ",
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
       config = run_config_lib.RunConfig(
-          train_distribute=mirrored_strategy.MirroredStrategy())
+          train_distribute=mirrored_strategy.CoreMirroredStrategy())
       self.assertIsNone(config._distribute_coordinator_mode)
 
     # When `train_distribute` is not specified, don't use distribute
@@ -512,7 +536,7 @@ class RunConfigTest(test.TestCase):
     with test.mock.patch.dict("os.environ",
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
       config = run_config_lib.RunConfig(
-          train_distribute=mirrored_strategy.MirroredStrategy())
+          train_distribute=mirrored_strategy.CoreMirroredStrategy())
     self.assertEqual(config._distribute_coordinator_mode,
                      dc.CoordinatorMode.INDEPENDENT_WORKER)
 
@@ -521,7 +545,7 @@ class RunConfigTest(test.TestCase):
     # `experimental.remote_cluster` is set use distribute coordinator with
     # STANDALONE_CLIENT mode.
     config = run_config_lib.RunConfig(
-        train_distribute=mirrored_strategy.MirroredStrategy(),
+        train_distribute=mirrored_strategy.CoreMirroredStrategy(),
         experimental_distribute=DistributeConfig(
             remote_cluster={"chief": ["fake_worker"]}))
     self.assertEqual(config._distribute_coordinator_mode,
@@ -529,5 +553,15 @@ class RunConfigTest(test.TestCase):
 
 
 if __name__ == "__main__":
+  # Reduce `recovery_wait_secs` from 30 seconds so the test completes quickly.
+  orig_init = session_manager.SessionManager.__init__
+
+  def new_init(*args, **kwargs):
+    kwargs.pop("recovery_wait_secs", None)
+    kwargs["recovery_wait_secs"] = 0.5
+    orig_init(*args, **kwargs)
+
+  session_manager.SessionManager.__init__ = new_init
+
   with test.mock.patch.object(sys, "exit", os._exit):
     test.main()
diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
index a84ef041960e389c08246fc8a16df2300856d968..60fda996642464135fe1fb8c314bcf7f04d19362 100644
--- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py
+++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
@@ -20,18 +20,26 @@ from __future__ import print_function
 import tensorflow as tf
 
 
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+
+
 NUM_CLASSES = 10
 
 
-def get_input_datasets():
+def get_input_datasets(use_bfloat16=False):
   """Downloads the MNIST dataset and creates train and eval dataset objects.
 
+  Args:
+    use_bfloat16: Boolean to determine if input should be cast to bfloat16
+
   Returns:
     Train dataset, eval dataset and input shape.
 
   """
   # input image dimensions
   img_rows, img_cols = 28, 28
+  cast_dtype = tf.bfloat16 if use_bfloat16 else tf.float32
 
   # the data, split between train and test sets
   (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
@@ -57,12 +65,13 @@ def get_input_datasets():
   # train dataset
   train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
   train_ds = train_ds.repeat()
-  train_ds = train_ds.shuffle(100)
+  train_ds = train_ds.map(lambda x, y: (tf.cast(x, cast_dtype), y))
   train_ds = train_ds.batch(64, drop_remainder=True)
 
   # eval dataset
   eval_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
   eval_ds = eval_ds.repeat()
+  eval_ds = eval_ds.map(lambda x, y: (tf.cast(x, cast_dtype), y))
   eval_ds = eval_ds.batch(64, drop_remainder=True)
 
   return train_ds, eval_ds, input_shape
@@ -97,23 +106,28 @@ def main(_):
   # Build the train and eval datasets from the MNIST data. Also return the
   # input shape which is constructed based on the `image_data_format`
   # i.e channels_first or channels_last.
+  tf.enable_eager_execution()
+
   train_ds, eval_ds, input_shape = get_input_datasets()
   model = get_model(input_shape)
 
   # Instantiate the MirroredStrategy object. If we don't specify `num_gpus` or
   # the `devices` argument then all the GPUs available on the machine are used.
-  strategy = tf.contrib.distribute.MirroredStrategy()
+  # TODO(priyag): Use `tf.distribute.MirroredStrategy` once available.
+  strategy = mirrored_strategy.MirroredStrategy(['/gpu:0', '/cpu:0'])
+
+  optimizer = rmsprop.RMSProp(learning_rate=0.001)
 
   # Compile the model by passing the distribution strategy object to the
   # `distribute` argument. `fit`, `evaluate` and `predict` will be distributed
   # based on the strategy instantiated.
   model.compile(loss=tf.keras.losses.categorical_crossentropy,
-                optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
+                optimizer=optimizer,
                 metrics=['accuracy'],
                 distribute=strategy)
 
   # Train the model with the train dataset.
-  model.fit(x=train_ds, epochs=20, steps_per_epoch=310)
+  model.fit(x=train_ds, epochs=20, steps_per_epoch=468)
 
   # Evaluate the model with the eval dataset.
   score = model.evaluate(eval_ds, steps=10, verbose=0)
diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dfd85bcc4f3784e2744fd876a7190cc9581d96a
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
@@ -0,0 +1,285 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that show that DistributionStrategy works with canned Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+from absl.testing import parameterized
+import numpy as np
+import six
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
+from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator import training
+from tensorflow.python.estimator.canned import dnn_linear_combined
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column_lib as feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+
+
+class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def dataset_input_fn(self, x, y, batch_size):
+
+    def input_fn():
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      dataset = dataset.repeat(1).batch(batch_size)
+      return dataset
+
+    return input_fn
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          distribution=[
+              combinations.one_device_strategy,
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.mirrored_strategy_with_two_gpus,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus
+          ],
+          use_train_and_evaluate=[True, False]))
+  def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
+    label_dimension = 2
+    input_dimension = label_dimension
+    batch_size = 10
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+    train_input_fn = self.dataset_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size // distribution.num_replicas_in_sync)
+    eval_input_fn = self.dataset_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size // distribution.num_replicas_in_sync)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, batch_size=batch_size, shuffle=False)
+
+    linear_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    dnn_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    feature_columns = linear_feature_columns + dnn_feature_columns
+    session_config = config_pb2.ConfigProto(
+        log_device_placement=True, allow_soft_placement=True)
+    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
+        linear_feature_columns=linear_feature_columns,
+        dnn_hidden_units=(2, 2),
+        dnn_feature_columns=dnn_feature_columns,
+        label_dimension=label_dimension,
+        model_dir=self._model_dir,
+        dnn_optimizer=adam.Adam(0.001),
+        linear_optimizer=adam.Adam(0.001),
+        config=run_config.RunConfig(
+            train_distribute=distribution,
+            eval_distribute=distribution,
+            session_config=session_config))
+
+    num_steps = 2
+    if use_train_and_evaluate:
+      scores, _ = training.train_and_evaluate(
+          estimator, training.TrainSpec(train_input_fn, max_steps=num_steps),
+          training.EvalSpec(eval_input_fn))
+    else:
+      estimator.train(train_input_fn, steps=num_steps)
+      scores = estimator.evaluate(eval_input_fn)
+
+    self.assertIn('loss', six.iterkeys(scores))
+
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in estimator.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
+                                             serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+
+def get_model():
+  x = keras.layers.Input(shape=(3,), name='input')
+  y = keras.layers.Dense(4, name='dense')(x)
+  model = keras.Model(x, y)
+  return model
+
+
+class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph']))
+  def testKerasOptimizerWithUnequalInput(self, distribution):
+    def create_fn():
+      var = variables.Variable(
+          2.0, name='var', aggregation=variable_scope.VariableAggregation.SUM)
+      # grad for cpu is 1, grad for gpu is 2, avg grad is 1.5.
+      loss = math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) * var
+      optimizer = adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2)
+      train_op = optimizer.minimize(loss, var_list=[var])
+      m = optimizer.get_slot(var, 'm')
+      v = optimizer.get_slot(var, 'v')
+      return (var, m, v, train_op, optimizer.iterations)
+
+    devices = ['/device:GPU:0', '/device:CPU:0']
+    with distribution.scope():
+      (var, m, v, op, counter) = distribution.call_for_each_replica(create_fn)
+      self.evaluate(variables.global_variables_initializer())
+      var_val = [2.0, 2.0, 2.0]
+      self.assertAllClose(
+          var_val,
+          self.evaluate(
+              [distribution.read_var(var),
+               var.get(devices[0]),
+               var.get(devices[1])]))
+      self.assertAllClose([0, 0, 0],
+                          self.evaluate([
+                              distribution.read_var(counter),
+                              counter.get(devices[0]),
+                              counter.get(devices[1])
+                          ]))
+
+      train_op = distribution.unwrap(op)
+      self.evaluate(train_op)
+      # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2
+      m_val = [1.2, 1.2, 1.2]
+      # assert slot variables in both replicas are the same.
+      self.assertAllClose(
+          m_val,
+          self.evaluate(
+              [distribution.read_var(m),
+               m.get(devices[0]),
+               m.get(devices[1])]))
+      # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25
+      v_val = [1.8, 1.8, 1.8]
+      self.assertAllClose(
+          v_val,
+          self.evaluate(
+              [distribution.read_var(v),
+               v.get(devices[0]),
+               v.get(devices[1])]))
+      # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1)
+      #        = 2.0 - 0.01 * 1.2 * sqrt(0.8) / sqrt(1.8) / 0.8
+      var_val = [1.99, 1.99, 1.99]
+      self.assertAllClose(
+          var_val,
+          self.evaluate(
+              [distribution.read_var(var),
+               var.get(devices[0]),
+               var.get(devices[1])]))
+      self.assertAllClose([1, 1, 1],
+                          self.evaluate([
+                              distribution.read_var(counter),
+                              counter.get(devices[0]),
+                              counter.get(devices[1])
+                          ]))
+
+      self.evaluate(train_op)
+      # m(2) = beta1 * m(1) + (1-beta1) * grad = 0.2 * 1.2 + 0.8 * 1.5
+      m_val = [1.44, 1.44, 1.44]
+      self.assertAllClose(
+          m_val,
+          self.evaluate(
+              [distribution.read_var(m),
+               m.get(devices[0]),
+               m.get(devices[1])]))
+      # v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25
+      v_val = [2.16, 2.16, 2.16]
+      self.assertAllClose(
+          v_val,
+          self.evaluate(
+              [distribution.read_var(v),
+               v.get(devices[0]),
+               v.get(devices[1])]))
+      self.assertAllClose([2, 2, 2],
+                          self.evaluate([
+                              distribution.read_var(counter),
+                              counter.get(devices[0]),
+                              counter.get(devices[1])
+                          ]))
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph']))
+  def testOptimizerWithKerasModelAndNumpyArrays(self, distribution):
+
+    with self.cached_session():
+      model = get_model()
+      optimizer = gradient_descent.SGD(0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      inputs = np.zeros((64, 3), dtype=np.float32)
+      targets = np.zeros((64, 4), dtype=np.float32)
+
+      model.fit(
+          inputs,
+          targets,
+          epochs=1,
+          batch_size=2,
+          verbose=0,
+          validation_data=(inputs, targets))
+      model.evaluate(inputs, targets)
+      model.predict(inputs)
+
+
+def _replica_id():
+  replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+  if not isinstance(replica_id, ops.Tensor):
+    replica_id = constant_op.constant(replica_id)
+  return replica_id
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index 3511b7761ff4d8c995bfa40a1098b8e803f2a1b3..683cc89bfbae9c877ea6794d311ffc00c96c6937 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -24,24 +24,25 @@ import numpy as np
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import tpu_strategy
-from tensorflow.contrib.distribute.python import values
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import test
 from tensorflow.python.estimator import keras as keras_lib
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import rmsprop
 
-
 _RANDOM_SEED = 1337
 _TRAIN_SIZE = 200
 _INPUT_SIZE = (10,)
@@ -164,7 +165,9 @@ def get_multi_inputs_multi_outputs_data():
   return (train_data, test_data)
 
 
-def batch_wrapper(dataset, batch_size, distribution):
+def batch_wrapper(dataset, batch_size, distribution, repeat=None):
+  if repeat:
+    dataset = dataset.repeat(repeat)
   # TPUs currently require fully defined input shapes, drop_remainder ensures
   # the input will have fully defined shapes.
   if isinstance(distribution, tpu_strategy.TPUStrategy):
@@ -197,30 +200,166 @@ def get_predict_dataset(distribution):
   return dataset
 
 
-strategies = [combinations.default_strategy,
-              combinations.one_device_strategy,
-              combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus,
-              combinations.tpu_strategy_one_step]
+def multi_input_output_model():
+  a = keras.layers.Input(shape=(3,), name='input_a')
+  b = keras.layers.Input(shape=(5,), name='input_b')
+  # TODO(anjalisridhar): Change the output dimension of the second Dense layer
+  # once the iterator output validation issue has been fixed.
+  dense_1 = keras.layers.Dense(7, name='dense_1')
+  dense_2 = keras.layers.Dense(7, name='dense_2')
+  c = dense_1(a)
+  d = dense_2(b)
+  e = keras.layers.Dropout(0.5, name='dropout')(c)
+  model = keras.models.Model([a, b], [d, e])
+  return model
+
+
+def get_correctness_test_inputs(use_numpy, use_validation_data,
+                                with_distribution,
+                                x_train, y_train, x_predict):
+  """Generates the inputs for correctness check when enable Keras with DS."""
+  training_epochs = 2
+  global_batch_size = 64
+  batch_size = global_batch_size
+  # TODO(b/118776054): Use global batch size for Keras/DS support.
+  use_per_core_batch_size = (
+      with_distribution and
+      not distributed_training_utils.global_batch_size_supported(
+          with_distribution))
+  if use_per_core_batch_size:
+    batch_size //= with_distribution.num_replicas_in_sync
+
+  if use_numpy:
+    training_inputs = {
+        'batch_size': batch_size,
+        'x': x_train,
+        'y': y_train,
+        'epochs': training_epochs,
+        'shuffle': False,
+    }
+
+    if use_validation_data:
+      eval_inputs = None
+      training_inputs['validation_data'] = (x_train, y_train)
+    else:
+      eval_inputs = {
+          'batch_size': batch_size,
+          'x': x_train,
+          'y': y_train,
+      }
+    predict_inputs = {
+        'x': np.array(x_predict, dtype=np.float32),
+    }
+  else:
+    # For dataset inputs, we do not pass batch_size to
+    # keras.fit/evaluate/predict. The batch size is part of the dataset.
+    train_dataset = dataset_ops.Dataset.from_tensor_slices(
+        (x_train, y_train))
+    x = batch_wrapper(
+        train_dataset, batch_size, with_distribution, repeat=training_epochs)
+
+    training_inputs = {
+        'batch_size': None,
+        'x': x,
+        'y': None,
+        'epochs': training_epochs,
+        'shuffle': False,
+        'steps_per_epoch': len(x_train) // global_batch_size,
+    }
+    if use_validation_data:
+      eval_inputs = None  # Remove the eval_inputs
+      eval_dataset = dataset_ops.Dataset.from_tensor_slices(
+          (x_train, y_train))
+      x = batch_wrapper(eval_dataset, batch_size, with_distribution)
+      training_inputs['validation_data'] = x
+      training_inputs['validation_steps'] = 5
+    else:
+      eval_inputs = {
+          'batch_size': None,
+          'x': x,
+          'y': None,
+          'steps': 20,
+      }
+
+    predict_batch_size = len(x_predict)
+    if use_per_core_batch_size:
+      predict_batch_size //= with_distribution.num_replicas_in_sync
+    predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
+    predict_dataset = batch_wrapper(predict_dataset,
+                                    predict_batch_size, with_distribution)
+    predict_inputs = {
+        'steps': 1,
+        'x': predict_dataset,
+    }
+
+  return training_inputs, eval_inputs, predict_inputs
 
 
-def strategy_combinations():
+strategies_minus_tpu = [
+    combinations.default_strategy,
+    combinations.one_device_strategy,
+    combinations.mirrored_strategy_with_gpu_and_cpu,
+    combinations.mirrored_strategy_with_two_gpus,
+    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+    combinations.core_mirrored_strategy_with_two_gpus]
+
+tpu_strategies = [
+    combinations.tpu_strategy,  # steps_per_run=2
+    combinations.tpu_strategy_one_step]
+
+
+def strategy_minus_tpu_combinations():
   return combinations.combine(
-      distribution=strategies,
+      distribution=strategies_minus_tpu,
+      mode=['graph', 'eager'])
+
+
+def tpu_strategy_combinations():
+  return combinations.combine(
+      distribution=tpu_strategies,
       mode=['graph'])
 
 
+def all_strategy_combinations():
+  return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
+
+
+# TODO(priyag): Add v2 optimizers here.
 def strategy_and_optimizer_combinations():
+  return combinations.times(
+      all_strategy_combinations(),
+      combinations.combine(
+          optimizer=[combinations.adagrad_optimizer_v1_fn,
+                     combinations.adam_optimizer_v1_fn,
+                     combinations.gradient_descent_optimizer_v1_fn,
+                     combinations.rmsprop_optimizer_v1_fn]))
+
+
+def strategy_and_input_combinations():
+  return (
+      combinations.times(
+          combinations.combine(distribution=strategies_minus_tpu),
+          combinations.combine(mode=['graph'],
+                               use_numpy=[True, False],
+                               use_validation_data=[True, False])
+          + combinations.combine(mode=['eager'],
+                                 use_numpy=[False],
+                                 use_validation_data=[False])) +
+      combinations.times(
+          combinations.combine(distribution=tpu_strategies),
+          combinations.combine(mode=['graph'],
+                               use_numpy=[True, False],
+                               use_validation_data=[True, False])))
+
+
+def strategy_for_numpy_input_combinations():
   return combinations.combine(
-      distribution=strategies,
-      optimizer=[combinations.adagrad_optimizer_v1_fn,
-                 combinations.adam_optimizer_v1_fn,
-                 combinations.gradient_descent_optimizer_v1_fn,
-                 combinations.rmsprop_optimizer_v1_fn],
+      distribution=strategies_minus_tpu + tpu_strategies,
       mode=['graph'])
 
 
-class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
+class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
+                                        parameterized.TestCase):
 
   def setUp(self):
     self._base_dir = os.path.join(self.get_temp_dir(),
@@ -228,17 +367,18 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     gfile.MakeDirs(self._base_dir)
     self._config = run_config_lib.RunConfig(
         tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)
-    self._dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
 
   def tearDown(self):
     writer_cache.FileWriterCache.clear()
     if os.path.isdir(self._base_dir):
       gfile.DeleteRecursively(self._base_dir)
 
-  def test_train_functional_with_distribution_strategy(self):
-    dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_train_functional_with_distribution_strategy(self, distribution):
     keras_model = simple_functional_model()
     keras_model.compile(
         loss='categorical_crossentropy',
@@ -246,8 +386,8 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
-                                      train_distribute=dist,
-                                      eval_distribute=dist)
+                                      train_distribute=distribution,
+                                      eval_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
@@ -261,9 +401,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     writer_cache.FileWriterCache.clear()
     gfile.DeleteRecursively(self._config.model_dir)
 
-  def test_train_sequential_with_distribution_strategy(self):
-    dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_train_sequential_with_distribution_strategy(self, distribution):
     keras_model = simple_sequential_model()
     keras_model.compile(
         loss='categorical_crossentropy',
@@ -271,7 +414,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
-                                      train_distribute=dist)
+                                      train_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
@@ -285,7 +428,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     writer_cache.FileWriterCache.clear()
     gfile.DeleteRecursively(self._config.model_dir)
 
-  def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self, distribution):
     train_data, test_data = get_multi_inputs_multi_outputs_data()
 
     def train_input_fn():
@@ -315,14 +463,14 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
                                                      output_dict)).batch(16)
 
     self.do_test_multi_inputs_multi_outputs_with_input_fn(
-        train_input_fn, eval_input_fn)
+        distribution, train_input_fn, eval_input_fn)
 
-  def do_test_multi_inputs_multi_outputs_with_input_fn(self, train_input_fn,
-                                                       eval_input_fn):
+  def do_test_multi_inputs_multi_outputs_with_input_fn(
+      self, distribution, train_input_fn, eval_input_fn):
     config = run_config_lib.RunConfig(
         tf_random_seed=_RANDOM_SEED,
         model_dir=self._base_dir,
-        train_distribute=self._dist)
+        train_distribute=distribution)
     with self.cached_session():
       model = multi_inputs_multi_outputs_model()
       est_keras = keras_lib.model_to_estimator(keras_model=model, config=config)
@@ -332,9 +480,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
       eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
       self.assertLess(eval_results['loss'], baseline_eval_results['loss'])
 
-  def test_keras_optimizer_with_distribution_strategy(self):
-    dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_keras_optimizer_with_distribution_strategy(self, distribution):
     keras_model = simple_sequential_model()
     keras_model.compile(
         loss='categorical_crossentropy',
@@ -342,7 +493,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
 
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
-                                      train_distribute=dist)
+                                      train_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(keras_model=keras_model,
                                                config=config)
@@ -358,7 +509,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_creating_var_with_numpy_arrays(self, distribution):
     with self.cached_session():
       x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
@@ -367,7 +518,135 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # Verify that the numpy value is copied to the variable.
       self.assertAllEqual(x, val)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
+    # Calculate the per_replica_batch_size scaling factor for strategies
+    # that use per_core_batch_size
+    replica_scale_factor = 1.0
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      replica_scale_factor = distribution.num_replicas_in_sync
+
+    with self.cached_session():
+      # Input samples of different sizes
+      input_20_samples = np.zeros((20, 3), dtype=np.float32)
+      input_63_samples = np.zeros((63, 3), dtype=np.float32)
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # Default global batch size 32 for input with 64 samples run in 2 steps
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=None, batch_size=None)
+      self.assertEqual(batch_size, 32 // replica_scale_factor)
+      self.assertEqual(steps, 2)
+
+      # Computed global batch size 20 is lower than 32 if we pass less samples.
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_20_samples, steps=None, batch_size=None)
+      self.assertEqual(batch_size, 20 // replica_scale_factor)
+      self.assertEqual(steps, 1)
+
+      #  Default global batch size 32 cannot be used with 63 samples.
+      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
+        distributed_training_utils.get_input_params(
+            distribution, input_63_samples, steps=None, batch_size=None)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calculating_input_params_with_steps_no_batch_size(self,
+                                                             distribution):
+    # Calculate the per_replica_batch_size scaling factor for strategies
+    # that use per_core_batch_size
+    replica_scale_factor = 1.0
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      replica_scale_factor = distribution.num_replicas_in_sync
+
+    with self.cached_session():
+      # Input samples of different sizes
+      input_63_samples = np.zeros((63, 3), dtype=np.float32)
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # Computed global batch size is correct for number of specified 1 step
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=1, batch_size=None)
+      self.assertEqual(batch_size, 64 // replica_scale_factor)
+      self.assertEqual(steps, 1)
+
+      # Computed global batch size is correct for number of specified 2 steps
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=2, batch_size=None)
+      self.assertEqual(batch_size, 32 // replica_scale_factor)
+      self.assertEqual(steps, 2)
+
+      # All samples can not be consumed in specified number of steps
+      with self.assertRaisesRegexp(ValueError, 'not divisible by steps'):
+        distributed_training_utils.get_input_params(
+            distribution, input_63_samples, steps=2, batch_size=None)
+
+      # This cases is different for different strategies due to the
+      # difference in supported batch size being global or per-replica.
+      if replica_scale_factor == 1:
+        # Computed global batch size is correct even if not sharadable
+        steps, batch_size = distributed_training_utils.get_input_params(
+            distribution, input_63_samples, steps=3, batch_size=None)
+        self.assertEqual(batch_size, 21)
+        self.assertEqual(steps, 3)
+      else:
+        # Computed global batch size can not be sharded across replicas
+        with self.assertRaisesRegexp(ValueError, 'could not be sharded evenly '
+                                     'across the sync replicas'):
+          distributed_training_utils.get_input_params(
+              distribution, input_63_samples, steps=1, batch_size=None)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calculating_input_params_no_steps_with_batch_size(self,
+                                                             distribution):
+    # Calculate the per_replica_batch_size scaling factor for strategies
+    # that use per_core_batch_size
+    replica_scale_factor = 1.0
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      replica_scale_factor = distribution.num_replicas_in_sync
+
+    with self.cached_session():
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # Computed steps is correct for specified batch size
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=None, batch_size=16)
+      self.assertEqual(batch_size, 16)
+      self.assertEqual(steps, 4 // replica_scale_factor)
+
+      # Computed steps is correct for specified batch size
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=None, batch_size=32)
+      self.assertEqual(batch_size, 32)
+      self.assertEqual(steps, 2 // replica_scale_factor)
+
+      # Number of samples is not divisible by the global batch size
+      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
+        distributed_training_utils.get_input_params(
+            distribution, input_64_samples, steps=None, batch_size=20)
+
+      # Number of samples is not divisible by the global batch size
+      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
+        distributed_training_utils.get_input_params(
+            distribution, input_64_samples, steps=None, batch_size=3)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calculating_input_params_with_steps_with_batch_size(self,
+                                                               distribution):
+    with self.cached_session():
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # No change to steps and batch size if both specified and feasible
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=5, batch_size=3)
+      self.assertEqual(batch_size, 3)
+      self.assertEqual(steps, 5)
+
+      # Number of samples is less than global batch size * steps
+      with self.assertRaisesRegexp(ValueError, 'less than samples required'):
+        distributed_training_utils.get_input_params(
+            distribution, input_64_samples, steps=10, batch_size=13)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calling_model_with_numpy_arrays(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -398,29 +677,21 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # with batch_size
       model.predict(inputs, batch_size=8)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calling_model_with_nested_numpy_arrays(self, distribution):
     with self.cached_session():
-      a = keras.layers.Input(shape=(3,), name='input_a')
-      b = keras.layers.Input(shape=(3,), name='input_b')
-
-      dense = keras.layers.Dense(4, name='dense')
-      c = dense(a)
-      d = dense(b)
-      e = keras.layers.Dropout(0.5, name='dropout')(c)
-
-      model = keras.models.Model([a, b], [d, e])
+      model = multi_input_output_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
       loss = 'mse'
       model.compile(optimizer, loss, distribute=distribution)
 
       input_a_np = np.asarray(np.random.random((64, 3)), dtype=np.float32)
-      input_b_np = np.asarray(np.random.random((64, 3)), dtype=np.float32)
+      input_b_np = np.asarray(np.random.random((64, 5)), dtype=np.float32)
       inputs = [input_a_np, input_b_np]
 
-      output_d_np = np.asarray(np.random.random((64, 4)), dtype=np.float32)
-      output_e_np = np.asarray(np.random.random((64, 4)), dtype=np.float32)
+      output_d_np = np.asarray(np.random.random((64, 7)), dtype=np.float32)
+      output_e_np = np.asarray(np.random.random((64, 7)), dtype=np.float32)
       targets = [output_d_np, output_e_np]
 
       # Call fit with validation data
@@ -440,11 +711,50 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # with batch_size
       model.predict(inputs, batch_size=8)
 
+  @combinations.generate(combinations.combine(
+      distribution=strategies_minus_tpu, mode=['graph']))
+  def test_numpy_with_sample_weights(self, distribution):
+    model = get_model()
+    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(optimizer, loss, distribute=distribution)
+
+    inputs = np.zeros((20, 3), np.float32)
+    targets = np.zeros((20, 4), np.float32)
+    sample_weights = np.ones((20), np.float32)
+
+    model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
+              steps_per_epoch=2, verbose=1)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_flatten_predict_outputs(self, distribution):
+    with self.cached_session():
+      model = multi_input_output_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      # We take 6 input samples with each input having a dimension of 3 or 5.
+      input_a_np = np.asarray(np.random.random((6, 3)), dtype=np.float32)
+      input_b_np = np.asarray(np.random.random((6, 5)), dtype=np.float32)
+      inputs = [input_a_np, input_b_np]
+
+      outs = model.predict(inputs, steps=1)
+      # `predict` a list that is equal in length to the number of model outputs.
+      # In this test our model has two outputs and each element of `outs`
+      # corresponds to all the samples of one of the model outputs.
+      self.assertLen(outs, 2)
+      # Each of the output samples have a dimension of 7. We should process all
+      # the available input samples(6).
+      self.assertAllEqual([6, 7], outs[0].shape)
+      self.assertAllEqual([6, 7], outs[1].shape)
+
 
 class TestDistributionStrategyWithDatasets(test.TestCase,
                                            parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calling_model_on_same_dataset(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -463,32 +773,68 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
                 validation_data=dataset, validation_steps=2)
       model.predict(get_predict_dataset(distribution), steps=2)
 
+  @combinations.generate(all_strategy_combinations())
+  def test_model_interleaved_eval_same_as_direct_eval(self, distribution):
+    with self.cached_session():
+      user_controlled_model = get_model()
+      user_controlled_model.compile(
+          gradient_descent.GradientDescentOptimizer(0.001),
+          loss='mse',
+          metrics=['mae', keras.metrics.CategoricalAccuracy()],
+          distribute=distribution)
+
+      interleaved_model = get_model()
+      interleaved_model.set_weights(user_controlled_model.get_weights())
+      interleaved_model.compile(
+          gradient_descent.GradientDescentOptimizer(0.001),
+          loss='mse',
+          metrics=['mae', keras.metrics.CategoricalAccuracy()],
+          distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      # Call fit with validation interleaved
+      interleaved_output = interleaved_model.fit(
+          dataset, epochs=2, steps_per_epoch=2, verbose=1,
+          validation_data=dataset, validation_steps=2, shuffle=False)
+
+      # Manually control the validation running after each epoch.
+      user_controlled_output = []
+      for _ in range(2):
+        user_controlled_model.fit(
+            dataset, epochs=1, steps_per_epoch=2, verbose=1, shuffle=False)
+        user_controlled_output.append(
+            user_controlled_model.evaluate(dataset, steps=2))
+
+      self.assertEqual(interleaved_output.history['val_loss'],
+                       [x[0] for x in user_controlled_output])
+      self.assertEqual(interleaved_output.history['val_mean_absolute_error'],
+                       [x[1] for x in user_controlled_output])
+      self.assertEqual(interleaved_output.history['val_categorical_accuracy'],
+                       [x[2] for x in user_controlled_output])
+
   # TODO(priyag): Enable this test for TPU. Currently tuples/dict don't work
   # as clone_model's input_tensors argument only seems to accept list and not
   # tuples or dict.
-  def test_fit_with_tuple_and_dict_dataset_inputs(self):
-    with self.cached_session():
-      a = keras.layers.Input(shape=(3,), name='input_a')
-      b = keras.layers.Input(shape=(3,), name='input_b')
-
-      dense = keras.layers.Dense(4, name='dense')
-      c = dense(a)
-      d = dense(b)
-      e = keras.layers.Dropout(0.5, name='dropout')(c)
 
-      model = keras.models.Model([a, b], [d, e])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution):
+    with self.cached_session():
+      model = multi_input_output_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
       loss = 'mse'
       metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
       input_a_np = np.random.random((10, 3))
-      input_b_np = np.random.random((10, 3))
-      output_d_np = np.random.random((10, 4))
-      output_e_np = np.random.random((10, 4))
+      input_b_np = np.random.random((10, 5))
+      output_d_np = np.random.random((10, 7))
+      output_e_np = np.random.random((10, 7))
 
       # Test with tuples
       dataset_tuple = dataset_ops.Dataset.from_tensor_slices((
@@ -507,7 +853,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -537,35 +883,67 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       model.evaluate(dataset, steps=2, verbose=1)
       model.predict(get_predict_dataset(distribution), steps=2)
 
-  def test_dataset_input_shape_validation(self):
+  @combinations.generate(strategy_minus_tpu_combinations())
+  def test_dataset_with_sample_weights(self, distribution):
+    model = get_model()
+    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(optimizer, loss, distribute=distribution)
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    sample_weights = np.ones((10), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
+                                                      sample_weights))
+    dataset = dataset.repeat()
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset, steps=2, verbose=1)
+    model.predict(dataset, steps=2)
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager']))
+  def test_dataset_wrong_input_shape(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
-      model.compile(optimizer, loss, distribute=strategy)
+      model.compile(optimizer, loss, distribute=distribution)
 
-      # User forgets to batch the dataset
-      inputs = np.zeros((10, 3), dtype=np.float32)
+      # Wrong input shape
+      inputs = np.zeros((10, 5), dtype=np.float32)
       targets = np.zeros((10, 4), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
 
-      with self.assertRaisesRegexp(ValueError, 'expected input to have shape'):
+      with self.assertRaisesRegexp(ValueError,
+                                   'expected input to have shape'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
-      # Wrong input shape
-      inputs = np.zeros((10, 5), dtype=np.float32)
+  @combinations.generate(combinations.combine(
+      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager']))
+  def test_dataset_no_batch_input_validation(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      # User forgets to batch the dataset
+      inputs = np.zeros((10, 3), dtype=np.float32)
       targets = np.zeros((10, 4), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
 
-      with self.assertRaisesRegexp(ValueError,
-                                   'expected input to have shape'):
+      with self.assertRaisesRegexp(ValueError, 'expected input to have shape'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
   @combinations.generate(combinations.combine(
@@ -587,51 +965,91 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       with self.assertRaisesRegexp(ValueError, 'requires fully defined shapes'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
-  def test_learning_phase_value(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager']))
+  def test_learning_phase_value(self, distribution):
     # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare
     # meaningful values. Currently we don't pass the learning phase if the
     # Lambda layer uses the learning phase.
     with self.cached_session():
-      x = keras.layers.Input(shape=(16,), name='input')
-      y = keras.layers.Dense(16)(x)
+      x = keras.layers.Input(shape=(1,), name='input')
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
       z = keras.layers.Dropout(0.9999)(y)
       model = keras.Model(x, z)
+      initial_weights = model.get_weights()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.005)
       loss = 'mse'
       metrics = ['acc']
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      batch_size = 8
+      if isinstance(distribution, mirrored_strategy.CoreMirroredStrategy):
+        # CoreMirroredStrategy uses global batch size.
+        batch_size = 8 * distribution.num_replicas_in_sync
 
-      inputs = np.random.rand(10, 16)
-      targets = np.ones((10, 16), dtype=np.float32)
+      inputs = np.ones((10, 1), dtype=np.float32)
+      targets = np.ones((10, 1), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(8)
+      dataset = dataset.repeat().batch(batch_size)
+      hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1)
+      self.assertAlmostEqual(hist.history['acc'][0], 0, 0)
+
+      model.set_weights(initial_weights)
+      # TODO(psv/anjalisridhar): Enable these lines after we fix b/117431185.
+      # evaluate_output = model.evaluate(dataset, steps=20)
+      # self.assertAlmostEqual(evaluate_output[1], 1, 0)
+
+      inputs = np.ones((10, 1), dtype=np.float32)
+      predict_dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
+
+      predict_dataset = predict_dataset.repeat().batch(batch_size)
+      output = model.predict(predict_dataset, steps=10)
+      # `predict` runs for 10 steps
+      ref_output = np.ones((160, 1), dtype=np.float32)
+      self.assertArrayNear(output, ref_output, 1e-1)
+
+  @combinations.generate(strategy_minus_tpu_combinations())
+  def testOptimizerWithCallbacks(self, distribution):
+    with self.cached_session():
+      model = get_model()
 
-      hist = model.fit(dataset, epochs=5, steps_per_epoch=20, verbose=1)
-      self.assertEqual(hist.history['acc'][0], 1)
+      optimizer = gradient_descent_keras.SGD(0.01)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      dataset = get_dataset(distribution)
 
-      evaluate_output = model.evaluate(dataset, steps=20)
-      self.assertEqual(evaluate_output[1], 0)
+      def schedule(_):
+        return 0.001
 
-      predict_output = model.predict(dataset, steps=1)
-      self.assertNotEqual(np.mean(predict_output), 0)
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
+      grouped_models = distribution.unwrap(model._grouped_model)
+      with distribution.scope():
+        for m in grouped_models:
+          self.assertAllClose(0.001, keras.backend.get_value(
+              m.optimizer.lr), atol=1e-05, rtol=1e-05)
 
 
 class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
 
-  def test_validating_dataset_input_tensors_with_shape_mismatch(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_shape_mismatch(self,
+                                                                distribution):
     with self.cached_session():
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
       a = constant_op.constant([1, 2], shape=(1, 2))
       b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
       x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
       y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
-      with strategy.scope():
+      with distribution.scope():
         # Removed device and input tensor shape details from the error message
         # since the order of the device and the corresponding input tensor shape
         # is not deterministic over different runs.
@@ -640,17 +1058,21 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                      'distributed tensor inputs '
                                      'DistributedValues:.+'):
           distributed_training_utils.validate_distributed_dataset_inputs(
-              strategy, x, y)
+              distribution, x, y)
 
-  def test_validating_dataset_input_tensors_with_dtype_mismatch(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_dtype_mismatch(self,
+                                                                distribution):
     with self.cached_session():
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
       a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
       b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
       x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
       y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
-      with strategy.scope():
+      with distribution.scope():
         # Removed device and input tensor dtype details from the error message
         # since the order of the device and the corresponding input tensor dtype
         # is not deterministic over different runs.
@@ -659,21 +1081,23 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                      'distributed tensor inputs '
                                      'DistributedValues:.+'):
           distributed_training_utils.validate_distributed_dataset_inputs(
-              strategy, x, y)
+              distribution, x, y)
 
-  def test_unsupported_features(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_unsupported_features(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
       metrics = ['mae']
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      dataset = get_dataset(strategy)
+      dataset = get_dataset(distribution)
 
       # Test with validation split
       with self.assertRaisesRegexp(
@@ -687,8 +1111,8 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
       # Test with sample weight.
       sample_weight = np.random.random((10,))
       with self.assertRaisesRegexp(
-          NotImplementedError, '`sample_weight` is currently not supported '
-                               'when using DistributionStrategy.'):
+          ValueError, '`sample_weight` argument is not supported when input '
+                      '`x` is a dataset or a dataset iterator.'):
         model.fit(
             dataset,
             epochs=1,
@@ -708,45 +1132,48 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                    'you should specify the `steps` argument'):
         model.predict(dataset, verbose=0)
 
-  def test_calling_with_unsupported_predefined_callbacks(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
       metrics = ['mae']
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      dataset = get_dataset(strategy)
+      dataset = get_dataset(distribution)
 
       def schedule(_):
         return 0.001
       with self.assertRaisesRegexp(ValueError,
-                                   'LearningRateScheduler callback is not '
-                                   'supported with DistributionStrategy.'):
+                                   'You must specify a Keras Optimizer V2 when '
+                                   'using'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                   callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
 
       with self.assertRaisesRegexp(ValueError,
-                                   'ReduceLROnPlateau callback is not '
-                                   'supported with DistributionStrategy.'):
+                                   'You must specify a Keras Optimizer V2 when '
+                                   'using'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                   callbacks=[keras.callbacks.ReduceLROnPlateau()])
-      with self.assertRaisesRegexp(ValueError,
-                                   'histogram_freq in the TensorBoard callback '
-                                   'is not supported when using '
-                                   'DistributionStrategy.'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)])
 
 
-class TestDistributionStrategyWithLossMasking(test.TestCase):
+class TestDistributionStrategyWithLossMasking(test.TestCase,
+                                              parameterized.TestCase):
 
   # TODO(priyag): Enable all strategies for this test. Currently it does not
   # work for TPU due to some invalid datatype.
-  def test_masking(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager']))
+  def test_masking(self, distribution):
     with self.cached_session():
       np.random.seed(1337)
       x = np.array([[[1], [1]], [[0], [0]]])
@@ -755,12 +1182,9 @@ class TestDistributionStrategyWithLossMasking(test.TestCase):
       model.add(
           keras.layers.TimeDistributed(
               keras.layers.Dense(1, kernel_initializer='one')))
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
       model.compile(loss='mse',
                     optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                    distribute=strategy)
+                    distribute=distribution)
       y = np.array([[[1], [1]], [[1], [1]]])
       dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
       dataset = dataset.repeat(100)
@@ -772,7 +1196,7 @@ class TestDistributionStrategyWithLossMasking(test.TestCase):
 class TestDistributionStrategyWithNormalizationLayer(
     test.TestCase, parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_batchnorm_correctness(self, distribution):
     with self.cached_session():
       model = keras.models.Sequential()
@@ -804,7 +1228,7 @@ class TestDistributionStrategyWithNormalizationLayer(
 class TestDistributionStrategyCorrectness(test.TestCase,
                                           parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_metric_correctness(self, distribution):
     with self.cached_session():
       keras.backend.set_image_data_format('channels_last')
@@ -827,78 +1251,152 @@ class TestDistributionStrategyCorrectness(test.TestCase,
           distribute=distribution)
 
       batch_size = 64
-      batch_size //= distribution.num_towers
+      if not distributed_training_utils.global_batch_size_supported(
+          distribution):
+        batch_size //= distribution.num_replicas_in_sync
       train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
       train_dataset = batch_wrapper(train_dataset, batch_size, distribution)
 
-      history = model.fit(x=train_dataset, epochs=1, steps_per_epoch=10)
-      self.assertEqual(history.history['binary_accuracy'], [1.0])
+      history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10)
+      self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0])
 
-  @combinations.generate(strategy_combinations())
-  def test_correctness(self, distribution):
+  @combinations.generate(all_strategy_combinations())
+  def test_eval_metrics_correctness(self, distribution):
     with self.cached_session():
-      keras.backend.set_image_data_format('channels_last')
-      num_samples = 10000
+      model = keras.Sequential()
+      model.add(
+          keras.layers.Dense(
+              3, activation='relu', input_dim=4, kernel_initializer='ones'))
+      model.add(
+          keras.layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones'))
+      model.compile(
+          loss='mae',
+          metrics=['accuracy', keras.metrics.BinaryAccuracy()],
+          optimizer=gradient_descent.GradientDescentOptimizer(0.001),
+          distribute=distribution)
+
+      # verify correctness of stateful and stateless metrics.
+      x = np.ones((100, 4)).astype('float32')
+      y = np.ones((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = batch_wrapper(dataset, 4, distribution)
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 1.)
+      self.assertEqual(outs[2], 1.)
+
+      y = np.zeros((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = batch_wrapper(dataset, 4, distribution)
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 0.)
+      self.assertEqual(outs[2], 0.)
+
+  @combinations.generate(strategy_and_input_combinations())
+  def test_correctness(self, distribution, use_numpy, use_validation_data):
 
-      # Train and predict datasets are created with the same input numpy arrays.
+    with self.cached_session():
+      default_tolerance = 1e-5
+      tol_table = {}
+
+      if isinstance(distribution, (mirrored_strategy.MirroredStrategy,
+                                   mirrored_strategy.CoreMirroredStrategy)):
+        # TODO(b/119257215): Weights are not exactly the same, so use larger
+        # tolerance for now. Predict should be related to weights.
+        tol_table = {
+            'weights_1': 1e-4,
+            'weights_2': 1e-4,
+            'predict_result_1': 1e-4,
+        }
+
+      keras.backend.set_image_data_format('channels_last')
+      np.random.seed(_RANDOM_SEED)
+      random_seed.set_random_seed(_RANDOM_SEED)
+
+      # Train, eval, and predict datasets are created with the same input numpy
+      # arrays.
+      # TODO(xiejw): Change this back to 10000, once we support final partial
+      # batch.
+      num_samples = 9984
       x_train = np.random.rand(num_samples, 1)
       y_train = 3 * x_train
       x_train = x_train.astype('float32')
       y_train = y_train.astype('float32')
+      x_predict = [[1.], [2.], [3.], [4.]]
 
       # The model is built once and the initial weights are saved.
       # This is used to initialize the model for both the distribution and
-      # non-distribution run.
-      model = keras.Sequential()
-      model.add(keras.layers.Dense(1, input_shape=(1,)))
+      # non-distribution run. In addition, we add few non-linear layers to make
+      # it non-trivial.
+      def _create_model():
+        model = keras.Sequential()
+        model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(1))
+        return model
+
+      model = _create_model()
       initial_weights = model.get_weights()
+      del model  # avoid accident usage.
 
-      def fit_and_predict(with_distribution=None):
+      def fit_eval_and_predict(with_distribution=None):
+        model = _create_model()
+        # We have initialized the model to the same weight for the distribution
+        # and non-distribution run.
         model.set_weights(initial_weights)
         model.compile(
             loss=keras.losses.mean_squared_error,
-            optimizer=gradient_descent.GradientDescentOptimizer(0.5),
+            optimizer=gradient_descent_keras.SGD(0.5),
+            metrics=['mse'],
             distribute=with_distribution)
 
-        batch_size = 64
-        if with_distribution:
-          batch_size //= with_distribution.num_towers
-        train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train,
-                                                                y_train))
-        train_dataset = batch_wrapper(train_dataset, batch_size, distribution)
-        # We have initialized the model to the same weight for the distribution
-        # and non-distribution run. If you want to initialize the model to
-        # random weights for each run, you need to run the model through the
-        # entire dataset at least once to ensure that the weights converge to
-        # the same value.
-        model.fit(x=train_dataset, epochs=1, steps_per_epoch=10)
-
-        weights = model.get_weights()
-        x_predict = [[1.], [2.], [3.], [4.]]
-        predict_batch_size = 4
-        if with_distribution:
-          predict_batch_size //= with_distribution.num_towers
-        predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
-        predict_dataset = batch_wrapper(predict_dataset,
-                                        predict_batch_size, distribution)
-        predict_result = model.predict(predict_dataset, steps=1)
-        predict_result = np.reshape(predict_result, (4, 1))
-
-        return weights, predict_result
-
-      wts_with_ds, predict_with_ds = fit_and_predict(
-          with_distribution=distribution)
-      wts_without_ds, predict_without_ds = fit_and_predict(
-          with_distribution=None)
-
-      # Verify that the weights are the same within some limits of tolerance.
-      np.testing.assert_allclose(wts_with_ds[0], wts_without_ds[0], rtol=1e-3)
-      # Verify that the predicted outputs are the same within some limits of
-      # tolerance.
-      np.testing.assert_allclose(predict_with_ds, predict_without_ds, rtol=1e-3)
-
-
-# TODO(priyag): Add a test for TPUStrategy with steps_per_run > 1.
+        training_inputs, eval_inputs, predict_inputs = (
+            get_correctness_test_inputs(use_numpy, use_validation_data,
+                                        with_distribution,
+                                        x_train, y_train, x_predict))
+
+        result = {}
+        result['training_history_1'] = model.fit(**training_inputs).history
+
+        if eval_inputs is not None:
+          result['eval_result_1'] = model.evaluate(**eval_inputs)
+
+        result['weights_1'] = model.get_weights()
+        result['predict_result_1'] = model.predict(**predict_inputs)
+
+        # Train and eval again to mimic user's flow.
+
+        result['training_history_2'] = model.fit(**training_inputs).history
+
+        if eval_inputs is not None:
+          result['eval_result_2'] = model.evaluate(**eval_inputs)
+
+        result['weights_2'] = model.get_weights()
+
+        return result
+
+      results_with_ds = fit_eval_and_predict(with_distribution=distribution)
+      results_without_ds = fit_eval_and_predict(with_distribution=None)
+
+      # Verify that the weights, training history, eval results, predict outputs
+      # are the same within some limits of tolerance.
+      for key in results_with_ds:
+        if (key.startswith('training_history') and
+            isinstance(distribution, tpu_strategy.TPUStrategy) and
+            distribution.extended.steps_per_run > 1):
+          # TODO(b/119894254): Enable this test for all cases once the
+          # underlying bug is fixed.
+          continue
+
+        tolerance = tol_table.get(key, default_tolerance)
+
+        self.assertAllClose(
+            results_with_ds[key],
+            results_without_ds[key],
+            atol=tolerance,
+            rtol=tolerance,
+            msg='Fail to assert {}.'.format(key))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index ae4189eb1cb217f8a209b57f91a0ddb82e63dcd9..8ac659abe96370b751ed1556cc699fe20788a0fd 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -72,14 +72,14 @@ def _regression_dataset_fn():
       "predictions": [1., .75, .25, 0.]}).repeat()
 
 
-# TODO(priyag): Add TPU Strategy to this once metrics aggregate correctly using
-# TowerLocalVariables on TPUs. Submit http://cl/208914352.
 def all_combinations():
   return combinations.combine(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       mode=["graph"])
 
 
@@ -96,30 +96,32 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
   def _test_metric(self, distribution, dataset_fn, metric_fn, expected_fn):
     with ops.Graph().as_default(), distribution.scope():
       iterator = distribution.distribute_dataset(
-          dataset_fn).make_one_shot_iterator()
+          dataset_fn).make_initializable_iterator()
       if isinstance(distribution, tpu_strategy.TPUStrategy):
         def step_fn(ctx, inputs):
-          value, update = distribution.call_for_each_tower(
-              metric_fn, inputs)
+          value, update = distribution.call_for_each_replica(
+              metric_fn, args=inputs)
           ctx.set_non_tensor_output(name="value", output=value)
           return distribution.group(update)
 
         ctx = distribution.run_steps_on_dataset(
-            step_fn, iterator, iterations=distribution.steps_per_run)
+            step_fn, iterator, iterations=distribution.extended.steps_per_run)
         update = ctx.run_op
         value = ctx.non_tensor_outputs["value"]
         # In each run, we run multiple steps, and each steps consumes as many
-        # batches as number of towers.
+        # batches as number of replicas.
         batches_per_update = (
-            distribution.num_towers * distribution.steps_per_run)
+            distribution.num_replicas_in_sync *
+            distribution.extended.steps_per_run)
       else:
-        value, update = distribution.call_for_each_tower(
+        value, update = distribution.call_for_each_replica(
             metric_fn, iterator.get_next())
         update = distribution.group(update)
         # TODO(josh11b): Once we switch to using a global batch size for input,
-        # replace "distribution.num_towers" with "1".
-        batches_per_update = distribution.num_towers
+        # replace "distribution.num_replicas_in_sync" with "1".
+        batches_per_update = distribution.num_replicas_in_sync
 
+      self.evaluate(iterator.initializer)
       self.evaluate(distribution.initialize())
       self.evaluate(variables.local_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index ba147e78241e5ab45809e498e00debd45a2c49b4..f09483cb56b66fd4720ee71085203c14f1ccadc3 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -22,10 +22,10 @@ from absl.testing import parameterized
 import numpy
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example
 from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
@@ -41,6 +41,14 @@ from tensorflow.python.ops.losses import losses_impl
 
 class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
+  def _get_iterator(self, ds):
+    if context.executing_eagerly():
+      iterator = ds.make_one_shot_iterator()
+    else:
+      iterator = ds.make_initializable_iterator()
+      self.evaluate(iterator.initializer)
+    return iterator
+
   @combinations.generate(
       combinations.times(
           combinations.distributions_and_v1_optimizers(),
@@ -56,14 +64,12 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_tower(
-                model_fn, *inputs, run_concurrently=layer.built))
+            distribution.call_for_each_replica(model_fn, args=inputs))
 
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_one_shot_iterator()
+      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
       def run_step():
         return distribution.run_steps_on_dataset(
@@ -93,19 +99,18 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           combinations.distributions_and_v1_optimizers(),
           combinations.combine(mode=["graph"], use_callable_loss=[True, False])
           + combinations.combine(mode=["eager"], use_callable_loss=[True])))
-  def testTrainNetworkByCallForEachTower(self, distribution, optimizer_fn,
-                                         use_callable_loss):
+  def testTrainNetworkByCallForEachReplica(self, distribution, optimizer_fn,
+                                           use_callable_loss):
     with distribution.scope():
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_one_shot_iterator()
+      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
       def run_step():
         return distribution.group(
-            distribution.call_for_each_tower(
-                model_fn, iterator.get_next(), run_concurrently=layer.built))
+            distribution.call_for_each_replica(
+                model_fn, args=(iterator.get_next(),)))
 
       if not context.executing_eagerly():
         with self.cached_session() as sess:
@@ -153,14 +158,12 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           use_callable_loss=True,
           create_optimizer_inside_model_fn=True)
 
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_tower(
-                model_fn, *inputs, run_concurrently=layer.built))
+            distribution.call_for_each_replica(model_fn, args=inputs))
 
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_one_shot_iterator()
+      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
       def run_step():
         return distribution.run_steps_on_dataset(
@@ -179,11 +182,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def get_expected_variables(optimizer_fn, num_parameter_devices):
         variables_map = {
             "GradientDescent": ["dense/kernel", "dense/bias"],
-            "Adam": [
-                "dense/kernel", "dense/bias", "beta1_power", "beta2_power",
-                "dense/kernel/Adam", "dense/kernel/Adam_1", "dense/bias/Adam",
-                "dense/bias/Adam_1"
-            ],
             "Adagrad": [
                 "dense/kernel/Adagrad", "dense/kernel",
                 "dense/bias/Adagrad", "dense/bias"
@@ -210,42 +208,34 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
               combinations.combine(
                   mode=["graph", "eager"],
                   # TODO(isaprykin):  Allow False here.  Currently subsequent
-                  # towers will re-execute UPDATE_OPS of previous towers.
-                  update_ops_in_cross_tower_mode=[True])) +
+                  # replicas will re-execute UPDATE_OPS of previous replicas.
+                  update_ops_in_cross_replica_mode=[True])) +
           combinations.combine(
               distribution=[combinations.tpu_strategy],
               optimizer_fn=combinations.optimizers_v1,
               mode=["graph"],
-              update_ops_in_cross_tower_mode=[False])))
+              update_ops_in_cross_replica_mode=[False])))
   def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
-                                    renorm, update_ops_in_cross_tower_mode):
-    """Verifies that moving mean updates are reduced across towers."""
+                                    renorm, update_ops_in_cross_replica_mode):
+    """Verifies that moving mean updates are reduced across replicas."""
     with distribution.scope():
-      num_towers = len(distribution.worker_devices)
+      num_replicas = distribution.num_replicas_in_sync
       model_fn, dataset_fn, batchnorm = batchnorm_example(
           optimizer_fn,
-          batch_per_epoch=num_towers,
+          batch_per_epoch=num_replicas,
           momentum=momentum,
           renorm=renorm,
-          update_ops_in_tower_mode=not update_ops_in_cross_tower_mode)
-
-      # Make sure prefetching is disabled since that makes the
-      # specific input on each device to be non deterministic, and
-      # this test relies on specific input being on each device.
-      if isinstance(distribution, mirrored_strategy.MirroredStrategy):
-        self.assertFalse(distribution._prefetch_on_device)
+          update_ops_in_replica_mode=not update_ops_in_cross_replica_mode)
 
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         fetches = distribution.unwrap(
-            distribution.call_for_each_tower(
-                model_fn, *inputs, run_concurrently=batchnorm.built))
-        if update_ops_in_cross_tower_mode:
-          fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
+            distribution.call_for_each_replica(model_fn, args=inputs))
+        if update_ops_in_cross_replica_mode:
+          fetches += tuple(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
         return control_flow_ops.group(fetches)
 
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_one_shot_iterator()
+      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
       def run_step():
         return distribution.run_steps_on_dataset(
@@ -261,17 +251,17 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       def averaged_batch_mean(i):
         # Each batch has shape [16, 8] where the ith element in jth list is
-        # (8 * j + i + tower_id * 100). So the batch mean in each tower is
-        # (60 + i + tower_id * 100). So here comes its batch mean over all
-        # towers:
-        return 60. + i + (num_towers - 1.) / 2. * 100.
+        # (8 * j + i + replica_id * 100). So the batch mean in each replica is
+        # (60 + i + replica_id * 100). So here comes its batch mean over all
+        # replicas:
+        return 60. + i + (num_replicas - 1.) / 2. * 100.
 
       for _ in range(10):
         run_step()
         moving_means = self.evaluate(batchnorm.moving_mean)
 
         # We make sure that the moving_mean is updated as if the sample mean is
-        # calculated over all towers.
+        # calculated over all replicas.
         for i, expected_moving_mean in enumerate(expected_moving_means):
           expected_moving_means[i] -= ((
               expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
@@ -296,7 +286,9 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
                   distribution=[
                       combinations.one_device_strategy,
                       combinations.mirrored_strategy_with_gpu_and_cpu,
-                      combinations.mirrored_strategy_with_two_gpus
+                      combinations.mirrored_strategy_with_two_gpus,
+                      combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                      combinations.core_mirrored_strategy_with_two_gpus
                   ]),
               combinations.combine(
                   mode=["graph"], use_callable_loss=[True, False]) +
@@ -332,14 +324,12 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
         return dataset_ops.Dataset.zip((features, labels)).repeat()
 
-      def step_fn(ctx, x, y):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_tower(
-                model_fn, x, y, run_concurrently=False))
+            distribution.call_for_each_replica(model_fn, args=inputs))
 
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_one_shot_iterator()
+      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
       def run_step():
         return distribution.run_steps_on_dataset(
@@ -354,7 +344,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       run_step()
 
       v = all_vars[0]
-      self.assertTrue(all([v is vi for vi in all_vars[1:]]))
+      self.assertTrue(all(v is vi for vi in all_vars[1:]))
       weight = numpy.squeeze(self.evaluate(v))
       # Our model is:
       #   predict = x * w
@@ -371,10 +361,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
       # with sum loss reduction, or 10.6 with mean.
       if loss_reduction == losses_impl.Reduction.SUM:
-        # Note that the "distribution.num_towers" factor will go away once
-        # we split the input across towers, instead of pulling a complete
-        # batch of input per tower.
-        self.assertNear(weight, 2 + 21.2 * distribution.num_towers, 0.0001)
+        # Note that the "distribution.num_replicas_in_sync" factor will go away
+        # once we split the input across replicas, instead of pulling a complete
+        # batch of input per replica.
+        self.assertNear(weight, 2 + 21.2 * distribution.num_replicas_in_sync,
+                        0.0001)
       else:
         # One of the mean loss reductions.
         self.assertNear(weight, 2 + 10.6, 0.0001)
@@ -414,59 +405,58 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         train_op = optimizer.minimize(loss_fn)
         loss = loss_fn()
         output_context.set_last_step_output(
-            name="tower_loss_agg",
+            name="replica_loss_reduced",
             output=loss,
-            aggregation=variables_lib.VariableAggregation.MEAN)
+            reduce_op=reduce_util.ReduceOp.MEAN)
         output_context.set_non_tensor_output(key1, value1)
         return (train_op, loss)
 
-      def step_fn(output_context, *inputs):
-        (train_op, loss) = distribution.call_for_each_tower(
-            model_fn, output_context, *inputs, run_concurrently=False)
+      def step_fn(output_context, inputs):
+        (train_op, loss) = distribution.call_for_each_replica(
+            model_fn, args=(output_context,) + inputs)
         output_context.set_last_step_output(
-            name="cross_tower_loss_agg",
+            name="cross_replica_loss_reduced",
             output=loss,
-            aggregation=variables_lib.VariableAggregation.MEAN)
+            reduce_op=reduce_util.ReduceOp.MEAN)
         output_context.set_last_step_output(
-            name="cross_tower_loss_noagg",
+            name="cross_replica_loss_not_reduced",
             output=loss)
         return distribution.group(train_op)
 
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_one_shot_iterator()
+      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
       def run_step():
         initial_loss = lambda: constant_op.constant(1e7)
-        # Initial values corresponding to aggregated losses are just single
-        # tensors. But for non aggregated losses, we need to have initial
+        # Initial values corresponding to reduced losses are just single
+        # tensors. But for non reduced losses, we need to have initial
         # values that are of the same structure as non reduced losses. In
         # MirroredStrategy, this will be a list of losses, in TPUStrategy
         # it will be single tensor. Using `broadcast` followed by `unwrap`
         # gives us the desired initial value structure.
         initial_loop_values = {
-            "tower_loss_agg": initial_loss(),
-            "cross_tower_loss_agg": initial_loss(),
-            "cross_tower_loss_noagg":
+            "replica_loss_reduced": initial_loss(),
+            "cross_replica_loss_reduced": initial_loss(),
+            "cross_replica_loss_not_reduced":
             distribution.unwrap(distribution.broadcast(initial_loss()))
         }
         ctx = distribution.run_steps_on_dataset(
             step_fn, iterator, iterations=2,
             initial_loop_values=initial_loop_values)
 
-        self.assertEqual({key1: [value1]}, ctx.non_tensor_outputs)
+        self.assertEqual({key1: (value1,)}, ctx.non_tensor_outputs)
         self._verify_loss_output(
             initial_loss(),
-            loss_output=ctx.last_step_outputs["tower_loss_agg"],
-            aggregated=True, distribution=distribution)
+            loss_output=ctx.last_step_outputs["replica_loss_reduced"],
+            reduced=True, distribution=distribution)
         self._verify_loss_output(
             initial_loss(),
-            loss_output=ctx.last_step_outputs["cross_tower_loss_agg"],
-            aggregated=True, distribution=distribution)
+            loss_output=ctx.last_step_outputs["cross_replica_loss_reduced"],
+            reduced=True, distribution=distribution)
         self._verify_loss_output(
             initial_loss(),
-            loss_output=ctx.last_step_outputs["cross_tower_loss_noagg"],
-            aggregated=False, distribution=distribution)
-        return (ctx.run_op, ctx.last_step_outputs["tower_loss_agg"])
+            loss_output=ctx.last_step_outputs["cross_replica_loss_not_reduced"],
+            reduced=False, distribution=distribution)
+        return (ctx.run_op, ctx.last_step_outputs["replica_loss_reduced"])
 
       self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
@@ -491,18 +481,16 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       error_is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(error_is_not_increasing)
 
-  def _verify_loss_output(self, initial_loss, loss_output, aggregated,
+  def _verify_loss_output(self, initial_loss, loss_output, reduced,
                           distribution):
-    if not aggregated:
-      self.assertEqual(distribution.num_towers,
-                       len(distribution.unwrap(loss_output)))
-      loss_output = distribution.reduce(
-          aggregation=variables_lib.VariableAggregation.MEAN,
-          value=loss_output, destinations="/device:CPU:0")
-
-    unwrapped_output = distribution.unwrap(loss_output)
-    self.assertEqual(1, len(unwrapped_output))
-    loss_tensor = unwrapped_output[0]
+    if not reduced:
+      self.assertLen(distribution.unwrap(loss_output),
+                     distribution.num_replicas_in_sync)
+      loss_tensor = distribution.reduce(reduce_util.ReduceOp.MEAN, loss_output)
+    else:
+      unwrapped_output = distribution.unwrap(loss_output)
+      self.assertLen(unwrapped_output, 1)
+      loss_tensor = unwrapped_output[0]
     self.assertEqual(initial_loss.dtype, loss_tensor.dtype)
     self.assertEqual(initial_loss.shape, loss_tensor.shape)
 
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index a32424b316b003cc58ccf28fd968acb6a764a542..20f1a08d4261b931a9353738147fba7d7dff9225 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -12,300 +12,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Class MirroredStrategy implementing DistributionStrategy."""
+"""Contrib version of MirroredStrategy."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-from functools import partial
-import threading
+import functools
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import shared_variable_creator
-from tensorflow.contrib.distribute.python import values
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.distribute import multi_worker_util
-from tensorflow.python.eager import context
-from tensorflow.python.eager import tape
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import device as tf_device
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.training import coordinator
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.util import nest
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import values
 
 
-# TODO(josh11b): Replace asserts in this file with if ...: raise ...
-
-
-@contextlib.contextmanager
-def _enter_graph(g):
-  if context.executing_eagerly():
-    with g.as_default(), context.eager_mode():
-      yield
-  else:
-    with g.as_default():
-      yield
-
-
-def _cpu_device(device):
-  cpu_device = tf_device.DeviceSpec.from_string(device)
-  cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0))
-  return cpu_device.to_string()
-
-
-class _RequestedStop(Exception):
-  pass
-
-
-# _call_for_each_tower and _reduce_non_distributed_value are not members of
-# MirroredStrategy so that they are generally not allowed to use anything
-# specific to MirroredStrategy and thus can be shared with other distribution
-# strategies.
-
-
-# TODO(yuefengz): maybe create a common class for those who need to call this
-# _call_for_each_tower.
-def _call_for_each_tower(distribution, fn, *args, **kwargs):
-  """Run `fn` in separate threads, once per tower/worker device.
-
-  Args:
-    distribution: the DistributionStrategy object.
-    fn: function to run (will be run once per device, each in its own thread).
-    *args: positional arguments for `fn`
-    **kwargs: keyword arguments for `fn`.
-        `"run_concurrently"`: Boolean indicating whether executions of `fn`
-           can be run concurrently (under eager execution only), defaults to
-           `True`.
-
-  Returns:
-    Merged return value of `fn` across all towers.
-
-  Raises:
-    RuntimeError: If fn() calls get_tower_context().merge_call() a different
-        number of times from the available devices.
-  """
-  run_concurrently = kwargs.pop("run_concurrently", True)
-  if not context.executing_eagerly():
-    # Lots of TF library code isn't thread-safe in graph mode, and
-    # there is little to be gained by turning on multithreading when
-    # constructing a graph.
-    run_concurrently = False
-    # Needed for per-thread device, etc. contexts in graph mode.
-    ops.get_default_graph().switch_to_thread_local()
-  elif run_concurrently is None:
-    run_concurrently = True
-
-  coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,))
-
-  shared_variable_store = {}
-
-  # TODO(isaprykin): Create these threads once instead of during every run()
-  # call.
-  threads = []
-  for index, d in enumerate(distribution.worker_devices):
-    variable_creator_fn = shared_variable_creator.make_fn(
-        shared_variable_store, index)
-    t = MirroredStrategy._MirroredTowerThread(  # pylint: disable=protected-access
-        distribution, coord, d, variable_creator_fn, fn,
-        *values.select_device(d, args), **values.select_device(d, kwargs))
-    threads.append(t)
-
-  for t in threads:
-    t.start()
-
-  # When `fn` starts `should_run` event is set on _MirroredTowerThread
-  # (`MTT`) threads. The execution waits until
-  # `MTT.has_paused` is set, which indicates that either `fn` is
-  # complete or a `get_tower_context().merge_call()` is called.  If `fn` is
-  # complete, then `MTT.done` is set to True.  Otherwise, arguments
-  # of `get_tower_context().merge_call` from all paused threads are grouped
-  # and the `merge_fn` is performed.  Results of the
-  # `get_tower_context().merge_call` are then set to `MTT.merge_result`.
-  # Each such `get_tower_context().merge_call` call returns the
-  # `MTT.merge_result` for that thread when `MTT.should_run` event
-  # is reset again. Execution of `fn` resumes.
-
-  try:
-    with coord.stop_on_exception():
-      all_done = False
-      while not all_done and not coord.should_stop():
-        done = []
-        if run_concurrently:
-          for t in threads:
-            t.should_run.set()
-          for t in threads:
-            t.has_paused.wait()
-            t.has_paused.clear()
-            if coord.should_stop():
-              return None
-            done.append(t.done)
-        else:
-          for t in threads:
-            t.should_run.set()
-            t.has_paused.wait()
-            t.has_paused.clear()
-            if coord.should_stop():
-              return None
-            done.append(t.done)
-        if coord.should_stop():
-          return None
-        all_done = all(done)
-        if not all_done:
-          if any(done):
-            raise RuntimeError("Some towers made a different number of "
-                               "tower_context().merge_call() calls.")
-          # get_tower_context().merge_call() case
-          merge_args = values.regroup({t.device: t.merge_args for t in threads})
-          merge_kwargs = values.regroup(
-              {t.device: t.merge_kwargs for t in threads})
-          # We capture the name_scope of the MTT when we call merge_fn
-          # to ensure that if we have opened a name scope in the MTT,
-          # it will be respected when executing the merge function. We only
-          # capture the name_scope from the first MTT and assume it is
-          # the same for all other MTTs.
-          mtt_captured_name_scope = threads[0].captured_name_scope
-          with ops.name_scope(mtt_captured_name_scope):
-            merge_result = threads[0].merge_fn(distribution, *merge_args,
-                                               **merge_kwargs)
-          for t in threads:
-            t.merge_result = values.select_device(t.device, merge_result)
-  finally:
-    for t in threads:
-      t.should_run.set()
-    coord.join(threads)
-
-  return values.regroup({t.device: t.main_result for t in threads})
-
-
-def _reduce_non_distributed_value(distribution, aggregation, value,
-                                  destinations):
-  """Reduce a non-DistributedValue `value` to `destinations`."""
-  if isinstance(value, values.DistributedValues):
-    raise ValueError("You are passing a `DistributedValue` to "
-                     "`_reduce_non_distributed_value`, which is not allowed.")
-
-  # If the same value is present on all towers then the PerDevice value will
-  # be a single value. We also handle the case when `value` is a single value
-  # and equal to 0.
-  if value == 0:
-    return 0
-  # If the aggregation type is MEAN or ONLY_FIRST_TOWER, then this
-  # essentially means that the same value should be on all destinations.
-  if aggregation in (
-      variable_scope.VariableAggregation.MEAN,
-      variable_scope.VariableAggregation.ONLY_FIRST_TOWER):
-    return value
-
-  cross_tower_ops_lib.validate_destinations(destinations)
-  # We do not support an aggregation type of SUM if the value is the same across
-  # all towers. We call this as part of assign functions for MirroredVariables
-  # and summing up identical values across towers is not clearly defined.
-  if (len(distribution.worker_devices) != 1 or
-      not cross_tower_ops_lib.check_destinations(destinations)):
-    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
-                     "the given aggregation %s." % (value, aggregation))
-  # TODO(anjalisridhar): Moves these methods to a device utility file?
-  devices = cross_tower_ops_lib.get_devices_from(destinations)
-  if len(devices) == 1:
-    with ops.device(devices[0]):
-      return array_ops.identity(value)
-  else:
-    value_updates = {}
-    for d in devices:
-      with ops.device(d):
-        value_updates[d] = array_ops.identity(value)
-    return values.Mirrored(value_updates)
-
-
-def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):  # pylint: disable=g-missing-docstring
-  # Figure out what collections this variable should be added to.
-  # We'll add the MirroredVariable to those collections instead.
-  collections = kwargs.pop("collections", None)
-  if collections is None:
-    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
-  kwargs["collections"] = []
-
-  # Get synchronization value
-  synchronization = kwargs.get("synchronization",
-                               variable_scope.VariableSynchronization.ON_WRITE)
-  if synchronization == variable_scope.VariableSynchronization.NONE:
-    raise ValueError("`NONE` variable synchronization mode is not "
-                     "supported with `Mirrored` distribution strategy. Please"
-                     " change the `synchronization` for variable: " +
-                     kwargs["name"])
-  elif synchronization == variable_scope.VariableSynchronization.ON_READ:
-    # Variables that are to be synced on read are tower local.
-    is_tower_local = True
-    kwargs["trainable"] = False
-  elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or
-        synchronization == variable_scope.VariableSynchronization.AUTO):
-    # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`.
-    is_tower_local = False
-  else:
-    raise ValueError("Invalid variable synchronization mode: " +
-                     synchronization + " for variable: " + kwargs["name"])
-
-  # Get aggregation value
-  aggregation = kwargs.pop("aggregation",
-                           variable_scope.VariableAggregation.NONE)
-  if aggregation not in (
-      variable_scope.VariableAggregation.NONE,
-      variable_scope.VariableAggregation.SUM,
-      variable_scope.VariableAggregation.MEAN,
-      variable_scope.VariableAggregation.ONLY_FIRST_TOWER
-  ):
-    raise ValueError("Invalid variable aggregation mode: " + aggregation +
-                     " for variable: " + kwargs["name"])
-
-  # Ignore user-specified caching device, not needed for mirrored variables.
-  kwargs.pop("caching_device", None)
-
-  # TODO(josh11b,apassos): It would be better if variable initialization
-  # was never recorded on the tape instead of having to do this manually
-  # here.
-  with tape.stop_recording():
-    index = real_mirrored_creator(devices, *args, **kwargs)
-
-    if is_tower_local:
-      result = values.TowerLocalVariable(index, index[devices[0]], aggregation)
-    else:
-      result = values.MirroredVariable(index, index[devices[0]], aggregation)
-
-  # Add the wrapped variable to the requested collections.
-  # The handling of eager mode and the global step matches
-  # ResourceVariable._init_from_args().
-  if not context.executing_eagerly():
-    g = ops.get_default_graph()
-    # If "trainable" is True, next_creator() will add the member variables
-    # to the TRAINABLE_VARIABLES collection, so we manually remove
-    # them and replace with the MirroredVariable. We can't set
-    # "trainable" to False for next_creator() since that causes functions
-    # like implicit_gradients to skip those variables.
-    if kwargs.get("trainable", True):
-      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
-      l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-      for v in index.values():
-        l.remove(v)
-    g.add_to_collections(collections, result)
-  elif ops.GraphKeys.GLOBAL_STEP in collections:
-    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, result)
-
-  return result
+# pylint: disable=protected-access,invalid-name
+_call_for_each_replica = mirrored_strategy._call_for_each_replica
+_reduce_non_distributed_value = mirrored_strategy._reduce_non_distributed_value
+_create_mirrored_variable = mirrored_strategy._create_mirrored_variable
+all_local_devices = mirrored_strategy.all_local_devices
+CoreMirroredStrategy = mirrored_strategy.MirroredStrategy
+CoreMirroredExtended = mirrored_strategy.MirroredExtended
+# pylint: enable=protected-access,invalid-name
 
 
 class MirroredStrategy(distribute_lib.DistributionStrategy):
   """Mirrors vars to distribute across multiple devices and machines.
 
-  This strategy uses one tower per device and sync replication for its multi-GPU
-  version.
+  *** contrib version ***
+
+  This strategy uses one replica per device and sync replication for its
+  multi-GPU version.
 
   When `cluster_spec` is given by the `configure` method., it turns into the
   mulit-worker version that works on multiple workers with in-graph replication.
@@ -329,12 +66,12 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     index. They all do similar things except for one worker checkpointing model
     variables, writing summaries, etc. in addition to its ordinary work.
 
-  The multi-worker version of this class maps one tower to one device on a
-  worker. It mirrors all model variables on all towers. For example, if you have
-  two `worker`s and each `worker` has 4 GPUs, it will create 8 copies of the
-  model variables on these 8 GPUs. Then like in MirroredStrategy, each tower
-  performs their computation with their own copy of variables unless in
-  cross-tower model where variable or tensor reduction happens.
+  The multi-worker version of this class maps one replica to one device on a
+  worker. It mirrors all model variables on all replicas. For example, if you
+  have two `worker`s and each `worker` has 4 GPUs, it will create 8 copies of
+  the model variables on these 8 GPUs. Then like in MirroredStrategy, each
+  replica performs their computation with their own copy of variables unless in
+  cross-replica model where variable or tensor reduction happens.
 
   Args:
     devices: a list of device strings.
@@ -344,489 +81,80 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     num_gpus_per_worker: number of GPUs per worker. This is the same as
       `num_gpus` and only one of `num_gpus` and `num_gpus_per_worker` can be
       specified.
-    cross_tower_ops: optional, a descedant of `CrossTowerOps`. If this is not
+    cross_device_ops: optional, a descedant of `CrossDeviceOps`. If this is not
       set, the `configure` method will try to find the best one.
-    prefetch_on_device: optional boolean to specify whether to prefetch input
-      data to devices.
     auto_shard_dataset: whether to auto-shard the dataset when there are
       multiple workers.
+    cross_tower_ops: Deprecated alias for `cross_device_ops`.
   """
 
   def __init__(self,
                devices=None,
                num_gpus=None,
                num_gpus_per_worker=None,
-               cross_tower_ops=None,
-               prefetch_on_device=None,
-               auto_shard_dataset=False):
-    super(MirroredStrategy, self).__init__()
-
-    self._cross_tower_ops = cross_tower_ops
-    self._prefetch_on_device = prefetch_on_device
-    self._auto_shard_dataset = auto_shard_dataset
-    # Rememeber num GPUs which might be needed by `configure` method.
+               cross_device_ops=None,
+               auto_shard_dataset=False,
+               cross_tower_ops=None):
+    assert not (cross_device_ops and cross_tower_ops)
     if num_gpus is not None and num_gpus_per_worker is not None:
       raise ValueError(
           "You cannot specify both `num_gpus` and `num_gpus_per_worker`.")
-    if num_gpus is not None:
-      self._num_gpus = num_gpus
-    else:
-      self._num_gpus = num_gpus_per_worker
-
-    self._initialize_local(self._num_gpus, devices)
-
-  def _initialize_local(self, num_gpus, devices):
-    """Initializes the object for local training."""
-    self._cluster_spec = None
-    # Convert `num_gpus` into `devices`, shouldn't specify both.
-    if devices is None:
-      if num_gpus is None:
-        num_gpus = context.num_gpus()
-      if num_gpus == 0:
-        devices = ["/device:CPU:0"]
-      else:
-        devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
-    elif num_gpus is not None:
-      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
-    self._num_gpus = num_gpus
-    # TODO(yuefengz): consider setting the default device.
-
-    assert devices, "Must specify at least one device."
-    assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument.")
-    # TODO(josh11b): Require at least 2 devices?
-    self._devices = [device_util.resolve(d) for d in devices]
-    self._canonical_device_set = set(self._devices)
-    self._device_index = values.PerDevice({d: i for i, d in enumerate(devices)})
-
-  def _initialize_multi_worker(self, num_gpus, cluster_spec):
-    """Initializes the object for multi-worker training."""
-    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._cluster_spec = cluster_spec
-
-    self._workers = []
-    for job in ["chief", "worker"]:
-      for task in range(len(cluster_spec.as_dict().get(job, []))):
-        self._workers.append("/job:%s/task:%d" % (job, task))
-
     if num_gpus is None:
-      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
-    if num_gpus > 0:
-      self._worker_device_map = {
-          worker: [
-              device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
-              for gpu in range(num_gpus)
-          ] for worker in self._workers
-      }
-    else:
-      self._worker_device_map = {
-          worker: [device_util.canonicalize(worker, "/device:CPU:0")]
-          for worker in self._workers
-      }
+      num_gpus = num_gpus_per_worker
+    extended = MirroredExtended(self, devices, num_gpus,
+                                cross_device_ops or cross_tower_ops,
+                                auto_shard_dataset)
+    super(MirroredStrategy, self).__init__(extended)
 
-    devices = nest.flatten(self._worker_device_map)
 
-    # Setting `_default_device` will add a device scope in the
-    # distribution.scope. We set the default device to the first worker. When
-    # users specify device under distribution.scope by
-    #   with tf.device("/cpu:0"):
-    #     ...
-    # their ops will end up on the cpu device of its first worker, e.g.
-    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
-    self._default_device = self._workers[0]
+class MirroredExtended(CoreMirroredExtended):
+  """Implementation of (contrib) MirroredStrategy."""
 
-    assert devices, "Must specify at least one device."
-    assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument.")
-    # TODO(josh11b): Require at least 2 devices?
-    self._devices = [device_util.resolve(d) for d in devices]
-    self._canonical_device_set = set(self._devices)
-    self._device_index = values.PerDevice(
-        {d: i for i, d in enumerate(devices)})
-
-  def _create_variable(self, next_creator, *args, **kwargs):
-    """Create a mirrored variable. See `DistributionStrategy.scope`."""
-    colocate_with = kwargs.pop("colocate_with", None)
-    devices = self._get_devices_from(colocate_with)
-
-    def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
-      index = {}
-      for i, d in enumerate(devices):
-        with ops.device(d):
-          if i > 0:
-            # Give replicas meaningful distinct names:
-            var0name = index[devices[0]].name.split(":")[0]
-            # We append a / to variable names created on towers with id > 0 to
-            # ensure that we ignore the name scope and instead use the given
-            # name as the absolute name of the variable.
-            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
-            # Initialize replicas with the same value:
-            if context.executing_eagerly():
-              kwargs["initial_value"] = array_ops.identity(
-                  index[devices[0]].value())
-            else:
-              def initial_value_fn(device=d):
-                with ops.device(device):
-                  return array_ops.identity(index[devices[0]].initial_value)
-              kwargs["initial_value"] = initial_value_fn
-          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-            v = next_creator(*args, **kwargs)
-          assert not isinstance(v, values.DistributedVariable)
-          index[d] = v
-      return index
-
-    return _create_mirrored_variable(devices, _real_mirrored_creator, *args,
-                                     **kwargs)
+  def __init__(self,
+               container_strategy,
+               devices=None,
+               num_gpus_per_worker=None,
+               cross_device_ops=None,
+               auto_shard_dataset=False):
+    if devices is None:
+      devices = mirrored_strategy.all_local_devices(num_gpus_per_worker)
+    elif num_gpus_per_worker is not None:
+      raise ValueError(
+          "Must only specify one of `devices` and `num_gpus_per_worker`.")
+    super(MirroredExtended, self).__init__(container_strategy, devices,
+                                           cross_device_ops)
+    self._auto_shard_dataset = auto_shard_dataset
 
-  def distribute_dataset(self, dataset_fn):
-    if self._cluster_spec:
-      return values.MultiWorkerDataset(
-          partial(self._call_dataset_fn, dataset_fn), self._worker_device_map,
-          self._prefetch_on_device, self._auto_shard_dataset)
+  def _make_dataset_iterator(self, dataset):
+    """Make iterator from dataset without splitting the batch.
+
+    This implementation is different than the one in
+    `tf.distribute.MirroredStrategy` for purposes of backward compatibility.
+    We treat the incoming dataset's batch size as per replica batch size.
+
+    Args:
+      dataset: `tf.data.Dataset` for input.
+    Returns:
+      An `InputIterator` which returns inputs for each step of the computation.
+    """
+    if self._local_mode:
+      worker = device_util.canonicalize("/device:CPU:0")
+      worker_device_pairs = [(worker, self._devices)]
     else:
-      return values.PerDeviceDataset(
-          self._call_dataset_fn(dataset_fn), self._devices,
-          self._prefetch_on_device)
-
-  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values=None):
-    if initial_loop_values is None:
-      initial_loop_values = {}
-    initial_loop_values = nest.flatten(initial_loop_values)
-
-    ctx = values.MultiStepContext()
-    def body(i, *args):
-      """A wrapper around `fn` to create the while loop body."""
-      del args
-      fn_inputs = iterator.get_next()
-      if not isinstance(fn_inputs, tuple):
-        fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, *fn_inputs)
-      for (name, output) in ctx.last_step_outputs.items():
-        # Convert all outputs to tensors, potentially from `DistributedValues`.
-        ctx.last_step_outputs[name] = self.unwrap(output)
-      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
-      with ops.control_dependencies([fn_result]):
-        return [i + 1] + flat_last_step_outputs
-
-    # We capture the control_flow_context at this point, before we run `fn`
-    # inside a while_loop. This is useful in cases where we might need to exit
-    # these contexts and get back to the outer context to do some things, for
-    # e.g. create an op which should be evaluated only once at the end of the
-    # loop on the host. One such usage is in creating metrics' value op.
-    self._outer_control_flow_context = (
-        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
-
-    cond = lambda i, *args: i < iterations
-    i = constant_op.constant(0)
-    loop_result = control_flow_ops.while_loop(
-        cond, body, [i] + initial_loop_values, name="",
-        parallel_iterations=1, back_prop=False, swap_memory=False,
-        return_same_structure=True)
-    del self._outer_control_flow_context
-
-    ctx.run_op = control_flow_ops.group(loop_result)
-
-    # Convert the last_step_outputs from a list to the original dict structure
-    # of last_step_outputs.
-    last_step_tensor_outputs = loop_result[1:]
-    last_step_tensor_outputs_dict = nest.pack_sequence_as(
-        ctx.last_step_outputs, last_step_tensor_outputs)
-
-    for (name, aggregation) in ctx._last_step_outputs_aggregations.items():  # pylint: disable=protected-access
-      output = last_step_tensor_outputs_dict[name]
-      # For outputs that have already been aggregated, wrap them in a Mirrored
-      # container, else in a PerDevice container.
-      if aggregation is variables_lib.VariableAggregation.NONE:
-        last_step_tensor_outputs_dict[name] = values.regroup(
-            {d: t for d, t in zip(self._devices, output)}, values.PerDevice)
-      else:
-        assert len(output) == 1
-        last_step_tensor_outputs_dict[name] = output[0]
-
-    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
-    return ctx
-
-  def _broadcast(self, tensor, destinations):
-    # TODO(josh11b): In eager mode, use one thread per device, or async mode.
-    return self._get_cross_tower_ops().broadcast(tensor, destinations or
-                                                 self._devices)
-
-  def _call_for_each_tower(self, fn, *args, **kwargs):
-    return _call_for_each_tower(self, fn, *args, **kwargs)
-
-  def map(self, map_over, fn, *args, **kwargs):
-    # TODO(josh11b): In eager mode, use one thread per device.
-    index = {}
-    for i, m in enumerate(map_over):
-      d = self._devices[i % len(self._devices)]
-      with ops.device(d):
-        l = index.get(d, [])
-        l.append(fn(m,
-                    *values.select_device_mirrored(d, args),
-                    **values.select_device_mirrored(d, kwargs)))
-        index[d] = l
-    # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
-    # in addition to PerDevice data.
-    return values.PerDevice({k: values.MapOutput(v) for k, v in index.items()})
-
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
-    del task_type, task_id
-
-    if session_config:
-      session_config.isolate_session_state = True
-
-    if cluster_spec:
-      self._initialize_multi_worker(self._num_gpus, cluster_spec)
-
-    if self._cross_tower_ops is None:
-      if self._cluster_spec:
-        # It currently cannot detect the toplogy of remote workers. So we
-        # hard-code the multi-worker all-reduce algorithm for now.
-        if len(self._workers) == 1:
-          # The default is "nccl".
-          self._cross_tower_ops = cross_tower_ops_lib.AllReduceCrossTowerOps()
-        else:
-          # The default is hierarchical reduce and broadcast.
-          self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce(
-              self._workers, self._num_gpus)
-      else:
-        self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
-            self._devices, session_config=session_config)
-
-  def _get_cross_tower_ops(self):
-    if self._cross_tower_ops is None:
-      self._cross_tower_ops = (
-          cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps())
-    return self._cross_tower_ops
-
-  def _reduce(self, aggregation, value, destinations):
-    assert not isinstance(value, values.Mirrored)
-    if not isinstance(value, values.DistributedValues):
-      # This function handles reducing values that are not PerDevice or Mirrored
-      # values. For example, the same value could be present on all towers in
-      # which case `value` would be a single value or value could be 0.
-      return _reduce_non_distributed_value(self, aggregation, value,
-                                           destinations)
-    if aggregation == variable_scope.VariableAggregation.ONLY_FIRST_TOWER:
-      value = value.get(self._devices[0])
-      if isinstance(value, (int, float)):
-        return value
-      return self.broadcast(value, destinations)
-    return self._get_cross_tower_ops().reduce(
-        aggregation, value, destinations=destinations)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    if aggregation == variable_scope.VariableAggregation.ONLY_FIRST_TOWER:
-      return [self.broadcast(v.get(self._devices[0]), d)
-              for v, d in value_destination_pairs]
-    return self._get_cross_tower_ops().batch_reduce(aggregation,
-                                                    value_destination_pairs)
-
-  def _update(self, var, options, fn, *args, **kwargs):
-    # TODO(josh11b): In eager mode, use one thread per device.
-    assert isinstance(var, values.DistributedVariable)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-    updates = {}
-    for d, v in var._index.items():  # pylint: disable=protected-access
-      name = "update_%d" % self._device_index.get(d)
-      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
-        # If args and kwargs are not mirrored, the value is returned as is.
-        updates[d] = fn(v,
-                        *values.select_device_mirrored(d, args),
-                        **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, should_group)
-
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    assert isinstance(colocate_with, list)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-    # TODO(josh11b): In eager mode, use one thread per device.
-    updates = {}
-    for d in colocate_with:
-      name = "update_%d" % self._device_index.get(d)
-      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
-        updates[d] = fn(*values.select_device_mirrored(d, args),
-                        **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, should_group)
-
-  def read_var(self, tower_local_var):
-    """Read the aggregate value of a tower-local variable."""
-    if isinstance(tower_local_var, values.TowerLocalVariable):
-      return tower_local_var._get_cross_tower()  # pylint: disable=protected-access
-    assert isinstance(tower_local_var, values.Mirrored)
-    return array_ops.identity(tower_local_var.get())
-
-  def _unwrap(self, val):
-    if isinstance(val, values.DistributedValues):
-      # Return in a deterministic order.
-      if set(val.devices) == self._canonical_device_set:
-        return [val.get(device=d) for d in self._devices]
-      return [val.get(device=d) for d in sorted(val.devices)]
-    return [val]
-
-  def value_container(self, val):
-    return values.value_container(val)
-
-  @property
-  def is_single_tower(self):
-    return len(self._devices) == 1
-
-  @property
-  def num_towers(self):
-    return len(self._devices)
-
-  def _worker_device_index(self):
-    return self._device_index
-
-  @property
-  def worker_devices(self):
-    # Make a copy to prevent users from accidentally mutating our copy.
-    return list(self._devices)
-
-  @property
-  def parameter_devices(self):
-    return list(self._devices)
-
-  @property
-  def between_graph(self):
-    return False
-
-  @property
-  def should_init(self):
-    return True
-
-  @property
-  def should_checkpoint(self):
-    return True
-
-  @property
-  def should_save_summary(self):
-    return True
-
-  def non_slot_devices(self, var_list):
-    del var_list
-    return list(self._devices)
+      worker_device_pairs = self._worker_devices
+    return values.DatasetIterator(dataset, worker_device_pairs)
 
-  def _get_devices_from(self, colocate_with=None):
-    if colocate_with is None:
-      return self._devices
+  def _distribute_dataset(self, dataset_fn):
+    if self._local_mode:
+      return values.PerReplicaDataset(
+          self._call_dataset_fn(dataset_fn), self._devices)
     else:
-      return cross_tower_ops_lib.get_devices_from(colocate_with)
-
-  class _MirroredTowerThread(threading.Thread):
-    """A thread that runs() a function on a device."""
-
-    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
-                 **kwargs):
-      super(MirroredStrategy._MirroredTowerThread, self).__init__()  # pylint: disable=protected-access
-      self.coord = coord
-      self.distribution = dist
-      self.device = device
-      self.tower_id = dist.worker_devices.index(device)
-      self.variable_creator_fn = variable_creator_fn
-      # State needed to run and return the results of `fn`.
-      self.main_fn = fn
-      self.main_args = args
-      self.main_kwargs = kwargs
-      self.main_result = None
-      self.done = False
-      # State needed to run the next merge_call() (if any) requested via
-      # TowerContext.
-      self.merge_fn = None
-      self.merge_args = None
-      self.merge_kwargs = None
-      self.merge_result = None
-      self.captured_name_scope = None
-      # We use a thread.Event for the main thread to signal when this
-      # thread should start running (`should_run`), and another for
-      # this thread to transfer control back to the main thread
-      # (`has_paused`, either when it gets to a
-      # `get_tower_context().merge_call` or when `fn` returns). In
-      # either case the event starts cleared, is signaled by calling
-      # set(). The receiving thread waits for the signal by calling
-      # wait() and then immediately clearing the event using clear().
-      self.should_run = threading.Event()
-      self.has_paused = threading.Event()
-      # These fields have to do with inheriting various contexts from the
-      # parent thread:
-      # pylint: disable=protected-access
-      self.context_mode = context.context()._eager_context.mode
-      if not context.context()._context_handle:
-        context.context()._initialize_handle_and_devices()
-      self.context_device_policy = (
-          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
-              context.context()._context_handle))
-      self.graph = ops.get_default_graph()
-      self._variable_creator_stack = self.graph._variable_creator_stack[:]
-      self._captured_var_scope = variable_scope.get_variable_scope()
-      # Adding a "/" at end lets us re-enter this scope later.
-      self._name_scope = self.graph.get_name_scope()
-      if self._name_scope:
-        self._name_scope += "/"
-      if self.tower_id > 0:
-        if not self._name_scope:
-          self._name_scope = ""
-        self._name_scope += "tower_%d/" % self.tower_id
-
-    def run(self):
-      # pylint: disable=protected-access
-      self.graph._variable_creator_stack = self._variable_creator_stack
-      self.should_run.wait()
-      self.should_run.clear()
-      try:
-        if self.coord.should_stop():
-          return
-        with self.coord.stop_on_exception(), \
-            context.context()._mode(self.context_mode), \
-            context.context().device_policy(self.context_device_policy), \
-            _enter_graph(self.graph), \
-            MirroredTowerContext(self.distribution, self.tower_id), \
-            ops.device(self.device), \
-            ops.name_scope(self._name_scope), \
-            variable_scope.variable_scope(
-                self._captured_var_scope, reuse=self.tower_id > 0), \
-            variable_scope.variable_creator_scope(self.variable_creator_fn):
-          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
-          self.done = True
-      finally:
-        self.has_paused.set()
-
-
-class MirroredTowerContext(distribute_lib.TowerContext):
-  """TowerContext used in MirroredStrategy.call_for_each_tower().
-
-  Opened in `_MirroredTowerThread`, to allow the user to invoke
-  `MirroredStrategy`'s specific implementation of `merge_call()`,
-  which works by delegating the function and its arguments to
-  the main thread (the one that invoked
-  `MirroredStrategy.call_for_each_tower()`).
-  """
-
-  def _merge_call(self, fn, *args, **kwargs):
-    """Delegate to the main thread to actually perform merge_call()."""
-    t = threading.current_thread()  # a _MirroredTowerThread
-    t.merge_fn = fn
-    t.merge_args = args
-    t.merge_kwargs = kwargs
-    t.captured_name_scope = t.graph.get_name_scope()
-    # Adding a "/" at end lets us re-enter this scope later.
-    if t.captured_name_scope:
-      t.captured_name_scope += "/"
-    t.has_paused.set()
-    t.should_run.wait()
-    t.should_run.clear()
-    if t.coord.should_stop():
-      raise _RequestedStop()
-    return t.merge_result
+      return values.MultiWorkerDataset(
+          functools.partial(self._call_dataset_fn, dataset_fn),
+          self._worker_devices,
+          auto_shard=self._auto_shard_dataset)
 
+  # TODO(priyag): Delete this once all strategies use global batch size.
   @property
-  def device(self):
-    distribute_lib.require_tower_context(self)
-    return self._distribution_strategy.worker_devices[self._tower_id]
+  def _global_batch_size(self):
+    return False
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index eeac528329a239f6a8a68a72c44272566b1d83d1..36be5c83f8bafb6c934d1d7682b5227b1f71c089 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -20,269 +20,268 @@ from __future__ import print_function
 
 import sys
 
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import strategy_test_lib
-from tensorflow.contrib.distribute.python import values
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training as keras_training
+from tensorflow.python.keras.layers import core as keras_core
 from tensorflow.python.layers import core
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribution_strategy_context
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import server_lib
 
 
 GPU_TEST = "test_gpu" in sys.argv[0]
 
 
-class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase):
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.mirrored_strategy_with_two_gpus,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_two_gpus],
+    mode=["graph", "eager"]))
+class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
+                                        parameterized.TestCase):
 
-  def _get_distribution_strategy(self):
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    if GPU_TEST:
-      self.assertGreater(context.num_gpus(), 0)
-      if context.num_gpus() > 1:
-        devices = ["/device:GPU:0", "/device:GPU:1"]
-    print(self.id().split(".")[-1], "devices:", ", ".join(devices))
-    return mirrored_strategy.MirroredStrategy(devices)
+  def testMinimizeLoss(self, distribution):
+    if context.executing_eagerly():
+      self._test_minimize_loss_eager(distribution)
+    else:
+      self._test_minimize_loss_graph(distribution)
 
-  def testMinimizeLossEager(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_minimize_loss_eager(self._get_distribution_strategy())
+  def testReplicaId(self, distribution):
+    self._test_replica_id(distribution)
 
-  def testMinimizeLossGraph(self):
-    soft_placement = not GPU_TEST
-    print("testMinimizeLossGraph soft_placement:", soft_placement)
-    self._test_minimize_loss_graph(
-        self._get_distribution_strategy(), soft_placement=soft_placement)
-
-  def testMapReduce(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_map_reduce(self._get_distribution_strategy())
-
-  def testDeviceIndex(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_device_index(self._get_distribution_strategy())
-
-  def testTowerId(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_tower_id(self._get_distribution_strategy())
-
-  def testNumTowers(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self.assertEqual(2, self._get_distribution_strategy().num_towers)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCallAndMergeExceptions(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testRunRegroupError(self):
-
-    def run_fn(device_id):
+  def testNumReplicasInSync(self, distribution):
+    self.assertEqual(2, distribution.num_replicas_in_sync)
+
+  def testCallAndMergeExceptions(self, distribution):
+    self._test_call_and_merge_exceptions(distribution)
+
+  def testRunRegroupError(self, distribution):
+    def run_fn():
+      replica_id = int(self.evaluate(_replica_id()))
       # Generates a list with different lengths on different devices.
       # Will fail in _regroup() (if more than one device).
-      return list(range(device_id))
-
-    dist = self._get_distribution_strategy()
-    with dist.scope(), self.assertRaises(AssertionError):
-      dist.call_for_each_tower(run_fn, dist.worker_device_index)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testReduceToCpu(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-
-    def run_fn(device_id):
-      return device_id
-
-    dist = self._get_distribution_strategy()
-    with dist.scope():
-      result = dist.call_for_each_tower(run_fn, dist.worker_device_index)
-      reduced = dist.reduce(
-          variable_scope.VariableAggregation.SUM,
-          result,
-          destinations="/device:CPU:0")
-      unwrapped = dist.unwrap(reduced)
-      self.assertEqual(1, len(unwrapped))
-      expected = sum(range(len(dist.worker_devices)))
-      self.assertEqual(expected, self.evaluate(unwrapped[0]))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testReduceOnlyFirstTowerUpdates(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-
-    def run_fn(device_id):
-      return constant_op.constant(3 + 5 * device_id)
-
-    dist = self._get_distribution_strategy()
-    with dist.scope():
-      result = dist.call_for_each_tower(run_fn, dist.worker_device_index)
-      reduced = dist.reduce(
-          variable_scope.VariableAggregation.ONLY_FIRST_TOWER,
-          result,
-          destinations="/device:CPU:0")
-      unwrapped = dist.unwrap(reduced)
-      self.assertEqual(1, len(unwrapped))
-      self.assertEqual(3, self.evaluate(unwrapped[0]))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testReduceToMultipleDestinations(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-
-    devices = ["/device:GPU:0"]
-    if GPU_TEST:
-      self.assertGreater(context.num_gpus(), 0)
-    print(self.id().split(".")[-1], "devices:", ", ".join(devices))
-
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
-      reduced = dist.reduce(
-          variable_scope.VariableAggregation.SUM,
-          1.0,
-          destinations=["/device:CPU:0", "/device:GPU:0"])
-      unwrapped = dist.unwrap(reduced)
-      self.assertEqual(2, len(unwrapped))
-      self.assertEqual(1.0, self.evaluate(unwrapped[0]))
+      return list(range(replica_id))
+
+    with distribution.scope(), self.assertRaises(AssertionError):
+      distribution.extended.call_for_each_replica(run_fn)
+
+  def testReduceToCpu(self, distribution):
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(_replica_id)
+      reduced = distribution.reduce(reduce_util.ReduceOp.SUM, result)
+      expected = sum(range(distribution.num_replicas_in_sync))
+      self.assertEqual(expected, self.evaluate(reduced))
+
+  def testMakeInputFnIterator(self, distribution):
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=2,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = distribution.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
+                                 expected_values)
+
+  def testGlobalStepUpdate(self, distribution):
+    self._test_global_step_update(distribution)
+
+
+def one_device_combinations():
+  return combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_one_cpu,
+          combinations.mirrored_strategy_with_one_gpu,
+          combinations.core_mirrored_strategy_with_one_cpu,
+          combinations.core_mirrored_strategy_with_one_gpu],
+      mode=["graph", "eager"])
+
+
+class MirroredOneDeviceDistributionTest(
+    strategy_test_lib.DistributionTestBase,
+    parameterized.TestCase):
+
+  @combinations.generate(one_device_combinations())
+  def testMinimizeLoss(self, distribution):
+    if context.executing_eagerly():
+      self._test_minimize_loss_eager(distribution)
+    else:
+      self._test_minimize_loss_graph(distribution)
+
+  @combinations.generate(one_device_combinations())
+  def testReplicaId(self, distribution):
+    self._test_replica_id(distribution)
+
+  @combinations.generate(one_device_combinations())
+  def testCallAndMergeExceptions(self, distribution):
+    self._test_call_and_merge_exceptions(distribution)
+
+
+class MirroredStrategyVariableCreatorStackTest(
+    test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      distribution=[combinations.mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=["graph"]))
+  def testCreatorStacksAreThreadLocal(self, distribution):
+    def model_fn():
+      replica_id_str = str(self.evaluate(_replica_id()))
+
+      def thread_creator_fn(next_creator, *args, **kwargs):
+        return next_creator(*args, **kwargs) + ":thread_" + replica_id_str
+
+      with variable_scope.variable_creator_scope(thread_creator_fn):
+        # Create a variable in this scope.
+        v = variable_scope.variable(1.0)
 
+        # This will pause the current thread, and execute the other thread.
+        ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
 
+    def main_thread_creator(next_creator, *args, **kwargs):
+      # We are not using the underlying next_creator for test purposes.
+      del next_creator, args, kwargs
+      return "main_thread"
+
+    with context.graph_mode(), \
+        distribution.scope(), \
+        variable_scope.variable_creator_scope(main_thread_creator):
+      result = distribution.extended.call_for_each_replica(model_fn)
+      result = distribution.unwrap(result)
+      expected = ("main_thread:thread_0", "main_thread:thread_1")
+      self.assertEqual(expected, result)
+
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class MirroredStrategyVariableCreationTest(test.TestCase):
 
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
+  # TODO(priyag): Modify more tests to use this helper and check more
+  # properties.
+  def _test_mv_properties(self, var, name):
+    self.assertIsInstance(var, values.MirroredVariable)
+    self.assertEqual(name, var.name)
+    for d in var.devices:
+      self.assertEqual(d, var.get(d).device)
 
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Enough GPUs not available for this test in eager mode.")
+  def testVariableInFuncGraph(self, distribution):
+    def model_fn():
+      v = variable_scope.variable(2.0, name="bar")
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
+
+    with func_graph.FuncGraph("fg").as_default(), distribution.scope():
+      v1 = variable_scope.variable(1.0, name="foo")
+      v2 = distribution.extended.call_for_each_replica(model_fn)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSingleVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._test_mv_properties(v1, "foo:0")
+    self._test_mv_properties(v2, "bar:0")
 
+  def testSingleVariable(self, distribution):
     def model_fn():
       # This variable should be created only once across the threads because of
-      # special variable_creator functions used by `dist.call_for_each_tower`.
+      # special variable_creator functions used by
+      # `distribution.extended.call_for_each_replica`.
       v = variable_scope.variable(1.0, name="foo")
-      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
-      self.assertIsInstance(result, values.MirroredVariable)
-      self.assertEquals("foo:0", result.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testUnnamedVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self._test_mv_properties(result, "foo:0")
 
+  def testUnnamedVariable(self, distribution):
     def model_fn():
       v = variable_scope.variable(1.0)
-      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
-      self.assertIsInstance(result, values.MirroredVariable)
-      # Default name of "Variable" will be used.
-      self.assertEquals("Variable:0", result.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleVariables(self):
-    self._skip_eager_if_gpus_less_than(1)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self._test_mv_properties(result, "Variable:0")
 
+  def testMultipleVariables(self, distribution):
     def model_fn():
       vs = []
       for i in range(5):
         vs.append(variable_scope.variable(1.0, name="foo" + str(i)))
-      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return vs
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       for i, v in enumerate(result):
-        self.assertIsInstance(v, values.MirroredVariable)
-        self.assertEquals("foo" + str(i) + ":0", v.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleVariablesWithSameCanonicalName(self):
-    self._skip_eager_if_gpus_less_than(1)
+        self._test_mv_properties(v, "foo" + str(i) + ":0")
 
+  def testMultipleVariablesWithSameCanonicalName(self, distribution):
     def model_fn():
       vs = []
       vs.append(variable_scope.variable(1.0, name="foo/bar"))
       vs.append(variable_scope.variable(1.0, name="foo_1/bar"))
       vs.append(variable_scope.variable(1.0, name="foo_1/bar_1"))
       vs.append(variable_scope.variable(1.0, name="foo/bar_1"))
-      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return vs
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       for v in result:
         self.assertIsInstance(v, values.MirroredVariable)
-      self.assertEquals(4, len(result))
-      self.assertEquals("foo/bar:0", result[0].name)
-      self.assertEquals("foo_1/bar:0", result[1].name)
-      self.assertEquals("foo_1/bar_1:0", result[2].name)
-      self.assertEquals("foo/bar_1:0", result[3].name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testVariableWithSameCanonicalNameAcrossThreads(self):
-    self._skip_eager_if_gpus_less_than(1)
-
-    def model_fn(device_id):
-      v = variable_scope.variable(1.0, name="foo_" + str(device_id))
-      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
-      return v
+      self.assertEqual(4, len(result))
+      self.assertEqual("foo/bar:0", result[0].name)
+      self.assertEqual("foo_1/bar:0", result[1].name)
+      self.assertEqual("foo_1/bar_1:0", result[2].name)
+      self.assertEqual("foo/bar_1:0", result[3].name)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
+  def testVariableWithSameCanonicalNameAcrossThreads(self, distribution):
+    def model_fn():
+      replica_id = self.evaluate(_replica_id())
+      v = variable_scope.variable(1.0, name="foo_" + str(replica_id))
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
 
-    with dist.scope():
-      result = dist.call_for_each_tower(
-          model_fn, dist.worker_device_index, run_concurrently=False)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       self.assertIsInstance(result, values.MirroredVariable)
       # The resulting mirrored variable will use the name from the first device.
-      self.assertEquals("foo_0:0", result.name)
+      self.assertEqual("foo_0:0", result.name)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testWithLayers(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testWithLayers(self, distribution):
     def model_fn(features):
       with variable_scope.variable_scope("common"):
         layer1 = core.Dense(1)
@@ -290,41 +289,40 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         layer2 = core.Dense(1)
         layer2(features)
         # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_tower_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         layer3 = core.Dense(1)
         layer3(features)
         return [(layer1.kernel, layer1.bias),
                 (layer2.kernel, layer2.bias),
                 (layer3.kernel, layer3.bias)]
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-    features = dist.distribute_dataset(
-        lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
-    ).make_one_shot_iterator().get_next()
+    ds = distribution.distribute_dataset(
+        lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
+    if context.executing_eagerly():
+      iterator = ds.make_one_shot_iterator()
+    else:
+      iterator = ds.make_initializable_iterator()
+      self.evaluate([iterator.initializer])
 
-    with dist.scope():
-      result = dist.call_for_each_tower(
-          model_fn, features, run_concurrently=False)
+    features = iterator.get_next()
+
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(
+          model_fn, args=(features,))
       suffixes = ["", "_1", "_2"]
       for (kernel, bias), suffix in zip(result, suffixes):
         self.assertIsInstance(kernel, values.MirroredVariable)
-        self.assertEquals("common/dense" + suffix + "/kernel:0", kernel.name)
+        self.assertEqual("common/dense" + suffix + "/kernel:0", kernel.name)
         self.assertIsInstance(bias, values.MirroredVariable)
-        self.assertEquals("common/dense" + suffix + "/bias:0", bias.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testWithVariableAndVariableScope(self):
-    self._skip_eager_if_gpus_less_than(1)
+        self.assertEqual("common/dense" + suffix + "/bias:0", bias.name)
 
+  def testWithVariableAndVariableScope(self, distribution):
     def model_fn():
       v0 = variable_scope.variable(1.0, name="var0", aggregation=None)
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.variable(1.0, name="var1")
         # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_tower_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         v2 = variable_scope.variable(
             1.0,
             name="var2",
@@ -338,37 +336,31 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       return v0, v1, v2, v3
 
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+    with distribution.scope():
       v = variable_scope.variable(1.0, name="var-main0")
-      self.assertEquals("var-main0:0", v.name)
+      self.assertEqual("var-main0:0", v.name)
 
-      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
-      self.assertEquals(4, len(result))
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertEqual(4, len(result))
       v0, v1, v2, v3 = result
       self.assertIsInstance(v0, values.MirroredVariable)
-      self.assertEquals("var0:0", v0.name)
+      self.assertEqual("var0:0", v0.name)
       self.assertIsInstance(v1, values.MirroredVariable)
-      self.assertEquals("common/var1:0", v1.name)
-      self.assertIsInstance(v2, values.TowerLocalVariable)
-      self.assertEquals("common/var2:0", v2.name)
-      self.assertEquals(variable_scope.VariableAggregation.SUM, v2.aggregation)
+      self.assertEqual("common/var1:0", v1.name)
+      self.assertIsInstance(v2, values.ReplicaLocalVariable)
+      self.assertEqual("common/var2:0", v2.name)
+      self.assertEqual(variable_scope.VariableAggregation.SUM, v2.aggregation)
       self.assertIsInstance(v3, values.MirroredVariable)
-      self.assertEquals("common/var3:0", v3.name)
-      self.assertEquals(variable_scope.VariableAggregation.MEAN, v3.aggregation)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testWithGetVariableAndVariableScope(self):
-    self._skip_eager_if_gpus_less_than(1)
+      self.assertEqual("common/var3:0", v3.name)
+      self.assertEqual(variable_scope.VariableAggregation.MEAN, v3.aggregation)
 
+  def testWithGetVariableAndVariableScope(self, distribution):
     def model_fn():
       v0 = variable_scope.get_variable("var0", [1])
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.get_variable("var1", [1])
         # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_tower_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         v2 = variable_scope.get_variable(
             "var2", [1],
             synchronization=variable_scope.VariableSynchronization.ON_READ,
@@ -380,35 +372,30 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       return v0, v1, v2, v3
 
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+    with distribution.scope():
       with variable_scope.variable_scope("main"):
         v = variable_scope.get_variable("var-main0", [1])
-        self.assertEquals("main/var-main0:0", v.name)
+        self.assertEqual("main/var-main0:0", v.name)
 
-        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
-        self.assertEquals(4, len(result))
+        result = distribution.extended.call_for_each_replica(model_fn)
+        self.assertEqual(4, len(result))
         v0, v1, v2, v3 = result
         self.assertIsInstance(v0, values.MirroredVariable)
-        self.assertEquals("main/var0:0", v0.name)
+        self.assertEqual("main/var0:0", v0.name)
         self.assertIsInstance(v1, values.MirroredVariable)
-        self.assertEquals("main/common/var1:0", v1.name)
-        self.assertIsInstance(v2, values.TowerLocalVariable)
-        self.assertEquals("main/common/var2:0", v2.name)
-        self.assertEquals(variable_scope.VariableAggregation.SUM,
-                          v2.aggregation)
+        self.assertEqual("main/common/var1:0", v1.name)
+        self.assertIsInstance(v2, values.ReplicaLocalVariable)
+        self.assertEqual("main/common/var2:0", v2.name)
+        self.assertEqual(variable_scope.VariableAggregation.SUM,
+                         v2.aggregation)
         self.assertIsInstance(v3, values.MirroredVariable)
-        self.assertEquals("main/common/var3:0", v3.name)
-        self.assertEquals(variable_scope.VariableAggregation.MEAN,
-                          v3.aggregation)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testOnlyFirstTowerUpdatesVariables(self):
-    self._skip_eager_if_gpus_less_than(1)
+        self.assertEqual("main/common/var3:0", v3.name)
+        self.assertEqual(variable_scope.VariableAggregation.MEAN,
+                         v3.aggregation)
 
+  def testOnlyFirstReplicaUpdatesVariables(self, distribution):
     def create_fn():
-      aggregation = variable_scope.VariableAggregation.ONLY_FIRST_TOWER
+      aggregation = variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
       v0 = variable_scope.variable(
           2.0,
           name="on_read",
@@ -422,71 +409,73 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       return v0, v1
 
     devices = ["/device:GPU:0", "/device:CPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
-      v0, v1 = dist.call_for_each_tower(create_fn, run_concurrently=False)
+    with distribution.scope():
+      v0, v1 = distribution.extended.call_for_each_replica(create_fn)
       self.evaluate(v0.initializer)
       self.assertEqual(2.0, self.evaluate(v0.get(devices[0])))
       self.assertEqual(2.0, self.evaluate(v0.get(devices[1])))
-      self.assertEqual(2.0, self.evaluate(dist.read_var(v0)))
+      self.assertEqual(2.0, self.evaluate(distribution.extended.read_var(v0)))
       self.evaluate(v1.initializer)
       self.assertEqual(3.0, self.evaluate(v1.get(devices[0])))
       self.assertEqual(3.0, self.evaluate(v1.get(devices[1])))
-      self.assertEqual(3.0, self.evaluate(dist.read_var(v1)))
+      self.assertEqual(3.0, self.evaluate(distribution.extended.read_var(v1)))
+
+      def replica_id_plus_one():
+        return math_ops.cast(_replica_id() + 1, dtype=dtypes.float32)
 
       # Update using the assign_add member function.
-      def update_member_fn(device_id):
-        update0 = v0.assign_add(5.0 * (device_id + 1))
-        update1 = v1.assign_add(7.0 * (device_id + 1))
+      def update_member_fn():
+        update0 = v0.assign_add(5.0 * replica_id_plus_one())
+        update1 = v1.assign_add(7.0 * replica_id_plus_one())
         return update0, update1
 
-      update0a, update1a = dist.call_for_each_tower(
-          update_member_fn, dist.worker_device_index, run_concurrently=False)
+      update0a, update1a = distribution.extended.call_for_each_replica(
+          update_member_fn)
 
       # Update "sync on read" variable.
-      self.evaluate(dist.group(update0a))
+      self.evaluate(distribution.group(update0a))
       self.assertEqual(2.0 + 5.0, self.evaluate(v0.get(devices[0])))
       # Writes are not synchronized for "sync on read" variables,
       # so device[1] can end up with a different value.
       self.assertEqual(2.0 + 2*5.0, self.evaluate(v0.get(devices[1])))
       # Always reads from device 0.
-      self.assertEqual(2.0 + 5.0, self.evaluate(dist.read_var(v0)))
+      self.assertEqual(2.0 + 5.0, self.evaluate(
+          distribution.extended.read_var(v0)))
 
       # Update "sync on write" variable.
-      self.evaluate(dist.group(update1a))
+      self.evaluate(distribution.group(update1a))
       self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[0])))
       # Writes are synchronized for v1, only the argument to assign_add on
       # device[0] is used.
       self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[1])))
-      self.assertEqual(3.0 + 7.0, self.evaluate(dist.read_var(v1)))
+      self.assertEqual(3.0 + 7.0, self.evaluate(
+          distribution.extended.read_var(v1)))
 
       # Update using state_ops.assign_add global function.
-      def update_state_ops_fn(device_id):
-        update0 = state_ops.assign_add(v0, 11.0 * (device_id + 1))
-        update1 = state_ops.assign_add(v1, 13.0 * (device_id + 1))
+      def update_state_ops_fn():
+        update0 = state_ops.assign_add(v0, 11.0 * replica_id_plus_one())
+        update1 = state_ops.assign_add(v1, 13.0 * replica_id_plus_one())
         return update0, update1
 
-      update0b, update1b = dist.call_for_each_tower(
-          update_state_ops_fn, dist.worker_device_index, run_concurrently=False)
-      self.evaluate(dist.group(update0b))
+      update0b, update1b = distribution.extended.call_for_each_replica(
+          update_state_ops_fn)
+      self.evaluate(distribution.group(update0b))
 
       # Update "sync on read" variable.
       self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(v0.get(devices[0])))
       self.assertEqual(2.0 + 2*5.0 + 2*11.0, self.evaluate(v0.get(devices[1])))
-      self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(dist.read_var(v0)))
+      self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(
+          distribution.extended.read_var(v0)))
 
       # Update "sync on write" variable.
-      self.evaluate(dist.group(update1b))
+      self.evaluate(distribution.group(update1b))
       self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[0])))
       self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[1])))
-      self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(dist.read_var(v1)))
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testNoneSynchronizationWithGetVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+      self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(
+          distribution.extended.read_var(v1)))
+
+  def testNoneSynchronizationWithGetVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "`NONE` variable synchronization mode is not "
           "supported with `Mirrored` distribution strategy. Please change "
@@ -495,12 +484,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             "v", [1],
             synchronization=variable_scope.VariableSynchronization.NONE)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testNoneSynchronizationWithVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testNoneSynchronizationWithVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "`NONE` variable synchronization mode is not "
           "supported with `Mirrored` distribution strategy. Please change "
@@ -510,23 +495,15 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             name="v",
             synchronization=variable_scope.VariableSynchronization.NONE)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testInvalidSynchronizationWithVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testInvalidSynchronizationWithVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "Invalid variable synchronization mode: Invalid for "
           "variable: v"):
         variable_scope.variable(1.0, name="v", synchronization="Invalid")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testInvalidAggregationWithGetVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testInvalidAggregationWithGetVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "Invalid variable aggregation mode: invalid for "
           "variable: v"):
@@ -535,12 +512,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             synchronization=variable_scope.VariableSynchronization.ON_WRITE,
             aggregation="invalid")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testInvalidAggregationWithVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testInvalidAggregationWithVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "Invalid variable aggregation mode: invalid for "
           "variable: v"):
@@ -550,53 +523,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             synchronization=variable_scope.VariableSynchronization.ON_WRITE,
             aggregation="invalid")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testThreeDevices(self):
-    self._skip_eager_if_gpus_less_than(2)
-
-    def model_fn():
-      v = variable_scope.variable(1.0, name="foo")
-      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
-      return v
-
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
-      self.assertIsInstance(result, values.MirroredVariable)
-      self.assertEquals("foo:0", result.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testNonMatchingVariableCreation(self):
-    self._skip_eager_if_gpus_less_than(1)
-
+  def testNonMatchingVariableCreation(self, distribution):
     def model_fn(name):
       v = variable_scope.variable(1.0, name=name)
-      distribution_strategy_context.get_tower_context().merge_call(lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
+    with distribution.scope():
       names = values.DistributedValues({
           "/device:CPU:0": "foo",
           "/device:GPU:0": "bar"
       })
       with self.assertRaises(RuntimeError):
-        _ = dist.call_for_each_tower(model_fn, names, run_concurrently=False)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testTowerLocalVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
+        _ = distribution.extended.call_for_each_replica(model_fn, args=(names,))
 
+  def testReplicaLocalVariable(self, distribution):
     all_v_sum = {}
     all_v_mean = {}
     components_sum = {}
     components_mean = {}
 
-    def model_fn(device_id):
+    def model_fn():
+      replica_id = self.evaluate(_replica_id())
       v_sum = variable_scope.variable(
           1.0,
           synchronization=variable_scope.VariableSynchronization.ON_READ,
@@ -605,29 +553,25 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
           4.0,
           synchronization=variable_scope.VariableSynchronization.ON_READ,
           aggregation=variable_scope.VariableAggregation.MEAN)
-      self.assertTrue(isinstance(v_sum, values.TowerLocalVariable))
-      self.assertTrue(isinstance(v_mean, values.TowerLocalVariable))
-      updates = [v_sum.assign_add(2.0 + device_id),
-                 v_mean.assign(6.0 * device_id)]
-      all_v_sum[device_id] = v_sum
-      all_v_mean[device_id] = v_mean
+      self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
+      self.assertTrue(isinstance(v_mean, values.ReplicaLocalVariable))
+      updates = [v_sum.assign_add(2.0 + replica_id),
+                 v_mean.assign(6.0 * replica_id)]
+      all_v_sum[replica_id] = v_sum
+      all_v_mean[replica_id] = v_mean
       c_sum = v_sum.get()
       c_mean = v_mean.get()
-      components_sum[device_id] = c_sum
-      components_mean[device_id] = c_mean
+      components_sum[replica_id] = c_sum
+      components_mean[replica_id] = c_mean
       self.assertIsNot(v_sum, c_sum)
       self.assertIsNot(v_mean, c_mean)
       return updates, v_sum, v_mean, c_sum, c_mean
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      # Create "sum" and "mean" versions of TowerLocalVariables.
+    with distribution.scope():
+      # Create "sum" and "mean" versions of ReplicaLocalVariables.
       ret_ops, ret_v_sum, ret_v_mean, regrouped_sum, regrouped_mean = (
-          dist.call_for_each_tower(
-              model_fn, dist.worker_device_index, run_concurrently=False))
-      # Should see the same wrapping instance in all towers.
+          distribution.extended.call_for_each_replica(model_fn))
+      # Should see the same wrapping instance in all replicas.
       self.assertIs(all_v_sum[0], ret_v_sum)
       self.assertIs(all_v_mean[0], ret_v_mean)
       self.assertIs(all_v_sum[0], all_v_sum[1])
@@ -641,10 +585,10 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       # Apply updates
       self.evaluate(variables.global_variables_initializer())
-      self.evaluate([y for x in ret_ops for y in dist.unwrap(x)])
+      self.evaluate([y for x in ret_ops for y in distribution.unwrap(x)])
       expected_sum = 0.0
       expected_mean = 0.0
-      for i, d in enumerate(dist.worker_devices):
+      for i, d in enumerate(distribution.extended.worker_devices):
         # Should see different values on different devices.
         v_sum_value = self.evaluate(ret_v_sum.get(d).read_value())
         v_mean_value = self.evaluate(ret_v_mean.get(d).read_value())
@@ -654,221 +598,235 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         expected = i * 6.0
         self.assertEqual(expected, v_mean_value)
         expected_mean += expected
-      expected_mean /= len(dist.worker_devices)
+      expected_mean /= len(distribution.extended.worker_devices)
 
       # Without get(device), should return the value you get by
-      # applying the reduction across all towers (whether you use
+      # applying the reduction across all replicas (whether you use
       # read_var(), get(), or nothing).
-      self.assertEqual(expected_sum, self.evaluate(dist.read_var(ret_v_sum)))
-      self.assertEqual(expected_mean, self.evaluate(dist.read_var(ret_v_mean)))
+      self.assertEqual(expected_sum, self.evaluate(
+          distribution.extended.read_var(ret_v_sum)))
+      self.assertEqual(expected_mean, self.evaluate(
+          distribution.extended.read_var(ret_v_mean)))
       self.assertEqual(expected_sum, self.evaluate(ret_v_sum.get()))
       self.assertEqual(expected_mean, self.evaluate(ret_v_mean.get()))
       self.assertEqual(expected_sum, self.evaluate(ret_v_sum))
       self.assertEqual(expected_mean, self.evaluate(ret_v_mean))
 
+  # TODO(priyag): Update this test to work in eager mode as well.
+  def testDynamicRnnVariables(self, distribution):
+    def model_fn():
+      inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]])
+      cell_fw = rnn_cell_impl.LSTMCell(300)
+      cell_bw = rnn_cell_impl.LSTMCell(300)
+      (outputs, _) = rnn.bidirectional_dynamic_rnn(
+          cell_fw,
+          cell_bw,
+          inputs,
+          dtype=dtypes.float32)
+      return outputs
+
+    with context.graph_mode(), distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      # Two variables are created by the RNN layer.
+      self.assertEqual(2, len(result))
+      for v in result:
+        self.assertIsInstance(v, values.DistributedValues)
+        _, v1 = distribution.unwrap(v)
+        self.assertStartsWith(v1._op.name, "replica_1/")
+
+  def testReplicaLocalVariableUpdate(self, distribution):
+    def model_fn():
+      v_sum = variable_scope.variable(
+          1.0,
+          synchronization=variable_scope.VariableSynchronization.ON_READ,
+          aggregation=variable_scope.VariableAggregation.SUM)
+      self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
+      return v_sum
+
+    def update(var, value):
+      return var.assign(value)
+
+    with distribution.scope():
+      ret_v_sum = distribution.extended.call_for_each_replica(model_fn)
+
+      # Initialize variables.
+      self.evaluate(variables.global_variables_initializer())
+      # Assert that the aggregated value of the replica local vars is the sum
+      # of the individual values before running the update ops.
+      self.assertEqual(1.0, self.evaluate(ret_v_sum.get(
+          distribution.extended.worker_devices[0]).read_value()))
+      self.assertEqual(2.0, self.evaluate(ret_v_sum))
+
+      # Apply updates.
+      update_ops = distribution.extended.update(
+          ret_v_sum, update, args=(5.0,), group=False)
+      self.evaluate(update_ops)
+      # Assert that the aggregated value of the replica local vars is the sum
+      # of the individual values after running the update ops.
+      self.assertEqual(5.0, self.evaluate(ret_v_sum.get(
+          distribution.extended.worker_devices[0]).read_value()))
+      self.assertEqual(10.0, self.evaluate(ret_v_sum))
+
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph"]))
+class MirroredStrategyNameScopeTest(test.TestCase):
   # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not
   # testing this in eager mode.
 
-  def testNameScope(self):
+  def testNameScope(self, distribution):
     def model_fn():
       with ops.name_scope("foo"):
         a = constant_op.constant(1.0, name="a")
-        distribution_strategy_context.get_tower_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         b = constant_op.constant(1.0, name="b")
       return a, b
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
+    with context.graph_mode(), distribution.scope():
       with ops.name_scope("main"):
-        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
-        self.assertEquals(2, len(result))
+        result = distribution.extended.call_for_each_replica(model_fn)
+        self.assertEqual(2, len(result))
         for v, name in zip(result, ["a", "b"]):
           self.assertIsInstance(v, values.DistributedValues)
-          v0, v1 = dist.unwrap(v)
-          self.assertEquals("main/foo/" + name + ":0", v0.name)
-          self.assertEquals("main/tower_1/foo/" + name + ":0", v1.name)
+          v0, v1 = distribution.unwrap(v)
+          self.assertEqual("main/foo/" + name + ":0", v0.name)
+          self.assertEqual("main/replica_1/foo/" + name + ":0", v1.name)
 
-  def testWithDefaultName(self):
+  def testWithDefaultName(self, distribution):
     def model_fn():
       with ops.name_scope(None, "foo"):
         a = constant_op.constant(1.0, name="a")
-        distribution_strategy_context.get_tower_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         b = constant_op.constant(2.0, name="b")
       return a, b
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
-      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
-      self.assertEquals(2, len(result))
+    with context.graph_mode(), distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertEqual(2, len(result))
       for v, name in zip(result, ["a", "b"]):
         self.assertIsInstance(v, values.DistributedValues)
-        v0, v1 = dist.unwrap(v)
-        self.assertEquals("foo/" + name + ":0", v0.name)
-        self.assertEquals("tower_1/foo/" + name + ":0", v1.name)
+        v0, v1 = distribution.unwrap(v)
+        self.assertEqual("foo/" + name + ":0", v0.name)
+        self.assertEqual("replica_1/foo/" + name + ":0", v1.name)
 
   # variable_scope.variable() respects name scopes when creating
   # variables. On the other hand variable_scope.get_variable() ignores name
   # scopes when creating variables. We test both methods of creating variables
   # to make sure that we have the same variable names in both cases.
-  def testNameScopeWithVariable(self):
-    def in_cross_tower(_):
+  def testNameScopeWithVariable(self, distribution):
+    def in_cross_replica(_):
       c = variable_scope.variable(1.0, name="c")
       return c
 
     def model_fn():
       b = variable_scope.variable(1.0, name="b")
       with ops.name_scope("foo"):
-        c = distribution_strategy_context.get_tower_context().merge_call(
-            in_cross_tower)
+        c = ds_context.get_replica_context().merge_call(in_cross_replica)
       return b, c
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
+    with context.graph_mode(), distribution.scope():
       with ops.name_scope("main"):
         a = variable_scope.variable(1.0, name="a")
-        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+        result = distribution.extended.call_for_each_replica(model_fn)
       result_b = result[0]
       result_c = result[1]
       self.assertIsInstance(result_b, values.DistributedValues)
       self.assertIsInstance(result_c, values.DistributedValues)
-      a0, a1 = dist.unwrap(a)
-      b0, b1 = dist.unwrap(result_b)
-      c0, c1 = dist.unwrap(result_c)
-      self.assertEquals("main/a:0", a0.name)
-      self.assertEquals("main/a/replica_1:0", a1.name)
-      self.assertEquals("main/b:0", b0.name)
-      self.assertEquals("main/b/replica_1:0", b1.name)
-      self.assertEquals("main/foo/c:0", c0.name)
-      self.assertEquals("main/foo/c/replica_1:0", c1.name)
-
-  def testNameScopeWithGetVariable(self):
-    def in_cross_tower(_):
+      a0, a1 = distribution.unwrap(a)
+      b0, b1 = distribution.unwrap(result_b)
+      c0, c1 = distribution.unwrap(result_c)
+      self.assertEqual("main/a:0", a0.name)
+      self.assertEqual("main/a/replica_1:0", a1.name)
+      self.assertEqual("main/b:0", b0.name)
+      self.assertEqual("main/b/replica_1:0", b1.name)
+      self.assertEqual("main/foo/c:0", c0.name)
+      self.assertEqual("main/foo/c/replica_1:0", c1.name)
+
+  def testNameScopeWithGetVariable(self, distribution):
+    def in_cross_replica(_):
       c = variable_scope.get_variable("c", [1])
       return c
 
     def model_fn():
       b = variable_scope.get_variable("b", [1])
       with ops.name_scope("foo"):
-        c = distribution_strategy_context.get_tower_context().merge_call(
-            in_cross_tower)
+        c = ds_context.get_replica_context().merge_call(in_cross_replica)
       return b, c
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
+    with context.graph_mode(), distribution.scope():
       with ops.name_scope("main"):
         a = variable_scope.get_variable("a", [1])
-        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+        result = distribution.extended.call_for_each_replica(model_fn)
       result_b = result[0]
       result_c = result[1]
       self.assertIsInstance(result_b, values.DistributedValues)
       self.assertIsInstance(result_c, values.DistributedValues)
-      a0, a1 = dist.unwrap(a)
-      b0, b1 = dist.unwrap(result_b)
-      c0, c1 = dist.unwrap(result_c)
-      self.assertEquals("a:0", a0.name)
-      self.assertEquals("a/replica_1:0", a1.name)
-      self.assertEquals("b:0", b0.name)
-      self.assertEquals("b/replica_1:0", b1.name)
-      self.assertEquals("c:0", c0.name)
-      self.assertEquals("c/replica_1:0", c1.name)
-
-  def testDynamicRnnVariables(self):
+      a0, a1 = distribution.unwrap(a)
+      b0, b1 = distribution.unwrap(result_b)
+      c0, c1 = distribution.unwrap(result_c)
+      self.assertEqual("a:0", a0.name)
+      self.assertEqual("a/replica_1:0", a1.name)
+      self.assertEqual("b:0", b0.name)
+      self.assertEqual("b/replica_1:0", b1.name)
+      self.assertEqual("c:0", c0.name)
+      self.assertEqual("c/replica_1:0", c1.name)
+
+
+@combinations.generate(
+    combinations.combine(
+        distribution=[
+            combinations.NamedDistribution(
+                "Mirrored3Devices",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.MirroredStrategy(
+                    ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
+                required_gpus=2),
+            combinations.NamedDistribution(
+                "CoreMirrored3Devices",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.CoreMirroredStrategy(
+                    ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
+                required_gpus=2)
+        ],
+        mode=["graph", "eager"]))
+class MirroredThreeDeviceDistributionTest(
+    strategy_test_lib.DistributionTestBase,
+    parameterized.TestCase):
+
+  def testThreeDevices(self, distribution):
     def model_fn():
-      inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]])
-      cell_fw = rnn_cell_impl.LSTMCell(300)
-      cell_bw = rnn_cell_impl.LSTMCell(300)
-      (outputs, _) = rnn.bidirectional_dynamic_rnn(
-          cell_fw,
-          cell_bw,
-          inputs,
-          dtype=dtypes.float32)
-      return outputs
-
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
-      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
-      # Two variables are created by the RNN layer.
-      self.assertEquals(2, len(result))
-      for v in result:
-        self.assertIsInstance(v, values.DistributedValues)
-        _, v1 = dist.unwrap(v)
-        self.assertStartsWith(v1.name, "tower_1/")
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testTowerLocalVariableUpdate(self):
-    with context.graph_mode():
-
-      def model_fn():
-        v_sum = variable_scope.variable(
-            1.0,
-            synchronization=variable_scope.VariableSynchronization.ON_READ,
-            aggregation=variable_scope.VariableAggregation.SUM)
-        self.assertTrue(isinstance(v_sum, values.TowerLocalVariable))
-        return v_sum
-
-      dist = mirrored_strategy.MirroredStrategy(
-          ["/device:GPU:0", "/device:GPU:1"])
-
-      def update(var, value):
-        return var.assign(value)
-
-      with dist.scope():
-        ret_v_sum = dist.call_for_each_tower(model_fn, run_concurrently=False)
-        update_ops = dist.update(ret_v_sum, update, 5.0, grouped=False)
-
-        # Initialize variables.
-        self.evaluate(variables.global_variables_initializer())
-        # Assert that the aggregated value of the tower local vars is the sum of
-        # the individual values before running the update ops.
-        self.assertEquals(1.0, self.evaluate(
-            ret_v_sum.get(dist._devices[0]).read_value()))
-        self.assertEquals(2.0, self.evaluate(ret_v_sum))
+      v = variable_scope.variable(1.0, name="foo")
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
 
-        # Apply updates.
-        self.evaluate(update_ops)
-        # Assert that the aggregated value of the tower local vars is the sum of
-        # the individual values after running the update ops.
-        self.assertEquals(5.0, self.evaluate(
-            ret_v_sum.get(dist._devices[0]).read_value()))
-        self.assertEquals(10.0, self.evaluate(ret_v_sum))
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertIsInstance(result, values.MirroredVariable)
+      self.assertEqual("foo:0", result.name)
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class MirroredVariableUpdateTest(test.TestCase):
   # The following tests check assign, assign_add and assign_sub on Mirrored
-  # variables in tower and cross tower context.
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
-
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Enough GPUs not available for this test in eager mode.")
+  # variables in replica and cross replica context.
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarTowerContextWithoutAggregationType(self):
+  def testAssignMirroredVarReplicaContextWithoutAggregationType(self,
+                                                                distribution):
     # Test that we always have an aggregation type set on the mirrored variable
-    # if we assign to it in tower mode.
-    self._skip_eager_if_gpus_less_than(1)
+    # if we assign to it in replica mode.
     def var_fn():
       v = variable_scope.variable(1.0, name="foo")
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
 
@@ -877,24 +835,20 @@ class MirroredVariableUpdateTest(test.TestCase):
 
       with self.assertRaisesRegexp(
           ValueError, "You must specify an aggregation method to update a "
-                      "MirroredVariable in Tower Context."):
-        self.evaluate(dist.unwrap(dist.call_for_each_tower(model_fn)))
+                      "MirroredVariable in Replica Context."):
+        self.evaluate(distribution.unwrap(
+            distribution.extended.call_for_each_replica(model_fn)))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarTowerContextWithSum(self):
-    # Test that we don't reduce a non-per-device value with the "sum"
+  def testAssignMirroredVarReplicaContextWithSum(self, distribution):
+    # Test that we don't reduce a non-per-replica value with the "sum"
     # aggregation type.
-    self._skip_eager_if_gpus_less_than(1)
     def var_fn():
       v = variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.SUM)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
 
@@ -903,225 +857,184 @@ class MirroredVariableUpdateTest(test.TestCase):
 
       with self.assertRaisesRegexp(
           ValueError, "A non-DistributedValues value 5.0 cannot be reduced "
-          "with the given aggregation VariableAggregation.SUM."):
-        self.evaluate(dist.unwrap(dist.call_for_each_tower(model_fn)))
+          "with the given reduce op ReduceOp.SUM."):
+        self.evaluate(distribution.unwrap(
+            distribution.extended.call_for_each_replica(model_fn)))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarCrossTowerContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignMirroredVarCrossDeviceContext(self, distribution):
     def var_fn():
       return variable_scope.variable(1.0, name="foo")
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
       mirrored_var_result = self.evaluate(mirrored_var.assign(6.0))
-      self.assertEquals(6.0, mirrored_var_result)
+      self.assertEqual(6.0, mirrored_var_result)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarTowerContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignMirroredVarReplicaContext(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         value = math_ops.cast(
-            distribution_strategy_context.get_tower_context().tower_id,
+            ds_context.get_replica_context().replica_id_in_sync_group,
             mirrored_var.dtype)
         return mirrored_var.assign(value)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_tower(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(0.5, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(0.5, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarTowerContextWithSingleValue(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignMirroredVarReplicaContextWithSingleValue(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         return mirrored_var.assign(5.0)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_tower(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignAddMirroredVarCrossTowerContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignAddMirroredVarCrossDeviceContext(self, distribution):
     def var_fn():
       return variable_scope.variable(1.0, name="foo")
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       # read_value == True
       mirrored_var_result = self.evaluate(
           mirrored_var.assign_add(6.0, read_value=True))
-      self.assertEquals(7.0, mirrored_var_result)
-      self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
-      self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEqual(7.0, mirrored_var_result)
+      self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
 
       # read_value == False
       self.evaluate(mirrored_var.assign_add(2.0, read_value=False))
-      self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
-      self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignAddMirroredVarTowerContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignAddMirroredVarReplicaContext(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         value = math_ops.cast(
-            distribution_strategy_context.get_tower_context().tower_id,
+            ds_context.get_replica_context().replica_id_in_sync_group,
             mirrored_var.dtype)
         return mirrored_var.assign_add(value)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_tower(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(1.5, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(1.5, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignAddMirroredVarTowerContextWithSingleValue(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignAddMirroredVarReplicaContextWithSingleValue(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         return mirrored_var.assign_add(5.0)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_tower(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(6.0, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(6.0, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignSubMirroredVarCrossTowerContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignSubMirroredVarCrossDeviceContext(self, distribution):
     def var_fn():
       return variable_scope.variable(5.0, name="foo")
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
       mirrored_var_result = self.evaluate(mirrored_var.assign_sub(2.0))
-      self.assertEquals(3.0, mirrored_var_result)
-      self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
-      self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEqual(3.0, mirrored_var_result)
+      self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignSubMirroredVarTowerContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignSubMirroredVarReplicaContext(self, distribution):
     def var_fn():
       return variable_scope.variable(
           5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
 
       def model_fn():
         value = math_ops.cast(
-            distribution_strategy_context.get_tower_context().tower_id,
+            ds_context.get_replica_context().replica_id_in_sync_group,
             mirrored_var.dtype)
         return mirrored_var.assign_sub(value)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_tower(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(4.5, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(4.5, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignSubMirroredVarTowerContextWithSingleValue(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignSubMirroredVarReplicaContextWithSingleValue(self, distribution):
     def var_fn():
       return variable_scope.variable(
           5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_tower(var_fn, run_concurrently=False)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
 
       def model_fn():
         return mirrored_var.assign_sub(1.0)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_tower(
-          model_fn, run_concurrently=False)))
-      self.assertEquals(4.0, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(4.0, self.evaluate(mirrored_var))
 
 
-class MirroredAndTowerLocalVariableInitializerTest(test.TestCase):
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
+class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
 
-  def testAssignMirroredVarInitializer(self):
+  def testAssignMirroredVarInitializer(self, distribution):
     # This test is not eager compatible since in eager variables are initialized
     # upon construction instead of once the initialization op is run.
     with context.graph_mode():
@@ -1129,17 +1042,14 @@ class MirroredAndTowerLocalVariableInitializerTest(test.TestCase):
         v = variable_scope.variable(1.0, name="foo")
         return v
 
-      dist = mirrored_strategy.MirroredStrategy(
-          ["/device:GPU:0", "/device:CPU:0"])
-
-      with dist.scope():
-        mirrored_var = dist.call_for_each_tower(var_fn)
+      with distribution.scope():
+        mirrored_var = distribution.extended.call_for_each_replica(var_fn)
         self.assertIsInstance(mirrored_var, values.MirroredVariable)
         self.assertFalse(self.evaluate(mirrored_var.is_initialized()))
         self.evaluate(mirrored_var.initializer)
         self.assertTrue(self.evaluate(mirrored_var.is_initialized()))
 
-  def testAssignTowerLocalVarInitializer(self):
+  def testAssignReplicaLocalVarInitializer(self, distribution):
     # This test is not eager compatible since in eager variables are initialized
     # upon construction instead of once the initialization op is run.
     with context.graph_mode():
@@ -1148,31 +1058,27 @@ class MirroredAndTowerLocalVariableInitializerTest(test.TestCase):
             1.0,
             synchronization=variable_scope.VariableSynchronization.ON_READ,
             aggregation=variable_scope.VariableAggregation.SUM)
-        self.assertTrue(isinstance(v_sum, values.TowerLocalVariable))
+        self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
         return v_sum
 
-      dist = mirrored_strategy.MirroredStrategy(
-          ["/device:GPU:0", "/device:CPU:0"])
-
-      with dist.scope():
-        tower_local_var = dist.call_for_each_tower(model_fn)
-        self.assertTrue(isinstance(tower_local_var, values.TowerLocalVariable))
-        self.assertFalse(self.evaluate(tower_local_var.is_initialized()))
-        self.evaluate(tower_local_var.initializer)
-        self.assertTrue(self.evaluate(tower_local_var.is_initialized()))
-
+      with distribution.scope():
+        replica_local_var = distribution.extended.call_for_each_replica(
+            model_fn)
+        self.assertTrue(isinstance(replica_local_var,
+                                   values.ReplicaLocalVariable))
+        self.assertFalse(self.evaluate(replica_local_var.is_initialized()))
+        self.evaluate(replica_local_var.initializer)
+        self.assertTrue(self.evaluate(replica_local_var.is_initialized()))
 
-class TowerLocalVariableAssignTest(test.TestCase):
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
 
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Not enough GPUs available for this test in eager mode.")
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
+class ReplicaLocalVariableAssignTest(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignTowerLocalVarSumAggregation(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignReplicaLocalVarSumAggregation(self, distribution):
     def model_fn():
       v_sum = variable_scope.variable(
           1.0,
@@ -1180,30 +1086,27 @@ class TowerLocalVariableAssignTest(test.TestCase):
           aggregation=variable_scope.VariableAggregation.SUM)
       return v_sum
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      tower_local_var = dist.call_for_each_tower(model_fn,
-                                                 run_concurrently=False)
-      self.assertTrue(isinstance(tower_local_var, values.TowerLocalVariable))
+    with distribution.scope():
+      replica_local_var = distribution.extended.call_for_each_replica(model_fn)
+      self.assertTrue(isinstance(replica_local_var,
+                                 values.ReplicaLocalVariable))
       self.evaluate(variables.global_variables_initializer())
-      # Each tower has a value of 1.0 assigned to it in tower context.
+      # Each replica has a value of 1.0 assigned to it in replica context.
       # When we read the value using `read_var` we should see the SUM of each of
-      # values on each of the towers.
-      self.assertEqual(2.0, self.evaluate(dist.read_var(tower_local_var)))
-      # Assigning 6.0 in cross tower context will assign a value of
-      # 6.0/num_towers to each tower.
-      tlv_ops = tower_local_var.assign(6.0)
+      # values on each of the replicas.
+      self.assertEqual(2.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
+      # Assigning 6.0 in cross replica context will assign a value of
+      # 6.0/num_replicas to each replica.
+      tlv_ops = replica_local_var.assign(6.0)
       self.evaluate(tlv_ops)
-      # On reading the tower local var we should get the assigned value back.
-      # The value on all the towers are added before being returned by
+      # On reading the replica local var we should get the assigned value back.
+      # The value on all the replicas are added before being returned by
       # `read_var`.
-      self.assertEqual(6.0, self.evaluate(dist.read_var(tower_local_var)))
+      self.assertEqual(6.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignTowerLocalVarMeanAggregation(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignReplicaLocalVarMeanAggregation(self, distribution):
     def model_fn():
       v_sum = variable_scope.variable(
           1.0,
@@ -1211,23 +1114,22 @@ class TowerLocalVariableAssignTest(test.TestCase):
           aggregation=variable_scope.VariableAggregation.MEAN)
       return v_sum
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      tower_local_var = dist.call_for_each_tower(model_fn,
-                                                 run_concurrently=False)
-      self.assertTrue(isinstance(tower_local_var, values.TowerLocalVariable))
+    with distribution.scope():
+      replica_local_var = distribution.extended.call_for_each_replica(model_fn)
+      self.assertTrue(isinstance(replica_local_var,
+                                 values.ReplicaLocalVariable))
       self.evaluate(variables.global_variables_initializer())
-      # Each tower has a value of 1.0 assigned to it in tower context.
+      # Each replica has a value of 1.0 assigned to it in replica context.
       # When we read the value using `read_var` we should see the MEAN of values
-      # on all towers which is the value assigned in tower context.
-      self.assertEqual(1.0, self.evaluate(dist.read_var(tower_local_var)))
-      tlv_ops = tower_local_var.assign(6.0)
+      # on all replicas which is the value assigned in replica context.
+      self.assertEqual(1.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
+      tlv_ops = replica_local_var.assign(6.0)
       self.evaluate(tlv_ops)
-      # On reading the tower local var we should get the MEAN of all values
+      # On reading the replica local var we should get the MEAN of all values
       # which is equal to the value assigned.
-      self.assertEqual(6.0, self.evaluate(dist.read_var(tower_local_var)))
+      self.assertEqual(6.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
 
 
 class MockModel(object):
@@ -1245,25 +1147,41 @@ class MockModel(object):
     return x
 
 
-class MirroredStrategyDefunTest(test.TestCase):
+class MiniModel(keras_training.Model):
+  """Minimal model for mnist.
+
+  Useful for testing and debugging on slow TPU simulators.
+  """
+
+  def __init__(self):
+    super(MiniModel, self).__init__(name="")
+    self.fc = keras_core.Dense(1, name="fc", kernel_initializer="ones",
+                               bias_initializer="ones")
 
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Not enough GPUs available for this test in eager mode.")
+  def call(self, inputs, training=True):
+    inputs = array_ops.ones([1, 10])
+    return self.fc(inputs)
 
-  def _call_and_check(self, model_fn, inputs, expected_result, defuns,
-                      two_variables=False):
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
+class MirroredStrategyDefunTest(test.TestCase):
+
+  def _call_and_check(self, distribution, model_fn, inputs, expected_result,
+                      defuns, two_variables=False):
     cpu_dev = device_util.canonicalize("CPU:0")
     gpu_dev = device_util.canonicalize("GPU:0")
     devices = [cpu_dev, gpu_dev]
-    dist = mirrored_strategy.MirroredStrategy(devices)
 
-    with dist.scope():
+    with distribution.scope():
       mock_model = MockModel(two_variables)
       self.evaluate(variables.global_variables_initializer())
 
-      result = dist.call_for_each_tower(model_fn, mock_model, *inputs,
-                                        run_concurrently=False)
+      result = distribution.extended.call_for_each_replica(
+          model_fn, args=[mock_model] + inputs)
       for device in devices:
         device_result = values.select_device(device, result)
         device_expected_result = values.select_device(device, expected_result)
@@ -1275,18 +1193,15 @@ class MirroredStrategyDefunTest(test.TestCase):
         # call_for_each has one trace per device. To check that the expected set
         # of variables was accessed on each trace, we first retrieve each
         # device-specific graph function.
-        per_device_graph_functions = dist.call_for_each_tower(
-            defun.get_concrete_function,
-            mock_model, *inputs, run_concurrently=False)
+        per_replica_graph_functions = (
+            distribution.extended.call_for_each_replica(
+                defun.get_concrete_function, args=[mock_model] + inputs))
         for device in devices:
-          graph_function = per_device_graph_functions.get(device=device)
+          graph_function = per_replica_graph_functions.get(device=device)
           self.assertEqual(set(mock_model.variables),
                            set(graph_function.graph.variables))
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testVariableInDefun(self):
-    self._skip_eager_if_gpus_less_than(1)
-
+  def testVariableInDefun(self, distribution):
     @function.defun
     def times_two(mock_model):
       return mock_model()
@@ -1294,12 +1209,9 @@ class MirroredStrategyDefunTest(test.TestCase):
     def model_fn(mock_model):
       return times_two(mock_model)
 
-    self._call_and_check(model_fn, [], 2.5, [times_two])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testVariableInNestedDefun(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, model_fn, [], 2.5, [times_two])
 
+  def testVariableInNestedDefun(self, distribution):
     @function.defun
     def times_two(mock_model):
       return mock_model()
@@ -1311,12 +1223,10 @@ class MirroredStrategyDefunTest(test.TestCase):
     def model_fn(mock_model):
       return two_x_plus_one(mock_model)
 
-    self._call_and_check(model_fn, [], 3.5, [times_two, two_x_plus_one])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testTwoVariablesInNestedDefun(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, model_fn, [], 3.5,
+                         [times_two, two_x_plus_one])
 
+  def testTwoVariablesInNestedDefun(self, distribution):
     @function.defun
     def fn1(mock_model):
       return mock_model()
@@ -1328,12 +1238,10 @@ class MirroredStrategyDefunTest(test.TestCase):
     def model_fn(mock_model):
       return fn2(mock_model)
 
-    self._call_and_check(model_fn, [], 5.5, [fn1, fn2], two_variables=True)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testGradientTapeOverNestedDefuns(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, model_fn, [], 5.5, [fn1, fn2],
+                         two_variables=True)
 
+  def testGradientTapeOverNestedDefuns(self, distribution):
     @function.defun
     def fn1(mock_model):
       return mock_model()
@@ -1349,38 +1257,122 @@ class MirroredStrategyDefunTest(test.TestCase):
                              [v.get() for v in mock_model.variables])
       return grads
 
-    self._call_and_check(model_fn, [], [2.0, 1.0], [fn1, fn2],
+    self._call_and_check(distribution, model_fn, [], [2.0, 1.0], [fn1, fn2],
                          two_variables=True)
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testPassPerDevice(self):
-    self._skip_eager_if_gpus_less_than(1)
-
+  def testPassPerReplica(self, distribution):
     @function.defun
     def fn1(mock_model, factor):
       return mock_model(factor)
 
-    factors = values.PerDevice({"CPU:0": 5.0, "GPU:0": 3.0})
-    expected_result = values.PerDevice({"CPU:0": 5.0 * 1.25,
-                                        "GPU:0": 3.0 * 1.25})
-    self._call_and_check(fn1, [factors], expected_result, [fn1])
+    factors = values.PerReplica({"CPU:0": 5.0, "GPU:0": 3.0})
+    expected_result = values.PerReplica({"CPU:0": 5.0 * 1.25,
+                                         "GPU:0": 3.0 * 1.25})
+    self._call_and_check(distribution, fn1, [factors], expected_result, [fn1])
+
+  def testTrain(self, distribution):
+    with distribution.scope():
+      mock_model = MiniModel()
+      mock_model.call = function.defun(mock_model.call)
+
+      def loss_fn(ctx):
+        del ctx
+        return mock_model(array_ops.ones([1, 10]))
+
+      gradients_fn = backprop.implicit_grad(loss_fn)
+      gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
+      grads_and_vars = distribution.extended.call_for_each_replica(
+          gradients_fn, args=(None,))
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.25)
+      update_ops = optimizer._distributed_apply(distribution, grads_and_vars)  # pylint: disable=protected-access
 
+      if not context.executing_eagerly():
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(update_ops)
 
+      updated_var_values = self.evaluate(mock_model.variables)
+      # All variables start at 1.0 and get two updates of 0.25.
+      self.assertAllEqual(0.5 * np.ones([10, 1]), updated_var_values[0])
+      self.assertAllEqual([0.5], updated_var_values[1])
+
+
+@combinations.generate(
+    combinations.combine(
+        distribution=[
+            combinations.NamedDistribution(
+                "Mirrored",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=
+                                                           context.num_gpus()),
+                required_gpus=1),
+            combinations.NamedDistribution(
+                "CoreMirrored",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.CoreMirroredStrategy(
+                    mirrored_strategy.all_local_devices()),
+                required_gpus=1)
+        ],
+        mode=["graph"]))
 class MultiWorkerMirroredStrategyTest(
     multi_worker_test_base.MultiWorkerTestBase,
     strategy_test_lib.DistributionTestBase):
 
-  def _get_distribution_strategy(self):
+  def _configure_distribution_strategy(self, distribution):
     cluster_spec = server_lib.ClusterSpec({
         "worker": ["/job:worker/task:0", "/job:worker/task:1"]
     })
-    strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
-    strategy.configure(cluster_spec=cluster_spec)
-    return strategy
-
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy(),
-                                   learning_rate=0.05)
+    distribution.configure(cluster_spec=cluster_spec)
+
+  def test_num_replicas_in_sync(self, distribution):
+    self._configure_distribution_strategy(distribution)
+    # We calculate the total number of gpus across the workers(2) specified in
+    # the cluster spec.
+    self.assertEqual(context.num_gpus() * 2, distribution.num_replicas_in_sync)
+
+  def testMinimizeLossGraph(self, distribution):
+    self._configure_distribution_strategy(distribution)
+    self._test_minimize_loss_graph(distribution, learning_rate=0.05)
+
+  def testDeviceScope(self, distribution):
+    """Test the device scope of multi-worker MirroredStrategy."""
+    self._configure_distribution_strategy(distribution)
+    with distribution.scope():
+      a = constant_op.constant(1.)
+      with ops.device("/cpu:0"):
+        b = constant_op.constant(1.)
+      self.assertEqual(a.device, "/job:worker/task:0")
+      self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0")
+
+  def testMakeInputFnIterator(self, distribution):
+    self._configure_distribution_strategy(distribution)
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    num_gpus = context.num_gpus()
+    num_workers = 2
+
+    expected_values = [[i+j for j in range(num_gpus)] * num_workers
+                       for i in range(0, 100, num_gpus)]
+
+    with context.graph_mode(), self.cached_session() as sess:
+      # `expected_input_pipeline_id` is None because the input_fn will be called
+      # multiple times, each with a different input_pipeline_id.
+      input_fn = self._input_fn_to_test_input_context(
+          dataset_fn,
+          expected_num_replicas_in_sync=num_workers*num_gpus,
+          expected_num_input_pipelines=num_workers,
+          expected_input_pipeline_id=None)
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      self._test_input_fn_iterator(
+          iterator, distribution.extended.worker_devices, expected_values, sess)
+
+  def testUpdateConfigProto(self, distribution):
+    distribution.configure(cluster_spec={"worker": ["fake1", "fake2"]})
+
+    config_proto = config_pb2.ConfigProto()
+    new_config = distribution.update_config_proto(config_proto)
+
+    # Verify isolate_session_state
+    self.assertTrue(new_config.isolate_session_state)
 
 
 class MultiWorkerMirroredStrategyTestWithChief(
@@ -1400,6 +1392,19 @@ class MultiWorkerMirroredStrategyTestWithChief(
     strategy.configure(cluster_spec=self._cluster_spec)
     self._test_minimize_loss_graph(strategy, learning_rate=0.05)
 
+  def testMinimizeLossGraphCoreMirroredStrategy(self):
+    strategy = mirrored_strategy.CoreMirroredStrategy(
+        mirrored_strategy.all_local_devices())
+    strategy.configure(cluster_spec=self._cluster_spec)
+    self._test_minimize_loss_graph(strategy, learning_rate=0.05)
+
+
+def _replica_id():
+  replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+  if not isinstance(replica_id, ops.Tensor):
+    replica_id = constant_op.constant(replica_id)
+  return replica_id
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
deleted file mode 100644
index 969e1269560e52736d05e6b14ce320d9bd4fcac0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for class MirroredStrategy."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import strategy_test_lib
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribution_strategy_context
-
-
-class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
-
-  def _get_distribution_strategy(self):
-    return mirrored_strategy.MirroredStrategy(["/device:CPU:0"])
-
-  def testMinimizeLossEager(self):
-    self._test_minimize_loss_eager(self._get_distribution_strategy())
-
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy())
-
-  def testMapReduce(self):
-    self._test_map_reduce(self._get_distribution_strategy())
-
-  def testDeviceIndex(self):
-    self._test_device_index(self._get_distribution_strategy())
-
-  def testTowerId(self):
-    self._test_tower_id(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCallAndMergeExceptions(self):
-    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
-
-
-class VariableCreatorStackTest(test.TestCase):
-
-  def testCreatorStacksAreThreadLocal(self):
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-
-    def model_fn(device_id):
-      assert isinstance(device_id, int)
-
-      def thread_creator_fn(next_creator, *args, **kwargs):
-        return next_creator(*args, **kwargs) + ":thread_" + str(device_id)
-
-      with variable_scope.variable_creator_scope(thread_creator_fn):
-        # Create a variable in this scope.
-        v = variable_scope.variable(1.0)
-
-        # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_tower_context().merge_call(
-            lambda _: _)
-      return v
-
-    def main_thread_creator(next_creator, *args, **kwargs):
-      # We are not using the underlying next_creator for test purposes.
-      del next_creator, args, kwargs
-      return "main_thread"
-
-    with context.graph_mode(), \
-        dist.scope(), \
-        variable_scope.variable_creator_scope(main_thread_creator):
-      result = dist.call_for_each_tower(model_fn, dist.worker_device_index)
-      result = dist.unwrap(result)
-      expected = ["main_thread:thread_0", "main_thread:thread_1"]
-      self.assertEquals(expected, result)
-
-
-class MultiWorkerMirroredStrategyTest(test.TestCase):
-
-  def testDeviceScope(self):
-    """Test the device scope of multi-worker MirroredStrategy."""
-    with context.graph_mode():
-      strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
-      strategy.configure(
-          cluster_spec={"worker": ["/job:worker/task:0", "/job:worker/task:1"]})
-      with strategy.scope():
-        a = constant_op.constant(1.)
-        with ops.device("/cpu:0"):
-          b = constant_op.constant(1.)
-        self.assertEqual(a.device, "/job:worker/task:0")
-        self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0")
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distribute/python/monitor.py b/tensorflow/contrib/distribute/python/monitor.py
index 7644acedc99361d7287a91832d76bc68cbc6ac0a..17b7ab74f63f42e1ee14a82d3bffdd1df9b25857 100644
--- a/tensorflow/contrib/distribute/python/monitor.py
+++ b/tensorflow/contrib/distribute/python/monitor.py
@@ -51,6 +51,7 @@ class Monitor(object):
     else:
       if session is None:
         raise ValueError("Should provide a `session` in Graph mode.")
+      session.run(step_callable._iterator.initializer)  # pylint: disable=protected-access
       self._run_step = session.make_callable(step_callable())
       session.run(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f13e9153ea7a951dd722c4549882c97e79b57fe
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/moving_averages_test.py
@@ -0,0 +1,165 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training.moving_averages when using a DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training import moving_averages
+
+
+all_combinations = combinations.combine(
+    distribution=[combinations.default_strategy,
+                  combinations.one_device_strategy,
+                  combinations.mirrored_strategy_with_gpu_and_cpu,
+                  combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph"])
+
+
+class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(all_combinations)
+  def testReplicaModeWithoutZeroDebias(self, distribution):
+    replica_id = [0]
+
+    def replica_fn():
+      var = variables.Variable([10.0, 11.0])
+      val = constant_op.constant([1.0 + replica_id[0], 2.0 - replica_id[0]])
+      replica_id[0] += 1
+      decay = 0.25
+      assign = moving_averages.assign_moving_average(
+          var, val, decay, zero_debias=False)
+      return var, assign
+
+    with distribution.scope(), self.cached_session() as sess:
+      var, assign = distribution.call_for_each_replica(replica_fn)
+      variables.global_variables_initializer().run()
+      self.assertAllClose([10.0, 11.0], var.eval())
+      sess.run(distribution.unwrap(assign))
+      # Mean of val across calls to replica_fn().
+      average_val = [1.0 + 0.5 * (replica_id[0] - 1),
+                     2.0 - 0.5 * (replica_id[0] - 1)]
+      val_weight = 1.0 - 0.25
+      self.assertAllClose(
+          [10.0 * 0.25 + average_val[0] * val_weight,
+           11.0 * 0.25 + average_val[1] * val_weight],
+          var.eval())
+
+  @combinations.generate(all_combinations)
+  def testReplicaMode(self, distribution):
+    replica_id = [0]
+
+    def replica_fn():
+      var = variables.Variable([0.0, 0.0])
+      val = constant_op.constant([1.0 + replica_id[0], 2.0 - replica_id[0]])
+      replica_id[0] += 1
+      decay = 0.25
+      assign = moving_averages.assign_moving_average(var, val, decay)
+      return var, assign.op
+
+    with distribution.scope(), self.cached_session() as sess:
+      var, assign_op = distribution.call_for_each_replica(replica_fn)
+      variables.global_variables_initializer().run()
+      self.assertAllClose([0.0, 0.0], var.eval())
+      sess.run(distribution.unwrap(assign_op))
+      # Mean of val across calls to replica_fn().
+      average_val = [1.0 + 0.5 * (replica_id[0] - 1),
+                     2.0 - 0.5 * (replica_id[0] - 1)]
+      self.assertAllClose(average_val, var.eval())
+
+  @combinations.generate(all_combinations)
+  def testCrossDeviceWithoutZeroDebias(self, distribution):
+    with distribution.scope(), self.cached_session() as sess:
+      var = variables.Variable([10.0, 11.0])
+      val = constant_op.constant([1.0, 2.0])
+      decay = 0.25
+      # NOTE(josh11b): We currently generate an error if val is a PerReplica
+      # value.
+      assign = moving_averages.assign_moving_average(
+          var, val, decay, zero_debias=False)
+
+      variables.global_variables_initializer().run()
+      self.assertAllClose([10.0, 11.0], var.eval())
+      sess.run(assign)
+      average_val = [1.0, 2.0]
+      val_weight = 1.0 - 0.25
+      self.assertAllClose(
+          [10.0 * 0.25 + average_val[0] * val_weight,
+           11.0 * 0.25 + average_val[1] * val_weight],
+          var.eval())
+      # Also try assign.op.
+      sess.run(assign.op)
+      orig_weight = 0.25 * 0.25
+      val_weight = 1.0 - orig_weight
+      self.assertAllClose(
+          [10.0 * orig_weight + average_val[0] * val_weight,
+           11.0 * orig_weight + average_val[1] * val_weight],
+          var.eval())
+
+  @combinations.generate(all_combinations)
+  def testCrossDevice(self, distribution):
+    with distribution.scope(), self.cached_session() as sess:
+      var = variables.Variable([0.0, 0.0])
+      val = array_ops.placeholder(dtypes.float32)
+      decay = 0.25
+      # NOTE(josh11b): We currently generate an error if val is a PerReplica
+      # value.
+      assign = moving_averages.assign_moving_average(var, val, decay)
+
+      variables.global_variables_initializer().run()
+      self.assertAllClose([0.0, 0.0], var.eval())
+      sess.run(assign, feed_dict={val: [1.0, 2.0]})
+      self.assertAllClose([1.0, 2.0], var.eval())
+
+      # Also try assign.op.
+      sess.run(assign.op, feed_dict={val: [10.0, 0.0]})
+      self.assertAllClose(
+          [(1.0 * 0.25 + 10.0) / (1.0 * 0.25 + 1.0),
+           (2.0 * 0.25 + 0.0) / (1.0 * 0.25 + 1.0)],
+          var.eval())
+
+  @combinations.generate(all_combinations)
+  def testAssignVariable(self, distribution):
+
+    def replica_fn():
+      var = variables.Variable([10.0, 11.0])
+      # Here we expect to check the case when input value are variable.
+      val = variables.Variable([1., 2.])
+      decay = 0.25
+      assign = moving_averages.assign_moving_average(
+          var, val, decay, zero_debias=False)
+      return var, assign
+
+    with distribution.scope(), self.cached_session() as sess:
+      var, assign = distribution.call_for_each_replica(replica_fn)
+      variables.global_variables_initializer().run()
+      self.assertAllClose([10.0, 11.0], var.eval())
+      sess.run(distribution.unwrap(assign))
+      self.assertAllClose(
+          [10 * 0.25 + 1. * (1 - 0.25), 11 * 0.25 + 2. * (1 - 0.25)],
+          var.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
index 9f92ba7dde5fc2798201cef2238bcc4b20b698a8..147c9b83f866fd364ea23cf7988692a7b5f61b9c 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
@@ -18,8 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import contextlib
 import copy
+import json
+import os
 import threading
 import numpy as np
 
@@ -39,7 +42,6 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 
-
 ASSIGNED_PORTS = set()
 lock = threading.Lock()
 
@@ -207,12 +209,10 @@ class MultiWorkerTestBase(test.TestCase):
     self._lock = threading.Lock()
 
   @contextlib.contextmanager
-  def test_session(self, graph=None, config=None, target=None):
+  def session(self, graph=None, config=None, target=None):
     """Create a test session with master target set to the testing cluster.
 
-    This overrides the base class' method, removes arguments that are not needed
-    by the multi-node case and creates a test session that connects to the local
-    testing cluster.
+    Creates a test session that connects to the local testing cluster.
 
     Args:
       graph: Optional graph to use during the returned session.
@@ -224,9 +224,44 @@ class MultiWorkerTestBase(test.TestCase):
       A Session object that should be used as a context manager to surround
       the graph building and execution code in a test case.
     """
-    if self.id().endswith('.test_session'):
-      self.skipTest('Not a test.')
+    config = self._create_config(config)
+
+    if target is None:
+      target = self._default_target
+    with session.Session(graph=graph, config=config, target=target) as sess:
+      yield sess
+
+  @contextlib.contextmanager
+  # TODO(b/117573461): Overwrite self.evaluate() to use this function.
+  def cached_session(self, graph=None, config=None, target=None):
+    """Create a test session with master target set to the testing cluster.
+
+    Creates a test session that connects to the local testing cluster.
+    The session is only created once per test and then reused.
+
+    Args:
+      graph: Optional graph to use during the returned session.
+      config: An optional config_pb2.ConfigProto to use to configure the
+        session.
+      target: the target of session to connect to.
+
+    Yields:
+      A Session object that should be used as a context manager to surround
+      the graph building and execution code in a test case. Note that the
+      session will live until the end of the test.
+    """
+    config = self._create_config(config)
 
+    if target is None:
+      target = self._default_target
+    if getattr(self._thread_local, 'cached_session', None) is None:
+      self._thread_local.cached_session = session.Session(
+          graph=None, config=config, target=target)
+    sess = self._thread_local.cached_session
+    with sess.graph.as_default(), sess.as_default():
+      yield sess
+
+  def _create_config(self, config):
     if config is None:
       config = config_pb2.ConfigProto(allow_soft_placement=True)
     else:
@@ -237,18 +272,7 @@ class MultiWorkerTestBase(test.TestCase):
     config.graph_options.rewrite_options.constant_folding = (
         rewriter_config_pb2.RewriterConfig.OFF)
 
-    if target is None:
-      target = self._default_target
-    if graph is None:
-      if getattr(self._thread_local, 'cached_session', None) is None:
-        self._thread_local.cached_session = session.Session(
-            graph=None, config=config, target=target)
-      sess = self._thread_local.cached_session
-      with sess.graph.as_default(), sess.as_default():
-        yield sess
-    else:
-      with session.Session(graph=graph, config=config, target=target) as sess:
-        yield sess
+    return config
 
   def _run_client(self, client_fn, task_type, task_id, num_gpus, *args,
                   **kwargs):
@@ -281,3 +305,101 @@ class MultiWorkerTestBase(test.TestCase):
     for t in threads:
       t.join()
     self.assertEqual(self._result, len(threads))
+
+
+class MockOsEnv(collections.Mapping):
+  """A class that allows per-thread TF_CONFIG."""
+
+  def __init__(self, *args):
+    self._dict = dict()
+    self._thread_local = threading.local()
+    super(MockOsEnv, self).__init__(*args)
+
+  def get(self, key, default=None):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    if key == 'TF_CONFIG':
+      return dict.get(self._thread_local.dict, key, default)
+    else:
+      return dict.get(self._dict, key, default)
+
+  def __getitem__(self, key):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    if key == 'TF_CONFIG':
+      return dict.__getitem__(self._thread_local.dict, key)
+    else:
+      return dict.__getitem__(self._dict, key)
+
+  def __setitem__(self, key, val):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    if key == 'TF_CONFIG':
+      return dict.__setitem__(self._thread_local.dict, key, val)
+    else:
+      return dict.__setitem__(self._dict, key, val)
+
+  def __iter__(self):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    for x in self._thread_local.dict.items():
+      yield x
+    for x in self._dict.items():
+      yield x
+
+  def __len__(self):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    return self._thread_local.dict.__len__() + self._dict.__len__()
+
+
+class IndependentWorkerTestBase(test.TestCase):
+  """Testing infra for independent workers."""
+
+  def setUp(self):
+    self._mock_os_env = MockOsEnv()
+    self._mock_context = test.mock.patch.object(os, 'environ',
+                                                self._mock_os_env)
+    super(IndependentWorkerTestBase, self).setUp()
+    self._mock_context.__enter__()
+
+  def tearDown(self):
+    self._mock_context.__exit__(None, None, None)
+    super(IndependentWorkerTestBase, self).tearDown()
+
+  def _task_thread(self, task_fn, tf_config, *args, **kwargs):
+    os.environ['TF_CONFIG'] = json.dumps(tf_config)
+    task_fn(*args, **kwargs)
+
+  def _run_task_in_thread(self, task_fn, cluster_spec, task_type, task_id,
+                          *args, **kwargs):
+    if task_type:
+      tf_config = {
+          'cluster': cluster_spec,
+          'task': {
+              'type': task_type,
+              'index': task_id
+          }
+      }
+    else:
+      tf_config = {
+          'cluster': cluster_spec,
+      }
+    t = threading.Thread(
+        target=self._task_thread,
+        args=(task_fn, tf_config) + args,
+        kwargs=kwargs)
+    t.start()
+    return t
+
+  def run_multiple_tasks_in_threads(self, task_fn, cluster_spec, *args,
+                                    **kwargs):
+    # The task_fn should create std_server by itself.
+    threads = {}
+    for task_type in cluster_spec.keys():
+      threads[task_type] = []
+      for task_id in range(len(cluster_spec[task_type])):
+        t = self._run_task_in_thread(task_fn, cluster_spec, task_type, task_id,
+                                     *args, **kwargs)
+        threads[task_type].append(t)
+    return threads
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index f5259190485e701c190beb49220caff743f8fdcb..fdbfba4e04358451a46b23ef250dc7c534c855a0 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -20,14 +20,14 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.contrib.distribute.python import values
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import values
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
 
@@ -40,10 +40,16 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
   # doing something that won't work with other DistributionStrategy
   # implementations?
 
-  def __init__(self, device, prefetch_on_device=None):
-    super(OneDeviceStrategy, self).__init__()
+  def __init__(self, device):
+    super(OneDeviceStrategy, self).__init__(OneDeviceExtended(self, device))
+
+
+class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of OneDeviceStrategy."""
+
+  def __init__(self, container_strategy, device):
+    super(OneDeviceExtended, self).__init__(container_strategy)
     self._device = device
-    self._prefetch_on_device = prefetch_on_device
     self._default_device = device
 
   def _create_variable(self, next_creator, *args, **kwargs):
@@ -54,25 +60,40 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     if isinstance(colocate_with, six.string_types):
       with ops.device(colocate_with):
         return next_creator(*args, **kwargs)
-    if (isinstance(colocate_with, list) and len(colocate_with) == 1 and
+    if (isinstance(colocate_with, (list, tuple)) and len(colocate_with) == 1 and
         isinstance(colocate_with[0], six.string_types)):
       with ops.device(colocate_with[0]):
         return next_creator(*args, **kwargs)
     with ops.colocate_with(colocate_with):
       return next_creator(*args, **kwargs)
 
-  def distribute_dataset(self, dataset_fn):
-    return values.PerDeviceDataset(
-        self._call_dataset_fn(dataset_fn), [self._device],
-        self._prefetch_on_device)
-
-  def _broadcast(self, tensor, destinations):
+  def _make_dataset_iterator(self, dataset):
+    """Make iterator from dataset without splitting the batch."""
+    worker = device_util.canonicalize("/device:CPU:0")
+    worker_device_pairs = [(worker, [self._device])]
+    return values.DatasetIterator(dataset, worker_device_pairs)
+
+  def _distribute_dataset(self, dataset_fn):
+    return values.PerReplicaDataset(
+        self._call_dataset_fn(dataset_fn), [self._device])
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    worker = device_util.canonicalize("/device:CPU:0")
+    worker_device_pairs = [(worker, [self._device])]
+    return values.InputFunctionIterator(
+        input_fn, worker_device_pairs,
+        [distribute_lib.InputContext()])
+
+  def _broadcast_to(self, tensor, destinations):
     del destinations
     return tensor
 
   # TODO(priyag): Deal with OutOfRange errors  once b/111349762 is fixed.
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values=None):
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values=None):
     if initial_loop_values is None:
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
@@ -84,7 +105,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
       fn_inputs = iterator.get_next()
       if not isinstance(fn_inputs, tuple):
         fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, *fn_inputs)
+      fn_result = fn(ctx, fn_inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       with ops.control_dependencies([fn_result]):
         return [i + 1] + flat_last_step_outputs
@@ -117,86 +138,82 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
     return ctx
 
-  def _call_for_each_tower(self, fn, *args, **kwargs):
-    # We don't run `fn` in multiple threads in OneDeviceStrategy.
-    kwargs.pop("run_concurrently", None)
-    with ops.device(self._device), _OneDeviceTowerContext(self):
+  def _call_for_each_replica(self, fn, args, kwargs):
+    strategy = self._container_strategy()
+    with ops.device(self._device), _OneDeviceReplicaContext(strategy):
       return fn(*args, **kwargs)
 
-  def map(self, map_over, fn, *args, **kwargs):
-    with ops.device(self._device):
-      return values.MapOutput([fn(m, *args, **kwargs) for m in map_over])
-
-  def _reduce(self, aggregation, value, destinations):
-    del destinations
-    if not isinstance(value, values.MapOutput):
-      return value
-    l = value.get()
-    assert l
-    with ops.device(self._device):
-      if aggregation == vs.VariableAggregation.SUM:
-        return math_ops.add_n(l)
-      elif aggregation == vs.VariableAggregation.MEAN:
-        return math_ops.add_n(l) / len(l)
-      else:
-        assert False
+  def _reduce_to(self, reduce_op, value, destinations):
+    del reduce_op, destinations
+    return value
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     # The implementations of _update() and _update_non_slot() are identical
     # except _update() passes `var` as the first argument to `fn()`.
-    return self._update_non_slot(var, options, fn, var, *args, **kwargs)
+    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
 
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     del colocate_with
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
     with ops.device(self._device), distribute_lib.UpdateContext(self._device):
       result = fn(*args, **kwargs)
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
 
-  def read_var(self, tower_local_var):
-    """Read the aggregate value of a tower-local variable."""
-    return array_ops.identity(tower_local_var)
+  def read_var(self, replica_local_var):
+    """Read the aggregate value of a replica-local variable."""
+    return array_ops.identity(replica_local_var)
 
   def _unwrap(self, value):
-    return [value]
+    return (value,)
 
   def value_container(self, value):
     return value
 
   @property
-  def is_single_tower(self):
-    return True
-
-  @property
-  def num_towers(self):
+  def _num_replicas_in_sync(self):
     return 1
 
   @property
   def worker_devices(self):
-    return [self._device]
+    return (self._device,)
 
   @property
   def parameter_devices(self):
-    return [self._device]
+    return (self._device,)
 
   def non_slot_devices(self, var_list):
     del var_list
-    return [self._device]
+    return (self._device,)
 
-  def _worker_device_index(self):
-    return 0
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
+
+  @property
+  def should_save_summary(self):
+    return True
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
 
 
-class _OneDeviceTowerContext(distribute_lib.TowerContext):
+class _OneDeviceReplicaContext(distribute_lib.ReplicaContext):
+  """ReplicaContext for OneDeviceStrategy."""
 
   def __init__(self, distribution_strategy):
-    distribute_lib.TowerContext.__init__(
-        self, distribution_strategy, tower_id=0)
+    distribute_lib.ReplicaContext.__init__(
+        self,
+        distribution_strategy,
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32))
 
   @property
-  def device(self):
-    return self._distribution_strategy.worker_devices[0]
+  def devices(self):
+    return self._distribution_strategy.extended.worker_devices
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
index 4fdc0f72e6745b7ef25c591157955f214e0b2c79..d46cd6f529e363f76bfa2b22339add63530cfde8 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.distribute.python import one_device_strategy
 from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 
@@ -35,19 +36,27 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
   def testMinimizeLossGraph(self):
     self._test_minimize_loss_graph(self._get_distribution_strategy())
 
-  def testMapReduce(self):
-    self._test_map_reduce(self._get_distribution_strategy())
-
-  def testDeviceIndex(self):
-    self._test_device_index(self._get_distribution_strategy())
-
-  def testTowerId(self):
-    self._test_tower_id(self._get_distribution_strategy())
+  def testReplicaId(self):
+    self._test_replica_id(self._get_distribution_strategy())
 
   @test_util.run_in_graph_and_eager_modes
   def testCallAndMergeExceptions(self):
     self._test_call_and_merge_exceptions(self._get_distribution_strategy())
 
+  @test_util.run_in_graph_and_eager_modes
+  def testMakeInputFnIterator(self):
+    d = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    expected_values = [[i] for i in range(10)]
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=1,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = d.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(
+        iterator, d.extended.worker_devices, expected_values)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index 6e9ba37a198fc8038c086d2672251adfac30fdcf..fa4705af7cb592119f56686d1f693a156f7b4b13 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -42,16 +42,20 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_one_shot_iterator()
+      ds = distribution.distribute_dataset(dataset_fn)
+      if context.executing_eagerly():
+        iterator = ds.make_one_shot_iterator()
+      else:
+        iterator = ds.make_initializable_iterator()
 
       def run_step():
         return control_flow_ops.group(distribution.unwrap(
-            distribution.call_for_each_tower(
-                model_fn, iterator.get_next(), run_concurrently=layer.built)))
+            distribution.call_for_each_replica(
+                model_fn, args=(iterator.get_next(),))))
 
       if not context.executing_eagerly():
         with self.cached_session() as sess:
+          sess.run(iterator.initializer)
           run_step = sess.make_callable(run_step())
         self.evaluate(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index 6ddd91507bf86e8b0cf710a2340fd61abcdebe71..2c7766f95fbcb7b68a53ad0052f21485c763a1db 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+import copy
+
 from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import values
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
@@ -30,8 +34,6 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import device_setter
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
 _LOCAL_CPU = "/device:CPU:0"
@@ -61,16 +63,16 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   for a particular worker. Note that each graph and worker is independent.
   This means that while each worker will synchronously compute a single gradient
   update across all GPUs, updates between workers proceed asynchronously.
-  Operations that occur only on the first tower (such as incrementing the global
-  step), will occur on the first tower *of every worker*.
+  Operations that occur only on the first replica (such as incrementing the
+  global step), will occur on the first replica *of every worker*.
 
-  It is expected to call `call_for_each_tower(fn, *args, **kwargs)` for any
-  operations which potentially can be replicated across towers (i.e. multiple
+  It is expected to call `call_for_each_replica(fn, ...)` for any
+  operations which potentially can be replicated across replicas (i.e. multiple
   GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra
   caution needs to be taken:
 
   1) Always use `tf.get_variable` instead of `tf.Variable` which is not able
-  to refer to the same variable on different towers.
+  to refer to the same variable on different replicas.
 
   2) It is generally not recommended to open a device scope under the strategy's
   scope. A device scope (i.e. calling `tf.device`) will be merged with or
@@ -94,13 +96,21 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
       ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
         not.
     """
-    super(ParameterServerStrategy, self).__init__()
+    super(ParameterServerStrategy, self).__init__(
+        ParameterServerExtended(self, num_gpus_per_worker))
+
+
+class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of ParameterServerStrategy."""
+
+  def __init__(self, container_strategy, num_gpus_per_worker):
+    super(ParameterServerExtended, self).__init__(container_strategy)
     self._num_gpus_per_worker = num_gpus_per_worker
     self._initialize_local(num_gpus_per_worker)
 
     # We typically don't need to do all-reduce in this strategy.
-    self._cross_tower_ops = (
-        cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+    self._cross_device_ops = (
+        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
             reduce_to_device=_LOCAL_CPU))
 
   def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
@@ -108,10 +118,10 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     """Initialize devices for multiple workers.
 
     It creates variable devices and compute devices. Variables and operations
-    will be assigned to them respectively. We have one compute device per tower.
-    The variable device is a device function or device string. The default
-    variable device assigns variables to parameter servers in a round-robin
-    fashion.
+    will be assigned to them respectively. We have one compute device per
+    replica. The variable device is a device function or device string. The
+    default variable device assigns variables to parameter servers in a
+    round-robin fashion.
 
     Args:
       num_gpus_per_worker: number of local GPUs or GPUs per worker.
@@ -132,17 +142,17 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     self._worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)
 
     # Define compute devices which is a list of device strings and one for each
-    # tower. When there are GPUs, replicate operations on these GPUs. Otherwise,
-    # place operations on CPU.
+    # replica. When there are GPUs, replicate operations on these GPUs.
+    # Otherwise, place operations on CPU.
     if num_gpus_per_worker > 0:
-      self._compute_devices = [
+      self._compute_devices = tuple(
           "%s/device:GPU:%d" % (self._worker_device, i)
           for i in range(num_gpus_per_worker)
-      ]
+      )
     else:
-      self._compute_devices = [self._worker_device]
+      self._compute_devices = (self._worker_device,)
 
-    self._compute_devices = list(
+    self._compute_devices = tuple(
         map(device_util.resolve, self._compute_devices))
     self._canonical_compute_device_set = set(self._compute_devices)
 
@@ -166,8 +176,8 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     # The `_parameter_devices` is needed for the `parameter_devices` property
     # and is a list of all variable devices. Here parameter devices are all
     # tasks of the "ps" job.
-    self._parameter_devices = map("/job:ps/task:{}".format,
-                                  range(num_ps_replicas))
+    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
+                                        range(num_ps_replicas)))
 
     # Add a default device so that ops without specified devices will not end up
     # on other workers.
@@ -189,28 +199,29 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
 
   def _initialize_local(self, num_gpus_per_worker):
     """Initialize internal devices for local training."""
+    self._worker_device = device_util.canonicalize("/device:CPU:0")
     # Define compute devices which is a list of device strings and one for each
-    # tower. When there are GPUs, replicate operations on these GPUs. Otherwise,
-    # place operations on CPU.
+    # replica. When there are GPUs, replicate operations on these GPUs.
+    # Otherwise, place operations on CPU.
     if num_gpus_per_worker > 0:
-      self._compute_devices = list(
+      self._compute_devices = tuple(
           map("/device:GPU:{}".format, range(num_gpus_per_worker)))
     else:
-      self._compute_devices = [_LOCAL_CPU]
+      self._compute_devices = (_LOCAL_CPU,)
 
-    self._compute_devices = list(
+    self._compute_devices = tuple(
         map(device_util.resolve, self._compute_devices))
     self._canonical_compute_device_set = set(self._compute_devices)
 
     # If there is only one GPU, put everything on that GPU. Otherwise, place
     # variables on CPU.
     if num_gpus_per_worker == 1:
-      assert len(list(self._compute_devices)) == 1
+      assert len(self._compute_devices) == 1
       self._variable_device = _LOCAL_GPU_0
-      self._parameter_devices = [_LOCAL_GPU_0]
+      self._parameter_devices = (_LOCAL_GPU_0,)
     else:
       self._variable_device = _LOCAL_CPU
-      self._parameter_devices = [_LOCAL_CPU]
+      self._parameter_devices = (_LOCAL_CPU,)
 
     self._is_chief = True
     self._cluster_spec = None
@@ -221,31 +232,68 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
         "ParameterServerStrategy with compute_devices = %r, "
         "variable_device = %r", self._compute_devices, self._variable_device)
 
-  def distribute_dataset(self, dataset_fn):
+  def _distribute_dataset(self, dataset_fn):
     """Distributes the dataset to each local GPU."""
-    return values.PerDeviceDataset(
+    return values.PerReplicaDataset(
         self._call_dataset_fn(dataset_fn), self._compute_devices, True)
 
-  def _broadcast(self, tensor, destinations):
-    if not cross_tower_ops_lib.check_destinations(destinations):
+  def _make_dataset_iterator(self, dataset):
+    worker_device_pairs = [(self._worker_device, self._compute_devices)]
+    return values.DatasetIterator(dataset, worker_device_pairs,
+                                  self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+      num_input_pipelines = multi_worker_util.worker_count(
+          self._cluster_spec, self._task_type)
+    else:
+      input_pipeline_id = 0
+      num_input_pipelines = 1
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=num_input_pipelines,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+    worker_device_pairs = [(self._worker_device, self._compute_devices)]
+    return values.InputFunctionIterator(
+        input_fn, worker_device_pairs, [input_context])
+
+  def _broadcast_to(self, tensor, destinations):
+    # This is both a fast path for Python constants, and a way to delay
+    # converting Python values to a tensor until we know what type it
+    # should be converted to. Otherwise we have trouble with:
+    #   global_step.assign_add(1)
+    # since the `1` gets broadcast as an int32 but global_step is int64.
+    if isinstance(tensor, (float, int)):
+      return tensor
+    if not cross_device_ops_lib.check_destinations(destinations):
       destinations = self._compute_devices
-    return self._cross_tower_ops.broadcast(tensor, destinations)
+    return self._cross_device_ops.broadcast(tensor, destinations)
+
+  def _allow_variable_partition(self):
+    return not context.executing_eagerly()
 
   # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
   # this creator, such as "MutableHashTable".
   def _create_variable(self, next_creator, *args, **kwargs):
-    if self.num_towers > 1:
+    if self._num_replicas_in_sync > 1:
       aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
       if aggregation not in (
           vs.VariableAggregation.NONE,
           vs.VariableAggregation.SUM,
           vs.VariableAggregation.MEAN,
-          vs.VariableAggregation.ONLY_FIRST_TOWER
+          vs.VariableAggregation.ONLY_FIRST_REPLICA
       ):
         raise ValueError("Invalid variable aggregation mode: " + aggregation +
                          " for variable: " + kwargs["name"])
 
       def var_creator(*args, **kwargs):
+        """Create an AggregatingVariable and fix up collections."""
         # Record what collections this variable should be added to.
         collections = kwargs.pop("collections", None)
         if collections is None:
@@ -287,39 +335,37 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
       with ops.device(self._variable_device):
         return var_creator(*args, **kwargs)
 
-  def _call_for_each_tower(self, fn, *args, **kwargs):
+  def _call_for_each_replica(self, fn, args, kwargs):
     # pylint: disable=protected-access
-    return mirrored_strategy._call_for_each_tower(self, fn, *args, **kwargs)
+    return mirrored_strategy._call_for_each_replica(
+        self._container_strategy(), fn, args, kwargs)
 
   def _verify_destinations_not_different_worker(self, destinations):
+    if not self._cluster_spec:
+      return
     if destinations is None:
       return
-    for d in cross_tower_ops_lib.get_devices_from(destinations):
+    for d in cross_device_ops_lib.get_devices_from(destinations):
       d_spec = tf_device.DeviceSpec.from_string(d)
       if d_spec.job == self._task_type and d_spec.task != self._task_id:
         raise ValueError(
             "Cannot reduce to another worker: %r, current worker is %r" %
             (d, self._worker_device))
 
-  def _reduce(self, aggregation, value, destinations):
+  def _reduce_to(self, reduce_op, value, destinations):
     self._verify_destinations_not_different_worker(destinations)
     if not isinstance(value, values.DistributedValues):
       # pylint: disable=protected-access
       return mirrored_strategy._reduce_non_distributed_value(
-          self, aggregation, value, destinations)
-    if aggregation == vs.VariableAggregation.ONLY_FIRST_TOWER:
-      return self.broadcast(value.get(self._compute_devices[0]), destinations)
-    return self._cross_tower_ops.reduce(
-        aggregation, value, destinations=destinations)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    if aggregation == vs.VariableAggregation.ONLY_FIRST_TOWER:
-      return [self.broadcast(v.get(self._compute_devices[0]), d)
-              for v, d in value_destination_pairs]
+          self, reduce_op, value, destinations)
+    return self._cross_device_ops.reduce(
+        reduce_op, value, destinations=destinations)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
     for _, destinations in value_destination_pairs:
       self._verify_destinations_not_different_worker(destinations)
-    return self._cross_tower_ops.batch_reduce(aggregation,
-                                              value_destination_pairs)
+    return self._cross_device_ops.batch_reduce(reduce_op,
+                                               value_destination_pairs)
 
   def _select_single_value(self, structured):
     """Select any single values in `structured`."""
@@ -333,9 +379,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
               "You cannot update variable with a Mirrored object with multiple "
               "components %r when using ParameterServerStrategy. You must "
               "specify a single value or a Mirrored with a single value." % x)
-      elif isinstance(x, values.PerDevice):
+      elif isinstance(x, values.PerReplica):
         raise ValueError(
-            "You cannot update variable with a PerDevice object %r when using "
+            "You cannot update variable with a PerReplica object %r when using "
             "ParameterServerStrategy. You must specify a single value or a "
             "Mirrored with a single value" % x)
       else:
@@ -343,30 +389,26 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
 
     return nest.map_structure(_select_fn, structured)
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     if isinstance(var, values.AggregatingVariable):
       var = var.get()
     if not isinstance(var, resource_variable_ops.ResourceVariable):
       raise ValueError(
           "You can not update `var` %r. It must be a Variable." % var)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
     with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
       result = fn(var, *self._select_single_value(args),
                   **self._select_single_value(kwargs))
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
 
   # TODO(yuefengz): does it need to call _select_single_value?
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     with ops.device(
         colocate_with.device), distribute_lib.UpdateContext(colocate_with):
       result = fn(*args, **kwargs)
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
@@ -375,22 +417,28 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     if isinstance(val, values.DistributedValues):
       # Return in a deterministic order.
       if set(val.devices) == self._canonical_compute_device_set:
-        return [val.get(device=d) for d in self._compute_devices]
-      return [val.get(device=d) for d in sorted(val.devices)]
-    return [val]
+        return tuple(val.get(device=d) for d in self._compute_devices)
+      return tuple(val.get(device=d) for d in sorted(val.devices))
+    return (val,)
 
   def value_container(self, val):
-    return values.value_container(val)
+    if (hasattr(val, "_aggregating_container") and
+        not isinstance(val, values.AggregatingVariable)):
+      wrapper = val._aggregating_container()  # pylint: disable=protected-access
+      if wrapper is not None:
+        return wrapper
+    return val
 
   def read_var(self, var):
-    # No need to distinguish between normal variables and tower-local variables.
+    # No need to distinguish between normal variables and replica-local
+    # variables.
     return array_ops.identity(var)
 
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     """Configures the strategy class.
 
     The strategy object will be re-initialized if `cluster_spec` is given but
@@ -421,44 +469,50 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
       self._initialize_multi_worker(self._num_gpus_per_worker,
                                     self._cluster_spec, task_type, task_id)
 
-    if not session_config or not self._cluster_spec:
-      return
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    if not self._cluster_spec:
+      updated_config.isolate_session_state = True
+      return updated_config
 
-    session_config.isolate_session_state = False
+    updated_config.isolate_session_state = False
 
-    assert self._cluster_spec
     assert self._task_type
     assert self._task_id is not None
 
     # The device filters prevent communication between workers.
     if self._task_type not in ["chief", "worker"]:
-      return
-    del session_config.device_filters[:]
-    session_config.device_filters.extend(
+      return updated_config
+    del updated_config.device_filters[:]
+    updated_config.device_filters.extend(
         ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
+    return updated_config
 
   @property
-  def num_towers(self):
+  def _num_replicas_in_sync(self):
     return len(self._compute_devices)
 
   @property
   def worker_devices(self):
-    # Make a copy to prevent users from accidentally mutating our copy.
-    return list(self._compute_devices)
+    return self._compute_devices
 
   @property
   def parameter_devices(self):
-    return list(self._parameter_devices)
+    return self._parameter_devices
 
   def non_slot_devices(self, var_list):
     return min(var_list, key=lambda x: x.name)
 
   @property
-  def between_graph(self):
+  def experimental_between_graph(self):
+    # TODO(yuefengz): Should this return False in the local case?
     return True
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     return self._is_chief
 
   @property
@@ -468,3 +522,8 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   @property
   def should_save_summary(self):
     return self._is_chief
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return False
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index 353d11a5831904abd43828f1d9d4abfc61aede60..83d7473666a65e438a1c0119d2a12bf54e53c8fc 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -25,22 +25,29 @@ from absl.testing import parameterized
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import parameter_server_strategy
-from tensorflow.contrib.distribute.python import values
+from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import run_config
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients
+from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import training_util
 
 CHIEF = run_config.TaskType.CHIEF
@@ -48,6 +55,13 @@ WORKER = run_config.TaskType.WORKER
 PS = run_config.TaskType.PS
 
 
+def _get_replica_id_integer():
+  replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+  if isinstance(replica_id, ops.Tensor):
+    replica_id = tensor_util.constant_value(replica_id)
+  return replica_id
+
+
 class ParameterServerStrategyTestBase(
     multi_worker_test_base.MultiWorkerTestBase):
 
@@ -80,12 +94,11 @@ class ParameterServerStrategyTestBase(
     worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id)
     d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
     with ops.Graph().as_default(), \
-         self.test_session(target=self._default_target,
-                           config=sess_config) as sess, \
+         self.cached_session(target=self._default_target,
+                             config=sess_config) as sess, \
          d.scope():
 
-      # Define a variable outside the call_for_each_tower scope. This is not
-      # recommended.
+      # Define a variable outside the call_for_each_replica scope.
       n = variable_scope.get_variable('n', initializer=10.0)
       self.assertEqual(n.device, '/job:ps/task:0')
 
@@ -93,9 +106,8 @@ class ParameterServerStrategyTestBase(
         if num_gpus == 0:
           last_part_device = 'device:CPU:0'
         else:
-          last_part_device = (
-              'device:GPU:%d' %
-              distribution_strategy_context.get_tower_context().tower_id)
+          replica_id = _get_replica_id_integer()
+          last_part_device = ('device:GPU:%d' % replica_id)
 
         a = constant_op.constant(1.0)
         b = constant_op.constant(2.0)
@@ -165,7 +177,7 @@ class ParameterServerStrategyTestBase(
         self.assertIn('/job:ps/', h.device)
         return y_add, z_add, f
 
-      y, z, f = d.call_for_each_tower(model_fn)
+      y, z, f = d.call_for_each_replica(model_fn)
       self.assertNotEqual(y, None)
       self.assertNotEqual(z, None)
       self.assertNotEqual(f, None)
@@ -177,39 +189,108 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(z_val, 43.0)
         self.assertEqual(f_val, 46.0)
 
+  def _test_device_assignment_distributed_enable_partitioner(
+      self, task_type, task_id, num_gpus):
+    d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
+    num_shards = len(d.parameter_devices)
+    partitioner = partitioned_variables.fixed_size_partitioner(num_shards)
+    with ops.Graph().as_default(), \
+         self.cached_session(target=self._default_target,
+                             config=sess_config) as sess, \
+         d.scope():
+
+      n = variable_scope.get_variable(
+          'n',
+          initializer=constant_op.constant([10.0, 20.0]),
+          aggregation=variable_scope.VariableAggregation.SUM,
+          partitioner=partitioner)
+
+      for part_id, var in enumerate(n):
+        self.assertEqual(var.device, '/job:ps/task:%d' % part_id)
+
+      def model_fn():
+        a = constant_op.constant([3.0, 5.0])
+        # The device scope is ignored for variables but not for normal ops.
+        with ops.device('/job:worker/task:0'):
+          x = variable_scope.get_variable(
+              'x',
+              initializer=constant_op.constant([10.0, 20.0]),
+              aggregation=variable_scope.VariableAggregation.SUM,
+              partitioner=partitioner)
+          x_add = x.assign_add(a, name='x_add')
+        # The variable x is on the task 1 since the device_function has been
+        # called once before the model_fn.
+        for part_id, var in enumerate(x):
+          self.assertEqual(var.device, '/job:ps/task:%d' % part_id)
+          self.assertEqual(var.device, x_add[part_id].device)
+
+        # The colocate_vars_with can override the distribution's device.
+        with d.colocate_vars_with(x_add[0]):
+          y = variable_scope.get_variable(
+              'y',
+              initializer=constant_op.constant([20.0, 10.0]),
+              aggregation=variable_scope.VariableAggregation.SUM,
+              partitioner=partitioner)
+        y_add = y.assign_add(
+            [array_ops.identity(x_add[0]),
+             array_ops.identity(x_add[1])])
+
+        for part_id, var in enumerate(y):
+          self.assertEqual(var.device, '/job:ps/task:0')
+          self.assertEqual(y_add[part_id].device, var.device)
+          self.assertEqual(var.device, x_add[0].device)
+
+        return x_add, y_add
+
+      x, y = d.call_for_each_replica(model_fn)
+
+      if context.num_gpus() >= 1:
+        variables.global_variables_initializer().run()
+        x_val, y_val = sess.run([x, y])
+        if num_gpus < 1:
+          self.assertEqual(x_val, [13.0, 25.0])
+          self.assertEqual(y_val, [33.0, 35.0])
+        else:
+          x_expect = [10.0 + 3 * num_gpus, 20.0 + 5 * num_gpus]
+          y_expect = [
+              20.0 + x_expect[0] * num_gpus, 10.0 + x_expect[1] * num_gpus
+          ]
+          self.assertEqual(x_val, x_expect)
+          self.assertEqual(y_val, y_expect)
+
   def _test_device_assignment_local(self,
                                     d,
                                     compute_device='CPU',
                                     variable_device='CPU',
                                     num_gpus=0):
     with ops.Graph().as_default(), \
-         self.test_session(target=self._default_target,
-                           config=self._sess_config) as sess, \
+         self.cached_session(target=self._default_target,
+                             config=self._sess_config) as sess, \
          d.scope():
 
       def model_fn():
         if 'CPU' in compute_device:
-          tower_compute_device = '/device:CPU:0'
+          replica_compute_device = '/device:CPU:0'
         else:
-          tower_compute_device = (
-              '/device:GPU:%d' %
-              distribution_strategy_context.get_tower_context().tower_id)
-        tower_compute_device = device_util.canonicalize(tower_compute_device)
+          replica_id = _get_replica_id_integer()
+          replica_compute_device = ('/device:GPU:%d' % replica_id)
+        replica_compute_device = device_util.canonicalize(
+            replica_compute_device)
 
         if 'CPU' in variable_device:
-          tower_variable_device = '/device:CPU:0'
+          replica_variable_device = '/device:CPU:0'
         else:
-          tower_variable_device = (
-              '/device:GPU:%d' %
-              distribution_strategy_context.get_tower_context().tower_id)
-        tower_variable_device = device_util.canonicalize(tower_variable_device)
+          replica_id = _get_replica_id_integer()
+          replica_variable_device = ('/device:GPU:%d' % replica_id)
+        replica_variable_device = device_util.canonicalize(
+            replica_variable_device)
 
         a = constant_op.constant(1.0)
         b = constant_op.constant(2.0)
         c = a + b
-        self.assertEqual(a.device, tower_compute_device)
-        self.assertEqual(b.device, tower_compute_device)
-        self.assertEqual(c.device, tower_compute_device)
+        self.assertEqual(a.device, replica_compute_device)
+        self.assertEqual(b.device, replica_compute_device)
+        self.assertEqual(c.device, replica_compute_device)
 
         # The device scope is ignored for variables but not for normal ops.
         with ops.device('/device:GPU:2'):
@@ -219,7 +300,7 @@ class ParameterServerStrategyTestBase(
           x_add = x.assign_add(c)
           e = a + c
         self.assertEqual(
-            device_util.canonicalize(x.device), tower_variable_device)
+            device_util.canonicalize(x.device), replica_variable_device)
         self.assertEqual(x_add.device, x.device)
         self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))
 
@@ -232,7 +313,7 @@ class ParameterServerStrategyTestBase(
         # non-distributed values.
         y_add = y.assign_add(array_ops.identity(x_add))
         self.assertEqual(
-            device_util.canonicalize(y.device), tower_variable_device)
+            device_util.canonicalize(y.device), replica_variable_device)
         self.assertEqual(y_add.device, y.device)
         self.assertEqual(y.device, x.device)
 
@@ -240,7 +321,7 @@ class ParameterServerStrategyTestBase(
             'z', initializer=10.0,
             aggregation=variable_scope.VariableAggregation.SUM)
         self.assertEqual(
-            device_util.canonicalize(z.device), tower_variable_device)
+            device_util.canonicalize(z.device), replica_variable_device)
 
         with ops.control_dependencies([y_add]):
           # We add an identity here to avoid complaints about summing
@@ -248,7 +329,7 @@ class ParameterServerStrategyTestBase(
           z_add = z.assign_add(array_ops.identity(y))
         with ops.control_dependencies([z_add]):
           f = z + c
-        self.assertEqual(f.device, tower_compute_device)
+        self.assertEqual(f.device, replica_compute_device)
 
         # The device scope would merge with the default worker device.
         with ops.device('/CPU:1'):
@@ -261,11 +342,13 @@ class ParameterServerStrategyTestBase(
           u = variable_scope.get_variable('u', initializer=30.0)
           h = f + 1.0
         self.assertEqual(
-            device_util.canonicalize(u.device), tower_variable_device)
-        self.assertEqual(device_util.canonicalize(x.device), h.device)
+            device_util.canonicalize(u.device), replica_variable_device)
+        self.assertEqual(
+            device_util.canonicalize(x.device),
+            device_util.canonicalize(h.device))
         return y_add, z_add, f
 
-      y, z, f = d.call_for_each_tower(model_fn)
+      y, z, f = d.call_for_each_replica(model_fn)
       self.assertNotEqual(y, None)
       self.assertNotEqual(z, None)
       self.assertNotEqual(f, None)
@@ -280,15 +363,15 @@ class ParameterServerStrategyTestBase(
   def _test_simple_increment(self, task_type, task_id, num_gpus):
     d, master_target, sess_config = self._get_test_objects(
         task_type, task_id, num_gpus)
-    if hasattr(d, '_cluster_spec') and d._cluster_spec:
-      num_workers = len(d._cluster_spec.as_dict().get(WORKER))
-      if 'chief' in d._cluster_spec.as_dict():
+    if d.extended._cluster_spec:
+      num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
+      if 'chief' in d.extended._cluster_spec.as_dict():
         num_workers += 1
     else:
       num_workers = 1
     with ops.Graph().as_default(), \
-         self.test_session(target=master_target,
-                           config=sess_config) as sess, \
+         self.cached_session(target=master_target,
+                             config=sess_config) as sess, \
          d.scope():
 
       def model_fn():
@@ -300,7 +383,7 @@ class ParameterServerStrategyTestBase(
             aggregation=variable_scope.VariableAggregation.SUM)
         z = variable_scope.get_variable(
             'z', initializer=30.0,
-            aggregation=variable_scope.VariableAggregation.ONLY_FIRST_TOWER)
+            aggregation=variable_scope.VariableAggregation.ONLY_FIRST_REPLICA)
 
         # We explicitly make a constant tensor here to avoid complaints about
         # summing non-distributed values.
@@ -312,10 +395,10 @@ class ParameterServerStrategyTestBase(
         train_op = control_flow_ops.group(x_add, y_add, z_add)
         return x, y, z, train_op
 
-      x, y, z, train_op = d.call_for_each_tower(model_fn)
+      x, y, z, train_op = d.call_for_each_replica(model_fn)
       train_op = d.group(train_op)
 
-      if context.num_gpus() < d._num_gpus_per_worker:
+      if context.num_gpus() < d.extended._num_gpus_per_worker:
         return True
 
       if task_id == 0:
@@ -340,24 +423,29 @@ class ParameterServerStrategyTestBase(
       self._finish_condition.release()
 
       x_val, y_val, z_val = sess.run([x, y, z])
-      self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_towers)
-      self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_towers)
+      self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_replicas_in_sync)
+      self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas_in_sync)
       self.assertEqual(z_val, 30.0 + 1.0 * num_workers)
-      return (x_val == 10.0 + 1.0 * num_workers * d.num_towers and
-              y_val == 20.0 + 1.0 * num_workers * d.num_towers and
+      return (x_val == 10.0 + 1.0 * num_workers * d.num_replicas_in_sync and
+              y_val == 20.0 + 1.0 * num_workers * d.num_replicas_in_sync and
               z_val == 30.0 + 1.0 * num_workers)
 
   def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
     d, master_target, sess_config = self._get_test_objects(
         task_type, task_id, num_gpus)
-    assert hasattr(d, '_cluster_spec') and d._cluster_spec
-    num_workers = len(d._cluster_spec.as_dict().get(WORKER))
-    if CHIEF in d._cluster_spec.as_dict():
-      num_workers += 1
+    if task_type:
+      # Multi-worker
+      assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec
+      num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
+      if CHIEF in d.extended._cluster_spec.as_dict():
+        num_workers += 1
+    else:
+      # local
+      num_workers = 1
 
     with ops.Graph().as_default(), \
-         self.test_session(target=master_target,
-                           config=sess_config) as sess, \
+         self.cached_session(target=master_target,
+                             config=sess_config) as sess, \
          d.scope():
       l = core.Dense(1, use_bias=False)
 
@@ -384,7 +472,7 @@ class ParameterServerStrategyTestBase(
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_tower(grad_fn, one)
+        g_v = d.call_for_each_replica(grad_fn, args=(one,))
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
@@ -393,8 +481,8 @@ class ParameterServerStrategyTestBase(
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
             # TODO(yuefengz): support non-Mirrored variable as destinations.
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.extended.reduce_to(
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
                 d.update(v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -402,10 +490,12 @@ class ParameterServerStrategyTestBase(
 
       before_out, after_out = step()
 
-      if context.num_gpus() < d._num_gpus_per_worker:
+      if context.num_gpus() < d.extended._num_gpus_per_worker:
         return True
 
-      if multi_worker_util.is_chief(d._cluster_spec, task_type, task_id):
+      if (not task_type or
+          multi_worker_util.is_chief(
+              d.extended._cluster_spec, task_type, task_id)):
         variables.global_variables_initializer().run()
 
       # Workers waiting for chief worker's initializing variables.
@@ -428,8 +518,40 @@ class ParameterServerStrategyTestBase(
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
+  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
+                              expected_values):
+    distribution, master_target, config = self._get_test_objects(
+        task_type, task_id, num_gpus)
+    devices = distribution.extended.worker_devices
+
+    with ops.Graph().as_default(), \
+         self.cached_session(config=config,
+                             target=master_target) as sess:
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        next_element = iterator.get_next()
+        sess.run([values.select_device(d, next_element) for d in devices])
+
+      # After re-initializing the iterator, should be able to iterate again.
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
 
 class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
+                                  strategy_test_lib.DistributionTestBase,
                                   parameterized.TestCase):
 
   @classmethod
@@ -438,6 +560,13 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
         num_workers=3, num_ps=2)
     cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0]
 
+  def test_num_replicas_in_sync(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    # All the devices on a given worker are in sync which in this case is the
+    # number of gpus on each worker.
+    self.assertEqual(2, distribution.num_replicas_in_sync)
+
   def testDeviceAssignmentLocalCPU(self):
     distribution = parameter_server_strategy.ParameterServerStrategy(
         num_gpus_per_worker=0)
@@ -461,6 +590,12 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
   def testDeviceAssignmentDistributed(self, num_gpus):
     self._test_device_assignment_distributed('worker', 1, num_gpus)
 
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+  def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus):
+    self._test_device_assignment_distributed_enable_partitioner(
+        'worker', 1, num_gpus)
+
   def testSimpleBetweenGraph(self):
     self._run_between_graph_clients(self._test_simple_increment,
                                     self._cluster_spec, context.num_gpus())
@@ -472,10 +607,82 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
 
   @combinations.generate(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraph(self, num_gpus):
+  def testMinimizeLossGraphDistributed(self, num_gpus):
     self._run_between_graph_clients(self._test_minimize_loss_graph,
                                     self._cluster_spec, num_gpus)
 
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+  def testMinimizeLossGraphLocal(self, num_gpus):
+    self._test_minimize_loss_graph(None, None, num_gpus)
+
+  # TODO(priyag): Refactor this and other multi worker tests.
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
+  def testMakeInputFnIteratorDistributed(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    expected_values = [[i+j for j in range(num_gpus)]
+                       for i in range(0, 100, num_gpus)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=num_gpus,
+        expected_num_input_pipelines=3,
+        expected_input_pipeline_id=1)  # because task_id = 1
+    self._test_input_fn_iterator('worker', 1, num_gpus,
+                                 input_fn, expected_values)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
+  def testMakeInputFnIteratorLocal(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    expected_values = [[i+j for j in range(num_gpus)]
+                       for i in range(0, 100, num_gpus)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=num_gpus,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)  # only one worker and pipeline for local.
+    self._test_input_fn_iterator(None, None, num_gpus,
+                                 input_fn, expected_values)
+
+  def testGlobalStepUpdate(self):
+    strategy = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=context.num_gpus())
+    self._test_global_step_update(strategy)
+
+  def testUpdateConfigProtoMultiWorker(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    distribution.configure(
+        cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
+
+    config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
+
+    new_config = distribution.update_config_proto(config_proto)
+
+    # Verify device filters.
+    self.assertEqual(['/job:worker/task:1', '/job:ps'],
+                     new_config.device_filters)
+
+    # Verify isolate_session_state
+    self.assertFalse(new_config.isolate_session_state)
+
+  def testUpdateConfigProtoLocal(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+
+    config_proto = config_pb2.ConfigProto()
+    new_config = distribution.update_config_proto(config_proto)
+
+    # Verify isolate_session_state
+    self.assertTrue(new_config.isolate_session_state)
+
 
 class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                                            parameterized.TestCase):
@@ -509,6 +716,19 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
       self.assertIs(values.AggregatingVariable, type(created_step))
       self.assertIs(values.AggregatingVariable, type(get_step))
 
+  def testValueContainer(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    with ops.Graph().as_default(), distribution.scope():
+      def f():
+        with backprop.GradientTape() as tape:
+          v = variable_scope.get_variable('v', initializer=10.0)
+          _ = v * v
+        v, = tape.watched_variables()
+        w = distribution.extended.value_container(v)
+        self.assertIs(values.AggregatingVariable, type(w))
+      distribution.extended.call_for_each_replica(f)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
deleted file mode 100644
index d48aa9c89bc894a6afc4aab8b60fabc52a06b198..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Extension of prefetching_ops to support more than one device."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import warnings
-
-from tensorflow.python.data.experimental.ops import prefetching_ops
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.util import nest as data_nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.eager import context
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
-from tensorflow.python.util import nest
-
-
-# pylint: disable=protected-access
-class _PrefetchToDeviceIterator(object):
-  """A replacement for `tf.data.Iterator` that prefetches to another device.
-
-  Args:
-    input_dataset: The input dataset.
-    one_shot: If true, we make a one shot iterator that's already initialized.
-    devices: Devices on which to prefetch.
-    buffer_size: Size of the prefetching buffer.
-    shared_name: (Optional.) If non-empty, the returned iterator will be shared
-      under the given name across multiple sessions that share the same devices
-      (e.g. when using a remote server). Only used if one_shot is False.
-
-  Returns:
-    An Iterator type object.
-  """
-
-  def __init__(self,
-               input_dataset,
-               one_shot,
-               devices,
-               buffer_size,
-               shared_name=None):
-    self._input_dataset = input_dataset
-    self._get_next_call_count = 0
-    self._one_shot = one_shot
-    if shared_name is None:
-      shared_name = ""
-    self._devices = devices
-
-    if self._one_shot:
-      self._input_iterator = input_dataset.make_one_shot_iterator()
-    else:
-      self._input_iterator = iterator_ops.Iterator.from_structure(
-          self._input_dataset.output_types, self._input_dataset.output_shapes,
-          shared_name, self._input_dataset.output_classes)
-    input_iterator_handle = self._input_iterator.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _prefetch_fn(handle):
-      """Prefetches one element from `input_iterator`."""
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, self._input_iterator.output_types,
-          self._input_iterator.output_shapes,
-          self._input_iterator.output_classes)
-      ret = remote_iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
-
-    target_device = ged_ops.experimental_iterator_get_device(
-        self._input_iterator._iterator_resource)
-    self._buffering_resources = []
-    for device in nest.flatten(self._devices):
-      with ops.device(device):
-        buffer_resource_handle = prefetching_ops.function_buffering_resource(
-            f=_prefetch_fn,
-            output_types=data_nest.flatten(
-                sparse.as_dense_types(self._input_dataset.output_types,
-                                      self._input_dataset.output_classes)),
-            target_device=target_device,
-            string_arg=input_iterator_handle,
-            buffer_size=buffer_size,
-            shared_name=shared_name)
-        self._buffering_resources.append(buffer_resource_handle)
-
-    if not self._one_shot:
-      reset_ops = []
-      for buffer_resource in self._buffering_resources:
-        reset_ops.append(
-            ged_ops.experimental_function_buffering_resource_reset(
-                buffer_resource))
-      with ops.control_dependencies(reset_ops):
-        self._initializer = self._input_iterator.make_initializer(
-            self._input_dataset)
-
-  def get_next(self, name=None):
-    """See `tf.data.Iterator.get_next`."""
-    self._get_next_call_count += 1
-    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
-      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
-
-    flat_result = []
-    # TODO(priyag): This will fail if the input size (typically number of
-    # batches) is not divisible by number of devices.
-    # How do we handle that more gracefully / let the user know?
-    for buffer_resource in self._buffering_resources:
-      flat_ret = ged_ops.experimental_function_buffering_resource_get_next(
-          buffer_resource,
-          output_types=data_nest.flatten(
-              sparse.as_dense_types(self.output_types, self.output_classes)),
-          name=name)
-
-      ret = sparse.deserialize_sparse_tensors(
-          data_nest.pack_sequence_as(self.output_types, flat_ret),
-          self.output_types, self.output_shapes, self.output_classes)
-
-      for tensor, shape in zip(
-          data_nest.flatten(ret), data_nest.flatten(self.output_shapes)):
-        if isinstance(tensor, ops.Tensor):
-          tensor.set_shape(shape)
-      flat_result.append(ret)
-
-    return nest.pack_sequence_as(self._devices, flat_result)
-
-  @property
-  def initializer(self):
-    if self._one_shot:
-      raise NotImplementedError("Can't initialize a one_shot_iterator")
-    return self._initializer
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-
-# pylint: enable=protected-access
-
-
-class _PrefetchToDeviceDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` whose iterator prefetches elements to other device(s)."""
-
-  def __init__(self, input_dataset, devices, buffer_size):
-    super(_PrefetchToDeviceDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    self._devices = devices
-    self._buffer_size = buffer_size if buffer_size is not None else 1
-
-  def make_one_shot_iterator(self):
-    return _PrefetchToDeviceIterator(
-        self._input_dataset,
-        one_shot=True,
-        devices=self._devices,
-        buffer_size=self._buffer_size)
-
-  def make_initializable_iterator(self, shared_name=None):
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "make_initializable_iterator is not supported when eager "
-          "execution is enabled.")
-
-    return _PrefetchToDeviceIterator(
-        self._input_dataset,
-        one_shot=False,
-        devices=self._devices,
-        buffer_size=self._buffer_size,
-        shared_name=shared_name)
-
-  def _as_variant_tensor(self):
-    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
-    # transformation methods is called.
-    # TODO(mrry): Investigate support for chaining further transformations after
-    # the prefetch, including GPU support.
-    raise NotImplementedError("`prefetch_to_devices()` must be the last "
-                              "transformation in a dataset pipeline.")
-
-  # TODO(priyag): Fix the output types, shapes and classes to match the result
-  # of get_next (which has the additional nesting layer of devices now).
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-
-def prefetch_to_devices(devices, buffer_size=None):
-  """A transformation that prefetches dataset values to the given `devices`.
-
-  NOTE: Although the transformation creates a `tf.data.Dataset`, the
-  transformation must be the final `Dataset` in the input pipeline.
-
-  Args:
-    devices: A nested structure of devices on which to prefetch the data. It can
-      be a single device name, or a tuple or list of device names.
-    buffer_size: (Optional.) The number of elements to buffer on each device.
-      Defaults to an automatically chosen value.
-
-  Returns:
-    A `Dataset` transformation function, which can be passed to
-    `tf.data.Dataset.apply`.
-  """
-
-  def _apply_fn(dataset):
-    return _PrefetchToDeviceDataset(dataset, devices, buffer_size)
-
-  return _apply_fn
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
deleted file mode 100644
index 16799104e8112f4391152c0cf2a15af81f8c2c9d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for prefetching_ops_v2."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distribute.python import prefetching_ops_v2
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import test
-
-
-class PrefetchingOpsV2Test(test.TestCase):
-
-  def testPrefetchToOneDevice(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    host_dataset = dataset_ops.Dataset.range(10)
-    device_dataset = host_dataset.apply(
-        prefetching_ops_v2.prefetch_to_devices("/gpu:0"))
-
-    iterator = device_dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testPrefetchToTwoDevicesInAList(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    host_dataset = dataset_ops.Dataset.range(10)
-    device_dataset = host_dataset.apply(
-        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
-
-    iterator = device_dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    output = []
-    # TODO(rohanj): Modify test to go till the end of the dataset when we
-    # switch to MultiDeviceIterator.
-    with self.cached_session() as sess:
-      for _ in range(4):
-        result = sess.run(next_element)
-        self.assertEqual(2, len(result))
-        output.extend(result)
-      self.assertEquals(set(range(8)), set(output))
-
-  def testPrefetchToTwoDevicesWithReinit(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    host_dataset = dataset_ops.Dataset.range(10)
-    device_dataset = host_dataset.apply(
-        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
-
-    iterator = device_dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    # TODO(rohanj): Modify test to go till the end of the dataset when we
-    # switch to MultiDeviceIterator.
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      for _ in range(4):
-        sess.run(next_element)
-      sess.run(iterator.initializer)
-      for _ in range(4):
-        sess.run(next_element)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
index 09b351ffa4165656e2fc9666ab4b7725ef061f50..be724fb59a7efa18c43c4cb98649ced806f7bcb4 100644
--- a/tensorflow/contrib/distribute/python/single_loss_example.py
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -90,7 +90,7 @@ def batchnorm_example(optimizer_fn,
                       batch_per_epoch=1,
                       momentum=0.9,
                       renorm=False,
-                      update_ops_in_tower_mode=False):
+                      update_ops_in_replica_mode=False):
   """Example of non-distribution-aware legacy code with batch normalization."""
 
   def dataset_fn():
@@ -113,7 +113,7 @@ def batchnorm_example(optimizer_fn,
       y = batchnorm(x, training=True)
       with ops.control_dependencies(
           ops.get_collection(ops.GraphKeys.UPDATE_OPS)
-          if update_ops_in_tower_mode else []):
+          if update_ops_in_replica_mode else []):
         loss = math_ops.reduce_mean(
             math_ops.reduce_sum(layer(y)) - constant_op.constant(1.))
       # `x` and `y` will be fetched by the gradient computation, but not `loss`.
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index 1b5a4f64e5bb1ffabfe1b87c150f713c755bb682..c928b6d9f1f21508edd753f94c38ab2723cc0a9f 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.training import optimizer as optimizer_lib
 
 
@@ -50,7 +51,11 @@ class StandardInputStep(Step):
   def __init__(self, dataset_fn, distribution):
     super(StandardInputStep, self).__init__(distribution)
     self._distributed_input = distribution.distribute_dataset(dataset_fn)
-    self._iterator = self._distributed_input.make_one_shot_iterator()
+    if context.executing_eagerly():
+      self._iterator = self._distributed_input.make_one_shot_iterator()
+    else:
+      # TODO(priyag): Expose initializer via some initializer property.
+      self._iterator = self._distributed_input.make_initializable_iterator()
 
 
 class StandardSingleLossStep(StandardInputStep):
@@ -85,25 +90,21 @@ class StandardSingleLossStep(StandardInputStep):
     super(StandardSingleLossStep, self).__init__(dataset_fn, distribution)
     self._loss_fn = loss_fn
     self._optimizer = optimizer
-    self._is_run_concurrently = False
     self._iterations_per_step = iterations_per_step
 
   def __call__(self):
     with self._distribution.scope():
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         """Function to run one iteration with one input."""
         gradients_fn = backprop.implicit_grad(self._loss_fn)
         gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
 
-        grads_and_vars = self.distribution.call_for_each_tower(
-            gradients_fn,
-            ctx, *inputs,
-            run_concurrently=self._is_run_concurrently)
+        grads_and_vars = self.distribution.call_for_each_replica(
+            gradients_fn, args=(ctx,) + inputs)
         # If threads use layers, then we need to run the first step
         # sequentially, so that layers.build() is not executed in parallel.
         # Otherwise, multiple sets of mirrored variables are going to be
         # created.
-        self._is_run_concurrently = True
         return self._optimizer._distributed_apply(  # pylint: disable=protected-access
             self.distribution, grads_and_vars)
 
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
index f1ada49fa378358f112fb75a4bcdbe9a8a09cd13..1ff9b9ceec13351b098d47ed3ff62f689a625a31 100644
--- a/tensorflow/contrib/distribute/python/step_fn_test.py
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -50,6 +50,7 @@ class SingleLossStepTest(test.TestCase, parameterized.TestCase):
         run_step = single_loss_step
       else:
         with self.cached_session() as sess:
+          sess.run(single_loss_step._iterator.initializer)
           run_step = sess.make_callable(single_loss_step())
       self.evaluate(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index fd280f5754b34170cdd6b948236138d0e77dd8bc..d441b5af5f6aa41efde2c75d09d9589516c54992 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -19,16 +19,21 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer
 
 
@@ -36,45 +41,43 @@ class _TestException(Exception):
   pass
 
 
-# May be the argument to either distribution.call_for_each_tower() or
-# get_tower_context().merge_call()
+# May be the argument to either distribution.call_for_each_replica() or
+# get_replica_context().merge_call()
 def _raise_exception_fn(_=None):
   raise _TestException()
 
 
-# Must be the argument to a distribution.call_for_each_tower() call, calls a
-# get_tower_context().merge_call() that raises an exception.
+# Must be the argument to a distribution.call_for_each_replica() call, calls a
+# get_replica_context().merge_call() that raises an exception.
 def _merge_raises_fn():
-  distribution_strategy_context.get_tower_context().merge_call(
-      _raise_exception_fn)
+  ds_context.get_replica_context().merge_call(_raise_exception_fn)
 
 
-# Must be the argument to a get_tower_context().merge_call() call, calls
-# dist.call_for_each_tower() with a function that raises an exception.
+# Must be the argument to a get_replica_context().merge_call() call, calls
+# dist.call_for_each_replica() with a function that raises an exception.
 def _call_raises_fn(dist):
-  dist.call_for_each_tower(_raise_exception_fn)
+  dist.call_for_each_replica(_raise_exception_fn)
 
 
-# Must be the argument to a distribution.call_for_each_tower() call,
-# calls a get_tower_context().merge_call() that calls a
-# call_for_each_tower() that raises an exception.
+# Must be the argument to a distribution.call_for_each_replica() call,
+# calls a get_replica_context().merge_call() that calls a
+# call_for_each_replica() that raises an exception.
 def _merge_call_raises_fn():
-  distribution_strategy_context.get_tower_context().merge_call(_call_raises_fn)
+  ds_context.get_replica_context().merge_call(_call_raises_fn)
 
 
-# Must be the argument to a get_tower_context().merge_call() call, calls
-# dist.call_for_each_tower() with a function that calls a
-# get_tower_context().merge_call() that raises an exception.
+# Must be the argument to a get_replica_context().merge_call() call, calls
+# dist.call_for_each_replica() with a function that calls a
+# get_replica_context().merge_call() that raises an exception.
 def _call_merge_raises_fn(dist):
-  dist.call_for_each_tower(_merge_raises_fn)
+  dist.call_for_each_replica(_merge_raises_fn)
 
 
-# Must be the argument to a distribution.call_for_each_tower() call, calls a
-# get_tower_context().merge_call() that calls a call_for_each_tower() that
-# calls a get_tower_context().merge_call() that raises an exception.
+# Must be the argument to a distribution.call_for_each_replica() call, calls a
+# get_replica_context().merge_call() that calls a call_for_each_replica() that
+# calls a get_replica_context().merge_call() that raises an exception.
 def _merge_call_merge_raises_fn():
-  distribution_strategy_context.get_tower_context().merge_call(
-      _call_merge_raises_fn)
+  ds_context.get_replica_context().merge_call(_call_merge_raises_fn)
 
 
 class DistributionTestBase(test.TestCase):
@@ -103,7 +106,7 @@ class DistributionTestBase(test.TestCase):
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_tower(grad_fn, one, run_concurrently=l.built)
+        g_v = d.call_for_each_replica(grad_fn, args=(one,))
 
         # Update the variables using the gradients and the update() function.
         before_list = []
@@ -113,8 +116,8 @@ class DistributionTestBase(test.TestCase):
           before_list.append(fetched)
           # control_dependencies irrelevant but harmless in eager execution
           with ops.control_dependencies([fetched]):
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.extended.reduce_to(
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(d.update(
                 v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -138,7 +141,7 @@ class DistributionTestBase(test.TestCase):
     config.gpu_options.per_process_gpu_memory_fraction = 0.3
     with context.graph_mode(), \
          ops.Graph().as_default(), \
-         self.test_session(config=config) as sess, \
+         self.cached_session(config=config) as sess, \
          d.scope():
       l = core.Dense(1, use_bias=False)
 
@@ -159,7 +162,7 @@ class DistributionTestBase(test.TestCase):
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_tower(grad_fn, one)
+        g_v = d.call_for_each_replica(grad_fn, args=(one,))
 
         # Update the variables using the gradients and the update() function.
         before_list = []
@@ -168,8 +171,8 @@ class DistributionTestBase(test.TestCase):
           fetched = d.read_var(v)
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.extended.reduce_to(
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(d.update(
                 v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -188,47 +191,103 @@ class DistributionTestBase(test.TestCase):
       # Error should go down
       self.assertLess(error_after, error_before)
 
-  def _test_map_reduce(self, d, in_graph=None):
+  def _test_replica_id(self, d):
     with d.scope():
-      map_in = [constant_op.constant(i) for i in range(10)]
-      map_out = d.map(map_in, lambda x, y: x * y, 2)
-      observed = d.reduce(variable_scope.VariableAggregation.SUM, map_out,
-                          "/device:CPU:0")
-      expected = 90  # 2 * (0 + 1 + ... + 9)
-      self.assertEqual(expected, observed.numpy())
-
-  def _test_device_index(self, d):
-    with d.scope():
-      expected_devices = [False] * len(d.worker_devices)
-
-      def mark_devices_fn(device_id):
-        self.assertLess(device_id, len(d.worker_devices))
-        self.assertFalse(expected_devices[device_id])
-        expected_devices[device_id] = True
-
-      d.call_for_each_tower(mark_devices_fn, d.worker_device_index)
-      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
-
-  def _test_tower_id(self, d):
-    with d.scope():
-      expected_devices = [False] * len(d.worker_devices)
+      expected_devices = [False] * len(d.extended.worker_devices)
 
       def mark_devices_fn():
-        tower_id = distribution_strategy_context.get_tower_context().tower_id
-        self.assertLess(tower_id, len(d.worker_devices))
-        self.assertFalse(expected_devices[tower_id])
-        expected_devices[tower_id] = True
+        replica_id = self.evaluate(
+            ds_context.get_replica_context().replica_id_in_sync_group)
+        self.assertLess(replica_id, len(d.extended.worker_devices))
+        self.assertFalse(expected_devices[replica_id])
+        expected_devices[replica_id] = True
 
-      d.call_for_each_tower(mark_devices_fn)
-      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
+      d.call_for_each_replica(mark_devices_fn)
+      self.assertAllEqual(expected_devices,
+                          [True] * len(d.extended.worker_devices))
 
   def _test_call_and_merge_exceptions(self, dist):
     with dist.scope():
       with self.assertRaises(_TestException):
-        dist.call_for_each_tower(_raise_exception_fn)
+        dist.call_for_each_replica(_raise_exception_fn)
       with self.assertRaises(_TestException):
-        dist.call_for_each_tower(_merge_raises_fn)
+        dist.call_for_each_replica(_merge_raises_fn)
       with self.assertRaises(_TestException):
-        dist.call_for_each_tower(_merge_call_raises_fn)
+        dist.call_for_each_replica(_merge_call_raises_fn)
       with self.assertRaises(_TestException):
-        dist.call_for_each_tower(_merge_call_merge_raises_fn)
+        dist.call_for_each_replica(_merge_call_merge_raises_fn)
+
+  def _input_fn_to_test_input_context(self,
+                                      dataset_fn,
+                                      expected_num_replicas_in_sync,
+                                      expected_num_input_pipelines,
+                                      expected_input_pipeline_id):
+    # Use a list of one element as counter so that it can be captured by the
+    # `_input_fn`. This counter is incremented by 1 each time an input_fn is
+    # called. We use this counter to check whether the `input_pipeline_id`
+    # matches the counter in the in-graph replication.
+    worker_id_counter = [0]
+
+    def _input_fn(input_context):
+      """Input fn for testing."""
+      self.assertIsNotNone(input_context)
+      self.assertEqual(expected_num_replicas_in_sync,
+                       input_context.num_replicas_in_sync)
+      self.assertEqual(expected_num_input_pipelines,
+                       input_context.num_input_pipelines)
+      if expected_input_pipeline_id is not None:
+        self.assertEqual(expected_input_pipeline_id,
+                         input_context.input_pipeline_id)
+      else:
+        self.assertEqual(worker_id_counter[0], input_context.input_pipeline_id)
+        worker_id_counter[0] += 1
+
+      return dataset_fn()
+
+    return _input_fn
+
+  def _test_input_fn_iterator(self, iterator, devices, expected_values,
+                              sess=None):
+    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
+    evaluate(iterator.initialize())
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      evaluate([values.select_device(d, next_element) for d in devices])
+
+    # After re-initializing the iterator, should be able to iterate again.
+    evaluate(iterator.initialize())
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, computed_value)
+
+  def _test_global_step_update(self, strategy):
+    with strategy.scope():
+      global_step = variable_scope.get_variable(
+          "global_step",
+          shape=[],
+          dtype=dtypes.int64,
+          initializer=init_ops.zeros_initializer(),
+          trainable=False,
+          aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA)
+      self.evaluate(variables.global_variables_initializer())
+
+      def model_fn():
+        train_op = global_step.assign_add(1)
+        value = global_step.read_value()
+        return train_op, value
+
+      train_ops, value = strategy.call_for_each_replica(model_fn)
+      self.evaluate(strategy.group(train_ops))
+      global_step_tensors = strategy.unwrap(value)
+      global_step_values = self.evaluate(global_step_tensors)
+      self.assertEqual((1,) * len(global_step_tensors), global_step_values)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index 1d9e299b38409b874610765e54fa0052fafd5f4b..b6f5b492017fc7dfd329e69ad9ca418ae682bc4b 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -21,30 +21,34 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import one_device_strategy
-from tensorflow.contrib.distribute.python import values
+import copy
+import functools
+
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.contrib.tpu.python.tpu import training_loop
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
-_TPU_INITIALIZE_SYSTEM_COLLECTION = "TPU_STRATEGY_INITIALIZE"
-
-
 def get_tpu_system_metadata(tpu_cluster_resolver):
   """Retrieves TPU system metadata given a TPUClusterResolver."""
   master = tpu_cluster_resolver.master()
@@ -75,13 +79,13 @@ def _create_tpu_mirrored_variable(devices, real_mirrored_creator, *args,
   # synchronization settings?
 
   # Get aggregation value
-  # TODO(jhseu): Support aggregation in a tower context.
+  # TODO(jhseu): Support aggregation in a replica context.
   aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
   if aggregation not in [
       vs.VariableAggregation.NONE,
       vs.VariableAggregation.SUM,
       vs.VariableAggregation.MEAN,
-      vs.VariableAggregation.ONLY_FIRST_TOWER,
+      vs.VariableAggregation.ONLY_FIRST_REPLICA,
   ]:
     raise ValueError("Invalid variable aggregation mode: {} for variable: {}"
                      .format(aggregation, kwargs["name"]))
@@ -112,9 +116,8 @@ def _create_tpu_mirrored_variable(devices, real_mirrored_creator, *args,
   return result
 
 
-# TODO(jhseu): Stop inheriting from OneDeviceStrategy.
-class TPUStrategy(one_device_strategy.OneDeviceStrategy):
-  """Experimental TPU distribution strategy implementation."""
+class TPUStrategy(distribute_lib.DistributionStrategy):
+  """TPU distribution strategy implementation."""
 
   def __init__(self, tpu_cluster_resolver, steps_per_run, num_cores=None):
     """Initializes the TPUStrategy object.
@@ -130,10 +133,24 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
       num_cores: Number of cores to use on the TPU. If None specified, then
           auto-detect the cores and topology of the TPU system.
     """
-    # TODO(sourabhbajaj): OneDeviceStrategy should be initialized with the
-    # master node fetched from the cluster resolver.
-    super(TPUStrategy, self).__init__("/device:CPU:0")
+    super(TPUStrategy, self).__init__(TPUExtended(
+        self, tpu_cluster_resolver, steps_per_run, num_cores))
+
+  @property
+  def steps_per_run(self):
+    """DEPRECATED: use .extended.steps_per_run instead."""
+    return self._extended.steps_per_run
+
+
+class TPUExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of TPUStrategy."""
 
+  # Track what TPU devices have been initialized.
+  _initialized_devices = []
+
+  def __init__(self, container_strategy, tpu_cluster_resolver, steps_per_run,
+               num_cores=None):
+    super(TPUExtended, self).__init__(container_strategy)
     self._tpu_cluster_resolver = tpu_cluster_resolver
     self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
     # TODO(sourabhbajaj): Change this from num_cores to metadata_override
@@ -143,19 +160,45 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     # parallelism.
     device_map = {d.name: i for i, d in enumerate(self._tpu_metadata.devices)
                   if "device:TPU:" in d.name}
-    self._device_index = values.PerDevice(device_map)
-    self._tpu_devices = sorted(device_map.keys())
-    # Only create variables for the number of towers we're running.
-    self._tpu_devices = self._tpu_devices[:self.num_towers]
+    self._device_index = values.PerReplica(device_map)
+    self._host_device = self.get_host_cpu_device(0)
+    self._tpu_devices = tuple(sorted(device_map.keys()))
+    # Only create variables for the number of replicas we're running.
+    self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
 
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
     self.steps_per_run = steps_per_run
-
     self._require_static_shapes = True
 
-  def _get_enqueue_op_per_host(self, host_id, iterator, input_shapes,
-                               iterations):
+    # Initialize the TPU devices.
+    self._initialize_tpu()
+
+  def _initialize_tpu(self):
+    """Initialize the TPU devices in a separate session and graph.
+
+    We keep track of all the TPU devices that we're initialized as we should
+    only be running TPU initialize once for the entire process.
+    """
+    master = self._tpu_cluster_resolver.master()
+    # Verify TPU has not already been initialized in this process.
+    if master in TPUExtended._initialized_devices:
+      logging.info("TPU master %s has already been initialized." % master)
+      return
+
+    logging.info("Initializing the TPU system.")
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    self._configure(session_config)
+    with ops.Graph().as_default():
+      with session_lib.Session(config=session_config, target=master) as sess:
+        sess.run([tpu.initialize_system()])
+    logging.info("Finized initializing TPU system.")
+
+    # Update Strategy state to make sure we can track device initialization.
+    TPUExtended._initialized_devices.append(master)
+
+  def _get_enqueue_op_per_host(self, host_id, multi_worker_iterator,
+                               input_shapes, iterations):
     """Create an enqueue op for a single host identified using host_id.
 
     The while_loop op returned will run `iterations` times and in each run
@@ -163,7 +206,7 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
 
     Args:
       host_id: integer, id of the host to run the enqueue ops on.
-      iterator: `tf.data` iterator to read the input data.
+      multi_worker_iterator: MultiWorkerDataIterator to read the input data.
       input_shapes: shape of inputs to be enqueue on the queue. This is same as
         the value of `nest.flatten(iterator.output_shapes)`.
       iterations: integer, number of iterations to be run; determines the
@@ -174,6 +217,10 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
       on the infeed queue from the host with id `host_id` for each device shard.
     """
     host = self.get_host_cpu_device(host_id)
+    # TODO(sourabhbajaj): Possibly make changes to MultiWorkerDataset
+    # to work with TPU Prefetch so clean up this code.
+    iterator = (
+        multi_worker_iterator.get_iterator(self.get_host(host_id))._iterator)  # pylint: disable=protected-access
 
     def _infeed_enqueue_ops_fn():
       """Enqueue ops for one iteration."""
@@ -182,7 +229,7 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
       enqueue_ops = []
 
       with ops.device(host):
-        for _ in range(self.num_towers_per_host):
+        for _ in range(self.num_replicas_per_host):
           # Use control dependencies to ensure a deterministic ordering.
           with ops.control_dependencies(control_deps):
             inputs = nest.flatten(iterator.get_next())
@@ -211,44 +258,59 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
 
     return enqueue_op_per_host
 
-  def distribute_dataset(self, dataset_fn):
-    # TODO(priyag): Perhaps distribute across cores here.
-    return self._call_dataset_fn(dataset_fn)
+  def _make_dataset_iterator(self, dataset):
+    """Make iterators for each of the TPU hosts."""
+
+    worker_devices = [
+        (self.get_host(hid), [self.get_host_cpu_device(hid)])
+        for hid in range(self.num_hosts)
+    ]
+    return values.DatasetIterator(dataset, worker_devices,
+                                  self._num_replicas_in_sync)
+
+  def _distribute_dataset(self, dataset_fn):
+    worker_devices = [
+        (self.get_host(hid), [self.get_host_cpu_device(hid)])
+        for hid in range(self.num_hosts)
+    ]
+    return values.MultiWorkerDataset(
+        functools.partial(self._call_dataset_fn, dataset_fn), worker_devices)
 
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have
   # a mechanism to infer the outputs of `fn`. Pending b/110550782.
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values=None):
-
-    shapes = nest.flatten(iterator.output_shapes)
-    if any([not s.is_fully_defined() for s in shapes]):
+  def _experimental_run_steps_on_iterator(
+      self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
+    output_shapes = multi_worker_iterator.output_shapes
+    shapes = nest.flatten(output_shapes)
+    if any(not s.is_fully_defined() for s in shapes):
       raise ValueError(
-          'TPU currently requires fully defined shapes. Either use '
-          'set_shape() on the input tensors or use '
-          'dataset.batch(..., drop_remainder=True).')
-    types = nest.flatten(iterator.output_types)
+          "TPU currently requires fully defined shapes. Either use "
+          "set_shape() on the input tensors or use "
+          "dataset.batch(..., drop_remainder=True).")
+    types = nest.flatten(multi_worker_iterator.output_types)
 
     enqueue_ops = [
-        self._get_enqueue_op_per_host(host_id, iterator, shapes, iterations)
+        self._get_enqueue_op_per_host(host_id, multi_worker_iterator, shapes,
+                                      iterations)
         for host_id in range(self.num_hosts)]
 
     def dequeue_fn():
       dequeued = tpu_ops.infeed_dequeue_tuple(dtypes=types, shapes=shapes)
-      return nest.pack_sequence_as(iterator.output_shapes, dequeued)
+      return nest.pack_sequence_as(output_shapes, dequeued)
 
     # Wrap `fn` for repeat.
     if initial_loop_values is None:
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
     ctx = values.MultiStepContext()
-    def run_fn(*args, **kwargs):
+
+    def run_fn():
       """Single step on the TPU device."""
-      del args, kwargs
       fn_inputs = dequeue_fn()
       if not isinstance(fn_inputs, tuple):
         fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, *fn_inputs)
+      fn_result = fn(ctx, fn_inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       if flat_last_step_outputs:
         with ops.control_dependencies([fn_result]):
@@ -256,11 +318,6 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
       else:
         return fn_result
 
-    # TODO(sourabhbajaj): The input to while loop should be based on the output
-    # type of the step_fn
-    def iterate_on_tpu():
-      return training_loop.repeat(iterations, run_fn, initial_loop_values)
-
     # We capture the control_flow_context at this point, before we run `fn`
     # inside a while_loop and TPU replicate context. This is useful in cases
     # where we might need to exit these contexts and get back to the outer
@@ -270,74 +327,98 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     self._outer_control_flow_context = (
         ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
 
-    replicate_inputs = [[]] * self.num_towers
-    replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
+    def rewrite_fn(*args):
+      """The rewritten step fn running on TPU."""
+      del args
+      replicate_inputs = [[]] * self._num_replicas_in_sync
+      replicate_outputs = tpu.replicate(run_fn, replicate_inputs)
+
+      # If run_fn has tensor outputs, tpu.replicate returns a list of list. We
+      # will flatten it in this case. If run_fn has no tensor outputs,
+      # tpu.replicate returns a list of no_ops, we will keep the output as it
+      # is.
+      if isinstance(replicate_outputs[0], list):
+        replicate_outputs = nest.flatten(replicate_outputs)
+
+      return replicate_outputs
+
+    # TODO(sourabhbajaj): The input to while loop should be based on the output
+    # type of the step_fn
+    assert isinstance(initial_loop_values, list)
+    initial_loop_values = initial_loop_values * self._num_replicas_in_sync
+
+    # Put the while loop op on host 0.
+    with ops.device(self.get_host_cpu_device(0)):
+      replicate_outputs = training_loop.repeat(iterations, rewrite_fn,
+                                               initial_loop_values)
+
     del self._outer_control_flow_context
     ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
 
-    # Filter out any ops from the outputs, typically this would be the case
-    # when there were no tensor outputs.
-    last_step_tensor_outputs = [x for x in replicate_outputs
-                                if not isinstance(x, ops.Operation)]
-
-    # Outputs are currently of the structure (grouped by device)
-    # [[output0_device0, output1_device0, output2_device0],
-    #  [output0_device1, output1_device1, output2_device1]]
-    # Convert this to the following structure instead: (grouped by output)
-    # [[output0_device0, output0_device1],
-    #  [output1_device0, output1_device1],
-    #  [output2_device0, output2_device1]]
-    last_step_tensor_outputs = [list(x) for x in zip(*last_step_tensor_outputs)]
+    if isinstance(replicate_outputs, list):
+      # Filter out any ops from the outputs, typically this would be the case
+      # when there were no tensor outputs.
+      last_step_tensor_outputs = [
+          x for x in replicate_outputs if not isinstance(x, ops.Operation)
+      ]
+
+      # Outputs are currently of the structure (flattened)
+      # [output0_device0, output1_device0, output2_device0,
+      #  output0_device1, output1_device1, output2_device1,
+      #  ...]
+      # Convert this to the following structure instead: (grouped by output)
+      # [[output0_device0, output0_device1],
+      #  [output1_device0, output1_device1],
+      #  [output2_device0, output2_device1]]
+      output_num = len(last_step_tensor_outputs) // self._num_replicas_in_sync
+      last_step_tensor_outputs = [
+          last_step_tensor_outputs[i::output_num] for i in range(output_num)
+      ]
+    else:
+      # no tensors returned.
+      last_step_tensor_outputs = []
 
     # Convert replicate_outputs to the original dict structure of
     # last_step_outputs.
     last_step_tensor_outputs_dict = nest.pack_sequence_as(
         ctx.last_step_outputs, last_step_tensor_outputs)
 
-    for (name, aggregation) in ctx._last_step_outputs_aggregations.items():  # pylint: disable=protected-access
+    for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
       output = last_step_tensor_outputs_dict[name]
-      # For outputs that have already been aggregated, take the first value
+      # For outputs that have already been reduced, take the first value
       # from the list as each value should be the same. Else return the full
       # list of values.
-      # TODO(josh11b): If aggregation is NONE, we should return a PerDevice value.
-      if aggregation is not variables_lib.VariableAggregation.NONE:
+      # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica
+      # value.
+      if reduce_op is not None:
         # TODO(priyag): Should this return the element or a list with 1 element
         last_step_tensor_outputs_dict[name] = output[0]
     ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
 
     return ctx
 
-  def _call_for_each_tower(self, fn, *args, **kwargs):
-    # TODO(jhseu): Consider making it so call_for_each_tower implies that we're
-    # in a tpu.rewrite(), and update TPUMirroredVariable accordingly.
-    kwargs.pop('run_concurrently', None)
-    with one_device_strategy._OneDeviceTowerContext(self):  # pylint: disable=protected-access
+  def _call_for_each_replica(self, fn, args, kwargs):
+    # TODO(jhseu): Consider making it so call_for_each_replica implies that
+    # we're in a tpu.rewrite(), and update TPUMirroredVariable accordingly.
+    with _TPUReplicaContext(self._container_strategy()):
       return fn(*args, **kwargs)
 
-  def initialize(self):
+  def _initialize(self):
     if context.executing_eagerly():
       # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
-      raise NotImplementedError('Eager mode not supported in TPUStrategy.')
+      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
     else:
-      # TODO(jhseu): We need this hack because DistributionStrategies must be
-      # pickleable for copy.deepcopy(). Remove when initialize_system goes away.
-      graph = ops.get_default_graph()
-      tpu_init = graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
-      if tpu_init:
-        return tpu_init
-      graph.add_to_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION,
-                              tpu.initialize_system())
-      return graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
-
-  def finalize(self):
+      return []
+
+  def _finalize(self):
     if context.executing_eagerly():
       # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
-      raise NotImplementedError('Eager mode not supported in TPUStrategy.')
+      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
     else:
-      return [tpu.shutdown_system()]
+      return []
 
   def _get_devices_from(self, colocate_with=None):
-     # TODO(jhseu): Change this when we support model parallelism.
+    # TODO(jhseu): Change this when we support model parallelism.
     return self._tpu_devices
 
   def _create_variable(self, next_creator, *args, **kwargs):
@@ -352,7 +433,7 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
           if i > 0:
             # Give replicas meaningful distinct names:
             var0name = index[devices[0]].name.split(":")[0]
-            # We append a / to variable names created on towers with id > 0 to
+            # We append a / to variable names created on replicas with id > 0 to
             # ensure that we ignore the name scope and instead use the given
             # name as the absolute name of the variable.
             kwargs["name"] = "%s/replica_%d/" % (var0name, i)
@@ -374,12 +455,12 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     return _create_tpu_mirrored_variable(devices, _real_mirrored_creator, *args,
                                          **kwargs)
 
-  def _reduce(self, aggregation, value, destinations):
+  def _reduce_to(self, reduce_op, value, destinations):
     if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
-      if aggregation == vs.VariableAggregation.MEAN:
+      if reduce_op == reduce_util.ReduceOp.MEAN:
         # TODO(jhseu):  Revisit once we support model-parallelism.
-        value *= (1. / self.num_towers)
-      elif aggregation != vs.VariableAggregation.SUM:
+        value *= (1. / self._num_replicas_in_sync)
+      elif reduce_op != reduce_util.ReduceOp.SUM:
         raise NotImplementedError(
             "Currently only support sum & mean in TPUStrategy.")
       return tpu_ops.cross_replica_sum(value)
@@ -387,27 +468,22 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
     # Validate that the destination is same as the host device
     # Note we don't do this when in replicate context as the reduction is
     # performed on the TPU device itself.
-    devices = cross_tower_ops_lib.get_devices_from(destinations)
+    devices = cross_device_ops_lib.get_devices_from(destinations)
     if len(devices) == 1:
       assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
-          self.get_host_cpu_device(0))
+          self._host_device)
     else:
-      raise ValueError('Multiple devices are not supported for TPUStrategy')
+      raise ValueError("Multiple devices are not supported for TPUStrategy")
 
-    if aggregation == vs.VariableAggregation.ONLY_FIRST_TOWER:
-      return value[0]
     output = math_ops.add_n(value)
-    if aggregation == vs.VariableAggregation.MEAN:
+    if reduce_op == reduce_util.ReduceOp.MEAN:
       return output * (1. / len(value))
     return output
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     assert isinstance(var, values.TPUMirroredVariable)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-
     if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
-      if should_group:
+      if group:
         return fn(var, *args, **kwargs)
       else:
         return [fn(var, *args, **kwargs)]
@@ -422,9 +498,7 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
         updates[d] = fn(v,
                         *values.select_device_mirrored(d, args),
                         **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, should_group)
-
-  # TODO(josh11b): Need to implement _update_non_slot()!
+    return values.update_regroup(self, updates, group)
 
   def read_var(self, var):
     assert isinstance(var, values.TPUMirroredVariable)
@@ -433,33 +507,39 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
   def _unwrap(self, val):
     if isinstance(val, values.DistributedValues):
       # Return in a deterministic order.
-      return [val.get(device=d) for d in sorted(val.devices)]
+      return tuple(val.get(device=d) for d in sorted(val.devices))
     elif isinstance(val, list):
       # TODO(josh11b): We need to remove this case; per device values should
-      # be represented using a PerDevice wrapper instead of a list with
+      # be represented using a PerReplica wrapper instead of a list with
       # one entry per device.
-      return val
-    return [val]
+      return tuple(val)
+    return (val,)
 
+  def value_container(self, value):
+    return value
 
-  @property
-  def num_towers(self):
-    return self._num_cores_override or self._tpu_metadata.num_cores
+  def _broadcast_to(self, tensor, destinations):
+    del destinations
+    return tensor
 
   @property
   def num_hosts(self):
     return self._tpu_metadata.num_hosts
 
   @property
-  def num_towers_per_host(self):
+  def num_replicas_per_host(self):
     return self._tpu_metadata.num_of_cores_per_host
 
   @property
-  def between_graph(self):
+  def _num_replicas_in_sync(self):
+    return self._num_cores_override or self._tpu_metadata.num_cores
+
+  @property
+  def experimental_between_graph(self):
     return False
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     return True
 
   @property
@@ -478,20 +558,65 @@ class TPUStrategy(one_device_strategy.OneDeviceStrategy):
   def parameter_devices(self):
     return self._tpu_devices
 
+  def non_slot_devices(self, var_list):
+    return self._host_device
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    del colocate_with
+    with ops.device(self._host_device), distribute_lib.UpdateContext(
+        self._host_device):
+      result = fn(*args, **kwargs)
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  def get_host(self, host_id):
+    if self._tpu_cluster_resolver.get_master() in ("", "local"):
+      return "/replica:0/task:0"
+    job_name = self._tpu_cluster_resolver.get_job_name() or "tpu_worker"
+    return "/job:%s/task:%d" % (job_name, host_id)
+
   def get_host_cpu_device(self, host_id):
-    if self._tpu_cluster_resolver.get_master() in ('', 'local'):
-      return '/replica:0/task:0/device:CPU:0'
-    job_name = self._tpu_cluster_resolver.get_job_name() or 'tpu_worker'
-    return '/job:%s/task:%d/device:CPU:0' % (job_name, host_id)
-
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+    return self.get_host(host_id) + "/device:CPU:0"
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     del cluster_spec, task_type, task_id
     if session_config:
-      session_config.isolate_session_state = True
-      cluster_spec = self._tpu_cluster_resolver.cluster_spec()
-      if cluster_spec:
-        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    updated_config.isolate_session_state = True
+    cluster_spec = self._tpu_cluster_resolver.cluster_spec()
+    if cluster_spec:
+      updated_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
+    return updated_config
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
+
+
+class _TPUReplicaContext(distribute_lib.ReplicaContext):
+  """Replication Context class for TPU Strategy."""
+
+  # TODO(sourabhbajaj): Call for each tower should be updating this.
+  def __init__(self, distribution_strategy):
+    distribute_lib.ReplicaContext.__init__(
+        self,
+        distribution_strategy,
+        # TODO(b/118385803): properly initialize replica_id, instead of always 0
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32))
+
+  @property
+  def devices(self):
+    distribute_lib.require_replica_context(self)
+    ds = self._distribution_strategy
+    replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
+    return (ds.extended.worker_devices[replica_id],)
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
deleted file mode 100644
index 18ceba42c2a57917de1de315973cd111d9a022cf..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/values.py
+++ /dev/null
@@ -1,1614 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Various classes representing distributed values.
-
-See go/tf-distribution-strategy.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-import contextlib
-import weakref
-import six
-
-from tensorflow.contrib.distribute.python import input_ops
-from tensorflow.contrib.distribute.python import prefetching_ops_v2
-from tensorflow.python.eager import context
-from tensorflow.python.eager import tape
-from tensorflow.python.framework import device as tf_device
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_resource_variable_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import distribution_strategy_context
-from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.util import nest
-
-
-# pylint: disable=line-too-long
-# TODO(josh11b): Should device values be strings or DeviceSpec objects?
-# Not sure DeviceSpec objects are usable as a dict key.
-class DistributedValues(object):
-  """Holds a map from device to values. Either PerDevice or Mirrored."""
-
-  def __init__(self, index):
-    self._index = {device_util.canonicalize(key): value
-                   for key, value in six.iteritems(index)}
-
-  def get(self, device=None):
-    """Returns the value for the current device or raises a ValueError."""
-    if device is None:
-      tower_context = distribution_strategy_context.get_tower_context()
-      if tower_context:
-        device = tower_context.device
-      else:
-        device = distribute_lib.get_update_device()
-        if device is None:
-          return self._get_cross_tower()
-    device = device_util.canonicalize(device)
-    try:
-      return self._index[device]
-    except KeyError as e:
-      six.raise_from(
-          ValueError("Device %s not found in %s (current device %s)" %
-                     (device, self._index.keys(), device_util.current())), e)
-
-  def on_device(self, device):
-    device = device_util.canonicalize(device)
-    return device in self._index
-
-  @property
-  def devices(self):
-    return list(self._index.keys())
-
-  @property
-  def is_tensor_like(self):
-    for v in self._index.values():
-      if not tensor_util.is_tensor(v):
-        return False
-    return True
-
-  def __str__(self):
-    return "%s:%s" % (self.__class__.__name__, self._index)
-
-  def __repr__(self):
-    return "%s(%r)" % (self.__class__.__name__, self._index)
-
-  # TODO(josh11b): Possibly make an accessor for _index for use by
-  # DistributionStrategy implementations.
-
-
-class DistributedDelegate(DistributedValues):
-  """A map from device to values; acts as the same type as the values."""
-
-  def __init__(self, index):
-    super(DistributedDelegate, self).__init__(index)
-
-  def __getattr__(self, name):
-    return getattr(self.get(), name)
-
-  # pylint: disable=multiple-statements
-  def __add__(self, o): return self.get() + o
-  def __radd__(self, o): return o + self.get()
-  def __sub__(self, o): return self.get() - o
-  def __rsub__(self, o): return o - self.get()
-  def __mul__(self, o): return self.get() * o
-  def __rmul__(self, o): return o * self.get()
-  def __truediv__(self, o): return self.get() / o
-  def __rtruediv__(self, o): return o / self.get()
-  def __floordiv__(self, o): return self.get() // o
-  def __rfloordiv__(self, o): return o // self.get()
-  def __mod__(self, o): return self.get() % o
-  def __rmod__(self, o): return o % self.get()
-  def __lt__(self, o): return self.get() < o
-  def __le__(self, o): return self.get() <= o
-  def __gt__(self, o): return self.get() > o
-  def __ge__(self, o): return self.get() >= o
-  def __and__(self, o): return self.get() & o
-  def __rand__(self, o): return o & self.get()
-  def __or__(self, o): return self.get() | o
-  def __ror__(self, o): return o | self.get()
-  def __xor__(self, o): return self.get() ^ o
-  def __rxor__(self, o): return o ^ self.get()
-  def __getitem__(self, o): return self.get()[o]
-  def __pow__(self, o, modulo=None): return pow(self.get(), o, modulo)
-  def __rpow__(self, o): return pow(o, self.get())
-  def __invert__(self): return ~self.get()
-  def __neg__(self): return -self.get()
-  def __abs__(self): return abs(self.get())
-
-  def __div__(self, o):
-    try:
-      return self.get().__div__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __rdiv__(self, o):
-    try:
-      return self.get().__rdiv__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __matmul__(self, o):
-    try:
-      return self.get().__matmul__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __rmatmul__(self, o):
-    try:
-      return self.get().__rmatmul__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  # TODO(josh11b): Even more operator overloads.
-
-
-class PerDevice(DistributedValues):
-  """Holds a map from device to unsynchronized values."""
-  pass
-
-
-# Note that unlike PerDevice, Mirrored values inherit from
-# DistributedDelegate and so can be used directly in cross-tower mode.
-class Mirrored(DistributedDelegate):
-  """Holds a map from device to values which are kept in sync."""
-
-  def _get_cross_tower(self):
-    device = device_util.canonicalize(device_util.current())
-    if device in self._index:
-      return self._index[device]
-    return list(self._index.values())[0]
-
-  def _as_graph_element(self):
-    obj = self.get()
-    # pylint: disable=protected-access
-    conv_fn = getattr(obj, "_as_graph_element", None)
-    if conv_fn and callable(conv_fn):
-      return conv_fn()
-    return obj
-
-
-def _assign_on_device(device, variable, tensor):
-  with ops.device(device):
-    return variable.assign(array_ops.identity(tensor))
-
-
-DistributedVarOp = collections.namedtuple(
-    "DistributedVarOp", ["name", "graph", "type"])
-
-
-class DistributedVariable(DistributedDelegate):
-  """Holds a map from device to variables."""
-  # TODO(josh11b): Support changing the set of variables if e.g. if new
-  # devices are joining or a device is to leave.
-
-  def __init__(self, index):
-    # Child class must set self._primary_var before calling
-    # super(...).__init__(index).
-    self._common_name = self._primary_var.name.split(":")[0]
-    # Use a weakref to make it easy to map from the contained values
-    # to the container without introducing a reference cycle.
-    for v in six.itervalues(index):
-      v._distributed_container = weakref.ref(self)  # pylint: disable=protected-access
-    # tf.keras keeps track of variables initialized using this attribute. When
-    # tf.keras gets the default session, it initializes all uninitialized vars.
-    # We need to make _keras_initialized a member of DistributedVariable because
-    # without this it will use `__getattr__` which will delegate to a component
-    # variable.
-    self._keras_initialized = False
-    # Typically, a `DistributedVariable`'s initializer is composed of the
-    # initializers of the components variables. However, in some cases, such as
-    # when restoring from a checkpoint, we may set the _initializer_op
-    # property on the entire `DistributedVariable`.
-    self._initializer_op = None
-    super(DistributedVariable, self).__init__(index)
-
-  def is_initialized(self, name=None):
-    """Identifies if all the component variables are initialized.
-
-    Args:
-      name: Name of the final `logical_and` op.
-
-    Returns:
-      The op that evaluates to True or False depending on if all the
-      component variables are initialized.
-    """
-    # We have to cast the self._index.values() to a `list` because when we
-    # use `model_to_estimator` to run tf.keras models, self._index.values() is
-    # of type `dict_values` and not `list`.
-    values_list = list(self._index.values())
-    result = values_list[0].is_initialized()
-    # We iterate through the list of values except the last one to allow us to
-    # name the final `logical_and` op the same name that is passed by the user
-    # to the `is_initialized` op. For distributed variables, the
-    # `is_initialized` op is a `logical_and` op.
-    for v in values_list[1:-1]:
-      result = math_ops.logical_and(result, v.is_initialized())
-    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
-                                  name=name)
-    return result
-
-  @property
-  def initializer(self):
-    if self._initializer_op:
-      init_op = self._initializer_op
-    else:
-      # return grouped ops of all the var initializations of component values of
-      # the mirrored variable
-      init_op = control_flow_ops.group(
-          [v.initializer for v in self._index.values()])
-    return init_op
-
-  @property
-  def graph(self):
-    return self._primary_var.graph
-
-  @property
-  def _shared_name(self):
-    return self._common_name
-
-  @property
-  def _unique_id(self):
-    return self._primary_var._unique_id   # pylint: disable=protected-access
-
-  @property
-  def name(self):
-    return self._primary_var.name
-
-  @property
-  def dtype(self):
-    return self._primary_var.dtype
-
-  @property
-  def shape(self):
-    return self._primary_var.shape
-
-  def get_shape(self):
-    return self._primary_var.get_shape()
-
-  def to_proto(self, export_scope=None):
-    return self._primary_var.to_proto(export_scope=export_scope)
-
-  @property
-  def op(self):
-    # We want cross-tower code that does some var.op.X calls
-    # to work (even if the current device isn't in self.devices), but
-    # other uses of var.op in a cross-tower context to fail.
-    if distribution_strategy_context.get_cross_tower_context():
-      return DistributedVarOp(self._primary_var.op.name,
-                              self._primary_var.op.graph,
-                              self._primary_var.op.type)
-    return self.get().op
-
-  @property
-  def _in_graph_mode(self):
-    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
-
-  def read_value(self):
-    return distribution_strategy_context.get_distribution_strategy().read_var(
-        self)
-
-  def _should_act_as_resource_variable(self):
-    """Pass resource_variable_ops.is_resource_variable check."""
-    pass
-
-
-ops.register_dense_tensor_like_type(DistributedVariable)
-
-
-class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
-  """Class for defining how to restore a MirroredVariable."""
-
-  def __init__(self, mirrored_variable, primary_variable, name):
-    self._mirrored_variable = mirrored_variable
-    super(_MirroredSaveable, self).__init__(primary_variable, "", name)
-
-  def restore(self, restored_tensors, restored_shapes):
-    """Restore the same value into all variables."""
-    tensor, = restored_tensors
-    return control_flow_ops.group([
-        _assign_on_device(d, v, tensor)
-        for d, v in six.iteritems(self._mirrored_variable._index)])  # pylint: disable=protected-access
-
-
-class MirroredVariable(DistributedVariable, Mirrored,
-                       checkpointable.CheckpointableBase):
-  """Holds a map from device to variables whose values are kept in sync."""
-
-  def __init__(self, index, primary_var, aggregation):
-    self._primary_var = primary_var
-    self._aggregation = aggregation
-    super(MirroredVariable, self).__init__(index)
-
-  # The arguments to update() are automatically unwrapped so the update()
-  # function would normally see regular variables, not MirroredVariables.
-  # However, the update function can still operate on wrapped MirroredVariables
-  # through object members, captured arguments, etc. This is more likely in an
-  # update_non_slot() function (like OptimizerV2._finish), which can
-  # update several non-slot variables in one call.
-  def _assign_func(self, *args, **kwargs):
-    f = kwargs.pop("f")
-    if distribution_strategy_context.get_cross_tower_context():
-      update_device = distribute_lib.get_update_device()
-      if update_device is not None:
-        # We are calling an assign function on the mirrored variable in an
-        # update context.
-        v = self.get(device=update_device)
-        return f(v, *args, **kwargs)
-
-      # We are calling assign on the mirrored variable in cross tower context,
-      # use update to update the variable.
-      strategy = distribution_strategy_context.get_distribution_strategy()
-      return strategy.update(self, f, *args, **kwargs)
-    else:
-      _assert_tower_context()
-      # We are calling an assign function on the mirrored variable in tower
-      # context.
-      # We reduce the value we want to assign/add/sub. More details about how we
-      # handle the different use cases can be found in the _reduce method.
-      # We call the function on each of the mirrored variables with the reduced
-      # value.
-      if self._aggregation == vs.VariableAggregation.NONE:
-        raise ValueError("You must specify an aggregation method to update a "
-                         "MirroredVariable in Tower Context.")
-
-      def merge_fn(strategy, value, *other_args, **other_kwargs):
-        return strategy.update(
-            self, f,
-            strategy.reduce(
-                aggregation=self._aggregation, value=value, destinations=self),
-            *other_args, **other_kwargs)
-
-      return distribution_strategy_context.get_tower_context().merge_call(
-          merge_fn, *args, **kwargs)
-
-  def assign_sub(self, *args, **kwargs):
-    assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
-    return self._assign_func(f=assign_sub_fn, *args, **kwargs)
-
-  def assign_add(self, *args, **kwargs):
-    assign_add_fn = lambda var, *a, **kw: var.assign_add(*a, **kw)
-    return self._assign_func(f=assign_add_fn, *args, **kwargs)
-
-  def assign(self, *args, **kwargs):
-    assign_fn = lambda var, *a, **kw: var.assign(*a, **kw)
-    return self._assign_func(f=assign_fn, *args, **kwargs)
-
-  @property
-  def aggregation(self):
-    return self._aggregation
-
-  def _get_cross_tower(self):
-    device = device_util.canonicalize(device_util.current())
-    if device in self._index:
-      return array_ops.identity(self._index[device])
-    return array_ops.identity(self._primary_var)
-
-  def _as_graph_element(self):
-    # pylint: disable=protected-access
-    if distribution_strategy_context.get_cross_tower_context():
-      return self._primary_var._as_graph_element()
-    return self.get()._as_graph_element()
-
-  def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
-
-    This allows both name-based and object-based save and restore of
-    MirroredVariables.
-
-    Returns:
-      A dictionary mapping attribute names to `SaveableObject` factories.
-    """
-    def _saveable_factory(name=self._common_name):
-      return _MirroredSaveable(self, self._primary_var, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
-
-
-# Register a conversion function which reads the value of the variable,
-# allowing instances of the class to be used as tensors.
-def _tensor_conversion_mirrored(var, dtype=None, name=None, as_ref=False):
-  # Try to avoid assignments to and other mutations of MirroredVariable
-  # state except through a DistributionStrategy.update() call.
-  assert not as_ref
-  return ops.internal_convert_to_tensor(
-      var.get(), dtype=dtype, name=name, as_ref=as_ref)
-
-
-ops.register_tensor_conversion_function(MirroredVariable,
-                                        _tensor_conversion_mirrored)
-
-
-def _enclosing_tpu_context():
-  # pylint: disable=protected-access
-  tpu_context = ops.get_default_graph()._get_control_flow_context()
-  # pylint: enable=protected-access
-  while tpu_context is not None and not isinstance(
-      tpu_context, control_flow_ops.XLAControlFlowContext):
-    tpu_context = tpu_context.outer_context
-  return tpu_context
-
-
-# TODO(jhseu): Deduplicate code. We copy code because we don't want to
-# inherit from DistributedDelegate. DistributedDelegate will not work in a
-# tpu.replicate() because it assumes that you're in a device context where you
-# can operate on a single version of the variable, but a tpu.replicate()
-# operates on all variables and is replicated during a rewrite pass.
-class TPUMirroredVariable(checkpointable.CheckpointableBase):
-  """Holds a map from device to TPU variables whose values are kept in sync."""
-
-  def __init__(self, index, primary_var, aggregation):
-    # Use a weakref to make it easy to map from the contained values
-    # to the container without introducing a reference cycle.
-    for v in six.itervalues(index):
-      v._mirrored_container = weakref.ref(self)  # pylint: disable=protected-access
-    self._index = {device_util.canonicalize(key): value
-                   for key, value in six.iteritems(index)}
-    self._primary_var = primary_var
-    self._common_name = self._primary_var.name.split(":")[0]
-    self._aggregation = aggregation
-    # Needed for GradientTape
-    self._trainable = self._primary_var.trainable
-
-  def _get(self, device=None):
-    """Returns the value for the current device or raises a ValueError."""
-    if device is None:
-      tower_context = distribution_strategy_context.get_tower_context()
-      if tower_context:
-        device = tower_context.device
-      else:
-        device = distribute_lib.get_update_device()
-        if device is None:
-          return self._get_cross_tower()
-    device = device_util.canonicalize(device)
-    try:
-      return self._index[device]
-    except KeyError as e:
-      six.raise_from(
-          ValueError("Device %s not found in %s (current device %s)" %
-                     (device, self._index.keys(), device_util.current())), e)
-
-  # pylint: disable=multiple-statements
-  def __add__(self, o): return self.read_value() + o
-  def __radd__(self, o): return o + self.read_value()
-  def __sub__(self, o): return self.read_value() - o
-  def __rsub__(self, o): return o - self.read_value()
-  def __mul__(self, o): return self.read_value() * o
-  def __rmul__(self, o): return o * self.read_value()
-  def __truediv__(self, o): return self.read_value() / o
-  def __rtruediv__(self, o): return o / self.read_value()
-  def __floordiv__(self, o): return self.read_value() // o
-  def __rfloordiv__(self, o): return o // self.read_value()
-  def __mod__(self, o): return self.read_value() % o
-  def __rmod__(self, o): return o % self.read_value()
-  def __lt__(self, o): return self.read_value() < o
-  def __le__(self, o): return self.read_value() <= o
-  def __gt__(self, o): return self.read_value() > o
-  def __ge__(self, o): return self.read_value() >= o
-  def __and__(self, o): return self.read_value() & o
-  def __rand__(self, o): return o & self.read_value()
-  def __or__(self, o): return self.read_value() | o
-  def __ror__(self, o): return o | self.read_value()
-  def __xor__(self, o): return self.read_value() ^ o
-  def __rxor__(self, o): return o ^ self.read_value()
-  def __getitem__(self, o): return self.read_value()[o]
-  def __pow__(self, o, modulo=None): return pow(self.read_value(), o, modulo)
-  def __rpow__(self, o): return pow(o, self.read_value())
-  def __invert__(self): return ~self.read_value()
-  def __neg__(self): return -self.read_value()
-  def __abs__(self): return abs(self.read_value())
-
-  def __div__(self, o):
-    try:
-      return self.read_value().__div__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __rdiv__(self, o):
-    try:
-      return self.read_value().__rdiv__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __matmul__(self, o):
-    try:
-      return self.read_value().__matmul__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __rmatmul__(self, o):
-    try:
-      return self.read_value().__rmatmul__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  @property
-  def handle(self):
-    # If we're in a tpu.rewrite(), return the replicated handle.
-    tpu_context = _enclosing_tpu_context()
-    if tpu_context is not None:
-      return tpu_context.get_replicated_var_handle(
-          self._common_name, nest.flatten(self._index))
-
-    device = distribute_lib.get_update_device()
-    if device is None:
-      return self._primary_var.handle
-    device = device_util.canonicalize(device)
-    try:
-      return self._index[device].handle
-    except KeyError as e:
-      six.raise_from(
-          ValueError("Device %s not found in %s (current device %s)" %
-                     (device, self._index.keys(), device_util.current())), e)
-
-  # The arguments to update() are automatically unwrapped so the update()
-  # function would normally see regular variables, not MirroredVariables.
-  # However, the update function can still operate on wrapped MirroredVariables
-  # through object members, captured arguments, etc. This is more likely in an
-  # update_non_slot() function (like OptimizerV2._finish), which can
-  # update several non-slot variables in one call.
-  def _assign_func(self, *args, **kwargs):
-    if distribution_strategy_context.get_distribution_strategy().__class__.__name__ != "TPUStrategy":
-      raise ValueError("You may only assign to a TPUMirroredVariable within a "
-                       "TPUStrategy.")
-    f = kwargs.pop("f")
-    if distribution_strategy_context.get_cross_tower_context():
-      if _enclosing_tpu_context() is not None:
-        return distribution_strategy_context.get_distribution_strategy().update(
-            self, f, *args, **kwargs)
-
-      update_device = distribute_lib.get_update_device()
-      # We are calling update on the mirrored variable in cross tower context.
-      if update_device is not None:
-        # We are calling an assign function on the mirrored variable in cross
-        # tower context.
-        v = self._get(device=update_device)
-        return f(v, *args, **kwargs)
-
-      return distribution_strategy_context.get_distribution_strategy().update(
-          self, f, *args, **kwargs)
-    else:
-      _assert_tower_context()
-      # We are calling an assign function on the mirrored variable in tower
-      # context.
-      # We reduce the value we want to assign/add/sub. More details about how we
-      # handle the different use cases can be found in the _reduce method.
-      # We call the function on each of the mirrored variables with the reduced
-      # value.
-      if self._aggregation == vs.VariableAggregation.NONE:
-        raise ValueError("You must specify an aggregation method to update a "
-                         "TPUMirroredVariable in Tower Context.")
-
-      def merge_fn(strategy, value, *other_args, **other_kwargs):
-        return strategy.update(
-            self, f,
-            strategy.reduce(
-                aggregation=self._aggregation, value=value, destinations=self),
-            *other_args, **other_kwargs)
-
-      return distribution_strategy_context.get_tower_context().merge_call(
-          merge_fn, *args, **kwargs)
-
-  @contextlib.contextmanager
-  def _handle_graph(self, handle):
-    # Note: might have an eager tensor but not be executing eagerly when
-    # building functions.
-    if (context.executing_eagerly() or isinstance(handle, ops.EagerTensor)
-        or ops.has_default_graph()):
-      yield
-    else:
-      with handle.graph.as_default():
-        yield
-
-  @property
-  def trainable(self):
-    return self._trainable
-
-  def _read_variable_op(self, parent_op=None):
-    if self.trainable:
-      tape.variable_accessed(self)
-    if parent_op is not None:
-      with ops.control_dependencies([parent_op]):
-        return gen_resource_variable_ops.read_variable_op(
-            self.handle, self.dtype)
-
-    return gen_resource_variable_ops.read_variable_op(
-        self.handle, self.dtype)
-
-  def read_value(self):
-    return self._read_variable_op()
-
-  def assign_sub(self, *args, **kwargs):
-    def assign_sub_fn(var, delta, **kw):
-      name = kw.pop("name", None)
-      read_value = kw.pop("read_value", True)
-      with self._handle_graph(var.handle):
-        op = gen_resource_variable_ops.assign_sub_variable_op(
-            var.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
-            name=name)
-      if read_value:
-        return self._read_variable_op(parent_op=op)
-      return op
-
-    return self._assign_func(f=assign_sub_fn, *args, **kwargs)
-
-  def assign_add(self, *args, **kwargs):
-    def assign_add_fn(var, delta, **kw):
-      name = kw.pop("name", None)
-      read_value = kw.pop("read_value", True)
-      with self._handle_graph(var.handle):
-        op = gen_resource_variable_ops.assign_add_variable_op(
-            var.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
-            name=name)
-      if read_value:
-        return self._read_variable_op(parent_op=op)
-      return op
-
-    return self._assign_func(f=assign_add_fn, *args, **kwargs)
-
-  def assign(self, *args, **kwargs):
-    def assign_fn(var, value, **kw):
-      name = kw.pop("name", None)
-      read_value = kw.pop("read_value", True)
-      with self._handle_graph(var.handle):
-        op = gen_resource_variable_ops.assign_variable_op(
-            var.handle, ops.convert_to_tensor(value, dtype=self.dtype),
-            name=name)
-      if read_value:
-        return self._read_variable_op(parent_op=op)
-      return op
-
-    return self._assign_func(f=assign_fn, *args, **kwargs)
-
-  @property
-  def aggregation(self):
-    return self._aggregation
-
-  @property
-  def constraint(self):
-    return None
-
-  @property
-  def initializer(self):
-    return control_flow_ops.group(
-        [v.initializer for v in nest.flatten(self._index)])
-
-  @property
-  def graph(self):
-    return self._primary_var.graph
-
-  @property
-  def _shared_name(self):
-    return self._common_name
-
-  @property
-  def _unique_id(self):
-    return self._primary_var._unique_id  # pylint: disable=protected-access
-
-  @property
-  def name(self):
-    return self._primary_var.name
-
-  @property
-  def dtype(self):
-    return self._primary_var.dtype
-
-  @property
-  def shape(self):
-    return self._primary_var.shape
-
-  def get_shape(self):
-    return self._primary_var.get_shape()
-
-  def to_proto(self, export_scope=None):
-    return self._primary_var.to_proto(export_scope=export_scope)
-
-  def _get_cross_tower(self):
-    device = device_util.canonicalize(device_util.current())
-    if device in self._index:
-      return self._index[device]
-    return self._primary_var
-
-  def _as_graph_element(self):
-    # pylint: disable=protected-access
-    if distribution_strategy_context.get_cross_tower_context():
-      return self._primary_var._as_graph_element()
-    return self._read_variable_op()
-
-  def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
-
-    This allows both name-based and object-based save and restore of
-    MirroredVariables.
-
-    Returns:
-      A dictionary mapping attribute names to `SaveableObject` factories.
-    """
-    def _saveable_factory(name=self._common_name):
-      return _MirroredSaveable(self, self._primary_var, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
-
-  def _should_act_as_resource_variable(self):
-    """Pass resource_variable_ops.is_resource_variable check."""
-    pass
-
-  # Needed to pass ResourceVariable checks.
-  @property
-  def op(self):
-    return self._primary_var.op
-
-  @property
-  def _in_graph_mode(self):
-    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
-
-  def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
-    """Converts a variable to a tensor."""
-    # pylint: disable=protected-access
-    if _enclosing_tpu_context() is None:
-      return self._get()._dense_var_to_tensor(dtype, name, as_ref)
-    # pylint: enable=protected-access
-    if dtype is not None and dtype != self.dtype:
-      raise NotImplementedError
-    if as_ref:
-      return self.handle
-    else:
-      return self.read_value()
-
-  def is_initialized(self, name=None):
-    """Identifies if all the component variables are initialized.
-
-    Args:
-      name: Name of the final `logical_and` op.
-
-    Returns:
-      The op that evaluates to True or False depending on if all the
-      component variables are initialized.
-    """
-    # TODO(jhseu): Do we need TPU context implementation?
-
-    # We have to cast the self._index.values() to a `list` because when we
-    # use `model_to_estimator` to run tf.keras models, self._index.values() is
-    # of type `dict_values` and not `list`.
-    values_list = nest.flatten(self._index)
-    result = values_list[0].is_initialized()
-    # We iterate through the list of values except the last one to allow us to
-    # name the final `logical_and` op the same name that is passed by the user
-    # to the `is_initialized` op. For distributed variables, the
-    # `is_initialized` op is a `logical_and` op.
-    for v in values_list[1:-1]:
-      result = math_ops.logical_and(result, v.is_initialized())
-    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
-                                  name=name)
-    return result
-
-
-# Register a conversion function which reads the value of the variable,
-# allowing instances of the class to be used as tensors.
-def _tensor_conversion_tpu_mirrored(var, dtype=None, name=None, as_ref=False):
-  return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
-
-
-ops.register_tensor_conversion_function(TPUMirroredVariable,
-                                        _tensor_conversion_tpu_mirrored)
-ops.register_dense_tensor_like_type(TPUMirroredVariable)
-
-
-class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
-  """Class for defining how to restore a TowerLocalVariable."""
-
-  def __init__(self, tower_local_variable, name):
-    self._tower_local_variable = tower_local_variable
-    # We use a callable so that we don't have to evaluate this expression
-    # in the case where we are trying to restore instead of save.
-    def tensor():
-      return distribution_strategy_context.get_distribution_strategy().read_var(
-          tower_local_variable)
-    spec = saver.BaseSaverBuilder.SaveSpec(
-        tensor=tensor,
-        slice_spec="",
-        name=name,
-        dtype=tower_local_variable.dtype)
-    super(_TowerLocalSaveable, self).__init__(tensor, [spec], name)
-
-  def restore(self, restored_tensors, restored_shapes):
-    """Restore the same value into all variables."""
-    tensor, = restored_tensors
-    return self._tower_local_variable.assign(tensor)
-
-
-def _assert_tower_context():
-  if not distribution_strategy_context.get_tower_context():
-    raise RuntimeError(
-        "Tower-local variables may only be assigned in a tower context.")
-
-
-class TowerLocalVariable(DistributedVariable, PerDevice,
-                         checkpointable.CheckpointableBase):
-  """Holds a map from device to variables whose values are reduced on save."""
-
-  def __init__(self, index, primary_var, aggregation):
-    self._primary_var = primary_var
-    self._aggregation = aggregation
-    super(TowerLocalVariable, self).__init__(index)
-
-  def assign_sub(self, *args, **kwargs):
-    _assert_tower_context()
-    return self.get().assign_sub(*args, **kwargs)
-
-  def assign_add(self, *args, **kwargs):
-    _assert_tower_context()
-    return self.get().assign_add(*args, **kwargs)
-
-  def assign(self, *args, **kwargs):
-    if distribution_strategy_context.get_cross_tower_context():
-      # To preserve the sum across save and restore, we have to divide the
-      # total across all devices when restoring a variable that was summed
-      # when saving.
-      tensor = args[0]
-      if self._aggregation == vs.VariableAggregation.SUM:
-        tensor *= 1. / len(self.devices)
-      return control_flow_ops.group(
-          [_assign_on_device(d, v, tensor)
-           for d, v in six.iteritems(self._index)])
-    else:
-      _assert_tower_context()
-      return self.get().assign(*args, **kwargs)
-
-  @property
-  def aggregation(self):
-    return self._aggregation
-
-  def _get_cross_tower(self):
-    if self._aggregation == vs.VariableAggregation.ONLY_FIRST_TOWER:
-      return self._primary_var
-    all_components = tuple(self._index.values())
-    # TODO(josh11b): Use a strategy-specific method.
-    total = math_ops.add_n(all_components)
-    if self._aggregation == vs.VariableAggregation.MEAN:
-      return total * (1./ len(all_components))
-    return total
-
-  def _as_graph_element(self):
-    # pylint: disable=protected-access
-    if distribution_strategy_context.get_cross_tower_context():
-      return self._get_cross_tower()
-    return self.get()._as_graph_element()
-
-  def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
-
-    This allows both name-based and object-based save and restore of
-    TowerLocalVariables.
-
-    Returns:
-      A dictionary mapping attribute names to `SaveableObject` factories.
-    """
-    def _saveable_factory(name=self._common_name):
-      return _TowerLocalSaveable(self, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
-
-
-# Register a conversion function for TowerLocalVariable which allows as_ref to
-# be true.
-def _tensor_conversion_tower_local(var, dtype=None, name=None, as_ref=False):
-  return ops.internal_convert_to_tensor(
-      var.get(), dtype=dtype, name=name, as_ref=as_ref)
-
-
-ops.register_tensor_conversion_function(TowerLocalVariable,
-                                        _tensor_conversion_tower_local)
-
-
-def _devices_match(d1, d2):
-  return device_util.canonicalize(d1) == device_util.canonicalize(d2)
-
-
-def regroup(per_device, wrap_class=PerDevice):
-  """Makes device->nest map into a nest of PerDevice/Mirrored values."""
-  items = list(per_device.items())
-  assert items
-  v0 = items[0][1]  # First value
-
-  if isinstance(v0, list):
-    for _, v in items[1:]:
-      assert isinstance(v, list)
-      assert len(v) == len(v0), ("len(v) == %d, len(v0) == %d, v: %s, v0: %s" %
-                                 (len(v), len(v0), v, v0))
-    return [regroup({k: v[i] for k, v in items}, wrap_class)
-            for i in range(len(v0))]
-
-  if isinstance(v0, tuple):
-    for _, v in items[1:]:
-      assert isinstance(v, tuple)
-      assert len(v) == len(v0)
-    regrouped_tuple = tuple(regroup({k: v[i] for k, v in items}, wrap_class)
-                            for i in range(len(v0)))
-    if hasattr(v0, "_fields"):
-      # This tuple is in fact a namedtuple! Create a new namedtuple instance
-      # and initialize it with the regrouped values:
-      assert hasattr(type(v0), "_make")
-      return type(v0)._make(regrouped_tuple)
-    else:
-      return regrouped_tuple
-
-  if isinstance(v0, dict):
-    v0keys = set(v0.keys())
-    for _, v in items[1:]:
-      assert isinstance(v, dict)
-      assert set(v.keys()) == v0keys
-    return {key: regroup({k: v[key] for k, v in items}, wrap_class)
-            for key in v0keys}
-
-  # If exactly the same object across all devices, return it unwrapped.
-  same_id = True
-  for _, v in items[1:]:
-    if v is not v0:
-      same_id = False
-      break
-  # Consider three cases where same_id is true:
-  # * If v0 is a DistributedVariable (a MirroredVariable or
-  #   TowerLocalVariable, and same_id means it is the same across all
-  #   devices), we want to return it. We check DistributedVariable
-  #   specifically since it can look like it has a
-  #   _distributed_container member since its members do.
-  # * If v0 is a member of a distributed variable, in which case
-  #   hasattr(v0, "_distributed_container") is true, we want to
-  #   return the DistributedVariable that contains it using the
-  #   _distributed_container logic below. This case can trigger
-  #   same_id when there is only one device.
-  # * In any other situation, same_id means we return v0.
-  if same_id and (isinstance(v0, DistributedVariable) or
-                  not hasattr(v0, "_distributed_container")):
-    return v0
-
-  # Detect the case where each device has a parallel component of the
-  # same MirroredVariable (or TowerLocalVariable). In this case we
-  # want to return the containing MirroredVariable, after a bunch of
-  # sanity checking. In particular, each component should have the
-  # same container, and the devices of the variables should match the
-  # keys of the per-device dictionary.
-  if hasattr(v0, "_distributed_container"):
-    # pylint: disable=protected-access
-    assert not isinstance(v0, MirroredVariable), (
-        "ids = %s, items = %s" % ([id(v[1]) for v in items], items))
-    assert _devices_match(v0.device, items[0][0]), (
-        "v0.device = %s, items = %s" % (v0.device, items))
-    distributed_container = v0._distributed_container()
-    assert distributed_container is not None
-    for d, v in items[1:]:
-      assert _devices_match(v.device, d), (
-          "v.device = %s, d = %s, items = %s" % (v.device, d, items))
-      assert distributed_container is v._distributed_container()
-    return distributed_container
-  # pylint: enable=protected-access
-
-  return wrap_class(per_device)
-
-
-def select_device(device, structured):
-  """Specialize a nest of regular & per-device values for one device."""
-  def _get(x):
-    return x.get(device) if isinstance(x, DistributedValues) else x
-
-  return nest.map_structure(_get, structured)
-
-
-def select_device_mirrored(device, structured):
-  """Specialize a nest of regular & mirrored values for one device."""
-  def _get_mirrored(x):
-    if isinstance(x, DistributedValues):
-      if not isinstance(x, Mirrored):
-        raise TypeError(
-            "Expected value to be mirrored across towers: %s in %s." %
-            (x, structured))
-      return x.get(device)
-    else:
-      return x
-
-  return nest.map_structure(_get_mirrored, structured)
-
-
-def update_regroup(strategy, updates, should_group):
-  """Regroup for an update, with dependencies to ensure all updates execute."""
-  regrouped = regroup(updates, Mirrored)
-  if not should_group:
-    return nest.map_structure(strategy.unwrap, regrouped)
-  grouped_flat = []
-  for u in nest.flatten(regrouped):
-    if isinstance(u, DistributedValues):
-      g = strategy.group(u)
-      if u.is_tensor_like:
-        # Make sure we run all updates. Without this, something like
-        # session.run(strategy.update(...)) may only update one tower.
-        index = {}
-        for d in u.devices:
-          with ops.device(d), ops.control_dependencies([g]):
-            index[d] = array_ops.identity(u.get(d))
-        g = Mirrored(index)
-    else:
-      g = u
-    grouped_flat.append(g)
-  return nest.pack_sequence_as(regrouped, grouped_flat)
-
-
-class PerDeviceDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `PerDeviceDataset`."""
-
-  def __init__(self, iterator, devices, prefetch_on_device=None):
-    self._iterator = iterator
-    self._devices = devices
-    self._prefetch_on_device = prefetch_on_device
-
-  @property
-  def initializer(self):
-    return self._iterator.initializer
-
-  def get_next(self, name=None):
-    """Scatter the input across devices."""
-    if self._prefetch_on_device:
-      data_list = self._iterator.get_next(name=name)
-      index = dict(zip(self._devices, data_list))
-    else:
-      batch = self._iterator.get_next(name=name)
-      index = {}
-      def get_ith(i):
-        return lambda x: x[i]
-
-      for i, d in enumerate(self._devices):
-        index[d] = nest.map_structure(get_ith(i), batch)
-        if context.executing_eagerly():
-          with ops.device(d):
-            index[d] = nest.map_structure(array_ops.identity, index[d])
-
-    return regroup(index)
-
-
-class PerDeviceDataset(object):
-  """Like `tf.data.Dataset` split devices, producing `PerDevice` data."""
-
-  def __init__(self, dataset, devices, prefetch_on_device=None):
-    self._devices = devices
-
-    # Default to using prefetching in graph mode, unless specified.
-    # TODO(priyag): Enable prefetching in eager mode.
-    self._prefetch_on_device = prefetch_on_device
-    if self._prefetch_on_device is None:
-      self._prefetch_on_device = not context.executing_eagerly()
-    assert not (self._prefetch_on_device and context.executing_eagerly()), (
-        "Prefetching is only supported in graph mode currently")
-
-    if self._prefetch_on_device:
-      self._dataset = dataset.apply(
-          prefetching_ops_v2.prefetch_to_devices(self._devices))
-    else:
-      # TODO(priyag): If dropping remainder is not appropriate, find another
-      # approach to distributing the dataset when not possible to divide evenly.
-      # Possibly not an issue when we start using PartitionedDataset.
-      self._dataset = dataset.batch(len(devices), drop_remainder=True)
-
-  def make_one_shot_iterator(self):
-    """Get a one time use iterator for the distributed PerDeviceDataset."""
-    dataset_iterator = self._dataset.make_one_shot_iterator()
-    return PerDeviceDataIterator(dataset_iterator, self._devices,
-                                 self._prefetch_on_device)
-
-  def make_initializable_iterator(self):
-    """Get an initializable iterator for the distributed PerDeviceDataset."""
-    dataset_iterator = self._dataset.make_initializable_iterator()
-    return PerDeviceDataIterator(dataset_iterator, self._devices,
-                                 self._prefetch_on_device)
-
-
-class MultiWorkerDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`."""
-
-  def __init__(self, iterators, worker_device_map):
-    """Initialize the MultiWorkerDataIterator object.
-
-    Args:
-      iterators: a dict mapping from each worker to an iterator for
-        that worker.
-      worker_device_map: a dict mapping from each worker's devices to a list of
-        devices that belong to this worker.
-
-    Raises:
-      ValueError: if iterators and worker_device_map are not compatible.
-    """
-    self._iterators = iterators
-    self._worker_device_map = worker_device_map
-    if set(self._iterators) != set(self._worker_device_map):
-      raise ValueError("iterators and worker_device_map are not compatible.")
-
-  @property
-  def initializer(self):
-    return control_flow_ops.group(
-        [iterator.initializer for iterator in self._iterators.values()])
-
-  def get_next(self, name=None):
-    """Scatter the input across hosts and devices."""
-    index = {}
-    for worker, iterator in six.iteritems(self._iterators):
-      if name is not None:
-        d = tf_device.DeviceSpec.from_string(worker)
-        new_name = "%s_%s_%d" % (name, d.job, d.task)
-      else:
-        new_name = None
-      with ops.device(worker):
-        data_per_worker = iterator.get_next(name=new_name)
-
-      worker_devices = self._worker_device_map[worker]
-      # Ungroup these per-device value so as to get a flat map from devices to
-      # values.
-      for d in worker_devices:
-        v = select_device(d, data_per_worker)
-        if d in index:
-          raise ValueError("Duplicated devices in worker_device_map: %r" % v)
-        index[d] = v
-
-    return regroup(index)
-
-
-class MultiWorkerDataset(object):
-  """Like a `tf.data.Dataset` that distributes data to different workers.
-
-  Each worker gets one shard of the input dataset. It is currently not working
-  in
-  eager mode.
-  """
-
-  def __init__(self, dataset_fn, worker_device_map, prefetch_on_device=None,
-               auto_shard=False):
-    """Initialize the MultiWorkerDataset object.
-
-    Args:
-      dataset_fn: a function that returns a `tf.data.Dataset`.
-      worker_device_map: a dict mapping from each worker to a list of devices
-        that belong to this worker.
-      prefetch_on_device: whether to prefetch to devices.
-      auto_shard: whether to auto-shard the dataset.
-    """
-    self._worker_device_map = worker_device_map
-    self._datasets = {}
-    # TODO(yuefengz, priyag): support different set of jobs for input
-    # processing.
-    for i, (worker, worker_devices) in enumerate(
-        six.iteritems(worker_device_map)):
-      with ops.device(worker):
-        worker_input = dataset_fn()
-        if auto_shard:
-          worker_input = input_ops.auto_shard_dataset(
-              worker_input, len(worker_device_map), i)
-        self._datasets[worker] = PerDeviceDataset(
-            worker_input, worker_devices, prefetch_on_device=prefetch_on_device)
-
-  def make_one_shot_iterator(self):
-    iterators = {}
-    for worker, dataset in six.iteritems(self._datasets):
-      with ops.device(worker):
-        iterators[worker] = dataset.make_one_shot_iterator()
-    return MultiWorkerDataIterator(iterators, self._worker_device_map)
-
-  def make_initializable_iterator(self):
-    iterators = {}
-    for worker, dataset in six.iteritems(self._datasets):
-      with ops.device(worker):
-        iterators[worker] = dataset.make_initializable_iterator()
-    return MultiWorkerDataIterator(iterators, self._worker_device_map)
-
-
-class _PerKey(object):
-  """Holds data associated by keys."""
-
-  def __init__(self, *index):
-    # pylint: disable=protected-access
-    self._index = list(index)
-
-  def get(self, iteration):
-    return array_ops.gather(self._index, iteration)
-
-  def get_shape(self):
-    return self._index[-1][-1].get_shape()
-
-  def get_dtype(self):
-    return self._index[-1][-1].dtype
-
-  def __str__(self):
-    return "%s:%s" % (self.__class__.__name__, self._index)
-
-  def __repr__(self):
-    return "%s(%r)" % (self.__class__.__name__, self._index)
-
-
-class PerIteration(_PerKey):
-  """Holds input for multiple iterations at once."""
-
-  def __init__(self, *index):
-    # pylint: disable=protected-access
-    super(PerIteration, self).__init__(*[batch._index for batch in index])
-
-
-class Batches(_PerKey):
-  pass
-
-
-class MultiIterator(object):
-  """Iterator that returns results of multiple get_next()s."""
-
-  def __init__(self, dataset_iterator, iterations, batches_per_iteration):
-    self._dataset_iterator = dataset_iterator
-    self._iterations = iterations
-    self._batches_per_iteration = batches_per_iteration
-
-  def get_next(self, name=None):
-    """Return PerIteration with `iterations x batches_per_iteration` inputs."""
-    data = []
-    for _ in range(self._batches_per_iteration):
-      batch = []
-      for _ in range(self._iterations):
-        batch.append(self._dataset_iterator.get_next(name=name))
-      data.append(batch)
-
-    # Here is an example.  Suppose each get_next returns a tuple of two tensors.
-    # For 3 `iterations` and 2 `batches_per_iteration`, the `data` is:
-    # [[(a,z), (b,y), (c,x)], [(A,Z), (B,Y), (C,X)]]
-    #
-    # After the first `map_structure` it gets transformed to:
-    #  [(Batches(a, A), Batches(z, Z)),
-    #   (Batches(b, B), Batches(y, Y)),
-    #   (Batches(c, C), Batches(x, X))]
-    #
-    # After the second `map_structure` it gets transformed to a tuple of:
-    # (PerIteration([Batches(a, A), Batches(b, B), Batches(c, C)]),
-    #  PerIteration([Batches(z, Z), Batches(y, Y), Batches(x, X)]))
-
-    data = nest.map_structure(Batches, *data)
-    data = nest.map_structure(PerIteration, *data)
-
-    return data
-
-  @property
-  def initializer(self):
-    return self._dataset_iterator.initializer
-
-
-class PerIterationDataset(object):
-  """A dataset that returns MultiIterators."""
-
-  def __init__(self, dataset, iterations, batches_per_iteration):
-    self._dataset = dataset
-    self._iterations = iterations
-    self._batches_per_iteration = batches_per_iteration
-
-  def make_one_shot_iterator(self):
-    iterator = self._dataset.make_one_shot_iterator()
-    return MultiIterator(iterator, self._iterations,
-                         self._batches_per_iteration)
-
-  def make_initializable_iterator(self):
-    iterator = self._dataset.make_initializable_iterator()
-    return MultiIterator(iterator, self._iterations,
-                         self._batches_per_iteration)
-
-
-class MapOutput(object):
-  """Map can result in multiple outputs per device."""
-
-  def __init__(self, l):
-    self._l = l
-
-  def get(self):
-    return self._l
-
-
-class MultiStepContext(object):
-  """A context object that can be used to capture things when running steps.
-
-  This context object is useful when running multiple steps at a time using the
-  `run_steps_on_dataset` API. For e.g. it allows the user's step function to
-  specify which outputs to emit at what frequency. Currently it supports
-  capturing output from the last step, as well as capturing non tensor outputs.
-  In the future it will be augmented to support other use cases such as output
-  each N steps.
-  """
-
-  def __init__(self):
-    """Initializes an output context.
-
-    Returns:
-      A context object.
-    """
-    self._last_step_outputs = {}
-    self._last_step_outputs_aggregations = {}
-    self._non_tensor_outputs = {}
-
-  @property
-  def last_step_outputs(self):
-    """A dictionary consisting of outputs to be captured on last step.
-
-    Keys in the dictionary are names of tensors to be captured, as specified
-    when `set_last_step_output` is called.
-    Values in the dictionary are the tensors themselves. If
-    `set_last_step_output` was called with an `aggregation` for this output,
-    then the value is the aggregated value.
-
-    Returns:
-      A dictionary with last step outputs.
-    """
-    return self._last_step_outputs
-
-  def _set_last_step_outputs(self, outputs):
-    """Replace the entire dictionary of last step outputs."""
-    if not isinstance(outputs, dict):
-      raise ValueError("Need a dictionary to set last_step_outputs.")
-    self._last_step_outputs = outputs
-
-  def set_last_step_output(self, name, output,
-                           aggregation=variables_lib.VariableAggregation.NONE):
-    """Set `output` with `name` to be outputted from the last step.
-
-    Args:
-      name: String, name to identify the output. Doesn't need to match tensor
-        name.
-      output: The tensors that should be outputted with `name`. See below for
-        actual types supported.
-      aggregation: Aggregation method to use to aggregate outputs from multiple
-        towers. Required if `set_last_step_output` is called in a tower context.
-        Optional in cross_tower_context.
-        When present, the outputs from all the towers are aggregated using the
-        current distribution strategy's `reduce` method. Hence, the type of
-        `output` must be what's supported by the corresponding `reduce` method.
-        For e.g. if using MirroredStrategy and aggregation is set, output
-        must be a `PerDevice` value.
-        The aggregation method is also recorded in a dictionary
-        `_last_step_outputs_aggregations` for later interpreting of the
-        outputs as already reduced or not.
-
-    """
-    if distribution_strategy_context.get_cross_tower_context():
-      self._last_step_outputs_aggregations[name] = aggregation
-      if aggregation is variables_lib.VariableAggregation.NONE:
-        self._last_step_outputs[name] = output
-      else:
-        distribution = distribution_strategy_context.get_distribution_strategy()
-        self._last_step_outputs[name] = distribution.reduce(
-            aggregation, output, destinations="/device:CPU:0")
-    else:
-      assert aggregation is not variables_lib.VariableAggregation.NONE
-      def merge_fn(distribution, value):
-        self._last_step_outputs[name] = distribution.reduce(
-            aggregation, value, destinations="/device:CPU:0")
-        # Setting this inside the `merge_fn` because all towers share the same
-        # context object, so it's more robust to set it only once (even if all
-        # the towers are trying to set the same value).
-        self._last_step_outputs_aggregations[name] = aggregation
-
-      distribution_strategy_context.get_tower_context().merge_call(
-          merge_fn, output)
-
-  @property
-  def non_tensor_outputs(self):
-    """A dictionary consisting of any non tensor outputs to be captured."""
-    return self._non_tensor_outputs
-
-  def set_non_tensor_output(self, name, output):
-    """Set `output` with `name` to be captured as a non tensor output."""
-    if distribution_strategy_context.get_cross_tower_context():
-      self._non_tensor_outputs[name] = output
-    else:
-      def merge_fn(distribution, value):
-        # NOTE(priyag): For non tensor outputs, we simply return all the values
-        # in a list as aggregation doesn't make sense on non tensors.
-        self._non_tensor_outputs[name] = distribution.unwrap(value)
-      distribution_strategy_context.get_tower_context().merge_call(
-          merge_fn, output)
-
-
-def value_container(val):
-  """Returns the container that this per-device `value` belongs to.
-
-  Args:
-    val: A value returned by `call_for_each_tower()` or a variable
-      created in `scope()`.
-
-  Returns:
-    A container that `value` belongs to.
-    If value does not belong to any container (including the case of
-    container having been destroyed), returns the value itself.
-  """
-  # pylint: disable=protected-access
-  if (hasattr(val, "_distributed_container") and
-      # DistributedVariable has _distributed_container defined
-      # but we don't want to return it.
-      not isinstance(val, DistributedVariable)):
-    container = val._distributed_container()
-    # pylint: disable=protected-access
-    if container is not None:
-      return container
-  return val
-
-
-# TODO(josh11b): Descend from Variable.
-class AggregatingVariable(checkpointable.CheckpointableBase):
-  """A wrapper around a variable that aggregates updates across towers."""
-
-  def __init__(self, v, aggregation):
-    self._v = v
-    # TODO(josh11b): Set v._distributed_container?
-    # v._distributed_container = weakref.ref(self)  # pylint: disable=protected-access
-    self._aggregation = aggregation
-
-  def get(self):
-    return self._v
-
-  def __getattr__(self, name):
-    return getattr(self._v, name)
-
-  def _assign_func(self, *args, **kwargs):
-    f = kwargs.pop("f")
-    if distribution_strategy_context.get_cross_tower_context():
-      update_device = distribute_lib.get_update_device()
-      if update_device is not None:
-        # We are calling an assign function in an update context.
-        return f(self._v, *args, **kwargs)
-
-      # We are calling an assign function in cross tower context, wrap it in an
-      # update call.
-      return distribution_strategy_context.get_distribution_strategy().update(
-          self, f, *args, **kwargs)
-    else:
-      assert distribution_strategy_context.get_tower_context()
-      # We are calling an assign function in tower context.
-      # We reduce the value we want to assign/add/sub. More details about how we
-      # handle the different use cases can be found in the _reduce method.
-      # We call the function with the reduced value.
-      if self._aggregation == vs.VariableAggregation.NONE:
-        raise ValueError("You must specify an aggregation method to update a "
-                         "a variable in Tower Context.")
-
-      def merge_fn(strategy, value, *other_args, **other_kwargs):
-        return strategy.update(
-            self, f,
-            strategy.reduce(
-                aggregation=self._aggregation, value=value, destinations=self),
-            *other_args, **other_kwargs)
-
-      return distribution_strategy_context.get_tower_context().merge_call(
-          merge_fn, *args, **kwargs)
-
-  def assign_sub(self, *args, **kwargs):
-    assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
-    return self._assign_func(f=assign_sub_fn, *args, **kwargs)
-
-  def assign_add(self, *args, **kwargs):
-    assign_add_fn = lambda var, *a, **kw: var.assign_add(*a, **kw)
-    return self._assign_func(f=assign_add_fn, *args, **kwargs)
-
-  def assign(self, *args, **kwargs):
-    assign_fn = lambda var, *a, **kw: var.assign(*a, **kw)
-    return self._assign_func(f=assign_fn, *args, **kwargs)
-
-  @property
-  def aggregation(self):
-    return self._aggregation
-
-  @property
-  def name(self):
-    return self._v.name
-
-  @property
-  def dtype(self):
-    return self._v.dtype
-
-  # TODO(josh11b): Test saving & restoring.
-  def _gather_saveables_for_checkpoint(self):
-    return {checkpointable.VARIABLE_VALUE_KEY: self._v}
-
-  # pylint: disable=multiple-statements
-  def __add__(self, o): return self._v + o
-  def __radd__(self, o): return o + self._v
-  def __sub__(self, o): return self._v - o
-  def __rsub__(self, o): return o - self._v
-  def __mul__(self, o): return self._v * o
-  def __rmul__(self, o): return o * self._v
-  def __truediv__(self, o): return self._v / o
-  def __rtruediv__(self, o): return o / self._v
-  def __floordiv__(self, o): return self._v // o
-  def __rfloordiv__(self, o): return o // self._v
-  def __mod__(self, o): return self._v % o
-  def __rmod__(self, o): return o % self._v
-  def __lt__(self, o): return self._v < o
-  def __le__(self, o): return self._v <= o
-  def __gt__(self, o): return self._v > o
-  def __ge__(self, o): return self._v >= o
-  def __and__(self, o): return self._v & o
-  def __rand__(self, o): return o & self._v
-  def __or__(self, o): return self._v | o
-  def __ror__(self, o): return o | self._v
-  def __xor__(self, o): return self._v ^ o
-  def __rxor__(self, o): return o ^ self._v
-  def __getitem__(self, o): return self._v[o]
-  def __pow__(self, o, modulo=None): return pow(self._v, o, modulo)
-  def __rpow__(self, o): return pow(o, self._v)
-  def __invert__(self): return ~self._v
-  def __neg__(self): return -self._v
-  def __abs__(self): return abs(self._v)
-
-  def __div__(self, o):
-    try:
-      return self._v.__div__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __rdiv__(self, o):
-    try:
-      return self._v.__rdiv__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __matmul__(self, o):
-    try:
-      return self._v.__matmul__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __rmatmul__(self, o):
-    try:
-      return self._v.__rmatmul__(o)
-    except AttributeError:
-      # See https://docs.python.org/3/library/constants.html#NotImplemented
-      return NotImplemented
-
-  def __str__(self):
-    return str(self._v)
-
-  def __repr__(self):
-    return repr(self._v)
-
-  def _should_act_as_resource_variable(self):
-    """Pass resource_variable_ops.is_resource_variable check."""
-    pass
-
-
-# Register a conversion function which reads the value of the variable,
-# allowing instances of the class to be used as tensors.
-def _tensor_conversion_aggregate(var, dtype=None, name=None, as_ref=False):
-  return ops.internal_convert_to_tensor(
-      var.get(), dtype=dtype, name=name, as_ref=as_ref)
-
-
-ops.register_tensor_conversion_function(
-    AggregatingVariable, _tensor_conversion_aggregate)
-ops.register_dense_tensor_like_type(AggregatingVariable)
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 121d2fbb3fbccd913599a581b3de9850ab33eae0..538b859f3d1ece55b460f6dbf8f01540a6013381 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -18,14 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import os
+from absl.testing import parameterized
 
-from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.contrib.distribute.python import values
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import model_fn as model_fn_lib
@@ -35,10 +37,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.training import device_util
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import nest
 
@@ -190,10 +192,10 @@ def _make_mirrored():
 
 class RegroupAndSelectDeviceTest(test.TestCase):
 
-  def _is_per_device(self, result, expected, klass=values.PerDevice):
+  def _is_per_replica(self, result, expected, klass=values.PerReplica):
     self.assertIsInstance(result, klass)
     # We canonicalize the devices to match the device strings returned
-    # by PerDevice, which also does device string canonicalization.
+    # by PerReplica, which also does device string canonicalization.
     devices = [device_util.canonicalize(_device_str(i))
                for i in range(len(expected))]
     self.assertEqual(set(devices), set(result.devices))
@@ -206,18 +208,18 @@ class RegroupAndSelectDeviceTest(test.TestCase):
                              _device_str(1): _nested_value("2")})
     self.assertIsInstance(result, tuple)
     self.assertEqual(3, len(result))
-    self._is_per_device(result[0], ["a1", "a2"])
-    self._is_per_device(result[2], ["h1", "h2"])
+    self._is_per_replica(result[0], ["a1", "a2"])
+    self._is_per_replica(result[2], ["h1", "h2"])
 
     self.assertIsInstance(result[1], list)
     self.assertEqual(3, len(result[1]))
-    self._is_per_device(result[1][0], ["b1", "b2"])
-    self._is_per_device(result[1][2], ["g1", "g2"])
+    self._is_per_replica(result[1][0], ["b1", "b2"])
+    self._is_per_replica(result[1][2], ["g1", "g2"])
 
     self.assertIsInstance(result[1][1], dict)
     self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
-    self._is_per_device(result[1][1]["c"], ["d1", "d2"])
-    self._is_per_device(result[1][1]["e"], ["f1", "f2"])
+    self._is_per_replica(result[1][1]["c"], ["d1", "d2"])
+    self._is_per_replica(result[1][1]["e"], ["f1", "f2"])
 
     # Also test that we can undo the merge using select_device()
     self.assertEqual(_nested_value("1"),
@@ -238,18 +240,18 @@ class RegroupAndSelectDeviceTest(test.TestCase):
                             values.Mirrored)
     self.assertIsInstance(result, tuple)
     self.assertEqual(3, len(result))
-    self._is_per_device(result[0], ["a1", "a2"], values.Mirrored)
-    self._is_per_device(result[2], ["h1", "h2"], values.Mirrored)
+    self._is_per_replica(result[0], ["a1", "a2"], values.Mirrored)
+    self._is_per_replica(result[2], ["h1", "h2"], values.Mirrored)
 
     self.assertIsInstance(result[1], list)
     self.assertEqual(3, len(result[1]))
-    self._is_per_device(result[1][0], ["b1", "b2"], values.Mirrored)
-    self._is_per_device(result[1][2], ["g1", "g2"], values.Mirrored)
+    self._is_per_replica(result[1][0], ["b1", "b2"], values.Mirrored)
+    self._is_per_replica(result[1][2], ["g1", "g2"], values.Mirrored)
 
     self.assertIsInstance(result[1][1], dict)
     self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
-    self._is_per_device(result[1][1]["c"], ["d1", "d2"], values.Mirrored)
-    self._is_per_device(result[1][1]["e"], ["f1", "f2"], values.Mirrored)
+    self._is_per_replica(result[1][1]["c"], ["d1", "d2"], values.Mirrored)
+    self._is_per_replica(result[1][1]["e"], ["f1", "f2"], values.Mirrored)
 
     # Also test that we can undo the merge using select_device()
     self.assertEqual(_nested_value("1"),
@@ -275,7 +277,7 @@ class RegroupAndSelectDeviceTest(test.TestCase):
                              _device_str(1): ("b", foo)})
     self.assertIsInstance(result, tuple)
     self.assertEqual(2, len(result))
-    self._is_per_device(result[0], ["a", "b"])
+    self._is_per_replica(result[0], ["a", "b"])
     self.assertIs(foo, result[1])
 
     # Test select_device(), should undo the merge done by regroup().
@@ -325,72 +327,46 @@ class RegroupAndSelectDeviceTest(test.TestCase):
 
       self.assertTrue(
           isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec))
-      self.assertEquals(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
+      self.assertEqual(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
       for device_id in range(3):
         d = _device_str(device_id)
-        self.assertEquals(created_estimator_specs[device_id].loss,
-                          merged_estimator_spec.loss.get(d))
-        self.assertEquals(created_estimator_specs[device_id].train_op,
-                          merged_estimator_spec.train_op.get(d))
+        self.assertEqual(created_estimator_specs[device_id].loss,
+                         merged_estimator_spec.loss.get(d))
+        self.assertEqual(created_estimator_specs[device_id].train_op,
+                         merged_estimator_spec.train_op.get(d))
         # Scaffold is populated by `EstimatorSpec.__new__`.
-        self.assertEquals(created_estimator_specs[device_id].scaffold,
-                          merged_estimator_spec.scaffold.get(d))
+        self.assertEqual(created_estimator_specs[device_id].scaffold,
+                         merged_estimator_spec.scaffold.get(d))
         # Also test that we can undo the merge using select_device()
-        self.assertEquals(created_estimator_specs[device_id],
-                          values.select_device(_device_str(device_id),
-                                               merged_estimator_spec))
+        self.assertEqual(created_estimator_specs[device_id],
+                         values.select_device(_device_str(device_id),
+                                              merged_estimator_spec))
 
 
-class PerDeviceDatasetTest(test.TestCase):
+class PerReplicaDatasetTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
 
-  def _test_iterator_no_prefetch(self, devices, dataset, expected_values):
-    per_device_dataset = values.PerDeviceDataset(
-        dataset, devices, prefetch_on_device=False)
-    iterator = per_device_dataset.make_one_shot_iterator()
+  def _test_iterator(self, devices, dataset, expected_values):
+    per_replica_dataset = values.PerReplicaDataset(dataset, devices)
+    if context.executing_eagerly():
+      iterator = per_replica_dataset.make_one_shot_iterator()
+    else:
+      iterator = per_replica_dataset.make_initializable_iterator()
+      self.evaluate([iterator.initializer])
 
     for expected_value in expected_values:
       next_element = iterator.get_next()
-      actual = self.evaluate([
-          values.select_device(d, next_element) for d in devices])
-      self.assertEqual(expected_value, actual)
+      computed_value = self.evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, computed_value)
 
     with self.assertRaises(errors.OutOfRangeError):
       next_element = iterator.get_next()
       self.evaluate([
           values.select_device(d, next_element) for d in devices])
 
-  def _test_iterator_with_prefetch(self, devices, dataset, expected_values):
-    if not context.executing_eagerly():
-      per_device_dataset = values.PerDeviceDataset(
-          dataset, devices, prefetch_on_device=True)
-      iterator = per_device_dataset.make_one_shot_iterator()
-
-      # With prefetching, we cannot guarantee which input ends up on which
-      # device, so we verify that the complete set seen on all devices is
-      # correct, and equal numbers are distributed to each device.
-      combined_actual = []
-      combined_expected = []
-      for expected_value in expected_values:
-        next_element = iterator.get_next()
-        combined_actual.extend(
-            self.evaluate(
-                [values.select_device(d, next_element) for d in devices]))
-        combined_expected.extend(expected_value)
-
-      self.assertEqual(set(combined_expected), set(combined_actual))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        next_element = iterator.get_next()
-        self.evaluate([
-            values.select_device(d, next_element) for d in devices])
-
-  def _test_iterator(self, devices, dataset, expected_values):
-    self._test_iterator_no_prefetch(devices, dataset, expected_values)
-    self._test_iterator_with_prefetch(devices, dataset, expected_values)
-
   @test_util.run_in_graph_and_eager_modes
   def testOneDevice(self):
     devices = ["/device:CPU:0"]
@@ -445,9 +421,8 @@ class PerDeviceDatasetTest(test.TestCase):
       dataset = dataset_ops.Dataset.from_tensor_slices(
           random_ops.random_uniform((10,)))
 
-      per_device_dataset = values.PerDeviceDataset(
-          dataset, devices, prefetch_on_device=False)
-      iterator = per_device_dataset.make_initializable_iterator()
+      per_replica_dataset = values.PerReplicaDataset(dataset, devices)
+      iterator = per_replica_dataset.make_initializable_iterator()
 
       self.evaluate(iterator.initializer)
       next_element = iterator.get_next()
@@ -466,7 +441,7 @@ class PerDeviceDatasetTest(test.TestCase):
 
 class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
 
-  def _test_iterator(self, iterator, devices, expected_values):
+  def _test_iterator(self, sess, iterator, devices, expected_values):
     next_element = iterator.get_next()
     for device in devices:
       v = values.select_device(device, next_element)
@@ -475,73 +450,79 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
         self.assertTrue(element.device in device)
 
     for expected_value in expected_values:
-      actual = self.evaluate(
+      actual = sess.run(
           [values.select_device(d, next_element) for d in devices])
       self.assertEqual(expected_value, actual)
 
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate([values.select_device(d, next_element) for d in devices])
+      sess.run([values.select_device(d, next_element) for d in devices])
 
-  def _test_dataset(self, dataset_fn, worker_device_map, devices,
-                    expected_values):
+  def _test_dataset(self, dataset_fn, worker_devices, devices,
+                    expected_values, auto_shard=True):
     multi_worker_dataset = values.MultiWorkerDataset(
-        dataset_fn, worker_device_map, prefetch_on_device=False)
-    multi_worker_iterator = multi_worker_dataset.make_one_shot_iterator()
-    self._test_iterator(multi_worker_iterator, devices, expected_values)
+        dataset_fn, worker_devices, auto_shard=auto_shard)
+    multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
+    with self.cached_session() as sess:
+      sess.run(multi_worker_iterator.initializer)
+      self._test_iterator(sess, multi_worker_iterator, devices, expected_values)
 
   def _cpu_devices(self):
-    worker_device_map = collections.OrderedDict(
-        [("/job:worker/replica:0/task:0",
-          ["/job:worker/replica:0/task:0/device:CPU:0"]),
-         ("/job:worker/replica:0/task:1",
-          ["/job:worker/replica:0/task:1/device:CPU:0"])])
+    worker_devices = [
+        ("/job:worker/replica:0/task:0",
+         ["/job:worker/replica:0/task:0/device:CPU:0"]),
+        ("/job:worker/replica:0/task:1",
+         ["/job:worker/replica:0/task:1/device:CPU:0"])]
     devices = [
         "/job:worker/replica:0/task:0/device:CPU:0",
         "/job:worker/replica:0/task:1/device:CPU:0"
     ]
-    return worker_device_map, devices
+    return worker_devices, devices
 
   def _cpu_and_one_gpu_devices(self):
-    # The worker_device_map doesn't have to be a OrderDict object, this is just
-    # to simplify the testing so that we can pass expected values as a list
-    # instead of a dict.
-    worker_device_map = collections.OrderedDict(
-        [("/job:worker/replica:0/task:0", [
+    worker_devices = [
+        ("/job:worker/replica:0/task:0", [
             "/job:worker/replica:0/task:0/device:GPU:0",
             "/job:worker/replica:0/task:0/device:CPU:0"
-        ]), ("/job:worker/replica:0/task:1", [
+        ]),
+        ("/job:worker/replica:0/task:1", [
             "/job:worker/replica:0/task:1/device:GPU:0",
             "/job:worker/replica:0/task:1/device:CPU:0"
-        ])])
+        ])
+    ]
     devices = [
         "/job:worker/replica:0/task:0/device:GPU:0",
         "/job:worker/replica:0/task:0/device:CPU:0",
         "/job:worker/replica:0/task:1/device:GPU:0",
         "/job:worker/replica:0/task:1/device:CPU:0"
     ]
-    return worker_device_map, devices
+    return worker_devices, devices
 
   def testDataDistributionOneDevicePerWorker(self):
-    self.skipTest("Temporarily disabled.")
-    worker_device_map, devices = self._cpu_devices()
+    worker_devices, devices = self._cpu_devices()
     with context.graph_mode():
       dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(dataset_fn, worker_device_map, devices,
+      self._test_dataset(dataset_fn, worker_devices, devices,
                          [[0, 1], [2, 3], [4, 5], [6, 7]])
 
+  def testDataDistributionNoAutoShard(self):
+    worker_devices, devices = self._cpu_devices()
+    with context.graph_mode():
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_dataset(dataset_fn, worker_devices, devices,
+                         [[0, 0], [1, 1], [2, 2], [3, 3]],
+                         auto_shard=False)
+
   def testDataDistributionTwoDevicePerWorker(self):
-    self.skipTest("Temporarily disabled.")
     if context.num_gpus() < 1:
       self.skipTest("A GPU is not available for this test.")
-    worker_device_map, devices = self._cpu_and_one_gpu_devices()
+    worker_devices, devices = self._cpu_and_one_gpu_devices()
     with context.graph_mode():
       dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(dataset_fn, worker_device_map, devices,
+      self._test_dataset(dataset_fn, worker_devices, devices,
                          [[0, 2, 1, 3], [4, 6, 5, 7]])
 
   def testTupleDataset(self):
-    self.skipTest("Temporarily disabled.")
-    worker_device_map, devices = self._cpu_devices()
+    worker_devices, devices = self._cpu_devices()
 
     with context.graph_mode():
 
@@ -553,47 +534,221 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
       expected_values = [
           [(i, i**2), (i + 1, (i + 1)**2)] for i in range(0, 8, 2)
       ]
-      self._test_dataset(dataset_fn, worker_device_map, devices,
+      self._test_dataset(dataset_fn, worker_devices, devices,
                          expected_values)
 
   def testInitializableIterator(self):
-    self.skipTest("Temporarily disabled.")
-    worker_device_map, devices = self._cpu_devices()
-    with context.graph_mode():
+    worker_devices, devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
       dataset_fn = lambda: dataset_ops.Dataset.range(8)
       multi_worker_dataset = values.MultiWorkerDataset(
-          dataset_fn, worker_device_map, prefetch_on_device=False)
+          dataset_fn, worker_devices, auto_shard=True)
       multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
 
-      self.evaluate(multi_worker_iterator.initializer)
-      self._test_iterator(multi_worker_iterator, devices,
+      sess.run(multi_worker_iterator.initializer)
+      self._test_iterator(sess, multi_worker_iterator, devices,
                           [[0, 1], [2, 3], [4, 5], [6, 7]])
 
       # After re-initializing the iterator, should be able to iterate again.
-      self.evaluate(multi_worker_iterator.initializer)
-      self._test_iterator(multi_worker_iterator, devices,
+      sess.run(multi_worker_iterator.initializer)
+      self._test_iterator(sess, multi_worker_iterator, devices,
                           [[0, 1], [2, 3], [4, 5], [6, 7]])
 
   def testValueErrorForIterator(self):
-    self.skipTest("Temporarily disabled.")
     # Incompatiable arguments.
     with self.assertRaises(ValueError):
       values.MultiWorkerDataIterator({"w1": None}, {"w1": "d1", "w2": "d2"})
 
     # Test duplicated devices under same worker.
-    worker_device_map, _ = self._cpu_devices()
-    worker_device_map["/job:worker/replica:0/task:0"].append(
-        "/job:worker/replica:0/task:0/device:CPU:0")
+    worker_devices, _ = self._cpu_devices()
+    worker_devices[0][1].append("/job:worker/replica:0/task:0/device:CPU:0")
     with context.graph_mode():
       dataset_fn = lambda: dataset_ops.Dataset.range(8)
       multi_worker_dataset = values.MultiWorkerDataset(
-          dataset_fn, worker_device_map, prefetch_on_device=False)
+          dataset_fn, worker_devices, auto_shard=True)
       multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
       with self.assertRaises(ValueError):
         multi_worker_iterator.get_next()
 
 
-class MirroredVariableTest(test.TestCase):
+class InputIteratorTestBase(test.TestCase):
+
+  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
+                     expected_values, sess=None, split_batch_by=None):
+    devices = nest.flatten([ds for _, ds in worker_device_pairs])
+
+    if input_type == "input_fn":
+      input_contexts = [
+          distribute_lib.InputContext() for _ in worker_device_pairs]
+      input_fn = lambda _: dataset_fn()
+      iterator = values.InputFunctionIterator(input_fn, worker_device_pairs,
+                                              input_contexts)
+    else:
+      iterator = values.DatasetIterator(dataset_fn(), worker_device_pairs,
+                                        split_batch_by)
+
+    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
+
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertAllEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      evaluate([values.select_device(d, next_element) for d in devices])
+
+    # After re-initializing the iterator, should be able to iterate again.
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertAllEqual(expected_value, computed_value)
+
+
+class InputIteratorSingleWorkerTest(InputIteratorTestBase,
+                                    parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDeviceCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesOneGPUOneCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTupleDataset(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    def dataset_fn():
+      dataset1 = dataset_ops.Dataset.range(10)
+      dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+      return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testUnevenDatasetBatches(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["dataset"],
+      split_batch_by=[None, 2],
+      required_gpus=1))
+  def testBatchSplitting(self, input_type, split_batch_by):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    batch_size = 10
+    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
+
+    updated_batch_size = (
+        batch_size // split_batch_by if split_batch_by else batch_size)
+    expected_values = [[range(i, i+updated_batch_size),
+                        range(i+updated_batch_size, i+2*updated_batch_size)]
+                       for i in range(0, 100, updated_batch_size*2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values, sess=None,
+                        split_batch_by=split_batch_by)
+
+
+class InputIteratorMultiWorkerTest(
+    multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase,
+    parameterized.TestCase):
+
+  def _cpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0",
+         ["/job:worker/replica:0/task:0/device:CPU:0"]),
+        ("/job:worker/replica:0/task:1",
+         ["/job:worker/replica:0/task:1/device:CPU:0"])]
+
+  def _cpu_and_one_gpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0", [
+            "/job:worker/replica:0/task:0/device:GPU:0",
+            "/job:worker/replica:0/task:0/device:CPU:0"
+        ]),
+        ("/job:worker/replica:0/task:1", [
+            "/job:worker/replica:0/task:1/device:GPU:0",
+            "/job:worker/replica:0/task:1/device:CPU:0"
+        ])
+    ]
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDevicePerWorker(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesPerWorker(self, input_type):
+    worker_devices = self._cpu_and_one_gpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testTupleDataset(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      def dataset_fn():
+        dataset1 = dataset_ops.Dataset.range(4)
+        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
+        return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+      expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          expected_values, sess)
+
+
+class MirroredVariableTest(test.TestCase, parameterized.TestCase):
 
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
@@ -605,9 +760,9 @@ class MirroredVariableTest(test.TestCase):
 
     v, _, mirrored = _make_mirrored()
 
-    self.assertEquals(v[0].name, mirrored.name)
-    self.assertEquals(v[0].dtype, mirrored.dtype)
-    self.assertEquals(v[0].shape, mirrored.shape)
+    self.assertEqual(v[0].name, mirrored.name)
+    self.assertEqual(v[0].dtype, mirrored.dtype)
+    self.assertEqual(v[0].shape, mirrored.shape)
 
   @test_util.run_in_graph_and_eager_modes(config=config)
   def testVariableOnAnotherDevice(self):
@@ -617,9 +772,9 @@ class MirroredVariableTest(test.TestCase):
     mirrored = values.MirroredVariable(index, v,
                                        variable_scope.VariableAggregation.MEAN)
 
-    self.assertEquals(v.name, mirrored.name)
-    self.assertEquals(v.dtype, mirrored.dtype)
-    self.assertEquals(v.shape, mirrored.shape)
+    self.assertEqual(v.name, mirrored.name)
+    self.assertEqual(v.dtype, mirrored.dtype)
+    self.assertEqual(v.shape, mirrored.shape)
 
   def _assign_mirrored(self, devices, v, new):
     for d, var, n in zip(devices, v, new):
@@ -739,14 +894,13 @@ class MirroredVariableTest(test.TestCase):
     save_path = self._save_normal()
     self._restore_mirrored(save_path)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testFetchAMirroredVariable(self):
-    if context.num_gpus() < 1 or context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test or it's eager mode.")
-
-    with self.session(
-        graph=ops.Graph()) as sess, mirrored_strategy.MirroredStrategy(
-            ["/device:GPU:0"]).scope():
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_one_gpu,
+          combinations.core_mirrored_strategy_with_one_gpu],
+      mode=["graph"]))
+  def testFetchAMirroredVariable(self, distribution):
+    with self.session(graph=ops.Graph()) as sess, distribution.scope():
       with ops.device("/device:GPU:0"):
         v = variable_scope.get_variable(
             name="v", initializer=1., use_resource=True)
@@ -760,7 +914,7 @@ class MirroredVariableTest(test.TestCase):
 _devices = ["/device:GPU:0", "/device:CPU:0"]
 
 
-def _make_tower_local(method):
+def _make_replica_local(method):
   v = []
   index = {}
   for d, n, init in zip(_devices, ["v", "v/replica"], [1., 2.]):
@@ -768,11 +922,11 @@ def _make_tower_local(method):
       v.append(variable_scope.get_variable(
           name=n, initializer=init, use_resource=True))
       index[d] = v[-1]
-  tower_local = values.TowerLocalVariable(index, v[0], method)
-  return v, tower_local
+  replica_local = values.ReplicaLocalVariable(index, v[0], method)
+  return v, replica_local
 
 
-class TowerLocalVariableTest(test.TestCase):
+class ReplicaLocalVariablePropertiesTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
@@ -781,30 +935,51 @@ class TowerLocalVariableTest(test.TestCase):
   def testProperties(self):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
+    v, replica_local = _make_replica_local(
+        variable_scope.VariableAggregation.SUM)
 
-    v, tower_local = _make_tower_local(variable_scope.VariableAggregation.SUM)
-
-    self.assertEquals(v[0].name, tower_local.name)
-    self.assertEquals(v[0].dtype, tower_local.dtype)
-    self.assertEquals(v[0].shape, tower_local.shape)
-    self.assertEquals(variable_scope.VariableAggregation.SUM,
-                      tower_local.aggregation)
+    self.assertEqual(v[0].name, replica_local.name)
+    self.assertEqual(v[0].dtype, replica_local.dtype)
+    self.assertEqual(v[0].shape, replica_local.shape)
+    self.assertEqual(variable_scope.VariableAggregation.SUM,
+                     replica_local.aggregation)
 
   @test_util.run_in_graph_and_eager_modes(config=config)
   def testVariableOnAnotherDevice(self):
     v = variable_scope.get_variable(
         name="v", initializer=[1.], use_resource=True)
     index = {"/job:foo/device:CPU:0": v}
-    tower_local = values.TowerLocalVariable(
+    replica_local = values.ReplicaLocalVariable(
         index, v, variable_scope.VariableAggregation.MEAN)
 
-    self.assertEquals(v.name, tower_local.name)
-    self.assertEquals(v.dtype, tower_local.dtype)
-    self.assertEquals(v.shape, tower_local.shape)
-    self.assertEquals(variable_scope.VariableAggregation.MEAN,
-                      tower_local.aggregation)
+    self.assertEqual(v.name, replica_local.name)
+    self.assertEqual(v.dtype, replica_local.dtype)
+    self.assertEqual(v.shape, replica_local.shape)
+    self.assertEqual(variable_scope.VariableAggregation.MEAN,
+                     replica_local.aggregation)
+
+  def testTensorConversion(self):
+    with context.graph_mode():
+      _, replica_local = _make_replica_local(
+          variable_scope.VariableAggregation.SUM)
+      converted = ops.internal_convert_to_tensor(replica_local, as_ref=False)
+      self.assertIsInstance(converted, ops.Tensor)
+      self.assertEqual(converted.dtype, replica_local.dtype)
+
+      converted = ops.internal_convert_to_tensor(replica_local, as_ref=True)
+      # Resources variable are converted to tensors as well when as_ref is True.
+      self.assertIsInstance(converted, ops.Tensor)
+      self.assertEqual(converted.dtype, replica_local.dtype)
+
 
-  def _assign_tower_local(self, devices, v, new):
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
+class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
+
+  def _assign_replica_local(self, devices, v, new):
     for d, var, n in zip(devices, v, new):
       with ops.device(d):
         self.evaluate(var.assign(n))
@@ -819,86 +994,79 @@ class TowerLocalVariableTest(test.TestCase):
     save_path, _ = self._save_return_saver(sess, var)
     return save_path
 
-  def _dist_scope(self):
-    return mirrored_strategy.MirroredStrategy(_devices).scope()
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveAndRestoreTowerLocalSumOneGraph(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    with self.cached_session(config=self.config) as sess:
-      v, tower_local = _make_tower_local(variable_scope.VariableAggregation.SUM)
+  def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution):
+    with self.cached_session() as sess:
+      v, replica_local = _make_replica_local(
+          variable_scope.VariableAggregation.SUM)
 
       # Overwrite the initial values.
-      self._assign_tower_local(_devices, v, [3., 4.])
+      self._assign_replica_local(_devices, v, [3., 4.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of v[0] + v[1], 7.
-        save_path, saver = self._save_return_saver(sess, tower_local)
+        save_path, saver = self._save_return_saver(sess, replica_local)
 
         # Change the values between save and restore.
-        self._assign_tower_local(_devices, v, [5., 6.])
+        self._assign_replica_local(_devices, v, [5., 6.])
 
         # Restores the saved value of 7. which gets divided equally
         # between the variables.
         saver.restore(sess, save_path)
         self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveAndRestoreTowerLocalMeanOneGraph(self):
+  def testSaveAndRestoreReplicaLocalMeanOneGraph(self, distribution):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
 
-    with self.cached_session(config=self.config) as sess:
-      v, tower_local = _make_tower_local(
+    with self.cached_session() as sess:
+      v, replica_local = _make_replica_local(
           variable_scope.VariableAggregation.MEAN)
 
       # Overwrite the initial values.
-      self._assign_tower_local(_devices, v, [3., 4.])
+      self._assign_replica_local(_devices, v, [3., 4.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of (v[0] + v[1])/2, 3.5.
-        save_path, saver = self._save_return_saver(sess, tower_local)
+        save_path, saver = self._save_return_saver(sess, replica_local)
 
         # Change the values between save and restore.
-        self._assign_tower_local(_devices, v, [5., 6.])
+        self._assign_replica_local(_devices, v, [5., 6.])
 
         # Restores the saved value of 3.5 to both variables.
         saver.restore(sess, save_path)
         self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
 
-  def _save_tower_local_mean(self):
+  def _save_replica_local_mean(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
-      v, tower_local = _make_tower_local(
+      v, replica_local = _make_replica_local(
           variable_scope.VariableAggregation.MEAN)
 
       # Overwrite the initial values.
-      self._assign_tower_local(_devices, v, [3., 4.])
+      self._assign_replica_local(_devices, v, [3., 4.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of (v[0] + v[1])/2, 3.5
-        save_path = self._save(sess, tower_local)
+        save_path = self._save(sess, replica_local)
 
         # Change the values between save and restore.
-        self._assign_tower_local(_devices, v, [5., 6.])
+        self._assign_replica_local(_devices, v, [5., 6.])
     return save_path
 
-  def _save_tower_local_sum(self):
+  def _save_replica_local_sum(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
-      v, tower_local = _make_tower_local("sum")
+      v, replica_local = _make_replica_local("sum")
 
       # Overwrite the initial values.
-      self._assign_tower_local(_devices, v, [1.5, 2.])
+      self._assign_replica_local(_devices, v, [1.5, 2.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of v[0] + v[1], 3.5
-        save_path = self._save(sess, tower_local)
+        save_path = self._save(sess, replica_local)
 
         # Change the values between save and restore.
-        self._assign_tower_local(_devices, v, [5., 6.])
+        self._assign_replica_local(_devices, v, [5., 6.])
     return save_path
 
   def _save_normal(self):
@@ -931,94 +1099,59 @@ class TowerLocalVariableTest(test.TestCase):
       saver.restore(sess, save_path)
       self.assertEqual(3.5, self.evaluate(var))
 
-  def _restore_tower_local_mean(self, save_path):
+  def _restore_replica_local_mean(self, save_path, distribution):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
-      v, tower_local = _make_tower_local(
+      v, replica_local = _make_replica_local(
           variable_scope.VariableAggregation.MEAN)
 
       # Overwrite the initial values.
-      self._assign_tower_local(_devices, v, [7., 8.])
+      self._assign_replica_local(_devices, v, [7., 8.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Restores the saved value of 3.5 to both variables.
-        saver = saver_lib.Saver(var_list=[tower_local])
+        saver = saver_lib.Saver(var_list=[replica_local])
         saver.restore(sess, save_path)
         self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
 
-  def _restore_tower_local_sum(self, save_path):
+  def _restore_replica_local_sum(self, save_path, distribution):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
-      v, tower_local = _make_tower_local(variable_scope.VariableAggregation.SUM)
+      v, replica_local = _make_replica_local(
+          variable_scope.VariableAggregation.SUM)
 
       # Overwrite the initial values.
-      self._assign_tower_local(_devices, v, [7., 8.])
+      self._assign_replica_local(_devices, v, [7., 8.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Restores the saved value of 3.5 to both variables.
-        saver = saver_lib.Saver(var_list=[tower_local])
+        saver = saver_lib.Saver(var_list=[replica_local])
         saver.restore(sess, save_path)
         self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]]))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveTowerLocalRestoreTowerLocalMean(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    save_path = self._save_tower_local_mean()
-    self._restore_tower_local_mean(save_path)
+  def testSaveReplicaLocalRestoreReplicaLocalMean(self, distribution):
+    save_path = self._save_replica_local_mean(distribution)
+    self._restore_replica_local_mean(save_path, distribution)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveTowerLocalRestoreTowerLocalSum(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    save_path = self._save_tower_local_sum()
-    self._restore_tower_local_sum(save_path)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveTowerLocalMeanRestoreNormal(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
+  def testSaveReplicaLocalRestoreReplicaLocalSum(self, distribution):
+    save_path = self._save_replica_local_sum(distribution)
+    self._restore_replica_local_sum(save_path, distribution)
 
-    save_path = self._save_tower_local_mean()
+  def testSaveReplicaLocalMeanRestoreNormal(self, distribution):
+    save_path = self._save_replica_local_mean(distribution)
     self._restore_normal(save_path)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveTowerLocalSumRestoreNormal(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    save_path = self._save_tower_local_sum()
+  def testSaveReplicaLocalSumRestoreNormal(self, distribution):
+    save_path = self._save_replica_local_sum(distribution)
     self._restore_normal(save_path)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveNormalRestoreTowerLocalMean(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
+  def testSaveNormalRestoreReplicaLocalMean(self, distribution):
     save_path = self._save_normal()
-    self._restore_tower_local_mean(save_path)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveNormalRestoreTowerLocalSum(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
+    self._restore_replica_local_mean(save_path, distribution)
 
+  def testSaveNormalRestoreReplicaLocalSum(self, distribution):
     save_path = self._save_normal()
-    self._restore_tower_local_sum(save_path)
-
-  def testTensorConversion(self):
-    with context.graph_mode():
-      _, tower_local = _make_tower_local(variable_scope.VariableAggregation.SUM)
-      converted = ops.internal_convert_to_tensor(tower_local, as_ref=False)
-      self.assertIsInstance(converted, ops.Tensor)
-      self.assertEqual(converted.dtype, tower_local.dtype)
-
-      converted = ops.internal_convert_to_tensor(tower_local, as_ref=True)
-      # Resources variable are converted to tensors as well when as_ref is True.
-      self.assertIsInstance(converted, ops.Tensor)
-      self.assertEqual(converted.dtype, tower_local.dtype)
+    self._restore_replica_local_sum(save_path, distribution)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/warm_starting_util_test.py b/tensorflow/contrib/distribute/python/warm_starting_util_test.py
index 5d57d144c1c16a08280970ecd89eb54f7cf1ffd4..b0bcf9b17456c938204a4892451928daf90b6743 100644
--- a/tensorflow/contrib/distribute/python/warm_starting_util_test.py
+++ b/tensorflow/contrib/distribute/python/warm_starting_util_test.py
@@ -44,7 +44,9 @@ class WarmStartingUtilWithDistributionStrategyTest(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       save_with_distribution=[True, False],
       restore_with_distribution=[True, False],
       mode=["graph"]))
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 60f6b90edcb71f04bca29b90744db201e83cd545..3079175015a9aee1625404902070df8f13b2089c 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -72,7 +72,6 @@ py_library(
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
-        "//tensorflow/python:spectral_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
@@ -80,6 +79,7 @@ py_library(
         "//tensorflow/python:variables",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/signal",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 5cec93c4df2e970f203253be6342bb292f296eb0..5f6b7fe30996aa97653d97bffb007703437c3d14 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -13,74 +13,80 @@
 # limitations under the License.
 # ==============================================================================
 """Classes representing statistical distributions and ops for working with them.
+
+Use [tfp.distributions](/probability/api_docs/python/tfp/distributions) instead.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,wildcard-import,line-too-long,g-importing-member
+from tensorflow.python.util import deprecation
+
+
+# pylint: disable=unused-import,wildcard-import,line-too-long,g-importing-member,g-import-not-at-top
 
-from tensorflow.contrib.distributions.python.ops import bijectors
-from tensorflow.contrib.distributions.python.ops.autoregressive import *
-from tensorflow.contrib.distributions.python.ops.batch_reshape import *
-from tensorflow.contrib.distributions.python.ops.binomial import *
-from tensorflow.contrib.distributions.python.ops.cauchy import *
-from tensorflow.contrib.distributions.python.ops.chi2 import *
-from tensorflow.contrib.distributions.python.ops.conditional_distribution import *
-from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import *
-from tensorflow.contrib.distributions.python.ops.deterministic import *
-from tensorflow.contrib.distributions.python.ops.distribution_util import fill_triangular
-from tensorflow.contrib.distributions.python.ops.distribution_util import fill_triangular_inverse
-from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform
-from tensorflow.contrib.distributions.python.ops.distribution_util import reduce_weighted_logsumexp
-from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse
-from tensorflow.contrib.distributions.python.ops.distribution_util import tridiag
-from tensorflow.contrib.distributions.python.ops.estimator import *
-from tensorflow.contrib.distributions.python.ops.geometric import *
-from tensorflow.contrib.distributions.python.ops.half_normal import *
-from tensorflow.contrib.distributions.python.ops.independent import *
-from tensorflow.contrib.distributions.python.ops.inverse_gamma import *
-from tensorflow.contrib.distributions.python.ops.kumaraswamy import *
-from tensorflow.contrib.distributions.python.ops.logistic import *
-from tensorflow.contrib.distributions.python.ops.mixture import *
-from tensorflow.contrib.distributions.python.ops.mixture_same_family import *
-from tensorflow.contrib.distributions.python.ops.moving_stats import *
-from tensorflow.contrib.distributions.python.ops.mvn_diag import *
-from tensorflow.contrib.distributions.python.ops.mvn_diag_plus_low_rank import *
-from tensorflow.contrib.distributions.python.ops.mvn_full_covariance import *
-from tensorflow.contrib.distributions.python.ops.mvn_tril import *
-from tensorflow.contrib.distributions.python.ops.negative_binomial import *
-from tensorflow.contrib.distributions.python.ops.normal_conjugate_posteriors import *
-from tensorflow.contrib.distributions.python.ops.onehot_categorical import *
-from tensorflow.contrib.distributions.python.ops.poisson import *
-from tensorflow.contrib.distributions.python.ops.poisson_lognormal import *
-from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
-from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
-from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
-from tensorflow.contrib.distributions.python.ops.sample_stats import *
-from tensorflow.contrib.distributions.python.ops.seed_stream import *
-from tensorflow.contrib.distributions.python.ops.sinh_arcsinh import *
-from tensorflow.contrib.distributions.python.ops.test_util import *
-from tensorflow.contrib.distributions.python.ops.vector_diffeomixture import *
-from tensorflow.contrib.distributions.python.ops.vector_exponential_diag import *
-from tensorflow.contrib.distributions.python.ops.vector_laplace_diag import *
-from tensorflow.contrib.distributions.python.ops.vector_sinh_arcsinh_diag import *
-from tensorflow.contrib.distributions.python.ops.wishart import *
-from tensorflow.python.ops.distributions.bernoulli import *
-from tensorflow.python.ops.distributions.beta import *
-from tensorflow.python.ops.distributions.categorical import *
-from tensorflow.python.ops.distributions.dirichlet import *
-from tensorflow.python.ops.distributions.dirichlet_multinomial import *
-from tensorflow.python.ops.distributions.distribution import *
-from tensorflow.python.ops.distributions.exponential import *
-from tensorflow.python.ops.distributions.gamma import *
-from tensorflow.python.ops.distributions.kullback_leibler import *
-from tensorflow.python.ops.distributions.laplace import *
-from tensorflow.python.ops.distributions.multinomial import *
-from tensorflow.python.ops.distributions.normal import *
-from tensorflow.python.ops.distributions.student_t import *
-from tensorflow.python.ops.distributions.transformed_distribution import *
-from tensorflow.python.ops.distributions.uniform import *
+with deprecation.silence():
+  from tensorflow.contrib.distributions.python.ops import bijectors
+  from tensorflow.contrib.distributions.python.ops.autoregressive import *
+  from tensorflow.contrib.distributions.python.ops.batch_reshape import *
+  from tensorflow.contrib.distributions.python.ops.binomial import *
+  from tensorflow.contrib.distributions.python.ops.cauchy import *
+  from tensorflow.contrib.distributions.python.ops.chi2 import *
+  from tensorflow.contrib.distributions.python.ops.conditional_distribution import *
+  from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import *
+  from tensorflow.contrib.distributions.python.ops.deterministic import *
+  from tensorflow.contrib.distributions.python.ops.distribution_util import fill_triangular
+  from tensorflow.contrib.distributions.python.ops.distribution_util import fill_triangular_inverse
+  from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform
+  from tensorflow.contrib.distributions.python.ops.distribution_util import reduce_weighted_logsumexp
+  from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse
+  from tensorflow.contrib.distributions.python.ops.distribution_util import tridiag
+  from tensorflow.contrib.distributions.python.ops.estimator import *
+  from tensorflow.contrib.distributions.python.ops.geometric import *
+  from tensorflow.contrib.distributions.python.ops.half_normal import *
+  from tensorflow.contrib.distributions.python.ops.independent import *
+  from tensorflow.contrib.distributions.python.ops.inverse_gamma import *
+  from tensorflow.contrib.distributions.python.ops.kumaraswamy import *
+  from tensorflow.contrib.distributions.python.ops.logistic import *
+  from tensorflow.contrib.distributions.python.ops.mixture import *
+  from tensorflow.contrib.distributions.python.ops.mixture_same_family import *
+  from tensorflow.contrib.distributions.python.ops.moving_stats import *
+  from tensorflow.contrib.distributions.python.ops.mvn_diag import *
+  from tensorflow.contrib.distributions.python.ops.mvn_diag_plus_low_rank import *
+  from tensorflow.contrib.distributions.python.ops.mvn_full_covariance import *
+  from tensorflow.contrib.distributions.python.ops.mvn_tril import *
+  from tensorflow.contrib.distributions.python.ops.negative_binomial import *
+  from tensorflow.contrib.distributions.python.ops.normal_conjugate_posteriors import *
+  from tensorflow.contrib.distributions.python.ops.onehot_categorical import *
+  from tensorflow.contrib.distributions.python.ops.poisson import *
+  from tensorflow.contrib.distributions.python.ops.poisson_lognormal import *
+  from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
+  from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
+  from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
+  from tensorflow.contrib.distributions.python.ops.sample_stats import *
+  from tensorflow.contrib.distributions.python.ops.seed_stream import *
+  from tensorflow.contrib.distributions.python.ops.sinh_arcsinh import *
+  from tensorflow.contrib.distributions.python.ops.test_util import *
+  from tensorflow.contrib.distributions.python.ops.vector_diffeomixture import *
+  from tensorflow.contrib.distributions.python.ops.vector_exponential_diag import *
+  from tensorflow.contrib.distributions.python.ops.vector_laplace_diag import *
+  from tensorflow.contrib.distributions.python.ops.vector_sinh_arcsinh_diag import *
+  from tensorflow.contrib.distributions.python.ops.wishart import *
+  from tensorflow.python.ops.distributions.bernoulli import *
+  from tensorflow.python.ops.distributions.beta import *
+  from tensorflow.python.ops.distributions.categorical import *
+  from tensorflow.python.ops.distributions.dirichlet import *
+  from tensorflow.python.ops.distributions.dirichlet_multinomial import *
+  from tensorflow.python.ops.distributions.distribution import *
+  from tensorflow.python.ops.distributions.exponential import *
+  from tensorflow.python.ops.distributions.gamma import *
+  from tensorflow.python.ops.distributions.kullback_leibler import *
+  from tensorflow.python.ops.distributions.laplace import *
+  from tensorflow.python.ops.distributions.multinomial import *
+  from tensorflow.python.ops.distributions.normal import *
+  from tensorflow.python.ops.distributions.student_t import *
+  from tensorflow.python.ops.distributions.transformed_distribution import *
+  from tensorflow.python.ops.distributions.uniform import *
 
 # pylint: enable=unused-import,wildcard-import,line-too-long,g-importing-member
 
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
index 9b9b3ce2dd9d42286d2d9657d5f00de8445261f0..99cb105d66885fd5cf8cb6a3f87e2fe82a5bf4d0 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_test.py
@@ -250,13 +250,22 @@ class DistributionTest(test.TestCase):
     mvn_dynamic = tfd.MultivariateNormalDiag(
         loc=array_ops.placeholder(shape=[None, 3], dtype=dtypes.float32),
         name="MVN2")
-    self.assertEqual(
-        ("tfp.distributions.MultivariateNormalDiag("
-         "\"MVN2/\", "
-         "batch_shape=(?,), "  # Partially known.
-         "event_shape=(3,), "
-         "dtype=float32)"),
-        str(mvn_dynamic))
+    if mvn_dynamic.batch_shape._v2_behavior:
+      self.assertEqual(
+          ("tfp.distributions.MultivariateNormalDiag("
+           "\"MVN2/\", "
+           "batch_shape=(None,), "  # Partially known.
+           "event_shape=(3,), "
+           "dtype=float32)"),
+          str(mvn_dynamic))
+    else:
+      self.assertEqual(
+          ("tfp.distributions.MultivariateNormalDiag("
+           "\"MVN2/\", "
+           "batch_shape=(?,), "  # Partially known.
+           "event_shape=(3,), "
+           "dtype=float32)"),
+          str(mvn_dynamic))
 
   def testReprWorksCorrectlyScalar(self):
     normal = tfd.Normal(loc=np.float16(0), scale=np.float16(1))
@@ -300,13 +309,22 @@ class DistributionTest(test.TestCase):
     mvn_dynamic = tfd.MultivariateNormalDiag(
         loc=array_ops.placeholder(shape=[None, 3], dtype=dtypes.float32),
         name="MVN2")
-    self.assertEqual(
-        ("<tfp.distributions.MultivariateNormalDiag"
-         " 'MVN2/'"
-         " batch_shape=(?,)"  # Partially known.
-         " event_shape=(3,)"
-         " dtype=float32>"),
-        repr(mvn_dynamic))
+    if mvn_dynamic.batch_shape._v2_behavior:
+      self.assertEqual(
+          ("<tfp.distributions.MultivariateNormalDiag"
+           " 'MVN2/'"
+           " batch_shape=(None,)"  # Partially known.
+           " event_shape=(3,)"
+           " dtype=float32>"),
+          repr(mvn_dynamic))
+    else:
+      self.assertEqual(
+          ("<tfp.distributions.MultivariateNormalDiag"
+           " 'MVN2/'"
+           " batch_shape=(?,)"  # Partially known.
+           " event_shape=(3,)"
+           " dtype=float32>"),
+          repr(mvn_dynamic))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py
index 29eeaf43c5185ce5519d4a1211f66e99ce61c6ab..ab3c07172a68255f4e387e071ac2f8341e93b90c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py
@@ -82,7 +82,7 @@ class NormalTest(test.TestCase):
       x = constant_op.constant(
           [[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], [2.5, -2.5, -4.0, 0.0, 1.0, -2.0]],
           dtype=dtypes.float32)
-      s = math_ops.reduce_sum(x, reduction_indices=[1])
+      s = math_ops.reduce_sum(x, axis=[1])
       x = array_ops.transpose(x)  # Reshape to shape (6, 2)
       n = constant_op.constant([6] * 2)
       prior = distributions.Normal(loc=mu0, scale=sigma0)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
index a60056c444a3fe7262939c5b3c73673f9a7c1469..cdee30bbc42e661952a9c757d7a30ebcd393f794 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
@@ -147,14 +147,13 @@ class WishartCholeskyTest(test.TestCase):
       x = chol_w.sample(10000, seed=42)
       self.assertAllEqual((10000, 3, 3), x.get_shape())
 
-      moment1_estimate = math_ops.reduce_mean(x, reduction_indices=[0]).eval()
+      moment1_estimate = math_ops.reduce_mean(x, axis=[0]).eval()
       self.assertAllClose(chol_w.mean().eval(), moment1_estimate, rtol=0.05)
 
       # The Variance estimate uses the squares rather than outer-products
       # because Wishart.Variance is the diagonal of the Wishart covariance
       # matrix.
-      variance_estimate = (math_ops.reduce_mean(
-          math_ops.square(x), reduction_indices=[0]) -
+      variance_estimate = (math_ops.reduce_mean(math_ops.square(x), axis=[0]) -
                            math_ops.square(moment1_estimate)).eval()
       self.assertAllClose(
           chol_w.variance().eval(), variance_estimate, rtol=0.05)
diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
index 612376efb7f43b0dfcd3ffeb5437f2a419f66f4d..d450379088813caeac6f3dca72fae99c5f886b5a 100644
--- a/tensorflow/contrib/distributions/python/ops/batch_reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
@@ -429,5 +429,6 @@ def validate_init_args_statically(distribution, batch_shape):
 
   if batch_shape_static.dims is not None:
     if any(
-        dim.value is not None and dim.value < 1 for dim in batch_shape_static):
+        dim.value is not None and
+        dim.value < 1 for dim in batch_shape_static.dims):
       raise ValueError("`batch_shape` elements must be >=-1.")
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index e141f8b5c6423bd6cce4d09da6f49d55b3e25a24..3b17de9b8a903956bfdc4d46cf5bbfbfd8530e9f 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -14,6 +14,8 @@
 # ==============================================================================
 """Bijector Ops.
 
+Use [tfp.bijectors](/probability/api_docs/python/tfp/bijectors) instead.
+
 @@AbsoluteValue
 @@Affine
 @@AffineLinearOperator
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index 3e1e4fc82971b71792d193ea8518dd402e4a4d9d..2358ef5976b2f21c77130c71d5214a463d17bf0e 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -168,11 +168,11 @@ class CholeskyOuterProduct(bijector.Bijector):
           [is_matrix, is_square, is_positive_definite], x)
 
     # Create a vector equal to: [p, p-1, ..., 2, 1].
-    if x.get_shape().ndims is None or x.get_shape()[-1].value is None:
+    if x.get_shape().ndims is None or x.get_shape().dims[-1].value is None:
       p_int = array_ops.shape(x)[-1]
       p_float = math_ops.cast(p_int, dtype=x.dtype)
     else:
-      p_int = x.get_shape()[-1].value
+      p_int = x.get_shape().dims[-1].value
       p_float = np.array(p_int, dtype=x.dtype.as_numpy_dtype)
     exponents = math_ops.linspace(p_float, 1., p_int)
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py b/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py
index 31a9ca27e519bc312813668bf621a875838f12a0..7ae98878986eb10570b5e93a4a57d6bad6b38c0c 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
@@ -104,7 +105,8 @@ class FillTriangular(bijector.Bijector):
     return array_ops.zeros_like(y[..., 0, 0])
 
   def _forward_event_shape(self, input_shape):
-    batch_shape, d = input_shape[:-1], input_shape[-1].value
+    batch_shape, d = (input_shape[:-1],
+                      tensor_shape.dimension_value(input_shape[-1]))
     if d is None:
       n = None
     else:
@@ -113,8 +115,8 @@ class FillTriangular(bijector.Bijector):
 
   def _inverse_event_shape(self, output_shape):
     batch_shape, n1, n2 = (output_shape[:-2],
-                           output_shape[-2].value,
-                           output_shape[-1].value)
+                           tensor_shape.dimension_value(output_shape[-2]),
+                           tensor_shape.dimension_value(output_shape[-1]))
     if n1 is None or n2 is None:
       m = None
     elif n1 != n2:
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index 3b3d8ee6f2dc595983fd5e283d0435e8a227f2ba..c30de1f989a7b83fba1f69a83b96b8f45dea02c6 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import core as layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -237,7 +238,8 @@ class MaskedAutoregressiveFlow(bijector.Bijector):
 
   def _forward(self, x):
     if self._unroll_loop:
-      event_size = x.shape.with_rank_at_least(1)[-1].value
+      event_size = tensor_shape.dimension_value(
+          x.shape.with_rank_at_least(1)[-1])
       if event_size is None:
         raise ValueError(
             "The final dimension of `x` must be known at graph construction "
@@ -260,7 +262,8 @@ class MaskedAutoregressiveFlow(bijector.Bijector):
     # the graph compiler of the maximum number of steps. If not,
     # static_event_size will be None, and the maximum_iterations argument will
     # have no effect.
-    static_event_size = x.shape.with_rank_at_least(1)[-1].value
+    static_event_size = tensor_shape.dimension_value(
+        x.shape.with_rank_at_least(1)[-1])
     y0 = array_ops.zeros_like(x, name="y0")
     # call the template once to ensure creation
     _ = self._shift_and_log_scale_fn(y0)
@@ -405,7 +408,8 @@ def masked_dense(inputs,
        Conference on Machine Learning_, 2015. https://arxiv.org/abs/1502.03509
   """
   # TODO(b/67594795): Better support of dynamic shape.
-  input_depth = inputs.shape.with_rank_at_least(1)[-1].value
+  input_depth = tensor_shape.dimension_value(
+      inputs.shape.with_rank_at_least(1)[-1])
   if input_depth is None:
     raise NotImplementedError(
         "Rightmost dimension must be known prior to graph execution.")
@@ -520,7 +524,8 @@ def masked_autoregressive_default_template(
     def _fn(x):
       """MADE parameterized via `masked_autoregressive_default_template`."""
       # TODO(b/67594795): Better support of dynamic shape.
-      input_depth = x.shape.with_rank_at_least(1)[-1].value
+      input_depth = tensor_shape.dimension_value(
+          x.shape.with_rank_at_least(1)[-1])
       if input_depth is None:
         raise NotImplementedError(
             "Rightmost dimension must be known prior to graph execution.")
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
index 0bcb08cdea7142b82af3116245306a11773ef93c..17e9b8dec9f009415a9a26c3b043afacc2c4ec72 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/real_nvp.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import core as layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -96,16 +97,18 @@ class RealNVP(bijector.Bijector):
 
   # A common choice for a normalizing flow is to use a Gaussian for the base
   # distribution. (However, any continuous distribution would work.) E.g.,
+  num_dims = 3
+  num_samples = 1
   nvp = tfd.TransformedDistribution(
-      distribution=tfd.MultivariateNormalDiag(loc=[0., 0., 0.])),
+      distribution=tfd.MultivariateNormalDiag(loc=np.zeros(num_dims)),
       bijector=tfb.RealNVP(
           num_masked=2,
           shift_and_log_scale_fn=tfb.real_nvp_default_template(
               hidden_layers=[512, 512])))
 
-  x = nvp.sample()
+  x = nvp.sample(num_samples)
   nvp.log_prob(x)
-  nvp.log_prob(0.)
+  nvp.log_prob(np.zeros([num_samples, num_dims]))
   ```
 
   For more examples, see [Jang (2018)][3].
@@ -183,7 +186,8 @@ class RealNVP(bijector.Bijector):
 
   def _cache_input_depth(self, x):
     if self._input_depth is None:
-      self._input_depth = x.shape.with_rank_at_least(1)[-1].value
+      self._input_depth = tensor_shape.dimension_value(
+          x.shape.with_rank_at_least(1)[-1])
       if self._input_depth is None:
         raise NotImplementedError(
             "Rightmost dimension must be known prior to graph execution.")
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
index 71ac29038fc12e7d046df8624c6e3e5bb97d3d8f..ec203e171730a1ef6de6b72c6d96c52d4010d7e6 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -46,7 +47,7 @@ __all__ = [
     "instead of `tf.contrib.distributions`.",
     warn_once=True)
 def _static_ndims_from_shape(shape):
-  return shape.shape.with_rank_at_least(1)[0].value
+  return tensor_shape.dimension_value(shape.shape.with_rank_at_least(1)[0])
 
 
 @deprecation.deprecated(
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
index 20ee0d340833d5c5275e2ab52a89dcdf7198add1..74765f19e584c5de07c6aee4a36ec4e85438f862 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
@@ -110,7 +110,7 @@ class SoftmaxCentered(bijector.Bijector):
 
     # Set shape hints.
     if x.shape.ndims is not None:
-      shape = x.shape[:-1].concatenate(x.shape[-1] + 1)
+      shape = x.shape[:-1].concatenate(x.shape.dims[-1] + 1)
       y.shape.assert_is_compatible_with(shape)
       y.set_shape(shape)
 
@@ -135,7 +135,7 @@ class SoftmaxCentered(bijector.Bijector):
 
     # Set shape hints.
     if y.shape.ndims is not None:
-      shape = y.shape[:-1].concatenate(y.shape[-1] - 1)
+      shape = y.shape[:-1].concatenate(y.shape.dims[-1] - 1)
       x.shape.assert_is_compatible_with(shape)
       x.set_shape(shape)
 
@@ -168,7 +168,7 @@ class SoftmaxCentered(bijector.Bijector):
     #   log_normalization = 1 + reduce_sum(exp(logits))
     #   -log_normalization + reduce_sum(logits - log_normalization)
     log_normalization = nn_ops.softplus(
-        math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
+        math_ops.reduce_logsumexp(x, axis=-1, keepdims=True))
     return array_ops.squeeze(
         (-log_normalization + math_ops.reduce_sum(
             x - log_normalization, axis=-1, keepdims=True)), axis=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
index b4ad33cf6dbf073419a27f378c8eefdba97c5af7..1415f85e5cb5598e99c4d6b8e6c6a2d254503db0 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution_util.py
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -315,7 +316,7 @@ def shapes_from_loc_and_scale(loc, scale, name="shapes_from_loc_and_scale"):
 
     # Static check that event shapes match.
     if loc is not None:
-      loc_event_size = loc.get_shape()[-1].value
+      loc_event_size = tensor_shape.dimension_value(loc.get_shape()[-1])
       if loc_event_size is not None and event_size_const is not None:
         if loc_event_size != 1 and loc_event_size != event_size_const:
           raise ValueError(
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index e1cfff3c66a2bcbc98af8a257dbdea2d916270e2..cf15deebb78b6c92865c34f61d806bc9e9ab3ee1 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -166,8 +166,10 @@ class Independent(distribution_lib.Distribution):
   def _batch_shape_tensor(self):
     with ops.control_dependencies(self._runtime_assertions):
       batch_shape = self.distribution.batch_shape_tensor()
-      batch_ndims = (batch_shape.shape[0].value
-                     if batch_shape.shape.with_rank_at_least(1)[0].value
+      dim0 = tensor_shape.dimension_value(
+          batch_shape.shape.with_rank_at_least(1)[0])
+      batch_ndims = (dim0
+                     if dim0 is not None
                      else array_ops.shape(batch_shape)[0])
       return batch_shape[:batch_ndims - self.reinterpreted_batch_ndims]
 
@@ -182,8 +184,10 @@ class Independent(distribution_lib.Distribution):
   def _event_shape_tensor(self):
     with ops.control_dependencies(self._runtime_assertions):
       batch_shape = self.distribution.batch_shape_tensor()
-      batch_ndims = (batch_shape.shape[0].value
-                     if batch_shape.shape.with_rank_at_least(1)[0].value
+      dim0 = tensor_shape.dimension_value(
+          batch_shape.shape.with_rank_at_least(1)[0])
+      batch_ndims = (dim0
+                     if dim0 is not None
                      else array_ops.shape(batch_shape)[0])
       return array_ops.concat([
           batch_shape[batch_ndims - self.reinterpreted_batch_ndims:],
@@ -239,9 +243,11 @@ class Independent(distribution_lib.Distribution):
                              static_reinterpreted_batch_ndims, batch_ndims))
     elif validate_args:
       batch_shape = distribution.batch_shape_tensor()
+      dim0 = tensor_shape.dimension_value(
+          batch_shape.shape.with_rank_at_least(1)[0])
       batch_ndims = (
-          batch_shape.shape[0].value
-          if batch_shape.shape.with_rank_at_least(1)[0].value is not None
+          dim0
+          if dim0 is not None
           else array_ops.shape(batch_shape)[0])
       assertions.append(check_ops.assert_less_equal(
           reinterpreted_batch_ndims, batch_ndims,
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index f4d394ff29f072907a019afb52bd8dc5d244e955..f34317f5abfed1c71b516c5fde42baca614d7f9b 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import distribution_util as distribution_utils
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -147,8 +148,9 @@ class MixtureSameFamily(distribution.Distribution):
       self._runtime_assertions = []
 
       s = components_distribution.event_shape_tensor()
-      self._event_ndims = (s.shape[0].value
-                           if s.shape.with_rank_at_least(1)[0].value is not None
+      s_dim0 = tensor_shape.dimension_value(s.shape[0])
+      self._event_ndims = (s_dim0
+                           if s_dim0 is not None
                            else array_ops.shape(s)[0])
 
       if not mixture_distribution.dtype.is_integer:
@@ -186,8 +188,10 @@ class MixtureSameFamily(distribution.Distribution):
                     "`mixture_distribution.batch_shape` is not "
                     "compatible with `components_distribution.batch_shape`"))]
 
-      km = mixture_distribution.logits.shape.with_rank_at_least(1)[-1].value
-      kc = components_distribution.batch_shape.with_rank_at_least(1)[-1].value
+      km = tensor_shape.dimension_value(
+          mixture_distribution.logits.shape.with_rank_at_least(1)[-1])
+      kc = tensor_shape.dimension_value(
+          components_distribution.batch_shape.with_rank_at_least(1)[-1])
       if km is not None and kc is not None and km != kc:
         raise ValueError("`mixture_distribution components` ({}) does not "
                          "equal `components_distribution.batch_shape[-1]` "
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index aa680a92be64cf0f099acd335369f2a1610c5953..978e627d6638ddeea9df288d389354f0ac53d115 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -29,8 +29,8 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import spectral_ops
 from tensorflow.python.ops.distributions import util
+from tensorflow.python.ops.signal import fft_ops
 
 __all__ = [
     "auto_correlation",
@@ -157,11 +157,11 @@ def auto_correlation(
                                        dtype.real_dtype.as_numpy_dtype(0.))
 
     # Autocorrelation is IFFT of power-spectral density (up to some scaling).
-    fft_x_rotated_pad = spectral_ops.fft(x_rotated_pad)
+    fft_x_rotated_pad = fft_ops.fft(x_rotated_pad)
     spectral_density = fft_x_rotated_pad * math_ops.conj(fft_x_rotated_pad)
     # shifted_product is R[m] from above detailed explanation.
     # It is the inner product sum_n X[n] * Conj(X[n - m]).
-    shifted_product = spectral_ops.ifft(spectral_density)
+    shifted_product = fft_ops.ifft(spectral_density)
 
     # Cast back to real-valued if x was real to begin with.
     shifted_product = math_ops.cast(shifted_product, dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index a3d178357b79b9d0d15c738603d5019321eff112..a648d61ac8dd5c1d368cf41505b85827dfeb63e1 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -183,7 +183,7 @@ def quadrature_scheme_softmaxnormal_quantiles(
     def _get_final_shape(qs):
       """Helper to build `TensorShape`."""
       bs = dist.batch_shape.with_rank_at_least(1)
-      num_components = bs[-1].value
+      num_components = tensor_shape.dimension_value(bs[-1])
       if num_components is not None:
         num_components += 1
       tail = tensor_shape.TensorShape([num_components, qs])
@@ -791,7 +791,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
 
   def _expand_mix_distribution_probs(self):
     p = self.mixture_distribution.probs  # [B, deg]
-    deg = p.shape.with_rank_at_least(1)[-1].value
+    deg = tensor_shape.dimension_value(p.shape.with_rank_at_least(1)[-1])
     if deg is None:
       deg = array_ops.shape(p)[-1]
     event_ndims = self.event_shape.ndims
@@ -831,10 +831,12 @@ def maybe_check_quadrature_param(param, name, validate_args):
 
     # TODO(jvdillon): Remove once we support k-mixtures.
     if param.shape.with_rank_at_least(1)[-1] is not None:
-      if param.shape[-1].value != 1:
+      if tensor_shape.dimension_value(param.shape[-1]) != 1:
         raise NotImplementedError("Currently only bimixtures are supported; "
                                   "{}.shape[-1]={} is not 1.".format(
-                                      name, param.shape[-1].value))
+                                      name,
+                                      tensor_shape.dimension_value(
+                                          param.shape[-1])))
     elif validate_args:
       assertions.append(check_ops.assert_equal(
           array_ops.shape(param)[-1], 1,
@@ -905,7 +907,7 @@ def interpolate_loc(grid, loc):
   if len(loc) != 2:
     raise NotImplementedError("Currently only bimixtures are supported; "
                               "len(scale)={} is not 2.".format(len(loc)))
-  deg = grid.shape.with_rank_at_least(1)[-1].value
+  deg = tensor_shape.dimension_value(grid.shape.with_rank_at_least(1)[-1])
   if deg is None:
     raise ValueError("Num quadrature grid points must be known prior "
                      "to graph execution.")
@@ -939,7 +941,7 @@ def interpolate_scale(grid, scale):
   if len(scale) != 2:
     raise NotImplementedError("Currently only bimixtures are supported; "
                               "len(scale)={} is not 2.".format(len(scale)))
-  deg = grid.shape.with_rank_at_least(1)[-1].value
+  deg = tensor_shape.dimension_value(grid.shape.with_rank_at_least(1)[-1])
   if deg is None:
     raise ValueError("Num quadrature grid points must be known prior "
                      "to graph execution.")
diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py
index ee2fc58864d4ac528ebae3d681d2e4922fb60771..2d83f0c13f14a8e5d1eee4fa1436bd05991e934e 100644
--- a/tensorflow/contrib/distributions/python/ops/wishart.py
+++ b/tensorflow/contrib/distributions/python/ops/wishart.py
@@ -136,13 +136,13 @@ class _WishartLinearOperator(distribution.Distribution):
         contrib_tensor_util.assert_same_float_dtype(
             (self._df, self._scale_operator))
         if (self._scale_operator.shape.ndims is None or
-            self._scale_operator.shape[-1].value is None):
+            self._scale_operator.shape.dims[-1].value is None):
           self._dimension = math_ops.cast(
               self._scale_operator.domain_dimension_tensor(),
               dtype=self._scale_operator.dtype, name="dimension")
         else:
           self._dimension = ops.convert_to_tensor(
-              self._scale_operator.shape[-1].value,
+              self._scale_operator.shape.dims[-1].value,
               dtype=self._scale_operator.dtype, name="dimension")
         df_val = tensor_util.constant_value(self._df)
         dim_val = tensor_util.constant_value(self._dimension)
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 33a1d572a20e68479d3ec1147d4892449e7beb8a..77052a75a70bec1162feb2b126d247924b3a2e36 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -28,6 +28,7 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:execution_callbacks",
         "//tensorflow/python/eager:function",
     ],
@@ -249,11 +250,10 @@ py_library(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "remote_test",
     srcs = ["remote_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":parameter_server",
         ":remote",
         "//tensorflow/contrib/eager/python:tfe",
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 3aed121233be1268531495a2fa83fd323412e1fd..34614b86a75b93ab93cf844c645c211b1329c6d5 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -52,12 +52,6 @@ class Iterator(iterator_ops.EagerIterator):
       TypeError: If `dataset` is an unsupported type.
       RuntimeError: When invoked without eager execution enabled.
     """
-    if isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset):  # pylint: disable=protected-access
-      raise TypeError(
-          "`tf.data.experimental.prefetch_to_device()` is not compatible with "
-          "`tf.contrib.eager.Iterator`. Use `for ... in dataset:` to iterate "
-          "over the dataset instead.")
-
     if not context.context().device_spec.device_type:
       is_remote_device = False
     else:
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index 6a508fc6ba98740c4d441a064dc8a3e2b321f585..257d02057ae0d280074559aa9e97725bf5cc3fd0 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -26,7 +26,6 @@ import numpy as np
 from tensorflow.contrib import lookup
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.data import Dataset
-from tensorflow.python.data.experimental.ops import prefetching_ops
 from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.eager import test
@@ -208,18 +207,6 @@ class IteratorTest(test.TestCase):
         y = math_ops.add(x, x)
     self.assertAllEqual([0., 2.], y.numpy())
 
-  def testTensorsExplicitPrefetchToDevice(self):
-    ds = Dataset.from_tensor_slices([0., 1.])
-    ds = ds.apply(prefetching_ops.prefetch_to_device(test.gpu_device_name()))
-
-    with self.assertRaisesRegexp(TypeError, 'prefetch_to_device'):
-      datasets.Iterator(ds)
-
-    for i, x in enumerate(ds):
-      with ops.device(test.gpu_device_name()):
-        x = math_ops.add(x, x)
-        self.assertEqual(float(i) + float(i), x.numpy())
-
   def testOverrideThreadPool(self):
 
     def get_thread_id(_):
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 7949a3f6da293abdd85512209242bae76ab4d816..51443d24829bdc31a41813e0ff50ad7102422112 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -22,6 +22,7 @@ import six
 
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.contrib.eager.python import metrics
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import errors_impl
@@ -164,8 +165,8 @@ class Evaluator(object):
         self.__call__(example, *args, **kwargs)
       return self.all_metric_results(summary_logdir)
     # Graph construction
-    call_op = self.__call__(dataset.make_one_shot_iterator().get_next(), *args,
-                            **kwargs)
+    call_op = self.__call__(
+        dataset_ops.make_one_shot_iterator(dataset).get_next(), *args, **kwargs)
     init_op = self.init_variables()
     results_op = self.all_metric_results(summary_logdir)
     return (init_op, call_op, results_op)
diff --git a/tensorflow/contrib/eager/python/examples/densenet/BUILD b/tensorflow/contrib/eager/python/examples/densenet/BUILD
index 2dc196f550a10367066730f6f042c4ed69533ec3..e2154fcc5fcf774dcd52285d9442dfd5073a4992 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/densenet/BUILD
@@ -3,6 +3,7 @@ licenses(["notice"])  # Apache 2.0
 package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "py_binary")
 
 py_binary(
     name = "densenet",
diff --git a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py
index 4b3cb624bc947a1d1956eff6accb6d4da3bf3b87..24f6b007b526b29157011f3b1e9abdbd50bacc8e 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py
@@ -119,7 +119,8 @@ class DensenetBenchmark(tf.test.Benchmark):
       with tf.Graph().as_default():
         np_images, np_labels = random_batch(batch_size)
         dataset = tf.data.Dataset.from_tensors((np_images, np_labels)).repeat()
-        (images, labels) = dataset.make_one_shot_iterator().get_next()
+        (images, labels) = tf.compat.v1.data.make_one_shot_iterator(
+            dataset).get_next()
 
         model = densenet.DenseNet(self.depth, self.growth_rate, self.num_blocks,
                                   self.output_classes,
diff --git a/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py b/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py
index e5058bfd9480e25b3cf040f0d96bf21242a147b8..a9fb0035d299d64b35d756eaf1ae5f7034ff5599 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py
+++ b/tensorflow/contrib/eager/python/examples/densenet/densenet_test.py
@@ -228,6 +228,7 @@ class DensenetBenchmark(tf.test.Benchmark):
                                 weight_decay=1e-4, dropout_rate=0,
                                 pool_initial=True, include_top=True)
       if defun:
+        # TODO(apassos) enable tfe.function here
         model.call = tfe.defun(model.call)
       batch_size = 64
       num_burn = 5
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
index 12b39b0cde49d4c017acfa74572c725036c54eff..e73841fbf724e05eaa3be90cc8650f795d3e1ccf 100644
--- a/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
@@ -42,7 +42,8 @@ class MnistGraphGanBenchmark(tf.test.Benchmark):
     # Generate some random data.
     images_data = np.random.randn(batch_size, 784).astype(np.float32)
     dataset = tf.data.Dataset.from_tensors(images_data)
-    images = dataset.repeat().make_one_shot_iterator().get_next()
+    images = tf.compat.v1.data.make_one_shot_iterator(
+        dataset.repeat()).get_next()
 
     # Create the models and optimizers
     generator = mnist.Generator(data_format())
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
index ca27a85a229d41a85fa26ecdc982da478fe9e202..1a08cc0fd06516be4af5c2b0b46a3ffcf9101e95 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
@@ -470,7 +470,7 @@
         "\n",
         "  if epoch % 1 == 0:\n",
         "    loss = tfe.metrics.Mean()\n",
-        "    for test_x in test_dataset.make_one_shot_iterator():\n",
+        "    for test_x in test_dataset:\n",
         "      loss(compute_loss(model, test_x))\n",
         "    elbo = -loss.result()\n",
         "    display.clear_output(wait=False)\n",
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
index 5621d6a358e8969ea1a6663c1c770987de41ce0c..78fcd397087fd1fd64aebed7ac3b5c6b2f45c450 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
@@ -1,324 +1,405 @@
 {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "dcgan.ipynb",
+      "version": "0.3.2",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python2",
+      "display_name": "Python 2"
+    },
+    "accelerator": "GPU"
+  },
   "cells": [
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "0TD5ZrvEMbhZ"
       },
+      "cell_type": "markdown",
       "source": [
-        "##### Copyright 2018 The TensorFlow Authors.\n",
+        "**Copyright 2018 The TensorFlow Authors**.\n",
         "\n",
         "Licensed under the Apache License, Version 2.0 (the \"License\").\n",
         "\n",
-        "# DCGAN: An example with tf.keras and eager\n",
+        "# Generating Handwritten Digits with DCGAN\n",
         "\n",
-        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\"\u003e\n",
-        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
-        "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
+        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\">\n",
+        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
+        "</td><td>\n",
+        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
       ]
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "ITZuApL56Mny"
       },
+      "cell_type": "markdown",
+      "source": [
+        "This tutorial demonstrates how to generate images of handwritten digits using a Deep Convolutional Generative Adversarial Network ([DCGAN](https://arxiv.org/pdf/1511.06434.pdf)). The code is written in [tf.keras](https://www.tensorflow.org/programmers_guide/keras) with [eager execution](https://www.tensorflow.org/programmers_guide/eager) enabled. "
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "toc",
+        "id": "x2McrO9bMyLN"
+      },
+      "cell_type": "markdown",
+      "source": [
+        ">[Generating Handwritten Digits with DCGAN](#scrollTo=0TD5ZrvEMbhZ)\n",
+        "\n",
+        ">>[What are GANs?](#scrollTo=2MbKJY38Puy9)\n",
+        "\n",
+        ">>>[Import TensorFlow and enable eager execution](#scrollTo=e1_Y75QXJS6h)\n",
+        "\n",
+        ">>>[Load the dataset](#scrollTo=iYn4MdZnKCey)\n",
+        "\n",
+        ">>>[Use tf.data to create batches and shuffle the dataset](#scrollTo=PIGN6ouoQxt3)\n",
+        "\n",
+        ">>[Create the models](#scrollTo=THY-sZMiQ4UV)\n",
+        "\n",
+        ">>>[The Generator Model](#scrollTo=-tEyxE-GMC48)\n",
+        "\n",
+        ">>>[The Discriminator model](#scrollTo=D0IKnaCtg6WE)\n",
+        "\n",
+        ">>[Define the loss functions and the optimizer](#scrollTo=0FMYgY_mPfTi)\n",
+        "\n",
+        ">>>[Generator loss](#scrollTo=Jd-3GCUEiKtv)\n",
+        "\n",
+        ">>>[Discriminator loss](#scrollTo=PKY_iPSPNWoj)\n",
+        "\n",
+        ">>[Set up GANs for Training](#scrollTo=Rw1fkAczTQYh)\n",
+        "\n",
+        ">>[Train the GANs](#scrollTo=dZrd4CdjR-Fp)\n",
+        "\n",
+        ">>[Generated images](#scrollTo=P4M_vIbUi7c0)\n",
+        "\n",
+        ">>[Learn more about GANs](#scrollTo=k6qC-SbjK0yW)\n",
+        "\n"
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "2MbKJY38Puy9"
+      },
+      "cell_type": "markdown",
       "source": [
-        "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). To do so, we use Deep Convolutional Generative Adverserial Networks ([DCGAN](https://arxiv.org/pdf/1511.06434.pdf)).\n",
+        "## What are GANs?\n",
+        "GANs, or [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661), are a framework for estimating generative models. Two models are trained simultaneously by an adversarial process: a Generator, which is responsible for generating data (say, images), and a Discriminator, which is responsible for estimating the probability that an image was drawn from the training data (the image is real), or was produced by the Generator (the image is fake). During training, the Generator becomes progressively better at generating images, until the Discriminator is no longer able to distinguish real images from fake. \n",
         "\n",
-        "This model takes about ~30 seconds per epoch (using tf.contrib.eager.defun to create graph functions) to train on a single Tesla K80 on Colab, as of July 2018.\n",
+        "![alt text](https://github.com/margaretmz/tensorflow/blob/margaret-dcgan/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png?raw=1)\n",
         "\n",
-        "Below is the output generated after training the generator and discriminator models for 150 epochs.\n",
+        "We will demonstrate this process end-to-end on MNIST. Below is an animation that shows a series of images produced by the Generator as it was trained for 50 epochs. Overtime, the generated images become increasingly difficult to distinguish from the training set.\n",
+        "\n",
+        "To learn more about GANs, we recommend MIT's [Intro to Deep Learning](http://introtodeeplearning.com/) course, which includes a lecture on Deep Generative Models ([video](https://youtu.be/JVb54xhEw6Y) | [slides](http://introtodeeplearning.com/materials/2018_6S191_Lecture4.pdf)). Now, let's head to the code!\n",
         "\n",
         "![sample output](https://tensorflow.org/images/gan/dcgan.gif)"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "u_2z-B3piVsw"
+        "id": "u_2z-B3piVsw",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "# to generate gifs\n",
+        "# Install imgeio in order to generate an animated gif showing the image generating process\n",
         "!pip install imageio"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "e1_Y75QXJS6h"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Import TensorFlow and enable eager execution"
+        "### Import TensorFlow and enable eager execution"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "YfIk2es3hJEd"
+        "id": "YfIk2es3hJEd",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "from __future__ import absolute_import, division, print_function\n",
-        "\n",
-        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
         "import tensorflow as tf\n",
         "tf.enable_eager_execution()\n",
         "\n",
-        "import os\n",
-        "import time\n",
-        "import numpy as np\n",
         "import glob\n",
+        "import imageio\n",
         "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "import os\n",
         "import PIL\n",
-        "import imageio\n",
+        "import time\n",
+        "\n",
         "from IPython import display"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "iYn4MdZnKCey"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Load the dataset\n",
+        "### Load the dataset\n",
         "\n",
-        "We are going to use the MNIST dataset to train the generator and the discriminator. The generator will then generate handwritten digits."
+        "We are going to use the MNIST dataset to train the generator and the discriminator. The generator will generate handwritten digits resembling the MNIST data."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "a4fYMGxGhrna"
+        "id": "a4fYMGxGhrna",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "(train_images, train_labels), (_, _) = tf.keras.datasets.mnist.load_data()"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "NFC2ghIdiZYE"
+        "id": "NFC2ghIdiZYE",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "# We are normalizing the images to the range of [-1, 1]\n",
-        "train_images = (train_images - 127.5) / 127.5"
-      ]
+        "train_images = (train_images - 127.5) / 127.5 # Normalize the images to [-1, 1]"
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "S4PIDhoDLbsZ"
+        "id": "S4PIDhoDLbsZ",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "BUFFER_SIZE = 60000\n",
         "BATCH_SIZE = 256"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "PIGN6ouoQxt3"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Use tf.data to create batches and shuffle the dataset"
+        "### Use tf.data to create batches and shuffle the dataset"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "-yKCCQOoJ7cn"
+        "id": "-yKCCQOoJ7cn",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "THY-sZMiQ4UV"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Write the generator and discriminator models\n",
+        "## Create the models\n",
         "\n",
-        "* **Generator** \n",
-        "  * It is responsible for **creating convincing images that are good enough to fool the discriminator**.\n",
-        "  * It consists of Conv2DTranspose (Upsampling) layers. We start with a fully connected layer and upsample the image 2 times so as to reach the desired image size (mnist image size) which is (28, 28, 1). \n",
-        "  * We use **leaky relu** activation except for the **last layer** which uses **tanh** activation.\n",
-        "  \n",
-        "* **Discriminator**\n",
-        "  * **The discriminator is responsible for classifying the fake images from the real images.**\n",
-        "  * In other words, the discriminator is given generated images (from the generator) and the real MNIST images. The job of the discriminator is to classify these images into fake (generated) and real (MNIST images).\n",
-        "  * **Basically the generator should be good enough to fool the discriminator that the generated images are real**."
+        "We will use tf.keras [Sequential API](https://www.tensorflow.org/guide/keras#sequential_model) to define the generator and discriminator models."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
+        "colab_type": "text",
+        "id": "-tEyxE-GMC48"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "### The Generator Model\n",
+        "\n",
+        "The generator is responsible for creating convincing images that are good enough to fool the discriminator. The network architecture for the generator consists of [Conv2DTranspose](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv2DTranspose) (Upsampling) layers. We start with a fully connected layer and upsample the image two times in order to reach the desired image size of 28x28x1. We increase the width and height, and reduce the depth as we move through the layers in the network. We use [Leaky ReLU](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LeakyReLU) activation for each layer except for the last one where we use a tanh activation."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "6bpTcDqoLWjY",
         "colab_type": "code",
-        "id": "VGLbvBEmjK0a"
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "class Generator(tf.keras.Model):\n",
-        "  def __init__(self):\n",
-        "    super(Generator, self).__init__()\n",
-        "    self.fc1 = tf.keras.layers.Dense(7*7*64, use_bias=False)\n",
-        "    self.batchnorm1 = tf.keras.layers.BatchNormalization()\n",
-        "    \n",
-        "    self.conv1 = tf.keras.layers.Conv2DTranspose(64, (5, 5), strides=(1, 1), padding='same', use_bias=False)\n",
-        "    self.batchnorm2 = tf.keras.layers.BatchNormalization()\n",
-        "    \n",
-        "    self.conv2 = tf.keras.layers.Conv2DTranspose(32, (5, 5), strides=(2, 2), padding='same', use_bias=False)\n",
-        "    self.batchnorm3 = tf.keras.layers.BatchNormalization()\n",
+        "def make_generator_model():\n",
+        "    model = tf.keras.Sequential()\n",
+        "    model.add(tf.keras.layers.Dense(7*7*256, use_bias=False, input_shape=(100,)))\n",
+        "    model.add(tf.keras.layers.BatchNormalization())\n",
+        "    model.add(tf.keras.layers.LeakyReLU())\n",
+        "      \n",
+        "    model.add(tf.keras.layers.Reshape((7, 7, 256)))\n",
+        "    assert model.output_shape == (None, 7, 7, 256) # Note: None is the batch size\n",
         "    \n",
-        "    self.conv3 = tf.keras.layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False)\n",
-        "\n",
-        "  def call(self, x, training=True):\n",
-        "    x = self.fc1(x)\n",
-        "    x = self.batchnorm1(x, training=training)\n",
-        "    x = tf.nn.relu(x)\n",
-        "\n",
-        "    x = tf.reshape(x, shape=(-1, 7, 7, 64))\n",
+        "    model.add(tf.keras.layers.Conv2DTranspose(128, (5, 5), strides=(1, 1), padding='same', use_bias=False))\n",
+        "    assert model.output_shape == (None, 7, 7, 128)  \n",
+        "    model.add(tf.keras.layers.BatchNormalization())\n",
+        "    model.add(tf.keras.layers.LeakyReLU())\n",
         "\n",
-        "    x = self.conv1(x)\n",
-        "    x = self.batchnorm2(x, training=training)\n",
-        "    x = tf.nn.relu(x)\n",
+        "    model.add(tf.keras.layers.Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same', use_bias=False))\n",
+        "    assert model.output_shape == (None, 14, 14, 64)    \n",
+        "    model.add(tf.keras.layers.BatchNormalization())\n",
+        "    model.add(tf.keras.layers.LeakyReLU())\n",
         "\n",
-        "    x = self.conv2(x)\n",
-        "    x = self.batchnorm3(x, training=training)\n",
-        "    x = tf.nn.relu(x)\n",
+        "    model.add(tf.keras.layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))\n",
+        "    assert model.output_shape == (None, 28, 28, 1)\n",
+        "  \n",
+        "    return model"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "D0IKnaCtg6WE"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "### The Discriminator model\n",
         "\n",
-        "    x = tf.nn.tanh(self.conv3(x))  \n",
-        "    return x"
+        "The discriminator is responsible for distinguishing fake images from real images. It's similar to a regular CNN-based image classifier."
       ]
     },
     {
+      "metadata": {
+        "id": "dw2tPLmk2pEP",
+        "colab_type": "code",
+        "colab": {}
+      },
       "cell_type": "code",
+      "source": [
+        "def make_discriminator_model():\n",
+        "    model = tf.keras.Sequential()\n",
+        "    model.add(tf.keras.layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same'))\n",
+        "    model.add(tf.keras.layers.LeakyReLU())\n",
+        "    model.add(tf.keras.layers.Dropout(0.3))\n",
+        "      \n",
+        "    model.add(tf.keras.layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same'))\n",
+        "    model.add(tf.keras.layers.LeakyReLU())\n",
+        "    model.add(tf.keras.layers.Dropout(0.3))\n",
+        "       \n",
+        "    model.add(tf.keras.layers.Flatten())\n",
+        "    model.add(tf.keras.layers.Dense(1))\n",
+        "     \n",
+        "    return model"
+      ],
       "execution_count": 0,
+      "outputs": []
+    },
+    {
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "bkOfJxk5j5Hi"
-      },
-      "outputs": [],
-      "source": [
-        "class Discriminator(tf.keras.Model):\n",
-        "  def __init__(self):\n",
-        "    super(Discriminator, self).__init__()\n",
-        "    self.conv1 = tf.keras.layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same')\n",
-        "    self.conv2 = tf.keras.layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same')\n",
-        "    self.dropout = tf.keras.layers.Dropout(0.3)\n",
-        "    self.flatten = tf.keras.layers.Flatten()\n",
-        "    self.fc1 = tf.keras.layers.Dense(1)\n",
-        "\n",
-        "  def call(self, x, training=True):\n",
-        "    x = tf.nn.leaky_relu(self.conv1(x))\n",
-        "    x = self.dropout(x, training=training)\n",
-        "    x = tf.nn.leaky_relu(self.conv2(x))\n",
-        "    x = self.dropout(x, training=training)\n",
-        "    x = self.flatten(x)\n",
-        "    x = self.fc1(x)\n",
-        "    return x"
+        "id": "gDkA05NE6QMs",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "generator = make_generator_model()\n",
+        "discriminator = make_discriminator_model()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "0FMYgY_mPfTi"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "## Define the loss functions and the optimizer\n",
+        "\n",
+        "Let's define the loss functions and the optimizers for the generator and the discriminator.\n"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "gDkA05NE6QMs"
+        "colab_type": "text",
+        "id": "Jd-3GCUEiKtv"
       },
-      "outputs": [],
+      "cell_type": "markdown",
       "source": [
-        "generator = Generator()\n",
-        "discriminator = Discriminator()"
+        "### Generator loss\n",
+        "The generator loss is a sigmoid cross entropy loss of the generated images and an array of ones, since the generator is trying to generate fake images that resemble the real images."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "k1HpMSLImuRi"
+        "id": "90BIcCKcDMxz",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "# Defun gives 10 secs/epoch performance boost\n",
-        "generator.call = tf.contrib.eager.defun(generator.call)\n",
-        "discriminator.call = tf.contrib.eager.defun(discriminator.call)"
-      ]
+        "def generator_loss(generated_output):\n",
+        "    return tf.losses.sigmoid_cross_entropy(tf.ones_like(generated_output), generated_output)"
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
-        "id": "0FMYgY_mPfTi"
+        "id": "PKY_iPSPNWoj"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Define the loss functions and the optimizer\n",
-        "\n",
-        "* **Discriminator loss**\n",
-        "  * The discriminator loss function takes 2 inputs; **real images, generated images**\n",
-        "  * real_loss is a sigmoid cross entropy loss of the **real images** and an **array of ones (since these are the real images)**\n",
-        "  * generated_loss is a sigmoid cross entropy loss of the **generated images** and an **array of zeros (since these are the fake images)**\n",
-        "  * Then the total_loss is the sum of real_loss and the generated_loss\n",
-        "  \n",
-        "* **Generator loss**\n",
-        "  * It is a sigmoid cross entropy loss of the generated images and an **array of ones**\n",
-        "  \n",
+        "### Discriminator loss\n",
         "\n",
-        "* The discriminator and the generator optimizers are different since we will train them separately."
+        "The discriminator loss function takes two inputs: real images, and generated images. Here is how to calculate the discriminator loss:\n",
+        "1. Calculate real_loss which is a sigmoid cross entropy loss of the real images and an array of ones (since these are the real images).\n",
+        "2. Calculate generated_loss which is a sigmoid cross entropy loss of the generated images and an array of zeros (since these are the fake images).\n",
+        "3. Calculate the total_loss as the sum of real_loss and generated_loss."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "wkMNfBWlT-PV"
+        "id": "wkMNfBWlT-PV",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def discriminator_loss(real_output, generated_output):\n",
-        "    # [1,1,...,1] with real output since it is true and we want\n",
-        "    # our generated examples to look like it\n",
+        "    # [1,1,...,1] with real output since it is true and we want our generated examples to look like it\n",
         "    real_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=tf.ones_like(real_output), logits=real_output)\n",
         "\n",
         "    # [0,0,...,0] with generated images since they are fake\n",
@@ -327,55 +408,51 @@
         "    total_loss = real_loss + generated_loss\n",
         "\n",
         "    return total_loss"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "90BIcCKcDMxz"
+        "colab_type": "text",
+        "id": "MgIc7i0th_Iu"
       },
-      "outputs": [],
+      "cell_type": "markdown",
       "source": [
-        "def generator_loss(generated_output):\n",
-        "    return tf.losses.sigmoid_cross_entropy(tf.ones_like(generated_output), generated_output)"
+        "The discriminator and the generator optimizers are different since we will train two networks separately."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "iWCn_PVdEJZ7"
+        "id": "iWCn_PVdEJZ7",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "discriminator_optimizer = tf.train.AdamOptimizer(1e-4)\n",
-        "generator_optimizer = tf.train.AdamOptimizer(1e-4)"
-      ]
+        "generator_optimizer = tf.train.AdamOptimizer(1e-4)\n",
+        "discriminator_optimizer = tf.train.AdamOptimizer(1e-4)"
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "mWtinsGDPJlV"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Checkpoints (Object-based saving)"
+        "**Checkpoints (Object-based saving)**"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "CA1w-7s2POEy"
+        "id": "CA1w-7s2POEy",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "checkpoint_dir = './training_checkpoints'\n",
         "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
@@ -383,93 +460,85 @@
         "                                 discriminator_optimizer=discriminator_optimizer,\n",
         "                                 generator=generator,\n",
         "                                 discriminator=discriminator)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "Rw1fkAczTQYh"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "## Set up GANs for Training\n",
+        "\n"
       ]
     },
     {
+      "metadata": {
+        "colab_type": "text",
+        "id": "5QC5BABamh_c"
+      },
       "cell_type": "markdown",
+      "source": [
+        "Now it's time to put together the generator and discriminator to set up the Generative Adversarial Networks, as you see in the diagam at the beginning of the tutorial."
+      ]
+    },
+    {
       "metadata": {
         "colab_type": "text",
-        "id": "Rw1fkAczTQYh"
+        "id": "Ff6oN6PZX27n"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Training\n",
-        "\n",
-        "* We start by iterating over the dataset\n",
-        "* The generator is given **noise as an input** which when passed through the generator model will output a image looking like a handwritten digit\n",
-        "* The discriminator is given the **real MNIST images as well as the generated images (from the generator)**.\n",
-        "* Next, we calculate the generator and the discriminator loss.\n",
-        "* Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables (inputs) and apply those to the optimizer.\n",
-        "\n",
-        "## Generate Images\n",
-        "\n",
-        "* After training, its time to generate some images!\n",
-        "* We start by creating noise array as an input to the generator\n",
-        "* The generator will then convert the noise into handwritten images.\n",
-        "* Last step is to plot the predictions and **voila!**"
+        "**Define training parameters**"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "NS2GWywBbAWo"
+        "id": "NS2GWywBbAWo",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "EPOCHS = 150\n",
+        "EPOCHS = 50\n",
         "noise_dim = 100\n",
         "num_examples_to_generate = 16\n",
         "\n",
-        "# keeping the random vector constant for generation (prediction) so\n",
-        "# it will be easier to see the improvement of the gan.\n",
+        "# We'll re-use this random vector used to seed the generator so\n",
+        "# it will be easier to see the improvement over time.\n",
         "random_vector_for_generation = tf.random_normal([num_examples_to_generate,\n",
         "                                                 noise_dim])"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "RmdVsmvhPxyy"
+        "colab_type": "text",
+        "id": "jylSonrqSWfi"
       },
-      "outputs": [],
+      "cell_type": "markdown",
       "source": [
-        "def generate_and_save_images(model, epoch, test_input):\n",
-        "  # make sure the training parameter is set to False because we\n",
-        "  # don't want to train the batchnorm layer when doing inference.\n",
-        "  predictions = model(test_input, training=False)\n",
+        "**Define training method**\n",
         "\n",
-        "  fig = plt.figure(figsize=(4,4))\n",
-        "  \n",
-        "  for i in range(predictions.shape[0]):\n",
-        "      plt.subplot(4, 4, i+1)\n",
-        "      plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5, cmap='gray')\n",
-        "      plt.axis('off')\n",
-        "        \n",
-        "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
-        "  plt.show()"
+        "We start by iterating over the dataset. The generator is given a random vector as an input which is processed to  output an image looking like a handwritten digit. The discriminator is then shown the real MNIST images as well as the generated images.\n",
+        "\n",
+        "Next, we calculate the generator and the discriminator loss. Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
+        "id": "3t5ibNo05jCB",
         "colab_type": "code",
-        "id": "2M7LmLtGEMQJ"
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "def train(dataset, epochs, noise_dim):  \n",
-        "  for epoch in range(epochs):\n",
-        "    start = time.time()\n",
-        "    \n",
-        "    for images in dataset:\n",
-        "      # generating noise from a uniform distribution\n",
+        "def train_step(images):\n",
+        "   # generating noise from a normal distribution\n",
         "      noise = tf.random_normal([BATCH_SIZE, noise_dim])\n",
         "      \n",
         "      with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:\n",
@@ -477,7 +546,7 @@
         "      \n",
         "        real_output = discriminator(images, training=True)\n",
         "        generated_output = discriminator(generated_images, training=True)\n",
-        "        \n",
+        "         \n",
         "        gen_loss = generator_loss(generated_output)\n",
         "        disc_loss = discriminator_loss(real_output, generated_output)\n",
         "        \n",
@@ -485,12 +554,54 @@
         "      gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.variables)\n",
         "      \n",
         "      generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.variables))\n",
-        "      discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.variables))\n",
+        "      discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.variables))"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "6TSZgwc2BUQ-"
+      },
+      "cell_type": "markdown",
+      "source": [
         "\n",
-        "      \n",
-        "    if epoch % 1 == 0:\n",
-        "      display.clear_output(wait=True)\n",
-        "      generate_and_save_images(generator,\n",
+        "This model takes about ~30 seconds per epoch to train on a single Tesla K80 on Colab, as of October 2018. \n",
+        "\n",
+        "Eager execution can be slower than executing the equivalent graph as it can't benefit from whole-program optimizations on the graph, and also incurs overheads of interpreting Python code. By using [tf.contrib.eager.defun](https://www.tensorflow.org/api_docs/python/tf/contrib/eager/defun) to create graph functions, we get a ~20 secs/epoch performance boost (from ~50 secs/epoch down to ~30 secs/epoch). This way we get the best of both eager execution (easier for debugging) and graph mode (better performance)."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "Iwya07_j5p2A",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "train_step = tf.contrib.eager.defun(train_step)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "code",
+        "id": "2M7LmLtGEMQJ",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "def train(dataset, epochs):  \n",
+        "  for epoch in range(epochs):\n",
+        "    start = time.time()\n",
+        "    \n",
+        "    for images in dataset:\n",
+        "      train_step(images)\n",
+        "\n",
+        "    display.clear_output(wait=True)\n",
+        "    generate_and_save_images(generator,\n",
         "                               epoch + 1,\n",
         "                               random_vector_for_generation)\n",
         "    \n",
@@ -505,111 +616,167 @@
         "  generate_and_save_images(generator,\n",
         "                           epochs,\n",
         "                           random_vector_for_generation)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "2aFF7Hk3XdeW"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "**Generate and save images**\n",
+        "\n"
       ]
     },
     {
+      "metadata": {
+        "colab_type": "code",
+        "id": "RmdVsmvhPxyy",
+        "colab": {}
+      },
       "cell_type": "code",
+      "source": [
+        "def generate_and_save_images(model, epoch, test_input):\n",
+        "  # make sure the training parameter is set to False because we\n",
+        "  # don't want to train the batchnorm layer when doing inference.\n",
+        "  predictions = model(test_input, training=False)\n",
+        "\n",
+        "  fig = plt.figure(figsize=(4,4))\n",
+        "  \n",
+        "  for i in range(predictions.shape[0]):\n",
+        "      plt.subplot(4, 4, i+1)\n",
+        "      plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5, cmap='gray')\n",
+        "      plt.axis('off')\n",
+        "        \n",
+        "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
+        "  plt.show()"
+      ],
       "execution_count": 0,
+      "outputs": []
+    },
+    {
       "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "Ly3UN0SLLY2l"
+        "colab_type": "text",
+        "id": "dZrd4CdjR-Fp"
       },
-      "outputs": [],
+      "cell_type": "markdown",
       "source": [
-        "train(train_dataset, EPOCHS, noise_dim)"
+        "## Train the GANs\n",
+        "We will call the train() method defined above to train the generator and discriminator simultaneously. Note, training GANs can be tricky. It's important that the generator and discriminator do not overpower each other (e.g., that they train at a similar rate).\n",
+        "\n",
+        "At the beginning of the training, the generated images look like random noise. As training progresses, you can see the generated digits look increasingly real. After 50 epochs, they look very much like the MNIST digits."
       ]
     },
     {
-      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "code",
+        "id": "Ly3UN0SLLY2l",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "%%time\n",
+        "train(train_dataset, EPOCHS)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
       "metadata": {
         "colab_type": "text",
         "id": "rfM4YcPVPkNO"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Restore the latest checkpoint"
+        "**Restore the latest checkpoint**"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "XhXsd0srPo8c"
+        "id": "XhXsd0srPo8c",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# restoring the latest checkpoint in checkpoint_dir\n",
         "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "P4M_vIbUi7c0"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Display an image using the epoch number"
+        "## Generated images \n"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "WfO5wCdclHGL"
+        "colab_type": "text",
+        "id": "mLskt7EfXAjr"
       },
-      "outputs": [],
+      "cell_type": "markdown",
       "source": [
-        "def display_image(epoch_no):\n",
-        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
+        "\n",
+        "After training, its time to generate some images! \n",
+        "The last step is to plot the generated images and voila!\n"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "5x3q9_Oe5q0A"
+        "id": "WfO5wCdclHGL",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "display_image(EPOCHS)"
-      ]
+        "# Display a single image using the epoch number\n",
+        "def display_image(epoch_no):\n",
+        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
-        "colab_type": "text",
-        "id": "NywiH3nL8guF"
+        "colab_type": "code",
+        "id": "5x3q9_Oe5q0A",
+        "colab": {}
       },
+      "cell_type": "code",
       "source": [
-        "## Generate a GIF of all the saved images."
-      ]
+        "display_image(EPOCHS)"
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
-        "id": "xmO0Dmu2WICn"
+        "id": "NywiH3nL8guF"
       },
+      "cell_type": "markdown",
       "source": [
-        "\u003c!-- TODO(markdaoust): Remove the hack when Ipython version is updated --\u003e\n"
+        "**Generate a GIF of all the saved images**\n",
+        "\n",
+        "We will use imageio to create an animated gif using all the images saved during training."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "IGKQgENQ8lEI"
+        "id": "IGKQgENQ8lEI",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "with imageio.get_writer('dcgan.gif', mode='I') as writer:\n",
         "  filenames = glob.glob('image*.png')\n",
@@ -617,7 +784,7 @@
         "  last = -1\n",
         "  for i,filename in enumerate(filenames):\n",
         "    frame = 2*(i**0.5)\n",
-        "    if round(frame) \u003e round(last):\n",
+        "    if round(frame) > round(last):\n",
         "      last = frame\n",
         "    else:\n",
         "      continue\n",
@@ -628,67 +795,84 @@
         "    \n",
         "# this is a hack to display the gif inside the notebook\n",
         "os.system('cp dcgan.gif dcgan.gif.png')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "cGhC3-fMWSwl"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Display the animated gif with all the mages generated during the training of GANs."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "uV0yiKpzNP1b"
+        "id": "uV0yiKpzNP1b",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "display.Image(filename=\"dcgan.gif.png\")"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "6EEG-wePkmJQ"
       },
+      "cell_type": "markdown",
       "source": [
-        "To downlod the animation from Colab uncomment the code below:"
+        "**Download the animated gif**\n",
+        "\n",
+        "Uncomment the code below to download an animated gif from Colab."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "4UJjSnIMOzOJ"
+        "id": "4UJjSnIMOzOJ",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "#from google.colab import files\n",
         "#files.download('dcgan.gif')"
-      ]
-    }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "dcgan.ipynb",
-      "private_outputs": true,
-      "provenance": [
-        {
-          "file_id": "1eb0NOTQapkYs3X0v-zL1x5_LFKgDISnp",
-          "timestamp": 1527173385672
-        }
       ],
-      "toc_visible": true,
-      "version": "0.3.2"
+      "execution_count": 0,
+      "outputs": []
     },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "k6qC-SbjK0yW"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "## Learn more about GANs\n"
+      ]
+    },
+    {
+      "metadata": {
+        "colab_type": "text",
+        "id": "xjjkT9KAK6H7"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "We hope this tutorial was helpful! As a next step, you might like to experiment with a different dataset, for example the Large-scale Celeb Faces Attributes (CelebA) dataset [available on Kaggle](https://www.kaggle.com/jessicali9530/celeba-dataset/home).\n",
+        "\n",
+        "To learn more about GANs:\n",
+        "\n",
+        "* Check out MIT's lecture (linked above), or [this](http://cs231n.stanford.edu/slides/2018/cs231n_2018_lecture12.pdf) lecture form Stanford's CS231n. \n",
+        "\n",
+        "* We also recommend the [CVPR 2018 Tutorial on GANs](https://sites.google.com/view/cvpr2018tutorialongans/), and the [NIPS 2016 Tutorial: Generative Adversarial Networks](https://arxiv.org/abs/1701.00160).\n"
+      ]
     }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
+  ]
+}
\ No newline at end of file
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png b/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..b715bd83ef117641c6429e0ac173dbe9b8d5fd88
Binary files /dev/null and b/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png differ
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
index 3acecd283cda83992bab0c37cf0b8037ed2cf27a..12c5eff2b4aa901bdab52bf545e95b1e4dce7468 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
@@ -1,1184 +1,1174 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "K2s1A9eLRPEj"
+   },
+   "source": [
+    "##### Copyright 2018 The TensorFlow Authors.\n",
+    "\n",
+    "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "Cffg2i257iMS"
+   },
+   "source": [
+    "# Image Captioning with Attention\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
+    "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\">\n",
+    "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
+    "</td><td>\n",
+    "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "QASbY_HGo4Lq"
+   },
+   "source": [
+    "Image captioning is the task of generating a caption for an image. Given an image like this:\n",
+    "\n",
+    "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
+    "\n",
+    "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
+    "\n",
+    "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n",
+    "\n",
+    "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n",
+    "\n",
+    "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n",
+    "\n",
+    "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n",
+    "\n",
+    "This notebook is an end-to-end example. If you run it, it will download the  [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n",
+    "\n",
+    "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n",
+    "\n",
+    "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
     "colab": {
-      "name": "image_captioning_with_attention.ipynb",
-      "version": "0.3.2",
-      "views": {},
-      "default_view": {},
-      "provenance": [
-        {
-          "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
-          "timestamp": 1530222436922
-        }
-      ],
-      "private_outputs": true,
-      "collapsed_sections": [],
-      "toc_visible": true
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "accelerator": "GPU"
+    "colab_type": "code",
+    "id": "U8l4RJ0XRPEm"
+   },
+   "outputs": [],
+   "source": [
+    "# Import TensorFlow and enable eager execution\n",
+    "# This code requires TensorFlow version >=1.9\n",
+    "import tensorflow as tf\n",
+    "tf.enable_eager_execution()\n",
+    "\n",
+    "# We'll generate plots of attention in order to see which parts of an image\n",
+    "# our model focuses on during captioning\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Scikit-learn includes many helpful utilities\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.utils import shuffle\n",
+    "\n",
+    "import re\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import time\n",
+    "import json\n",
+    "from glob import glob\n",
+    "from PIL import Image\n",
+    "import pickle"
+   ]
   },
-  "cells": [
-    {
-      "metadata": {
-        "id": "K2s1A9eLRPEj",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "##### Copyright 2018 The TensorFlow Authors.\n",
-        "\n",
-        "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
-      ]
-    },
-    {
-      "metadata": {
-        "id": "Cffg2i257iMS",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "# Image Captioning with Attention\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\">\n",
-        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-        "</td><td>\n",
-        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
-      ]
-    },
-    {
-      "metadata": {
-        "id": "QASbY_HGo4Lq",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "Image captioning is the task of generating a caption for an image. Given an image like this:\n",
-        "\n",
-        "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
-        "\n",
-        "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
-        "\n",
-        "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n",
-        "\n",
-        "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n",
-        "\n",
-        "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n",
-        "\n",
-        "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n",
-        "\n",
-        "This notebook is an end-to-end example. If you run it, it will download the  [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n",
-        "\n",
-        "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n",
-        "\n",
-        "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n"
-      ]
-    },
-    {
-      "metadata": {
-        "id": "U8l4RJ0XRPEm",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# Import TensorFlow and enable eager execution\n",
-        "# This code requires TensorFlow version >=1.9\n",
-        "import tensorflow as tf\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "# We'll generate plots of attention in order to see which parts of an image\n",
-        "# our model focuses on during captioning\n",
-        "import matplotlib.pyplot as plt\n",
-        "\n",
-        "# Scikit-learn includes many helpful utilities\n",
-        "from sklearn.model_selection import train_test_split\n",
-        "from sklearn.utils import shuffle\n",
-        "\n",
-        "import re\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import time\n",
-        "import json\n",
-        "from glob import glob\n",
-        "from PIL import Image\n",
-        "import pickle"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "b6qbGw8MRPE5",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Download and prepare the MS-COCO dataset\n",
-        "\n",
-        "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically.  \n",
-        "\n",
-        "**Caution: large download ahead**. We'll use the training set, it's a 13GB file."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "krQuPYTtRPE7",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "annotation_zip = tf.keras.utils.get_file('captions.zip', \n",
-        "                                          cache_subdir=os.path.abspath('.'),\n",
-        "                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n",
-        "                                          extract = True)\n",
-        "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n",
-        "\n",
-        "name_of_zip = 'train2014.zip'\n",
-        "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n",
-        "  image_zip = tf.keras.utils.get_file(name_of_zip, \n",
-        "                                      cache_subdir=os.path.abspath('.'),\n",
-        "                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',\n",
-        "                                      extract = True)\n",
-        "  PATH = os.path.dirname(image_zip)+'/train2014/'\n",
-        "else:\n",
-        "  PATH = os.path.abspath('.')+'/train2014/'"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "aANEzb5WwSzg",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Optionally, limit the size of the training set for faster training\n",
-        "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "4G3b8x8_RPFD",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# read the json file\n",
-        "with open(annotation_file, 'r') as f:\n",
-        "    annotations = json.load(f)\n",
-        "\n",
-        "# storing the captions and the image name in vectors\n",
-        "all_captions = []\n",
-        "all_img_name_vector = []\n",
-        "\n",
-        "for annot in annotations['annotations']:\n",
-        "    caption = '<start> ' + annot['caption'] + ' <end>'\n",
-        "    image_id = annot['image_id']\n",
-        "    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n",
-        "    \n",
-        "    all_img_name_vector.append(full_coco_image_path)\n",
-        "    all_captions.append(caption)\n",
-        "\n",
-        "# shuffling the captions and image_names together\n",
-        "# setting a random state\n",
-        "train_captions, img_name_vector = shuffle(all_captions,\n",
-        "                                          all_img_name_vector,\n",
-        "                                          random_state=1)\n",
-        "\n",
-        "# selecting the first 30000 captions from the shuffled set\n",
-        "num_examples = 30000\n",
-        "train_captions = train_captions[:num_examples]\n",
-        "img_name_vector = img_name_vector[:num_examples]"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "mPBMgK34RPFL",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "len(train_captions), len(all_captions)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "8cSW4u-ORPFQ",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Preprocess the images using InceptionV3\n",
-        "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n",
-        "\n",
-        "First, we will need to convert the images into the format inceptionV3 expects by:\n",
-        "* Resizing the image to (299, 299)\n",
-        "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "zXR0217aRPFR",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def load_image(image_path):\n",
-        "    img = tf.read_file(image_path)\n",
-        "    img = tf.image.decode_jpeg(img, channels=3)\n",
-        "    img = tf.image.resize_images(img, (299, 299))\n",
-        "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
-        "    return img, image_path"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "MDvIu4sXRPFV",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Initialize InceptionV3 and load the pretrained Imagenet weights\n",
-        "\n",
-        "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n",
-        "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n",
-        "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n",
-        "* We avoid doing this during training so it does not become a bottleneck. \n",
-        "* After all the images are passed through the network, we pickle the dictionary and save it to disk."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "RD3vW4SsRPFW",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "image_model = tf.keras.applications.InceptionV3(include_top=False, \n",
-        "                                                weights='imagenet')\n",
-        "new_input = image_model.input\n",
-        "hidden_layer = image_model.layers[-1].output\n",
-        "\n",
-        "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "rERqlR3WRPGO",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Caching the features extracted from InceptionV3\n",
-        "\n",
-        "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n",
-        "\n",
-        "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n",
-        "\n",
-        "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n",
-        "\n",
-        "```for img, path in image_dataset:``` \n",
-        "\n",
-        "to:\n",
-        "\n",
-        "```for img, path in tqdm(image_dataset):```."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "Dx_fvbVgRPGQ",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# getting the unique images\n",
-        "encode_train = sorted(set(img_name_vector))\n",
-        "\n",
-        "# feel free to change the batch_size according to your system configuration\n",
-        "image_dataset = tf.data.Dataset.from_tensor_slices(\n",
-        "                                encode_train).map(load_image).batch(16)\n",
-        "\n",
-        "for img, path in image_dataset:\n",
-        "  batch_features = image_features_extract_model(img)\n",
-        "  batch_features = tf.reshape(batch_features, \n",
-        "                              (batch_features.shape[0], -1, batch_features.shape[3]))\n",
-        "\n",
-        "  for bf, p in zip(batch_features, path):\n",
-        "    path_of_feature = p.numpy().decode(\"utf-8\")\n",
-        "    np.save(path_of_feature, bf.numpy())"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "nyqH3zFwRPFi",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Preprocess and tokenize the captions\n",
-        "\n",
-        "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n",
-        "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n",
-        "* Finally, we create a word --> index mapping and vice-versa.\n",
-        "* We will then pad all sequences to the be same length as the longest one. "
-      ]
-    },
-    {
-      "metadata": {
-        "id": "HZfK8RhQRPFj",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# This will find the maximum length of any caption in our dataset\n",
-        "def calc_max_length(tensor):\n",
-        "    return max(len(t) for t in tensor)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "oJGE34aiRPFo",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# The steps above is a general process of dealing with text processing\n",
-        "\n",
-        "# choosing the top 5000 words from the vocabulary\n",
-        "top_k = 5000\n",
-        "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n",
-        "                                                  oov_token=\"<unk>\", \n",
-        "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n",
-        "tokenizer.fit_on_texts(train_captions)\n",
-        "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "b6qbGw8MRPE5"
+   },
+   "source": [
+    "## Download and prepare the MS-COCO dataset\n",
+    "\n",
+    "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically.  \n",
+    "\n",
+    "**Caution: large download ahead**. We'll use the training set, it's a 13GB file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "8Q44tNQVRPFt",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "tokenizer.word_index = {key:value for key, value in tokenizer.word_index.items() if value <= top_k}\n",
-        "# putting <unk> token in the word2idx dictionary\n",
-        "tokenizer.word_index[tokenizer.oov_token] = top_k + 1\n",
-        "tokenizer.word_index['<pad>'] = 0"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "krQuPYTtRPE7"
+   },
+   "outputs": [],
+   "source": [
+    "annotation_zip = tf.keras.utils.get_file('captions.zip', \n",
+    "                                          cache_subdir=os.path.abspath('.'),\n",
+    "                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n",
+    "                                          extract = True)\n",
+    "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n",
+    "\n",
+    "name_of_zip = 'train2014.zip'\n",
+    "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n",
+    "  image_zip = tf.keras.utils.get_file(name_of_zip, \n",
+    "                                      cache_subdir=os.path.abspath('.'),\n",
+    "                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',\n",
+    "                                      extract = True)\n",
+    "  PATH = os.path.dirname(image_zip)+'/train2014/'\n",
+    "else:\n",
+    "  PATH = os.path.abspath('.')+'/train2014/'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "aANEzb5WwSzg"
+   },
+   "source": [
+    "## Optionally, limit the size of the training set for faster training\n",
+    "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "0fpJb5ojRPFv",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# creating the tokenized vectors\n",
-        "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "4G3b8x8_RPFD"
+   },
+   "outputs": [],
+   "source": [
+    "# read the json file\n",
+    "with open(annotation_file, 'r') as f:\n",
+    "    annotations = json.load(f)\n",
+    "\n",
+    "# storing the captions and the image name in vectors\n",
+    "all_captions = []\n",
+    "all_img_name_vector = []\n",
+    "\n",
+    "for annot in annotations['annotations']:\n",
+    "    caption = '<start> ' + annot['caption'] + ' <end>'\n",
+    "    image_id = annot['image_id']\n",
+    "    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n",
+    "    \n",
+    "    all_img_name_vector.append(full_coco_image_path)\n",
+    "    all_captions.append(caption)\n",
+    "\n",
+    "# shuffling the captions and image_names together\n",
+    "# setting a random state\n",
+    "train_captions, img_name_vector = shuffle(all_captions,\n",
+    "                                          all_img_name_vector,\n",
+    "                                          random_state=1)\n",
+    "\n",
+    "# selecting the first 30000 captions from the shuffled set\n",
+    "num_examples = 30000\n",
+    "train_captions = train_captions[:num_examples]\n",
+    "img_name_vector = img_name_vector[:num_examples]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "olQArbgbRPF1",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# creating a reverse mapping (index -> word)\n",
-        "index_word = {value:key for key, value in tokenizer.word_index.items()}"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "mPBMgK34RPFL"
+   },
+   "outputs": [],
+   "source": [
+    "len(train_captions), len(all_captions)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "8cSW4u-ORPFQ"
+   },
+   "source": [
+    "## Preprocess the images using InceptionV3\n",
+    "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n",
+    "\n",
+    "First, we will need to convert the images into the format inceptionV3 expects by:\n",
+    "* Resizing the image to (299, 299)\n",
+    "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "AidglIZVRPF4",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# padding each vector to the max_length of the captions\n",
-        "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n",
-        "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "zXR0217aRPFR"
+   },
+   "outputs": [],
+   "source": [
+    "def load_image(image_path):\n",
+    "    img = tf.read_file(image_path)\n",
+    "    img = tf.image.decode_jpeg(img, channels=3)\n",
+    "    img = tf.image.resize_images(img, (299, 299))\n",
+    "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
+    "    return img, image_path"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "MDvIu4sXRPFV"
+   },
+   "source": [
+    "## Initialize InceptionV3 and load the pretrained Imagenet weights\n",
+    "\n",
+    "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n",
+    "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n",
+    "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n",
+    "* We avoid doing this during training so it does not become a bottleneck. \n",
+    "* After all the images are passed through the network, we pickle the dictionary and save it to disk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "gL0wkttkRPGA",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# calculating the max_length \n",
-        "# used to store the attention weights\n",
-        "max_length = calc_max_length(train_seqs)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "RD3vW4SsRPFW"
+   },
+   "outputs": [],
+   "source": [
+    "image_model = tf.keras.applications.InceptionV3(include_top=False, \n",
+    "                                                weights='imagenet')\n",
+    "new_input = image_model.input\n",
+    "hidden_layer = image_model.layers[-1].output\n",
+    "\n",
+    "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "rERqlR3WRPGO"
+   },
+   "source": [
+    "## Caching the features extracted from InceptionV3\n",
+    "\n",
+    "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n",
+    "\n",
+    "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n",
+    "\n",
+    "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n",
+    "\n",
+    "```for img, path in image_dataset:``` \n",
+    "\n",
+    "to:\n",
+    "\n",
+    "```for img, path in tqdm(image_dataset):```."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "M3CD75nDpvTI",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Split the data into training and testing"
-      ]
+    "colab_type": "code",
+    "id": "Dx_fvbVgRPGQ"
+   },
+   "outputs": [],
+   "source": [
+    "# getting the unique images\n",
+    "encode_train = sorted(set(img_name_vector))\n",
+    "\n",
+    "# feel free to change the batch_size according to your system configuration\n",
+    "image_dataset = tf.data.Dataset.from_tensor_slices(\n",
+    "                                encode_train).map(load_image).batch(16)\n",
+    "\n",
+    "for img, path in image_dataset:\n",
+    "  batch_features = image_features_extract_model(img)\n",
+    "  batch_features = tf.reshape(batch_features, \n",
+    "                              (batch_features.shape[0], -1, batch_features.shape[3]))\n",
+    "\n",
+    "  for bf, p in zip(batch_features, path):\n",
+    "    path_of_feature = p.numpy().decode(\"utf-8\")\n",
+    "    np.save(path_of_feature, bf.numpy())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "nyqH3zFwRPFi"
+   },
+   "source": [
+    "## Preprocess and tokenize the captions\n",
+    "\n",
+    "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n",
+    "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n",
+    "* Finally, we create a word --> index mapping and vice-versa.\n",
+    "* We will then pad all sequences to the be same length as the longest one. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "iS7DDMszRPGF",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# Create training and validation sets using 80-20 split\n",
-        "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n",
-        "                                                                    cap_vector, \n",
-        "                                                                    test_size=0.2, \n",
-        "                                                                    random_state=0)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "HZfK8RhQRPFj"
+   },
+   "outputs": [],
+   "source": [
+    "# This will find the maximum length of any caption in our dataset\n",
+    "def calc_max_length(tensor):\n",
+    "    return max(len(t) for t in tensor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "XmViPkRFRPGH",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "oJGE34aiRPFo"
+   },
+   "outputs": [],
+   "source": [
+    "# The steps above is a general process of dealing with text processing\n",
+    "\n",
+    "# choosing the top 5000 words from the vocabulary\n",
+    "top_k = 5000\n",
+    "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n",
+    "                                                  oov_token=\"<unk>\", \n",
+    "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n",
+    "tokenizer.fit_on_texts(train_captions)\n",
+    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "uEWM9xrYcg45",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.\n",
-        "\n"
-      ]
+    "colab_type": "code",
+    "id": "8Q44tNQVRPFt"
+   },
+   "outputs": [],
+   "source": [
+    "tokenizer.word_index['<pad>'] = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Q3TnZ1ToRPGV",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# feel free to change these parameters according to your system's configuration\n",
-        "\n",
-        "BATCH_SIZE = 64\n",
-        "BUFFER_SIZE = 1000\n",
-        "embedding_dim = 256\n",
-        "units = 512\n",
-        "vocab_size = len(tokenizer.word_index)\n",
-        "# shape of the vector extracted from InceptionV3 is (64, 2048)\n",
-        "# these two variables represent that\n",
-        "features_shape = 2048\n",
-        "attention_features_shape = 64"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "0fpJb5ojRPFv"
+   },
+   "outputs": [],
+   "source": [
+    "# creating the tokenized vectors\n",
+    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "SmZS2N0bXG3T",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# loading the numpy files \n",
-        "def map_func(img_name, cap):\n",
-        "    img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n",
-        "    return img_tensor, cap"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "AidglIZVRPF4"
+   },
+   "outputs": [],
+   "source": [
+    "# padding each vector to the max_length of the captions\n",
+    "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n",
+    "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "FDF_Nm3tRPGZ",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n",
-        "\n",
-        "# using map to load the numpy files in parallel\n",
-        "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n",
-        "# https://www.tensorflow.org/api_docs/python/tf/py_func\n",
-        "dataset = dataset.map(lambda item1, item2: tf.py_func(\n",
-        "          map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n",
-        "\n",
-        "# shuffling and batching\n",
-        "dataset = dataset.shuffle(BUFFER_SIZE)\n",
-        "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n",
-        "dataset = dataset.batch(BATCH_SIZE)\n",
-        "dataset = dataset.prefetch(1)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "gL0wkttkRPGA"
+   },
+   "outputs": [],
+   "source": [
+    "# calculating the max_length \n",
+    "# used to store the attention weights\n",
+    "max_length = calc_max_length(train_seqs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "M3CD75nDpvTI"
+   },
+   "source": [
+    "## Split the data into training and testing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "nrvoDphgRPGd",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Model\n",
-        "\n",
-        "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
-        "\n",
-        "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n",
-        "\n",
-        "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n",
-        "* We squash that to a shape of (64, 2048).\n",
-        "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n",
-        "* The RNN(here GRU) attends over the image to predict the next word."
-      ]
+    "colab_type": "code",
+    "id": "iS7DDMszRPGF"
+   },
+   "outputs": [],
+   "source": [
+    "# Create training and validation sets using 80-20 split\n",
+    "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n",
+    "                                                                    cap_vector, \n",
+    "                                                                    test_size=0.2, \n",
+    "                                                                    random_state=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "AAppCGLKRPGd",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def gru(units):\n",
-        "  # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n",
-        "  # significant speedup).\n",
-        "  if tf.test.is_gpu_available():\n",
-        "    return tf.keras.layers.CuDNNGRU(units, \n",
-        "                                    return_sequences=True, \n",
-        "                                    return_state=True, \n",
-        "                                    recurrent_initializer='glorot_uniform')\n",
-        "  else:\n",
-        "    return tf.keras.layers.GRU(units, \n",
-        "                               return_sequences=True, \n",
-        "                               return_state=True, \n",
-        "                               recurrent_activation='sigmoid', \n",
-        "                               recurrent_initializer='glorot_uniform')"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "XmViPkRFRPGH"
+   },
+   "outputs": [],
+   "source": [
+    "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "uEWM9xrYcg45"
+   },
+   "source": [
+    "## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "ja2LFTMSdeV3",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "class BahdanauAttention(tf.keras.Model):\n",
-        "  def __init__(self, units):\n",
-        "    super(BahdanauAttention, self).__init__()\n",
-        "    self.W1 = tf.keras.layers.Dense(units)\n",
-        "    self.W2 = tf.keras.layers.Dense(units)\n",
-        "    self.V = tf.keras.layers.Dense(1)\n",
-        "  \n",
-        "  def call(self, features, hidden):\n",
-        "    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n",
-        "    \n",
-        "    # hidden shape == (batch_size, hidden_size)\n",
-        "    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n",
-        "    hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
-        "    \n",
-        "    # score shape == (batch_size, 64, hidden_size)\n",
-        "    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n",
-        "    \n",
-        "    # attention_weights shape == (batch_size, 64, 1)\n",
-        "    # we get 1 at the last axis because we are applying score to self.V\n",
-        "    attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
-        "    \n",
-        "    # context_vector shape after sum == (batch_size, hidden_size)\n",
-        "    context_vector = attention_weights * features\n",
-        "    context_vector = tf.reduce_sum(context_vector, axis=1)\n",
-        "    \n",
-        "    return context_vector, attention_weights"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "Q3TnZ1ToRPGV"
+   },
+   "outputs": [],
+   "source": [
+    "# feel free to change these parameters according to your system's configuration\n",
+    "\n",
+    "BATCH_SIZE = 64\n",
+    "BUFFER_SIZE = 1000\n",
+    "embedding_dim = 256\n",
+    "units = 512\n",
+    "vocab_size = len(tokenizer.word_index)\n",
+    "# shape of the vector extracted from InceptionV3 is (64, 2048)\n",
+    "# these two variables represent that\n",
+    "features_shape = 2048\n",
+    "attention_features_shape = 64"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "AZ7R1RxHRPGf",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "class CNN_Encoder(tf.keras.Model):\n",
-        "    # Since we have already extracted the features and dumped it using pickle\n",
-        "    # This encoder passes those features through a Fully connected layer\n",
-        "    def __init__(self, embedding_dim):\n",
-        "        super(CNN_Encoder, self).__init__()\n",
-        "        # shape after fc == (batch_size, 64, embedding_dim)\n",
-        "        self.fc = tf.keras.layers.Dense(embedding_dim)\n",
-        "        \n",
-        "    def call(self, x):\n",
-        "        x = self.fc(x)\n",
-        "        x = tf.nn.relu(x)\n",
-        "        return x"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "SmZS2N0bXG3T"
+   },
+   "outputs": [],
+   "source": [
+    "# loading the numpy files \n",
+    "def map_func(img_name, cap):\n",
+    "    img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n",
+    "    return img_tensor, cap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "V9UbGQmERPGi",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "class RNN_Decoder(tf.keras.Model):\n",
-        "  def __init__(self, embedding_dim, units, vocab_size):\n",
-        "    super(RNN_Decoder, self).__init__()\n",
-        "    self.units = units\n",
-        "\n",
-        "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-        "    self.gru = gru(self.units)\n",
-        "    self.fc1 = tf.keras.layers.Dense(self.units)\n",
-        "    self.fc2 = tf.keras.layers.Dense(vocab_size)\n",
-        "    \n",
-        "    self.attention = BahdanauAttention(self.units)\n",
-        "        \n",
-        "  def call(self, x, features, hidden):\n",
-        "    # defining attention as a separate model\n",
-        "    context_vector, attention_weights = self.attention(features, hidden)\n",
-        "    \n",
-        "    # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
-        "    x = self.embedding(x)\n",
-        "    \n",
-        "    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
-        "    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
-        "    \n",
-        "    # passing the concatenated vector to the GRU\n",
-        "    output, state = self.gru(x)\n",
-        "    \n",
-        "    # shape == (batch_size, max_length, hidden_size)\n",
-        "    x = self.fc1(output)\n",
-        "    \n",
-        "    # x shape == (batch_size * max_length, hidden_size)\n",
-        "    x = tf.reshape(x, (-1, x.shape[2]))\n",
-        "    \n",
-        "    # output shape == (batch_size * max_length, vocab)\n",
-        "    x = self.fc2(x)\n",
-        "\n",
-        "    return x, state, attention_weights\n",
-        "\n",
-        "  def reset_state(self, batch_size):\n",
-        "    return tf.zeros((batch_size, self.units))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "FDF_Nm3tRPGZ"
+   },
+   "outputs": [],
+   "source": [
+    "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n",
+    "\n",
+    "# using map to load the numpy files in parallel\n",
+    "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n",
+    "# https://www.tensorflow.org/api_docs/python/tf/py_func\n",
+    "dataset = dataset.map(lambda item1, item2: tf.py_func(\n",
+    "          map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n",
+    "\n",
+    "# shuffling and batching\n",
+    "dataset = dataset.shuffle(BUFFER_SIZE)\n",
+    "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n",
+    "dataset = dataset.batch(BATCH_SIZE)\n",
+    "dataset = dataset.prefetch(1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "nrvoDphgRPGd"
+   },
+   "source": [
+    "## Model\n",
+    "\n",
+    "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
+    "\n",
+    "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n",
+    "\n",
+    "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n",
+    "* We squash that to a shape of (64, 2048).\n",
+    "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n",
+    "* The RNN(here GRU) attends over the image to predict the next word."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Qs_Sr03wRPGk",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "encoder = CNN_Encoder(embedding_dim)\n",
-        "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "AAppCGLKRPGd"
+   },
+   "outputs": [],
+   "source": [
+    "def gru(units):\n",
+    "  # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n",
+    "  # significant speedup).\n",
+    "  if tf.test.is_gpu_available():\n",
+    "    return tf.keras.layers.CuDNNGRU(units, \n",
+    "                                    return_sequences=True, \n",
+    "                                    return_state=True, \n",
+    "                                    recurrent_initializer='glorot_uniform')\n",
+    "  else:\n",
+    "    return tf.keras.layers.GRU(units, \n",
+    "                               return_sequences=True, \n",
+    "                               return_state=True, \n",
+    "                               recurrent_activation='sigmoid', \n",
+    "                               recurrent_initializer='glorot_uniform')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "-bYN7xA0RPGl",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "optimizer = tf.train.AdamOptimizer()\n",
-        "\n",
-        "# We are masking the loss calculated for padding\n",
-        "def loss_function(real, pred):\n",
-        "    mask = 1 - np.equal(real, 0)\n",
-        "    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
-        "    return tf.reduce_mean(loss_)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "ja2LFTMSdeV3"
+   },
+   "outputs": [],
+   "source": [
+    "class BahdanauAttention(tf.keras.Model):\n",
+    "  def __init__(self, units):\n",
+    "    super(BahdanauAttention, self).__init__()\n",
+    "    self.W1 = tf.keras.layers.Dense(units)\n",
+    "    self.W2 = tf.keras.layers.Dense(units)\n",
+    "    self.V = tf.keras.layers.Dense(1)\n",
+    "  \n",
+    "  def call(self, features, hidden):\n",
+    "    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n",
+    "    \n",
+    "    # hidden shape == (batch_size, hidden_size)\n",
+    "    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n",
+    "    hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
+    "    \n",
+    "    # score shape == (batch_size, 64, hidden_size)\n",
+    "    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n",
+    "    \n",
+    "    # attention_weights shape == (batch_size, 64, 1)\n",
+    "    # we get 1 at the last axis because we are applying score to self.V\n",
+    "    attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
+    "    \n",
+    "    # context_vector shape after sum == (batch_size, hidden_size)\n",
+    "    context_vector = attention_weights * features\n",
+    "    context_vector = tf.reduce_sum(context_vector, axis=1)\n",
+    "    \n",
+    "    return context_vector, attention_weights"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "PHod7t72RPGn",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Training\n",
-        "\n",
-        "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n",
-        "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n",
-        "* The decoder returns the predictions and the decoder hidden state.\n",
-        "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
-        "* Use teacher forcing to decide the next input to the decoder.\n",
-        "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n",
-        "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n"
-      ]
+    "colab_type": "code",
+    "id": "AZ7R1RxHRPGf"
+   },
+   "outputs": [],
+   "source": [
+    "class CNN_Encoder(tf.keras.Model):\n",
+    "    # Since we have already extracted the features and dumped it using pickle\n",
+    "    # This encoder passes those features through a Fully connected layer\n",
+    "    def __init__(self, embedding_dim):\n",
+    "        super(CNN_Encoder, self).__init__()\n",
+    "        # shape after fc == (batch_size, 64, embedding_dim)\n",
+    "        self.fc = tf.keras.layers.Dense(embedding_dim)\n",
+    "        \n",
+    "    def call(self, x):\n",
+    "        x = self.fc(x)\n",
+    "        x = tf.nn.relu(x)\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Vt4WZ5mhJE-E",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# adding this in a separate cell because if you run the training cell \n",
-        "# many times, the loss_plot array will be reset\n",
-        "loss_plot = []"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "V9UbGQmERPGi"
+   },
+   "outputs": [],
+   "source": [
+    "class RNN_Decoder(tf.keras.Model):\n",
+    "  def __init__(self, embedding_dim, units, vocab_size):\n",
+    "    super(RNN_Decoder, self).__init__()\n",
+    "    self.units = units\n",
+    "\n",
+    "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
+    "    self.gru = gru(self.units)\n",
+    "    self.fc1 = tf.keras.layers.Dense(self.units)\n",
+    "    self.fc2 = tf.keras.layers.Dense(vocab_size)\n",
+    "    \n",
+    "    self.attention = BahdanauAttention(self.units)\n",
+    "        \n",
+    "  def call(self, x, features, hidden):\n",
+    "    # defining attention as a separate model\n",
+    "    context_vector, attention_weights = self.attention(features, hidden)\n",
+    "    \n",
+    "    # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
+    "    x = self.embedding(x)\n",
+    "    \n",
+    "    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
+    "    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
+    "    \n",
+    "    # passing the concatenated vector to the GRU\n",
+    "    output, state = self.gru(x)\n",
+    "    \n",
+    "    # shape == (batch_size, max_length, hidden_size)\n",
+    "    x = self.fc1(output)\n",
+    "    \n",
+    "    # x shape == (batch_size * max_length, hidden_size)\n",
+    "    x = tf.reshape(x, (-1, x.shape[2]))\n",
+    "    \n",
+    "    # output shape == (batch_size * max_length, vocab)\n",
+    "    x = self.fc2(x)\n",
+    "\n",
+    "    return x, state, attention_weights\n",
+    "\n",
+    "  def reset_state(self, batch_size):\n",
+    "    return tf.zeros((batch_size, self.units))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "UlA4VIQpRPGo",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "EPOCHS = 20\n",
-        "\n",
-        "for epoch in range(EPOCHS):\n",
-        "    start = time.time()\n",
-        "    total_loss = 0\n",
-        "    \n",
-        "    for (batch, (img_tensor, target)) in enumerate(dataset):\n",
-        "        loss = 0\n",
-        "        \n",
-        "        # initializing the hidden state for each batch\n",
-        "        # because the captions are not related from image to image\n",
-        "        hidden = decoder.reset_state(batch_size=target.shape[0])\n",
-        "\n",
-        "        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)\n",
-        "        \n",
-        "        with tf.GradientTape() as tape:\n",
-        "            features = encoder(img_tensor)\n",
-        "            \n",
-        "            for i in range(1, target.shape[1]):\n",
-        "                # passing the features through the decoder\n",
-        "                predictions, hidden, _ = decoder(dec_input, features, hidden)\n",
-        "\n",
-        "                loss += loss_function(target[:, i], predictions)\n",
-        "                \n",
-        "                # using teacher forcing\n",
-        "                dec_input = tf.expand_dims(target[:, i], 1)\n",
-        "        \n",
-        "        total_loss += (loss / int(target.shape[1]))\n",
-        "        \n",
-        "        variables = encoder.variables + decoder.variables\n",
-        "        \n",
-        "        gradients = tape.gradient(loss, variables) \n",
-        "        \n",
-        "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
-        "        \n",
-        "        if batch % 100 == 0:\n",
-        "            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n",
-        "                                                          batch, \n",
-        "                                                          loss.numpy() / int(target.shape[1])))\n",
-        "    # storing the epoch end loss value to plot later\n",
-        "    loss_plot.append(total_loss / len(cap_vector))\n",
-        "    \n",
-        "    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n",
-        "                                         total_loss/len(cap_vector)))\n",
-        "    print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "Qs_Sr03wRPGk"
+   },
+   "outputs": [],
+   "source": [
+    "encoder = CNN_Encoder(embedding_dim)\n",
+    "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "1Wm83G-ZBPcC",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "plt.plot(loss_plot)\n",
-        "plt.xlabel('Epochs')\n",
-        "plt.ylabel('Loss')\n",
-        "plt.title('Loss Plot')\n",
-        "plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "-bYN7xA0RPGl"
+   },
+   "outputs": [],
+   "source": [
+    "optimizer = tf.train.AdamOptimizer()\n",
+    "\n",
+    "# We are masking the loss calculated for padding\n",
+    "def loss_function(real, pred):\n",
+    "    mask = 1 - np.equal(real, 0)\n",
+    "    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
+    "    return tf.reduce_mean(loss_)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "PHod7t72RPGn"
+   },
+   "source": [
+    "## Training\n",
+    "\n",
+    "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n",
+    "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n",
+    "* The decoder returns the predictions and the decoder hidden state.\n",
+    "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
+    "* Use teacher forcing to decide the next input to the decoder.\n",
+    "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n",
+    "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "xGvOcLQKghXN",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Caption!\n",
-        "\n",
-        "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
-        "* Stop predicting when the model predicts the end token.\n",
-        "* And store the attention weights for every time step."
-      ]
+    "colab_type": "code",
+    "id": "Vt4WZ5mhJE-E"
+   },
+   "outputs": [],
+   "source": [
+    "# adding this in a separate cell because if you run the training cell \n",
+    "# many times, the loss_plot array will be reset\n",
+    "loss_plot = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "RCWpDtyNRPGs",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def evaluate(image):\n",
-        "    attention_plot = np.zeros((max_length, attention_features_shape))\n",
-        "\n",
-        "    hidden = decoder.reset_state(batch_size=1)\n",
-        "\n",
-        "    temp_input = tf.expand_dims(load_image(image)[0], 0)\n",
-        "    img_tensor_val = image_features_extract_model(temp_input)\n",
-        "    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n",
-        "\n",
-        "    features = encoder(img_tensor_val)\n",
-        "\n",
-        "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
-        "    result = []\n",
-        "\n",
-        "    for i in range(max_length):\n",
-        "        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n",
-        "\n",
-        "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
-        "\n",
-        "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
-        "        result.append(index_word[predicted_id])\n",
-        "\n",
-        "        if index_word[predicted_id] == '<end>':\n",
-        "            return result, attention_plot\n",
-        "\n",
-        "        dec_input = tf.expand_dims([predicted_id], 0)\n",
-        "\n",
-        "    attention_plot = attention_plot[:len(result), :]\n",
-        "    return result, attention_plot"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "UlA4VIQpRPGo"
+   },
+   "outputs": [],
+   "source": [
+    "EPOCHS = 20\n",
+    "\n",
+    "for epoch in range(EPOCHS):\n",
+    "    start = time.time()\n",
+    "    total_loss = 0\n",
+    "    \n",
+    "    for (batch, (img_tensor, target)) in enumerate(dataset):\n",
+    "        loss = 0\n",
+    "        \n",
+    "        # initializing the hidden state for each batch\n",
+    "        # because the captions are not related from image to image\n",
+    "        hidden = decoder.reset_state(batch_size=target.shape[0])\n",
+    "\n",
+    "        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)\n",
+    "        \n",
+    "        with tf.GradientTape() as tape:\n",
+    "            features = encoder(img_tensor)\n",
+    "            \n",
+    "            for i in range(1, target.shape[1]):\n",
+    "                # passing the features through the decoder\n",
+    "                predictions, hidden, _ = decoder(dec_input, features, hidden)\n",
+    "\n",
+    "                loss += loss_function(target[:, i], predictions)\n",
+    "                \n",
+    "                # using teacher forcing\n",
+    "                dec_input = tf.expand_dims(target[:, i], 1)\n",
+    "        \n",
+    "        total_loss += (loss / int(target.shape[1]))\n",
+    "        \n",
+    "        variables = encoder.variables + decoder.variables\n",
+    "        \n",
+    "        gradients = tape.gradient(loss, variables) \n",
+    "        \n",
+    "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
+    "        \n",
+    "        if batch % 100 == 0:\n",
+    "            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n",
+    "                                                          batch, \n",
+    "                                                          loss.numpy() / int(target.shape[1])))\n",
+    "    # storing the epoch end loss value to plot later\n",
+    "    loss_plot.append(total_loss / len(cap_vector))\n",
+    "    \n",
+    "    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n",
+    "                                         total_loss/len(cap_vector)))\n",
+    "    print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "fD_y7PD6RPGt",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def plot_attention(image, result, attention_plot):\n",
-        "    temp_image = np.array(Image.open(image))\n",
-        "\n",
-        "    fig = plt.figure(figsize=(10, 10))\n",
-        "    \n",
-        "    len_result = len(result)\n",
-        "    for l in range(len_result):\n",
-        "        temp_att = np.resize(attention_plot[l], (8, 8))\n",
-        "        ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n",
-        "        ax.set_title(result[l])\n",
-        "        img = ax.imshow(temp_image)\n",
-        "        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n",
-        "\n",
-        "    plt.tight_layout()\n",
-        "    plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "1Wm83G-ZBPcC"
+   },
+   "outputs": [],
+   "source": [
+    "plt.plot(loss_plot)\n",
+    "plt.xlabel('Epochs')\n",
+    "plt.ylabel('Loss')\n",
+    "plt.title('Loss Plot')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "xGvOcLQKghXN"
+   },
+   "source": [
+    "## Caption!\n",
+    "\n",
+    "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
+    "* Stop predicting when the model predicts the end token.\n",
+    "* And store the attention weights for every time step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "io7ws3ReRPGv",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# captions on the validation set\n",
-        "rid = np.random.randint(0, len(img_name_val))\n",
-        "image = img_name_val[rid]\n",
-        "real_caption = ' '.join([index_word[i] for i in cap_val[rid] if i not in [0]])\n",
-        "result, attention_plot = evaluate(image)\n",
-        "\n",
-        "print ('Real Caption:', real_caption)\n",
-        "print ('Prediction Caption:', ' '.join(result))\n",
-        "plot_attention(image, result, attention_plot)\n",
-        "# opening the image\n",
-        "Image.open(img_name_val[rid])"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "RCWpDtyNRPGs"
+   },
+   "outputs": [],
+   "source": [
+    "def evaluate(image):\n",
+    "    attention_plot = np.zeros((max_length, attention_features_shape))\n",
+    "\n",
+    "    hidden = decoder.reset_state(batch_size=1)\n",
+    "\n",
+    "    temp_input = tf.expand_dims(load_image(image)[0], 0)\n",
+    "    img_tensor_val = image_features_extract_model(temp_input)\n",
+    "    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n",
+    "\n",
+    "    features = encoder(img_tensor_val)\n",
+    "\n",
+    "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
+    "    result = []\n",
+    "\n",
+    "    for i in range(max_length):\n",
+    "        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n",
+    "\n",
+    "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
+    "\n",
+    "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
+    "        result.append(tokenizer.index_word[predicted_id])\n",
+    "\n",
+    "        if tokenizer.index_word[predicted_id] == '<end>':\n",
+    "            return result, attention_plot\n",
+    "\n",
+    "        dec_input = tf.expand_dims([predicted_id], 0)\n",
+    "\n",
+    "    attention_plot = attention_plot[:len(result), :]\n",
+    "    return result, attention_plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Rprk3HEvZuxb",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Try it on your own images\n",
-        "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n"
-      ]
+    "colab_type": "code",
+    "id": "fD_y7PD6RPGt"
+   },
+   "outputs": [],
+   "source": [
+    "def plot_attention(image, result, attention_plot):\n",
+    "    temp_image = np.array(Image.open(image))\n",
+    "\n",
+    "    fig = plt.figure(figsize=(10, 10))\n",
+    "    \n",
+    "    len_result = len(result)\n",
+    "    for l in range(len_result):\n",
+    "        temp_att = np.resize(attention_plot[l], (8, 8))\n",
+    "        ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n",
+    "        ax.set_title(result[l])\n",
+    "        img = ax.imshow(temp_image)\n",
+    "        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "9Psd1quzaAWg",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "image_url = 'https://tensorflow.org/images/surf.jpg'\n",
-        "image_extension = image_url[-4:]\n",
-        "image_path = tf.keras.utils.get_file('image'+image_extension, \n",
-        "                                     origin=image_url)\n",
-        "\n",
-        "result, attention_plot = evaluate(image_path)\n",
-        "print ('Prediction Caption:', ' '.join(result))\n",
-        "plot_attention(image_path, result, attention_plot)\n",
-        "# opening the image\n",
-        "Image.open(image_path)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "io7ws3ReRPGv"
+   },
+   "outputs": [],
+   "source": [
+    "# captions on the validation set\n",
+    "rid = np.random.randint(0, len(img_name_val))\n",
+    "image = img_name_val[rid]\n",
+    "real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])\n",
+    "result, attention_plot = evaluate(image)\n",
+    "\n",
+    "print ('Real Caption:', real_caption)\n",
+    "print ('Prediction Caption:', ' '.join(result))\n",
+    "plot_attention(image, result, attention_plot)\n",
+    "# opening the image\n",
+    "Image.open(img_name_val[rid])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "Rprk3HEvZuxb"
+   },
+   "source": [
+    "## Try it on your own images\n",
+    "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
+    "colab_type": "code",
+    "id": "9Psd1quzaAWg"
+   },
+   "outputs": [],
+   "source": [
+    "image_url = 'https://tensorflow.org/images/surf.jpg'\n",
+    "image_extension = image_url[-4:]\n",
+    "image_path = tf.keras.utils.get_file('image'+image_extension, \n",
+    "                                     origin=image_url)\n",
+    "\n",
+    "result, attention_plot = evaluate(image_path)\n",
+    "print ('Prediction Caption:', ' '.join(result))\n",
+    "plot_attention(image_path, result, attention_plot)\n",
+    "# opening the image\n",
+    "Image.open(image_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "VJZXyJco6uLO"
+   },
+   "source": [
+    "# Next steps\n",
+    "\n",
+    "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset."
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "default_view": {},
+   "name": "image_captioning_with_attention.ipynb",
+   "private_outputs": true,
+   "provenance": [
     {
-      "metadata": {
-        "id": "VJZXyJco6uLO",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "# Next steps\n",
-        "\n",
-        "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset."
-      ]
+     "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
+     "timestamp": 1530222436922
     }
-  ]
+   ],
+   "toc_visible": true,
+   "version": "0.3.2",
+   "views": {}
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
 }
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
index e0d5e494d432b365b0d1dcff6b634de2e6213a43..bda9e77085e45ae31a228142135425e22a1c6780 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
@@ -567,7 +567,7 @@
         "\n",
         "* We get predictions using the start_string and the hidden state\n",
         "\n",
-        "* Then we use a multinomial distribution to calculate the index of the predicted word. **We use this predicted word as our next input to the model**\n",
+        "* Then we use argmax to calculate the index of the predicted word. **We use this predicted word as our next input to the model**\n",
         "\n",
         "* **The hidden state returned by the model is fed back into the model so that it now has more context rather than just one word.** After we predict the next word, the modified hidden states are again fed back into the model, which is how it learns as it gets more context from the previously predicted words.\n",
         "\n",
@@ -598,19 +598,13 @@
         "# empty string to store our results\n",
         "text_generated = ''\n",
         "\n",
-        "# low temperatures results in more predictable text.\n",
-        "# higher temperatures results in more surprising text\n",
-        "# experiment to find the best setting\n",
-        "temperature = 1.0\n",
-        "\n",
         "# hidden state shape == (batch_size, number of rnn units); here batch size == 1\n",
         "hidden = [tf.zeros((1, units))]\n",
         "for i in range(num_generate):\n",
         "    predictions, hidden = model(input_eval, hidden)\n",
         "\n",
-        "    # using a multinomial distribution to predict the word returned by the model\n",
-        "    predictions = predictions / temperature\n",
-        "    predicted_id = tf.argmax(predictions[0]).numpy()\n",
+        "    # using argmax to predict the word returned by the model\n",
+        "    predicted_id = tf.argmax(predictions[-1]).numpy()\n",
         "    \n",
         "    # We pass the predicted word as the next input to the model\n",
         "    # along with the previous hidden state\n",
@@ -632,7 +626,6 @@
         "\n",
         "* Change the start string to a different character, or the start of a sentence.\n",
         "* Experiment with training on a different, or with different parameters. [Project  Gutenberg](http://www.gutenberg.org/ebooks/100), for example, contains a large collection of books.\n",
-        "* Experiment with the temperature parameter.\n",
         "* Add another RNN layer.\n"
       ]
     },
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/main.py b/tensorflow/contrib/eager/python/examples/l2hmc/main.py
index 45e1f98429f48749d374c2aefd8874690c3830ad..98fcb2ba10aa4148dc1d4bd7ddfb6fa9c8c4537c 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/main.py
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/main.py
@@ -71,7 +71,7 @@ def main(_):
     # Training
     if FLAGS.use_defun:
       # Use `tfe.deun` to boost performance when there are lots of small ops
-      loss_fn = tfe.defun(l2hmc.compute_loss)
+      loss_fn = tfe.function(l2hmc.compute_loss)
     else:
       loss_fn = l2hmc.compute_loss
 
@@ -104,7 +104,7 @@ def main(_):
   # Evaluation
   if FLAGS.use_defun:
     # Use tfe.deun to boost performance when there are lots of small ops
-    apply_transition = tfe.defun(dynamics.apply_transition)
+    apply_transition = tfe.function(dynamics.apply_transition)
   else:
     apply_transition = dynamics.apply_transition
 
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
index 557ad42752144243ae3da61b955b31398cba846e..d412b25b368260b81256fd58034330b884261b2b 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
@@ -36,7 +36,7 @@ class GraphLinearRegressionBenchmark(tf.test.Benchmark):
         noise_level=0.01,
         batch_size=batch_size,
         num_batches=num_batches)
-    iterator = dataset.make_initializable_iterator()
+    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
     x, y = iterator.get_next()
 
     model = linear_regression.LinearModel()
diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
index 560fc8c5a22a0e7acf1f37cf7daf7790dc14de19..66d52a74943d0d81fde05ce51b019558b327978d 100644
--- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
@@ -352,7 +352,7 @@
         "And the pseudo-code:\n",
         "\n",
         "* `score = FC(tanh(FC(EO) + FC(H)))`\n",
-        "* `attention weights = softmax(score, axis = 1)`. Softmax by default is applied on the last axis but here we want to apply it on the *1st axis*, since the shape of score is *(batch_size, max_length, hidden_size)*. `Max_length` is the length of our input. Since we are trying to assign a weight to each input, softmax should be applied on that axis.\n",
+        "* `attention weights = softmax(score, axis = 1)`. Softmax by default is applied on the last axis but here we want to apply it on the *1st axis*, since the shape of score is *(batch_size, max_length, 1)*. `Max_length` is the length of our input. Since we are trying to assign a weight to each input, softmax should be applied on that axis.\n",
         "* `context vector = sum(attention weights * EO, axis = 1)`. Same reason as above for choosing axis as 1.\n",
         "* `embedding output` = The input to the decoder X is passed through an embedding layer.\n",
         "* `merged vector = concat(embedding output, context vector)`\n",
@@ -446,12 +446,12 @@
         "        # we are doing this to perform addition to calculate the score\n",
         "        hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
         "        \n",
-        "        # score shape == (batch_size, max_length, hidden_size)\n",
-        "        score = tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))\n",
+        "        # score shape == (batch_size, max_length, 1)\n",
+        "        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V\n",
+        "        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))\n",
         "        \n",
         "        # attention_weights shape == (batch_size, max_length, 1)\n",
-        "        # we get 1 at the last axis because we are applying score to self.V\n",
-        "        attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
+        "        attention_weights = tf.nn.softmax(score, axis=1)\n",
         "        \n",
         "        # context_vector shape after sum == (batch_size, hidden_size)\n",
         "        context_vector = attention_weights * enc_output\n",
@@ -768,7 +768,7 @@
       },
       "outputs": [],
       "source": [
-        "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
@@ -781,7 +781,7 @@
       },
       "outputs": [],
       "source": [
-        "translate('esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
@@ -794,7 +794,7 @@
       },
       "outputs": [],
       "source": [
-        "translate('¿todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
@@ -808,7 +808,7 @@
       "outputs": [],
       "source": [
         "# wrong translation\n",
-        "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb
index 8fae622e12864ddeee0cedd3cf99be8ea5e4bc48..446e3401184ded6bc34ed64cdd720e29a2851855 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/automatic_differentiation.ipynb
@@ -65,7 +65,7 @@
         "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/eager/automatic_differentiation.ipynb\"\u003e\n",
         "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
         "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/blob/master/site/en/tutorials/eager/automatic_differentiation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/eager/automatic_differentiation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     }
   ],
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
index 68a84d5fbb4f13e4ebe0d71e3f5caebe97e2101c..f3135a9668fc0dc7faa93a5f119b53f3efd34c6e 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD
+++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
@@ -35,6 +35,12 @@ cuda_py_test(
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    tags = [
+        "noasan",  # Fix b/118130911
+        "nomsan",  # Fix b/118130911
+        "notsan",  # Fix b/118130911
+        "optonly",
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
index f3bb978875e226f58d6a00e09154191673a97415..fb7975d8fe867711cff31d627788a2d62a520aa9 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
@@ -142,7 +142,8 @@ class ResNet50Benchmarks(tf.test.Benchmark):
       with tf.Graph().as_default():
         np_images, np_labels = random_batch(batch_size)
         dataset = tf.data.Dataset.from_tensors((np_images, np_labels)).repeat()
-        (images, labels) = dataset.make_one_shot_iterator().get_next()
+        images, labels = tf.compat.v1.data.make_one_shot_iterator(
+            dataset).get_next()
 
         model = resnet50.ResNet50(data_format())
         logits = model(images, training=True)
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index d265169b5eff685f7b79fb221b9bd52be37ead9c..fb81979d7bd8d17a55b8c448008765268dd07d1d 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -77,7 +77,7 @@ class ResNet50Test(tf.test.TestCase):
     device, data_format = device_and_data_format()
     model = resnet50.ResNet50(data_format)
     if defun:
-      model.call = tfe.defun(model.call)
+      model.call = tfe.function(model.call)
     with tf.device(device), tfe.execution_mode(execution_mode):
       images, _ = random_batch(2, data_format)
       output = model(images, training=False)
@@ -221,7 +221,7 @@ class ResNet50Benchmarks(tf.test.Benchmark):
       device, data_format = device_and_format
       model = resnet50.ResNet50(data_format)
       if defun:
-        model.call = tfe.defun(model.call)
+        model.call = tfe.function(model.call)
       batch_size = 64
       num_burn = 5
       num_iters = 30
@@ -266,8 +266,8 @@ class ResNet50Benchmarks(tf.test.Benchmark):
         optimizer = tf.train.GradientDescentOptimizer(0.1)
         apply_grads = apply_gradients
         if defun:
-          model.call = tfe.defun(model.call)
-          apply_grads = tfe.defun(apply_gradients)
+          model.call = tfe.function(model.call)
+          apply_grads = tfe.function(apply_gradients)
 
         num_burn = 3
         num_iters = 10
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main.py b/tensorflow/contrib/eager/python/examples/revnet/main.py
index b702e91f92220c2a9003a1b82411131332012a9e..9585f3565f83af724b6336e466d3671443ba2361 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/main.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/main.py
@@ -72,14 +72,11 @@ def main(_):
     train_one_iter(model, x, y, optimizer, global_step=global_step)
 
     if global_step.numpy() % config.log_every == 0:
-      it_test = ds_test.make_one_shot_iterator()
-      acc_test, loss_test = evaluate(model, it_test)
+      acc_test, loss_test = evaluate(model, ds_test)
 
       if FLAGS.validate:
-        it_train = ds_train_one_shot.make_one_shot_iterator()
-        it_validation = ds_validation.make_one_shot_iterator()
-        acc_train, loss_train = evaluate(model, it_train)
-        acc_validation, loss_validation = evaluate(model, it_validation)
+        acc_train, loss_train = evaluate(model, ds_train_one_shot)
+        acc_validation, loss_validation = evaluate(model, ds_validation)
         print("Iter {}, "
               "training set accuracy {:.4f}, loss {:.4f}; "
               "validation set accuracy {:.4f}, loss {:.4f}; "
@@ -218,11 +215,11 @@ def train_one_iter(model, inputs, labels, optimizer, global_step=None):
   return logits, loss
 
 
-def evaluate(model, iterator):
+def evaluate(model, dataset):
   """Compute accuracy with the given dataset iterator."""
   mean_loss = tfe.metrics.Mean()
   accuracy = tfe.metrics.Accuracy()
-  for x, y in iterator:
+  for x, y in dataset:
     logits, _ = model(x, training=False)
     loss = model.compute_loss(logits=logits, labels=y)
     accuracy(
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py b/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py
index 3a17eb30da3b989acb0b33f2fcb730da76546c18..125adbb9de6e4febbb4284bfe3a31f257e2e8037 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py
@@ -173,7 +173,7 @@ def main(_):
     input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
         "image": inputs
     })
-    revnet_estimator.export_savedmodel(FLAGS.model_dir, input_fn)
+    revnet_estimator.export_saved_model(FLAGS.model_dir, input_fn)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py b/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py
index 8520cf5b71af503be35d5415707a283fb363a476..b0676916a8da276704de741a50f40cd7d9525228 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py
@@ -307,7 +307,7 @@ def main(_):
       # The guide to serve an exported TensorFlow model is at:
       #    https://www.tensorflow.org/serving/serving_basic
       tf.logging.info("Starting to export model.")
-      revnet_classifier.export_savedmodel(
+      revnet_classifier.export_saved_model(
           export_dir_base=FLAGS.export_dir,
           serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
 
diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
index 4f4cc3af6f1d5c626b3e2ea7939ecad0ee2d41f1..971aa44f3034692dfb0d03ed3dabf4d6e911eb9f 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
@@ -127,6 +127,8 @@ class RevNetTest(tf.test.TestCase):
 
   def test_compute_gradients_defun(self):
     """Test `compute_gradients` function with defun."""
+    # TODO(apassos): make cond support returning None to let this happen with
+    # tf.function.
     compute_gradients = tfe.defun(self.model.compute_gradients)
     _, saved_hidden = self.model(self.x)
     grads, _ = compute_gradients(saved_hidden=saved_hidden, labels=self.t)
@@ -235,6 +237,7 @@ class RevNetBenchmark(tf.test.Benchmark):
       device, data_format = device_and_format
       model = revnet.RevNet(config=config)
       if defun:
+        # TODO(apassos): reenable after cond lets you return None
         model.call = tfe.defun(model.call)
       batch_size = 64
       num_burn = 5
@@ -282,7 +285,7 @@ class RevNetBenchmark(tf.test.Benchmark):
         model = revnet.RevNet(config=config)
         optimizer = tf.train.GradientDescentOptimizer(0.1)
         if defun:
-          model.call = tfe.defun(model.call)
+          model.call = tfe.function(model.call)
 
         num_burn = 3
         num_iters = 10
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py
index 63b5c4c54d13e9c2448ec1f572ca1389f2443bef..770484abed96e540cf75cc5368a1410c31a8d2d0 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py
@@ -82,7 +82,7 @@ class PTBBenchmark(tf.test.Benchmark):
         tf.ones(
             [PTBBenchmark.SEQ_LEN, PTBBenchmark.BATCH_SIZE],
             dtype=tf.int64)).repeat(num_iters + num_warmup)
-    inputs = dataset.make_one_shot_iterator().get_next()
+    inputs = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
 
     with tf.device(tf.test.gpu_device_name()):
       outputs = model(inputs, training=True)
@@ -124,7 +124,8 @@ class PTBBenchmark(tf.test.Benchmark):
             dtype=tf.int64)).repeat(num_iters + num_warmup)
     # inputs and labels have the same shape
     dataset = tf.data.Dataset.zip((dataset, dataset))
-    (inputs, labels) = dataset.make_one_shot_iterator().get_next()
+    (inputs, labels) = tf.compat.v1.data.make_one_shot_iterator(
+        dataset).get_next()
 
     with tf.device(tf.test.gpu_device_name()):
       optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 930e62b68096b468846a01b9674c669a8b8e9a53..566246de4957c1dc5919c10e22146706f9e50be8 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -24,6 +24,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -347,16 +348,17 @@ class Mean(Metric):
     Raises:
       ValueError: if the optional argument is not bool
     """
-     # Convert the boolean to tensor for tf.cond, if it is not.
+    # Convert the boolean to tensor for tf.cond, if it is not.
     if not isinstance(write_summary, ops.Tensor):
       write_summary = ops.convert_to_tensor(write_summary)
     t = self.numer / self.denom
     def write_summary_f():
       summary_ops.scalar(name=self.name, tensor=t)
       return t
-    control_flow_ops.cond(write_summary,
+    smart_cond.smart_cond(write_summary,
                           write_summary_f,
-                          lambda: t)
+                          lambda: t,
+                          name="")
     return t
 
 
@@ -487,6 +489,8 @@ class BinaryAccuracy(Mean):
         message="Shapes of labels and predictions are unequal")
     predictions = ops.convert_to_tensor(predictions)
     predictions = predictions > self.threshold
+    # Convert labels to bool to match predictions.
+    labels = math_ops.cast(labels, dtypes.bool)
     matches = math_ops.equal(labels, predictions)
     matches = math_ops.cast(matches, self.dtype)
     super(BinaryAccuracy, self).call(matches, weights=weights)
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 9d2d172752c7f3f3ee6eaa11ab8952313a4a3543..39e5957f5d1760613f2c33607c0bdb163040efb4 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -49,18 +49,6 @@ class MetricsTest(test.TestCase):
     self.assertEqual(dtypes.float64, m.dtype)
     self.assertEqual(dtypes.float64, m.result().dtype)
 
-  def testSummaryArg(self):
-    m = metrics.Mean()
-    m([1, 10, 100])
-    m(1000)
-    m([10000.0, 100000.0])
-    self.assertEqual(111111.0/6, m.result(write_summary=True).numpy())
-    self.assertEqual(111111.0/6, m.result(write_summary=False).numpy())
-    with self.assertRaises(ValueError):
-      m.result(write_summary=5)
-    with self.assertRaises(ValueError):
-      m.result(write_summary=[True])
-
   def testVariableCollections(self):
     with context.graph_mode(), ops.Graph().as_default():
       m = metrics.Mean()
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index f801d9a47b2f831a48d9b6335c69612c1356d800..5cc0c4f23d9d641ff1452c7cc9c1fcde612a33a2 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -24,7 +24,7 @@ import weakref
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
-from tensorflow.python.keras.engine import base_layer as keras_base_layer
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.layers import base
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
@@ -220,7 +220,7 @@ class Network(base.Layer):
         avoid_names = parent_network._owned_layers
         name_uid_map = parent_network._sub_layer_name_uids
       else:
-        name_uid_map = keras_base_layer.get_default_graph_uid_map()
+        name_uid_map = base_layer_utils.get_default_graph_uid_map()
         # Figure out which names we have to avoid based on which variable scope
         # we're nested in.
         strip_name = self._default_parent_variable_scope.name
diff --git a/tensorflow/contrib/eager/python/parameter_server.py b/tensorflow/contrib/eager/python/parameter_server.py
index 3a9e7b027ed68935f2bc0ddbd27a1821a663850d..7803a6799bb64441fab881bf6ca986d5cf3851a8 100644
--- a/tensorflow/contrib/eager/python/parameter_server.py
+++ b/tensorflow/contrib/eager/python/parameter_server.py
@@ -56,12 +56,7 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     # shape inference doesn't run in eager mode we copy this data here for when
     # the handle is captured by an eager mode function.
     # pylint: disable=protected-access
-    if ops._USE_C_SHAPES:
-      handle._handle_data = resource_variable_ops.get_resource_handle_data(h)
-    else:
-      if h._handle_data is None:
-        ops.set_shape_and_handle_data_for_outputs(h.op)
-      handle._handle_data = h._handle_data
+    handle._handle_data = resource_variable_ops.get_resource_handle_data(h)
     # pylint: enable=protected-access
   # Clean up op->graph->op reference cycles.
   ops.dismantle_graph(graph)
diff --git a/tensorflow/contrib/eager/python/remote_test.py b/tensorflow/contrib/eager/python/remote_test.py
index 7aa4b598b833c3419af501b49f1509d18f3530d5..3926de15e71c9917f88fc3f58740b8c75354ab26 100644
--- a/tensorflow/contrib/eager/python/remote_test.py
+++ b/tensorflow/contrib/eager/python/remote_test.py
@@ -206,6 +206,33 @@ class RemoteExecutionTest(test.TestCase):
       y = math_ops.matmul(x1, x2)
     np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
 
+  @run_sync_and_async
+  def testContextDeviceUpdated(self):
+    """Tests that the context device is correctly updated."""
+
+    with ops.device("cpu:0"):
+      x1 = array_ops.ones([2, 2])
+      x2 = array_ops.ones([2, 2])
+      y = math_ops.matmul(x1, x2)
+    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
+
+    # `y` is placed on the local CPU as expected.
+    self.assertEqual(y.device,
+                     "/job:%s/replica:0/task:0/device:CPU:0" % JOB_NAME)
+
+  @run_sync_and_async
+  def testGPUToRemoteCopy(self):
+    """Tests that the remote copy happens satisfactorily."""
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs.")
+
+    x1 = array_ops.ones([2, 2]).gpu()
+
+    with ops.device("/job:remote_device/replica:0/task:1/device:CPU:0"):
+      x2 = x1._copy()  # pylint: disable=protected-access
+
+    np.testing.assert_array_equal(x1.numpy(), x2.numpy())
+
 
 if __name__ == "__main__":
   ops.enable_eager_execution()
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index f9c716360c5755ee1902b576545d776725f9966f..1d0d6c6c14ce4a8e454206e0be9fea4724f09192 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -115,6 +115,11 @@ def restore_variables_on_create(save_path, map_func=None):
 
 class Saver(object):
   """A tf.train.Saver adapter for use when eager execution is enabled.
+
+  `Saver`'s name-based checkpointing strategy is fragile. Please switch to
+  `tf.train.Checkpoint` or `tf.keras.Model.save_weights`, which perform a more
+  robust object-based saving. These APIs will load checkpoints written by
+  `Saver`.
   """
 
   def __init__(self, var_list):
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index f5b8d95e4fc7fe5cd90d658eda49590e0b330bb0..33c988fd9065e7fbe7b9aeb85cad82eb3c119f76 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -25,6 +25,7 @@ To use, at program startup, call `tf.enable_eager_execution()`.
 
 @@py_func
 @@defun
+@@function
 @@make_template
 @@implicit_gradients
 @@implicit_value_and_gradients
@@ -101,7 +102,7 @@ from tensorflow.contrib.eager.python.saver import get_optimizer_variables
 from tensorflow.contrib.eager.python.saver import restore_variables_on_create
 from tensorflow.contrib.eager.python.saver import Saver
 from tensorflow.python.eager import backprop
-from tensorflow.python.eager import function
+from tensorflow.python.eager import function as _function_lib
 from tensorflow.python.eager.context import DEVICE_PLACEMENT_EXPLICIT
 from tensorflow.python.eager.context import DEVICE_PLACEMENT_WARN
 from tensorflow.python.eager.context import DEVICE_PLACEMENT_SILENT
@@ -115,6 +116,7 @@ from tensorflow.python.eager.context import SYNC
 from tensorflow.python.eager.context import ASYNC
 from tensorflow.python.eager.context import num_gpus
 from tensorflow.python.eager.context import set_server_def
+from tensorflow.python.eager.def_function import function
 from tensorflow.python.eager.execution_callbacks import add_execution_callback
 from tensorflow.python.eager.execution_callbacks import clear_execution_callbacks
 from tensorflow.python.eager.execution_callbacks import inf_callback
@@ -138,7 +140,7 @@ from tensorflow.python.training.checkpointable.util import Checkpoint
 from tensorflow.python.util.all_util import remove_undocumented
 
 py_func = script_ops.eager_py_func
-defun = function.defun
+defun = _function_lib.defun
 make_template = template.make_template_internal
 implicit_gradients = backprop.implicit_grad
 implicit_value_and_gradients = backprop.implicit_val_and_grad
diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py
index 4454abfb9667f824b9de0100bb81bae24ad5f7a6..8c35dddb5a515aa09cc70c173a9f0605e8567e82 100644
--- a/tensorflow/contrib/eager/python/tfe_test.py
+++ b/tensorflow/contrib/eager/python/tfe_test.py
@@ -87,8 +87,8 @@ class TFETest(test_util.TensorFlowTestCase):
       x += 1.
     # Without a device context, heuristics are used to place ops.
     # In this case, ops.reduce_mean runs on the GPU.
-    reduction_indices = range(x.shape.ndims)
-    m = math_ops.reduce_mean(x, reduction_indices)
+    axis = range(x.shape.ndims)
+    m = math_ops.reduce_mean(x, axis)
     # m is on GPU, bring it back to CPU and compare.
     self.assertEqual(3.5, m.cpu().numpy())
 
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 1ea00fb7f3c6a19824abc8eb80726bb3bba183aa..a888379f13e79d1c246d4cd6d19a225c065692a2 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -8,61 +8,29 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+# PLACEHOLDER PIP REQUIREMENTS
 
 py_library(
     name = "estimator_py",
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":baseline",
         ":boosted_trees",
-        ":dnn",
-        ":dnn_linear_combined",
         ":dnn_with_layer_annotations",
         ":early_stopping",
+        ":expect_tensorflow_estimator_installed",
         ":export",
         ":exporter",
         ":extenders",
         ":head",
         ":hooks",
-        ":linear",
         ":logit_fns",
         ":multi_head",
         ":replicate_model_fn",
         ":rnn",
         ":saved_model_estimator",
         "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
-py_library(
-    name = "baseline",
-    srcs = ["python/estimator/baseline.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:baseline",
-    ],
-)
-
-py_test(
-    name = "baseline_test",
-    size = "small",
-    srcs = ["python/estimator/baseline_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",
-    ],
-    deps = [
-        ":baseline",
-        ":head",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:metric_keys",
-        "//tensorflow/python/estimator:numpy_io",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
@@ -71,67 +39,18 @@ py_library(
     srcs = ["python/estimator/boosted_trees.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:boosted_trees",
     ],
 )
 
-py_test(
-    name = "boosted_trees_test",
-    size = "medium",
-    srcs = ["python/estimator/boosted_trees_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",
-    ],
-    deps = [
-        ":boosted_trees",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:numpy_io",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_library(
-    name = "dnn",
-    srcs = ["python/estimator/dnn.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:dnn",
-    ],
-)
-
-py_test(
-    name = "dnn_test",
-    size = "medium",
-    srcs = ["python/estimator/dnn_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",
-        "optonly",  # times out http://b/79220679
-    ],
-    deps = [
-        ":dnn",
-        ":head",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:dnn_testing_utils",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:prediction_keys",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "dnn_with_layer_annotations",
     srcs = ["python/estimator/dnn_with_layer_annotations.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:head",
@@ -140,64 +59,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "dnn_with_layer_annotations_test",
-    size = "medium",
-    srcs = ["python/estimator/dnn_with_layer_annotations_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",  # b/67510291
-    ],
-    deps = [
-        ":dnn_with_layer_annotations",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:dnn",
-        "//tensorflow/python/estimator:dnn_testing_utils",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:pandas_io",
-        "//tensorflow/python/estimator:prediction_keys",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "dnn_linear_combined",
-    srcs = ["python/estimator/dnn_linear_combined.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:dnn_linear_combined",
-    ],
-)
-
-py_test(
-    name = "dnn_linear_combined_test",
-    size = "medium",
-    srcs = ["python/estimator/dnn_linear_combined_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",
-    ],
-    deps = [
-        ":dnn_linear_combined",
-        ":head",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:dnn_testing_utils",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:linear_testing_utils",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:prediction_keys",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "extenders",
     srcs = [
@@ -205,6 +66,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:model_fn",
@@ -213,23 +75,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "extenders_test",
-    size = "medium",
-    srcs = ["python/estimator/extenders_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/62863147
-    deps = [
-        ":extenders",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/predictor",
-        "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/estimator:linear",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_library(
     name = "export",
     srcs = [
@@ -237,22 +82,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python/estimator:model_fn",
-    ],
-)
-
-py_test(
-    name = "export_test",
-    size = "medium",
-    srcs = ["python/estimator/export_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/62863147
-    deps = [
-        ":export",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:export_output",
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow/python/estimator:model_fn",
     ],
 )
@@ -264,24 +94,12 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:exporter",
     ],
 )
 
-py_test(
-    name = "exporter_test",
-    size = "medium",
-    srcs = ["python/estimator/exporter_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":exporter",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:exporter",
-    ],
-)
-
 py_library(
     name = "head",
     srcs = [
@@ -289,6 +107,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:head",
@@ -298,22 +117,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "head_test",
-    size = "medium",
-    srcs = ["python/estimator/head_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":head",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:metric_keys",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/estimator:prediction_keys",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "hooks",
     srcs = [
@@ -321,58 +124,12 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
-py_test(
-    name = "hooks_test",
-    size = "medium",
-    srcs = ["python/estimator/hooks_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
-        ":hooks",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:estimator_py",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "linear",
-    srcs = ["python/estimator/linear.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:linear",
-    ],
-)
-
-py_test(
-    name = "linear_test",
-    size = "medium",
-    srcs = ["python/estimator/linear_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",
-    ],
-    deps = [
-        ":head",
-        ":linear",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:linear_testing_utils",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:prediction_keys",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "logit_fns",
     srcs = [
@@ -380,24 +137,13 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:dnn",
         "//tensorflow/python/estimator:linear",
     ],
 )
 
-py_test(
-    name = "logit_fns_test",
-    size = "small",
-    srcs = ["python/estimator/logit_fns_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":logit_fns",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:model_fn",
-    ],
-)
-
 py_library(
     name = "multi_head",
     srcs = [
@@ -405,6 +151,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:head",
@@ -414,23 +161,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "multi_head_test",
-    size = "small",
-    srcs = ["python/estimator/multi_head_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":head",
-        ":multi_head",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:metric_keys",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/estimator:prediction_keys",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "replicate_model_fn",
     srcs = [
@@ -438,6 +168,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:model_fn",
@@ -446,35 +177,12 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "replicate_model_fn_test",
-    size = "medium",
-    srcs = ["python/estimator/replicate_model_fn_test.py"],
-    additional_deps = [
-        "@absl_py//absl/testing:parameterized",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:dnn",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:export_output",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:optimizers",
-        "//tensorflow/python/estimator:prediction_keys",
-        ":replicate_model_fn",
-    ],
-    tags = [
-        "manual",
-        "multi_gpu",
-        "notap",
-    ],
-)
-
 py_library(
     name = "rnn",
     srcs = ["python/estimator/rnn.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":extenders",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/contrib/feature_column:feature_column_py",
@@ -485,55 +193,22 @@ py_library(
     ],
 )
 
-py_test(
-    name = "rnn_test",
-    size = "medium",
-    srcs = ["python/estimator/rnn_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "noasan",  # times out
-        "notsan",
-        "optonly",  # times out http://b/79220679
-    ],
-    deps = [
-        ":head",
-        ":rnn",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/contrib/data",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:parsing_utils",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "early_stopping",
     srcs = ["python/estimator/early_stopping.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
     ],
 )
 
-py_test(
-    name = "early_stopping_test",
-    srcs = ["python/estimator/early_stopping_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":early_stopping",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
-
 py_library(
     name = "saved_model_estimator",
     srcs = ["python/estimator/saved_model_estimator.py"],
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":export",
         "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
@@ -542,21 +217,9 @@ py_library(
     ],
 )
 
-py_test(
-    name = "saved_model_estimator_test",
-    size = "medium",
-    srcs = ["python/estimator/saved_model_estimator_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "notsan",
-    ],
-    deps = [
-        ":export",
-        ":saved_model_estimator",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:export_export",
-        "//tensorflow/python/estimator:export_output",
-        "//tensorflow/python/estimator:model_fn",
-    ],
+py_library(
+    name = "expect_tensorflow_estimator_installed",
+    # This is a dummy rule used as a dependency in open-source.
+    # We expect tensorflow_estimator to already be installed.
+    visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index 419609b1af7b19dc9cf2960e96e71d54d8eb0c9b..7d61247e7ef26d3777843cd3be20684583e9058c 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,33 +12,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Experimental utilities re:tf.estimator.*."""
+"""estimator python module.
+
+Importing from tensorflow.python.estimator
+is unsupported and will soon break!
+"""
+
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.contrib.estimator.python.estimator.baseline import *
-from tensorflow.contrib.estimator.python.estimator.boosted_trees import *
-from tensorflow.contrib.estimator.python.estimator.dnn import *
-from tensorflow.contrib.estimator.python.estimator.dnn_with_layer_annotations import *
-from tensorflow.contrib.estimator.python.estimator.dnn_linear_combined import *
-from tensorflow.contrib.estimator.python.estimator.early_stopping import *
-from tensorflow.contrib.estimator.python.estimator.export import *
-from tensorflow.contrib.estimator.python.estimator.extenders import *
-from tensorflow.contrib.estimator.python.estimator.head import *
-from tensorflow.contrib.estimator.python.estimator.hooks import *
-from tensorflow.contrib.estimator.python.estimator.linear import *
-from tensorflow.contrib.estimator.python.estimator.logit_fns import *
-from tensorflow.contrib.estimator.python.estimator.multi_head import *
-from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
-from tensorflow.contrib.estimator.python.estimator.rnn import *
-from tensorflow.contrib.estimator.python.estimator.saved_model_estimator import *
-from tensorflow.python.estimator.export.export import *
+# Importing from tensorflow.python.estimator
+# is unsupported and will soon break!
+
+from tensorflow_estimator.contrib import estimator
+
+# Fixes remove_undocumented not working as intended.
+#
+# Problem is that when the below import happens (for first time,
+# Python only imports things once), Python sets attribute named
+# 'python' to this package. If this first import happens
+# after the call to remove_undocumented, then the 'python'
+# attribute won't be removed.
+import tensorflow.contrib.estimator.python
+
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+estimator.__all__ = [s for s in dir(estimator) if not s.startswith('__')]
 
+from tensorflow_estimator.contrib.estimator import *
 from tensorflow.python.util.all_util import remove_undocumented
-# pylint: enable=unused-import,line-too-long,wildcard-import
 
 _allowed_symbols = [
     'add_metrics',
@@ -53,10 +58,6 @@ _allowed_symbols = [
     'multi_label_head',
     'poisson_regression_head',
     'regression_head',
-    'BaselineEstimator',
-    'DNNEstimator',
-    'DNNLinearCombinedEstimator',
-    'LinearEstimator',
     'boosted_trees_classifier_train_in_memory',
     'boosted_trees_regressor_train_in_memory',
     'call_logit_fn',
diff --git a/tensorflow/contrib/estimator/python/estimator/baseline.py b/tensorflow/contrib/estimator/python/estimator/baseline.py
deleted file mode 100644
index beffbee73064b9ef425b115317c43e29477b19af..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/baseline.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Baseline estimators."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import baseline
-
-
-class BaselineEstimator(estimator.Estimator):
-  """An estimator that can establish a simple baseline.
-
-  The estimator uses a user-specified head.
-
-  This estimator ignores feature values and will learn to predict the average
-  value of each label. E.g. for single-label classification problems, this will
-  predict the probability distribution of the classes as seen in the labels.
-  For multi-label classification problems, it will predict the ratio of examples
-  that contain each class.
-
-  Example:
-
-  ```python
-
-  # Build baseline multi-label classifier.
-  estimator = BaselineEstimator(
-      head=tf.contrib.estimator.multi_label_head(n_classes=3))
-
-  # Input builders
-  def input_fn_train: # returns x, y (where y represents label's class index).
-    pass
-
-  def input_fn_eval: # returns x, y (where y represents label's class index).
-    pass
-
-  # Fit model.
-  estimator.train(input_fn=input_fn_train)
-
-  # Evaluates cross entropy between the test and train labels.
-  loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
-
-  # For each class, predicts the ratio of training examples that contain the
-  # class.
-  predictions = classifier.predict(new_samples)
-
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-    otherwise there will be a `KeyError`:
-
-  * if `weight_column` passed to the `head` constructor is not `None`, a feature
-    with `key=weight_column` whose value is a `Tensor`.
-  """
-
-  def __init__(self,
-               head,
-               model_dir=None,
-               optimizer='Ftrl',
-               config=None):
-    """Initializes a BaselineEstimator instance.
-
-    Args:
-      head: A `_Head` instance constructed with a method such as
-        `tf.contrib.estimator.multi_label_head`.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model.
-      optimizer: String, `tf.Optimizer` object, or callable that creates the
-        optimizer to use for training. If not specified, will use
-        `FtrlOptimizer` with a default learning rate of 0.3.
-      config: `RunConfig` object to configure the runtime settings.
-    """
-    def _model_fn(features, labels, mode, config):
-      return baseline._baseline_model_fn(  # pylint: disable=protected-access
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          optimizer=optimizer,
-          config=config)
-    super(BaselineEstimator, self).__init__(
-        model_fn=_model_fn,
-        model_dir=model_dir,
-        config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/baseline_test.py b/tensorflow/contrib/estimator/python/estimator/baseline_test.py
deleted file mode 100644
index 513feb03b6fb7b0806d2a5fb560b1e3394d4094c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/baseline_test.py
+++ /dev/null
@@ -1,436 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for baseline.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.contrib.estimator.python.estimator import baseline
-from tensorflow.contrib.estimator.python.estimator import head as head_lib
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import optimizer
-from tensorflow.python.training import saver
-
-# Names of variables created by model.
-BIAS_NAME = 'baseline/bias'
-
-
-def assert_close(expected, actual, rtol=1e-04, name='assert_close'):
-  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
-    expected = ops.convert_to_tensor(expected, name='expected')
-    actual = ops.convert_to_tensor(actual, name='actual')
-    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
-    rtol = ops.convert_to_tensor(rtol, name='rtol')
-    return check_ops.assert_less(
-        rdiff,
-        rtol,
-        data=('Condition expected =~ actual did not hold element-wise:'
-              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
-              'rtol = ', rtol,),
-        name=scope)
-
-
-def save_variables_to_ckpt(model_dir):
-  init_all_op = [variables.global_variables_initializer()]
-  with tf_session.Session() as sess:
-    sess.run(init_all_op)
-    saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
-
-
-def _baseline_estimator_fn(
-    weight_column=None, label_dimension=1, *args, **kwargs):
-  """Returns a BaselineEstimator that uses regression_head."""
-  return baseline.BaselineEstimator(
-      head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension,
-          # Tests in core (from which this test inherits) test the sum loss.
-          loss_reduction=losses.Reduction.SUM),
-      *args, **kwargs)
-
-
-class BaselineEstimatorEvaluationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_evaluation_batch(self):
-    """Tests evaluation for batch_size==2."""
-    with ops.Graph().as_default():
-      variables.Variable([13.0], name=BIAS_NAME)
-      variables.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    baseline_estimator = _baseline_estimator_fn(model_dir=self._model_dir)
-    eval_metrics = baseline_estimator.evaluate(
-        input_fn=lambda: ({'age': ((1,), (1,))}, ((10.,), (10.,))), steps=1)
-
-    # Logit is bias = 13, while label is 10.
-    # Loss per example is 3**2 = 9.
-    # Training loss is the sum over batch = 9 + 9 = 18
-    # Average loss is the average over batch = 9
-    self.assertDictEqual({
-        metric_keys.MetricKeys.LOSS: 18.,
-        metric_keys.MetricKeys.LOSS_MEAN: 9.,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 13.,
-        metric_keys.MetricKeys.LABEL_MEAN: 10.,
-        ops.GraphKeys.GLOBAL_STEP: 100
-    }, eval_metrics)
-
-  def test_evaluation_weights(self):
-    """Tests evaluation with weights."""
-    with ops.Graph().as_default():
-      variables.Variable([13.0], name=BIAS_NAME)
-      variables.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    def _input_fn():
-      features = {'age': ((1,), (1,)), 'weights': ((1.,), (2.,))}
-      labels = ((10.,), (10.,))
-      return features, labels
-
-    baseline_estimator = _baseline_estimator_fn(
-        weight_column='weights',
-        model_dir=self._model_dir)
-    eval_metrics = baseline_estimator.evaluate(input_fn=_input_fn, steps=1)
-
-    # Logit is bias = 13, while label is 10.
-    # Loss per example is 3**2 = 9.
-    # Training loss is the weighted sum over batch = 9 + 2*9 = 27
-    # average loss is the weighted average = 9 + 2*9 / (1 + 2) = 9
-    self.assertDictEqual({
-        metric_keys.MetricKeys.LOSS: 27.,
-        metric_keys.MetricKeys.LOSS_MEAN: 9.,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 13.,
-        metric_keys.MetricKeys.LABEL_MEAN: 10.,
-        ops.GraphKeys.GLOBAL_STEP: 100
-    }, eval_metrics)
-
-  def test_evaluation_for_multi_dimensions(self):
-    label_dim = 2
-    with ops.Graph().as_default():
-      variables.Variable([46.0, 58.0], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    baseline_estimator = _baseline_estimator_fn(
-        label_dimension=label_dim,
-        model_dir=self._model_dir)
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'age': np.array([[2., 4., 5.]]),
-        },
-        y=np.array([[46., 58.]]),
-        batch_size=1,
-        num_epochs=None,
-        shuffle=False)
-    eval_metrics = baseline_estimator.evaluate(input_fn=input_fn, steps=1)
-
-    self.assertItemsEqual(
-        (metric_keys.MetricKeys.LOSS, metric_keys.MetricKeys.LOSS_MEAN,
-         metric_keys.MetricKeys.PREDICTION_MEAN,
-         metric_keys.MetricKeys.LABEL_MEAN, ops.GraphKeys.GLOBAL_STEP),
-        eval_metrics.keys())
-
-    # Logit is bias which is [46, 58]
-    self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS])
-
-
-class BaselineEstimatorPredictTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_1d(self):
-    """Tests predict when all variables are one-dimensional."""
-    with ops.Graph().as_default():
-      variables.Variable([.2], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    baseline_estimator = _baseline_estimator_fn(model_dir=self._model_dir)
-
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': np.array([[2.]])},
-        y=None,
-        batch_size=1,
-        num_epochs=1,
-        shuffle=False)
-    predictions = baseline_estimator.predict(input_fn=predict_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    # x * weight + bias = 2. * 10. + .2 = 20.2
-    self.assertAllClose([[.2]], predicted_scores)
-
-  def testMultiDim(self):
-    """Tests predict when all variables are multi-dimenstional."""
-    batch_size = 2
-    label_dimension = 3
-    with ops.Graph().as_default():
-      variables.Variable(  # shape=[label_dimension]
-          [.2, .4, .6], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    baseline_estimator = _baseline_estimator_fn(
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-
-    predict_input_fn = numpy_io.numpy_input_fn(
-        # x shape=[batch_size, x_dim]
-        x={'x': np.array([[1., 2., 3., 4.], [5., 6., 7., 8.]])},
-        y=None,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-    predictions = baseline_estimator.predict(input_fn=predict_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    # score = bias, shape=[batch_size, label_dimension]
-    self.assertAllClose([[0.2, 0.4, 0.6], [0.2, 0.4, 0.6]],
-                        predicted_scores)
-
-
-class BaselineEstimatorIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
-                          input_dimension, label_dimension, prediction_length):
-    feature_columns = [
-        feature_column_lib.numeric_column('x', shape=(input_dimension,))
-    ]
-    est = _baseline_estimator_fn(
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    # learn y = x
-    est.train(train_input_fn, steps=200)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array(
-        [x['predictions'] for x in est.predict(predict_input_fn)])
-    self.assertAllEqual((prediction_length, label_dimension), predictions.shape)
-
-    # EXPORT
-    feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self):
-    """Tests complete flow with numpy_input_fn."""
-    label_dimension = 2
-    input_dimension = label_dimension
-    batch_size = 10
-    prediction_length = batch_size
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=None,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        label_dimension=label_dimension,
-        prediction_length=prediction_length)
-
-
-class BaselineEstimatorTrainingTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _mock_optimizer(self, expected_loss=None):
-    expected_var_names = [
-        '%s:0' % BIAS_NAME
-    ]
-
-    def _minimize(loss, global_step=None, var_list=None):
-      trainable_vars = var_list or ops.get_collection(
-          ops.GraphKeys.TRAINABLE_VARIABLES)
-      self.assertItemsEqual(expected_var_names,
-                            [var.name for var in trainable_vars])
-
-      # Verify loss. We can't check the value directly, so we add an assert op.
-      self.assertEquals(0, loss.shape.ndims)
-      if expected_loss is None:
-        if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
-        return control_flow_ops.no_op()
-      assert_loss = assert_close(
-          math_ops.to_float(expected_loss, name='expected'),
-          loss,
-          name='assert_loss')
-      with ops.control_dependencies((assert_loss,)):
-        if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
-        return control_flow_ops.no_op()
-
-    mock_optimizer = test.mock.NonCallableMock(
-        spec=optimizer.Optimizer,
-        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
-    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
-
-    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
-    # So, return mock_optimizer itself for deepcopy.
-    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
-    return mock_optimizer
-
-  def _assert_checkpoint(self,
-                         label_dimension,
-                         expected_global_step,
-                         expected_bias=None):
-    shapes = {
-        name: shape
-        for (name, shape) in checkpoint_utils.list_variables(self._model_dir)
-    }
-
-    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(self._model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
-
-    self.assertEqual([label_dimension], shapes[BIAS_NAME])
-    if expected_bias is not None:
-      self.assertEqual(expected_bias,
-                       checkpoint_utils.load_variable(self._model_dir,
-                                                      BIAS_NAME))
-
-  def testFromScratch(self):
-    # Create BaselineRegressor.
-    label = 5.
-    age = 17
-    # loss = (logits - label)^2 = (0 - 5.)^2 = 25.
-    mock_optimizer = self._mock_optimizer(expected_loss=25.)
-    baseline_estimator = _baseline_estimator_fn(
-        model_dir=self._model_dir,
-        optimizer=mock_optimizer)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    baseline_estimator.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        label_dimension=1,
-        expected_global_step=num_steps,
-        expected_bias=[0.])
-
-  def testFromCheckpoint(self):
-    # Create initial checkpoint.
-    bias = 7.0
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables.Variable([bias], name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # logits = bias = 6.
-    # loss = (logits - label)^2 = (7 - 5)^2 = 4
-    mock_optimizer = self._mock_optimizer(expected_loss=4.)
-    baseline_estimator = _baseline_estimator_fn(
-        model_dir=self._model_dir,
-        optimizer=mock_optimizer)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    baseline_estimator.train(
-        input_fn=lambda: ({'age': ((17,),)}, ((5.,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        label_dimension=1,
-        expected_global_step=initial_global_step + num_steps,
-        expected_bias=[bias])
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index a1f1c5f3d7a25ad28c58e9c215b862b6d51f4cd8..4cb66883a50621297518e34bf2c70bbdee146733 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -12,414 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Boosted Trees estimators."""
+"""boosted_trees python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
-from tensorflow.python.estimator.canned import head as head_lib
-
-
-def _validate_input_fn_and_repeat_dataset(train_input_fn):
-  """Validates whether the input_fn is valid, and repeat() if tf.Dataset."""
-  def _input_fn():
-    result_input_fn = train_input_fn()
-    if isinstance(result_input_fn, dataset_ops.Dataset):
-      return result_input_fn.repeat()
-    return result_input_fn
-
-  return _input_fn
-
-
-def _is_classification_head(head):
-  """Infers if the head is a classification head."""
-  # Check using all classification heads defined in canned/head.py. However, it
-  # is not a complete list - it does not check for other classification heads
-  # not defined in the head library.
-  # pylint: disable=protected-access
-  return isinstance(head,
-                    (head_lib._BinaryLogisticHeadWithSigmoidCrossEntropyLoss,
-                     head_lib._MultiClassHeadWithSoftmaxCrossEntropyLoss))
-  # pylint: enable=protected-access
-
-
-class _BoostedTreesEstimator(canned_boosted_trees._BoostedTreesBase):  # pylint: disable=protected-access
-  """An Estimator for Tensorflow Boosted Trees models."""
-
-  def __init__(self,
-               feature_columns,
-               n_batches_per_layer,
-               head,
-               model_dir=None,
-               weight_column=None,
-               n_trees=100,
-               max_depth=6,
-               learning_rate=0.1,
-               l1_regularization=0.,
-               l2_regularization=0.,
-               tree_complexity=0.,
-               min_node_weight=0.,
-               config=None,
-               center_bias=False,
-               pruning_mode='none'):
-    """Initializes a `BoostedTreesEstimator` instance.
-
-    Args:
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `FeatureColumn`.
-      n_batches_per_layer: the number of batches to collect statistics per
-        layer.
-      head: the `Head` instance defined for Estimator.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
-        to continue training a previously saved model.
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to downweight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      n_trees: number trees to be created.
-      max_depth: maximum depth of the tree to grow.
-      learning_rate: shrinkage parameter to be used when a tree added to the
-        model.
-      l1_regularization: regularization multiplier applied to the absolute
-        weights of the tree leafs.
-      l2_regularization: regularization multiplier applied to the square weights
-        of the tree leafs.
-      tree_complexity: regularization factor to penalize trees with more leaves.
-      min_node_weight: minimum hessian a node must have for a split to be
-        considered. The value will be compared with sum(leaf_hessian)/
-        (batch_size * n_batches_per_layer).
-      config: `RunConfig` object to configure the runtime settings.
-      center_bias: Whether bias centering needs to occur. Bias centering refers
-        to the first node in the very first tree returning the prediction that
-        is aligned with the original labels distribution. For example, for
-        regression problems, the first node will return the mean of the labels.
-        For binary classification problems, it will return a logit for a prior
-        probability of label 1.
-      pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
-        pruning (do not split a node if not enough gain is observed) and post
-        pruning (build the tree up to a max depth and then prune branches with
-        negative gain). For pre and post pruning, you MUST provide
-        tree_complexity >0.
-
-    Raises:
-      ValueError: when wrong arguments are given or unsupported functionalities
-         are requested.
-    """
-    # HParams for the model.
-    # pylint: disable=protected-access
-    tree_hparams = canned_boosted_trees._TreeHParams(
-        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity, min_node_weight, center_bias, pruning_mode)
-
-    def _model_fn(features, labels, mode, config):
-      return canned_boosted_trees._bt_model_fn(
-          features,
-          labels,
-          mode,
-          head,
-          feature_columns,
-          tree_hparams,
-          n_batches_per_layer,
-          config=config)
-
-    super(_BoostedTreesEstimator, self).__init__(
-        model_fn=_model_fn,
-        model_dir=model_dir,
-        config=config,
-        feature_columns=feature_columns,
-        head=head,
-        center_bias=center_bias,
-        is_classification=_is_classification_head(head))
-    # pylint: enable=protected-access
-
-
-def boosted_trees_classifier_train_in_memory(
-    train_input_fn,
-    feature_columns,
-    model_dir=None,
-    n_classes=canned_boosted_trees._HOLD_FOR_MULTI_CLASS_SUPPORT,
-    weight_column=None,
-    label_vocabulary=None,
-    n_trees=100,
-    max_depth=6,
-    learning_rate=0.1,
-    l1_regularization=0.,
-    l2_regularization=0.,
-    tree_complexity=0.,
-    min_node_weight=0.,
-    config=None,
-    train_hooks=None,
-    center_bias=False,
-    pruning_mode='none'):
-  """Trains a boosted tree classifier with in memory dataset.
-
-  Example:
-
-  ```python
-  bucketized_feature_1 = bucketized_column(
-    numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
-  bucketized_feature_2 = bucketized_column(
-    numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
-
-  def train_input_fn():
-    dataset = create-dataset-from-training-data
-    # This is tf.data.Dataset of a tuple of feature dict and label.
-    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
-    #                     Dataset.from_tensors(label_array)))
-    # The returned Dataset shouldn't be batched.
-    # If Dataset repeats, only the first repetition would be used for training.
-    return dataset
-
-  classifier = boosted_trees_classifier_train_in_memory(
-      train_input_fn,
-      feature_columns=[bucketized_feature_1, bucketized_feature_2],
-      n_trees=100,
-      ... <some other params>
-  )
-
-  def input_fn_eval():
-    ...
-    return dataset
-
-  metrics = classifier.evaluate(input_fn=input_fn_eval, steps=10)
-  ```
-
-  Args:
-    train_input_fn: the input function returns a dataset containing a single
-      epoch of *unbatched* features and labels.
-    feature_columns: An iterable containing all the feature columns used by
-      the model. All items in the set should be instances of classes derived
-      from `FeatureColumn`.
-    model_dir: Directory to save model parameters, graph and etc. This can
-      also be used to load checkpoints from the directory into a estimator
-      to continue training a previously saved model.
-    n_classes: number of label classes. Default is binary classification.
-      Multiclass support is not yet implemented.
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to downweight or boost examples during training. It
-      will be multiplied by the loss of the example. If it is a string, it is
-      used as a key to fetch weight tensor from the `features`. If it is a
-      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-      then weight_column.normalizer_fn is applied on it to get weight tensor.
-    label_vocabulary: A list of strings represents possible label values. If
-      given, labels must be string type and have any value in
-      `label_vocabulary`. If it is not given, that means labels are
-      already encoded as integer or float within [0, 1] for `n_classes=2` and
-      encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
-      Also there will be errors if vocabulary is not provided and labels are
-      string.
-    n_trees: number trees to be created.
-    max_depth: maximum depth of the tree to grow.
-    learning_rate: shrinkage parameter to be used when a tree added to the
-      model.
-    l1_regularization: regularization multiplier applied to the absolute
-      weights of the tree leafs.
-    l2_regularization: regularization multiplier applied to the square weights
-      of the tree leafs.
-    tree_complexity: regularization factor to penalize trees with more leaves.
-    min_node_weight: minimum hessian a node must have for a split to be
-        considered. The value will be compared with sum(leaf_hessian)/
-        (batch_size * n_batches_per_layer).
-    config: `RunConfig` object to configure the runtime settings.
-    train_hooks: a list of Hook instances to be passed to estimator.train()
-    center_bias: Whether bias centering needs to occur. Bias centering refers
-        to the first node in the very first tree returning the prediction that
-        is aligned with the original labels distribution. For example, for
-        regression problems, the first node will return the mean of the labels.
-        For binary classification problems, it will return a logit for a prior
-        probability of label 1.
-    pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
-        pruning (do not split a node if not enough gain is observed) and post
-        pruning (build the tree up to a max depth and then prune branches with
-        negative gain). For pre and post pruning, you MUST provide
-        tree_complexity >0.
-
-  Returns:
-    a `BoostedTreesClassifier` instance created with the given arguments and
-      trained with the data loaded up on memory from the input_fn.
-
-  Raises:
-    ValueError: when wrong arguments are given or unsupported functionalities
-       are requested.
-  """
-  # pylint: disable=protected-access
-  # TODO(nponomareva): Support multi-class cases.
-  if n_classes == canned_boosted_trees._HOLD_FOR_MULTI_CLASS_SUPPORT:
-    n_classes = 2
-  head, closed_form = (
-      canned_boosted_trees._create_classification_head_and_closed_form(
-          n_classes, weight_column, label_vocabulary=label_vocabulary))
-
-  # HParams for the model.
-  tree_hparams = canned_boosted_trees._TreeHParams(
-      n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity, min_node_weight, center_bias, pruning_mode)
-
-  def _model_fn(features, labels, mode, config):
-    return canned_boosted_trees._bt_model_fn(
-        features,
-        labels,
-        mode,
-        head,
-        feature_columns,
-        tree_hparams,
-        n_batches_per_layer=1,
-        config=config,
-        closed_form_grad_and_hess_fn=closed_form,
-        train_in_memory=True)
-
-  in_memory_classifier = estimator.Estimator(
-      model_fn=_model_fn, model_dir=model_dir, config=config)
-
-  in_memory_classifier.train(
-      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
-      hooks=train_hooks)
-
-  return in_memory_classifier
-  # pylint: enable=protected-access
-
-
-def boosted_trees_regressor_train_in_memory(
-    train_input_fn,
-    feature_columns,
-    model_dir=None,
-    label_dimension=canned_boosted_trees._HOLD_FOR_MULTI_DIM_SUPPORT,
-    weight_column=None,
-    n_trees=100,
-    max_depth=6,
-    learning_rate=0.1,
-    l1_regularization=0.,
-    l2_regularization=0.,
-    tree_complexity=0.,
-    min_node_weight=0.,
-    config=None,
-    train_hooks=None,
-    center_bias=False,
-    pruning_mode='none'):
-  """Trains a boosted tree regressor with in memory dataset.
-
-  Example:
-
-  ```python
-  bucketized_feature_1 = bucketized_column(
-    numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
-  bucketized_feature_2 = bucketized_column(
-    numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
-
-  def train_input_fn():
-    dataset = create-dataset-from-training-data
-    # This is tf.data.Dataset of a tuple of feature dict and label.
-    #   e.g. Dataset.zip((Dataset.from_tensors({'f1': f1_array, ...}),
-    #                     Dataset.from_tensors(label_array)))
-    # The returned Dataset shouldn't be batched.
-    # If Dataset repeats, only the first repetition would be used for training.
-    return dataset
-
-  regressor = boosted_trees_regressor_train_in_memory(
-      train_input_fn,
-      feature_columns=[bucketized_feature_1, bucketized_feature_2],
-      n_trees=100,
-      ... <some other params>
-  )
-
-  def input_fn_eval():
-    ...
-    return dataset
-
-  metrics = regressor.evaluate(input_fn=input_fn_eval, steps=10)
-  ```
-
-  Args:
-    train_input_fn: the input function returns a dataset containing a single
-      epoch of *unbatched* features and labels.
-    feature_columns: An iterable containing all the feature columns used by
-      the model. All items in the set should be instances of classes derived
-      from `FeatureColumn`.
-    model_dir: Directory to save model parameters, graph and etc. This can
-      also be used to load checkpoints from the directory into a estimator
-      to continue training a previously saved model.
-    label_dimension: Number of regression targets per example.
-      Multi-dimensional support is not yet implemented.
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to downweight or boost examples during training. It
-      will be multiplied by the loss of the example. If it is a string, it is
-      used as a key to fetch weight tensor from the `features`. If it is a
-      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-      then weight_column.normalizer_fn is applied on it to get weight tensor.
-    n_trees: number trees to be created.
-    max_depth: maximum depth of the tree to grow.
-    learning_rate: shrinkage parameter to be used when a tree added to the
-      model.
-    l1_regularization: regularization multiplier applied to the absolute
-      weights of the tree leafs.
-    l2_regularization: regularization multiplier applied to the square weights
-      of the tree leafs.
-    tree_complexity: regularization factor to penalize trees with more leaves.
-    min_node_weight: minimum hessian a node must have for a split to be
-        considered. The value will be compared with sum(leaf_hessian)/
-        (batch_size * n_batches_per_layer).
-    config: `RunConfig` object to configure the runtime settings.
-    train_hooks: a list of Hook instances to be passed to estimator.train().
-    center_bias: Whether bias centering needs to occur. Bias centering refers
-        to the first node in the very first tree returning the prediction that
-        is aligned with the original labels distribution. For example, for
-        regression problems, the first node will return the mean of the labels.
-        For binary classification problems, it will return a logit for a prior
-        probability of label 1.
-    pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
-        pruning (do not split a node if not enough gain is observed) and post
-        pruning (build the tree up to a max depth and then prune branches with
-        negative gain). For pre and post pruning, you MUST provide
-        tree_complexity >0.
-
-  Returns:
-    a `BoostedTreesClassifier` instance created with the given arguments and
-      trained with the data loaded up on memory from the input_fn.
-
-  Raises:
-    ValueError: when wrong arguments are given or unsupported functionalities
-       are requested.
-  """
-  # pylint: disable=protected-access
-  # TODO(nponomareva): Extend it to multi-dimension cases.
-  if label_dimension == canned_boosted_trees._HOLD_FOR_MULTI_DIM_SUPPORT:
-    label_dimension = 1
-  head = canned_boosted_trees._create_regression_head(label_dimension,
-                                                      weight_column)
-
-  # HParams for the model.
-  tree_hparams = canned_boosted_trees._TreeHParams(
-      n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity, min_node_weight, center_bias, pruning_mode)
-
-  def _model_fn(features, labels, mode, config):
-    return canned_boosted_trees._bt_model_fn(
-        features,
-        labels,
-        mode,
-        head,
-        feature_columns,
-        tree_hparams,
-        n_batches_per_layer=1,
-        config=config,
-        train_in_memory=True)
-
-  in_memory_regressor = estimator.Estimator(
-      model_fn=_model_fn, model_dir=model_dir, config=config)
+from tensorflow_estimator.contrib.estimator.python.estimator import boosted_trees
 
-  in_memory_regressor.train(
-      input_fn=_validate_input_fn_and_repeat_dataset(train_input_fn),
-      hooks=train_hooks)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+boosted_trees.__all__ = [
+    s for s in dir(boosted_trees) if not s.startswith('__')
+]
 
-  return in_memory_regressor
-  # pylint: enable=protected-access
+from tensorflow_estimator.contrib.estimator.python.estimator.boosted_trees import *
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
deleted file mode 100644
index e23d9c0fc4c32ce0ce23dcf4be518577795dd35f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ /dev/null
@@ -1,438 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests boosted_trees estimators."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.estimator.python.estimator import boosted_trees
-from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import googletest
-from tensorflow.python.training import checkpoint_utils
-
-NUM_FEATURES = 3
-
-BUCKET_BOUNDARIES = [-2., .5, 12.]  # Boundaries for all the features.
-INPUT_FEATURES = np.array(
-    [
-        [12.5, 1.0, -2.001, -2.0001, -1.999],  # feature_0 quantized:[3,2,0,0,1]
-        [2.0, -3.0, 0.5, 0.0, 0.4995],         # feature_1 quantized:[2,0,2,1,1]
-        [3.0, 20.0, 50.0, -100.0, 102.75],     # feature_2 quantized:[2,3,3,0,3]
-    ],
-    dtype=np.float32)
-CLASSIFICATION_LABELS = [[0.], [1.], [1.], [0.], [0.]]
-REGRESSION_LABELS = [[1.5], [0.3], [0.2], [2.], [5.]]
-FEATURES_DICT = {'f_%d' % i: INPUT_FEATURES[i] for i in range(NUM_FEATURES)}
-
-
-def _make_train_input_fn(is_classification):
-  """Makes train input_fn for classification/regression."""
-
-  def _input_fn():
-    features_dict = dict(FEATURES_DICT)
-    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
-    return features_dict, labels
-
-  return _input_fn
-
-
-def _make_train_input_fn_dataset(is_classification):
-  """Makes input_fn using Dataset."""
-
-  def _input_fn():
-    features_dict = dict(FEATURES_DICT)
-    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
-    ds = dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(features_dict),
-         dataset_ops.Dataset.from_tensors(labels)
-        ))
-    return ds
-
-  return _input_fn
-
-
-class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._head = canned_boosted_trees._create_regression_head(label_dimension=1)
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
-                         attempted_layers):
-    reader = checkpoint_utils.load_checkpoint(model_dir)
-    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
-    serialized = reader.get_tensor('boosted_trees:0_serialized')
-    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-    ensemble_proto.ParseFromString(serialized)
-    self.assertEqual(
-        finalized_trees,
-        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
-    self.assertEqual(attempted_layers,
-                     ensemble_proto.growing_metadata.num_layers_attempted)
-
-  def testTrainAndEvaluateEstimator(self):
-    input_fn = _make_train_input_fn(is_classification=False)
-
-    est = boosted_trees._BoostedTreesEstimator(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=2,
-        head=self._head,
-        max_depth=5)
-
-    # It will stop after 10 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 1.008551)
-
-  def testTrainAndEvaluateEstimatorWithCenterBias(self):
-    input_fn = _make_train_input_fn(is_classification=False)
-
-    est = boosted_trees._BoostedTreesEstimator(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=2,
-        head=self._head,
-        max_depth=5,
-        center_bias=True)
-
-    # It will stop after 11 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(input_fn, steps=num_steps)
-    # 10 steps for training and 2 step for bias centering.
-    self._assert_checkpoint(
-        est.model_dir, global_step=12, finalized_trees=2, attempted_layers=10)
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 0.614642)
-
-  def testTrainAndEvaluateEstimatorWithPrePruning(self):
-    input_fn = _make_train_input_fn(is_classification=False)
-
-    est = boosted_trees._BoostedTreesEstimator(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=2,
-        head=self._head,
-        max_depth=5,
-        tree_complexity=0.001,
-        pruning_mode='pre')
-
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(input_fn, steps=num_steps)
-    # We stop actually after 2*depth*n_trees steps (via a hook) because we still
-    # could not grow 2 trees of depth 5 (due to pre-pruning).
-    self._assert_checkpoint(
-        est.model_dir, global_step=21, finalized_trees=0, attempted_layers=21)
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 3.83943)
-
-  def testTrainAndEvaluateEstimatorWithPostPruning(self):
-    input_fn = _make_train_input_fn(is_classification=False)
-
-    est = boosted_trees._BoostedTreesEstimator(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=2,
-        head=self._head,
-        max_depth=5,
-        tree_complexity=0.001,
-        pruning_mode='post')
-
-    # It will stop after 10 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.37652)
-
-  def testInferEstimator(self):
-    train_input_fn = _make_train_input_fn(is_classification=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees._BoostedTreesEstimator(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5,
-        head=self._head)
-
-    # It will stop after 5 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    # Validate predictions.
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose(
-        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
-        [pred['predictions'] for pred in predictions])
-
-  def testInferEstimatorWithCenterBias(self):
-    train_input_fn = _make_train_input_fn(is_classification=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees._BoostedTreesEstimator(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5,
-        center_bias=True,
-        head=self._head)
-
-    # It will stop after 6 steps because of the max depth and num trees (5 for
-    # training and 2 for bias centering).
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=7, finalized_trees=1, attempted_layers=5)
-    # Validate predictions.
-    predictions = list(est.predict(input_fn=predict_input_fn))
-
-    self.assertAllClose(
-        [[1.634501], [1.325703], [1.187431], [2.019683], [2.832683]],
-        [pred['predictions'] for pred in predictions])
-
-  def testBinaryClassifierTrainInMemoryAndEvalAndInfer(self):
-    train_input_fn = _make_train_input_fn(is_classification=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.boosted_trees_classifier_train_in_memory(
-        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
-        n_trees=1, max_depth=5)
-    # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-
-    # Check evaluate and predict.
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-    # Validate predictions.
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose([[0], [1], [1], [0], [0]],
-                        [pred['class_ids'] for pred in predictions])
-
-  def testBinaryClassifierTrainInMemoryAndEvalAndInferWithCenterBias(self):
-    train_input_fn = _make_train_input_fn(is_classification=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.boosted_trees_classifier_train_in_memory(
-        train_input_fn=train_input_fn,
-        feature_columns=self._feature_columns,
-        n_trees=1,
-        max_depth=5,
-        center_bias=True)
-    # It will stop after 5 steps + 3 for bias, because of the max depth and num
-    # trees.
-    self._assert_checkpoint(
-        est.model_dir, global_step=8, finalized_trees=1, attempted_layers=5)
-
-    # Check evaluate and predict.
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-    # Validate predictions.
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose([[0], [1], [1], [0], [0]],
-                        [pred['class_ids'] for pred in predictions])
-
-  def testBinaryClassifierTrainInMemoryAndEvalAndInferWithPrePruning(self):
-    train_input_fn = _make_train_input_fn(is_classification=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.boosted_trees_classifier_train_in_memory(
-        train_input_fn=train_input_fn,
-        feature_columns=self._feature_columns,
-        n_trees=1,
-        max_depth=5,
-        pruning_mode='pre',
-        tree_complexity=0.01)
-    # We stop actually after 2*depth*n_trees steps (via a hook) because we still
-    # could not grow 1 trees of depth 5 (due to pre-pruning).
-    self._assert_checkpoint(
-        est.model_dir, global_step=11, finalized_trees=0, attempted_layers=11)
-
-    # Check evaluate and predict.
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-    # Validate predictions.
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose([[0], [1], [1], [0], [0]],
-                        [pred['class_ids'] for pred in predictions])
-
-  def testBinaryClassifierTrainInMemoryWithDataset(self):
-    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.boosted_trees_classifier_train_in_memory(
-        train_input_fn=train_input_fn,
-        feature_columns=self._feature_columns,
-        n_trees=1,
-        max_depth=5)
-    # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-
-    # Check evaluate and predict.
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose([[0], [1], [1], [0], [0]],
-                        [pred['class_ids'] for pred in predictions])
-
-  def testRegressorTrainInMemoryAndEvalAndInfer(self):
-    train_input_fn = _make_train_input_fn(is_classification=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.boosted_trees_regressor_train_in_memory(
-        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
-        n_trees=1, max_depth=5)
-    # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-
-    # Check evaluate and predict.
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.478283)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose(
-        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
-        [pred['predictions'] for pred in predictions])
-
-  def testRegressorTrainInMemoryWithDataset(self):
-    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.boosted_trees_regressor_train_in_memory(
-        train_input_fn=train_input_fn, feature_columns=self._feature_columns,
-        n_trees=1, max_depth=5)
-    # It will stop after 5 steps because of the max depth and num trees.
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    # Check evaluate and predict.
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.478283)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose(
-        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
-        [pred['predictions'] for pred in predictions])
-
-
-class BoostedTreesDebugOutputTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._head = canned_boosted_trees._create_regression_head(label_dimension=1)
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
-    }
-
-  def testContribEstimatorThatDFCIsInPredictions(self):
-    # pylint:disable=protected-access
-    head = canned_boosted_trees._create_regression_head(label_dimension=1)
-    train_input_fn = _make_train_input_fn(is_classification=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees._BoostedTreesEstimator(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        head=head,
-        n_trees=1,
-        max_depth=5,
-        center_bias=True)
-    # pylint:enable=protected-access
-
-    num_steps = 100
-    # Train for a few steps. Validate debug outputs in prediction dicts.
-    est.train(train_input_fn, steps=num_steps)
-    debug_predictions = est.experimental_predict_with_explanations(
-        predict_input_fn)
-    biases, dfcs = zip(*[(pred['bias'], pred['dfc'])
-                         for pred in debug_predictions])
-    self.assertAllClose([1.8] * 5, biases)
-    self.assertAllClose(({
-        0: -0.070499420166015625,
-        1: -0.095000028610229492,
-        2: 0.0
-    }, {
-        0: -0.53763031959533691,
-        1: 0.063333392143249512,
-        2: 0.0
-    }, {
-        0: -0.51756942272186279,
-        1: -0.095000028610229492,
-        2: 0.0
-    }, {
-        0: 0.1563495397567749,
-        1: 0.063333392143249512,
-        2: 0.0
-    }, {
-        0: 0.96934974193572998,
-        1: 0.063333392143249512,
-        2: 0.0
-    }), dfcs)
-
-    # Assert sum(dfcs) + bias == predictions.
-    expected_predictions = [[1.6345005], [1.32570302], [1.1874305],
-                            [2.01968288], [2.83268309]]
-    predictions = [
-        [sum(dfc.values()) + bias] for (dfc, bias) in zip(dfcs, biases)
-    ]
-    self.assertAllClose(expected_predictions, predictions)
-
-    # Test when user doesn't include bias or dfc in predict_keys.
-    debug_predictions = est.experimental_predict_with_explanations(
-        predict_input_fn, predict_keys=['predictions'])
-    for prediction_dict in debug_predictions:
-      self.assertTrue('bias' in prediction_dict)
-      self.assertTrue('dfc' in prediction_dict)
-      self.assertTrue('predictions' in prediction_dict)
-      self.assertEqual(len(prediction_dict), 3)
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn.py b/tensorflow/contrib/estimator/python/estimator/dnn.py
deleted file mode 100644
index 9efa8f474d865a36788cba40a15404bf0b30a17e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/dnn.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Deep Neural Network estimators."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import dnn as dnn_lib
-from tensorflow.python.ops import nn
-
-
-class DNNEstimator(estimator.Estimator):
-  """An estimator for TensorFlow DNN models with user-specified head.
-
-  Example:
-
-  ```python
-  sparse_feature_a = sparse_column_with_hash_bucket(...)
-  sparse_feature_b = sparse_column_with_hash_bucket(...)
-
-  sparse_feature_a_emb = embedding_column(sparse_id_column=sparse_feature_a,
-                                          ...)
-  sparse_feature_b_emb = embedding_column(sparse_id_column=sparse_feature_b,
-                                          ...)
-
-  estimator = DNNEstimator(
-      head=tf.contrib.estimator.multi_label_head(n_classes=3),
-      feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
-      hidden_units=[1024, 512, 256])
-
-  # Or estimator using the ProximalAdagradOptimizer optimizer with
-  # regularization.
-  estimator = DNNEstimator(
-      head=tf.contrib.estimator.multi_label_head(n_classes=3),
-      feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
-      hidden_units=[1024, 512, 256],
-      optimizer=tf.train.ProximalAdagradOptimizer(
-        learning_rate=0.1,
-        l1_regularization_strength=0.001
-      ))
-
-  # Or estimator using an optimizer with a learning rate decay.
-  estimator = DNNEstimator(
-      head=tf.contrib.estimator.multi_label_head(n_classes=3),
-      feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
-      hidden_units=[1024, 512, 256],
-      optimizer=lambda: tf.AdamOptimizer(
-          learning_rate=tf.exponential_decay(
-              learning_rate=0.1,
-              global_step=tf.get_global_step(),
-              decay_steps=10000,
-              decay_rate=0.96))
-
-  # Or estimator with warm-starting from a previous checkpoint.
-  estimator = DNNEstimator(
-      head=tf.contrib.estimator.multi_label_head(n_classes=3),
-      feature_columns=[sparse_feature_a_emb, sparse_feature_b_emb],
-      hidden_units=[1024, 512, 256],
-      warm_start_from="/path/to/checkpoint/dir")
-
-  # Input builders
-  def input_fn_train: # returns x, y
-    pass
-  estimator.train(input_fn=input_fn_train, steps=100)
-
-  def input_fn_eval: # returns x, y
-    pass
-  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
-  def input_fn_predict: # returns x, None
-    pass
-  predictions = estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-  otherwise there will be a `KeyError`:
-
-  * if `weight_column` is not `None`, a feature with
-    `key=weight_column` whose value is a `Tensor`.
-  * for each `column` in `feature_columns`:
-    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
-      with `key` the id column name, the second with `key` the weight column
-      name. Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss and predicted output are determined by the specified head.
-  """
-
-  def __init__(self,
-               head,
-               hidden_units,
-               feature_columns,
-               model_dir=None,
-               optimizer='Adagrad',
-               activation_fn=nn.relu,
-               dropout=None,
-               input_layer_partitioner=None,
-               config=None,
-               warm_start_from=None,
-               batch_norm=False):
-    """Initializes a `DNNEstimator` instance.
-
-    Args:
-      head: A `_Head` instance constructed with a method such as
-        `tf.contrib.estimator.multi_label_head`.
-      hidden_units: Iterable of number hidden units per layer. All layers are
-        fully connected. Ex. `[64, 32]` means first layer has 64 nodes and
-        second one has 32.
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `_FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model.
-      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
-        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
-        callable. Defaults to Adagrad optimizer.
-      activation_fn: Activation function applied to each layer. If `None`, will
-        use `tf.nn.relu`.
-      dropout: When not `None`, the probability we will drop out a given
-        coordinate.
-      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
-        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-      config: `RunConfig` object to configure the runtime settings.
-      warm_start_from: A string filepath to a checkpoint to warm-start from, or
-        a `WarmStartSettings` object to fully configure warm-starting.  If the
-        string filepath is provided instead of a `WarmStartSettings`, then all
-        weights are warm-started, and it is assumed that vocabularies and Tensor
-        names are unchanged.
-      batch_norm: Whether to use batch normalization after each hidden layer.
-    """
-    def _model_fn(features, labels, mode, config):
-      return dnn_lib._dnn_model_fn(  # pylint: disable=protected-access
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          hidden_units=hidden_units,
-          feature_columns=tuple(feature_columns or []),
-          optimizer=optimizer,
-          activation_fn=activation_fn,
-          dropout=dropout,
-          input_layer_partitioner=input_layer_partitioner,
-          config=config,
-          batch_norm=batch_norm)
-    super(DNNEstimator, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config,
-        warm_start_from=warm_start_from)
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
deleted file mode 100644
index 724bc2c82f8289bbaa19a1dbbc1dc81b6e158e02..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TensorFlow estimator for Linear and DNN joined training models."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import dnn_linear_combined as dnn_linear_combined_lib
-from tensorflow.python.ops import nn
-
-
-class DNNLinearCombinedEstimator(estimator.Estimator):
-  """An estimator for TensorFlow Linear and DNN joined models with custom head.
-
-  Note: This estimator is also known as wide-n-deep.
-
-  Example:
-
-  ```python
-  numeric_feature = numeric_column(...)
-  categorical_column_a = categorical_column_with_hash_bucket(...)
-  categorical_column_b = categorical_column_with_hash_bucket(...)
-
-  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
-  categorical_feature_a_emb = embedding_column(
-      categorical_column=categorical_feature_a, ...)
-  categorical_feature_b_emb = embedding_column(
-      categorical_column=categorical_feature_b, ...)
-
-  estimator = DNNLinearCombinedEstimator(
-      head=tf.contrib.estimator.multi_label_head(n_classes=3),
-      # wide settings
-      linear_feature_columns=[categorical_feature_a_x_categorical_feature_b],
-      linear_optimizer=tf.train.FtrlOptimizer(...),
-      # deep settings
-      dnn_feature_columns=[
-          categorical_feature_a_emb, categorical_feature_b_emb,
-          numeric_feature],
-      dnn_hidden_units=[1000, 500, 100],
-      dnn_optimizer=tf.train.ProximalAdagradOptimizer(...))
-
-  # To apply L1 and L2 regularization, you can set dnn_optimizer to:
-  tf.train.ProximalAdagradOptimizer(
-      learning_rate=0.1,
-      l1_regularization_strength=0.001,
-      l2_regularization_strength=0.001)
-  # To apply learning rate decay, you can set dnn_optimizer to a callable:
-  lambda: tf.AdamOptimizer(
-      learning_rate=tf.exponential_decay(
-          learning_rate=0.1,
-          global_step=tf.get_global_step(),
-          decay_steps=10000,
-          decay_rate=0.96)
-  # It is the same for linear_optimizer.
-
-  # Input builders
-  def input_fn_train: # returns x, y
-    pass
-  estimator.train(input_fn=input_fn_train, steps=100)
-
-  def input_fn_eval: # returns x, y
-    pass
-  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
-  def input_fn_predict: # returns x, None
-    pass
-  predictions = estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-  otherwise there will be a `KeyError`:
-
-  * for each `column` in `dnn_feature_columns` + `linear_feature_columns`:
-    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
-      with `key` the id column name, the second with `key` the weight column
-      name. Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss is calculated by using mean squared error.
-
-  @compatibility(eager)
-  Estimators are not compatible with eager execution.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               head,
-               model_dir=None,
-               linear_feature_columns=None,
-               linear_optimizer='Ftrl',
-               dnn_feature_columns=None,
-               dnn_optimizer='Adagrad',
-               dnn_hidden_units=None,
-               dnn_activation_fn=nn.relu,
-               dnn_dropout=None,
-               input_layer_partitioner=None,
-               config=None,
-               linear_sparse_combiner='sum'):
-    """Initializes a DNNLinearCombinedEstimator instance.
-
-    Args:
-      head: A `_Head` instance constructed with a method such as
-        `tf.contrib.estimator.multi_label_head`.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
-        to continue training a previously saved model.
-      linear_feature_columns: An iterable containing all the feature columns
-        used by linear part of the model. All items in the set must be
-        instances of classes derived from `FeatureColumn`.
-      linear_optimizer: An instance of `tf.Optimizer` used to apply gradients to
-        the linear part of the model. Can also be a string (one of 'Adagrad',
-        'Adam', 'Ftrl', 'RMSProp', 'SGD'), or callable. Defaults to FTRL
-        optimizer.
-      dnn_feature_columns: An iterable containing all the feature columns used
-        by deep part of the model. All items in the set must be instances of
-        classes derived from `FeatureColumn`.
-      dnn_optimizer: An instance of `tf.Optimizer` used to apply gradients to
-        the deep part of the model. Can also be a string (one of 'Adagrad',
-        'Adam', 'Ftrl', 'RMSProp', 'SGD'), or callable. Defaults to Adagrad
-        optimizer.
-      dnn_hidden_units: List of hidden units per layer. All layers are fully
-        connected.
-      dnn_activation_fn: Activation function applied to each layer. If None,
-        will use `tf.nn.relu`.
-      dnn_dropout: When not None, the probability we will drop out
-        a given coordinate.
-      input_layer_partitioner: Partitioner for input layer. Defaults to
-        `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-      config: RunConfig object to configure the runtime settings.
-      linear_sparse_combiner: A string specifying how to reduce the linear model
-        if a categorical column is multivalent.  One of "mean", "sqrtn", and
-        "sum" -- these are effectively different ways to do example-level
-        normalization, which can be useful for bag-of-words features.  For more
-        details, see `tf.feature_column.linear_model`.
-
-    Raises:
-      ValueError: If both linear_feature_columns and dnn_features_columns are
-        empty at the same time.
-    """
-    linear_feature_columns = linear_feature_columns or []
-    dnn_feature_columns = dnn_feature_columns or []
-    self._feature_columns = (
-        list(linear_feature_columns) + list(dnn_feature_columns))
-    if not self._feature_columns:
-      raise ValueError('Either linear_feature_columns or dnn_feature_columns '
-                       'must be defined.')
-
-    def _model_fn(features, labels, mode, config):
-      return dnn_linear_combined_lib._dnn_linear_combined_model_fn(  # pylint: disable=protected-access
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          linear_feature_columns=linear_feature_columns,
-          linear_optimizer=linear_optimizer,
-          dnn_feature_columns=dnn_feature_columns,
-          dnn_optimizer=dnn_optimizer,
-          dnn_hidden_units=dnn_hidden_units,
-          dnn_activation_fn=dnn_activation_fn,
-          dnn_dropout=dnn_dropout,
-          input_layer_partitioner=input_layer_partitioner,
-          config=config,
-          linear_sparse_combiner=linear_sparse_combiner)
-
-    super(DNNLinearCombinedEstimator, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
deleted file mode 100644
index 51b9ce7005cec3910ba73db62a674e4628ca30a2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
+++ /dev/null
@@ -1,227 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for dnn_linear_combined.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.contrib.estimator.python.estimator import dnn_linear_combined
-from tensorflow.contrib.estimator.python.estimator import head as head_lib
-from tensorflow.python.estimator.canned import dnn_testing_utils
-from tensorflow.python.estimator.canned import linear_testing_utils
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-
-
-def _dnn_only_estimator_fn(
-    hidden_units,
-    feature_columns,
-    model_dir=None,
-    label_dimension=1,
-    weight_column=None,
-    optimizer='Adagrad',
-    activation_fn=nn.relu,
-    dropout=None,
-    input_layer_partitioner=None,
-    config=None):
-  return dnn_linear_combined.DNNLinearCombinedEstimator(
-      head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension,
-          # Tests in core (from which this test inherits) test the sum loss.
-          loss_reduction=losses.Reduction.SUM),
-      model_dir=model_dir,
-      dnn_feature_columns=feature_columns,
-      dnn_optimizer=optimizer,
-      dnn_hidden_units=hidden_units,
-      dnn_activation_fn=activation_fn,
-      dnn_dropout=dropout,
-      input_layer_partitioner=input_layer_partitioner,
-      config=config)
-
-
-class DNNOnlyEstimatorEvaluateTest(
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
-        self, _dnn_only_estimator_fn)
-
-
-class DNNOnlyEstimatorPredictTest(
-    dnn_testing_utils.BaseDNNRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
-        self, _dnn_only_estimator_fn)
-
-
-class DNNOnlyEstimatorTrainTest(
-    dnn_testing_utils.BaseDNNRegressorTrainTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
-        self, _dnn_only_estimator_fn)
-
-
-def _linear_only_estimator_fn(
-    feature_columns,
-    model_dir=None,
-    label_dimension=1,
-    weight_column=None,
-    optimizer='Ftrl',
-    config=None,
-    partitioner=None,
-    sparse_combiner='sum'):
-  return dnn_linear_combined.DNNLinearCombinedEstimator(
-      head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension,
-          # Tests in core (from which this test inherits) test the sum loss.
-          loss_reduction=losses.Reduction.SUM),
-      model_dir=model_dir,
-      linear_feature_columns=feature_columns,
-      linear_optimizer=optimizer,
-      input_layer_partitioner=partitioner,
-      config=config,
-      linear_sparse_combiner=sparse_combiner)
-
-
-class LinearOnlyEstimatorEvaluateTest(
-    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
-        self, _linear_only_estimator_fn)
-
-
-class LinearOnlyEstimatorPredictTest(
-    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
-        self, _linear_only_estimator_fn)
-
-
-class LinearOnlyEstimatorTrainTest(
-    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
-        self, _linear_only_estimator_fn)
-
-
-class DNNLinearCombinedEstimatorIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(
-      self, train_input_fn, eval_input_fn, predict_input_fn, input_dimension,
-      label_dimension, batch_size):
-    linear_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))]
-    dnn_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))]
-    feature_columns = linear_feature_columns + dnn_feature_columns
-    est = dnn_linear_combined.DNNLinearCombinedEstimator(
-        head=head_lib.regression_head(label_dimension=label_dimension),
-        linear_feature_columns=linear_feature_columns,
-        dnn_feature_columns=dnn_feature_columns,
-        dnn_hidden_units=(2, 2),
-        model_dir=self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array([
-        x[prediction_keys.PredictionKeys.PREDICTIONS]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
-
-    # EXPORT
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self):
-    """Tests complete flow with numpy_input_fn."""
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_test.py
deleted file mode 100644
index 050b0428bf7b685229e12561cfb0682d931299d2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/dnn_test.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for dnn.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.contrib.estimator.python.estimator import dnn
-from tensorflow.contrib.estimator.python.estimator import head as head_lib
-from tensorflow.python.estimator.canned import dnn_testing_utils
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import ops
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-
-
-def _dnn_estimator_fn(weight_column=None, label_dimension=1, *args, **kwargs):  # pylint: disable=keyword-arg-before-vararg
-  """Returns a DNNEstimator that uses regression_head."""
-  return dnn.DNNEstimator(
-      head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension,
-          # Tests in core (from which this test inherits) test the sum loss.
-          loss_reduction=losses.Reduction.SUM),
-      *args, **kwargs)
-
-
-def _dnn_estimator_classifier_fn(n_classes=3, *args, **kwargs):  # pylint: disable=keyword-arg-before-vararg
-  """Returns a DNNEstimator that uses multi_class_head."""
-  return dnn.DNNEstimator(head=head_lib.multi_class_head(n_classes=n_classes),
-                          *args, **kwargs)
-
-
-class DNNEstimatorEvaluateTest(
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
-        self, _dnn_estimator_fn)
-
-
-class DNNEstimatorPredictTest(
-    dnn_testing_utils.BaseDNNRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
-        self, _dnn_estimator_fn)
-
-
-class DNNEstimatorTrainTest(
-    dnn_testing_utils.BaseDNNRegressorTrainTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
-        self, _dnn_estimator_fn)
-
-
-class DNNEstimatorWarmStartingTest(dnn_testing_utils.BaseDNNWarmStartingTest,
-                                   test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNWarmStartingTest.__init__(
-        self, _dnn_estimator_classifier_fn, _dnn_estimator_fn)
-
-
-class DNNEstimatorIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(
-      self, train_input_fn, eval_input_fn, predict_input_fn, input_dimension,
-      label_dimension, batch_size):
-    feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))]
-    est = dnn.DNNEstimator(
-        head=head_lib.regression_head(label_dimension=label_dimension),
-        hidden_units=(2, 2),
-        feature_columns=feature_columns,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array([
-        x[prediction_keys.PredictionKeys.PREDICTIONS]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
-
-    # EXPORT
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self):
-    """Tests complete flow with numpy_input_fn."""
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py b/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py
index 6ca7aaf98972c76c608c9c397a82ca94286a2656..854d2e4011b40428b8048e9d61411f66c1bb3840 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,425 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Deep Neural Network estimators with layer annotations."""
+"""dnn_with_layer_annotations python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-import pickle
-
-from google.protobuf.any_pb2 import Any
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import dnn
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.saved_model import utils as saved_model_utils
-
-
-class LayerAnnotationsCollectionNames(object):
-  """Names for the collections containing the annotations."""
-
-  UNPROCESSED_FEATURES = 'layer_annotations/unprocessed_features'
-  PROCESSED_FEATURES = 'layer_annotatons/processed_features'
-  FEATURE_COLUMNS = 'layer_annotations/feature_columns'
-
-  @classmethod
-  def keys(cls, collection_name):
-    return '%s/keys' % collection_name
-
-  @classmethod
-  def values(cls, collection_name):
-    return '%s/values' % collection_name
-
-
-def serialize_feature_column(feature_column):
-  if isinstance(feature_column, feature_column_lib._EmbeddingColumn):  # pylint: disable=protected-access
-    # We can't pickle nested functions, and we don't need the value of
-    # layer_creator in most cases anyway, so just discard its value.
-    args = feature_column._asdict()
-    args['layer_creator'] = None
-    temp = type(feature_column)(**args)
-    return pickle.dumps(temp)
-  return pickle.dumps(feature_column)
-
-
-def _to_any_wrapped_tensor_info(tensor):
-  """Converts a `Tensor` to a `TensorInfo` wrapped in a proto `Any`."""
-  any_buf = Any()
-  tensor_info = saved_model_utils.build_tensor_info(tensor)
-  any_buf.Pack(tensor_info)
-  return any_buf
-
-
-def make_input_layer_with_layer_annotations(original_input_layer):
-  """Make an input_layer replacement function that adds layer annotations."""
-
-  def input_layer_with_layer_annotations(features,
-                                         feature_columns,
-                                         weight_collections=None,
-                                         trainable=True,
-                                         cols_to_vars=None,
-                                         scope=None,
-                                         cols_to_output_tensors=None,
-                                         from_template=False):
-    """Returns a dense `Tensor` as input layer based on given `feature_columns`.
-
-    Generally a single example in training data is described with
-    FeatureColumns.
-    At the first layer of the model, this column oriented data should be
-    converted
-    to a single `Tensor`.
-
-    This is like tf.feature_column.input_layer, except with added
-    Integrated-Gradient annotations.
-
-    Args:
-      features: A mapping from key to tensors. `_FeatureColumn`s look up via
-        these keys. For example `numeric_column('price')` will look at 'price'
-        key in this dict. Values can be a `SparseTensor` or a `Tensor` depends
-        on corresponding `_FeatureColumn`.
-      feature_columns: An iterable containing the FeatureColumns to use as
-        inputs to your model. All items should be instances of classes derived
-        from `_DenseColumn` such as `numeric_column`, `embedding_column`,
-        `bucketized_column`, `indicator_column`. If you have categorical
-        features, you can wrap them with an `embedding_column` or
-        `indicator_column`.
-      weight_collections: A list of collection names to which the Variable will
-        be added. Note that variables will also be added to collections
-        `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
-      trainable: If `True` also add the variable to the graph collection
-        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-      cols_to_vars: If not `None`, must be a dictionary that will be filled with
-        a mapping from `_FeatureColumn` to list of `Variable`s.  For example,
-        after the call, we might have cols_to_vars = {_EmbeddingColumn(
-        categorical_column=_HashedCategoricalColumn( key='sparse_feature',
-        hash_bucket_size=5, dtype=tf.string), dimension=10): [<tf.Variable
-        'some_variable:0' shape=(5, 10), <tf.Variable 'some_variable:1'
-          shape=(5, 10)]} If a column creates no variables, its value will be an
-          empty list.
-      scope: A name or variable scope to use
-      cols_to_output_tensors: If not `None`, must be a dictionary that will be
-        filled with a mapping from '_FeatureColumn' to the associated output
-        `Tensor`s.
-      from_template: True if the method is being instantiated from a
-        `make_template`.
-
-    Returns:
-      A `Tensor` which represents input layer of a model. Its shape
-      is (batch_size, first_layer_dimension) and its dtype is `float32`.
-      first_layer_dimension is determined based on given `feature_columns`.
-
-    Raises:
-      ValueError: features and feature_columns have different lengths.
-    """
-
-    local_cols_to_output_tensors = {}
-    input_layer = original_input_layer(
-        features=features,
-        feature_columns=feature_columns,
-        weight_collections=weight_collections,
-        trainable=trainable,
-        cols_to_vars=cols_to_vars,
-        scope=scope,
-        cols_to_output_tensors=local_cols_to_output_tensors,
-        from_template=from_template)
-
-    if cols_to_output_tensors is not None:
-      cols_to_output_tensors = local_cols_to_output_tensors
-
-    # Annotate features.
-    # These are the parsed Tensors, before embedding.
-
-    # Only annotate features used by FeatureColumns.
-    # We figure which ones are used by FeatureColumns by creating a parsing
-    # spec and looking at the keys.
-    spec = feature_column_lib.make_parse_example_spec(feature_columns)
-    for key in spec.keys():
-      tensor = ops.convert_to_tensor_or_indexed_slices(features[key])
-      ops.add_to_collection(
-          LayerAnnotationsCollectionNames.keys(
-              LayerAnnotationsCollectionNames.UNPROCESSED_FEATURES), key)
-      ops.add_to_collection(
-          LayerAnnotationsCollectionNames.values(
-              LayerAnnotationsCollectionNames.UNPROCESSED_FEATURES),
-          _to_any_wrapped_tensor_info(tensor))
-
-    # Annotate feature columns.
-    for column in feature_columns:
-      # TODO(cyfoo): Find a better way to serialize and deserialize
-      # _FeatureColumn.
-      ops.add_to_collection(LayerAnnotationsCollectionNames.FEATURE_COLUMNS,
-                            serialize_feature_column(column))
-
-    for column, tensor in local_cols_to_output_tensors.items():
-      ops.add_to_collection(
-          LayerAnnotationsCollectionNames.keys(
-              LayerAnnotationsCollectionNames.PROCESSED_FEATURES), column.name)
-      ops.add_to_collection(
-          LayerAnnotationsCollectionNames.values(
-              LayerAnnotationsCollectionNames.PROCESSED_FEATURES),
-          _to_any_wrapped_tensor_info(tensor))
-
-    return input_layer
-
-  return input_layer_with_layer_annotations
-
-
-@contextlib.contextmanager
-def _monkey_patch(module, function, replacement):
-  old_function = getattr(module, function)
-  setattr(module, function, replacement)
-  yield
-  setattr(module, function, old_function)
-
-
-def DNNClassifierWithLayerAnnotations(  # pylint: disable=invalid-name
-    hidden_units,
-    feature_columns,
-    model_dir=None,
-    n_classes=2,
-    weight_column=None,
-    label_vocabulary=None,
-    optimizer='Adagrad',
-    activation_fn=nn.relu,
-    dropout=None,
-    input_layer_partitioner=None,
-    config=None,
-    warm_start_from=None,
-    loss_reduction=losses.Reduction.SUM):
-  """A classifier for TensorFlow DNN models with layer annotations.
-
-  This classifier is fuctionally identical to estimator.DNNClassifier as far as
-  training and evaluating models is concerned. The key difference is that this
-  classifier adds additional layer annotations, which can be used for computing
-  Integrated Gradients.
-
-  Integrated Gradients is a method for attributing a classifier's predictions
-  to its input features (https://arxiv.org/pdf/1703.01365.pdf). Given an input
-  instance, the method assigns attribution scores to individual features in
-  proportion to the feature's importance to the classifier's prediction.
-
-  See estimator.DNNClassifer for example code for training and evaluating models
-  using this classifier.
-
-  This classifier is checkpoint-compatible with estimator.DNNClassifier and
-  therefore the following should work seamlessly:
-
-  # Instantiate ordinary estimator as usual.
-  estimator = tf.estimator.DNNClassifier(
-    config, feature_columns, hidden_units, ...)
-
-  # Train estimator, export checkpoint.
-  tf.estimator.train_and_evaluate(estimator, ...)
-
-  # Instantiate estimator with annotations with the same configuration as the
-  # ordinary estimator.
-  estimator_with_annotations = (
-    tf.contrib.estimator.DNNClassifierWithLayerAnnotations(
-      config, feature_columns, hidden_units, ...))
-
-  # Call export_savedmodel with the same arguments as the ordinary estimator,
-  # using the checkpoint produced for the ordinary estimator.
-  estimator_with_annotations.export_saved_model(
-    export_dir_base, serving_input_receiver, ...
-    checkpoint_path='/path/to/ordinary/estimator/checkpoint/model.ckpt-1234')
-
-  Args:
-    hidden_units: Iterable of number hidden units per layer. All layers are
-      fully connected. Ex. `[64, 32]` means first layer has 64 nodes and second
-      one has 32.
-    feature_columns: An iterable containing all the feature columns used by the
-      model. All items in the set should be instances of classes derived from
-      `_FeatureColumn`.
-    model_dir: Directory to save model parameters, graph and etc. This can also
-      be used to load checkpoints from the directory into a estimator to
-      continue training a previously saved model.
-    n_classes: Number of label classes. Defaults to 2, namely binary
-      classification. Must be > 1.
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example. If it is a string, it is
-      used as a key to fetch weight tensor from the `features`. If it is a
-      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`, then
-      weight_column.normalizer_fn is applied on it to get weight tensor.
-    label_vocabulary: A list of strings represents possible label values. If
-      given, labels must be string type and have any value in
-      `label_vocabulary`. If it is not given, that means labels are already
-      encoded as integer or float within [0, 1] for `n_classes=2` and encoded as
-      integer values in {0, 1,..., n_classes-1} for `n_classes`>2 . Also there
-      will be errors if vocabulary is not provided and labels are string.
-    optimizer: An instance of `tf.Optimizer` used to train the model. Defaults
-      to Adagrad optimizer.
-    activation_fn: Activation function applied to each layer. If `None`, will
-      use `tf.nn.relu`.
-    dropout: When not `None`, the probability we will drop out a given
-      coordinate.
-    input_layer_partitioner: Optional. Partitioner for input layer. Defaults to
-      `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-    config: `RunConfig` object to configure the runtime settings.
-    warm_start_from: A string filepath to a checkpoint to warm-start from, or a
-      `WarmStartSettings` object to fully configure warm-starting.  If the
-      string filepath is provided instead of a `WarmStartSettings`, then all
-      weights are warm-started, and it is assumed that vocabularies and Tensor
-      names are unchanged.
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
-
-  Returns:
-    DNNClassifier with layer annotations.
-  """
-
-  original = dnn.DNNClassifier(
-      hidden_units=hidden_units,
-      feature_columns=feature_columns,
-      model_dir=model_dir,
-      n_classes=n_classes,
-      weight_column=weight_column,
-      label_vocabulary=label_vocabulary,
-      optimizer=optimizer,
-      activation_fn=activation_fn,
-      dropout=dropout,
-      input_layer_partitioner=input_layer_partitioner,
-      config=config,
-      warm_start_from=warm_start_from,
-      loss_reduction=loss_reduction)
-
-  def _model_fn(features, labels, mode, config):
-    with _monkey_patch(
-        feature_column_lib, '_internal_input_layer',
-        make_input_layer_with_layer_annotations(
-            feature_column_lib._internal_input_layer)):  # pylint: disable=protected-access
-      return original.model_fn(features, labels, mode, config)
-
-  return estimator.Estimator(
-      model_fn=_model_fn,
-      model_dir=model_dir,
-      config=config,
-      warm_start_from=warm_start_from)
-
-
-def DNNRegressorWithLayerAnnotations(  # pylint: disable=invalid-name
-    hidden_units,
-    feature_columns,
-    model_dir=None,
-    label_dimension=1,
-    weight_column=None,
-    optimizer='Adagrad',
-    activation_fn=nn.relu,
-    dropout=None,
-    input_layer_partitioner=None,
-    config=None,
-    warm_start_from=None,
-    loss_reduction=losses.Reduction.SUM,
-):
-  """A regressor for TensorFlow DNN models with layer annotations.
-
-  This regressor is fuctionally identical to estimator.DNNRegressor as far as
-  training and evaluating models is concerned. The key difference is that this
-  classifier adds additional layer annotations, which can be used for computing
-  Integrated Gradients.
-
-  Integrated Gradients is a method for attributing a classifier's predictions
-  to its input features (https://arxiv.org/pdf/1703.01365.pdf). Given an input
-  instance, the method assigns attribution scores to individual features in
-  proportion to the feature's importance to the classifier's prediction.
-
-  See estimator.DNNRegressor for example code for training and evaluating models
-  using this regressor.
-
-  This regressor is checkpoint-compatible with estimator.DNNRegressor and
-  therefore the following should work seamlessly:
-
-  # Instantiate ordinary estimator as usual.
-  estimator = tf.estimator.DNNRegressor(
-    config, feature_columns, hidden_units, ...)
-
-  # Train estimator, export checkpoint.
-  tf.estimator.train_and_evaluate(estimator, ...)
-
-  # Instantiate estimator with annotations with the same configuration as the
-  # ordinary estimator.
-  estimator_with_annotations = (
-    tf.contrib.estimator.DNNRegressorWithLayerAnnotations(
-      config, feature_columns, hidden_units, ...))
-
-  # Call export_savedmodel with the same arguments as the ordinary estimator,
-  # using the checkpoint produced for the ordinary estimator.
-  estimator_with_annotations.export_saved_model(
-    export_dir_base, serving_input_receiver, ...
-    checkpoint_path='/path/to/ordinary/estimator/checkpoint/model.ckpt-1234')
-
-  Args:
-    hidden_units: Iterable of number hidden units per layer. All layers are
-      fully connected. Ex. `[64, 32]` means first layer has 64 nodes and second
-      one has 32.
-    feature_columns: An iterable containing all the feature columns used by the
-      model. All items in the set should be instances of classes derived from
-      `_FeatureColumn`.
-    model_dir: Directory to save model parameters, graph and etc. This can also
-      be used to load checkpoints from the directory into a estimator to
-      continue training a previously saved model.
-    label_dimension: Number of regression targets per example. This is the size
-      of the last dimension of the labels and logits `Tensor` objects
-      (typically, these have shape `[batch_size, label_dimension]`).
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example. If it is a string, it is
-      used as a key to fetch weight tensor from the `features`. If it is a
-      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`, then
-      weight_column.normalizer_fn is applied on it to get weight tensor.
-    optimizer: An instance of `tf.Optimizer` used to train the model. Defaults
-      to Adagrad optimizer.
-    activation_fn: Activation function applied to each layer. If `None`, will
-      use `tf.nn.relu`.
-    dropout: When not `None`, the probability we will drop out a given
-      coordinate.
-    input_layer_partitioner: Optional. Partitioner for input layer. Defaults to
-      `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-    config: `RunConfig` object to configure the runtime settings.
-    warm_start_from: A string filepath to a checkpoint to warm-start from, or a
-      `WarmStartSettings` object to fully configure warm-starting.  If the
-      string filepath is provided instead of a `WarmStartSettings`, then all
-      weights are warm-started, and it is assumed that vocabularies and Tensor
-      names are unchanged.
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
-
-  Returns:
-    DNNRegressor with layer annotations.
-  """
-
-  original = dnn.DNNRegressor(
-      hidden_units=hidden_units,
-      feature_columns=feature_columns,
-      model_dir=model_dir,
-      label_dimension=label_dimension,
-      weight_column=weight_column,
-      optimizer=optimizer,
-      activation_fn=activation_fn,
-      dropout=dropout,
-      input_layer_partitioner=input_layer_partitioner,
-      config=config,
-      warm_start_from=warm_start_from,
-      loss_reduction=loss_reduction,
-  )
+from tensorflow_estimator.contrib.estimator.python.estimator import dnn_with_layer_annotations
 
-  def _model_fn(features, labels, mode, config):
-    with _monkey_patch(
-        feature_column_lib, '_internal_input_layer',
-        make_input_layer_with_layer_annotations(
-            feature_column_lib._internal_input_layer)):  # pylint: disable=protected-access
-      return original.model_fn(features, labels, mode, config)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+dnn_with_layer_annotations.__all__ = [
+    s for s in dir(dnn_with_layer_annotations) if not s.startswith('__')
+]
 
-  return estimator.Estimator(
-      model_fn=_model_fn,
-      model_dir=model_dir,
-      config=config,
-      warm_start_from=warm_start_from)
+from tensorflow_estimator.contrib.estimator.python.estimator.dnn_with_layer_annotations import *
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations_test.py
deleted file mode 100644
index 2fe3d4c72e731a3f17ad41dc0482c6a759d5642e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/dnn_with_layer_annotations_test.py
+++ /dev/null
@@ -1,611 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for dnn_with_layer_annotations.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.contrib.estimator.python.estimator import dnn_with_layer_annotations
-from tensorflow.core.example import example_pb2
-from tensorflow.core.example import feature_pb2
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.canned import dnn
-from tensorflow.python.estimator.canned import dnn_testing_utils
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.estimator.inputs import pandas_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import input as input_lib
-from tensorflow.python.training import queue_runner
-
-try:
-  # pylint: disable=g-import-not-at-top
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
-
-
-def _dnn_classifier_fn(*args, **kwargs):
-  return dnn_with_layer_annotations.DNNClassifierWithLayerAnnotations(
-      *args, **kwargs)
-
-
-class DNNWarmStartingTest(dnn_testing_utils.BaseDNNWarmStartingTest,
-                          test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNWarmStartingTest.__init__(self, _dnn_classifier_fn,
-                                                       _dnn_regressor_fn)
-
-
-class DNNWithLayerAnnotationsClassifierEvaluateTest(
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest.__init__(
-        self, _dnn_classifier_fn)
-
-
-class DNNClassifierWithLayerAnnotationsPredictTest(
-    dnn_testing_utils.BaseDNNClassifierPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierPredictTest.__init__(
-        self, _dnn_classifier_fn)
-
-
-class DNNClassifierWithLayerAnnotationsTrainTest(
-    dnn_testing_utils.BaseDNNClassifierTrainTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierTrainTest.__init__(
-        self, _dnn_classifier_fn)
-
-
-def _dnn_regressor_fn(*args, **kwargs):
-  return dnn_with_layer_annotations.DNNRegressorWithLayerAnnotations(
-      *args, **kwargs)
-
-
-class DNNWithLayerAnnotationsTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def _getLayerAnnotationCollection(self, graph, collection_name):
-    keys = graph.get_collection(
-        dnn_with_layer_annotations.LayerAnnotationsCollectionNames.keys(
-            collection_name))
-    values = graph.get_collection(
-        dnn_with_layer_annotations.LayerAnnotationsCollectionNames.values(
-            collection_name))
-    if len(keys) != len(values):
-      raise ValueError('keys and values should have same length. lengths were: '
-                       '%d and %d, and elements were %s and %s' %
-                       (len(keys), len(values), keys, values))
-    return dict(zip(keys, values))
-
-  def _testAnnotationsPresentForEstimator(self, estimator_class):
-    feature_columns = [
-        feature_column.numeric_column('x', shape=(1,)),
-        feature_column.embedding_column(
-            feature_column.categorical_column_with_vocabulary_list(
-                'y', vocabulary_list=['a', 'b', 'c']),
-            dimension=3)
-    ]
-    estimator = estimator_class(
-        hidden_units=(2, 2),
-        feature_columns=feature_columns,
-        model_dir=self._model_dir)
-    model_fn = estimator.model_fn
-
-    graph = ops.Graph()
-    with graph.as_default():
-      model_fn({
-          'x': array_ops.constant([1.0]),
-          'y': array_ops.constant(['a'])
-      }, {},
-               model_fn_lib.ModeKeys.PREDICT,
-               config=None)
-
-      unprocessed_features = self._getLayerAnnotationCollection(
-          graph, dnn_with_layer_annotations.LayerAnnotationsCollectionNames
-          .UNPROCESSED_FEATURES)
-      processed_features = self._getLayerAnnotationCollection(
-          graph, dnn_with_layer_annotations.LayerAnnotationsCollectionNames
-          .PROCESSED_FEATURES)
-      feature_columns = graph.get_collection(
-          dnn_with_layer_annotations.LayerAnnotationsCollectionNames
-          .FEATURE_COLUMNS)
-
-      self.assertItemsEqual(unprocessed_features.keys(), ['x', 'y'])
-      self.assertEqual(2, len(processed_features.keys()))
-      self.assertEqual(2, len(feature_columns))
-
-  def testAnnotationsPresentForClassifier(self):
-    self._testAnnotationsPresentForEstimator(
-        dnn_with_layer_annotations.DNNClassifierWithLayerAnnotations)
-
-  def testAnnotationsPresentForRegressor(self):
-    self._testAnnotationsPresentForEstimator(
-        dnn_with_layer_annotations.DNNRegressorWithLayerAnnotations)
-
-  def _testCheckpointCompatibleWithNonAnnotatedEstimator(
-      self, train_input_fn, predict_input_fn, non_annotated_class,
-      annotated_class, prediction_key, estimator_args):
-    input_dimension = 2
-    feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))
-    ]
-    estimator = non_annotated_class(
-        model_dir=self._model_dir,
-        hidden_units=(2, 2),
-        feature_columns=feature_columns,
-        **estimator_args)
-
-    estimator.train(train_input_fn, steps=10)
-
-    predictions = np.array(
-        [x[prediction_key] for x in estimator.predict(predict_input_fn)])
-
-    annotated_estimator = annotated_class(
-        model_dir=self._model_dir,
-        hidden_units=(2, 2),
-        feature_columns=feature_columns,
-        warm_start_from=self._model_dir,
-        **estimator_args)
-
-    annotated_predictions = np.array([
-        x[prediction_key] for x in annotated_estimator.predict(predict_input_fn)
-    ])
-
-    self.assertAllEqual(predictions.shape, annotated_predictions.shape)
-    for i, (a, b) in enumerate(
-        zip(predictions.flatten(), annotated_predictions.flatten())):
-      self.assertAlmostEqual(a, b, msg='index=%d' % i)
-
-  def testCheckpointCompatibleForClassifier(self):
-    n_classes = 2
-    input_dimension = 2
-    batch_size = 10
-    data = np.linspace(
-        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
-    x_data = data.reshape(batch_size, input_dimension)
-    y_data = np.reshape(
-        np.rint(data[:batch_size]).astype(np.int64), (batch_size, 1))
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data},
-        y=y_data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data}, batch_size=batch_size, shuffle=False)
-
-    self._testCheckpointCompatibleWithNonAnnotatedEstimator(
-        train_input_fn,
-        predict_input_fn,
-        dnn.DNNClassifier,
-        dnn_with_layer_annotations.DNNClassifierWithLayerAnnotations,
-        prediction_key=prediction_keys.PredictionKeys.PROBABILITIES,
-        estimator_args={'n_classes': n_classes})
-
-  def testCheckpointCompatibleForRegressor(self):
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data}, batch_size=batch_size, shuffle=False)
-
-    self._testCheckpointCompatibleWithNonAnnotatedEstimator(
-        train_input_fn,
-        predict_input_fn,
-        dnn.DNNRegressor,
-        dnn_with_layer_annotations.DNNRegressorWithLayerAnnotations,
-        prediction_key=prediction_keys.PredictionKeys.PREDICTIONS,
-        estimator_args={'label_dimension': label_dimension})
-
-
-class DNNRegressorWithLayerAnnotationsEvaluateTest(
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
-        self, _dnn_regressor_fn)
-
-
-class DNNRegressorWithLayerAnnotationsPredictTest(
-    dnn_testing_utils.BaseDNNRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
-        self, _dnn_regressor_fn)
-
-
-class DNNRegressorWithLayerAnnotationsTrainTest(
-    dnn_testing_utils.BaseDNNRegressorTrainTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
-        self, _dnn_regressor_fn)
-
-
-def _queue_parsed_features(feature_map):
-  tensors_to_enqueue = []
-  keys = []
-  for key, tensor in six.iteritems(feature_map):
-    keys.append(key)
-    tensors_to_enqueue.append(tensor)
-  queue_dtypes = [x.dtype for x in tensors_to_enqueue]
-  input_queue = data_flow_ops.FIFOQueue(capacity=100, dtypes=queue_dtypes)
-  queue_runner.add_queue_runner(
-      queue_runner.QueueRunner(input_queue,
-                               [input_queue.enqueue(tensors_to_enqueue)]))
-  dequeued_tensors = input_queue.dequeue()
-  return {keys[i]: dequeued_tensors[i] for i in range(len(dequeued_tensors))}
-
-
-class DNNRegressorWithLayerAnnotationsIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
-                          input_dimension, label_dimension, batch_size):
-    feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))
-    ]
-    est = dnn_with_layer_annotations.DNNRegressorWithLayerAnnotations(
-        hidden_units=(2, 2),
-        feature_columns=feature_columns,
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array([
-        x[prediction_keys.PredictionKeys.PREDICTIONS]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
-
-    # EXPORT
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self):
-    """Tests complete flow with numpy_input_fn."""
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data}, y=data, batch_size=batch_size, shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data}, batch_size=batch_size, shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size)
-
-  def test_pandas_input_fn(self):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-    label_dimension = 1
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size, dtype=np.float32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(data)
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x, batch_size=batch_size, shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size)
-
-  def test_input_fn_from_parse_example(self):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-
-    serialized_examples = []
-    for datum in data:
-      example = example_pb2.Example(
-          features=feature_pb2.Features(
-              feature={
-                  'x':
-                      feature_pb2.Feature(
-                          float_list=feature_pb2.FloatList(value=datum)),
-                  'y':
-                      feature_pb2.Feature(
-                          float_list=feature_pb2.FloatList(value=datum)),
-              }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
-    }
-
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = _queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = _queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = _queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size)
-
-
-class DNNClassifierWithLayerAnnotationsIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _as_label(self, data_in_float):
-    return np.rint(data_in_float).astype(np.int64)
-
-  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
-                          input_dimension, n_classes, batch_size):
-    feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))
-    ]
-    est = dnn_with_layer_annotations.DNNClassifierWithLayerAnnotations(
-        hidden_units=(2, 2),
-        feature_columns=feature_columns,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predicted_proba = np.array([
-        x[prediction_keys.PredictionKeys.PROBABILITIES]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
-
-    # EXPORT
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self):
-    """Tests complete flow with numpy_input_fn."""
-    n_classes = 3
-    input_dimension = 2
-    batch_size = 10
-    data = np.linspace(
-        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
-    x_data = data.reshape(batch_size, input_dimension)
-    y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data},
-        y=y_data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data}, y=y_data, batch_size=batch_size, shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data}, batch_size=batch_size, shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        n_classes=n_classes,
-        batch_size=batch_size)
-
-  def test_pandas_input_fn(self):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-    input_dimension = 1
-    n_classes = 3
-    batch_size = 10
-    data = np.linspace(0., n_classes - 1., batch_size, dtype=np.float32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(self._as_label(data))
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x, batch_size=batch_size, shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        n_classes=n_classes,
-        batch_size=batch_size)
-
-  def test_input_fn_from_parse_example(self):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    input_dimension = 2
-    n_classes = 3
-    batch_size = 10
-    data = np.linspace(
-        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, input_dimension)
-
-    serialized_examples = []
-    for datum in data:
-      example = example_pb2.Example(
-          features=feature_pb2.Features(
-              feature={
-                  'x':
-                      feature_pb2.Feature(
-                          float_list=feature_pb2.FloatList(value=datum)),
-                  'y':
-                      feature_pb2.Feature(
-                          int64_list=feature_pb2.Int64List(
-                              value=self._as_label(datum[:1]))),
-              }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([1], dtypes.int64),
-    }
-
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = _queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = _queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = _queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=input_dimension,
-        n_classes=n_classes,
-        batch_size=batch_size)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/early_stopping.py b/tensorflow/contrib/estimator/python/estimator/early_stopping.py
index cafe8279c714bf5d50be61921c9070ca982b99c9..11856ece38bf08dfdf16e8b0d9890bbfb0033216 100644
--- a/tensorflow/contrib/estimator/python/estimator/early_stopping.py
+++ b/tensorflow/contrib/estimator/python/estimator/early_stopping.py
@@ -12,495 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities for early stopping."""
+"""early_stopping python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import operator
-import os
-
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.summary import summary_iterator
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util
-
-_EVENT_FILE_GLOB_PATTERN = 'events.out.tfevents.*'
-
-
-def make_early_stopping_hook(estimator,
-                             should_stop_fn,
-                             run_every_secs=60,
-                             run_every_steps=None):
-  """Creates early-stopping hook.
-
-  Returns a `SessionRunHook` that stops training when `should_stop_fn` returns
-  `True`.
-
-  Usage example:
-
-  ```python
-  estimator = ...
-  hook = early_stopping.make_early_stopping_hook(
-      estimator, should_stop_fn=make_stop_fn(...))
-  train_spec = tf.estimator.TrainSpec(..., hooks=[hook])
-  tf.estimator.train_and_evaluate(estimator, train_spec, ...)
-  ```
-
-  Caveat: Current implementation supports early-stopping both training and
-  evaluation in local mode. In distributed mode, training can be stopped but
-  evaluation (where it's a separate job) will indefinitely wait for new model
-  checkpoints to evaluate, so you will need other means to detect and stop it.
-  Early-stopping evaluation in distributed mode requires changes in
-  `train_and_evaluate` API and will be addressed in a future revision.
-
-  Args:
-    estimator: A `tf.estimator.Estimator` instance.
-    should_stop_fn: `callable`, function that takes no arguments and returns a
-      `bool`. If the function returns `True`, stopping will be initiated by the
-      chief.
-    run_every_secs: If specified, calls `should_stop_fn` at an interval of
-      `run_every_secs` seconds. Defaults to 60 seconds. Either this or
-      `run_every_steps` must be set.
-    run_every_steps: If specified, calls `should_stop_fn` every
-      `run_every_steps` steps. Either this or `run_every_secs` must be set.
-
-  Returns:
-    A `SessionRunHook` that periodically executes `should_stop_fn` and initiates
-    early stopping if the function returns `True`.
-
-  Raises:
-    TypeError: If `estimator` is not of type `tf.estimator.Estimator`.
-    ValueError: If both `run_every_secs` and `run_every_steps` are set.
-  """
-  if not isinstance(estimator, estimator_lib.Estimator):
-    raise TypeError('`estimator` must have type `tf.estimator.Estimator`. '
-                    'Got: {}'.format(type(estimator)))
-
-  if run_every_secs is not None and run_every_steps is not None:
-    raise ValueError('Only one of `run_every_secs` and `run_every_steps` must '
-                     'be set.')
-
-  if estimator.config.is_chief:
-    return _StopOnPredicateHook(should_stop_fn, run_every_secs, run_every_steps)
-  else:
-    return _CheckForStoppingHook()
-
-
-def stop_if_higher_hook(estimator,
-                        metric_name,
-                        threshold,
-                        eval_dir=None,
-                        min_steps=0,
-                        run_every_secs=60,
-                        run_every_steps=None):
-  """Creates hook to stop if the given metric is higher than the threshold.
-
-  Usage example:
-
-  ```python
-  estimator = ...
-  # Hook to stop training if accuracy becomes higher than 0.9.
-  hook = early_stopping.stop_if_higher_hook(estimator, "accuracy", 0.9)
-  train_spec = tf.estimator.TrainSpec(..., hooks=[hook])
-  tf.estimator.train_and_evaluate(estimator, train_spec, ...)
-  ```
-
-  Caveat: Current implementation supports early-stopping both training and
-  evaluation in local mode. In distributed mode, training can be stopped but
-  evaluation (where it's a separate job) will indefinitely wait for new model
-  checkpoints to evaluate, so you will need other means to detect and stop it.
-  Early-stopping evaluation in distributed mode requires changes in
-  `train_and_evaluate` API and will be addressed in a future revision.
-
-  Args:
-    estimator: A `tf.estimator.Estimator` instance.
-    metric_name: `str`, metric to track. "loss", "accuracy", etc.
-    threshold: Numeric threshold for the given metric.
-    eval_dir: If set, directory containing summary files with eval metrics. By
-      default, `estimator.eval_dir()` will be used.
-    min_steps: `int`, stop is never requested if global step is less than this
-      value. Defaults to 0.
-    run_every_secs: If specified, calls `should_stop_fn` at an interval of
-      `run_every_secs` seconds. Defaults to 60 seconds. Either this or
-      `run_every_steps` must be set.
-    run_every_steps: If specified, calls `should_stop_fn` every
-      `run_every_steps` steps. Either this or `run_every_secs` must be set.
-
-  Returns:
-    An early-stopping hook of type `SessionRunHook` that periodically checks
-    if the given metric is higher than specified threshold and initiates
-    early stopping if true.
-  """
-  return _stop_if_threshold_crossed_hook(
-      estimator=estimator,
-      metric_name=metric_name,
-      threshold=threshold,
-      higher_is_better=True,
-      eval_dir=eval_dir,
-      min_steps=min_steps,
-      run_every_secs=run_every_secs,
-      run_every_steps=run_every_steps)
-
-
-def stop_if_lower_hook(estimator,
-                       metric_name,
-                       threshold,
-                       eval_dir=None,
-                       min_steps=0,
-                       run_every_secs=60,
-                       run_every_steps=None):
-  """Creates hook to stop if the given metric is lower than the threshold.
-
-  Usage example:
-
-  ```python
-  estimator = ...
-  # Hook to stop training if loss becomes lower than 100.
-  hook = early_stopping.stop_if_lower_hook(estimator, "loss", 100)
-  train_spec = tf.estimator.TrainSpec(..., hooks=[hook])
-  tf.estimator.train_and_evaluate(estimator, train_spec, ...)
-  ```
-
-  Caveat: Current implementation supports early-stopping both training and
-  evaluation in local mode. In distributed mode, training can be stopped but
-  evaluation (where it's a separate job) will indefinitely wait for new model
-  checkpoints to evaluate, so you will need other means to detect and stop it.
-  Early-stopping evaluation in distributed mode requires changes in
-  `train_and_evaluate` API and will be addressed in a future revision.
-
-  Args:
-    estimator: A `tf.estimator.Estimator` instance.
-    metric_name: `str`, metric to track. "loss", "accuracy", etc.
-    threshold: Numeric threshold for the given metric.
-    eval_dir: If set, directory containing summary files with eval metrics. By
-      default, `estimator.eval_dir()` will be used.
-    min_steps: `int`, stop is never requested if global step is less than this
-      value. Defaults to 0.
-    run_every_secs: If specified, calls `should_stop_fn` at an interval of
-      `run_every_secs` seconds. Defaults to 60 seconds. Either this or
-      `run_every_steps` must be set.
-    run_every_steps: If specified, calls `should_stop_fn` every
-      `run_every_steps` steps. Either this or `run_every_secs` must be set.
-
-  Returns:
-    An early-stopping hook of type `SessionRunHook` that periodically checks
-    if the given metric is lower than specified threshold and initiates
-    early stopping if true.
-  """
-  return _stop_if_threshold_crossed_hook(
-      estimator=estimator,
-      metric_name=metric_name,
-      threshold=threshold,
-      higher_is_better=False,
-      eval_dir=eval_dir,
-      min_steps=min_steps,
-      run_every_secs=run_every_secs,
-      run_every_steps=run_every_steps)
-
-
-def stop_if_no_increase_hook(estimator,
-                             metric_name,
-                             max_steps_without_increase,
-                             eval_dir=None,
-                             min_steps=0,
-                             run_every_secs=60,
-                             run_every_steps=None):
-  """Creates hook to stop if metric does not increase within given max steps.
-
-  Usage example:
-
-  ```python
-  estimator = ...
-  # Hook to stop training if accuracy does not increase in over 100000 steps.
-  hook = early_stopping.stop_if_no_increase_hook(estimator, "accuracy", 100000)
-  train_spec = tf.estimator.TrainSpec(..., hooks=[hook])
-  tf.estimator.train_and_evaluate(estimator, train_spec, ...)
-  ```
-
-  Caveat: Current implementation supports early-stopping both training and
-  evaluation in local mode. In distributed mode, training can be stopped but
-  evaluation (where it's a separate job) will indefinitely wait for new model
-  checkpoints to evaluate, so you will need other means to detect and stop it.
-  Early-stopping evaluation in distributed mode requires changes in
-  `train_and_evaluate` API and will be addressed in a future revision.
-
-  Args:
-    estimator: A `tf.estimator.Estimator` instance.
-    metric_name: `str`, metric to track. "loss", "accuracy", etc.
-    max_steps_without_increase: `int`, maximum number of training steps with no
-      increase in the given metric.
-    eval_dir: If set, directory containing summary files with eval metrics. By
-      default, `estimator.eval_dir()` will be used.
-    min_steps: `int`, stop is never requested if global step is less than this
-      value. Defaults to 0.
-    run_every_secs: If specified, calls `should_stop_fn` at an interval of
-      `run_every_secs` seconds. Defaults to 60 seconds. Either this or
-      `run_every_steps` must be set.
-    run_every_steps: If specified, calls `should_stop_fn` every
-      `run_every_steps` steps. Either this or `run_every_secs` must be set.
-
-  Returns:
-    An early-stopping hook of type `SessionRunHook` that periodically checks
-    if the given metric shows no increase over given maximum number of
-    training steps, and initiates early stopping if true.
-  """
-  return _stop_if_no_metric_improvement_hook(
-      estimator=estimator,
-      metric_name=metric_name,
-      max_steps_without_improvement=max_steps_without_increase,
-      higher_is_better=True,
-      eval_dir=eval_dir,
-      min_steps=min_steps,
-      run_every_secs=run_every_secs,
-      run_every_steps=run_every_steps)
-
-
-def stop_if_no_decrease_hook(estimator,
-                             metric_name,
-                             max_steps_without_decrease,
-                             eval_dir=None,
-                             min_steps=0,
-                             run_every_secs=60,
-                             run_every_steps=None):
-  """Creates hook to stop if metric does not decrease within given max steps.
-
-  Usage example:
-
-  ```python
-  estimator = ...
-  # Hook to stop training if loss does not decrease in over 100000 steps.
-  hook = early_stopping.stop_if_no_decrease_hook(estimator, "loss", 100000)
-  train_spec = tf.estimator.TrainSpec(..., hooks=[hook])
-  tf.estimator.train_and_evaluate(estimator, train_spec, ...)
-  ```
-
-  Caveat: Current implementation supports early-stopping both training and
-  evaluation in local mode. In distributed mode, training can be stopped but
-  evaluation (where it's a separate job) will indefinitely wait for new model
-  checkpoints to evaluate, so you will need other means to detect and stop it.
-  Early-stopping evaluation in distributed mode requires changes in
-  `train_and_evaluate` API and will be addressed in a future revision.
-
-  Args:
-    estimator: A `tf.estimator.Estimator` instance.
-    metric_name: `str`, metric to track. "loss", "accuracy", etc.
-    max_steps_without_decrease: `int`, maximum number of training steps with no
-      decrease in the given metric.
-    eval_dir: If set, directory containing summary files with eval metrics. By
-      default, `estimator.eval_dir()` will be used.
-    min_steps: `int`, stop is never requested if global step is less than this
-      value. Defaults to 0.
-    run_every_secs: If specified, calls `should_stop_fn` at an interval of
-      `run_every_secs` seconds. Defaults to 60 seconds. Either this or
-      `run_every_steps` must be set.
-    run_every_steps: If specified, calls `should_stop_fn` every
-      `run_every_steps` steps. Either this or `run_every_secs` must be set.
-
-  Returns:
-    An early-stopping hook of type `SessionRunHook` that periodically checks
-    if the given metric shows no decrease over given maximum number of
-    training steps, and initiates early stopping if true.
-  """
-  return _stop_if_no_metric_improvement_hook(
-      estimator=estimator,
-      metric_name=metric_name,
-      max_steps_without_improvement=max_steps_without_decrease,
-      higher_is_better=False,
-      eval_dir=eval_dir,
-      min_steps=min_steps,
-      run_every_secs=run_every_secs,
-      run_every_steps=run_every_steps)
-
-
-def read_eval_metrics(eval_dir):
-  """Helper to read eval metrics from eval summary files.
-
-  Args:
-    eval_dir: Directory containing summary files with eval metrics.
-
-  Returns:
-    A `dict` with global steps mapping to `dict` of metric names and values.
-  """
-  eval_metrics_dict = {}
-  for event in _summaries(eval_dir):
-    if not event.HasField('summary'):
-      continue
-    metrics = {}
-    for value in event.summary.value:
-      if value.HasField('simple_value'):
-        metrics[value.tag] = value.simple_value
-    if metrics:
-      eval_metrics_dict[event.step] = metrics
-  return collections.OrderedDict(
-      sorted(eval_metrics_dict.items(), key=lambda t: t[0]))
-
-
-def _stop_if_threshold_crossed_hook(estimator, metric_name, threshold,
-                                    higher_is_better, eval_dir, min_steps,
-                                    run_every_secs, run_every_steps):
-  """Creates early-stopping hook to stop training if threshold is crossed."""
-
-  if eval_dir is None:
-    eval_dir = estimator.eval_dir()
-
-  is_lhs_better = operator.gt if higher_is_better else operator.lt
-  greater_or_lesser = 'greater than' if higher_is_better else 'less than'
-
-  def stop_if_threshold_crossed_fn():
-    """Returns `True` if the given metric crosses specified threshold."""
-
-    eval_results = read_eval_metrics(eval_dir)
-
-    for step, metrics in eval_results.items():
-      if step < min_steps:
-        continue
-      val = metrics[metric_name]
-      if is_lhs_better(val, threshold):
-        tf_logging.info(
-            'At step %s, metric "%s" has value %s which is %s the configured '
-            'threshold (%s) for early stopping.', step, metric_name, val,
-            greater_or_lesser, threshold)
-        return True
-    return False
-
-  return make_early_stopping_hook(
-      estimator=estimator,
-      should_stop_fn=stop_if_threshold_crossed_fn,
-      run_every_secs=run_every_secs,
-      run_every_steps=run_every_steps)
-
-
-def _stop_if_no_metric_improvement_hook(
-    estimator, metric_name, max_steps_without_improvement, higher_is_better,
-    eval_dir, min_steps, run_every_secs, run_every_steps):
-  """Returns hook to stop training if given metric shows no improvement."""
-
-  if eval_dir is None:
-    eval_dir = estimator.eval_dir()
-
-  is_lhs_better = operator.gt if higher_is_better else operator.lt
-  increase_or_decrease = 'increase' if higher_is_better else 'decrease'
-
-  def stop_if_no_metric_improvement_fn():
-    """Returns `True` if metric does not improve within max steps."""
-
-    eval_results = read_eval_metrics(eval_dir)
-
-    best_val = None
-    best_val_step = None
-    for step, metrics in eval_results.items():
-      if step < min_steps:
-        continue
-      val = metrics[metric_name]
-      if best_val is None or is_lhs_better(val, best_val):
-        best_val = val
-        best_val_step = step
-      if step - best_val_step >= max_steps_without_improvement:
-        tf_logging.info(
-            'No %s in metric "%s" for %s steps, which is greater than or equal '
-            'to max steps (%s) configured for early stopping.',
-            increase_or_decrease, metric_name, step - best_val_step,
-            max_steps_without_improvement)
-        return True
-    return False
-
-  return make_early_stopping_hook(
-      estimator=estimator,
-      should_stop_fn=stop_if_no_metric_improvement_fn,
-      run_every_secs=run_every_secs,
-      run_every_steps=run_every_steps)
-
-
-def _summaries(eval_dir):
-  """Yields `tensorflow.Event` protos from event files in the eval dir.
-
-  Args:
-    eval_dir: Directory containing summary files with eval metrics.
-
-  Yields:
-    `tensorflow.Event` object read from the event files.
-  """
-  if gfile.Exists(eval_dir):
-    for event_file in gfile.Glob(
-        os.path.join(eval_dir, _EVENT_FILE_GLOB_PATTERN)):
-      for event in summary_iterator.summary_iterator(event_file):
-        yield event
-
-
-def _get_or_create_stop_var():
-  with variable_scope.variable_scope(
-      name_or_scope='signal_early_stopping',
-      values=[],
-      reuse=variable_scope.AUTO_REUSE):
-    return variable_scope.get_variable(
-        name='STOP',
-        shape=[],
-        dtype=dtypes.bool,
-        initializer=init_ops.constant_initializer(False),
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        trainable=False)
-
-
-class _StopOnPredicateHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop when `should_stop_fn` returns `True`."""
-
-  def __init__(self, should_stop_fn, run_every_secs=60, run_every_steps=None):
-    if not callable(should_stop_fn):
-      raise TypeError('`should_stop_fn` must be callable.')
-
-    self._should_stop_fn = should_stop_fn
-    self._timer = basic_session_run_hooks.SecondOrStepTimer(
-        every_secs=run_every_secs, every_steps=run_every_steps)
-    self._global_step_tensor = None
-    self._stop_var = None
-    self._stop_op = None
-
-  def begin(self):
-    self._global_step_tensor = training_util.get_global_step()
-    self._stop_var = _get_or_create_stop_var()
-    self._stop_op = state_ops.assign(self._stop_var, True)
-
-  def before_run(self, run_context):
-    del run_context
-    return session_run_hook.SessionRunArgs(self._global_step_tensor)
-
-  def after_run(self, run_context, run_values):
-    global_step = run_values.results
-    if self._timer.should_trigger_for_step(global_step):
-      self._timer.update_last_triggered_step(global_step)
-      if self._should_stop_fn():
-        tf_logging.info('Requesting early stopping at global step %d',
-                        global_step)
-        run_context.session.run(self._stop_op)
-        run_context.request_stop()
-
-
-class _CheckForStoppingHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop if stop is requested by `_StopOnPredicateHook`."""
-
-  def __init__(self):
-    self._stop_var = None
-
-  def begin(self):
-    self._stop_var = _get_or_create_stop_var()
+from tensorflow_estimator.contrib.estimator.python.estimator import early_stopping
 
-  def before_run(self, run_context):
-    del run_context
-    return session_run_hook.SessionRunArgs(self._stop_var)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+early_stopping.__all__ = [
+    s for s in dir(early_stopping) if not s.startswith('__')
+]
 
-  def after_run(self, run_context, run_values):
-    should_early_stop = run_values.results
-    if should_early_stop:
-      tf_logging.info('Early stopping requested, suspending run.')
-      run_context.request_stop()
+from tensorflow_estimator.contrib.estimator.python.estimator.early_stopping import *
diff --git a/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py b/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py
deleted file mode 100644
index e4bfd4b446b9413bd1627ef6904ff2dc9f1a9120..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py
+++ /dev/null
@@ -1,246 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for early_stopping."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import tempfile
-
-from absl.testing import parameterized
-from tensorflow.contrib.estimator.python.estimator import early_stopping
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import run_config
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.platform import test
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import training_util
-
-
-class _FakeRunConfig(run_config.RunConfig):
-
-  def __init__(self, is_chief):
-    super(_FakeRunConfig, self).__init__()
-    self._is_chief = is_chief
-
-  @property
-  def is_chief(self):
-    return self._is_chief
-
-
-def _dummy_model_fn(features, labels, params):
-  _, _, _ = features, labels, params
-
-
-class _FakeEstimator(estimator.Estimator):
-  """Fake estimator for testing."""
-
-  def __init__(self, config):
-    super(_FakeEstimator, self).__init__(
-        model_fn=_dummy_model_fn, config=config)
-
-
-def _write_events(eval_dir, params):
-  """Test helper to write events to summary files."""
-  for steps, loss, accuracy in params:
-    estimator._write_dict_to_summary(eval_dir, {
-        'loss': loss,
-        'accuracy': accuracy,
-    }, steps)
-
-
-class ReadEvalMetricsTest(test.TestCase):
-
-  def test_read_eval_metrics(self):
-    eval_dir = tempfile.mkdtemp()
-    _write_events(
-        eval_dir,
-        [
-            # steps, loss, accuracy
-            (1000, 1, 2),
-            (2000, 3, 4),
-            (3000, 5, 6),
-        ])
-    self.assertEqual({
-        1000: {
-            'loss': 1,
-            'accuracy': 2
-        },
-        2000: {
-            'loss': 3,
-            'accuracy': 4
-        },
-        3000: {
-            'loss': 5,
-            'accuracy': 6
-        },
-    }, early_stopping.read_eval_metrics(eval_dir))
-
-  def test_read_eval_metrics_when_no_events(self):
-    eval_dir = tempfile.mkdtemp()
-    self.assertTrue(os.path.exists(eval_dir))
-
-    # No error should be raised when eval directory exists with no event files.
-    self.assertEqual({}, early_stopping.read_eval_metrics(eval_dir))
-
-    os.rmdir(eval_dir)
-    self.assertFalse(os.path.exists(eval_dir))
-
-    # No error should be raised when eval directory does not exist.
-    self.assertEqual({}, early_stopping.read_eval_metrics(eval_dir))
-
-
-class EarlyStoppingHooksTest(test.TestCase, parameterized.TestCase):
-
-  def setUp(self):
-    config = _FakeRunConfig(is_chief=True)
-    self._estimator = _FakeEstimator(config=config)
-    eval_dir = self._estimator.eval_dir()
-    os.makedirs(eval_dir)
-    _write_events(
-        eval_dir,
-        [
-            # steps, loss, accuracy
-            (1000, 0.8, 0.5),
-            (2000, 0.7, 0.6),
-            (3000, 0.4, 0.7),
-            (3500, 0.41, 0.68),
-        ])
-
-  def run_session(self, hooks, should_stop):
-    hooks = hooks if isinstance(hooks, list) else [hooks]
-    with ops.Graph().as_default():
-      training_util.create_global_step()
-      no_op = control_flow_ops.no_op()
-      with monitored_session.SingularMonitoredSession(hooks=hooks) as mon_sess:
-        mon_sess.run(no_op)
-        self.assertEqual(mon_sess.should_stop(), should_stop)
-
-  @parameterized.parameters((0.8, 0, False), (0.6, 4000, False), (0.6, 0, True))
-  def test_stop_if_higher_hook(self, threshold, min_steps, should_stop):
-    self.run_session(
-        early_stopping.stop_if_higher_hook(
-            self._estimator,
-            metric_name='accuracy',
-            threshold=threshold,
-            min_steps=min_steps), should_stop)
-
-  @parameterized.parameters((0.3, 0, False), (0.5, 4000, False), (0.5, 0, True))
-  def test_stop_if_lower_hook(self, threshold, min_steps, should_stop):
-    self.run_session(
-        early_stopping.stop_if_lower_hook(
-            self._estimator,
-            metric_name='loss',
-            threshold=threshold,
-            min_steps=min_steps), should_stop)
-
-  @parameterized.parameters((1500, 0, False), (500, 4000, False),
-                            (500, 0, True))
-  def test_stop_if_no_increase_hook(self, max_steps, min_steps, should_stop):
-    self.run_session(
-        early_stopping.stop_if_no_increase_hook(
-            self._estimator,
-            metric_name='accuracy',
-            max_steps_without_increase=max_steps,
-            min_steps=min_steps), should_stop)
-
-  @parameterized.parameters((1500, 0, False), (500, 4000, False),
-                            (500, 0, True))
-  def test_stop_if_no_decrease_hook(self, max_steps, min_steps, should_stop):
-    self.run_session(
-        early_stopping.stop_if_no_decrease_hook(
-            self._estimator,
-            metric_name='loss',
-            max_steps_without_decrease=max_steps,
-            min_steps=min_steps), should_stop)
-
-  @parameterized.parameters((1500, 0.3, False), (1500, 0.5, True),
-                            (500, 0.3, True))
-  def test_multiple_hooks(self, max_steps, loss_threshold, should_stop):
-    self.run_session([
-        early_stopping.stop_if_no_decrease_hook(
-            self._estimator,
-            metric_name='loss',
-            max_steps_without_decrease=max_steps),
-        early_stopping.stop_if_lower_hook(
-            self._estimator, metric_name='loss', threshold=loss_threshold)
-    ], should_stop)
-
-  @parameterized.parameters(False, True)
-  def test_make_early_stopping_hook(self, should_stop):
-    self.run_session([
-        early_stopping.make_early_stopping_hook(
-            self._estimator, should_stop_fn=lambda: should_stop)
-    ], should_stop)
-
-  def test_make_early_stopping_hook_typeerror(self):
-    with self.assertRaises(TypeError):
-      early_stopping.make_early_stopping_hook(
-          estimator=object(), should_stop_fn=lambda: True)
-
-  def test_make_early_stopping_hook_valueerror(self):
-    with self.assertRaises(ValueError):
-      early_stopping.make_early_stopping_hook(
-          self._estimator,
-          should_stop_fn=lambda: True,
-          run_every_secs=60,
-          run_every_steps=100)
-
-
-class StopOnPredicateHookTest(test.TestCase):
-
-  def test_stop(self):
-    hook = early_stopping._StopOnPredicateHook(
-        should_stop_fn=lambda: False, run_every_secs=0)
-    with ops.Graph().as_default():
-      training_util.create_global_step()
-      no_op = control_flow_ops.no_op()
-      with monitored_session.SingularMonitoredSession(hooks=[hook]) as mon_sess:
-        mon_sess.run(no_op)
-        self.assertFalse(mon_sess.should_stop())
-        self.assertFalse(mon_sess.raw_session().run(hook._stop_var))
-
-    hook = early_stopping._StopOnPredicateHook(
-        should_stop_fn=lambda: True, run_every_secs=0)
-    with ops.Graph().as_default():
-      training_util.create_global_step()
-      no_op = control_flow_ops.no_op()
-      with monitored_session.SingularMonitoredSession(hooks=[hook]) as mon_sess:
-        mon_sess.run(no_op)
-        self.assertTrue(mon_sess.should_stop())
-        self.assertTrue(mon_sess.raw_session().run(hook._stop_var))
-
-
-class CheckForStoppingHookTest(test.TestCase):
-
-  def test_stop(self):
-    hook = early_stopping._CheckForStoppingHook()
-    with ops.Graph().as_default():
-      no_op = control_flow_ops.no_op()
-      assign_op = state_ops.assign(early_stopping._get_or_create_stop_var(),
-                                   True)
-      with monitored_session.SingularMonitoredSession(hooks=[hook]) as mon_sess:
-        mon_sess.run(no_op)
-        self.assertFalse(mon_sess.should_stop())
-        mon_sess.run(assign_op)
-        self.assertTrue(mon_sess.should_stop())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/export.py b/tensorflow/contrib/estimator/python/estimator/export.py
index b0deb9b494ab3ad0fe8c56967606e5e5952b7ccf..738b343dfde21fd5926320e865e87b0392713749 100644
--- a/tensorflow/contrib/estimator/python/estimator/export.py
+++ b/tensorflow/contrib/estimator/python/estimator/export.py
@@ -12,212 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Wrapper for methods to export train/eval graphs from Estimator."""
+"""export python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.estimator import model_fn as model_fn_lib
-
-
-def export_saved_model_for_mode(
-    estimator, export_dir_base, input_receiver_fn,
-    assets_extra=None,
-    as_text=False,
-    checkpoint_path=None,
-    strip_default_attrs=False,
-    mode=model_fn_lib.ModeKeys.PREDICT):
-  # pylint: disable=line-too-long
-  """Exports a single train/eval/predict graph as a SavedModel.
-
-  For a detailed guide, see [Using SavedModel with Estimators](
-  https://tensorflow.org/guide/saved_model#using_savedmodel_with_estimators).
-
-  Sample usage:
-  ```python
-  classifier = tf.estimator.LinearClassifier(
-      feature_columns=[age, language])
-  classifier.train(input_fn=input_fn, steps=1000)
-
-  feature_spec = {
-      'age': tf.placeholder(dtype=tf.int64),
-      'language': array_ops.placeholder(dtype=tf.string)
-  }
-  label_spec = tf.placeholder(dtype=dtypes.int64)
-
-  train_rcvr_fn = tf.contrib.estimator.build_raw_supervised_input_receiver_fn(
-      feature_spec, label_spec)
-
-  export_dir = tf.contrib.estimator.export_saved_model_for_mode(
-      classifier,
-      export_dir_base='my_model/',
-      input_receiver_fn=train_rcvr_fn,
-      mode=model_fn_lib.ModeKeys.TRAIN)
-
-  # export_dir is a timestamped directory with the SavedModel, which
-  # can be used for serving, analysis with TFMA, or directly loaded in.
-  with ops.Graph().as_default() as graph:
-    with session.Session(graph=graph) as sess:
-      loader.load(sess, [tag_constants.TRAINING], export_dir)
-      weights = graph.get_tensor_by_name(''linear/linear_model/age/weights')
-      ...
-  ```
-
-  This method is a wrapper for _export_all_saved_models, and wraps a raw
-  input_receiver_fn in a dictionary to pass in to that function.
-  See _export_all_saved_models for full docs.
-
-  See tf.contrib.estimator.export_saved_model_for_mode for the currently
-  exposed version of this function.
-
-  Args:
-    estimator: an instance of tf.estimator.Estimator
-    export_dir_base: A string containing a directory in which to create
-      timestamped subdirectories containing exported SavedModels.
-    input_receiver_fn: a function that takes no argument and
-      returns the appropriate subclass of `InputReceiver`.
-    assets_extra: A dict specifying how to populate the assets.extra directory
-      within the exported SavedModel, or `None` if no extra assets are needed.
-    as_text: whether to write the SavedModel proto in text format.
-    checkpoint_path: The checkpoint path to export.  If `None` (the default),
-      the most recent checkpoint found within the model directory is chosen.
-    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-      removed from the NodeDefs. For a detailed guide, see
-      [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
-    mode: tf.estimator.ModeKeys value indicating with mode will be exported.
-
-  Returns:
-    The string path to the exported directory.
-
-  Raises:
-    ValueError: if input_receiver_fn is None, no export_outputs
-      are provided, or no checkpoint can be found.
-  """
-  # pylint: enable=line-too-long
-
-  # pylint: disable=protected-access
-  return estimator._export_saved_model_for_mode(
-      export_dir_base, input_receiver_fn,
-      assets_extra=assets_extra,
-      as_text=as_text,
-      checkpoint_path=checkpoint_path,
-      strip_default_attrs=strip_default_attrs,
-      mode=mode)
-  # pylint: enable=protected-access
-
-
-def export_all_saved_models(
-    estimator, export_dir_base, input_receiver_fn_map,
-    assets_extra=None,
-    as_text=False,
-    checkpoint_path=None,
-    strip_default_attrs=False):
-  # pylint: disable=line-too-long
-  """Exports requested train/eval/predict graphs as separate SavedModels.
-
-  See tf.contrib.estimator.export_all_saved_models for the currently
-  exposed version of this function.
-
-  For each mode passed in via the input_receiver_fn_map,
-  this method builds a new graph by calling the input_receiver_fn to obtain
-  feature and label `Tensor`s. Next, this method calls the `Estimator`'s
-  model_fn in the passed mode to generate the model graph based on
-  those features and labels, and restores the given checkpoint
-  (or, lacking that, the most recent checkpoint) into the graph.
-  Only one of the modes is used for saving variables to the SavedModel
-  (order of preference: TRAIN, EVAL, then PREDICT), such that up to three
-  MetaGraphDefs are saved with a single set of variables in a single
-  SavedModel directory.
-
-  For prediction, the exported `MetaGraphDef` will provide one `SignatureDef`
-  for each element of the export_outputs dict returned from the model_fn,
-  named using the same keys.  One of these keys is always
-  signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, indicating which
-  signature will be served when a serving request does not specify one.
-  For each signature, the outputs are provided by the corresponding
-  `ExportOutput`s, and the inputs are always the input receivers provided by
-  the serving_input_receiver_fn.
-
-  For training and evaluation, the train_op is stored in an extra collection,
-  and loss, metrics, and predictions are included in a SignatureDef for the
-  mode in question.
-
-  Extra assets may be written into the SavedModel via the assets_extra
-  argument.  This should be a dict, where each key gives a destination path
-  (including the filename) relative to the assets.extra directory.  The
-  corresponding value gives the full path of the source file to be copied.
-  For example, the simple case of copying a single file without renaming it
-  is specified as `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
-
-  Sample usage:
-  ```python
-  classifier = tf.estimator.LinearClassifier(
-      feature_columns=[age, language])
-  classifier.train(input_fn=input_fn)
-
-  feature_spec = {
-      'age': tf.placeholder(dtype=tf.int64),
-      'language': array_ops.placeholder(dtype=tf.string)
-  }
-  label_spec = tf.placeholder(dtype=dtypes.int64)
-
-  train_rcvr_fn = tf.contrib.estimator.build_raw_supervised_input_receiver_fn(
-      feature_spec, label_spec)
-
-  serve_rcvr_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
-      feature_spec)
-
-  rcvr_fn_map = {
-      model_fn_lib.ModeKeys.TRAIN: train_rcvr_fn,
-      model_fn_lib.ModeKeys.PREDICT: serve_rcvr_fn,
-  }
-
-  export_dir = tf.contrib.estimator.export_all_saved_models(
-      classifier,
-      export_dir_base='my_model/',
-      input_receiver_fn_map=rcvr_fn_map)
-
-  # export_dirs is a dict of directories with SavedModels, which
-  # can be used for serving, analysis with TFMA, or directly loaded in.
-  with ops.Graph().as_default() as graph:
-    with session.Session(graph=graph) as sess:
-      loader.load(sess, [tag_constants.TRAINING], export_dir)
-      weights = graph.get_tensor_by_name('linear/linear_model/age/weights')
-      ...
-  ```
-
-  Args:
-    estimator: an instance of tf.estimator.Estimator
-    export_dir_base: A string containing a directory in which to create
-      timestamped subdirectories containing exported SavedModels.
-    input_receiver_fn_map: dict of tf.estimator.ModeKeys to input_receiver_fn
-      mappings, where the input_receiver_fn is a function that takes no
-      argument and returns the appropriate subclass of `InputReceiver`.
-    assets_extra: A dict specifying how to populate the assets.extra directory
-      within the exported SavedModel, or `None` if no extra assets are needed.
-    as_text: whether to write the SavedModel proto in text format.
-    checkpoint_path: The checkpoint path to export.  If `None` (the default),
-      the most recent checkpoint found within the model directory is chosen.
-    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-      removed from the NodeDefs. For a detailed guide, see
-      [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
-
-  Returns:
-    A dict of tf.estimator.ModeKeys value to string path for each exported
-    directory.
+from tensorflow_estimator.contrib.estimator.python.estimator import export
 
-  Raises:
-    ValueError: if any input_receiver_fn is None, no export_outputs
-      are provided, or no checkpoint can be found.
-  """
-  # pylint: enable=line-too-long
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+export.__all__ = [s for s in dir(export) if not s.startswith('__')]
 
-  # pylint: disable=protected-access
-  return estimator._export_all_saved_models(
-      export_dir_base, input_receiver_fn_map,
-      assets_extra=assets_extra,
-      as_text=as_text,
-      checkpoint_path=checkpoint_path,
-      strip_default_attrs=strip_default_attrs)
-  # pylint: enable=protected-access
+from tensorflow_estimator.contrib.estimator.python.estimator.export import *
diff --git a/tensorflow/contrib/estimator/python/estimator/export_test.py b/tensorflow/contrib/estimator/python/estimator/export_test.py
deleted file mode 100644
index 050821ee672f30a6926c4a0a0e48915515d9afd7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/export_test.py
+++ /dev/null
@@ -1,373 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for contrib wrapping of export_saved_model_for_mode functionality.
-
-These are direct copies of the tests included in core, with import locations
-changed. These should be removed when the functionality in core is part of the
-public API.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import tempfile
-
-from tensorflow.contrib.estimator.python.estimator import export as contrib_export
-from tensorflow.python.client import session
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import loader
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training import training
-from tensorflow.python.util import compat
-
-
-def _model_fn_for_export_tests(features, labels, mode):
-  _, _ = features, labels
-  variables.Variable(1., name='weight')
-  scores = constant_op.constant([3.])
-  classes = constant_op.constant(['wumpus'])
-  update_global_step = state_ops.assign_add(training.get_global_step(), 1)
-  with ops.control_dependencies([update_global_step]):
-    train_op = constant_op.constant(2.)
-  return model_fn_lib.EstimatorSpec(
-      mode,
-      predictions=constant_op.constant(10.),
-      loss=constant_op.constant(1.),
-      train_op=train_op,
-      export_outputs={
-          'test': export_output.ClassificationOutput(scores, classes)})
-
-
-def _x_y_input_fn():
-  return ({'x': constant_op.constant([[1], [1]]),
-           'y': constant_op.constant([[2], [2]])},
-          constant_op.constant([[1], [1]]))
-
-
-def _model_fn_with_x_y(features, labels, mode):
-  _ = labels
-  variables.Variable(1., name='weight')
-  scores = constant_op.constant([3.])
-  classes = constant_op.constant(['wumpus'])
-  if mode == model_fn_lib.ModeKeys.PREDICT:
-    variables.Variable(36., name='name_collision')
-    return model_fn_lib.EstimatorSpec(
-        mode,
-        predictions=constant_op.constant(10.),
-        export_outputs={
-            'test': export_output.ClassificationOutput(scores, classes)})
-  else:
-    prefix = 'eval_' if mode == model_fn_lib.ModeKeys.EVAL else ''
-
-    multiplied = math_ops.multiply(
-        features['x'], features['y'], name='{}multiplied'.format(prefix))
-    metrics = {'mean': metrics_lib.mean(features['x'] - features['y'],
-                                        name='{}mean'.format(prefix))}
-    variables.Variable(1., name='later_var')
-    variables.Variable(3., name='name_collision')
-    return model_fn_lib.EstimatorSpec(
-        mode,
-        predictions=multiplied,
-        loss=constant_op.constant(1.),
-        train_op=state_ops.assign_add(training.get_global_step(), 1),
-        eval_metric_ops=metrics)
-
-
-def _get_serving_input_receiver_fn():
-  feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                  'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-  return export.build_parsing_serving_input_receiver_fn(feature_spec)
-
-
-def _get_supervised_input_receiver_fn():
-  feature_spec = {
-      'x': array_ops.placeholder(
-          dtype=dtypes.int64, shape=(2, 1), name='feature_x'),
-      'y': array_ops.placeholder(
-          dtype=dtypes.int64, shape=(2, 1), name='feature_y')
-      }
-  label_spec = array_ops.placeholder(
-      dtype=dtypes.float32, shape=[1], name='truth')
-
-  return export.build_raw_supervised_input_receiver_fn(
-      feature_spec, label_spec)
-
-
-class EstimatorExportTest(test.TestCase):
-
-  def test_export_saved_model_train(self):
-    self._test_export_saved_model_for_mode(
-        _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.TRAIN)
-
-  def test_export_saved_model_eval(self):
-    self._test_export_saved_model_for_mode(
-        _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.EVAL)
-
-  def test_export_saved_model_predict(self):
-    self._test_export_saved_model_for_mode(
-        _get_serving_input_receiver_fn(), model_fn_lib.ModeKeys.PREDICT)
-
-  def _test_export_saved_model_for_mode(self, input_receiver_fn, mode):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
-    est.train(input_fn=_x_y_input_fn, steps=1)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = contrib_export.export_saved_model_for_mode(
-        est, export_dir_base, input_receiver_fn, mode=mode)
-
-    # Check that all the files are in the right places.
-    self.assertTrue(gfile.Exists(export_dir_base))
-    self._validate_exported_files(export_dir)
-
-    # Restore, to validate that the export was well-formed.
-    tag_set = model_fn_lib.EXPORT_TAG_MAP[mode]
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, tag_set, export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertFalse('name_collision_1' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_receiver_map(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('input_example_tensor' in graph_ops)
-        self.assertTrue('ParseExample/ParseExample' in graph_ops)
-        self.assertFalse('feature_x' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_train_only(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('multiplied' in graph_ops)
-        self.assertTrue('mean/update_op' in graph_ops)
-        self.assertFalse('eval_multiplied' in graph_ops)
-        self.assertTrue('feature_x' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_eval_only(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.EVAL], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('eval_multiplied' in graph_ops)
-        self.assertTrue('eval_mean/value' in graph_ops)
-        self.assertFalse('multiplied' in graph_ops)
-        self.assertTrue('feature_x' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_no_serving(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('multiplied' in graph_ops)
-        self.assertFalse('eval_multiplied' in graph_ops)
-        self.assertTrue('feature_x' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.EVAL], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('eval_multiplied' in graph_ops)
-        self.assertFalse('multiplied' in graph_ops)
-        # TODO(karmel): is this the desired behavior when names are shared?
-        self.assertTrue('feature_x_1' in graph_ops)
-        self.assertTrue('feature_y_1' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_three_defs(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    # Restore, to validate that the export was well-formed.
-    for tag_set in model_fn_lib.EXPORT_TAG_MAP.values():
-      with ops.Graph().as_default() as graph:
-        with session.Session(graph=graph) as sess:
-          loader.load(sess, tag_set, export_dir)
-          graph_ops = [x.name for x in graph.get_operations()]
-          self.assertTrue('global_step/Assign' in graph_ops)
-          self.assertTrue('global_step/Initializer/zeros' in graph_ops)
-          self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_all_vars(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('later_var' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertFalse('later_var' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_name_collision(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('name_collision' in graph_ops)
-        self.assertFalse('name_collision_1' in graph_ops)
-        collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-        self.assertEqual(3, collection_vars[-1].eval())
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('name_collision' in graph_ops)
-        self.assertFalse('name_collision_1' in graph_ops)
-        collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-        # This is a non-obvious detail: when we load the estimator spec
-        # for predict, name_collision gets set to 36. However, we then restore
-        # from checkpoint, which should overwrite that var and make it the 3
-        # from training. In practice, this would not be a good way to write
-        # a model_fn, but leaving this check in for now to ensure consistency
-        # with what would happen given our current order of spec, then
-        # checkpoint.
-        self.assertEqual(3, collection_vars[-1].eval())
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def _test_export_all_saved_models(self, input_receiver_fn_map):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_with_x_y)
-    est.train(input_fn=_x_y_input_fn, steps=1)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = contrib_export.export_all_saved_models(
-        est, export_dir_base, input_receiver_fn_map)
-
-    # Check that all the files are in the right places.
-    self.assertTrue(gfile.Exists(export_dir_base))
-
-    self._validate_exported_files(export_dir)
-
-    return export_dir, tmpdir
-
-  def _validate_exported_files(self, export_dir):
-    self.assertTrue(gfile.Exists(export_dir))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('saved_model.pb'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables/variables.index'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables/variables.data-00000-of-00001'))))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/exporter.py b/tensorflow/contrib/estimator/python/estimator/exporter.py
index 09d744060568e458a3af32e9d7497dbfbeec561e..25ac78ca37d17079c03068d6847a915bbe23bef0 100644
--- a/tensorflow/contrib/estimator/python/estimator/exporter.py
+++ b/tensorflow/contrib/estimator/python/estimator/exporter.py
@@ -12,269 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implements StepsExporter to export the model in user specified steps."""
+"""exporter python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-
-from tensorflow.python.estimator import exporter
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.summary import summary_iterator
-
-DEFAULT_GLOBAL_STEP_KEY = ops.GraphKeys.GLOBAL_STEP
-
-
-class StepsExporter(exporter.Exporter):
-  """This class exports the model in user specified steps.
-
-  This class exports the model at the steps given by the `steps_to_keep`
-  argument. Each number in the list is treated as a lower bound for model
-  exports, to handle the case when evaluation is performed at different steps.
-
-  Consider this example:
-
-  ```
-  steps_to_keep = [1, 2, 3, 6, 7, 10, 12, 25]
-  ```
-
-  The model is evaluated at step increments of 5: `[5, 10, 15, 20, 25, 30]`.
-  The `StepsExporter` will export the model when it has reached steps
-  `[5, 10, 15, 25]`.
-
-  This example illustrates the two cases when the model is exported:
-
-  1. Model is evaluated on a step defined in the list `steps_to_keep`.
-
-     In the example, the model is exported on step `10` and `25`.
-
-  2. Model is evaluated on a step not defined in the list `steps_to_keep`, but
-     is still exported because a step in `steps_to_keep` was missed.
-
-     In the example, when the model reaches step `5`, the model is exported even
-     though  `steps_to_keep` does not contain `5`. Step `5` is exported to make
-     up for step `3`, which was missed. Steps `1` and `2` in `steps_to_keep` are
-     skipped completely (e.g. say the model is evaluated at step `6`. It will
-     **not** be exported to make up for step `2`).
-
-  Using the `steps_to_keep` list as a lower bound allows users to define
-  approximate step boundaries for exporting their models, and avoid frustrating
-  off-by-one calculation errors.
-
-  Sample Use Cases:
-    There are specific points during the training when having a saved version of
-    the model would be useful. One example is at the end of each training phase
-    when the set of freezed weights is changed.
-    Another good use case is saving the model at the end of each epoch for
-    visualization or retraining.
-  """
-
-  def __init__(self,
-               steps_to_keep,
-               name='steps_exporter',
-               serving_input_receiver_fn=None,
-               event_file_pattern='eval/*.tfevents.*',
-               assets_extra=None,
-               as_text=False):
-    """Create an `StepsExporter` to use with `tf.estimator.EvalSpec`.
-
-    Example of creating a StepsExporter for training and evaluation:
-
-    ```python
-    categorical_feature_a = categorical_column_with_hash_bucket(...)
-    categorical_feature_b = categorical_column_with_hash_bucket(...)
-
-    categorical_feature_a_emb = embedding_column(
-        categorical_column=categorical_feature_a, ...)
-    categorical_feature_b_emb = embedding_column(
-        categorical_column=categorical_feature_b, ...)
-
-    estimator = tf.estimator.DNNClassifier(
-        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-        hidden_units=[1024, 512, 256])
-
-    # Input pipeline for train and evaluate.
-    def train_input_fn: # returns x, y
-      # please shuffle the data.
-      pass
-    def eval_input_fn_eval: # returns x, y
-      pass
-
-    exporter = tf.contrib.estimator.exporter.StepsExporter(
-        name="steps_exporter",
-        serving_input_receiver_fn=serving_input_receiver_fn,
-        event_file_pattern='eval/*.tfevents.*'
-        steps_to_keep=[...])
-
-    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=1000)
-
-    eval_spec = [tf.estimator.EvalSpec(
-      input_fn=eval_input_fn,
-      steps=1,
-      exporters=exporter,
-      start_delay_secs=0,
-      throttle_secs=5)]
-
-    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
-
-    # Models will be exported to estimator.model_dir in timestamped directories,
-    # which can be used for serving, analysis with TFMA, or directly loaded in.
-    # For example:
-    export_dir = os.path.join(estimator.model_dir,
-                              <timestamped directory name>)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        tf.saved_model.loader.load(
-            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
-
-    ```
-
-    Args:
-      steps_to_keep: Non-empty list of positive integers containing
-        the step numbers at which the model should be exported. All the exports
-        will be kept, so there is no garbage collection.
-      name: Unique name of this `Exporter` that is going to be used in the
-        export path.
-      serving_input_receiver_fn: A function that takes no arguments and returns
-        a `ServingInputReceiver`.
-      event_file_pattern: Event file name pattern relative to model_dir. If
-        None, however, the exporter would not be preemption-safe. To be
-        preemption-safe, event_file_pattern should be specified.
-      assets_extra: An optional dict specifying how to populate the assets.extra
-        directory within the exported SavedModel.  Each key should give the
-        destination path (including the filename) relative to the assets.extra
-        directory.  The corresponding value gives the full path of the source
-        file to be copied.  For example, the simple case of copying a single
-        file without renaming it is specified as `{'my_asset_file.txt':
-        '/path/to/my_asset_file.txt'}`.
-      as_text: Whether to write the SavedModel proto in text format. Defaults to
-        `False`.
-
-    Raises:
-      ValueError: If any arguments is invalid.
-    """
-    # pylint: disable=protected-access
-    self._saved_model_exporter = exporter._SavedModelExporter(
-        name, serving_input_receiver_fn, assets_extra, as_text)
-    # pylint: enable=protected-access
-
-    self._event_file_pattern = event_file_pattern
-    self._model_dir = None
-
-    self._input_steps_to_keep = steps_to_keep
-    steps_to_keep = [step for step in steps_to_keep if isinstance(step, int)]
-    steps_to_keep = [step for step in steps_to_keep if step > 0]
-    if not steps_to_keep:
-      raise ValueError(
-          '`steps_to_keep` list must have at least one positive integer')
-    elif self._input_steps_to_keep != steps_to_keep:
-      tf_logging.warn('Changed `steps_to_keep`, by omitting non-integer or'
-                      ' less than 1 elements, to [%s]',
-                      ', '.join(str(step) for step in steps_to_keep))
-    self._steps_to_keep = sorted(steps_to_keep)
-    self._steps_kept = []
-
-  @property
-  def name(self):
-    return self._saved_model_exporter.name
-
-  def export(self, estimator, export_path, checkpoint_path, eval_result,
-             is_the_final_export):
-    """Exports the given Estimator to a specific format.
-
-    Args:
-      estimator: A `tf.estimator.Estimator` instance to export.
-      export_path: A string containing a directory where to write the export.
-      checkpoint_path: The checkpoint path to export.
-      eval_result: The output of Estimator.evaluate on this checkpoint.
-      is_the_final_export: This boolean is True when this is an export in the
-        end of training. It is False for the intermediate exports during the
-        training. When passing Exporter to tf.estimator.train_and_evaluate
-        is_the_final_export is always False if TrainSpec.max_steps is None.
-
-    Returns:
-      The string path to the exported directory or None if export is skipped.
-
-    Raises:
-      ValueError: If `eval_result` is None or doesn't have
-        `ops.GraphKeys.GLOBAL_STEP` as a key.
-    """
-    export_result = None
-
-    if not eval_result or DEFAULT_GLOBAL_STEP_KEY not in eval_result:
-      raise ValueError(
-          '`eval_result` is empty, or does not have global step. This'
-          ' should never happen as Estimator always sets the global step in '
-          '`eval_result`. Please file a bug report. Got eval_result: %s'
-          % str(eval_result))
-
-    if self._model_dir != estimator.model_dir and self._event_file_pattern:
-      tf_logging.info('Loads the steps that the model was already evaluated at,'
-                      'from event files')
-      self._model_dir = estimator.model_dir
-      full_event_file_pattern = os.path.join(self._model_dir,
-                                             self._event_file_pattern)
-      self._steps_kept = self._get_kept_steps(full_event_file_pattern)
-
-      if self._steps_kept:
-        self._steps_kept = sorted(self._steps_kept)
-        self._steps_to_keep = [step for step in self._steps_to_keep if
-                               step > self._steps_kept[-1]]
-    # It is assumed that the model is exported at any evaluated step 'n' if
-    # there is any `steps_missed` lower than 'n'. As a result, all the steps in
-    # `_steps_to_keep` lower than the last evaluated step will be removed.
-    steps_missed = [step for step in self._steps_to_keep
-                    if step <= eval_result[DEFAULT_GLOBAL_STEP_KEY]]
-
-    if steps_missed:
-      # update the `_steps_to_keep` list by omitting all steps smaller than the
-      # current global step which are missed to be exported
-      export_result = self._saved_model_exporter.export(estimator, export_path,
-                                                        checkpoint_path,
-                                                        eval_result,
-                                                        is_the_final_export)
-      self._steps_to_keep = [step for step in self._steps_to_keep if step
-                             not in steps_missed]
-      # contains all the steps in which export has happened.
-      self._steps_kept.append(eval_result[DEFAULT_GLOBAL_STEP_KEY])
-      # Show warning for all the missed steps except the last one
-      if steps_missed[:-1]:
-        tf_logging.warn('Missed steps [%s] for exporting, as no evaluation'
-                        ' took place at them.', ', '.join(str(step) for step in
-                                                          steps_missed[:-1]))
-      # Log model export if the last missed step is the same as the current step
-      if steps_missed[-1] == eval_result[DEFAULT_GLOBAL_STEP_KEY]:
-        tf_logging.info('Performing model export at step %d.',
-                        eval_result[DEFAULT_GLOBAL_STEP_KEY])
-      # Show warning for exporting model at another step instead of the user
-      #   specified one
-      else:
-        tf_logging.warn('Performing model export at step %d instead of %d, as'
-                        ' no evaluation took place at step %d.',
-                        eval_result[DEFAULT_GLOBAL_STEP_KEY], steps_missed[-1],
-                        steps_missed[-1])
-    return export_result
-
-  def _get_kept_steps(self, event_files):
-    """Get the steps that the model was evaluated at, from event files.
-
-    Args:
-      event_files: Absolute pattern of event files.
+from tensorflow_estimator.contrib.estimator.python.estimator import exporter
 
-    Returns:
-      steps_kept: A list of steps in which the model was evaluated.
-    """
-    if not event_files:
-      return None
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+exporter.__all__ = [s for s in dir(exporter) if not s.startswith('__')]
 
-    steps_kept = []
-    for event_file in gfile.Glob(os.path.join(event_files)):
-      for event in summary_iterator.summary_iterator(event_file):
-        if event.step not in steps_kept:
-          steps_kept.append(event.step)
-    return steps_kept
+from tensorflow_estimator.contrib.estimator.python.estimator.exporter import *
diff --git a/tensorflow/contrib/estimator/python/estimator/exporter_test.py b/tensorflow/contrib/estimator/python/estimator/exporter_test.py
deleted file mode 100644
index 0d009b945e748394074a7278833abb1e12b15e7b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/exporter_test.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for `StepsExporter`."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import shutil
-import tempfile
-
-from tensorflow.contrib.estimator.python.estimator import exporter as exporter_lib
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-
-
-class StepsExporterTest(test.TestCase):
-
-  def test_error_out_if_steps_to_keep_has_no_positive_integers(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    with self.assertRaisesRegexp(ValueError, "positive integer"):
-      exporter = exporter_lib.StepsExporter(
-          name="specified_steps_exporter",
-          serving_input_receiver_fn=_serving_input_receiver_fn,
-          steps_to_keep=[-1, 0, 1.1])
-      self.assertEqual("specified_steps_exporter", exporter.name)
-
-  def test_steps_exporter(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp()
-    gfile.MkDir(export_dir_base)
-    gfile.MkDir(export_dir_base + "/export")
-    gfile.MkDir(export_dir_base + "/eval")
-
-    exporter = exporter_lib.StepsExporter(
-        name="steps_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        steps_to_keep=[1])
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.export_savedmodel.return_value = "export_result_path"
-    estimator.model_dir = export_dir_base
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 1},
-                                    False)
-
-    self.assertEqual("export_result_path", export_result)
-    estimator.export_savedmodel.assert_called_with(
-        export_dir_base,
-        _serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        checkpoint_path="checkpoint_path",
-        strip_default_attrs=True)
-
-    shutil.rmtree(export_dir_base, ignore_errors=True)
-
-  def test_steps_exporter_with_preemption(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp()
-    gfile.MkDir(export_dir_base)
-    gfile.MkDir(export_dir_base + "/export")
-    gfile.MkDir(export_dir_base + "/eval")
-
-    eval_dir_base = os.path.join(export_dir_base, "eval_continuous")
-    estimator_lib._write_dict_to_summary(eval_dir_base, {}, 1)
-    estimator_lib._write_dict_to_summary(eval_dir_base, {}, 2)
-
-    exporter = exporter_lib.StepsExporter(
-        name="steps_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        event_file_pattern="eval_continuous/*.tfevents.*",
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        steps_to_keep=[1, 2, 6, 8])
-
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.model_dir = export_dir_base
-    estimator.export_savedmodel.return_value = "export_result_path"
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 3},
-                                    False)
-    self.assertEqual(None, export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 6},
-                                    False)
-    self.assertEqual("export_result_path", export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 7},
-                                    False)
-    self.assertEqual(None, export_result)
-
-    shutil.rmtree(export_dir_base, ignore_errors=True)
-
-  def test_specified_step_is_saved(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp()
-    gfile.MkDir(export_dir_base)
-    gfile.MkDir(export_dir_base + "/export")
-    gfile.MkDir(export_dir_base + "/eval")
-
-    exporter = exporter_lib.StepsExporter(
-        name="steps_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        steps_to_keep=[1, 5, 8, 10, 11])
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.export_savedmodel.return_value = "export_result_path"
-    estimator.model_dir = export_dir_base
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 1},
-                                    False)
-
-    self.assertTrue(estimator.export_savedmodel.called)
-    self.assertEqual("export_result_path", export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 2},
-                                    False)
-    self.assertEqual(None, export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 5},
-                                    False)
-    self.assertTrue(estimator.export_savedmodel.called)
-    self.assertEqual("export_result_path", export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 10},
-                                    False)
-    self.assertTrue(estimator.export_savedmodel.called)
-    self.assertEqual("export_result_path", export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 15},
-                                    False)
-    self.assertTrue(estimator.export_savedmodel.called)
-    self.assertEqual("export_result_path", export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"global_step": 20},
-                                    False)
-    self.assertEqual(None, export_result)
-
-    shutil.rmtree(export_dir_base, ignore_errors=True)
-
-  def test_steps_exporter_with_no_global_step_key(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp()
-    gfile.MkDir(export_dir_base)
-    gfile.MkDir(export_dir_base + "/export")
-    gfile.MkDir(export_dir_base + "/eval")
-
-    exporter = exporter_lib.StepsExporter(
-        name="steps_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        steps_to_keep=[1])
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.export_savedmodel.return_value = "export_result_path"
-    estimator.model_dir = export_dir_base
-
-    with self.assertRaisesRegexp(ValueError, "does not have global step"):
-      exporter.export(estimator, export_dir_base, "checkpoint_path", {}, False)
-
-    shutil.rmtree(export_dir_base, ignore_errors=True)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index e3c44bea663969b5f251275ca10676d1cd567de2..9ab9bc7a8e731dbc96dbfe1a6ca0d18036d40271 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,346 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Extenders of tf.estimator.Estimator."""
+"""extenders python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export.export_output import PredictOutput
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
-from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.util import function_utils
-
-
-_VALID_METRIC_FN_ARGS = set(['features', 'labels', 'predictions', 'config'])
-
-
-def add_metrics(estimator, metric_fn):
-  """Creates a new `tf.estimator.Estimator` which has given metrics.
-
-  Example:
-
-  ```python
-    def my_auc(labels, predictions):
-      return {'auc': tf.metrics.auc(labels, predictions['logistic'])}
-
-    estimator = tf.estimator.DNNClassifier(...)
-    estimator = tf.contrib.estimator.add_metrics(estimator, my_auc)
-    estimator.train(...)
-    estimator.evaluate(...)
-  ```
-  Example usage of custom metric which uses features:
-
-  ```python
-    def my_auc(features, labels, predictions):
-      return {'auc': tf.metrics.auc(
-        labels, predictions['logistic'], weights=features['weight'])}
-
-    estimator = tf.estimator.DNNClassifier(...)
-    estimator = tf.contrib.estimator.add_metrics(estimator, my_auc)
-    estimator.train(...)
-    estimator.evaluate(...)
-  ```
-
-  Args:
-    estimator: A `tf.estimator.Estimator` object.
-    metric_fn: A function which should obey the following signature:
-      - Args: can only have following four arguments in any order:
-        * predictions: Predictions `Tensor` or dict of `Tensor` created by given
-          `estimator`.
-        * features: Input `dict` of `Tensor` objects created by `input_fn` which
-          is given to `estimator.evaluate` as an argument.
-        * labels:  Labels `Tensor` or dict of `Tensor` created by `input_fn`
-          which is given to `estimator.evaluate` as an argument.
-        * config: config attribute of the `estimator`.
-       - Returns:
-         Dict of metric results keyed by name. Final metrics are a union of this
-         and `estimator's` existing metrics. If there is a name conflict between
-         this and `estimator`s existing metrics, this will override the existing
-         one. The values of the dict are the results of calling a metric
-         function, namely a `(metric_tensor, update_op)` tuple.
-
-  Returns:
-      A new `tf.estimator.Estimator` which has a union of original metrics with
-        given ones.
-  """
-  _verify_metric_fn_args(metric_fn)
-
-  def new_model_fn(features, labels, mode, config):
-    spec = estimator.model_fn(features, labels, mode, config)
-    if mode != model_fn_lib.ModeKeys.EVAL:
-      return spec
-    new_metrics = _call_metric_fn(metric_fn, features, labels, spec.predictions,
-                                  config)
-    all_metrics = spec.eval_metric_ops or {}
-    all_metrics.update(new_metrics)
-    return spec._replace(eval_metric_ops=all_metrics)
-
-  return estimator_lib.Estimator(
-      model_fn=new_model_fn,
-      model_dir=estimator.model_dir,
-      config=estimator.config,
-      # pylint: disable=protected-access
-      warm_start_from=estimator._warm_start_settings)
-      # pylint: enable=protected-access
-
-
-def clip_gradients_by_norm(optimizer, clip_norm):
-  """Returns an optimizer which clips gradients before applying them.
-
-  Example:
-
-  ```python
-  optimizer = tf.train.ProximalAdagradOptimizer(
-      learning_rate=0.1,
-      l1_regularization_strength=0.001)
-  optimizer = tf.contrib.estimator.clip_gradients_by_norm(
-      optimizer, clip_norm)
-  estimator = tf.estimator.DNNClassifier(
-      feature_columns=[...],
-      hidden_units=[1024, 512, 256],
-      optimizer=optimizer)
-  ```
-
-  Args:
-    optimizer: An `tf.Optimizer` object to apply gradients.
-    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
-
-  Returns:
-    A `tf.Optimizer`.
-  """
-
-  def clip_grads(grads_and_vars):
-    gradients, variables = zip(*grads_and_vars)
-    gradients = clip_ops.clip_by_global_norm(gradients, clip_norm)[0]
-    grads_and_vars = list(zip(gradients, variables))
-    return grads_and_vars
-
-  return _TransformGradients(
-      optimizer=optimizer,
-      transform_grads_fn=clip_grads,
-      name='ClipByNorm' + optimizer.get_name())
-
-
-def forward_features(estimator, keys=None, sparse_default_values=None):
-  """Forward features to predictions dictionary.
-
-  In some cases, user wants to see some of the features in estimators prediction
-  output. As an example, consider a batch prediction service: The service simply
-  runs inference on the users graph and returns the results. Keys are essential
-  because there is no order guarantee on the outputs so they need to be rejoined
-  to the inputs via keys or transclusion of the inputs in the outputs.
-  Example:
-  ```python
-    def input_fn():
-      features, labels = ...
-      features['unique_example_id'] = ...
-      features, labels
-    estimator = tf.estimator.LinearClassifier(...)
-    estimator = tf.contrib.estimator.forward_features(
-        estimator, 'unique_example_id')
-    estimator.train(...)
-    assert 'unique_example_id' in estimator.predict(...)
-  ```
-  Args:
-    estimator: A `tf.estimator.Estimator` object.
-    keys: A `string` or a `list` of `string`. If it is `None`, all of the
-      `features` in `dict` is forwarded to the `predictions`. If it is a
-      `string`, only given key is forwarded. If it is a `list` of strings, all
-      the given `keys` are forwarded.
-    sparse_default_values: A dict of `str` keys mapping the name of the sparse
-      features to be converted to dense, to the default value to use. Only
-      sparse features indicated in the dictionary are converted to dense and the
-      provided default value is used.
-
-  Returns:
-      A new `tf.estimator.Estimator` which forwards features to predictions.
-  Raises:
-    ValueError:
-      * if `keys` is already part of `predictions`. We don't allow
-        override.
-      * if 'keys' does not exist in `features`.
-    TypeError: if `keys` type is not one of `string` or list/tuple of `string`.
-  """
-
-  def verify_key_types(keys):  # pylint: disable=missing-docstring
-    if keys is None:
-      return keys
-    if isinstance(keys, six.string_types):
-      return [keys]
-    if not isinstance(keys, (list, tuple)):
-      raise TypeError('keys should be either a string or a list of strings. '
-                      'Given: {}'.format(type(keys)))
-    for key in keys:
-      if not isinstance(key, six.string_types):
-        raise TypeError('All items in the given keys list should be a string. '
-                        'There exist an item with type: {}'.format(type(key)))
-    return keys
-
-  def get_keys(features):
-    if keys is None:
-      return features.keys()
-    return keys
-
-  def verify_keys_and_predictions(features, predictions):
-    if not isinstance(predictions, dict):
-      raise ValueError(
-          'Predictions should be a dict to be able to forward features. '
-          'Given: {}'.format(type(predictions)))
-    for key in get_keys(features):
-      if key not in features:
-        raise ValueError(
-            'keys should be exist in features. Key "{}" is not in features '
-            'dict. features dict has following keys: {}. Please check '
-            'arguments of forward_features.'.format(key, features.keys()))
-      if key in predictions:
-        raise ValueError(
-            'Cannot forward feature key ({}). Since it does exist in '
-            'predictions. Existing prediction keys: {}. Please check arguments '
-            'of forward_features.'.format(key, predictions.keys()))
-
-  keys = verify_key_types(keys)
-
-  def new_model_fn(features, labels, mode, config):  # pylint: disable=missing-docstring
-    spec = estimator.model_fn(features, labels, mode, config)
-    predictions = spec.predictions
-    if predictions is None:
-      return spec
-    verify_keys_and_predictions(features, predictions)
-    for key in get_keys(features):
-      feature = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
-          features[key])
-      if sparse_default_values and (key in sparse_default_values):
-        if not isinstance(feature, sparse_tensor_lib.SparseTensor):
-          raise ValueError(
-              'Feature ({}) is expected to be a `SparseTensor`.'.format(key))
-        feature = sparse_ops.sparse_tensor_to_dense(
-            feature, default_value=sparse_default_values[key])
-      if not isinstance(feature, ops.Tensor):
-        raise ValueError(
-            'Feature ({}) should be a Tensor. Please use `keys` '
-            'argument of forward_features to filter unwanted features, or'
-            'add key to argument `sparse_default_values`.'
-            'Type of features[{}] is {}.'.format(key, key, type(feature)))
-      predictions[key] = feature
-    spec = spec._replace(predictions=predictions)
-    if spec.export_outputs:
-      for ekey in ['predict', 'serving_default']:
-        if (ekey in spec.export_outputs and
-            isinstance(spec.export_outputs[ekey],
-                       PredictOutput)):
-          export_outputs = spec.export_outputs[ekey].outputs
-          for key in get_keys(features):
-            export_outputs[key] = predictions[key]
-
-    return spec
-
-  return estimator_lib.Estimator(
-      model_fn=new_model_fn,
-      model_dir=estimator.model_dir,
-      config=estimator.config)
-
-
-class _TransformGradients(optimizer_lib.Optimizer):
-  """Add given gradient transformation to the optimizer."""
-
-  def __init__(self, optimizer, transform_grads_fn, name=None):
-    """Construct an `tf.Optimizer` wrapper to apply given transformations.
-
-    Example:
-
-    ```python
-    optimizer = tf.train.ProximalAdagradOptimizer(
-        learning_rate=0.1,
-        l1_regularization_strength=0.001)
-    def clip_grads(grads_and_vars):
-      gradients, variables = zip(*grads_and_vars)
-      gradients = tf.clip_by_global_norm(grads, my_norm)[0]
-      grads_and_vars = list(zip(gradients, variables))
-      return grads_and_vars
-    optimizer = _TransformGradients(
-        opt=optimizer, transform_grads_fn=clip_grads)
-    estimator = tf.estimator.DNNClassifier(
-        feature_columns=[...],
-        hidden_units=[1024, 512, 256],
-        optimizer=optimizer)
-    ```
-
-    Args:
-      optimizer: An `tf.Optimizer` object to apply gradients.
-      transform_grads_fn: A function which takes a single argument, a list of
-        gradient to variable pairs (tuples), performs any requested gradient
-        updates, such as gradient clipping or multipliers, and returns the
-        updated list.
-      name: A string which will be used for debugging purposes.
-    """
-    super(_TransformGradients, self).__init__(
-        use_locking=False, name=name or optimizer.get_name())
-    self._optimizer = optimizer
-    self._transform_grads_fn = transform_grads_fn
-
-  def compute_gradients(self, *args, **kwargs):
-    """See `tf.Optimizer`."""
-    return self._optimizer.compute_gradients(*args, **kwargs)
-
-  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-    """Apply gradients to variables.
-
-    Calls `transform_grads_fn`, and then applies the real optimizer.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs as returned by
-        compute_gradients().
-      global_step: Optional Variable to increment by one after the
-        variables have been updated.
-      name: Optional name for the returned operation.  Default to the
-        name passed to the Optimizer constructor.
-
-    Returns:
-      An `Operation` that applies the gradients. If `global_step` was not None,
-      that operation also increments `global_step`.
-
-    Raises:
-      ValueError: If the grads_and_vars is malformed.
-    """
-    grads_and_vars = self._transform_grads_fn(grads_and_vars)
-    return self._optimizer.apply_gradients(grads_and_vars, global_step, name)
-
-  def get_slot(self, *args, **kwargs):
-    """See `tf.Optimizer`."""
-    return self._optimizer.get_slot(*args, **kwargs)
-
-  def get_slot_names(self, *args, **kwargs):
-    """See `tf.Optimizer`."""
-    return self._optimizer.get_slot_names(*args, **kwargs)
-
-
-def _verify_metric_fn_args(metric_fn):
-  args = set(function_utils.fn_args(metric_fn))
-  invalid_args = list(args - _VALID_METRIC_FN_ARGS)
-  if invalid_args:
-    raise ValueError('metric_fn (%s) has following not expected args: %s' %
-                     (metric_fn, invalid_args))
+from tensorflow_estimator.contrib.estimator.python.estimator import extenders
 
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+extenders.__all__ = [s for s in dir(extenders) if not s.startswith('__')]
 
-def _call_metric_fn(metric_fn, features, labels, predictions, config):
-  """Calls metric fn with proper arguments."""
-  metric_fn_args = function_utils.fn_args(metric_fn)
-  kwargs = {}
-  if 'features' in metric_fn_args:
-    kwargs['features'] = features
-  if 'labels' in metric_fn_args:
-    kwargs['labels'] = labels
-  if 'predictions' in metric_fn_args:
-    kwargs['predictions'] = predictions
-  if 'config' in metric_fn_args:
-    kwargs['config'] = config
-  return metric_fn(**kwargs)
+from tensorflow_estimator.contrib.estimator.python.estimator.extenders import *
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders_test.py b/tensorflow/contrib/estimator/python/estimator/extenders_test.py
deleted file mode 100644
index c8fdaa8791b83e54d69993cfed3205d6d343ed19..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/extenders_test.py
+++ /dev/null
@@ -1,426 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""extenders tests."""
-
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import tempfile
-import numpy as np
-
-from tensorflow.contrib.estimator.python.estimator import extenders
-from tensorflow.contrib.layers.python.layers import layers
-from tensorflow.contrib.predictor import from_saved_model
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator_lib
-from tensorflow.python.estimator.canned import linear
-from tensorflow.python.feature_column import feature_column as fc
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.training import training
-from tensorflow.python.util import compat
-
-
-def get_input_fn(x, y):
-
-  def input_fn():
-    dataset = dataset_ops.Dataset.from_tensor_slices({'x': x, 'y': y})
-    iterator = dataset.make_one_shot_iterator()
-    features = iterator.get_next()
-    labels = features.pop('y')
-    return features, labels
-
-  return input_fn
-
-
-class AddMetricsTest(test.TestCase):
-
-  def test_should_add_metrics(self):
-    input_fn = get_input_fn(
-        x=np.arange(4)[:, None, None], y=np.ones(4)[:, None])
-    estimator = linear.LinearClassifier([fc.numeric_column('x')])
-
-    def metric_fn(features):
-      return {'mean_x': metrics_lib.mean(features['x'])}
-
-    estimator = extenders.add_metrics(estimator, metric_fn)
-
-    estimator.train(input_fn=input_fn)
-    metrics = estimator.evaluate(input_fn=input_fn)
-    self.assertIn('mean_x', metrics)
-    self.assertEqual(1.5, metrics['mean_x'])
-    # assert that it keeps original estimators metrics
-    self.assertIn('auc', metrics)
-
-  def test_should_error_out_for_not_recognized_args(self):
-    estimator = linear.LinearClassifier([fc.numeric_column('x')])
-
-    def metric_fn(features, not_recognized):
-      _, _ = features, not_recognized
-      return {}
-
-    with self.assertRaisesRegexp(ValueError, 'not_recognized'):
-      estimator = extenders.add_metrics(estimator, metric_fn)
-
-  def test_all_supported_args(self):
-    input_fn = get_input_fn(x=[[[0.]]], y=[[[1]]])
-    estimator = linear.LinearClassifier([fc.numeric_column('x')])
-
-    def metric_fn(features, predictions, labels, config):
-      self.assertIn('x', features)
-      self.assertIsNotNone(labels)
-      self.assertIn('logistic', predictions)
-      self.assertTrue(isinstance(config, estimator_lib.RunConfig))
-      return {}
-
-    estimator = extenders.add_metrics(estimator, metric_fn)
-
-    estimator.train(input_fn=input_fn)
-    estimator.evaluate(input_fn=input_fn)
-
-  def test_all_supported_args_in_different_order(self):
-    input_fn = get_input_fn(x=[[[0.]]], y=[[[1]]])
-    estimator = linear.LinearClassifier([fc.numeric_column('x')])
-
-    def metric_fn(labels, config, features, predictions):
-      self.assertIn('x', features)
-      self.assertIsNotNone(labels)
-      self.assertIn('logistic', predictions)
-      self.assertTrue(isinstance(config, estimator_lib.RunConfig))
-      return {}
-
-    estimator = extenders.add_metrics(estimator, metric_fn)
-
-    estimator.train(input_fn=input_fn)
-    estimator.evaluate(input_fn=input_fn)
-
-  def test_all_args_are_optional(self):
-    input_fn = get_input_fn(x=[[[0.]]], y=[[[1]]])
-    estimator = linear.LinearClassifier([fc.numeric_column('x')])
-
-    def metric_fn():
-      return {'two': metrics_lib.mean(constant_op.constant([2.]))}
-
-    estimator = extenders.add_metrics(estimator, metric_fn)
-
-    estimator.train(input_fn=input_fn)
-    metrics = estimator.evaluate(input_fn=input_fn)
-    self.assertEqual(2., metrics['two'])
-
-  def test_overrides_existing_metrics(self):
-    input_fn = get_input_fn(x=[[[0.]]], y=[[[1]]])
-    estimator = linear.LinearClassifier([fc.numeric_column('x')])
-    estimator.train(input_fn=input_fn)
-    metrics = estimator.evaluate(input_fn=input_fn)
-    self.assertNotEqual(2., metrics['auc'])
-
-    def metric_fn():
-      return {'auc': metrics_lib.mean(constant_op.constant([2.]))}
-
-    estimator = extenders.add_metrics(estimator, metric_fn)
-    metrics = estimator.evaluate(input_fn=input_fn)
-    self.assertEqual(2., metrics['auc'])
-
-
-class ClipGradientsByNormTest(test.TestCase):
-  """Tests clip_gradients_by_norm."""
-
-  def test_applies_norm(self):
-    optimizer = extenders.clip_gradients_by_norm(
-        training.GradientDescentOptimizer(1.0), clip_norm=3.)
-    with ops.Graph().as_default():
-      w = variables.Variable(1., name='weight')
-      x = constant_op.constant(5.)
-      y = -x * w
-      grads = optimizer.compute_gradients(y, var_list=[w])[0]
-      opt_op = optimizer.minimize(y, var_list=[w])
-      with training.MonitoredSession() as sess:
-        grads_value = sess.run(grads)
-        self.assertEqual(-5., grads_value[0])
-        sess.run(opt_op)
-        new_w = sess.run(w)
-        self.assertEqual(4., new_w)  # 1 + 1*3 (w - lr * clipped_grad)
-
-  def test_name(self):
-    optimizer = extenders.clip_gradients_by_norm(
-        training.GradientDescentOptimizer(1.0), clip_norm=3.)
-    self.assertEqual('ClipByNormGradientDescent', optimizer.get_name())
-
-
-class ForwardFeaturesTest(test.TestCase):
-  """Tests forward_features."""
-
-  def _export_estimator(self, estimator, serving_input_fn):
-    tmpdir = tempfile.mkdtemp()
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = estimator.export_savedmodel(export_dir_base, serving_input_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-    return export_dir, tmpdir
-
-  def make_dummy_input_fn(self):
-    def _input_fn():
-      dataset = dataset_ops.Dataset.from_tensors({
-          'x': [[3.], [5.]],
-          'id': [[101], [102]],
-          'sparse_id': sparse_tensor.SparseTensor(
-              values=[1, 2, 3],
-              indices=[[0, 0], [1, 0], [1, 1]],
-              dense_shape=[2, 2]),
-          'labels': [[1.], [2.]]
-      })
-      def _split(x):
-        labels = x.pop('labels')
-        return x, labels
-      dataset = dataset.map(_split)
-      return dataset
-    return _input_fn
-
-  def test_forward_keys(self):
-
-    input_fn = self.make_dummy_input_fn()
-    estimator = linear.LinearRegressor([fc.numeric_column('x')])
-    estimator.train(input_fn=input_fn, steps=1)
-
-    forwarded_keys = ['id', 'sparse_id']
-
-    for key in forwarded_keys:
-      self.assertNotIn(key, next(estimator.predict(input_fn=input_fn)))
-
-    estimator = extenders.forward_features(
-        estimator, forwarded_keys, sparse_default_values={'sparse_id': 1})
-
-    expected_results = [101, 2, 102, 5]
-    predictions = estimator.predict(input_fn=input_fn)
-    for _ in range(2):
-      prediction = next(predictions)
-      for key in forwarded_keys:
-        self.assertIn(key, prediction)
-        self.assertEqual(expected_results.pop(0), sum(prediction[key]))
-
-  def test_forward_in_exported(self):
-
-    def serving_input_fn():
-      features_ph = {
-          'x': array_ops.placeholder(dtypes.float32, [None]),
-          'id': array_ops.placeholder(dtypes.int32, [None])
-      }
-      features = {
-          key: array_ops.expand_dims(tensor, -1)
-          for key, tensor in features_ph.items()
-      }
-      return estimator_lib.export.ServingInputReceiver(features, features_ph)
-    def input_fn():
-      return {'x': [[3.], [5.]], 'id': [[101], [102]]}, [[1.], [2.]]
-    # create estimator
-    feature_columns = [fc.numeric_column('x')]
-    estimator = linear.LinearRegressor(feature_columns)
-    estimator.train(input_fn=input_fn, steps=1)
-    estimator = extenders.forward_features(estimator, 'id')
-
-    # export saved model
-    export_dir, tmpdir = self._export_estimator(estimator, serving_input_fn)
-
-    # restore model
-    predict_fn = from_saved_model(export_dir, signature_def_key='predict')
-    predictions = predict_fn({'x': [3], 'id': [101]})
-
-    # verify that 'id' exists in predictions
-    self.assertIn('id', predictions)
-    self.assertEqual(101, predictions['id'])
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_forward_in_exported_sparse(self):
-    features_columns = [fc.indicator_column(
-        fc.categorical_column_with_vocabulary_list('x', range(10)))]
-
-    classifier = linear.LinearClassifier(feature_columns=features_columns)
-
-    def train_input_fn():
-      dataset = dataset_ops.Dataset.from_tensors({
-          'x': sparse_tensor.SparseTensor(
-              values=[1, 2, 3],
-              indices=[[0, 0], [1, 0], [1, 1]],
-              dense_shape=[2, 2]),
-          'labels': [[0], [1]]
-      })
-      def _split(x):
-        labels = x.pop('labels')
-        return x, labels
-      dataset = dataset.map(_split)
-      return dataset
-
-    classifier.train(train_input_fn, max_steps=1)
-
-    classifier = extenders.forward_features(
-        classifier, keys=['x'], sparse_default_values={'x': 0})
-
-    def serving_input_fn():
-      features_ph = array_ops.placeholder(dtype=dtypes.int32, name='x',
-                                          shape=[None])
-      features = {'x': layers.dense_to_sparse(features_ph)}
-      return estimator_lib.export.ServingInputReceiver(features,
-                                                       {'x': features_ph})
-    export_dir, tmpdir = self._export_estimator(classifier, serving_input_fn)
-    prediction_fn = from_saved_model(export_dir, signature_def_key='predict')
-
-    features = (0, 2)
-    prediction = prediction_fn({'x': features})
-
-    self.assertIn('x', prediction)
-    self.assertEqual(features, tuple(prediction['x']))
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_forward_list(self):
-
-    def input_fn():
-      return {'x': [[3.], [5.]], 'id': [[101], [102]]}, [[1.], [2.]]
-
-    estimator = linear.LinearRegressor([fc.numeric_column('x')])
-    estimator.train(input_fn=input_fn, steps=1)
-
-    self.assertNotIn('id', next(estimator.predict(input_fn=input_fn)))
-    estimator = extenders.forward_features(estimator, ['x', 'id'])
-    predictions = next(estimator.predict(input_fn=input_fn))
-    self.assertIn('id', predictions)
-    self.assertIn('x', predictions)
-    self.assertEqual(101, predictions['id'])
-    self.assertEqual(3., predictions['x'])
-
-  def test_forward_all(self):
-
-    def input_fn():
-      return {'x': [[3.], [5.]], 'id': [[101], [102]]}, [[1.], [2.]]
-
-    estimator = linear.LinearRegressor([fc.numeric_column('x')])
-    estimator.train(input_fn=input_fn, steps=1)
-
-    self.assertNotIn('id', next(estimator.predict(input_fn=input_fn)))
-    self.assertNotIn('x', next(estimator.predict(input_fn=input_fn)))
-    estimator = extenders.forward_features(estimator)
-    predictions = next(estimator.predict(input_fn=input_fn))
-    self.assertIn('id', predictions)
-    self.assertIn('x', predictions)
-    self.assertEqual(101, predictions['id'])
-    self.assertEqual(3., predictions['x'])
-
-  def test_key_should_be_string(self):
-    estimator = linear.LinearRegressor([fc.numeric_column('x')])
-    with self.assertRaisesRegexp(TypeError, 'keys should be either a string'):
-      extenders.forward_features(estimator, estimator)
-
-  def test_key_should_be_list_of_string(self):
-    estimator = linear.LinearRegressor([fc.numeric_column('x')])
-    with self.assertRaisesRegexp(TypeError, 'should be a string'):
-      extenders.forward_features(estimator, ['x', estimator])
-
-  def test_key_should_be_in_features(self):
-    def input_fn():
-      return {'x': [[3.], [5.]], 'id': [[101], [102]]}, [[1.], [2.]]
-
-    estimator = linear.LinearRegressor([fc.numeric_column('x')])
-    estimator.train(input_fn=input_fn, steps=1)
-
-    estimator = extenders.forward_features(estimator, 'y')
-    with self.assertRaisesRegexp(ValueError,
-                                 'keys should be exist in features'):
-      next(estimator.predict(input_fn=input_fn))
-
-  def test_forwarded_feature_should_not_be_a_sparse_tensor(self):
-    def input_fn():
-      return {
-          'x': [[3.], [5.]],
-          'id': sparse_tensor.SparseTensor(
-              values=['1', '2'],
-              indices=[[0, 0], [1, 0]],
-              dense_shape=[2, 1])
-          }, [[1.], [2.]]
-
-    estimator = linear.LinearRegressor([fc.numeric_column('x')])
-    estimator.train(input_fn=input_fn, steps=1)
-
-    estimator = extenders.forward_features(estimator)
-    with self.assertRaisesRegexp(ValueError,
-                                 'Feature .* should be a Tensor.*'):
-      next(estimator.predict(input_fn=input_fn))
-
-  def test_forwarded_feature_should_be_a_sparse_tensor(self):
-    input_fn = self.make_dummy_input_fn()
-
-    estimator = linear.LinearRegressor([fc.numeric_column('x')])
-    estimator.train(input_fn=input_fn, steps=1)
-
-    estimator = extenders.forward_features(
-        estimator, sparse_default_values={'id': 0, 'sparse_id': 0})
-    with self.assertRaisesRegexp(
-        ValueError, 'Feature .* is expected to be a `SparseTensor`.'):
-      next(estimator.predict(input_fn=input_fn))
-
-  def test_predictions_should_be_dict(self):
-    def input_fn():
-      return {'x': [[3.], [5.]], 'id': [[101], [102]]}
-
-    def model_fn(features, mode):
-      del features
-      global_step = training.get_global_step()
-      return estimator_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant([5.]),
-          predictions=constant_op.constant([5.]),
-          train_op=global_step.assign_add(1))
-
-    estimator = estimator_lib.Estimator(model_fn=model_fn)
-    estimator.train(input_fn=input_fn, steps=1)
-
-    estimator = extenders.forward_features(estimator)
-    with self.assertRaisesRegexp(ValueError, 'Predictions should be a dict'):
-      next(estimator.predict(input_fn=input_fn))
-
-  def test_should_not_conflict_with_existing_predictions(self):
-
-    def input_fn():
-      return {'x': [[3.], [5.]], 'id': [[101], [102]]}
-
-    def model_fn(features, mode):
-      del features
-      global_step = training.get_global_step()
-      return estimator_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant([5.]),
-          predictions={'x': constant_op.constant([5.])},
-          train_op=global_step.assign_add(1))
-
-    estimator = estimator_lib.Estimator(model_fn=model_fn)
-    estimator.train(input_fn=input_fn, steps=1)
-
-    estimator = extenders.forward_features(estimator)
-    with self.assertRaisesRegexp(ValueError, 'Cannot forward feature key'):
-      next(estimator.predict(input_fn=input_fn))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index 34f765d56546d3cd10fcde5ac444a221c73602cd..92144d394b5bfd04b88ea4033413fe976b61c85a 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,966 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Abstractions for the head(s) of a model."""
+"""head python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.summary import summary
-from tensorflow.python.training import training_util
-
-_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-
-
-def multi_class_head(n_classes,
-                     weight_column=None,
-                     label_vocabulary=None,
-                     loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
-                     loss_fn=None,
-                     name=None):
-  """Creates a `_Head` for multi class classification.
-
-  Uses `sparse_softmax_cross_entropy` loss.
-
-  The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`.
-  In many applications, the shape is `[batch_size, n_classes]`.
-
-  `labels` must be a dense `Tensor` with shape matching `logits`, namely
-  `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
-  `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
-  `labels` must be an integer `Tensor` with values specifying the class index.
-
-  If `weight_column` is specified, weights must be of shape
-  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
-
-  The loss is the weighted sum over the input dimensions. Namely, if the input
-  labels have shape `[batch_size, 1]`, the loss is the weighted sum over
-  `batch_size`.
-
-  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
-  `(labels, logits, features)` as arguments and returns unreduced loss with
-  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support integer `labels` with
-  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
-  the input labels before passing them to `loss_fn`.
-
-  The head can be used with a canned estimator. Example:
-
-  ```python
-  my_head = tf.contrib.estimator.multi_class_head(n_classes=3)
-  my_estimator = tf.contrib.estimator.DNNEstimator(
-      head=my_head,
-      hidden_units=...,
-      feature_columns=...)
-  ```
-
-  It can also be used with a custom `model_fn`. Example:
-
-  ```python
-  def _my_model_fn(features, labels, mode):
-    my_head = tf.contrib.estimator.multi_class_head(n_classes=3)
-    logits = tf.keras.Model(...)(features)
-
-    return my_head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
-        logits=logits)
-
-  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
-  ```
-
-  Args:
-    n_classes: Number of classes, must be greater than 2 (for 2 classes, use
-      `binary_classification_head`).
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.
-    label_vocabulary: A list or tuple of strings representing possible label
-      values. If it is not given, that means labels are already encoded as an
-      integer within [0, n_classes). If given, labels must be of string type and
-      have any value in `label_vocabulary`. Note that errors will be raised if
-      `label_vocabulary` is not provided but labels are strings.
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
-      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
-    loss_fn: Optional loss function.
-    name: name of the head. If provided, summary and metrics keys will be
-      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-
-  Returns:
-    An instance of `_Head` for multi class classification.
-
-  Raises:
-    ValueError: if `n_classes`, `label_vocabulary` or `loss_reduction` is
-      invalid.
-  """
-  return head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint:disable=protected-access
-      n_classes=n_classes,
-      weight_column=weight_column,
-      label_vocabulary=label_vocabulary,
-      loss_reduction=loss_reduction,
-      loss_fn=loss_fn,
-      name=name)
-
-
-def binary_classification_head(
-    weight_column=None,
-    thresholds=None,
-    label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
-    loss_fn=None,
-    name=None):
-  """Creates a `_Head` for single label binary classification.
-
-  This head uses `sigmoid_cross_entropy_with_logits` loss.
-
-  The head expects `logits` with shape `[D0, D1, ... DN, 1]`.
-  In many applications, the shape is `[batch_size, 1]`.
-
-  `labels` must be a dense `Tensor` with shape matching `logits`, namely
-  `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
-  `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
-  `labels` must be float `Tensor` with values in the interval `[0, 1]`.
-
-  If `weight_column` is specified, weights must be of shape
-  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
-
-  The loss is the weighted sum over the input dimensions. Namely, if the input
-  labels have shape `[batch_size, 1]`, the loss is the weighted sum over
-  `batch_size`.
-
-  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
-  `(labels, logits, features)` as arguments and returns unreduced loss with
-  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support float `labels` with
-  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
-  the input labels before passing them to `loss_fn`.
-
-  The head can be used with a canned estimator. Example:
-
-  ```python
-  my_head = tf.contrib.estimator.binary_classification_head()
-  my_estimator = tf.contrib.estimator.DNNEstimator(
-      head=my_head,
-      hidden_units=...,
-      feature_columns=...)
-  ```
-
-  It can also be used with a custom `model_fn`. Example:
-
-  ```python
-  def _my_model_fn(features, labels, mode):
-    my_head = tf.contrib.estimator.binary_classification_head()
-    logits = tf.keras.Model(...)(features)
-
-    return my_head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
-        logits=logits)
-
-  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
-  ```
-
-  Args:
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.
-    thresholds: Iterable of floats in the range `(0, 1)`. For binary
-      classification metrics such as precision and recall, an eval metric is
-      generated for each threshold value. This threshold is applied to the
-      logistic values to determine the binary classification (i.e., above the
-      threshold is `true`, below is `false`.
-    label_vocabulary: A list or tuple of strings representing possible label
-      values. If it is not given, labels must be float with values within
-      [0, 1]. If given, labels must be string type and have any value in
-      `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
-      is not provided but labels are strings.
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
-      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
-    loss_fn: Optional loss function.
-    name: name of the head. If provided, summary and metrics keys will be
-      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-
-  Returns:
-    An instance of `_Head` for binary classification.
-
-  Raises:
-    ValueError: If `thresholds` contains a value outside of `(0, 1)`.
-    ValueError: If `loss_reduction` is invalid.
-  """
-  return head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint:disable=protected-access
-      weight_column=weight_column,
-      thresholds=thresholds,
-      label_vocabulary=label_vocabulary,
-      loss_reduction=loss_reduction,
-      loss_fn=loss_fn,
-      name=name)
-
-
-def regression_head(weight_column=None,
-                    label_dimension=1,
-                    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
-                    loss_fn=None,
-                    inverse_link_fn=None,
-                    name=None):
-  """Creates a `_Head` for regression using the `mean_squared_error` loss.
-
-  The loss is the weighted sum over all input dimensions. Namely, if the input
-  labels have shape `[batch_size, label_dimension]`, the loss is the weighted
-  sum over both `batch_size` and `label_dimension`.
-
-  The head expects `logits` with shape `[D0, D1, ... DN, label_dimension]`.
-  In many applications, the shape is `[batch_size, label_dimension]`.
-
-  The `labels` shape must match `logits`, namely
-  `[D0, D1, ... DN, label_dimension]`. If `label_dimension=1`, shape
-  `[D0, D1, ... DN]` is also supported.
-
-  If `weight_column` is specified, weights must be of shape
-  `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
-  `[D0, D1, ... DN, label_dimension]`.
-
-  Supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
-  `(labels, logits, features)` as arguments and returns unreduced loss with
-  shape `[D0, D1, ... DN, label_dimension]`.
-
-  Also supports custom `inverse_link_fn`, also known as 'mean function'.
-  `inverse_link_fn` is only used in `PREDICT` mode. It takes `logits` as
-  argument and returns predicted values. This function is the inverse of the
-  link function defined in
-  https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function
-  Namely, for poisson regression, set `inverse_link_fn=tf.exp`.
-
-  The head can be used with a canned estimator. Example:
-
-  ```python
-  my_head = tf.contrib.estimator.regression_head()
-  my_estimator = tf.contrib.estimator.DNNEstimator(
-      head=my_head,
-      hidden_units=...,
-      feature_columns=...)
-  ```
-
-  It can also be used with a custom `model_fn`. Example:
-
-  ```python
-  def _my_model_fn(features, labels, mode):
-    my_head = tf.contrib.estimator.regression_head()
-    logits = tf.keras.Model(...)(features)
-
-    return my_head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
-        logits=logits)
-
-  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
-  ```
-
-  Args:
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.
-    label_dimension: Number of regression labels per example. This is the size
-      of the last dimension of the labels `Tensor` (typically, this has shape
-      `[batch_size, label_dimension]`).
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch and label dimension. Defaults to
-      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
-      `batch size * label_dimension`. See `tf.losses.Reduction`.
-    loss_fn: Optional loss function. Defaults to `mean_squared_error`.
-    inverse_link_fn: Optional inverse link function, also known as 'mean
-      function'. Defaults to identity.
-    name: name of the head. If provided, summary and metrics keys will be
-      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-
-  Returns:
-    An instance of `_Head` for linear regression.
-
-  Raises:
-    ValueError: If `label_dimension` or `loss_reduction` is invalid.
-  """
-  return head_lib._regression_head(  # pylint:disable=protected-access
-      weight_column=weight_column,
-      label_dimension=label_dimension,
-      loss_reduction=loss_reduction,
-      loss_fn=loss_fn,
-      inverse_link_fn=inverse_link_fn,
-      name=name)
-
-
-def poisson_regression_head(
-    weight_column=None,
-    label_dimension=1,
-    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
-    compute_full_loss=True,
-    name=None):
-  """Creates a `_Head` for poisson regression using `tf.nn.log_poisson_loss`.
-
-  The loss is the weighted sum over all input dimensions. Namely, if the input
-  labels have shape `[batch_size, label_dimension]`, the loss is the weighted
-  sum over both `batch_size` and `label_dimension`.
-
-  The head expects `logits` with shape `[D0, D1, ... DN, label_dimension]`.
-  In many applications, the shape is `[batch_size, label_dimension]`.
-
-  The `labels` shape must match `logits`, namely
-  `[D0, D1, ... DN, label_dimension]`. If `label_dimension=1`, shape
-  `[D0, D1, ... DN]` is also supported.
-
-  If `weight_column` is specified, weights must be of shape
-  `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
-  `[D0, D1, ... DN, label_dimension]`.
-
-  This is implemented as a generalized linear model, see
-  https://en.wikipedia.org/wiki/Generalized_linear_model.
-
-  The head can be used with a canned estimator. Example:
-
-  ```python
-  my_head = tf.contrib.estimator.poisson_regression_head()
-  my_estimator = tf.contrib.estimator.DNNEstimator(
-      head=my_head,
-      hidden_units=...,
-      feature_columns=...)
-  ```
-
-  It can also be used with a custom `model_fn`. Example:
-
-  ```python
-  def _my_model_fn(features, labels, mode):
-    my_head = tf.contrib.estimator.poisson_regression_head()
-    logits = tf.keras.Model(...)(features)
-
-    return my_head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
-        logits=logits)
-
-  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
-  ```
-
-  Args:
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.
-    label_dimension: Number of regression labels per example. This is the size
-      of the last dimension of the labels `Tensor` (typically, this has shape
-      `[batch_size, label_dimension]`).
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch and label dimension. Defaults to
-      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
-      `batch size * label_dimension`. See `tf.losses.Reduction`.
-    compute_full_loss: Whether to include the constant `log(z!)` term in
-      computing the poisson loss. See `tf.nn.log_poisson_loss` for the full
-      documentation.
-    name: name of the head. If provided, summary and metrics keys will be
-      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-
-  Returns:
-    An instance of `_Head` for poisson regression.
-
-  Raises:
-    ValueError: If `label_dimension` or `loss_reduction` is invalid.
-  """
-  def _poisson_loss(labels, logits):
-    return nn.log_poisson_loss(
-        targets=labels, log_input=logits, compute_full_loss=compute_full_loss)
-  return head_lib._regression_head(  # pylint:disable=protected-access
-      weight_column=weight_column,
-      label_dimension=label_dimension,
-      loss_reduction=loss_reduction,
-      loss_fn=_poisson_loss,
-      inverse_link_fn=math_ops.exp,
-      name=name)
-
-
-def logistic_regression_head(
-    weight_column=None,
-    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
-    name=None):
-  """Creates a `_Head` for logistic regression.
-
-  Uses `sigmoid_cross_entropy_with_logits` loss, which is the same as
-  `binary_classification_head`. The differences compared to
-  `binary_classification_head` are:
-
-  * Does not support `label_vocabulary`. Instead, labels must be float in the
-    range [0, 1].
-  * Does not calculate some metrics that do not make sense, such as AUC.
-  * In `PREDICT` mode, only returns logits and predictions
-    (`=tf.sigmoid(logits)`), whereas `binary_classification_head` also returns
-    probabilities, classes, and class_ids.
-  * Export output defaults to `RegressionOutput`, whereas
-    `binary_classification_head` defaults to `PredictOutput`.
-
-  The head expects `logits` with shape `[D0, D1, ... DN, 1]`.
-  In many applications, the shape is `[batch_size, 1]`.
-
-  The `labels` shape must match `logits`, namely
-  `[D0, D1, ... DN]` or `[D0, D1, ... DN, 1]`.
-
-  If `weight_column` is specified, weights must be of shape
-  `[D0, D1, ... DN]` or `[D0, D1, ... DN, 1]`.
-
-  This is implemented as a generalized linear model, see
-  https://en.wikipedia.org/wiki/Generalized_linear_model.
-
-  The head can be used with a canned estimator. Example:
-
-  ```python
-  my_head = tf.contrib.estimator.logistic_regression_head()
-  my_estimator = tf.contrib.estimator.DNNEstimator(
-      head=my_head,
-      hidden_units=...,
-      feature_columns=...)
-  ```
-
-  It can also be used with a custom `model_fn`. Example:
-
-  ```python
-  def _my_model_fn(features, labels, mode):
-    my_head = tf.contrib.estimator.logistic_regression_head()
-    logits = tf.keras.Model(...)(features)
-
-    return my_head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
-        logits=logits)
-
-  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
-  ```
-
-  Args:
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch and label dimension. Defaults to
-      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
-      `batch size * label_dimension`. See `tf.losses.Reduction`.
-    name: name of the head. If provided, summary and metrics keys will be
-      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-
-  Returns:
-    An instance of `_Head` for logistic regression.
-
-  Raises:
-    ValueError: If `loss_reduction` is invalid.
-  """
-  def _logistic_loss(labels, logits):
-    labels = head_lib._assert_range(  # pylint:disable=protected-access
-        labels, n_classes=2, message='Labels must be in range [0, 1]')
-    return nn.sigmoid_cross_entropy_with_logits(
-        labels=labels, logits=logits)
-  return head_lib._regression_head(  # pylint:disable=protected-access
-      weight_column=weight_column,
-      label_dimension=1,
-      loss_reduction=loss_reduction,
-      loss_fn=_logistic_loss,
-      inverse_link_fn=math_ops.sigmoid,
-      name=name)
-
-
-def multi_label_head(n_classes,
-                     weight_column=None,
-                     thresholds=None,
-                     label_vocabulary=None,
-                     loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
-                     loss_fn=None,
-                     classes_for_class_based_metrics=None,
-                     name=None):
-  """Creates a `_Head` for multi-label classification.
-
-  Multi-label classification handles the case where each example may have zero
-  or more associated labels, from a discrete set. This is distinct from
-  `multi_class_head` which has exactly one label per example.
-
-  Uses `sigmoid_cross_entropy` loss average over classes and weighted sum over
-  the batch. Namely, if the input logits have shape `[batch_size, n_classes]`,
-  the loss is the average over `n_classes` and the weighted sum over
-  `batch_size`.
-
-  The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`. In many
-  applications, the shape is `[batch_size, n_classes]`.
-
-  Labels can be:
-
-  * A multi-hot tensor of shape `[D0, D1, ... DN, n_classes]`
-  * An integer `SparseTensor` of class indices. The `dense_shape` must be
-    `[D0, D1, ... DN, ?]` and the values within `[0, n_classes)`.
-  * If `label_vocabulary` is given, a string `SparseTensor`. The `dense_shape`
-    must be `[D0, D1, ... DN, ?]` and the values within `label_vocabulary` or a
-    multi-hot tensor of shape `[D0, D1, ... DN, n_classes]`.
-
-  If `weight_column` is specified, weights must be of shape
-  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
-
-  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
-  `(labels, logits, features)` as arguments and returns unreduced loss with
-  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support indicator `labels` with
-  shape `[D0, D1, ... DN, n_classes]`. Namely, the head applies
-  `label_vocabulary` to the input labels before passing them to `loss_fn`.
-
-  The head can be used with a canned estimator. Example:
-
-  ```python
-  my_head = tf.contrib.estimator.multi_label_head(n_classes=3)
-  my_estimator = tf.contrib.estimator.DNNEstimator(
-      head=my_head,
-      hidden_units=...,
-      feature_columns=...)
-  ```
-
-  It can also be used with a custom `model_fn`. Example:
-
-  ```python
-  def _my_model_fn(features, labels, mode):
-    my_head = tf.contrib.estimator.multi_label_head(n_classes=3)
-    logits = tf.keras.Model(...)(features)
-
-    return my_head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        optimizer=tf.AdagradOptimizer(learning_rate=0.1),
-        logits=logits)
-
-  my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn)
-  ```
-
-  Args:
-    n_classes: Number of classes, must be greater than 1 (for 1 class, use
-      `binary_classification_head`).
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.  Per-class weighting is
-      not supported.
-    thresholds: Iterable of floats in the range `(0, 1)`. Accuracy, precision
-      and recall metrics are evaluated for each threshold value. The threshold
-      is applied to the predicted probabilities, i.e. above the threshold is
-      `true`, below is `false`.
-    label_vocabulary: A list of strings represents possible label values. If it
-      is not given, that means labels are already encoded as integer within
-      [0, n_classes) or multi-hot Tensor. If given, labels must be SparseTensor
-      string type and have any value in `label_vocabulary`. Also there will be
-      errors if vocabulary is not provided and labels are string.
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
-      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
-    loss_fn: Optional loss function.
-    classes_for_class_based_metrics: List of integer class IDs or string class
-      names for which per-class metrics are evaluated. If integers, all must be
-      in the range `[0, n_classes - 1]`. If strings, all must be in
-      `label_vocabulary`.
-    name: name of the head. If provided, summary and metrics keys will be
-      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-
-  Returns:
-    An instance of `_Head` for multi-label classification.
-
-  Raises:
-    ValueError: if `n_classes`, `thresholds`, `loss_reduction`, `loss_fn` or
-    `metric_class_ids` is invalid.
-  """
-  thresholds = tuple(thresholds) if thresholds else tuple()
-  if n_classes is None or n_classes < 2:
-    raise ValueError(
-        'n_classes must be > 1 for multi-class classification. '
-        'Given: {}'.format(n_classes))
-  for threshold in thresholds:
-    if (threshold <= 0.0) or (threshold >= 1.0):
-      raise ValueError(
-          'thresholds must be in (0, 1) range. Given: {}'.format(threshold))
-  if label_vocabulary is not None:
-    if not isinstance(label_vocabulary, (list, tuple)):
-      raise ValueError(
-          'label_vocabulary must be a list or tuple. '
-          'Given type: {}'.format(type(label_vocabulary)))
-    if len(label_vocabulary) != n_classes:
-      raise ValueError(
-          'Length of label_vocabulary must be n_classes ({}). '
-          'Given: {}'.format(n_classes, len(label_vocabulary)))
-  if loss_fn:
-    head_lib._validate_loss_fn_args(loss_fn)  # pylint:disable=protected-access
-  if (loss_reduction not in losses.Reduction.all() or
-      loss_reduction == losses.Reduction.NONE):
-    raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
-  classes_for_class_based_metrics = tuple(
-      [] if classes_for_class_based_metrics is None
-      else classes_for_class_based_metrics)
-  if classes_for_class_based_metrics:
-    if isinstance(classes_for_class_based_metrics[0], six.string_types):
-      if not label_vocabulary:
-        raise ValueError(
-            'label_vocabulary must be provided when '
-            'classes_for_class_based_metrics are sting.')
-      class_ids = []
-      for class_string in classes_for_class_based_metrics:
-        class_ids.append(label_vocabulary.index(class_string))
-      classes_for_class_based_metrics = tuple(class_ids)
-    else:
-      for class_id in classes_for_class_based_metrics:
-        if (class_id < 0) or (class_id >= n_classes):
-          raise ValueError(
-              'All classes_for_class_based_metrics must be in range [0, {}]. '
-              'Given: {}'.format(n_classes - 1, class_id))
-  return _MultiLabelHead(
-      n_classes=n_classes, weight_column=weight_column, thresholds=thresholds,
-      label_vocabulary=label_vocabulary, loss_reduction=loss_reduction,
-      loss_fn=loss_fn,
-      classes_for_class_based_metrics=classes_for_class_based_metrics,
-      name=name)
-
-
-class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
-  """`_Head` for multi-label classification."""
-
-  def __init__(self,
-               n_classes,
-               weight_column=None,
-               thresholds=None,
-               label_vocabulary=None,
-               loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
-               loss_fn=None,
-               classes_for_class_based_metrics=None,
-               name=None):
-    self._n_classes = n_classes
-    self._weight_column = weight_column
-    self._thresholds = thresholds
-    self._label_vocabulary = label_vocabulary
-    self._loss_reduction = loss_reduction
-    self._loss_fn = loss_fn
-    self._classes_for_class_based_metrics = classes_for_class_based_metrics
-    self._name = name
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def logits_dimension(self):
-    return self._n_classes
-
-  def _process_labels(self, labels):
-    if labels is None:
-      raise ValueError(
-          'You must provide a labels Tensor. Given: None. '
-          'Suggested troubleshooting steps: Check that your data contain '
-          'your label feature. Check that your input_fn properly parses and '
-          'returns labels.')
-    if isinstance(labels, sparse_tensor.SparseTensor):
-      if labels.dtype == dtypes.string:
-        label_ids_values = lookup_ops.index_table_from_tensor(
-            vocabulary_list=tuple(self._label_vocabulary),
-            name='class_id_lookup').lookup(labels.values)
-        label_ids = sparse_tensor.SparseTensor(
-            indices=labels.indices,
-            values=label_ids_values,
-            dense_shape=labels.dense_shape)
-        return math_ops.to_int64(
-            sparse_ops.sparse_to_indicator(label_ids, self._n_classes))
-      else:
-        err_msg = (
-            r'labels must be an integer SparseTensor with values in '
-            r'[0, {})'.format(self._n_classes))
-        assert_int = check_ops.assert_integer(
-            labels.values, message=err_msg)
-        assert_less = check_ops.assert_less(
-            labels.values,
-            ops.convert_to_tensor(self._n_classes, dtype=labels.dtype),
-            message=err_msg)
-        assert_greater = check_ops.assert_non_negative(
-            labels.values, message=err_msg)
-        with ops.control_dependencies(
-            [assert_int, assert_less, assert_greater]):
-          return math_ops.to_int64(
-              sparse_ops.sparse_to_indicator(labels, self._n_classes))
-    err_msg = (
-        r'labels must be an integer indicator Tensor with values in [0, 1]')
-    return head_lib._assert_range(labels, 2, message=err_msg)  # pylint:disable=protected-access,
-
-  def create_loss(self, features, mode, logits, labels):
-    """See `Head`."""
-    del mode  # Unused for this head.
-    logits = ops.convert_to_tensor(logits)
-    processed_labels = self._process_labels(labels)
-    processed_labels = head_lib._check_dense_labels_match_logits_and_reshape(  # pylint:disable=protected-access
-        labels=processed_labels, logits=logits,
-        expected_labels_dimension=self.logits_dimension)
-    if self._loss_fn:
-      unweighted_loss = head_lib._call_loss_fn(  # pylint:disable=protected-access
-          loss_fn=self._loss_fn, labels=processed_labels, logits=logits,
-          features=features, expected_loss_dim=1)
-    else:
-      unweighted_loss = losses.sigmoid_cross_entropy(
-          multi_class_labels=processed_labels, logits=logits,
-          reduction=losses.Reduction.NONE)
-      # Averages loss over classes.
-      unweighted_loss = math_ops.reduce_mean(
-          unweighted_loss, axis=-1, keepdims=True)
-    weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
-        features=features, weight_column=self._weight_column, logits=logits)
-    training_loss = losses.compute_weighted_loss(
-        unweighted_loss, weights=weights, reduction=self._loss_reduction)
-    return head_lib.LossSpec(
-        training_loss=training_loss,
-        unreduced_loss=unweighted_loss,
-        weights=weights,
-        processed_labels=processed_labels)
-
-  def _create_tpu_estimator_spec(
-      self, features, mode, logits, labels=None, optimizer=None,
-      train_op_fn=None, regularization_losses=None):
-    """Returns an `model_fn._TPUEstimatorSpec`.
-
-    Args:
-      features: Input `dict` of `Tensor` or `SparseTensor` objects.
-      mode: Estimator's `ModeKeys`.
-      logits: logits `Tensor` with shape `[D0, D1, ... DN, n_classes]`.
-        For many applications, the shape is `[batch_size, n_classes]`.
-      labels: Labels with shape matching `logits`. Can be multi-hot `Tensor`
-        with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with
-        `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when
-        `mode` equals `TRAIN` or `EVAL`.
-      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
-        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
-        updates variables and increments `global_step`.
-      train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Used if `optimizer` is `None`.
-      regularization_losses: A list of additional scalar losses to be added to
-        the training loss, such as regularization losses. These losses are
-        usually expressed as a batch average, so for best results users need to
-        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
-        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
-        avoid scaling errors.
-    Returns:
-      `model_fn._TPUEstimatorSpec`.
-    Raises:
-      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
-        mode, or if both are set.
-    """
-    with ops.name_scope(self._name, 'head'):
-      logits = head_lib._check_logits_final_dim(logits, self.logits_dimension)  # pylint:disable=protected-access
-
-      # Predict.
-      pred_keys = prediction_keys.PredictionKeys
-      with ops.name_scope(None, 'predictions', (logits,)):
-        probabilities = math_ops.sigmoid(logits, name=pred_keys.PROBABILITIES)
-        predictions = {
-            pred_keys.LOGITS: logits,
-            pred_keys.PROBABILITIES: probabilities,
-        }
-      if mode == model_fn.ModeKeys.PREDICT:
-        classifier_output = head_lib._classification_output(  # pylint:disable=protected-access
-            scores=probabilities, n_classes=self._n_classes,
-            label_vocabulary=self._label_vocabulary)
-        return model_fn._TPUEstimatorSpec(  # pylint:disable=protected-access
-            mode=model_fn.ModeKeys.PREDICT,
-            predictions=predictions,
-            export_outputs={
-                _DEFAULT_SERVING_KEY: classifier_output,
-                head_lib._CLASSIFY_SERVING_KEY: classifier_output,  # pylint:disable=protected-access
-                head_lib._PREDICT_SERVING_KEY: (  # pylint:disable=protected-access
-                    export_output.PredictOutput(predictions))
-            })
-
-      (training_loss, unreduced_loss, weights,
-       processed_labels) = self.create_loss(
-           features=features, mode=mode, logits=logits, labels=labels)
-      if regularization_losses:
-        regularization_loss = math_ops.add_n(regularization_losses)
-        regularized_training_loss = math_ops.add_n(
-            [training_loss, regularization_loss])
-      else:
-        regularization_loss = None
-        regularized_training_loss = training_loss
-
-      # Eval.
-      if mode == model_fn.ModeKeys.EVAL:
-        return model_fn._TPUEstimatorSpec(  # pylint:disable=protected-access
-            mode=model_fn.ModeKeys.EVAL,
-            predictions=predictions,
-            loss=regularized_training_loss,
-            eval_metrics=head_lib._create_eval_metrics_tuple(  # pylint:disable=protected-access
-                self._eval_metric_ops, {
-                    'labels': processed_labels,
-                    'probabilities': probabilities,
-                    'weights': weights,
-                    'unreduced_loss': unreduced_loss,
-                    'regularization_loss': regularization_loss,
-                }))
+from tensorflow_estimator.contrib.estimator.python.estimator import head
 
-      # Train.
-      if optimizer is not None:
-        if train_op_fn is not None:
-          raise ValueError('train_op_fn and optimizer cannot both be set.')
-        train_op = optimizer.minimize(
-            regularized_training_loss,
-            global_step=training_util.get_global_step())
-      elif train_op_fn is not None:
-        train_op = train_op_fn(regularized_training_loss)
-      else:
-        raise ValueError('train_op_fn and optimizer cannot both be None.')
-      train_op = head_lib._append_update_ops(train_op)  # pylint:disable=protected-access
-      # Only summarize mean_loss for SUM reduction to preserve backwards
-      # compatibility. Otherwise skip it to avoid unnecessary computation.
-      if self._loss_reduction == losses.Reduction.SUM:
-        example_weight_sum = math_ops.reduce_sum(
-            weights * array_ops.ones_like(unreduced_loss))
-        mean_loss = training_loss / example_weight_sum
-      else:
-        mean_loss = None
-    with ops.name_scope(''):
-      keys = metric_keys.MetricKeys
-      summary.scalar(
-          head_lib._summary_key(self._name, keys.LOSS),  # pylint:disable=protected-access
-          regularized_training_loss)
-      if mean_loss is not None:
-        summary.scalar(
-            head_lib._summary_key(self._name, keys.LOSS_MEAN),  # pylint:disable=protected-access
-            mean_loss)
-      if regularization_loss is not None:
-        summary.scalar(
-            head_lib._summary_key(self._name, keys.LOSS_REGULARIZATION),  # pylint:disable=protected-access
-            regularization_loss)
-    return model_fn._TPUEstimatorSpec(  # pylint:disable=protected-access
-        mode=model_fn.ModeKeys.TRAIN,
-        predictions=predictions,
-        loss=regularized_training_loss,
-        train_op=train_op)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+head.__all__ = [s for s in dir(head) if not s.startswith('__')]
 
-  def _eval_metric_ops(
-      self, labels, probabilities, weights, unreduced_loss,
-      regularization_loss):
-    """Returns a dict of metrics for eval_metric_ops."""
-    with ops.name_scope(
-        None, 'metrics',
-        [labels, probabilities, weights, unreduced_loss, regularization_loss]):
-      keys = metric_keys.MetricKeys
-      metric_ops = {
-          # Estimator already adds a metric for loss.
-          head_lib._summary_key(self._name, keys.LOSS_MEAN):  # pylint:disable=protected-access
-              metrics_lib.mean(
-                  values=unreduced_loss,
-                  weights=weights,
-                  name=keys.LOSS_MEAN),
-          head_lib._summary_key(self._name, keys.AUC):  # pylint:disable=protected-access
-              metrics_lib.auc(labels=labels, predictions=probabilities,
-                              weights=weights, name=keys.AUC),
-          head_lib._summary_key(self._name, keys.AUC_PR):  # pylint:disable=protected-access
-              metrics_lib.auc(labels=labels, predictions=probabilities,
-                              weights=weights, curve='PR',
-                              name=keys.AUC_PR),
-      }
-      if regularization_loss is not None:
-        loss_regularization_key = head_lib._summary_key(  # pylint:disable=protected-access
-            self._name, keys.LOSS_REGULARIZATION)
-        metric_ops[loss_regularization_key] = (
-            metrics_lib.mean(
-                values=regularization_loss,
-                name=keys.LOSS_REGULARIZATION))
-      for threshold in self._thresholds:
-        accuracy_key = keys.ACCURACY_AT_THRESHOLD % threshold
-        metric_ops[head_lib._summary_key(self._name, accuracy_key)] = (  # pylint:disable=protected-access
-            head_lib._accuracy_at_threshold(  # pylint:disable=protected-access
-                labels=labels,
-                predictions=probabilities,
-                weights=weights,
-                threshold=threshold,
-                name=accuracy_key))
-        # Precision for positive examples.
-        precision_key = keys.PRECISION_AT_THRESHOLD % threshold
-        metric_ops[head_lib._summary_key(self._name, precision_key)] = (  # pylint:disable=protected-access
-            head_lib._precision_at_threshold(  # pylint:disable=protected-access
-                labels=labels,
-                predictions=probabilities,
-                weights=weights,
-                threshold=threshold,
-                name=precision_key))
-        # Recall for positive examples.
-        recall_key = keys.RECALL_AT_THRESHOLD % threshold
-        metric_ops[head_lib._summary_key(self._name, recall_key)] = (  # pylint:disable=protected-access
-            head_lib._recall_at_threshold(  # pylint:disable=protected-access
-                labels=labels,
-                predictions=probabilities,
-                weights=weights,
-                threshold=threshold,
-                name=recall_key))
-      for class_id in self._classes_for_class_based_metrics:
-        batch_rank = array_ops.rank(probabilities) - 1
-        begin = array_ops.concat(
-            [array_ops.zeros([batch_rank], dtype=dtypes.int32), [class_id]],
-            axis=0)
-        size = array_ops.concat(
-            [-1 * array_ops.ones([batch_rank], dtype=dtypes.int32), [1]],
-            axis=0)
-        class_probabilities = array_ops.slice(
-            probabilities, begin=begin, size=size)
-        class_labels = array_ops.slice(labels, begin=begin, size=size)
-        if self._label_vocabulary is None:
-          prob_key = keys.PROBABILITY_MEAN_AT_CLASS % class_id
-        else:
-          prob_key = (
-              keys.PROBABILITY_MEAN_AT_NAME % self._label_vocabulary[class_id])
-        metric_ops[head_lib._summary_key(self._name, prob_key)] = (  # pylint:disable=protected-access
-            head_lib._predictions_mean(  # pylint:disable=protected-access
-                predictions=class_probabilities,
-                weights=weights,
-                name=prob_key))
-        if self._label_vocabulary is None:
-          auc_key = keys.AUC_AT_CLASS % class_id
-        else:
-          auc_key = keys.AUC_AT_NAME % self._label_vocabulary[class_id]
-        metric_ops[head_lib._summary_key(self._name, auc_key)] = (  # pylint:disable=protected-access
-            head_lib._auc(  # pylint:disable=protected-access
-                labels=class_labels,
-                predictions=class_probabilities,
-                weights=weights,
-                name=auc_key))
-        if self._label_vocabulary is None:
-          auc_pr_key = keys.AUC_PR_AT_CLASS % class_id
-        else:
-          auc_pr_key = keys.AUC_PR_AT_NAME % self._label_vocabulary[class_id]
-        metric_ops[head_lib._summary_key(self._name, auc_pr_key)] = (  # pylint:disable=protected-access
-            head_lib._auc(  # pylint:disable=protected-access
-                labels=class_labels,
-                predictions=class_probabilities,
-                weights=weights,
-                curve='PR',
-                name=auc_pr_key))
-    return metric_ops
+from tensorflow_estimator.contrib.estimator.python.estimator.head import *
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
deleted file mode 100644
index c6e75f8d46f82fc546f3be12840651168a9641ce..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ /dev/null
@@ -1,1482 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for head."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-
-from tensorflow.contrib.estimator.python.estimator import head as head_lib
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.training import monitored_session
-
-
-_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-
-
-def _initialize_variables(test_case, scaffold):
-  scaffold.finalize()
-  test_case.assertIsNone(scaffold.init_feed_dict)
-  test_case.assertIsNone(scaffold.init_fn)
-  scaffold.init_op.run()
-  scaffold.ready_for_local_init_op.eval()
-  scaffold.local_init_op.run()
-  scaffold.ready_op.eval()
-  test_case.assertIsNotNone(scaffold.saver)
-
-
-def _assert_simple_summaries(test_case, expected_summaries, summary_str,
-                             tol=1e-6):
-  """Assert summary the specified simple values.
-
-  Args:
-    test_case: test case.
-    expected_summaries: Dict of expected tags and simple values.
-    summary_str: Serialized `summary_pb2.Summary`.
-    tol: Tolerance for relative and absolute.
-  """
-  summary = summary_pb2.Summary()
-  summary.ParseFromString(summary_str)
-  test_case.assertAllClose(expected_summaries, {
-      v.tag: v.simple_value for v in summary.value
-  }, rtol=tol, atol=tol)
-
-
-def _assert_no_hooks(test_case, spec):
-  test_case.assertAllEqual([], spec.training_chief_hooks)
-  test_case.assertAllEqual([], spec.training_hooks)
-
-
-def _sigmoid(logits):
-  return 1 / (1 + np.exp(-logits))
-
-
-def _sigmoid_cross_entropy(labels, logits):
-  """Returns sigmoid cross entropy averaged over classes."""
-  sigmoid_logits = _sigmoid(logits)
-  unreduced_result = (
-      -labels * np.log(sigmoid_logits)
-      -(1 - labels) * np.log(1 - sigmoid_logits))
-  # Mean over classes
-  return np.mean(unreduced_result, axis=-1, keepdims=True)
-
-
-class MultiLabelHead(test.TestCase):
-
-  def setUp(self):
-    ops.reset_default_graph()
-
-  def test_n_classes_is_none(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'n_classes must be > 1 for multi-class classification\. Given: None'):
-      head_lib.multi_label_head(n_classes=None)
-
-  def test_n_classes_is_1(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'n_classes must be > 1 for multi-class classification\. Given: 1'):
-      head_lib.multi_label_head(n_classes=1)
-
-  def test_threshold_too_small(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'thresholds must be in \(0, 1\) range\. Given: 0\.0'):
-      head_lib.multi_label_head(n_classes=2, thresholds=[0., 0.5])
-
-  def test_threshold_too_large(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'thresholds must be in \(0, 1\) range\. Given: 1\.0'):
-      head_lib.multi_label_head(n_classes=2, thresholds=[0.5, 1.0])
-
-  def test_label_vocabulary_dict(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'label_vocabulary must be a list or tuple\. '
-        r'Given type: <(type|class) \'dict\'>'):
-      head_lib.multi_label_head(n_classes=2, label_vocabulary={'foo': 'bar'})
-
-  def test_label_vocabulary_wrong_size(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'Length of label_vocabulary must be n_classes \(3\). Given: 2'):
-      head_lib.multi_label_head(n_classes=3, label_vocabulary=['foo', 'bar'])
-
-  def test_invalid_loss_reduction(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'Invalid loss_reduction: invalid_loss_reduction'):
-      head_lib.multi_label_head(
-          n_classes=3, loss_reduction='invalid_loss_reduction')
-    with self.assertRaisesRegexp(
-        ValueError, r'Invalid loss_reduction: none'):
-      head_lib.multi_label_head(
-          n_classes=3, loss_reduction=losses.Reduction.NONE)
-
-  def test_loss_fn_arg_labels_missing(self):
-    def _loss_fn(logits):
-      del logits  # Unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn must contain argument: labels\. '
-        r'Given arguments: \(\'logits\',\)'):
-      head_lib.multi_label_head(n_classes=3, loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_logits_missing(self):
-    def _loss_fn(labels):
-      del labels  # unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn must contain argument: logits\. '
-        r'Given arguments: \(\'labels\',\)'):
-      head_lib.multi_label_head(n_classes=3, loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_features_ok(self):
-    def _loss_fn(labels, logits, features):
-      del labels, logits, features  # Unused
-    head_lib.multi_label_head(n_classes=3, loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_invalid(self):
-    def _loss_fn(labels, logits, name=None):
-      del labels, logits, name  # Unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn has unexpected args: \[\'name\'\]'):
-      head_lib.multi_label_head(n_classes=3, loss_fn=_loss_fn)
-
-  def test_classes_for_class_based_metrics_invalid(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'All classes_for_class_based_metrics must be in range \[0, 2\]\. '
-        r'Given: -1'):
-      head_lib.multi_label_head(
-          n_classes=3, classes_for_class_based_metrics=[2, -1])
-
-  def test_classes_for_class_based_metrics_string_invalid(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'\'z\' is not in list'):
-      head_lib.multi_label_head(
-          n_classes=3, label_vocabulary=['a', 'b', 'c'],
-          classes_for_class_based_metrics=['c', 'z'])
-
-  def test_name(self):
-    head = head_lib.multi_label_head(n_classes=4, name='foo')
-    self.assertEqual('foo', head.name)
-
-  def test_predict(self):
-    n_classes = 4
-    head = head_lib.multi_label_head(n_classes)
-    self.assertEqual(n_classes, head.logits_dimension)
-
-    logits = np.array(
-        [[0., 1., 2., -1.], [-1., -2., -3., 1.]], dtype=np.float32)
-    expected_probabilities = _sigmoid(logits)
-    expected_export_classes = [[b'0', b'1', b'2', b'3']] * 2
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'classification'),
-        spec.export_outputs.keys())
-
-    # Assert predictions and export_outputs.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      predictions = sess.run(spec.predictions)
-      self.assertAllClose(logits,
-                          predictions[prediction_keys.PredictionKeys.LOGITS])
-      self.assertAllClose(
-          expected_probabilities,
-          predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-
-      self.assertAllClose(
-          expected_probabilities,
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].scores))
-      self.assertAllEqual(
-          expected_export_classes,
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].classes))
-
-  def test_predict_with_label_vocabulary(self):
-    n_classes = 4
-    head = head_lib.multi_label_head(
-        n_classes, label_vocabulary=['foo', 'bar', 'foobar', 'barfoo'])
-
-    logits = np.array(
-        [[0., 1., 2., -1.], [-1., -2., -3., 1.]], dtype=np.float32)
-    expected_export_classes = [[b'foo', b'bar', b'foobar', b'barfoo']] * 2
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertAllEqual(
-          expected_export_classes,
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].classes))
-
-  def test_weight_should_not_impact_prediction(self):
-    n_classes = 4
-    head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
-    self.assertEqual(n_classes, head.logits_dimension)
-
-    logits = np.array(
-        [[0., 1., 2., -1.], [-1., -2., -3., 1.]], dtype=np.float32)
-    expected_probabilities = _sigmoid(logits)
-
-    weights_2x1 = [[1.], [2.]]
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array(((42,),), dtype=np.int32),
-            'example_weights': weights_2x1,
-        },
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    # Assert predictions and export_outputs.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      predictions = sess.run(spec.predictions)
-      self.assertAllClose(logits,
-                          predictions[prediction_keys.PredictionKeys.LOGITS])
-      self.assertAllClose(
-          expected_probabilities,
-          predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-
-  def test_eval_create_loss(self):
-    """Tests head.create_loss for eval mode."""
-    n_classes = 2
-    head = head_lib.multi_label_head(n_classes)
-
-    logits = np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = (labels * -log(sigmoid(logits)) +
-    #         (1 - labels) * -log(1 - sigmoid(logits))) / 2
-    expected_training_loss = 0.5 * np.sum(
-        _sigmoid_cross_entropy(labels=labels, logits=logits))
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_training_loss,
-                          actual_training_loss.eval())
-
-  def test_eval_create_loss_large_logits(self):
-    """Tests head.create_loss for eval mode and large logits."""
-    n_classes = 2
-    head = head_lib.multi_label_head(n_classes)
-
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # For large logits, this is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits
-    expected_training_loss = 0.5 * np.sum(
-        np.array([[(10. + 10.) / 2.], [(15. + 0.) / 2.]], dtype=np.float32))
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, actual_training_loss.eval(), atol=1e-4)
-
-  def test_eval_create_loss_labels_wrong_shape(self):
-    """Tests head.create_loss for eval mode when labels has the wrong shape."""
-    n_classes = 2
-    head = head_lib.multi_label_head(n_classes)
-
-    logits = np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32)
-    labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels_placeholder)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[expected_labels_shape: \] \[2 2\] \[labels_shape: \] \[2 1\]'):
-        actual_training_loss.eval({
-            labels_placeholder: np.array([[1], [1]], dtype=np.int64)
-        })
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'labels shape must be \[D0, D1, ... DN, 2\]\..*'
-          r'\[Received shape: \] \[2\]'):
-        actual_training_loss.eval({
-            labels_placeholder: np.array([1, 1], dtype=np.int64)
-        })
-
-  def test_eval_create_loss_loss_fn(self):
-    """Tests head.create_loss for eval mode and custom loss_fn."""
-    loss = np.array([[1.], [2.]], dtype=np.float32)
-    logits_input = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels_input = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    def _loss_fn(labels, logits):
-      check_labels = control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
-          data=[labels])
-      check_logits = control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
-          data=[logits])
-      with ops.control_dependencies([check_labels, check_logits]):
-        return constant_op.constant(loss)
-    head = head_lib.multi_label_head(n_classes=2, loss_fn=_loss_fn)
-
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_input,
-        labels=labels_input)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(np.sum(loss) / 2., actual_training_loss.eval())
-
-  def test_eval_create_loss_loss_fn_wrong_shape(self):
-    """Tests custom loss_fn that returns Tensor of unexpected shape."""
-    loss = np.array([1., 2.], dtype=np.float32)
-    def _loss_fn(labels, logits):
-      del labels, logits  # Unused
-      return constant_op.constant(loss)
-    head = head_lib.multi_label_head(n_classes=2, loss_fn=_loss_fn)
-
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
-          r'\[logits_shape: \] \[2 2\] \[loss_shape: \] \[2\]'):
-        actual_training_loss.eval()
-
-  def test_eval_labels_none(self):
-    """Tests that error is raised when labels is None."""
-    head = head_lib.multi_label_head(n_classes=2)
-
-    with self.assertRaisesRegexp(
-        ValueError, r'You must provide a labels Tensor\. Given: None\.'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
-          labels=None)
-
-  def _test_eval(
-      self, head, logits, labels, expected_loss, expected_metrics,
-      features=None, regularization_losses=None):
-    spec = head.create_estimator_spec(
-        features=features or {},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels,
-        regularization_losses=regularization_losses)
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, and metrics.
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
-          rtol=tol,
-          atol=tol)
-
-  def test_eval(self):
-    n_classes = 2
-    head = head_lib.multi_label_head(n_classes)
-    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples, divide by batch_size.
-    expected_loss = 0.5 * np.sum(
-        _sigmoid_cross_entropy(labels=labels, logits=logits))
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss,
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.3333,
-        keys.AUC_PR: 0.7639,
-    }
-    self._test_eval(
-        head=head,
-        logits=logits,
-        labels=labels,
-        expected_loss=expected_loss,
-        expected_metrics=expected_metrics)
-
-  def test_eval_sparse_labels(self):
-    n_classes = 2
-    head = head_lib.multi_label_head(n_classes)
-    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
-    # Equivalent to multi_hot = [[1, 0], [1, 1]]
-    labels = sparse_tensor.SparseTensor(
-        values=[0, 0, 1],
-        indices=[[0, 0], [1, 0], [1, 1]],
-        dense_shape=[2, 2])
-    labels_multi_hot = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples, divide by batch_size.
-    expected_loss = 0.5 * np.sum(
-        _sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss,
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.3333,
-        keys.AUC_PR: 0.7639,
-    }
-    self._test_eval(
-        head=head,
-        logits=logits,
-        labels=labels,
-        expected_loss=expected_loss,
-        expected_metrics=expected_metrics)
-
-  def test_eval_with_regularization_losses(self):
-    n_classes = 2
-    head = head_lib.multi_label_head(
-        n_classes, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    regularization_losses = [1.5, 0.5]
-    expected_regularization_loss = 2.
-    # unregularized_loss = sum(
-    #     labels * -log(sigmoid(logits)) +
-    #     (1 - labels) * -log(1 - sigmoid(logits))) / batch_size
-    expected_unregularized_loss = np.sum(
-        _sigmoid_cross_entropy(labels=labels, logits=logits)) / 2.
-    expected_regularized_loss = (
-        expected_unregularized_loss + expected_regularization_loss)
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_unregularized_loss,
-        keys.LOSS_REGULARIZATION: expected_regularization_loss,
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.3333,
-        keys.AUC_PR: 0.7639,
-    }
-    self._test_eval(
-        head=head,
-        logits=logits,
-        labels=labels,
-        expected_loss=expected_regularized_loss,
-        expected_metrics=expected_metrics,
-        regularization_losses=regularization_losses)
-
-  def test_eval_with_label_vocabulary(self):
-    n_classes = 2
-    head = head_lib.multi_label_head(
-        n_classes, label_vocabulary=['class0', 'class1'])
-    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
-    # Equivalent to multi_hot = [[1, 0], [1, 1]]
-    labels = sparse_tensor.SparseTensor(
-        values=['class0', 'class0', 'class1'],
-        indices=[[0, 0], [1, 0], [1, 1]],
-        dense_shape=[2, 2])
-    labels_multi_hot = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples, divide by batch_size.
-    expected_loss = 0.5 * np.sum(
-        _sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss,
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.3333,
-        keys.AUC_PR: 0.7639,
-    }
-    self._test_eval(
-        head=head,
-        logits=logits,
-        labels=labels,
-        expected_loss=expected_loss,
-        expected_metrics=expected_metrics)
-
-  def test_eval_with_label_vocabulary_with_multi_hot_input(self):
-    n_classes = 2
-    head = head_lib.multi_label_head(
-        n_classes, label_vocabulary=['class0', 'class1'])
-    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
-    labels_multi_hot = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples, divide by batch_size.
-    expected_loss = 0.5 * np.sum(
-        _sigmoid_cross_entropy(labels=labels_multi_hot, logits=logits))
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss,
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.3333,
-        keys.AUC_PR: 0.7639,
-    }
-    self._test_eval(
-        head=head,
-        logits=logits,
-        labels=labels_multi_hot,
-        expected_loss=expected_loss,
-        expected_metrics=expected_metrics)
-
-  def test_eval_with_thresholds(self):
-    n_classes = 2
-    thresholds = [0.25, 0.5, 0.75]
-    head = head_lib.multi_label_head(n_classes, thresholds=thresholds)
-
-    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples, divide by batch_size.
-    expected_loss = 0.5 * np.sum(
-        _sigmoid_cross_entropy(labels=labels, logits=logits))
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss,
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.3333,
-        keys.AUC_PR: 0.7639,
-        keys.ACCURACY_AT_THRESHOLD % thresholds[0]: 2. / 4.,
-        keys.PRECISION_AT_THRESHOLD % thresholds[0]: 2. / 3.,
-        keys.RECALL_AT_THRESHOLD % thresholds[0]: 2. / 3.,
-        keys.ACCURACY_AT_THRESHOLD % thresholds[1]: 1. / 4.,
-        keys.PRECISION_AT_THRESHOLD % thresholds[1]: 1. / 2.,
-        keys.RECALL_AT_THRESHOLD % thresholds[1]: 1. / 3.,
-        keys.ACCURACY_AT_THRESHOLD % thresholds[2]: 2. / 4.,
-        keys.PRECISION_AT_THRESHOLD % thresholds[2]: 1. / 1.,
-        keys.RECALL_AT_THRESHOLD % thresholds[2]: 1. / 3.,
-    }
-
-    self._test_eval(
-        head=head,
-        logits=logits,
-        labels=labels,
-        expected_loss=expected_loss,
-        expected_metrics=expected_metrics)
-
-  def test_eval_with_classes_for_class_based_metrics(self):
-    head = head_lib.multi_label_head(
-        n_classes=2, classes_for_class_based_metrics=[0, 1])
-
-    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples, divide by batch_size.
-    expected_loss = 0.5 * np.sum(
-        _sigmoid_cross_entropy(labels=labels, logits=logits))
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss,
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.3333,
-        keys.AUC_PR: 0.7639,
-        keys.PROBABILITY_MEAN_AT_CLASS % 0: np.sum(_sigmoid(logits[:, 0])) / 2.,
-        keys.AUC_AT_CLASS % 0: 0.,
-        keys.AUC_PR_AT_CLASS % 0: 1.,
-        keys.PROBABILITY_MEAN_AT_CLASS % 1: np.sum(_sigmoid(logits[:, 1])) / 2.,
-        keys.AUC_AT_CLASS % 1: 1.,
-        keys.AUC_PR_AT_CLASS % 1: 1.,
-    }
-
-    self._test_eval(
-        head=head,
-        logits=logits,
-        labels=labels,
-        expected_loss=expected_loss,
-        expected_metrics=expected_metrics)
-
-  def test_eval_with_classes_for_class_based_metrics_string(self):
-    head = head_lib.multi_label_head(
-        n_classes=2, label_vocabulary=['a', 'b'],
-        classes_for_class_based_metrics=['a', 'b'])
-
-    logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32)
-    labels = sparse_tensor.SparseTensor(
-        values=['a', 'a', 'b'],
-        indices=[[0, 0], [1, 0], [1, 1]],
-        dense_shape=[2, 2])
-    labels_onehot = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # Sum over examples, divide by batch_size.
-    expected_loss = 0.5 * np.sum(
-        _sigmoid_cross_entropy(labels=labels_onehot, logits=logits))
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # Average loss over examples.
-        keys.LOSS_MEAN: expected_loss,
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.3333,
-        keys.AUC_PR: 0.7639,
-        keys.PROBABILITY_MEAN_AT_NAME % 'a':
-            np.sum(_sigmoid(logits[:, 0])) / 2.,
-        keys.AUC_AT_NAME % 'a': 0.,
-        keys.AUC_PR_AT_NAME % 'a': 1.,
-        keys.PROBABILITY_MEAN_AT_NAME % 'b':
-            np.sum(_sigmoid(logits[:, 1])) / 2.,
-        keys.AUC_AT_NAME % 'b': 1.,
-        keys.AUC_PR_AT_NAME % 'b': 1.,
-    }
-
-    self._test_eval(
-        head=head,
-        logits=logits,
-        labels=labels,
-        expected_loss=expected_loss,
-        expected_metrics=expected_metrics)
-
-  def test_eval_with_weights(self):
-    n_classes = 2
-    head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
-
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, weighted sum over examples, divide by batch_size.
-    # loss = ( 1 * (10 + 10) / 2 + 2 * (15 + 0) / 2) / 2
-    expected_loss = 12.5
-
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array([[41], [42]], dtype=np.int32),
-            'example_weights': np.array([[1.], [2.]], dtype=np.float32),
-        },
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # Average loss over weighted examples (denominator is sum(weights)).
-        keys.LOSS_MEAN: expected_loss * (2. / 3.),
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.2000,
-        keys.AUC_PR: 0.7833,
-    }
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, and metrics.
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
-          rtol=tol,
-          atol=tol)
-
-  def test_train_create_loss_large_logits(self):
-    """Tests head.create_loss for train mode and large logits."""
-    n_classes = 2
-    head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
-
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    weights = np.array([[1.], [2.]], dtype=np.float32)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # For large logits, this is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits
-    expected_unreduced_loss = [[(10. + 10.) / 2.], [(15. + 0.) / 2.]]
-    expected_weights = [[1.], [2.]]
-    expected_training_loss = (1. * (10. + 10.) / 2. + 2. * (15. + 0.) / 2.) / 2.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features={
-            'x': np.array(((42,),), dtype=np.int32),
-            'example_weights': weights
-        },
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), atol=1e-4)
-      self.assertAllClose(
-          expected_unreduced_loss, unreduced_loss.eval(), atol=1e-4)
-      self.assertAllClose(expected_weights, actual_weights.eval())
-
-  def test_train_create_loss_loss_reduction(self):
-    """Tests head.create_loss with loss_reduction."""
-    n_classes = 2
-    head = head_lib.multi_label_head(
-        n_classes, weight_column='example_weights',
-        loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
-
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    weights = np.array([[1.], [2.]], dtype=np.float32)
-    # loss = labels * -log(sigmoid(logits)) +
-    #        (1 - labels) * -log(1 - sigmoid(logits))
-    # For large logits, this is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits
-    expected_unreduced_loss = [[(10. + 10.) / 2.], [(15. + 0.) / 2.]]
-    expected_weights = [[1.], [2.]]
-    expected_training_loss = (1. * (10. + 10.) / 2. + 2. * (15. + 0.) / 2.) / 2.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features={
-            'x': np.array(((42,),), dtype=np.int32),
-            'example_weights': weights
-        },
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), atol=1e-4)
-      self.assertAllClose(
-          expected_unreduced_loss, unreduced_loss.eval(), atol=1e-4)
-      self.assertAllClose(expected_weights, actual_weights.eval())
-
-  def test_train_labels_none(self):
-    """Tests that error is raised when labels is None."""
-    head = head_lib.multi_label_head(n_classes=2)
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    with self.assertRaisesRegexp(
-        ValueError, r'You must provide a labels Tensor\. Given: None\.'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
-          labels=None,
-          train_op_fn=_no_op_train_fn)
-
-  def test_train_invalid_indicator_labels(self):
-    head = head_lib.multi_label_head(n_classes=2)
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    # The value 2 is outside the allowed range.
-    labels = np.array([[2, 0], [1, 1]], dtype=np.int64)
-    def _train_op_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features={},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'labels must be an integer indicator Tensor with values in '
-          r'\[0, 1\]'):
-        sess.run(spec.loss)
-
-  def test_train_invalid_sparse_labels(self):
-    head = head_lib.multi_label_head(n_classes=2)
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    # The value 2 is outside the allowed range.
-    labels = sparse_tensor.SparseTensor(
-        values=[2, 0, 1],
-        indices=[[0, 0], [1, 0], [1, 1]],
-        dense_shape=[2, 2])
-    def _train_op_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features={},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'labels must be an integer SparseTensor with values in \[0, 2\)'):
-        sess.run(spec.loss)
-
-  def _test_train(self, head, logits, labels, expected_loss):
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=3)])
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    self.assertIsNotNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
-          train_result)
-      _assert_simple_summaries(
-          self, {metric_keys.MetricKeys.LOSS: expected_loss}, summary_str, tol)
-
-  def test_train(self):
-    head = head_lib.multi_label_head(n_classes=2)
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over examples, divide by batch_size.
-    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
-    expected_loss = 8.75
-    self._test_train(
-        head=head, logits=logits, labels=labels, expected_loss=expected_loss)
-
-  def test_train_sparse_labels(self):
-    head = head_lib.multi_label_head(n_classes=2)
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    # Equivalent to multi_hot = [[1, 0], [1, 1]]
-    labels = sparse_tensor.SparseTensor(
-        values=[0, 0, 1],
-        indices=[[0, 0], [1, 0], [1, 1]],
-        dense_shape=[2, 2])
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over examples, divide by batch_size.
-    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
-    expected_loss = 8.75
-    self._test_train(
-        head=head, logits=logits, labels=labels, expected_loss=expected_loss)
-
-  def test_train_with_label_vocabulary(self):
-    head = head_lib.multi_label_head(
-        n_classes=2, label_vocabulary=['class0', 'class1'])
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    # Equivalent to multi_hot = [[1, 0], [1, 1]]
-    labels = sparse_tensor.SparseTensor(
-        values=['class0', 'class0', 'class1'],
-        indices=[[0, 0], [1, 0], [1, 1]],
-        dense_shape=[2, 2])
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over examples, divide by batch_size.
-    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
-    expected_loss = 8.75
-    self._test_train(
-        head=head, logits=logits, labels=labels, expected_loss=expected_loss)
-
-  def test_train_with_optimizer(self):
-    head = head_lib.multi_label_head(n_classes=2)
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, sum over examples, divide by batch_size.
-    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2
-    expected_loss = 8.75
-    expected_train_result = 'my_train_op'
-
-    class _Optimizer(object):
-
-      def minimize(self, loss, global_step):
-        del global_step
-        return string_ops.string_join(
-            [constant_op.constant(expected_train_result),
-             string_ops.as_string(loss, precision=3)])
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        optimizer=_Optimizer())
-
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run((spec.loss, spec.train_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
-          train_result)
-
-  def test_train_with_update_ops(self):
-    head = head_lib.multi_label_head(n_classes=2)
-
-    with ops.Graph().as_default():
-      w = variables.Variable(1)
-      update_op = w.assign_add(1)
-      ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, update_op)
-
-      t = variables.Variable('')
-      expected_train_result = b'my_train_op'
-      def _train_op_fn(loss):
-        del loss
-        return t.assign(expected_train_result)
-
-      spec = head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
-          labels=np.array([[1, 0], [1, 1]], dtype=np.int64),
-          train_op_fn=_train_op_fn)
-
-      with self.cached_session() as sess:
-        _initialize_variables(self, spec.scaffold)
-        sess.run(spec.train_op)
-        w_value, t_value = sess.run([w, t])
-        self.assertEqual(2, w_value)
-        self.assertEqual(expected_train_result, t_value)
-
-  def test_train_with_regularization_losses(self):
-    head = head_lib.multi_label_head(
-        n_classes=2, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    regularization_losses = [1.5, 0.5]
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes and over batch and add regularization loss.
-    expected_loss = 35. / 4. + 2.
-    expected_summaries = {
-        metric_keys.MetricKeys.LOSS: expected_loss,
-        metric_keys.MetricKeys.LOSS_REGULARIZATION: 2.,
-    }
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=3)])
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn,
-        regularization_losses=regularization_losses)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
-          train_result)
-      _assert_simple_summaries(self, expected_summaries, summary_str, tol)
-
-  def test_train_with_weights(self):
-    n_classes = 2
-    head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
-
-    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
-    labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # Average over classes, weighted sum over examples, divide by batch_size.
-    # loss = ( 1 * (10 + 10) / 2 + 2 * (15 + 0) / 2 ) / 2
-    expected_loss = 12.5
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=3)])
-
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array([[41], [42]], dtype=np.int32),
-            'example_weights': np.array([[1.], [2.]], dtype=np.float32),
-        },
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    self.assertIsNotNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
-          train_result)
-      _assert_simple_summaries(
-          self, {metric_keys.MetricKeys.LOSS: expected_loss,}, summary_str, tol)
-
-  def test_multi_dim_weighted_train_create_loss(self):
-    """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
-    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
-    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
-                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
-    labels = np.array([[[1, 0, 0], [1, 0, 0]],
-                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
-    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    # unreduced_loss =
-    #     [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
-    #   = [[20/3, 10/3], [4, 8]]
-    expected_unreduced_loss = [[[20./3.], [10./3.]], [[4.], [8.]]]
-    # weights are reshaped to [2, 2, 1] to match logits.
-    expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
-    # loss = (1*20/3 + 1.5*10/3 + 2*4 + 2.5*8) / 4 = 9.9167
-    expected_training_loss = 9.9167
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    atol = 1.e-3
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), atol=atol)
-      self.assertAllClose(
-          expected_unreduced_loss, unreduced_loss.eval(), atol=atol)
-      self.assertAllClose(expected_weights, actual_weights.eval())
-
-  def test_multi_dim_weighted_train(self):
-    """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
-    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
-    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
-                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
-    labels = np.array([[[1, 0, 0], [1, 0, 0]],
-                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
-    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
-    #      = [[20/3, 10/3], [4, 8]]
-    # loss = (1*20/3 + 1.5*10/3 + 2*4 + 2.5*8) / 4 = 9.9167
-    expected_loss = 9.9167
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=3)])
-
-    spec = head.create_estimator_spec(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    atol = 1.e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, monitored_session.Scaffold())
-      loss, train_result = sess.run((spec.loss, spec.train_op))
-      self.assertAllClose(expected_loss, loss, atol=atol)
-      self.assertEqual(
-          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
-          train_result)
-
-  def test_multi_dim_weights_wrong_inner_dim(self):
-    """Logits and labels of shape [2, 2, 3], weights [2, 1]."""
-    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
-    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
-                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
-    labels = np.array([[[1, 0, 0], [1, 0, 0]],
-                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
-    weights = np.array([[1.], [2.]], dtype=np.float32)
-    def _train_op_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 1\]'):
-        spec.loss.eval()
-
-  def test_multi_dim_weights_wrong_outer_dim(self):
-    """Logits and labels of shape [2, 2, 3], weights [2, 2, 3]."""
-    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
-    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
-                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
-    labels = np.array([[[1, 0, 0], [1, 0, 0]],
-                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
-    weights = np.array([[[1., 1., 1.], [1.5, 1.5, 1.5]],
-                        [[2., 2., 2.], [2.5, 2.5, 2.5]]], dtype=np.float32)
-    weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    def _train_op_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features={'weights': weights_placeholder},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 2 3\]'):
-        spec.loss.eval({weights_placeholder: weights})
-
-  def test_multi_dim_weighted_eval(self):
-    """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
-    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
-
-    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
-                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
-    labels = np.array([[[1, 0, 0], [1, 0, 0]],
-                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
-    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
-    #      = [[20/3, 10/3], [4, 8]]
-    # loss = (1*20/3 + 1.5*10/3 + 2*4 + 2.5*8) / 4 = 9.9167
-    expected_loss = 9.9167
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_loss * (4. / np.sum(weights)),
-        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC: 0.4977,
-        keys.AUC_PR: 0.6645,
-    }
-    self._test_eval(
-        head=head,
-        features={'weights': weights},
-        logits=logits,
-        labels=labels,
-        expected_loss=expected_loss,
-        expected_metrics=expected_metrics)
-
-
-class PoissonRegressionHead(test.TestCase):
-
-  def setUp(self):
-    ops.reset_default_graph()
-
-  def test_train(self):
-    head = head_lib.poisson_regression_head()
-
-    # Create estimator spec.
-    logits = np.array([[0], [-1], [1]], dtype=np.float32)
-    labels = np.array([[1], [2], [3]], dtype=np.int32)
-    # With x = exp(logits), z = labels.
-    # loss = -ln(exp(-x) * (x^z) / z!)
-    #      = x - z * ln(x) + ln(z!)
-    #      = exp(logits) - labels * logits - ln(labels!)
-    # But for ln(z!) and z > 1, the Stirling approximation is used
-    # ln(z!) = z*ln(z) - z + 0.5*ln(2*pi*z)
-    # loss = [exp(0) - 1 * 0 + ln(1!),
-    #         exp(-1) - 2 * (-1) + 2*ln(2) - 2 + 0.5*ln(2*pi*2),
-    #         exp(1) - 3 * 1 + 3*ln(3) - 3 + 0.5*ln(2*pi*3)]
-    #      = [1.0, 3.020, 1.482]
-    # training_loss = (1.0 + 3.020 + 1.482) / 3
-    expected_loss = 1.834
-    atol = 0.001
-    expected_train_result = b'my_train_op'
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_near(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          atol=atol, name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run([spec.loss, spec.train_op])
-      self.assertAlmostEqual(expected_loss, loss, delta=atol)
-      self.assertEqual(expected_train_result, train_result)
-
-  def test_predict(self):
-    head = head_lib.poisson_regression_head()
-
-    # Create estimator spec.
-    logits = np.array([[0], [-1], [1]], dtype=np.float32)
-    expected_predictions = np.exp(logits)
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    # Assert spec contains expected tensors.
-    keys = prediction_keys.PredictionKeys
-    self.assertItemsEqual(
-        (keys.PREDICTIONS, keys.LOGITS), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[keys.PREDICTIONS].dtype)
-    self.assertEqual(dtypes.float32, spec.predictions[keys.LOGITS].dtype)
-
-    # Assert predictions.
-    with self.cached_session():
-      _initialize_variables(self, spec.scaffold)
-      self.assertAllClose(
-          expected_predictions, spec.predictions[keys.PREDICTIONS].eval())
-      self.assertAllClose(logits, spec.predictions[keys.LOGITS].eval())
-
-
-class LogisticRegressionHead(test.TestCase):
-
-  def setUp(self):
-    ops.reset_default_graph()
-
-  def test_train(self):
-    head = head_lib.logistic_regression_head()
-
-    # Create estimator spec.
-    logits = np.array([[0], [-1], [1]], dtype=np.float32)
-    labels = np.array([[.4], [.6], [.8]], dtype=np.float32)
-    # Following the documentation in
-    # tf.nn.sigmoid_cross_entropy_with_logits:
-    # With x = logits, z = labels.
-    # loss  = max(x, 0) - x * z + log(1 + exp(-abs(x)))
-    # loss = [0 - 0 * 0.4 + ln(1 + exp(-0)),
-    #         0 + 1 * 0.6 + ln(1 + exp(-1)),
-    #         1 - 1 * 0.8 + ln(1 + exp(-1))]
-    #      = [0.6931, 0.9133, 0.5133]
-    # training_loss = (0.6931 + 0.9133 + 0.5133) / 3
-    expected_loss = 0.7066
-    atol = 0.001
-    expected_train_result = b'my_train_op'
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_near(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          atol=atol, name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run([spec.loss, spec.train_op])
-      self.assertAlmostEqual(expected_loss, loss, delta=atol)
-      self.assertEqual(expected_train_result, train_result)
-
-  def test_train_labels_too_large(self):
-    head = head_lib.logistic_regression_head()
-
-    # Create estimator spec.
-    logits = np.array([[0], [-1], [1]], dtype=np.float32)
-    labels = np.array([[.4], [1.2], [.8]], dtype=np.float32)
-    expected_train_result = b'my_train_op'
-    def _train_op_fn(loss):
-      del loss
-      return constant_op.constant(expected_train_result)
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[Labels must be in range \[0, 1\]\] .* \[\[0.4\]\[1.2\]\[0.8\]\]'):
-        _ = sess.run(spec.loss)
-
-  def test_train_labels_negative(self):
-    head = head_lib.logistic_regression_head()
-
-    # Create estimator spec.
-    logits = np.array([[0], [-1], [1]], dtype=np.float32)
-    labels = np.array([[.4], [-0.2], [.8]], dtype=np.float32)
-    expected_train_result = b'my_train_op'
-    def _train_op_fn(loss):
-      del loss
-      return constant_op.constant(expected_train_result)
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[Labels must be in range \[0, 1\]\] .* \[\[0.4\]\[-0.2\]\[0.8\]\]'
-      ):
-        _ = sess.run(spec.loss)
-
-  def test_predict(self):
-    head = head_lib.logistic_regression_head()
-
-    # Create estimator spec.
-    logits = np.array([[0], [-1], [1]], dtype=np.float32)
-    expected_predictions = 1. / (1. + np.exp(-logits))
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    # Assert spec contains expected tensors.
-    keys = prediction_keys.PredictionKeys
-    self.assertItemsEqual(
-        (keys.PREDICTIONS, keys.LOGITS), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[keys.PREDICTIONS].dtype)
-    self.assertEqual(dtypes.float32, spec.predictions[keys.LOGITS].dtype)
-
-    # Assert predictions.
-    with self.cached_session():
-      _initialize_variables(self, spec.scaffold)
-      self.assertAllClose(
-          expected_predictions, spec.predictions[keys.PREDICTIONS].eval())
-      self.assertAllClose(logits, spec.predictions[keys.LOGITS].eval())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks.py b/tensorflow/contrib/estimator/python/estimator/hooks.py
index 49f7bbd32009cc80ef3fa70917ac26a8b752ef6d..f7ff5adcc7c106d8d1d071e95c0447f18f2e02b7 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,274 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Some useful session run hooks."""
+"""hooks python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import time
-
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.training import training
-from tensorflow.python.training import training_util
-
-
-# pylint: disable=protected-access
-class InMemoryEvaluatorHook(training.SessionRunHook):
-  """Hook to run evaluation in training without a checkpoint.
-
-  Example:
-
-  ```python
-  def train_input_fn():
-    ...
-    return train_dataset
-
-  def eval_input_fn():
-    ...
-    return eval_dataset
-
-  estimator = tf.estimator.DNNClassifier(...)
-
-  evaluator = tf.contrib.estimator.InMemoryEvaluatorHook(
-      estimator, eval_input_fn)
-  estimator.train(train_input_fn, hooks=[evaluator])
-  ```
-
-  Current limitations of this approach are:
-
-  * It doesn't support multi-node distributed mode.
-  * It doesn't support saveable objects other than variables (such as boosted
-    tree support)
-  * It doesn't support custom saver logic (such as ExponentialMovingAverage
-    support)
-
-  """
-
-  def __init__(self,
-               estimator,
-               input_fn,
-               steps=None,
-               hooks=None,
-               name=None,
-               every_n_iter=100):
-    """Initializes a `InMemoryEvaluatorHook`.
-
-    Args:
-      estimator: A `tf.estimator.Estimator` instance to call evaluate.
-      input_fn:  Equivalent to the `input_fn` arg to `estimator.evaluate`. A
-        function that constructs the input data for evaluation.
-        See [Createing input functions](
-        https://tensorflow.org/guide/premade_estimators#create_input_functions)
-        for more information. The function should construct and return one of
-        the following:
-
-          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
-            tuple (features, labels) with same constraints as below.
-          * A tuple (features, labels): Where `features` is a `Tensor` or a
-            dictionary of string feature name to `Tensor` and `labels` is a
-            `Tensor` or a dictionary of string label name to `Tensor`. Both
-            `features` and `labels` are consumed by `model_fn`. They should
-            satisfy the expectation of `model_fn` from inputs.
-
-      steps: Equivalent to the `steps` arg to `estimator.evaluate`.  Number of
-        steps for which to evaluate model. If `None`, evaluates until `input_fn`
-        raises an end-of-input exception.
-      hooks: Equivalent to the `hooks` arg to `estimator.evaluate`. List of
-        `SessionRunHook` subclass instances. Used for callbacks inside the
-        evaluation call.
-      name:  Equivalent to the `name` arg to `estimator.evaluate`. Name of the
-        evaluation if user needs to run multiple evaluations on different data
-        sets, such as on training data vs test data. Metrics for different
-        evaluations are saved in separate folders, and appear separately in
-        tensorboard.
-      every_n_iter: `int`, runs the evaluator once every N training iteration.
-
-    Raises:
-      ValueError: if `every_n_iter` is non-positive or it's not a single machine
-        training
-    """
-    if every_n_iter is None or every_n_iter <= 0:
-      raise ValueError('invalid every_n_iter=%s.' % every_n_iter)
-    if (estimator.config.num_ps_replicas > 0 or
-        estimator.config.num_worker_replicas > 1):
-      raise ValueError(
-          'InMemoryEvaluator supports only single machine (aka Local) setting.')
-    self._estimator = estimator
-    self._input_fn = input_fn
-    self._steps = steps
-    self._name = name
-    self._every_n_iter = every_n_iter
-    self._eval_dir = os.path.join(self._estimator.model_dir, 'eval'
-                                  if not name else 'eval_' + name)
-
-    self._graph = None
-    self._hooks = estimator_lib._check_hooks_type(hooks)
-    self._hooks.extend(self._estimator._convert_eval_steps_to_hooks(steps))
-    self._timer = training.SecondOrStepTimer(every_steps=every_n_iter)
-
-  def begin(self):
-    """Build eval graph and restoring op."""
-    self._timer.reset()
-    self._iter_count = 0
-    self._graph = ops.Graph()
-    with self._graph.as_default():
-      (self._scaffold, self._update_op, self._eval_dict,
-       self._all_hooks) = self._estimator._evaluate_build_graph(
-           self._input_fn, self._hooks, checkpoint_path=None)
-
-      if self._scaffold.saver is not None:
-        raise ValueError('InMemoryEvaluator does not support custom saver')
-      if self._scaffold.init_fn is not None:
-        raise ValueError('InMemoryEvaluator does not support custom init_fn')
-
-      self._var_name_to_eval_var = {
-          v.name: v for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-      }
-      self._var_name_to_placeholder = {
-          v.name: array_ops.placeholder(v.dtype)
-          for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-      }
-
-  def after_create_session(self, session, coord):  # pylint: disable=unused-argument
-    """Does first run which shows the eval metrics before training."""
-    if ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS):
-      raise ValueError(
-          'InMemoryEvaluator does not support saveables other than global '
-          'variables.')
-    self._var_name_to_train_var = {
-        v.name: v for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    }
-    var_names_to_transfer = set(self._var_name_to_placeholder.keys()) & set(
-        self._var_name_to_train_var.keys())
-    # Filter training var names that are not exist in evaluation
-    self._var_name_to_train_var = {
-        v_name: self._var_name_to_train_var[v_name]
-        for v_name in var_names_to_transfer
-    }
-    # Filter eval var names that are not exist in training
-    self._var_name_to_eval_var = {
-        v_name: self._var_name_to_eval_var[v_name]
-        for v_name in var_names_to_transfer
-    }
-
-    with self._graph.as_default():
-      self._var_feed_op = control_flow_ops.group([
-          state_ops.assign(self._var_name_to_eval_var[v_name],
-                           self._var_name_to_placeholder[v_name])
-          for v_name in var_names_to_transfer
-      ])
-
-    self._evaluate(session)
-
-  def _evaluate(self, train_session):
-    var_name_to_value = train_session.run(self._var_name_to_train_var)
-    placeholder_to_value = {
-        self._var_name_to_placeholder[v_name]: var_name_to_value[v_name]
-        for v_name in var_name_to_value
-    }
-
-    def feed_variables(scaffold, session):
-      del scaffold
-      session.run(self._var_feed_op, feed_dict=placeholder_to_value)
-
-    scaffold = training.Scaffold(
-        init_fn=feed_variables, copy_from_scaffold=self._scaffold)
-
-    with self._graph.as_default():
-      self._estimator._evaluate_run(
-          checkpoint_path=None,
-          scaffold=scaffold,
-          update_op=self._update_op,
-          eval_dict=self._eval_dict,
-          all_hooks=self._all_hooks,
-          output_dir=self._eval_dir)
-
-    self._timer.update_last_triggered_step(self._iter_count)
-
-  def after_run(self, run_context, run_values):  # pylint: disable=unused-argument
-    """Runs evaluator."""
-    self._iter_count += 1
-    if self._timer.should_trigger_for_step(self._iter_count):
-      self._evaluate(run_context.session)
-
-  def end(self, session):  # pylint: disable=unused-argument
-    """Runs evaluator for final model."""
-    self._evaluate(session)
-
-
-class _StopAtCheckpointStepHook(training.SessionRunHook):
-  """Hook that requests stop at a specified step based on checkpoint.
-
-  Note: We recommend using 'make_stop_at_checkpoint_step_hook` to get the proper
-  hook.
-  """
-
-  def __init__(self, model_dir, last_step,
-               wait_after_file_check_secs=30):
-    """Initializes a `StopAtCheckpointStepHook`.
-
-    This hook requests stop after a last step has been reached. It checks latest
-    checkpoint to verify last step is written on disk or not.
-
-    Args:
-      model_dir: Directory to read global step from latest checkpoint.
-      last_step: Step after which to stop.
-      wait_after_file_check_secs: Reading same file by many workers may create
-      I/O issues. To throttle that we will wait given secs after each read of
-      the file.
-
-    Raises:
-      ValueError: If one of the arguments is invalid.
-    """
-    if last_step is None:
-      raise ValueError('last_step must be specified.')
-    if model_dir is None:
-      raise ValueError('model_dir must be specified.')
-
-    self._model_dir = model_dir
-    self._last_step = last_step
-    self._wait_after_file_check_secs = wait_after_file_check_secs
-
-  def begin(self):
-    self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
-    if self._global_step_tensor is None:
-      raise RuntimeError(
-          'Global step should be created to use StopAtCheckpointStepHook.')
-
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    return training.SessionRunArgs(self._global_step_tensor)
-
-  def after_run(self, run_context, run_values):
-    global_step = run_values.results + 1
-    if global_step >= self._last_step:
-      # Check latest global step in the checkpoint to ensure that the targeted
-      # last step is written on disk.
-
-      step = estimator_lib._load_global_step_from_checkpoint_dir(
-          self._model_dir)
-      if step >= self._last_step:
-        run_context.request_stop()
-      else:
-        time.sleep(self._wait_after_file_check_secs)
-
-
-def make_stop_at_checkpoint_step_hook(estimator,
-                                      last_step,
-                                      wait_after_file_check_secs=30):
-  """Creates a proper StopAtCheckpointStepHook based on chief status."""
+from tensorflow_estimator.contrib.estimator.python.estimator import hooks
 
-  if estimator.config.is_chief:
-    return training.StopAtStepHook(last_step=last_step)
-  return _StopAtCheckpointStepHook(
-      model_dir=estimator.model_dir,
-      last_step=last_step,
-      wait_after_file_check_secs=wait_after_file_check_secs)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+hooks.__all__ = [s for s in dir(hooks) if not s.startswith('__')]
 
-# pylint: enable=protected-access
+from tensorflow_estimator.contrib.estimator.python.estimator.hooks import *
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks_test.py b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
deleted file mode 100644
index 62ffad56da324ea3765dfdac64f3ef00d9b17a38..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/hooks_test.py
+++ /dev/null
@@ -1,403 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for hooks."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import glob
-import json
-import os
-import tempfile
-import time
-
-from tensorflow.contrib.estimator.python.estimator import hooks as hooks_lib
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator_lib
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-from tensorflow.python.summary import summary_iterator
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import training
-
-
-def summary_step_keyword_to_value_mapping(dir_):
-  writer_cache.FileWriterCache.clear()
-
-  # Get last Event written.
-  event_paths = glob.glob(os.path.join(dir_, 'events*'))
-  step_keyword_to_value = {}
-  for last_event in summary_iterator.summary_iterator(event_paths[-1]):
-    if last_event.step not in step_keyword_to_value:
-      step_keyword_to_value[last_event.step] = {}
-    if last_event.summary is not None:
-      for value in last_event.summary.value:
-        step_keyword_to_value[last_event.step][value.tag] = value.simple_value
-
-  return step_keyword_to_value
-
-
-def get_summary_value(dir_, step, keyword):
-  """Get summary value for given step and keyword."""
-
-  writer_cache.FileWriterCache.clear()
-  # Get last Event written.
-  event_paths = glob.glob(os.path.join(dir_, 'events*'))
-  print('XXX', event_paths)
-  for last_event in summary_iterator.summary_iterator(event_paths[-1]):
-    if last_event.step == step and last_event.summary is not None:
-      for value in last_event.summary.value:
-        if keyword in value.tag:
-          return value.simple_value
-  return None
-
-
-class InMemoryEvaluatorHookTest(test.TestCase):
-
-  def test_runs_eval_metrics(self):
-
-    def model_fn(features, labels, mode):
-      _ = labels
-      if estimator_lib.ModeKeys.TRAIN == mode:
-        with ops.control_dependencies([features]):
-          train_op = state_ops.assign_add(training.get_global_step(), 1)
-        return estimator_lib.EstimatorSpec(
-            mode, loss=constant_op.constant(3.), train_op=train_op)
-      if estimator_lib.ModeKeys.EVAL == mode:
-        return estimator_lib.EstimatorSpec(
-            mode,
-            loss=constant_op.constant(5.),
-            eval_metric_ops={'mean_of_features': metrics_lib.mean(features)})
-
-    estimator = estimator_lib.Estimator(model_fn=model_fn)
-
-    def input_fn():
-      return dataset_ops.Dataset.range(10)
-
-    evaluator = hooks_lib.InMemoryEvaluatorHook(
-        estimator, input_fn, every_n_iter=4)
-    estimator.train(input_fn, hooks=[evaluator])
-
-    self.assertTrue(os.path.isdir(estimator.eval_dir()))
-    step_keyword_to_value = summary_step_keyword_to_value_mapping(
-        estimator.eval_dir())
-
-    # 4.5 = sum(range(10))/10
-    # before training
-    self.assertEqual(4.5, step_keyword_to_value[0]['mean_of_features'])
-    # intervals (every_n_iter=4)
-    self.assertEqual(4.5, step_keyword_to_value[4]['mean_of_features'])
-    self.assertEqual(4.5, step_keyword_to_value[8]['mean_of_features'])
-    # end
-    self.assertEqual(4.5, step_keyword_to_value[10]['mean_of_features'])
-    self.assertEqual(set([0, 4, 8, 10]), set(step_keyword_to_value.keys()))
-
-  def test_uses_latest_variable_value(self):
-
-    def model_fn(features, labels, mode):
-      _ = labels
-      step = training.get_global_step()
-      w = variable_scope.get_variable(
-          'w',
-          shape=[],
-          initializer=init_ops.zeros_initializer(),
-          dtype=dtypes.int64)
-      if estimator_lib.ModeKeys.TRAIN == mode:
-        # to consume features, we have control dependency
-        with ops.control_dependencies([features]):
-          step_inc = state_ops.assign_add(training.get_global_step(), 1)
-        with ops.control_dependencies([step_inc]):
-          assign_w_to_step_plus_2 = w.assign(step + 2)
-        return estimator_lib.EstimatorSpec(
-            mode,
-            loss=constant_op.constant(3.),
-            train_op=assign_w_to_step_plus_2)
-      if estimator_lib.ModeKeys.EVAL == mode:
-        # to consume features, we have control dependency
-        with ops.control_dependencies([features]):
-          loss = constant_op.constant(5.)
-        return estimator_lib.EstimatorSpec(
-            mode,
-            loss=loss,
-            # w is constant in each step, so the mean.
-            # w = 0 if step==0 else step+2
-            eval_metric_ops={'mean_of_const': metrics_lib.mean(w)})
-
-    estimator = estimator_lib.Estimator(model_fn=model_fn)
-
-    def input_fn():
-      return dataset_ops.Dataset.range(10)
-
-    evaluator = hooks_lib.InMemoryEvaluatorHook(
-        estimator, input_fn, every_n_iter=4)
-    estimator.train(input_fn, hooks=[evaluator])
-
-    self.assertTrue(os.path.isdir(estimator.eval_dir()))
-    step_keyword_to_value = summary_step_keyword_to_value_mapping(
-        estimator.eval_dir())
-    # w = 0 if step==0 else step+2
-    self.assertEqual(0, step_keyword_to_value[0]['mean_of_const'])
-    self.assertEqual(6, step_keyword_to_value[4]['mean_of_const'])
-    self.assertEqual(12, step_keyword_to_value[10]['mean_of_const'])
-
-  def test_dnn_classifier(self):
-    embedding = feature_column_lib.embedding_column(
-        feature_column_lib.categorical_column_with_vocabulary_list(
-            'wire_cast', ['kima', 'omar', 'stringer']), 8)
-    dnn = estimator_lib.DNNClassifier(
-        feature_columns=[embedding], hidden_units=[3, 1])
-
-    def train_input_fn():
-      return dataset_ops.Dataset.from_tensors(({
-          'wire_cast': [['omar'], ['kima']]
-      }, [[0], [1]])).repeat(3)
-
-    def eval_input_fn():
-      return dataset_ops.Dataset.from_tensors(({
-          'wire_cast': [['stringer'], ['kima']]
-      }, [[0], [1]])).repeat(2)
-
-    evaluator = hooks_lib.InMemoryEvaluatorHook(
-        dnn, eval_input_fn, name='in-memory')
-    dnn.train(train_input_fn, hooks=[evaluator])
-    self.assertTrue(os.path.isdir(dnn.eval_dir('in-memory')))
-    step_keyword_to_value = summary_step_keyword_to_value_mapping(
-        dnn.eval_dir('in-memory'))
-
-    final_metrics = dnn.evaluate(eval_input_fn)
-    step = final_metrics[ops.GraphKeys.GLOBAL_STEP]
-    for summary_tag in final_metrics:
-      if summary_tag == ops.GraphKeys.GLOBAL_STEP:
-        continue
-      self.assertEqual(final_metrics[summary_tag],
-                       step_keyword_to_value[step][summary_tag])
-
-  def test_raise_error_with_multi_worker(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        }
-    }
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
-      dnn = estimator_lib.DNNClassifier(
-          feature_columns=[feature_column_lib.numeric_column('x')],
-          hidden_units=[3, 1])
-
-    def eval_input_fn():
-      pass
-
-    with self.assertRaisesRegexp(ValueError, 'supports only single machine'):
-      hooks_lib.InMemoryEvaluatorHook(dnn, eval_input_fn)
-
-  def test_raise_error_with_ps(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        }
-    }
-    with test.mock.patch.dict('os.environ',
-                              {'TF_CONFIG': json.dumps(tf_config)}):
-      dnn = estimator_lib.DNNClassifier(
-          feature_columns=[feature_column_lib.numeric_column('x')],
-          hidden_units=[3, 1])
-
-    def eval_input_fn():
-      pass
-
-    with self.assertRaisesRegexp(ValueError, 'supports only single machine'):
-      hooks_lib.InMemoryEvaluatorHook(dnn, eval_input_fn)
-
-  def test_raise_error_with_custom_saver_in_eval(self):
-
-    def model_fn(features, labels, mode):
-      _, _ = features, labels
-      return estimator_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(3.),
-          scaffold=training.Scaffold(saver=training.Saver()),
-          train_op=constant_op.constant(5.),
-          eval_metric_ops={
-              'mean_of_features': metrics_lib.mean(constant_op.constant(2.))
-          })
-
-    estimator = estimator_lib.Estimator(model_fn=model_fn)
-
-    def input_fn():
-      return dataset_ops.Dataset.range(10)
-
-    evaluator = hooks_lib.InMemoryEvaluatorHook(estimator, input_fn)
-    with self.assertRaisesRegexp(ValueError, 'does not support custom saver'):
-      evaluator.begin()
-
-  def test_raise_error_with_custom_init_fn_in_eval(self):
-
-    def model_fn(features, labels, mode):
-      _, _ = features, labels
-
-      def init_fn(scaffold, session):
-        _, _ = scaffold, session
-
-      return estimator_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(3.),
-          scaffold=training.Scaffold(init_fn=init_fn),
-          train_op=constant_op.constant(5.),
-          eval_metric_ops={
-              'mean_of_features': metrics_lib.mean(constant_op.constant(2.))
-          })
-
-    estimator = estimator_lib.Estimator(model_fn=model_fn)
-
-    def input_fn():
-      return dataset_ops.Dataset.range(10)
-
-    evaluator = hooks_lib.InMemoryEvaluatorHook(estimator, input_fn)
-    with self.assertRaisesRegexp(ValueError, 'does not support custom init_fn'):
-      evaluator.begin()
-
-  def test_raise_error_with_saveables_other_than_global_variables(self):
-
-    def model_fn(features, labels, mode):
-      _, _ = features, labels
-      w = variables.VariableV1(
-          initial_value=[0.],
-          trainable=False,
-          collections=[ops.GraphKeys.SAVEABLE_OBJECTS])
-      init_op = control_flow_ops.group(
-          [w.initializer, training.get_global_step().initializer])
-      return estimator_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(3.),
-          scaffold=training.Scaffold(init_op=init_op),
-          train_op=constant_op.constant(5.),
-          eval_metric_ops={
-              'mean_of_features': metrics_lib.mean(constant_op.constant(2.))
-          })
-
-    estimator = estimator_lib.Estimator(model_fn=model_fn)
-
-    def input_fn():
-      return dataset_ops.Dataset.range(10)
-
-    evaluator = hooks_lib.InMemoryEvaluatorHook(estimator, input_fn)
-    with self.assertRaisesRegexp(ValueError, 'does not support saveables'):
-      estimator.train(input_fn, hooks=[evaluator])
-
-
-class StopAtCheckpointStepHookTest(test.TestCase):
-
-  def test_do_not_stop_if_checkpoint_is_not_there(self):
-    with ops.Graph().as_default():
-      step = training.create_global_step()
-      assign_ten = step.assign(10)
-      no_op = control_flow_ops.no_op()
-      hook = hooks_lib._StopAtCheckpointStepHook(
-          model_dir=tempfile.mkdtemp(), last_step=10)
-      with training.SingularMonitoredSession(hooks=[hook]) as mon_sess:
-        mon_sess.raw_session().run(assign_ten)
-        with test.mock.patch.object(time, 'sleep') as mock_sleep:
-          mon_sess.run(no_op)
-          self.assertTrue(mock_sleep.called)
-        self.assertFalse(mon_sess.should_stop())
-
-  def test_do_not_stop_if_checkpoint_step_is_smaller(self):
-    model_dir = tempfile.mkdtemp()
-    with ops.Graph().as_default():
-      step = training.create_global_step()
-      assign_nine = step.assign(9)
-      assign_ten = step.assign(10)
-      no_op = control_flow_ops.no_op()
-      hook = hooks_lib._StopAtCheckpointStepHook(
-          model_dir=model_dir, last_step=10)
-      with tf_session.Session() as sess:
-        sess.run(assign_nine)
-        training.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
-      with training.SingularMonitoredSession(hooks=[hook]) as mon_sess:
-        mon_sess.raw_session().run(assign_ten)
-        with test.mock.patch.object(time, 'sleep') as mock_sleep:
-          mon_sess.run(no_op)
-          self.assertTrue(mock_sleep.called)
-        self.assertFalse(mon_sess.should_stop())
-
-  def test_stop_if_checkpoint_step_is_laststep(self):
-    model_dir = tempfile.mkdtemp()
-    with ops.Graph().as_default():
-      step = training.create_global_step()
-      assign_ten = step.assign(10)
-      no_op = control_flow_ops.no_op()
-      hook = hooks_lib._StopAtCheckpointStepHook(
-          model_dir=model_dir, last_step=10)
-      with tf_session.Session() as sess:
-        sess.run(assign_ten)
-        training.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
-      with training.SingularMonitoredSession(hooks=[hook]) as mon_sess:
-        mon_sess.raw_session().run(assign_ten)
-        with test.mock.patch.object(time, 'sleep') as mock_sleep:
-          mon_sess.run(no_op)
-          self.assertFalse(mock_sleep.called)
-        self.assertTrue(mon_sess.should_stop())
-
-  def test_creates_regular_stop_at_step_hook_for_chief(self):
-    # by default an estimator is in chief mode
-    dnn = estimator_lib.DNNClassifier(
-        feature_columns=[feature_column_lib.numeric_column('x')],
-        hidden_units=[3, 1])
-    hook = hooks_lib.make_stop_at_checkpoint_step_hook(dnn, 300)
-    self.assertIsInstance(hook, training.StopAtStepHook)
-    self.assertEqual(300, hook._last_step)
-
-  def test_creates_checkpoint_hook_for_workers(self):
-
-    class FakeWorkerConfig(estimator_lib.RunConfig):
-
-      @property
-      def is_chief(self):
-        return False
-
-    dnn = estimator_lib.DNNClassifier(
-        feature_columns=[feature_column_lib.numeric_column('x')],
-        hidden_units=[3, 1],
-        config=FakeWorkerConfig())
-    hook = hooks_lib.make_stop_at_checkpoint_step_hook(dnn, 300)
-    self.assertIsInstance(hook, hooks_lib._StopAtCheckpointStepHook)
-    self.assertEqual(300, hook._last_step)
-    self.assertEqual(dnn.model_dir, hook._model_dir)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/linear.py b/tensorflow/contrib/estimator/python/estimator/linear.py
deleted file mode 100644
index 2b68f24eb2d4c528bc1cb87e7d858014f66c0433..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/linear.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Linear estimator."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import linear as linear_lib
-
-
-class LinearEstimator(estimator.Estimator):
-  """An estimator for TensorFlow linear models with user-specified head.
-
-  Example:
-
-  ```python
-  categorical_column_a = categorical_column_with_hash_bucket(...)
-  categorical_column_b = categorical_column_with_hash_bucket(...)
-
-  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
-
-  # Estimator using the default optimizer.
-  estimator = LinearEstimator(
-      head=tf.contrib.estimator.multi_label_head(n_classes=3),
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b])
-
-  # Or estimator using an optimizer with a learning rate decay.
-  estimator = LinearEstimator(
-      head=tf.contrib.estimator.multi_label_head(n_classes=3),
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b],
-      optimizer=lambda: tf.train.FtrlOptimizer(
-          learning_rate=tf.exponential_decay(
-              learning_rate=0.1,
-              global_step=tf.get_global_step(),
-              decay_steps=10000,
-              decay_rate=0.96))
-
-  # Or estimator using the FTRL optimizer with regularization.
-  estimator = LinearEstimator(
-      head=tf.contrib.estimator.multi_label_head(n_classes=3),
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b])
-      optimizer=tf.train.FtrlOptimizer(
-          learning_rate=0.1,
-          l1_regularization_strength=0.001
-      ))
-
-  def input_fn_train: # returns x, y (where y represents label's class index).
-    ...
-  estimator.train(input_fn=input_fn_train, steps=100)
-  def input_fn_eval: # returns x, y (where y represents label's class index).
-    ...
-  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
-  def input_fn_predict: # returns x, None
-    ...
-  predictions = estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-  otherwise there will be a `KeyError`:
-
-  * if `weight_column` is not `None`, a feature with
-    `key=weight_column` whose value is a `Tensor`.
-  * for each `column` in `feature_columns`:
-    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
-      with `key` the id column name, the second with `key` the weight column
-      name. Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss and predicted output are determined by the specified head.
-
-  @compatibility(eager)
-  Estimators are not compatible with eager execution.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               head,
-               feature_columns,
-               model_dir=None,
-               optimizer='Ftrl',
-               config=None,
-               partitioner=None,
-               sparse_combiner='sum'):
-    """Initializes a `LinearEstimator` instance.
-
-    Args:
-      head: A `_Head` instance constructed with a method such as
-        `tf.contrib.estimator.multi_label_head`.
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
-        to continue training a previously saved model.
-      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
-        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
-        callable. Defaults to FTRL optimizer.
-      config: `RunConfig` object to configure the runtime settings.
-      partitioner: Optional. Partitioner for input layer.
-      sparse_combiner: A string specifying how to reduce if a categorical column
-        is multivalent.  One of "mean", "sqrtn", and "sum" -- these are
-        effectively different ways to do example-level normalization, which can
-        be useful for bag-of-words features. for more details, see
-        `tf.feature_column.linear_model`.
-    """
-    def _model_fn(features, labels, mode, config):
-      return linear_lib._linear_model_fn(  # pylint: disable=protected-access
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          feature_columns=tuple(feature_columns or []),
-          optimizer=optimizer,
-          partitioner=partitioner,
-          config=config,
-          sparse_combiner=sparse_combiner)
-    super(LinearEstimator, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/linear_test.py b/tensorflow/contrib/estimator/python/estimator/linear_test.py
deleted file mode 100644
index c41996b9c6871d294f157411662f2eb9d4c09e5c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/linear_test.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for linear.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.contrib.estimator.python.estimator import head as head_lib
-from tensorflow.contrib.estimator.python.estimator import linear
-from tensorflow.python.estimator.canned import linear_testing_utils
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import ops
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-
-
-def _linear_estimator_fn(
-    weight_column=None, label_dimension=1, *args, **kwargs):
-  """Returns a LinearEstimator that uses regression_head."""
-  return linear.LinearEstimator(
-      head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension,
-          # Tests in core (from which this test inherits) test the sum loss.
-          loss_reduction=losses.Reduction.SUM),
-      *args, **kwargs)
-
-
-class LinearEstimatorEvaluateTest(
-    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
-        self, _linear_estimator_fn)
-
-
-class LinearEstimatorPredictTest(
-    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
-        self, _linear_estimator_fn)
-
-
-class LinearEstimatorTrainTest(
-    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
-        self, _linear_estimator_fn)
-
-
-class LinearEstimatorIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(
-      self, train_input_fn, eval_input_fn, predict_input_fn, input_dimension,
-      label_dimension, batch_size):
-    feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))]
-    est = linear.LinearEstimator(
-        head=head_lib.regression_head(label_dimension=label_dimension),
-        feature_columns=feature_columns,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array([
-        x[prediction_keys.PredictionKeys.PREDICTIONS]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
-
-    # EXPORT
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self):
-    """Tests complete flow with numpy_input_fn."""
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/logit_fns.py b/tensorflow/contrib/estimator/python/estimator/logit_fns.py
index c8b0dd62970e341a3c6b176278fe1c2adfcd8d20..5e418558663c27d3c3db0c5a5a8c152e76d590cb 100644
--- a/tensorflow/contrib/estimator/python/estimator/logit_fns.py
+++ b/tensorflow/contrib/estimator/python/estimator/logit_fns.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,85 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Aliases for logit_fn builders used by canned (core) tf.Estimator's.
+"""logit_fns python module.
 
-A logit_fn is an abstraction within model_fn that factors out the logit
-construction logic.  Its output can be fed into Heads or otherwise composed.  It
-should follow the following signature:
-
-Args:
-`features`: This is the first item returned from the `input_fn` passed to
-            `train`, `evaluate`, and `predict`. This should be a single
-            `Tensor` or `dict` of same, and is the only required argument.
-`mode`: Optional. Specifies if this training, evaluation or prediction. See
-        `ModeKeys`.
-`params`: Optional `dict` of hyperparameters.  Will receive what is passed to
-          Estimator in `params` parameter. This allows configuration of
-          Estimators from hyperparameter tuning.
-`config`: Optional configuration object. Will receive what is passed to
-          Estimator in `config` parameter, or the default `config`. Allows
-          updating things in your model_fn based on configuration such as
-          `num_ps_replicas`, or `model_dir`.
-
-Returns:
-    A Tensor representing the logits.
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
 """
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-from tensorflow.python.estimator.canned import dnn as dnn_core
-from tensorflow.python.estimator.canned import linear as linear_core
-from tensorflow.python.framework import ops
-from tensorflow.python.util import function_utils
-
-# pylint: disable=protected-access
-dnn_logit_fn_builder = dnn_core._dnn_logit_fn_builder
-linear_logit_fn_builder = linear_core._linear_logit_fn_builder
-# pylint: enable=protected-access
-
-
-def call_logit_fn(logit_fn, features, mode, params, config):
-  """Calls logit_fn.
-
-  A utility function that calls the provided logit_fn with the relevant subset
-  of provided arguments.  Similar to tf.estimator._call_model_fn().
-
-  Args:
-    logit_fn: A logit_fn as defined above.
-    features: The features dict.
-    mode: TRAIN / EVAL / PREDICT ModeKeys.
-    params: The hyperparameter dict.
-    config: The configuration object.
-
-  Returns:
-    A logit Tensor, the output of logit_fn.
-
-  Raises:
-    ValueError: if logit_fn does not return a Tensor or a dictionary mapping
-      strings to Tensors.
-  """
-  logit_fn_args = function_utils.fn_args(logit_fn)
-  kwargs = {}
-  if 'mode' in logit_fn_args:
-    kwargs['mode'] = mode
-  if 'params' in logit_fn_args:
-    kwargs['params'] = params
-  if 'config' in logit_fn_args:
-    kwargs['config'] = config
-  logit_fn_results = logit_fn(features=features, **kwargs)
-
-  result_is_valid_dictionary = (
-      isinstance(logit_fn_results, dict) and
-      all([(isinstance(k, six.string_types) and isinstance(v, ops.Tensor))
-           for k, v in six.iteritems(logit_fn_results)]))
-  result_is_tensor = isinstance(logit_fn_results, ops.Tensor)
+from tensorflow_estimator.contrib.estimator.python.estimator import logit_fns
 
-  if not (result_is_valid_dictionary or result_is_tensor):
-    raise ValueError('logit_fn should return a Tensor or a dictionary mapping '
-                     'strings to Tensors.  logit_fn returned: %s' %
-                     logit_fn_results)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+logit_fns.__all__ = [s for s in dir(logit_fns) if not s.startswith('__')]
 
-  return logit_fn_results
+from tensorflow_estimator.contrib.estimator.python.estimator.logit_fns import *
diff --git a/tensorflow/contrib/estimator/python/estimator/logit_fns_test.py b/tensorflow/contrib/estimator/python/estimator/logit_fns_test.py
deleted file mode 100644
index 074ece6cca2865b9057ab5ce874a210d3d9ac2e0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/logit_fns_test.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""logit_fn tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.estimator.python.estimator import logit_fns
-from tensorflow.python.client import session
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.framework import constant_op
-from tensorflow.python.platform import test
-
-
-class LogitFnTest(test.TestCase):
-
-  def test_simple_call_logit_fn(self):
-    def dummy_logit_fn(features, mode):
-      if mode == model_fn.ModeKeys.TRAIN:
-        return features['f1']
-      else:
-        return features['f2']
-    features = {
-        'f1': constant_op.constant([[2., 3.]]),
-        'f2': constant_op.constant([[4., 5.]])
-    }
-    logit_fn_result = logit_fns.call_logit_fn(
-        dummy_logit_fn, features, model_fn.ModeKeys.EVAL, 'fake_params',
-        'fake_config')
-    with session.Session():
-      self.assertAllClose([[4., 5.]], logit_fn_result.eval())
-
-  def test_simple_call_multi_logit_fn(self):
-
-    def dummy_logit_fn(features):
-      return {u'head1': features['f1'], 'head2': features['f2']}
-
-    features = {
-        'f1': constant_op.constant([[2., 3.]]),
-        'f2': constant_op.constant([[4., 5.]])
-    }
-    logit_fn_result = logit_fns.call_logit_fn(dummy_logit_fn, features,
-                                              model_fn.ModeKeys.TRAIN,
-                                              'fake_params', 'fake_config')
-    with session.Session():
-      self.assertAllClose([[2., 3.]], logit_fn_result['head1'].eval())
-      self.assertAllClose([[4., 5.]], logit_fn_result['head2'].eval())
-
-  def test_invalid_logit_fn_results(self):
-
-    def invalid_logit_fn(features, params):
-      return [
-          features['f1'] * params['input_multiplier'],
-          features['f2'] * params['input_multiplier']
-      ]
-
-    features = {
-        'f1': constant_op.constant([[2., 3.]]),
-        'f2': constant_op.constant([[4., 5.]])
-    }
-    params = {'learning_rate': 0.001, 'input_multiplier': 2.0}
-    with self.assertRaisesRegexp(
-        ValueError, 'logit_fn should return a Tensor or a dictionary mapping '
-                    'strings to Tensors'):
-      logit_fns.call_logit_fn(invalid_logit_fn, features, 'fake_mode', params,
-                              'fake_config')
-
-  def test_invalid_logit_fn_results_dict(self):
-
-    def invalid_logit_fn(features):
-      return {'head1': features['f1'], 'head2': features['f2']}
-
-    features = {'f1': constant_op.constant([[2., 3.]]), 'f2': 'some string'}
-    with self.assertRaisesRegexp(
-        ValueError, 'logit_fn should return a Tensor or a dictionary mapping '
-                    'strings to Tensors'):
-      logit_fns.call_logit_fn(invalid_logit_fn, features, 'fake_mode',
-                              'fake_params', 'fake_config')
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index 6e793c830244e64cd11c4054918c18a8251be7ac..6cf2917df3e1822a6e87462e331da57bdb596413 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,413 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Abstractions for the head(s) of a model.
+"""multi_head python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
 """
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.export import export_output as export_output_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.summary import summary
-from tensorflow.python.training import training_util
-
-
-_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-
-
-def multi_head(heads, head_weights=None):
-  """Creates a `_Head` for multi-objective learning.
-
-  This class merges the output of multiple `_Head` objects.
-  Specifically:
-  * For training, sums losses of each head, calls `train_op_fn` with this
-    final loss.
-  * For eval, merges metrics by adding `head.name` suffix to the keys in eval
-    metrics, such as `precision/head1`, `precision/head2`.
-  * For prediction, merges predictions and updates keys in prediction dict to a
-    2-tuple, `(head.name, prediction_key)`. Merges `export_outputs` such that
-    by default the first head is served.
-
-  Usage:
-
-  ```python
-  # In `input_fn` specify labels as a dict keyed by head name:
-  def input_fn():
-    features = ...
-    labels1 = ...
-    labels2 = ...
-    return features, {'head1': labels1, 'head2': labels2}
-
-  # In `model_fn`, specify logits as a dict keyed by head name:
-  def model_fn(features, labels, mode):
-    # Create simple heads and specify head name.
-    head1 = multi_class_head(n_classes=3, name='head1')
-    head2 = binary_classification_head(name='head2')
-    # Create multi-head from two simple heads.
-    head = multi_head([head1, head2])
-    # Create logits for each head, and combine them into a dict.
-    logits1, logits2 = logit_fn()
-    logits = {'head1': logits1, 'head2': logits2}
-    # Return the merged EstimatorSpec
-    return head.create_estimator_spec(..., logits=logits, ...)
-
-  # Create an estimator with this model_fn.
-  estimator = tf.estimator.Estimator(model_fn=model_fn)
-  estimator.train(input_fn=input_fn, steps=100)
-  ```
-
-  Also supports `logits` as a `Tensor` of shape
-  `[D0, D1, ... DN, logits_dimension]`. It will split the `Tensor` along the
-  last dimension and distribute it appropriately among the heads. E.g.:
-
-  ```python
-  def model_fn(features, labels, mode):
-    # Create simple heads and specify head name.
-    head1 = multi_class_head(n_classes=3, name='head1')
-    head2 = binary_classification_head(name='head2')
-    # Create multi-head from two simple heads.
-    head = multi_head([head1, head2])
-    # Create logits for the multihead.
-    logits = logit_fn(logits_dimension=head.logits_dimension)
-    # Return the merged EstimatorSpec
-    return head.create_estimator_spec(..., logits=logits, ...)
-  ```
-
-  Args:
-    heads: List or tuple of `_Head` instances. All heads must have `name`
-      specified. The first head in the list is the default used at serving time.
-    head_weights: Optional list of weights, same length as `heads`. Used when
-      merging losses to calculate the weighted sum of losses from each head. If
-      `None`, all losses are weighted equally.
-
-  Returns:
-    A instance of `_Head` that merges multiple heads.
-
-  Raises:
-    ValueError: If `heads` is empty.
-    ValueError: If any of the `heads` does not have `name` specified.
-    ValueError: If `heads` and `head_weights` have different size.
-  """
-  if head_weights:
-    if len(head_weights) != len(heads):
-      raise ValueError(
-          'heads and head_weights must have the same size. '
-          'Given len(heads): {}. Given len(head_weights): {}.'.format(
-              len(heads), len(head_weights)))
-  if not heads:
-    raise ValueError('Must specify heads. Given: {}'.format(heads))
-  for head in heads:
-    if not head.name:
-      raise ValueError(
-          'All given heads must have name specified. '
-          'Given: {}'.format(head))
-
-  return _MultiHead(
-      heads=tuple(heads),
-      head_weights=tuple(head_weights) if head_weights else tuple())
-
-
-def _no_op_train_fn(loss):
-  del loss
-  return control_flow_ops.no_op()
-
-
-def _merge_losses(losses, head_weights=None):
-  """Merges the given losses into one tensor."""
-  losses = tuple(losses)
-  with ops.name_scope(
-      'merge_losses', values=losses + (head_weights or tuple())):
-    if head_weights:
-      weighted_losses = []
-      for loss, weight in zip(losses, head_weights):
-        weighted_losses.append(math_ops.multiply(loss, weight))
-    else:
-      weighted_losses = losses
-    return math_ops.add_n(weighted_losses)
-
-
-def _default_export_output(export_outputs, head_name):
-  """Extracts the default export output from the given export_outputs dict."""
-  if len(export_outputs) == 1:
-    return next(six.itervalues(export_outputs))
-  for k, v in six.iteritems(export_outputs):
-    if k == _DEFAULT_SERVING_KEY:
-      return v
-  raise ValueError(
-      '{} did not specify default export_outputs. '
-      'Given: {} '
-      'Suggested fix: Use one of the heads in tf.contrib.estimator, or include '
-      'key {} in export_outputs.'.format(
-          head_name, export_outputs, _DEFAULT_SERVING_KEY))
-
-
-class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
-  """`_Head` for multi objective learning."""
-
-  def __init__(self, heads, head_weights):
-    self._logits_dimension = 0
-    for head in heads:
-      self._logits_dimension += head.logits_dimension
-
-    self._heads = heads
-    self._head_weights = head_weights
-
-  @property
-  def name(self):
-    return '_'.join([h.name for h in self._heads])
-
-  @property
-  def logits_dimension(self):
-    return self._logits_dimension
-
-  def create_loss(self, features, mode, logits, labels):
-    """See `Head`."""
-    if isinstance(logits, dict):
-      logits_dict = logits
-    else:
-      logits_dict = self._split_logits(logits)
-    training_losses = []
-    labels_by_head = {}
-    unreduced_losses_by_head = {}
-    example_weights_by_head = {}
-    for i, head in enumerate(self._heads):
-      (training_loss, unreduced_loss,
-       weights, processed_labels) = head.create_loss(
-           features, mode, logits_dict[head.name], labels[head.name])
-      training_losses.append(training_loss)
-      labels_by_head[head.name] = processed_labels
-      if self._head_weights:
-        head_weight = self._head_weights[i]
-        unreduced_losses_by_head[head.name] = math_ops.multiply(
-            unreduced_loss, head_weight)
-        example_weights_by_head[head.name] = math_ops.multiply(
-            weights, head_weight)
-      else:
-        unreduced_losses_by_head[head.name] = unreduced_loss
-        example_weights_by_head[head.name] = weights
-
-    training_losses = tuple(training_losses)
-    with ops.name_scope(
-        'merge_losses',
-        values=training_losses + (self._head_weights or tuple())):
-      if self._head_weights:
-        head_weighted_training_losses = []
-        for training_loss, head_weight in zip(
-            training_losses, self._head_weights):
-          head_weighted_training_losses.append(
-              math_ops.multiply(training_loss, head_weight))
-        merged_training_loss = math_ops.add_n(head_weighted_training_losses)
-      else:
-        merged_training_loss = math_ops.add_n(training_losses)
-
-    return head_lib.LossSpec(
-        training_loss=merged_training_loss,
-        unreduced_loss=unreduced_losses_by_head,
-        weights=example_weights_by_head,
-        processed_labels=labels_by_head)
-
-  # TODO(b/65403806): Support regularization_losses arg.
-  def create_estimator_spec(
-      self, features, mode, logits, labels=None, optimizer=None,
-      train_op_fn=None):
-    """See `_Head`."""
-    return self._create_estimator_spec(
-        features=features, mode=mode, logits=logits, labels=labels,
-        optimizer=optimizer, train_op_fn=train_op_fn, use_tpu=False)
-
-  def _create_tpu_estimator_spec(
-      self, features, mode, logits, labels=None, optimizer=None,
-      train_op_fn=None):
-    """See `_Head`."""
-    return self._create_estimator_spec(
-        features=features, mode=mode, logits=logits, labels=labels,
-        optimizer=optimizer, train_op_fn=train_op_fn, use_tpu=True)
-
-  def _create_estimator_spec(
-      self, features, mode, logits, labels=None, optimizer=None,
-      train_op_fn=None, use_tpu=False):
-    """Returns `EstimatorSpec` or `TPUEstimatorSpec`."""
-    if isinstance(logits, dict):
-      logits_dict = logits
-    else:
-      logits_dict = self._split_logits(logits)
-    if labels and not isinstance(labels, dict):
-      raise ValueError('labels must be a dict. Given: {}'.format(labels))
-
-    all_estimator_spec = []
-    for head in self._heads:
-      head_name = head.name
-      all_estimator_spec.append(
-          head.create_estimator_spec(
-              features=features,
-              mode=mode,
-              logits=logits_dict[head_name],
-              labels=labels[head_name] if labels else None,
-              train_op_fn=_no_op_train_fn))
-
-    if mode == model_fn.ModeKeys.TRAIN:
-      spec = self._merge_train(
-          all_estimator_spec=all_estimator_spec,
-          optimizer=optimizer,
-          train_op_fn=train_op_fn,
-          use_tpu=use_tpu)
-      with ops.name_scope(''):
-        summary.scalar(metric_keys.MetricKeys.LOSS, spec.loss)
-      return spec
-    if mode == model_fn.ModeKeys.PREDICT:
-      return self._merge_predict(all_estimator_spec, use_tpu=use_tpu)
-    if mode == model_fn.ModeKeys.EVAL:
-      return self._merge_eval(all_estimator_spec, use_tpu=use_tpu)
-    raise ValueError('mode={} unrecognized'.format(mode))
-
-  def _split_logits(self, logits):
-    """Splits logits along the last dimension and returns a dict."""
-    logits_dict = {}
-    with ops.name_scope(None, 'split_logits', values=[logits]):
-      logits = ops.convert_to_tensor(logits)
-      batch_shape = array_ops.shape(logits)[:-1]
-      zeros_like_batch_shape = array_ops.zeros_like(batch_shape)
-      minus_ones_like_batch_shape = -1 * array_ops.ones_like(batch_shape)
-      begin_idx = 0
-      for head in self._heads:
-        begin_tensor = array_ops.concat(
-            [zeros_like_batch_shape, [begin_idx]], axis=0)
-        size_tensor = array_ops.concat(
-            [minus_ones_like_batch_shape, [head.logits_dimension]], axis=0)
-        logits_dict[head.name] = array_ops.slice(
-            logits, begin=begin_tensor, size=size_tensor)
-        begin_idx += head.logits_dimension
-    return logits_dict
-
-  def _merge_train(
-      self, all_estimator_spec, optimizer, train_op_fn, use_tpu=False):
-    """Merges list of `EstimatorSpec` or `TPUEstimatorSpec` for training.
-
-    Args:
-      all_estimator_spec: list of `EstimatorSpec` or `TPUEstimatorSpec` for the
-        individual heads.
-      optimizer: `Optimizer` instance to create train op. See
-        `create_estimator_spec` documentation for more details.
-      train_op_fn: Function to create train op. Used if `optimizer` is `None`.
-      use_tpu: If `True`, returns `TPUEstimatorSpec`.
-
-    Returns:
-      `EstimatorSpec` or `TPUEstimatorSpec` that merges all heads for TRAIN.
-
-    Raises:
-      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
-        mode.
-    """
-    losses = []
-    for spec in all_estimator_spec:
-      losses.append(spec.loss)
-    loss = _merge_losses(losses, self._head_weights)
-    if optimizer is not None:
-      if train_op_fn is not None:
-        raise ValueError('train_op_fn and optimizer cannot both be set.')
-      train_op = optimizer.minimize(
-          loss, global_step=training_util.get_global_step())
-    elif train_op_fn is not None:
-      train_op = train_op_fn(loss)
-    else:
-      raise ValueError('train_op_fn and optimizer cannot both be None.')
-
-    spec_type = (
-        model_fn._TPUEstimatorSpec if use_tpu else model_fn.EstimatorSpec)  # pylint:disable=protected-access
-    return spec_type(
-        mode=model_fn.ModeKeys.TRAIN,
-        loss=loss,
-        train_op=train_op)
-
-  def _merge_predict(self, all_estimator_spec, use_tpu=False):
-    """Merges list of `EstimatorSpec` or `TPUEstimatorSpec` for prediction.
-
-    Args:
-      all_estimator_spec: list of `EstimatorSpec` or `TPUEstimatorSpec` for the
-        individual heads.
-      use_tpu: If `True`, returns `TPUEstimatorSpec`.
-
-    Returns:
-      `EstimatorSpec` or `TPUEstimatorSpec` that merges all heads for PREDICT.
-    """
-    predictions = {}
-    export_outputs = {
-        _DEFAULT_SERVING_KEY: _default_export_output(
-            all_estimator_spec[0].export_outputs,
-            self._heads[0].name),
-    }
-    merged_predict_outputs = {}
-    for head, spec in zip(self._heads, all_estimator_spec):
-      head_name = head.name
-      for k, v in six.iteritems(spec.export_outputs):
-        if k == _DEFAULT_SERVING_KEY:
-          key = head_name
-        else:
-          key = '%s/%s' % (head_name, k)
-        export_outputs[key] = v
-        if (k == head_lib._PREDICT_SERVING_KEY and  # pylint:disable=protected-access
-            isinstance(v, export_output_lib.PredictOutput)):
-          for kp, vp in six.iteritems(v.outputs):
-            key = '%s/%s' % (head_name, kp)
-            merged_predict_outputs[key] = vp
-      for k, v in six.iteritems(spec.predictions):
-        predictions[(head_name, k)] = v
-    export_outputs[head_lib._PREDICT_SERVING_KEY] = (  # pylint:disable=protected-access
-        export_output_lib.PredictOutput(merged_predict_outputs))
-
-    spec_type = (
-        model_fn._TPUEstimatorSpec if use_tpu else model_fn.EstimatorSpec)  # pylint:disable=protected-access
-    return spec_type(
-        mode=model_fn.ModeKeys.PREDICT,
-        predictions=predictions,
-        export_outputs=export_outputs)
-
-  def _merge_eval(self, all_estimator_spec, use_tpu=False):
-    """Merges list of `EstimatorSpec` for eval.
-
-    Args:
-      all_estimator_spec: list of `EstimatorSpec` for the individual heads.
-      use_tpu: If `True`, will raise `NotImplementedError`, because TPU is not
-        yet supported for eval.
+from tensorflow_estimator.contrib.estimator.python.estimator import multi_head
 
-    Returns:
-      `EstimatorSpec` that merges all heads for EVAL.
-    Raises:
-      NotImplementedError: If `use_tpu` is `True`.
-    """
-    if use_tpu:
-      raise NotImplementedError(
-          'TPU evaluation is not implemented for multi_head.')
-    predictions = {}
-    metrics = {}
-    losses = []
-    with ops.name_scope('merge_eval'):
-      for head, spec in zip(self._heads, all_estimator_spec):
-        losses.append(spec.loss)
-        head_name = head.name
-        # Loss metric is not added by default.
-        loss_name = head_lib._summary_key(  # pylint:disable=protected-access
-            head_name, metric_keys.MetricKeys.LOSS)
-        metrics[loss_name] = metrics_lib.mean(spec.loss, name=loss_name)
-        # Metric keys already contain head.name.
-        metrics.update(spec.eval_metric_ops or {})
-        for k, v in six.iteritems(spec.predictions):
-          predictions[(head_name, k)] = v
-      loss = _merge_losses(losses, self._head_weights)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+multi_head.__all__ = [s for s in dir(multi_head) if not s.startswith('__')]
 
-    return model_fn.EstimatorSpec(
-        mode=model_fn.ModeKeys.EVAL,
-        predictions=predictions,
-        loss=loss,
-        eval_metric_ops=metrics)
+from tensorflow_estimator.contrib.estimator.python.estimator.multi_head import *
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
deleted file mode 100644
index a602f87b4a2b4062efddf819522fb2d1eeceaabe..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ /dev/null
@@ -1,705 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for head."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-
-from tensorflow.contrib.estimator.python.estimator import head as head_lib
-from tensorflow.contrib.estimator.python.estimator import multi_head as multi_head_lib
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import signature_constants
-
-
-_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-
-
-def _initialize_variables(test_case, scaffold):
-  scaffold.finalize()
-  test_case.assertIsNone(scaffold.init_feed_dict)
-  test_case.assertIsNone(scaffold.init_fn)
-  scaffold.init_op.run()
-  scaffold.ready_for_local_init_op.eval()
-  scaffold.local_init_op.run()
-  scaffold.ready_op.eval()
-  test_case.assertIsNotNone(scaffold.saver)
-
-
-def _assert_simple_summaries(test_case, expected_summaries, summary_str,
-                             tol=1e-6):
-  """Assert summary the specified simple values.
-
-  Args:
-    test_case: test case.
-    expected_summaries: Dict of expected tags and simple values.
-    summary_str: Serialized `summary_pb2.Summary`.
-    tol: Tolerance for relative and absolute.
-  """
-  summary = summary_pb2.Summary()
-  summary.ParseFromString(summary_str)
-  test_case.assertAllClose(expected_summaries, {
-      v.tag: v.simple_value for v in summary.value
-  }, rtol=tol, atol=tol)
-
-
-def _assert_no_hooks(test_case, spec):
-  test_case.assertAllEqual([], spec.training_chief_hooks)
-  test_case.assertAllEqual([], spec.training_hooks)
-
-
-def _sigmoid(logits):
-  return 1 / (1 + np.exp(-logits))
-
-
-class MultiHeadTest(test.TestCase):
-
-  def setUp(self):
-    ops.reset_default_graph()
-
-  def test_no_heads(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'Must specify heads\. Given: \[\]'):
-      multi_head_lib.multi_head(heads=[])
-
-  def test_head_name_missing(self):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    head2 = head_lib.multi_label_head(n_classes=3)
-    with self.assertRaisesRegexp(
-        ValueError, r'All given heads must have name specified\.'):
-      multi_head_lib.multi_head([head1, head2])
-
-  def test_head_weights_wrong_size(self):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'heads and head_weights must have the same size\. '
-        r'Given len\(heads\): 2. Given len\(head_weights\): 1\.'):
-      multi_head_lib.multi_head([head1, head2], head_weights=[1.])
-
-  def test_name(self):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
-    multi_head = multi_head_lib.multi_head([head1, head2])
-    self.assertEqual('head1_head2', multi_head.name)
-
-  def _test_predict_two_heads_logits_dict(self, use_tpu):
-    """Tests predict with logits as dict."""
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
-    multi_head = multi_head_lib.multi_head([head1, head2])
-
-    logits = {
-        'head1': np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32),
-        'head2': np.array([[2., -2., 2.], [-3., 2., -2.]], dtype=np.float32)
-    }
-    expected_probabilities = {
-        'head1': _sigmoid(logits['head1']),
-        'head2': _sigmoid(logits['head2']),
-    }
-
-    if use_tpu:
-      spec = multi_head._create_tpu_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.PREDICT,
-          logits=logits).as_estimator_spec()
-    else:
-      spec = multi_head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.PREDICT,
-          logits=logits)
-
-    self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
-         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
-        spec.export_outputs.keys())
-
-    # Assert predictions and export_outputs.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      predictions = sess.run(spec.predictions)
-      self.assertAllClose(
-          logits['head1'],
-          predictions[('head1', prediction_keys.PredictionKeys.LOGITS)])
-      self.assertAllClose(
-          logits['head2'],
-          predictions[('head2', prediction_keys.PredictionKeys.LOGITS)])
-      self.assertAllClose(
-          expected_probabilities['head1'],
-          predictions[('head1', prediction_keys.PredictionKeys.PROBABILITIES)])
-      self.assertAllClose(
-          expected_probabilities['head2'],
-          predictions[('head2', prediction_keys.PredictionKeys.PROBABILITIES)])
-
-      self.assertAllClose(
-          expected_probabilities['head1'],
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].scores))
-      self.assertAllClose(
-          expected_probabilities['head1'],
-          sess.run(spec.export_outputs['head1'].scores))
-      self.assertAllClose(
-          expected_probabilities['head2'],
-          sess.run(spec.export_outputs['head2'].scores))
-      self.assertAllClose(
-          expected_probabilities['head1'],
-          sess.run(
-              spec.export_outputs['predict'].outputs['head1/probabilities']))
-      self.assertAllClose(
-          expected_probabilities['head2'],
-          sess.run(
-              spec.export_outputs['predict'].outputs['head2/probabilities']))
-      self.assertAllClose(
-          expected_probabilities['head1'],
-          sess.run(
-              spec.export_outputs['head1/predict'].outputs['probabilities']))
-      self.assertAllClose(
-          expected_probabilities['head2'],
-          sess.run(
-              spec.export_outputs['head2/predict'].outputs['probabilities']))
-
-  def test_predict_two_heads_logits_dict(self):
-    self._test_predict_two_heads_logits_dict(use_tpu=False)
-
-  def test_predict_two_heads_logits_dict_tpu(self):
-    self._test_predict_two_heads_logits_dict(use_tpu=True)
-
-  def test_predict_two_heads_logits_tensor(self):
-    """Tests predict with logits as Tensor."""
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
-    multi_head = multi_head_lib.multi_head([head1, head2])
-
-    logits = np.array(
-        [[-1., 1., 2., -2., 2.], [-1.5, 1., -3., 2., -2.]], dtype=np.float32)
-    expected_logits1 = np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32)
-    expected_logits2 = np.array([[2., -2., 2.], [-3., 2., -2.]],
-                                dtype=np.float32)
-    expected_probabilities = {
-        'head1': _sigmoid(expected_logits1),
-        'head2': _sigmoid(expected_logits2),
-    }
-
-    spec = multi_head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
-         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
-        spec.export_outputs.keys())
-
-    # Assert predictions and export_outputs.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      predictions = sess.run(spec.predictions)
-      self.assertAllClose(
-          expected_logits1,
-          predictions[('head1', prediction_keys.PredictionKeys.LOGITS)])
-      self.assertAllClose(
-          expected_logits2,
-          predictions[('head2', prediction_keys.PredictionKeys.LOGITS)])
-      self.assertAllClose(
-          expected_probabilities['head1'],
-          predictions[('head1', prediction_keys.PredictionKeys.PROBABILITIES)])
-      self.assertAllClose(
-          expected_probabilities['head2'],
-          predictions[('head2', prediction_keys.PredictionKeys.PROBABILITIES)])
-
-      self.assertAllClose(
-          expected_probabilities['head1'],
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].scores))
-      self.assertAllClose(
-          expected_probabilities['head1'],
-          sess.run(spec.export_outputs['head1'].scores))
-      self.assertAllClose(
-          expected_probabilities['head2'],
-          sess.run(spec.export_outputs['head2'].scores))
-
-  def test_predict_two_heads_logits_tensor_multi_dim(self):
-    """Tests predict with multi-dimensional logits of shape [2, 2, 5]."""
-    head1 = head_lib.regression_head(label_dimension=2, name='head1')
-    head2 = head_lib.regression_head(label_dimension=3, name='head2')
-    multi_head = multi_head_lib.multi_head([head1, head2])
-
-    logits = np.array(
-        [[[-1., 1., 2., -2., 2.], [-1., 1., 2., -2., 2.]],
-         [[-1.5, 1., -3., 2., -2.], [-1.5, 1., -3., 2., -2.]]],
-        dtype=np.float32)
-    expected_logits1 = np.array(
-        [[[-1., 1.], [-1., 1.]],
-         [[-1.5, 1.], [-1.5, 1.]]],
-        dtype=np.float32)
-    expected_logits2 = np.array(
-        [[[2., -2., 2.], [2., -2., 2.]],
-         [[-3., 2., -2.], [-3., 2., -2.]]],
-        dtype=np.float32)
-
-    spec = multi_head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/regression',
-         'head1/predict', 'head2', 'head2/regression', 'head2/predict'),
-        spec.export_outputs.keys())
-
-    # Assert predictions and export_outputs.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      predictions = sess.run(spec.predictions)
-      self.assertAllClose(
-          expected_logits1,
-          predictions[('head1', prediction_keys.PredictionKeys.PREDICTIONS)])
-      self.assertAllClose(
-          expected_logits2,
-          predictions[('head2', prediction_keys.PredictionKeys.PREDICTIONS)])
-
-      self.assertAllClose(
-          expected_logits1,
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].value))
-      self.assertAllClose(
-          expected_logits1,
-          sess.run(spec.export_outputs['head1'].value))
-      self.assertAllClose(
-          expected_logits2,
-          sess.run(spec.export_outputs['head2'].value))
-
-  def test_eval_two_heads_with_weights(self):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
-    multi_head = multi_head_lib.multi_head(
-        [head1, head2], head_weights=[1., 2.])
-
-    logits = {
-        'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
-        'head2': np.array([[20., -20., 20.], [-30., 20., -20.]],
-                          dtype=np.float32),
-    }
-    labels = {
-        'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
-        'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
-    }
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # head1: expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
-    # head2: expected_unweighted_loss = [[20., 20., 20.], [30., 0., 0]]
-    # loss = ( (20 + 20 + 20) / 3 + (30 + 0 + 0) / 3 ) / 2 = 15
-    expected_loss_head1 = 8.75
-    expected_loss_head2 = 15.
-    expected_loss = 1. * expected_loss_head1 + 2. * expected_loss_head2
-
-    spec = multi_head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS + '/head1': expected_loss_head1,
-        keys.LOSS + '/head2': expected_loss_head2,
-        # Average loss over examples.
-        keys.LOSS_MEAN + '/head1': expected_loss_head1,
-        keys.LOSS_MEAN + '/head2': expected_loss_head2,
-        # auc and auc_pr cannot be reliably calculated for only 4-6 samples, but
-        # this assert tests that the algorithm remains consistent.
-        keys.AUC + '/head1': 0.1667,
-        keys.AUC + '/head2': 0.3333,
-        keys.AUC_PR + '/head1': 0.6667,
-        keys.AUC_PR + '/head2': 0.5000,
-    }
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, and metrics.
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
-          rtol=tol,
-          atol=tol)
-
-  def test_eval_tpu(self):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
-    multi_head = multi_head_lib.multi_head(
-        [head1, head2], head_weights=[1., 2.])
-
-    logits = {
-        'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
-        'head2': np.array([[20., -20., 20.], [-30., 20., -20.]],
-                          dtype=np.float32),
-    }
-    labels = {
-        'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
-        'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
-    }
-
-    with self.assertRaisesRegexp(
-        NotImplementedError,
-        r'TPU evaluation is not implemented for multi_head\.'):
-      multi_head._create_tpu_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=logits,
-          labels=labels)
-
-  def test_train_create_loss_one_head(self):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    multi_head = multi_head_lib.multi_head([head1])
-
-    logits = {'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)}
-    labels = {'head1': np.array([[1, 0], [1, 1]], dtype=np.int64)}
-    loss = multi_head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)[0]
-    tol = 1e-3
-    with self.cached_session():
-      # Unreduced loss of the head is [[(10 + 10) / 2], (15 + 0) / 2]
-      # (averaged over classes, averaged over examples).
-      self.assertAllClose(8.75, loss.eval(), rtol=tol, atol=tol)
-
-  def test_train_create_loss_two_heads_with_weights(self):
-    # Use different example weighting for each head weighting.
-    weights1 = np.array([[1.], [2.]], dtype=np.float32)
-    weights2 = np.array([[2.], [3.]])
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1',
-                                      weight_column='weights1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2',
-                                      weight_column='weights2')
-    multi_head = multi_head_lib.multi_head(
-        [head1, head2], head_weights=[1., 2.])
-
-    logits = {
-        'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
-        'head2': np.array([[20., -20., 20.], [-30., 20., -20.]],
-                          dtype=np.float32),
-    }
-    labels = {
-        'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
-        'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
-    }
-    training_loss, unreduced_losses, weights, _ = multi_head.create_loss(
-        features={
-            'x': np.array(((42,),), dtype=np.int32),
-            'weights1': weights1,
-            'weights2': weights2
-        },
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    tol = 1e-3
-    with self.cached_session():
-      # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
-      # = [10, 7.5]
-      # training_loss = (1 * 10 + 2 * 7.5) / 2 = 12.5
-      # head-weighted unreduced_loss = 1 * [10, 7.5]
-      self.assertAllClose(
-          [[10.], [7.5]], unreduced_losses['head1'].eval(), rtol=tol, atol=tol)
-      # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
-      # = [20, 10]
-      # training_loss = (2 * 20 + 3 * 10) / 2 = 35
-      # head-weighted unreduced_loss = 2 * [20, 10]
-      self.assertAllClose(
-          [[40.], [20.]], unreduced_losses['head2'].eval(), rtol=tol, atol=tol)
-      # head-weighted training_loss = 1 * 12.5 + 2 * 35 = 82.5
-      self.assertAllClose(82.5, training_loss.eval(), rtol=tol, atol=tol)
-      # head-weighted example weights
-      self.assertAllClose(
-          [[1.], [2.]], weights['head1'].eval(), rtol=tol, atol=tol)
-      self.assertAllClose(
-          [[4.], [6.]], weights['head2'].eval(), rtol=tol, atol=tol)
-
-  def test_train_create_loss_logits_tensor(self):
-    """Tests create_loss with logits Tensor."""
-    weights1 = np.array([[1.], [2.]], dtype=np.float32)
-    weights2 = np.array([[2.], [3.]])
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1',
-                                      weight_column='weights1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2',
-                                      weight_column='weights2')
-    multi_head = multi_head_lib.multi_head(
-        [head1, head2], head_weights=[1., 2.])
-
-    logits = np.array([[-10., 10., 20., -20., 20.],
-                       [-15., 10., -30., 20., -20.]], dtype=np.float32)
-    labels = {
-        'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
-        'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
-    }
-    training_loss, unreduced_losses, weights, _ = multi_head.create_loss(
-        features={
-            'x': np.array(((42,),), dtype=np.int32),
-            'weights1': weights1,
-            'weights2': weights2
-        },
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    tol = 1e-3
-    with self.cached_session():
-      # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
-      # = [10, 7.5]
-      # training_loss = (1 * 10 + 2 * 7.5) / 2 = 12.5
-      # head-weighted unreduced_loss = 1 * [10, 7.5]
-      self.assertAllClose(
-          [[10.], [7.5]], unreduced_losses['head1'].eval(), rtol=tol, atol=tol)
-      # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
-      # = [20, 10]
-      # training_loss = (2 * 20 + 3 * 10) / 2 = 35
-      # head-weighted unreduced_loss = 2 * [20, 10]
-      self.assertAllClose(
-          [[40.], [20.]], unreduced_losses['head2'].eval(), rtol=tol, atol=tol)
-      # head-weighted training_loss = 1 * 12.5 + 2 * 35 = 82.5
-      self.assertAllClose(82.5, training_loss.eval(), rtol=tol, atol=tol)
-      # head-weighted example weights
-      self.assertAllClose(
-          [[1.], [2.]], weights['head1'].eval(), rtol=tol, atol=tol)
-      self.assertAllClose(
-          [[4.], [6.]], weights['head2'].eval(), rtol=tol, atol=tol)
-
-  def test_train_create_loss_logits_tensor_multi_dim(self):
-    """Tests create_loss with multi-dimensional logits of shape [2, 2, 5]."""
-    head1 = head_lib.regression_head(label_dimension=2, name='head1')
-    head2 = head_lib.regression_head(label_dimension=3, name='head2')
-    multi_head = multi_head_lib.multi_head([head1, head2])
-
-    logits = np.array(
-        [[[-1., 1., 2., -2., 2.], [-1., 1., 2., -2., 2.]],
-         [[-1.5, 1.5, -2., 2., -2.], [-1.5, 1.5, -2., 2., -2.]]],
-        dtype=np.float32)
-    labels = {
-        'head1': np.array([[[1., 0.], [1., 0.]],
-                           [[1.5, 1.5], [1.5, 1.5]]], dtype=np.float32),
-        'head2': np.array([[[0., 1., 0.], [0., 1., 0.]],
-                           [[2., 2., 0.], [2., 2., 0.]]], dtype=np.float32),
-    }
-    # Loss for the first head:
-    # loss1 = ((1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
-    #          (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2) / 8
-    #       = 3.5
-    # Loss for the second head:
-    # loss2 = ((0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
-    #          (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2) / 12
-    #       = 6.167
-    expected_training_loss = 3.5 + 6.167
-
-    training_loss = multi_head.create_loss(
-        features={},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)[0]
-    tol = 1e-3
-    with self.cached_session():
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
-
-  def test_train_one_head(self):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    multi_head = multi_head_lib.multi_head([head1])
-
-    logits = {'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)}
-    labels = {'head1': np.array([[1, 0], [1, 1]], dtype=np.int64)}
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
-    expected_loss = 8.75
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=3)])
-
-    spec = multi_head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    self.assertIsNotNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
-          train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          metric_keys.MetricKeys.LOSS + '/head1': expected_loss,
-      }, summary_str, tol)
-
-  def test_train_one_head_with_optimizer(self):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    multi_head = multi_head_lib.multi_head([head1])
-
-    logits = {'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)}
-    labels = {'head1': np.array([[1, 0], [1, 1]], dtype=np.int64)}
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
-    expected_loss = 8.75
-    expected_train_result = 'my_train_op'
-
-    class _Optimizer(object):
-
-      def minimize(self, loss, global_step):
-        del global_step
-        return string_ops.string_join(
-            [constant_op.constant(expected_train_result),
-             string_ops.as_string(loss, precision=3)])
-
-    spec = multi_head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        optimizer=_Optimizer())
-
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run((spec.loss, spec.train_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
-          train_result)
-
-  def _test_train_two_heads_with_weights(self, use_tpu):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
-    multi_head = multi_head_lib.multi_head(
-        [head1, head2], head_weights=[1., 2.])
-
-    logits = {
-        'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
-        'head2': np.array([[20., -20., 20.], [-30., 20., -20.]],
-                          dtype=np.float32),
-    }
-    labels = {
-        'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
-        'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
-    }
-    # For large logits, sigmoid cross entropy loss is approximated as:
-    # loss = labels * (logits < 0) * (-logits) +
-    #        (1 - labels) * (logits > 0) * logits =>
-    # head1: expected_unweighted_loss = [[10., 10.], [15., 0.]]
-    # loss = ( (10 + 10) / 2 + (15 + 0) / 2 ) / 2 = 8.75
-    # head2: expected_unweighted_loss = [[20., 20., 20.], [30., 0., 0]]
-    # loss = ( (20 + 20 + 20) / 3 + (30 + 0 + 0) / 3 ) / 2 = 15
-    # Average over classes, weighted sum over batch and heads.
-    expected_loss_head1 = 8.75
-    expected_loss_head2 = 15.0
-    expected_loss = 1. * expected_loss_head1 + 2. * expected_loss_head2
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=3)])
-
-    if use_tpu:
-      spec = multi_head._create_tpu_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=logits,
-          labels=labels,
-          train_op_fn=_train_op_fn).as_estimator_spec()
-    else:
-      spec = multi_head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=logits,
-          labels=labels,
-          train_op_fn=_train_op_fn)
-
-    self.assertIsNotNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-3
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
-          train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          metric_keys.MetricKeys.LOSS + '/head1': expected_loss_head1,
-          metric_keys.MetricKeys.LOSS + '/head2': expected_loss_head2,
-      }, summary_str, tol)
-
-  def test_train_two_heads_with_weights(self):
-    self._test_train_two_heads_with_weights(use_tpu=False)
-
-  def test_train_two_heads_with_weights_tpu(self):
-    self._test_train_two_heads_with_weights(use_tpu=True)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
index cda23aa437f954700b74dcb9294550eb9a8a8c5c..e6b0cf027ae39a465b4a108b3d384d6fb08250ed 100644
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,819 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities to replicate model_fn's over local GPUs.
+"""replicate_model_fn python module.
 
-This file contains util that allow to replicate `Estimator.model_fn` over
-GPUs.  Replicated version of a `model_fn` is returned that can subsequently
-be used with `Estimator`.
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
 """
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import defaultdict
-from contextlib import contextmanager
-import copy
+from tensorflow_estimator.contrib.estimator.python.estimator import replicate_model_fn
 
-import six
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+replicate_model_fn.__all__ = [
+    s for s in dir(replicate_model_fn) if not s.startswith('__')
+]
 
-from tensorflow.core.framework import node_def_pb2
-from tensorflow.python.client import device_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export_output as export_output_lib
-from tensorflow.python.framework import device as framework_device
-from tensorflow.python.framework import ops as ops_lib
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import device_setter as device_setter_lib
-from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.util import deprecation
-from tensorflow.python.util import function_utils
-
-
-@deprecation.deprecated(
-    '2018-05-31',
-    'Please use `tf.contrib.distribute.MirroredStrategy` instead.')
-def replicate_model_fn(model_fn,
-                       loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
-                       devices=None):
-  """Replicate `Estimator.model_fn` over GPUs.
-
-  The given `model_fn` specifies a single forward pass of a model.  To replicate
-  such a model over GPUs, each GPU gets its own instance of the forward pass
-  (a.k.a. a tower).  The input features and labels get sharded into the chunks
-  that correspond to the number of GPUs.  Each tower computes a loss based
-  on its input.  For each such loss, gradients are computed.  After that, the
-  available losses are aggregated to form aggregated loss.  Available
-  gradients are summed.  Then, they update weights using the specified
-  optimizer.
-
-  If `devices` are `None`, then all available GPUs are going to be used for
-  replication.  If no GPUs are available, then the model is going to be
-  placed on the CPU.
-
-  Two modes of local replication over available GPUs are supported:
-    1)  If exactly 1 GPU is detected, then variables and operations are placed
-        onto the GPU.
-    2)  If more than 1 GPU is detected, then variables are going to be placed on
-        the CPU.  Replicas of operations are placed on each individual GPU.
-
-  Here is an example of how one might use their `model_fn` to run over GPUs:
-    ```python
-       ...
-       def model_fn(...):  # See `model_fn` in `Estimator`.
-         loss = ...
-         optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
-         optimizer = tf.contrib.estimator.TowerOptimizer(optimizer)
-         if mode == tf.estimator.ModeKeys.TRAIN:
-           #  See the section below on `EstimatorSpec.train_op`.
-           return EstimatorSpec(mode=mode, loss=loss,
-                                train_op=optimizer.minimize(loss))
-
-         #  No change for `ModeKeys.EVAL` or `ModeKeys.PREDICT`.
-         return EstimatorSpec(...)
-       ...
-       classifier = tf.estimator.Estimator(
-         model_fn=tf.contrib.estimator.replicate_model_fn(model_fn))
-    ```
-
-  Please see `DNNClassifierIntegrationTest` for an example with a canned
-  Estimator.
-
-  On `EstimatorSpec.train_op`:
-  `model_fn` returns `EstimatorSpec.train_op` for
-  `tf.estimator.GraphKeys.TRAIN`. It is typically derived using an optimizer.
-  Towers are expected to populate it in the same way.  Gradients from all towers
-  are reduced and applied in the last tower.  To achieve that in the case of
-  multiple towers, `TowerOptimizer` needs to be used.  See `TowerOptimizer`.
-
-  On sharding input features and labels:
-  Input features and labels are split for consumption by each tower. They are
-  split across the dimension 0.  Features and labels need to be batch major.
-
-  On reduction algorithms:
-  Certain algorithms were chosen for aggregating results of computations on
-  multiple towers:
-    - Losses from all towers are reduced according to `loss_reduction`.
-    - Gradients from all towers are reduced according to `loss_reduction`
-      for each trainable variable.
-    - `eval_metrics_ops` are reduced per metric using `reduce_mean`.
-    - `EstimatorSpec.predictions` and `EstimatorSpec.export_outputs` are
-      reduced using concatenation.
-    - For all other fields of `EstimatorSpec` the values of the first tower
-      are taken.
-
-  On distribution of variables:
-  Variables are not duplicated between towers.  Instead, they are placed on a
-  single device as defined above and shared across towers.
-
-  On overhead:
-  If only one device is specified, then aggregation of loss and gradients
-  doesn't happen. Replication consists of placing `model_fn` onto the
-  specified device.
-
-  On current limitations:
-    - `predictions` are not supported for `ModeKeys.EVAL`.  They are required
-       for `tf.contrib.estimator.add_metrics`.
-
-  Args:
-    model_fn: `model_fn` as defined in `Estimator`.  See the section above about
-      the train_op argument of `EstimatorSpec`.
-    loss_reduction: controls whether losses are summed or averaged.
-    devices: Optional list of devices to replicate the model across.  This
-      argument can be used to replicate only on the subset of available GPUs.
-      If `None`, then all available GPUs are going to be used for replication.
-      If no GPUs are available, then the model is going to be placed on the CPU.
-
-  Raises:
-    ValueError: if there is no `loss_reduction` or if TowerOptimizer is
-      mis-used.
-
-  Returns:
-    A replicated version of the supplied `model_fn`. Returned function that
-      conforms to the requirements of `Estimator`'s `model_fn` and can be used
-      instead of the supplied `model_fn`.
-  """
-  return _replicate_model_fn_with_mode(
-      model_fn,
-      loss_reduction,
-      devices,
-      # TODO(isaprykin): Query the system configuration to choose modes other
-      # than `SHARED_LOCAL_PARAMETER_SERVER`, even though it is often
-      # appropriate.
-      mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER)
-
-
-class _VariableDistributionMode(object):
-  """Modes for variable distribution used for forcing a particular one.
-
-  Forcing a mode is meant for performance experimentation purposes rather than
-  for general use cases.
-  """
-
-  SHARED_LOCAL_PARAMETER_SERVER = 1
-  """Variables are placed on a single device and shared across all devices.
-
-  Two ways to achieve this distribution over available GPUs are supported:
-    1)  If exactly 1 GPU is detected, then variables and operations are placed
-        onto GPU.
-    2)  If more than 1 GPU is detected, then variables are going to be placed on
-        the CPU.  Replicas of operations are placed on each individual GPU.
-  """
-
-  SHARED_ROUND_ROBIN = 2
-  """Variables are placed on all devices in a round-robin fashion.
-
-  Every subsequent variable is placed on the next device.  There is only one
-  copy of each variable that is shared across all devices.
-  """
-
-
-def _replicate_model_fn_with_mode(
-    model_fn,
-    loss_reduction,
-    devices=None,
-    mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER):
-  """A version of `replicate_model_fn` that allows to specify a `mode`."""
-  if loss_reduction == losses.Reduction.NONE:
-    raise ValueError('Tower losses need to be reduced in some way, yet {} '
-                     'reduction is specified.'.format(loss_reduction))
-  if not devices:
-    devices = _get_local_devices('GPU') or _get_local_devices('CPU')
-
-  is_a_single_gpu_case = len(devices) == 1 and 'GPU' in devices[0].upper()
-  consolidation_device = devices[0] if is_a_single_gpu_case else '/CPU:0'
-
-  ps_devices = [consolidation_device]
-  if mode == _VariableDistributionMode.SHARED_ROUND_ROBIN:
-    ps_devices = devices
-
-  tf_logging.info('Replicating the `model_fn` across {}.  Variables are going '
-                  'to be placed on {}.  Consolidation device is going to be {}.'
-                  .format(devices, ps_devices, consolidation_device))
-
-  def single_device_model_fn(features, labels, mode, params=None, config=None):
-    """`model_fn` on a single device without reduction overhead."""
-    return _get_loss_towers(
-        model_fn=model_fn,
-        mode=mode,
-        features=[features],
-        labels=[labels],
-        params=params,
-        loss_reduction=loss_reduction,
-        config=config,
-        devices=devices,
-        local_ps_devices=ps_devices)[0]  # One device, so one spec is out.
-
-  def replicated_model_fn(features, labels, mode, params=None, config=None):
-    """Replicated version of `model_fn` to be used instead."""
-    feature_shards, label_shards = _split_batch(
-        features, labels, len(devices), device=consolidation_device)
-    tower_specs = _get_loss_towers(
-        model_fn=model_fn,
-        mode=mode,
-        features=feature_shards,
-        labels=label_shards,
-        params=params,
-        loss_reduction=loss_reduction,
-        config=config,
-        devices=devices,
-        local_ps_devices=ps_devices)
-
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      train_op = _minimize_towers(tower_specs)
-      return _train_spec(
-          tower_specs, train_op, aggregation_device=consolidation_device)
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      return _eval_spec(tower_specs, aggregation_device=consolidation_device)
-    elif mode == model_fn_lib.ModeKeys.PREDICT:
-      return _predict_spec(tower_specs, aggregation_device=consolidation_device)
-
-  if len(devices) == 1:
-    return single_device_model_fn
-  else:
-    return replicated_model_fn
-
-
-class TowerOptimizer(optimizer_lib.Optimizer):
-  """Gathers gradients from all towers and reduces them in the last one."""
-
-  COLLECTION_FOR_GRAPH_STATES = 'replicate_model_fn_graph_states'
-
-  @deprecation.deprecated(
-      '2018-05-31',
-      'Please use `tf.contrib.distribute.MirroredStrategy` instead.')
-  def __init__(self, optimizer_or_optimizer_fn):
-    """Wrap an existing optimizer for gathering gradients across towers.
-
-    Each invocation of model_fn has to call the same optimizers in the same
-    order.
-
-    Multiple optimizers that use the same or different losses are supported.
-
-    If TowerOptimizer is used but `replicate_model_fn` isn't, then no
-    aggregation will happen.  All calls will simply be forwarded to the
-    underlying optimizer. The behavior is similar if there is only one tower.
-
-    If TowerOptimizer is used together with SyncReplicasOptimizer that wraps
-    the user's optimizer, then it's the SyncReplicasOptimizer that needs to be
-    wrapped with TowerOptimizer.
-
-    Args:
-      optimizer_or_optimizer_fn: an instance of optimizer to wrap.  That
-        instance is going to be used for optimizer-specific logic.  This can
-        also be a no-argument function that returns such an optimizer instance.
-    """
-    self._optimizer_or_optimizer_fn = optimizer_or_optimizer_fn
-
-  @staticmethod
-  def has_been_used():
-    return TowerOptimizer._graph_state().has_tower_optimizer_been_used
-
-  def get_slot(self, *args, **kwargs):
-    return self._get_optimizer().get_slot(*args, **kwargs)
-
-  def get_slot_names(self, *args, **kwargs):
-    return self._get_optimizer().get_slot_names(*args, **kwargs)
-
-  def get_name(self, *args, **kwargs):
-    return self._get_optimizer().get_name(*args, **kwargs)
-
-  def variables(self, *args, **kwargs):
-    return self._get_optimizer().variables(*args, **kwargs)
-
-  def compute_gradients(self, loss, *args, **kwargs):
-    """Compute gradients, but first, if needed, scale the loss."""
-    loss = _scale_loss(loss,
-                       self._graph_state().loss_reduction,
-                       self._graph_state().number_of_towers)
-    return self._get_optimizer().compute_gradients(loss, *args, **kwargs)
-
-  def apply_gradients(self, grads_and_vars, global_step=None, **kwargs):
-    """Collect gradients updates to apply them with the last tower."""
-    if self._graph_state().number_of_towers == 1:
-      # Avoid the overhead of reduction if there's only one tower.
-      #
-      # There assumed to be only one tower if aggregation-related methods were
-      # not called by `_get_loss_towers`, for example if the model_fn uses
-      # TowerEstimator, but `replicate_model_fn` isn't used.
-      return self._get_optimizer().apply_gradients(grads_and_vars, global_step,
-                                                   **kwargs)
-
-    self._graph_state().collect_gradients(grads_and_vars)
-
-    if not self._graph_state().is_the_last_tower:
-      with ops_lib.control_dependencies(_extract_tensors(grads_and_vars)):
-        return self._construct_no_op_train_op()
-    else:
-      # Gradients need to be gathered and applied in the scope of the first
-      # tower, so that the tensors are accessible via names without prefixes.
-      var_scope, name_scope = self._graph_state().scopes_of_the_first_tower
-      with variable_scope.variable_scope(var_scope):
-        with ops_lib.name_scope(name_scope):
-          return self._apply_gathered_gradients(global_step, **kwargs)
-
-  def _apply_gathered_gradients(self, global_step, **kwargs):
-    graph_state = self._graph_state()
-    optimizer = self._get_optimizer()
-
-    grad_lists = {}
-    for grad, var in graph_state.get_latest_gradients_from_all_towers():
-      if grad is not None:
-        grad_lists.setdefault(var, []).append(grad)
-
-    aggregated_grads = []
-    with ops_lib.name_scope('gradient_aggregating'):
-      for var, grads in six.iteritems(grad_lists):
-        grad = _compute_sum_on_device(grads, var.device)
-        aggregated_grads.append((grad, var))
-    return optimizer.apply_gradients(
-        aggregated_grads, global_step=global_step, **kwargs)
-
-  def _get_optimizer(self):
-    if callable(self._optimizer_or_optimizer_fn):
-      # If optimizer is given as a function then we need to wait till we are
-      # under the right graph context before constructing it.  That's why the
-      # optimizer is constructed in _get_optimizer() rather than __init__().
-      self._optimizer_or_optimizer_fn = self._optimizer_or_optimizer_fn()
-    self._graph_state().has_tower_optimizer_been_used = True
-    return self._optimizer_or_optimizer_fn
-
-  def _construct_no_op_train_op(self):
-    return control_flow_ops.no_op(name='train_op_placeholder')
-
-  @staticmethod
-  def _graph_state():
-    graph_states = ops_lib.get_default_graph().get_collection_ref(
-        TowerOptimizer.COLLECTION_FOR_GRAPH_STATES)
-    if not graph_states:
-      graph_states.append(TowerOptimizer._PerGraphState())
-    return graph_states[-1]
-
-  @staticmethod
-  def _did_towers_have_same_optimizer_calls():
-    graph_state = TowerOptimizer._graph_state()
-    return graph_state.did_towers_have_same_optimizer_calls()
-
-  @staticmethod
-  def _clear_graph_state():
-    # Clearing the Graph collection will prevent _PerGraphState from being
-    # serialized.
-    ops_lib.get_default_graph().clear_collection(
-        TowerOptimizer.COLLECTION_FOR_GRAPH_STATES)
-
-  class _PerGraphState(object):
-    """Gradient reduction related state of a Tensorflow graph."""
-
-    def __init__(self):
-      self._collected_grads_and_vars = defaultdict(list)
-      self._current_tower_index = 0
-      self._number_of_towers = 1
-      self._loss_reduction = None
-      # Scopes of the first tower that don't have a prefix:
-      self._variable_scope = None
-      self._name_scope = None
-      # If needed, alert that TowerOptimizer needs to be used with model_fn.
-      self._has_tower_optimizer_been_used = False
-
-    def collect_gradients(self, grads_and_vars):
-      self._collected_grads_and_vars[self._current_tower_index].append(
-          grads_and_vars)
-
-    def get_latest_gradients_from_all_towers(self):
-      """Get gradients across towers for the last called optimizer."""
-      grads_and_vars = []
-      index_of_last_gradients = len(
-          self._collected_grads_and_vars[self._current_tower_index]) - 1
-      for tower_id in range(self._current_tower_index + 1):
-        grads_and_vars.extend(
-            self._collected_grads_and_vars[tower_id][index_of_last_gradients])
-      return grads_and_vars
-
-    def set_reduction_across_towers(self, loss_reduction, number_of_towers):
-      self._loss_reduction = loss_reduction
-      self._number_of_towers = number_of_towers
-
-    @contextmanager
-    def tower(self, tower_id, var_scope, name_scope):
-      if tower_id == 0:
-        self._variable_scope = var_scope
-        self._name_scope = name_scope
-      self._current_tower_index = tower_id
-      yield
-
-    @property
-    def scopes_of_the_first_tower(self):
-      return self._variable_scope, self._name_scope
-
-    @property
-    def is_the_last_tower(self):
-      return self._current_tower_index == (self._number_of_towers - 1)
-
-    @property
-    def number_of_towers(self):
-      return self._number_of_towers
-
-    @property
-    def loss_reduction(self):
-      return self._loss_reduction
-
-    @property
-    def has_tower_optimizer_been_used(self):
-      return self._has_tower_optimizer_been_used
-
-    @has_tower_optimizer_been_used.setter
-    def has_tower_optimizer_been_used(self, value):
-      self._has_tower_optimizer_been_used = value
-
-    def did_towers_have_same_optimizer_calls(self):
-      total_number_of_grads = sum([
-          len(grads)
-          for _, grads in six.iteritems(self._collected_grads_and_vars)
-      ])
-      return total_number_of_grads % self._number_of_towers == 0
-
-
-def _get_local_devices(device_type):
-  local_device_protos = device_lib.list_local_devices()
-  return [
-      device.name
-      for device in local_device_protos
-      if device.device_type == device_type
-  ]
-
-
-def _split_batch(features, labels, number_of_shards, device):
-  """Split input features and labels into batches."""
-
-  def ensure_divisible_by_shards(sequence):
-    batch_size = ops_lib.convert_to_tensor(sequence).get_shape()[0]
-    if batch_size % number_of_shards != 0:
-      raise ValueError(
-          'Batch size {} needs to be divisible by the number of GPUs, which '
-          'is {}.'.format(batch_size, number_of_shards))
-
-  def split_dictionary(dictionary):
-    """Split a dictionary into shards."""
-    shards = [{} for _ in range(number_of_shards)]
-    for name, tensor in six.iteritems(dictionary):
-      if isinstance(tensor, sparse_tensor.SparseTensor):
-        for i, shard in enumerate(
-            sparse_ops.sparse_split(
-                sp_input=tensor, num_split=number_of_shards, axis=0)):
-          shards[i][name] = shard
-      else:
-        ensure_divisible_by_shards(tensor)
-        for i, shard in enumerate(array_ops.split(tensor, number_of_shards)):
-          shards[i][name] = shard
-    return shards
-
-  with ops_lib.name_scope('split_inputs'):
-    with ops_lib.device(device):
-      if isinstance(features, dict):
-        feature_shards = split_dictionary(features)
-      else:
-        ensure_divisible_by_shards(features)
-        feature_shards = array_ops.split(features, number_of_shards)
-
-      if labels is None:
-        label_shards = None
-      elif isinstance(labels, dict):
-        label_shards = split_dictionary(labels)
-      else:
-        ensure_divisible_by_shards(labels)
-        label_shards = array_ops.split(labels, number_of_shards)
-  return feature_shards, label_shards
-
-
-_DEFAULT_NAME_SCOPE_PATTERN = 'tower_{}'
-
-
-def _get_loss_towers(model_fn,
-                     mode,
-                     features,
-                     labels,
-                     params,
-                     config,
-                     devices,
-                     local_ps_devices,
-                     loss_reduction,
-                     name_scope_pattern=_DEFAULT_NAME_SCOPE_PATTERN):
-  """Replicate the loss computation across devices."""
-  tower_specs = []
-
-  model_fn_args = function_utils.fn_args(model_fn)
-  optional_params = {}
-  if 'params' in model_fn_args:
-    optional_params['params'] = copy.deepcopy(params)
-  if 'config' in model_fn_args:
-    optional_params['config'] = copy.deepcopy(config)
-
-  # pylint: disable=protected-access
-  round_robin_strategy = device_setter_lib._RoundRobinStrategy(
-      num_tasks=len(local_ps_devices))
-  TowerOptimizer._graph_state().set_reduction_across_towers(
-      loss_reduction, len(devices))
-
-  for i, device in enumerate(devices):
-    is_the_first_tower = (i == 0)
-
-    device_setter = _local_device_setter(
-        worker_device=device,
-        ps_devices=local_ps_devices,
-        ps_strategy=round_robin_strategy)
-
-    # We would like to preserve the names of the variables and ops that the user
-    # might be relying on. Names without a prefix are going to resolve to
-    # variables and ops of the first tower.
-    name_scope = name_scope_pattern
-    if is_the_first_tower:
-      name_scope = ''
-
-    with variable_scope.variable_scope(
-        '', reuse=not is_the_first_tower) as var_scope:
-      with ops_lib.name_scope(name_scope.format(i)) as name_scope:
-        with TowerOptimizer._graph_state().tower(
-            tower_id=i, var_scope=var_scope, name_scope=name_scope):
-          with ops_lib.device(device_setter):
-            labels_shard = None
-            if labels:
-              labels_shard = labels[i]
-
-            tower_spec = model_fn(
-                mode=mode,
-                features=features[i],
-                labels=labels_shard,
-                **optional_params)
-
-            if (tower_spec.train_op is not None and len(devices) > 1 and
-                not TowerOptimizer.has_been_used()):
-              raise ValueError('Please wrap optimizers with TowerOptimizer'
-                               ' in order to use replicate_model_fn with'
-                               ' multiple `devices`.')
-
-            # Scaling the loss here doesn't actually affect gradients.  Another
-            # instance of scaling happens inside the TowerOptimizer.
-            tower_spec = _scale_tower_loss(
-                tower_spec, loss_reduction, number_of_towers=len(devices))
-            tower_specs.append(tower_spec)
-
-  if not TowerOptimizer._did_towers_have_same_optimizer_calls():
-    raise ValueError('Each invocation of model_fn was supposed to make the same'
-                     ' optimizer calls.')
-  TowerOptimizer._clear_graph_state()
-  # pylint: enable=protected-access
-  return tower_specs
-
-
-def _local_device_setter(worker_device, ps_devices, ps_strategy):
-  """A device setter that puts distributes Var/Ops to PS/workers."""
-  ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
-
-  def local_device_chooser(op):
-    current_device = framework_device.DeviceSpec.from_string(op.device or '')
-
-    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
-    if node_def.op in ps_ops:
-      ps_device_spec = framework_device.DeviceSpec.from_string(
-          '{}'.format(ps_devices[ps_strategy(op)]))
-
-      ps_device_spec.merge_from(current_device)
-      return ps_device_spec.to_string()
-    else:
-      worker_device_spec = framework_device.DeviceSpec.from_string(
-          worker_device or '')
-      worker_device_spec.merge_from(current_device)
-      return worker_device_spec.to_string()
-
-  return local_device_chooser
-
-
-def _scale_tower_loss(tower_spec, loss_reduction, number_of_towers):
-  """Produce an EstimatorSpec with appropriately scaled loss."""
-  if tower_spec.loss is None:
-    return tower_spec
-
-  estimator_spec = _asdict(tower_spec)
-  estimator_spec['loss'] = _scale_loss(tower_spec.loss, loss_reduction,
-                                       number_of_towers)
-  return model_fn_lib.EstimatorSpec(**estimator_spec)
-
-
-def _scale_loss(loss, loss_reduction, number_of_towers):
-  """If needed, scale down the loss for averaging loss by summing."""
-  if loss is None:
-    return None
-  if number_of_towers == 1:
-    return loss
-
-  if loss_reduction != losses.Reduction.SUM:
-    return math_ops.div(loss, 1.0 * number_of_towers, name='averaged_loss')
-  else:
-    return loss
-
-
-def _minimize_towers(tower_specs):
-  """`train_op` of the last tower applies aggregated gradients."""
-  return tower_specs[-1].train_op
-
-
-def _compute_sum_on_device(values, device, name=None):
-  with ops_lib.device(device):
-    if isinstance(values[0], ops_lib.IndexedSlices):
-      if name:
-        raise ValueError('The name {} is not expected to be given to '
-                         'IndexedSlices {}'.format(name, values))
-
-      values_concat = array_ops.concat([v.values for v in values], axis=0)
-      indices_concat = array_ops.concat([v.indices for v in values], axis=0)
-      return ops_lib.IndexedSlices(values_concat, indices_concat,
-                                   values[0].dense_shape)
-    else:
-      return math_ops.add_n(values, name=name)
-
-
-def _train_spec(tower_specs,
-                train_op,
-                aggregation_device,
-                aggregated_loss_name='loss'):
-  """Populate replicated EstimatorSpec for `GraphKeys.TRAIN`."""
-  # Spec of the last tower is used as the template for the final spec, because
-  # some `EstimatorSpec.training_hooks` rely on calls made in model_fn.  For
-  # example, `SyncReplicasOptimizerHook` validates the
-  # `SyncReplicasOptimizer.apply_gradients` call. `TowerEstimator` makes that
-  # call only in the last tower.
-  estimator_spec = _asdict(tower_specs[-1])
-  estimator_spec['mode'] = model_fn_lib.ModeKeys.TRAIN
-  estimator_spec['train_op'] = train_op
-  estimator_spec['loss'] = _compute_sum_on_device(
-      [spec.loss for spec in tower_specs], aggregation_device,
-      aggregated_loss_name)
-  return model_fn_lib.EstimatorSpec(**estimator_spec)
-
-
-def _eval_spec(tower_specs, aggregation_device, aggregated_loss_name='loss'):
-  """Populate replicated EstimatorSpec for `GraphKeys.EVAL`."""
-  estimator_spec = _asdict(tower_specs[0])
-  estimator_spec['mode'] = model_fn_lib.ModeKeys.EVAL
-  estimator_spec['loss'] = _compute_sum_on_device(
-      [spec.loss for spec in tower_specs], aggregation_device,
-      aggregated_loss_name)
-
-  update_ops = []
-  for tower_spec in tower_specs:
-    for name, (_, update_op) in six.iteritems(tower_spec.eval_metric_ops):
-      update_ops.append(update_op)
-
-  with ops_lib.control_dependencies(update_ops):
-    reduced_update_op = _reduce_metric_variables(len(tower_specs))
-
-  eval_metric_ops = {}
-  for name, (metric_tensor, _) in six.iteritems(tower_specs[0].eval_metric_ops):
-    eval_metric_ops[name] = (metric_tensor, reduced_update_op)
-  estimator_spec['eval_metric_ops'] = eval_metric_ops
-  return model_fn_lib.EstimatorSpec(**estimator_spec)
-
-
-def _reduce_metric_variables(number_of_towers):
-  """Aggregate local variables used in metrics into the first tower."""
-  if number_of_towers == 1:
-    return control_flow_ops.no_op(name='no_eval_metric_reduction')
-
-  metric_variables = ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)
-  variables_per_tower = len(metric_variables) // number_of_towers
-
-  if len(metric_variables) % number_of_towers != 0:
-    raise ValueError(
-        'Different `EstimatorSpec.eval_metric_ops` across `model_fn()` calls.'
-        ' Expected {} local variables, but got {} instead.'.format(
-            variables_per_tower * number_of_towers, len(metric_variables)))
-
-  # `metric_variables` has the size of `variables_per_tower` x
-  #  number_of_towers.  Each tower is produced by calling the same model_fn.
-  #  First `variables_per_tower` correspond to the first tower.  Each such
-  #  variable has an replica at the `(variables_per_tower * i)` position, where
-  #  `i` is `[1.. number_of_towers]`.  We are going to add values from replicas
-  #  to each variable of the first tower.  We then zero out replica values, so
-  #  that `_reduce_metric_variables` operation is idempotent.  If a metric
-  #  is then computed based on local variables from the first tower, then the
-  #  resulting metric is an estimate for all `number_of_towers` towers.
-  ops = []
-  for i in range(0, variables_per_tower):
-    next_replica_id = i + variables_per_tower
-    replicas = [
-        metric_variables[replica_id]
-        for replica_id in range(next_replica_id, len(metric_variables),
-                                variables_per_tower)
-    ]  #  `replicas` doesn't contain the first-tower variable.
-
-    reduce_op = state_ops.assign_add(metric_variables[i],
-                                     math_ops.add_n(replicas))
-
-    with ops_lib.control_dependencies([reduce_op]):
-      for replica in replicas:
-        zeros_for_replica = array_ops.zeros(
-            array_ops.shape(replica), dtype=replica.dtype)
-        zero_out_replica_op = state_ops.assign(replica, zeros_for_replica)
-        ops.append(zero_out_replica_op)
-
-  return control_flow_ops.group(*ops)
-
-
-def _predict_spec(tower_specs, aggregation_device):
-  """Populate replicated EstimatorSpec for `GraphKeys.PREDICT`."""
-  estimator_spec = _asdict(tower_specs[0])
-  estimator_spec['mode'] = model_fn_lib.ModeKeys.PREDICT
-
-  with ops_lib.device(aggregation_device):
-    estimator_spec['predictions'] = _concat_tensor_dicts(
-        *[tower_spec.predictions for tower_spec in tower_specs])
-
-    export_outputs_dict = _dict_concat(
-        *[tower_spec.export_outputs for tower_spec in tower_specs])
-
-    export_outputs = {}
-    for name, export_output_list in six.iteritems(export_outputs_dict):
-      if isinstance(export_output_list[0], export_output_lib.PredictOutput):
-        export_outputs[name] = export_output_lib.PredictOutput(
-            outputs=_concat_tensor_dicts(*[
-                export_output.outputs for export_output in export_output_list
-            ]))
-      elif isinstance(export_output_list[0],
-                      export_output_lib.RegressionOutput):
-        export_outputs[name] = export_output_lib.RegressionOutput(
-            value=array_ops.concat(
-                [export_output.value for export_output in export_output_list],
-                axis=0))
-      elif isinstance(export_output_list[0],
-                      export_output_lib.ClassificationOutput):
-        scores = None
-        if export_output_list[0].scores is not None:
-          scores = array_ops.concat(
-              [export_output.scores for export_output in export_output_list],
-              axis=0)
-
-        classes = None
-        if export_output_list[0].classes is not None:
-          classes = array_ops.stack(
-              [export_output.classes for export_output in export_output_list],
-              axis=0)
-
-        export_outputs[name] = export_output_lib.ClassificationOutput(
-            scores=scores, classes=classes)
-
-  estimator_spec['export_outputs'] = export_outputs
-  return model_fn_lib.EstimatorSpec(**estimator_spec)
-
-
-def _concat_tensor_dicts(*tensor_dicts):
-  return {
-      name: array_ops.concat(tensors, axis=0, name=name)
-      for name, tensors in six.iteritems(_dict_concat(*tensor_dicts))
-  }
-
-
-def _extract_tensors(tensors_and_vars):
-  tensors = []
-  for tensor_and_var in tensors_and_vars:
-    tensor, _ = tensor_and_var
-    if isinstance(tensor, ops_lib.IndexedSlices):
-      tensors.append(tensor.values)
-    elif tensor is not None:
-      tensors.append(tensor)
-  return tensors
-
-
-def _dict_concat(*dicts):
-  list_dict = {}
-  for d in dicts:
-    if d is None:
-      continue
-
-    for k, v in six.iteritems(d):
-      list_dict.setdefault(k, []).append(v)
-  return list_dict
-
-
-def _asdict(namedtuple):
-  """Returns a namedtuple as a dictionary.
-
-  This is required because `_asdict()` in Python 3.x.x is broken in classes
-  that inherit from `collections.namedtuple`. See
-  https://bugs.python.org/issue24931 for more details.
-
-  Args:
-    namedtuple: An object that inherits from `collections.namedtuple`.
-
-  Returns:
-    A dictionary version of the tuple.
-  """
-  return {k: getattr(namedtuple, k) for k in namedtuple._fields}
+from tensorflow_estimator.contrib.estimator.python.estimator.replicate_model_fn import *
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
deleted file mode 100644
index 65229d67bbca4513d792b5c37717eedfe27424f1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
+++ /dev/null
@@ -1,1649 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for utilities that replicate `Estimator.model_fn` over GPUs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import re
-import shutil
-import tempfile
-from absl.testing import parameterized
-import numpy as np
-import six
-
-from tensorflow.contrib.estimator.python.estimator import replicate_model_fn
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.canned import dnn
-from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops as ops_lib
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import losses
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import adam
-from tensorflow.python.training import device_setter
-from tensorflow.python.training import gradient_descent
-from tensorflow.python.training import training
-
-
-class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase,
-                                   parameterized.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  @parameterized.named_parameters(
-      ('PublicInterface', None),
-      ('ParameterServerMode', replicate_model_fn._VariableDistributionMode.
-       SHARED_LOCAL_PARAMETER_SERVER),
-      ('RoundRobinMode',
-       replicate_model_fn._VariableDistributionMode.SHARED_ROUND_ROBIN))
-  def test_complete_flow_with_mode(self, mode):
-    n_classes = 3
-    input_dimension = 2
-    batch_size = 12
-
-    data = np.linspace(
-        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
-    x_data = data.reshape(batch_size, input_dimension)
-    categorical_data = np.random.random_integers(
-        0, len(x_data), size=len(x_data))
-    y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data,
-           'categories': categorical_data},
-        y=y_data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data,
-           'categories': categorical_data},
-        y=y_data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data,
-           'categories': categorical_data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,)),
-        feature_column.embedding_column(
-            feature_column.categorical_column_with_vocabulary_list(
-                'categories',
-                vocabulary_list=np.linspace(
-                    0., len(x_data), len(x_data), dtype=np.int64)), 1)
-    ]
-
-    def optimizer_fn():
-      return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)
-
-    estimator = dnn.DNNClassifier(
-        hidden_units=(2, 2),
-        # Adagrad is configured with `get_optimizer_instance`, so the function
-        # form of `TowerOptimizer.__init__` is used.
-        optimizer=replicate_model_fn.TowerOptimizer(optimizer_fn),
-        feature_columns=feature_columns,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    if not mode:  # Use the public `replicate_model_fn`.
-      model_fn = replicate_model_fn.replicate_model_fn(
-          estimator.model_fn, devices=['/gpu:0', '/gpu:1', '/gpu:2'])
-    else:
-      model_fn = replicate_model_fn._replicate_model_fn_with_mode(
-          estimator.model_fn,
-          devices=['/gpu:0', '/gpu:1', '/gpu:2'],
-          loss_reduction=losses.Reduction.SUM,
-          mode=mode)
-
-    estimator = estimator_lib.Estimator(
-        model_fn=model_fn,
-        model_dir=estimator.model_dir,
-        config=estimator.config,
-        params=estimator.params)
-
-    num_steps = 10
-    estimator.train(train_input_fn, steps=num_steps)
-
-    scores = estimator.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops_lib.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    predicted_proba = np.array([
-        x[prediction_keys.PredictionKeys.PROBABILITIES]
-        for x in estimator.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
-
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
-                                             serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-    # Nothing should be left in the graph so that it doesn't get serialized.
-    self.assertFalse(ops_lib.get_default_graph().get_collection_ref(
-        replicate_model_fn.TowerOptimizer.COLLECTION_FOR_GRAPH_STATES))
-
-  def _as_label(self, data_in_float):
-    return np.rint(data_in_float).astype(np.int64)
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-
-class ReplicateModelTest(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = replicate_model_fn.TowerOptimizer(
-        gradient_descent.GradientDescentOptimizer(params['learning_rate']))
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(loss))
-
-  @property
-  def params(self):
-    params = {}
-    params['learning_rate'] = 1.0
-    return params
-
-  def test_train(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn,
-          loss_reduction=losses.Reduction.SUM,
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # derivative of loss = (1*c - 1) + (2*c - 2) is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(7.0, session.run(c))
-
-  def test_train_with_mean_reduction(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.cached_session() as session:
-      # Add another trainable variable that doesn't produce a gradient to
-      # verify that None gradients are supported.
-      _ = variable_scope.get_variable(
-          'another_variable',
-          initializer=constant_op.constant(1, dtype=dtypes.float64),
-          dtype=dtypes.float64)
-
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, losses.Reduction.MEAN, devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = ((1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)) / 2.0
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # derivative of loss = (1*c - 1)/2 + (2*c - 2)/2 is 1.5.
-      # It's the same computation as without mean reduction, but the
-      # loss from every tower is scaled by 1/<number of towers>.
-      # new value of c = 10 - learning rate * 1.5 = 8.5
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(8.5, session.run(c))
-
-  def test_train_two_steps_collected_gradients_are_reset_between_steps(self):
-    with ops_lib.Graph().as_default():
-      features = array_ops.placeholder(dtypes.float64)
-      labels = array_ops.placeholder(dtypes.float64)
-
-      feature_inputs = np.array([[1.0], [2.0]]), np.array([[1.5], [2.5]])
-      label_inputs = np.array([[1.0], [2.0]]), np.array([[1.5], [2.5]])
-
-      # loss = feature * c - label
-      expected_losses = ((1.0 * 10 - 1.0) + (2.0 * 10 - 2.0),
-                         (1.5 * 7.0 - 1.5) + (2.5 * 7.0 - 2.5))
-      # Derivative of the loss is 1.0 + 2.0 for the first step and 1.5 + 2.5
-      # for the second.
-      expected_c = 10.0 - 3.0, 7.0 - 4.0
-
-      with self.cached_session() as session, variable_scope.variable_scope(
-          '', reuse=variable_scope.AUTO_REUSE):
-        replicated_model_fn = replicate_model_fn.replicate_model_fn(
-            self.model_fn,
-            loss_reduction=losses.Reduction.SUM,
-            devices=['/gpu:0', '/gpu:1'])
-        estimator_spec = replicated_model_fn(
-            features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-        session.run(variables.global_variables_initializer())
-
-        for feature_input, label_input, loss, weight in zip(
-            feature_inputs, label_inputs, expected_losses, expected_c):
-          feeds = {features: feature_input, labels: label_input}
-
-          self.assertEqual(loss, session.run(estimator_spec.loss, feeds))
-
-          session.run(estimator_spec.train_op, feeds)
-          c = variable_scope.get_variable('c', dtype=dtypes.float64)
-          self.assertEqual(weight, session.run(c, feeds))
-
-  def test_eval(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn,
-          loss_reduction=losses.Reduction.SUM,
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
-      session.run(variables.local_variables_initializer())
-      session.run(variables.global_variables_initializer())
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      session.run([a, b])
-      accuracy = session.run(accuracy)
-      auc = session.run(auc)
-
-      # loss[i] = features[i] * 10 - labels[i].
-      # Accuracy is 0.0 (no match) in the first tower.
-      # Accuracy is 1.0 (match) in the second tower, since the feature
-      # times weight "c" happened to be equal to the label.
-      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02))
-
-      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
-
-  def test_eval_with_mean_reduction(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, losses.Reduction.MEAN, devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
-      session.run(variables.local_variables_initializer())
-      session.run(variables.global_variables_initializer())
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      session.run([a, b])
-      accuracy = session.run(accuracy)
-      auc = session.run(auc)
-
-      # loss[i] = features[i] * 10 - labels[i].
-      # Accuracy is 0.0 (no match) in the first tower.
-      # Accuracy is 1.0 (match) in the second tower, since the feature
-      # times weight "c" happened to be equal to the label.
-      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02)) / 2.0
-
-      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
-
-  def test_predict(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.PREDICT, self.params)
-      session.run(variables.global_variables_initializer())
-
-      self.assertAllClose({
-          'probabilities': np.array([[0.1], [0.02]])
-      }, session.run(estimator_spec.predictions))
-
-  def test_train_single_tower(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, devices=['/gpu:0'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # loss' of c is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(7.0, session.run(c))
-
-  def test_eval_single_tower(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, devices=['/gpu:0'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
-      session.run(variables.local_variables_initializer())
-      session.run(variables.global_variables_initializer())
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      session.run([a, b])
-      accuracy = session.run(accuracy)
-      auc = session.run(auc)
-
-      # Accuracy is 0.0 (no match) in the first tower.
-      # Accuracy is 1.0 (match) in the second tower, since the feature
-      # times weight "c" happened to be equal to the label.
-      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02))
-
-      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
-
-  def test_predict_single_tower(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, devices=['/gpu:0'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.PREDICT, self.params)
-      session.run(variables.global_variables_initializer())
-
-      self.assertAllClose({
-          'probabilities': np.array([[0.1], [0.02]])
-      }, session.run(estimator_spec.predictions))
-
-  def test_batch_size_that_is_not_divisible_by_the_number_of_gpus(self):
-    features = np.array([[1.0], [2.0], [3.0]])
-    labels = np.array([[1.0], [2.0], [3.0]])
-
-    with self.assertRaisesRegexp(
-        ValueError, '.*Batch.+size.+needs.+to.+be.+divisible.+by.+GPUs.+'):
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, devices=['/gpu:0', '/gpu:1'])
-      _ = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-
-  def test_unsupported_loss_reduction(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 '.+none.+reduction.+is.+specified.+'):
-      _ = replicate_model_fn.replicate_model_fn(self.model_fn,
-                                                losses.Reduction.NONE)
-
-  def test_places_on_gpu_with_upper_case_spelling(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.cached_session():
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, devices=['/GPU:0'])
-      _ = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:0', c.device)
-
-  def test_places_on_gpu_with_lower_case_spelling(self):
-    features = np.array([[0.01], [0.002]])
-    labels = np.array([[0.01], [0.02]])
-
-    with self.cached_session():
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, devices=['/gpu:0'])
-      _ = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:0', c.device)
-
-
-class ReplicateAcrossASingleDeviceWithoutTowerOptimizer(
-    test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = gradient_descent.GradientDescentOptimizer(
-        params['learning_rate'])
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(loss))
-
-  @property
-  def params(self):
-    params = {}
-    params['learning_rate'] = 1.0
-    return params
-
-  def test_train_single_tower(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn, devices=['/gpu:0'])
-      estimator_spec = replicated_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # loss' of c is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(7.0, session.run(c))
-
-
-class MakeSureSyncReplicasOptimizerWorks(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    features = features['features']
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = gradient_descent.GradientDescentOptimizer(
-        params['learning_rate'])
-    optimizer = training.SyncReplicasOptimizer(
-        optimizer, replicas_to_aggregate=1)
-    sync_hook = optimizer.make_session_run_hook(True)
-    optimizer = replicate_model_fn.TowerOptimizer(optimizer)
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        training_hooks=[sync_hook],
-        predictions={'probabilities': predictions},
-        train_op=optimizer.minimize(
-            loss, global_step=training.get_global_step()))
-
-  @property
-  def params(self):
-    params = {}
-    params['learning_rate'] = 1.0
-    return params
-
-  def test_train_multiple_towers(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'features': features}, y=labels, batch_size=2, shuffle=False)
-
-    model_fn = replicate_model_fn.replicate_model_fn(
-        self.model_fn,
-        loss_reduction=losses.Reduction.SUM,
-        devices=['/gpu:0', '/gpu:1'])
-
-    estimator = estimator_lib.Estimator(
-        model_fn=model_fn, model_dir=tempfile.mkdtemp(), params=self.params)
-    estimator.train(train_input_fn, steps=1)
-
-    self.assertEqual(7.0, estimator.get_variable_value('c'))
-
-
-class ReplicateWithTwoOptimizersTest(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    side_effects = variable_scope.get_variable(
-        'side_effects',
-        initializer=constant_op.constant(0, dtype=dtypes.float64),
-        dtype=dtypes.float64,
-        trainable=False)
-
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    first_optimizer = replicate_model_fn.TowerOptimizer(
-        gradient_descent.GradientDescentOptimizer(1.0))
-    second_optimizer = replicate_model_fn.TowerOptimizer(
-        adam.AdamOptimizer(1.0))
-
-    with ops_lib.control_dependencies([side_effects.assign_add(1.0)]):
-      first_grads_and_vars = first_optimizer.compute_gradients(loss)
-
-    train_op = control_flow_ops.group(
-        [first_optimizer.apply_gradients(first_grads_and_vars),
-         second_optimizer.minimize(loss)])
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=train_op)
-
-  def test_train(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn,
-          loss_reduction=losses.Reduction.SUM,
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(features, labels,
-                                           model_fn_lib.ModeKeys.TRAIN, {})
-      session.run(variables.global_variables_initializer())
-
-      # loss = feature * c - label
-      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      # loss' of c is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      # Adam subtracts another ~1.
-      session.run(estimator_spec.train_op)
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertNear(6.0, session.run(c), 0.000001)
-
-        side_effects = variable_scope.get_variable(
-            'side_effects', dtype=dtypes.float64)
-        self.assertNear(2.0, session.run(side_effects), 0.000001)
-
-
-class ReplicateWithTwoLossesAndOneOptimizer(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._should_skip_optimizer = False
-    self._towers_left_before_skipping_optimizer = -1
-
-  def incorrectly_skip_optimizer_for_tower(self, tower_number):
-    self._should_skip_optimizer = True
-    self._towers_left_before_skipping_optimizer = tower_number
-
-  def should_skip_optimizer(self):
-    if not self._should_skip_optimizer:
-      return False
-    if self._towers_left_before_skipping_optimizer == 0:
-      return True
-    else:
-      self._towers_left_before_skipping_optimizer -= 1
-      return False
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-    d = variable_scope.get_variable(
-        'd',
-        initializer=constant_op.constant(2, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    another_predictions = math_ops.multiply(features, d)
-    another_loss = losses.absolute_difference(
-        labels=labels,
-        predictions=another_predictions,
-        reduction=losses.Reduction.SUM)
-    another_loss = math_ops.reduce_sum(another_loss)
-
-    total_loss = math_ops.add(loss, another_loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    train_ops = []
-
-    optimizer = replicate_model_fn.TowerOptimizer(
-        gradient_descent.GradientDescentOptimizer(1.0))
-    train_ops.append(optimizer.minimize(loss, var_list=[c]))
-    if not self.should_skip_optimizer():
-      another_optimizer = replicate_model_fn.TowerOptimizer(
-          gradient_descent.GradientDescentOptimizer(1.0))
-      train_ops.append(another_optimizer.minimize(another_loss, var_list=[d]))
-
-    train_op = control_flow_ops.group(train_ops)
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=total_loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=train_op)
-
-  def test_train(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.cached_session() as session:
-      replicated_model_fn = replicate_model_fn.replicate_model_fn(
-          self.model_fn,
-          loss_reduction=losses.Reduction.SUM,
-          devices=['/gpu:0', '/gpu:1'])
-      estimator_spec = replicated_model_fn(features, labels,
-                                           model_fn_lib.ModeKeys.TRAIN, {})
-      session.run(variables.global_variables_initializer())
-
-      # For each tower, loss = (feature * c - label) + (feature * d - label).
-      total_loss = (1.0 * 10 - 1.0 + 1.0 * 2.0 - 1.0) + (
-          2.0 * 10 - 2.0 + 2.0 * 2.0 - 2.0)
-      self.assertEqual(total_loss, session.run(estimator_spec.loss))
-
-      session.run(estimator_spec.train_op)
-
-      # loss' of c or loss' of d is 3.
-      # new value of c = 10 - learning rate * 3 = 7.0.
-      # new value of d = 2  - learning rate * 3 = -1.0.
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertNear(7.0, session.run(c), 0.000001)
-        d = variable_scope.get_variable('d', dtype=dtypes.float64)
-        self.assertNear(-1.0, session.run(d), 0.000001)
-
-  def test_different_optimizer_calls_within_towers(self):
-    self.incorrectly_skip_optimizer_for_tower(1)
-
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.cached_session(), ops_lib.Graph().as_default():
-      with self.assertRaisesRegexp(
-          ValueError, '.+was.+supposed.+to.+make.+same.+optimizer.+calls.+'):
-        replicated_model_fn = replicate_model_fn.replicate_model_fn(
-            self.model_fn, devices=['/gpu:0', '/gpu:1'])
-        _ = replicated_model_fn(features, labels, model_fn_lib.ModeKeys.TRAIN,
-                                {})
-
-
-class FailToWrapOptimizerInTheModelFn(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.multiply(features, c)
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-    loss = math_ops.reduce_sum(loss)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-
-    optimizer = gradient_descent.GradientDescentOptimizer(1.0)
-    train_op = optimizer.minimize(loss)
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=loss,
-        eval_metric_ops=metrics,
-        predictions={'probabilities': predictions},
-        train_op=train_op)
-
-  def test_train(self):
-    features = np.array([[1.0], [2.0]])
-    labels = np.array([[1.0], [2.0]])
-
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError,
-                                   'Please.+wrap.+with.+TowerOptimizer'):
-        replicated_model_fn = replicate_model_fn.replicate_model_fn(
-            self.model_fn, devices=['/gpu:0', '/gpu:1'])
-        _ = replicated_model_fn(features, labels, model_fn_lib.ModeKeys.TRAIN,
-                                {})
-
-
-class GetLossTowersTest(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(0.25, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.add(np.array([0.1, 0.2, 0.3, features[0]]), c)
-    labels = np.array([0.1, 0.2, 0.3, labels[0]])
-
-    loss = losses.absolute_difference(
-        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
-
-    return model_fn_lib.EstimatorSpec(mode=mode, loss=math_ops.reduce_sum(loss))
-
-  def test_gradients_are_computed(self):
-    with self.cached_session() as session:
-      tower_specs = replicate_model_fn._get_loss_towers(
-          self.model_fn,
-          mode=None,
-          features=[[0.6], [1.6]],
-          labels=[[0.6], [0.6]],
-          params=None,
-          config=None,
-          loss_reduction=losses.Reduction.SUM,
-          devices=['/gpu:0', '/gpu:1'],
-          local_ps_devices=['/gpu:0'],
-          name_scope_pattern='test_tower_{}')
-      session.run(variables.global_variables_initializer())
-
-      self.assertEqual(len(tower_specs), 2)
-
-      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
-      self.assertEqual('Sum:0', tower_specs[0].loss.name)
-      self.assertEqual(1.0, session.run(tower_specs[0].loss))
-
-      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
-      self.assertEqual('test_tower_1/Sum:0', tower_specs[1].loss.name)
-      # The input batch for the second tower had a loss that is 1.0
-      # bigger: 0.6 vs 1.6.
-      self.assertEqual(2.0, session.run(tower_specs[1].loss))
-
-      self.assertEqual(1, len(variables.global_variables()))
-      self.assertEqual(1, len(variables.trainable_variables()))
-
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(0.25, session.run(c))
-
-  def test_gradients_are_computed_with_mean_reduction(self):
-    with self.cached_session() as session:
-      tower_specs = replicate_model_fn._get_loss_towers(
-          self.model_fn,
-          mode=model_fn_lib.ModeKeys.EVAL,
-          features=[[0.6], [1.6]],
-          labels=[[0.6], [0.6]],
-          params=None,
-          loss_reduction=losses.Reduction.MEAN,
-          config=None,
-          devices=['/gpu:0', '/gpu:1'],
-          local_ps_devices=['/gpu:0'],
-          name_scope_pattern='test_tower_{}')
-      session.run(variables.global_variables_initializer())
-
-      self.assertEqual(len(tower_specs), 2)
-
-      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
-      self.assertEqual('averaged_loss:0', tower_specs[0].loss.name)
-      self.assertEqual(0.5, session.run(tower_specs[0].loss))
-
-      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
-      self.assertEqual('test_tower_1/averaged_loss:0', tower_specs[1].loss.name)
-      # The input batch for the second tower had a loss that is 1.0
-      # bigger: 0.6 vs 1.6.
-      self.assertEqual(1.0, session.run(tower_specs[1].loss))
-
-      self.assertEqual(1, len(variables.global_variables()))
-      self.assertEqual(1, len(variables.trainable_variables()))
-
-      with variable_scope.variable_scope('', reuse=True):
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual(0.25, session.run(c))
-
-  def test_variables_are_round_robined_correctly(self):
-    """Test that creates multiple variables and tests round-robin placement."""
-
-    def model_fn(mode, features, labels, params):
-      del params
-      for variable_name in ['a', 'b', 'c', 'd']:
-        c = variable_scope.get_variable(
-            variable_name,
-            initializer=constant_op.constant(0.25, dtype=dtypes.float64),
-            dtype=dtypes.float64)
-
-      predictions = math_ops.add(np.array([0.1, 0.2, 0.3, features[0]]), c)
-      labels = np.array([0.1, 0.2, 0.3, labels[0]])
-      loss = losses.absolute_difference(
-          labels=labels,
-          predictions=predictions,
-          reduction=losses.Reduction.SUM)
-      return model_fn_lib.EstimatorSpec(
-          mode=mode, loss=math_ops.reduce_sum(loss))
-
-    with self.cached_session() as session:
-      tower_specs = replicate_model_fn._get_loss_towers(
-          model_fn,
-          mode=None,
-          features=[[0.6], [1.6], [2.6]],
-          labels=[[0.6], [0.6], [2.6]],
-          params=None,
-          loss_reduction=losses.Reduction.SUM,
-          config=None,
-          devices=['/gpu:0', '/gpu:1', '/gpu:3'],
-          local_ps_devices=['/gpu:0', '/gpu:1', '/gpu:3'],
-          name_scope_pattern='test_tower_{}')
-      session.run(variables.global_variables_initializer())
-
-      self.assertEqual(len(tower_specs), 3)
-      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
-      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
-      self.assertEqual('/device:GPU:3', tower_specs[2].loss.device)
-
-      with variable_scope.variable_scope('', reuse=True):
-        a = variable_scope.get_variable('a', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:0', a.device)
-        b = variable_scope.get_variable('b', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:1', b.device)
-        c = variable_scope.get_variable('c', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:3', c.device)
-        d = variable_scope.get_variable('d', dtype=dtypes.float64)
-        self.assertEqual('/device:GPU:0', d.device)
-
-
-class SplitBatchTest(test_util.TensorFlowTestCase):
-
-  def evaluate_shards(self, first_list, second_list):
-    evaluate_items = lambda x: x.eval()
-    return list(map(evaluate_items, first_list)), list(
-        map(evaluate_items, second_list))
-
-  def assertSparseValuesEqual(self, a, b):
-    self.assertAllEqual(a.indices, b.indices)
-    self.assertAllEqual(a.values, b.values)
-    self.assertAllEqual(a.dense_shape, b.dense_shape)
-
-  def test_simple_half_split(self):
-    with self.cached_session():
-      features = [0.0, 1.0, 2.0, 3.0]
-      labels = [10.0, 11.0, 12.0, 13.0]
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      feature_shards, label_shards = self.evaluate_shards(
-          feature_shards, label_shards)
-
-      self.assertAllEqual([[0.0, 1.0], [2.0, 3.0]], feature_shards)
-      self.assertAllEqual([[10.0, 11.0], [12.0, 13.0]], label_shards)
-
-  def test_to_each_their_own(self):
-    with self.cached_session():
-      features = [0.0, 1.0, 2.0, 3.0]
-      labels = [10.0, 11.0, 12.0, 13.0]
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 4, device='/gpu:0')
-
-      feature_shards, label_shards = self.evaluate_shards(
-          feature_shards, label_shards)
-
-      self.assertAllEqual([[0.0], [1.0], [2.0], [3.0]], feature_shards)
-      self.assertAllEqual([[10.0], [11.0], [12.0], [13.0]], label_shards)
-
-  def test_one_batch(self):
-    with self.cached_session():
-      features = [0.0, 1.0, 2.0, 3.0]
-      labels = [10.0, 11.0, 12.0, 13.0]
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 1, device='/gpu:0')
-
-      feature_shards, label_shards = self.evaluate_shards(
-          feature_shards, label_shards)
-
-      self.assertAllEqual([[0.0, 1.0, 2.0, 3.0]], feature_shards)
-      self.assertAllEqual([[10.0, 11.0, 12.0, 13.0]], label_shards)
-
-  def test_half_split_in_dictionary(self):
-    with self.cached_session():
-      features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
-      labels = [10.0, 11.0, 12.0, 13.0]
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      self.assertAllEqual([0.0, 1.0], feature_shards[0]['first'].eval())
-      self.assertAllEqual([4.0, 5.0], feature_shards[0]['second'].eval())
-      self.assertAllEqual([2.0, 3.0], feature_shards[1]['first'].eval())
-      self.assertAllEqual([6.0, 7.0], feature_shards[1]['second'].eval())
-      self.assertAllEqual([10.0, 11.0], label_shards[0].eval())
-      self.assertAllEqual([12.0, 13.0], label_shards[1].eval())
-
-  def test_sparse_tensor_can_be_split_unevenly(self):
-    with self.cached_session():
-      features = {
-          'x':
-              sparse_tensor.SparseTensor(
-                  indices=[[0, 0], [1, 2], [2, 2]],
-                  values=[1.0, 2.0, 3.0],
-                  dense_shape=[3, 4])
-      }
-      labels = np.array([[1.0], [2.0]])
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      self.assertSparseValuesEqual(
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [1, 2]], values=[1., 2.], dense_shape=[2, 4]),
-          feature_shards[0]['x'].eval())
-      self.assertSparseValuesEqual(
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 2]], values=[3.], dense_shape=[1, 4]),
-          feature_shards[1]['x'].eval())
-      self.assertAllEqual([[1.0]], label_shards[0].eval())
-      self.assertAllEqual([[2.0]], label_shards[1].eval())
-
-  def test_sparse_tensor_can_be_split_unevenly_repeated_row(self):
-    with self.cached_session():
-      features = {
-          'x':
-              sparse_tensor.SparseTensor(
-                  indices=[[0, 0], [1, 0], [1, 1]],
-                  values=[1.0, 2.0, 3.0],
-                  dense_shape=[3, 4])
-      }
-      labels = np.array([[1.0], [2.0]])
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      self.assertSparseValuesEqual(
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [1, 0], [1, 1]],
-              values=[1., 2., 3.],
-              dense_shape=[2, 4]), feature_shards[0]['x'].eval())
-
-      second_batch = feature_shards[1]['x'].eval()
-      self.assertFalse(len(second_batch.indices))
-      self.assertFalse(len(second_batch.values))
-      self.assertAllEqual([1, 4], second_batch.dense_shape)
-      self.assertAllEqual([[1.0]], label_shards[0].eval())
-      self.assertAllEqual([[2.0]], label_shards[1].eval())
-
-  def test_one_batch_in_dictionary(self):
-    with self.cached_session() as session:  # pylint: disable=unused-variable
-      features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
-      labels = [10.0, 11.0, 12.0, 13.0]
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 1, device='/gpu:0')
-
-      self.assertAllEqual([0.0, 1.0, 2.0, 3.0],
-                          feature_shards[0]['first'].eval())
-      self.assertAllEqual([4.0, 5.0, 6.0, 7.0],
-                          feature_shards[0]['second'].eval())
-      self.assertAllEqual([10.0, 11.0, 12.0, 13.0], label_shards[0].eval())
-
-  def test_feature_and_label_dictionaries(self):
-    with self.cached_session() as session:  # pylint: disable=unused-variable
-      features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
-      labels = {'first': [10.0, 11.0], 'second': [12.0, 13.0]}
-
-      feature_shards, label_shards = replicate_model_fn._split_batch(
-          features, labels, 2, device='/gpu:0')
-
-      self.assertAllEqual([0.0, 1.0], feature_shards[0]['first'].eval())
-      self.assertAllEqual([4.0, 5.0], feature_shards[0]['second'].eval())
-      self.assertAllEqual([2.0, 3.0], feature_shards[1]['first'].eval())
-      self.assertAllEqual([6.0, 7.0], feature_shards[1]['second'].eval())
-      self.assertAllEqual([10.0], label_shards[0]['first'].eval())
-      self.assertAllEqual([12.0], label_shards[0]['second'].eval())
-      self.assertAllEqual([11], label_shards[1]['first'].eval())
-      self.assertAllEqual([13.0], label_shards[1]['second'].eval())
-
-
-class TrainSpecTest(test_util.TensorFlowTestCase):
-
-  expected_predictions = {}
-
-  def create_estimator_spec(self, loss):
-    return model_fn_lib.EstimatorSpec(
-        mode=model_fn_lib.ModeKeys.TRAIN,
-        loss=loss,
-        train_op=loss,  # Not used; currently required.
-        predictions=self.expected_predictions)
-
-  def create_constant_loss(self, loss_value):
-    return constant_op.constant(loss_value, dtype=dtypes.float64)
-
-  def test_example(self):
-    with self.cached_session() as session:
-      tower_losses = list(map(self.create_constant_loss, [2, 4, 6]))
-      tower_specs = list(map(self.create_estimator_spec, tower_losses))
-
-      expected_train_op = tower_losses[1]
-
-      estimator_spec = replicate_model_fn._train_spec(
-          tower_specs, expected_train_op, aggregation_device='/gpu:0')
-
-      self.assertEqual(expected_train_op, estimator_spec.train_op)
-      self.assertEqual(2 + 4 + 6, session.run(estimator_spec.loss))
-      self.assertEqual(self.expected_predictions, estimator_spec.predictions)
-
-
-class EvalSpecTest(test_util.TensorFlowTestCase):
-
-  def create_estimator_spec(self, loss, metrics):
-    return model_fn_lib.EstimatorSpec(
-        mode=model_fn_lib.ModeKeys.EVAL, loss=loss, eval_metric_ops=metrics)
-
-  def create_constant_loss(self, loss_value):
-    return constant_op.constant(loss_value, dtype=dtypes.float64)
-
-  def create_eval_metrics(self, noise):
-    predictions = np.array([0.1, 0.2, 0.3, 0.6 + noise])
-    labels = np.array([0.1, 0.2, 0.3, 0.6])
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions),
-        'auc': metrics_lib.auc(labels, predictions)
-    }
-    return metrics
-
-  def test_example(self):
-    with self.cached_session() as session:
-      tower_losses = map(self.create_constant_loss, [2, 4, 6])
-      tower_metrics = map(self.create_eval_metrics, [0, 0.2, 0.3])
-      tower_specs = [
-          self.create_estimator_spec(l, m)
-          for l, m in zip(tower_losses, tower_metrics)
-      ]
-      session.run(variables.local_variables_initializer())
-
-      estimator_spec = replicate_model_fn._eval_spec(
-          tower_specs, aggregation_device='/device:GPU:0')
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      self.assertEqual('/device:CPU:0', accuracy.device)
-      self.assertEqual('/device:CPU:0', auc.device)
-
-      session.run([a, b])
-      accuracy, auc = session.run([accuracy, auc])
-
-      self.assertNear((12 - 2) / 12, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertEqual(2 + 4 + 6, session.run(estimator_spec.loss))
-
-  def test_handles_single_tower(self):
-    with self.cached_session() as session:
-      tower_losses = map(self.create_constant_loss, [5])
-      tower_metrics = map(self.create_eval_metrics, [0.2])
-      tower_specs = [
-          self.create_estimator_spec(l, m)
-          for l, m in zip(tower_losses, tower_metrics)
-      ]
-      session.run(variables.local_variables_initializer())
-
-      estimator_spec = replicate_model_fn._eval_spec(
-          tower_specs, aggregation_device='/device:GPU:0')
-
-      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
-      auc, b = estimator_spec.eval_metric_ops['auc']
-
-      self.assertEqual('/device:CPU:0', accuracy.device)
-      self.assertEqual('/device:CPU:0', auc.device)
-
-      session.run([a, b])
-      accuracy = session.run(accuracy)
-      auc = session.run(auc)
-
-      self.assertNear((4 - 1) / 4, accuracy, 0.01)
-      self.assertEqual(0, auc)
-      self.assertEqual(5, session.run(estimator_spec.loss))
-
-
-class PredictSpecTest(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(0.25, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = math_ops.add(np.array([features[0], features[0]]), c)
-
-    return model_fn_lib.EstimatorSpec(
-        mode=model_fn_lib.ModeKeys.PREDICT,
-        predictions={
-            'probabilities': predictions
-        })
-
-  def test_example(self):
-    with self.cached_session() as session:
-      tower_specs = replicate_model_fn._get_loss_towers(
-          self.model_fn,
-          mode=None,
-          features=[[0.1], [0.2]],
-          loss_reduction=losses.Reduction.SUM,
-          labels=[[], []],
-          params=None,
-          config=None,
-          devices=['/gpu:0', '/gpu:1'],
-          local_ps_devices=['/gpu:0'],
-      )
-      session.run(variables.global_variables_initializer())
-
-      estimator_spec = replicate_model_fn._predict_spec(
-          tower_specs, aggregation_device='/gpu:0')
-
-      self.assertEqual('/device:GPU:0',
-                       estimator_spec.predictions['probabilities'].device)
-      self.assertAllClose({
-          'probabilities': np.array([0.35, 0.35, 0.45, 0.45])
-      }, session.run(estimator_spec.predictions))
-
-
-class ReduceMetricVariablesTest(test_util.TensorFlowTestCase):
-
-  def create_metric_variable(self, initial_value, name):
-    return variable_scope.variable(
-        initial_value,
-        trainable=False,
-        collections=[ops_lib.GraphKeys.METRIC_VARIABLES],
-        validate_shape=True,
-        name=name)
-
-  def create_tower_metrics(self, tower_id):
-    with variable_scope.variable_scope('', reuse=(tower_id != 0)):
-      self.create_metric_variable(1.3 * (tower_id + 1), 'total')
-      self.create_metric_variable(2.3 * (tower_id + 1), 'count')
-      self.create_metric_variable(
-          np.array([3.3, 3.5, 3.7]) * (tower_id + 1), 'total')
-
-  def test_example(self):
-    with self.cached_session() as session:
-      for tower_id in range(3):
-        self.create_tower_metrics(tower_id)
-
-      session.run(
-          variables.variables_initializer(
-              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
-
-      session.run(
-          replicate_model_fn._reduce_metric_variables(number_of_towers=3))
-
-      # 1st tower = 1.3, 2.3,  [3.3, 3.5, 3.7]
-      # 2nd tower = 2.6, 4.6,  [6.6, 7.0, 7.4]
-      # 3rd tower = 3.9, 6.9,  [9.9, 10.5, 11.1]
-      # Reduced =   7.8, 13.8, [19.8, 21.0, 22.2]
-      # Towers are accumulated in the first tower.
-      local_metrics = session.run(
-          ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES))
-
-      self.assertNear(7.8, local_metrics[0], 0.01)
-      self.assertNear(13.8, local_metrics[1], 0.01)
-      self.assertAllClose([19.8, 21., 22.1], local_metrics[2], 0.01)
-      self.assertNear(0.0, local_metrics[3], 0.01)
-      self.assertNear(0.0, local_metrics[4], 0.01)
-      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[5], 0.01)
-      self.assertNear(0.0, local_metrics[6], 0.01)
-      self.assertNear(0.0, local_metrics[7], 0.01)
-      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[8], 0.01)
-
-  def test_reduce_is_idempotent(self):
-    with self.cached_session() as session:
-      for tower_id in range(3):
-        self.create_tower_metrics(tower_id)
-
-      session.run(
-          variables.variables_initializer(
-              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
-
-      for _ in range(20):
-        session.run(
-            replicate_model_fn._reduce_metric_variables(number_of_towers=3))
-
-      local_metrics = session.run(
-          ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES))
-
-      self.assertNear(7.8, local_metrics[0], 0.01)
-      self.assertNear(13.8, local_metrics[1], 0.01)
-      self.assertAllClose([19.8, 21., 22.1], local_metrics[2], 0.01)
-      self.assertNear(0.0, local_metrics[3], 0.01)
-      self.assertNear(0.0, local_metrics[4], 0.01)
-      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[5], 0.01)
-      self.assertNear(0.0, local_metrics[6], 0.01)
-      self.assertNear(0.0, local_metrics[7], 0.01)
-      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[8], 0.01)
-
-  def test_handles_single_tower(self):
-    with self.cached_session() as session:
-      self.create_tower_metrics(0)
-      session.run(
-          variables.variables_initializer(
-              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
-
-      session.run(
-          replicate_model_fn._reduce_metric_variables(number_of_towers=1))
-
-      local_metrics = session.run(
-          ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES))
-
-      self.assertNear(1.3, local_metrics[0], 0.01)
-      self.assertNear(2.3, local_metrics[1], 0.01)
-      self.assertAllClose([3.3, 3.5, 3.7], local_metrics[2], 0.01)
-
-  def test_doesnt_accept_uneven_number_of_variables(self):
-    with self.cached_session() as session:
-      for tower_id in range(3):
-        self.create_tower_metrics(tower_id)
-      self.create_metric_variable(-1.0, 'oddball')
-
-      session.run(
-          variables.variables_initializer(
-              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
-
-      with self.assertRaisesRegexp(
-          ValueError, '.+Expected.+local.+variables.+but.+got.+instead.+'):
-        session.run(
-            replicate_model_fn._reduce_metric_variables(number_of_towers=3))
-
-
-class MergeExportOutputsTest(test_util.TensorFlowTestCase):
-
-  def model_fn(self, mode, features, labels, params):
-    c = variable_scope.get_variable(
-        'c',
-        initializer=constant_op.constant(10, dtype=dtypes.float64),
-        dtype=dtypes.float64)
-
-    predictions = {'probabilities': math_ops.multiply(features, c)}
-    loss = losses.absolute_difference(
-        labels=labels,
-        predictions=predictions['probabilities'],
-        reduction=losses.Reduction.SUM)
-
-    metrics = {
-        'accuracy': metrics_lib.accuracy(labels, predictions['probabilities']),
-        'auc': metrics_lib.auc(labels, predictions['probabilities'])
-    }
-    tensor_string_repr = str(features)
-    classes = constant_op.constant(
-        re.search('(split_inputs/split:[0-9])', tensor_string_repr).group(1),
-        dtype=dtypes.string)
-
-    export_outputs = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-            export_output.PredictOutput(predictions),
-        'classification_output':
-            export_output.ClassificationOutput(predictions['probabilities'],
-                                               classes),
-        'classification_scores':
-            export_output.ClassificationOutput(
-                scores=predictions['probabilities']),
-        'classification_classes':
-            export_output.ClassificationOutput(classes=classes),
-        'regression_output':
-            export_output.RegressionOutput(predictions['probabilities']),
-    }
-
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        loss=math_ops.reduce_sum(loss),
-        eval_metric_ops=metrics,
-        predictions=predictions,
-        export_outputs=export_outputs)
-
-  def replicate_estimator_spec(self, session):
-    features = np.array([0.01, 0.002])
-    labels = np.array([0.01, 0.02])
-
-    replicated_model_fn = replicate_model_fn.replicate_model_fn(
-        self.model_fn, devices=['/gpu:0', '/gpu:1'])
-    estimator_spec = replicated_model_fn(features, labels,
-                                         model_fn_lib.ModeKeys.PREDICT, {})
-    session.run(variables.global_variables_initializer())
-    return estimator_spec
-
-  def test_merge_predict_output(self):
-    with self.cached_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllClose(
-          {
-              'probabilities': np.array([0.1, 0.02])
-          },
-          session.run(estimator_spec.export_outputs[
-              signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs))
-
-  def test_merge_classification_output_scores_classes(self):
-    with self.cached_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllClose(
-          [0.1, 0.02],
-          session.run(
-              estimator_spec.export_outputs['classification_output'].scores))
-      self.assertAllEqual(
-          [b'split_inputs/split:0', b'split_inputs/split:1'],
-          session.run(
-              estimator_spec.export_outputs['classification_output'].classes))
-
-  def test_merge_classification_output_scores(self):
-    with self.cached_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllClose(
-          [0.1, 0.02],
-          session.run(
-              estimator_spec.export_outputs['classification_scores'].scores))
-      self.assertEqual(
-          None, estimator_spec.export_outputs['classification_scores'].classes)
-
-  def test_merge_classification_output_classes(self):
-    with self.cached_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllEqual(
-          [b'split_inputs/split:0', b'split_inputs/split:1'],
-          session.run(
-              estimator_spec.export_outputs['classification_classes'].classes))
-      self.assertEqual(
-          None, estimator_spec.export_outputs['classification_classes'].scores)
-
-  def test_merge_regression_output(self):
-    with self.cached_session() as session:
-      estimator_spec = self.replicate_estimator_spec(session)
-      self.assertAllClose(
-          [0.1, 0.02],
-          session.run(estimator_spec.export_outputs['regression_output'].value))
-
-
-class GetLocalDevicesTest(test_util.TensorFlowTestCase):
-
-  def test_there_is_at_least_a_cpu(self):
-    self.assertTrue(replicate_model_fn._get_local_devices('CPU'))
-
-  def test_there_is_no_xpu(self):
-    self.assertFalse(
-        replicate_model_fn._get_local_devices('XPU'))  # XPU doesn't exist.
-
-  def test_whether_there_is_a_gpu(self):
-    if test.is_gpu_available():
-      self.assertTrue(len(replicate_model_fn._get_local_devices('GPU')))
-
-
-class LocalDeviceSetterTest(test_util.TensorFlowTestCase):
-
-  def test_vars_are_on_ps_but_ops_are_on_workers(self):
-    ps_devices = ['/device:GPU:3']
-    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
-
-    local_device_setter = replicate_model_fn._local_device_setter(
-        ps_devices=ps_devices,
-        ps_strategy=round_robin,
-        worker_device='/device:GPU:2')
-
-    with ops_lib.device(local_device_setter):
-      a = variables.Variable(0.01)
-      self.assertEqual('/device:GPU:3', a.device)
-
-      b = variables.Variable(0.02)
-      self.assertEqual('/device:GPU:3', b.device)
-
-      c = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:3', c.device)
-
-      a_op = array_ops.concat(a, axis=0)
-      self.assertEqual('/device:GPU:2', a_op.device)
-
-      b_op = array_ops.concat(b, axis=0)
-      self.assertEqual('/device:GPU:2', b_op.device)
-
-  def test_round_robin_placement(self):
-    ps_devices = [
-        '/device:GPU:0', '/device:GPU:1', '/device:GPU:3', '/device:GPU:4'
-    ]
-    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
-
-    local_device_setter = replicate_model_fn._local_device_setter(
-        ps_devices=ps_devices,
-        ps_strategy=round_robin,
-        worker_device='/device:GPU:2')
-
-    with ops_lib.device(local_device_setter):
-      a = variables.Variable(0.01)
-      self.assertEqual('/device:GPU:0', a.device)
-
-      b = variables.Variable(0.02)
-      self.assertEqual('/device:GPU:1', b.device)
-
-      c = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:3', c.device)
-
-      a_op = array_ops.concat(a, axis=0)
-      self.assertEqual('/device:GPU:2', a_op.device)
-
-      b_op = array_ops.concat(b, axis=0)
-      self.assertEqual('/device:GPU:2', b_op.device)
-
-      c = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:4', c.device)
-
-      d = variables.Variable(0.03)
-      self.assertEqual('/device:GPU:0', d.device)
-
-      c_op = array_ops.concat(c, axis=0)
-      self.assertEqual('/device:GPU:2', c_op.device)
-
-
-class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
-
-  def test_vectors(self):
-    with self.cached_session() as session:
-      total = replicate_model_fn._compute_sum_on_device(
-          [1.0, 2.0, 3.0, 4.0], device='/device:GPU:0', name='test_sum')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertEqual('test_sum', total.op.name)
-      self.assertEqual(10.0, session.run(total))
-
-  def test_tensors(self):
-    with self.cached_session() as session:
-      total = replicate_model_fn._compute_sum_on_device(
-          [[1.0, 2.0], [3.0, 4.0]], device='/device:GPU:0', name='test_sum')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertEqual('test_sum', total.op.name)
-      self.assertAllEqual([4.0, 6.0], session.run(total))
-
-  def test_indexedslices(self):
-    with self.cached_session() as session:
-      a = ops_lib.IndexedSlices(
-          constant_op.constant([1.0, 2.0]), [0, 1],
-          dense_shape=constant_op.constant([2]))
-      b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
-
-      total = replicate_model_fn._compute_sum_on_device(
-          [a, b], device='/device:GPU:0')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertAllEqual([4.0, 6.0],
-                          session.run(ops_lib.convert_to_tensor(total)))
-
-  def test_indexedslices_higher_dimensions(self):
-    with self.cached_session() as session:
-      a = ops_lib.IndexedSlices(
-          constant_op.constant([[1.0, 5.0], [2.0, 6.0]]), [0, 1],
-          dense_shape=constant_op.constant([2, 4]))
-      b = ops_lib.IndexedSlices(
-          constant_op.constant([[3.0, 7.0], [4.0, 8.0]]), [0, 1])
-
-      total = replicate_model_fn._compute_sum_on_device(
-          [a, b], device='/device:GPU:0')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertAllEqual([[4.0, 12.0], [6.0, 14.0]],
-                          session.run(ops_lib.convert_to_tensor(total)))
-
-  def test_indexedslices_some_dont_overlap(self):
-    with self.cached_session() as session:
-      a = ops_lib.IndexedSlices(
-          constant_op.constant([1.0, 2.0]), [0, 3],
-          dense_shape=constant_op.constant([4]))
-      b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
-
-      total = replicate_model_fn._compute_sum_on_device(
-          [a, b], device='/device:GPU:0')
-
-      self.assertEqual('/device:GPU:0', total.device)
-      self.assertAllEqual([4.0, 4.0, 0.0, 2.0],
-                          session.run(ops_lib.convert_to_tensor(total)))
-
-  def test_no_name_for_indexslices(self):
-    a = ops_lib.IndexedSlices(
-        constant_op.constant([1.0, 2.0]), [0, 1],
-        dense_shape=constant_op.constant([2]))
-    b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
-
-    with self.assertRaisesRegexp(ValueError, '.+name.+not.+expected.+'):
-      _ = replicate_model_fn._compute_sum_on_device(
-          [a, b], device='/device:GPU:0', name='cant_name_indexslices')
-
-
-class ConcatTensorDictsTest(test_util.TensorFlowTestCase):
-
-  def test_example(self):
-    tensor_dicts = [
-        {
-            'a': np.array([1.0, 2.0]),
-            'b': np.array([11.0]),
-            'c': np.array([21.0]),
-        },
-        {
-            'a': np.array([3.0]),
-            'b': np.array([12.0, 13.0]),
-        },
-        {
-            'b': np.array([14.0]),
-        },
-    ]
-
-    with self.cached_session() as session:
-      self.assertAllClose({
-          'a': np.array([1.0, 2.0, 3.0]),
-          'b': np.array([11.0, 12.0, 13.0, 14.0]),
-          'c': np.array([21.0]),
-      }, session.run(replicate_model_fn._concat_tensor_dicts(*tensor_dicts)))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn.py b/tensorflow/contrib/estimator/python/estimator/rnn.py
index 98660bb7317ae76a7da7c90a5c890ab8e69037fe..c5ddd1b73590c9a2c48889303d823c427eb781f6 100644
--- a/tensorflow/contrib/estimator/python/estimator/rnn.py
+++ b/tensorflow/contrib/estimator/python/estimator/rnn.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,619 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Recurrent Neural Network estimators."""
+"""rnn python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-from tensorflow.contrib.estimator.python.estimator import extenders
-from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.layers import core as core_layers
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import rnn
-from tensorflow.python.ops import rnn_cell
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.summary import summary
-from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.training import training_util
-
-
-# The defaults are historical artifacts of the initial implementation, but seem
-# reasonable choices.
-_DEFAULT_LEARNING_RATE = 0.05
-_DEFAULT_CLIP_NORM = 5.0
-
-_CELL_TYPES = {'basic_rnn': rnn_cell.BasicRNNCell,
-               'lstm': rnn_cell.BasicLSTMCell,
-               'gru': rnn_cell.GRUCell}
-
-# Indicates no value was provided by the user to a kwarg.
-USE_DEFAULT = object()
-
-
-def _single_rnn_cell(num_units, cell_type):
-  cell_type = _CELL_TYPES.get(cell_type, cell_type)
-  if not cell_type or not issubclass(cell_type, rnn_cell.RNNCell):
-    raise ValueError('Supported cell types are {}; got {}'.format(
-        list(_CELL_TYPES.keys()), cell_type))
-  return cell_type(num_units=num_units)
-
-
-def _make_rnn_cell_fn(num_units, cell_type='basic_rnn'):
-  """Convenience function to create `rnn_cell_fn` for canned RNN Estimators.
-
-  Args:
-    num_units: Iterable of integer number of hidden units per RNN layer.
-    cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
-      the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
-      `'gru'`.
-
-  Returns:
-    A function that takes a single argument, an instance of
-    `tf.estimator.ModeKeys`, and returns an instance derived from
-    `tf.nn.rnn_cell.RNNCell`.
-
-  Raises:
-    ValueError: If cell_type is not supported.
-  """
-  def rnn_cell_fn(mode):
-    # Unused. Part of the rnn_cell_fn interface since user specified functions
-    # may need different behavior across modes (e.g. dropout).
-    del mode
-    cells = [_single_rnn_cell(n, cell_type) for n in num_units]
-    if len(cells) == 1:
-      return cells[0]
-    return rnn_cell.MultiRNNCell(cells)
-  return rnn_cell_fn
-
-
-def _concatenate_context_input(sequence_input, context_input):
-  """Replicates `context_input` across all timesteps of `sequence_input`.
-
-  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
-  This value is appended to `sequence_input` on dimension 2 and the result is
-  returned.
-
-  Args:
-    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
-      padded_length, d0]`.
-    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
-
-  Returns:
-    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
-    d0 + d1]`.
-
-  Raises:
-    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
-      not have rank 2.
-  """
-  seq_rank_check = check_ops.assert_rank(
-      sequence_input,
-      3,
-      message='sequence_input must have rank 3',
-      data=[array_ops.shape(sequence_input)])
-  seq_type_check = check_ops.assert_type(
-      sequence_input,
-      dtypes.float32,
-      message='sequence_input must have dtype float32; got {}.'.format(
-          sequence_input.dtype))
-  ctx_rank_check = check_ops.assert_rank(
-      context_input,
-      2,
-      message='context_input must have rank 2',
-      data=[array_ops.shape(context_input)])
-  ctx_type_check = check_ops.assert_type(
-      context_input,
-      dtypes.float32,
-      message='context_input must have dtype float32; got {}.'.format(
-          context_input.dtype))
-  with ops.control_dependencies(
-      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
-    padded_length = array_ops.shape(sequence_input)[1]
-    tiled_context_input = array_ops.tile(
-        array_ops.expand_dims(context_input, 1),
-        array_ops.concat([[1], [padded_length], [1]], 0))
-  return array_ops.concat([sequence_input, tiled_context_input], 2)
-
-
-def _select_last_activations(activations, sequence_lengths):
-  """Selects the nth set of activations for each n in `sequence_length`.
-
-  Returns a `Tensor` of shape `[batch_size, k]`. If `sequence_length` is not
-  `None`, then `output[i, :] = activations[i, sequence_length[i] - 1, :]`. If
-  `sequence_length` is `None`, then `output[i, :] = activations[i, -1, :]`.
-
-  Args:
-    activations: A `Tensor` with shape `[batch_size, padded_length, k]`.
-    sequence_lengths: A `Tensor` with shape `[batch_size]` or `None`.
-  Returns:
-    A `Tensor` of shape `[batch_size, k]`.
-  """
-  with ops.name_scope(
-      'select_last_activations', values=[activations, sequence_lengths]):
-    activations_shape = array_ops.shape(activations)
-    batch_size = activations_shape[0]
-    padded_length = activations_shape[1]
-    output_units = activations_shape[2]
-    if sequence_lengths is None:
-      sequence_lengths = padded_length
-    start_indices = math_ops.to_int64(
-        math_ops.range(batch_size) * padded_length)
-    last_indices = start_indices + sequence_lengths - 1
-    reshaped_activations = array_ops.reshape(
-        activations, [batch_size * padded_length, output_units])
-
-    last_activations = array_ops.gather(reshaped_activations, last_indices)
-    last_activations.set_shape([activations.shape[0], activations.shape[2]])
-    return last_activations
-
-
-def _rnn_logit_fn_builder(output_units, rnn_cell_fn, sequence_feature_columns,
-                          context_feature_columns, input_layer_partitioner):
-  """Function builder for a rnn logit_fn.
-
-  Args:
-    output_units: An int indicating the dimension of the logit layer.
-    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
-      returns an object of type `tf.nn.rnn_cell.RNNCell`.
-    sequence_feature_columns: An iterable containing the `FeatureColumn`s
-      that represent sequential input.
-    context_feature_columns: An iterable containing the `FeatureColumn`s
-      that represent contextual input.
-    input_layer_partitioner: Partitioner for input layer.
-
-  Returns:
-    A logit_fn (see below).
-
-  Raises:
-    ValueError: If output_units is not an int.
-  """
-  if not isinstance(output_units, int):
-    raise ValueError('output_units must be an int.  Given type: {}'.format(
-        type(output_units)))
-
-  def rnn_logit_fn(features, mode):
-    """Recurrent Neural Network logit_fn.
-
-    Args:
-      features: This is the first item returned from the `input_fn`
-                passed to `train`, `evaluate`, and `predict`. This should be a
-                single `Tensor` or `dict` of same.
-      mode: Optional. Specifies if this training, evaluation or prediction. See
-            `ModeKeys`.
-
-    Returns:
-      A `Tensor` representing the logits.
-    """
-    with variable_scope.variable_scope(
-        'sequence_input_layer',
-        values=tuple(six.itervalues(features)),
-        partitioner=input_layer_partitioner):
-      sequence_input, sequence_length = seq_fc.sequence_input_layer(
-          features=features, feature_columns=sequence_feature_columns)
-      summary.histogram('sequence_length', sequence_length)
-
-      if context_feature_columns:
-        context_input = feature_column_lib.input_layer(
-            features=features,
-            feature_columns=context_feature_columns)
-        sequence_input = _concatenate_context_input(sequence_input,
-                                                    context_input)
-
-    cell = rnn_cell_fn(mode)
-    # Ignore output state.
-    rnn_outputs, _ = rnn.dynamic_rnn(
-        cell=cell,
-        inputs=sequence_input,
-        sequence_length=sequence_length,
-        dtype=dtypes.float32,
-        time_major=False)
-    last_activations = _select_last_activations(rnn_outputs, sequence_length)
-
-    with variable_scope.variable_scope('logits', values=(rnn_outputs,)):
-      logits = core_layers.dense(
-          last_activations,
-          units=output_units,
-          activation=None,
-          kernel_initializer=init_ops.glorot_uniform_initializer())
-    return logits
-
-  return rnn_logit_fn
-
-
-def _rnn_model_fn(features,
-                  labels,
-                  mode,
-                  head,
-                  rnn_cell_fn,
-                  sequence_feature_columns,
-                  context_feature_columns,
-                  optimizer='Adagrad',
-                  input_layer_partitioner=None,
-                  config=None):
-  """Recurrent Neural Net model_fn.
-
-  Args:
-    features: dict of `Tensor` and `SparseTensor` objects returned from
-      `input_fn`.
-    labels: `Tensor` of shape [batch_size, 1] or [batch_size] with labels.
-    mode: Defines whether this is training, evaluation or prediction.
-      See `ModeKeys`.
-    head: A `head_lib._Head` instance.
-    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
-      returns an object of type `tf.nn.rnn_cell.RNNCell`.
-    sequence_feature_columns: Iterable containing `FeatureColumn`s that
-      represent sequential model inputs.
-    context_feature_columns: Iterable containing `FeatureColumn`s that
-      represent model inputs not associated with a specific timestep.
-    optimizer: String, `tf.Optimizer` object, or callable that creates the
-      optimizer to use for training. If not specified, will use the Adagrad
-      optimizer with a default learning rate of 0.05 and gradient clip norm of
-      5.0.
-    input_layer_partitioner: Partitioner for input layer. Defaults
-      to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-    config: `RunConfig` object to configure the runtime settings.
-
-  Returns:
-    An `EstimatorSpec` instance.
-
-  Raises:
-    ValueError: If mode or optimizer is invalid, or features has the wrong type.
-  """
-  if not isinstance(features, dict):
-    raise ValueError('features should be a dictionary of `Tensor`s. '
-                     'Given type: {}'.format(type(features)))
-
-  # If user does not provide an optimizer instance, use the optimizer specified
-  # by the string with default learning rate and gradient clipping.
-  if not isinstance(optimizer, optimizer_lib.Optimizer):
-    optimizer = optimizers.get_optimizer_instance(
-        optimizer, learning_rate=_DEFAULT_LEARNING_RATE)
-    optimizer = extenders.clip_gradients_by_norm(optimizer, _DEFAULT_CLIP_NORM)
-
-  num_ps_replicas = config.num_ps_replicas if config else 0
-  partitioner = partitioned_variables.min_max_variable_partitioner(
-      max_partitions=num_ps_replicas)
-  with variable_scope.variable_scope(
-      'rnn',
-      values=tuple(six.itervalues(features)),
-      partitioner=partitioner):
-    input_layer_partitioner = input_layer_partitioner or (
-        partitioned_variables.min_max_variable_partitioner(
-            max_partitions=num_ps_replicas,
-            min_slice_size=64 << 20))
-
-    logit_fn = _rnn_logit_fn_builder(
-        output_units=head.logits_dimension,
-        rnn_cell_fn=rnn_cell_fn,
-        sequence_feature_columns=sequence_feature_columns,
-        context_feature_columns=context_feature_columns,
-        input_layer_partitioner=input_layer_partitioner)
-    logits = logit_fn(features=features, mode=mode)
-
-    def _train_op_fn(loss):
-      """Returns the op to optimize the loss."""
-      return optimizer.minimize(
-          loss,
-          global_step=training_util.get_global_step())
-
-    return head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        train_op_fn=_train_op_fn,
-        logits=logits)
-
-
-def _assert_rnn_cell_fn(rnn_cell_fn, num_units, cell_type):
-  """Assert arguments are valid and return rnn_cell_fn."""
-  if rnn_cell_fn and (num_units or cell_type != USE_DEFAULT):
-    raise ValueError(
-        'num_units and cell_type must not be specified when using rnn_cell_fn'
-    )
-  if not rnn_cell_fn:
-    if cell_type == USE_DEFAULT:
-      cell_type = 'basic_rnn'
-    rnn_cell_fn = _make_rnn_cell_fn(num_units, cell_type)
-  return rnn_cell_fn
-
-
-class RNNClassifier(estimator.Estimator):
-  """A classifier for TensorFlow RNN models.
-
-  Trains a recurrent neural network model to classify instances into one of
-  multiple classes.
-
-  Example:
-
-  ```python
-  token_sequence = sequence_categorical_column_with_hash_bucket(...)
-  token_emb = embedding_column(categorical_column=token_sequence, ...)
-
-  estimator = RNNClassifier(
-      sequence_feature_columns=[token_emb],
-      num_units=[32, 16], cell_type='lstm')
-
-  # Input builders
-  def input_fn_train: # returns x, y
-    pass
-  estimator.train(input_fn=input_fn_train, steps=100)
-
-  def input_fn_eval: # returns x, y
-    pass
-  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
-  def input_fn_predict: # returns x, None
-    pass
-  predictions = estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-  otherwise there will be a `KeyError`:
-
-  * if `weight_column` is not `None`, a feature with
-    `key=weight_column` whose value is a `Tensor`.
-  * for each `column` in `sequence_feature_columns`:
-    - a feature with `key=column.name` whose `value` is a `SparseTensor`.
-  * for each `column` in `context_feature_columns`:
-    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
-      with `key` the id column name, the second with `key` the weight column
-      name. Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss is calculated by using softmax cross entropy.
-
-  @compatibility(eager)
-  Estimators are not compatible with eager execution.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               sequence_feature_columns,
-               context_feature_columns=None,
-               num_units=None,
-               cell_type=USE_DEFAULT,
-               rnn_cell_fn=None,
-               model_dir=None,
-               n_classes=2,
-               weight_column=None,
-               label_vocabulary=None,
-               optimizer='Adagrad',
-               loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
-               input_layer_partitioner=None,
-               config=None):
-    """Initializes a `RNNClassifier` instance.
-
-    Args:
-      sequence_feature_columns: An iterable containing the `FeatureColumn`s
-        that represent sequential input. All items in the set should either be
-        sequence columns (e.g. `sequence_numeric_column`) or constructed from
-        one (e.g. `embedding_column` with `sequence_categorical_column_*` as
-        input).
-      context_feature_columns: An iterable containing the `FeatureColumn`s
-        for contextual input. The data represented by these columns will be
-        replicated and given to the RNN at each timestep. These columns must be
-        instances of classes derived from `_DenseColumn` such as
-        `numeric_column`, not the sequential variants.
-      num_units: Iterable of integer number of hidden units per RNN layer. If
-        set, `cell_type` must also be specified and `rnn_cell_fn` must be
-        `None`.
-      cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
-        the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
-        `'gru'`. If set, `num_units` must also be specified and `rnn_cell_fn`
-        must be `None`.
-      rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
-        returns an object of type `tf.nn.rnn_cell.RNNCell` that will be used to
-        construct the RNN. If set, `num_units` and `cell_type` cannot be set.
-        This is for advanced users who need additional customization beyond
-        `num_units` and `cell_type`. Note that `tf.nn.rnn_cell.MultiRNNCell` is
-        needed for stacked RNNs.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model.
-      n_classes: Number of label classes. Defaults to 2, namely binary
-        classification. Must be > 1.
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to down weight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      label_vocabulary: A list of strings represents possible label values. If
-        given, labels must be string type and have any value in
-        `label_vocabulary`. If it is not given, that means labels are
-        already encoded as integer or float within [0, 1] for `n_classes=2` and
-        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
-        Also there will be errors if vocabulary is not provided and labels are
-        string.
-      optimizer: An instance of `tf.Optimizer` or string specifying optimizer
-        type. Defaults to Adagrad optimizer.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`.
-      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
-        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-      config: `RunConfig` object to configure the runtime settings.
-
-    Raises:
-      ValueError: If `num_units`, `cell_type`, and `rnn_cell_fn` are not
-        compatible.
-    """
-    rnn_cell_fn = _assert_rnn_cell_fn(rnn_cell_fn, num_units, cell_type)
-
-    if n_classes == 2:
-      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
-          weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-    else:
-      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
-          n_classes,
-          weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-
-    def _model_fn(features, labels, mode, config):
-      return _rnn_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          rnn_cell_fn=rnn_cell_fn,
-          sequence_feature_columns=tuple(sequence_feature_columns or []),
-          context_feature_columns=tuple(context_feature_columns or []),
-          optimizer=optimizer,
-          input_layer_partitioner=input_layer_partitioner,
-          config=config)
-    super(RNNClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
-
-
-class RNNEstimator(estimator.Estimator):
-  """An Estimator for TensorFlow RNN models with user-specified head.
-
-  Example:
-
-  ```python
-  token_sequence = sequence_categorical_column_with_hash_bucket(...)
-  token_emb = embedding_column(categorical_column=token_sequence, ...)
-
-  estimator = RNNEstimator(
-      head=tf.contrib.estimator.regression_head(),
-      sequence_feature_columns=[token_emb],
-      num_units=[32, 16], cell_type='lstm')
-
-  # Or with custom RNN cell:
-  def rnn_cell_fn(mode):
-    cells = [ tf.contrib.rnn.LSTMCell(size) for size in [32, 16] ]
-    if mode == tf.estimator.ModeKeys.TRAIN:
-      cells = [ tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=0.5)
-                    for cell in cells ]
-    return tf.contrib.rnn.MultiRNNCell(cells)
-
-  estimator = RNNEstimator(
-      head=tf.contrib.estimator.regression_head(),
-      sequence_feature_columns=[token_emb],
-      rnn_cell_fn=rnn_cell_fn)
-
-  # Input builders
-  def input_fn_train: # returns x, y
-    pass
-  estimator.train(input_fn=input_fn_train, steps=100)
-
-  def input_fn_eval: # returns x, y
-    pass
-  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
-  def input_fn_predict: # returns x, None
-    pass
-  predictions = estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-  otherwise there will be a `KeyError`:
-
-  * if the head's `weight_column` is not `None`, a feature with
-    `key=weight_column` whose value is a `Tensor`.
-  * for each `column` in `sequence_feature_columns`:
-    - a feature with `key=column.name` whose `value` is a `SparseTensor`.
-  * for each `column` in `context_feature_columns`:
-    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
-      with `key` the id column name, the second with `key` the weight column
-      name. Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss and predicted output are determined by the specified head.
-
-  @compatibility(eager)
-  Estimators are not compatible with eager execution.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               head,
-               sequence_feature_columns,
-               context_feature_columns=None,
-               num_units=None,
-               cell_type=USE_DEFAULT,
-               rnn_cell_fn=None,
-               model_dir=None,
-               optimizer='Adagrad',
-               input_layer_partitioner=None,
-               config=None):
-    """Initializes a `RNNClassifier` instance.
-
-    Args:
-      head: A `_Head` instance constructed with a method such as
-        `tf.contrib.estimator.multi_label_head`. This specifies the model's
-        output and loss function to be optimized.
-      sequence_feature_columns: An iterable containing the `FeatureColumn`s
-        that represent sequential input. All items in the set should either be
-        sequence columns (e.g. `sequence_numeric_column`) or constructed from
-        one (e.g. `embedding_column` with `sequence_categorical_column_*` as
-        input).
-      context_feature_columns: An iterable containing the `FeatureColumn`s
-        for contextual input. The data represented by these columns will be
-        replicated and given to the RNN at each timestep. These columns must be
-        instances of classes derived from `_DenseColumn` such as
-        `numeric_column`, not the sequential variants.
-      num_units: Iterable of integer number of hidden units per RNN layer. If
-        set, `cell_type` must also be specified and `rnn_cell_fn` must be
-        `None`.
-      cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
-        the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
-        `'gru'`. If set, `num_units` must also be specified and `rnn_cell_fn`
-        must be `None`.
-      rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
-        returns an object of type `tf.nn.rnn_cell.RNNCell` that will be used to
-        construct the RNN. If set, `num_units` and `cell_type` cannot be set.
-        This is for advanced users who need additional customization beyond
-        `num_units` and `cell_type`. Note that `tf.nn.rnn_cell.MultiRNNCell` is
-        needed for stacked RNNs.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model.
-      optimizer: An instance of `tf.Optimizer` or string specifying optimizer
-        type. Defaults to Adagrad optimizer.
-      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
-        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-      config: `RunConfig` object to configure the runtime settings.
+from tensorflow_estimator.contrib.estimator.python.estimator import rnn
 
-    Raises:
-      ValueError: If `num_units`, `cell_type`, and `rnn_cell_fn` are not
-        compatible.
-    """
-    rnn_cell_fn = _assert_rnn_cell_fn(rnn_cell_fn, num_units, cell_type)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+rnn.__all__ = [s for s in dir(rnn) if not s.startswith('__')]
 
-    def _model_fn(features, labels, mode, config):
-      return _rnn_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          rnn_cell_fn=rnn_cell_fn,
-          sequence_feature_columns=tuple(sequence_feature_columns or []),
-          context_feature_columns=tuple(context_feature_columns or []),
-          optimizer=optimizer,
-          input_layer_partitioner=input_layer_partitioner,
-          config=config)
-    super(RNNEstimator, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+from tensorflow_estimator.contrib.estimator.python.estimator.rnn import *
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn_test.py b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
deleted file mode 100644
index 89506ee6615cd838b0fe651e13eb3e7dd35d2aef..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/rnn_test.py
+++ /dev/null
@@ -1,1185 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for rnn.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.contrib.estimator.python.estimator import head as head_lib
-from tensorflow.contrib.estimator.python.estimator import rnn
-from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
-from tensorflow.core.example import example_pb2
-from tensorflow.core.example import feature_pb2
-from tensorflow.python.data.experimental.ops import readers
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.canned import parsing_utils
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column as fc
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.lib.io import python_io
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import rnn_cell
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import optimizer
-from tensorflow.python.training import training_util
-
-
-# Names of variables created by BasicRNNCell model.
-TOKEN_EMBEDDING_NAME = 'rnn/sequence_input_layer/input_layer/tokens_sequential_embedding/embedding_weights'
-CELL_WEIGHTS_NAME = 'rnn/rnn/basic_rnn_cell/kernel'
-CELL_BIAS_NAME = 'rnn/rnn/basic_rnn_cell/bias'
-MULTI_CELL_WEIGHTS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/kernel'
-MULTI_CELL_BIAS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/bias'
-LOGITS_WEIGHTS_NAME = 'rnn/logits/dense/kernel'
-LOGITS_BIAS_NAME = 'rnn/logits/dense/bias'
-
-
-def _assert_close(expected, actual, rtol=1e-04, name='assert_close'):
-  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
-    expected = ops.convert_to_tensor(expected, name='expected')
-    actual = ops.convert_to_tensor(actual, name='actual')
-    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
-    rtol = ops.convert_to_tensor(rtol, name='rtol')
-    return check_ops.assert_less(
-        rdiff,
-        rtol,
-        data=('Condition expected =~ actual did not hold element-wise:'
-              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
-              'rtol = ', rtol,),
-        name=scope)
-
-
-def create_checkpoint(rnn_weights, rnn_biases, logits_weights, logits_biases,
-                      global_step, model_dir):
-  """Create checkpoint file with provided model weights.
-
-  Args:
-    rnn_weights: Iterable of values of weights for the RNN cell.
-    rnn_biases: Iterable of values of biases for the RNN cell.
-    logits_weights: Iterable of values for matrix connecting RNN output to
-      logits.
-    logits_biases: Iterable of values for logits bias term.
-    global_step: Initial global step to save in checkpoint.
-    model_dir: Directory into which checkpoint is saved.
-  """
-  model_weights = {}
-  model_weights[CELL_WEIGHTS_NAME] = rnn_weights
-  model_weights[CELL_BIAS_NAME] = rnn_biases
-  model_weights[LOGITS_WEIGHTS_NAME] = logits_weights
-  model_weights[LOGITS_BIAS_NAME] = logits_biases
-
-  with ops.Graph().as_default():
-    # Create model variables.
-    for k, v in six.iteritems(model_weights):
-      variables_lib.Variable(v, name=k, dtype=dtypes.float32)
-
-    # Create non-model variables.
-    global_step_var = training_util.create_global_step()
-    assign_op = global_step_var.assign(global_step)
-
-    # Initialize vars and save checkpoint.
-    with monitored_session.MonitoredTrainingSession(
-        checkpoint_dir=model_dir) as sess:
-      sess.run(assign_op)
-
-
-class RNNLogitFnTest(test.TestCase):
-  """Tests correctness of logits calculated from _rnn_logit_fn_builder."""
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_logits(self, mode, rnn_units, logits_dimension, features_fn,
-                   sequence_feature_columns, context_feature_columns,
-                   expected_logits):
-    """Tests that the expected logits are calculated."""
-    with ops.Graph().as_default():
-      # Global step needed for MonitoredSession, which is in turn used to
-      # explicitly set variable weights through a checkpoint.
-      training_util.create_global_step()
-      # Use a variable scope here with 'rnn', emulating the rnn model_fn, so
-      # the checkpoint naming is shared.
-      with variable_scope.variable_scope('rnn'):
-        input_layer_partitioner = (
-            partitioned_variables.min_max_variable_partitioner(
-                max_partitions=0, min_slice_size=64 << 20))
-        logit_fn = rnn._rnn_logit_fn_builder(
-            output_units=logits_dimension,
-            rnn_cell_fn=rnn._make_rnn_cell_fn(rnn_units),
-            sequence_feature_columns=sequence_feature_columns,
-            context_feature_columns=context_feature_columns,
-            input_layer_partitioner=input_layer_partitioner)
-        # Features are constructed within this function, otherwise the Tensors
-        # containing the features would be defined outside this graph.
-        logits = logit_fn(features=features_fn(), mode=mode)
-        with monitored_session.MonitoredTrainingSession(
-            checkpoint_dir=self._model_dir) as sess:
-          self.assertAllClose(expected_logits, sess.run(logits), atol=1e-4)
-
-  def testOneDimLogits(self):
-    """Tests one-dimensional logits.
-
-    Intermediate values are rounded for ease in reading.
-    input_layer = [[[10]], [[5]]]
-    initial_state = [0, 0]
-    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
-                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
-                          = [[0.83, -0.91]]
-    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
-                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
-                          = [[0.53, -0.37]]
-    logits = [[-1*0.53 - 1*0.37 + 0.3]] = [[-0.6033]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1.], [1.]],
-        logits_biases=[0.3],
-        global_step=base_global_step,
-        model_dir=self._model_dir)
-
-    def features_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5.],
-                  indices=[[0, 0], [0, 1]],
-                  dense_shape=[1, 2]),
-      }
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-    context_feature_columns = []
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          rnn_units=[2],
-          logits_dimension=1,
-          features_fn=features_fn,
-          sequence_feature_columns=sequence_feature_columns,
-          context_feature_columns=context_feature_columns,
-          expected_logits=[[-0.6033]])
-
-  def testMultiDimLogits(self):
-    """Tests multi-dimensional logits.
-
-    Intermediate values are rounded for ease in reading.
-    input_layer = [[[10]], [[5]]]
-    initial_state = [0, 0]
-    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
-                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
-                          = [[0.83, -0.91]]
-    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
-                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
-                          = [[0.53, -0.37]]
-    logits = [[-1*0.53 - 1*0.37 + 0.3],
-              [0.5*0.53 + 0.3*0.37 + 0.4],
-              [0.2*0.53 - 0.1*0.37 + 0.5]
-           = [[-0.6033, 0.7777, 0.5698]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
-        logits_biases=[0.3, 0.4, 0.5],
-        global_step=base_global_step,
-        model_dir=self._model_dir)
-
-    def features_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5.],
-                  indices=[[0, 0], [0, 1]],
-                  dense_shape=[1, 2]),
-      }
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-    context_feature_columns = []
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          rnn_units=[2],
-          logits_dimension=3,
-          features_fn=features_fn,
-          sequence_feature_columns=sequence_feature_columns,
-          context_feature_columns=context_feature_columns,
-          expected_logits=[[-0.6033, 0.7777, 0.5698]])
-
-  def testMultiExampleMultiDim(self):
-    """Tests multiple examples and multi-dimensional logits.
-
-    Intermediate values are rounded for ease in reading.
-    input_layer = [[[10], [5]], [[2], [7]]]
-    initial_state = [[0, 0], [0, 0]]
-    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
-                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
-                             [tanh(.1*2 + .2*0 + .3*0 +.2),
-                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
-                          = [[0.83, -0.91], [0.38, 0.10]]
-    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
-                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
-                             [tanh(.1*7 + .2*.38 + .3*.10 +.2),
-                              tanh(-.2*7 - .3*.38 - .4*.10 +.5)]]
-                          = [[0.53, -0.37], [0.76, -0.78]
-    logits = [[-1*0.53 - 1*0.37 + 0.3,
-               0.5*0.53 + 0.3*0.37 + 0.4,
-               0.2*0.53 - 0.1*0.37 + 0.5],
-              [-1*0.76 - 1*0.78 + 0.3,
-               0.5*0.76 +0.3*0.78 + 0.4,
-               0.2*0.76 -0.1*0.78 + 0.5]]
-           = [[-0.6033, 0.7777, 0.5698], [-1.2473, 1.0170, 0.5745]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
-        logits_biases=[0.3, 0.4, 0.5],
-        global_step=base_global_step,
-        model_dir=self._model_dir)
-
-    def features_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5., 2., 7.],
-                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
-                  dense_shape=[2, 2]),
-      }
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))
-    ]
-    context_feature_columns = []
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          rnn_units=[2],
-          logits_dimension=3,
-          features_fn=features_fn,
-          sequence_feature_columns=sequence_feature_columns,
-          context_feature_columns=context_feature_columns,
-          expected_logits=[[-0.6033, 0.7777, 0.5698],
-                           [-1.2473, 1.0170, 0.5745]])
-
-  def testMultiExamplesDifferentLength(self):
-    """Tests multiple examples with different lengths.
-
-    Intermediate values are rounded for ease in reading.
-    input_layer = [[[10], [5]], [[2], [0]]]
-    initial_state = [[0, 0], [0, 0]]
-    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
-                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
-                             [tanh(.1*2 + .2*0 + .3*0 +.2),
-                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
-                          = [[0.83, -0.91], [0.38, 0.10]]
-    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
-                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
-                             [<ignored-padding>]]
-                          = [[0.53, -0.37], [<ignored-padding>]]
-    logits = [[-1*0.53 - 1*0.37 + 0.3],
-              [-1*0.38 + 1*0.10 + 0.3]]
-           = [[-0.6033], [0.0197]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1.], [1.]],
-        logits_biases=[0.3],
-        global_step=base_global_step,
-        model_dir=self._model_dir)
-
-    def features_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5., 2.],
-                  indices=[[0, 0], [0, 1], [1, 0]],
-                  dense_shape=[2, 2]),
-      }
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-    context_feature_columns = []
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          rnn_units=[2],
-          logits_dimension=1,
-          features_fn=features_fn,
-          sequence_feature_columns=sequence_feature_columns,
-          context_feature_columns=context_feature_columns,
-          expected_logits=[[-0.6033], [0.0197]])
-
-  def testMultiExamplesWithContext(self):
-    """Tests multiple examples with context features.
-
-    Intermediate values are rounded for ease in reading.
-    input_layer = [[[10, -0.5], [5, -0.5]], [[2, 0.8], [0, 0]]]
-    initial_state = [[0, 0], [0, 0]]
-    rnn_output_timestep_1 = [[tanh(.1*10 - 1*.5 + .2*0 + .3*0 +.2),
-                              tanh(-.2*10 - 0.9*.5 - .3*0 - .4*0 +.5)],
-                             [tanh(.1*2 + 1*.8 + .2*0 + .3*0 +.2),
-                              tanh(-.2*2 + .9*.8 - .3*0 - .4*0 +.5)]]
-                          = [[0.60, -0.96], [0.83, 0.68]]
-    rnn_output_timestep_2 = [[tanh(.1*5 - 1*.5 + .2*.60 - .3*.96 +.2),
-                              tanh(-.2*5 - .9*.5 - .3*.60 + .4*.96 +.5)],
-                             [<ignored-padding>]]
-                          = [[0.03, -0.63], [<ignored-padding>]]
-    logits = [[-1*0.03 - 1*0.63 + 0.3],
-              [-1*0.83 + 1*0.68 + 0.3]]
-           = [[-0.3662], [0.1414]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        # Context features weights are inserted between input and state weights.
-        rnn_weights=[[.1, -.2], [1., 0.9], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1.], [1.]],
-        logits_biases=[0.3],
-        global_step=base_global_step,
-        model_dir=self._model_dir)
-
-    def features_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5., 2.],
-                  indices=[[0, 0], [0, 1], [1, 0]],
-                  dense_shape=[2, 2]),
-          'context': [[-0.5], [0.8]],
-      }
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-    context_feature_columns = [fc.numeric_column('context', shape=(1,))]
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          rnn_units=[2],
-          logits_dimension=1,
-          features_fn=features_fn,
-          sequence_feature_columns=sequence_feature_columns,
-          context_feature_columns=context_feature_columns,
-          expected_logits=[[-0.3662], [0.1414]])
-
-  def testMultiExamplesMultiFeatures(self):
-    """Tests examples with multiple sequential feature columns.
-
-    Intermediate values are rounded for ease in reading.
-    input_layer = [[[1, 0, 10], [0, 1, 5]], [[1, 0, 2], [0, 0, 0]]]
-    initial_state = [[0, 0], [0, 0]]
-    rnn_output_timestep_1 = [[tanh(.5*1 + 1*0 + .1*10 + .2*0 + .3*0 +.2),
-                              tanh(-.5*1 - 1*0 - .2*10 - .3*0 - .4*0 +.5)],
-                             [tanh(.5*1 + 1*0 + .1*2 + .2*0 + .3*0 +.2),
-                              tanh(-.5*1 - 1*0 - .2*2 - .3*0 - .4*0 +.5)]]
-                          = [[0.94, -0.96], [0.72, -0.38]]
-    rnn_output_timestep_2 = [[tanh(.5*0 + 1*1 + .1*5 + .2*.94 - .3*.96 +.2),
-                              tanh(-.5*0 - 1*1 - .2*5 - .3*.94 + .4*.96 +.5)],
-                             [<ignored-padding>]]
-                          = [[0.92, -0.88], [<ignored-padding>]]
-    logits = [[-1*0.92 - 1*0.88 + 0.3],
-              [-1*0.72 - 1*0.38 + 0.3]]
-           = [[-1.5056], [-0.7962]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        # FeatureColumns are sorted alphabetically, so on_sale weights are
-        # inserted before price.
-        rnn_weights=[[.5, -.5], [1., -1.], [.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1.], [1.]],
-        logits_biases=[0.3],
-        global_step=base_global_step,
-        model_dir=self._model_dir)
-
-    def features_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5., 2.],
-                  indices=[[0, 0], [0, 1], [1, 0]],
-                  dense_shape=[2, 2]),
-          'on_sale':
-              sparse_tensor.SparseTensor(
-                  values=[0, 1, 0],
-                  indices=[[0, 0], [0, 1], [1, 0]],
-                  dense_shape=[2, 2]),
-      }
-
-    price_column = seq_fc.sequence_numeric_column('price', shape=(1,))
-    on_sale_column = fc.indicator_column(
-        seq_fc.sequence_categorical_column_with_identity(
-            'on_sale', num_buckets=2))
-    sequence_feature_columns = [price_column, on_sale_column]
-    context_feature_columns = []
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          rnn_units=[2],
-          logits_dimension=1,
-          features_fn=features_fn,
-          sequence_feature_columns=sequence_feature_columns,
-          context_feature_columns=context_feature_columns,
-          expected_logits=[[-1.5056], [-0.7962]])
-
-
-class RNNClassifierTrainingTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _assert_checkpoint(
-      self, n_classes, input_units, cell_units, expected_global_step):
-
-    shapes = {
-        name: shape for (name, shape) in
-        checkpoint_utils.list_variables(self._model_dir)
-    }
-
-    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
-    self.assertEqual(
-        expected_global_step,
-        checkpoint_utils.load_variable(
-            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
-
-    # RNN Cell variables.
-    if len(cell_units) > 1:
-      for i, cell_unit in enumerate(cell_units):
-        self.assertEqual([input_units + cell_unit, cell_unit],
-                         shapes[MULTI_CELL_WEIGHTS_NAME_PATTERN % i])
-        self.assertEqual([cell_unit],
-                         shapes[MULTI_CELL_BIAS_NAME_PATTERN % i])
-        input_units = cell_unit
-    elif len(cell_units) == 1:
-      self.assertEqual([input_units + cell_unit, cell_unit],
-                       shapes[CELL_WEIGHTS_NAME])
-      self.assertEqual([cell_unit], shapes[CELL_BIAS_NAME])
-
-    # Logits variables.
-    logits_dimension = n_classes if n_classes > 2 else 1
-    self.assertEqual([cell_units[-1], logits_dimension],
-                     shapes[LOGITS_WEIGHTS_NAME])
-    self.assertEqual([logits_dimension], shapes[LOGITS_BIAS_NAME])
-
-  def _mock_optimizer(self, expected_loss=None):
-    expected_var_names = [
-        '%s/part_0:0' % CELL_BIAS_NAME,
-        '%s/part_0:0' % CELL_WEIGHTS_NAME,
-        '%s/part_0:0' % LOGITS_BIAS_NAME,
-        '%s/part_0:0' % LOGITS_WEIGHTS_NAME,
-    ]
-
-    def _minimize(loss, global_step):
-      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      self.assertItemsEqual(
-          expected_var_names,
-          [var.name for var in trainable_vars])
-
-      # Verify loss. We can't check the value directly, so we add an assert op.
-      self.assertEquals(0, loss.shape.ndims)
-      if expected_loss is None:
-        return state_ops.assign_add(global_step, 1).op
-      assert_loss = _assert_close(
-          math_ops.to_float(expected_loss, name='expected'),
-          loss,
-          name='assert_loss')
-      with ops.control_dependencies((assert_loss,)):
-        return state_ops.assign_add(global_step, 1).op
-
-    mock_optimizer = test.mock.NonCallableMock(
-        spec=optimizer.Optimizer,
-        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
-    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
-
-    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
-    # So, return mock_optimizer itself for deepcopy.
-    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
-    return mock_optimizer
-
-  def testConflictingRNNCellFn(self):
-    col = seq_fc.sequence_categorical_column_with_hash_bucket(
-        'tokens', hash_bucket_size=10)
-    embed = fc.embedding_column(col, dimension=2)
-    cell_units = [4, 2]
-
-    with self.assertRaisesRegexp(
-        ValueError,
-        'num_units and cell_type must not be specified when using rnn_cell_fn'):
-      rnn.RNNClassifier(
-          sequence_feature_columns=[embed],
-          rnn_cell_fn=lambda x: x,
-          num_units=cell_units)
-
-    with self.assertRaisesRegexp(
-        ValueError,
-        'num_units and cell_type must not be specified when using rnn_cell_fn'):
-      rnn.RNNClassifier(
-          sequence_feature_columns=[embed],
-          rnn_cell_fn=lambda x: x,
-          cell_type='lstm')
-
-  def _testFromScratchWithDefaultOptimizer(self, n_classes):
-    def train_input_fn():
-      return {
-          'tokens':
-              sparse_tensor.SparseTensor(
-                  values=['the', 'cat', 'sat'],
-                  indices=[[0, 0], [0, 1], [0, 2]],
-                  dense_shape=[1, 3]),
-      }, [[1]]
-
-    col = seq_fc.sequence_categorical_column_with_hash_bucket(
-        'tokens', hash_bucket_size=10)
-    embed = fc.embedding_column(col, dimension=2)
-    input_units = 2
-
-    cell_units = [4, 2]
-    est = rnn.RNNClassifier(
-        sequence_feature_columns=[embed],
-        num_units=cell_units,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # Train for a few steps, and validate final checkpoint.
-    num_steps = 10
-    est.train(input_fn=train_input_fn, steps=num_steps)
-    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
-
-  def testBinaryClassFromScratchWithDefaultOptimizer(self):
-    self._testFromScratchWithDefaultOptimizer(n_classes=2)
-
-  def testMultiClassFromScratchWithDefaultOptimizer(self):
-    self._testFromScratchWithDefaultOptimizer(n_classes=4)
-
-  def testFromScratchWithCustomRNNCellFn(self):
-    def train_input_fn():
-      return {
-          'tokens':
-              sparse_tensor.SparseTensor(
-                  values=['the', 'cat', 'sat'],
-                  indices=[[0, 0], [0, 1], [0, 2]],
-                  dense_shape=[1, 3]),
-      }, [[1]]
-
-    col = seq_fc.sequence_categorical_column_with_hash_bucket(
-        'tokens', hash_bucket_size=10)
-    embed = fc.embedding_column(col, dimension=2)
-    input_units = 2
-    cell_units = [4, 2]
-    n_classes = 2
-
-    def rnn_cell_fn(mode):
-      del mode  # unused
-      cells = [rnn_cell.BasicRNNCell(num_units=n) for n in cell_units]
-      return rnn_cell.MultiRNNCell(cells)
-
-    est = rnn.RNNClassifier(
-        sequence_feature_columns=[embed],
-        rnn_cell_fn=rnn_cell_fn,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # Train for a few steps, and validate final checkpoint.
-    num_steps = 10
-    est.train(input_fn=train_input_fn, steps=num_steps)
-    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
-
-  def _testExampleWeight(self, n_classes):
-    def train_input_fn():
-      return {
-          'tokens':
-              sparse_tensor.SparseTensor(
-                  values=['the', 'cat', 'sat', 'dog', 'barked'],
-                  indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
-                  dense_shape=[2, 3]),
-          'w': [[1], [2]],
-      }, [[1], [0]]
-
-    col = seq_fc.sequence_categorical_column_with_hash_bucket(
-        'tokens', hash_bucket_size=10)
-    embed = fc.embedding_column(col, dimension=2)
-    input_units = 2
-
-    cell_units = [4, 2]
-    est = rnn.RNNClassifier(
-        num_units=cell_units,
-        sequence_feature_columns=[embed],
-        n_classes=n_classes,
-        weight_column='w',
-        model_dir=self._model_dir)
-
-    # Train for a few steps, and validate final checkpoint.
-    num_steps = 10
-    est.train(input_fn=train_input_fn, steps=num_steps)
-    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
-
-  def testBinaryClassWithExampleWeight(self):
-    self._testExampleWeight(n_classes=2)
-
-  def testMultiClassWithExampleWeight(self):
-    self._testExampleWeight(n_classes=4)
-
-  def testBinaryClassFromCheckpoint(self):
-    initial_global_step = 100
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1.], [1.]],
-        logits_biases=[0.3],
-        global_step=initial_global_step,
-        model_dir=self._model_dir)
-
-    def train_input_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5., 2.],
-                  indices=[[0, 0], [0, 1], [1, 0]],
-                  dense_shape=[2, 2]),
-      }, [[0], [1]]
-
-    # Uses same checkpoint and examples as testBinaryClassEvaluationMetrics.
-    # See that test for loss calculation.
-    mock_optimizer = self._mock_optimizer(expected_loss=0.559831)
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-    est = rnn.RNNClassifier(
-        num_units=[2],
-        sequence_feature_columns=sequence_feature_columns,
-        n_classes=2,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-    est.train(input_fn=train_input_fn, steps=10)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-
-  def testMultiClassFromCheckpoint(self):
-    initial_global_step = 100
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
-        logits_biases=[0.3, 0.4, 0.5],
-        global_step=initial_global_step,
-        model_dir=self._model_dir)
-
-    def train_input_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5., 2., 7.],
-                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
-                  dense_shape=[2, 2]),
-      }, [[0], [1]]
-
-    # Uses same checkpoint and examples as testMultiClassEvaluationMetrics.
-    # See that test for loss calculation.
-    mock_optimizer = self._mock_optimizer(expected_loss=1.331465)
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-    est = rnn.RNNClassifier(
-        num_units=[2],
-        sequence_feature_columns=sequence_feature_columns,
-        n_classes=3,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-    est.train(input_fn=train_input_fn, steps=10)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-
-
-def sorted_key_dict(unsorted_dict):
-  return {k: unsorted_dict[k] for k in sorted(unsorted_dict)}
-
-
-class RNNClassifierEvaluationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def testBinaryClassEvaluationMetrics(self):
-    global_step = 100
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1.], [1.]],
-        logits_biases=[0.3],
-        global_step=global_step,
-        model_dir=self._model_dir)
-
-    def eval_input_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5., 2.],
-                  indices=[[0, 0], [0, 1], [1, 0]],
-                  dense_shape=[2, 2]),
-      }, [[0], [1]]
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-
-    est = rnn.RNNClassifier(
-        num_units=[2],
-        sequence_feature_columns=sequence_feature_columns,
-        n_classes=2,
-        model_dir=self._model_dir)
-    eval_metrics = est.evaluate(eval_input_fn, steps=1)
-
-    # Uses identical numbers to testMultiExamplesWithDifferentLength.
-    # See that test for logits calculation.
-    # logits = [[-0.603282], [0.019719]]
-    # probability = exp(logits) / (1 + exp(logits)) = [[0.353593], [0.504930]]
-    # loss = -label * ln(p) - (1 - label) * ln(1 - p)
-    #      = [[0.436326], [0.683335]]
-    # sum_over_batch_size = (0.436326 + 0.683335)/2
-    expected_metrics = {
-        ops.GraphKeys.GLOBAL_STEP:
-            global_step,
-        metric_keys.MetricKeys.LOSS:
-            0.559831,
-        metric_keys.MetricKeys.LOSS_MEAN:
-            0.559831,
-        metric_keys.MetricKeys.ACCURACY:
-            1.0,
-        metric_keys.MetricKeys.PREDICTION_MEAN:
-            0.429262,
-        metric_keys.MetricKeys.LABEL_MEAN:
-            0.5,
-        metric_keys.MetricKeys.ACCURACY_BASELINE:
-            0.5,
-        # With default threshold of 0.5, the model is a perfect classifier.
-        metric_keys.MetricKeys.RECALL:
-            1.0,
-        metric_keys.MetricKeys.PRECISION:
-            1.0,
-        # Positive example is scored above negative, so AUC = 1.0.
-        metric_keys.MetricKeys.AUC:
-            1.0,
-        metric_keys.MetricKeys.AUC_PR:
-            1.0,
-    }
-    self.assertAllClose(
-        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
-
-  def testMultiClassEvaluationMetrics(self):
-    global_step = 100
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
-        logits_biases=[0.3, 0.4, 0.5],
-        global_step=global_step,
-        model_dir=self._model_dir)
-
-    def eval_input_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5., 2., 7.],
-                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
-                  dense_shape=[2, 2]),
-      }, [[0], [1]]
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-
-    est = rnn.RNNClassifier(
-        num_units=[2],
-        sequence_feature_columns=sequence_feature_columns,
-        n_classes=3,
-        model_dir=self._model_dir)
-    eval_metrics = est.evaluate(eval_input_fn, steps=1)
-
-    # Uses identical numbers to testMultiExampleMultiDim.
-    # See that test for logits calculation.
-    # logits = [[-0.603282, 0.777708, 0.569756],
-    #           [-1.247356, 1.017018, 0.574481]]
-    # logits_exp = exp(logits) / (1 + exp(logits))
-    #            = [[0.547013, 2.176468, 1.767836],
-    #               [0.287263, 2.764937, 1.776208]]
-    # softmax_probabilities = logits_exp / logits_exp.sum()
-    #                       = [[0.121793, 0.484596, 0.393611],
-    #                          [0.059494, 0.572639, 0.367866]]
-    # loss = -1. * log(softmax[label])
-    #      = [[2.105432], [0.557500]]
-    # sum_over_batch_size = (2.105432 + 0.557500)/2
-    expected_metrics = {
-        ops.GraphKeys.GLOBAL_STEP: global_step,
-        metric_keys.MetricKeys.LOSS: 1.331465,
-        metric_keys.MetricKeys.LOSS_MEAN: 1.331466,
-        metric_keys.MetricKeys.ACCURACY: 0.5,
-    }
-
-    self.assertAllClose(
-        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
-
-
-class RNNClassifierPredictionTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def testBinaryClassPredictions(self):
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1.], [1.]],
-        logits_biases=[0.3],
-        global_step=0,
-        model_dir=self._model_dir)
-
-    def predict_input_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5.],
-                  indices=[[0, 0], [0, 1]],
-                  dense_shape=[1, 2]),
-      }
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-    label_vocabulary = ['class_0', 'class_1']
-
-    est = rnn.RNNClassifier(
-        num_units=[2],
-        sequence_feature_columns=sequence_feature_columns,
-        n_classes=2,
-        label_vocabulary=label_vocabulary,
-        model_dir=self._model_dir)
-    # Uses identical numbers to testOneDimLogits.
-    # See that test for logits calculation.
-    # logits = [-0.603282]
-    # logistic = exp(-0.6033) / (1 + exp(-0.6033)) = [0.353593]
-    # probabilities = [0.646407, 0.353593]
-    # class_ids = argmax(probabilities) = [0]
-    predictions = next(est.predict(predict_input_fn))
-    self.assertAllClose([-0.603282],
-                        predictions[prediction_keys.PredictionKeys.LOGITS])
-    self.assertAllClose([0.353593],
-                        predictions[prediction_keys.PredictionKeys.LOGISTIC])
-    self.assertAllClose(
-        [0.646407, 0.353593],
-        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-    self.assertAllClose([0],
-                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
-    self.assertEqual([b'class_0'],
-                     predictions[prediction_keys.PredictionKeys.CLASSES])
-
-  def testMultiClassPredictions(self):
-    create_checkpoint(
-        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
-        rnn_biases=[.2, .5],
-        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
-        logits_biases=[0.3, 0.4, 0.5],
-        global_step=0,
-        model_dir=self._model_dir)
-
-    def predict_input_fn():
-      return {
-          'price':
-              sparse_tensor.SparseTensor(
-                  values=[10., 5.],
-                  indices=[[0, 0], [0, 1]],
-                  dense_shape=[1, 2]),
-      }
-
-    sequence_feature_columns = [
-        seq_fc.sequence_numeric_column('price', shape=(1,))]
-    label_vocabulary = ['class_0', 'class_1', 'class_2']
-
-    est = rnn.RNNClassifier(
-        num_units=[2],
-        sequence_feature_columns=sequence_feature_columns,
-        n_classes=3,
-        label_vocabulary=label_vocabulary,
-        model_dir=self._model_dir)
-    # Uses identical numbers to testMultiDimLogits.
-    # See that test for logits calculation.
-    # logits = [-0.603282, 0.777708, 0.569756]
-    # logits_exp = exp(logits) = [0.547013, 2.176468, 1.767836]
-    # softmax_probabilities = logits_exp / logits_exp.sum()
-    #                       = [0.121793, 0.484596, 0.393611]
-    # class_ids = argmax(probabilities) = [1]
-    predictions = next(est.predict(predict_input_fn))
-    self.assertAllClose([-0.603282, 0.777708, 0.569756],
-                        predictions[prediction_keys.PredictionKeys.LOGITS])
-    self.assertAllClose(
-        [0.121793, 0.484596, 0.393611],
-        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-    self.assertAllClose([1],
-                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
-    self.assertEqual([b'class_1'],
-                     predictions[prediction_keys.PredictionKeys.CLASSES])
-
-
-class BaseRNNClassificationIntegrationTest(object):
-
-  def __init__(self, _create_estimator_fn):
-    self._create_estimator_fn = _create_estimator_fn
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(self, feature_columns, train_input_fn, eval_input_fn,
-                          predict_input_fn, n_classes, batch_size):
-    cell_units = [4, 2]
-    est = self._create_estimator_fn(feature_columns, n_classes, cell_units,
-                                    self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUATE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predicted_proba = np.array([
-        x[prediction_keys.PredictionKeys.PROBABILITIES]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
-
-    # EXPORT
-    feature_spec = parsing_utils.classifier_parse_example_spec(
-        feature_columns,
-        label_key='label',
-        label_dtype=dtypes.int64)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def testNumpyInputFn(self):
-    """Tests complete flow with numpy_input_fn."""
-    n_classes = 3
-    batch_size = 10
-    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
-    # Numpy only supports dense input, so all examples will have same length.
-    # TODO(b/73160931): Update test when support for prepadded data exists.
-    sequence_length = 3
-
-    features = []
-    for _ in range(batch_size):
-      sentence = random.sample(words, sequence_length)
-      features.append(sentence)
-
-    x_data = np.array(features)
-    y_data = np.random.randint(n_classes, size=batch_size)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'tokens': x_data},
-        y=y_data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'tokens': x_data},
-        y=y_data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'tokens': x_data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    col = seq_fc.sequence_categorical_column_with_hash_bucket(
-        'tokens', hash_bucket_size=10)
-    embed = fc.embedding_column(col, dimension=2)
-    feature_columns = [embed]
-
-    self._test_complete_flow(
-        feature_columns=feature_columns,
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        n_classes=n_classes,
-        batch_size=batch_size)
-
-  def testParseExampleInputFn(self):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    n_classes = 3
-    batch_size = 10
-    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']
-
-    _, examples_file = tempfile.mkstemp()
-    writer = python_io.TFRecordWriter(examples_file)
-    for _ in range(batch_size):
-      sequence_length = random.randint(1, len(words))
-      sentence = random.sample(words, sequence_length)
-      label = random.randint(0, n_classes - 1)
-      example = example_pb2.Example(features=feature_pb2.Features(
-          feature={
-              'tokens':
-                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                      value=sentence)),
-              'label':
-                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                      value=[label])),
-          }))
-      writer.write(example.SerializeToString())
-    writer.close()
-
-    col = seq_fc.sequence_categorical_column_with_hash_bucket(
-        'tokens', hash_bucket_size=10)
-    embed = fc.embedding_column(col, dimension=2)
-    feature_columns = [embed]
-    feature_spec = parsing_utils.classifier_parse_example_spec(
-        feature_columns,
-        label_key='label',
-        label_dtype=dtypes.int64)
-
-    def _train_input_fn():
-      dataset = readers.make_batched_features_dataset(
-          examples_file, batch_size, feature_spec)
-      return dataset.map(lambda features: (features, features.pop('label')))
-    def _eval_input_fn():
-      dataset = readers.make_batched_features_dataset(
-          examples_file, batch_size, feature_spec, num_epochs=1)
-      return dataset.map(lambda features: (features, features.pop('label')))
-    def _predict_input_fn():
-      dataset = readers.make_batched_features_dataset(
-          examples_file, batch_size, feature_spec, num_epochs=1)
-      def features_fn(features):
-        features.pop('label')
-        return features
-      return dataset.map(features_fn)
-
-    self._test_complete_flow(
-        feature_columns=feature_columns,
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        n_classes=n_classes,
-        batch_size=batch_size)
-
-
-def _rnn_classifier_fn(feature_columns, n_classes, cell_units, model_dir):
-  return rnn.RNNClassifier(
-      num_units=cell_units,
-      sequence_feature_columns=feature_columns,
-      n_classes=n_classes,
-      model_dir=model_dir)
-
-
-class RNNClassifierIntegrationTest(BaseRNNClassificationIntegrationTest,
-                                   test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    BaseRNNClassificationIntegrationTest.__init__(self, _rnn_classifier_fn)
-
-
-def _rnn_estimator_fn(feature_columns, n_classes, cell_units, model_dir):
-  return rnn.RNNEstimator(
-      head=head_lib.multi_class_head(n_classes=n_classes),
-      num_units=cell_units,
-      sequence_feature_columns=feature_columns,
-      model_dir=model_dir)
-
-
-class RNNEstimatorIntegrationTest(BaseRNNClassificationIntegrationTest,
-                                  test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    BaseRNNClassificationIntegrationTest.__init__(self, _rnn_estimator_fn)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py
index ce98e9987ec728fadf170e56fe4bfe24fc9a0105..031cae4071cafdb7097a33a2d2f3d07085305984 100644
--- a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py
+++ b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py
@@ -12,438 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Class that creates an Estimator from a SavedModel."""
+"""saved_model_estimator python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export as export_lib
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import constants
-from tensorflow.python.saved_model import loader_impl
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import training_util
-
-
-class SavedModelEstimator(estimator_lib.Estimator):
-  """Create an Estimator from a SavedModel.
-
-  Only SavedModels exported with
-  `tf.contrib.estimator.export_all_saved_models()` or
-  `tf.estimator.Estimator.export_savedmodel()` are supported for this class.
-
-  Example with `tf.estimator.DNNClassifier`:
-
-  **Step 1: Create and train DNNClassifier.**
-
-  ```python
-  feature1 = tf.feature_column.embedding_column(
-      tf.feature_column.categorical_column_with_vocabulary_list(
-          key='feature1', vocabulary_list=('green', 'yellow')), dimension=1)
-  feature2 = tf.feature_column.numeric_column(key='feature2', default_value=0.0)
-
-  classifier = tf.estimator.DNNClassifier(
-      hidden_units=[4,2], feature_columns=[feature1, feature2])
-
-  def input_fn():
-    features = {'feature1': tf.constant(['green', 'green', 'yellow']),
-                'feature2': tf.constant([3.5, 4.2, 6.1])}
-    label = tf.constant([1., 0., 0.])
-    return tf.data.Dataset.from_tensors((features, label)).repeat()
-
-  classifier.train(input_fn=input_fn, steps=10)
-  ```
-
-  **Step 2: Export classifier.**
-  First, build functions that specify the expected inputs.
-
-  ```python
-  # During train and evaluation, both the features and labels should be defined.
-  supervised_input_receiver_fn = (
-      tf.contrib.estimator.build_raw_supervised_input_receiver_fn(
-          {'feature1': tf.placeholder(dtype=tf.string, shape=[None]),
-           'feature2': tf.placeholder(dtype=tf.float32, shape=[None])},
-          tf.placeholder(dtype=tf.float32, shape=[None])))
-
-  # During predict mode, expect to receive a `tf.Example` proto, so a parsing
-  # function is used.
-  serving_input_receiver_fn = (
-      tf.estimator.export.build_parsing_serving_input_receiver_fn(
-          tf.feature_column.make_parse_example_spec([feature1, feature2])))
-  ```
-
-  Next, export the model as a SavedModel. A timestamped directory will be
-  created (for example `/tmp/export_all/1234567890`).
-
-  ```python
-  # Option 1: Save all modes (train, eval, predict)
-  export_dir = tf.contrib.estimator.export_all_saved_models(
-      classifier, '/tmp/export_all',
-      {tf.estimator.ModeKeys.TRAIN: supervised_input_receiver_fn,
-       tf.estimator.ModeKeys.EVAL: supervised_input_receiver_fn,
-       tf.estimator.ModeKeys.PREDICT: serving_input_receiver_fn})
-
-  # Option 2: Only export predict mode
-  export_dir = classifier.export_savedmodel(
-      '/tmp/export_predict', serving_input_receiver_fn)
-  ```
-
-  **Step 3: Create a SavedModelEstimator from the exported SavedModel.**
-
-  ```python
-  est = tf.contrib.estimator.SavedModelEstimator(export_dir)
-
-  # If all modes were exported, you can immediately evaluate and predict, or
-  # continue training. Otherwise only predict is available.
-  eval_results = est.evaluate(input_fn=input_fn, steps=1)
-  print(eval_results)
-
-  est.train(input_fn=input_fn, steps=20)
-
-  def predict_input_fn():
-    example = tf.train.Example()
-    example.features.feature['feature1'].bytes_list.value.extend(['yellow'])
-    example.features.feature['feature2'].float_list.value.extend([1.])
-    return {'inputs':tf.constant([example.SerializeToString()])}
-
-  predictions = est.predict(predict_input_fn)
-  print(next(predictions))
-  ```
-  """
-
-  def __init__(self, saved_model_dir, model_dir=None):
-    """Initialize a SavedModelEstimator.
-
-    The SavedModelEstimator loads its model function and variable values from
-    the graphs defined in the SavedModel. There is no option to pass in
-    `RunConfig` or `params` arguments, because the model function graph is
-    defined statically in the SavedModel.
-
-    Args:
-      saved_model_dir: Directory containing SavedModel protobuf and subfolders.
-      model_dir: Directory to save new checkpoints during training.
-
-    Raises:
-      NotImplementedError: If a DistributionStrategy is defined in the config.
-        Unless the SavedModelEstimator is subclassed, this shouldn't happen.
-    """
-    checkpoint = estimator_lib._get_saved_model_ckpt(saved_model_dir)  # pylint: disable=protected-access
-    vars_to_warm_start = [name for name, _ in
-                          checkpoint_utils.list_variables(checkpoint)]
-    warm_start_settings = estimator_lib.WarmStartSettings(
-        ckpt_to_initialize_from=checkpoint,
-        vars_to_warm_start=vars_to_warm_start)
-
-    super(SavedModelEstimator, self).__init__(
-        model_fn=self._model_fn_from_saved_model, model_dir=model_dir,
-        warm_start_from=warm_start_settings)
-    if self._train_distribution or self._eval_distribution:
-      raise NotImplementedError(
-          'SavedModelEstimator currently does not support '
-          'DistributionStrategy.')
-    self.saved_model_dir = saved_model_dir
-    self.saved_model_loader = loader_impl.SavedModelLoader(saved_model_dir)
-    self._available_modes = self._extract_available_modes()
-
-  def _extract_available_modes(self):
-    """Return list of modes found in SavedModel."""
-    available_modes = []
-    logging.info('Checking available modes for SavedModelEstimator.')
-    for mode in [model_fn_lib.ModeKeys.TRAIN, model_fn_lib.ModeKeys.EVAL,
-                 model_fn_lib.ModeKeys.PREDICT]:
-      try:
-        self._get_meta_graph_def_for_mode(mode)
-      except RuntimeError:
-        logging.warning('%s mode not found in SavedModel.' % mode)
-        continue
-
-      if self._get_signature_def_for_mode(mode) is not None:
-        available_modes.append(mode)
-
-    logging.info('Available modes for Estimator: %s' % available_modes)
-    return available_modes
-
-  def _validate_mode(self, mode):
-    """Make sure that mode can be run using the SavedModel."""
-    if mode not in self._available_modes:
-      raise RuntimeError('%s mode is not available in the SavedModel. Use '
-                         'saved_model_cli to check that the Metagraph for this '
-                         'mode has been exported.' % mode)
-
-  def _get_meta_graph_def_for_mode(self, mode):
-    tags = model_fn_lib.EXPORT_TAG_MAP[mode]
-    return self.saved_model_loader.get_meta_graph_def_from_tags(tags)
-
-  def _get_signature_def_for_mode(self, mode):
-    meta_graph_def = self._get_meta_graph_def_for_mode(mode)
-    sig_def_key = (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-                   if mode == model_fn_lib.ModeKeys.PREDICT else mode)
-    if sig_def_key not in meta_graph_def.signature_def:
-      logging.warning('Metagraph for mode %s was found, but SignatureDef with'
-                      ' key \"%s\" is missing.' % (mode, sig_def_key))
-      return None
-    return meta_graph_def.signature_def[sig_def_key]
-
-  def _create_and_assert_global_step(self, graph):
-    # Do nothing here. The global step variable will be created/loaded from the
-    # SavedModel. If a global step variable were created here, the result
-    # will be two duplicate global step variables, causing issues during
-    # the warm-start phase.
-    # Due to the global variable being created in the model function, this may
-    # cause issues when running DistributionStrategy. Thus, DistributionStrategy
-    # is not yet supported with SavedModelEstimator.
-    return None
-
-  def _model_fn_from_saved_model(self, features, labels, mode):
-    """Load a SavedModel graph and return an EstimatorSpec."""
-    # TODO(kathywu): Model function loads placeholders from the graph. Calling
-    # export_all_saved_models creates another placeholder for the inputs, on top
-    # of the original placeholders. There should be a way to avoid this.
-    self._validate_mode(mode)
-
-    g = ops.get_default_graph()
-    if  training_util.get_global_step(g) is not None:
-      raise RuntimeError(
-          'Graph must not contain a global step tensor before the SavedModel is'
-          ' loaded. Please make sure that the input function does not create a '
-          'global step.')
-
-    # Extract SignatureDef for information about the input and output tensors.
-    signature_def = self._get_signature_def_for_mode(mode)
-
-    # Generate input map for replacing the inputs in the SavedModel graph with
-    # the provided features and labels.
-    input_map = _generate_input_map(signature_def, features, labels)
-
-    # Create a list of the names of output tensors. When the graph is loaded,
-    # names of the output tensors may be remapped. This ensures that the correct
-    # tensors are returned in the EstimatorSpec.
-    output_tensor_names = [
-        value.name for value in six.itervalues(signature_def.outputs)]
-
-    # Load the graph. `output_tensors` contains output `Tensors` in the same
-    # same order as the `output_tensor_names` list.
-    tags = model_fn_lib.EXPORT_TAG_MAP[mode]
-    _, output_tensors = self.saved_model_loader.load_graph(
-        g, tags, input_map=input_map, return_elements=output_tensor_names)
-
-    # Create a scaffold from the MetaGraphDef that contains ops to initialize
-    # the graph. This should mirror the steps from _add_meta_graph_for_mode(),
-    # which creates a MetaGraphDef from the EstimatorSpec's scaffold.
-    scaffold = monitored_session.Scaffold(
-        local_init_op=loader_impl._get_main_op_tensor(  # pylint: disable=protected-access
-            self._get_meta_graph_def_for_mode(mode)))
-
-    # Ensure that a global step tensor has been created.
-    global_step_tensor = training_util.get_global_step(g)
-    training_util.assert_global_step(global_step_tensor)
-
-    # Extract values to return in the EstimatorSpec.
-    output_map = dict(zip(output_tensor_names, output_tensors))
-    outputs = {key: output_map[value.name]
-               for key, value in six.iteritems(signature_def.outputs)}
-
-    loss, predictions, metrics = _validate_and_extract_outputs(
-        mode, outputs, signature_def.method_name)
-
-    train_op = ops.get_collection(constants.TRAIN_OP_KEY)
-    if len(train_op) > 1:
-      raise RuntimeError('Multiple ops found in the train_op collection.')
-    train_op = None if not train_op else train_op[0]
-
-    _clear_saved_model_collections()
-    return model_fn_lib.EstimatorSpec(
-        scaffold=scaffold,
-        mode=mode,
-        loss=loss,
-        train_op=train_op,
-        predictions=predictions,
-        eval_metric_ops=metrics)
-
-
-def _clear_saved_model_collections():
-  """Clear collections that are expected empty when exporting a SavedModel.
-
-  The SavedModel builder uses these collections to track ops necessary to
-  restore the graph state. These collections are expected to be empty before
-  MetaGraphs are added to the builder.
-  """
-  del ops.get_collection_ref(constants.ASSETS_KEY)[:]
-  del ops.get_collection_ref(constants.LEGACY_INIT_OP_KEY)[:]
-  del ops.get_collection_ref(constants.MAIN_OP_KEY)[:]
-  del ops.get_collection_ref(constants.TRAIN_OP_KEY)[:]
-
-
-def _generate_input_map(signature_def, features, labels):
-  """Return dict mapping an input tensor name to a feature or label tensor.
-
-  Args:
-    signature_def: SignatureDef loaded from SavedModel
-    features: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
-      `SparseTensor`, specifying the features to be passed to the model.
-    labels: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
-      `SparseTensor`, specifying the labels to be passed to the model. May be
-      `None`.
-
-  Returns:
-    dict mapping string names of inputs to features or labels tensors
-
-  Raises:
-    ValueError: if SignatureDef inputs are not completely mapped by the input
-      features and labels.
-  """
-  # pylint: disable=protected-access
-  if not isinstance(features, dict):
-    features = {export_lib._SINGLE_FEATURE_DEFAULT_NAME: features}
-  if labels is not None and not isinstance(labels, dict):
-    labels = {export_lib._SINGLE_LABEL_DEFAULT_NAME: labels}
-  # pylint: enable=protected-access
-
-  inputs = signature_def.inputs
-  input_map = {}
-  for key, tensor_info in six.iteritems(inputs):
-    input_name = tensor_info.name
-    if ':' in input_name:
-      input_name = input_name[:input_name.find(':')]
-
-    # When tensors are used as control inputs for operations, their names are
-    # prepended with a '^' character in the GraphDef. To handle possible control
-    # flow edge cases, control input names must be included in the input map.
-    control_dependency_name = '^' + input_name
-
-    if key in features:
-      _check_same_dtype_and_shape(features[key], tensor_info, key)
-      input_map[input_name] = input_map[control_dependency_name] = features[key]
-    elif labels is not None and key in labels:
-      _check_same_dtype_and_shape(labels[key], tensor_info, key)
-      input_map[input_name] = input_map[control_dependency_name] = labels[key]
-    else:
-      raise ValueError(
-          'Key \"%s\" not found in features or labels passed in to the model '
-          'function. All required keys: %s' % (key, inputs.keys()))
-
-  return input_map
-
-
-def _check_same_dtype_and_shape(tensor, tensor_info, name):
-  """Validate that tensor has the same properties as the TensorInfo proto.
-
-  Args:
-    tensor: a `Tensor` object.
-    tensor_info: a `TensorInfo` proto.
-    name: Name of the input (to identify Tensor if an error is raised).
-
-  Raises:
-    ValueError: If the tensor shape or dtype don't match the TensorInfo
-  """
-  dtype_error = (tensor.dtype != dtypes.DType(tensor_info.dtype))
-  shape_error = not tensor.shape.is_compatible_with(tensor_info.tensor_shape)
-
-  if dtype_error or shape_error:
-    msg = 'Tensor shape and/or dtype validation failed for input %s:' % name
-    if dtype_error:
-      msg += ('\n\tExpected dtype: %s, Got: %s'
-              % (dtypes.DType(tensor_info.dtype), tensor.dtype))
-    if shape_error:
-      msg += ('\n\tExpected shape: %s, Got: %s'
-              % (tensor_shape.TensorShape(tensor_info.tensor_shape),
-                 tensor.shape))
-
-    raise ValueError(msg)
-
-
-def _extract_eval_metrics(output_dict):
-  """Return a eval metric dict extracted from the output_dict.
-
-  Eval metrics consist of a value tensor and an update op. Both must be in the
-  passed-in tensor dictionary for an eval metric to be added to the returned
-  dictionary.
-
-  Args:
-    output_dict: a dict that maps strings to tensors.
-
-  Returns:
-    dict mapping strings to (value, update_op) tuples.
-  """
-  # pylint: disable=protected-access
-  metric_ops = {}
-  separator_char = export_output._SupervisedOutput._SEPARATOR_CHAR
-
-  for key, tensor in six.iteritems(output_dict):
-    split_key = key.split(separator_char)
-
-    # The metric name may contain the separator character, so recreate its name.
-    metric_name = separator_char.join(split_key[:-1])
-
-    if split_key[0] == export_output._SupervisedOutput.METRICS_NAME:
-      # If the key ends with the value suffix, and there is a corresponding
-      # key ending with the update_op suffix, then add tensors to metrics dict.
-      if split_key[-1] == export_output._SupervisedOutput.METRIC_VALUE_SUFFIX:
-        update_op = ''.join(
-            [metric_name, separator_char,
-             export_output._SupervisedOutput.METRIC_UPDATE_SUFFIX])
-        if update_op in output_dict:
-          update_op_tensor = output_dict[update_op]
-          metric_ops[metric_name] = (tensor, update_op_tensor)
-
-  # pylint: enable=protected-access
-  return metric_ops
-
-
-def _validate_and_extract_outputs(mode, output_dict, method_name):
-  """Extract values from SignatureDef output dictionary.
-
-  Args:
-    mode: One of the modes enumerated in `tf.estimator.ModeKeys`.
-    output_dict: dict of string SignatureDef keys to `Tensor`.
-    method_name: Method name of the SignatureDef as a string.
-
-  Returns:
-    Tuple of (
-      loss: `Tensor` object,
-      predictions: dictionary mapping string keys to `Tensor` objects,
-      metrics: dictionary mapping string keys to a tuple of two `Tensor` objects
-    )
-
-  Raises:
-    RuntimeError: raised if SignatureDef has an invalid method name for the mode
-  """
-  # pylint: disable=protected-access
-  loss, predictions, metrics = None, None, None
-
-  if mode == model_fn_lib.ModeKeys.PREDICT:
-    predictions = output_dict
-  else:
-    # Validate that the SignatureDef's method name matches the expected name for
-    # the given mode.
-    expected_method_name = signature_constants.SUPERVISED_TRAIN_METHOD_NAME
-    if mode == model_fn_lib.ModeKeys.EVAL:
-      expected_method_name = signature_constants.SUPERVISED_EVAL_METHOD_NAME
-    if method_name != expected_method_name:
-      raise RuntimeError(
-          'Invalid SignatureDef method name for mode %s.\n\tExpected: %s\n\t'
-          'Got: %s\nPlease ensure that the SavedModel was exported with '
-          '`tf.contrib.estimator.export_all_saved_models()`.' %
-          (mode, expected_method_name, method_name))
+from tensorflow_estimator.contrib.estimator.python.estimator import saved_model_estimator
 
-    # Extract loss, metrics and predictions from the output dict.
-    loss = output_dict[export_output._SupervisedOutput.LOSS_NAME]
-    metrics = _extract_eval_metrics(output_dict)
-    predictions = {
-        key: value for key, value in six.iteritems(output_dict)
-        if key.split(export_output._SupervisedOutput._SEPARATOR_CHAR)[0] == (
-            export_output._SupervisedOutput.PREDICTIONS_NAME)}
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+saved_model_estimator.__all__ = [
+    s for s in dir(saved_model_estimator) if not s.startswith('__')
+]
 
-  # pylint: enable=protected-access
-  return loss, predictions, metrics
+from tensorflow_estimator.contrib.estimator.python.estimator.saved_model_estimator import *
diff --git a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py
deleted file mode 100644
index 718da1367ce69285f37269c5631fa0be2b050c97..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py
+++ /dev/null
@@ -1,369 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for SavedModelEstimator."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import shutil
-import tempfile
-
-from tensorflow.contrib.estimator.python.estimator import export as contrib_export
-from tensorflow.contrib.estimator.python.estimator import saved_model_estimator
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import training
-
-
-def dummy_input_fn():
-  return dataset_ops.Dataset.from_tensors((
-      {'x': constant_op.constant([[1], [-2]], dtype=dtypes.int64)},
-      constant_op.constant([[4], [-3]], dtype=dtypes.float32))).repeat()
-
-
-def dummy_input_fn_features_only():
-  return dataset_ops.Dataset.from_tensors(
-      {'x': constant_op.constant([[5], [6]], dtype=dtypes.int64)}).repeat()
-
-
-def dummy_supervised_receiver_fn():
-  feature_spec = {
-      'x': array_ops.placeholder(
-          dtype=dtypes.int64, shape=(2, 1), name='feature_x'),
-      }
-  label_spec = array_ops.placeholder(
-      dtype=dtypes.float32, shape=[2, 1], name='truth')
-  return export.build_raw_supervised_input_receiver_fn(
-      feature_spec, label_spec)
-
-
-def dummy_serving_receiver_fn():
-  feature_spec = {'x': array_ops.placeholder(
-      dtype=dtypes.int64, shape=(2, 1), name='feature_x'),}
-  return export.build_raw_serving_input_receiver_fn(feature_spec)
-
-
-def model_fn_diff_modes(features, labels, mode):
-  _, _ = features, labels
-  v = variables.Variable(21, name='some_var')
-  train_op = None
-  loss = constant_op.constant(104)
-  if mode == model_fn_lib.ModeKeys.TRAIN:
-    loss = constant_op.constant(105)
-    predictions = constant_op.constant([501])
-    train_op = control_flow_ops.group(
-        state_ops.assign_add(training.get_global_step(), 1),
-        state_ops.assign_add(v, 3))
-  elif mode == model_fn_lib.ModeKeys.EVAL:
-    loss = constant_op.constant(106)
-    predictions = constant_op.constant([502])
-  else:
-    loss = constant_op.constant(107)
-    predictions = constant_op.constant([503])
-  return model_fn_lib.EstimatorSpec(
-      mode,
-      loss=loss,
-      train_op=train_op,
-      eval_metric_ops={
-          'abs_err': metrics_lib.mean_absolute_error(
-              constant_op.constant(0), predictions)},
-      predictions=predictions)
-
-
-class SavedModelEstimatorTest(test.TestCase):
-
-  def setUp(self):
-    self.tmpdirs = []
-
-  def tearDown(self):
-    for tmpdir in self.tmpdirs:
-      # gfile.DeleteRecursively fails in the windows cmake test, so use shutil.
-      shutil.rmtree(tmpdir, ignore_errors=True)
-    self.tmpdirs = []
-
-  def _get_tmp_dir(self):
-    tmpdir = tempfile.mkdtemp()
-    self.tmpdirs.append(tmpdir)
-    return tmpdir
-
-  def _export_estimator(self, train=True, evaluate=True, predict=True,
-                        model_fn=model_fn_diff_modes):
-    est = estimator.Estimator(model_fn, self._get_tmp_dir())
-    est.train(input_fn=dummy_input_fn, steps=10)
-
-    input_receiver_fn_map = {}
-    if train:
-      input_receiver_fn_map[model_fn_lib.ModeKeys.TRAIN] = (
-          dummy_supervised_receiver_fn())
-    if evaluate:
-      input_receiver_fn_map[model_fn_lib.ModeKeys.EVAL] = (
-          dummy_supervised_receiver_fn())
-    if predict:
-      input_receiver_fn_map[model_fn_lib.ModeKeys.PREDICT] = (
-          dummy_serving_receiver_fn())
-
-    export_base_path = self._get_tmp_dir()
-    export_dir = contrib_export.export_all_saved_models(
-        est, export_base_path, input_receiver_fn_map)
-    return export_dir
-
-  def test_load_all_modes(self):
-    sme = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(), self._get_tmp_dir())
-    sme.train(input_fn=dummy_input_fn, steps=1)
-    sme.train(input_fn=dummy_input_fn, steps=2)
-    self.assertEqual(13, sme.get_variable_value('global_step'))
-    self.assertEqual(60, sme.get_variable_value('some_var'))
-
-    eval_results = sme.evaluate(dummy_input_fn, steps=5)
-
-    self.assertEqual(13, eval_results['global_step'])
-    self.assertEqual(106, eval_results['loss'])
-    self.assertEqual(502, eval_results['metrics/abs_err'])
-
-    predictions = next(sme.predict(dummy_input_fn_features_only))
-    self.assertDictEqual({'output': 503}, predictions)
-
-  def test_load_all_modes_no_train(self):
-    """Ensure that all functions can be used without requiring a ckpt."""
-    sme = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(), self._get_tmp_dir())
-    eval_results = sme.evaluate(dummy_input_fn, steps=5)
-    self.assertEqual(10, eval_results['global_step'])
-    self.assertEqual(106, eval_results['loss'])
-    self.assertEqual(502, eval_results['metrics/abs_err'])
-
-    predictions = next(sme.predict(dummy_input_fn_features_only))
-    self.assertDictEqual({'output': 503}, predictions)
-
-  def test_partial_exported_estimator(self):
-    sme1 = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(train=False, predict=False), self._get_tmp_dir())
-    sme1.evaluate(dummy_input_fn, steps=5)
-    with self.assertRaisesRegexp(RuntimeError, 'train mode is not available'):
-      sme1.train(input_fn=dummy_input_fn, steps=1)
-    with self.assertRaisesRegexp(RuntimeError, 'infer mode is not available'):
-      next(sme1.predict(dummy_input_fn_features_only))
-
-    sme2 = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(evaluate=False), self._get_tmp_dir())
-    sme2.train(input_fn=dummy_input_fn, steps=1)
-    next(sme2.predict(dummy_input_fn_features_only))
-    with self.assertRaisesRegexp(RuntimeError, 'eval mode is not available'):
-      sme2.evaluate(dummy_input_fn, steps=5)
-
-  def test_with_incorrect_input(self):
-    sme = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(), self._get_tmp_dir())
-
-    def bad_shape_input_fn():
-      return dataset_ops.Dataset.from_tensors((
-          {'x': constant_op.constant([1, 2], dtype=dtypes.int64)},
-          constant_op.constant([1, 2], dtype=dtypes.float32)))
-
-    with self.assertRaisesRegexp(ValueError, 'Expected shape'):
-      sme.train(bad_shape_input_fn, steps=1)
-
-    def bad_dtype_input_fn():
-      return dataset_ops.Dataset.from_tensors((
-          {'x': constant_op.constant([[1], [1]], dtype=dtypes.int32)},
-          constant_op.constant([[1], [1]], dtype=dtypes.int64)))
-
-    with self.assertRaisesRegexp(ValueError, 'Expected dtype'):
-      sme.train(bad_dtype_input_fn, steps=1)
-
-  def test_input_fn_with_global_step(self):
-    sme = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(), self._get_tmp_dir())
-
-    def bad_input_fn():
-      training.get_or_create_global_step()
-      return dataset_ops.Dataset.from_tensors((
-          {'x': constant_op.constant([[1], [1]], dtype=dtypes.int64)},
-          constant_op.constant([[1], [1]], dtype=dtypes.float32)))
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 'Graph must not contain a global step tensor'):
-      sme.train(bad_input_fn, steps=1)
-
-  def test_re_export_saved_model_serving_only(self):
-    sme = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(), self._get_tmp_dir())
-    sme.train(dummy_input_fn, steps=3)
-    self.assertEqual(13, sme.get_variable_value('global_step'))
-    self.assertEqual(60, sme.get_variable_value('some_var'))
-
-    predictions = next(sme.predict(dummy_input_fn_features_only))
-    self.assertDictEqual({'output': 503}, predictions)
-
-    # Export SavedModel, and test that the variable and prediction values are
-    # the same.
-    sme_export_dir = sme.export_savedmodel(
-        self._get_tmp_dir(), dummy_serving_receiver_fn())
-
-    sme2 = saved_model_estimator.SavedModelEstimator(
-        sme_export_dir, self._get_tmp_dir())
-    self.assertEqual(60, sme.get_variable_value('some_var'))
-    self.assertEqual(13, sme.get_variable_value('global_step'))
-
-    predictions = next(sme2.predict(dummy_input_fn_features_only))
-    self.assertDictEqual({'output': 503}, predictions)
-
-  def test_re_export_saved_model(self):
-    sme = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(), self._get_tmp_dir())
-    self.assertDictEqual(
-        {'loss': 106, 'metrics/abs_err': 502, 'global_step': 10},
-        sme.evaluate(dummy_input_fn, steps=1))
-
-    sme.train(dummy_input_fn, steps=3)
-    self.assertDictEqual(
-        {'loss': 106, 'metrics/abs_err': 502, 'global_step': 13},
-        sme.evaluate(dummy_input_fn, steps=1))
-    self.assertEqual(60, sme.get_variable_value('some_var'))
-
-    predictions = next(sme.predict(dummy_input_fn_features_only))
-    self.assertDictEqual({'output': 503}, predictions)
-
-    # Export SavedModel for all modes
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: dummy_supervised_receiver_fn(),
-        model_fn_lib.ModeKeys.EVAL: dummy_supervised_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: dummy_serving_receiver_fn()}
-    sme_export_dir = contrib_export.export_all_saved_models(
-        sme, self._get_tmp_dir(), input_receiver_fn_map)
-
-    sme2 = saved_model_estimator.SavedModelEstimator(
-        sme_export_dir, self._get_tmp_dir())
-    self.assertDictEqual(
-        {'loss': 106, 'metrics/abs_err': 502, 'global_step': 13},
-        sme.evaluate(dummy_input_fn, steps=1))
-    self.assertEqual(60, sme.get_variable_value('some_var'))
-
-    sme.train(dummy_input_fn, steps=7)
-    self.assertEqual(20, sme.get_variable_value('global_step'))
-
-    predictions = next(sme2.predict(dummy_input_fn_features_only))
-    self.assertDictEqual({'output': 503}, predictions)
-
-  def test_load_saved_model_from_serving_only(self):
-    def model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant([103]),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([502]),
-          export_outputs={'test': export_output.ClassificationOutput(
-              constant_op.constant([[32.]]))})
-
-    est = estimator.Estimator(model_fn, self._get_tmp_dir())
-    est.train(input_fn=dummy_input_fn, steps=10)
-
-    def serving_input_receiver_fn():
-      return export.ServingInputReceiver(
-          {'test-features': constant_op.constant([[1], [1]])},
-          array_ops.placeholder(dtype=dtypes.string))
-
-    export_dir = est.export_savedmodel(
-        self._get_tmp_dir(), serving_input_receiver_fn)
-
-    sme = saved_model_estimator.SavedModelEstimator(
-        export_dir, self._get_tmp_dir())
-
-    def input_fn():
-      return {'inputs': constant_op.constant('someinputstr')}
-
-    prediction = next(sme.predict(input_fn))
-    self.assertDictEqual({'scores': 32}, prediction)
-
-  def test_with_local_init_op(self):
-    def model_fn(features, labels, mode):
-      _, _ = features, labels
-      v = variables.Variable(21, name='some_var')
-      scaffold = monitored_session.Scaffold(
-          local_init_op=state_ops.assign_add(v, -3).op
-      )
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          scaffold=scaffold,
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          loss=array_ops.identity(v))
-    export_dir = self._export_estimator(predict=False, model_fn=model_fn)
-    sme = saved_model_estimator.SavedModelEstimator(
-        export_dir, self._get_tmp_dir())
-
-    eval_results1 = sme.evaluate(dummy_input_fn, steps=2)
-    self.assertEqual(15, eval_results1['loss'])
-
-    sme.train(dummy_input_fn, steps=1)
-    self.assertEqual(15, sme.get_variable_value('some_var'))
-
-    eval_results2 = sme.evaluate(dummy_input_fn, steps=5)
-    self.assertEqual(12, eval_results2['loss'])
-
-  def test_with_working_input_fn(self):
-    def model_fn(features, labels, mode):
-      loss = None
-      if labels is not None:
-        loss = labels[0][0] + labels[1][0]
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=loss,
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions={'features_0': array_ops.identity([features['x'][0][0]]),
-                       'features_1': array_ops.identity([features['x'][1][0]])})
-
-    sme = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(model_fn=model_fn), self._get_tmp_dir())
-    eval_results = sme.evaluate(dummy_input_fn, steps=1)
-    self.assertEqual(1, eval_results['loss'])
-
-    predictions = next(sme.predict(dummy_input_fn_features_only))
-    self.assertDictEqual({'features_0': 5, 'features_1': 6}, predictions)
-
-  def test_control_dependency(self):
-    # Control dependencies are saved with "^" appended to the start of the input
-    # name. The input map must include control dependencies as well.
-    def model_fn(features, labels, mode):
-      _ = labels
-      with ops.control_dependencies([features['x']]):
-        loss = features['x'][1][0]
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=loss,
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-    sme = saved_model_estimator.SavedModelEstimator(
-        self._export_estimator(train=False, predict=False, model_fn=model_fn),
-        self._get_tmp_dir())
-    sme.evaluate(dummy_input_fn, steps=1)  # Should run without error
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index f384d761a8430074f022c973d7ec3d46cd90f70b..3eb396a29ccdc0478384f9fa122465731740a30d 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -26,7 +26,7 @@ from tensorflow.contrib.factorization.python.ops import clustering_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.export import export_output
-from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
index 1ab5418fe4659cb0068ee8c3ca1442f6f723ee76..2f7cd131d3ed20df307ed231cce2ecb50ecfbceb 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@@ -27,7 +27,7 @@ from sklearn.cluster import KMeans as SklearnKMeans
 # pylint: disable=g-import-not-at-top
 from tensorflow.contrib.factorization.python.ops import kmeans as kmeans_lib
 from tensorflow.python.estimator import run_config
-from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index aab7d0c9e8874269bfa5f33193b0dc0ba4bbc9cd..1cd83bdb5de7c2f6dc91c980750b49aca1a7790b 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -14,6 +14,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":sequence_feature_column",
+        ":sequence_feature_column_v2",
         "//tensorflow/python:util",
     ],
 )
@@ -27,11 +28,12 @@ py_library(
         "//tensorflow/python:check_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
     ],
 )
 
@@ -46,9 +48,71 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/feature_column:feature_column_py",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "sequence_feature_column_integration_test",
+    srcs = ["python/feature_column/sequence_feature_column_integration_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":sequence_feature_column",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/feature_column:feature_column_py",
+        "//tensorflow/python/keras:layers",
+    ],
+)
+
+py_library(
+    name = "sequence_feature_column_v2",
+    srcs = ["python/feature_column/sequence_feature_column_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
+    ],
+)
+
+py_test(
+    name = "sequence_feature_column_v2_test",
+    srcs = ["python/feature_column/sequence_feature_column_v2_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":sequence_feature_column",
+        ":sequence_feature_column_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index 05bcdac2caa77062f9a8a44a948d2897b439ea1f..9b3a5c58aaa9498257fc971ac60b97f31d5185d8 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -33,7 +33,6 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope
 
 # pylint: disable=protected-access
-# TODO(b/73827486): Support SequenceExample.
 
 
 def sequence_input_layer(
@@ -110,6 +109,7 @@ def sequence_input_layer(
     output_tensors = []
     sequence_lengths = []
     ordered_columns = []
+
     for column in sorted(feature_columns, key=lambda x: x.name):
       ordered_columns.append(column)
       with variable_scope.variable_scope(
@@ -121,17 +121,67 @@ def sequence_input_layer(
         # Flattens the final dimension to produce a 3D Tensor.
         num_elements = column._variable_shape.num_elements()
         shape = array_ops.shape(dense_tensor)
+        target_shape = [shape[0], shape[1], num_elements]
         output_tensors.append(
-            array_ops.reshape(
-                dense_tensor,
-                shape=array_ops.concat([shape[:2], [num_elements]], axis=0)))
+            array_ops.reshape(dense_tensor, shape=target_shape))
         sequence_lengths.append(sequence_length)
+
     fc._verify_static_batch_size_equality(output_tensors, ordered_columns)
     fc._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
     sequence_length = _assert_all_equal_and_return(sequence_lengths)
+
     return array_ops.concat(output_tensors, -1), sequence_length
 
 
+def concatenate_context_input(context_input, sequence_input):
+  """Replicates `context_input` across all timesteps of `sequence_input`.
+
+  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
+  This value is appended to `sequence_input` on dimension 2 and the result is
+  returned.
+
+  Args:
+    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
+    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
+      padded_length, d0]`.
+
+  Returns:
+    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
+    d0 + d1]`.
+
+  Raises:
+    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
+      not have rank 2.
+  """
+  seq_rank_check = check_ops.assert_rank(
+      sequence_input,
+      3,
+      message='sequence_input must have rank 3',
+      data=[array_ops.shape(sequence_input)])
+  seq_type_check = check_ops.assert_type(
+      sequence_input,
+      dtypes.float32,
+      message='sequence_input must have dtype float32; got {}.'.format(
+          sequence_input.dtype))
+  ctx_rank_check = check_ops.assert_rank(
+      context_input,
+      2,
+      message='context_input must have rank 2',
+      data=[array_ops.shape(context_input)])
+  ctx_type_check = check_ops.assert_type(
+      context_input,
+      dtypes.float32,
+      message='context_input must have dtype float32; got {}.'.format(
+          context_input.dtype))
+  with ops.control_dependencies(
+      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
+    padded_length = array_ops.shape(sequence_input)[1]
+    tiled_context_input = array_ops.tile(
+        array_ops.expand_dims(context_input, 1),
+        array_ops.concat([[1], [padded_length], [1]], 0))
+  return array_ops.concat([sequence_input, tiled_context_input], 2)
+
+
 def sequence_categorical_column_with_identity(
     key, num_buckets, default_value=None):
   """Returns a feature column that represents sequences of integers.
@@ -172,10 +222,8 @@ def sequence_categorical_column_with_identity(
     ValueError: if `default_value` is not in range `[0, num_buckets)`.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_identity(
-          key=key,
-          num_buckets=num_buckets,
-          default_value=default_value))
+      fc._categorical_column_with_identity(
+          key=key, num_buckets=num_buckets, default_value=default_value))
 
 
 def sequence_categorical_column_with_hash_bucket(
@@ -215,10 +263,8 @@ def sequence_categorical_column_with_hash_bucket(
     ValueError: `dtype` is neither string nor integer.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_hash_bucket(
-          key=key,
-          hash_bucket_size=hash_bucket_size,
-          dtype=dtype))
+      fc._categorical_column_with_hash_bucket(
+          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
 
 
 def sequence_categorical_column_with_vocabulary_file(
@@ -274,7 +320,7 @@ def sequence_categorical_column_with_vocabulary_file(
     ValueError: `dtype` is neither string nor integer.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key=key,
           vocabulary_file=vocabulary_file,
           vocabulary_size=vocabulary_size,
@@ -334,7 +380,7 @@ def sequence_categorical_column_with_vocabulary_list(
     ValueError: if `dtype` is not integer or string.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key=key,
           vocabulary_list=vocabulary_list,
           dtype=dtype,
@@ -453,9 +499,17 @@ class _SequenceNumericColumn(
         [array_ops.shape(dense_tensor)[:1], [-1], self._variable_shape],
         axis=0)
     dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape)
-    sequence_length = fc._sequence_length_from_sparse_tensor(
-        sp_tensor, num_elements=self._variable_shape.num_elements())
+
+    # Get the number of timesteps per example
+    # For the 2D case, the raw values are grouped according to num_elements;
+    # for the 3D case, the grouping happens in the third dimension, and
+    # sequence length is not affected.
+    num_elements = (self._variable_shape.num_elements()
+                    if sp_tensor.shape.ndims == 2 else 1)
+    seq_length = fc._sequence_length_from_sparse_tensor(
+        sp_tensor, num_elements=num_elements)
+
     return fc._SequenceDenseColumn.TensorSequenceLengthPair(
-        dense_tensor=dense_tensor, sequence_length=sequence_length)
+        dense_tensor=dense_tensor, sequence_length=seq_length)
 
 # pylint: enable=protected-access
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcc25b8de895a769f9e11b207c2092e23d029b1f
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
@@ -0,0 +1,281 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration test for sequence feature columns with SequenceExamples."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import string
+import tempfile
+
+from google.protobuf import text_format
+
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.keras.layers import recurrent
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class SequenceFeatureColumnIntegrationTest(test.TestCase):
+
+  def _make_sequence_example(self):
+    example = example_pb2.SequenceExample()
+    example.context.feature['int_ctx'].int64_list.value.extend([5])
+    example.context.feature['float_ctx'].float_list.value.extend([123.6])
+    for val in range(0, 10, 2):
+      feat = feature_pb2.Feature()
+      feat.int64_list.value.extend([val] * val)
+      example.feature_lists.feature_list['int_list'].feature.extend([feat])
+    for val in range(1, 11, 2):
+      feat = feature_pb2.Feature()
+      feat.bytes_list.value.extend([compat.as_bytes(str(val))] * val)
+      example.feature_lists.feature_list['str_list'].feature.extend([feat])
+
+    return example
+
+  def _build_feature_columns(self):
+    col = fc._categorical_column_with_identity('int_ctx', num_buckets=100)
+    ctx_cols = [
+        fc._embedding_column(col, dimension=10),
+        fc._numeric_column('float_ctx')
+    ]
+
+    identity_col = sfc.sequence_categorical_column_with_identity(
+        'int_list', num_buckets=10)
+    bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
+        'bytes_list', hash_bucket_size=100)
+    seq_cols = [
+        fc._embedding_column(identity_col, dimension=10),
+        fc._embedding_column(bucket_col, dimension=20)
+    ]
+
+    return ctx_cols, seq_cols
+
+  def test_sequence_example_into_input_layer(self):
+    examples = [_make_sequence_example().SerializeToString()] * 100
+    ctx_cols, seq_cols = self._build_feature_columns()
+
+    def _parse_example(example):
+      ctx, seq = parsing_ops.parse_single_sequence_example(
+          example,
+          context_features=fc.make_parse_example_spec(ctx_cols),
+          sequence_features=fc.make_parse_example_spec(seq_cols))
+      ctx.update(seq)
+      return ctx
+
+    ds = dataset_ops.Dataset.from_tensor_slices(examples)
+    ds = ds.map(_parse_example)
+    ds = ds.batch(20)
+
+    # Test on a single batch
+    features = ds.make_one_shot_iterator().get_next()
+
+    # Tile the context features across the sequence features
+    seq_layer, _ = sfc.sequence_input_layer(features, seq_cols)
+    ctx_layer = fc.input_layer(features, ctx_cols)
+    input_layer = sfc.concatenate_context_input(ctx_layer, seq_layer)
+
+    rnn_layer = recurrent.RNN(recurrent.SimpleRNNCell(10))
+    output = rnn_layer(input_layer)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      features_r = sess.run(features)
+      self.assertAllEqual(features_r['int_list'].dense_shape, [20, 3, 6])
+
+      output_r = sess.run(output)
+      self.assertAllEqual(output_r.shape, [20, 10])
+
+
+class SequenceExampleParsingTest(test.TestCase):
+
+  def test_seq_ex_in_sequence_categorical_column_with_identity(self):
+    self._test_parsed_sequence_example(
+        'int_list', sfc.sequence_categorical_column_with_identity,
+        10, [3, 6], [2, 4, 6])
+
+  def test_seq_ex_in_sequence_categorical_column_with_hash_bucket(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_hash_bucket,
+        10, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_list(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_list,
+        list(string.ascii_lowercase), [3, 4],
+        [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_file(self):
+    _, fname = tempfile.mkstemp()
+    with open(fname, 'w') as f:
+      f.write(string.ascii_lowercase)
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_file,
+        fname, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def _test_parsed_sequence_example(
+      self, col_name, col_fn, col_arg, shape, values):
+    """Helper function to check that each FeatureColumn parses correctly.
+
+    Args:
+      col_name: string, name to give to the feature column. Should match
+        the name that the column will parse out of the features dict.
+      col_fn: function used to create the feature column. For example,
+        sequence_numeric_column.
+      col_arg: second arg that the target feature column is expecting.
+      shape: the expected dense_shape of the feature after parsing into
+        a SparseTensor.
+      values: the expected values at index [0, 2, 6] of the feature
+        after parsing into a SparseTensor.
+    """
+    example = _make_sequence_example()
+    columns = [
+        fc._categorical_column_with_identity('int_ctx', num_buckets=100),
+        fc._numeric_column('float_ctx'),
+        col_fn(col_name, col_arg)
+    ]
+    context, seq_features = parsing_ops.parse_single_sequence_example(
+        example.SerializeToString(),
+        context_features=fc.make_parse_example_spec(columns[:2]),
+        sequence_features=fc.make_parse_example_spec(columns[2:]))
+
+    with self.cached_session() as sess:
+      ctx_result, seq_result = sess.run([context, seq_features])
+      self.assertEqual(list(seq_result[col_name].dense_shape), shape)
+      self.assertEqual(
+          list(seq_result[col_name].values[[0, 2, 6]]), values)
+      self.assertEqual(list(ctx_result['int_ctx'].dense_shape), [1])
+      self.assertEqual(ctx_result['int_ctx'].values[0], 5)
+      self.assertEqual(list(ctx_result['float_ctx'].shape), [1])
+      self.assertAlmostEqual(ctx_result['float_ctx'][0], 123.6, places=1)
+
+
+_SEQ_EX_PROTO = """
+context {
+  feature {
+    key: "float_ctx"
+    value {
+      float_list {
+        value: 123.6
+      }
+    }
+  }
+  feature {
+    key: "int_ctx"
+    value {
+      int64_list {
+        value: 5
+      }
+    }
+  }
+}
+feature_lists {
+  feature_list {
+    key: "bytes_list"
+    value {
+      feature {
+        bytes_list {
+          value: "a"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "b"
+          value: "c"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "d"
+          value: "e"
+          value: "f"
+          value: "g"
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "float_list"
+    value {
+      feature {
+        float_list {
+          value: 1.0
+        }
+      }
+      feature {
+        float_list {
+          value: 3.0
+          value: 3.0
+          value: 3.0
+        }
+      }
+      feature {
+        float_list {
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "int_list"
+    value {
+      feature {
+        int64_list {
+          value: 2
+          value: 2
+        }
+      }
+      feature {
+        int64_list {
+          value: 4
+          value: 4
+          value: 4
+          value: 4
+        }
+      }
+      feature {
+        int64_list {
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+        }
+      }
+    }
+  }
+}
+"""
+
+
+def _make_sequence_example():
+  example = example_pb2.SequenceExample()
+  return text_format.Parse(_SEQ_EX_PROTO, example)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
index 45d7b740462ca21139e2e93e34b43668f1e08a94..d5f74028298ee7015f5b2e3aaee7d9330c1acac1 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -19,37 +19,75 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc
 from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc_lib
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
 
 
-class SequenceInputLayerTest(test.TestCase):
+class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args_a': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'sparse_input_args_b': {
+           # example 0, ids [1]
+           # example 1, ids [2, 0]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (1, 2, 0),
+           'dense_shape': (2, 2)},
+       'expected_input_layer': [
+           # example 0, ids_a [2], ids_b [1]
+           [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
+           # example 1, ids_a [0, 1], ids_b [2, 0]
+           [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],],
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'sparse_input_args_a': {
+           # feature 0, ids [[2], [0, 1]]
+           # feature 1, ids [[0, 0], [1]]
+           'indices': (
+               (0, 0, 0), (0, 1, 0), (0, 1, 1),
+               (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 0, 0, 1),
+           'dense_shape': (2, 2, 2)},
+       'sparse_input_args_b': {
+           # feature 0, ids [[1, 1], [1]]
+           # feature 1, ids [[2], [0]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (1, 1, 1, 2, 0),
+           'dense_shape': (2, 2, 2)},
+       'expected_input_layer': [
+           # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -]
+           [[5., 6., 14., 15., 16.], [2., 3., 14., 15., 16.]],
+           # feature 1, [a: 0, 0, b: 2, -], [a: 1, -, b: 0, -]
+           [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_embedding_column(
+      self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
+      expected_sequence_length):
 
-  def test_embedding_column(self):
+    sparse_input_a = sparse_tensor.SparseTensorValue(**sparse_input_args_a)
+    sparse_input_b = sparse_tensor.SparseTensorValue(**sparse_input_args_b)
     vocabulary_size = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [1]
-        # example 1, ids [2, 0]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-
     embedding_dimension_a = 2
     embedding_values_a = (
         (1., 2.),  # id 0
@@ -70,23 +108,17 @@ class SequenceInputLayerTest(test.TestCase):
         return embedding_values
       return _initializer
 
-    expected_input_layer = [
-        # example 0, ids_a [2], ids_b [1]
-        [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
-        # example 1, ids_a [0, 1], ids_b [2, 0]
-        [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],
-    ]
-    expected_sequence_length = [1, 2]
-
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc.embedding_column(
-        categorical_column_a, dimension=embedding_dimension_a,
+    embedding_column_a = fc._embedding_column(
+        categorical_column_a,
+        dimension=embedding_dimension_a,
         initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_b = fc.embedding_column(
-        categorical_column_b, dimension=embedding_dimension_b,
+    embedding_column_b = fc._embedding_column(
+        categorical_column_b,
+        dimension=embedding_dimension_b,
         initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
 
     input_layer, sequence_length = sfc.sequence_input_layer(
@@ -119,10 +151,9 @@ class SequenceInputLayerTest(test.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc.embedding_column(
-        categorical_column_a, dimension=2)
+    embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -177,7 +208,7 @@ class SequenceInputLayerTest(test.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
     # Test that columns are reordered alphabetically.
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension,
         initializer=_get_initializer(embedding_dimension, embedding_values))
@@ -215,11 +246,11 @@ class SequenceInputLayerTest(test.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     with self.assertRaisesRegexp(
@@ -233,36 +264,63 @@ class SequenceInputLayerTest(test.TestCase):
           },
           feature_columns=shared_embedding_columns)
 
-  def test_indicator_column(self):
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args_a': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'sparse_input_args_b': {
+           # example 0, ids [1]
+           # example 1, ids [1, 0]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (1, 1, 0),
+           'dense_shape': (2, 2)},
+       'expected_input_layer': [
+           # example 0, ids_a [2], ids_b [1]
+           [[0., 0., 1., 0., 1.], [0., 0., 0., 0., 0.]],
+           # example 1, ids_a [0, 1], ids_b [1, 0]
+           [[1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'sparse_input_args_a': {
+           # feature 0, ids [[2], [0, 1]]
+           # feature 1, ids [[0, 0], [1]]
+           'indices': (
+               (0, 0, 0), (0, 1, 0), (0, 1, 1),
+               (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 0, 0, 1),
+           'dense_shape': (2, 2, 2)},
+       'sparse_input_args_b': {
+           # feature 0, ids [[1, 1], [1]]
+           # feature 1, ids [[1], [0]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (1, 1, 1, 1, 0),
+           'dense_shape': (2, 2, 2)},
+       'expected_input_layer': [
+           # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -]
+           [[0., 0., 1., 0., 2.], [1., 1., 0., 0., 1.]],
+           # feature 1, [a: 0, 0, b: 1, -], [a: 1, -, b: 0, -]
+           [[2., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_indicator_column(
+      self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
+      expected_sequence_length):
+    sparse_input_a = sparse_tensor.SparseTensorValue(**sparse_input_args_a)
+    sparse_input_b = sparse_tensor.SparseTensorValue(**sparse_input_args_b)
+
     vocabulary_size_a = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
     vocabulary_size_b = 2
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [1]
-        # example 1, ids [1, 0]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 1, 0),
-        dense_shape=(2, 2))
-
-    expected_input_layer = [
-        # example 0, ids_a [2], ids_b [1]
-        [[0., 0., 1., 0., 1.], [0., 0., 0., 0., 0.]],
-        # example 1, ids_a [0, 1], ids_b [1, 0]
-        [[1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]],
-    ]
-    expected_sequence_length = [1, 2]
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size_a)
-    indicator_column_a = fc.indicator_column(categorical_column_a)
+    indicator_column_a = fc._indicator_column(categorical_column_a)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size_b)
-    indicator_column_b = fc.indicator_column(categorical_column_b)
+    indicator_column_b = fc._indicator_column(categorical_column_b)
     input_layer, sequence_length = sfc.sequence_input_layer(
         features={
             'aaa': sparse_input_a,
@@ -286,9 +344,9 @@ class SequenceInputLayerTest(test.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc.indicator_column(categorical_column_a)
+    indicator_column_a = fc._indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -298,18 +356,34 @@ class SequenceInputLayerTest(test.TestCase):
           features={'aaa': sparse_input},
           feature_columns=[indicator_column_a])
 
-  def test_numeric_column(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-    expected_input_layer = [
-        [[0.], [1.]],
-        [[10.], [0.]],
-    ]
-    expected_sequence_length = [2, 1]
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [0., 1]
+           # example 1, [10.]
+           'indices': ((0, 0), (0, 1), (1, 0)),
+           'values': (0., 1., 10.),
+           'dense_shape': (2, 2)},
+       'expected_input_layer': [
+           [[0.], [1.]],
+           [[10.], [0.]]],
+       'expected_sequence_length': [2, 1]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # feature 0, ids [[20, 3], [5]]
+           # feature 1, ids [[3], [8]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (20, 3, 5., 3., 8.),
+           'dense_shape': (2, 2, 2)},
+       'expected_input_layer': [
+           [[20.], [3.], [5.], [0.]],
+           [[3.], [0.], [8.], [0.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_numeric_column(
+      self, sparse_input_args, expected_input_layer, expected_sequence_length):
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+
     numeric_column = sfc.sequence_numeric_column('aaa')
 
     input_layer, sequence_length = sfc.sequence_input_layer(
@@ -321,21 +395,40 @@ class SequenceInputLayerTest(test.TestCase):
       self.assertAllEqual(
           expected_sequence_length, sequence_length.eval(session=sess))
 
-  def test_numeric_column_multi_dim(self):
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [0., 1.,  2., 3., 4., 5., 6., 7.]
+           # example 1, [10., 11., 12., 13.]
+           'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                       (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 8)},
+       'expected_input_layer': [
+           # The output of numeric_column._get_dense_tensor should be flattened.
+           [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+           [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
+       'expected_sequence_length': [2, 1]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
+           # example 1, [[10., 11., 12., 13.], []]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
+                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3),
+                       (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 2, 4)},
+       'expected_input_layer': [
+           # The output of numeric_column._get_dense_tensor should be flattened.
+           [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+           [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
+       'expected_sequence_length': [2, 1]},
+      )
+  def test_numeric_column_multi_dim(
+      self, sparse_input_args, expected_input_layer, expected_sequence_length):
     """Tests sequence_input_layer for multi-dimensional numeric_column."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
-        # example 1, [[[10., 11.],  [12., 13.]]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
-                 (1, 0), (1, 1), (1, 2), (1, 3)),
-        values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
-        dense_shape=(2, 8))
-    # The output of numeric_column._get_dense_tensor should be flattened.
-    expected_input_layer = [
-        [[0., 1., 2., 3.], [4., 5., 6., 7.]],
-        [[10., 11., 12., 13.], [0., 0., 0., 0.]],
-    ]
-    expected_sequence_length = [2, 1]
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+
     numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
     input_layer, sequence_length = sfc.sequence_input_layer(
@@ -377,6 +470,138 @@ class SequenceInputLayerTest(test.TestCase):
           r'\[y \(sequence_input_layer/bbb/sequence_length:0\) = \] \[1 1\]'):
         sess.run(sequence_length)
 
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+           # example 1, [[[10., 11.],  [12., 13.]]]
+           'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                       (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 8)},
+       'expected_shape': [2, 2, 4]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
+           # example 1, [[10., 11., 12., 13.], []]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
+                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 2),
+                       (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 2, 4)},
+       'expected_shape': [2, 2, 4]},
+      )
+  def test_static_shape_from_tensors_numeric(
+      self, sparse_input_args, expected_shape):
+    """Tests that we return a known static shape when we have one."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
+
+    input_layer, _ = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+    shape = input_layer.get_shape()
+    self.assertEqual(shape, expected_shape)
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           'indices': ((0, 0), (1, 0), (1, 1), (3, 0)),
+           'values': (2, 0, 1, 1),
+           'dense_shape': (4, 2)},
+       'expected_shape': [4, 2, 3]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [0, 2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                       (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           'values': (2, 0, 1, 2, 1, 0, 2),
+           'dense_shape': (4, 2, 2)},
+       'expected_shape': [4, 2, 3]}
+      )
+  def test_static_shape_from_tensors_indicator(
+      self, sparse_input_args, expected_shape):
+    """Tests that we return a known static shape when we have one."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    indicator_column = fc._indicator_column(categorical_column)
+
+    input_layer, _ = sfc.sequence_input_layer(
+        features={'aaa': sparse_input}, feature_columns=[indicator_column])
+    shape = input_layer.get_shape()
+    self.assertEqual(shape, expected_shape)
+
+
+class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
+  """Tests the utility fn concatenate_context_input."""
+
+  def test_concatenate_context_input(self):
+    seq_input = ops.convert_to_tensor(np.arange(12).reshape(2, 3, 2))
+    context_input = ops.convert_to_tensor(np.arange(10).reshape(2, 5))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    input_layer = sfc.concatenate_context_input(context_input, seq_input)
+
+    expected = np.array([
+        [[0, 1, 0, 1, 2, 3, 4], [2, 3, 0, 1, 2, 3, 4], [4, 5, 0, 1, 2, 3, 4]],
+        [[6, 7, 5, 6, 7, 8, 9], [8, 9, 5, 6, 7, 8, 9], [10, 11, 5, 6, 7, 8, 9]]
+    ], dtype=np.float32)
+    with monitored_session.MonitoredSession() as sess:
+      output = sess.run(input_layer)
+      self.assertAllEqual(expected, output)
+
+  @parameterized.named_parameters(
+      {'testcase_name': 'rank_lt_3',
+       'seq_input_arg': np.arange(100).reshape(10, 10)},
+      {'testcase_name': 'rank_gt_3',
+       'seq_input_arg': np.arange(100).reshape(5, 5, 2, 2)}
+      )
+  def test_sequence_input_throws_error(self, seq_input_arg):
+    seq_input = ops.convert_to_tensor(seq_input_arg)
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'sequence_input must have rank 3'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  @parameterized.named_parameters(
+      {'testcase_name': 'rank_lt_2',
+       'context_input_arg': np.arange(100)},
+      {'testcase_name': 'rank_gt_2',
+       'context_input_arg': np.arange(100).reshape(5, 5, 4)}
+      )
+  def test_context_input_throws_error(self, context_input_arg):
+    context_input = ops.convert_to_tensor(context_input_arg)
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'context_input must have rank 2'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  def test_integer_seq_input_throws_error(self):
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(
+        TypeError, 'sequence_input must have dtype float32'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  def test_integer_context_input_throws_error(self):
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(
+        TypeError, 'context_input must have dtype float32'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
 
 class InputLayerTest(test.TestCase):
   """Tests input_layer with sequence feature columns."""
@@ -393,8 +618,7 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc.embedding_column(
-        categorical_column_a, dimension=2)
+    embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -416,7 +640,7 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc.indicator_column(categorical_column_a)
+    indicator_column_a = fc._indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -443,75 +667,83 @@ def _assert_sparse_tensor_indices_shape(test_case, expected, actual):
   test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
 
 
-class SequenceCategoricalColumnWithIdentityTest(test.TestCase):
-
-  def test_get_sparse_tensors(self):
-    column = sfc.sequence_categorical_column_with_identity(
-        'aaa', num_buckets=3)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-    expected_sparse_ids = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=np.array((1, 2, 0), dtype=np.int64),
-        dense_shape=(2, 2, 1))
+class SequenceCategoricalColumnWithIdentityTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (1, 2, 0),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           'values': np.array((1, 2, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': (6, 7, 8),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': (6, 7, 8),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
+    column = sfc.sequence_categorical_column_with_identity('aaa', num_buckets=9)
 
     id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
       _assert_sparse_tensor_value(
-          self,
-          expected_sparse_ids,
-          id_weight_pair.id_tensor.eval(session=sess))
-
-  def test_get_sparse_tensors_inputs3d(self):
-    """Tests _get_sparse_tensors when the input is already 3D Tensor."""
-    column = sfc.sequence_categorical_column_with_identity(
-        'aaa', num_buckets=3)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2, 1))
-
-    with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        r'Column aaa expected ID tensor of rank 2\.\s*'
-        r'id_tensor shape:\s*\[2 2 1\]'):
-      id_weight_pair = column._get_sparse_tensors(
-          _LazyBuilder({'aaa': inputs}))
-      with monitored_session.MonitoredSession() as sess:
-        id_weight_pair.id_tensor.eval(session=sess)
-
-
-class SequenceCategoricalColumnWithHashBucketTest(test.TestCase):
-
-  def test_get_sparse_tensors(self):
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceCategoricalColumnWithHashBucketTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': ('omar', 'stringer', 'marlo'),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           # Ignored to avoid hash dependence in test.
+           'values': np.array((0, 0, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': ('omar', 'stringer', 'marlo'),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           # Ignored to avoid hash dependence in test.
+           'values': np.array((0, 0, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
     column = sfc.sequence_categorical_column_with_hash_bucket(
         'aaa', hash_bucket_size=10)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=('omar', 'stringer', 'marlo'),
-        dense_shape=(2, 2))
-
-    expected_sparse_ids = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        # Ignored to avoid hash dependence in test.
-        values=np.array((0, 0, 0), dtype=np.int64),
-        dense_shape=(2, 2, 1))
 
     id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
       _assert_sparse_tensor_indices_shape(
-          self,
-          expected_sparse_ids,
-          id_weight_pair.id_tensor.eval(session=sess))
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
 
 
-class SequenceCategoricalColumnWithVocabularyFileTest(test.TestCase):
+class SequenceCategoricalColumnWithVocabularyFileTest(
+    test.TestCase, parameterized.TestCase):
 
   def _write_vocab(self, vocab_strings, file_name):
     vocab_file = os.path.join(self.get_temp_dir(), file_name)
@@ -527,68 +759,152 @@ class SequenceCategoricalColumnWithVocabularyFileTest(test.TestCase):
                                                         'wire_vocabulary.txt')
     self._wire_vocabulary_size = 3
 
-  def test_get_sparse_tensors(self):
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': ('marlo', 'skywalker', 'omar'),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           'values': np.array((2, -1, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': ('omar', 'skywalker', 'marlo'),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': np.array((0, -1, 2), dtype=np.int64),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
     column = sfc.sequence_categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=('marlo', 'skywalker', 'omar'),
-        dense_shape=(2, 2))
-    expected_sparse_ids = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=np.array((2, -1, 0), dtype=np.int64),
-        dense_shape=(2, 2, 1))
 
     id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
       _assert_sparse_tensor_value(
-          self,
-          expected_sparse_ids,
-          id_weight_pair.id_tensor.eval(session=sess))
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
 
+  def test_get_sparse_tensors_dynamic_zero_length(self):
+    """Tests _get_sparse_tensors with a dynamic sequence length."""
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=np.zeros((0, 2)), values=[], dense_shape=(2, 0))
+    expected = sparse_tensor.SparseTensorValue(
+        indices=np.zeros((0, 3)),
+        values=np.array((), dtype=np.int64),
+        dense_shape=(2, 0, 1))
+    column = sfc.sequence_categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    input_placeholder_shape = list(inputs.dense_shape)
+    # Make second dimension (sequence length) dynamic.
+    input_placeholder_shape[1] = None
+    input_placeholder = array_ops.sparse_placeholder(
+        dtypes.string, shape=input_placeholder_shape)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({'aaa': input_placeholder}))
 
-class SequenceCategoricalColumnWithVocabularyListTest(test.TestCase):
-
-  def test_get_sparse_tensors(self):
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      result = id_weight_pair.id_tensor.eval(
+          session=sess, feed_dict={input_placeholder: inputs})
+      _assert_sparse_tensor_value(
+          self, expected, result)
+
+
+class SequenceCategoricalColumnWithVocabularyListTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': ('marlo', 'skywalker', 'omar'),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           'values': np.array((2, -1, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': ('omar', 'skywalker', 'marlo'),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': np.array((0, -1, 2), dtype=np.int64),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
     column = sfc.sequence_categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'))
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=('marlo', 'skywalker', 'omar'),
-        dense_shape=(2, 2))
-    expected_sparse_ids = sparse_tensor.SparseTensorValue(
-        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
-        values=np.array((2, -1, 0), dtype=np.int64),
-        dense_shape=(2, 2, 1))
 
     id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
     self.assertIsNone(id_weight_pair.weight_tensor)
     with monitored_session.MonitoredSession() as sess:
       _assert_sparse_tensor_value(
-          self,
-          expected_sparse_ids,
-          id_weight_pair.id_tensor.eval(session=sess))
-
-
-class SequenceEmbeddingColumnTest(test.TestCase):
-
-  def test_get_sequence_dense_tensor(self):
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceEmbeddingColumnTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           'indices': ((0, 0), (1, 0), (1, 1), (3, 0)),
+           'values': (2, 0, 1, 1),
+           'dense_shape': (4, 2)},
+       'expected': [
+           # example 0, ids [2]
+           [[7., 11.], [0., 0.]],
+           # example 1, ids [0, 1]
+           [[1., 2.], [3., 5.]],
+           # example 2, ids []
+           [[0., 0.], [0., 0.]],
+           # example 3, ids [1]
+           [[3., 5.], [0., 0.]]]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [0, 2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                       (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           'values': (2, 0, 1, 2, 1, 0, 2),
+           'dense_shape': (4, 2, 2)},
+       'expected': [
+           # example 0, ids [[2]]
+           [[7., 11.], [0., 0.]],
+           # example 1, ids [[0, 1], [2]]
+           [[2, 3.5], [7., 11.]],
+           # example 2, ids []
+           [[0., 0.], [0., 0.]],
+           # example 3, ids [[1], [0, 2]]
+           [[3., 5.], [4., 6.5]]]}
+      )
+  def test_get_sequence_dense_tensor(self, inputs_args, expected):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
     vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 2))
-
     embedding_dimension = 2
     embedding_values = (
         (1., 2.),  # id 0
@@ -601,50 +917,51 @@ class SequenceEmbeddingColumnTest(test.TestCase):
       self.assertIsNone(partition_info)
       return embedding_values
 
-    expected_lookups = [
-        # example 0, ids [2]
-        [[7., 11.], [0., 0.]],
-        # example 1, ids [0, 1]
-        [[1., 2.], [3., 5.]],
-        # example 2, ids []
-        [[0., 0.], [0., 0.]],
-        # example 3, ids [1]
-        [[3., 5.], [0., 0.]],
-    ]
-
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(
         ('embedding_weights:0',), tuple([v.name for v in global_vars]))
     with monitored_session.MonitoredSession() as sess:
       self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval(session=sess))
-
-  def test_sequence_length(self):
+      self.assertAllEqual(expected, embedding_lookup.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 2),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2]}
+      )
+  def test_sequence_length(self, inputs_args, expected_sequence_length):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
     vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length = [1, 2]
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=2)
+    embedding_column = fc._embedding_column(categorical_column, dimension=2)
 
     _, sequence_length = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
       sequence_length = sess.run(sequence_length)
@@ -668,8 +985,7 @@ class SequenceEmbeddingColumnTest(test.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=2)
+    embedding_column = fc._embedding_column(categorical_column, dimension=2)
 
     _, sequence_length = embedding_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': sparse_input}))
@@ -739,7 +1055,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -785,7 +1101,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     expected_sequence_length_b = [2, 1]
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
@@ -836,7 +1152,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
 
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
@@ -855,56 +1171,89 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
           expected_sequence_length_b, sequence_length_b.eval(session=sess))
 
 
-class SequenceIndicatorColumnTest(test.TestCase):
-
-  def test_get_sequence_dense_tensor(self):
+class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           'indices': ((0, 0), (1, 0), (1, 1), (3, 0)),
+           'values': (2, 0, 1, 1),
+           'dense_shape': (4, 2)},
+       'expected': [
+           # example 0, ids [2]
+           [[0., 0., 1.], [0., 0., 0.]],
+           # example 1, ids [0, 1]
+           [[1., 0., 0.], [0., 1., 0.]],
+           # example 2, ids []
+           [[0., 0., 0.], [0., 0., 0.]],
+           # example 3, ids [1]
+           [[0., 1., 0.], [0., 0., 0.]]]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [2, 2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                       (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           'values': (2, 0, 1, 2, 1, 2, 2),
+           'dense_shape': (4, 2, 2)},
+       'expected': [
+           # example 0, ids [[2]]
+           [[0., 0., 1.], [0., 0., 0.]],
+           # example 1, ids [[0, 1], [2]]
+           [[1., 1., 0.], [0., 0., 1.]],
+           # example 2, ids []
+           [[0., 0., 0.], [0., 0., 0.]],
+           # example 3, ids [[1], [2, 2]]
+           [[0., 1., 0.], [0., 0., 2.]]]}
+      )
+  def test_get_sequence_dense_tensor(self, inputs_args, expected):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
     vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        # example 2, ids []
-        # example 3, ids [1]
-        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(4, 2))
-
-    expected_lookups = [
-        # example 0, ids [2]
-        [[0., 0., 1.], [0., 0., 0.]],
-        # example 1, ids [0, 1]
-        [[1., 0., 0.], [0., 1., 0.]],
-        # example 2, ids []
-        [[0., 0., 0.], [0., 0., 0.]],
-        # example 3, ids [1]
-        [[0., 1., 0.], [0., 0., 0.]],
-    ]
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_lookups, indicator_tensor.eval(session=sess))
-
-  def test_sequence_length(self):
+      self.assertAllEqual(expected, indicator_tensor.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 2),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2]}
+      )
+  def test_sequence_length(self, inputs_args, expected_sequence_length):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
     vocabulary_size = 3
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length = [1, 2]
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     _, sequence_length = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
       sequence_length = sess.run(sequence_length)
@@ -928,7 +1277,7 @@ class SequenceIndicatorColumnTest(test.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     _, sequence_length = indicator_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': sparse_input}))
@@ -938,7 +1287,7 @@ class SequenceIndicatorColumnTest(test.TestCase):
           expected_sequence_length, sequence_length.eval(session=sess))
 
 
-class SequenceNumericColumnTest(test.TestCase):
+class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
 
   def test_defaults(self):
     a = sfc.sequence_numeric_column('aaa')
@@ -971,25 +1320,37 @@ class SequenceNumericColumnTest(test.TestCase):
     with self.assertRaisesRegexp(TypeError, 'must be a callable'):
       sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable')
 
-  def test_get_sequence_dense_tensor(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-    expected_dense_tensor = [
-        [[0.], [1.]],
-        [[10.], [0.]],
-    ]
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, values [0., 1]
+           # example 1, [10.]
+           'indices': ((0, 0), (0, 1), (1, 0)),
+           'values': (0., 1., 10.),
+           'dense_shape': (2, 2)},
+       'expected': [
+           [[0.], [1.]],
+           [[10.], [0.]]]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # feature 0, ids [[20, 3], [5]]
+           # feature 1, ids [[3], [8]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (20, 3, 5., 3., 8.),
+           'dense_shape': (2, 2, 2)},
+       'expected': [
+           [[20.], [3.], [5.], [0.]],
+           [[3.], [0.], [8.], [0.]]]},
+      )
+  def test_get_sequence_dense_tensor(self, inputs_args, expected):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
     numeric_column = sfc.sequence_numeric_column('aaa')
 
     dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
+      self.assertAllEqual(expected, dense_tensor.eval(session=sess))
 
   def test_get_sequence_dense_tensor_with_normalizer_fn(self):
 
@@ -1026,41 +1387,35 @@ class SequenceNumericColumnTest(test.TestCase):
       self.assertAllEqual(
           expected_dense_tensor, dense_tensor.eval(session=sess))
 
-  def test_get_sequence_dense_tensor_with_shape(self):
-    """Tests get_sequence_dense_tensor with shape !=(1,)."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
-        # example 1, [[10., 11., 12.]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
-                 (1, 0), (1, 1), (1, 2)),
-        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
-        dense_shape=(2, 6))
-    expected_dense_tensor = [
-        [[0., 1., 2.], [3., 4., 5.]],
-        [[10., 11., 12.], [0., 0., 0.]],
-    ]
-    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
-
-    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
-
-  def test_get_dense_tensor_multi_dim(self):
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+           # example 1, [[[10., 11.],  [12., 13.]]]
+           'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                       (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 8)},
+       'expected_dense_tensor': [
+           [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
+           [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]]]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           'indices': ((0, 0, 0), (0, 0, 2), (0, 0, 4), (0, 0, 6),
+                       (0, 1, 0), (0, 1, 2), (0, 1, 4), (0, 1, 6),
+                       (1, 0, 0), (1, 0, 2), (1, 0, 4), (1, 0, 6)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 2, 8)},
+       'expected_dense_tensor': [
+           [[[0., 0.], [1., 0.]], [[2., 0.], [3., 0.]],
+            [[4., 0.], [5., 0.]], [[6., 0.], [7., 0.]]],
+           [[[10., 0.], [11., 0.]], [[12., 0.], [13., 0.]],
+            [[0., 0.], [0., 0.]], [[0., 0.], [0., 0.]]]]},
+      )
+  def test_get_dense_tensor_multi_dim(
+      self, sparse_input_args, expected_dense_tensor):
     """Tests get_sequence_dense_tensor for multi-dim numeric_column."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
-        # example 1, [[[10., 11.],  [12., 13.]]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
-                 (1, 0), (1, 1), (1, 2), (1, 3)),
-        values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
-        dense_shape=(2, 8))
-    expected_dense_tensor = [
-        [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
-        [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]],
-    ]
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
     numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
     dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
@@ -1070,43 +1425,56 @@ class SequenceNumericColumnTest(test.TestCase):
       self.assertAllEqual(
           expected_dense_tensor, dense_tensor.eval(session=sess))
 
-  def test_sequence_length(self):
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
-        # example 1, [[10., 11., 12.]]
-        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
-                 (1, 0), (1, 1), (1, 2)),
-        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
-        dense_shape=(2, 6))
-    expected_sequence_length = [2, 1]
-    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2., 0., 1.),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 2],
+       'shape': (1,)},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2., 0., 1., 2.),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2],
+       'shape': (1,)},
+      {'testcase_name': '2D_with_shape',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2., 0., 1.),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 1],
+       'shape': (2,)},
+      {'testcase_name': '3D_with_shape',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2., 0., 1., 2.),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2],
+       'shape': (2,)},
+      )
+  def test_sequence_length(self, inputs_args, expected_sequence_length, shape):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=shape)
 
     _, sequence_length = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+        _LazyBuilder({'aaa': inputs}))
 
     with monitored_session.MonitoredSession() as sess:
       sequence_length = sess.run(sequence_length)
       self.assertAllEqual(expected_sequence_length, sequence_length)
       self.assertEqual(np.int64, sequence_length.dtype)
 
-  def test_sequence_length_with_shape(self):
-    """Tests _sequence_length with shape !=(1,)."""
-    sparse_input = sparse_tensor.SparseTensorValue(
-        # example 0, values [[0.], [1]]
-        # example 1, [[10.]]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0., 1., 10.),
-        dense_shape=(2, 2))
-    expected_sequence_length = [2, 1]
-    numeric_column = sfc.sequence_numeric_column('aaa')
-
-    _, sequence_length = numeric_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
     sparse_input = sparse_tensor.SparseTensorValue(
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d34ad161855476b6a4cd9a258521dbe122b4140
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
@@ -0,0 +1,558 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This API defines FeatureColumn for sequential input.
+
+NOTE: This API is a work in progress and will likely be changing frequently.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import collections
+
+
+from tensorflow.python.feature_column import feature_column as fc_old
+from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variable_scope
+
+# pylint: disable=protected-access
+
+
+def sequence_input_layer(
+    features,
+    feature_columns,
+    weight_collections=None,
+    trainable=True):
+  """"Builds input layer for sequence input.
+
+  All `feature_columns` must be sequence dense columns with the same
+  `sequence_length`. The output of this method can be fed into sequence
+  networks, such as RNN.
+
+  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
+  `T` is the maximum sequence length for this batch, which could differ from
+  batch to batch.
+
+  If multiple `feature_columns` are given with `Di` `num_elements` each, their
+  outputs are concatenated. So, the final `Tensor` has shape
+  `[batch_size, T, D0 + D1 + ... + Dn]`.
+
+  Example:
+
+  ```python
+  rating = sequence_numeric_column('rating')
+  watches = sequence_categorical_column_with_identity(
+      'watches', num_buckets=1000)
+  watches_embedding = embedding_column(watches, dimension=10)
+  columns = [rating, watches]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    features: A dict mapping keys to tensors.
+    feature_columns: An iterable of dense sequence columns. Valid columns are
+      - `embedding_column` that wraps a `sequence_categorical_column_with_*`
+      - `sequence_numeric_column`.
+    weight_collections: A list of collection names to which the Variable will be
+      added. Note that variables will also be added to collections
+      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
+    trainable: If `True` also add the variable to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES`.
+
+  Returns:
+    An `(input_layer, sequence_length)` tuple where:
+    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
+        `T` is the maximum sequence length for this batch, which could differ
+        from batch to batch. `D` is the sum of `num_elements` for all
+        `feature_columns`.
+    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
+        length for each example.
+
+  Raises:
+    ValueError: If any of the `feature_columns` is the wrong type.
+  """
+  feature_columns = fc_old._normalize_feature_columns(feature_columns)
+  for c in feature_columns:
+    if not isinstance(c, fc_old._SequenceDenseColumn):
+      raise ValueError(
+          'All feature_columns must be of type _SequenceDenseColumn. '
+          'You can wrap a sequence_categorical_column with an embedding_column '
+          'or indicator_column. '
+          'Given (type {}): {}'.format(type(c), c))
+
+  with variable_scope.variable_scope(
+      None, default_name='sequence_input_layer', values=features.values()):
+    builder = fc_old._LazyBuilder(features)
+    output_tensors = []
+    sequence_lengths = []
+    ordered_columns = []
+
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      ordered_columns.append(column)
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name):
+        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
+            builder,
+            weight_collections=weight_collections,
+            trainable=trainable)
+        # Flattens the final dimension to produce a 3D Tensor.
+        num_elements = column._variable_shape.num_elements()
+        shape = array_ops.shape(dense_tensor)
+        target_shape = [shape[0], shape[1], num_elements]
+        output_tensors.append(
+            array_ops.reshape(dense_tensor, shape=target_shape))
+        sequence_lengths.append(sequence_length)
+
+    fc_old._verify_static_batch_size_equality(output_tensors, ordered_columns)
+    fc_old._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
+    sequence_length = _assert_all_equal_and_return(sequence_lengths)
+
+    return array_ops.concat(output_tensors, -1), sequence_length
+
+
+def concatenate_context_input(context_input, sequence_input):
+  """Replicates `context_input` across all timesteps of `sequence_input`.
+
+  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
+  This value is appended to `sequence_input` on dimension 2 and the result is
+  returned.
+
+  Args:
+    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
+    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
+      padded_length, d0]`.
+
+  Returns:
+    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
+    d0 + d1]`.
+
+  Raises:
+    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
+      not have rank 2.
+  """
+  seq_rank_check = check_ops.assert_rank(
+      sequence_input,
+      3,
+      message='sequence_input must have rank 3',
+      data=[array_ops.shape(sequence_input)])
+  seq_type_check = check_ops.assert_type(
+      sequence_input,
+      dtypes.float32,
+      message='sequence_input must have dtype float32; got {}.'.format(
+          sequence_input.dtype))
+  ctx_rank_check = check_ops.assert_rank(
+      context_input,
+      2,
+      message='context_input must have rank 2',
+      data=[array_ops.shape(context_input)])
+  ctx_type_check = check_ops.assert_type(
+      context_input,
+      dtypes.float32,
+      message='context_input must have dtype float32; got {}.'.format(
+          context_input.dtype))
+  with ops.control_dependencies(
+      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
+    padded_length = array_ops.shape(sequence_input)[1]
+    tiled_context_input = array_ops.tile(
+        array_ops.expand_dims(context_input, 1),
+        array_ops.concat([[1], [padded_length], [1]], 0))
+  return array_ops.concat([sequence_input, tiled_context_input], 2)
+
+
+def sequence_categorical_column_with_identity(
+    key, num_buckets, default_value=None):
+  """Returns a feature column that represents sequences of integers.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  watches = sequence_categorical_column_with_identity(
+      'watches', num_buckets=1000)
+  watches_embedding = embedding_column(watches, dimension=10)
+  columns = [watches_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    num_buckets: Range of inputs. Namely, inputs are expected to be in the
+      range `[0, num_buckets)`.
+    default_value: If `None`, this column's graph operations will fail for
+      out-of-range inputs. Otherwise, this value must be in the range
+      `[0, num_buckets)`, and will replace out-of-range inputs.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: if `num_buckets` is less than one.
+    ValueError: if `default_value` is not in range `[0, num_buckets)`.
+  """
+  return fc_old._SequenceCategoricalColumn(
+      fc_old._categorical_column_with_identity(
+          key=key, num_buckets=num_buckets, default_value=default_value))
+
+
+def sequence_categorical_column_with_hash_bucket(
+    key, hash_bucket_size, dtype=dtypes.string):
+  """A sequence of categorical terms where ids are set by hashing.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  tokens = sequence_categorical_column_with_hash_bucket(
+      'tokens', hash_bucket_size=1000)
+  tokens_embedding = embedding_column(tokens, dimension=10)
+  columns = [tokens_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    hash_bucket_size: An int > 1. The number of buckets.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: `hash_bucket_size` is not greater than 1.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  return fc_old._SequenceCategoricalColumn(
+      fc_old._categorical_column_with_hash_bucket(
+          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
+
+
+def sequence_categorical_column_with_vocabulary_file(
+    key, vocabulary_file, vocabulary_size=None, num_oov_buckets=0,
+    default_value=None, dtype=dtypes.string):
+  """A sequence of categorical terms where ids use a vocabulary file.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  states = sequence_categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
+      num_oov_buckets=5)
+  states_embedding = embedding_column(states, dimension=10)
+  columns = [states_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    vocabulary_file: The vocabulary file name.
+    vocabulary_size: Number of the elements in the vocabulary. This must be no
+      greater than length of `vocabulary_file`, if less than length, later
+      values are ignored. If None, it is set to the length of `vocabulary_file`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
+      the input value. A positive `num_oov_buckets` can not be specified with
+      `default_value`.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    dtype: The type of features. Only string and integer types are supported.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: `vocabulary_file` is missing or cannot be opened.
+    ValueError: `vocabulary_size` is missing or < 1.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  return fc_old._SequenceCategoricalColumn(
+      fc_old._categorical_column_with_vocabulary_file(
+          key=key,
+          vocabulary_file=vocabulary_file,
+          vocabulary_size=vocabulary_size,
+          num_oov_buckets=num_oov_buckets,
+          default_value=default_value,
+          dtype=dtype))
+
+
+def sequence_categorical_column_with_vocabulary_list(
+    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
+  """A sequence of categorical terms where ids use an in-memory list.
+
+  Pass this to `embedding_column` or `indicator_column` to convert sequence
+  categorical data into dense representation for input to sequence NN, such as
+  RNN.
+
+  Example:
+
+  ```python
+  colors = sequence_categorical_column_with_vocabulary_list(
+      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
+      num_oov_buckets=2)
+  colors_embedding = embedding_column(colors, dimension=3)
+  columns = [colors_embedding]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  input_layer, sequence_length = sequence_input_layer(features, columns)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature.
+    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
+      is mapped to the index of its value (if present) in `vocabulary_list`.
+      Must be castable to `dtype`.
+    dtype: The type of features. Only string and integer types are supported.
+      If `None`, it will be inferred from `vocabulary_list`.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
+      hash of the input value. A positive `num_oov_buckets` can not be specified
+      with `default_value`.
+
+  Returns:
+    A `_SequenceCategoricalColumn`.
+
+  Raises:
+    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: if `dtype` is not integer or string.
+  """
+  return fc_old._SequenceCategoricalColumn(
+      fc_old._categorical_column_with_vocabulary_list(
+          key=key,
+          vocabulary_list=vocabulary_list,
+          dtype=dtype,
+          default_value=default_value,
+          num_oov_buckets=num_oov_buckets))
+
+
+def sequence_numeric_column(
+    key,
+    shape=(1,),
+    default_value=0.,
+    dtype=dtypes.float32,
+    normalizer_fn=None):
+  """Returns a feature column that represents sequences of numeric data.
+
+  Example:
+
+  ```python
+  temperature = sequence_numeric_column('temperature')
+  columns = [temperature]
+
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  sequence_feature_layer = SequenceFeatureLayer(columns)
+  input_layer, sequence_length = sequence_feature_layer(features)
+
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+  outputs, state = tf.nn.dynamic_rnn(
+      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  ```
+
+  Args:
+    key: A unique string identifying the input features.
+    shape: The shape of the input data per sequence id. E.g. if `shape=(2,)`,
+      each example must contain `2 * sequence_length` values.
+    default_value: A single value compatible with `dtype` that is used for
+      padding the sparse data into a dense `Tensor`.
+    dtype: The type of values.
+    normalizer_fn: If not `None`, a function that can be used to normalize the
+      value of the tensor after `default_value` is applied for parsing.
+      Normalizer function takes the input `Tensor` as its argument, and returns
+      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
+      even though the most common use case of this function is normalization, it
+      can be used for any kind of Tensorflow transformations.
+
+  Returns:
+    A `SequenceNumericColumn`.
+
+  Raises:
+    TypeError: if any dimension in shape is not an int.
+    ValueError: if any dimension in shape is not a positive integer.
+    ValueError: if `dtype` is not convertible to `tf.float32`.
+  """
+  shape = fc_old._check_shape(shape=shape, key=key)
+  if not (dtype.is_integer or dtype.is_floating):
+    raise ValueError('dtype must be convertible to float. '
+                     'dtype: {}, key: {}'.format(dtype, key))
+  if normalizer_fn is not None and not callable(normalizer_fn):
+    raise TypeError(
+        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
+
+  return SequenceNumericColumn(
+      key,
+      shape=shape,
+      default_value=default_value,
+      dtype=dtype,
+      normalizer_fn=normalizer_fn)
+
+
+def _assert_all_equal_and_return(tensors, name=None):
+  """Asserts that all tensors are equal and returns the first one."""
+  with ops.name_scope(name, 'assert_all_equal', values=tensors):
+    if len(tensors) == 1:
+      return tensors[0]
+    assert_equal_ops = []
+    for t in tensors[1:]:
+      assert_equal_ops.append(check_ops.assert_equal(tensors[0], t))
+    with ops.control_dependencies(assert_equal_ops):
+      return array_ops.identity(tensors[0])
+
+
+class SequenceNumericColumn(
+    fc.SequenceDenseColumn,
+    collections.namedtuple(
+        'SequenceNumericColumn',
+        ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))):
+  """Represents sequences of numeric data."""
+
+  @property
+  def _is_v2_column(self):
+    return True
+
+  @property
+  def name(self):
+    """See `FeatureColumn` base class."""
+    return self.key
+
+  @property
+  def parse_example_spec(self):
+    """See `FeatureColumn` base class."""
+    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
+
+  def transform_feature(self, transformation_cache, state_manager):
+    """See `FeatureColumn` base class.
+
+    In this case, we apply the `normalizer_fn` to the input tensor.
+
+    Args:
+      transformation_cache: A `FeatureTransformationCache` object to access
+        features.
+      state_manager: A `StateManager` to create / access resources such as
+        lookup tables.
+
+    Returns:
+      Normalized input tensor.
+    """
+    input_tensor = transformation_cache.get(self.key, state_manager)
+    if self.normalizer_fn is not None:
+      input_tensor = self.normalizer_fn(input_tensor)
+    return input_tensor
+
+  @property
+  def variable_shape(self):
+    """Returns a `TensorShape` representing the shape of sequence input."""
+    return tensor_shape.TensorShape(self.shape)
+
+  def get_sequence_dense_tensor(self, transformation_cache, state_manager):
+    """Returns a `TensorSequenceLengthPair`.
+
+    Args:
+      transformation_cache: A `FeatureTransformationCache` object to access
+        features.
+      state_manager: A `StateManager` to create / access resources such as
+        lookup tables.
+    """
+    sp_tensor = transformation_cache.get(self, state_manager)
+    dense_tensor = sparse_ops.sparse_tensor_to_dense(
+        sp_tensor, default_value=self.default_value)
+    # Reshape into [batch_size, T, variable_shape].
+    dense_shape = array_ops.concat(
+        [array_ops.shape(dense_tensor)[:1], [-1], self.variable_shape],
+        axis=0)
+    dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape)
+
+    # Get the number of timesteps per example
+    # For the 2D case, the raw values are grouped according to num_elements;
+    # for the 3D case, the grouping happens in the third dimension, and
+    # sequence length is not affected.
+    num_elements = (self.variable_shape.num_elements()
+                    if sp_tensor.shape.ndims == 2 else 1)
+    seq_length = fc_old._sequence_length_from_sparse_tensor(
+        sp_tensor, num_elements=num_elements)
+
+    return fc.SequenceDenseColumn.TensorSequenceLengthPair(
+        dense_tensor=dense_tensor, sequence_length=seq_length)
+
+  # TODO(b/119409767): Implement parents, _{get,from}_config.
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
+# pylint: enable=protected-access
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca4398a142065de0be7bee57cd7e54670bbae12e
--- /dev/null
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
@@ -0,0 +1,1508 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sequential_feature_column."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc_old
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
+from tensorflow.python.feature_column import feature_column as fc_old
+from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import monitored_session
+
+
+class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args_a': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'sparse_input_args_b': {
+           # example 0, ids [1]
+           # example 1, ids [2, 0]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (1, 2, 0),
+           'dense_shape': (2, 2)},
+       'expected_input_layer': [
+           # example 0, ids_a [2], ids_b [1]
+           [[5., 6., 14., 15., 16.], [0., 0., 0., 0., 0.]],
+           # example 1, ids_a [0, 1], ids_b [2, 0]
+           [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]],],
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'sparse_input_args_a': {
+           # feature 0, ids [[2], [0, 1]]
+           # feature 1, ids [[0, 0], [1]]
+           'indices': (
+               (0, 0, 0), (0, 1, 0), (0, 1, 1),
+               (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 0, 0, 1),
+           'dense_shape': (2, 2, 2)},
+       'sparse_input_args_b': {
+           # feature 0, ids [[1, 1], [1]]
+           # feature 1, ids [[2], [0]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (1, 1, 1, 2, 0),
+           'dense_shape': (2, 2, 2)},
+       'expected_input_layer': [
+           # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -]
+           [[5., 6., 14., 15., 16.], [2., 3., 14., 15., 16.]],
+           # feature 1, [a: 0, 0, b: 2, -], [a: 1, -, b: 0, -]
+           [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_embedding_column(
+      self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
+      expected_sequence_length):
+
+    sparse_input_a = sparse_tensor.SparseTensorValue(**sparse_input_args_a)
+    sparse_input_b = sparse_tensor.SparseTensorValue(**sparse_input_args_b)
+    vocabulary_size = 3
+    embedding_dimension_a = 2
+    embedding_values_a = (
+        (1., 2.),  # id 0
+        (3., 4.),  # id 1
+        (5., 6.)  # id 2
+    )
+    embedding_dimension_b = 3
+    embedding_values_b = (
+        (11., 12., 13.),  # id 0
+        (14., 15., 16.),  # id 1
+        (17., 18., 19.)  # id 2
+    )
+    def _get_initializer(embedding_dimension, embedding_values):
+      def _initializer(shape, dtype, partition_info):
+        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+        self.assertEqual(dtypes.float32, dtype)
+        self.assertIsNone(partition_info)
+        return embedding_values
+      return _initializer
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column_a = fc_old._embedding_column(
+        categorical_column_a,
+        dimension=embedding_dimension_a,
+        initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_b = fc_old._embedding_column(
+        categorical_column_b,
+        dimension=embedding_dimension_b,
+        initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        # Test that columns are reordered alphabetically.
+        feature_columns=[embedding_column_b, embedding_column_a])
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('sequence_input_layer/aaa_embedding/embedding_weights:0',
+         'sequence_input_layer/bbb_embedding/embedding_weights:0'),
+        tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
+      self.assertAllEqual(embedding_values_b, global_vars[1].eval(session=sess))
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_embedding_column_with_non_sequence_categorical(self):
+    """Tests that error is raised for non-sequence embedding column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = fc_old._categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column_a = fc_old._embedding_column(
+        categorical_column_a, dimension=2)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In embedding_column: aaa_embedding\. categorical_column must be of '
+        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
+      _, _ = sfc.sequence_input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[embedding_column_a])
+
+  def test_shared_embedding_column(self):
+    vocabulary_size = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [1]
+        # example 1, ids [2, 0]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(1, 2, 0),
+        dense_shape=(2, 2))
+
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 4.),  # id 1
+        (5., 6.)  # id 2
+    )
+
+    def _get_initializer(embedding_dimension, embedding_values):
+
+      def _initializer(shape, dtype, partition_info):
+        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+        self.assertEqual(dtypes.float32, dtype)
+        self.assertIsNone(partition_info)
+        return embedding_values
+
+      return _initializer
+
+    expected_input_layer = [
+        # example 0, ids_a [2], ids_b [1]
+        [[5., 6., 3., 4.], [0., 0., 0., 0.]],
+        # example 1, ids_a [0, 1], ids_b [2, 0]
+        [[1., 2., 5., 6.], [3., 4., 1., 2.]],
+    ]
+    expected_sequence_length = [1, 2]
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    # Test that columns are reordered alphabetically.
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension,
+        initializer=_get_initializer(embedding_dimension, embedding_values))
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        feature_columns=shared_embedding_columns)
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('sequence_input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
+        tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_shared_embedding_column_with_non_sequence_categorical(self):
+    """Tests that error is raised for non-sequence shared embedding column."""
+    vocabulary_size = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = fc_old._categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc_old._categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In embedding_column: aaa_shared_embedding\. categorical_column must '
+        r'be of type _SequenceCategoricalColumn to use sequence_input_layer\.'):
+      _, _ = sfc.sequence_input_layer(
+          features={
+              'aaa': sparse_input_a,
+              'bbb': sparse_input_b
+          },
+          feature_columns=shared_embedding_columns)
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args_a': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'sparse_input_args_b': {
+           # example 0, ids [1]
+           # example 1, ids [1, 0]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (1, 1, 0),
+           'dense_shape': (2, 2)},
+       'expected_input_layer': [
+           # example 0, ids_a [2], ids_b [1]
+           [[0., 0., 1., 0., 1.], [0., 0., 0., 0., 0.]],
+           # example 1, ids_a [0, 1], ids_b [1, 0]
+           [[1., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'sparse_input_args_a': {
+           # feature 0, ids [[2], [0, 1]]
+           # feature 1, ids [[0, 0], [1]]
+           'indices': (
+               (0, 0, 0), (0, 1, 0), (0, 1, 1),
+               (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 0, 0, 1),
+           'dense_shape': (2, 2, 2)},
+       'sparse_input_args_b': {
+           # feature 0, ids [[1, 1], [1]]
+           # feature 1, ids [[1], [0]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (1, 1, 1, 1, 0),
+           'dense_shape': (2, 2, 2)},
+       'expected_input_layer': [
+           # feature 0, [a: 2, -, b: 1, 1], [a: 0, 1, b: 1, -]
+           [[0., 0., 1., 0., 2.], [1., 1., 0., 0., 1.]],
+           # feature 1, [a: 0, 0, b: 1, -], [a: 1, -, b: 0, -]
+           [[2., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_indicator_column(
+      self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
+      expected_sequence_length):
+    sparse_input_a = sparse_tensor.SparseTensorValue(**sparse_input_args_a)
+    sparse_input_b = sparse_tensor.SparseTensorValue(**sparse_input_args_b)
+
+    vocabulary_size_a = 3
+    vocabulary_size_b = 2
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size_a)
+    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size_b)
+    indicator_column_b = fc_old._indicator_column(categorical_column_b)
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        # Test that columns are reordered alphabetically.
+        feature_columns=[indicator_column_b, indicator_column_a])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_indicator_column_with_non_sequence_categorical(self):
+    """Tests that error is raised for non-sequence categorical column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = fc_old._categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In indicator_column: aaa_indicator\. categorical_column must be of '
+        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
+      _, _ = sfc.sequence_input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[indicator_column_a])
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [0., 1]
+           # example 1, [10.]
+           'indices': ((0, 0), (0, 1), (1, 0)),
+           'values': (0., 1., 10.),
+           'dense_shape': (2, 2)},
+       'expected_input_layer': [
+           [[0.], [1.]],
+           [[10.], [0.]]],
+       'expected_sequence_length': [2, 1]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # feature 0, ids [[20, 3], [5]]
+           # feature 1, ids [[3], [8]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (20, 3, 5., 3., 8.),
+           'dense_shape': (2, 2, 2)},
+       'expected_input_layer': [
+           [[20.], [3.], [5.], [0.]],
+           [[3.], [0.], [8.], [0.]]],
+       'expected_sequence_length': [2, 2]},
+      )
+  def test_numeric_column(
+      self, sparse_input_args, expected_input_layer, expected_sequence_length):
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+
+    numeric_column = sfc_old.sequence_numeric_column('aaa')
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [0., 1.,  2., 3., 4., 5., 6., 7.]
+           # example 1, [10., 11., 12., 13.]
+           'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                       (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 8)},
+       'expected_input_layer': [
+           # The output of numeric_column._get_dense_tensor should be flattened.
+           [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+           [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
+       'expected_sequence_length': [2, 1]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
+           # example 1, [[10., 11., 12., 13.], []]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
+                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3),
+                       (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 2, 4)},
+       'expected_input_layer': [
+           # The output of numeric_column._get_dense_tensor should be flattened.
+           [[0., 1., 2., 3.], [4., 5., 6., 7.]],
+           [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
+       'expected_sequence_length': [2, 1]},
+      )
+  def test_numeric_column_multi_dim(
+      self, sparse_input_args, expected_input_layer, expected_sequence_length):
+    """Tests sequence_input_layer for multi-dimensional numeric_column."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+
+    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+
+    input_layer, sequence_length = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+  def test_sequence_length_not_equal(self):
+    """Tests that an error is raised when sequence lengths are not equal."""
+    # Input a with sequence_length = [2, 1]
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0., 1., 10.),
+        dense_shape=(2, 2))
+    # Input b with sequence_length = [1, 1]
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0)),
+        values=(1., 10.),
+        dense_shape=(2, 2))
+    numeric_column_a = sfc_old.sequence_numeric_column('aaa')
+    numeric_column_b = sfc_old.sequence_numeric_column('bbb')
+
+    _, sequence_length = sfc.sequence_input_layer(
+        features={
+            'aaa': sparse_input_a,
+            'bbb': sparse_input_b,
+        },
+        feature_columns=[numeric_column_a, numeric_column_b])
+
+    with monitored_session.MonitoredSession() as sess:
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[Condition x == y did not hold element-wise:\] '
+          r'\[x \(sequence_input_layer/aaa/sequence_length:0\) = \] \[2 1\] '
+          r'\[y \(sequence_input_layer/bbb/sequence_length:0\) = \] \[1 1\]'):
+        sess.run(sequence_length)
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+           # example 1, [[[10., 11.],  [12., 13.]]]
+           'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                       (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 8)},
+       'expected_shape': [2, 2, 4]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
+           # example 1, [[10., 11., 12., 13.], []]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
+                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 2),
+                       (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 2, 4)},
+       'expected_shape': [2, 2, 4]},
+      )
+  def test_static_shape_from_tensors_numeric(
+      self, sparse_input_args, expected_shape):
+    """Tests that we return a known static shape when we have one."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+
+    input_layer, _ = sfc.sequence_input_layer(
+        features={'aaa': sparse_input},
+        feature_columns=[numeric_column])
+    shape = input_layer.get_shape()
+    self.assertEqual(shape, expected_shape)
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           'indices': ((0, 0), (1, 0), (1, 1), (3, 0)),
+           'values': (2, 0, 1, 1),
+           'dense_shape': (4, 2)},
+       'expected_shape': [4, 2, 3]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [0, 2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                       (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           'values': (2, 0, 1, 2, 1, 0, 2),
+           'dense_shape': (4, 2, 2)},
+       'expected_shape': [4, 2, 3]}
+      )
+  def test_static_shape_from_tensors_indicator(
+      self, sparse_input_args, expected_shape):
+    """Tests that we return a known static shape when we have one."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    indicator_column = fc_old._indicator_column(categorical_column)
+
+    input_layer, _ = sfc.sequence_input_layer(
+        features={'aaa': sparse_input}, feature_columns=[indicator_column])
+    shape = input_layer.get_shape()
+    self.assertEqual(shape, expected_shape)
+
+
+class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
+  """Tests the utility fn concatenate_context_input."""
+
+  def test_concatenate_context_input(self):
+    seq_input = ops.convert_to_tensor(np.arange(12).reshape(2, 3, 2))
+    context_input = ops.convert_to_tensor(np.arange(10).reshape(2, 5))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    input_layer = sfc.concatenate_context_input(context_input, seq_input)
+
+    expected = np.array([
+        [[0, 1, 0, 1, 2, 3, 4], [2, 3, 0, 1, 2, 3, 4], [4, 5, 0, 1, 2, 3, 4]],
+        [[6, 7, 5, 6, 7, 8, 9], [8, 9, 5, 6, 7, 8, 9], [10, 11, 5, 6, 7, 8, 9]]
+    ], dtype=np.float32)
+    with monitored_session.MonitoredSession() as sess:
+      output = sess.run(input_layer)
+      self.assertAllEqual(expected, output)
+
+  @parameterized.named_parameters(
+      {'testcase_name': 'rank_lt_3',
+       'seq_input_arg': np.arange(100).reshape(10, 10)},
+      {'testcase_name': 'rank_gt_3',
+       'seq_input_arg': np.arange(100).reshape(5, 5, 2, 2)}
+      )
+  def test_sequence_input_throws_error(self, seq_input_arg):
+    seq_input = ops.convert_to_tensor(seq_input_arg)
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'sequence_input must have rank 3'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  @parameterized.named_parameters(
+      {'testcase_name': 'rank_lt_2',
+       'context_input_arg': np.arange(100)},
+      {'testcase_name': 'rank_gt_2',
+       'context_input_arg': np.arange(100).reshape(5, 5, 4)}
+      )
+  def test_context_input_throws_error(self, context_input_arg):
+    context_input = ops.convert_to_tensor(context_input_arg)
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'context_input must have rank 2'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  def test_integer_seq_input_throws_error(self):
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    context_input = math_ops.cast(context_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(
+        TypeError, 'sequence_input must have dtype float32'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+  def test_integer_context_input_throws_error(self):
+    seq_input = ops.convert_to_tensor(np.arange(100).reshape(5, 5, 4))
+    context_input = ops.convert_to_tensor(np.arange(100).reshape(10, 10))
+    seq_input = math_ops.cast(seq_input, dtype=dtypes.float32)
+    with self.assertRaisesRegexp(
+        TypeError, 'context_input must have dtype float32'):
+      sfc.concatenate_context_input(context_input, seq_input)
+
+
+class InputLayerTest(test.TestCase):
+  """Tests input_layer with sequence feature columns."""
+
+  def test_embedding_column(self):
+    """Tests that error is raised for sequence embedding column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column_a = fc_old._embedding_column(
+        categorical_column_a, dimension=2)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In embedding_column: aaa_embedding\. categorical_column must not be '
+        r'of type _SequenceCategoricalColumn\.'):
+      _ = fc_old.input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[embedding_column_a])
+
+  def test_indicator_column(self):
+    """Tests that error is raised for sequence indicator column."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'In indicator_column: aaa_indicator\. categorical_column must not be '
+        r'of type _SequenceCategoricalColumn\.'):
+      _ = fc_old.input_layer(
+          features={'aaa': sparse_input},
+          feature_columns=[indicator_column_a])
+
+
+def _assert_sparse_tensor_value(test_case, expected, actual):
+  _assert_sparse_tensor_indices_shape(test_case, expected, actual)
+
+  test_case.assertEqual(
+      np.array(expected.values).dtype, np.array(actual.values).dtype)
+  test_case.assertAllEqual(expected.values, actual.values)
+
+
+def _assert_sparse_tensor_indices_shape(test_case, expected, actual):
+  test_case.assertEqual(np.int64, np.array(actual.indices).dtype)
+  test_case.assertAllEqual(expected.indices, actual.indices)
+
+  test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
+  test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
+
+
+class SequenceCategoricalColumnWithIdentityTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (1, 2, 0),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           'values': np.array((1, 2, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': (6, 7, 8),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': (6, 7, 8),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
+    column = sfc.sequence_categorical_column_with_identity('aaa', num_buckets=9)
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_value(
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceCategoricalColumnWithHashBucketTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': ('omar', 'stringer', 'marlo'),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           # Ignored to avoid hash dependence in test.
+           'values': np.array((0, 0, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': ('omar', 'stringer', 'marlo'),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           # Ignored to avoid hash dependence in test.
+           'values': np.array((0, 0, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
+    column = sfc.sequence_categorical_column_with_hash_bucket(
+        'aaa', hash_bucket_size=10)
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_indices_shape(
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceCategoricalColumnWithVocabularyFileTest(
+    test.TestCase, parameterized.TestCase):
+
+  def _write_vocab(self, vocab_strings, file_name):
+    vocab_file = os.path.join(self.get_temp_dir(), file_name)
+    with open(vocab_file, 'w') as f:
+      f.write('\n'.join(vocab_strings))
+    return vocab_file
+
+  def setUp(self):
+    super(SequenceCategoricalColumnWithVocabularyFileTest, self).setUp()
+
+    vocab_strings = ['omar', 'stringer', 'marlo']
+    self._wire_vocabulary_file_name = self._write_vocab(vocab_strings,
+                                                        'wire_vocabulary.txt')
+    self._wire_vocabulary_size = 3
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': ('marlo', 'skywalker', 'omar'),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           'values': np.array((2, -1, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': ('omar', 'skywalker', 'marlo'),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': np.array((0, -1, 2), dtype=np.int64),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
+    column = sfc.sequence_categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_value(
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+  def test_get_sparse_tensors_dynamic_zero_length(self):
+    """Tests _get_sparse_tensors with a dynamic sequence length."""
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=np.zeros((0, 2)), values=[], dense_shape=(2, 0))
+    expected = sparse_tensor.SparseTensorValue(
+        indices=np.zeros((0, 3)),
+        values=np.array((), dtype=np.int64),
+        dense_shape=(2, 0, 1))
+    column = sfc.sequence_categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size)
+    input_placeholder_shape = list(inputs.dense_shape)
+    # Make second dimension (sequence length) dynamic.
+    input_placeholder_shape[1] = None
+    input_placeholder = array_ops.sparse_placeholder(
+        dtypes.string, shape=input_placeholder_shape)
+    id_weight_pair = column._get_sparse_tensors(
+        _LazyBuilder({'aaa': input_placeholder}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      result = id_weight_pair.id_tensor.eval(
+          session=sess, feed_dict={input_placeholder: inputs})
+      _assert_sparse_tensor_value(
+          self, expected, result)
+
+
+class SequenceCategoricalColumnWithVocabularyListTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': ('marlo', 'skywalker', 'omar'),
+           'dense_shape': (2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 1, 0)),
+           'values': np.array((2, -1, 0), dtype=np.int64),
+           'dense_shape': (2, 2, 1)}},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': ('omar', 'skywalker', 'marlo'),
+           'dense_shape': (2, 2, 2)},
+       'expected_args': {
+           'indices': ((0, 0, 2), (1, 0, 0), (1, 2, 0)),
+           'values': np.array((0, -1, 2), dtype=np.int64),
+           'dense_shape': (2, 2, 2)}}
+      )
+  def test_get_sparse_tensors(self, inputs_args, expected_args):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    expected = sparse_tensor.SparseTensorValue(**expected_args)
+    column = sfc.sequence_categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'))
+
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with monitored_session.MonitoredSession() as sess:
+      _assert_sparse_tensor_value(
+          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+
+
+class SequenceEmbeddingColumnTest(
+    test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           'indices': ((0, 0), (1, 0), (1, 1), (3, 0)),
+           'values': (2, 0, 1, 1),
+           'dense_shape': (4, 2)},
+       'expected': [
+           # example 0, ids [2]
+           [[7., 11.], [0., 0.]],
+           # example 1, ids [0, 1]
+           [[1., 2.], [3., 5.]],
+           # example 2, ids []
+           [[0., 0.], [0., 0.]],
+           # example 3, ids [1]
+           [[3., 5.], [0., 0.]]]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [0, 2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                       (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           'values': (2, 0, 1, 2, 1, 0, 2),
+           'dense_shape': (4, 2, 2)},
+       'expected': [
+           # example 0, ids [[2]]
+           [[7., 11.], [0., 0.]],
+           # example 1, ids [[0, 1], [2]]
+           [[2, 3.5], [7., 11.]],
+           # example 2, ids []
+           [[0., 0.], [0., 0.]],
+           # example 3, ids [[1], [0, 2]]
+           [[3., 5.], [4., 6.5]]]}
+      )
+  def test_get_sequence_dense_tensor(self, inputs_args, expected):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    vocabulary_size = 3
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc_old._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': inputs}))
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
+      self.assertAllEqual(expected, embedding_lookup.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 2),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2]}
+      )
+  def test_sequence_length(self, inputs_args, expected_sequence_length):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    vocabulary_size = 3
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
+
+    _, sequence_length = embedding_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': inputs}))
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids []
+        # example 1, ids [2]
+        # example 2, ids [0, 1]
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids []
+        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 1, 2, 0, 1, 0]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
+
+    _, sequence_length = embedding_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
+class SequenceSharedEmbeddingColumnTest(test.TestCase):
+
+  def test_get_sequence_dense_tensor(self):
+    vocabulary_size = 3
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 2))
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [1]
+        # example 1, ids [0, 2]
+        # example 2, ids [0]
+        # example 3, ids []
+        indices=((0, 0), (1, 0), (1, 1), (2, 0)),
+        values=(1, 0, 2, 0),
+        dense_shape=(4, 2))
+
+    expected_lookups_a = [
+        # example 0, ids [2]
+        [[7., 11.], [0., 0.]],
+        # example 1, ids [0, 1]
+        [[1., 2.], [3., 5.]],
+        # example 2, ids []
+        [[0., 0.], [0., 0.]],
+        # example 3, ids [1]
+        [[3., 5.], [0., 0.]],
+    ]
+
+    expected_lookups_b = [
+        # example 0, ids [1]
+        [[3., 5.], [0., 0.]],
+        # example 1, ids [0, 2]
+        [[1., 2.], [7., 11.]],
+        # example 2, ids [0]
+        [[1., 2.], [0., 0.]],
+        # example 3, ids []
+        [[0., 0.], [0., 0.]],
+    ]
+
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    embedding_lookup_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input_a
+        }))[0]
+    embedding_lookup_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'bbb': sparse_input_b
+        }))[0]
+
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
+      self.assertAllEqual(
+          expected_lookups_a, embedding_lookup_a.eval(session=sess))
+      self.assertAllEqual(
+          expected_lookups_b, embedding_lookup_b.eval(session=sess))
+
+  def test_sequence_length(self):
+    vocabulary_size = 3
+
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=(2, 0, 1),
+        dense_shape=(2, 2))
+    expected_sequence_length_a = [1, 2]
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [0, 2]
+        # example 1, ids [1]
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0, 2, 1),
+        dense_shape=(2, 2))
+    expected_sequence_length_b = [2, 1]
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)
+
+    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input_a
+        }))[1]
+    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'bbb': sparse_input_b
+        }))[1]
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length_a = sess.run(sequence_length_a)
+      self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
+      self.assertEqual(np.int64, sequence_length_a.dtype)
+      sequence_length_b = sess.run(sequence_length_b)
+      self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
+      self.assertEqual(np.int64, sequence_length_b.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    vocabulary_size = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids []
+        # example 1, ids [2]
+        # example 2, ids [0, 1]
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids []
+        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
+    categorical_column_a = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids []
+        # example 2, ids []
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids [0, 1]
+        indices=((0, 0), (4, 0), (5, 0), (5, 1)),
+        values=(2, 1, 0, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
+    categorical_column_b = sfc.sequence_categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+
+    shared_embedding_columns = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)
+
+    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input_a
+        }))[1]
+    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
+        _LazyBuilder({
+            'bbb': sparse_input_b
+        }))[1]
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length_a, sequence_length_a.eval(session=sess))
+      self.assertAllEqual(
+          expected_sequence_length_b, sequence_length_b.eval(session=sess))
+
+
+class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           # example 2, ids []
+           # example 3, ids [1]
+           'indices': ((0, 0), (1, 0), (1, 1), (3, 0)),
+           'values': (2, 0, 1, 1),
+           'dense_shape': (4, 2)},
+       'expected': [
+           # example 0, ids [2]
+           [[0., 0., 1.], [0., 0., 0.]],
+           # example 1, ids [0, 1]
+           [[1., 0., 0.], [0., 1., 0.]],
+           # example 2, ids []
+           [[0., 0., 0.], [0., 0., 0.]],
+           # example 3, ids [1]
+           [[0., 1., 0.], [0., 0., 0.]]]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           # example 2, ids []
+           # example 3, ids [[1], [2, 2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0),
+                       (3, 0, 0), (3, 1, 0), (3, 1, 1)),
+           'values': (2, 0, 1, 2, 1, 2, 2),
+           'dense_shape': (4, 2, 2)},
+       'expected': [
+           # example 0, ids [[2]]
+           [[0., 0., 1.], [0., 0., 0.]],
+           # example 1, ids [[0, 1], [2]]
+           [[1., 1., 0.], [0., 0., 1.]],
+           # example 2, ids []
+           [[0., 0., 0.], [0., 0., 0.]],
+           # example 3, ids [[1], [2, 2]]
+           [[0., 1., 0.], [0., 0., 2.]]]}
+      )
+  def test_get_sequence_dense_tensor(self, inputs_args, expected):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    vocabulary_size = 3
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = fc_old._indicator_column(categorical_column)
+
+    indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': inputs}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected, indicator_tensor.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2, 0, 1),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 2]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2, 0, 1, 2),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2]}
+      )
+  def test_sequence_length(self, inputs_args, expected_sequence_length):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    vocabulary_size = 3
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = fc_old._indicator_column(categorical_column)
+
+    _, sequence_length = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': inputs}))
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids []
+        # example 1, ids [2]
+        # example 2, ids [0, 1]
+        # example 3, ids []
+        # example 4, ids [1]
+        # example 5, ids []
+        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 1, 2, 0, 1, 0]
+
+    categorical_column = sfc.sequence_categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    indicator_column = fc.indicator_column(categorical_column)
+
+    _, sequence_length = indicator_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
+def _get_sequence_dense_tensor(column, features):
+  return column.get_sequence_dense_tensor(
+      fc.FeatureTransformationCache(features), None)
+
+
+class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
+
+  def test_defaults(self):
+    a = sfc.sequence_numeric_column('aaa')
+    self.assertEqual('aaa', a.key)
+    self.assertEqual('aaa', a.name)
+    self.assertEqual((1,), a.shape)
+    self.assertEqual(0., a.default_value)
+    self.assertEqual(dtypes.float32, a.dtype)
+    self.assertIsNone(a.normalizer_fn)
+
+  def test_shape_saved_as_tuple(self):
+    a = sfc.sequence_numeric_column('aaa', shape=[1, 2])
+    self.assertEqual((1, 2), a.shape)
+
+  def test_shape_must_be_positive_integer(self):
+    with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'):
+      sfc.sequence_numeric_column('aaa', shape=[1.0])
+
+    with self.assertRaisesRegexp(
+        ValueError, 'shape dimensions must be greater than 0'):
+      sfc.sequence_numeric_column('aaa', shape=[0])
+
+  def test_dtype_is_convertible_to_float(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'dtype must be convertible to float'):
+      sfc.sequence_numeric_column('aaa', dtype=dtypes.string)
+
+  def test_normalizer_fn_must_be_callable(self):
+    with self.assertRaisesRegexp(TypeError, 'must be a callable'):
+      sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable')
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, values [0., 1]
+           # example 1, [10.]
+           'indices': ((0, 0), (0, 1), (1, 0)),
+           'values': (0., 1., 10.),
+           'dense_shape': (2, 2)},
+       'expected': [
+           [[0.], [1.]],
+           [[10.], [0.]]]},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # feature 0, ids [[20, 3], [5]]
+           # feature 1, ids [[3], [8]]
+           'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
+           'values': (20, 3, 5., 3., 8.),
+           'dense_shape': (2, 2, 2)},
+       'expected': [
+           [[20.], [3.], [5.], [0.]],
+           [[3.], [0.], [8.], [0.]]]},
+      )
+  def test_get_sequence_dense_tensor(self, inputs_args, expected):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    numeric_column = sfc.sequence_numeric_column('aaa')
+
+    dense_tensor, _ = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': inputs})
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(expected, dense_tensor.eval(session=sess))
+
+  def test_get_sequence_dense_tensor_with_normalizer_fn(self):
+
+    def _increment_two(input_sparse_tensor):
+      return sparse_ops.sparse_add(
+          input_sparse_tensor,
+          sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2))
+      )
+
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[0.], [1]]
+        # example 1, [[10.]]
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0., 1., 10.),
+        dense_shape=(2, 2))
+
+    # Before _increment_two:
+    #   [[0.], [1.]],
+    #   [[10.], [0.]],
+    # After _increment_two:
+    #   [[2.], [1.]],
+    #   [[10.], [2.]],
+    expected_dense_tensor = [
+        [[2.], [1.]],
+        [[10.], [2.]],
+    ]
+    numeric_column = sfc.sequence_numeric_column(
+        'aaa', normalizer_fn=_increment_two)
+
+    dense_tensor, _ = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': sparse_input})
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_dense_tensor, dense_tensor.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'sparse_input_args': {
+           # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
+           # example 1, [[[10., 11.],  [12., 13.]]]
+           'indices': ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
+                       (0, 7), (1, 0), (1, 1), (1, 2), (1, 3)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 8)},
+       'expected_dense_tensor': [
+           [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
+           [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]]]},
+      {'testcase_name': '3D',
+       'sparse_input_args': {
+           'indices': ((0, 0, 0), (0, 0, 2), (0, 0, 4), (0, 0, 6),
+                       (0, 1, 0), (0, 1, 2), (0, 1, 4), (0, 1, 6),
+                       (1, 0, 0), (1, 0, 2), (1, 0, 4), (1, 0, 6)),
+           'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
+           'dense_shape': (2, 2, 8)},
+       'expected_dense_tensor': [
+           [[[0., 0.], [1., 0.]], [[2., 0.], [3., 0.]],
+            [[4., 0.], [5., 0.]], [[6., 0.], [7., 0.]]],
+           [[[10., 0.], [11., 0.]], [[12., 0.], [13., 0.]],
+            [[0., 0.], [0., 0.]], [[0., 0.], [0., 0.]]]]},
+      )
+  def test_get_dense_tensor_multi_dim(
+      self, sparse_input_args, expected_dense_tensor):
+    """Tests get_sequence_dense_tensor for multi-dim numeric_column."""
+    sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
+
+    dense_tensor, _ = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': sparse_input})
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_dense_tensor, dense_tensor.eval(session=sess))
+
+  @parameterized.named_parameters(
+      {'testcase_name': '2D',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2., 0., 1.),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 2],
+       'shape': (1,)},
+      {'testcase_name': '3D',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2., 0., 1., 2.),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2],
+       'shape': (1,)},
+      {'testcase_name': '2D_with_shape',
+       'inputs_args': {
+           # example 0, ids [2]
+           # example 1, ids [0, 1]
+           'indices': ((0, 0), (1, 0), (1, 1)),
+           'values': (2., 0., 1.),
+           'dense_shape': (2, 2)},
+       'expected_sequence_length': [1, 1],
+       'shape': (2,)},
+      {'testcase_name': '3D_with_shape',
+       'inputs_args': {
+           # example 0, ids [[2]]
+           # example 1, ids [[0, 1], [2]]
+           'indices': ((0, 0, 0), (1, 0, 0), (1, 0, 1), (1, 1, 0)),
+           'values': (2., 0., 1., 2.),
+           'dense_shape': (2, 2, 2)},
+       'expected_sequence_length': [1, 2],
+       'shape': (2,)},
+      )
+  def test_sequence_length(self, inputs_args, expected_sequence_length, shape):
+    inputs = sparse_tensor.SparseTensorValue(**inputs_args)
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=shape)
+
+    _, sequence_length = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': inputs})
+
+    with monitored_session.MonitoredSession() as sess:
+      sequence_length = sess.run(sequence_length)
+      self.assertAllEqual(expected_sequence_length, sequence_length)
+      self.assertEqual(np.int64, sequence_length.dtype)
+
+  def test_sequence_length_with_empty_rows(self):
+    """Tests _sequence_length when some examples do not have ids."""
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values []
+        # example 1, values [[0.], [1.]]
+        # example 2, [[2.]]
+        # example 3, values []
+        # example 4, [[3.]]
+        # example 5, values []
+        indices=((1, 0), (1, 1), (2, 0), (4, 0)),
+        values=(0., 1., 2., 3.),
+        dense_shape=(6, 2))
+    expected_sequence_length = [0, 2, 1, 0, 1, 0]
+    numeric_column = sfc.sequence_numeric_column('aaa')
+
+    _, sequence_length = _get_sequence_dense_tensor(
+        numeric_column, {'aaa': sparse_input})
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_sequence_length, sequence_length.eval(session=sess))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index 249debbdf6dff412a5be6cb1032fc4a3567c7d0b..dad50a3a73085526f65bd87c3d8549ceb75b3af4 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -1,15 +1,16 @@
 # Description:
 #   contains parts of TensorFlow that are experimental or unstable and which are not supported.
 
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
 package(default_visibility = [
     "//learning/brain:__subpackages__",
     "//tensorflow:__subpackages__",
+    "//tensorflow_model_optimization:__subpackages__",
 ])
 
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
@@ -46,6 +47,11 @@ tf_custom_op_py_library(
         ":variable_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
+    visibility = [
+        "//learning/brain:__subpackages__",
+        "//tensorflow:__subpackages__",
+        "//video/youtube/personalization:__subpackages__",
+    ],
     deps = [
         ":gen_variable_ops",
         "//tensorflow/contrib/util:util_py",
@@ -65,6 +71,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:script_ops",
         "//tensorflow/python:smart_cond",
+        "//tensorflow/python:sort_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:state_ops_gen",
@@ -310,17 +317,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-py_test(
-    name = "sort_ops_test",
-    size = "medium",
-    srcs = ["python/ops/sort_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":framework_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:random_ops",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 95f5ba90aba6ff8d3f1f5b93bde2211ddf1c231b..e72e50585a3861d4527b66f89e1659d76c85960a 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -15,10 +15,6 @@
 
 """Framework utilities.
 
-See the
-[Contrib Framework](https://tensorflow.org/api_guides/python/contrib.framework)
-guide.
-
 @@assert_same_float_dtype
 @@assert_scalar
 @@assert_scalar_int
diff --git a/tensorflow/contrib/framework/python/framework/experimental_test.py b/tensorflow/contrib/framework/python/framework/experimental_test.py
index cfdc7df7d8fd4c1406bf447a79038ac33b11e047..00e04b83ac45a83e54eee7a6e4e146fb683c3d98 100644
--- a/tensorflow/contrib/framework/python/framework/experimental_test.py
+++ b/tensorflow/contrib/framework/python/framework/experimental_test.py
@@ -44,17 +44,18 @@ class ExperimentalTest(test.TestCase):
 
     # Assert function docs are properly updated.
     self.assertEqual("_fn", _fn.__name__)
-    self.assertEqual("fn doc. (experimental)"
-                     "\n"
-                     "\nTHIS FUNCTION IS EXPERIMENTAL. It may change or "
-                     "be removed at any time, and without warning."
-                     "\n"
-                     "\nArgs:"
-                     "\n  arg0: Arg 0."
-                     "\n  arg1: Arg 1."
-                     "\n"
-                     "\nReturns:"
-                     "\n  Sum of args.", _fn.__doc__)
+    self.assertEqual(
+        "fn doc. (experimental)"
+        "\n"
+        "\nWarning: THIS FUNCTION IS EXPERIMENTAL. It may change "
+        "or be removed at any time, and without warning."
+        "\n"
+        "\nArgs:"
+        "\n  arg0: Arg 0."
+        "\n  arg1: Arg 1."
+        "\n"
+        "\nReturns:"
+        "\n  Sum of args.", _fn.__doc__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual(3, _fn(1, 2))
diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
index 9b0b9b1e1bf51db9332806097c2b3ae14d0587ad..05788d2e820a6cc1ec67578f0d1b19448b674d2f 100644
--- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py
+++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py
@@ -218,7 +218,6 @@ class WithShapeTest(test.TestCase):
         self.assertRaisesRegexp(errors_impl.OpError, "Wrong shape",
                                 tensor_2x2.eval, {tensor_no_shape: [42.0]})
 
-  @test_util.enable_c_shapes
   def test_with_shape_partial(self):
     with self.cached_session():
       tensor_partial_shape = array_ops.placeholder(dtypes.float32)
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops.py b/tensorflow/contrib/framework/python/ops/sort_ops.py
index 1921a77c1e96ee3531d1ed0f98e41c27c9d427ac..42184a4e55e292f7921702e3f8909ae54f717702 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops.py
+++ b/tensorflow/contrib/framework/python/ops/sort_ops.py
@@ -22,173 +22,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
+from tensorflow.python.ops import sort_ops
 
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops as framework_ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-
-
-def sort(values, axis=-1, direction='ASCENDING', name=None):
-  """Sorts a tensor.
-
-  Args:
-    values: 1-D or higher numeric `Tensor`.
-    axis: The axis along which to sort. The default is -1, which sorts the last
-        axis.
-    direction: The direction in which to sort the values (`'ASCENDING'` or
-        `'DESCENDING'`).
-    name: Optional name for the operation.
-
-  Returns:
-    A `Tensor` with the same dtype and shape as `values`, with the elements
-        sorted along the given `axis`.
-
-  Raises:
-    ValueError: If axis is not a constant scalar, or the direction is invalid.
-  """
-  with framework_ops.name_scope(name, 'sort'):
-    return _sort_or_argsort(values, axis, direction, return_argsort=False)
-
-
-def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
-  """Returns the indices of a tensor that give its sorted order along an axis.
-
-  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
-  `tf.sort(values)`. For higher dimensions, the output has the same shape as
-  `values`, but along the given axis, values represent the index of the sorted
-  element in that slice of the tensor at the given position.
-
-  Args:
-    values: 1-D or higher numeric `Tensor`.
-    axis: The axis along which to sort. The default is -1, which sorts the last
-        axis.
-    direction: The direction in which to sort the values (`'ASCENDING'` or
-        `'DESCENDING'`).
-    stable: If True, equal elements in the original tensor will not be
-        re-ordered in the returned order. Unstable sort is not yet implemented,
-        but will eventually be the default for performance reasons. If you
-        require a stable order, pass `stable=True` for forwards compatibility.
-    name: Optional name for the operation.
-
-  Returns:
-    An int32 `Tensor` with the same shape as `values`. The indices that would
-        sort each slice of the given `values` along the given `axis`.
-
-  Raises:
-    ValueError: If axis is not a constant scalar, or the direction is invalid.
-  """
-  del stable  # Unused.
-  with framework_ops.name_scope(name, 'argsort'):
-    return _sort_or_argsort(values, axis, direction, return_argsort=True)
-
-
-def _sort_or_argsort(values, axis, direction, return_argsort):
-  """Internal sort/argsort implementation.
-
-  Args:
-    values: The input values.
-    axis: The axis along which to sort.
-    direction: 'ASCENDING' or 'DESCENDING'.
-    return_argsort: Whether to return the argsort result.
-
-  Returns:
-    Either the sorted values, or the indices of the sorted values in the
-        original tensor. See the `sort` and `argsort` docstrings.
-
-  Raises:
-    ValueError: If axis is not a constant scalar, or the direction is invalid.
-  """
-  if direction not in _SORT_IMPL:
-    raise ValueError('%s should be one of %s' %
-                     (direction, ', '.join(sorted(_SORT_IMPL.keys()))))
-  # Axis must be an integer, not a Tensor.
-  axis = framework_ops.convert_to_tensor(axis, name='axis')
-  axis_static = tensor_util.constant_value(axis)
-  if axis.shape.ndims != 0 or axis_static is None:
-    raise ValueError('axis must be a constant scalar')
-  axis_static = int(axis_static)  # Avoids NumPy casting error
-
-  values = framework_ops.convert_to_tensor(values, name='values')
-
-  return _SORT_IMPL[direction](values, axis_static, return_argsort)
-
-
-def _descending_sort(values, axis, return_argsort=False):
-  """Sorts values in reverse using `top_k`.
-
-  Args:
-    values: Tensor of numeric values.
-    axis: Index of the axis which values should be sorted along.
-    return_argsort: If False, return the sorted values. If True, return the
-        indices that would sort the values.
-
-  Returns:
-    The sorted values.
-  """
-  k = array_ops.shape(values)[axis]
-  rank = array_ops.rank(values)
-  static_rank = values.shape.ndims
-  # Fast path: sorting the last axis.
-  if axis == -1 or axis + 1 == values.get_shape().ndims:
-    top_k_input = values
-    transposition = None
-  else:
-    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
-    if axis < 0:
-      # Calculate the actual axis index if counting from the end. Use the static
-      # rank if available, or else make the axis back into a tensor.
-      axis += static_rank or rank
-    if static_rank is not None:
-      # Prefer to calculate the transposition array in NumPy and make it a
-      # constant.
-      transposition = constant_op.constant(
-          np.r_[
-              # Axes up to axis are unchanged.
-              np.arange(axis),
-              # Swap axis and rank - 1.
-              [static_rank - 1],
-              # Axes in [axis + 1, rank - 1) are unchanged.
-              np.arange(axis + 1, static_rank - 1),
-              # Swap axis and rank - 1.
-              [axis]],
-          name='transposition')
-    else:
-      # Generate the transposition array from the tensors.
-      transposition = array_ops.concat(
-          [
-              # Axes up to axis are unchanged.
-              math_ops.range(axis),
-              # Swap axis and rank - 1.
-              [rank - 1],
-              # Axes in [axis + 1, rank - 1) are unchanged.
-              math_ops.range(axis + 1, rank - 1),
-              # Swap axis and rank - 1.
-              [axis]
-          ],
-          axis=0)
-    top_k_input = array_ops.transpose(values, transposition)
-
-  values, indices = nn_ops.top_k(top_k_input, k)
-  return_value = indices if return_argsort else values
-  if transposition is not None:
-    # transposition contains a single cycle of length 2 (swapping 2 elements),
-    # so it is an involution (it is its own inverse).
-    return_value = array_ops.transpose(return_value, transposition)
-  return return_value
-
-
-def _ascending_sort(values, axis, return_argsort=False):
-  # Negate the values to get the ascending order from descending sort.
-  values_or_indices = _descending_sort(-values, axis, return_argsort)
-  # If not argsort, negate the values again.
-  return values_or_indices if return_argsort else -values_or_indices
-
-
-_SORT_IMPL = {
-    'ASCENDING': _ascending_sort,
-    'DESCENDING': _descending_sort,
-}
+sort = sort_ops.sort
+argsort = sort_ops.argsort
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 9d0e6e1335d0be3477b78abce94999122672ff05..f89d7ed0f45f919b17398de5d9449d12c08dd2f2 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -49,7 +49,6 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:training_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/losses",
     ],
 )
diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md
index 4ead66ca13e74bacc0e4679a8d5c4e0f23d04b69..9ab86329eaf0e6fd426aef1f552f4e27c2ad65de 100644
--- a/tensorflow/contrib/gan/README.md
+++ b/tensorflow/contrib/gan/README.md
@@ -34,28 +34,32 @@ mix TFGAN, native TF, and other custom frameworks
 TFGAN is composed of several parts which were design to exist independently.
 These include the following main pieces (explained in detail below).
 
-* [core](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py):
-provides the main infrastructure needed to train a GAN. Training occurs in four phases, and each phase
-can be completed by custom-code or by using a TFGAN library call.
-
-* [features](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/):
-Many common GAN operations and normalization techniques are implemented for you
-to use, such as instance normalization and conditioning.
-
-* [losses](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/losses/python/):
-Easily experiment with already-implemented and well-tested losses and penalties,
-such as the Wasserstein loss, gradient penalty, mutual information penalty, etc
-
-* [evaluation](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/):
-Use `Inception Score` or `Frechet Distance` with a pretrained Inception
-network to evaluate your unconditional generative model. You can also use
-your own pretrained classifier for more specific performance numbers, or use
-other methods for evaluating conditional generative models.
-
-* [examples](https://github.com/tensorflow/models/tree/master/research/gan/) and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb):
-See examples of how to use TFGAN to make GAN training easier, or use the more complicated examples to jumpstart your
-own project. These include unconditional and conditional GANs, InfoGANs,
-adversarial losses on existing networks, and image-to-image translation.
+*   [core](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py):
+    provides the main infrastructure needed to train a GAN. Training occurs in
+    four phases, and each phase can be completed by custom-code or by using a
+    TFGAN library call.
+
+*   [features](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/):
+    Many common GAN operations and normalization techniques are implemented for
+    you to use, such as instance normalization and conditioning.
+
+*   [losses](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/losses/python/):
+    Easily experiment with already-implemented and well-tested losses and
+    penalties, such as the Wasserstein loss, gradient penalty, mutual
+    information penalty, etc
+
+*   [evaluation](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/):
+    Use `Inception Score`, `Frechet Distance`, or `Kernel Distance` with a
+    pretrained Inception network to evaluate your unconditional generative
+    model. You can also use your own pretrained classifier for more specific
+    performance numbers, or use other methods for evaluating conditional
+    generative models.
+
+*   [examples](https://github.com/tensorflow/models/tree/master/research/gan/)
+    and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TFGAN to make
+    GAN training easier, or use the more complicated examples to jumpstart your
+    own project. These include unconditional and conditional GANs, InfoGANs,
+    adversarial losses on existing networks, and image-to-image translation.
 
 ## Training a GAN model
 
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 7243f150ce540cc96d1960511bc1500b7f917791..3593b501bb738b8f58dce4e40cffbdf410f136b3 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -112,7 +112,9 @@ class GANEstimator(estimator.Estimator):
                get_eval_metric_ops_fn=None,
                add_summaries=None,
                use_loss_summaries=True,
-               config=None):
+               config=None,
+               warm_start_from=None,
+               is_chief=True):
     """Initializes a GANEstimator instance.
 
     Args:
@@ -151,6 +153,10 @@ class GANEstimator(estimator.Estimator):
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
       config: `RunConfig` object to configure the runtime settings.
+      warm_start_from: A filepath to a checkpoint or saved model, or a
+        WarmStartSettings object to configure initialization.
+      is_chief: Whether or not this Estimator is running on a chief or worker.
+        Needs to be set appropriately if using SyncReplicasOptimizers.
 
     Raises:
       ValueError: If loss functions aren't callable.
@@ -184,10 +190,11 @@ class GANEstimator(estimator.Estimator):
       return _get_estimator_spec(
           mode, gan_model, generator_loss_fn, discriminator_loss_fn,
           get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer,
-          get_hooks_fn, use_loss_summaries)
+          get_hooks_fn, use_loss_summaries, is_chief)
 
     super(GANEstimator, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config)
+        model_fn=_model_fn, model_dir=model_dir, config=config,
+        warm_start_from=warm_start_from)
 
 
 def _get_gan_model(
@@ -211,7 +218,7 @@ def _get_gan_model(
 def _get_estimator_spec(
     mode, gan_model, generator_loss_fn, discriminator_loss_fn,
     get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer,
-    get_hooks_fn=None, use_loss_summaries=True):
+    get_hooks_fn=None, use_loss_summaries=True, is_chief=True):
   """Get the EstimatorSpec for the current mode."""
   if mode == model_fn_lib.ModeKeys.PREDICT:
     estimator_spec = model_fn_lib.EstimatorSpec(
@@ -232,7 +239,7 @@ def _get_estimator_spec(
               else discriminator_optimizer)
       get_hooks_fn = get_hooks_fn or tfgan_train.get_sequential_train_hooks()
       estimator_spec = _get_train_estimator_spec(
-          gan_model, gan_loss, gopt, dopt, get_hooks_fn)
+          gan_model, gan_loss, gopt, dopt, get_hooks_fn, is_chief=is_chief)
 
   return estimator_spec
 
@@ -317,11 +324,11 @@ def _get_eval_estimator_spec(gan_model, gan_loss, get_eval_metric_ops_fn=None,
 
 def _get_train_estimator_spec(
     gan_model, gan_loss, generator_optimizer, discriminator_optimizer,
-    get_hooks_fn, train_op_fn=tfgan_train.gan_train_ops):
+    get_hooks_fn, train_op_fn=tfgan_train.gan_train_ops, is_chief=True):
   """Return an EstimatorSpec for the train case."""
   scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss
   train_ops = train_op_fn(gan_model, gan_loss, generator_optimizer,
-                          discriminator_optimizer)
+                          discriminator_optimizer, is_chief=is_chief)
   training_hooks = get_hooks_fn(train_ops)
   return model_fn_lib.EstimatorSpec(
       loss=scalar_loss,
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 83f8dd641fa9a641533161373c29c5d2f81746a1..bc9021050bc010ce75c3091fef868549686c0e90 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -33,9 +33,12 @@ from tensorflow.contrib.learn.python.learn.learn_io import graph_io
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.estimator import WarmStartSettings
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework.errors_impl import NotFoundError
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
@@ -45,6 +48,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import learning_rate_decay
+from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
 
@@ -52,7 +56,8 @@ from tensorflow.python.training import training_util
 def generator_fn(noise_dict, mode):
   del mode
   noise = noise_dict['x']
-  return layers.fully_connected(noise, noise.shape[1].value)
+  return layers.fully_connected(noise, tensor_shape.dimension_value(
+      noise.shape[1]))
 
 
 def discriminator_fn(data, unused_conditioning, mode):
@@ -78,7 +83,7 @@ class GetGANModelTest(test.TestCase, parameterized.TestCase):
 
     self.assertEqual(generator_inputs, gan_model.generator_inputs)
     self.assertIsNotNone(gan_model.generated_data)
-    self.assertEqual(2, len(gan_model.generator_variables))  # 1 FC layer
+    self.assertLen(gan_model.generator_variables, 2)  # 1 FC layer
     self.assertIsNotNone(gan_model.generator_fn)
     if mode == model_fn_lib.ModeKeys.PREDICT:
       self.assertIsNone(gan_model.real_data)
@@ -91,7 +96,7 @@ class GetGANModelTest(test.TestCase, parameterized.TestCase):
       self.assertIsNotNone(gan_model.real_data)
       self.assertIsNotNone(gan_model.discriminator_real_outputs)
       self.assertIsNotNone(gan_model.discriminator_gen_outputs)
-      self.assertEqual(2, len(gan_model.discriminator_variables))  # 1 FC layer
+      self.assertLen(gan_model.discriminator_variables, 2)  # 1 FC layer
       self.assertIsNotNone(gan_model.discriminator_scope)
       self.assertIsNotNone(gan_model.discriminator_fn)
 
@@ -117,6 +122,7 @@ def get_dummy_gan_model():
 
 
 def dummy_loss_fn(gan_model, add_summaries=True):
+  del add_summaries
   return math_ops.reduce_sum(gan_model.discriminator_real_outputs -
                              gan_model.discriminator_gen_outputs)
 
@@ -164,6 +170,35 @@ class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase):
       self.assertShapeEqual(np.array(0), spec.loss)  # must be a scalar
       self.assertIsNotNone(spec.eval_metric_ops)
 
+  def test_get_sync_estimator_spec(self):
+    """Make sure spec is loaded with sync hooks for sync opts."""
+
+    def get_sync_optimizer():
+      return sync_replicas_optimizer.SyncReplicasOptimizer(
+          training.GradientDescentOptimizer(learning_rate=1.0),
+          replicas_to_aggregate=1)
+
+    with ops.Graph().as_default():
+      self._gan_model = get_dummy_gan_model()
+      g_opt = get_sync_optimizer()
+      d_opt = get_sync_optimizer()
+
+      spec = estimator._get_estimator_spec(
+          model_fn_lib.ModeKeys.TRAIN,
+          self._gan_model,
+          generator_loss_fn=dummy_loss_fn,
+          discriminator_loss_fn=dummy_loss_fn,
+          get_eval_metric_ops_fn=get_metrics,
+          generator_optimizer=g_opt,
+          discriminator_optimizer=d_opt)
+
+      self.assertLen(spec.training_hooks, 4)
+      sync_opts = [
+          hook._sync_optimizer for hook in spec.training_hooks if
+          isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)]
+      self.assertLen(sync_opts, 2)
+      self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt)))
+
 
 # TODO(joelshor): Add pandas test.
 class GANEstimatorIntegrationTest(test.TestCase):
@@ -317,5 +352,71 @@ class GANEstimatorIntegrationTest(test.TestCase):
         prediction_size=[batch_size, input_dim])
 
 
+class GANEstimatorWarmStartTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = self.get_temp_dir()
+    self.new_variable_name = 'new_var'
+    self.new_variable_value = [1, 2, 3]
+
+  def tearDown(self):
+    writer_cache.FileWriterCache.clear()
+
+  def _test_warm_start(self, warm_start_from=None):
+    """Tests whether WarmStartSettings work as intended."""
+    def generator_with_new_variable(noise_dict, mode):
+      variable_scope.get_variable(name=self.new_variable_name,
+                                  initializer=self.new_variable_value,
+                                  trainable=True)
+      return generator_fn(noise_dict, mode)
+
+    def train_input_fn():
+      data = np.zeros([3, 4])
+      return {'x': data}, data
+
+    est = estimator.GANEstimator(
+        generator_fn=generator_fn,
+        discriminator_fn=discriminator_fn,
+        generator_loss_fn=losses.wasserstein_generator_loss,
+        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+        generator_optimizer=training.GradientDescentOptimizer(1.0),
+        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
+        model_dir=self._model_dir)
+
+    est.train(train_input_fn, steps=1)
+
+    est_warm = estimator.GANEstimator(
+        generator_fn=generator_with_new_variable,
+        discriminator_fn=discriminator_fn,
+        generator_loss_fn=losses.wasserstein_generator_loss,
+        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+        generator_optimizer=training.GradientDescentOptimizer(1.0),
+        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
+        model_dir=None if warm_start_from else self._model_dir,
+        warm_start_from=warm_start_from)
+
+    est_warm.train(train_input_fn, steps=1)
+
+    return est_warm
+
+  def test_warm_start_error(self):
+    """Test if exception when reloading different estimators."""
+    with self.assertRaises(NotFoundError):
+      self._test_warm_start()
+
+  def test_warm_start_success(self):
+    """Test if GANEstimator allows explicit warm start variable assignment."""
+    # Regex matches all variable names in ckpt except for new_var.
+    var_regex = '^(?!.*%s.*)' % self.new_variable_name
+    warmstart = WarmStartSettings(ckpt_to_initialize_from=self._model_dir,
+                                  vars_to_warm_start=var_regex)
+    est_warm = self._test_warm_start(warm_start_from=warmstart)
+    full_variable_name = 'Generator/%s' % self.new_variable_name
+    self.assertIn(full_variable_name, est_warm.get_variable_names())
+    equal_vals = np.array_equal(est_warm.get_variable_value(full_variable_name),
+                                self.new_variable_value)
+    self.assertTrue(equal_vals)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index d914f549457a1e893ed43a3b8bc1ae5be7bb4303..a71ee53311c1c057a5b41be0331bf56ce1a82f74 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -14,8 +14,8 @@
 # ==============================================================================
 """Model evaluation tools for TFGAN.
 
-These methods come from https://arxiv.org/abs/1606.03498 and
-https://arxiv.org/abs/1706.08500.
+These methods come from https://arxiv.org/abs/1606.03498,
+https://arxiv.org/abs/1706.08500, and https://arxiv.org/abs/1801.01401.
 
 NOTE: This implementation uses the same weights as in
 https://github.com/openai/improved-gan/blob/master/inception_score/model.py,
@@ -40,6 +40,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import linalg_ops
@@ -64,6 +65,12 @@ __all__ = [
     'frechet_classifier_distance_from_activations',
     'mean_only_frechet_classifier_distance_from_activations',
     'diagonal_only_frechet_classifier_distance_from_activations',
+    'kernel_inception_distance',
+    'kernel_inception_distance_and_std',
+    'kernel_classifier_distance',
+    'kernel_classifier_distance_and_std',
+    'kernel_classifier_distance_from_activations',
+    'kernel_classifier_distance_and_std_from_activations',
     'INCEPTION_DEFAULT_IMAGE_SIZE',
 ]
 
@@ -734,3 +741,373 @@ frechet_inception_distance = functools.partial(
     frechet_classifier_distance,
     classifier_fn=functools.partial(
         run_inception, output_tensor=INCEPTION_FINAL_POOL))
+
+
+def kernel_classifier_distance(real_images,
+                               generated_images,
+                               classifier_fn,
+                               num_classifier_batches=1,
+                               max_block_size=1024,
+                               dtype=None):
+  """Kernel "classifier" distance for evaluating a generative model.
+
+  This is based on the Kernel Inception distance, but for an arbitrary
+  embedding.
+
+  This technique is described in detail in https://arxiv.org/abs/1801.01401.
+  Given two distributions P and Q of activations, this function calculates
+
+      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
+        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]
+
+  where k is the polynomial kernel
+
+      k(x, y) = ( x^T y / dimension + 1 )^3.
+
+  This captures how different the distributions of real and generated images'
+  visual features are. Like the Frechet distance (and unlike the Inception
+  score), this is a true distance and incorporates information about the
+  target images. Unlike the Frechet score, this function computes an
+  *unbiased* and asymptotically normal estimator, which makes comparing
+  estimates across models much more intuitive.
+
+  The estimator used takes time quadratic in max_block_size. Larger values of
+  max_block_size will decrease the variance of the estimator but increase the
+  computational cost. This differs slightly from the estimator used by the
+  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
+
+  NOTE: the blocking code assumes that real_activations and
+  generated_activations are both in random order. If either is sorted in a
+  meaningful order, the estimator will behave poorly.
+
+  NOTE: This function consumes images, computes their activations, and then
+  computes the classifier score. If you would like to precompute many
+  activations for real and generated images for large batches, or to compute
+  multiple scores based on the same images, please use
+  kernel_clasifier_distance_from_activations(), which this method also uses.
+
+  Args:
+    real_images: Real images to use to compute Kernel Inception distance.
+    generated_images: Generated images to use to compute Kernel Inception
+      distance.
+    classifier_fn: A function that takes images and produces activations based
+      on a classifier.
+    num_classifier_batches: Number of batches to split images in to in order to
+      efficiently run them through the classifier network.
+    max_estimator_block_size: integer, default 1024. The distance estimator
+      splits samples into blocks for computational efficiency. Larger values are
+      more computationally expensive but decrease the variance of the distance
+      estimate.
+    dtype: if not None, coerce activations to this dtype before computations.
+
+  Returns:
+   The Kernel Inception Distance. A floating-point scalar of the same type
+   as the output of the activations.
+  """
+  return kernel_classifier_distance_and_std(
+      real_images,
+      generated_images,
+      classifier_fn,
+      num_classifier_batches=num_classifier_batches,
+      max_block_size=max_block_size,
+      dtype=dtype)[0]
+
+
+kernel_inception_distance = functools.partial(
+    kernel_classifier_distance,
+    classifier_fn=functools.partial(
+        run_inception, output_tensor=INCEPTION_FINAL_POOL))
+
+
+def kernel_classifier_distance_and_std(real_images,
+                                       generated_images,
+                                       classifier_fn,
+                                       num_classifier_batches=1,
+                                       max_block_size=1024,
+                                       dtype=None):
+  """Kernel "classifier" distance for evaluating a generative model.
+
+  This is based on the Kernel Inception distance, but for an arbitrary
+  embedding. Also returns an estimate of the standard error of the distance
+  estimator.
+
+  This technique is described in detail in https://arxiv.org/abs/1801.01401.
+  Given two distributions P and Q of activations, this function calculates
+
+      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
+        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]
+
+  where k is the polynomial kernel
+
+      k(x, y) = ( x^T y / dimension + 1 )^3.
+
+  This captures how different the distributions of real and generated images'
+  visual features are. Like the Frechet distance (and unlike the Inception
+  score), this is a true distance and incorporates information about the
+  target images. Unlike the Frechet score, this function computes an
+  *unbiased* and asymptotically normal estimator, which makes comparing
+  estimates across models much more intuitive.
+
+  The estimator used takes time quadratic in max_block_size. Larger values of
+  max_block_size will decrease the variance of the estimator but increase the
+  computational cost. This differs slightly from the estimator used by the
+  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
+
+  NOTE: the blocking code assumes that real_activations and
+  generated_activations are both in random order. If either is sorted in a
+  meaningful order, the estimator will behave poorly.
+
+  NOTE: This function consumes images, computes their activations, and then
+  computes the classifier score. If you would like to precompute many
+  activations for real and generated images for large batches, or to compute
+  multiple scores based on the same images, please use
+  kernel_clasifier_distance_from_activations(), which this method also uses.
+
+  Args:
+    real_images: Real images to use to compute Kernel Inception distance.
+    generated_images: Generated images to use to compute Kernel Inception
+      distance.
+    classifier_fn: A function that takes images and produces activations based
+      on a classifier.
+    num_classifier_batches: Number of batches to split images in to in order to
+      efficiently run them through the classifier network.
+    max_estimator_block_size: integer, default 1024. The distance estimator
+      splits samples into blocks for computational efficiency. Larger values are
+      more computationally expensive but decrease the variance of the distance
+      estimate. Having a smaller block size also gives a better estimate of the
+      standard error.
+    dtype: if not None, coerce activations to this dtype before computations.
+
+  Returns:
+   The Kernel Inception Distance. A floating-point scalar of the same type
+     as the output of the activations.
+   An estimate of the standard error of the distance estimator (a scalar of
+     the same type).
+  """
+  real_images_list = array_ops.split(
+      real_images, num_or_size_splits=num_classifier_batches)
+  generated_images_list = array_ops.split(
+      generated_images, num_or_size_splits=num_classifier_batches)
+
+  real_imgs = array_ops.stack(real_images_list)
+  generated_imgs = array_ops.stack(generated_images_list)
+
+  # Compute the activations using the memory-efficient `map_fn`.
+  def compute_activations(elems):
+    return functional_ops.map_fn(
+        fn=classifier_fn,
+        elems=elems,
+        parallel_iterations=1,
+        back_prop=False,
+        swap_memory=True,
+        name='RunClassifier')
+
+  real_a = compute_activations(real_imgs)
+  gen_a = compute_activations(generated_imgs)
+
+  # Ensure the activations have the right shapes.
+  real_a = array_ops.concat(array_ops.unstack(real_a), 0)
+  gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
+
+  return kernel_classifier_distance_and_std_from_activations(
+      real_a, gen_a, max_block_size=max_block_size)
+
+
+kernel_inception_distance_and_std = functools.partial(
+    kernel_classifier_distance_and_std,
+    classifier_fn=functools.partial(
+        run_inception, output_tensor=INCEPTION_FINAL_POOL))
+
+
+def kernel_classifier_distance_from_activations(real_activations,
+                                                generated_activations,
+                                                max_block_size=1024,
+                                                dtype=None):
+  """Kernel "classifier" distance for evaluating a generative model.
+
+  This methods computes the kernel classifier distance from activations of
+  real images and generated images. This can be used independently of the
+  kernel_classifier_distance() method, especially in the case of using large
+  batches during evaluation where we would like to precompute all of the
+  activations before computing the classifier distance, or if we want to
+  compute multiple metrics based on the same images.
+
+  This technique is described in detail in https://arxiv.org/abs/1801.01401.
+  Given two distributions P and Q of activations, this function calculates
+
+      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
+        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]
+
+  where k is the polynomial kernel
+
+      k(x, y) = ( x^T y / dimension + 1 )^3.
+
+  This captures how different the distributions of real and generated images'
+  visual features are. Like the Frechet distance (and unlike the Inception
+  score), this is a true distance and incorporates information about the
+  target images. Unlike the Frechet score, this function computes an
+  *unbiased* and asymptotically normal estimator, which makes comparing
+  estimates across models much more intuitive.
+
+  The estimator used takes time quadratic in max_block_size. Larger values of
+  max_block_size will decrease the variance of the estimator but increase the
+  computational cost. This differs slightly from the estimator used by the
+  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
+
+  NOTE: the blocking code assumes that real_activations and
+  generated_activations are both in random order. If either is sorted in a
+  meaningful order, the estimator will behave poorly.
+
+  Args:
+    real_activations: 2D Tensor containing activations of real data. Shape is
+      [batch_size, activation_size].
+    generated_activations: 2D Tensor containing activations of generated data.
+      Shape is [batch_size, activation_size].
+    max_block_size: integer, default 1024. The distance estimator splits samples
+      into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
+      estimate.
+    dtype: if not None, coerce activations to this dtype before computations.
+
+  Returns:
+   The Kernel Inception Distance. A floating-point scalar of the same type
+   as the output of the activations.
+  """
+  return kernel_classifier_distance_and_std_from_activations(
+      real_activations, generated_activations, max_block_size=max_block_size)[0]
+
+
+def kernel_classifier_distance_and_std_from_activations(real_activations,
+                                                        generated_activations,
+                                                        max_block_size=1024,
+                                                        dtype=None):
+  """Kernel "classifier" distance for evaluating a generative model.
+
+  This methods computes the kernel classifier distance from activations of
+  real images and generated images. This can be used independently of the
+  kernel_classifier_distance() method, especially in the case of using large
+  batches during evaluation where we would like to precompute all of the
+  activations before computing the classifier distance, or if we want to
+  compute multiple metrics based on the same images. It also returns a rough
+  estimate of the standard error of the estimator.
+
+  This technique is described in detail in https://arxiv.org/abs/1801.01401.
+  Given two distributions P and Q of activations, this function calculates
+
+      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
+        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]
+
+  where k is the polynomial kernel
+
+      k(x, y) = ( x^T y / dimension + 1 )^3.
+
+  This captures how different the distributions of real and generated images'
+  visual features are. Like the Frechet distance (and unlike the Inception
+  score), this is a true distance and incorporates information about the
+  target images. Unlike the Frechet score, this function computes an
+  *unbiased* and asymptotically normal estimator, which makes comparing
+  estimates across models much more intuitive.
+
+  The estimator used takes time quadratic in max_block_size. Larger values of
+  max_block_size will decrease the variance of the estimator but increase the
+  computational cost. This differs slightly from the estimator used by the
+  original paper; it is the block estimator of https://arxiv.org/abs/1307.1954.
+  The estimate of the standard error will also be more reliable when there are
+  more blocks, i.e. when max_block_size is smaller.
+
+  NOTE: the blocking code assumes that real_activations and
+  generated_activations are both in random order. If either is sorted in a
+  meaningful order, the estimator will behave poorly.
+
+  Args:
+    real_activations: 2D Tensor containing activations of real data. Shape is
+      [batch_size, activation_size].
+    generated_activations: 2D Tensor containing activations of generated data.
+      Shape is [batch_size, activation_size].
+    max_block_size: integer, default 1024. The distance estimator splits samples
+      into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
+      estimate. Having a smaller block size also gives a better estimate of the
+      standard error.
+    dtype: if not None, coerce activations to this dtype before computations.
+
+  Returns:
+   The Kernel Inception Distance. A floating-point scalar of the same type
+     as the output of the activations.
+   An estimate of the standard error of the distance estimator (a scalar of
+     the same type).
+  """
+
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+  real_activations.shape[1].assert_is_compatible_with(
+      generated_activations.shape[1])
+
+  if dtype is None:
+    dtype = real_activations.dtype
+    assert generated_activations.dtype == dtype
+  else:
+    real_activations = math_ops.cast(real_activations, dtype)
+    generated_activations = math_ops.cast(generated_activations, dtype)
+
+  # Figure out how to split the activations into blocks of approximately
+  # equal size, with none larger than max_block_size.
+  n_r = array_ops.shape(real_activations)[0]
+  n_g = array_ops.shape(generated_activations)[0]
+
+  n_bigger = math_ops.maximum(n_r, n_g)
+  n_blocks = math_ops.to_int32(math_ops.ceil(n_bigger / max_block_size))
+
+  v_r = n_r // n_blocks
+  v_g = n_g // n_blocks
+
+  n_plusone_r = n_r - v_r * n_blocks
+  n_plusone_g = n_g - v_g * n_blocks
+
+  sizes_r = array_ops.concat([
+      array_ops.fill([n_blocks - n_plusone_r], v_r),
+      array_ops.fill([n_plusone_r], v_r + 1),
+  ], 0)
+  sizes_g = array_ops.concat([
+      array_ops.fill([n_blocks - n_plusone_g], v_g),
+      array_ops.fill([n_plusone_g], v_g + 1),
+  ], 0)
+
+  zero = array_ops.zeros([1], dtype=dtypes.int32)
+  inds_r = array_ops.concat([zero, math_ops.cumsum(sizes_r)], 0)
+  inds_g = array_ops.concat([zero, math_ops.cumsum(sizes_g)], 0)
+
+  dim = math_ops.cast(real_activations.shape[1], dtype)
+
+  def compute_kid_block(i):
+    'Compute the ith block of the KID estimate.'
+    r_s = inds_r[i]
+    r_e = inds_r[i + 1]
+    r = real_activations[r_s:r_e]
+    m = math_ops.cast(r_e - r_s, dtype)
+
+    g_s = inds_g[i]
+    g_e = inds_g[i + 1]
+    g = generated_activations[g_s:g_e]
+    n = math_ops.cast(g_e - g_s, dtype)
+
+    k_rr = (math_ops.matmul(r, r, transpose_b=True) / dim + 1)**3
+    k_rg = (math_ops.matmul(r, g, transpose_b=True) / dim + 1)**3
+    k_gg = (math_ops.matmul(g, g, transpose_b=True) / dim + 1)**3
+    return (-2 * math_ops.reduce_mean(k_rg) +
+            (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) / (m * (m - 1)) +
+            (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) / (n * (n - 1)))
+
+  ests = functional_ops.map_fn(
+      compute_kid_block, math_ops.range(n_blocks), dtype=dtype, back_prop=False)
+
+  mn = math_ops.reduce_mean(ests)
+
+  # nn_impl.moments doesn't use the Bessel correction, which we want here
+  n_blocks_ = math_ops.cast(n_blocks, dtype)
+  var = control_flow_ops.cond(
+      math_ops.less_equal(n_blocks, 1),
+      lambda: array_ops.constant(float('nan'), dtype=dtype),
+      lambda: math_ops.reduce_sum(math_ops.square(ests - mn)) / (n_blocks_ - 1))
+
+  return mn, math_ops.sqrt(var / n_blocks_)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index d64dfd1576578435d0e3bd4e338fe2e9e4a6f6ab..dbff1d2a367e10adc607dafb4c571bb3607a3963 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -86,6 +86,42 @@ def _expected_fid(real_imgs, gen_imgs):
 def _expected_trace_sqrt_product(sigma, sigma_v):
   return np.trace(scp_linalg.sqrtm(np.dot(sigma, sigma_v)))
 
+
+def _expected_kid_and_std(real_imgs, gen_imgs, max_block_size=1024):
+  n_r, dim = real_imgs.shape
+  n_g = gen_imgs.shape[0]
+
+  n_blocks = int(np.ceil(max(n_r, n_g) / max_block_size))
+
+  sizes_r = np.full(n_blocks, n_r // n_blocks)
+  to_patch = n_r - n_blocks * (n_r // n_blocks)
+  if to_patch > 0:
+    sizes_r[-to_patch:] += 1
+  inds_r = np.r_[0, np.cumsum(sizes_r)]
+  assert inds_r[-1] == n_r
+
+  sizes_g = np.full(n_blocks, n_g // n_blocks)
+  to_patch = n_g - n_blocks * (n_g // n_blocks)
+  if to_patch > 0:
+    sizes_g[-to_patch:] += 1
+  inds_g = np.r_[0, np.cumsum(sizes_g)]
+  assert inds_g[-1] == n_g
+
+  ests = []
+  for i in range(n_blocks):
+    r = real_imgs[inds_r[i]:inds_r[i + 1]]
+    g = gen_imgs[inds_g[i]:inds_g[i + 1]]
+
+    k_rr = (np.dot(r, r.T) / dim + 1)**3
+    k_rg = (np.dot(r, g.T) / dim + 1)**3
+    k_gg = (np.dot(g, g.T) / dim + 1)**3
+    ests.append(-2 * k_rg.mean() +
+                k_rr[np.triu_indices_from(k_rr, k=1)].mean() +
+                k_gg[np.triu_indices_from(k_gg, k=1)].mean())
+
+  var = np.var(ests, ddof=1) if len(ests) > 1 else np.nan
+  return np.mean(ests), np.sqrt(var / len(ests))
+
 # A dummy GraphDef string with the minimum number of Ops.
 graphdef_string = """
 node {
@@ -272,6 +308,18 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     # Check that none of the model variables are trainable.
     self.assertListEqual([], variables.trainable_variables())
 
+  def test_kernel_inception_distance_graph(self):
+    """Test `frechet_inception_distance` graph construction."""
+    img = array_ops.ones([7, 299, 299, 3])
+    distance = _run_with_mock(classifier_metrics.kernel_inception_distance, img,
+                              img)
+
+    self.assertTrue(isinstance(distance, ops.Tensor))
+    distance.shape.assert_has_rank(0)
+
+    # Check that none of the model variables are trainable.
+    self.assertListEqual([], variables.trainable_variables())
+
   def test_run_inception_multicall(self):
     """Test that `run_inception` can be called multiple times."""
     for batch_size in (7, 3, 2):
@@ -411,6 +459,56 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     # Check that the FIDs increase monotonically.
     self.assertTrue(all(fid_a < fid_b for fid_a, fid_b in zip(fids, fids[1:])))
 
+  def test_kernel_classifier_distance_value(self):
+    """Test that `kernel_classifier_distance` gives the correct value."""
+    np.random.seed(0)
+
+    test_pool_real_a = np.float32(np.random.randn(512, 256))
+    test_pool_gen_a = np.float32(np.random.randn(512, 256) * 1.1 + .05)
+
+    kid_op = _run_with_mock(
+        classifier_metrics.kernel_classifier_distance_and_std,
+        test_pool_real_a,
+        test_pool_gen_a,
+        classifier_fn=lambda x: x,
+        max_block_size=600)
+
+    with self.test_session() as sess:
+      actual_kid, actual_std = sess.run(kid_op)
+
+    expected_kid, expected_std = _expected_kid_and_std(test_pool_real_a,
+                                                       test_pool_gen_a)
+
+    self.assertAllClose(expected_kid, actual_kid, 0.001)
+    self.assertAllClose(expected_std, actual_std, 0.001)
+
+  def test_kernel_classifier_distance_block_sizes(self):
+    """Test that `kernel_classifier_distance` works with unusual max_block_size
+
+    values..
+    """
+    np.random.seed(0)
+
+    test_pool_real_a = np.float32(np.random.randn(512, 256))
+    test_pool_gen_a = np.float32(np.random.randn(768, 256) * 1.1 + .05)
+
+    max_block_size = array_ops.placeholder(dtypes.int32, shape=())
+    kid_op = _run_with_mock(
+        classifier_metrics.kernel_classifier_distance_and_std_from_activations,
+        array_ops.constant(test_pool_real_a),
+        array_ops.constant(test_pool_gen_a),
+        max_block_size=max_block_size)
+
+    for block_size in [50, 512, 1000]:
+      with self.test_session() as sess:
+        actual_kid, actual_std = sess.run(kid_op, {max_block_size: block_size})
+
+      expected_kid, expected_std = _expected_kid_and_std(
+          test_pool_real_a, test_pool_gen_a, max_block_size=block_size)
+
+      self.assertAllClose(expected_kid, actual_kid, 0.001)
+      self.assertAllClose(expected_std, actual_std, 0.001)
+
   def test_trace_sqrt_product_value(self):
     """Test that `trace_sqrt_product` gives the correct value."""
     np.random.seed(0)
diff --git a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py b/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
index 2b7bb5f14e7f3d1b3f913d3426efaaae19079ffb..e4fac1976d605f1942947a747043d5c8b00392c1 100644
--- a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
+++ b/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
@@ -47,13 +47,13 @@ class ClipWeightsTest(test.TestCase):
     train_op1 = opt.minimize(loss, var_list=self.variables)
     train_op2 = opt_clip.minimize(loss, var_list=self.variables)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       sess.run(variables.global_variables_initializer())
       self.assertEqual(2.0, self.variables[0].eval())
       sess.run(train_op1)
       self.assertLess(0.1, self.variables[0].eval())
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       sess.run(variables.global_variables_initializer())
       self.assertEqual(2.0, self.variables[0].eval())
       sess.run(train_op2)
diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
index e2594faf85bcf91cbe09f266e4d4211d20bdee17..364fa4eb461c62784803f0c309e3b7c5855df199 100644
--- a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
@@ -64,6 +64,9 @@ def condition_tensor(tensor, conditioning):
   """
   tensor.shape[1:].assert_is_fully_defined()
   num_features = tensor.shape[1:].num_elements()
+  if conditioning.shape.ndims < 2:
+    raise ValueError('conditioning must be at least 2D, but saw shape: %s'
+                     % conditioning.shape)
 
   mapped_conditioning = layers.linear(
       layers.flatten(conditioning), num_features)
diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py
index 0aad769793761be69ee9d1e3416e44c7b3d8cea0..f5c7d53cf2c9aa08ba0074950983ef3ecd90168b 100644
--- a/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py
+++ b/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py
@@ -45,7 +45,7 @@ class ConditioningUtilsTest(test.TestCase):
           array_ops.placeholder(dtypes.float32, (5, None)),
           array_ops.placeholder(dtypes.float32, (5, 1)))
 
-    with self.assertRaisesRegexp(ValueError, 'expected min_ndim=2'):
+    with self.assertRaisesRegexp(ValueError, 'at least 2D'):
       conditioning_utils.condition_tensor(
           array_ops.placeholder(dtypes.float32, (5, 2)),
           array_ops.placeholder(dtypes.float32, (5)))
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
index 08584dcd656e3e7a079a3fa36f44742b5eac1178..3c9dfd6de024b1558bed2e3678606fef8bb4d677 100644
--- a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
@@ -37,7 +37,7 @@ class TensorPoolTest(test.TestCase):
     output_value = tensor_pool(input_value, pool_size=10)
     self.assertEqual(output_value.shape.as_list(), [None, None, 3])
 
-    with self.test_session(use_gpu=True) as session:
+    with self.session(use_gpu=True) as session:
       for i in range(10):
         session.run(output_value, {input_value: [[[i] * 3]]})
         session.run(output_value, {input_value: [[[i] * 3] * 2]})
@@ -49,7 +49,7 @@ class TensorPoolTest(test.TestCase):
     output_value = tensor_pool(input_value, pool_size=10)
     self.assertEqual(output_value.shape.as_list(), [])
 
-    with self.test_session(use_gpu=True) as session:
+    with self.session(use_gpu=True) as session:
       outs = []
       for i in range(50):
         out = session.run(output_value, {input_value: i})
@@ -67,7 +67,7 @@ class TensorPoolTest(test.TestCase):
         input_value, pool_size=10, pooling_probability=0.0)
     self.assertEqual(output_value.shape.as_list(), [])
 
-    with self.test_session(use_gpu=True) as session:
+    with self.session(use_gpu=True) as session:
       for i in range(50):
         out = session.run(output_value, {input_value: i})
         self.assertEqual(out, i)
@@ -83,7 +83,7 @@ class TensorPoolTest(test.TestCase):
         pooling_probability=pooling_probability)
     self.assertEqual(output_value.shape.as_list(), [])
 
-    with self.test_session(use_gpu=True) as session:
+    with self.session(use_gpu=True) as session:
       not_pooled = 0
       total = 1000
       for i in range(total):
@@ -104,7 +104,7 @@ class TensorPoolTest(test.TestCase):
     for output_value in output_values:
       self.assertEqual(output_value.shape.as_list(), [])
 
-    with self.test_session(use_gpu=True) as session:
+    with self.session(use_gpu=True) as session:
       for i in range(10):
         outs = session.run(output_values, {
             input_values[0]: i,
diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
index 650eab97a3952e9aec2b489fffcc83c3bc49f2dd..f5c448db41c67adb4edd2634dd63a1840180df70 100644
--- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_impl.py
@@ -200,7 +200,7 @@ class VBN(object):
       del reduction_axes[axis]
 
       self._broadcast_shape = [1] * len(input_shape)
-      self._broadcast_shape[axis] = input_shape[axis].value
+      self._broadcast_shape[axis] = input_shape.dims[axis]
 
       self._example_reduction_axes = list(range(ndims))
       del self._example_reduction_axes[max(axis, self._batch_axis)]
diff --git a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py
index 2fe06a287284ff994326d5a977a2e4d4634268ae..ecfbb8a432e3308863edd6f1343be55c1fe5753c 100644
--- a/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py
+++ b/tensorflow/contrib/gan/python/features/python/virtual_batchnorm_test.py
@@ -59,7 +59,7 @@ class VirtualBatchnormTest(test.TestCase):
       mom_mean, mom_var = nn.moments(tensors, axes)
       vb_var = mean_sq - math_ops.square(vb_mean)
 
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         vb_mean_np, vb_var_np, mom_mean_np, mom_var_np = sess.run([
             vb_mean, vb_var, mom_mean, mom_var])
 
@@ -93,7 +93,7 @@ class VirtualBatchnormTest(test.TestCase):
       vb_mean = array_ops.squeeze(vb_mean, batch_axis)
       vb_variance = array_ops.squeeze(vb_variance, batch_axis)
 
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         vb_mean_np, vb_var_np, mom_mean_np, mom_var_np = sess.run([
             vb_mean, vb_variance, mom_mean, mom_variance])
 
@@ -116,7 +116,7 @@ class VirtualBatchnormTest(test.TestCase):
       vbn = virtual_batchnorm.VBN(batch, axis, batch_axis=batch_axis)
       vbn_normalized = vbn.reference_batch_normalization()
 
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         variables_lib.global_variables_initializer().run()
 
         bn_normalized_np, vbn_normalized_np = sess.run(
@@ -142,7 +142,7 @@ class VirtualBatchnormTest(test.TestCase):
       vb_normed = array_ops.squeeze(
           vbn(array_ops.expand_dims(examples[i], [0])), [0])
 
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         variables_lib.global_variables_initializer().run()
         bn_np, vb_np = sess.run([batch_normalized, vb_normed])
       self.assertAllClose(bn_np[i, ...], vb_np)
@@ -167,7 +167,7 @@ class VirtualBatchnormTest(test.TestCase):
     vbn = virtual_batchnorm.VBN(reference_batch)
     vbn_fixed_example = array_ops.squeeze(
         vbn(array_ops.expand_dims(fixed_example, 0)), 0)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       variables_lib.global_variables_initializer().run()
       vbn_fixed_example_np = vbn_fixed_example.eval()
 
@@ -180,7 +180,7 @@ class VirtualBatchnormTest(test.TestCase):
       minibatch = array_ops.stack([fixed_example] + examples)
       vbn_minibatch = vbn(minibatch)
       cur_vbn_fixed_example = vbn_minibatch[0, ...]
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         variables_lib.global_variables_initializer().run()
         cur_vbn_fixed_example_np = cur_vbn_fixed_example.eval()
       self.assertAllClose(vbn_fixed_example_np, cur_vbn_fixed_example_np)
@@ -219,7 +219,7 @@ class VirtualBatchnormTest(test.TestCase):
 
     self.assertEqual(4, len(contrib_variables_lib.get_variables()))
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       variables_lib.global_variables_initializer().run()
       sess.run(to_fetch)
 
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index 8bc4db8424f661bba65675f0cd1c2fc33696eda9..a0a86c6337eefa756a209635faa70db686a36247 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -36,7 +36,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables_lib
 from tensorflow.python.framework import ops
@@ -47,7 +46,6 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.distributions import distribution as ds
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.ops.losses import util
 from tensorflow.python.summary import summary
@@ -355,7 +353,8 @@ def wasserstein_gradient_penalty(
       raise ValueError('`generated_data` can\'t have unknown rank.')
 
     differences = generated_data - real_data
-    batch_size = differences.shape[0].value or array_ops.shape(differences)[0]
+    batch_size = differences.shape.dims[0].value or array_ops.shape(
+        differences)[0]
     alpha_shape = [batch_size] + [1] * (differences.shape.ndims - 1)
     alpha = random_ops.random_uniform(shape=alpha_shape)
     interpolates = real_data + (alpha * differences)
@@ -739,11 +738,16 @@ def least_squares_discriminator_loss(
 def _validate_distributions(distributions):
   if not isinstance(distributions, (list, tuple)):
     raise ValueError('`distributions` must be a list or tuple. Instead, '
-                     'found %s.', type(distributions))
+                     'found %s.' % type(distributions))
   for x in distributions:
-    if not isinstance(x, ds.Distribution):
+    # We used to check with `isinstance(x, tf.distributions.Distribution)`.
+    # However, distributions have migrated to `tfp.distributions.Distribution`,
+    # which is a new code repo, so we can't check this way anymore until
+    # TF-GAN is migrated to a new repo as well.
+    # This new check is not sufficient, but is a useful heuristic for now.
+    if not callable(getattr(x, 'log_prob', None)):
       raise ValueError('`distributions` must be a list of `Distributions`. '
-                       'Instead, found %s.', type(x))
+                       'Instead, found %s.' % type(x))
 
 
 def _validate_information_penalty_inputs(
@@ -816,7 +820,7 @@ def _numerically_stable_global_norm(tensor_list):
   Returns:
     A scalar tensor with the global norm.
   """
-  if np.all([x is None for x in tensor_list]):
+  if all(x is None for x in tensor_list):
     return 0.0
 
   list_max = math_ops.reduce_max([math_ops.reduce_max(math_ops.abs(x)) for x in
diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py
index b9ac1bf15138c7e7d15ab3ebdac605d84921b6e5..969b68449d9c82f9f9144a8657cd8932b38fd0f7 100644
--- a/tensorflow/contrib/gan/python/namedtuples.py
+++ b/tensorflow/contrib/gan/python/namedtuples.py
@@ -213,7 +213,8 @@ class GANTrainOps(
     collections.namedtuple('GANTrainOps', (
         'generator_train_op',
         'discriminator_train_op',
-        'global_step_inc_op'
+        'global_step_inc_op',
+        'train_hooks'
     ))):
   """GANTrainOps contains the training ops.
 
@@ -221,8 +222,17 @@ class GANTrainOps(
     generator_train_op: Op that performs a generator update step.
     discriminator_train_op: Op that performs a discriminator update step.
     global_step_inc_op: Op that increments the shared global step.
+    train_hooks: a list or tuple containing hooks related to training that need
+      to be populated when training ops are instantiated. Used primarily for
+      sync hooks.
   """
 
+  def __new__(cls, generator_train_op, discriminator_train_op,
+              global_step_inc_op, train_hooks=()):
+    return super(GANTrainOps, cls).__new__(cls, generator_train_op,
+                                           discriminator_train_op,
+                                           global_step_inc_op, train_hooks)
+
 
 class GANTrainSteps(
     collections.namedtuple('GANTrainSteps', (
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 9e5aea1498a7e9d47480af18cad9f80ede84c0f9..4c7bee41b33ce1fee46d374ca5fd1c0b603762f9 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -45,7 +45,6 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.distributions import distribution as ds
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import session_run_hook
@@ -115,7 +114,7 @@ def gan_model(
     discriminator_gen_outputs = discriminator_fn(generated_data,
                                                  generator_inputs)
   with variable_scope.variable_scope(dis_scope, reuse=True):
-    real_data = ops.convert_to_tensor(real_data)
+    real_data = _convert_tensor_or_l_or_d(real_data)
     discriminator_real_outputs = discriminator_fn(real_data, generator_inputs)
 
   if check_shapes:
@@ -925,6 +924,7 @@ def gan_train_ops(
     generator_optimizer,
     discriminator_optimizer,
     check_for_unused_update_ops=True,
+    is_chief=True,
     # Optional args to pass directly to the `create_train_op`.
     **kwargs):
   """Returns GAN train ops.
@@ -940,6 +940,8 @@ def gan_train_ops(
     discriminator_optimizer: The optimizer for the discriminator updates.
     check_for_unused_update_ops: If `True`, throws an exception if there are
       update ops outside of the generator or discriminator scopes.
+    is_chief: Specifies whether or not the training is being run by the primary
+      replica during replica training.
     **kwargs: Keyword args to pass directly to
       `training.create_train_op` for both the generator and
       discriminator train op.
@@ -981,6 +983,9 @@ def gan_train_ops(
       kwargs, model.generator_scope.name, model.discriminator_scope.name,
       check_for_unused_update_ops)
 
+  # Get the sync hooks if these are needed.
+  sync_hooks = []
+
   generator_global_step = None
   if isinstance(generator_optimizer,
                 sync_replicas_optimizer.SyncReplicasOptimizer):
@@ -996,6 +1001,7 @@ def gan_train_ops(
         trainable=False,
         collections=[ops.GraphKeys.GLOBAL_VARIABLES])
     gen_update_ops += [generator_global_step.assign(global_step)]
+    sync_hooks.append(generator_optimizer.make_session_run_hook(is_chief))
   with ops.name_scope('generator_train'):
     gen_train_op = training.create_train_op(
         total_loss=loss.generator_loss,
@@ -1017,6 +1023,7 @@ def gan_train_ops(
         trainable=False,
         collections=[ops.GraphKeys.GLOBAL_VARIABLES])
     dis_update_ops += [discriminator_global_step.assign(global_step)]
+    sync_hooks.append(discriminator_optimizer.make_session_run_hook(is_chief))
   with ops.name_scope('discriminator_train'):
     disc_train_op = training.create_train_op(
         total_loss=loss.discriminator_loss,
@@ -1026,7 +1033,8 @@ def gan_train_ops(
         update_ops=dis_update_ops,
         **kwargs)
 
-  return namedtuples.GANTrainOps(gen_train_op, disc_train_op, global_step_inc)
+  return namedtuples.GANTrainOps(gen_train_op, disc_train_op, global_step_inc,
+                                 sync_hooks)
 
 
 # TODO(joelshor): Implement a dynamic GAN train loop, as in `Real-Time Adaptive
@@ -1067,13 +1075,24 @@ def get_sequential_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
                                      train_steps.generator_train_steps)
     discriminator_hook = RunTrainOpsHook(train_ops.discriminator_train_op,
                                          train_steps.discriminator_train_steps)
-    return [generator_hook, discriminator_hook]
+    return [generator_hook, discriminator_hook] + list(train_ops.train_hooks)
 
   return get_hooks
 
 
+def _num_joint_steps(train_steps):
+  g_steps = train_steps.generator_train_steps
+  d_steps = train_steps.discriminator_train_steps
+  # Get the number of each type of step that should be run.
+  num_d_and_g_steps = min(g_steps, d_steps)
+  num_g_steps = g_steps - num_d_and_g_steps
+  num_d_steps = d_steps - num_d_and_g_steps
+
+  return num_d_and_g_steps, num_g_steps, num_d_steps
+
+
 def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
-  """Returns a hooks function for sequential GAN training.
+  """Returns a hooks function for joint GAN training.
 
   When using these train hooks, IT IS RECOMMENDED TO USE `use_locking=True` ON
   ALL OPTIMIZERS TO AVOID RACE CONDITIONS.
@@ -1106,12 +1125,7 @@ def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
   Returns:
     A function that takes a GANTrainOps tuple and returns a list of hooks.
   """
-  g_steps = train_steps.generator_train_steps
-  d_steps = train_steps.discriminator_train_steps
-  # Get the number of each type of step that should be run.
-  num_d_and_g_steps = min(g_steps, d_steps)
-  num_g_steps = g_steps - num_d_and_g_steps
-  num_d_steps = d_steps - num_d_and_g_steps
+  num_d_and_g_steps, num_g_steps, num_d_steps = _num_joint_steps(train_steps)
 
   def get_hooks(train_ops):
     g_op = train_ops.generator_train_op
@@ -1121,7 +1135,7 @@ def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
     g_hook = RunTrainOpsHook(g_op, num_g_steps)
     d_hook = RunTrainOpsHook(d_op, num_d_steps)
 
-    return [joint_hook, g_hook, d_hook]
+    return [joint_hook, g_hook, d_hook] + list(train_ops.train_hooks)
 
   return get_hooks
 
@@ -1264,10 +1278,6 @@ def _validate_distributions(distributions_l, noise_l):
   if not isinstance(distributions_l, (tuple, list)):
     raise ValueError('`predicted_distributions` must be a list. Instead, found '
                      '%s.' % type(distributions_l))
-  for dist in distributions_l:
-    if not isinstance(dist, ds.Distribution):
-      raise ValueError('Every element in `predicted_distributions` must be a '
-                       '`tf.Distribution`. Instead, found %s.' % type(dist))
   if len(distributions_l) != len(noise_l):
     raise ValueError('Length of `predicted_distributions` %i must be the same '
                      'as the length of structured noise %i.' %
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index 64d670619905a427a84bee4b661228abca591fae..841f25cd7f1852767776eed2dcbf2522d8b0743b 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -519,7 +519,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase):
     """Test output type."""
     loss = train.gan_loss(get_gan_model_fn(), add_summaries=True)
     self.assertIsInstance(loss, namedtuples.GANLoss)
-    self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
+    self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES))
 
   @parameterized.named_parameters(
       ('cyclegan', create_cyclegan_model),
@@ -528,7 +528,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase):
   def test_cyclegan_output_type(self, get_gan_model_fn):
     loss = train.cyclegan_loss(get_gan_model_fn(), add_summaries=True)
     self.assertIsInstance(loss, namedtuples.CycleGANLoss)
-    self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
+    self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES))
 
   @parameterized.named_parameters(
       ('gan', create_gan_model, False),
@@ -759,7 +759,7 @@ class TensorPoolAdjusteModelTest(test.TestCase):
           # For [pool_size, ?), the pool is full, tensor2 must be equal to some
           # historical values of tensor1 (which is previously stored in the
           # pool).
-          self.assertTrue(any([(v == t2).all() for v in history_values]))
+          self.assertTrue(any((v == t2).all() for v in history_values))
 
   def _make_new_model_and_check(self, model, pool_size):
     pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=pool_size)
@@ -836,6 +836,9 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase):
 
     self.assertIsInstance(train_ops, namedtuples.GANTrainOps)
 
+    # Make sure there are no training hooks populated accidentally.
+    self.assertEmpty(train_ops.train_hooks)
+
   # TODO(joelshor): Add a test to check that custom update op is run.
   @parameterized.named_parameters(
       ('gan', create_gan_model, False),
@@ -923,8 +926,15 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase):
         model, loss, generator_optimizer=g_opt, discriminator_optimizer=d_opt)
     self.assertIsInstance(train_ops, namedtuples.GANTrainOps)
     # No new trainable variables should have been added.
-    self.assertEqual(num_trainable_vars,
-                     len(variables_lib.get_trainable_variables()))
+    self.assertLen(variables_lib.get_trainable_variables(), num_trainable_vars)
+
+    # Sync hooks should be populated in the GANTrainOps.
+    self.assertLen(train_ops.train_hooks, 2)
+    for hook in train_ops.train_hooks:
+      self.assertIsInstance(
+          hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)
+    sync_opts = [hook._sync_optimizer for hook in train_ops.train_hooks]
+    self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt)))
 
     g_sync_init_op = g_opt.get_init_tokens_op(num_tokens=1)
     d_sync_init_op = d_opt.get_init_tokens_op(num_tokens=1)
@@ -959,6 +969,32 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase):
       coord.request_stop()
       coord.join(g_threads + d_threads)
 
+  @parameterized.named_parameters(
+      ('is_chief', True),
+      ('is_not_chief', False),
+  )
+  def test_is_chief_in_train_hooks(self, is_chief):
+    """Make sure is_chief is propagated correctly to sync hooks."""
+    model = create_gan_model()
+    loss = train.gan_loss(model)
+    g_opt = get_sync_optimizer()
+    d_opt = get_sync_optimizer()
+    train_ops = train.gan_train_ops(
+        model,
+        loss,
+        g_opt,
+        d_opt,
+        is_chief=is_chief,
+        summarize_gradients=True,
+        colocate_gradients_with_ops=True)
+
+    self.assertLen(train_ops.train_hooks, 2)
+    for hook in train_ops.train_hooks:
+      self.assertIsInstance(
+          hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)
+    is_chief_list = [hook._is_chief for hook in train_ops.train_hooks]
+    self.assertListEqual(is_chief_list, [is_chief, is_chief])
+
 
 class GANTrainTest(test.TestCase, parameterized.TestCase):
   """Tests for `gan_train`."""
@@ -1036,6 +1072,44 @@ class GANTrainTest(test.TestCase, parameterized.TestCase):
     self.assertTrue(np.isscalar(final_loss))
     self.assertEqual(17.0, final_loss)
 
+  @parameterized.named_parameters(
+      ('gan', create_gan_model),
+      ('callable_gan', create_callable_gan_model),
+      ('infogan', create_infogan_model),
+      ('callable_infogan', create_callable_infogan_model),
+      ('acgan', create_acgan_model),
+      ('callable_acgan', create_callable_acgan_model),
+  )
+  def test_train_hooks_exist_in_get_hooks_fn(self, create_gan_model_fn):
+    model = create_gan_model_fn()
+    loss = train.gan_loss(model)
+
+    g_opt = get_sync_optimizer()
+    d_opt = get_sync_optimizer()
+    train_ops = train.gan_train_ops(
+        model,
+        loss,
+        g_opt,
+        d_opt,
+        summarize_gradients=True,
+        colocate_gradients_with_ops=True)
+
+    sequential_train_hooks = train.get_sequential_train_hooks()(train_ops)
+    self.assertLen(sequential_train_hooks, 4)
+    sync_opts = [
+        hook._sync_optimizer for hook in sequential_train_hooks if
+        isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)]
+    self.assertLen(sync_opts, 2)
+    self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt)))
+
+    joint_train_hooks = train.get_joint_train_hooks()(train_ops)
+    self.assertLen(joint_train_hooks, 5)
+    sync_opts = [
+        hook._sync_optimizer for hook in joint_train_hooks if
+        isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)]
+    self.assertLen(sync_opts, 2)
+    self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt)))
+
 
 class PatchGANTest(test.TestCase, parameterized.TestCase):
   """Tests that functions work on PatchGAN style output."""
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index bb06f1c41c1d60f3c3b3639e3b32ea85161510b2..53587fcf3050f313c85485f77ce411cba7faccff 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include <fstream>
 #include <list>
 #include <map>
-#include <set>
 
 #include <fcntl.h>
 #include <rdma/rdma_cma.h>
@@ -30,19 +29,17 @@ limitations under the License.
 #include <sys/epoll.h>
 
 #include "tensorflow/contrib/gdr/gdr.pb.h"
-#include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/common_runtime/process_state.h"
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #endif  // GOOGLE_CUDA
-#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 
 namespace tensorflow {
 
@@ -70,14 +67,11 @@ bool IsGDRAvailable() {
 int TryToReadNumaNode(ibv_device* device) {
 #if defined(__APPLE__)
   LOG(INFO) << "OS X does not support NUMA - returning NUMA node 0";
-  return 0;
+  return port::kNUMANoAffinity;
 #elif defined(PLATFORM_WINDOWS)
   // Windows support for NUMA is not currently implemented. Return node 0.
-  return 0;
+  return port::kNUMANoAffinity;
 #else
-  VLOG(2) << "Trying to read NUMA node for device: " << device->name;
-  static const int kUnknownNumaNode = -1;
-
   auto filename = string(device->ibdev_path) + "/device/numa_node";
 
   std::ifstream ifs(filename.c_str());
@@ -91,12 +85,12 @@ int TryToReadNumaNode(ibv_device* device) {
                 << value
                 << "), but there must be at least one NUMA node"
                    ", so returning NUMA node zero";
-      return 0;
+      return port::kNUMANoAffinity;
     }
     LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
     return value;
   }
-  return kUnknownNumaNode;
+  return port::kNUMANoAffinity;
 #endif
 }
 
@@ -138,8 +132,6 @@ class GdrMemoryManager : public RemoteMemoryManager {
       Device* device, DeviceContext* device_context, bool on_host,
       StatusCallback done) override;
 
-  static void RegMemVisitors();
-
  protected:
   Status CreateEndpoint(const string& host, const string& port,
                         RdmaEndpointPtr& endpoint);
@@ -150,7 +142,8 @@ class GdrMemoryManager : public RemoteMemoryManager {
 
   ibv_mr* FindMemoryRegion(void* addr, size_t length);
 
-  void InsertMemoryRegion(void* addr, size_t length);
+  void InsertMemoryRegion(void* addr, size_t length,
+                          const std::string& allocator_name);
 
   void EvictMemoryRegion(void* addr, size_t length);
 
@@ -160,6 +153,7 @@ class GdrMemoryManager : public RemoteMemoryManager {
   RdmaEndpointPtr listening_;
   std::atomic<bool> stopped_;
   int epfd_;
+  int numa_node_;
 
   // Server side endpoints
   // Accessed sequentially in Run() so not protected by lock
@@ -190,46 +184,10 @@ GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
       port_(port),
       listening_(nullptr, EndpointDeleter),
       stopped_(true),
-      next_key_(0) {
-  static std::once_flag flag;
-  std::call_once(flag, []() { RegMemVisitors(); });
-}
+      next_key_(0) {}
 
 GdrMemoryManager::~GdrMemoryManager() { close(epfd_); }
 
-/*static*/ void GdrMemoryManager::RegMemVisitors() {
-  SubAllocator::Visitor alloc_visitor = [](void* ptr, int numa_node,
-                                           size_t num_bytes) {
-    GdrMemoryManager::Singleton().InsertMemoryRegion(
-        ptr, num_bytes, strings::StrCat("CPU:", numa_node));
-  };
-  SubAllocator::Visitor free_visitor = [](void* ptr, int numa_node,
-                                          size_t num_bytes) {
-    GdrMemoryManager::Singleton().EvictMemoryRegion(ptr, num_bytes);
-  };
-  ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
-  ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
-
-#if GOOGLE_CUDA
-  if (IsGDRAvailable()) {
-    int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
-
-    // Note we don't free allocated GPU memory so there is no free visitor
-    SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
-                                                  size_t num_bytes) {
-      RdmaMemoryMgr::Singleton().InsertMemoryRegion(
-          ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
-    };
-    GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
-                                                     cuda_alloc_visitor);
-    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
-                                                          alloc_visitor);
-    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
-    LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
-  }
-#endif  // GOOGLE_CUDA
-}
-
 Status GdrMemoryManager::Init() {
   epfd_ = epoll_create1(0);
   if (epfd_ == -1) {
@@ -289,6 +247,44 @@ Status GdrMemoryManager::Init() {
                                "cannot add server to epoll");
   }
 
+  numa_node_ = TryToReadNumaNode(listening_->verbs->device);
+
+  SubAllocator::Visitor alloc_visitor = [this](void* ptr, int numa_node,
+                                               size_t num_bytes) {
+    VLOG(2) << "Registering RDMA capable memory region on numa_node "
+            << numa_node;
+    InsertMemoryRegion(ptr, num_bytes, strings::StrCat("CPU:", numa_node));
+  };
+  SubAllocator::Visitor free_visitor = [this](void* ptr, int numa_node,
+                                              size_t num_bytes) {
+    VLOG(2) << "De-registering RDMA capable memory region on numa_node "
+            << numa_node;
+    EvictMemoryRegion(ptr, num_bytes);
+  };
+  ProcessState::singleton()->AddCPUAllocVisitor(alloc_visitor);
+  ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
+  LOG(INFO) << "Instrumenting CPU allocator(s)";
+
+#if GOOGLE_CUDA
+  for (int numa_idx = 0; numa_idx < port::NUMANumNodes(); ++numa_idx) {
+    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(numa_idx,
+                                                          alloc_visitor);
+    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(numa_idx,
+                                                         free_visitor);
+  }
+  if (IsGDRAvailable()) {
+    SubAllocator::Visitor cuda_alloc_visitor = [this](void* ptr, int gpu_id,
+                                                      size_t num_bytes) {
+      VLOG(2) << "Registering RDMA capable memory region on GPU " << gpu_id;
+      InsertMemoryRegion(ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
+    };
+    for (int numa_idx = 0; numa_idx < port::NUMANumNodes(); ++numa_idx) {
+      GPUProcessState::singleton()->AddGPUAllocVisitor(numa_idx,
+                                                       cuda_alloc_visitor);
+    }
+    VLOG(1) << "Instrumenting GPU allocator(s) for all Numas";
+  }
+#endif  // GOOGLE_CUDA
   return Status::OK();
 }
 
@@ -405,7 +401,7 @@ void GdrMemoryManager::TransportOptionsFromTensor(
   ibv_mr* mr = FindMemoryRegion(addr, length);
 
 #if GOOGLE_CUDA
-  if (!on_host) {
+  if (device->tensorflow_gpu_device_info() && !on_host) {
     Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
     Tensor* host_copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
     GPUUtil::CopyGPUTensorToCPU(
@@ -456,11 +452,27 @@ void GdrMemoryManager::TransportOptionsFromTensor(
 #endif
 
   if (mr == nullptr) {
-    done(errors::Unavailable("Cannot find pinned memory region"));
-    return;
+    Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
+    Tensor host_copy(alloc, tensor.dtype(), tensor.shape());
+
+    std::memcpy(DMAHelper::buffer(&host_copy)->data(), buffer->data(), length);
+    VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
+
+    buffer = DMAHelper::buffer(&host_copy);
+    addr = buffer->data();
+    length = buffer->size();
+
+    mr = FindMemoryRegion(addr, length);
+    if (mr == nullptr) {
+      done(errors::Unavailable("Cannot find pinned memory region"));
+      return;
+    }
+
+    buffer->Ref();
+  } else {
+    buffer->Ref();
   }
 
-  buffer->Ref();
   TensorKey tensor_key = next_key_++;
   {
     mutex_lock l(server_mu_);
@@ -470,7 +482,7 @@ void GdrMemoryManager::TransportOptionsFromTensor(
   uint64_t checksum = 0;
   if (VLOG_IS_ON(2)) {
 #ifdef GOOGLE_CUDA
-    if (!on_host) {
+    if (device->tensorflow_gpu_device_info() && !on_host) {
       checksum = GPUUtil::Checksum(device, device_context, tensor);
     } else {
       checksum = GPUUtil::Checksum(tensor);
@@ -508,7 +520,8 @@ void GdrMemoryManager::TensorFromTransportOptions(
   Tensor host_copy;
 #if GOOGLE_CUDA
   if (mr == nullptr && !on_host) {
-    Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
+    Allocator* alloc =
+        GPUProcessState::singleton()->GetCUDAHostAllocator(numa_node_);
     host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
     buffer = DMAHelper::buffer(&host_copy);
     addr = buffer->data();
@@ -518,8 +531,18 @@ void GdrMemoryManager::TensorFromTransportOptions(
 #endif  // GOOGLE_CUDA
 
   if (mr == nullptr) {
-    done(errors::Unavailable("Cannot find pinned memory region"));
-    return;
+    Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
+    host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
+
+    buffer = DMAHelper::buffer(&host_copy);
+    addr = buffer->data();
+    length = buffer->size();
+
+    mr = FindMemoryRegion(addr, length);
+    if (mr == nullptr) {
+      done(errors::Unavailable("Cannot find pinned memory region"));
+      return;
+    }
   }
 
   decltype(clients_)::iterator iter;
@@ -568,7 +591,8 @@ void GdrMemoryManager::TensorFromTransportOptions(
   }
 
 #if GOOGLE_CUDA
-  if (host_copy.NumElements() > 0) {
+  if (device->tensorflow_gpu_device_info() && !on_host &&
+      host_copy.NumElements() > 0) {
     uint64_t checksum = 0;
     if (VLOG_IS_ON(2)) {
       checksum = GPUUtil::Checksum(host_copy);
@@ -598,6 +622,12 @@ void GdrMemoryManager::TensorFromTransportOptions(
   }
 #endif  // GOOGLE_CUDA
 
+  if ((on_host || !device->tensorflow_gpu_device_info()) &&
+      host_copy.NumElements() > 0) {
+    std::memcpy(DMAHelper::buffer(tensor)->data(), addr, length);
+    VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
+  }
+
   uint64_t end = Env::Default()->NowMicros();
 
   VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
@@ -607,7 +637,7 @@ void GdrMemoryManager::TensorFromTransportOptions(
   uint64_t checksum = 0;
   if (VLOG_IS_ON(2)) {
 #ifdef GOOGLE_CUDA
-    if (device->tensorflow_gpu_device_info() && (!on_host)) {
+    if (device->tensorflow_gpu_device_info() && !on_host) {
       checksum = GPUUtil::Checksum(device, device_context, *tensor);
     } else {
       checksum = GPUUtil::Checksum(*tensor);
@@ -668,7 +698,8 @@ ibv_mr* GdrMemoryManager::FindMemoryRegion(void* addr, size_t length) {
   }
 }
 
-void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length) {
+void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length,
+                                          const std::string& allocator_name) {
   if (length == 0) return;
   ibv_mr* mr = rdma_reg_read(listening_.get(), addr, length);
   if (mr != nullptr) {
@@ -676,7 +707,8 @@ void GdrMemoryManager::InsertMemoryRegion(void* addr, size_t length) {
     auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
     mrs_.insert(iter, {mr, &MRDeleter});
   } else {
-    LOG(WARNING) << "Cannot register memory region";
+    LOG(WARNING) << "Cannot register memory region allocated by "
+                 << allocator_name;
   }
 }
 
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 94f522c04e5a09ed2d9355fa675125c340407923..fbccbead03fc0d641db40ede661bf3677d44c45d 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -170,6 +170,14 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
     // Record "call" in active_ so that it can be aborted cleanly.
     RegisterCall(call);
 
+    // RendezvousMgr already aborted, shouldn't send RPC call any more
+    if (!call->status().ok()) {
+      done(call->status(), Args(), Args(), Tensor(), false);
+      session()->worker_cache->ReleaseWorker(src_worker, rwi);
+      delete call;
+      return;
+    }
+
     // Start "call".
     Ref();
     call->Start([this, call, src_worker, rwi, done]() {
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc
index 9025c992a4467f521d6d8d514e6a5e92f5492947..b3f48ec1dd9c75055f4e1ea76eb203b6ccf94718 100644
--- a/tensorflow/contrib/gdr/gdr_server_lib.cc
+++ b/tensorflow/contrib/gdr/gdr_server_lib.cc
@@ -52,14 +52,15 @@ Status GdrServer::Init() {
       [this](const WorkerEnv* env) {
         return new GdrRendezvousMgr(env, remote_memory_manager_.get());
       };
-  WorkerCreationFunction worker_func = [this](WorkerEnv* env) {
+  WorkerCreationFunction worker_func = [this](WorkerEnv* env,
+                                              const ConfigProto& config) {
     return std::unique_ptr<GdrWorker>(
-        new GdrWorker(env, remote_memory_manager_.get()));
+        new GdrWorker(env, config, remote_memory_manager_.get()));
   };
-  TF_RETURN_IF_ERROR(
-      GrpcServer::Init(nullptr, rendezvous_mgr_func, nullptr, worker_func));
 
-  return remote_memory_manager_->Init();
+  TF_RETURN_IF_ERROR(remote_memory_manager_->Init());
+
+  return GrpcServer::Init(nullptr, rendezvous_mgr_func, nullptr, worker_func);
 }
 
 Status GdrServer::Start() {
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
index ce1d8d2d73000559f03046aceacb169890ecc1b6..867cb83f42034c8e9061e333ea671457745f92c3 100644
--- a/tensorflow/contrib/gdr/gdr_worker.cc
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -39,9 +39,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-GdrWorker::GdrWorker(WorkerEnv* worker_env,
+GdrWorker::GdrWorker(WorkerEnv* worker_env, const ConfigProto& config,
                      RemoteMemoryManager* remote_memory_manager)
-    : GrpcWorker(worker_env),
+    : GrpcWorker(worker_env, config),
       remote_memory_manager_(remote_memory_manager),
       recv_tensor_recent_request_ids_(100000) {}
 
diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h
index 65105ed997300aa77202301cdd8dddacb0309880..39f11e6bde5a1ca7ae91ead02279d22d70af027b 100644
--- a/tensorflow/contrib/gdr/gdr_worker.h
+++ b/tensorflow/contrib/gdr/gdr_worker.h
@@ -25,7 +25,8 @@ namespace tensorflow {
 
 class GdrWorker : public GrpcWorker {
  public:
-  GdrWorker(WorkerEnv* env, RemoteMemoryManager* remote_memory_manager);
+  GdrWorker(WorkerEnv* env, const ConfigProto& config,
+            RemoteMemoryManager* remote_memory_manager);
 
   // Serve the RecvTensorRequest but omit the tensor content and transmit it
   // out-of-band using GPU Direct RDMA whenever possible.
diff --git a/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py b/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py
index 27aed091c249caa6e50748419a93f3579e6632a4..363a3c9b4a59eb8f769b16a11a3ade0643358c64 100644
--- a/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py
+++ b/tensorflow/contrib/grid_rnn/python/kernel_tests/grid_rnn_test.py
@@ -696,8 +696,8 @@ class GridRNNCellTest(test.TestCase):
 
     for out, inp in zip(outputs, inputs):
       self.assertEqual(len(out), 1)
-      self.assertTrue(out[0].get_shape()[0].value is None)
-      self.assertEqual(out[0].get_shape()[1], num_units)
+      self.assertTrue(out[0].get_shape().dims[0].value is None)
+      self.assertEqual(out[0].get_shape().dims[1], num_units)
       self.assertEqual(out[0].dtype, inp.dtype)
 
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py b/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
index bcd2a34c4e791a2ab66a439109145d6b78c14e22..5f3af43a474a12787a111f8674b2b7bf0bb2481a 100644
--- a/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
+++ b/tensorflow/contrib/grid_rnn/python/ops/grid_rnn_cell.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from collections import namedtuple
 import functools
 
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -281,7 +282,8 @@ class GridRNNCell(rnn.RNNCell):
     """
     conf = self._config
 
-    if (inputs is not None and inputs.get_shape().with_rank(2)[1].value > 0 and
+    if (inputs is not None and
+        tensor_shape.dimension_value(inputs.shape.with_rank(2)[1]) > 0 and
         conf.inputs):
       if isinstance(inputs, tuple):
         if len(conf.inputs) != len(inputs):
@@ -291,7 +293,8 @@ class GridRNNCell(rnn.RNNCell):
       else:
         input_splits = array_ops.split(
             value=inputs, num_or_size_splits=len(conf.inputs), axis=1)
-      input_sz = input_splits[0].get_shape().with_rank(2)[1].value
+      input_sz = tensor_shape.dimension_value(
+          input_splits[0].shape.with_rank(2)[1])
 
       for i, j in enumerate(conf.inputs):
         input_project_m = vs.get_variable(
diff --git a/tensorflow/contrib/hadoop/BUILD b/tensorflow/contrib/hadoop/BUILD
index ccad31efa1dba92d954ff1cb455b6c9c784b29bc..178a8a6f08410bd9e5b61db47a3866ec6060a48c 100644
--- a/tensorflow/contrib/hadoop/BUILD
+++ b/tensorflow/contrib/hadoop/BUILD
@@ -7,12 +7,12 @@ exports_files(["LICENSE"])
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
-    "tf_custom_op_py_library",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
     "tf_kernel_library",
     "tf_py_test",
 )
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 filegroup(
     name = "test_data",
diff --git a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
index f7f1189bb93c611719186a697c40f371644f63a2..bc941ae9f23eaa5c46fcca95b9aba0ac0d87960a 100644
--- a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
+++ b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.contrib.hadoop.python.ops import hadoop_dataset_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -47,7 +48,7 @@ class SequenceFileDatasetTest(test.TestCase):
 
     dataset = hadoop_dataset_ops.SequenceFileDataset(filenames).repeat(
         num_repeats)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
index bf398b838dfaaff6fdaf33a6cd7086ef13e43a3e..5c5599858ee6879a5703d65658bf4bbd881c7e72 100644
--- a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
+++ b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
@@ -20,10 +20,9 @@ from __future__ import print_function
 from tensorflow.contrib.hadoop.python.ops import gen_dataset_ops
 from tensorflow.contrib.hadoop.python.ops import hadoop_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 
 
 class SequenceFileDataset(dataset_ops.DatasetSource):
@@ -40,15 +39,12 @@ class SequenceFileDataset(dataset_ops.DatasetSource):
     For example:
 
     ```python
+    tf.enable_eager_execution()
+
     dataset = tf.contrib.hadoop.SequenceFileDataset("/foo/bar.seq")
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
     # Prints the (key, value) pairs inside a hadoop sequence file.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
+    for key, value in dataset:
+      print(key, value)
     ```
 
     Args:
@@ -60,16 +56,10 @@ class SequenceFileDataset(dataset_ops.DatasetSource):
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.sequence_file_dataset(
-        self._filenames, nest.flatten(self.output_types))
-
-  @property
-  def output_classes(self):
-    return ops.Tensor, ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
+        self._filenames, self._element_structure._flat_types)  # pylint: disable=protected-access
 
   @property
-  def output_types(self):
-    return dtypes.string, dtypes.string
+  def _element_structure(self):
+    return structure.NestedStructure(
+        (structure.TensorStructure(dtypes.string, []),
+         structure.TensorStructure(dtypes.string, [])))
diff --git a/tensorflow/contrib/ignite/BUILD b/tensorflow/contrib/ignite/BUILD
index 9393b702d11a2ef84586f712d30c26fe2a8972bb..2698b83a56a1121fa30f5b05ffa027b4dfd4ba95 100644
--- a/tensorflow/contrib/ignite/BUILD
+++ b/tensorflow/contrib/ignite/BUILD
@@ -22,48 +22,92 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_ops",
+        ":igfs_ops",
     ],
 )
 
 tf_custom_op_library(
-    name = "_dataset_ops.so",
-    srcs = ["ops/dataset_ops.cc"],
-    deps = [":dataset_kernels"],
+    name = "_ignite_ops.so",
+    srcs = [
+        "kernels/igfs/igfs.h",
+        "ops/dataset_ops.cc",
+        "ops/igfs_ops.cc",
+    ],
+    deps = [
+        ":dataset_kernels",
+        ":igfs_kernels",
+    ],
 )
 
 tf_gen_op_libs(
     op_lib_names = ["dataset_ops"],
 )
 
+tf_gen_op_libs(
+    op_lib_names = ["igfs_ops"],
+    deps = [":igfs_kernels"],
+)
+
 cc_library(
-    name = "dataset_kernels",
+    name = "ignite_client",
     srcs = [
-        "kernels/ignite_dataset_ops.cc",
-        "kernels/ignite_client.h",
-        "kernels/ignite_byte_swapper.h",
-        "kernels/ignite_plain_client.h",
-        "kernels/ignite_ssl_wrapper.h",
-        "kernels/ignite_ssl_wrapper.cc",
-        "kernels/ignite_binary_object_parser.h",
-        "kernels/ignite_binary_object_parser.cc",
-        "kernels/ignite_dataset.h",
-        "kernels/ignite_dataset.cc",
-        "kernels/ignite_dataset_iterator.h",
-        "kernels/ignite_dataset_iterator.cc",
+        "kernels/client/ignite_client.h",
+        "kernels/client/ignite_byte_swapper.h",
+        "kernels/client/ignite_plain_client.h",
+        "kernels/client/ignite_ssl_wrapper.h",
+        "kernels/client/ignite_ssl_wrapper.cc",
     ] + if_not_windows([
-        "kernels/ignite_plain_client_unix.cc",
+        "kernels/client/ignite_plain_client_unix.cc",
     ]) + if_windows([
-        "kernels/ignite_plain_client_windows.cc",
+        "kernels/client/ignite_plain_client_windows.cc",
     ]),
     copts = if_windows([
         "-DWIN32_LEAN_AND_MEAN",
     ]),
     deps = [
         "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
         "@boringssl//:ssl",
         "@protobuf_archive//:protobuf_headers",
     ],
+)
+
+cc_library(
+    name = "dataset_kernels",
+    srcs = [
+        "kernels/dataset/ignite_binary_object_parser.cc",
+        "kernels/dataset/ignite_binary_object_parser.h",
+        "kernels/dataset/ignite_dataset.cc",
+        "kernels/dataset/ignite_dataset.h",
+        "kernels/dataset/ignite_dataset_iterator.cc",
+        "kernels/dataset/ignite_dataset_iterator.h",
+        "kernels/dataset/ignite_dataset_ops.cc",
+    ],
+    deps = [
+        ":ignite_client",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "igfs_kernels",
+    srcs = [
+        "kernels/igfs/igfs.cc",
+        "kernels/igfs/igfs.h",
+        "kernels/igfs/igfs_client.cc",
+        "kernels/igfs/igfs_client.h",
+        "kernels/igfs/igfs_extended_tcp_client.cc",
+        "kernels/igfs/igfs_extended_tcp_client.h",
+        "kernels/igfs/igfs_messages.cc",
+        "kernels/igfs/igfs_messages.h",
+        "kernels/igfs/igfs_random_access_file.cc",
+        "kernels/igfs/igfs_random_access_file.h",
+        "kernels/igfs/igfs_writable_file.cc",
+        "kernels/igfs/igfs_writable_file.h",
+    ],
+    deps = [":ignite_client"],
     alwayslink = 1,
 )
 
@@ -82,10 +126,29 @@ py_library(
     ],
 )
 
+py_library(
+    name = "igfs_ops",
+    srcs = [
+        "python/ops/igfs_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":igfs_op_loader",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
 tf_gen_op_wrapper_py(
     name = "gen_dataset_ops",
     out = "python/ops/gen_dataset_ops.py",
-    deps = ["//tensorflow/contrib/ignite:dataset_ops_op_lib"],
+    deps = [":dataset_ops_op_lib"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_igfs_ops",
+    out = "python/ops/gen_igfs_ops.py",
+    deps = [":igfs_ops_op_lib"],
 )
 
 tf_kernel_library(
@@ -97,13 +160,22 @@ tf_kernel_library(
     alwayslink = 1,
 )
 
+tf_kernel_library(
+    name = "igfs_ops_kernels",
+    deps = [
+        ":igfs_kernels",
+        "//tensorflow/core:framework",
+    ],
+    alwayslink = 1,
+)
+
 tf_custom_op_py_library(
     name = "ignite_op_loader",
     srcs = ["python/ops/ignite_op_loader.py"],
-    dso = ["//tensorflow/contrib/ignite:_dataset_ops.so"],
+    dso = [":_ignite_ops.so"],
     kernels = [
         ":dataset_ops_kernels",
-        "//tensorflow/contrib/ignite:dataset_ops_op_lib",
+        ":dataset_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -113,6 +185,22 @@ tf_custom_op_py_library(
     ],
 )
 
+tf_custom_op_py_library(
+    name = "igfs_op_loader",
+    srcs = ["python/ops/igfs_op_loader.py"],
+    dso = [":_ignite_ops.so"],
+    kernels = [
+        ":igfs_ops_kernels",
+        ":igfs_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_igfs_ops",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
 # The Apache Ignite servers have to setup before the test and tear down
 # after the test manually. The docker engine has to be installed.
 #
@@ -122,8 +210,11 @@ tf_custom_op_py_library(
 # To tear down Apache Ignite servers:
 # $ bash ./python/tests/stop_ignite.sh
 tf_py_test(
-    name = "ignite_dataset_test",
-    srcs = ["python/tests/ignite_dataset_test.py"],
+    name = "ignite_test",
+    srcs = [
+        "python/tests/igfs_test.py",
+        "python/tests/ignite_dataset_test.py",
+    ],
     additional_deps = [
         ":ignite",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index 55c89d27996318dabb29bb15372411005301ebd9..5a8c650fb927be0c835aaceffc516c048195c7bf 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -1,19 +1,32 @@
-# Ignite Dataset
-
-- [Overview](#overview)
-- [Features](#features)
-  * [Distributed In-Memory Datasource](#distributed-in-memory-datasource)
-  * [Structured Objects](#structured-objects)
-  * [Distributed Training](#distributed-training)
-  * [SSL Connection](#ssl-connection)
-  * [Windows Support](#windows-support)
-- [Try it out](#try-it-out)
-- [Limitations](#limitations)
+# Apache Ignite Integration
+
+-   [Overview](#overview)
+-   [Features](#features)
+    *   [Distributed In-Memory Datasource](#distributed-in-memory-datasource)
+    *   [Structured Objects](#structured-objects)
+    *   [Distributed Training](#distributed-training)
+    *   [Distributed File System](#distributed-file-system)
+    *   [SSL Connection](#ssl-connection)
+    *   [Windows Support](#windows-support)
+-   [Try it out](#try-it-out)
+    *   [Ignite Dataset](#ignite-dataset)
+    *   [IGFS](#igfs)
+-   [Limitations](#limitations)
 
 ## Overview
 
-[Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed database, caching, and processing platform for
-transactional, analytical, and streaming workloads, delivering in-memory speeds at petabyte scale. This contrib package contains an integration between Apache Ignite and TensorFlow. The integration is based on [tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow side and [Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol) from Apache Ignite side. It allows to use Apache Ignite as a data source for neural network training, inference and all other computations supported by TensorFlow. 
+[Apache Ignite](https://ignite.apache.org/) is a memory-centric distributed
+database, caching, and processing platform for transactional, analytical, and
+streaming workloads, delivering in-memory speeds at petabyte scale. This contrib
+package contains an integration between Apache Ignite and TensorFlow. The
+integration is based on
+[tf.data](https://www.tensorflow.org/api_docs/python/tf/data) from TensorFlow
+side and
+[Binary Client Protocol](https://apacheignite.readme.io/v2.6/docs/binary-client-protocol)
+from Apache Ignite side. It allows to use Apache Ignite as a data source for
+neural network training, inference and all other computations supported by
+TensorFlow. Another part of this module is an integration with distributed file
+system based on Apache Ignite.
 
 ## Features
 
@@ -41,14 +54,12 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>> tf.enable_eager_execution()
+>>>
 >>> dataset = IgniteDataset(cache_name="SQL_PUBLIC_KITTEN_CACHE")
->>> iterator = dataset.make_one_shot_iterator()
->>> next_obj = iterator.get_next()
 >>>
->>> with tf.Session() as sess:
->>>   for _ in range(3):
->>>     print(sess.run(next_obj))
+>>> for element in dataset:
+>>>   print(element)
 
 {'key': 1, 'val': {'NAME': b'WARM KITTY'}}
 {'key': 2, 'val': {'NAME': b'SOFT KITTY'}}
@@ -61,23 +72,22 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>> tf.enable_eager_execution()
+>>>
 >>> dataset = IgniteDataset(cache_name="IMAGES")
->>> iterator = dataset.make_one_shot_iterator()
->>> next_obj = iterator.get_next()
 >>>
->>> with tf.Session() as sess:
->>>   print(sess.run(next_obj))
+>>> for element in dataset.take(1):
+>>>   print(element)
 
 {
-    'key': 'kitten.png', 
+    'key': 'kitten.png',
     'val': {
         'metadata': {
             'file_name': b'kitten.png',
             'label': b'little ball of fur',
-            width: 800, 
+            width: 800,
             height: 600
-        }, 
+        },
         'pixels': [0, 0, 0, 0, ..., 0]
     }
 }
@@ -87,13 +97,11 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>>
 >>> dataset = IgniteDataset(cache_name="IMAGES").map(lambda obj: obj['val']['pixels'])
->>> iterator = dataset.make_one_shot_iterator()
->>> next_obj = iterator.get_next()
 >>>
->>> with tf.Session() as sess:
->>>   print(sess.run(next_obj))
+>>> for element in dataset:
+>>>   print(element)
 
 [0, 0, 0, 0, ..., 0]
 ```
@@ -113,18 +121,18 @@ Ignite Dataset allows using these two aspects of distributed neural network trai
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>>
 >>> dataset = IgniteDataset("IMAGES")
 >>>
 >>> # Compute gradients locally on every worker node.
->>> gradients = []    
+>>> gradients = []
 >>> for i in range(5):
 >>>     with tf.device("/job:WORKER/task:%d" % i):
->>>         device_iterator = dataset.make_one_shot_iterator()
+>>>         device_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
 >>>         device_next_obj = device_iterator.get_next()
 >>>         gradient = compute_gradient(device_next_obj)
->>>         gradients.append(gradient)        
->>>        
+>>>         gradients.append(gradient)
+>>>
 >>> # Aggregate them on master node.
 >>> result_gradient = tf.reduce_sum(gradients)
 >>>
@@ -132,7 +140,24 @@ Ignite Dataset allows using these two aspects of distributed neural network trai
 >>>     print(sess.run(result_gradient))
 ```
 
-High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well. 
+High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well.
+
+### Distributed File System
+
+In addition to database functionality Apache Ignite provides a distributed file
+system called [IGFS](https://ignite.apache.org/features/igfs.html). IGFS
+delivers a similar functionality to Hadoop HDFS, but only in-memory. In fact, in
+addition to its own APIs, IGFS implements Hadoop FileSystem API and can be
+transparently plugged into Hadoop or Spark deployments. This contrib package
+contains an integration between IGFS and TensorFlow. The integration is based
+on [custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys)
+from TensorFlow side and
+[IGFS Native API](https://ignite.apache.org/features/igfs.html) from Apache
+Ignite side. It has numerous uses, for example: * Checkpoints of state can be
+saved to IGFS for reliability and fault-tolerance. * Training processes
+communicate with TensorBoard by writing event files to a directory, which
+TensorBoard watches. IGFS allows this communication to work even when
+TensorBoard runs in a different process or machine.
 
 ### SSL Connection
 
@@ -141,9 +166,12 @@ Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikip
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
->>> dataset = IgniteDataset(cache_name="IMAGES", certfile="client.pem", cert_password="password", username="ignite", password="ignite")
->>> ...
+>>>
+>>> dataset = IgniteDataset(cache_name="IMAGES",
+                            certfile="client.pem",
+                            cert_password="password",
+                            username="ignite",
+                            password="ignite")
 ```
 
 ### Windows Support
@@ -152,7 +180,16 @@ Ignite Dataset is fully compatible with Windows. You can use it as part of Tenso
 
 ## Try it out
 
-The simplest way to try Ignite Dataset is to run a [Docker](https://www.docker.com/) container with Apache Ignite and loaded [MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with it using Ignite Dataset. Such container is available on Docker Hub: [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/). You need to start this container on your machine:
+Following examples will help you to easily start working with this module.
+
+### Ignite Dataset
+
+The simplest way to try Ignite Dataset is to run a
+[Docker](https://www.docker.com/) container with Apache Ignite and loaded
+[MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with
+it using Ignite Dataset. Such container is available on Docker Hub:
+[dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/).
+You need to start this container on your machine:
 
 ```
 docker run -it -p 10800:10800 dmitrievanthony/ignite-with-mnist
@@ -162,6 +199,35 @@ After that you will be able to work with it following way:
 
 ![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist.png "Ignite Dataset Mnist")
 
+### IGFS
+
+The simplest way to try IGFS with TensorFlow is to run
+[Docker](https://www.docker.com/) container with Apache Ignite and enabled IGFS
+and then interruct with it using TensorFlow
+[tf.gfile](https://www.tensorflow.org/api_docs/python/tf/gfile). Such container
+is available on Docker Hub:
+[dmitrievanthony/ignite-with-igfs](https://hub.docker.com/r/dmitrievanthony/ignite-with-igfs/).
+You need to start this container on your machine:
+
+```
+docker run -it -p 10500:10500 dmitrievanthony/ignite-with-igfs
+```
+
+After that you will be able to work with it following way:
+
+```python
+>>> import tensorflow as tf
+>>> import tensorflow.contrib.ignite.python.ops.igfs_ops
+>>>
+>>> with tf.gfile.Open("igfs:///hello.txt", mode='w') as w:
+>>>   w.write("Hello, world!")
+>>>
+>>> with tf.gfile.Open("igfs:///hello.txt", mode='r') as r:
+>>>   print(r.read())
+
+Hello, world!
+```
+
 ## Limitations
 
 Presently, Ignite Dataset works with assumption that all objects in the cache have the same structure (homogeneous objects) and the cache contains at least one object. Another limitation concerns structured objects, Ignite Dataset does not support UUID, Maps and Object arrays that might be parts of an object structure.
diff --git a/tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..aac950fcc2aaf016959bbda876ac93df4baea417
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_BYTE_SWAPPER_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_BYTE_SWAPPER_H_
+
+#include <stdint.h>
+#include "tensorflow/core/platform/byte_order.h"
+
+namespace tensorflow {
+
+class ByteSwapper {
+ public:
+  ByteSwapper(bool big_endian) { swap_ = big_endian == port::kLittleEndian; }
+
+  void SwapIfRequiredInt16(int16_t *x) const {
+    if (swap_) {
+      Swap16(x);
+    }
+  }
+
+  void SwapIfRequiredUnsignedInt16(uint16_t *x) const {
+    if (swap_) {
+      Swap16(reinterpret_cast<int16_t *>(x));
+    }
+  }
+
+  void SwapIfRequiredInt32(int32_t *x) const {
+    if (swap_) {
+      Swap32(x);
+    }
+  }
+
+  void SwapIfRequiredFloat(float *x) const {
+    if (swap_) {
+      Swap32(reinterpret_cast<int32_t *>(x));
+    }
+  }
+
+  void SwapIfRequiredInt64(int64_t *x) const {
+    if (swap_) {
+      Swap64(x);
+    }
+  }
+
+  void SwapIfRequiredDouble(double *x) const {
+    if (swap_) {
+      Swap64(reinterpret_cast<int64_t *>(x));
+    }
+  }
+
+  void SwapIfRequiredInt16Arr(int16_t *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++) Swap16(&x[i]);
+    }
+  }
+
+  void SwapIfRequiredUnsignedInt16Arr(uint16_t *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++)
+        Swap16(reinterpret_cast<int16_t *>(&x[i]));
+    }
+  }
+
+  void SwapIfRequiredInt32Arr(int32_t *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++) Swap32(&x[i]);
+    }
+  }
+
+  void SwapIfRequiredFloatArr(float *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++)
+        Swap32(reinterpret_cast<int32_t *>(&x[i]));
+    }
+  }
+
+  void SwapIfRequiredInt64Arr(int64_t *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++) Swap64(&x[i]);
+    }
+  }
+
+  void SwapIfRequiredDoubleArr(double *x, int32_t length) const {
+    if (swap_) {
+      for (int32_t i = 0; i < length; i++)
+        Swap64(reinterpret_cast<int64_t *>(&x[i]));
+    }
+  }
+
+ private:
+  void Swap16(int16_t *x) const {
+    *x = ((*x & 0xFF) << 8) | ((*x >> 8) & 0xFF);
+  }
+
+  void Swap32(int32_t *x) const {
+    *x = ((*x & 0xFF) << 24) | (((*x >> 8) & 0xFF) << 16) |
+         (((*x >> 16) & 0xFF) << 8) | ((*x >> 24) & 0xFF);
+  }
+
+  void Swap64(int64_t *x) const {
+    *x = ((*x & 0xFF) << 56) | (((*x >> 8) & 0xFF) << 48) |
+         (((*x >> 16) & 0xFF) << 40) | (((*x >> 24) & 0xFF) << 32) |
+         (((*x >> 32) & 0xFF) << 24) | (((*x >> 40) & 0xFF) << 16) |
+         (((*x >> 48) & 0xFF) << 8) | ((*x >> 56) & 0xFF);
+  }
+
+  bool swap_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_BYTE_SWAPPER_H_
diff --git a/tensorflow/contrib/ignite/kernels/client/ignite_client.h b/tensorflow/contrib/ignite/kernels/client/ignite_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..0da80769260d065c4ac6601c0e5cd7050b6b61cb
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_client.h
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_CLIENT_H_
+
+#include "tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class Client {
+ public:
+  Client(bool big_endian) : byte_swapper_(ByteSwapper(big_endian)) {}
+  virtual Status Connect() = 0;
+  virtual Status Disconnect() = 0;
+  virtual bool IsConnected() = 0;
+  virtual int GetSocketDescriptor() = 0;
+  virtual Status ReadData(uint8_t *buf, const int32_t length) = 0;
+  virtual Status WriteData(const uint8_t *buf, const int32_t length) = 0;
+
+  Status ReadByte(uint8_t *data) { return ReadData(data, 1); }
+
+  Status ReadShort(int16_t *data) {
+    TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 2));
+    byte_swapper_.SwapIfRequiredInt16(data);
+
+    return Status::OK();
+  }
+
+  Status ReadInt(int32_t *data) {
+    TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 4));
+    byte_swapper_.SwapIfRequiredInt32(data);
+
+    return Status::OK();
+  }
+
+  Status ReadLong(int64_t *data) {
+    TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 8));
+    byte_swapper_.SwapIfRequiredInt64(data);
+
+    return Status::OK();
+  }
+
+  Status WriteByte(const uint8_t data) { return WriteData(&data, 1); }
+
+  Status WriteShort(const int16_t data) {
+    int16_t tmp = data;
+    byte_swapper_.SwapIfRequiredInt16(&tmp);
+    return WriteData((uint8_t *)&tmp, 2);
+  }
+
+  Status WriteInt(const int32_t data) {
+    int32_t tmp = data;
+    byte_swapper_.SwapIfRequiredInt32(&tmp);
+    return WriteData((uint8_t *)&tmp, 4);
+  }
+
+  Status WriteLong(const int64_t data) {
+    int64_t tmp = data;
+    byte_swapper_.SwapIfRequiredInt64(&tmp);
+    return WriteData((uint8_t *)&tmp, 8);
+  }
+
+ private:
+  const ByteSwapper byte_swapper_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h
similarity index 80%
rename from tensorflow/contrib/ignite/kernels/ignite_plain_client.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h
index 75424c19ee4b7df5378aa23cb41db1752e8d0651..546583246042855d179ebbb18b7dca711063b3f4 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_PLAIN_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_PLAIN_CLIENT_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h"
 
 namespace tensorflow {
 
@@ -40,4 +40,4 @@ class PlainClient : public Client {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_PLAIN_CLIENT_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_PLAIN_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc
index cf672942c61e1239332711db12e62088737c4f41..54efb5b61761708a28dd031b8321ffba9a53ffa9 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_unix.cc
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_unix.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
 
 #include <arpa/inet.h>
 #include <netdb.h>
diff --git a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc
similarity index 98%
rename from tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
rename to tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc
index dad5aace5fabe1df58bb9579bf578f4c35324315..a99a3ada558e51c13ed47eb72911eb5862e71a60 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_plain_client_windows.cc
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_plain_client_windows.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
 
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc
similarity index 98%
rename from tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
rename to tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc
index ceb479b0846574a35d86002ebb9c3e8e1d3687ac..8f09c24a3bedda524264f30282a0ad019d515540 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.cc
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h"
 
 #include <openssl/err.h>
 #include <openssl/ssl.h>
diff --git a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h
similarity index 82%
rename from tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
rename to tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h
index 0406644bbaab3de816540ce85e84b489ea9fff12..543e03d1efc3ff186c9db399af18f7aa8ad2c450 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h
+++ b/tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_SSL_WRAPPER_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_SSL_WRAPPER_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h"
 
 #include <openssl/ssl.h>
 
@@ -48,4 +48,4 @@ class SslWrapper : public Client {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_SSL_WRAPPER_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_CLIENT_IGNITE_SSL_WRAPPER_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc
similarity index 99%
rename from tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc
index 2c8a7d44b07b43f788bcbc0850b5162cc14dd951..4218ec05f2c3486dd91e2188b674e01d6aadaa2b 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h
similarity index 87%
rename from tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h
index eb1f856643a790de6acaa82d4b8ad894fd364376..3e8a1a19623fab3e027db16228e0228e8ec4989a 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_BINARY_OBJECT_PARSER_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_BINARY_OBJECT_PARSER_H_
 
 #include <vector>
-#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_byte_swapper.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -78,4 +78,4 @@ enum ObjectType {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BINARY_OBJECT_PARSER_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_BINARY_OBJECT_PARSER_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc
index c4a7d3c513a796c9d95b371bedc609fd75188817..ace96e7b09fcf314757367baed66f622b294e43c 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h
similarity index 91%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset.h
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h
index 66bfdf2e2a168e59cd2fec8e2ac5b8fd482d5c15..db3bafb11f2a0047c22ece6d2bc1722afaa5ffdf 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset.h
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_H_
 
 #include "tensorflow/core/framework/dataset.h"
 
@@ -60,4 +60,4 @@ class IgniteDataset : public DatasetBase {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc
similarity index 98%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc
index 5da9127aa6a3a4bc16347e6890cc1ba44406c0d5..ce8972f1e7fd59235556cb9514011f0b836077de 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h"
 
-#include "tensorflow/contrib/ignite/kernels/ignite_plain_client.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_ssl_wrapper.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_ssl_wrapper.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h
similarity index 87%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h
index c499e2c9ccfac5c15db08c8fd8b26c37aa0404f3..5868c2cb67f9d5c91654db8cf4bb4bbc072fc1ac 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_iterator.h
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_iterator.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_ITERATOR_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_ITERATOR_H_
 
-#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_client.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
+#include "tensorflow/contrib/ignite/kernels/client/ignite_client.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
@@ -96,4 +96,4 @@ constexpr int32_t kMinResLength = 12;
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_DATASET_ITERATOR_H_
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_DATASET_IGNITE_DATASET_ITERATOR_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc
similarity index 97%
rename from tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
rename to tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc
index f75b1c5ff55ca9ee493148ff79c2edd4b15ac42a..f2108775e29b53765138dcd971bec89d7a10ce40 100644
--- a/tensorflow/contrib/ignite/kernels/ignite_dataset_ops.cc
+++ b/tensorflow/contrib/ignite/kernels/dataset/ignite_dataset_ops.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include <stdlib.h>
 
-#include "tensorflow/contrib/ignite/kernels/ignite_binary_object_parser.h"
-#include "tensorflow/contrib/ignite/kernels/ignite_dataset.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_binary_object_parser.h"
+#include "tensorflow/contrib/ignite/kernels/dataset/ignite_dataset.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ae2dbcc2cf5d0ae7e09a26a199dc0c3c80fe22c1
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs.cc
@@ -0,0 +1,331 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h"
+
+namespace tensorflow {
+
+static string GetEnvOrElse(const string &env, string default_value) {
+  const char *env_c_str = env.c_str();
+  return getenv(env_c_str) != nullptr ? getenv(env_c_str) : default_value;
+}
+
+static string MakeRelative(const string &a, const string &b) {
+  string max = a;
+  string min = b;
+  bool first = b.size() > a.size();
+
+  if (first) {
+    max = b;
+    min = a;
+  }
+
+  auto r = mismatch(min.begin(), min.end(), max.begin());
+  return string((first ? r.first : r.second), first ? min.end() : max.end());
+}
+
+string IGFS::TranslateName(const string &name) const {
+  StringPiece scheme, namenode, path;
+  io::ParseURI(name, &scheme, &namenode, &path);
+  return string(path.data(), path.length());
+}
+
+IGFS::IGFS()
+    : host_(GetEnvOrElse("IGFS_HOST", "localhost")),
+      port_([] {
+        int port;
+        if (strings::safe_strto32(GetEnvOrElse("IGFS_PORT", "10500").c_str(),
+                                  &port)) {
+          return port;
+        } else {
+          LOG(WARNING)
+              << "IGFS_PORT environment variable had an invalid value: "
+              << getenv("IGFS_PORT") << "\nUsing default port 10500.";
+          return 10500;
+        }
+      }()),
+      fs_name_(GetEnvOrElse("IGFS_FS_NAME", "default_fs")) {
+  LOG(INFO) << "IGFS created [host=" << host_ << ", port=" << port_
+            << ", fs_name=" << fs_name_ << "]";
+}
+
+IGFS::~IGFS() {
+  LOG(INFO) << "IGFS destroyed [host=" << host_ << ", port=" << port_
+            << ", fs_name=" << fs_name_ << "]";
+}
+
+Status IGFS::NewRandomAccessFile(const string &file_name,
+                                 std::unique_ptr<RandomAccessFile> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<OpenReadResponse> open_read_response(true);
+  TF_RETURN_IF_ERROR(client->OpenRead(&open_read_response, path));
+
+  int64 resource_id = open_read_response.res.stream_id;
+  result->reset(new IGFSRandomAccessFile(path, resource_id, std::move(client)));
+
+  LOG(INFO) << "New random access file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewWritableFile(const string &file_name,
+                             std::unique_ptr<WritableFile> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ExistsResponse> exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, path));
+
+  if (exists_response.res.exists) {
+    CtrlResponse<DeleteResponse> del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, path, false));
+  }
+
+  CtrlResponse<OpenCreateResponse> open_create_resp(false);
+  TF_RETURN_IF_ERROR(client->OpenCreate(&open_create_resp, path));
+
+  int64 resource_id = open_create_resp.res.stream_id;
+  result->reset(new IGFSWritableFile(path, resource_id, std::move(client)));
+
+  LOG(INFO) << "New writable file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewAppendableFile(const string &file_name,
+                               std::unique_ptr<WritableFile> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ExistsResponse> exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, file_name));
+
+  if (exists_response.res.exists) {
+    CtrlResponse<DeleteResponse> del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, file_name, false));
+  }
+
+  CtrlResponse<OpenAppendResponse> open_append_resp(false);
+  TF_RETURN_IF_ERROR(client->OpenAppend(&open_append_resp, file_name));
+
+  result->reset(new IGFSWritableFile(TranslateName(file_name),
+                                     open_append_resp.res.stream_id,
+                                     std::move(client)));
+
+  LOG(INFO) << "New appendable file completed successfully [file_name="
+            << file_name << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::NewReadOnlyMemoryRegionFromFile(
+    const string &file_name, std::unique_ptr<ReadOnlyMemoryRegion> *result) {
+  return errors::Unimplemented("IGFS does not support ReadOnlyMemoryRegion");
+}
+
+Status IGFS::FileExists(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  const string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ExistsResponse> exists_response(false);
+  TF_RETURN_IF_ERROR(client->Exists(&exists_response, path));
+
+  if (!exists_response.res.exists)
+    return errors::NotFound("File ", path, " not found");
+
+  LOG(INFO) << "File exists completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetChildren(const string &file_name, std::vector<string> *result) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+  path = path + "/";
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ListPathsResponse> list_paths_response(false);
+  TF_RETURN_IF_ERROR(client->ListPaths(&list_paths_response, path));
+
+  *result = std::vector<string>();
+  std::vector<IGFSPath> entries = list_paths_response.res.entries;
+
+  for (IGFSPath &value : entries)
+    result->push_back(MakeRelative(value.path, path));
+
+  LOG(INFO) << "Get children completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetMatchingPaths(const string &pattern,
+                              std::vector<string> *results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
+Status IGFS::DeleteFile(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<DeleteResponse> del_response(false);
+  TF_RETURN_IF_ERROR(client->Delete(&del_response, path, false));
+
+  if (!del_response.res.exists)
+    return errors::NotFound("File ", path, " not found");
+
+  LOG(INFO) << "Delete file completed successfully [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::CreateDir(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  const string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<MakeDirectoriesResponse> mkdir_response(false);
+  TF_RETURN_IF_ERROR(client->MkDir(&mkdir_response, path));
+
+  if (!mkdir_response.res.successful)
+    return errors::Unknown("Can't create directory ", path);
+
+  LOG(INFO) << "Create dir completed successful [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::DeleteDir(const string &file_name) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<ListFilesResponse> list_files_response(false);
+  TF_RETURN_IF_ERROR(client->ListFiles(&list_files_response, path));
+
+  if (!list_files_response.res.entries.empty()) {
+    return errors::FailedPrecondition("Can't delete a non-empty directory");
+  } else {
+    CtrlResponse<DeleteResponse> del_response(false);
+    TF_RETURN_IF_ERROR(client->Delete(&del_response, path, true));
+  }
+
+  LOG(INFO) << "Delete dir completed successful [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::GetFileSize(const string &file_name, uint64 *size) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<InfoResponse> info_response(false);
+  TF_RETURN_IF_ERROR(client->Info(&info_response, path));
+
+  *size = info_response.res.file_info.length;
+
+  LOG(INFO) << "Get file size completed successful [file_name=" << file_name
+            << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::RenameFile(const string &src, const string &dst) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string src_path = TranslateName(src);
+  string dst_path = TranslateName(dst);
+
+  if (FileExists(dst).ok()) DeleteFile(dst);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<RenameResponse> rename_response(false);
+  TF_RETURN_IF_ERROR(client->Rename(&rename_response, src_path, dst_path));
+
+  if (!rename_response.res.successful)
+    return errors::NotFound("File ", src_path, " not found");
+
+  LOG(INFO) << "Rename file completed successful [src=" << src
+            << ", dst=" << dst << "]";
+
+  return Status::OK();
+}
+
+Status IGFS::Stat(const string &file_name, FileStatistics *stats) {
+  std::unique_ptr<IGFSClient> client = CreateClient();
+  string path = TranslateName(file_name);
+
+  CtrlResponse<HandshakeResponse> handshake_response(true);
+  TF_RETURN_IF_ERROR(client->Handshake(&handshake_response));
+
+  CtrlResponse<InfoResponse> info_response(false);
+  TF_RETURN_IF_ERROR(client->Info(&info_response, path));
+
+  IGFSFile info = info_response.res.file_info;
+
+  *stats = FileStatistics(info.length, info.modification_time * 1000000,
+                          (info.flags & 0x1) != 0);
+
+  LOG(INFO) << "Stat completed successful [file_name=" << file_name << "]";
+
+  return Status::OK();
+}
+
+std::unique_ptr<IGFSClient> IGFS::CreateClient() const {
+  return std::unique_ptr<IGFSClient>(
+      new IGFSClient(host_, port_, fs_name_, ""));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs.h b/tensorflow/contrib/ignite/kernels/igfs/igfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c347e937f75e8eea108811e6a3189412e22a982
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs.h
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+class IGFS : public FileSystem {
+ public:
+  IGFS();
+  ~IGFS();
+  Status NewRandomAccessFile(
+      const string& file_name,
+      std::unique_ptr<RandomAccessFile>* result) override;
+  Status NewWritableFile(const string& fname,
+                         std::unique_ptr<WritableFile>* result) override;
+  Status NewAppendableFile(const string& fname,
+                           std::unique_ptr<WritableFile>* result) override;
+  Status NewReadOnlyMemoryRegionFromFile(
+      const string& fname,
+      std::unique_ptr<ReadOnlyMemoryRegion>* result) override;
+  Status FileExists(const string& fname) override;
+  Status GetChildren(const string& dir, std::vector<string>* result) override;
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+  Status DeleteFile(const string& fname) override;
+  Status CreateDir(const string& name) override;
+  Status DeleteDir(const string& name) override;
+  Status GetFileSize(const string& fname, uint64* size) override;
+  Status RenameFile(const string& src, const string& target) override;
+  Status Stat(const string& fname, FileStatistics* stat) override;
+  string TranslateName(const string& name) const override;
+
+ private:
+  std::unique_ptr<IGFSClient> CreateClient() const;
+
+  const string host_;
+  const int port_;
+  const string fs_name_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f97c34fdd8b026a04506fd0ef9f3cc74129a9da
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+
+namespace tensorflow {
+
+IGFSClient::IGFSClient(const string &host, int port, const string &fs_name,
+                       const string &user_name)
+    : fs_name_(fs_name),
+      user_name_(user_name),
+      client_(ExtendedTCPClient(host, port, true)) {
+  client_.Connect();
+}
+
+IGFSClient::~IGFSClient() { client_.Disconnect(); }
+
+Status IGFSClient::SendRequestGetResponse(const Request &request,
+                                          Response *response) {
+  TF_RETURN_IF_ERROR(request.Write(&client_));
+  client_.reset();
+
+  if (response != nullptr) {
+    TF_RETURN_IF_ERROR(response->Read(&client_));
+    client_.reset();
+  }
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbec7b000779be8772e850a556affffa1b3b6803
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_client.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_CLIENT_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
+
+namespace tensorflow {
+
+class IGFSClient {
+ public:
+  IGFSClient(const string &host, int port, const string &fs_name,
+             const string &user_name);
+  ~IGFSClient();
+
+  Status Handshake(CtrlResponse<HandshakeResponse> *res) {
+    return SendRequestGetResponse(HandshakeRequest(fs_name_, {}), res);
+  }
+
+  Status ListFiles(CtrlResponse<ListFilesResponse> *res, const string &path) {
+    return SendRequestGetResponse(ListFilesRequest(user_name_, path), res);
+  }
+
+  Status ListPaths(CtrlResponse<ListPathsResponse> *res, const string &path) {
+    return SendRequestGetResponse(ListPathsRequest(user_name_, path), res);
+  }
+
+  Status Info(CtrlResponse<InfoResponse> *res, const string &path) {
+    return SendRequestGetResponse(InfoRequest(user_name_, path), res);
+  }
+
+  Status OpenCreate(CtrlResponse<OpenCreateResponse> *res, const string &path) {
+    return SendRequestGetResponse(OpenCreateRequest(user_name_, path), res);
+  }
+
+  Status OpenAppend(CtrlResponse<OpenAppendResponse> *res, const string &path) {
+    return SendRequestGetResponse(OpenAppendRequest(user_name_, path), res);
+  }
+
+  Status OpenRead(CtrlResponse<OpenReadResponse> *res, const string &path) {
+    return SendRequestGetResponse(OpenReadRequest(user_name_, path), res);
+  }
+
+  Status Exists(CtrlResponse<ExistsResponse> *res, const string &path) {
+    return SendRequestGetResponse(ExistsRequest(user_name_, path), res);
+  }
+
+  Status MkDir(CtrlResponse<MakeDirectoriesResponse> *res, const string &path) {
+    return SendRequestGetResponse(MakeDirectoriesRequest(user_name_, path),
+                                  res);
+  }
+
+  Status Delete(CtrlResponse<DeleteResponse> *res, const string &path,
+                bool recursive) {
+    return SendRequestGetResponse(DeleteRequest(user_name_, path, recursive),
+                                  res);
+  }
+
+  Status WriteBlock(int64_t stream_id, const uint8_t *data, int32_t len) {
+    return SendRequestGetResponse(WriteBlockRequest(stream_id, data, len),
+                                  nullptr);
+  }
+
+  Status ReadBlock(ReadBlockCtrlResponse *res, int64_t stream_id, int64_t pos,
+                   int32_t length) {
+    return SendRequestGetResponse(ReadBlockRequest(stream_id, pos, length),
+                                  res);
+  }
+
+  Status Close(CtrlResponse<CloseResponse> *res, int64_t stream_id) {
+    return SendRequestGetResponse(CloseRequest(stream_id), res);
+  }
+
+  Status Rename(CtrlResponse<RenameResponse> *res, const string &source,
+                const string &dest) {
+    return SendRequestGetResponse(RenameRequest(user_name_, source, dest), res);
+  }
+
+ private:
+  Status SendRequestGetResponse(const Request &request, Response *response);
+
+  const string fs_name_;
+  const string user_name_;
+  ExtendedTCPClient client_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea63436546d8b244b921206f9577c91b6578a775
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.cc
@@ -0,0 +1,144 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h"
+
+namespace tensorflow {
+
+ExtendedTCPClient::ExtendedTCPClient(const string &host, int port,
+                                     bool big_endian)
+    : PlainClient(host, port, big_endian), pos_(0) {}
+
+Status ExtendedTCPClient::ReadData(uint8_t *buf, const int32_t length) {
+  TF_RETURN_IF_ERROR(PlainClient::ReadData(buf, length));
+  pos_ += length;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteData(const uint8_t *buf, const int32_t length) {
+  TF_RETURN_IF_ERROR(PlainClient::WriteData(buf, length));
+  pos_ += length;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::Ignore(int n) {
+  uint8_t buf[n];
+  return ReadData(buf, n);
+}
+
+Status ExtendedTCPClient::SkipToPos(int target_pos) {
+  return Ignore(std::max(0, target_pos - pos_));
+}
+
+Status ExtendedTCPClient::ReadBool(bool *res) {
+  uint8_t buf = 0;
+  TF_RETURN_IF_ERROR(ReadData(&buf, 1));
+  *res = buf != 0;
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::ReadNullableString(string *res) {
+  bool is_empty = false;
+  TF_RETURN_IF_ERROR(ReadBool(&is_empty));
+
+  if (!is_empty) {
+    TF_RETURN_IF_ERROR(ReadString(res));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::ReadString(string *res) {
+  int16_t length;
+  TF_RETURN_IF_ERROR(ReadShort(&length));
+
+  uint8_t *buf = new uint8_t[length];
+  Status status = ReadData(buf, length);
+
+  if (status.ok()) res->assign(reinterpret_cast<char *>(buf), length);
+
+  delete[] buf;
+  return status;
+}
+
+Status ExtendedTCPClient::ReadStringMap(std::map<string, string> *res) {
+  int size;
+  TF_RETURN_IF_ERROR(ReadInt(&size));
+
+  for (int i = 0; i < size; i++) {
+    string key;
+    string val;
+    TF_RETURN_IF_ERROR(ReadString(&key));
+    TF_RETURN_IF_ERROR(ReadString(&val));
+
+    res->insert(std::pair<string, string>(std::move(key), std::move(val)));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteSize(std::map<string, string>::size_type s) {
+  return WriteInt(s);
+}
+
+Status ExtendedTCPClient::FillWithZerosUntil(int n) {
+  int to_skip = std::max(0, n - pos_);
+
+  for (int i = 0; i < to_skip; i++) {
+    TF_RETURN_IF_ERROR(WriteByte(0));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteBool(bool val) {
+  return WriteByte((char)(val ? 1 : 0));
+}
+
+Status ExtendedTCPClient::WriteString(string str) {
+  if (!str.empty()) {
+    TF_RETURN_IF_ERROR(WriteBool(false));
+    size_t l = str.length();
+    if (l > std::numeric_limits<int16_t>::max())
+      return errors::InvalidArgument("String is too long");
+
+    TF_RETURN_IF_ERROR(WriteShort(l));
+    TF_RETURN_IF_ERROR(WriteData(reinterpret_cast<const uint8_t *>(str.c_str()),
+                                 str.length()));
+  } else {
+    TF_RETURN_IF_ERROR(WriteBool(true));
+  }
+
+  return Status::OK();
+}
+
+Status ExtendedTCPClient::WriteStringMap(std::map<string, string> map) {
+  std::map<string, string>::size_type size = map.size();
+  TF_RETURN_IF_ERROR(WriteSize(size));
+
+  for (auto &x : map) {
+    TF_RETURN_IF_ERROR(WriteString(x.first));
+    TF_RETURN_IF_ERROR(WriteString(x.second));
+  }
+
+  return Status::OK();
+}
+
+void ExtendedTCPClient::reset() { pos_ = 0; }
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5de342fd0c20cf5b01b647756797631b8a3f203
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_EXTENDED_TCP_CLIENT_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_EXTENDED_TCP_CLIENT_H_
+
+#include "tensorflow/contrib/ignite/kernels/client/ignite_plain_client.h"
+
+namespace tensorflow {
+
+class ExtendedTCPClient : public PlainClient {
+ public:
+  ExtendedTCPClient(const string &host, int port, bool big_endian);
+  Status ReadData(uint8_t *buf, const int32_t length) override;
+  Status WriteData(const uint8_t *buf, const int32_t length) override;
+  Status Ignore(int n);
+  Status SkipToPos(int target_pos);
+  Status ReadBool(bool *res);
+  Status ReadNullableString(string *res);
+  Status ReadString(string *res);
+  Status ReadStringMap(std::map<string, string> *res);
+  Status WriteSize(std::map<string, string>::size_type s);
+  Status FillWithZerosUntil(int n);
+  Status WriteBool(bool val);
+  Status WriteString(string str);
+  Status WriteStringMap(std::map<string, string> map);
+  void reset();
+
+ private:
+  int pos_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_EXTENDED_TCP_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c63f40f35fa53bc51c44f574df50ad0c79fba91
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.cc
@@ -0,0 +1,344 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
+
+namespace tensorflow {
+
+Status IGFSPath::Read(ExtendedTCPClient *client) {
+  return client->ReadNullableString(&path);
+}
+
+Status IGFSFile::Read(ExtendedTCPClient *client) {
+  int32_t block_size;
+  int64_t group_block_size;
+  std::map<string, string> properties = {};
+  int64_t access_time;
+
+  bool has_path;
+  TF_RETURN_IF_ERROR(client->ReadBool(&has_path));
+  if (has_path) {
+    IGFSPath path = {};
+    TF_RETURN_IF_ERROR(path.Read(client));
+  }
+
+  TF_RETURN_IF_ERROR(client->ReadInt(&block_size));
+  TF_RETURN_IF_ERROR(client->ReadLong(&group_block_size));
+  TF_RETURN_IF_ERROR(client->ReadLong(&length));
+  TF_RETURN_IF_ERROR(client->ReadStringMap(&properties));
+  TF_RETURN_IF_ERROR(client->ReadLong(&access_time));
+  TF_RETURN_IF_ERROR(client->ReadLong(&modification_time));
+  TF_RETURN_IF_ERROR(client->ReadByte(&flags));
+
+  return Status::OK();
+}
+
+Request::Request(int32_t command_id) : command_id_(command_id) {}
+
+Status Request::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(client->WriteByte(0));
+  TF_RETURN_IF_ERROR(client->FillWithZerosUntil(8));
+  TF_RETURN_IF_ERROR(client->WriteInt(command_id_));
+  TF_RETURN_IF_ERROR(client->FillWithZerosUntil(24));
+
+  return Status::OK();
+}
+
+Status Response::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->Ignore(1));
+  TF_RETURN_IF_ERROR(client->SkipToPos(8));
+  TF_RETURN_IF_ERROR(client->ReadInt(&req_id));
+  TF_RETURN_IF_ERROR(client->SkipToPos(24));
+  TF_RETURN_IF_ERROR(client->ReadInt(&res_type));
+
+  bool has_error;
+  TF_RETURN_IF_ERROR(client->ReadBool(&has_error));
+
+  if (has_error) {
+    int32_t error_code;
+    string error_msg;
+    TF_RETURN_IF_ERROR(client->ReadString(&error_msg));
+    TF_RETURN_IF_ERROR(client->ReadInt(&error_code));
+
+    return errors::Unknown("Error [code=", error_code, ", message=\"",
+                           error_msg, "\"]");
+  }
+
+  TF_RETURN_IF_ERROR(client->SkipToPos(header_size_ + 5));
+  TF_RETURN_IF_ERROR(client->ReadInt(&length));
+  TF_RETURN_IF_ERROR(client->SkipToPos(header_size_ + response_header_size_));
+
+  return Status::OK();
+}
+
+PathCtrlRequest::PathCtrlRequest(int32_t command_id_, const string &user_name,
+                                 const string &path,
+                                 const string &destination_path, bool flag,
+                                 bool collocate,
+                                 const std::map<string, string> &properties)
+    : Request(command_id_),
+      user_name_(user_name),
+      path_(path),
+      destination_path_(destination_path),
+      flag_(flag),
+      collocate_(collocate),
+      props_(properties) {}
+
+Status PathCtrlRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(Request::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteString(user_name_));
+  TF_RETURN_IF_ERROR(WritePath(client, path_));
+  TF_RETURN_IF_ERROR(WritePath(client, destination_path_));
+  TF_RETURN_IF_ERROR(client->WriteBool(flag_));
+  TF_RETURN_IF_ERROR(client->WriteBool(collocate_));
+  TF_RETURN_IF_ERROR(client->WriteStringMap(props_));
+
+  return Status::OK();
+}
+
+Status PathCtrlRequest::WritePath(ExtendedTCPClient *client,
+                                  const string &path) const {
+  TF_RETURN_IF_ERROR(client->WriteBool(!path.empty()));
+  if (!path.empty()) TF_RETURN_IF_ERROR(client->WriteString(path));
+
+  return Status::OK();
+}
+
+Status StreamCtrlRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(client->WriteByte(0));
+  TF_RETURN_IF_ERROR(client->FillWithZerosUntil(8));
+  TF_RETURN_IF_ERROR(client->WriteInt(command_id_));
+  TF_RETURN_IF_ERROR(client->WriteLong(stream_id_));
+  TF_RETURN_IF_ERROR(client->WriteInt(length_));
+
+  return Status::OK();
+}
+
+StreamCtrlRequest::StreamCtrlRequest(int32_t command_id_, int64_t stream_id,
+                                     int32_t length)
+    : Request(command_id_), stream_id_(stream_id), length_(length) {}
+
+DeleteRequest::DeleteRequest(const string &user_name, const string &path,
+                             bool flag)
+    : PathCtrlRequest(DELETE_ID, user_name, path, {}, flag, true, {}) {}
+
+Status DeleteResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&exists));
+
+  return Status::OK();
+}
+
+ExistsRequest::ExistsRequest(const string &user_name, const string &path)
+    : PathCtrlRequest(EXISTS_ID, user_name, path, {}, false, true, {}) {}
+
+Status ExistsResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&exists));
+
+  return Status::OK();
+}
+
+HandshakeRequest::HandshakeRequest(const string &fs_name, const string &log_dir)
+    : Request(HANDSHAKE_ID), fs_name_(fs_name), log_dir_(log_dir) {}
+
+Status HandshakeRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(Request::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteString(fs_name_));
+  TF_RETURN_IF_ERROR(client->WriteString(log_dir_));
+
+  return Status::OK();
+}
+
+Status HandshakeResponse::Read(ExtendedTCPClient *client) {
+  int64_t block_size;
+  bool sampling;
+
+  TF_RETURN_IF_ERROR(client->ReadNullableString(&fs_name));
+  TF_RETURN_IF_ERROR(client->ReadLong(&block_size));
+
+  bool has_sampling_;
+  TF_RETURN_IF_ERROR(client->ReadBool(&has_sampling_));
+
+  if (has_sampling_) {
+    TF_RETURN_IF_ERROR(client->ReadBool(&sampling));
+  }
+
+  return Status::OK();
+}
+
+ListRequest::ListRequest(int32_t command_id_, const string &user_name,
+                         const string &path)
+    : PathCtrlRequest(command_id_, user_name, path, {}, false, true, {}) {}
+
+ListFilesRequest::ListFilesRequest(const string &user_name, const string &path)
+    : ListRequest(LIST_FILES_ID, user_name, path) {}
+
+ListPathsRequest::ListPathsRequest(const string &user_name, const string &path)
+    : ListRequest(LIST_PATHS_ID, user_name, path) {}
+
+OpenCreateRequest::OpenCreateRequest(const string &user_name,
+                                     const string &path)
+    : PathCtrlRequest(OPEN_CREATE_ID, user_name, path, {}, false, true, {}) {}
+
+Status OpenCreateRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteInt(replication_));
+  TF_RETURN_IF_ERROR(client->WriteLong(blockSize_));
+
+  return Status::OK();
+}
+
+Status OpenCreateResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadLong(&stream_id));
+
+  return Status::OK();
+}
+
+OpenAppendRequest::OpenAppendRequest(const string &user_name,
+                                     const string &path)
+    : PathCtrlRequest(OPEN_APPEND_ID, user_name, path, {}, false, true, {}) {}
+
+Status OpenAppendRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client));
+
+  return Status::OK();
+}
+
+Status OpenAppendResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadLong(&stream_id));
+
+  return Status::OK();
+}
+
+OpenReadRequest::OpenReadRequest(const string &user_name, const string &path,
+                                 bool flag,
+                                 int32_t sequential_reads_before_prefetch)
+    : PathCtrlRequest(OPEN_READ_ID, user_name, path, {}, flag, true, {}),
+      sequential_reads_before_prefetch_(sequential_reads_before_prefetch) {}
+
+OpenReadRequest::OpenReadRequest(const string &user_name, const string &path)
+    : OpenReadRequest(user_name, path, false, 0) {}
+
+Status OpenReadRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(PathCtrlRequest::Write(client));
+
+  if (flag_) {
+    TF_RETURN_IF_ERROR(client->WriteInt(sequential_reads_before_prefetch_));
+  }
+
+  return Status::OK();
+}
+
+Status OpenReadResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadLong(&stream_id));
+  TF_RETURN_IF_ERROR(client->ReadLong(&length));
+
+  return Status::OK();
+}
+
+InfoRequest::InfoRequest(const string &user_name, const string &path)
+    : PathCtrlRequest(INFO_ID, user_name, path, {}, false, true, {}) {}
+
+Status InfoResponse::Read(ExtendedTCPClient *client) {
+  file_info = IGFSFile();
+  TF_RETURN_IF_ERROR(file_info.Read(client));
+
+  return Status::OK();
+}
+
+MakeDirectoriesRequest::MakeDirectoriesRequest(const string &user_name,
+                                               const string &path)
+    : PathCtrlRequest(MKDIR_ID, user_name, path, {}, false, true, {}) {}
+
+Status MakeDirectoriesResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&successful));
+
+  return Status::OK();
+}
+
+CloseRequest::CloseRequest(int64_t streamId)
+    : StreamCtrlRequest(CLOSE_ID, streamId, 0) {}
+
+Status CloseResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&successful));
+
+  return Status::OK();
+}
+
+ReadBlockRequest::ReadBlockRequest(int64_t stream_id, int64_t pos,
+                                   int32_t length)
+    : StreamCtrlRequest(READ_BLOCK_ID, stream_id, length), pos(pos) {}
+
+Status ReadBlockRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(StreamCtrlRequest::Write(client));
+
+  TF_RETURN_IF_ERROR(client->WriteLong(pos));
+
+  return Status::OK();
+}
+
+Status ReadBlockResponse::Read(ExtendedTCPClient *client, int32_t length,
+                               uint8_t *dst) {
+  TF_RETURN_IF_ERROR(client->ReadData(dst, length));
+  successfully_read = length;
+
+  return Status::OK();
+}
+
+Status ReadBlockResponse::Read(ExtendedTCPClient *client) {
+  return Status::OK();
+}
+
+std::streamsize ReadBlockResponse::GetSuccessfullyRead() {
+  return successfully_read;
+}
+
+ReadBlockCtrlResponse::ReadBlockCtrlResponse(uint8_t *dst)
+    : CtrlResponse(false), dst(dst) {}
+
+Status ReadBlockCtrlResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(Response::Read(client));
+
+  res = ReadBlockResponse();
+  TF_RETURN_IF_ERROR(res.Read(client, length, dst));
+
+  return Status::OK();
+}
+
+WriteBlockRequest::WriteBlockRequest(int64_t stream_id, const uint8_t *data,
+                                     int32_t length)
+    : StreamCtrlRequest(WRITE_BLOCK_ID, stream_id, length), data(data) {}
+
+Status WriteBlockRequest::Write(ExtendedTCPClient *client) const {
+  TF_RETURN_IF_ERROR(StreamCtrlRequest::Write(client));
+  TF_RETURN_IF_ERROR(client->WriteData((uint8_t *)data, length_));
+
+  return Status::OK();
+}
+
+RenameRequest::RenameRequest(const string &user_name, const string &path,
+                             const string &destination_path)
+    : PathCtrlRequest(RENAME_ID, user_name, path, destination_path, false, true,
+                      {}) {}
+
+Status RenameResponse::Read(ExtendedTCPClient *client) {
+  TF_RETURN_IF_ERROR(client->ReadBool(&successful));
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h
new file mode 100644
index 0000000000000000000000000000000000000000..44a2928a2b2b48849c7ba4454e0e7848c2217b3b
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h
@@ -0,0 +1,356 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_MESSAGES_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_MESSAGES_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_extended_tcp_client.h"
+
+namespace tensorflow {
+
+enum CommandId {
+  HANDSHAKE_ID = 0,
+  EXISTS_ID = 2,
+  INFO_ID = 3,
+  RENAME_ID = 6,
+  DELETE_ID = 7,
+  MKDIR_ID = 8,
+  LIST_PATHS_ID = 9,
+  LIST_FILES_ID = 10,
+  OPEN_READ_ID = 13,
+  OPEN_APPEND_ID = 14,
+  OPEN_CREATE_ID = 15,
+  CLOSE_ID = 16,
+  READ_BLOCK_ID = 17,
+  WRITE_BLOCK_ID = 18,
+};
+
+class IGFSPath {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  string path;
+};
+
+class IGFSFile {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  int64_t length;
+  int64_t modification_time;
+  uint8_t flags;
+};
+
+class Request {
+ public:
+  Request(int32_t command_id);
+  virtual Status Write(ExtendedTCPClient *client) const;
+
+ protected:
+  const int32_t command_id_;
+};
+
+class Response {
+ public:
+  virtual Status Read(ExtendedTCPClient *client);
+
+  int32_t res_type;
+  int32_t req_id;
+  int32_t length;
+
+ protected:
+  static const int32_t header_size_ = 24;
+  static const int32_t response_header_size_ = 9;
+};
+
+class PathCtrlRequest : public Request {
+ public:
+  PathCtrlRequest(int32_t command_id, const string &user_name,
+                  const string &path, const string &destination_path, bool flag,
+                  bool collocate, const std::map<string, string> &properties);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ protected:
+  Status WritePath(ExtendedTCPClient *client, const string &path) const;
+
+  const string user_name_;
+  const string path_;
+  const string destination_path_;
+  const bool flag_;
+  const bool collocate_;
+  const std::map<string, string> props_;
+};
+
+class StreamCtrlRequest : public Request {
+ public:
+  StreamCtrlRequest(int32_t command_id, int64_t stream_id, int32_t length);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ protected:
+  int64_t stream_id_;
+  int32_t length_;
+};
+
+template <class R>
+class CtrlResponse : public Response {
+ public:
+  CtrlResponse(bool optional) : optional_(optional) {}
+  Status Read(ExtendedTCPClient *client) override {
+    TF_RETURN_IF_ERROR(Response::Read(client));
+
+    if (optional_) {
+      TF_RETURN_IF_ERROR(client->ReadBool(&has_content));
+
+      if (!has_content) return Status::OK();
+    }
+
+    res = R();
+    has_content = true;
+    TF_RETURN_IF_ERROR(res.Read(client));
+
+    return Status::OK();
+  }
+
+  R res;
+  bool has_content;
+
+ private:
+  bool optional_;
+};
+
+template <class T>
+class ListResponse {
+ public:
+  Status Read(ExtendedTCPClient *client) {
+    int32_t len;
+    TF_RETURN_IF_ERROR(client->ReadInt(&len));
+
+    entries.clear();
+
+    for (int32_t i = 0; i < len; i++) {
+      T f = {};
+      TF_RETURN_IF_ERROR(f.Read(client));
+      entries.push_back(f);
+    }
+
+    return Status::OK();
+  }
+
+  std::vector<T> entries;
+};
+
+class DeleteRequest : public PathCtrlRequest {
+ public:
+  DeleteRequest(const string &user_name, const string &path, bool flag);
+};
+
+class DeleteResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool exists;
+};
+
+class ExistsRequest : public PathCtrlRequest {
+ public:
+  explicit ExistsRequest(const string &user_name, const string &path);
+};
+
+class ExistsResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool exists;
+};
+
+class HandshakeRequest : public Request {
+ public:
+  HandshakeRequest(const string &fs_name, const string &log_dir);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  string fs_name_;
+  string log_dir_;
+};
+
+class HandshakeResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  string fs_name;
+};
+
+class ListRequest : public PathCtrlRequest {
+ public:
+  explicit ListRequest(int32_t command_id, const string &user_name,
+                       const string &path);
+};
+
+class ListFilesRequest : public ListRequest {
+ public:
+  ListFilesRequest(const string &user_name, const string &path);
+};
+
+class ListFilesResponse : public ListResponse<IGFSFile> {};
+
+class ListPathsRequest : public ListRequest {
+ public:
+  ListPathsRequest(const string &user_name, const string &path);
+};
+
+class ListPathsResponse : public ListResponse<IGFSPath> {};
+
+class OpenCreateRequest : public PathCtrlRequest {
+ public:
+  OpenCreateRequest(const string &user_name, const string &path);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  int32_t replication_;
+  int64_t blockSize_;
+};
+
+class OpenCreateResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  int64_t stream_id;
+};
+
+class OpenAppendRequest : public PathCtrlRequest {
+ public:
+  explicit OpenAppendRequest(const string &user_name, const string &path);
+  Status Write(ExtendedTCPClient *client) const override;
+};
+
+class OpenAppendResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  int64_t stream_id;
+};
+
+class OpenReadRequest : public PathCtrlRequest {
+ public:
+  OpenReadRequest(const string &user_name, const string &path, bool flag,
+                  int32_t seqReadsBeforePrefetch);
+  OpenReadRequest(const string &user_name, const string &path);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ protected:
+  /** Sequential reads before prefetch. */
+  int32_t sequential_reads_before_prefetch_;
+};
+
+class OpenReadResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  int64_t stream_id;
+  int64_t length;
+};
+
+class InfoRequest : public PathCtrlRequest {
+ public:
+  InfoRequest(const string &user_name, const string &path);
+};
+
+class InfoResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  IGFSFile file_info;
+};
+
+class MakeDirectoriesRequest : public PathCtrlRequest {
+ public:
+  MakeDirectoriesRequest(const string &userName, const string &path);
+};
+
+class MakeDirectoriesResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool successful;
+};
+
+/** Stream control requests. **/
+
+class CloseRequest : public StreamCtrlRequest {
+ public:
+  explicit CloseRequest(int64_t stream_id);
+};
+
+class CloseResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool successful;
+};
+
+class ReadBlockRequest : public StreamCtrlRequest {
+ public:
+  ReadBlockRequest(int64_t stream_id, int64_t pos, int32_t length);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  int64_t pos;
+};
+
+class ReadBlockResponse {
+ public:
+  Status Read(ExtendedTCPClient *client, int32_t length, uint8_t *dst);
+  Status Read(ExtendedTCPClient *client);
+  std::streamsize GetSuccessfullyRead();
+
+ private:
+  int32_t length;
+  std::streamsize successfully_read;
+};
+
+class ReadBlockCtrlResponse : public CtrlResponse<ReadBlockResponse> {
+ public:
+  ReadBlockCtrlResponse(uint8_t *dst);
+  Status Read(ExtendedTCPClient *client) override;
+
+ private:
+  uint8_t *dst;
+};
+
+class WriteBlockRequest : public StreamCtrlRequest {
+ public:
+  WriteBlockRequest(int64_t stream_id, const uint8_t *data, int32_t length);
+  Status Write(ExtendedTCPClient *client) const override;
+
+ private:
+  const uint8_t *data;
+};
+
+class RenameRequest : public PathCtrlRequest {
+ public:
+  RenameRequest(const string &user_name, const string &path,
+                const string &destination_path);
+};
+
+class RenameResponse {
+ public:
+  Status Read(ExtendedTCPClient *client);
+
+  bool successful;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_MESSAGES_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4c898f14e6d298e65f563f4493a822172c40851
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.cc
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
+
+namespace tensorflow {
+
+IGFSRandomAccessFile::IGFSRandomAccessFile(const string &file_name,
+                                           int64_t resource_id,
+                                           std::unique_ptr<IGFSClient> &&client)
+    : file_name_(file_name),
+      resource_id_(resource_id),
+      client_(std::move(client)) {}
+
+IGFSRandomAccessFile::~IGFSRandomAccessFile() {
+  CtrlResponse<CloseResponse> close_response = {false};
+  Status status = client_->Close(&close_response, resource_id_);
+
+  if (!status.ok()) LOG(ERROR) << status.ToString();
+}
+
+Status IGFSRandomAccessFile::Read(uint64 offset, size_t n, StringPiece *result,
+                                  char *scratch) const {
+  ReadBlockCtrlResponse response = ReadBlockCtrlResponse((uint8_t *)scratch);
+  TF_RETURN_IF_ERROR(client_->ReadBlock(&response, resource_id_, offset, n));
+
+  std::streamsize sz = response.res.GetSuccessfullyRead();
+  if (sz == 0) return errors::OutOfRange("End of file");
+
+  *result = StringPiece(scratch, sz);
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h
new file mode 100644
index 0000000000000000000000000000000000000000..b21369ff8a3b19774bcc743f93a5ec4ae1c9b49a
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_random_access_file.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_RANDOM_ACCESS_FILE_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_RANDOM_ACCESS_FILE_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+class IGFSRandomAccessFile : public RandomAccessFile {
+ public:
+  IGFSRandomAccessFile(const string &file_name, int64_t resource_id,
+                       std::unique_ptr<IGFSClient> &&client);
+  ~IGFSRandomAccessFile() override;
+  Status Read(uint64 offset, size_t n, StringPiece *result,
+              char *scratch) const override;
+
+ private:
+  const string file_name_;
+  const int64_t resource_id_;
+  std::unique_ptr<IGFSClient> client_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_RANDOM_ACCESS_FILE_H_
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c15ecb7deeb0cf5a8a040e0d1e4b70c732729474
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h"
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_messages.h"
+
+namespace tensorflow {
+
+IGFSWritableFile::IGFSWritableFile(const string &file_name, int64_t resource_id,
+                                   std::unique_ptr<IGFSClient> &&client)
+    : file_name_(file_name),
+      resource_id_(resource_id),
+      client_(std::move(client)) {}
+
+IGFSWritableFile::~IGFSWritableFile() {
+  if (resource_id_ >= 0) {
+    CtrlResponse<CloseResponse> close_response = {false};
+
+    Status status = client_->Close(&close_response, resource_id_);
+    if (!status.ok()) LOG(ERROR) << status.ToString();
+  }
+}
+
+Status IGFSWritableFile::Append(StringPiece data) {
+  return client_->WriteBlock(resource_id_, (uint8_t *)data.data(), data.size());
+}
+
+Status IGFSWritableFile::Close() {
+  int64_t resource_to_be_closed = resource_id_;
+  resource_id_ = -1;
+
+  CtrlResponse<CloseResponse> close_response = {false};
+  return client_->Close(&close_response, resource_to_be_closed);
+}
+
+Status IGFSWritableFile::Flush() { return Sync(); }
+
+Status IGFSWritableFile::Sync() {
+  CtrlResponse<CloseResponse> close_response = {false};
+  TF_RETURN_IF_ERROR(client_->Close(&close_response, resource_id_));
+
+  CtrlResponse<OpenAppendResponse> open_append_resp(false);
+  TF_RETURN_IF_ERROR(client_->OpenAppend(&open_append_resp, file_name_));
+
+  resource_id_ = open_append_resp.res.stream_id;
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h
new file mode 100644
index 0000000000000000000000000000000000000000..b406db17e0e350e2cef610bb05c40f658e100140
--- /dev/null
+++ b/tensorflow/contrib/ignite/kernels/igfs/igfs_writable_file.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_WRITABLE_FILE_H_
+#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_WRITABLE_FILE_H_
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs_client.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+
+class IGFSWritableFile : public WritableFile {
+ public:
+  IGFSWritableFile(const string &file_name, int64_t resource_id,
+                   std::unique_ptr<IGFSClient> &&client);
+  ~IGFSWritableFile() override;
+  Status Append(StringPiece data) override;
+  Status Close() override;
+  Status Flush() override;
+  Status Sync() override;
+
+ private:
+  const string file_name_;
+  int64_t resource_id_;
+  std::unique_ptr<IGFSClient> client_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGFS_IGFS_WRITABLE_FILE_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h b/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
deleted file mode 100644
index 46df3e39dc4ec6dd4ef5730a184264eaa9fc5872..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
-
-#include <stdint.h>
-#include "tensorflow/core/platform/byte_order.h"
-
-namespace tensorflow {
-
-class ByteSwapper {
- public:
-  ByteSwapper(bool big_endian) { swap_ = big_endian == port::kLittleEndian; }
-
-  inline void SwapIfRequiredInt16(int16_t *x) const {
-    if (swap_) {
-      Swap16(x);
-    }
-  }
-
-  inline void SwapIfRequiredUnsignedInt16(uint16_t *x) const {
-    if (swap_) {
-      Swap16(reinterpret_cast<int16_t *>(x));
-    }
-  }
-
-  inline void SwapIfRequiredInt32(int32_t *x) const {
-    if (swap_) {
-      Swap32(x);
-    }
-  }
-
-  inline void SwapIfRequiredFloat(float *x) const {
-    if (swap_) {
-      Swap32(reinterpret_cast<int32_t *>(x));
-    }
-  }
-
-  inline void SwapIfRequiredInt64(int64_t *x) const {
-    if (swap_) {
-      Swap64(x);
-    }
-  }
-
-  inline void SwapIfRequiredDouble(double *x) const {
-    if (swap_) {
-      Swap64(reinterpret_cast<int64_t *>(x));
-    }
-  }
-
-  inline void SwapIfRequiredInt16Arr(int16_t *x, int32_t length) const {
-    if (swap_) {
-      for (int32_t i = 0; i < length; i++) Swap16(&x[i]);
-    }
-  }
-
-  inline void SwapIfRequiredUnsignedInt16Arr(uint16_t *x,
-                                             int32_t length) const {
-    if (swap_) {
-      for (int32_t i = 0; i < length; i++)
-        Swap16(reinterpret_cast<int16_t *>(&x[i]));
-    }
-  }
-
-  inline void SwapIfRequiredInt32Arr(int32_t *x, int32_t length) const {
-    if (swap_) {
-      for (int32_t i = 0; i < length; i++) Swap32(&x[i]);
-    }
-  }
-
-  inline void SwapIfRequiredFloatArr(float *x, int32_t length) const {
-    if (swap_) {
-      for (int32_t i = 0; i < length; i++)
-        Swap32(reinterpret_cast<int32_t *>(&x[i]));
-    }
-  }
-
-  inline void SwapIfRequiredInt64Arr(int64_t *x, int32_t length) const {
-    if (swap_) {
-      for (int32_t i = 0; i < length; i++) Swap64(&x[i]);
-    }
-  }
-
-  inline void SwapIfRequiredDoubleArr(double *x, int32_t length) const {
-    if (swap_) {
-      for (int32_t i = 0; i < length; i++)
-        Swap64(reinterpret_cast<int64_t *>(&x[i]));
-    }
-  }
-
- private:
-  inline void Swap16(int16_t *x) const {
-    *x = ((*x & 0xFF) << 8) | ((*x >> 8) & 0xFF);
-  }
-
-  inline void Swap32(int32_t *x) const {
-    *x = ((*x & 0xFF) << 24) | (((*x >> 8) & 0xFF) << 16) |
-         (((*x >> 16) & 0xFF) << 8) | ((*x >> 24) & 0xFF);
-  }
-
-  inline void Swap64(int64_t *x) const {
-    *x = ((*x & 0xFF) << 56) | (((*x >> 8) & 0xFF) << 48) |
-         (((*x >> 16) & 0xFF) << 40) | (((*x >> 24) & 0xFF) << 32) |
-         (((*x >> 32) & 0xFF) << 24) | (((*x >> 40) & 0xFF) << 16) |
-         (((*x >> 48) & 0xFF) << 8) | ((*x >> 56) & 0xFF);
-  }
-
-  bool swap_;
-};
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_BYTE_SWAPPER_H_
diff --git a/tensorflow/contrib/ignite/kernels/ignite_client.h b/tensorflow/contrib/ignite/kernels/ignite_client.h
deleted file mode 100644
index 459b50b48fd95ad105bccaca4076160e0ef152ee..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/ignite/kernels/ignite_client.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
-#define TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
-
-#include "tensorflow/contrib/ignite/kernels/ignite_byte_swapper.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-
-namespace tensorflow {
-
-class Client {
- public:
-  Client(bool big_endian) : byte_swapper_(ByteSwapper(big_endian)) {}
-  virtual Status Connect() = 0;
-  virtual Status Disconnect() = 0;
-  virtual bool IsConnected() = 0;
-  virtual int GetSocketDescriptor() = 0;
-  virtual Status ReadData(uint8_t *buf, const int32_t length) = 0;
-  virtual Status WriteData(const uint8_t *buf, const int32_t length) = 0;
-
-  inline Status ReadByte(uint8_t *data) { return ReadData(data, 1); }
-
-  inline Status ReadShort(int16_t *data) {
-    TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 2));
-    byte_swapper_.SwapIfRequiredInt16(data);
-
-    return Status::OK();
-  }
-
-  inline Status ReadInt(int32_t *data) {
-    TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 4));
-    byte_swapper_.SwapIfRequiredInt32(data);
-
-    return Status::OK();
-  }
-
-  inline Status ReadLong(int64_t *data) {
-    TF_RETURN_IF_ERROR(ReadData((uint8_t *)data, 8));
-    byte_swapper_.SwapIfRequiredInt64(data);
-
-    return Status::OK();
-  }
-
-  inline Status WriteByte(const uint8_t data) { return WriteData(&data, 1); }
-
-  inline Status WriteShort(const int16_t data) {
-    int16_t tmp = data;
-    byte_swapper_.SwapIfRequiredInt16(&tmp);
-    return WriteData((uint8_t *)&tmp, 2);
-  }
-
-  inline Status WriteInt(const int32_t data) {
-    int32_t tmp = data;
-    byte_swapper_.SwapIfRequiredInt32(&tmp);
-    return WriteData((uint8_t *)&tmp, 4);
-  }
-
-  inline Status WriteLong(const int64_t data) {
-    int64_t tmp = data;
-    byte_swapper_.SwapIfRequiredInt64(&tmp);
-    return WriteData((uint8_t *)&tmp, 8);
-  }
-
- private:
-  const ByteSwapper byte_swapper_;
-};
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CONTRIB_IGNITE_KERNELS_IGNITE_CLIENT_H_
diff --git a/tensorflow/contrib/ignite/ops/igfs_ops.cc b/tensorflow/contrib/ignite/ops/igfs_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..473bddff08b339d3b76a33d40fe34486acdbe151
--- /dev/null
+++ b/tensorflow/contrib/ignite/ops/igfs_ops.cc
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/env.h"
+
+#include "tensorflow/contrib/ignite/kernels/igfs/igfs.h"
+
+namespace tensorflow {
+
+REGISTER_FILE_SYSTEM("igfs", IGFS);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py b/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e1d6707d6400a7cd84016150d20973809aca20e
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/ops/igfs_op_loader.py
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python helper for loading IGFS ops and kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_dataset_ops = loader.load_op_library(
+    resource_loader.get_path_to_datafile("../../_ignite_ops.so"))
diff --git a/tensorflow/contrib/ignite/python/ops/igfs_ops.py b/tensorflow/contrib/ignite/python/ops/igfs_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..12b973b707730f6ba5b057b74a46b27d8f973ede
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/ops/igfs_ops.py
@@ -0,0 +1,40 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ignite File System for checkpointing and communication with TensorBoard.
+
+Apache Ignite is a memory-centric distributed database, caching, and
+processing platform for transactional, analytical, and streaming workloads,
+delivering in-memory speeds at petabyte scale. In addition to database
+functionality Apache Ignite provides a distributed file system called
+IGFS (https://ignite.apache.org/features/igfs.html). IGFS delivers a similar
+functionality to Hadoop HDFS, but only in-memory. In fact, in addition to
+its own APIs, IGFS implements Hadoop FileSystem API and can be transparently
+plugged into Hadoop or Spark deployments. This contrib package contains an
+integration between IGFS and TensorFlow.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.ignite.python.ops import ignite_op_loader  # pylint: disable=unused-import
+from tensorflow.python.framework import load_library
+from tensorflow.python.platform import resource_loader
+
+file_system_library = os.path.join(resource_loader.get_data_files_path(),
+                                   "../../_ignite_ops.so")
+load_library.load_file_system_library(file_system_library)
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index 288d4853207176b215cd8a0cdcbfb2de5791ecb8..e4762c91b193f9c5e32fa2642e702e61e8e5e57f 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -22,19 +22,20 @@ import socket
 import ssl
 import struct
 
+import six
+
 from tensorflow.contrib.ignite.python.ops import gen_dataset_ops
 from tensorflow.contrib.ignite.python.ops import ignite_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 
 
+@six.add_metaclass(abc.ABCMeta)
 class Readable(object):
-  """Readable abstract class that exposes methods to do reading-related
-
-     operations.
-  """
+  """Abstract class that exposes methods to do reading-related operations."""
 
   @abc.abstractmethod
   def __init__(self):
@@ -224,10 +225,7 @@ types = {
 
 
 class TypeTreeNode(object):
-  """TypeTreeNode class exposes methods to format object tree structure
-
-     data.
-  """
+  """TypeTreeNode class exposes methods to format object tree structure data."""
 
   def __init__(self, name, type_id, fields=None, permutation=None):
     """Constructs a new instance of TypeTreeNode.
@@ -689,14 +687,14 @@ class IgniteClient(TcpClient):
 
 
 class IgniteDataset(dataset_ops.DatasetSource):
-  """Apache Ignite is a memory-centric distributed database, caching, and
-
-     processing platform for transactional, analytical, and streaming workloads,
-     delivering in-memory speeds at petabyte scale. This contrib package
-     contains an integration between Apache Ignite and TensorFlow. The
-     integration is based on tf.data from TensorFlow side and Binary Client
-     Protocol from Apache Ignite side. It allows to use Apache Ignite as a
-     datasource for neural network training, inference and all other
+  """Apache Ignite is a memory-centric distributed database.
+
+     It acts as a caching and processing platform for transactional, analytical,
+     and streaming workloads, delivering in-memory speeds at petabyte scale.
+     This contrib package contains an integration between Apache Ignite and
+     TensorFlow. The integration is based on tf.data from TensorFlow side and
+     Binary Client Protocol from Apache Ignite side. It allows to use Apache
+     Ignite as a datasource for neural network training, inference and all other
      computations supported by TensorFlow. Ignite Dataset is based on Apache
      Ignite Binary Client Protocol.
   """
@@ -753,6 +751,9 @@ class IgniteDataset(dataset_ops.DatasetSource):
         self.cache_type.to_permutation(),
         dtype=dtypes.int32,
         name="permutation")
+    self._structure = structure.convert_legacy_structure(
+        self.cache_type.to_output_types(), self.cache_type.to_output_shapes(),
+        self.cache_type.to_output_classes())
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.ignite_dataset(self.cache_name, self.host, self.port,
@@ -760,13 +761,5 @@ class IgniteDataset(dataset_ops.DatasetSource):
                                           self.schema, self.permutation)
 
   @property
-  def output_classes(self):
-    return self.cache_type.to_output_classes()
-
-  @property
-  def output_shapes(self):
-    return self.cache_type.to_output_shapes()
-
-  @property
-  def output_types(self):
-    return self.cache_type.to_output_types()
+  def _element_structure(self):
+    return self._structure
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py
index c9af7386cf0a26ed1a950130aa36caa7fb831fd0..e450e2d84ba31a7de925fdb78fc972a592c6ad8c 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_op_loader.py
@@ -21,4 +21,4 @@ from tensorflow.contrib.util import loader
 from tensorflow.python.platform import resource_loader
 
 _dataset_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile("../../_dataset_ops.so"))
+    resource_loader.get_path_to_datafile("../../_ignite_ops.so"))
diff --git a/tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh b/tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5e39e16c05290f6b5786421670c69a3bd1e27add
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/bin/start-igfs.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+nohup apache-ignite-fabric/bin/ignite.sh /data/config/ignite-config-igfs.xml &
+sleep 5 # Wait Apache Ignite to be started
+
+tail -f nohup.out
diff --git a/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml b/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml
new file mode 100644
index 0000000000000000000000000000000000000000..5d81bf33226cad0d5cc0ea1fb5c5b55672494976
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/config/ignite-config-igfs.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<beans xmlns="http://www.springframework.org/schema/beans"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xmlns:util="http://www.springframework.org/schema/util"
+       xsi:schemaLocation="http://www.springframework.org/schema/beans
+       http://www.springframework.org/schema/beans/spring-beans.xsd
+       http://www.springframework.org/schema/util
+       http://www.springframework.org/schema/util/spring-util.xsd">
+
+  <bean class="org.apache.ignite.configuration.IgniteConfiguration">
+    <property name="fileSystemConfiguration">
+      <bean class="org.apache.ignite.configuration.FileSystemConfiguration">
+        <!-- Distinguished file system name. -->
+        <property name="name" value="default_fs"/>
+        <property name="managementPort" value="9000"/>
+        <property name="ipcEndpointEnabled" value="true"/>
+        <property name="defaultMode" value="PRIMARY"/>
+        <property name="ipcEndpointConfiguration">
+          <bean class="org.apache.ignite.igfs.IgfsIpcEndpointConfiguration">
+            <property name="host" value="" />
+            <property name="port" value="10500"/>
+            <property name="type" value="TCP"/>
+          </bean>
+        </property>
+      </bean>
+    </property>
+    <property name="discoverySpi">
+      <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
+        <property name="ipFinder">
+          <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder">
+            <property name="addresses">
+              <list>
+                <value>127.0.0.1</value>
+              </list>
+            </property>
+          </bean>
+        </property>
+      </bean>
+    </property>
+  </bean>
+
+</beans>
diff --git a/tensorflow/contrib/ignite/python/tests/igfs_test.py b/tensorflow/contrib/ignite/python/tests/igfs_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cacfc568942e20200b7daf10599dde513a4a0a68
--- /dev/null
+++ b/tensorflow/contrib/ignite/python/tests/igfs_test.py
@@ -0,0 +1,215 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for IGFS."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.contrib.ignite.python.ops.igfs_ops  # pylint: disable=unused-import
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+class IGFSTest(test.TestCase):
+  """The Apache Ignite servers have to setup before the test and tear down
+
+     after the test manually. The docker engine has to be installed.
+
+     To setup Apache Ignite servers:
+     $ bash start_ignite.sh
+
+     To tear down Apache Ignite servers:
+     $ bash stop_ignite.sh
+  """
+
+  def test_create_file(self):
+    """Test create file.
+
+    """
+    # Setup and check preconditions.
+    file_name = "igfs:///test_create_file/1"
+    self.assertFalse(gfile.Exists(file_name))
+    # Create file.
+    with gfile.Open(file_name, mode="w") as w:
+      w.write("")
+    # Check that file was created.
+    self.assertTrue(gfile.Exists(file_name))
+
+  def test_write_read_file(self):
+    """Test write/read file.
+
+    """
+    # Setup and check preconditions.
+    file_name = "igfs:///test_write_read_file/1"
+    rows = 10000
+    self.assertFalse(gfile.Exists(file_name))
+    # Write data.
+    with gfile.Open(file_name, mode="w") as w:
+      for i in range(rows):
+        w.write("This is row\n")
+    # Read data.
+    with gfile.Open(file_name, mode="r") as r:
+      lines = r.readlines()
+    # Check that data is equal.
+    self.assertEqual(rows, len(lines))
+    for i in range(rows):
+      self.assertEqual("This is row\n", lines[i])
+
+  def test_delete_recursively(self):
+    """Test delete recursively.
+
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_delete_recursively/"
+    file_name = "igfs:///test_delete_recursively/1"
+    self.assertFalse(gfile.Exists(dir_name))
+    self.assertFalse(gfile.Exists(file_name))
+    gfile.MkDir(dir_name)
+    with gfile.Open(file_name, mode="w") as w:
+      w.write("")
+    self.assertTrue(gfile.Exists(dir_name))
+    self.assertTrue(gfile.Exists(file_name))
+    # Delete directory recursively.
+    gfile.DeleteRecursively(dir_name)
+    # Check that directory was deleted.
+    self.assertFalse(gfile.Exists(dir_name))
+    self.assertFalse(gfile.Exists(file_name))
+
+  def test_copy(self):
+    """Test copy.
+
+    """
+    # Setup and check preconditions.
+    src_file_name = "igfs:///test_copy/1"
+    dst_file_name = "igfs:///test_copy/2"
+    self.assertFalse(gfile.Exists(src_file_name))
+    self.assertFalse(gfile.Exists(dst_file_name))
+    with gfile.Open(src_file_name, mode="w") as w:
+      w.write("42")
+    self.assertTrue(gfile.Exists(src_file_name))
+    self.assertFalse(gfile.Exists(dst_file_name))
+    # Copy file.
+    gfile.Copy(src_file_name, dst_file_name)
+    # Check that files are identical.
+    self.assertTrue(gfile.Exists(src_file_name))
+    self.assertTrue(gfile.Exists(dst_file_name))
+    with gfile.Open(dst_file_name, mode="r") as r:
+      data = r.read()
+    self.assertEqual("42", data)
+
+  def test_is_directory(self):
+    """Test is directory.
+
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_is_directory/1"
+    file_name = "igfs:///test_is_directory/2"
+    with gfile.Open(file_name, mode="w") as w:
+      w.write("")
+    gfile.MkDir(dir_name)
+    # Check that directory is a directory.
+    self.assertTrue(gfile.IsDirectory(dir_name))
+    # Check that file is not a directory.
+    self.assertFalse(gfile.IsDirectory(file_name))
+
+  def test_list_directory(self):
+    """Test list directory.
+
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_list_directory/"
+    file_names = [
+        "igfs:///test_list_directory/1", "igfs:///test_list_directory/2/3"
+    ]
+    ch_dir_names = [
+        "igfs:///test_list_directory/4",
+    ]
+    for file_name in file_names:
+      with gfile.Open(file_name, mode="w") as w:
+        w.write("")
+    for ch_dir_name in ch_dir_names:
+      gfile.MkDir(ch_dir_name)
+    ls_expected_result = file_names + ch_dir_names
+    # Get list of files in directory.
+    ls_result = gfile.ListDirectory(dir_name)
+    # Check that list of files is correct.
+    self.assertEqual(len(ls_expected_result), len(ls_result))
+    for e in ["1", "2", "4"]:
+      self.assertTrue(e in ls_result)
+
+  def test_make_dirs(self):
+    """Test make dirs.
+
+    """
+    # Setup and check preconditions.
+    dir_name = "igfs:///test_make_dirs/"
+    self.assertFalse(gfile.Exists(dir_name))
+    # Make directory.
+    gfile.MkDir(dir_name)
+    # Check that directory was created.
+    self.assertTrue(gfile.Exists(dir_name))
+
+  def test_remove(self):
+    """Test remove.
+
+    """
+    # Setup and check preconditions.
+    file_name = "igfs:///test_remove/1"
+    self.assertFalse(gfile.Exists(file_name))
+    with gfile.Open(file_name, mode="w") as w:
+      w.write("")
+    self.assertTrue(gfile.Exists(file_name))
+    # Remove file.
+    gfile.Remove(file_name)
+    # Check that file was removed.
+    self.assertFalse(gfile.Exists(file_name))
+
+  def test_rename_file(self):
+    """Test rename file.
+
+    """
+    # Setup and check preconditions.
+    src_file_name = "igfs:///test_rename_file/1"
+    dst_file_name = "igfs:///test_rename_file/2"
+    with gfile.Open(src_file_name, mode="w") as w:
+      w.write("42")
+    self.assertTrue(gfile.Exists(src_file_name))
+    # Rename file.
+    gfile.Rename(src_file_name, dst_file_name)
+    # Check that only new name of file is available.
+    self.assertFalse(gfile.Exists(src_file_name))
+    self.assertTrue(gfile.Exists(dst_file_name))
+    with gfile.Open(dst_file_name, mode="r") as r:
+      data = r.read()
+    self.assertEqual("42", data)
+
+  def test_rename_dir(self):
+    """Test rename dir.
+
+    """
+    # Setup and check preconditions.
+    src_dir_name = "igfs:///test_rename_dir/1"
+    dst_dir_name = "igfs:///test_rename_dir/2"
+    gfile.MkDir(src_dir_name)
+    # Rename directory.
+    gfile.Rename(src_dir_name, dst_dir_name)
+    # Check that only new name of directory is available.
+    self.assertFalse(gfile.Exists(src_dir_name))
+    self.assertTrue(gfile.Exists(dst_dir_name))
+    self.assertTrue(gfile.IsDirectory(dst_dir_name))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
index ef29b5f14a4b2fea2400ec4d56a7ad2cf44cf2cb..ff5d4c458c859fd8e5e3ae65ee41a454d55d6538 100644
--- a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
+++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
@@ -21,6 +21,7 @@ import os
 
 from tensorflow.contrib.ignite import IgniteDataset
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
@@ -65,7 +66,7 @@ class IgniteDatasetTest(test.TestCase):
     self.assertEqual(dtypes.string, dataset.output_types["val"]["NAME"])
     self.assertEqual(dtypes.int64, dataset.output_types["val"]["VAL"])
 
-    it = dataset.make_one_shot_iterator()
+    it = dataset_ops.make_one_shot_iterator(dataset)
     ne = it.get_next()
 
     with session.Session() as sess:
diff --git a/tensorflow/contrib/ignite/python/tests/start_ignite.sh b/tensorflow/contrib/ignite/python/tests/start_ignite.sh
index a67bd44f2fb0d654ba07f022a5070c68df8e2ede..112e0dea844620de600e277bff3685dd7c42c49c 100755
--- a/tensorflow/contrib/ignite/python/tests/start_ignite.sh
+++ b/tensorflow/contrib/ignite/python/tests/start_ignite.sh
@@ -20,3 +20,7 @@ SCRIPT_PATH="$( cd "$(dirname "$0")" ; pwd -P )"
 # Start Apache Ignite with plain client listener.
 docker run -itd --name ignite-plain -p 42300:10800 \
 -v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-plain.sh
+
+# Start Apache Ignite with IGFS.
+docker run -itd --name ignite-igfs -p 10500:10500 \
+-v ${SCRIPT_PATH}:/data apacheignite/ignite:${IGNITE_VERSION} /data/bin/start-igfs.sh
\ No newline at end of file
diff --git a/tensorflow/contrib/ignite/python/tests/stop_ignite.sh b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
index 8f03dbd1ede61f548d3de9d9738f97667e75df3c..35b0f32d1b3e1373a231ff23f2b40c8ccc417baf 100755
--- a/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
+++ b/tensorflow/contrib/ignite/python/tests/stop_ignite.sh
@@ -15,5 +15,4 @@
 # ==============================================================================
 
 docker rm -f ignite-plain
-docker rm -f ignite-ssl
-docker rm -f ignite-ssl-auth
+docker rm -f ignite-igfs
\ No newline at end of file
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
index da450480b30b548484e69c61c85667d6dd390417..c9d917fe20dbcef1aa4a8dae3db935bcef73b281 100755
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -49,6 +49,7 @@ tf_kernel_library(
         "kernels/image_ops.h",
     ],
     deps = [
+        ":image_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
@@ -74,7 +75,6 @@ tf_custom_op_py_library(
     dso = [":python/ops/_image_ops.so"],
     kernels = [
         ":image_ops_kernels",
-        ":image_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -128,6 +128,26 @@ tf_custom_op_library(
     ],
 )
 
+tf_kernel_library(
+    name = "distort_image_ops_kernels",
+    srcs = [
+        "kernels/adjust_hsv_in_yiq_op.cc",
+        "kernels/adjust_hsv_in_yiq_op.h",
+    ],
+    gpu_srcs = [
+        "kernels/adjust_hsv_in_yiq_op_gpu.cu.cc",
+        "kernels/adjust_hsv_in_yiq_op.h",
+    ],
+    deps = [
+        ":distort_image_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels:gpu_util_hdrs",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
 tf_cc_test(
     name = "adjust_hsv_in_yiq_op_test",
     size = "small",
@@ -155,13 +175,16 @@ tf_gen_op_wrapper_py(
     deps = [":distort_image_ops_op_lib"],
 )
 
-py_library(
+tf_custom_op_py_library(
     name = "distort_image_py",
     srcs = [
         "__init__.py",
         "python/ops/distort_image_ops.py",
     ],
-    data = [":python/ops/_distort_image_ops.so"],
+    dso = [":python/ops/_distort_image_ops.so"],
+    kernels = [
+        ":distort_image_ops_kernels",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":distort_image_ops",
@@ -338,25 +361,36 @@ tf_gen_op_libs(
     op_lib_names = ["single_image_random_dot_stereograms_ops"],
 )
 
+tf_kernel_library(
+    name = "single_image_random_dot_stereograms_kernels",
+    srcs = [
+        "kernels/single_image_random_dot_stereograms_ops.cc",
+    ],
+    deps = [
+        ":single_image_random_dot_stereograms_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_gen_op_wrapper_py(
     name = "single_image_random_dot_stereograms_ops",
     deps = [":single_image_random_dot_stereograms_ops_op_lib"],
 )
 
-cc_library(
+alias(
     name = "image_ops_cc",
-    srcs = ["ops/image_ops.cc"],
-    deps = [
-        ":image_ops_kernels",
-        "//tensorflow/core:framework",
-    ],
-    alwayslink = 1,
+    actual = ":image_ops_op_lib",
 )
 
-py_library(
+tf_custom_op_py_library(
     name = "single_image_random_dot_stereograms_py",
     srcs = glob(["python/ops/single*.py"]) + ["__init__.py"],
-    data = [":python/ops/_single_image_random_dot_stereograms.so"],
+    dso = [":python/ops/_single_image_random_dot_stereograms.so"],
+    kernels = [
+        ":single_image_random_dot_stereograms_kernels",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":image_py",
diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py
index f230d93da4a9c01e8dee47aa258d9c28499469f1..91b8e8d0f93c5ac6af0e1863ab309eb97525f6a0 100755
--- a/tensorflow/contrib/image/__init__.py
+++ b/tensorflow/contrib/image/__init__.py
@@ -58,6 +58,7 @@ from tensorflow.contrib.image.python.ops.distort_image_ops import adjust_hsv_in_
 from tensorflow.contrib.image.python.ops.distort_image_ops import random_hsv_in_yiq
 
 from tensorflow.contrib.image.python.ops.image_ops import angles_to_projective_transforms
+from tensorflow.contrib.image.python.ops.image_ops import bipartite_match
 from tensorflow.contrib.image.python.ops.image_ops import compose_transforms
 from tensorflow.contrib.image.python.ops.image_ops import connected_components
 from tensorflow.contrib.image.python.ops.image_ops import flat_transforms_to_matrices
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
index 478b716d88321101c971789f36c0ff8ecd3f418e..108da04494685f06f9afc26a26a5dadcdd99b0ff 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
@@ -115,7 +115,7 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, &tranformation_matrix](
+          [&input_data, &output_data, &tranformation_matrix](
               int64 start_channel, int64 end_channel) {
             // Applying projection matrix to input RGB vectors.
             const float* p = input_data.data() + start_channel * kChannelSize;
diff --git a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
index 24b790977dfdb675ff7bf0a119a08e243a30d3aa..ae9c7a611945e1445c933d74b9944054b3f0e0a4 100644
--- a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
@@ -24,7 +24,7 @@ from tensorflow.contrib.image.python.ops import dense_image_warp
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
@@ -259,7 +259,7 @@ class DenseImageWarpTest(test_util.TensorFlowTestCase):
 
     shape = [1, 2, 1, 1]
     msg = 'Should have raised an exception for invalid image size'
-    with self.assertRaises(ValueError, msg=msg):
+    with self.assertRaises(errors.InvalidArgumentError, msg=msg):
       self.check_interpolation_correctness(shape, 'float32', 'float32')
 
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index 4997c31a7fc7f4243d03b22fc9c01fb13a2a25a4..ba5cdfebf92c07e496ed588848d5859ff6a5bff2 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -281,6 +281,13 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
             value.eval(),
             np.array([[4, 4], [4, 4]]).astype(dtype.as_numpy_dtype()))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_transform_eager(self):
+    image = constant_op.constant([[1., 2.], [3., 4.]])
+    value = image_ops.transform(image, [1] * 8)
+    with self.test_session(use_gpu=True):
+      self.assertAllEqual(self.evaluate(value), np.array([[4, 4], [4, 4]]))
+
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/contrib/image/python/ops/dense_image_warp.py b/tensorflow/contrib/image/python/ops/dense_image_warp.py
index f9b219ada492466919c615d8978e462e6c619d33..f7ced440720209cb05dfcd79395c51517f9de0d5 100644
--- a/tensorflow/contrib/image/python/ops/dense_image_warp.py
+++ b/tensorflow/contrib/image/python/ops/dense_image_warp.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -60,28 +61,38 @@ def _interpolate_bilinear(grid,
       msg = 'Grid must be 4 dimensional. Received size: '
       raise ValueError(msg + str(grid.get_shape()))
 
-    batch_size, height, width, channels = shape
+    batch_size, height, width, channels = (array_ops.shape(grid)[0],
+                                           array_ops.shape(grid)[1],
+                                           array_ops.shape(grid)[2],
+                                           array_ops.shape(grid)[3])
+
+    shape = [batch_size, height, width, channels]
     query_type = query_points.dtype
     grid_type = grid.dtype
 
-    if (len(query_points.get_shape()) != 3 or
-        query_points.get_shape()[2].value != 2):
-      msg = ('Query points must be 3 dimensional and size 2 in dim 2. Received '
-             'size: ')
-      raise ValueError(msg + str(query_points.get_shape()))
-
-    _, num_queries, _ = query_points.get_shape().as_list()
-
-    if height < 2 or width < 2:
-      msg = 'Grid must be at least batch_size x 2 x 2 in size. Received size: '
-      raise ValueError(msg + str(grid.get_shape()))
-
-    alphas = []
-    floors = []
-    ceils = []
-
-    index_order = [0, 1] if indexing == 'ij' else [1, 0]
-    unstacked_query_points = array_ops.unstack(query_points, axis=2)
+    with ops.control_dependencies([
+        check_ops.assert_equal(
+            len(query_points.get_shape()),
+            3,
+            message='Query points must be 3 dimensional.'),
+        check_ops.assert_equal(
+            array_ops.shape(query_points)[2],
+            2,
+            message='Query points must be size 2 in dim 2.')
+    ]):
+      num_queries = array_ops.shape(query_points)[1]
+
+    with ops.control_dependencies([
+        check_ops.assert_greater_equal(
+            height, 2, message='Grid height must be at least 2.'),
+        check_ops.assert_greater_equal(
+            width, 2, message='Grid width must be at least 2.')
+    ]):
+      alphas = []
+      floors = []
+      ceils = []
+      index_order = [0, 1] if indexing == 'ij' else [1, 0]
+      unstacked_query_points = array_ops.unstack(query_points, axis=2)
 
     for dim in index_order:
       with ops.name_scope('dim-' + str(dim)):
@@ -112,16 +123,18 @@ def _interpolate_bilinear(grid,
         alpha = array_ops.expand_dims(alpha, 2)
         alphas.append(alpha)
 
-    if batch_size * height * width > np.iinfo(np.int32).max / 8:
-      error_msg = """The image size or batch size is sufficiently large
-                     that the linearized addresses used by array_ops.gather
-                     may exceed the int32 limit."""
-      raise ValueError(error_msg)
-
-    flattened_grid = array_ops.reshape(grid,
-                                       [batch_size * height * width, channels])
-    batch_offsets = array_ops.reshape(
-        math_ops.range(batch_size) * height * width, [batch_size, 1])
+    with ops.control_dependencies([
+        check_ops.assert_less_equal(
+            math_ops.cast(batch_size * height * width, dtype=dtypes.float32),
+            np.iinfo(np.int32).max / 8,
+            message="""The image size or batch size is sufficiently large
+                       that the linearized addresses used by array_ops.gather
+                       may exceed the int32 limit.""")
+    ]):
+      flattened_grid = array_ops.reshape(
+          grid, [batch_size * height * width, channels])
+      batch_offsets = array_ops.reshape(
+          math_ops.range(batch_size) * height * width, [batch_size, 1])
 
     # This wraps array_ops.gather. We reshape the image data such that the
     # batch, y, and x coordinates are pulled into the first dimension.
@@ -182,7 +195,11 @@ def dense_image_warp(image, flow, name='dense_image_warp'):
                 of dimensions.
   """
   with ops.name_scope(name):
-    batch_size, height, width, channels = image.get_shape().as_list()
+    batch_size, height, width, channels = (array_ops.shape(image)[0],
+                                           array_ops.shape(image)[1],
+                                           array_ops.shape(image)[2],
+                                           array_ops.shape(image)[3])
+
     # The flow is defined on the image grid. Turn the flow into a list of query
     # points in the grid space.
     grid_x, grid_y = array_ops.meshgrid(
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index d4fb99a017faebe30384d739f22f4ff5fa986bc4..b25a6f7b5742917a032946fe03a0dab20e7dc1ad 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.contrib.image.ops import gen_image_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import common_shapes
@@ -271,8 +272,11 @@ def transform(images,
       raise TypeError("Images should have rank between 2 and 4.")
 
     if output_shape is None:
-      output_shape = tensor_util.constant_value(
-          array_ops.shape(images)[1:3]) or array_ops.shape(images)[1:3]
+      output_shape = array_ops.shape(images)[1:3]
+      if not context.executing_eagerly():
+        output_shape_value = tensor_util.constant_value(output_shape)
+        if output_shape_value is not None:
+          output_shape = output_shape_value
 
     output_shape = ops.convert_to_tensor(
         output_shape, dtypes.int32, name="output_shape")
diff --git a/tensorflow/contrib/image/python/ops/interpolate_spline.py b/tensorflow/contrib/image/python/ops/interpolate_spline.py
index f0b408faa3320741cf83b3aaec0f40030f906578..3a444d26c2a45278261b684da4fa4ac249d0d5cd 100644
--- a/tensorflow/contrib/image/python/ops/interpolate_spline.py
+++ b/tensorflow/contrib/image/python/ops/interpolate_spline.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -100,12 +101,12 @@ def _solve_interpolation(train_points, train_values, order,
   b, n, _ = array_ops.unstack(array_ops.shape(train_points), num=3)
 
   d = train_points.shape[-1]
-  if d.value is None:
+  if tensor_shape.dimension_value(d) is None:
     raise ValueError('The dimensionality of the input points (d) must be '
                      'statically-inferrable.')
 
   k = train_values.shape[-1]
-  if k.value is None:
+  if tensor_shape.dimension_value(k) is None:
     raise ValueError('The dimensionality of the output values (k) must be '
                      'statically-inferrable.')
 
diff --git a/tensorflow/contrib/image/python/ops/sparse_image_warp.py b/tensorflow/contrib/image/python/ops/sparse_image_warp.py
index 1ea8f705b7e6f522281de6384de0d42efab6a406..51449ff5e938946844fd4245215008a257e8b045 100644
--- a/tensorflow/contrib/image/python/ops/sparse_image_warp.py
+++ b/tensorflow/contrib/image/python/ops/sparse_image_warp.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.image.python.ops import interpolate_spline
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 
 
@@ -76,7 +77,7 @@ def _add_zero_flow_controls_at_boundary(control_point_locations,
     merged_control_point_flows: augmented set of control point flows
   """
 
-  batch_size = control_point_locations.get_shape()[0].value
+  batch_size = tensor_shape.dimension_value(control_point_locations.shape[0])
 
   boundary_point_locations = _get_boundary_locations(image_height, image_width,
                                                      boundary_points_per_edge)
diff --git a/tensorflow/contrib/integrate/python/ops/odes.py b/tensorflow/contrib/integrate/python/ops/odes.py
index 7b7ac4f347e30d20eb2f4889e0cae5669c975e4f..b7d77130bd03ba05aca3ab94ccf94eb2ed5d9347 100644
--- a/tensorflow/contrib/integrate/python/ops/odes.py
+++ b/tensorflow/contrib/integrate/python/ops/odes.py
@@ -540,7 +540,8 @@ def odeint(func,
         **options)
 
 
-class _FixedGridIntegrator(six.with_metaclass(abc.ABCMeta)):
+@six.add_metaclass(abc.ABCMeta)
+class _FixedGridIntegrator(object):
   """Base class for fixed-grid ODE integrators."""
 
   def integrate(self, evol_func, y0, time_grid, dt_grid, steps_on_intervals):
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
index adf027b8e714124cde2b4618546e20c6b7162e1f..69553c3bd15c9359a6ab879bc4e104bd5c30beac 100644
--- a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
+++ b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
@@ -22,8 +22,12 @@ if [ "$#" -ne 2 ]; then
   exit 1
 fi
 
+action=$1
 container=$2
-if [ "$1" == "start" ]; then
+if [ "$action" == "start" ]; then
+    echo pull spotify/kafka
+    docker pull spotify/kafka
+    echo pull spotify/kafka successfully
     docker run -d --rm --net=host --name=$container spotify/kafka
     echo Wait 5 secs until kafka is up and running
     sleep 5
@@ -33,12 +37,10 @@ if [ "$1" == "start" ]; then
     docker exec $container bash -c 'echo -e "D0\nD1\nD2\nD3\nD4\nD5\nD6\nD7\nD8\nD9" > /test'
     echo Produce test message
     docker exec $container bash -c '/opt/kafka_2.11-0.10.1.0/bin/kafka-console-producer.sh --topic test --broker-list 127.0.0.1:9092 < /test'
-
     echo Container $container started successfully
-elif [ "$1" == "stop" ]; then
+elif [ "$action" == "stop" ]; then
     docker rm -f $container
-
-    echo Container $container stopped successfully
+    echo Container $container removed successfully
 else
   echo "Usage: $0 start|stop <kafka container name>" >&2
   exit 1
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
index 7129f09e8b42e48a9c768fd4a66cde3d4da9d31d..2b86331099ccae03664462987ee0c141d766c10f 100644
--- a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 from tensorflow.contrib.kafka.python.ops import gen_dataset_ops
 from tensorflow.contrib.kafka.python.ops import kafka_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 
 
 class KafkaDataset(dataset_ops.DatasetSource):
@@ -63,13 +63,5 @@ class KafkaDataset(dataset_ops.DatasetSource):
                                          self._group, self._eof, self._timeout)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
diff --git a/tensorflow/contrib/keras/api/keras/layers/__init__.py b/tensorflow/contrib/keras/api/keras/layers/__init__.py
index 3327a9f9a613bfb56e6a25af0fe1c0ca18609035..9e19884df852c0fd259a55aef56c62b4189cd1da 100644
--- a/tensorflow/contrib/keras/api/keras/layers/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/layers/__init__.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 # Generic layers.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
diff --git a/tensorflow/contrib/keras/api/keras/utils/__init__.py b/tensorflow/contrib/keras/api/keras/utils/__init__.py
index 47cd01b924fb43e8a83836c58f8ced61e9e88268..3b9fa1b230b837a350d521c4165053c187786201 100644
--- a/tensorflow/contrib/keras/api/keras/utils/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/utils/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.keras.utils.np_utils import normalize
 from tensorflow.python.keras.utils.np_utils import to_categorical
 from tensorflow.python.keras.utils.vis_utils import plot_model
diff --git a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py
index de7530231db4ea4f50996a67eb8c0d6936db9dd3..1626e55b9b3bc82bd96703bfab765ac6ad81f462 100644
--- a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py
+++ b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py
@@ -90,7 +90,7 @@ def _update_features_and_columns(features, feature_columns,
     mapped_column_name = column_name + "_MAPPED"
     # Construct new feature columns based on provided kernel_mappers.
     column_kernel_mappers = kernel_mappers_dict[feature_column]
-    new_dim = sum([mapper.output_dim for mapper in column_kernel_mappers])
+    new_dim = sum(mapper.output_dim for mapper in column_kernel_mappers)
     mapped_columns.add(
         layers.feature_column.real_valued_column(mapped_column_name, new_dim))
 
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/dense_kernel_mapper.py b/tensorflow/contrib/kernel_methods/python/mappers/dense_kernel_mapper.py
index db38b471520e1922392e7aaf8ee66d7f304248c9..04ecdbfdb6625766eb87c1527592e616c5cdfbf9 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/dense_kernel_mapper.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/dense_kernel_mapper.py
@@ -35,7 +35,6 @@ class DenseKernelMapper(object):
 
   This class is abstract. Users should not create instances of this class.
   """
-  __metaclass__ = abc.ABCMeta
 
   @abc.abstractmethod
   def map(self, input_tensor):
diff --git a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
index 75806dbbeb1819bb0a6965bbc384e02df9895210..20395395281768ac429984a1e3552cfd187527a2 100644
--- a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
+++ b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 from tensorflow.contrib.kinesis.python.ops import gen_dataset_ops
 from tensorflow.contrib.kinesis.python.ops import kinesis_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 
 
 class KinesisDataset(dataset_ops.DatasetSource):
@@ -34,15 +34,12 @@ class KinesisDataset(dataset_ops.DatasetSource):
 
   For example, we can construct and use the KinesisDataset as follows:
   ```python
+  tf.enable_eager_execution()
+
   dataset = tf.contrib.kinesis.KinesisDataset(
       "kinesis_stream_name", read_indefinitely=False)
-  next = dataset.make_one_shot_iterator().get_next()
-  with tf.Session() as sess:
-    while True:
-      try:
-        print(sess.run(nxt))
-      except tf.errors.OutOfRangeError:
-        break
+  for element in dataset:
+    print(element)
   ```
 
   Since Kinesis is a data streaming service, data may not be available
@@ -84,13 +81,5 @@ class KinesisDataset(dataset_ops.DatasetSource):
         self._stream, self._shard, self._read_indefinitely, self._interval)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index c8812d4b23f94102d093db878a709b090a3318d6..588f15b867c1fedbadd5a5d945d870a356549468 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -70,7 +70,10 @@ py_test(
         "python/ops/core_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "noasan",  # TODO(b/119323169)
+    ],
     deps = [
         ":_typecheck",
         ":core",
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index b4fe8cac74cb7d29b9646b6b968ccf37b3d6ea7a..9ca6f8df5dbe3c236c4cd85095176ce69ad9deaa 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -1,15 +1,16 @@
 # Description:
 #   contains parts of TensorFlow that are experimental or unstable and which are not supported.
 
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
 package(default_visibility = [
     "//learning/brain:__subpackages__",
     "//tensorflow:__subpackages__",
+    "//tensorflow_model_optimization:__subpackages__",
 ])
 
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 load("//tensorflow:tensorflow.bzl", "py_test")
@@ -77,6 +78,11 @@ tf_custom_op_py_library(
         ":sparse_feature_cross_op_op_lib",
     ],
     srcs_version = "PY2AND3",
+    visibility = [
+        "//learning/brain:__subpackages__",
+        "//tensorflow:__subpackages__",
+        "//video/youtube/personalization:__subpackages__",
+    ],
     deps = [
         ":sparse_feature_cross_op",
         "//tensorflow/contrib/framework:framework_py",
@@ -252,7 +258,7 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
@@ -276,7 +282,7 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index af8e673f5906ad972408d30f23f2e8ba7e031a00..32f3006b749e3b34572a8d642054c0ec4c4664b0 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -14,10 +14,6 @@
 # ==============================================================================
 """Ops for building neural network layers, regularizers, summaries, etc.
 
-See the
-[Contrib Layers](https://tensorflow.org/api_guides/python/contrib.layers)
-guide.
-
 @@avg_pool2d
 @@avg_pool3d
 @@batch_norm
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index 60e1d85ea9c08a51763fdaf08853f8d9b67347e5..429d696daf0baf85f6a60aa4d299b513d90c5925 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -124,7 +124,8 @@ def safe_embedding_lookup_sparse(embedding_weights,
                                            sparse_weights]) as scope:
     # Reshape higher-rank sparse ids and weights to linear segment ids.
     original_shape = sparse_ids.dense_shape
-    original_rank_dim = sparse_ids.dense_shape.get_shape()[0]
+    original_rank_dim = tensor_shape.Dimension(tensor_shape.dimension_value(
+        sparse_ids.dense_shape.get_shape()[0]))
     original_rank = (
         array_ops.size(original_shape)
         if original_rank_dim.value is None
@@ -349,7 +350,7 @@ def _sampled_scattered_embedding_lookup(
       shape = params[p].get_shape()
       shape.assert_has_rank(1)
       shape.assert_is_fully_defined()
-      partition_sizes.append(shape[0].value)
+      partition_sizes.append(tensor_shape.dimension_value(shape[0]))
     num_params = sum(partition_sizes)  # Total number of parameters.
 
     # Assert the size of each partition.
@@ -779,16 +780,16 @@ def _embedding_lookup_with_distributed_aggregation(params,
         # Compute num_total_ids as the sum of dim-0 of params, then assign to
         # partitions based on a constant number of ids per partition. Optimize
         # if we already know the full shape statically.
-        dim_0_size = params[0].get_shape()[0]
+        dim_0_size = params[0].get_shape().dims[0]
         for p in xrange(1, np):
-          dim_0_size += params[p].get_shape()[0]
+          dim_0_size += params[p].get_shape().dims[0]
         if dim_0_size.value:
-          num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
+          num_total_ids = constant_op.constant(dim_0_size, flat_ids.dtype)
         else:
           dim_0_sizes = []
           for p in xrange(np):
-            if params[p].get_shape()[0].value is not None:
-              dim_0_sizes.append(params[p].get_shape()[0].value)
+            if params[p].get_shape().dims[0].value is not None:
+              dim_0_sizes.append(params[p].get_shape().dims[0].value)
             else:
               with ops.colocate_with(params[p]):
                 dim_0_sizes.append(array_ops.shape(params[p])[0])
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
index 124515e5a6474f2cc1038830346e27277c6ceea7..295c721fceda6aaaf8672525ceed560308db6af7 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import itertools
 import math
+import sys
 
 import numpy as np
 
@@ -36,6 +37,7 @@ from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -48,11 +50,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
     assert num_shards > 0
     assert num_shards <= vocab_size
 
-    embedding_weights = partitioned_variables.create_partitioned_variables(
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32)
+    embedding_weights = list(variable_scope.get_variable(
+        "embedding_weights",
         shape=[vocab_size, embed_dim],
-        slicing=[num_shards, 1],
-        initializer=init_ops.truncated_normal_initializer(
-            mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32))
+        partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
+        initializer=initializer))
     for w in embedding_weights:
       w.initializer.run()
     embedding_weights = [w.eval() for w in embedding_weights]
@@ -256,6 +260,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
                         embedding_weights, sparse_ids, sparse_weights)
 
 
+# pylint: disable=invalid-name
+def local_variable_scope():
+  """Create a variable scope named like the caller function."""
+  return variable_scope.variable_scope(sys._getframe(1).f_code.co_name)
+# pylint: enable=invalid-name
+
+
 class ScatteredEmbeddingLookupTest(test.TestCase):
 
   def setUp(self):
@@ -266,17 +277,18 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
     assert num_shards > 0
     assert num_shards <= size
 
-    embedding_weights = partitioned_variables.create_partitioned_variables(
+    embedding_weights = list(variable_scope.get_variable(
+        "embedding_weights",
         shape=[size],
-        slicing=[num_shards],
+        partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
         initializer=init_ops.truncated_normal_initializer(
-            mean=0.0, stddev=1.0, dtype=dtypes.float32))
+            mean=0.0, stddev=1.0, dtype=dtypes.float32)))
     for w in embedding_weights:
       w.initializer.run()
     return embedding_weights
 
   def test_scattered_embedding_consistency(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights()
       values = constant_op.constant(["foo", "foo"])
 
@@ -288,7 +300,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
                           embedding_lookup_result[1])
 
   def test_scattered_embedding_multiple_partition(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights(num_shards=7)
       values = constant_op.constant([4, 4, 5])
 
@@ -304,7 +316,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
       self.assertGreater(embedding_diff, 0)
 
   def test_scattered_embedding_coverage(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       size = 8
       embedding_weights = self._random_weights(size=size, num_shards=3)
       values = constant_op.constant(["foo"])
@@ -316,7 +328,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
       self.assertEqual(len(np.unique(embedding_lookup_result[0])), size)
 
   def test_scattered_embedding_multi_dimension(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights()
       values = constant_op.constant([["foo", "bar", "bar"],
                                      ["bar", "bar", "foo"]])
@@ -329,7 +341,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
                           embedding_lookup_result[1][2])
 
   def test_scattered_embedding_lookup_sparse(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights(num_shards=3)
       sparse_tensor = sparse_tensor_lib.SparseTensor(
           values=["foo", "bar", "foo", "bar"],
@@ -358,7 +370,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
     embeds = np.random.randn(n_embed, d_embed)
     idx = np.random.randint(0, n_embed, idx_shape)
 
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedded_np = embeds[idx]
       embedded_tf = embedding_ops.embedding_lookup_unique(embeds, idx).eval()
 
@@ -370,7 +382,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
     idx = np.random.randint(0, 5, 10)
     idx2d = np.random.randint(0, 5, (10, 2))
 
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedded_np = embeds[idx]
       embedded_np2d = embeds[idx2d]
       embedded_tf = embedding_ops.embedding_lookup_unique(embeds, idx).eval()
@@ -398,17 +410,18 @@ class SampledScatteredEmbeddingLookupTest(test.TestCase):
     assert num_shards > 0
     assert num_shards <= size
 
-    embedding_weights = partitioned_variables.create_partitioned_variables(
+    embedding_weights = list(variable_scope.get_variable(
+        "embedding_weights",
         shape=[size],
-        slicing=[num_shards],
+        partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
         initializer=init_ops.truncated_normal_initializer(
-            mean=0.0, stddev=1.0, dtype=dtypes.float32))
+            mean=0.0, stddev=1.0, dtype=dtypes.float32)))
     for w in embedding_weights:
       w.initializer.run()
     return embedding_weights
 
   def test_hashed_embedding_consistency(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights()
       values = constant_op.constant(["foo", "foo"])
       # The first three sampled_candidates are equal, so the first three
@@ -429,7 +442,7 @@ class SampledScatteredEmbeddingLookupTest(test.TestCase):
                           embedding_lookup_result[1][3])
 
   def test_hashed_embedding_multi_dimension(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights()
       values = constant_op.constant([["foo", "bar", "bar"],
                                      ["bar", "bar", "foo"]])
@@ -691,7 +704,6 @@ class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase):
       index += num_val
     return grouped_vals
 
-  @test_util.enable_c_shapes
   def testEmbeddingLookupSparse(self):
     vocab_size = 13
     batch_size = 10
diff --git a/tensorflow/contrib/layers/python/layers/encoders.py b/tensorflow/contrib/layers/python/layers/encoders.py
index f42112206d0db9d2e42bd4cff19f6a6533951d46..3671633c8d795034b13cb55fd6db87c453e9fa12 100644
--- a/tensorflow/contrib/layers/python/layers/encoders.py
+++ b/tensorflow/contrib/layers/python/layers/encoders.py
@@ -84,8 +84,7 @@ def bow_encoder(ids,
       if isinstance(ids, sparse_tensor.SparseTensor):
         raise TypeError('ids are expected to be dense Tensor, got: %s', ids)
       return math_ops.reduce_mean(
-          embedding_ops.embedding_lookup(embeddings, ids),
-          reduction_indices=1)
+          embedding_ops.embedding_lookup(embeddings, ids), axis=1)
 
 
 def embed_sequence(ids,
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 53c8ae5d0893641c79a7f24851a10afc44a2144a..00d819ed0e9fe3a5644105a571beda100204631e 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -194,6 +194,7 @@ class _DeepEmbeddingLookupArguments(
   pass
 
 
+@six.add_metaclass(abc.ABCMeta)
 class _FeatureColumn(object):
   """Represents a feature column abstraction.
 
@@ -205,7 +206,6 @@ class _FeatureColumn(object):
   Following classes (_SparseColumn, _RealValuedColumn, ...) are concrete
   instances.
   """
-  __metaclass__ = abc.ABCMeta
 
   @abc.abstractproperty
   @deprecation.deprecated(
@@ -1015,8 +1015,7 @@ class _OneHotColumn(
         dense_id_tensor, depth=self.length, on_value=1.0, off_value=0.0)
 
     # Reduce to get a multi-hot per example.
-    return math_ops.reduce_sum(
-        one_hot_id_tensor, reduction_indices=[output_rank - 1])
+    return math_ops.reduce_sum(one_hot_id_tensor, axis=[output_rank - 1])
 
   @property
   def _variable_shape(self):
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 6fb4b9ff3534cab34c84de5d13fea7aff756556d..7e6eafaa0d6f60cfc28a4c422abac0b6d5a991fb 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -27,7 +27,7 @@ from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import feature_column_ops
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index d90d6ecf7f671a40a7ff2b066b6782c7421f9887..cab8da808b6413518ff4864cb0b03a42809260f1 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -27,7 +27,7 @@ import numpy as np
 
 from tensorflow.contrib.layers.python.layers import feature_column as fc
 from tensorflow.contrib.layers.python.layers import feature_column_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index a82d4c19510df2c6dcc3cdac2a808823795149a9..403b522ce45ac6ad98a321378626b87aaa7738aa 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base
 from tensorflow.python.layers import convolutional as convolutional_layers
 from tensorflow.python.layers import core as core_layers
@@ -274,7 +275,7 @@ def _fused_batch_norm(inputs,
                        ' Expected 2 or 4 but got %d' % (inputs.name,
                                                         original_rank))
     if original_rank == 2:
-      channels = inputs.get_shape()[-1].value
+      channels = inputs.get_shape().dims[-1].value
       if channels is None:
         raise ValueError('`C` dimension must be known but is None')
       new_shape = [-1, 1, 1, channels]
@@ -692,7 +693,7 @@ def batch_norm(inputs,
       # explicitly reshape the params to params_shape_broadcast when computing
       # the moments and the batch normalization.
       params_shape_broadcast = list(
-          [1, inputs_shape[1].value] + [1 for _ in range(2, inputs_rank)])
+          [1, inputs_shape.dims[1].value] + [1 for _ in range(2, inputs_rank)])
     else:
       moments_axes = list(range(inputs_rank - 1))
       params_shape = inputs_shape[-1:]
@@ -890,7 +891,7 @@ def bias_add(inputs,
     elif inputs_rank != 4 and data_format == DATA_FORMAT_NCHW:
       raise ValueError('Data format NCHW only supports 4D Tensor')
     axis = 1 if data_format == DATA_FORMAT_NCHW else -1
-    num_features = inputs_shape[axis].value
+    num_features = inputs_shape.dims[axis].value
     if num_features is None:
       raise ValueError('`C` dimension must be known but is None')
     biases_collections = utils.get_variable_collections(variables_collections,
@@ -1823,8 +1824,8 @@ def fully_connected(inputs,
     ValueError: If x has rank less than 2 or if its last dimension is not set.
   """
   if not isinstance(num_outputs, six.integer_types):
-    raise ValueError('num_outputs should be int or long, got %s.' %
-                     (num_outputs,))
+    raise ValueError('num_outputs type should be one of %s, got %s.' % (
+        list(six.integer_types), type(num_outputs)))
 
   layer_variable_getter = _build_variable_getter({
       'bias': 'biases',
@@ -1958,7 +1959,7 @@ class GDN(base.Layer):
     self._reparam_offset = reparam_offset
     self.data_format = data_format
     self._channel_axis()  # trigger ValueError early
-    self.input_spec = base.InputSpec(min_ndim=3, max_ndim=5)
+    self.input_spec = input_spec.InputSpec(min_ndim=3, max_ndim=5)
 
   def _channel_axis(self):
     try:
@@ -2010,12 +2011,12 @@ class GDN(base.Layer):
   def build(self, input_shape):
     channel_axis = self._channel_axis()
     input_shape = tensor_shape.TensorShape(input_shape)
-    num_channels = input_shape[channel_axis].value
+    num_channels = input_shape.dims[channel_axis].value
     if num_channels is None:
       raise ValueError('The channel dimension of the inputs to `GDN` '
                        'must be defined.')
     self._input_rank = input_shape.ndims
-    self.input_spec = base.InputSpec(
+    self.input_spec = input_spec.InputSpec(
         ndim=input_shape.ndims, axes={
             channel_axis: num_channels
         })
@@ -2100,7 +2101,7 @@ class GDN(base.Layer):
     input_shape = tensor_shape.TensorShape(input_shape)
     if not 3 <= input_shape.ndim <= 5:
       raise ValueError('`input_shape` must be of rank 3 to 5, inclusive.')
-    if input_shape[channel_axis].value is None:
+    if input_shape.dims[channel_axis].value is None:
       raise ValueError(
           'The channel dimension of `input_shape` must be defined.')
     return input_shape
@@ -2951,7 +2952,7 @@ def spatial_softmax(features,
       num_channels, height, width = static_shape[1], shape[2], shape[3]
     else:
       raise ValueError('data_format has to be either NCHW or NHWC.')
-    if num_channels.value is None:
+    if tensor_shape.dimension_value(num_channels) is None:
       raise ValueError('The num_channels dimension of the inputs to '
                        '`spatial_softmax` should be defined. Found `None`.')
 
@@ -2994,9 +2995,11 @@ def spatial_softmax(features,
       expected_y = math_ops.reduce_sum(
           pos_y * softmax_attention, [1], keepdims=True)
       expected_xy = array_ops.concat([expected_x, expected_y], 1)
-      feature_keypoints = array_ops.reshape(expected_xy,
-                                            [-1, num_channels.value * 2])
-      feature_keypoints.set_shape([None, num_channels.value * 2])
+      feature_keypoints = array_ops.reshape(
+          expected_xy,
+          [-1, tensor_shape.dimension_value(num_channels) * 2])
+      feature_keypoints.set_shape(
+          [None, tensor_shape.dimension_value(num_channels) * 2])
   return feature_keypoints
 
 
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 3b7ae72e9c460ee7a38f72b03e1c1ad48e335f57..d791418c9d0f887058ceb535092fa8122da1aa75 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -630,7 +630,7 @@ class ConvolutionTest(test.TestCase):
       expected_size = [None, num_filters, None, None]
       expected_size_dynamic = [5, num_filters, 7, 9]
 
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         images = array_ops.placeholder(np.float32,
                                        [None, input_size[1], None, None])
         output = layers_lib.convolution2d(
@@ -721,7 +721,7 @@ class Convolution2dTransposeTests(test.TestCase):
   def testOutputSizeWithStrideOneSamePaddingNCHW(self):
     # `NCHW` data format is only supported for `GPU` device.
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 32
         input_size = [5, 3, 10, 12]
         expected_size = [5, num_filters, 10, 12]
@@ -740,7 +740,7 @@ class Convolution2dTransposeTests(test.TestCase):
 
   def testOutputSizeWithStrideOneValidPaddingNCHW(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 32
         input_size = [5, 3, 10, 12]
         expected_size = [5, num_filters, 12, 14]
@@ -759,7 +759,7 @@ class Convolution2dTransposeTests(test.TestCase):
 
   def testOutputSizeWithStrideTwoValidPaddingNCHW(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 32
         input_size = [5, 3, 9, 11]
         expected_size = [5, num_filters, 19, 23]
@@ -779,7 +779,7 @@ class Convolution2dTransposeTests(test.TestCase):
 
   def testOutputSizeWith1x1StrideTwoSamePaddingNCHW(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 1
         input_size = [1, 1, 1, 1]
         expected_size = [1, num_filters, 2, 2]
@@ -799,7 +799,7 @@ class Convolution2dTransposeTests(test.TestCase):
 
   def testOutputSizeWith1x1StrideTwoValidPaddingNCHW(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 1
         input_size = [1, 1, 1, 1]
         expected_size = [1, num_filters, 2, 2]
@@ -817,7 +817,7 @@ class Convolution2dTransposeTests(test.TestCase):
 
   def testOutputSizeWith2x2StrideTwoSamePaddingNCHW(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 1
         input_size = [1, 1, 2, 2]
         expected_size = [1, num_filters, 4, 4]
@@ -835,7 +835,7 @@ class Convolution2dTransposeTests(test.TestCase):
 
   def testOutputSizeWith2x2StrideTwoValidPaddingNCHW(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 1
         input_size = [1, 1, 2, 2]
         expected_size = [1, num_filters, 4, 4]
@@ -853,7 +853,7 @@ class Convolution2dTransposeTests(test.TestCase):
 
   def testOutputSizeWithStride2x1NCHW(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 1
         input_size = [1, 1, 3, 2]
         expected_size = [1, num_filters, 6, 5]
@@ -871,7 +871,7 @@ class Convolution2dTransposeTests(test.TestCase):
 
   def testOutputSizeWithStride2x4NCHW(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 1
         input_size = [1, 1, 3, 2]
         expected_size = [1, num_filters, 6, 8]
@@ -889,7 +889,7 @@ class Convolution2dTransposeTests(test.TestCase):
 
   def testOutputSizeWithStride2x5NCHW(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         num_filters = 1
         input_size = [1, 1, 3, 2]
         expected_size = [1, num_filters, 6, 10]
@@ -1459,13 +1459,6 @@ class DropoutTest(test.TestCase):
 
 class FlattenTest(test.TestCase):
 
-  def testInvalidRank(self):
-    with ops.Graph().as_default() as g, self.session(g):
-      inputs = array_ops.placeholder(dtype=dtypes.float32)
-      inputs.set_shape(tensor_shape.TensorShape((5,)))
-      with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'):
-        _layers.flatten(inputs)
-
   def testUnknownLastDim(self):
     with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
@@ -1502,6 +1495,12 @@ class FlattenTest(test.TestCase):
                        images.get_shape().num_elements())
       self.assertEqual(output.get_shape()[0], images.get_shape()[0])
 
+  def testFlatten0D(self):
+    with self.cached_session():
+      scalars = random_ops.random_uniform((5,), seed=1, name='scalars')
+      output = _layers.flatten(scalars)
+      self.assertEqual(output.shape, (5, 1))
+
   def testFlattenBatchSize(self):
     height, width = 3, 3
     with self.cached_session() as sess:
@@ -2056,7 +2055,7 @@ class BatchNormTest(test.TestCase):
     channels = 3
     np.random.seed(1)
     use_gpu = fused
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.session(use_gpu=use_gpu) as sess:
       if data_format == 'NHWC':
         image_shape = (batch_size, height, width, channels)
         axis = (0, 1, 2)
@@ -2140,7 +2139,7 @@ class BatchNormTest(test.TestCase):
     channels = 3
     np.random.seed(1)
     use_gpu = fused
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.session(use_gpu=use_gpu) as sess:
       if data_format == 'NHWC':
         image_shape = (batch_size, height, width, channels)
         axis = (0, 1, 2)
@@ -2344,7 +2343,7 @@ class BatchNormTest(test.TestCase):
     np.random.seed(1)
     use_gpu = fused
     np.random.seed(1)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.session(use_gpu=use_gpu) as sess:
       if data_format == 'NHWC':
         image_shape = (batch_size, height, width, channels)
         axis = (0, 1, 2)
@@ -2491,7 +2490,7 @@ class BatchNormTest(test.TestCase):
     channels = 3
     np.random.seed(1)
     use_gpu = fused
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.session(use_gpu=use_gpu) as sess:
       if data_format == 'NHWC':
         image_shape = (batch_size, height, width, channels)
         axis = (0, 1, 2)
@@ -2576,7 +2575,7 @@ class BatchNormTest(test.TestCase):
     channels = 32
     np.random.seed(1)
     use_gpu = fused
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.session(use_gpu=use_gpu) as sess:
       if data_format == 'NHWC':
         image_shape = (batch_size, height, width, channels)
         axis = (0, 1, 2)
@@ -2674,7 +2673,7 @@ class BatchNormTest(test.TestCase):
 
   def _runBatchNormalizationWithFormat(self, shape, data_format, is_training):
     channels = shape[-1]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       images = np.arange(np.product(shape), dtype=np.float32).reshape(shape)
       beta = init_ops.constant_initializer(
           np.arange(2, channels + 2, dtype=np.float32))
@@ -2776,7 +2775,7 @@ class BatchNormTest(test.TestCase):
             'moving_variance': variance,
         },
         data_format='NCHW')
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       sess.run(variables_lib.global_variables_initializer())
       return sess.run(output)
 
@@ -3811,7 +3810,7 @@ class UnitNormTests(test.TestCase):
       image = random_ops.random_uniform((height, width, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), axis=dim))
 
       shape = [height, width, 3]
       del shape[dim]
@@ -3847,7 +3846,7 @@ class UnitNormTests(test.TestCase):
       image = array_ops.placeholder(dtypes.float32, (None, None, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), axis=dim))
 
       with self.cached_session():
         actual = norms.eval({image: placeholder_value})
diff --git a/tensorflow/contrib/layers/python/layers/regularizers_test.py b/tensorflow/contrib/layers/python/layers/regularizers_test.py
index 51faba30c74d64c54d3d2b11d2a11195cca6b759..5cb00b76847430be8ade9f4e4fc8f7372035485a 100644
--- a/tensorflow/contrib/layers/python/layers/regularizers_test.py
+++ b/tensorflow/contrib/layers/python/layers/regularizers_test.py
@@ -141,7 +141,7 @@ class RegularizerTest(test.TestCase):
     dummy_regularizer = lambda x: math_ops.reduce_sum(2 * x)
     array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
     tensor_weights_list = [constant_op.constant(x) for x in array_weights_list]
-    expected = sum([2 * x for l in array_weights_list for x in l])
+    expected = sum(2 * x for l in array_weights_list for x in l)
     with self.cached_session():
       result = regularizers.apply_regularization(dummy_regularizer,
                                                  tensor_weights_list)
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 61185f65a9bd294003515456f891de0a68661a82..14065fcee51c014a1af227504eaaca1fa39941e1 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -24,6 +24,11 @@ py_library(
         exclude = ["python/learn/**/*_test.py"],
     ),
     srcs_version = "PY2AND3",
+    visibility = [
+        "//learning/brain:__subpackages__",
+        "//tensorflow:__subpackages__",
+        "//video/youtube/personalization:__subpackages__",
+    ],
     # This library should not depend on sklearn, even though some of the code
     # refers to it. (The code handles the presence of sklearn conditionally.)
     deps = [
@@ -269,6 +274,7 @@ py_test(
     name = "estimator_test",
     size = "medium",
     srcs = ["python/learn/estimators/estimator_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = [
         "manual",
diff --git a/tensorflow/contrib/learn/README.md b/tensorflow/contrib/learn/README.md
index d516bffc5e0327a3400068b35de5503e5a925a54..b0bff915a993c9a01e2e6d9ef9f71c14d2f29a73 100644
--- a/tensorflow/contrib/learn/README.md
+++ b/tensorflow/contrib/learn/README.md
@@ -7,12 +7,11 @@ warnings. A high-level overview is below.
 
 ## Canned Estimators
 
-Many canned estimators (subclasses of `Estimator`) have equivalents in core:
+Many canned estimators (subclasses of `Estimator`) have equivalents in core
+exposed under `tf.estimator`:
 `DNNClassifier`, `DNNRegressor`, `DNNEstimator`, `LinearClassifier`,
-`LinearRegressor`, `DNNLinearCombinedClassifier` and
-`DNNLinearCombinedRegressor`. They are exposed under `tf.estimator`.
-`DNNEstimator`, `LinearEstimator` and `DNNLinearCombinedEstimator`
-are exposed under `tf.contrib.estimator`.
+`LinearRegressor`, `LinearEstimator`, `DNNLinearCombinedClassifier`,
+`DNNLinearCombinedRegressor` and `DNNLinearCombinedEstimator`.
 
 To migrate to the new api, users need to take the following steps:
 
@@ -45,7 +44,7 @@ To migrate to the new api, users need to take the following steps:
   `tf.contrib.learn` classifiers and regressors supported labels with shape
   `[batch_size]`.
 * If you pass custom metrics from the `evaluate()` method call, use
-  `tf.contrib.estimator.add_metrics`.
+  `tf.estimator.add_metrics`.
 * Replace your `serving_input_fn` with a `serving_input_receiver_fn`.
   Note this should be entirely distinct from your training `input_fn`, so if you
   previously had one `input_fn` with different "modes", you should now factor
@@ -63,10 +62,10 @@ Some remaining estimators/classes:
   with a custom `model_fn`, or with `DNNEstimator`.
 * `StateSavingRnnEstimator`: Consider a custom `model_fn`.
 * SVM: Consider a custom `model_fn`.
-* `LinearComposableModel` and `DNNComposableModel`: Not supported. 
+* `LinearComposableModel` and `DNNComposableModel`: Not supported.
   Consider `tf.contrib.estimator.DNNEstimator`, or write a custom model_fn.
 * `MetricSpec`: Deprecated. For adding custom metrics to canned Estimators, use
-  `tf.contrib.estimator.add_metrics`.
+  `tf.estimator.add_metrics`.
 
 ## Estimator
 `tf.contrib.learn.Estimator` is migrated to `tf.estimator.Estimator`.
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py
index 28a6f5aed99b1443ebcc9c391ec332e0febbb04b..7bf2ac62d76d67f0eb131f8f57c5c063955424fa 100644
--- a/tensorflow/contrib/learn/__init__.py
+++ b/tensorflow/contrib/learn/__init__.py
@@ -19,9 +19,6 @@ This module and all its submodules are deprecated. See
 [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
 for migration instructions.
 
-See the [Contrib Learn](https://tensorflow.org/api_guides/python/contrib.learn)
-guide.
-
 @@BaseEstimator
 @@Estimator
 @@Trainable
diff --git a/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py b/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py
index 1f0e4663d060a3850e2002b27f809fde1db47e48..4c206839300b1c6b14b324b3d1ec2d70f7eca903 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/_sklearn.py
@@ -194,7 +194,7 @@ if TRY_IMPORT_SKLEARN:
   # pylint: disable=g-import-not-at-top,g-multiple-import,unused-import
   from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin
   from sklearn.metrics import accuracy_score, log_loss, mean_squared_error
-  from sklearn.cross_validation import train_test_split
+  from sklearn.model_selection import train_test_split
   try:
     from sklearn.exceptions import NotFittedError
   except ImportError:
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index eabebb7e881558471c343c0573cc9a8f4a425312..10fbd60ba2df4c3f84169bf04f249d67dc14573f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -28,7 +28,6 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -38,11 +37,12 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
 
 # The default learning rate of 0.05 is a historical artifact of the initial
 # implementation, but seems a reasonable choice.
@@ -150,10 +150,10 @@ def _dnn_model_fn(features, labels, mode, params, config=None):
         "input_from_feature_columns",
         values=tuple(six.itervalues(features)),
         partitioner=input_layer_partitioner) as input_layer_scope:
-      if all([
+      if all(
           isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
           for fc in feature_columns
-      ]):
+      ):
         net = layers.input_from_feature_columns(
             columns_to_tensors=features,
             feature_columns=feature_columns,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
index 3d85533d92d17095bae9a69f229171e1bf61ba10..2ade6b7b6ce2678ec8df7c98ffaa5636ae9d4b1d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@@ -38,7 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
@@ -236,10 +236,10 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
           "input_from_feature_columns",
           values=tuple(six.itervalues(features)),
           partitioner=input_layer_partitioner) as dnn_input_scope:
-        if all([
+        if all(
             isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
             for fc in dnn_feature_columns
-        ]):
+        ):
           net = layers.input_from_feature_columns(
               columns_to_tensors=features,
               feature_columns=dnn_feature_columns,
@@ -292,8 +292,8 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
         linear_parent_scope,
         values=tuple(six.itervalues(features)),
         partitioner=linear_partitioner) as scope:
-      if all([isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
-              for fc in linear_feature_columns]):
+      if all(isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
+             for fc in linear_feature_columns):
         if joint_linear_weights:
           linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns(
               columns_to_tensors=features,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
index 4e65c180d8bee9ab8fe9b1fbf32edc229c31af09..d46a873bfaa297e7f6242aa56e9d0bf0eb551867 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
@@ -36,7 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 2bd57597c2e9444b51b1dacfbe4180b443c95a3d..ee25cebd484f1e831fe8b6d3aa7290da7558adee 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -38,7 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
index 1d8a59281a4934ad063362cba064e6cb3abff5a2..28c4964527bb034c8c6b1642366c6c82c1a72201 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
@@ -668,7 +668,7 @@ class DynamicRNNEstimatorLearningTest(test.TestCase):
         sequences = centers + noise
 
         inputs = array_ops.expand_dims(sequences, 2)
-        labels = math_ops.reduce_mean(sequences, reduction_indices=[1])
+        labels = math_ops.reduce_mean(sequences, axis=[1])
         return {'inputs': inputs}, labels
 
       return input_fn
@@ -722,8 +722,8 @@ class DynamicRNNEstimatorLearningTest(test.TestCase):
         inputs = array_ops.expand_dims(math_ops.to_float(random_sequence), 2)
         labels = math_ops.to_int32(
             array_ops.squeeze(
-                math_ops.reduce_sum(
-                    inputs, reduction_indices=[1]) > (sequence_length / 2.0)))
+                math_ops.reduce_sum(inputs, axis=[1]) > (
+                    sequence_length / 2.0)))
         return {'inputs': inputs}, labels
 
       return input_fn
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index c1de42782efb3497660affb3ef7162457977c150..9132b2209bce8005b323d058d6d176784a84b2d1 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -404,7 +404,6 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
   Users should not instantiate or subclass this class. Instead, use an
   `Estimator`.
   """
-  __metaclass__ = abc.ABCMeta
 
   # Note that for Google users, this is overridden with
   # learn_runner.EstimatorConfig.
@@ -1067,11 +1066,11 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
       chief_hooks = []
       if (self._config.save_checkpoints_secs or
           self._config.save_checkpoints_steps):
-        saver_hook_exists = any([
+        saver_hook_exists = any(
             isinstance(h, basic_session_run_hooks.CheckpointSaverHook)
             for h in (all_hooks + model_fn_ops.training_hooks + chief_hooks +
                       model_fn_ops.training_chief_hooks)
-        ])
+        )
         if not saver_hook_exists:
           chief_hooks = [
               basic_session_run_hooks.CheckpointSaverHook(
@@ -1433,13 +1432,12 @@ class Estimator(BaseEstimator):
                            'must specify no transforms.')
         untransformed_tags = graph_rewrite_specs[0].tags
 
-        # TODO(soergel): switch to main_op or otherwise update when dust settles
         builder.add_meta_graph_and_variables(
             session,
             untransformed_tags,
             signature_def_map=signature_def_map,
             assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
-            legacy_init_op=init_op,
+            main_op=init_op,
             strip_default_attrs=strip_default_attrs)
 
     # pylint: disable=protected-access
@@ -1495,7 +1493,7 @@ class Estimator(BaseEstimator):
 # pylint: disable=protected-access
 class SKCompat(sklearn.BaseEstimator):
   """Scikit learn wrapper for TensorFlow Learn Estimator.
-  
+
   THIS CLASS IS DEPRECATED. See
   [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
   for general migration instructions.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 9e5aaf3118dfed4ce64dd244a915860b5a2eef44..8a461a0bd7ba457fcf830769f23c6ca2860a2732 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -1368,7 +1368,7 @@ class ReplicaDeviceSetterTest(test.TestCase):
       table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
       input_string = constant_op.constant(['brain', 'salad', 'tank'])
       output = table.lookup(input_string)
-    self.assertDeviceEqual('/job:ps/task:0', table._table_ref.device)
+    self.assertDeviceEqual('/job:ps/task:0', table.resource_handle.device)
     self.assertDeviceEqual('/job:ps/task:0', output.device)
 
   def testMutableHashTableIsLocal(self):
@@ -1378,7 +1378,7 @@ class ReplicaDeviceSetterTest(test.TestCase):
       table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
       input_string = constant_op.constant(['brain', 'salad', 'tank'])
       output = table.lookup(input_string)
-    self.assertDeviceEqual('', table._table_ref.device)
+    self.assertDeviceEqual('', table.resource_handle.device)
     self.assertDeviceEqual('', output.device)
 
   def testTaskIsSetOnWorkerWhenJobNameIsSet(self):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index c6f79e00d5a5a584b0c5f8201a2576f02106a5b4..c1b97d8b49613ea49d9813954da3b7a63d3ba04c 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -55,6 +55,7 @@ from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.deprecation import deprecated
 
 
+@six.add_metaclass(abc.ABCMeta)
 class Head(object):
   """Interface for the head/top of a model.
 
@@ -132,7 +133,6 @@ class Head(object):
       ... update train_op and hooks in ModelFnOps and return
     ```
   """
-  __metaclass__ = abc.ABCMeta
 
   @abc.abstractproperty
   def logits_dimension(self):
@@ -504,7 +504,6 @@ def no_op_train_fn(loss):
 
 class _SingleHead(Head):
   """Interface for a single head/top of a model."""
-  __metaclass__ = abc.ABCMeta
 
   def __init__(
       self, problem_type, logits_dimension, label_name=None,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index e100bc7a1e7be4896e9ab1c965775b5185b38897..9ee8d8004bf26224dd96a98bad109720c44d04f7 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -37,7 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -155,8 +155,8 @@ def _linear_model_fn(features, labels, mode, params, config=None):
       parent_scope,
       values=tuple(six.itervalues(features)),
       partitioner=partitioner) as scope:
-    if all([isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
-            for fc in feature_columns]):
+    if all(isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
+           for fc in feature_columns):
       if joint_weights:
         layer_fn = layers.joint_weighted_sum_from_feature_columns
       else:
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
index 597ca4e86dbf66c86182f14a2a364b662d52fb0a..dfc76bfde6c0109f98093232b6f223d6938007f9 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
@@ -37,7 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer as sdca_optimizer_lib
 from tensorflow.contrib.metrics.python.ops import metric_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
@@ -1745,7 +1745,7 @@ class LinearRegressorTest(test.TestCase):
           'place_holder':
               constant_op.constant([[0.0]] * num_examples),
       }, constant_op.constant(
-          [[1 if i % 4 is 0 else 0] for i in range(num_examples)])
+          [[1 if i % 4 == 0 else 0] for i in range(num_examples)])
 
     place_holder = feature_column_lib.real_valued_column('place_holder')
     sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
diff --git a/tensorflow/contrib/learn/python/learn/evaluable.py b/tensorflow/contrib/learn/python/learn/evaluable.py
index 10881ca885599bc81386e15f814a2687d907f63b..5dedf548f73d27bf543dcfd9885490b8b7c9ac96 100644
--- a/tensorflow/contrib/learn/python/learn/evaluable.py
+++ b/tensorflow/contrib/learn/python/learn/evaluable.py
@@ -25,7 +25,10 @@ from __future__ import print_function
 
 import abc
 
+import six
 
+
+@six.add_metaclass(abc.ABCMeta)
 class Evaluable(object):
   """Interface for objects that are evaluatable by, e.g., `Experiment`.
 
@@ -33,7 +36,6 @@ class Evaluable(object):
   [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
   for general migration instructions.
   """
-  __metaclass__ = abc.ABCMeta
 
   @abc.abstractproperty
   def model_dir(self):
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py
index 29552d24f1eaa0d85a99c8b09f69d007e7e4fe9f..59a67636ae275c5ca1df21685770baa7a960d667 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py
@@ -27,7 +27,7 @@ from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn as core_n
 from tensorflow.python.util.deprecation import deprecated
 
 
-@deprecated(None, 'Use tf.estimator.inputs.numpy_input_fn.')
+@deprecated(None, 'Use tf.compat.v1.estimator.inputs.numpy_input_fn.')
 def numpy_input_fn(x,
                    y=None,
                    batch_size=128,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py
index b4ef055f5ae484ec704ad42efcf2c00c4a7a4f56..e9df7258a358d9543f2bb386518d900bd6ddef74 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py
@@ -53,7 +53,7 @@ PANDAS_DTYPES = {
 }
 
 
-@deprecated(None, 'Please use tf.estimator.inputs.pandas_input_fn')
+@deprecated(None, 'Please use tf.compat.v1.estimator.inputs.pandas_input_fn')
 def pandas_input_fn(x,
                     y=None,
                     batch_size=128,
diff --git a/tensorflow/contrib/learn/python/learn/trainable.py b/tensorflow/contrib/learn/python/learn/trainable.py
index a1a3f20dcd8cb5ff7baa559ac41d5e5c40780511..1ea9e5d67a95dfc3ba57151085051ec7aea14226 100644
--- a/tensorflow/contrib/learn/python/learn/trainable.py
+++ b/tensorflow/contrib/learn/python/learn/trainable.py
@@ -25,13 +25,15 @@ from __future__ import print_function
 
 import abc
 
+import six
 
+
+@six.add_metaclass(abc.ABCMeta)
 class Trainable(object):
   """Interface for objects that are trainable by, e.g., `Experiment`.
 
   THIS CLASS IS DEPRECATED.
   """
-  __metaclass__ = abc.ABCMeta
 
   @abc.abstractmethod
   def fit(self,
diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index 8466dc36d13e223aed4f1dfe8e39a6f91c99fa55..d49834dc860a8b4341ddd3720fde52281f7474f7 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for SdcaModel."""
+"""Tests for SdcaModel (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index b98adf862bf1514b43d237196cb2de531a909479..c056a12fa5307a7e9ac4cf30e1386ddfd5cd7d75 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -12,7 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Proximal stochastic dual coordinate ascent optimizer for linear models."""
+# pylint: disable=line-too-long
+"""Proximal stochastic dual coordinate ascent optimizer for linear models (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
+# pylint: enable=line-too-long
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -22,9 +29,11 @@ import collections
 from six.moves import range
 
 from tensorflow.contrib.linear_optimizer.python.ops.sharded_mutable_dense_hashtable import ShardedMutableDenseHashTable
+from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework.ops import internal_convert_to_tensor
 from tensorflow.python.framework.ops import name_scope
 from tensorflow.python.ops import array_ops
@@ -38,6 +47,7 @@ from tensorflow.python.ops import variables as var_ops
 from tensorflow.python.ops.nn import log_poisson_loss
 from tensorflow.python.ops.nn import sigmoid_cross_entropy_with_logits
 from tensorflow.python.summary import summary
+from tensorflow.python.util import deprecation
 
 __all__ = ['SdcaModel']
 
@@ -46,7 +56,7 @@ __all__ = ['SdcaModel']
 class SdcaModel(object):
   """Stochastic dual coordinate ascent solver for linear models.
 
-    Loss functions supported:
+  Loss functions supported:
 
      * Binary logistic loss
      * Squared loss
@@ -107,6 +117,10 @@ class SdcaModel(object):
     ```
   """
 
+  @deprecation.deprecated(
+      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
+      'please check its latest version in core: '
+      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
   def __init__(self, examples, variables, options):
     """Create a new sdca optimizer."""
 
@@ -151,7 +165,8 @@ class SdcaModel(object):
         default_value=[0.0, 0.0, 0.0, 0.0],
         # SdcaFprint never returns 0 or 1 for the low64 bits, so this a safe
         # empty_key (that will never collide with actual payloads).
-        empty_key=[0, 0])
+        empty_key=[0, 0],
+        deleted_key=[1, 1])
 
     summary.scalar('approximate_duality_gap', self.approximate_duality_gap())
     summary.scalar('examples_seen', self._hashtable.size())
@@ -202,7 +217,7 @@ class SdcaModel(object):
             with ops.colocate_with(v):
               # TODO(andreasst): remove SDCAOptimizer suffix once bug 30843109
               # is fixed.
-              slot_var = var_ops.Variable(
+              slot_var = var_ops.VariableV1(
                   initial_value=array_ops.zeros_like(v.initialized_value(),
                                                      dtypes.float32),
                   name=v.op.name + '_unshrinked/SDCAOptimizer')
@@ -214,7 +229,7 @@ class SdcaModel(object):
             # TODO(andreasst): remove SDCAOptimizer suffix once bug 30843109 is
             # fixed.
             self._slots['unshrinked_' + name].append(
-                var_ops.Variable(
+                var_ops.VariableV1(
                     array_ops.zeros_like(var.initialized_value(),
                                          dtypes.float32),
                     name=var.op.name + '_unshrinked/SDCAOptimizer'))
@@ -425,14 +440,15 @@ class SdcaModel(object):
           dim_0_size = self._get_first_dimension_size_statically(
               w, num_partitions)
 
-          if dim_0_size.value:
-            num_total_ids = constant_op.constant(dim_0_size.value,
-                                                 flat_ids.dtype)
+          if tensor_shape.dimension_value(dim_0_size):
+            num_total_ids = constant_op.constant(
+                tensor_shape.dimension_value(dim_0_size),
+                flat_ids.dtype)
           else:
             dim_0_sizes = []
             for p in range(num_partitions):
-              if w[p].get_shape()[0].value is not None:
-                dim_0_sizes.append(w[p].get_shape()[0].value)
+              if tensor_shape.dimension_value(w[p].shape[0]) is not None:
+                dim_0_sizes.append(tensor_shape.dimension_value(w[p].shape[0]))
               else:
                 with ops.colocate_with(w[p]):
                   dim_0_sizes.append(array_ops.shape(w[p])[0])
@@ -485,24 +501,44 @@ class SdcaModel(object):
         sparse_weights.append(batch_gathered_weights)
 
       # pylint: disable=protected-access
-      esu, sfw, dfw = gen_sdca_ops.sdca_optimizer(
-          sparse_example_indices,
-          sparse_feature_indices,
-          sparse_features_values,
-          self._convert_n_to_tensor(self._examples['dense_features']),
-          internal_convert_to_tensor(self._examples['example_weights']),
-          internal_convert_to_tensor(self._examples['example_labels']),
-          sparse_indices,
-          sparse_weights,
-          self._convert_n_to_tensor(self._slots[
-              'unshrinked_dense_features_weights']),
-          example_state_data,
-          loss_type=self._options['loss_type'],
-          l1=self._options['symmetric_l1_regularization'],
-          l2=self._symmetric_l2_regularization(),
-          num_loss_partitions=self._num_loss_partitions(),
-          num_inner_iterations=1,
-          adaptative=self._adaptive())
+      if compat.forward_compatible(year=2018, month=10, day=30):
+        esu, sfw, dfw = gen_sdca_ops.sdca_optimizer_v2(
+            sparse_example_indices,
+            sparse_feature_indices,
+            sparse_features_values,
+            self._convert_n_to_tensor(self._examples['dense_features']),
+            internal_convert_to_tensor(self._examples['example_weights']),
+            internal_convert_to_tensor(self._examples['example_labels']),
+            sparse_indices,
+            sparse_weights,
+            self._convert_n_to_tensor(self._slots[
+                'unshrinked_dense_features_weights']),
+            example_state_data,
+            loss_type=self._options['loss_type'],
+            l1=self._options['symmetric_l1_regularization'],
+            l2=self._symmetric_l2_regularization(),
+            num_loss_partitions=self._num_loss_partitions(),
+            num_inner_iterations=1,
+            adaptive=self._adaptive())
+      else:
+        esu, sfw, dfw = gen_sdca_ops.sdca_optimizer(
+            sparse_example_indices,
+            sparse_feature_indices,
+            sparse_features_values,
+            self._convert_n_to_tensor(self._examples['dense_features']),
+            internal_convert_to_tensor(self._examples['example_weights']),
+            internal_convert_to_tensor(self._examples['example_labels']),
+            sparse_indices,
+            sparse_weights,
+            self._convert_n_to_tensor(self._slots[
+                'unshrinked_dense_features_weights']),
+            example_state_data,
+            loss_type=self._options['loss_type'],
+            l1=self._options['symmetric_l1_regularization'],
+            l2=self._symmetric_l2_regularization(),
+            num_loss_partitions=self._num_loss_partitions(),
+            num_inner_iterations=1,
+            adaptative=self._adaptive())
       # pylint: enable=protected-access
 
       with ops.control_dependencies([esu]):
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index 5015fb0848107950dd27eb81431dd308f22858bc..a28394964a12013c43d85701b5a0ab5c559afd62 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Sharded mutable dense hash table."""
+"""Sharded mutable dense hash table (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,9 +33,12 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
 
 
-class ShardedMutableDenseHashTable(lookup.LookupInterface):
+# TODO(rohanj): This should subclass Checkpointable and implement
+# _gather_saveables_for_checkpoint.
+class ShardedMutableDenseHashTable(object):
   """A sharded version of MutableDenseHashTable.
 
   It is designed to be interface compatible with LookupInterface and
@@ -43,17 +51,23 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
 
   # TODO(andreasst): consider moving this to lookup module
 
+  @deprecation.deprecated(
+      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
+      'please check its latest version in core: '
+      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
   def __init__(self,
                key_dtype,
                value_dtype,
                default_value,
                empty_key,
+               deleted_key,
                num_shards=1,
                checkpoint=True,
                name='ShardedMutableHashTable'):
+    self._key_dtype = key_dtype
+    self._value_dtype = value_dtype
     with ops.name_scope(name, 'sharded_mutable_hash_table') as scope:
-      super(ShardedMutableDenseHashTable, self).__init__(key_dtype,
-                                                         value_dtype, scope)
+      self._table_name = scope
       table_shards = []
       for i in range(num_shards):
         table_shards.append(
@@ -62,6 +76,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
                 value_dtype=value_dtype,
                 default_value=default_value,
                 empty_key=empty_key,
+                deleted_key=deleted_key,
                 checkpoint=checkpoint,
                 name='%s-%d-of-%d' % (name, i + 1, num_shards)))
       self._table_shards = table_shards
@@ -70,6 +85,10 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
       self._value_shape = self._table_shards[0]._value_shape
       # pylint: enable=protected-access
 
+  @property
+  def name(self):
+    return self._table_name
+
   @property
   def _num_shards(self):
     return len(self._table_shards)
@@ -90,7 +109,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
     if key_shape.ndims > 1:
       # If keys are a matrix (i.e. a single key is a vector), we use the first
       # element of each key vector to determine the shard.
-      keys = array_ops.slice(keys, [0, 0], [key_shape[0].value, 1])
+      keys = array_ops.slice(keys, [0, 0], [key_shape.dims[0].value, 1])
       keys = array_ops.reshape(keys, [-1])
     indices = math_ops.mod(math_ops.abs(keys), self._num_shards)
     return math_ops.cast(indices, dtypes.int32)
@@ -104,6 +123,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
                        keys.get_shape())
 
   def lookup(self, keys, name=None):
+    """Looks up `keys` in a table, outputs the corresponding values."""
     if keys.dtype.base_dtype != self._key_dtype:
       raise TypeError('Signature mismatch. Keys must be dtype %s, got %s.' %
                       (self._key_dtype, keys.dtype))
@@ -132,6 +152,7 @@ class ShardedMutableDenseHashTable(lookup.LookupInterface):
     return result
 
   def insert(self, keys, values, name=None):
+    """Inserts `keys` in a table."""
     self._check_keys(keys)
     num_shards = self._num_shards
     if num_shards == 1:
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py
index 553b116a3b3d76423d4700691fb6912101bebca4..2d1457f9e4cc576da696be191e718814dd9ff4e5 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for sharded_mutable_dense_hashtable.py."""
+"""Tests for sharded_mutable_dense_hashtable.py (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -33,6 +38,7 @@ class ShardedMutableDenseHashTableTest(TensorFlowTestCase):
       with self.cached_session():
         default_val = -1
         empty_key = 0
+        deleted_key = -1
         keys = constant_op.constant([11, 12, 13], dtypes.int64)
         values = constant_op.constant([0, 1, 2], dtypes.int64)
         table = ShardedMutableDenseHashTable(
@@ -40,6 +46,7 @@ class ShardedMutableDenseHashTableTest(TensorFlowTestCase):
             dtypes.int64,
             default_val,
             empty_key,
+            deleted_key,
             num_shards=num_shards)
         self.assertAllEqual(0, table.size().eval())
 
@@ -56,6 +63,7 @@ class ShardedMutableDenseHashTableTest(TensorFlowTestCase):
       with self.cached_session():
         default_val = [-0.1, 0.2]
         empty_key = [0, 1]
+        deleted_key = [1, 0]
         keys = constant_op.constant([[11, 12], [13, 14], [15, 16]],
                                     dtypes.int64)
         values = constant_op.constant([[0.5, 0.6], [1.5, 1.6], [2.5, 2.6]],
@@ -65,6 +73,7 @@ class ShardedMutableDenseHashTableTest(TensorFlowTestCase):
             dtypes.float32,
             default_val,
             empty_key,
+            deleted_key,
             num_shards=num_shards)
         self.assertAllEqual(0, table.size().eval())
 
@@ -81,6 +90,7 @@ class ShardedMutableDenseHashTableTest(TensorFlowTestCase):
   def testExportSharded(self):
     with self.cached_session():
       empty_key = -2
+      deleted_key = -3
       default_val = -1
       num_shards = 2
       keys = constant_op.constant([10, 11, 12], dtypes.int64)
@@ -90,6 +100,7 @@ class ShardedMutableDenseHashTableTest(TensorFlowTestCase):
           dtypes.int64,
           default_val,
           empty_key,
+          deleted_key,
           num_shards=num_shards)
       self.assertAllEqual(0, table.size().eval())
 
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py
index 003795233ff2b28e33fc10388ef25efb63c43bb0..64730f8eed1ff9bfcd4a980dceb28abb98e39f73 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Sparse feature column."""
+"""Sparse feature column (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +26,7 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework.ops import internal_convert_to_tensor
 from tensorflow.python.framework.ops import name_scope
+from tensorflow.python.util import deprecation
 
 
 class SparseFeatureColumn(object):
@@ -68,6 +74,10 @@ class SparseFeatureColumn(object):
   @@feature_values
   """
 
+  @deprecation.deprecated(
+      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
+      'please check its latest version in core: '
+      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
   def __init__(self, example_indices, feature_indices, feature_values):
     """Creates a `SparseFeatureColumn` representation.
 
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py
index 51c4f68543da2f563481cc2d35b556796616cf9d..0ae780e1a100c7dadde7196803f2ae0d4bcb2334 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for sparse_feature_column.py."""
+"""Tests for sparse_feature_column.py (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 647667188238dc18b137eaad98356a79b3a549b4..7a5354222f103aa0f45adc513079e420bbbfd30c 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -524,7 +524,7 @@ class SDCALinearRegressorTest(test.TestCase):
           # LinearClassifier requires at least one column.
           'place_holder':
               constant_op.constant([[0.0]] * num_examples),
-      }, constant_op.constant([[1 if i % 4 is 0 else 0]
+      }, constant_op.constant([[1 if i % 4 == 0 else 0]
                                for i in range(num_examples)])
 
     with self._single_threaded_test_session():
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
deleted file mode 100644
index 787a85644c35c807df84f74cbce06f80fd0b004d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/BUILD
+++ /dev/null
@@ -1,358 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
-
-exports_files(glob([
-    "testdata/*.bin",
-    "testdata/*.pb",
-    "models/testdata/*",
-]))
-
-config_setting(
-    name = "mips",
-    values = {
-        "cpu": "mips",
-    },
-)
-
-config_setting(
-    name = "mips64",
-    values = {
-        "cpu": "mips64",
-    },
-)
-
-# Enables inclusion of TensorFlow kernels via the TF Lite Flex delegate.
-# WARNING: This build flag is experimental and subject to change.
-config_setting(
-    name = "with_tflite_flex",
-    define_values = {"with_tflite_flex": "true"},
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "schema_fbs_version",
-    hdrs = ["version.h"],
-)
-
-cc_library(
-    name = "arena_planner",
-    srcs = ["arena_planner.cc"],
-    hdrs = ["arena_planner.h"],
-    deps = [
-        ":graph_info",
-        ":memory_planner",
-        ":simple_memory_arena",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ],
-)
-
-cc_test(
-    name = "arena_planner_test",
-    size = "small",
-    srcs = ["arena_planner_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
-    deps = [
-        ":arena_planner",
-        "//tensorflow/contrib/lite/testing:util",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-# Main library. No ops are included here.
-# TODO(aselle): Resolve problems preventing C99 usage.
-cc_library(
-    name = "context",
-    hdrs = ["context.h"],
-    deps = ["//tensorflow/contrib/lite/c:c_api_internal"],
-)
-
-cc_library(
-    name = "graph_info",
-    hdrs = ["graph_info.h"],
-    deps = ["//tensorflow/contrib/lite/c:c_api_internal"],
-)
-
-cc_library(
-    name = "memory_planner",
-    hdrs = ["memory_planner.h"],
-    deps = ["//tensorflow/contrib/lite/c:c_api_internal"],
-)
-
-cc_library(
-    name = "simple_memory_arena",
-    srcs = ["simple_memory_arena.cc"],
-    hdrs = ["simple_memory_arena.h"],
-    deps = ["//tensorflow/contrib/lite/c:c_api_internal"],
-)
-
-cc_library(
-    name = "builtin_op_data",
-    hdrs = [
-        "builtin_op_data.h",
-    ],
-    deps = ["//tensorflow/contrib/lite/c:c_api_internal"],
-)
-
-cc_library(
-    name = "kernel_api",
-    hdrs = [
-        "builtin_op_data.h",
-        "builtin_ops.h",
-        "context.h",
-        "context_util.h",
-    ],
-)
-
-exports_files(["builtin_ops.h"])
-
-cc_library(
-    name = "string",
-    hdrs = [
-        "string.h",
-    ],
-    deps = [
-        "//tensorflow/core:lib_platform",
-    ],
-)
-
-# TODO(ahentz): investigate dependency on gemm_support requiring usage of tf_copts.
-cc_library(
-    name = "framework",
-    srcs = [
-        "allocation.cc",
-        "graph_info.cc",
-        "interpreter.cc",
-        "model.cc",
-        "mutable_op_resolver.cc",
-        "optional_debug_tools.cc",
-        "stderr_reporter.cc",
-    ] + select({
-        "//tensorflow:android": [
-            "nnapi_delegate.cc",
-            "mmap_allocation.cc",
-        ],
-        "//tensorflow:windows": [
-            "nnapi_delegate_disabled.cc",
-            "mmap_allocation_disabled.cc",
-        ],
-        "//conditions:default": [
-            "nnapi_delegate_disabled.cc",
-            "mmap_allocation.cc",
-        ],
-    }),
-    hdrs = [
-        "allocation.h",
-        "context.h",
-        "context_util.h",
-        "error_reporter.h",
-        "graph_info.h",
-        "interpreter.h",
-        "model.h",
-        "mutable_op_resolver.h",
-        "nnapi_delegate.h",
-        "op_resolver.h",
-        "optional_debug_tools.h",
-        "stderr_reporter.h",
-    ],
-    copts = tflite_copts(),
-    linkopts = [
-    ] + select({
-        "//tensorflow:android": [
-            "-llog",
-        ],
-        "//conditions:default": [
-        ],
-    }),
-    deps = [
-        ":arena_planner",
-        ":graph_info",
-        ":memory_planner",
-        ":schema_fbs_version",
-        ":simple_memory_arena",
-        ":string",
-        ":util",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/core/api",
-        "//tensorflow/contrib/lite/kernels:eigen_support",
-        "//tensorflow/contrib/lite/kernels:gemm_support",
-        "//tensorflow/contrib/lite/nnapi:nnapi_lib",
-        "//tensorflow/contrib/lite/profiling:profiler",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-    ] + select({
-        ":with_tflite_flex": [
-            "//tensorflow/contrib/lite/delegates/flex:delegate",
-        ],
-        "//conditions:default": [],
-    }),
-)
-
-cc_library(
-    name = "string_util",
-    srcs = ["string_util.cc"],
-    hdrs = ["string_util.h"],
-    deps = [
-        ":framework",
-        ":string",
-    ],
-)
-
-cc_test(
-    name = "string_util_test",
-    size = "small",
-    srcs = ["string_util_test.cc"],
-    deps = [
-        ":framework",
-        ":string_util",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-# Test main interpreter
-cc_test(
-    name = "interpreter_test",
-    size = "small",
-    srcs = ["interpreter_test.cc"],
-    deps = [
-        ":framework",
-        ":string_util",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/core/api",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/kernels:kernel_util",
-        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-# Test graph utils
-cc_test(
-    name = "graph_info_test",
-    size = "small",
-    srcs = ["graph_info_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":framework",
-        ":string_util",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-# Test arena allocator
-cc_test(
-    name = "simple_memory_arena_test",
-    size = "small",
-    srcs = ["simple_memory_arena_test.cc"],
-    deps = [
-        ":simple_memory_arena",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-# Test model framework.
-cc_test(
-    name = "model_test",
-    size = "small",
-    srcs = ["model_test.cc"],
-    data = [
-        "testdata/0_subgraphs.bin",
-        "testdata/2_subgraphs.bin",
-        "testdata/empty_model.bin",
-        "testdata/multi_add_flex.bin",
-        "testdata/test_model.bin",
-        "testdata/test_model_broken.bin",
-    ],
-    deps = [
-        ":framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/core/api",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-# Test model framework with the flex library linked into the target.
-tf_cc_test(
-    name = "model_flex_test",
-    size = "small",
-    srcs = ["model_flex_test.cc"],
-    data = [
-        "testdata/multi_add_flex.bin",
-    ],
-    tags = ["no_windows"],  # TODO(b/116667551): No weak symbols with MSVC.
-    deps = [
-        ":framework",
-        "//tensorflow/contrib/lite/core/api",
-        "//tensorflow/contrib/lite/delegates/flex:delegate",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-# Test OpResolver.
-cc_test(
-    name = "mutable_op_resolver_test",
-    size = "small",
-    srcs = ["mutable_op_resolver_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":framework",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "util",
-    srcs = ["util.cc"],
-    hdrs = ["util.h"],
-    deps = [
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ],
-)
-
-cc_test(
-    name = "util_test",
-    size = "small",
-    srcs = ["util_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":util",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-# Test the serialization of a model with optional tensors.
-
-# Model tests
-
-#cc_library(
-#    name = "models_test_utils",
-#    testonly = 1,
-#    hdrs = ["models/test_utils.h"],
-#    deps = select({
-#        "//tensorflow:android": [],
-#        "//conditions:default": [
-#            "@com_google_absl//absl/strings",
-#            "//tensorflow/core:test",
-#        ],
-#    }),
-#)
diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
deleted file mode 100644
index a4b3d83efe09358cb8e7a5f673a96f28faa84d08..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# TensorFlow Lite
-
-TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded
-devices. It enables low-latency inference of on-device machine learning models
-with a small binary size and fast performance supporting hardware acceleration.
-
-See the documentation: https://www.tensorflow.org/lite/
-Documentation edits can be made here: [tensorflow/contrib/lite/g3doc](./g3doc/)
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
deleted file mode 100644
index 30901bd0fae9510ebea288288941218d6994d888..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Compatibility shim for new location of interface definitions.
-
-#ifndef TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
-#define TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-
-#endif  // TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
diff --git a/tensorflow/contrib/lite/c/BUILD b/tensorflow/contrib/lite/c/BUILD
deleted file mode 100644
index 663eb63cad0da0781cc2d07d1b78242bea2ee3c8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/c/BUILD
+++ /dev/null
@@ -1,39 +0,0 @@
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-cc_library(
-    name = "c_api_internal",
-    srcs = ["c_api_internal.c"],
-    hdrs = [
-        "builtin_op_data.h",
-        "c_api_internal.h",
-    ],
-    visibility = [
-        "//tensorflow/contrib/lite:__subpackages__",
-    ],
-)
-
-# Test the C extension API code.
-cc_test(
-    name = "c_api_internal_test",
-    size = "small",
-    srcs = ["c_api_internal_test.cc"],
-    deps = [
-        ":c_api_internal",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_test(
-    name = "builtin_op_data_test",
-    size = "small",
-    srcs = ["builtin_op_data_test.cc"],
-    copts = ["-Wno-unused-variable"],
-    deps = [
-        ":c_api_internal",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/tensorflow/contrib/lite/c/builtin_op_data.h b/tensorflow/contrib/lite/c/builtin_op_data.h
deleted file mode 100644
index 44daf7adaa0e76300c7199df2a7267e21d340534..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/c/builtin_op_data.h
+++ /dev/null
@@ -1,321 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_C_BUILTIN_OP_DATA_H_
-#define TENSORFLOW_CONTRIB_LITE_C_BUILTIN_OP_DATA_H_
-
-#include <stdint.h>
-
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-// TODO(aselle): Consider using "if this then that" for testing.
-
-// IMPORTANT: All new members of structs must be added at the end to ensure
-// backwards compatibility.
-
-// Possible padding types (for convolutions)
-typedef enum {
-  kTfLitePaddingUnknown = 0,
-  kTfLitePaddingSame,
-  kTfLitePaddingValid,
-} TfLitePadding;
-
-typedef struct {
-  int width;
-  int height;
-} TfLitePaddingValues;
-
-// Possible fused activation functions.
-// TODO(aselle): rename to TfLiteActivation
-typedef enum {
-  kTfLiteActNone = 0,
-  kTfLiteActRelu,
-  kTfLiteActRelu1,
-  kTfLiteActRelu6,
-  kTfLiteActTanh,
-  kTfLiteActSignBit,
-  kTfLiteActSigmoid,
-} TfLiteFusedActivation;
-
-typedef struct {
-  TfLitePadding padding;
-  int stride_width;
-  int stride_height;
-  int dilation_width_factor;
-  int dilation_height_factor;
-  TfLiteFusedActivation activation;
-} TfLiteConvParams;
-
-typedef struct {
-  TfLitePadding padding;
-  int stride_width;
-  int stride_height;
-  int filter_width;
-  int filter_height;
-  TfLiteFusedActivation activation;
-  struct {
-    TfLitePaddingValues padding;
-  } computed;
-} TfLitePoolParams;
-
-typedef struct {
-  // Parameters for DepthwiseConv version 1 or above.
-  TfLitePadding padding;
-  int stride_width;
-  int stride_height;
-  int depth_multiplier;
-  TfLiteFusedActivation activation;
-  // Parameters for DepthwiseConv version 2 or above.
-  int dilation_width_factor;
-  int dilation_height_factor;
-} TfLiteDepthwiseConvParams;
-
-typedef struct {
-  int rank;
-  TfLiteFusedActivation activation;
-} TfLiteSVDFParams;
-
-typedef struct {
-  TfLiteFusedActivation activation;
-} TfLiteRNNParams;
-
-typedef struct {
-  bool time_major;
-  TfLiteFusedActivation activation;
-} TfLiteSequenceRNNParams;
-
-typedef struct {
-  bool time_major;
-  TfLiteFusedActivation activation;
-  bool merge_outputs;
-} TfLiteBidirectionalSequenceRNNParams;
-
-typedef enum {
-  kTfLiteFullyConnectedWeightsFormatDefault = 0,
-  kTfLiteFullyConnectedWeightsFormatShuffled4x16Int8 = 1,
-} TfLiteFullyConnectedWeightsFormat;
-
-typedef struct {
-  // Parameters for FullyConnected version 1 or above.
-  TfLiteFusedActivation activation;
-
-  // Parameters for FullyConnected version 2 or above.
-  TfLiteFullyConnectedWeightsFormat weights_format;
-} TfLiteFullyConnectedParams;
-
-typedef enum {
-  kTfLiteLshProjectionUnknown = 0,
-  kTfLiteLshProjectionSparse = 1,
-  kTfLiteLshProjectionDense = 2,
-} TfLiteLSHProjectionType;
-
-typedef struct {
-  TfLiteLSHProjectionType type;
-} TfLiteLSHProjectionParams;
-
-typedef struct {
-  float beta;
-} TfLiteSoftmaxParams;
-
-typedef struct {
-  int axis;
-  TfLiteFusedActivation activation;
-} TfLiteConcatenationParams;
-
-typedef struct {
-  TfLiteFusedActivation activation;
-} TfLiteAddParams;
-
-typedef struct {
-} TfLiteSpaceToBatchNDParams;
-
-typedef struct {
-} TfLiteBatchToSpaceNDParams;
-
-typedef struct {
-  TfLiteFusedActivation activation;
-} TfLiteMulParams;
-
-typedef struct {
-  TfLiteFusedActivation activation;
-} TfLiteSubParams;
-
-typedef struct {
-  TfLiteFusedActivation activation;
-} TfLiteDivParams;
-
-typedef struct {
-  TfLiteFusedActivation activation;
-} TfLiteL2NormParams;
-
-typedef struct {
-  int radius;
-  float bias;
-  float alpha;
-  float beta;
-} TfLiteLocalResponseNormParams;
-
-typedef enum {
-  kTfLiteLSTMFullKernel = 0,
-  kTfLiteLSTMBasicKernel
-} TfLiteLSTMKernelType;
-
-typedef struct {
-  // Parameters for LSTM version 1.
-  TfLiteFusedActivation activation;
-  float cell_clip;
-  float proj_clip;
-
-  // Parameters for LSTM version 2.
-  // kTfLiteLSTMBasicKernel is only supported in version 2 or above.
-  TfLiteLSTMKernelType kernel_type;
-} TfLiteLSTMParams;
-
-typedef struct {
-  // Parameters for the LSTM kernel.
-  TfLiteFusedActivation activation;
-  float cell_clip;
-  float proj_clip;
-
-  // If true, store the outputs of both directions in the first output.
-  bool merge_outputs;
-} TfLiteBidirectionalSequenceLSTMParams;
-
-typedef struct {
-  bool align_corners;
-} TfLiteResizeBilinearParams;
-
-typedef struct {
-} TfLitePadParams;
-
-typedef struct {
-} TfLitePadV2Params;
-
-typedef struct {
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int shape[8];
-  int num_dimensions;
-} TfLiteReshapeParams;
-
-typedef struct {
-  int ngram_size;
-  int max_skip_size;
-  bool include_all_ngrams;
-} TfLiteSkipGramParams;
-
-typedef struct {
-  int block_size;
-} TfLiteSpaceToDepthParams;
-
-typedef struct {
-  TfLiteType in_data_type;
-  TfLiteType out_data_type;
-} TfLiteCastParams;
-
-typedef enum {
-  kTfLiteCombinerTypeSum = 0,
-  kTfLiteCombinerTypeMean = 1,
-  kTfLiteCombinerTypeSqrtn = 2,
-} TfLiteCombinerType;
-
-typedef struct {
-  TfLiteCombinerType combiner;
-} TfLiteEmbeddingLookupSparseParams;
-
-typedef struct {
-  int axis;
-} TfLiteGatherParams;
-
-typedef struct {
-} TfLiteTransposeParams;
-
-typedef struct {
-  bool keep_dims;
-} TfLiteReducerParams;
-
-typedef struct {
-  int num_splits;
-} TfLiteSplitParams;
-
-typedef struct {
-  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
-  // For now we will fix the maximum possible number of dimensions.
-  int squeeze_dims[8];
-  int num_squeeze_dims;
-} TfLiteSqueezeParams;
-
-typedef struct {
-  int begin_mask;
-  int end_mask;
-  int ellipsis_mask;
-  int new_axis_mask;
-  int shrink_axis_mask;
-} TfLiteStridedSliceParams;
-
-typedef struct {
-  TfLiteType output_type;
-} TfLiteArgMaxParams;
-
-typedef struct {
-  TfLiteType output_type;
-} TfLiteArgMinParams;
-
-typedef struct {
-  TfLitePadding padding;
-  int stride_width;
-  int stride_height;
-} TfLiteTransposeConvParams;
-
-typedef struct {
-  bool validate_indices;
-} TfLiteSparseToDenseParams;
-
-typedef struct {
-  TfLiteType out_type;
-} TfLiteShapeParams;
-
-typedef struct {
-  // Parameters supported by version 1:
-  float min;
-  float max;
-  int num_bits;
-
-  // Parameters supported by version 2:
-  bool narrow_range;
-} TfLiteFakeQuantParams;
-
-typedef struct {
-  int values_count;
-  int axis;
-} TfLitePackParams;
-
-typedef struct {
-  int axis;
-} TfLiteOneHotParams;
-
-typedef struct {
-  int num;
-  int axis;
-} TfLiteUnpackParams;
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-
-#endif  // TENSORFLOW_CONTRIB_LITE_C_BUILTIN_OP_DATA_H_
diff --git a/tensorflow/contrib/lite/c/c_api_internal.c b/tensorflow/contrib/lite/c/c_api_internal.c
deleted file mode 100644
index 8a0c177b1948df9b98e68f6cc6f44628ea8407a3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/c/c_api_internal.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#ifndef TF_LITE_STATIC_MEMORY
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#endif  // TF_LITE_STATIC_MEMORY
-
-int TfLiteIntArrayGetSizeInBytes(int size) {
-  static TfLiteIntArray dummy;
-  return sizeof(dummy) + sizeof(dummy.data[0]) * size;
-}
-
-int TfLiteIntArrayEqual(TfLiteIntArray* a, TfLiteIntArray* b) {
-  if (a == b) return 1;
-  if (a == NULL || b == NULL) return 0;
-  if (a->size != b->size) return 0;
-  int i = 0;
-  for (; i < a->size; i++)
-    if (a->data[i] != b->data[i]) return 0;
-  return 1;
-}
-
-#ifndef TF_LITE_STATIC_MEMORY
-
-TfLiteIntArray* TfLiteIntArrayCreate(int size) {
-  TfLiteIntArray* ret =
-      (TfLiteIntArray*)malloc(TfLiteIntArrayGetSizeInBytes(size));
-  ret->size = size;
-  return ret;
-}
-
-void TfLiteIntArrayPrint(const char* s, TfLiteIntArray* a) {
-  printf("%s: length=%d [", s, a->size);
-  if (a->size) printf("%d", a->data[0]);
-  int i = 1;
-  for (; i < a->size; i++) {
-    printf(" %d", a->data[i]);
-  }
-  printf("]\n");
-}
-
-TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src) {
-  if (!src) return NULL;
-  TfLiteIntArray* ret = TfLiteIntArrayCreate(src->size);
-  if (ret) {
-    memcpy(ret->data, src->data, src->size * sizeof(int));
-  }
-  return ret;
-}
-
-void TfLiteIntArrayFree(TfLiteIntArray* a) { free(a); }
-
-void TfLiteTensorDataFree(TfLiteTensor* t) {
-  if (t->allocation_type == kTfLiteDynamic && t->data.raw) {
-    free(t->data.raw);
-  }
-  t->data.raw = NULL;
-}
-
-void TfLiteTensorFree(TfLiteTensor* t) {
-  TfLiteTensorDataFree(t);
-  if (t->dims) TfLiteIntArrayFree(t->dims);
-  t->dims = NULL;
-}
-
-void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
-                       TfLiteQuantizationParams quantization, char* buffer,
-                       size_t size, TfLiteAllocationType allocation_type,
-                       const void* allocation, bool is_variable,
-                       TfLiteTensor* tensor) {
-  TfLiteTensorFree(tensor);
-  tensor->type = type;
-  tensor->name = name;
-  tensor->dims = dims;
-  tensor->params = quantization;
-  tensor->data.raw = buffer;
-  tensor->bytes = size;
-  tensor->allocation_type = allocation_type;
-  tensor->allocation = allocation;
-  tensor->is_variable = is_variable;
-}
-
-void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) {
-  if (tensor->allocation_type != kTfLiteDynamic) {
-    return;
-  }
-  if (!tensor->data.raw) {
-    tensor->data.raw = malloc(num_bytes);
-  } else if (num_bytes > tensor->bytes) {
-    tensor->data.raw = realloc(tensor->data.raw, num_bytes);
-  }
-  tensor->bytes = num_bytes;
-}
-#endif  // TF_LITE_STATIC_MEMORY
diff --git a/tensorflow/contrib/lite/c/c_api_internal.h b/tensorflow/contrib/lite/c/c_api_internal.h
deleted file mode 100644
index ee3dff6792a33a575e75fe7a1ef3dc7985be9c1d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/c/c_api_internal.h
+++ /dev/null
@@ -1,496 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// This file defines a C API for implementing operations in tflite.
-// These operations can be defined using c++ but the interface between
-// the interpreter and the operations are C.
-//
-// Summary of abstractions
-// TF_LITE_ENSURE - Self-sufficient error checking
-// TfLiteStatus - Status reporting
-// TfLiteIntArray - stores tensor shapes (dims),
-// TfLiteContext - allows an op to access the tensors
-// TfLiteTensor - tensor (a multidimensional array)
-// TfLiteNode - a single node or operation
-// TfLiteRegistration - the implementation of a conceptual operation.
-//
-// Some abstractions in this file are created and managed by Interpreter.
-#ifndef TENSORFLOW_CONTRIB_LITE_C_C_API_INTERNAL_H_
-#define TENSORFLOW_CONTRIB_LITE_C_C_API_INTERNAL_H_
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-typedef enum { kTfLiteOk = 0, kTfLiteError = 1 } TfLiteStatus;
-
-// The list of external context types known to TF Lite. This list exists solely
-// to avoid conflicts and to ensure ops can share the external contexts they
-// need. Access to the external contexts is controled by one of the
-// corresponding support files.
-typedef enum {
-  kTfLiteEigenContext = 0,     // include eigen_support.h to use.
-  kTfLiteGemmLowpContext = 1,  // include gemm_support.h to use.
-  kTfLiteEdgeTpuContext = 2,   // Placeholder for Edge TPU support.
-  kTfLiteMaxExternalContexts = 3
-} TfLiteExternalContextType;
-
-// An external context is a collection of information unrelated to the TF Lite
-// framework, but useful to a subset of the ops. TF Lite knows very little
-// about about the actual contexts, but it keeps a list of them, and is able to
-// refresh them if configurations like the number of recommended threads
-// change.
-typedef struct {
-  TfLiteExternalContextType type;
-  TfLiteStatus (*Refresh)(struct TfLiteContext* context);
-} TfLiteExternalContext;
-
-// Forward declare so GetNode can use this is in Context.
-typedef struct _TfLiteRegistration TfLiteRegistration;
-typedef struct _TfLiteDelegate TfLiteDelegate;
-
-#define kOptionalTensor (-1)
-
-// Fixed size list of integers. Used for dimensions and inputs/outputs tensor
-// indices
-typedef struct {
-  int size;
-// gcc 6.1+ have a bug where flexible members aren't properly handled
-// https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
-    __GNUC_MINOR__ >= 1
-  int data[0];
-#else
-  int data[];
-#endif
-} TfLiteIntArray;
-
-// Given the size (number of elements) in a TfLiteIntArray, calculate its size
-// in bytes.
-int TfLiteIntArrayGetSizeInBytes(int size);
-
-// Create a array of a given `size` (uninitialized entries).
-// This returns a pointer, that you must free using TfLiteIntArrayFree().
-TfLiteIntArray* TfLiteIntArrayCreate(int size);
-
-// Check if two tensors are equal. Returns 1 if they are equal, 0 otherwise.
-int TfLiteIntArrayEqual(TfLiteIntArray* a, TfLiteIntArray* b);
-
-// Create a copy of an array passed as `src`.
-// You are expected to free memory with TfLiteIntArrayFree
-TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src);
-
-// Free memory of array `v`.
-void TfLiteIntArrayFree(TfLiteIntArray* v);
-
-// Since we must not depend on any libraries, define a minimal subset of
-// error macros while avoiding names that have pre-conceived meanings like
-// assert and check.
-
-// Check whether value is true, and if not return kTfLiteError from
-// the current function (and report the error string msg).
-#define TF_LITE_ENSURE_MSG(context, value, msg)            \
-  do {                                                     \
-    if (!(value)) {                                        \
-      (context)->ReportError((context), __FILE__ " " msg); \
-      return kTfLiteError;                                 \
-    }                                                      \
-  } while (0)
-
-// Check whether the value `a` is true, and if not return kTfLiteError from
-// the current function, while also reporting the location of the error.
-#define TF_LITE_ENSURE(context, a)                                          \
-  do {                                                                      \
-    if (!(a)) {                                                             \
-      (context)->ReportError((context), "%s:%d %s was not true.", __FILE__, \
-                             __LINE__, #a);                                 \
-      return kTfLiteError;                                                  \
-    }                                                                       \
-  } while (0)
-
-#define TF_LITE_ENSURE_STATUS(a) \
-  do {                           \
-    if ((a) != kTfLiteOk) {      \
-      return kTfLiteError;       \
-    }                            \
-  } while (0)
-
-// Check whether the value `a == b` is true, and if not return kTfLiteError from
-// the current function, while also reporting the location of the error.
-// `a` and `b` may be evaluated more than once, so no side effects or
-// extremely expensive computations should be done.
-#define TF_LITE_ENSURE_EQ(context, a, b)                                       \
-  do {                                                                         \
-    if ((a) != (b)) {                                                          \
-      (context)->ReportError((context), "%s:%d %s != %s (%d != %d)", __FILE__, \
-                             __LINE__, #a, #b, (a), (b));                      \
-      return kTfLiteError;                                                     \
-    }                                                                          \
-  } while (0)
-
-#define TF_LITE_ENSURE_OK(context, status) \
-  do {                                     \
-    if ((status) != kTfLiteOk) {           \
-      return kTfLiteError;                 \
-    }                                      \
-  } while (0)
-
-// Single-precision complex data type compatible with the C99 definition.
-typedef struct {
-  float re, im;  // real and imaginary parts, respectively.
-} TfLiteComplex64;
-
-// Types supported by tensor
-typedef enum {
-  kTfLiteNoType = 0,
-  kTfLiteFloat32 = 1,
-  kTfLiteInt32 = 2,
-  kTfLiteUInt8 = 3,
-  kTfLiteInt64 = 4,
-  kTfLiteString = 5,
-  kTfLiteBool = 6,
-  kTfLiteInt16 = 7,
-  kTfLiteComplex64 = 8,
-} TfLiteType;
-
-// Parameters for asymmetric quantization. Quantized values can be converted
-// back to float using:
-//    real_value = scale * (quantized_value - zero_point);
-typedef struct {
-  float scale;
-  int32_t zero_point;
-} TfLiteQuantizationParams;
-
-// A union of pointers that points to memory for a given tensor.
-typedef union {
-  int* i32;
-  int64_t* i64;
-  float* f;
-  char* raw;
-  const char* raw_const;
-  uint8_t* uint8;
-  bool* b;
-  int16_t* i16;
-  TfLiteComplex64* c64;
-} TfLitePtrUnion;
-
-// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
-// data (or data externally allocated). kTfLiteArenaRw is arena allocated
-// data. kTfLiteDynamic is for tensors that are allocated during evaluation.
-typedef enum {
-  kTfLiteMemNone = 0,
-  kTfLiteMmapRo,
-  kTfLiteArenaRw,
-  kTfLiteArenaRwPersistent,
-  kTfLiteDynamic,
-} TfLiteAllocationType;
-
-// The delegates should use zero or positive integers to represent handles.
-// -1 is reserved from unallocated status.
-typedef int TfLiteBufferHandle;
-const TfLiteBufferHandle kTfLiteNullBufferHandle = -1;
-
-// An tensor in the interpreter system which is a wrapper around a buffer of
-// data including a dimensionality (or NULL if not currently defined).
-typedef struct {
-  // The data type specification for data stored in `data`. This affects
-  // what member of `data` union should be used.
-  TfLiteType type;
-  // A union of data pointers. The appropriate type should be used for a typed
-  // tensor based on `type`.
-  TfLitePtrUnion data;
-  // A pointer to a structure representing the dimensionality interpretation
-  // that the buffer should have. NOTE: the product of elements of `dims`
-  // and the element datatype size should be equal to `bytes` below.
-  TfLiteIntArray* dims;
-  // Quantization information.
-  TfLiteQuantizationParams params;
-  // How memory is mapped
-  //  kTfLiteMmapRo: Memory mapped read only.
-  //  i.e. weights
-  //  kTfLiteArenaRw: Arena allocated read write memory
-  //  (i.e. temporaries, outputs).
-  TfLiteAllocationType allocation_type;
-  // The number of bytes required to store the data of this Tensor. I.e.
-  // (bytes of each element) * dims[0] * ... * dims[n-1].  For example, if
-  // type is kTfLiteFloat32 and dims = {3, 2} then
-  // bytes = sizeof(float) * 3 * 2 = 4 * 3 * 2 = 24.
-  size_t bytes;
-
-  // An opaque pointer to a tflite::MMapAllocation
-  const void* allocation;
-
-  // Null-terminated name of this tensor.
-  const char* name;
-
-  // The delegate which knows how to handle `buffer_handle`.
-  // WARNING: This is an experimental interface that is subject to change.
-  TfLiteDelegate* delegate;
-
-  // An integer buffer handle that can be handled by `delegate`.
-  // The value is valid only when delegate is not null.
-  // WARNING: This is an experimental interface that is subject to change.
-  TfLiteBufferHandle buffer_handle;
-
-  // If the delegate uses its own buffer (e.g. GPU memory), the delegate is
-  // responsible to set data_is_stale to true.
-  // `delegate->CopyFromBufferHandle` can be called to copy the data from
-  // delegate buffer.
-  // WARNING: This is an // experimental interface that is subject to change.
-  bool data_is_stale;
-
-  // True if the tensor is a variable.
-  bool is_variable;
-} TfLiteTensor;
-
-// Free data memory of tensor `t`;
-void TfLiteTensorDataFree(TfLiteTensor* t);
-
-// Free memory of tensor `t`;
-void TfLiteTensorFree(TfLiteTensor* t);
-
-// Set all of a tensor's fields (and free any previously allocated data).
-void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
-                       TfLiteQuantizationParams quantization, char* buffer,
-                       size_t size, TfLiteAllocationType allocation_type,
-                       const void* allocation, bool is_variable,
-                       TfLiteTensor* tensor);
-
-// Resize the allocated data of a (dynamic) tensor. Tensors with allocation
-// types other than kTfLiteDynamic will be ignored.
-void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor);
-
-// A structure representing an instance of a node.
-// This structure only exhibits the inputs, outputs and user defined data, not
-// other features like the type.
-typedef struct {
-  // Inputs to this node expressed as indices into the simulator's tensors.
-  TfLiteIntArray* inputs;
-
-  // Outputs to this node expressed as indices into the simulator's tensors.
-  TfLiteIntArray* outputs;
-
-  // Temporary tensors uses during the computations. This usually contains no
-  // tensors, but ops are allowed to change that if they need scratch space of
-  // any sort.
-  TfLiteIntArray* temporaries;
-
-  // Opaque data provided by the node implementer through `Registration.init`.
-  void* user_data;
-
-  // Opaque data provided to the node if the node is a builtin. This is usually
-  // a structure defined in builtin_op_data.h
-  void* builtin_data;
-
-  // Custom initial data. This is the opaque data provided in the flatbuffer.
-  // WARNING: This is an experimental interface that is subject to change.
-  const void* custom_initial_data;
-  int custom_initial_data_size;
-
-  // The pointer to the delegate. This is non-null only when the node is
-  // created by calling `interpreter.ModifyGraphWithDelegate`.
-  // WARNING: This is an experimental interface that is subject to change.
-  TfLiteDelegate* delegate;
-} TfLiteNode;
-
-typedef struct TfLiteContext {
-  // Number of tensors in the context.
-  size_t tensors_size;
-
-  // The execution plan contains a list of the node indices in execution
-  // order. execution_plan->size is the current number of nodes. And,
-  // execution_plan->data[0] is the first node that needs to be run.
-  // TfLiteDelegates can traverse the current execution plan by iterating
-  // through each member of this array and using GetNodeAndRegistration() to
-  // access details about a node. i.e.
-  // TfLiteIntArray* execution_plan;
-  // TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &execution_plan));
-  // for (int exec_index = 0; exec_index < execution_plan->size; exec_index++) {
-  //    int node_index = execution_plan->data[exec_index];
-  //    TfLiteNode* node;
-  //    TfLiteRegistration* reg;
-  //    context->GetNodeAndRegistration(context, node_index, &node, &reg);
-  // }
-  // WARNING: This is an experimental interface that is subject to change.
-  TfLiteStatus (*GetExecutionPlan)(struct TfLiteContext* context,
-                                   TfLiteIntArray** execution_plan);
-
-  // An array of tensors in the interpreter context (of length `tensors_size`)
-  TfLiteTensor* tensors;
-
-  // opaque full context ptr (an opaque c++ data structure)
-  void* impl_;
-
-  // Request memory pointer be resized. Updates dimensions on the tensor.
-  // NOTE: ResizeTensor takes ownership of newSize.
-  TfLiteStatus (*ResizeTensor)(struct TfLiteContext*, TfLiteTensor* tensor,
-                               TfLiteIntArray* new_size);
-  // Request that a error be reported with format string msg.
-  void (*ReportError)(struct TfLiteContext*, const char* msg, ...);
-
-  // Add `tensors_to_add` tensors, preserving pre-existing Tensor entries.  If
-  // non-null, the value pointed to by `first_new_tensor_index` will be set to
-  // the index of the first new tensor.
-  TfLiteStatus (*AddTensors)(struct TfLiteContext*, int tensors_to_add,
-                             int* first_new_tensor_index);
-
-  // Get a Tensor node by node_index.
-  // WARNING: This is an experimental interface that is subject to change.
-  TfLiteStatus (*GetNodeAndRegistration)(struct TfLiteContext*, int node_index,
-                                         TfLiteNode** node,
-                                         TfLiteRegistration** registration);
-
-  // Replace ops with one or more stub delegate operations. This function
-  // does not take ownership of `nodes_to_replace`.
-  TfLiteStatus (*ReplaceSubgraphsWithDelegateKernels)(
-      struct TfLiteContext*, TfLiteRegistration registration,
-      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
-
-  // Number of threads that are recommended to subsystems like gemmlowp and
-  // eigen.
-  int recommended_num_threads;
-
-  // Access external contexts by type.
-  // WARNING: This is an experimental interface that is subject to change.
-  TfLiteExternalContext* (*GetExternalContext)(struct TfLiteContext*,
-                                               TfLiteExternalContextType);
-  // Set the value of a external context. Does not take ownership of the
-  // pointer.
-  // WARNING: This is an experimental interface that is subject to change.
-  void (*SetExternalContext)(struct TfLiteContext*, TfLiteExternalContextType,
-                             TfLiteExternalContext*);
-
-  // Flag for allowing float16 precision for FP32 calculation.
-  // default: false.
-  // WARNING: This is an experimental API and subject to change.
-  bool allow_fp32_relax_to_fp16;
-} TfLiteContext;
-
-typedef struct _TfLiteRegistration {
-  // Initializes the op from serialized data.
-  // If a built-in op:
-  //   `buffer` is the op's params data (TfLiteLSTMParams*).
-  //   `length` is zero.
-  // If custom op:
-  //   `buffer` is the op's `custom_options`.
-  //   `length` is the size of the buffer.
-  //
-  // Returns a type-punned (i.e. void*) opaque data (e.g. a primitive pointer
-  // or an instance of a struct).
-  //
-  // The returned pointer will be stored with the node in the `user_data` field,
-  // accessible within prepare and invoke functions below.
-  // NOTE: if the data is already in the desired format, simply implement this
-  // function to return `nullptr` and implement the free function to be a no-op.
-  void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
-
-  // The pointer `buffer` is the data previously returned by an init invocation.
-  void (*free)(TfLiteContext* context, void* buffer);
-
-  // prepare is called when the inputs this node depends on have been resized.
-  // context->ResizeTensor() can be called to request output tensors to be
-  // resized.
-  //
-  // Returns kTfLiteOk on success.
-  TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node);
-
-  // Execute the node (should read node->inputs and output to node->outputs).
-  // Returns kTfLiteOk on success.
-  TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node);
-
-  // profiling_string is called during summarization of profiling information
-  // in order to group executions together. Providing a value here will cause a
-  // given op to appear multiple times is the profiling report. This is
-  // particularly useful for custom ops that can perform significantly
-  // different calculations depending on their `user-data`.
-  const char* (*profiling_string)(const TfLiteContext* context,
-                                  const TfLiteNode* node);
-
-  // Builtin codes. If this kernel refers to a builtin this is the code
-  // of the builtin. This is so we can do marshaling to other frameworks like
-  // NN API.
-  // Note: It is the responsibility of the registration binder to set this
-  // properly.
-  int32_t builtin_code;
-
-  // Custom op name. If the op is a builtin, this will be null.
-  // Note: It is the responsibility of the registration binder to set this
-  // properly.
-  // WARNING: This is an experimental interface that is subject to change.
-  const char* custom_name;
-
-  // The version of the op.
-  // Note: It is the responsibility of the registration binder to set this
-  // properly.
-  int version;
-} TfLiteRegistration;
-
-// WARNING: This is an experimental interface that is subject to change.
-typedef struct _TfLiteDelegate {
-  // Data that delegate needs to identify itself. This data is owned by the
-  // delegate. The delegate is owned in the user code, so the delegate is
-  // responsible for doing this when it is destroyed.
-  void* data_;
-
-  // Invoked by ModifyGraphWithDelegate. This prepare is called, giving the
-  // delegate a view of the current graph through TfLiteContext*. It typically
-  // will look at the nodes and call ReplaceSubgraphsWithDelegateKernels()
-  // to ask the TensorFlow lite runtime to create macro-nodes to represent
-  // delegated subgraphs of the original graph.
-  TfLiteStatus (*Prepare)(TfLiteContext* context, TfLiteDelegate* delegate);
-
-  // Copy the data from delegate buffer handle to raw memory.
-  // This can be null if the delegate doesn't use its own buffer.
-  TfLiteStatus (*CopyFromBufferHandle)(TfLiteContext* context,
-                                       TfLiteDelegate* delegate,
-                                       TfLiteBufferHandle buffer_handle,
-                                       void* data, size_t size);
-
-  // Copy the data from raw memory to delegate buffer handle.
-  // This can be null if the delegate doesn't use its own buffer.
-  TfLiteStatus (*CopyToBufferHandle)(TfLiteContext* context,
-                                     TfLiteDelegate* delegate,
-                                     TfLiteBufferHandle buffer_handle,
-                                     void* data, size_t size);
-
-  // Free the Delegate Buffer Handle. Note: This only frees the handle, but
-  // this doesn't release the underlying resource (e.g. textures). The
-  // resources are either owned by application layer or the delegate.
-  // This can be null if the delegate doesn't use its own buffer.
-  void (*FreeBufferHandle)(TfLiteContext* context, TfLiteDelegate* delegate,
-                           TfLiteBufferHandle* handle);
-} TfLiteDelegate;
-
-// WARNING: This is an experimental interface that is subject to change.
-//
-// Currently, TfLiteDelegateParams has to be allocated in a way that it's
-// trivially destructable. It will be stored as `builtin_data` field in
-// `TfLiteNode` of the delegate node.
-//
-// See also the `CreateDelegateParams` function in `interpreter.cc` details.
-typedef struct {
-  TfLiteDelegate* delegate;
-  TfLiteIntArray* nodes_to_replace;
-  TfLiteIntArray* input_tensors;
-  TfLiteIntArray* output_tensors;
-} TfLiteDelegateParams;
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_C_C_API_INTERNAL_H_
diff --git a/tensorflow/contrib/lite/core/api/BUILD b/tensorflow/contrib/lite/core/api/BUILD
deleted file mode 100644
index e4500534f348f15b47d3c3868461237e68fc3ac3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/core/api/BUILD
+++ /dev/null
@@ -1,57 +0,0 @@
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-
-cc_library(
-    name = "api",
-    srcs = [
-        "error_reporter.cc",
-        "flatbuffer_conversions.cc",
-        "op_resolver.cc",
-    ],
-    hdrs = [
-        "error_reporter.h",
-        "flatbuffer_conversions.h",
-        "op_resolver.h",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-    ],
-)
-
-cc_test(
-    name = "error_reporter_test",
-    size = "small",
-    srcs = ["error_reporter_test.cc"],
-    deps = [
-        ":api",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_test(
-    name = "op_resolver_test",
-    size = "small",
-    srcs = ["op_resolver_test.cc"],
-    deps = [
-        ":api",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_test(
-    name = "flatbuffer_conversions_test",
-    size = "small",
-    srcs = ["flatbuffer_conversions_test.cc"],
-    deps = [
-        ":api",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/tensorflow/contrib/lite/core/api/error_reporter.h b/tensorflow/contrib/lite/core/api/error_reporter.h
deleted file mode 100644
index a2f780b003fc213d28ba29d7783dbfe99088cccc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/core/api/error_reporter.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_CORE_API_ERROR_REPORTER_H_
-#define TENSORFLOW_CONTRIB_LITE_CORE_API_ERROR_REPORTER_H_
-
-#include <cstdarg>
-
-namespace tflite {
-
-// A functor that reports error to supporting system. Invoked similar to
-// printf.
-//
-// Usage:
-//  ErrorReporter foo;
-//  foo.Report("test %d", 5);
-// or
-//  va_list args;
-//  foo.Report("test %d", args); // where args is va_list
-//
-// Subclass ErrorReporter to provide another reporting destination.
-// For example, if you have a GUI program, you might redirect to a buffer
-// that drives a GUI error log box.
-class ErrorReporter {
- public:
-  virtual ~ErrorReporter() {}
-  virtual int Report(const char* format, va_list args) = 0;
-  int Report(const char* format, ...);
-  int ReportError(void*, const char* format, ...);
-};
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_CORE_API_ERROR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/core/api/op_resolver.h b/tensorflow/contrib/lite/core/api/op_resolver.h
deleted file mode 100644
index 5f5e6b27363b525094659719da9decf49dbeac45..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/core/api/op_resolver.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_CORE_API_OP_RESOLVER_H_
-#define TENSORFLOW_CONTRIB_LITE_CORE_API_OP_RESOLVER_H_
-
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-
-namespace tflite {
-
-// Abstract interface that returns TfLiteRegistrations given op codes or custom
-// op names. This is the mechanism that ops being referenced in the flatbuffer
-// model are mapped to executable function pointers (TfLiteRegistrations).
-class OpResolver {
- public:
-  // Finds the op registration for a builtin operator by enum code.
-  virtual const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
-                                           int version) const = 0;
-  // Finds the op registration of a custom operator by op name.
-  virtual const TfLiteRegistration* FindOp(const char* op,
-                                           int version) const = 0;
-  virtual ~OpResolver() {}
-};
-
-// Handles the logic for converting between an OperatorCode structure extracted
-// from a flatbuffer and information about a registered operator implementation.
-TfLiteStatus GetRegistrationFromOpCode(const OperatorCode* opcode,
-                                       const OpResolver& op_resolver,
-                                       ErrorReporter* error_reporter,
-                                       const TfLiteRegistration** registration);
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_CORE_API_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/delegates/flex/BUILD b/tensorflow/contrib/lite/delegates/flex/BUILD
deleted file mode 100644
index 9b89ed4f849e224d36adae7c3a7581ac542d4f0f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/BUILD
+++ /dev/null
@@ -1,202 +0,0 @@
-#
-# This is a TF Lite delegate that is powered by TensorFlow's Eager.
-#
-package(default_visibility = [
-    "//visibility:private",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-cc_library(
-    name = "buffer_map",
-    srcs = ["buffer_map.cc"],
-    hdrs = ["buffer_map.h"],
-    deps = [
-        ":util",
-        "//tensorflow/c:c_api_internal",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite:kernel_api",
-    ] + select({
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib_lite",
-        ],
-        "//conditions:default": [
-            "//tensorflow/core:framework",
-            "//tensorflow/core:protos_all_cc",
-        ],
-    }),
-)
-
-tf_cc_test(
-    name = "buffer_map_test",
-    size = "small",
-    srcs = ["buffer_map_test.cc"],
-    deps = [
-        ":buffer_map",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:util",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "delegate",
-    srcs = [
-        "delegate.cc",
-    ],
-    hdrs = [
-        "delegate.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":buffer_map",
-        ":delegate_data",
-        ":kernel",
-        ":util",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite:kernel_api",
-        "//tensorflow/contrib/lite:util",
-    ] + select({
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib_lite",
-        ],
-        "//conditions:default": [
-            "//tensorflow/core:lib",
-        ],
-    }),
-    alwayslink = 1,
-)
-
-tf_cc_test(
-    name = "delegate_test",
-    size = "small",
-    srcs = ["delegate_test.cc"],
-    deps = [
-        ":delegate",
-        ":test_util",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "delegate_data",
-    srcs = ["delegate_data.cc"],
-    hdrs = ["delegate_data.h"],
-    deps = [
-        ":buffer_map",
-        "//tensorflow/core/common_runtime/eager:context",
-    ] + select({
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib_lite",
-        ],
-        "//conditions:default": [
-            "//tensorflow/core:core_cpu",
-            "//tensorflow/core:lib",
-        ],
-    }),
-)
-
-tf_cc_test(
-    name = "delegate_data_test",
-    size = "small",
-    srcs = ["delegate_data_test.cc"],
-    deps = [
-        ":delegate_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:util",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "kernel",
-    srcs = ["kernel.cc"],
-    hdrs = ["kernel.h"],
-    deps = [
-        ":delegate_data",
-        ":util",
-        "@flatbuffers",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite:kernel_api",
-        "//tensorflow/contrib/lite:string",
-        "//tensorflow/contrib/lite/kernels:kernel_util",
-        "//tensorflow/core/common_runtime/eager:context",
-        "//tensorflow/core/common_runtime/eager:execute",
-        "//tensorflow/core/common_runtime/eager:tensor_handle",
-    ] + select({
-        # TODO(b/111881878): The android_tensorflow_lib target pulls in the full
-        # set of core TensorFlow kernels. We may want to revisit this dependency
-        # to allow selective registration via build targets.
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib",
-        ],
-        "//conditions:default": [
-            "//tensorflow/core:protos_all_cc",
-            "//tensorflow/core:framework",
-            "//tensorflow/core:tensorflow",
-        ],
-    }),
-)
-
-tf_cc_test(
-    name = "kernel_test",
-    size = "small",
-    srcs = ["kernel_test.cc"],
-    deps = [
-        ":delegate_data",
-        ":kernel",
-        ":test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "test_util",
-    testonly = True,
-    srcs = ["test_util.cc"],
-    hdrs = ["test_util.h"],
-    deps = [
-        "//tensorflow/c:c_api_internal",
-        "//tensorflow/contrib/lite:string",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_absl//absl/memory",
-        "@flatbuffers",
-    ],
-)
-
-cc_library(
-    name = "util",
-    srcs = ["util.cc"],
-    hdrs = ["util.h"],
-    deps = [
-        "//tensorflow/c:c_api_internal",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite:kernel_api",
-    ] + select({
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib",
-        ],
-        "//conditions:default": [
-            "//tensorflow/core:lib",
-            "//tensorflow/core:framework",
-        ],
-    }),
-)
-
-tf_cc_test(
-    name = "util_test",
-    size = "small",
-    srcs = ["util_test.cc"],
-    deps = [
-        ":util",
-        "//tensorflow/contrib/lite:string",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/tensorflow/contrib/lite/delegates/flex/buffer_map.cc b/tensorflow/contrib/lite/delegates/flex/buffer_map.cc
deleted file mode 100644
index 63e39196d96a176eca105e7b11107ab52fe528dd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/buffer_map.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/buffer_map.h"
-
-#include "tensorflow/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/delegates/flex/util.h"
-#include "tensorflow/core/framework/allocation_description.pb.h"
-#include "tensorflow/core/framework/log_memory.h"
-
-namespace tflite {
-namespace flex {
-namespace {
-// A tensor buffer that is allocated, deallocated and populated by TF Lite.
-class TfLiteTensorBuffer : public tensorflow::TensorBuffer {
- public:
-  explicit TfLiteTensorBuffer(const TfLiteTensor* tensor) {
-    len_ = tensor->bytes;
-    // TODO(ahentz): if we can guarantee that TF Lite allocated tensors with
-    // the same alignment as TensorFlow (EIGEN_MAX_ALIGN_BYTES), then we can
-    // potentially eliminate the copy below.
-    data_ =
-        tensorflow::cpu_allocator()->AllocateRaw(EIGEN_MAX_ALIGN_BYTES, len_);
-    if (data_ != nullptr) {
-      if (tensorflow::LogMemory::IsEnabled()) {
-        tensorflow::LogMemory::RecordRawAllocation(
-            "TfLiteTensorBuffer_New",
-            tensorflow::LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID, len_,
-            data_, tensorflow::cpu_allocator());
-      }
-      std::memcpy(data_, tensor->data.raw, tensor->bytes);
-    }
-  }
-
-  ~TfLiteTensorBuffer() override {
-    if (tensorflow::LogMemory::IsEnabled() && data_ != nullptr) {
-      tensorflow::LogMemory::RecordRawDeallocation(
-          "TfLiteTensorBuffer_Delete",
-          tensorflow::LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID, data_,
-          tensorflow::cpu_allocator(), false);
-    }
-    tensorflow::cpu_allocator()->DeallocateRaw(data_);
-  }
-
-  void* data() const override { return data_; }
-  size_t size() const override { return len_; }
-
-  TensorBuffer* root_buffer() override { return this; }
-  void FillAllocationDescription(
-      tensorflow::AllocationDescription* proto) const override {
-    tensorflow::int64 rb = size();
-    proto->set_requested_bytes(rb);
-    proto->set_allocator_name(tensorflow::cpu_allocator()->Name());
-  }
-
-  // Prevents input forwarding from mutating this buffer.
-  bool OwnsMemory() const override { return false; }
-
- private:
-  void* data_;
-  size_t len_;
-};
-}  // namespace
-
-BufferMap::BufferMap() {}
-
-BufferMap::~BufferMap() {}
-
-bool BufferMap::HasTensor(int tensor_index) const {
-  return id_to_tensor_.count(tensor_index) != 0;
-}
-
-tensorflow::Tensor BufferMap::GetTensor(int tensor_index) const {
-  return id_to_tensor_.at(tensor_index);
-}
-
-void BufferMap::SetFromTfLite(int tensor_index, const TfLiteTensor* tensor) {
-  tensorflow::TensorShape shape;
-  int num_dims = tensor->dims->size;
-  for (int i = 0; i < num_dims; ++i) {
-    shape.AddDim(tensor->dims->data[i]);
-  }
-  // TODO(ahentz): we assume this is a new tensor and allocate a new buffer
-  // for it. This is not always the best approach. For example, this might
-  // be a reallocation after resizing tensors. In that case we would be
-  // preferable to somehow reuse the buffer.
-  auto* buf = new TfLiteTensorBuffer(tensor);
-  tensorflow::Tensor t = tensorflow::TensorCApi::MakeTensor(
-      GetTensorFlowDataType(tensor->type), shape, buf);
-  buf->Unref();
-
-  SetFromTensorFlow(tensor_index, std::move(t));
-}
-
-void BufferMap::SetFromTensorFlow(int tensor_index, tensorflow::Tensor tensor) {
-  id_to_tensor_[tensor_index] = std::move(tensor);
-}
-
-}  // namespace flex
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/flex/buffer_map.h b/tensorflow/contrib/lite/delegates/flex/buffer_map.h
deleted file mode 100644
index 4ce886568a55773971bc0543ec973ec84c0aac1b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/buffer_map.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_BUFFER_MAP_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_BUFFER_MAP_H_
-
-#include <map>
-
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/core/framework/tensor.h"
-
-namespace tflite {
-namespace flex {
-
-// Maps a TF Lite tensor index into a TensorFlow tensor.
-//
-// The TF Lite interpreter assigns integer indices to each of its tensors, but
-// the Flex delegate deals in terms of TensorFlow tensors. This class maps
-// from indices to tensors and allows the creation of new tensors to be
-// associated with a given index.
-class BufferMap {
- public:
-  BufferMap();
-  ~BufferMap();
-
-  // Returns true if the given 'tensor_index' has a corresponding
-  // tensorflow::Tensor.
-  bool HasTensor(int tensor_index) const;
-
-  // Returns the tensorflow::Tensor associated with the given 'tensor_index'.
-  // Precondition: HasTensor() is true.
-  tensorflow::Tensor GetTensor(int tensor_index) const;
-
-  // Associates the given tensorflow::Tensor with the given 'tensor_index'.
-  // Note that tensorflow Tensors share data buffers, so this method is only a
-  // shallow copy.
-  void SetFromTensorFlow(int tensor_index, tensorflow::Tensor tensor);
-
-  // Same as above but creates a new tensorflow::Tensor with a copy of the
-  // given TfLiteTensor's data.
-  void SetFromTfLite(int tensor_index, const TfLiteTensor* tensor);
-
- private:
-  std::map<int, tensorflow::Tensor> id_to_tensor_;
-};
-
-}  // namespace flex
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_BUFFER_MAP_H_
diff --git a/tensorflow/contrib/lite/delegates/flex/buffer_map_test.cc b/tensorflow/contrib/lite/delegates/flex/buffer_map_test.cc
deleted file mode 100644
index bb80e25e8076bb95782e4137945ad1c7cd178aee..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/buffer_map_test.cc
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/buffer_map.h"
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/testing/util.h"
-#include "tensorflow/contrib/lite/util.h"
-
-namespace tflite {
-namespace flex {
-namespace {
-
-using ::testing::ElementsAre;
-
-// A bit of RAII to simplify handling of TfLiteTensors in the tests.
-using UniqueTfLiteTensor =
-    std::unique_ptr<TfLiteTensor, std::function<void(TfLiteTensor*)>>;
-
-template <typename T>
-UniqueTfLiteTensor MakeLiteTensor(const std::vector<int>& shape,
-                                  const std::vector<T>& data) {
-  auto tensor = UniqueTfLiteTensor(new TfLiteTensor, [](TfLiteTensor* t) {
-    TfLiteTensorDataFree(t);
-    TfLiteIntArrayFree(t->dims);
-    delete t;
-  });
-  tensor->allocation_type = kTfLiteDynamic;
-  tensor->type = typeToTfLiteType<T>();
-  tensor->dims = ConvertVectorToTfLiteIntArray(shape);
-  tensor->data.raw = nullptr;
-  TfLiteTensorRealloc(data.size() * sizeof(T), tensor.get());
-  memcpy(tensor->data.raw, data.data(), data.size() * sizeof(T));
-  return tensor;
-}
-
-template <typename T>
-tensorflow::Tensor MakeTensor(const std::vector<int>& shape,
-                              const std::vector<T>& data) {
-  BufferMap buffer_map;  // BufferMap is the easiest way to build the tensor.
-  UniqueTfLiteTensor t1 = MakeLiteTensor<T>(shape, data);
-  buffer_map.SetFromTfLite(0, t1.get());
-  return buffer_map.GetTensor(0);
-}
-
-std::vector<tensorflow::int64> GetTensorShape(const tensorflow::Tensor& t) {
-  std::vector<tensorflow::int64> shape(t.dims());
-  for (int i = 0; i < t.dims(); ++i) {
-    shape[i] = t.dim_size(i);
-  }
-  return shape;
-}
-
-template <typename T>
-std::vector<T> GetTensorData(const tensorflow::Tensor& t) {
-  const T* data = t.flat<T>().data();
-  return std::vector<T>(data, data + t.NumElements());
-}
-
-TEST(BufferMapTest, EmptyBuffer) {
-  BufferMap buffer_map;
-  EXPECT_FALSE(buffer_map.HasTensor(0));
-}
-
-TEST(BufferMapTest, SetFromTfLite) {
-  BufferMap buffer_map;
-
-  UniqueTfLiteTensor t =
-      MakeLiteTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
-  buffer_map.SetFromTfLite(0, t.get());
-  ASSERT_TRUE(buffer_map.HasTensor(0));
-
-  EXPECT_THAT(GetTensorData<float>(buffer_map.GetTensor(0)),
-              ElementsAre(0, 0, 0, 0.123f, 0, 0));
-
-  // Also check details of the tensor.
-  tensorflow::Tensor out_tensor = buffer_map.GetTensor(0);
-  ASSERT_EQ(out_tensor.dtype(), tensorflow::DT_FLOAT);
-  ASSERT_EQ(out_tensor.NumElements(), 6);
-  ASSERT_THAT(GetTensorShape(out_tensor), ElementsAre(1, 2, 1, 3));
-}
-
-TEST(BufferMapTest, SetFromTfLiteTwice) {
-  UniqueTfLiteTensor t1 =
-      MakeLiteTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
-  UniqueTfLiteTensor t2 =
-      MakeLiteTensor<int>({1, 2, 4}, {0, 0, 0, 3, 0, 0, 1, 2});
-
-  BufferMap buffer_map;
-  buffer_map.SetFromTfLite(0, t1.get());
-  buffer_map.SetFromTfLite(0, t2.get());
-
-  EXPECT_THAT(GetTensorData<int>(buffer_map.GetTensor(0)),
-              ElementsAre(0, 0, 0, 3, 0, 0, 1, 2));
-}
-
-TEST(BufferMapTest, SetFromTensorFlow) {
-  tensorflow::Tensor t1 =
-      MakeTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
-
-  BufferMap buffer_map;
-  buffer_map.SetFromTensorFlow(0, t1);
-
-  EXPECT_THAT(GetTensorData<float>(buffer_map.GetTensor(0)),
-              ElementsAre(0, 0, 0, 0.123f, 0, 0));
-
-  // Also check details of the tensor.
-  tensorflow::Tensor out_tensor = buffer_map.GetTensor(0);
-  ASSERT_EQ(out_tensor.dtype(), tensorflow::DT_FLOAT);
-  ASSERT_EQ(out_tensor.NumElements(), 6);
-  ASSERT_THAT(GetTensorShape(out_tensor), ElementsAre(1, 2, 1, 3));
-}
-
-TEST(BufferMapTest, SetFromTensorFlowTwice) {
-  tensorflow::Tensor t1 =
-      MakeTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
-  tensorflow::Tensor t2 = MakeTensor<int>({1, 2, 4}, {0, 0, 0, 3, 0, 0, 1, 2});
-  BufferMap buffer_map;
-  buffer_map.SetFromTensorFlow(0, t1);
-  buffer_map.SetFromTensorFlow(0, t2);
-
-  EXPECT_THAT(GetTensorData<int>(buffer_map.GetTensor(0)),
-              ElementsAre(0, 0, 0, 3, 0, 0, 1, 2));
-}
-
-TEST(BufferMapTest, TfLiteOverwritesTensorFlow) {
-  tensorflow::Tensor t1 =
-      MakeTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
-  UniqueTfLiteTensor t2 =
-      MakeLiteTensor<int>({1, 2, 4}, {0, 0, 0, 3, 0, 0, 1, 2});
-
-  BufferMap buffer_map;
-  buffer_map.SetFromTensorFlow(0, t1);
-  buffer_map.SetFromTfLite(0, t2.get());
-
-  EXPECT_THAT(GetTensorData<int>(buffer_map.GetTensor(0)),
-              ElementsAre(0, 0, 0, 3, 0, 0, 1, 2));
-}
-
-TEST(BufferMapTest, TensorFlowOverwritesTfLite) {
-  tensorflow::Tensor t1 =
-      MakeTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
-  UniqueTfLiteTensor t2 =
-      MakeLiteTensor<int>({1, 2, 4}, {0, 0, 0, 3, 0, 0, 1, 2});
-  BufferMap buffer_map;
-  buffer_map.SetFromTfLite(0, t2.get());
-  buffer_map.SetFromTensorFlow(0, t1);
-
-  EXPECT_THAT(GetTensorData<float>(buffer_map.GetTensor(0)),
-              ElementsAre(0, 0, 0, 0.123f, 0, 0));
-}
-
-}  // namespace
-}  // namespace flex
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/delegates/flex/delegate.cc b/tensorflow/contrib/lite/delegates/flex/delegate.cc
deleted file mode 100644
index c72b0cf51383897ce3afec0c39ed6bfe178d88c1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/delegate.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
-
-#include <vector>
-
-#include "tensorflow/contrib/lite/context_util.h"
-#include "tensorflow/contrib/lite/delegates/flex/buffer_map.h"
-#include "tensorflow/contrib/lite/delegates/flex/kernel.h"
-#include "tensorflow/contrib/lite/delegates/flex/util.h"
-#include "tensorflow/contrib/lite/util.h"
-#include "tensorflow/core/lib/core/status.h"
-
-namespace tflite {
-namespace flex {
-namespace delegate {
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
-  // Get the nodes in the current execution plan. Interpreter owns this array.
-  TfLiteIntArray* plan;
-  TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
-
-  // Add all custom ops starting with "Flex" to list of supported nodes.
-  std::vector<int> supported_nodes;
-  for (int node_index : TfLiteIntArrayView(plan)) {
-    TfLiteNode* node;
-    TfLiteRegistration* registration;
-    TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
-        context, node_index, &node, &registration));
-
-    if (IsFlexOp(registration->custom_name)) {
-      supported_nodes.push_back(node_index);
-    }
-  }
-
-  // Request TFLite to partition the graph and make kernels for each independent
-  // subgraph.
-  TfLiteIntArray* size_and_nodes =
-      ConvertVectorToTfLiteIntArray(supported_nodes);
-  context->ReplaceSubgraphsWithDelegateKernels(context, GetKernel(),
-                                               size_and_nodes, delegate);
-  TfLiteIntArrayFree(size_and_nodes);
-  return kTfLiteOk;
-}
-
-TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
-                                  TfLiteDelegate* delegate,
-                                  TfLiteBufferHandle buffer_handle, void* data,
-                                  size_t size) {
-  BufferMap* buffer_map =
-      reinterpret_cast<DelegateData*>(delegate->data_)->GetBufferMap(context);
-
-  if (!buffer_map->HasTensor(buffer_handle)) {
-    context->ReportError(context, "Invalid tensor index %d.", buffer_handle);
-    return kTfLiteError;
-  }
-
-  tensorflow::Tensor t = buffer_map->GetTensor(buffer_handle);
-  tensorflow::StringPiece t_data = t.tensor_data();
-
-  if (size != t_data.size()) {
-    context->ReportError(
-        context, "Not enough space to store TensorFlow's aligned buffer.");
-    return kTfLiteError;
-  }
-
-  memcpy(data, t_data.data(), t_data.size());
-  return kTfLiteOk;
-}
-
-}  // namespace delegate
-}  // namespace flex
-
-// Corresponding weak declaration found in lite/model.cc.
-std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>
-AcquireFlexDelegate() {
-  return std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>(
-      tflite::FlexDelegate::Create().release(), [](TfLiteDelegate* delegate) {
-        delete reinterpret_cast<tflite::FlexDelegate*>(delegate);
-      });
-}
-
-std::unique_ptr<FlexDelegate> FlexDelegate::Create() {
-  std::unique_ptr<flex::DelegateData> delegate_data;
-  if (!flex::DelegateData::Create(&delegate_data).ok()) {
-    fprintf(stderr, "Unable to initialize TensorFlow context.\n");
-    return nullptr;
-  }
-
-  return std::unique_ptr<FlexDelegate>(
-      new FlexDelegate(std::move(delegate_data)));
-}
-
-FlexDelegate::FlexDelegate(std::unique_ptr<flex::DelegateData> delegate_data)
-    : TfLiteDelegate{
-          /*data_=*/delegate_data.get(),
-          /*nullptr,*/ &flex::delegate::Prepare,
-          /*CopyFromBufferHandle=*/&flex::delegate::CopyFromBufferHandle,
-          /*CopyToBufferHandle=*/nullptr,
-          /*FreeBufferHandle=*/nullptr},
-      delegate_data_(std::move(delegate_data)) {}
-
-FlexDelegate::~FlexDelegate() {}
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/flex/kernel_test.cc b/tensorflow/contrib/lite/delegates/flex/kernel_test.cc
deleted file mode 100644
index 94a6f8b61ad28144f6b8d0d462338ab4176af168..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/kernel_test.cc
+++ /dev/null
@@ -1,230 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/kernel.h"
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
-#include "tensorflow/contrib/lite/delegates/flex/test_util.h"
-
-namespace tflite {
-namespace flex {
-namespace {
-
-using ::testing::ContainsRegex;
-using ::testing::ElementsAre;
-
-TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
-                            const std::vector<int>& supported_nodes) {
-  TfLiteIntArray* size_and_nodes =
-      ConvertVectorToTfLiteIntArray(supported_nodes);
-  TF_LITE_ENSURE_STATUS(context->ReplaceSubgraphsWithDelegateKernels(
-      context, flex::GetKernel(), size_and_nodes, delegate));
-  TfLiteIntArrayFree(size_and_nodes);
-  return kTfLiteOk;
-}
-
-class KernelTest : public testing::FlexModelTest {
- public:
-  KernelTest() {
-    CHECK(DelegateData::Create(&delegate_data_).ok());
-    interpreter_.reset(new Interpreter(&error_reporter_));
-  }
-
-  ~KernelTest() override {
-    // The data needs to be released before the interpreter because the
-    // interpreter references the data.
-    delegate_data_.reset();
-    interpreter_.reset();
-  }
-
-  template <typename T>
-  void ConfigureDelegate(T prepare_function) {
-    delegate_.data_ = delegate_data_.get();
-    delegate_.FreeBufferHandle = nullptr;
-    delegate_.Prepare = prepare_function;
-    delegate_.CopyFromBufferHandle = [](TfLiteContext* context,
-                                        TfLiteDelegate* delegate,
-                                        TfLiteBufferHandle buffer_handle,
-                                        void* data, size_t size) {
-      auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
-      tensorflow::StringPiece values = delegate_data->GetBufferMap(context)
-                                           ->GetTensor(buffer_handle)
-                                           .tensor_data();
-      memcpy(data, values.data(), values.size());
-      return kTfLiteOk;
-    };
-    CHECK(interpreter_->ModifyGraphWithDelegate(
-              &delegate_, /*allow_dynamic_tensors=*/true) == kTfLiteOk);
-  }
-
- private:
-  std::unique_ptr<DelegateData> delegate_data_;
-  TfLiteDelegate delegate_;
-};
-
-TEST_F(KernelTest, FullGraph) {
-  // Define the graph.
-  AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
-
-  AddTfOp(testing::kUnpack, {0}, {1, 2});
-  AddTfOp(testing::kUnpack, {3}, {4, 5});
-  AddTfOp(testing::kAdd, {1, 4}, {6});
-  AddTfOp(testing::kAdd, {2, 5}, {7});
-  AddTfOp(testing::kMul, {6, 7}, {8});
-
-  // Apply Delegate.
-  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    return GenericPrepare(context, delegate, {0, 1, 2, 3, 4});
-  });
-
-  // Define inputs.
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-  SetShape(3, {2, 2, 1});
-  SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_TRUE(Invoke());
-
-  ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
-  ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
-}
-
-TEST_F(KernelTest, BadTensorFlowOp) {
-  AddTensors(2, {0}, {1}, kTfLiteFloat32, {3});
-  AddTfOp(testing::kNonExistent, {0}, {1});
-
-  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    return GenericPrepare(context, delegate, {0});
-  });
-
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
-  ASSERT_THAT(error_reporter().error_messages(),
-              ContainsRegex("while processing attributes of 'NonExistentOp'"));
-}
-
-TEST_F(KernelTest, BadNumberOfOutputs) {
-  AddTensors(3, {0}, {1, 2}, kTfLiteFloat32, {3});
-  AddTfOp(testing::kIdentity, {0}, {1, 2});
-
-  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    return GenericPrepare(context, delegate, {0});
-  });
-
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
-  ASSERT_THAT(error_reporter().error_messages(),
-              ContainsRegex("Unexpected number of outputs"));
-}
-
-TEST_F(KernelTest, IncompatibleNodeDef) {
-  AddTensors(2, {0}, {1}, kTfLiteFloat32, {3});
-
-  // Cast is a TF op, but we don't add the proper nodedef to it in AddTfOp.
-  AddTfOp(testing::kIncompatibleNodeDef, {0}, {1});
-
-  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    return GenericPrepare(context, delegate, {0});
-  });
-
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
-  ASSERT_THAT(error_reporter().error_messages(),
-              ContainsRegex("while executing 'Cast' via Eager"));
-}
-
-TEST_F(KernelTest, WrongSetOfNodes) {
-  AddTensors(4, {0}, {3}, kTfLiteFloat32, {3});
-  AddTfOp(testing::kUnpack, {0}, {1, 2});
-  AddTfLiteMulOp({1, 2}, {3});
-
-  // Specify that testing::kMul (#1) is supported when it actually isn't.
-  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    return GenericPrepare(context, delegate, {0, 1});
-  });
-
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
-  ASSERT_THAT(error_reporter().error_messages(),
-              ContainsRegex("Invalid NodeDef in Flex op"));
-}
-
-TEST_F(KernelTest, MixedGraph) {
-  AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
-
-  AddTfOp(testing::kUnpack, {0}, {1, 2});
-  AddTfOp(testing::kUnpack, {3}, {4, 5});
-  AddTfOp(testing::kAdd, {1, 4}, {6});
-  AddTfOp(testing::kAdd, {2, 5}, {7});
-  AddTfLiteMulOp({6, 7}, {8});
-
-  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    return GenericPrepare(context, delegate, {0, 1, 2, 3});
-  });
-
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-  SetShape(3, {2, 2, 1});
-  SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_TRUE(Invoke());
-
-  ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
-  ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
-}
-
-TEST_F(KernelTest, SplitGraph) {
-  AddTensors(10, {0}, {9}, kTfLiteFloat32, {3});
-
-  AddTfOp(testing::kUnpack, {0}, {1, 2});
-  AddTfOp(testing::kAdd, {1, 2}, {3});
-  AddTfOp(testing::kUnpack, {3}, {4, 5});
-
-  AddTfLiteMulOp({4, 5}, {6});
-
-  AddTfOp(testing::kUnpack, {6}, {7, 8});
-  AddTfOp(testing::kAdd, {7, 8}, {9});
-
-  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    return GenericPrepare(context, delegate, {0, 1, 2, 4, 5});
-  });
-
-  SetShape(0, {2, 2, 2, 1});
-  SetValues(0, {3.0f, 1.0f, 0.5f, -1.0f, 0.0f, 1.0f, 1.5f, 3.0f});
-
-  ASSERT_TRUE(Invoke());
-
-  ASSERT_THAT(GetShape(9), ElementsAre(1));
-  ASSERT_THAT(GetValues(9), ElementsAre(10.0f));
-}
-
-}  // namespace
-}  // namespace flex
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/delegates/flex/test_util.cc b/tensorflow/contrib/lite/delegates/flex/test_util.cc
deleted file mode 100644
index 69c336a01a57416bb331a897faba03ad75a38f95..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/test_util.cc
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/delegates/flex/test_util.h"
-
-#include "absl/memory/memory.h"
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/string.h"
-
-namespace tflite {
-namespace flex {
-namespace testing {
-
-bool FlexModelTest::Invoke() { return interpreter_->Invoke() == kTfLiteOk; }
-
-void FlexModelTest::SetShape(int tensor_index, const std::vector<int>& values) {
-  ASSERT_EQ(interpreter_->ResizeInputTensor(tensor_index, values), kTfLiteOk);
-  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
-}
-
-std::vector<int> FlexModelTest::GetShape(int tensor_index) {
-  std::vector<int> result;
-  auto* dims = interpreter_->tensor(tensor_index)->dims;
-  result.reserve(dims->size);
-  for (int i = 0; i < dims->size; ++i) {
-    result.push_back(dims->data[i]);
-  }
-  return result;
-}
-
-TfLiteType FlexModelTest::GetType(int tensor_index) {
-  return interpreter_->tensor(tensor_index)->type;
-}
-
-void FlexModelTest::AddTensors(int num_tensors, const std::vector<int>& inputs,
-                               const std::vector<int>& outputs, TfLiteType type,
-                               const std::vector<int>& dims) {
-  interpreter_->AddTensors(num_tensors);
-  for (int i = 0; i < num_tensors; ++i) {
-    TfLiteQuantizationParams quant;
-    // Suppress explicit output type specification to ensure type inference
-    // works properly.
-    if (std::find(outputs.begin(), outputs.end(), i) != outputs.end()) {
-      type = kTfLiteFloat32;
-    }
-    CHECK_EQ(interpreter_->SetTensorParametersReadWrite(i, type,
-                                                        /*name=*/"",
-                                                        /*dims=*/dims, quant),
-             kTfLiteOk);
-  }
-
-  CHECK_EQ(interpreter_->SetInputs(inputs), kTfLiteOk);
-  CHECK_EQ(interpreter_->SetOutputs(outputs), kTfLiteOk);
-}
-
-void FlexModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
-                                   const std::vector<int>& outputs) {
-  static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
-  reg.builtin_code = BuiltinOperator_MUL;
-  reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
-    auto* i0 = &context->tensors[node->inputs->data[0]];
-    auto* o = &context->tensors[node->outputs->data[0]];
-    return context->ResizeTensor(context, o, TfLiteIntArrayCopy(i0->dims));
-  };
-  reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
-    auto* i0 = &context->tensors[node->inputs->data[0]];
-    auto* i1 = &context->tensors[node->inputs->data[1]];
-    auto* o = &context->tensors[node->outputs->data[0]];
-    for (int i = 0; i < o->bytes / sizeof(float); ++i) {
-      o->data.f[i] = i0->data.f[i] * i1->data.f[i];
-    }
-    return kTfLiteOk;
-  };
-
-  CHECK_EQ(interpreter_->AddNodeWithParameters(inputs, outputs, nullptr, 0,
-                                               nullptr, &reg),
-           kTfLiteOk);
-}
-
-void FlexModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
-                            const std::vector<int>& outputs) {
-  auto attr = [](const string& key, const string& value) {
-    return " attr{ key: '" + key + "' value {" + value + "}}";
-  };
-
-  // Crude type attribution, will need fleshing out as more tests are added.
-  // TODO(b/113613439): Use nodedef string utilities to properly handle
-  // all types.
-  string type_attribute = attr("T", "type: DT_FLOAT");
-  if (interpreter_->tensor(inputs[0])->type == kTfLiteInt32) {
-    type_attribute = attr("T", "type: DT_INT32");
-  }
-
-  if (op == kUnpack) {
-    string attributes =
-        type_attribute + attr("num", "i: 2") + attr("axis", "i: 0");
-    AddTfOp("FlexUnpack", "Unpack", attributes, inputs, outputs);
-  } else if (op == kIdentity) {
-    string attributes = type_attribute;
-    AddTfOp("FlexIdentity", "Identity", attributes, inputs, outputs);
-  } else if (op == kAdd) {
-    string attributes = type_attribute;
-    AddTfOp("FlexAdd", "Add", attributes, inputs, outputs);
-  } else if (op == kMul) {
-    string attributes = type_attribute;
-    AddTfOp("FlexMul", "Mul", attributes, inputs, outputs);
-  } else if (op == kNonExistent) {
-    AddTfOp("NonExistentOp", "NonExistentOp", "", inputs, outputs);
-  } else if (op == kIncompatibleNodeDef) {
-    // "Cast" op is created without attributes - making it incompatible.
-    AddTfOp("FlexCast", "Cast", "", inputs, outputs);
-  }
-}
-
-void FlexModelTest::AddTfOp(const char* tflite_name, const string& tf_name,
-                            const string& nodedef_str,
-                            const std::vector<int>& inputs,
-                            const std::vector<int>& outputs) {
-  static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
-  reg.builtin_code = BuiltinOperator_CUSTOM;
-  reg.custom_name = tflite_name;
-
-  tensorflow::NodeDef nodedef;
-  CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
-      nodedef_str + " op: '" + tf_name + "'", &nodedef));
-  string serialized_nodedef;
-  CHECK(nodedef.SerializeToString(&serialized_nodedef));
-  flexbuffers::Builder fbb;
-  fbb.Vector([&]() {
-    fbb.String(nodedef.op());
-    fbb.String(serialized_nodedef);
-  });
-  fbb.Finish();
-
-  flexbuffers_.push_back(fbb.GetBuffer());
-  auto& buffer = flexbuffers_.back();
-  CHECK_EQ(interpreter_->AddNodeWithParameters(
-               inputs, outputs, reinterpret_cast<const char*>(buffer.data()),
-               buffer.size(), nullptr, &reg),
-           kTfLiteOk);
-}
-
-}  // namespace testing
-}  // namespace flex
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/flex/test_util.h b/tensorflow/contrib/lite/delegates/flex/test_util.h
deleted file mode 100644
index a8c81b90a3b8dc49ae058adb172456fe4d6e7172..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/test_util.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_TEST_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_TEST_UTIL_H_
-
-#include "tensorflow/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-
-namespace tflite {
-namespace flex {
-namespace testing {
-
-enum TfOpType {
-  kUnpack,
-  kIdentity,
-  kAdd,
-  kMul,
-  // Represents an op that does not exist in TensorFlow.
-  kNonExistent,
-  // Represents an valid TensorFlow op where the NodeDef is incompatible.
-  kIncompatibleNodeDef,
-};
-
-// This class creates models with TF and TFLite ops. In order to use this class
-// to test the Flex delegate, implement a function that calls
-// interpreter->ModifyGraphWithDelegate.
-class FlexModelTest : public ::testing::Test {
- public:
-  FlexModelTest() {}
-  ~FlexModelTest() {}
-
-  bool Invoke();
-
-  // Sets the (typed) tensor's values at the given index.
-  template <typename T>
-  void SetTypedValues(int tensor_index, const std::vector<T>& values) {
-    memcpy(interpreter_->typed_tensor<T>(tensor_index), values.data(),
-           values.size() * sizeof(T));
-  }
-
-  // Returns the (typed) tensor's values at the given index.
-  template <typename T>
-  std::vector<T> GetTypedValues(int tensor_index) {
-    const TfLiteTensor* t = interpreter_->tensor(tensor_index);
-    const T* tdata = interpreter_->typed_tensor<T>(tensor_index);
-    return std::vector<T>(tdata, tdata + t->bytes / sizeof(T));
-  }
-
-  // Sets the tensor's values at the given index.
-  void SetValues(int tensor_index, const std::vector<float>& values) {
-    SetTypedValues<float>(tensor_index, values);
-  }
-
-  // Returns the tensor's values at the given index.
-  std::vector<float> GetValues(int tensor_index) {
-    return GetTypedValues<float>(tensor_index);
-  }
-
-  // Sets the tensor's shape at the given index.
-  void SetShape(int tensor_index, const std::vector<int>& values);
-
-  // Returns the tensor's shape at the given index.
-  std::vector<int> GetShape(int tensor_index);
-
-  // Returns the tensor's type at the given index.
-  TfLiteType GetType(int tensor_index);
-
-  const TestErrorReporter& error_reporter() const { return error_reporter_; }
-
-  // Adds `num_tensor` tensors to the model. `inputs` contains the indices of
-  // the input tensors and `outputs` contains the indices of the output
-  // tensors. All tensors are set to have `type` and `dims`.
-  void AddTensors(int num_tensors, const std::vector<int>& inputs,
-                  const std::vector<int>& outputs, TfLiteType type,
-                  const std::vector<int>& dims);
-
-  // Adds a TFLite Mul op. `inputs` contains the indices of the input tensors
-  // and `outputs` contains the indices of the output tensors.
-  void AddTfLiteMulOp(const std::vector<int>& inputs,
-                      const std::vector<int>& outputs);
-
-  // Adds a TensorFlow op. `inputs` contains the indices of the
-  // input tensors and `outputs` contains the indices of the output tensors.
-  // This function is limited to the set of ops defined in TfOpType.
-  void AddTfOp(TfOpType op, const std::vector<int>& inputs,
-               const std::vector<int>& outputs);
-
- protected:
-  std::unique_ptr<Interpreter> interpreter_;
-  TestErrorReporter error_reporter_;
-
- private:
-  // Helper method to add a TensorFlow op. tflite_names needs to start with
-  // "Flex" in order to work with the Flex delegate.
-  void AddTfOp(const char* tflite_name, const string& tf_name,
-               const string& nodedef_str, const std::vector<int>& inputs,
-               const std::vector<int>& outputs);
-
-  std::vector<std::vector<uint8_t>> flexbuffers_;
-};
-
-}  // namespace testing
-}  // namespace flex
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_TEST_UTIL_H_
diff --git a/tensorflow/contrib/lite/delegates/flex/util.cc b/tensorflow/contrib/lite/delegates/flex/util.cc
deleted file mode 100644
index 829bc388bf4f613e82600edfc7363d0774d49878..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/util.cc
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/util.h"
-
-namespace tflite {
-namespace flex {
-
-TfLiteStatus ConvertStatus(TfLiteContext* context,
-                           const tensorflow::Status& status) {
-  if (!status.ok()) {
-    context->ReportError(context, "%s", status.error_message().c_str());
-    return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus CopyShapeAndType(TfLiteContext* context,
-                              const tensorflow::Tensor& src,
-                              TfLiteTensor* tensor) {
-  tensor->type = GetTensorFlowLiteType(static_cast<TF_DataType>(src.dtype()));
-  if (tensor->type == kTfLiteNoType) {
-    context->ReportError(context,
-                         "TF Lite does not support TensorFlow data type: %s",
-                         DataTypeString(src.dtype()).c_str());
-    return kTfLiteError;
-  }
-
-  int num_dims = src.dims();
-  TfLiteIntArray* shape = TfLiteIntArrayCreate(num_dims);
-  for (int j = 0; j < num_dims; ++j) {
-    // We need to cast from TensorFlow's int64 to TF Lite's int32. Let's
-    // make sure there's no overflow.
-    if (src.dim_size(j) >= std::numeric_limits<int>::max()) {
-      context->ReportError(context,
-                           "Dimension value in TensorFlow shape is larger than "
-                           "supported by TF Lite");
-      TfLiteIntArrayFree(shape);
-      return kTfLiteError;
-    }
-    shape->data[j] = static_cast<int>(src.dim_size(j));
-  }
-  return context->ResizeTensor(context, tensor, shape);
-}
-
-TF_DataType GetTensorFlowDataType(TfLiteType type) {
-  switch (type) {
-    case kTfLiteNoType:
-      return TF_FLOAT;
-    case kTfLiteFloat32:
-      return TF_FLOAT;
-    case kTfLiteInt16:
-      return TF_INT16;
-    case kTfLiteInt32:
-      return TF_INT32;
-    case kTfLiteUInt8:
-      return TF_UINT8;
-    case kTfLiteInt64:
-      return TF_INT64;
-    case kTfLiteComplex64:
-      return TF_COMPLEX64;
-    case kTfLiteString:
-      return TF_STRING;
-    case kTfLiteBool:
-      return TF_BOOL;
-  }
-}
-
-TfLiteType GetTensorFlowLiteType(TF_DataType type) {
-  switch (type) {
-    case TF_FLOAT:
-      return kTfLiteFloat32;
-    case TF_INT16:
-      return kTfLiteInt16;
-    case TF_INT32:
-      return kTfLiteInt32;
-    case TF_UINT8:
-      return kTfLiteUInt8;
-    case TF_INT64:
-      return kTfLiteInt64;
-    case TF_COMPLEX64:
-      return kTfLiteComplex64;
-    case TF_STRING:
-      return kTfLiteString;
-    case TF_BOOL:
-      return kTfLiteBool;
-    default:
-      return kTfLiteNoType;
-  }
-}
-
-}  // namespace flex
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/flex/util.h b/tensorflow/contrib/lite/delegates/flex/util.h
deleted file mode 100644
index 7f910e7316e67363a6e54389f1d0cc94b3e009a0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/util.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_UTIL_H_
-
-#include "tensorflow/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/lib/core/status.h"
-
-namespace tflite {
-namespace flex {
-
-// Converts a tensorflow:Status into a TfLiteStatus. If the original status
-// represented an error, reports it using the given 'context'.
-TfLiteStatus ConvertStatus(TfLiteContext* context,
-                           const tensorflow::Status& status);
-
-// Copies the given shape and type of the TensorFlow 'src' tensor into a TF Lite
-// 'tensor'. Logs an error and returns kTfLiteError if the shape or type can't
-// be converted.
-TfLiteStatus CopyShapeAndType(TfLiteContext* context,
-                              const tensorflow::Tensor& src,
-                              TfLiteTensor* tensor);
-
-// Returns the TF C API Data type that corresponds to the given TfLiteType.
-TF_DataType GetTensorFlowDataType(TfLiteType type);
-
-// Returns the TfLiteType that corresponds to the given TF C API Data type.
-TfLiteType GetTensorFlowLiteType(TF_DataType);
-
-}  // namespace flex
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_UTIL_H_
diff --git a/tensorflow/contrib/lite/delegates/flex/util_test.cc b/tensorflow/contrib/lite/delegates/flex/util_test.cc
deleted file mode 100644
index 5f049e7b0a0c1f7be28d33b532157c6f9211c7c1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/flex/util_test.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/util.h"
-
-#include <cstdarg>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/string.h"
-#include "tensorflow/contrib/lite/testing/util.h"
-
-namespace tflite {
-namespace flex {
-namespace {
-
-using tensorflow::DT_FLOAT;
-using tensorflow::DT_INT32;
-using tensorflow::Tensor;
-using ::testing::ElementsAre;
-
-struct TestContext : public TfLiteContext {
-  string error;
-  std::vector<int> new_size;
-};
-
-void ReportError(TfLiteContext* context, const char* format, ...) {
-  TestContext* c = static_cast<TestContext*>(context);
-  const size_t kBufferSize = 1024;
-  char temp_buffer[kBufferSize];
-
-  va_list args;
-  va_start(args, format);
-  vsnprintf(temp_buffer, kBufferSize, format, args);
-  va_end(args);
-
-  c->error = temp_buffer;
-}
-
-TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor,
-                          TfLiteIntArray* new_size) {
-  TestContext* c = static_cast<TestContext*>(context);
-  c->new_size.clear();
-  for (int i = 0; i < new_size->size; ++i) {
-    c->new_size.push_back(new_size->data[i]);
-  }
-  TfLiteIntArrayFree(new_size);
-  return kTfLiteOk;
-}
-
-TEST(UtilTest, ConvertStatus) {
-  TestContext context;
-  context.ReportError = ReportError;
-
-  EXPECT_EQ(ConvertStatus(&context, tensorflow::errors::Internal("Some Error")),
-            kTfLiteError);
-  EXPECT_EQ(context.error, "Some Error");
-
-  context.error.clear();
-  EXPECT_EQ(ConvertStatus(&context, tensorflow::Status()), kTfLiteOk);
-  EXPECT_TRUE(context.error.empty());
-}
-
-TEST(UtilTest, CopyShapeAndType) {
-  TestContext context;
-  context.ReportError = ReportError;
-  context.ResizeTensor = ResizeTensor;
-
-  TfLiteTensor dst;
-
-  EXPECT_EQ(CopyShapeAndType(&context, Tensor(), &dst), kTfLiteOk);
-  EXPECT_THAT(context.new_size, ElementsAre(0));
-  EXPECT_EQ(dst.type, kTfLiteFloat32);
-
-  EXPECT_EQ(CopyShapeAndType(&context, Tensor(DT_FLOAT, {1, 2}), &dst),
-            kTfLiteOk);
-  EXPECT_THAT(context.new_size, ElementsAre(1, 2));
-  EXPECT_EQ(dst.type, kTfLiteFloat32);
-
-  EXPECT_EQ(CopyShapeAndType(&context, Tensor(DT_INT32, {1, 2}), &dst),
-            kTfLiteOk);
-  EXPECT_THAT(context.new_size, ElementsAre(1, 2));
-  EXPECT_EQ(dst.type, kTfLiteInt32);
-
-  EXPECT_EQ(CopyShapeAndType(&context, Tensor(DT_FLOAT, {1LL << 44, 2}), &dst),
-            kTfLiteError);
-  EXPECT_EQ(context.error,
-            "Dimension value in TensorFlow shape is larger than supported by "
-            "TF Lite");
-
-  EXPECT_EQ(
-      CopyShapeAndType(&context, Tensor(tensorflow::DT_HALF, {1, 2}), &dst),
-      kTfLiteError);
-  EXPECT_EQ(context.error,
-            "TF Lite does not support TensorFlow data type: half");
-}
-
-TEST(UtilTest, TypeConversionsFromTFLite) {
-  EXPECT_EQ(TF_FLOAT, GetTensorFlowDataType(kTfLiteNoType));
-  EXPECT_EQ(TF_FLOAT, GetTensorFlowDataType(kTfLiteFloat32));
-  EXPECT_EQ(TF_INT16, GetTensorFlowDataType(kTfLiteInt16));
-  EXPECT_EQ(TF_INT32, GetTensorFlowDataType(kTfLiteInt32));
-  EXPECT_EQ(TF_UINT8, GetTensorFlowDataType(kTfLiteUInt8));
-  EXPECT_EQ(TF_INT64, GetTensorFlowDataType(kTfLiteInt64));
-  EXPECT_EQ(TF_COMPLEX64, GetTensorFlowDataType(kTfLiteComplex64));
-  EXPECT_EQ(TF_STRING, GetTensorFlowDataType(kTfLiteString));
-  EXPECT_EQ(TF_BOOL, GetTensorFlowDataType(kTfLiteBool));
-}
-
-TEST(UtilTest, TypeConversionsFromTensorFlow) {
-  EXPECT_EQ(kTfLiteFloat32, GetTensorFlowLiteType(TF_FLOAT));
-  EXPECT_EQ(kTfLiteInt16, GetTensorFlowLiteType(TF_INT16));
-  EXPECT_EQ(kTfLiteInt32, GetTensorFlowLiteType(TF_INT32));
-  EXPECT_EQ(kTfLiteUInt8, GetTensorFlowLiteType(TF_UINT8));
-  EXPECT_EQ(kTfLiteInt64, GetTensorFlowLiteType(TF_INT64));
-  EXPECT_EQ(kTfLiteComplex64, GetTensorFlowLiteType(TF_COMPLEX64));
-  EXPECT_EQ(kTfLiteString, GetTensorFlowLiteType(TF_STRING));
-  EXPECT_EQ(kTfLiteBool, GetTensorFlowLiteType(TF_BOOL));
-  EXPECT_EQ(kTfLiteNoType, GetTensorFlowLiteType(TF_RESOURCE));
-  EXPECT_EQ(kTfLiteNoType, GetTensorFlowLiteType(TF_VARIANT));
-}
-
-}  // namespace
-}  // namespace flex
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/delegates/nnapi/BUILD b/tensorflow/contrib/lite/delegates/nnapi/BUILD
deleted file mode 100644
index 4e7b2948fb920c3aaf9a6f4a9cdff7c476911e7a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/nnapi/BUILD
+++ /dev/null
@@ -1,37 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-licenses(["notice"])  # Apache 2.0
-
-cc_library(
-    name = "nnapi_delegate",
-    srcs = ["nnapi_delegate.cc"],
-    hdrs = ["nnapi_delegate.h"],
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:kernel_api",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:kernel_util",
-        "//tensorflow/contrib/lite/nnapi:nnapi_lib",
-    ],
-)
-
-tf_cc_test(
-    name = "nnapi_delegate_test",
-    size = "small",
-    srcs = ["nnapi_delegate_test.cc"],
-    tags = [
-        "no_oss",
-        "noasan",  # TODO(b/112326936): re-enable for asan once fixed.
-    ],
-    deps = [
-        ":nnapi_delegate",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
deleted file mode 100644
index d85e576284fac87519d7f4bb4bd76fe2619b59d5..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
+++ /dev/null
@@ -1,1220 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <cstdarg>
-#include <iostream>
-#include <memory>
-#include <vector>
-
-#include "tensorflow/contrib/lite/allocation.h"
-#include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/builtin_ops.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/context_util.h"
-#include "tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
-
-#ifdef __ANDROID__
-#include <sys/mman.h>
-#include <sys/system_properties.h>
-#include <unistd.h>
-#endif
-
-namespace tflite {
-namespace {
-
-// TODO(b/80621585): Consider printing error string, but don't for now to
-// minimize binary size.
-#define CHECK_NN(context, code)                                           \
-  if (code != ANEURALNETWORKS_NO_ERROR) {                                 \
-    context->ReportError(context, "NN API returned error (%d).\n", code); \
-    return kTfLiteError;                                                  \
-  }
-
-namespace {
-int32_t GetAndroidSdkVersion() {
-#ifdef __ANDROID__
-  const char* sdkProp = "ro.build.version.sdk";
-  char sdkVersion[PROP_VALUE_MAX];
-  int length = __system_property_get(sdkProp, sdkVersion);
-  if (length != 0) {
-    for (int i = 0; i < length; ++i) {
-      int digit = sdkVersion[i] - '0';
-      if (digit < 0 || digit > 9) {
-        // Non-numeric SDK version, assume it's higher then expected;
-        return std::numeric_limits<int32_t>::max();
-      }
-    }
-    return atoi(sdkVersion);
-  }
-#endif  // __ANDROID__
-  return 0;
-}
-
-constexpr int32_t kMinSdkVersionForNNAPI = 27;
-constexpr int32_t kMinSdkVersionForNNAPI11 = 28;
-static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion();
-
-}  // namespace
-
-// RAII NN API Model Destructor for use with std::unique_ptr
-struct NNFreeModel {
-  void operator()(ANeuralNetworksModel* model) {
-    ANeuralNetworksModel_free(model);
-  }
-};
-// RAII NN API Compilation Destructor for use with std::unique_ptr
-struct NNFreeCompilation {
-  void operator()(ANeuralNetworksCompilation* model) {
-    ANeuralNetworksCompilation_free(model);
-  }
-};
-
-// Manage NNAPI shared memory handle
-class NNMemory {
- public:
-  NNMemory(const char* name, size_t size) {
-#ifdef __ANDROID__
-    byte_size_ = size;
-    fd_ = ASharedMemory_create(name, size);
-    data_ptr_ = reinterpret_cast<uint8_t*>(
-        mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0));
-    ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE, fd_, 0,
-                                       &nn_memory_handle_);
-#endif
-  }
-
-  ~NNMemory() {
-#ifdef __ANDROID__
-    if (data_ptr_) {
-      munmap(data_ptr_, byte_size_);
-    }
-    if (nn_memory_handle_) {
-      ANeuralNetworksMemory_free(nn_memory_handle_);
-    }
-    if (fd_ > 0) close(fd_);
-#endif
-  }
-
-  ANeuralNetworksMemory* get_handle() { return nn_memory_handle_; }
-  uint8_t* get_data_ptr() { return data_ptr_; }
-
- private:
-#ifdef __ANDROID__
-  int fd_ = 0;
-  size_t byte_size_ = 0;
-#endif
-  uint8_t* data_ptr_ = nullptr;
-  ANeuralNetworksMemory* nn_memory_handle_ = nullptr;
-};  // namespace
-
-// Track tensor indices to NN API tensor indices mapping.
-class OperandMapping {
- public:
-  // Given a TFLite index return the ANN index. If it doesn't exist
-  // return -1.
-  int lite_index_to_ann(int index) const {
-    if (index < lite_tensor_to_ann_tensor_.size())
-      return lite_tensor_to_ann_tensor_[index];
-    else
-      return -1;
-  }
-
-  // NN API uses non tensor operands instead of structs. This creates one
-  // and returns the index. It uses a std::vector and resizes it as needed
-  // keeping -1 to unmapped values. Intermediate tensors likely will not
-  // be mapped.
-  int add_new_non_tensor_operand() { return next_ann_tensor_index_++; }
-
-  // Add a new mapping from `tflite_index` and return the NN API tensor index.
-  int add_new_ann_tensor_index(int tflite_index) {
-    if (tflite_index >= lite_tensor_to_ann_tensor_.size()) {
-      lite_tensor_to_ann_tensor_.resize(tflite_index + 1, -1);
-    }
-    int new_tensor_index = next_ann_tensor_index_++;
-    lite_tensor_to_ann_tensor_[tflite_index] = new_tensor_index;
-    return new_tensor_index;
-  }
-
- private:
-  // Next index of ann tensor
-  int next_ann_tensor_index_ = 0;
-
-  // Mapping from lite index. Use a std::vector for speed and code size
-  // rather than a map.
-  std::vector<int> lite_tensor_to_ann_tensor_;
-};
-
-// Abstract builder for building an op in the NN API graph. This handles
-// the disparity between TFLite and NN API operand types. NN API has singular
-// operands for both tensors and parameters, and TFLite separates the two.
-class NNAPIOpBuilder {
- public:
-  NNAPIOpBuilder(TfLiteContext* context, OperandMapping* tensor_mapping,
-                 ANeuralNetworksModel* nn_model)
-      : context_(context),
-        operand_mapping_(tensor_mapping),
-        nn_model_(nn_model) {}
-
-  TfLiteStatus AddScalarInt32Operand(int32_t value) {
-    return AddScalarOperand<int32_t>(value, ANEURALNETWORKS_INT32);
-  }
-
-  TfLiteStatus AddScalarFloat32Operand(float value) {
-    return AddScalarOperand<float>(value, ANEURALNETWORKS_FLOAT32);
-  }
-
-  TfLiteStatus AddVectorInt32Operand(const int32_t* values,
-                                     uint32_t num_values) {
-    return AddVectorOperand<int32_t>(values, num_values,
-                                     ANEURALNETWORKS_TENSOR_INT32);
-  }
-
-  TfLiteStatus AddVectorFloat32Operand(const float* values,
-                                       uint32_t num_values) {
-    return AddVectorOperand<float>(values, num_values,
-                                   ANEURALNETWORKS_TENSOR_FLOAT32);
-  }
-
-  TfLiteStatus AddPoolingParams(void* data) {
-    auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
-    AddScalarInt32Operand(builtin->padding);
-    AddScalarInt32Operand(builtin->stride_width);
-    AddScalarInt32Operand(builtin->stride_height);
-    AddScalarInt32Operand(builtin->filter_width);
-    AddScalarInt32Operand(builtin->filter_height);
-    AddScalarInt32Operand(builtin->activation);
-    return kTfLiteOk;
-  }
-
-  TfLiteStatus AddTensorInput(int tensor_index) {
-    int ann_index;
-    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
-    augmented_inputs_.push_back(ann_index);
-    return kTfLiteOk;
-  }
-
-  TfLiteStatus AddTensorOutput(int tensor_index) {
-    int ann_index;
-    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
-    augmented_outputs_.push_back(ann_index);
-    return kTfLiteOk;
-  }
-
-  TfLiteStatus AddAdditionalFloat32OutputTensor(uint32_t dimension_count) {
-    std::vector<uint32_t> dims(dimension_count, 0);
-    ANeuralNetworksOperandType operand_type{
-        .type = ANEURALNETWORKS_TENSOR_FLOAT32,
-        .dimensionCount = dimension_count,
-        .dimensions = dims.data()};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    augmented_outputs_.push_back(ann_operand);
-    return kTfLiteOk;
-  }
-
-  TfLiteStatus AddStateFloat32Tensor(int tensor_index,
-                                     int* ann_tensor_index_out) {
-    TfLiteTensor* tensor = &context_->tensors[tensor_index];
-    int ann_index = operand_mapping_->add_new_non_tensor_operand();
-
-    ANeuralNetworksOperandType operand_type{
-        ANEURALNETWORKS_TENSOR_FLOAT32,
-        static_cast<uint32_t>(tensor->dims->size),
-        reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
-        tensor->params.zero_point};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    augmented_outputs_.push_back(ann_index);
-
-    *ann_tensor_index_out = ann_index;
-    return kTfLiteOk;
-  }
-
-  // Adds a new NN API tensor that shadows the TF Lite tensor `tensor_index`.
-  // This returns the NN API tensor index corresponding to the created tensor.
-  // If another caller previously created a NN API tensor for `tensor_index`
-  // then the existing one is returned.
-  TfLiteStatus AddTensor(int tensor_index, int* ann_tensor_index_out) {
-    int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
-    if (ann_tensor_index != -1) {
-      *ann_tensor_index_out = ann_tensor_index;
-      return kTfLiteOk;
-    }
-    // Allocate a new tensor index
-    ann_tensor_index = operand_mapping_->add_new_ann_tensor_index(tensor_index);
-
-    // Parameters needed for new type.
-    int32_t nn_type = 0;
-    float scale = 0.0f;
-    int32_t zeroPoint = 0;
-    TfLiteTensor* tensor = &context_->tensors[tensor_index];
-    switch (tensor->type) {
-      case kTfLiteNoType:
-        // Tensors added during initialization of Ops don't have a type yet and
-        // should not be registered with the NNAPI.
-        *ann_tensor_index_out = -1;
-        return kTfLiteOk;
-      case kTfLiteFloat32:
-        nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
-        break;
-      case kTfLiteUInt8:
-        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
-        scale = tensor->params.scale;
-        zeroPoint = tensor->params.zero_point;
-        if (scale == 0) {
-          // TENSOR_QUANT8_ASYMM with zero scale is not valid in NNAPI.
-          scale = 1;
-        }
-        break;
-      case kTfLiteInt32:
-        nn_type = ANEURALNETWORKS_TENSOR_INT32;
-        scale = tensor->params.scale;
-        zeroPoint = tensor->params.zero_point;
-        break;
-      default:
-        context_->ReportError(context_, "Logic error in NN API Delegate.\n");
-        return kTfLiteError;
-    }
-
-    ANeuralNetworksOperandType operand_type{
-        nn_type, static_cast<uint32_t>(tensor->dims->size),
-        reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-
-    if (tensor->allocation_type == kTfLiteMmapRo) {
-      // TODO(b/80630405): Use NNAPIAllocation.
-      CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
-                             nn_model_, ann_tensor_index, tensor->data.raw,
-                             tensor->bytes));
-    }
-
-    *ann_tensor_index_out = ann_tensor_index;
-    return kTfLiteOk;
-  }
-
-  // Finish emitting the op (of type `type`) into the NN API.
-  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
-    // Actually add a NN API operation
-    CHECK_NN(context_, ANeuralNetworksModel_addOperation(
-                           nn_model_, type,
-                           static_cast<uint32_t>(augmented_inputs_.size()),
-                           augmented_inputs_.data(),
-                           static_cast<uint32_t>(augmented_outputs_.size()),
-                           augmented_outputs_.data()));
-    augmented_inputs_.clear();
-    augmented_outputs_.clear();
-    return kTfLiteOk;
-  }
-
- private:
-  template <typename T>
-  TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
-    ANeuralNetworksOperandType operand_type{.type = nn_type};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
-                           nn_model_, ann_operand, &value, sizeof(T)));
-    augmented_inputs_.push_back(ann_operand);
-    return kTfLiteOk;
-  }
-
-  template <typename T>
-  TfLiteStatus AddVectorOperand(const T* values, uint32_t num_values,
-                                int32_t nn_type) {
-    ANeuralNetworksOperandType operand_type{
-        .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    CHECK_NN(context_,
-             ANeuralNetworksModel_setOperandValue(
-                 nn_model_, ann_operand, values, sizeof(T) * num_values));
-    augmented_inputs_.push_back(ann_operand);
-    return kTfLiteOk;
-  }
-
-  // TfLiteContext for error handling. Must be named context for macros to
-  // work.
-  TfLiteContext* context_;
-
-  // Tracks relationship between indices
-  OperandMapping* operand_mapping_;
-
-  // The model
-  ANeuralNetworksModel* nn_model_;
-
-  // Inputs and outputs for the current op. These are augmented in the sense
-  // that NN API uses operands for all arguments, not just tensors, unlike
-  // TensorFlow lite.
-  std::vector<uint32_t> augmented_inputs_;
-  std::vector<uint32_t> augmented_outputs_;
-};
-
-struct NNAPIOpMappingArgs {
-  TfLiteContext* context;
-  NNAPIOpBuilder* builder;
-  TfLiteNode* node;
-  std::vector<int>* model_state_outputs;
-  std::vector<int>* model_state_tfl_inputs;
-};
-
-// The kernel that represents the subgraph of TF Lite being run on NN API.
-class NNAPIDelegateKernel {
- public:
-  NNAPIDelegateKernel() = default;
-
-  typedef ANeuralNetworksOperationType (*MappingFn)(
-      const NNAPIOpMappingArgs& mapping_args);
-
-  // Return a function that knows how to translate a node into its operands
-  // when called. You can use this function to see if a node is supported
-  // (i.e. that MappingFn is not nullptr).
-  MappingFn Map(TfLiteContext* context, int builtin_code, int version,
-                TfLiteNode* node) {
-    switch (builtin_code) {
-      case kTfLiteBuiltinAdd:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteAddParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            return ANEURALNETWORKS_ADD;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinMul:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteMulParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            return ANEURALNETWORKS_MUL;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinAveragePool2d:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            mapping_args.builder->AddPoolingParams(
-                mapping_args.node->builtin_data);
-            return ANEURALNETWORKS_AVERAGE_POOL_2D;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinMaxPool2d:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            mapping_args.builder->AddPoolingParams(
-                mapping_args.node->builtin_data);
-            return ANEURALNETWORKS_MAX_POOL_2D;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinL2Pool2d:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            mapping_args.builder->AddPoolingParams(
-                mapping_args.node->builtin_data);
-            return ANEURALNETWORKS_L2_POOL_2D;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinConv2d:
-        if (version == 1) {
-          auto builtin =
-              reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
-          if (builtin->dilation_width_factor != 1 ||
-              builtin->dilation_height_factor != 1 || node->inputs->size != 3) {
-            // NNAPI does not support dilated Conv2D.
-            return nullptr;
-          }
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteConvParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->padding);
-            mapping_args.builder->AddScalarInt32Operand(builtin->stride_width);
-            mapping_args.builder->AddScalarInt32Operand(builtin->stride_height);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            return ANEURALNETWORKS_CONV_2D;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinDepthwiseConv2d:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->padding);
-            mapping_args.builder->AddScalarInt32Operand(builtin->stride_width);
-            mapping_args.builder->AddScalarInt32Operand(builtin->stride_height);
-            mapping_args.builder->AddScalarInt32Operand(
-                builtin->depth_multiplier);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            return ANEURALNETWORKS_DEPTHWISE_CONV_2D;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinFullyConnected:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            return ANEURALNETWORKS_FULLY_CONNECTED;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinSoftmax:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
-            return ANEURALNETWORKS_SOFTMAX;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinReshape:
-        if (version == 1 && node->inputs->size == 2) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RESHAPE;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinSqueeze:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteSqueezeParams*>(
-                mapping_args.node->builtin_data);
-            // Note that we add the squeeze dimensions even if the dimensions
-            // were unspecified (empty), as NNAPI requires the operand.
-            mapping_args.builder->AddVectorInt32Operand(
-                builtin->squeeze_dims,
-                static_cast<uint32_t>(builtin->num_squeeze_dims));
-            return ANEURALNETWORKS_SQUEEZE;
-          };
-        } else {
-          return nullptr;
-        }
-      case kTfLiteBuiltinL2Normalization: {
-        auto builtin =
-            reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
-        if (builtin->activation != kTfLiteActNone) {
-          // NNAPI does not support activations
-          return nullptr;
-        }
-        return [](const NNAPIOpMappingArgs& mapping_args)
-                   -> ANeuralNetworksOperationType {
-          return ANEURALNETWORKS_L2_NORMALIZATION;
-        };
-      }
-      case kTfLiteBuiltinLocalResponseNormalization:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteLocalResponseNormParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->radius);
-            mapping_args.builder->AddScalarFloat32Operand(builtin->bias);
-            mapping_args.builder->AddScalarFloat32Operand(builtin->alpha);
-            mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
-            return ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION;
-          };
-        } else {
-          // TODO(miaowang): clean-up code and return early in the unsupported
-          // case.
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinLshProjection:
-        if (version == 1) {
-          // NNAPI does not support sparse projection correctly (b/111751836).
-          if (reinterpret_cast<TfLiteLSHProjectionParams*>(node->builtin_data)
-                  ->type == kTfLiteLshProjectionSparse) {
-            return nullptr;
-          }
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteLSHProjectionParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->type);
-            return ANEURALNETWORKS_LSH_PROJECTION;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinConcatenation:
-        if (version == 1 &&
-            reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data)
-                    ->activation == kTfLiteActNone) {
-          if (context->tensors[node->inputs->data[0]].type == kTfLiteUInt8) {
-            // NNAPI only support concatenating quantized tensor of the same
-            // scale and offset.
-            auto first_param = context->tensors[node->inputs->data[0]].params;
-            for (int i = 0; i < node->inputs->size; i++) {
-              auto curr_param = context->tensors[node->inputs->data[i]].params;
-              if (curr_param.scale != first_param.scale ||
-                  curr_param.zero_point != first_param.zero_point) {
-                return nullptr;
-              }
-            }
-          }
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->axis);
-            return ANEURALNETWORKS_CONCATENATION;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinDequantize:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_DEQUANTIZE;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinFloor:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_FLOOR;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinRelu:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RELU;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinReluN1To1:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RELU1;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinRelu6:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RELU6;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinLogistic:
-        if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_LOGISTIC;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinTanh:
-        // TODO(miaowang): add additional checks for the parameters.
-        if (version == 1 &&
-            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
-          // NNAPI only support float tanh.
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_TANH;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinSub:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
-            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
-          // NNAPI only support float sub.
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteSubParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            return ANEURALNETWORKS_SUB;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinDiv:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
-            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
-          // NNAPI only support float div.
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteDivParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            return ANEURALNETWORKS_DIV;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinPad:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
-            node->inputs->size == 2 &&
-            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
-          // NNAPI does not support specifying the padding value.
-          // NNAPI pads physical zero for quantized tensors, so only delegate
-          // float pad to NNAPI.
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_PAD;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinSpaceToBatchNd:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_SPACE_TO_BATCH_ND;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinStridedSlice:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->begin_mask);
-            mapping_args.builder->AddScalarInt32Operand(builtin->end_mask);
-            mapping_args.builder->AddScalarInt32Operand(
-                builtin->shrink_axis_mask);
-            return ANEURALNETWORKS_STRIDED_SLICE;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinTranspose:
-        // Note that the permutation input tensor value dictates the output
-        // dimensions.
-        // TODO(b/110888333): Support dynamically-sized tensors in delegates.
-        if ((version == 1) &&
-            (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) &&
-            (node->inputs->size > 1) &&
-            (context->tensors[node->inputs->data[1]].allocation_type ==
-             kTfLiteMmapRo)) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_TRANSPOSE;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinRnn:
-        // NNAPI only support float32 weights.
-        if (version == 1 && node->inputs->size == 5 &&
-            context->tensors[node->inputs->data[/*kWeightsTensor*/ 1]].type ==
-                kTfLiteFloat32) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            // NNAPI need both state_in and state_out.
-            int ann_index;
-            mapping_args.builder->AddStateFloat32Tensor(
-                mapping_args.node->inputs->data[/*kHiddenStateTensor*/ 4],
-                &ann_index);
-            mapping_args.model_state_outputs->push_back(ann_index);
-            mapping_args.model_state_tfl_inputs->push_back(
-                mapping_args.node->inputs->data[/*kHiddenStateTensor*/ 4]);
-            auto builtin = reinterpret_cast<TfLiteRNNParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            return ANEURALNETWORKS_RNN;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinSvdf:
-        // NNAPI only support float32 weights.
-        if (version == 1 && node->inputs->size == 5 &&
-            context->tensors[node->inputs->data[/*kWeightsFeatureTensor*/ 1]]
-                    .type == kTfLiteFloat32) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            // NNAPI need both state_in and state_out.
-            int ann_index;
-            mapping_args.builder->AddStateFloat32Tensor(
-                mapping_args.node->inputs
-                    ->data[/*kInputActivationStateTensor*/ 4],
-                &ann_index);
-            mapping_args.model_state_outputs->push_back(ann_index);
-            mapping_args.model_state_tfl_inputs->push_back(
-                mapping_args.node->inputs
-                    ->data[/*kInputActivationStateTensor*/ 4]);
-
-            auto builtin = reinterpret_cast<TfLiteSVDFParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->rank);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            return ANEURALNETWORKS_SVDF;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinLstm:
-        // NNAPI only support float32 weights.
-        // TODO(miaowang): add loggings to indicate why the op is rejected.
-        if (version == 1 && node->inputs->size == 20 &&
-            context->tensors[node->inputs
-                                 ->data[/*kInputToOutputWeightsTensor*/ 4]]
-                    .type == kTfLiteFloat32) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteLSTMParams*>(
-                mapping_args.node->builtin_data);
-            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
-            mapping_args.builder->AddScalarFloat32Operand(builtin->cell_clip);
-            mapping_args.builder->AddScalarFloat32Operand(builtin->proj_clip);
-
-            // Current NNAPI implementation requires the sratch_buffer as
-            // output.
-            mapping_args.builder->AddAdditionalFloat32OutputTensor(2);
-
-            // NNAPI need both state_in and state_out for cell_state and
-            // output_state.
-            int ann_index;
-            mapping_args.builder->AddStateFloat32Tensor(
-                mapping_args.node->inputs
-                    ->data[/*kInputActivationStateTensor*/ 18],
-                &ann_index);
-            mapping_args.model_state_outputs->push_back(ann_index);
-            mapping_args.model_state_tfl_inputs->push_back(
-                mapping_args.node->inputs
-                    ->data[/*kInputActivationStateTensor*/ 18]);
-            mapping_args.builder->AddStateFloat32Tensor(
-                mapping_args.node->inputs->data[/*kInputCellStateTensor*/ 19],
-                &ann_index);
-            mapping_args.model_state_outputs->push_back(ann_index);
-            mapping_args.model_state_tfl_inputs->push_back(
-                mapping_args.node->inputs->data[/*kInputCellStateTensor*/ 19]);
-
-            return ANEURALNETWORKS_LSTM;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinMean:
-        // NNAPI does not support generating a scalar as output for MEAN.
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
-            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32 &&
-            context->tensors[node->outputs->data[0]].dims->size > 0) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            auto builtin = reinterpret_cast<TfLiteReducerParams*>(
-                mapping_args.node->builtin_data);
-            int32_t keep_dims = 0;
-            if (builtin->keep_dims) keep_dims = 1;
-            mapping_args.builder->AddScalarInt32Operand(keep_dims);
-            return ANEURALNETWORKS_MEAN;
-          };
-        } else {
-          return nullptr;
-        }
-      case kTfLiteBuiltinEmbeddingLookup:
-        // NNAPI only support float32 values.
-        if (version == 1 &&
-            context->tensors[node->inputs->data[1]].type == kTfLiteFloat32) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_EMBEDDING_LOOKUP;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      case kTfLiteBuiltinHashtableLookup:
-        // NNAPI only support float32 output.
-        if (version == 1 &&
-            context->tensors[node->outputs->data[0]].type == kTfLiteFloat32) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_HASHTABLE_LOOKUP;
-          };
-        } else {
-          return nullptr;
-        }
-        break;
-      default:
-        return nullptr;
-    }
-  }
-
-  // Initialize the kernel (a NN model).
-  TfLiteStatus Init(TfLiteContext* context,
-                    const TfLiteDelegateParams* params) {
-    for (auto node_index : TfLiteIntArrayView(params->nodes_to_replace)) {
-      nodes_.push_back(node_index);
-    }
-
-    if (!nn_model_) {
-      ANeuralNetworksModel* model;
-      CHECK_NN(context, ANeuralNetworksModel_create(&model));
-      nn_model_.reset(model);
-
-      TF_LITE_ENSURE_STATUS(
-          BuildGraph(context, params->input_tensors, params->output_tensors));
-    }
-
-    if (!nn_compilation_) {
-      ANeuralNetworksCompilation* compilation;
-      CHECK_NN(context, ANeuralNetworksCompilation_create(nn_model_.get(),
-                                                          &compilation));
-      CHECK_NN(context, ANeuralNetworksCompilation_finish(compilation));
-      nn_compilation_.reset(compilation);
-    }
-    return kTfLiteOk;
-  }
-
-  TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
-    ANeuralNetworksExecution* execution = nullptr;
-    CHECK_NN(context, ANeuralNetworksExecution_create(nn_compilation_.get(),
-                                                      &execution));
-
-    // Set the input tensor buffers. Note: we access tflite tensors using
-    // absolute indices but NN api indices inputs by relative indices.
-    int relative_input_index = 0;
-
-    size_t input_offset = 0;
-    for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) {
-      if (absolute_input_index == kOptionalTensor) {
-        continue;
-      }
-      TfLiteTensor* tensor = &context->tensors[absolute_input_index];
-      // TODO(miaowang): make sure the delegation works with dequantized weights
-      // as intermediate tensors.
-      if (tensor->allocation_type != kTfLiteMmapRo) {
-        // copy data to pre-allocated shared memory.
-        memcpy(nn_input_memory_->get_data_ptr() + input_offset,
-               tensor->data.raw, tensor->bytes);
-        CHECK_NN(context, ANeuralNetworksExecution_setInputFromMemory(
-                              execution, relative_input_index, nullptr,
-                              nn_input_memory_->get_handle(), input_offset,
-                              tensor->bytes));
-        input_offset += tensor->bytes;
-        relative_input_index++;
-      }
-    }
-
-    // Set the output tensor buffers.
-    int relative_output_index = 0;
-    size_t output_offset = 0;
-    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
-      TfLiteTensor* tensor = &context->tensors[output_index];
-      CHECK_NN(context, ANeuralNetworksExecution_setOutputFromMemory(
-                            execution, relative_output_index, nullptr,
-                            nn_output_memory_->get_handle(), output_offset,
-                            tensor->bytes));
-      output_offset += tensor->bytes;
-      relative_output_index++;
-    }
-
-    // The state_out of previous invocation need to be mapped to state_in of
-    // current invocation.
-    for (size_t i = 0; i < model_state_tfl_inputs_.size(); i++) {
-      int state_tensor_idx = model_state_tfl_inputs_[i];
-      TfLiteTensor* tensor = &context->tensors[state_tensor_idx];
-      // Here we are using a deep copy for state_in tensors so that we are not
-      // reading and writing into the same buffer during a invocation.
-      // TODO(110369471): using double shared buffer to minimize the copies.
-      CHECK_NN(context, ANeuralNetworksExecution_setOutput(
-                            execution, relative_output_index, nullptr,
-                            tensor->data.raw, tensor->bytes));
-      relative_output_index++;
-    }
-    // Invoke ANN in blocking fashion.
-    ANeuralNetworksEvent* event = nullptr;
-    CHECK_NN(context, ANeuralNetworksExecution_startCompute(execution, &event));
-    CHECK_NN(context, ANeuralNetworksEvent_wait(event));
-    ANeuralNetworksEvent_free(event);
-    ANeuralNetworksExecution_free(execution);
-
-    // copy results from shared memory to the destination.
-    output_offset = 0;
-    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
-      TfLiteTensor* tensor = &context->tensors[output_index];
-      memcpy(tensor->data.raw,
-             nn_output_memory_->get_data_ptr() + output_offset, tensor->bytes);
-      output_offset += tensor->bytes;
-    }
-
-    return kTfLiteOk;
-  }
-
- private:
-  // ANN API state.
-  std::unique_ptr<ANeuralNetworksModel, NNFreeModel> nn_model_;
-  std::unique_ptr<ANeuralNetworksCompilation, NNFreeCompilation>
-      nn_compilation_;
-  // Node indices that this delegate is responsible for. Indices here
-  // indexes into the nodes array in the TfLiteContext.
-  std::vector<int> nodes_;
-  // Track indices we use
-  OperandMapping operand_mapping_;
-
-  std::vector<int> model_state_outputs_;
-  std::vector<int> model_state_tfl_inputs_;
-
-  std::unique_ptr<NNMemory> nn_input_memory_;
-  std::unique_ptr<NNMemory> nn_output_memory_;
-
-  TfLiteStatus AddOpsAndTensors(TfLiteContext* context) {
-    // The operand builder allows creating a single op. We create it at this
-    // reduced power position rather than in the for loop to avoid reallocating
-    // the vectors.
-    NNAPIOpBuilder builder(context, &operand_mapping_, nn_model_.get());
-    // Add Tensors
-    // allocate outside to avoid realloc
-    for (auto node_index : nodes_) {
-      // Obtain the op and registration.
-      TfLiteNode* node;
-      TfLiteRegistration* reg;
-      context->GetNodeAndRegistration(context, node_index, &node, &reg);
-      // Map inputs to NN API tensor indices.
-      for (auto input_index : TfLiteIntArrayView(node->inputs)) {
-        if (input_index == kOptionalTensor &&
-            (reg->builtin_code == kTfLiteBuiltinLstm ||
-             reg->builtin_code == kTfLiteBuiltinSvdf)) {
-          // properly handle the optional tensor for LSTM and SVDF.
-          // currently only support float32.
-          // TODO(miaowang): make sure this is also able to handle quantized
-          // tensor when supported by NNAPI.
-          TF_LITE_ENSURE_STATUS(builder.AddVectorFloat32Operand(nullptr, 0));
-        } else {
-          TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index));
-        }
-      }
-      // Get op type and operands
-      int nn_op_type = Map(context, reg->builtin_code, reg->version, node)(
-          {context, &builder, node, &model_state_outputs_,
-           &model_state_tfl_inputs_});
-      // Map outputs to NN API tensor indices.
-      for (auto output_index : TfLiteIntArrayView(node->outputs)) {
-        TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index));
-      }
-
-      builder.FinalizeAddOperation(nn_op_type);
-    }
-    return kTfLiteOk;
-  }
-
-  TfLiteStatus BuildGraph(TfLiteContext* context,
-                          const TfLiteIntArray* input_tensors,
-                          const TfLiteIntArray* output_tensors) {
-    // Build the ops and tensors.
-    TF_LITE_ENSURE_STATUS(AddOpsAndTensors(context));
-    // Map input and output tensor indices to ANN
-    std::vector<uint32_t> inputs;
-    inputs.reserve(input_tensors->size);
-    std::vector<uint32_t> outputs;
-    outputs.reserve(output_tensors->size);
-
-    size_t total_input_byte_size = 0;
-    // Make the TensorFlow lite inputs and outputs to ann_indices.
-    for (int i : TfLiteIntArrayView(input_tensors)) {
-      // Constant tensors are not NNAPI inputs.
-      if (i != kOptionalTensor &&
-          context->tensors[i].allocation_type != kTfLiteMmapRo) {
-        inputs.push_back(operand_mapping_.lite_index_to_ann(i));
-        total_input_byte_size += context->tensors[i].bytes;
-      }
-    }
-
-    size_t total_output_byte_size = 0;
-    for (int i : TfLiteIntArrayView(output_tensors)) {
-      outputs.push_back(operand_mapping_.lite_index_to_ann(i));
-      total_output_byte_size += context->tensors[i].bytes;
-    }
-
-    // Add state output tensors as model inputs
-    for (int i : model_state_outputs_) {
-      outputs.push_back(i);
-    }
-
-    // Tell ANN to declare inputs/outputs
-    CHECK_NN(context, ANeuralNetworksModel_identifyInputsAndOutputs(
-                          nn_model_.get(), inputs.size(), inputs.data(),
-                          outputs.size(), outputs.data()));
-
-    // Set relaxed computation mode for fp32 if possible.
-    if (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-      CHECK_NN(context,
-               ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-                   nn_model_.get(), context->allow_fp32_relax_to_fp16));
-    }
-
-    // Finalize the model
-    CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get()));
-
-    // Create shared memory pool for inputs and outputs.
-    nn_input_memory_.reset(new NNMemory("input_pool", total_input_byte_size));
-    nn_output_memory_.reset(
-        new NNMemory("output_pool", total_output_byte_size));
-
-    return kTfLiteOk;
-  }
-};
-
-}  // namespace
-
-// Return a NN API Delegate struct that can check for support of ops.
-TfLiteDelegate* NnApiDelegate() {
-  static TfLiteDelegate delegate = {
-      .data_ = nullptr,
-      .Prepare = [](TfLiteContext* context,
-                    TfLiteDelegate* delegate) -> TfLiteStatus {
-        // Do not check nodes_ if NN API is unavailable.
-        if (kAndroidSdkVersion < kMinSdkVersionForNNAPI || !NNAPIExists()) {
-          return kTfLiteOk;
-        }
-
-        std::vector<int> supported_nodes(1);
-        // We don't care about all nodes_, we only care about ones in the
-        // current plan.
-        TfLiteIntArray* plan;
-        TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
-        int total_supported_nodes = 0;
-
-        // Check for every node if it is supported
-        // TODO(b/80625235): Fix this to do more careful checking of versioning.
-        for (int node_index : TfLiteIntArrayView(plan)) {
-          TfLiteNode* node;
-          TfLiteRegistration* registration;
-          TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
-              context, node_index, &node, &registration));
-          NNAPIDelegateKernel dummy_kernel;
-          if (dummy_kernel.Map(context, registration->builtin_code,
-                               registration->version, node)) {
-            supported_nodes.push_back(node_index);
-          }
-          total_supported_nodes += 1;
-        }
-        // Put the size at the beginning of the array.
-        supported_nodes[0] = supported_nodes.size() - 1;
-
-        // NN API Delegate Registration (the pseudo kernel that will invoke NN
-        // API subgraphs)
-        static const TfLiteRegistration nnapi_delegate_kernel = {
-            .init = [](TfLiteContext* context, const char* buffer,
-                       size_t length) -> void* {
-              const TfLiteDelegateParams* params =
-                  reinterpret_cast<const TfLiteDelegateParams*>(buffer);
-              NNAPIDelegateKernel* kernel_state = new NNAPIDelegateKernel;
-              kernel_state->Init(context, params);
-              return kernel_state;
-            },
-
-            .free = [](TfLiteContext* context, void* buffer) -> void {
-              delete reinterpret_cast<NNAPIDelegateKernel*>(buffer);
-            },
-
-            .prepare = [](TfLiteContext* context,
-                          TfLiteNode* node) -> TfLiteStatus {
-              // Since the underlying resize happened ahead of delegation
-              // worked. This does nothing.
-              return kTfLiteOk;
-            },
-
-            .invoke = [](TfLiteContext* context,
-                         TfLiteNode* node) -> TfLiteStatus {
-              NNAPIDelegateKernel* state =
-                  reinterpret_cast<NNAPIDelegateKernel*>(node->user_data);
-              return state->Invoke(context, node);
-            },
-
-            .builtin_code = kTfLiteBuiltinDelegate,
-        };
-
-        // Request TFLite to partition the graph and make kernels
-        // for each independent subgraph a new nnapi_delegate_kernel.
-        context->ReplaceSubgraphsWithDelegateKernels(
-            context, nnapi_delegate_kernel,
-            reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()),
-            delegate);
-        return kTfLiteOk;
-      }};
-
-  return &delegate;
-}
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h
deleted file mode 100644
index 4852b7697432c30c1258e790b97ce2563e7f9711..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
-
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-
-namespace tflite {
-
-// Return a delegate that can be used to use the NN API.
-// e.g.
-//   NnApiDelegate* delegate = NnApiDelegate();
-//   interpreter->ModifyGraphWithDelegate(&delegate);
-// NnApiDelegate() returns a singleton, so you should not free this
-// pointer or worry about its lifetime.
-TfLiteDelegate* NnApiDelegate();
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/error_reporter.h b/tensorflow/contrib/lite/error_reporter.h
deleted file mode 100644
index 5c20eedc255ca6f7578873593aa86759fbeb490b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/error_reporter.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Compatibility shim for moved header location.
-#ifndef TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
-#define TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
-
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/stderr_reporter.h"
-
-#endif  // TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/examples/android/BUILD b/tensorflow/contrib/lite/examples/android/BUILD
deleted file mode 100644
index d180cb478566a9e5df24b2e67445f24a2f623215..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/android/BUILD
+++ /dev/null
@@ -1,61 +0,0 @@
-# Description:
-#   TensorFlow camera demo app for Android.
-
-load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-# Build the demo native demo lib from the original directory to reduce code
-# reuse. Note that the Java counterparts (ObjectTracker.java and
-# ImageUtils.java) are still duplicated.
-cc_library(
-    name = "tensorflow_native_libs",
-    srcs = [
-        "//tensorflow/examples/android:libtensorflow_demo.so",
-    ],
-    tags = [
-        "manual",
-        "notap",
-    ],
-)
-
-android_binary(
-    name = "tflite_demo",
-    srcs = glob([
-        "app/src/main/java/**/*.java",
-    ]),
-    aapt_version = "aapt",
-    # Package assets from assets dir as well as all model targets.
-    # Remove undesired models (and corresponding Activities in source)
-    # to reduce APK size.
-    assets = [
-        "//tensorflow/contrib/lite/examples/android/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
-        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
-        "@tflite_conv_actions_frozen//:conv_actions_frozen.tflite",
-        "//tensorflow/contrib/lite/examples/android/app/src/main/assets:conv_actions_labels.txt",
-        "@tflite_mobilenet_ssd//:mobilenet_ssd.tflite",
-        "@tflite_mobilenet_ssd_quant//:detect.tflite",
-        "//tensorflow/contrib/lite/examples/android/app/src/main/assets:box_priors.txt",
-        "//tensorflow/contrib/lite/examples/android/app/src/main/assets:coco_labels_list.txt",
-    ],
-    assets_dir = "",
-    custom_package = "org.tensorflow.lite.demo",
-    inline_constants = 1,
-    manifest = "app/src/main/AndroidManifest.xml",
-    nocompress_extensions = [
-        ".tflite",
-    ],
-    resource_files = glob(["app/src/main/res/**"]),
-    tags = [
-        "manual",
-        "notap",
-    ],
-    deps = [
-        ":tensorflow_native_libs",
-        "//tensorflow/contrib/lite/java:tensorflowlite",
-    ],
-)
diff --git a/tensorflow/contrib/lite/examples/android/app/README.md b/tensorflow/contrib/lite/examples/android/app/README.md
deleted file mode 100644
index 7347147f997540e67c2c713b597dc90d933c5cb8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/android/app/README.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# TF Lite Android App Example
-
-A simple Android example that demonstrates image classification and object
-detection using the camera, as well as speech recognition using the microphone.
-
-## Building in Android Studio with TensorFlow Lite AAR from JCenter.
-The build.gradle is configured to use TensorFlow Lite's nightly build.
-
-If you see a build error related to compatibility with Tensorflow Lite's Java
-API (example: method X is undefined for type Interpreter), there has likely been
-a backwards compatible change to the API. You will need to pull new app code
-that's compatible with the nightly build and may need to first wait a few days
-for our external and internal code to merge.
-
-## Building from Source with Bazel
-
-1. Follow the [Bazel steps for the TF Demo App](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel):
-
-  1. [Install Bazel and Android Prerequisites](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-bazel-and-android-prerequisites).
-     It's easiest with Android Studio.
-
-      - You'll need at least SDK version 23.
-      - Make sure to install the latest version of Bazel. Some distributions
-        ship with Bazel 0.5.4, which is too old.
-      - Bazel requires Android Build Tools `26.0.1` or higher.
-      - You also need to install the Android Support Repository, available
-        through Android Studio under `Android SDK Manager -> SDK Tools ->
-        Android Support Repository`.
-
-  2. [Edit your `WORKSPACE`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#edit-workspace)
-     to add SDK and NDK targets.
-
-     NOTE: As long as you have the SDK and NDK installed, the `./configure`
-     script will create these rules for you. Answer "Yes" when the script asks
-     to automatically configure the `./WORKSPACE`.
-
-      - Make sure the `api_level` in `WORKSPACE` is set to an SDK version that
-        you have installed.
-      - By default, Android Studio will install the SDK to `~/Android/Sdk` and
-        the NDK to `~/Android/Sdk/ndk-bundle`.
-
-2. Build this demo app with Bazel. The demo needs C++11. We configure the fat_apk_cpu flag to package support for 4 hardware variants. You may replace it with --config=android_arm64 on a 64-bit device and --config=android_arm for 32-bit device:
-
-  ```shell
-  bazel build -c opt --cxxopt='--std=c++11' --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \
-    //tensorflow/contrib/lite/examples/android:tflite_demo
-  ```
-
-3. Install the demo on a
-   [debug-enabled device](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install):
-
-  ```shell
-  adb install bazel-bin/tensorflow/contrib/lite/examples/android/tflite_demo.apk
-  ```
diff --git a/tensorflow/contrib/lite/examples/android/app/download-models.gradle b/tensorflow/contrib/lite/examples/android/app/download-models.gradle
deleted file mode 100644
index c100e37c16f38a65f7b1f64a3f6e3eaa1477e8eb..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/android/app/download-models.gradle
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * download-models.gradle
- *     Downloads model files from ${MODEL_URL} into application's asset folder
- * Input:
- *     project.ext.TMP_DIR: absolute path to hold downloaded zip files
- *     project.ext.ASSET_DIR: absolute path to save unzipped model files
- * Output:
- *     3 model files will be downloaded into given folder of ext.ASSET_DIR
- */
-// hard coded model files
-// LINT.IfChange
-
-def models = ['conv_actions_tflite.zip',
-              'mobilenet_ssd_tflite_v1.zip',
-              'mobilenet_v1_224_android_quant_2017_11_08.zip',
-              'coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip']
-// LINT.ThenChange(//tensorflow/contrib/lite/examples/android/BUILD)
-
-// Root URL for model archives
-def MODEL_URL = 'https://storage.googleapis.com/download.tensorflow.org/models/tflite'
-
-buildscript {
-    repositories {
-        jcenter()
-    }
-    dependencies {
-        classpath 'de.undercouch:gradle-download-task:3.2.0'
-    }
-}
-
-import de.undercouch.gradle.tasks.download.Download
-task downloadFile(type: Download){
-    for (f in models) {
-        def modelUrl = MODEL_URL + "/" + f
-        println "Downloading ${f} from ${modelUrl}"
-        src modelUrl
-    }
-
-    dest new File(project.ext.TMP_DIR)
-    overwrite true
-}
-
-task extractModels(type: Copy) {
-    for (f in models) {
-        def localFile = f.split("/")[-1]
-        from zipTree(project.ext.TMP_DIR + '/' + localFile)
-    }
-
-    into file(project.ext.ASSET_DIR)
-    fileMode  0644
-    exclude '**/LICENSE'
-
-    def needDownload = false
-    for (f in models) {
-        def localFile = f.split("/")[-1]
-        if (!(new File(project.ext.TMP_DIR + '/' + localFile)).exists()) {
-            needDownload = true
-        }
-    }
-
-    if (needDownload) {
-        dependsOn downloadFile
-    }
-}
-
-tasks.whenTaskAdded { task ->
-    if (task.name == 'assembleDebug') {
-        task.dependsOn 'extractModels'
-    }
-    if (task.name == 'assembleRelease') {
-        task.dependsOn 'extractModels'
-    }
-}
-
diff --git a/tensorflow/contrib/lite/examples/android/build.gradle b/tensorflow/contrib/lite/examples/android/build.gradle
deleted file mode 100644
index 66a62a921a7f492df30b3de2e5dc4b68fc84f1d9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/android/build.gradle
+++ /dev/null
@@ -1,24 +0,0 @@
-// Top-level build file where you can add configuration options common to all sub-projects/modules.
-
-buildscript {
-    repositories {
-        jcenter()
-    }
-    dependencies {
-        classpath 'com.android.tools.build:gradle:3.0.1'
-
-        // NOTE: Do not place your application dependencies here; they belong
-        // in the individual module build.gradle files
-    }
-}
-
-allprojects {
-    repositories {
-        google()
-        jcenter()
-    }
-}
-
-task clean(type: Delete) {
-    delete rootProject.buildDir
-}
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.h b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.h
deleted file mode 100644
index fb5800e86d365b56f1b52147c3f9cc8d7211f8c3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#import <AVFoundation/AVFoundation.h>
-#import <UIKit/UIKit.h>
-
-#include <vector>
-
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-
-@interface CameraExampleViewController
-    : UIViewController<UIGestureRecognizerDelegate, AVCaptureVideoDataOutputSampleBufferDelegate> {
-  IBOutlet UIView* previewView;
-  AVCaptureVideoPreviewLayer* previewLayer;
-  AVCaptureVideoDataOutput* videoDataOutput;
-  dispatch_queue_t videoDataOutputQueue;
-  UIView* flashView;
-  BOOL isUsingFrontFacingCamera;
-  NSMutableDictionary* oldPredictionValues;
-  NSMutableArray* labelLayers;
-  AVCaptureSession* session;
-
-  std::vector<std::string> labels;
-  std::unique_ptr<tflite::FlatBufferModel> model;
-  tflite::ops::builtin::BuiltinOpResolver resolver;
-  std::unique_ptr<tflite::Interpreter> interpreter;
-
-  double total_latency;
-  int total_count;
-}
-@property(strong, nonatomic) CATextLayer* predictionTextLayer;
-
-- (IBAction)takePicture:(id)sender;
-- (IBAction)switchCameras:(id)sender;
-
-@end
diff --git a/tensorflow/contrib/lite/examples/ios/camera/Podfile b/tensorflow/contrib/lite/examples/ios/camera/Podfile
deleted file mode 100644
index f460693122af8353286ea7069d5db873fedfc9b3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/ios/camera/Podfile
+++ /dev/null
@@ -1,5 +0,0 @@
-platform :ios, '8.0'
-inhibit_all_warnings!
-
-target 'tflite_camera_example'
-       pod 'TensorFlowLite', '1.10.1'
diff --git a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
deleted file mode 100644
index 98d3b5bb8ad45bf34f6996b3361291896a451a6f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ /dev/null
@@ -1,405 +0,0 @@
-// !$*UTF8*$!
-{
-	archiveVersion = 1;
-	classes = {
-	};
-	objectVersion = 46;
-	objects = {
-
-/* Begin PBXBuildFile section */
-		1C3C9DCC1ED3AB4200B8B5FA /* main.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1C3C9DCA1ED3AB4200B8B5FA /* main.mm */; };
-		1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */; };
-		1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */; };
-		1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */; };
-		1CDB2D491ED3A9CD007929E9 /* CameraExampleAppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */; };
-		1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */; };
-		1CDB2D4E1ED3AA35007929E9 /* Info.plist in Resources */ = {isa = PBXBuildFile; fileRef = 1CDB2D4D1ED3AA35007929E9 /* Info.plist */; };
-		54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */; };
-		AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = AC1F82641FBA3CBD0052BA77 /* labels.txt */; };
-		ACA1A4CA1FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */; };
-/* End PBXBuildFile section */
-
-/* Begin PBXFileReference section */
-		1C0D73481ECCC41B008C1DAB /* CoreImage.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreImage.framework; path = System/Library/Frameworks/CoreImage.framework; sourceTree = SDKROOT; };
-		1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreGraphics.framework; path = System/Library/Frameworks/CoreGraphics.framework; sourceTree = SDKROOT; };
-		1C3C9DCA1ED3AB4200B8B5FA /* main.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = main.mm; sourceTree = "<group>"; };
-		1C564C0D1ED3A92E00087306 /* tflite_camera_example.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tflite_camera_example.app; sourceTree = BUILT_PRODUCTS_DIR; };
-		1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.storyboard; path = MainStoryboard_iPhone.storyboard; sourceTree = "<group>"; };
-		1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = System/Library/Frameworks/UIKit.framework; sourceTree = SDKROOT; };
-		1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreMedia.framework; path = System/Library/Frameworks/CoreMedia.framework; sourceTree = SDKROOT; };
-		1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AVFoundation.framework; path = System/Library/Frameworks/AVFoundation.framework; sourceTree = SDKROOT; };
-		1CDB2D421ED3A9CD007929E9 /* CameraExampleAppDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CameraExampleAppDelegate.h; sourceTree = "<group>"; };
-		1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = CameraExampleAppDelegate.m; sourceTree = "<group>"; };
-		1CDB2D441ED3A9CD007929E9 /* CameraExampleViewController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CameraExampleViewController.h; sourceTree = "<group>"; };
-		1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = CameraExampleViewController.mm; sourceTree = "<group>"; };
-		1CDB2D4D1ED3AA35007929E9 /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
-		3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-tflite_camera_example.a"; sourceTree = BUILT_PRODUCTS_DIR; };
-		3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.debug.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.debug.xcconfig"; sourceTree = "<group>"; };
-		55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.release.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.release.xcconfig"; sourceTree = "<group>"; };
-		AC1F82641FBA3CBD0052BA77 /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
-		ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_quant_v1_224.tflite; sourceTree = "<group>"; };
-/* End PBXFileReference section */
-
-/* Begin PBXFrameworksBuildPhase section */
-		1C564C0A1ED3A92E00087306 /* Frameworks */ = {
-			isa = PBXFrameworksBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */,
-				1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */,
-				54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXFrameworksBuildPhase section */
-
-/* Begin PBXGroup section */
-		24D7686C331131624F4454A0 /* Frameworks */ = {
-			isa = PBXGroup;
-			children = (
-				1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */,
-				1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */,
-				1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */,
-				1C0D73481ECCC41B008C1DAB /* CoreImage.framework */,
-				1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */,
-				3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */,
-			);
-			name = Frameworks;
-			sourceTree = "<group>";
-		};
-		3E9FC355632FB928EA23BEED /* Pods */ = {
-			isa = PBXGroup;
-			children = (
-				3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */,
-				55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */,
-			);
-			name = Pods;
-			sourceTree = "<group>";
-		};
-		591157921CF4011C00C31E3A = {
-			isa = PBXGroup;
-			children = (
-				1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */,
-				1C3C9DCA1ED3AB4200B8B5FA /* main.mm */,
-				1CDB2D4D1ED3AA35007929E9 /* Info.plist */,
-				1CDB2D421ED3A9CD007929E9 /* CameraExampleAppDelegate.h */,
-				1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */,
-				1CDB2D441ED3A9CD007929E9 /* CameraExampleViewController.h */,
-				1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */,
-				59A3CFF31CF4E68100C4259F /* data */,
-				5911579C1CF4011C00C31E3A /* Products */,
-				3E9FC355632FB928EA23BEED /* Pods */,
-				24D7686C331131624F4454A0 /* Frameworks */,
-			);
-			sourceTree = "<group>";
-		};
-		5911579C1CF4011C00C31E3A /* Products */ = {
-			isa = PBXGroup;
-			children = (
-				1C564C0D1ED3A92E00087306 /* tflite_camera_example.app */,
-			);
-			name = Products;
-			sourceTree = "<group>";
-		};
-		59A3CFF31CF4E68100C4259F /* data */ = {
-			isa = PBXGroup;
-			children = (
-				ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */,
-				AC1F82641FBA3CBD0052BA77 /* labels.txt */,
-			);
-			path = data;
-			sourceTree = "<group>";
-		};
-/* End PBXGroup section */
-
-/* Begin PBXNativeTarget section */
-		1C564C0C1ED3A92E00087306 /* tflite_camera_example */ = {
-			isa = PBXNativeTarget;
-			buildConfigurationList = 1C564C351ED3A92E00087306 /* Build configuration list for PBXNativeTarget "tflite_camera_example" */;
-			buildPhases = (
-				66DAEAAEE9EF6550C3A061E0 /* [CP] Check Pods Manifest.lock */,
-				1C564C091ED3A92E00087306 /* Sources */,
-				1C564C0A1ED3A92E00087306 /* Frameworks */,
-				1C564C0B1ED3A92E00087306 /* Resources */,
-				00E875C3B066535AE6B77101 /* [CP] Embed Pods Frameworks */,
-				5C2D02120E3E5E09567AA946 /* [CP] Copy Pods Resources */,
-			);
-			buildRules = (
-			);
-			dependencies = (
-			);
-			name = tflite_camera_example;
-			productName = tflite_camera_example;
-			productReference = 1C564C0D1ED3A92E00087306 /* tflite_camera_example.app */;
-			productType = "com.apple.product-type.application";
-		};
-/* End PBXNativeTarget section */
-
-/* Begin PBXProject section */
-		591157931CF4011C00C31E3A /* Project object */ = {
-			isa = PBXProject;
-			attributes = {
-				LastSwiftUpdateCheck = 0830;
-				LastUpgradeCheck = 0830;
-				ORGANIZATIONNAME = Google;
-				TargetAttributes = {
-					1C564C0C1ED3A92E00087306 = {
-						CreatedOnToolsVersion = 8.3.2;
-						DevelopmentTeam = EQHXZ8M8AV;
-						ProvisioningStyle = Automatic;
-					};
-				};
-			};
-			buildConfigurationList = 591157961CF4011C00C31E3A /* Build configuration list for PBXProject "tflite_camera_example" */;
-			compatibilityVersion = "Xcode 3.2";
-			developmentRegion = English;
-			hasScannedForEncodings = 0;
-			knownRegions = (
-				en,
-				Base,
-			);
-			mainGroup = 591157921CF4011C00C31E3A;
-			productRefGroup = 5911579C1CF4011C00C31E3A /* Products */;
-			projectDirPath = "";
-			projectRoot = "";
-			targets = (
-				1C564C0C1ED3A92E00087306 /* tflite_camera_example */,
-			);
-		};
-/* End PBXProject section */
-
-/* Begin PBXResourcesBuildPhase section */
-		1C564C0B1ED3A92E00087306 /* Resources */ = {
-			isa = PBXResourcesBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				ACA1A4CA1FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite in Resources */,
-				1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */,
-				1CDB2D4E1ED3AA35007929E9 /* Info.plist in Resources */,
-				AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXResourcesBuildPhase section */
-
-/* Begin PBXShellScriptBuildPhase section */
-		00E875C3B066535AE6B77101 /* [CP] Embed Pods Frameworks */ = {
-			isa = PBXShellScriptBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-			);
-			inputPaths = (
-			);
-			name = "[CP] Embed Pods Frameworks";
-			outputPaths = (
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-			shellPath = /bin/sh;
-			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example-frameworks.sh\"\n";
-			showEnvVarsInLog = 0;
-		};
-		5C2D02120E3E5E09567AA946 /* [CP] Copy Pods Resources */ = {
-			isa = PBXShellScriptBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-			);
-			inputPaths = (
-			);
-			name = "[CP] Copy Pods Resources";
-			outputPaths = (
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-			shellPath = /bin/sh;
-			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example-resources.sh\"\n";
-			showEnvVarsInLog = 0;
-		};
-		66DAEAAEE9EF6550C3A061E0 /* [CP] Check Pods Manifest.lock */ = {
-			isa = PBXShellScriptBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-			);
-			inputPaths = (
-				"${PODS_PODFILE_DIR_PATH}/Podfile.lock",
-				"${PODS_ROOT}/Manifest.lock",
-			);
-			name = "[CP] Check Pods Manifest.lock";
-			outputPaths = (
-				"$(DERIVED_FILE_DIR)/Pods-tflite_camera_example-checkManifestLockResult.txt",
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-			shellPath = /bin/sh;
-			shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n    # print error to STDERR\n    echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n    exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n";
-			showEnvVarsInLog = 0;
-		};
-/* End PBXShellScriptBuildPhase section */
-
-/* Begin PBXSourcesBuildPhase section */
-		1C564C091ED3A92E00087306 /* Sources */ = {
-			isa = PBXSourcesBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */,
-				1CDB2D491ED3A9CD007929E9 /* CameraExampleAppDelegate.m in Sources */,
-				1C3C9DCC1ED3AB4200B8B5FA /* main.mm in Sources */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXSourcesBuildPhase section */
-
-/* Begin XCBuildConfiguration section */
-		1C564C361ED3A92E00087306 /* Debug */ = {
-			isa = XCBuildConfiguration;
-			baseConfigurationReference = 3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */;
-			buildSettings = {
-				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-				CLANG_ANALYZER_NONNULL = YES;
-				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-				DEVELOPMENT_TEAM = EQHXZ8M8AV;
-				INFOPLIST_FILE = Info.plist;
-				IPHONEOS_DEPLOYMENT_TARGET = 10.3;
-				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
-				PRODUCT_BUNDLE_IDENTIFIER = "com.pf.tf-camera-example";
-				PRODUCT_NAME = "$(TARGET_NAME)";
-				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
-				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-				SWIFT_VERSION = 3.0;
-			};
-			name = Debug;
-		};
-		1C564C371ED3A92E00087306 /* Release */ = {
-			isa = XCBuildConfiguration;
-			baseConfigurationReference = 55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */;
-			buildSettings = {
-				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-				CLANG_ANALYZER_NONNULL = YES;
-				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-				DEVELOPMENT_TEAM = EQHXZ8M8AV;
-				INFOPLIST_FILE = Info.plist;
-				IPHONEOS_DEPLOYMENT_TARGET = 10.3;
-				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
-				PRODUCT_BUNDLE_IDENTIFIER = "com.pf.tf-camera-example";
-				PRODUCT_NAME = "$(TARGET_NAME)";
-				SWIFT_OPTIMIZATION_LEVEL = "-Owholemodule";
-				SWIFT_VERSION = 3.0;
-			};
-			name = Release;
-		};
-		591157B01CF4011D00C31E3A /* Debug */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ALWAYS_SEARCH_USER_PATHS = NO;
-				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
-				CLANG_CXX_LIBRARY = "libc++";
-				CLANG_ENABLE_MODULES = YES;
-				CLANG_ENABLE_OBJC_ARC = YES;
-				CLANG_WARN_BOOL_CONVERSION = YES;
-				CLANG_WARN_CONSTANT_CONVERSION = YES;
-				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-				CLANG_WARN_EMPTY_BODY = YES;
-				CLANG_WARN_ENUM_CONVERSION = YES;
-				CLANG_WARN_INFINITE_RECURSION = YES;
-				CLANG_WARN_INT_CONVERSION = YES;
-				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-				CLANG_WARN_UNREACHABLE_CODE = YES;
-				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
-				COPY_PHASE_STRIP = NO;
-				DEBUG_INFORMATION_FORMAT = dwarf;
-				ENABLE_STRICT_OBJC_MSGSEND = YES;
-				ENABLE_TESTABILITY = YES;
-				GCC_C_LANGUAGE_STANDARD = gnu99;
-				GCC_DYNAMIC_NO_PIC = NO;
-				GCC_NO_COMMON_BLOCKS = YES;
-				GCC_OPTIMIZATION_LEVEL = 0;
-				GCC_PREPROCESSOR_DEFINITIONS = (
-					"DEBUG=1",
-					"$(inherited)",
-				);
-				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-				GCC_WARN_UNDECLARED_SELECTOR = YES;
-				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-				GCC_WARN_UNUSED_FUNCTION = YES;
-				GCC_WARN_UNUSED_VARIABLE = YES;
-				HEADER_SEARCH_PATHS = (
-					"$(inherited)",
-				);
-				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
-				MTL_ENABLE_DEBUG_INFO = YES;
-				ONLY_ACTIVE_ARCH = YES;
-				SDKROOT = iphoneos;
-				TARGETED_DEVICE_FAMILY = "1,2";
-			};
-			name = Debug;
-		};
-		591157B11CF4011D00C31E3A /* Release */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ALWAYS_SEARCH_USER_PATHS = NO;
-				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
-				CLANG_CXX_LIBRARY = "libc++";
-				CLANG_ENABLE_MODULES = YES;
-				CLANG_ENABLE_OBJC_ARC = YES;
-				CLANG_WARN_BOOL_CONVERSION = YES;
-				CLANG_WARN_CONSTANT_CONVERSION = YES;
-				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-				CLANG_WARN_EMPTY_BODY = YES;
-				CLANG_WARN_ENUM_CONVERSION = YES;
-				CLANG_WARN_INFINITE_RECURSION = YES;
-				CLANG_WARN_INT_CONVERSION = YES;
-				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-				CLANG_WARN_UNREACHABLE_CODE = YES;
-				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
-				COPY_PHASE_STRIP = NO;
-				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
-				ENABLE_NS_ASSERTIONS = NO;
-				ENABLE_STRICT_OBJC_MSGSEND = YES;
-				GCC_C_LANGUAGE_STANDARD = gnu99;
-				GCC_NO_COMMON_BLOCKS = YES;
-				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-				GCC_WARN_UNDECLARED_SELECTOR = YES;
-				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-				GCC_WARN_UNUSED_FUNCTION = YES;
-				GCC_WARN_UNUSED_VARIABLE = YES;
-				HEADER_SEARCH_PATHS = (
-					"$(inherited)",
-				);
-				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
-				MTL_ENABLE_DEBUG_INFO = NO;
-				SDKROOT = iphoneos;
-				TARGETED_DEVICE_FAMILY = "1,2";
-				VALIDATE_PRODUCT = YES;
-			};
-			name = Release;
-		};
-/* End XCBuildConfiguration section */
-
-/* Begin XCConfigurationList section */
-		1C564C351ED3A92E00087306 /* Build configuration list for PBXNativeTarget "tflite_camera_example" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				1C564C361ED3A92E00087306 /* Debug */,
-				1C564C371ED3A92E00087306 /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-		591157961CF4011C00C31E3A /* Build configuration list for PBXProject "tflite_camera_example" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				591157B01CF4011D00C31E3A /* Debug */,
-				591157B11CF4011D00C31E3A /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-/* End XCConfigurationList section */
-	};
-	rootObject = 591157931CF4011C00C31E3A /* Project object */;
-}
diff --git a/tensorflow/contrib/lite/examples/ios/simple/Podfile b/tensorflow/contrib/lite/examples/ios/simple/Podfile
deleted file mode 100644
index ddb77088d9f16fb55e8060a91504ebc44dd0b73e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/ios/simple/Podfile
+++ /dev/null
@@ -1,5 +0,0 @@
-platform :ios, '8.0'
-inhibit_all_warnings!
-
-target 'tflite_simple_example'
-       pod 'TensorFlowLite', '1.10.1'
diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD
deleted file mode 100644
index fc55a78019b4a12b24231034a7e4b912869389f2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/label_image/BUILD
+++ /dev/null
@@ -1,71 +0,0 @@
-# Description:
-# TensorFlow Lite Example Label Image.
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
-
-exports_files(glob([
-    "testdata/*.bmp",
-]))
-
-tf_cc_binary(
-    name = "label_image",
-    srcs = [
-        "get_top_n.h",
-        "get_top_n_impl.h",
-        "label_image.cc",
-    ],
-    linkopts = tflite_linkopts() + select({
-        "//tensorflow:android": [
-            "-pie",  # Android 5.0 and later supports only PIE
-            "-lm",  # some builtin ops, e.g., tanh, need -lm
-        ],
-        "//conditions:default": [],
-    }),
-    deps = [
-        ":bitmap_helpers",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ],
-)
-
-cc_library(
-    name = "bitmap_helpers",
-    srcs = ["bitmap_helpers.cc"],
-    hdrs = [
-        "bitmap_helpers.h",
-        "bitmap_helpers_impl.h",
-        "label_image.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite:string",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-    ],
-)
-
-cc_test(
-    name = "label_image_test",
-    srcs = [
-        "get_top_n.h",
-        "get_top_n_impl.h",
-        "label_image_test.cc",
-    ],
-    data = [
-        "testdata/grace_hopper.bmp",
-    ],
-    tags = ["no_oss"],
-    deps = [
-        ":bitmap_helpers",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.md b/tensorflow/contrib/lite/examples/label_image/label_image.md
deleted file mode 100644
index 9ce32cf101897f2d41cd14a485aeb432344928a0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/label_image/label_image.md
+++ /dev/null
@@ -1,78 +0,0 @@
-label_image for TensorFlow Lite inspired by TensorFlow's label_image.
-
-To build label_image for Android, run $TENSORFLOW_ROOT/configure 
-and set Android NDK or configure NDK setting in 
-$TENSORFLOW_ROOT/WORKSPACE first.
- 
-To build it for android ARMv8:
-```
-> bazel build --config monolithic --cxxopt=-std=c++11 \
-  --crosstool_top=//external:android/crosstool \
-  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
-  --cpu=arm64-v8a \
-  //tensorflow/contrib/lite/examples/label_image:label_image
-```
-or
-```
-> bazel build --config android_arm64 --config monolithic --cxxopt=-std=c++11 \
-  //tensorflow/contrib/lite/examples/label_image:label_image
-```
-
-To build it for android arm-v7a:
-```
-> bazel build --config monolithic --cxxopt=-std=c++11 \
-  --crosstool_top=//external:android/crosstool \
-  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
-  --cpu=armeabi-v7a \
-  //tensorflow/contrib/lite/examples/label_image:label_image
-```
-or
-```
-> bazel build --config android_arm --config monolithic --cxxopt=-std=c++11 \
-  //tensorflow/contrib/lite/examples/label_image:label_image
-```
-
-Build it for desktop machines (tested on Ubuntu and OS X)
-```
-> bazel build --config opt --cxxopt=-std=c++11 //tensorflow/contrib/lite/examples/label_image:label_image
-```
-To run it. Prepare `./mobilenet_quant_v1_224.tflite`, `./grace_hopper.bmp`, and `./labels.txt`.
-
-Run it:
-```
-> ./label_image                                        
-Loaded model ./mobilenet_quant_v1_224.tflite
-resolved reporter
-invoked
-average time: 100.986 ms 
-0.439216: 653 military uniform
-0.372549: 458 bow tie
-0.0705882: 466 bulletproof vest
-0.0235294: 514 cornet
-0.0196078: 835 suit
-```
-Run `interpreter->Invoker()` 100 times:
-```
-> ./label_image   -c 100                               
-Loaded model ./mobilenet_quant_v1_224.tflite
-resolved reporter
-invoked
-average time: 33.4694 ms
-...
-```
-
-Run a floating point (`mobilenet_v1_1.0_224.tflite`) model,
-```
-> ./label_image -f 1 -m mobilenet_v1_1.0_224.tflite
-Loaded model mobilenet_v1_1.0_224.tflite
-resolved reporter
-invoked
-average time: 263.493 ms 
-0.88615: 653 military uniform
-0.0422316: 440 bearskin
-0.0109948: 466 bulletproof vest
-0.0105327: 401 academic gown
-0.00947104: 723 ping-pong bal
-```
-
-See the source code for other command line options.
diff --git a/tensorflow/contrib/lite/examples/minimal/BUILD b/tensorflow/contrib/lite/examples/minimal/BUILD
deleted file mode 100644
index b403628d6c457ce3fb67eac3675fd7bb9187deab..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/minimal/BUILD
+++ /dev/null
@@ -1,27 +0,0 @@
-# Description:
-#   TensorFlow Lite minimal example.
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
-
-tf_cc_binary(
-    name = "minimal",
-    srcs = [
-        "minimal.cc",
-    ],
-    linkopts = tflite_linkopts() + select({
-        "//tensorflow:android": [
-            "-pie",  # Android 5.0 and later supports only PIE
-            "-lm",  # some builtin ops, e.g., tanh, need -lm
-        ],
-        "//conditions:default": [],
-    }),
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ],
-)
diff --git a/tensorflow/contrib/lite/examples/python/BUILD b/tensorflow/contrib/lite/examples/python/BUILD
deleted file mode 100644
index d337c3ddc43a23e50a5afdab93b16c0f61ccd538..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/python/BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//tensorflow:internal"])
-
-py_binary(
-    name = "label_image",
-    srcs = ["label_image.py"],
-    main = "label_image.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/lite/python:lite",
-    ],
-)
diff --git a/tensorflow/contrib/lite/examples/python/label_image.md b/tensorflow/contrib/lite/examples/python/label_image.md
deleted file mode 100644
index e81192a96c142f2b3e7e85d160166fdd37ccdc53..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/examples/python/label_image.md
+++ /dev/null
@@ -1,50 +0,0 @@
-
-With model, input image (grace_hopper.bmp), and labels file (labels.txt)
-in /tmp.
-
-The example input image and labels file are from TensorFlow repo and
-MobileNet V1 model files.
-
-```
-curl https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp > /tmp/grace_hopper.bmp
-
-curl  https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz  | tar xzv -C /tmp  mobilenet_v1_1.0_224/labels.txt
-mv /tmp/mobilenet_v1_1.0_224/labels.txt /tmp/
-
-```
-
-Run
-
-```
-curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz | tar xzv -C /tmp
-bazel run --config opt //tensorflow/contrib/lite/examples/python:label_image
-```
-
-We can get results like
-
-```
-0.470588: military uniform
-0.337255: Windsor tie
-0.047059: bow tie
-0.031373: mortarboard
-0.019608: suit
-```
-
-Run
-
-```
-curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz | tar xzv -C /tmp
-bazel run --config opt //tensorflow/contrib/lite/examples/python:label_image \
--- --model_file /tmp/mobilenet_v1_1.0_224.tflite
-```
-
-We can get results like
-```
-0.728693: military uniform
-0.116163: Windsor tie
-0.035517: bow tie
-0.014874: mortarboard
-0.011758: bolo tie
-```
-
-Check [models](../../g3doc/models.md) for models hosted by Google.
diff --git a/tensorflow/contrib/lite/experimental/c/BUILD b/tensorflow/contrib/lite/experimental/c/BUILD
deleted file mode 100644
index 52e71619def71a0c2130539afe8e7d00e7a24894..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/c/BUILD
+++ /dev/null
@@ -1,111 +0,0 @@
-package(default_visibility = ["//visibility:private"])
-
-package_group(
-    name = "experimental",
-    packages = [
-        "//tensorflow/contrib/lite/experimental/...",
-    ],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//tensorflow/contrib/lite:build_def.bzl",
-    "tflite_cc_shared_object",
-    "tflite_copts",
-    "tflite_jni_binary",
-)
-
-tflite_cc_shared_object(
-    name = "libtensorflowlite_c.so",
-    linkopts = select({
-        "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow/contrib/lite/experimental/c:exported_symbols.lds)",
-            "-Wl,-install_name,@rpath/libtensorflowlite_c.so",
-        ],
-        "//tensorflow:windows": [],
-        "//conditions:default": [
-            "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow/contrib/lite/experimental/c:version_script.lds)",
-        ],
-    }),
-    deps = [
-        ":c_api",
-        ":c_api_experimental",
-        ":exported_symbols.lds",
-        ":version_script.lds",
-    ],
-)
-
-cc_library(
-    name = "c_api_internal",
-    srcs = ["c_api.h"],
-    hdrs = ["c_api_internal.h"],
-    copts = tflite_copts(),
-    visibility = [
-        "//tensorflow/contrib/lite/experimental/c:__subpackages__",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite:context",
-        "//tensorflow/contrib/lite:framework",
-    ],
-)
-
-cc_library(
-    name = "c_api",
-    srcs = ["c_api.cc"],
-    hdrs = ["c_api.h"],
-    copts = tflite_copts(),
-    visibility = [
-        ":experimental",
-    ],
-    deps = [
-        ":c_api_internal",
-        "//tensorflow/contrib/lite:context",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ],
-)
-
-cc_library(
-    name = "c_api_experimental",
-    srcs = ["c_api_experimental.cc"],
-    hdrs = ["c_api_experimental.h"],
-    copts = tflite_copts(),
-    deps = [
-        ":c_api",
-        ":c_api_internal",
-        "//tensorflow/contrib/lite:kernel_api",
-    ],
-)
-
-cc_test(
-    name = "c_api_test",
-    size = "small",
-    srcs = ["c_api_test.cc"],
-    data = ["//tensorflow/contrib/lite:testdata/add.bin"],
-    deps = [
-        ":c_api",
-        "//tensorflow/contrib/lite:context",
-        "//tensorflow/contrib/lite:kernel_api",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_test(
-    name = "c_api_experimental_test",
-    size = "small",
-    srcs = ["c_api_experimental_test.cc"],
-    data = ["//tensorflow/contrib/lite:testdata/add.bin"],
-    deps = [
-        ":c_api",
-        ":c_api_experimental",
-        "//tensorflow/contrib/lite:kernel_api",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_internal.h b/tensorflow/contrib/lite/experimental/c/c_api_internal.h
deleted file mode 100644
index da3af3cad4c54865cfe778b79538e5800c284985..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/c/c_api_internal.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_
-
-#include "tensorflow/contrib/lite/experimental/c/c_api.h"
-
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/op_resolver.h"
-
-// Internal structures used by the C API. These are likely to change and should
-// not be depended on.
-//
-// NOTE: This header does not follow C conventions and does not define a C API.
-// It is effectively an (internal) implementation detail of the C API.
-
-struct TFL_Model {
-  // Sharing is safe as FlatBufferModel is const.
-  std::shared_ptr<const tflite::FlatBufferModel> impl;
-};
-
-struct TFL_InterpreterOptions {
-  enum {
-    kDefaultNumThreads = -1,
-  };
-  int num_threads = kDefaultNumThreads;
-
-  tflite::MutableOpResolver op_resolver;
-
-  void (*error_reporter)(void* user_data, const char* format,
-                         va_list args) = nullptr;
-  void* error_reporter_user_data = nullptr;
-};
-
-struct TFL_Interpreter {
-  // Taking a reference to the (const) model data avoids lifetime-related issues
-  // and complexity with the TFL_Model's existence.
-  std::shared_ptr<const tflite::FlatBufferModel> model;
-
-  // The interpreter does not take ownership of the provided ErrorReporter
-  // instance, so we ensure its validity here. Note that the interpreter may use
-  // the reporter in its destructor, so it should be declared first.
-  std::unique_ptr<tflite::ErrorReporter> optional_error_reporter;
-
-  std::unique_ptr<tflite::Interpreter> impl;
-};
-
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_test.cc b/tensorflow/contrib/lite/experimental/c/c_api_test.cc
deleted file mode 100644
index 48a3714ec345a6f4bc4be8ebe937471a91c60218..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/c/c_api_test.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <array>
-
-#include "tensorflow/contrib/lite/experimental/c/c_api.h"
-
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/testing/util.h"
-
-namespace {
-
-TEST(CApiSimple, Smoke) {
-  TFL_Model* model = TFL_NewModelFromFile(
-      "tensorflow/contrib/lite/testdata/add.bin");
-  ASSERT_NE(model, nullptr);
-
-  TFL_InterpreterOptions* options = TFL_NewInterpreterOptions();
-  ASSERT_NE(options, nullptr);
-  TFL_InterpreterOptionsSetNumThreads(options, 2);
-
-  TFL_Interpreter* interpreter = TFL_NewInterpreter(model, options);
-  ASSERT_NE(interpreter, nullptr);
-
-  // The options/model can be deleted immediately after interpreter creation.
-  TFL_DeleteInterpreterOptions(options);
-  TFL_DeleteModel(model);
-
-  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
-  ASSERT_EQ(TFL_InterpreterGetInputTensorCount(interpreter), 1);
-  ASSERT_EQ(TFL_InterpreterGetOutputTensorCount(interpreter), 1);
-
-  std::array<int, 1> input_dims = {2};
-  ASSERT_EQ(TFL_InterpreterResizeInputTensor(interpreter, 0, input_dims.data(),
-                                             input_dims.size()),
-            kTfLiteOk);
-  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
-
-  TFL_Tensor* input_tensor = TFL_InterpreterGetInputTensor(interpreter, 0);
-  ASSERT_NE(input_tensor, nullptr);
-  EXPECT_EQ(TFL_TensorType(input_tensor), kTfLiteFloat32);
-  EXPECT_EQ(TFL_TensorNumDims(input_tensor), 1);
-  EXPECT_EQ(TFL_TensorDim(input_tensor, 0), 2);
-  EXPECT_EQ(TFL_TensorByteSize(input_tensor), sizeof(float) * 2);
-  EXPECT_NE(TFL_TensorData(input_tensor), nullptr);
-  EXPECT_STREQ(TFL_TensorName(input_tensor), "input");
-
-  std::array<float, 2> input = {1.f, 3.f};
-  ASSERT_EQ(TFL_TensorCopyFromBuffer(input_tensor, input.data(),
-                                     input.size() * sizeof(float)),
-            kTfLiteOk);
-
-  ASSERT_EQ(TFL_InterpreterInvoke(interpreter), kTfLiteOk);
-
-  const TFL_Tensor* output_tensor =
-      TFL_InterpreterGetOutputTensor(interpreter, 0);
-  ASSERT_NE(output_tensor, nullptr);
-  EXPECT_EQ(TFL_TensorType(output_tensor), kTfLiteFloat32);
-  EXPECT_EQ(TFL_TensorNumDims(output_tensor), 1);
-  EXPECT_EQ(TFL_TensorDim(output_tensor, 0), 2);
-  EXPECT_EQ(TFL_TensorByteSize(output_tensor), sizeof(float) * 2);
-  EXPECT_NE(TFL_TensorData(output_tensor), nullptr);
-  EXPECT_STREQ(TFL_TensorName(output_tensor), "output");
-
-  std::array<float, 2> output;
-  ASSERT_EQ(TFL_TensorCopyToBuffer(output_tensor, output.data(),
-                                   output.size() * sizeof(float)),
-            kTfLiteOk);
-  EXPECT_EQ(output[0], 3.f);
-  EXPECT_EQ(output[1], 9.f);
-
-  TFL_DeleteInterpreter(interpreter);
-}
-
-TEST(CApiSimple, ErrorReporter) {
-  TFL_Model* model = TFL_NewModelFromFile(
-      "tensorflow/contrib/lite/testdata/add.bin");
-  TFL_InterpreterOptions* options = TFL_NewInterpreterOptions();
-
-  // Install a custom error reporter into the interpreter by way of options.
-  tflite::TestErrorReporter reporter;
-  TFL_InterpreterOptionsSetErrorReporter(
-      options,
-      [](void* user_data, const char* format, va_list args) {
-        reinterpret_cast<tflite::TestErrorReporter*>(user_data)->Report(format,
-                                                                        args);
-      },
-      &reporter);
-  TFL_Interpreter* interpreter = TFL_NewInterpreter(model, options);
-
-  // The options/model can be deleted immediately after interpreter creation.
-  TFL_DeleteInterpreterOptions(options);
-  TFL_DeleteModel(model);
-
-  // Invoke the interpreter before tensor allocation.
-  EXPECT_EQ(TFL_InterpreterInvoke(interpreter), kTfLiteError);
-
-  // The error should propagate to the custom error reporter.
-  EXPECT_EQ(reporter.error_messages(),
-            "Invoke called on model that is not ready.");
-  EXPECT_EQ(reporter.num_calls(), 1);
-
-  TFL_DeleteInterpreter(interpreter);
-}
-
-}  // namespace
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md
deleted file mode 100644
index f480c49cd050de2192e9673f72c9e4d5c3c6ceff..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# TF Lite Experimental Unity Plugin
-
-This directory contains an experimental sample Unity (2017) Plugin, based on
-the experimental TF Lite C API. The sample demonstrates running inference within
-Unity by way of a C# `Interpreter` wrapper.
-
-Note that the native TF Lite plugin(s) *must* be built before using the Unity
-Plugin, and placed in Assets/TensorFlowLite/SDK/Plugins/. For the editor (note
-that this has only been tested on Linux; the syntax may differ on Mac/Windows):
-
-```sh
-bazel build -c opt --cxxopt=--std=c++11 \
-  //tensorflow/contrib/lite/experimental/c:libtensorflowlite_c.so
-```
-
-and for Android:
-
-```sh
-bazel build -c opt --cxxopt=--std=c++11 \
-  --crosstool_top=//external:android/crosstool \
-  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
-  --cpu=armeabi-v7a \
-  //tensorflow/contrib/lite/experimental/c:libtensorflowlite_c.so
-```
-
-If you encounter issues with native plugin discovery on Mac ("Darwin")
-platforms, try renaming `libtensorflowlite_c.so` to `tensorflowlite_c.bundle`.
-Similarly, on Windows you'll likely need to rename `libtensorflowlite_c.so` to
-`tensorflowlite_c.dll`.
diff --git a/tensorflow/contrib/lite/experimental/kernels/BUILD b/tensorflow/contrib/lite/experimental/kernels/BUILD
deleted file mode 100644
index 4786cc62f93dc0a27efa02c2b436820867ab95f5..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/kernels/BUILD
+++ /dev/null
@@ -1,85 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-# ctc support classes imported directly from TensorFlow.
-cc_library(
-    name = "ctc_utils",
-    hdrs = [
-        "ctc_beam_entry.h",
-        "ctc_beam_scorer.h",
-        "ctc_beam_search.h",
-        "ctc_decoder.h",
-        "ctc_loss_util.h",
-    ],
-    deps = [
-        ":top_n",
-        "//tensorflow/contrib/lite/kernels/internal:types",
-        "//third_party/eigen3",
-    ],
-)
-
-# top_n support classes imported directly from TensorFlow.
-cc_library(
-    name = "top_n",
-    hdrs = [
-        "top_n.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite/kernels/internal:types",
-    ],
-)
-
-cc_library(
-    name = "experimental_ops",
-    srcs = [
-        "ctc_beam_search_decoder.cc",
-    ],
-    # Suppress warnings that are introduced by Eigen Tensor.
-    copts = tflite_copts() + [
-        "-Wno-error=reorder",
-    ] + select({
-        "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"],
-        "//conditions:default": [
-        ],
-    }),
-    deps = [
-        ":ctc_utils",
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/kernels:gemm_support",
-        "//tensorflow/contrib/lite/kernels:kernel_util",
-        "//tensorflow/contrib/lite/kernels:op_macros",
-        "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
-        "//tensorflow/contrib/lite/kernels/internal:optimized",
-        "//tensorflow/contrib/lite/kernels/internal:optimized_base",
-        "//tensorflow/contrib/lite/kernels/internal:quantization_util",
-        "//tensorflow/contrib/lite/kernels/internal:reference_base",
-        "//tensorflow/contrib/lite/kernels/internal:tensor",
-        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "ctc_beam_search_decoder_test",
-    size = "small",
-    srcs = ["ctc_beam_search_decoder_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":experimental_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
diff --git a/tensorflow/contrib/lite/experimental/micro/BUILD b/tensorflow/contrib/lite/experimental/micro/BUILD
deleted file mode 100644
index df1036bc8b9cc84f4b63ae2a771e3aa8f8989060..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/BUILD
+++ /dev/null
@@ -1,76 +0,0 @@
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//tensorflow/contrib/lite/experimental/micro/testing:micro_test.bzl",
-    "tflite_micro_cc_test",
-)
-
-cc_library(
-    name = "micro_framework",
-    srcs = [
-        "micro_error_reporter.cc",
-        "micro_interpreter.cc",
-        "micro_mutable_op_resolver.cc",
-        "simple_tensor_allocator.cc",
-    ],
-    hdrs = [
-        "compatibility.h",
-        "micro_error_reporter.h",
-        "micro_interpreter.h",
-        "micro_mutable_op_resolver.h",
-        "simple_tensor_allocator.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/core/api",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-    ],
-)
-
-tflite_micro_cc_test(
-    name = "micro_error_reporter_test",
-    srcs = [
-        "micro_error_reporter_test.cc",
-    ],
-    deps = [
-        ":micro_framework",
-    ],
-)
-
-tflite_micro_cc_test(
-    name = "micro_mutable_op_resolver_test",
-    srcs = [
-        "micro_mutable_op_resolver_test.cc",
-    ],
-    deps = [
-        ":micro_framework",
-        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
-    ],
-)
-
-tflite_micro_cc_test(
-    name = "micro_interpreter_test",
-    srcs = [
-        "micro_interpreter_test.cc",
-    ],
-    deps = [
-        ":micro_framework",
-        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
-    ],
-)
-
-tflite_micro_cc_test(
-    name = "simple_tensor_allocator_test",
-    srcs = [
-        "simple_tensor_allocator_test.cc",
-    ],
-    deps = [
-        ":micro_framework",
-        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
-    ],
-)
diff --git a/tensorflow/contrib/lite/experimental/micro/README.md b/tensorflow/contrib/lite/experimental/micro/README.md
deleted file mode 100644
index 414cafde4d489eac36f739f163033bf27f0fc818..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/README.md
+++ /dev/null
@@ -1,114 +0,0 @@
-# TensorFlow Lite for Microcontrollers
-
-This an experimental port of TensorFlow Lite aimed at micro controllers and other devices with only kilobytes of memory. It doesn't require any operating system support, any standard C or C++ libraries, or dynamic memory allocation, so it's designed to be portable even to 'bare metal' systems. The core runtime fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword detection model, takes up a total of 22KB.
-
-The design goals are for the framework to be:
-
-- **Readable**: We want embedded software engineers to be able to understand what's required to run ML inference without having to study research papers. We've tried to keep the code base small, modular, and have reference implementations of all operations to help with this.
-
-- **Easy to modify**: We know that there are a lot of different platforms and requirements in the embedded world, and we don't expect to cover all of them in one framework. Instead, we're hoping that it can be a good starting point for developers to build on top of to meet their own needs. For example, we tried to make it easy to replace the implementations of key computational operators that are often crucial for performance, without having to touch the data flow and other runtime code. We want it to make more sense to use our workflow to handle things like model import and less-important operations, and customize the parts that matter, rather than having to reimplement everything in your own engine.
-
-- **Well-tested**: If you're modifying code, you need to know if your changes are correct. Having an easy way to test lets you develop much faster. To help there, we've written tests for all the components, and we've made sure that the tests can be run on almost any platform, with no dependencies apart from the ability to log text to a debug console somewhere. We also provide an easy way to run all the tests on-device as part of an automated test framework, and we use qemu/Renode emulation so that tests can be run even without physical devices present.
-
-- **Easy to integrate**: We want to be as open a system as possible, and use the best code available for each platform. To do that, we're going to rely on projects like [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html), [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to handle as much performance-critical code as possible. We know that there are an increasing number of options to accelerate neural networks on microcontrollers, so we're aiming to be a good host for deploying those hardware technologies too.
-
-- **Compatible**: We're using the same file schema, interpreter API, and kernel interface as regular TensorFlow Lite, so we leverage the large existing set of tools, documentation, and examples for the project. The biggest barrier to deploying ML models is getting them from a training environment into a form that's easy to run inference on, so we see reusing this rich ecosystem as being crucial to being easily usable. We also hope to integrate this experimental work back into the main codebase in the future.
-
-To meet those goals, we've made some tradeoffs:
-
-- **Simple C++**: To help with readability, our code is written in a modern version of C++, but we generally treat it as a "better C", rather relying on more complex features such as template meta-programming. As mentioned earlier, we avoid any use of dynamic memory allocation (new/delete) or the standard C/C++ libraries, so we believe this should still be fairly portable. It does mean that some older devices with C-only toolchains won't be supported, but we're hoping that the reference operator implementations (which are simple C-like functions) can still be useful in those cases. The interfaces are also designed to be C-only, so it should be possible to integrate the resulting library with pure C projects.
-
-- **Interpreted**: Code generation is a popular pattern for embedded code, because it gives standalone code that's easy to modify and step through, but we've chosen to go with an interpreted approach. In our internal microcontroller work we've found that using an extremely stripped-down interpreter with almost no dependencies gives us a lot of the same advantages, but is easier to maintain. For example, when new updates come out for the underlying library, you can just merge your local modifications in a single step, rather than having to regenerate new code and then patch in any changes you subsequently made. The coarse granularity of the interpreted primitives means that each operation call typically takes hundreds of thousands of instruction cycles at least, so we don't see noticeable performance gains from avoiding what's essentially a single switch statement at the interpreter level to call each operation. We're still working on improving the packaging though, for example we're considering having the ability to snapshot all the source files and headers used for a particular model, being able to compile the code and data together as a library, and then access it through a minimal set of C interface calls which hide the underlying complexity.
-
-- **Flatbuffers**: We represent our models using [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/schema/schema.fbs), with the difference that we always keep it in read-only program memory (typically flash) rather than relying on having a file system to read it from. This is a good fit because flatbuffer's serialized format is designed to be mapped into memory without requiring any extra memory allocations or modifications to access it. All of the functions to read model values work directly on the serialized bytes, and large sections of data like weights are directly accessible as sequential C-style arrays of their data type, with no strides or unpacking needed. We do get a lot of value from using flatbuffers, but there is a cost in complexity. The flat buffer library code is all inline [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/schema/schema_generated.h), but it isn't straightforward to inspect their implementations, and the model data structures aren't easy to comprehend from the debugger. The header for the schema itself also has to be periodically updated when new information is added to the file format, though we try to handle that transparently for most developers by checking in a pre-generated version.
-
-- **Code Duplication**: Some of the code in this prototype largely duplicates the logic in other parts of the TensorFlow Lite code base, for example the operator wrappers. We've tried to keep share as much as we can between the two interpreters, but there are some assumptions built into the original runtime that make this difficult. We'll be working on modularizing the main interpreter so that we can move to an entirely shared system.
-
-This initial preview release is designed to get early feedback, and is not intended to be a final product. It only includes enough operations to run a simple keyword recognition model, and the implementations are not optimized. We're hoping this will be a good way to get feedback and collaborate to improve the framework.
-
-## Getting Started
-
-Building requires a Linux or OS X machine.
-
- - Open a terminal
- - Download the TensorFlow source with `git clone https://github.com/tensorflow`
- - Enter the source root directory by running `cd tensorflow`
- - Download the dependencies by running `tensorflow/contrib/lite/experimental/micro/tools/make/download_dependencies.sh`. This may take a few minutes
- - Build and test the library with `make -f tensorflow/contrib/lite/experimental/micro/tools/make/Makefile test`
-
-You should see a series of compilation steps, followed by "~~~ALL TESTS PASSED~~~" for the various tests of the code that it will run. If there's an error, you should get an informative message from make about what went wrong.
-
-These tests are all built as simple binaries with few dependencies, so you can run them manually. For example, here's how to run the depthwise convolution test, and its output:
-
-```
-tensorflow/contrib/lite/experimental/micro/tools/make/gen/linux_x86_64/bin/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test
-
-Testing SimpleTest
-Testing SimpleTestQuantized
-Testing SimpleTestRelu
-Testing SimpleTestReluQuantized
-4/4 tests passed
-~ALL TESTS PASSED~~~
-```
-
-Looking at the [depthwise_conv_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test.cc) code, you'll see a sequence that looks like this:
-
-```
-...
-TF_LITE_MICRO_TESTS_BEGIN
-
-TF_LITE_MICRO_TEST(SimpleTest) {
-...
-}
-...
-TF_LITE_MICRO_TESTS_END
-```
-
-These macros work a lot like [the Google test framework](https://github.com/google/googletest), but they don't require any dependencies and just write results to stderr, rather than aborting the program. If all the tests pass, then "~~~ALL TESTS PASSED~~~" is output, and the test harness that runs the binary during the make process knows that everything ran correctly. If there's an error, the lack of the expected string lets the harness know that the test failed.
-
-So, why are we running tests in this complicated way? So far, we've been building binaries that run locally on the Mac OS or Linux machine you're building on, but this approach becomes important when we're targeting simple micro controller devices.
-
-## Building for the "Blue Pill" STM32F103
-
-The goal of this library is to enable machine learning on resource-constrained micro controllers and DSPs, and as part of that we've targeted the ["Blue Pill" STM32F103-compatible development board](https://github.com/google/googletest) as a cheap and popular platform. It only has 20KB of RAM and 64KB of flash, so it's a good device to ensure we can run efficiently on small chips.
-
-It's fairly easy to [buy and wire up a physical board](https://github.com/google/stm32_bare_lib#wiring-up-your-blue-pill), but even if you don't have an actual device, the [Renode project](https://renode.io/) makes it easy to run a faithful emulation on your desktop machine. You'll need [Docker](https://www.docker.com/) installed, but once you have that set up, try running the following command:
-
-`make -f tensorflow/contrib/lite/experimental/micro/tools/make/Makefile TARGET=bluepill test`
-
-You should see a similar set of outputs as you did in the previous section, with the addition of some extra Docker logging messages. These are because we're using Docker to run the Renode micro controller emulation tool, and the tests themselves are being run on a simulated STM32F103 device. The communication channels between an embedded device and the host are quite limited, so the test harness looks at the output of the debug log to see if tests have passed, just as it did in the previous section. This makes it a very flexible way to run cross-platform tests, even when a platform has no operating system facilities, as long as it can output debugging text logs.
-
-To understand what's happening here, try running the same depthwise convolution test, but through the emulated device test harness, with the following command:
-
-```
-tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh \
-tensorflow/contrib/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test
-
-```
-
-You should see output that looks something like this:
-
-```
-Sending build context to Docker daemon   21.5kB
-Step 1/2 : FROM antmicro/renode:latest
- ---> 1b670a243e8f
-Step 2/2 : LABEL maintainer="Pete Warden <petewarden@google.com>"
- ---> Using cache
- ---> 3afcd410846d
-Successfully built 3afcd410846d
-Successfully tagged renode_bluepill:latest
-LOGS:
-...
-03:27:32.4340 [INFO] machine-0: Machine started.
-03:27:32.4790 [DEBUG] cpu.uartSemihosting: [+0.22s host +0s virt 0s virt from start] Testing SimpleTest
-03:27:32.4812 [DEBUG] cpu.uartSemihosting: [+2.21ms host +0s virt 0s virt from start]   Testing SimpleTestQuantized
-03:27:32.4833 [DEBUG] cpu.uartSemihosting: [+2.14ms host +0s virt 0s virt from start]   Testing SimpleTestRelu
-03:27:32.4834 [DEBUG] cpu.uartSemihosting: [+0.18ms host +0s virt 0s virt from start]   Testing SimpleTestReluQuantized
-03:27:32.4838 [DEBUG] cpu.uartSemihosting: [+0.4ms host +0s virt 0s virt from start]   4/4 tests passed
-03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+41µs host +0s virt 0s virt from start]   ~~~ALL TESTS PASSED~~~
-03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+5µs host +0s virt 0s virt from start]   
-...
-tensorflow/contrib/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test: PASS
-```
-
-There's a lot of output here, but you should be able to see that the same tests that were covered when we ran locally on the development machine show up in the debug logs here, along with the magic string "~~~ALL TESTS PASSED~~~". This is the exact same code as before, just compiled and run on the STM32F103 rather than your desktop. We hope that the simplicity of this testing approach will help make adding support for new platforms as easy as possible.
diff --git a/tensorflow/contrib/lite/experimental/micro/compatibility.h b/tensorflow/contrib/lite/experimental/micro/compatibility.h
deleted file mode 100644
index 4f0fd9f3120a5db74cdfb84e7b17a0f3656520bc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/compatibility.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_COMPATIBILITY_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_COMPATIBILITY_H_
-
-// C++ will automatically create class-specific delete operators for virtual
-// objects, which by default call the global delete function. For embedded
-// applications we want to avoid this, and won't be calling new/delete on these
-// objects, so we need to override the default implementation with one that does
-// nothing to avoid linking in ::delete().
-// This macro needs to be included in all subclasses of a virtual base class in
-// the private section.
-#ifdef TF_LITE_STATIC_MEMORY
-#define TF_LITE_REMOVE_VIRTUAL_DELETE \
-  void operator delete(void* p) {}
-#else
-#define TF_LITE_REMOVE_VIRTUAL_DELETE
-#endif
-
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_COMPATIBILITY_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD
deleted file mode 100644
index 447c584387c62d4c410928085a9ec62d22adb9c8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/BUILD
+++ /dev/null
@@ -1,28 +0,0 @@
-# Description:
-#   TensorFlow Lite microcontroller example.
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//tensorflow/contrib/lite/experimental/micro/testing:micro_test.bzl",
-    "tflite_micro_cc_test",
-)
-
-tflite_micro_cc_test(
-    name = "micro_speech_test",
-    srcs = [
-        "micro_speech_test.cc",
-        "tiny_conv_model_data.cc",
-        "tiny_conv_model_data.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
-        "//tensorflow/contrib/lite/experimental/micro/kernels:all_ops_resolver",
-        "//tensorflow/contrib/lite/experimental/micro/kernels:micro_ops",
-        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-    ],
-)
diff --git a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
deleted file mode 100644
index 86cd056a7216aa57126be3f6e660a7dcee0c6c44..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
-#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h"
-#include "tensorflow/contrib/lite/experimental/micro/micro_interpreter.h"
-#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/version.h"
-
-TF_LITE_MICRO_TESTS_BEGIN
-
-TF_LITE_MICRO_TEST(TestInvoke) {
-  tflite::MicroErrorReporter micro_error_reporter;
-  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
-
-  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
-  if (model->version() != TFLITE_SCHEMA_VERSION) {
-    error_reporter->Report(
-        "Model provided is schema version %d not equal "
-        "to supported version %d.\n",
-        model->version(), TFLITE_SCHEMA_VERSION);
-  }
-  tflite::ops::micro::AllOpsResolver resolver;
-
-  const int tensor_arena_size = 10 * 1024;
-  uint8_t tensor_arena[tensor_arena_size];
-  tflite::SimpleTensorAllocator tensor_allocator(tensor_arena,
-                                                 tensor_arena_size);
-
-  tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator,
-                                       error_reporter);
-  TfLiteStatus invoke_status = interpreter.Invoke();
-  if (invoke_status != kTfLiteOk) {
-    error_reporter->Report("Invoke failed\n");
-  }
-  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status);
-
-  error_reporter->Report("Ran successfully\n");
-}
-
-TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc b/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
deleted file mode 100644
index f1f9e0e21994b0a79241690e533e4edc8bfe5565..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
+++ /dev/null
@@ -1,1672 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Automatically created from a TensorFlow Lite flatbuffer using the command:
-// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
-
-#include "tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
-
-const unsigned char g_tiny_conv_model_data[] = {
-    0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
-    0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
-    0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x4d, 0x00, 0x00,
-    0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0xf4, 0x47, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
-    0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74,
-    0x65, 0x64, 0x2e, 0x00, 0x09, 0x00, 0x00, 0x00, 0xd4, 0x47, 0x00, 0x00,
-    0x04, 0x03, 0x00, 0x00, 0xfc, 0x02, 0x00, 0x00, 0xf4, 0x02, 0x00, 0x00,
-    0x64, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00,
-    0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb8, 0xb3, 0xff, 0xff,
-    0x16, 0xb4, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0xd7, 0x02, 0x00, 0x00, 0x2f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0xb3, 0xff, 0xff,
-    0x46, 0xb4, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-    0xab, 0x00, 0x00, 0x00, 0x1e, 0xff, 0xff, 0xff, 0xed, 0xff, 0xff, 0xff,
-    0x4a, 0x00, 0x00, 0x00, 0x62, 0xb4, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
-    0x80, 0x02, 0x00, 0x00, 0xce, 0xad, 0xaf, 0x3c, 0xc8, 0xe9, 0xb0, 0x83,
-    0xa1, 0xbf, 0xb2, 0xb1, 0xab, 0xd0, 0xa7, 0x53, 0xa5, 0xe9, 0xb5, 0xac,
-    0xa2, 0xd3, 0xc4, 0x9e, 0x8b, 0xb2, 0x64, 0xb3, 0x9d, 0xa2, 0xae, 0xa6,
-    0xd5, 0xbe, 0x43, 0x9f, 0x9c, 0x54, 0xb5, 0xa8, 0x49, 0x78, 0x86, 0xa2,
-    0xa3, 0x55, 0x35, 0x96, 0x3d, 0x7f, 0xe2, 0xb5, 0xb0, 0x47, 0x28, 0xa9,
-    0x9d, 0xbb, 0xd6, 0xff, 0xb7, 0x79, 0x63, 0xb5, 0xaf, 0xa7, 0xab, 0x7e,
-    0xbc, 0xc7, 0xa0, 0xc3, 0xb1, 0xb6, 0xb2, 0xa1, 0xc2, 0xbb, 0x79, 0x57,
-    0xbe, 0xc1, 0xb7, 0xb0, 0x6b, 0xb7, 0xa5, 0x75, 0x97, 0xb8, 0xe7, 0xac,
-    0xad, 0x7e, 0xb1, 0x9b, 0xc3, 0xba, 0x6b, 0xa2, 0x7f, 0x58, 0xb9, 0x7a,
-    0x4c, 0x91, 0x74, 0x9e, 0xa7, 0x3d, 0xc2, 0x94, 0x75, 0xa1, 0xa4, 0xac,
-    0xab, 0x45, 0x2e, 0xb4, 0xb6, 0xbf, 0xc1, 0xdb, 0xaf, 0x6c, 0x67, 0xb1,
-    0xa9, 0xa6, 0xa8, 0xca, 0xc2, 0xc4, 0xb9, 0xbf, 0xb4, 0xb9, 0xaa, 0x9d,
-    0x9f, 0xb9, 0xb2, 0x71, 0xb2, 0xca, 0xbe, 0xaf, 0x5f, 0xbc, 0xa0, 0x5b,
-    0xa8, 0xb4, 0xa4, 0xa8, 0xd8, 0x69, 0xb7, 0x8a, 0xbc, 0xb8, 0xaf, 0x9c,
-    0x7c, 0x5d, 0xb3, 0x6b, 0x49, 0x95, 0x64, 0xa0, 0xa2, 0x49, 0xcb, 0x87,
-    0xa5, 0xb5, 0xa1, 0xb2, 0xa3, 0x40, 0x6d, 0x9f, 0xc5, 0xb6, 0xbb, 0xd4,
-    0x9c, 0x6d, 0x69, 0xa9, 0xa8, 0x91, 0xad, 0xb8, 0xd2, 0xc6, 0xaf, 0xb8,
-    0xac, 0xa9, 0xa2, 0xa7, 0x60, 0xa6, 0xa1, 0xc9, 0xb8, 0xd6, 0xcf, 0xb1,
-    0x56, 0xb4, 0xac, 0x40, 0xae, 0xbd, 0xbf, 0xa2, 0x54, 0x72, 0x9b, 0x8c,
-    0xc2, 0xb5, 0xc2, 0x9b, 0x64, 0x6d, 0xb4, 0x62, 0x4e, 0x9b, 0x6c, 0xa6,
-    0x8f, 0x4c, 0xca, 0x95, 0xb6, 0xbf, 0x92, 0xae, 0x9c, 0x49, 0xae, 0xb2,
-    0xc0, 0xb6, 0xbc, 0xd1, 0xa4, 0x7b, 0x64, 0xa0, 0xa6, 0x81, 0xac, 0xa6,
-    0xbd, 0xc8, 0xbc, 0xae, 0xaa, 0x9e, 0x61, 0xb1, 0x57, 0xac, 0xbf, 0xbf,
-    0xbb, 0xe0, 0xa6, 0xae, 0x47, 0xc9, 0xbc, 0x57, 0xb0, 0xb5, 0xc7, 0x98,
-    0xf4, 0x93, 0xb6, 0x70, 0xc3, 0xb3, 0xca, 0xab, 0x77, 0x9a, 0xac, 0x45,
-    0x5c, 0x9e, 0x9a, 0xa9, 0x9b, 0x35, 0xc0, 0x6f, 0xc6, 0xc7, 0x91, 0xb4,
-    0xa8, 0x3c, 0xce, 0xb8, 0xad, 0xb9, 0xb5, 0xdd, 0x9c, 0x6d, 0xbf, 0x91,
-    0xb2, 0x7d, 0xa0, 0xaf, 0x9f, 0xbd, 0xb9, 0xcf, 0x9b, 0x5d, 0x3f, 0xac,
-    0x64, 0xae, 0xaf, 0xb8, 0xbc, 0xb8, 0x86, 0xb5, 0x36, 0xcf, 0xb4, 0xa9,
-    0xad, 0xcd, 0xdb, 0xa4, 0x68, 0xa6, 0xa4, 0x67, 0xc8, 0xb7, 0xe5, 0xa4,
-    0x76, 0xb8, 0xa8, 0x28, 0x6b, 0xa5, 0xba, 0xad, 0x9f, 0x3a, 0xa5, 0x42,
-    0xc5, 0xb0, 0x88, 0xad, 0xa5, 0x4d, 0xea, 0x8a, 0xb8, 0xb5, 0xb3, 0xd9,
-    0xa0, 0x77, 0xbb, 0x92, 0x9e, 0x80, 0xbd, 0xbd, 0x6d, 0xcc, 0xab, 0x99,
-    0x88, 0x58, 0x4d, 0xb0, 0x6c, 0xbc, 0x96, 0xbd, 0xae, 0xab, 0x5b, 0xac,
-    0x2f, 0xc3, 0x9a, 0xbe, 0xac, 0xb3, 0x84, 0x9b, 0xe3, 0xaf, 0x95, 0x6b,
-    0xc2, 0xb5, 0xca, 0xb7, 0x4e, 0xbc, 0x9d, 0x24, 0x75, 0xa9, 0xd2, 0xae,
-    0xa0, 0x2b, 0x90, 0x34, 0xd1, 0xb5, 0x96, 0xae, 0xaa, 0x4d, 0xc1, 0xa3,
-    0xb1, 0xb4, 0xaa, 0xd2, 0x9c, 0x7d, 0xc0, 0x91, 0x91, 0x7a, 0xb8, 0x83,
-    0x44, 0xcb, 0xaf, 0x9b, 0x6b, 0x5b, 0x75, 0xb2, 0x62, 0xb6, 0xaa, 0xcb,
-    0x99, 0xa8, 0x63, 0xae, 0x24, 0xc7, 0x8a, 0xbe, 0xa9, 0xb6, 0xa0, 0xa1,
-    0x41, 0xac, 0x84, 0xb5, 0xb9, 0xb3, 0x9b, 0xad, 0x77, 0xbf, 0xa8, 0x7e,
-    0x82, 0xb9, 0xbe, 0xaa, 0xa3, 0x47, 0x6d, 0xb5, 0xc3, 0xb1, 0xbf, 0xa7,
-    0xb1, 0x57, 0x75, 0xb5, 0xb0, 0xb6, 0xb9, 0xce, 0xa4, 0x86, 0xb0, 0xa4,
-    0x98, 0x80, 0xc5, 0x3e, 0x90, 0xca, 0x9b, 0xa2, 0x5a, 0x50, 0xc5, 0xa5,
-    0xad, 0xc1, 0x9c, 0x91, 0x83, 0x8f, 0x21, 0xab, 0xac, 0xba, 0x70, 0xb4,
-    0xae, 0x85, 0x7e, 0xa7, 0xbd, 0xba, 0x7c, 0xb2, 0xb5, 0xb2, 0x7e, 0xb3,
-    0xc3, 0xcd, 0x82, 0xac, 0x9b, 0xb3, 0xa6, 0xb0, 0xbc, 0x6f, 0x52, 0xb9,
-    0xbf, 0xb1, 0xa6, 0xa4, 0xc1, 0x7a, 0x90, 0xc0, 0xae, 0xab, 0x94, 0xd8,
-    0xab, 0xa4, 0x98, 0xbb, 0x8b, 0x86, 0x94, 0x01, 0xad, 0xe7, 0xb1, 0x9b,
-    0x57, 0x48, 0xc1, 0x88, 0xbf, 0xcc, 0xb4, 0x4b, 0x62, 0x8b, 0x48, 0xa7,
-    0xbe, 0xe1, 0x80, 0xa6, 0xb3, 0x64, 0xaa, 0xa4, 0xcf, 0xba, 0x6d, 0xa6,
-    0xb8, 0xa0, 0x8f, 0xb3, 0xce, 0xc3, 0x87, 0xb2, 0xa0, 0xc0, 0x78, 0xb0,
-    0xb9, 0xaa, 0x40, 0xb8, 0xd8, 0xa3, 0x9a, 0xaa, 0xcc, 0xa2, 0x9f, 0xb9,
-    0xbe, 0xc2, 0x89, 0xd6, 0xc6, 0x9c, 0xa3, 0xc7, 0x94, 0xb6, 0xff, 0xff,
-    0x98, 0xb6, 0xff, 0xff, 0xf6, 0xb6, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
-    0xc0, 0x44, 0x00, 0x00, 0x4a, 0x4d, 0x59, 0x60, 0x5a, 0x45, 0x3d, 0x50,
-    0x4a, 0x43, 0x3d, 0x59, 0x3e, 0x49, 0x4a, 0x59, 0x45, 0x44, 0x41, 0x5d,
-    0x50, 0x2f, 0x4e, 0x34, 0x46, 0x48, 0x41, 0x4a, 0x4c, 0x3b, 0x4b, 0x3e,
-    0x49, 0x49, 0x43, 0x4b, 0x3e, 0x49, 0x47, 0x41, 0x3e, 0x4a, 0x46, 0x43,
-    0x41, 0x43, 0x47, 0x49, 0x4a, 0x4c, 0x46, 0x58, 0x3f, 0x4c, 0x4b, 0x4c,
-    0x4d, 0x4b, 0x45, 0x52, 0x45, 0x42, 0x52, 0x52, 0x48, 0x40, 0x46, 0x5f,
-    0x4c, 0x41, 0x47, 0x48, 0x48, 0x4c, 0x43, 0x61, 0x50, 0x4b, 0x49, 0x49,
-    0x46, 0x3f, 0x40, 0x67, 0x40, 0x4d, 0x45, 0x40, 0x40, 0x45, 0x47, 0x56,
-    0x44, 0x3a, 0x4a, 0x4c, 0x52, 0x48, 0x46, 0x50, 0x4b, 0x44, 0x51, 0x45,
-    0x40, 0x45, 0x45, 0x48, 0x4e, 0x4e, 0x43, 0x48, 0x44, 0x4b, 0x45, 0x4a,
-    0x53, 0x45, 0x4a, 0x4b, 0x3f, 0x43, 0x45, 0x53, 0x4d, 0x43, 0x46, 0x3f,
-    0x47, 0x4e, 0x51, 0x50, 0x48, 0x4f, 0x4f, 0x4a, 0x4a, 0x4e, 0x45, 0x4e,
-    0x46, 0x41, 0x4a, 0x46, 0x45, 0x47, 0x45, 0x4b, 0x50, 0x4c, 0x46, 0x45,
-    0x41, 0x47, 0x41, 0x47, 0x46, 0x4f, 0x3f, 0x4f, 0x4a, 0x51, 0x4f, 0x53,
-    0x54, 0x48, 0x51, 0x43, 0x4b, 0x48, 0x4d, 0x46, 0x48, 0x4f, 0x49, 0x44,
-    0x43, 0x53, 0x50, 0x59, 0x56, 0x3d, 0x45, 0x44, 0x48, 0x38, 0x3b, 0x5f,
-    0x39, 0x43, 0x43, 0x52, 0x46, 0x3e, 0x43, 0x58, 0x43, 0x1e, 0x50, 0x3c,
-    0x46, 0x4b, 0x46, 0x50, 0x3c, 0x37, 0x4c, 0x47, 0x47, 0x4b, 0x47, 0x54,
-    0x43, 0x3e, 0x47, 0x4f, 0x4b, 0x41, 0x53, 0x50, 0x42, 0x46, 0x4f, 0x4b,
-    0x4e, 0x3f, 0x49, 0x52, 0x4a, 0x4a, 0x49, 0x53, 0x52, 0x47, 0x52, 0x5a,
-    0x40, 0x42, 0x4d, 0x4b, 0x50, 0x43, 0x49, 0x59, 0x47, 0x4c, 0x4d, 0x50,
-    0x4e, 0x3c, 0x44, 0x61, 0x51, 0x49, 0x49, 0x46, 0x49, 0x47, 0x4b, 0x5a,
-    0x45, 0x4b, 0x43, 0x40, 0x44, 0x52, 0x4d, 0x54, 0x49, 0x47, 0x44, 0x48,
-    0x46, 0x48, 0x3e, 0x40, 0x45, 0x4f, 0x4d, 0x4b, 0x4c, 0x40, 0x3d, 0x40,
-    0x3e, 0x48, 0x50, 0x4e, 0x4c, 0x42, 0x48, 0x4b, 0x3d, 0x48, 0x4b, 0x44,
-    0x52, 0x4b, 0x49, 0x4f, 0x49, 0x3f, 0x47, 0x43, 0x4d, 0x3f, 0x53, 0x4e,
-    0x4a, 0x4f, 0x4e, 0x4e, 0x53, 0x42, 0x46, 0x4c, 0x44, 0x4c, 0x46, 0x51,
-    0x45, 0x48, 0x4a, 0x50, 0x47, 0x41, 0x45, 0x54, 0x4a, 0x44, 0x50, 0x49,
-    0x48, 0x50, 0x51, 0x4b, 0x50, 0x4c, 0x4a, 0x49, 0x43, 0x47, 0x50, 0x4a,
-    0x4d, 0x4c, 0x4e, 0x49, 0x42, 0x50, 0x52, 0x48, 0x45, 0x5a, 0x4e, 0x55,
-    0x51, 0x3d, 0x3d, 0x4d, 0x42, 0x32, 0x36, 0x64, 0x39, 0x4c, 0x41, 0x48,
-    0x44, 0x35, 0x43, 0x56, 0x47, 0x1e, 0x4b, 0x3e, 0x47, 0x3f, 0x43, 0x52,
-    0x51, 0x34, 0x41, 0x4d, 0x3e, 0x41, 0x41, 0x48, 0x3c, 0x4b, 0x45, 0x3b,
-    0x40, 0x43, 0x4c, 0x46, 0x46, 0x47, 0x3e, 0x4f, 0x4b, 0x48, 0x42, 0x47,
-    0x4e, 0x3e, 0x49, 0x47, 0x43, 0x43, 0x4e, 0x52, 0x51, 0x45, 0x3f, 0x54,
-    0x46, 0x44, 0x48, 0x5d, 0x3e, 0x4a, 0x47, 0x52, 0x53, 0x3a, 0x4f, 0x5d,
-    0x41, 0x4c, 0x48, 0x51, 0x43, 0x4b, 0x4b, 0x67, 0x48, 0x4b, 0x45, 0x4d,
-    0x4b, 0x43, 0x4a, 0x54, 0x4c, 0x46, 0x43, 0x4a, 0x4d, 0x43, 0x4c, 0x47,
-    0x4a, 0x48, 0x4d, 0x42, 0x4d, 0x48, 0x3f, 0x43, 0x4c, 0x44, 0x4e, 0x4c,
-    0x40, 0x45, 0x4b, 0x48, 0x47, 0x47, 0x3e, 0x4c, 0x52, 0x41, 0x44, 0x4e,
-    0x4d, 0x44, 0x49, 0x4d, 0x3d, 0x45, 0x48, 0x4f, 0x4c, 0x4a, 0x55, 0x51,
-    0x4d, 0x4c, 0x45, 0x4e, 0x46, 0x45, 0x44, 0x49, 0x4e, 0x44, 0x40, 0x48,
-    0x49, 0x44, 0x53, 0x51, 0x42, 0x41, 0x51, 0x49, 0x51, 0x45, 0x51, 0x3f,
-    0x4b, 0x3f, 0x52, 0x3c, 0x50, 0x4d, 0x4f, 0x4b, 0x44, 0x4f, 0x40, 0x52,
-    0x49, 0x4a, 0x50, 0x3f, 0x3d, 0x54, 0x4c, 0x53, 0x52, 0x45, 0x41, 0x43,
-    0x47, 0x2d, 0x40, 0x63, 0x3a, 0x51, 0x43, 0x4e, 0x40, 0x2b, 0x36, 0x5b,
-    0x4b, 0x12, 0x4d, 0x35, 0x4b, 0x3f, 0x44, 0x4a, 0x46, 0x31, 0x54, 0x48,
-    0x43, 0x42, 0x3d, 0x51, 0x41, 0x45, 0x49, 0x4b, 0x47, 0x49, 0x3d, 0x3e,
-    0x46, 0x3d, 0x4d, 0x48, 0x3d, 0x45, 0x48, 0x4b, 0x49, 0x52, 0x44, 0x4c,
-    0x45, 0x44, 0x45, 0x49, 0x50, 0x48, 0x45, 0x46, 0x45, 0x44, 0x52, 0x55,
-    0x46, 0x45, 0x4b, 0x3d, 0x42, 0x4a, 0x3e, 0x57, 0x48, 0x4b, 0x3c, 0x42,
-    0x4a, 0x46, 0x47, 0x6c, 0x54, 0x4b, 0x41, 0x49, 0x49, 0x50, 0x43, 0x56,
-    0x44, 0x43, 0x4d, 0x3e, 0x44, 0x41, 0x47, 0x40, 0x4a, 0x4b, 0x4d, 0x4d,
-    0x3e, 0x46, 0x45, 0x47, 0x3e, 0x42, 0x4a, 0x45, 0x49, 0x3d, 0x3f, 0x43,
-    0x40, 0x44, 0x47, 0x4a, 0x45, 0x4d, 0x4b, 0x4c, 0x43, 0x40, 0x3d, 0x3e,
-    0x4c, 0x4c, 0x42, 0x4d, 0x48, 0x4d, 0x49, 0x42, 0x51, 0x51, 0x4c, 0x4b,
-    0x53, 0x4f, 0x48, 0x4d, 0x40, 0x46, 0x45, 0x4b, 0x47, 0x47, 0x4b, 0x46,
-    0x54, 0x42, 0x42, 0x46, 0x46, 0x4a, 0x4c, 0x55, 0x3f, 0x3c, 0x52, 0x4b,
-    0x4b, 0x4d, 0x4e, 0x48, 0x53, 0x4c, 0x4b, 0x42, 0x52, 0x54, 0x50, 0x4b,
-    0x40, 0x5f, 0x58, 0x53, 0x50, 0x42, 0x35, 0x48, 0x39, 0x24, 0x3c, 0x5e,
-    0x41, 0x50, 0x3c, 0x51, 0x42, 0x26, 0x42, 0x56, 0x41, 0x0c, 0x3e, 0x3d,
-    0x48, 0x3e, 0x50, 0x4b, 0x3a, 0x2c, 0x43, 0x3d, 0x48, 0x3e, 0x43, 0x48,
-    0x4c, 0x3f, 0x4a, 0x3e, 0x51, 0x4a, 0x4f, 0x40, 0x47, 0x43, 0x50, 0x4c,
-    0x43, 0x4d, 0x3f, 0x45, 0x4d, 0x3e, 0x4c, 0x44, 0x51, 0x47, 0x4b, 0x51,
-    0x45, 0x49, 0x44, 0x3f, 0x46, 0x46, 0x46, 0x57, 0x49, 0x4c, 0x49, 0x4e,
-    0x47, 0x4c, 0x47, 0x5e, 0x43, 0x46, 0x45, 0x4b, 0x52, 0x49, 0x45, 0x5f,
-    0x47, 0x41, 0x46, 0x43, 0x4f, 0x3b, 0x43, 0x51, 0x46, 0x53, 0x4a, 0x4e,
-    0x4b, 0x43, 0x4e, 0x40, 0x48, 0x49, 0x46, 0x3f, 0x48, 0x50, 0x4b, 0x41,
-    0x4a, 0x47, 0x4b, 0x3d, 0x46, 0x49, 0x4b, 0x43, 0x43, 0x42, 0x3e, 0x47,
-    0x47, 0x4a, 0x45, 0x46, 0x51, 0x48, 0x51, 0x4e, 0x3f, 0x50, 0x44, 0x4b,
-    0x4d, 0x4e, 0x44, 0x4d, 0x3d, 0x49, 0x4a, 0x4e, 0x42, 0x51, 0x43, 0x42,
-    0x46, 0x3e, 0x48, 0x4b, 0x4f, 0x50, 0x3d, 0x48, 0x4c, 0x4f, 0x46, 0x44,
-    0x44, 0x48, 0x42, 0x4b, 0x48, 0x41, 0x43, 0x46, 0x4d, 0x49, 0x4f, 0x43,
-    0x41, 0x44, 0x3f, 0x3d, 0x45, 0x4f, 0x45, 0x41, 0x40, 0x58, 0x4f, 0x54,
-    0x5b, 0x4b, 0x3a, 0x47, 0x3d, 0x28, 0x3d, 0x57, 0x3e, 0x51, 0x3f, 0x47,
-    0x3f, 0x2e, 0x3e, 0x54, 0x4e, 0x0b, 0x41, 0x3d, 0x3b, 0x3d, 0x43, 0x47,
-    0x47, 0x28, 0x4d, 0x43, 0x43, 0x3b, 0x4e, 0x4a, 0x4d, 0x42, 0x51, 0x46,
-    0x4f, 0x3d, 0x4c, 0x3a, 0x49, 0x49, 0x4a, 0x43, 0x42, 0x4b, 0x47, 0x42,
-    0x42, 0x49, 0x3f, 0x4d, 0x46, 0x4a, 0x49, 0x4e, 0x42, 0x3c, 0x4a, 0x41,
-    0x4c, 0x40, 0x4d, 0x5a, 0x49, 0x46, 0x51, 0x46, 0x4b, 0x4c, 0x46, 0x62,
-    0x45, 0x42, 0x51, 0x4e, 0x4d, 0x3e, 0x4d, 0x5b, 0x4d, 0x43, 0x45, 0x50,
-    0x4b, 0x40, 0x50, 0x53, 0x4f, 0x4f, 0x51, 0x53, 0x46, 0x41, 0x4e, 0x3a,
-    0x4b, 0x47, 0x3f, 0x3e, 0x4d, 0x48, 0x53, 0x3f, 0x45, 0x42, 0x4c, 0x45,
-    0x55, 0x4c, 0x4b, 0x39, 0x4a, 0x45, 0x48, 0x4d, 0x47, 0x40, 0x48, 0x4f,
-    0x4d, 0x49, 0x3e, 0x41, 0x46, 0x4e, 0x40, 0x49, 0x4b, 0x47, 0x4c, 0x45,
-    0x44, 0x51, 0x4f, 0x4b, 0x48, 0x49, 0x44, 0x41, 0x43, 0x46, 0x51, 0x45,
-    0x40, 0x48, 0x4b, 0x42, 0x44, 0x4f, 0x53, 0x4d, 0x44, 0x46, 0x4e, 0x4c,
-    0x48, 0x50, 0x41, 0x45, 0x42, 0x48, 0x4d, 0x4d, 0x47, 0x45, 0x41, 0x45,
-    0x48, 0x58, 0x4e, 0x46, 0x43, 0x53, 0x57, 0x52, 0x5e, 0x42, 0x45, 0x4e,
-    0x39, 0x24, 0x32, 0x56, 0x47, 0x56, 0x49, 0x52, 0x46, 0x26, 0x3a, 0x51,
-    0x4b, 0x05, 0x3e, 0x43, 0x3f, 0x38, 0x4d, 0x4b, 0x4f, 0x27, 0x51, 0x46,
-    0x47, 0x41, 0x4a, 0x47, 0x4a, 0x3e, 0x44, 0x51, 0x3f, 0x3a, 0x43, 0x46,
-    0x4d, 0x49, 0x46, 0x52, 0x43, 0x48, 0x49, 0x3e, 0x47, 0x46, 0x4a, 0x4d,
-    0x47, 0x46, 0x52, 0x50, 0x44, 0x48, 0x4c, 0x47, 0x45, 0x41, 0x49, 0x5b,
-    0x4d, 0x4b, 0x47, 0x4c, 0x4a, 0x47, 0x45, 0x5b, 0x49, 0x46, 0x52, 0x47,
-    0x47, 0x3d, 0x55, 0x59, 0x40, 0x4b, 0x3e, 0x50, 0x42, 0x43, 0x40, 0x4f,
-    0x48, 0x3f, 0x47, 0x53, 0x4d, 0x44, 0x4e, 0x37, 0x4c, 0x43, 0x51, 0x4d,
-    0x46, 0x4e, 0x40, 0x41, 0x52, 0x44, 0x43, 0x4a, 0x50, 0x48, 0x47, 0x42,
-    0x48, 0x45, 0x50, 0x4d, 0x42, 0x52, 0x44, 0x43, 0x45, 0x43, 0x4c, 0x4d,
-    0x44, 0x51, 0x47, 0x48, 0x51, 0x4f, 0x48, 0x45, 0x49, 0x4a, 0x3e, 0x43,
-    0x4d, 0x4e, 0x4e, 0x46, 0x54, 0x4d, 0x49, 0x4d, 0x47, 0x46, 0x4b, 0x41,
-    0x4a, 0x49, 0x44, 0x45, 0x4d, 0x3e, 0x53, 0x50, 0x47, 0x4d, 0x4e, 0x43,
-    0x4f, 0x45, 0x4e, 0x4a, 0x47, 0x49, 0x4c, 0x4c, 0x4d, 0x54, 0x42, 0x4c,
-    0x43, 0x5d, 0x59, 0x50, 0x5e, 0x4b, 0x44, 0x43, 0x3c, 0x25, 0x31, 0x5b,
-    0x46, 0x5a, 0x50, 0x4d, 0x41, 0x2a, 0x41, 0x4f, 0x44, 0x00, 0x41, 0x3d,
-    0x43, 0x4b, 0x47, 0x45, 0x4e, 0x2e, 0x44, 0x46, 0x53, 0x3d, 0x43, 0x41,
-    0x44, 0x46, 0x49, 0x42, 0x45, 0x4f, 0x4d, 0x3a, 0x43, 0x3c, 0x47, 0x53,
-    0x43, 0x4e, 0x3f, 0x41, 0x4d, 0x50, 0x4b, 0x4c, 0x51, 0x47, 0x53, 0x4f,
-    0x45, 0x4a, 0x44, 0x45, 0x41, 0x46, 0x47, 0x50, 0x51, 0x3f, 0x3e, 0x41,
-    0x48, 0x45, 0x46, 0x5d, 0x45, 0x4a, 0x4c, 0x46, 0x4a, 0x49, 0x50, 0x51,
-    0x51, 0x4c, 0x4f, 0x47, 0x47, 0x42, 0x45, 0x47, 0x4e, 0x48, 0x46, 0x40,
-    0x45, 0x46, 0x4d, 0x3b, 0x4d, 0x52, 0x4c, 0x51, 0x49, 0x51, 0x47, 0x3d,
-    0x4d, 0x42, 0x4f, 0x4e, 0x43, 0x43, 0x45, 0x3a, 0x42, 0x50, 0x4c, 0x4a,
-    0x41, 0x53, 0x4c, 0x45, 0x51, 0x3f, 0x54, 0x43, 0x4b, 0x54, 0x56, 0x4d,
-    0x4f, 0x4a, 0x50, 0x4b, 0x44, 0x45, 0x4f, 0x4f, 0x47, 0x3e, 0x50, 0x4f,
-    0x4b, 0x48, 0x4d, 0x49, 0x55, 0x4d, 0x45, 0x4d, 0x4a, 0x53, 0x43, 0x46,
-    0x4c, 0x45, 0x41, 0x46, 0x49, 0x49, 0x4f, 0x4b, 0x49, 0x50, 0x52, 0x49,
-    0x41, 0x54, 0x44, 0x4c, 0x44, 0x63, 0x4a, 0x49, 0x40, 0x59, 0x52, 0x52,
-    0x59, 0x3f, 0x3e, 0x3e, 0x40, 0x25, 0x3c, 0x5c, 0x4f, 0x57, 0x44, 0x50,
-    0x41, 0x2a, 0x48, 0x4f, 0x43, 0x08, 0x47, 0x43, 0x49, 0x48, 0x4d, 0x49,
-    0x46, 0x2b, 0x48, 0x44, 0x4e, 0x47, 0x47, 0x43, 0x44, 0x3e, 0x4a, 0x52,
-    0x3f, 0x4a, 0x53, 0x42, 0x49, 0x47, 0x4c, 0x50, 0x43, 0x46, 0x46, 0x3c,
-    0x4c, 0x47, 0x4e, 0x4d, 0x42, 0x41, 0x53, 0x52, 0x4f, 0x40, 0x54, 0x50,
-    0x46, 0x43, 0x50, 0x56, 0x51, 0x48, 0x48, 0x48, 0x49, 0x39, 0x47, 0x5e,
-    0x4e, 0x4b, 0x4f, 0x4e, 0x43, 0x45, 0x42, 0x58, 0x4a, 0x3b, 0x48, 0x4d,
-    0x43, 0x3e, 0x4b, 0x43, 0x3c, 0x45, 0x46, 0x4b, 0x42, 0x42, 0x4e, 0x3d,
-    0x4b, 0x4e, 0x51, 0x52, 0x48, 0x3e, 0x4b, 0x3f, 0x4c, 0x4a, 0x4b, 0x4c,
-    0x46, 0x48, 0x3e, 0x48, 0x47, 0x4d, 0x4a, 0x46, 0x49, 0x4d, 0x4a, 0x48,
-    0x50, 0x4b, 0x40, 0x48, 0x4b, 0x52, 0x46, 0x50, 0x4f, 0x3e, 0x42, 0x44,
-    0x44, 0x42, 0x43, 0x49, 0x4f, 0x4f, 0x46, 0x42, 0x4a, 0x54, 0x42, 0x48,
-    0x50, 0x4f, 0x4f, 0x4c, 0x4c, 0x47, 0x52, 0x49, 0x4c, 0x45, 0x4a, 0x4d,
-    0x4a, 0x41, 0x47, 0x4a, 0x4d, 0x4a, 0x4c, 0x46, 0x51, 0x44, 0x4b, 0x49,
-    0x53, 0x5e, 0x45, 0x4a, 0x3b, 0x57, 0x5a, 0x4c, 0x59, 0x43, 0x3e, 0x4a,
-    0x3e, 0x20, 0x36, 0x5d, 0x47, 0x5b, 0x3f, 0x55, 0x3e, 0x24, 0x41, 0x52,
-    0x3f, 0x01, 0x49, 0x41, 0x40, 0x45, 0x42, 0x46, 0x49, 0x2a, 0x47, 0x40,
-    0x44, 0x3f, 0x42, 0x47, 0x4e, 0x42, 0x4b, 0x3d, 0x45, 0x4c, 0x47, 0x3d,
-    0x4c, 0x44, 0x48, 0x43, 0x43, 0x41, 0x4a, 0x3d, 0x48, 0x4b, 0x46, 0x4e,
-    0x4c, 0x45, 0x48, 0x4d, 0x54, 0x4d, 0x3e, 0x46, 0x3e, 0x47, 0x44, 0x4e,
-    0x48, 0x49, 0x53, 0x4b, 0x41, 0x45, 0x4c, 0x57, 0x52, 0x4e, 0x40, 0x48,
-    0x4d, 0x43, 0x44, 0x5a, 0x4a, 0x4c, 0x48, 0x4d, 0x3f, 0x52, 0x41, 0x50,
-    0x4a, 0x47, 0x3e, 0x43, 0x4c, 0x42, 0x48, 0x3e, 0x4f, 0x4b, 0x41, 0x43,
-    0x49, 0x40, 0x43, 0x36, 0x3f, 0x4b, 0x49, 0x49, 0x51, 0x43, 0x48, 0x40,
-    0x4c, 0x51, 0x4d, 0x4a, 0x49, 0x3f, 0x4b, 0x3d, 0x4f, 0x4b, 0x43, 0x4d,
-    0x46, 0x40, 0x46, 0x4d, 0x49, 0x48, 0x4d, 0x4c, 0x52, 0x4c, 0x49, 0x4f,
-    0x53, 0x40, 0x49, 0x53, 0x47, 0x43, 0x4c, 0x45, 0x42, 0x48, 0x42, 0x4e,
-    0x49, 0x43, 0x42, 0x40, 0x4f, 0x46, 0x50, 0x47, 0x51, 0x4a, 0x52, 0x45,
-    0x4c, 0x51, 0x48, 0x47, 0x40, 0x41, 0x52, 0x4f, 0x41, 0x5a, 0x53, 0x47,
-    0x42, 0x5f, 0x55, 0x4f, 0x53, 0x3e, 0x41, 0x49, 0x3d, 0x20, 0x3f, 0x54,
-    0x42, 0x5b, 0x49, 0x4d, 0x3d, 0x22, 0x3e, 0x48, 0x41, 0x01, 0x4c, 0x3d,
-    0x43, 0x4a, 0x46, 0x43, 0x4f, 0x2b, 0x49, 0x46, 0x47, 0x4a, 0x51, 0x3d,
-    0x4b, 0x44, 0x49, 0x41, 0x47, 0x47, 0x45, 0x3a, 0x44, 0x42, 0x40, 0x52,
-    0x46, 0x51, 0x4a, 0x41, 0x4a, 0x52, 0x44, 0x52, 0x4a, 0x40, 0x46, 0x45,
-    0x52, 0x4c, 0x4e, 0x42, 0x42, 0x48, 0x40, 0x4f, 0x4b, 0x4f, 0x51, 0x4c,
-    0x4e, 0x48, 0x4a, 0x5a, 0x46, 0x3d, 0x41, 0x50, 0x52, 0x4c, 0x44, 0x53,
-    0x4b, 0x4d, 0x4f, 0x49, 0x47, 0x4c, 0x48, 0x45, 0x48, 0x4a, 0x44, 0x4e,
-    0x4c, 0x40, 0x4d, 0x35, 0x40, 0x49, 0x4a, 0x51, 0x49, 0x4a, 0x46, 0x36,
-    0x46, 0x47, 0x4a, 0x4c, 0x40, 0x4e, 0x42, 0x38, 0x48, 0x45, 0x42, 0x49,
-    0x54, 0x4c, 0x3f, 0x49, 0x4c, 0x39, 0x47, 0x45, 0x4e, 0x4a, 0x42, 0x44,
-    0x4b, 0x53, 0x43, 0x40, 0x46, 0x51, 0x3d, 0x50, 0x4b, 0x43, 0x4a, 0x4c,
-    0x55, 0x54, 0x4a, 0x43, 0x48, 0x40, 0x44, 0x3f, 0x47, 0x45, 0x3e, 0x41,
-    0x49, 0x44, 0x4d, 0x49, 0x44, 0x41, 0x4a, 0x50, 0x44, 0x49, 0x4d, 0x47,
-    0x4a, 0x49, 0x46, 0x49, 0x40, 0x5b, 0x4d, 0x51, 0x47, 0x57, 0x49, 0x4f,
-    0x56, 0x46, 0x3a, 0x4a, 0x3e, 0x22, 0x36, 0x5c, 0x44, 0x56, 0x46, 0x48,
-    0x3a, 0x2d, 0x4a, 0x48, 0x44, 0x17, 0x41, 0x42, 0x40, 0x3d, 0x4e, 0x45,
-    0x40, 0x26, 0x43, 0x52, 0x41, 0x40, 0x44, 0x4a, 0x48, 0x42, 0x4f, 0x47,
-    0x46, 0x4c, 0x4a, 0x3b, 0x42, 0x3e, 0x3e, 0x49, 0x4e, 0x44, 0x4e, 0x49,
-    0x47, 0x41, 0x47, 0x44, 0x4c, 0x45, 0x4d, 0x49, 0x49, 0x48, 0x55, 0x3d,
-    0x4a, 0x45, 0x50, 0x4f, 0x46, 0x4c, 0x46, 0x45, 0x3c, 0x51, 0x4b, 0x5a,
-    0x46, 0x47, 0x54, 0x41, 0x44, 0x40, 0x4f, 0x53, 0x49, 0x46, 0x46, 0x48,
-    0x44, 0x40, 0x50, 0x49, 0x49, 0x43, 0x50, 0x41, 0x52, 0x4b, 0x46, 0x3e,
-    0x44, 0x44, 0x46, 0x4e, 0x47, 0x48, 0x3e, 0x38, 0x4c, 0x4c, 0x48, 0x43,
-    0x48, 0x3e, 0x50, 0x42, 0x51, 0x50, 0x4a, 0x48, 0x4a, 0x42, 0x44, 0x3d,
-    0x4a, 0x46, 0x46, 0x3d, 0x4e, 0x47, 0x3d, 0x48, 0x4c, 0x46, 0x50, 0x4d,
-    0x49, 0x45, 0x4a, 0x4c, 0x4c, 0x47, 0x4a, 0x42, 0x4a, 0x45, 0x50, 0x52,
-    0x4b, 0x4d, 0x4c, 0x43, 0x42, 0x53, 0x41, 0x45, 0x49, 0x41, 0x4b, 0x4c,
-    0x52, 0x54, 0x4b, 0x41, 0x48, 0x4c, 0x47, 0x4c, 0x41, 0x49, 0x4a, 0x47,
-    0x50, 0x59, 0x4e, 0x45, 0x3c, 0x5d, 0x53, 0x4c, 0x5a, 0x3e, 0x3a, 0x51,
-    0x3a, 0x22, 0x35, 0x59, 0x40, 0x5a, 0x43, 0x46, 0x41, 0x32, 0x44, 0x4b,
-    0x47, 0x04, 0x4c, 0x3a, 0x4a, 0x49, 0x48, 0x3d, 0x45, 0x2b, 0x50, 0x41,
-    0x3e, 0x44, 0x4f, 0x43, 0x4a, 0x3f, 0x48, 0x4b, 0x53, 0x49, 0x4b, 0x38,
-    0x44, 0x40, 0x48, 0x4c, 0x41, 0x3f, 0x47, 0x3e, 0x47, 0x49, 0x45, 0x42,
-    0x43, 0x3e, 0x46, 0x44, 0x53, 0x4d, 0x48, 0x44, 0x45, 0x42, 0x43, 0x53,
-    0x55, 0x49, 0x4d, 0x4b, 0x45, 0x44, 0x47, 0x5f, 0x48, 0x44, 0x4a, 0x48,
-    0x45, 0x4d, 0x4f, 0x5e, 0x4e, 0x46, 0x49, 0x49, 0x4d, 0x49, 0x44, 0x48,
-    0x4d, 0x41, 0x50, 0x48, 0x3d, 0x3f, 0x4d, 0x38, 0x46, 0x4a, 0x50, 0x4a,
-    0x45, 0x3e, 0x43, 0x36, 0x42, 0x48, 0x53, 0x54, 0x49, 0x43, 0x4b, 0x3a,
-    0x45, 0x48, 0x50, 0x45, 0x4a, 0x4c, 0x4a, 0x4d, 0x43, 0x4c, 0x55, 0x4e,
-    0x4c, 0x42, 0x45, 0x52, 0x52, 0x45, 0x46, 0x40, 0x54, 0x4c, 0x3d, 0x4e,
-    0x49, 0x4e, 0x44, 0x47, 0x45, 0x48, 0x4b, 0x50, 0x49, 0x4b, 0x44, 0x4b,
-    0x4f, 0x49, 0x47, 0x47, 0x53, 0x3f, 0x4b, 0x42, 0x45, 0x3e, 0x4d, 0x4d,
-    0x48, 0x51, 0x45, 0x40, 0x43, 0x43, 0x4e, 0x44, 0x51, 0x55, 0x4a, 0x3e,
-    0x45, 0x55, 0x58, 0x50, 0x50, 0x38, 0x44, 0x4f, 0x3b, 0x23, 0x3c, 0x55,
-    0x3c, 0x54, 0x49, 0x42, 0x44, 0x2f, 0x3e, 0x47, 0x42, 0x01, 0x42, 0x37,
-    0x3f, 0x42, 0x45, 0x45, 0x47, 0x2a, 0x52, 0x4b, 0x45, 0x3c, 0x47, 0x44,
-    0x44, 0x40, 0x50, 0x53, 0x48, 0x42, 0x4d, 0x36, 0x50, 0x3d, 0x49, 0x44,
-    0x4f, 0x4c, 0x4a, 0x42, 0x4d, 0x3e, 0x3d, 0x3f, 0x4e, 0x44, 0x4d, 0x4e,
-    0x54, 0x3d, 0x42, 0x46, 0x49, 0x47, 0x4b, 0x53, 0x45, 0x46, 0x47, 0x4a,
-    0x45, 0x3d, 0x4a, 0x5f, 0x51, 0x3e, 0x45, 0x45, 0x44, 0x3a, 0x4d, 0x57,
-    0x45, 0x47, 0x4d, 0x45, 0x4e, 0x4b, 0x51, 0x48, 0x4b, 0x4a, 0x3c, 0x4e,
-    0x51, 0x41, 0x4d, 0x36, 0x47, 0x4a, 0x46, 0x51, 0x4e, 0x4c, 0x52, 0x41,
-    0x55, 0x47, 0x41, 0x47, 0x4d, 0x47, 0x4b, 0x3d, 0x4a, 0x4a, 0x46, 0x49,
-    0x4d, 0x48, 0x46, 0x46, 0x4d, 0x52, 0x52, 0x48, 0x49, 0x3f, 0x4b, 0x4e,
-    0x4c, 0x49, 0x45, 0x47, 0x41, 0x4b, 0x44, 0x48, 0x52, 0x4b, 0x53, 0x44,
-    0x46, 0x4e, 0x44, 0x49, 0x52, 0x50, 0x46, 0x4b, 0x44, 0x43, 0x50, 0x49,
-    0x4a, 0x53, 0x45, 0x49, 0x52, 0x3f, 0x4a, 0x4e, 0x49, 0x4c, 0x4d, 0x4d,
-    0x40, 0x40, 0x3f, 0x4a, 0x47, 0x56, 0x51, 0x43, 0x40, 0x5a, 0x58, 0x52,
-    0x4f, 0x3d, 0x3d, 0x45, 0x38, 0x29, 0x33, 0x59, 0x45, 0x54, 0x3c, 0x42,
-    0x3f, 0x27, 0x3e, 0x49, 0x48, 0x06, 0x4a, 0x3f, 0x41, 0x49, 0x4c, 0x48,
-    0x46, 0x2b, 0x4a, 0x4f, 0x44, 0x46, 0x4c, 0x46, 0x4a, 0x3b, 0x4d, 0x4a,
-    0x40, 0x41, 0x45, 0x38, 0x51, 0x39, 0x46, 0x46, 0x41, 0x51, 0x4e, 0x41,
-    0x49, 0x44, 0x48, 0x4a, 0x4b, 0x46, 0x47, 0x46, 0x4a, 0x4c, 0x47, 0x48,
-    0x3d, 0x42, 0x50, 0x4f, 0x50, 0x4a, 0x4a, 0x48, 0x4a, 0x45, 0x45, 0x61,
-    0x4a, 0x4c, 0x49, 0x3d, 0x4b, 0x4a, 0x4a, 0x5a, 0x48, 0x49, 0x50, 0x4f,
-    0x42, 0x48, 0x3e, 0x44, 0x43, 0x3b, 0x4f, 0x54, 0x4b, 0x4a, 0x47, 0x31,
-    0x4a, 0x49, 0x47, 0x4e, 0x48, 0x48, 0x46, 0x42, 0x4a, 0x45, 0x4c, 0x49,
-    0x4b, 0x4e, 0x53, 0x43, 0x4c, 0x49, 0x4f, 0x4b, 0x46, 0x4c, 0x4b, 0x4e,
-    0x51, 0x4b, 0x49, 0x52, 0x44, 0x55, 0x45, 0x49, 0x4b, 0x4a, 0x50, 0x4c,
-    0x4d, 0x4a, 0x4b, 0x48, 0x41, 0x46, 0x47, 0x43, 0x4b, 0x3f, 0x54, 0x4a,
-    0x46, 0x49, 0x51, 0x48, 0x4e, 0x4a, 0x41, 0x52, 0x52, 0x4e, 0x53, 0x47,
-    0x42, 0x48, 0x43, 0x44, 0x54, 0x51, 0x40, 0x49, 0x4c, 0x48, 0x49, 0x44,
-    0x4c, 0x56, 0x52, 0x49, 0x3d, 0x59, 0x4f, 0x56, 0x56, 0x42, 0x46, 0x45,
-    0x3e, 0x28, 0x3f, 0x5b, 0x3f, 0x5a, 0x4c, 0x42, 0x44, 0x22, 0x3f, 0x46,
-    0x47, 0x0d, 0x3e, 0x41, 0x45, 0x49, 0x4a, 0x3b, 0x45, 0x2d, 0x4d, 0x4a,
-    0x44, 0x43, 0x49, 0x46, 0x4b, 0x47, 0x49, 0x45, 0x4e, 0x40, 0x4c, 0x3c,
-    0x42, 0x3e, 0x4b, 0x50, 0x48, 0x49, 0x4c, 0x42, 0x3c, 0x43, 0x50, 0x43,
-    0x49, 0x4e, 0x4e, 0x43, 0x46, 0x4c, 0x48, 0x4a, 0x43, 0x4c, 0x49, 0x4e,
-    0x47, 0x44, 0x50, 0x4c, 0x4a, 0x48, 0x47, 0x5f, 0x3f, 0x3e, 0x48, 0x4f,
-    0x4f, 0x49, 0x4a, 0x5f, 0x4e, 0x40, 0x4e, 0x48, 0x47, 0x44, 0x40, 0x4d,
-    0x3f, 0x4a, 0x53, 0x45, 0x3e, 0x50, 0x3f, 0x39, 0x50, 0x45, 0x45, 0x4b,
-    0x43, 0x41, 0x46, 0x41, 0x49, 0x47, 0x4b, 0x41, 0x3c, 0x4b, 0x46, 0x3f,
-    0x41, 0x4a, 0x4e, 0x4c, 0x49, 0x4c, 0x3f, 0x44, 0x53, 0x4c, 0x45, 0x49,
-    0x48, 0x4d, 0x48, 0x4a, 0x48, 0x4f, 0x45, 0x4d, 0x48, 0x4c, 0x41, 0x49,
-    0x42, 0x48, 0x53, 0x46, 0x4a, 0x46, 0x4b, 0x4f, 0x4c, 0x52, 0x4c, 0x51,
-    0x41, 0x4d, 0x49, 0x41, 0x49, 0x4f, 0x49, 0x42, 0x4a, 0x48, 0x51, 0x4a,
-    0x44, 0x4d, 0x55, 0x48, 0x47, 0x4d, 0x4d, 0x45, 0x42, 0x60, 0x4a, 0x51,
-    0x42, 0x54, 0x56, 0x56, 0x50, 0x4a, 0x3f, 0x4a, 0x40, 0x25, 0x3a, 0x59,
-    0x46, 0x58, 0x52, 0x46, 0x41, 0x28, 0x3d, 0x3e, 0x45, 0x13, 0x47, 0x41,
-    0x3d, 0x44, 0x48, 0x45, 0x49, 0x26, 0x46, 0x4c, 0x3b, 0x4a, 0x42, 0x47,
-    0x46, 0x41, 0x44, 0x52, 0x50, 0x4a, 0x4f, 0x40, 0x4b, 0x39, 0x42, 0x45,
-    0x4a, 0x4d, 0x4f, 0x3f, 0x42, 0x4f, 0x49, 0x45, 0x42, 0x4a, 0x46, 0x47,
-    0x48, 0x40, 0x4a, 0x46, 0x41, 0x3b, 0x48, 0x55, 0x4b, 0x4e, 0x4e, 0x48,
-    0x4b, 0x44, 0x46, 0x53, 0x48, 0x45, 0x4b, 0x53, 0x49, 0x43, 0x4a, 0x5c,
-    0x46, 0x45, 0x45, 0x49, 0x49, 0x49, 0x4c, 0x43, 0x4e, 0x4a, 0x41, 0x4a,
-    0x42, 0x43, 0x4a, 0x38, 0x44, 0x4a, 0x4b, 0x3f, 0x45, 0x49, 0x45, 0x38,
-    0x43, 0x40, 0x45, 0x4c, 0x47, 0x42, 0x3f, 0x42, 0x3e, 0x4a, 0x43, 0x50,
-    0x4a, 0x4e, 0x4f, 0x47, 0x4d, 0x49, 0x49, 0x47, 0x4a, 0x4d, 0x46, 0x4c,
-    0x4f, 0x3d, 0x52, 0x4a, 0x41, 0x44, 0x4b, 0x50, 0x4c, 0x52, 0x49, 0x50,
-    0x4b, 0x45, 0x49, 0x4d, 0x48, 0x55, 0x50, 0x47, 0x4e, 0x50, 0x4f, 0x48,
-    0x46, 0x4d, 0x4d, 0x41, 0x48, 0x51, 0x4b, 0x4c, 0x47, 0x51, 0x42, 0x42,
-    0x4d, 0x47, 0x43, 0x4c, 0x4c, 0x5a, 0x4e, 0x47, 0x3b, 0x59, 0x51, 0x57,
-    0x4c, 0x40, 0x46, 0x4c, 0x37, 0x2a, 0x35, 0x58, 0x44, 0x5b, 0x4c, 0x44,
-    0x3e, 0x2e, 0x3f, 0x43, 0x46, 0x23, 0x49, 0x3e, 0x41, 0x3f, 0x4b, 0x3e,
-    0x4e, 0x2f, 0x4d, 0x4a, 0x4e, 0x40, 0x4e, 0x41, 0x40, 0x3f, 0x4a, 0x42,
-    0x4d, 0x4c, 0x44, 0x47, 0x4e, 0x44, 0x40, 0x43, 0x4d, 0x49, 0x4f, 0x3d,
-    0x49, 0x3f, 0x51, 0x48, 0x42, 0x4a, 0x49, 0x47, 0x49, 0x46, 0x4a, 0x45,
-    0x45, 0x49, 0x53, 0x4d, 0x4c, 0x4e, 0x44, 0x50, 0x4b, 0x43, 0x4e, 0x5f,
-    0x3c, 0x40, 0x44, 0x46, 0x48, 0x4b, 0x42, 0x62, 0x4e, 0x50, 0x4c, 0x49,
-    0x4a, 0x4f, 0x44, 0x53, 0x42, 0x43, 0x49, 0x48, 0x4b, 0x3c, 0x4a, 0x37,
-    0x4c, 0x41, 0x49, 0x46, 0x46, 0x47, 0x43, 0x40, 0x4d, 0x4d, 0x4a, 0x48,
-    0x50, 0x4b, 0x50, 0x41, 0x44, 0x3e, 0x51, 0x47, 0x44, 0x4a, 0x44, 0x45,
-    0x48, 0x4d, 0x52, 0x4e, 0x44, 0x48, 0x4d, 0x43, 0x42, 0x45, 0x48, 0x52,
-    0x44, 0x42, 0x50, 0x42, 0x4d, 0x45, 0x48, 0x4d, 0x4f, 0x4e, 0x45, 0x49,
-    0x51, 0x48, 0x4f, 0x53, 0x4d, 0x4c, 0x48, 0x50, 0x4e, 0x4d, 0x50, 0x48,
-    0x49, 0x42, 0x4c, 0x42, 0x4b, 0x4b, 0x49, 0x48, 0x48, 0x49, 0x4a, 0x54,
-    0x44, 0x57, 0x4d, 0x4b, 0x3f, 0x56, 0x53, 0x5c, 0x50, 0x4e, 0x46, 0x49,
-    0x40, 0x24, 0x44, 0x58, 0x49, 0x54, 0x48, 0x49, 0x41, 0x22, 0x44, 0x3f,
-    0x48, 0x1c, 0x4d, 0x39, 0x3e, 0x4c, 0x3d, 0x4a, 0x48, 0x2d, 0x48, 0x3e,
-    0x3f, 0x3a, 0x46, 0x4e, 0x44, 0x43, 0x49, 0x51, 0x4d, 0x3c, 0x44, 0x41,
-    0x4e, 0x44, 0x42, 0x4c, 0x45, 0x48, 0x45, 0x46, 0x42, 0x46, 0x47, 0x42,
-    0x4f, 0x45, 0x47, 0x44, 0x48, 0x47, 0x4a, 0x42, 0x4d, 0x48, 0x3e, 0x53,
-    0x47, 0x4b, 0x44, 0x4b, 0x45, 0x4a, 0x50, 0x55, 0x4c, 0x45, 0x48, 0x43,
-    0x53, 0x3d, 0x4e, 0x5f, 0x42, 0x44, 0x4a, 0x4f, 0x3f, 0x48, 0x4e, 0x4b,
-    0x43, 0x48, 0x43, 0x41, 0x4a, 0x4b, 0x51, 0x39, 0x52, 0x46, 0x44, 0x49,
-    0x48, 0x45, 0x4c, 0x40, 0x45, 0x49, 0x51, 0x48, 0x45, 0x42, 0x45, 0x48,
-    0x40, 0x43, 0x3d, 0x47, 0x53, 0x54, 0x4d, 0x4a, 0x4a, 0x47, 0x48, 0x43,
-    0x4c, 0x46, 0x43, 0x4f, 0x49, 0x4c, 0x3f, 0x3d, 0x4b, 0x41, 0x40, 0x48,
-    0x4e, 0x4c, 0x4b, 0x40, 0x4c, 0x43, 0x49, 0x4d, 0x47, 0x4f, 0x47, 0x42,
-    0x47, 0x4a, 0x4d, 0x4f, 0x46, 0x4d, 0x51, 0x49, 0x48, 0x4d, 0x4e, 0x46,
-    0x47, 0x41, 0x44, 0x4d, 0x4b, 0x55, 0x4b, 0x4c, 0x41, 0x5e, 0x50, 0x45,
-    0x40, 0x55, 0x4b, 0x60, 0x55, 0x47, 0x3d, 0x4a, 0x42, 0x22, 0x46, 0x5a,
-    0x47, 0x53, 0x49, 0x44, 0x44, 0x27, 0x41, 0x4f, 0x3e, 0x22, 0x4a, 0x44,
-    0x49, 0x3e, 0x4e, 0x4d, 0x3f, 0x3a, 0x4c, 0x44, 0x4a, 0x44, 0x46, 0x51,
-    0x4f, 0x42, 0x4c, 0x4e, 0x39, 0x4b, 0x42, 0x39, 0x4b, 0x3e, 0x4f, 0x47,
-    0x4a, 0x4f, 0x3f, 0x4d, 0x43, 0x4c, 0x4a, 0x4b, 0x4b, 0x3d, 0x51, 0x46,
-    0x49, 0x4c, 0x47, 0x44, 0x43, 0x3d, 0x3c, 0x54, 0x4a, 0x47, 0x4d, 0x50,
-    0x4a, 0x46, 0x51, 0x62, 0x46, 0x4d, 0x4b, 0x46, 0x49, 0x3c, 0x50, 0x57,
-    0x47, 0x40, 0x3e, 0x4c, 0x4b, 0x3f, 0x55, 0x46, 0x3d, 0x45, 0x42, 0x4e,
-    0x50, 0x49, 0x46, 0x3a, 0x4c, 0x47, 0x4a, 0x49, 0x42, 0x42, 0x4a, 0x44,
-    0x42, 0x40, 0x49, 0x54, 0x46, 0x4b, 0x47, 0x45, 0x51, 0x47, 0x41, 0x42,
-    0x49, 0x50, 0x4e, 0x48, 0x4b, 0x4b, 0x47, 0x4a, 0x47, 0x49, 0x4b, 0x45,
-    0x4b, 0x54, 0x48, 0x54, 0x4b, 0x49, 0x51, 0x4a, 0x4a, 0x40, 0x46, 0x42,
-    0x44, 0x44, 0x4d, 0x4b, 0x47, 0x43, 0x45, 0x41, 0x3e, 0x49, 0x43, 0x51,
-    0x3e, 0x4b, 0x52, 0x46, 0x48, 0x3f, 0x4e, 0x51, 0x51, 0x49, 0x3f, 0x48,
-    0x4c, 0x4c, 0x52, 0x47, 0x43, 0x57, 0x44, 0x42, 0x40, 0x52, 0x50, 0x5d,
-    0x4f, 0x40, 0x42, 0x45, 0x46, 0x26, 0x3c, 0x51, 0x4b, 0x4e, 0x4b, 0x49,
-    0x46, 0x35, 0x49, 0x53, 0x49, 0x2b, 0x4d, 0x3e, 0x50, 0x44, 0x4f, 0x54,
-    0x46, 0x34, 0x49, 0x4d, 0x42, 0x45, 0x44, 0x4b, 0x52, 0x44, 0x52, 0x41,
-    0x4d, 0x4c, 0x52, 0x41, 0x49, 0x3a, 0x4e, 0x49, 0x40, 0x4b, 0x45, 0x4d,
-    0x4b, 0x4a, 0x47, 0x49, 0x45, 0x49, 0x4d, 0x50, 0x3e, 0x47, 0x44, 0x51,
-    0x4c, 0x41, 0x45, 0x50, 0x47, 0x41, 0x4a, 0x52, 0x4b, 0x3d, 0x4b, 0x5b,
-    0x4c, 0x4c, 0x4d, 0x3f, 0x47, 0x44, 0x49, 0x5d, 0x4a, 0x53, 0x44, 0x45,
-    0x45, 0x46, 0x3d, 0x4f, 0x50, 0x3b, 0x44, 0x4e, 0x40, 0x41, 0x4c, 0x3a,
-    0x4a, 0x45, 0x49, 0x48, 0x45, 0x4a, 0x45, 0x36, 0x45, 0x4d, 0x4c, 0x49,
-    0x3f, 0x47, 0x4d, 0x40, 0x53, 0x48, 0x49, 0x4c, 0x47, 0x4f, 0x42, 0x44,
-    0x45, 0x40, 0x4a, 0x4c, 0x49, 0x4f, 0x4b, 0x4d, 0x42, 0x45, 0x3e, 0x4a,
-    0x48, 0x4a, 0x49, 0x50, 0x4c, 0x53, 0x50, 0x45, 0x4b, 0x4c, 0x46, 0x4f,
-    0x44, 0x43, 0x54, 0x50, 0x3f, 0x48, 0x42, 0x4b, 0x43, 0x3f, 0x4d, 0x4c,
-    0x43, 0x49, 0x4a, 0x47, 0x54, 0x4b, 0x4f, 0x4d, 0x44, 0x47, 0x49, 0x4e,
-    0x4e, 0x55, 0x40, 0x46, 0x44, 0x56, 0x4e, 0x65, 0x4f, 0x3f, 0x43, 0x48,
-    0x39, 0x27, 0x43, 0x55, 0x4b, 0x4c, 0x44, 0x46, 0x42, 0x34, 0x44, 0x52,
-    0x43, 0x22, 0x4e, 0x41, 0x49, 0x48, 0x49, 0x51, 0x3b, 0x37, 0x4b, 0x40,
-    0x4f, 0x45, 0x53, 0x4c, 0x47, 0x46, 0x47, 0x4c, 0x3e, 0x44, 0x45, 0x49,
-    0x48, 0x50, 0x45, 0x40, 0x46, 0x4c, 0x47, 0x4d, 0x44, 0x48, 0x49, 0x50,
-    0x4f, 0x4a, 0x46, 0x55, 0x4e, 0x42, 0x4c, 0x4c, 0x50, 0x48, 0x3d, 0x55,
-    0x46, 0x3e, 0x4a, 0x4b, 0x4f, 0x46, 0x46, 0x60, 0x50, 0x3f, 0x55, 0x40,
-    0x42, 0x44, 0x48, 0x63, 0x50, 0x3d, 0x45, 0x4f, 0x4e, 0x41, 0x47, 0x48,
-    0x4a, 0x3c, 0x3d, 0x46, 0x3f, 0x42, 0x43, 0x37, 0x4f, 0x4f, 0x50, 0x47,
-    0x47, 0x4b, 0x52, 0x40, 0x3f, 0x44, 0x4a, 0x40, 0x4d, 0x44, 0x4e, 0x37,
-    0x43, 0x48, 0x47, 0x3f, 0x51, 0x4d, 0x45, 0x42, 0x41, 0x46, 0x3d, 0x53,
-    0x4f, 0x4b, 0x54, 0x45, 0x51, 0x40, 0x4a, 0x4a, 0x48, 0x4f, 0x43, 0x4a,
-    0x4f, 0x4c, 0x4c, 0x4f, 0x48, 0x4c, 0x44, 0x4e, 0x43, 0x46, 0x4f, 0x4a,
-    0x43, 0x41, 0x49, 0x49, 0x47, 0x53, 0x45, 0x49, 0x4e, 0x46, 0x4c, 0x4e,
-    0x3c, 0x49, 0x44, 0x45, 0x4c, 0x42, 0x49, 0x41, 0x48, 0x58, 0x54, 0x4d,
-    0x35, 0x52, 0x4e, 0x5b, 0x4f, 0x40, 0x3e, 0x46, 0x46, 0x36, 0x3d, 0x60,
-    0x4d, 0x49, 0x4a, 0x43, 0x44, 0x36, 0x49, 0x67, 0x4a, 0x2d, 0x4b, 0x40,
-    0x3f, 0x49, 0x43, 0x5f, 0x45, 0x3c, 0x49, 0x4c, 0x4a, 0x43, 0x48, 0x55,
-    0x49, 0x46, 0x49, 0x46, 0x44, 0x4e, 0x42, 0x4e, 0x40, 0x45, 0x42, 0x52,
-    0x4a, 0x40, 0x4a, 0x44, 0x40, 0x45, 0x54, 0x3d, 0x4c, 0x3e, 0x4c, 0x55,
-    0x4d, 0x45, 0x4d, 0x51, 0x4a, 0x4b, 0x44, 0x5b, 0x48, 0x3d, 0x3e, 0x46,
-    0x4f, 0x4d, 0x3f, 0x62, 0x4d, 0x45, 0x3f, 0x47, 0x47, 0x47, 0x44, 0x5b,
-    0x4b, 0x4f, 0x51, 0x4c, 0x4a, 0x47, 0x48, 0x5b, 0x47, 0x40, 0x4a, 0x47,
-    0x42, 0x44, 0x46, 0x46, 0x45, 0x48, 0x4a, 0x3f, 0x40, 0x4f, 0x48, 0x3a,
-    0x49, 0x52, 0x4a, 0x53, 0x43, 0x4c, 0x4b, 0x4a, 0x4a, 0x4a, 0x4e, 0x42,
-    0x4b, 0x46, 0x3d, 0x50, 0x51, 0x4b, 0x4b, 0x4f, 0x50, 0x4c, 0x4f, 0x4c,
-    0x4d, 0x41, 0x41, 0x3c, 0x40, 0x43, 0x54, 0x51, 0x48, 0x3d, 0x48, 0x51,
-    0x42, 0x42, 0x4c, 0x4e, 0x4d, 0x4b, 0x49, 0x43, 0x48, 0x47, 0x4b, 0x49,
-    0x49, 0x4e, 0x4d, 0x46, 0x4c, 0x52, 0x49, 0x49, 0x51, 0x4e, 0x45, 0x47,
-    0x44, 0x47, 0x42, 0x4a, 0x46, 0x59, 0x48, 0x48, 0x4b, 0x4f, 0x4c, 0x5e,
-    0x5c, 0x45, 0x3f, 0x48, 0x3d, 0x3f, 0x37, 0x5a, 0x4b, 0x4b, 0x45, 0x49,
-    0x3e, 0x42, 0x41, 0x6b, 0x49, 0x2d, 0x45, 0x43, 0x47, 0x45, 0x49, 0x61,
-    0x3d, 0x3b, 0x49, 0x43, 0x49, 0x4b, 0x4b, 0x55, 0x4b, 0x47, 0x46, 0x46,
-    0x48, 0x4d, 0x49, 0x4f, 0x4a, 0x4c, 0x42, 0x51, 0x41, 0x44, 0x45, 0x4f,
-    0x4e, 0x44, 0x3f, 0x55, 0x3e, 0x4a, 0x45, 0x50, 0x46, 0x42, 0x41, 0x49,
-    0x49, 0x47, 0x49, 0x61, 0x47, 0x40, 0x41, 0x4e, 0x4d, 0x4b, 0x4a, 0x5e,
-    0x52, 0x49, 0x4b, 0x52, 0x51, 0x55, 0x42, 0x61, 0x53, 0x4c, 0x48, 0x4a,
-    0x4e, 0x48, 0x48, 0x57, 0x4c, 0x40, 0x40, 0x48, 0x45, 0x43, 0x3e, 0x46,
-    0x43, 0x4a, 0x45, 0x45, 0x44, 0x4f, 0x44, 0x40, 0x49, 0x48, 0x4e, 0x49,
-    0x4a, 0x4e, 0x49, 0x51, 0x46, 0x4f, 0x47, 0x44, 0x42, 0x4d, 0x43, 0x4e,
-    0x4f, 0x4d, 0x44, 0x51, 0x47, 0x49, 0x40, 0x57, 0x4b, 0x49, 0x47, 0x4c,
-    0x4d, 0x4d, 0x3e, 0x47, 0x45, 0x41, 0x50, 0x4b, 0x4b, 0x45, 0x42, 0x4e,
-    0x48, 0x47, 0x4e, 0x4b, 0x56, 0x4c, 0x4f, 0x52, 0x51, 0x49, 0x4d, 0x4a,
-    0x4b, 0x52, 0x4d, 0x55, 0x4b, 0x4e, 0x4e, 0x4b, 0x51, 0x57, 0x47, 0x42,
-    0x49, 0x48, 0x56, 0x44, 0x52, 0x56, 0x53, 0x5a, 0x63, 0x53, 0x4c, 0x4c,
-    0x43, 0x56, 0x3c, 0x57, 0x47, 0x47, 0x4d, 0x52, 0x43, 0x48, 0x45, 0x5f,
-    0x45, 0x29, 0x47, 0x45, 0x48, 0x40, 0x41, 0x4b, 0x3f, 0x39, 0x49, 0x4e,
-    0x47, 0x55, 0x42, 0x56, 0x4d, 0x43, 0x48, 0x44, 0x45, 0x53, 0x43, 0x46,
-    0x49, 0x43, 0x49, 0x4a, 0x40, 0x4e, 0x4a, 0x4a, 0x47, 0x43, 0x45, 0x4d,
-    0x4a, 0x47, 0x3f, 0x53, 0x45, 0x43, 0x4b, 0x4c, 0x42, 0x47, 0x47, 0x5f,
-    0x48, 0x48, 0x46, 0x44, 0x50, 0x47, 0x41, 0x64, 0x4e, 0x46, 0x49, 0x4a,
-    0x4d, 0x55, 0x42, 0x55, 0x46, 0x3d, 0x49, 0x43, 0x52, 0x52, 0x47, 0x52,
-    0x4e, 0x46, 0x47, 0x41, 0x49, 0x4d, 0x50, 0x47, 0x42, 0x49, 0x41, 0x42,
-    0x4b, 0x48, 0x49, 0x42, 0x4d, 0x48, 0x51, 0x54, 0x43, 0x56, 0x4c, 0x52,
-    0x53, 0x4d, 0x54, 0x4a, 0x51, 0x50, 0x48, 0x4c, 0x4e, 0x48, 0x4c, 0x4c,
-    0x52, 0x49, 0x4a, 0x4e, 0x4e, 0x41, 0x4f, 0x53, 0x49, 0x52, 0x42, 0x4b,
-    0x50, 0x46, 0x50, 0x4a, 0x53, 0x56, 0x46, 0x4f, 0x4b, 0x49, 0x3d, 0x41,
-    0x4c, 0x52, 0x42, 0x50, 0x4d, 0x45, 0x4e, 0x51, 0x4b, 0x4c, 0x46, 0x42,
-    0x41, 0x4b, 0x40, 0x4a, 0x42, 0x57, 0x4f, 0x43, 0x40, 0x50, 0x4c, 0x51,
-    0x4f, 0x48, 0x3a, 0x4e, 0x51, 0x40, 0x49, 0x66, 0x4b, 0x42, 0x48, 0x3c,
-    0x5b, 0x47, 0x53, 0x40, 0x4a, 0x48, 0x35, 0x44, 0x5f, 0x50, 0x4a, 0x3c,
-    0x41, 0x45, 0x48, 0x3b, 0x42, 0x59, 0x43, 0x4b, 0x48, 0x49, 0x4a, 0x40,
-    0x4f, 0x5c, 0x50, 0x54, 0x53, 0x55, 0x4c, 0x4a, 0x43, 0x46, 0x49, 0x47,
-    0x49, 0x48, 0x4b, 0x43, 0x42, 0x44, 0x42, 0x46, 0x44, 0x3f, 0x4b, 0x42,
-    0x4d, 0x49, 0x41, 0x46, 0x47, 0x51, 0x51, 0x44, 0x4c, 0x54, 0x4e, 0x4b,
-    0x42, 0x52, 0x4e, 0x4c, 0x4b, 0x4a, 0x50, 0x4e, 0x44, 0x4b, 0x4e, 0x4e,
-    0x4f, 0x42, 0x4b, 0x48, 0x46, 0x43, 0x48, 0x54, 0x4b, 0x4e, 0x48, 0x4f,
-    0x4a, 0x4d, 0x43, 0x4e, 0x47, 0x50, 0x4a, 0x44, 0x47, 0x52, 0x46, 0x53,
-    0x4a, 0x40, 0x46, 0x54, 0x50, 0x4a, 0x47, 0x51, 0x49, 0x45, 0x4b, 0x4e,
-    0x4b, 0x46, 0x4c, 0x4c, 0x52, 0x47, 0x45, 0x45, 0x4a, 0x47, 0x4c, 0x52,
-    0x44, 0x51, 0x47, 0x42, 0x47, 0x43, 0x43, 0x49, 0x52, 0x5a, 0x55, 0x3e,
-    0x45, 0x4b, 0x4c, 0x46, 0x4f, 0x4b, 0x45, 0x49, 0x4a, 0x4e, 0x4a, 0x50,
-    0x3e, 0x4e, 0x42, 0x4e, 0x44, 0x55, 0x3d, 0x4a, 0x4d, 0x49, 0x4d, 0x42,
-    0x49, 0x4e, 0x50, 0x44, 0x4b, 0x3c, 0x41, 0x49, 0x51, 0x49, 0x3c, 0x4e,
-    0x4c, 0x39, 0x4c, 0x72, 0x44, 0x4b, 0x49, 0x42, 0x5f, 0x48, 0x4a, 0x48,
-    0x41, 0x4c, 0x43, 0x40, 0x62, 0x5e, 0x47, 0x3c, 0x4a, 0x4c, 0x55, 0x49,
-    0x4b, 0x52, 0x4e, 0x4b, 0x4d, 0x48, 0x4c, 0x3c, 0x3f, 0x4f, 0x4e, 0x48,
-    0x45, 0x55, 0x4a, 0x46, 0x48, 0x3d, 0x45, 0x44, 0x4b, 0x4a, 0x46, 0x3a,
-    0x4e, 0x44, 0x4d, 0x49, 0x49, 0x49, 0x40, 0x3e, 0x40, 0x47, 0x48, 0x43,
-    0x3f, 0x51, 0x46, 0x4c, 0x45, 0x4c, 0x49, 0x44, 0x3e, 0x57, 0x49, 0x4e,
-    0x48, 0x3f, 0x48, 0x47, 0x53, 0x4d, 0x50, 0x51, 0x49, 0x42, 0x45, 0x44,
-    0x49, 0x49, 0x46, 0x4b, 0x45, 0x49, 0x4f, 0x49, 0x46, 0x48, 0x4c, 0x55,
-    0x46, 0x51, 0x48, 0x4a, 0x48, 0x54, 0x4b, 0x5a, 0x4c, 0x47, 0x40, 0x47,
-    0x40, 0x55, 0x50, 0x52, 0x4a, 0x4b, 0x4f, 0x49, 0x4b, 0x50, 0x4b, 0x5b,
-    0x51, 0x53, 0x4f, 0x4e, 0x49, 0x48, 0x44, 0x52, 0x46, 0x4e, 0x47, 0x48,
-    0x44, 0x43, 0x49, 0x55, 0x48, 0x58, 0x4f, 0x46, 0x45, 0x53, 0x45, 0x4a,
-    0x4c, 0x4c, 0x49, 0x46, 0x47, 0x4d, 0x41, 0x4d, 0x4f, 0x59, 0x4a, 0x49,
-    0x46, 0x4e, 0x44, 0x49, 0x4d, 0x48, 0x54, 0x47, 0x48, 0x4e, 0x48, 0x43,
-    0x46, 0x41, 0x46, 0x44, 0x52, 0x46, 0x42, 0x4c, 0x4c, 0x31, 0x4d, 0x6f,
-    0x51, 0x4f, 0x4d, 0x43, 0x5c, 0x48, 0x49, 0x49, 0x46, 0x4c, 0x43, 0x3b,
-    0x5d, 0x63, 0x58, 0x46, 0x49, 0x45, 0x4e, 0x48, 0x49, 0x5d, 0x45, 0x50,
-    0x56, 0x4d, 0x57, 0x37, 0x40, 0x55, 0x43, 0x4b, 0x4e, 0x46, 0x4c, 0x3b,
-    0x3d, 0x4b, 0x49, 0x4b, 0x52, 0x47, 0x4d, 0x34, 0x4c, 0x4c, 0x47, 0x4e,
-    0x4d, 0x4c, 0x3d, 0x3f, 0x4a, 0x49, 0x44, 0x45, 0x4a, 0x54, 0x43, 0x44,
-    0x50, 0x4b, 0x4d, 0x4c, 0x4e, 0x48, 0x46, 0x51, 0x43, 0x48, 0x48, 0x48,
-    0x42, 0x44, 0x4e, 0x48, 0x47, 0x45, 0x48, 0x51, 0x53, 0x4a, 0x4f, 0x58,
-    0x42, 0x4d, 0x48, 0x4f, 0x4c, 0x45, 0x4a, 0x57, 0x4b, 0x43, 0x4d, 0x4b,
-    0x4a, 0x4e, 0x4c, 0x5f, 0x3f, 0x4f, 0x4a, 0x42, 0x4b, 0x48, 0x4d, 0x62,
-    0x4f, 0x4b, 0x50, 0x4c, 0x45, 0x49, 0x44, 0x53, 0x4a, 0x4f, 0x45, 0x56,
-    0x4b, 0x44, 0x41, 0x53, 0x49, 0x48, 0x4d, 0x49, 0x47, 0x4b, 0x46, 0x4c,
-    0x49, 0x4b, 0x4c, 0x54, 0x4f, 0x4b, 0x47, 0x49, 0x44, 0x4a, 0x4e, 0x53,
-    0x4f, 0x49, 0x54, 0x4e, 0x4a, 0x48, 0x42, 0x54, 0x51, 0x46, 0x4b, 0x52,
-    0x45, 0x48, 0x51, 0x4a, 0x40, 0x4a, 0x50, 0x45, 0x4a, 0x46, 0x49, 0x46,
-    0x54, 0x46, 0x42, 0x48, 0x50, 0x36, 0x4a, 0x6b, 0x46, 0x59, 0x51, 0x47,
-    0x5f, 0x4d, 0x43, 0x4d, 0x44, 0x4d, 0x42, 0x3b, 0x65, 0x6a, 0x56, 0x48,
-    0x4d, 0x4c, 0x52, 0x4a, 0x4d, 0x61, 0x52, 0x4b, 0x47, 0x4f, 0x48, 0x49,
-    0x3f, 0x5b, 0x45, 0x51, 0x48, 0x48, 0x4b, 0x3c, 0x3b, 0x4c, 0x54, 0x52,
-    0x4f, 0x51, 0x53, 0x31, 0x47, 0x4c, 0x45, 0x4a, 0x42, 0x4b, 0x47, 0x40,
-    0x41, 0x49, 0x4c, 0x46, 0x4b, 0x53, 0x46, 0x49, 0x44, 0x4b, 0x4e, 0x4b,
-    0x48, 0x51, 0x49, 0x4d, 0x4b, 0x3f, 0x42, 0x44, 0x45, 0x43, 0x46, 0x56,
-    0x42, 0x4b, 0x49, 0x4e, 0x4e, 0x53, 0x42, 0x5c, 0x4b, 0x46, 0x49, 0x46,
-    0x4e, 0x41, 0x42, 0x67, 0x41, 0x49, 0x4d, 0x48, 0x49, 0x4e, 0x3f, 0x61,
-    0x48, 0x4a, 0x40, 0x42, 0x4c, 0x51, 0x50, 0x63, 0x49, 0x44, 0x49, 0x47,
-    0x45, 0x4d, 0x49, 0x61, 0x3f, 0x48, 0x40, 0x41, 0x49, 0x49, 0x45, 0x57,
-    0x45, 0x46, 0x4d, 0x46, 0x4c, 0x4a, 0x4d, 0x4b, 0x43, 0x54, 0x4b, 0x49,
-    0x4c, 0x49, 0x41, 0x49, 0x4b, 0x47, 0x45, 0x4b, 0x44, 0x43, 0x46, 0x3f,
-    0x47, 0x47, 0x43, 0x4c, 0x49, 0x4c, 0x3d, 0x4d, 0x4b, 0x54, 0x4a, 0x4f,
-    0x44, 0x4c, 0x4b, 0x47, 0x4c, 0x45, 0x3d, 0x52, 0x58, 0x4b, 0x45, 0x4e,
-    0x48, 0x39, 0x53, 0x70, 0x4a, 0x5d, 0x4c, 0x4e, 0x5a, 0x4f, 0x46, 0x4b,
-    0x3e, 0x4f, 0x44, 0x3d, 0x66, 0x6b, 0x50, 0x4d, 0x4d, 0x57, 0x52, 0x4a,
-    0x4c, 0x5b, 0x4e, 0x53, 0x4d, 0x54, 0x50, 0x42, 0x3c, 0x5d, 0x4a, 0x4c,
-    0x56, 0x52, 0x50, 0x40, 0x48, 0x4c, 0x4d, 0x49, 0x49, 0x4f, 0x51, 0x38,
-    0x42, 0x49, 0x4d, 0x4f, 0x45, 0x40, 0x4d, 0x41, 0x4b, 0x4a, 0x47, 0x51,
-    0x4b, 0x53, 0x4c, 0x4a, 0x51, 0x4c, 0x42, 0x56, 0x48, 0x4a, 0x47, 0x58,
-    0x49, 0x46, 0x52, 0x4a, 0x45, 0x47, 0x51, 0x54, 0x4f, 0x50, 0x50, 0x53,
-    0x49, 0x4a, 0x4d, 0x56, 0x56, 0x4b, 0x4d, 0x45, 0x40, 0x4d, 0x48, 0x60,
-    0x4e, 0x56, 0x48, 0x4b, 0x47, 0x45, 0x47, 0x62, 0x4e, 0x4f, 0x41, 0x49,
-    0x48, 0x57, 0x44, 0x64, 0x4f, 0x4f, 0x49, 0x44, 0x49, 0x4c, 0x3f, 0x53,
-    0x40, 0x41, 0x4e, 0x4b, 0x4d, 0x54, 0x42, 0x53, 0x4e, 0x41, 0x49, 0x44,
-    0x41, 0x45, 0x4d, 0x4f, 0x47, 0x51, 0x45, 0x4a, 0x42, 0x45, 0x4e, 0x40,
-    0x4b, 0x52, 0x48, 0x47, 0x4e, 0x4f, 0x47, 0x41, 0x48, 0x53, 0x47, 0x47,
-    0x46, 0x42, 0x48, 0x4b, 0x42, 0x4c, 0x49, 0x4c, 0x45, 0x4c, 0x54, 0x45,
-    0x4c, 0x43, 0x4e, 0x49, 0x56, 0x47, 0x45, 0x4f, 0x4d, 0x3a, 0x58, 0x74,
-    0x49, 0x5b, 0x4c, 0x4f, 0x64, 0x4e, 0x45, 0x43, 0x44, 0x5b, 0x43, 0x41,
-    0x63, 0x70, 0x55, 0x45, 0x4a, 0x4a, 0x4d, 0x51, 0x4b, 0x5a, 0x51, 0x57,
-    0x54, 0x5b, 0x55, 0x44, 0x38, 0x57, 0x4e, 0x50, 0x4e, 0x56, 0x57, 0x3a,
-    0x3a, 0x4b, 0x57, 0x4c, 0x51, 0x53, 0x4d, 0x3b, 0x44, 0x43, 0x47, 0x4c,
-    0x48, 0x59, 0x51, 0x41, 0x43, 0x44, 0x51, 0x51, 0x4a, 0x54, 0x51, 0x4b,
-    0x4e, 0x45, 0x51, 0x4a, 0x49, 0x4a, 0x4f, 0x52, 0x4c, 0x3e, 0x4e, 0x55,
-    0x42, 0x46, 0x46, 0x4a, 0x42, 0x52, 0x49, 0x47, 0x4a, 0x56, 0x4f, 0x50,
-    0x46, 0x4f, 0x43, 0x51, 0x53, 0x46, 0x40, 0x60, 0x44, 0x4d, 0x46, 0x54,
-    0x3d, 0x49, 0x43, 0x64, 0x45, 0x4d, 0x50, 0x49, 0x4f, 0x4d, 0x53, 0x60,
-    0x4a, 0x52, 0x49, 0x47, 0x48, 0x5a, 0x48, 0x58, 0x4e, 0x4f, 0x43, 0x4f,
-    0x50, 0x51, 0x41, 0x52, 0x4c, 0x4d, 0x45, 0x42, 0x41, 0x4c, 0x44, 0x54,
-    0x4e, 0x4d, 0x4a, 0x47, 0x40, 0x4a, 0x3e, 0x47, 0x4c, 0x58, 0x46, 0x46,
-    0x55, 0x4c, 0x4d, 0x45, 0x49, 0x51, 0x53, 0x46, 0x46, 0x43, 0x43, 0x48,
-    0x52, 0x3d, 0x4b, 0x4e, 0x49, 0x47, 0x3f, 0x3d, 0x4f, 0x45, 0x44, 0x3f,
-    0x5a, 0x43, 0x4b, 0x4d, 0x51, 0x35, 0x54, 0x76, 0x4f, 0x5e, 0x4c, 0x50,
-    0x5a, 0x51, 0x46, 0x49, 0x44, 0x61, 0x4f, 0x41, 0x67, 0x72, 0x56, 0x4f,
-    0x42, 0x48, 0x4b, 0x52, 0x46, 0x60, 0x50, 0x4e, 0x4a, 0x5b, 0x5f, 0x46,
-    0x31, 0x5b, 0x4a, 0x48, 0x4b, 0x58, 0x51, 0x41, 0x37, 0x4e, 0x4f, 0x55,
-    0x51, 0x5c, 0x4f, 0x42, 0x4b, 0x4e, 0x4f, 0x54, 0x4f, 0x52, 0x43, 0x43,
-    0x48, 0x53, 0x53, 0x41, 0x4b, 0x49, 0x4e, 0x50, 0x46, 0x4c, 0x4f, 0x49,
-    0x42, 0x49, 0x4c, 0x4c, 0x4c, 0x41, 0x4e, 0x48, 0x47, 0x4c, 0x49, 0x53,
-    0x44, 0x46, 0x51, 0x53, 0x45, 0x52, 0x4e, 0x53, 0x50, 0x58, 0x42, 0x45,
-    0x44, 0x42, 0x48, 0x58, 0x4e, 0x4d, 0x54, 0x56, 0x4c, 0x46, 0x4a, 0x58,
-    0x48, 0x4f, 0x47, 0x51, 0x47, 0x4f, 0x4f, 0x5b, 0x41, 0x4e, 0x45, 0x45,
-    0x4a, 0x50, 0x3e, 0x57, 0x48, 0x4e, 0x41, 0x4c, 0x45, 0x51, 0x46, 0x4c,
-    0x46, 0x4f, 0x42, 0x45, 0x4b, 0x4c, 0x49, 0x4c, 0x44, 0x4f, 0x4e, 0x4d,
-    0x48, 0x56, 0x43, 0x48, 0x42, 0x54, 0x48, 0x43, 0x3e, 0x51, 0x43, 0x47,
-    0x47, 0x47, 0x49, 0x4d, 0x46, 0x4e, 0x52, 0x42, 0x48, 0x4e, 0x4c, 0x4a,
-    0x4d, 0x3e, 0x43, 0x40, 0x48, 0x41, 0x47, 0x4f, 0x5e, 0x49, 0x40, 0x4c,
-    0x50, 0x42, 0x56, 0x75, 0x51, 0x5e, 0x51, 0x4e, 0x62, 0x58, 0x49, 0x47,
-    0x51, 0x59, 0x46, 0x46, 0x6c, 0x72, 0x55, 0x44, 0x4c, 0x4a, 0x4d, 0x59,
-    0x53, 0x64, 0x4d, 0x51, 0x55, 0x5e, 0x59, 0x50, 0x30, 0x58, 0x50, 0x4c,
-    0x4c, 0x60, 0x59, 0x42, 0x32, 0x53, 0x50, 0x55, 0x4d, 0x53, 0x59, 0x43,
-    0x3e, 0x49, 0x4f, 0x52, 0x4d, 0x51, 0x47, 0x45, 0x4d, 0x4e, 0x53, 0x4e,
-    0x54, 0x4f, 0x4d, 0x4d, 0x4e, 0x40, 0x47, 0x53, 0x53, 0x49, 0x56, 0x4d,
-    0x4d, 0x3a, 0x4c, 0x4e, 0x45, 0x4a, 0x47, 0x45, 0x53, 0x4a, 0x4e, 0x52,
-    0x4d, 0x4e, 0x48, 0x56, 0x4e, 0x4a, 0x4d, 0x52, 0x49, 0x4e, 0x4e, 0x58,
-    0x47, 0x50, 0x4c, 0x54, 0x49, 0x42, 0x46, 0x54, 0x50, 0x54, 0x54, 0x46,
-    0x40, 0x49, 0x4b, 0x57, 0x4b, 0x59, 0x44, 0x46, 0x52, 0x55, 0x51, 0x55,
-    0x4f, 0x50, 0x4d, 0x4d, 0x48, 0x50, 0x4e, 0x49, 0x4e, 0x42, 0x45, 0x3f,
-    0x4d, 0x4f, 0x51, 0x47, 0x4a, 0x4c, 0x4b, 0x4b, 0x46, 0x4d, 0x44, 0x52,
-    0x4d, 0x44, 0x40, 0x4d, 0x54, 0x46, 0x54, 0x44, 0x4b, 0x46, 0x47, 0x45,
-    0x50, 0x45, 0x45, 0x4b, 0x4c, 0x48, 0x3f, 0x55, 0x4a, 0x45, 0x49, 0x4e,
-    0x40, 0x49, 0x4a, 0x41, 0x56, 0x4b, 0x49, 0x4e, 0x4a, 0x41, 0x50, 0x70,
-    0x56, 0x59, 0x4b, 0x55, 0x58, 0x59, 0x49, 0x47, 0x4a, 0x5a, 0x4c, 0x46,
-    0x62, 0x7b, 0x58, 0x51, 0x44, 0x47, 0x44, 0x57, 0x4f, 0x65, 0x4e, 0x50,
-    0x4d, 0x67, 0x5c, 0x4a, 0x2b, 0x61, 0x48, 0x4b, 0x4b, 0x5d, 0x5c, 0x48,
-    0x39, 0x50, 0x45, 0x4d, 0x53, 0x60, 0x53, 0x46, 0x42, 0x46, 0x50, 0x45,
-    0x4f, 0x4e, 0x46, 0x4a, 0x4d, 0x51, 0x54, 0x47, 0x59, 0x4b, 0x58, 0x4a,
-    0x50, 0x3d, 0x59, 0x48, 0x45, 0x4e, 0x4e, 0x47, 0x4f, 0x47, 0x4d, 0x4b,
-    0x52, 0x42, 0x4c, 0x48, 0x4a, 0x4f, 0x47, 0x43, 0x4e, 0x4c, 0x4d, 0x51,
-    0x49, 0x4f, 0x4c, 0x47, 0x47, 0x48, 0x47, 0x59, 0x4f, 0x4f, 0x53, 0x49,
-    0x4e, 0x4b, 0x4f, 0x5a, 0x50, 0x42, 0x47, 0x50, 0x4a, 0x54, 0x47, 0x5a,
-    0x43, 0x49, 0x47, 0x4e, 0x49, 0x4d, 0x43, 0x54, 0x4c, 0x53, 0x4e, 0x4e,
-    0x42, 0x43, 0x48, 0x46, 0x4f, 0x43, 0x43, 0x45, 0x51, 0x47, 0x4b, 0x4f,
-    0x56, 0x48, 0x48, 0x49, 0x46, 0x45, 0x4d, 0x52, 0x47, 0x4b, 0x46, 0x50,
-    0x3e, 0x4e, 0x4c, 0x43, 0x45, 0x4d, 0x53, 0x43, 0x46, 0x45, 0x44, 0x52,
-    0x45, 0x49, 0x49, 0x51, 0x3d, 0x4a, 0x4d, 0x46, 0x42, 0x41, 0x4e, 0x48,
-    0x5a, 0x49, 0x49, 0x49, 0x4f, 0x3d, 0x56, 0x68, 0x56, 0x67, 0x4b, 0x57,
-    0x5f, 0x5c, 0x40, 0x4a, 0x4a, 0x54, 0x4c, 0x47, 0x64, 0x7a, 0x54, 0x48,
-    0x46, 0x45, 0x46, 0x57, 0x4e, 0x61, 0x4f, 0x50, 0x4d, 0x64, 0x5b, 0x43,
-    0x2d, 0x60, 0x55, 0x51, 0x4c, 0x54, 0x4f, 0x4e, 0x2f, 0x50, 0x4f, 0x52,
-    0x50, 0x61, 0x54, 0x4b, 0x3d, 0x4c, 0x47, 0x51, 0x4a, 0x54, 0x4b, 0x42,
-    0x3b, 0x55, 0x47, 0x50, 0x4f, 0x49, 0x4a, 0x46, 0x43, 0x44, 0x45, 0x47,
-    0x46, 0x4b, 0x4f, 0x46, 0x43, 0x47, 0x4a, 0x4e, 0x51, 0x43, 0x55, 0x47,
-    0x4d, 0x46, 0x4c, 0x4c, 0x49, 0x4d, 0x43, 0x51, 0x47, 0x51, 0x52, 0x4a,
-    0x46, 0x4f, 0x49, 0x52, 0x50, 0x4a, 0x43, 0x53, 0x46, 0x4e, 0x50, 0x54,
-    0x45, 0x3a, 0x4a, 0x4a, 0x4c, 0x50, 0x4b, 0x54, 0x43, 0x4f, 0x4e, 0x45,
-    0x49, 0x4f, 0x46, 0x53, 0x4d, 0x51, 0x52, 0x53, 0x3d, 0x4a, 0x47, 0x4e,
-    0x43, 0x4a, 0x53, 0x48, 0x4a, 0x4c, 0x4a, 0x4a, 0x42, 0x53, 0x3e, 0x43,
-    0x4f, 0x4c, 0x47, 0x48, 0x54, 0x4d, 0x48, 0x48, 0x4e, 0x4c, 0x43, 0x51,
-    0x42, 0x49, 0x44, 0x3e, 0x49, 0x51, 0x4a, 0x4d, 0x4f, 0x49, 0x45, 0x44,
-    0x4e, 0x41, 0x48, 0x4b, 0x4c, 0x49, 0x46, 0x47, 0x5d, 0x4c, 0x4d, 0x50,
-    0x45, 0x40, 0x4e, 0x6a, 0x4f, 0x62, 0x53, 0x50, 0x5c, 0x5e, 0x4a, 0x4c,
-    0x50, 0x56, 0x52, 0x42, 0x60, 0x7e, 0x5b, 0x4b, 0x43, 0x41, 0x4c, 0x56,
-    0x46, 0x5f, 0x4d, 0x49, 0x43, 0x65, 0x5c, 0x4d, 0x2c, 0x61, 0x48, 0x4c,
-    0x44, 0x55, 0x5c, 0x49, 0x37, 0x54, 0x4e, 0x57, 0x52, 0x5c, 0x50, 0x49,
-    0x3e, 0x4d, 0x4f, 0x4f, 0x51, 0x4c, 0x48, 0x43, 0x4a, 0x5a, 0x4d, 0x4b,
-    0x4e, 0x58, 0x54, 0x49, 0x51, 0x42, 0x49, 0x4f, 0x46, 0x45, 0x52, 0x3d,
-    0x4b, 0x4b, 0x43, 0x54, 0x47, 0x47, 0x4c, 0x42, 0x4b, 0x49, 0x45, 0x46,
-    0x46, 0x4a, 0x51, 0x47, 0x47, 0x4f, 0x48, 0x4a, 0x3f, 0x4c, 0x4b, 0x57,
-    0x4a, 0x3f, 0x52, 0x4a, 0x56, 0x52, 0x4b, 0x54, 0x4c, 0x3e, 0x3f, 0x4f,
-    0x4b, 0x50, 0x4c, 0x53, 0x4a, 0x49, 0x46, 0x4e, 0x50, 0x48, 0x4f, 0x4b,
-    0x4a, 0x4e, 0x3e, 0x49, 0x45, 0x42, 0x42, 0x41, 0x47, 0x4b, 0x4f, 0x42,
-    0x49, 0x4c, 0x55, 0x4c, 0x4e, 0x42, 0x47, 0x42, 0x4b, 0x48, 0x46, 0x41,
-    0x46, 0x4e, 0x4d, 0x3f, 0x4f, 0x46, 0x4f, 0x4b, 0x4b, 0x4d, 0x50, 0x3e,
-    0x42, 0x43, 0x44, 0x4a, 0x49, 0x40, 0x4e, 0x43, 0x3e, 0x52, 0x3e, 0x44,
-    0x49, 0x43, 0x4d, 0x44, 0x62, 0x51, 0x42, 0x53, 0x51, 0x40, 0x4c, 0x64,
-    0x4f, 0x63, 0x4e, 0x5c, 0x5b, 0x5c, 0x48, 0x4d, 0x4a, 0x57, 0x4f, 0x42,
-    0x65, 0xfe, 0x5c, 0x4e, 0x47, 0x43, 0x4a, 0x58, 0x4e, 0x5e, 0x48, 0x4c,
-    0x51, 0x5e, 0x60, 0x56, 0x2f, 0x62, 0x54, 0x58, 0x51, 0x52, 0x55, 0x51,
-    0x36, 0x4b, 0x46, 0x51, 0x53, 0x5f, 0x46, 0x4c, 0x37, 0x4d, 0x4a, 0x45,
-    0x4b, 0x3f, 0x41, 0x42, 0x3f, 0x53, 0x4a, 0x48, 0x49, 0x4a, 0x4a, 0x45,
-    0x52, 0x3f, 0x52, 0x52, 0x45, 0x4d, 0x4f, 0x45, 0x46, 0x4a, 0x51, 0x48,
-    0x56, 0x47, 0x50, 0x3e, 0x46, 0x49, 0x4c, 0x51, 0x49, 0x54, 0x45, 0x4f,
-    0x4b, 0x4b, 0x49, 0x46, 0x4b, 0x4d, 0x49, 0x5c, 0x4d, 0x43, 0x47, 0x49,
-    0x48, 0x52, 0x46, 0x50, 0x51, 0x37, 0x50, 0x52, 0x4c, 0x4d, 0x4f, 0x51,
-    0x4f, 0x42, 0x50, 0x47, 0x48, 0x4e, 0x4d, 0x4c, 0x48, 0x48, 0x4a, 0x51,
-    0x49, 0x42, 0x50, 0x4f, 0x43, 0x4e, 0x47, 0x4b, 0x47, 0x4a, 0x44, 0x44,
-    0x4c, 0x51, 0x49, 0x44, 0x45, 0x45, 0x45, 0x48, 0x3f, 0x4a, 0x43, 0x49,
-    0x46, 0x49, 0x4c, 0x4d, 0x45, 0x50, 0x44, 0x45, 0x44, 0x55, 0x4a, 0x45,
-    0x48, 0x47, 0x4c, 0x43, 0x3f, 0x48, 0x42, 0x43, 0x43, 0x43, 0x48, 0x46,
-    0x5c, 0x51, 0x47, 0x51, 0x48, 0x40, 0x54, 0x66, 0x4e, 0x67, 0x4d, 0x5a,
-    0x60, 0x57, 0x47, 0x4d, 0x4d, 0x58, 0x53, 0x46, 0x66, 0x7e, 0x56, 0x48,
-    0x44, 0x4f, 0x49, 0x5c, 0x4a, 0x63, 0x50, 0x4c, 0x49, 0x56, 0x61, 0x50,
-    0x2c, 0x68, 0x4d, 0x51, 0x46, 0x4e, 0x5b, 0x51, 0x2e, 0x53, 0x54, 0x50,
-    0x46, 0x58, 0x44, 0x4f, 0x37, 0x48, 0x55, 0x50, 0x49, 0x49, 0x4e, 0x46,
-    0x43, 0x56, 0x52, 0x4e, 0x50, 0x4b, 0x50, 0x4c, 0x49, 0x40, 0x4d, 0x4f,
-    0x50, 0x41, 0x44, 0x39, 0x4b, 0x4d, 0x4b, 0x41, 0x51, 0x4d, 0x4c, 0x41,
-    0x3f, 0x52, 0x4e, 0x4b, 0x49, 0x53, 0x45, 0x43, 0x4d, 0x4f, 0x44, 0x4d,
-    0x4b, 0x53, 0x50, 0x4e, 0x45, 0x3f, 0x4e, 0x51, 0x50, 0x55, 0x4f, 0x51,
-    0x4d, 0x3d, 0x58, 0x3f, 0x46, 0x50, 0x50, 0x50, 0x56, 0x42, 0x49, 0x49,
-    0x50, 0x4f, 0x42, 0x4b, 0x4c, 0x45, 0x52, 0x41, 0x46, 0x43, 0x4c, 0x4a,
-    0x4c, 0x51, 0x4d, 0x4d, 0x4a, 0x49, 0x54, 0x49, 0x58, 0x53, 0x49, 0x45,
-    0x47, 0x4c, 0x4c, 0x44, 0x4e, 0x51, 0x4c, 0x4c, 0x47, 0x48, 0x4c, 0x4e,
-    0x49, 0x54, 0x4c, 0x51, 0x49, 0x48, 0x47, 0x45, 0x42, 0x49, 0x42, 0x51,
-    0x4e, 0x3f, 0x49, 0x41, 0x50, 0x3e, 0x4d, 0x50, 0x5c, 0x51, 0x4d, 0x56,
-    0x47, 0x48, 0x58, 0x65, 0x51, 0x6b, 0x56, 0x5b, 0x56, 0x55, 0x46, 0x49,
-    0x4b, 0x58, 0x59, 0x4a, 0x68, 0x79, 0x53, 0x46, 0x45, 0x4b, 0x53, 0x5d,
-    0x4b, 0x6f, 0x4e, 0x4f, 0x4c, 0x53, 0x5b, 0x52, 0x30, 0x63, 0x46, 0x57,
-    0x46, 0x50, 0x4b, 0x48, 0x2e, 0x4c, 0x46, 0x48, 0x44, 0x51, 0x46, 0x4a,
-    0x35, 0x55, 0x43, 0x4c, 0x43, 0x4d, 0x4e, 0x3e, 0x47, 0x56, 0x50, 0x4d,
-    0x44, 0x59, 0x4c, 0x51, 0x46, 0x42, 0x4e, 0x43, 0x4c, 0x44, 0x42, 0x3a,
-    0x40, 0x48, 0x46, 0x44, 0x45, 0x4a, 0x46, 0x3a, 0x53, 0x4c, 0x4d, 0x4c,
-    0x4a, 0x4f, 0x53, 0x40, 0x4b, 0x48, 0x54, 0x4b, 0x44, 0x59, 0x41, 0x50,
-    0x4e, 0x50, 0x55, 0x4d, 0x55, 0x41, 0x4a, 0x4f, 0x47, 0x43, 0x4e, 0x50,
-    0x52, 0x4c, 0x50, 0x4d, 0x47, 0x42, 0x4f, 0x4b, 0x47, 0x43, 0x41, 0x4a,
-    0x55, 0x3e, 0x50, 0x4b, 0x41, 0x49, 0x47, 0x49, 0x53, 0x4d, 0x48, 0x4b,
-    0x43, 0x43, 0x51, 0x44, 0x4d, 0x4c, 0x44, 0x50, 0x4d, 0x42, 0x49, 0x4e,
-    0x50, 0x50, 0x4c, 0x49, 0x49, 0x51, 0x46, 0x43, 0x4a, 0x4e, 0x53, 0x47,
-    0x43, 0x46, 0x40, 0x49, 0x47, 0x44, 0x44, 0x4d, 0x4b, 0x4b, 0x51, 0x4b,
-    0x45, 0x49, 0x47, 0x43, 0x56, 0x49, 0x4c, 0x54, 0x50, 0x3c, 0x4c, 0x5e,
-    0x51, 0x67, 0x4f, 0x57, 0x57, 0x53, 0x3e, 0x4e, 0x4e, 0x5e, 0x4b, 0x48,
-    0x5a, 0x78, 0x55, 0x4a, 0x3f, 0x4b, 0x4c, 0x5b, 0x53, 0x64, 0x4d, 0x53,
-    0x49, 0x57, 0x57, 0x58, 0x37, 0x62, 0x4f, 0x56, 0x44, 0x4e, 0x58, 0x4a,
-    0x30, 0x4f, 0x40, 0x4e, 0x47, 0x58, 0x52, 0x50, 0x35, 0x4d, 0x49, 0x52,
-    0x4e, 0x42, 0x46, 0x47, 0x44, 0x57, 0x54, 0x43, 0x4e, 0x56, 0x43, 0x49,
-    0x44, 0x40, 0x44, 0x41, 0x50, 0x49, 0x4b, 0x44, 0x4d, 0x52, 0x49, 0x43,
-    0x52, 0x54, 0x49, 0x3f, 0x49, 0x42, 0x49, 0x4a, 0x43, 0x3e, 0x50, 0x40,
-    0x46, 0x4b, 0x50, 0x4b, 0x53, 0x4b, 0x47, 0x52, 0x51, 0x4b, 0x47, 0x3f,
-    0x46, 0x4b, 0x4c, 0x57, 0x49, 0x47, 0x54, 0x49, 0x50, 0x50, 0x4d, 0x4a,
-    0x42, 0x4e, 0x51, 0x4c, 0x47, 0x47, 0x42, 0x43, 0x54, 0x43, 0x46, 0x47,
-    0x4d, 0x43, 0x54, 0x47, 0x43, 0x58, 0x48, 0x45, 0x4b, 0x46, 0x48, 0x3d,
-    0x47, 0x3f, 0x44, 0x4f, 0x4e, 0x46, 0x41, 0x40, 0x4d, 0x4d, 0x4d, 0x52,
-    0x54, 0x47, 0x4f, 0x51, 0x4f, 0x45, 0x45, 0x48, 0x4b, 0x4d, 0x44, 0x52,
-    0x51, 0x4b, 0x48, 0x4f, 0x49, 0x49, 0x46, 0x50, 0x54, 0x42, 0x44, 0x51,
-    0x58, 0x4e, 0x43, 0x58, 0x55, 0x40, 0x53, 0x5a, 0x51, 0x61, 0x51, 0x60,
-    0x53, 0x57, 0x45, 0x4f, 0x45, 0x5e, 0x51, 0x42, 0x61, 0x7a, 0x55, 0x47,
-    0x41, 0x4b, 0x4a, 0x5b, 0x4c, 0x65, 0x4f, 0x55, 0x46, 0x54, 0x65, 0x59,
-    0x36, 0x61, 0x54, 0x55, 0x48, 0x57, 0x52, 0x4e, 0x24, 0x4b, 0x49, 0x4d,
-    0x43, 0x57, 0x44, 0x51, 0x3b, 0x4f, 0x45, 0x40, 0x47, 0x4a, 0x43, 0x47,
-    0x46, 0x58, 0x50, 0x54, 0x4d, 0x50, 0x44, 0x42, 0x4a, 0x46, 0x4b, 0x4d,
-    0x4f, 0x4f, 0x4d, 0x40, 0x48, 0x4a, 0x53, 0x48, 0x49, 0x48, 0x4d, 0x39,
-    0x47, 0x4e, 0x44, 0x4c, 0x4b, 0x49, 0x44, 0x42, 0x4a, 0x45, 0x46, 0x46,
-    0x53, 0x4d, 0x49, 0x4f, 0x4e, 0x48, 0x50, 0x4a, 0x4c, 0x46, 0x56, 0x4b,
-    0x4b, 0x57, 0x4c, 0x49, 0x4a, 0x4a, 0x43, 0x4e, 0x56, 0x45, 0x50, 0x4c,
-    0x47, 0x55, 0x48, 0x46, 0x4e, 0x46, 0x45, 0x3f, 0x4a, 0x4c, 0x4c, 0x47,
-    0x4a, 0x51, 0x4e, 0x50, 0x40, 0x52, 0x45, 0x45, 0x4b, 0x46, 0x4f, 0x44,
-    0x51, 0x4a, 0x4e, 0x4d, 0x4c, 0x46, 0x42, 0x47, 0x4a, 0x4e, 0x46, 0x42,
-    0x4b, 0x4f, 0x4b, 0x4e, 0x4e, 0x46, 0x42, 0x50, 0x53, 0x51, 0x4f, 0x54,
-    0x45, 0x4f, 0x45, 0x42, 0x4c, 0x45, 0x40, 0x48, 0x59, 0x49, 0x49, 0x53,
-    0x4c, 0x43, 0x4b, 0x57, 0x54, 0x64, 0x4e, 0x5f, 0x5c, 0x59, 0x4b, 0x56,
-    0x49, 0x5d, 0x4f, 0x4b, 0x62, 0x73, 0x54, 0x45, 0x49, 0x50, 0x48, 0x5a,
-    0x50, 0x6d, 0x4a, 0x4e, 0x48, 0x55, 0x5d, 0x57, 0x38, 0x68, 0x52, 0x5a,
-    0x46, 0x56, 0x4c, 0x5a, 0x2e, 0x55, 0x49, 0x4f, 0x4a, 0x57, 0x4f, 0x54,
-    0x41, 0x53, 0x46, 0x43, 0x45, 0x47, 0x53, 0x4a, 0x42, 0x4f, 0x4d, 0x48,
-    0x4c, 0x49, 0x47, 0x48, 0x45, 0x49, 0x48, 0x53, 0x48, 0x52, 0x4a, 0x44,
-    0x4c, 0x49, 0x52, 0x4b, 0x47, 0x51, 0x42, 0x47, 0x49, 0x51, 0x3f, 0x45,
-    0x47, 0x4e, 0x53, 0x33, 0x55, 0x51, 0x55, 0x48, 0x4b, 0x51, 0x56, 0x47,
-    0x43, 0x55, 0x47, 0x42, 0x47, 0x4f, 0x47, 0x51, 0x46, 0x55, 0x4a, 0x4b,
-    0x50, 0x52, 0x4f, 0x43, 0x4b, 0x53, 0x4d, 0x3f, 0x4e, 0x56, 0x50, 0x49,
-    0x4d, 0x47, 0x51, 0x49, 0x4a, 0x52, 0x44, 0x43, 0x4d, 0x4e, 0x41, 0x51,
-    0x4c, 0x4d, 0x47, 0x48, 0x4f, 0x40, 0x50, 0x46, 0x43, 0x4d, 0x4e, 0x50,
-    0x43, 0x47, 0x4e, 0x46, 0x4f, 0x4b, 0x51, 0x4b, 0x4a, 0x57, 0x42, 0x51,
-    0x4c, 0x54, 0x52, 0x42, 0x4c, 0x42, 0x47, 0x54, 0x4a, 0x4a, 0x47, 0x4a,
-    0x3f, 0x46, 0x4e, 0x4c, 0x53, 0x50, 0x47, 0x53, 0x49, 0x44, 0x52, 0x5a,
-    0x4b, 0x65, 0x50, 0x5b, 0x57, 0x59, 0x4a, 0x48, 0x48, 0x5f, 0x55, 0x48,
-    0x5c, 0x78, 0x55, 0x48, 0x4a, 0x4b, 0x49, 0x4c, 0x46, 0x6b, 0x54, 0x57,
-    0x55, 0x4b, 0x59, 0x52, 0x38, 0x5b, 0x57, 0x56, 0x4b, 0x4f, 0x48, 0x4e,
-    0x34, 0x5a, 0x4e, 0x4f, 0x43, 0x4e, 0x4b, 0x4e, 0x36, 0x4d, 0x52, 0x48,
-    0x4d, 0x4c, 0x4c, 0x49, 0x51, 0x54, 0x45, 0x54, 0x4a, 0x4e, 0x52, 0x41,
-    0x4c, 0x45, 0x4a, 0x53, 0x55, 0x4b, 0x50, 0x47, 0x4e, 0x4d, 0x43, 0x51,
-    0x4e, 0x4a, 0x51, 0x46, 0x4e, 0x4d, 0x48, 0x3f, 0x43, 0x52, 0x56, 0x38,
-    0x52, 0x46, 0x43, 0x49, 0x40, 0x49, 0x53, 0x41, 0x47, 0x41, 0x41, 0x42,
-    0x4f, 0x4b, 0x46, 0x4b, 0x4a, 0x57, 0x4a, 0x45, 0x4b, 0x46, 0x47, 0x3c,
-    0x43, 0x46, 0x4f, 0x50, 0x4c, 0x53, 0x4f, 0x41, 0x4a, 0x4a, 0x40, 0x4a,
-    0x3e, 0x4e, 0x4d, 0x41, 0x4a, 0x42, 0x49, 0x4c, 0x51, 0x46, 0x4f, 0x43,
-    0x4b, 0x41, 0x50, 0x48, 0x4a, 0x40, 0x52, 0x45, 0x40, 0x40, 0x46, 0x48,
-    0x48, 0x52, 0x52, 0x41, 0x43, 0x49, 0x49, 0x4c, 0x44, 0x48, 0x50, 0x4a,
-    0x47, 0x48, 0x4c, 0x42, 0x49, 0x48, 0x52, 0x56, 0x4b, 0x41, 0x4e, 0x47,
-    0x52, 0x56, 0x4e, 0x56, 0x4b, 0x38, 0x50, 0x55, 0x5a, 0x63, 0x51, 0x5a,
-    0x54, 0x52, 0x44, 0x45, 0x47, 0x5e, 0x4c, 0x4a, 0x5e, 0x71, 0x56, 0x44,
-    0x4c, 0x4b, 0x4c, 0x4e, 0x49, 0x69, 0x50, 0x53, 0x4d, 0x5c, 0x59, 0x50,
-    0x36, 0x5d, 0x46, 0x5b, 0x51, 0x55, 0x55, 0x51, 0x36, 0x5a, 0x53, 0x56,
-    0x54, 0x4a, 0x55, 0x53, 0x3c, 0x52, 0x4a, 0x45, 0x4c, 0x56, 0x49, 0x46,
-    0x4f, 0x5b, 0x43, 0x4b, 0x49, 0x4c, 0x4b, 0x41, 0x44, 0x4b, 0x47, 0x4b,
-    0x4b, 0x54, 0x4a, 0x4c, 0x49, 0x44, 0x46, 0x46, 0x48, 0x49, 0x47, 0x4a,
-    0x40, 0x4e, 0x47, 0x53, 0x4a, 0x47, 0x4a, 0x3b, 0x48, 0x4b, 0x50, 0x51,
-    0x50, 0x44, 0x4d, 0x49, 0x42, 0x4b, 0x43, 0x48, 0x4a, 0x43, 0x4d, 0x4d,
-    0x49, 0x4d, 0x43, 0x4f, 0x50, 0x49, 0x47, 0x48, 0x48, 0x4f, 0x49, 0x41,
-    0x4c, 0x46, 0x47, 0x3e, 0x51, 0x4d, 0x4e, 0x42, 0x3d, 0x53, 0x4d, 0x3b,
-    0x53, 0x52, 0x4c, 0x4c, 0x43, 0x46, 0x43, 0x3d, 0x53, 0x48, 0x43, 0x4e,
-    0x45, 0x52, 0x4d, 0x4a, 0x44, 0x49, 0x47, 0x4c, 0x4e, 0x4c, 0x4a, 0x4e,
-    0x41, 0x48, 0x4b, 0x44, 0x4d, 0x4a, 0x4d, 0x44, 0x4a, 0x45, 0x4f, 0x52,
-    0x45, 0x3f, 0x4b, 0x48, 0x43, 0x41, 0x3d, 0x53, 0x53, 0x50, 0x4a, 0x56,
-    0x4d, 0x3e, 0x55, 0x4e, 0x56, 0x5e, 0x52, 0x52, 0x54, 0x50, 0x42, 0x4a,
-    0x4d, 0x5f, 0x4f, 0x49, 0x5d, 0x6f, 0x55, 0x4a, 0x47, 0x49, 0x4e, 0x4a,
-    0x43, 0x6e, 0x4e, 0x4f, 0x52, 0x59, 0x62, 0x4b, 0x3e, 0x5c, 0x4c, 0x4e,
-    0x45, 0x52, 0x43, 0x4d, 0x3c, 0x58, 0x52, 0x49, 0x48, 0x55, 0x53, 0x4e,
-    0x3d, 0x4e, 0x4c, 0x4b, 0x4b, 0x50, 0x4a, 0x47, 0x45, 0x62, 0x50, 0x49,
-    0x48, 0x4b, 0x55, 0x45, 0x46, 0x51, 0x41, 0x55, 0x54, 0x55, 0x50, 0x47,
-    0x46, 0x4d, 0x46, 0x4b, 0x41, 0x49, 0x4c, 0x40, 0x45, 0x4f, 0x52, 0x54,
-    0x45, 0x4d, 0x53, 0x3a, 0x4c, 0x55, 0x4e, 0x48, 0x44, 0x45, 0x56, 0x3c,
-    0x48, 0x46, 0x4b, 0x51, 0x53, 0x43, 0x41, 0x49, 0x4c, 0x52, 0x48, 0x42,
-    0x48, 0x3f, 0x4c, 0x38, 0x46, 0x50, 0x4a, 0x44, 0x50, 0x54, 0x4e, 0x38,
-    0x48, 0x42, 0x43, 0x4a, 0x4c, 0x44, 0x47, 0x42, 0x42, 0x46, 0x4a, 0x50,
-    0x47, 0x4b, 0x43, 0x40, 0x44, 0x46, 0x46, 0x4d, 0x50, 0x4a, 0x4e, 0x51,
-    0x44, 0x40, 0x50, 0x43, 0x52, 0x4d, 0x42, 0x4c, 0x50, 0x41, 0x4a, 0x4e,
-    0x45, 0x49, 0x4d, 0x40, 0x46, 0x51, 0x43, 0x4b, 0x48, 0x47, 0x42, 0x55,
-    0x4a, 0x41, 0x4f, 0x49, 0x4f, 0x4e, 0x47, 0x4c, 0x4a, 0x48, 0x50, 0x4e,
-    0x50, 0x57, 0x4e, 0x56, 0x56, 0x4e, 0x44, 0x48, 0x4a, 0x5b, 0x55, 0x49,
-    0x59, 0x67, 0x54, 0x46, 0x4f, 0x41, 0x4d, 0x4e, 0x4a, 0x63, 0x4d, 0x44,
-    0x53, 0x5b, 0x59, 0x4f, 0x43, 0x55, 0x56, 0x4e, 0x55, 0x4c, 0x4b, 0x54,
-    0x3c, 0x56, 0x4d, 0x50, 0x4f, 0x4a, 0x5a, 0x47, 0x48, 0x56, 0x4f, 0x4f,
-    0x50, 0x51, 0x48, 0x4e, 0x4d, 0x50, 0x4e, 0x45, 0x4b, 0x48, 0x4e, 0x44,
-    0x46, 0x4d, 0x43, 0x46, 0x41, 0x59, 0x53, 0x4b, 0x4a, 0x3e, 0x51, 0x47,
-    0x43, 0x48, 0x52, 0x3f, 0x43, 0x50, 0x4b, 0x4f, 0x41, 0x48, 0x43, 0x2e,
-    0x4d, 0x4e, 0x4c, 0x45, 0x45, 0x46, 0x4b, 0x43, 0x46, 0x49, 0x46, 0x4d,
-    0x47, 0x4e, 0x4d, 0x3c, 0x47, 0x4a, 0x52, 0x4e, 0x41, 0x50, 0x43, 0x3a,
-    0x50, 0x47, 0x4a, 0x45, 0x52, 0x4a, 0x4c, 0x3f, 0x42, 0x3d, 0x49, 0x48,
-    0x48, 0x4c, 0x42, 0x3a, 0x40, 0x47, 0x46, 0x4e, 0x44, 0x52, 0x46, 0x44,
-    0x4a, 0x44, 0x43, 0x49, 0x42, 0x45, 0x3f, 0x50, 0x4c, 0x44, 0x48, 0x43,
-    0x47, 0x4a, 0x48, 0x48, 0x3e, 0x45, 0x43, 0x48, 0x4a, 0x48, 0x53, 0x4b,
-    0x50, 0x49, 0x43, 0x4d, 0x53, 0x4f, 0x4b, 0x4b, 0x40, 0x42, 0x50, 0x4d,
-    0x53, 0x4e, 0x44, 0x4d, 0x45, 0x3d, 0x51, 0x51, 0x4f, 0x59, 0x4b, 0x51,
-    0x4a, 0x4e, 0x42, 0x40, 0x49, 0x5b, 0x4b, 0x43, 0x53, 0x60, 0x47, 0x49,
-    0x4a, 0x44, 0x44, 0x48, 0x4b, 0x60, 0x51, 0x3f, 0x4b, 0x5b, 0x4f, 0x4a,
-    0x4a, 0x50, 0x49, 0x46, 0x55, 0x50, 0x4b, 0x4c, 0x40, 0x4e, 0x51, 0x4f,
-    0x4b, 0x51, 0x54, 0x50, 0x48, 0x4e, 0x4a, 0x4f, 0x4d, 0x4e, 0x54, 0x4d,
-    0x41, 0x50, 0x4e, 0x47, 0x47, 0x47, 0x54, 0x3b, 0x51, 0x54, 0x50, 0x49,
-    0x48, 0x4c, 0x4e, 0x47, 0x3f, 0x3c, 0x4c, 0x43, 0x45, 0x42, 0x45, 0x37,
-    0x41, 0x52, 0x49, 0x47, 0x4e, 0x4a, 0x4b, 0x37, 0x48, 0x4d, 0x4e, 0x4a,
-    0x42, 0x56, 0x3d, 0x35, 0x48, 0x42, 0x4b, 0x4a, 0x44, 0x52, 0x40, 0x48,
-    0x4f, 0x49, 0x4f, 0x4c, 0x4d, 0x43, 0x49, 0x38, 0x4b, 0x42, 0x48, 0x42,
-    0x45, 0x45, 0x54, 0x3a, 0x47, 0x47, 0x52, 0x45, 0x4a, 0x48, 0x47, 0x39,
-    0x4d, 0x45, 0x54, 0x4b, 0x4e, 0x4f, 0x4e, 0x38, 0x4a, 0x4b, 0x48, 0x45,
-    0x4e, 0x43, 0x4e, 0x4e, 0x46, 0x4e, 0x4e, 0x50, 0x46, 0x4c, 0x42, 0x45,
-    0x4b, 0x46, 0x47, 0x4d, 0x49, 0x3f, 0x4f, 0x50, 0x46, 0x4a, 0x47, 0x4e,
-    0x4a, 0x3e, 0x50, 0x46, 0x47, 0x40, 0x4f, 0x47, 0x51, 0x4b, 0x43, 0x46,
-    0x4a, 0x42, 0x55, 0x4d, 0x46, 0x63, 0x49, 0x4e, 0x4f, 0x4f, 0x42, 0x45,
-    0x50, 0x57, 0x49, 0x3e, 0x57, 0x63, 0x45, 0x4a, 0x49, 0x50, 0x41, 0x4a,
-    0x48, 0x64, 0x4f, 0x42, 0x47, 0x58, 0x4b, 0x45, 0x43, 0x57, 0x49, 0x58,
-    0x51, 0x51, 0x47, 0x43, 0x51, 0x4b, 0x4a, 0x45, 0x50, 0x54, 0x4d, 0x4d,
-    0x3e, 0x4a, 0x50, 0x40, 0x51, 0x4f, 0x52, 0x48, 0x53, 0x49, 0x44, 0x4b,
-    0x51, 0x4b, 0x50, 0x42, 0x4d, 0x49, 0x4a, 0x46, 0x44, 0x50, 0x47, 0x3f,
-    0x48, 0x47, 0x41, 0x4a, 0x42, 0x52, 0x4a, 0x33, 0x50, 0x50, 0x54, 0x3f,
-    0x44, 0x4e, 0x51, 0x3c, 0x4e, 0x51, 0x48, 0x4b, 0x47, 0x49, 0x3f, 0x3d,
-    0x4e, 0x46, 0x4a, 0x41, 0x40, 0x50, 0x49, 0x40, 0x4a, 0x4b, 0x45, 0x50,
-    0x4e, 0x4d, 0x4b, 0x39, 0x4e, 0x4b, 0x48, 0x3c, 0x47, 0x44, 0x4c, 0x42,
-    0x45, 0x50, 0x3e, 0x54, 0x4d, 0x49, 0x48, 0x3c, 0x45, 0x42, 0x55, 0x4a,
-    0x41, 0x4f, 0x40, 0x3f, 0x47, 0x46, 0x46, 0x44, 0x4f, 0x47, 0x46, 0x44,
-    0x41, 0x40, 0x44, 0x48, 0x3e, 0x3c, 0x46, 0x3e, 0x4a, 0x45, 0x4c, 0x52,
-    0x47, 0x42, 0x47, 0x3f, 0x47, 0x4e, 0x4b, 0x53, 0x4a, 0x3d, 0x4d, 0x47,
-    0x4f, 0x3d, 0x4e, 0x43, 0x4f, 0x46, 0x43, 0x43, 0x46, 0x41, 0x4f, 0x42,
-    0x46, 0x57, 0x4d, 0x51, 0x49, 0x51, 0x4c, 0x44, 0x51, 0x4f, 0x46, 0x44,
-    0x54, 0x5d, 0x4f, 0x40, 0x59, 0x46, 0x53, 0x46, 0x48, 0x54, 0x43, 0x45,
-    0x4d, 0x51, 0x4f, 0x44, 0x44, 0x53, 0x49, 0x4e, 0x48, 0x46, 0x44, 0x4a,
-    0x4a, 0x42, 0x4c, 0x46, 0x54, 0x4f, 0x52, 0x47, 0x46, 0x44, 0x4c, 0x4d,
-    0x4c, 0x47, 0x4d, 0x40, 0x55, 0x58, 0x46, 0x46, 0x3f, 0x3e, 0x47, 0x36,
-    0x3f, 0x4d, 0x4b, 0x4d, 0x4f, 0x4f, 0x48, 0x34, 0x4d, 0x46, 0x46, 0x50,
-    0x50, 0x4b, 0x47, 0x45, 0x4e, 0x49, 0x50, 0x4f, 0x4a, 0x48, 0x4f, 0x39,
-    0x53, 0x4c, 0x4b, 0x56, 0x45, 0x4f, 0x55, 0x3a, 0x40, 0x53, 0x43, 0x4b,
-    0x47, 0x3d, 0x4c, 0x34, 0x4b, 0x4e, 0x4a, 0x4b, 0x4d, 0x49, 0x4e, 0x40,
-    0x4d, 0x48, 0x40, 0x4a, 0x4a, 0x4b, 0x4a, 0x42, 0x4c, 0x52, 0x43, 0x42,
-    0x44, 0x3f, 0x4e, 0x42, 0x44, 0x45, 0x40, 0x3d, 0x4b, 0x45, 0x4a, 0x43,
-    0x4b, 0x4b, 0x4e, 0x46, 0x55, 0x43, 0x44, 0x3f, 0x44, 0x43, 0x4b, 0x4b,
-    0x45, 0x51, 0x48, 0x49, 0x3d, 0x44, 0x4a, 0x4a, 0x50, 0x50, 0x47, 0x44,
-    0x4f, 0x3e, 0x3f, 0x43, 0x4c, 0x46, 0x4a, 0x4e, 0x4c, 0x52, 0x48, 0x4e,
-    0x48, 0x46, 0x45, 0x48, 0x41, 0x4f, 0x51, 0x48, 0x40, 0x4d, 0x4a, 0x4b,
-    0x4c, 0x51, 0x49, 0x50, 0x4e, 0x4b, 0x4a, 0x42, 0x49, 0x54, 0x4e, 0x43,
-    0x52, 0x47, 0x4a, 0x41, 0x42, 0x51, 0x48, 0x4a, 0x46, 0x45, 0x4a, 0x43,
-    0x4e, 0x4f, 0x41, 0x49, 0x4b, 0x42, 0x40, 0x4a, 0x50, 0x41, 0x42, 0x3f,
-    0x49, 0x4a, 0x40, 0x3e, 0x3f, 0x42, 0x4d, 0x51, 0x4e, 0x4e, 0x47, 0x41,
-    0x4e, 0x4e, 0x49, 0x4b, 0x41, 0x45, 0x51, 0x40, 0x45, 0x4c, 0x3f, 0x42,
-    0x4c, 0x45, 0x4d, 0x39, 0x46, 0x52, 0x4a, 0x4e, 0x4c, 0x49, 0x4e, 0x43,
-    0x43, 0x4c, 0x48, 0x46, 0x48, 0x49, 0x50, 0x3a, 0x3f, 0x49, 0x42, 0x4f,
-    0x42, 0x4d, 0x4e, 0x3f, 0x51, 0x4b, 0x4e, 0x4b, 0x51, 0x44, 0x43, 0x4a,
-    0x4a, 0x4c, 0x50, 0x48, 0x45, 0x47, 0x4d, 0x41, 0x47, 0x45, 0x51, 0x41,
-    0x42, 0x48, 0x4c, 0x39, 0x51, 0x45, 0x46, 0x53, 0x4b, 0x50, 0x46, 0x45,
-    0x4b, 0x4d, 0x42, 0x4b, 0x3f, 0x45, 0x4b, 0x4e, 0x50, 0x50, 0x47, 0x4a,
-    0x45, 0x40, 0x4b, 0x43, 0x3f, 0x4a, 0x41, 0x42, 0x51, 0x41, 0x4d, 0x42,
-    0x53, 0x48, 0x48, 0x49, 0x4b, 0x40, 0x42, 0x3d, 0x4f, 0x53, 0x49, 0x46,
-    0x46, 0x43, 0x42, 0x44, 0x46, 0x48, 0x3f, 0x46, 0x31, 0x43, 0x4d, 0x4b,
-    0x48, 0x4d, 0x4c, 0x43, 0x45, 0x53, 0x50, 0x40, 0x4a, 0x48, 0x45, 0x3b,
-    0x4f, 0x4d, 0x53, 0x4c, 0x44, 0x54, 0x50, 0x66, 0x3f, 0x45, 0x4c, 0x4c,
-    0x4a, 0x49, 0x49, 0x4a, 0x40, 0x52, 0x3e, 0x4c, 0x49, 0x40, 0x44, 0x49,
-    0x48, 0x3f, 0x45, 0x5b, 0x49, 0x4b, 0x4c, 0x44, 0x50, 0x4e, 0x4a, 0x4a,
-    0x49, 0x4e, 0x4f, 0x47, 0x46, 0x4b, 0x44, 0x3b, 0x4e, 0x4b, 0x48, 0x46,
-    0x45, 0x45, 0x3d, 0x35, 0x4c, 0x49, 0x54, 0x42, 0x51, 0x46, 0x49, 0x2d,
-    0x43, 0x4a, 0x53, 0x49, 0x49, 0x42, 0x4f, 0x40, 0x4e, 0x50, 0x54, 0x51,
-    0x4b, 0x45, 0x48, 0x35, 0x4d, 0x41, 0x51, 0x40, 0x41, 0x49, 0x4a, 0x3b,
-    0x45, 0x50, 0x48, 0x51, 0x51, 0x4d, 0x4c, 0x36, 0x47, 0x4a, 0x44, 0x45,
-    0x4d, 0x47, 0x43, 0x3a, 0x48, 0x40, 0x42, 0x4f, 0x4f, 0x4f, 0x4f, 0x43,
-    0x4a, 0x41, 0x4b, 0x53, 0x43, 0x46, 0x4f, 0x39, 0x46, 0x4a, 0x4d, 0x53,
-    0x41, 0x44, 0x4e, 0x44, 0x3f, 0x47, 0x4c, 0x4d, 0x4d, 0x43, 0x45, 0x3d,
-    0x43, 0x4b, 0x3e, 0x48, 0x42, 0x4c, 0x47, 0x42, 0x42, 0x50, 0x49, 0x4b,
-    0x43, 0x4e, 0x44, 0x44, 0x4c, 0x3d, 0x4c, 0x47, 0x4e, 0x42, 0x4b, 0x44,
-    0x4b, 0x44, 0x3f, 0x49, 0x33, 0x46, 0x4a, 0x4a, 0x42, 0x57, 0x5e, 0x4a,
-    0x46, 0x4f, 0x55, 0x3c, 0x4a, 0x4b, 0x4c, 0x43, 0x51, 0x59, 0x64, 0x51,
-    0x45, 0x60, 0x4b, 0x65, 0x46, 0x4a, 0x4e, 0x49, 0x41, 0x4b, 0x50, 0x5c,
-    0x48, 0x4b, 0x3e, 0x52, 0x4f, 0x2f, 0x4e, 0x4a, 0x45, 0x53, 0x48, 0x59,
-    0x4c, 0x4e, 0x4a, 0x4d, 0x49, 0x40, 0x52, 0x44, 0x49, 0x46, 0x4e, 0x46,
-    0x42, 0x4b, 0x4a, 0x4b, 0x4b, 0x4b, 0x4f, 0x52, 0x46, 0x50, 0x4d, 0x3d,
-    0x46, 0x4b, 0x4b, 0x40, 0x4d, 0x3f, 0x43, 0x33, 0x4e, 0x53, 0x4b, 0x4a,
-    0x45, 0x48, 0x4c, 0x2e, 0x48, 0x4f, 0x49, 0x42, 0x54, 0x4f, 0x4b, 0x2b,
-    0x55, 0x4e, 0x43, 0x4d, 0x4d, 0x47, 0x42, 0x3e, 0x48, 0x48, 0x4d, 0x54,
-    0x52, 0x4f, 0x43, 0x37, 0x4b, 0x42, 0x4b, 0x4e, 0x49, 0x49, 0x4b, 0x2e,
-    0x45, 0x4e, 0x48, 0x4e, 0x44, 0x49, 0x48, 0x30, 0x4c, 0x4b, 0x3f, 0x42,
-    0x4f, 0x4f, 0x4e, 0x38, 0x4f, 0x42, 0x54, 0x49, 0x41, 0x42, 0x45, 0x3a,
-    0x47, 0x43, 0x43, 0x4b, 0x49, 0x40, 0x4d, 0x38, 0x52, 0x4c, 0x3d, 0x4d,
-    0x43, 0x54, 0x4e, 0x41, 0x4a, 0x47, 0x44, 0x51, 0x47, 0x48, 0x41, 0x47,
-    0x4d, 0x41, 0x46, 0x4c, 0x4d, 0x46, 0x51, 0x4a, 0x49, 0x46, 0x4a, 0x42,
-    0x3a, 0x43, 0x4a, 0x4b, 0x43, 0x4c, 0x68, 0x44, 0x4b, 0x52, 0x50, 0x37,
-    0x4d, 0x4c, 0x57, 0x4c, 0x68, 0x62, 0x64, 0x4a, 0x3e, 0x64, 0x4b, 0x66,
-    0x48, 0x4d, 0x54, 0x57, 0x4b, 0x52, 0x49, 0x5c, 0x4d, 0x55, 0x51, 0x57,
-    0x4c, 0x3a, 0x48, 0x43, 0x3b, 0x43, 0x52, 0x5d, 0x45, 0x4e, 0x51, 0x4d,
-    0x4a, 0x55, 0x4e, 0x4c, 0x44, 0x51, 0x4c, 0x4f, 0x41, 0x4f, 0x4a, 0x43,
-    0x53, 0x48, 0x47, 0x49, 0x46, 0x52, 0x48, 0x3e, 0x4b, 0x4e, 0x4a, 0x50,
-    0x4f, 0x47, 0x3e, 0x2e, 0x4b, 0x51, 0x4a, 0x44, 0x4c, 0x49, 0x4f, 0x26,
-    0x48, 0x4f, 0x44, 0x51, 0x48, 0x3f, 0x4c, 0x30, 0x4e, 0x48, 0x4d, 0x48,
-    0x48, 0x44, 0x4b, 0x2f, 0x50, 0x41, 0x4d, 0x50, 0x52, 0x42, 0x45, 0x33,
-    0x4c, 0x48, 0x48, 0x3d, 0x46, 0x41, 0x43, 0x38, 0x45, 0x4f, 0x48, 0x4b,
-    0x41, 0x49, 0x4c, 0x2f, 0x53, 0x4c, 0x48, 0x4a, 0x47, 0x40, 0x4a, 0x31,
-    0x52, 0x40, 0x49, 0x4c, 0x3f, 0x48, 0x48, 0x39, 0x48, 0x3f, 0x45, 0x43,
-    0x40, 0x48, 0x3c, 0x40, 0x4c, 0x48, 0x48, 0x4d, 0x3e, 0x42, 0x4a, 0x3d,
-    0x4c, 0x45, 0x44, 0x46, 0x44, 0x45, 0x4a, 0x47, 0x52, 0x48, 0x4a, 0x4d,
-    0x3f, 0x49, 0x4c, 0x4c, 0x48, 0x44, 0x4c, 0x44, 0x3d, 0x41, 0x47, 0x45,
-    0x43, 0x4a, 0x5a, 0x3f, 0x48, 0x5d, 0x50, 0x35, 0x47, 0x4f, 0x5b, 0x46,
-    0x6e, 0x50, 0x6d, 0x44, 0x49, 0x6a, 0x53, 0x6b, 0x4b, 0x4b, 0x4f, 0x62,
-    0x45, 0x57, 0x48, 0x5b, 0x40, 0x4b, 0x4f, 0x63, 0x48, 0x3a, 0x4b, 0x42,
-    0x43, 0x53, 0x41, 0x5f, 0x54, 0x3e, 0x4d, 0x43, 0x3d, 0x4c, 0x46, 0x46,
-    0x49, 0x56, 0x4b, 0x45, 0x47, 0x45, 0x4e, 0x4f, 0x4c, 0x4d, 0x4f, 0x47,
-    0x49, 0x4b, 0x51, 0x33, 0x4b, 0x45, 0x4d, 0x41, 0x51, 0x4a, 0x43, 0x2a,
-    0x50, 0x4b, 0x4a, 0x4b, 0x4c, 0x52, 0x4c, 0x3b, 0x45, 0x4c, 0x51, 0x44,
-    0x4c, 0x48, 0x43, 0x35, 0x51, 0x50, 0x48, 0x49, 0x3f, 0x48, 0x3d, 0x3b,
-    0x52, 0x3f, 0x42, 0x4b, 0x49, 0x49, 0x47, 0x38, 0x4a, 0x4a, 0x41, 0x52,
-    0x41, 0x3e, 0x4b, 0x2f, 0x46, 0x4d, 0x49, 0x44, 0x46, 0x3b, 0x47, 0x36,
-    0x46, 0x3f, 0x49, 0x48, 0x47, 0x42, 0x42, 0x35, 0x44, 0x4b, 0x4d, 0x56,
-    0x50, 0x49, 0x43, 0x42, 0x4b, 0x3e, 0x53, 0x44, 0x4a, 0x43, 0x47, 0x38,
-    0x4a, 0x45, 0x4d, 0x3f, 0x46, 0x4a, 0x47, 0x3a, 0x4c, 0x3e, 0x47, 0x45,
-    0x46, 0x4b, 0x45, 0x49, 0x4a, 0x4b, 0x54, 0x49, 0x4a, 0x53, 0x4a, 0x4c,
-    0x45, 0x48, 0x53, 0x42, 0x4b, 0x47, 0x4e, 0x50, 0x3d, 0x51, 0x60, 0x3e,
-    0x53, 0x5d, 0x51, 0x30, 0x45, 0x50, 0x59, 0x4e, 0x62, 0x52, 0x68, 0x51,
-    0x45, 0x6c, 0x4c, 0x64, 0x4d, 0x47, 0x55, 0x61, 0x44, 0x57, 0x44, 0x58,
-    0x44, 0x4a, 0x53, 0x58, 0x47, 0x31, 0x3f, 0x4c, 0x43, 0x45, 0x48, 0x5e,
-    0x41, 0x43, 0x3f, 0x43, 0x51, 0x46, 0x48, 0x4b, 0x4d, 0x5b, 0x45, 0x4b,
-    0x48, 0x46, 0x3f, 0x45, 0x47, 0x45, 0x40, 0x4a, 0x51, 0x51, 0x3d, 0x3f,
-    0x43, 0x45, 0x4d, 0x4a, 0x47, 0x50, 0x49, 0x32, 0x4c, 0x5a, 0x55, 0x4f,
-    0x4c, 0x51, 0x43, 0x37, 0x40, 0x59, 0x49, 0x49, 0x4e, 0x4f, 0x47, 0x34,
-    0x40, 0x4c, 0x4a, 0x41, 0x4a, 0x47, 0x4a, 0x42, 0x4e, 0x4a, 0x48, 0x4e,
-    0x4e, 0x4e, 0x45, 0x39, 0x4e, 0x45, 0x45, 0x4e, 0x4c, 0x48, 0x4a, 0x35,
-    0x45, 0x4c, 0x49, 0x4f, 0x51, 0x43, 0x3c, 0x3a, 0x4a, 0x4a, 0x46, 0x48,
-    0x49, 0x42, 0x4e, 0x2f, 0x42, 0x4e, 0x45, 0x50, 0x51, 0x40, 0x45, 0x32,
-    0x4a, 0x4d, 0x44, 0x4e, 0x48, 0x48, 0x47, 0x2f, 0x48, 0x4b, 0x49, 0x44,
-    0x48, 0x4d, 0x46, 0x3b, 0x46, 0x4a, 0x41, 0x4e, 0x4e, 0x47, 0x54, 0x4b,
-    0x45, 0x49, 0x45, 0x44, 0x45, 0x48, 0x4a, 0x46, 0x55, 0x49, 0x47, 0x49,
-    0x4b, 0x42, 0x48, 0x4f, 0x3f, 0x52, 0x60, 0x39, 0x4b, 0x5e, 0x55, 0x2e,
-    0x48, 0x50, 0x59, 0x4f, 0x68, 0x5f, 0x64, 0x4f, 0x3b, 0x71, 0x50, 0x63,
-    0x4f, 0x50, 0x50, 0x6c, 0x4b, 0x55, 0x47, 0x5b, 0x4c, 0x40, 0x48, 0x59,
-    0x4f, 0x2e, 0x4b, 0x4c, 0x4e, 0x4e, 0x46, 0x61, 0x50, 0x41, 0x4c, 0x4a,
-    0x44, 0x3e, 0x3f, 0x47, 0x4b, 0x4f, 0x47, 0x4b, 0x47, 0x3d, 0x41, 0x49,
-    0x49, 0x3f, 0x4d, 0x44, 0x4a, 0x4d, 0x45, 0x41, 0x4d, 0x43, 0x49, 0x3c,
-    0x49, 0x57, 0x49, 0x3b, 0x49, 0x59, 0x3f, 0x4f, 0x4e, 0x49, 0x4e, 0x46,
-    0x52, 0x4e, 0x4c, 0x54, 0x4a, 0x48, 0x48, 0x3a, 0x44, 0x4a, 0x4f, 0x4a,
-    0x44, 0x4b, 0x43, 0x4d, 0x51, 0x42, 0x53, 0x4d, 0x52, 0x41, 0x4d, 0x43,
-    0x4e, 0x54, 0x4b, 0x42, 0x4b, 0x3f, 0x53, 0x45, 0x3f, 0x4a, 0x45, 0x50,
-    0x3f, 0x4c, 0x4f, 0x43, 0x46, 0x42, 0x4b, 0x4d, 0x4c, 0x3b, 0x48, 0x40,
-    0x4e, 0x4e, 0x49, 0x46, 0x4d, 0x4d, 0x52, 0x40, 0x4e, 0x4f, 0x46, 0x4a,
-    0x40, 0x4b, 0x4c, 0x40, 0x4f, 0x4a, 0x44, 0x41, 0x46, 0x3c, 0x40, 0x3d,
-    0x44, 0x48, 0x4a, 0x50, 0x46, 0x53, 0x46, 0x40, 0x44, 0x3e, 0x47, 0x43,
-    0x48, 0x3d, 0x4e, 0x3e, 0x48, 0x49, 0x4b, 0x49, 0x4c, 0x3e, 0x4c, 0x4a,
-    0x46, 0x4e, 0x62, 0x3c, 0x59, 0x60, 0x51, 0x29, 0x47, 0x52, 0x59, 0x4c,
-    0x67, 0x68, 0x68, 0x4e, 0x3b, 0x72, 0x4d, 0x68, 0x44, 0x4f, 0x53, 0x63,
-    0x47, 0x5a, 0x45, 0x4f, 0x4b, 0x37, 0x43, 0x5b, 0x4b, 0x3d, 0x44, 0x41,
-    0x4a, 0x4b, 0x3c, 0x64, 0x48, 0x38, 0x42, 0x3f, 0x48, 0x46, 0x4b, 0x46,
-    0x46, 0x4f, 0x46, 0x46, 0x44, 0x3c, 0x4b, 0x4f, 0x4d, 0x4a, 0x4b, 0x46,
-    0x4d, 0x4f, 0x4f, 0x3f, 0x3a, 0x4b, 0x55, 0x3c, 0x51, 0x56, 0x4d, 0x42,
-    0x52, 0x5a, 0x3e, 0x4b, 0x54, 0x57, 0x4e, 0x4d, 0x4e, 0x5b, 0x4e, 0x49,
-    0x4e, 0x3c, 0x40, 0x41, 0x40, 0x4d, 0x48, 0x42, 0x49, 0x4e, 0x4f, 0x47,
-    0x47, 0x48, 0x50, 0x49, 0x51, 0x46, 0x44, 0x45, 0x49, 0x46, 0x43, 0x48,
-    0x48, 0x49, 0x4d, 0x4c, 0x45, 0x4f, 0x4c, 0x45, 0x44, 0x40, 0x49, 0x45,
-    0x49, 0x51, 0x4b, 0x4b, 0x50, 0x4b, 0x48, 0x3d, 0x4e, 0x52, 0x4a, 0x47,
-    0x49, 0x41, 0x55, 0x3d, 0x48, 0x4d, 0x49, 0x48, 0x4e, 0x4c, 0x48, 0x3d,
-    0x3f, 0x4c, 0x4e, 0x53, 0x3e, 0x48, 0x4a, 0x3f, 0x54, 0x4d, 0x54, 0x4b,
-    0x47, 0x4e, 0x44, 0x48, 0x49, 0x4b, 0x4c, 0x49, 0x4d, 0x42, 0x52, 0x4b,
-    0x40, 0x3e, 0x54, 0x49, 0x55, 0x45, 0x47, 0x4d, 0x45, 0x5c, 0x60, 0x40,
-    0x57, 0x60, 0x5b, 0x27, 0x4a, 0x5a, 0x64, 0x53, 0x6a, 0x5a, 0x5f, 0x52,
-    0x3a, 0x72, 0x4b, 0x5f, 0x45, 0x56, 0x5f, 0x5f, 0x54, 0x5f, 0x39, 0x52,
-    0x51, 0x3e, 0x3b, 0x5a, 0x44, 0x32, 0x46, 0x50, 0x3a, 0x4f, 0x44, 0x5d,
-    0x4c, 0x41, 0x39, 0x3f, 0x45, 0x46, 0x3b, 0x43, 0x46, 0x51, 0x3c, 0x4c,
-    0x4b, 0x43, 0x4b, 0x51, 0x43, 0x48, 0x4d, 0x43, 0x38, 0x46, 0x46, 0x43,
-    0x44, 0x4a, 0x46, 0x49, 0x48, 0x50, 0x4e, 0x4a, 0x4e, 0x58, 0x4a, 0x49,
-    0x48, 0x4f, 0x4a, 0x49, 0x41, 0x57, 0x51, 0x50, 0x4b, 0x48, 0x47, 0x4b,
-    0x53, 0x3d, 0x4b, 0x4c, 0x4b, 0x4b, 0x55, 0x56, 0x45, 0x49, 0x46, 0x4c,
-    0x45, 0x51, 0x47, 0x50, 0x40, 0x4b, 0x4f, 0x4b, 0x4d, 0x4a, 0x4f, 0x50,
-    0x49, 0x53, 0x50, 0x46, 0x40, 0x48, 0x4a, 0x4a, 0x49, 0x4a, 0x42, 0x45,
-    0x4b, 0x45, 0x42, 0x45, 0x4e, 0x4e, 0x44, 0x41, 0x4b, 0x4a, 0x49, 0x3f,
-    0x41, 0x51, 0x48, 0x4c, 0x40, 0x41, 0x51, 0x42, 0x49, 0x49, 0x48, 0x42,
-    0x48, 0x4c, 0x4b, 0x3c, 0x49, 0x45, 0x42, 0x49, 0x4c, 0x46, 0x45, 0x43,
-    0x43, 0x48, 0x48, 0x41, 0x43, 0x42, 0x4c, 0x4b, 0x40, 0x45, 0x44, 0x46,
-    0x4c, 0x4b, 0x4e, 0x4d, 0x3f, 0x59, 0x55, 0x41, 0x56, 0x5a, 0x51, 0x30,
-    0x49, 0x5a, 0x63, 0x4d, 0x61, 0x5b, 0x64, 0x55, 0x34, 0x7a, 0x4c, 0x62,
-    0x3e, 0x5d, 0x56, 0x60, 0x48, 0x61, 0x3f, 0x54, 0x46, 0x40, 0x42, 0x56,
-    0x52, 0x35, 0x4c, 0x59, 0x45, 0x4c, 0x42, 0x60, 0x49, 0x3f, 0x4c, 0x3c,
-    0x52, 0x36, 0x46, 0x3d, 0x58, 0x4b, 0x41, 0x48, 0x3e, 0x45, 0x4e, 0x54,
-    0x4c, 0x56, 0x47, 0x44, 0x39, 0x4a, 0x4a, 0x4a, 0x46, 0x48, 0x4a, 0x48,
-    0x51, 0x4f, 0x4b, 0x49, 0x45, 0x4b, 0x44, 0x4c, 0x3e, 0x4c, 0x42, 0x59,
-    0x47, 0x55, 0x47, 0x47, 0x41, 0x44, 0x44, 0x4a, 0x44, 0x4b, 0x44, 0x46,
-    0x49, 0x5a, 0x48, 0x5d, 0x4f, 0x4a, 0x47, 0x50, 0x48, 0x4e, 0x44, 0x57,
-    0x49, 0x46, 0x42, 0x4d, 0x3d, 0x4a, 0x4a, 0x58, 0x41, 0x4d, 0x3c, 0x47,
-    0x42, 0x4e, 0x4d, 0x49, 0x44, 0x4b, 0x4c, 0x4b, 0x53, 0x42, 0x4a, 0x46,
-    0x4e, 0x56, 0x4b, 0x47, 0x50, 0x43, 0x4f, 0x48, 0x49, 0x50, 0x48, 0x50,
-    0x42, 0x4c, 0x4e, 0x3c, 0x41, 0x4f, 0x4a, 0x41, 0x44, 0x47, 0x4c, 0x42,
-    0x51, 0x4f, 0x53, 0x46, 0x4c, 0x4b, 0x48, 0x51, 0x47, 0x4b, 0x4c, 0x4d,
-    0x4d, 0x49, 0x3d, 0x44, 0x4b, 0x42, 0x43, 0x49, 0x51, 0x47, 0x4c, 0x4b,
-    0x4a, 0x50, 0x5b, 0x43, 0x5b, 0x68, 0x54, 0x31, 0x4c, 0x5d, 0x5c, 0x54,
-    0x63, 0x5a, 0x61, 0x54, 0x3d, 0x7a, 0x51, 0x5b, 0x40, 0x59, 0x5a, 0x62,
-    0x4c, 0x5e, 0x42, 0x58, 0x49, 0x3c, 0x38, 0x50, 0x54, 0x37, 0x42, 0x51,
-    0x4d, 0x4f, 0x42, 0x68, 0x4a, 0x40, 0x4e, 0x40, 0x3f, 0x3e, 0x3f, 0x40,
-    0x54, 0x52, 0x3e, 0x43, 0x46, 0x4a, 0x48, 0x51, 0x4e, 0x4d, 0x42, 0x47,
-    0x3f, 0x51, 0x47, 0x44, 0x3f, 0x4c, 0x46, 0x47, 0x4f, 0x55, 0x4b, 0x4e,
-    0x4c, 0x51, 0x40, 0x51, 0x47, 0x4a, 0x44, 0x5c, 0x48, 0x54, 0x4b, 0x46,
-    0x49, 0x4b, 0x53, 0x59, 0x43, 0x3e, 0x45, 0x4e, 0x4f, 0x58, 0x4b, 0x64,
-    0x41, 0x4b, 0x45, 0x4a, 0x4c, 0x51, 0x47, 0x57, 0x45, 0x46, 0x43, 0x4f,
-    0x4d, 0x4d, 0x49, 0x58, 0x4b, 0x52, 0x43, 0x4b, 0x45, 0x4c, 0x50, 0x4c,
-    0x4e, 0x4b, 0x40, 0x4c, 0x44, 0x4e, 0x4c, 0x47, 0x41, 0x55, 0x45, 0x4a,
-    0x4c, 0x48, 0x46, 0x41, 0x47, 0x52, 0x44, 0x4f, 0x48, 0x49, 0x4b, 0x47,
-    0x50, 0x4f, 0x42, 0x4a, 0x44, 0x4b, 0x52, 0x43, 0x45, 0x4e, 0x46, 0x49,
-    0x45, 0x52, 0x51, 0x45, 0x44, 0x41, 0x4c, 0x46, 0x4c, 0x4b, 0x44, 0x4d,
-    0x4f, 0x48, 0x44, 0x4d, 0x56, 0x48, 0x50, 0x4f, 0x3b, 0x4e, 0x55, 0x43,
-    0x52, 0x62, 0x57, 0x2c, 0x4d, 0x5e, 0x5e, 0x50, 0x64, 0x5b, 0x6a, 0x55,
-    0x39, 0x7d, 0x4b, 0x5e, 0x43, 0x54, 0x5d, 0x5c, 0x4d, 0x5c, 0x42, 0x51,
-    0x4c, 0x3d, 0x46, 0x51, 0x4c, 0x2a, 0x3e, 0x54, 0x47, 0x48, 0x46, 0x64,
-    0x42, 0x3d, 0x47, 0x3f, 0x42, 0x45, 0x49, 0x3b, 0x59, 0x50, 0x4c, 0x46,
-    0x4d, 0x44, 0x47, 0x4d, 0x4a, 0x50, 0x41, 0x48, 0x43, 0x50, 0x3e, 0x44,
-    0x4b, 0x53, 0x48, 0x49, 0x51, 0x51, 0x4d, 0x57, 0x49, 0x4f, 0x53, 0x50,
-    0x46, 0x4f, 0x41, 0x5d, 0x47, 0x46, 0x49, 0x51, 0x45, 0x41, 0x4a, 0x56,
-    0x4f, 0x4e, 0x4d, 0x4a, 0x3e, 0x55, 0x47, 0x65, 0x48, 0x51, 0x4d, 0x4e,
-    0x46, 0x43, 0x48, 0x5b, 0x48, 0x4f, 0x4f, 0x48, 0x4b, 0x4d, 0x4e, 0x5c,
-    0x4f, 0x4c, 0x54, 0x48, 0x4a, 0x4d, 0x4e, 0x4e, 0x44, 0x48, 0x43, 0x52,
-    0x41, 0x52, 0x48, 0x4f, 0x46, 0x4f, 0x51, 0x41, 0x44, 0x45, 0x41, 0x4b,
-    0x43, 0x4e, 0x4e, 0x42, 0x48, 0x41, 0x45, 0x43, 0x44, 0x43, 0x4c, 0x4c,
-    0x51, 0x54, 0x4c, 0x32, 0x46, 0x52, 0x4e, 0x49, 0x40, 0x4d, 0x43, 0x4f,
-    0x4a, 0x4d, 0x4d, 0x49, 0x46, 0x4c, 0x41, 0x4d, 0x41, 0x3a, 0x50, 0x4c,
-    0x5a, 0x4e, 0x49, 0x53, 0x4d, 0x53, 0x53, 0x3d, 0x52, 0x64, 0x55, 0x2a,
-    0x47, 0x5d, 0x61, 0x51, 0x5b, 0x5d, 0x66, 0x52, 0x3f, 0xfd, 0x55, 0x5a,
-    0x4b, 0x54, 0x5b, 0x60, 0x49, 0x5d, 0x43, 0x57, 0x47, 0x41, 0x45, 0x5e,
-    0x4c, 0x28, 0x3e, 0x40, 0x49, 0x4e, 0x40, 0x69, 0x4a, 0x44, 0x45, 0x43,
-    0x45, 0x3d, 0x39, 0x40, 0x4c, 0x53, 0x4b, 0x3d, 0x4e, 0x43, 0x48, 0x55,
-    0x4d, 0x50, 0x4d, 0x49, 0x4f, 0x48, 0x3e, 0x46, 0x47, 0x56, 0x40, 0x48,
-    0x46, 0x53, 0x50, 0x5d, 0x43, 0x54, 0x49, 0x47, 0x49, 0x4c, 0x48, 0x5d,
-    0x49, 0x51, 0x50, 0x3d, 0x41, 0x47, 0x48, 0x64, 0x4b, 0x44, 0x49, 0x41,
-    0x54, 0x48, 0x3d, 0x6b, 0x4c, 0x5a, 0x48, 0x4e, 0x40, 0x4c, 0x52, 0x5f,
-    0x54, 0x4a, 0x3f, 0x48, 0x43, 0x43, 0x44, 0x66, 0x49, 0x47, 0x43, 0x46,
-    0x47, 0x54, 0x42, 0x54, 0x4b, 0x4e, 0x49, 0x49, 0x49, 0x4b, 0x52, 0x4f,
-    0x43, 0x46, 0x4b, 0x49, 0x54, 0x4b, 0x40, 0x48, 0x47, 0x4a, 0x46, 0x47,
-    0x44, 0x47, 0x4c, 0x37, 0x3f, 0x49, 0x45, 0x44, 0x50, 0x49, 0x44, 0x36,
-    0x4d, 0x40, 0x45, 0x49, 0x53, 0x55, 0x44, 0x42, 0x47, 0x48, 0x46, 0x40,
-    0x4f, 0x4c, 0x41, 0x42, 0x52, 0x3a, 0x43, 0x46, 0x55, 0x51, 0x4e, 0x4f,
-    0x48, 0x51, 0x55, 0x48, 0x52, 0x66, 0x4e, 0x33, 0x49, 0x5b, 0x5f, 0x4b,
-    0x5f, 0x5b, 0x66, 0x52, 0x41, 0x7c, 0x4a, 0x59, 0x47, 0x59, 0x58, 0x67,
-    0x49, 0x5e, 0x44, 0x57, 0x49, 0x4c, 0x43, 0x56, 0x41, 0x27, 0x4c, 0x44,
-    0x51, 0x44, 0x42, 0x65, 0x49, 0x44, 0x40, 0x3d, 0x4d, 0x3e, 0x4c, 0x3c,
-    0x4f, 0x4b, 0x45, 0x44, 0x4d, 0x48, 0x47, 0x54, 0x4d, 0x4e, 0x44, 0x42,
-    0x47, 0x44, 0x3d, 0x49, 0x4e, 0x50, 0x49, 0x45, 0x58, 0x4a, 0x54, 0x5c,
-    0x41, 0x49, 0x4f, 0x42, 0x44, 0x4f, 0x4a, 0x62, 0x48, 0x50, 0x48, 0x43,
-    0x51, 0x53, 0x47, 0x6c, 0x40, 0x46, 0x3d, 0x46, 0x4a, 0x50, 0x43, 0x69,
-    0x49, 0x4f, 0x4a, 0x4c, 0x49, 0x46, 0x43, 0x6a, 0x48, 0x50, 0x49, 0x48,
-    0x48, 0x51, 0x4b, 0x65, 0x42, 0x4b, 0x4d, 0x48, 0x44, 0x4e, 0x49, 0x60,
-    0x44, 0x52, 0x42, 0x42, 0x47, 0x48, 0x4b, 0x51, 0x50, 0x4b, 0x3c, 0x4d,
-    0x4c, 0x44, 0x48, 0x55, 0x51, 0x4c, 0x55, 0x4e, 0x52, 0x4c, 0x4b, 0x39,
-    0x48, 0x42, 0x49, 0x49, 0x49, 0x50, 0x49, 0x32, 0x4e, 0x4b, 0x45, 0x4f,
-    0x42, 0x4b, 0x47, 0x50, 0x48, 0x45, 0x54, 0x49, 0x4c, 0x46, 0x40, 0x46,
-    0x43, 0x3d, 0x51, 0x44, 0x53, 0x4f, 0x54, 0x55, 0x43, 0x4f, 0x5b, 0x47,
-    0x53, 0x6c, 0x57, 0x2e, 0x50, 0x55, 0x5a, 0x4d, 0x57, 0x5d, 0x70, 0x50,
-    0x3f, 0x79, 0x4a, 0x5a, 0x4c, 0x58, 0x59, 0x63, 0x45, 0x69, 0x48, 0x58,
-    0x42, 0x4b, 0x43, 0x5c, 0x46, 0x28, 0x48, 0x49, 0x4c, 0x3f, 0x45, 0x58,
-    0x45, 0x44, 0x47, 0x40, 0x4c, 0x42, 0x3e, 0x37, 0x45, 0x54, 0x48, 0x3b,
-    0x4e, 0x48, 0x43, 0x4a, 0x50, 0x4a, 0x49, 0x46, 0x4c, 0x54, 0x3f, 0x4b,
-    0x4e, 0x56, 0x48, 0x49, 0x49, 0x4c, 0x51, 0x5f, 0x4d, 0x4b, 0x43, 0x4d,
-    0x47, 0x51, 0x43, 0x59, 0x45, 0x4e, 0x4f, 0x45, 0x44, 0x54, 0x44, 0x6d,
-    0x47, 0x51, 0x43, 0x4e, 0x4c, 0x4f, 0x43, 0x6d, 0x48, 0x53, 0x4b, 0x47,
-    0x49, 0x48, 0x46, 0x6a, 0x51, 0x4c, 0x4d, 0x45, 0x4e, 0x47, 0x46, 0x62,
-    0x4a, 0x54, 0x51, 0x4c, 0x47, 0x4d, 0x4a, 0x61, 0x3d, 0x50, 0x4c, 0x4c,
-    0x45, 0x3f, 0x3e, 0x54, 0x3d, 0x53, 0x48, 0x47, 0x52, 0x4b, 0x47, 0x51,
-    0x4f, 0x45, 0x4b, 0x4a, 0x4c, 0x46, 0x44, 0x37, 0x42, 0x50, 0x49, 0x4f,
-    0x51, 0x41, 0x44, 0x38, 0x54, 0x40, 0x51, 0x52, 0x3e, 0x43, 0x44, 0x47,
-    0x49, 0x4b, 0x4b, 0x46, 0x53, 0x54, 0x55, 0x4b, 0x4a, 0x37, 0x43, 0x4a,
-    0x51, 0x47, 0x51, 0x54, 0x43, 0x46, 0x56, 0x3d, 0x54, 0x66, 0x4f, 0x30,
-    0x45, 0x52, 0x5a, 0x43, 0x5c, 0x65, 0x5d, 0x52, 0x32, 0x77, 0x53, 0x5f,
-    0x4a, 0x5a, 0x4f, 0x5e, 0x4e, 0x61, 0x4b, 0x5b, 0x4a, 0x53, 0x3e, 0x61,
-    0x47, 0x24, 0x3e, 0x48, 0x4d, 0x43, 0x40, 0x53, 0x4e, 0x41, 0x43, 0x3d,
-    0x50, 0x49, 0x41, 0x3a, 0x4e, 0x4b, 0x48, 0x49, 0x48, 0x49, 0x46, 0x50,
-    0x4f, 0x4b, 0x47, 0x4b, 0x48, 0x52, 0x3e, 0x4d, 0x4d, 0x59, 0x4c, 0x3e,
-    0x52, 0x49, 0x4f, 0x5e, 0x54, 0x59, 0x47, 0x4d, 0x40, 0x4c, 0x4b, 0x64,
-    0x42, 0x4c, 0x53, 0x46, 0x4e, 0x50, 0x46, 0x6a, 0x41, 0x59, 0x44, 0x4b,
-    0x4f, 0x44, 0x52, 0x6c, 0x54, 0x4e, 0x46, 0x48, 0x42, 0x3d, 0x44, 0x67,
-    0x44, 0x4f, 0x47, 0x54, 0x4c, 0x4f, 0x43, 0x61, 0x4c, 0x54, 0x4f, 0x43,
-    0x49, 0x40, 0x4a, 0x5f, 0x4a, 0x52, 0x47, 0x43, 0x4c, 0x43, 0x49, 0x53,
-    0x4c, 0x4b, 0x43, 0x3d, 0x4e, 0x45, 0x49, 0x50, 0x44, 0x53, 0x4f, 0x48,
-    0x4b, 0x46, 0x44, 0x3c, 0x50, 0x42, 0x43, 0x40, 0x47, 0x43, 0x42, 0x34,
-    0x47, 0x42, 0x3f, 0x4a, 0x48, 0x42, 0x48, 0x4c, 0x42, 0x4c, 0x4e, 0x47,
-    0x48, 0x47, 0x51, 0x51, 0x4d, 0x3d, 0x3e, 0x4b, 0x54, 0x4c, 0x4c, 0x59,
-    0x4f, 0x50, 0x57, 0x3c, 0x54, 0x62, 0x54, 0x35, 0x3d, 0x5a, 0x5b, 0x47,
-    0x59, 0x63, 0x66, 0x4d, 0x3c, 0x79, 0x50, 0x5f, 0x45, 0x58, 0x4e, 0x5d,
-    0x48, 0x61, 0x43, 0x54, 0x47, 0x54, 0x4d, 0x54, 0x4b, 0x25, 0x41, 0x44,
-    0x4c, 0x4a, 0x3b, 0x52, 0x47, 0x3c, 0x45, 0x3c, 0x53, 0x44, 0x44, 0x40,
-    0x50, 0x4c, 0x45, 0x3a, 0x4c, 0x51, 0x44, 0x49, 0x4d, 0x52, 0x4d, 0x4b,
-    0x45, 0x52, 0x3d, 0x50, 0x4a, 0x58, 0x4a, 0x47, 0x4d, 0x47, 0x4e, 0x52,
-    0x4f, 0x4d, 0x4f, 0x49, 0x52, 0x52, 0x4c, 0x5e, 0x47, 0x4d, 0x46, 0x4d,
-    0x4c, 0x48, 0x50, 0x70, 0x41, 0x4a, 0x48, 0x3d, 0x45, 0x48, 0x45, 0x74,
-    0x47, 0x4c, 0x43, 0x4f, 0x4a, 0x4a, 0x40, 0x68, 0x52, 0x49, 0x3e, 0x3e,
-    0x4e, 0x4b, 0x4b, 0x69, 0x42, 0x4f, 0x45, 0x47, 0x3f, 0x45, 0x46, 0x56,
-    0x45, 0x4a, 0x47, 0x44, 0x52, 0x4b, 0x53, 0x4e, 0x4e, 0x46, 0x45, 0x40,
-    0x47, 0x4b, 0x53, 0x52, 0x53, 0x51, 0x4f, 0x46, 0x42, 0x43, 0x50, 0x3e,
-    0x48, 0x4e, 0x41, 0x53, 0x4d, 0x48, 0x48, 0x33, 0x40, 0x43, 0x4b, 0x42,
-    0x52, 0x4c, 0x42, 0x4e, 0x41, 0x4e, 0x4f, 0x50, 0x43, 0x49, 0x4d, 0x47,
-    0x4a, 0x3a, 0x3f, 0x51, 0x51, 0x44, 0x4e, 0x54, 0x40, 0x55, 0x59, 0x3c,
-    0x57, 0x67, 0x4e, 0x2e, 0x4c, 0x5b, 0x5b, 0x51, 0x58, 0x63, 0x62, 0x52,
-    0x3c, 0x72, 0x51, 0x5a, 0x4e, 0x53, 0x4a, 0x5c, 0x51, 0x69, 0x42, 0x51,
-    0x48, 0x54, 0x48, 0x57, 0x3e, 0x37, 0x3f, 0x4d, 0x4d, 0x4a, 0x35, 0x57,
-    0x4e, 0x40, 0x45, 0x4a, 0x45, 0x4e, 0x49, 0x40, 0x49, 0x53, 0x51, 0x44,
-    0x4a, 0x50, 0x4b, 0x4b, 0x50, 0x4f, 0x3e, 0x44, 0x45, 0x44, 0x4c, 0x51,
-    0x47, 0x51, 0x46, 0x42, 0x48, 0x50, 0x49, 0x4d, 0x43, 0x54, 0x52, 0x4d,
-    0x4e, 0x4f, 0x3f, 0x63, 0x54, 0x57, 0x41, 0x44, 0x4e, 0x50, 0x4e, 0x66,
-    0x41, 0x53, 0x4b, 0x4d, 0x4e, 0x4f, 0x43, 0x6d, 0x4e, 0x51, 0x49, 0x4f,
-    0x49, 0x4a, 0x4a, 0x6c, 0x4b, 0x4f, 0x3d, 0x47, 0x4d, 0x51, 0x3c, 0x66,
-    0x4b, 0x56, 0x3e, 0x4c, 0x41, 0x46, 0x45, 0x68, 0x47, 0x4b, 0x4a, 0x54,
-    0x53, 0x48, 0x51, 0x59, 0x45, 0x43, 0x50, 0x45, 0x4f, 0x45, 0x42, 0x55,
-    0x48, 0x52, 0x4c, 0x46, 0x52, 0x49, 0x47, 0x3d, 0x55, 0x48, 0x52, 0x52,
-    0x40, 0x4e, 0x47, 0x31, 0x45, 0x4f, 0x42, 0x4a, 0x4e, 0x50, 0x42, 0x4a,
-    0x49, 0x57, 0x46, 0x4b, 0x45, 0x4e, 0x4d, 0x46, 0x47, 0x43, 0x50, 0x4e,
-    0x4f, 0x4c, 0x53, 0x55, 0x45, 0x51, 0x5b, 0x3a, 0x52, 0x64, 0x54, 0x2d,
-    0x42, 0x59, 0x59, 0x45, 0x59, 0x67, 0x69, 0x53, 0x3f, 0x78, 0x50, 0x60,
-    0x4c, 0x4c, 0x5b, 0x53, 0x45, 0x63, 0x49, 0x63, 0x51, 0x4c, 0x41, 0x4e,
-    0x4b, 0x37, 0x45, 0x4e, 0x48, 0x4c, 0x39, 0x55, 0x44, 0x37, 0x3c, 0x49,
-    0x44, 0x56, 0x3e, 0x40, 0x4d, 0x45, 0x4c, 0x43, 0x42, 0x41, 0x40, 0x42,
-    0x57, 0x4f, 0x43, 0x3f, 0x52, 0x53, 0x51, 0x4b, 0x4b, 0x55, 0x46, 0x40,
-    0x49, 0x45, 0x40, 0x4f, 0x47, 0x58, 0x4b, 0x53, 0x4e, 0x52, 0x54, 0x5e,
-    0x4b, 0x51, 0x50, 0x44, 0x50, 0x4b, 0x4f, 0x70, 0x49, 0x4f, 0x4c, 0x50,
-    0x45, 0x56, 0x4b, 0x6b, 0x49, 0x52, 0x4a, 0x3f, 0x44, 0x4b, 0x48, 0x72,
-    0x4c, 0x47, 0x4e, 0x43, 0x46, 0x4c, 0x4f, 0x61, 0x4a, 0x52, 0x52, 0x46,
-    0x4a, 0x4d, 0x46, 0x65, 0x48, 0x4e, 0x4d, 0x4e, 0x46, 0x4e, 0x53, 0x59,
-    0x43, 0x49, 0x43, 0x47, 0x45, 0x47, 0x53, 0x50, 0x3e, 0x4d, 0x41, 0x46,
-    0x4c, 0x4a, 0x4c, 0x35, 0x3f, 0x4f, 0x50, 0x48, 0x47, 0x4d, 0x4c, 0x32,
-    0x45, 0x53, 0x43, 0x4d, 0x4e, 0x4a, 0x3e, 0x4b, 0x55, 0x4f, 0x53, 0x4c,
-    0x4a, 0x4d, 0x48, 0x53, 0x4f, 0x3a, 0x47, 0x4b, 0x4e, 0x4e, 0x51, 0x59,
-    0x41, 0x50, 0x57, 0x38, 0x5d, 0x63, 0x59, 0x2b, 0x45, 0x53, 0x5a, 0x4e,
-    0x5c, 0x60, 0x5e, 0x4c, 0x41, 0x6f, 0x53, 0x5c, 0x48, 0x53, 0x56, 0x54,
-    0x4b, 0x62, 0x46, 0x63, 0x47, 0x4e, 0x40, 0x51, 0x43, 0x36, 0x44, 0x42,
-    0x46, 0x51, 0x41, 0x54, 0x4e, 0x36, 0x40, 0x4b, 0x55, 0x49, 0x40, 0x3f,
-    0x4b, 0x42, 0x4a, 0x4a, 0x48, 0x47, 0x40, 0x43, 0x4d, 0x4f, 0x55, 0x3f,
-    0x53, 0x42, 0x4d, 0x56, 0x49, 0x51, 0x4f, 0x41, 0x3b, 0x48, 0x43, 0x4e,
-    0x4b, 0x5c, 0x4f, 0x45, 0x4a, 0x4c, 0x46, 0x66, 0x43, 0x45, 0x46, 0x48,
-    0x4f, 0x4e, 0x40, 0x71, 0x4b, 0x4e, 0x3e, 0x42, 0x4d, 0x52, 0x42, 0x71,
-    0x4c, 0x54, 0x4f, 0x3f, 0x4c, 0x43, 0x4a, 0x73, 0x48, 0x48, 0x4c, 0x4b,
-    0x4c, 0x4d, 0x40, 0x72, 0x3e, 0x51, 0x49, 0x48, 0x52, 0x53, 0x45, 0x65,
-    0x52, 0x4e, 0x4f, 0x44, 0x4c, 0x43, 0x4a, 0x5e, 0x3e, 0x56, 0x46, 0x55,
-    0x55, 0x43, 0x49, 0x51, 0x4f, 0x52, 0x49, 0x4d, 0x46, 0x47, 0x49, 0x3e,
-    0x51, 0x49, 0x41, 0x53, 0x42, 0x47, 0x46, 0x3b, 0x4d, 0x4e, 0x48, 0x44,
-    0x42, 0x48, 0x4c, 0x47, 0x42, 0x4e, 0x4a, 0x3e, 0x44, 0x54, 0x4a, 0x4d,
-    0x49, 0x41, 0x41, 0x53, 0x52, 0x4c, 0x4c, 0x56, 0x49, 0x4a, 0x5a, 0x3f,
-    0x5b, 0x5c, 0x59, 0x2f, 0x49, 0x52, 0x5a, 0x4e, 0x5a, 0x61, 0x67, 0x4c,
-    0x41, 0x6f, 0x5a, 0x5a, 0x40, 0x5a, 0x54, 0x4e, 0x49, 0x66, 0x45, 0x5a,
-    0x4a, 0x45, 0x44, 0x4b, 0x44, 0x36, 0x41, 0x4c, 0x45, 0x44, 0x3d, 0x51,
-    0x3f, 0x35, 0x3c, 0x46, 0x53, 0x5c, 0x3f, 0x3e, 0x50, 0x43, 0x46, 0x4b,
-    0x40, 0x54, 0x41, 0x47, 0x4b, 0x51, 0x41, 0x46, 0x4a, 0x4d, 0x51, 0x52,
-    0x43, 0x58, 0x45, 0x46, 0x4e, 0x46, 0x4a, 0x4b, 0x44, 0x54, 0x4c, 0x4c,
-    0x43, 0x59, 0x48, 0x61, 0x4e, 0x4f, 0x4d, 0x4d, 0x4a, 0x52, 0x4c, 0x6e,
-    0x49, 0x57, 0x48, 0x4d, 0x46, 0x46, 0x4d, 0x72, 0x4a, 0x4e, 0x47, 0x44,
-    0x49, 0x4f, 0x48, 0x73, 0x42, 0x40, 0x4d, 0x44, 0x4d, 0x57, 0x3e, 0x69,
-    0x50, 0x52, 0x4c, 0x55, 0x46, 0x4c, 0x44, 0x5f, 0x4b, 0x4d, 0x55, 0x4c,
-    0x48, 0x49, 0x4a, 0x5e, 0x47, 0x4b, 0x45, 0x53, 0x55, 0x53, 0x4d, 0x53,
-    0x47, 0x5c, 0x45, 0x4e, 0x4e, 0x52, 0x4c, 0x39, 0x4b, 0x4c, 0x49, 0x46,
-    0x4a, 0x4e, 0x4b, 0x33, 0x46, 0x47, 0x52, 0x41, 0x49, 0x4b, 0x4c, 0x48,
-    0x51, 0x53, 0x44, 0x4c, 0x4a, 0x45, 0x46, 0x49, 0x49, 0x4b, 0x50, 0x47,
-    0x4d, 0x4b, 0x4c, 0x4f, 0x44, 0x45, 0x58, 0x3c, 0x56, 0x5a, 0x56, 0x23,
-    0x4f, 0x4d, 0x5c, 0x4e, 0x59, 0x5a, 0x65, 0x43, 0x45, 0x66, 0x54, 0x5f,
-    0x45, 0x5e, 0x54, 0x4f, 0x48, 0x5f, 0x44, 0x59, 0x48, 0x46, 0x47, 0x49,
-    0x4d, 0x3c, 0x49, 0x54, 0x3e, 0x48, 0x43, 0x5b, 0x4a, 0x35, 0x41, 0x43,
-    0x4b, 0x55, 0x43, 0x38, 0x46, 0x42, 0x4a, 0x4e, 0x54, 0x4b, 0x4d, 0x46,
-    0x43, 0x4e, 0x44, 0x47, 0x56, 0x4c, 0x51, 0x57, 0x41, 0x4d, 0x43, 0x41,
-    0x51, 0x47, 0x41, 0x51, 0x51, 0x4f, 0x46, 0x50, 0x52, 0x4e, 0x4d, 0x60,
-    0x41, 0x49, 0x46, 0x50, 0x48, 0x56, 0x42, 0x6d, 0x40, 0x45, 0x44, 0x55,
-    0x40, 0x4e, 0x40, 0x7c, 0x47, 0x5a, 0x44, 0x44, 0x45, 0x56, 0x55, 0x71,
-    0x47, 0x4b, 0x4b, 0x45, 0x4f, 0x54, 0x4c, 0x73, 0x48, 0x55, 0x44, 0x4d,
-    0x4a, 0x47, 0x49, 0x5e, 0x4d, 0x52, 0x4e, 0x4c, 0x48, 0x52, 0x48, 0x58,
-    0x4c, 0x5a, 0x49, 0x4b, 0x53, 0x46, 0x4d, 0x4b, 0x48, 0x53, 0x41, 0x49,
-    0x4a, 0x56, 0x51, 0x3a, 0x4c, 0x4e, 0x4f, 0x51, 0x4c, 0x59, 0x47, 0x45,
-    0x4f, 0x50, 0x4a, 0x4f, 0x4d, 0x3f, 0x44, 0x4e, 0x42, 0x4a, 0x4a, 0x43,
-    0x46, 0x4e, 0x4c, 0x4f, 0x47, 0x47, 0x4c, 0x4b, 0x52, 0x50, 0x50, 0x4b,
-    0x42, 0x45, 0x54, 0x44, 0x54, 0x59, 0x4c, 0x2b, 0x4d, 0x4c, 0x55, 0x4e,
-    0x5c, 0x5b, 0x5a, 0x42, 0x47, 0x5e, 0x56, 0x59, 0x47, 0x65, 0x55, 0x4c,
-    0x4c, 0x59, 0x42, 0x5a, 0x4e, 0x46, 0x4e, 0x4b, 0x53, 0x46, 0x49, 0x56,
-    0x48, 0x58, 0x4b, 0x4f, 0x45, 0x38, 0x40, 0x44, 0x49, 0x51, 0x4a, 0x3b,
-    0x53, 0x40, 0x40, 0x48, 0x51, 0x49, 0x44, 0x46, 0x52, 0x4b, 0x4e, 0x45,
-    0x48, 0x5a, 0x4e, 0x57, 0x44, 0x53, 0x49, 0x40, 0x4c, 0x47, 0x41, 0x4f,
-    0x49, 0x55, 0x46, 0x50, 0x57, 0x5b, 0x48, 0x66, 0x50, 0x49, 0x51, 0x55,
-    0x55, 0x4f, 0x47, 0x72, 0x49, 0x4f, 0x41, 0x4c, 0x49, 0x42, 0x48, 0x75,
-    0x4a, 0x55, 0x45, 0x4a, 0x41, 0x51, 0x41, 0x70, 0x47, 0x49, 0x42, 0x52,
-    0x4f, 0x47, 0x46, 0x63, 0x4f, 0x53, 0x46, 0x4f, 0x49, 0x53, 0x52, 0x63,
-    0x4c, 0x59, 0x46, 0x41, 0x49, 0x51, 0x3e, 0x53, 0x45, 0x52, 0x51, 0x40,
-    0x4f, 0x4c, 0x41, 0x4c, 0x47, 0x4a, 0x46, 0x47, 0x53, 0x47, 0x48, 0x39,
-    0x53, 0x4b, 0x46, 0x4b, 0x50, 0x4c, 0x41, 0x40, 0x48, 0x4e, 0x49, 0x4e,
-    0x44, 0x53, 0x44, 0x4e, 0x53, 0x49, 0x49, 0x4e, 0x46, 0x3f, 0x45, 0x42,
-    0x4c, 0x47, 0x42, 0x4e, 0x49, 0x4a, 0x49, 0x44, 0x51, 0x48, 0x57, 0x4c,
-    0x4d, 0x60, 0x4e, 0x2d, 0x46, 0x4d, 0x58, 0x53, 0x5c, 0x56, 0x5e, 0x41,
-    0x3e, 0x66, 0x53, 0x5b, 0x49, 0x59, 0x5a, 0x55, 0x4e, 0x59, 0x46, 0x4a,
-    0x44, 0x42, 0x45, 0x3d, 0x4d, 0x45, 0x44, 0x4f, 0x4d, 0x53, 0x42, 0x5a,
-    0x43, 0x3c, 0x48, 0x4f, 0x44, 0x59, 0x3f, 0x33, 0x45, 0x48, 0x43, 0x45,
-    0x4d, 0x56, 0x48, 0x44, 0x3e, 0x48, 0x46, 0x4d, 0x44, 0x53, 0x46, 0x4e,
-    0x45, 0x52, 0x40, 0x46, 0x4c, 0x50, 0x4e, 0x4b, 0x4d, 0x46, 0x48, 0x46,
-    0x50, 0x52, 0x4e, 0x57, 0x3f, 0x4a, 0x49, 0x50, 0x53, 0x4e, 0x41, 0x66,
-    0x49, 0x4f, 0x40, 0x4b, 0x50, 0x4c, 0x4a, 0x70, 0x42, 0x51, 0x41, 0x4c,
-    0x50, 0x4f, 0x46, 0x60, 0x45, 0x47, 0x54, 0x4c, 0x49, 0x59, 0x52, 0x61,
-    0x4a, 0x53, 0x52, 0x4f, 0x4b, 0x4c, 0x46, 0x56, 0x4b, 0x54, 0x4f, 0x47,
-    0x53, 0x49, 0x4f, 0x50, 0x4a, 0x54, 0x45, 0x4e, 0x47, 0x48, 0x47, 0x42,
-    0x49, 0x44, 0x46, 0x46, 0x55, 0x4c, 0x4f, 0x36, 0x4c, 0x49, 0x3f, 0x4e,
-    0x45, 0x4b, 0x4b, 0x36, 0x48, 0x4f, 0x4b, 0x50, 0x45, 0x47, 0x49, 0x3f,
-    0x50, 0x4b, 0x52, 0x48, 0x4c, 0x41, 0x49, 0x43, 0x4e, 0x3c, 0x43, 0x45,
-    0x3e, 0x45, 0x48, 0x44, 0x4d, 0x48, 0x56, 0x47, 0x4b, 0x54, 0x52, 0x2b,
-    0x4d, 0x4e, 0x57, 0x4f, 0x57, 0x4f, 0x56, 0x43, 0x48, 0x5f, 0x4c, 0x51,
-    0x4d, 0x58, 0x4f, 0x4e, 0x50, 0x50, 0x48, 0x4a, 0x4d, 0x3f, 0x47, 0x40,
-    0x4b, 0x4a, 0x4e, 0x4b, 0x4a, 0x58, 0x42, 0x49, 0x3f, 0x42, 0x3d, 0x4d,
-    0x46, 0x53, 0x45, 0x3e, 0x4e, 0x49, 0x4f, 0x4a, 0x47, 0x46, 0x40, 0x3e,
-    0x4c, 0x4d, 0x4d, 0x45, 0x4a, 0x56, 0x40, 0x4a, 0x47, 0x57, 0x4f, 0x48,
-    0x4f, 0x48, 0x47, 0x49, 0x4e, 0x52, 0x50, 0x48, 0x42, 0x52, 0x43, 0x5a,
-    0x49, 0x42, 0x4f, 0x4f, 0x51, 0x51, 0x50, 0x5c, 0x4b, 0x43, 0x4b, 0x48,
-    0x50, 0x51, 0x4b, 0x6d, 0x53, 0x4e, 0x44, 0x4c, 0x4c, 0x51, 0x46, 0x5b,
-    0x44, 0x48, 0x4d, 0x4c, 0x46, 0x4f, 0x54, 0x54, 0x4e, 0x54, 0x42, 0x4e,
-    0x4c, 0x49, 0x49, 0x58, 0x49, 0x53, 0x53, 0x4a, 0x4e, 0x4b, 0x47, 0x53,
-    0x43, 0x55, 0x46, 0x51, 0x3d, 0x3d, 0x4c, 0x47, 0x4e, 0x51, 0x47, 0x48,
-    0x4b, 0x4c, 0x42, 0x3b, 0x43, 0x4f, 0x44, 0x4d, 0x54, 0x4b, 0x4a, 0x47,
-    0x4c, 0x42, 0x4b, 0x43, 0x41, 0x4e, 0x4d, 0x50, 0x45, 0x46, 0x41, 0x4a,
-    0x49, 0x49, 0x54, 0x47, 0x4c, 0x4b, 0x50, 0x4e, 0x3f, 0x43, 0x40, 0x41,
-    0x44, 0x54, 0x51, 0x47, 0x4c, 0x4b, 0x4f, 0x34, 0x4d, 0x4c, 0x4f, 0x49,
-    0x56, 0x4e, 0x4b, 0x3e, 0x48, 0x53, 0x4e, 0x56, 0x49, 0x4e, 0x4c, 0x40,
-    0x55, 0x4a, 0x46, 0x4f, 0x48, 0x4a, 0x55, 0x41, 0x55, 0x3d, 0x47, 0x51,
-    0x50, 0x51, 0x45, 0x51, 0x4b, 0x4e, 0x4a, 0x4f, 0x4b, 0x45, 0x42, 0x3c,
-    0x4e, 0x46, 0x47, 0x49, 0x4a, 0x4c, 0x48, 0x41, 0x4f, 0x4a, 0x44, 0x45,
-    0x4e, 0x4e, 0x43, 0x41, 0x4c, 0x47, 0x48, 0x49, 0x4c, 0x48, 0x4f, 0x4a,
-    0x4f, 0x4a, 0x4b, 0x45, 0x42, 0x40, 0x52, 0x55, 0x4f, 0x49, 0x44, 0x54,
-    0x49, 0x48, 0x51, 0x4d, 0x44, 0x4a, 0x4d, 0x49, 0x4e, 0x4e, 0x51, 0x5d,
-    0x42, 0x4d, 0x49, 0x3f, 0x48, 0x58, 0x40, 0x5e, 0x48, 0x4f, 0x49, 0x53,
-    0x45, 0x47, 0x4f, 0x53, 0x4d, 0x4f, 0x4d, 0x4d, 0x46, 0x55, 0x43, 0x51,
-    0x4f, 0x51, 0x4a, 0x4e, 0x49, 0x42, 0x49, 0x50, 0x47, 0x4d, 0x42, 0x47,
-    0x46, 0x50, 0x55, 0x47, 0x4d, 0x47, 0x3e, 0x51, 0x4d, 0x43, 0x44, 0x39,
-    0x4e, 0x4b, 0x41, 0x48, 0x52, 0x53, 0x4d, 0x39, 0x4d, 0x51, 0x4c, 0x46,
-    0x4e, 0x47, 0x49, 0x41, 0x45, 0x4a, 0x4a, 0x45, 0x50, 0x4a, 0x40, 0x48,
-    0x43, 0x47, 0x44, 0x50, 0x4d, 0x47, 0x4a, 0x47, 0x45, 0x57, 0x41, 0x34,
-    0x51, 0x40, 0x45, 0x44, 0x3c, 0x47, 0x46, 0x47, 0x44, 0x48, 0x42, 0x40,
-    0x37, 0x53, 0x4a, 0x43, 0x49, 0x4b, 0x43, 0x44, 0x4f, 0x4f, 0x48, 0x48,
-    0x53, 0x49, 0x4b, 0x48, 0x4e, 0x4c, 0x42, 0x45, 0x4c, 0x4a, 0x4a, 0x46,
-    0x47, 0x57, 0x3e, 0x46, 0x46, 0x45, 0x4a, 0x43, 0x46, 0x49, 0x43, 0x52,
-    0x3e, 0x48, 0x4a, 0x4b, 0x47, 0x47, 0x48, 0x4a, 0x4b, 0x4b, 0x4e, 0x44,
-    0x42, 0x44, 0x50, 0x41, 0x49, 0x49, 0x4d, 0x4b, 0x44, 0x46, 0x4a, 0x52,
-    0x4d, 0x47, 0x49, 0x4b, 0x4d, 0x49, 0x41, 0x48, 0x4b, 0x3f, 0x45, 0x4f,
-    0x51, 0x41, 0x55, 0x42, 0x49, 0x4b, 0x4b, 0x51, 0x4f, 0x4f, 0x42, 0x4e,
-    0x4e, 0x4a, 0x52, 0x41, 0x4f, 0x42, 0x48, 0x3d, 0x4a, 0x44, 0x50, 0x4b,
-    0x49, 0x45, 0x51, 0x46, 0x51, 0x44, 0x4d, 0x47, 0x4a, 0x4a, 0x4d, 0x49,
-    0x4d, 0x48, 0x4d, 0x4f, 0x4d, 0x44, 0x48, 0x4e, 0x4a, 0x4b, 0x40, 0x4f,
-    0x47, 0x3a, 0x41, 0x47, 0x4a, 0x4a, 0x4a, 0x48, 0x42, 0x41, 0x4d, 0x56,
-    0x3f, 0x52, 0x4d, 0x4c, 0x44, 0x48, 0x47, 0x4e, 0x51, 0x4c, 0x49, 0x47,
-    0x44, 0x4c, 0x4b, 0x47, 0x48, 0x46, 0x47, 0x4f, 0x43, 0x41, 0x3e, 0x47,
-    0x53, 0x4a, 0x46, 0x42, 0x46, 0x61, 0x43, 0x30, 0x4e, 0x52, 0x43, 0x45,
-    0x32, 0x4a, 0x45, 0x48, 0x51, 0x3e, 0x44, 0x3b, 0x3a, 0x63, 0x4c, 0x46,
-    0x4c, 0x49, 0x3d, 0x41, 0x52, 0x53, 0x43, 0x43, 0x45, 0x3d, 0x48, 0x40,
-    0x4b, 0x4a, 0x49, 0x48, 0x4d, 0x49, 0x4b, 0x4c, 0x3f, 0x4e, 0x4b, 0x47,
-    0x45, 0x4d, 0x3f, 0x4d, 0x43, 0x50, 0x48, 0x4b, 0x54, 0x3e, 0x44, 0x4e,
-    0x3e, 0x4c, 0x43, 0x4b, 0x4c, 0x4b, 0x3e, 0x49, 0x50, 0x52, 0x4a, 0x4a,
-    0x50, 0x50, 0x43, 0x4e, 0x49, 0x48, 0x51, 0x50, 0x47, 0x3d, 0x45, 0x4b,
-    0x47, 0x46, 0x4d, 0x4c, 0x45, 0x4d, 0x4a, 0x4d, 0x42, 0x4d, 0x47, 0x4f,
-    0x40, 0x43, 0x46, 0x51, 0x47, 0x4b, 0x43, 0x49, 0x49, 0x50, 0x4b, 0x4b,
-    0x46, 0x4a, 0x4c, 0x48, 0x49, 0x47, 0x4b, 0x56, 0x55, 0x4f, 0x49, 0x4f,
-    0x4f, 0x4e, 0x4b, 0x49, 0x4a, 0x4a, 0x49, 0x47, 0x44, 0x4b, 0x47, 0x50,
-    0x46, 0x4c, 0x46, 0x4c, 0x4b, 0x4e, 0x49, 0x57, 0x4d, 0x3e, 0x46, 0x47,
-    0x50, 0x45, 0x4f, 0x52, 0x3e, 0x4d, 0x49, 0x4a, 0x40, 0x49, 0x4f, 0x5c,
-    0x3e, 0x4a, 0x47, 0x45, 0x47, 0x41, 0x44, 0x3f, 0x4b, 0x4a, 0x52, 0x43,
-    0x41, 0x43, 0x43, 0x47, 0x55, 0x49, 0x42, 0x4c, 0x58, 0x4b, 0x42, 0x48,
-    0x4b, 0x5a, 0x36, 0x33, 0x53, 0x57, 0x4d, 0x4a, 0x37, 0x4c, 0x3e, 0x48,
-    0x43, 0x46, 0x39, 0x3c, 0x34, 0x65, 0x47, 0x3d, 0x47, 0x42, 0x3c, 0x3e,
-    0x45, 0x5b, 0x44, 0x3e, 0x45, 0x43, 0x46, 0x43, 0x59, 0x4e, 0x48, 0x46,
-    0x43, 0x3f, 0x46, 0x47, 0x4e, 0x53, 0x50, 0x4b, 0x4a, 0x3f, 0x4a, 0x54,
-    0x4c, 0x4a, 0x43, 0x50, 0x4c, 0x42, 0x4d, 0x55, 0x4d, 0x51, 0x51, 0x46,
-    0x49, 0x41, 0x50, 0x44, 0x4a, 0x4b, 0x4b, 0x43, 0x4b, 0x4e, 0x47, 0x4b,
-    0x3e, 0x4e, 0x44, 0x4d, 0x49, 0x41, 0x49, 0x44, 0x50, 0x4d, 0x45, 0x4e,
-    0x4b, 0x50, 0x45, 0x4c, 0x46, 0x4a, 0x46, 0x42, 0x50, 0x45, 0x48, 0x53,
-    0x4d, 0x44, 0x42, 0x50, 0x4c, 0x49, 0x45, 0x55, 0x4d, 0x42, 0x43, 0x41,
-    0x4c, 0x41, 0x4e, 0x4d, 0x42, 0x4e, 0x3f, 0x44, 0x4d, 0x4c, 0x4b, 0x4a,
-    0x47, 0x47, 0x4e, 0x54, 0x43, 0x40, 0x41, 0x55, 0x49, 0x49, 0x4e, 0x49,
-    0x52, 0x4e, 0x46, 0x58, 0x4b, 0x3d, 0x4a, 0x44, 0x4e, 0x47, 0x53, 0x58,
-    0x47, 0x42, 0x52, 0x46, 0x49, 0x4b, 0x47, 0x5a, 0x4c, 0x46, 0x46, 0x49,
-    0x4b, 0x4d, 0x3d, 0x48, 0x40, 0x54, 0x48, 0x4c, 0x4c, 0x44, 0x4c, 0x46,
-    0x47, 0x4b, 0x4d, 0x44, 0x5a, 0x4a, 0x3e, 0x46, 0x48, 0x53, 0x39, 0x30,
-    0x51, 0x60, 0x4d, 0x47, 0x35, 0x4f, 0x45, 0x45, 0x4a, 0x4b, 0x42, 0x3f,
-    0x38, 0x6c, 0x3d, 0x40, 0x44, 0x48, 0x3a, 0x3b, 0x46, 0x5e, 0x45, 0x3b,
-    0x47, 0x47, 0x45, 0x42, 0x53, 0x55, 0x44, 0x45, 0x46, 0x43, 0x48, 0x48,
-    0x52, 0x5d, 0x3e, 0x41, 0x53, 0x42, 0x48, 0x55, 0x49, 0x4d, 0x4a, 0x46,
-    0x52, 0x46, 0x51, 0x48, 0x44, 0x46, 0x48, 0x41, 0x49, 0x49, 0x49, 0x49,
-    0x41, 0x4d, 0x40, 0x4f, 0x45, 0x46, 0x45, 0x3f, 0x53, 0x40, 0x46, 0x43,
-    0x47, 0x4d, 0x50, 0x4c, 0x55, 0x48, 0x45, 0x47, 0x4f, 0x46, 0x42, 0x4d,
-    0x41, 0x48, 0x46, 0x4e, 0x42, 0x48, 0x48, 0x45, 0x41, 0x45, 0x48, 0x4a,
-    0x40, 0x49, 0x43, 0x4b, 0x48, 0x4a, 0x4c, 0x45, 0x4b, 0x48, 0x48, 0x4f,
-    0x40, 0x4b, 0x4a, 0x44, 0x50, 0x4a, 0x43, 0x50, 0x4c, 0x44, 0x46, 0x4c,
-    0x42, 0x44, 0x4e, 0x55, 0x47, 0x49, 0x48, 0x47, 0x52, 0x4e, 0x44, 0x59,
-    0x4e, 0x44, 0x4a, 0x48, 0x49, 0x4a, 0x42, 0x4e, 0x3e, 0x39, 0x51, 0x45,
-    0x4d, 0x49, 0x4f, 0x54, 0x51, 0x4b, 0x50, 0x44, 0x53, 0x4f, 0x4d, 0x48,
-    0x42, 0x45, 0x4e, 0x40, 0x4a, 0x48, 0x43, 0x48, 0x52, 0x54, 0x4d, 0x49,
-    0x5f, 0x53, 0x46, 0x4e, 0x3f, 0x5a, 0x36, 0x31, 0x52, 0x60, 0x4b, 0x4a,
-    0x32, 0x51, 0x40, 0x44, 0x46, 0x52, 0x44, 0x41, 0x3a, 0x6e, 0x41, 0x3e,
-    0x47, 0x3e, 0x3a, 0x2a, 0x44, 0x5a, 0x40, 0x3c, 0x4d, 0x48, 0x46, 0x3b,
-    0x5e, 0x58, 0x4d, 0x47, 0x51, 0x3a, 0x4b, 0x48, 0x5b, 0x5a, 0x54, 0x43,
-    0x50, 0x4c, 0x54, 0x54, 0x49, 0x47, 0x4f, 0x48, 0x50, 0x40, 0x4f, 0x4a,
-    0x42, 0x42, 0x3c, 0x41, 0x43, 0x4e, 0x53, 0x49, 0x4b, 0x4d, 0x49, 0x41,
-    0x4c, 0x3e, 0x40, 0x49, 0x40, 0x44, 0x49, 0x4f, 0x50, 0x4a, 0x42, 0x3a,
-    0x49, 0x4b, 0x47, 0x50, 0x49, 0x41, 0x52, 0x46, 0x3d, 0x44, 0x46, 0x43,
-    0x4b, 0x4b, 0x4d, 0x4b, 0x4e, 0x40, 0x45, 0x43, 0x48, 0x44, 0x55, 0x51,
-    0x4a, 0x46, 0x4e, 0x40, 0x53, 0x4a, 0x45, 0x41, 0x48, 0x48, 0x45, 0x4e,
-    0x4a, 0x48, 0x40, 0x4c, 0x54, 0x44, 0x42, 0x4d, 0x49, 0x43, 0x45, 0x4c,
-    0x43, 0x4f, 0x46, 0x3f, 0x46, 0x4f, 0x4b, 0x59, 0x46, 0x49, 0x54, 0x47,
-    0x49, 0x46, 0x45, 0x53, 0x4a, 0x49, 0x54, 0x45, 0x41, 0x45, 0x4c, 0x5e,
-    0x50, 0x3d, 0x4d, 0x49, 0x55, 0x4b, 0x49, 0x47, 0x4c, 0x4f, 0x43, 0x3d,
-    0x41, 0x4b, 0x43, 0x46, 0x4f, 0x4a, 0x4c, 0x54, 0x5e, 0x4e, 0x40, 0x4d,
-    0x3d, 0x59, 0x40, 0x28, 0x54, 0x5f, 0x4d, 0x4b, 0x36, 0x51, 0x3a, 0x47,
-    0x4a, 0x55, 0x42, 0x43, 0x3b, 0x72, 0x3b, 0x3d, 0x51, 0x42, 0x3f, 0x2d,
-    0x4b, 0x5a, 0x48, 0x44, 0x49, 0x49, 0x3d, 0x39, 0x56, 0x55, 0x46, 0x46,
-    0x4b, 0x43, 0x40, 0x4a, 0x52, 0x56, 0x4d, 0x45, 0x4b, 0x48, 0x40, 0x5a,
-    0x4e, 0x3a, 0x53, 0x48, 0x4c, 0x44, 0x49, 0x4e, 0x42, 0x47, 0x46, 0x40,
-    0x51, 0x42, 0x50, 0x4b, 0x43, 0x53, 0x44, 0x44, 0x46, 0x4c, 0x4c, 0x3c,
-    0x42, 0x45, 0x42, 0x45, 0x44, 0x4b, 0x52, 0x3d, 0x47, 0x4b, 0x4c, 0x4e,
-    0x52, 0x4a, 0x4e, 0x41, 0x3f, 0x46, 0x43, 0x54, 0x44, 0x53, 0x4e, 0x48,
-    0x40, 0x41, 0x4f, 0x45, 0x43, 0x3c, 0x52, 0x49, 0x40, 0x44, 0x4a, 0x3f,
-    0x4d, 0x4c, 0x4f, 0x47, 0x44, 0x47, 0x55, 0x47, 0x50, 0x4d, 0x4a, 0x4c,
-    0x50, 0x48, 0x47, 0x55, 0x4b, 0x4a, 0x52, 0x49, 0x3d, 0x3f, 0x4f, 0x51,
-    0x48, 0x4e, 0x42, 0x4e, 0x42, 0x48, 0x4e, 0x49, 0x4a, 0x50, 0x45, 0x54,
-    0x41, 0x43, 0x45, 0x4d, 0x48, 0x48, 0x48, 0x51, 0x53, 0x3e, 0x55, 0x44,
-    0x52, 0x56, 0x44, 0x4d, 0x4e, 0x48, 0x4b, 0x43, 0x48, 0x53, 0x48, 0x44,
-    0x49, 0x45, 0x4e, 0x50, 0x5d, 0x4a, 0x45, 0x4c, 0x45, 0x55, 0x43, 0x2e,
-    0x59, 0x60, 0x4e, 0x4d, 0x32, 0x53, 0x3e, 0x3f, 0x40, 0x63, 0x41, 0x48,
-    0x38, 0x73, 0x38, 0x46, 0x50, 0x3e, 0x3c, 0x23, 0x48, 0x61, 0x45, 0x3c,
-    0x41, 0x41, 0x36, 0x3b, 0x58, 0x56, 0x4a, 0x40, 0x4f, 0x44, 0x45, 0x4c,
-    0x5a, 0x56, 0x47, 0x3f, 0x4d, 0x4b, 0x46, 0x5d, 0x52, 0x47, 0x45, 0x4c,
-    0x4a, 0x52, 0x4f, 0x4f, 0x4f, 0x43, 0x4f, 0x47, 0x43, 0x46, 0x3c, 0x4c,
-    0x46, 0x55, 0x40, 0x53, 0x43, 0x3e, 0x42, 0x35, 0x51, 0x41, 0x42, 0x3f,
-    0x45, 0x3d, 0x41, 0x31, 0x4e, 0x47, 0x48, 0x42, 0x41, 0x45, 0x43, 0x38,
-    0x42, 0x40, 0x4a, 0x47, 0x4e, 0x43, 0x40, 0x43, 0x48, 0x49, 0x45, 0x4f,
-    0x44, 0x42, 0x4d, 0x42, 0x42, 0x3f, 0x46, 0x52, 0x3c, 0x3c, 0x47, 0x43,
-    0x46, 0x47, 0x45, 0x40, 0x4c, 0x44, 0x43, 0x4a, 0x4b, 0x4d, 0x4e, 0x46,
-    0x51, 0x45, 0x47, 0x4b, 0x45, 0x50, 0x40, 0x42, 0x4c, 0x4c, 0x4c, 0x4f,
-    0x44, 0x3c, 0x49, 0x3c, 0x3f, 0x45, 0x3f, 0x5c, 0x42, 0x3e, 0x4b, 0x4e,
-    0x50, 0x45, 0x42, 0x5c, 0x4c, 0x48, 0x50, 0x52, 0x50, 0x47, 0x4b, 0x44,
-    0x3d, 0x50, 0x55, 0x4c, 0x48, 0x3f, 0x4b, 0x44, 0x4a, 0x51, 0x42, 0x4c,
-    0x60, 0x51, 0x41, 0x4b, 0x46, 0x5c, 0x42, 0x2c, 0x55, 0x61, 0x50, 0x52,
-    0x37, 0x5a, 0x3f, 0x43, 0x43, 0x58, 0x3a, 0x4d, 0x3e, 0x72, 0x35, 0x3f,
-    0x58, 0x41, 0x40, 0x1f, 0x55, 0x63, 0x3f, 0x49, 0x41, 0x3e, 0x35, 0x41,
-    0x65, 0x54, 0x42, 0x45, 0x45, 0x3c, 0x44, 0x45, 0x59, 0x5a, 0x4d, 0x41,
-    0x51, 0x46, 0x49, 0x59, 0x4c, 0x41, 0x42, 0x44, 0x4a, 0x45, 0x3f, 0x4a,
-    0x4a, 0x44, 0x48, 0x48, 0x52, 0x40, 0x4a, 0x4a, 0x4d, 0x54, 0x44, 0x48,
-    0x54, 0x46, 0x49, 0x3b, 0x42, 0x4a, 0x4e, 0x46, 0x4a, 0x45, 0x4f, 0x30,
-    0x46, 0x41, 0x47, 0x46, 0x4b, 0x47, 0x46, 0x38, 0x4c, 0x3a, 0x4b, 0x46,
-    0x52, 0x48, 0x4f, 0x3e, 0x48, 0x4a, 0x48, 0x4b, 0x44, 0x45, 0x4a, 0x46,
-    0x3f, 0x4f, 0x40, 0x44, 0x43, 0x43, 0x4b, 0x39, 0x46, 0x43, 0x49, 0x49,
-    0x49, 0x4a, 0x44, 0x48, 0x4c, 0x41, 0x4d, 0x52, 0x4c, 0x4a, 0x46, 0x3d,
-    0x41, 0x4b, 0x41, 0x48, 0x45, 0x3b, 0x51, 0x54, 0x4a, 0x39, 0x4d, 0x41,
-    0x54, 0x46, 0x4c, 0x53, 0x48, 0x3e, 0x4a, 0x3d, 0x41, 0x52, 0x54, 0x63,
-    0x44, 0x4d, 0x4a, 0x43, 0x52, 0x4b, 0x52, 0x52, 0x4e, 0x41, 0x48, 0x42,
-    0x48, 0x4d, 0x49, 0x45, 0x51, 0x48, 0x3e, 0x47, 0x5a, 0x52, 0x4a, 0x4e,
-    0x3e, 0x59, 0x3c, 0x2e, 0x5c, 0x5b, 0x4c, 0x56, 0x30, 0x59, 0x3a, 0x48,
-    0x3d, 0x5c, 0x44, 0x49, 0x40, 0x7c, 0x3a, 0x48, 0x54, 0x40, 0x41, 0x28,
-    0x4d, 0x64, 0x46, 0x47, 0x49, 0x40, 0x30, 0x3a, 0x5f, 0x5b, 0x42, 0x37,
-    0x49, 0x45, 0x40, 0x43, 0x5b, 0x54, 0x48, 0x4d, 0x4a, 0x47, 0x51, 0x58,
-    0x4b, 0x3c, 0x4d, 0x46, 0x4b, 0x52, 0x4c, 0x58, 0x53, 0x46, 0x42, 0x45,
-    0x4c, 0x4a, 0x4d, 0x4e, 0x52, 0x4d, 0x46, 0x44, 0x46, 0x3f, 0x46, 0x34,
-    0x4f, 0x42, 0x44, 0x46, 0x44, 0x50, 0x47, 0x30, 0x44, 0x3c, 0x42, 0x46,
-    0x4f, 0x4a, 0x52, 0x30, 0x55, 0x4f, 0x45, 0x4a, 0x48, 0x4c, 0x4e, 0x35,
-    0x4e, 0x3c, 0x45, 0x4a, 0x45, 0x4a, 0x44, 0x3c, 0x4e, 0x4a, 0x51, 0x44,
-    0x49, 0x40, 0x4a, 0x40, 0x41, 0x44, 0x4f, 0x4c, 0x43, 0x45, 0x4b, 0x43,
-    0x3e, 0x3e, 0x4c, 0x44, 0x48, 0x48, 0x42, 0x42, 0x4d, 0x43, 0x50, 0x4d,
-    0x49, 0x3c, 0x45, 0x4f, 0x4c, 0x46, 0x4b, 0x48, 0x4d, 0x4d, 0x49, 0x55,
-    0x49, 0x3b, 0x40, 0x44, 0x4a, 0x4b, 0x4e, 0x5e, 0x43, 0x47, 0x45, 0x43,
-    0x4d, 0x4d, 0x49, 0x46, 0x4a, 0x44, 0x4e, 0x3e, 0x52, 0x41, 0x47, 0x47,
-    0x4a, 0x50, 0x48, 0x43, 0x5d, 0x4f, 0x49, 0x48, 0x43, 0x4f, 0x45, 0x3e,
-    0x5a, 0x69, 0x4d, 0x5a, 0x3a, 0x5d, 0x3a, 0x48, 0x42, 0x55, 0x3e, 0x48,
-    0x48, 0x7b, 0x37, 0x40, 0x57, 0x45, 0x48, 0x24, 0x50, 0x61, 0x4c, 0x4a,
-    0x44, 0x41, 0x34, 0x38, 0x65, 0x5b, 0x4f, 0x3c, 0x4d, 0x3a, 0x4a, 0x4c,
-    0x66, 0x55, 0x50, 0x47, 0x4d, 0x46, 0x47, 0x58, 0x4c, 0x48, 0x48, 0x48,
-    0x4e, 0x59, 0x4f, 0x4b, 0x45, 0x45, 0x4b, 0x54, 0x46, 0x51, 0x4f, 0x44,
-    0x42, 0x55, 0x48, 0x44, 0x48, 0x41, 0x53, 0x2e, 0x4d, 0x45, 0x44, 0x54,
-    0x4a, 0x44, 0x53, 0x34, 0x4c, 0x46, 0x47, 0x3f, 0x4c, 0x4b, 0x47, 0x36,
-    0x47, 0x41, 0x43, 0x40, 0x51, 0x46, 0x45, 0x33, 0x46, 0x3e, 0x47, 0x50,
-    0x3f, 0x48, 0x48, 0x37, 0x41, 0x41, 0x42, 0x3e, 0x45, 0x3d, 0x49, 0x3e,
-    0x4f, 0x42, 0x49, 0x4a, 0x46, 0x46, 0x48, 0x44, 0x49, 0x45, 0x46, 0x4a,
-    0x4a, 0x47, 0x48, 0x43, 0x44, 0x45, 0x3f, 0x4c, 0x4c, 0x49, 0x4d, 0x51,
-    0x4a, 0x4a, 0x49, 0x4c, 0x42, 0x4d, 0x4b, 0x4b, 0x4a, 0x42, 0x47, 0x4d,
-    0x3e, 0x4b, 0x47, 0x5c, 0x49, 0x3d, 0x4e, 0x41, 0x44, 0x49, 0x3e, 0x3e,
-    0x4b, 0x47, 0x4e, 0x45, 0x44, 0x4a, 0x4d, 0x4a, 0x4f, 0x46, 0x45, 0x52,
-    0x60, 0x53, 0x49, 0x50, 0x3d, 0x4f, 0x43, 0x3d, 0x52, 0x64, 0x52, 0x58,
-    0x39, 0x5f, 0x36, 0x4c, 0x45, 0x57, 0x42, 0x4b, 0x3f, 0x80, 0x34, 0x47,
-    0x58, 0x41, 0x45, 0x1b, 0x4b, 0x5e, 0x4c, 0x40, 0x44, 0x42, 0x39, 0x3a,
-    0x5e, 0x5b, 0x4b, 0x3a, 0x4b, 0x3f, 0x45, 0x3e, 0x69, 0x57, 0x4b, 0x45,
-    0x4b, 0x3f, 0x45, 0x55, 0x49, 0x49, 0x48, 0x47, 0x41, 0x4f, 0x42, 0x53,
-    0x49, 0x40, 0x42, 0x3e, 0x49, 0x47, 0x53, 0x47, 0x45, 0x51, 0x4a, 0x44,
-    0x44, 0x45, 0x4e, 0x2a, 0x45, 0x42, 0x4a, 0x4b, 0x46, 0x4d, 0x41, 0x30,
-    0x3d, 0x43, 0x3f, 0x48, 0x49, 0x44, 0x4d, 0x2e, 0x48, 0x4a, 0x4c, 0x51,
-    0x50, 0x46, 0x3e, 0x2c, 0x4d, 0x3f, 0x47, 0x46, 0x3c, 0x40, 0x4c, 0x38,
-    0x4f, 0x46, 0x47, 0x53, 0x3b, 0x3c, 0x4e, 0x3e, 0x49, 0x40, 0x43, 0x4c,
-    0x4d, 0x48, 0x45, 0x3c, 0x4d, 0x4c, 0x4d, 0x45, 0x3f, 0x49, 0x4a, 0x43,
-    0x4d, 0x41, 0x4b, 0x50, 0x4e, 0x46, 0x50, 0x44, 0x49, 0x44, 0x4e, 0x42,
-    0x4a, 0x43, 0x4c, 0x4c, 0x49, 0x49, 0x44, 0x4e, 0x4b, 0x3f, 0x4b, 0x5d,
-    0x41, 0x49, 0x4b, 0x46, 0x4e, 0x48, 0x45, 0x51, 0x4d, 0x45, 0x46, 0x45,
-    0x4b, 0x4e, 0x3c, 0x4d, 0x3d, 0x41, 0x47, 0x47, 0x64, 0x54, 0x41, 0x55,
-    0x47, 0x56, 0x44, 0x3b, 0x53, 0x66, 0x4f, 0x5e, 0x40, 0x5d, 0x38, 0x4a,
-    0x41, 0x59, 0x42, 0x48, 0x47, 0xff, 0x36, 0x49, 0x59, 0x41, 0x43, 0x1d,
-    0x4d, 0x5e, 0x44, 0x44, 0x50, 0x3f, 0x39, 0x40, 0x68, 0x5e, 0x4a, 0x41,
-    0x52, 0x41, 0x43, 0x41, 0x68, 0x51, 0x45, 0x48, 0x4c, 0x46, 0x4a, 0x5e,
-    0x4e, 0x40, 0x4d, 0x41, 0x41, 0x5c, 0x3f, 0x4e, 0x4c, 0x37, 0x48, 0x40,
-    0x46, 0x47, 0x4f, 0x43, 0x53, 0x52, 0x3d, 0x44, 0x47, 0x44, 0x3d, 0x34,
-    0x44, 0x42, 0x4a, 0x43, 0x4d, 0x3f, 0x53, 0x2e, 0x42, 0x47, 0x43, 0x4d,
-    0x45, 0x45, 0x47, 0x31, 0x4d, 0x39, 0x41, 0x4a, 0x4a, 0x4d, 0x4b, 0x35,
-    0x47, 0x4e, 0x4c, 0x40, 0x4a, 0x44, 0x44, 0x36, 0x3e, 0x49, 0x3f, 0x45,
-    0x46, 0x43, 0x4e, 0x3c, 0x4d, 0x47, 0x4c, 0x48, 0x4a, 0x4b, 0x48, 0x39,
-    0x46, 0x50, 0x4a, 0x4f, 0x46, 0x41, 0x44, 0x4a, 0x41, 0x4f, 0x4c, 0x4e,
-    0x55, 0x46, 0x43, 0x46, 0x4a, 0x48, 0x4e, 0x46, 0x42, 0x40, 0x4f, 0x56,
-    0x4c, 0x45, 0x4b, 0x46, 0x4a, 0x47, 0x42, 0x5e, 0x49, 0x4e, 0x46, 0x43,
-    0x4e, 0x42, 0x45, 0x48, 0x47, 0x48, 0x4f, 0x45, 0x47, 0x51, 0x4b, 0x4c,
-    0x51, 0x39, 0x4d, 0x48, 0x60, 0x57, 0x49, 0x52, 0x3d, 0x57, 0x46, 0x3d,
-    0x53, 0x68, 0x4b, 0x60, 0x40, 0x5a, 0x41, 0x4b, 0x46, 0x56, 0x46, 0x4c,
-    0x49, 0x7e, 0x2f, 0x48, 0x51, 0x42, 0x40, 0x20, 0x4b, 0x62, 0x4d, 0x41,
-    0x4f, 0x43, 0x3d, 0x35, 0x63, 0x63, 0x46, 0x3e, 0x4e, 0x47, 0x40, 0x40,
-    0x60, 0x52, 0x4c, 0x46, 0x49, 0x48, 0x4f, 0x56, 0x51, 0x47, 0x52, 0x4e,
-    0x4b, 0x59, 0x55, 0x4f, 0x48, 0x3d, 0x48, 0x4a, 0x4d, 0x50, 0x47, 0x47,
-    0x51, 0x52, 0x4d, 0x51, 0x45, 0x45, 0x47, 0x2d, 0x4d, 0x41, 0x43, 0x49,
-    0x4d, 0x40, 0x4a, 0x2f, 0x4f, 0x43, 0x46, 0x4a, 0x3e, 0x4a, 0x4a, 0x2b,
-    0x49, 0x4c, 0x4c, 0x3e, 0x41, 0x4c, 0x4a, 0x2b, 0x40, 0x44, 0x46, 0x4a,
-    0x40, 0x44, 0x42, 0x38, 0x52, 0x42, 0x46, 0x51, 0x53, 0x4e, 0x45, 0x31,
-    0x45, 0x47, 0x4f, 0x46, 0x49, 0x43, 0x45, 0x3b, 0x4b, 0x4b, 0x4b, 0x4c,
-    0x43, 0x4a, 0x4c, 0x43, 0x4e, 0x40, 0x52, 0x44, 0x48, 0x49, 0x47, 0x4b,
-    0x4e, 0x3d, 0x4e, 0x44, 0x48, 0x4d, 0x4f, 0x4f, 0x50, 0x36, 0x47, 0x41,
-    0x4a, 0x44, 0x45, 0x56, 0x4f, 0x4c, 0x50, 0x4b, 0x45, 0x3e, 0x45, 0x4e,
-    0x45, 0x45, 0x43, 0x40, 0x47, 0x4e, 0x45, 0x3e, 0x4a, 0x3f, 0x49, 0x50,
-    0x62, 0x55, 0x48, 0x56, 0x3e, 0x57, 0x4f, 0x3b, 0x55, 0x6c, 0x50, 0x5c,
-    0x3d, 0x54, 0x3d, 0x46, 0x43, 0x59, 0x3e, 0x51, 0x4d, 0x7b, 0x33, 0x47,
-    0x52, 0x43, 0x3f, 0x25, 0x4a, 0x6f, 0x49, 0x3e, 0x50, 0x40, 0x41, 0x30,
-    0x5e, 0x5c, 0x4a, 0x43, 0x4d, 0x42, 0x46, 0x3b, 0x63, 0x53, 0x4f, 0x43,
-    0x58, 0x48, 0x4b, 0x59, 0x50, 0x4e, 0x4b, 0x51, 0x4a, 0x55, 0x44, 0x46,
-    0x4c, 0x3d, 0x4c, 0x52, 0x44, 0x52, 0x4c, 0x41, 0x4f, 0x44, 0x4a, 0x47,
-    0x4e, 0x48, 0x49, 0x2e, 0x3e, 0x45, 0x4c, 0x48, 0x41, 0x47, 0x4d, 0x2e,
-    0x40, 0x4b, 0x4c, 0x42, 0x4d, 0x40, 0x4e, 0x2e, 0x43, 0x45, 0x4b, 0x43,
-    0x3e, 0x49, 0x55, 0x35, 0x43, 0x42, 0x42, 0x40, 0x4e, 0x46, 0x44, 0x37,
-    0x49, 0x41, 0x3f, 0x52, 0x47, 0x4b, 0x43, 0x33, 0x4b, 0x47, 0x4b, 0x4c,
-    0x4d, 0x4b, 0x3f, 0x42, 0x44, 0x40, 0x49, 0x41, 0x42, 0x49, 0x4b, 0x46,
-    0x4e, 0x4e, 0x47, 0x4e, 0x48, 0x48, 0x4b, 0x46, 0x51, 0x4b, 0x46, 0x4d,
-    0x47, 0x4f, 0x3e, 0x51, 0x46, 0x4e, 0x46, 0x4b, 0x47, 0x48, 0x4e, 0x55,
-    0x4c, 0x3d, 0x47, 0x51, 0x42, 0x45, 0x4f, 0x42, 0x52, 0x50, 0x44, 0x4c,
-    0x44, 0x44, 0x43, 0x4d, 0x40, 0x42, 0x4d, 0x4b, 0x5d, 0x4e, 0x47, 0x54,
-    0x47, 0x51, 0x43, 0x39, 0x58, 0x66, 0x4e, 0x5a, 0x41, 0x52, 0x36, 0x47,
-    0x45, 0x5f, 0x34, 0x50, 0x46, 0x79, 0x30, 0x48, 0x50, 0x45, 0x32, 0x22,
-    0x54, 0x64, 0x49, 0x46, 0x45, 0x3c, 0x42, 0x36, 0x65, 0x5c, 0x48, 0x3a,
-    0x4d, 0x4b, 0x47, 0x3e, 0x63, 0x56, 0x4a, 0x48, 0x51, 0x42, 0x4f, 0x5e,
-    0x4c, 0x44, 0x4b, 0x4c, 0x3d, 0x5a, 0x43, 0x4d, 0x42, 0x40, 0x4f, 0x4d,
-    0x3f, 0x3e, 0x46, 0x40, 0x49, 0x42, 0x49, 0x40, 0x49, 0x4c, 0x4a, 0x2e,
-    0x4b, 0x3f, 0x53, 0x4b, 0x48, 0x49, 0x3e, 0x34, 0x47, 0x4a, 0x4b, 0x46,
-    0x3b, 0x49, 0x46, 0x34, 0x4b, 0x48, 0x4c, 0x49, 0x49, 0x43, 0x4f, 0x2e,
-    0x44, 0x46, 0x48, 0x50, 0x46, 0x4e, 0x4a, 0x37, 0x4b, 0x4c, 0x4a, 0x50,
-    0x45, 0x4a, 0x48, 0x3b, 0x48, 0x44, 0x48, 0x4a, 0x41, 0x44, 0x52, 0x3f,
-    0x4c, 0x46, 0x4a, 0x45, 0x46, 0x49, 0x49, 0x36, 0x53, 0x3e, 0x48, 0x47,
-    0x3f, 0x42, 0x41, 0x4c, 0x42, 0x4a, 0x52, 0x46, 0x49, 0x3f, 0x48, 0x5a,
-    0x43, 0x42, 0x3d, 0x43, 0x4f, 0x44, 0x43, 0x65, 0x41, 0x41, 0x44, 0x4b,
-    0x50, 0x44, 0x53, 0x49, 0x41, 0x45, 0x4a, 0x4d, 0x40, 0x45, 0x4a, 0x4e,
-    0x50, 0x40, 0x51, 0x40, 0x5e, 0x50, 0x43, 0x5c, 0x47, 0x5a, 0x44, 0x4c,
-    0x54, 0x64, 0x4f, 0x63, 0x39, 0x58, 0x3c, 0x4a, 0x42, 0x5e, 0x3c, 0x4a,
-    0x48, 0x7b, 0x34, 0x4c, 0x4f, 0x44, 0x30, 0x24, 0x50, 0x65, 0x47, 0x39,
-    0x46, 0x3e, 0x3f, 0x33, 0x65, 0x5a, 0x44, 0x38, 0x50, 0x47, 0x4b, 0x3e,
-    0x5b, 0x53, 0x4a, 0x4d, 0x51, 0x40, 0x47, 0x59, 0x51, 0x42, 0x4f, 0x50,
-    0x45, 0x57, 0x46, 0x50, 0x3f, 0x3c, 0x4c, 0x4f, 0x46, 0x41, 0x4a, 0x3e,
-    0x4d, 0x45, 0x51, 0x48, 0x4e, 0x44, 0x4e, 0x35, 0x44, 0x3f, 0x44, 0x48,
-    0x3c, 0x4c, 0x49, 0x2c, 0x4a, 0x46, 0x48, 0x44, 0x4b, 0x42, 0x4b, 0x2f,
-    0x4e, 0x50, 0x4c, 0x4d, 0x44, 0x46, 0x3f, 0x39, 0x4d, 0x47, 0x45, 0x41,
-    0x42, 0x47, 0x4a, 0x3a, 0x40, 0x3e, 0x4a, 0x51, 0x3f, 0x47, 0x44, 0x37,
-    0x47, 0x4e, 0x47, 0x52, 0x45, 0x42, 0x4a, 0x3d, 0x43, 0x4d, 0x4d, 0x47,
-    0x48, 0x43, 0x44, 0x44, 0x47, 0x4e, 0x52, 0x4b, 0x4e, 0x50, 0x42, 0x47,
-    0x4b, 0x4b, 0x4e, 0x4c, 0x4e, 0x47, 0x50, 0x56, 0x46, 0x47, 0x4d, 0x49,
-    0x4d, 0x46, 0x49, 0x5f, 0x49, 0x42, 0x4d, 0x44, 0x40, 0x4b, 0x52, 0x45,
-    0x46, 0x4a, 0x4b, 0x49, 0x47, 0x4b, 0x42, 0x45, 0x42, 0x44, 0x46, 0x4c,
-    0x62, 0x4a, 0x44, 0x53, 0x43, 0x5a, 0x48, 0x49, 0x59, 0x68, 0x46, 0x61,
-    0x40, 0x5a, 0x3a, 0x4d, 0x45, 0x5e, 0x33, 0x4f, 0x4e, 0x74, 0x3e, 0x3e,
-    0x5a, 0x4b, 0x34, 0x31, 0x52, 0x6c, 0x44, 0x39, 0x4c, 0x3b, 0x39, 0x3a,
-    0x63, 0x65, 0x4b, 0x40, 0x50, 0x4d, 0x53, 0x4a, 0x69, 0x56, 0x54, 0x45,
-    0x4c, 0x4c, 0x50, 0x5b, 0x4d, 0x4f, 0x3d, 0x4b, 0x44, 0x47, 0x43, 0x47,
-    0x49, 0x3c, 0x49, 0x41, 0x41, 0x3f, 0x47, 0x43, 0x48, 0x47, 0x4c, 0x43,
-    0x4a, 0x40, 0x4d, 0x32, 0x4b, 0x4d, 0x44, 0x48, 0x46, 0x44, 0x50, 0x2f,
-    0x4e, 0x49, 0x53, 0x4b, 0x52, 0x47, 0x4b, 0x2b, 0x48, 0x4b, 0x4a, 0x4c,
-    0x4d, 0x4c, 0x43, 0x37, 0x48, 0x3c, 0x4b, 0x42, 0x51, 0x3f, 0x45, 0x3c,
-    0x49, 0x40, 0x42, 0x43, 0x4d, 0x4c, 0x3f, 0x3f, 0x4d, 0x43, 0x45, 0x42,
-    0x48, 0x42, 0x48, 0x39, 0x51, 0x4e, 0x46, 0x4f, 0x3e, 0x4c, 0x45, 0x3e,
-    0x3f, 0x3f, 0x43, 0x41, 0x4b, 0x4b, 0x43, 0x4d, 0x44, 0x3b, 0x48, 0x45,
-    0x3c, 0x4a, 0x48, 0x5b, 0x3c, 0x4b, 0x4c, 0x44, 0x46, 0x3e, 0x45, 0x57,
-    0x43, 0x42, 0x51, 0x4a, 0x46, 0x47, 0x43, 0x49, 0x42, 0x43, 0x50, 0x4e,
-    0x4e, 0x44, 0x41, 0x4e, 0x4e, 0x41, 0x48, 0x47, 0x5c, 0x53, 0x44, 0x54,
-    0x44, 0x5b, 0x45, 0x46, 0x55, 0x67, 0x4d, 0x5d, 0x40, 0x5a, 0x43, 0x4b,
-    0x43, 0x60, 0x3c, 0x4b, 0x41, 0x79, 0x41, 0x41, 0x58, 0x48, 0x40, 0x3b,
-    0x4f, 0x6c, 0x46, 0x3f, 0x53, 0x3a, 0x3d, 0x36, 0x5a, 0x57, 0x44, 0x41,
-    0x4c, 0x47, 0x4e, 0x48, 0x62, 0x60, 0x4a, 0x46, 0x51, 0x3e, 0x52, 0x5f,
-    0x4b, 0x46, 0x48, 0x4c, 0x4c, 0x55, 0x43, 0x46, 0x49, 0x3e, 0x41, 0x40,
-    0x4d, 0x47, 0x46, 0x3b, 0x51, 0x3a, 0x4a, 0x45, 0x50, 0x47, 0x51, 0x38,
-    0x44, 0x41, 0x40, 0x4b, 0x4d, 0x44, 0x4d, 0x28, 0x47, 0x3e, 0x44, 0x40,
-    0x49, 0x49, 0x40, 0x3c, 0x44, 0x4c, 0x48, 0x51, 0x46, 0x3e, 0x47, 0x2a,
-    0x41, 0x44, 0x49, 0x4c, 0x4e, 0x4e, 0x42, 0x3c, 0x49, 0x42, 0x43, 0x45,
-    0x4e, 0x4d, 0x50, 0x39, 0x42, 0x43, 0x48, 0x41, 0x3f, 0x40, 0x4e, 0x3a,
-    0x44, 0x3d, 0x49, 0x4d, 0x47, 0x45, 0x4b, 0x42, 0x4c, 0x4d, 0x3f, 0x3f,
-    0x4e, 0x4d, 0x4d, 0x4d, 0x4d, 0x45, 0x47, 0x43, 0x4c, 0x46, 0x47, 0x57,
-    0x4b, 0x42, 0x4d, 0x46, 0x4b, 0x4b, 0x43, 0x58, 0x48, 0x49, 0x4d, 0x47,
-    0x43, 0x49, 0x4b, 0x48, 0x46, 0x4f, 0x4f, 0x42, 0x4a, 0x43, 0x49, 0x4e,
-    0x4a, 0x47, 0x4c, 0x48, 0x5a, 0x57, 0x4a, 0x58, 0x49, 0x4f, 0x45, 0x47,
-    0x63, 0x66, 0x4d, 0x5e, 0x4b, 0x51, 0x45, 0x4a, 0x43, 0x5d, 0x33, 0x4b,
-    0x4e, 0x70, 0x42, 0x39, 0x57, 0x4a, 0x40, 0x3a, 0x51, 0x68, 0x45, 0x45,
-    0x4c, 0x44, 0x3a, 0x3a, 0x4f, 0x62, 0x49, 0x45, 0x53, 0x4c, 0x4e, 0x41,
-    0x63, 0x5e, 0x44, 0x44, 0x47, 0x43, 0x47, 0x59, 0x4c, 0x4b, 0x4c, 0x49,
-    0x3e, 0x43, 0x4c, 0x46, 0x4c, 0x38, 0x47, 0x46, 0x46, 0x47, 0x40, 0x44,
-    0x51, 0x3e, 0x40, 0x47, 0x3f, 0x45, 0x48, 0x2a, 0x42, 0x3e, 0x43, 0x46,
-    0x50, 0x4c, 0x4a, 0x2c, 0x49, 0x4b, 0x48, 0x48, 0x40, 0x4a, 0x4a, 0x37,
-    0x4e, 0x42, 0x4f, 0x4c, 0x41, 0x43, 0x45, 0x38, 0x4e, 0x3d, 0x41, 0x47,
-    0x42, 0x42, 0x43, 0x3b, 0x4a, 0x40, 0x48, 0x4a, 0x53, 0x44, 0x4d, 0x35,
-    0x51, 0x3c, 0x4e, 0x4e, 0x3e, 0x3f, 0x4b, 0x3c, 0x3e, 0x47, 0x41, 0x48,
-    0x40, 0x46, 0x4e, 0x44, 0x49, 0x42, 0x49, 0x44, 0x4b, 0x46, 0x46, 0x43,
-    0x4c, 0x4b, 0x49, 0x4d, 0x3d, 0x47, 0x43, 0x5c, 0x4a, 0x42, 0x47, 0x4e,
-    0x47, 0x40, 0x4c, 0x55, 0x3f, 0x45, 0x46, 0x49, 0x46, 0x48, 0x49, 0x4d,
-    0x4c, 0x41, 0x49, 0x40, 0x4a, 0x44, 0x42, 0x49, 0x52, 0x41, 0x49, 0x4a,
-    0x5c, 0x53, 0x47, 0x58, 0x49, 0x55, 0x4a, 0x4a, 0x62, 0x61, 0x4b, 0x57,
-    0x3c, 0x50, 0x42, 0x4c, 0x49, 0x5f, 0x3f, 0x4a, 0x42, 0x70, 0x40, 0x40,
-    0x4f, 0x46, 0x43, 0x43, 0x4d, 0x6c, 0x41, 0x3e, 0x4e, 0x49, 0x43, 0x38,
-    0x50, 0x57, 0x43, 0x39, 0x4a, 0x4f, 0x51, 0x3e, 0x5c, 0x57, 0x46, 0x49,
-    0x41, 0x40, 0x42, 0x4f, 0x4c, 0x45, 0x46, 0x4a, 0x4c, 0x4b, 0x43, 0x42,
-    0x4c, 0x3c, 0x47, 0x47, 0x4f, 0x44, 0x45, 0x3a, 0x4d, 0x3d, 0x4d, 0x3f,
-    0x46, 0x4f, 0x41, 0x37, 0x46, 0x45, 0x54, 0x47, 0x4e, 0x46, 0x47, 0x23,
-    0x48, 0x4e, 0x4a, 0x47, 0x45, 0x45, 0x4e, 0x33, 0x49, 0x4a, 0x4d, 0x4e,
-    0x49, 0x46, 0x49, 0x36, 0x48, 0x44, 0x53, 0x44, 0x4a, 0x45, 0x4a, 0x37,
-    0x45, 0x36, 0x4b, 0x4e, 0x50, 0x3f, 0x49, 0x38, 0x40, 0x43, 0x46, 0x4c,
-    0x43, 0x46, 0x4a, 0x3f, 0x45, 0x3d, 0x44, 0x47, 0x44, 0x42, 0x4a, 0x45,
-    0x47, 0x43, 0x4d, 0x4d, 0x44, 0x44, 0x4f, 0x4a, 0x4a, 0x41, 0x50, 0x50,
-    0x4b, 0x44, 0x54, 0x5c, 0x4b, 0x3a, 0x46, 0x4a, 0x4a, 0x43, 0x48, 0x5c,
-    0x4b, 0x43, 0x47, 0x3d, 0x3e, 0x54, 0x42, 0x47, 0x42, 0x4f, 0x4b, 0x4b,
-    0x46, 0x46, 0x46, 0x42, 0x42, 0x4b, 0x48, 0x45, 0x51, 0x4e, 0x49, 0x4d,
-    0x43, 0x56, 0x45, 0x40, 0x5a, 0x58, 0x4c, 0x55, 0x40, 0x4b, 0x4c, 0x51,
-    0x42, 0x59, 0x43, 0x46, 0x46, 0x69, 0x43, 0x3c, 0x54, 0x47, 0x3d, 0x41,
-    0x52, 0x64, 0x44, 0x38, 0x4f, 0x49, 0x3a, 0x3a, 0x55, 0x54, 0x45, 0x3e,
-    0x49, 0x44, 0x4e, 0x3f, 0x57, 0x50, 0x47, 0x43, 0x45, 0x48, 0x53, 0x5b,
-    0x53, 0x4d, 0x48, 0x4e, 0x48, 0x3a, 0x3e, 0x46, 0x42, 0x36, 0x50, 0x4d,
-    0x49, 0x4b, 0x4b, 0x45, 0x4c, 0x44, 0x50, 0x47, 0x3e, 0x49, 0x50, 0x37,
-    0x4c, 0x4b, 0x4a, 0x54, 0x4e, 0x43, 0x40, 0x25, 0x46, 0x42, 0x52, 0x3d,
-    0x44, 0x45, 0x51, 0x2e, 0x4a, 0x3d, 0x46, 0x46, 0x4c, 0x42, 0x48, 0x34,
-    0x44, 0x44, 0x44, 0x4c, 0x4f, 0x4b, 0x42, 0x3d, 0x45, 0x40, 0x47, 0x49,
-    0x43, 0x41, 0x3e, 0x39, 0x47, 0x4b, 0x50, 0x4a, 0x46, 0x47, 0x4e, 0x3b,
-    0x4e, 0x3e, 0x49, 0x4a, 0x50, 0x40, 0x43, 0x49, 0x48, 0x3c, 0x4f, 0x45,
-    0x4a, 0x41, 0x42, 0x48, 0x4b, 0x46, 0x4a, 0x50, 0x40, 0x49, 0x44, 0x54,
-    0x45, 0x45, 0x4a, 0x4b, 0x51, 0x51, 0x48, 0x53, 0x50, 0x3f, 0x50, 0x46,
-    0x44, 0x45, 0x51, 0x43, 0x4f, 0x3e, 0x41, 0x41, 0x46, 0x45, 0x45, 0x4c,
-    0x54, 0x3c, 0x4a, 0x4c, 0x5a, 0x4f, 0x46, 0x4b, 0x47, 0x4a, 0x43, 0x4c,
-    0x56, 0x5a, 0x4a, 0x53, 0x4c, 0x49, 0x46, 0x4c, 0x45, 0x59, 0x40, 0x4b,
-    0x48, 0x60, 0x3d, 0x42, 0x52, 0x3f, 0x42, 0x3d, 0x52, 0x5f, 0x46, 0x42,
-    0x4b, 0x4e, 0x4a, 0x3d, 0x52, 0x55, 0x53, 0x37, 0x47, 0x3e, 0x4a, 0x42,
-    0x51, 0x54, 0x48, 0x48, 0x4b, 0x48, 0x3e, 0x52, 0x41, 0x4e, 0x4c, 0x4f,
-    0x43, 0x3b, 0x4b, 0x4b, 0x4c, 0x40, 0x48, 0x49, 0x4d, 0x3a, 0x45, 0x3c,
-    0x53, 0x44, 0x48, 0x4d, 0x4b, 0x49, 0x46, 0x3c, 0x4d, 0x40, 0x51, 0x3f,
-    0x4c, 0x45, 0x44, 0x2f, 0x49, 0x51, 0x3f, 0x4d, 0x3e, 0x4e, 0x3c, 0x30,
-    0x3d, 0x48, 0x4f, 0x3f, 0x45, 0x45, 0x46, 0x3b, 0x4c, 0x46, 0x4d, 0x50,
-    0x4c, 0x3d, 0x41, 0x37, 0x3e, 0x3e, 0x4f, 0x4b, 0x4d, 0x4f, 0x45, 0x45,
-    0x4a, 0x47, 0x4a, 0x44, 0x43, 0x46, 0x51, 0x41, 0x4e, 0x39, 0x44, 0x4a,
-    0x4e, 0x49, 0x4a, 0x42, 0x49, 0x4b, 0x4e, 0x48, 0x49, 0x4a, 0x45, 0x4a,
-    0x45, 0x41, 0x4a, 0x4b, 0x42, 0x41, 0x48, 0x4a, 0x44, 0x3a, 0x46, 0x49,
-    0x54, 0x45, 0x44, 0x60, 0x4a, 0x4e, 0x45, 0x4a, 0x4a, 0x45, 0x4b, 0x49,
-    0x42, 0x44, 0x46, 0x50, 0x4b, 0x4b, 0x4e, 0x45, 0x48, 0x3e, 0x55, 0x42,
-    0x51, 0x49, 0x49, 0x44, 0x4e, 0x54, 0x53, 0x49, 0x4c, 0x63, 0x48, 0x5a,
-    0x50, 0x4b, 0x45, 0x49, 0x43, 0x57, 0x4c, 0x3f, 0x4d, 0x67, 0x3f, 0x47,
-    0x53, 0x49, 0x43, 0x44, 0x49, 0x61, 0x50, 0x47, 0x49, 0x49, 0x4a, 0x42,
-    0x4a, 0x51, 0x46, 0x43, 0x3f, 0x34, 0x40, 0x3a, 0x45, 0x54, 0x4c, 0x55,
-    0x40, 0x3c, 0x4a, 0x4d, 0x3e, 0x4d, 0x48, 0x51, 0x4c, 0x3e, 0x4c, 0x4f,
-    0x50, 0x47, 0x4d, 0x49, 0x4d, 0x4e, 0x45, 0x43, 0x41, 0x41, 0x40, 0x47,
-    0x43, 0x4a, 0x4a, 0x3c, 0x4c, 0x3d, 0x4e, 0x43, 0x41, 0x42, 0x4a, 0x30,
-    0x45, 0x4c, 0x45, 0x55, 0x46, 0x39, 0x43, 0x39, 0x45, 0x47, 0x48, 0x53,
-    0x4a, 0x48, 0x43, 0x38, 0x4f, 0x51, 0x4d, 0x4c, 0x41, 0x46, 0x40, 0x3d,
-    0x43, 0x4b, 0x40, 0x46, 0x47, 0x50, 0x4a, 0x43, 0x50, 0x4e, 0x45, 0x4f,
-    0x4d, 0x44, 0x4d, 0x3f, 0x4e, 0x48, 0x4a, 0x49, 0x44, 0x3d, 0x4a, 0x44,
-    0x40, 0x45, 0x49, 0x40, 0x4a, 0x44, 0x4f, 0x4a, 0x43, 0x4a, 0x4e, 0x52,
-    0x4d, 0x50, 0x48, 0x4c, 0x43, 0x45, 0x4d, 0x54, 0x4a, 0x49, 0x4c, 0x58,
-    0x4c, 0x48, 0x4c, 0x44, 0x4b, 0x4e, 0x52, 0x44, 0x49, 0x44, 0x47, 0x4e,
-    0x4b, 0x45, 0x49, 0x3e, 0x4c, 0x3b, 0x53, 0x3f, 0x51, 0x41, 0x3f, 0x44,
-    0x43, 0x4a, 0x4b, 0x43, 0x53, 0x57, 0x50, 0x53, 0x4f, 0x4b, 0x48, 0x51,
-    0x47, 0x49, 0x46, 0x4d, 0x4d, 0x5e, 0x44, 0x46, 0x56, 0x3d, 0x3c, 0x3e,
-    0x47, 0x55, 0x54, 0x46, 0x42, 0x49, 0x4f, 0x43, 0x48, 0x54, 0x51, 0x40,
-    0x44, 0x44, 0x47, 0x45, 0x4b, 0x59, 0x4d, 0x47, 0x40, 0x39, 0x48, 0x54,
-    0x43, 0x45, 0x44, 0x42, 0x4c, 0x3c, 0x4d, 0x42, 0x4b, 0x45, 0x42, 0x48,
-    0x51, 0x44, 0x45, 0x3f, 0x3d, 0x49, 0x4b, 0x4a, 0x41, 0x43, 0x4f, 0x3f,
-    0x51, 0x4b, 0x44, 0x46, 0x46, 0x44, 0x53, 0x3d, 0x47, 0x47, 0x43, 0x4b,
-    0x41, 0x43, 0x3c, 0x3b, 0x49, 0x47, 0x47, 0x49, 0x4b, 0x3d, 0x43, 0x43,
-    0x4b, 0x47, 0x45, 0x4e, 0x42, 0x4a, 0x4c, 0x3e, 0x51, 0x3e, 0x46, 0x44,
-    0x46, 0x43, 0x42, 0x42, 0x47, 0x4d, 0x51, 0x4b, 0x49, 0x44, 0x4d, 0x40,
-    0x50, 0x43, 0x41, 0x4c, 0x42, 0x49, 0x49, 0x4c, 0x42, 0x50, 0x48, 0x3f,
-    0x46, 0x42, 0x48, 0x57, 0x49, 0x4d, 0x47, 0x4e, 0x48, 0x4b, 0x46, 0x50,
-    0x47, 0x45, 0x52, 0x45, 0x4b, 0x48, 0x40, 0x5b, 0x4e, 0x43, 0x51, 0x48,
-    0x48, 0x4a, 0x4a, 0x4a, 0x52, 0x51, 0x4c, 0x4b, 0x42, 0x55, 0x4d, 0x46,
-    0x50, 0x40, 0x4a, 0x50, 0x51, 0x3e, 0x42, 0x4c, 0x43, 0x46, 0x4d, 0x46,
-    0x46, 0x4d, 0x4d, 0x52, 0x4e, 0x44, 0x45, 0x47, 0x49, 0x4c, 0x41, 0x44,
-    0x4d, 0x54, 0x4c, 0x4a, 0x54, 0x3e, 0x44, 0x43, 0x53, 0x55, 0x4b, 0x4a,
-    0x47, 0x47, 0x4f, 0x46, 0x4f, 0x4b, 0x51, 0x3f, 0x41, 0x4c, 0x43, 0x46,
-    0x55, 0x51, 0x40, 0x4b, 0x4f, 0x40, 0x47, 0x50, 0x4e, 0x4a, 0x46, 0x4e,
-    0x42, 0x4d, 0x48, 0x49, 0x48, 0x4a, 0x4a, 0x43, 0x49, 0x48, 0x44, 0x3b,
-    0x51, 0x46, 0x3d, 0x43, 0x47, 0x4a, 0x4f, 0x42, 0x4a, 0x50, 0x4f, 0x41,
-    0x45, 0x45, 0x43, 0x3c, 0x4c, 0x4c, 0x46, 0x4b, 0x3e, 0x44, 0x4b, 0x3a,
-    0x45, 0x50, 0x42, 0x48, 0x46, 0x47, 0x44, 0x3a, 0x53, 0x46, 0x4e, 0x4f,
-    0x43, 0x40, 0x46, 0x48, 0x4e, 0x45, 0x3f, 0x47, 0x48, 0x3f, 0x44, 0x4f,
-    0x44, 0x47, 0x4e, 0x47, 0x47, 0x49, 0x42, 0x43, 0x3f, 0x49, 0x4a, 0x53,
-    0x53, 0x4a, 0x4e, 0x4a, 0x49, 0x4d, 0x49, 0x41, 0x48, 0x4d, 0x4d, 0x4e,
-    0x4b, 0x45, 0x4d, 0x4a, 0x46, 0x4a, 0x46, 0x51, 0x4b, 0x47, 0x49, 0x45,
-    0x49, 0x49, 0x4b, 0x5c, 0x48, 0x42, 0x51, 0x4c, 0x41, 0x3f, 0x4c, 0x42,
-    0x4f, 0x45, 0x4b, 0x4a, 0x52, 0x48, 0x53, 0x4f, 0x40, 0x47, 0x41, 0x47,
-    0x68, 0xfb, 0xff, 0xff, 0x4c, 0xfc, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00,
-    0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xe8, 0x03, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00,
-    0x58, 0x01, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, 0x00,
-    0x38, 0x02, 0x00, 0x00, 0x9c, 0x02, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00,
-    0x14, 0x03, 0x00, 0x00, 0xfe, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
-    0x10, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
-    0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x19, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0x52, 0x65, 0x6c, 0x75, 0x00, 0x00, 0x00, 0x00,
-    0xcc, 0xfc, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
-    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x17, 0xbf, 0xd2, 0x3f, 0x01, 0x00, 0x00, 0x00, 0x58, 0xec, 0xd1, 0x43,
-    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6e, 0xfd, 0xff, 0xff,
-    0x00, 0x00, 0x00, 0x02, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
-    0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x08, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x43, 0x6f, 0x6e, 0x76,
-    0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x34, 0xff, 0xff, 0xff,
-    0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0xf5, 0xf7, 0x84, 0x3a, 0xc2, 0xfd, 0xff, 0xff,
-    0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,
-    0x1c, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,
-    0x61, 0x70, 0x65, 0x5f, 0x31, 0x00, 0x00, 0x00, 0x94, 0xfd, 0xff, 0xff,
-    0x30, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x80, 0x3f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x43,
-    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3a, 0xfe, 0xff, 0xff,
-    0x00, 0x00, 0x00, 0x02, 0x10, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
-    0x10, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x4d, 0x61, 0x74, 0x4d,
-    0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x0c, 0x00, 0x0c, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,
-    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0xc5, 0x01, 0x2a, 0x3b, 0x96, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
-    0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
-    0x44, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x0a, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
-    0x25, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73, 0x5f,
-    0x71, 0x75, 0x61, 0x6e, 0x74, 0x2f, 0x46, 0x61, 0x6b, 0x65, 0x51, 0x75,
-    0x61, 0x6e, 0x74, 0x57, 0x69, 0x74, 0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61,
-    0x78, 0x56, 0x61, 0x72, 0x73, 0x00, 0x00, 0x00, 0x84, 0xfe, 0xff, 0xff,
-    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xf5, 0xf7, 0x84, 0x3a,
-    0x01, 0x00, 0x00, 0x00, 0x6e, 0x88, 0xae, 0x3d, 0x01, 0x00, 0x00, 0x00,
-    0xd4, 0x97, 0x30, 0xbe, 0x26, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
-    0x10, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
-    0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f,
-    0x31, 0x00, 0x00, 0x00, 0xec, 0xfe, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00,
-    0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0x2f, 0xad, 0x18, 0x40, 0x01, 0x00, 0x00, 0x00,
-    0x02, 0x38, 0xa2, 0x43, 0x01, 0x00, 0x00, 0x00, 0x02, 0xf1, 0x8d, 0xc3,
-    0x8e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00,
-    0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
-    0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-    0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,
-    0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00, 0x5c, 0xff, 0xff, 0xff,
-    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3b,
-    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x3f, 0x01, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00,
-    0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x14, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0x30, 0x11, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00,
-    0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e,
-    0x74, 0x5f, 0x31, 0x2f, 0x46, 0x61, 0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e,
-    0x74, 0x57, 0x69, 0x74, 0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56,
-    0x61, 0x72, 0x73, 0x2f, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73,
-    0x65, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x04, 0x00, 0x08, 0x00,
-    0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00,
-    0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0x31, 0x83, 0xce, 0x3a, 0x01, 0x00, 0x00, 0x00,
-    0x4d, 0x97, 0x92, 0x3e, 0x01, 0x00, 0x00, 0x00, 0x84, 0x75, 0xec, 0xbd,
-    0x03, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,
-    0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
-    0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00,
-    0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x14, 0x00, 0x1c, 0x00,
-    0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00, 0x14, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08,
-    0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
-    0x28, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
-    0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00,
-    0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x10, 0x00, 0x00, 0x00,
-    0x00, 0x00, 0x14, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
-    0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00,
-    0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
-    0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-    0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x10, 0x00,
-    0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0c, 0x00, 0x00, 0x00,
-    0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
-    0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
-    0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff, 0x00, 0x19, 0x06, 0x00,
-    0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x09, 0x06, 0x00,
-    0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04};
-const int g_tiny_conv_model_data_len = 19800;
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/BUILD b/tensorflow/contrib/lite/experimental/micro/kernels/BUILD
deleted file mode 100644
index a012f950e6f58f082d0a7c9ac0b4cd9018bcf40b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/kernels/BUILD
+++ /dev/null
@@ -1,107 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-load(
-    "//tensorflow/contrib/lite/experimental/micro/testing:micro_test.bzl",
-    "tflite_micro_cc_test",
-)
-
-cc_library(
-    name = "micro_ops",
-    srcs = [
-        "depthwise_conv.cc",
-        "fully_connected.cc",
-        "softmax.cc",
-    ],
-    hdrs = [
-    ],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
-        "//tensorflow/contrib/lite/kernels:kernel_util",
-        "//tensorflow/contrib/lite/kernels:op_macros",
-        "//tensorflow/contrib/lite/kernels:padding",
-        "//tensorflow/contrib/lite/kernels/internal:quantization_util",
-        "//tensorflow/contrib/lite/kernels/internal:reference_base",
-        "//tensorflow/contrib/lite/kernels/internal:tensor",
-    ],
-)
-
-cc_library(
-    name = "all_ops_resolver",
-    srcs = [
-        "all_ops_resolver.cc",
-    ],
-    hdrs = [
-        "all_ops_resolver.h",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        ":micro_ops",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
-    ],
-)
-
-cc_library(
-    name = "test_utils",
-    srcs = [
-    ],
-    hdrs = [
-        "test_utils.h",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/core/api",
-        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
-        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
-    ],
-)
-
-tflite_micro_cc_test(
-    name = "depthwise_conv_test",
-    srcs = [
-        "depthwise_conv_test.cc",
-    ],
-    deps = [
-        ":all_ops_resolver",
-        ":test_utils",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
-        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
-    ],
-)
-
-tflite_micro_cc_test(
-    name = "fully_connected_test",
-    srcs = [
-        "fully_connected_test.cc",
-    ],
-    deps = [
-        ":all_ops_resolver",
-        ":test_utils",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
-        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
-    ],
-)
-
-tflite_micro_cc_test(
-    name = "softmax_test",
-    srcs = [
-        "softmax_test.cc",
-    ],
-    deps = [
-        ":all_ops_resolver",
-        ":test_utils",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
-        "//tensorflow/contrib/lite/experimental/micro/testing:micro_test",
-    ],
-)
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h b/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h
deleted file mode 100644
index f836064a3f63443ff577e7ac7a8b791cbb2c24c5..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
-
-#include "tensorflow/contrib/lite/experimental/micro/compatibility.h"
-#include "tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-
-class AllOpsResolver : public MicroMutableOpResolver {
- public:
-  AllOpsResolver();
-
- private:
-  TF_LITE_REMOVE_VIRTUAL_DELETE
-};
-
-}  // namespace micro
-}  // namespace ops
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv.cc
deleted file mode 100644
index 4f17263181982afdaa1941194b88d58f0ef0ca74..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/padding.h"
-
-#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-namespace depthwise_conv {
-namespace {
-
-constexpr int kInputTensor = 0;
-constexpr int kFilterTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-
-struct OpData {
-  TfLitePaddingValues padding;
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-  // The range of the fused activation layer. For example for kNone and
-  // uint8_t these would be 0 and 255.
-  int32_t output_activation_min;
-  int32_t output_activation_max;
-};
-
-TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
-                             TfLiteDepthwiseConvParams* params, int width,
-                             int height, int filter_width, int filter_height,
-                             int out_width, int out_height,
-                             const TfLiteType data_type, OpData* data) {
-  data->padding.height = ComputePadding(params->stride_height, 1, height,
-                                        filter_height, out_height);
-  data->padding.width =
-      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
-
-  // Note that quantized inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
-  if (data_type != kTfLiteFloat32) {
-    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-    const TfLiteTensor* bias =
-        GetOptionalInputTensor(context, node, kBiasTensor);
-    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  return nullptr;
-}
-
-void Free(TfLiteContext* context, void* buffer) {}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  return kTfLiteOk;
-}
-
-void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteDepthwiseConvParams* params, OpData* data,
-               const TfLiteTensor* input, const TfLiteTensor* filter,
-               const TfLiteTensor* bias, TfLiteTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
-                           &output_activation_max);
-
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = 1;
-  op_params.dilation_height_factor = 1;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-
-  tflite::reference_ops::DepthwiseConv(
-      op_params, GetTensorShape(input), GetTensorData<float>(input),
-      GetTensorShape(filter), GetTensorData<float>(filter),
-      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
-      GetTensorData<float>(output));
-}
-
-void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteDepthwiseConvParams* params, OpData* data,
-                   const TfLiteTensor* input, const TfLiteTensor* filter,
-                   const TfLiteTensor* bias, TfLiteTensor* output) {
-  const int32_t input_offset = -input->params.zero_point;
-  const int32_t filter_offset = -filter->params.zero_point;
-  const int32_t output_offset = output->params.zero_point;
-
-  tflite::DepthwiseParams op_params;
-  // Padding type is ignored, but still set.
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = 1;
-  op_params.dilation_height_factor = 1;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = data->output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = -data->output_shift;
-
-  tflite::reference_ops::DepthwiseConv(
-      op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
-      GetTensorShape(filter), GetTensorData<uint8_t>(filter),
-      GetTensorShape(bias), GetTensorData<int32_t>(bias),
-      GetTensorShape(output), GetTensorData<uint8_t>(output));
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  const TfLiteTensor* bias =
-      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
-
-  const TfLiteType data_type = input->type;
-  int width = SizeOfDimension(input, 2);
-  int height = SizeOfDimension(input, 1);
-  int filter_width = SizeOfDimension(filter, 2);
-  int filter_height = SizeOfDimension(filter, 1);
-  int out_width = ComputeOutSize(params->padding, width, filter_width,
-                                 params->stride_width);
-  int out_height = ComputeOutSize(params->padding, height, filter_height,
-                                  params->stride_height);
-  OpData local_data_object;
-  OpData* data = &local_data_object;
-  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
-                                        filter_width, filter_height, out_width,
-                                        out_height, data_type, data));
-
-  // TODO(aselle): Consider whether float conv and quantized conv should be
-  // separate ops to avoid dispatch overhead here.
-  switch (input->type) {  // Already know in/out types are same.
-    case kTfLiteFloat32:
-      EvalFloat(context, node, params, data, input, filter, bias, output);
-      break;
-    case kTfLiteUInt8:
-      EvalQuantized(context, node, params, data, input, filter, bias, output);
-      break;
-    default:
-      context->ReportError(context, "Type %d not currently supported.",
-                           input->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace depthwise_conv
-
-TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
-  static TfLiteRegistration r = {depthwise_conv::Init, depthwise_conv::Free,
-                                 depthwise_conv::Prepare, depthwise_conv::Eval};
-  return &r;
-}
-
-}  // namespace micro
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test.cc b/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test.cc
deleted file mode 100644
index 169899c471dd44399b4d8a479cecbbbd78ba1215..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/kernels/depthwise_conv_test.cc
+++ /dev/null
@@ -1,406 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h"
-#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
-#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
-
-namespace tflite {
-namespace testing {
-namespace {
-
-void TestDepthwiseConvFloat(std::initializer_list<int> input_dims_data,
-                            std::initializer_list<float> input_data,
-                            std::initializer_list<int> filter_dims_data,
-                            std::initializer_list<float> filter_data,
-                            std::initializer_list<int> bias_dims_data,
-                            std::initializer_list<float> bias_data,
-                            std::initializer_list<float> expected_output_data,
-                            std::initializer_list<int> output_dims_data,
-                            TfLiteFusedActivation activation,
-                            float* output_data) {
-  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
-  TfLiteIntArray* filter_dims = IntArrayFromInitializer(filter_dims_data);
-  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
-  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
-  const int output_dims_count = ElementCount(*output_dims);
-
-  constexpr int inputs_size = 3;
-  constexpr int outputs_size = 1;
-  constexpr int tensors_size = inputs_size + outputs_size;
-  TfLiteTensor tensors[tensors_size] = {
-      CreateFloatTensor(input_data, input_dims, "input_tensor"),
-      CreateFloatTensor(filter_data, filter_dims, "filter_tensor"),
-      CreateFloatTensor(bias_data, bias_dims, "bias_tensor"),
-      CreateFloatTensor(output_data, output_dims, "output_tensor"),
-  };
-
-  TfLiteContext context;
-  PopulateContext(tensors, tensors_size, &context);
-
-  ::tflite::ops::micro::AllOpsResolver resolver;
-  const TfLiteRegistration* registration =
-      resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1);
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
-
-  int input_depth = input_dims->data[3];
-  int output_depth = filter_dims->data[3];
-  int depth_mul = output_depth / input_depth;
-  TfLiteDepthwiseConvParams builtin_data = {
-      kTfLitePaddingValid, 1, 1, depth_mul, activation,
-  };
-  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
-  size_t init_data_size = 0;
-  void* user_data = nullptr;
-  if (registration->init) {
-    user_data = registration->init(&context, init_data, init_data_size);
-  }
-  int inputs_array_data[] = {3, 0, 1, 2};
-  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
-  int outputs_array_data[] = {1, 3};
-  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
-  int temporaries_array_data[] = {0};
-  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
-
-  TfLiteNode node;
-  node.inputs = inputs_array;
-  node.outputs = outputs_array;
-  node.temporaries = temporaries_array;
-  node.user_data = user_data;
-  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
-  node.custom_initial_data = nullptr;
-  node.custom_initial_data_size = 0;
-  node.delegate = nullptr;
-  if (registration->prepare) {
-    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
-  }
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
-  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
-  if (registration->free) {
-    registration->free(&context, user_data);
-  }
-  for (int i = 0; i < output_dims_count; ++i) {
-    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
-                              1e-5f);
-  }
-}
-
-void TestDepthwiseConvQuantized(
-    std::initializer_list<int> input_dims_data,
-    std::initializer_list<uint8_t> input_data, float input_min, float input_max,
-    std::initializer_list<int> filter_dims_data,
-    std::initializer_list<uint8_t> filter_data, float filter_min,
-    float filter_max, std::initializer_list<int> bias_dims_data,
-    std::initializer_list<int32_t> bias_data, float bias_min, float bias_max,
-    std::initializer_list<uint8_t> expected_output_data,
-    std::initializer_list<int> output_dims_data, float output_min,
-    float output_max, TfLiteFusedActivation activation, uint8_t* output_data) {
-  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
-  TfLiteIntArray* filter_dims = IntArrayFromInitializer(filter_dims_data);
-  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
-  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
-  const int output_dims_count = ElementCount(*output_dims);
-
-  constexpr int inputs_size = 3;
-  constexpr int outputs_size = 1;
-  constexpr int tensors_size = inputs_size + outputs_size;
-  TfLiteTensor tensors[tensors_size] = {
-      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
-                            input_max),
-      CreateQuantizedTensor(filter_data, filter_dims, "filter_tensor",
-                            filter_min, filter_max),
-      CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_min,
-                              bias_max),
-      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
-                            output_min, output_max),
-  };
-
-  TfLiteContext context;
-  PopulateContext(tensors, tensors_size, &context);
-
-  ::tflite::ops::micro::AllOpsResolver resolver;
-  const TfLiteRegistration* registration =
-      resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1);
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
-
-  int input_depth = input_dims->data[3];
-  int output_depth = filter_dims->data[3];
-  int depth_mul = output_depth / input_depth;
-  TfLiteDepthwiseConvParams builtin_data = {
-      kTfLitePaddingValid, 1, 1, depth_mul, activation,
-  };
-  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
-  size_t init_data_size = 0;
-  void* user_data = nullptr;
-  if (registration->init) {
-    user_data = registration->init(&context, init_data, init_data_size);
-  }
-
-  int inputs_array_data[] = {3, 0, 1, 2};
-  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
-  int outputs_array_data[] = {1, 3};
-  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
-  int temporaries_array_data[] = {0};
-  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
-
-  TfLiteNode node;
-  node.inputs = inputs_array;
-  node.outputs = outputs_array;
-  node.temporaries = temporaries_array;
-  node.user_data = user_data;
-  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
-  node.custom_initial_data = nullptr;
-  node.custom_initial_data_size = 0;
-  node.delegate = nullptr;
-
-  if (registration->prepare) {
-    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
-  }
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
-  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
-  if (registration->free) {
-    registration->free(&context, user_data);
-  }
-  for (int i = 0; i < output_dims_count; ++i) {
-    TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]);
-  }
-}
-
-}  // namespace
-}  // namespace testing
-}  // namespace tflite
-
-TF_LITE_MICRO_TESTS_BEGIN
-
-TF_LITE_MICRO_TEST(SimpleTest) {
-  const int output_dims_count = 8;
-  float output_data[output_dims_count];
-  tflite::testing::TestDepthwiseConvFloat(  //
-      {4, 1, 3, 2, 2},                      // Input shape.
-      {
-          1, 2, 7, 8,    // Input values.
-          3, 4, 9, 10,   //
-          5, 6, 11, 12,  //
-      },
-      {4, 1, 2, 2, 4},  // Filters shape.
-      {
-          1, 2, 3, 4,        // Filters values.
-          -9, 10, -11, 12,   //
-          5, 6, 7, 8,        //
-          13, -14, 15, -16,  //
-      },
-      {1, 4},  // Bias shape.
-      {
-          1, 2, 3, 4,  // Bias values.
-      },
-      {
-          71, -34, 99, -20,  // Expected results.
-          91, -26, 127, -4,  //
-      },
-      {4, 1, 2, 1, 4},  // Output shape.
-      kTfLiteActNone, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTestQuantized) {
-  using tflite::testing::F2Q;
-  using tflite::testing::F2Q32;
-
-  const float input_min = -63.5f;
-  const float input_max = 64.0f;
-  const float filter_min = -63.5f;
-  const float filter_max = 64.0f;
-  const float bias_min = 0.0f;
-  const float bias_max = 64.0f * (1 << 24);
-  const float output_min = -127.0f;
-  const float output_max = 128.0f;
-  const int output_dims_count = 8;
-  uint8_t output_data[output_dims_count];
-
-  tflite::testing::TestDepthwiseConvQuantized(  //
-      {4, 1, 3, 2, 2},                          // Input shape.
-      {
-          // Input values.
-          F2Q(1, input_min, input_max),
-          F2Q(2, input_min, input_max),
-          F2Q(7, input_min, input_max),
-          F2Q(8, input_min, input_max),
-          F2Q(3, input_min, input_max),
-          F2Q(4, input_min, input_max),
-          F2Q(9, input_min, input_max),
-          F2Q(10, input_min, input_max),
-          F2Q(5, input_min, input_max),
-          F2Q(6, input_min, input_max),
-          F2Q(11, input_min, input_max),
-          F2Q(12, input_min, input_max),
-      },
-      input_min, input_max,  // Input quantization range.
-      {4, 1, 2, 2, 4},       // Filter shape.
-      {
-          // Filter values.
-          F2Q(1, filter_min, filter_max),
-          F2Q(2, filter_min, filter_max),
-          F2Q(3, filter_min, filter_max),
-          F2Q(4, filter_min, filter_max),
-          F2Q(-9, filter_min, filter_max),
-          F2Q(10, filter_min, filter_max),
-          F2Q(-11, filter_min, filter_max),
-          F2Q(12, filter_min, filter_max),
-          F2Q(5, filter_min, filter_max),
-          F2Q(6, filter_min, filter_max),
-          F2Q(7, filter_min, filter_max),
-          F2Q(8, filter_min, filter_max),
-          F2Q(13, filter_min, filter_max),
-          F2Q(-14, filter_min, filter_max),
-          F2Q(15, filter_min, filter_max),
-          F2Q(-16, filter_min, filter_max),
-      },
-      filter_min, filter_max,  // Filter quantization range.
-      {1, 4},                  // Bias shape.
-      {
-          // Bias values.
-          F2Q32(1, bias_min, bias_max),
-          F2Q32(2, bias_min, bias_max),
-          F2Q32(3, bias_min, bias_max),
-          F2Q32(4, bias_min, bias_max),
-      },
-      bias_min, bias_max,  // Bias quantization range.
-      {
-          // Expected results.
-          F2Q(71, output_min, output_max),
-          F2Q(-34, output_min, output_max),
-          F2Q(99, output_min, output_max),
-          F2Q(-20, output_min, output_max),
-          F2Q(91, output_min, output_max),
-          F2Q(-26, output_min, output_max),
-          F2Q(127, output_min, output_max),
-          F2Q(-4, output_min, output_max),
-      },
-      {4, 1, 2, 1, 4},         // Output shape.
-      output_min, output_max,  // Output quantization range.
-      kTfLiteActNone, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTestRelu) {
-  const int output_dims_count = 8;
-  float output_data[output_dims_count];
-  tflite::testing::TestDepthwiseConvFloat(  //
-      {4, 1, 3, 2, 2},                      // Input shape.
-      {
-          1, 2, 7, 8,    // Input values.
-          3, 4, 9, 10,   //
-          5, 6, 11, 12,  //
-      },
-      {4, 1, 2, 2, 4},  // Filters shape.
-      {
-          1, 2, 3, 4,        // Filters values.
-          -9, 10, -11, 12,   //
-          5, 6, 7, 8,        //
-          13, -14, 15, -16,  //
-      },
-      {1, 4},  // Bias shape.
-      {
-          1, 2, 3, 4,  // Bias values.
-      },
-      {
-          71, 0, 99, 0,   // Expected results.
-          91, 0, 127, 0,  //
-      },
-      {4, 1, 2, 1, 4},  // Output shape.
-      kTfLiteActRelu, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTestReluQuantized) {
-  using tflite::testing::F2Q;
-  using tflite::testing::F2Q32;
-
-  const float input_min = -63.5f;
-  const float input_max = 64.0f;
-  const float filter_min = -63.5f;
-  const float filter_max = 64.0f;
-  const float bias_min = 0.0f;
-  const float bias_max = 64.0f * (1 << 24);
-  const float output_min = -127.0f;
-  const float output_max = 128.0f;
-  const int output_dims_count = 8;
-  uint8_t output_data[output_dims_count];
-
-  tflite::testing::TestDepthwiseConvQuantized(  //
-      {4, 1, 3, 2, 2},                          // Input shape.
-      {
-          // Input values.
-          F2Q(1, input_min, input_max),
-          F2Q(2, input_min, input_max),
-          F2Q(7, input_min, input_max),
-          F2Q(8, input_min, input_max),
-          F2Q(3, input_min, input_max),
-          F2Q(4, input_min, input_max),
-          F2Q(9, input_min, input_max),
-          F2Q(10, input_min, input_max),
-          F2Q(5, input_min, input_max),
-          F2Q(6, input_min, input_max),
-          F2Q(11, input_min, input_max),
-          F2Q(12, input_min, input_max),
-      },
-      input_min, input_max,  // Input quantization range.
-      {4, 1, 2, 2, 4},       // Filter shape.
-      {
-          // Filter values.
-          F2Q(1, filter_min, filter_max),
-          F2Q(2, filter_min, filter_max),
-          F2Q(3, filter_min, filter_max),
-          F2Q(4, filter_min, filter_max),
-          F2Q(-9, filter_min, filter_max),
-          F2Q(10, filter_min, filter_max),
-          F2Q(-11, filter_min, filter_max),
-          F2Q(12, filter_min, filter_max),
-          F2Q(5, filter_min, filter_max),
-          F2Q(6, filter_min, filter_max),
-          F2Q(7, filter_min, filter_max),
-          F2Q(8, filter_min, filter_max),
-          F2Q(13, filter_min, filter_max),
-          F2Q(-14, filter_min, filter_max),
-          F2Q(15, filter_min, filter_max),
-          F2Q(-16, filter_min, filter_max),
-      },
-      filter_min, filter_max,  // Filter quantization range.
-      {1, 4},                  // Bias shape.
-      {
-          // Bias values.
-          F2Q32(1, bias_min, bias_max),
-          F2Q32(2, bias_min, bias_max),
-          F2Q32(3, bias_min, bias_max),
-          F2Q32(4, bias_min, bias_max),
-      },
-      bias_min, bias_max,  // Bias quantization range.
-      {
-          // Expected results.
-          F2Q(71, output_min, output_max),
-          F2Q(0, output_min, output_max),
-          F2Q(99, output_min, output_max),
-          F2Q(0, output_min, output_max),
-          F2Q(91, output_min, output_max),
-          F2Q(0, output_min, output_max),
-          F2Q(127, output_min, output_max),
-          F2Q(0, output_min, output_max),
-      },
-      {4, 1, 2, 1, 4},         // Output shape.
-      output_min, output_max,  // Output quantization range.
-      kTfLiteActRelu, output_data);
-}
-
-TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected.cc b/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected.cc
deleted file mode 100644
index 1e9e54cafb8c91af1b42d6d23396495ecad6e602..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected.cc
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h"
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-
-namespace tflite {
-namespace ops {
-namespace micro {
-namespace fully_connected {
-namespace {
-
-struct OpData {
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-  // The range of the fused activation layer. For example for kNone and
-  // uint8_t these would be 0 and 255.
-  int32_t output_activation_min;
-  int32_t output_activation_max;
-  // The index of the temporary tensor where the quantized inputs are cached.
-  int input_quantized_index;
-};
-
-constexpr int kInputTensor = 0;
-constexpr int kWeightsTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-
-TfLiteStatus CalculateOpData(TfLiteContext* context,
-                             TfLiteFullyConnectedParams* params,
-                             TfLiteType data_type, const TfLiteTensor* input,
-                             const TfLiteTensor* filter,
-                             const TfLiteTensor* bias, TfLiteTensor* output,
-                             OpData* data) {
-  TfLiteStatus status = kTfLiteOk;
-  if (data_type != kTfLiteFloat32) {
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
-        context, params->activation, output, &data->output_activation_min,
-        &data->output_activation_max));
-  }
-  return status;
-}
-
-}  // namespace
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  return nullptr;
-}
-
-void Free(TfLiteContext* context, void* buffer) {}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                           TfLiteFullyConnectedParams* params, OpData* data,
-                           const TfLiteTensor* input,
-                           const TfLiteTensor* filter, const TfLiteTensor* bias,
-                           TfLiteTensor* output) {
-  const int32_t input_offset = -input->params.zero_point;
-  const int32_t filter_offset = -filter->params.zero_point;
-  const int32_t output_offset = output->params.zero_point;
-
-  tflite::FullyConnectedParams op_params;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = data->output_multiplier;
-  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
-  op_params.output_shift = -data->output_shift;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
-
-#define TF_LITE_FULLY_CONNECTED(output_data_type)                      \
-  reference_ops::FullyConnected(                                       \
-      op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
-      GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
-      GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
-      GetTensorShape(output), GetTensorData<output_data_type>(output), \
-      nullptr)
-  switch (output->type) {
-    case kTfLiteUInt8:
-      TF_LITE_FULLY_CONNECTED(uint8_t);
-      break;
-    case kTfLiteInt16:
-      TF_LITE_FULLY_CONNECTED(int16_t);
-      break;
-    default:
-      context->ReportError(
-          context,
-          "Quantized FullyConnected expects output data type uint8 or int16");
-      return kTfLiteError;
-  }
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
-                       TfLiteFullyConnectedParams* params, OpData* data,
-                       const TfLiteTensor* input, const TfLiteTensor* filter,
-                       const TfLiteTensor* bias, TfLiteTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
-                           &output_activation_max);
-  tflite::FullyConnectedParams op_params;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-  tflite::reference_ops::FullyConnected(
-      op_params, GetTensorShape(input), GetTensorData<float>(input),
-      GetTensorShape(filter), GetTensorData<float>(filter),
-      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
-      GetTensorData<float>(output));
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  TfLiteType data_type = input->type;
-  OpData local_data_object;
-  OpData* data = &local_data_object;
-  TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
-                                        filter, bias, output, data));
-
-  switch (filter->type) {  // Already know in/out types are same.
-    case kTfLiteFloat32:
-      return EvalFloat(context, node, params, data, input, filter, bias,
-                       output);
-    case kTfLiteUInt8:
-      return EvalQuantized(context, node, params, data, input, filter, bias,
-                           output);
-
-    default:
-      context->ReportError(context, "Type %d not currently supported.",
-                           filter->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace fully_connected
-
-TfLiteRegistration* Register_FULLY_CONNECTED() {
-  static TfLiteRegistration r = {fully_connected::Init, fully_connected::Free,
-                                 fully_connected::Prepare,
-                                 fully_connected::Eval};
-  return &r;
-}
-
-}  // namespace micro
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected_test.cc
deleted file mode 100644
index b42bf4c3bca75572dbf8e1907e7fb94be24d41bd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/kernels/fully_connected_test.cc
+++ /dev/null
@@ -1,643 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h"
-#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
-#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
-
-namespace tflite {
-namespace testing {
-namespace {
-
-void TestFullyConnectedFloat(std::initializer_list<int> input_dims_data,
-                             std::initializer_list<float> input_data,
-                             std::initializer_list<int> weights_dims_data,
-                             std::initializer_list<float> weights_data,
-                             std::initializer_list<int> bias_dims_data,
-                             std::initializer_list<float> bias_data,
-                             std::initializer_list<float> expected_output_data,
-                             std::initializer_list<int> output_dims_data,
-                             TfLiteFusedActivation activation,
-                             float* output_data) {
-  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
-  TfLiteIntArray* weights_dims = IntArrayFromInitializer(weights_dims_data);
-  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
-  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
-  const int output_dims_count = ElementCount(*output_dims);
-
-  constexpr int inputs_size = 3;
-  constexpr int outputs_size = 1;
-  constexpr int tensors_size = inputs_size + outputs_size;
-  TfLiteTensor tensors[tensors_size] = {
-      CreateFloatTensor(input_data, input_dims, "input_tensor"),
-      CreateFloatTensor(weights_data, weights_dims, "weights_tensor"),
-      CreateFloatTensor(bias_data, bias_dims, "bias_tensor"),
-      CreateFloatTensor(output_data, output_dims, "output_tensor"),
-  };
-
-  TfLiteContext context;
-  PopulateContext(tensors, tensors_size, &context);
-
-  ::tflite::ops::micro::AllOpsResolver resolver;
-  const TfLiteRegistration* registration =
-      resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1);
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
-
-  TfLiteFullyConnectedParams builtin_data = {
-      activation,
-      kTfLiteFullyConnectedWeightsFormatDefault,
-  };
-  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
-  size_t init_data_size = 0;
-  void* user_data = nullptr;
-  if (registration->init) {
-    user_data = registration->init(&context, init_data, init_data_size);
-  }
-  int inputs_array_data[] = {3, 0, 1, 2};
-  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
-  int outputs_array_data[] = {1, 3};
-  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
-  int temporaries_array_data[] = {0};
-  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
-
-  TfLiteNode node;
-  node.inputs = inputs_array;
-  node.outputs = outputs_array;
-  node.temporaries = temporaries_array;
-  node.user_data = user_data;
-  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
-  node.custom_initial_data = nullptr;
-  node.custom_initial_data_size = 0;
-  node.delegate = nullptr;
-  if (registration->prepare) {
-    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
-  }
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
-  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
-  if (registration->free) {
-    registration->free(&context, user_data);
-  }
-  for (int i = 0; i < output_dims_count; ++i) {
-    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
-                              1e-5f);
-  }
-}
-
-void TestFullyConnectedQuantized(
-    std::initializer_list<int> input_dims_data,
-    std::initializer_list<uint8_t> input_data, float input_min, float input_max,
-    std::initializer_list<int> weights_dims_data,
-    std::initializer_list<uint8_t> weights_data, float weights_min,
-    float weights_max, std::initializer_list<int> bias_dims_data,
-    std::initializer_list<int32_t> bias_data, float bias_min, float bias_max,
-    std::initializer_list<uint8_t> expected_output_data,
-    std::initializer_list<int> output_dims_data, float output_min,
-    float output_max, TfLiteFusedActivation activation, uint8_t* output_data) {
-  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
-  TfLiteIntArray* weights_dims = IntArrayFromInitializer(weights_dims_data);
-  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
-  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
-  const int output_dims_count = ElementCount(*output_dims);
-
-  constexpr int inputs_size = 3;
-  constexpr int outputs_size = 1;
-  constexpr int tensors_size = inputs_size + outputs_size;
-  TfLiteTensor tensors[tensors_size] = {
-      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
-                            input_max),
-      CreateQuantizedTensor(weights_data, weights_dims, "weights_tensor",
-                            weights_min, weights_max),
-      CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_min,
-                              bias_max),
-      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
-                            output_min, output_max),
-  };
-
-  TfLiteContext context;
-  PopulateContext(tensors, tensors_size, &context);
-
-  ::tflite::ops::micro::AllOpsResolver resolver;
-  const TfLiteRegistration* registration =
-      resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1);
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
-
-  TfLiteFullyConnectedParams builtin_data = {
-      activation,
-      kTfLiteFullyConnectedWeightsFormatDefault,
-  };
-  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
-  size_t init_data_size = 0;
-  void* user_data = nullptr;
-  if (registration->init) {
-    user_data = registration->init(&context, init_data, init_data_size);
-  }
-
-  int inputs_array_data[] = {3, 0, 1, 2};
-  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
-  int outputs_array_data[] = {1, 3};
-  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
-  int temporaries_array_data[] = {0};
-  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
-
-  TfLiteNode node;
-  node.inputs = inputs_array;
-  node.outputs = outputs_array;
-  node.temporaries = temporaries_array;
-  node.user_data = user_data;
-  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
-  node.custom_initial_data = nullptr;
-  node.custom_initial_data_size = 0;
-  node.delegate = nullptr;
-
-  if (registration->prepare) {
-    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
-  }
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
-  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
-  if (registration->free) {
-    registration->free(&context, user_data);
-  }
-  for (int i = 0; i < output_dims_count; ++i) {
-    TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]);
-  }
-}
-
-}  // namespace
-}  // namespace testing
-}  // namespace tflite
-
-TF_LITE_MICRO_TESTS_BEGIN
-
-TF_LITE_MICRO_TEST(SimpleTest) {
-  const int output_dims_count = 6;
-  float output_data[output_dims_count];
-  tflite::testing::TestFullyConnectedFloat(  //
-      {2, 2, 10},                            // Input shape.
-      {
-          1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
-          1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
-      },
-      {2, 3, 10},  // Weights shape.
-      {
-          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
-          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
-      },
-      {1, 3},  // Bias shape.
-      {
-          1, 2, 3,  // Bias values.
-      },
-      {
-          24, 25, 26, 58, 59, 60,  // Expected results.
-      },
-      {2, 2, 3},  // Output shape.
-      kTfLiteActNone, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTest2) {
-  const int output_dims_count = 6;
-  float output_data[output_dims_count];
-  tflite::testing::TestFullyConnectedFloat(  //
-      {2, 2, 2},                             // Input shape.
-      {
-          1, 2,  // b = 0
-          2, 1,  // b = 1
-      },
-      {2, 1, 2},  // Weights shape.
-      {
-          2, 4,  // u = 0
-      },
-      {1, 1},  // Bias shape.
-      {
-          1,  // Bias values.
-      },
-      {
-          11, 9,  // Expected results.
-      },
-      {2, 2, 1},  // Output shape.
-      kTfLiteActNone, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTestRelu) {
-  const int output_dims_count = 6;
-  float output_data[output_dims_count];
-  tflite::testing::TestFullyConnectedFloat(  //
-      {2, 2, 10},                            // Input shape.
-      {
-          1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
-          1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
-      },
-      {2, 3, 10},  // Weights shape.
-      {
-          1,  2,  3,  4,  5,  6,  7,  8,  9,  10,   // u = 0
-          -1, -2, -3, -4, -5, -6, -7, -8, -9, -10,  // u = 1
-          1,  2,  3,  4,  5,  6,  7,  8,  9,  10,   // u = 2
-      },
-      {1, 3},  // Bias shape.
-      {
-          1, -2, 3,  // Bias values.
-      },
-      {
-          24, 0, 26, 58, 0, 60,  // Expected results.
-      },
-      {2, 2, 3},  // Output shape.
-      kTfLiteActRelu, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTestQuantized) {
-  using tflite::testing::F2Q;
-  using tflite::testing::F2Q32;
-
-  const float input_min = -63.5f;
-  const float input_max = 64.0f;
-  const float weights_min = -63.5f;
-  const float weights_max = 64.0f;
-  const float bias_min = 0.0f;
-  const float bias_max = 64.0f * (1 << 24);
-  const float output_min = -127.0f;
-  const float output_max = 128.0f;
-  const int output_dims_count = 6;
-  uint8_t output_data[output_dims_count];
-  tflite::testing::TestFullyConnectedQuantized(  //
-      {2, 2, 10},                                // Input shape.
-      {
-          // Input values.
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
-          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
-          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
-      },
-      input_min, input_max,  // Input quantization range.
-      {2, 3, 10},            // Weights shape.
-      {
-          // Weight values.
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-      },
-      weights_min, weights_max,  // Weights quantization range.
-      {1, 3},                    // Bias shape.
-      {
-          F2Q32(1, bias_min, bias_max),
-          F2Q32(2, bias_min, bias_max),
-          F2Q32(3, bias_min, bias_max),
-      },
-      bias_min, bias_max,  // Bias quantization range.
-      {
-          // Expected results.
-          F2Q(24, output_min, output_max),
-          F2Q(25, output_min, output_max),
-          F2Q(26, output_min, output_max),
-          F2Q(58, output_min, output_max),
-          F2Q(59, output_min, output_max),
-          F2Q(60, output_min, output_max),
-      },
-      {2, 2, 3},               // Output shape.
-      output_min, output_max,  // Output quantization range.
-      kTfLiteActNone, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTestQuantizedRelu) {
-  using tflite::testing::F2Q;
-  using tflite::testing::F2Q32;
-
-  const float input_min = -63.5f;
-  const float input_max = 64.0f;
-  const float weights_min = -63.5f;
-  const float weights_max = 64.0f;
-  const float bias_min = 0.0f;
-  const float bias_max = 64.0f * (1 << 24);
-  const float output_min = -127.0f;
-  const float output_max = 128.0f;
-  const int output_dims_count = 6;
-  uint8_t output_data[output_dims_count];
-  tflite::testing::TestFullyConnectedQuantized(  //
-      {2, 2, 10},                                // Input shape.
-      {
-          // Input values.
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
-          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
-          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
-      },
-      input_min, input_max,  // Input quantization range.
-      {2, 3, 10},            // Weights shape.
-      {
-          // Weight values.
-          F2Q(1, weights_min, weights_max),  F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max),  F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max),  F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max),  F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max),  F2Q(10, weights_min, weights_max),
-          F2Q(-1, weights_min, weights_max), F2Q(-2, weights_min, weights_max),
-          F2Q(-3, weights_min, weights_max), F2Q(-4, weights_min, weights_max),
-          F2Q(-5, weights_min, weights_max), F2Q(-6, weights_min, weights_max),
-          F2Q(-7, weights_min, weights_max), F2Q(-8, weights_min, weights_max),
-          F2Q(-9, weights_min, weights_max), F2Q(-10, weights_min, weights_max),
-          F2Q(1, weights_min, weights_max),  F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max),  F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max),  F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max),  F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max),  F2Q(10, weights_min, weights_max),
-      },
-      weights_min, weights_max,  // Weights quantization range.
-      {1, 3},                    // Bias shape.
-      {
-          F2Q32(1, bias_min, bias_max),
-          F2Q32(0, bias_min, bias_max),
-          F2Q32(3, bias_min, bias_max),
-      },
-      bias_min, bias_max,  // Bias quantization range.
-      {
-          // Expected results.
-          F2Q(24, output_min, output_max),
-          F2Q(0, output_min, output_max),
-          F2Q(26, output_min, output_max),
-          F2Q(58, output_min, output_max),
-          F2Q(0, output_min, output_max),
-          F2Q(60, output_min, output_max),
-      },
-      {2, 2, 3},               // Output shape.
-      output_min, output_max,  // Output quantization range.
-      kTfLiteActRelu, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTestQuantizedOutputMultiplierGreaterThan1) {
-  using tflite::testing::F2Q;
-  using tflite::testing::F2Q32;
-
-  const float input_min = -127.0f;
-  const float input_max = 128.0f;
-  const float weights_min = -127.0f;
-  const float weights_max = 128.0f;
-  const float bias_min = 0.0f;
-  const float bias_max = 256.0f * (1 << 24);
-  const float output_min = -63.5f;
-  const float output_max = 64.0f;
-  const int output_dims_count = 6;
-  uint8_t output_data[output_dims_count];
-  tflite::testing::TestFullyConnectedQuantized(  //
-      {2, 2, 10},                                // Input shape.
-      {
-          // Input values.
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
-          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
-          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
-      },
-      input_min, input_max,  // Input quantization range.
-      {2, 3, 10},            // Weights shape.
-      {
-          // Weight values.
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-      },
-      weights_min, weights_max,  // Weights quantization range.
-      {1, 3},                    // Bias shape.
-      {
-          F2Q32(1, bias_min, bias_max),
-          F2Q32(2, bias_min, bias_max),
-          F2Q32(3, bias_min, bias_max),
-      },
-      bias_min, bias_max,  // Bias quantization range.
-      {
-          // Expected results.
-          F2Q(24, output_min, output_max),
-          F2Q(25, output_min, output_max),
-          F2Q(26, output_min, output_max),
-          F2Q(58, output_min, output_max),
-          F2Q(59, output_min, output_max),
-          F2Q(60, output_min, output_max),
-      },
-      {2, 2, 3},               // Output shape.
-      output_min, output_max,  // Output quantization range.
-      kTfLiteActNone, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTest4DInput) {
-  const int output_dims_count = 6;
-  float output_data[output_dims_count];
-  tflite::testing::TestFullyConnectedFloat(  //
-      {4, 1, 1, 5, 1},                       // Input shape.
-      {
-          1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
-          1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
-      },
-      {2, 3, 10},  // Weights shape.
-      {
-          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
-          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
-      },
-      {1, 3},  // Bias shape.
-      {
-          1, 2, 3,  // Bias values.
-      },
-      {
-          24, 25, 26, 58, 59, 60,  // Expected results.
-      },
-      {2, 2, 3},  // Output shape.
-      kTfLiteActNone, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTest4DInputQuantized) {
-  using tflite::testing::F2Q;
-  using tflite::testing::F2Q32;
-
-  const float input_min = -63.5f;
-  const float input_max = 64.0f;
-  const float weights_min = -63.5f;
-  const float weights_max = 64.0f;
-  const float bias_min = 0.0f;
-  const float bias_max = 64.0f * (1 << 24);
-  const float output_min = -127.0f;
-  const float output_max = 128.0f;
-  const int output_dims_count = 6;
-  uint8_t output_data[output_dims_count];
-  tflite::testing::TestFullyConnectedQuantized(  //
-      {4, 1, 1, 5, 1},                           // Input shape.
-      {
-          // Input values.
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
-          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
-          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
-      },
-      input_min, input_max,  // Input quantization range.
-      {2, 3, 10},            // Weights shape.
-      {
-          // Weight values.
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-      },
-      weights_min, weights_max,  // Weights quantization range.
-      {1, 3},                    // Bias shape.
-      {
-          F2Q32(1, bias_min, bias_max),
-          F2Q32(2, bias_min, bias_max),
-          F2Q32(3, bias_min, bias_max),
-      },
-      bias_min, bias_max,  // Bias quantization range.
-      {
-          // Expected results.
-          F2Q(24, output_min, output_max),
-          F2Q(25, output_min, output_max),
-          F2Q(26, output_min, output_max),
-          F2Q(58, output_min, output_max),
-          F2Q(59, output_min, output_max),
-          F2Q(60, output_min, output_max),
-      },
-      {2, 2, 3},               // Output shape.
-      output_min, output_max,  // Output quantization range.
-      kTfLiteActNone, output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedOutputMultiplierGreaterThan1) {
-  using tflite::testing::F2Q;
-  using tflite::testing::F2Q32;
-
-  const float input_min = -127.0f;
-  const float input_max = 128.0f;
-  const float weights_min = -127.0f;
-  const float weights_max = 128.0f;
-  const float bias_min = 0.0f;
-  const float bias_max = 256.0f * (1 << 24);
-  const float output_min = -63.5f;
-  const float output_max = 64.0f;
-  const int output_dims_count = 6;
-  uint8_t output_data[output_dims_count];
-  tflite::testing::TestFullyConnectedQuantized(  //
-      {4, 1, 1, 5, 1},                           // Input shape.
-      {
-          // Input values.
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
-          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
-          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
-          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
-          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
-          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
-          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
-      },
-      input_min, input_max,  // Input quantization range.
-      {2, 3, 10},            // Weights shape.
-      {
-          // Weight values.
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
-          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
-          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
-          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
-          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
-      },
-      weights_min, weights_max,  // Weights quantization range.
-      {1, 3},                    // Bias shape.
-      {
-          F2Q32(1, bias_min, bias_max),
-          F2Q32(2, bias_min, bias_max),
-          F2Q32(3, bias_min, bias_max),
-      },
-      bias_min, bias_max,  // Bias quantization range.
-      {
-          // Expected results.
-          F2Q(24, output_min, output_max),
-          F2Q(25, output_min, output_max),
-          F2Q(26, output_min, output_max),
-          F2Q(58, output_min, output_max),
-          F2Q(59, output_min, output_max),
-          F2Q(60, output_min, output_max),
-      },
-      {2, 2, 3},               // Output shape.
-      output_min, output_max,  // Output quantization range.
-      kTfLiteActNone, output_data);
-}
-
-TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc b/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc
deleted file mode 100644
index df7d87d62302e17e893fb0f6e50ea53740d5fb09..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/kernels/softmax_test.cc
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h"
-#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
-#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
-
-namespace tflite {
-namespace testing {
-namespace {
-
-void TestSoftmaxFloat(std::initializer_list<int> input_dims_data,
-                      std::initializer_list<float> input_data,
-                      std::initializer_list<float> expected_output_data,
-                      std::initializer_list<int> output_dims_data,
-                      float* output_data) {
-  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
-  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
-  const int output_dims_count = ElementCount(*output_dims);
-
-  constexpr int inputs_size = 2;
-  constexpr int outputs_size = 1;
-  constexpr int tensors_size = inputs_size + outputs_size;
-  TfLiteTensor tensors[tensors_size] = {
-      CreateFloatTensor(input_data, input_dims, "input_tensor"),
-      CreateFloatTensor(output_data, output_dims, "output_tensor"),
-  };
-
-  TfLiteContext context;
-  PopulateContext(tensors, tensors_size, &context);
-
-  ::tflite::ops::micro::AllOpsResolver resolver;
-  const TfLiteRegistration* registration =
-      resolver.FindOp(tflite::BuiltinOperator_SOFTMAX, 1);
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
-
-  TfLiteSoftmaxParams builtin_data = {1.0f};
-  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
-  size_t init_data_size = 0;
-  void* user_data = nullptr;
-  if (registration->init) {
-    user_data = registration->init(&context, init_data, init_data_size);
-  }
-  int inputs_array_data[] = {1, 0};
-  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
-  int outputs_array_data[] = {1, 1};
-  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
-  int temporaries_array_data[] = {0};
-  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
-
-  TfLiteNode node;
-  node.inputs = inputs_array;
-  node.outputs = outputs_array;
-  node.temporaries = temporaries_array;
-  node.user_data = user_data;
-  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
-  node.custom_initial_data = nullptr;
-  node.custom_initial_data_size = 0;
-  node.delegate = nullptr;
-  if (registration->prepare) {
-    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
-  }
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
-  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
-  if (registration->free) {
-    registration->free(&context, user_data);
-  }
-  for (int i = 0; i < output_dims_count; ++i) {
-    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
-                              1e-5f);
-  }
-}
-
-void TestSoftmaxQuantized(std::initializer_list<int> input_dims_data,
-                          std::initializer_list<uint8_t> input_data,
-                          float input_min, float input_max,
-                          std::initializer_list<uint8_t> expected_output_data,
-                          std::initializer_list<int> output_dims_data,
-                          float output_min, float output_max,
-                          uint8_t* output_data) {
-  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
-  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
-  const int output_dims_count = ElementCount(*output_dims);
-
-  constexpr int inputs_size = 1;
-  constexpr int outputs_size = 1;
-  constexpr int tensors_size = inputs_size + outputs_size;
-  TfLiteTensor tensors[tensors_size] = {
-      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
-                            input_max),
-      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
-                            output_min, output_max),
-  };
-
-  TfLiteContext context;
-  PopulateContext(tensors, tensors_size, &context);
-
-  ::tflite::ops::micro::AllOpsResolver resolver;
-  const TfLiteRegistration* registration =
-      resolver.FindOp(tflite::BuiltinOperator_SOFTMAX, 1);
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
-
-  TfLiteSoftmaxParams builtin_data = {1.0f};
-  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
-  size_t init_data_size = 0;
-  void* user_data = nullptr;
-  if (registration->init) {
-    user_data = registration->init(&context, init_data, init_data_size);
-  }
-
-  int inputs_array_data[] = {1, 0};
-  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
-  int outputs_array_data[] = {1, 1};
-  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
-  int temporaries_array_data[] = {0};
-  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
-
-  TfLiteNode node;
-  node.inputs = inputs_array;
-  node.outputs = outputs_array;
-  node.temporaries = temporaries_array;
-  node.user_data = user_data;
-  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
-  node.custom_initial_data = nullptr;
-  node.custom_initial_data_size = 0;
-  node.delegate = nullptr;
-
-  if (registration->prepare) {
-    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
-  }
-  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
-  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
-  if (registration->free) {
-    registration->free(&context, user_data);
-  }
-  for (int i = 0; i < output_dims_count; ++i) {
-    TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]);
-  }
-}
-
-}  // namespace
-}  // namespace testing
-}  // namespace tflite
-
-TF_LITE_MICRO_TESTS_BEGIN
-
-TF_LITE_MICRO_TEST(SimpleTest) {
-  const int output_dims_count = 6;
-  float output_data[output_dims_count];
-  tflite::testing::TestSoftmaxFloat(  //
-      {2, 2, 5},                      // Input shape.
-      {
-          1.0, 2.0, 3.0, 4.0, 5.0,       // b = 0
-          -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 0
-      },
-      {
-          // Expected results.
-          0.011656231,
-          0.031684921,
-          0.086128544,
-          0.234121657,
-          0.636408647,
-          0.636408647,
-          0.234121657,
-          0.086128544,
-          0.031684921,
-          0.011656231,
-      },
-      {2, 2, 3},  // Output shape.
-      output_data);
-}
-
-TF_LITE_MICRO_TEST(SimpleTestQuantized) {
-  using tflite::testing::F2Q;
-
-  const float input_min = -63.5f;
-  const float input_max = 64.0f;
-  const float output_min = 0.0f;
-  const float output_max = (255.0f / 256.0f);
-  const int output_dims_count = 6;
-  uint8_t output_data[output_dims_count];
-  tflite::testing::TestSoftmaxQuantized(  //
-      {2, 1, 5},                          // Input shape.
-      {
-          F2Q(1.0, input_min, input_max),
-          F2Q(2.0, input_min, input_max),
-          F2Q(3.0, input_min, input_max),
-          F2Q(4.0, input_min, input_max),
-          F2Q(5.0, input_min, input_max),
-      },
-      input_min, input_max,  // Input quantized range.
-      {
-          // Expected results.
-          F2Q(0.011656231, output_min, output_max),
-          F2Q(0.031684921, output_min, output_max),
-          F2Q(0.086128544, output_min, output_max),
-          F2Q(0.234121657, output_min, output_max),
-          F2Q(0.636408647, output_min, output_max),
-      },
-      {2, 1, 3},               // Output shape.
-      output_min, output_max,  // Output quantized range.
-      output_data);
-}
-
-TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h b/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h
deleted file mode 100644
index 33e54f7990af6cff4f8706d2889c335087581af4..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_ERROR_REPORTER_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_ERROR_REPORTER_H_
-
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/experimental/micro/compatibility.h"
-
-namespace tflite {
-
-class MicroErrorReporter : public ErrorReporter {
- public:
-  ~MicroErrorReporter() {}
-  int Report(const char* format, va_list args) override;
-
- private:
-  TF_LITE_REMOVE_VIRTUAL_DELETE
-};
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_ERROR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.cc b/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.cc
deleted file mode 100644
index 8c090a20a5fb9e6cb330a40c86236c549c28539e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.cc
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
-
-#include "tensorflow/contrib/lite/core/api/flatbuffer_conversions.h"
-
-namespace tflite {
-namespace {
-
-TfLiteStatus TfLiteTypeSizeOf(TfLiteType type, size_t* size,
-                              ErrorReporter* reporter) {
-  switch (type) {
-    case kTfLiteFloat32:
-      *size = sizeof(float);
-      break;
-    case kTfLiteInt16:
-      *size = sizeof(int16_t);
-      break;
-    case kTfLiteInt32:
-      *size = sizeof(int32_t);
-      break;
-    case kTfLiteUInt8:
-      *size = sizeof(uint8_t);
-      break;
-    case kTfLiteInt64:
-      *size = sizeof(int64_t);
-      break;
-    case kTfLiteBool:
-      *size = sizeof(bool);
-      break;
-    case kTfLiteComplex64:
-      *size = sizeof(float) * 2;
-      break;
-    default:
-      reporter->Report(
-          "Only float32, int16, int32, int64, uint8, bool, complex64 "
-          "supported currently.");
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus BytesRequired(const tflite::Tensor& flatbuffer_tensor,
-                           size_t dims_size, size_t* bytes,
-                           ErrorReporter* error_reporter) {
-  TfLiteType tf_lite_type;
-  TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(),
-                                          &tf_lite_type, error_reporter));
-  size_t type_size;
-  TF_LITE_ENSURE_STATUS(
-      TfLiteTypeSizeOf(tf_lite_type, &type_size, error_reporter));
-  *bytes = dims_size * type_size;
-  return kTfLiteOk;
-}
-
-}  // namespace
-
-TfLiteStatus SimpleTensorAllocator::AllocateTensor(
-    const tflite::Tensor& flatbuffer_tensor, int create_before,
-    int destroy_after,
-    const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
-    ErrorReporter* error_reporter, TfLiteTensor* result) {
-  TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(),
-                                          &result->type, error_reporter));
-  result->is_variable = flatbuffer_tensor.is_variable();
-
-  result->data.raw = nullptr;
-  result->bytes = 0;
-  if (auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]) {
-    if (auto* array = buffer->data()) {
-      if (size_t array_size = array->size()) {
-        result->data.raw =
-            const_cast<char*>(reinterpret_cast<const char*>(array->data()));
-        TF_LITE_ENSURE_STATUS(BytesRequired(flatbuffer_tensor, array_size,
-                                            &result->bytes, error_reporter));
-      }
-    }
-  }
-  if (result->data.raw) {
-    result->allocation_type = kTfLiteMmapRo;
-  } else {
-    int data_size = 1;
-    for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) {
-      data_size *= flatbuffer_tensor.shape()->Get(n);
-    }
-    TF_LITE_ENSURE_STATUS(BytesRequired(flatbuffer_tensor, data_size,
-                                        &result->bytes, error_reporter));
-    result->data.raw = reinterpret_cast<char*>(AllocateMemory(result->bytes));
-    if (result->data.raw == nullptr) {
-      const char* tensor_name = flatbuffer_tensor.name()->c_str();
-      if (tensor_name == nullptr) {
-        tensor_name = "<None>";
-      }
-      error_reporter->Report(
-          "Couldn't allocate memory for tensor '%s', wanted %d bytes but only "
-          "%d were available",
-          tensor_name, result->bytes, (data_size_max_ - data_size_));
-      return kTfLiteError;
-    }
-    result->allocation_type = kTfLiteArenaRw;
-  }
-  result->dims = reinterpret_cast<TfLiteIntArray*>(
-      AllocateMemory(sizeof(int) * (flatbuffer_tensor.shape()->Length() + 1)));
-  result->dims->size = flatbuffer_tensor.shape()->Length();
-  for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) {
-    result->dims->data[n] = flatbuffer_tensor.shape()->Get(n);
-  }
-  if (flatbuffer_tensor.quantization()) {
-    result->params.scale = flatbuffer_tensor.quantization()->scale()->Get(0);
-    result->params.zero_point =
-        flatbuffer_tensor.quantization()->zero_point()->Get(0);
-  }
-  result->allocation = nullptr;
-  if (flatbuffer_tensor.name()) {
-    result->name = flatbuffer_tensor.name()->c_str();
-  } else {
-    result->name = "<No name>";
-  }
-  result->delegate = nullptr;
-  result->buffer_handle = 0;
-  result->data_is_stale = false;
-  return kTfLiteOk;
-}
-
-uint8_t* SimpleTensorAllocator::AllocateMemory(size_t size) {
-  if ((data_size_ + size) > data_size_max_) {
-    // TODO(petewarden): Add error reporting beyond returning null!
-    return nullptr;
-  }
-  uint8_t* result = data_;
-  data_ += size;
-  data_size_ += size;
-  return result;
-}
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/BUILD b/tensorflow/contrib/lite/experimental/micro/testing/BUILD
deleted file mode 100644
index 0d23be5712ad1bc6d81cc467cce8c9927caece3d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/testing/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["test_linux_binary.sh"])
-
-cc_library(
-    name = "micro_test",
-    hdrs = [
-        "micro_test.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite/experimental/micro:micro_framework",
-    ],
-)
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/micro_test.h b/tensorflow/contrib/lite/experimental/micro/testing/micro_test.h
deleted file mode 100644
index 104509c9dc6123e84c45f26d03465f608f100310..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/testing/micro_test.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// An ultra-lightweight testing framework designed for use with microcontroller
-// applications. Its only dependency is on TensorFlow Lite's ErrorReporter
-// interface, where log messages are output. This is designed to be usable even
-// when no standard C or C++ libraries are available, and without any dynamic
-// memory allocation or reliance on global constructors.
-//
-// To build a test, you use syntax similar to gunit, but with some extra
-// decoration to create a hidden 'main' function containing each of the tests to
-// be run. Your code should look something like:
-// ----------------------------------------------------------------------------
-// #include "path/to/this/header"
-//
-// TF_LITE_MICRO_TESTS_BEGIN
-//
-// TF_LITE_MICRO_TEST(SomeTest) {
-//   TF_LITE_LOG_EXPECT_EQ(true, true);
-// }
-//
-// TF_LITE_MICRO_TESTS_END
-// ----------------------------------------------------------------------------
-// If you compile this for your platform, you'll get a normal binary that you
-// should be able to run. Executing it will output logging information like this
-// to stderr (or whatever equivalent is available and written to by
-// ErrorReporter):
-// ----------------------------------------------------------------------------
-// Testing SomeTest
-// 1/1 tests passed
-// ~~~ALL TESTS PASSED~~~
-// ----------------------------------------------------------------------------
-// This is designed to be human-readable, so you can just run tests manually,
-// but the string "~~~ALL TESTS PASSED~~~" should only appear if all of the
-// tests do pass. This makes it possible to integrate with automated test
-// systems by scanning the output logs and looking for that magic value.
-//
-// This framework is intended to be a rudimentary alternative to no testing at
-// all on systems that struggle to run more conventional approaches, so use with
-// caution!
-
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
-
-#include "tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h"
-
-namespace micro_test {
-extern int tests_passed;
-extern int tests_failed;
-extern bool is_test_complete;
-extern bool did_test_fail;
-extern tflite::ErrorReporter* reporter;
-}  // namespace micro_test
-
-#define TF_LITE_MICRO_TESTS_BEGIN              \
-  namespace micro_test {                       \
-  int tests_passed;                            \
-  int tests_failed;                            \
-  bool is_test_complete;                       \
-  bool did_test_fail;                          \
-  tflite::ErrorReporter* reporter;             \
-  }                                            \
-                                               \
-  int main(int argc, char** argv) {            \
-    micro_test::tests_passed = 0;              \
-    micro_test::tests_failed = 0;              \
-    tflite::MicroErrorReporter error_reporter; \
-    micro_test::reporter = &error_reporter;
-
-#define TF_LITE_MICRO_TESTS_END                                \
-  micro_test::reporter->Report(                                \
-      "%d/%d tests passed", micro_test::tests_passed,          \
-      (micro_test::tests_failed + micro_test::tests_passed));  \
-  if (micro_test::tests_failed == 0) {                         \
-    micro_test::reporter->Report("~~~ALL TESTS PASSED~~~\n");  \
-  } else {                                                     \
-    micro_test::reporter->Report("~~~SOME TESTS FAILED~~~\n"); \
-  }                                                            \
-  }
-
-// TODO(petewarden): I'm going to hell for what I'm doing to this poor for loop.
-#define TF_LITE_MICRO_TEST(name)                                           \
-  micro_test::reporter->Report("Testing %s", #name);                       \
-  for (micro_test::is_test_complete = false,                               \
-      micro_test::did_test_fail = false;                                   \
-       !micro_test::is_test_complete; micro_test::is_test_complete = true, \
-      micro_test::tests_passed += (micro_test::did_test_fail) ? 0 : 1,     \
-      micro_test::tests_failed += (micro_test::did_test_fail) ? 1 : 0)
-
-#define TF_LITE_MICRO_EXPECT(x)                                                \
-  do {                                                                         \
-    if (!(x)) {                                                                \
-      micro_test::reporter->Report(#x " failed at %s:%d", __FILE__, __LINE__); \
-      micro_test::did_test_fail = true;                                        \
-    }                                                                          \
-  } while (false)
-
-#define TF_LITE_MICRO_EXPECT_EQ(x, y)                                         \
-  do {                                                                        \
-    if ((x) != (y)) {                                                         \
-      micro_test::reporter->Report(#x " == " #y " failed at %s:%d", __FILE__, \
-                                   __LINE__);                                 \
-      micro_test::did_test_fail = true;                                       \
-    }                                                                         \
-  } while (false)
-
-#define TF_LITE_MICRO_EXPECT_NE(x, y)                                         \
-  do {                                                                        \
-    if ((x) == (y)) {                                                         \
-      micro_test::reporter->Report(#x " != " #y " failed at %s:%d", __FILE__, \
-                                   __LINE__);                                 \
-      micro_test::did_test_fail = true;                                       \
-    }                                                                         \
-  } while (false)
-
-#define TF_LITE_MICRO_EXPECT_NEAR(x, y, epsilon)                      \
-  do {                                                                \
-    auto delta = ((x) > (y)) ? ((x) - (y)) : ((y) - (x));             \
-    if (delta > epsilon) {                                            \
-      micro_test::reporter->Report(#x " near " #y " failed at %s:%d", \
-                                   __FILE__, __LINE__);               \
-      micro_test::did_test_fail = true;                               \
-    }                                                                 \
-  } while (false)
-
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh b/tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh
deleted file mode 100755
index 07742a8262f8cdf5981be2a057631d975cd04d33..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash -e
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Tests a 'bluepill' STM32F103 ELF by parsing the log output of Renode emulation.
-#
-# First argument is the ELF location.
-# Second argument is a regular expression that's required to be in the output logs
-# for the test to pass.
-
-declare -r ROOT_DIR=`pwd`
-declare -r TEST_TMPDIR=/tmp/test_bluepill_binary/
-declare -r MICRO_LOG_PATH=${TEST_TMPDIR}
-declare -r MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt
-mkdir -p ${MICRO_LOG_PATH}
-
-docker build -t renode_bluepill \
-  -f ${ROOT_DIR}/tensorflow/contrib/lite/experimental/micro/testing/Dockerfile.bluepill \
-  ${ROOT_DIR}/tensorflow/contrib/lite/experimental/micro/testing/
-
-docker run \
-  --log-driver=none -a stdout -a stderr \
-  -v ${ROOT_DIR}:/workspace \
-  -v /tmp:/tmp \
-  -it renode_bluepill \
-  /bin/bash -c "renode -P 5000 --disable-xwt -e '
-\$bin?=@/workspace/$1
-s @/workspace/tensorflow/contrib/lite/experimental/micro/testing/bluepill.resc
-' 2>&1 >${MICRO_LOG_FILENAME}"
-
-echo "LOGS:"
-cat ${MICRO_LOG_FILENAME}
-
-if grep -q "$2" ${MICRO_LOG_FILENAME}
-then
-  echo "$1: PASS"
-  exit 0
-else
-  echo "$1: FAIL - '$2' not found in logs."
-  exit 1
-fi
-
diff --git a/tensorflow/contrib/lite/experimental/micro/tools/make/Makefile b/tensorflow/contrib/lite/experimental/micro/tools/make/Makefile
deleted file mode 100644
index 880bb4763cbbaf58db286ff142a822fbab60dfd8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/tools/make/Makefile
+++ /dev/null
@@ -1,166 +0,0 @@
-MAKEFILE_DIR := tensorflow/contrib/lite/experimental/micro/tools/make
-
-# Try to figure out the host system
-HOST_OS :=
-ifeq ($(OS),Windows_NT)
-	HOST_OS = windows
-else
-	UNAME_S := $(shell uname -s)
-	ifeq ($(UNAME_S),Linux)
-		HOST_OS := linux
-	endif
-	ifeq ($(UNAME_S),Darwin)
-		HOST_OS := osx
-	endif
-endif
-
-HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32; else echo $(shell uname -m); fi)
-
-# Override these on the make command line to target a specific architecture. For example:
-# make -f tensorflow/contrib/lite/Makefile TARGET=rpi TARGET_ARCH=armv7l
-TARGET := $(HOST_OS)
-TARGET_ARCH := $(HOST_ARCH)
-
-INCLUDES := \
--I. \
--I$(MAKEFILE_DIR)/../../../../../ \
--I$(MAKEFILE_DIR)/../../../../../../ \
--I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/gemmlowp \
--I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
--I$(OBJDIR)
-# This is at the end so any globally-installed frameworks like protobuf don't
-# override local versions in the source tree.
-INCLUDES += -I/usr/local/include
-
-TEST_SCRIPT := tensorflow/contrib/lite/experimental/micro/testing/test_linux_binary.sh
-
-MICROLITE_LIBS := -lm
-
-# There are no rules for compiling objects for the host system (since we don't
-# generate things like the protobuf compiler that require that), so all of
-# these settings are for the target compiler.
-CXXFLAGS := -O3 -DNDEBUG
-CXXFLAGS += --std=c++11 -g -DTF_LITE_STATIC_MEMORY
-CCFLAGS := -DNDEBUG -g -DTF_LITE_STATIC_MEMORY
-LDOPTS := -L/usr/local/lib
-ARFLAGS := -r
-TARGET_TOOLCHAIN_PREFIX :=
-CC_PREFIX :=
-
-# This library is the main target for this makefile. It will contain a minimal
-# runtime that can be linked in to other programs.
-MICROLITE_LIB_NAME := libtensorflow-microlite.a
-
-# Test binary for the microcontroller speech model.
-MICRO_SPEECH_TEST_SRCS := \
-tensorflow/contrib/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
-tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
-
-MICROLITE_TEST_SRCS := \
-$(wildcard tensorflow/contrib/lite/experimental/micro/*test.cc) \
-$(wildcard tensorflow/contrib/lite/experimental/micro/kernels/*test.cc)
-
-MICROLITE_CC_BASE_SRCS := \
-$(wildcard tensorflow/contrib/lite/experimental/micro/*.cc) \
-$(wildcard tensorflow/contrib/lite/experimental/micro/kernels/*.cc) \
-tensorflow/contrib/lite/c/c_api_internal.c \
-tensorflow/contrib/lite/core/api/error_reporter.cc \
-tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc \
-tensorflow/contrib/lite/core/api/op_resolver.cc \
-tensorflow/contrib/lite/kernels/kernel_util.cc \
-tensorflow/contrib/lite/kernels/internal/quantization_util.cc
-MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS))
-
-# These target-specific makefiles should modify or replace options like
-# CXXFLAGS or LIBS to work for a specific targetted architecture. All logic
-# based on platforms or architectures should happen within these files, to
-# keep this main makefile focused on the sources and dependencies.
-include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
-
-ALL_SRCS := \
-	$(MICRO_SPEECH_TEST_SRCS) \
-	$(MICROLITE_CC_SRCS) \
-	$(MICROLITE_TEST_SRCS)
-
-# Where compiled objects are stored.
-GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
-OBJDIR := $(GENDIR)obj/
-BINDIR := $(GENDIR)bin/
-LIBDIR := $(GENDIR)lib/
-
-MICROLITE_LIB_PATH := $(LIBDIR)$(MICROLITE_LIB_NAME)
-
-MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
-
-CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
-CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
-AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
-
-MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS))))
-
-MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS))))
-
-MICROLITE_TEST_TARGETS := $(addprefix $(BINDIR), \
-$(patsubst %_test.cc,%.test_target,$(MICROLITE_TEST_SRCS)))
-
-# For normal manually-created TensorFlow C++ source files.
-$(OBJDIR)%.o: %.cc
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
-
-# For normal manually-created TensorFlow C source files.
-$(OBJDIR)%.o: %.c
-	@mkdir -p $(dir $@)
-	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
-
-# The target that's compiled if there's no command-line arguments.
-all: $(MICROLITE_LIB_PATH) $(MICRO_SPEECH_TEST_BINARY)
-
-microlite: $(MICROLITE_LIB_PATH)
-
-# Hack for generating schema file bypassing flatbuffer parsing
-tensorflow/contrib/lite/schema/schema_generated.h:
-	@cp -u tensorflow/contrib/lite/schema/schema_generated.h.OPENSOURCE tensorflow/contrib/lite/schema/schema_generated.h
-
-# Gathers together all the objects we've compiled into a single '.a' archive.
-$(MICROLITE_LIB_PATH): tensorflow/contrib/lite/schema/schema_generated.h $(MICROLITE_LIB_OBJS)
-	@mkdir -p $(dir $@)
-	$(AR) $(ARFLAGS) $(MICROLITE_LIB_PATH) $(MICROLITE_LIB_OBJS)
-
-$(MICRO_SPEECH_TEST_BINARY): $(MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(MICRO_SPEECH_TEST_BINARY) $(MICRO_SPEECH_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-micro_speech_test: $(MICRO_SPEECH_TEST_BINARY)
-micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
-
-test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
-	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-$(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $@ $< \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-$(BINDIR)%.test_target: $(BINDIR)%_test
-	$(TEST_SCRIPT) $< '~~~ALL TESTS PASSED~~~'
-
-$(info $(MICROLITE_TEST_TARGETS))
-
-test: test_micro_speech $(MICROLITE_TEST_TARGETS)
-
-# Gets rid of all generated files.
-clean:
-	rm -rf $(MAKEFILE_DIR)/gen
-
-$(DEPDIR)/%.d: ;
-.PRECIOUS: $(DEPDIR)/%.d
-.PRECIOUS: $(BINDIR)%_test
-
--include $(patsubst %,$(DEPDIR)/%.d,$(basename $(ALL_SRCS)))
diff --git a/tensorflow/contrib/lite/experimental/micro/tools/make/download_dependencies.sh b/tensorflow/contrib/lite/experimental/micro/tools/make/download_dependencies.sh
deleted file mode 100755
index 4c2ff8545dbdcc426bf62aaeb07ca22d8b17cc69..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/micro/tools/make/download_dependencies.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/bash
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../../../../../.."
-
-DOWNLOADS_DIR=tensorflow/contrib/lite/experimental/micro/tools/make/downloads
-BZL_FILE_PATH=tensorflow/workspace.bzl
-
-# Ensure it is being run from repo root
-if [ ! -f $BZL_FILE_PATH ]; then
-  echo "Could not find ${BZL_FILE_PATH}":
-  echo "Likely you are not running this from the root directory of the repository.";
-  exit 1;
-fi
-
-GEMMLOWP_URL="https://github.com/google/gemmlowp/archive/719139ce755a0f31cbf1c37f7f98adcc7fc9f425.zip"
-FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz"
-CMSIS_URL="https://github.com/ARM-software/CMSIS_5/archive/5.4.0.zip"
-STM32_BARE_LIB_URL="https://github.com/google/stm32_bare_lib/archive/50e0da307a2821bb54af1f57b969e6b76cb89d32.zip"
-
-download_and_extract() {
-  local usage="Usage: download_and_extract URL DIR"
-  local url="${1:?${usage}}"
-  local dir="${2:?${usage}}"
-  echo "downloading ${url}" >&2
-  mkdir -p "${dir}"
-  if [[ "${url}" == *gz ]]; then
-    curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
-  elif [[ "${url}" == *zip ]]; then
-    tempdir=$(mktemp -d)
-    tempdir2=$(mktemp -d)
-
-    curl -L ${url} > ${tempdir}/zipped.zip
-    unzip ${tempdir}/zipped.zip -d ${tempdir2}
-
-    # If the zip file contains nested directories, extract the files from the
-    # inner directory.
-    if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
-      # unzip has no strip components, so unzip to a temp dir, and move the
-      # files we want from the tempdir to destination.
-      cp -R ${tempdir2}/*/* ${dir}/
-    else
-      cp -R ${tempdir2}/* ${dir}/
-    fi
-    rm -rf ${tempdir2} ${tempdir}
-  fi
-
-  # Delete any potential BUILD files, which would interfere with Bazel builds.
-  find "${dir}" -type f -name '*BUILD' -delete
-}
-
-download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
-download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
-download_and_extract "${CMSIS_URL}" "${DOWNLOADS_DIR}/cmsis"
-download_and_extract "${STM32_BARE_LIB_URL}" "${DOWNLOADS_DIR}/stm32_bare_lib"
-
-echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/lite/experimental/writer/BUILD b/tensorflow/contrib/lite/experimental/writer/BUILD
deleted file mode 100644
index 82d39c00abd27d9931131317e9750bbf7face981..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/experimental/writer/BUILD
+++ /dev/null
@@ -1,66 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-cc_binary(
-    name = "option_writer_generator",
-    srcs = ["option_writer_generator.cc"],
-    deps = [
-        "//tensorflow/contrib/lite/schema:schema_fbs_with_reflection",
-        "@flatbuffers",
-    ],
-)
-
-cc_library(
-    name = "writer_lib",
-    srcs = [
-        "enum_mapping.h",
-        "writer_lib.cc",
-    ],
-    hdrs = [
-        "writer_lib.h",
-    ],
-    data = [
-        ":option_writer_gen",
-    ],
-    textual_hdrs = ["option_writer_generated.h"],
-    deps = [
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/schema:schema_fbs_with_reflection",
-    ],
-)
-
-cc_binary(
-    name = "writer",
-    srcs = ["writer.cc"],
-    deps = [
-        ":writer_lib",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ],
-)
-
-cc_test(
-    name = "writer_lib_test",
-    size = "small",
-    srcs = ["writer_lib_test.cc"],
-    deps = [
-        ":writer_lib",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-genrule(
-    name = "option_writer_gen",
-    outs = ["option_writer_generated.h"],
-    cmd = "$(location :option_writer_generator) $(@)",
-    tools = [":option_writer_generator"],
-)
diff --git a/tensorflow/contrib/lite/g3doc/_book.yaml b/tensorflow/contrib/lite/g3doc/_book.yaml
deleted file mode 100644
index de6914e5366acef53a853a73f791dcfa801d444c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/g3doc/_book.yaml
+++ /dev/null
@@ -1,63 +0,0 @@
-upper_tabs:
-# Tabs left of dropdown menu
-- include: /_upper_tabs_left.yaml
-- include: /versions/_upper_tabs_versions.yaml
-# Dropdown menu
-- name: Ecosystem
-  path: /ecosystem
-  is_default: true
-  menu:
-  - include: /ecosystem/_menu_toc.yaml
-  lower_tabs:
-    # Subsite tabs
-    other:
-    - name: Guide
-      contents:
-      - title: Overview
-        path: /lite/overview
-      - title: Developer guide
-        path: /lite/devguide
-      - title: Android demo app
-        path: /lite/demo_android
-      - title: iOS demo app
-        path: /lite/demo_ios
-      - title: Performance
-        path: /lite/performance
-      - break: true
-      - title: TensorFlow Lite APIs
-        path: /lite/apis
-      - title: Custom operators
-        path: /lite/custom_operators
-      - title: TensorFlow Lite ops versioning
-        path: /lite/ops_versioning
-      - title: TensorFlow Lite compatibility guide
-        path: /lite/tf_ops_compatibility
-      - title: List of hosted models
-        path: /lite/models
-      - title: TensorFlow Lite for iOS
-        path: /lite/ios
-      - title: TensorFlow Lite for Raspberry Pi
-        path: /lite/rpi
-
-      - title: TF Mobile
-        style: accordion
-        status: deprecated
-        section:
-        - title: Overview
-          path: /lite/tfmobile/
-        - title: Building TensorFlow on Android
-          path: /lite/tfmobile/android_build
-        - title: Building TensorFlow on IOS
-          path: /lite/tfmobile/ios_build
-        - title: Integrating TensorFlow libraries
-          path: /lite/tfmobile/linking_libs
-        - title: Preparing models for mobile deployment
-          path: /lite/tfmobile/prepare_models
-        - title: Optimizing for mobile
-          path: /lite/tfmobile/optimizing
-
-    - name: API
-      skip_translation: true
-      contents:
-      - title: API
-        path: /api_docs/python/tf/contrib/lite
diff --git a/tensorflow/contrib/lite/g3doc/_index.yaml b/tensorflow/contrib/lite/g3doc/_index.yaml
deleted file mode 100644
index bc66cc5dc1606537b7e186f3c825ab8335aa9e91..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/g3doc/_index.yaml
+++ /dev/null
@@ -1,218 +0,0 @@
-project_path: /lite/_project.yaml
-book_path: /lite/_book.yaml
-description: <!--no description-->
-landing_page:
-  custom_css_path: /site-assets/css/style.css
-  rows:
-  - heading: TensorFlow Lite is for mobile and embedded devices.
-    description: >
-      <p style="max-width: 75%;">
-        TensorFlow Lite is the official solution for running machine learning
-        models on mobile and embedded devices. It enables on&#8209;device machine
-        learning inference with low latency and a small binary size on Android,
-        iOS, and other operating systems.
-      </p>
-      <style>
-      .tfo-landing-row-heading {
-        padding-top: 0 !important;
-      }
-      .tfo-landing-row-heading h2 {
-        margin-top: 0 !important;
-      }
-      .tfo-landing-row-heading-list ol, .tfo-landing-row-heading-list ul {
-        margin-top: 0;
-      }
-      </style>
-
-  - classname: tfo-landing-row-heading tfo-landing-row-heading-list
-    heading: Many benefits
-    description: >
-      On-device ML inference is difficult because of the many constraints—TensorFlow Lite can solve these:
-    items:
-    - list:
-      - heading: Performance
-        description: >
-          TF Lite is fast with no noticeable accuracy loss—see the <a href="./performance">metrics</a>.
-        icon:
-          icon_name: lens
-          foreground: theme
-      - heading: Portability
-        description: >
-          <a href="https://developer.android.com/ndk/guides/neuralnetworks/" class="external">Android</a>,
-          iOS, and more specialized IoT devices.
-        icon:
-          icon_name: lens
-          foreground: theme
-    - list:
-      - heading: Low latency
-        description: >
-          Optimized float- and fixed-point CPU kernels, op&#8209;fusing, and more.
-        icon:
-          icon_name: lens
-          foreground: theme
-      - heading: Acceleration
-        description: >
-          Integration with GPU and internal/external accelerators.
-        icon:
-          icon_name: lens
-          foreground: theme
-    - list:
-      - heading: Small model size
-        description: >
-          Controlled dependencies, <a href="https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3" class="external">quantization</a>,
-          and op&nbsp;registration.
-        icon:
-          icon_name: lens
-          foreground: theme
-      - heading: Tooling
-        description: >
-          Conversion, compression, benchmarking, power-consumption, and more.
-        icon:
-          icon_name: lens
-          foreground: theme
-
-  - classname: devsite-landing-row-logos tfo-landing-row-heading
-    heading: Companies using TensorFlow Lite
-    items:
-    - custom_image:
-        path: ./images/landing-page/photos_logo.png
-      path: https://www.photos.google.com
-    - custom_image:
-        path: ./images/landing-page/gboard_logo.png
-      path: https://play.google.com/store/apps/details?id=com.google.android.inputmethod.latin&hl=en_US
-    - custom_image:
-        path: ./images/landing-page/gmail_logo.png
-      path: https://www.google.com/gmail/
-    - custom_image:
-        path: ./images/landing-page/assistant_logo.png
-      path: https://assistant.google.com/
-
-  - classname: devsite-landing-row-logos
-    items:
-    - custom_image:
-        path: ./images/landing-page/vsco_logo.png
-      path: https://vsco.co
-    - custom_image:
-        path: ./images/landing-page/shazam_logo.png
-      path: https://www.shazam.com/
-    - custom_image:
-        path: ./images/landing-page/nest_logo.png
-      path: https://nest.com/    
-    - custom_image:
-        path: ./images/landing-page/loseit_logo.png
-      path: https://www.loseit.com/
-
-  - classname: devsite-landing-row-no-image-background devsite-landing-row-67
-    background: grey
-    items:
-    - description: >
-        <em>“TensorFlow Lite helped us introduce machine learning and AI into our
-        app in an easy and streamlined way. We could reduce the size of our
-        models while keeping the accuracy high. This helped us create an amazing
-        fishing experience for our users by allowing them to identify any fish
-        species with just a photo.”</em>
-      image_path: ./images/landing-page/fishbrain_logo_big.png
-
-  - heading: How it works
-    items:
-    - heading: Build
-      icon:
-        icon_name: build
-      description: >
-        Build a new model or retrain an existing one, such as using transfer learning.
-      buttons:
-      - label: Read the developer guide
-        path: /lite/devguide
-        classname: button button-primary tfo-button-primary
-    - heading: Convert
-      icon:
-        icon_name: autorenew
-      description: >
-        Convert a TensorFlow model into a compressed flat buffer with the
-        TensorFlow Lite Optimizing Converter (TOCO).
-      buttons:
-      - label: Read the TOCO guide
-        path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/toco/g3doc/python_api.md
-        classname: button button-primary tfo-button-primary
-    - heading: Deploy
-      icon:
-        icon_name: bolt
-      description: >
-        Take the compressed <code>.tflite</code> file and load it into a mobile
-        or embedded device.<br/>
-        See the <a href="#build-your-first-tensorflow-lite-app">tutorials below</a> to build an app.
-
-  - heading: Build your first TensorFlow Lite app
-    background: grey
-    items:
-    - classname: tfo-landing-row-item-inset-white
-      heading: Get started
-      description: >
-        <ul>
-          <li>Beginner: <a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/" class="external">TensorFlow for Poets</a></li>
-          <li>Beginner: <a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-tflite/" class="external">TensorFlow for Poets 2: Android</a></li>
-          <li>Beginner: <a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-ios/" class="external">TensorFlow for Poets 2: iOS </a></li>
-          <li>Intermediate: <a href="https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193" class="external">Object detection tutorial</a>
-        </ul>
-    - classname: tfo-landing-row-item-inset-white
-      heading: Share your TensorFlow Lite story
-      description: >
-        We love to hear what you're working on—it may even get highlighted on
-        our social media! <a href="https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss" class="external">Tell us</a>.
-
-  - classname: devsite-landing-row-no-image-background devsite-landing-row-67
-    items:
-    - description: >
-        <p>
-          <em>“The release of TensorFlow Lite has allowed us to deploy an engaging
-          real-time experience to our users that eliminates the requirement
-          for a data connection. TensorFlow Lite’s ability to compress and
-          optimize the TensorFlow graph for mobile deployment has been
-          transformative in expanding the capabilities of Snap It.</em>
-        </p>
-        <p>
-          <em>Through TensorFlow Lite, our users can now enjoy a state of the
-          art, computer-vision-based food logging experience without worrying
-          about signal strength. We look forward to future collaborations
-          with the TensorFlow Lite team.”</em>
-        </p>
-      image_path: ./images/landing-page/loseit_logo_big.png
-
-  - classname: devsite-landing-row-cards
-    background: grey
-    heading: Updates
-    items:
-    - heading: Introducing the Model Optimization Toolkit
-      image_path: /ecosystem/images/tf-logo-card-16x9.png
-      path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3
-      buttons:
-      - label: Read on TensorFlow blog
-        path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3
-    - heading: East Africa Cassava App
-      image_path: ./images/landing-page/detect_crop_disease_in_africa.png
-      path: https://heartbeat.fritz.ai/community-spotlight-nuru-a-mobile-app-by-plantvillage-to-detect-crop-disease-in-africa-28d142bf63d5
-      buttons:
-      - label: Read more
-        path: https://heartbeat.fritz.ai/community-spotlight-nuru-a-mobile-app-by-plantvillage-to-detect-crop-disease-in-africa-28d142bf63d5
-    - heading: Using TensorFlow Lite on Android
-      image_path: /ecosystem/images/tf-logo-card-16x9.png
-      path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
-      buttons:
-      - label: Read on TensorFlow blog
-        path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
-
-  - classname: devsite-landing-row-cards
-    background: grey
-    items:
-    - heading: TensorFlow Lite at the Dev Summit
-      youtube_id: FAMfy7izB6A
-      buttons:
-      - label: Watch the video
-        path: https://www.youtube.com/watch?v=FAMfy7izB6A
-    - heading: TensorFlow Lite on GitHub
-      image_path: /ecosystem/images/github-card-16x9.png
-      path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite
-      buttons:
-      - label: View on GitHub
-        path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite
-    - classname: devsite-landing-row-item-hidden
diff --git a/tensorflow/contrib/lite/g3doc/devguide.md b/tensorflow/contrib/lite/g3doc/devguide.md
deleted file mode 100644
index 0eed5160009c07727f0c2985ebe963efc7bb9d8e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/g3doc/devguide.md
+++ /dev/null
@@ -1,232 +0,0 @@
-# TF Lite Developer Guide
-
-Using a TensorFlow Lite model in your mobile app requires multiple
-considerations: you must choose a pre-trained or custom model, convert the model
-to a TensorFLow Lite format, and finally, integrate the model in your app.
-
-## 1. Choose a model
-
-Depending on the use case, you can choose one of the popular open-sourced models,
-such as *InceptionV3* or *MobileNets*, and re-train these models with a custom
-data set or even build your own custom model.
-
-### Use a pre-trained model
-
-[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
-is a family of mobile-first computer vision models for TensorFlow designed to
-effectively maximize accuracy, while taking into consideration the restricted
-resources for on-device or embedded applications. MobileNets are small,
-low-latency, low-power models parameterized to meet the resource constraints for
-a variety of uses. They can be used for classification, detection, embeddings, and
-segmentation—similar to other popular large scale models, such as
-[Inception](https://arxiv.org/pdf/1602.07261.pdf). Google provides 16 pre-trained
-[ImageNet](http://www.image-net.org/challenges/LSVRC/) classification checkpoints
-for MobileNets that can be used in mobile projects of all sizes.
-
-[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model
-that achieves fairly high accuracy recognizing general objects with 1000 classes,
-for example, "Zebra", "Dalmatian", and "Dishwasher". The model extracts general
-features from input images using a convolutional neural network and classifies
-them based on those features with fully-connected and softmax layers.
-
-[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
-is an on-device model that provides one-touch replies for incoming text messages
-by suggesting contextually relevant messages. The model is built specifically for
-memory constrained devices, such as watches and phones, and has been successfully
-used in Smart Replies on Android Wear. Currently, this model is Android-specific.
-
-These pre-trained models are [available for download](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/models.md)
-
-### Re-train Inception-V3 or MobileNet for a custom data set
-
-These pre-trained models were trained on the *ImageNet* data set which contains
-1000 predefined classes. If these classes are not sufficient for your use case,
-the model will need to be re-trained. This technique is called
-*transfer learning* and starts with a model that has been already trained on a
-problem, then retrains the model on a similar problem. Deep learning from
-scratch can take days, but transfer learning is fairly quick. In order to do
-this, you need to generate a custom data set labeled with the relevant classes.
-
-The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
-codelab walks through the re-training process step-by-step. The code supports
-both floating point and quantized inference.
-
-### Train a custom model
-
-A developer may choose to train a custom model using Tensorflow (see the
-[TensorFlow tutorials](../tutorials/) for examples of building and training
-models). If you have already written a model, the first step is to export this
-to a `tf.GraphDef` file. This is required because some formats do not store the
-model structure outside the code, and we must communicate with other parts of the
-framework. See
-[Exporting the Inference Graph](https://github.com/tensorflow/models/blob/master/research/slim/README.md)
-to create .pb file for the custom model.
-
-TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to the
-[TensorFlow Lite & TensorFlow Compatibility Guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
-for supported operators and their usage. This set of operators will continue to
-grow in future Tensorflow Lite releases.
-
-
-## 2. Convert the model format
-
-The model generated (or downloaded) in the previous step is a *standard*
-Tensorflow model and you should now have a .pb or .pbtxt `tf.GraphDef` file.
-Models generated with transfer learning (re-training) or custom models must be
-converted—but, we must first freeze the graph to convert the model to the
-Tensorflow Lite format. This process uses several model formats:
-
-* `tf.GraphDef` (.pb) —A protobuf that represents the TensorFlow training or
-  computation graph. It contains operators, tensors, and variables definitions.
-* *CheckPoint* (.ckpt) —Serialized variables from a TensorFlow graph. Since this
-  does not contain a graph structure, it cannot be interpreted by itself.
-* `FrozenGraphDef` —A subclass of `GraphDef` that does not contain
-  variables. A `GraphDef` can be converted to a `FrozenGraphDef` by taking a
-  CheckPoint and a `GraphDef`, and converting each variable into a constant
-  using the value retrieved from the CheckPoint.
-* `SavedModel` —A `GraphDef` and CheckPoint with a signature that labels
-  input and output arguments to a model. A `GraphDef` and CheckPoint can be
-  extracted from a `SavedModel`.
-* *TensorFlow Lite model* (.tflite) —A serialized
-  [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
-  Lite operators and tensors for the TensorFlow Lite interpreter, similar to a
-  `FrozenGraphDef`.
-
-### Freeze Graph
-
-To use the `GraphDef` .pb file with TensorFlow Lite, you must have checkpoints
-that contain trained weight parameters. The .pb file only contains the structure
-of the graph. The process of merging the checkpoint values with the graph
-structure is called *freezing the graph*.
-
-You should have a checkpoints folder or download them for a pre-trained model
-(for example,
-[MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
-
-To freeze the graph, use the following command (changing the arguments):
-
-```
-freeze_graph --input_graph=/tmp/mobilenet_v1_224.pb \
-  --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
-  --input_binary=true \
-  --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
-  --output_node_names=MobileNetV1/Predictions/Reshape_1
-```
-
-The `input_binary` flag must be enabled so the protobuf is read and written in
-a binary format. Set the `input_graph` and `input_checkpoint` files.
-
-The `output_node_names` may not be obvious outside of the code that built the
-model. The easiest way to find them is to visualize the graph, either with
-[TensorBoard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3)
-or `graphviz`.
-
-The frozen `GraphDef` is now ready for conversion to the `FlatBuffer` format
-(.tflite) for use on Android or iOS devices. For Android, the Tensorflow
-Optimizing Converter tool supports both float and quantized models. To convert
-the frozen `GraphDef` to the .tflite format:
-
-```
-toco --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
-  --input_format=TENSORFLOW_GRAPHDEF \
-  --output_format=TFLITE \
-  --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
-  --inference_type=FLOAT \
-  --input_type=FLOAT \
-  --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
-  --input_shapes=1,224,224,3
-```
-
-The `input_file` argument should reference the frozen `GraphDef` file
-containing the model architecture. The [frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)
-file used here is available for download. `output_file` is where the TensorFlow
-Lite model will get generated. The `input_type` and `inference_type`
-arguments should be set to `FLOAT`, unless converting a
-<a href="https://www.tensorflow.org/performance/quantization">quantized model</a>.
-Setting the `input_array`, `output_array`, and `input_shape` arguments are not as
-straightforward. The easiest way to find these values is to explore the graph
-using Tensorboard. Reuse the arguments for specifying the output nodes for
-inference in the `freeze_graph` step.
-
-It is also possible to use the Tensorflow Optimizing Converter with protobufs
-from either Python or from the command line (see the 
-[toco_from_protos.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py)
-example). This allows you to integrate the conversion step into the model design
-workflow, ensuring the model is easily convertible to a mobile inference graph.
-For example:
-
-```python
-import tensorflow as tf
-
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
-out = tf.identity(val, name="out")
-
-with tf.Session() as sess:
-  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
-  open("converteds_model.tflite", "wb").write(tflite_model)
-```
-
-For usage, see the Tensorflow Optimizing Converter
-[command-line examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
-
-Refer to the
-[Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
-for troubleshooting help, and if that doesn't help, please
-[file an issue](https://github.com/tensorflow/tensorflow/issues).
-
-The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
-to visualize TensorFlow Lite models after conversion. To build the
-[visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/tools/visualize.py)
-tool:
-
-```sh
-bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
-```
-
-This generates an interactive HTML page listing subgraphs, operations, and a
-graph visualization.
-
-
-## 3. Use the TensorFlow Lite model for inference in a mobile app
-
-After completing the prior steps, you should now have a `.tflite` model file.
-
-### Android
-
-Since Android apps are written in Java and the core TensorFlow library is in C++,
-a JNI library is provided as an interface. This is only meant for inference—it
-provides the ability to load a graph, set up inputs, and run the model to
-calculate outputs.
-
-The open source Android demo app uses the JNI interface and is available
-[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
-You can also download a
-[prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-See the <a href="./demo_android.md">Android demo</a> guide for details.
-
-The <a href="./android_build.md">Android mobile</a> guide has instructions for
-installing TensorFlow on Android and setting up `bazel` and Android Studio.
-
-### iOS
-
-To integrate a TensorFlow model in an iOS app, see the
-[TensorFlow Lite for iOS](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md)
-guide and <a href="./demo_ios.md">iOS demo</a> guide.
-
-#### Core ML support
-
-Core ML is a machine learning framework used in Apple products. In addition to
-using Tensorflow Lite models directly in your applications, you can convert
-trained Tensorflow models to the
-[CoreML](https://developer.apple.com/machine-learning/) format for use on Apple
-devices. To use the converter, refer to the
-[Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
-
-### Raspberry Pi
-
-Compile Tensorflow Lite for a Raspberry Pi by following the
-[RPi build instructions](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/rpi.md)
-This compiles a static library file (`.a`) used to build your app. There are
-plans for Python bindings and a demo app.
diff --git a/tensorflow/contrib/lite/g3doc/overview.md b/tensorflow/contrib/lite/g3doc/overview.md
deleted file mode 100644
index 9d035a69211d7ced913e6d16061c6ad8ca912e64..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/g3doc/overview.md
+++ /dev/null
@@ -1,202 +0,0 @@
-
-# Introduction to TensorFlow Lite
-
-TensorFlow Lite is TensorFlow’s lightweight solution for mobile and embedded
-devices. It enables on-device machine learning inference with low latency and a
-small binary size. TensorFlow Lite also supports hardware acceleration with the
-[Android Neural Networks
-API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
-
-TensorFlow Lite uses many techniques for achieving low latency such as
-optimizing the kernels for mobile apps, pre-fused activations, and quantized
-kernels that allow smaller and faster (fixed-point math) models.
-
-Most of our TensorFlow Lite documentation is [on
-GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite)
-for the time being.
-
-## What does TensorFlow Lite contain?
-
-TensorFlow Lite supports a set of core operators, both quantized and
-float, which have been tuned for mobile platforms. They incorporate pre-fused
-activations and biases to further enhance performance and quantized
-accuracy. Additionally, TensorFlow Lite also supports using custom operations in
-models.
-
-TensorFlow Lite defines a new model file format, based on
-[FlatBuffers](https://google.github.io/flatbuffers/). FlatBuffers is an
-efficient open-source cross-platform serialization library. It is similar to
-[protocol buffers](https://developers.google.com/protocol-buffers/?hl=en), but
-the primary difference is that FlatBuffers does not need a parsing/unpacking
-step to a secondary representation before you can access data, often coupled
-with per-object memory allocation. Also, the code footprint of FlatBuffers is an
-order of magnitude smaller than protocol buffers.
-
-TensorFlow Lite has a new mobile-optimized interpreter, which has the key goals
-of keeping apps lean and fast. The interpreter uses a static graph ordering and
-a custom (less-dynamic) memory allocator to ensure minimal load, initialization,
-and execution latency.
-
-TensorFlow Lite provides an interface to leverage hardware acceleration, if
-available on the device. It does so via the
-[Android Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/index.html),
-available on Android 8.1 (API level 27) and higher.
-
-## Why do we need a new mobile-specific library?
-
-Machine Learning is changing the computing paradigm, and we see an emerging
-trend of new use cases on mobile and embedded devices. Consumer expectations are
-also trending toward natural, human-like interactions with their devices, driven
-by the camera and voice interaction models.
-
-There are several factors which are fueling interest in this domain:
-
-- Innovation at the silicon layer is enabling new possibilities for hardware
-  acceleration, and frameworks such as the Android Neural Networks API make it
-  easy to leverage these.
-
-- Recent advances in real-time computer-vision and spoken language understanding
-  have led to mobile-optimized benchmark models being open sourced
-  (e.g. MobileNets, SqueezeNet).
-
-- Widely-available smart appliances create new possibilities for
-  on-device intelligence.
-
-- Interest in stronger user data privacy paradigms where user data does not need
-  to leave the mobile device.
-
-- Ability to serve ‘offline’ use cases, where the device does not need to be
-  connected to a network.
-
-We believe the next wave of machine learning applications will have significant
-processing on mobile and embedded devices.
-
-## TensorFlow Lite highlights
-
-TensorFlow Lite provides:
-
-- A set of core operators, both quantized and float, many of which have been
-  tuned for mobile platforms.  These can be used to create and run custom
-  models.  Developers can also write their own custom operators and use them in
-  models.
-
-- A new [FlatBuffers](https://google.github.io/flatbuffers/)-based
-  model file format.
-
-- On-device interpreter with kernels optimized for faster execution on mobile.
-
-- TensorFlow converter to convert TensorFlow-trained models to the TensorFlow
-  Lite format.
-
-- Smaller in size: TensorFlow Lite is smaller than 300KB when all supported
-  operators are linked and less than 200KB when using only the operators needed
-  for supporting InceptionV3 and Mobilenet.
-
-- **Pre-tested models:**
-
-    All of the following models are guaranteed to work out of the box:
-
-    - Inception V3, a popular model for detecting the dominant objects
-      present in an image.
-
-    - [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md),
-      a family of mobile-first computer vision models designed to effectively
-      maximize accuracy while being mindful of the restricted resources for an
-      on-device or embedded application. They are small, low-latency, low-power
-      models parameterized to meet the resource constraints of a variety of use
-      cases. They can be built upon for classification, detection, embeddings
-      and segmentation. MobileNet models are smaller but [lower in
-      accuracy](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
-      than Inception V3.
-
-    - On Device Smart Reply, an on-device model which provides one-touch
-      replies for an incoming text message by suggesting contextually relevant
-      messages. The model was built specifically for memory constrained devices
-      such as watches & phones and it has been successfully used to surface
-      [Smart Replies on Android
-      Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
-      to all first-party and third-party apps.
-
-    Also see the complete list of
-    [TensorFlow Lite's supported models](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/models.md),
-    including the model sizes, performance numbers, and downloadable model files.
-
-- Quantized versions of the MobileNet model, which runs faster than the
-  non-quantized (float) version on CPU.
-
-- New Android demo app to illustrate the use of TensorFlow Lite with a quantized
-  MobileNet model for object classification.
-
-- Java and C++ API support
-
-
-## Getting Started
-
-We recommend you try out TensorFlow Lite with the pre-tested models indicated
-above. If you have an existing model, you will need to test whether your model
-is compatible with both the converter and the supported operator set.  To test
-your model, see the
-[documentation on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite).
-
-### Retrain Inception-V3 or MobileNet for a custom data set
-
-The pre-trained models mentioned above have been trained on the ImageNet data
-set, which consists of 1000 predefined classes. If those classes are not
-relevant or useful for your use case, you will need to retrain those
-models. This technique is called transfer learning, which starts with a model
-that has been already trained on a problem and will then be retrained on a
-similar problem. Deep learning from scratch can take days, but transfer learning
-can be done fairly quickly. In order to do this, you'll need to generate your
-custom data set labeled with the relevant classes.
-
-The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
-codelab walks through this process step-by-step. The retraining code supports
-retraining for both floating point and quantized inference.
-
-## TensorFlow Lite Architecture
-
-The following diagram shows the architectural design of TensorFlow Lite:
-
-<img src="https://www.tensorflow.org/images/tflite-architecture.jpg"
-     alt="TensorFlow Lite architecture diagram"
-     style="max-width:600px;">
-
-Starting with a trained TensorFlow model on disk, you'll convert that model to
-the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
-Converter. Then you can use that converted file in your mobile application.
-
-Deploying the TensorFlow Lite model file uses:
-
-- Java API: A convenience wrapper around the C++ API on Android.
-
-- C++ API: Loads the TensorFlow Lite Model File and invokes the Interpreter. The
-  same library is available on both Android and iOS.
-
-- Interpreter: Executes the model using a set of kernels. The interpreter
-  supports selective kernel loading; without kernels it is only 100KB, and 300KB
-  with all the kernels loaded. This is a significant reduction from the 1.5M
-  required by TensorFlow Mobile.
-
-- On select Android devices, the Interpreter will use the Android Neural
-  Networks API for hardware acceleration, or default to CPU execution if none
-  are available.
-
-You can also implement custom kernels using the C++ API that can be used by the
-Interpreter.
-
-## Future Work
-
-In future releases, TensorFlow Lite will support more models and built-in
-operators, contain performance improvements for both fixed point and floating
-point models, improvements to the tools to enable easier developer workflows and
-support for other smaller devices and more. As we continue development, we hope
-that TensorFlow Lite will greatly simplify the developer experience of targeting
-a model for small devices.
-
-Future plans include using specialized machine learning hardware to get the best
-possible performance for a particular model on a particular device.
-
-## Next Steps
-
-The TensorFlow Lite [GitHub repository](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite).
-contains additional docs, code samples, and demo applications.
diff --git a/tensorflow/contrib/lite/g3doc/performance.md b/tensorflow/contrib/lite/g3doc/performance.md
deleted file mode 100644
index 6b7943caf8fe4ac5d7a97361c35138898f9b5661..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/g3doc/performance.md
+++ /dev/null
@@ -1,38 +0,0 @@
-
-# Performance best practices
-
-Mobile and embedded devices have limited computational resources and it is important to keep your application resource efficient. We have compiled a list of best practices and strategies you can use to optimize your model and application when using Tensorflow Lite.
-
-## Choose the most efficient model for the problem
-Some models may be too large to run on embedded devices. Instead of large models it is better to use a slightly less precise but smaller model for embedded devices. Smaller models not only use less disk space and memory but are generally faster and more energy efficient. One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. Tensorflow Lite [models page](models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
-
-You can retrain the listed models on your own dataset by using transfer learning. Check out our transfer learning tutorial for
-[image classification](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0) and
- [object detection](https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193).
-
-
-## Profile your model
-Before starting any optimization, it is a good practice to profile and benchmark your model. Tensorflow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
-
-## Profile and optimize operators in the graph
-If a particular operator appears frequently in the model and based on profiling you find the operator consuming the most amount of time, you can look into optimizing the operator.
- This scenario should be rare as Tensorflow Lite has optimized versions for most ops. However you may be able to write a faster version of a custom op, if you know the constraints in which the operator is executed. Check out our [custom operator documentation](custom_operators.md).
-
-## Quantize your model
-If your model uses floating point weights or activations then it may be possible to reduce the size of model up to ~4x by using quantization and other model optimizations. Check out our [model optimization toolkit](https://www.tensorflow.org/performance/model_optimization) for details about optimizing your model. Fully quantized models can be remarkably power efficient as well.
-
-## Tweak the number of threads
-Tensorflow Lite supports multi-threaded kernels for many operators. You can increase the number of threads and speed up execution of operators. Increasing the number of threads will however make your model use more resources and power. For some applications latency may be more important than energy efficiency. You can increase the number of threads by setting the number of [interpreter](https://github.com/tensorflow/tensorflow/blob/1084594657a5d139102ac794f84d1427a710e39a/tensorflow/contrib/lite/interpreter.h#L337) threads.
-
-## Eliminate redundant copies
-Tensorflow Lite is optimized to reduce redundant copies. The APIs allow user to [mmap a model file](https://github.com/tensorflow/tensorflow/blob/9982fd6c8831cbd2f58954f79ea71f26660393bc/tensorflow/contrib/lite/model.h#L152) and avoid copies. If your application is not careful, there can be redundant copies when feeding the input to the model and reading output from the model. Make sure to eliminate redundant copies. If you are using higher level APIs like Java API, make sure to carefully check the documentation for performance caveats. For example, the Java API is a lot faster if ByteBuffers are used as [inputs](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L151).
-
-## Profile your application with platform specific tools
-Platform specific tools like [Android profiler](https://developer.android.com/studio/profile/android-profiler) and [Instruments](https://help.apple.com/instruments/mac/current/) provide a wealth of profiling information that can be used to debug your app. Sometimes the performance bug may be not in the model but in parts of application code that interact with the model. Make sure to familiarize yourself with platform specific profiling tools and best practices for your platform.
-
-## Use hardware accelerators available on the device
-Tensorflow Lite is working on adding support for accelerators like GPU and provides acceleration through [Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/) on Android.
-You can utilize these hardware accelerator backends to improve the speed and efficiency of your model. To enable Neural Networks API call [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/contrib/lite/interpreter.h#L334) on the interpreter instance.
-
-## Need more help
-The Tensorflow team is happy to help diagnose and address specific performance issues you may be facing. Please file a bug on [github](https://github.com/tensorflow/tensorflow/issues) with details of the issue.
diff --git a/tensorflow/contrib/lite/g3doc/performance_benchmarks.md b/tensorflow/contrib/lite/g3doc/performance_benchmarks.md
deleted file mode 100644
index 28cb6aba6ec61d12d86e078e47665833df8afec7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/g3doc/performance_benchmarks.md
+++ /dev/null
@@ -1,174 +0,0 @@
-
-# Performance
-
-This document lists TensorFlow Lite performance benchmarks when running well
-known models on some Android and iOS devices.
-
-These performance benchmark numbers were generated with the
-[Android TFLite benchmark binary](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark)
-and the [iOS benchmark app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark/ios).
-
-# Android performance benchmarks
-
-For Android benchmarks, the CPU affinity is set to use big cores on the device to
-reduce variance (see [details](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark#reducing-variance-between-runs-on-android)).
-
-It assumes that models were download and unzipped to the
-`/data/local/tmp/tflite_models` directory. The benchmark binary is built
-using [these instructions](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark#on-android)
-and assumed in the `/data/local/tmp` directory.
-
-To run the benchmark:
-
-```
-adb shell taskset ${CPU_MASK} /data/local/tmp/benchmark_model \
-  --num_threads=1 \
-  --graph=/data/local/tmp/tflite_models/${GRAPH} \
-  --warmup_runs=1 \
-  --num_runs=50 \
-  --use_nnapi=false
-```
-
-Here, `${GRAPH}` is the name of model and `${CPU_MASK}` is the CPU affinity
-chosen according to the following table:
-
-Device | CPU_MASK |
--------| ----------
-Pixel 2 | f0 |
-Pixel xl | 0c |
-
-<table>
-  <thead>
-    <tr>
-      <th>Model Name</th>
-      <th>Device </th>
-      <th>Mean inference time (std dev)</th>
-    </tr>
-  </thead>
-  <tr>
-    <td rowspan = 2>
-      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>166.5 ms (2.6 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>122.9 ms (1.8 ms)  </td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz">Mobilenet_1.0_224 (quant)</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>69.5 ms (0.9 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>78.9 ms (2.2 ms)  </td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>273.8 ms (3.5 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>210.8 ms (4.2 ms)</td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>234.0 ms (2.1 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>158.0 ms (2.1 ms)</td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>2846.0 ms (15.0 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>1973.0 ms (15.0 ms)  </td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>3180.0 ms (11.7 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>2262.0 ms (21.0 ms)  </td>
-  </tr>
-
- </table>
-
-# iOS benchmarks
-
-To run iOS benchmarks, the [benchmark
-app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark/ios)
-was modified to include the appropriate model and `benchmark_params.json` was
-modified  to set `num_threads` to 1.
-
-<table>
-  <thead>
-    <tr>
-      <th>Model Name</th>
-      <th>Device </th>
-      <th>Mean inference time (std dev)</th>
-    </tr>
-  </thead>
-  <tr>
-    <td>
-      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>32.2 ms (0.8 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz)">Mobilenet_1.0_224 (quant)</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>24.4 ms (0.8 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>60.3 ms (0.6 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>44.3 (0.7 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
-    </td>
-    <td>iPhone 8</td>
-    <td>562.4 ms (18.2 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>661.0 ms (29.2 ms)</td>
-  </tr>
- </table>
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/index.md b/tensorflow/contrib/lite/g3doc/tfmobile/index.md
deleted file mode 100644
index 49ad35d4e6a18f266d88e330626bae8bf1fc499f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/g3doc/tfmobile/index.md
+++ /dev/null
@@ -1,282 +0,0 @@
-
-# Overview
-
-TensorFlow was designed to be a good deep learning solution for mobile
-platforms. Currently we have two solutions for deploying machine learning
-applications on mobile and embedded devices: TensorFlow for Mobile and
-<a href="../../lite">TensorFlow Lite</a>.
-
-## TensorFlow Lite versus TensorFlow Mobile
-
-Here are a few of the differences between the two:
-
-- TensorFlow Lite is an evolution of TensorFlow Mobile.  In most cases, apps
-  developed with TensorFlow Lite will have a smaller binary size, fewer
-  dependencies, and better performance.
-
-- TensorFlow Lite is in developer preview, so not all use cases are covered yet.
-  We expect you to use TensorFlow Mobile to cover production cases.
-
-- TensorFlow Lite supports only a limited set of operators, so not all models
-  will work on it by default. TensorFlow for Mobile has a fuller set of
-  supported functionality.
-
-TensorFlow Lite provides better performance and a small binary size on mobile
-platforms as well as the ability to leverage hardware acceleration if available
-on their platforms. In addition, it has many fewer dependencies so it can be
-built and hosted on simpler, more constrained device scenarios. TensorFlow Lite
-also allows targeting accelerators through the [Neural Networks
-API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
-
-TensorFlow Lite currently has coverage for a limited set of operators. While
-TensorFlow for Mobile supports only a constrained set of ops by default, in
-principle if you use an arbitrary operator in TensorFlow, it can be customized
-to build that kernel. Thus use cases which are not currently supported by
-TensorFlow Lite should continue to use TensorFlow for Mobile. As TensorFlow Lite
-evolves, it will gain additional operators, and the decision will be easier to
-make.
-
-
-## Introduction to TensorFlow Mobile
-
-TensorFlow was designed from the ground up to be a good deep learning solution
-for mobile platforms like Android and iOS. This mobile guide should help you
-understand how machine learning can work on mobile platforms and how to
-integrate TensorFlow into your mobile apps effectively and efficiently.
-
-## About this Guide
-
-This guide is aimed at developers who have a TensorFlow model that’s
-successfully working in a desktop environment, who want to integrate it into
-a mobile application, and cannot use TensorFlow Lite. Here are the
-main challenges you’ll face during that process:
-
-- Understanding how to use Tensorflow for mobile.
-- Building TensorFlow for your platform.
-- Integrating the TensorFlow library into your application.
-- Preparing your model file for mobile deployment.
-- Optimizing for latency, RAM usage, model file size, and binary size.
-
-## Common use cases for mobile machine learning
-
-**Why run TensorFlow on mobile?**
-
-Traditionally, deep learning has been associated with data centers and giant
-clusters of high-powered GPU machines. However, it can be very expensive and
-time-consuming to send all of the data a device has access to across a network
-connection. Running on mobile makes it possible to deliver very interactive
-applications in a way that’s not possible when you have to wait for a network
-round trip.
-
-Here are some common use cases for on-device deep learning:
-
-### Speech Recognition
-
-There are a lot of interesting applications that can be built with a
-speech-driven interface, and many of these require on-device processing. Most of
-the time a user isn’t giving commands, and so streaming audio continuously to a
-remote server would be a waste of bandwidth, since it would mostly be silence or
-background noises. To solve this problem it’s common to have a small neural
-network running on-device
-[listening out for a particular keyword](../tutorials/sequences/audio_recognition).
-Once that keyword has been spotted, the rest of the
-conversation can be transmitted over to the server for further processing if
-more computing power is needed.
-
-### Image Recognition
-
-It can be very useful for a mobile app to be able to make sense of a camera
-image. If your users are taking photos, recognizing what’s in them can help your
-camera apps apply appropriate filters, or label the photos so they’re easily
-findable. It’s important for embedded applications too, since you can use image
-sensors to detect all sorts of interesting conditions, whether it’s spotting
-endangered animals in the wild
-or
-[reporting how late your train is running](https://svds.com/tensorflow-image-recognition-raspberry-pi/).
-
-TensorFlow comes with several examples of recognizing the types of objects
-inside images along with a variety of different pre-trained models, and they can
-all be run on mobile devices. You can try out
-our
-[Tensorflow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/index.html#0) and
-[Tensorflow for Poets 2: Optimize for Mobile](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/index.html#0) codelabs to
-see how to take a pretrained model and run some very fast and lightweight
-training to teach it to recognize specific objects, and then optimize it to
-run on mobile.
-
-### Object Localization
-
-Sometimes it’s important to know where objects are in an image as well as what
-they are. There are lots of augmented reality use cases that could benefit a
-mobile app, such as guiding users to the right component when offering them
-help fixing their wireless network or providing informative overlays on top of
-landscape features. Embedded applications often need to count objects that are
-passing by them, whether it’s pests in a field of crops, or people, cars and
-bikes going past a street lamp.
-
-TensorFlow offers a pretrained model for drawing bounding boxes around people
-detected in images, together with tracking code to follow them over time. The
-tracking is especially important for applications where you’re trying to count
-how many objects are present over time, since it gives you a good idea when a
-new object enters or leaves the scene. We have some sample code for this
-available for Android [on
-GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android),
-and also a [more general object detection
-model](https://github.com/tensorflow/models/tree/master/research/object_detection/README.md)
-available as well.
-
-### Gesture Recognition
-
-It can be useful to be able to control applications with hand or other
-gestures, either recognized from images or through analyzing accelerometer
-sensor data. Creating those models is beyond the scope of this guide, but
-TensorFlow is an effective way of deploying them.
-
-### Optical Character Recognition
-
-Google Translate’s live camera view is a great example of how effective
-interactive on-device detection of text can be.
-
-<div class="video-wrapper">
-  <iframe class="devsite-embedded-youtube-video" data-video-id="06olHmcJjS0"
-            data-autohide="1" data-showinfo="0" frameborder="0" allowfullscreen>
-  </iframe>
-</div>
-
-There are multiple steps involved in recognizing text in images. You first have
-to identify the areas where the text is present, which is a variation on the
-object localization problem, and can be solved with similar techniques. Once you
-have an area of text, you then need to interpret it as letters, and then use a
-language model to help guess what words they represent. The simplest way to
-estimate what letters are present is to segment the line of text into individual
-letters, and then apply a simple neural network to the bounding box of each. You
-can get good results with the kind of models used for MNIST, which you can find
-in TensorFlow’s tutorials, though you may want a higher-resolution input.  A
-more advanced alternative is to use an LSTM model to process a whole line of
-text at once, with the model itself handling the segmentation into different
-characters.
-
-### Translation
-
-Translating from one language to another quickly and accurately, even if you
-don’t have a network connection, is an important use case. Deep networks are
-very effective at this sort of task, and you can find descriptions of a lot of
-different models in the literature. Often these are sequence-to-sequence
-recurrent models where you’re able to run a single graph to do the whole
-translation, without needing to run separate parsing stages.
-
-### Text Classification
-
-If you want to suggest relevant prompts to users based on what they’re typing or
-reading, it can be very useful to understand the meaning of the text. This is
-where text classification comes in. Text classification is an umbrella term
-that covers everything from sentiment analysis to topic discovery. You’re likely
-to have your own categories or labels that you want to apply, so the best place
-to start is with an example
-like
-[Skip-Thoughts](https://github.com/tensorflow/models/tree/master/research/skip_thoughts/),
-and then train on your own examples.
-
-### Voice Synthesis
-
-A synthesized voice can be a great way of giving users feedback or aiding
-accessibility, and recent advances such as
-[WaveNet](https://deepmind.com/blog/wavenet-generative-model-raw-audio/) show
-that deep learning can offer very natural-sounding speech.
-
-## Mobile machine learning and the cloud
-
-These examples of use cases give an idea of how on-device networks can
-complement cloud services. Cloud has a great deal of computing power in a
-controlled environment, but running on devices can offer higher interactivity.
-In situations where the cloud is unavailable, or your cloud capacity is limited,
-you can provide an offline experience, or reduce cloud workload by processing
-easy cases on device.
-
-Doing on-device computation can also signal when it's time to switch to working
-on the cloud. A good example of this is hotword detection in speech. Since
-devices are able to constantly listen out for the keywords, this then triggers a
-lot of traffic to cloud-based speech recognition once one is recognized. Without
-the on-device component, the whole application wouldn’t be feasible, and this
-pattern exists across several other applications as well. Recognizing that some
-sensor input is interesting enough for further processing makes a lot of
-interesting products possible.
-
-## What hardware and software should you have?
-
-TensorFlow runs on Ubuntu Linux, Windows 10, and OS X. For a list of all
-supported operating systems and instructions to install TensorFlow, see
-<a href="https://www.tensorflow.org/install">Installing Tensorflow</a>.
-
-Note that some of the sample code we provide for mobile TensorFlow requires you
-to compile TensorFlow from source, so you’ll need more than just `pip install`
-to work through all the sample code.
-
-To try out the mobile examples, you’ll need a device set up for development,
-using
-either [Android Studio](https://developer.android.com/studio/install.html),
-or [XCode](https://developer.apple.com/xcode/) if you're developing for iOS.
-
-## What should you do before you get started?
-
-Before thinking about how to get your solution on mobile:
-
-1. Determine whether your problem is solvable by mobile machine learning
-2. Create a labelled dataset to define your problem
-3. Pick an effective model for the problem
-
-We'll discuss these in more detail below.
-
-### Is your problem solvable by mobile machine learning?
-
-Once you have an idea of the problem you want to solve, you need to make a plan
-of how to build your solution. The most important first step is making sure that
-your problem is actually solvable, and the best way to do that is to mock it up
-using humans in the loop.
-
-For example, if you want to drive a robot toy car using voice commands, try
-recording some audio from the device and listen back to it to see if you can
-make sense of what’s being said. Often you’ll find there are problems in the
-capture process, such as the motor drowning out speech or not being able to hear
-at a distance, and you should tackle these problems before investing in the
-modeling process.
-
-Another example would be giving photos taken from your app to people see if they
-can classify what’s in them, in the way you’re looking for. If they can’t do
-that (for example, trying to estimate calories in food from photos may be
-impossible because all white soups look the same), then you’ll need to redesign
-your experience to cope with that. A good rule of thumb is that if a human can’t
-handle the task then it will be difficult to train a computer to do better.
-
-### Create a labelled dataset
-
-After you’ve solved any fundamental issues with your use case, you need to
-create a labeled dataset to define what problem you’re trying to solve. This
-step is extremely important, more than picking which model to use. You want it
-to be as representative as possible of your actual use case, since the model
-will only be effective at the task you teach it. It’s also worth investing in
-tools to make labeling the data as efficient and accurate as possible. For
-example, if you’re able to switch from having to click a button on a web
-interface to simple keyboard shortcuts, you may be able to speed up the
-generation process a lot. You should also start by doing the initial labeling
-yourself, so you can learn about the difficulties and likely errors, and
-possibly change your labeling or data capture process to avoid them. Once you
-and your team are able to consistently label examples (that is once you
-generally agree on the same labels for most examples), you can then try and
-capture your knowledge in a manual and teach external raters how to run the same
-process.
-
-### Pick an effective model
-
-The next step is to pick an effective model to use. You might be able to avoid
-training a model from scratch if someone else has already implemented a model
-similar to what you need; we have a repository of models implemented in
-TensorFlow [on GitHub](https://github.com/tensorflow/models) that you can look
-through. Lean towards the simplest model you can find, and try to get started as
-soon as you have even a small amount of labelled data, since you’ll get the best
-results when you’re able to iterate quickly. The shorter the time it takes to
-try training a model and running it in its real application, the better overall
-results you’ll see. It’s common for an algorithm to get great training accuracy
-numbers but then fail to be useful within a real application because there’s a
-mismatch between the dataset and real usage. Prototype end-to-end usage as soon
-as possible to create a consistent user experience.
diff --git a/tensorflow/contrib/lite/graph_info.cc b/tensorflow/contrib/lite/graph_info.cc
deleted file mode 100644
index e60ed2c2463cb621015ba725ca030e8d8c02f3c7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/graph_info.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/graph_info.h"
-#include <algorithm>
-
-namespace tflite {
-
-namespace {
-
-// Provide a range iterable wrapper for TfLiteIntArray* (C lists that TfLite
-// C api uses. Can't use the google array_view, since we can't depend on even
-// absl for embedded device reasons.
-// TODO(aselle): Move this into central utilities.
-class TfLiteIntArrayView {
- public:
-  // Construct a view of a TfLiteIntArray*. Note, `int_array` should be non-null
-  // and this view does not take ownership of it.
-  explicit TfLiteIntArrayView(const TfLiteIntArray* int_array)
-      : int_array_(int_array) {}
-
-  typedef const int* const_iterator;
-  const_iterator begin() const { return int_array_->data; }
-  const_iterator end() const { return &int_array_->data[int_array_->size]; }
-
-  TfLiteIntArrayView(const TfLiteIntArrayView&) = default;
-  TfLiteIntArrayView& operator=(const TfLiteIntArrayView& rhs) = default;
-
- private:
-  const TfLiteIntArray* int_array_;
-};
-
-// Helper class that actually performs partitioning by subgraph.
-// Outputs to a provided `subgraphs` structure.
-//
-// Example usage:
-// PartitionGraphIntoIndependentSubgraphsImpl partitioner(
-//     info, nodes_to_part, subgraphs);
-// partitioner.Partition();
-class PartitionGraphIntoIndependentSubgraphsImpl {
- public:
-  PartitionGraphIntoIndependentSubgraphsImpl(
-      const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
-      std::vector<Subgraph>* subgraphs)
-      : info_(info),
-        subgraphs_(subgraphs),
-        node_type_(info->num_nodes(), Subgraph::kTfNonPartition) {
-    // Populate the node_type_ map.
-    for (auto node_index : TfLiteIntArrayView(nodes_to_partition)) {
-      node_type_[node_index] = Subgraph::kTfPartition;
-    }
-  }
-
-  // Actually partition the graph.
-  void Partition() {
-    // Initialize here to make Partition() re-entrant.
-    subgraphs_->clear();
-    tensor_epochs_.clear();
-    tensor_epochs_.resize(info_->num_tensors(), kEpochAlwaysReady);
-    node_epochs_.clear();
-    node_epochs_.resize(info_->num_nodes(), kEpochNotReady);
-    // Set computed tensors to be kEpochNotReady (initializer set everything to
-    // AlwaysReady).
-    for (int node_index = 0; node_index < info_->num_nodes(); node_index++) {
-      const TfLiteNode& node = info_->node(node_index);
-      for (int output_tensor_index : TfLiteIntArrayView(node.outputs)) {
-        tensor_epochs_[output_tensor_index] = kEpochNotReady;
-      }
-    }
-
-    // Do a graph traversal where each iteration in the loop is an epoch
-    // that corresponds to a subgraph that only contains nodes that are of
-    // the same node_type_.
-    while (true) {
-      BuildSubgraph();
-      if (subgraphs_->back().nodes.empty()) {
-        subgraphs_->pop_back();
-        break;
-      }
-    }
-
-    // Mark model outputs as subgraph outputs. All the rest have already been
-    // identified.
-    for (int output_index : info_->outputs()) {
-      int output_epoch = tensor_epochs_[output_index];
-      Subgraph& output_subgraph = (*subgraphs_)[output_epoch];
-      output_subgraph.output_tensors.push_back(output_index);
-    }
-    // Make sure every subgraph's inputs and outputs are unique. Since the
-    // list of inputs and outputs is generated in a way that produces
-    // duplicates.
-    for (Subgraph& subgraph : *subgraphs_) {
-      // Sort and uniquefy using standard library algorithms.
-      auto uniquefy = [](std::vector<int>* items) {
-        std::sort(items->begin(), items->end());
-        auto last = std::unique(items->begin(), items->end());
-        items->erase(last, items->end());
-      };
-      uniquefy(&subgraph.input_tensors);
-      uniquefy(&subgraph.output_tensors);
-    }
-  }
-
- private:
-  // Special integer values needed for tensor_epochs_ and node_epochs_.
-  enum {
-    // The node or tensor is not ready to be assigned an epoch. e.g. a node's
-    // inputs have not all been assigned epochs.
-    kEpochNotReady = -1,
-    // Used for tensor_epochs_. This means that the tensor is always ready.
-    // e.g. an input to the whole model or a constant that has no dependencies.
-    kEpochAlwaysReady = -2
-  };
-
-  // Updates the  node `node_index` and returns true if it is assigned to an
-  // epoch. False is returned if the node is already set to an epoch, its inputs
-  // are not all assigned to epochs, or if it cannot be assigned to the current
-  // epoch since the epoch's node_type doesn't match.
-  bool UpdateNode(int node_index) {
-    const TfLiteNode& node = info_->node(node_index);
-    Subgraph& current_subgraph = subgraphs_->back();
-    int current_epoch = subgraphs_->size() - 1;
-    // Check if node is already done.
-    if (node_epochs_[node_index] != kEpochNotReady) {
-      return false;
-    }
-    // See if all dependencies of this node are already assigned to a
-    // subgraph.
-    for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
-      if (tensor_epochs_[input_tensor_index] == kEpochNotReady) {
-        return false;
-      }
-    }
-    // When we are starting a new epoch, the first ready node defines
-    // the type of that epoch.
-    if (current_subgraph.type == Subgraph::kTfUnexplored) {
-      current_subgraph.type = node_type_[node_index];
-    }
-    // The node gets assigned to this epoch if it is the same type as
-    // the epoch's assigned type. Note, if this is the current ready
-    // node encountered during this epoch, this condition will be
-    // automatically true.
-    if (current_subgraph.type == node_type_[node_index]) {
-      node_epochs_[node_index] = current_epoch;
-      current_subgraph.nodes.push_back(node_index);
-      // All outputs of this node now are assigned to this epoch as
-      // well.
-      for (int output_tensor_index : TfLiteIntArrayView(node.outputs)) {
-        tensor_epochs_[output_tensor_index] = current_epoch;
-      }
-      // Look at our inputs one more time to update that tensor's
-      // epochs' outputs
-      for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
-        int input_epoch = tensor_epochs_[input_tensor_index];
-        int node_epoch = current_epoch;
-        if (input_epoch != node_epoch) {
-          current_subgraph.input_tensors.push_back(input_tensor_index);
-          // Set inputs to be outputs of the subgraph where they reside.
-          // the if condition makes sure inputs to the whole computation
-          // are not included (i.e. those initialized to -2 above).
-          if (input_epoch >= 0) {
-            Subgraph& input_subgraph = (*subgraphs_)[input_epoch];
-            input_subgraph.output_tensors.push_back(input_tensor_index);
-          }
-        }
-      }
-      return true;
-    } else {
-      return false;
-    }
-  }
-
-  // Completely populates the current subgraph by doing graph traversal
-  void BuildSubgraph() {
-    subgraphs_->emplace_back(Subgraph());
-    // loop until no more nodes can be updated.
-    while (true) {
-      bool did_something = false;
-      for (int node_index = 0; node_index < info_->num_nodes(); node_index++) {
-        if (UpdateNode(node_index)) {
-          did_something = true;
-        }
-      }
-      if (!did_something) return;
-    }
-  }
-
-  // Temporary data needed for partitioning.
-  const GraphInfo* info_;
-  // List of subgraphs to populate
-  std::vector<Subgraph>* subgraphs_;
-  std::vector<Subgraph::Type> node_type_;
-  // Maps from tensor index to the epoch in which it is assigned. Also special
-  // negative values of kEpochNotAssigned if not assigned, kEpochNotReady if it
-  // is an input or constant.
-  std::vector<int> tensor_epochs_;
-  // Maps from tensor index to the epoch in which it is assigned. Also special
-  // negative values of kEpochNotAssigned if not assigned.
-  std::vector<int> node_epochs_;
-};
-
-}  // namespace
-
-TfLiteStatus PartitionGraphIntoIndependentSubgraphs(
-    const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
-    std::vector<Subgraph>* subgraphs) {
-  PartitionGraphIntoIndependentSubgraphsImpl(info, nodes_to_partition,
-                                             subgraphs)
-      .Partition();
-  return kTfLiteOk;
-}
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/graph_info.h b/tensorflow/contrib/lite/graph_info.h
deleted file mode 100644
index 8ee83827bb3fdf59b88d8304ad781cae98140b75..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/graph_info.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
-#define TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
-
-#include <vector>
-
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-
-namespace tflite {
-
-// Basic information about an inference graph, where execution nodes
-// are connected via tensors.
-class GraphInfo {
- public:
-  virtual ~GraphInfo() {}
-
-  // Total number of tensors in the graph.
-  virtual size_t num_tensors() const = 0;
-
-  // Returns a tensor given its index which is expected to be between 0 and
-  // num_tensors().
-  virtual TfLiteTensor* tensor(size_t index) = 0;
-
-  // Total number of nodes in the graph.
-  virtual size_t num_nodes() const = 0;
-
-  // Returns a node given its index which is expected to be between 0 and
-  // num_nodes().
-  virtual const TfLiteNode& node(size_t index) const = 0;
-
-  // Returns the indices of the input tensors.
-  virtual const std::vector<int>& inputs() const = 0;
-
-  // Returns the indices of the output tensors.
-  virtual const std::vector<int>& outputs() const = 0;
-
-  // Returns the indices of the variable tensors.
-  virtual const std::vector<int>& variables() const = 0;
-};
-
-// Represents a subgraph of a TensorFlow Lite graph.
-struct Subgraph {
-  enum Type {
-    kTfUnexplored = 0,  // temporarily used during creation
-    kTfPartition,
-    kTfNonPartition
-  };
-  Type type = kTfUnexplored;
-  // Nodes within the subgraph
-  std::vector<int> nodes;
-  // Tensors that stride output from another subgraph that this depends on,
-  // or global inputs to the TensorFlow Lite full graph.
-  std::vector<int> input_tensors;
-  // Outputs that are consumed by other subgraphs or are global output tensors.
-  // All output tensors of the nodes in the subgraph that do not appear in this
-  // list are intermediate results that can be potentially elided.
-  std::vector<int> output_tensors;
-};
-
-// Partitions a list of node indices `nodes_to_partition` into subgraphs.
-// Each subgraph is in dependency order (i.e. all members of the subgraph).
-// `subgraphs` is assumed to be empty.
-TfLiteStatus PartitionGraphIntoIndependentSubgraphs(
-    const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
-    std::vector<Subgraph>* subgraphs);
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_GRAPH_INFO_H_
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
deleted file mode 100644
index 88e41ffc55d2b666bb4837c12dccb2ebcdcaac33..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/interpreter.cc
+++ /dev/null
@@ -1,1015 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/interpreter.h"
-
-#include <cassert>
-#include <cstdarg>
-#include <cstdint>
-#include <cstring>
-
-#include "tensorflow/contrib/lite/arena_planner.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/context_util.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/graph_info.h"
-#include "tensorflow/contrib/lite/memory_planner.h"
-#include "tensorflow/contrib/lite/nnapi_delegate.h"
-#include "tensorflow/contrib/lite/profiling/profiler.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/util.h"
-
-namespace tflite {
-namespace {
-
-TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node,
-                           const TfLiteRegistration& registration,
-                           int node_index, const char* message) {
-  context->ReportError(
-      context, "Node number %d (%s) %s.\n", node_index,
-      registration.custom_name
-          ? registration.custom_name
-          : EnumNameBuiltinOperator(
-                static_cast<BuiltinOperator>(registration.builtin_code)),
-      message);
-  return kTfLiteError;
-}
-
-// Stub method which returns kTfLiteError when the function is forbidden.
-// We're registrating this function to several different function to save
-// compiled binary size. Please note the restrictions:
-// * The type of first parameter have to be `TfLiteContext*`.
-// * All paramteters must be trivailly destructible. (E.g. No C++ class)
-TfLiteStatus ForbiddenContextFunction(TfLiteContext* context, ...) {
-  context->ReportError(context,
-                       "The function is forbidden if not calling in delegate.");
-  return kTfLiteError;
-}
-
-// Set the ForbiddenContextFunction to a compatible function pointer.
-template <typename FunctionType>
-void SetForbiddenContextFunction(FunctionType* func) {
-  *func = reinterpret_cast<FunctionType>(ForbiddenContextFunction);
-}
-
-// Returns true if at least one tensor in the given list is kTfLiteDynamic.
-template <typename TensorIntArray>
-bool HasDynamicTensorImpl(const TfLiteContext& context,
-                          const TensorIntArray& int_array) {
-  for (int i : int_array) {
-    const TfLiteTensor& tensor = context.tensors[i];
-    if (tensor.allocation_type == kTfLiteDynamic) {
-      return true;
-    }
-  }
-  return false;
-}
-
-}  // namespace
-
-// A trivial implementation of GraphInfo around the Interpreter.
-// NOTE: this interpreter info represents the subset of the
-// graph that is executed according to execution plan. Thus,
-// the indices are execution plan indices rather than raw node
-// indices.
-class InterpreterInfo : public GraphInfo {
- public:
-  explicit InterpreterInfo(Interpreter* interpreter)
-      : interpreter_(interpreter) {}
-
-  size_t num_tensors() const override { return interpreter_->tensors_size(); }
-  TfLiteTensor* tensor(size_t index) override {
-    return interpreter_->tensor(index);
-  }
-  size_t num_nodes() const override {
-    return interpreter_->execution_plan().size();
-  }
-  const TfLiteNode& node(size_t index) const override {
-    int node_index = interpreter_->execution_plan()[index];
-    return interpreter_->node_and_registration(node_index)->first;
-  }
-  const std::vector<int>& inputs() const override {
-    return interpreter_->inputs();
-  }
-  const std::vector<int>& outputs() const override {
-    return interpreter_->outputs();
-  }
-  const std::vector<int>& variables() const override {
-    return interpreter_->variables();
-  }
-
- public:
-  Interpreter* interpreter_;
-};
-
-Interpreter::Interpreter(ErrorReporter* error_reporter)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
-  context_.impl_ = static_cast<void*>(this);
-  context_.ResizeTensor = ResizeTensor;
-  context_.ReportError = ReportError;
-  context_.AddTensors = AddTensors;
-  context_.tensors = nullptr;
-  context_.tensors_size = 0;
-  context_.allow_fp32_relax_to_fp16 = false;
-  context_.recommended_num_threads = -1;
-  context_.GetExternalContext = GetExternalContext;
-  context_.SetExternalContext = SetExternalContext;
-
-  // Invalid to call these these except from TfLiteDelegate
-  SwitchToKernelContext();
-
-  // Reserve some space for the tensors to avoid excessive resizing.
-  tensors_.reserve(kTensorsReservedCapacity);
-  nodes_and_registration_.reserve(kTensorsReservedCapacity);
-  next_execution_plan_index_to_prepare_ = 0;
-
-  for (int i = 0; i < kTfLiteMaxExternalContexts; ++i) {
-    external_contexts_[i] = nullptr;
-  }
-
-  UseNNAPI(false);
-}
-
-Interpreter::~Interpreter() {
-  for (auto& nodeAndReg : nodes_and_registration_) {
-    TfLiteNode& node = nodeAndReg.first;
-    TfLiteIntArrayFree(node.inputs);
-    TfLiteIntArrayFree(node.outputs);
-    TfLiteIntArrayFree(node.temporaries);
-    if (node.builtin_data) free(node.builtin_data);
-    OpFree(nodeAndReg.second, node.user_data);
-    node.builtin_data = nullptr;
-  }
-
-  for (int i = 0; i < context_.tensors_size; i++) {
-    TfLiteTensor* tensor = &context_.tensors[i];
-    if (tensor->buffer_handle != kTfLiteNullBufferHandle &&
-        tensor->delegate->FreeBufferHandle != nullptr) {
-      tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
-                                         &tensor->buffer_handle);
-    }
-    TfLiteTensorFree(tensor);
-  }
-}
-
-TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
-    TfLiteContext* context, TfLiteRegistration registration,
-    const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->ReplaceSubgraphsWithDelegateKernels(registration, nodes_to_replace,
-                                            delegate);
-}
-
-namespace {
-
-// Copy a std::vector<int> to an existing TfLiteIntArray.
-// This is a low-level data manipulation function, and it's caller's
-// responsibility to ensure TfLiteIntArray has enough size.
-void CopyVectorToTfLiteIntArray(const std::vector<int>& vec,
-                                TfLiteIntArray* arr) {
-  arr->size = vec.size();
-  memcpy(arr->data, vec.data(), sizeof(int) * arr->size);
-}
-
-// This function allocates a continuous memory space that contains a
-// TfLiteDelegateParams followed by a several TfLiteIntArray.
-// When calling `free` at TfLiteDelegateParams*, all the allocated space
-// will be freed together.
-//
-// +-----------------------------------+
-// | TfLiteDelegateParams              |
-// | TfLiteDelegate* delegate;         |
-// | TfLiteIntArray* nodes_to_replace; |--\
-// | TfLiteIntArray* input_tensors;    |--+--\
-// | TfLiteIntArray* output_tensors;   |--+--+--\
-// +-----------------------------------+  |  |  |
-// | TfLiteIntArray (variable size)    |<-/  |  |
-// +-----------------------------------+     |  |
-// | TfLiteIntArray (variable size)    |<----/  |
-// +-----------------------------------+        |
-// | TfLiteIntArray (variable size)    |<-------/
-// +-----------------------------------+
-TfLiteDelegateParams* CreateDelegateParams(TfLiteDelegate* delegate,
-                                           const Subgraph& subgraph) {
-  // Step 1: Calculate the allocation size.
-  int allocation_size = sizeof(TfLiteDelegateParams);
-
-  int nodes_to_replace_size =
-      TfLiteIntArrayGetSizeInBytes(subgraph.nodes.size());
-  allocation_size += nodes_to_replace_size;
-
-  int input_tensors_size =
-      TfLiteIntArrayGetSizeInBytes(subgraph.input_tensors.size());
-  allocation_size += input_tensors_size;
-
-  int output_tensors_size =
-      TfLiteIntArrayGetSizeInBytes(subgraph.output_tensors.size());
-  allocation_size += output_tensors_size;
-
-  // Step 2: Allocate the memory.
-  // Use `char*` for conveniently step through the allocated space by bytes.
-  char* allocation = reinterpret_cast<char*>(malloc(allocation_size));
-
-  // Step 3: Fill all data structures structures.
-  TfLiteDelegateParams* params =
-      reinterpret_cast<TfLiteDelegateParams*>(allocation);
-  params->delegate = delegate;
-  allocation += sizeof(TfLiteDelegateParams);
-
-  params->nodes_to_replace = reinterpret_cast<TfLiteIntArray*>(allocation);
-  CopyVectorToTfLiteIntArray(subgraph.nodes, params->nodes_to_replace);
-  allocation += nodes_to_replace_size;
-
-  params->input_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
-  CopyVectorToTfLiteIntArray(subgraph.input_tensors, params->input_tensors);
-  allocation += input_tensors_size;
-
-  params->output_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
-  CopyVectorToTfLiteIntArray(subgraph.output_tensors, params->output_tensors);
-  allocation += output_tensors_size;
-
-  return params;
-}
-
-}  // namespace
-
-TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
-    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
-    TfLiteDelegate* delegate) {
-  // Annotate the registration as DELEGATE op.
-  registration.builtin_code = BuiltinOperator_DELEGATE;
-
-  // Analyze the graph to find all independent subgraphs that are either
-  // fully not-this-delegate or this-delegate computation.
-  InterpreterInfo info(this);
-  std::vector<Subgraph> subgraphs;
-  PartitionGraphIntoIndependentSubgraphs(&info, nodes_to_replace, &subgraphs);
-
-  execution_plan_.clear();
-  for (auto& subgraph : subgraphs) {
-    // Subgraphs calimed by the delegate should have a "macro" op created, the
-    // other subgraphs (kTfNonPartition) just have their nodes added back to
-    // the execution plan.
-    switch (subgraph.type) {
-      case Subgraph::kTfNonPartition:
-        for (auto it = subgraph.nodes.begin(); it != subgraph.nodes.end();
-             ++it) {
-          execution_plan_.push_back(*it);
-        }
-        break;
-      case Subgraph::kTfPartition: {
-        int node_index;
-
-        TfLiteDelegateParams* params = CreateDelegateParams(delegate, subgraph);
-        TF_LITE_ENSURE_STATUS(AddNodeWithParameters(
-            subgraph.input_tensors, subgraph.output_tensors, nullptr, 0, params,
-            &registration, &node_index));
-
-        // Initialize the output tensors's delegate-related fields.
-        for (int tensor_index : subgraph.output_tensors) {
-          TfLiteTensor* tensor = &tensors_[tensor_index];
-          TF_LITE_ENSURE(&context_, tensor->delegate == nullptr ||
-                                        tensor->delegate == delegate);
-          tensor->delegate = delegate;
-        }
-
-        // Associate the node with the delegate.
-        TfLiteNode* node = &nodes_and_registration_[node_index].first;
-        node->delegate = delegate;
-      } break;
-      case Subgraph::kTfUnexplored:
-        return kTfLiteError;
-        break;
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteExternalContext* Interpreter::GetExternalContext(
-    TfLiteExternalContextType type) {
-  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
-    return external_contexts_[type];
-  }
-  return nullptr;
-}
-
-TfLiteExternalContext* Interpreter::GetExternalContext(
-    struct TfLiteContext* context, TfLiteExternalContextType type) {
-  return static_cast<Interpreter*>(context->impl_)->GetExternalContext(type);
-}
-
-void Interpreter::SetExternalContext(TfLiteExternalContextType type,
-                                     TfLiteExternalContext* ctx) {
-  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
-    external_contexts_[type] = ctx;
-  }
-}
-
-void Interpreter::SetExternalContext(struct TfLiteContext* context,
-                                     TfLiteExternalContextType type,
-                                     TfLiteExternalContext* ctx) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->SetExternalContext(type, ctx);
-}
-
-// Gets an TfLiteIntArray* representing the execution plan. The interpreter owns
-// this memory and it is only guaranteed to exist during the invocation of the
-// delegate prepare.
-TfLiteStatus Interpreter::GetExecutionPlan(TfLiteIntArray** execution_plan) {
-  // TODO(aselle): Do not make a copy here
-  plan_cache_.reset(TfLiteIntArrayCreate(execution_plan_.size()));
-  *execution_plan = plan_cache_.get();
-  static_assert(sizeof(plan_cache_->data[0]) == sizeof(execution_plan_[0]),
-                "TfLiteIntArray and execution_plan do not contain same type.");
-  std::memcpy(plan_cache_->data, execution_plan_.data(),
-              sizeof(plan_cache_->data[0]) * execution_plan_.size());
-  return kTfLiteOk;
-}
-
-// WARNING: This is an experimental interface that is subject to change.
-// Entry point for C node plugin API to get the execution plan
-TfLiteStatus Interpreter::GetExecutionPlan(struct TfLiteContext* context,
-                                           TfLiteIntArray** execution_plan) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->GetExecutionPlan(execution_plan);
-}
-
-TfLiteStatus Interpreter::SetInputs(std::vector<int> inputs) {
-  TF_LITE_ENSURE_OK(&context_,
-                    CheckTensorIndices("inputs", inputs.data(), inputs.size()));
-  inputs_ = std::move(inputs);
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::SetOutputs(std::vector<int> outputs) {
-  TF_LITE_ENSURE_OK(
-      &context_, CheckTensorIndices("outputs", outputs.data(), outputs.size()));
-  outputs_ = std::move(outputs);
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::SetVariables(std::vector<int> variables) {
-  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("variables", variables.data(),
-                                                  variables.size()));
-  variables_ = std::move(variables);
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::CheckTensorIndices(const char* label,
-                                             const int* indices, int length) {
-  // Making sure kOptionalTensor is not re-defined to something other than -1.
-  static_assert(kOptionalTensor == -1, "kOptionalTensor should be defined -1");
-
-  for (int i = 0; i < length; i++) {
-    int index = indices[i];
-    // Continue if index == kOptionalTensor before additional comparisons below,
-    // size_t(-1) is always >= context_tensors_size.
-    if (index == kOptionalTensor) {
-      continue;
-    }
-    if (index < 0 || static_cast<size_t>(index) >= context_.tensors_size) {
-      ReportError(&context_, "Invalid tensor index %d in %s\n", index, label);
-      consistent_ = false;
-      return kTfLiteError;
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
-                                        size_t dims_size, size_t* bytes) {
-  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
-  // MultiplyWithoutOverflow.
-  TF_LITE_ENSURE(&context_, bytes != nullptr);
-  size_t count = 1;
-  for (int k = 0; k < dims_size; k++) count *= dims[k];
-  switch (type) {
-    case kTfLiteFloat32:
-      *bytes = sizeof(float) * count;
-      break;
-    case kTfLiteInt16:
-      *bytes = sizeof(int16_t) * count;
-      break;
-    case kTfLiteInt32:
-      *bytes = sizeof(int32_t) * count;
-      break;
-    case kTfLiteUInt8:
-      *bytes = sizeof(uint8_t) * count;
-      break;
-    case kTfLiteInt64:
-      *bytes = sizeof(int64_t) * count;
-      break;
-    case kTfLiteBool:
-      *bytes = sizeof(bool) * count;
-      break;
-    case kTfLiteComplex64:
-      *bytes = sizeof(std::complex<float>) * count;
-      break;
-    default:
-      ReportError(&context_,
-                  "Only float32, int16, int32, int64, uint8, bool, complex64 "
-                  "supported currently.");
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::AllocateTensors() {
-  if (!consistent_) {
-    ReportError(&context_, "AllocateTensors() called on inconsistent model.");
-    return kTfLiteError;
-  }
-
-  // Explicit (re)allocation is necessary if nodes have been changed or tensors
-  // have been resized. For inputs marked as dynamic, we can't short-circuit the
-  // allocation as the client may have done the resize manually.
-  if (state_ != kStateUninvokable && !HasDynamicTensorImpl(context_, inputs_)) {
-    return kTfLiteOk;
-  }
-
-  next_execution_plan_index_to_prepare_ = 0;
-  if (memory_planner_) {
-    TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations());
-  }
-
-  TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-
-  state_ = kStateInvokable;
-
-  // Reset the variable tensors to zero after (re)allocating the tensors.
-  // Developers shouldn't rely on the side effect of this function to reset
-  // variable tesnsors. They should call `ResetVariableTensors` directly
-  // instead.
-  ResetVariableTensors();
-
-  return kTfLiteOk;
-}
-
-// TODO(ycling): Support non-zero default values.
-TfLiteStatus Interpreter::ResetVariableTensors() {
-  for (auto& tensor : tensors_) {
-    if (!tensor.is_variable) {
-      continue;
-    }
-
-    // Variable tensors have to be `kTfLiteArenaRwPersistent`, and must be
-    // allocated after the initial `PrepareOpsAndTensors()` is called.
-    TF_LITE_ENSURE_EQ(&context_, tensor.allocation_type,
-                      kTfLiteArenaRwPersistent);
-    TF_LITE_ENSURE(&context_, tensor.data.raw != nullptr);
-
-    memset(tensor.data.raw, 0, tensor.bytes);
-  }
-  return kTfLiteOk;
-}
-
-void Interpreter::ReserveNodes(int count) {
-  nodes_and_registration_.reserve(count);
-}
-
-TfLiteStatus Interpreter::AddNodeWithParameters(
-    const std::vector<int>& inputs, const std::vector<int>& outputs,
-    const char* init_data, size_t init_data_size, void* builtin_data,
-    const TfLiteRegistration* registration, int* node_index) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(&context_,
-                "AddNodeWithParameters is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-  state_ = kStateUninvokable;
-
-  std::unique_ptr<void, decltype(free)*> builtin_data_deleter(builtin_data,
-                                                              free);
-
-  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("node inputs", inputs.data(),
-                                                  inputs.size()));
-  TF_LITE_ENSURE_OK(
-      &context_,
-      CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
-
-  int new_node_index = nodes_and_registration_.size();
-  if (node_index) *node_index = new_node_index;
-  nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
-  auto& node_and_reg = nodes_and_registration_.back();
-  TfLiteNode& node = node_and_reg.first;
-  if (node.inputs) TfLiteIntArrayFree(node.inputs);
-  if (node.outputs) TfLiteIntArrayFree(node.outputs);
-  if (node.temporaries) TfLiteIntArrayFree(node.temporaries);
-
-  // NOTE, here we are not using move semantics yet, since our internal
-  // representation isn't std::vector, but in the future we would like to avoid
-  // copies, so we want the interface to take r-value references now.
-  node.inputs = ConvertVectorToTfLiteIntArray(inputs);
-  node.outputs = ConvertVectorToTfLiteIntArray(outputs);
-  node.temporaries = TfLiteIntArrayCreate(0);
-  if (init_data) {
-    node.user_data = OpInit(*registration, init_data, init_data_size);
-  } else {
-    node.user_data =
-        OpInit(*registration,
-               reinterpret_cast<const char*>(builtin_data_deleter.get()), 0);
-  }
-
-  node.builtin_data = builtin_data_deleter.release();
-  // TODO(ycling): Filling `custom_initial_data` and `custom_initial_data_size`
-  // properly for nodes generated by ReplaceSubgraphsWithDelegateKernels.
-
-  if (registration->builtin_code == BuiltinOperator_CUSTOM) {
-    // When it's a CUSTOM op, the `custom_options` field in the Flatbuffer
-    // `Operator` table is passed in.
-    node.custom_initial_data = init_data;
-    node.custom_initial_data_size = init_data_size;
-  } else {
-    node.custom_initial_data = nullptr;
-    node.custom_initial_data_size = 0;
-  }
-
-  node.delegate = nullptr;
-  node_and_reg.second = *registration;
-  execution_plan_.push_back(new_node_index);
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
-                                            const std::vector<int>& dims) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(&context_,
-                "ResizeInputTensor is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-
-  // TODO(aselle): All bounds checks can be implemented as one-sided bounds
-  // checks by casting to unsigned for efficiency. Profile before doing this.
-  TF_LITE_ENSURE(&context_,
-                 tensor_index < context_.tensors_size && tensor_index >= 0);
-  TfLiteTensor* tensor = &context_.tensors[tensor_index];
-
-  // Short-circuit the state change if the dimensions don't change, avoiding
-  // unnecessary (re)allocations.
-  if (EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
-    return kTfLiteOk;
-  }
-
-  state_ = kStateUninvokable;
-  return ResizeTensorImpl(tensor, ConvertVectorToTfLiteIntArray(dims));
-}
-
-bool HasDynamicTensor(const TfLiteContext& context,
-                      const TfLiteIntArray* int_array) {
-  return HasDynamicTensorImpl(context, TfLiteIntArrayView{int_array});
-}
-
-TfLiteStatus Interpreter::PrepareOpsStartingAt(
-    int first_execution_plan_index, int* last_execution_plan_index_prepared) {
-  for (int execution_plan_index = first_execution_plan_index;
-       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
-    int node_index = execution_plan_[execution_plan_index];
-    TfLiteNode& node = nodes_and_registration_[node_index].first;
-    const TfLiteRegistration& registration =
-        nodes_and_registration_[node_index].second;
-    EnsureTensorsVectorCapacity();
-    if (OpPrepare(registration, &node) == kTfLiteError) {
-      return ReportOpError(&context_, node, registration, node_index,
-                           "failed to prepare");
-    }
-
-    *last_execution_plan_index_prepared = execution_plan_index;
-
-    // Discontinue if the node has dynamic outputs. Note that we don't
-    // stop for dynamic temporary tensors since they won't affect the
-    // sizes of other tensors in the graph.
-    if (HasDynamicTensor(context_, node.outputs)) {
-      break;
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::PrepareOpsAndTensors() {
-  if (!memory_planner_) {
-    memory_planner_.reset(new ArenaPlanner(
-        &context_, std::unique_ptr<GraphInfo>(new InterpreterInfo(this)),
-        /*preserve_inputs=*/true, /*preserve_intermediates*/ false));
-    memory_planner_->PlanAllocations();
-  }
-
-  int last_exec_plan_index_prepared = 0;
-
-  TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
-      next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared));
-  TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations(
-      next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared));
-
-  next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1;
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::Invoke() {
-  if (!consistent_) {
-    ReportError(&context_, "Invoke called on model that is not consistent.");
-    return kTfLiteError;
-  }
-  if (state_ == kStateUninvokable) {
-    ReportError(&context_, "Invoke called on model that is not ready.");
-    return kTfLiteError;
-  }
-
-  TfLiteStatus status = kTfLiteOk;
-  if (nnapi_delegate_) {
-    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
-      TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this));
-      return kTfLiteOk;
-    } else {
-      // TODO(aselle): In the future, we would like this to be an
-      // automatic tflite CPU fallback.
-      ReportError(&context_,
-                  "NNAPI was requested, but dependent sized tensors "
-                  "being used.\n");
-      return kTfLiteError;
-    }
-  }
-
-  // Invocations are always done in node order.
-  // Note that calling Invoke repeatedly will cause the original memory plan to
-  // be reused, unless either ResizeInputTensor() or AllocateTensors() has been
-  // called.
-  // TODO(b/71913981): we should force recalculation in the presence of dynamic
-  // tensors, because they may have new value which in turn may affect shapes
-  // and allocations.
-  for (int execution_plan_index = 0;
-       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
-    if (execution_plan_index == next_execution_plan_index_to_prepare_) {
-      TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-      TF_LITE_ENSURE(&context_, next_execution_plan_index_to_prepare_ >=
-                                    execution_plan_index);
-    }
-    int node_index = execution_plan_[execution_plan_index];
-    TfLiteNode& node = nodes_and_registration_[node_index].first;
-    const TfLiteRegistration& registration =
-        nodes_and_registration_[node_index].second;
-    SCOPED_OPERATOR_PROFILE(profiler_, node_index);
-
-    // TODO(ycling): This is an extra loop through inputs to check if the data
-    // need to be copied from Delegate buffer to raw memory, which is often not
-    // needed. We may want to cache this in prepare to know if this needs to be
-    // done for a node or not.
-    for (int i = 0; i < node.inputs->size; ++i) {
-      int tensor_index = node.inputs->data[i];
-      if (tensor_index == kOptionalTensor) {
-        continue;
-      }
-      TfLiteTensor* tensor = &tensors_[tensor_index];
-      if (tensor->delegate && tensor->delegate != node.delegate &&
-          tensor->data_is_stale) {
-        EnsureTensorDataIsReadable(tensor_index);
-      }
-    }
-
-    EnsureTensorsVectorCapacity();
-    tensor_resized_since_op_invoke_ = false;
-    if (OpInvoke(registration, &node) == kTfLiteError) {
-      status = ReportOpError(&context_, node, registration, node_index,
-                             "failed to invoke");
-    }
-
-    // Force execution prep for downstream ops if the latest op triggered the
-    // resize of a dynamic tensor.
-    if (tensor_resized_since_op_invoke_ &&
-        HasDynamicTensor(context_, node.outputs)) {
-      next_execution_plan_index_to_prepare_ = execution_plan_index + 1;
-    }
-  }
-
-  if (!allow_buffer_handle_output_) {
-    for (int tensor_index : outputs_) {
-      EnsureTensorDataIsReadable(tensor_index);
-    }
-  }
-
-  return status;
-}
-
-TfLiteStatus Interpreter::ResizeTensor(TfLiteContext* context,
-                                       TfLiteTensor* tensor,
-                                       TfLiteIntArray* new_size) {
-  // Note here that context->impl_ is recovering the this pointer for an
-  // instance of Interpreter to call into the member function ResizeTensorImpl
-  // (this function is static).
-  return static_cast<Interpreter*>(context->impl_)
-      ->ResizeTensorImpl(tensor, new_size);
-}
-
-void Interpreter::ReportErrorImpl(const char* format, va_list args) {
-  error_reporter_->Report(format, args);
-}
-
-void Interpreter::ReportError(TfLiteContext* context, const char* format, ...) {
-  va_list args;
-  va_start(args, format);
-  auto* f = static_cast<Interpreter*>(context->impl_);
-  // Note here that context->impl_ is recovering the this pointer for an
-  // instance of Interpreter to call into the member function ReportErrorImpl
-  // (this function is static).
-  f->ReportErrorImpl(format, args);
-  va_end(args);
-}
-
-TfLiteStatus Interpreter::AddTensors(int tensors_to_add,
-                                     int* first_new_tensor_index) {
-  int base_index = tensors_.size();
-  if (first_new_tensor_index) *first_new_tensor_index = base_index;
-  tensors_.resize(tensors_.size() + tensors_to_add);
-  for (int i = base_index; i < tensors_.size(); i++) {
-    memset(&tensors_[i], 0, sizeof(tensors_[i]));
-    tensors_[i].buffer_handle = kTfLiteNullBufferHandle;
-  }
-  context_.tensors = tensors_.data();
-  context_.tensors_size = tensors_.size();
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::AddTensors(TfLiteContext* context, int tensors_to_add,
-                                     int* first_new_tensor_index) {
-  // Note here that context->impl_ is recovering the this pointer for an
-  // instance of Interpreter to call into the member function AddTensors
-  // (this function is static).
-  return static_cast<Interpreter*>(context->impl_)
-      ->AddTensors(tensors_to_add, first_new_tensor_index);
-}
-
-TfLiteStatus Interpreter::GetNodeAndRegistration(
-    int node_index, TfLiteNode** node, TfLiteRegistration** registration) {
-  TF_LITE_ENSURE(&context_, node_index < nodes_size() && node_index >= 0);
-  TF_LITE_ENSURE(&context_, node != nullptr && registration != nullptr);
-  *node = &nodes_and_registration_[node_index].first;
-  *registration = &nodes_and_registration_[node_index].second;
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::GetNodeAndRegistration(
-    struct TfLiteContext* context, int node_index, TfLiteNode** node,
-    TfLiteRegistration** registration) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->GetNodeAndRegistration(node_index, node, registration);
-}
-
-TfLiteStatus Interpreter::SetTensorParametersReadOnly(
-    int tensor_index, TfLiteType type, const char* name, const size_t rank,
-    const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
-    size_t bytes, const Allocation* allocation) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(
-        &context_,
-        "SetTensorParametersReadOnly is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-
-  TF_LITE_ENSURE(&context_,
-                 tensor_index < context_.tensors_size && tensor_index >= 0);
-  // For most tensors we know exactly how much memory is necessary so we can
-  // ensure the buffer is large enough. However, we need to skip string tensors
-  // because their sizes change with the contents of the individual strings.
-  if (type != kTfLiteString) {
-    size_t required_bytes;
-    TF_LITE_ENSURE_OK(&context_,
-                      BytesRequired(type, dims, rank, &required_bytes));
-    TF_LITE_ENSURE_EQ(&context_, required_bytes, bytes);
-  }
-
-  TfLiteTensor& tensor = context_.tensors[tensor_index];
-  if (type == tensor.type &&
-      EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
-    // Fast path which does not invalidate the invokable property.
-    TfLiteTensorDataFree(&tensor);
-    tensor.data.raw = const_cast<char*>(buffer);
-    if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
-    tensor.params = quantization;
-    tensor.allocation_type = kTfLiteMmapRo;
-    tensor.allocation = allocation;
-  } else {
-    state_ = kStateUninvokable;
-    TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                      quantization, const_cast<char*>(buffer), bytes,
-                      kTfLiteMmapRo, allocation, false, &tensor);
-  }
-  return kTfLiteOk;
-}
-
-// Set description of inputs/outputs/data/fptrs for node `node_index`.
-// This variant assumes an external buffer has been allocated of size
-// bytes. The lifetime of buffer must be ensured to be greater or equal
-// to Interpreter.
-TfLiteStatus Interpreter::SetTensorParametersReadWrite(
-    int tensor_index, TfLiteType type, const char* name, const size_t rank,
-    const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(
-        &context_,
-        "SetTensorParametersReadWrite is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-  TF_LITE_ENSURE(&context_,
-                 tensor_index < context_.tensors_size && tensor_index >= 0);
-  size_t required_bytes = 0;
-  if (type != kTfLiteString) {
-    // These types will be allocated in our arena so we need to record how
-    // many bytes we will need based on the dimensions. String tensors are
-    // allocated dynamically and we can't know ahead of time how much space
-    // they will require.
-    TF_LITE_ENSURE_OK(&context_,
-                      BytesRequired(type, dims, rank, &required_bytes));
-  }
-
-  TfLiteAllocationType allocation_type = kTfLiteArenaRw;
-  if (type == kTfLiteString) {
-    if (is_variable) {
-      // We don't have a real use case for string variable tensor.
-      ReportError(&context_, "String variable tensor isn't supported.");
-      return kTfLiteError;
-    }
-    allocation_type = kTfLiteDynamic;
-  } else if (is_variable) {
-    allocation_type = kTfLiteArenaRwPersistent;
-  }
-
-  TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                    quantization,
-                    /*buffer=*/nullptr, required_bytes, allocation_type,
-                    nullptr, is_variable, &context_.tensors[tensor_index]);
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
-  for (int node_index : new_plan) {
-    TF_LITE_ENSURE(&context_, node_index >= 0 && node_index < nodes_size());
-  }
-  execution_plan_ = new_plan;
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor,
-                                           TfLiteIntArray* new_size) {
-  // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
-  if (tensor->allocation_type == kTfLiteArenaRw ||
-      tensor->allocation_type == kTfLiteDynamic ||
-      tensor->allocation_type == kTfLiteArenaRwPersistent) {
-    tensor_resized_since_op_invoke_ |=
-        TfLiteIntArrayEqual(tensor->dims, new_size) == 0;
-    if (tensor->type != kTfLiteString) {
-      size_t bytesRequired;
-      TfLiteStatus status = BytesRequired(tensor->type, new_size->data,
-                                          new_size->size, &bytesRequired);
-      if (status != kTfLiteOk) {
-        TfLiteIntArrayFree(new_size);
-        return kTfLiteError;
-      }
-
-      // Realloc space for kTfLiteDynamic tensors.
-      TfLiteTensorRealloc(bytesRequired, tensor);
-      tensor->bytes = bytesRequired;
-    }
-    if (tensor->dims) TfLiteIntArrayFree(tensor->dims);
-    tensor->dims = new_size;
-
-    if (tensor->allocation_type != kTfLiteDynamic) {
-      tensor->data.raw = nullptr;
-    }
-  } else {
-    // kTfLiteMmapRo tensors are stored in the flatbuffer and are therefore
-    // of fixed size.
-    TfLiteIntArrayFree(new_size);
-    ReportError(&context_, "Attempting to resize a fixed-size tensor.");
-    return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-void Interpreter::UseNNAPI(bool enable) {
-  // TODO(aselle): This is a workaround for finding if NNAPI exists.
-  // We also need to make sure getLibraryHandle() is renamed to be NNAPI
-  // prefixed.
-  if (!NNAPIDelegate::IsSupported()) enable = false;
-  if (!enable) {
-    nnapi_delegate_.reset();
-  } else if (!nnapi_delegate_) {
-    nnapi_delegate_.reset(new NNAPIDelegate);
-  }
-}
-
-void Interpreter::SetNumThreads(int num_threads) {
-  context_.recommended_num_threads = num_threads;
-
-  for (int i = 0; i < kTfLiteMaxExternalContexts; ++i) {
-    auto* c = external_contexts_[i];
-    if (c && c->Refresh) {
-      c->Refresh(&context_);
-    }
-  }
-}
-
-void Interpreter::SwitchToDelegateContext() {
-  context_.GetNodeAndRegistration = GetNodeAndRegistration;
-  context_.ReplaceSubgraphsWithDelegateKernels =
-      ReplaceSubgraphsWithDelegateKernels;
-  context_.GetExecutionPlan = GetExecutionPlan;
-}
-
-void Interpreter::SwitchToKernelContext() {
-  SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
-  SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
-  SetForbiddenContextFunction(&context_.GetExecutionPlan);
-}
-
-TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate,
-                                                  bool allow_dynamic_tensors) {
-  if (!allow_dynamic_tensors) {
-    int last_execution_plan_index_prepared;
-    TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt(
-                                     0, &last_execution_plan_index_prepared));
-
-    bool has_dynamic_tensors = true;
-    // Dynamic tensors exist if not all nodes can be prepared.
-    if (last_execution_plan_index_prepared + 1 == execution_plan_.size()) {
-      // If all the nodes can be prepared, check if the last node has dynamic
-      // tensors.
-      int node_index = execution_plan_[last_execution_plan_index_prepared];
-      TfLiteNode& node = nodes_and_registration_[node_index].first;
-      if (!HasDynamicTensor(context_, node.outputs)) {
-        has_dynamic_tensors = false;
-      }
-    }
-    if (has_dynamic_tensors) {
-      ReportError(&context_, "Attempting to resize a fixed-size tensor.");
-      return kTfLiteError;
-    }
-  }
-
-  // TODO(aselle): Consider if it is worth storing pointers to delegates.
-  // Setup additional context interface.
-  SwitchToDelegateContext();
-
-  TfLiteStatus status = delegate->Prepare(&context_, delegate);
-
-  // Remove additional context info.
-  SwitchToKernelContext();
-
-  TF_LITE_ENSURE_OK(&context_, status);
-
-  if (!allow_dynamic_tensors) {
-    // Reset the state to force tensor/op reallocation.
-    state_ = kStateUninvokable;
-    TF_LITE_ENSURE_OK(&context_, AllocateTensors());
-    TF_LITE_ENSURE_EQ(&context_, state_, kStateInvokable);
-    // After using a delegate which doesn't support dynamic tensors, make the
-    // entire graph immutable.
-    state_ = kStateInvokableAndImmutable;
-  }
-
-  return status;
-}
-
-TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
-                                          TfLiteBufferHandle buffer_handle,
-                                          TfLiteDelegate* delegate) {
-  TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
-  TfLiteTensor* tensor = &tensors_[tensor_index];
-
-  TF_LITE_ENSURE(&context_,
-                 tensor->delegate == nullptr || tensor->delegate == delegate);
-  tensor->delegate = delegate;
-  if (tensor->buffer_handle != kTfLiteNullBufferHandle) {
-    TF_LITE_ENSURE(&context_, tensor->delegate->FreeBufferHandle != nullptr);
-    tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
-                                       &tensor->buffer_handle);
-  }
-  tensor->buffer_handle = buffer_handle;
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::GetBufferHandle(int tensor_index,
-                                          TfLiteBufferHandle* buffer_handle,
-                                          TfLiteDelegate** delegate) {
-  TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
-  TfLiteTensor* tensor = &tensors_[tensor_index];
-
-  *delegate = tensor->delegate;
-  *buffer_handle = tensor->buffer_handle;
-
-  return kTfLiteOk;
-}
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
deleted file mode 100644
index 651a97e9dc84350569514528ae5635ec040d607f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/interpreter.h
+++ /dev/null
@@ -1,695 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Main abstraction controlling the tflite interpreter.
-// See context.h for the API for defining operations (TfLiteRegistration).
-#ifndef TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
-#define TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
-
-#include <complex>
-#include <cstdio>
-#include <cstdlib>
-#include <vector>
-
-#include "tensorflow/contrib/lite/allocation.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/memory_planner.h"
-#include "tensorflow/contrib/lite/profiling/profiler.h"
-#include "tensorflow/contrib/lite/stderr_reporter.h"
-
-namespace tflite {
-
-// Map statically from a c++ type to a TfLiteType (used below for safe casts).
-template <class T>
-constexpr TfLiteType typeToTfLiteType() {
-  return kTfLiteNoType;
-}
-template <>
-constexpr TfLiteType typeToTfLiteType<int>() {
-  return kTfLiteInt32;
-}
-template <>
-constexpr TfLiteType typeToTfLiteType<int16_t>() {
-  return kTfLiteInt16;
-}
-template <>
-constexpr TfLiteType typeToTfLiteType<int64_t>() {
-  return kTfLiteInt64;
-}
-template <>
-constexpr TfLiteType typeToTfLiteType<float>() {
-  return kTfLiteFloat32;
-}
-template <>
-constexpr TfLiteType typeToTfLiteType<unsigned char>() {
-  return kTfLiteUInt8;
-}
-template <>
-constexpr TfLiteType typeToTfLiteType<bool>() {
-  return kTfLiteBool;
-}
-template <>
-constexpr TfLiteType typeToTfLiteType<std::complex<float>>() {
-  return kTfLiteComplex64;
-}
-template <>
-constexpr TfLiteType typeToTfLiteType<string>() {
-  return kTfLiteString;
-}
-
-// Forward declare since NNAPIDelegate uses Interpreter.
-class NNAPIDelegate;
-
-// An interpreter for a graph of nodes that input and output from tensors.
-// Each node of the graph processes a set of input tensors and produces a
-// set of output Tensors. All inputs/output tensors are referenced by index.
-//
-// Usage:
-//
-// -- Create basic model
-// Interpreter foo(2, 1);
-// foo.SetTensorParametersReadWrite(0, ...);
-// foo.SetTensorParametersReadOnly(1, ...);
-// foo.SetNodeParameters(0, ...)
-//
-// -- Resize input array to 1 length.
-// foo.ResizeInputTensor(0, 1);
-// foo.AllocateTensors();
-// -- Install array data
-// foo.typed_tensor<float>(0)[0] = 3;
-// foo.Invoke();
-// foo.typed_tensor<float>(0)[0] = 4;
-// foo.Invoke();
-// -- Resize input array and set data.
-// foo.ResizeInputTensor(0, 2);
-// foo.AllocateTensors();
-// foo.typed_tensor<float>(0)[0] = 4;
-// foo.typed_tensor<float>(0)[1] = 8;
-// foo.Invoke();
-//
-
-struct TfLiteIntArrayDeleter {
-  void operator()(TfLiteIntArray* a) {
-    if (a) TfLiteIntArrayFree(a);
-  }
-};
-
-class Interpreter {
- public:
-  // Instantiate an interpreter. All errors associated with reading and
-  // processing this model will be forwarded to the error_reporter object.
-  //
-  // Note, if error_reporter is nullptr, then a default StderrReporter is
-  // used. Ownership of 'error_reporter' remains with the caller.
-  explicit Interpreter(ErrorReporter* error_reporter = DefaultErrorReporter());
-
-  ~Interpreter();
-
-  Interpreter(const Interpreter&) = delete;
-  Interpreter& operator=(const Interpreter&) = delete;
-
-  // Functions to build interpreter
-
-  // Provide a list of tensor indexes that are inputs to the model.
-  // Each index is bound check and this modifies the consistent_ flag of the
-  // interpreter.
-  TfLiteStatus SetInputs(std::vector<int> inputs);
-
-  // Provide a list of tensor indexes that are outputs to the model
-  // Each index is bound check and this modifies the consistent_ flag of the
-  // interpreter.
-  TfLiteStatus SetOutputs(std::vector<int> outputs);
-
-  // Provide a list of tensor indexes that are variable tensors.
-  // Each index is bound check and this modifies the consistent_ flag of the
-  // interpreter.
-  TfLiteStatus SetVariables(std::vector<int> variables);
-
-  // Ensure the internal node storage memory allocates at least `count`
-  // spots for node. NOTE, this doesn't actually add operators. This is an
-  // efficiency optimization that is subject to change.
-  void ReserveNodes(int count);
-
-  // Adds a node with the given parameters and returns the index of the new
-  // node in `node_index` (optionally). Interpreter will take ownership of
-  // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
-  // remains with the caller.
-  TfLiteStatus AddNodeWithParameters(const std::vector<int>& inputs,
-                                     const std::vector<int>& outputs,
-                                     const char* init_data,
-                                     size_t init_data_size, void* builtin_data,
-                                     const TfLiteRegistration* registration,
-                                     int* node_index = nullptr);
-
-  // Adds `tensors_to_add` tensors, preserving pre-existing Tensor entries.
-  // The value pointed to by `first_new_tensor_index` will be set to the
-  // index of the first new tensor if `first_new_tensor_index` is non-null.
-  TfLiteStatus AddTensors(int tensors_to_add,
-                          int* first_new_tensor_index = nullptr);
-
-  // Set description of inputs/outputs/data/fptrs for node `node_index`.
-  // This variant assumes an external buffer has been allocated of size
-  // bytes. The lifetime of buffer must be ensured to be greater or equal
-  // to Interpreter.
-  inline TfLiteStatus SetTensorParametersReadOnly(
-      int tensor_index, TfLiteType type, const char* name,
-      const std::vector<int>& dims, TfLiteQuantizationParams quantization,
-      const char* buffer, size_t bytes,
-      const Allocation* allocation = nullptr) {
-    return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(),
-                                       dims.data(), quantization, buffer, bytes,
-                                       allocation);
-  }
-
-  TfLiteStatus SetTensorParametersReadOnly(
-      int tensor_index, TfLiteType type, const char* name, const size_t rank,
-      const int* dims, TfLiteQuantizationParams quantization,
-      const char* buffer, size_t bytes, const Allocation* allocation = nullptr);
-
-  // Set description of inputs/outputs/data/fptrs for node `node_index`.
-  // This variant assumes an external buffer has been allocated of size
-  // bytes. The lifetime of buffer must be ensured to be greater or equal
-  // to Interpreter.
-  inline TfLiteStatus SetTensorParametersReadWrite(
-      int tensor_index, TfLiteType type, const char* name,
-      const std::vector<int>& dims, TfLiteQuantizationParams quantization,
-      bool is_variable = false) {
-    return SetTensorParametersReadWrite(tensor_index, type, name, dims.size(),
-                                        dims.data(), quantization, is_variable);
-  }
-  TfLiteStatus SetTensorParametersReadWrite(
-      int tensor_index, TfLiteType type, const char* name, const size_t rank,
-      const int* dims, TfLiteQuantizationParams quantization,
-      bool is_variable = false);
-
-  // Functions to access tensor data
-
-  // Read only access to list of inputs.
-  const std::vector<int>& inputs() const { return inputs_; }
-
-  // Return the name of a given input. The given index must be between 0 and
-  // inputs().size().
-  const char* GetInputName(int index) const {
-    return context_.tensors[inputs_[index]].name;
-  }
-
-  // Read only access to list of outputs.
-  const std::vector<int>& outputs() const { return outputs_; }
-
-  // Read only access to list of variable tensors.
-  const std::vector<int>& variables() const { return variables_; }
-
-  // Return the name of a given output. The given index must be between 0 and
-  // outputs().size().
-  const char* GetOutputName(int index) const {
-    return context_.tensors[outputs_[index]].name;
-  }
-
-  // Return the number of tensors in the model.
-  size_t tensors_size() const { return context_.tensors_size; }
-
-  // Return the number of ops in the model.
-  size_t nodes_size() const { return nodes_and_registration_.size(); }
-
-  // WARNING: Experimental interface, subject to change
-  const std::vector<int>& execution_plan() const { return execution_plan_; }
-
-  // WARNING: Experimental interface, subject to change
-  // Overrides execution plan. This bounds checks indices sent in.
-  TfLiteStatus SetExecutionPlan(const std::vector<int>& new_plan);
-
-  // Get a mutable tensor data structure.
-  // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
-  // read/write access to structure
-  TfLiteTensor* tensor(int tensor_index) {
-    if (tensor_index >= context_.tensors_size || tensor_index < 0)
-      return nullptr;
-    return &context_.tensors[tensor_index];
-  }
-
-  // Get an immutable tensor data structure.
-  const TfLiteTensor* tensor(int tensor_index) const {
-    if (tensor_index >= context_.tensors_size || tensor_index < 0)
-      return nullptr;
-    return &context_.tensors[tensor_index];
-  }
-
-  // Get a pointer to an operation and registration data structure if in bounds.
-  const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
-      int node_index) const {
-    if (node_index >= nodes_and_registration_.size() || node_index < 0)
-      return nullptr;
-    return &nodes_and_registration_[node_index];
-  }
-
-  // Perform a checked cast to the appropriate tensor type (mutable pointer
-  // version).
-  template <class T>
-  T* typed_tensor(int tensor_index) {
-    if (TfLiteTensor* tensor_ptr = tensor(tensor_index)) {
-      if (tensor_ptr->type == typeToTfLiteType<T>()) {
-        return reinterpret_cast<T*>(tensor_ptr->data.raw);
-      }
-    }
-    return nullptr;
-  }
-
-  // Perform a checked cast to the appropriate tensor type (immutable pointer
-  // version).
-  template <class T>
-  const T* typed_tensor(int tensor_index) const {
-    if (const TfLiteTensor* tensor_ptr = tensor(tensor_index)) {
-      if (tensor_ptr->type == typeToTfLiteType<T>()) {
-        return reinterpret_cast<const T*>(tensor_ptr->data.raw);
-      }
-    }
-    return nullptr;
-  }
-
-  // Return a mutable pointer into the data of a given input tensor. The given
-  // index must be between 0 and inputs().size().
-  template <class T>
-  T* typed_input_tensor(int index) {
-    return typed_tensor<T>(inputs_[index]);
-  }
-
-  // Return an immutable pointer into the data of a given input tensor. The
-  // given index must be between 0 and inputs().size().
-  template <class T>
-  const T* typed_input_tensor(int index) const {
-    return typed_tensor<T>(inputs_[index]);
-  }
-
-  // Return a mutable pointer into the data of a given output tensor. The given
-  // index must be between 0 and outputs().size().
-  template <class T>
-  T* typed_output_tensor(int index) {
-    return typed_tensor<T>(outputs_[index]);
-  }
-
-  // Return an immutable pointer into the data of a given output tensor. The
-  // given index must be between 0 and outputs().size().
-  template <class T>
-  const T* typed_output_tensor(int index) const {
-    return typed_tensor<T>(outputs_[index]);
-  }
-
-  // Change the dimensionality of a given tensor. Note, this is only acceptable
-  // for tensor indices that are inputs.
-  // Returns status of failure or success.
-  // TODO(aselle): Consider implementing ArraySlice equivalent to make this
-  //   more adept at accepting data without an extra copy. Use absl::ArraySlice
-  //   if our partners determine that dependency is acceptable.
-  TfLiteStatus ResizeInputTensor(int tensor_index,
-                                 const std::vector<int>& dims);
-
-  // Update allocations for all tensors. This will redim dependent tensors using
-  // the input tensor dimensionality as given. This is relatively expensive.
-  // If you know that your sizes are not changing, you need not call this.
-
-  // Returns status of success or failure.
-  TfLiteStatus AllocateTensors();
-
-  // Invoke the interpreter (run the whole graph in dependency order).
-  //
-  // NOTE: It is possible that the interpreter is not in a ready state
-  // to evaluate (i.e. if a ResizeTensor() has been performed without an
-  // AllocateTensors().
-  // Returns status of success or failure.
-  TfLiteStatus Invoke();
-
-  // Enable or disable the NN API (true to enable)
-  void UseNNAPI(bool enable);
-
-  // Set the number of threads available to the interpreter.
-  void SetNumThreads(int num_threads);
-
-  // Allow float16 precision for FP32 calculation when possible.
-  // default: not allow.
-  // WARNING: This is an experimental API and subject to change.
-  void SetAllowFp16PrecisionForFp32(bool allow) {
-    context_.allow_fp32_relax_to_fp16 = allow;
-  }
-
-  // Get the half precision flag.
-  // WARNING: This is an experimental API and subject to change.
-  bool GetAllowFp16PrecisionForFp32() const {
-    return context_.allow_fp32_relax_to_fp16;
-  }
-
-  // Owning handle to a TfLiteDelegate instance.
-  using TfLiteDelegatePtr =
-      std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>;
-
-  // Allow a delegate to look at the graph and modify the graph to handle
-  // parts of the graph themselves. After this is called, the graph may
-  // contain new nodes that replace 1 more nodes.
-  // WARNING: This is an experimental API and subject to change.
-  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate,
-                                       bool allow_dynamic_tensors = false);
-
-  // Ensure the data in `tensor.data` is readable. In case delegate is used,
-  // it might require to copy the data from delegate buffer to raw memory.
-  // WARNING: This is an experimental API and subject to change.
-  TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
-    TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
-    TfLiteTensor* tensor = &tensors_[tensor_index];
-    if (tensor->data_is_stale) {
-      TF_LITE_ENSURE(&context_, tensor->delegate != nullptr);
-      TF_LITE_ENSURE(&context_,
-                     tensor->buffer_handle != kTfLiteNullBufferHandle);
-      // This can be null if the delegate doesn't use its own buffer.
-      TF_LITE_ENSURE(&context_,
-                     tensor->delegate->CopyFromBufferHandle != nullptr);
-      tensor->delegate->CopyFromBufferHandle(&context_, tensor->delegate,
-                                             tensor->buffer_handle,
-                                             tensor->data.raw, tensor->bytes);
-      tensor->data_is_stale = false;
-    }
-    return kTfLiteOk;
-  }
-
-  // Set the delegate buffer handle to a tensor. It can be called in the
-  // following cases:
-  // 1. Set the buffer handle to a tensor that's not being written by a
-  //    delegate. For example, feeding an OpenGL texture as the input of the
-  //    inference graph.
-  // 2. Set the buffer handle to a tensor that uses the same delegate.
-  //    For example, set an OpenGL texture as the output of inference, while
-  //    the node which produces output is an OpenGL delegate node.
-  // WARNING: This is an experimental API and subject to change.
-  TfLiteStatus SetBufferHandle(int tensor_index,
-                               TfLiteBufferHandle buffer_handle,
-                               TfLiteDelegate* delegate);
-
-  // Get the delegate buffer handle, and the delegate which can process the
-  // buffer handle.
-  // WARNING: This is an experimental API and subject to change.
-  TfLiteStatus GetBufferHandle(int tensor_index,
-                               TfLiteBufferHandle* buffer_handle,
-                               TfLiteDelegate** delegate);
-
-  void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; }
-
-  profiling::Profiler* GetProfiler() { return profiler_; }
-
-  // The default capacity of `tensors_` vector.
-  static constexpr int kTensorsReservedCapacity = 128;
-  // The capacity headroom of `tensors_` vector before calling ops'
-  // `prepare` and `invoke` function. In these functions, it's guaranteed
-  // allocating up to `kTensorsCapacityHeadroom` more tensors won't invalidate
-  // pointers to existing tensors.
-  static constexpr int kTensorsCapacityHeadroom = 16;
-
-  // Set if buffer handle output is allowed.
-  //
-  // When using hardware delegation, Interpreter will make the data of output
-  // tensors available in `tensor->data` by default. If the application can
-  // consume the buffer handle directly (e.g. reading output from OpenGL
-  // texture), it can set this flag to false, so Interpreter won't copy the data
-  // from buffer handle to CPU memory.
-  // WARNING: This is an experimental API and subject to change.
-  void SetAllowBufferHandleOutput(bool allow_buffer_handle_output) {
-    allow_buffer_handle_output_ = allow_buffer_handle_output;
-  }
-
-  // Reset all variable tensors to the default value.
-  // If a variable tensor doesn't have a buffer, reset it to zero.
-  // TODO(b/115961645): Implement - If a variable tensor has a buffer, reset it
-  // to the value of the buffer.
-  // WARNING: This is an experimental API and subject to change.
-  TfLiteStatus ResetVariableTensors();
-
-  // Retrieve an operator's description of its work, for profiling purposes.
-  const char* OpProfilingString(const TfLiteRegistration& op_reg,
-                                const TfLiteNode* node) const {
-    if (op_reg.profiling_string == nullptr) return nullptr;
-    return op_reg.profiling_string(&context_, node);
-  }
-
-  // Set the value of an external context.
-  void SetExternalContext(TfLiteExternalContextType type,
-                          TfLiteExternalContext* ctx);
-
- private:
-  friend class InterpreterBuilder;
-  friend class InterpreterTest;
-
-  // Prevent 'context_' from accessing functions that are only available to
-  // delegated kernels.
-  void SwitchToKernelContext();
-
-  // Add delegate-only functions to 'context_'.
-  void SwitchToDelegateContext();
-
-  // Give 'op_reg' a chance to initialize itself using the contents of
-  // 'buffer'.
-  void* OpInit(const TfLiteRegistration& op_reg, const char* buffer,
-               size_t length) {
-    if (op_reg.init == nullptr) return nullptr;
-    return op_reg.init(&context_, buffer, length);
-  }
-
-  // Let 'op_reg' release any memory it might have allocated via 'OpInit'.
-  void OpFree(const TfLiteRegistration& op_reg, void* buffer) {
-    if (op_reg.free == nullptr) return;
-    if (buffer) {
-      op_reg.free(&context_, buffer);
-    }
-  }
-
-  // Prepare the given 'node' for execution.
-  TfLiteStatus OpPrepare(const TfLiteRegistration& op_reg, TfLiteNode* node) {
-    if (op_reg.prepare == nullptr) return kTfLiteOk;
-    return op_reg.prepare(&context_, node);
-  }
-
-  // Invoke the operator represented by 'node'.
-  TfLiteStatus OpInvoke(const TfLiteRegistration& op_reg, TfLiteNode* node) {
-    if (op_reg.invoke == nullptr) return kTfLiteError;
-    return op_reg.invoke(&context_, node);
-  }
-
-  // Call OpPrepare() for as many ops as possible, allocating memory for their
-  // tensors. If an op containing dynamic tensors is found, preparation will be
-  // postponed until this function is called again. This allows the interpreter
-  // to wait until Invoke() to resolve the sizes of dynamic tensors.
-  TfLiteStatus PrepareOpsAndTensors();
-
-  // Call OpPrepare() for all ops starting at 'first_node'. Stop when a
-  // dynamic tensors is found or all ops have been prepared. Fill
-  // 'last_node_prepared' with the id of the op containing dynamic tensors, or
-  // the last in the graph.
-  TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index,
-                                    int* last_execution_plan_index_prepared);
-
-  // Tensors needed by the interpreter. Use `AddTensors` to add more blank
-  // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
-  // `context_` whenever this std::vector is reallocated. Currently this
-  // only happens in `AddTensors()`.
-  std::vector<TfLiteTensor> tensors_;
-
-  // Check if an array of tensor indices are valid with respect to the Tensor
-  // array.
-  // NOTE: this changes consistent_ to be false if indices are out of bounds.
-  TfLiteStatus CheckTensorIndices(const char* label, const int* indices,
-                                  int length);
-
-  // Compute the number of bytes required to represent a tensor with dimensions
-  // specified by the array dims (of length dims_size). Returns the status code
-  // and bytes.
-  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size,
-                             size_t* bytes);
-
-  // Request an tensor be resized implementation. If the given tensor is of
-  // type kTfLiteDynamic it will also be allocated new memory.
-  TfLiteStatus ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size);
-
-  // Report a detailed error string (will be printed to stderr).
-  // TODO(aselle): allow user of class to provide alternative destinations.
-  void ReportErrorImpl(const char* format, va_list args);
-
-  // Entry point for C node plugin API to request an tensor be resized.
-  static TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor,
-                                   TfLiteIntArray* new_size);
-  // Entry point for C node plugin API to report an error.
-  static void ReportError(TfLiteContext* context, const char* format, ...);
-
-  // Entry point for C node plugin API to add new tensors.
-  static TfLiteStatus AddTensors(TfLiteContext* context, int tensors_to_add,
-                                 int* first_new_tensor_index);
-
-  // WARNING: This is an experimental API and subject to change.
-  // Entry point for C API ReplaceSubgraphsWithDelegateKernels
-  static TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
-      TfLiteContext* context, TfLiteRegistration registration,
-      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
-
-  // Update the execution graph to replace some of the nodes with stub
-  // nodes. Specifically any node index that has `nodes[index]==1` will be
-  // slated for replacement with a delegate kernel specified by registration.
-  // Ownership of 'nodes_to_replace' and 'delegate' remains with the caller.
-  // WARNING: This is an experimental interface that is subject to change.
-  TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
-      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
-      TfLiteDelegate* delegate);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Gets the internal pointer to a TensorFlow lite node by node_index.
-  TfLiteStatus GetNodeAndRegistration(int node_index, TfLiteNode** node,
-                                      TfLiteRegistration** registration);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Entry point for C node plugin API to get a node by index.
-  static TfLiteStatus GetNodeAndRegistration(struct TfLiteContext*,
-                                             int node_index, TfLiteNode** node,
-                                             TfLiteRegistration** registration);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Gets an TfLiteIntArray* representing the execution plan. The interpreter
-  // owns this memory and it is only guaranteed to exist during the invocation
-  // of the delegate prepare.
-  TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Entry point for C node plugin API to get the execution plan.
-  static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
-                                       TfLiteIntArray** execution_plan);
-
-  // Retrieve an existing external context by type.
-  TfLiteExternalContext* GetExternalContext(TfLiteExternalContextType type);
-  static TfLiteExternalContext* GetExternalContext(
-      struct TfLiteContext* context, TfLiteExternalContextType type);
-
-  // Set the value of an external context.
-  static void SetExternalContext(struct TfLiteContext* context,
-                                 TfLiteExternalContextType type,
-                                 TfLiteExternalContext* ctx);
-
-  // Variant of the public ModifyGraphWithDelegate method that additionally
-  // Assumes ownership of the provided delegate.
-  // WARNING: This is an experimental API and subject to change.
-  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegatePtr delegate,
-                                       bool allow_dynamic_tensors = false) {
-    // Note that we retain ownership of the delegate even if graph modification
-    // fails, as delegate use will be in an indeterminate state at that point.
-    owned_delegates_.push_back(std::move(delegate));
-    return ModifyGraphWithDelegate(owned_delegates_.back().get(),
-                                   allow_dynamic_tensors);
-  }
-
-  // Ensures that `tensors_` has at least `kTensorsCapacityHeadroom` extra
-  // capacity. Calling this function may invalidate existing pointers to
-  // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
-  // more tensors won't invalidate the pointer to existing tensors.
-  void EnsureTensorsVectorCapacity() {
-    const size_t required_capacity = tensors_size() + kTensorsCapacityHeadroom;
-    if (required_capacity > tensors_.capacity()) {
-      tensors_.reserve(required_capacity);
-      context_.tensors = tensors_.data();
-    }
-  }
-
-  // The state of the Interpreter.
-  enum State {
-    // The interpreter isn't ready to be invoked.
-    // `AllocateTensor` need to be called to enter an invokable state.
-    kStateUninvokable = 0,
-    // The interpreter is ready to be invoked.
-    kStateInvokable,
-    // The interpreter is ready to be invoked, and graph can't be further
-    // modified. The interpreter will enter this state when calling
-    // `ModifyGraphWithDelegate` with `allow_dynamic_tensors=false`.
-    kStateInvokableAndImmutable,
-  };
-  State state_ = kStateUninvokable;
-
-  // A pure C data structure used to communicate with the pure C plugin
-  // interface. To avoid copying tensor metadata, this is also the definitive
-  // structure to store tensors.
-  TfLiteContext context_;
-
-  // Node inputs/outputs are stored in TfLiteNode and TfLiteRegistration stores
-  // function pointers to actual implementation.
-  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>
-      nodes_and_registration_;
-
-  // Whether the model is consistent. That is to say if the inputs and outputs
-  // of every node and the global inputs and outputs are valid indexes into
-  // the tensor array.
-  bool consistent_ = true;
-
-  // Array of indices representing the tensors that are inputs to the
-  // interpreter.
-  std::vector<int> inputs_;
-
-  // Array of indices representing the tensors that are outputs to the
-  // interpreter.
-  std::vector<int> outputs_;
-
-  // Array of indices representing the tensors that are variable tensors.
-  std::vector<int> variables_;
-
-  // The error reporter delegate that tflite will forward queries errors to.
-  ErrorReporter* error_reporter_;
-
-  // Index of the next node to prepare.
-  // During Invoke(), Interpreter will allocate input tensors first, which are
-  // known to be fixed size. Then it will allocate outputs from nodes as many
-  // as possible. When there is a node that produces dynamic sized tensor.
-  // Interpreter will stop allocating tensors, set the value of next allocate
-  // node id, and execute the node to generate the output tensor before continue
-  // to allocate successors. This process repeats until all nodes are executed.
-  // NOTE: this relies on the order of nodes that is in topological order.
-  int next_execution_plan_index_to_prepare_;
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // This is a list of node indices (to index into nodes_and_registration).
-  // This represents a valid topological sort (dependency ordered) execution
-  // plan. In particular, it is valid for this ordering to contain only a
-  // subset of the node indices.
-  std::vector<int> execution_plan_;
-
-  // In the future, we'd like a TfLiteIntArray compatible representation.
-  // TODO(aselle): replace execution_plan_ with this.
-  std::unique_ptr<TfLiteIntArray, TfLiteIntArrayDeleter> plan_cache_;
-
-  // Whether to delegate to NN API
-  std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
-
-  // List of delegates that have been installed and are owned by this
-  // interpreter instance. Useful if client delegate ownership is burdensome.
-  // WARNING: This is an experimental API and subject to change.
-  // TODO(b/116667551): Use TfLiteExternalContext for storing state.
-  std::vector<TfLiteDelegatePtr> owned_delegates_;
-
-  std::unique_ptr<MemoryPlanner> memory_planner_;
-
-  bool allow_buffer_handle_output_ = false;
-
-  // Tracking bit for whether a tensor was resized in the course of an op
-  // invocation. This is a useful hint to ensure that dynamic tensor outputs
-  // trigger downstream reallocation after op invocation.
-  bool tensor_resized_since_op_invoke_ = false;
-
-  // Profiler for this interpreter instance.
-  profiling::Profiler* profiler_ = nullptr;
-
-  // List of active external contexts.
-  TfLiteExternalContext* external_contexts_[kTfLiteMaxExternalContexts];
-};
-
-}  // namespace tflite
-#endif  // TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
deleted file mode 100644
index 098ba7e7731d833678fbd5eab9cce3f022570f23..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/BUILD
+++ /dev/null
@@ -1,177 +0,0 @@
-# Description:
-# TensorFlow Lite Java API.
-
-package(default_visibility = [
-    "//tensorflow/contrib/lite/java/ovic:__pkg__",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_jni_binary")
-load("//tensorflow/contrib/lite/java:aar_with_jni.bzl", "aar_with_jni")
-
-# Building tensorflow-lite.aar including 4 variants of .so
-# To build an aar for release, run below command:
-# bazel build --cxxopt='--std=c++11' -c opt --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \
-# tensorflow/contrib/lite/java:tensorflow-lite
-aar_with_jni(
-    name = "tensorflow-lite",
-    android_library = ":tensorflowlite",
-)
-
-android_library(
-    name = "tensorflowlite",
-    srcs = glob(
-        [
-            "src/main/java/org/tensorflow/lite/*.java",
-        ],
-    ),
-    manifest = "AndroidManifest.xml",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":tflite_runtime",
-        "@org_checkerframework_qual",
-    ],
-)
-
-android_library(
-    name = "tensorflowlite_java",
-    srcs = glob(
-        [
-            "src/main/java/org/tensorflow/lite/*.java",
-        ],
-    ),
-    visibility = ["//visibility:public"],
-    deps = [
-        "@org_checkerframework_qual",
-    ],
-)
-
-java_library(
-    name = "tensorflowlitelib",
-    srcs = glob(
-        [
-            "src/main/java/org/tensorflow/lite/*.java",
-        ],
-    ),
-    javacopts = JAVACOPTS,
-    visibility = ["//visibility:public"],
-    deps = [
-        ":libtensorflowlite_jni.so",
-        "//tensorflow/contrib/lite/java/src/main/native",
-        "@org_checkerframework_qual",
-    ],
-)
-
-java_test(
-    name = "TensorFlowLiteTest",
-    size = "small",
-    srcs = ["src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java"],
-    javacopts = JAVACOPTS,
-    tags = ["no_oss"],
-    test_class = "org.tensorflow.lite.TensorFlowLiteTest",
-    deps = [
-        ":libtensorflowlite_jni.so",
-        ":tensorflowlitelib",
-        "@com_google_truth",
-        "@junit",
-    ],
-)
-
-java_test(
-    name = "DataTypeTest",
-    size = "small",
-    srcs = ["src/test/java/org/tensorflow/lite/DataTypeTest.java"],
-    javacopts = JAVACOPTS,
-    tags = ["no_oss"],
-    test_class = "org.tensorflow.lite.DataTypeTest",
-    deps = [
-        ":libtensorflowlite_jni.so",
-        ":tensorflowlitelib",
-        "@com_google_truth",
-        "@junit",
-    ],
-)
-
-java_test(
-    name = "NativeInterpreterWrapperTest",
-    size = "small",
-    srcs = ["src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java"],
-    data = [
-        "src/testdata/add.bin",
-        "src/testdata/int32.bin",
-        "src/testdata/int64.bin",
-        "src/testdata/invalid_model.bin",
-        "src/testdata/uint8.bin",
-        "src/testdata/with_custom_op.lite",
-    ],
-    javacopts = JAVACOPTS,
-    tags = ["no_oss"],
-    test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
-    deps = [
-        ":libtensorflowlite_jni.so",
-        ":tensorflowlitelib",
-        "@com_google_truth",
-        "@junit",
-    ],
-)
-
-# TODO: generate large models at runtime, instead of storing them.
-java_test(
-    name = "InterpreterTest",
-    size = "small",
-    srcs = ["src/test/java/org/tensorflow/lite/InterpreterTest.java"],
-    data = [
-        "src/testdata/add.bin",
-        "src/testdata/mobilenet.tflite.bin",
-    ],
-    javacopts = JAVACOPTS,
-    tags = ["no_oss"],
-    test_class = "org.tensorflow.lite.InterpreterTest",
-    visibility = ["//visibility:private"],
-    deps = [
-        ":libtensorflowlite_jni.so",
-        ":tensorflowlitelib",
-        "@com_google_truth",
-        "@junit",
-    ],
-)
-
-java_test(
-    name = "TensorTest",
-    size = "small",
-    srcs = ["src/test/java/org/tensorflow/lite/TensorTest.java"],
-    data = [
-        "src/testdata/add.bin",
-    ],
-    javacopts = JAVACOPTS,
-    tags = ["no_oss"],
-    test_class = "org.tensorflow.lite.TensorTest",
-    deps = [
-        ":tensorflowlitelib",
-        "@com_google_truth",
-        "@junit",
-    ],
-)
-
-filegroup(
-    name = "libtensorflowlite_jni",
-    srcs = select({
-        "//conditions:default": [":libtensorflowlite_jni.so"],
-    }),
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "tflite_runtime",
-    srcs = ["libtensorflowlite_jni.so"],
-    visibility = ["//visibility:public"],
-)
-
-tflite_jni_binary(
-    name = "libtensorflowlite_jni.so",
-    deps = [
-        "//tensorflow/contrib/lite/java/src/main/native",
-    ],
-)
diff --git a/tensorflow/contrib/lite/java/demo/README.md b/tensorflow/contrib/lite/java/demo/README.md
deleted file mode 100644
index c04b2a61942430108891c612ae410d04d373c840..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/demo/README.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# TF Lite Android Image Classifier App Example
-
-A simple Android example that demonstrates image classification using the camera.
-
-## Building in Android Studio with TensorFlow Lite AAR from JCenter.
-The build.gradle is configured to use TensorFlow Lite's nightly build.
-
-If you see a build error related to compatibility with Tensorflow Lite's Java API (example: method X is
-undefined for type Interpreter), there has likely been a backwards compatible
-change to the API. You will need to pull new app code that's compatible with the
-nightly build and may need to first wait a few days for our external and internal
-code to merge.
-
-## Building from Source with Bazel
-
-1. Follow the [Bazel steps for the TF Demo App](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel):
-
-  1. [Install Bazel and Android Prerequisites](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-bazel-and-android-prerequisites).
-     It's easiest with Android Studio.
-
-      - You'll need at least SDK version 23.
-      - Make sure to install the latest version of Bazel. Some distributions
-        ship with Bazel 0.5.4, which is too old.
-      - Bazel requires Android Build Tools `26.0.1` or higher.
-      - You also need to install the Android Support Repository, available
-        through Android Studio under `Android SDK Manager -> SDK Tools ->
-        Android Support Repository`.
-
-  2. [Edit your `WORKSPACE`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#edit-workspace)
-     to add SDK and NDK targets.
-
-     NOTE: As long as you have the SDK and NDK installed, the `./configure`
-     script will create these rules for you. Answer "Yes" when the script asks
-     to automatically configure the `./WORKSPACE`.
-
-      - Make sure the `api_level` in `WORKSPACE` is set to an SDK version that
-        you have installed.
-      - By default, Android Studio will install the SDK to `~/Android/Sdk` and
-        the NDK to `~/Android/Sdk/ndk-bundle`.
-
-2. Build the app with Bazel. The demo needs C++11:
-
-  ```shell
-  bazel build -c opt --cxxopt='--std=c++11' \
-    //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
-  ```
-
-3. Install the demo on a
-   [debug-enabled device](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install):
-
-  ```shell
-  adb install bazel-bin/tensorflow/contrib/lite/java/demo/app/src/main/TfLiteCameraDemo.apk
-  ```
diff --git a/tensorflow/contrib/lite/java/demo/app/build.gradle b/tensorflow/contrib/lite/java/demo/app/build.gradle
deleted file mode 100644
index 05301ebf88c12cc95f71d5efd74062d76e598e1d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/demo/app/build.gradle
+++ /dev/null
@@ -1,89 +0,0 @@
-apply plugin: 'com.android.application'
-
-android {
-    compileSdkVersion 26
-    buildToolsVersion "26.0.1"
-    defaultConfig {
-        applicationId "android.example.com.tflitecamerademo"
-        // Required by Camera2 API.
-        minSdkVersion 21
-        targetSdkVersion 26
-        versionCode 1
-        versionName "1.0"
-
-        // Remove this block.
-        jackOptions {
-            enabled true
-        }
-    }
-    lintOptions {
-        abortOnError false
-    }
-    buildTypes {
-        release {
-            minifyEnabled false
-            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
-        }
-    }
-    aaptOptions {
-        noCompress "tflite"
-    }
-
-    compileOptions {
-        sourceCompatibility JavaVersion.VERSION_1_8
-        targetCompatibility JavaVersion.VERSION_1_8
-    }
-}
-
-repositories {
-    maven {
-        url 'https://google.bintray.com/tensorflow'
-    }
-}
-
-dependencies {
-    compile fileTree(dir: 'libs', include: ['*.jar'])
-    compile 'com.android.support:appcompat-v7:25.2.0'
-    compile 'com.android.support.constraint:constraint-layout:1.0.2'
-    compile 'com.android.support:design:25.2.0'
-    compile 'com.android.support:support-annotations:25.3.1'
-    compile 'com.android.support:support-v13:25.2.0'
-
-    compile 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
-}
-
-def modelDownloadUrl = "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
-def localCache = "build/intermediates/mobilenet_v1_224_android_quant_2017_11_08.zip"
-def targetFolder = "src/main/assets"
-
-task downloadModel(type: DownloadUrlTask) {
-    doFirst {
-        println "Downloading ${modelDownloadUrl}"
-    }
-    sourceUrl = "${modelDownloadUrl}"
-    target = file("${localCache}")
-}
-
-task unzipModel(type: Copy, dependsOn: 'downloadModel') {
-    doFirst {
-        println "Unzipping ${localCache}"
-    }
-    from zipTree("${localCache}")
-    into "${targetFolder}"
-}
-
-// Ensure the model file is downloaded and extracted before every build
-preBuild.dependsOn unzipModel
-
-class DownloadUrlTask extends DefaultTask {
-    @Input
-    String sourceUrl
-
-    @OutputFile
-    File target
-
-    @TaskAction
-    void download() {
-        ant.get(src: sourceUrl, dest: target)
-    }
-}
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
deleted file mode 100644
index 5ad738389eb8bc1d875fc888c1336fb3fa140eee..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
+++ /dev/null
@@ -1,32 +0,0 @@
-load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
-
-package(default_visibility = ["//visibility:private"])
-
-licenses(["notice"])  # Apache 2.0
-
-android_binary(
-    name = "TfLiteCameraDemo",
-    srcs = glob(["java/**/*.java"]),
-    aapt_version = "aapt",
-    assets = [
-        "//tensorflow/contrib/lite/java/demo/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
-        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
-    ],
-    assets_dir = "",
-    custom_package = "com.example.android.tflitecamerademo",
-    manifest = "AndroidManifest.xml",
-    nocompress_extensions = [
-        ".tflite",
-    ],
-    resource_files = glob(["res/**"]),
-    # In some platforms we don't have an Android SDK/NDK and this target
-    # can't be built. We need to prevent the build system from trying to
-    # use the target in that case.
-    tags = ["manual"],
-    deps = [
-        "//tensorflow/contrib/lite/java:tensorflowlite",
-        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
-        "@androidsdk//com.android.support:support-v13-25.2.0",
-        "@androidsdk//com.android.support:support-v4-25.2.0",
-    ],
-)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
deleted file mode 100644
index ef8a9e08450d72e392815756606f5ef8301cdd58..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
+++ /dev/null
@@ -1,67 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2014 The Android Open Source Project
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<LinearLayout
-    xmlns:android="http://schemas.android.com/apk/res/android"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#bb7700"
-    android:orientation="horizontal">
-
-  <com.example.android.tflitecamerademo.AutoFitTextureView
-      android:id="@+id/texture"
-      android:layout_width="0dp"
-      android:layout_height="match_parent"
-      android:layout_weight=".8"/>
-
-  <LinearLayout
-      android:layout_width="0dp"
-      android:layout_height="match_parent"
-      android:layout_weight=".2"
-      android:orientation="vertical">
-
-    <ImageView
-        android:id="@+id/logoview"
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:scaleType="centerInside"
-        android:src="@drawable/logo"/>
-
-    <ToggleButton
-        android:id="@+id/button"
-        android:layout_width="match_parent"
-        android:layout_height="wrap_content"
-        android:textOff="@string/tflite"
-        android:textOn="@string/nnapi"/>
-    <NumberPicker
-        android:id="@+id/np"
-        android:layout_width="wrap_content"
-        android:layout_height="47dp"
-        android:layout_gravity="center_horizontal"
-        android:visibility="visible"/>
-
-    <TextView
-        android:id="@+id/text"
-        android:textStyle="bold"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:paddingTop="20dp"
-        android:textColor="#FFF"
-        android:textSize="20sp"/>
-
-  </LinearLayout>
-</LinearLayout>
-
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
deleted file mode 100644
index ddb099a950c2f83d7b2867f8f35d96885229536d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
+++ /dev/null
@@ -1,95 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2014 The Android Open Source Project
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:app="http://schemas.android.com/apk/res-auto"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#bb7700">
-
-    <com.example.android.tflitecamerademo.AutoFitTextureView
-        android:id="@+id/texture"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:layout_weight="1" />
-
-    <LinearLayout
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:layout_above="@+id/bottom_info_view"
-        android:layout_alignParentEnd="false"
-        android:layout_alignParentStart="true"
-        android:layout_alignParentTop="false"
-        android:background="#bb7700"
-        android:orientation="vertical"
-        android:weightSum="100">
-
-        <ImageView
-            android:id="@+id/logoview2"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_weight="30"
-            android:scaleType="fitStart"
-            android:src="@drawable/logo" />
-
-        <TextView
-            android:id="@+id/text"
-            android:layout_width="match_parent"
-            android:layout_height="wrap_content"
-            android:layout_alignParentBottom="true"
-            android:layout_alignParentEnd="true"
-            android:layout_alignParentRight="true"
-            android:layout_weight="30"
-            android:textColor="#FFF"
-            android:textSize="20sp"
-            android:textStyle="bold" />
-
-    </LinearLayout>
-    <LinearLayout
-        android:orientation="horizontal"
-        android:background="#513400"
-        android:layout_alignParentBottom="true"
-
-        android:layout_width="match_parent"
-        android:id="@+id/bottom_info_view"
-        android:layout_marginBottom="10dp"
-        android:layout_height="50dp">
-        <TextView
-            android:layout_width="wrap_content"
-            android:layout_height="match_parent"
-            android:textColor="@android:color/white"
-            android:textAlignment="center"
-            android:gravity="center"
-            android:text="Threads:"/>
-        <NumberPicker
-            android:id="@+id/np"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:theme="@style/AppTheme.Picker"
-            android:visibility="visible" />
-        <ToggleButton
-            android:id="@+id/button"
-            android:textOff="@string/tflite"
-            android:textOn="@string/nnapi"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:background="#0000000f"
-            android:textColor="@android:color/white" />
-    </LinearLayout>
-
-
-</RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
deleted file mode 100644
index e567009a424ed77384bee193c47d4f4d253f5767..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ /dev/null
@@ -1,94 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
- Copyright 2014 The Android Open Source Project
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:app="http://schemas.android.com/apk/res-auto"
-    android:layout_width="match_parent"
-    android:layout_height="match_parent"
-    android:background="#bb7700">
-
-    <com.example.android.tflitecamerademo.AutoFitTextureView
-        android:id="@+id/texture"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:layout_weight="1" />
-
-    <LinearLayout
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:layout_above="@+id/bottom_info_view"
-        android:layout_alignParentEnd="false"
-        android:layout_alignParentStart="true"
-        android:layout_alignParentTop="false"
-        android:background="#bb7700"
-        android:orientation="vertical"
-        android:weightSum="100">
-
-        <ImageView
-            android:id="@+id/logoview2"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_weight="30"
-            android:scaleType="fitStart"
-            android:src="@drawable/logo" />
-
-        <TextView
-            android:id="@+id/text"
-            android:layout_width="match_parent"
-            android:layout_height="wrap_content"
-            android:layout_alignParentBottom="true"
-            android:layout_alignParentEnd="true"
-            android:layout_alignParentRight="true"
-            android:layout_weight="30"
-            android:textColor="#FFF"
-            android:textSize="20sp"
-            android:textStyle="bold" />
-
-    </LinearLayout>
-    <LinearLayout
-        android:orientation="horizontal"
-        android:background="#aa7700"
-        android:layout_alignParentBottom="true"
-
-        android:layout_width="match_parent"
-        android:id="@+id/bottom_info_view"
-        android:layout_marginBottom="10dp"
-        android:layout_height="50dp">
-        <TextView
-            android:layout_width="wrap_content"
-            android:layout_height="match_parent"
-            android:textColor="@android:color/white"
-            android:textAlignment="center"
-            android:gravity="center"
-            android:text="@string/threads" />
-        <NumberPicker
-            android:id="@+id/np"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:theme="@style/AppTheme.Picker"
-            android:visibility="visible" />
-        <ToggleButton
-            android:id="@+id/button"
-            android:textOff="@string/tflite"
-            android:textOn="@string/nnapi"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:background="#0000000f"
-            android:textColor="@android:color/white" />
-
-    </LinearLayout>
-</RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/ovic/BUILD b/tensorflow/contrib/lite/java/ovic/BUILD
deleted file mode 100644
index ea9b9ed4b66a601981f4c402f7f8a4f6749e07fd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/ovic/BUILD
+++ /dev/null
@@ -1,132 +0,0 @@
-# Description:
-# OVIC Benchmarker Java API.
-
-load("@build_bazel_rules_android//android:rules.bzl", "android_library")
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
-
-# Build targets for OVIC classification.
-java_test(
-    name = "OvicClassifierTest",
-    size = "medium",
-    srcs = ["src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
-    data = [
-        "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt",
-        "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata",
-    ],
-    javacopts = JAVACOPTS,
-    tags = ["no_oss"],
-    test_class = "org.tensorflow.ovic.OvicClassifierTest",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib_java",
-        "@com_google_truth",
-        "@junit",
-    ],
-)
-
-java_binary(
-    name = "ovic_validator",
-    srcs = ["src/main/java/org/tensorflow/ovic/OvicValidator.java"],
-    data = [
-        "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt",
-    ],
-    main_class = "org.tensorflow.ovic.OvicValidator",
-    tags = ["no_oss"],
-    deps = [
-        "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib_java",
-    ],
-)
-
-android_library(
-    name = "ovicbenchmarkerlib",
-    srcs = [
-        "src/main/java/org/tensorflow/ovic/OvicBenchmarker.java",
-        "src/main/java/org/tensorflow/ovic/OvicClassificationResult.java",
-        "src/main/java/org/tensorflow/ovic/OvicClassifier.java",
-        "src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java",
-    ],
-    manifest = "//tensorflow/contrib/lite/java:AndroidManifest.xml",
-    tags = ["no_oss"],
-    deps = [
-        "//tensorflow/contrib/lite/java:tensorflowlite",
-        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
-        "@org_checkerframework_qual",
-    ],
-)
-
-java_library(
-    name = "ovicbenchmarkerlib_java",
-    srcs = [
-        "src/main/java/org/tensorflow/ovic/OvicClassificationResult.java",
-        "src/main/java/org/tensorflow/ovic/OvicClassifier.java",
-    ],
-    javacopts = JAVACOPTS,
-    tags = ["no_oss"],
-    deps = [
-        "//tensorflow/contrib/lite/java:libtensorflowlite_jni.so",
-        "//tensorflow/contrib/lite/java:tensorflowlite_java",
-        "//tensorflow/contrib/lite/java/src/main/native",
-        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
-        "@org_checkerframework_qual",
-    ],
-)
-
-# Build targets for OVIC detection.
-java_test(
-    name = "OvicDetectorTest",
-    size = "medium",
-    srcs = ["src/test/java/org/tensorflow/ovic/OvicDetectorTest.java"],
-    data = [
-        "//tensorflow/contrib/lite/java/ovic/src/testdata:coco_labels.txt",
-        "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata",
-        "@tflite_mobilenet_ssd_quant//:detect.tflite",
-    ],
-    javacopts = JAVACOPTS,
-    tags = ["no_oss"],
-    test_class = "org.tensorflow.ovic.OvicDetectorTest",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/contrib/lite/java/ovic:ovicdetectionbenchmarkerlib_java",
-        "@com_google_truth",
-        "@junit",
-    ],
-)
-
-android_library(
-    name = "ovicdetectionbenchmarkerlib",
-    srcs = [
-        "src/main/java/org/tensorflow/ovic/BoundingBox.java",
-        "src/main/java/org/tensorflow/ovic/OvicBenchmarker.java",
-        "src/main/java/org/tensorflow/ovic/OvicDetectionResult.java",
-        "src/main/java/org/tensorflow/ovic/OvicDetector.java",
-        "src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java",
-    ],
-    manifest = "//tensorflow/contrib/lite/java:AndroidManifest.xml",
-    deps = [
-        "//tensorflow/contrib/lite/java:tensorflowlite",
-        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
-        "@org_checkerframework_qual",
-    ],
-)
-
-java_library(
-    name = "ovicdetectionbenchmarkerlib_java",
-    srcs = [
-        "src/main/java/org/tensorflow/ovic/BoundingBox.java",
-        "src/main/java/org/tensorflow/ovic/OvicDetectionResult.java",
-        "src/main/java/org/tensorflow/ovic/OvicDetector.java",
-    ],
-    javacopts = JAVACOPTS,
-    deps = [
-        "//tensorflow/contrib/lite/java:libtensorflowlite_jni.so",
-        "//tensorflow/contrib/lite/java:tensorflowlite_java",
-        "//tensorflow/contrib/lite/java/src/main/native",
-        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
-        "@org_checkerframework_qual",
-    ],
-)
diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md
deleted file mode 100644
index df77bfaab3251c0ebe2e377e84d11965fdb821dd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/ovic/README.md
+++ /dev/null
@@ -1,158 +0,0 @@
-# Benchmarker for LPIRC Workshop at CVPR 2018
-
-This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018)
-
-## Pre-requisite
-
-Follow the steps [here](https://www.tensorflow.org/lite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK.
-
-## Test the benchmarker:
-
-The testing utilities helps the developers (you) to make sure that your submissions in TfLite format will be processed as expected in the competition's benchmarking system.
-
-Note: for now the tests only provides correctness checks, i.e. classifier predicts the correct category on the test image, but no on-device latency measurements. To test the latency measurement functionality, the tests will print the latency running on a desktop computer, which is not indicative of the on-device run-time.
-We are releasing an benchmarker Apk that would allow developers to measure latency on their own devices.
-
-### Obtain the sample models
-
-The test data (models and images) should be downloaded automatically for you by Bazel. In case they are not, you can manually install them as below.
-
-Note: all commands should be called from your tensorflow installation folder (under this folder you should find `tensorflow/contrib/lite`).
-
-
-* Download the [testdata package](https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip):
-
-```sh
-curl -L https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip -o /tmp/ovic.zip
-```
-
-* Unzip the package into the testdata folder:
-
-```sh
-unzip -j /tmp/ovic.zip -d tensorflow/contrib/lite/java/ovic/src/testdata/
-```
-
-### Run tests
-
-You can run test with Bazel as below. This helps to ensure that the installation is correct.
-
-```sh
-bazel test --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:OvicClassifierTest --cxxopt=-Wno-all --test_output=all
-```
-
-### Test your submissions
-
-Once you have a submission that follows the instructions from the [competition site](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018), you can verify it in two ways:
-
-#### Validate using randomly generated images
-
-You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]). Let say the submission file is located at `/path/to/my_model.lite`, then call:
-
-```sh
-bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all
-bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite
-```
-
-Successful validation should print the following message to terminal:
-
-```
-Successfully validated /path/to/my_model.lite.
-
-```
-
-#### Test that the model produces sensible outcomes
-
-You can go a step further to verify that the model produces results as expected. This helps you catch bugs during TOCO conversion (e.g. using the wrong mean and std values).
-
-* Move your submission to the testdata folder:
-
-```sh
-cp /path/to/my_model.lite tensorflow/contrib/lite/java/ovic/src/testdata/
-```
-
-* Resize the test image to the resolutions that are expected by your submission:
-
-The test images can be found at `tensorflow/contrib/lite/java/ovic/src/testdata/test_image_*.jpg`. You may reuse these images if your image resolutions are 128x128 or 224x224.
-
-* Add your model and test image to the BUILD rule at `tensorflow/contrib/lite/java/ovic/src/testdata/BUILD`:
-
-```JSON
-filegroup(
-    name = "ovic_testdata",
-    srcs = [
-        "@tflite_ovic_testdata//:float_model.lite",
-        "@tflite_ovic_testdata//:low_res_model.lite",
-        "@tflite_ovic_testdata//:quantized_model.lite",
-        "@tflite_ovic_testdata//:test_image_128.jpg",
-        "@tflite_ovic_testdata//:test_image_224.jpg"
-        "my_model.lite",        # <--- Your submission.
-        "my_test_image.jpg",    # <--- Your test image.
-    ],
-    ...
-```
-
-* Modify `OvicClassifierTest.java` to test your model.
-
-Change `TEST_IMAGE_PATH` to `my_test_image.jpg`. Change either `FLOAT_MODEL_PATH` or `QUANTIZED_MODEL_PATH` to `my_model.lite` depending on whether your model runs inference in float or [8-bit](https://www.tensorflow.org/performance/quantization).
-
-Now you can run the bazel tests to catch any runtime issues with the submission.
-
-Note: Please make sure that your submission passes the test. If a submission fails to pass the test it will not be processed by the submission server.
-
-## Measure on-device latency
-
-We provide two ways to measure the on-device latency of your submission. The first is through our competition server, which is reliable and repeatable, but is limited to a few trials per day. The second is through the benchmarker Apk, which requires a device and may not be as accurate as the server, but has a fast turn-around and no access limitations. We recommend that the participants use the benchmarker apk for early development, and reserve the competition server for evaluating promising submissions.
-
-### Running the benchmarker app
-
-Make sure that you have followed instructions in [Test your submissions](#test-your-submissions) to add your model to the testdata folder and to the corresponding build rules.
-
-Modify `tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java`:
-
-* Add your model to the benchmarker apk by changing `MODEL_PATH` and `TEST_IMAGE_PATH` below to your submission and test image.
-
-```
-  private static final String TEST_IMAGE_PATH = "my_test_image.jpg";
-  private static final String MODEL_PATH = "my_model.lite";
-```
-
-* Adjust the benchmark parameters when needed:
-
-You can chnage the length of each experiment, and the processor affinity below. `BIG_CORE_MASK` is an integer whose binary encoding represents the set of used cores. This number is phone-specific. For example, Pixel 2 has 8 cores: the 4 little cores are represented by the 4 less significant bits, and the 4 big cores by the 4 more significant bits. Therefore a mask value of 16, or in binary `00010000`, represents using only the first big core. The mask 32, or in binary `00100000` uses the second big core and should deliver identical results as the mask 16 because the big cores are interchangeable.
-
-```
-  /** Wall time for each benchmarking experiment. */
-  private static final double WALL_TIME = 3000;
-  /** Maximum number of iterations in each benchmarking experiment. */
-  private static final int MAX_ITERATIONS = 100;
-  /** Mask for binding to a single big core. Pixel 1 (4), Pixel 2 (16). */
-  private static final int BIG_CORE_MASK = 16;
-```
-
-Note: You'll need ROOT access to the phone to change processor affinity.
-
-* Build and install the app.
-
-```
-bazel build -c opt --cxxopt=--std=c++11 --cxxopt=-Wno-all //tensorflow/contrib/lite/java/ovic/demo/app:ovic_benchmarker_binary
-adb install -r bazel-bin/tensorflow/contrib/lite/java/ovic/demo/app/ovic_benchmarker_binary.apk
-```
-
-Start the app and click the `Start` button in dark green. The button should turn bright green, signaling that the experiment is running. The benchmarking results will be displayed after about the `WALL_TIME` you specified above. For example:
-
-```
-my_model.lite: Average latency=158.6ms after 20 runs.
-```
-
-### Sample latencies
-
-Note: the benchmarking results can be quite different depending on the background processes running on the phone. A few things that help stabilize the app's readings are placing the phone on a cooling plate, restarting the phone, and shutting down internet access.
-
-| Model                | Pixel 1 latency (ms)  | Pixel 2 latency (ms) |
-| -------------------- |:---------------------:| --------------------:|
-|  float_model.lite    | 120                   | 155                  |
-| quantized_model.lite | 85                    | 74                   |
-|  low_res_model.lite  | 4.2                   | 4.0                  |
-
-Since Pixel 2 has excellent support for 8-bit quantized models, we strongly recommend you to check out the [quantization training tutorial](https://www.tensorflow.org/performance/quantization).
-
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/BUILD b/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
deleted file mode 100644
index f567358ea33966ea8fdb422749662e22111c5fcc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/ovic/demo/app/BUILD
+++ /dev/null
@@ -1,34 +0,0 @@
-load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
-
-# Sample app for OVIC benchmarking.
-licenses(["notice"])  # Apache 2.0
-
-android_binary(
-    name = "ovic_benchmarker_binary",
-    srcs = [
-        "OvicBenchmarkerActivity.java",
-    ],
-    aapt_version = "aapt",
-    assets = [
-        "//tensorflow/contrib/lite/java/ovic/src/testdata:coco_labels.txt",
-        "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt",
-        "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata",
-        "@tflite_mobilenet_ssd_quant//:detect.tflite",
-    ],
-    assets_dir = "",
-    custom_package = "ovic.demo.app",
-    manifest = "AndroidManifest.xml",
-    nocompress_extensions = [
-        ".lite",
-        ".tflite",
-    ],
-    resource_files = glob(["res/**"]),
-    tags = ["manual"],
-    deps = [
-        "//tensorflow/contrib/lite/java:tensorflowlite",
-        "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib",
-        "//tensorflow/contrib/lite/java/ovic:ovicdetectionbenchmarkerlib",
-        "@androidsdk//com.android.support:support-v13-25.2.0",
-        "@androidsdk//com.android.support:support-v4-25.2.0",
-    ],
-)
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java b/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java
deleted file mode 100644
index baa14baf920fee0b0a2feecee7e65ef5a9e96f95..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*Copyright 2018 Google LLC
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    https://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-package org.tensorflow.ovic;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PrintStream;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.Random;
-
-/** Validate a submission model. */
-public class OvicValidator {
-  private static void printUsage(PrintStream s) {
-    s.println("Java program that validates a submission model.");
-    s.println();
-    s.println("Usage: ovic_validator <submission file>");
-    s.println();
-    s.println("Where:");
-    s.println("<submission file> is the model in TfLite format;");
-  }
-
-  public static void main(String[] args) {
-    if (args.length != 1) {
-      printUsage(System.err);
-      System.exit(1);
-    }
-    final String labelPath =
-        "tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
-
-    final String modelFile = args[0];
-    try {
-      File labelsfile = new File(labelPath);
-      InputStream labelsInputStream = new FileInputStream(labelsfile);
-      MappedByteBuffer model = loadModelFile(modelFile);
-      OvicClassifier classifier = new OvicClassifier(labelsInputStream, model);
-      ByteBuffer imgData = createByteBufferForClassifier(classifier);
-      OvicClassificationResult testResult = classifier.classifyByteBuffer(imgData);
-      if (testResult.topKClasses.isEmpty()) {
-        throw new RuntimeException("Failed to return top K predictions.");
-      }
-      System.out.printf("Successfully validated %s.%n", modelFile);
-    } catch (Exception e) {
-      System.out.println(e.getMessage());
-      System.out.printf("Failed to validate %s.%n", modelFile);
-    }
-  }
-
-  private static ByteBuffer createByteBufferForClassifier(OvicClassifier classifier) {
-    if (classifier == null) {
-      throw new RuntimeException("Cannot create image buffer with the classifier.");
-    }
-    int[] inputDims = classifier.getInputDims();
-    int imgHeight = inputDims[1];
-    int imgWidth = inputDims[2];
-    ByteBuffer imgData = ByteBuffer.allocateDirect(imgHeight * imgWidth * 3);
-    imgData.order(ByteOrder.nativeOrder());
-    Random rand = new Random();
-    for (int y = 0; y < imgHeight; y++) {
-      for (int x = 0; x < imgWidth; x++) {
-        int val = rand.nextInt();
-        imgData.put((byte) ((val >> 16) & 0xFF));
-        imgData.put((byte) ((val >> 8) & 0xFF));
-        imgData.put((byte) (val & 0xFF));
-      }
-    }
-    return imgData;
-  }
-
-  private static MappedByteBuffer loadModelFile(String modelFilePath) throws IOException {
-    File modelfile = new File(modelFilePath);
-    FileInputStream inputStream = new FileInputStream(modelfile);
-    FileChannel fileChannel = inputStream.getChannel();
-    long startOffset = 0L;
-    long declaredLength = fileChannel.size();
-    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
-  }
-}
diff --git a/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD b/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD
deleted file mode 100644
index 051aa2204efd37fbcb12fb8ae67195780ffffad6..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/ovic/src/testdata/BUILD
+++ /dev/null
@@ -1,22 +0,0 @@
-# Testdata for OVIC benchmarker demo App and tests.
-licenses(["notice"])  # Apache 2.0
-
-filegroup(
-    name = "ovic_testdata",
-    srcs = [
-        "@tflite_ovic_testdata//:float_model.lite",
-        "@tflite_ovic_testdata//:low_res_model.lite",
-        "@tflite_ovic_testdata//:quantized_model.lite",
-        "@tflite_ovic_testdata//:test_image_128.jpg",
-        "@tflite_ovic_testdata//:test_image_224.jpg",
-    ],
-    visibility = ["//visibility:public"],
-)
-
-exports_files(
-    [
-        "labels.txt",
-        "coco_labels.txt",
-    ],
-    visibility = ["//visibility:public"],
-)
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java
deleted file mode 100644
index 711638a9f995ce270cd362b93a7bcfca990430dc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow.lite;
-
-/** Static utility methods loading the TensorFlowLite runtime. */
-public final class TensorFlowLite {
-
-  private static final String LIBNAME = "tensorflowlite_jni";
-
-  private TensorFlowLite() {}
-
-  /** Returns the version of the underlying TensorFlowLite runtime. */
-  public static native String version();
-
-  /**
-   * Load the TensorFlowLite runtime C library.
-   */
-  static boolean init() {
-    try {
-      System.loadLibrary(LIBNAME);
-      return true;
-    } catch (UnsatisfiedLinkError e) {
-      System.err.println("TensorFlowLite: failed to load native library: " + e.getMessage());
-      return false;
-    }
-  }
-
-  static {
-    init();
-  }
-}
diff --git a/tensorflow/contrib/lite/java/src/main/native/BUILD b/tensorflow/contrib/lite/java/src/main/native/BUILD
deleted file mode 100644
index 4b4e1c21d818dc56803ff31d83d19dea2ac08707..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/src/main/native/BUILD
+++ /dev/null
@@ -1,96 +0,0 @@
-# Description:
-# Java Native Interface (JNI) library intended for implementing the
-# TensorFlow Lite Java API using the TensorFlow Lite CC library.
-
-package(default_visibility = ["//visibility:public"])
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-
-licenses(["notice"])  # Apache 2.0
-
-cc_library(
-    name = "native_framework_only",
-    srcs = [
-        "exception_jni.cc",
-        "nativeinterpreterwrapper_jni.cc",
-        "tensor_jni.cc",
-        "tensorflow_lite_jni.cc",
-    ] + select({
-        # The Android toolchain makes "jni.h" available in the include path.
-        # For non-Android toolchains, generate jni.h and jni_md.h.
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            ":jni.h",
-            ":jni_md.h",
-        ],
-    }),
-    hdrs = [
-        "exception_jni.h",
-        "nativeinterpreterwrapper_jni.h",
-        "tensor_jni.h",
-        "tensorflow_lite_jni.h",
-    ],
-    copts = tflite_copts(),
-    includes = select({
-        "//tensorflow:android": [],
-        "//conditions:default": ["."],
-    }),
-    linkopts = [
-        "-lm",
-        "-ldl",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite:context",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-    ],
-    alwayslink = 1,
-)
-
-# Silly rules to make
-# #include <jni.h>
-# in the source headers work
-# (in combination with the "includes" attribute of the tf_cuda_library rule
-# above. Not needed when using the Android toolchain).
-#
-# Inspired from:
-# https://github.com/bazelbuild/bazel/blob/f99a0543f8d97339d32075c7176b79f35be84606/src/main/native/BUILD
-# but hopefully there is a simpler alternative to this.
-genrule(
-    name = "copy_jni_h",
-    srcs = ["@bazel_tools//tools/jdk:jni_header"],
-    outs = ["jni.h"],
-    cmd = "cp -f $< $@",
-)
-
-genrule(
-    name = "copy_jni_md_h",
-    srcs = select({
-        "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
-        "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
-    }),
-    outs = ["jni_md.h"],
-    cmd = "cp -f $< $@",
-)
-
-# This includes all ops. If you want a smaller binary, you should copy and
-# modify builtin_ops_jni.cc.  You should then link your binary against both
-# ":native_framework_only" and your own version of ":native_builtin_ops".
-cc_library(
-    name = "native",
-    srcs = [
-        "builtin_ops_jni.cc",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        ":native_framework_only",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ],
-    alwayslink = 1,
-)
-
-exports_files(
-    [
-        "version_script.lds",
-    ],
-)
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
deleted file mode 100644
index d3378f5f145deef375b38777fa27046993e15a6c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
+++ /dev/null
@@ -1,318 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/java/src/main/native/tensor_jni.h"
-#include <cstring>
-#include <memory>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/java/src/main/native/exception_jni.h"
-
-namespace {
-
-// Convenience handle for obtaining a TfLiteTensor given an interpreter and
-// tensor index.
-//
-// Historically, the Java Tensor class used a TfLiteTensor pointer as its native
-// handle. However, this approach isn't generally safe, as the interpreter may
-// invalidate all TfLiteTensor* handles during inference or allocation.
-class TensorHandle {
- public:
-  TensorHandle(tflite::Interpreter* interpreter, int tensor_index)
-      : interpreter_(interpreter), tensor_index_(tensor_index) {}
-
-  TfLiteTensor* tensor() const { return interpreter_->tensor(tensor_index_); }
-
- private:
-  tflite::Interpreter* const interpreter_;
-  const int tensor_index_;
-};
-
-TfLiteTensor* GetTensorFromHandle(JNIEnv* env, jlong handle) {
-  if (handle == 0) {
-    throwException(env, kIllegalArgumentException,
-                   "Internal error: Invalid handle to TfLiteTensor.");
-    return nullptr;
-  }
-  return reinterpret_cast<TensorHandle*>(handle)->tensor();
-}
-
-size_t elementByteSize(TfLiteType data_type) {
-  // The code in this file makes the assumption that the
-  // TensorFlow TF_DataTypes and the Java primitive types
-  // have the same byte sizes. Validate that:
-  switch (data_type) {
-    case kTfLiteFloat32:
-      static_assert(sizeof(jfloat) == 4,
-                    "Interal error: Java float not compatible with "
-                    "kTfLiteFloat");
-      return 4;
-    case kTfLiteInt32:
-      static_assert(sizeof(jint) == 4,
-                    "Interal error: Java int not compatible with kTfLiteInt");
-      return 4;
-    case kTfLiteUInt8:
-      static_assert(sizeof(jbyte) == 1,
-                    "Interal error: Java byte not compatible with "
-                    "kTfLiteUInt8");
-      return 1;
-    case kTfLiteInt64:
-      static_assert(sizeof(jlong) == 8,
-                    "Interal error: Java long not compatible with "
-                    "kTfLiteInt64");
-      return 8;
-    default:
-      return 0;
-  }
-}
-
-size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
-                                void* dst, size_t dst_size) {
-  jarray array = static_cast<jarray>(object);
-  const int num_elements = env->GetArrayLength(array);
-  size_t to_copy = num_elements * elementByteSize(type);
-  if (to_copy > dst_size) {
-    throwException(env, kIllegalStateException,
-                   "Internal error: cannot write Java array of %d bytes to "
-                   "Tensor of %d bytes",
-                   to_copy, dst_size);
-    return 0;
-  }
-  switch (type) {
-    case kTfLiteFloat32: {
-      jfloatArray float_array = static_cast<jfloatArray>(array);
-      jfloat* float_dst = static_cast<jfloat*>(dst);
-      env->GetFloatArrayRegion(float_array, 0, num_elements, float_dst);
-      return to_copy;
-    }
-    case kTfLiteInt32: {
-      jintArray int_array = static_cast<jintArray>(array);
-      jint* int_dst = static_cast<jint*>(dst);
-      env->GetIntArrayRegion(int_array, 0, num_elements, int_dst);
-      return to_copy;
-    }
-    case kTfLiteInt64: {
-      jlongArray long_array = static_cast<jlongArray>(array);
-      jlong* long_dst = static_cast<jlong*>(dst);
-      env->GetLongArrayRegion(long_array, 0, num_elements, long_dst);
-      return to_copy;
-    }
-    case kTfLiteUInt8: {
-      jbyteArray byte_array = static_cast<jbyteArray>(array);
-      jbyte* byte_dst = static_cast<jbyte*>(dst);
-      env->GetByteArrayRegion(byte_array, 0, num_elements, byte_dst);
-      return to_copy;
-    }
-    default: {
-      throwException(env, kUnsupportedOperationException,
-                     "DataType error: TensorFlowLite currently supports float "
-                     "(32 bits), int (32 bits), byte (8 bits), and long "
-                     "(64 bits), support for other types (DataType %d in this "
-                     "case) will be added in the future",
-                     kTfLiteFloat32, type);
-      return 0;
-    }
-  }
-}
-
-size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
-                               const void* src, size_t src_size, jarray dst) {
-  const int len = env->GetArrayLength(dst);
-  const size_t size = len * elementByteSize(data_type);
-  if (size > src_size) {
-    throwException(
-        env, kIllegalStateException,
-        "Internal error: cannot fill a Java array of %d bytes with a Tensor of "
-        "%d bytes",
-        size, src_size);
-    return 0;
-  }
-  switch (data_type) {
-    case kTfLiteFloat32: {
-      jfloatArray float_array = static_cast<jfloatArray>(dst);
-      env->SetFloatArrayRegion(float_array, 0, len,
-                               static_cast<const jfloat*>(src));
-      return size;
-    }
-    case kTfLiteInt32: {
-      jintArray int_array = static_cast<jintArray>(dst);
-      env->SetIntArrayRegion(int_array, 0, len, static_cast<const jint*>(src));
-      return size;
-    }
-    case kTfLiteInt64: {
-      jlongArray long_array = static_cast<jlongArray>(dst);
-      env->SetLongArrayRegion(long_array, 0, len,
-                              static_cast<const jlong*>(src));
-      return size;
-    }
-    case kTfLiteUInt8: {
-      jbyteArray byte_array = static_cast<jbyteArray>(dst);
-      env->SetByteArrayRegion(byte_array, 0, len,
-                              static_cast<const jbyte*>(src));
-      return size;
-    }
-    default: {
-      throwException(env, kIllegalStateException,
-                     "DataType error: invalid DataType(%d)", data_type);
-    }
-  }
-  return 0;
-}
-
-size_t readMultiDimensionalArray(JNIEnv* env, TfLiteType data_type, char* src,
-                                 size_t src_size, int dims_left, jarray dst) {
-  if (dims_left == 1) {
-    return readOneDimensionalArray(env, data_type, src, src_size, dst);
-  } else {
-    jobjectArray ndarray = static_cast<jobjectArray>(dst);
-    int len = env->GetArrayLength(ndarray);
-    size_t size = 0;
-    for (int i = 0; i < len; ++i) {
-      jarray row = static_cast<jarray>(env->GetObjectArrayElement(ndarray, i));
-      size += readMultiDimensionalArray(env, data_type, src + size,
-                                        src_size - size, dims_left - 1, row);
-      env->DeleteLocalRef(row);
-      if (env->ExceptionCheck()) return size;
-    }
-    return size;
-  }
-}
-
-size_t writeMultiDimensionalArray(JNIEnv* env, jobject src, TfLiteType type,
-                                  int dims_left, char** dst, int dst_size) {
-  if (dims_left <= 1) {
-    return writeOneDimensionalArray(env, src, type, *dst, dst_size);
-  } else {
-    jobjectArray ndarray = static_cast<jobjectArray>(src);
-    int len = env->GetArrayLength(ndarray);
-    size_t sz = 0;
-    for (int i = 0; i < len; ++i) {
-      jobject row = env->GetObjectArrayElement(ndarray, i);
-      char* next_dst = *dst + sz;
-      sz += writeMultiDimensionalArray(env, row, type, dims_left - 1, &next_dst,
-                                       dst_size - sz);
-      env->DeleteLocalRef(row);
-      if (env->ExceptionCheck()) return sz;
-    }
-    return sz;
-  }
-}
-
-}  // namespace
-
-JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_Tensor_create(
-    JNIEnv* env, jclass clazz, jlong interpreter_handle, jint tensor_index) {
-  tflite::Interpreter* interpreter =
-      reinterpret_cast<tflite::Interpreter*>(interpreter_handle);
-  return reinterpret_cast<jlong>(new TensorHandle(interpreter, tensor_index));
-}
-
-JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_delete(JNIEnv* env,
-                                                              jclass clazz,
-                                                              jlong handle) {
-  delete reinterpret_cast<TensorHandle*>(handle);
-}
-
-JNIEXPORT jobject JNICALL Java_org_tensorflow_lite_Tensor_buffer(JNIEnv* env,
-                                                                 jclass clazz,
-                                                                 jlong handle) {
-  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
-  if (tensor == nullptr) return nullptr;
-  if (tensor->data.raw == nullptr) {
-    throwException(env, kIllegalArgumentException,
-                   "Internal error: Tensor hasn't been allocated.");
-    return nullptr;
-  }
-  return env->NewDirectByteBuffer(static_cast<void*>(tensor->data.raw),
-                                  static_cast<jlong>(tensor->bytes));
-}
-
-JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_writeDirectBuffer(
-    JNIEnv* env, jclass clazz, jlong handle, jobject src) {
-  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
-  if (tensor == nullptr) return;
-
-  char* src_data_raw = static_cast<char*>(env->GetDirectBufferAddress(src));
-  if (!src_data_raw) {
-    throwException(env, kIllegalArgumentException,
-                   "Input ByteBuffer is not a direct buffer");
-    return;
-  }
-
-  tensor->data.raw = src_data_raw;
-}
-
-JNIEXPORT void JNICALL
-Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env,
-                                                          jclass clazz,
-                                                          jlong handle,
-                                                          jobject value) {
-  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
-  if (tensor == nullptr) return;
-  int num_dims = tensor->dims->size;
-  if (num_dims == 0) {
-    throwException(env, kIllegalArgumentException,
-                   "Internal error: Cannot copy empty/scalar Tensors.");
-    return;
-  }
-  readMultiDimensionalArray(env, tensor->type, tensor->data.raw, tensor->bytes,
-                            num_dims, static_cast<jarray>(value));
-}
-
-JNIEXPORT void JNICALL
-Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env,
-                                                           jclass clazz,
-                                                           jlong handle,
-                                                           jobject src) {
-  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
-  if (tensor == nullptr) return;
-  if (tensor->data.raw == nullptr) {
-    throwException(env, kIllegalArgumentException,
-                   "Internal error: Target Tensor hasn't been allocated.");
-    return;
-  }
-  if (tensor->dims->size == 0) {
-    throwException(env, kIllegalArgumentException,
-                   "Internal error: Cannot copy empty/scalar Tensors.");
-    return;
-  }
-  writeMultiDimensionalArray(env, src, tensor->type, tensor->dims->size,
-                             &tensor->data.raw, tensor->bytes);
-}
-
-JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_dtype(JNIEnv* env,
-                                                             jclass clazz,
-                                                             jlong handle) {
-  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
-  if (tensor == nullptr) return 0;
-  return static_cast<jint>(tensor->type);
-}
-
-JNIEXPORT jintArray JNICALL
-Java_org_tensorflow_lite_Tensor_shape(JNIEnv* env, jclass clazz, jlong handle) {
-  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
-  if (tensor == nullptr) return nullptr;
-  int num_dims = tensor->dims->size;
-  jintArray result = env->NewIntArray(num_dims);
-  env->SetIntArrayRegion(result, 0, num_dims, tensor->dims->data);
-  return result;
-}
-
-JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_numBytes(JNIEnv* env,
-                                                                jclass clazz,
-                                                                jlong handle) {
-  const TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
-  if (tensor == nullptr) return 0;
-  return static_cast<jint>(tensor->bytes);
-}
diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
deleted file mode 100644
index af1d99ef41e6413d8ef2c6f478aaa8f9e3931ff8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-# Description:
-# Internal helper function to test TF Lite API.
-
-load("@build_bazel_rules_android//android:rules.bzl", "android_library")
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-android_library(
-    name = "testhelper",
-    srcs = glob(
-        [
-            "*.java",
-        ],
-    ),
-    deps = [
-        "//tensorflow/contrib/lite/java:tensorflowlite_java",
-    ],
-)
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
deleted file mode 100644
index 95e387814d53eac774c27b57a016e3845372d29f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ /dev/null
@@ -1,1333 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_opts_nortti_if_android")
-
-# Suppress warnings that are introduced by Eigen Tensor.
-EXTRA_EIGEN_COPTS = select({
-    "//tensorflow:ios": [
-        "-Wno-error=invalid-partial-specialization",
-        "-Wno-error=reorder",
-    ],
-    "//tensorflow:windows": [
-        "/DEIGEN_HAS_C99_MATH",
-        "/DEIGEN_AVOID_STL_ARRAY",
-    ],
-    "//conditions:default": ["-Wno-error=reorder"],
-})
-
-tf_cc_test(
-    name = "optional_tensor_test",
-    size = "small",
-    srcs = ["optional_tensor_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "test_util",
-    testonly = 1,
-    srcs = ["test_util.cc"],
-    hdrs = ["test_util.h"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
-        "//tensorflow/contrib/lite/testing:util",
-        "//tensorflow/core:tflite_portable_logging",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "eigen_support",
-    srcs = [
-        "eigen_support.cc",
-    ],
-    hdrs = [
-        "eigen_support.h",
-    ],
-    copts = tflite_copts() + EXTRA_EIGEN_COPTS,
-    deps = [
-        ":op_macros",
-        "//tensorflow/contrib/lite:arena_planner",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels/internal:optimized",
-    ],
-)
-
-cc_library(
-    name = "gemm_support",
-    srcs = [
-        "gemm_support.cc",
-    ],
-    hdrs = [
-        "gemm_support.h",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        ":op_macros",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "@gemmlowp",
-    ],
-)
-
-cc_library(
-    name = "activation_functor",
-    hdrs = [
-        "activation_functor.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ],
-)
-
-cc_library(
-    name = "op_macros",
-    hdrs = [
-        "op_macros.h",
-    ],
-)
-
-cc_library(
-    name = "kernel_util",
-    srcs = [
-        "kernel_util.cc",
-    ],
-    hdrs = [
-        "kernel_util.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels/internal:round",
-        "//tensorflow/contrib/lite/kernels/internal:types",
-    ],
-)
-
-tf_cc_test(
-    name = "kernel_util_test",
-    size = "small",
-    srcs = ["kernel_util_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":kernel_util",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "test_util_test",
-    size = "small",
-    srcs = ["test_util_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":test_util",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "padding",
-    srcs = [],
-    hdrs = ["padding.h"],
-    deps = [
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ],
-)
-
-cc_library(
-    name = "builtin_op_kernels",
-    srcs = [
-        "activations.cc",
-        "add.cc",
-        "arg_min_max.cc",
-        "audio_spectrogram.cc",
-        "basic_rnn.cc",
-        "batch_to_space_nd.cc",
-        "bidirectional_sequence_lstm.cc",
-        "bidirectional_sequence_rnn.cc",
-        "cast.cc",
-        "comparisons.cc",
-        "concatenation.cc",
-        "conv.cc",
-        "depthwise_conv.cc",
-        "dequantize.cc",
-        "detection_postprocess.cc",
-        "div.cc",
-        "elementwise.cc",
-        "embedding_lookup.cc",
-        "embedding_lookup_sparse.cc",
-        "exp.cc",
-        "expand_dims.cc",
-        "fake_quant.cc",
-        "floor.cc",
-        "floor_div.cc",
-        "fully_connected.cc",
-        "gather.cc",
-        "hashtable_lookup.cc",
-        "l2norm.cc",
-        "layer_norm_lstm.cc",
-        "local_response_norm.cc",
-        "logical.cc",
-        "lsh_projection.cc",
-        "lstm.cc",
-        "maximum_minimum.cc",
-        "mfcc.cc",
-        "mul.cc",
-        "neg.cc",
-        "one_hot.cc",
-        "pack.cc",
-        "pad.cc",
-        "pooling.cc",
-        "pow.cc",
-        "reduce.cc",
-        "relu1.cc",
-        "reshape.cc",
-        "resize_bilinear.cc",
-        "select.cc",
-        "shape.cc",
-        "skip_gram.cc",
-        "slice.cc",
-        "space_to_batch_nd.cc",
-        "space_to_depth.cc",
-        "sparse_output_fully_connected.cc",
-        "sparse_to_dense.cc",
-        "split.cc",
-        "squeeze.cc",
-        "strided_slice.cc",
-        "sub.cc",
-        "svdf.cc",
-        "tile.cc",
-        "topk_v2.cc",
-        "transpose.cc",
-        "transpose_conv.cc",
-        "unidirectional_sequence_lstm.cc",
-        "unidirectional_sequence_rnn.cc",
-        "unpack.cc",
-        "zeros_like.cc",
-    ],
-    hdrs = [
-    ],
-    copts = tflite_copts() + tf_opts_nortti_if_android() + EXTRA_EIGEN_COPTS,
-    visibility = ["//visibility:private"],
-    deps = [
-        ":activation_functor",
-        ":eigen_support",
-        ":kernel_util",
-        ":op_macros",
-        ":padding",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite:util",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:gemm_support",
-        "//tensorflow/contrib/lite/kernels/internal:audio_utils",
-        "//tensorflow/contrib/lite/kernels/internal:kernel_utils",
-        "//tensorflow/contrib/lite/kernels/internal:optimized",
-        "//tensorflow/contrib/lite/kernels/internal:optimized_base",
-        "//tensorflow/contrib/lite/kernels/internal:quantization_util",
-        "//tensorflow/contrib/lite/kernels/internal:reference_base",
-        "//tensorflow/contrib/lite/kernels/internal:tensor",
-        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
-        "@farmhash_archive//:farmhash",
-        "@flatbuffers",
-    ],
-)
-
-cc_library(
-    name = "builtin_ops",
-    srcs = ["register.cc"],
-    hdrs = ["register.h"],
-    deps = [
-        ":builtin_op_kernels",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:util",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ],
-)
-
-tf_cc_test(
-    name = "audio_spectrogram_test",
-    size = "small",
-    srcs = ["audio_spectrogram_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "mfcc_test",
-    size = "small",
-    srcs = ["mfcc_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "detection_postprocess_test",
-    size = "small",
-    srcs = ["detection_postprocess_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "relu1_test",
-    size = "small",
-    srcs = ["relu1_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "sparse_output_fully_connected_test",
-    size = "small",
-    srcs = ["sparse_output_fully_connected_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "activations_test",
-    size = "small",
-    srcs = ["activations_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "add_test",
-    size = "small",
-    srcs = ["add_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "arg_min_max_test",
-    size = "small",
-    srcs = ["arg_min_max_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "div_test",
-    size = "small",
-    srcs = ["div_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "sub_test",
-    size = "small",
-    srcs = ["sub_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "transpose_test",
-    size = "small",
-    srcs = ["transpose_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "//tensorflow/contrib/lite/kernels/internal:reference",
-        "//tensorflow/contrib/lite/kernels/internal:reference_base",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "space_to_batch_nd_test",
-    size = "small",
-    srcs = ["space_to_batch_nd_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "batch_to_space_nd_test",
-    size = "small",
-    srcs = ["batch_to_space_nd_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "cast_test",
-    size = "small",
-    srcs = ["cast_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "concatenation_test",
-    size = "small",
-    srcs = ["concatenation_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "conv_test",
-    size = "small",
-    srcs = ["conv_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "depthwise_conv_test",
-    size = "small",
-    srcs = ["depthwise_conv_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "dequantize_test",
-    size = "small",
-    srcs = ["dequantize_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "basic_rnn_test",
-    size = "small",
-    srcs = ["basic_rnn_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "bidirectional_sequence_lstm_test",
-    size = "small",
-    srcs = ["bidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "floor_test",
-    size = "small",
-    srcs = ["floor_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "elementwise_test",
-    size = "small",
-    srcs = ["elementwise_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "unidirectional_sequence_lstm_test",
-    size = "small",
-    srcs = ["unidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "bidirectional_sequence_rnn_test",
-    size = "small",
-    srcs = ["bidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "unidirectional_sequence_rnn_test",
-    size = "small",
-    srcs = ["unidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "l2norm_test",
-    size = "small",
-    srcs = ["l2norm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "exp_test",
-    size = "small",
-    srcs = ["exp_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "fake_quant_test",
-    size = "small",
-    srcs = ["fake_quant_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "maximum_minimum_test",
-    size = "small",
-    srcs = ["maximum_minimum_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "reduce_test",
-    size = "small",
-    srcs = ["reduce_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "mul_test",
-    size = "small",
-    srcs = ["mul_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "pad_test",
-    size = "small",
-    srcs = ["pad_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "reshape_test",
-    size = "small",
-    srcs = ["reshape_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "gather_test",
-    size = "small",
-    srcs = ["gather_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "topk_v2_test",
-    size = "small",
-    srcs = ["topk_v2_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "resize_bilinear_test",
-    size = "small",
-    srcs = ["resize_bilinear_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "svdf_test",
-    size = "small",
-    srcs = ["svdf_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "embedding_lookup_test",
-    size = "small",
-    srcs = ["embedding_lookup_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "embedding_lookup_sparse_test",
-    size = "small",
-    srcs = ["embedding_lookup_sparse_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "fully_connected_test",
-    size = "small",
-    srcs = ["fully_connected_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
-        "@com_google_absl//absl/memory",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "local_response_norm_test",
-    size = "small",
-    srcs = ["local_response_norm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "pooling_test",
-    size = "small",
-    srcs = ["pooling_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "softmax_test",
-    size = "small",
-    srcs = ["softmax_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "//tensorflow/contrib/lite/kernels/internal:reference_base",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "log_softmax_test",
-    size = "small",
-    srcs = ["log_softmax_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "//tensorflow/contrib/lite/kernels/internal:reference_base",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "lsh_projection_test",
-    size = "small",
-    srcs = ["lsh_projection_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "hashtable_lookup_test",
-    size = "small",
-    srcs = ["hashtable_lookup_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "layer_norm_lstm_test",
-    size = "small",
-    srcs = ["layer_norm_lstm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "lstm_test",
-    size = "small",
-    srcs = ["lstm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "skip_gram_test",
-    size = "small",
-    srcs = ["skip_gram_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "space_to_depth_test",
-    size = "small",
-    srcs = ["space_to_depth_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "split_test",
-    size = "small",
-    srcs = ["split_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "squeeze_test",
-    size = "small",
-    srcs = ["squeeze_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "strided_slice_test",
-    size = "small",
-    srcs = ["strided_slice_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "tile_test",
-    size = "small",
-    srcs = ["tile_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "comparisons_test",
-    size = "small",
-    srcs = [
-        "comparisons_test.cc",
-    ],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "neg_test",
-    size = "small",
-    srcs = ["neg_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "select_test",
-    size = "small",
-    srcs = [
-        "select_test.cc",
-    ],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "slice_test",
-    size = "small",
-    srcs = [
-        "slice_test.cc",
-    ],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "transpose_conv_test",
-    size = "small",
-    srcs = ["transpose_conv_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "expand_dims_test",
-    size = "small",
-    srcs = ["expand_dims_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "sparse_to_dense_test",
-    size = "small",
-    srcs = ["sparse_to_dense_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "shape_test",
-    size = "small",
-    srcs = ["shape_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "pow_test",
-    size = "small",
-    srcs = ["pow_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "pack_test",
-    size = "small",
-    srcs = ["pack_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "one_hot_test",
-    size = "small",
-    srcs = ["one_hot_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "logical_test",
-    size = "small",
-    srcs = ["logical_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "unpack_test",
-    size = "small",
-    srcs = ["unpack_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "floor_div_test",
-    size = "small",
-    srcs = ["floor_div_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-tf_cc_test(
-    name = "zeros_like_test",
-    size = "small",
-    srcs = ["zeros_like_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
deleted file mode 100644
index 95b025c1b30cc627cf5858ec17f8ff7c57f7bd95..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd_test.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class BatchToSpaceNDOpModel : public SingleOpModel {
- public:
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
-  }
-
-  void SetBlockShape(std::initializer_list<int> data) {
-    PopulateTensor<int>(block_shape_, data);
-  }
-
-  void SetCrops(std::initializer_list<int> data) {
-    PopulateTensor<int>(crops_, data);
-  }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- protected:
-  int input_;
-  int block_shape_;
-  int crops_;
-  int output_;
-};
-
-// Tests case where block_shape and crops are const tensors.
-//
-// Example usage is as follows:
-//    BatchToSpaceNDOpConstModel m(input_shape, block_shape, crops);
-//    m.SetInput(input_data);
-//    m.Invoke();
-class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
- public:
-  BatchToSpaceNDOpConstModel(std::initializer_list<int> input_shape,
-                             std::initializer_list<int> block_shape,
-                             std::initializer_list<int> crops) {
-    input_ = AddInput(TensorType_FLOAT32);
-    block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
-    crops_ = AddConstInput(TensorType_INT32, crops, {2, 2});
-    output_ = AddOutput(TensorType_FLOAT32);
-
-    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
-                 BuiltinOptions_BatchToSpaceNDOptions,
-                 CreateBatchToSpaceNDOptions(builder_).Union());
-    BuildInterpreter({input_shape});
-  }
-};
-
-// Tests case where block_shape and crops are non-const tensors.
-//
-// Example usage is as follows:
-//    BatchToSpaceNDOpDynamicModel m(input_shape);
-//    m.SetInput(input_data);
-//    m.SetBlockShape(block_shape);
-//    m.SetPaddings(crops);
-//    m.Invoke();
-class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
- public:
-  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
-    block_shape_ = AddInput(TensorType_INT32);
-    crops_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-
-    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
-                 BuiltinOptions_BatchToSpaceNDOptions,
-                 CreateBatchToSpaceNDOptions(builder_).Union());
-    BuildInterpreter({input_shape, {2}, {2, 2}});
-  }
-};
-
-TEST(BatchToSpaceNDOpTest, SimpleConstTest) {
-  BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
-                                               4, 8, 11, 15, 12, 16}));
-}
-
-TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
-  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-  m.SetBlockShape({2, 2});
-  m.SetCrops({0, 0, 0, 0});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
-                                               4, 8, 11, 15, 12, 16}));
-}
-
-TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
-  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}),
-               "Cannot allocate tensors");
-}
-
-TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) {
-  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, -1}),
-               "crops.3. >= 0 was not true.");
-}
-
-TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
-  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-  m.SetBlockShape({2, 2});
-  m.SetCrops({0, 0, -1, 0});
-  EXPECT_DEATH(m.Invoke(), "crops.2. >= 0 was not true.");
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
deleted file mode 100644
index 0532528f52177a6e151acc6e1e6050d733f80ff7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
+++ /dev/null
@@ -1,1267 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace bidirectional_sequence_lstm {
-
-// Input Tensors of size {max_time, n_batch, n_input}
-constexpr int kInputTensor = 0;
-
-// Forward LSTM cell tensors.
-// Input weight tensors of size: {n_cell, n_input}
-constexpr int kFwInputToInputWeightsTensor = 1;  // Optional
-constexpr int kFwInputToForgetWeightsTensor = 2;
-constexpr int kFwInputToCellWeightsTensor = 3;
-constexpr int kFwInputToOutputWeightsTensor = 4;
-
-// Recurrent weight tensors of size {n_cell, n_output}
-constexpr int kFwRecurrentToInputWeightsTensor = 5;  // Optional
-constexpr int kFwRecurrentToForgetWeightsTensor = 6;
-constexpr int kFwRecurrentToCellWeightsTensor = 7;
-constexpr int kFwRecurrentToOutputWeightsTensor = 8;
-
-// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kFwCellToInputWeightsTensor = 9;    // Optional
-constexpr int kFwCellToForgetWeightsTensor = 10;  // Optional
-constexpr int kFwCellToOutputWeightsTensor = 11;  // Optional
-
-// Gates bias tensors of size {n_cell}
-constexpr int kFwInputGateBiasTensor = 12;  // Optional
-constexpr int kFwForgetGateBiasTensor = 13;
-constexpr int kFwCellGateBiasTensor = 14;
-constexpr int kFwOutputGateBiasTensor = 15;
-
-// Projection weight tensor of size {n_output, n_cell}
-constexpr int kFwProjectionWeightsTensor = 16;  // Optional
-// Projection bias tensor of size {n_output}
-constexpr int kFwProjectionBiasTensor = 17;  // Optional
-
-// Backward LSTM cell tensors.
-// Input weight tensors of size: {n_cell, n_input}
-constexpr int kBwInputToInputWeightsTensor = 18;  // Optional
-constexpr int kBwInputToForgetWeightsTensor = 19;
-constexpr int kBwInputToCellWeightsTensor = 20;
-constexpr int kBwInputToOutputWeightsTensor = 21;
-
-// Recurrent weight tensors of size {n_cell, n_output}
-constexpr int kBwRecurrentToInputWeightsTensor = 22;  // Optional
-constexpr int kBwRecurrentToForgetWeightsTensor = 23;
-constexpr int kBwRecurrentToCellWeightsTensor = 24;
-constexpr int kBwRecurrentToOutputWeightsTensor = 25;
-
-// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kBwCellToInputWeightsTensor = 26;   // Optional
-constexpr int kBwCellToForgetWeightsTensor = 27;  // Optional
-constexpr int kBwCellToOutputWeightsTensor = 28;  // Optional
-
-// Gates bias tensors of size {n_cell}
-constexpr int kBwInputGateBiasTensor = 29;  // Optional
-constexpr int kBwForgetGateBiasTensor = 30;
-constexpr int kBwCellGateBiasTensor = 31;
-constexpr int kBwOutputGateBiasTensor = 32;
-
-// Projection weight tensor of size {n_output, n_cell}
-constexpr int kBwProjectionWeightsTensor = 33;  // Optional
-// Projection bias tensor of size {n_output}
-constexpr int kBwProjectionBiasTensor = 34;  // Optional
-
-// Stateful input tensors that are variables and will be modified by the Op.
-// Activation state tensors of size {n_batch, n_output}
-constexpr int kFwInputActivationStateTensor = 35;
-// Cell state tensors of size {n_batch, n_cell}
-constexpr int kFwInputCellStateTensor = 36;
-// Activation state tensors of size {n_batch, n_output}
-constexpr int kBwInputActivationStateTensor = 37;
-// Cell state tensors of size {n_batch, n_cell}
-constexpr int kBwInputCellStateTensor = 38;
-
-// Auxiliary input and weights when stacking.
-constexpr int kAuxInputTensor = 39;  // Optional
-// Forward weights.
-constexpr int kFwAuxInputToInputWeightsTensor = 40;   // Optional
-constexpr int kFwAuxInputToForgetWeightsTensor = 41;  // Optional
-constexpr int kFwAuxInputToCellWeightsTensor = 42;    // Optional
-constexpr int kFwAuxInputToOutputWeightsTensor = 43;  // Optional
-// Backward weights.
-constexpr int kBwAuxInputToInputWeightsTensor = 44;   // Optional
-constexpr int kBwAuxInputToForgetWeightsTensor = 45;  // Optional
-constexpr int kBwAuxInputToCellWeightsTensor = 46;    // Optional
-constexpr int kBwAuxInputToOutputWeightsTensor = 47;  // Optional
-
-// Output tensors.
-constexpr int kFwOutputTensor = 0;
-constexpr int kBwOutputTensor = 1;  // Ignored if merge_outputs is set.
-
-// Temporary tensors.
-enum TemporaryTensor {
-  // Scratch buffers for input, forget, etc. gates
-  kFwScratchBuffer = 0,
-  kBwScratchBuffer = 1,
-  // Quantized tensors needed for the hybrid kernel.
-  kInputQuantized = 2,
-  kAuxInputQuantized = 3,  // Quantized tensor needed for auxiliary input.
-  kFwActivationStateQuantized = 4,
-  kBwActivationStateQuantized = 5,
-  kFwCellStateQuantized = 6,
-  kBwCellStateQuantized = 7,
-  kScalingFactors = 8,
-  kProductScalingFactors = 9,
-  kRecoveredCellWeights = 10,
-  kNumTemporaryTensors = 11
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* scratch_tensor_index = new int;
-  context->AddTensors(context, kNumTemporaryTensors, scratch_tensor_index);
-  return scratch_tensor_index;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<int*>(buffer);
-}
-
-// Check that input tensor dimensions matches with each other.
-TfLiteStatus CheckLstmTensorDimensions(
-    TfLiteContext* context, TfLiteNode* node, int n_input, int n_output,
-    int n_cell, int input_to_input_weights_tensor,
-    int input_to_forget_weights_tensor, int input_to_cell_weights_tensor,
-    int input_to_output_weights_tensor, int recurrent_to_input_weights_tensor,
-    int recurrent_to_forget_weights_tensor,
-    int recurrent_to_cell_weights_tensor,
-    int recurrent_to_output_weights_tensor, int cell_to_input_weights_tensor,
-    int cell_to_forget_weights_tensor, int cell_to_output_weights_tensor,
-    int input_gate_bias_tensor, int forget_gate_bias_tensor,
-    int cell_gate_bias_tensor, int output_gate_bias_tensor,
-    int projection_weights_tensor, int projection_bias_tensor) {
-  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
-      node->builtin_data);
-
-  // Making sure clipping parameters have valid values.
-  // == 0 means no clipping
-  //  > 0 means clipping
-  TF_LITE_ENSURE(context, params->cell_clip >= 0);
-  TF_LITE_ENSURE(context, params->proj_clip >= 0);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, input_to_input_weights_tensor);
-  if (input_to_input_weights) {
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
-  }
-
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, input_to_forget_weights_tensor);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, input_to_cell_weights_tensor);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, recurrent_to_input_weights_tensor);
-  if (recurrent_to_input_weights) {
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
-                      n_cell);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
-                      n_output);
-  }
-
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, recurrent_to_forget_weights_tensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
-                    n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
-                    n_output);
-
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, recurrent_to_cell_weights_tensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
-                    n_output);
-
-  // We make sure the input-gate's parameters are either both present (regular
-  // LSTM) or not at all (CIFG-LSTM).
-  const bool cifg_weights_all_or_none =
-      ((input_to_input_weights != nullptr) &&
-       (recurrent_to_input_weights != nullptr)) ||
-      ((input_to_input_weights == nullptr) &&
-       (recurrent_to_input_weights == nullptr));
-  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, cell_to_input_weights_tensor);
-  if (cell_to_input_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, cell_to_forget_weights_tensor);
-  if (cell_to_forget_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, cell_to_output_weights_tensor);
-  if (cell_to_output_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
-  }
-
-  // Making sure the peephole weights are there all or none.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool peephole_weights_all_or_none =
-      ((cell_to_input_weights != nullptr || use_cifg) &&
-       (cell_to_forget_weights != nullptr) &&
-       (cell_to_output_weights != nullptr)) ||
-      ((cell_to_input_weights == nullptr) &&
-       (cell_to_forget_weights == nullptr) &&
-       (cell_to_output_weights == nullptr));
-  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
-
-  // Make sure the input gate bias is present only when not a CIFG-LSTM.
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, input_gate_bias_tensor);
-  if (use_cifg) {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
-  } else {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, forget_gate_bias_tensor);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* cell_bias =
-      GetInput(context, node, cell_gate_bias_tensor);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, output_gate_bias_tensor);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, projection_weights_tensor);
-  if (projection_weights) {
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
-  }
-
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, projection_bias_tensor);
-  if (projection_bias) {
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
-  }
-
-  // Making sure the projection tensors are consistent:
-  // 1) If projection weight is not present, then projection bias should not be
-  // present.
-  // 2) If projection weight is present, then projection bias is optional.
-  // TODO(ghodrat): make sure this is correct.
-  const bool projecton_tensors_consistent =
-      ((projection_weights != nullptr) || (projection_bias == nullptr));
-  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
-                                        TfLiteNode* node, int n_input,
-                                        int n_output, int n_cell) {
-  TF_LITE_ENSURE_OK(
-      context,
-      CheckLstmTensorDimensions(
-          context, node, n_input, n_output, n_cell,
-          kFwInputToInputWeightsTensor, kFwInputToForgetWeightsTensor,
-          kFwInputToCellWeightsTensor, kFwInputToOutputWeightsTensor,
-          kFwRecurrentToInputWeightsTensor, kFwRecurrentToForgetWeightsTensor,
-          kFwRecurrentToCellWeightsTensor, kFwRecurrentToOutputWeightsTensor,
-          kFwCellToInputWeightsTensor, kFwCellToForgetWeightsTensor,
-          kFwCellToOutputWeightsTensor, kFwInputGateBiasTensor,
-          kFwForgetGateBiasTensor, kFwCellGateBiasTensor,
-          kFwOutputGateBiasTensor, kFwProjectionWeightsTensor,
-          kFwProjectionBiasTensor));
-
-  TF_LITE_ENSURE_OK(
-      context,
-      CheckLstmTensorDimensions(
-          context, node, n_input, n_output, n_cell,
-          kBwInputToInputWeightsTensor, kBwInputToForgetWeightsTensor,
-          kBwInputToCellWeightsTensor, kBwInputToOutputWeightsTensor,
-          kBwRecurrentToInputWeightsTensor, kBwRecurrentToForgetWeightsTensor,
-          kBwRecurrentToCellWeightsTensor, kBwRecurrentToOutputWeightsTensor,
-          kBwCellToInputWeightsTensor, kBwCellToForgetWeightsTensor,
-          kBwCellToOutputWeightsTensor, kBwInputGateBiasTensor,
-          kBwForgetGateBiasTensor, kBwCellGateBiasTensor,
-          kBwOutputGateBiasTensor, kBwProjectionWeightsTensor,
-          kBwProjectionBiasTensor));
-
-  // Check if Forward and Backward tensors match along required dimensions.
-  return kTfLiteOk;
-}
-
-// Resize the output and scratch tensors based on the sizes of the input
-// tensors. Also check that the size of the input tensors match each other.
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
-  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
-      node->builtin_data);
-
-  // Check we have all the inputs and outputs we need.
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 48);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size,
-                    params->merge_outputs ? 1 : 2);
-
-  // Inferring batch size, number of outputs and sequence length and
-  // number of cells from the input tensors.
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, input->dims->size, 3);
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-
-  const TfLiteTensor* fw_input_to_output_weights =
-      GetInput(context, node, kFwInputToOutputWeightsTensor);
-  const int n_fw_cell = fw_input_to_output_weights->dims->data[0];
-  TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->data[1],
-                    n_input);
-
-  const TfLiteTensor* bw_input_to_output_weights =
-      GetInput(context, node, kBwInputToOutputWeightsTensor);
-  const int n_bw_cell = bw_input_to_output_weights->dims->data[0];
-  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->data[1],
-                    n_input);
-
-  const TfLiteTensor* fw_recurrent_to_output_weights =
-      GetInput(context, node, kFwRecurrentToOutputWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->data[0],
-                    n_fw_cell);
-  const int n_fw_output = fw_recurrent_to_output_weights->dims->data[1];
-
-  const TfLiteTensor* bw_recurrent_to_output_weights =
-      GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->data[0],
-                    n_bw_cell);
-  const int n_bw_output = bw_recurrent_to_output_weights->dims->data[1];
-
-  // Check that input tensor dimensions matches with each other.
-  TF_LITE_ENSURE_OK(
-      context, CheckInputTensorDimensions(context, node, n_input, n_fw_output,
-                                          n_fw_cell));
-
-  // Get (optional) auxiliary inputs and weights.
-  const TfLiteTensor* aux_input =
-      GetOptionalInputTensor(context, node, kAuxInputTensor);
-  const TfLiteTensor* fw_aux_input_to_input_weights =
-      GetOptionalInputTensor(context, node, kFwAuxInputToInputWeightsTensor);
-  const TfLiteTensor* fw_aux_input_to_forget_weights =
-      GetOptionalInputTensor(context, node, kFwAuxInputToForgetWeightsTensor);
-  const TfLiteTensor* fw_aux_input_to_cell_weights =
-      GetOptionalInputTensor(context, node, kFwAuxInputToCellWeightsTensor);
-  const TfLiteTensor* fw_aux_input_to_output_weights =
-      GetOptionalInputTensor(context, node, kFwAuxInputToOutputWeightsTensor);
-  const TfLiteTensor* bw_aux_input_to_input_weights =
-      GetOptionalInputTensor(context, node, kBwAuxInputToInputWeightsTensor);
-  const TfLiteTensor* bw_aux_input_to_forget_weights =
-      GetOptionalInputTensor(context, node, kBwAuxInputToForgetWeightsTensor);
-  const TfLiteTensor* bw_aux_input_to_cell_weights =
-      GetOptionalInputTensor(context, node, kBwAuxInputToCellWeightsTensor);
-  const TfLiteTensor* bw_aux_input_to_output_weights =
-      GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
-
-  const bool aux_inputs_all_or_none =
-      ((aux_input != nullptr) && (fw_aux_input_to_cell_weights != nullptr) &&
-       (fw_aux_input_to_forget_weights != nullptr) &&
-       (fw_aux_input_to_output_weights != nullptr) &&
-       (bw_aux_input_to_cell_weights != nullptr) &&
-       (bw_aux_input_to_forget_weights != nullptr) &&
-       (bw_aux_input_to_output_weights != nullptr)) ||
-      ((fw_aux_input_to_cell_weights == nullptr) &&
-       (fw_aux_input_to_forget_weights == nullptr) &&
-       (fw_aux_input_to_output_weights == nullptr) &&
-       (bw_aux_input_to_cell_weights == nullptr) &&
-       (bw_aux_input_to_forget_weights == nullptr) &&
-       (bw_aux_input_to_output_weights == nullptr));
-  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
-  const bool has_aux_input = (aux_input != nullptr);
-
-  if (has_aux_input) {
-    // Check that aux_input has the same dimensions (except last) as the input.
-    TF_LITE_ASSERT_EQ(aux_input->dims->data[0], input->dims->data[0]);
-    TF_LITE_ASSERT_EQ(aux_input->dims->data[1], input->dims->data[1]);
-  }
-
-  // Get the pointer to output, activation_state and cell_state buffer tensors.
-  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
-  TfLiteTensor* fw_activation_state =
-      GetVariableInput(context, node, kFwInputActivationStateTensor);
-  TfLiteTensor* fw_cell_state =
-      GetVariableInput(context, node, kFwInputCellStateTensor);
-
-  // Check the shape of input state tensors.
-  // These tensor may be 1D or 2D. It's fine as long as the total size is
-  // correct.
-  TF_LITE_ENSURE_EQ(context, NumElements(fw_activation_state),
-                    n_batch * n_fw_output);
-  TF_LITE_ENSURE_EQ(context, NumElements(fw_cell_state), n_batch * n_fw_cell);
-
-  // Resize the output tensors.
-  TfLiteIntArray* fw_output_size = TfLiteIntArrayCreate(3);
-  fw_output_size->data[0] = max_time;
-  fw_output_size->data[1] = n_batch;
-  fw_output_size->data[2] =
-      params->merge_outputs ? n_bw_output + n_fw_output : n_fw_output;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, fw_output, fw_output_size));
-
-  // The weights are of consistent type, so it suffices to check one.
-  const bool is_hybrid_op = (fw_input_to_output_weights->type == kTfLiteUInt8);
-
-  TfLiteIntArrayFree(node->temporaries);
-  if (is_hybrid_op) {
-    node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
-  } else {
-    node->temporaries = TfLiteIntArrayCreate(2);  // the two scratch buffers.
-  }
-  // Create a scratch buffer tensor.
-  node->temporaries->data[kFwScratchBuffer] = *scratch_tensor_index;
-  TfLiteTensor* fw_scratch_buffer =
-      GetTemporary(context, node, kFwScratchBuffer);
-  fw_scratch_buffer->type = input->type;
-  fw_scratch_buffer->allocation_type = kTfLiteArenaRw;
-
-  const TfLiteTensor* fw_input_to_input_weights =
-      GetOptionalInputTensor(context, node, kFwInputToInputWeightsTensor);
-  if (has_aux_input) {
-    TF_LITE_ENSURE_EQ(context, fw_aux_input_to_input_weights->dims->data[0],
-                      fw_input_to_input_weights->dims->data[0]);
-  }
-  const bool fw_use_cifg = (fw_input_to_input_weights == nullptr);
-  TfLiteIntArray* fw_scratch_buffer_size = TfLiteIntArrayCreate(2);
-  fw_scratch_buffer_size->data[0] = n_batch;
-  if (fw_use_cifg) {
-    // Reserving space for Cell, Forget, Output gates
-    fw_scratch_buffer_size->data[1] = n_fw_cell * 3;
-  } else {
-    // Reserving space for Input, Cell, Forget, Output gates
-    fw_scratch_buffer_size->data[1] = n_fw_cell * 4;
-  }
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_scratch_buffer,
-                                                   fw_scratch_buffer_size));
-  // Same for the backward cell.
-
-  // Check that input tensor dimensions matches with each other.
-  TF_LITE_ENSURE_OK(
-      context, CheckInputTensorDimensions(context, node, n_input, n_bw_output,
-                                          n_bw_cell));
-
-  // Get the pointer to activation_state and cell_state buffer tensors.
-  TfLiteTensor* bw_activation_state =
-      GetVariableInput(context, node, kBwInputActivationStateTensor);
-  TfLiteTensor* bw_cell_state =
-      GetVariableInput(context, node, kBwInputCellStateTensor);
-
-  // Resize the output tensors.
-  if (!params->merge_outputs) {
-    TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
-    TfLiteIntArray* bw_output_size = TfLiteIntArrayCreate(3);
-    bw_output_size->data[0] = max_time;
-    bw_output_size->data[1] = n_batch;
-    bw_output_size->data[2] = n_bw_output;
-    TF_LITE_ENSURE_OK(
-        context, context->ResizeTensor(context, bw_output, bw_output_size));
-  }
-
-  // Check the shape of input state tensors.
-  // These tensor may be 1D or 2D. It's fine as long as the total size is
-  // correct.
-  TF_LITE_ENSURE_EQ(context, NumElements(bw_activation_state),
-                    n_batch * n_bw_output);
-  TF_LITE_ENSURE_EQ(context, NumElements(bw_cell_state), n_batch * n_bw_cell);
-
-  // Create a scratch buffer tensor.
-  node->temporaries->data[kBwScratchBuffer] =
-      *(scratch_tensor_index) + kBwScratchBuffer;
-  TfLiteTensor* bw_scratch_buffer =
-      GetTemporary(context, node, kBwScratchBuffer);
-  bw_scratch_buffer->type = input->type;
-  bw_scratch_buffer->allocation_type = kTfLiteArenaRw;
-
-  const TfLiteTensor* bw_input_to_input_weights =
-      GetOptionalInputTensor(context, node, kBwInputToInputWeightsTensor);
-  if (has_aux_input) {
-    TF_LITE_ENSURE_EQ(context, bw_aux_input_to_input_weights->dims->data[0],
-                      bw_input_to_input_weights->dims->data[0]);
-  }
-  const bool bw_use_cifg = (bw_input_to_input_weights == nullptr);
-  TfLiteIntArray* bw_scratch_buffer_size = TfLiteIntArrayCreate(2);
-  bw_scratch_buffer_size->data[0] = n_batch;
-  if (bw_use_cifg) {
-    // Reserving space for Cell, Forget, Output gates
-    bw_scratch_buffer_size->data[1] = n_bw_cell * 3;
-  } else {
-    // Reserving space for Input, Cell, Forget, Output gates
-    bw_scratch_buffer_size->data[1] = n_bw_cell * 4;
-  }
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_scratch_buffer,
-                                                   bw_scratch_buffer_size));
-  if (is_hybrid_op) {
-    // Allocate temporary tensors to store quantized values of input, aux_input
-    // (if present), activation_state and cell_state tensors.
-    node->temporaries->data[kInputQuantized] =
-        *scratch_tensor_index + kInputQuantized;
-    TfLiteTensor* input_quantized =
-        GetTemporary(context, node, kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-
-    if (has_aux_input) {
-      node->temporaries->data[kAuxInputQuantized] =
-          *scratch_tensor_index + kAuxInputQuantized;
-      TfLiteTensor* aux_input_quantized =
-          GetTemporary(context, node, kAuxInputQuantized);
-      aux_input_quantized->type = kTfLiteUInt8;
-      aux_input_quantized->allocation_type = kTfLiteArenaRw;
-      if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) {
-        TfLiteIntArray* aux_input_quantized_size =
-            TfLiteIntArrayCopy(aux_input->dims);
-        TF_LITE_ENSURE_OK(context,
-                          context->ResizeTensor(context, aux_input_quantized,
-                                                aux_input_quantized_size));
-      }
-    }
-
-    node->temporaries->data[kFwActivationStateQuantized] =
-        *scratch_tensor_index + kFwActivationStateQuantized;
-    TfLiteTensor* fw_activation_state_quantized =
-        GetTemporary(context, node, kFwActivationStateQuantized);
-    fw_activation_state_quantized->type = kTfLiteUInt8;
-    fw_activation_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(fw_activation_state_quantized->dims,
-                             fw_activation_state->dims)) {
-      TfLiteIntArray* fw_activation_state_quantized_size =
-          TfLiteIntArrayCopy(fw_activation_state->dims);
-      TF_LITE_ENSURE_OK(
-          context, context->ResizeTensor(context, fw_activation_state_quantized,
-                                         fw_activation_state_quantized_size));
-    }
-    node->temporaries->data[kBwActivationStateQuantized] =
-        *scratch_tensor_index + kBwActivationStateQuantized;
-    TfLiteTensor* bw_activation_state_quantized =
-        GetTemporary(context, node, kBwActivationStateQuantized);
-    bw_activation_state_quantized->type = kTfLiteUInt8;
-    bw_activation_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(bw_activation_state_quantized->dims,
-                             bw_activation_state->dims)) {
-      TfLiteIntArray* bw_activation_state_quantized_size =
-          TfLiteIntArrayCopy(bw_activation_state->dims);
-      TF_LITE_ENSURE_OK(
-          context, context->ResizeTensor(context, bw_activation_state_quantized,
-                                         bw_activation_state_quantized_size));
-    }
-    node->temporaries->data[kFwCellStateQuantized] =
-        *scratch_tensor_index + kFwCellStateQuantized;
-    TfLiteTensor* fw_cell_state_quantized =
-        GetTemporary(context, node, kFwCellStateQuantized);
-    fw_cell_state_quantized->type = kTfLiteUInt8;
-    fw_cell_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(fw_cell_state_quantized->dims,
-                             fw_cell_state->dims)) {
-      TfLiteIntArray* fw_cell_state_quantized_size =
-          TfLiteIntArrayCopy(fw_cell_state->dims);
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, fw_cell_state_quantized,
-                                              fw_cell_state_quantized_size));
-    }
-    node->temporaries->data[kBwCellStateQuantized] =
-        *scratch_tensor_index + kBwCellStateQuantized;
-    TfLiteTensor* bw_cell_state_quantized =
-        GetTemporary(context, node, kBwCellStateQuantized);
-    bw_cell_state_quantized->type = kTfLiteUInt8;
-    bw_cell_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(bw_cell_state_quantized->dims,
-                             bw_cell_state->dims)) {
-      TfLiteIntArray* bw_cell_state_quantized_size =
-          TfLiteIntArrayCopy(bw_cell_state->dims);
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, bw_cell_state_quantized,
-                                              bw_cell_state_quantized_size));
-    }
-
-    // Allocate temporary tensors to store scaling factors and product scaling
-    // factors. The latter is a convenience storage which allows to quantize
-    // a vector once (which produces the scaling factors) and multiply it with
-    // different matrices (which requires multiplying the scaling factors with
-    // the scaling factor of the matrix).
-    node->temporaries->data[kScalingFactors] =
-        *scratch_tensor_index + kScalingFactors;
-    TfLiteTensor* scaling_factors =
-        GetTemporary(context, node, kScalingFactors);
-    scaling_factors->type = kTfLiteFloat32;
-    scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-    scaling_factors_size->data[0] = n_batch;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
-                                                       scaling_factors_size));
-    }
-    node->temporaries->data[kProductScalingFactors] =
-        *scratch_tensor_index + kProductScalingFactors;
-    TfLiteTensor* prod_scaling_factors =
-        GetTemporary(context, node, kProductScalingFactors);
-    prod_scaling_factors->type = kTfLiteFloat32;
-    prod_scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
-    prod_scaling_factors_size->data[0] = n_batch;
-    if (!TfLiteIntArrayEqual(prod_scaling_factors->dims,
-                             prod_scaling_factors_size)) {
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, prod_scaling_factors,
-                                              prod_scaling_factors_size));
-    }
-
-    // Allocate a temporary tensor to store the recovered cell weights. Since
-    // this is used for diagonal matrices, only need to store n_cell values.
-    node->temporaries->data[kRecoveredCellWeights] =
-        *scratch_tensor_index + kRecoveredCellWeights;
-    TfLiteTensor* recovered_cell_weights =
-        GetTemporary(context, node, kRecoveredCellWeights);
-    recovered_cell_weights->type = kTfLiteFloat32;
-    recovered_cell_weights->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1);
-    recovered_cell_weights_size->data[0] = n_fw_cell;
-    if (!TfLiteIntArrayEqual(recovered_cell_weights->dims,
-                             recovered_cell_weights_size)) {
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, recovered_cell_weights,
-                                              recovered_cell_weights_size));
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
-    const TfLiteTensor* aux_input_to_input_weights,
-    const TfLiteTensor* aux_input_to_forget_weights,
-    const TfLiteTensor* aux_input_to_cell_weights,
-    const TfLiteTensor* aux_input_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
-    TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-  const int aux_input_size = (aux_input) ? aux_input->dims->data[2] : 0;
-
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  // Index the scratch buffers pointers to the global scratch buffer.
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  const float* input_to_input_weights_ptr =
-      (use_cifg) ? nullptr : input_to_input_weights->data.f;
-  const float* recurrent_to_input_weights_ptr =
-      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
-  const float* input_gate_bias_ptr =
-      (use_cifg) ? nullptr : input_gate_bias->data.f;
-  const float* cell_to_input_weights_ptr =
-      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
-  const float* cell_to_forget_weights_ptr =
-      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
-  const float* cell_to_output_weights_ptr =
-      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
-  const float* projection_weights_ptr =
-      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  float* aux_input_ptr = nullptr;
-  float* aux_input_to_input_weights_ptr = nullptr;
-  float* aux_input_to_forget_weights_ptr = nullptr;
-  float* aux_input_to_cell_weights_ptr = nullptr;
-  float* aux_input_to_output_weights_ptr = nullptr;
-  if (aux_input_size > 0) {
-    aux_input_ptr = aux_input->data.f;
-    aux_input_to_input_weights_ptr = aux_input_to_input_weights->data.f;
-    aux_input_to_forget_weights_ptr = aux_input_to_forget_weights->data.f;
-    aux_input_to_cell_weights_ptr = aux_input_to_cell_weights->data.f;
-    aux_input_to_output_weights_ptr = aux_input_to_output_weights->data.f;
-  }
-
-  // Loop through the sequence.
-  const int input_step = n_batch * n_input;
-  const int output_step = n_batch * output->dims->data[2];
-  for (int t = 0; t < max_time; t++) {
-    // If this is the forward_sequence, step forward, otherwise step backwards.
-    const int t_rel = forward_sequence ? t : max_time - t - 1;
-    const float* input_ptr = input->data.f + t_rel * input_step;
-    float* output_ptr_time =
-        output->data.f + t_rel * output_step + output_offset;
-
-    kernel_utils::LstmStepWithAuxInput(
-        input_ptr, input_to_input_weights_ptr, input_to_forget_weights->data.f,
-        input_to_cell_weights->data.f, input_to_output_weights->data.f,
-        aux_input_ptr, aux_input_to_input_weights_ptr,
-        aux_input_to_forget_weights_ptr, aux_input_to_cell_weights_ptr,
-        aux_input_to_output_weights_ptr, recurrent_to_input_weights_ptr,
-        recurrent_to_forget_weights->data.f, recurrent_to_cell_weights->data.f,
-        recurrent_to_output_weights->data.f, cell_to_input_weights_ptr,
-        cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
-        input_gate_bias_ptr, forget_gate_bias->data.f, cell_bias->data.f,
-        output_gate_bias->data.f, projection_weights_ptr, projection_bias_ptr,
-        params, n_batch, n_cell, n_input, aux_input_size, n_output,
-        activation_state->data.f, cell_state->data.f, input_gate_scratch,
-        forget_gate_scratch, cell_scratch, output_gate_scratch,
-        output_ptr_time);
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
-    const TfLiteTensor* aux_input_to_input_weights,
-    const TfLiteTensor* aux_input_to_forget_weights,
-    const TfLiteTensor* aux_input_to_cell_weights,
-    const TfLiteTensor* aux_input_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, bool forward_sequence, int output_offset,
-    TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors,
-    TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_cell_weights,
-    TfLiteTensor* input_quantized, TfLiteTensor* aux_input_quantized,
-    TfLiteTensor* output_state_quantized, TfLiteTensor* cell_state_quantized,
-    TfLiteTensor* output_state, TfLiteTensor* cell_state,
-    TfLiteTensor* output) {
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-  const int aux_input_size = (aux_input) ? aux_input->dims->data[2] : 0;
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  int8_t* input_to_input_weights_ptr = nullptr;
-  float input_to_input_weights_scale = 1.0f;
-  int8_t* recurrent_to_input_weights_ptr = nullptr;
-  float recurrent_to_input_weights_scale = 1.0f;
-  float* input_gate_bias_ptr = nullptr;
-  if (!use_cifg) {
-    input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
-    recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
-    input_gate_bias_ptr = input_gate_bias->data.f;
-    input_to_input_weights_scale = input_to_input_weights->params.scale;
-    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
-  }
-
-  int8_t* cell_to_input_weights_ptr = nullptr;
-  int8_t* cell_to_forget_weights_ptr = nullptr;
-  int8_t* cell_to_output_weights_ptr = nullptr;
-  float cell_to_input_weights_scale = 1.0f;
-  float cell_to_forget_weights_scale = 1.0f;
-  float cell_to_output_weights_scale = 1.0f;
-  if (use_peephole) {
-    if (!use_cifg) {
-      cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
-      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
-    }
-    cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
-    cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
-    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
-    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
-  }
-
-  const int8_t* projection_weights_ptr =
-      (projection_weights == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
-  const float projection_weights_scale =
-      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
-  const float input_to_forget_weights_scale =
-      input_to_forget_weights->params.scale;
-  const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
-  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
-  const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
-  const float input_to_output_weights_scale =
-      input_to_output_weights->params.scale;
-  const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
-  const float recurrent_to_forget_weights_scale =
-      recurrent_to_forget_weights->params.scale;
-  const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
-  const float recurrent_to_cell_weights_scale =
-      recurrent_to_cell_weights->params.scale;
-  const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
-  const float recurrent_to_output_weights_scale =
-      recurrent_to_output_weights->params.scale;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* output_state_ptr = output_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-
-  // Temporary storage for quantized values and scaling factors.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  int8_t* quantized_aux_input_ptr =
-      (aux_input_quantized == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(aux_input_quantized->data.uint8);
-  int8_t* quantized_output_state_ptr =
-      reinterpret_cast<int8_t*>(output_state_quantized->data.uint8);
-  int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
-  float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
-
-  // Auxiliary input and weights.
-  float* aux_input_ptr = nullptr;
-  int8_t* aux_input_to_input_weights_ptr = nullptr;
-  int8_t* aux_input_to_forget_weights_ptr = nullptr;
-  int8_t* aux_input_to_cell_weights_ptr = nullptr;
-  int8_t* aux_input_to_output_weights_ptr = nullptr;
-  float aux_input_to_input_weights_scale = 0.0f;
-  float aux_input_to_forget_weights_scale = 0.0f;
-  float aux_input_to_cell_weights_scale = 0.0f;
-  float aux_input_to_output_weights_scale = 0.0f;
-  if (aux_input_size > 0) {
-    aux_input_ptr = aux_input->data.f;
-    aux_input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_input_weights->data.uint8);
-    aux_input_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_forget_weights->data.uint8);
-    aux_input_to_cell_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_cell_weights->data.uint8);
-    aux_input_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_output_weights->data.uint8);
-    aux_input_to_input_weights_scale = aux_input_to_input_weights->params.scale;
-    aux_input_to_forget_weights_scale =
-        aux_input_to_forget_weights->params.scale;
-    aux_input_to_cell_weights_scale = aux_input_to_cell_weights->params.scale;
-    aux_input_to_output_weights_scale =
-        aux_input_to_output_weights->params.scale;
-  }
-
-  // Feed the sequence into the LSTM step-by-step.
-  const int input_step = n_batch * n_input;
-  const int output_step = n_batch * output->dims->data[2];
-  for (int t = 0; t < max_time; t++) {
-    // If this is the forward_sequence, step forward, otherwise step backwards.
-    const int t_rel = forward_sequence ? t : max_time - t - 1;
-    const float* input_ptr = input->data.f + t_rel * input_step;
-    float* output_ptr = output->data.f + t_rel * output_step + output_offset;
-
-    kernel_utils::LstmStepWithAuxInput(
-        input_ptr, input_to_input_weights_ptr, input_to_input_weights_scale,
-        input_to_forget_weights_ptr, input_to_forget_weights_scale,
-        input_to_cell_weights_ptr, input_to_cell_weights_scale,
-        input_to_output_weights_ptr, input_to_output_weights_scale,
-        aux_input_ptr, aux_input_to_input_weights_ptr,
-        aux_input_to_input_weights_scale, aux_input_to_forget_weights_ptr,
-        aux_input_to_forget_weights_scale, aux_input_to_cell_weights_ptr,
-        aux_input_to_cell_weights_scale, aux_input_to_output_weights_ptr,
-        aux_input_to_output_weights_scale, recurrent_to_input_weights_ptr,
-        recurrent_to_input_weights_scale, recurrent_to_forget_weights_ptr,
-        recurrent_to_forget_weights_scale, recurrent_to_cell_weights_ptr,
-        recurrent_to_cell_weights_scale, recurrent_to_output_weights_ptr,
-        recurrent_to_output_weights_scale, cell_to_input_weights_ptr,
-        cell_to_input_weights_scale, cell_to_forget_weights_ptr,
-        cell_to_forget_weights_scale, cell_to_output_weights_ptr,
-        cell_to_output_weights_scale, input_gate_bias_ptr, forget_gate_bias_ptr,
-        cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-        projection_weights_scale, projection_bias_ptr, params, n_batch, n_cell,
-        n_input, aux_input_size, n_output, input_gate_scratch,
-        forget_gate_scratch, cell_scratch, output_gate_scratch,
-        scaling_factors_ptr, prod_scaling_factors_ptr,
-        recovered_cell_weights_ptr, quantized_input_ptr,
-        quantized_aux_input_ptr, quantized_output_state_ptr,
-        quantized_cell_state_ptr, output_state_ptr, cell_state_ptr, output_ptr);
-  }
-
-  return kTfLiteOk;
-}
-
-// The LSTM Op engine.
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
-      node->builtin_data);
-
-  // Input tensor.
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-
-  // Tensors for the forward cell.
-  const TfLiteTensor* fw_input_to_input_weights =
-      GetOptionalInputTensor(context, node, kFwInputToInputWeightsTensor);
-  const TfLiteTensor* fw_input_to_forget_weights =
-      GetInput(context, node, kFwInputToForgetWeightsTensor);
-  const TfLiteTensor* fw_input_to_cell_weights =
-      GetInput(context, node, kFwInputToCellWeightsTensor);
-  const TfLiteTensor* fw_input_to_output_weights =
-      GetInput(context, node, kFwInputToOutputWeightsTensor);
-
-  const TfLiteTensor* fw_recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kFwRecurrentToInputWeightsTensor);
-  const TfLiteTensor* fw_recurrent_to_forget_weights =
-      GetInput(context, node, kFwRecurrentToForgetWeightsTensor);
-  const TfLiteTensor* fw_recurrent_to_cell_weights =
-      GetInput(context, node, kFwRecurrentToCellWeightsTensor);
-  const TfLiteTensor* fw_recurrent_to_output_weights =
-      GetInput(context, node, kFwRecurrentToOutputWeightsTensor);
-
-  const TfLiteTensor* fw_cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kFwCellToInputWeightsTensor);
-  const TfLiteTensor* fw_cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kFwCellToForgetWeightsTensor);
-  const TfLiteTensor* fw_cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kFwCellToOutputWeightsTensor);
-
-  const TfLiteTensor* fw_input_gate_bias =
-      GetOptionalInputTensor(context, node, kFwInputGateBiasTensor);
-  const TfLiteTensor* fw_forget_gate_bias =
-      GetInput(context, node, kFwForgetGateBiasTensor);
-  const TfLiteTensor* fw_cell_bias =
-      GetInput(context, node, kFwCellGateBiasTensor);
-  const TfLiteTensor* fw_output_gate_bias =
-      GetInput(context, node, kFwOutputGateBiasTensor);
-
-  const TfLiteTensor* fw_projection_weights =
-      GetOptionalInputTensor(context, node, kFwProjectionWeightsTensor);
-  const TfLiteTensor* fw_projection_bias =
-      GetOptionalInputTensor(context, node, kFwProjectionBiasTensor);
-
-  TfLiteTensor* fw_activation_state =
-      GetVariableInput(context, node, kFwInputActivationStateTensor);
-  TfLiteTensor* fw_cell_state =
-      GetVariableInput(context, node, kFwInputCellStateTensor);
-  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
-
-  // Tensors for the backward cell.
-  const TfLiteTensor* bw_input_to_input_weights =
-      GetOptionalInputTensor(context, node, kBwInputToInputWeightsTensor);
-  const TfLiteTensor* bw_input_to_forget_weights =
-      GetInput(context, node, kBwInputToForgetWeightsTensor);
-  const TfLiteTensor* bw_input_to_cell_weights =
-      GetInput(context, node, kBwInputToCellWeightsTensor);
-  const TfLiteTensor* bw_input_to_output_weights =
-      GetInput(context, node, kBwInputToOutputWeightsTensor);
-
-  const TfLiteTensor* bw_recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kBwRecurrentToInputWeightsTensor);
-  const TfLiteTensor* bw_recurrent_to_forget_weights =
-      GetInput(context, node, kBwRecurrentToForgetWeightsTensor);
-  const TfLiteTensor* bw_recurrent_to_cell_weights =
-      GetInput(context, node, kBwRecurrentToCellWeightsTensor);
-  const TfLiteTensor* bw_recurrent_to_output_weights =
-      GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
-
-  const TfLiteTensor* bw_cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kBwCellToInputWeightsTensor);
-  const TfLiteTensor* bw_cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kBwCellToForgetWeightsTensor);
-  const TfLiteTensor* bw_cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kBwCellToOutputWeightsTensor);
-
-  const TfLiteTensor* bw_input_gate_bias =
-      GetOptionalInputTensor(context, node, kBwInputGateBiasTensor);
-  const TfLiteTensor* bw_forget_gate_bias =
-      GetInput(context, node, kBwForgetGateBiasTensor);
-  const TfLiteTensor* bw_cell_bias =
-      GetInput(context, node, kBwCellGateBiasTensor);
-  const TfLiteTensor* bw_output_gate_bias =
-      GetInput(context, node, kBwOutputGateBiasTensor);
-
-  const TfLiteTensor* bw_projection_weights =
-      GetOptionalInputTensor(context, node, kBwProjectionWeightsTensor);
-  const TfLiteTensor* bw_projection_bias =
-      GetOptionalInputTensor(context, node, kBwProjectionBiasTensor);
-
-  // State tensors.
-  TfLiteTensor* bw_activation_state =
-      GetVariableInput(context, node, kBwInputActivationStateTensor);
-  TfLiteTensor* bw_cell_state =
-      GetVariableInput(context, node, kBwInputCellStateTensor);
-  TfLiteTensor* bw_output = params->merge_outputs
-                                ? nullptr
-                                : GetOutput(context, node, kBwOutputTensor);
-
-  // Temporary tensors.
-  TfLiteTensor* fw_scratch_buffer =
-      GetTemporary(context, node, kFwScratchBuffer);
-  TfLiteTensor* bw_scratch_buffer =
-      GetTemporary(context, node, kBwScratchBuffer);
-
-  // (Optional) auxiliary inputs.
-  const TfLiteTensor* aux_input =
-      GetOptionalInputTensor(context, node, kAuxInputTensor);
-  const TfLiteTensor* fw_aux_input_to_input_weights =
-      GetOptionalInputTensor(context, node, kFwAuxInputToInputWeightsTensor);
-  const TfLiteTensor* fw_aux_input_to_forget_weights =
-      GetOptionalInputTensor(context, node, kFwAuxInputToForgetWeightsTensor);
-  const TfLiteTensor* fw_aux_input_to_cell_weights =
-      GetOptionalInputTensor(context, node, kFwAuxInputToCellWeightsTensor);
-  const TfLiteTensor* fw_aux_input_to_output_weights =
-      GetOptionalInputTensor(context, node, kFwAuxInputToOutputWeightsTensor);
-  const TfLiteTensor* bw_aux_input_to_input_weights =
-      GetOptionalInputTensor(context, node, kBwAuxInputToInputWeightsTensor);
-  const TfLiteTensor* bw_aux_input_to_forget_weights =
-      GetOptionalInputTensor(context, node, kBwAuxInputToForgetWeightsTensor);
-  const TfLiteTensor* bw_aux_input_to_cell_weights =
-      GetOptionalInputTensor(context, node, kBwAuxInputToCellWeightsTensor);
-  const TfLiteTensor* bw_aux_input_to_output_weights =
-      GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
-
-  // Populate a TfLiteLSTMParams struct for the evaluation functions.
-  TfLiteLSTMParams lstm_params = {params->activation, params->cell_clip,
-                                  params->proj_clip, kTfLiteLSTMFullKernel};
-
-  const int bw_output_offset =
-      params->merge_outputs ? fw_recurrent_to_output_weights->dims->data[1] : 0;
-  const auto actual_bw_output = params->merge_outputs ? fw_output : bw_output;
-
-  switch (fw_input_to_output_weights->type) {
-    case kTfLiteFloat32: {
-      TfLiteStatus fw_pass_status = EvalFloat(
-          input, fw_input_to_input_weights, fw_input_to_forget_weights,
-          fw_input_to_cell_weights, fw_input_to_output_weights,
-          fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights,
-          fw_recurrent_to_cell_weights, fw_recurrent_to_output_weights,
-          fw_cell_to_input_weights, fw_cell_to_forget_weights,
-          fw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights,
-          fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
-          fw_aux_input_to_output_weights, fw_input_gate_bias,
-          fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias,
-          fw_projection_weights, fw_projection_bias, &lstm_params,
-          /*forward_sequence=*/true, /*output_offset=*/0, fw_scratch_buffer,
-          fw_activation_state, fw_cell_state, fw_output);
-      TF_LITE_ENSURE_OK(context, fw_pass_status);
-
-      TfLiteStatus bw_pass_status = EvalFloat(
-          input, bw_input_to_input_weights, bw_input_to_forget_weights,
-          bw_input_to_cell_weights, bw_input_to_output_weights,
-          bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
-          bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
-          bw_cell_to_input_weights, bw_cell_to_forget_weights,
-          bw_cell_to_output_weights, aux_input, bw_aux_input_to_input_weights,
-          bw_aux_input_to_forget_weights, bw_aux_input_to_cell_weights,
-          bw_aux_input_to_output_weights, bw_input_gate_bias,
-          bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias,
-          bw_projection_weights, bw_projection_bias, &lstm_params,
-          /*forward_sequence=*/false, bw_output_offset, bw_scratch_buffer,
-          bw_activation_state, bw_cell_state, actual_bw_output);
-      TF_LITE_ENSURE_OK(context, bw_pass_status);
-      return kTfLiteOk;
-    }
-    case kTfLiteUInt8: {
-      TfLiteTensor* input_quantized =
-          GetTemporary(context, node, kInputQuantized);
-      TfLiteTensor* aux_input_quantized =
-          GetTemporary(context, node, kAuxInputQuantized);
-      TfLiteTensor* fw_activation_state_quantized =
-          GetTemporary(context, node, kFwActivationStateQuantized);
-      TfLiteTensor* bw_activation_state_quantized =
-          GetTemporary(context, node, kBwActivationStateQuantized);
-      TfLiteTensor* fw_cell_state_quantized =
-          GetTemporary(context, node, kFwCellStateQuantized);
-      TfLiteTensor* bw_cell_state_quantized =
-          GetTemporary(context, node, kBwCellStateQuantized);
-      TfLiteTensor* scaling_factors =
-          GetTemporary(context, node, kScalingFactors);
-      TfLiteTensor* prod_scaling_factors =
-          GetTemporary(context, node, kProductScalingFactors);
-      TfLiteTensor* recovered_cell_weights =
-          GetTemporary(context, node, kRecoveredCellWeights);
-
-      TfLiteStatus fw_pass_status = EvalHybrid(
-          input, fw_input_to_input_weights, fw_input_to_forget_weights,
-          fw_input_to_cell_weights, fw_input_to_output_weights,
-          fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights,
-          fw_recurrent_to_cell_weights, fw_recurrent_to_output_weights,
-          fw_cell_to_input_weights, fw_cell_to_forget_weights,
-          fw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights,
-          fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
-          fw_aux_input_to_output_weights, fw_input_gate_bias,
-          fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias,
-          fw_projection_weights, fw_projection_bias, &lstm_params,
-          /*forward_sequence=*/true, /*output_offset=*/0, fw_scratch_buffer,
-          scaling_factors, prod_scaling_factors, recovered_cell_weights,
-          input_quantized, aux_input_quantized, fw_activation_state_quantized,
-          fw_cell_state_quantized, fw_activation_state, fw_cell_state,
-          fw_output);
-      TF_LITE_ENSURE_OK(context, fw_pass_status);
-
-      TfLiteStatus bw_pass_status = EvalHybrid(
-          input, bw_input_to_input_weights, bw_input_to_forget_weights,
-          bw_input_to_cell_weights, bw_input_to_output_weights,
-          bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
-          bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
-          bw_cell_to_input_weights, bw_cell_to_forget_weights,
-          bw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights,
-          fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
-          fw_aux_input_to_output_weights, bw_input_gate_bias,
-          bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias,
-          bw_projection_weights, bw_projection_bias, &lstm_params,
-          /*forward_sequence=*/false, bw_output_offset, bw_scratch_buffer,
-          scaling_factors, prod_scaling_factors, recovered_cell_weights,
-          input_quantized, aux_input_quantized, bw_activation_state_quantized,
-          bw_cell_state_quantized, bw_activation_state, bw_cell_state,
-          actual_bw_output);
-      TF_LITE_ENSURE_OK(context, bw_pass_status);
-      return kTfLiteOk;
-    }
-    default:
-      context->ReportError(context, "Type %d is not currently supported.",
-                           fw_input_to_output_weights->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace bidirectional_sequence_lstm
-
-TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_LSTM() {
-  static TfLiteRegistration r = {
-      bidirectional_sequence_lstm::Init, bidirectional_sequence_lstm::Free,
-      bidirectional_sequence_lstm::Prepare, bidirectional_sequence_lstm::Eval};
-  return &r;
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
deleted file mode 100644
index c22a457a71d928a139877c78cd39bd3c2ec7e283..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
+++ /dev/null
@@ -1,541 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <cstdio>
-#include <iostream>
-#include <limits>
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace bidirectional_sequence_rnn {
-
-constexpr int kInputTensor = 0;
-// Forward and backward cell tensors.
-constexpr int kFwWeightsTensor = 1;
-constexpr int kFwRecurrentWeightsTensor = 2;
-constexpr int kFwBiasTensor = 3;
-constexpr int kFwHiddenStateTensor = 4;
-constexpr int kBwWeightsTensor = 5;
-constexpr int kBwRecurrentWeightsTensor = 6;
-constexpr int kBwBiasTensor = 7;
-constexpr int kBwHiddenStateTensor = 8;
-// Auxiliary inputs.
-constexpr int kAuxInputTensor = 9;       // Optional.
-constexpr int kFwAuxWeightsTensor = 10;  // Optional.
-constexpr int kBwAuxWeightsTensor = 11;  // Optional.
-// Output tensors.
-constexpr int kFwOutputTensor = 0;
-constexpr int kBwOutputTensor = 1;  // Only if merge_outputs is false.
-
-// Temporary tensors.
-enum TemporaryTensor {
-  kInputQuantized = 0,
-  kFwHiddenStateQuantized = 1,
-  kBwHiddenStateQuantized = 2,
-  kScalingFactors = 3,
-  kAuxInputQuantized = 4,
-  kNumTemporaryTensors = 5
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* scratch_tensor_index = new int;
-  context->AddTensors(context, kNumTemporaryTensors, scratch_tensor_index);
-  return scratch_tensor_index;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<int*>(buffer);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceRNNParams*>(
-      node->builtin_data);
-
-  // Check we have all the inputs and outputs we need.
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 12);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size,
-                    params->merge_outputs ? 1 : 2);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* fw_input_weights =
-      GetInput(context, node, kFwWeightsTensor);
-  const TfLiteTensor* fw_recurrent_weights =
-      GetInput(context, node, kFwRecurrentWeightsTensor);
-  const TfLiteTensor* fw_bias = GetInput(context, node, kFwBiasTensor);
-  const TfLiteTensor* fw_hidden_state =
-      GetInput(context, node, kFwHiddenStateTensor);
-  const TfLiteTensor* bw_input_weights =
-      GetInput(context, node, kBwWeightsTensor);
-  const TfLiteTensor* bw_recurrent_weights =
-      GetInput(context, node, kBwRecurrentWeightsTensor);
-  const TfLiteTensor* bw_bias = GetInput(context, node, kBwBiasTensor);
-  const TfLiteTensor* bw_hidden_state =
-      GetInput(context, node, kBwHiddenStateTensor);
-
-  const TfLiteTensor* aux_input =
-      GetOptionalInputTensor(context, node, kAuxInputTensor);
-  const TfLiteTensor* fw_aux_input_weights =
-      GetOptionalInputTensor(context, node, kFwAuxWeightsTensor);
-  const TfLiteTensor* bw_aux_input_weights =
-      GetOptionalInputTensor(context, node, kBwAuxWeightsTensor);
-
-  const bool aux_inputs_all_or_none =
-      ((aux_input != nullptr) && (fw_aux_input_weights != nullptr) &&
-       (bw_aux_input_weights != nullptr)) ||
-      ((aux_input == nullptr) && (fw_aux_input_weights == nullptr) &&
-       (bw_aux_input_weights == nullptr));
-  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
-  const bool has_aux_input = (aux_input != nullptr);
-
-  // Check all the parameters of tensor match within themselves and match the
-  // input configuration.
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-
-  TF_LITE_ENSURE_EQ(context, input->dims->size, 3);
-  const int batch_size = input->dims->data[0];
-  const int max_time = input->dims->data[1];
-  const int fw_num_units = fw_input_weights->dims->data[0];
-  const int bw_num_units = bw_input_weights->dims->data[0];
-  TF_LITE_ASSERT_EQ(input->dims->data[2], fw_input_weights->dims->data[1]);
-  TF_LITE_ASSERT_EQ(input->dims->data[2], bw_input_weights->dims->data[1]);
-  TF_LITE_ASSERT_EQ(fw_input_weights->dims->data[0], fw_bias->dims->data[0]);
-  TF_LITE_ASSERT_EQ(bw_input_weights->dims->data[0], bw_bias->dims->data[0]);
-  TF_LITE_ASSERT_EQ(fw_recurrent_weights->dims->data[0],
-                    fw_bias->dims->data[0]);
-  TF_LITE_ASSERT_EQ(bw_recurrent_weights->dims->data[1],
-                    bw_bias->dims->data[0]);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(fw_hidden_state), 2);
-  TF_LITE_ENSURE_EQ(context, fw_hidden_state->dims->data[0], batch_size);
-  TF_LITE_ENSURE_EQ(context, fw_hidden_state->dims->data[1], fw_num_units);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(bw_hidden_state), 2);
-  TF_LITE_ENSURE_EQ(context, bw_hidden_state->dims->data[0], batch_size);
-  TF_LITE_ENSURE_EQ(context, bw_hidden_state->dims->data[1], bw_num_units);
-
-  if (has_aux_input) {
-    // Check that aux_input has the same dimensions (except last) as the input.
-    TF_LITE_ASSERT_EQ(aux_input->dims->data[0], input->dims->data[0]);
-    TF_LITE_ASSERT_EQ(aux_input->dims->data[1], input->dims->data[1]);
-    // Check that aux_input_weights has the same dimensions (except last) as
-    // the input_weights.
-    TF_LITE_ASSERT_EQ(fw_aux_input_weights->dims->data[0], fw_num_units);
-    TF_LITE_ASSERT_EQ(bw_aux_input_weights->dims->data[0], bw_num_units);
-    TF_LITE_ASSERT_EQ(aux_input->dims->data[2],
-                      fw_aux_input_weights->dims->data[1]);
-    TF_LITE_ASSERT_EQ(aux_input->dims->data[2],
-                      bw_aux_input_weights->dims->data[1]);
-  }
-
-  const bool is_hybrid_op =
-      (fw_input_weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32);
-
-  if (is_hybrid_op) {
-    int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
-
-    TfLiteIntArrayFree(node->temporaries);
-    if (has_aux_input) {
-      node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
-    } else {
-      // No need to create a temporary tensor for the non-existent aux_input.
-      node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors - 1);
-    }
-
-    node->temporaries->data[kInputQuantized] =
-        *scratch_tensor_index + kInputQuantized;
-    TfLiteTensor* input_quantized =
-        GetTemporary(context, node, kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-
-    node->temporaries->data[kFwHiddenStateQuantized] =
-        *scratch_tensor_index + kFwHiddenStateQuantized;
-    TfLiteTensor* fw_hidden_state_quantized =
-        GetTemporary(context, node, kFwHiddenStateQuantized);
-    fw_hidden_state_quantized->type = kTfLiteUInt8;
-    fw_hidden_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(fw_hidden_state_quantized->dims,
-                             fw_hidden_state->dims)) {
-      TfLiteIntArray* fw_hidden_state_quantized_size =
-          TfLiteIntArrayCopy(fw_hidden_state->dims);
-      TF_LITE_ENSURE_OK(
-          context, context->ResizeTensor(context, fw_hidden_state_quantized,
-                                         fw_hidden_state_quantized_size));
-    }
-
-    node->temporaries->data[kBwHiddenStateQuantized] =
-        *scratch_tensor_index + kBwHiddenStateQuantized;
-    TfLiteTensor* bw_hidden_state_quantized =
-        GetTemporary(context, node, kBwHiddenStateQuantized);
-    bw_hidden_state_quantized->type = kTfLiteUInt8;
-    bw_hidden_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(bw_hidden_state_quantized->dims,
-                             bw_hidden_state->dims)) {
-      TfLiteIntArray* bw_hidden_state_quantized_size =
-          TfLiteIntArrayCopy(bw_hidden_state->dims);
-      TF_LITE_ENSURE_OK(
-          context, context->ResizeTensor(context, bw_hidden_state_quantized,
-                                         bw_hidden_state_quantized_size));
-    }
-
-    // Allocate temporary tensors to store scaling factors of quantization.
-    node->temporaries->data[kScalingFactors] =
-        *scratch_tensor_index + kScalingFactors;
-    TfLiteTensor* scaling_factors =
-        GetTemporary(context, node, kScalingFactors);
-    scaling_factors->type = kTfLiteFloat32;
-    scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-    scaling_factors_size->data[0] = batch_size;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
-                                                       scaling_factors_size));
-    }
-
-    if (has_aux_input) {
-      node->temporaries->data[kAuxInputQuantized] =
-          *scratch_tensor_index + kAuxInputQuantized;
-      TfLiteTensor* aux_input_quantized =
-          GetTemporary(context, node, kAuxInputQuantized);
-      aux_input_quantized->type = kTfLiteUInt8;
-      aux_input_quantized->allocation_type = kTfLiteArenaRw;
-      if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) {
-        TfLiteIntArray* aux_input_quantized_size =
-            TfLiteIntArrayCopy(aux_input->dims);
-        TF_LITE_ENSURE_OK(context,
-                          context->ResizeTensor(context, aux_input_quantized,
-                                                aux_input_quantized_size));
-      }
-    }
-  }
-
-  // Resize outputs.
-  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
-  TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3);
-  fw_output_size_array->data[0] = batch_size;
-  fw_output_size_array->data[1] = max_time;
-  fw_output_size_array->data[2] =
-      params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units;
-  TF_LITE_ENSURE_OK(
-      context, context->ResizeTensor(context, fw_output, fw_output_size_array));
-  if (!params->merge_outputs) {
-    TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
-    TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3);
-    bw_output_size_array->data[0] = batch_size;
-    bw_output_size_array->data[1] = max_time;
-    bw_output_size_array->data[2] = bw_num_units;
-    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_output,
-                                                     bw_output_size_array));
-  }
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
-    const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
-    const TfLiteTensor* bw_input_weights,
-    const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
-    const TfLiteTensor* aux_input, const TfLiteTensor* fw_aux_input_weights,
-    const TfLiteTensor* bw_aux_input_weights,
-    const TfLiteBidirectionalSequenceRNNParams* params,
-    TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
-    TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
-  const int batch_size = input->dims->data[0];
-  const int max_time = input->dims->data[1];
-  const int input_size = input->dims->data[2];
-  const int aux_input_size = (aux_input) ? aux_input->dims->data[2] : 0;
-
-  const int fw_num_units = fw_input_weights->dims->data[0];
-  const float* fw_bias_ptr = fw_bias->data.f;
-  const float* fw_input_weights_ptr = fw_input_weights->data.f;
-  const float* fw_recurrent_weights_ptr = fw_recurrent_weights->data.f;
-
-  const int bw_num_units = bw_input_weights->dims->data[0];
-  const float* bw_bias_ptr = bw_bias->data.f;
-  const float* bw_input_weights_ptr = bw_input_weights->data.f;
-  const float* bw_recurrent_weights_ptr = bw_recurrent_weights->data.f;
-
-  const float* fw_aux_input_weights_ptr = (fw_aux_input_weights != nullptr)
-                                              ? fw_aux_input_weights->data.f
-                                              : nullptr;
-  const float* bw_aux_input_weights_ptr = (bw_aux_input_weights != nullptr)
-                                              ? bw_aux_input_weights->data.f
-                                              : nullptr;
-
-  const int fw_output_step =
-      params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units;
-  const int bw_output_step =
-      params->merge_outputs ? fw_num_units + bw_num_units : bw_num_units;
-  for (int b = 0; b < batch_size; b++) {
-    // Forward cell.
-    float* fw_hidden_state_ptr_batch =
-        fw_hidden_state->data.f + b * fw_num_units;
-    float* fw_output_offset = fw_output->data.f + b * fw_output_step * max_time;
-    for (int s = 0; s < max_time; s++) {
-      const float* input_ptr_batch =
-          input->data.f + b * input_size * max_time + s * input_size;
-      const float* aux_input_ptr_batch =
-          (aux_input != nullptr)
-              ? aux_input->data.f + b * input_size * max_time + s * input_size
-              : nullptr;
-      float* output_ptr_batch = fw_output_offset + s * fw_output_step;
-
-      kernel_utils::RnnBatchStep(
-          input_ptr_batch, fw_input_weights_ptr, aux_input_ptr_batch,
-          fw_aux_input_weights_ptr, fw_recurrent_weights_ptr, fw_bias_ptr,
-          input_size, aux_input_size, fw_num_units, /*batch_size=*/1,
-          params->activation, fw_hidden_state_ptr_batch, output_ptr_batch);
-    }
-    // Backward cell.
-    float* bw_hidden_state_ptr_batch =
-        bw_hidden_state->data.f + b * bw_num_units;
-    float* bw_output_offset =
-        params->merge_outputs
-            ? fw_output->data.f + b * bw_output_step * max_time + fw_num_units
-            : bw_output->data.f + b * bw_output_step * max_time;
-    for (int s = max_time - 1; s >= 0; s--) {
-      const float* input_ptr_batch =
-          input->data.f + b * input_size * max_time + s * input_size;
-      const float* aux_input_ptr_batch =
-          (aux_input != nullptr)
-              ? aux_input->data.f + b * input_size * max_time + s * input_size
-              : nullptr;
-      float* output_ptr_batch = bw_output_offset + s * bw_output_step;
-
-      kernel_utils::RnnBatchStep(
-          input_ptr_batch, bw_input_weights_ptr, aux_input_ptr_batch,
-          bw_aux_input_weights_ptr, bw_recurrent_weights_ptr, bw_bias_ptr,
-          input_size, aux_input_size, bw_num_units, /*batch_size=*/1,
-          params->activation, bw_hidden_state_ptr_batch, output_ptr_batch);
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
-    const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
-    const TfLiteTensor* bw_input_weights,
-    const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
-    const TfLiteTensor* aux_input, const TfLiteTensor* aux_fw_input_weights,
-    const TfLiteTensor* aux_bw_input_weights,
-    const TfLiteBidirectionalSequenceRNNParams* params,
-    TfLiteTensor* scaling_factors, TfLiteTensor* input_quantized,
-    TfLiteTensor* aux_input_quantized, TfLiteTensor* fw_hidden_state_quantized,
-    TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
-    TfLiteTensor* bw_hidden_state_quantized, TfLiteTensor* bw_hidden_state,
-    TfLiteTensor* bw_output) {
-  const int batch_size = input->dims->data[0];
-  const int max_time = input->dims->data[1];
-  const int input_size = input->dims->data[2];
-  const int aux_input_size = (aux_input) ? aux_input->dims->data[2] : 0;
-
-  const int fw_num_units = fw_input_weights->dims->data[0];
-  const float* fw_bias_ptr = fw_bias->data.f;
-  const int8_t* fw_input_weights_ptr =
-      reinterpret_cast<const int8_t*>(fw_input_weights->data.uint8);
-  float fw_input_weights_scale = fw_input_weights->params.scale;
-  const int8_t* fw_recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(fw_recurrent_weights->data.uint8);
-  float fw_recurrent_weights_scale = fw_recurrent_weights->params.scale;
-
-  const int bw_num_units = bw_input_weights->dims->data[0];
-  const float* bw_bias_ptr = bw_bias->data.f;
-  const int8_t* bw_input_weights_ptr =
-      reinterpret_cast<const int8_t*>(bw_input_weights->data.uint8);
-  float bw_input_weights_scale = bw_input_weights->params.scale;
-  const int8_t* bw_recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(bw_recurrent_weights->data.uint8);
-  float bw_recurrent_weights_scale = bw_recurrent_weights->params.scale;
-
-  // Set the auxiliary pointers and scales if needed.
-  int8_t* aux_fw_input_weights_ptr = nullptr;
-  float aux_fw_input_weights_scale = 0.0f;
-  int8_t* aux_bw_input_weights_ptr = nullptr;
-  float aux_bw_input_weights_scale = 0.0f;
-  int8_t* aux_quantized_input_ptr = nullptr;
-  if (aux_input_size > 0) {
-    aux_fw_input_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_fw_input_weights->data.uint8);
-    aux_fw_input_weights_scale = aux_fw_input_weights->params.scale;
-    aux_bw_input_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_bw_input_weights->data.uint8);
-    aux_bw_input_weights_scale = aux_bw_input_weights->params.scale;
-    aux_quantized_input_ptr = reinterpret_cast<int8_t*>(aux_input_quantized);
-  }
-
-  // Initialize temporary storage for quantized values.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  int8_t* fw_quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(fw_hidden_state_quantized->data.uint8);
-  int8_t* bw_quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(bw_hidden_state_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-
-  const int fw_output_step =
-      params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units;
-  const int bw_output_step =
-      params->merge_outputs ? fw_num_units + bw_num_units : bw_num_units;
-  for (int b = 0; b < batch_size; b++) {
-    // Forward cell.
-    float* fw_hidden_state_ptr_batch =
-        fw_hidden_state->data.f + b * fw_num_units;
-    float* fw_output_offset = fw_output->data.f + b * fw_output_step * max_time;
-    for (int s = 0; s < max_time; s++) {
-      const float* input_ptr_batch =
-          input->data.f + b * input_size * max_time + s * input_size;
-      const float* aux_input_ptr_batch =
-          (aux_input != nullptr)
-              ? aux_input->data.f + b * input_size * max_time + s * input_size
-              : nullptr;
-      float* output_ptr_batch = fw_output_offset + s * fw_output_step;
-
-      kernel_utils::RnnBatchStep(
-          input_ptr_batch, fw_input_weights_ptr, fw_input_weights_scale,
-          aux_input_ptr_batch, aux_fw_input_weights_ptr,
-          aux_fw_input_weights_scale, fw_recurrent_weights_ptr,
-          fw_recurrent_weights_scale, fw_bias_ptr, input_size, aux_input_size,
-          fw_num_units, /*batch_size=*/1, params->activation,
-          quantized_input_ptr, aux_quantized_input_ptr,
-          fw_quantized_hidden_state_ptr, scaling_factors_ptr,
-          fw_hidden_state_ptr_batch, output_ptr_batch);
-    }
-    // Backward cell.
-    float* bw_hidden_state_ptr_batch =
-        bw_hidden_state->data.f + b * bw_num_units;
-    float* bw_output_offset =
-        params->merge_outputs
-            ? fw_output->data.f + b * bw_output_step * max_time
-            : bw_output->data.f + b * bw_output_step * max_time;
-    for (int s = max_time - 1; s >= 0; s--) {
-      const float* input_ptr_batch =
-          input->data.f + b * input_size * max_time + s * input_size;
-      const float* aux_input_ptr_batch =
-          (aux_input != nullptr)
-              ? aux_input->data.f + b * input_size * max_time + s * input_size
-              : nullptr;
-      float* output_ptr_batch = bw_output_offset + s * bw_output_step;
-
-      kernel_utils::RnnBatchStep(
-          input_ptr_batch, bw_input_weights_ptr, bw_input_weights_scale,
-          aux_input_ptr_batch, aux_bw_input_weights_ptr,
-          aux_bw_input_weights_scale, bw_recurrent_weights_ptr,
-          bw_recurrent_weights_scale, bw_bias_ptr, input_size, aux_input_size,
-          bw_num_units, /*batch_size=*/1, params->activation,
-          quantized_input_ptr, aux_quantized_input_ptr,
-          bw_quantized_hidden_state_ptr, scaling_factors_ptr,
-          bw_hidden_state_ptr_batch, output_ptr_batch);
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceRNNParams*>(
-      node->builtin_data);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* fw_input_weights =
-      GetInput(context, node, kFwWeightsTensor);
-  const TfLiteTensor* fw_recurrent_weights =
-      GetInput(context, node, kFwRecurrentWeightsTensor);
-  const TfLiteTensor* fw_bias = GetInput(context, node, kFwBiasTensor);
-  const TfLiteTensor* bw_input_weights =
-      GetInput(context, node, kBwWeightsTensor);
-  const TfLiteTensor* bw_recurrent_weights =
-      GetInput(context, node, kBwRecurrentWeightsTensor);
-  const TfLiteTensor* bw_bias = GetInput(context, node, kBwBiasTensor);
-
-  // Get auxiliary inputs.
-  const TfLiteTensor* aux_input =
-      GetOptionalInputTensor(context, node, kAuxInputTensor);
-  const TfLiteTensor* fw_aux_input_weights =
-      GetOptionalInputTensor(context, node, kFwAuxWeightsTensor);
-  const TfLiteTensor* bw_aux_input_weights =
-      GetOptionalInputTensor(context, node, kBwAuxWeightsTensor);
-
-  TfLiteTensor* fw_hidden_state =
-      GetVariableInput(context, node, kFwHiddenStateTensor);
-  TfLiteTensor* bw_hidden_state =
-      GetVariableInput(context, node, kBwHiddenStateTensor);
-
-  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
-  TfLiteTensor* bw_output = params->merge_outputs
-                                ? nullptr
-                                : GetOutput(context, node, kBwOutputTensor);
-
-  switch (fw_input_weights->type) {
-    case kTfLiteFloat32:
-      return EvalFloat(input, fw_input_weights, fw_recurrent_weights, fw_bias,
-                       bw_input_weights, bw_recurrent_weights, bw_bias,
-                       aux_input, fw_aux_input_weights, bw_aux_input_weights,
-                       params, fw_hidden_state, fw_output, bw_hidden_state,
-                       bw_output);
-    case kTfLiteUInt8: {
-      TfLiteTensor* input_quantized =
-          GetTemporary(context, node, kInputQuantized);
-      TfLiteTensor* fw_hidden_state_quantized =
-          GetTemporary(context, node, kFwHiddenStateQuantized);
-      TfLiteTensor* bw_hidden_state_quantized =
-          GetTemporary(context, node, kBwHiddenStateQuantized);
-      TfLiteTensor* scaling_factors =
-          GetTemporary(context, node, kScalingFactors);
-      TfLiteTensor* aux_input_quantized =
-          (aux_input != nullptr)
-              ? GetTemporary(context, node, kAuxInputQuantized)
-              : nullptr;
-
-      return EvalHybrid(input, fw_input_weights, fw_recurrent_weights, fw_bias,
-                        bw_input_weights, bw_recurrent_weights, bw_bias,
-                        aux_input, fw_aux_input_weights, bw_aux_input_weights,
-                        params, scaling_factors, input_quantized,
-                        aux_input_quantized, fw_hidden_state_quantized,
-                        fw_hidden_state, fw_output, bw_hidden_state_quantized,
-                        bw_hidden_state, bw_output);
-    }
-    default:
-      context->ReportError(context, "Type not currently supported.");
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace bidirectional_sequence_rnn
-
-TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN() {
-  static TfLiteRegistration r = {
-      bidirectional_sequence_rnn::Init, bidirectional_sequence_rnn::Free,
-      bidirectional_sequence_rnn::Prepare, bidirectional_sequence_rnn::Eval};
-  return &r;
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
deleted file mode 100644
index 19958844a1af876bf26251d5ef3ff249a087ffcc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc
+++ /dev/null
@@ -1,317 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/kernels/padding.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace depthwise_conv {
-
-constexpr int kInputTensor = 0;
-constexpr int kFilterTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-
-// This file has three implementation of DepthwiseConv.
-enum KernelType {
-  kReference,
-  kGenericOptimized,  // Neon-free
-  kNeonOptimized,
-};
-
-struct OpData {
-  TfLitePaddingValues padding;
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-  // The range of the fused activation layer. For example for kNone and
-  // uint8_t these would be 0 and 255.
-  int32_t output_activation_min;
-  int32_t output_activation_max;
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  // This is a builtin op, so we don't use the contents in 'buffer', if any.
-  // Instead, we allocate a new object to carry information from Prepare() to
-  // Eval().
-  return new OpData;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
-  OpData* data = reinterpret_cast<OpData*>(node->user_data);
-
-  // TODO(ahentz): use could use GetOptionalInputTensor() here, but we need to
-  // decide whether we are OK with optional tensors being completely absent, as
-  // opposed to having -1 as their index.
-  bool hasBias = NumInputs(node) == 3;
-
-  TF_LITE_ENSURE(context, hasBias || NumInputs(node) == 2);
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  const TfLiteTensor* bias = nullptr;
-
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 4);
-
-  // The parameter 'depth_multiplier' is redundant, so we check here to make
-  // sure it is consistent with the given dimensions.
-  TF_LITE_ENSURE_EQ(context,
-                    params->depth_multiplier * SizeOfDimension(input, 3),
-                    SizeOfDimension(filter, 3));
-
-  const TfLiteType data_type = input->type;
-  TF_LITE_ENSURE(context,
-                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
-  TF_LITE_ENSURE_EQ(context, output->type, data_type);
-  TF_LITE_ENSURE_EQ(context, filter->type, data_type);
-
-  if (hasBias) {
-    bias = GetInput(context, node, kBiasTensor);
-    if (data_type == kTfLiteUInt8) {
-      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
-      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
-    } else {
-      TF_LITE_ENSURE_EQ(context, bias->type, data_type);
-    }
-    TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1);
-    TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 3),
-                      SizeOfDimension(bias, 0));
-  }
-
-  int channels_out = SizeOfDimension(filter, 3);
-  int width = SizeOfDimension(input, 2);
-  int height = SizeOfDimension(input, 1);
-  int filter_width = SizeOfDimension(filter, 2);
-  int filter_height = SizeOfDimension(filter, 1);
-  int batches = SizeOfDimension(input, 0);
-
-  // Matching GetWindowedOutputSize in TensorFlow.
-  auto padding = params->padding;
-  auto compute_out_size = [padding](int image_size, int filter_size, int stride,
-                                    int dilation_rate) -> int {
-    int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
-    return padding == kTfLitePaddingSame
-               ? (image_size + stride - 1) / stride
-               : padding == kTfLitePaddingValid
-                     ? (image_size - effective_filter_size + stride) / stride
-                     : 0;
-  };
-
-  int out_width = compute_out_size(width, filter_width, params->stride_width,
-                                   params->dilation_width_factor);
-  int out_height =
-      compute_out_size(height, filter_height, params->stride_height,
-                       params->dilation_height_factor);
-
-  data->padding.height =
-      ComputePadding(params->stride_height, params->dilation_height_factor,
-                     height, filter_height, out_height);
-  data->padding.width =
-      ComputePadding(params->stride_width, params->dilation_width_factor, width,
-                     filter_width, out_width);
-
-  // Note that quantized inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
-  if (data_type != kTfLiteFloat32) {
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
-  }
-
-  TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4);
-  outputSize->data[0] = batches;
-  outputSize->data[1] = out_height;
-  outputSize->data[2] = out_width;
-  outputSize->data[3] = channels_out;
-  return context->ResizeTensor(context, output, outputSize);
-}
-
-template <KernelType kernel_type>
-void EvalFloat(TfLiteContext* context, TfLiteNode* node,
-               TfLiteDepthwiseConvParams* params, OpData* data,
-               const TfLiteTensor* input, const TfLiteTensor* filter,
-               const TfLiteTensor* bias, TfLiteTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
-                           &output_activation_max);
-
-  void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&,
-                         const float*, const RuntimeShape&, const float*,
-                         const RuntimeShape&, const float*, const RuntimeShape&,
-                         float*);
-  if (kernel_type == kReference) {
-    depthwise_conv = &reference_ops::DepthwiseConv;
-  } else {
-    depthwise_conv = &optimized_ops::DepthwiseConv;
-  }
-
-  DepthwiseParams op_params;
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.float_activation_min = output_activation_min;
-  op_params.float_activation_max = output_activation_max;
-  depthwise_conv(op_params, GetTensorShape(input), GetTensorData<float>(input),
-                 GetTensorShape(filter), GetTensorData<float>(filter),
-                 GetTensorShape(bias), GetTensorData<float>(bias),
-                 GetTensorShape(output), GetTensorData<float>(output));
-}
-
-template <KernelType kernel_type>
-void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                   TfLiteDepthwiseConvParams* params, OpData* data,
-                   const TfLiteTensor* input, const TfLiteTensor* filter,
-                   const TfLiteTensor* bias, TfLiteTensor* output) {
-  auto input_offset = -input->params.zero_point;
-  auto filter_offset = -filter->params.zero_point;
-  auto output_offset = output->params.zero_point;
-
-  void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&,
-                         const uint8*, const RuntimeShape&, const uint8*,
-                         const RuntimeShape&, const int32*, const RuntimeShape&,
-                         uint8*);
-
-  if (kernel_type == kReference) {
-    depthwise_conv = &reference_ops::DepthwiseConv;
-  } else {
-    depthwise_conv = &optimized_ops::DepthwiseConv;
-  }
-
-  DepthwiseParams op_params;
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = data->padding.width;
-  op_params.padding_values.height = data->padding.height;
-  op_params.stride_width = params->stride_width;
-  op_params.stride_height = params->stride_height;
-  op_params.dilation_width_factor = params->dilation_width_factor;
-  op_params.dilation_height_factor = params->dilation_height_factor;
-  op_params.depth_multiplier = params->depth_multiplier;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = data->output_multiplier;
-  op_params.output_shift = -data->output_shift;
-  op_params.quantized_activation_min = data->output_activation_min;
-  op_params.quantized_activation_max = data->output_activation_max;
-  depthwise_conv(op_params, GetTensorShape(input),
-                 GetTensorData<uint8_t>(input), GetTensorShape(filter),
-                 GetTensorData<uint8_t>(filter), GetTensorShape(bias),
-                 GetTensorData<int32_t>(bias), GetTensorShape(output),
-                 GetTensorData<uint8_t>(output));
-}
-
-template <KernelType kernel_type>
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
-  OpData* data = reinterpret_cast<OpData*>(node->user_data);
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
-  const TfLiteTensor* bias =
-      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
-
-  // TODO(aselle): Consider whether float conv and quantized conv should be
-  // separate ops to avoid dispatch overhead here.
-  switch (input->type) {  // Already know in/out types are same.
-    case kTfLiteFloat32:
-      EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
-                             output);
-      break;
-    case kTfLiteUInt8:
-      EvalQuantized<kernel_type>(context, node, params, data, input, filter,
-                                 bias, output);
-      break;
-    default:
-      context->ReportError(context, "Type %d not currently supported.",
-                           input->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace depthwise_conv
-
-TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF() {
-  static TfLiteRegistration r = {
-      depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
-      depthwise_conv::Eval<depthwise_conv::kReference>};
-  return &r;
-}
-
-TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT() {
-  static TfLiteRegistration r = {
-      depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
-      depthwise_conv::Eval<depthwise_conv::kGenericOptimized>};
-  return &r;
-}
-
-TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_NEON_OPT() {
-  static TfLiteRegistration r = {
-      depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
-      depthwise_conv::Eval<depthwise_conv::kNeonOptimized>};
-  return &r;
-}
-
-TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
-#ifdef USE_NEON
-  return Register_DEPTHWISE_CONVOLUTION_NEON_OPT();
-#else
-  return Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT();
-#endif
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
deleted file mode 100644
index 4a33a0319d0dc3fd56cd3a173518d4fe49ace3ec..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
+++ /dev/null
@@ -1,455 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <cstdarg>
-#include <gtest/gtest.h>
-#include "absl/memory/memory.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-
-namespace ops {
-namespace builtin {
-
-TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF();
-TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT();
-TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_NEON_OPT();
-
-}  // namespace builtin
-}  // namespace ops
-
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
- public:
-  // TODO(ahentz): Also test different activation types, bias, padding types,
-  // stride values.
-  BaseDepthwiseConvolutionOpModel(TfLiteRegistration* registration,
-                                  const TensorData& input,
-                                  const TensorData& filter,
-                                  const TensorData& output,
-                                  Padding padding_type,
-                                  int dilation_factor = 1) {
-    input_ = AddInput(input);
-    filter_ = AddInput(filter);
-
-    int bias_size = GetShape(filter_)[3];
-    if (input.type == TensorType_FLOAT32) {
-      bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
-    } else {
-      // This is a quantized version. The scale of 'bias' depends on the scales
-      // of input and filter. Supposedly this is correctly set during quantized
-      // training.
-      auto bias_scale = GetScale(input_) * GetScale(filter_);
-      TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
-      bias_ = AddInput(bias);
-    }
-
-    output_ = AddOutput(output);
-
-    int input_depth = GetShape(input_)[3];
-    int output_depth = GetShape(filter_)[3];
-    int depth_mul = output_depth / input_depth;
-
-    SetBuiltinOp(
-        BuiltinOperator_DEPTHWISE_CONV_2D,
-        BuiltinOptions_DepthwiseConv2DOptions,
-        CreateDepthwiseConv2DOptions(builder_, padding_type, 1, 1, depth_mul,
-                                     ActivationFunctionType_NONE,
-                                     dilation_factor, dilation_factor)
-            .Union());
-
-    resolver_ = absl::make_unique<SingleOpResolver>(
-        BuiltinOperator_DEPTHWISE_CONV_2D, registration);
-
-    BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
-  }
-
- protected:
-  int input_;
-  int filter_;
-  int bias_;
-  int output_;
-};
-
-class DepthwiseConvolutionOpModel : public BaseDepthwiseConvolutionOpModel {
- public:
-  using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
-
-  void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
-
-  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
-  }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-};
-
-const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
-    {"Reference", ops::builtin::Register_DEPTHWISE_CONVOLUTION_REF()},
-    {"GenericOptimized",
-     ops::builtin::Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT()},
-    {"NeonOptimized", ops::builtin::Register_DEPTHWISE_CONVOLUTION_NEON_OPT()},
-});
-
-class DepthwiseConvolutionOpTest : public SingleOpTest {
- protected:
-  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
-    return *kKernelMap;
-  }
-};
-
-TEST_P(DepthwiseConvolutionOpTest, SimpleTest) {
-  DepthwiseConvolutionOpModel m(GetRegistration(),
-                                {TensorType_FLOAT32, {1, 3, 2, 2}},
-                                {TensorType_FLOAT32, {1, 2, 2, 4}},
-                                {TensorType_FLOAT32, {}}, Padding_VALID);
-
-  m.SetInput({
-      1, 2, 7, 8,    // column 1
-      3, 4, 9, 10,   // column 2
-      5, 6, 11, 12,  // column 3
-  });
-  m.SetFilter({
-      1, 2, 3, 4,        //
-      -9, 10, -11, 12,   //
-      5, 6, 7, 8,        //
-      13, -14, 15, -16,  //
-  });
-  m.SetBias({1, 2, 3, 4});
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
-                                 71, -34, 99, -20,  //
-                                 91, -26, 127, -4,  //
-                             }));
-}
-
-TEST_P(DepthwiseConvolutionOpTest, SimpleDilatedTestPaddingValid) {
-  const int depth = 1;
-  const int image_width = 9;
-  const int image_height = 9;
-  const int image_batch_count = 1;
-  const int filter_size = 3;
-  const int filter_count = 1;
-  const int dilation_factor = 3;
-  DepthwiseConvolutionOpModel m(
-      GetRegistration(),
-      {TensorType_FLOAT32,
-       {image_batch_count, image_height, image_width, depth}},
-      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
-      {TensorType_FLOAT32, {}}, Padding_VALID, dilation_factor);
-
-  // The image matrix is:
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // clang-format off
-  m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 1, 1, 1, 0, 0, 0,
-              0, 0, 0, 1, 1, 1, 0, 0, 0,
-              0, 0, 0, 1, 1, 1, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0});
-  // clang-format on
-  // The filter matrix is:
-  // | 1 | 2 | 3 |
-  // | 4 | 5 | 6 |
-  // | 7 | 8 | 9 |
-  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
-  // No bias for this test.
-  m.SetBias({0});
-  m.Invoke();
-
-  // Since the dilation rate is 3 this will reduce the size of the output from
-  // 10x10 to 3x3 of all 5s. Specifically:
-  // | 5 | 5 | 5 |
-  // | 5 | 5 | 5 |
-  // | 5 | 5 | 5 |
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5}));
-}
-
-TEST_P(DepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
-  const int depth = 1;
-  const int image_width = 3;
-  const int image_height = 3;
-  const int image_batch_count = 1;
-  const int filter_size = 2;
-  const int filter_count = 1;
-  const int dilation_factor = 2;
-  DepthwiseConvolutionOpModel m(
-      GetRegistration(),
-      {TensorType_FLOAT32,
-       {image_batch_count, image_height, image_width, depth}},
-      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
-      {TensorType_FLOAT32, {}}, Padding_SAME, dilation_factor);
-
-  // The image matrix is:
-  // | 1 | 1 | 1 |
-  // | 1 | 1 | 1 |
-  // | 1 | 1 | 1 |
-  m.SetInput({1, 1, 1, 1, 1, 1, 1, 1, 1});
-  // The filter matrix is:
-  // | 1 | 2 |
-  // | 3 | 4 |
-  m.SetFilter({1, 2, 3, 4});
-  // No bias for this test.
-  m.SetBias({0});
-  m.Invoke();
-
-  // Output:
-  // | 4 | 7 | 3 |
-  // | 6 |10 | 4 |
-  // | 2 | 3 | 1 |
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
-}
-
-class QuantizedDepthwiseConvolutionOpModel
-    : public BaseDepthwiseConvolutionOpModel {
- public:
-  using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
-
-  void SetInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
-  }
-
-  void SetFilter(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(filter_, data);
-  }
-
-  void SetBias(std::initializer_list<float> data) {
-    QuantizeAndPopulate<int32_t>(bias_, data);
-  }
-
-  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
-  std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
-  }
-};
-
-class QuantizedDepthwiseConvolutionOpTest : public SingleOpTest {
- protected:
-  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
-    return *kKernelMap;
-  }
-};
-
-// In this test we set the input and output scales so that the results match
-// exactly the 'non-quantized' version.
-TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) {
-  QuantizedDepthwiseConvolutionOpModel m(
-      GetRegistration(), {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64},
-      {TensorType_UINT8, {1, 2, 2, 4}, -63.5, 64},
-      {TensorType_UINT8, {}, -127, 128}, Padding_VALID);
-
-  m.SetInput({
-      1, 2, 7, 8,    // column 1
-      3, 4, 9, 10,   // column 2
-      5, 6, 11, 12,  // column 3
-  });
-  m.SetFilter({
-      1, 2, 3, 4,        //
-      -9, 10, -11, 12,   //
-      5, 6, 7, 8,        //
-      13, -14, 15, -16,  //
-  });
-  m.SetBias({1, 2, 3, 4});
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                            {
-                                                71, -34, 99, -20,  //
-                                                91, -26, 127, -4,  //
-                                            },
-                                            1e-5)));
-  // For good  measure, let's also verify the quantized values:
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
-                                 198, 93, 226, 107,   //
-                                 218, 101, 254, 123,  //
-                             }));
-}
-
-TEST_P(QuantizedDepthwiseConvolutionOpTest,
-       SimpleTestQuantizedFilterMultiplierGreaterThan1) {
-  QuantizedDepthwiseConvolutionOpModel quant_op(
-      GetRegistration(), {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64},
-      {TensorType_UINT8, {1, 2, 2, 4}, -128.5, 128},
-      {TensorType_UINT8, {}, -127, 128}, Padding_VALID);
-  DepthwiseConvolutionOpModel float_op(GetRegistration(),
-                                       {TensorType_FLOAT32, {1, 3, 2, 2}},
-                                       {TensorType_FLOAT32, {1, 2, 2, 4}},
-                                       {TensorType_FLOAT32, {}}, Padding_VALID);
-
-  std::initializer_list<float> input = {
-      1, 2, 7,  8,   // column 1
-      3, 4, 9,  10,  // column 2
-      5, 6, 11, 12,  // column 3
-  };
-  std::initializer_list<float> filter = {
-      1,  2,   3,   4,    //
-      -9, 10,  -11, 12,   //
-      5,  6,   7,   8,    //
-      13, -14, 15,  -16,  //
-  };
-  std::initializer_list<float> bias = {1, 2, 3, 4};
-
-  quant_op.SetInput(input);
-  quant_op.SetFilter(filter);
-  quant_op.SetBias(bias);
-  quant_op.Invoke();
-
-  float_op.SetInput(input);
-  float_op.SetFilter(filter);
-  float_op.SetBias(bias);
-  float_op.Invoke();
-
-  EXPECT_THAT(quant_op.GetDequantizedOutput(),
-              ElementsAreArray(ArrayFloatNear(float_op.GetOutput(), 1)));
-}
-
-TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingValid) {
-  const int depth = 1;
-  const int image_width = 9;
-  const int image_height = 9;
-  const int image_batch_count = 1;
-  const int filter_size = 3;
-  const int filter_count = 1;
-  const int dilation_factor = 3;
-  QuantizedDepthwiseConvolutionOpModel m(
-      GetRegistration(),
-      {TensorType_UINT8,
-       {image_batch_count, image_height, image_width, depth},
-       0,
-       255},
-      {TensorType_UINT8,
-       {depth, filter_size, filter_size, filter_count},
-       0,
-       255},
-      {TensorType_UINT8, {}, 0, 255}, Padding_VALID, dilation_factor);
-
-  // The image matrix is:
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
-  // clang-format off
-  m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 1, 1, 1, 0, 0, 0,
-              0, 0, 0, 1, 1, 1, 0, 0, 0,
-              0, 0, 0, 1, 1, 1, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0,
-              0, 0, 0, 0, 0, 0, 0, 0, 0});
-  // clang-format on
-  // The filter matrix is:
-  // | 1 | 2 | 3 |
-  // | 4 | 5 | 6 |
-  // | 7 | 8 | 9 |
-  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
-  // No bias for this test.
-  m.SetBias({0});
-  m.Invoke();
-
-  // Since the dilation rate is 3 this will reduce the size of the output from
-  // 10x10 to 3x3 of all 5s. Specifically:
-  // | 5 | 5 | 5 |
-  // | 5 | 5 | 5 |
-  // | 5 | 5 | 5 |
-  EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5}));
-}
-
-TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
-  const int depth = 1;
-  const int image_width = 3;
-  const int image_height = 3;
-  const int image_batch_count = 1;
-  const int filter_size = 2;
-  const int filter_count = 1;
-  const int dilation_factor = 2;
-  QuantizedDepthwiseConvolutionOpModel m(
-      GetRegistration(),
-      {TensorType_UINT8,
-       {image_batch_count, image_height, image_width, depth},
-       0,
-       255},
-      {TensorType_UINT8,
-       {depth, filter_size, filter_size, filter_count},
-       0,
-       255},
-      {TensorType_UINT8, {}, 0, 255}, Padding_SAME, dilation_factor);
-
-  // The image matrix is:
-  // | 1 | 1 | 1 |
-  // | 1 | 1 | 1 |
-  // | 1 | 1 | 1 |
-  m.SetInput({1, 1, 1, 1, 1, 1, 1, 1, 1});
-  // The filter matrix is:
-  // | 1 | 2 |
-  // | 3 | 4 |
-  m.SetFilter({1, 2, 3, 4});
-  // No bias for this test.
-  m.SetBias({0});
-  m.Invoke();
-
-  // Output:
-  // | 4 | 7 | 3 |
-  // | 6 |10 | 4 |
-  // | 2 | 3 | 1 |
-  EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
-}
-
-INSTANTIATE_TEST_CASE_P(
-    DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest,
-    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
-
-INSTANTIATE_TEST_CASE_P(
-    QuantizedDepthwiseConvolutionOpTest, QuantizedDepthwiseConvolutionOpTest,
-    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/dequantize.cc b/tensorflow/contrib/lite/kernels/dequantize.cc
deleted file mode 100644
index 59bf64e0afabc44a984a9797cabbcfcde531f1f6..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/dequantize.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <string.h>
-#include <vector>
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace dequantize {
-
-struct OpContext {
-  OpContext(TfLiteContext* context, TfLiteNode* node) {
-    input = GetInput(context, node, 0);
-    output = GetOutput(context, node, 0);
-  }
-  const TfLiteTensor* input;
-  TfLiteTensor* output;
-};
-
-struct OpData {
-  // This boolean value is only used when the input tensor is constant.
-  bool float_dequantized_weights_initialized;
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* op_data = new OpData();
-  op_data->float_dequantized_weights_initialized = false;
-  return op_data;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  OpContext op_context(context, node);
-
-  TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8);
-
-  op_context.output->type = kTfLiteFloat32;
-  // If the input tensor is constant, we can persist the dequantized value in
-  // the output tensor. Otherwise we run dequantize upon each eval.
-  if (IsConstantTensor(op_context.input)) {
-    op_context.output->allocation_type = kTfLiteArenaRwPersistent;
-  }
-  return context->ResizeTensor(context, op_context.output,
-                               TfLiteIntArrayCopy(op_context.input->dims));
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-  OpContext op_context(context, node);
-  if (IsConstantTensor(op_context.input) &&
-      op_data->float_dequantized_weights_initialized) {
-    return kTfLiteOk;
-  }
-
-  tflite::DequantizationParams op_params;
-  op_params.zero_point = op_context.input->params.zero_point;
-  op_params.scale = op_context.input->params.scale;
-  optimized_ops::Dequantize(op_params, GetTensorShape(op_context.input),
-                            GetTensorData<uint8_t>(op_context.input),
-                            GetTensorShape(op_context.output),
-                            GetTensorData<float>(op_context.output));
-
-  if (IsConstantTensor(op_context.input)) {
-    op_data->float_dequantized_weights_initialized = true;
-  }
-
-  return kTfLiteOk;
-}
-
-}  // namespace dequantize
-
-TfLiteRegistration* Register_DEQUANTIZE_OPT() {
-  static TfLiteRegistration r = {dequantize::Init, dequantize::Free,
-                                 dequantize::Prepare, dequantize::Eval};
-  return &r;
-}
-
-TfLiteRegistration* Register_DEQUANTIZE() { return Register_DEQUANTIZE_OPT(); }
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/dequantize_test.cc b/tensorflow/contrib/lite/kernels/dequantize_test.cc
deleted file mode 100644
index fcd74206177a0a97db168338e3619d4b95c052a9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/dequantize_test.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class DequantizeOpModel : public SingleOpModel {
- public:
-  DequantizeOpModel(std::initializer_list<int> shape, float min, float max) {
-    input_ = AddInput({TensorType_UINT8, shape, min, max});
-    output_ = AddOutput({TensorType_FLOAT32, shape});
-    SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
-                 CreateDequantizeOptions(builder_).Union());
-
-    BuildInterpreter({GetShape(input_)});
-  }
-
-  void SetInput(std::initializer_list<uint8_t> data) {
-    PopulateTensor(input_, data);
-  }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-
- private:
-  int input_;
-  int output_;
-};
-
-TEST(SplitOpTest, FourDimensional) {
-  DequantizeOpModel m({2, 5}, -63.5, 64);
-
-  m.SetInput({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput(),
-              ElementsAreArray(ArrayFloatNear(
-                  {-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64})));
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc b/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc
deleted file mode 100644
index 1e8caebd820248e2e2bc031e08ba671b28084198..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/detection_postprocess_test.cc
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <functional>
-#include <memory>
-#include <vector>
-
-#include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-
-TfLiteRegistration* Register_DETECTION_POSTPROCESS();
-
-namespace {
-
-using ::testing::ElementsAre;
-using ::testing::ElementsAreArray;
-
-class BaseDetectionPostprocessOpModel : public SingleOpModel {
- public:
-  BaseDetectionPostprocessOpModel(const TensorData& input1,
-                            const TensorData& input2,
-                            const TensorData& input3,
-                            const TensorData& output1,
-                            const TensorData& output2,
-                            const TensorData& output3,
-                            const TensorData& output4) {
-    input1_ = AddInput(input1);
-    input2_ = AddInput(input2);
-    input3_ = AddInput(input3);
-    output1_ = AddOutput(output1);
-    output2_ = AddOutput(output2);
-    output3_ = AddOutput(output3);
-    output4_ = AddOutput(output4);
-
-    flexbuffers::Builder fbb;
-    fbb.Map([&]() {
-      fbb.Int("max_detections", 3);
-      fbb.Int("max_classes_per_detection", 1);
-      fbb.Float("nms_score_threshold", 0.0);
-      fbb.Float("nms_iou_threshold", 0.5);
-      fbb.Int("num_classes", 2);
-      fbb.Float("y_scale", 10.0);
-      fbb.Float("x_scale", 10.0);
-      fbb.Float("h_scale", 5.0);
-      fbb.Float("w_scale", 5.0);
-    });
-    fbb.Finish();
-    SetCustomOp("TFLite_Detection_PostProcess", fbb.GetBuffer(),
-                Register_DETECTION_POSTPROCESS);
-    BuildInterpreter({GetShape(input1_), GetShape(input2_), GetShape(input3_)});
-  }
-
-  int input1() { return input1_; }
-  int input2() { return input2_; }
-  int input3() { return input3_; }
-
-  template <class T>
-  void SetInput1(std::initializer_list<T> data) {
-    PopulateTensor<T>(input1_, data);
-  }
-
-  template <class T>
-  void SetInput2(std::initializer_list<T> data) {
-    PopulateTensor<T>(input2_, data);
-  }
-
-  template <class T>
-  void SetInput3(std::initializer_list<T> data) {
-    PopulateTensor<T>(input3_, data);
-  }
-
-  template <class T>
-  std::vector<T> GetOutput1() {
-    return ExtractVector<T>(output1_);
-  }
-
-  template <class T>
-  std::vector<T> GetOutput2() {
-    return ExtractVector<T>(output2_);
-  }
-
-  template <class T>
-  std::vector<T> GetOutput3() {
-    return ExtractVector<T>(output3_);
-  }
-
-  template <class T>
-  std::vector<T> GetOutput4() {
-    return ExtractVector<T>(output4_);
-  }
-
-  std::vector<int> GetOutputShape1() { return GetTensorShape(output1_); }
-  std::vector<int> GetOutputShape2() { return GetTensorShape(output2_); }
-  std::vector<int> GetOutputShape3() { return GetTensorShape(output3_); }
-  std::vector<int> GetOutputShape4() { return GetTensorShape(output4_); }
-
- protected:
-  int input1_;
-  int input2_;
-  int input3_;
-  int output1_;
-  int output2_;
-  int output3_;
-  int output4_;
-};
-
-TEST(DetectionPostprocessOpTest, FloatTest) {
-  BaseDetectionPostprocessOpModel m(
-      {TensorType_FLOAT32, {1, 6, 4}}, {TensorType_FLOAT32, {1, 6, 3}},
-      {TensorType_FLOAT32, {6, 4}}, {TensorType_FLOAT32, {}},
-      {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
-      {TensorType_FLOAT32, {}});
-
-  // six boxes in center-size encoding
-  m.SetInput1<float>({0.0, 0.0,  0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
-                      0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
-                      0.0, 1.0,  0.0, 0.0, 0.0, 0.0, 0.0, 0.0});
-  // class scores - two classes with background
-  m.SetInput2<float>({0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0.,
-                      .5, .4, 0., .3, .2});
-  // six anchors in center-size encoding
-  m.SetInput3<float>({0.5, 0.5,  1.0, 1.0, 0.5, 0.5,   1.0, 1.0,
-                      0.5, 0.5,  1.0, 1.0, 0.5, 10.5,  1.0, 1.0,
-                      0.5, 10.5, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0});
-  // Same boxes in box-corner encoding:
-  // { 0.0, 0.0, 1.0, 1.0,
-  //   0.0, 0.1, 1.0, 1.1,
-  //   0.0, -0.1, 1.0, 0.9,
-  //   0.0, 10.0, 1.0, 11.0,
-  //   0.0, 10.1, 1.0, 11.1,
-  //   0.0, 100.0, 1.0, 101.0}
-  m.Invoke();
-  // detection_boxes
-  // in center-size
-  std::vector<int> output_shape1 = m.GetOutputShape1();
-  EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4));
-  EXPECT_THAT(
-      m.GetOutput1<float>(),
-      ElementsAreArray(ArrayFloatNear(
-          {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0},
-          1e-1)));
-  // detection_classes
-  std::vector<int> output_shape2 = m.GetOutputShape2();
-  EXPECT_THAT(output_shape2, ElementsAre(1, 3));
-  EXPECT_THAT(m.GetOutput2<float>(),
-              ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1)));
-  // detection_scores
-  std::vector<int> output_shape3 = m.GetOutputShape3();
-  EXPECT_THAT(output_shape3, ElementsAre(1, 3));
-  EXPECT_THAT(m.GetOutput3<float>(),
-              ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1)));
-  // num_detections
-  std::vector<int> output_shape4 = m.GetOutputShape4();
-  EXPECT_THAT(output_shape4, ElementsAre(1));
-  EXPECT_THAT(m.GetOutput4<float>(),
-              ElementsAreArray(ArrayFloatNear({3.0}, 1e-1)));
-}
-
-TEST(DetectionPostprocessOpTest, QuantizedTest) {
-  BaseDetectionPostprocessOpModel m(
-      {TensorType_UINT8, {1, 6, 4}, -1.0, 1.0},
-      {TensorType_UINT8, {1, 6, 3}, 0.0, 1.0},
-      {TensorType_UINT8, {6, 4}, 0.0, 100.5}, {TensorType_FLOAT32, {}},
-      {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
-      {TensorType_FLOAT32, {}});
-  // six boxes in center-size encoding
-  std::vector<std::initializer_list<float>> inputs1 = {
-      {0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
-       0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,  0.0, 0.0}};
-  m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[0]);
-  // class scores - two classes with background
-  std::vector<std::initializer_list<float>> inputs2 = {
-      {0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0., .5, .4, 0., .3,
-       .2}};
-  m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[0]);
-  // six anchors in center-size encoding
-  std::vector<std::initializer_list<float>> inputs3 = {
-      {0.5, 0.5,  1.0, 1.0, 0.5, 0.5,  1.0, 1.0, 0.5, 0.5,   1.0, 1.0,
-       0.5, 10.5, 1.0, 1.0, 0.5, 10.5, 1.0, 1.0, 0.5, 100.5, 1.0, 1.0}};
-  m.QuantizeAndPopulate<uint8_t>(m.input3(), inputs3[0]);
-  m.Invoke();
-  // detection_boxes
-  // in center-size
-  std::vector<int> output_shape1 = m.GetOutputShape1();
-  EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4));
-  EXPECT_THAT(
-      m.GetOutput1<float>(),
-      ElementsAreArray(ArrayFloatNear(
-          {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0},
-          3e-1)));
-  // detection_classes
-  std::vector<int> output_shape2 = m.GetOutputShape2();
-  EXPECT_THAT(output_shape2, ElementsAre(1, 3));
-  EXPECT_THAT(m.GetOutput2<float>(),
-              ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1)));
-  // detection_scores
-  std::vector<int> output_shape3 = m.GetOutputShape3();
-  EXPECT_THAT(output_shape3, ElementsAre(1, 3));
-  EXPECT_THAT(m.GetOutput3<float>(),
-              ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1)));
-  // num_detections
-  std::vector<int> output_shape4 = m.GetOutputShape4();
-  EXPECT_THAT(output_shape4, ElementsAre(1));
-  EXPECT_THAT(m.GetOutput4<float>(),
-              ElementsAreArray(ArrayFloatNear({3.0}, 1e-1)));
-}
-}  // namespace
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
deleted file mode 100644
index f6d2f76dbebbdc51fe160aae361e216c3c87116e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ /dev/null
@@ -1,502 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/gemm_support.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace fully_connected {
-
-// This file has four implementations of FullyConnected
-enum KernelType {
-  kReference,
-  kGenericOptimized,  // Neon-free
-  kNeonOptimized,
-  kPie,  // Used by the PIE team
-};
-
-struct OpData {
-  // The scaling factor from input to output (aka the 'real multiplier') can
-  // be represented as a fixed point multiplier plus a left shift.
-  int32_t output_multiplier;
-  int output_shift;
-  // The range of the fused activation layer. For example for kNone and
-  // uint8_t these would be 0 and 255.
-  int32_t output_activation_min;
-  int32_t output_activation_max;
-  // The index of the temporary tensor where the quantized inputs are cached.
-  int input_quantized_index;
-};
-
-constexpr int kInputTensor = 0;
-constexpr int kWeightsTensor = 1;
-constexpr int kBiasTensor = 2;
-constexpr int kOutputTensor = 0;
-constexpr int kShuffledInputWorkspaceTensor = 1;
-constexpr int kScratchBufferTensor = 1;
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  // This is a builtin op, so we don't use the contents in 'buffer', if any.
-  // Instead, we allocate a new object to carry information from Prepare() to
-  // Eval().
-  gemm_support::IncrementUsageCounter(context);
-  auto* op_data = new OpData();
-  context->AddTensors(context, 1, &op_data->input_quantized_index);
-  return op_data;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  gemm_support::DecrementUsageCounter(context);
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
-  OpData* data = reinterpret_cast<OpData*>(node->user_data);
-
-  // Check we have all the inputs and outputs we need.
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 3);
-  // Shuffled formats need a workspace to store the shuffled input activations.
-  const int expected_outputs_count =
-      params->weights_format == kTfLiteFullyConnectedWeightsFormatDefault ? 1
-                                                                          : 2;
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, expected_outputs_count);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  // Check all the parameters of tensor match within themselves and match the
-  // input configuration.
-  int input_size = 1;
-  for (int i = 0; i < input->dims->size; i++) {
-    input_size *= input->dims->data[i];
-  }
-
-  TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 2);
-  const int batch_size = input_size / filter->dims->data[1];
-  const int num_units = filter->dims->data[0];
-
-  TF_LITE_ENSURE_EQ(context, input_size, batch_size * filter->dims->data[1]);
-  if (bias) {
-    TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
-  }
-
-  // Note that quantized inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
-  TfLiteType data_type = input->type;
-  if (data_type != kTfLiteFloat32) {
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
-        context, params->activation, output, &data->output_activation_min,
-        &data->output_activation_max));
-  }
-
-  // If we have to perform on-the-fly quantization (with quantized weights and
-  // float inputs) first we need to quantize the inputs. Allocate a temporary
-  // buffer to store the intermediate quantized values.
-  if (input->type == kTfLiteFloat32 && filter->type == kTfLiteUInt8) {
-    TfLiteIntArrayFree(node->temporaries);
-    node->temporaries = TfLiteIntArrayCreate(1);
-    node->temporaries->data[0] = data->input_quantized_index;
-
-    TfLiteTensor* input_quantized =
-        &context->tensors[node->temporaries->data[0]];
-    input_quantized->type = kTfLiteUInt8;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-
-    // TODO(raziel): add this logic to ResizeTensor.
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-  }
-
-  // Resize output.
-  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
-  output_size_array->data[0] = batch_size;
-  output_size_array->data[1] = num_units;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, output, output_size_array));
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalPie(TfLiteContext* context, TfLiteNode* node,
-                     TfLiteFullyConnectedParams* params, OpData* data,
-                     const TfLiteTensor* input, const TfLiteTensor* filter,
-                     const TfLiteTensor* bias, TfLiteTensor* output) {
-  int total_input_size = 1;
-  for (int i = 0; i < input->dims->size; i++) {
-    total_input_size *= input->dims->data[i];
-  }
-
-  int input_size = filter->dims->data[1];
-  const int batch_size = total_input_size / filter->dims->data[1];
-  const int num_units = filter->dims->data[0];
-
-  // Output = bias if bias tensor exists.
-  if (bias) {
-    tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size,
-                                          output->data.f);
-  } else {
-    tensor_utils::ZeroVector(output->data.f, batch_size * num_units);
-  }
-
-  // Compute output += weight * input
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      filter->data.f, num_units, input_size, input->data.f, batch_size,
-      output->data.f, /*result_stride=*/1);
-
-  // Apply activation function
-  tensor_utils::ApplyActivationToVector(output->data.f, batch_size * num_units,
-                                        params->activation, output->data.f);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalPieQuantized(TfLiteContext* context, TfLiteNode* node,
-                              TfLiteFullyConnectedParams* params, OpData* data,
-                              const TfLiteTensor* input,
-                              const TfLiteTensor* filter,
-                              const TfLiteTensor* bias,
-                              TfLiteTensor* input_quantized,
-                              TfLiteTensor* output) {
-  // Check the types for this hybrid Op.
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, filter->type, kTfLiteUInt8);
-  TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
-
-  int total_input_size = 1;
-  for (int i = 0; i < input->dims->size; i++) {
-    total_input_size *= input->dims->data[i];
-  }
-
-  const int input_size = filter->dims->data[1];
-  const int batch_size = total_input_size / filter->dims->data[1];
-  const int num_units = filter->dims->data[0];
-
-  // Output = bias if bias tensor exists.
-  if (bias) {
-    tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size,
-                                          output->data.f);
-  } else {
-    tensor_utils::ZeroVector(output->data.f, batch_size * num_units);
-  }
-
-  // Save matrix multiplication computation for all zero input.
-  if (tensor_utils::IsZeroVector(input->data.f, total_input_size)) {
-    tensor_utils::ApplyActivationToVector(output->data.f,
-                                          batch_size * num_units,
-                                          params->activation, output->data.f);
-    return kTfLiteOk;
-  }
-
-  // Quantize input from float to uint8 + quantization params (scaling factor).
-  float min, max;
-  float* scaling_factors = new float[batch_size];
-
-  // Quantize each batch independently.
-  for (int b = 0; b < batch_size; ++b) {
-    const int offset = b * input_size;
-    tensor_utils::SymmetricQuantizeFloats(
-        input->data.f + offset, input_size,
-        reinterpret_cast<int8_t*>(input_quantized->data.uint8) + offset, &min,
-        &max, &scaling_factors[b]);
-    // Incorporate scaling of the filter.
-    scaling_factors[b] *= filter->params.scale;
-  }
-
-  // Compute output += weight * quantized_input
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      reinterpret_cast<int8_t*>(filter->data.uint8), num_units, input_size,
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8), scaling_factors,
-      batch_size, output->data.f, /*result_stride=*/1);
-
-  // Apply activation function to floats.
-  tensor_utils::ApplyActivationToVector(output->data.f, batch_size * num_units,
-                                        params->activation, output->data.f);
-  delete[] scaling_factors;
-
-  return kTfLiteOk;
-}
-
-#define TF_LITE_MACRO_DISPATCH(macro_name, params, target_namespace) \
-  if (params->activation == kTfLiteActNone) {                        \
-    macro_name(target_namespace, kNone);                             \
-  }                                                                  \
-  if (params->activation == kTfLiteActRelu) {                        \
-    macro_name(target_namespace, kRelu);                             \
-  }                                                                  \
-  if (params->activation == kTfLiteActRelu6) {                       \
-    macro_name(target_namespace, kRelu6);                            \
-  }
-
-template <KernelType kernel_type>
-TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                           TfLiteFullyConnectedParams* params, OpData* data,
-                           const TfLiteTensor* input,
-                           const TfLiteTensor* filter, const TfLiteTensor* bias,
-                           TfLiteTensor* output) {
-  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
-
-  int32_t input_offset = -input->params.zero_point;
-  int32_t filter_offset = -filter->params.zero_point;
-  int32_t output_offset = output->params.zero_point;
-#define TF_LITE_FULLY_CONNECTED(type, output_data_type)                  \
-  {                                                                      \
-    FullyConnectedParams op_params;                                      \
-    op_params.input_offset = input_offset;                               \
-    op_params.weights_offset = filter_offset;                            \
-    op_params.output_offset = output_offset;                             \
-    op_params.output_multiplier = data->output_multiplier;               \
-    op_params.output_shift = -data->output_shift;                        \
-    op_params.quantized_activation_min = data->output_activation_min;    \
-    op_params.quantized_activation_max = data->output_activation_max;    \
-    type::FullyConnected(                                                \
-        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
-        GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
-        GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
-        GetTensorShape(output), GetTensorData<output_data_type>(output), \
-        gemm_context);                                                   \
-  }
-  if (kernel_type == kReference) {
-    switch (output->type) {
-      case kTfLiteUInt8:
-        TF_LITE_FULLY_CONNECTED(reference_ops, uint8_t);
-        break;
-      case kTfLiteInt16:
-        TF_LITE_FULLY_CONNECTED(reference_ops, int16_t);
-        break;
-      default:
-        context->ReportError(
-            context,
-            "Quantized FullyConnected expects output data type uint8 or int16");
-        return kTfLiteError;
-    }
-  } else if (kernel_type == kPie && input->type == kTfLiteFloat32) {
-    // Pie currently only supports quantized models and float inputs/outputs.
-    TfLiteTensor* input_quantized =
-        &context->tensors[node->temporaries->data[0]];
-    return EvalPieQuantized(context, node, params, data, input, filter, bias,
-                            input_quantized, output);
-  } else {
-    switch (output->type) {
-      case kTfLiteUInt8:
-        TF_LITE_FULLY_CONNECTED(optimized_ops, uint8_t);
-        break;
-      case kTfLiteInt16:
-        TF_LITE_FULLY_CONNECTED(optimized_ops, int16_t);
-        break;
-      default:
-        context->ReportError(
-            context,
-            "Quantized FullyConnected expects output data type uint8 or int16");
-        return kTfLiteError;
-    }
-  }
-#undef TF_LITE_FULLY_CONNECTED
-
-  return kTfLiteOk;
-}
-
-template <KernelType kernel_type>
-TfLiteStatus EvalShuffledQuantized(TfLiteContext* context, TfLiteNode* node,
-                                   TfLiteFullyConnectedParams* params,
-                                   OpData* data, const TfLiteTensor* input,
-                                   const TfLiteTensor* filter,
-                                   const TfLiteTensor* bias,
-                                   TfLiteTensor* output,
-                                   TfLiteTensor* shuffled_input_workspace) {
-  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
-
-  // TODO(b/110697972) decide more consistently if / how / where we want
-  // to perform this kind of runtime data type checks.
-  if (input->type != kTfLiteUInt8 || filter->type != kTfLiteUInt8 ||
-      bias->type != kTfLiteInt32 || output->type != kTfLiteInt16 ||
-      shuffled_input_workspace->type != kTfLiteUInt8) {
-    context->ReportError(context, "Unexpected data type");
-    return kTfLiteError;
-  }
-
-#define TF_LITE_SHUFFLED_FULLY_CONNECTED(type)                           \
-  {                                                                      \
-    FullyConnectedParams op_params;                                      \
-    op_params.output_multiplier = data->output_multiplier;               \
-    op_params.output_shift = -data->output_shift;                        \
-    op_params.quantized_activation_min = data->output_activation_min;    \
-    op_params.quantized_activation_max = data->output_activation_max;    \
-    type::ShuffledFullyConnected(                                        \
-        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
-        GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
-        GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
-        GetTensorShape(output), GetTensorData<int16_t>(output),          \
-        GetTensorData<uint8_t>(shuffled_input_workspace), gemm_context); \
-  }
-  if (kernel_type == kReference) {
-    TF_LITE_SHUFFLED_FULLY_CONNECTED(reference_ops);
-  } else {
-    TF_LITE_SHUFFLED_FULLY_CONNECTED(optimized_ops);
-  }
-#undef TF_LITE_SHUFFLED_FULLY_CONNECTED
-
-  return kTfLiteOk;
-}
-
-template <KernelType kernel_type>
-TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
-                       TfLiteFullyConnectedParams* params, OpData* data,
-                       const TfLiteTensor* input, const TfLiteTensor* filter,
-                       const TfLiteTensor* bias, TfLiteTensor* output) {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRange(params->activation, &output_activation_min,
-                           &output_activation_max);
-#define TF_LITE_FULLY_CONNECTED(type)                                         \
-  {                                                                           \
-    FullyConnectedParams op_params;                                           \
-    op_params.float_activation_min = output_activation_min;                   \
-    op_params.float_activation_max = output_activation_max;                   \
-    type::FullyConnected(op_params, GetTensorShape(input),                    \
-                         GetTensorData<float>(input), GetTensorShape(filter), \
-                         GetTensorData<float>(filter), GetTensorShape(bias),  \
-                         GetTensorData<float>(bias), GetTensorShape(output),  \
-                         GetTensorData<float>(output));                       \
-  }
-  if (kernel_type == kReference) {
-    TF_LITE_FULLY_CONNECTED(reference_ops);
-  } else if (kernel_type == kPie) {
-    return EvalPie(context, node, params, data, input, filter, bias, output);
-  } else {
-    TF_LITE_FULLY_CONNECTED(optimized_ops);
-  }
-#undef TF_LITE_FULLY_CONNECTED
-
-  return kTfLiteOk;
-}
-
-#undef TF_LITE_MACRO_DISPATCH
-
-template <KernelType kernel_type>
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params =
-      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
-  OpData* data = reinterpret_cast<OpData*>(node->user_data);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  switch (filter->type) {  // Already know in/out types are same.
-    case kTfLiteFloat32:
-      return EvalFloat<kernel_type>(context, node, params, data, input, filter,
-                                    bias, output);
-    case kTfLiteUInt8:
-      if (params->weights_format ==
-          kTfLiteFullyConnectedWeightsFormatShuffled4x16Int8) {
-        TfLiteTensor* shuffled_input_workspace =
-            GetOutput(context, node, kShuffledInputWorkspaceTensor);
-        return EvalShuffledQuantized<kernel_type>(context, node, params, data,
-                                                  input, filter, bias, output,
-                                                  shuffled_input_workspace);
-      } else if (params->weights_format ==
-                 kTfLiteFullyConnectedWeightsFormatDefault) {
-        return EvalQuantized<kernel_type>(context, node, params, data, input,
-                                          filter, bias, output);
-      } else {
-        context->ReportError(context,
-                             "Unhandled fully-connected weights format");
-        return kTfLiteError;
-      }
-    default:
-      context->ReportError(context, "Type %d not currently supported.",
-                           filter->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace fully_connected
-
-TfLiteRegistration* Register_FULLY_CONNECTED_REF() {
-  static TfLiteRegistration r = {
-      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
-      fully_connected::Eval<fully_connected::kReference>};
-  return &r;
-}
-
-TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT() {
-  static TfLiteRegistration r = {
-      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
-      fully_connected::Eval<fully_connected::kNeonOptimized>};
-  return &r;
-}
-
-TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT() {
-  static TfLiteRegistration r = {
-      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
-      fully_connected::Eval<fully_connected::kGenericOptimized>};
-  return &r;
-}
-
-TfLiteRegistration* Register_FULLY_CONNECTED_PIE() {
-  static TfLiteRegistration r = {fully_connected::Init, fully_connected::Free,
-                                 fully_connected::Prepare,
-                                 fully_connected::Eval<fully_connected::kPie>};
-  return &r;
-}
-
-TfLiteRegistration* Register_FULLY_CONNECTED() {
-  // TODO(ahentz): We don't have a dedicated quantized version of the PIE
-  // kernel. For now, the quantized version just defer to the corresponding
-  // optimized MINI kernel. At some point we will allow different libraries to
-  // be built with different kernels, but for now we have to pick one here.
-  return Register_FULLY_CONNECTED_PIE();
-#ifdef USE_NEON
-  return Register_FULLY_CONNECTED_NEON_OPT();
-#else
-  return Register_FULLY_CONNECTED_GENERIC_OPT();
-#endif
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
deleted file mode 100644
index 08b43209466a1b85613ae41d5aa776194f992c60..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc
+++ /dev/null
@@ -1,766 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for TFLite FULLY_CONNECTED op.
-
-#include <iomanip>
-#include <random>
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "absl/memory/memory.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-
-namespace ops {
-namespace builtin {
-
-TfLiteRegistration* Register_FULLY_CONNECTED_REF();
-TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT();
-TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT();
-TfLiteRegistration* Register_FULLY_CONNECTED_PIE();
-
-}  // namespace builtin
-}  // namespace ops
-
-namespace {
-
-using ::testing::ElementsAre;
-using ::testing::ElementsAreArray;
-
-static float fully_connected_input[] = {
-    0.503691, 0.196961, 0.521017, 0.554248, 0.288678, 0.792476, 0.561653,
-    0.462230, 0.650736, 0.163132, 0.029658, 0.411544, 0.470539, 0.572390,
-    0.538755, 0.212030, 0.264309, 0.193908, 0.777480, 0.745661, 0.423314,
-    0.470804, 0.175501, 0.492225, 0.192743, 0.540183, 0.372514, 0.446550,
-    0.498173, 0.126472, 0.132706, 0.001864, 0.323433, 0.653723, 0.556112,
-    0.612111, 0.446199, 0.117765, 0.074341, 0.096935, 0.280897, 0.103999,
-    0.508479, 0.751437, 0.676389, 0.047234, 0.963467, 0.940698, 0.241142,
-    0.740947, 0.686359, 0.664456, 0.211751, 0.861860, 0.156681, 0.404494,
-    0.402043, 0.529195, 0.851044, 0.900216, 0.655667, 0.983750, 0.902081,
-    0.979100, 0.637473, 0.458193, 0.591211, 0.083671, 0.575958, 0.665552,
-    0.180606, 0.856856, 0.769551, 0.689086, 0.608293, 0.445940, 0.736320,
-    0.571760, 0.386637, 0.977461, 0.312707, 0.072996, 0.641918, 0.524458,
-    0.934856, 0.798598, 0.928951, 0.336899, 0.327793, 0.779995, 0.237115,
-    0.983460, 0.763746, 0.139196, 0.962560, 0.401218, 0.597389, 0.553771,
-    0.484890, 0.173347, 0.219322, 0.665496, 0.030203, 0.988873, 0.354582,
-    0.638496, 0.434813, 0.090902, 0.210256, 0.821450, 0.068363, 0.522962,
-    0.894446, 0.710280, 0.047420, 0.829302, 0.508879, 0.976371, 0.166202,
-    0.836672, 0.756367, 0.403317, 0.820132, 0.520112, 0.542513, 0.782691,
-    0.921330, 0.139902};
-
-static float fully_connected_golden_output[] = {
-    0,        0.0732134,   0,        0,          0,         0.280859,
-    0,        0.128927,    0,        0.0777251,  0,         0.270268,
-    0.271435, 0.0173503,   0.335465, 0.235562,
-
-    0,        0.0745866,   0,        0.051611,   0,         0.253876,
-    0,        0.0814873,   0,        0.104104,   0,         0.248529,
-    0.264194, 0,           0.302973, 0.166252,
-
-    0,        0.0170409,   0,        0.0509851,  0,         0.212834,
-    0,        0.0208326,   0,        0.129932,   0.203978,  0.103428,
-    0.298051, 0,           0.332233, 0.00445903,
-
-    0,        0.125246,    0,        0.0735336,  0,         0.0910256,
-    0,        0,           0,        0.18933,    0.378111,  0.0712443,
-    0.277298, 0.0123414,   0.267454, 0,
-
-    0,        0.14687,     0,        0.155495,   0.0300215, 0.147256,
-    0,        0,           0,        0.156412,   0.434914,  0.0461529,
-    0.246508, 0,           0.363138, 0,
-
-    0,        0,           0,        0.0212949,  0,         0.301708,
-    0,        0.35497,     0,        0.406223,   0.0260211, 0.049195,
-    0.197161, 0,           0.37316,  0,
-
-    0,        0.221783,    0,        0,          0.0116515, 0.281945,
-    0,        0,           0,        0,          0.285626,  0.181773,
-    0.296401, 0.170452,    0.367135, 0.142597,
-
-    0,        0,           0,        0,          0,         0.418886,
-    0,        0.291063,    0,        0.227541,   0.0424759, 0.27589,
-    0.398286, 0.177146,    0.40359,  0.121452,
-
-    0,        0.0834884,   0,        0,          0,         0.287441,
-    0,        0.0046838,   0,        0.0122087,  0,         0.217376,
-    0.140183, 0.0948412,   0.436677, 0.0589876,
-
-    0,        0.0289969,   0,        0.0921397,  0,         0.396802,
-    0,        0.0126157,   0,        0.0968433,  0,         0.172271,
-    0.173295, 0.0664741,   0.53645,  0.00915603,
-
-    0,        0,           0,        0,          0,         0.147942,
-    0,        0.263795,    0,        0.39782,    0,         0.382435,
-    0.561072, 0.0579847,   0.145712, 0.13508,
-
-    0,        0,           0,        0.16382,    0,         0.322294,
-    0,        0.163798,    0,        0.405211,   0.367953,  0.076852,
-    0.342473, 0.0834118,   0.377537, 0,
-
-    0,        0.206,       0,        0,          0,         0.375769,
-    0,        0,           0,        0,          0,         0.125165,
-    0,        0.105591,    0.52055,  0.0536445,
-
-    0,        0.259261,    0,        0,          0,         0.247707,
-    0,        0,           0,        0,          0,         0.215862,
-    0.149153, 0.224678,    0.359519, 0.129419,
-
-    0,        0.17611,     0,        0.280895,   0,         0.576484,
-    0,        0.000418848, 0,        0,          0,         0.151112,
-    0.211902, 0,           0.566341, 0.106305,
-
-    0,        0.0246284,   0,        0,          0,         0.196267,
-    0,        0.0248624,   0,        0.265635,   0,         0.436199,
-    0.408079, 0.134514,    0.328489, 0.411368};
-
-class BaseFullyConnectedOpModel : public SingleOpModel {
- public:
-  // TODO(ahentz): test different activation types too.
-  BaseFullyConnectedOpModel(
-      TfLiteRegistration* registration, int units, int batches,
-      const TensorData& input, const TensorData& output = {TensorType_FLOAT32},
-      ActivationFunctionType activation_func = ActivationFunctionType_RELU,
-      FullyConnectedOptionsWeightsFormat weights_format =
-          FullyConnectedOptionsWeightsFormat_DEFAULT)
-      : batches_(batches), units_(units) {
-    int total_input_size = 1;
-    for (int i = 0; i < input.shape.size(); ++i) {
-      total_input_size *= input.shape[i];
-    }
-    input_size_ = total_input_size / batches_;
-
-    input_ = AddInput(input);
-    weights_ =
-        AddInput({input.type, {units_, input_size_}, input.min, input.max});
-
-    if (input.type == TensorType_FLOAT32) {
-      bias_ = AddInput({TensorType_FLOAT32, {units_}});
-    } else {
-      // This is a quantized version. The scale of 'bias' depends on the scales
-      // of input and filter. Supposedly this is correctly set during quantized
-      // training.
-      auto bias_scale = GetScale(input_) * GetScale(weights_);
-      TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale};
-      bias_ = AddInput(bias);
-    }
-
-    output_ = AddOutput(output);
-    if (weights_format != FullyConnectedOptionsWeightsFormat_DEFAULT) {
-      AddOutput({TensorType_UINT8, input.shape});
-    }
-
-    SetBuiltinOp(
-        BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
-        CreateFullyConnectedOptions(builder_, activation_func, weights_format)
-            .Union());
-    resolver_ = absl::make_unique<SingleOpResolver>(
-        BuiltinOperator_FULLY_CONNECTED, registration);
-    BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
-  }
-
-  int input_size() { return input_size_; }
-  int num_units() { return units_; }
-  int num_batches() { return batches_; }
-
- protected:
-  int input_;
-  int weights_;
-  int bias_;
-  int output_;
-
-  int batches_;
-  int units_;
-  int input_size_;
-};
-
-class FloatFullyConnectedOpModel : public BaseFullyConnectedOpModel {
- public:
-  using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel;
-
-  void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); }
-
-  void SetWeights(const std::vector<float>& f) { PopulateTensor(weights_, f); }
-
-  void SetInput(const std::vector<float>& data) {
-    PopulateTensor(input_, data);
-  }
-  void SetInput(int offset, float* begin, float* end) {
-    PopulateTensor(input_, offset, begin, end);
-  }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-};
-
-class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
- public:
-  using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel;
-
-  void SetBias(const std::vector<float>& data) {
-    QuantizeAndPopulate<int32_t>(bias_, data);
-  }
-  void SetWeights(const std::vector<float>& data) {
-    QuantizeAndPopulate<uint8_t>(weights_, data);
-  }
-  void ShuffleAndSetWeights(const std::vector<float>& data, int input_depth,
-                            int output_depth) {
-    std::vector<float> shuffled_data(data.size());
-    CHECK_EQ(input_depth % 16, 0);
-    CHECK_EQ(output_depth % 4, 0);
-    float* shuffled_data_ptr = shuffled_data.data();
-    for (int block_o = 0; block_o < output_depth; block_o += 4) {
-      for (int block_i = 0; block_i < input_depth; block_i += 16) {
-        for (int o = 0; o < 4; o++) {
-          for (int i = 0; i < 16; i++) {
-            *shuffled_data_ptr++ =
-                data[(block_o + o) * input_depth + block_i + i];
-          }
-        }
-      }
-    }
-    TfLiteTensor* t = interpreter_->tensor(weights_);
-    auto quantized_data =
-        Quantize<uint8_t>(shuffled_data, t->params.scale, t->params.zero_point);
-    for (uint8_t& q : quantized_data) {
-      q ^= 0x80;
-    }
-    PopulateTensor(weights_, 0, quantized_data.data(),
-                   quantized_data.data() + quantized_data.size());
-  }
-  void SetInput(const std::vector<float>& data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
-  }
-
-  template <typename T>
-  std::vector<T> GetOutput() {
-    return ExtractVector<T>(output_);
-  }
-
-  template <typename T>
-  std::vector<float> GetDequantizedOutput() {
-    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
-                         GetZeroPoint(output_));
-  }
-};
-
-// In the hybrid model the weights are quantized (to uint8). But the bias,
-// input (and output) are expected to be in float precision.
-class HybridFullyConnectedOpModel : public SingleOpModel {
- public:
-  HybridFullyConnectedOpModel(int units, int batches, const TensorData& input,
-                              const TensorData& weights,
-                              const TensorData& output = {TensorType_FLOAT32})
-      : batches_(batches), units_(units) {
-    int total_input_size = 1;
-    for (int i = 0; i < input.shape.size(); ++i) {
-      total_input_size *= input.shape[i];
-    }
-    input_size_ = total_input_size / batches_;
-
-    input_ = AddInput(input);
-    weights_ = AddInput(weights);
-
-    TensorData bias{TensorType_FLOAT32, {units_}};
-    bias_ = AddInput(bias);
-
-    output_ = AddOutput(output);
-
-    SetBuiltinOp(
-        BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
-        CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
-            .Union());
-    resolver_ = absl::make_unique<SingleOpResolver>(
-        BuiltinOperator_FULLY_CONNECTED,
-        ops::builtin::Register_FULLY_CONNECTED_PIE());
-    BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
-  }
-  void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); }
-  void SetWeights(const std::vector<float>& data) {
-    SymmetricQuantizeAndPopulate(weights_, data);
-  }
-
-  void SetInput(const std::vector<float>& f) { PopulateTensor(input_, f); }
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-
-  int input_size() { return input_size_; }
-  int num_units() { return units_; }
-  int num_batches() { return batches_; }
-
- protected:
-  int input_;
-  int weights_;
-  int bias_;
-  int output_;
-
-  int batches_;
-  int units_;
-  int input_size_;
-};
-
-const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
-    {"Reference", ops::builtin::Register_FULLY_CONNECTED_REF()},
-    {"NeonOptimized", ops::builtin::Register_FULLY_CONNECTED_NEON_OPT()},
-    {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
-    {"Pie", ops::builtin::Register_FULLY_CONNECTED_PIE()},
-});
-
-class FloatFullyConnectedOpTest : public SingleOpTest {
- protected:
-  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
-    return *kKernelMap;
-  }
-};
-
-const auto kKernelMapNoPie = new std::map<string, TfLiteRegistration*>({
-    {"Reference", ops::builtin::Register_FULLY_CONNECTED_REF()},
-    {"NeonOptimized", ops::builtin::Register_FULLY_CONNECTED_NEON_OPT()},
-    {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
-});
-
-class QuantizedFullyConnectedOpTest : public SingleOpTest {
- protected:
-  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
-    return *kKernelMapNoPie;
-  }
-};
-
-const auto kKernelMapPie = new std::map<string, TfLiteRegistration*>({
-    {"Pie", ops::builtin::Register_FULLY_CONNECTED_PIE()},
-});
-
-// Hybrid mode is used by the Pie quantized kernel.
-class HybridFullyConnectedOpTest : public SingleOpTest {
- protected:
-  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
-    return *kKernelMapPie;
-  }
-};
-
-// TODO(ahentz): add more small tests like this one, focused on making sure the
-// calculations are correct.
-TEST_P(FloatFullyConnectedOpTest, SimpleTest) {
-  FloatFullyConnectedOpModel m(GetRegistration(), /*units=*/3, /*batches=*/2,
-                               /*input=*/{TensorType_FLOAT32, {2, 10}});
-  m.SetWeights({
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-  });
-  m.SetBias({1, 2, 3});
-
-  m.SetInput({
-      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
-      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
-  });
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
-}
-
-TEST_P(FloatFullyConnectedOpTest, SimpleTest2) {
-  FloatFullyConnectedOpModel m(GetRegistration(), /*units=*/1, /*batches=*/2,
-                               /*input=*/{TensorType_FLOAT32, {2, 2}});
-  m.SetWeights({
-      2, 4,  // u = 0
-  });
-  m.SetBias({1});
-
-  m.SetInput({
-      1, 2,  // b = 0
-      2, 1,  // b = 1
-  });
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(), ElementsAre(11, 9));
-}
-
-TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) {
-  QuantizedFullyConnectedOpModel m(
-      GetRegistration(), /*units=*/3, /*batches*/ 2,
-      /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
-      /*output=*/{TensorType_UINT8, {}, -127, 128});
-
-  // input_product_scale < output_scale was not true.
-  m.SetWeights({
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
-  });
-  m.SetBias({1, 2, 3});
-
-  m.SetInput({
-      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
-      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
-  });
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
-              ElementsAreArray(ArrayFloatNear({
-                  24, 25, 26,  //
-                  58, 59, 60,  //
-              })));
-  EXPECT_THAT(m.GetOutput<uint8_t>(),
-              ElementsAre(151, 152, 153, 185, 186, 187));
-}
-
-TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTestQuantizedOutputMultiplierGreaterThan1) {
-  // real_multiplier = 2.
-  QuantizedFullyConnectedOpModel m(
-      GetRegistration(), /*units=*/3, /*batches*/ 2,
-      /*input=*/{TensorType_UINT8, {2, 10}, -127, 128},
-      /*output=*/{TensorType_UINT8, {}, -63.5, 64});
-
-  m.SetWeights({
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
-  });
-  m.SetBias({1, 2, 3});
-
-  m.SetInput({
-      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
-      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
-  });
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
-              ElementsAreArray(ArrayFloatNear({
-                  24, 25, 26,  // first batch
-                  58, 59, 60,  // second batch
-              })));
-  EXPECT_THAT(m.GetOutput<uint8_t>(),
-              ElementsAre(175, 177, 179, 243, 245, 247));
-}
-
-void SimpleTestQuantizedInt16OutputCase(
-    TfLiteRegistration* registration, int input_depth, int output_depth,
-    int batches, FullyConnectedOptionsWeightsFormat weights_format) {
-  const uint8_t kWeightsZeroPoint = 128;
-  const float kWeightsScale = 1.f / 128.f;
-  const uint8_t kInputZeroPoint = 128;
-  const float kInputScale = 1.f / 128.f;
-  const float kInputMin = (0 - kInputZeroPoint) * kInputScale;
-  const float kInputMax = (255 - kInputZeroPoint) * kInputScale;
-  // Output ranges in [-8..8] encoded as int16
-  const float kOutputScale = 8.f / 32768.f;
-  const float kOutputMin = -32768 * kOutputScale;
-  const float kOutputMax = 32767 * kOutputScale;
-
-  QuantizedFullyConnectedOpModel m(
-      registration, output_depth, batches,
-      /*input=*/
-      {TensorType_UINT8, {batches, input_depth}, kInputMin, kInputMax},
-      /*output=*/{TensorType_INT16, {}, kOutputMin, kOutputMax},
-      /*activation_func=*/ActivationFunctionType_NONE, weights_format);
-
-  std::mt19937 random_engine;
-  std::uniform_int_distribution<uint8_t> weights_dist;
-
-  std::vector<float> weights_data(input_depth * output_depth);
-  for (auto& w : weights_data) {
-    uint8_t q = weights_dist(random_engine);
-    w = (q - kWeightsZeroPoint) * kWeightsScale;
-  }
-
-  // Based on weights_format, enforce any shape requirement for that format/path
-  // and set the (possibly shuffled) weights.
-  switch (weights_format) {
-    case FullyConnectedOptionsWeightsFormat_DEFAULT:
-      m.SetWeights(weights_data);
-      break;
-    case FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8:
-      // The shuffled path currently supports only a restrictive subset of
-      // shapes, described by the following assertions:
-      CHECK_EQ(input_depth % 16, 0);
-      CHECK_EQ(output_depth % 4, 0);
-      CHECK(batches == 1 || batches == 4);
-      m.ShuffleAndSetWeights(weights_data, input_depth, output_depth);
-      break;
-    default:
-      LOG(FATAL) << "Unhandled weights format";
-  }
-
-  std::uniform_int_distribution<uint8_t> input_dist;
-  std::vector<float> input_data(input_depth * batches);
-  for (auto& i : input_data) {
-    uint8_t q = input_dist(random_engine);
-    i = (q - kInputZeroPoint) * kInputScale;
-  }
-
-  std::vector<float> bias_data(output_depth);
-  // As the output ranges in [-8, 8], it's reasonable to have bias values
-  // in [-1, 1], this won't result in too much saturation.
-  std::uniform_real_distribution<float> bias_dist(-1.f, 1.f);
-  for (auto& b : bias_data) {
-    b = bias_dist(random_engine);
-  }
-
-  m.SetBias(bias_data);
-  m.SetInput(input_data);
-
-  m.Invoke();
-
-  std::vector<float> expected_output_data(output_depth * batches);
-  for (int b = 0; b < batches; b++) {
-    for (int o = 0; o < output_depth; o++) {
-      float accum = bias_data[o];
-      for (int i = 0; i < input_depth; i++) {
-        accum +=
-            input_data[b * input_depth + i] * weights_data[o * input_depth + i];
-      }
-      accum = std::min(accum, kOutputMax);
-      accum = std::max(accum, kOutputMin);
-      expected_output_data[b * output_depth + o] = accum;
-    }
-  }
-
-  EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
-              ElementsAreArray(ArrayFloatNear(expected_output_data, 3e-4f)));
-}
-
-TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTestQuantizedInt16OutputDefaultWeights) {
-  for (int input_depth : {1, 3, 10, 100}) {
-    for (int output_depth : {1, 3, 10, 100}) {
-      for (int batch : {1, 3, 10, 100}) {
-        SimpleTestQuantizedInt16OutputCase(
-            GetRegistration(), input_depth, output_depth, batch,
-            FullyConnectedOptionsWeightsFormat_DEFAULT);
-      }
-    }
-  }
-}
-
-TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTestQuantizedInt16OutputShuffled4x16Int8Weights) {
-  // The shuffled weights block shape is 4x16. The shape of the weights matrix
-  // is: rows = output_depth, cols = input_depth. It must be a multiple of 4x16.
-  // This means that output_depth must be a multiple of 4, and input_deth must
-  // be a multiple of 16.
-  for (int input_depth_numblocks : {1, 3}) {
-    for (int output_depth_numblocks : {1, 3}) {
-      int input_depth = 16 * input_depth_numblocks;
-      int output_depth = 4 * output_depth_numblocks;
-      // The fast shuffled path is currently supporting only batch sizes of 1
-      // and 4. The idea is that the whole point of that path is to go as fast
-      // as possible for small batch size, which requires fully specializing
-      // it for each batch size, and for larger batch sizes the generic
-      // gemmlowp-based implementation is fast enough.
-      for (int batch : {1, 4}) {
-        SimpleTestQuantizedInt16OutputCase(
-            GetRegistration(), input_depth, output_depth, batch,
-            FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8);
-      }
-    }
-  }
-}
-
-TEST(HybridFullyConnectedOpTest, SimpleTestQuantized) {
-  HybridFullyConnectedOpModel m(
-      /*units=*/3, /*batches=*/2,
-      /*input=*/{TensorType_FLOAT32, {2, 10}},
-      /*weights=*/{TensorType_UINT8, {3, 10}, -63.5, 64});  // PIE
-
-  m.SetWeights({
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-  });
-  m.SetBias({1, 2, 3});
-
-  m.SetInput({
-      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
-      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
-  });
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
-                                 {
-                                     24, 25, 26,  //
-                                     58, 59, 60,  //
-                                 },
-                                 /*max_abs_error=*/1.3f)));
-}
-
-TEST_P(FloatFullyConnectedOpTest, SimpleTest4DInput) {
-  // Note that it is not required that the first dimension be the number of
-  // batches. All we care is that the input can be evenly distributed in
-  // batches. In this case, we need the input to have multiples of '2'.
-  FloatFullyConnectedOpModel m(GetRegistration(),
-                               /*units=*/3, /*batches=*/2,
-                               /*input=*/{TensorType_FLOAT32, {4, 1, 5, 1}});
-  m.SetWeights({
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-  });
-  m.SetBias({1, 2, 3});
-
-  m.SetInput({
-      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // first batch
-      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // second batch
-  });
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
-                                 24, 25, 26,  // first batch
-                                 58, 59, 60,  // second batch
-                             }));
-}
-
-TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) {
-  QuantizedFullyConnectedOpModel m(
-      GetRegistration(), /*units=*/3, /*batches=*/2,
-      /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
-      /*output=*/{TensorType_UINT8, {}, -127, 128});
-
-  // input_product_scale < output_scale was not true.
-  m.SetWeights({
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-  });
-  m.SetBias({1, 2, 3});
-
-  m.SetInput({
-      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
-      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
-  });
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
-              ElementsAreArray(ArrayFloatNear({
-                  24, 25, 26,  //
-                  58, 59, 60,  //
-              })));
-  EXPECT_THAT(m.GetOutput<uint8_t>(),
-              ElementsAre(151, 152, 153, 185, 186, 187));
-}
-
-TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) {
-  // real_multiplier = 2.
-  QuantizedFullyConnectedOpModel m(
-      GetRegistration(), /*units=*/3, /*batches=*/2,
-      /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128},
-      /*output=*/{TensorType_UINT8, {}, -63.5, 64});
-
-  m.SetWeights({
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
-  });
-  m.SetBias({1, 2, 3});
-
-  m.SetInput({
-      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
-      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
-  });
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
-              ElementsAreArray(ArrayFloatNear({
-                  24, 25, 26,  // first batch
-                  58, 59, 60,  // second batch
-              })));
-  EXPECT_THAT(m.GetOutput<uint8_t>(),
-              ElementsAre(175, 177, 179, 243, 245, 247));
-}
-
-INSTANTIATE_TEST_CASE_P(
-    FloatFullyConnectedOpTest, FloatFullyConnectedOpTest,
-    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
-
-INSTANTIATE_TEST_CASE_P(
-    QuantizedFullyConnectedOpTest, QuantizedFullyConnectedOpTest,
-    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMapNoPie)));
-
-// TODO(ahentz): Reconsider this test. Having arbitrary weights makes it hard
-// to debug errors and doesn't necessarily test all the important details.
-TEST_P(FloatFullyConnectedOpTest, BlackBoxTest) {
-  FloatFullyConnectedOpModel m(GetRegistration(), /*units=*/16, /*batches=*/2,
-                               /*input=*/{TensorType_FLOAT32, {2, 8}});
-  m.SetWeights(
-      {0.091327,  0.103366,  -0.316505, -0.083120, 0.149366,  -0.196636,
-       -0.123672, 0.062800,  0.063031,  0.191670,  -0.062001, -0.061504,
-       -0.275581, 0.059388,  -0.118497, -0.079224, 0.109758,  0.008307,
-       -0.062657, -0.060962, -0.049782, -0.106719, -0.319482, -0.103650,
-       0.266455,  0.051517,  -0.123448, 0.322464,  0.043282,  -0.173782,
-       -0.190381, 0.002013,  0.096086,  0.131157,  0.031164,  0.100638,
-       -0.312191, -0.080923, -0.101318, -0.116614, 0.142238,  0.086540,
-       -0.139154, 0.174268,  -0.073161, 0.080072,  0.006874,  0.229382,
-       -0.104321, -0.176035, -0.208587, -0.001019, -0.162032, 0.080824,
-       -0.025021, 0.074460,  -0.252595, -0.161750, -0.136403, 0.008308,
-       0.005710,  0.096600,  0.289839,  0.218816,  -0.304651, -0.070958,
-       0.054598,  0.147113,  -0.139112, -0.072798, -0.163335, -0.167863,
-       -0.128762, -0.035780, 0.117262,  0.017177,  0.263335,  -0.176612,
-       0.262961,  -0.093654, -0.339283, 0.333071,  0.180827,  0.287583,
-       0.066350,  -0.197947, -0.114449, -0.236035, 0.103532,  -0.034284,
-       0.093299,  -0.145361, 0.054001,  0.250570,  0.157010,  -0.143480,
-       -0.139061, -0.048873, 0.067557,  0.139038,  0.324106,  0.227041,
-       0.037793,  -0.225747, -0.241619, 0.357835,  0.135762,  -0.306764,
-       -0.125982, 0.091916,  0.266587,  0.030135,  0.265148,  0.141627,
-       0.020120,  0.083815,  -0.124556, -0.100124, -0.048159, 0.181172,
-       0.302309,  -0.041084, 0.146334,  -0.061511, -0.232605, 0.281324,
-       0.145408,  -0.221897});
-  m.SetBias({-0.160594, 0.205770, -0.078307, -0.077984, 0.001937, 0.015860,
-             0.036810, 0.012346, 0.001028, 0.038551, 0.075415, 0.020804,
-             0.048478, -0.032270, 0.175688, -0.085662});
-
-  const int input_sequence_size = sizeof(fully_connected_input) /
-                                  sizeof(float) /
-                                  (m.input_size() * m.num_batches());
-  for (int i = 0; i < input_sequence_size; i++) {
-    // TODO(ahentz): This is what the original test was doing: two equal
-    // batches per invocation. We could instead use two different batches.
-    float* batch_start = fully_connected_input + i * m.input_size();
-    float* batch_end = batch_start + m.input_size();
-    m.SetInput(0, batch_start, batch_end);
-    m.SetInput(m.input_size(), batch_start, batch_end);
-
-    m.Invoke();
-
-    float* golden_start = fully_connected_golden_output + i * m.num_units();
-    float* golden_end = golden_start + m.num_units();
-    std::vector<float> expected;
-    expected.insert(expected.end(), golden_start, golden_end);
-    expected.insert(expected.end(), golden_start, golden_end);
-
-    EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
-  }
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/gather.cc b/tensorflow/contrib/lite/kernels/gather.cc
deleted file mode 100644
index b5afeb1a7bd5528328bd5585d9696b3362cbe3a3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/gather.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <string.h>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/string_util.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace gather {
-constexpr int kInputTensor = 0;
-constexpr int kInputPositions = 1;
-constexpr int kOutputTensor = 0;
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  const auto* params =
-      reinterpret_cast<const TfLiteGatherParams*>(node->builtin_data);
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* positions = GetInput(context, node, kInputPositions);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  // Only INT32 positions are supported.
-  TF_LITE_ENSURE_EQ(context, positions->type, kTfLiteInt32);
-  // Assign to output the input type.
-  output->type = input->type;
-  // TODO(mgubin): Only default axis == 0 is supported.
-  TF_LITE_ENSURE_EQ(context, params->axis, 0);
-  // Check conditions for different types.
-  switch (input->type) {
-    case kTfLiteFloat32:
-    case kTfLiteUInt8:
-    case kTfLiteInt32: {
-      // Fully supported by reference_ops::Gather.
-    } break;
-
-    case kTfLiteString: {
-      // Only 1D input is supported.
-      TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
-    } break;
-    default:
-      context->ReportError(
-          context, "Only float32 and string types are supported, got %d",
-          input->type);
-      return kTfLiteError;
-  }
-  const int num_dimensions =
-      NumDimensions(input) + NumDimensions(positions) - 1;
-  TF_LITE_ENSURE(context, params->axis <= num_dimensions);
-  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions);
-  int output_index = 0;
-  for (int i = 0; i < params->axis; ++i) {
-    output_shape->data[output_index++] = input->dims->data[i];
-  }
-  for (int i = 0; i < positions->dims->size; ++i) {
-    output_shape->data[output_index++] = positions->dims->data[i];
-  }
-  for (int i = params->axis + 1; i < input->dims->size; ++i) {
-    output_shape->data[output_index++] = input->dims->data[i];
-  }
-  return context->ResizeTensor(context, output, output_shape);
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* positions = GetInput(context, node, kInputPositions);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  const int input_rank = NumDimensions(input);
-#define TF_LITE_GATHER(data_type, index_type)                              \
-  {                                                                        \
-    tflite::GatherParams op_params;                                        \
-    op_params.input_rank = input_rank;                                     \
-    optimized_ops::Gather(                                                 \
-        op_params, GetTensorShape(input), GetTensorData<data_type>(input), \
-        GetTensorShape(positions), GetTensorData<index_type>(positions),   \
-        GetTensorShape(output), GetTensorData<data_type>(output));         \
-  }
-  switch (input->type) {
-    case kTfLiteFloat32:
-      TF_LITE_GATHER(float, int32_t);
-      break;
-    case kTfLiteUInt8:
-      TF_LITE_GATHER(uint8_t, int32_t);
-      break;
-    case kTfLiteInt32:
-      TF_LITE_GATHER(int32_t, int32_t);
-      break;
-    case kTfLiteString: {
-      // TODO(mgubin): Currently support only for 1D output tensors.
-      DynamicBuffer buffer;
-      const int32* indexes = positions->data.i32;
-      const int num_strings = GetStringCount(input);
-      for (int i = 0; i < positions->dims->data[0]; ++i) {
-        const int pos = indexes[i];
-        TF_LITE_ENSURE(context, pos < num_strings);
-        const auto string_ref = GetString(input, pos);
-        buffer.AddString(string_ref.str, string_ref.len);
-      }
-      buffer.WriteToTensor(output);
-    } break;
-    default:
-      return kTfLiteError;
-  }
-#undef TF_LITE_GATHER
-  return kTfLiteOk;
-}
-}  // namespace gather
-
-TfLiteRegistration* Register_GATHER() {
-  static TfLiteRegistration r = {nullptr, nullptr, gather::Prepare,
-                                 gather::Eval};
-  return &r;
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/gather_test.cc b/tensorflow/contrib/lite/kernels/gather_test.cc
deleted file mode 100644
index 1b48884e0907c67919f65680ab2f096481551eb7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/gather_test.cc
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class GatherOpModel : public SingleOpModel {
- public:
-  GatherOpModel(std::initializer_list<int> input_shape, TensorType input_type,
-                std::initializer_list<int> positions_shape) {
-    input_ = AddInput(input_type);
-    positions_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(input_type);
-    SetBuiltinOp(BuiltinOperator_GATHER, BuiltinOptions_GatherOptions,
-                 CreateGatherOptions(builder_, 0).Union());
-    BuildInterpreter({input_shape, positions_shape});
-  }
-
-  void SetInputFloat(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
-  }
-
-  void SetInputUint8(std::initializer_list<uint8_t> data) {
-    PopulateTensor<uint8_t>(input_, data);
-  }
-
-  void SetInput(std::initializer_list<string> data) {
-    PopulateStringTensor(input_, data);
-  }
-
-  void SetPositions(std::initializer_list<int> data) {
-    PopulateTensor<int>(positions_, data);
-  }
-
-  std::vector<float> GetOutputFloat() { return ExtractVector<float>(output_); }
-  std::vector<uint8_t> GetOutputUint8() {
-    return ExtractVector<uint8_t>(output_);
-  }
-  std::vector<string> GetOutputString() {
-    return ExtractVector<string>(output_);
-  }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- protected:
-  int input_;
-  int positions_;
-  int output_;
-};
-
-TEST(GatherOpTest, Shuffle) {
-  GatherOpModel m({2, 2}, TensorType_FLOAT32, {2});
-  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
-  m.SetPositions({1, 0});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
-              ElementsAreArray(ArrayFloatNear({0.7, 0.8, -2, 0.2})));
-}
-
-TEST(GatherOpTest, Test0DIndex) {
-  GatherOpModel m({2, 2}, TensorType_FLOAT32, {});
-  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
-  m.SetPositions({1});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-}
-
-TEST(GatherOpTest, Test0DIndexWith0DResult) {
-  // 0D tensor is special case in current TFLite. Test it once to make sure
-  // existing workarounds are fine with it.
-  GatherOpModel m({3}, TensorType_FLOAT32, {});
-  m.SetInputFloat({1.0, 2.0, 3.0});
-  m.SetPositions({1});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0})));
-  EXPECT_TRUE(m.GetOutputShape().empty());
-}
-
-TEST(GatherOpTest, Test2DIndexWith2DResult) {
-  GatherOpModel m({3}, TensorType_FLOAT32, {1, 2});
-  m.SetInputFloat({1.0, 2.0, 3.0});
-  m.SetPositions({1, 0});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({2.0, 1.0})));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-}
-
-TEST(FloatGatherOpTest, Duplicate) {
-  GatherOpModel m({1, 2, 2}, TensorType_FLOAT32, {2});
-  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
-  m.SetPositions({0, 0});
-  m.Invoke();
-  EXPECT_THAT(
-      m.GetOutputFloat(),
-      ElementsAreArray(ArrayFloatNear({-2, 0.2, 0.7, 0.8, -2, 0.2, 0.7, 0.8})));
-}
-
-TEST(FloatGatherOpTest, Slice) {
-  GatherOpModel m({4, 1}, TensorType_FLOAT32, {2});
-  m.SetInputFloat({-2.0, 0.2, 0.7, 0.8});
-  m.SetPositions({1, 3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray(ArrayFloatNear({0.2, 0.8})));
-}
-
-TEST(Uint8tGatherOpTest, Shuffle) {
-  GatherOpModel m({2, 2}, TensorType_UINT8, {2});
-  m.SetInputUint8({133, 134, 14, 15});
-  m.SetPositions({1, 0});
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutputUint8(), ElementsAreArray({14, 15, 133, 134}));
-}
-
-TEST(GatherOpTest, SimpleString) {
-  GatherOpModel m({3}, TensorType_STRING, {2});
-  m.SetInput({"A", "B", "C"});
-  m.SetPositions({0, 2});
-  m.Invoke();
-  ASSERT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetOutputString(), ElementsAreArray({"A", "C"}));
-}
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
deleted file mode 100644
index afb5ec05df4429bc89acfd81b71b9c081f90dfc9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ /dev/null
@@ -1,678 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
-
-tflite_deps_intel = [
-    "@arm_neon_2_x86_sse",
-]
-
-HARD_FP_FLAGS_IF_APPLICABLE = select({
-    "//tensorflow:android_arm": ["-mfloat-abi=softfp"],
-    "//tensorflow:android_arm64": ["-mfloat-abi=softfp"],
-    "//tensorflow:android_armeabi": ["-mfloat-abi=softfp"],
-    "//conditions:default": [],
-})
-
-NEON_FLAGS_IF_APPLICABLE = select({
-    ":arm": [
-        "-O3",
-        "-mfpu=neon",
-    ],
-    ":armeabi-v7a": [
-        "-O3",
-        "-mfpu=neon",
-    ],
-    ":armv7a": [
-        "-O3",
-        "-mfpu=neon",
-    ],
-    "//conditions:default": [
-        "-O3",
-    ],
-})
-
-cc_library(
-    name = "types",
-    srcs = [],
-    hdrs = [
-        "compatibility.h",
-        "types.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite/kernels:op_macros",
-        "@com_google_absl//absl/base:core_headers",
-    ],
-)
-
-config_setting(
-    name = "arm",
-    values = {
-        "cpu": "arm",
-    },
-)
-
-config_setting(
-    name = "arm64-v8a",
-    values = {
-        "cpu": "arm64-v8a",
-    },
-)
-
-config_setting(
-    name = "armv7a",
-    values = {
-        "cpu": "armv7a",
-    },
-)
-
-config_setting(
-    name = "armeabi-v7a",
-    values = {
-        "cpu": "armeabi-v7a",
-    },
-)
-
-config_setting(
-    name = "haswell",
-    values = {
-        "cpu": "haswell",
-    },
-)
-
-config_setting(
-    name = "ios_x86_64",
-    values = {
-        "cpu": "ios_x86_64",
-    },
-)
-
-config_setting(
-    name = "ios_armv7",
-    values = {
-        "cpu": "ios_armv7",
-    },
-)
-
-config_setting(
-    name = "ios_arm64",
-    values = {
-        "cpu": "ios_arm64",
-    },
-)
-
-config_setting(
-    name = "k8",
-    values = {
-        "cpu": "k8",
-    },
-)
-
-config_setting(
-    name = "x86",
-    values = {
-        "cpu": "x86",
-    },
-)
-
-config_setting(
-    name = "x86_64",
-    values = {
-        "cpu": "x86_64",
-    },
-)
-
-config_setting(
-    name = "darwin",
-    values = {
-        "cpu": "darwin",
-    },
-)
-
-config_setting(
-    name = "darwin_x86_64",
-    values = {
-        "cpu": "darwin_x86_64",
-    },
-)
-
-config_setting(
-    name = "freebsd",
-    values = {
-        "cpu": "freebsd",
-    },
-)
-
-cc_library(
-    name = "optimized_base",
-    srcs = [],
-    hdrs = [
-        "common.h",
-        "optimized/depthwiseconv_float.h",
-        "optimized/depthwiseconv_uint8.h",
-        "optimized/depthwiseconv_uint8_3x3_filter.h",
-        "optimized/optimized_ops.h",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        ":quantization_util",
-        ":strided_slice_logic",
-        ":types",
-        ":reference_base",
-        ":round",
-        ":tensor_utils",
-        "//third_party/eigen3",
-        "@gemmlowp",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ] + select({
-        ":haswell": tflite_deps_intel,
-        ":ios_x86_64": tflite_deps_intel,
-        ":k8": tflite_deps_intel,
-        ":x86": tflite_deps_intel,
-        ":x86_64": tflite_deps_intel,
-        ":darwin": tflite_deps_intel,
-        ":darwin_x86_64": tflite_deps_intel,
-        ":freebsd": tflite_deps_intel,
-        "//conditions:default": [],
-    }),
-)
-
-cc_library(
-    name = "legacy_optimized_base",
-    srcs = [],
-    hdrs = [
-        "common.h",
-        "optimized/depthwiseconv_float.h",
-        "optimized/depthwiseconv_uint8.h",
-        "optimized/depthwiseconv_uint8_3x3_filter.h",
-        "optimized/legacy_optimized_ops.h",
-        "optimized/optimized_ops.h",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        ":quantization_util",
-        ":strided_slice_logic",
-        ":tensor_utils",
-        ":types",
-        ":legacy_reference_base",
-        ":round",
-        "//third_party/eigen3",
-        "@gemmlowp",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ] + select({
-        ":haswell": tflite_deps_intel,
-        ":ios_x86_64": tflite_deps_intel,
-        ":k8": tflite_deps_intel,
-        ":x86": tflite_deps_intel,
-        ":x86_64": tflite_deps_intel,
-        ":darwin": tflite_deps_intel,
-        ":darwin_x86_64": tflite_deps_intel,
-        ":freebsd": tflite_deps_intel,
-        "//conditions:default": [],
-    }),
-)
-
-cc_library(
-    name = "optimized",
-    hdrs = [
-        "optimized/cblas_conv.h",
-        "optimized/cblas_reference.h",
-        "optimized/eigen_spatial_convolutions.h",
-        "optimized/eigen_tensor_reduced_instantiations_oss.h",
-        "optimized/multithreaded_conv.h",
-        # FIXME(petewarden) - This should be removed, since it's a header from the
-        # :tensor dependency below.
-        "tensor.h",
-    ],
-    deps = [
-        ":optimized_base",
-        ":tensor",
-        ":types",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//third_party/eigen3",
-    ],
-)
-
-cc_test(
-    name = "tensor_test",
-    srcs = ["tensor_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":tensor",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "round",
-    srcs = [],
-    hdrs = ["round.h"],
-)
-
-cc_library(
-    name = "quantization_util",
-    srcs = ["quantization_util.cc"],
-    hdrs = [
-        "compatibility.h",
-        "quantization_util.h",
-    ],
-    deps = [
-        ":round",
-        ":types",
-        "//tensorflow/contrib/lite/kernels:op_macros",
-    ],
-)
-
-cc_test(
-    name = "quantization_util_test",
-    srcs = ["quantization_util_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":quantization_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "strided_slice_logic",
-    srcs = [],
-    hdrs = [
-        "strided_slice_logic.h",
-    ],
-    deps = [
-        ":types",
-    ],
-)
-
-cc_library(
-    name = "reference_base",
-    srcs = [],
-    hdrs = [
-        "common.h",
-        "reference/depthwiseconv_float.h",
-        "reference/depthwiseconv_uint8.h",
-        "reference/fully_connected.h",
-        "reference/reference_ops.h",
-        "reference/softmax.h",
-    ],
-    deps = [
-        ":quantization_util",
-        ":round",
-        ":strided_slice_logic",
-        ":types",
-        "@gemmlowp",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:op_macros",
-    ] + select({
-        ":haswell": tflite_deps_intel,
-        ":ios_x86_64": tflite_deps_intel,
-        ":k8": tflite_deps_intel,
-        ":x86": tflite_deps_intel,
-        ":x86_64": tflite_deps_intel,
-        ":darwin": tflite_deps_intel,
-        ":darwin_x86_64": tflite_deps_intel,
-        ":freebsd": tflite_deps_intel,
-        "//conditions:default": [],
-    }),
-)
-
-cc_library(
-    name = "legacy_reference_base",
-    srcs = [],
-    hdrs = [
-        "common.h",
-        "reference/depthwiseconv_float.h",
-        "reference/depthwiseconv_uint8.h",
-        "reference/fully_connected.h",
-        "reference/legacy_reference_ops.h",
-        "reference/reference_ops.h",
-        "reference/softmax.h",
-    ],
-    deps = [
-        ":quantization_util",
-        ":round",
-        ":strided_slice_logic",
-        ":types",
-        "@gemmlowp",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:op_macros",
-    ] + select({
-        ":haswell": tflite_deps_intel,
-        ":ios_x86_64": tflite_deps_intel,
-        ":k8": tflite_deps_intel,
-        ":x86": tflite_deps_intel,
-        ":x86_64": tflite_deps_intel,
-        ":darwin": tflite_deps_intel,
-        ":darwin_x86_64": tflite_deps_intel,
-        ":freebsd": tflite_deps_intel,
-        "//conditions:default": [],
-    }),
-)
-
-cc_library(
-    name = "tensor",
-    hdrs = [
-        "tensor.h",
-        "tensor_ctypes.h",
-    ],
-    deps = [
-        ":types",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ],
-)
-
-# Deprecated version of :tensor, kept for backwards compatibility.
-cc_library(
-    name = "reference",
-    hdrs = [
-        "tensor.h",
-        "tensor_ctypes.h",
-    ],
-    deps = [
-        ":types",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ],
-)
-
-cc_library(
-    name = "portable_tensor_utils",
-    srcs = [
-        "reference/portable_tensor_utils.cc",
-    ],
-    hdrs = [
-        "reference/portable_tensor_utils.h",
-    ],
-    deps = [
-        ":round",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:activation_functor",
-        "//tensorflow/contrib/lite/kernels:op_macros",
-    ],
-)
-
-cc_library(
-    name = "neon_tensor_utils",
-    srcs = [
-        "optimized/neon_tensor_utils.cc",
-        "reference/portable_tensor_utils.cc",
-        "reference/portable_tensor_utils.h",
-    ],
-    hdrs = [
-        "common.h",
-        "compatibility.h",
-        "optimized/cpu_check.h",
-        "optimized/neon_tensor_utils.h",
-        "optimized/tensor_utils_impl.h",
-    ],
-    copts = NEON_FLAGS_IF_APPLICABLE + HARD_FP_FLAGS_IF_APPLICABLE,
-    deps = [
-        ":cpu_check",
-        ":round",
-        ":types",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:activation_functor",
-        "//tensorflow/contrib/lite/kernels:op_macros",
-        "@arm_neon_2_x86_sse",
-        "@gemmlowp",
-    ],
-)
-
-cc_library(
-    name = "kernel_utils",
-    srcs = ["kernel_utils.cc"],
-    hdrs = ["kernel_utils.h"],
-    deps = [
-        ":tensor_utils",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-    ],
-)
-
-# Audio support classes imported directly from TensorFlow.
-cc_library(
-    name = "audio_utils",
-    srcs = [
-        "mfcc.cc",
-        "mfcc_dct.cc",
-        "mfcc_mel_filterbank.cc",
-        "spectrogram.cc",
-    ],
-    hdrs = [
-        "mfcc.h",
-        "mfcc_dct.h",
-        "mfcc_mel_filterbank.h",
-        "spectrogram.h",
-    ],
-    deps = [
-        "//third_party/fft2d:fft2d_headers",
-        "@fft2d",
-    ],
-)
-
-cc_library(
-    name = "tensor_utils",
-    srcs = [
-        "tensor_utils.cc",
-    ],
-    hdrs = [
-        "common.h",
-        "compatibility.h",
-        "optimized/cpu_check.h",
-        "optimized/neon_tensor_utils.h",
-        "optimized/tensor_utils_impl.h",
-        "reference/portable_tensor_utils.h",
-        "tensor_utils.h",
-        "types.h",
-    ],
-    copts = NEON_FLAGS_IF_APPLICABLE,
-    deps = [
-        "@com_google_absl//absl/base:core_headers",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "@arm_neon_2_x86_sse",
-        "//tensorflow/contrib/lite/kernels:op_macros",
-        "@gemmlowp",
-    ] + select({
-        ":arm": [
-            ":neon_tensor_utils",
-        ],
-        ":arm64-v8a": [
-            ":neon_tensor_utils",
-        ],
-        ":armeabi-v7a": [
-            ":neon_tensor_utils",
-        ],
-        ":armv7a": [
-            ":neon_tensor_utils",
-        ],
-        ":haswell": [
-            ":neon_tensor_utils",
-        ],
-        ":ios_armv7": [
-            ":neon_tensor_utils",
-        ],
-        ":ios_arm64": [
-            ":neon_tensor_utils",
-        ],
-        ":ios_x86_64": [
-            ":neon_tensor_utils",
-        ],
-        ":x86_64": [
-            ":neon_tensor_utils",
-        ],
-        ":x86": [
-            ":neon_tensor_utils",
-        ],
-        ":k8": [
-            ":neon_tensor_utils",
-        ],
-        ":darwin": [
-            ":neon_tensor_utils",
-        ],
-        ":darwin_x86_64": [
-            ":neon_tensor_utils",
-        ],
-        "//conditions:default": [
-            ":portable_tensor_utils",
-        ],
-    }),
-)
-
-cc_library(
-    name = "test_util",
-    srcs = ["test_util.cc"],
-    hdrs = ["test_util.h"],
-    deps = [
-        ":types",
-        "//tensorflow/contrib/lite:string",
-    ],
-)
-
-cc_test(
-    name = "tensor_utils_test",
-    srcs = ["tensor_utils_test.cc"],
-    copts = NEON_FLAGS_IF_APPLICABLE,
-    linkopts = select({
-        "//tensorflow:android": [
-            "-fPIE -pie",
-        ],
-        "//conditions:default": [],
-    }),
-    linkstatic = 1,
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":tensor_utils",
-        "//tensorflow/contrib/lite/c:c_api_internal",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_test(
-    name = "depthwiseconv_float_test",
-    srcs = ["depthwiseconv_float_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":optimized_base",
-        ":reference_base",
-        ":test_util",
-        ":types",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_test(
-    name = "depthwiseconv_quantized_test",
-    srcs = ["depthwiseconv_quantized_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":optimized_base",
-        ":reference_base",
-        ":test_util",
-        ":types",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_test(
-    name = "resize_bilinear_test",
-    srcs = ["resize_bilinear_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
-    deps = [
-        ":optimized_base",
-        ":reference_base",
-        ":test_util",
-        ":types",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_test(
-    name = "softmax_quantized_test",
-    timeout = "long",
-    srcs = [
-        "softmax_quantized_test.cc",
-    ],
-    tags = ["no_oss"],
-    deps = [
-        ":optimized_base",
-        ":quantization_util",
-        ":reference_base",
-        ":test_util",
-        "//tensorflow/contrib/lite:string",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_test(
-    name = "logsoftmax_quantized_test",
-    timeout = "long",
-    srcs = [
-        "logsoftmax_quantized_test.cc",
-    ],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
-    deps = [
-        ":optimized_base",
-        ":quantization_util",
-        ":reference_base",
-        ":test_util",
-        "//tensorflow/contrib/lite:string",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_test(
-    name = "log_quantized_test",
-    srcs = ["log_quantized_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":optimized_base",
-        ":reference_base",
-        "//tensorflow/contrib/lite:string",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "cpu_check",
-    hdrs = [
-        "optimized/cpu_check.h",
-    ],
-    deps = [
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "@androidndk//:cpufeatures",
-            ],
-            "//conditions:default": [],
-        },
-    ),
-)
-
-cc_test(
-    name = "batch_to_space_nd_test",
-    srcs = ["batch_to_space_nd_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":optimized_base",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
-
-tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc b/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc
deleted file mode 100644
index 5a2901ac8c297265e542cc30d3127fe774c19e78..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/batch_to_space_nd_test.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-
-#include <gtest/gtest.h>
-
-namespace tflite {
-namespace {
-
-// A light wrapper of GetIndexRange which returns a pair of start / end
-// indices.
-std::pair<int, int> GetIndexRange(int spatial_index_dim, int block_shape_dim,
-                                  int input_dim, int output_dim) {
-  int index_start = 0;
-  int index_end = 0;
-  optimized_ops::GetIndexRange(spatial_index_dim, block_shape_dim, input_dim,
-                               output_dim, &index_start, &index_end);
-  return {index_start, index_end};
-}
-
-TEST(BatchToSpaceNDTest, TestIndexRange) {
-  // Simple test case, no cropping.
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/6,
-                          /*input_dim=*/1, /*output_dim=*/6),
-            std::make_pair(0, 1));
-
-  // No cropping and input_dim > 1.
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/2, /*block_shape_dim=*/6,
-                          /*input_dim=*/5, /*output_dim=*/30),
-            std::make_pair(0, 5));
-
-  // With small cropping values (can be either at the beginning or at the end).
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/2,
-                          /*input_dim=*/3, /*output_dim=*/4),
-            std::make_pair(0, 2));
-
-  // With positive cropping values at the beginning.
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-2, /*block_shape_dim=*/2,
-                          /*input_dim=*/3, /*output_dim=*/4),
-            std::make_pair(1, 3));
-
-  // Large crop at the beginning.
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
-                          /*input_dim=*/7, /*output_dim=*/5),
-            std::make_pair(6, 7));
-
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-26, /*block_shape_dim=*/5,
-                          /*input_dim=*/7, /*output_dim=*/5),
-            std::make_pair(6, 7));
-
-  // Large crop at the end.
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
-                          /*input_dim=*/7, /*output_dim=*/5),
-            std::make_pair(0, 1));
-
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/4, /*block_shape_dim=*/5,
-                          /*input_dim=*/7, /*output_dim=*/5),
-            std::make_pair(0, 1));
-
-  // Rounding up incorrectly will fail this test.
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/5,
-                          /*input_dim=*/7, /*output_dim=*/5),
-            std::make_pair(0, 1));
-
-  // Extreme cropping with output of a single spatial location.
-  // Valid position 1, when large crop at the end.
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
-                          /*input_dim=*/7, /*output_dim=*/1),
-            std::make_pair(0, 1));
-
-  // Valid position 2, when large crop at the beginning.
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
-                          /*input_dim=*/7, /*output_dim=*/1),
-            std::make_pair(6, 7));
-
-  // Invalid positions.
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/1, /*block_shape_dim=*/5,
-                          /*input_dim=*/7, /*output_dim=*/1),
-            std::make_pair(0, 0));
-  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-29, /*block_shape_dim=*/5,
-                          /*input_dim=*/7, /*output_dim=*/1),
-            std::make_pair(6, 6));
-}
-
-}  // namespace
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h
deleted file mode 100644
index e67fee11b8d24d386d3b7c5efa4b07463fb8024a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/common.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
-
-#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
-#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#endif
-#endif
-
-#ifndef USE_NEON
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define USE_NEON
-#include <arm_neon.h>
-#endif
-
-#if defined __GNUC__ && defined __SSE4_1__
-#define USE_NEON
-
-#define OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#pragma GCC diagnostic ignored "-Wattributes"
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wnarrowing"
-#pragma GCC diagnostic ignored "-Wsequence-point"
-
-#include "NEON_2_SSE.h"
-
-#pragma GCC diagnostic pop
-#endif
-#endif
-
-#include "fixedpoint/fixedpoint.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-namespace tflite {
-
-inline void GetActivationMinMax(FusedActivationFunctionType ac,
-                                float* output_activation_min,
-                                float* output_activation_max) {
-  switch (ac) {
-    case FusedActivationFunctionType::kNone:
-      *output_activation_min = std::numeric_limits<float>::lowest();
-      *output_activation_max = std::numeric_limits<float>::max();
-      break;
-    case FusedActivationFunctionType::kRelu:
-      *output_activation_min = 0.f;
-      *output_activation_max = std::numeric_limits<float>::max();
-      break;
-    case FusedActivationFunctionType::kRelu1:
-      *output_activation_min = -1.f;
-      *output_activation_max = 1.f;
-      break;
-    case FusedActivationFunctionType::kRelu6:
-      *output_activation_min = 0.f;
-      *output_activation_max = 6.f;
-      break;
-  }
-}
-
-inline float ActivationFunctionWithMinMax(float x, float output_activation_min,
-                                          float output_activation_max) {
-  return std::min(std::max(x, output_activation_min), output_activation_max);
-}
-
-// Legacy function, left for compatibility only.
-template <FusedActivationFunctionType Ac>
-float ActivationFunction(float x) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-  return ActivationFunctionWithMinMax(x, output_activation_min,
-                                      output_activation_max);
-}
-
-inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp(
-    int32 x, int32 quantized_multiplier, int left_shift) {
-  using gemmlowp::RoundingDivideByPOT;
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  return RoundingDivideByPOT(
-      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
-}
-
-inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
-    int32 x, int32 quantized_multiplier, int left_shift) {
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
-                                           quantized_multiplier);
-}
-
-inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
-                                           int shift) {
-  using gemmlowp::RoundingDivideByPOT;
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  int left_shift = shift > 0 ? shift : 0;
-  int right_shift = shift > 0 ? 0 : -shift;
-  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
-                                 x * (1 << left_shift), quantized_multiplier),
-                             right_shift);
-}
-
-template <typename T>
-int CountLeadingZeros(T integer_input) {
-  static_assert(std::is_unsigned<T>::value,
-                "Only unsigned integer types handled.");
-#if defined(__GNUC__)
-  return integer_input ? __builtin_clz(integer_input) : 0;
-#else
-  const T one_in_leading_positive = static_cast<T>(1)
-                                    << (std::numeric_limits<T>::digits - 1);
-  int leading_zeros = 0;
-  while (integer_input < one_in_leading_positive) {
-    integer_input <<= 1;
-    ++leading_zeros;
-  }
-  return leading_zeros;
-#endif
-}
-
-// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
-// BROADCASTING.
-//
-// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
-// rectangular array of numbers.
-//
-// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
-// However, as Dims<N> is to be deprecated, this class exists as an adaptor
-// to enable simple unoptimized implementations of element-wise broadcasting
-// operations.
-template <int N>
-struct NdArrayDesc {
-  // The "extent" of each dimension. Indices along dimension d must be in the
-  // half-open interval [0, extents[d]).
-  int extents[N];
-
-  // The number of *elements* (not bytes) between consecutive indices of each
-  // dimension.
-  int strides[N];
-};
-
-// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
-// BROADCASTING.
-//
-// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
-inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
-                            int i3) {
-  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
-  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
-  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
-  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
-  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
-         i3 * desc.strides[3];
-}
-
-// Given the dimensions of the operands for an element-wise binary broadcast,
-// adjusts them so that they can be directly iterated over with simple loops.
-// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
-// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
-//
-// This function assumes that the two input shapes are compatible up to
-// broadcasting and the shorter one has already been prepended with 1s to be the
-// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
-// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
-// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
-// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
-//
-// When two shapes are compatible up to broadcasting, for each dimension d,
-// the input extents are either equal, or one of them is 1.
-//
-// This function performs the following for each dimension d:
-// - If the extents are equal, then do nothing since the loop that walks over
-//   both of the input arrays is correct.
-// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
-//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
-//   array0 to be referenced *at any index* in dimension d and still access the
-//   same slice.
-template <int N>
-inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
-                                                const Dims<N>& input1_dims,
-                                                NdArrayDesc<N>* desc0_out,
-                                                NdArrayDesc<N>* desc1_out) {
-  TFLITE_DCHECK(desc0_out != nullptr);
-  TFLITE_DCHECK(desc1_out != nullptr);
-
-  // Copy dims to desc.
-  for (int i = 0; i < N; ++i) {
-    desc0_out->extents[i] = input0_dims.sizes[i];
-    desc0_out->strides[i] = input0_dims.strides[i];
-    desc1_out->extents[i] = input1_dims.sizes[i];
-    desc1_out->strides[i] = input1_dims.strides[i];
-  }
-
-  // Walk over each dimension. If the extents are equal do nothing.
-  // Otherwise, set the desc with extent 1 to have extent equal to the other and
-  // stride 0.
-  for (int i = 0; i < N; ++i) {
-    const int extent0 = ArraySize(input0_dims, i);
-    const int extent1 = ArraySize(input1_dims, i);
-    if (extent0 != extent1) {
-      if (extent0 == 1) {
-        desc0_out->strides[i] = 0;
-        desc0_out->extents[i] = extent1;
-      } else {
-        TFLITE_DCHECK_EQ(extent1, 1);
-        desc1_out->strides[i] = 0;
-        desc1_out->extents[i] = extent0;
-      }
-    }
-  }
-}
-
-template <int N>
-inline void NdArrayDescsForElementwiseBroadcast(
-    const RuntimeShape& input0_shape, const RuntimeShape& input1_shape,
-    NdArrayDesc<N>* desc0_out, NdArrayDesc<N>* desc1_out) {
-  TFLITE_DCHECK(desc0_out != nullptr);
-  TFLITE_DCHECK(desc1_out != nullptr);
-
-  auto extended_input0_shape = RuntimeShape::ExtendedShape(N, input0_shape);
-  auto extended_input1_shape = RuntimeShape::ExtendedShape(N, input1_shape);
-
-  // Copy dims to desc, calculating strides.
-  int desc0_stride = 1;
-  int desc1_stride = 1;
-  for (int i = N - 1; i >= 0; --i) {
-    desc0_out->extents[i] = extended_input0_shape.Dims(i);
-    desc0_out->strides[i] = desc0_stride;
-    desc0_stride *= extended_input0_shape.Dims(i);
-    desc1_out->extents[i] = extended_input1_shape.Dims(i);
-    desc1_out->strides[i] = desc1_stride;
-    desc1_stride *= extended_input1_shape.Dims(i);
-  }
-
-  // Walk over each dimension. If the extents are equal do nothing.
-  // Otherwise, set the desc with extent 1 to have extent equal to the other and
-  // stride 0.
-  for (int i = 0; i < N; ++i) {
-    const int extent0 = extended_input0_shape.Dims(i);
-    const int extent1 = extended_input1_shape.Dims(i);
-    if (extent0 != extent1) {
-      if (extent0 == 1) {
-        desc0_out->strides[i] = 0;
-        desc0_out->extents[i] = extent1;
-      } else {
-        TFLITE_DCHECK_EQ(extent1, 1);
-        desc1_out->strides[i] = 0;
-        desc1_out->extents[i] = extent0;
-      }
-    }
-  }
-}
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
deleted file mode 100644
index 7c176e0fa1c8e8c8b6a094dbeb1025f2be091b3d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/compatibility.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
-
-#include <cstdint>
-
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-#ifndef TFLITE_DCHECK
-#define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE
-#endif
-
-#ifndef TFLITE_DCHECK_EQ
-#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ASSERT_FALSE
-#endif
-
-#ifndef TFLITE_DCHECK_NE
-#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ASSERT_FALSE
-#endif
-
-#ifndef TFLITE_DCHECK_GE
-#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
-#endif
-
-#ifndef TFLITE_DCHECK_GT
-#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ASSERT_FALSE
-#endif
-
-#ifndef TFLITE_DCHECK_LE
-#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
-#endif
-
-#ifndef TFLITE_DCHECK_LT
-#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ASSERT_FALSE
-#endif
-
-// TODO(ahentz): Clean up: We should stick to the DCHECK versions.
-#ifndef TFLITE_CHECK
-#define TFLITE_CHECK(condition) (condition) ? (void)0 : TFLITE_ABORT
-#endif
-
-#ifndef TFLITE_CHECK_EQ
-#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ABORT
-#endif
-
-#ifndef TFLITE_CHECK_NE
-#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ABORT
-#endif
-
-#ifndef TFLITE_CHECK_GE
-#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ABORT
-#endif
-
-#ifndef TFLITE_CHECK_GT
-#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ABORT
-#endif
-
-#ifndef TFLITE_CHECK_LE
-#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ABORT
-#endif
-
-#ifndef TFLITE_CHECK_LT
-#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
-#endif
-
-// TODO(ahentz): Clean up.
-using int8 = std::int8_t;
-using uint8 = std::uint8_t;
-using int16 = std::int16_t;
-using uint16 = std::uint16_t;
-using int32 = std::int32_t;
-using uint32 = std::uint32_t;
-
-// TFLITE_DEPRECATED()
-//
-// Duplicated from absl/base/macros.h to avoid pulling in that library.
-// Marks a deprecated class, struct, enum, function, method and variable
-// declarations. The macro argument is used as a custom diagnostic message (e.g.
-// suggestion of a better alternative).
-//
-// Example:
-//
-//   class TFLITE_DEPRECATED("Use Bar instead") Foo {...};
-//   TFLITE_DEPRECATED("Use Baz instead") void Bar() {...}
-//
-// Every usage of a deprecated entity will trigger a warning when compiled with
-// clang's `-Wdeprecated-declarations` option. This option is turned off by
-// default, but the warnings will be reported by clang-tidy.
-#if defined(__clang__) && __cplusplus >= 201103L
-#define TFLITE_DEPRECATED(message) __attribute__((deprecated(message)))
-#endif
-
-#ifndef TFLITE_DEPRECATED
-#define TFLITE_DEPRECATED(message)
-#endif
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/contrib/lite/kernels/internal/depthwiseconv_quantized_test.cc
deleted file mode 100644
index 9414e109c302510a6fd434b410f0cbb575023e76..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/depthwiseconv_quantized_test.cc
+++ /dev/null
@@ -1,349 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <sys/types.h>
-#include <algorithm>
-#include <cmath>
-#include <cstdint>
-#include <cstdlib>
-#include <iterator>
-#include <limits>
-#include <vector>
-
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/kernels/internal/test_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h"
-
-namespace tflite {
-namespace {
-
-// Runs the DepthwiseConv and compares against the reference implementation.
-template <FusedActivationFunctionType Ac>
-int TestOneDepthwiseConvWithGivenOutputShift(
-    const std::uint8_t* input_data, const RuntimeShape& input_shape,
-    std::int32_t input_offset, const std::uint8_t* filter_data,
-    const RuntimeShape& filter_shape, std::int32_t filter_offset,
-    const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride,
-    int pad_width, int pad_height, int depth_multiplier,
-    std::int32_t output_offset, std::int32_t output_multiplier,
-    int output_shift, std::int32_t output_activation_min,
-    std::int32_t output_activation_max, const RuntimeShape& output_shape) {
-  const int output_buffer_size = output_shape.FlatSize();
-  std::vector<std::uint8_t> output_data(output_buffer_size);
-  std::vector<std::uint8_t> reference_output_data(output_buffer_size);
-
-  tflite::DepthwiseParams op_params;
-  op_params.padding_type = PaddingType::kSame;
-  op_params.padding_values.width = pad_width;
-  op_params.padding_values.height = pad_height;
-  op_params.stride_width = stride;
-  op_params.stride_height = stride;
-  op_params.dilation_width_factor = 1;
-  op_params.dilation_height_factor = 1;
-  op_params.depth_multiplier = depth_multiplier;
-  op_params.quantized_activation_min = output_activation_min;
-  op_params.quantized_activation_max = output_activation_max;
-  op_params.input_offset = input_offset;
-  op_params.weights_offset = filter_offset;
-  op_params.output_offset = output_offset;
-  op_params.output_multiplier = output_multiplier;
-  op_params.output_shift = -output_shift;
-  reference_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape,
-                               filter_data, bias_shape, bias_data, output_shape,
-                               reference_output_data.data());
-  optimized_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape,
-                               filter_data, bias_shape, bias_data, output_shape,
-                               output_data.data());
-  int saturated_min = 0;
-  int saturated_max = 0;
-  std::vector<int> diff(output_buffer_size);
-  std::int64_t sum_diff = 0;
-  std::int64_t sum_abs_diff = 0;
-  for (int i = 0; i < output_buffer_size; i++) {
-    diff[i] = static_cast<int>(output_data[i]) -
-              static_cast<int>(reference_output_data[i]);
-    sum_diff += diff[i];
-    sum_abs_diff += std::abs(diff[i]);
-    saturated_min += output_data[i] == output_activation_min;
-    saturated_max += output_data[i] == output_activation_max;
-  }
-  // These stats help understand test failures.
-  std::sort(std::begin(diff), std::end(diff));
-  const int min_diff = diff.front();
-  const int max_diff = diff.back();
-  const int median_diff = diff[diff.size() / 2];
-  const float mean_diff = static_cast<float>(sum_diff) / output_buffer_size;
-  const float mean_abs_diff =
-      static_cast<float>(sum_abs_diff) / output_buffer_size;
-  // Normally we should require bit-for-bit exact results. Unfortunately a bug
-  // in the Intel arm_neon_sse.h translation header that we use for x86 tests
-  // causes 1-bit inaccuracy in
-  // the vqrdmulh_n_s32 intrinsic, which causes off-by-1 errors in quantized
-  // DepthwiseConv ops. So we have to live with a few off-by-one errors for now,
-  // yet still ensure that no more than a small minority of values are wrong.
-  EXPECT_TRUE(std::abs(mean_diff) < 1e-5f && mean_abs_diff < 1e-5f &&
-              std::abs(median_diff) == 0 && std::abs(min_diff) <= 1 &&
-              std::abs(max_diff) <= 1);
-  if (saturated_min > 2 * saturated_max) {
-    return -1;
-  }
-  if (saturated_max > 2 * saturated_min) {
-    return 1;
-  }
-  return 0;
-}
-
-// The point of this function is that we can't practically know which
-// output_shift value to pass to test DepthwiseConv. It's not easy to guess (we
-// could do some
-// statistics for large size, but they would be fragile at smaller sizes), and
-// guessing wrong would mean that all the values get saturated so the test
-// becomes
-// vacuous. So we just bisect our way to reasonable output_shift values.
-template <FusedActivationFunctionType Ac>
-void TestOneDepthwiseConvBisectOutputShift(
-    const std::uint8_t* input_data, const RuntimeShape& input_shape,
-    std::int32_t input_offset, const std::uint8_t* filter_data,
-    const RuntimeShape& filter_shape, std::int32_t filter_offset,
-    const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride,
-    int pad_width, int pad_height, int depth_multiplier,
-    std::int32_t output_offset, std::int32_t output_multiplier,
-    int output_activation_bisect_start, int output_activation_bisect_end,
-    std::int32_t output_activation_min, std::int32_t output_activation_max,
-    const RuntimeShape& output_shape) {
-  ASSERT_LT(output_activation_bisect_start, output_activation_bisect_end)
-      << "Bisection failed ?!?!";
-  int output_shift_bisect_midpoint =
-      (output_activation_bisect_start + output_activation_bisect_end) / 2;
-  int bisect_result = TestOneDepthwiseConvWithGivenOutputShift<Ac>(
-      input_data, input_shape, input_offset, filter_data, filter_shape,
-      filter_offset, bias_data, bias_shape, stride, pad_width, pad_height,
-      depth_multiplier, output_offset, output_multiplier,
-      output_shift_bisect_midpoint, output_activation_min,
-      output_activation_max, output_shape);
-  // At this point we know that the test succeeded (otherwise it would have
-  // aborted).
-  if (bisect_result == 0) {
-    // The result isn't particularly saturated on one or the other side.
-    // All good, we're done.
-    return;
-  }
-  if (output_activation_bisect_start == output_activation_bisect_end - 1) {
-    // There is still some saturation on one side, but the bisection is
-    // finished anyways. We're done; nothing more we can do about it. This
-    // happens
-    // in particular when using an activation with a narrow range.
-    return;
-  }
-  // Continue the bisection based on the present result.
-  int new_output_activation_bisect_start = bisect_result == 1
-                                               ? output_shift_bisect_midpoint
-                                               : output_activation_bisect_start;
-  int new_output_activation_bisect_end = bisect_result == 1
-                                             ? output_activation_bisect_end
-                                             : output_shift_bisect_midpoint;
-  TestOneDepthwiseConvBisectOutputShift<Ac>(
-      input_data, input_shape, input_offset, filter_data, filter_shape,
-      filter_offset, bias_data, bias_shape, stride, pad_width, pad_height,
-      depth_multiplier, output_offset, output_multiplier,
-      new_output_activation_bisect_start, new_output_activation_bisect_end,
-      output_activation_min, output_activation_max, output_shape);
-}
-
-template <FusedActivationFunctionType Ac>
-void TestOneDepthwiseConv(
-    const std::uint8_t* input_data, const RuntimeShape& input_shape,
-    std::int32_t input_offset, const std::uint8_t* filter_data,
-    const RuntimeShape& filter_shape, std::int32_t filter_offset,
-    const std::int32_t* bias_data, const RuntimeShape& bias_shape, int stride,
-    int pad_width, int pad_height, int depth_multiplier,
-    std::int32_t output_offset, std::int32_t output_multiplier,
-    std::int32_t output_activation_min, std::int32_t output_activation_max,
-    const RuntimeShape& output_shape) {
-  TestOneDepthwiseConvBisectOutputShift<Ac>(
-      input_data, input_shape, input_offset, filter_data, filter_shape,
-      filter_offset, bias_data, bias_shape, stride, pad_width, pad_height,
-      depth_multiplier, output_offset, output_multiplier, 0, 32,
-      output_activation_min, output_activation_max, output_shape);
-}
-
-void TestOneDepthwiseConv(
-    FusedActivationFunctionType Ac, const std::uint8_t* input_data,
-    const RuntimeShape& input_shape, std::int32_t input_offset,
-    const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
-    std::int32_t filter_offset, const std::int32_t* bias_data,
-    const RuntimeShape& bias_shape, int stride, int pad_width, int pad_height,
-    int depth_multiplier, std::int32_t output_offset,
-    std::int32_t output_multiplier, std::int32_t output_activation_min,
-    std::int32_t output_activation_max, const RuntimeShape& output_shape) {
-#define TOCO_HANDLE_CASE(AC_TYPE)                                            \
-  if (AC_TYPE == Ac) {                                                       \
-    TestOneDepthwiseConv<AC_TYPE>(                                           \
-        input_data, input_shape, input_offset, filter_data, filter_shape,    \
-        filter_offset, bias_data, bias_shape, stride, pad_width, pad_height, \
-        depth_multiplier, output_offset, output_multiplier,                  \
-        output_activation_min, output_activation_max, output_shape);         \
-    return;                                                                  \
-  }
-  TOCO_HANDLE_CASE(FusedActivationFunctionType::kNone)
-  TOCO_HANDLE_CASE(FusedActivationFunctionType::kRelu)
-  TOCO_HANDLE_CASE(FusedActivationFunctionType::kRelu1)
-  TOCO_HANDLE_CASE(FusedActivationFunctionType::kRelu6)
-#undef TOCO_HANDLE_CASE
-}
-
-bool TryTestDepthwiseConv(int batch, int input_depth, int input_width,
-                          int input_height, int filter_width, int filter_height,
-                          int depth_multiplier, int stride,
-                          int dilation_width_factor, int dilation_height_factor,
-                          PaddingType padding_type) {
-  const int output_depth = input_depth * depth_multiplier;
-  // The optimized DepthwiseConv implementation currently uses a fixed-size
-  // accumulator buffer on the stack, with that size. This currently means
-  // that it does not support larger output depths. It CHECK's for it,
-  // so it's safe in the sense that if a larger output depth was encountered,
-  // it would explicitly fail. We just need to adjust our testing to that
-  // constraint.
-  const int kMaxSupportedOutputDepth = 1024;
-  if (output_depth > kMaxSupportedOutputDepth) {
-    return false;
-  }
-  const auto ac = RandomElement(std::vector<FusedActivationFunctionType>(
-      {FusedActivationFunctionType::kNone, FusedActivationFunctionType::kRelu,
-       FusedActivationFunctionType::kRelu6,
-       FusedActivationFunctionType::kRelu1}));
-  int output_activation_min = 0;
-  int output_activation_max = 255;
-  if (ac != FusedActivationFunctionType::kNone && UniformRandomInt(0, 1)) {
-    output_activation_min = UniformRandomInt(0, 50);
-    output_activation_max = UniformRandomInt(200, 255);
-  }
-  const std::int32_t output_multiplier =
-      UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max());
-  const std::int32_t input_offset = UniformRandomInt(-256, 0);
-  const std::int32_t filter_offset = UniformRandomInt(-256, 0);
-  const std::int32_t output_offset = UniformRandomInt(-256, 0);
-  RuntimeShape input_shape_inference(
-      {batch, input_height, input_width, input_depth});
-  RuntimeShape output_shape_inference;
-  int pad_width, pad_height;
-  if (!ComputeConvSizes(input_shape_inference, output_depth, filter_width,
-                        filter_height, stride, dilation_width_factor,
-                        dilation_height_factor, padding_type,
-                        &output_shape_inference, &pad_width, &pad_height)) {
-    return false;
-  }
-  RuntimeShape filter_shape_inference(
-      {1, filter_height, filter_width, output_depth});
-  RuntimeShape bias_shape_inference({1, 1, 1, output_depth});
-  const int input_buffer_size = input_shape_inference.FlatSize();
-  const int filter_buffer_size = filter_shape_inference.FlatSize();
-  std::vector<std::uint8_t> input_data(input_buffer_size);
-  std::vector<std::uint8_t> filter_data(filter_buffer_size);
-  std::vector<std::int32_t> bias_data(output_depth);
-  FillRandom(&input_data);
-  FillRandom(&filter_data);
-  FillRandom(&bias_data, -10000, 10000);
-  TestOneDepthwiseConv(ac, input_data.data(), input_shape_inference,
-                       input_offset, filter_data.data(), filter_shape_inference,
-                       filter_offset, bias_data.data(), bias_shape_inference,
-                       stride, pad_width, pad_height, depth_multiplier,
-                       output_offset, output_multiplier, output_activation_min,
-                       output_activation_max, output_shape_inference);
-  return true;
-}
-
-// This function picks some random DepthwiseConv params, which may or may not
-// be legal. If they're not legal, it returns false. If they're legal,
-// it runs the DepthwiseConv test and returns true. This allows the caller
-// to loop until a test has been run.
-bool TryTestOneDepthwiseConv() {
-  // We have to pick a lot of positive values, where we are particularly
-  // interested in small values because they are most likely to be special
-  // cases in optimized implementations, and secondarily because they allow
-  // tests to run fast, which means we can run more tests and get more
-  // coverage.
-  const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
-  const int input_depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
-  const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-  const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
-  const int filter_width = ExponentialRandomPositiveInt(0.9f, 4, 10);
-  const int filter_height = ExponentialRandomPositiveInt(0.9f, 4, 10);
-  const int depth_multiplier = ExponentialRandomPositiveInt(0.8f, 6, 50);
-  const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8);
-  const int dilation_width_factor = RandomElement(std::vector<int>({1, 2, 4}));
-  const int dilation_height_factor = RandomElement(std::vector<int>({1, 2, 4}));
-  const auto padding_type =
-      UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
-
-  return TryTestDepthwiseConv(batch, input_depth, input_width, input_height,
-                              filter_width, filter_height, depth_multiplier,
-                              stride, dilation_width_factor,
-                              dilation_height_factor, padding_type);
-}
-
-// Tests parameters for the 3x3 filter kernel.
-bool TryTestOneDepthwiseConv3x3Filter() {
-  const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
-  const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
-  const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-  const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
-  const int filter_width = 3;
-  const int filter_height = 3;
-  const int depth_multiplier = 1;
-  const int stride = UniformRandomInt(1, 2);
-  // We don't support dilations in the 3x3 filter.
-  const int dilation_width_factor = 1;
-  const int dilation_height_factor = 1;
-  // Although the kernel supports only kValid padding, we test that kSame
-  // is using the correct code path.
-  const auto padding_type =
-      UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
-
-  return TryTestDepthwiseConv(batch, input_depth, input_width, input_height,
-                              filter_width, filter_height, depth_multiplier,
-                              stride, dilation_width_factor,
-                              dilation_height_factor, padding_type);
-}
-
-void TestOneDepthwiseConv() {
-  while (!TryTestOneDepthwiseConv()) {
-  }
-}
-
-void TestOneDepthwiseConv3x3Filter() {
-  while (!TryTestOneDepthwiseConv3x3Filter()) {
-  }
-}
-
-TEST(TestDepthwiseConv, TestDepthwiseConv) {
-  const int kTestsToRun = 10 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv();
-  }
-}
-
-TEST(TestDepthwiseConv3x3Filter, TestDepthwiseConv) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter();
-  }
-}
-
-}  // namespace
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
deleted file mode 100644
index 56e93678781503a123e10296b3a8e9f7aee40ffc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
+++ /dev/null
@@ -1,771 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
-
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-
-namespace tflite {
-namespace kernel_utils {
-
-void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
-                  const float* recurrent_weights_ptr, const float* bias_ptr,
-                  int input_size, int num_units, int batch_size,
-                  TfLiteFusedActivation activation,
-                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
-  RnnBatchStep(input_ptr_batch, input_weights_ptr,
-               /*aux_input_ptr_batch=*/nullptr,
-               /*aux_input_weights_ptr=*/nullptr, recurrent_weights_ptr,
-               bias_ptr, input_size, /*aux_input_size=*/0, num_units,
-               batch_size, activation, hidden_state_ptr_batch,
-               output_ptr_batch);
-}
-
-void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
-                  const float* aux_input_ptr_batch,
-                  const float* aux_input_weights_ptr,
-                  const float* recurrent_weights_ptr, const float* bias_ptr,
-                  int input_size, int aux_input_size, int num_units,
-                  int batch_size, TfLiteFusedActivation activation,
-                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
-  // Output = bias
-  tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
-                                        output_ptr_batch);
-  // Output += input * input_weights
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
-      output_ptr_batch, /*result_stride=*/1);
-  // Output += aux_input * aux_input_weights (if they are not empty).
-  if (aux_input_size > 0) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        aux_input_weights_ptr, num_units, aux_input_size, aux_input_ptr_batch,
-        batch_size, output_ptr_batch, /*result_stride=*/1);
-  }
-  // Output += recurrent_weights * hidden_state
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
-      batch_size, output_ptr_batch, /*result_stride=*/1);
-  // Output = activation(Output) and update hidden_state
-  tensor_utils::ApplyActivationToVector(
-      output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
-  tensor_utils::VectorBatchVectorAssign(output_ptr_batch, num_units, batch_size,
-                                        hidden_state_ptr_batch);
-}
-
-void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
-                  float input_weights_scale,
-                  const int8_t* recurrent_weights_ptr,
-                  float recurrent_weights_scale, const float* bias_ptr,
-                  int input_size, int num_units, int batch_size,
-                  TfLiteFusedActivation activation,
-                  int8_t* quantized_input_ptr_batch,
-                  int8_t* quantized_hidden_state_ptr_batch,
-                  float* scaling_factors, float* hidden_state_ptr_batch,
-                  float* output_ptr_batch) {
-  RnnBatchStep(input_ptr_batch, input_weights_ptr, input_weights_scale,
-               /*aux_input_ptr_batch=*/nullptr,
-               /*aux_input_weights_ptr=*/nullptr,
-               /*aux_input_weights_scale=*/0.0f, recurrent_weights_ptr,
-               recurrent_weights_scale, bias_ptr, input_size,
-               /*aux_input_size=*/0, num_units, batch_size, activation,
-               quantized_input_ptr_batch,
-               /*aux_quantized_input_ptr_batch=*/nullptr,
-               quantized_hidden_state_ptr_batch, scaling_factors,
-               hidden_state_ptr_batch, output_ptr_batch);
-}
-
-void RnnBatchStep(
-    const float* input_ptr_batch, const int8_t* input_weights_ptr,
-    float input_weights_scale, const float* aux_input_ptr_batch,
-    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
-    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
-    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
-    int batch_size, TfLiteFusedActivation activation,
-    int8_t* quantized_input_ptr_batch, int8_t* aux_quantized_input_ptr_batch,
-    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
-    float* hidden_state_ptr_batch, float* output_ptr_batch) {
-  // Output = bias
-  tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
-                                        output_ptr_batch);
-
-  // Save quantization and matmul computation for all zero input.
-  if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
-    // Quantize input from float to uint8 + quantization params (scaling
-    // factor).
-    float unused_min, unused_max;
-    // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function)
-    // whichever is faster.
-    for (int b = 0; b < batch_size; ++b) {
-      const int offset = b * input_size;
-      tensor_utils::SymmetricQuantizeFloats(
-          input_ptr_batch + offset, input_size,
-          quantized_input_ptr_batch + offset, &unused_min, &unused_max,
-          &scaling_factors[b]);
-      scaling_factors[b] *= input_weights_scale;
-    }
-
-    // Output += input * input_weights
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_weights_ptr, num_units, input_size, quantized_input_ptr_batch,
-        scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1);
-  }
-
-  if (aux_input_ptr_batch &&
-      !tensor_utils::IsZeroVector(aux_input_ptr_batch,
-                                  batch_size * aux_input_size)) {
-    float unused_min, unused_max;
-    for (int b = 0; b < batch_size; ++b) {
-      const int offset = b * aux_input_size;
-      tensor_utils::SymmetricQuantizeFloats(
-          aux_input_ptr_batch + offset, aux_input_size,
-          aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
-          &scaling_factors[b]);
-      scaling_factors[b] *= aux_input_weights_scale;
-    }
-
-    // Output += aux_input * aux_input_weights
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        aux_input_weights_ptr, num_units, aux_input_size,
-        aux_quantized_input_ptr_batch, scaling_factors, batch_size,
-        output_ptr_batch, /*result_stride=*/1);
-  }
-
-  // Save quantization and matmul computation for all zero input.
-  if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
-                                  batch_size * num_units)) {
-    // Quantize hidden_state
-    float unused_min, unused_max;
-    for (int b = 0; b < batch_size; ++b) {
-      const int offset = b * num_units;
-      tensor_utils::SymmetricQuantizeFloats(
-          hidden_state_ptr_batch + offset, num_units,
-          quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
-          &scaling_factors[b]);
-      scaling_factors[b] *= recurrent_weights_scale;
-    }
-
-    // Output += recurrent_weights * hidden_state
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_weights_ptr, num_units, num_units,
-        quantized_hidden_state_ptr_batch, scaling_factors, batch_size,
-        output_ptr_batch, /*result_stride=*/1);
-  }
-
-  // Output = activation(Output) and update hidden_state
-  tensor_utils::ApplyActivationToVector(
-      output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
-  tensor_utils::VectorBatchVectorAssign(output_ptr_batch, num_units, batch_size,
-                                        hidden_state_ptr_batch);
-}
-
-void LstmStep(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
-    int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr,
-    float* cell_state_ptr, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* output_ptr_batch) {
-  LstmStepWithAuxInput(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr,
-      input_to_cell_weights_ptr, input_to_output_weights_ptr,
-      /*aux_input_ptr_batch=*/nullptr,
-      /*aux_input_to_input_weights_ptr=*/nullptr,
-      /*aux_input_to_forget_weights_ptr=*/nullptr,
-      /*aux_input_to_cell_weights_ptr=*/nullptr,
-      /*aux_input_to_output_weights_ptr=*/nullptr,
-      recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr,
-      recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr,
-      cell_to_input_weights_ptr, cell_to_forget_weights_ptr,
-      cell_to_output_weights_ptr, input_gate_bias_ptr, forget_gate_bias_ptr,
-      cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-      projection_bias_ptr, params, n_batch, n_cell, n_input, /*n_aux_input=*/0,
-      n_output, output_state_ptr, cell_state_ptr, input_gate_scratch,
-      forget_gate_scratch, cell_scratch, output_gate_scratch, output_ptr_batch);
-}
-
-void LstmStepWithAuxInput(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr, const float* aux_input_ptr_batch,
-    const float* aux_input_to_input_weights_ptr,
-    const float* aux_input_to_forget_weights_ptr,
-    const float* aux_input_to_cell_weights_ptr,
-    const float* aux_input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
-    int n_batch, int n_cell, int n_input, int n_aux_input, int n_output,
-    float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* output_ptr_batch) {
-  // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
-  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
-  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-  // Initialize scratch buffers with bias.
-  if (!use_cifg) {
-    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
-                                          input_gate_scratch);
-  }
-  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
-                                        forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
-                                        cell_scratch);
-  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
-                                        output_gate_scratch);
-
-  // For each batch and cell: compute input_weight * input.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-        input_gate_scratch, /*result_stride=*/1);
-  }
-
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      forget_gate_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      output_gate_scratch, /*result_stride=*/1);
-
-  // If auxiliary input is available then compute aux_input_weight * aux_input
-  if (aux_input_ptr_batch != nullptr) {
-    if (!use_cifg) {
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          aux_input_to_input_weights_ptr, n_cell, n_aux_input,
-          aux_input_ptr_batch, n_batch, input_gate_scratch,
-          /*result_stride=*/1);
-    }
-
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        aux_input_to_forget_weights_ptr, n_cell, n_aux_input,
-        aux_input_ptr_batch, n_batch, forget_gate_scratch, /*result_stride=*/1);
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        aux_input_to_cell_weights_ptr, n_cell, n_aux_input, aux_input_ptr_batch,
-        n_batch, cell_scratch, /*result_stride=*/1);
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        aux_input_to_output_weights_ptr, n_cell, n_aux_input,
-        aux_input_ptr_batch, n_batch, output_gate_scratch, /*result_stride=*/1);
-  }
-
-  // For each batch and cell: compute recurrent_weight * output_state.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr,
-        n_batch, input_gate_scratch, /*result_stride=*/1);
-  }
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, forget_gate_scratch,
-      /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, output_gate_scratch,
-      /*result_stride=*/1);
-
-  // For each batch and cell: update input gate.
-  if (!use_cifg) {
-    if (use_peephole) {
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch,
-          input_gate_scratch);
-    }
-    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                       input_gate_scratch);
-  }
-
-  // For each batch and cell: update forget gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch,
-        forget_gate_scratch);
-  }
-  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                     forget_gate_scratch);
-
-  // For each batch and cell: update the cell.
-  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
-                                         n_batch * n_cell, cell_state_ptr);
-  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                        params->activation, cell_scratch);
-  if (use_cifg) {
-    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                             forget_gate_scratch);
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  } else {
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  }
-  if (params->cell_clip > 0.0) {
-    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
-                             params->cell_clip, cell_state_ptr);
-  }
-
-  // For each batch and cell: update the output gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch,
-        output_gate_scratch);
-  }
-  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                     output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        params->activation, cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-
-  // For each batch: update the projection and output_state.
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_ptr_batch);
-    } else {
-      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch,
-        output_ptr_batch, /*result_stride=*/1);
-    if (params->proj_clip > 0.0) {
-      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
-                               params->proj_clip, output_ptr_batch);
-    }
-  } else {
-    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                             output_ptr_batch);
-  }
-  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
-                           output_state_ptr);
-}
-
-void LstmStep(
-    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-    float input_to_input_weights_scale,
-    const int8_t* input_to_forget_weights_ptr,
-    float input_to_forget_weights_scale,
-    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
-    const int8_t* input_to_output_weights_ptr,
-    float input_to_output_weights_scale,
-    const int8_t* recurrent_to_input_weights_ptr,
-    float recurrent_to_input_weights_scale,
-    const int8_t* recurrent_to_forget_weights_ptr,
-    float recurrent_to_forget_weights_scale,
-    const int8_t* recurrent_to_cell_weights_ptr,
-    float recurrent_to_cell_weights_scale,
-    const int8_t* recurrent_to_output_weights_ptr,
-    float recurrent_to_output_weights_scale,
-    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
-    const int8_t* cell_to_forget_weights_ptr,
-    float cell_to_forget_weights_scale,
-    const int8_t* cell_to_output_weights_ptr,
-    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-    float projection_weights_scale, const float* projection_bias_ptr,
-    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
-    int n_output, float* input_gate_scratch, float* forget_gate_scratch,
-    float* cell_scratch, float* output_gate_scratch, float* scaling_factors,
-    float* product_scaling_factors, float* recovered_cell_weights,
-    int8_t* quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
-    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
-    float* cell_state_ptr, float* output_ptr_batch) {
-  LstmStepWithAuxInput(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_input_weights_scale,
-      input_to_forget_weights_ptr, input_to_forget_weights_scale,
-      input_to_cell_weights_ptr, input_to_cell_weights_scale,
-      input_to_output_weights_ptr, input_to_output_weights_scale,
-      /*aux_input_ptr_batch=*/nullptr,
-      /*aux_input_to_input_weights_ptr=*/nullptr,
-      /*aux_input_to_input_weights_scale=*/0.0f,
-      /*aux_input_to_forget_weights_ptr=*/nullptr,
-      /*aux_input_to_forget_weights_scale=*/0.0f,
-      /*aux_input_to_cell_weights_ptr=*/nullptr,
-      /*aux_input_to_cell_weights_scale=*/0.0f,
-      /*aux_input_to_output_weights_ptr=*/nullptr,
-      /*aux_input_to_output_weights_scale=*/0.0f,
-      recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale,
-      recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale,
-      recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale,
-      recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale,
-      cell_to_input_weights_ptr, cell_to_input_weights_scale,
-      cell_to_forget_weights_ptr, cell_to_forget_weights_scale,
-      cell_to_output_weights_ptr, cell_to_output_weights_scale,
-      input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
-      output_gate_bias_ptr, projection_weights_ptr, projection_weights_scale,
-      projection_bias_ptr, params, n_batch, n_cell, n_input,
-      /*n_aux_input=*/0, n_output, input_gate_scratch, forget_gate_scratch,
-      cell_scratch, output_gate_scratch, scaling_factors,
-      product_scaling_factors, recovered_cell_weights,
-      quantized_input_ptr_batch,
-      /*quantized_aux_input_ptr_batch=*/nullptr, quantized_output_state_ptr,
-      quantized_cell_state_ptr, output_state_ptr, cell_state_ptr,
-      output_ptr_batch);
-    }
-
-    void LstmStepWithAuxInput(
-        const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-        float input_to_input_weights_scale,
-        const int8_t* input_to_forget_weights_ptr,
-        float input_to_forget_weights_scale,
-        const int8_t* input_to_cell_weights_ptr,
-        float input_to_cell_weights_scale,
-        const int8_t* input_to_output_weights_ptr,
-        float input_to_output_weights_scale, const float* aux_input_ptr_batch,
-        const int8_t* aux_input_to_input_weights_ptr,
-        float aux_input_to_input_weights_scale,
-        const int8_t* aux_input_to_forget_weights_ptr,
-        float aux_input_to_forget_weights_scale,
-        const int8_t* aux_input_to_cell_weights_ptr,
-        float aux_input_to_cell_weights_scale,
-        const int8_t* aux_input_to_output_weights_ptr,
-        float aux_input_to_output_weights_scale,
-        const int8_t* recurrent_to_input_weights_ptr,
-        float recurrent_to_input_weights_scale,
-        const int8_t* recurrent_to_forget_weights_ptr,
-        float recurrent_to_forget_weights_scale,
-        const int8_t* recurrent_to_cell_weights_ptr,
-        float recurrent_to_cell_weights_scale,
-        const int8_t* recurrent_to_output_weights_ptr,
-        float recurrent_to_output_weights_scale,
-        const int8_t* cell_to_input_weights_ptr,
-        float cell_to_input_weights_scale,
-        const int8_t* cell_to_forget_weights_ptr,
-        float cell_to_forget_weights_scale,
-        const int8_t* cell_to_output_weights_ptr,
-        float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
-        const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-        const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-        float projection_weights_scale, const float* projection_bias_ptr,
-        const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
-        int n_aux_input, int n_output, float* input_gate_scratch,
-        float* forget_gate_scratch, float* cell_scratch,
-        float* output_gate_scratch, float* scaling_factors,
-        float* product_scaling_factors, float* recovered_cell_weights,
-        int8_t* quantized_input_ptr_batch,
-        int8_t* quantized_aux_input_ptr_batch,
-        int8_t* quantized_output_state_ptr, int8_t* quantized_cell_state_ptr,
-        float* output_state_ptr, float* cell_state_ptr,
-        float* output_ptr_batch) {
-      // Since we have already checked that weights are all there or none, we
-      // can check the existense of only one to the get the condition.
-      const bool use_cifg = (input_to_input_weights_ptr == nullptr);
-      const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-      // Initialize scratch buffers with bias.
-      if (!use_cifg) {
-        tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell,
-                                              n_batch, input_gate_scratch);
-      }
-      tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell,
-                                            n_batch, forget_gate_scratch);
-      tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
-                                            cell_scratch);
-      tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell,
-                                            n_batch, output_gate_scratch);
-
-      if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
-        // Save quantization and matmul computation for all zero input.
-        float unused_min, unused_max;
-        for (int b = 0; b < n_batch; ++b) {
-          const int offset = b * n_input;
-          tensor_utils::SymmetricQuantizeFloats(
-              input_ptr_batch + offset, n_input,
-              quantized_input_ptr_batch + offset, &unused_min, &unused_max,
-              &scaling_factors[b]);
-        }
-        // For each batch and cell: compute input_weight * input.
-        if (!use_cifg) {
-          for (int b = 0; b < n_batch; ++b) {
-            product_scaling_factors[b] =
-                scaling_factors[b] * input_to_input_weights_scale;
-          }
-          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-              input_to_input_weights_ptr, n_cell, n_input,
-              quantized_input_ptr_batch, product_scaling_factors, n_batch,
-              input_gate_scratch, /*result_stride=*/1);
-        }
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * input_to_forget_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            input_to_forget_weights_ptr, n_cell, n_input,
-            quantized_input_ptr_batch, product_scaling_factors, n_batch,
-            forget_gate_scratch,
-            /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * input_to_cell_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            input_to_cell_weights_ptr, n_cell, n_input,
-            quantized_input_ptr_batch, product_scaling_factors, n_batch,
-            cell_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * input_to_output_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            input_to_output_weights_ptr, n_cell, n_input,
-            quantized_input_ptr_batch, product_scaling_factors, n_batch,
-            output_gate_scratch,
-            /*result_stride=*/1);
-      }
-
-      if (aux_input_ptr_batch != nullptr &&
-          !tensor_utils::IsZeroVector(aux_input_ptr_batch, n_batch * n_input)) {
-        // Save quantization and matmul computation for all zero input.
-        float unused_min, unused_max;
-        for (int b = 0; b < n_batch; ++b) {
-          const int offset = b * n_input;
-          tensor_utils::SymmetricQuantizeFloats(
-              aux_input_ptr_batch + offset, n_input,
-              quantized_aux_input_ptr_batch + offset, &unused_min, &unused_max,
-              &scaling_factors[b]);
-        }
-        // For each batch and cell: compute input_weight * input.
-        if (!use_cifg) {
-          for (int b = 0; b < n_batch; ++b) {
-            product_scaling_factors[b] =
-                scaling_factors[b] * aux_input_to_input_weights_scale;
-          }
-          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-              aux_input_to_input_weights_ptr, n_cell, n_input,
-              quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
-              input_gate_scratch, /*result_stride=*/1);
-        }
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * aux_input_to_forget_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            aux_input_to_forget_weights_ptr, n_cell, n_input,
-            quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
-            forget_gate_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * aux_input_to_cell_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            aux_input_to_cell_weights_ptr, n_cell, n_input,
-            quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
-            cell_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * aux_input_to_output_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            aux_input_to_output_weights_ptr, n_cell, n_input,
-            quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
-            output_gate_scratch, /*result_stride=*/1);
-      }
-
-      if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
-        // Save quantization and matmul computation for all zero input.
-        float unused_min, unused_max;
-        for (int b = 0; b < n_batch; ++b) {
-          const int offset = b * n_output;
-          tensor_utils::SymmetricQuantizeFloats(
-              output_state_ptr + offset, n_output,
-              quantized_output_state_ptr + offset, &unused_min, &unused_max,
-              &scaling_factors[b]);
-        }
-        // For each batch and cell: compute recurrent_weight * output_state.
-        if (!use_cifg) {
-          for (int b = 0; b < n_batch; ++b) {
-            product_scaling_factors[b] =
-                scaling_factors[b] * recurrent_to_input_weights_scale;
-          }
-          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-              recurrent_to_input_weights_ptr, n_cell, n_output,
-              quantized_output_state_ptr, product_scaling_factors, n_batch,
-              input_gate_scratch, /*result_stride=*/1);
-        }
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * recurrent_to_forget_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            recurrent_to_forget_weights_ptr, n_cell, n_output,
-            quantized_output_state_ptr, product_scaling_factors, n_batch,
-            forget_gate_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * recurrent_to_cell_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            recurrent_to_cell_weights_ptr, n_cell, n_output,
-            quantized_output_state_ptr, product_scaling_factors, n_batch,
-            cell_scratch, /*result_stride=*/1);
-
-        for (int b = 0; b < n_batch; ++b) {
-          product_scaling_factors[b] =
-              scaling_factors[b] * recurrent_to_output_weights_scale;
-        }
-        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-            recurrent_to_output_weights_ptr, n_cell, n_output,
-            quantized_output_state_ptr, product_scaling_factors, n_batch,
-            output_gate_scratch, /*result_stride=*/1);
-      }
-
-      // Save quantization and matmul computation for all zero input.
-      bool is_cell_state_all_zeros =
-          tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
-
-      // For each batch and cell: update input gate.
-      if (!use_cifg) {
-        if (use_peephole && !is_cell_state_all_zeros) {
-          tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell,
-                                             cell_to_input_weights_scale,
-                                             recovered_cell_weights);
-          tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-              recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
-              input_gate_scratch);
-        }
-        tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                           input_gate_scratch);
-      }
-
-      // For each batch and cell: update forget gate.
-      if (use_peephole && !is_cell_state_all_zeros) {
-        tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell,
-                                           cell_to_forget_weights_scale,
-                                           recovered_cell_weights);
-        tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-            recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
-            forget_gate_scratch);
-      }
-      tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                         forget_gate_scratch);
-
-      // For each batch and cell: update the cell.
-      tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch,
-                                             cell_state_ptr, n_batch * n_cell,
-                                             cell_state_ptr);
-      tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                            params->activation, cell_scratch);
-      if (use_cifg) {
-        tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                                 forget_gate_scratch);
-        tensor_utils::VectorVectorCwiseProductAccumulate(
-            cell_scratch, forget_gate_scratch, n_batch * n_cell,
-            cell_state_ptr);
-      } else {
-        tensor_utils::VectorVectorCwiseProductAccumulate(
-            cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
-      }
-      if (params->cell_clip > 0.0) {
-        tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
-                                 params->cell_clip, cell_state_ptr);
-      }
-
-      is_cell_state_all_zeros =
-          tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
-      // For each batch and cell: update the output gate.
-      if (use_peephole && !is_cell_state_all_zeros) {
-        tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell,
-                                           cell_to_output_weights_scale,
-                                           recovered_cell_weights);
-        tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-            recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
-            output_gate_scratch);
-      }
-      tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                         output_gate_scratch);
-      tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                            params->activation, cell_scratch);
-      tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                             n_batch * n_cell,
-                                             output_gate_scratch);
-
-      // For each batch: update the projection and output_state.
-      const bool use_projection_weight = (projection_weights_ptr != nullptr);
-      const bool use_projection_bias = (projection_bias_ptr != nullptr);
-      if (use_projection_weight) {
-        if (use_projection_bias) {
-          tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                                n_batch, output_ptr_batch);
-        } else {
-          tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
-        }
-        if (!tensor_utils::IsZeroVector(output_gate_scratch,
-                                        n_batch * n_cell)) {
-          // Save quantization and matmul computation for all zero input.
-          float unused_min, unused_max;
-          for (int b = 0; b < n_batch; ++b) {
-            const int offset = b * n_cell;
-            tensor_utils::SymmetricQuantizeFloats(
-                output_gate_scratch + offset, n_cell,
-                quantized_cell_state_ptr + offset, &unused_min, &unused_max,
-                &scaling_factors[b]);
-          }
-          for (int b = 0; b < n_batch; ++b) {
-            product_scaling_factors[b] =
-                scaling_factors[b] * projection_weights_scale;
-          }
-          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-              projection_weights_ptr, n_output, n_cell,
-              quantized_cell_state_ptr, product_scaling_factors, n_batch,
-              output_ptr_batch,
-              /*result_stride=*/1);
-        }
-        if (params->proj_clip > 0.0) {
-          tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
-                                   params->proj_clip, output_ptr_batch);
-        }
-      } else {
-        tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                                 output_ptr_batch);
-      }
-      tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
-                               output_state_ptr);
-    }
-
-}  // namespace kernel_utils
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
deleted file mode 100644
index b5558cce55ad4b116903ff30da446a33d631d5b5..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-
-namespace tflite {
-namespace kernel_utils {
-
-// Performs an RNN batch inference step for inputs specified by input_ptr_batch.
-// The RNN cell is specified by the pointers to its input and recurrent weights,
-// and biases, along with the input size, number of units, activation.
-//
-// The pointers to the hidden state and the output are updated as a result.
-//
-// The pointers with the suffix "_batch" point to data aligned in batch_major
-// order, and each step processes batch_size many inputs from input_ptr_batch,
-// and updates batch_size many outputs and hidden states.
-void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
-                  const float* recurrent_weights_ptr, const float* bias_ptr,
-                  int input_size, int num_units, int batch_size,
-                  TfLiteFusedActivation activation,
-                  float* hidden_state_ptr_batch, float* output_ptr_batch);
-
-// Same as above but includes an auxiliary input with the corresponding weights.
-void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
-                  const float* aux_input_ptr_batch,
-                  const float* aux_input_weights_ptr,
-                  const float* recurrent_weights_ptr, const float* bias_ptr,
-                  int input_size, int aux_input_size, int num_units,
-                  int batch_size, TfLiteFusedActivation activation,
-                  float* hidden_state_ptr_batch, float* output_ptr_batch);
-
-// Performs a quantized RNN batch inference step. Same as above, but for
-// quantization purposes, we also pass in quantized_hidden_state_ptr_batch and
-// quantized_input_ptr_batch pointers for temporary storage of the quantized
-// values of hidden_state_ptr_batch and input_ptr_batch, respectively.
-// These temporary storages are expected to be preallocated to the same size as
-// the respective pointers.
-// An additional preallocated temporary storage 'scaling_factors' (of size
-// batch_size) is used to store the scaling factors of the quantization (used
-// for recovery).
-// {input,recurrent}_weights_scale params are used for dequantization/recovery.
-void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
-                  float input_weights_scale,
-                  const int8_t* recurrent_weights_ptr,
-                  float recurrent_weights_scale, const float* bias_ptr,
-                  int input_size, int num_units, int batch_size,
-                  TfLiteFusedActivation activation,
-                  int8_t* quantized_input_ptr_batch,
-                  int8_t* quantized_hidden_state_ptr_batch,
-                  float* scaling_factors, float* hidden_state_ptr_batch,
-                  float* output_ptr_batch);
-
-void RnnBatchStep(
-    const float* input_ptr_batch, const int8_t* input_weights_ptr,
-    float input_weights_scale, const float* aux_input_ptr_batch,
-    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
-    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
-    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
-    int batch_size, TfLiteFusedActivation activation,
-    int8_t* quantized_input_ptr_batch, int8_t* aux_quantized_input_ptr_batch,
-    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
-    float* hidden_state_ptr_batch, float* output_ptr_batch);
-
-// Performs an LSTM batch inference step for input specified by input_ptr_batch.
-// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
-// biases (*_bias_ptr), and buffers (*_scratch), along with additional
-// parameters:
-//  - params: various LSTM params including activation, clipping, etc.,
-//  - n_batch: size of batch,
-//  - n_cell: number of cells (or units),
-//  - n_input: the input size,
-//  - n_output: the output size.
-//
-// The pointers to the cell and output state and the output are updated.
-//
-// The pointers with the suffix "_batch" point to data aligned in batch_major
-// order, and each step processes batch_size many inputs from input_ptr_batch,
-// and updates batch_size many cell and output states.
-void LstmStep(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
-    int n_batch, int n_cell, int n_input, int n_output, float* output_state_ptr,
-    float* cell_state_ptr, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* output_ptr_batch);
-
-// Same as above but includes an auxiliary input with the corresponding weights.
-void LstmStepWithAuxInput(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr, const float* aux_input_ptr_batch,
-    const float* aux_input_to_input_weights_ptr,
-    const float* aux_input_to_forget_weights_ptr,
-    const float* aux_input_to_cell_weights_ptr,
-    const float* aux_input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
-    int n_batch, int n_cell, int n_input, int n_aux_input, int n_output,
-    float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* output_ptr_batch);
-
-// Same as above but with quantized weight matrices. In detail:
-// Input of size 'n_batch * n_input':
-//   input_ptr_batch
-//
-// LSTM weights:
-// Quantized input weights of size 'n_cell * n_input':
-//   input_to_input_weights            - optional (can be nullptr)
-//   input_to_forget_weights
-//   input_to_cell_weights
-//   input_to_input_weights
-// Quantized recurrent weights of size 'n_cell * n_output':
-//   recurrent_to_input_weights        - optional
-//   recurrent_to_forget_weights
-//   recurrent_to_cell_weights
-//   recurrent_to_input_weights
-// Quantized peephole weights of size 'n_cell', representing diagonal matrices.
-//   cell_to_input_weights             - optional
-//   cell_to_cell_weights              - optional
-//   cell_to_output_weights            - optional
-// Quantized projection weights of size 'n_output * n_cell'
-//   projection_weights_ptr            - optional
-// Weight scales (scalars) for each of the weights above.
-//   input_to_input_weights_scale      - optional
-//   input_to_forget_weights_scale
-//   input_to_cell_weights_scale
-//   input_to_output_weights_scale
-//   recurrent_to_input_weights_scale  - optional
-//   recurrent_to_forget_weights_scale
-//   recurrent_to_cell_weights_scale
-//   recurrent_to_output_weights_scale
-//   cell_to_input_weights_scale,
-//   cell_to_forget_weights_scale,
-//   cell_to_output_weights_scale,
-//   projection_weights_scale          - optional
-// Gate biases of size 'n_cell':
-//   input_gate_bias_ptr               - optional
-//   forget_gate_bias_ptr
-//   cell_gate_bias_ptr
-//   output_gate_bias_ptr
-//
-// Temporary pre-allocated storage for quantized values:
-//   quantized_input_ptr_batch (same size as input_ptr_batch)
-//   quantized_output_state_ptr (same size as output_state_ptr)
-//   quantized_cell_state_ptr (same size as cell_state_ptr)
-// Temporary pre-allocated storage for recovered values:
-//   recovered_cell_weights (same size as cell_to_*_weights)
-//
-// Outputs:
-//   output_state_ptr - size 'n_batch * n_output'
-//   cell_state_ptr   - size 'n_batch * n_cell'
-//   output_ptr_batch - size 'n_batch * n_output'
-void LstmStep(
-    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-    float input_to_input_weights_scale,
-    const int8_t* input_to_forget_weights_ptr,
-    float input_to_forget_weights_scale,
-    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
-    const int8_t* input_to_output_weights_ptr,
-    float input_to_output_weights_scale,
-    const int8_t* recurrent_to_input_weights_ptr,
-    float recurrent_to_input_weights_scale,
-    const int8_t* recurrent_to_forget_weights_ptr,
-    float recurrent_to_forget_weights_scale,
-    const int8_t* recurrent_to_cell_weights_ptr,
-    float recurrent_to_cell_weights_scale,
-    const int8_t* recurrent_to_output_weights_ptr,
-    float recurrent_to_output_weights_scale,
-    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
-    const int8_t* cell_to_forget_weights_ptr,
-    float cell_to_forget_weights_scale,
-    const int8_t* cell_to_output_weights_ptr,
-    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-    float projection_weights_scale, const float* projection_bias_ptr,
-    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
-    int n_output, float* input_gate_scratch, float* forget_gate_scratch,
-    float* cell_scratch, float* output_gate_scratch, float* scaling_factors,
-    float* product_scaling_factors, float* recovered_cell_weights,
-    int8_t* quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
-    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
-    float* cell_state_ptr, float* output_ptr_batch);
-
-void LstmStepWithAuxInput(
-    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-    float input_to_input_weights_scale,
-    const int8_t* input_to_forget_weights_ptr,
-    float input_to_forget_weights_scale,
-    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
-    const int8_t* input_to_output_weights_ptr,
-    float input_to_output_weights_scale, const float* aux_input_ptr_batch,
-    const int8_t* aux_input_to_input_weights_ptr,
-    float aux_input_to_input_weights_scale,
-    const int8_t* aux_input_to_forget_weights_ptr,
-    float aux_input_to_forget_weights_scale,
-    const int8_t* aux_input_to_cell_weights_ptr,
-    float aux_input_to_cell_weights_scale,
-    const int8_t* aux_input_to_output_weights_ptr,
-    float aux_input_to_output_weights_scale,
-    const int8_t* recurrent_to_input_weights_ptr,
-    float recurrent_to_input_weights_scale,
-    const int8_t* recurrent_to_forget_weights_ptr,
-    float recurrent_to_forget_weights_scale,
-    const int8_t* recurrent_to_cell_weights_ptr,
-    float recurrent_to_cell_weights_scale,
-    const int8_t* recurrent_to_output_weights_ptr,
-    float recurrent_to_output_weights_scale,
-    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
-    const int8_t* cell_to_forget_weights_ptr,
-    float cell_to_forget_weights_scale,
-    const int8_t* cell_to_output_weights_ptr,
-    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-    float projection_weights_scale, const float* projection_bias_ptr,
-    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
-    int n_aux_input, int n_output, float* input_gate_scratch,
-    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
-    float* scaling_factors, float* product_scaling_factors,
-    float* recovered_cell_weights, int8_t* quantized_input_ptr_batch,
-    int8_t* quantized_aux_input_ptr_batch, int8_t* quantized_output_state_ptr,
-    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
-    float* cell_state_ptr, float* output_ptr_batch);
-
-}  // namespace kernel_utils
-}  // namespace tflite
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc.cc b/tensorflow/contrib/lite/kernels/internal/mfcc.cc
deleted file mode 100644
index eafe0c7afee6fabd5a4a258aa5176e23f5e8d62a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/mfcc.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <math.h>
-
-#include "tensorflow/contrib/lite/kernels/internal/mfcc.h"
-
-namespace tflite {
-namespace internal {
-
-const double kDefaultUpperFrequencyLimit = 4000;
-const double kDefaultLowerFrequencyLimit = 20;
-const double kFilterbankFloor = 1e-12;
-const int kDefaultFilterbankChannelCount = 40;
-const int kDefaultDCTCoefficientCount = 13;
-
-Mfcc::Mfcc()
-    : initialized_(false),
-      lower_frequency_limit_(kDefaultLowerFrequencyLimit),
-      upper_frequency_limit_(kDefaultUpperFrequencyLimit),
-      filterbank_channel_count_(kDefaultFilterbankChannelCount),
-      dct_coefficient_count_(kDefaultDCTCoefficientCount) {}
-
-bool Mfcc::Initialize(int input_length, double input_sample_rate) {
-  bool initialized = mel_filterbank_.Initialize(
-      input_length, input_sample_rate, filterbank_channel_count_,
-      lower_frequency_limit_, upper_frequency_limit_);
-  initialized &=
-      dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_);
-  initialized_ = initialized;
-  return initialized;
-}
-
-void Mfcc::Compute(const std::vector<double>& spectrogram_frame,
-                   std::vector<double>* output) const {
-  if (!initialized_) {
-    // LOG(ERROR) << "Mfcc not initialized.";
-    return;
-  }
-  std::vector<double> working;
-  mel_filterbank_.Compute(spectrogram_frame, &working);
-  for (int i = 0; i < working.size(); ++i) {
-    double val = working[i];
-    if (val < kFilterbankFloor) {
-      val = kFilterbankFloor;
-    }
-    working[i] = log(val);
-  }
-  dct_.Compute(working, output);
-}
-
-}  // namespace internal
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
deleted file mode 100644
index 2d96da65c33bd4d1d132501dfaa49148f2c26484..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
-
-// The Conv implementation based on CBLAS interface. This is only used on iOS
-// for now, utilizing Apple's Accelerate framework.
-
-#if TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
-#include <Accelerate/Accelerate.h>
-#else
-#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h"
-#endif
-
-#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-
-namespace tflite {
-namespace cblas_ops {
-
-inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
-                 const float* input_data, const RuntimeShape& filter_shape,
-                 const float* filter_data, const RuntimeShape& bias_shape,
-                 const float* bias_data, const RuntimeShape& output_shape,
-                 float* output_data, const RuntimeShape& im2col_shape,
-                 float* im2col_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const float output_activation_min = params.float_activation_min;
-  const float output_activation_max = params.float_activation_max;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-  gemmlowp::ScopedProfilingLabel label("Conv/cblas");
-
-  const float* gemm_input_data = nullptr;
-  const RuntimeShape* gemm_input_shape = nullptr;
-  const int filter_width = filter_shape.Dims(2);
-  const int filter_height = filter_shape.Dims(1);
-  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
-                           filter_width != 1 || filter_height != 1;
-  if (need_im2col) {
-    TFLITE_DCHECK(im2col_data);
-    ConvParams op_params;
-    op_params.padding_type = PaddingType::kSame;
-    op_params.padding_values.width = pad_width;
-    op_params.padding_values.height = pad_height;
-    op_params.stride_width = stride_width;
-    op_params.stride_height = stride_height;
-    op_params.dilation_width_factor = dilation_width_factor;
-    op_params.dilation_height_factor = dilation_height_factor;
-    optimized_ops::Im2col(op_params, filter_height, filter_width, 0,
-                          input_shape, input_data, im2col_shape, im2col_data);
-
-    gemm_input_data = im2col_data;
-    gemm_input_shape = &im2col_shape;
-  } else {
-    TFLITE_DCHECK(!im2col_data);
-    gemm_input_data = input_data;
-    gemm_input_shape = &input_shape;
-  }
-
-  // The following code computes matrix multiplication c = a * transponse(b)
-  // with CBLAS, where:
-  // * `a` is a matrix with dimensions (m, k).
-  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
-  // * `c` is a matrix with dimensions (m, n).
-  // The naming of variables are aligned with CBLAS specification here.
-  const float* a = gemm_input_data;
-  const float* b = filter_data;
-  float* c = output_data;
-  const int gemm_input_dims = gemm_input_shape->DimensionsCount();
-  int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
-  int n = output_shape.Dims(3);
-  int k = gemm_input_shape->Dims(gemm_input_dims - 1);
-  // The stride of matrix a, b and c respectively.
-  int stride_a = k;
-  int stride_b = k;
-  int stride_c = n;
-
-  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
-              stride_a, b, stride_b, 0.0f, c, stride_c);
-
-  optimized_ops::AddBiasAndEvalActivationFunction(
-      output_activation_min, output_activation_max, bias_shape, bias_data,
-      output_shape, output_data);
-}
-
-}  // namespace cblas_ops
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h b/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
deleted file mode 100644
index 6acc513805c9398c304f3e24175d3bd6c96938f6..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cblas_reference.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
-
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-
-// The reference implementation for a small subset of CBLAS interface.
-// This is only used for testing CBLAS implementation, and should never be used
-// in production code.
-
-namespace tflite {
-namespace cblas_ops {
-
-// The following code follows the original CBLAS specification, and it might
-// conflict with the TensorFlow naming convention.
-// TODO(ycling): Find another way to test CBLAS with bazel, without writing
-// a reference implementation by ourselves.
-enum CBLAS_ORDER { CblasRowMajor = 0, CblasColMajor = 1 };
-
-enum CBLAS_TRANSPOSE { CblasNoTrans = 0, CblasTrans = 1, CblasConjTrans = 2 };
-
-// A reference implementation for matrix multiplication.
-// The following code computes, c = a * transponse(b) matrix multiplication
-// with CBLAS, where:
-// * `a` is a matrix with dimensions (m, k).
-// * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
-// * `c` is a matrix with dimensions (m, n).
-// The naming of variables is aligned with CBLAS specification here.
-void cblas_sgemm(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE trans_a,
-                 const enum CBLAS_TRANSPOSE trans_b, const int m, const int n,
-                 const int k, const float alpha, const float *a,
-                 const int stride_a, const float *b, const int stride_b,
-                 const float beta, float *c, const int stride_c) {
-  TFLITE_DCHECK(order == CblasRowMajor);
-  TFLITE_DCHECK(trans_a == CblasNoTrans);
-  TFLITE_DCHECK(trans_b == CblasTrans);
-  TFLITE_DCHECK(beta == 0.0f);
-  for (int row = 0; row < m; ++row) {
-    for (int col = 0; col < n; ++col) {
-      // If `beta` non-zero, multiple it with the original values in output.
-      // Otherwise, ignore the original value in output completely.
-      float value = 0.0f;
-      for (int idx = 0; idx < k; ++idx) {
-        value += alpha * a[stride_a * row + idx] * b[stride_b * col + idx];
-      }
-      c[stride_c * row + col] = value;
-    }
-  }
-}
-
-}  // namespace cblas_ops
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
deleted file mode 100644
index d8dd7bba897ab84ef81ef6425108577be5f02735..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ /dev/null
@@ -1,1098 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
-
-#include "public/gemmlowp.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-namespace tflite {
-namespace optimized_ops {
-
-// Implementation of float DepthwiseConv
-
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-struct FloatDepthwiseConvKernel {};
-
-#ifdef USE_NEON
-
-template <>
-struct FloatDepthwiseConvKernel<false, 8, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Load the filters
-    float32x4_t filter[2];
-    for (int i = 0; i < 2; i++) {
-      filter[i] = vld1q_f32(filter_ptr + 4 * i);
-    }
-    int outp = 0;
-    // Handle 2 output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the inputs
-      float32x4_t input[4];
-      for (int i = 0; i < 4; i++) {
-        input[i] = vld1q_f32(input_ptr + 4 * i);
-      }
-      input_ptr += 16;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
-      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
-      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
-      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      float32x4_t input[2];
-      for (int i = 0; i < 2; i++) {
-        input[i] = vld1q_f32(input_ptr + 4 * i);
-      }
-      input_ptr += 8;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<false, 2, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    const float32x2_t filters = vld1_f32(filter_ptr);
-    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
-    int outp = 0;
-    // Handle 8 output pixels at a time.
-    for (; outp <= num_output_pixels - 8; outp += 8) {
-      // Load the inputs
-      float32x4_t input[4];
-      for (int i = 0; i < 4; i++) {
-        input[i] = vld1q_f32(input_ptr + 4 * i);
-      }
-      input_ptr += 16;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle 4 output pixels at a time.
-    for (; outp <= num_output_pixels - 4; outp += 4) {
-      // Load the inputs
-      float32x4_t input[2];
-      for (int i = 0; i < 2; i++) {
-        input[i] = vld1q_f32(input_ptr + 4 * i);
-      }
-      input_ptr += 8;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-    }
-    // Handle 2 output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the inputs
-      const float32x4_t input = vld1q_f32(input_ptr);
-      input_ptr += 4;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
-      // Multiply-accumulate
-      acc = vmlaq_f32(acc, input, filters_dup2);
-      // Store the accumulators back to acc_buffer
-      vst1q_f32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-    }
-    // Handle 1 output pixel at a time
-    for (; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      const float32x2_t input = vld1_f32(input_ptr);
-      input_ptr += 2;
-      // Load the accumulators from acc_buffer
-      float32x2_t acc = vld1_f32(acc_buffer_ptr);
-      // Multiply-accumulate
-      acc = vmla_f32(acc, input, filters);
-      // Store the accumulators back to acc_buffer
-      vst1_f32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 2;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const float* local_filter_ptr = filter_ptr;
-      const float* local_input_ptr = input_ptr;
-      int ic = 0;
-      // Handle 16 input channels at a time.
-      for (; ic <= input_depth - 16; ic += 16) {
-        // Load the filters
-        float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
-        float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
-        float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
-        float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
-        local_filter_ptr += 16;
-        // Load the inputs
-        float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
-        float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
-        float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
-        float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
-        local_input_ptr += 16;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
-        float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
-        float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
-        float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
-        // Multiply-accumulate
-        acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
-        acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
-        acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
-        acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
-        // Store the accumulators back to acc_buffer
-        vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
-        vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
-        vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
-        vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
-        acc_buffer_ptr += 16;
-      }
-      // Handle 4 input channels at a time.
-      for (; ic <= input_depth - 4; ic += 4) {
-        // Load the filters
-        float32x4_t filter;
-        filter = vld1q_f32(local_filter_ptr);
-        local_filter_ptr += 4;
-        // Load the inputs
-        float32x4_t input;
-        input = vld1q_f32(local_input_ptr);
-        local_input_ptr += 4;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc;
-        acc = vld1q_f32(acc_buffer_ptr);
-        // Multiply-accumulate
-        acc = vmlaq_f32(acc, input, filter);
-        // Store the accumulators back to acc_buffer
-        vst1q_f32(acc_buffer_ptr, acc);
-        acc_buffer_ptr += 4;
-      }
-      // Handle one input channel at a time.
-      for (; ic < input_depth; ic++) {
-        const float input_val = *local_input_ptr++;
-        const float filter_val = *local_filter_ptr++;
-        *acc_buffer_ptr++ += filter_val * input_val;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 8> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const float* local_filter_ptr = filter_ptr;
-      const float* local_input_ptr = input_ptr;
-      int ic = 0;
-      // Handle 2 input channels at a time.
-      for (; ic <= input_depth - 2; ic += 2) {
-        // Load the filters
-        float32x4_t filter[4];
-        for (int i = 0; i < 4; i++) {
-          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
-        }
-        local_filter_ptr += 16;
-        // Load the inputs
-        const float32x2_t input = vld1_f32(local_input_ptr);
-        local_input_ptr += 2;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc[4];
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
-        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
-        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
-        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 4; i++) {
-          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 16;
-      }
-      // Handle one input channel at a time.
-      for (; ic < input_depth; ic++) {
-        // Load the filters
-        float32x4_t filter[2];
-        for (int i = 0; i < 2; i++) {
-          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
-        }
-        local_filter_ptr += 8;
-        // Load the inputs
-        const float input_val = *local_input_ptr++;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc[2];
-        for (int i = 0; i < 2; i++) {
-          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        for (int i = 0; i < 2; i++) {
-          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
-        }
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 2; i++) {
-          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 8;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-// Note this implementation is very slow for input_depths < 8
-// (e.g. comparable to reference implementation) see, specializations for
-// input_depth=3 below.
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 2> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const float* local_filter_ptr = filter_ptr;
-      const float* local_input_ptr = input_ptr;
-      int ic = 0;
-      // Handle 8 input channels at a time.
-      for (; ic <= input_depth - 8; ic += 8) {
-        // Load the filters
-        float32x4_t filter[4];
-        for (int i = 0; i < 4; i++) {
-          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
-        }
-        local_filter_ptr += 16;
-        // Load the inputs
-        float32x4x2_t input_dup2[2];
-        for (int i = 0; i < 2; i++) {
-          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
-          input_dup2[i] = vzipq_f32(input, input);
-        }
-        local_input_ptr += 8;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc[4];
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
-        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
-        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
-        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 4; i++) {
-          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 16;
-      }
-      // Handle 4 input channels at a time.
-      for (; ic <= input_depth - 4; ic += 4) {
-        // Load the filters
-        float32x2_t filter[4];
-        for (int i = 0; i < 4; i++) {
-          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
-        }
-        local_filter_ptr += 8;
-        // Load the inputs
-        const float32x4_t input = vld1q_f32(local_input_ptr);
-        local_input_ptr += 4;
-        // Load the accumulators from acc_buffer
-        float32x2_t acc[4];
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
-        }
-        // Multiply-accumulate
-        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
-        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
-        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
-        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 4; i++) {
-          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
-        }
-        acc_buffer_ptr += 8;
-      }
-      // Handle 2 input channels at a time.
-      for (; ic <= input_depth - 2; ic += 2) {
-        // Load the filters
-        const float32x4_t filter = vld1q_f32(local_filter_ptr);
-        local_filter_ptr += 4;
-        // Load the inputs
-        const float32x2_t input = vld1_f32(local_input_ptr);
-        local_input_ptr += 2;
-        // Load the accumulators from acc_buffer
-        float32x2_t acc[2];
-        for (int i = 0; i < 2; i++) {
-          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
-        }
-        // Multiply-accumulate
-        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
-        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 2; i++) {
-          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
-        }
-        acc_buffer_ptr += 4;
-      }
-      // Handle one input channel at a time.
-      for (; ic < input_depth; ic++) {
-        // Load the inputs
-        const float input_val = *local_input_ptr++;
-        // Multiply-accumulate
-        for (int i = 0; i < 2; i++) {
-          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
-        }
-        local_filter_ptr += 2;
-        acc_buffer_ptr += 2;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 3, 2> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Load the filters
-    float32x2_t filter[3];
-    for (int i = 0; i < 3; i++) {
-      filter[i] = vld1_f32(filter_ptr + 2 * i);
-    }
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const float32x2_t input01 = vld1_f32(input_ptr);
-      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
-      // Load the accumulators from acc_buffer
-      float32x2_t acc[3];
-      for (int i = 0; i < 3; i++) {
-        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
-      }
-      // Multiply-accumulate for each input channel there 2 outputs
-      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
-      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
-      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 3; i++) {
-        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
-      }
-      acc_buffer_ptr += 6;
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 3, 4> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Load the filters
-    float32x4_t filter[3];
-    for (int i = 0; i < 3; i++) {
-      filter[i] = vld1q_f32(filter_ptr + 4 * i);
-    }
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // NOTE: we only want 3 values, so we read it as two ops where
-      // the second op just duplicates the lane
-      const float32x2_t input01 = vld1_f32(input_ptr);
-      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[3];
-      for (int i = 0; i < 3; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate all outputs.
-      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
-      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
-      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 3; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 12;
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 1, 8> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Load the filters
-    float32x4_t filter[2];
-    for (int i = 0; i < 2; i++) {
-      filter[i] = vld1q_f32(filter_ptr + 4 * i);
-    }
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      const float input_val = *input_ptr;
-      input_ptr += input_ptr_increment;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 1, 32> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Load the filters
-    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
-    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
-    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
-    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
-    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
-    float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
-    float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
-    float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);
-
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      const float input_val = *input_ptr;
-      input_ptr += input_ptr_increment;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
-      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
-      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
-      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
-      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
-      float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
-      float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
-      float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
-      // Multiply-accumulate
-      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
-      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
-      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
-      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
-      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
-      acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
-      acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
-      acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
-      // Store the accumulators back to acc_buffer
-      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
-      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
-      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
-      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
-      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
-      vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
-      vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
-      vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
-      acc_buffer_ptr += 32;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 1, 20> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Load the filters
-    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
-    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
-    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
-    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
-    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
-
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      const float input_val = *input_ptr;
-      input_ptr += input_ptr_increment;
-      // Load the accumulators from acc_buffer
-      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
-      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
-      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
-      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
-      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
-      // Multiply-accumulate
-      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
-      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
-      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
-      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
-      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
-      // Store the accumulators back to acc_buffer
-      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
-      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
-      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
-      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
-      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
-      acc_buffer_ptr += 20;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 0, 16> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const float* local_filter_ptr = filter_ptr;
-      const float* local_input_ptr = input_ptr;
-      for (int ic = 0; ic < input_depth; ic++) {
-        // Load the filters
-        float32x4_t filter[4];
-        for (int i = 0; i < 4; i++) {
-          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
-        }
-        local_filter_ptr += 16;
-        // Load the inputs
-        const float input_val = *local_input_ptr++;
-        // Load the accumulators from acc_buffer
-        float32x4_t acc[4];
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        for (int i = 0; i < 4; i++) {
-          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
-        }
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 4; i++) {
-          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 16;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 8, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    // Load the filters
-    float32x4_t filter[2];
-    for (int i = 0; i < 2; i++) {
-      filter[i] = vld1q_f32(filter_ptr + 4 * i);
-    }
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      float32x4_t input[2];
-      for (int i = 0; i < 2; i++) {
-        input[i] = vld1q_f32(input_ptr + 4 * i);
-      }
-      // Load the accumulators from acc_buffer
-      float32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 2, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    float32x2_t filter = vld1_f32(filter_ptr);
-    float32x4_t filter_x4 = vcombine_f32(filter, filter);
-    int outp = 0;
-
-    // Handle two output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the inputs
-      float32x2_t input_1 = vld1_f32(input_ptr);
-      input_ptr += input_ptr_increment;
-      float32x2_t input_2 = vld1_f32(input_ptr);
-      input_ptr += input_ptr_increment;
-      float32x4_t input = vcombine_f32(input_1, input_2);
-
-      // Load the accumulators from acc_buffer
-      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
-
-      // Multiply-accumulate
-      acc = vmlaq_f32(acc, input, filter_x4);
-
-      // Store the accumulators back to acc_buffer
-      vst1q_f32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      float32x2_t input = vld1_f32(input_ptr);
-      input_ptr += input_ptr_increment;
-
-      // Load the accumulators from acc_buffer
-      float32x2_t acc = vld1_f32(acc_buffer_ptr);
-
-      // Multiply-accumulate
-      acc = vmla_f32(acc, input, filter);
-
-      // Store the accumulators back to acc_buffer
-      vst1_f32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 2;
-    }
-  }
-};
-
-template <>
-struct FloatDepthwiseConvKernel<true, 4, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const float* input_ptr, int input_ptr_increment,
-                  const float* filter_ptr, float* acc_buffer_ptr) {
-    float32x4_t filter = vld1q_f32(filter_ptr);
-
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the inputs
-      float32x4_t input = vld1q_f32(input_ptr);
-      // Load the accumulators from acc_buffer
-      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
-      // Multiply-accumulate
-      acc = vmlaq_f32(acc, input, filter);
-      // Store the accumulators back to acc_buffer
-      vst1q_f32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-#endif
-
-// Accumulates the effect of one row of the filter, on a segment of one row
-// of the output, accessing the corresponding one row of the input.
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
-                                int input_depth, int input_width,
-                                const float* input_data, int pad_width,
-                                int depth_multiplier, int filter_width,
-                                const float* filter_data,
-                                int out_x_buffer_start, int out_x_buffer_end,
-                                int output_depth, float* acc_buffer) {
-#ifdef GEMMLOWP_PROFILING
-  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
-#endif
-  // Sanity check parameters. This is important in particular to ensure
-  // that we keep the number of template instantiations minimal, so we don't
-  // increase binary size unnecessarily.
-  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
-  static_assert(kFixedInputDepth || kAllowStrided, "");
-  TFLITE_DCHECK(stride == 1 || kAllowStrided);
-  if (kFixedInputDepth) {
-    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
-  }
-  if (kFixedDepthMultiplier) {
-    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
-  }
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  const int input_ptr_increment = stride * input_depth;
-  const float* filter_base_ptr = filter_data;
-  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-    // For the current (filter_x, filter_y) point in the filter,
-    // compute the boundaries of the corresponding output row segment.
-    int out_x_loop_start_unclampled = 0;
-    int out_x_loop_end_unclampled = 0;
-    if (kAllowStrided) {
-      if (stride == 2) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 1) / 2;
-      } else if (stride == 4) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 3) / 4;
-      } else {
-        out_x_loop_start_unclampled =
-            (pad_width - filter_x + stride - 1) / stride;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + stride - 1) / stride;
-      }
-    } else {
-      out_x_loop_start_unclampled = pad_width - filter_x;
-      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
-    }
-    // The kernel will have to iterate on the segment of the
-    // output row that starts at out_x_loop_start and out_x_loop_end.
-    const int out_x_loop_start =
-        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
-    const int out_x_loop_end =
-        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
-
-    float* acc_buffer_ptr =
-        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
-    const float* input_ptr = input_data + in_x_origin * input_depth;
-    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
-    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
-                             kFixedDepthMultiplier>::Run(num_output_pixels,
-                                                         input_depth,
-                                                         depth_multiplier,
-                                                         input_ptr,
-                                                         input_ptr_increment,
-                                                         filter_base_ptr,
-                                                         acc_buffer_ptr);
-    filter_base_ptr += output_depth;
-  }
-}
-
-// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
-inline void FloatDepthwiseConvAccumRowGeneric(
-    int stride, int dilation_factor, int input_depth, int input_width,
-    const float* input_data, int pad_width, int depth_multiplier,
-    int filter_width, const float* filter_data, int out_x_buffer_start,
-    int out_x_buffer_end, int output_depth, float* acc_buffer) {
-  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
-#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-  LOG(FATAL)
-      << "\n\n"
-      << "*****************************************************************\n"
-      << "* This tfmini inference code was about to use the slow generic\n"
-      << "* fallback implementation for a DepthwiseConv op, and we want you\n"
-      << "* to be aware of that so that you will know why you get terrible\n"
-      << "* performance.\n"
-      << "*\n"
-      << "* If you would like to carry on with the slow code, compile\n"
-      << "* with this preprocessor token defined:\n"
-      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
-      << "*\n"
-      << "* The right thing to do, if you care about performance, is to add\n"
-      << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
-      << "* The relevant parameters defining your case are:\n"
-      << "* stride = " << stride << "\n"
-      << "* input_depth = " << input_depth << "\n"
-      << "* depth_multiplier = " << depth_multiplier << "\n"
-      << "* dilation_factor = " << dilation_factor << "\n"
-      << "*\n"
-      << "* Please do not hesitate to contact benoitjacob@ with this\n"
-      << "* information.\n"
-      << "*****************************************************************\n";
-#endif  // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#endif  // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-  const float* filter_base_ptr = filter_data;
-  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-    const int out_x_loop_start = std::max(
-        out_x_buffer_start,
-        (pad_width - dilation_factor * filter_x + stride - 1) / stride);
-    const int out_x_loop_end = std::min(
-        out_x_buffer_end,
-        (pad_width + input_width - dilation_factor * filter_x + stride - 1) /
-            stride);
-
-    float* acc_buffer_ptr =
-        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin =
-        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
-    const float* input_ptr = input_data + in_x_origin * input_depth;
-    const int input_ptr_increment = (stride - 1) * input_depth;
-    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
-      const float* filter_ptr = filter_base_ptr;
-      for (int ic = 0; ic < input_depth; ++ic) {
-        const float input_val = *input_ptr++;
-        for (int m = 0; m < depth_multiplier; m++) {
-          const float filter_val = *filter_ptr++;
-          *acc_buffer_ptr++ += filter_val * input_val;
-        }
-      }
-      input_ptr += input_ptr_increment;
-    }
-    filter_base_ptr += output_depth;
-  }
-}
-
-// Initializes the accumulator buffer with bias values.
-inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
-                                       const float* bias_data,
-                                       float* acc_buffer) {
-  // TODO(benoitjacob): This might need optimized specializations
-  // for small output_depth values, if that ever becomes an important
-  // case (like it was for some quantized DepthwiseConv cases).
-  for (int i = 0; i < num_output_pixels; i++) {
-    memcpy(acc_buffer + i * output_depth, bias_data,
-           sizeof(acc_buffer[0]) * output_depth);
-  }
-}
-
-inline void DepthwiseConv(
-    const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const float* input_data, const RuntimeShape& filter_shape,
-    const float* filter_data, const RuntimeShape& bias_shape,
-    const float* bias_data, const RuntimeShape& output_shape,
-    float* output_data) {
-  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int depth_multiplier = params.depth_multiplier;
-  const float output_activation_min = params.float_activation_min;
-  const float output_activation_max = params.float_activation_max;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
-
-  static const int kAccBufferMaxSize = 2048;
-  float acc_buffer[kAccBufferMaxSize];
-  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
-  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
-  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
-  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
-                   kAccBufferActualSize);
-  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
-  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
-
-  // row_accum_func will point to the core accumulation function to be used
-  // for this DepthwiseConv op.
-  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
-  row_accum_func_t row_accum_func = nullptr;
-
-#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
-                                        FIXED_DEPTH_MULTIPLIER)           \
-  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
-      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
-      depth_multiplier == FIXED_DEPTH_MULTIPLIER &&                       \
-      dilation_height_factor == 1 && dilation_width_factor == 1) {        \
-    row_accum_func =                                                      \
-        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
-                                   FIXED_DEPTH_MULTIPLIER>;               \
-  }
-
-#ifdef USE_NEON
-  // We go over our list of kernels by decreasing order of preference
-  // for the cases where multiple kernels could apply.
-
-  // Start with the fastest kernels: AllowStrided=false, fixed input depth.
-
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
-
-  // Next come the strided kernels: AllowStrided=true, fixed input depth.
-  // They are a bit less efficient, but allow stride!=1.
-
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
-
-  // Finally, the kernels allowing a variable input depth,
-  // these are the least efficient but most general kernels.
-
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)
-
-#endif  // USE_NEON
-
-#undef TFMINI_USE_DEPTHWISECONV_KERNEL
-
-  // No matching fast kernel found, use slow fallback.
-  if (!row_accum_func) {
-    row_accum_func = FloatDepthwiseConvAccumRowGeneric;
-  }
-
-  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
-  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
-  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
-
-  // Now that we have determined row_accum_func, we can start work.
-  float* output_ptr = output_data;
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      const int in_y_origin = (out_y * stride_height) - pad_height;
-      const int filter_y_start =
-          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
-                          dilation_height_factor);
-      const int filter_y_end =
-          std::min(filter_height,
-                   (input_height - in_y_origin + dilation_height_factor - 1) /
-                       dilation_height_factor);
-      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
-           out_x_buffer_start += kOutputPixelsInAccBuffer) {
-        const int out_x_buffer_end = std::min(
-            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
-        // We call a 'pixel' a group of activation that share all but the
-        // 'depth'/'channel' coordinate. num_output_pixels is the number of
-        // output pixels that we will accumulate in this loop iteration.
-        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
-        // Initialize our local accumulator with the bias values, so we don't
-        // have to add them later.
-        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
-                                   acc_buffer);
-        // Accumulation loop. Most of the time should be spent in here.
-        for (int filter_y = filter_y_start; filter_y < filter_y_end;
-             ++filter_y) {
-          const int in_y = in_y_origin + dilation_height_factor * filter_y;
-          row_accum_func(
-              stride_width, dilation_width_factor, input_depth, input_width,
-              input_data + in_y * input_height_stride + b * input_batch_stride,
-              pad_width, depth_multiplier, filter_width,
-              filter_data + filter_y * filter_height_stride, out_x_buffer_start,
-              out_x_buffer_end, output_depth, acc_buffer);
-        }
-        // Finished accumulating. Now store to destination.
-        const int num_output_values = output_depth * num_output_pixels;
-        int i = 0;
-// TODO(benoitjacob) optimized code goes here
-#ifdef USE_NEON
-        // Handle 16 values at a time
-        for (; i <= num_output_values - 16; i += 16) {
-          float32x4_t acc[4];
-          for (int k = 0; k < 4; k++) {
-            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
-          }
-          for (int k = 0; k < 4; k++) {
-            acc[k] = vmaxq_f32(
-                vdupq_n_f32(output_activation_min),
-                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
-          }
-          for (int k = 0; k < 4; k++) {
-            vst1q_f32(output_ptr + 4 * k, acc[k]);
-          }
-          output_ptr += 16;
-        }
-        // Handle 4 values at a time
-        for (; i <= num_output_values - 4; i += 4) {
-          float32x4_t acc = vld1q_f32(acc_buffer + i);
-
-          acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
-                          vminq_f32(vdupq_n_f32(output_activation_max), acc));
-
-          vst1q_f32(output_ptr, acc);
-          output_ptr += 4;
-        }
-#endif
-        // Handle leftover values, one by one. This is very slow.
-        for (; i < num_output_values; i++) {
-          float acc = acc_buffer[i];
-          acc = std::max(output_activation_min,
-                         std::min(output_activation_max, acc));
-
-          *output_ptr++ = acc;
-        }
-      }
-    }
-  }
-}
-
-}  // namespace optimized_ops
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
deleted file mode 100644
index 803eff292a3f383c1e1cceff7c07d434b0c48587..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ /dev/null
@@ -1,1999 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
-
-#include "fixedpoint/fixedpoint.h"
-#include "public/gemmlowp.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-namespace tflite {
-namespace optimized_ops {
-
-// Implementation of quantized DepthwiseConv
-
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-struct QuantizedDepthwiseConvKernel {};
-
-#ifdef USE_NEON
-template <>
-struct QuantizedDepthwiseConvKernel<true, 8, 2> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8x2_t filter_u8;
-    filter_u8.val[0] = vld1_u8(filter_ptr);
-    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
-    int16x8_t filter[2];
-    for (int i = 0; i < 2; i++) {
-      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
-                            vdupq_n_s16(filter_offset));
-    }
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer
-      int32x4x2_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
-        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
-      }
-      // Load the inputs, add input_offset.
-      const uint8x8_t input_u8 = vld1_u8(input_ptr);
-      input_ptr += input_ptr_increment;
-      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-      // Duplicate the input values, 2-fold
-      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
-                                  vget_low_s16(input_dup2.val[i]));
-        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
-                                  vget_high_s16(input_dup2.val[i]));
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
-        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
-      }
-      acc_buffer_ptr += 16;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 8, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
-    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
-    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-
-    int outp = 0;
-    // Handle 2 output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the accumulators from acc_buffer.
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8[2];
-      for (int i = 0; i < 2; i++) {
-        input_u8[i] = vld1_u8(input_ptr + 8 * i);
-      }
-      input_ptr += 16;
-      int16x8_t input[2];
-      for (int i = 0; i < 2; i++) {
-        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
-      }
-      for (int i = 0; i < 2; i++) {
-        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
-      }
-      // Multiply-accumulate.
-      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
-      acc[1] =
-          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
-      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
-      acc[3] =
-          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle 1 output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer.
-      int32x4_t acc[2];
-      acc[0] = vld1q_s32(acc_buffer_ptr);
-      acc[1] = vld1q_s32(acc_buffer_ptr + 4);
-
-      // Load the inputs, add input_offset.
-      const uint8x8_t input_u8 = vld1_u8(input_ptr);
-      input_ptr += 8;
-      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-      // Multiply-accumulate.
-      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
-      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
-      // Store the accumulators back to acc_buffer
-      vst1q_s32(acc_buffer_ptr, acc[0]);
-      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
-      acc_buffer_ptr += 8;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 4, 2> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
-    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
-    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-
-    int outp = 0;
-    // Handle 2 output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Load the inputs, add input_offset.
-      const uint8x8_t input_u8 = vld1_u8(input_ptr);
-      input_ptr += 8;
-      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-      // Duplicate the input values, 2-fold
-      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
-                                   vget_low_s16(input_dup2.val[i]));
-        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
-                                   vget_high_s16(input_dup2.val[i]));
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
-      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
-      input_ptr += 4;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-      // Duplicate the input values, 2-fold
-      const int16x4x2_t input_dup2 = vzip_s16(input, input);
-      // Multiply-accumulate
-      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
-      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 2, 8> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    int16x8_t filter[2];
-    for (int i = 0; i < 2; i++) {
-      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
-      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
-      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-    }
-    int outp = 0;
-    // Handle two output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the accumulators from acc_buffer.
-      int32x4_t acc[8];
-      for (int i = 0; i < 8; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
-      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
-      input_ptr += 4;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-      // Multiply-accumulate.
-      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
-      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
-      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
-      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
-      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
-      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
-      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
-      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
-      // Store the accumulators back to acc_buffer.
-      for (int i = 0; i < 8; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 32;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer.
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_ptr += 2;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
-      // Multiply-accumulate.
-      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
-      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
-      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
-      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
-
-      // Store the accumulators back to acc_buffer.
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 2, 2> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8 = vdup_n_u8(0);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
-    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
-    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
-    const int16x4_t filter_s16 =
-        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
-    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
-    int outp = 0;
-    // Handle 4 output pixels at a time.
-    for (; outp <= num_output_pixels - 4; outp += 4) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-
-      // Load the inputs, add input_offset.
-      const uint8x8_t input_u8 = vld1_u8(input_ptr);
-      input_ptr += 8;
-      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-      // Duplicate the input values, 2-fold
-      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
-      // Multiply-accumulate
-      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
-      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
-      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
-      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
-
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_ptr += 2;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-      // Duplicate the input values, 2-fold
-      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
-      // Multiply-accumulate
-      acc = vmlal_s16(acc, filter, input_dup2);
-      // Store the accumulators back to acc_buffer
-      vst1q_s32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 2, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8 = vdup_n_u8(0);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
-    const int16x4_t filter_s16 =
-        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
-    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
-    int outp = 0;
-    // Handle 8 output pixels at a time.
-    for (; outp <= num_output_pixels - 8; outp += 8) {
-      // Load the accumulators from acc_buffer.
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8[2];
-      for (int i = 0; i < 2; i++) {
-        input_u8[i] = vld1_u8(input_ptr + 8 * i);
-      }
-      input_ptr += 16;
-      int16x8_t input[2];
-      for (int i = 0; i < 2; i++) {
-        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
-      }
-      for (int i = 0; i < 2; i++) {
-        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
-      }
-
-      // Multiply-accumulate.
-      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
-      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
-      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
-      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
-      // Store the accumulators back to acc_buffer.
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle 4 output pixels at a time.
-    for (; outp <= num_output_pixels - 4; outp += 4) {
-      // Load the accumulators from acc_buffer.
-      int32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Load the inputs, add input_offset.
-      const uint8x8_t input_u8 = vld1_u8(input_ptr);
-      input_ptr += 8;
-      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-
-      // Multiply-accumulate.
-      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
-      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
-      // Store the accumulators back to acc_buffer.
-      for (int i = 0; i < 2; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-    }
-    // Handle 2 output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the accumulators from acc_buffer.
-      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
-      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
-      input_ptr += 4;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
-      // Multiply-accumulate.
-      acc = vmlal_s16(acc, filter, input);
-      // Store the accumulators back to acc_buffer.
-      vst1q_s32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-    }
-    // Handle 1 output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer.
-      int32x2_t acc = vld1_s32(acc_buffer_ptr);
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_ptr += 2;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
-      // Multiply-accumulate.
-      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
-      // Store the accumulators back to acc_buffer.
-      vst1_s32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 2;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 1, 2> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8 = vdup_n_u8(0);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
-    const int16x4_t filter_s16 =
-        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
-    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
-    int outp = 0;
-    // Handle 8 output pixels at a time.
-    for (; outp <= num_output_pixels - 8; outp += 8) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-
-      // Load the inputs, add input_offset.
-      const uint8x8_t input_u8 = vld1_u8(input_ptr);
-      input_ptr += 8;
-      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-      // Duplicate the input values, 2-fold
-      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
-      // Multiply-accumulate
-      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
-      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
-      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
-      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer
-      int32x2_t acc = vld1_s32(acc_buffer_ptr);
-
-      // Load the inputs, add input_offset.
-      const uint32 input = *input_ptr++ + input_offset;
-
-      // Multiply-accumulate
-      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
-      // Store the accumulators back to acc_buffer
-      vst1_s32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 2;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 1, 4> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8 = vdup_n_u8(0);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
-    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
-    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
-    const int16x4_t filter_s16 =
-        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
-    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
-    int outp = 0;
-    // Handle 8 output pixels at a time.
-    for (; outp <= num_output_pixels - 8; outp += 8) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[8];
-      for (int i = 0; i < 8; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vld1_u8(input_ptr);
-      input_ptr += 8;
-      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-
-      // Multiply-accumulate
-      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
-      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
-      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
-      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
-      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
-      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
-      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
-      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
-
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 8; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 32;
-    }
-    // Handle 4 output pixels at a time.
-    for (; outp <= num_output_pixels - 4; outp += 4) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
-      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
-      input_ptr += 4;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
-      // Multiply-accumulate
-      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
-      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
-      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
-      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
-
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
-
-      // Load the inputs, add input_offset.
-      const uint32 input = *input_ptr++ + input_offset;
-
-      // Multiply-accumulate
-      acc = vmlal_n_s16(acc, filter, input);
-      // Store the accumulators back to acc_buffer
-      vst1q_s32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 4, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8 = vdup_n_u8(0);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
-    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
-    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
-    const int16x4_t filter_s16 =
-        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
-    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
-    int outp = 0;
-    // Handle 4 output pixels at a time.
-    for (; outp <= num_output_pixels - 4; outp += 4) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Load the inputs, add input_offset.
-      int16x8_t input[2];
-      for (int i = 0; i < 2; i++) {
-        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
-        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-      }
-      input_ptr += 16;
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[2 * i + 0] =
-            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
-        acc[2 * i + 1] =
-            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc;
-      acc = vld1q_s32(acc_buffer_ptr);
-
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
-      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
-      input_ptr += 4;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-      // Multiply-accumulate
-      acc = vmlal_s16(acc, filter, input);
-      // Store the accumulators back to acc_buffer
-      vst1q_s32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 4, 4> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    int16x8_t filter[2];
-    for (int i = 0; i < 2; i++) {
-      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
-      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
-      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-    }
-
-    int outp = 0;
-    // Handle 2 output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[8];
-      for (int i = 0; i < 8; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vld1_u8(input_ptr);
-      input_ptr += 8;
-      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-
-      // Multiply-accumulate
-      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
-                              vget_low_s16(input), 0);
-      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
-                              vget_low_s16(input), 1);
-      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
-                              vget_low_s16(input), 2);
-      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
-                              vget_low_s16(input), 3);
-      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
-                              vget_high_s16(input), 0);
-      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
-                              vget_high_s16(input), 1);
-      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
-                              vget_high_s16(input), 2);
-      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
-                              vget_high_s16(input), 3);
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 8; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 32;
-    }
-    // Handle one output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
-      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
-      input_ptr += 4;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
-      // Multiply-accumulate
-      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
-      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
-      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
-      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 0, 3> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // We will have to duplicate bytes in a NEON register, 3-fold.
-    // We will do that by register-level table-look-up using VTBL instructions.
-    // Here we prepare the registers containing the table-lookup indices.
-    static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
-                                                   {2, 3, 3, 3, 4, 4, 4, 5},
-                                                   {5, 5, 6, 6, 6, 7, 7, 7}};
-    uint8x8_t dup3_indices[3];
-    for (int i = 0; i < 3; i++) {
-      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
-    }
-
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const uint8* local_filter_ptr = filter_ptr;
-      const uint8* local_input_ptr = input_ptr;
-      int ic = 0;
-      // Handle 8 input channels at a time.
-      for (; ic <= input_depth - 8; ic += 8) {
-        // Load the filters, add filter_offset.
-        int16x8_t filter[3];
-        uint8x8x3_t filter_u8;
-        filter_u8.val[0] = vld1_u8(local_filter_ptr);
-        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
-        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
-        local_filter_ptr += 24;
-        for (int i = 0; i < 3; i++) {
-          const int16x8_t filter_s16 =
-              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
-          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-        }
-        // Load the inputs, duplicate 3-fold, add input_offset.
-        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
-        local_input_ptr += 8;
-
-        uint8x8_t input_u8_dup3[3];
-        for (int i = 0; i < 3; i++) {
-          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
-        }
-        int16x8_t input_dup3[3];
-        for (int i = 0; i < 3; i++) {
-          const int16x8_t input_s16_dup3 =
-              vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
-          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
-        }
-        // Load the accumulators from acc_buffer
-        int32x4x3_t acc[2];
-        for (int i = 0; i < 2; i++) {
-          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
-          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
-          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
-        }
-        // Multiply-accumulate
-        for (int j = 0; j < 3; j++) {
-          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
-                                    vget_low_s16(filter[j]));
-          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
-                                    vget_high_s16(filter[j]));
-        }
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 2; i++) {
-          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
-          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
-          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
-        }
-        acc_buffer_ptr += 24;
-      }
-      // Handle one input channel at a time.
-      for (; ic < input_depth; ic++) {
-        const int16 input_val = *local_input_ptr++ + input_offset;
-        for (int i = 0; i < 3; i++) {
-          const int16 filter_val = local_filter_ptr[i] + filter_offset;
-          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
-        }
-        local_filter_ptr += 3;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 0, 2> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const uint8* local_filter_ptr = filter_ptr;
-      const uint8* local_input_ptr = input_ptr;
-      int ic = 0;
-      // Handle 8 input channels at a time.
-      for (; ic <= input_depth - 8; ic += 8) {
-        // Load the filters, add filter_offset.
-        int16x8_t filter[2];
-        uint8x8x2_t filter_u8;
-        filter_u8.val[0] = vld1_u8(local_filter_ptr);
-        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
-        local_filter_ptr += 16;
-        for (int i = 0; i < 2; i++) {
-          const int16x8_t filter_s16 =
-              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
-          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-        }
-        // Load the inputs, add input_offset, duplicate 2-fold.
-        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
-        local_input_ptr += 8;
-        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
-        // Load the accumulators from acc_buffer.
-        int32x4x2_t acc[2];
-        for (int i = 0; i < 2; i++) {
-          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
-          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
-        }
-        // Multiply-accumulate.
-        for (int j = 0; j < 2; j++) {
-          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
-                                    vget_low_s16(input_dup2.val[j]));
-          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
-                                    vget_high_s16(input_dup2.val[j]));
-        }
-        // Store the accumulators back to acc_buffer.
-        for (int i = 0; i < 2; i++) {
-          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
-          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
-        }
-        acc_buffer_ptr += 16;
-      }
-      // Handle one input channel at a time.
-      for (; ic < input_depth; ic++) {
-        // Load the inputs.
-        const int16 input_val = *local_input_ptr++ + input_offset;
-        for (int i = 0; i < 2; i++) {
-          const int16 filter_val = local_filter_ptr[i] + filter_offset;
-          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
-        }
-        local_filter_ptr += 2;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 0, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      const uint8* local_filter_ptr = filter_ptr;
-      const uint8* local_input_ptr = input_ptr;
-      int ic = 0;
-      // Handle 16 input channels at a time.
-      for (; ic <= input_depth - 16; ic += 16) {
-        // Load the filters, add filter_offset.
-        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
-        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
-        local_filter_ptr += 16;
-        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
-        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
-        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
-        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
-        // Load the inputs, add input_offset.
-        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
-        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
-        local_input_ptr += 16;
-        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
-        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
-        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
-        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
-        // Load the accumulators from acc_buffer
-        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
-        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
-        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
-        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
-        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
-        acc_1 =
-            vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
-        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
-        acc_3 =
-            vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
-        // Store the accumulators back to acc_buffer
-        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
-        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
-        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
-        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
-        acc_buffer_ptr += 16;
-      }
-      // Handle 8 input channels at a time.
-      for (; ic <= input_depth - 8; ic += 8) {
-        // Load the filters, add filter_offset.
-        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
-        local_filter_ptr += 8;
-        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
-        const int16x8_t filter =
-            vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-        // Load the inputs, add input_offset.
-        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
-        local_input_ptr += 8;
-        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-        // Load the accumulators from acc_buffer
-        int32x4_t acc[2];
-        for (int i = 0; i < 2; i++) {
-          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-        }
-        // Multiply-accumulate
-        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
-        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
-        // Store the accumulators back to acc_buffer
-        for (int i = 0; i < 2; i++) {
-          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-        }
-        acc_buffer_ptr += 8;
-      }
-      // Handle one input channel at a time.
-      for (; ic < input_depth; ic++) {
-        const int16 input_val = *local_input_ptr++ + input_offset;
-        const int16 filter_val = *local_filter_ptr++ + filter_offset;
-        *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
-      }
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 16, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8[2];
-    for (int i = 0; i < 2; i++) {
-      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
-    }
-    int16x8_t filter[2];
-    for (int i = 0; i < 2; i++) {
-      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
-    }
-    for (int i = 0; i < 2; i++) {
-      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
-    }
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8[2];
-      for (int i = 0; i < 2; i++) {
-        input_u8[i] = vld1_u8(input_ptr + 8 * i);
-      }
-      input_ptr += input_ptr_increment;
-      int16x8_t input[2];
-      for (int i = 0; i < 2; i++) {
-        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
-      }
-      for (int i = 0; i < 2; i++) {
-        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
-      }
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
-                                   vget_low_s16(filter[i]));
-        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
-                                   vget_high_s16(filter[i]));
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 8, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
-    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
-    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the inputs, add input_offset.
-      const uint8x8_t input_u8 = vld1_u8(input_ptr);
-      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
-      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
-      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-      input_ptr += input_ptr_increment;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 1, 16> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8[2];
-    for (int i = 0; i < 2; i++) {
-      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
-    }
-    int16x8_t filter[2];
-    for (int i = 0; i < 2; i++) {
-      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
-    }
-    for (int i = 0; i < 2; i++) {
-      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
-    }
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      uint8 input_u8 = *input_ptr;
-      input_ptr += input_ptr_increment;
-      int16 input = static_cast<int16>(input_u8 + input_offset);
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[4];
-      for (int i = 0; i < 4; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      for (int i = 0; i < 2; i++) {
-        acc[2 * i + 0] =
-            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
-        acc[2 * i + 1] =
-            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
-      }
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 4; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 16;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 1, 32> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
-    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
-    uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
-    uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
-    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
-    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
-    int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
-    int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
-    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
-    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
-    filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
-    filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      uint8 input_u8 = *input_ptr;
-      input_ptr += input_ptr_increment;
-      int16 input = static_cast<int16>(input_u8 + input_offset);
-      // Load the accumulators from acc_buffer
-      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
-      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
-      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
-      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
-      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
-      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
-      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
-      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
-      // Multiply-accumulate
-      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
-      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
-      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
-      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
-      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
-      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
-      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
-      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
-      // Store the accumulators back to acc_buffer
-      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
-      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
-      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
-      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
-      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
-      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
-      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
-      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
-      acc_buffer_ptr += 32;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 1, 20> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
-    // We load the first 16 bytes into filter_u8_{0,1} as usual.
-    // Then we load the 8 last bytes into filter_u8_x  (x for 'extra').
-    // This is redundant: the first 4 bytes of filter_u8_x are the same
-    // as the last 4 bytes of filter_u8_x.
-    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
-    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
-    uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
-    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
-    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
-    int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
-    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
-    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
-    filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      uint8 input_u8 = *input_ptr;
-      input_ptr += input_ptr_increment;
-      int16 input = static_cast<int16>(input_u8 + input_offset);
-      // Load the accumulators from acc_buffer
-      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
-      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
-      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
-      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
-      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
-      // Multiply-accumulate
-      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
-      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
-      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
-      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
-      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
-      // Store the accumulators back to acc_buffer
-      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
-      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
-      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
-      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
-      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
-      acc_buffer_ptr += 20;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 1, 8> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
-    const int16x8_t filter = vaddq_s16(
-        vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      uint8 input_u8 = *input_ptr;
-      input_ptr += input_ptr_increment;
-      int16 input = static_cast<int16>(input_u8 + input_offset);
-      // Load the accumulators from acc_buffer
-      int32x4_t acc[2];
-      for (int i = 0; i < 2; i++) {
-        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
-      }
-      // Multiply-accumulate
-      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
-      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
-      // Store the accumulators back to acc_buffer
-      for (int i = 0; i < 2; i++) {
-        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
-      }
-      acc_buffer_ptr += 8;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 2, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8 = vdup_n_u8(0);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
-    const int16x4_t filter_s16 =
-        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
-    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
-    int outp = 0;
-
-    // Handle 2 output pixels at a time.
-    for (; outp <= num_output_pixels - 2; outp += 2) {
-      // Load the accumulators from acc_buffer.
-      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
-      // Load the inputs, add input_offset.
-      uint16x4_t input_u16 = vdup_n_u16(0);
-      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
-                                input_u16, 0);
-      input_ptr += input_ptr_increment;
-      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
-                                input_u16, 1);
-      input_ptr += input_ptr_increment;
-      const int16x4_t input_s16 = vreinterpret_s16_u16(
-          vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
-      // Multiply-accumulate.
-      acc = vmlal_s16(acc, filter, input);
-      // Store the accumulators back to acc_buffer.
-      vst1q_s32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-    }
-
-    // Handle 1 output pixel at a time.
-    for (; outp < num_output_pixels; outp++) {
-      // Load the accumulators from acc_buffer.
-      int32x2_t acc = vld1_s32(acc_buffer_ptr);
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vdup_n_u8(0);
-      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-      input_ptr += input_ptr_increment;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-
-      // Multiply-accumulate.
-      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
-      // Store the accumulators back to acc_buffer.
-      vst1_s32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 2;
-    }
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<true, 4, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    if (num_output_pixels <= 0) {
-      return;
-    }
-
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8 = vdup_n_u8(0);
-    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
-    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
-    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
-    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
-    const int16x4_t filter_s16 =
-        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
-    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
-
-    int outp = 0;
-
-    // Handle one output pixel at a time until second to the last pixel. Second
-    // to the last because we read eight input pixels while only processing
-    // four.
-    for (; outp < num_output_pixels - 1; outp++) {
-      // Load the accumulators from acc_buffer
-      int32x4_t acc;
-      acc = vld1q_s32(acc_buffer_ptr);
-
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8 = vld1_u8(input_ptr);
-      input_ptr += input_ptr_increment;
-      const int16x4_t input_s16 =
-          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-      // Multiply-accumulate
-      acc = vmlal_s16(acc, filter, input);
-      // Store the accumulators back to acc_buffer
-      vst1q_s32(acc_buffer_ptr, acc);
-      acc_buffer_ptr += 4;
-    }
-
-    // Handle the last output pixel.
-    // Load the accumulators from acc_buffer
-    int32x4_t acc;
-    acc = vld1q_s32(acc_buffer_ptr);
-
-    // Load the inputs, add input_offset.
-    uint8x8_t input_u8 = vdup_n_u8(0);
-    input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
-    input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
-    input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
-    input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
-    const int16x4_t input_s16 =
-        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
-    const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
-    // Multiply-accumulate
-    acc = vmlal_s16(acc, filter, input);
-    // Store the accumulators back to acc_buffer
-    vst1q_s32(acc_buffer_ptr, acc);
-  }
-};
-
-template <>
-struct QuantizedDepthwiseConvKernel<false, 12, 1> {
-  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
-                  const uint8* input_ptr, int16 input_offset,
-                  int input_ptr_increment, const uint8* filter_ptr,
-                  int16 filter_offset, int32* acc_buffer_ptr) {
-    // Load the filters, add filter_offset.
-    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
-    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
-    int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
-    int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
-    filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
-    filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
-    int16x4_t filter_0 = vget_low_s16(filter_s16_0);
-    int16x4_t filter_1 = vget_high_s16(filter_s16_0);
-    int16x4_t filter_2 = vget_high_s16(filter_s16_1);
-
-    // Handle one output pixel at a time.
-    for (int outp = 0; outp < num_output_pixels; outp++) {
-      // Load the inputs, add input_offset.
-      uint8x8_t input_u8_0 = vld1_u8(input_ptr);
-      uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
-      input_ptr += input_ptr_increment;
-      int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
-      int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
-      input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
-      input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
-
-      // Load the accumulators from acc_buffer
-      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
-      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
-      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
-
-      // Multiply-accumulate
-      acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
-      acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
-      acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
-
-      // Store the accumulators back to acc_buffer
-      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
-      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
-      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
-
-      acc_buffer_ptr += 12;
-    }
-  }
-};
-#endif
-
-// Accumulates the effect of one row of the filter, on a segment of one row
-// of the output, accessing the corresponding one row of the input.
-template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
-void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor,
-                                    int input_depth, int input_width,
-                                    const uint8* input_data, int16 input_offset,
-                                    int pad_width, int depth_multiplier,
-                                    int filter_width, const uint8* filter_data,
-                                    int16 filter_offset, int out_x_buffer_start,
-                                    int out_x_buffer_end, int output_depth,
-                                    int32* acc_buffer) {
-#ifdef GEMMLOWP_PROFILING
-  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
-#endif
-  // Sanity check parameters. This is important in particular to ensure
-  // that we keep the number of template instantiations minimal, so we don't
-  // increase binary size unnecessarily.
-  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
-  static_assert(kFixedInputDepth || kAllowStrided, "");
-  TFLITE_DCHECK(stride == 1 || kAllowStrided);
-  if (kFixedInputDepth) {
-    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
-  }
-  if (kFixedDepthMultiplier) {
-    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
-  }
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  const int input_ptr_increment = stride * input_depth;
-  const uint8* filter_base_ptr = filter_data;
-  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-    // For the current (filter_x, filter_y) point in the filter,
-    // compute the boundaries of the corresponding output row segment.
-    int out_x_loop_start_unclampled = 0;
-    int out_x_loop_end_unclampled = 0;
-    if (kAllowStrided) {
-      if (stride == 2) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 1) / 2;
-      } else if (stride == 4) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 3) / 4;
-      } else {
-        out_x_loop_start_unclampled =
-            (pad_width - filter_x + stride - 1) / stride;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + stride - 1) / stride;
-      }
-    } else {
-      out_x_loop_start_unclampled = pad_width - filter_x;
-      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
-    }
-    // The kernel will have to iterate on the segment of the
-    // output row that starts at out_x_loop_start and out_x_loop_end.
-    const int out_x_loop_start =
-        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
-    const int out_x_loop_end =
-        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
-
-    int32* acc_buffer_ptr =
-        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
-    const uint8* input_ptr = input_data + in_x_origin * input_depth;
-    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
-    QuantizedDepthwiseConvKernel<
-        kAllowStrided, kFixedInputDepth,
-        kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
-                                    depth_multiplier, input_ptr, input_offset,
-                                    input_ptr_increment, filter_base_ptr,
-                                    filter_offset, acc_buffer_ptr);
-    filter_base_ptr += output_depth;
-  }
-}
-
-// generic fallback of DepthwiseConvAccumRow, portable, non-templatized.
-inline void QuantizedDepthwiseConvAccumRowGeneric(
-    int stride, int dilation_factor, int input_depth, int input_width,
-    const uint8* input_data, int16 input_offset, int pad_width,
-    int depth_multiplier, int filter_width, const uint8* filter_data,
-    int16 filter_offset, int out_x_buffer_start, int out_x_buffer_end,
-    int output_depth, int32* acc_buffer) {
-  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
-#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-  LOG(FATAL)
-      << "\n\n"
-      << "*****************************************************************\n"
-      << "* This tfmini inference code was about to use the slow generic\n"
-      << "* fallback implementation for a DepthwiseConv op, and we want you\n"
-      << "* to be aware of that so that you will know why you get terrible\n"
-      << "* performance.\n"
-      << "*\n"
-      << "* If you would like to carry on with the slow code, compile\n"
-      << "* with this preprocessor token defined:\n"
-      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
-      << "*\n"
-      << "* The right thing to do, if you care about performance, is to add\n"
-      << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
-      << "* The relevant parameters defining your case are:\n"
-      << "* stride = " << stride << "\n"
-      << "* input_depth = " << input_depth << "\n"
-      << "* depth_multiplier = " << depth_multiplier << "\n"
-      << "* dilation_factor = " << dilation_factor << "\n"
-      << "*\n"
-      << "* Please do not hesitate to contact benoitjacob@ with this\n"
-      << "* information.\n"
-      << "*****************************************************************\n";
-#endif  // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#endif  // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-  const uint8* filter_base_ptr = filter_data;
-  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-    const int out_x_loop_start = std::max(
-        out_x_buffer_start,
-        (pad_width - dilation_factor * filter_x + stride - 1) / stride);
-    const int out_x_loop_end = std::min(
-        out_x_buffer_end,
-        (pad_width + input_width - dilation_factor * filter_x + stride - 1) /
-            stride);
-
-    int32* acc_buffer_ptr =
-        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin =
-        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
-    const uint8* input_ptr = input_data + in_x_origin * input_depth;
-    const int input_ptr_increment = (stride - 1) * input_depth;
-    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
-      const uint8* filter_ptr = filter_base_ptr;
-      for (int ic = 0; ic < input_depth; ++ic) {
-        const int16 input_val = *input_ptr++ + input_offset;
-        for (int m = 0; m < depth_multiplier; m++) {
-          const int16 filter_val = *filter_ptr++ + filter_offset;
-          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
-        }
-      }
-      input_ptr += input_ptr_increment;
-    }
-    filter_base_ptr += output_depth;
-  }
-}
-
-// Initializes the accumulator buffer with bias values.
-inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
-                                       const int32* bias_data,
-                                       int32* acc_buffer) {
-  int i = 0;
-#ifdef USE_NEON
-  if (output_depth == 1) {
-    const int32x4_t b = vdupq_n_s32(bias_data[0]);
-    for (; i <= num_output_pixels - 16; i += 16) {
-      vst1q_s32(acc_buffer + i + 0, b);
-      vst1q_s32(acc_buffer + i + 4, b);
-      vst1q_s32(acc_buffer + i + 8, b);
-      vst1q_s32(acc_buffer + i + 12, b);
-    }
-    for (; i <= num_output_pixels - 4; i += 4) {
-      vst1q_s32(acc_buffer + i, b);
-    }
-  } else if (output_depth == 2) {
-    int32x4_t b = vdupq_n_s32(bias_data[0]);
-    b = vsetq_lane_s32(bias_data[1], b, 1);
-    b = vsetq_lane_s32(bias_data[1], b, 3);
-    for (; i <= num_output_pixels - 8; i += 8) {
-      vst1q_s32(acc_buffer + 2 * i + 0, b);
-      vst1q_s32(acc_buffer + 2 * i + 4, b);
-      vst1q_s32(acc_buffer + 2 * i + 8, b);
-      vst1q_s32(acc_buffer + 2 * i + 12, b);
-    }
-    for (; i <= num_output_pixels - 2; i += 2) {
-      vst1q_s32(acc_buffer + 2 * i, b);
-    }
-  } else if (output_depth == 4) {
-    const int32x4_t b = vld1q_s32(bias_data);
-    for (; i <= num_output_pixels - 4; i += 4) {
-      vst1q_s32(acc_buffer + 4 * i + 0, b);
-      vst1q_s32(acc_buffer + 4 * i + 4, b);
-      vst1q_s32(acc_buffer + 4 * i + 8, b);
-      vst1q_s32(acc_buffer + 4 * i + 12, b);
-    }
-    for (; i < num_output_pixels; i++) {
-      vst1q_s32(acc_buffer + 4 * i, b);
-    }
-  } else if (output_depth == 8) {
-    const int32x4_t b0 = vld1q_s32(bias_data);
-    const int32x4_t b1 = vld1q_s32(bias_data + 4);
-    for (; i <= num_output_pixels - 2; i += 2) {
-      vst1q_s32(acc_buffer + 8 * i + 0, b0);
-      vst1q_s32(acc_buffer + 8 * i + 4, b1);
-      vst1q_s32(acc_buffer + 8 * i + 8, b0);
-      vst1q_s32(acc_buffer + 8 * i + 12, b1);
-    }
-    for (; i < num_output_pixels; i++) {
-      vst1q_s32(acc_buffer + 8 * i + 0, b0);
-      vst1q_s32(acc_buffer + 8 * i + 4, b1);
-    }
-  } else if (output_depth == 16) {
-    const int32x4_t b0 = vld1q_s32(bias_data);
-    const int32x4_t b1 = vld1q_s32(bias_data + 4);
-    const int32x4_t b2 = vld1q_s32(bias_data + 8);
-    const int32x4_t b3 = vld1q_s32(bias_data + 12);
-    for (; i < num_output_pixels; i++) {
-      vst1q_s32(acc_buffer + 16 * i + 0, b0);
-      vst1q_s32(acc_buffer + 16 * i + 4, b1);
-      vst1q_s32(acc_buffer + 16 * i + 8, b2);
-      vst1q_s32(acc_buffer + 16 * i + 12, b3);
-    }
-  }
-#endif
-  for (; i < num_output_pixels; i++) {
-    memcpy(acc_buffer + i * output_depth, bias_data,
-           sizeof(acc_buffer[0]) * output_depth);
-  }
-}
-
-inline void DepthwiseConv(
-    const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
-  gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int depth_multiplier = params.depth_multiplier;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  TFLITE_DCHECK_GE(dilation_width_factor, 1);
-  TFLITE_DCHECK_GE(dilation_height_factor, 1);
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-#ifdef USE_NEON
-  const bool shift_left = (output_shift > 0);
-  const int32 multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
-#endif
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
-
-// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
-// Jetson TX-2. This compiler does not support the offsetof() macro.
-#if defined(__aarch64__) && !defined(GOOGLE_L4T)
-  // Call kernel optimized for depthwise convolutions using 3x3 filters if
-  // parameters are supported.
-  if (Fast3x3FilterKernelSupported(
-          input_shape, filter_shape, stride_width, stride_height,
-          dilation_width_factor, dilation_height_factor, pad_width, pad_height,
-          depth_multiplier, output_shape, output_shift)) {
-    DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape,
-                           filter_data, bias_shape, bias_data, output_shape,
-                           output_data);
-    return;
-  }
-#endif
-
-  static const int kAccBufferMaxSize = 2048;
-  int32 acc_buffer[kAccBufferMaxSize];
-  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
-  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
-  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
-  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
-                   kAccBufferActualSize);
-  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
-  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
-
-  // row_accum_func will point to the core accumulation function to be used
-  // for this DepthwiseConv op.
-  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
-  row_accum_func_t row_accum_func = nullptr;
-
-#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
-                                        FIXED_DEPTH_MULTIPLIER)           \
-  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
-      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
-      depth_multiplier == FIXED_DEPTH_MULTIPLIER &&                       \
-      dilation_width_factor == 1 && dilation_height_factor == 1) {        \
-    row_accum_func =                                                      \
-        QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
-                                       FIXED_DEPTH_MULTIPLIER>;           \
-  }
-
-#ifdef USE_NEON
-  // We go over our list of kernels by decreasing order of preference
-  // for the cases where multiple kernels could apply.
-
-  // Start with the fastest kernels: AllowStrided=false, fixed input depth.
-
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
-
-  // Next come the strided kernels: AllowStrided=true, fixed input depth.
-  // They are a bit less efficient, but allow stride!=1.
-
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
-
-  // Finally, the kernels allowing a variable input depth,
-  // these are the least efficient but most general kernels.
-
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
-  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
-#endif  // USE_NEON
-
-  // No matching fast kernel found, use slow fallback.
-  if (!row_accum_func) {
-    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
-  }
-
-#undef TFMINI_USE_DEPTHWISECONV_KERNEL
-
-  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
-  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
-  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
-
-  // Now that we have determined row_accum_func, we can start work.
-  uint8* output_ptr = output_data;
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      const int in_y_origin = (out_y * stride_height) - pad_height;
-      const int filter_y_start =
-          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
-                          dilation_height_factor);
-      const int filter_y_end =
-          std::min(filter_height,
-                   (input_height - in_y_origin + dilation_height_factor - 1) /
-                       dilation_height_factor);
-      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
-           out_x_buffer_start += kOutputPixelsInAccBuffer) {
-        const int out_x_buffer_end = std::min(
-            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
-        // We call a 'pixel' a group of activation that share all but the
-        // 'depth'/'channel' coordinate. num_output_pixels is the number of
-        // output pixels that we will accumulate in this loop iteration.
-        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
-        // Initialize our local accumulator with the bias values, so we don't
-        // have to add them later.
-        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
-                                   acc_buffer);
-        // Accumulation loop. Most of the time should be spent in here.
-        for (int filter_y = filter_y_start; filter_y < filter_y_end;
-             ++filter_y) {
-          const int in_y = in_y_origin + dilation_height_factor * filter_y;
-          row_accum_func(
-              stride_width, dilation_width_factor, input_depth, input_width,
-              input_data + in_y * input_height_stride + b * input_batch_stride,
-              input_offset, pad_width, depth_multiplier, filter_width,
-              filter_data + filter_y * filter_height_stride, filter_offset,
-              out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
-        }
-        // Finished accumulating int32 values. Now need to convert them to
-        // the final 8bit form and store them.
-        gemmlowp::ScopedProfilingLabel label("downquantize+store");
-        const int num_output_values = output_depth * num_output_pixels;
-        int i = 0;
-#ifdef USE_NEON
-        using gemmlowp::RoundingDivideByPOT;
-        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
-        const int32x4_t output_activation_min_vec =
-            vdupq_n_s32(output_activation_min);
-        const int32x4_t output_activation_max_vec =
-            vdupq_n_s32(output_activation_max);
-        // Handle 16 values at once.
-        // This allows us to issue 4 mutually independent int32
-        // multiplications (vqrdmulh), which should alleviate most of their
-        // high latency.
-        for (; i <= num_output_values - 16; i += 16) {
-          int32x4_t acc[4];
-          for (int j = 0; j < 4; j++) {
-            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
-          }
-
-          if (!shift_left) {
-            // Fixed-point multiplication.
-            for (int j = 0; j < 4; j++) {
-              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
-            }
-            for (int j = 0; j < 4; j++) {
-              acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
-            }
-          } else {
-            // Fixed-point multiplication.
-            for (int j = 0; j < 4; j++) {
-              acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
-              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
-            }
-          }
-          // Add the output offset.
-          for (int j = 0; j < 4; j++) {
-            acc[j] = vaddq_s32(acc[j], output_offset_vec);
-          }
-          // Apply the activation function.
-          for (int j = 0; j < 4; j++) {
-            acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
-          }
-          for (int j = 0; j < 4; j++) {
-            acc[j] = vminq_s32(acc[j], output_activation_max_vec);
-          }
-          // Saturating cast to uint8 and store to destination.
-          int16x4_t acc_s16[4];
-          for (int j = 0; j < 4; j++) {
-            acc_s16[j] = vqmovn_s32(acc[j]);
-          }
-          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
-          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
-          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
-          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
-          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
-          output_ptr += 16;
-        }
-        // Handle 8 values at once.
-        // Not as good as 16 (now we're only issuing 2 mutually independent
-        // vqrdmulh instructions, so we're probably paying for their high
-        // latency).
-        for (; i <= num_output_values - 8; i += 8) {
-          int32x4_t acc0 = vld1q_s32(acc_buffer + i);
-          int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
-          if (!shift_left) {
-            // Fixed-point multiplication.
-            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
-            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
-            // Rounding right shift.
-            acc0 = RoundingDivideByPOT(acc0, -output_shift);
-            acc1 = RoundingDivideByPOT(acc1, -output_shift);
-          } else {
-            // Fixed-point multiplication.
-            acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
-            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
-
-            acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
-            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
-          }
-          // Add the output offset.
-          acc0 = vaddq_s32(acc0, output_offset_vec);
-          acc1 = vaddq_s32(acc1, output_offset_vec);
-          // Apply the activation function.
-          acc0 = vmaxq_s32(acc0, output_activation_min_vec);
-          acc1 = vmaxq_s32(acc1, output_activation_min_vec);
-          acc0 = vminq_s32(acc0, output_activation_max_vec);
-          acc1 = vminq_s32(acc1, output_activation_max_vec);
-          // Saturating cast to uint8 and store to destination.
-          const int16x4_t acc0_s16 = vqmovn_s32(acc0);
-          const int16x4_t acc1_s16 = vqmovn_s32(acc1);
-          const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
-          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
-          vst1_u8(output_ptr, res_u8);
-          output_ptr += 8;
-        }
-        // Handle 4 values at once. Now we're paying the full price of the
-        // high latency of vqrdmulh. Also, storing only 4 bytes at the end
-        // (without any alignment) can only be done 1 byte at a time.
-        // Yet, that is still worth doing to minimize the amount of leftover
-        // that will have to go through the very slow scalar code.
-        for (; i <= num_output_values - 4; i += 4) {
-          int32x4_t acc = vld1q_s32(acc_buffer + i);
-          if (!shift_left) {
-            // Fixed-point multiplication.
-            acc = vqrdmulhq_n_s32(acc, output_multiplier);
-            // Rounding right shift.
-            acc = RoundingDivideByPOT(acc, -output_shift);
-          } else {
-            // Fixed-point multiplication.
-            acc = vmulq_n_s32(acc, multiplier_power_of_two);
-            acc = vqrdmulhq_n_s32(acc, output_multiplier);
-          }
-          // Add the output offset.
-          acc = vaddq_s32(acc, output_offset_vec);
-          // Apply the activation function.
-          acc = vmaxq_s32(acc, output_activation_min_vec);
-          acc = vminq_s32(acc, output_activation_max_vec);
-          // Saturating cast to uint8 and store to destination.
-          const int16x4_t acc_s16 = vqmovn_s32(acc);
-          const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
-          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
-          vst1_lane_u8(output_ptr + 0, res_u8, 0);
-          vst1_lane_u8(output_ptr + 1, res_u8, 1);
-          vst1_lane_u8(output_ptr + 2, res_u8, 2);
-          vst1_lane_u8(output_ptr + 3, res_u8, 3);
-          output_ptr += 4;
-        }
-#endif  // USE_NEON
-
-        // Handle leftover values, one by one. This is very slow.
-        for (; i < num_output_values; i++) {
-          int32 acc = acc_buffer[i];
-          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
-                                              output_shift);
-          acc += output_offset;
-          acc = std::max(acc, output_activation_min);
-          acc = std::min(acc, output_activation_max);
-          *output_ptr++ = static_cast<uint8>(acc);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace optimized_ops
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
deleted file mode 100644
index 544ef16ce18a36e52acb8813021800189150b13f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+++ /dev/null
@@ -1,369 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <algorithm>
-#include <cmath>
-#include <limits>
-
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/round.h"
-
-namespace tflite {
-
-namespace {
-// These constants are used to manipulate the binary representation of doubles.
-// Double-precision binary64 floating point format is:
-// Bit |  63  |  62-52   |   51-0   |
-//     | Sign | Exponent | Fraction |
-// To avoid 64-bit integers as much as possible, I break this into high and
-// low 32-bit chunks. High is:
-// Bit |  31  |  30-20   |      19-0     |
-//     | Sign | Exponent | High Fraction |
-// Low is:
-// Bit |     31-0     |
-//     | Low Fraction |
-// We then access the components through logical bit-wise operations to
-// extract the parts needed, with the positions and masks derived from the
-// layout shown above.
-constexpr uint64_t kSignMask = 0x8000000000000000LL;
-constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
-constexpr int32_t kExponentShift = 52;
-constexpr int32_t kExponentBias = 1023;
-constexpr uint32_t kExponentIsBadNum = 0x7ff;
-constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
-constexpr uint32_t kFractionShift = 22;
-constexpr uint32_t kFractionRoundingMask = 0x003fffff;
-constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
-}  // namespace
-
-void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
-                        int* shift) {
-  if (double_multiplier == 0.) {
-    *quantized_multiplier = 0;
-    *shift = 0;
-    return;
-  }
-#ifdef TFLITE_EMULATE_FLOAT
-  // If we're trying to avoid the use of floating-point instructions (for
-  // example on microcontrollers) then use an alternative implementation
-  // that only requires integer and bitwise operations. To enable this, you
-  // need to set the define during the build process for your platform.
-  int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
-#else   // TFLITE_EMULATE_FLOAT
-  const double q = std::frexp(double_multiplier, shift);
-  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
-#endif  // TFLITE_EMULATE_FLOAT
-  TFLITE_CHECK(q_fixed <= (1ll << 31));
-  if (q_fixed == (1ll << 31)) {
-    q_fixed /= 2;
-    ++*shift;
-  }
-  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
-  *quantized_multiplier = static_cast<int32_t>(q_fixed);
-}
-
-void QuantizeMultiplierGreaterThanOne(double double_multiplier,
-                                      int32_t* quantized_multiplier,
-                                      int* left_shift) {
-  TFLITE_CHECK_GT(double_multiplier, 1.);
-  QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
-  TFLITE_CHECK_GE(*left_shift, 0);
-}
-
-void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
-                                         int32_t* quantized_multiplier,
-                                         int* left_shift) {
-  TFLITE_CHECK_LT(double_multiplier, 1.);
-  TFLITE_CHECK_GT(double_multiplier, 0.);
-  int shift;
-  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
-  TFLITE_CHECK_LE(shift, 0);
-  *left_shift = shift;
-}
-
-int64_t IntegerFrExp(double input, int* shift) {
-  // Make sure our assumptions about the double layout hold.
-  TFLITE_CHECK_EQ(8, sizeof(double));
-
-  // We want to access the bits of the input double value directly, which is
-  // tricky to do safely, so use a union to handle the casting.
-  union {
-    double double_value;
-    uint64_t double_as_uint;
-  } cast_union;
-  cast_union.double_value = input;
-  const uint64_t u = cast_union.double_as_uint;
-
-  // If the bitfield is all zeros apart from the sign bit, this is a normalized
-  // zero value, so return standard values for this special case.
-  if ((u & ~kSignMask) == 0) {
-    *shift = 0;
-    return 0;
-  }
-
-  // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
-  // the exponent, and distinguished by whether the fractions are zero or
-  // non-zero.
-  const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
-  if (exponent_part == kExponentIsBadNum) {
-    *shift = std::numeric_limits<int>::max();
-    if (u & kFractionMask) {
-      // NaN, so just return zero (with the exponent set to INT_MAX).
-      return 0;
-    } else {
-      // Infinity, so return +/- INT_MAX.
-      if (u & kSignMask) {
-        return std::numeric_limits<int64_t>::min();
-      } else {
-        return std::numeric_limits<int64_t>::max();
-      }
-    }
-  }
-
-  // The shift is fairly easy to extract from the high bits of the double value,
-  // just by masking it out and applying a bias. The std::frexp() implementation
-  // always returns values between 0.5 and 1.0 though, whereas the exponent
-  // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
-  // interface.
-  *shift = (exponent_part - kExponentBias) + 1;
-
-  // There's an implicit high bit in the double format definition, so make sure
-  // we include that at the top, and then reconstruct the rest of the fractional
-  // value from the remaining fragments.
-  int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
-
-  // We're cutting off some bits at the bottom, so to exactly match the standard
-  // frexp implementation here we'll apply rounding by adding one to the least
-  // significant bit of the result if the discarded portion is over half of the
-  // maximum.
-  if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
-    fraction += 1;
-  }
-  // Negate the fraction if the sign bit was set.
-  if (u & kSignMask) {
-    fraction *= -1;
-  }
-
-  return fraction;
-}
-
-double DoubleFromFractionAndShift(int64_t fraction, int shift) {
-  union {
-    double double_value;
-    uint64_t double_as_uint;
-  } result;
-
-  // Detect NaNs and infinities.
-  if (shift == std::numeric_limits<int>::max()) {
-    if (fraction == 0) {
-      return NAN;
-    } else if (fraction > 0) {
-      return INFINITY;
-    } else {
-      return -INFINITY;
-    }
-  }
-
-  // Return a normalized zero for a zero fraction.
-  if (fraction == 0) {
-    result.double_as_uint = 0;
-    return result.double_value;
-  }
-
-  bool is_negative = (fraction < 0);
-  int64_t encoded_fraction = is_negative ? -fraction : fraction;
-  int64_t encoded_shift = (shift - 1);
-  while (encoded_fraction < 0x40000000) {
-    encoded_fraction *= 2;
-    encoded_shift -= 1;
-  }
-  while (encoded_fraction > 0x80000000) {
-    encoded_fraction /= 2;
-    encoded_shift += 1;
-  }
-  encoded_fraction -= 0x40000000;
-  if (encoded_shift < -1022) {
-    encoded_shift = -1023;
-  } else if (encoded_shift > 1022) {
-    encoded_shift = 1023;
-  }
-  encoded_shift += kExponentBias;
-  uint64_t encoded_sign = is_negative ? kSignMask : 0;
-  result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
-                          (encoded_fraction << kFractionShift);
-  return result.double_value;
-}
-
-double IntegerDoubleMultiply(double a, double b) {
-  int a_shift;
-  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
-  int b_shift;
-  const int64_t b_fraction = IntegerFrExp(b, &b_shift);
-  // Detect NaNs and infinities.
-  if (a_shift == std::numeric_limits<int>::max() ||
-      (b_shift == std::numeric_limits<int>::max())) {
-    return NAN;
-  }
-  const int result_shift = a_shift + b_shift + 1;
-  const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
-  return DoubleFromFractionAndShift(result_fraction, result_shift);
-}
-
-int IntegerDoubleCompare(double a, double b) {
-  int a_shift;
-  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
-  int b_shift;
-  const int64_t b_fraction = IntegerFrExp(b, &b_shift);
-
-  // Detect NaNs and infinities.
-  if (a_shift == std::numeric_limits<int>::max() ||
-      (b_shift == std::numeric_limits<int>::max())) {
-    return 1;
-  }
-
-  if ((a_fraction == 0) && (b_fraction < 0)) {
-    return 1;
-  } else if ((a_fraction < 0) && (b_fraction == 0)) {
-    return -1;
-  } else if (a_shift < b_shift) {
-    return -1;
-  } else if (a_shift > b_shift) {
-    return 1;
-  } else if (a_fraction < b_fraction) {
-    return -1;
-  } else if (a_fraction > b_fraction) {
-    return 1;
-  } else {
-    return 0;
-  }
-}
-
-void PreprocessSoftmaxScaling(double beta, double input_scale,
-                              int input_integer_bits,
-                              int32_t* quantized_multiplier, int* left_shift) {
-  // If the overall multiplier (input and beta) is large, then exp() of an
-  // input difference of 1 scaled by this will be large.  In other words, we
-  // can cap the multiplier and know that, when it is used, the output will be
-  // (round to) zero wherever the input is not at the maximum value.
-
-  // If the overall scale is less than one, and input_integer_bits=0, then the
-  // result is double equivalent of Q0.31 (actually with more precision). Thus
-  // this generates a Q(input_integer_bits).(31-input_integer_bits)
-  // representation.
-#ifdef TFLITE_EMULATE_FLOAT
-  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
-  int shift;
-  int64_t fraction = IntegerFrExp(input_beta, &shift);
-  shift += (31 - input_integer_bits);
-  double input_beta_real_multiplier =
-      DoubleFromFractionAndShift(fraction, shift);
-  if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
-    input_beta_real_multiplier = (1ll << 31) - 1.0;
-  }
-#else   // TFLITE_EMULATE_FLOAT
-  const double input_beta_real_multiplier = std::min(
-      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
-#endif  // TFLITE_EMULATE_FLOAT
-
-  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
-                                   quantized_multiplier, left_shift);
-}
-
-void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
-                                    int input_integer_bits,
-                                    int32_t* quantized_multiplier,
-                                    int* left_shift,
-                                    int32_t* reverse_scaling_divisor,
-                                    int* reverse_scaling_left_shift) {
-  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
-                           quantized_multiplier, left_shift);
-
-  // Also calculate what amounts to the inverse scaling factor for the input.
-  const double real_reverse_scaling_divisor =
-      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
-  tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
-                                              reverse_scaling_divisor,
-                                              reverse_scaling_left_shift);
-}
-
-int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
-#ifdef TFLITE_EMULATE_FLOAT
-  int64_t result = (1 << input_integer_bits) - 1;
-  result <<= (31 - input_integer_bits);
-  result >>= input_left_shift;
-  return result;
-#else   // TFLITE_EMULATE_FLOAT
-  const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
-                                    (1ll << (31 - input_integer_bits)) /
-                                    (1ll << input_left_shift);
-  // Tighten bound using floor.  Suppose that we could use the exact value.
-  // After scaling the difference, the result would be at the maximum.  Thus we
-  // must ensure that our value has lower magnitude.
-  return static_cast<int>(std::floor(max_input_rescaled));
-#endif  // TFLITE_EMULATE_FLOAT
-}
-
-void NudgeQuantizationRange(const float min, const float max,
-                            const int quant_min, const int quant_max,
-                            float* nudged_min, float* nudged_max,
-                            float* nudged_scale) {
-  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
-  const float quant_min_float = static_cast<float>(quant_min);
-  const float quant_max_float = static_cast<float>(quant_max);
-  *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
-  const float zero_point_from_min = quant_min_float - min / *nudged_scale;
-  uint16 nudged_zero_point;
-  if (zero_point_from_min < quant_min_float) {
-    nudged_zero_point = static_cast<uint16>(quant_min);
-  } else if (zero_point_from_min > quant_max_float) {
-    nudged_zero_point = static_cast<uint16>(quant_max);
-  } else {
-    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
-  }
-  *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
-  *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
-}
-
-void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
-                       const float nudged_max, const float* input_data,
-                       float* output_data, const float size) {
-  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
-  const float inv_nudged_scale = 1.0f / nudged_scale;
-
-  for (int i = 0; i < size; i++) {
-    const float src_val = input_data[i];
-    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
-    const float clamped_shifted = clamped - nudged_min;
-    const float dst_val =
-        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
-        nudged_min;
-    output_data[i] = dst_val;
-  }
-}
-
-bool CheckedLog2(const float x, int* log2_result) {
-  // Using TfLiteRound instead of std::round and std::log instead of
-  // std::log2 to work around these fuctions being missing in a toolchain
-  // used in some TensorFlow tests as of May 2018.
-  const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
-  const float x_log2_rounded = TfLiteRound(x_log2);
-  const float x_log2_fracpart = x_log2 - x_log2_rounded;
-
-  *log2_result = static_cast<int>(x_log2_rounded);
-  return std::abs(x_log2_fracpart) < 1e-3;
-}
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
deleted file mode 100644
index d74a1bac97f86cba1e63e9141a9d00ded3c63c8f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ /dev/null
@@ -1,280 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
-
-#include <cmath>
-#include <cstdint>
-#include <limits>
-
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-#include "tensorflow/contrib/lite/kernels/internal/round.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-namespace tflite {
-
-// Given the min and max values of a float array, return
-// reasonable quantization parameters to use for this array.
-template <typename T>
-QuantizationParams ChooseQuantizationParams(double rmin, double rmax,
-                                            bool narrow_range) {
-  const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0);
-  const T qmax = std::numeric_limits<T>::max();
-  const double qmin_double = qmin;
-  const double qmax_double = qmax;
-  // 0 should always be a representable value. Let's assume that the initial
-  // min,max range contains 0.
-  TFLITE_CHECK_LE(rmin, 0.);
-  TFLITE_CHECK_GE(rmax, 0.);
-  if (rmin == rmax) {
-    // Special case where the min,max range is a point. Should be {0}.
-    TFLITE_CHECK_EQ(rmin, 0.);
-    TFLITE_CHECK_EQ(rmax, 0.);
-    QuantizationParams quantization_params;
-    quantization_params.zero_point = 0;
-    quantization_params.scale = 0.;
-    return quantization_params;
-  }
-
-  // General case.
-  //
-  // First determine the scale.
-  const double scale = (rmax - rmin) / (qmax_double - qmin_double);
-
-  // Zero-point computation.
-  // First the initial floating-point computation. The zero-point can be
-  // determined from solving an affine equation for any known pair
-  // (real value, corresponding quantized value).
-  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-  // The arithmetic error on the zero point computed from either pair
-  // will be roughly machine_epsilon * (sum of absolute values of terms)
-  // so we want to use the variant that adds the smaller terms.
-  const double zero_point_from_min = qmin_double - rmin / scale;
-  const double zero_point_from_max = qmax_double - rmax / scale;
-  const double zero_point_from_min_error =
-      std::abs(qmin_double) + std::abs(rmin / scale);
-  const double zero_point_from_max_error =
-      std::abs(qmax_double) + std::abs(rmax / scale);
-
-  const double zero_point_double =
-      zero_point_from_min_error < zero_point_from_max_error
-          ? zero_point_from_min
-          : zero_point_from_max;
-
-  // Now we need to nudge the zero point to be an integer
-  // (our zero points are integer, and this is motivated by the requirement
-  // to be able to represent the real value "0" exactly as a quantized value,
-  // which is required in multiple places, for example in Im2col with SAME
-  // padding).
-  T nudged_zero_point = 0;
-  if (zero_point_double < qmin_double) {
-    nudged_zero_point = qmin;
-  } else if (zero_point_double > qmax_double) {
-    nudged_zero_point = qmax;
-  } else {
-    nudged_zero_point = static_cast<T>(round(zero_point_double));
-  }
-  // The zero point should always be in the range of quantized value,
-  // [qmin, qmax].
-  TFLITE_CHECK_GE(nudged_zero_point, qmin);
-  TFLITE_CHECK_LE(nudged_zero_point, qmax);
-
-  // Finally, store the result nudged quantization params.
-  QuantizationParams quantization_params;
-  quantization_params.zero_point = nudged_zero_point;
-  quantization_params.scale = scale;
-  return quantization_params;
-}
-
-template <typename T>
-QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
-  return ChooseQuantizationParams<T>(rmin, rmax, false);
-}
-
-// Converts a floating-point number to an integer. For all inputs x where
-// static_cast<IntOut>(x) is legal according to the C++ standard, the result
-// is identical to that cast (i.e. the result is x with its fractional part
-// truncated whenever that is representable as IntOut).
-//
-// static_cast would cause undefined behavior for the following cases, which
-// have well-defined behavior for this function:
-//
-//  1. If x is NaN, the result is zero.
-//
-//  2. If the truncated form of x is above the representable range of IntOut,
-//     the result is std::numeric_limits<IntOut>::max().
-//
-//  3. If the truncated form of x is below the representable range of IntOut,
-//     the result is std::numeric_limits<IntOut>::min().
-//
-// Note that cases #2 and #3 cover infinities as well as finite numbers.
-//
-// The range of FloatIn must include the range of IntOut, otherwise
-// the results are undefined.
-// TODO(sfeuz): Replace by absl::SafeCast once available.
-template <class IntOut, class FloatIn>
-IntOut SafeCast(FloatIn x) {
-  static_assert(!std::numeric_limits<FloatIn>::is_integer,
-                "FloatIn is integer");
-  static_assert(std::numeric_limits<IntOut>::is_integer,
-                "IntOut is not integer");
-  static_assert(std::numeric_limits<IntOut>::radix == 2, "IntOut is base 2");
-
-  // Special case NaN, for which the logic below doesn't work.
-  if (std::isnan(x)) {
-    return 0;
-  }
-
-  // Negative values all clip to zero for unsigned results.
-  if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
-    return 0;
-  }
-
-  // Handle infinities.
-  if (std::isinf(x)) {
-    return x < 0 ? std::numeric_limits<IntOut>::min()
-                 : std::numeric_limits<IntOut>::max();
-  }
-
-  // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
-  // unless x is zero in which case exp == 0. Note that this implies that the
-  // magnitude of x is strictly less than 2^exp.
-  int exp = 0;
-  std::frexp(x, &exp);
-
-  // Let N be the number of non-sign bits in the representation of IntOut. If
-  // the magnitude of x is strictly less than 2^N, the truncated version of x
-  // is representable as IntOut. The only representable integer for which this
-  // is not the case is kMin for signed types (i.e. -2^N), but that is covered
-  // by the fall-through below.
-  if (exp <= std::numeric_limits<IntOut>::digits) {
-    return x;
-  }
-
-  // Handle numbers with magnitude >= 2^N.
-  return x < 0 ? std::numeric_limits<IntOut>::min()
-               : std::numeric_limits<IntOut>::max();
-}
-
-// Decompose a double multiplier into a Q0.31 int32 representation of its
-// significand, and shift representation of NEGATIVE its exponent ---
-// this is intended as a RIGHT-shift.
-//
-// Restricted to the case where the multiplier < 1 (and non-negative).
-void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
-                                         int32_t* quantized_multiplier,
-                                         int* left_shift);
-
-// Decompose a double multiplier into a Q0.31 int32 representation of its
-// significand, and shift representation of its exponent.
-//
-// Restricted to the case where the multiplier > 1.
-void QuantizeMultiplierGreaterThanOne(double double_multiplier,
-                                      int32_t* quantized_multiplier,
-                                      int* left_shift);
-
-// Decompose a double multiplier into a Q0.31 int32 representation of its
-// significand, and shift representation of its exponent.
-//
-// Handles an arbitrary positive multiplier. The 'shift' output-value is
-// basically the 'floating-point exponent' of the multiplier:
-// Negative for a right-shift (when the multiplier is <1), positive for a
-// left-shift (when the multiplier is >1)
-void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
-                        int* shift);
-
-// Splits a double input value into a returned fraction, and a shift value from
-// the exponent, using only bitwise and integer operations to support
-// microcontrollers and other environments without floating-point support.
-//
-// This is designed to be a replacement for how std::frexp() is used within the
-// QuantizeMultiplier() function, and so has a different signature than the
-// standard version, returning a 64-bit integer rather than a double. This
-// result has a maximum value of 1<<31, with the fraction expressed as a
-// proportion of that maximum.
-//
-// std::frexp() returns NaNs and infinities unmodified, but since we're
-// returning integers that can't represent those values, instead we return
-// a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
-// result of 0 for NaNs, std:numeric_limits<int64_t>::max() for +INFINITY, and
-// std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
-// result in return values that end up truncating some bits at the end,
-// reflecting the loss of precision inherent in denormalization.
-int64_t IntegerFrExp(double input, int* shift);
-
-// Converts an integer fraction in the format produced by IntegerFrExp (where
-// 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
-// IEEE binary64 double format result. The implementation uses only integer and
-// bitwise operators, so no floating point hardware support or emulation is
-// needed. This is here so quantized operations can run non-time-critical
-// preparation calculations on microcontrollers and other platforms without
-// float support.
-double DoubleFromFractionAndShift(int64_t fraction, int shift);
-
-// Performs a multiplication of two numbers in double format, using only integer
-// and bitwise instructions. This is aimed at supporting housekeeping functions
-// for quantized operations on microcontrollers without floating-point hardware.
-double IntegerDoubleMultiply(double a, double b);
-
-// Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
-// greater than b. It is implemented using only integer and logical instructions
-// so that it can be easily run on microcontrollers for quantized operations.
-int IntegerDoubleCompare(double a, double b);
-
-// This first creates a multiplier in a double equivalent of
-// Q(input_integer_bits).(31-input_integer_bits) representation, with extra
-// precision in the double's fractional bits.  It then splits the result into
-// significand and exponent.
-void PreprocessSoftmaxScaling(double beta, double input_scale,
-                              int input_integer_bits,
-                              int32_t* quantized_multiplier, int* left_shift);
-// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
-void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
-                                    int input_integer_bits,
-                                    int32_t* quantized_multiplier,
-                                    int* left_shift,
-                                    int32_t* reverse_scaling_divisor,
-                                    int* reverse_scaling_left_shift);
-// Calculate the largest input that will result in a within-bounds intermediate
-// result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
-// it must not overflow before we reduce the value by multiplication by the
-// input multiplier.  The negative radius is used as the minimum difference in
-// Softmax.
-int CalculateInputRadius(int input_integer_bits, int input_left_shift);
-
-// Nudges a min/max quantization range to ensure zero is zero.
-// Gymnastics with nudged zero point is to ensure that real zero maps to
-// an integer, which is required for e.g. zero-padding in convolutional layers.
-// Outputs nudged_min, nudged_max, nudged_scale.
-void NudgeQuantizationRange(const float min, const float max,
-                            const int quant_min, const int quant_max,
-                            float* nudged_min, float* nudged_max,
-                            float* nudged_scale);
-
-// Fake quantizes (quantizes and dequantizes) input_data using the scale,
-// nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code
-// in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor.
-void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
-                       const float nudged_max, const float* input_data,
-                       float* output_data, const float size);
-
-// If x is approximately a power of two (with any positive or negative
-// exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise
-// returns false.
-bool CheckedLog2(const float x, int* log2_result);
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
deleted file mode 100644
index 11224270a4b17f4299703eaaa0dfd49b42b2a321..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
-
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-namespace tflite {
-namespace reference_ops {
-
-inline void DepthwiseConv(
-    const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const float* input_data, const RuntimeShape& filter_shape,
-    const float* filter_data, const RuntimeShape& bias_shape,
-    const float* bias_data, const RuntimeShape& output_shape,
-    float* output_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int depth_multiplier = params.depth_multiplier;
-  const float output_activation_min = params.float_activation_min;
-  const float output_activation_max = params.float_activation_max;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
-
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        for (int ic = 0; ic < input_depth; ++ic) {
-          for (int m = 0; m < depth_multiplier; m++) {
-            const int oc = m + ic * depth_multiplier;
-            const int in_x_origin = (out_x * stride_width) - pad_width;
-            const int in_y_origin = (out_y * stride_height) - pad_height;
-            float total = 0.f;
-            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  float input_value =
-                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
-                  float filter_value = filter_data[Offset(
-                      filter_shape, 0, filter_y, filter_x, oc)];
-                  total += (input_value * filter_value);
-                }
-              }
-            }
-            float bias_value = 0.0f;
-            if (bias_data) {
-              bias_value = bias_data[oc];
-            }
-            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
-                ActivationFunctionWithMinMax(total + bias_value,
-                                             output_activation_min,
-                                             output_activation_max);
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // end namespace reference_ops
-}  // end namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
deleted file mode 100644
index eab28e6c84c77fd9a12a97203e05b79c3ab5fb31..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
-
-#include <algorithm>
-
-#include "fixedpoint/fixedpoint.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-namespace tflite {
-namespace reference_ops {
-
-inline void DepthwiseConv(
-    const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int depth_multiplier = params.depth_multiplier;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
-
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        for (int ic = 0; ic < input_depth; ++ic) {
-          for (int m = 0; m < depth_multiplier; m++) {
-            const int oc = m + ic * depth_multiplier;
-            const int in_x_origin = (out_x * stride_width) - pad_width;
-            const int in_y_origin = (out_y * stride_height) - pad_height;
-            int32 acc = 0;
-            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  int32 input_val =
-                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
-                  int32 filter_val = filter_data[Offset(
-                      filter_shape, 0, filter_y, filter_x, oc)];
-                  acc +=
-                      (filter_val + filter_offset) * (input_val + input_offset);
-                }
-              }
-            }
-            if (bias_data) {
-              acc += bias_data[oc];
-            }
-            acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
-                                                output_shift);
-            acc += output_offset;
-            acc = std::max(acc, output_activation_min);
-            acc = std::min(acc, output_activation_max);
-            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
-                static_cast<uint8>(acc);
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // end namespace reference_ops
-}  // end namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/internal/resize_bilinear_test.cc
deleted file mode 100644
index 15df31f75a69b9c0076eb4978e06707b5966417d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/resize_bilinear_test.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <algorithm>
-#include <cmath>
-#include <vector>
-
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/test_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-namespace tflite {
-namespace {
-template <typename T>
-void TestOneResizeBilinear(int batch, int depth, int input_width,
-                           int input_height, int output_width,
-                           int output_height, float error_threshold) {
-  RuntimeShape input_dims_inference({batch, input_height, input_width, depth});
-  RuntimeShape output_dims_inference(
-      {batch, output_height, output_width, depth});
-
-  const int input_buffer_size = input_dims_inference.FlatSize();
-  const int output_buffer_size = output_dims_inference.FlatSize();
-
-  std::vector<T> input_data(input_buffer_size, 0);
-  std::vector<T> reference_output_data(output_buffer_size, 0);
-  // Initialize the output data with something other than zero, so we can catch
-  // issue with kernels failing to initialize the output.
-  std::vector<T> output_data(output_buffer_size, 3);
-
-  const T min_amplitude = static_cast<T>(0);
-  const T max_amplitude = static_cast<T>(255);
-  FillRandom(&input_data, min_amplitude, max_amplitude);
-
-  RuntimeShape output_size_dims({1, 1, 1, 2});
-  std::vector<int32> output_size_data = {output_height, output_width};
-
-  tflite::ResizeBilinearParams op_params;
-  op_params.align_corners = false;
-
-  reference_ops::ResizeBilinear(op_params, input_dims_inference,
-                                input_data.data(), output_size_dims,
-                                output_size_data.data(), output_dims_inference,
-                                reference_output_data.data());
-  optimized_ops::ResizeBilinear(
-      op_params, input_dims_inference, input_data.data(), output_size_dims,
-      output_size_data.data(), output_dims_inference, output_data.data());
-
-  double sum_diff = 0;
-  float max_abs_val = 0;
-  for (int i = 0; i < output_buffer_size; i++) {
-    sum_diff += std::abs(static_cast<float>(output_data[i]) -
-                         static_cast<float>(reference_output_data[i]));
-    max_abs_val = std::max(
-        max_abs_val, std::abs(static_cast<float>(reference_output_data[i])));
-  }
-
-  if (sum_diff != 0.f) {
-    const float mean_diff = static_cast<float>(sum_diff / output_buffer_size);
-    const float relative_error = std::abs(mean_diff) / max_abs_val;
-    ASSERT_LT(relative_error, error_threshold);
-  }
-}
-
-TEST(ResizeBilinear, TestResizeBilinear8Bit) {
-  const int kTestsToRun = 100 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
-    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
-    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
-
-    TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height,
-                                 output_width, output_height, 0.025);
-  }
-}
-
-TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
-  const int kTestsToRun = 100 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
-    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
-    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int output_width = input_width * 2;
-    const int output_height = input_height * 2;
-
-    TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height,
-                                 output_width, output_height, 1e-5);
-  }
-}
-
-TEST(ResizeBilinear, TestResizeBilinear) {
-  const int kTestsToRun = 100 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
-    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
-    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
-
-    TestOneResizeBilinear<float>(batch, depth, input_width, input_height,
-                                 output_width, output_height, 1e-5);
-  }
-}
-
-TEST(ResizeBilinear2x2, TestResizeBilinear) {
-  const int kTestsToRun = 100 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
-    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
-    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
-    const int output_width = input_width * 2;
-    const int output_height = input_height * 2;
-
-    TestOneResizeBilinear<float>(batch, depth, input_width, input_height,
-                                 output_width, output_height, 1e-5);
-  }
-}
-}  // namespace
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
deleted file mode 100644
index f4181b18a8f46fd9bef4b81a210a6b8134a4e9d0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-
-#ifndef USE_NEON
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-#define USE_NEON
-#endif  //  defined(__ARM_NEON__) || defined(__ARM_NEON)
-#endif  //  USE_NEON
-
-#ifdef USE_NEON
-#include "tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h"
-#else
-#include "tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h"
-#endif  // USE_NEON
diff --git a/tensorflow/contrib/lite/kernels/internal/test_util.cc b/tensorflow/contrib/lite/kernels/internal/test_util.cc
deleted file mode 100644
index 75d568ae3aaf9b186ffda0a1415f75ffb3e8c46b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/test_util.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/internal/test_util.h"
-
-#include <cmath>
-#include <iterator>
-
-namespace tflite {
-
-// this is a copied from an internal function in propagate_fixed_sizes.cc
-bool ComputeConvSizes(const RuntimeShape& input_shape, int output_depth,
-                      int filter_width, int filter_height, int stride,
-                      int dilation_width_factor, int dilation_height_factor,
-                      PaddingType padding_type, RuntimeShape* output_shape,
-                      int* pad_width, int* pad_height) {
-  const int input_width = input_shape.Dims(2);
-  const int input_height = input_shape.Dims(1);
-  const int batch = input_shape.Dims(0);
-
-  int dilated_filter_width = dilation_width_factor * (filter_width - 1) + 1;
-  int dilated_filter_height = dilation_height_factor * (filter_height - 1) + 1;
-
-  int output_height = 0;
-  int output_width = 0;
-  if (padding_type == PaddingType::kValid) {
-    output_height = (input_height + stride - dilated_filter_height) / stride;
-    output_width = (input_width + stride - dilated_filter_width) / stride;
-  } else if (padding_type == PaddingType::kSame) {
-    output_height = (input_height + stride - 1) / stride;
-    output_width = (input_width + stride - 1) / stride;
-  } else {
-    return false;
-  }
-
-  if (output_width <= 0 || output_height <= 0) {
-    return false;
-  }
-
-  *pad_height = std::max(
-      0, ((output_height - 1) * stride + dilated_filter_height - input_height) /
-             2);
-  *pad_width = std::max(
-      0,
-      ((output_width - 1) * stride + dilated_filter_width - input_width) / 2);
-
-  output_shape->BuildFrom({batch, output_height, output_width, output_depth});
-  return true;
-}
-
-std::mt19937& RandomEngine() {
-  static std::mt19937 engine;
-  return engine;
-}
-
-int UniformRandomInt(int min, int max) {
-  std::uniform_int_distribution<int> dist(min, max);
-  return dist(RandomEngine());
-}
-
-float UniformRandomFloat(float min, float max) {
-  std::uniform_real_distribution<float> dist(min, max);
-  return dist(RandomEngine());
-}
-
-int ExponentialRandomPositiveInt(float percentile, int percentile_val,
-                                 int max_val) {
-  const float lambda =
-      -std::log(1.f - percentile) / static_cast<float>(percentile_val);
-  std::exponential_distribution<float> dist(lambda);
-  float val;
-  do {
-    val = dist(RandomEngine());
-  } while (!val || !std::isfinite(val) || val > max_val);
-  return static_cast<int>(std::ceil(val));
-}
-
-float ExponentialRandomPositiveFloat(float percentile, float percentile_val,
-                                     float max_val) {
-  const float lambda =
-      -std::log(1.f - percentile) / static_cast<float>(percentile_val);
-  std::exponential_distribution<float> dist(lambda);
-  float val;
-  do {
-    val = dist(RandomEngine());
-  } while (!std::isfinite(val) || val > max_val);
-  return val;
-}
-
-void FillRandom(std::vector<float>* vec, float min, float max) {
-  std::uniform_real_distribution<float> dist(min, max);
-  auto gen = std::bind(dist, RandomEngine());
-  std::generate(std::begin(*vec), std::end(*vec), gen);
-}
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/test_util.h b/tensorflow/contrib/lite/kernels/internal/test_util.h
deleted file mode 100644
index e4a383bedfc034a5398a3a1a78082e49c38f4afe..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/test_util.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TEST_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TEST_UTIL_H_
-
-#include <algorithm>
-#include <functional>
-#include <iterator>
-#include <limits>
-#include <random>
-#include <vector>
-
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-namespace tflite {
-
-// Computes output and padding dimensions.
-bool ComputeConvSizes(const RuntimeShape& input_shape, int output_depth,
-                      int filter_width, int filter_height, int stride,
-                      int dilation_width_factor, int dilation_height_factor,
-                      PaddingType padding_type, RuntimeShape* output_shape,
-                      int* pad_width, int* pad_height);
-
-// Returns a mt19937 random engine.
-std::mt19937& RandomEngine();
-
-// Returns a random integer uniformly distributed between |min| and |max|.
-int UniformRandomInt(int min, int max);
-
-// Returns a random float uniformly distributed between |min| and |max|.
-float UniformRandomFloat(float min, float max);
-
-// Returns a random element in |v|.
-template <typename T>
-const T& RandomElement(const std::vector<T>& v) {
-  return v[UniformRandomInt(0, v.size() - 1)];
-}
-
-// Returns a random exponentially distributed integer.
-int ExponentialRandomPositiveInt(float percentile, int percentile_val,
-                                 int max_val);
-
-// Returns a random exponentially distributed float.
-float ExponentialRandomPositiveFloat(float percentile, float percentile_val,
-                                     float max_val);
-
-// Fills a vector with random floats between |min| and |max|.
-void FillRandom(std::vector<float>* vec, float min, float max);
-
-// Fills a vector with random numbers between |min| and |max|.
-template <typename T>
-void FillRandom(std::vector<T>* vec, T min, T max) {
-  std::uniform_int_distribution<T> dist(min, max);
-  auto gen = std::bind(dist, RandomEngine());
-  std::generate(std::begin(*vec), std::end(*vec), gen);
-}
-
-// Fills a vector with random numbers.
-template <typename T>
-void FillRandom(std::vector<T>* vec) {
-  FillRandom(vec, std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
-}
-
-template <typename T>
-void FillRandom(typename std::vector<T>::iterator begin_it,
-                typename std::vector<T>::iterator end_it, T min, T max) {
-  std::uniform_int_distribution<T> dist(min, max);
-  auto gen = std::bind(dist, RandomEngine());
-  std::generate(begin_it, end_it, gen);
-}
-
-// Fill with a "skyscraper" pattern, in which there is a central section (across
-// the depth) with higher values than the surround.
-template <typename T>
-void FillRandomSkyscraper(std::vector<T>* vec, int depth,
-                          double middle_proportion, uint8 middle_min,
-                          uint8 sides_max) {
-  for (auto base_it = std::begin(*vec); base_it != std::end(*vec);
-       base_it += depth) {
-    auto left_it = base_it + std::ceil(0.5 * depth * (1.0 - middle_proportion));
-    auto right_it =
-        base_it + std::ceil(0.5 * depth * (1.0 + middle_proportion));
-    FillRandom(base_it, left_it, std::numeric_limits<T>::min(), sides_max);
-    FillRandom(left_it, right_it, middle_min, std::numeric_limits<T>::max());
-    FillRandom(right_it, base_it + depth, std::numeric_limits<T>::min(),
-               sides_max);
-  }
-}
-
-}  // namespace tflite
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TEST_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
deleted file mode 100644
index 64a39dd2a2d3d473fbd7ba266133d9992e5a4672..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ /dev/null
@@ -1,1017 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
-
-#include <algorithm>
-#include <cstring>
-
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-
-namespace tflite {
-
-enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
-enum class PaddingType : uint8 { kNone, kSame, kValid };
-
-struct PaddingValues {
-  int16 width;
-  int16 height;
-};
-
-// This enumeration allows for non-default formats for the weights array
-// of a fully-connected operator, allowing the use of special optimized
-// runtime paths.
-enum class FullyConnectedWeightsFormat : uint8 {
-  // Default format (flat 2D layout, the inner contiguous dimension
-  // is input_depth, the outer non-contiguous dimension is output_depth)
-  kDefault,
-  // Summary: optimized layout for fast CPU runtime implementation,
-  // aimed specifically at ARM CPUs at the moment, and specialized for
-  // 8-bit quantized layers.
-  //
-  // The use case we're concerned with here is: 8-bit quantization,
-  // large weights matrix that doesn't fit in cache (e.g. 4096x2048 in
-  // a key application that drove this), very small batch size (e.g. 1 -- 4).
-  //
-  // Even with 8-bit quantization of weights, the performance of memory
-  // accesses to the weights can become the dominant issue when
-  // the batch size is small, so each weight value is used in only a few
-  // arithmetic ops, i.e. the fully-connected node has a low arithmetic
-  // intensity. The specific issues that arise are of three kinds:
-  // (1) One may, ideally, max out DRAM bandwidth, i.e. be truly memory
-  //     bound. That's the "good" issue to run into.
-  // (2) One may run into sub-optimal pre-fetching: the data hasn't been
-  //     prefetched into the cache by the time we need it.
-  // (3) One may run into cache aliasing: multiple values that are
-  //     pre-fetched, alias each other in the L1 cache (which typically
-  //     has only 4-way set associativity in ARM CPUs) and thus evict
-  //     each other before we get to using them.
-  //
-  // The point of this shuffling is to avoid issues (2) and (3) so that
-  // we get as fast as possible given only the hard constraint (1).
-  // This is achieved by turning the difficulty into a solution: the
-  // difficulty, that each value loaded from memory is used only in
-  // one kernel iteration, making this operation memory-intensive, hints at
-  // the solution, of shuffling the weights so that they are stored in the
-  // exact order as the kernel needs to load them, so that the memory
-  // accesses made by the kernel are trivial. This solves (2) because the
-  // trivial memory access pattern allows the CPU's automatic prefetching
-  // to perform very well (no need even for preload instructions), and this
-  // solves (3) because the values being loaded concurrently are now
-  // contiguous in the address space, thus don't alias each other in the cache.
-  //
-  // On ARM, we typically want our kernel to process a 4x16 block of weights
-  // at a time, because:
-  //   - 16 is the number of bytes in a NEON register.
-  //   - 4 is how many rows we need to handle concurrently in the kernel in
-  //     order to have sufficient mutual independence of instructions to
-  //     maximize arithmetic throughput.
-  //
-  // Finally, the 'Int8' part in the name refers to the fact that this
-  // weights format has each weights value encoded as a signed int8 value,
-  // even if the data type of the weights buffer is uint8.  This is intended
-  // to save runtime kernels the effort to have to XOR the top bit of these
-  // bytes before using them in signed arithmetic, see this file for more
-  // explanations on the 'signed int8 trick' in matrix multiplication kernels:
-  //
-  //   tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
-  //
-  kShuffled4x16Int8,
-};
-
-// Quantization parameters, determining the mapping of quantized values
-// to real values (i.e. determining how quantized values are mathematically
-// interpreted).
-//
-// The correspondence is as follows:
-//
-//   real_value = scale * (quantized_value - zero_point);
-//
-// In other words, zero_point designates which quantized value corresponds to
-// the real 0 value, and scale designates the difference between the real values
-// corresponding to consecutive quantized values differing by 1.
-struct QuantizationParams {
-  int32 zero_point = 0;
-  double scale = 0.0;
-};
-
-template <int N>
-struct Dims {
-  int sizes[N];
-  int strides[N];
-};
-
-class RuntimeShape {
- public:
-  // Shapes with dimensions up to 4 are stored directly in the structure, while
-  // larger shapes are separately allocated.
-  static constexpr int kMaxSmallSize = 4;
-
-  RuntimeShape& operator=(RuntimeShape const&) = delete;
-
-  RuntimeShape() : size_(0) {}
-
-  explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
-    if (dimensions_count > kMaxSmallSize) {
-#ifdef TF_LITE_STATIC_MEMORY
-      TFLITE_CHECK(false && "No shape resizing supported on this platform");
-#else   // TF_LITE_STATIC_MEMORY
-      dims_pointer_ = new int32[dimensions_count];
-#endif  // TF_LITE_STATIC_MEMORY
-    }
-  }
-
-  RuntimeShape(int shape_size, int32 value) : size_(0) {
-    Resize(shape_size);
-    for (int i = 0; i < shape_size; ++i) {
-      SetDim(i, value);
-    }
-  }
-
-  RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
-    ReplaceWith(dimensions_count, dims_data);
-  }
-
-  RuntimeShape(const std::initializer_list<int> init_list) : size_(0) {
-    BuildFrom(init_list);
-  }
-
-  // Avoid using this constructor.  We should be able to delete it when C++17
-  // rolls out.
-  RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
-    if (size_ > kMaxSmallSize) {
-      dims_pointer_ = new int32[size_];
-    }
-    std::memcpy(DimsData(), other.DimsData(), sizeof(int32) * size_);
-  }
-
-  bool operator==(const RuntimeShape& comp) const {
-    return this->size_ == comp.size_ &&
-           std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32)) == 0;
-  }
-
-  ~RuntimeShape() {
-    if (size_ > kMaxSmallSize) {
-#ifdef TF_LITE_STATIC_MEMORY
-      TFLITE_CHECK(false && "No shape resizing supported on this platform");
-#else   // TF_LITE_STATIC_MEMORY
-      delete[] dims_pointer_;
-#endif  // TF_LITE_STATIC_MEMORY
-    }
-  }
-
-  inline int32 DimensionsCount() const { return size_; }
-  inline int32 Dims(int i) const {
-    TFLITE_DCHECK_GE(i, 0);
-    TFLITE_DCHECK_LT(i, size_);
-    return size_ > kMaxSmallSize ? dims_pointer_[i] : dims_[i];
-  }
-  inline void SetDim(int i, int32 val) {
-    TFLITE_DCHECK_GE(i, 0);
-    TFLITE_DCHECK_LT(i, size_);
-    if (size_ > kMaxSmallSize) {
-      dims_pointer_[i] = val;
-    } else {
-      dims_[i] = val;
-    }
-  }
-
-  inline int32* DimsData() {
-    return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
-  }
-  inline const int32* DimsData() const {
-    return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
-  }
-  // The caller must ensure that the shape is no bigger than 4-D.
-  inline const int32* DimsDataUpTo4D() const { return dims_; }
-
-  inline void Resize(int dimensions_count) {
-    if (size_ > kMaxSmallSize) {
-#ifdef TF_LITE_STATIC_MEMORY
-      TFLITE_CHECK(false && "No shape resizing supported on this platform");
-#else   // TF_LITE_STATIC_MEMORY
-      delete[] dims_pointer_;
-#endif  // TF_LITE_STATIC_MEMORY
-    }
-    size_ = dimensions_count;
-    if (dimensions_count > kMaxSmallSize) {
-#ifdef TF_LITE_STATIC_MEMORY
-      TFLITE_CHECK(false && "No shape resizing supported on this platform");
-#else   // TF_LITE_STATIC_MEMORY
-      dims_pointer_ = new int32[dimensions_count];
-#endif  // TF_LITE_STATIC_MEMORY
-    }
-  }
-
-  inline void ReplaceWith(int dimensions_count, const int32* dims_data) {
-    Resize(dimensions_count);
-    int32* dst_dims = DimsData();
-    std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32));
-  }
-
-  template <typename T>
-  inline void BuildFrom(const T& src_iterable) {
-    const int dimensions_count =
-        std::distance(src_iterable.begin(), src_iterable.end());
-    Resize(dimensions_count);
-    int32* data = DimsData();
-    for (auto it : src_iterable) {
-      *data = it;
-      ++data;
-    }
-  }
-
-  // This will probably be factored out. Old code made substantial use of 4-D
-  // shapes, and so this function is used to extend smaller shapes. Note that
-  // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be
-  // reduced, and (b) some kernels are stricly 4-D, but then the shapes of their
-  // inputs should already be 4-D, so this function should not be needed.
-  inline static RuntimeShape ExtendedShape(int new_shape_size,
-                                           const RuntimeShape& shape) {
-    return RuntimeShape(new_shape_size, shape, 1);
-  }
-
-  inline void BuildFrom(const std::initializer_list<int> init_list) {
-    BuildFrom<const std::initializer_list<int>>(init_list);
-  }
-
-  // Returns the total count of elements, that is the size when flattened into a
-  // vector.
-  inline int FlatSize() const {
-    int buffer_size = 1;
-    const int* dims_data = DimsData();
-    for (int i = 0; i < size_; i++) {
-      const int dim = dims_data[i];
-      TFLITE_DCHECK_GE(dim, 1);
-      buffer_size *= dim;
-    }
-    return buffer_size;
-  }
-
-  bool operator!=(const RuntimeShape& comp) const { return !((*this) == comp); }
-
- private:
-  // For use only by ExtendedShape(), written to guarantee (return-value) copy
-  // elision in C++17.
-  // This creates a shape padded to the desired size with the specified value.
-  RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value)
-      : size_(0) {
-    TFLITE_CHECK_GE(new_shape_size, shape.DimensionsCount());
-    TFLITE_CHECK_LE(new_shape_size, kMaxSmallSize);
-    Resize(new_shape_size);
-    const int size_increase = new_shape_size - shape.DimensionsCount();
-    for (int i = 0; i < size_increase; ++i) {
-      SetDim(i, pad_value);
-    }
-    std::memcpy(DimsData() + size_increase, shape.DimsData(),
-                sizeof(int32) * shape.DimensionsCount());
-  }
-
-  int32 size_;
-  union {
-    int32 dims_[kMaxSmallSize];
-    int32* dims_pointer_;
-  };
-};
-
-// Converts inference-style shape to legacy tflite::Dims<4>.
-inline tflite::Dims<4> ToRuntimeDims(const tflite::RuntimeShape& array_shape) {
-  tflite::Dims<4> result;
-  const int dimensions_count = array_shape.DimensionsCount();
-  TFLITE_CHECK_LE(dimensions_count, 4);
-  int cum_prod = 1;
-  for (int i = 0; i < 4; i++) {
-    const int new_dim =
-        (i < dimensions_count) ? array_shape.Dims(dimensions_count - 1 - i) : 1;
-    result.sizes[i] = new_dim;
-    result.strides[i] = cum_prod;
-    cum_prod *= new_dim;
-  }
-  return result;
-}
-
-// TODO(b/80418076): Move to legacy ops file, update invocations.
-inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
-  return RuntimeShape(
-      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
-}
-
-// Gets next index to iterate through a multidimensional array.
-inline bool NextIndex(const int num_dims, const int* dims, int* current) {
-  if (num_dims == 0) {
-    return false;
-  }
-  TFLITE_DCHECK(dims != nullptr);
-  TFLITE_DCHECK(current != nullptr);
-  int carry = 1;
-  for (int idx = num_dims - 1; idx >= 0; --idx) {
-    int current_val = current[idx] + carry;
-    TFLITE_DCHECK_GE(dims[idx], current_val);
-    if (dims[idx] == current_val) {
-      current[idx] = 0;
-    } else {
-      current[idx] = current_val;
-      carry = 0;
-      break;
-    }
-  }
-  return (carry == 0);
-}
-
-// Gets offset of index if reducing on axis. When reducing, the flattened offset
-// will not change, if the input index changes on the given axis. For example,
-// if you have a 3D tensor and you are reducing to 2D by eliminating axis 0,
-// then index (0, 1, 2) and index (1, 1, 2) will map to the same flattened
-// offset.
-// TODO(kanlig): uses Dims to represent dimensions.
-inline size_t ReducedOutputOffset(const int num_dims, const int* dims,
-                                  const int* index, const int num_axis,
-                                  const int* axis) {
-  if (num_dims == 0) {
-    return 0;
-  }
-  TFLITE_DCHECK(dims != nullptr);
-  TFLITE_DCHECK(index != nullptr);
-  size_t offset = 0;
-  for (int idx = 0; idx < num_dims; ++idx) {
-    // if we need to skip this axis
-    bool is_axis = false;
-    if (axis != nullptr) {
-      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
-        if (idx == axis[axis_idx]) {
-          is_axis = true;
-          break;
-        }
-      }
-    }
-    if (!is_axis) {
-      offset = offset * static_cast<size_t>(dims[idx]) +
-               static_cast<size_t>(index[idx]);
-    }
-  }
-  return offset;
-}
-
-inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
-  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4);
-  const int* dims_data = shape.DimsDataUpTo4D();
-  TFLITE_DCHECK(i0 >= 0 && i0 < dims_data[0]);
-  TFLITE_DCHECK(i1 >= 0 && i1 < dims_data[1]);
-  TFLITE_DCHECK(i2 >= 0 && i2 < dims_data[2]);
-  TFLITE_DCHECK(i3 >= 0 && i3 < dims_data[3]);
-  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
-}
-
-inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
-  TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
-  TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
-  TFLITE_DCHECK(i2 >= 0 && i2 < dims.sizes[2]);
-  TFLITE_DCHECK(i3 >= 0 && i3 < dims.sizes[3]);
-  return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
-         i3 * dims.strides[3];
-}
-
-inline int Offset(const Dims<4>& dims, int* index) {
-  return Offset(dims, index[0], index[1], index[2], index[3]);
-}
-
-inline int Offset(const RuntimeShape& shape, int* index) {
-  return Offset(shape, index[0], index[1], index[2], index[3]);
-}
-
-// Get array size, DCHECKing that the dim index is in range.
-//
-// Note that this will be phased out with Dims<4>, since RuntimeShape::Dims()
-// already performs this check.
-template <int N>
-int ArraySize(const Dims<N>& array, int index) {
-  TFLITE_DCHECK(index >= 0 && index < N);
-  return array.sizes[index];
-}
-
-// Get common array size, DCHECKing that they all agree.
-template <typename ArrayType1, typename ArrayType2>
-int MatchingArraySize(const ArrayType1& array1, int index1,
-                      const ArrayType2& array2, int index2) {
-  TFLITE_DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
-  return ArraySize(array1, index1);
-}
-
-template <typename ArrayType1, typename ArrayType2, typename... Args>
-int MatchingArraySize(const ArrayType1& array1, int index1,
-                      const ArrayType2& array2, int index2, Args... args) {
-  TFLITE_DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
-  return MatchingArraySize(array1, index1, args...);
-}
-
-// Get common shape dim, DCHECKing that they all agree.
-inline int MatchingDim(const RuntimeShape& shape1, int index1,
-                       const RuntimeShape& shape2, int index2) {
-  TFLITE_DCHECK_EQ(shape1.Dims(index1), shape2.Dims(index2));
-  return shape1.Dims(index1);
-}
-
-template <typename... Args>
-int MatchingDim(const RuntimeShape& shape1, int index1,
-                const RuntimeShape& shape2, int index2, Args... args) {
-  TFLITE_DCHECK_EQ(shape1.Dims(index1), shape2.Dims(index2));
-  return MatchingDim(shape1, index1, args...);
-}
-
-// Will be phased out with Dims<4>, replaced by RuntimeShape::FlatSize().
-template <int N>
-inline int FlatSize(const Dims<N>& dims) {
-  int flat_size = 1;
-  for (int i = 0; i < N; ++i) {
-    flat_size *= dims.sizes[i];
-  }
-  return flat_size;
-}
-
-TFLITE_DEPRECATED("Prefer FlatSize.")
-inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
-  return FlatSize(dims);
-}
-
-// Flat size calculation, checking that dimensions match with one or more other
-// arrays.
-inline int MatchingFlatSize(const RuntimeShape& shape,
-                            const RuntimeShape& check_shape_0) {
-  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
-  const int dims_count = shape.DimensionsCount();
-  for (int i = 0; i < dims_count; ++i) {
-    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
-  }
-  return shape.FlatSize();
-}
-
-inline int MatchingFlatSize(const RuntimeShape& shape,
-                            const RuntimeShape& check_shape_0,
-                            const RuntimeShape& check_shape_1) {
-  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
-  const int dims_count = shape.DimensionsCount();
-  for (int i = 0; i < dims_count; ++i) {
-    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
-  }
-  return MatchingFlatSize(shape, check_shape_1);
-}
-
-inline int MatchingFlatSize(const RuntimeShape& shape,
-                            const RuntimeShape& check_shape_0,
-                            const RuntimeShape& check_shape_1,
-                            const RuntimeShape& check_shape_2) {
-  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
-  const int dims_count = shape.DimensionsCount();
-  for (int i = 0; i < dims_count; ++i) {
-    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
-  }
-  return MatchingFlatSize(shape, check_shape_1, check_shape_2);
-}
-
-inline int MatchingFlatSize(const RuntimeShape& shape,
-                            const RuntimeShape& check_shape_0,
-                            const RuntimeShape& check_shape_1,
-                            const RuntimeShape& check_shape_2,
-                            const RuntimeShape& check_shape_3) {
-  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
-  const int dims_count = shape.DimensionsCount();
-  for (int i = 0; i < dims_count; ++i) {
-    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
-  }
-  return MatchingFlatSize(shape, check_shape_1, check_shape_2, check_shape_3);
-}
-
-// Flat size calculation, checking that dimensions match with one or more other
-// arrays.
-template <int N>
-inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) {
-  for (int i = 0; i < N; ++i) {
-    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
-  }
-  return FlatSize(dims);
-}
-
-template <int N>
-inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
-                            const Dims<N>& check_dims_1) {
-  for (int i = 0; i < N; ++i) {
-    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
-  }
-  return MatchingFlatSize(dims, check_dims_1);
-}
-
-template <int N>
-inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
-                            const Dims<N>& check_dims_1,
-                            const Dims<N>& check_dims_2) {
-  for (int i = 0; i < N; ++i) {
-    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
-  }
-  return MatchingFlatSize(dims, check_dims_1, check_dims_2);
-}
-
-template <int N>
-inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
-                            const Dims<N>& check_dims_1,
-                            const Dims<N>& check_dims_2,
-                            const Dims<N>& check_dims_3) {
-  for (int i = 0; i < N; ++i) {
-    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
-  }
-  return MatchingFlatSize(dims, check_dims_1, check_dims_2, check_dims_3);
-}
-
-// Data is required to be contiguous, and so many operators can use either the
-// full array flat size or the flat size with one dimension skipped (commonly
-// the depth).
-template <int N>
-inline int FlatSizeSkipDim(const Dims<N>& dims, int skip_dim) {
-  TFLITE_DCHECK(skip_dim >= 0 && skip_dim < N);
-  int flat_size = 1;
-  for (int i = 0; i < N; ++i) {
-    flat_size *= (i == skip_dim) ? 1 : dims.sizes[i];
-  }
-  return flat_size;
-}
-
-// A combination of MatchingFlatSize() and FlatSizeSkipDim().
-template <int N>
-inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
-                                   const Dims<N>& check_dims_0) {
-  for (int i = 0; i < N; ++i) {
-    if (i != skip_dim) {
-      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
-    }
-  }
-  return FlatSizeSkipDim(dims, skip_dim);
-}
-
-template <int N>
-inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
-                                   const Dims<N>& check_dims_0,
-                                   const Dims<N>& check_dims_1) {
-  for (int i = 0; i < N; ++i) {
-    if (i != skip_dim) {
-      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
-    }
-  }
-  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1);
-}
-
-template <int N>
-inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
-                                   const Dims<N>& check_dims_0,
-                                   const Dims<N>& check_dims_1,
-                                   const Dims<N>& check_dims_2) {
-  for (int i = 0; i < N; ++i) {
-    if (i != skip_dim) {
-      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
-    }
-  }
-  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2);
-}
-
-template <int N>
-inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
-                                   const Dims<N>& check_dims_0,
-                                   const Dims<N>& check_dims_1,
-                                   const Dims<N>& check_dims_2,
-                                   const Dims<N>& check_dims_3) {
-  for (int i = 0; i < N; ++i) {
-    if (i != skip_dim) {
-      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
-    }
-  }
-  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2,
-                                 check_dims_3);
-}
-
-// Data is required to be contiguous, and so many operators can use either the
-// full array flat size or the flat size with one dimension skipped (commonly
-// the depth).
-inline int FlatSizeSkipDim(const RuntimeShape& shape, int skip_dim) {
-  const int dims_count = shape.DimensionsCount();
-  TFLITE_DCHECK(skip_dim >= 0 && skip_dim < dims_count);
-  const auto* dims_data = shape.DimsData();
-  int flat_size = 1;
-  for (int i = 0; i < dims_count; ++i) {
-    flat_size *= (i == skip_dim) ? 1 : dims_data[i];
-  }
-  return flat_size;
-}
-
-// A combination of MatchingFlatSize() and FlatSizeSkipDim().
-inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim,
-                                   const RuntimeShape& check_shape_0) {
-  const int dims_count = shape.DimensionsCount();
-  for (int i = 0; i < dims_count; ++i) {
-    if (i != skip_dim) {
-      TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
-    }
-  }
-  return FlatSizeSkipDim(shape, skip_dim);
-}
-
-inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim,
-                                   const RuntimeShape& check_shape_0,
-                                   const RuntimeShape& check_shape_1) {
-  const int dims_count = shape.DimensionsCount();
-  for (int i = 0; i < dims_count; ++i) {
-    if (i != skip_dim) {
-      TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
-    }
-  }
-  return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1);
-}
-
-inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim,
-                                   const RuntimeShape& check_shape_0,
-                                   const RuntimeShape& check_shape_1,
-                                   const RuntimeShape& check_shape_2) {
-  const int dims_count = shape.DimensionsCount();
-  for (int i = 0; i < dims_count; ++i) {
-    if (i != skip_dim) {
-      TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
-    }
-  }
-  return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1, check_shape_2);
-}
-
-inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim,
-                                   const RuntimeShape& check_shape_0,
-                                   const RuntimeShape& check_shape_1,
-                                   const RuntimeShape& check_shape_2,
-                                   const RuntimeShape& check_shape_3) {
-  const int dims_count = shape.DimensionsCount();
-  for (int i = 0; i < dims_count; ++i) {
-    if (i != skip_dim) {
-      TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
-    }
-  }
-  return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1, check_shape_2,
-                                 check_shape_3);
-}
-
-template <int N>
-bool IsPackedWithoutStrides(const Dims<N>& dims) {
-  int expected_stride = 1;
-  for (int d = 0; d < N; d++) {
-    if (dims.strides[d] != expected_stride) return false;
-    expected_stride *= dims.sizes[d];
-  }
-  return true;
-}
-
-template <int N>
-void ComputeStrides(Dims<N>* dims) {
-  dims->strides[0] = 1;
-  for (int d = 1; d < N; d++) {
-    dims->strides[d] = dims->strides[d - 1] * dims->sizes[d - 1];
-  }
-}
-
-enum class BroadcastableOpCategory : uint8 {
-  kNone,
-  kNonBroadcast,               // Matching input shapes.
-  kFirstInputBroadcastsFast,   // Fivefold nested loops.
-  kSecondInputBroadcastsFast,  // Fivefold nested loops.
-  kGenericBroadcast,           // Fall-back.
-};
-
-struct MinMax {
-  float min;
-  float max;
-};
-static_assert(sizeof(MinMax) == 8, "");
-
-struct ActivationParams {
-  FusedActivationFunctionType activation_type;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
-};
-
-// For Add, Sub, Mul ops.
-struct ArithmeticParams {
-  // Shape dependent / common to data / op types.
-  BroadcastableOpCategory broadcast_category;
-  // uint8 inference params.
-  int32 input1_offset;
-  int32 input2_offset;
-  int32 output_offset;
-  int32 output_multiplier;
-  int output_shift;
-  // Add / Sub, not Mul, uint8 inference params.
-  int left_shift;
-  int32 input1_multiplier;
-  int input1_shift;
-  int32 input2_multiplier;
-  int input2_shift;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
-  // float activation params.
-  float float_activation_min;
-  float float_activation_max;
-
-  // Processed output dimensions.
-  // Let input "a" be the one that broadcasts in the faster-changing dimension.
-  // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and
-  // {b0, b1, b2, b3, b4},
-  // broadcast_shape[4] = b0 = a0.
-  // broadcast_shape[3] = b1; a1 = 1.
-  // broadcast_shape[2] = b2 = a2.
-  // broadcast_shape[1] = a3; b3 = 1.
-  // broadcast_shape[0] = b4 = a4.
-  int broadcast_shape[5];
-};
-
-struct ConcatenationParams {
-  int8 axis;
-  const int32* input_zeropoint;
-  const float* input_scale;
-  uint16 inputs_count;
-  int32 output_zeropoint;
-  float output_scale;
-};
-
-struct ComparisonParams {
-  // uint8 inference params.
-  int left_shift;
-  int32 input1_offset;
-  int32 input1_multiplier;
-  int input1_shift;
-  int32 input2_offset;
-  int32 input2_multiplier;
-  int input2_shift;
-  // Shape dependent / common to inference types.
-  bool is_broadcast;
-};
-
-struct ConvParams {
-  PaddingType padding_type;
-  PaddingValues padding_values;
-  // TODO(starka): This was just "stride", so check that width+height is OK.
-  int16 stride_width;
-  int16 stride_height;
-  int16 dilation_width_factor;
-  int16 dilation_height_factor;
-  // uint8 inference params.
-  // TODO(b/65838351): Use smaller types if appropriate.
-  int32 input_offset;
-  int32 weights_offset;
-  int32 output_offset;
-  int32 output_multiplier;
-  int output_shift;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
-  // float activation params.
-  float float_activation_min;
-  float float_activation_max;
-};
-
-struct DepthToSpaceParams {
-  int32 block_size;
-};
-
-struct DepthwiseParams {
-  PaddingType padding_type;
-  PaddingValues padding_values;
-  int16 stride_width;
-  int16 stride_height;
-  int16 dilation_width_factor;
-  int16 dilation_height_factor;
-  int16 depth_multiplier;
-  // uint8 inference params.
-  // TODO(b/65838351): Use smaller types if appropriate.
-  int32 input_offset;
-  int32 weights_offset;
-  int32 output_offset;
-  int32 output_multiplier;
-  int output_shift;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
-  // float activation params.
-  float float_activation_min;
-  float float_activation_max;
-};
-
-struct DequantizationParams {
-  double scale;
-  int32 zero_point;
-};
-
-struct FakeQuantParams {
-  MinMax minmax;
-  int32 num_bits;
-};
-
-struct FullyConnectedParams {
-  // uint8 inference params.
-  // TODO(b/65838351): Use smaller types if appropriate.
-  int32 input_offset;
-  int32 weights_offset;
-  int32 output_offset;
-  int32 output_multiplier;
-  int output_shift;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
-  // float activation params.
-  float float_activation_min;
-  float float_activation_max;
-  FullyConnectedWeightsFormat weights_format;
-};
-
-struct GatherParams {
-  int16 input_rank;
-  int16 axis;
-};
-
-struct L2NormalizationParams {
-  // uint8 inference params.
-  int32 input_zero_point;
-};
-
-struct LocalResponseNormalizationParams {
-  int32 range;
-  double bias;
-  double alpha;
-  double beta;
-};
-
-struct LogisticParams {
-  // uint8 inference params.
-  int32 input_zero_point;
-  int32 input_range_radius;
-  int32 input_multiplier;
-  int input_left_shift;
-};
-
-struct LstmCellParams {
-  int32 weights_zero_point;
-  int32 accum_multiplier;
-  int accum_shift;
-  int state_integer_bits;
-};
-
-struct MeanParams {
-  int8 axis_count;
-  int16 axis[4];
-};
-
-struct PackParams {
-  int8 axis;
-  const int32* input_zeropoint;
-  const float* input_scale;
-  uint16 inputs_count;
-  int32 output_zeropoint;
-  float output_scale;
-};
-
-struct PadParams {
-  int8 left_padding_count;
-  int32 left_padding[4];
-  int8 right_padding_count;
-  int32 right_padding[4];
-};
-
-struct PoolParams {
-  FusedActivationFunctionType activation;
-  PaddingType padding_type;
-  PaddingValues padding_values;
-  int stride_height;
-  int stride_width;
-  int filter_height;
-  int filter_width;
-  // uint8, etc, activation params.
-  int32 quantized_activation_min;
-  int32 quantized_activation_max;
-  // float activation params.
-  float float_activation_min;
-  float float_activation_max;
-};
-
-struct ReshapeParams {
-  int8 shape_count;
-  int32 shape[4];
-};
-
-struct ResizeBilinearParams {
-  bool align_corners;
-};
-
-struct SliceParams {
-  int8 begin_count;
-  int32 begin[4];
-  int8 size_count;
-  int32 size[4];
-};
-
-struct SoftmaxParams {
-  // beta is not really used (not a Tensorflow parameter) and not implemented
-  // for LogSoftmax.
-  double beta;
-  // uint8 inference params.  Used even when beta defaults to 1.0.
-  int32 input_multiplier;
-  int32 input_left_shift;
-  // Reverse scaling is only used by LogSoftmax.
-  int32 reverse_scaling_divisor;
-  int32 reverse_scaling_right_shift;
-  int diff_min;
-};
-
-struct SpaceToBatchParams {
-  // "Zero" padding for uint8 means padding with the output offset.
-  int32 output_offset;
-};
-
-struct SpaceToDepthParams {
-  int32 block_size;
-};
-
-struct SplitParams {
-  // Graphs that split into, say, 2000 nodes are encountered.  The indices in
-  // OperatorEdges are of type uint16.
-  uint16 num_split;
-  int16 axis;
-};
-
-struct SqueezeParams {
-  int8 squeeze_dims_count;
-  int32 squeeze_dims[4];
-};
-
-struct StridedSliceParams {
-  int8 start_indices_count;
-  int16 start_indices[4];
-  int8 stop_indices_count;
-  int16 stop_indices[4];
-  int8 strides_count;
-  int16 strides[4];
-
-  int16 begin_mask;
-  int16 ellipsis_mask;
-  int16 end_mask;
-  int16 new_axis_mask;
-  int16 shrink_axis_mask;
-};
-
-struct TanhParams {
-  int32 input_zero_point;
-  int32 input_range_radius;
-  int32 input_multiplier;
-  int input_left_shift;
-};
-
-struct TransposeParams {
-  int8 perm_count;
-  int32 perm[4];
-};
-
-struct UnpackParams {
-  uint16 num_split;
-  int16 axis;
-};
-
-template <typename P>
-inline void SetActivationParams(float min, float max, P* params) {
-  params->float_activation_min = min;
-  params->float_activation_max = max;
-}
-
-template <typename P>
-inline void SetActivationParams(int32 min, int32 max, P* params) {
-  params->quantized_activation_min = min;
-  params->quantized_activation_max = max;
-}
-
-template <typename P>
-inline void GetActivationParams(const P& params, int32* min, int32* max) {
-  *min = params.quantized_activation_min;
-  *max = params.quantized_activation_max;
-}
-
-template <typename P>
-inline void GetActivationParams(const P& params, float* min, float* max) {
-  *min = params.float_activation_min;
-  *max = params.float_activation_max;
-}
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/layer_norm_lstm_test.cc b/tensorflow/contrib/lite/kernels/layer_norm_lstm_test.cc
deleted file mode 100644
index 479f6a7d3c0cdb969f73a83fa28cd1c79940f807..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/layer_norm_lstm_test.cc
+++ /dev/null
@@ -1,664 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for TFLite Layer Norm LSTM op.
-
-#include <memory>
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-
-TfLiteRegistration* Register_LAYER_NORM_LSTM();
-
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class LayerNormLSTMOpModel : public SingleOpModel {
- public:
-  LayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
-                       bool use_cifg, bool use_peephole,
-                       bool use_projection_weights, bool use_projection_bias,
-                       float cell_clip, float proj_clip,
-                       const std::vector<std::vector<int>>& input_shapes,
-                       const TensorType& weight_type = TensorType_FLOAT32)
-      : n_batch_(n_batch),
-        n_input_(n_input),
-        n_cell_(n_cell),
-        n_output_(n_output) {
-    input_ = AddInput(TensorType_FLOAT32);
-
-    if (use_cifg) {
-      input_to_input_weights_ = AddNullInput();
-    } else {
-      input_to_input_weights_ = AddInput(weight_type);
-    }
-
-    input_to_forget_weights_ = AddInput(weight_type);
-    input_to_cell_weights_ = AddInput(weight_type);
-    input_to_output_weights_ = AddInput(weight_type);
-
-    if (use_cifg) {
-      recurrent_to_input_weights_ = AddNullInput();
-    } else {
-      recurrent_to_input_weights_ = AddInput(weight_type);
-    }
-
-    recurrent_to_forget_weights_ = AddInput(weight_type);
-    recurrent_to_cell_weights_ = AddInput(weight_type);
-    recurrent_to_output_weights_ = AddInput(weight_type);
-
-    if (use_peephole) {
-      if (use_cifg) {
-        cell_to_input_weights_ = AddNullInput();
-      } else {
-        cell_to_input_weights_ = AddInput(weight_type);
-      }
-      cell_to_forget_weights_ = AddInput(weight_type);
-      cell_to_output_weights_ = AddInput(weight_type);
-    } else {
-      cell_to_input_weights_ = AddNullInput();
-      cell_to_forget_weights_ = AddNullInput();
-      cell_to_output_weights_ = AddNullInput();
-    }
-
-    input_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-    forget_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-    cell_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-    output_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-
-    if (use_cifg) {
-      input_gate_bias_ = AddNullInput();
-    } else {
-      input_gate_bias_ = AddInput(TensorType_FLOAT32);
-    }
-    forget_gate_bias_ = AddInput(TensorType_FLOAT32);
-    cell_bias_ = AddInput(TensorType_FLOAT32);
-    output_gate_bias_ = AddInput(TensorType_FLOAT32);
-
-    if (use_projection_weights) {
-      projection_weights_ = AddInput(weight_type);
-      if (use_projection_bias) {
-        projection_bias_ = AddInput(TensorType_FLOAT32);
-      } else {
-        projection_bias_ = AddNullInput();
-      }
-    } else {
-      projection_weights_ = AddNullInput();
-      projection_bias_ = AddNullInput();
-    }
-
-    // Adding the 2 state tensors.
-    output_state_ =
-        AddInput(TensorData{TensorType_FLOAT32, {n_output_ * n_batch_}}, true);
-    cell_state_ =
-        AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true);
-
-    output_ = AddOutput(TensorType_FLOAT32);
-
-    // Set up and pass in custom options using flexbuffer.
-    flexbuffers::Builder fbb;
-    fbb.Map([&]() {
-      fbb.Int("cell_clip", cell_clip);
-      fbb.Int("proj_clip", proj_clip);
-      fbb.String("fused_activation_function", "TANH");
-    });
-    fbb.Finish();
-    SetCustomOp("LAYER_NORM_LSTM", fbb.GetBuffer(), Register_LAYER_NORM_LSTM);
-    BuildInterpreter(input_shapes);
-  }
-
-  void SetInputToInputWeights(std::initializer_list<float> f) {
-    PopulateTensor(input_to_input_weights_, f);
-  }
-
-  void SetInputToForgetWeights(std::initializer_list<float> f) {
-    PopulateTensor(input_to_forget_weights_, f);
-  }
-
-  void SetInputToCellWeights(std::initializer_list<float> f) {
-    PopulateTensor(input_to_cell_weights_, f);
-  }
-
-  void SetInputToOutputWeights(std::initializer_list<float> f) {
-    PopulateTensor(input_to_output_weights_, f);
-  }
-
-  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
-    PopulateTensor(recurrent_to_input_weights_, f);
-  }
-
-  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
-    PopulateTensor(recurrent_to_forget_weights_, f);
-  }
-
-  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
-    PopulateTensor(recurrent_to_cell_weights_, f);
-  }
-
-  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
-    PopulateTensor(recurrent_to_output_weights_, f);
-  }
-
-  void SetCellToInputWeights(std::initializer_list<float> f) {
-    PopulateTensor(cell_to_input_weights_, f);
-  }
-
-  void SetCellToForgetWeights(std::initializer_list<float> f) {
-    PopulateTensor(cell_to_forget_weights_, f);
-  }
-
-  void SetCellToOutputWeights(std::initializer_list<float> f) {
-    PopulateTensor(cell_to_output_weights_, f);
-  }
-
-  void SetInputLayerNormWeights(std::initializer_list<float> f) {
-    PopulateTensor(input_layer_norm_weights_, f);
-  }
-
-  void SetForgetLayerNormWeights(std::initializer_list<float> f) {
-    PopulateTensor(forget_layer_norm_weights_, f);
-  }
-
-  void SetCellLayerNormWeights(std::initializer_list<float> f) {
-    PopulateTensor(cell_layer_norm_weights_, f);
-  }
-
-  void SetOutputLayerNormWeights(std::initializer_list<float> f) {
-    PopulateTensor(output_layer_norm_weights_, f);
-  }
-
-  void SetInputGateBias(std::initializer_list<float> f) {
-    PopulateTensor(input_gate_bias_, f);
-  }
-
-  void SetForgetGateBias(std::initializer_list<float> f) {
-    PopulateTensor(forget_gate_bias_, f);
-  }
-
-  void SetCellBias(std::initializer_list<float> f) {
-    PopulateTensor(cell_bias_, f);
-  }
-
-  void SetOutputGateBias(std::initializer_list<float> f) {
-    PopulateTensor(output_gate_bias_, f);
-  }
-
-  void SetProjectionWeights(std::initializer_list<float> f) {
-    PopulateTensor(projection_weights_, f);
-  }
-
-  void SetProjectionBias(std::initializer_list<float> f) {
-    PopulateTensor(projection_bias_, f);
-  }
-
-  void SetInput(int offset, const float* begin, const float* end) {
-    PopulateTensor(input_, offset, const_cast<float*>(begin),
-                   const_cast<float*>(end));
-  }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-
-  int num_inputs() { return n_input_; }
-  int num_outputs() { return n_output_; }
-  int num_cells() { return n_cell_; }
-  int num_batches() { return n_batch_; }
-
- protected:
-  int input_;
-  int input_to_input_weights_;
-  int input_to_forget_weights_;
-  int input_to_cell_weights_;
-  int input_to_output_weights_;
-
-  int recurrent_to_input_weights_;
-  int recurrent_to_forget_weights_;
-  int recurrent_to_cell_weights_;
-  int recurrent_to_output_weights_;
-
-  int cell_to_input_weights_;
-  int cell_to_forget_weights_;
-  int cell_to_output_weights_;
-
-  int input_layer_norm_weights_;
-  int forget_layer_norm_weights_;
-  int cell_layer_norm_weights_;
-  int output_layer_norm_weights_;
-
-  int input_gate_bias_;
-  int forget_gate_bias_;
-  int cell_bias_;
-  int output_gate_bias_;
-
-  int projection_weights_;
-  int projection_bias_;
-
-  int output_state_;
-  int cell_state_;
-
-  int output_;
-
-  int n_batch_;
-  int n_input_;
-  int n_cell_;
-  int n_output_;
-};
-
-class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
- public:
-  HybridLayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
-                             bool use_cifg, bool use_peephole,
-                             bool use_projection_weights,
-                             bool use_projection_bias, float cell_clip,
-                             float proj_clip,
-                             const std::vector<std::vector<int>>& input_shapes)
-      : LayerNormLSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg,
-                             use_peephole, use_projection_weights,
-                             use_projection_bias, cell_clip, proj_clip,
-                             input_shapes, TensorType_UINT8) {}
-
-  void SetInputToInputWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
-  }
-
-  void SetInputToForgetWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
-  }
-
-  void SetInputToCellWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
-  }
-
-  void SetInputToOutputWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
-  }
-
-  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
-  }
-
-  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
-  }
-
-  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
-  }
-
-  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
-  }
-
-  void SetCellToInputWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
-  }
-
-  void SetCellToForgetWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
-  }
-
-  void SetCellToOutputWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
-  }
-
-  void SetInputLayerNormWeights(std::initializer_list<float> f) {
-    PopulateTensor(input_layer_norm_weights_, f);
-  }
-
-  void SetForgetLayerNormWeights(std::initializer_list<float> f) {
-    PopulateTensor(forget_layer_norm_weights_, f);
-  }
-
-  void SetCellLayerNormWeights(std::initializer_list<float> f) {
-    PopulateTensor(cell_layer_norm_weights_, f);
-  }
-
-  void SetOutputLayerNormWeights(std::initializer_list<float> f) {
-    PopulateTensor(output_layer_norm_weights_, f);
-  }
-
-  void SetProjectionWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(projection_weights_, f);
-  }
-};
-
-class BaseLayerNormLstmTest : public ::testing::Test {
- protected:
-  // Weights of the Layer Norm LSTM model. Some are optional.
-  std::initializer_list<float> input_to_input_weights_;
-  std::initializer_list<float> input_to_cell_weights_;
-  std::initializer_list<float> input_to_forget_weights_;
-  std::initializer_list<float> input_to_output_weights_;
-  std::initializer_list<float> input_gate_bias_;
-  std::initializer_list<float> cell_gate_bias_;
-  std::initializer_list<float> forget_gate_bias_;
-  std::initializer_list<float> output_gate_bias_;
-  std::initializer_list<float> recurrent_to_input_weights_;
-  std::initializer_list<float> recurrent_to_cell_weights_;
-  std::initializer_list<float> recurrent_to_forget_weights_;
-  std::initializer_list<float> recurrent_to_output_weights_;
-  std::initializer_list<float> cell_to_input_weights_;
-  std::initializer_list<float> cell_to_forget_weights_;
-  std::initializer_list<float> cell_to_output_weights_;
-  std::initializer_list<float> input_layer_norm_weights_;
-  std::initializer_list<float> forget_layer_norm_weights_;
-  std::initializer_list<float> cell_layer_norm_weights_;
-  std::initializer_list<float> output_layer_norm_weights_;
-  std::initializer_list<float> projection_weights_;
-
-  // Layer Norm LSTM input is stored as num_batch x num_inputs vector.
-  std::vector<std::vector<float>> layer_norm_lstm_input_;
-
-  // Compares output up to tolerance to the result of the layer_norm_lstm given
-  // the input.
-  void VerifyGoldens(const std::vector<std::vector<float>>& input,
-                     const std::vector<std::vector<float>>& output,
-                     LayerNormLSTMOpModel* layer_norm_lstm,
-                     float tolerance = 1e-5) {
-    const int num_batches = input.size();
-    EXPECT_GT(num_batches, 0);
-    const int num_inputs = layer_norm_lstm->num_inputs();
-    EXPECT_GT(num_inputs, 0);
-    const int input_sequence_size = input[0].size() / num_inputs;
-    EXPECT_GT(input_sequence_size, 0);
-    for (int i = 0; i < input_sequence_size; ++i) {
-      for (int b = 0; b < num_batches; ++b) {
-        const float* batch_start = input[b].data() + i * num_inputs;
-        const float* batch_end = batch_start + num_inputs;
-
-        layer_norm_lstm->SetInput(b * layer_norm_lstm->num_inputs(),
-                                  batch_start, batch_end);
-      }
-
-      layer_norm_lstm->Invoke();
-
-      const int num_outputs = layer_norm_lstm->num_outputs();
-      std::vector<float> expected;
-      for (int b = 0; b < num_batches; ++b) {
-        const float* golden_start_batch = output[b].data() + i * num_outputs;
-        const float* golden_end_batch = golden_start_batch + num_outputs;
-        expected.insert(expected.end(), golden_start_batch, golden_end_batch);
-      }
-      EXPECT_THAT(layer_norm_lstm->GetOutput(),
-                  ElementsAreArray(ArrayFloatNear(expected, tolerance)));
-    }
-  }
-};
-
-class NoCifgPeepholeProjectionNoClippingLayerNormLstmTest
-    : public BaseLayerNormLstmTest {
-  void SetUp() override {
-    input_to_input_weights_ = {0.5,  0.6,  0.7,  -0.8, -0.9, 0.1,  0.2,
-                               0.3,  -0.4, 0.5,  -0.8, 0.7,  -0.6, 0.5,
-                               -0.4, -0.5, -0.4, -0.3, -0.2, -0.1};
-
-    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
-                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
-                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
-
-    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
-                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
-                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
-
-    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
-                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
-                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
-
-    input_gate_bias_ = {0.03, 0.15, 0.22, 0.38};
-
-    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
-
-    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
-
-    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
-
-    recurrent_to_input_weights_ = {-0.2, -0.3, 0.4,  0.1,  -0.5, 0.9,
-                                   -0.2, -0.3, -0.7, 0.05, -0.2, -0.6};
-
-    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
-                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
-
-    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
-                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
-
-    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
-                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
-
-    cell_to_input_weights_ = {0.05, 0.1, 0.25, 0.15};
-
-    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
-
-    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
-
-    input_layer_norm_weights_ = {0.1, 0.2, 0.3, 0.5};
-    forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3};
-    cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8};
-    output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5};
-
-    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
-                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
-
-    layer_norm_lstm_input_ = {
-        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
-         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
-         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
-         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
-
-        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
-         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
-         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
-         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
-    };
-  }
-};
-
-TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       LayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  LayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/false, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {n_cell, n_input},  // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {n_cell, n_output},  // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {n_cell},  // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {n_cell},  // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {n_cell},  // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_);
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  // Verify the final output.
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.0244077, 0.128027, -0.00170918,  // seq 0
-          0.0137642, 0.140751, 0.0395835,    // seq 1
-          -0.00459231, 0.155278, 0.0837377,  // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.00692428, 0.0848741, 0.063445,  // seq 0
-          -0.00403912, 0.139963, 0.072681,   // seq 1
-          0.00752706, 0.161903, 0.0561371,   // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       HybridLayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  HybridLayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/false, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {n_cell, n_input},  // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {n_cell, n_output},  // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {n_cell},  // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {n_cell},  // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {n_cell},  // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_);
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.0244576, 0.127847, -0.00181765,  // seq 0
-          0.0137518, 0.140892, 0.0402234,    // seq 1
-          -0.0048839, 0.155096, 0.0840309,   // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.00728636, 0.0843957, 0.0634786,  // seq 0
-          -0.00448382, 0.139278, 0.0737372,   // seq 1
-          0.00734616, 0.161793, 0.0560238,    // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-}  // namespace
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
deleted file mode 100644
index 5b996d00bcbfb1b5db8e8362eb906da7e4d1c612..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ /dev/null
@@ -1,1023 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/gemm_support.h"
-#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace lstm {
-
-struct OpData {
-  // Which kernel type to use. Full kernel (20 inputs) or basic kernel
-  // (5 inputs).
-  TfLiteLSTMKernelType kernel_type;
-
-  // These fields are only used by full kernel.
-  int activation_state_tensor_index;
-  int cell_state_tensor_index;
-  int scratch_tensor_index;
-};
-
-// For full inputs kernel (20-inputs).
-namespace full {
-
-// Input Tensors of size {n_batch, n_input}
-constexpr int kInputTensor = 0;
-
-// Input weight tensors of size: {n_cell, n_input}
-constexpr int kInputToInputWeightsTensor = 1;  // Optional
-constexpr int kInputToForgetWeightsTensor = 2;
-constexpr int kInputToCellWeightsTensor = 3;
-constexpr int kInputToOutputWeightsTensor = 4;
-
-// Recurrent weight tensors of size {n_cell, n_output}
-constexpr int kRecurrentToInputWeightsTensor = 5;  // Optional
-constexpr int kRecurrentToForgetWeightsTensor = 6;
-constexpr int kRecurrentToCellWeightsTensor = 7;
-constexpr int kRecurrentToOutputWeightsTensor = 8;
-
-// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kCellToInputWeightsTensor = 9;    // Optional
-constexpr int kCellToForgetWeightsTensor = 10;  // Optional
-constexpr int kCellToOutputWeightsTensor = 11;  // Optional
-
-// Gates bias tensors of size {n_cell}
-constexpr int kInputGateBiasTensor = 12;  // Optional
-constexpr int kForgetGateBiasTensor = 13;
-constexpr int kCellGateBiasTensor = 14;
-constexpr int kOutputGateBiasTensor = 15;
-
-// Projection weight tensor of size {n_output, n_cell}
-constexpr int kProjectionWeightsTensor = 16;  // Optional
-// Projection bias tensor of size {n_output}
-constexpr int kProjectionBiasTensor = 17;  // Optional
-
-// These state tensors are defined as variable tensors, and will be modified by
-// this op.
-constexpr int kInputActivationStateTensor = 18;
-constexpr int kInputCellStateTensor = 19;
-
-// Output tensors.
-constexpr int kOutputTensor = 0;
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* op_data = new OpData();
-  op_data->kernel_type = kTfLiteLSTMFullKernel;
-  context->AddTensors(context, /*tensors_to_add=*/7,
-                      &op_data->scratch_tensor_index);
-  return op_data;
-}
-
-// Check that input tensor dimensions matches with each other.
-TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
-                                        TfLiteNode* node, int n_input,
-                                        int n_output, int n_cell) {
-  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
-
-  // Making sure clipping parameters have valid values.
-  // == 0 means no clipping
-  //  > 0 means clipping
-  TF_LITE_ENSURE(context, params->cell_clip >= 0);
-  TF_LITE_ENSURE(context, params->proj_clip >= 0);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  if (input_to_input_weights != nullptr) {
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
-  }
-
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, kInputToCellWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  if (recurrent_to_input_weights != nullptr) {
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
-                      n_cell);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
-                      n_output);
-  }
-
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
-                    n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
-                    n_output);
-
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, kRecurrentToCellWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
-                    n_output);
-
-  // We make sure the input-gate's parameters are either both present (regular
-  // LSTM) or not at all (CIFG-LSTM).
-  const bool cifg_weights_all_or_none =
-      ((input_to_input_weights != nullptr) &&
-       (recurrent_to_input_weights != nullptr)) ||
-      ((input_to_input_weights == nullptr) &&
-       (recurrent_to_input_weights == nullptr));
-  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  if (cell_to_input_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  if (cell_to_forget_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-  if (cell_to_output_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
-  }
-
-  // Making sure the peephole weights are there all or none.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool peephole_weights_all_or_none =
-      ((cell_to_input_weights != nullptr || use_cifg) &&
-       (cell_to_forget_weights != nullptr) &&
-       (cell_to_output_weights != nullptr)) ||
-      ((cell_to_input_weights == nullptr) &&
-       (cell_to_forget_weights == nullptr) &&
-       (cell_to_output_weights == nullptr));
-  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
-
-  // Make sure the input gate bias is present only when not a CIFG-LSTM.
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  if (use_cifg) {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
-  } else {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, kForgetGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, kOutputGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  if (projection_weights != nullptr) {
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
-  }
-
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-  if (projection_bias != nullptr) {
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
-  }
-
-  // Making sure the projection tensors are consistent:
-  // 1) If projection weight is not present, then projection bias should not be
-  // present.
-  // 2) If projection weight is present, then projection bias is optional.
-  // TODO(ghodrat): make sure this is correct.
-  const bool projection_tensors_consistent =
-      ((projection_weights != nullptr) || (projection_bias == nullptr));
-  TF_LITE_ENSURE(context, projection_tensors_consistent == true);
-
-  return kTfLiteOk;
-}
-
-// Resize the output, state tensors based on the sizes of the input tensors.
-// Allocate a temporary scratch tensor. Also check that the sizes of the input
-// tensors match each other.
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 20);
-
-  op_data->activation_state_tensor_index =
-      node->inputs->data[kInputActivationStateTensor];
-  op_data->cell_state_tensor_index = node->inputs->data[kInputCellStateTensor];
-
-  // Inferring batch size, number of outputs and number of cells from the
-  // input tensors.
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE(context, input->dims->size > 1);
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-
-  const TfLiteTensor* input_to_output_weights =
-      GetInput(context, node, kInputToOutputWeightsTensor);
-  const int n_cell = input_to_output_weights->dims->data[0];
-  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* recurrent_to_output_weights =
-      GetInput(context, node, kRecurrentToOutputWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
-                    n_cell);
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Check that input tensor dimensions matches with each other.
-  TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input,
-                                                        n_output, n_cell));
-
-  // Get the pointer to output, activation_state and cell_state tensors.
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  TfLiteTensor* activation_state =
-      &context->tensors[op_data->activation_state_tensor_index];
-  TfLiteTensor* cell_state =
-      &context->tensors[op_data->cell_state_tensor_index];
-
-  // Check the shape of input state tensors.
-  // These tensor may be 1D or 2D. It's fine as long as the total size is
-  // correct.
-  TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output);
-  TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell);
-
-  // Resize the output tensors.
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(2);
-  output_size->data[0] = n_batch;
-  output_size->data[1] = n_output;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, output, output_size));
-
-  // The weights are of consistent type, so it suffices to check one.
-  // TODO(mirkov): create a utility/macro for this check, so all Ops can use it.
-  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
-                             input->type == kTfLiteFloat32);
-
-  TfLiteIntArrayFree(node->temporaries);
-  if (is_hybrid_op) {
-    node->temporaries = TfLiteIntArrayCreate(7);
-  } else {
-    node->temporaries = TfLiteIntArrayCreate(1);
-  }
-  node->temporaries->data[0] = op_data->scratch_tensor_index;
-
-  // Create a scratch buffer tensor.
-  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
-  scratch_buffer->type = input->type;
-  scratch_buffer->allocation_type = kTfLiteArenaRw;
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
-  scratch_buffer_size->data[0] = n_batch;
-  if (use_cifg) {
-    // Reserving space for Cell, Forget, Output gates
-    scratch_buffer_size->data[1] = n_cell * 3;
-  } else {
-    // Reserving space for Input, Cell, Forget, Output gates
-    scratch_buffer_size->data[1] = n_cell * 4;
-  }
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
-                                                   scratch_buffer_size));
-
-  if (is_hybrid_op) {
-    // Allocate temporary tensors to store quantized values of input,
-    // activation_state and cell_state tensors.
-    node->temporaries->data[1] = op_data->scratch_tensor_index + 1;
-    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-    input_quantized->type = kTfLiteUInt8;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-    node->temporaries->data[2] = op_data->scratch_tensor_index + 2;
-    TfLiteTensor* activation_state_quantized =
-        GetTemporary(context, node, /*index=*/2);
-    activation_state_quantized->type = kTfLiteUInt8;
-    activation_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
-                             activation_state->dims)) {
-      TfLiteIntArray* activation_state_quantized_size =
-          TfLiteIntArrayCopy(activation_state->dims);
-      TF_LITE_ENSURE_OK(
-          context, context->ResizeTensor(context, activation_state_quantized,
-                                         activation_state_quantized_size));
-    }
-    node->temporaries->data[3] = op_data->scratch_tensor_index + 3;
-    TfLiteTensor* cell_state_quantized =
-        GetTemporary(context, node, /*index=*/3);
-    cell_state_quantized->type = kTfLiteUInt8;
-    cell_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
-      TfLiteIntArray* cell_state_quantized_size =
-          TfLiteIntArrayCopy(cell_state->dims);
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, cell_state_quantized,
-                                              cell_state_quantized_size));
-    }
-
-    // Allocate temporary tensors to store scaling factors and product scaling
-    // factors. The latter is a convenience storage which allows to quantize
-    // a vector once (which produces the scaling factors) and multiply it with
-    // different matrices (which requires multiplying the scaling factors with
-    // the scaling factor of the matrix).
-    node->temporaries->data[4] = op_data->scratch_tensor_index + 4;
-    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
-    scaling_factors->type = kTfLiteFloat32;
-    scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-    scaling_factors_size->data[0] = n_batch;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
-                                                       scaling_factors_size));
-    }
-    node->temporaries->data[5] = op_data->scratch_tensor_index + 5;
-    TfLiteTensor* prod_scaling_factors =
-        GetTemporary(context, node, /*index=*/5);
-    prod_scaling_factors->type = kTfLiteFloat32;
-    prod_scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
-    prod_scaling_factors_size->data[0] = n_batch;
-    if (!TfLiteIntArrayEqual(prod_scaling_factors->dims,
-                             prod_scaling_factors_size)) {
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, prod_scaling_factors,
-                                              prod_scaling_factors_size));
-    }
-
-    // Allocate a temporary tensor to store the recovered cell weights. Since
-    // this is used for diagonal matrices, only need to store n_cell values.
-    node->temporaries->data[6] = op_data->scratch_tensor_index + 6;
-    TfLiteTensor* recovered_cell_weights =
-        GetTemporary(context, node, /*index=*/6);
-    recovered_cell_weights->type = kTfLiteFloat32;
-    recovered_cell_weights->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1);
-    recovered_cell_weights_size->data[0] = n_cell;
-    if (!TfLiteIntArrayEqual(recovered_cell_weights->dims,
-                             recovered_cell_weights_size)) {
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, recovered_cell_weights,
-                                              recovered_cell_weights_size));
-    }
-  }
-  return kTfLiteOk;
-}
-
-// The LSTM Op engine.
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
-    TfLiteTensor* activation_state, TfLiteTensor* cell_state,
-    TfLiteTensor* output) {
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  const float* input_to_input_weights_ptr =
-      (use_cifg) ? nullptr : input_to_input_weights->data.f;
-  const float* recurrent_to_input_weights_ptr =
-      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
-  const float* input_gate_bias_ptr =
-      (use_cifg) ? nullptr : input_gate_bias->data.f;
-  const float* cell_to_input_weights_ptr =
-      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
-  const float* cell_to_forget_weights_ptr =
-      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
-  const float* cell_to_output_weights_ptr =
-      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
-  const float* projection_weights_ptr =
-      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_ptr_batch = input->data.f;
-  const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f;
-  const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f;
-  const float* input_to_output_weights_ptr = input_to_output_weights->data.f;
-  const float* recurrent_to_forget_weights_ptr =
-      recurrent_to_forget_weights->data.f;
-  const float* recurrent_to_cell_weights_ptr =
-      recurrent_to_cell_weights->data.f;
-  const float* recurrent_to_output_weights_ptr =
-      recurrent_to_output_weights->data.f;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-  float* output_ptr_batch = output->data.f;
-
-  kernel_utils::LstmStep(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr,
-      input_to_cell_weights_ptr, input_to_output_weights_ptr,
-      recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr,
-      recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr,
-      cell_to_input_weights_ptr, cell_to_forget_weights_ptr,
-      cell_to_output_weights_ptr, input_gate_bias_ptr, forget_gate_bias_ptr,
-      cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-      projection_bias_ptr, params, n_batch, n_cell, n_input, n_output,
-      activation_state_ptr, cell_state_ptr, input_gate_scratch,
-      forget_gate_scratch, cell_scratch, output_gate_scratch, output_ptr_batch);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
-    TfLiteTensor* scaling_factors, TfLiteTensor* prod_scaling_factors,
-    TfLiteTensor* recovered_cell_weights, TfLiteTensor* input_quantized,
-    TfLiteTensor* activation_state_quantized,
-    TfLiteTensor* cell_state_quantized, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  int8_t* input_to_input_weights_ptr = nullptr;
-  float input_to_input_weights_scale = 1.0f;
-  int8_t* recurrent_to_input_weights_ptr = nullptr;
-  float recurrent_to_input_weights_scale = 1.0f;
-  float* input_gate_bias_ptr = nullptr;
-  if (!use_cifg) {
-    input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
-    recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
-    input_gate_bias_ptr = input_gate_bias->data.f;
-    input_to_input_weights_scale = input_to_input_weights->params.scale;
-    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
-  }
-
-  int8_t* cell_to_input_weights_ptr = nullptr;
-  int8_t* cell_to_forget_weights_ptr = nullptr;
-  int8_t* cell_to_output_weights_ptr = nullptr;
-  float cell_to_input_weights_scale = 1.0f;
-  float cell_to_forget_weights_scale = 1.0f;
-  float cell_to_output_weights_scale = 1.0f;
-  if (use_peephole) {
-    if (!use_cifg) {
-      cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
-      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
-    }
-    cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
-    cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
-    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
-    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
-  }
-
-  const int8_t* projection_weights_ptr =
-      (projection_weights == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
-  const float projection_weights_scale =
-      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_ptr_batch = input->data.f;
-  const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
-  const float input_to_forget_weights_scale =
-      input_to_forget_weights->params.scale;
-  const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
-  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
-  const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
-  const float input_to_output_weights_scale =
-      input_to_output_weights->params.scale;
-  const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
-  const float recurrent_to_forget_weights_scale =
-      recurrent_to_forget_weights->params.scale;
-  const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
-  const float recurrent_to_cell_weights_scale =
-      recurrent_to_cell_weights->params.scale;
-  const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
-  const float recurrent_to_output_weights_scale =
-      recurrent_to_output_weights->params.scale;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-  float* output_ptr_batch = output->data.f;
-
-  // Temporary storage for quantized values and scaling factors.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  int8_t* quantized_activation_state_ptr =
-      reinterpret_cast<int8_t*>(activation_state_quantized->data.uint8);
-  int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
-  float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
-
-  kernel_utils::LstmStep(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_input_weights_scale,
-      input_to_forget_weights_ptr, input_to_forget_weights_scale,
-      input_to_cell_weights_ptr, input_to_cell_weights_scale,
-      input_to_output_weights_ptr, input_to_output_weights_scale,
-      recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale,
-      recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale,
-      recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale,
-      recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale,
-      cell_to_input_weights_ptr, cell_to_input_weights_scale,
-      cell_to_forget_weights_ptr, cell_to_forget_weights_scale,
-      cell_to_output_weights_ptr, cell_to_output_weights_scale,
-      input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
-      output_gate_bias_ptr, projection_weights_ptr, projection_weights_scale,
-      projection_bias_ptr, params, n_batch, n_cell, n_input, n_output,
-      input_gate_scratch, forget_gate_scratch, cell_scratch,
-      output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr,
-      recovered_cell_weights_ptr, quantized_input_ptr,
-      quantized_activation_state_ptr, quantized_cell_state_ptr,
-      activation_state_ptr, cell_state_ptr, output_ptr_batch);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
-  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, kInputToCellWeightsTensor);
-  const TfLiteTensor* input_to_output_weights =
-      GetInput(context, node, kInputToOutputWeightsTensor);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, kRecurrentToCellWeightsTensor);
-  const TfLiteTensor* recurrent_to_output_weights =
-      GetInput(context, node, kRecurrentToOutputWeightsTensor);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, kForgetGateBiasTensor);
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, kOutputGateBiasTensor);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-
-  // Index the scratch buffers pointers to the global scratch buffer.
-  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
-
-  TfLiteTensor* activation_state =
-      &context->tensors[op_data->activation_state_tensor_index];
-  TfLiteTensor* cell_state =
-      &context->tensors[op_data->cell_state_tensor_index];
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  // TODO(mirkov): add a check that weights are all uint8s or all floats.
-  switch (input_to_output_weights->type) {
-    case kTfLiteFloat32: {
-      return EvalFloat(input, input_to_input_weights, input_to_forget_weights,
-                       input_to_cell_weights, input_to_output_weights,
-                       recurrent_to_input_weights, recurrent_to_forget_weights,
-                       recurrent_to_cell_weights, recurrent_to_output_weights,
-                       cell_to_input_weights, cell_to_forget_weights,
-                       cell_to_output_weights, input_gate_bias,
-                       forget_gate_bias, cell_bias, output_gate_bias,
-                       projection_weights, projection_bias, params,
-                       scratch_buffer, activation_state, cell_state, output);
-    }
-    case kTfLiteUInt8: {
-      TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-      TfLiteTensor* activation_state_quantized =
-          GetTemporary(context, node, /*index=*/2);
-      TfLiteTensor* cell_state_quantized =
-          GetTemporary(context, node, /*index=*/3);
-      TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
-      TfLiteTensor* prod_scaling_factors =
-          GetTemporary(context, node, /*index=*/5);
-      TfLiteTensor* recovered_cell_weights =
-          GetTemporary(context, node, /*index=*/6);
-      return EvalHybrid(
-          input, input_to_input_weights, input_to_forget_weights,
-          input_to_cell_weights, input_to_output_weights,
-          recurrent_to_input_weights, recurrent_to_forget_weights,
-          recurrent_to_cell_weights, recurrent_to_output_weights,
-          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
-          input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias,
-          projection_weights, projection_bias, params, scratch_buffer,
-          scaling_factors, prod_scaling_factors, recovered_cell_weights,
-          input_quantized, activation_state_quantized, cell_state_quantized,
-          activation_state, cell_state, output);
-    }
-    default:
-      context->ReportError(context, "Type %d is not currently supported.",
-                           input_to_output_weights->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace full
-
-// For basic kernel (5-inputs).
-namespace basic {
-
-enum InputTensor {
-  kInputData = 0,
-  kInputPrevActivation = 1,
-  kInputWeights = 2,
-  kInputBiases = 3,
-  kInputPrevState = 4,
-  kInputNum = 5,
-};
-
-enum OutputTensor {
-  kOutputActivation = 0,
-  kOutputState = 1,
-  kOutputConcatTemp = 2,
-  kOutputActivationTemp = 3,
-  kOutputNum = 4,
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* op_data = new OpData();
-  op_data->kernel_type = kTfLiteLSTMBasicKernel;
-  // `scratch_tensor_index` is unused in this kernel.
-  op_data->scratch_tensor_index = -1;
-  return op_data;
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE(context, node->inputs->size == kInputNum);
-  TF_LITE_ENSURE(context, node->outputs->size == kOutputNum);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputData);
-  const TfLiteTensor* prev_activation =
-      GetInput(context, node, kInputPrevActivation);
-  const TfLiteTensor* weights = GetInput(context, node, kInputWeights);
-  const TfLiteTensor* bias = GetInput(context, node, kInputBiases);
-  const TfLiteTensor* prev_state = GetInput(context, node, kInputPrevState);
-
-  TF_LITE_ENSURE_EQ(context, input->dims->size, 2);
-  const int num_batches = input->dims->data[0];
-  const int input_depth = input->dims->data[1];
-
-  TF_LITE_ENSURE_EQ(context, prev_activation->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, prev_activation->dims->data[0], num_batches);
-  const int activation_depth = prev_activation->dims->data[1];
-  const int total_depth = input_depth + activation_depth;
-
-  TF_LITE_ENSURE_EQ(context, weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, weights->dims->data[0], 4 * activation_depth);
-  TF_LITE_ENSURE_EQ(context, weights->dims->data[1], total_depth);
-
-  TF_LITE_ENSURE_EQ(context, bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, bias->dims->data[0], 4 * activation_depth);
-
-  TF_LITE_ENSURE_EQ(context, prev_state->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, prev_state->dims->data[0], num_batches);
-  TF_LITE_ENSURE_EQ(context, prev_state->dims->data[1], activation_depth);
-
-  TfLiteTensor* activation_out = GetOutput(context, node, kOutputActivation);
-  TfLiteTensor* state_out = GetOutput(context, node, kOutputState);
-  TfLiteTensor* concat_temp = GetOutput(context, node, kOutputConcatTemp);
-  TfLiteTensor* activation_temp =
-      GetOutput(context, node, kOutputActivationTemp);
-
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(
-                                 context, activation_out,
-                                 TfLiteIntArrayCopy(prev_activation->dims)));
-  TF_LITE_ENSURE_OK(
-      context, context->ResizeTensor(context, state_out,
-                                     TfLiteIntArrayCopy(prev_state->dims)));
-
-  TfLiteIntArray* concat_temp_size = TfLiteIntArrayCreate(2);
-  concat_temp_size->data[0] = num_batches;
-  concat_temp_size->data[1] = total_depth;
-  TF_LITE_ENSURE_OK(
-      context, context->ResizeTensor(context, concat_temp, concat_temp_size));
-  TfLiteIntArray* activation_temp_size = TfLiteIntArrayCreate(2);
-  activation_temp_size->data[0] = num_batches;
-  activation_temp_size->data[1] = 4 * activation_depth;
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, activation_temp,
-                                                   activation_temp_size));
-
-  // Set the state tensors as persistent.
-  for (auto index : {kInputPrevActivation, kInputPrevState}) {
-    TfLiteTensor* tensor = &context->tensors[node->inputs->data[index]];
-    tensor->allocation_type = kTfLiteArenaRwPersistent;
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputData);
-  const TfLiteTensor* prev_activation =
-      GetInput(context, node, kInputPrevActivation);
-  const TfLiteTensor* weights = GetInput(context, node, kInputWeights);
-  const TfLiteTensor* bias = GetInput(context, node, kInputBiases);
-  const TfLiteTensor* prev_state = GetInput(context, node, kInputPrevState);
-
-  TfLiteTensor* activation_out = GetOutput(context, node, kOutputActivation);
-  TfLiteTensor* state_out = GetOutput(context, node, kOutputState);
-  TfLiteTensor* concat_temp = GetOutput(context, node, kOutputConcatTemp);
-  TfLiteTensor* activation_temp =
-      GetOutput(context, node, kOutputActivationTemp);
-
-  if (input->type == kTfLiteFloat32 &&
-      prev_activation->type == kTfLiteFloat32 &&
-      weights->type == kTfLiteFloat32 && bias->type == kTfLiteFloat32 &&
-      prev_state->type == kTfLiteFloat32 && state_out->type == kTfLiteFloat32 &&
-      activation_out->type == kTfLiteFloat32 &&
-      concat_temp->type == kTfLiteFloat32 &&
-      activation_temp->type == kTfLiteFloat32) {
-    tflite::LstmCellParams op_params;
-    // Float LSTM cell does not need parameters to be set: leave untouched.
-    optimized_ops::LstmCell(
-        op_params,
-        // Inputs.
-        GetTensorShape(input), GetTensorData<float>(input),
-        GetTensorShape(prev_activation), GetTensorData<float>(prev_activation),
-        GetTensorShape(weights), GetTensorData<float>(weights),
-        GetTensorShape(bias), GetTensorData<float>(bias),
-        GetTensorShape(prev_state), GetTensorData<float>(prev_state),
-        // Outputs.
-        GetTensorShape(state_out), GetTensorData<float>(state_out),
-        GetTensorShape(activation_out), GetTensorData<float>(activation_out),
-        GetTensorShape(concat_temp), GetTensorData<float>(concat_temp),
-        GetTensorShape(activation_temp), GetTensorData<float>(activation_temp));
-  } else if (input->type == kTfLiteUInt8 &&
-             prev_activation->type == kTfLiteUInt8 &&
-             weights->type == kTfLiteUInt8 && bias->type == kTfLiteInt32 &&
-             prev_state->type == kTfLiteInt16 &&
-             state_out->type == kTfLiteInt16 &&
-             activation_out->type == kTfLiteUInt8 &&
-             concat_temp->type == kTfLiteUInt8 &&
-             activation_temp->type == kTfLiteInt16) {
-    gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
-    int state_scale_log2_rounded;
-    if (!CheckedLog2(state_out->params.scale, &state_scale_log2_rounded)) {
-      context->ReportError(
-          context,
-          "The internal state of a LSTM cell must have a power-of-two scale.");
-      return kTfLiteError;
-    }
-    const int state_integer_bits = 15 + state_scale_log2_rounded;
-    if (state_integer_bits != 4) {
-      context->ReportError(context,
-                           "The only case of quantized LstmCell currently "
-                           "supported is with StateIntegerBits==4");
-      return kTfLiteError;
-    }
-
-    double real_accum_multiplier = 4096 * bias->params.scale;
-    int32 accum_multiplier;
-    int accum_shift;
-    tflite::QuantizeMultiplier(real_accum_multiplier, &accum_multiplier,
-                               &accum_shift);
-    tflite::LstmCellParams op_params;
-    op_params.weights_zero_point = weights->params.zero_point;
-    op_params.accum_multiplier = accum_multiplier;
-    op_params.accum_shift = accum_shift;
-    optimized_ops::LstmCell<4>(
-        op_params,
-        // Inputs.
-        GetTensorShape(input), GetTensorData<uint8_t>(input),
-        GetTensorShape(prev_activation),
-        GetTensorData<uint8_t>(prev_activation), GetTensorShape(weights),
-        GetTensorData<uint8_t>(weights), GetTensorShape(bias),
-        GetTensorData<int32_t>(bias), GetTensorShape(prev_state),
-        GetTensorData<int16_t>(prev_state),
-        // Outputs.
-        GetTensorShape(state_out), GetTensorData<int16_t>(state_out),
-        GetTensorShape(activation_out), GetTensorData<uint8_t>(activation_out),
-        GetTensorShape(concat_temp), GetTensorData<uint8_t>(concat_temp),
-        GetTensorShape(activation_temp),
-        GetTensorData<int16_t>(activation_temp), gemm_context);
-  } else {
-    context->ReportError(context,
-                         "Unsupported combination of data types for LstmCell");
-    return kTfLiteError;
-  }
-
-  // TODO(ycling): Investigate if this copy can be avoided with the 5-inputs
-  // LSTM kernel.
-  memcpy(prev_activation->data.raw, activation_out->data.raw,
-         activation_out->bytes);
-  memcpy(prev_state->data.raw, state_out->data.raw, state_out->bytes);
-
-  return kTfLiteOk;
-}
-
-}  // namespace basic
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  gemm_support::IncrementUsageCounter(context);
-
-  const auto* params = reinterpret_cast<const TfLiteLSTMParams*>(buffer);
-  switch (params->kernel_type) {
-    case kTfLiteLSTMFullKernel:
-      return full::Init(context, buffer, length);
-    case kTfLiteLSTMBasicKernel:
-      return basic::Init(context, buffer, length);
-  }
-}
-void Free(TfLiteContext* context, void* buffer) {
-  gemm_support::DecrementUsageCounter(context);
-
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  const auto* op_data = reinterpret_cast<const OpData*>(node->user_data);
-  switch (op_data->kernel_type) {
-    case kTfLiteLSTMFullKernel:
-      return full::Prepare(context, node);
-    case kTfLiteLSTMBasicKernel:
-      return basic::Prepare(context, node);
-  }
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const auto* op_data = reinterpret_cast<const OpData*>(node->user_data);
-  switch (op_data->kernel_type) {
-    case kTfLiteLSTMFullKernel:
-      return full::Eval(context, node);
-    case kTfLiteLSTMBasicKernel:
-      return basic::Eval(context, node);
-  }
-}
-
-}  // namespace lstm
-
-TfLiteRegistration* Register_LSTM() {
-  static TfLiteRegistration r = {lstm::Init, lstm::Free, lstm::Prepare,
-                                 lstm::Eval};
-  return &r;
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/mfcc.cc b/tensorflow/contrib/lite/kernels/mfcc.cc
deleted file mode 100644
index 5153ce5634c33e829c3742e4d11a22a18f0d2f79..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/mfcc.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/internal/mfcc.h"
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h"
-#include "tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-namespace mfcc {
-
-enum KernelType {
-  kReference,
-};
-
-typedef struct {
-  float upper_frequency_limit;
-  float lower_frequency_limit;
-  int filterbank_channel_count;
-  int dct_coefficient_count;
-} TfLiteMfccParams;
-
-constexpr int kInputTensorWav = 0;
-constexpr int kInputTensorRate = 1;
-constexpr int kOutputTensor = 0;
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* data = new TfLiteMfccParams;
-
-  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
-
-  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
-  data->upper_frequency_limit = m["upper_frequency_limit"].AsInt64();
-  data->lower_frequency_limit = m["lower_frequency_limit"].AsInt64();
-  data->filterbank_channel_count = m["filterbank_channel_count"].AsInt64();
-  data->dct_coefficient_count = m["dct_coefficient_count"].AsInt64();
-  return data;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<TfLiteMfccParams*>(buffer);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteMfccParams*>(node->user_data);
-
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  const TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
-  const TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  TF_LITE_ENSURE_EQ(context, NumDimensions(inputWav), 3);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(inputRate), 1);
-
-  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, inputWav->type, output->type);
-
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
-  output_size->data[0] = inputWav->dims->data[0];
-  output_size->data[1] = inputWav->dims->data[1];
-  output_size->data[2] = params->dct_coefficient_count;
-
-  return context->ResizeTensor(context, output, output_size);
-}
-
-// Input is a single squared-magnitude spectrogram frame. The input spectrum
-// is converted to linear magnitude and weighted into bands using a
-// triangular mel filterbank, and a discrete cosine transform (DCT) of the
-// values is taken. Output is populated with the lowest dct_coefficient_count
-// of these values.
-template <KernelType kernel_type>
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteMfccParams*>(node->user_data);
-
-  const TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
-  const TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  const int32 sample_rate = *GetTensorData<int>(inputRate);
-
-  const int spectrogram_channels = inputWav->dims->data[2];
-  const int spectrogram_samples = inputWav->dims->data[1];
-  const int audio_channels = inputWav->dims->data[0];
-
-  internal::Mfcc mfcc;
-  mfcc.set_upper_frequency_limit(params->upper_frequency_limit);
-  mfcc.set_lower_frequency_limit(params->lower_frequency_limit);
-  mfcc.set_filterbank_channel_count(params->filterbank_channel_count);
-  mfcc.set_dct_coefficient_count(params->dct_coefficient_count);
-
-  mfcc.Initialize(spectrogram_channels, sample_rate);
-
-  const float* spectrogram_flat = GetTensorData<float>(inputWav);
-  float* output_flat = GetTensorData<float>(output);
-
-  for (int audio_channel = 0; audio_channel < audio_channels; ++audio_channel) {
-    for (int spectrogram_sample = 0; spectrogram_sample < spectrogram_samples;
-         ++spectrogram_sample) {
-      const float* sample_data =
-          spectrogram_flat +
-          (audio_channel * spectrogram_samples * spectrogram_channels) +
-          (spectrogram_sample * spectrogram_channels);
-      std::vector<double> mfcc_input(sample_data,
-                                     sample_data + spectrogram_channels);
-      std::vector<double> mfcc_output;
-      mfcc.Compute(mfcc_input, &mfcc_output);
-      TF_LITE_ENSURE_EQ(context, params->dct_coefficient_count,
-                        mfcc_output.size());
-      float* output_data = output_flat +
-                           (audio_channel * spectrogram_samples *
-                            params->dct_coefficient_count) +
-                           (spectrogram_sample * params->dct_coefficient_count);
-      for (int i = 0; i < params->dct_coefficient_count; ++i) {
-        output_data[i] = mfcc_output[i];
-      }
-    }
-  }
-
-  return kTfLiteOk;
-}
-
-}  // namespace mfcc
-
-TfLiteRegistration* Register_MFCC() {
-  static TfLiteRegistration r = {mfcc::Init, mfcc::Free, mfcc::Prepare,
-                                 mfcc::Eval<mfcc::kReference>};
-  return &r;
-}
-
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/reshape.cc b/tensorflow/contrib/lite/kernels/reshape.cc
deleted file mode 100644
index f41147b2d6433addc63538fbd8c4338d749535d3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/reshape.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <string.h>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace reshape {
-
-constexpr int kInputTensor = 0;
-constexpr int kShapeTensor = 1;
-constexpr int kOutputTensor = 0;
-
-TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node,
-                          TfLiteIntArray* output_shape) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  // Tensorflow's Reshape allows one of the shape components to have the
-  // special -1 value, meaning it will be calculated automatically based on the
-  // input. Here we calculate what that dimension should be so that the number
-  // of output elements in the same as the number of input elements.
-  int num_input_elements = NumElements(input);
-
-  int num_output_elements = 1;
-  int stretch_dim = -1;
-  for (int i = 0; i < output_shape->size; ++i) {
-    int value = output_shape->data[i];
-    if (value == -1) {
-      TF_LITE_ENSURE_EQ(context, stretch_dim, -1);
-      stretch_dim = i;
-    } else {
-      num_output_elements *= value;
-    }
-  }
-  if (stretch_dim != -1) {
-    output_shape->data[stretch_dim] = num_input_elements / num_output_elements;
-    num_output_elements *= output_shape->data[stretch_dim];
-  }
-
-  TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
-  return context->ResizeTensor(context, output, output_shape);
-}
-
-TfLiteStatus ResizeOutputWithShapeTensor(TfLiteContext* context,
-                                         TfLiteNode* node) {
-  const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
-
-  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(shape->dims->data[0]);
-  for (int i = 0; i < output_shape->size; ++i) {
-    output_shape->data[i] = shape->data.i32[i];
-  }
-  return ResizeOutput(context, node, output_shape);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data);
-
-  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  // Attempt to use shape tensor if it exists.
-  if (NumInputs(node) == 2) {
-    const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
-    // Check if the shape tensor is valid.
-    if (shape->dims->size == 1 && shape->type == kTfLiteInt32) {
-      // Set the output tensor as dynamic if the shape isn't constnat.
-      if (!IsConstantTensor(shape)) {
-        TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-        SetTensorToDynamic(output);
-        return kTfLiteOk;
-      }
-      // Shape is constant. Resize now.
-      return ResizeOutputWithShapeTensor(context, node);
-    }
-  }
-  // The function is returned above this line if the shape tensor is usable.
-  // Now fallback to the shape parameter in `TfLiteReshapeParams`.
-  int num_dimensions = params->num_dimensions;
-  if (num_dimensions == 1 && params->shape[0] == 0) {
-    // Legacy tflite models use a shape parameter of [0] to indicate scalars,
-    // so adjust accordingly. TODO(b/111614235): Allow zero-sized buffers during
-    // toco conversion.
-    num_dimensions = 0;
-  }
-  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions);
-  for (int i = 0; i < num_dimensions; ++i) {
-    output_shape->data[i] = params->shape[i];
-  }
-  return ResizeOutput(context, node, output_shape);
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  if (IsDynamicTensor(output)) {
-    TF_LITE_ENSURE_OK(context, ResizeOutputWithShapeTensor(context, node));
-  }
-
-  memcpy(output->data.raw, input->data.raw, input->bytes);
-
-  return kTfLiteOk;
-}
-
-}  // namespace reshape
-
-TfLiteRegistration* Register_RESHAPE() {
-  static TfLiteRegistration r = {nullptr, nullptr, reshape::Prepare,
-                                 reshape::Eval};
-  return &r;
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/reshape_test.cc b/tensorflow/contrib/lite/kernels/reshape_test.cc
deleted file mode 100644
index 52d71350d3ba9a27bf9a8df7a194161c4fb7f87c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/reshape_test.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-using ::testing::IsEmpty;
-
-class ReshapeOpModel : public SingleOpModel {
- public:
-  ReshapeOpModel(std::initializer_list<int> input_shape,
-                 std::initializer_list<int> new_shape,
-                 bool use_shape_input_tensor = false) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    int shape_input_tensor =
-        use_shape_input_tensor ? AddInput(TensorType_INT32) : -1;
-    SetBuiltinOp(
-        BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
-        CreateReshapeOptions(builder_, builder_.CreateVector<int>(new_shape))
-            .Union());
-    if (use_shape_input_tensor) {
-      BuildInterpreter({input_shape, GetShape(shape_input_tensor)});
-      PopulateTensor<int>(shape_input_tensor, new_shape);
-    } else {
-      BuildInterpreter({input_shape});
-    }
-  }
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
-  }
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- private:
-  int input_;
-  int output_;
-};
-
-TEST(ReshapeOpTest, MismatchedDimensions) {
-  EXPECT_DEATH(ReshapeOpModel({1, 2, 4, 1}, {2, 1}),
-               "num_input_elements != num_output_elements");
-}
-
-TEST(ReshapeOpTest, TooManyDimensions) {
-  EXPECT_DEATH(
-      ReshapeOpModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 2, 3, 4, 5, 6, 7, 8, 9}),
-      "Found too many dimensions");
-}
-
-TEST(ReshapeOpTest, TooManySpecialDimensions) {
-  EXPECT_DEATH(ReshapeOpModel({1, 2, 4, 1}, {-1, -1, 2, 4}),
-               "stretch_dim != -1");
-}
-
-TEST(ReshapeOpTest, SimpleTest) {
-  ReshapeOpModel m({1, 2, 4, 1}, {2, 2, 2});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2}));
-}
-
-TEST(ReshapeOpTest, ShapeTensorInput) {
-  ReshapeOpModel m({1, 2, 4, 1}, {2, 2, 2}, /*use_shape_input_tensor=*/true);
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2}));
-}
-
-TEST(ReshapeOpTest, WithStretchDimension) {
-  ReshapeOpModel m({1, 2, 4, 1}, {2, 1, -1});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 4}));
-}
-
-TEST(ReshapeOpTest, ScalarOutput) {
-  ReshapeOpModel m({1}, {});
-  m.SetInput({3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
-  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-}
-
-TEST(ReshapeOpTest, LegacyScalarOutput) {
-  ReshapeOpModel m({1}, {0});
-  m.SetInput({3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
-  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
deleted file mode 100644
index f4289105f7931ae572f219a61b5479287aff926a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
+++ /dev/null
@@ -1,314 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-using uint8 = std::uint8_t;
-
-class ResizeBilinearOpModel : public SingleOpModel {
- public:
-  ResizeBilinearOpModel(const TensorData& input,
-                        std::initializer_list<int> size_data = {}) {
-    bool const_size = size_data.size() != 0;
-    input_ = AddInput(input);
-    if (const_size) {
-      size_ = AddConstInput(TensorType_INT32, size_data, {2});
-    } else {
-      size_ = AddInput({TensorType_INT32, {2}});
-    }
-    output_ = AddOutput(input.type);
-    SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR,
-                 BuiltinOptions_ResizeBilinearOptions,
-                 CreateResizeBilinearOptions(builder_).Union());
-    if (const_size) {
-      BuildInterpreter({GetShape(input_)});
-    } else {
-      BuildInterpreter({GetShape(input_), GetShape(size_)});
-    }
-  }
-
-  template <typename T>
-  void SetInput(std::initializer_list<T> data) {
-    PopulateTensor(input_, data);
-  }
-  void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); }
-
-  template <typename T>
-  std::vector<T> GetOutput() {
-    return ExtractVector<T>(output_);
-  }
-
- private:
-  int input_;
-  int size_;
-  int output_;
-};
-
-TEST(ResizeBilinearOpTest, HorizontalResize) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
-  m.SetInput<float>({3, 6});
-  m.SetSize({1, 3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<float>(),
-              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
-
-  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3});
-  const_m.SetInput<float>({3, 6});
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<float>(),
-              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
-}
-
-TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
-  m.SetInput<uint8>({3, 6});
-  m.SetSize({1, 3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<uint8>(),
-              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
-
-  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 1, 2, 1}}, {1, 3});
-  const_m.SetInput<uint8>({3, 6});
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<uint8>(),
-              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
-}
-
-TEST(ResizeBilinearOpTest, VerticalResize) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
-  m.SetInput<float>({3, 9});
-  m.SetSize({3, 1});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<float>(),
-              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
-
-  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1});
-  const_m.SetInput<float>({3, 9});
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<float>(),
-              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
-}
-
-TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
-  m.SetInput<uint8>({3, 9});
-  m.SetSize({3, 1});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<uint8>(),
-              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
-
-  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 1, 1}}, {3, 1});
-  const_m.SetInput<uint8>({3, 9});
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<uint8>(),
-              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
-}
-
-TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
-  m.SetInput<float>({
-      3, 6,  //
-      9, 12  //
-  });
-  m.SetSize({3, 3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
-                                        3, 5, 6,    //
-                                        7, 9, 10,   //
-                                        9, 11, 12,  //
-                                    })));
-
-  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3});
-  const_m.SetInput<float>({
-      3, 6,  //
-      9, 12  //
-  });
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
-                                              3, 5, 6,    //
-                                              7, 9, 10,   //
-                                              9, 11, 12,  //
-                                          })));
-}
-
-TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
-  m.SetInput<uint8>({
-      3, 6,  //
-      9, 12  //
-  });
-  m.SetSize({3, 3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
-                                        3, 5, 6,    //
-                                        7, 9, 10,   //
-                                        9, 11, 12,  //
-                                    })));
-
-  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 1}}, {3, 3});
-  const_m.SetInput<uint8>({
-      3, 6,  //
-      9, 12  //
-  });
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
-                                              3, 5, 6,    //
-                                              7, 9, 10,   //
-                                              9, 11, 12,  //
-                                          })));
-}
-
-TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
-  m.SetInput<float>({
-      3, 6,   //
-      9, 12,  //
-      4, 10,  //
-      10, 16  //
-  });
-  m.SetSize({3, 3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
-                                        3, 5, 6,     //
-                                        7, 9, 10,    //
-                                        9, 11, 12,   //
-                                        4, 8, 10,    //
-                                        8, 12, 14,   //
-                                        10, 14, 16,  //
-                                    })));
-
-  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3});
-  const_m.SetInput<float>({
-      3, 6,   //
-      9, 12,  //
-      4, 10,  //
-      10, 16  //
-  });
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
-                                              3, 5, 6,     //
-                                              7, 9, 10,    //
-                                              9, 11, 12,   //
-                                              4, 8, 10,    //
-                                              8, 12, 14,   //
-                                              10, 14, 16,  //
-                                          })));
-}
-
-TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
-  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}});
-  m.SetInput<float>({
-      3, 4, 6, 10,    //
-      9, 10, 12, 16,  //
-  });
-  m.SetSize({3, 3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
-                                        3, 4, 5, 8, 6, 10,      //
-                                        7, 8, 9, 12, 10, 14,    //
-                                        9, 10, 11, 14, 12, 16,  //
-                                    })));
-
-  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}}, {3, 3});
-  const_m.SetInput<float>({
-      3, 4, 6, 10,    //
-      9, 10, 12, 16,  //
-  });
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
-                                              3, 4, 5, 8, 6, 10,      //
-                                              7, 8, 9, 12, 10, 14,    //
-                                              9, 10, 11, 14, 12, 16,  //
-                                          })));
-}
-
-TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
-  m.SetInput<uint8>({
-      3, 6,   //
-      9, 12,  //
-      4, 10,  //
-      12, 16  //
-  });
-  m.SetSize({3, 3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
-                                        3, 5, 6,     //
-                                        7, 9, 10,    //
-                                        9, 11, 12,   //
-                                        4, 8, 10,    //
-                                        9, 12, 14,   //
-                                        12, 14, 16,  //
-                                    })));
-
-  ResizeBilinearOpModel const_m({TensorType_UINT8, {2, 2, 2, 1}}, {3, 3});
-  const_m.SetInput<uint8>({
-      3, 6,   //
-      9, 12,  //
-      4, 10,  //
-      12, 16  //
-  });
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
-                                              3, 5, 6,     //
-                                              7, 9, 10,    //
-                                              9, 11, 12,   //
-                                              4, 8, 10,    //
-                                              9, 12, 14,   //
-                                              12, 14, 16,  //
-                                          })));
-}
-
-TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
-  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
-  m.SetInput<uint8>({
-      3, 4, 6, 10,     //
-      10, 12, 14, 16,  //
-  });
-  m.SetSize({3, 3});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
-                                        3, 4, 5, 8, 6, 10,       //
-                                        7, 9, 10, 12, 11, 14,    //
-                                        10, 12, 12, 14, 14, 16,  //
-                                    })));
-
-  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 2}}, {3, 3});
-  const_m.SetInput<uint8>({
-      3, 4, 6, 10,     //
-      10, 12, 14, 16,  //
-  });
-  const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
-                                              3, 4, 5, 8, 6, 10,       //
-                                              7, 9, 10, 12, 11, 14,    //
-                                              10, 12, 12, 14, 14, 16,  //
-                                          })));
-}
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/shape.cc b/tensorflow/contrib/lite/kernels/shape.cc
deleted file mode 100644
index 66d4c9e5c1a430b621d873012b6ba392afae4157..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/shape.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace shape {
-
-constexpr int kInputTensor = 0;
-constexpr int kOutputTensor = 0;
-
-template <typename OutType>
-void ExtractShape(const TfLiteTensor* input, OutType* output_data) {
-  for (int i = 0; i < NumDimensions(input); ++i) {
-    output_data[i] = SizeOfDimension(input, i);
-  }
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  auto* params = reinterpret_cast<TfLiteShapeParams*>(node->builtin_data);
-  switch (params->out_type) {
-    case kTfLiteInt32:
-      output->type = kTfLiteInt32;
-      break;
-    case kTfLiteInt64:
-      output->type = kTfLiteInt64;
-      break;
-    default:
-      context->ReportError(context, "Unknown shape output data type: %d",
-                           params->out_type);
-      return kTfLiteError;
-  }
-
-  // Shape always produces a 1-dimensional output tensor, where each output
-  // element is the length of the corresponding input tensor's dimension.
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(1);
-  output_size->data[0] = NumDimensions(input);
-  return context->ResizeTensor(context, output, output_size);
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TFLITE_DCHECK_EQ(NumDimensions(output), 1);
-  TFLITE_DCHECK_EQ(SizeOfDimension(output, 0), NumDimensions(input));
-
-  switch (output->type) {
-    case kTfLiteInt32:
-      ExtractShape(input, GetTensorData<int32_t>(output));
-      break;
-    case kTfLiteInt64:
-      ExtractShape(input, GetTensorData<int64_t>(output));
-      break;
-    default:
-      return kTfLiteError;
-  }
-
-  return kTfLiteOk;
-}
-
-}  // namespace shape
-
-TfLiteRegistration* Register_SHAPE() {
-  static TfLiteRegistration r = {nullptr, nullptr, shape::Prepare, shape::Eval};
-  return &r;
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/shape_test.cc b/tensorflow/contrib/lite/kernels/shape_test.cc
deleted file mode 100644
index 27b48f4e992a8f02d56815bd1bd9074f5b41f400..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/shape_test.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <initializer_list>
-
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-
-template <typename T>
-class ShapeOpModel : public SingleOpModel {
- public:
-  ShapeOpModel(std::initializer_list<int> input_shape, TensorType input_type,
-               TensorType output_type) {
-    input_ = AddInput(input_type);
-    output_ = AddOutput(output_type);
-    SetBuiltinOp(BuiltinOperator_SHAPE, BuiltinOptions_ShapeOptions,
-                 CreateShapeOptions(builder_, output_type).Union());
-    BuildInterpreter({input_shape});
-  }
-
-  TfLiteStatus InvokeWithResult() { return interpreter_->Invoke(); }
-
-  int input() { return input_; }
-
-  int32_t GetOutputSize() { return GetTensorSize(output_); }
-  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- private:
-  int input_;
-  int output_;
-};
-
-TEST(ShapeOpTest, OutTypeInt) {
-  ShapeOpModel<int32_t> model({1, 3, 1, 3, 5}, TensorType_FLOAT32,
-                              TensorType_INT32);
-  model.Invoke();
-
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5}));
-}
-
-TEST(ShapeOpTest, OutTypeInt64) {
-  ShapeOpModel<int64_t> model({1, 3, 1, 3, 5}, TensorType_FLOAT32,
-                              TensorType_INT64);
-  model.Invoke();
-
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5}));
-}
-
-TEST(ShapeOpTest, ScalarTensor) {
-  ShapeOpModel<int32_t> model({}, TensorType_FLOAT32, TensorType_INT32);
-  model.Invoke();
-
-  EXPECT_EQ(model.GetOutputSize(), 0);
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({0}));
-}
-
-TEST(ShapeOpTest, EmptyTensor) {
-  ShapeOpModel<int32_t> model({1, 0}, TensorType_FLOAT32, TensorType_INT32);
-  model.Invoke();
-
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/softmax_test.cc b/tensorflow/contrib/lite/kernels/softmax_test.cc
deleted file mode 100644
index bd66980226cee0cfd3cf3e81476c60db3d58951c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/softmax_test.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for TFLite SOFTMAX op.
-
-#include <iomanip>
-#include <memory>
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-class SoftmaxOpModel : public SingleOpModel {
- public:
-  SoftmaxOpModel(int batches, int size, float beta)
-      : batches_(batches), input_size_(size), beta_(beta) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(BuiltinOperator_SOFTMAX, BuiltinOptions_SoftmaxOptions,
-                 CreateSoftmaxOptions(builder_, beta_).Union());
-    BuildInterpreter({{batches_, input_size_}});
-  }
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
-  }
-
-  void SetInput(int offset, float* begin, float* end) {
-    PopulateTensor(input_, offset, begin, end);
-  }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-
- private:
-  int input_;
-  int output_;
-
-  int batches_;
-  int input_size_;
-  float beta_;
-};
-
-TEST(SoftmaxOpTest, SimpleTest) {
-  SoftmaxOpModel m(/*batches=*/2, /*size=*/5, /*beta=*/1.0);
-  m.SetInput({
-      1.0, 2.0, 3.0, 4.0, 5.0,       // b = 0
-      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 0
-  });
-
-  m.Invoke();
-
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray(ArrayFloatNear(
-          {0.011656231, 0.031684921, 0.086128544, 0.234121657, 0.636408647,
-           0.636408647, 0.234121657, 0.086128544, 0.031684921, 0.011656231},
-          1e-6)));
-}
-
-TEST(SoftmaxOpTest, CompareWithTFminiBetaEq1) {
-  const int batch_size = 2;
-  const int input_size = 5;
-  const float beta = 1.0;
-  static float input_buffer[] = {
-      1.0,  2.0,  3.0,  4.0,  5.0,   // b = 0
-      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 1
-  };
-
-  SoftmaxOpModel m(batch_size, input_size, beta);
-
-  m.SetInput(0, input_buffer, input_buffer + input_size * batch_size);
-
-  m.Invoke();
-
-  std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]);
-  auto input_shape = RuntimeShape({batch_size, 1, 1, input_size});
-  SoftmaxParams params;
-  params.beta = beta;
-  tflite::reference_ops::Softmax(params, input_shape, input_buffer, input_shape,
-                                 output_buffer.get());
-
-  std::vector<float> expected;
-  expected.insert(expected.end(), output_buffer.get(),
-                  output_buffer.get() + input_size * batch_size);
-
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected, 1e-6)));
-}
-
-TEST(SoftmaxOpTest, CompareWithTFminiBetaNotEq1) {
-  const int batch_size = 2;
-  const int input_size = 5;
-  const float beta = 0.5;
-  static float input_buffer[] = {
-      1.0,  2.0,  3.0,  4.0,  5.0,   // b = 0
-      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 1
-  };
-
-  SoftmaxOpModel m(batch_size, input_size, beta);
-
-  m.SetInput(0, input_buffer, input_buffer + input_size * batch_size);
-
-  m.Invoke();
-
-  std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]);
-  auto input_shape = RuntimeShape({batch_size, 1, 1, input_size});
-  SoftmaxParams params;
-  params.beta = beta;
-  tflite::reference_ops::Softmax(params, input_shape, input_buffer, input_shape,
-                                 output_buffer.get());
-
-  std::vector<float> expected;
-  expected.insert(expected.end(), output_buffer.get(),
-                  output_buffer.get() + input_size * batch_size);
-
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected, 1e-6)));
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/split.cc b/tensorflow/contrib/lite/kernels/split.cc
deleted file mode 100644
index dab887bf9ccac0ff43cb5f7bd11033657aaf1fd2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/split.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <string.h>
-#include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace split {
-
-struct OpContext {
-  OpContext(TfLiteContext* context, TfLiteNode* node) {
-    params = reinterpret_cast<TfLiteSplitParams*>(node->builtin_data);
-    axis = GetInput(context, node, 0);
-    input = GetInput(context, node, 1);
-  }
-  TfLiteSplitParams* params;
-  const TfLiteTensor* axis;
-  const TfLiteTensor* input;
-};
-
-TfLiteStatus UseDynamicOutputTensors(TfLiteContext* context, TfLiteNode* node) {
-  for (int i = 0; i < NumOutputs(node); ++i) {
-    SetTensorToDynamic(GetOutput(context, node, i));
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus ResizeOutputTensors(TfLiteContext* context, TfLiteNode* node,
-                                 const TfLiteTensor* axis,
-                                 const TfLiteTensor* input, int num_splits) {
-  int axis_value = GetTensorData<int>(axis)[0];
-  if (axis_value < 0) {
-    axis_value += NumDimensions(input);
-  }
-
-  const int input_size = SizeOfDimension(input, axis_value);
-  TF_LITE_ENSURE_MSG(context, input_size % num_splits == 0,
-                     "Not an even split");
-  const int slice_size = input_size / num_splits;
-
-  for (int i = 0; i < NumOutputs(node); ++i) {
-    TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input->dims);
-    output_dims->data[axis_value] = slice_size;
-    TfLiteTensor* output = GetOutput(context, node, i);
-    TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_dims));
-  }
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
-
-  OpContext op_context(context, node);
-
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), op_context.params->num_splits);
-
-  auto input_type = op_context.input->type;
-  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
-                              input_type == kTfLiteUInt8 ||
-                              input_type == kTfLiteInt16);
-  for (int i = 0; i < NumOutputs(node); ++i) {
-    GetOutput(context, node, i)->type = input_type;
-  }
-
-  // If we know the contents of the 'axis' tensor, resize all outputs.
-  // Otherwise, wait until Eval().
-  if (IsConstantTensor(op_context.axis)) {
-    return ResizeOutputTensors(context, node, op_context.axis, op_context.input,
-                               op_context.params->num_splits);
-  } else {
-    return UseDynamicOutputTensors(context, node);
-  }
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  OpContext op_context(context, node);
-
-  // When the 'axis' tensor is non-const we can't resize output tensors in
-  // Prepare(), and we have to do it now.
-  if (!IsConstantTensor(op_context.axis)) {
-    TF_LITE_ENSURE_OK(
-        context,
-        ResizeOutputTensors(context, node, op_context.axis, op_context.input,
-                            op_context.params->num_splits));
-  }
-
-  int axis_value = GetTensorData<int>(op_context.axis)[0];
-  if (axis_value < 0) {
-    axis_value += NumDimensions(op_context.input);
-  }
-
-  // TODO(ahentz): Our usage of VectorOfTensors could be optimized by
-  // calculating it in Prepare, unless we defer shape calculation.
-  // TODO(ahentz): We can improve the optimized_ops version to handle other
-  // cases too.
-#define TF_LITE_SPLIT(scalar)                                         \
-  VectorOfTensors<scalar> all_outputs(*context, *node->outputs);      \
-  tflite::SplitParams op_params;                                      \
-  op_params.num_split = NumOutputs(node);                             \
-  op_params.axis = axis_value;                                        \
-  if (axis_value == 0) {                                              \
-    optimized_ops::Split(op_params, GetTensorShape(op_context.input), \
-                         GetTensorData<scalar>(op_context.input),     \
-                         all_outputs.shapes(), all_outputs.data());   \
-  } else {                                                            \
-    reference_ops::Split(op_params, GetTensorShape(op_context.input), \
-                         GetTensorData<scalar>(op_context.input),     \
-                         all_outputs.shapes(), all_outputs.data());   \
-  }
-  switch (op_context.input->type) {
-    case kTfLiteFloat32: {
-      TF_LITE_SPLIT(float);
-      break;
-    }
-    case kTfLiteUInt8: {
-      TF_LITE_SPLIT(uint8_t);
-      break;
-    }
-    case kTfLiteInt16: {
-      TF_LITE_SPLIT(int16_t);
-      break;
-    }
-    default:
-      context->ReportError(
-          context,
-          "Only float32, uint8 and int16 are currently supported, got %d.",
-          op_context.input->type);
-      return kTfLiteError;
-  }
-#undef TF_LITE_SPLIT
-
-  return kTfLiteOk;
-}
-
-}  // namespace split
-
-TfLiteRegistration* Register_SPLIT() {
-  static TfLiteRegistration r = {nullptr, nullptr, split::Prepare, split::Eval};
-  return &r;
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/split_test.cc b/tensorflow/contrib/lite/kernels/split_test.cc
deleted file mode 100644
index 61a0759c6475795c06a9b55d3586d2b818f298b2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/split_test.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-
-constexpr int kAxisIsATensor = -1000;
-
-class SplitOpModel : public SingleOpModel {
- public:
-  SplitOpModel(const TensorData& input, int num_splits,
-               int axis = kAxisIsATensor) {
-    if (axis == kAxisIsATensor) {
-      axis_ = AddInput({TensorType_INT32, {1}});
-    } else {
-      axis_ = AddConstInput(TensorType_INT32, {axis}, {1});
-    }
-    input_ = AddInput(input);
-    for (int i = 0; i < num_splits; ++i) {
-      outputs_.push_back(AddOutput(input.type));
-    }
-    SetBuiltinOp(BuiltinOperator_SPLIT, BuiltinOptions_SplitOptions,
-                 CreateSplitOptions(builder_, num_splits).Union());
-    if (axis == kAxisIsATensor) {
-      BuildInterpreter({GetShape(axis_), GetShape(input_)});
-    } else {
-      BuildInterpreter({{}, GetShape(input_)});
-    }
-  }
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
-  }
-  void SetAxis(int axis) { PopulateTensor(axis_, {axis}); }
-
-  std::vector<float> GetOutput(int i) {
-    return ExtractVector<float>(outputs_[i]);
-  }
-  std::vector<int> GetOutputShape(int i) { return GetTensorShape(outputs_[i]); }
-
- private:
-  int input_;
-  int axis_;
-  std::vector<int> outputs_;
-};
-
-using TensorValues = std::initializer_list<float>;
-
-void Check(int axis, int num_splits, std::initializer_list<int> input_shape,
-           std::initializer_list<int> output_shape,
-           const TensorValues& input_data,
-           const std::vector<TensorValues>& output_data) {
-  auto debug = [&](int i) {
-    std::stringstream ss;
-    ss << "for output tensor " << i << " axis=" << axis
-       << " and num_splits=" << num_splits;
-    return ss.str();
-  };
-  SplitOpModel m({TensorType_FLOAT32, input_shape}, num_splits);
-  m.SetInput(input_data);
-  m.SetAxis(axis);
-  m.Invoke();
-  for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(m.GetOutput(i), ElementsAreArray(output_data[i])) << debug(i);
-    EXPECT_THAT(m.GetOutputShape(i), ElementsAreArray(output_shape))
-        << debug(i);
-  }
-
-  SplitOpModel const_m({TensorType_FLOAT32, input_shape}, num_splits, axis);
-  const_m.SetInput(input_data);
-  const_m.Invoke();
-  for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(const_m.GetOutput(i), ElementsAreArray(output_data[i]))
-        << debug(i);
-    EXPECT_THAT(const_m.GetOutputShape(i), ElementsAreArray(output_shape))
-        << debug(i);
-  }
-}
-
-TEST(SplitOpTest, FourDimensional) {
-  Check(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 9, 10, 11, 12},
-            {5, 6, 7, 8, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 5, 6, 9, 10, 13, 14},
-            {3, 4, 7, 8, 11, 12, 15, 16},
-        });
-  Check(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 3, 5, 7, 9, 11, 13, 15},
-            {2, 4, 6, 8, 10, 12, 14, 16},
-        });
-}
-
-TEST(SplitOpTest, OneDimensional) {
-  Check(/*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
-        {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
-}
-
-TEST(SplitOpTest, NegativeAxis) {
-  Check(/*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
deleted file mode 100644
index 05a7c23ba10ef717ee3debf0a6316885d4612746..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-
-#include "tensorflow/contrib/lite/version.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tflite {
-
-using ::testing::FloatNear;
-using ::testing::Matcher;
-
-std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values,
-                                           float max_abs_error) {
-  std::vector<Matcher<float>> matchers;
-  matchers.reserve(values.size());
-  for (const float& v : values) {
-    matchers.emplace_back(FloatNear(v, max_abs_error));
-  }
-  return matchers;
-}
-
-int SingleOpModel::AddInput(const TensorData& t, bool is_variable) {
-  int id = AddTensor<float>(t, {}, is_variable);
-  inputs_.push_back(id);
-  return id;
-}
-
-int SingleOpModel::AddNullInput() {
-  int id = kOptionalTensor;
-  inputs_.push_back(id);
-  return id;
-}
-
-int SingleOpModel::AddOutput(const TensorData& t) {
-  int id = AddTensor<float>(t, {});
-  outputs_.push_back(id);
-  return id;
-}
-
-void SingleOpModel::SetBuiltinOp(BuiltinOperator type,
-                                 BuiltinOptions builtin_options_type,
-                                 flatbuffers::Offset<void> builtin_options) {
-  opcodes_.push_back(CreateOperatorCode(builder_, type, 0));
-  operators_.push_back(CreateOperator(
-      builder_, /*opcode_index=*/0, builder_.CreateVector<int32_t>(inputs_),
-      builder_.CreateVector<int32_t>(outputs_), builtin_options_type,
-      builtin_options,
-      /*custom_options=*/0, CustomOptionsFormat_FLEXBUFFERS));
-}
-
-void SingleOpModel::SetCustomOp(
-    const string& name, const std::vector<uint8_t>& custom_option,
-    const std::function<TfLiteRegistration*()>& registration) {
-  custom_registrations_[name] = registration;
-  opcodes_.push_back(
-      CreateOperatorCodeDirect(builder_, BuiltinOperator_CUSTOM, name.data()));
-  operators_.push_back(CreateOperator(
-      builder_, /*opcode_index=*/0, builder_.CreateVector<int32_t>(inputs_),
-      builder_.CreateVector<int32_t>(outputs_), BuiltinOptions_NONE, 0,
-      builder_.CreateVector<uint8_t>(custom_option),
-      CustomOptionsFormat_FLEXBUFFERS));
-}
-
-void SingleOpModel::BuildInterpreter(std::vector<std::vector<int>> input_shapes,
-                                     bool allow_fp32_relax_to_fp16) {
-  auto opcodes = builder_.CreateVector(opcodes_);
-  auto operators = builder_.CreateVector(operators_);
-  auto tensors = builder_.CreateVector(tensors_);
-  auto inputs = builder_.CreateVector<int32_t>(inputs_);
-  auto outputs = builder_.CreateVector<int32_t>(outputs_);
-  // Create a single subgraph
-  std::vector<flatbuffers::Offset<SubGraph>> subgraphs;
-  auto subgraph = CreateSubGraph(builder_, tensors, inputs, outputs, operators);
-  subgraphs.push_back(subgraph);
-  auto subgraphs_flatbuffer = builder_.CreateVector(subgraphs);
-
-  auto buffers = builder_.CreateVector(buffers_);
-  auto description = builder_.CreateString("programmatic model");
-  builder_.Finish(CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
-                              subgraphs_flatbuffer, description, buffers));
-
-  auto* model = GetModel(builder_.GetBufferPointer());
-
-  if (!resolver_) {
-    auto resolver = new ops::builtin::BuiltinOpResolver();
-    for (const auto& reg : custom_registrations_) {
-      resolver->AddCustom(reg.first.data(), reg.second());
-    }
-    resolver_ = std::unique_ptr<OpResolver>(resolver);
-  }
-  CHECK(InterpreterBuilder(model, *resolver_)(&interpreter_) == kTfLiteOk);
-
-  CHECK(interpreter_ != nullptr);
-
-  int i = 0;
-  for (const auto& shape : input_shapes) {
-    int input_idx = interpreter_->inputs()[i++];
-    if (input_idx == kOptionalTensor) continue;
-    if (shape.empty()) continue;
-    CHECK(interpreter_->ResizeInputTensor(input_idx, shape) == kTfLiteOk);
-  }
-
-  interpreter_->SetAllowFp16PrecisionForFp32(allow_fp32_relax_to_fp16);
-
-  // Modify delegate with function.
-  if (apply_delegate_fn_) {
-    apply_delegate_fn_(interpreter_.get());
-  }
-
-  CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
-      << "Cannot allocate tensors";
-  interpreter_->ResetVariableTensors();
-}
-
-void SingleOpModel::Invoke() { CHECK(interpreter_->Invoke() == kTfLiteOk); }
-
-int32_t SingleOpModel::GetTensorSize(int index) const {
-  TfLiteTensor* t = interpreter_->tensor(index);
-  CHECK(t);
-  int total_size = 1;
-  for (int i = 0; i < t->dims->size; ++i) {
-    total_size *= t->dims->data[i];
-  }
-  return total_size;
-}
-
-template <>
-std::vector<string> SingleOpModel::ExtractVector(int index) {
-  TfLiteTensor* tensor_ptr = interpreter_->tensor(index);
-  CHECK(tensor_ptr != nullptr);
-  const int num_strings = GetStringCount(tensor_ptr);
-  std::vector<string> result;
-  result.reserve(num_strings);
-  for (int i = 0; i < num_strings; ++i) {
-    const auto str = GetString(tensor_ptr, i);
-    result.emplace_back(str.str, str.len);
-  }
-  return result;
-}
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
deleted file mode 100644
index 84deb0e0e8d618bbc2ab19921bf04919427e5e51..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ /dev/null
@@ -1,386 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
-
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/testing/util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tflite {
-
-// A gmock matcher that check that elements of a float vector match to a given
-// tolerance.
-std::vector<::testing::Matcher<float>> ArrayFloatNear(
-    const std::vector<float>& values, float max_abs_error = 1e-5);
-
-template <typename T>
-inline std::vector<T> Quantize(const std::vector<float>& data, float scale,
-                               int32_t zero_point) {
-  std::vector<T> q;
-  for (float f : data) {
-    q.push_back(static_cast<T>(std::max<float>(
-        std::numeric_limits<T>::min(),
-        std::min<float>(std::numeric_limits<T>::max(),
-                        std::round(zero_point + (f / scale))))));
-  }
-  return q;
-}
-
-template <typename T>
-inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
-                                     int32_t zero_point) {
-  std::vector<float> f;
-  for (T q : data) {
-    f.push_back(scale * (q - zero_point));
-  }
-  return f;
-}
-
-// A test model that contains a single operator. All operator inputs and
-// output are external to the model, so the tests can directly access them.
-// Typical usage:
-//    SingleOpModel m;
-//    int a = m.AddInput({TensorType_FLOAT32, a_shape});
-//    int b = m.AddInput({TensorType_FLOAT32, b_shape});
-//    int c = m.AddOutput({TensorType_FLOAT32, {}});
-//    m.SetBuiltinOp(...);
-//    m.BuildInterpreter({GetShape(a), GetShape(b)});
-//    m.PopulateTensor(a, {...});
-//    m.PopulateTensor(b, {...});
-//    m.Invoke();
-//    EXPECT_THAT(m.ExtractVector<float>(c), ArrayFloatNear({...}));
-//
-
-// A helper struct to construct test tensors. This is particularly useful for
-// quantized tensor which must have their scale and zero_point defined before
-// the actual data is known. This mimics what happens in practice: quantization
-// parameters are calculate during training.
-struct TensorData {
-  TensorType type;
-  std::vector<int> shape;
-  float min;
-  float max;
-  float scale;
-  int32_t zero_point;
-};
-
-class SingleOpResolver : public OpResolver {
- public:
-  SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration)
-      : op_(op), registration_(*registration) {
-    registration_.builtin_code = static_cast<int32_t>(op);
-    registration_.version = 1;
-  }
-  const TfLiteRegistration* FindOp(BuiltinOperator op,
-                                   int version) const override {
-    if (op == op_) {
-      return &registration_;
-    }
-    return nullptr;
-  }
-  const TfLiteRegistration* FindOp(const char* op, int version) const override {
-    return nullptr;
-  }
-
- private:
-  const BuiltinOperator op_;
-  TfLiteRegistration registration_;
-};
-
-class SingleOpModel {
- public:
-  SingleOpModel() {}
-  ~SingleOpModel() {}
-
-  // Set a function callback that is run right after graph is prepared
-  // that allows applying external delegates. This is useful for testing
-  // other runtimes like NN API or GPU.
-  void SetApplyDelegate(std::function<void(Interpreter*)> apply_delegate_fn) {
-    apply_delegate_fn_ = apply_delegate_fn;
-  }
-
-  // Copying or assignment is disallowed to simplify ownership semantics.
-  SingleOpModel(const SingleOpModel&) = delete;
-  SingleOpModel& operator=(const SingleOpModel&) = delete;
-
-  // Add a TensorType input tensor and return its index.
-  int AddInput(TensorType type, bool is_variable = false) {
-    return AddInput(TensorData{type}, is_variable);
-  }
-  int AddInput(const TensorData& t, bool is_variable = false);
-
-  // Templated version of AddConstInput().
-  template <typename T>
-  int AddConstInput(TensorType type, std::initializer_list<T> data,
-                    std::initializer_list<int> shape) {
-    int id = AddTensor(TensorData{type, shape}, data);
-    inputs_.push_back(id);
-    return id;
-  }
-
-  // Add a null input tensor (optional input) and return kOptionalTensor.
-  int AddNullInput();
-
-  // Add a TensorType output tensor and return its index.
-  int AddOutput(TensorType type) { return AddOutput(TensorData{type}); }
-  int AddOutput(const TensorData& t);
-
-  template <typename T>
-  void QuantizeAndPopulate(int index, const std::vector<float>& data) {
-    TfLiteTensor* t = interpreter_->tensor(index);
-    auto q = Quantize<T>(data, t->params.scale, t->params.zero_point);
-    PopulateTensor(index, 0, q.data(), q.data() + q.size());
-  }
-
-  void SymmetricQuantizeAndPopulate(int index, const std::vector<float>& data) {
-    TfLiteTensor* t = interpreter_->tensor(index);
-    const int length = data.size();
-    std::vector<int8_t> q(length);
-    float min, max, scaling_factor;
-    tensor_utils::SymmetricQuantizeFloats(data.data(), length, q.data(), &min,
-                                          &max, &scaling_factor);
-    // Update quantization params.
-    t->params.scale = scaling_factor;
-    t->params.zero_point = 0;
-    PopulateTensor(index, /*offset=*/0, reinterpret_cast<uint8_t*>(q.data()),
-                   reinterpret_cast<uint8_t*>(q.data() + q.size()));
-  }
-
-  const std::vector<int>& GetShape(int id) { return tensor_data_.at(id).shape; }
-
-  float GetScale(int id) { return tensor_data_.at(id).scale; }
-  int32_t GetZeroPoint(int id) { return tensor_data_.at(id).zero_point; }
-
-  // Define the operator in this model.
-  void SetBuiltinOp(BuiltinOperator type, BuiltinOptions builtin_options_type,
-                    flatbuffers::Offset<void> builtin_options);
-  void SetCustomOp(const string& name,
-                   const std::vector<uint8_t>& custom_option,
-                   const std::function<TfLiteRegistration*()>& registeration);
-
-  // Build the interpreter for this model. Also, resize and allocate all
-  // tensors given the shapes of the inputs.
-  void BuildInterpreter(std::vector<std::vector<int>> input_shapes,
-                        bool allow_fp32_relax_to_fp16 = false);
-
-  void Invoke();
-
-  void PopulateStringTensor(int index, const std::vector<string>& content) {
-    auto tensor = interpreter_->tensor(index);
-    DynamicBuffer buf;
-    for (const string& s : content) {
-      buf.AddString(s.data(), s.length());
-    }
-    buf.WriteToTensor(tensor);
-  }
-
-  // Populate the tensor given its index.
-  // TODO(b/110696148) clean up and merge with vector-taking variant below.
-  template <typename T>
-  void PopulateTensor(int index, const std::initializer_list<T>& data) {
-    T* v = interpreter_->typed_tensor<T>(index);
-    CHECK(v) << "No tensor with index '" << index << "'.";
-    for (T f : data) {
-      *v = f;
-      ++v;
-    }
-  }
-
-  // Populate the tensor given its index.
-  // TODO(b/110696148) clean up and merge with initializer_list-taking variant
-  // above.
-  template <typename T>
-  void PopulateTensor(int index, const std::vector<T>& data) {
-    T* v = interpreter_->typed_tensor<T>(index);
-    CHECK(v) << "No tensor with index '" << index << "'.";
-    for (T f : data) {
-      *v = f;
-      ++v;
-    }
-  }
-
-  // Partially populate the tensor, starting at the given offset.
-  template <typename T>
-  void PopulateTensor(int index, int offset, T* begin, T* end) {
-    T* v = interpreter_->typed_tensor<T>(index);
-    memcpy(v + offset, begin, (end - begin) * sizeof(T));
-  }
-
-  // Return a vector with the flattened contents of a tensor.
-  template <typename T>
-  std::vector<T> ExtractVector(int index) {
-    T* v = interpreter_->typed_tensor<T>(index);
-    CHECK(v);
-    return std::vector<T>(v, v + GetTensorSize(index));
-  }
-
-  std::vector<int> GetTensorShape(int index) {
-    std::vector<int> result;
-    TfLiteTensor* t = interpreter_->tensor(index);
-    for (int i = 0; i < t->dims->size; ++i) {
-      result.push_back(t->dims->data[i]);
-    }
-    return result;
-  }
-
-  void SetResolver(std::unique_ptr<OpResolver> resolver) {
-    resolver_ = std::move(resolver);
-  }
-
- protected:
-  int32_t GetTensorSize(int index) const;
-
-  flatbuffers::FlatBufferBuilder builder_;
-  std::unique_ptr<tflite::Interpreter> interpreter_;
-  std::unique_ptr<OpResolver> resolver_;
-
- private:
-  // TODO(gavinbelson): sync this method with
-  // //tensorflow/contrib/lite/kernels/internal/quantization_util.h?l=31
-  template <typename T>
-  std::pair<float, int32_t> QuantizationParams(float f_min, float f_max) {
-    // These are required by many quantized operations.
-    CHECK_LE(f_min, 0);
-    CHECK_GE(f_max, 0);
-    T q_min = std::numeric_limits<T>::min();
-    T q_max = std::numeric_limits<T>::max();
-    float range = q_max - q_min;
-    float scale = (f_max - f_min) / range;
-    int32_t zero_point = std::min(
-        q_max,
-        std::max(q_min, static_cast<T>(std::round(q_min - f_min / scale))));
-    return {scale, zero_point};
-  }
-
-  template <typename T>
-  int AddTensor(TensorData t, std::initializer_list<T> data,
-                bool is_variable = false) {
-    int id = tensors_.size();
-
-    // This is slightly different depending on whether we are adding a
-    // quantized or a regular tensor.
-    bool is_quantized = (t.min != 0 || t.max != 0 || t.scale != 0);
-
-    flatbuffers::Offset<QuantizationParameters> q_params = 0;
-
-    if (is_quantized) {
-      if (t.min != 0 || t.max != 0) {
-        if (t.type == TensorType_UINT8) {
-          std::tie(t.scale, t.zero_point) =
-              QuantizationParams<uint8_t>(t.min, t.max);
-        } else if (t.type == TensorType_INT32) {
-          std::tie(t.scale, t.zero_point) =
-              QuantizationParams<int32_t>(t.min, t.max);
-        } else if (t.type == TensorType_INT16) {
-          std::tie(t.scale, t.zero_point) =
-              QuantizationParams<int16_t>(t.min, t.max);
-        } else {
-          LOG(FATAL) << "No support for the requested quantized type";
-        }
-        t.min = 0;
-        t.max = 0;
-      }
-
-      q_params = CreateQuantizationParameters(
-          builder_, /*min=*/0, /*max=*/0,
-          builder_.CreateVector<float>({t.scale}),
-          builder_.CreateVector<int64_t>({t.zero_point}));
-    }
-
-    int buffer_id = 0;
-    if (data.size()) {
-      // Initialize buffers list with empty buffer to allow for non-const
-      // tensors.
-      if (buffers_.empty()) {
-        buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
-      }
-
-      // Add data as a Buffer to buffers list.
-      buffer_id = buffers_.size();
-      auto data_buffer =
-          builder_.CreateVector(reinterpret_cast<const uint8_t*>(data.begin()),
-                                sizeof(T) * data.size());
-      buffers_.push_back(CreateBuffer(builder_, data_buffer));
-    }
-
-    tensors_.push_back(CreateTensor(builder_,
-                                    builder_.CreateVector<int>(t.shape), t.type,
-                                    /*buffer=*/buffer_id,
-                                    /*name=*/0, q_params, is_variable));
-
-    tensor_data_[id] = t;
-
-    return id;
-  }
-
-  std::map<int, TensorData> tensor_data_;
-  std::vector<int32_t> inputs_;
-  std::vector<int32_t> outputs_;
-  std::vector<flatbuffers::Offset<Tensor>> tensors_;
-  std::vector<flatbuffers::Offset<OperatorCode>> opcodes_;
-  std::vector<flatbuffers::Offset<Operator>> operators_;
-  std::vector<flatbuffers::Offset<Buffer>> buffers_;
-  std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
-  // A function pointer that gets called after the interpreter is created but
-  // before evaluation happens. This is useful for applying a delegate.
-  std::function<void(Interpreter*)> apply_delegate_fn_;
-};
-
-// Base class for single op unit tests.
-// The tests are parameterized to test multiple kernels for a single op.
-// The parameters are strings like "optimized" and "reference" to have better
-// readability in test reports.
-//
-// To use this class:
-// * Define a constant map from strings to TfLiteRegistration.
-// * Implement a test class that inherits SingleOpTest.
-// * Instantiate the test cases with SingleOpTest::GetKernelTags helper
-//   function.
-// * Call GetRegistration to get the TfLiteRegistration to be used before
-//   building the interpreter.
-class SingleOpTest : public ::testing::TestWithParam<string> {
- public:
-  static std::vector<string> GetKernelTags(
-      const std::map<string, TfLiteRegistration*>& kernel_map) {
-    std::vector<string> tags;
-    for (auto it : kernel_map) {
-      tags.push_back(it.first);
-    }
-    return tags;
-  }
-
- protected:
-  virtual const std::map<string, TfLiteRegistration*>& GetKernelMap() = 0;
-  TfLiteRegistration* GetRegistration() {
-    return GetKernelMap().at(GetParam());
-  }
-};
-
-// Strings have a special implementation that is in test_util.cc
-template <>
-std::vector<string> SingleOpModel::ExtractVector(int index);
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/tile_test.cc b/tensorflow/contrib/lite/kernels/tile_test.cc
deleted file mode 100644
index e73ca7b7504f6fe891f310d181b0039893f18852..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/tile_test.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-class TileOpModel : public SingleOpModel {
- public:
-  TileOpModel(std::initializer_list<int> input_shape, TensorType input_type,
-              TensorType multiply_type) {
-    input_ = AddInput(input_type);
-    multipliers_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(input_type);
-    SetBuiltinOp(BuiltinOperator_TILE, BuiltinOptions_TileOptions, 0);
-    BuildInterpreter({input_shape, {static_cast<int>(input_shape.size())}});
-  }
-
-  void SetInputFloat(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
-  }
-
-  void SetInputUInt8(std::initializer_list<uint8_t> data) {
-    PopulateTensor<uint8_t>(input_, data);
-  }
-
-  void SetInputInt32(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(input_, data);
-  }
-
-  void SetInputInt64(std::initializer_list<int64_t> data) {
-    PopulateTensor<int64_t>(input_, data);
-  }
-
-  void SetMultipliers(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(multipliers_, data);
-  }
-
-  std::vector<float> GetOutputFloat() { return ExtractVector<float>(output_); }
-
-  std::vector<uint8_t> GetOutputUInt8() { return ExtractVector<uint8_t>(output_); }
-
-  std::vector<int32_t> GetOutputInt32() { return ExtractVector<int32_t>(output_); }
-
-  std::vector<int64_t> GetOutputInt64() {
-    return ExtractVector<int64_t>(output_);
-  }
-
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- protected:
-  int input_;
-  int multipliers_;
-  int output_;
-};
-
-TEST(TileTest, Float32Vector) {
-  TileOpModel m({3}, TensorType_FLOAT32, TensorType_INT32);
-  m.SetInputFloat({1.f, 2.f, 3.f});
-  m.SetMultipliers({2});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(),
-              ElementsAreArray({1.f, 2.f, 3.f, 1.f, 2.f, 3.f}));
-}
-
-TEST(TileTest, Float32Matrix) {
-  TileOpModel m({2, 3}, TensorType_FLOAT32, TensorType_INT32);
-  m.SetInputFloat({
-      11.f,
-      12.f,
-      13.f,
-      21.f,
-      22.f,
-      23.f,
-  });
-  m.SetMultipliers({2, 1});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray({
-                                      11.f,
-                                      12.f,
-                                      13.f,
-                                      21.f,
-                                      22.f,
-                                      23.f,
-                                      11.f,
-                                      12.f,
-                                      13.f,
-                                      21.f,
-                                      22.f,
-                                      23.f,
-                                  }));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
-}
-
-TEST(TileTest, Float32HighDimension) {
-  TileOpModel m({1, 2, 3}, TensorType_FLOAT32, TensorType_INT32);
-  m.SetInputFloat({
-      11.f,
-      12.f,
-      13.f,
-      21.f,
-      22.f,
-      23.f,
-  });
-  m.SetMultipliers({2, 3, 1});
-  m.Invoke();
-  EXPECT_THAT(
-      m.GetOutputFloat(),
-      ElementsAreArray({11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f,
-                        21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f,
-                        11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f,
-                        21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f}));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 6, 3}));
-}
-
-TEST(TileTest, Uint8Matrix) {
-  TileOpModel m({2, 3}, TensorType_UINT8, TensorType_INT32);
-  m.SetInputUInt8({
-      11,
-      12,
-      13,
-      21,
-      22,
-      23,
-  });
-  m.SetMultipliers({2, 1});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputUInt8(), ElementsAreArray({
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                  }));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
-}
-
-TEST(TileTest, Int32Matrix) {
-  TileOpModel m({2, 3}, TensorType_INT32, TensorType_INT32);
-  m.SetInputInt32({
-      11,
-      12,
-      13,
-      21,
-      22,
-      23,
-  });
-  m.SetMultipliers({2, 1});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputInt32(), ElementsAreArray({
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                  }));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
-}
-
-TEST(TileTest, Int64Matrix) {
-  TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT32);
-  m.SetInputInt64({
-      11,
-      12,
-      13,
-      21,
-      22,
-      23,
-  });
-  m.SetMultipliers({2, 1});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                  }));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
-}
-
-TEST(TileTest, Int64Matrix64Multipliers) {
-  TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT64);
-  m.SetInputInt64({
-      11,
-      12,
-      13,
-      21,
-      22,
-      23,
-  });
-  m.SetMultipliers({2, 1});
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                      11,
-                                      12,
-                                      13,
-                                      21,
-                                      22,
-                                      23,
-                                  }));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
-}
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/transpose_conv.cc b/tensorflow/contrib/lite/kernels/transpose_conv.cc
deleted file mode 100644
index 1c4a5ee91d038cb222820c4d35fc713f8f41cb63..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/transpose_conv.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/kernels/padding.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace transpose_conv {
-
-constexpr int kOutputShapeTensor = 0;
-constexpr int kWeightsTensor = 1;
-constexpr int kDataInputTensor = 2;
-constexpr int kOutputTensor = 0;
-
-TfLiteStatus ResizeOutputShape(TfLiteContext* context,
-                               const TfLiteTensor* output_shape,
-                               TfLiteTensor* output) {
-  // Currently only support int32 for output shape.
-  if (output_shape->type != kTfLiteInt32) {
-    context->ReportError(context, "Output shape is %d, not int32.",
-                         output_shape->type);
-    return kTfLiteError;
-  }
-  const int output_dimensions = NumElements(output_shape);
-  TfLiteIntArray* output_shape_array = TfLiteIntArrayCreate(output_dimensions);
-  for (int i = 0; i < output_dimensions; ++i) {
-    output_shape_array->data[i] = GetTensorData<int32_t>(output_shape)[i];
-  }
-
-  return context->ResizeTensor(context, output, output_shape_array);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
-  const TfLiteTensor* output_shape =
-      GetInput(context, node, kOutputShapeTensor);
-  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* input = GetInput(context, node, kDataInputTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  TF_LITE_ENSURE_EQ(context, NumDimensions(output_shape), 1);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 4);
-
-  // Currently only supports float32.
-  const TfLiteType data_type = input->type;
-  TF_LITE_ENSURE(context, data_type == kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, output->type, data_type);
-  TF_LITE_ENSURE_EQ(context, weights->type, data_type);
-
-  // Ensure that weights and inputs have the same channel dimension.
-  // Note: TOCO will reorder weights in the following format: OHWI.
-  TF_LITE_ENSURE_EQ(context, SizeOfDimension(input, 3),
-                    SizeOfDimension(weights, 3));
-
-  if (!IsConstantTensor(output_shape)) {
-    SetTensorToDynamic(output);
-    return kTfLiteOk;
-  }
-  return ResizeOutputShape(context, output_shape, output);
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* output_shape =
-      GetInput(context, node, kOutputShapeTensor);
-  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* input = GetInput(context, node, kDataInputTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  const auto* params =
-      reinterpret_cast<TfLiteTransposeConvParams*>(node->builtin_data);
-
-  if (IsDynamicTensor(output)) {
-    TF_LITE_ENSURE_OK(context,
-                      ResizeOutputShape(context, output_shape, output));
-  }
-
-  // Get height and width of the output image.
-  const int width = SizeOfDimension(output, 2);
-  const int height = SizeOfDimension(output, 1);
-  const int filter_width = SizeOfDimension(weights, 1);
-  const int filter_height = SizeOfDimension(weights, 2);
-
-  const int stride_width = params->stride_width;
-  const int stride_height = params->stride_height;
-
-  const TfLitePaddingValues& padding_size =
-      ComputePaddingHeightWidth(stride_height, stride_width, 1, height, width,
-                                filter_height, filter_width, params->padding);
-
-  // Currently only support float32.
-  switch (input->type) {
-    case kTfLiteFloat32: {
-      tflite::ConvParams op_params;
-      op_params.padding_type = PaddingType::kSame;
-      op_params.padding_values.width = padding_size.width;
-      op_params.padding_values.height = padding_size.height;
-      op_params.stride_width = stride_width;
-      op_params.stride_height = stride_height;
-
-      reference_ops::TransposeConv(
-          op_params, GetTensorShape(input), GetTensorData<float>(input),
-          GetTensorShape(weights), GetTensorData<float>(weights),
-          GetTensorShape(output), GetTensorData<float>(output),
-          // Last two args specify im2col which reference_ops ignores.
-          // (Note this does not lead to a performance regression, as the
-          // previous optimized version was just a copy of the reference code.)
-          // TODO(b/110208176): Allocate im2col tensors and switch to
-          // optimized_ops.
-          GetTensorShape(output), GetTensorData<float>(output));
-      break;
-    }
-    default:
-      context->ReportError(context, "Type %d, not currently supported.",
-                           input->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace transpose_conv
-
-TfLiteRegistration* Register_TRANSPOSE_CONV() {
-  static TfLiteRegistration r = {nullptr, nullptr, transpose_conv::Prepare,
-                                 transpose_conv::Eval};
-  return &r;
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/transpose_conv_test.cc b/tensorflow/contrib/lite/kernels/transpose_conv_test.cc
deleted file mode 100644
index 55df8971806ed0baae9f5bcaebd24fb8065ec300..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/transpose_conv_test.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <cstdarg>
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class TransposeConvOpModel : public SingleOpModel {
- public:
-  TransposeConvOpModel(std::initializer_list<int> input_shape,
-                       std::initializer_list<int> filter_shape, Padding padding,
-                       int stride_w, int stride_h) {
-    output_shape_ = AddInput(TensorType_INT32);
-    filter_ = AddInput(TensorType_FLOAT32);
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(
-        BuiltinOperator_TRANSPOSE_CONV, BuiltinOptions_TransposeConvOptions,
-        CreateTransposeConvOptions(builder_, padding, stride_w, stride_h)
-            .Union());
-    BuildInterpreter({{4}, filter_shape, input_shape});
-  }
-
-  int output_shape() { return output_shape_; }
-  int filter() { return filter_; }
-  int input() { return input_; }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- private:
-  int output_shape_;
-  int filter_;
-  int input_;
-  int output_;
-};
-
-// Test case:
-// output = tf.nn.conv2d_backprop_input(
-//     tf.constant([ 1, 4, 4, 1 ]),
-//     tf.constant(np.arange(1, 10), shape=[ 3, 3, 1, 1 ], dtype=tf.float32),
-//     tf.constant(np.arange(1, 17), shape=[ 1, 4, 4, 1 ], dtype=tf.float32),
-//     [1, 1, 1, 1 ],
-//     "SAME")
-TEST(TransposeConvOpModelTest, SimpleTest) {
-  TransposeConvOpModel m({1, 4, 4, 1}, {1, 3, 3, 1}, Padding_SAME, 1, 1);
-  m.PopulateTensor<int>(m.output_shape(), {1, 4, 4, 1});
-  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9});
-  m.PopulateTensor<float>(
-      m.input(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(),
-              ElementsAreArray({29, 62, 83, 75, 99, 192, 237, 198, 207, 372,
-                                417, 330, 263, 446, 485, 365}));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-}
-
-// Test case:
-// filter = tf.constant(np.arange(1, 19),
-//                      shape=[ 3, 3, 1, 2 ],
-//                      dtype=tf.float32)
-// output = tf.nn.conv2d_backprop_input(
-//     tf.constant([ 1, 4, 4, 1 ]),
-//     filter,
-//     tf.constant(np.arange(1, 33), shape=[ 1, 4, 4, 2 ], dtype=tf.float32),
-//     [1, 1, 1, 1 ],
-//     "SAME")
-// And filter value is derived by:
-// filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[18, 1])
-TEST(TransposeConvOpModelTest, TwoFiltersTest) {
-  TransposeConvOpModel m({1, 4, 4, 2}, {1, 3, 3, 2}, Padding_SAME, 1, 1);
-  m.PopulateTensor<int>(m.output_shape(), {1, 4, 4, 1});
-  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-                                       13, 14, 15, 16, 17, 18});
-  m.PopulateTensor<float>(
-      m.input(),
-      {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
-       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(),
-              ElementsAreArray({184, 412, 568, 528, 678, 1347, 1689, 1434, 1494,
-                                2715, 3057, 2442, 1968, 3352, 3652, 2760}));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-}
-
-// Test case:
-// filter = tf.constant(np.arange(1, 19),
-//                      shape=[ 3, 3, 1, 2 ],
-//                      dtype=tf.float32)
-// output = tf.nn.conv2d_backprop_input(
-//     tf.constant([ 1, 6, 6, 1 ]),
-//     filter,
-//     tf.constant(np.arange(1, 33), shape=[ 1, 4, 4, 2 ], dtype=tf.float32),
-//     [1, 1, 1, 1 ],
-//     "VALID")
-// And filter value is derived by:
-// filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[1, 18])
-TEST(TransposeConvOpModelTest, PaddingValidTest) {
-  TransposeConvOpModel m({1, 4, 4, 2}, {1, 3, 3, 2}, Padding_VALID, 1, 1);
-  m.PopulateTensor<int>(m.output_shape(), {1, 6, 6, 1});
-  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-                                       13, 14, 15, 16, 17, 18});
-  m.PopulateTensor<float>(
-      m.input(),
-      {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
-       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
-  m.Invoke();
-
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray({5,    22,   59,   101,  114,  83,   52,   184,  412,
-                        568,  528,  344,  237,  678,  1347, 1689, 1434, 879,
-                        597,  1494, 2715, 3057, 2442, 1431, 856,  1968, 3352,
-                        3652, 2760, 1548, 689,  1534, 2543, 2729, 2010, 1103}));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 6, 6, 1}));
-}
-
-// Test case:
-// filter = tf.constant(np.arange(1, 10),
-//                      shape=[ 3, 3, 1, 1 ],
-//                      dtype=tf.float32)
-// output = tf.nn.conv2d_backprop_input(
-//     tf.constant([ 1, 5, 5, 1 ]),
-//     filter,
-//     tf.constant(np.arange(1, 5), shape=[ 1, 2, 2, 1 ], dtype=tf.float32),
-//     [1, 2, 2, 1 ],
-//     "VALID")
-TEST(TransposeConvOpModelTest, StrideValidTest) {
-  TransposeConvOpModel m({1, 2, 2, 1}, {1, 3, 3, 1}, Padding_VALID, 2, 2);
-  m.PopulateTensor<int>(m.output_shape(), {1, 5, 5, 1});
-  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9});
-  m.PopulateTensor<float>(m.input(), {1, 2, 3, 4});
-  m.Invoke();
-
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray({1,  2,  5,  4,  6,  4,  5,  14, 10, 12, 10, 14, 36,
-                        24, 30, 12, 15, 34, 20, 24, 21, 24, 55, 32, 36}));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 5, 5, 1}));
-}
-
-// Test case:
-// filter = tf.constant(np.arange(1, 19),
-//                      shape=[ 3, 3, 2, 1 ],
-//                      dtype=tf.float32)
-// output = tf.nn.conv2d_backprop_input(
-//     tf.constant([ 1, 5, 5, 2 ]),
-//     filter,
-//     tf.constant(np.arange(1, 5), shape=[ 1, 2, 2, 1 ], dtype=tf.float32),
-//     [1, 2, 2, 1 ],
-//     "VALID")
-TEST(TransposeConvOpModelTest, MultiChannelTest) {
-  TransposeConvOpModel m({1, 2, 2, 1}, {2, 3, 3, 1}, Padding_VALID, 2, 2);
-  m.PopulateTensor<int>(m.output_shape(), {1, 5, 5, 2});
-  m.PopulateTensor<float>(m.filter(), {1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6,
-                                       8, 10, 12, 14, 16, 18});
-  m.PopulateTensor<float>(m.input(), {1, 2, 3, 4});
-  m.Invoke();
-
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray({1,  2,  3,  4,  7,  10,  6,   8,  10, 12, 7,  8,  9,
-                        10, 25, 28, 18, 20, 22,  24,  16, 20, 24, 28, 62, 72,
-                        42, 48, 54, 60, 21, 24,  27,  30, 61, 68, 36, 40, 44,
-                        48, 39, 42, 45, 48, 103, 110, 60, 64, 68, 72}));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 5, 5, 2}));
-}
-
-// Test case:
-// filter = tf.constant(np.random.randint(1, 10, size=9),
-//                      shape=[ 3, 3, 1, 1 ],
-//                      dtype=tf.float32)
-// output = tf.nn.conv2d_backprop_input(
-//     tf.constant([ 1, 3, 4, 1 ]),
-//     filter,
-//     tf.constant([323, 521], shape=[ 1, 1, 2, 1], dtype=tf.float32),
-//     [1, 3, 3, 1 ],
-//     "SAME")
-// And filter value is derived by:
-// filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[-1])
-TEST(TransposeConvOpModelTest, AccuracyTest) {
-  TransposeConvOpModel m({1, 1, 2, 1}, {1, 3, 3, 1}, Padding_SAME, 3, 3);
-  m.PopulateTensor<int>(m.output_shape(), {1, 3, 4, 1});
-  m.PopulateTensor<float>(m.filter(), {9, 5, 6, 9, 8, 5, 3, 1, 4});
-  m.PopulateTensor<float>(m.input(), {323, 521});
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
-                                 {1615., 1938., 4689., 2605., 2584., 1615.,
-                                  4689., 4168., 323., 1292., 1563., 521.})));
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 4, 1}));
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
deleted file mode 100644
index 63817bd886508eb8159815bea22504c62a4f0fea..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
+++ /dev/null
@@ -1,805 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-
-namespace tflite {
-namespace ops {
-namespace builtin {
-namespace unidirectional_sequence_lstm {
-
-// Input Tensors of size {max_time, n_batch, n_input}
-constexpr int kInputTensor = 0;
-
-// Input weight tensors of size: {n_cell, n_input}
-constexpr int kInputToInputWeightsTensor = 1;  // Optional
-constexpr int kInputToForgetWeightsTensor = 2;
-constexpr int kInputToCellWeightsTensor = 3;
-constexpr int kInputToOutputWeightsTensor = 4;
-
-// Recurrent weight tensors of size {n_cell, n_output}
-constexpr int kRecurrentToInputWeightsTensor = 5;  // Optional
-constexpr int kRecurrentToForgetWeightsTensor = 6;
-constexpr int kRecurrentToCellWeightsTensor = 7;
-constexpr int kRecurrentToOutputWeightsTensor = 8;
-
-// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kCellToInputWeightsTensor = 9;    // Optional
-constexpr int kCellToForgetWeightsTensor = 10;  // Optional
-constexpr int kCellToOutputWeightsTensor = 11;  // Optional
-
-// Gates bias tensors of size {n_cell}
-constexpr int kInputGateBiasTensor = 12;  // Optional
-constexpr int kForgetGateBiasTensor = 13;
-constexpr int kCellGateBiasTensor = 14;
-constexpr int kOutputGateBiasTensor = 15;
-
-// Projection weight tensor of size {n_output, n_cell}
-constexpr int kProjectionWeightsTensor = 16;  // Optional
-// Projection bias tensor of size {n_output}
-constexpr int kProjectionBiasTensor = 17;  // Optional
-
-// Stateful input tensors that are variables and will be modified by the Op.
-// Activation state tensor of size {n_batch, n_output}
-constexpr int kInputActivationStateTensor = 18;
-// Cell state tensor of size {n_batch, n_cell}
-constexpr int kInputCellStateTensor = 19;
-
-// Output tensors.
-constexpr int kOutputTensor = 0;
-
-// Temporary tensors
-enum TemporaryTensor {
-  kScratchBuffer = 0,
-  kInputQuantized = 1,
-  kOutputStateQuantized = 2,
-  kCellStateQuantized = 3,
-  kScalingFactors = 4,
-  kProductScalingFactors = 5,
-  kRecoveredCellWeights = 6,
-  kNumTemporaryTensors = 7
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* scratch_tensor_index = new int();
-  context->AddTensors(context, kNumTemporaryTensors, scratch_tensor_index);
-  return scratch_tensor_index;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<int*>(buffer);
-}
-
-// Check that input tensor dimensions matches with each other.
-TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
-                                        TfLiteNode* node, int n_input,
-                                        int n_output, int n_cell) {
-  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
-
-  // Making sure clipping parameters have valid values.
-  // == 0 means no clipping
-  //  > 0 means clipping
-  TF_LITE_ENSURE(context, params->cell_clip >= 0);
-  TF_LITE_ENSURE(context, params->proj_clip >= 0);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  if (input_to_input_weights) {
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
-  }
-
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, kInputToCellWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  if (recurrent_to_input_weights) {
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
-                      n_cell);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
-                      n_output);
-  }
-
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
-                    n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
-                    n_output);
-
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, kRecurrentToCellWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
-                    n_output);
-
-  // We make sure the input-gate's parameters are either both present (regular
-  // LSTM) or not at all (CIFG-LSTM).
-  const bool cifg_weights_all_or_none =
-      ((input_to_input_weights != nullptr) &&
-       (recurrent_to_input_weights != nullptr)) ||
-      ((input_to_input_weights == nullptr) &&
-       (recurrent_to_input_weights == nullptr));
-  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  if (cell_to_input_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  if (cell_to_forget_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-  if (cell_to_output_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
-  }
-
-  // Making sure the peephole weights are there all or none.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool peephole_weights_all_or_none =
-      ((cell_to_input_weights != nullptr || use_cifg) &&
-       (cell_to_forget_weights != nullptr) &&
-       (cell_to_output_weights != nullptr)) ||
-      ((cell_to_input_weights == nullptr) &&
-       (cell_to_forget_weights == nullptr) &&
-       (cell_to_output_weights == nullptr));
-  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
-
-  // Make sure the input gate bias is present only when not a CIFG-LSTM.
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  if (use_cifg) {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
-  } else {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, kForgetGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, kOutputGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  if (projection_weights) {
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
-  }
-
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-  if (projection_bias) {
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
-  }
-
-  // Making sure the projection tensors are consistent:
-  // 1) If projection weight is not present, then projection bias should not be
-  // present.
-  // 2) If projection weight is present, then projection bias is optional.
-  // TODO(ghodrat): make sure this is correct.
-  const bool projecton_tensors_consistent =
-      ((projection_weights != nullptr) || (projection_bias == nullptr));
-  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
-
-  return kTfLiteOk;
-}
-
-// Resize the output and  state tensors based on the sizes of the input tensors.
-// Allocate a temprory scratch tensor. Also check that the sizes of the input
-// tensors match each other.
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
-
-  // Check we have all the inputs and outputs we need.
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 20);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-
-  // Inferring batch size, number of outputs and sequence length and
-  // number of cells from the input tensors.
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE(context, input->dims->size > 1);
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-
-  const TfLiteTensor* input_to_output_weights =
-      GetInput(context, node, kInputToOutputWeightsTensor);
-  const int n_cell = input_to_output_weights->dims->data[0];
-  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* recurrent_to_output_weights =
-      GetInput(context, node, kRecurrentToOutputWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
-                    n_cell);
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Check that input tensor dimensions matches with each other.
-  TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input,
-                                                        n_output, n_cell));
-
-  // Get the pointer to output, activation_state and cell_state buffer tensors.
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  TfLiteTensor* activation_state =
-      GetVariableInput(context, node, kInputActivationStateTensor);
-  TfLiteTensor* cell_state =
-      GetVariableInput(context, node, kInputCellStateTensor);
-
-  // Check the shape of input state tensors.
-  // These tensor may be 1D or 2D. It's fine as long as the total size is
-  // correct.
-  TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output);
-  TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell);
-
-  // Resize the output tensors.
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
-  output_size->data[0] = max_time;
-  output_size->data[1] = n_batch;
-  output_size->data[2] = n_output;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, output, output_size));
-
-  // The weights are of consistent type, so it suffices to check one.
-  // TODO(mirkov): create a utility/macro for this check, so all Ops can use it.
-  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
-                             input->type == kTfLiteFloat32);
-
-  TfLiteIntArrayFree(node->temporaries);
-  if (is_hybrid_op) {
-    node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
-  } else {
-    node->temporaries = TfLiteIntArrayCreate(1);
-  }
-  node->temporaries->data[0] = *scratch_tensor_index;
-
-  // Create a scratch buffer tensor.
-  TfLiteTensor* scratch_buffer = GetTemporary(context, node, kScratchBuffer);
-  scratch_buffer->type = input->type;
-  scratch_buffer->allocation_type = kTfLiteArenaRw;
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
-  scratch_buffer_size->data[0] = n_batch;
-  if (use_cifg) {
-    // Reserving space for Cell, Forget, Output gates
-    scratch_buffer_size->data[1] = n_cell * 3;
-  } else {
-    // Reserving space for Input, Cell, Forget, Output gates
-    scratch_buffer_size->data[1] = n_cell * 4;
-  }
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
-                                                   scratch_buffer_size));
-
-  if (is_hybrid_op) {
-    // Allocate temporary tensors to store quantized values of input,
-    // activation_state and cell_state tensors.
-    node->temporaries->data[kInputQuantized] =
-        *scratch_tensor_index + kInputQuantized;
-    TfLiteTensor* input_quantized =
-        GetTemporary(context, node, kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-    node->temporaries->data[kOutputStateQuantized] =
-        *scratch_tensor_index + kOutputStateQuantized;
-    TfLiteTensor* activation_state_quantized =
-        GetTemporary(context, node, kOutputStateQuantized);
-    activation_state_quantized->type = kTfLiteUInt8;
-    activation_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
-                             activation_state->dims)) {
-      TfLiteIntArray* activation_state_quantized_size =
-          TfLiteIntArrayCopy(activation_state->dims);
-      TF_LITE_ENSURE_OK(
-          context, context->ResizeTensor(context, activation_state_quantized,
-                                         activation_state_quantized_size));
-    }
-    node->temporaries->data[kCellStateQuantized] =
-        *scratch_tensor_index + kCellStateQuantized;
-    TfLiteTensor* cell_state_quantized =
-        GetTemporary(context, node, kCellStateQuantized);
-    cell_state_quantized->type = kTfLiteUInt8;
-    cell_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
-      TfLiteIntArray* cell_state_quantized_size =
-          TfLiteIntArrayCopy(cell_state->dims);
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, cell_state_quantized,
-                                              cell_state_quantized_size));
-    }
-
-    // Allocate temporary tensors to store scaling factors and product scaling
-    // factors. The latter is a convenience storage which allows to quantize
-    // a vector once (which produces the scaling factors) and multiply it with
-    // different matrices (which requires multiplying the scaling factors with
-    // the scaling factor of the matrix).
-    node->temporaries->data[kScalingFactors] =
-        *scratch_tensor_index + kScalingFactors;
-    TfLiteTensor* scaling_factors =
-        GetTemporary(context, node, kScalingFactors);
-    scaling_factors->type = kTfLiteFloat32;
-    scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-    scaling_factors_size->data[0] = n_batch;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
-                                                       scaling_factors_size));
-    }
-    node->temporaries->data[kProductScalingFactors] =
-        *scratch_tensor_index + kProductScalingFactors;
-    TfLiteTensor* prod_scaling_factors =
-        GetTemporary(context, node, kProductScalingFactors);
-    prod_scaling_factors->type = kTfLiteFloat32;
-    prod_scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
-    prod_scaling_factors_size->data[0] = n_batch;
-    if (!TfLiteIntArrayEqual(prod_scaling_factors->dims,
-                             prod_scaling_factors_size)) {
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, prod_scaling_factors,
-                                              prod_scaling_factors_size));
-    }
-
-    // Allocate a temporary tensor to store the recovered cell weights. Since
-    // this is used for diagonal matrices, only need to store n_cell values.
-    node->temporaries->data[kRecoveredCellWeights] =
-        *scratch_tensor_index + kRecoveredCellWeights;
-    TfLiteTensor* recovered_cell_weights =
-        GetTemporary(context, node, kRecoveredCellWeights);
-    recovered_cell_weights->type = kTfLiteFloat32;
-    recovered_cell_weights->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1);
-    recovered_cell_weights_size->data[0] = n_cell;
-    if (!TfLiteIntArrayEqual(recovered_cell_weights->dims,
-                             recovered_cell_weights_size)) {
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, recovered_cell_weights,
-                                              recovered_cell_weights_size));
-    }
-  }
-  return kTfLiteOk;
-}
-
-// The LSTM Op engine.
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
-    TfLiteTensor* activation_state, TfLiteTensor* cell_state,
-    TfLiteTensor* output) {
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  const float* input_to_input_weights_ptr =
-      (use_cifg) ? nullptr : input_to_input_weights->data.f;
-  const float* recurrent_to_input_weights_ptr =
-      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
-  const float* input_gate_bias_ptr =
-      (use_cifg) ? nullptr : input_gate_bias->data.f;
-  const float* cell_to_input_weights_ptr =
-      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
-  const float* cell_to_forget_weights_ptr =
-      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
-  const float* cell_to_output_weights_ptr =
-      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
-  const float* projection_weights_ptr =
-      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f;
-  const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f;
-  const float* input_to_output_weights_ptr = input_to_output_weights->data.f;
-  const float* recurrent_to_forget_weights_ptr =
-      recurrent_to_forget_weights->data.f;
-  const float* recurrent_to_cell_weights_ptr =
-      recurrent_to_cell_weights->data.f;
-  const float* recurrent_to_output_weights_ptr =
-      recurrent_to_output_weights->data.f;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-
-  // Feed the sequence into the LSTM step-by-step.
-  for (int t = 0; t < max_time; t++) {
-    const float* input_ptr_batch = input->data.f + t * n_batch * n_input;
-    float* output_ptr_batch = output->data.f + t * n_batch * n_output;
-
-    kernel_utils::LstmStep(
-        input_ptr_batch, input_to_input_weights_ptr,
-        input_to_forget_weights_ptr, input_to_cell_weights_ptr,
-        input_to_output_weights_ptr, recurrent_to_input_weights_ptr,
-        recurrent_to_forget_weights_ptr, recurrent_to_cell_weights_ptr,
-        recurrent_to_output_weights_ptr, cell_to_input_weights_ptr,
-        cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
-        input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
-        output_gate_bias_ptr, projection_weights_ptr, projection_bias_ptr,
-        params, n_batch, n_cell, n_input, n_output, activation_state_ptr,
-        cell_state_ptr, input_gate_scratch, forget_gate_scratch, cell_scratch,
-        output_gate_scratch, output_ptr_batch);
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
-    TfLiteTensor* scaling_factors, TfLiteTensor* prod_scaling_factors,
-    TfLiteTensor* recovered_cell_weights, TfLiteTensor* input_quantized,
-    TfLiteTensor* activation_state_quantized,
-    TfLiteTensor* cell_state_quantized, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
-  const int n_input = input->dims->data[2];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  int8_t* input_to_input_weights_ptr = nullptr;
-  float input_to_input_weights_scale = 1.0f;
-  int8_t* recurrent_to_input_weights_ptr = nullptr;
-  float recurrent_to_input_weights_scale = 1.0f;
-  float* input_gate_bias_ptr = nullptr;
-  if (!use_cifg) {
-    input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
-    recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
-    input_gate_bias_ptr = input_gate_bias->data.f;
-    input_to_input_weights_scale = input_to_input_weights->params.scale;
-    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
-  }
-
-  int8_t* cell_to_input_weights_ptr = nullptr;
-  int8_t* cell_to_forget_weights_ptr = nullptr;
-  int8_t* cell_to_output_weights_ptr = nullptr;
-  float cell_to_input_weights_scale = 1.0f;
-  float cell_to_forget_weights_scale = 1.0f;
-  float cell_to_output_weights_scale = 1.0f;
-  if (use_peephole) {
-    if (!use_cifg) {
-      cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
-      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
-    }
-    cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
-    cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
-    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
-    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
-  }
-
-  const int8_t* projection_weights_ptr =
-      (projection_weights == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
-  float projection_weights_scale =
-      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-
-  // Required tensors, pointers are non-null.
-  const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
-  const float input_to_forget_weights_scale =
-      input_to_forget_weights->params.scale;
-  const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
-  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
-  const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
-  const float input_to_output_weights_scale =
-      input_to_output_weights->params.scale;
-  const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
-  const float recurrent_to_forget_weights_scale =
-      recurrent_to_forget_weights->params.scale;
-  const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
-  const float recurrent_to_cell_weights_scale =
-      recurrent_to_cell_weights->params.scale;
-  const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
-  const float recurrent_to_output_weights_scale =
-      recurrent_to_output_weights->params.scale;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-
-  // Temporary storage for quantized values and scaling factors.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  int8_t* quantized_activation_state_ptr =
-      reinterpret_cast<int8_t*>(activation_state_quantized->data.uint8);
-  int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
-  float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
-
-  // Feed the sequence into the LSTM step-by-step.
-  for (int t = 0; t < max_time; t++) {
-    const float* input_ptr_batch = input->data.f + t * n_batch * n_input;
-    float* output_ptr_batch = output->data.f + t * n_batch * n_output;
-
-    kernel_utils::LstmStep(
-        input_ptr_batch, input_to_input_weights_ptr,
-        input_to_input_weights_scale, input_to_forget_weights_ptr,
-        input_to_forget_weights_scale, input_to_cell_weights_ptr,
-        input_to_cell_weights_scale, input_to_output_weights_ptr,
-        input_to_output_weights_scale, recurrent_to_input_weights_ptr,
-        recurrent_to_input_weights_scale, recurrent_to_forget_weights_ptr,
-        recurrent_to_forget_weights_scale, recurrent_to_cell_weights_ptr,
-        recurrent_to_cell_weights_scale, recurrent_to_output_weights_ptr,
-        recurrent_to_output_weights_scale, cell_to_input_weights_ptr,
-        cell_to_input_weights_scale, cell_to_forget_weights_ptr,
-        cell_to_forget_weights_scale, cell_to_output_weights_ptr,
-        cell_to_output_weights_scale, input_gate_bias_ptr, forget_gate_bias_ptr,
-        cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-        projection_weights_scale, projection_bias_ptr, params, n_batch, n_cell,
-        n_input, n_output, input_gate_scratch, forget_gate_scratch,
-        cell_scratch, output_gate_scratch, scaling_factors_ptr,
-        prod_scaling_factors_ptr, recovered_cell_weights_ptr,
-        quantized_input_ptr, quantized_activation_state_ptr,
-        quantized_cell_state_ptr, activation_state_ptr, cell_state_ptr,
-        output_ptr_batch);
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, kInputToCellWeightsTensor);
-  const TfLiteTensor* input_to_output_weights =
-      GetInput(context, node, kInputToOutputWeightsTensor);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, kRecurrentToCellWeightsTensor);
-  const TfLiteTensor* recurrent_to_output_weights =
-      GetInput(context, node, kRecurrentToOutputWeightsTensor);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, kForgetGateBiasTensor);
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, kOutputGateBiasTensor);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-
-  // Index the scratch buffers pointers to the global scratch buffer.
-  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
-
-  TfLiteTensor* activation_state =
-      GetVariableInput(context, node, kInputActivationStateTensor);
-  TfLiteTensor* cell_state =
-      GetVariableInput(context, node, kInputCellStateTensor);
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  switch (input_to_output_weights->type) {
-    case kTfLiteFloat32: {
-      return EvalFloat(input, input_to_input_weights, input_to_forget_weights,
-                       input_to_cell_weights, input_to_output_weights,
-                       recurrent_to_input_weights, recurrent_to_forget_weights,
-                       recurrent_to_cell_weights, recurrent_to_output_weights,
-                       cell_to_input_weights, cell_to_forget_weights,
-                       cell_to_output_weights, input_gate_bias,
-                       forget_gate_bias, cell_bias, output_gate_bias,
-                       projection_weights, projection_bias, params,
-                       scratch_buffer, activation_state, cell_state, output);
-    }
-    case kTfLiteUInt8: {
-      TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-      TfLiteTensor* activation_state_quantized =
-          GetTemporary(context, node, /*index=*/2);
-      TfLiteTensor* cell_state_quantized =
-          GetTemporary(context, node, /*index=*/3);
-      TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
-      TfLiteTensor* prod_scaling_factors =
-          GetTemporary(context, node, /*index=*/5);
-      TfLiteTensor* recovered_cell_weights =
-          GetTemporary(context, node, /*index=*/6);
-      return EvalHybrid(
-          input, input_to_input_weights, input_to_forget_weights,
-          input_to_cell_weights, input_to_output_weights,
-          recurrent_to_input_weights, recurrent_to_forget_weights,
-          recurrent_to_cell_weights, recurrent_to_output_weights,
-          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
-          input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias,
-          projection_weights, projection_bias, params, scratch_buffer,
-          scaling_factors, prod_scaling_factors, recovered_cell_weights,
-          input_quantized, activation_state_quantized, cell_state_quantized,
-          activation_state, cell_state, output);
-    }
-    default:
-      context->ReportError(context, "Type %d is not currently supported.",
-                           input_to_output_weights->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-}  // namespace unidirectional_sequence_lstm
-
-TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM() {
-  static TfLiteRegistration r = {unidirectional_sequence_lstm::Init,
-                                 unidirectional_sequence_lstm::Free,
-                                 unidirectional_sequence_lstm::Prepare,
-                                 unidirectional_sequence_lstm::Eval};
-  return &r;
-}
-
-}  // namespace builtin
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh b/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
deleted file mode 100755
index 6195426d6d441e858fbe225c132b409ac0a0be32..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/lib_package/create_ios_frameworks.sh
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash -x
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-# TODO(ycling): Refactoring - Move this script into `tools/make`.
-set -e
-
-echo "Starting"
-TFLITE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.."
-
-TMP_DIR=$(mktemp -d)
-echo "Package dir: " $TMP_DIR
-FW_DIR=$TMP_DIR/tensorflow_lite_ios_frameworks
-FW_DIR_TFLITE=$FW_DIR/tensorflow_lite.framework
-FW_DIR_TFLITE_HDRS=$FW_DIR_TFLITE/Headers
-
-echo "Creating target Headers directories"
-mkdir -p $FW_DIR_TFLITE_HDRS
-
-echo "Headers, populating: TensorFlow Lite"
-cd $TFLITE_DIR/../../..
-
-find tensorflow/contrib/lite -name '*.h' \
-    -not -path 'tensorflow/contrib/lite/tools/*' \
-    -not -path 'tensorflow/contrib/lite/examples/*' \
-    -not -path 'tensorflow/contrib/lite/gen/*' \
-    -not -path 'tensorflow/contrib/lite/toco/*' \
-    -not -path 'tensorflow/contrib/lite/nnapi/*' \
-    -not -path 'tensorflow/contrib/lite/java/*' \
-    | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
-cd $FW_DIR_TFLITE_HDRS
-tar xf tmp.tar
-rm -f tmp.tar
-
-echo "Headers, populating: Flatbuffer"
-cd $TFLITE_DIR/tools/make/downloads/flatbuffers/include/
-find . -name '*.h' | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
-cd $FW_DIR_TFLITE_HDRS
-tar xf tmp.tar
-rm -f tmp.tar
-
-cd $TFLITE_DIR/../../..
-echo "Generate master LICENSE file and copy to target"
-bazel build //tensorflow/tools/lib_package:clicenses_generate
-cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \
-   $FW_DIR_TFLITE
-
-echo "Copying static libraries"
-cp $TFLITE_DIR/tools/make/gen/lib/libtensorflow-lite.a \
-   $FW_DIR_TFLITE/tensorflow_lite
-
-# This is required, otherwise they interfere with the documentation of the
-# pod at cocoapods.org.
-echo "Remove all README files"
-cd $FW_DIR_TFLITE_HDRS
-find . -type f -name README\* -exec rm -f {} \;
-find . -type f -name readme\* -exec rm -f {} \;
-
-TARGET_GEN_LOCATION="$TFLITE_DIR/gen/ios_frameworks"
-echo "Moving results to target: " $TARGET_GEN_LOCATION
-cd $FW_DIR
-zip -q -r tensorflow_lite.framework.zip tensorflow_lite.framework -x .DS_Store
-rm -rf $TARGET_GEN_LOCATION
-mkdir -p $TARGET_GEN_LOCATION
-cp -r tensorflow_lite.framework.zip $TARGET_GEN_LOCATION
-
-echo "Cleaning up"
-rm -rf $TMP_DIR
-
-echo "Finished"
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
deleted file mode 100644
index 6abdfcd079af2416d0e6be1be3a48d21616c342e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/model.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Deserialization infrastructure for tflite. Provides functionality
-// to go from a serialized tflite model in flatbuffer format to an
-// interpreter.
-//
-// using namespace tflite;
-// StderrReporter error_reporter;
-// auto model = FlatBufferModel::BuildFromFile("interesting_model.tflite",
-//                                             &error_reporter);
-// MyOpResolver resolver;  // You need to subclass OpResolver to provide
-//                         // implementations.
-// InterpreterBuilder builder(*model, resolver);
-// std::unique_ptr<Interpreter> interpreter;
-// if(builder(&interpreter) == kTfLiteOk) {
-//   .. run model inference with interpreter
-// }
-//
-// OpResolver must be defined to provide your kernel implementations to the
-// interpreter. This is environment specific and may consist of just the builtin
-// ops, or some custom operators you defined to extend tflite.
-#ifndef TENSORFLOW_CONTRIB_LITE_MODEL_H_
-#define TENSORFLOW_CONTRIB_LITE_MODEL_H_
-
-#include <memory>
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/core/api/op_resolver.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/mutable_op_resolver.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-
-namespace tflite {
-
-// Abstract interface that verifies whether a given model is legit.
-// It facilitates the use-case to verify and build a model without loading it
-// twice.
-class TfLiteVerifier {
- public:
-  // Returns true if the model is legit.
-  virtual bool Verify(const char* data, int length,
-                      ErrorReporter* reporter) = 0;
-  virtual ~TfLiteVerifier() {}
-};
-
-// An RAII object that represents a read-only tflite model, copied from disk,
-// or mmapped. This uses flatbuffers as the serialization format.
-class FlatBufferModel {
- public:
-  // Builds a model based on a file.
-  // Caller retains ownership of `error_reporter` and must ensure its lifetime
-  // is longer than the FlatBufferModel instance.
-  // Returns a nullptr in case of failure.
-  static std::unique_ptr<FlatBufferModel> BuildFromFile(
-      const char* filename,
-      ErrorReporter* error_reporter = DefaultErrorReporter());
-
-  // Verifies whether the content of the file is legit, then builds a model
-  // based on the file.
-  // Caller retains ownership of `error_reporter` and must ensure its lifetime
-  // is longer than the FlatBufferModel instance.
-  // Returns a nullptr in case of failure.
-  static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromFile(
-      const char* filename, TfLiteVerifier* verifier = nullptr,
-      ErrorReporter* error_reporter = DefaultErrorReporter());
-
-  // Builds a model based on a pre-loaded flatbuffer. The caller retains
-  // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
-  // its lifetime is longer than the FlatBufferModel instance.
-  // Returns a nullptr in case of failure.
-  static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
-      const char* buffer, size_t buffer_size,
-      ErrorReporter* error_reporter = DefaultErrorReporter());
-
-  // Builds a model directly from a flatbuffer pointer. The caller retains
-  // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
-  // its lifetime is longer than the FlatBufferModel instance.
-  // Returns a nullptr in case of failure.
-  static std::unique_ptr<FlatBufferModel> BuildFromModel(
-      const tflite::Model* model_spec,
-      ErrorReporter* error_reporter = DefaultErrorReporter());
-
-  // Releases memory or unmaps mmaped memory.
-  ~FlatBufferModel();
-
-  // Copying or assignment is disallowed to simplify ownership semantics.
-  FlatBufferModel(const FlatBufferModel&) = delete;
-  FlatBufferModel& operator=(const FlatBufferModel&) = delete;
-
-  bool initialized() const { return model_ != nullptr; }
-  const tflite::Model* operator->() const { return model_; }
-  const tflite::Model* GetModel() const { return model_; }
-  ErrorReporter* error_reporter() const { return error_reporter_; }
-  const Allocation* allocation() const { return allocation_; }
-
-  // Returns true if the model identifier is correct (otherwise false and
-  // reports an error).
-  bool CheckModelIdentifier() const;
-
- private:
-  // Loads a model from a given allocation. FlatBufferModel will take over the
-  // ownership of `allocation`, and delete it in destructor. The ownership of
-  // `error_reporter`remains with the caller and must have lifetime at least
-  // as much as FlatBufferModel. This is to allow multiple models to use the
-  // same ErrorReporter instance.
-  FlatBufferModel(Allocation* allocation,
-                  ErrorReporter* error_reporter = DefaultErrorReporter());
-
-  // Loads a model from Model flatbuffer. The `model` has to remain alive and
-  // unchanged until the end of this flatbuffermodel's lifetime.
-  FlatBufferModel(const Model* model, ErrorReporter* error_reporter);
-
-  // Flatbuffer traverser pointer. (Model* is a pointer that is within the
-  // allocated memory of the data allocated by allocation's internals.
-  const tflite::Model* model_ = nullptr;
-  // The error reporter to use for model errors and subsequent errors when
-  // the interpreter is created
-  ErrorReporter* error_reporter_;
-  // The allocator used for holding memory of the model.
-  Allocation* allocation_ = nullptr;
-};
-
-// Build an interpreter capable of interpreting `model`.
-//
-// model: a scoped model whose lifetime must be at least as long as
-//   the interpreter. In principle multiple interpreters can be made from
-//   a single model.
-// op_resolver: An instance that implements the Resolver interface which maps
-//   custom op names and builtin op codes to op registrations.
-// reportError: a functor that is called to report errors that handles
-//   printf var arg semantics. The lifetime of the reportError object must
-//   be greater than or equal to the Interpreter created by operator().
-//
-// Returns a kTfLiteOk when successful and sets interpreter to a valid
-// Interpreter. Note: the user must ensure the model lifetime is at least as
-// long as interpreter's lifetime.
-class InterpreterBuilder {
- public:
-  InterpreterBuilder(const FlatBufferModel& model,
-                     const OpResolver& op_resolver);
-  // Builds an interpreter given only the raw flatbuffer Model object (instead
-  // of a FlatBufferModel). Mostly used for testing.
-  // If `error_reporter` is null, then DefaultErrorReporter() is used.
-  InterpreterBuilder(const ::tflite::Model* model,
-                     const OpResolver& op_resolver,
-                     ErrorReporter* error_reporter = DefaultErrorReporter());
-  ~InterpreterBuilder();
-  InterpreterBuilder(const InterpreterBuilder&) = delete;
-  InterpreterBuilder& operator=(const InterpreterBuilder&) = delete;
-  TfLiteStatus operator()(std::unique_ptr<Interpreter>* interpreter);
-  TfLiteStatus operator()(std::unique_ptr<Interpreter>* interpreter,
-                          int num_threads);
-
- private:
-  TfLiteStatus BuildLocalIndexToRegistrationMapping();
-  TfLiteStatus ParseNodes(
-      const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators,
-      Interpreter* interpreter);
-  TfLiteStatus ParseTensors(
-      const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
-      const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
-      Interpreter* interpreter);
-
-  const ::tflite::Model* model_;
-  const OpResolver& op_resolver_;
-  ErrorReporter* error_reporter_;
-
-  std::vector<const TfLiteRegistration*> flatbuffer_op_index_to_registration_;
-  std::vector<BuiltinOperator> flatbuffer_op_index_to_registration_types_;
-  const Allocation* allocation_ = nullptr;
-};
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_MODEL_H_
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
deleted file mode 100644
index b969bea5dcff2f5347ba0aa90f649ba3de89702b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/model_test.cc
+++ /dev/null
@@ -1,327 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <fcntl.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#include "tensorflow/contrib/lite/model.h"
-
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/testing/util.h"
-
-// Comparison for TfLiteRegistration. Since TfLiteRegistration is a C object,
-// we must declare this in global namespace, so argument-dependent operator
-// lookup works.
-inline bool operator==(const TfLiteRegistration& a,
-                       const TfLiteRegistration& b) {
-  return a.invoke == b.invoke && a.init == b.init && a.prepare == b.prepare &&
-         a.free == b.free;
-}
-
-namespace tflite {
-
-// Provide a dummy operation that does nothing.
-namespace {
-void* dummy_init(TfLiteContext*, const char*, size_t) { return nullptr; }
-void dummy_free(TfLiteContext*, void*) {}
-TfLiteStatus dummy_resize(TfLiteContext*, TfLiteNode*) { return kTfLiteOk; }
-TfLiteStatus dummy_invoke(TfLiteContext*, TfLiteNode*) { return kTfLiteOk; }
-TfLiteRegistration dummy_reg = {dummy_init, dummy_free, dummy_resize,
-                                dummy_invoke};
-}  // namespace
-
-// Provide a trivial resolver that returns a constant value no matter what
-// op is asked for.
-class TrivialResolver : public OpResolver {
- public:
-  explicit TrivialResolver(TfLiteRegistration* constant_return = nullptr)
-      : constant_return_(constant_return) {}
-  // Find the op registration of a custom operator by op name.
-  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
-                                   int version) const override {
-    return constant_return_;
-  }
-  // Find the op registration of a custom operator by op name.
-  const TfLiteRegistration* FindOp(const char* op, int version) const override {
-    return constant_return_;
-  }
-
- private:
-  TfLiteRegistration* constant_return_;
-};
-
-TEST(BasicFlatBufferModel, TestNonExistantFiles) {
-  ASSERT_TRUE(!FlatBufferModel::BuildFromFile("/tmp/tflite_model_1234"));
-}
-
-// Make sure a model with nothing in it loads properly.
-TEST(BasicFlatBufferModel, TestEmptyModelsAndNullDestination) {
-  auto model = FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/empty_model.bin");
-  ASSERT_TRUE(model);
-  // Now try to build it into a model.
-  std::unique_ptr<Interpreter> interpreter;
-  ASSERT_EQ(InterpreterBuilder(*model, TrivialResolver())(&interpreter),
-            kTfLiteOk);
-  ASSERT_NE(interpreter, nullptr);
-  ASSERT_NE(InterpreterBuilder(*model, TrivialResolver())(nullptr), kTfLiteOk);
-}
-
-// Make sure currently unsupported # of subgraphs are checked
-// TODO(aselle): Replace this test when multiple subgraphs are supported.
-TEST(BasicFlatBufferModel, TestZeroAndMultipleSubgraphs) {
-  auto m1 = FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/0_subgraphs.bin");
-  ASSERT_TRUE(m1);
-  std::unique_ptr<Interpreter> interpreter1;
-  ASSERT_NE(InterpreterBuilder(*m1, TrivialResolver())(&interpreter1),
-            kTfLiteOk);
-
-  auto m2 = FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/2_subgraphs.bin");
-  ASSERT_TRUE(m2);
-  std::unique_ptr<Interpreter> interpreter2;
-  ASSERT_NE(InterpreterBuilder(*m2, TrivialResolver())(&interpreter2),
-            kTfLiteOk);
-}
-
-// Test what happens if we cannot bind any of the ops.
-TEST(BasicFlatBufferModel, TestModelWithoutNullRegistrations) {
-  auto model = FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/test_model.bin");
-  ASSERT_TRUE(model);
-  // Check that we get an error code and interpreter pointer is reset.
-  std::unique_ptr<Interpreter> interpreter(new Interpreter);
-  ASSERT_NE(InterpreterBuilder(*model, TrivialResolver(nullptr))(&interpreter),
-            kTfLiteOk);
-  ASSERT_EQ(interpreter, nullptr);
-}
-
-// Make sure model is read to interpreter propelrly
-TEST(BasicFlatBufferModel, TestModelInInterpreter) {
-  auto model = FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/test_model.bin");
-  ASSERT_TRUE(model);
-  // Check that we get an error code and interpreter pointer is reset.
-  std::unique_ptr<Interpreter> interpreter(new Interpreter);
-  ASSERT_EQ(
-      InterpreterBuilder(*model, TrivialResolver(&dummy_reg))(&interpreter),
-      kTfLiteOk);
-  ASSERT_NE(interpreter, nullptr);
-  ASSERT_EQ(interpreter->tensors_size(), 4);
-  ASSERT_EQ(interpreter->nodes_size(), 2);
-  std::vector<int> inputs = {0, 1};
-  std::vector<int> outputs = {2, 3};
-  ASSERT_EQ(interpreter->inputs(), inputs);
-  ASSERT_EQ(interpreter->outputs(), outputs);
-
-  EXPECT_EQ(std::string(interpreter->GetInputName(0)), "input0");
-  EXPECT_EQ(std::string(interpreter->GetInputName(1)), "input1");
-  EXPECT_EQ(std::string(interpreter->GetOutputName(0)), "out1");
-  EXPECT_EQ(std::string(interpreter->GetOutputName(1)), "out2");
-
-  // Make sure all input tensors are correct
-  TfLiteTensor* i0 = interpreter->tensor(0);
-  ASSERT_EQ(i0->type, kTfLiteFloat32);
-  ASSERT_NE(i0->data.raw, nullptr);  // mmapped
-  ASSERT_EQ(i0->allocation_type, kTfLiteMmapRo);
-  TfLiteTensor* i1 = interpreter->tensor(1);
-  ASSERT_EQ(i1->type, kTfLiteFloat32);
-  ASSERT_EQ(i1->data.raw, nullptr);
-  ASSERT_EQ(i1->allocation_type, kTfLiteArenaRw);
-  TfLiteTensor* o0 = interpreter->tensor(2);
-  ASSERT_EQ(o0->type, kTfLiteFloat32);
-  ASSERT_EQ(o0->data.raw, nullptr);
-  ASSERT_EQ(o0->allocation_type, kTfLiteArenaRw);
-  TfLiteTensor* o1 = interpreter->tensor(3);
-  ASSERT_EQ(o1->type, kTfLiteFloat32);
-  ASSERT_EQ(o1->data.raw, nullptr);
-  ASSERT_EQ(o1->allocation_type, kTfLiteArenaRw);
-
-  // Check op 0 which has inputs {0, 1} outputs {2}.
-  {
-    const std::pair<TfLiteNode, TfLiteRegistration>* node_and_reg0 =
-        interpreter->node_and_registration(0);
-    ASSERT_NE(node_and_reg0, nullptr);
-    const TfLiteNode& node0 = node_and_reg0->first;
-    const TfLiteRegistration& reg0 = node_and_reg0->second;
-    TfLiteIntArray* desired_inputs = TfLiteIntArrayCreate(2);
-    desired_inputs->data[0] = 0;
-    desired_inputs->data[1] = 1;
-    TfLiteIntArray* desired_outputs = TfLiteIntArrayCreate(1);
-    desired_outputs->data[0] = 2;
-    ASSERT_TRUE(TfLiteIntArrayEqual(node0.inputs, desired_inputs));
-    ASSERT_TRUE(TfLiteIntArrayEqual(node0.outputs, desired_outputs));
-    TfLiteIntArrayFree(desired_inputs);
-    TfLiteIntArrayFree(desired_outputs);
-    ASSERT_EQ(reg0, dummy_reg);
-  }
-
-  // Check op 1 which has inputs {2} outputs {3}.
-  {
-    const std::pair<TfLiteNode, TfLiteRegistration>* node_and_reg1 =
-        interpreter->node_and_registration(1);
-    ASSERT_NE(node_and_reg1, nullptr);
-    const TfLiteNode& node1 = node_and_reg1->first;
-    const TfLiteRegistration& reg1 = node_and_reg1->second;
-    TfLiteIntArray* desired_inputs = TfLiteIntArrayCreate(1);
-    TfLiteIntArray* desired_outputs = TfLiteIntArrayCreate(1);
-    desired_inputs->data[0] = 2;
-    desired_outputs->data[0] = 3;
-    ASSERT_TRUE(TfLiteIntArrayEqual(node1.inputs, desired_inputs));
-    ASSERT_TRUE(TfLiteIntArrayEqual(node1.outputs, desired_outputs));
-    TfLiteIntArrayFree(desired_inputs);
-    TfLiteIntArrayFree(desired_outputs);
-    ASSERT_EQ(reg1, dummy_reg);
-  }
-}
-
-// Test that loading a model with TensorFlow ops fails when the flex delegate is
-// not linked into the target.
-TEST(FlexModel, FailureWithoutFlexDelegate) {
-  auto model = FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/multi_add_flex.bin");
-  ASSERT_TRUE(model);
-
-  // Note that creation will succeed when using the BuiltinOpResolver, but
-  // unless the appropriate delegate is linked into the target or the client
-  // explicitly installs the delegate, execution will fail.
-  std::unique_ptr<Interpreter> interpreter;
-  ASSERT_EQ(InterpreterBuilder(*model,
-                               ops::builtin::BuiltinOpResolver{})(&interpreter),
-            kTfLiteOk);
-  ASSERT_TRUE(interpreter);
-
-  // As the flex ops weren't resolved implicitly by the flex delegate, runtime
-  // allocation and execution will fail.
-  ASSERT_EQ(interpreter->AllocateTensors(), kTfLiteError);
-}
-
-// This tests on a flatbuffer that defines a shape of 2 to be a memory mapped
-// buffer. But the buffer is provided to be only 1 element.
-TEST(BasicFlatBufferModel, TestBrokenMmap) {
-  ASSERT_FALSE(FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/test_model_broken.bin"));
-}
-
-TEST(BasicFlatBufferModel, TestNullModel) {
-  // Check that we get an error code and interpreter pointer is reset.
-  std::unique_ptr<Interpreter> interpreter(new Interpreter);
-  ASSERT_NE(
-      InterpreterBuilder(nullptr, TrivialResolver(&dummy_reg))(&interpreter),
-      kTfLiteOk);
-  ASSERT_EQ(interpreter.get(), nullptr);
-}
-
-// Mocks the verifier by setting the result in ctor.
-class FakeVerifier : public tflite::TfLiteVerifier {
- public:
-  explicit FakeVerifier(bool result) : result_(result) {}
-  bool Verify(const char* data, int length,
-              tflite::ErrorReporter* reporter) override {
-    return result_;
-  }
-
- private:
-  bool result_;
-};
-
-TEST(BasicFlatBufferModel, TestWithTrueVerifier) {
-  FakeVerifier verifier(true);
-  ASSERT_TRUE(FlatBufferModel::VerifyAndBuildFromFile(
-      "tensorflow/contrib/lite/testdata/test_model.bin",
-      &verifier));
-}
-
-TEST(BasicFlatBufferModel, TestWithFalseVerifier) {
-  FakeVerifier verifier(false);
-  ASSERT_FALSE(FlatBufferModel::VerifyAndBuildFromFile(
-      "tensorflow/contrib/lite/testdata/test_model.bin",
-      &verifier));
-}
-
-TEST(BasicFlatBufferModel, TestWithNullVerifier) {
-  ASSERT_TRUE(FlatBufferModel::VerifyAndBuildFromFile(
-      "tensorflow/contrib/lite/testdata/test_model.bin", nullptr));
-}
-
-// This makes sure the ErrorReporter is marshalled from FlatBufferModel to
-// the Interpreter.
-TEST(BasicFlatBufferModel, TestCustomErrorReporter) {
-  TestErrorReporter reporter;
-  auto model = FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/empty_model.bin",
-      &reporter);
-  ASSERT_TRUE(model);
-
-  std::unique_ptr<Interpreter> interpreter;
-  TrivialResolver resolver;
-  InterpreterBuilder(*model, resolver)(&interpreter);
-  ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
-  ASSERT_EQ(reporter.num_calls(), 1);
-}
-
-// This makes sure the ErrorReporter is marshalled from FlatBufferModel to
-// the Interpreter.
-TEST(BasicFlatBufferModel, TestNullErrorReporter) {
-  auto model = FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/empty_model.bin", nullptr);
-  ASSERT_TRUE(model);
-
-  std::unique_ptr<Interpreter> interpreter;
-  TrivialResolver resolver;
-  InterpreterBuilder(*model, resolver)(&interpreter);
-  ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
-}
-
-// Test that loading model directly from a Model flatbuffer works.
-TEST(BasicFlatBufferModel, TestBuildFromModel) {
-  TestErrorReporter reporter;
-  FileCopyAllocation model_allocation(
-      "tensorflow/contrib/lite/testdata/test_model.bin", &reporter);
-  ASSERT_TRUE(model_allocation.valid());
-  ::flatbuffers::Verifier verifier(
-      reinterpret_cast<const uint8_t*>(model_allocation.base()),
-      model_allocation.bytes());
-  ASSERT_TRUE(VerifyModelBuffer(verifier));
-  const Model* model_fb = ::tflite::GetModel(model_allocation.base());
-
-  auto model = FlatBufferModel::BuildFromModel(model_fb);
-  ASSERT_TRUE(model);
-
-  std::unique_ptr<Interpreter> interpreter;
-  ASSERT_EQ(
-      InterpreterBuilder(*model, TrivialResolver(&dummy_reg))(&interpreter),
-      kTfLiteOk);
-  ASSERT_NE(interpreter, nullptr);
-}
-
-// TODO(aselle): Add tests for serialization of builtin op data types.
-// These tests will occur with the evaluation tests of individual operators,
-// not here.
-
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/models/BUILD b/tensorflow/contrib/lite/models/BUILD
deleted file mode 100644
index efa47b06fa7f06cc6312535713ec582af4705d85..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-# Model tests
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-
-exports_files(glob([
-    "testdata/*",
-]))
diff --git a/tensorflow/contrib/lite/models/smartreply/BUILD b/tensorflow/contrib/lite/models/smartreply/BUILD
deleted file mode 100644
index 9d88c396ba69948e3ae285c913a4499a1409b93a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/smartreply/BUILD
+++ /dev/null
@@ -1,89 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
-
-licenses(["notice"])  # Apache 2.0
-
-gen_selected_ops(
-    name = "smartreply_ops",
-    model = "@tflite_smartreply//:smartreply.tflite",
-)
-
-cc_library(
-    name = "custom_ops",
-    srcs = [
-        "ops/extract_feature.cc",
-        "ops/normalize.cc",
-        "ops/predict.cc",
-        ":smartreply_ops",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "@com_google_absl//absl/strings",
-        "@com_googlesource_code_re2//:re2",
-        "@farmhash_archive//:farmhash",
-    ],
-)
-
-cc_library(
-    name = "predictor_lib",
-    srcs = ["predictor.cc"],
-    hdrs = ["predictor.h"],
-    copts = tflite_copts(),
-    deps = [
-        ":custom_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "@com_google_absl//absl/strings",
-        "@com_googlesource_code_re2//:re2",
-    ],
-)
-
-cc_test(
-    name = "extract_feature_op_test",
-    size = "small",
-    srcs = ["ops/extract_feature_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":custom_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@farmhash_archive//:farmhash",
-    ],
-)
-
-cc_test(
-    name = "normalize_op_test",
-    size = "small",
-    srcs = ["ops/normalize_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":custom_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_test(
-    name = "predict_op_test",
-    size = "small",
-    srcs = ["ops/predict_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":custom_ops",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
deleted file mode 100644
index 2e5033dab1356e2dbf2eef2b8c14e1ac7fc7566c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
+++ /dev/null
@@ -1,68 +0,0 @@
-load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//tensorflow/contrib/lite:build_def.bzl",
-    "tflite_copts",
-    "tflite_jni_binary",
-)
-
-filegroup(
-    name = "assets",
-    srcs = [
-        "@tflite_smartreply//:model_files",
-    ],
-)
-
-android_binary(
-    name = "SmartReplyDemo",
-    srcs = glob(["java/**/*.java"]),
-    aapt_version = "aapt",
-    assets = [":assets"],
-    assets_dir = "",
-    custom_package = "com.example.android.smartreply",
-    manifest = "AndroidManifest.xml",
-    nocompress_extensions = [
-        ".tflite",
-    ],
-    resource_files = glob(["res/**"]),
-    tags = ["manual"],
-    deps = [
-        ":smartreply_runtime",
-        "@androidsdk//com.android.support:support-v13-25.2.0",
-        "@androidsdk//com.android.support:support-v4-25.2.0",
-    ],
-)
-
-cc_library(
-    name = "smartreply_runtime",
-    srcs = ["libsmartreply_jni.so"],
-    visibility = ["//visibility:public"],
-)
-
-tflite_jni_binary(
-    name = "libsmartreply_jni.so",
-    deps = [
-        ":smartreply_jni_lib",
-    ],
-)
-
-cc_library(
-    name = "smartreply_jni_lib",
-    srcs = [
-        "smartreply_jni.cc",
-    ],
-    copts = tflite_copts(),
-    linkopts = [
-        "-lm",
-        "-ldl",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/models/smartreply:predictor_lib",
-    ],
-    alwayslink = 1,
-)
diff --git a/tensorflow/contrib/lite/models/smartreply/g3doc/README.md b/tensorflow/contrib/lite/models/smartreply/g3doc/README.md
deleted file mode 100644
index a6d75648b3f3da98afd85daad6c2234e73a802e8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/smartreply/g3doc/README.md
+++ /dev/null
@@ -1,146 +0,0 @@
-# Smart Reply Model
-
-## What is On-Device Smart Reply Model?
-
-Smart Replies are contextually relevant, one-touch responses that help the user
-to reply to an incoming text message (or email) efficiently and effortlessly.
-Smart Replies have been highly successful across several Google products
-including
-[Gmail](https://www.blog.google/products/gmail/save-time-with-smart-reply-in-gmail/),
-[Inbox](https://www.blog.google/products/gmail/computer-respond-to-this-email/)
-and
-[Allo](https://blog.google/products/allo/google-allo-smarter-messaging-app/).
-
-The On-device Smart Reply model is targeted towards text chat use cases. It has
-a completely different architecture from its cloud-based counterparts, and is
-built specifically for memory constraints devices such as phones & watches. It
-has been successfully used to provide [Smart Replies on Android
-Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
-to all first- & third-party apps.
-
-The on-device model comes with several benefits. It is:
-
-*   **Faster**: The model resides on the device and does not require internet
-    connectivity. Thus, the inference is very fast and has an average latency of
-    only a few milliseconds.
-*   **Resource efficient**: The model has a small memory footprint on
-    the device.
-*   **Privacy-friendly**: The user data never leaves the device and this
-    eliminates any privacy restrictions.
-
-A caveat, though, is that the on-device model has lower triggering rate than its
-cloud counterparts (triggering rate is the percentage of times the model
-suggests a response for an incoming message).
-
-## When to use this Model?
-
-The On-Device Smart Reply model is aimed towards improving the messaging
-experience for day-to-day conversational chat messages. We recommend using this
-model for similar use cases. Some sample messages on which the model does well
-are provided in this [tsv
-file](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/testdata/smartreply_samples.tsv)
-for reference. The file format is:
-
-```
-   {incoming_message  smart_reply1   [smart_reply2]   [smart_reply3]}
-```
-
-For the current model, we see a triggering rate of about 30-40% for messages
-which are similar to those provided in the tsv file above.
-
-In case the model does not trigger any response, the system falls back to
-suggesting replies from a fixed back-off set that was compiled from popular
-response intents observed in chat conversations. Some of the fallback responses
-are `Ok, Yes, No, 👍, ☺`.
-
-The model can only be used for inference at this time (i.e. it cannot be custom
-trained). If you are interested to know how the model was trained, please refer
-to this [blog
-post](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
-and [research paper](https://arxiv.org/pdf/1708.00630).
-
-## How to use this Model?
-
-We have provided a pre-built demo APK that you can download, install and test on
-your phone ([demo APK
-here](http://download.tensorflow.org/deps/tflite/SmartReplyDemo.apk)).
-
-The On-Device Smart Reply demo App works in the following way:
-
-1.  Android app links to the JNI binary with a predictor library.
-
-2.  In the predictor library, `GetSegmentPredictions` is called with a list of input
-    strings.
-
-    2.1 The input string can be 1-3 most recent messages of the conversations in
-    form of string vector. The model will run on these input sentences and
-    provide Smart Replies corresponding to them.
-
-    2.2 The function performs some preprocessing on input data which includes:
-
-    *   Sentence splitting: The input message will be split into sentences if
-        message has more than one sentence. Eg: a message like “How are you?
-        Want to grab lunch?” will be broken down into 2 different sentences.
-    *   Normalization: The individual sentences will be normalized by converting
-        them into lower cases, removing unnecessary punctuations, etc. Eg: “how
-        are you????” will be converted to “how are you?” (refer for NORMALIZE op
-        for more details).
-
-        The input string content will be converted to tensors.
-
-    2.3 The function then runs the prediction model on the input tensors.
-
-    2.4 The function also performs some post-processing which includes
-    aggregating the model predictions for the input sentences from 2.2 and
-    returning the appropriate responses.
-
-3.  Finally, it gets response(s) from `std::vector<PredictorResponse>`, and
-    returns back to Android app. Responses are sorted in descending order of
-    confidence score.
-
-## Ops and Functionality Supported
-
-Following are the ops supported for using On-Device Smart Reply model:
-
-*   **NORMALIZE**
-
-    This is a custom op which normalizes the sentences by:
-
-    *   Converting all sentences into lower case.
-    *   Removing unnecessary punctuations (eg: “how are you????” → “how are
-        you?”).
-    *   Expanding sentences wherever necessary (eg: “ I’m home” → “I am home”).
-
-*   **SKIP_GRAM**
-
-    This is an op inside TensorFlow Lite that converts sentences into a list of
-    skip grams. The configurable parameters are `ngram_size` and
-    `max_skip_size`. For the model provided, the values for these parameters are
-    set to 3 & 2 respectively.
-
-*   **EXTRACT_FEATURES**
-
-    This is a custom op that hashes skip grams to features represented as
-    integers. Longer skip-grams are allocated higher weights.
-
-*   **LSH_PROJECTION**
-
-    This is an op inside TensorFlow Lite that projects input features to a
-    corresponding bit vector space using Locality Sensitive Hashing (LSH).
-
-*   **PREDICT**
-
-    This is a custom op that runs the input features through the projection
-    model (details [here](https://arxiv.org/pdf/1708.00630.pdf)), computes the
-    appropriate response labels along with weights for the projected features,
-    and aggregates the response labels and weights together.
-
-*   **HASHTABLE_LOOKUP**
-
-    This is an op inside TensorFlow Lite that uses label id from predict op and
-    looks up the response text from the given label id.
-
-## Further Information
-
-*   Open source code
-    [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/smartreply/).
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/README.md b/tensorflow/contrib/lite/models/testdata/g3doc/README.md
deleted file mode 100644
index 1c47e00aae2a0e76ba04004a2fc3cc02ec4536f7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/models/testdata/g3doc/README.md
+++ /dev/null
@@ -1,143 +0,0 @@
-## Speech Model Tests
-
-Sample test data has been provided for speech related models in Tensorflow Lite
-to help users working with speech models to verify and test their models.
-
-For the hotword, speaker-id and automatic speech recognition sample models, the
-architecture assumes that the models receive their input from a speech
-pre-processing module. The speech pre-processing module receives the audio
-signal and produces features for the encoder neural network and uses some
-typical signal processing algorithms, like FFT and spectral subtraction, and
-ultimately produces a log-mel filterbank (the log of the triangular mel filters
-applied to the power spectra). The text-to-speech model assumes that the inputs
-are linguistic features describing characteristics of phonemes, syllables,
-words, phrases, and sentence. The outputs are acoustic features including
-mel-cepstral coefficients, log fundamental frequency, and band aperiodicity.
-The pre-processing modules for these models are not provided in the open source
-version of TensorFlow Lite.
-
-The following sections describe the architecture of the sample models at a high
-level:
-
-### Hotword Model
-
-The hotword model is the neural network model we use for keyphrase/hotword
-spotting (i.e. "okgoogle" detection). It is the entry point for voice
-interaction (e.g. Google search app on Android devices or Google Home, etc.).
-The speech hotword model block diagram is shown in Figure below. It has an input
-size of 40 (float), an output size of 7 (float), one Svdf layer, and four fully
-connected layers with the corresponding parameters as shown in figure below.
-
-![hotword_model](hotword.svg "Hotword model")
-
-### Speaker-id Model
-
-The speaker-id model is the neural network model we use for speaker
-verification. It runs after the hotword triggers. The speech speaker-id model
-block diagram is shown in Figure below. It has an input size of 80 (float), an
-output size of 64 (float), three Lstm layers, and one fully connected layers
-with the corresponding parameters as shown in figure below.
-
-![speakerid_model](speakerid.svg "Speaker-id model")
-
-### Text-to-speech (TTS) Model
-
-The text-to-speech model is the neural network model used to generate speech
-from text. The speech text-to-speech model’s block diagram is shown
-in Figure below. It has and input size of 334 (float), an output size of 196
-(float), two fully connected layers, three Lstm layers, and one recurrent layer
-with the corresponding parameters as shown in the figure.
-
-![tts_model](tts.svg "TTS model")
-
-### Automatic Speech Recognizer (ASR) Acoustic Model (AM)
-
-The acoustic model for automatic speech recognition is the neural network model
-for matching phonemes to the input audio features. It generates posterior
-probabilities of phonemes from speech frontend features (log-mel filterbanks).
-It has an input size of 320 (float), an output size of 42 (float), five LSTM
-layers and one fully connected layers with a Softmax activation function, with
-the corresponding parameters as shown in the figure.
-
-![asr_am_model](asr_am.svg "ASR AM model")
-
-### Automatic Speech Recognizer (ASR) Language Model (LM)
-
-The language model for automatic speech recognition is the neural network model
-for predicting the probability of a word given previous words in a sentence.
-It generates posterior probabilities of the next word based from a sequence of
-words. The words are encoded as indices in a fixed size dictionary.
-The model has two inputs both of size one (integer): the current word index and
-next word index, an output size of one (float): the log probability. It consists
-of three embedding layer, three LSTM layers, followed by a multiplication, a
-fully connected layers and an addition.
-The corresponding parameters as shown in the figure.
-
-![asr_lm_model](asr_lm.svg "ASR LM model")
-
-### Endpointer Model
-
-The endpointer model is the neural network model for predicting end of speech
-in an utterance. More precisely, it generates posterior probabilities of various
-events that allow detection of speech start and end events.
-It has an input size of 40 (float) which are speech frontend features
-(log-mel filterbanks), and an output size of four corresponding to:
-speech, intermediate non-speech, initial non-speech, and final non-speech.
-The model consists of a convolutional layer, followed by a fully-connected
-layer, two LSTM layers, and two additional fully-connected layers.
-The corresponding parameters as shown in the figure.
-![endpointer_model](endpointer.svg "Endpointer model")
-
-
-## Speech models test input/output generation
-
-As mentioned above the input to models are generated from a pre-processing
-module (output of a log-mel filterbank, or linguistic features), and the outputs
-are generated by running the equivalent TensorFlow model by feeding them the
-same input.
-
-## Link to the open source code
-
-### Models:
-
-[Speech hotword model (Svdf
-rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
-
-[Speech hotword model (Svdf
-rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
-
-[Speaker-id
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
-
-[TTS
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
-
-[ASR AM
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
-
-### Test benches
-
-[Speech hotword model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_hotword_model_test.cc)
-
-[Speaker-id model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc)
-
-[TTS model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_tts_model_test.cc)
-
-[ASR AM model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc)
-
-[ASR LM model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc)
-
-[Endpointer model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_endpointer_model_test.cc)
-
-## Android Support
-The models have been tested on Android phones, using the following tests:
-
-[Hotword] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/android/BUILD?rcl=172930882&l=25)
-
-[Speaker-id] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/android/BUILD?rcl=172930882&l=36)
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
deleted file mode 100644
index f23a0ccb807ca159c380267b1c5a877d24483c97..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ /dev/null
@@ -1,855 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/nnapi_delegate.h"
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
-
-#ifdef __ANDROID__
-#include <android/log.h>
-#include <sys/system_properties.h>
-#endif
-
-namespace tflite {
-
-void logError(const char* format, ...) {
-  // stderr is convenient for native tests, but is not captured for apps
-  va_list args_for_stderr;
-  va_start(args_for_stderr, format);
-  vfprintf(stderr, format, args_for_stderr);
-  va_end(args_for_stderr);
-  fprintf(stderr, "\n");
-  fflush(stderr);
-#ifdef __ANDROID__
-  // produce logcat output for general consumption
-  va_list args_for_log;
-  va_start(args_for_log, format);
-  __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
-  va_end(args_for_log);
-#endif
-}
-
-#define FATAL(...)       \
-  logError(__VA_ARGS__); \
-  exit(1);
-
-// TODO(aselle): Change the error model to use status codes.
-#define CHECK_TFLITE_SUCCESS(x)                                           \
-  if (x != kTfLiteOk) {                                                   \
-    FATAL("Aborting since tflite returned failure nnapi_delegate.cc:%d.", \
-          __LINE__);                                                      \
-  }
-
-#define CHECK_NN(x)                                                     \
-  if (x != ANEURALNETWORKS_NO_ERROR) {                                  \
-    FATAL("Aborting since NNAPI returned failure nnapi_delegate.cc:%d", \
-          __LINE__);                                                    \
-  }
-
-#define RETURN_ERROR_IF_TFLITE_FAILED(x)                                       \
-  if (x != kTfLiteOk) {                                                        \
-    logError(                                                                  \
-        "Returning error since TFLite returned failure nnapi_delegate.cc:%d.", \
-        __LINE__);                                                             \
-    return kTfLiteError;                                                       \
-  }
-
-#define RETURN_ERROR_IF_NN_FAILED(x)                                          \
-  if (x != ANEURALNETWORKS_NO_ERROR) {                                        \
-    logError(                                                                 \
-        "Returning error since NNAPI returned failure nnapi_delegate.cc:%d.", \
-        __LINE__);                                                            \
-    return kTfLiteError;                                                      \
-  }
-
-// Tracking of NNAPI operand ids
-static const int64_t kOperandIdNotSet = -1;
-static const int64_t kOperandNotNeeded = -2;
-
-namespace {
-
-int32_t GetAndroidSdkVersion() {
-#ifdef __ANDROID__
-  const char* sdkProp = "ro.build.version.sdk";
-  char sdkVersion[PROP_VALUE_MAX];
-  int length = __system_property_get(sdkProp, sdkVersion);
-  if (length != 0) {
-    for (int i = 0; i < length; ++i) {
-      int digit = sdkVersion[i] - '0';
-      if (digit < 0 || digit > 9) {
-        // Non-numeric SDK version, assume it's higher then expected;
-        return 0xFFFF;
-      }
-    }
-    return atoi(sdkVersion);
-  }
-  FATAL("No %s prop", sdkProp);
-#endif  // __ANDROID__
-  return 0;
-}
-
-int32_t GetAndroidSdkVersionCached() {
-  static int32_t androidSdkVersion = GetAndroidSdkVersion();
-  return androidSdkVersion;
-}
-
-}  // namespace
-
-NNAPIAllocation::NNAPIAllocation(const char* filename,
-                                 ErrorReporter* error_reporter)
-    : MMAPAllocation(filename, error_reporter) {
-  if (mmapped_buffer_ != MAP_FAILED)
-    CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ,
-                                                mmap_fd_, 0, &handle_));
-}
-
-NNAPIAllocation::~NNAPIAllocation() {
-  if (handle_) {
-    ANeuralNetworksMemory_free(handle_);
-  }
-}
-
-NNAPIDelegate::~NNAPIDelegate() {
-  if (nn_compiled_model_) {
-    ANeuralNetworksCompilation_free(nn_compiled_model_);
-    nn_compiled_model_ = nullptr;
-  }
-  if (nn_model_) {
-    ANeuralNetworksModel_free(nn_model_);
-    nn_model_ = nullptr;
-    // TODO(aselle): Is this thread-safe and callable multiple times?
-  }
-  // ANeuralNetworksShutdown();
-}
-
-// Adds the tensors of the interpreter to the NN API model.
-TfLiteStatus addTensorOperands(tflite::Interpreter* interpreter,
-                               ANeuralNetworksModel* nn_model,
-                               uint32_t* no_of_operands_added,
-                               std::vector<int64_t>* nnapi_ids) {
-  uint32_t next_id = 0;
-  for (size_t i = 0; i < interpreter->tensors_size(); i++) {
-    // Skip temporaries and RNN back-edges.
-    if ((*nnapi_ids)[i] == kOperandNotNeeded) continue;
-
-    (*nnapi_ids)[i] = int64_t(next_id);
-
-    int32_t nn_type = 0;
-    // NNAPI requires 32-bit float scale to be zero, tflite doesn't care
-    float scale = 0.0f;
-    int32_t zeroPoint = 0;
-    TfLiteTensor* tensor = interpreter->tensor(i);
-    switch (tensor->type) {
-      case kTfLiteNoType:
-        // Tensors added during initialization of Ops don't have a type yet and
-        // should not be registered with the NNAPI.
-        continue;
-      case kTfLiteFloat32:
-        nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
-        break;
-      case kTfLiteUInt8:
-        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
-        scale = tensor->params.scale;
-        zeroPoint = tensor->params.zero_point;
-        break;
-      case kTfLiteInt32:
-        nn_type = ANEURALNETWORKS_TENSOR_INT32;
-        scale = tensor->params.scale;
-        zeroPoint = tensor->params.zero_point;
-        break;
-      default:
-        logError("Unsupported tensor type %d", tensor->type);
-        return kTfLiteError;
-    }
-    if (tensor->dims->size == 0) {
-      logError("NNAPI doesn't support tensors with rank 0 (index %d name %s)",
-               i, tensor->name);
-      return kTfLiteError;
-    }
-    if (tensor->dims->size > 4) {
-      logError("NNAPI doesn't support tensors with rank > 4 (index %d name %s)",
-               i, tensor->name);
-      return kTfLiteError;
-    }
-    // TODO(aselle): Note, many of these are intermediate results. Do I need
-    // to ever specify these sizes. I am currently below doing setValue
-    // on all of them, but I shouldn't in the future.
-    // Answer(jeanluc): If all the operators can set the dimension correctly,
-    // you won't need to.
-    ANeuralNetworksOperandType operand_type{
-        nn_type, static_cast<uint32_t>(tensor->dims->size),
-        reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
-    RETURN_ERROR_IF_NN_FAILED(
-        ANeuralNetworksModel_addOperand(nn_model, &operand_type));
-    // TODO(aselle): Based on Michael's suggestion, limiting this to read
-    // only memory
-    if (tensor->allocation_type == kTfLiteMmapRo) {
-      if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
-              static_cast<const Allocation*>(tensor->allocation))) {
-        RETURN_ERROR_IF_NN_FAILED(
-            ANeuralNetworksModel_setOperandValueFromMemory(
-                nn_model, next_id, alloc->memory(),
-                alloc->offset(tensor->data.raw), tensor->bytes));
-      } else {
-        RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_setOperandValue(
-            nn_model, next_id, tensor->data.raw, tensor->bytes));
-      }
-    } else if (tensor->bytes == 0) {
-      // These size 0 tensors are optional tensors reserved.
-      RETURN_ERROR_IF_NN_FAILED(
-          ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0));
-    }
-
-    ++next_id;
-  }
-  *no_of_operands_added = next_id;
-  return kTfLiteOk;
-}
-
-void MapAndAddTensorIds(const int* from_ids_buf, size_t from_ids_count,
-                        std::vector<uint32_t>* into,
-                        const std::vector<int64_t>& map) {
-  for (size_t i = 0; i < from_ids_count; i++) {
-    int from_id = from_ids_buf[i];
-    if (from_id == kOptionalTensor) {
-      into->push_back(from_id);
-    } else {
-      into->push_back(map[from_id]);
-    }
-  }
-}
-
-// Adds the operations and their parameters to the NN API model.
-// 'next-id' is the operand ID of the next operand of the model.
-TfLiteStatus AddOpsAndParams(
-    tflite::Interpreter* interpreter, ANeuralNetworksModel* nn_model,
-    uint32_t next_id, std::vector<int>* model_state_inputs,
-    std::vector<int>* model_state_outputs,
-    const std::vector<int64_t>& tensor_id_to_nnapi_id) {
-  for (size_t i = 0; i < interpreter->nodes_size(); i++) {
-    const auto* node_and_registration = interpreter->node_and_registration(i);
-    const TfLiteNode& node = node_and_registration->first;
-    const TfLiteRegistration& registration = node_and_registration->second;
-    tflite::BuiltinOperator builtin =
-        static_cast<tflite::BuiltinOperator>(registration.builtin_code);
-
-    // Add the parameters.
-    std::vector<uint32_t> augmented_inputs, augmented_outputs;
-    MapAndAddTensorIds(node.inputs->data, node.inputs->size, &augmented_inputs,
-                       tensor_id_to_nnapi_id);
-    MapAndAddTensorIds(node.outputs->data, node.outputs->size,
-                       &augmented_outputs, tensor_id_to_nnapi_id);
-
-    auto add_scalar_int32 = [&nn_model, &augmented_inputs,
-                             &next_id](int value) {
-      ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
-                                                    sizeof(int32_t)))
-      augmented_inputs.push_back(next_id++);
-    };
-
-    auto add_scalar_float32 = [&nn_model, &augmented_inputs,
-                               &next_id](float value) {
-      ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
-                                                    sizeof(float)))
-      augmented_inputs.push_back(next_id++);
-    };
-
-    auto add_vector_int32 = [&](const int* values, uint32_t num_values) {
-      ANeuralNetworksOperandType operand_type{
-          .type = ANEURALNETWORKS_TENSOR_INT32,
-          .dimensionCount = 1,
-          .dimensions = &num_values};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(
-          nn_model, next_id, values, sizeof(int32_t) * num_values));
-      augmented_inputs.push_back(next_id++);
-    };
-
-    // Handle state tensors of RNN, LSTM, SVDF.
-    // For each state_out tensor, a corresponding state_in operand needs to be
-    // created for NNAPI.
-    auto duplicate_state_tensor_float32 =
-        [interpreter, &nn_model, &next_id, &augmented_inputs,
-         &model_state_inputs, &model_state_outputs](int tensor_id) {
-          const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
-          ANeuralNetworksOperandType operand_type{
-              ANEURALNETWORKS_TENSOR_FLOAT32,
-              static_cast<uint32_t>(tensor->dims->size),
-              reinterpret_cast<uint32_t*>(tensor->dims->data),
-              tensor->params.scale, tensor->params.zero_point};
-          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
-          augmented_inputs.push_back(next_id);
-          model_state_inputs->push_back(next_id);
-          model_state_outputs->push_back(tensor_id);
-          next_id++;
-        };
-    auto check_and_add_activation = [&add_scalar_int32](int activation) {
-      if (activation > kTfLiteActRelu6) {
-        logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
-        return kTfLiteError;
-      }
-      add_scalar_int32(activation);
-      return kTfLiteOk;
-    };
-
-    auto add_add_params = [&add_scalar_int32](void* data) {
-      auto* builtin = reinterpret_cast<TfLiteAddParams*>(data);
-      if (builtin->activation > kTfLiteActRelu6) {
-        logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
-        return kTfLiteError;
-      }
-      add_scalar_int32(builtin->activation);
-      return kTfLiteOk;
-    };
-
-    auto add_pooling_params = [&add_scalar_int32,
-                               &check_and_add_activation](void* data) {
-      auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
-      add_scalar_int32(builtin->padding);
-      add_scalar_int32(builtin->stride_width);
-      add_scalar_int32(builtin->stride_height);
-      add_scalar_int32(builtin->filter_width);
-      add_scalar_int32(builtin->filter_height);
-      return check_and_add_activation(builtin->activation);
-    };
-
-    auto add_convolution_params = [&add_scalar_int32,
-                                   &check_and_add_activation](void* data) {
-      auto builtin = reinterpret_cast<TfLiteConvParams*>(data);
-      add_scalar_int32(builtin->padding);
-      add_scalar_int32(builtin->stride_width);
-      add_scalar_int32(builtin->stride_height);
-      return check_and_add_activation(builtin->activation);
-    };
-
-    auto add_depthwise_conv_params = [&add_scalar_int32,
-                                      &check_and_add_activation](void* data) {
-      auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data);
-      add_scalar_int32(builtin->padding);
-      add_scalar_int32(builtin->stride_width);
-      add_scalar_int32(builtin->stride_height);
-      add_scalar_int32(builtin->depth_multiplier);
-      return check_and_add_activation(builtin->activation);
-    };
-
-    auto add_fully_connected_params = [&check_and_add_activation](void* data) {
-      auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data);
-      return check_and_add_activation(builtin->activation);
-    };
-
-    auto add_concatenation_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data);
-      add_scalar_int32(builtin->axis);
-      if (builtin->activation != kTfLiteActNone) {
-        logError("Concatenation does not support fused activation in NNAPI");
-        return kTfLiteError;
-      }
-      return kTfLiteOk;
-    };
-
-    auto add_softmax_params = [&add_scalar_float32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data);
-      add_scalar_float32(builtin->beta);
-    };
-
-    auto add_space_to_depth_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data);
-      add_scalar_int32(builtin->block_size);
-    };
-
-    auto add_lstm_params = [&add_scalar_int32,
-                            &add_scalar_float32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data);
-      add_scalar_int32(builtin->activation);
-      add_scalar_float32(builtin->cell_clip);
-      add_scalar_float32(builtin->proj_clip);
-    };
-
-    // LSTM in NNAPI requires scratch tensor as an output operand.
-    auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model,
-                                            &next_id, &augmented_outputs]() {
-      if (node.temporaries->size == 0) return;
-      int scratch_buffer_index = node.temporaries->data[0];
-      const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index);
-      ANeuralNetworksOperandType operand_type{
-          ANEURALNETWORKS_TENSOR_FLOAT32,
-          static_cast<uint32_t>(tensor->dims->size),
-          reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
-          tensor->params.zero_point};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
-      augmented_outputs.insert(augmented_outputs.begin(), next_id++);
-    };
-
-    auto add_mean_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteReducerParams*>(data);
-      add_scalar_int32(builtin->keep_dims);
-    };
-
-    auto add_svdf_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteSVDFParams*>(data);
-      add_scalar_int32(builtin->rank);
-      add_scalar_int32(builtin->activation);
-    };
-
-    auto add_rnn_params = [&add_scalar_int32](void* data) {
-      auto builtin = reinterpret_cast<TfLiteRNNParams*>(data);
-      add_scalar_int32(builtin->activation);
-    };
-
-    auto add_squeeze_params = [&](void* data) {
-      const auto* builtin = reinterpret_cast<TfLiteSqueezeParams*>(data);
-      // Note that we add the squeeze dimensions even if the dimensions were
-      // unspecified (empty), as NNAPI requires the operand.
-      add_vector_int32(builtin->squeeze_dims,
-                       static_cast<uint32_t>(builtin->num_squeeze_dims));
-    };
-
-    // Handle optional input tensors.
-    auto add_optional_tensors = [&nn_model, &augmented_inputs,
-                                 &next_id](int nn_type) {
-      for (size_t idx = 0; idx < augmented_inputs.size(); idx++) {
-        if (augmented_inputs[idx] == kOptionalTensor) {
-          const std::vector<uint32_t> dim = {0, 0};
-          ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0};
-          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-          CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id,
-                                                        nullptr, 0))
-          augmented_inputs[idx] = next_id++;
-        }
-      }
-    };
-
-    int nnapi_version = 10;
-    ANeuralNetworksOperationType nn_op_type;
-
-    switch (builtin) {
-      case tflite::BuiltinOperator_ADD:
-        nn_op_type = ANEURALNETWORKS_ADD;
-        RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
-        break;
-      case tflite::BuiltinOperator_MUL:
-        nn_op_type = ANEURALNETWORKS_MUL;
-        RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
-        break;
-      case tflite::BuiltinOperator_AVERAGE_POOL_2D:
-        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D;
-        break;
-      case tflite::BuiltinOperator_MAX_POOL_2D:
-        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_MAX_POOL_2D;
-        break;
-      case tflite::BuiltinOperator_L2_POOL_2D:
-        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_L2_POOL_2D;
-        break;
-      case tflite::BuiltinOperator_CONV_2D: {
-        auto builtin = reinterpret_cast<TfLiteConvParams*>(node.builtin_data);
-        if (builtin->dilation_width_factor != 1 ||
-            builtin->dilation_height_factor != 1 || node.inputs->size != 3) {
-          logError("NNAPI does not support dilated Conv2D.");
-          return kTfLiteError;
-        }
-      }
-        RETURN_ERROR_IF_TFLITE_FAILED(
-            add_convolution_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_CONV_2D;
-        break;
-      case tflite::BuiltinOperator_RELU:
-        nn_op_type = ANEURALNETWORKS_RELU;
-        break;
-      case tflite::BuiltinOperator_RELU6:
-        nn_op_type = ANEURALNETWORKS_RELU6;
-        break;
-      case tflite::BuiltinOperator_TANH:
-        nn_op_type = ANEURALNETWORKS_TANH;
-        break;
-      case tflite::BuiltinOperator_FLOOR:
-        nn_op_type = ANEURALNETWORKS_FLOOR;
-        break;
-      case tflite::BuiltinOperator_LOGISTIC:
-        nn_op_type = ANEURALNETWORKS_LOGISTIC;
-        break;
-      case tflite::BuiltinOperator_DEPTHWISE_CONV_2D:
-        RETURN_ERROR_IF_TFLITE_FAILED(
-            add_depthwise_conv_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
-        break;
-      case tflite::BuiltinOperator_CONCATENATION:
-        RETURN_ERROR_IF_TFLITE_FAILED(
-            add_concatenation_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_CONCATENATION;
-        break;
-      case tflite::BuiltinOperator_SOFTMAX:
-        add_softmax_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_SOFTMAX;
-        break;
-      case tflite::BuiltinOperator_FULLY_CONNECTED:
-        RETURN_ERROR_IF_TFLITE_FAILED(
-            add_fully_connected_params(node.builtin_data));
-        nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED;
-        break;
-      case tflite::BuiltinOperator_RESHAPE:
-        if (node.inputs->size != 2) {
-          logError("NNAPI only supports 2-input RESHAPE");
-          return kTfLiteError;
-        }
-        nn_op_type = ANEURALNETWORKS_RESHAPE;
-        // add_reshape_params(node.builtin_data);
-        break;
-      case tflite::BuiltinOperator_SPACE_TO_DEPTH:
-        add_space_to_depth_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
-        break;
-      case tflite::BuiltinOperator_LSTM: {
-        if (node.inputs->size + /* no of params */ 3 != 21) {
-          logError("NNAPI only supports 21-input LSTMs");
-          return kTfLiteError;
-        }
-        duplicate_state_tensor_float32(
-            node.outputs->data[/*kOutputStateTensor*/ 0]);
-        duplicate_state_tensor_float32(
-            node.outputs->data[/*kCellStateTensor*/ 1]);
-        add_lstm_params(node.builtin_data);
-        add_lstm_scratch_tensor_float32();
-        add_optional_tensors(ANEURALNETWORKS_TENSOR_FLOAT32);
-        nn_op_type = ANEURALNETWORKS_LSTM;
-        break;
-      }
-      case tflite::BuiltinOperator_SVDF: {
-        duplicate_state_tensor_float32(node.outputs->data[/*kStateTensor*/ 0]);
-        add_svdf_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_SVDF;
-        break;
-      }
-      case tflite::BuiltinOperator_RNN: {
-        duplicate_state_tensor_float32(
-            node.outputs->data[/*kHiddenStateTensor*/ 0]);
-        add_rnn_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_RNN;
-        break;
-      }
-      case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
-        nn_op_type = ANEURALNETWORKS_EMBEDDING_LOOKUP;
-        break;
-      case tflite::BuiltinOperator_PAD:
-        nnapi_version = 11;  // require NNAPI 1.1
-        nn_op_type = ANEURALNETWORKS_PAD;
-        break;
-      case tflite::BuiltinOperator_MEAN:
-        nnapi_version = 11;  // require NNAPI 1.1
-        add_mean_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_MEAN;
-        break;
-      case tflite::BuiltinOperator_DIV:
-        nnapi_version = 11;  // require NNAPI 1.1
-        nn_op_type = ANEURALNETWORKS_DIV;
-        RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
-            reinterpret_cast<TfLiteDivParams*>(node.builtin_data)->activation));
-        break;
-      case tflite::BuiltinOperator_SUB:
-        nnapi_version = 11;  // require NNAPI 1.1
-        nn_op_type = ANEURALNETWORKS_SUB;
-        RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
-            reinterpret_cast<TfLiteSubParams*>(node.builtin_data)->activation));
-        break;
-      case tflite::BuiltinOperator_SQUEEZE:
-        nnapi_version = 11;  // requires NNAPI 1.1
-        add_squeeze_params(node.builtin_data);
-        nn_op_type = ANEURALNETWORKS_SQUEEZE;
-        break;
-      case tflite::BuiltinOperator_TRANSPOSE:
-        // The permutation input tensor value dictates the output dimensions.
-        // TODO(b/110888333): Support dynamically-sized tensors in delegates.
-        if ((node.inputs->size > 1) &&
-            (interpreter->tensor(node.inputs->data[1])->allocation_type !=
-             kTfLiteMmapRo)) {
-          logError("NNAPI does not yet support dynamic tensors.");
-          return kTfLiteError;
-        }
-        nnapi_version = 11;  // require NNAPI 1.1
-        nn_op_type = ANEURALNETWORKS_TRANSPOSE;
-        break;
-      case tflite::BuiltinOperator_L2_NORMALIZATION:
-        nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION;
-        if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data)
-                ->activation != kTfLiteActNone) {
-          logError(
-              "NNAPI does not support L2Normalization with fused activations");
-          return kTfLiteError;
-        }
-        if ((node.inputs->size > 0) &&
-            (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) {
-          logError("NNAPI only supports input rank 4 for L2Normalization");
-          return kTfLiteError;
-        }
-        break;
-      case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
-        if (interpreter->tensor(node.outputs->data[0])->type !=
-            kTfLiteFloat32) {
-          logError("NNAPI only support HASHTABLE_LOOKUP with float32 output",
-                   builtin);
-          return kTfLiteError;
-        }
-        nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP;
-        break;
-      case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
-      case tflite::BuiltinOperator_LSH_PROJECTION:
-      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
-      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
-      case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
-      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
-      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
-      case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
-      case tflite::BuiltinOperator_PADV2:
-      case tflite::BuiltinOperator_RESIZE_BILINEAR:
-      case tflite::BuiltinOperator_CALL:
-      case tflite::BuiltinOperator_SKIP_GRAM:
-      case tflite::BuiltinOperator_RELU_N1_TO_1:
-      case tflite::BuiltinOperator_GATHER:
-      case tflite::BuiltinOperator_SPACE_TO_BATCH_ND:
-      case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
-      case tflite::BuiltinOperator_TOPK_V2:
-      case tflite::BuiltinOperator_SPLIT:
-      case tflite::BuiltinOperator_STRIDED_SLICE:
-      case tflite::BuiltinOperator_EXP:
-      case tflite::BuiltinOperator_LOG_SOFTMAX:
-      case tflite::BuiltinOperator_DEQUANTIZE:
-      case tflite::BuiltinOperator_DELEGATE:
-      case tflite::BuiltinOperator_CAST:
-      case tflite::BuiltinOperator_PRELU:
-      case tflite::BuiltinOperator_MAXIMUM:
-      case tflite::BuiltinOperator_MINIMUM:
-      case tflite::BuiltinOperator_ARG_MAX:
-      case tflite::BuiltinOperator_ARG_MIN:
-      case tflite::BuiltinOperator_GREATER:
-      case tflite::BuiltinOperator_GREATER_EQUAL:
-      case tflite::BuiltinOperator_LESS:
-      case tflite::BuiltinOperator_LESS_EQUAL:
-      case tflite::BuiltinOperator_NEG:
-      case tflite::BuiltinOperator_SELECT:
-      case tflite::BuiltinOperator_SLICE:
-      case tflite::BuiltinOperator_SIN:
-      case tflite::BuiltinOperator_LOG:
-      case tflite::BuiltinOperator_TRANSPOSE_CONV:
-      case tflite::BuiltinOperator_TILE:
-      case tflite::BuiltinOperator_EXPAND_DIMS:
-      case tflite::BuiltinOperator_SPARSE_TO_DENSE:
-      case tflite::BuiltinOperator_EQUAL:
-      case tflite::BuiltinOperator_NOT_EQUAL:
-      case tflite::BuiltinOperator_SUM:
-      case tflite::BuiltinOperator_REDUCE_MAX:
-      case tflite::BuiltinOperator_REDUCE_MIN:
-      case tflite::BuiltinOperator_REDUCE_PROD:
-      case tflite::BuiltinOperator_SQRT:
-      case tflite::BuiltinOperator_RSQRT:
-      case tflite::BuiltinOperator_SHAPE:
-      case tflite::BuiltinOperator_POW:
-      case tflite::BuiltinOperator_FAKE_QUANT:
-      case tflite::BuiltinOperator_PACK:
-      case tflite::BuiltinOperator_LOGICAL_OR:
-      case tflite::BuiltinOperator_ONE_HOT:
-      case tflite::BuiltinOperator_LOGICAL_AND:
-      case tflite::BuiltinOperator_LOGICAL_NOT:
-      case tflite::BuiltinOperator_UNPACK:
-      case tflite::BuiltinOperator_FLOOR_DIV:
-      case tflite::BuiltinOperator_REDUCE_ANY:
-      case tflite::BuiltinOperator_SQUARE:
-      case tflite::BuiltinOperator_ZEROS_LIKE:
-      case tflite::BuiltinOperator_FILL:
-        logError("Op code %d is currently not delegated to NNAPI", builtin);
-        return kTfLiteError;
-        break;
-      case tflite::BuiltinOperator_CUSTOM:
-        logError("Custom operations are not supported when using NNAPI.");
-        return kTfLiteError;
-        break;
-    }
-
-    if (nnapi_version == 11 && GetAndroidSdkVersionCached() < 28) {
-      logError("Op %d needs NNAPI1.1", builtin);
-      return kTfLiteError;
-    }
-
-    // Add the operation.
-    RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_addOperation(
-        nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
-        augmented_inputs.data(),
-        static_cast<uint32_t>(augmented_outputs.size()),
-        reinterpret_cast<uint32_t*>(augmented_outputs.data())));
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
-  if (nn_model_ && nn_compiled_model_) return model_status_;
-
-  // TODO(aselle): This is not correct. need to handle resize invalidation.
-  if (!nn_model_) {
-    CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
-
-    // Find which tensors should be added to NNAPI. TFLite has temporaries
-    // and RNN back-edges which are are not valid for NNAPI. We look through all
-    // inputs and outputs and mark the mapping in tensor_id_to_nnapi_id with
-    // kOperandIdNotSet. addTensorOperands will replace those with the
-    // corresponding NNAPI operand ids and skip kOperandNotNeeded entries.
-    std::vector<int64_t> tensor_id_to_nnapi_id(interpreter->tensors_size(),
-                                               kOperandNotNeeded);
-    auto set_ids_to_not_set = [&tensor_id_to_nnapi_id](const int* buf,
-                                                       size_t count) {
-      for (int j = 0; j < count; j++) {
-        auto tensor_id = buf[j];
-        if (tensor_id != kOptionalTensor) {
-          tensor_id_to_nnapi_id[tensor_id] = kOperandIdNotSet;
-        }
-      }
-    };
-    for (size_t i = 0; i < interpreter->nodes_size(); i++) {
-      const auto* node_and_registration = interpreter->node_and_registration(i);
-      const TfLiteNode& node = node_and_registration->first;
-      set_ids_to_not_set(node.inputs->data, node.inputs->size);
-      set_ids_to_not_set(node.outputs->data, node.outputs->size);
-    }
-    set_ids_to_not_set(interpreter->inputs().data(),
-                       interpreter->inputs().size());
-    set_ids_to_not_set(interpreter->outputs().data(),
-                       interpreter->outputs().size());
-
-    uint32_t next_id = 0;
-    RETURN_ERROR_IF_TFLITE_FAILED(addTensorOperands(
-        interpreter, nn_model_, &next_id, &tensor_id_to_nnapi_id));
-    RETURN_ERROR_IF_TFLITE_FAILED(
-        AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_,
-                        &model_states_outputs_, tensor_id_to_nnapi_id));
-
-    std::vector<uint32_t> augmented_inputs;
-    MapAndAddTensorIds(interpreter->inputs().data(),
-                       interpreter->inputs().size(), &augmented_inputs,
-                       tensor_id_to_nnapi_id);
-    augmented_inputs.insert(augmented_inputs.end(),
-                            model_states_inputs_.begin(),
-                            model_states_inputs_.end());
-    std::vector<uint32_t> augmented_outputs;
-    MapAndAddTensorIds(interpreter->outputs().data(),
-                       interpreter->outputs().size(), &augmented_outputs,
-                       tensor_id_to_nnapi_id);
-    MapAndAddTensorIds(model_states_outputs_.data(),
-                       model_states_outputs_.size(), &augmented_outputs,
-                       tensor_id_to_nnapi_id);
-
-    CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
-        nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
-        reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
-        static_cast<uint32_t>(augmented_outputs.size()),
-        reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
-
-    if (GetAndroidSdkVersionCached() >= 28) {
-      CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-          nn_model_, interpreter->GetAllowFp16PrecisionForFp32()));
-    }
-    CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
-  }
-  if (!nn_compiled_model_) {
-    CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_));
-    CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_));
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
-  if (!nn_model_) {
-    model_status_ = BuildGraph(interpreter);
-    if (model_status_ != kTfLiteOk) {
-      logError("Failed to build graph for NNAPI");
-    }
-  }
-  if (model_status_ != kTfLiteOk) {
-    return model_status_;
-  }
-
-  ANeuralNetworksExecution* execution = nullptr;
-  CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
-
-  // Currently perform deep copy of input buffer
-  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
-    int input = interpreter->inputs()[i];
-    // TODO(aselle): Is this what we want or do we want input instead?
-    // TODO(aselle): This should be called setInputValue maybe to be cons.
-    TfLiteTensor* tensor = interpreter->tensor(input);
-    CHECK_NN(ANeuralNetworksExecution_setInput(
-        execution, i, nullptr, tensor->data.raw, tensor->bytes));
-  }
-
-  // Tell nn api where to place final data.
-  for (size_t i = 0; i < interpreter->outputs().size(); i++) {
-    int output = interpreter->outputs()[i];
-    TfLiteTensor* tensor = interpreter->tensor(output);
-    CHECK_NN(ANeuralNetworksExecution_setOutput(
-        execution, i, nullptr, tensor->data.raw, tensor->bytes));
-  }
-
-  // The state_out of previous invocation need to be mapped to state_in of
-  // current invocation.
-  for (size_t i = 0; i < model_states_outputs_.size(); i++) {
-    int state_tensor_idx = model_states_outputs_[i];
-    TfLiteTensor* tensor = interpreter->tensor(state_tensor_idx);
-    // Here we are using a deep copy for state_in tensors so that we are not
-    // reading and writing into the same buffer during a invocation.
-    // TODO(miaowang): using double shared buffer to minimize the copies.
-    CHECK_NN(ANeuralNetworksExecution_setInput(
-        execution, i + interpreter->inputs().size(), nullptr, tensor->data.raw,
-        tensor->bytes));
-    // Tell NNAPI where to output the state_out.
-    CHECK_NN(ANeuralNetworksExecution_setOutput(
-        execution, i + interpreter->outputs().size(), nullptr, tensor->data.raw,
-        tensor->bytes));
-  }
-
-  // Currently use blocking compute.
-  ANeuralNetworksEvent* event = nullptr;
-  CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
-  CHECK_NN(ANeuralNetworksEvent_wait(event));
-  ANeuralNetworksEvent_free(event);
-  ANeuralNetworksExecution_free(execution);
-
-#if 0
-  printf("From the NN API:\n");
-  TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
-  if (float* data =
-          interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
-    size_t num = tensor->bytes / sizeof(float);
-    for (float* p = data; p < data + num; p++) {
-      printf(" %f", *p);
-    }
-    printf("\n");
-  }
-#endif
-
-  return kTfLiteOk;
-}
-
-bool NNAPIDelegate::IsSupported() { return NNAPIExists(); }
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/nnapi_delegate.h b/tensorflow/contrib/lite/nnapi_delegate.h
deleted file mode 100644
index 22359d557e61e3ca3a977803276f1c67a2229c22..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/nnapi_delegate.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
-#define TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
-
-#include "tensorflow/contrib/lite/allocation.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-
-class ANeuralNetworksModel;
-class ANeuralNetworksMemory;
-class ANeuralNetworksCompilation;
-
-namespace tflite {
-
-class NNAPIAllocation : public MMAPAllocation {
- public:
-  NNAPIAllocation(const char* filename, ErrorReporter* error_reporter);
-  ~NNAPIAllocation();
-
-  size_t offset(const void* ptr) const {
-    auto signed_offset = reinterpret_cast<const uint8_t*>(ptr) -
-                         reinterpret_cast<const uint8_t*>(mmapped_buffer_);
-
-    return static_cast<size_t>(signed_offset);
-  }
-
-  ANeuralNetworksMemory* memory() const { return handle_; }
-  bool valid() const override { return handle_ != nullptr; }
-
- private:
-  mutable ANeuralNetworksMemory* handle_ = nullptr;
-};
-
-class NNAPIDelegate {
- public:
-  ~NNAPIDelegate();
-
-  // Convert a tflite graph to NNAPI
-  TfLiteStatus BuildGraph(Interpreter* interpreter);
-
-  // Run
-  TfLiteStatus Invoke(Interpreter* interpreter);
-
-  // Whether the current platform supports NNAPI delegation.
-  static bool IsSupported();
-
- private:
-  // The NN API model handle
-  ANeuralNetworksModel* nn_model_ = nullptr;
-  // The NN API compilation handle
-  ANeuralNetworksCompilation* nn_compiled_model_ = nullptr;
-  // Model status
-  TfLiteStatus model_status_ = kTfLiteOk;
-
-  // List of state tensors for LSTM, RNN, SVDF.
-  // NN API does not allow ops to maintain states across multiple
-  // invocations. We need to manually create state input tensors from
-  // corresponding state output tensors of TFLite operations, and map them
-  // correctly.
-  std::vector<int> model_states_inputs_;   // holds NNAPI operand ids
-  std::vector<int> model_states_outputs_;  // holds TFLite tensor ids
-};
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/op_resolver.h b/tensorflow/contrib/lite/op_resolver.h
deleted file mode 100644
index e93134cbdecd58cb11e6be4d777549b7c63f6595..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/op_resolver.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Compatibility shim for moved header location.
-#ifndef TENSORFLOW_CONTRIB_LITE_OP_RESOLVER_H_
-#define TENSORFLOW_CONTRIB_LITE_OP_RESOLVER_H_
-
-#include "tensorflow/contrib/lite/core/api/op_resolver.h"
-#include "tensorflow/contrib/lite/mutable_op_resolver.h"
-
-#endif  // TENSORFLOW_CONTRIB_LITE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/profiling/BUILD b/tensorflow/contrib/lite/profiling/BUILD
deleted file mode 100644
index 1172722f7a70771af73eb07571349e431755471c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/profiling/BUILD
+++ /dev/null
@@ -1,84 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-
-common_copts = [
-    "-Wall",
-] + tflite_copts()
-
-cc_library(
-    name = "profiler",
-    hdrs = ["profiler.h"],
-    copts = common_copts,
-    deps = [":profile_buffer"],
-)
-
-cc_test(
-    name = "profiler_test",
-    srcs = ["profiler_test.cc"],
-    copts = ["-DTFLITE_PROFILING_ENABLED"],
-    defines = ["TFLITE_PROFILING_ENABLED"],
-    deps = [
-        ":profiler",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "profile_buffer",
-    hdrs = ["profile_buffer.h"],
-    copts = common_copts,
-    deps = [":time"],
-)
-
-cc_library(
-    name = "time",
-    srcs = ["time.cc"],
-    hdrs = ["time.h"],
-    copts = common_copts,
-)
-
-cc_library(
-    name = "profile_summarizer",
-    srcs = ["profile_summarizer.cc"],
-    hdrs = ["profile_summarizer.h"],
-    copts = common_copts,
-    deps = [
-        ":profiler",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/core:stats_calculator_portable",
-    ],
-)
-
-cc_test(
-    name = "profile_summarizer_test",
-    srcs = ["profile_summarizer_test.cc"],
-    copts = common_copts,
-    tags = ["no_oss"],
-    deps = [
-        ":profile_summarizer",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/kernels:kernel_util",
-        "//tensorflow/contrib/lite/kernels:test_util",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_test(
-    name = "profile_buffer_test",
-    srcs = ["profile_buffer_test.cc"],
-    copts = ["-DTFLITE_PROFILING_ENABLED"],
-    defines = ["TFLITE_PROFILING_ENABLED"],
-    deps = [
-        ":profile_buffer",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 916788f21500046a33c88016b7d13d7a46430fbe..893ddd78231c8a0d819cbe5776e6873bdab57355 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -1,191 +1,12 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//tensorflow:internal"])
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-
-filegroup(
-    name = "interpreter_test_data",
-    srcs = glob(["**/testdata/*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-py_library(
-    name = "interpreter",
-    srcs = [
-        "interpreter.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/contrib/lite/python/interpreter_wrapper:tensorflow_wrap_interpreter_wrapper",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_test(
-    name = "interpreter_test",
-    srcs = ["interpreter_test.py"],
-    data = [":interpreter_test_data"],
-    srcs_version = "PY2AND3",
-    tags = ["no_oss"],
-    deps = [
-        ":interpreter",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_binary(
-    name = "tflite_convert",
-    srcs = ["tflite_convert.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":lite",
-    ],
-)
+licenses(["notice"])
 
+# DO NOT USE THIS TARGET. TensorFlow Lite has moved to tensorflow/lite.
 py_library(
     name = "lite",
-    srcs = ["lite.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":convert",
-        ":convert_saved_model",
-        ":interpreter",
-        ":lite_constants",
-        ":op_hint",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python/saved_model:constants",
-        "//tensorflow/python/saved_model:loader",
-        "//tensorflow/python/tools:freeze_graph_lib",
-    ],
-)
-
-py_test(
-    name = "lite_test",
-    srcs = ["lite_test.py"],
-    data = ["@tflite_mobilenet_ssd_quant_protobuf//:tflite_graph.pb"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_windows",
-    ],
-    deps = [
-        ":lite",
-    ],
-)
-
-py_library(
-    name = "lite_constants",
-    srcs = ["lite_constants.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
-    ],
-)
-
-py_library(
-    name = "convert",
-    srcs = ["convert.py"],
+    srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        ":lite_constants",
-        "//tensorflow/contrib/lite/toco:model_flags_proto_py",
-        "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
-        "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco",
-        "//tensorflow/contrib/lite/toco/python:toco_from_protos",
-        "//tensorflow/python:platform",
-    ],
-)
-
-py_library(
-    name = "op_hint",
-    srcs = ["op_hint.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/graph_editor:graph_editor_py",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_test(
-    name = "convert_test",
-    srcs = ["convert_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no-internal-py3",
-        "no_oss",
-    ],
-    deps = [
-        ":convert",
-        ":interpreter",
-        ":op_hint",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:session",
-    ],
-)
-
-py_library(
-    name = "convert_saved_model",
-    srcs = ["convert_saved_model.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow/contrib/lite:__subpackages__"],
-    deps = [
-        ":convert",
-        "//tensorflow/contrib/saved_model:saved_model_py",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python:platform",
-        "//tensorflow/python/tools:freeze_graph_lib",
-    ],
-)
-
-py_binary(
-    name = "create_custom_op",
-    srcs = ["create_custom_op.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:platform",
-        "@absl_py//absl/flags",
-    ],
-)
-
-py_test(
-    name = "convert_saved_model_test",
-    srcs = ["convert_saved_model_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_windows",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":convert_saved_model",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:nn",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:session",
-        "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/keras",
-        "//tensorflow/python/ops/losses",
-        "//tensorflow/python/saved_model",
+        "//tensorflow/lite/python:lite",
     ],
 )
diff --git a/tensorflow/contrib/lite/python/__init__.py b/tensorflow/contrib/lite/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..27b1ffb251e76469092eb613d3c381718d8dc4fd
--- /dev/null
+++ b/tensorflow/contrib/lite/python/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow import lite
+
+import warnings as _warnings
+
+WARNING = ("WARNING: TF Lite has moved from tf.contrib.lite to tf.lite. Please "
+           "update your imports. This will be a breaking error in TensorFlow "
+           "version 2.0.")
+_warnings.warn(WARNING, PendingDeprecationWarning)
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
deleted file mode 100644
index 1bf42d7551f98250969b33377a5a22446692fe46..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/python/convert.py
+++ /dev/null
@@ -1,377 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Converts a frozen graph into a TFLite FlatBuffer."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import enum  # pylint: disable=g-bad-import-order
-
-import os as _os
-import platform as _platform
-import subprocess as _subprocess
-import tempfile as _tempfile
-
-from tensorflow.contrib.lite.python import lite_constants
-from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
-from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
-from tensorflow.python.platform import resource_loader as _resource_loader
-from tensorflow.python.util import deprecation
-from tensorflow.python.util.lazy_loader import LazyLoader
-
-# Lazy load since some of the performance benchmark skylark rules
-# break dependencies.
-_toco_python = LazyLoader(
-    "tensorflow_wrap_toco", globals(),
-    "tensorflow.contrib.lite.toco.python."
-    "tensorflow_wrap_toco")
-del LazyLoader
-
-# Find the toco_from_protos binary using the resource loader if using from
-# bazel, otherwise we are in a pip where console_scripts already has
-# the toco_from_protos tool.
-if lite_constants.EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
-  _toco_from_proto_bin = ""
-else:
-  _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
-      "../toco/python/toco_from_protos")
-
-if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin):
-  _toco_from_proto_bin = "toco_from_protos"
-
-
-class ConverterMode(enum.Enum):
-  """Enum class defining the converters available to generate TFLite models.
-
-  WARNING: Experimental interface, subject to change.
-  """
-  # Convert model using TOCO such that all ops are TensorFlow Lite native ops.
-  #
-  # This is the only supported mode for any models that contain operations that
-  # cannot be resolved in TensorFlow.
-  DEFAULT = "DEFAULT"
-
-  # Convert model using TOCO such that only unsupported operations are
-  # represented as TensorFlow ops.
-  # WARNING: Experimental interface, subject to change.
-  TOCO_FLEX = "TOCO_FLEX"
-
-  # Convert model using TOCO such that all operations are represented as
-  # TensorFlow ops.
-  # WARNING: Experimental interface, subject to change.
-  TOCO_FLEX_ALL = "TOCO_FLEX_ALL"
-
-  def __str__(self):
-    return self.value
-
-
-def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
-  """Convert `input_data_str` according to model and toco parameters.
-
-  Unless you know what you are doing consider using
-  the more friendly `tf.contrib.lite.toco_convert`.
-
-  Args:
-    model_flags_str: Serialized proto describing model properties, see
-      `toco/model_flags.proto`.
-    toco_flags_str: Serialized proto describing conversion properties, see
-      `toco/toco_flags.proto`.
-    input_data_str: Input data in serialized form (e.g. a graphdef is common)
-  Returns:
-    Converted model in serialized form (e.g. a TFLITE model is common).
-  Raises:
-    RuntimeError: When conversion fails, an exception is raised with the error
-      message embedded.
-  """
-  # TODO(aselle): When toco does not use fatal errors for failure, we can
-  # switch this on.
-  if not _toco_from_proto_bin:
-    return _toco_python.TocoConvert(
-        model_flags_str, toco_flags_str, input_data_str)
-
-  with _tempfile.NamedTemporaryFile() as fp_toco, \
-           _tempfile.NamedTemporaryFile() as fp_model, \
-           _tempfile.NamedTemporaryFile() as fp_input, \
-           _tempfile.NamedTemporaryFile() as fp_output:
-    fp_model.write(model_flags_str)
-    fp_toco.write(toco_flags_str)
-    fp_input.write(input_data_str)
-    fp_model.flush()
-    fp_toco.flush()
-    fp_input.flush()
-
-    cmd = [
-        _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name,
-        fp_output.name
-    ]
-    cmdline = " ".join(cmd)
-    is_windows = _platform.system() == "Windows"
-    proc = _subprocess.Popen(
-        cmdline,
-        shell=True,
-        stdout=_subprocess.PIPE,
-        stderr=_subprocess.STDOUT,
-        close_fds=not is_windows)
-    stdout, stderr = proc.communicate()
-    exitcode = proc.returncode
-    if exitcode == 0:
-      stuff = fp_output.read()
-      return stuff
-    else:
-      raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" %
-                         (stdout, stderr))
-
-
-def tensor_name(x):
-  return x.name.split(":")[0]
-
-
-def build_toco_convert_protos(input_tensors,
-                              output_tensors,
-                              inference_type=lite_constants.FLOAT,
-                              inference_input_type=None,
-                              input_format=lite_constants.TENSORFLOW_GRAPHDEF,
-                              input_shapes=None,
-                              output_format=lite_constants.TFLITE,
-                              quantized_input_stats=None,
-                              default_ranges_stats=None,
-                              drop_control_dependency=True,
-                              reorder_across_fake_quant=False,
-                              allow_custom_ops=False,
-                              change_concat_input_ranges=False,
-                              post_training_quantize=False,
-                              dump_graphviz_dir=None,
-                              dump_graphviz_video=False,
-                              converter_mode=ConverterMode.DEFAULT,
-                              allow_nonexistent_arrays=False):
-  """Builds protocol buffers describing a conversion of a model using TOCO.
-
-  Typically this is to convert from TensorFlow GraphDef to TFLite, in which
-  case the default `input_format` and `output_format` are sufficient.
-
-  Args:
-    input_tensors: List of input tensors. Type and shape are computed using
-      `foo.get_shape()` and `foo.dtype`.
-    output_tensors: List of output tensors (only .name is used from this).
-    inference_type: Target data type of real-number arrays in the output file.
-      Must be `{FLOAT, QUANTIZED_UINT8}`.  (default FLOAT)
-    inference_input_type: Target data type of real-number input arrays. Allows
-      for a different type for input arrays in the case of quantization.
-      Must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
-    input_format: Type of data to read Currently must be
-      `{TENSORFLOW_GRAPHDEF}`. (default TENSORFLOW_GRAPHDEF)
-    input_shapes: Input array shape. It needs to be a list of the same length
-      as `input_tensors`, or None. (default None)
-    output_format: Output file format. Currently must be `{TFLITE,
-      GRAPHVIZ_DOT}`. (default TFLITE)
-    quantized_input_stats: List of tuples of floats representing the mean and
-      standard deviation. Each tuple maps to the corresponding input tensor.
-      Only need if `inference_input_type` is `QUANTIZED_UINT8`.
-      real_input_value = (quantized_input_value - mean_value) / std_dev_value.
-      (default None)
-    default_ranges_stats: Tuple of integers representing (min, max) range values
-      for all arrays without a specified range. Intended for experimenting with
-      quantization via "dummy quantization". (default None)
-    drop_control_dependency: Boolean indicating whether to drop control
-      dependencies silently. This is due to TFLite not supporting control
-      dependencies. (default True)
-    reorder_across_fake_quant: Boolean indicating whether to reorder FakeQuant
-      nodes in unexpected locations. Used when the location of the FakeQuant
-      nodes is preventing graph transformations necessary to convert the graph.
-      Results in a graph that differs from the quantized training graph,
-      potentially causing differing arithmetic behavior. (default False)
-    allow_custom_ops: Boolean indicating whether to allow custom operations.
-      When false any unknown operation is an error. When true, custom ops are
-      created for any op that is unknown. The developer will need to provide
-      these to the TensorFlow Lite runtime with a custom resolver.
-      (default False)
-    change_concat_input_ranges: Boolean to change behavior of min/max ranges for
-      inputs and outputs of the concat operator for quantized models. Changes
-      the ranges of concat operator overlap when true. (default False)
-    post_training_quantize: Boolean indicating whether to quantize the weights
-      of the converted float model. Model size will be reduced and there will be
-      latency improvements (at the cost of accuracy).
-      (default False)
-    dump_graphviz_dir: Full filepath of folder to dump the graphs at various
-      stages of processing GraphViz .dot files. Preferred over
-      --output_format=GRAPHVIZ_DOT in order to keep the requirements of the
-      output file. (default None)
-    dump_graphviz_video: Boolean indicating whether to dump the graph after
-      every graph transformation. (default False)
-    converter_mode: Experimental flag, subject to change. ConverterMode
-      indicating which converter to use. (default ConverterMode.DEFAULT)
-    allow_nonexistent_arrays: Allow specifying array names that don't exist
-      or are unused in the final graph.  (default False)
-
-  Returns:
-    model_flags, toco_flags: two protocol buffers describing the conversion
-    process.
-
-  Raises:
-    ValueError: If the input tensor type is unknown
-    RuntimeError: If TOCO fails to convert (in which case the runtime error's
-      error text will contain the TOCO error log)
-  """
-  toco = _toco_flags_pb2.TocoFlags()
-  toco.input_format = input_format
-  toco.output_format = output_format
-  toco.inference_type = inference_type
-  if inference_input_type:
-    toco.inference_input_type = inference_input_type
-  else:
-    toco.inference_input_type = toco.inference_type
-  toco.drop_control_dependency = drop_control_dependency
-  toco.reorder_across_fake_quant = reorder_across_fake_quant
-  toco.allow_custom_ops = allow_custom_ops
-  toco.post_training_quantize = post_training_quantize
-  if default_ranges_stats:
-    toco.default_ranges_min = default_ranges_stats[0]
-    toco.default_ranges_max = default_ranges_stats[1]
-  if dump_graphviz_dir:
-    toco.dump_graphviz_dir = dump_graphviz_dir
-  toco.dump_graphviz_include_video = dump_graphviz_video
-  if converter_mode == ConverterMode.TOCO_FLEX:
-    toco.allow_flex_ops = True
-  elif converter_mode == ConverterMode.TOCO_FLEX_ALL:
-    toco.allow_flex_ops = True
-    toco.force_flex_ops = True
-
-  model = _model_flags_pb2.ModelFlags()
-  model.change_concat_input_ranges = change_concat_input_ranges
-  for idx, input_tensor in enumerate(input_tensors):
-    input_array = model.input_arrays.add()
-    if toco.inference_input_type == lite_constants.QUANTIZED_UINT8:
-      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
-    input_array.name = tensor_name(input_tensor)
-    if input_shapes is None:
-      shape = input_tensor.get_shape()
-    else:
-      shape = input_shapes[idx]
-    input_array.shape.dims.extend(map(int, shape))
-
-  for output_tensor in output_tensors:
-    model.output_arrays.append(tensor_name(output_tensor))
-
-  model.allow_nonexistent_arrays = allow_nonexistent_arrays
-
-  return model, toco
-
-
-def toco_convert_graph_def(input_data, input_arrays_with_shape, output_arrays,
-                           *args, **kwargs):
-  """"Convert a model using TOCO.
-
-  This function is used to convert GraphDefs that cannot be loaded into
-  TensorFlow to TFLite. Conversion can be customized by providing arguments
-  that are forwarded to `build_toco_convert_protos` (see documentation for
-  details).
-
-  Args:
-    input_data: Input data (i.e. often `sess.graph_def`),
-    input_arrays_with_shape: Tuple of strings representing input tensor names
-      and list of integers representing input shapes
-      (e.g., [("foo" : [1, 16, 16, 3])]). Use only when graph cannot be loaded
-      into TensorFlow and when `input_tensors` is None. (default None)
-    output_arrays: List of output tensors to freeze graph with. Use only when
-      graph cannot be loaded into TensorFlow and when `output_tensors` is None.
-      (default None)
-    *args: See `build_toco_convert_protos`,
-    **kwargs: See `build_toco_convert_protos`.
-
-  Returns:
-    The converted data. For example if TFLite was the destination, then
-    this will be a tflite flatbuffer in a bytes array.
-
-  Raises:
-    Defined in `build_toco_convert_protos`.
-  """
-  model_flags, toco_flags = build_toco_convert_protos(
-      input_tensors=[], output_tensors=[], *args, **kwargs)
-
-  for idx, (name, shape) in enumerate(input_arrays_with_shape):
-    input_array = model_flags.input_arrays.add()
-    if kwargs["inference_type"] == lite_constants.QUANTIZED_UINT8:
-      input_array.mean_value, input_array.std_value = kwargs[
-          "quantized_input_stats"][idx]
-    input_array.name = name
-    input_array.shape.dims.extend(map(int, shape))
-
-  for name in output_arrays:
-    model_flags.output_arrays.append(name)
-
-  data = toco_convert_protos(model_flags.SerializeToString(),
-                             toco_flags.SerializeToString(),
-                             input_data.SerializeToString())
-  return data
-
-
-def toco_convert_impl(input_data, input_tensors, output_tensors, *args,
-                      **kwargs):
-  """"Convert a model using TOCO.
-
-  Typically this function is used to convert from TensorFlow GraphDef to TFLite.
-  Conversion can be customized by providing arguments that are forwarded to
-  `build_toco_convert_protos` (see documentation for details).
-
-  Args:
-    input_data: Input data (i.e. often `sess.graph_def`),
-    input_tensors: List of input tensors. Type and shape are computed using
-      `foo.get_shape()` and `foo.dtype`.
-    output_tensors: List of output tensors (only .name is used from this).
-    *args: See `build_toco_convert_protos`,
-    **kwargs: See `build_toco_convert_protos`.
-
-  Returns:
-    The converted data. For example if TFLite was the destination, then
-    this will be a tflite flatbuffer in a bytes array.
-
-  Raises:
-    Defined in `build_toco_convert_protos`.
-  """
-  model_flags, toco_flags = build_toco_convert_protos(
-      input_tensors, output_tensors, *args, **kwargs)
-  data = toco_convert_protos(model_flags.SerializeToString(),
-                             toco_flags.SerializeToString(),
-                             input_data.SerializeToString())
-  return data
-
-
-@deprecation.deprecated(None, "Use `lite.TFLiteConverter` instead.")
-def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
-  """Convert a model using TOCO.
-
-  Typically this function is used to convert from TensorFlow GraphDef to TFLite.
-  Conversion can be customized by providing arguments that are forwarded to
-  `build_toco_convert_protos` (see documentation for details). This function has
-  been deprecated. Please use `lite.TFLiteConverter` instead.
-
-  Args:
-    input_data: Input data (i.e. often `sess.graph_def`),
-    input_tensors: List of input tensors. Type and shape are computed using
-      `foo.get_shape()` and `foo.dtype`.
-    output_tensors: List of output tensors (only .name is used from this).
-    *args: See `build_toco_convert_protos`,
-    **kwargs: See `build_toco_convert_protos`.
-
-  Returns:
-    The converted data. For example if TFLite was the destination, then
-    this will be a tflite flatbuffer in a bytes array.
-
-  Raises:
-    Defined in `build_toco_convert_protos`.
-  """
-  return toco_convert_impl(input_data, input_tensors, output_tensors, *args,
-                           **kwargs)
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
deleted file mode 100644
index 92c4ebb2465c2abaa1cefd020e69b2f7ad6a54a5..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ /dev/null
@@ -1,459 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TFLite SavedModel conversion test cases.
-
-  - Tests converting simple SavedModel graph to TFLite FlatBuffer.
-  - Tests converting simple SavedModel graph to frozen graph.
-  - Tests converting MNIST SavedModel to TFLite FlatBuffer.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-from tensorflow.contrib.lite.python import convert_saved_model
-from tensorflow.python import keras
-from tensorflow.python.client import session
-from tensorflow.python.estimator import estimator_lib as estimator
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
-from tensorflow.python.layers import layers
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import saved_model
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training import training as train
-
-
-class TensorFunctionsTest(test_util.TensorFlowTestCase):
-
-  def testGetTensorsValid(self):
-    in_tensor = array_ops.placeholder(
-        shape=[1, 16, 16, 3], dtype=dtypes.float32)
-    _ = in_tensor + in_tensor
-    sess = session.Session()
-
-    tensors = convert_saved_model.get_tensors_from_tensor_names(
-        sess.graph, ["Placeholder"])
-    self.assertEqual("Placeholder:0", tensors[0].name)
-
-  def testGetTensorsInvalid(self):
-    in_tensor = array_ops.placeholder(
-        shape=[1, 16, 16, 3], dtype=dtypes.float32)
-    _ = in_tensor + in_tensor
-    sess = session.Session()
-
-    with self.assertRaises(ValueError) as error:
-      convert_saved_model.get_tensors_from_tensor_names(sess.graph,
-                                                        ["invalid-input"])
-    self.assertEqual("Invalid tensors 'invalid-input' were found.",
-                     str(error.exception))
-
-  def testSetTensorShapeValid(self):
-    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
-    self.assertEqual([None, 3, 5], tensor.shape.as_list())
-
-    convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [5, 3, 5]})
-    self.assertEqual([5, 3, 5], tensor.shape.as_list())
-
-  def testSetTensorShapeNoneValid(self):
-    tensor = array_ops.placeholder(dtype=dtypes.float32)
-    self.assertEqual(None, tensor.shape)
-
-    convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [1, 3, 5]})
-    self.assertEqual([1, 3, 5], tensor.shape.as_list())
-
-  def testSetTensorShapeInvalid(self):
-    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
-    self.assertEqual([None, 3, 5], tensor.shape.as_list())
-
-    convert_saved_model.set_tensor_shapes([tensor],
-                                          {"invalid-input": [5, 3, 5]})
-    self.assertEqual([None, 3, 5], tensor.shape.as_list())
-
-  def testSetTensorShapeEmpty(self):
-    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
-    self.assertEqual([None, 3, 5], tensor.shape.as_list())
-
-    convert_saved_model.set_tensor_shapes([tensor], {})
-    self.assertEqual([None, 3, 5], tensor.shape.as_list())
-
-
-class FreezeSavedModelTest(test_util.TensorFlowTestCase):
-
-  def _createSimpleSavedModel(self, shape):
-    """Create a simple SavedModel on the fly."""
-    saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
-    with session.Session() as sess:
-      in_tensor = array_ops.placeholder(shape=shape, dtype=dtypes.float32)
-      out_tensor = in_tensor + in_tensor
-      inputs = {"x": in_tensor}
-      outputs = {"y": out_tensor}
-      saved_model.simple_save(sess, saved_model_dir, inputs, outputs)
-    return saved_model_dir
-
-  def _createSavedModelTwoInputArrays(self, shape):
-    """Create a simple SavedModel."""
-    saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
-    with session.Session() as sess:
-      in_tensor_1 = array_ops.placeholder(
-          shape=shape, dtype=dtypes.float32, name="inputB")
-      in_tensor_2 = array_ops.placeholder(
-          shape=shape, dtype=dtypes.float32, name="inputA")
-      out_tensor = in_tensor_1 + in_tensor_2
-      inputs = {"x": in_tensor_1, "y": in_tensor_2}
-      outputs = {"z": out_tensor}
-      saved_model.simple_save(sess, saved_model_dir, inputs, outputs)
-    return saved_model_dir
-
-  def _getArrayNames(self, tensors):
-    return [tensor.name for tensor in tensors]
-
-  def _getArrayShapes(self, tensors):
-    dims = []
-    for tensor in tensors:
-      dim_tensor = []
-      for dim in tensor.shape:
-        if isinstance(dim, tensor_shape.Dimension):
-          dim_tensor.append(dim.value)
-        else:
-          dim_tensor.append(dim)
-      dims.append(dim_tensor)
-    return dims
-
-  def _convertSavedModel(self,
-                         saved_model_dir,
-                         input_arrays=None,
-                         input_shapes=None,
-                         output_arrays=None,
-                         tag_set=None,
-                         signature_key=None):
-    if tag_set is None:
-      tag_set = set([tag_constants.SERVING])
-    if signature_key is None:
-      signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-    graph_def, in_tensors, out_tensors = convert_saved_model.freeze_saved_model(
-        saved_model_dir=saved_model_dir,
-        input_arrays=input_arrays,
-        input_shapes=input_shapes,
-        output_arrays=output_arrays,
-        tag_set=tag_set,
-        signature_key=signature_key)
-    return graph_def, in_tensors, out_tensors
-
-  def testSimpleSavedModel(self):
-    """Test a SavedModel."""
-    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
-    _, in_tensors, out_tensors = self._convertSavedModel(saved_model_dir)
-
-    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
-    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
-    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 16, 16, 3]])
-
-  def testSimpleSavedModelWithNoneBatchSizeInShape(self):
-    """Test a SavedModel with None in input tensor's shape."""
-    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
-    _, in_tensors, out_tensors = self._convertSavedModel(saved_model_dir)
-
-    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
-    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
-    self.assertEqual(self._getArrayShapes(in_tensors), [[None, 16, 16, 3]])
-
-  def testSimpleSavedModelWithInvalidSignatureKey(self):
-    """Test a SavedModel that fails due to an invalid signature_key."""
-    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
-    with self.assertRaises(ValueError) as error:
-      self._convertSavedModel(saved_model_dir, signature_key="invalid-key")
-    self.assertEqual(
-        "No 'invalid-key' in the SavedModel's SignatureDefs. "
-        "Possible values are 'serving_default'.", str(error.exception))
-
-  def testSimpleSavedModelWithInvalidOutputArray(self):
-    """Test a SavedModel that fails due to invalid output arrays."""
-    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
-    with self.assertRaises(ValueError) as error:
-      self._convertSavedModel(saved_model_dir, output_arrays=["invalid-output"])
-    self.assertEqual("Invalid tensors 'invalid-output' were found.",
-                     str(error.exception))
-
-  def testSimpleSavedModelWithWrongInputArrays(self):
-    """Test a SavedModel that fails due to invalid input arrays."""
-    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
-
-    # Check invalid input_arrays.
-    with self.assertRaises(ValueError) as error:
-      self._convertSavedModel(saved_model_dir, input_arrays=["invalid-input"])
-    self.assertEqual("Invalid tensors 'invalid-input' were found.",
-                     str(error.exception))
-
-    # Check valid and invalid input_arrays.
-    with self.assertRaises(ValueError) as error:
-      self._convertSavedModel(
-          saved_model_dir, input_arrays=["Placeholder", "invalid-input"])
-    self.assertEqual("Invalid tensors 'invalid-input' were found.",
-                     str(error.exception))
-
-  def testSimpleSavedModelWithCorrectArrays(self):
-    """Test a SavedModel with correct input_arrays and output_arrays."""
-    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
-    _, in_tensors, out_tensors = self._convertSavedModel(
-        saved_model_dir=saved_model_dir,
-        input_arrays=["Placeholder"],
-        output_arrays=["add"])
-
-    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
-    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
-    self.assertEqual(self._getArrayShapes(in_tensors), [[None, 16, 16, 3]])
-
-  def testSimpleSavedModelWithCorrectInputArrays(self):
-    """Test a SavedModel with correct input_arrays and input_shapes."""
-    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
-    _, in_tensors, out_tensors = self._convertSavedModel(
-        saved_model_dir=saved_model_dir,
-        input_arrays=["Placeholder"],
-        input_shapes={"Placeholder": [1, 16, 16, 3]})
-
-    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
-    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
-    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 16, 16, 3]])
-
-  def testTwoInputArrays(self):
-    """Test a simple SavedModel."""
-    saved_model_dir = self._createSavedModelTwoInputArrays(shape=[1, 16, 16, 3])
-
-    _, in_tensors, out_tensors = self._convertSavedModel(
-        saved_model_dir=saved_model_dir, input_arrays=["inputB", "inputA"])
-
-    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
-    self.assertEqual(self._getArrayNames(in_tensors), ["inputA:0", "inputB:0"])
-    self.assertEqual(
-        self._getArrayShapes(in_tensors), [[1, 16, 16, 3], [1, 16, 16, 3]])
-
-  def testSubsetInputArrays(self):
-    """Test a SavedModel with a subset of the input array names of the model."""
-    saved_model_dir = self._createSavedModelTwoInputArrays(shape=[1, 16, 16, 3])
-
-    # Check case where input shape is given.
-    _, in_tensors, out_tensors = self._convertSavedModel(
-        saved_model_dir=saved_model_dir,
-        input_arrays=["inputA"],
-        input_shapes={"inputA": [1, 16, 16, 3]})
-
-    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
-    self.assertEqual(self._getArrayNames(in_tensors), ["inputA:0"])
-    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 16, 16, 3]])
-
-    # Check case where input shape is None.
-    _, in_tensors, out_tensors = self._convertSavedModel(
-        saved_model_dir=saved_model_dir, input_arrays=["inputA"])
-
-    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
-    self.assertEqual(self._getArrayNames(in_tensors), ["inputA:0"])
-    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 16, 16, 3]])
-
-  def testMultipleMetaGraphDef(self):
-    """Test saved model with multiple MetaGraphDefs."""
-    saved_model_dir = os.path.join(self.get_temp_dir(), "savedmodel_two_mgd")
-    builder = saved_model.builder.SavedModelBuilder(saved_model_dir)
-    with session.Session(graph=ops.Graph()) as sess:
-      # MetaGraphDef 1
-      in_tensor = array_ops.placeholder(shape=[1, 28, 28], dtype=dtypes.float32)
-      out_tensor = in_tensor + in_tensor
-      sig_input_tensor = saved_model.utils.build_tensor_info(in_tensor)
-      sig_input_tensor_signature = {"x": sig_input_tensor}
-      sig_output_tensor = saved_model.utils.build_tensor_info(out_tensor)
-      sig_output_tensor_signature = {"y": sig_output_tensor}
-      predict_signature_def = (
-          saved_model.signature_def_utils.build_signature_def(
-              sig_input_tensor_signature, sig_output_tensor_signature,
-              saved_model.signature_constants.PREDICT_METHOD_NAME))
-      signature_def_map = {
-          saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-              predict_signature_def
-      }
-      builder.add_meta_graph_and_variables(
-          sess,
-          tags=[saved_model.tag_constants.SERVING, "additional_test_tag"],
-          signature_def_map=signature_def_map)
-
-      # MetaGraphDef 2
-      builder.add_meta_graph(tags=["tflite"])
-      builder.save(True)
-
-    # Convert to tflite
-    _, in_tensors, out_tensors = self._convertSavedModel(
-        saved_model_dir=saved_model_dir,
-        tag_set=set([saved_model.tag_constants.SERVING, "additional_test_tag"]))
-
-    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
-    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
-    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 28, 28]])
-
-
-class Model(keras.Model):
-  """Model to recognize digits in the MNIST dataset.
-
-  Train and export SavedModel, used for testOnflyTrainMnistSavedModel
-
-  Network structure is equivalent to:
-  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
-  and
-  https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
-
-  But written as a ops.keras.Model using the layers API.
-  """
-
-  def __init__(self, data_format):
-    """Creates a model for classifying a hand-written digit.
-
-    Args:
-      data_format: Either "channels_first" or "channels_last".
-        "channels_first" is typically faster on GPUs while "channels_last" is
-        typically faster on CPUs. See
-        https://www.tensorflow.org/performance/performance_guide#data_formats
-    """
-    super(Model, self).__init__()
-    self._input_shape = [-1, 28, 28, 1]
-
-    self.conv1 = layers.Conv2D(
-        32, 5, padding="same", data_format=data_format, activation=nn.relu)
-    self.conv2 = layers.Conv2D(
-        64, 5, padding="same", data_format=data_format, activation=nn.relu)
-    self.fc1 = layers.Dense(1024, activation=nn.relu)
-    self.fc2 = layers.Dense(10)
-    self.dropout = layers.Dropout(0.4)
-    self.max_pool2d = layers.MaxPooling2D(
-        (2, 2), (2, 2), padding="same", data_format=data_format)
-
-  def __call__(self, inputs, training):
-    """Add operations to classify a batch of input images.
-
-    Args:
-      inputs: A Tensor representing a batch of input images.
-      training: A boolean. Set to True to add operations required only when
-        training the classifier.
-
-    Returns:
-      A logits Tensor with shape [<batch_size>, 10].
-    """
-    y = array_ops.reshape(inputs, self._input_shape)
-    y = self.conv1(y)
-    y = self.max_pool2d(y)
-    y = self.conv2(y)
-    y = self.max_pool2d(y)
-    y = layers.flatten(y)
-    y = self.fc1(y)
-    y = self.dropout(y, training=training)
-    return self.fc2(y)
-
-
-def model_fn(features, labels, mode, params):
-  """The model_fn argument for creating an Estimator."""
-  model = Model(params["data_format"])
-  image = features
-  if isinstance(image, dict):
-    image = features["image"]
-
-  if mode == estimator.ModeKeys.PREDICT:
-    logits = model(image, training=False)
-    predictions = {
-        "classes": math_ops.argmax(logits, axis=1),
-        "probabilities": nn.softmax(logits),
-    }
-    return estimator.EstimatorSpec(
-        mode=estimator.ModeKeys.PREDICT,
-        predictions=predictions,
-        export_outputs={
-            "classify": estimator.export.PredictOutput(predictions)
-        })
-
-  elif mode == estimator.ModeKeys.TRAIN:
-    optimizer = train.AdamOptimizer(learning_rate=1e-4)
-
-    logits = model(image, training=True)
-    loss = losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-    return estimator.EstimatorSpec(
-        mode=estimator.ModeKeys.TRAIN,
-        loss=loss,
-        train_op=optimizer.minimize(loss, train.get_or_create_global_step()))
-
-  elif mode == estimator.ModeKeys.EVAL:
-    logits = model(image, training=False)
-    loss = losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-    return estimator.EstimatorSpec(
-        mode=estimator.ModeKeys.EVAL,
-        loss=loss,
-        eval_metric_ops={
-            "accuracy":
-                ops.metrics.accuracy(
-                    labels=labels, predictions=math_ops.argmax(logits, axis=1)),
-        })
-
-
-def dummy_input_fn():
-  image = random_ops.random_uniform([100, 784])
-  labels = random_ops.random_uniform([100, 1], maxval=9, dtype=dtypes.int32)
-  return image, labels
-
-
-class FreezeSavedModelTestTrainGraph(test_util.TensorFlowTestCase):
-
-  def testTrainedMnistSavedModel(self):
-    """Test mnist SavedModel, trained with dummy data and small steps."""
-    # Build classifier
-    classifier = estimator.Estimator(
-        model_fn=model_fn,
-        params={
-            "data_format": "channels_last"  # tflite format
-        })
-
-    # Train and pred for serving
-    classifier.train(input_fn=dummy_input_fn, steps=2)
-    image = array_ops.placeholder(dtypes.float32, [None, 28, 28])
-    pred_input_fn = estimator.export.build_raw_serving_input_receiver_fn({
-        "image": image,
-    })
-
-    # Export SavedModel
-    saved_model_dir = os.path.join(self.get_temp_dir(), "mnist_savedmodel")
-    classifier.export_savedmodel(saved_model_dir, pred_input_fn)
-
-    # Convert to tflite and test output
-    saved_model_name = os.listdir(saved_model_dir)[0]
-    saved_model_final_dir = os.path.join(saved_model_dir, saved_model_name)
-
-    # TODO(zhixianyan): no need to limit output_arrays to `Softmax'
-    # once b/74205001 fixed and argmax implemented in tflite.
-    result = convert_saved_model.freeze_saved_model(
-        saved_model_dir=saved_model_final_dir,
-        input_arrays=None,
-        input_shapes=None,
-        output_arrays=["Softmax"],
-        tag_set=set([tag_constants.SERVING]),
-        signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY)
-
-    self.assertTrue(result)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/BUILD b/tensorflow/contrib/lite/python/interpreter_wrapper/BUILD
deleted file mode 100644
index 69ee95c320b72b68052c6f76f32c1493707f34b1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/BUILD
+++ /dev/null
@@ -1,31 +0,0 @@
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
-
-cc_library(
-    name = "interpreter_wrapper_lib",
-    srcs = ["interpreter_wrapper.cc"],
-    hdrs = ["interpreter_wrapper.h"],
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//third_party/py/numpy:headers",
-        "//third_party/python_runtime:headers",
-        "@com_google_absl//absl/memory",
-    ],
-)
-
-tf_py_wrap_cc(
-    name = "tensorflow_wrap_interpreter_wrapper",
-    srcs = [
-        "interpreter_wrapper.i",
-    ],
-    deps = [
-        ":interpreter_wrapper_lib",
-        "//third_party/python_runtime:headers",
-    ],
-)
diff --git a/tensorflow/contrib/lite/python/lite_constants.py b/tensorflow/contrib/lite/python/lite_constants.py
deleted file mode 100644
index 195d7a732f337676937c7af5137d4dea84989c03..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/python/lite_constants.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Constants for TFLite."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
-from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
-from tensorflow.python.util.all_util import remove_undocumented
-
-# Enum types from the protobuf promoted to the API
-FLOAT = _types_pb2.FLOAT
-INT32 = _types_pb2.INT32
-INT64 = _types_pb2.INT64
-STRING = _types_pb2.STRING
-QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8
-TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
-TFLITE = _toco_flags_pb2.TFLITE
-GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
-
-# Currently the default mode of operation is to shell to another python process
-# to protect against crashes. However, it breaks some dependent targets because
-# it forces us to depend on an external py_binary. The experimental API doesn't
-# have that drawback.
-EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
-
-
-_allowed_symbols = [
-    "FLOAT",
-    "INT32",
-    "INT64",
-    "STRING",
-    "QUANTIZED_UINT8",
-    "TENSORFLOW_GRAPHDEF",
-    "TFLITE",
-    "GRAPHVIZ_DOT",
-    "EXPERIMENTAL_USE_TOCO_API_DIRECTLY",
-]
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/schema/BUILD b/tensorflow/contrib/lite/schema/BUILD
deleted file mode 100644
index d892466c7a1d9c953644bd4e91a468a2e9702bde..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/schema/BUILD
+++ /dev/null
@@ -1,99 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "py_test")
-load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
-
-py_binary(
-    name = "upgrade_schema",
-    srcs = [
-        "upgrade_schema.py",
-    ],
-    data = [
-        "schema_v0.fbs",
-        "schema_v1.fbs",
-        "schema_v2.fbs",
-        "schema_v3.fbs",
-        "@flatbuffers//:flatc",
-    ],
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:platform",
-    ],
-)
-
-# TODO(wvo): re-enable this test once latest FlatBuffers has landed.
-
-py_test(
-    name = "upgrade_schema_test",
-    size = "small",
-    srcs = ["upgrade_schema_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_oss",
-        "no_pip",
-        "notap",
-    ],
-    deps = [
-        ":upgrade_schema",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-exports_files([
-    "schema_v0.fbs",
-    "schema_v1.fbs",
-    "schema_v2.fbs",
-    "schema_v3.fbs",
-])
-
-load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
-
-# Generic schema for inference on device.
-flatbuffer_cc_library(
-    name = "schema_fbs",
-    srcs = ["schema.fbs"],
-)
-
-# Generic schema for inference on device (but with reflections makes bigger).
-flatbuffer_cc_library(
-    name = "schema_fbs_with_reflection",
-    srcs = ["schema.fbs"],
-    flatc_args = [
-        "--reflect-types",
-        "--reflect-names",
-        "--no-union-value-namespacing",
-        "--gen-object-api",
-    ],
-    gen_reflections = True,
-    out_prefix = "reflection/",
-)
-
-# Schema test to make sure we don't introduce backward incompatible changes
-# to schemas.
-cc_test(
-    name = "flatbuffer_compatibility_test",
-    size = "small",
-    srcs = ["flatbuffer_compatibility_test.cc"],
-    data = [
-        "schema.fbs",
-        "schema_v3.fbs",
-    ],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        "//tensorflow/core:lib_platform",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers//:flatc_library",
-    ],
-)
-
-tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/BUILD b/tensorflow/contrib/lite/schema/builtin_ops_header/BUILD
deleted file mode 100644
index 4a627761daf45b0fddd7b99e8a9c3d0d0ed2ee5e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/BUILD
+++ /dev/null
@@ -1,45 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-cc_library(
-    name = "generator",
-    srcs = ["generator.cc"],
-    hdrs = ["generator.h"],
-    deps = [
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-    ],
-)
-
-cc_binary(
-    name = "generate",
-    srcs = ["generate.cc"],
-    deps = [
-        ":generator",
-    ],
-)
-
-cc_test(
-    name = "generator_test",
-    srcs = ["generator_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":generator",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_test(
-    name = "consistency_test",
-    srcs = ["consistency_test.cc"],
-    data = [
-        "//tensorflow/contrib/lite:builtin_ops.h",
-    ],
-    tags = ["no_oss"],
-    deps = [
-        ":generator",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/README.md b/tensorflow/contrib/lite/schema/builtin_ops_header/README.md
deleted file mode 100644
index f20d4f664e62fdd52e55339e45b9603307a2b671..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# Builtin Ops Header Generator.
-
-This directory contains a code generator to generate a pure C header for
-builtin op definition.
-
-Whenever you add a new builtin op, please execute:
-
-```sh
-bazel run \
-  //tensorflow/contrib/lite/schema/builtin_ops_header:generate > \
-  tensorflow/contrib/lite/builtin_ops.h
-```
diff --git a/tensorflow/contrib/lite/special_rules.bzl b/tensorflow/contrib/lite/special_rules.bzl
deleted file mode 100644
index 54083c49182c707620cbd231b957405cfe24be92..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/special_rules.bzl
+++ /dev/null
@@ -1,6 +0,0 @@
-"""External versions of build rules that differ outside of Google."""
-
-def tflite_portable_test_suite(**kwargs):
-  """This is a no-op outside of Google."""
-  _ignore = [kwargs]
-  pass
diff --git a/tensorflow/contrib/lite/string_util_test.cc b/tensorflow/contrib/lite/string_util_test.cc
deleted file mode 100644
index a583a9184be91b0b51ac3719bf734a1a6cf563ca..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/string_util_test.cc
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/string_util.h"
-
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/testing/util.h"
-
-namespace tflite {
-
-TEST(StringUtil, TestStringUtil) {
-  Interpreter interpreter;
-  interpreter.AddTensors(3);
-
-  TfLiteTensor* t0 = interpreter.tensor(0);
-  t0->type = kTfLiteString;
-  t0->allocation_type = kTfLiteDynamic;
-
-  TfLiteTensor* t1 = interpreter.tensor(1);
-  t1->type = kTfLiteString;
-  t1->allocation_type = kTfLiteDynamic;
-
-  char data[] = {1, 0, 0, 0, 12, 0, 0, 0, 15, 0, 0, 0, 'X', 'Y', 'Z'};
-
-  interpreter.SetTensorParametersReadOnly(2, kTfLiteString, "", {1}, {}, data,
-                                          15);
-  TfLiteTensor* t2 = interpreter.tensor(2);
-  interpreter.AllocateTensors();
-
-  char s0[] = "ABC";
-  string s1 = "DEFG";
-  char s2[] = "";
-
-  // Write strings to tensors
-  DynamicBuffer buf0;
-  buf0.AddString(s0, 3);
-  DynamicBuffer buf1;
-  buf1.AddString(s1.data(), s1.length());
-  buf0.AddString(s2, 0);
-  buf0.WriteToTensor(t0);
-  buf1.WriteToTensor(t1);
-
-  // Read strings from tensors.
-  ASSERT_EQ(GetStringCount(t0), 2);
-  StringRef str_ref;
-  str_ref = GetString(t0, 0);
-  ASSERT_EQ(string(str_ref.str, str_ref.len), "ABC");
-  str_ref = GetString(t0, 1);
-  ASSERT_EQ(string(str_ref.str, str_ref.len), "");
-  ASSERT_EQ(t0->bytes, 19);
-
-  ASSERT_EQ(GetStringCount(t1), 1);
-  str_ref = GetString(t1, 0);
-  ASSERT_EQ(string(str_ref.str, str_ref.len), "DEFG");
-  ASSERT_EQ(t1->bytes, 16);
-
-  ASSERT_EQ(GetStringCount(t2), 1);
-  str_ref = GetString(t2, 0);
-  ASSERT_EQ(string(str_ref.str, str_ref.len), "XYZ");
-  ASSERT_EQ(t2->bytes, 15);
-}
-
-TEST(StringUtil, TestAddJoinedString) {
-  Interpreter interpreter;
-  interpreter.AddTensors(1);
-  TfLiteTensor* t0 = interpreter.tensor(0);
-  t0->type = kTfLiteString;
-  t0->allocation_type = kTfLiteDynamic;
-
-  char s0[] = "ABC";
-  char s1[] = "DEFG";
-  char s2[] = "";
-  char s3[] = "XYZ";
-
-  DynamicBuffer buf;
-  buf.AddJoinedString({{s0, 3}, {s1, 4}, {s2, 0}, {s3, 3}}, ' ');
-  buf.WriteToTensor(t0);
-
-  ASSERT_EQ(GetStringCount(t0), 1);
-  StringRef str_ref;
-  str_ref = GetString(t0, 0);
-  ASSERT_EQ(string(str_ref.str, str_ref.len), "ABC DEFG  XYZ");
-  ASSERT_EQ(t0->bytes, 25);
-}
-
-TEST(StringUtil, TestEmptyList) {
-  Interpreter interpreter;
-  interpreter.AddTensors(1);
-  TfLiteTensor* t0 = interpreter.tensor(0);
-  t0->type = kTfLiteString;
-  t0->allocation_type = kTfLiteDynamic;
-  DynamicBuffer buf;
-  buf.WriteToTensor(t0);
-
-  ASSERT_EQ(GetStringCount(t0), 0);
-  ASSERT_EQ(t0->bytes, 8);
-}
-
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/testdata/add.bin b/tensorflow/contrib/lite/testdata/add.bin
deleted file mode 100644
index aef0fe3d82c9d92dc444076d3b46e05af1923f46..0000000000000000000000000000000000000000
Binary files a/tensorflow/contrib/lite/testdata/add.bin and /dev/null differ
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
deleted file mode 100644
index f0bfec23385151c047aba063cae951334f359222..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/testing/BUILD
+++ /dev/null
@@ -1,394 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//tensorflow/contrib/lite:build_def.bzl",
-    "gen_zip_test",
-    "generated_test_models_all",
-)
-load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-    "py_test",
-)
-
-[gen_zip_test(
-    name = "zip_test_%s" % test_name,
-    size = "large",
-    srcs = ["generated_examples_zip_test.cc"],
-    args = [
-    ] + select({
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            "--zip_file_path=$(location :zip_%s)" % test_name,
-            # TODO(angerson) We may be able to add an external unzip binary instead
-            # of relying on an existing one for OSS builds.
-            "--unzip_binary_path=/usr/bin/unzip",
-        ],
-    }),
-    conversion_mode = conversion_mode,
-    data = [
-        ":zip_%s" % test_name,
-    ],
-    shard_count = 20,
-    tags = [
-        "gen_zip_test",
-        "no_oss",
-        "tflite_not_portable_intentional",
-    ],
-    test_name = test_name,
-    deps = [
-        ":parse_testdata_lib",
-        ":tflite_driver",
-        ":util",
-        "@com_google_googletest//:gtest",
-        "@com_googlesource_code_re2//:re2",
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ] + select({
-        "//conditions:default": [
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-            "//tensorflow/core:test",
-        ],
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib",
-            "//tensorflow/core:android_tensorflow_test_lib",
-        ],
-    }),
-) for conversion_mode, test_name in generated_test_models_all()]
-
-test_suite(
-    name = "generated_zip_tests",
-    tags = [
-        "gen_zip_test",
-    ],
-)
-
-py_binary(
-    name = "generate_examples",
-    srcs = ["generate_examples.py"],
-    data = [
-        "//tensorflow/contrib/lite/toco",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":generate_examples_report",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:graph_util",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "generate_examples_report",
-    srcs = ["generate_examples_report.py"],
-    srcs_version = "PY2AND3",
-)
-
-cc_library(
-    name = "parse_testdata_lib",
-    srcs = ["parse_testdata.cc"],
-    hdrs = ["parse_testdata.h"],
-    deps = [
-        ":message",
-        ":split",
-        ":test_runner",
-        "//tensorflow/contrib/lite:framework",
-    ],
-)
-
-cc_library(
-    name = "message",
-    srcs = ["message.cc"],
-    hdrs = ["message.h"],
-    deps = [":tokenize"],
-)
-
-cc_test(
-    name = "message_test",
-    srcs = ["message_test.cc"],
-    deps = [
-        ":message",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "split",
-    srcs = ["split.cc"],
-    hdrs = ["split.h"],
-    deps = [
-        "//tensorflow/contrib/lite:string",
-    ],
-)
-
-cc_test(
-    name = "split_test",
-    size = "small",
-    srcs = ["split_test.cc"],
-    deps = [
-        ":split",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "join",
-    hdrs = ["join.h"],
-    deps = ["//tensorflow/contrib/lite:string"],
-)
-
-cc_test(
-    name = "join_test",
-    size = "small",
-    srcs = ["join_test.cc"],
-    deps = [
-        ":join",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "tflite_driver",
-    srcs = ["tflite_driver.cc"],
-    hdrs = ["tflite_driver.h"],
-    deps = [
-        ":split",
-        ":test_runner",
-        "//tensorflow/contrib/lite:builtin_op_data",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/delegates/flex:delegate",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ],
-)
-
-tf_cc_test(
-    name = "tflite_driver_test",
-    size = "small",
-    srcs = ["tflite_driver_test.cc"],
-    data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
-    tags = [
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":tflite_driver",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "tokenize",
-    srcs = ["tokenize.cc"],
-    hdrs = ["tokenize.h"],
-    deps = [
-        "//tensorflow/contrib/lite:string",
-    ],
-)
-
-cc_test(
-    name = "tokenize_test",
-    srcs = ["tokenize_test.cc"],
-    deps = [
-        ":tokenize",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "test_runner",
-    hdrs = ["test_runner.h"],
-    deps = [
-        "//tensorflow/contrib/lite:string",
-    ],
-)
-
-cc_library(
-    name = "util",
-    hdrs = ["util.h"],
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string",
-        "//tensorflow/contrib/lite/core/api",
-    ],
-)
-
-cc_test(
-    name = "test_runner_test",
-    srcs = ["test_runner_test.cc"],
-    deps = [
-        ":test_runner",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_binary(
-    name = "nnapi_example",
-    srcs = ["nnapi_example.cc"],
-    deps = [
-        ":parse_testdata_lib",
-        ":tflite_driver",
-        "//tensorflow/contrib/lite/nnapi:nnapi_lib",
-    ],
-)
-
-cc_library(
-    name = "tf_driver",
-    srcs = ["tf_driver.cc"],
-    hdrs = ["tf_driver.h"],
-    deps = [
-        ":join",
-        ":split",
-        ":test_runner",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:tensorflow",
-    ],
-)
-
-cc_test(
-    name = "tf_driver_test",
-    size = "small",
-    srcs = ["tf_driver_test.cc"],
-    data = ["//tensorflow/contrib/lite:testdata/multi_add.pb"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
-    deps = [
-        ":tf_driver",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "generate_testspec",
-    srcs = ["generate_testspec.cc"],
-    hdrs = ["generate_testspec.h"],
-    deps = [
-        ":join",
-        ":split",
-        ":tf_driver",
-        "//tensorflow/contrib/lite:string",
-        "//tensorflow/core:framework",
-    ],
-)
-
-cc_test(
-    name = "generate_testspec_test",
-    size = "small",
-    srcs = ["generate_testspec_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
-    deps = [
-        ":generate_testspec",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "tflite_diff_util",
-    srcs = ["tflite_diff_util.cc"],
-    hdrs = ["tflite_diff_util.h"],
-    deps = [
-        ":generate_testspec",
-        ":parse_testdata_lib",
-        ":tflite_driver",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string",
-    ],
-)
-
-cc_library(
-    name = "tflite_diff_flags",
-    hdrs = ["tflite_diff_flags.h"],
-    deps = [
-        ":split",
-        ":tflite_diff_util",
-    ] + select({
-        "//conditions:default": [
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib",
-        ],
-    }),
-)
-
-tf_cc_test(
-    name = "tflite_diff_example_test",
-    size = "medium",
-    srcs = ["tflite_diff_example_test.cc"],
-    args = [
-        "--tensorflow_model=third_party/tensorflow/contrib/lite/testdata/multi_add.pb",
-        "--tflite_model=third_party/tensorflow/contrib/lite/testdata/multi_add.bin",
-        "--input_layer=a,b,c,d",
-        "--input_layer_type=float,float,float,float",
-        "--input_layer_shape=1,3,4,3:1,3,4,3:1,3,4,3:1,3,4,3",
-        "--output_layer=x,y",
-    ],
-    data = [
-        "//tensorflow/contrib/lite:testdata/multi_add.bin",
-        "//tensorflow/contrib/lite:testdata/multi_add.pb",
-    ],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_oss",  # needs test data
-        "tflite_not_portable",
-    ],
-    deps = [
-        ":tflite_diff_flags",
-        ":tflite_diff_util",
-    ],
-)
-
-cc_binary(
-    name = "tflite_diff",
-    srcs = ["tflite_diff_example_test.cc"],
-    deps = [
-        ":tflite_diff_flags",
-        ":tflite_diff_util",
-    ],
-)
-
-py_binary(
-    name = "model_coverage_lib",
-    srcs = ["//tensorflow/contrib/lite/testing:model_coverage/model_coverage_lib.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    visibility = ["//tensorflow/contrib/lite:__subpackages__"],
-    deps = [
-        "//tensorflow/contrib/lite/python:lite",
-        "//tensorflow/python:platform",
-    ],
-)
-
-py_test(
-    name = "model_coverage_lib_test",
-    srcs = ["//tensorflow/contrib/lite/testing:model_coverage/model_coverage_lib_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",
-        "no_pip",
-        "no_windows",
-        "notap",
-    ],
-    deps = [
-        ":model_coverage_lib",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
-tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/testing/split.cc b/tensorflow/contrib/lite/testing/split.cc
deleted file mode 100644
index 5836f4ff049b70c00d22524a3bf3327074281f3a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/testing/split.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/testing/split.h"
-
-namespace tflite {
-namespace testing {
-
-std::vector<std::pair<size_t, size_t>> SplitToPos(const string& s,
-                                                  const string& delimiter) {
-  std::vector<std::pair<size_t, size_t>> fields;
-  if (delimiter.length() == 0) {
-    fields.emplace_back(0, s.length());
-    return fields;
-  }
-  size_t pos = 0;
-  size_t start = 0;
-  while ((pos = s.find(delimiter, start)) != string::npos) {
-    if (pos != start) {
-      fields.emplace_back(start, pos);
-    }
-    start = pos + delimiter.length();
-  }
-  if (start != s.length()) {
-    fields.emplace_back(start, s.length());
-  }
-  return fields;
-}
-
-}  // namespace testing
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/split_test.cc b/tensorflow/contrib/lite/testing/split_test.cc
deleted file mode 100644
index 76b918cbcd83ef43c52057b84bcc2a8f4ff6b8f7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/testing/split_test.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/testing/split.h"
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-
-namespace tflite {
-namespace testing {
-namespace {
-
-using ::testing::ElementsAre;
-using ::testing::Pair;
-
-TEST(SplitTest, SplitToPos) {
-  EXPECT_THAT(SplitToPos("test;:1-2-3 ;: test", ";:"),
-              ElementsAre(Pair(0, 4), Pair(6, 12), Pair(14, 19)));
-  EXPECT_THAT(SplitToPos("test;:1-2-3 ;: test", ":"),
-              ElementsAre(Pair(0, 5), Pair(6, 13), Pair(14, 19)));
-  EXPECT_THAT(SplitToPos("test", ":"), ElementsAre(Pair(0, 4)));
-  EXPECT_THAT(SplitToPos("test ", ":"), ElementsAre(Pair(0, 5)));
-  EXPECT_THAT(SplitToPos("", ":"), ElementsAre());
-  EXPECT_THAT(SplitToPos("test ", ""), ElementsAre(Pair(0, 5)));
-  EXPECT_THAT(SplitToPos("::::", ":"), ElementsAre());
-}
-
-TEST(SplitTest, SplitString) {
-  EXPECT_THAT(Split<string>("A;B;C", ";"), ElementsAre("A", "B", "C"));
-}
-
-TEST(SplitTest, SplitFloat) {
-  EXPECT_THAT(Split<float>("1.0 B 1e-5", " "), ElementsAre(1.0, 0.0, 1e-5));
-}
-
-TEST(SplitTest, SplitInt) {
-  EXPECT_THAT(Split<int>("1,-1,258", ","), ElementsAre(1, -1, 258));
-}
-
-TEST(SplitTest, SplitUint8) {
-  EXPECT_THAT(Split<uint8_t>("1,-1,258", ","), ElementsAre(1, 255, 2));
-}
-
-TEST(SplitTest, SplitBool) {
-  EXPECT_THAT(Split<bool>("1, 0, 0, 1", ","),
-              ElementsAre(true, false, false, true));
-}
-
-}  // namespace
-}  // namespace testing
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
deleted file mode 100644
index 30381ba028352e32a4220231eda45204889c05fb..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/testing/tf_driver.h"
-
-#include <fstream>
-#include <iostream>
-
-#include "tensorflow/contrib/lite/testing/join.h"
-#include "tensorflow/contrib/lite/testing/split.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-
-namespace tflite {
-namespace testing {
-
-namespace {
-
-tensorflow::Tensor CreateTensor(const tensorflow::DataType type,
-                                const std::vector<int64_t>& dim) {
-  tensorflow::TensorShape shape{tensorflow::gtl::ArraySlice<tensorflow::int64>{
-      reinterpret_cast<const tensorflow::int64*>(dim.data()), dim.size()}};
-  return {type, shape};
-}
-
-template <typename T>
-void FillTensorWithData(tensorflow::Tensor* tensor, const string& csv_values) {
-  auto data = tensor->flat<T>();
-
-  const auto& values = testing::Split<T>(csv_values, ",");
-  for (int i = 0; i < values.size(); i++) {
-    data(i) = values[i];
-  }
-}
-
-template <typename T>
-void FillTensorWithZeros(tensorflow::Tensor* tensor) {
-  auto data = tensor->flat<T>();
-  for (int i = 0; i < tensor->NumElements(); i++) {
-    data(i) = 0;
-  }
-}
-
-template <typename T>
-string TensorDataToCsvString(const tensorflow::Tensor& tensor) {
-  const auto& data = tensor.flat<T>();
-  return Join(data.data(), data.size(), ",");
-}
-
-}  // namespace
-
-TfDriver::TfDriver(const std::vector<string>& input_layer,
-                   const std::vector<string>& input_layer_type,
-                   const std::vector<string>& input_layer_shape,
-                   const std::vector<string>& output_layer)
-    : input_names_(input_layer), output_names_(output_layer) {
-  CHECK_EQ(input_layer.size(), input_layer_type.size());
-  CHECK_EQ(input_layer.size(), input_layer_shape.size());
-
-  input_ids_.resize(input_layer.size());
-  input_tensors_.reserve(input_layer.size());
-  input_types_.resize(input_layer.size());
-  input_shapes_.resize(input_layer.size());
-  for (int i = 0; i < input_layer.size(); i++) {
-    input_ids_[i] = i;
-    input_tensors_[input_layer[i]] = {};
-    CHECK(DataTypeFromString(input_layer_type[i], &input_types_[i]));
-    input_shapes_[i] = Split<int64_t>(input_layer_shape[i], ",");
-  }
-
-  output_ids_.resize(output_layer.size());
-  output_tensors_.reserve(output_layer.size());
-  for (int i = 0; i < output_layer.size(); i++) {
-    output_ids_[i] = i;
-  }
-}
-
-void TfDriver::LoadModel(const string& bin_file_path) {
-  if (!IsValid()) return;
-  std::ifstream model(bin_file_path);
-  if (model.fail()) {
-    Invalidate("Failed to find the model " + bin_file_path);
-    return;
-  }
-
-  tensorflow::GraphDef graphdef;
-  if (!graphdef.ParseFromIstream(&model)) {
-    Invalidate("Failed to parse tensorflow graphdef");
-    return;
-  }
-
-  tensorflow::SessionOptions options;
-  session_.reset(tensorflow::NewSession(options));
-  auto status = session_->Create(graphdef);
-  if (!status.ok()) {
-    Invalidate("Failed to create session. " + status.error_message());
-  }
-}
-
-void TfDriver::SetInput(int id, const string& csv_values) {
-  if (!IsValid()) return;
-
-  auto tensor = CreateTensor(input_types_[id], input_shapes_[id]);
-  switch (input_types_[id]) {
-    case tensorflow::DT_FLOAT: {
-      FillTensorWithData<float>(&tensor, csv_values);
-      break;
-    }
-    case tensorflow::DT_INT32: {
-      FillTensorWithData<int32_t>(&tensor, csv_values);
-      break;
-    }
-    case tensorflow::DT_UINT8: {
-      FillTensorWithData<uint8_t>(&tensor, csv_values);
-      break;
-    }
-    default:
-      fprintf(stderr, "Unsupported type %d in SetInput\n", input_types_[id]);
-      Invalidate("Unsupported tensor data type");
-      return;
-  }
-  input_tensors_[input_names_[id]] = tensor;
-}
-
-void TfDriver::ResetTensor(int id) {
-  if (!IsValid()) return;
-  auto tensor = input_tensors_[input_names_[id]];
-  switch (input_types_[id]) {
-    case tensorflow::DT_FLOAT: {
-      FillTensorWithZeros<float>(&tensor);
-      break;
-    }
-    case tensorflow::DT_INT32: {
-      FillTensorWithZeros<int32_t>(&tensor);
-      break;
-    }
-    default:
-      fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
-      Invalidate("Unsupported tensor data type");
-      return;
-  }
-}
-
-void TfDriver::ReshapeTensor(int id, const string& csv_values) {
-  input_shapes_[id] = Split<int64_t>(csv_values, ",");
-  input_tensors_[input_names_[id]] =
-      CreateTensor(input_types_[id], input_shapes_[id]);
-  ResetTensor(id);
-}
-
-string TfDriver::ReadOutput(int id) {
-  if (!IsValid()) return "";
-  switch (output_tensors_[id].dtype()) {
-    case tensorflow::DT_FLOAT:
-      return TensorDataToCsvString<float>(output_tensors_[id]);
-    case tensorflow::DT_INT32:
-      return TensorDataToCsvString<int32_t>(output_tensors_[id]);
-    case tensorflow::DT_UINT8:
-      return TensorDataToCsvString<uint8_t>(output_tensors_[id]);
-    default:
-      fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
-      Invalidate("Unsupported tensor data type");
-      return "";
-  }
-}
-
-void TfDriver::Invoke() {
-  if (!IsValid()) return;
-  auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
-                              output_names_, {}, &output_tensors_);
-  if (!status.ok()) {
-    Invalidate(
-        "Failed to run input data on graph. Make sure the correct value is "
-        "defined for the input and output arrays.");
-  }
-}
-
-}  // namespace testing
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tf_driver_test.cc b/tensorflow/contrib/lite/testing/tf_driver_test.cc
deleted file mode 100644
index c0faa4676adc3e846ad398bb203b77b99a2ba360..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/testing/tf_driver_test.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/testing/tf_driver.h"
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-
-namespace tflite {
-namespace testing {
-namespace {
-
-using ::testing::ElementsAre;
-
-TEST(TfDriverTest, SimpleTest) {
-  std::unique_ptr<TfDriver> runner(
-      new TfDriver({"a", "b", "c", "d"}, {"float", "float", "float", "float"},
-                   {"1,8,8,3", "1,8,8,3", "1,8,8,3", "1,8,8,3"}, {"x", "y"}));
-
-  runner->LoadModel(
-      "third_party/tensorflow/contrib/lite/testdata/multi_add.pb");
-  EXPECT_TRUE(runner->IsValid()) << runner->GetErrorMessage();
-
-  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
-  ASSERT_THAT(runner->GetOutputs(), ElementsAre(0, 1));
-
-  for (int i : {0, 1, 2, 3}) {
-    runner->ReshapeTensor(i, "1,2,2,1");
-  }
-  ASSERT_TRUE(runner->IsValid());
-
-  runner->SetInput(0, "0.1,0.2,0.3,0.4");
-  runner->SetInput(1, "0.001,0.002,0.003,0.004");
-  runner->SetInput(2, "0.001,0.002,0.003,0.004");
-  runner->SetInput(3, "0.01,0.02,0.03,0.04");
-  runner->ResetTensor(2);
-  runner->Invoke();
-
-  ASSERT_EQ(runner->ReadOutput(0), "0.101,0.202,0.303,0.404");
-  ASSERT_EQ(runner->ReadOutput(1), "0.011,0.022,0.033,0.044");
-}
-
-}  // namespace
-}  // namespace testing
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
deleted file mode 100644
index ef49e6f8bc30a63144521571046d9dcbd22df22e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ /dev/null
@@ -1,308 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/testing/tflite_driver.h"
-
-#include <iostream>
-
-#include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
-#include "tensorflow/contrib/lite/testing/split.h"
-
-namespace tflite {
-namespace testing {
-
-namespace {
-
-// Returns the value in the given position in a tensor.
-template <typename T>
-T Value(const TfLitePtrUnion& data, int index);
-template <>
-float Value(const TfLitePtrUnion& data, int index) {
-  return data.f[index];
-}
-template <>
-int32_t Value(const TfLitePtrUnion& data, int index) {
-  return data.i32[index];
-}
-template <>
-int64_t Value(const TfLitePtrUnion& data, int index) {
-  return data.i64[index];
-}
-template <>
-uint8_t Value(const TfLitePtrUnion& data, int index) {
-  return data.uint8[index];
-}
-template <>
-bool Value(const TfLitePtrUnion& data, int index) {
-  return data.b[index];
-}
-
-template <typename T>
-void SetTensorData(const std::vector<T>& values, TfLitePtrUnion* data) {
-  T* input_ptr = reinterpret_cast<T*>(data->raw);
-  for (const T& v : values) {
-    *input_ptr = v;
-    ++input_ptr;
-  }
-}
-
-}  // namespace
-
-class TfLiteDriver::Expectation {
- public:
-  Expectation() {
-    data_.raw = nullptr;
-    num_elements_ = 0;
-  }
-  ~Expectation() { delete[] data_.raw; }
-  template <typename T>
-  void SetData(const string& csv_values) {
-    const auto& values = testing::Split<T>(csv_values, ",");
-    num_elements_ = values.size();
-    data_.raw = new char[num_elements_ * sizeof(T)];
-    SetTensorData(values, &data_);
-  }
-
-  bool Check(bool verbose, const TfLiteTensor& tensor) {
-    switch (tensor.type) {
-      case kTfLiteFloat32:
-        return TypedCheck<float>(verbose, tensor);
-      case kTfLiteInt32:
-        return TypedCheck<int32_t>(verbose, tensor);
-      case kTfLiteInt64:
-        return TypedCheck<int64_t>(verbose, tensor);
-      case kTfLiteUInt8:
-        return TypedCheck<uint8_t>(verbose, tensor);
-      case kTfLiteBool:
-        return TypedCheck<bool>(verbose, tensor);
-      default:
-        fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
-        return false;
-    }
-  }
-
- private:
-  template <typename T>
-  bool TypedCheck(bool verbose, const TfLiteTensor& tensor) {
-    // TODO(ahentz): must find a way to configure the tolerance.
-    constexpr double kRelativeThreshold = 1e-2f;
-    constexpr double kAbsoluteThreshold = 1e-4f;
-
-    size_t tensor_size = tensor.bytes / sizeof(T);
-
-    if (tensor_size != num_elements_) {
-      std::cerr << "Expected a tensor with " << num_elements_
-                << " elements, got " << tensor_size << std::endl;
-      return false;
-    }
-
-    bool good_output = true;
-    for (int i = 0; i < tensor_size; ++i) {
-      float computed = Value<T>(tensor.data, i);
-      float reference = Value<T>(data_, i);
-      float diff = std::abs(computed - reference);
-      bool error_is_large = false;
-      // For very small numbers, try absolute error, otherwise go with
-      // relative.
-      if (std::abs(reference) < kRelativeThreshold) {
-        error_is_large = (diff > kAbsoluteThreshold);
-      } else {
-        error_is_large = (diff > kRelativeThreshold * std::abs(reference));
-      }
-      if (error_is_large) {
-        good_output = false;
-        if (verbose) {
-          std::cerr << "  index " << i << ": got " << computed
-                    << ", but expected " << reference << std::endl;
-        }
-      }
-    }
-    return good_output;
-  }
-
-  TfLitePtrUnion data_;
-  size_t num_elements_;
-};
-
-TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name)
-    : use_nnapi_(use_nnapi) {
-  if (delegate_name == "FLEX") {
-    delegate_ = FlexDelegate::Create();
-  }
-}
-
-TfLiteDriver::~TfLiteDriver() {}
-
-void TfLiteDriver::AllocateTensors() {
-  if (must_allocate_tensors_) {
-    if (interpreter_->AllocateTensors() != kTfLiteOk) {
-      Invalidate("Failed to allocate tensors");
-      return;
-    }
-    ResetLSTMStateTensors();
-    must_allocate_tensors_ = false;
-  }
-}
-
-void TfLiteDriver::LoadModel(const string& bin_file_path) {
-  if (!IsValid()) return;
-
-  model_ = FlatBufferModel::BuildFromFile(GetFullPath(bin_file_path).c_str());
-  if (!model_) {
-    Invalidate("Failed to mmap model " + bin_file_path);
-    return;
-  }
-  ops::builtin::BuiltinOpResolver builtins;
-  InterpreterBuilder(*model_, builtins)(&interpreter_);
-  if (!interpreter_) {
-    Invalidate("Failed build interpreter");
-    return;
-  }
-  interpreter_->UseNNAPI(use_nnapi_);
-
-  if (delegate_) {
-    if (interpreter_->ModifyGraphWithDelegate(delegate_.get(),
-                                              /*allow_dynamic_tensors=*/true) !=
-        kTfLiteOk) {
-      Invalidate("Unable to the build graph using the delegate");
-      return;
-    }
-  }
-
-  must_allocate_tensors_ = true;
-}
-
-void TfLiteDriver::ResetTensor(int id) {
-  if (!IsValid()) return;
-  auto* tensor = interpreter_->tensor(id);
-  memset(tensor->data.raw, 0, tensor->bytes);
-}
-
-void TfLiteDriver::ReshapeTensor(int id, const string& csv_values) {
-  if (!IsValid()) return;
-  if (interpreter_->ResizeInputTensor(
-          id, testing::Split<int>(csv_values, ",")) != kTfLiteOk) {
-    Invalidate("Failed to resize input tensor " + std::to_string(id));
-    return;
-  }
-  must_allocate_tensors_ = true;
-}
-
-void TfLiteDriver::SetInput(int id, const string& csv_values) {
-  if (!IsValid()) return;
-  auto* tensor = interpreter_->tensor(id);
-  switch (tensor->type) {
-    case kTfLiteFloat32: {
-      const auto& values = testing::Split<float>(csv_values, ",");
-      if (!CheckSizes<float>(tensor->bytes, values.size())) return;
-      SetTensorData(values, &tensor->data);
-      break;
-    }
-    case kTfLiteInt32: {
-      const auto& values = testing::Split<int32_t>(csv_values, ",");
-      if (!CheckSizes<int32_t>(tensor->bytes, values.size())) return;
-      SetTensorData(values, &tensor->data);
-      break;
-    }
-    case kTfLiteInt64: {
-      const auto& values = testing::Split<int64_t>(csv_values, ",");
-      if (!CheckSizes<int64_t>(tensor->bytes, values.size())) return;
-      SetTensorData(values, &tensor->data);
-      break;
-    }
-    case kTfLiteUInt8: {
-      const auto& values = testing::Split<uint8_t>(csv_values, ",");
-      if (!CheckSizes<uint8_t>(tensor->bytes, values.size())) return;
-      SetTensorData(values, &tensor->data);
-      break;
-    }
-    case kTfLiteBool: {
-      const auto& values = testing::Split<bool>(csv_values, ",");
-      if (!CheckSizes<bool>(tensor->bytes, values.size())) return;
-      SetTensorData(values, &tensor->data);
-      break;
-    }
-    default:
-      fprintf(stderr, "Unsupported type %d in SetInput\n", tensor->type);
-      Invalidate("Unsupported tensor data type");
-      return;
-  }
-}
-
-void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
-  if (!IsValid()) return;
-  auto* tensor = interpreter_->tensor(id);
-  if (expected_output_.count(id) != 0) {
-    fprintf(stderr, "Overridden expectation for tensor %d\n", id);
-    Invalidate("Overridden expectation");
-  }
-  expected_output_[id].reset(new Expectation);
-  switch (tensor->type) {
-    case kTfLiteFloat32:
-      expected_output_[id]->SetData<float>(csv_values);
-      break;
-    case kTfLiteInt32:
-      expected_output_[id]->SetData<int32_t>(csv_values);
-      break;
-    case kTfLiteInt64:
-      expected_output_[id]->SetData<int64_t>(csv_values);
-      break;
-    case kTfLiteUInt8:
-      expected_output_[id]->SetData<uint8_t>(csv_values);
-      break;
-    case kTfLiteBool:
-      expected_output_[id]->SetData<bool>(csv_values);
-      break;
-    default:
-      fprintf(stderr, "Unsupported type %d in SetExpectation\n", tensor->type);
-      Invalidate("Unsupported tensor data type");
-      return;
-  }
-}
-
-void TfLiteDriver::Invoke() {
-  if (!IsValid()) return;
-  if (interpreter_->Invoke() != kTfLiteOk) {
-    Invalidate("Failed to invoke interpreter");
-  }
-}
-
-bool TfLiteDriver::CheckResults() {
-  if (!IsValid()) return false;
-  bool success = true;
-  for (const auto& p : expected_output_) {
-    int id = p.first;
-    auto* tensor = interpreter_->tensor(id);
-    if (!p.second->Check(/*verbose=*/false, *tensor)) {
-      // Do not invalidate anything here. Instead, simply output the
-      // differences and return false. Invalidating would prevent all
-      // subsequent invocations from running..
-      std::cerr << "There were errors in invocation '" << GetInvocationId()
-                << "', output tensor '" << id << "':" << std::endl;
-      p.second->Check(/*verbose=*/true, *tensor);
-      success = false;
-      SetOverallSuccess(false);
-    }
-  }
-  expected_output_.clear();
-  return success;
-}
-
-void TfLiteDriver::ResetLSTMStateTensors() {
-  interpreter_->ResetVariableTensors();
-}
-
-}  // namespace testing
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.h b/tensorflow/contrib/lite/testing/tflite_driver.h
deleted file mode 100644
index dc2a4e58773a9e069aa4420907c068039252c418..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/testing/tflite_driver.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
-
-#include <map>
-
-#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/testing/test_runner.h"
-
-namespace tflite {
-namespace testing {
-
-// A test runner that feeds inputs into TF Lite and verifies its outputs.
-class TfLiteDriver : public TestRunner {
- public:
-  explicit TfLiteDriver(bool use_nnapi, const string& delegate = "");
-  ~TfLiteDriver() override;
-
-  void LoadModel(const string& bin_file_path) override;
-  const std::vector<int>& GetInputs() override {
-    return interpreter_->inputs();
-  }
-  const std::vector<int>& GetOutputs() override {
-    return interpreter_->outputs();
-  }
-  void ReshapeTensor(int id, const string& csv_values) override;
-  void AllocateTensors() override;
-  void ResetTensor(int id) override;
-  void SetInput(int id, const string& csv_values) override;
-  void SetExpectation(int id, const string& csv_values) override;
-  void Invoke() override;
-  bool CheckResults() override;
-  string ReadOutput(int id) override { return "no-op"; }
-
- private:
-  void ResetLSTMStateTensors();
-
-  class Expectation;
-
-  std::unique_ptr<FlexDelegate> delegate_;
-  bool use_nnapi_ = false;
-  std::unique_ptr<FlatBufferModel> model_;
-  std::unique_ptr<Interpreter> interpreter_;
-  std::map<int, std::unique_ptr<Expectation>> expected_output_;
-  bool must_allocate_tensors_ = true;
-};
-
-}  // namespace testing
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
diff --git a/tensorflow/contrib/lite/testing/util.h b/tensorflow/contrib/lite/testing/util.h
deleted file mode 100644
index 925791d3908dc569a05f7c6b632448266c08c48f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/testing/util.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
-
-#include <cstdio>
-
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/string.h"
-
-namespace tflite {
-
-// An ErrorReporter that collects error message in a string, in addition
-// to printing to stderr.
-class TestErrorReporter : public ErrorReporter {
- public:
-  int Report(const char* format, va_list args) override {
-    char buffer[1024];
-    int size = vsnprintf(buffer, sizeof(buffer), format, args);
-    fprintf(stderr, "%s", buffer);
-    error_messages_ += buffer;
-    num_calls_++;
-    return size;
-  }
-
-  void Reset() {
-    num_calls_ = 0;
-    error_messages_.clear();
-  }
-
-  int num_calls() const { return num_calls_; }
-  const string& error_messages() const { return error_messages_; }
-
- private:
-  int num_calls_ = 0;
-  string error_messages_;
-};
-
-inline void LogToStderr() {
-#ifdef PLATFORM_GOOGLE
-  FLAGS_logtostderr = true;
-#endif
-}
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
deleted file mode 100644
index 96b88b60fc650981bc880c309a38836d694b3ad0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/BUILD
+++ /dev/null
@@ -1,430 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library_cc",
-    "tf_proto_library_py",
-)
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_binary",
-    "tf_cc_test",
-    "tf_copts",
-)
-
-tf_proto_library_cc(
-    name = "types_proto",
-    srcs = ["types.proto"],
-    visibility = ["//visibility:public"],
-)
-
-tf_proto_library_cc(
-    name = "toco_flags_proto",
-    srcs = ["toco_flags.proto"],
-    protodeps = [":types_proto"],
-    visibility = ["//visibility:public"],
-)
-
-tf_proto_library_cc(
-    name = "model_flags_proto",
-    srcs = ["model_flags.proto"],
-    protodeps = [":types_proto"],
-    visibility = ["//visibility:public"],
-)
-
-tf_proto_library_py(
-    name = "types_proto",
-    srcs = [
-        "types.proto",
-    ],
-    visibility = ["//visibility:public"],
-)
-
-tf_proto_library_py(
-    name = "toco_flags_proto",
-    srcs = [
-        "toco_flags.proto",
-    ],
-    protodeps = [":types_proto"],
-    visibility = ["//visibility:public"],
-)
-
-tf_proto_library_py(
-    name = "model_flags_proto",
-    srcs = [
-        "model_flags.proto",
-    ],
-    protodeps = [":types_proto"],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "tensorflow_core_cc_protos_all",
-    deps = ["//tensorflow/core:protos_all_cc"],
-)
-
-cc_library(
-    name = "runtime",
-    hdrs = [
-        "runtime/common.h",
-        "runtime/types.h",
-    ],
-    linkstatic = 1,
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/contrib/lite/kernels/internal:reference_base",
-        "//tensorflow/contrib/lite/kernels/internal:types",
-    ],
-)
-
-# :model offers the core data structures representing a model (a.k.a. "graph")
-# for tooling purposes (not needed at inference runtime).
-# That includes the top-level Model structure, and the lower-level Operator,
-# Array, Buffer structures, etc.
-cc_library(
-    name = "model",
-    hdrs = [
-        "model.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":model_flags_proto_cc",
-        ":runtime",
-        ":toco_port",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/types:optional",
-    ],
-)
-
-cc_library(
-    name = "toco_graphviz_dump_options",
-    srcs = [
-        "toco_graphviz_dump_options.cc",
-    ],
-    hdrs = [
-        "toco_graphviz_dump_options.h",
-    ],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "toco_cmdline_flags",
-    srcs = [
-        "toco_cmdline_flags.cc",
-    ],
-    hdrs = [
-        "toco_cmdline_flags.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":model_cmdline_flags",
-        ":toco_flags_proto_cc",
-        ":toco_port",
-        ":types_proto_cc",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:optional",
-    ],
-)
-
-cc_library(
-    name = "model_cmdline_flags",
-    srcs = [
-        "model_cmdline_flags.cc",
-    ],
-    hdrs = [
-        "args.h",
-        "model_cmdline_flags.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":model_flags_proto_cc",
-        ":toco_graphviz_dump_options",
-        ":toco_port",
-        ":types_proto_cc",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
-cc_library(
-    name = "toco_port",
-    srcs = [
-        "toco_port.cc",
-    ],
-    hdrs = [
-        "format_port.h",
-        "toco_port.h",
-        "toco_types.h",
-    ],
-    deps = [
-        # Placeholder for internal file dependency.
-        "@protobuf_archive//:protobuf_headers",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-cc_library(
-    name = "graph_transformations",
-    srcs = [
-        "graph_transformations/convert_expanddims_to_reshape.cc",
-        "graph_transformations/convert_pure_conv_to_depthwise.cc",
-        "graph_transformations/convert_reorder_axes.cc",
-        "graph_transformations/convert_squeeze_to_reshape.cc",
-        "graph_transformations/convert_trivial_addn_to_add.cc",
-        "graph_transformations/convert_trivial_pack_to_reshape.cc",
-        "graph_transformations/convert_trivial_tile_to_concat.cc",
-        "graph_transformations/convert_trivial_transpose_to_reshape.cc",
-        "graph_transformations/create_im2col_arrays.cc",
-        "graph_transformations/dequantize.cc",
-        "graph_transformations/drop_fake_quant.cc",
-        "graph_transformations/drop_im2col_arrays.cc",
-        "graph_transformations/ensure_bias_vectors.cc",
-        "graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc",
-        "graph_transformations/fuse_activation_functions.cc",
-        "graph_transformations/fuse_binary_into_following_affine.cc",
-        "graph_transformations/fuse_binary_into_preceding_affine.cc",
-        "graph_transformations/fuse_broadcast_into_following_binary.cc",
-        "graph_transformations/graph_transformations.cc",
-        "graph_transformations/hardcode_min_max.cc",
-        "graph_transformations/identify_dilated_conv.cc",
-        "graph_transformations/identify_l2_normalization.cc",
-        "graph_transformations/identify_l2_pool.cc",
-        "graph_transformations/identify_lstm.cc",
-        "graph_transformations/identify_lstm_merge_inputs.cc",
-        "graph_transformations/identify_lstm_split_inputs.cc",
-        "graph_transformations/identify_prelu.cc",
-        "graph_transformations/identify_relu1.cc",
-        "graph_transformations/lstm_utils.cc",
-        "graph_transformations/make_initial_dequantize_operator.cc",
-        "graph_transformations/merge_reshape_into_preceding_transpose.cc",
-        "graph_transformations/move_binary_operator_before_reshape.cc",
-        "graph_transformations/propagate_activation_function_into_constants.cc",
-        "graph_transformations/propagate_array_data_types.cc",
-        "graph_transformations/propagate_default_min_max.cc",
-        "graph_transformations/propagate_fake_quant_num_bits.cc",
-        "graph_transformations/propagate_fixed_sizes.cc",
-        "graph_transformations/quantization_util.cc",
-        "graph_transformations/quantization_util.h",
-        "graph_transformations/quantize.cc",
-        "graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc",
-        "graph_transformations/remove_final_dequantize_op.cc",
-        "graph_transformations/remove_tensorflow_assert.cc",
-        "graph_transformations/remove_tensorflow_identity.cc",
-        "graph_transformations/remove_trivial_binary.cc",
-        "graph_transformations/remove_trivial_concatenation.cc",
-        "graph_transformations/remove_trivial_concatenation_input.cc",
-        "graph_transformations/remove_trivial_fake_quant.cc",
-        "graph_transformations/remove_trivial_passthrough.cc",
-        "graph_transformations/remove_trivial_passthrough.h",
-        "graph_transformations/remove_trivial_quantized_activation_func.cc",
-        "graph_transformations/remove_trivial_quantized_min_max.cc",
-        "graph_transformations/remove_trivial_reshape.cc",
-        "graph_transformations/remove_trivial_slice.cc",
-        "graph_transformations/remove_unused_op.cc",
-        "graph_transformations/reorder_elementwise_unary.cc",
-        "graph_transformations/reorder_reshape_transpose.cc",
-        "graph_transformations/resolve_batch_normalization.cc",
-        "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
-        "graph_transformations/resolve_constant_binary.cc",
-        "graph_transformations/resolve_constant_concatenation.cc",
-        "graph_transformations/resolve_constant_fake_quant.cc",
-        "graph_transformations/resolve_constant_fill.cc",
-        "graph_transformations/resolve_constant_gather.cc",
-        "graph_transformations/resolve_constant_pack.cc",
-        "graph_transformations/resolve_constant_random_uniform.cc",
-        "graph_transformations/resolve_constant_range.cc",
-        "graph_transformations/resolve_constant_reshape.cc",
-        "graph_transformations/resolve_constant_select.cc",
-        "graph_transformations/resolve_constant_shape_or_rank.cc",
-        "graph_transformations/resolve_constant_slice.cc",
-        "graph_transformations/resolve_constant_strided_slice.cc",
-        "graph_transformations/resolve_constant_tile.cc",
-        "graph_transformations/resolve_constant_transpose.cc",
-        "graph_transformations/resolve_constant_unary.cc",
-        "graph_transformations/resolve_fake_quant_args_from_vars.cc",
-        "graph_transformations/resolve_gather_attributes.cc",
-        "graph_transformations/resolve_multiply_by_zero.cc",
-        "graph_transformations/resolve_pad_attributes.cc",
-        "graph_transformations/resolve_padv2_attributes.cc",
-        "graph_transformations/resolve_reduce_attributes.cc",
-        "graph_transformations/resolve_reorder_axes.cc",
-        "graph_transformations/resolve_reshape_attributes.cc",
-        "graph_transformations/resolve_slice_attributes.cc",
-        "graph_transformations/resolve_space_to_batch_nd_attributes.cc",
-        "graph_transformations/resolve_squeeze_attributes.cc",
-        "graph_transformations/resolve_strided_slice_attributes.cc",
-        "graph_transformations/resolve_tensorflow_concat.cc",
-        "graph_transformations/resolve_tensorflow_matmul.cc",
-        "graph_transformations/resolve_tensorflow_merge.cc",
-        "graph_transformations/resolve_tensorflow_switch.cc",
-        "graph_transformations/resolve_transpose_attributes.cc",
-        "graph_transformations/shuffle_fc_weights.cc",
-        "graph_transformations/unfuse_activation_functions.cc",
-        "graph_transformations/unpartition_embedding_lookup.cc",
-        "graph_transformations/unroll_batch_matmul.cc",
-    ],
-    hdrs = [
-        "graph_transformations/graph_transformations.h",
-        "graph_transformations/lstm_utils.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":model",
-        ":model_flags_proto_cc",
-        ":runtime",
-        ":toco_port",
-        ":tooling_util",
-        "//tensorflow/contrib/lite/kernels/internal:quantization_util",
-        "//tensorflow/contrib/lite/kernels/internal:strided_slice_logic",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
-# :toco_tooling is the library providing the offline tooling functionality
-# exposed by the :toco command-line tool.
-cc_library(
-    name = "toco_tooling",
-    srcs = [
-        "allocate_transient_arrays.cc",
-        "export_tensorflow.cc",
-        "import_tensorflow.cc",
-        "tensorflow_util.cc",
-        "toco_tooling.cc",
-    ],
-    hdrs = [
-        "allocate_transient_arrays.h",
-        "export_tensorflow.h",
-        "import_tensorflow.h",
-        "tensorflow_util.h",
-        "toco_tooling.h",
-    ],
-    copts = tf_copts() + select({
-        "//tensorflow:darwin": ["-DTOCO_SUPPORT_PORTABLE_PROTOS=0"],
-        "//conditions:default": [],
-    }),
-    visibility = ["//visibility:public"],
-    deps = [
-        ":graph_transformations",
-        ":model",
-        ":model_flags_proto_cc",
-        ":types_proto_cc",
-        ":runtime",
-        ":toco_graphviz_dump_options",
-        ":toco_flags_proto_cc",
-        ":toco_port",
-        ":tooling_util",
-        "@protobuf_archive//:protobuf_headers",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
-        "//tensorflow/contrib/lite/toco/tensorflow_graph_matching:resolve_cluster",
-        "//tensorflow/contrib/lite/toco/tflite:export",
-        "//tensorflow/contrib/lite/toco/tflite:import",
-        "//tensorflow/core:core_cpu_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ] + select({
-        # Placeholder for internal darwin rule.
-        "//conditions:default": [],
-    }),
-)
-
-tf_cc_test(
-    name = "import_tensorflow_test",
-    srcs = ["import_tensorflow_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":toco_tooling",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:ops",
-        "//tensorflow/core:protos_all_cc",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "tooling_util",
-    srcs = [
-        "dump_graphviz.cc",
-        "tooling_util.cc",
-    ],
-    hdrs = [
-        "dump_graphviz.h",
-        "tooling_util.h",
-    ],
-    copts = tf_copts(),
-    visibility = ["//visibility:public"],
-    deps = [
-        ":model",
-        ":model_flags_proto_cc",
-        ":runtime",
-        ":toco_flags_proto_cc",
-        ":toco_graphviz_dump_options",
-        ":toco_port",
-        ":types_proto_cc",
-        "//tensorflow/contrib/lite/kernels/internal:types",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/strings",
-        "@com_googlesource_code_re2//:re2",
-        "@protobuf_archive//:protobuf_headers",
-    ],
-)
-
-tf_cc_test(
-    name = "tooling_util_test",
-    srcs = ["tooling_util_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":model",
-        ":tooling_util",
-        "//tensorflow/core:lib",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-# :toco is the main public command-line tool exposing the functionality
-# of the :toco_tooling library.
-tf_cc_binary(
-    name = "toco",
-    srcs = ["toco.cc"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":model",
-        ":model_cmdline_flags",
-        ":model_flags_proto_cc",
-        ":toco_cmdline_flags",
-        ":toco_flags_proto_cc",
-        ":toco_port",
-        ":toco_tooling",
-        ":types_proto_cc",
-        "@com_google_absl//absl/strings",
-        "//tensorflow/core:lib",
-        # We cannot embed the core:ops dependency directly into :toco_tooling as
-        # it can conflict with downstream deps when toco is used as a library.
-        "//tensorflow/core:ops",
-    ],
-)
-
-tf_cc_test(
-    name = "toco_port_test",
-    srcs = ["toco_port_test.cc"],
-    data = [
-        "toco_port_test.cc",
-    ],
-    tags = ["no_oss"],
-    deps = [
-        ":toco_port",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
diff --git a/tensorflow/contrib/lite/toco/README.md b/tensorflow/contrib/lite/toco/README.md
deleted file mode 100644
index 2db6a627ab59604a99cafe3b38df08b70092d989..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/README.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# TOCO: TensorFlow Lite Optimizing Converter
-
-The TensorFlow Lite Optimizing Converter converts TensorFlow graphs into
-TensorFlow Lite graphs. There are additional usages that are also detailed in
-the usage documentation.
-
-## Usage documentation
-
-Usage information is given in these documents:
-
-*   [Command-line glossary](g3doc/cmdline_reference.md)
-*   [Command-line examples](g3doc/cmdline_examples.md)
-*   [Python API examples](g3doc/python_api.md)
-
-## Where the converter fits in the TensorFlow landscape
-
-Once an application developer has a trained TensorFlow model, TOCO will accept
-that model and generate a TensorFlow Lite
-[FlatBuffer](https://google.github.io/flatbuffers/) file. TOCO currently supports
-[SavedModels](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators),
-frozen graphs (models generated via
-[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)),
-and `tf.Keras` model files.  The TensorFlow Lite FlatBuffer file can be shipped
-to client devices, generally mobile devices, where the TensorFlow Lite
-interpreter handles them on-device.  This flow is represented in the diagram
-below.
-
-![drawing](g3doc/toco_landscape.svg)
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
deleted file mode 100644
index aba7536cbd3fbec509390158896e078e6379c848..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ /dev/null
@@ -1,370 +0,0 @@
-# TensorFlow Lite Optimizing Converter command-line examples
-
-This page provides examples on how to use TOCO via command line. It is
-complemented by the following documents:
-
-*   [README](../README.md)
-*   [Command-line glossary](cmdline_reference.md)
-*   [Python API examples](python_api.md)
-
-Table of contents:
-
-*   [Command-line tools](#tools)
-    *   [Converting models prior to TensorFlow 1.9.](#pre-tensorflow-1.9)
-*   [Basic examples](#basic)
-    *   [Convert a TensorFlow GraphDef](#graphdef)
-    *   [Convert a TensorFlow SavedModel](#savedmodel)
-    *   [Convert a tf.keras model](#keras)
-*   [Quantization](#quantization)
-    *   [Convert a TensorFlow GraphDef for quantized inference](#graphdef-quant)
-    *   [Use "dummy-quantization" to try out quantized inference on a float
-        graph](#dummy-quant)
-*   [Specifying input and output arrays](#specifying-input-and-output-arrays)
-    *   [Multiple input arrays](#multiple-input-arrays)
-    *   [Multiple output arrays](#multiple-output-arrays)
-    *   [Specifying subgraphs](#specifying-subgraphs)
-*   [Graph visualizations](#graph-visualizations)
-    *   [Using --output_format=GRAPHVIZ_DOT](#using-output-format-graphviz-dot)
-    *   [Using --dump_graphviz_dir](#using-dump-graphviz-dir)
-    *   [Graph "video" logging](#graph-video-logging)
-    *   [Legend for the graph visualizations](#graphviz-legend)
-
-## Command-line tools <a name="tools"></a>
-
-There are two approaches to running TOCO via command line.
-
-*   `tflite_convert`: Starting from TensorFlow 1.9, the command-line tool
-    `tflite_convert` will be installed as part of the Python package. All of the
-    examples below use `tflite_convert` for simplicity.
-    *   Example: `tflite_convert --output_file=...`
-*   `bazel`: In order to run the latest version of TOCO, [clone the TensorFlow
-    repository](https://www.tensorflow.org/install/source)
-    and use `bazel`. This is the recommended approach for converting models that
-    utilize new features that were not supported by TOCO in TensorFlow 1.9.
-    *   Example: `bazel run
-        //tensorflow/contrib/lite/python:tflite_convert --
-        --output_file=...`
-
-### Converting models prior to TensorFlow 1.9. <a name="pre-tensorflow-1.9"></a>
-
-The recommended approach for using TOCO prior to TensorFlow 1.9 is the [Python
-API](python_api.md#pre-tensorflow-1.9). If a command line tool is desired, the
-`toco` command line tool was available in TensorFlow 1.7. Enter `toco --help` in
-Terminal for additional details on the command-line flags available. There were
-no command line tools in TensorFlow 1.8.
-
-## Basic examples <a name="basic"></a>
-
-The following section shows examples of how to convert a basic float-point model
-from each of the supported data formats into a TensorFlow Lite FlatBuffers.
-
-### Convert a TensorFlow GraphDef <a name="graphdef"></a>
-
-The follow example converts a basic TensorFlow GraphDef (frozen by
-[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py))
-into a TensorFlow Lite FlatBuffer to perform floating-point inference. Frozen
-graphs contain the variables stored in Checkpoint files as Const ops.
-
-```
-curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
-  | tar xzv -C /tmp
-tflite_convert \
-  --output_file=/tmp/foo.tflite \
-  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1
-```
-
-The value for `input_shapes` is automatically determined whenever possible.
-
-### Convert a TensorFlow SavedModel <a name="savedmodel"></a>
-
-The follow example converts a basic TensorFlow SavedModel into a Tensorflow Lite
-FlatBuffer to perform floating-point inference.
-
-```
-tflite_convert \
-  --output_file=/tmp/foo.tflite \
-  --saved_model_dir=/tmp/saved_model
-```
-
-[SavedModel](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators)
-has fewer required flags than frozen graphs due to access to additional data
-contained within the SavedModel. The values for `--input_arrays` and
-`--output_arrays` are an aggregated, alphabetized list of the inputs and outputs
-in the [SignatureDefs](https://www.tensorflow.org/serving/signature_defs) within
-the
-[MetaGraphDef](https://www.tensorflow.org/guide/saved_model#apis_to_build_and_load_a_savedmodel)
-specified by `--saved_model_tag_set`. As with the GraphDef, the value for
-`input_shapes` is automatically determined whenever possible.
-
-There is currently no support for MetaGraphDefs without a SignatureDef or for
-MetaGraphDefs that use the [`assets/`
-directory](https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory).
-
-### Convert a tf.Keras model <a name="keras"></a>
-
-The following example converts a `tf.keras` model into a TensorFlow Lite
-Flatbuffer. The `tf.keras` file must contain both the model and the weights.
-
-```
-tflite_convert \
-  --output_file=/tmp/foo.tflite \
-  --keras_model_file=/tmp/keras_model.h5
-```
-
-## Quantization
-
-### Convert a TensorFlow GraphDef for quantized inference <a name="graphdef-quant"></a>
-
-TOCO is compatible with fixed point quantization models described
-[here](https://www.tensorflow.org/performance/quantization). These are float
-models with
-[`FakeQuant*`](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization)
-ops inserted at the boundaries of fused layers to record min-max range
-information. This generates a quantized inference workload that reproduces the
-quantization behavior that was used during training.
-
-The following command generates a quantized TensorFlow Lite FlatBuffer from a
-"quantized" TensorFlow GraphDef.
-
-```
-tflite_convert \
-  --output_file=/tmp/foo.tflite \
-  --graph_def_file=/tmp/some_quantized_graph.pb \
-  --inference_type=QUANTIZED_UINT8 \
-  --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
-  --mean_values=128 \
-  --std_dev_values=127
-```
-
-### Use \"dummy-quantization\" to try out quantized inference on a float graph <a name="dummy-quant"></a>
-
-In order to evaluate the possible benefit of generating a quantized graph, TOCO
-allows "dummy-quantization" on float graphs. The flags `--default_ranges_min`
-and `--default_ranges_max` accept plausible values for the min-max ranges of the
-values in all arrays that do not have min-max information. "Dummy-quantization"
-will produce lower accuracy but will emulate the performance of a correctly
-quantized model.
-
-The example below contains a model using Relu6 activation functions. Therefore,
-a reasonable guess is that most activation ranges should be contained in [0, 6].
-
-```
-curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
-  | tar xzv -C /tmp
-tflite_convert \
-  --output_file=/tmp/foo.cc \
-  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --inference_type=QUANTIZED_UINT8 \
-  --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
-  --default_ranges_min=0 \
-  --default_ranges_max=6 \
-  --mean_values=128 \
-  --std_dev_values=127
-```
-
-## Specifying input and output arrays
-
-### Multiple input arrays
-
-The flag `input_arrays` takes in a comma-separated list of input arrays as seen
-in the example below. This is useful for models or subgraphs with multiple
-inputs.
-
-```
-curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
-  | tar xzv -C /tmp
-tflite_convert \
-  --graph_def_file=/tmp/inception_v1_2016_08_28_frozen.pb \
-  --output_file=/tmp/foo.tflite \
-  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
-  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
-  --output_arrays=InceptionV1/Logits/Predictions/Reshape_1
-```
-
-Note that `input_shapes` is provided as a colon-separated list. Each input shape
-corresponds to the input array at the same position in the respective list.
-
-### Multiple output arrays
-
-The flag `output_arrays` takes in a comma-separated list of output arrays as
-seen in the example below. This is useful for models or subgraphs with multiple
-outputs.
-
-```
-curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
-  | tar xzv -C /tmp
-tflite_convert \
-  --graph_def_file=/tmp/inception_v1_2016_08_28_frozen.pb \
-  --output_file=/tmp/foo.tflite \
-  --input_arrays=input \
-  --output_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu
-```
-
-### Specifying subgraphs
-
-Any array in the input file can be specified as an input or output array in
-order to extract subgraphs out of an input graph file. TOCO discards the parts
-of the graph outside of the specific subgraph. Use [graph
-visualizations](#graph-visualizations) to identify the input and output arrays
-that make up the desired subgraph.
-
-The follow command shows how to extract a single fused layer out of a TensorFlow
-GraphDef.
-
-```
-curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
-  | tar xzv -C /tmp
-tflite_convert \
-  --graph_def_file=/tmp/inception_v1_2016_08_28_frozen.pb \
-  --output_file=/tmp/foo.pb \
-  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
-  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
-  --output_arrays=InceptionV1/InceptionV1/Mixed_3b/concat_v2
-```
-
-Note that the final representation in TensorFlow Lite FlatBuffers tends to have
-coarser granularity than the very fine granularity of the TensorFlow GraphDef
-representation. For example, while a fully-connected layer is typically
-represented as at least four separate ops in TensorFlow GraphDef (Reshape,
-MatMul, BiasAdd, Relu...), it is typically represented as a single "fused" op
-(FullyConnected) in the converter's optimized representation and in the final
-on-device representation. As the level of granularity gets coarser, some
-intermediate arrays (say, the array between the MatMul and the BiasAdd in the
-TensorFlow GraphDef) are dropped.
-
-When specifying intermediate arrays as `--input_arrays` and `--output_arrays`,
-it is desirable (and often required) to specify arrays that are meant to survive
-in the final form of the graph, after fusing. These are typically the outputs of
-activation functions (since everything in each layer until the activation
-function tends to get fused).
-
-## Logging
-
-
-## Graph visualizations
-
-TOCO can export a graph to the Graphviz Dot format for easy visualization via
-either the `--output_format` flag or the `--dump_graphviz_dir` flag. The
-subsections below outline the use cases for each.
-
-### Using `--output_format=GRAPHVIZ_DOT` <a name="using-output-format-graphviz-dot"></a>
-
-The first way to get a Graphviz rendering is to pass `GRAPHVIZ_DOT` into
-`--output_format`. This results in a plausible visualization of the graph. This
-reduces the requirements that exist during conversion from a TensorFlow GraphDef
-to a TensorFlow Lite FlatBuffer. This may be useful if the conversion to TFLite
-is failing.
-
-```
-curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
-  | tar xzv -C /tmp
-tflite_convert \
-  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.dot \
-  --output_format=GRAPHVIZ_DOT \
-  --input_shape=1,128,128,3 \
-  --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1
-```
-
-The resulting `.dot` file can be rendered into a PDF as follows:
-
-```
-dot -Tpdf -O /tmp/foo.dot
-```
-
-And the resulting `.dot.pdf` can be viewed in any PDF viewer, but we suggest one
-with a good ability to pan and zoom across a very large page. Google Chrome does
-well in that respect.
-
-```
-google-chrome /tmp/foo.dot.pdf
-```
-
-Example PDF files are viewable online in the next section.
-
-### Using `--dump_graphviz_dir`
-
-The second way to get a Graphviz rendering is to pass the `--dump_graphviz_dir`
-flag, specifying a destination directory to dump Graphviz rendering to. Unlike
-the previous approach, this one retains the original output format. This
-provides a visualization of the actual graph resulting from a specific
-conversion process.
-
-```
-curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
-  | tar xzv -C /tmp
-tflite_convert \
-  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.tflite \
-  --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
-  --dump_graphviz_dir=/tmp
-```
-
-This generates a few files in the destination directory. The two most important
-files are `toco_AT_IMPORT.dot` and `/tmp/toco_AFTER_TRANSFORMATIONS.dot`.
-`toco_AT_IMPORT.dot` represents the original graph containing only the
-transformations done at import time. This tends to be a complex visualization
-with limited information about each node. It is useful in situations where a
-conversion command fails.
-
-`toco_AFTER_TRANSFORMATIONS.dot` represents the graph after all transformations
-were applied to it, just before it is exported. Typically, this is a much
-smaller graph with more information about each node.
-
-As before, these can be rendered to PDFs:
-
-```
-dot -Tpdf -O /tmp/toco_*.dot
-```
-
-Sample output files can be seen here:
-
-*   [toco_AT_IMPORT.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AT_IMPORT.dot.pdf)
-*   [toco_AFTER_TRANSFORMATIONS.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AFTER_TRANSFORMATIONS.dot.pdf).
-
-### Graph "video" logging
-
-When `--dump_graphviz_dir` is used, one may additionally pass
-`--dump_graphviz_video`. This causes a graph visualization to be dumped after
-each individual graph transformation, resulting in thousands of files.
-Typically, one would then bisect into these files to understand when a given
-change was introduced in the graph.
-
-### Legend for the graph visualizations <a name="graphviz-legend"></a>
-
-*   Operators are red square boxes with the following hues of red:
-    *   Most operators are
-        <span style="background-color:#db4437;color:white;border:1px;border-style:solid;border-color:black;padding:1px">bright
-        red</span>.
-    *   Some typically heavy operators (e.g. Conv) are rendered in a
-        <span style="background-color:#c53929;color:white;border:1px;border-style:solid;border-color:black;padding:1px">darker
-        red</span>.
-*   Arrays are octogons with the following colors:
-    *   Constant arrays are
-        <span style="background-color:#4285f4;color:white;border:1px;border-style:solid;border-color:black;padding:1px">blue</span>.
-    *   Activation arrays are gray:
-        *   Internal (intermediate) activation arrays are
-            <span style="background-color:#f5f5f5;border:1px;border-style:solid;border-color:black;border:1px;border-style:solid;border-color:black;padding:1px">light
-            gray</span>.
-        *   Those activation arrays that are designated as `--input_arrays` or
-            `--output_arrays` are
-            <span style="background-color:#9e9e9e;border:1px;border-style:solid;border-color:black;padding:1px">dark
-            gray</span>.
-    *   RNN state arrays are green. Because of the way that the converter
-        represents RNN back-edges explicitly, each RNN state is represented by a
-        pair of green arrays:
-        *   The activation array that is the source of the RNN back-edge (i.e.
-            whose contents are copied into the RNN state array after having been
-            computed) is
-            <span style="background-color:#b7e1cd;border:1px;border-style:solid;border-color:black;padding:1px">light
-            green</span>.
-        *   The actual RNN state array is
-            <span style="background-color:#0f9d58;color:white;border:1px;border-style:solid;border-color:black;padding:1px">dark
-            green</span>. It is the destination of the RNN back-edge updating
-            it.
diff --git a/tensorflow/contrib/lite/toco/g3doc/python_api.md b/tensorflow/contrib/lite/toco/g3doc/python_api.md
deleted file mode 100644
index 8c31c3dca865640ee1a60cbcc93b741f2d7d52cf..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/g3doc/python_api.md
+++ /dev/null
@@ -1,274 +0,0 @@
-# TensorFlow Lite Optimizing Converter & Interpreter Python API reference
-
-This page provides examples on how to use TOCO and the TensorFlow Lite
-interpreter via the Python API. It is complemented by the following documents:
-
-*   [README](../README.md)
-*   [Command-line examples](cmdline_examples.md)
-*   [Command-line glossary](cmdline_reference.md)
-
-Table of contents:
-
-*   [High-level overview](#high-level-overview)
-*   [API](#api)
-*   [Basic examples](#basic)
-    *   [Exporting a GraphDef from tf.Session](#basic-graphdef-sess)
-    *   [Exporting a GraphDef from file](#basic-graphdef-file)
-    *   [Exporting a SavedModel](#basic-savedmodel)
-    *   [Exporting a tf.keras File](#basic-keras-file)
-*   [Complex examples](#complex)
-    *   [Exporting a quantized GraphDef](#complex-quant)
-*   [TensorFlow Lite Python interpreter](#interpreter)
-    *   [Using the interpreter from a model file](#interpreter-file)
-    *   [Using the interpreter from model data](#interpreter-data)
-*   [Additional instructions](#additional-instructions)
-    *   [Build from source code](#latest-package)
-    *   [Converting models prior to TensorFlow 1.9.](#pre-tensorflow-1.9)
-
-## High-level overview
-
-While the TensorFlow Lite Optimizing Converter can be used from the command
-line, it is often convenient to use it as part of a Python model build and
-training script. This is so that conversion can be part of your model
-development pipeline. This allows you to know early and often that you are
-designing a model that can be targeted to devices with mobile.
-
-## API
-
-The API for converting TensorFlow models to TensorFlow Lite as of TensorFlow 1.9
-is `tf.contrib.lite.TocoConverter`. The API for calling the Python intepreter is
-`tf.contrib.lite.Interpreter`.
-
-**NOTE**: As of TensorFlow 1.12, the API for converting TensorFlow models to
-TFLite will be renamed to `TFLiteConverter`. `TFLiteConverter` is semantically
-identically to `TocoConverter`. The API is available at
-`tf.contrib.lite.TFLiteConverter` as of the Sept 26 `tf-nightly`.
-
-`TocoConverter` provides class methods based on the original format of the
-model. `TocoConverter.from_session()` is available for GraphDefs.
-`TocoConverter.from_saved_model()` is available for SavedModels.
-`TocoConverter.from_keras_model_file()` is available for `tf.Keras` files.
-Example usages for simple float-point models are shown in
-[Basic Examples](#basic). Examples usages for more complex models is shown in
-[Complex Examples](#complex).
-
-**NOTE**: Currently, `TocoConverter` will cause a fatal error to the Python
-interpreter when the conversion fails. This will be remedied as soon as
-possible.
-
-## Basic examples <a name="basic"></a>
-
-The following section shows examples of how to convert a basic float-point model
-from each of the supported data formats into a TensorFlow Lite FlatBuffers.
-
-### Exporting a GraphDef from tf.Session <a name="basic-graphdef-sess"></a>
-
-The following example shows how to convert a TensorFlow GraphDef into a
-TensorFlow Lite FlatBuffer from a `tf.Session` object.
-
-```python
-import tensorflow as tf
-
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-var = tf.get_variable("weights", dtype=tf.float32, shape=(1, 64, 64, 3))
-val = img + var
-out = tf.identity(val, name="out")
-
-with tf.Session() as sess:
-  sess.run(tf.global_variables_initializer())
-  converter = tf.contrib.lite.TocoConverter.from_session(sess, [img], [out])
-  tflite_model = converter.convert()
-  open("converted_model.tflite", "wb").write(tflite_model)
-```
-
-### Exporting a GraphDef from file <a name="basic-graphdef-file"></a>
-
-The following example shows how to convert a TensorFlow GraphDef into a
-TensorFlow Lite FlatBuffer when the GraphDef is stored in a file. Both `.pb` and
-`.pbtxt` files are accepted.
-
-The example uses
-[Mobilenet_1.0_224](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz).
-The function only supports GraphDefs frozen via
-[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py).
-
-```python
-import tensorflow as tf
-
-graph_def_file = "/path/to/Downloads/mobilenet_v1_1.0_224/frozen_graph.pb"
-input_arrays = ["input"]
-output_arrays = ["MobilenetV1/Predictions/Softmax"]
-
-converter = tf.contrib.lite.TocoConverter.from_frozen_graph(
-  graph_def_file, input_arrays, output_arrays)
-tflite_model = converter.convert()
-open("converted_model.tflite", "wb").write(tflite_model)
-```
-
-### Exporting a SavedModel <a name="basic-savedmodel"></a>
-
-The following example shows how to convert a SavedModel into a TensorFlow Lite
-FlatBuffer.
-
-```python
-import tensorflow as tf
-
-converter = tf.contrib.lite.TocoConverter.from_saved_model(saved_model_dir)
-tflite_model = converter.convert()
-open("converted_model.tflite", "wb").write(tflite_model)
-```
-
-For more complex SavedModels, the optional parameters that can be passed into
-`TocoConverter.from_saved_model()` are `input_arrays`, `input_shapes`,
-`output_arrays`, `tag_set` and `signature_key`. Details of each parameter are
-available by running `help(tf.contrib.lite.TocoConverter)`.
-
-### Exporting a tf.keras File <a name="basic-keras-file"></a>
-
-The following example shows how to convert a `tf.keras` model into a TensorFlow
-Lite FlatBuffer.
-
-```python
-import tensorflow as tf
-
-converter = tf.contrib.lite.TocoConverter.from_keras_model_file("keras_model.h5")
-tflite_model = converter.convert()
-open("converted_model.tflite", "wb").write(tflite_model)
-```
-
-The `tf.keras` file must contain both the model and the weights. A comprehensive
-example including model construction can be seen below.
-
-```python
-import numpy as np
-import tensorflow as tf
-
-# Generate tf.keras model.
-model = tf.keras.models.Sequential()
-model.add(tf.keras.layers.Dense(2, input_shape=(3,)))
-model.add(tf.keras.layers.RepeatVector(3))
-model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(3)))
-model.compile(loss=tf.keras.losses.MSE,
-              optimizer=tf.keras.optimizers.RMSprop(lr=0.0001),
-              metrics=[tf.keras.metrics.categorical_accuracy],
-              sample_weight_mode='temporal')
-
-x = np.random.random((1, 3))
-y = np.random.random((1, 3, 3))
-model.train_on_batch(x, y)
-model.predict(x)
-
-# Save tf.keras model in HDF5 format.
-keras_file = "keras_model.h5"
-tf.keras.models.save_model(model, keras_file)
-
-# Convert to TensorFlow Lite model.
-converter = tf.contrib.lite.TocoConverter.from_keras_model_file(keras_file)
-tflite_model = converter.convert()
-open("converted_model.tflite", "wb").write(tflite_model)
-```
-
-## Complex examples <a name="complex"></a>
-
-For models where the default value of the attributes is not sufficient, the
-attribute's values should be set before calling `convert()`. In order to call
-any constants use `tf.contrib.lite.constants.<CONSTANT_NAME>` as seen below with
-`QUANTIZED_UINT8`. Run `help(tf.contrib.lite.TocoConverter)` in the Python
-terminal for detailed documentation on the attributes.
-
-Although the examples are demonstrated on GraphDefs containing only constants.
-The same logic can be applied irrespective of the input data format.
-
-### Exporting a quantized GraphDef <a name="complex-quant"></a>
-
-The following example shows how to convert a quantized model into a TensorFlow
-Lite FlatBuffer.
-
-```python
-import tensorflow as tf
-
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-const = tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
-val = img + const
-out = tf.fake_quant_with_min_max_args(val, min=0., max=1., name="output")
-
-with tf.Session() as sess:
-  converter = tf.contrib.lite.TocoConverter.from_session(sess, [img], [out])
-  converter.inference_type = tf.contrib.lite.constants.QUANTIZED_UINT8
-  input_arrays = converter.get_input_arrays()
-  converter.quantized_input_stats = {input_arrays[0] : (0., 1.)}  # mean, std_dev
-  tflite_model = converter.convert()
-  open("converted_model.tflite", "wb").write(tflite_model)
-```
-
-## TensorFlow Lite Python interpreter <a name="interpreter"></a>
-
-### Using the interpreter from a model file <a name="interpreter-file"></a>
-
-The following example shows how to use the TensorFlow Lite Python interpreter
-when provided a TensorFlow Lite FlatBuffer file. The example also demonstrates
-how to run inference on random input data. Run
-`help(tf.contrib.lite.Interpreter)` in the Python terminal to get detailed
-documentation on the interpreter.
-
-```python
-import numpy as np
-import tensorflow as tf
-
-# Load TFLite model and allocate tensors.
-interpreter = tf.contrib.lite.Interpreter(model_path="converted_model.tflite")
-interpreter.allocate_tensors()
-
-# Get input and output tensors.
-input_details = interpreter.get_input_details()
-output_details = interpreter.get_output_details()
-
-# Test model on random input data.
-input_shape = input_details[0]['shape']
-input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
-interpreter.set_tensor(input_details[0]['index'], input_data)
-
-interpreter.invoke()
-output_data = interpreter.get_tensor(output_details[0]['index'])
-print(output_data)
-```
-
-### Using the interpreter from model data <a name="interpreter-data"></a>
-
-The following example shows how to use the TensorFlow Lite Python interpreter
-when starting with the TensorFlow Lite Flatbuffer model previously loaded. This
-example shows an end-to-end use case, starting from building the TensorFlow
-model.
-
-```python
-import numpy as np
-import tensorflow as tf
-
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-const = tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
-val = img + const
-out = tf.identity(val, name="out")
-
-with tf.Session() as sess:
-  converter = tf.contrib.lite.TocoConverter.from_session(sess, [img], [out])
-  tflite_model = converter.convert()
-
-# Load TFLite model and allocate tensors.
-interpreter = tf.contrib.lite.Interpreter(model_content=tflite_model)
-interpreter.allocate_tensors()
-```
-
-## Additional instructions
-
-### Build from source code <a name="latest-package"></a>
-
-In order to run the latest version of the TOCO Python API, clone the TensorFlow
-repository, configure the installation, and build and install the pip package.
-Detailed instructions are available
-[here](https://www.tensorflow.org/install/source).
-
-### Converting models prior to TensorFlow 1.9. <a name="pre-tensorflow-1.9"></a>
-
-To use TOCO in TensorFlow 1.7 and TensorFlow 1.8, use the `toco_convert`
-function. Run `help(tf.contrib.lite.toco_convert)` to get details about accepted
-parameters.
diff --git a/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg b/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg
deleted file mode 100644
index 335debde57a1576ecca97f7a7398811973cc7bcc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg
+++ /dev/null
@@ -1 +0,0 @@
-<svg version="1.1" viewBox="0.0 0.0 720.0 540.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l720.0 0l0 540.0l-720.0 0l0 -540.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l720.0 0l0 540.0l-720.0 0z" fill-rule="evenodd"/><path fill="#f3f3f3" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path fill="#434343" d="m338.49512 374.66016q-0.609375 0 -1.171875 -0.140625q-0.546875 -0.15625 -0.96875 -0.421875q-0.25 -0.15625 -0.359375 -0.296875q-0.09375 -0.140625 -0.09375 -0.34375q0 -0.171875 0.09375 -0.28125q0.109375 -0.109375 0.265625 -0.109375q0.171875 0 0.46875 0.1875q0.40625 0.25 0.796875 0.390625q0.390625 0.140625 0.984375 0.140625q0.71875 0 1.109375 -0.25q0.40625 -0.265625 0.40625 -0.734375q0 -0.296875 -0.15625 -0.46875q-0.140625 -0.1875 -0.5 -0.328125q-0.359375 -0.140625 -1.046875 -0.296875q-1.171875 -0.25 -1.6875 -0.671875q-0.5 -0.421875 -0.5 -1.15625q0 -0.578125 0.3125 -1.015625q0.328125 -0.4375 0.890625 -0.6875q0.5625 -0.265625 1.28125 -0.265625q0.53125 0 1.015625 0.140625q0.484375 0.140625 0.859375 0.390625q0.453125 0.328125 0.453125 0.671875q0 0.171875 -0.109375 0.296875q-0.109375 0.125 -0.25 0.125q-0.15625 0 -0.484375 -0.234375q-0.375 -0.234375 -0.703125 -0.359375q-0.328125 -0.140625 -0.828125 -0.140625q-0.625 0 -1.015625 0.28125q-0.375 0.265625 -0.375 0.734375q0 0.296875 0.140625 0.484375q0.140625 0.171875 0.46875 0.3125q0.328125 0.140625 0.9375 0.28125q0.90625 0.1875 1.40625 0.4375q0.5 0.234375 0.703125 0.578125q0.21875 0.34375 0.21875 0.890625q0 0.828125 -0.703125 1.34375q-0.703125 0.515625 -1.859375 0.515625zm9.241241 -1.59375q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.5551147 -0.8125q0.546875 -0.03125 0.546875 0.453125q0 0.21875 -0.125 0.34375q-0.109375 0.125 -0.40625 0.15625l-0.390625 0.03125q-0.890625 0.078125 -1.328125 0.640625q-0.4375 0.546875 -0.4375 1.296875l0 3.234375q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.359375 0.140625q0.140625 0.140625 0.140625 0.375l0 0.75q0.28125 -0.578125 0.796875 -0.890625q0.515625 -0.3125 1.1875 -0.359375l0.1875 -0.015625zm6.157959 0.328125q0.15625 -0.3125 0.46875 -0.3125q0.203125 0 0.359375 0.140625q0.15625 0.125 0.15625 0.328125q0 0.109375 -0.046875 0.203125l-2.59375 5.609375q-0.078125 0.171875 -0.25 0.28125q-0.15625 0.09375 -0.34375 0.09375q-0.171875 0 -0.328125 -0.09375q-0.15625 -0.109375 -0.25 -0.28125l-2.59375 -5.609375q-0.046875 -0.09375 -0.046875 -0.1875q0 -0.203125 0.171875 -0.34375q0.1875 -0.15625 0.390625 -0.15625q0.140625 0 0.265625 0.078125q0.125 0.078125 0.1875 0.234375l2.234375 5.0l2.21875 -4.984375zm7.2099915 4.796875q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.5551453 -0.8125q0.546875 -0.03125 0.546875 0.453125q0 0.21875 -0.125 0.34375q-0.109375 0.125 -0.40625 0.15625l-0.390625 0.03125q-0.890625 0.078125 -1.328125 0.640625q-0.4375 0.546875 -0.4375 1.296875l0 3.234375q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.359375 0.140625q0.140625 0.140625 0.140625 0.375l0 0.75q0.28125 -0.578125 0.796875 -0.890625q0.515625 -0.3125 1.1875 -0.359375l0.1875 -0.015625z" fill-rule="nonzero"/><path fill="#d9d9d9" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path fill="#434343" d="m134.36497 56.831844q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm9.004181 -1.421875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.839676 -0.75q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm5.84729 6.0625q-0.56248474 0 -1.0624847 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.87498474 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0624847 -0.234375 -1.5156097 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.1562347 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.56248474 0 -0.90623474 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84373474 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.2131653 0q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm7.1288147 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm1.970398 6.03125q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.5434265 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm4.721527 0.015625q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm12.222534 -4.9375q0.125 -0.28125 0.390625 -0.28125q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.078125 -0.03125 0.171875l-1.984375 5.046875q-0.078125 0.15625 -0.21875 0.25q-0.140625 0.078125 -0.296875 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-1.65625 -4.21875l-1.640625 4.21875q-0.0625 0.15625 -0.203125 0.25q-0.140625 0.078125 -0.3125 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-1.984375 -5.03125q-0.046875 -0.09375 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.171875 -0.140625 0.359375 -0.140625q0.296875 0 0.40625 0.296875l1.65625 4.421875l1.6875 -4.390625q0.078125 -0.15625 0.203125 -0.234375q0.125 -0.09375 0.265625 -0.09375q0.15625 0 0.28125 0.09375q0.125 0.078125 0.1875 0.234375l1.6875 4.375l1.65625 -4.40625zm12.637604 5.09375q0.046875 0.09375 0.046875 0.203125q0 0.171875 -0.140625 0.296875q-0.140625 0.125 -0.328125 0.125q-0.296875 0 -0.421875 -0.296875l-0.84375 -1.9375l-4.53125 0l-0.859375 1.9375q-0.125 0.296875 -0.421875 0.296875q-0.1875 0 -0.34375 -0.125q-0.140625 -0.125 -0.140625 -0.3125q0 -0.09375 0.046875 -0.1875l3.4375 -7.640625q0.078125 -0.15625 0.21875 -0.234375q0.140625 -0.09375 0.3125 -0.09375q0.171875 0 0.3125 0.09375q0.15625 0.078125 0.21875 0.234375l3.4375 7.640625zm-5.859375 -2.421875l3.8125 0l-1.90625 -4.3125l-1.90625 4.3125zm7.78656 3.046875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm4.9744263 4.34375q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm4.4157715 0.015625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#f3f3f3" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path fill="#434343" d="m409.42255 374.66803q-0.90625 0 -1.609375 -0.40625q-0.6875 -0.421875 -1.078125 -1.171875q-0.375 -0.765625 -0.375 -1.765625q0 -1.0 0.390625 -1.765625q0.40625 -0.78125 1.109375 -1.203125q0.703125 -0.4375 1.625 -0.4375q0.5 0 1.0 0.140625q0.5 0.140625 0.875 0.40625q0.234375 0.171875 0.328125 0.328125q0.109375 0.140625 0.109375 0.328125q0 0.1875 -0.109375 0.3125q-0.09375 0.109375 -0.25 0.109375q-0.09375 0 -0.203125 -0.046875q-0.09375 -0.046875 -0.171875 -0.09375q-0.078125 -0.0625 -0.09375 -0.078125q-0.359375 -0.234375 -0.671875 -0.359375q-0.3125 -0.140625 -0.765625 -0.140625q-0.96875 0 -1.515625 0.671875q-0.53125 0.65625 -0.53125 1.828125q0 1.171875 0.53125 1.8125q0.546875 0.640625 1.515625 0.640625q0.453125 0 0.78125 -0.125q0.328125 -0.140625 0.65625 -0.375q0.15625 -0.09375 0.28125 -0.15625q0.140625 -0.0625 0.234375 -0.0625q0.140625 0 0.234375 0.125q0.109375 0.109375 0.109375 0.296875q0 0.171875 -0.09375 0.3125q-0.09375 0.140625 -0.34375 0.3125q-0.375 0.25 -0.90625 0.40625q-0.515625 0.15625 -1.0625 0.15625zm4.2591553 -0.03125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -8.46875q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 8.46875q0 0.25 -0.15625 0.390625q-0.15625 0.140625 -0.375 0.140625zm3.092102 0q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.234375 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 5.625q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125zm0 -8.09375q-0.3125 0 -0.515625 -0.171875q-0.203125 -0.1875 -0.203125 -0.5q0 -0.296875 0.203125 -0.484375q0.203125 -0.1875 0.515625 -0.1875q0.328125 0 0.515625 0.1875q0.203125 0.1875 0.203125 0.484375q0 0.3125 -0.203125 0.5q-0.1875 0.171875 -0.515625 0.171875zm7.5765076 6.53125q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.6020203 -0.84375q2.328125 0 2.328125 2.578125l0 3.609375q0 0.25 -0.140625 0.390625q-0.140625 0.140625 -0.390625 0.140625q-0.25 0 -0.40625 -0.140625q-0.140625 -0.140625 -0.140625 -0.390625l0 -3.546875q0 -0.90625 -0.359375 -1.3125q-0.34375 -0.421875 -1.125 -0.421875q-0.890625 0 -1.421875 0.546875q-0.53125 0.546875 -0.53125 1.484375l0 3.25q0 0.25 -0.140625 0.390625q-0.140625 0.140625 -0.390625 0.140625q-0.25 0 -0.40625 -0.140625q-0.140625 -0.140625 -0.140625 -0.390625l0 -5.625q0 -0.234375 0.140625 -0.375q0.15625 -0.15625 0.40625 -0.15625q0.234375 0 0.375 0.15625q0.140625 0.140625 0.140625 0.359375l0 0.6875q0.328125 -0.609375 0.890625 -0.921875q0.578125 -0.3125 1.3125 -0.3125zm7.304718 5.875q0.46875 0.03125 0.46875 0.421875q0 0.21875 -0.171875 0.34375q-0.171875 0.109375 -0.5 0.078125l-0.359375 -0.015625q-1.0625 -0.09375 -1.578125 -0.640625q-0.5 -0.5625 -0.5 -1.703125l0 -3.34375l-0.890625 0q-0.234375 0 -0.359375 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.203125 0.125 -0.3125q0.125 -0.125 0.359375 -0.125l0.890625 0l0 -1.515625q0 -0.25 0.140625 -0.390625q0.15625 -0.140625 0.40625 -0.140625q0.234375 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 1.515625l1.484375 0q0.203125 0 0.328125 0.125q0.140625 0.109375 0.140625 0.3125q0 0.1875 -0.140625 0.296875q-0.125 0.109375 -0.328125 0.109375l-1.484375 0l0 3.40625q0 0.734375 0.296875 1.0625q0.296875 0.3125 0.90625 0.359375l0.359375 0.03125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path fill="#000000" d="m237.0857 213.5031q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm4.248535 1.71875q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.417801 3.875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.199051 4.46875q0.203125 0 0.296875 0.109375q0.109375 0.09375 0.109375 0.265625q0 0.1875 -0.109375 0.296875q-0.09375 0.09375 -0.296875 0.09375l-4.203125 0q-0.203125 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.3125q0 -0.1875 0.140625 -0.359375l3.546875 -4.28125l-3.28125 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l4.0625 0q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.3125q0 0.1875 -0.140625 0.359375l-3.5625 4.28125l3.421875 0zm6.2547913 -0.59375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm3.3865662 5.875q-0.171875 0 -0.28125 -0.09375q-0.109375 -0.09375 -0.109375 -0.21875q0 -0.140625 0.109375 -0.234375q0.109375 -0.09375 0.28125 -0.09375l5.21875 0q0.171875 0 0.28125 0.09375q0.109375 0.09375 0.109375 0.234375q0 0.125 -0.109375 0.21875q-0.109375 0.09375 -0.28125 0.09375l-5.21875 0zm11.2500305 -6.609375q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 5.09375q0 1.296875 -0.671875 1.96875q-0.671875 0.671875 -1.984375 0.671875q-1.28125 0 -2.140625 -0.515625q-0.421875 -0.234375 -0.421875 -0.546875q0 -0.171875 0.078125 -0.28125q0.09375 -0.109375 0.234375 -0.109375q0.125 0 0.4375 0.171875q0.421875 0.21875 0.828125 0.34375q0.40625 0.140625 0.96875 0.140625q0.859375 0 1.28125 -0.453125q0.4375 -0.453125 0.4375 -1.3125l0 -1.03125q-0.25 0.5625 -0.78125 0.859375q-0.515625 0.296875 -1.21875 0.296875q-0.765625 0 -1.359375 -0.359375q-0.59375 -0.359375 -0.9375 -1.015625q-0.328125 -0.65625 -0.328125 -1.515625q0 -0.875 0.328125 -1.53125q0.34375 -0.65625 0.9375 -1.015625q0.59375 -0.359375 1.359375 -0.359375q0.6875 0 1.203125 0.296875q0.515625 0.296875 0.78125 0.84375l0 -0.640625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625zm-2.28125 4.984375q0.84375 0 1.3125 -0.546875q0.484375 -0.5625 0.484375 -1.546875q0 -0.984375 -0.46875 -1.53125q-0.46875 -0.5625 -1.328125 -0.5625q-0.84375 0 -1.34375 0.5625q-0.484375 0.546875 -0.484375 1.53125q0 0.984375 0.484375 1.546875q0.5 0.546875 1.34375 0.546875zm7.4695435 -4.984375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.20282 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.331665 6.046875q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm5.2167664 -6.046875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.45282 -4.9375q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-3.375 7.28125q-0.0625 0.125 -0.171875 0.1875q-0.109375 0.078125 -0.234375 0.078125q-0.1875 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.09375 0.046875 -0.1875l0.84375 -1.8125l-2.375 -5.140625q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875z" fill-rule="nonzero"/><path fill="#f4cccc" d="m132.49081 319.42978l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m132.49081 319.42978l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m163.01448 339.50836q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm8.160431 0.03125q-1.171875 0 -2.046875 -0.515625q-0.859375 -0.53125 -1.328125 -1.5q-0.46875 -0.984375 -0.46875 -2.296875q0 -1.34375 0.453125 -2.3125q0.46875 -0.984375 1.328125 -1.5q0.875 -0.53125 2.0625 -0.53125q1.1875 0 2.0625 0.53125q0.875 0.515625 1.328125 1.5q0.46875 0.96875 0.46875 2.296875q0 1.3125 -0.46875 2.296875q-0.46875 0.984375 -1.34375 1.515625q-0.859375 0.515625 -2.046875 0.515625zm0 -0.84375q1.34375 0 2.09375 -0.90625q0.75 -0.90625 0.75 -2.578125q0 -1.6875 -0.75 -2.578125q-0.734375 -0.90625 -2.09375 -0.90625q-1.34375 0 -2.09375 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.09375 0.90625zm9.214935 0.84375q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm8.077179 0q-1.171875 0 -2.046875 -0.515625q-0.859375 -0.53125 -1.328125 -1.5q-0.46875 -0.984375 -0.46875 -2.296875q0 -1.34375 0.453125 -2.3125q0.46875 -0.984375 1.328125 -1.5q0.875 -0.53125 2.0625 -0.53125q1.1875 0 2.0625 0.53125q0.875 0.515625 1.328125 1.5q0.46875 0.96875 0.46875 2.296875q0 1.3125 -0.46875 2.296875q-0.46875 0.984375 -1.34375 1.515625q-0.859375 0.515625 -2.046875 0.515625zm0 -0.84375q1.34375 0 2.09375 -0.90625q0.75 -0.90625 0.75 -2.578125q0 -1.6875 -0.75 -2.578125q-0.734375 -0.90625 -2.09375 -0.90625q-1.34375 0 -2.09375 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.09375 0.90625z" fill-rule="nonzero"/><path fill="#d9ead3" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m314.7006 332.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm5.113556 0q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.6840515 -0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -7.5625q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.171875l3.875 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-4.375 0zm6.3394165 0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm4.987152 6.515625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#000000" d="m303.37402 346.47687q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.5434265 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm4.674652 -6.046875q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm4.3300476 -5.28125q0.765625 0 1.34375 0.375q0.59375 0.359375 0.921875 1.046875q0.328125 0.6875 0.328125 1.59375q0 0.90625 -0.328125 1.59375q-0.328125 0.6875 -0.921875 1.078125q-0.578125 0.375 -1.34375 0.375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 0.640625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.203125q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.59375q0.46875 -0.59375 0.46875 -1.65625q0 -1.046875 -0.46875 -1.625q-0.46875 -0.578125 -1.328125 -0.578125q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.687164 -5.25q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm4.8726807 -1.71875q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm3.9360352 0q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm5.873535 6.328125q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m443.6039 332.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm5.113556 0q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.6840515 -0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -7.5625q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.171875l3.875 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-4.375 0zm6.3394165 0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm4.987152 6.515625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.908142 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#000000" d="m429.9527 346.47687q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm5.237152 1.234375q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.56604 5.28125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm4.282898 -0.015625q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.14032 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.5896606 4.53125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m371.61902 334.89435l41.417297 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m371.61902 334.89435l37.990234 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m409.60925 334.89435l-1.1245728 1.1246033l3.0897522 -1.1246033l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m587.0588 293.13934q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.375 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84375 0 1.5625 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.15625 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.109375 0 2.03125 -0.328125l0 -2.578125l-1.75 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.234375 0zm2.8911743 4.46875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm7.7869263 4.375q-1.65625 0 -2.515625 -0.859375q-0.84375 -0.859375 -0.84375 -2.546875l0 -4.703125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.78125q0 1.25 0.609375 1.875q0.609375 0.609375 1.78125 0.609375q1.171875 0 1.765625 -0.609375q0.609375 -0.625 0.609375 -1.875l0 -4.78125q0 -0.234375 0.140625 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.703125q0 1.671875 -0.859375 2.546875q-0.859375 0.859375 -2.5 0.859375z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m584.63763 339.50812q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm5.0302734 -0.03125q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm7.7869263 4.375q-1.65625 0 -2.515625 -0.859375q-0.84375 -0.859375 -0.84375 -2.546875l0 -4.703125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.78125q0 1.25 0.609375 1.875q0.609375 0.609375 1.78125 0.609375q1.171875 0 1.765625 -0.609375q0.609375 -0.625 0.609375 -1.875l0 -4.78125q0 -0.234375 0.140625 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.703125q0 1.671875 -0.859375 2.546875q-0.859375 0.859375 -2.5 0.859375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m219.98688 334.92584l64.12598 -0.03149414" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.98688 334.92584l60.698914 -0.029815674" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m280.68576 334.89603l-1.1240234 1.1251526l3.0892334 -1.1260986l-3.090332 -1.1230774z" fill-rule="evenodd"/><path fill="#d9ead3" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m437.52493 135.68242l73.763794 0l0 31.748032l-73.763794 0z" fill-rule="evenodd"/><path fill="#000000" d="m448.0718 156.20241q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm8.3211975 -5.140625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.767517 -5.28125q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm10.15921 0.75q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm8.691681 -5.71875q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-3.375 7.28125q-0.0625 0.125 -0.171875 0.1875q-0.109375 0.078125 -0.234375 0.078125q-0.1875 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.09375 0.046875 -0.1875l0.84375 -1.8125l-2.375 -5.140625q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875zm4.902405 -0.328125q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.76532 -0.640625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#f4cccc" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m544.40155 135.68242l100.0 0l0 31.748032l-100.0 0z" fill-rule="evenodd"/><path fill="#000000" d="m554.9328 156.26491q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm5.3845215 -6.046875q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.456726 -1.703125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm4.248535 1.71875q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm6.3444214 0.765625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.47876 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm4.283142 -5.265625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.782898 0q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm4.7008057 6.046875q-0.8125 0 -1.453125 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.6875 -0.34375 -1.578125q0 -0.90625 0.359375 -1.59375q0.359375 -0.703125 0.984375 -1.078125q0.640625 -0.390625 1.46875 -0.390625q0.453125 0 0.90625 0.125q0.453125 0.125 0.78125 0.359375q0.21875 0.140625 0.3125 0.28125q0.09375 0.140625 0.09375 0.3125q0 0.171875 -0.09375 0.28125q-0.09375 0.09375 -0.234375 0.09375q-0.078125 0 -0.1875 -0.046875q-0.09375 -0.046875 -0.15625 -0.09375q-0.0625 -0.046875 -0.09375 -0.0625q-0.3125 -0.203125 -0.59375 -0.3125q-0.28125 -0.125 -0.6875 -0.125q-0.875 0 -1.359375 0.59375q-0.484375 0.59375 -0.484375 1.65625q0 1.046875 0.484375 1.625q0.484375 0.578125 1.359375 0.578125q0.40625 0 0.703125 -0.109375q0.296875 -0.125 0.59375 -0.328125q0.140625 -0.09375 0.25 -0.15625q0.125 -0.0625 0.203125 -0.0625q0.140625 0 0.21875 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.15625 -0.09375 0.28125q-0.078125 0.125 -0.296875 0.28125q-0.34375 0.234375 -0.8125 0.375q-0.46875 0.125 -0.953125 0.125zm6.029297 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.830017 -5.265625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm5.1851807 0q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#d9ead3" d="m31.874912 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m31.874912 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m67.27695 264.03653q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.359375l0 7.578125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.234375 0 -0.375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -3.4375l-5.062496 0l0 3.4375q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.234375 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 3.296875l5.062496 0l0 -3.296875q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.375 -0.140625zm3.0648193 8.515625q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm6.5711823 0.90625q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm9.0746765 -5.359375q0.8125 0 1.40625 0.34375q0.609375 0.328125 0.9375 0.9375q0.328125 0.59375 0.328125 1.390625q0 0.78125 -0.359375 1.40625q-0.359375 0.625 -1.0 0.96875q-0.640625 0.328125 -1.484375 0.328125q-0.734375 0 -1.453125 -0.25q-0.703125 -0.265625 -1.1875 -0.734375q-0.203125 -0.171875 -0.203125 -0.40625q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.234375 -0.125q0.171875 0 0.34375 0.140625q0.515625 0.4375 1.046875 0.640625q0.53125 0.203125 1.109375 0.203125q0.890625 0 1.390625 -0.5q0.5 -0.5 0.5 -1.359375q0 -0.84375 -0.5 -1.359375q-0.5 -0.515625 -1.359375 -0.515625q-1.09375 0 -1.78125 0.84375q-0.15625 0.171875 -0.40625 0.171875q-0.15625 0 -0.28125 -0.09375q-0.109375 -0.109375 -0.109375 -0.296875l0 -4.125q0 -0.21875 0.125 -0.34375q0.125 -0.125 0.359375 -0.125l4.21875 0q0.21875 0 0.34375 0.109375q0.125 0.09375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.125 0.109375 -0.34375 0.109375l-3.734375 0l0 3.015625q0.34375 -0.328125 0.78125 -0.5q0.453125 -0.171875 0.984375 -0.171875z" fill-rule="nonzero"/><path fill="#d9ead3" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m215.10997 150.37688q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.375 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84375 0 1.5625 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.15625 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.109375 0 2.03125 -0.328125l0 -2.578125l-1.75 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.234375 0zm5.1568146 -1.5625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.2028046 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.5035553 5.984375q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm10.461807 -0.515625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.480301 -2.453125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m260.00964 265.61465q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm8.9496765 -6.03125q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.767273 6.046875q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm8.535065 -0.046875q0.203125 0 0.296875 0.109375q0.109375 0.09375 0.109375 0.265625q0 0.1875 -0.109375 0.296875q-0.09375 0.09375 -0.296875 0.09375l-4.203125 0q-0.203125 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.3125q0 -0.1875 0.140625 -0.359375l3.546875 -4.28125l-3.28125 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l4.0625 0q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.3125q0 0.1875 -0.140625 0.359375l-3.5625 4.28125l3.421875 0zm6.2547913 -0.59375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.8396606 -0.75q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125z" fill-rule="nonzero"/><path fill="#000000" d="m258.07846 275.1459q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.3749847 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84373474 0 1.5624847 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.1562347 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.1093597 0 2.0312347 -0.328125l0 -2.578125l-1.7499847 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.2343597 0zm5.15683 -1.5625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.2027893 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.5035706 5.984375q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm10.461792 -0.515625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.480316 -2.453125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 232.16667l0 20.377945" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 232.16667l0 16.950867" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.85565 249.11754l-1.1246033 -1.124588l1.1246033 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#f4cccc" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m58.725647 87.669235q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.9706573 -6.984375q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm1.8266602 7.75q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm8.498016 -0.8125q0.171875 0.15625 0.171875 0.359375q0 0.15625 -0.140625 0.296875q-0.140625 0.140625 -0.3125 0.140625q-0.15625 0 -0.328125 -0.140625l-4.484375 -3.921875l0 3.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 3.4375l4.28125 -3.796875q0.125 -0.140625 0.3125 -0.140625q0.171875 0 0.296875 0.140625q0.140625 0.140625 0.140625 0.3125q0 0.171875 -0.15625 0.328125l-3.875 3.421875l4.09375 3.5625zm5.8329315 -0.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.792801 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm6.3444214 0.765625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m132.49081 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m132.49081 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m152.20152 88.37367q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.484375 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-4.015625 0l0 2.9375l3.78125 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.78125 0l0 3.078125l4.015625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-4.484375 0zm8.31218 0.078125q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.4787903 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm1.8769073 0.765625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm8.799652 1.234375q1.9375 0 1.9375 2.3125l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.328125 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.890625 -0.359375q-0.734375 0 -1.15625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.90625 -0.359375q-0.71875 0 -1.140625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.359375 -0.140625q0.203125 0 0.328125 0.125q0.140625 0.125 0.140625 0.34375l0 0.578125q0.265625 -0.515625 0.734375 -0.78125q0.46875 -0.28125 1.078125 -0.28125q1.375 0 1.78125 1.140625q0.265625 -0.515625 0.78125 -0.828125q0.515625 -0.3125 1.171875 -0.3125zm6.0990753 0q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.8144073 0.78125q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm7.1287994 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m269.00754 88.46742q-0.90625 0 -1.734375 -0.265625q-0.8125 -0.265625 -1.3125 -0.734375q-0.171875 -0.15625 -0.171875 -0.40625q0 -0.171875 0.09375 -0.296875q0.09375 -0.125 0.234375 -0.125q0.15625 0 0.328125 0.125q1.109375 0.859375 2.546875 0.859375q1.03125 0 1.578125 -0.390625q0.5625 -0.390625 0.5625 -1.125q0 -0.421875 -0.265625 -0.671875q-0.265625 -0.265625 -0.703125 -0.421875q-0.4375 -0.15625 -1.15625 -0.328125q-0.984375 -0.21875 -1.625 -0.46875q-0.625 -0.265625 -1.015625 -0.734375q-0.390625 -0.46875 -0.390625 -1.21875q0 -0.71875 0.390625 -1.265625q0.390625 -0.5625 1.09375 -0.875q0.703125 -0.3125 1.59375 -0.3125q0.84375 0 1.5625 0.265625q0.734375 0.25 1.234375 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.125 0 -0.34375 -0.140625q-0.59375 -0.46875 -1.09375 -0.65625q-0.5 -0.203125 -1.21875 -0.203125q-0.984375 0 -1.546875 0.421875q-0.546875 0.40625 -0.546875 1.15625q0 0.625 0.484375 0.953125q0.484375 0.3125 1.5 0.5625q1.09375 0.25 1.71875 0.484375q0.625 0.21875 1.03125 0.671875q0.421875 0.4375 0.421875 1.171875q0 0.71875 -0.390625 1.265625q-0.390625 0.53125 -1.109375 0.828125q-0.703125 0.296875 -1.609375 0.296875zm5.0446777 -0.03125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm2.784027 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm8.799652 1.234375q1.9375 0 1.9375 2.3125l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.328125 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.890625 -0.359375q-0.734375 0 -1.15625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.90625 -0.359375q-0.71875 0 -1.140625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.359375 -0.140625q0.203125 0 0.328125 0.125q0.140625 0.125 0.140625 0.34375l0 0.578125q0.265625 -0.515625 0.734375 -0.78125q0.46875 -0.28125 1.078125 -0.28125q1.375 0 1.78125 1.140625q0.265625 -0.515625 0.78125 -0.828125q0.515625 -0.3125 1.171875 -0.3125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m297.8283 154.87688q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm7.358429 -6.078125q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm8.37854 4.625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.308441 5.3125q-0.8125 0 -1.453125 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.6875 -0.34375 -1.578125q0 -0.90625 0.359375 -1.59375q0.359375 -0.703125 0.984375 -1.078125q0.640625 -0.390625 1.46875 -0.390625q0.453125 0 0.90625 0.125q0.453125 0.125 0.78125 0.359375q0.21875 0.140625 0.3125 0.28125q0.09375 0.140625 0.09375 0.3125q0 0.171875 -0.09375 0.28125q-0.09375 0.09375 -0.234375 0.09375q-0.078125 0 -0.1875 -0.046875q-0.09375 -0.046875 -0.15625 -0.09375q-0.0625 -0.046875 -0.09375 -0.0625q-0.3125 -0.203125 -0.59375 -0.3125q-0.28125 -0.125 -0.6875 -0.125q-0.875 0 -1.359375 0.59375q-0.484375 0.59375 -0.484375 1.65625q0 1.046875 0.484375 1.625q0.484375 0.578125 1.359375 0.578125q0.40625 0 0.703125 -0.109375q0.296875 -0.125 0.59375 -0.328125q0.140625 -0.09375 0.25 -0.15625q0.125 -0.0625 0.203125 -0.0625q0.140625 0 0.21875 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.15625 -0.09375 0.28125q-0.078125 0.125 -0.296875 0.28125q-0.34375 0.234375 -0.8125 0.375q-0.46875 0.125 -0.953125 0.125zm7.998047 -0.84375q0.203125 0.171875 0.203125 0.375q0 0.1875 -0.125 0.328125q-0.125 0.125 -0.3125 0.125q-0.15625 0 -0.328125 -0.140625l-3.125 -2.703125l0 2.359375q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 4.875l2.859375 -2.625q0.15625 -0.140625 0.328125 -0.140625q0.1875 0 0.3125 0.140625q0.140625 0.125 0.140625 0.296875q0 0.203125 -0.171875 0.359375l-2.375 2.109375l2.59375 2.265625zm4.2812805 -5.21875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm6.67157 0.796875q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm4.722534 0.78125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm5.237152 1.234375q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.5660706 5.28125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.361267 0.78125q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m233.89502 131.35573l-1.124588 -1.124588l1.124588 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m326.24146 131.35573l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m579.47955 247.1612q0.203125 0 0.328125 0.140625q0.125 0.125 0.125 0.359375l0 7.578125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.234375 0 -0.390625 -0.203125l-4.984375 -6.65625l0 6.359375q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.21875 0 -0.34375 -0.140625q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.40625 0.203125l4.96875 6.65625l0 -6.359375q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.34375 -0.140625zm8.868103 0q0.203125 0 0.328125 0.140625q0.125 0.125 0.125 0.359375l0 7.578125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.234375 0 -0.390625 -0.203125l-4.984375 -6.65625l0 6.359375q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.21875 0 -0.34375 -0.140625q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.40625 0.203125l4.96875 6.65625l0 -6.359375q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.34375 -0.140625zm12.917175 7.953125q0.046875 0.09375 0.046875 0.203125q0 0.171875 -0.140625 0.296875q-0.140625 0.125 -0.328125 0.125q-0.296875 0 -0.421875 -0.296875l-0.84375 -1.9375l-4.53125 0l-0.859375 1.9375q-0.125 0.296875 -0.421875 0.296875q-0.1875 0 -0.34375 -0.125q-0.140625 -0.125 -0.140625 -0.3125q0 -0.09375 0.046875 -0.1875l3.4375 -7.640625q0.078125 -0.15625 0.21875 -0.234375q0.140625 -0.09375 0.3125 -0.09375q0.171875 0 0.3125 0.09375q0.15625 0.078125 0.21875 0.234375l3.4375 7.640625zm-5.859375 -2.421875l3.8125 0l-1.90625 -4.3125l-1.90625 4.3125zm7.78656 3.046875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm4.9744263 4.34375q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m589.5417 213.87056q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm2.7480469 0q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm2.7479858 0q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m75.62294 283.52823l0 17.950958l100.62993 0l0 17.954529" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m75.62295 283.52823l0 17.950928l100.62992 0l0 14.527496" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m176.25287 316.00665l-1.124588 -1.1246033l1.124588 3.0897827l1.124588 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85654 283.52823l0 17.950958l-100.62991 0l0 17.954529" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85654 283.52823l0 17.950928l-100.62991 0l0 14.527496" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m176.22662 316.00665l-1.124588 -1.1246033l1.124588 3.0897827l1.124588 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 0.06298828l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 0.06298828l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 334.95734l-1.1245728 1.1246033l3.0897827 -1.1246033l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -41.858246l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -41.858246l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 293.0361l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.1246033z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -83.74802l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -83.74802l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 251.14633l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -125.60629l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -125.60629l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 209.28806l-1.1245728 1.124588l3.0897827 -1.124588l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m233.88803 165.75919l0 17.70752l42.960632 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.88805 165.75919l0 17.70752l42.960617 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.84866 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.86572 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#d9ead3" d="m132.49171 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m132.49171 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m146.9475 272.6459q-0.90625 0 -1.734375 -0.265625q-0.8125 -0.265625 -1.3125 -0.734375q-0.171875 -0.15625 -0.171875 -0.40625q0 -0.171875 0.09375 -0.296875q0.09375 -0.125 0.234375 -0.125q0.15625 0 0.328125 0.125q1.109375 0.859375 2.546875 0.859375q1.03125 0 1.578125 -0.390625q0.5625 -0.390625 0.5625 -1.125q0 -0.421875 -0.265625 -0.671875q-0.265625 -0.265625 -0.703125 -0.421875q-0.4375 -0.15625 -1.15625 -0.328125q-0.984375 -0.21875 -1.625 -0.46875q-0.625 -0.265625 -1.015625 -0.734375q-0.390625 -0.46875 -0.390625 -1.21875q0 -0.71875 0.390625 -1.265625q0.390625 -0.5625 1.09375 -0.875q0.703125 -0.3125 1.59375 -0.3125q0.84375 0 1.5625 0.265625q0.734375 0.25 1.234375 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.125 0 -0.34375 -0.140625q-0.59375 -0.46875 -1.09375 -0.65625q-0.5 -0.203125 -1.21875 -0.203125q-0.984375 0 -1.546875 0.421875q-0.546875 0.40625 -0.546875 1.15625q0 0.625 0.484375 0.953125q0.484375 0.3125 1.5 0.5625q1.09375 0.25 1.71875 0.484375q0.625 0.21875 1.03125 0.671875q0.421875 0.4375 0.421875 1.171875q0 0.71875 -0.390625 1.265625q-0.390625 0.53125 -1.109375 0.828125q-0.703125 0.296875 -1.609375 0.296875zm6.9353027 -6.078125q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm8.578796 -4.96875q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-2.34375 5.046875q-0.0625 0.15625 -0.21875 0.25q-0.140625 0.078125 -0.3125 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-2.328125 -5.046875q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875zm6.480545 4.296875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.589676 -3.28125q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.328125l0 7.625q0 0.21875 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.640625q-0.265625 0.546875 -0.78125 0.84375q-0.5 0.296875 -1.1875 0.296875q-0.765625 0 -1.359375 -0.375q-0.578125 -0.390625 -0.90625 -1.078125q-0.328125 -0.6875 -0.328125 -1.59375q0 -0.90625 0.328125 -1.59375q0.328125 -0.6875 0.90625 -1.046875q0.59375 -0.375 1.359375 -0.375q0.6875 0 1.1875 0.296875q0.515625 0.296875 0.78125 0.84375l0 -3.203125q0 -0.21875 0.125 -0.34375q0.125 -0.125 0.359375 -0.125zm-2.25 7.796875q0.84375 0 1.296875 -0.578125q0.46875 -0.59375 0.46875 -1.65625q0 -1.0625 -0.46875 -1.640625q-0.453125 -0.578125 -1.296875 -0.578125q-0.859375 0 -1.34375 0.578125q-0.46875 0.578125 -0.46875 1.625q0 1.0625 0.46875 1.65625q0.484375 0.59375 1.34375 0.59375zm12.202805 -7.796875q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.359375l0 7.59375q0 0.21875 -0.125 0.359375q-0.109375 0.125 -0.328125 0.125q-0.21875 0 -0.328125 -0.125q-0.109375 -0.140625 -0.109375 -0.359375l0 -6.125l-2.59375 4.984375q-0.171875 0.34375 -0.5 0.34375q-0.3125 0 -0.484375 -0.34375l-2.625 -4.921875l0 6.0625q0 0.21875 -0.109375 0.359375q-0.109375 0.125 -0.328125 0.125q-0.21875 0 -0.34375 -0.125q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.59375q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.3125 0 0.484375 0.34375l3.046875 5.84375l3.015625 -5.84375q0.09375 -0.1875 0.203125 -0.265625q0.125 -0.078125 0.28125 -0.078125zm4.8576965 8.59375q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm8.925674 -7.796875q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.328125l0 7.625q0 0.21875 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.640625q-0.265625 0.546875 -0.78125 0.84375q-0.5 0.296875 -1.1875 0.296875q-0.765625 0 -1.359375 -0.375q-0.578125 -0.390625 -0.90625 -1.078125q-0.328125 -0.6875 -0.328125 -1.59375q0 -0.90625 0.328125 -1.59375q0.328125 -0.6875 0.90625 -1.046875q0.59375 -0.375 1.359375 -0.375q0.6875 0 1.1875 0.296875q0.515625 0.296875 0.78125 0.84375l0 -3.203125q0 -0.21875 0.125 -0.34375q0.125 -0.125 0.359375 -0.125zm-2.25 7.796875q0.84375 0 1.296875 -0.578125q0.46875 -0.59375 0.46875 -1.65625q0 -1.0625 -0.46875 -1.640625q-0.453125 -0.578125 -1.296875 -0.578125q-0.859375 0 -1.34375 0.578125q-0.46875 0.578125 -0.46875 1.625q0 1.0625 0.46875 1.65625q0.484375 0.59375 1.34375 0.59375zm9.06218 -0.640625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm4.386551 5.296875q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m176.23885 99.34974l0 153.19684" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m176.23885 99.34974l0 149.76978" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m176.23885 249.1195l-1.124588 -1.124588l1.124588 3.0897675l1.124588 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m176.23975 283.52823l0 17.950958l0.06298828 0l0 17.954529" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m176.23975 283.52823l0 17.950928l0.06298828 0l0 14.527496" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m176.30273 316.00665l-1.1245728 -1.1246033l1.1245728 3.0897827l1.124588 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m75.62205 99.34843l0 153.19684" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m75.62205 99.34843l0 149.76978" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m75.62205 249.1182l-1.1245804 -1.124588l1.1245804 3.0897675l1.1245804 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m99.50131 100.0l0 76.0l54.992126 0l0 76.0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m99.50131 100.0l0 76.0l54.992126 0l0 72.57292" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m154.49344 248.5729l-1.124588 -1.1245728l1.124588 3.0897675l1.124588 -3.0897675z" fill-rule="evenodd"/></g></svg>
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
deleted file mode 100644
index 1688586733b0434c7fc98686a19f0ceb8092f33b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
+++ /dev/null
@@ -1,226 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-namespace {
-
-template <ArrayDataType A>
-void DequantizeBuffer(Array* array) {
-  const auto old_data = array->GetBuffer<A>().data;
-  array->buffer = nullptr;
-  array->data_type = ArrayDataType::kFloat;
-  auto& new_data = array->GetMutableBuffer<ArrayDataType::kFloat>().data;
-  new_data.resize(old_data.size());
-  const auto& qparams = array->GetQuantizationParams();
-  for (int i = 0; i < old_data.size(); i++) {
-    new_data[i] = qparams.scale * (old_data[i] - qparams.zero_point);
-  }
-}
-
-std::vector<std::unique_ptr<Operator>>::iterator FindFirstOpWithInput(
-    Model* model, const string& array_name) {
-  for (auto it = model->operators.begin(); it != model->operators.end(); ++it) {
-    for (const auto& input : it->get()->inputs) {
-      if (input == array_name) {
-        return it;
-      }
-    }
-  }
-  return model->operators.end();
-}
-
-void ClearArrayQuantizationParams(const string& array_name, Model* model) {
-  auto* array = &model->GetArray(array_name);
-  CHECK(array->quantization_params);
-  for (auto& input_array : *model->flags.mutable_input_arrays()) {
-    if (input_array.name() == array_name) {
-      auto& qparams = *array->quantization_params;
-      const double new_std_value = 1. / qparams.scale;
-      const double new_mean_value = qparams.zero_point;
-      if (input_array.has_std_value()) {
-        CHECK_LE(std::abs(new_std_value - input_array.std_value()), 0.001);
-      } else {
-        input_array.set_std_value(new_std_value);
-      }
-      if (input_array.has_mean_value()) {
-        CHECK_LE(std::abs(new_mean_value - input_array.mean_value()), 0.001);
-      } else {
-        input_array.set_mean_value(new_mean_value);
-      }
-    }
-  }
-  array->quantization_params = nullptr;
-}
-
-bool DequantizeArray(const string& array_name,
-                     GraphTransformation* transformation, Model* model) {
-  auto* array = &model->GetArray(array_name);
-  if (!array->quantization_params) {
-    return false;
-  }
-  transformation->AddMessageF("Dequantizing array: %s", array_name);
-
-  // Dequantize any buffer
-  if (array->buffer) {
-    if (array->data_type == ArrayDataType::kUint8) {
-      DequantizeBuffer<ArrayDataType::kUint8>(array);
-    } else if (array->data_type == ArrayDataType::kInt32) {
-      DequantizeBuffer<ArrayDataType::kInt32>(array);
-    } else {
-      LOG(FATAL) << "Unhandled data type";
-    }
-    CHECK(array->data_type == ArrayDataType::kFloat);
-    CHECK(array->buffer->type == ArrayDataType::kFloat);
-
-    // Clear quantization params, officially makes this a non-quantized array.
-    ClearArrayQuantizationParams(array_name, model);
-    return true;
-  } else {
-    array->data_type = ArrayDataType::kFloat;
-  }
-
-  // Clear quantization params, officially makes this a non-quantized array.
-  ClearArrayQuantizationParams(array_name, model);
-
-  if (array->buffer) {
-    return true;
-  }
-
-  auto* op_outputting_array = GetOpWithOutput(*model, array_name);
-  if (op_outputting_array) {
-    if (op_outputting_array->type == OperatorType::kReshape) {
-      return true;
-    }
-  }
-
-  // If there was no minmax info, we can return now. Indeed,
-  // the below only serves to create a FakeQuant node, but some arrays are
-  // quantized without MinMax (see the CHECK above) and that corresponds to
-  // places where a FakeQuant node is actually not wanted, because the
-  // quantization params are meant to be inferred in another way (e.g. bias
-  // vector for a Conv op, see their special-casing in quantize.cc).
-  if (!array->minmax) {
-    return true;
-  }
-
-  // Determine whether to insert a FakeQuant before or after
-  // this array.
-  bool must_insert_fakequant_before = false;
-  bool must_insert_fakequant_after = false;
-  if (IsInputArray(*model, array_name)) {
-    must_insert_fakequant_after = true;
-  }
-  for (const string& output_array : model->flags.output_arrays()) {
-    if (array_name == output_array) {
-      must_insert_fakequant_before = true;
-    }
-  }
-  for (const auto& rnn_state : model->flags.rnn_states()) {
-    if (array_name == rnn_state.state_array()) {
-      must_insert_fakequant_after = true;
-    }
-    if (array_name == rnn_state.back_edge_source_array()) {
-      must_insert_fakequant_before = true;
-    }
-  }
-  CHECK(!(must_insert_fakequant_before && must_insert_fakequant_after));
-
-  // Create and insert the FakeQuant node
-  auto* fakequant_op = new FakeQuantOperator;
-  model->operators.emplace(FindFirstOpWithInput(model, array_name),
-                           fakequant_op);
-  const string& new_array_name = AvailableArrayName(*model, array_name);
-  auto& new_array = model->GetOrCreateArray(new_array_name);
-  new_array.data_type = ArrayDataType::kFloat;
-  new_array.copy_shape(array->shape());
-  new_array.GetOrCreateMinMax() = array->GetMinMax();
-  fakequant_op->minmax.reset(new MinMax);
-  *fakequant_op->minmax = array->GetMinMax();
-  fakequant_op->narrow_range = array->narrow_range;
-  if (must_insert_fakequant_before) {
-    for (const auto& op : model->operators) {
-      for (string& output : op->outputs) {
-        if (output == array_name) {
-          output = new_array_name;
-        }
-      }
-    }
-    fakequant_op->inputs = {new_array_name};
-    fakequant_op->outputs = {array_name};
-  } else {
-    for (const auto& op : model->operators) {
-      for (string& input : op->inputs) {
-        if (input == array_name) {
-          input = new_array_name;
-        }
-      }
-    }
-    fakequant_op->inputs = {array_name};
-    fakequant_op->outputs = {new_array_name};
-  }
-  return true;
-}
-
-}  // namespace
-
-bool Dequantize::Run(Model* model, std::size_t op_index) {
-  const auto op_it = model->operators.begin() + op_index;
-  auto* op = op_it->get();
-
-  if (op->type == OperatorType::kDequantize) {
-    auto& input_array = model->GetArray(op->inputs[0]);
-    if (input_array.data_type == ArrayDataType::kFloat) {
-      return false;
-    }
-    if (input_array.final_data_type != ArrayDataType::kFloat) {
-      return false;
-    }
-    input_array.data_type = ArrayDataType::kFloat;
-    input_array.quantization_params = nullptr;
-    auto& output_array = model->GetArray(op->outputs[0]);
-    output_array.data_type = ArrayDataType::kFloat;
-    output_array.quantization_params = nullptr;
-    return RemoveTrivialPassthroughOp(this, model, op_index);
-  }
-
-  std::vector<string> arrays;
-  for (const string& input : op->inputs) {
-    arrays.push_back(input);
-  }
-  for (const string& output : op->outputs) {
-    arrays.push_back(output);
-  }
-  bool changed = false;
-  for (const string& array : arrays) {
-    if (!model->IsOptionalArray(array)) {
-      changed |= DequantizeArray(array, this, model);
-    }
-  }
-
-  return changed;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
deleted file mode 100644
index 95558ef5ece9a78825daf0203e2f6f6fee6f3cda..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool DropFakeQuant::Run(Model* model, std::size_t op_index) {
-  const auto fakequant_it = model->operators.begin() + op_index;
-  auto* fakequant_base_op = fakequant_it->get();
-  if (fakequant_base_op->type != OperatorType::kFakeQuant) {
-    return false;
-  }
-  auto* fakequant_op = static_cast<FakeQuantOperator*>(fakequant_base_op);
-
-  if (!fakequant_op->minmax) {
-    return false;
-  }
-
-  const auto& output_array = model->GetArray(fakequant_op->outputs[0]);
-  if (!output_array.minmax) {
-    return false;
-  }
-
-  // Drop min/max inputs
-  for (int i = 1; i < fakequant_op->inputs.size(); i++) {
-    if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) {
-      model->EraseArray(fakequant_op->inputs[i]);
-    }
-  }
-  fakequant_op->inputs.resize(1);
-
-  return RemoveTrivialPassthroughOp(this, model, op_index);
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc b/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
deleted file mode 100644
index f7fd878b7e8b1c834125130ea2a778cecefd3de0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool DropIm2colArrays::Run(Model* model, std::size_t op_index) {
-  auto conv_it = model->operators.begin() + op_index;
-  if (conv_it->get()->type != OperatorType::kConv) {
-    return false;
-  }
-  auto* conv_op = static_cast<ConvOperator*>(conv_it->get());
-  if (conv_op->outputs.size() < 2) {
-    // Conv op does not have im2col.
-    return false;
-  }
-
-  // Drop the im2col array.
-  CHECK_EQ(conv_op->outputs.size(), 2);
-  model->EraseArray(conv_op->outputs[1]);
-  conv_op->outputs.resize(1);
-  AddMessageF("Dropped an im2col array for %s", LogName(*conv_op));
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
deleted file mode 100644
index 44733391f5a1d9ebf9a24f4f31b425a35354e1fc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.cc
+++ /dev/null
@@ -1,276 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool InferQuantizedDataTypeFromFakeQuant(
-    const FakeQuantOperator& op, ArrayDataType* out_quantized_data_type) {
-  if (op.num_bits <= 8) {
-    *out_quantized_data_type = ArrayDataType::kUint8;
-    return true;
-  } else if (op.num_bits <= 16) {
-    *out_quantized_data_type = ArrayDataType::kInt16;
-    return true;
-  } else {
-    *out_quantized_data_type = ArrayDataType::kNone;
-    return false;
-  }
-}
-
-bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
-                                        double* out_min_value,
-                                        double* out_max_value) {
-  switch (data_type) {
-    case ArrayDataType::kUint8:
-      *out_min_value = 0;
-      *out_max_value = 255;
-      return true;
-    case ArrayDataType::kInt16:
-      *out_min_value = -32768;
-      *out_max_value = 32767;
-      return true;
-    default:
-      return false;
-  }
-}
-
-ArrayDataType GetQuantizedDataType(const Array& array,
-                                   ArrayDataType default_type) {
-  switch (array.final_data_type) {
-    case ArrayDataType::kInt8:
-    case ArrayDataType::kUint8:
-    case ArrayDataType::kInt16:
-    case ArrayDataType::kUint16:
-    case ArrayDataType::kInt32:
-    case ArrayDataType::kUint32:
-    case ArrayDataType::kInt64:
-    case ArrayDataType::kUint64:
-      return array.final_data_type;
-    case ArrayDataType::kFloat:
-    case ArrayDataType::kNone:
-      return default_type;
-    default:
-      LOG(FATAL) << "Unhandled final quantization type "
-                 << static_cast<int>(array.final_data_type);
-  }
-}
-
-template <ArrayDataType A>
-void ChooseQuantizationParamsForArrayAndQuantizedDataType(
-    const Array& array, QuantizationParams* quantization_params) {
-  *quantization_params = ::tflite::ChooseQuantizationParams<DataType<A>>(
-      array.minmax->min, array.minmax->max, array.narrow_range);
-}
-
-void ChooseQuantizationParamsForArrayAndQuantizedDataType(
-    const Array& array, ArrayDataType quantized_data_type,
-    QuantizationParams* quantization_params) {
-  switch (quantized_data_type) {
-    case ArrayDataType::kInt8:
-      ChooseQuantizationParamsForArrayAndQuantizedDataType<
-          ArrayDataType::kInt8>(array, quantization_params);
-      break;
-    case ArrayDataType::kUint8:
-      ChooseQuantizationParamsForArrayAndQuantizedDataType<
-          ArrayDataType::kUint8>(array, quantization_params);
-      break;
-    case ArrayDataType::kInt16:
-      ChooseQuantizationParamsForArrayAndQuantizedDataType<
-          ArrayDataType::kInt16>(array, quantization_params);
-      break;
-    case ArrayDataType::kUint16:
-      ChooseQuantizationParamsForArrayAndQuantizedDataType<
-          ArrayDataType::kUint16>(array, quantization_params);
-      break;
-    case ArrayDataType::kInt32:
-      ChooseQuantizationParamsForArrayAndQuantizedDataType<
-          ArrayDataType::kInt32>(array, quantization_params);
-      break;
-    case ArrayDataType::kUint32:
-      ChooseQuantizationParamsForArrayAndQuantizedDataType<
-          ArrayDataType::kUint32>(array, quantization_params);
-      break;
-    case ArrayDataType::kInt64:
-      ChooseQuantizationParamsForArrayAndQuantizedDataType<
-          ArrayDataType::kInt64>(array, quantization_params);
-      break;
-    case ArrayDataType::kUint64:
-      ChooseQuantizationParamsForArrayAndQuantizedDataType<
-          ArrayDataType::kUint64>(array, quantization_params);
-      break;
-    case ArrayDataType::kFloat:
-    case ArrayDataType::kNone:
-    default:
-      LOG(FATAL) << "Unhandled final quantization type "
-                 << static_cast<int>(quantized_data_type);
-  }
-}
-
-namespace {
-
-template <ArrayDataType A>
-std::unique_ptr<GenericBuffer> QuantizeBuffer(
-    const Array& array, const QuantizationParams& quantization_params) {
-  const GenericBuffer& buffer = *array.buffer;
-  const auto inverse_scale = 1. / quantization_params.scale;
-  CHECK(buffer.type == ArrayDataType::kFloat);
-  const auto& float_buffer =
-      static_cast<const Buffer<ArrayDataType::kFloat>&>(buffer);
-  auto* quantized_buffer = new Buffer<A>;
-  quantized_buffer->data.resize(float_buffer.data.size());
-  for (std::size_t i = 0; i < float_buffer.data.size(); i++) {
-    const float src_val = float_buffer.data[i];
-    double scaled_val;  // Astonishingly, using 'float' degrades accuracy just
-                        // enough to make a few tests fail!
-    if (quantization_params.scale == 0) {
-      CHECK_EQ(src_val, 0) << "The quantization scale for this array is 0, "
-                           << "so all its values should be 0.";
-      scaled_val = quantization_params.zero_point;
-    } else {
-      scaled_val = quantization_params.zero_point + inverse_scale * src_val;
-    }
-    auto integer_val = tflite::SafeCast<DataType<A>>(std::round(scaled_val));
-    // In addition to its effect on the choice of quantization params upstream
-    // of here, narrow_range also means nudge the min quantized value by +1,
-    // so e.g. uint8 values get constrained to [1, 255].
-    if (integer_val == std::numeric_limits<DataType<A>>::min() &&
-        array.narrow_range) {
-      integer_val++;
-    }
-    quantized_buffer->data[i] = integer_val;
-  }
-  return std::unique_ptr<GenericBuffer>(quantized_buffer);
-}
-
-template <ArrayDataType A>
-void QuantizeArray(GraphTransformation* transformation, Model* model,
-                   const string& name,
-                   const QuantizationParams& quantization_params) {
-  auto& array = model->GetArray(name);
-  CHECK(array.data_type == ArrayDataType::kFloat);
-  CHECK(!array.quantization_params);
-  array.GetOrCreateQuantizationParams() = quantization_params;
-  if (array.buffer) {
-    array.buffer = QuantizeBuffer<A>(array, quantization_params);
-  }
-  array.data_type = A;
-  array.final_data_type = A;
-  transformation->AddMessageF(
-      "Quantized array %s to %s zero_point=%g, scale=%g", name,
-      ArrayDataTypeName(array.data_type), quantization_params.zero_point,
-      quantization_params.scale);
-}
-
-}  // namespace
-
-void QuantizeArray(GraphTransformation* transformation, Model* model,
-                   const string& name, ArrayDataType quantized_data_type,
-                   const QuantizationParams& quantization_params) {
-  ArrayDataType adjusted_data_type = quantized_data_type;
-  auto& array = model->GetArray(name);
-  if (array.final_data_type == ArrayDataType::kInt16) {
-    adjusted_data_type = array.final_data_type;
-  }
-
-  switch (adjusted_data_type) {
-    case ArrayDataType::kUint8:
-      return QuantizeArray<ArrayDataType::kUint8>(transformation, model, name,
-                                                  quantization_params);
-    case ArrayDataType::kInt16:
-      return QuantizeArray<ArrayDataType::kInt16>(transformation, model, name,
-                                                  quantization_params);
-    case ArrayDataType::kInt32:
-      return QuantizeArray<ArrayDataType::kInt32>(transformation, model, name,
-                                                  quantization_params);
-    default:
-      LOG(FATAL) << "Unhandled case.";
-  }
-}
-
-bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
-                                 const Array& array, double clamp_min,
-                                 double clamp_max) {
-  ArrayDataType quantized_data_type =
-      GetQuantizedDataType(array, array.data_type);
-  if (quantized_data_type == ArrayDataType::kNone ||
-      quantized_data_type == ArrayDataType::kFloat) {
-    // The array is not (or never will be) quantized.
-    return false;
-  }
-
-  QuantizationParams quantization_params;
-  if (!array.quantization_params) {
-    if (!array.minmax) {
-      transformation->AddMessageF("No quantization params and no minmax");
-      return false;
-    } else {
-      // Work around cases where we are asking for this prior to the Quantize
-      // transformation having added the quantization_params.
-      ChooseQuantizationParamsForArrayAndQuantizedDataType(
-          array, quantized_data_type, &quantization_params);
-      transformation->AddMessageF(
-          "No quantization params - infering from data type %s with minmax "
-          "%g,%g as zero_point=%g, scale=%g",
-          ArrayDataTypeName(quantized_data_type), array.minmax->min,
-          array.minmax->max, quantization_params.zero_point,
-          quantization_params.scale);
-    }
-  } else {
-    quantization_params = array.GetQuantizationParams();
-  }
-
-  double quantized_min, quantized_max;
-  CHECK(GetQuantizedDataTypeNumericalRange(quantized_data_type, &quantized_min,
-                                           &quantized_max))
-      << "Type is not quantized";
-
-  bool has_nontrivial_min_bound = false;
-  bool has_nontrivial_max_bound = false;
-
-  double lowest_representable_output =
-      (quantized_min - quantization_params.zero_point) *
-      quantization_params.scale;
-  if (lowest_representable_output < clamp_min) {
-    has_nontrivial_min_bound = true;
-    transformation->AddMessageF(
-        "Quantized activation function is not trivial: "
-        "the lowest representable output value %g"
-        " less than the clamp min bound %g.",
-        lowest_representable_output, clamp_min);
-  }
-
-  double highest_representable_output =
-      (quantized_max - quantization_params.zero_point) *
-      quantization_params.scale;
-  if (highest_representable_output > clamp_max) {
-    has_nontrivial_max_bound = true;
-    transformation->AddMessageF(
-        "Quantized activation function is not trivial: "
-        "the highest representable output value %g"
-        " is greater than the clamp max bound %g.",
-        highest_representable_output, clamp_max);
-  }
-
-  return !has_nontrivial_min_bound && !has_nontrivial_max_bound;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h b/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
deleted file mode 100644
index cf093c6f17b45839156dae0d06ca2fc7e5e2f3c6..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
-
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-
-namespace toco {
-
-// Gets the target quantized data type of an array based on the fake quant op.
-// For example, if the num_bits is 8 the data type will be kUint8.
-bool InferQuantizedDataTypeFromFakeQuant(
-    const FakeQuantOperator& op, ArrayDataType* out_quantized_data_type);
-
-// Gets the min/max numerical range for the given quantized data type.
-// For example, kUint8 will return [0,255].
-// Returns true if the ranges were set and false if the type is not quantized.
-bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
-                                        double* out_min_value,
-                                        double* out_max_value);
-
-// Returns the quantized data type of an array, falling back to the provided
-// default data type.
-ArrayDataType GetQuantizedDataType(const Array& array,
-                                   ArrayDataType default_type);
-
-// Chooses the quantization params for a given array and a given target
-// quantized data type (which may not be the array's current data type).
-void ChooseQuantizationParamsForArrayAndQuantizedDataType(
-    const Array& array, ArrayDataType quantized_data_type,
-    QuantizationParams* quantization_params);
-
-// Quantizes an array by setting its data type and (if constant) quantizing
-// all values in the array.
-void QuantizeArray(GraphTransformation* transformation, Model* model,
-                   const string& name, ArrayDataType quantized_data_type,
-                   const QuantizationParams& quantization_params);
-
-// Returns true if the given array, when quantized, contains only values between
-// the provided clamp min/max.
-// Either clamp_min or clamp_max may be +/-infinity to indicate that the value
-// is unbounded on that side.
-bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
-                                 const Array& array, double clamp_min,
-                                 double clamp_max);
-
-}  // namespace toco
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_identity.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_identity.cc
deleted file mode 100644
index 7ec7752f25dad1c24b821733c0e6dafbd1cd8bf2..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_identity.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool RemoveTensorFlowIdentity::Run(Model* model, std::size_t op_index) {
-  const auto passthru_it = model->operators.begin() + op_index;
-  const auto* passthru_op = passthru_it->get();
-  if (passthru_op->type != OperatorType::kIdentity) {
-    return false;
-  }
-
-  return RemoveTrivialPassthroughOp(this, model, op_index);
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation.cc
deleted file mode 100644
index 3ceb93d8eedbb3743be112e6bd03cfe3e6f74d13..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool RemoveTrivialConcatenation::Run(Model* model, std::size_t op_index) {
-  const auto concat_it = model->operators.begin() + op_index;
-  auto* concat_op = concat_it->get();
-  if (concat_op->type != OperatorType::kConcatenation) {
-    return false;
-  }
-  if (concat_op->inputs.size() != 1) {
-    return false;
-  }
-  return RemoveTrivialPassthroughOp(this, model, op_index);
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
deleted file mode 100644
index b8b35161d77e5b6dd8c30e03959dba3c60d1d56c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolveBatchToSpaceNDAttributes::Run(Model* model, std::size_t op_index) {
-  const auto op_it = model->operators.begin() + op_index;
-  if (op_it->get()->type != OperatorType::kBatchToSpaceND) return false;
-
-  auto* op = static_cast<BatchToSpaceNDOperator*>(op_it->get());
-
-  // The attributes are resolved only when the 3 attributes (block_shape,
-  // before_crops, after_crops) are all constant.
-  if (!op->block_shape.empty()) {
-    return false;
-  }
-
-  CHECK_EQ(op->inputs.size(), 3);
-  if (!IsConstantParameterArray(*model, op->inputs[1]) ||
-      !IsConstantParameterArray(*model, op->inputs[2]))
-    return false;
-
-  // Handle crops
-  const auto& crops_array = model->GetArray(op->inputs[2]);
-  if (!crops_array.has_shape()) return false;
-  const std::vector<int>& crops_dims = crops_array.shape().dims();
-  if (crops_dims.size() != 2) {
-    // Code only handles crops of 2 dimensions. Perhaps another transformation
-    // will delete this op.
-    return false;
-  }
-  const std::vector<int>& crops_buffer =
-      crops_array.GetBuffer<ArrayDataType::kInt32>().data;
-  for (int i = 0; i < crops_dims[0]; ++i) {
-    op->before_crops.push_back(crops_buffer[i * 2]);
-    op->after_crops.push_back(crops_buffer[i * 2 + 1]);
-  }
-
-  // Handle block_shape
-  const auto& block_shape_array = model->GetArray(op->inputs[1]);
-  if (!block_shape_array.has_shape()) return false;
-  const std::vector<int>& block_shape_dims = block_shape_array.shape().dims();
-  CHECK_EQ(block_shape_dims.size(), 1);
-  const std::vector<int>& block_shape_buffer =
-      block_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
-  for (int i = 0; i < block_shape_dims[0]; ++i) {
-    op->block_shape.push_back(block_shape_buffer[i]);
-  }
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
deleted file mode 100644
index 36d7dad0ce9de81ec132ef992538b6022916bfbd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-namespace {
-
-// Gathers data from axis 0.
-template <ArrayDataType Type>
-inline void Gather(const Array& input_array, int input_rank,
-                   const Array& coords_array, Array* output_array) {
-  const Shape& input_shape = input_array.shape();
-  const std::vector<DataType<Type>>& input_data =
-      input_array.GetBuffer<Type>().data;
-  const Shape& coords_shape = coords_array.shape();
-  const std::vector<int32>& coords_data =
-      coords_array.GetBuffer<ArrayDataType::kInt32>().data;
-
-  const Shape& output_shape = output_array->shape();
-  std::vector<DataType<Type>>& output_data =
-      output_array->GetMutableBuffer<Type>().data;
-  output_data.resize(RequiredBufferSizeForShape(output_shape));
-
-  int rev_input_rank = input_shape.dimensions_count() - 1 - (input_rank - 1);
-  CHECK_EQ(coords_shape.dims(0), output_array->shape().dims(rev_input_rank));
-
-  int stride = 1;
-  for (int i = input_shape.dimensions_count() - 1; i >= input_rank - 1; --i) {
-    stride *= input_shape.dims(i);
-  }
-
-  for (int i = 0; i < coords_shape.dims(0); ++i) {
-    DCHECK_GE(coords_data[i], 0);
-    DCHECK_LT(coords_data[i], input_shape.dims(rev_input_rank));
-    DataType<Type>* out = output_data.data() + i * stride;
-    const DataType<Type>* in = input_data.data() + coords_data[i] * stride;
-    memcpy(out, in, sizeof(DataType<Type>) * stride);
-  }
-}
-
-}  // namespace
-
-// Resolves a constant Gather operation.
-// This simply performs the gather and produces the output array with the
-// appropriate values.
-bool ResolveConstantGather::Run(Model* model, std::size_t op_index) {
-  auto it = model->operators.begin() + op_index;
-  const auto* base_op = it->get();
-  if (base_op->type != OperatorType::kGather) {
-    return false;
-  }
-  const auto* op = static_cast<const GatherOperator*>(base_op);
-
-  CHECK_GE(op->inputs.size(), 2);
-  CHECK_EQ(op->outputs.size(), 1);
-  auto& output_array = model->GetArray(op->outputs[0]);
-  if (output_array.data_type == ArrayDataType::kNone) {
-    // Yield until the output type has been set by PropagateArrayDataTypes.
-    return false;
-  }
-  if (!output_array.has_shape()) {
-    // Yield until the output shape has been set by PropagateFixedShapes.
-    return false;
-  }
-
-  if (!op->axis) {
-    // Yield until axis has been set by ResolveGatherAttributes.
-    return false;
-  }
-  if (op->axis.value() != 0) {
-    // Only handling axis=0 for now.
-    AddMessageF("%s has axis %d; only axis=0 is supported", LogName(*op),
-                op->axis.value());
-    return false;
-  }
-
-  // We require constant inputs.
-  if (!IsConstantParameterArray(*model, op->inputs[0]) ||
-      !IsConstantParameterArray(*model, op->inputs[1])) {
-    return false;
-  }
-  const Array& input_array = model->GetArray(op->inputs[0]);
-  const Array& coords_array = model->GetArray(op->inputs[1]);
-  CHECK(coords_array.data_type == ArrayDataType::kInt32)
-      << "Only int32 indices are supported";
-
-  // Copy min/max info if present. The ranges of the selected values may be
-  // a subset of the original range but we want to ensure the quantization
-  // params stay the same.
-  if (input_array.minmax) {
-    const auto& input_minmax = input_array.GetMinMax();
-    auto& output_minmax = output_array.GetOrCreateMinMax();
-    output_minmax.min = input_minmax.min;
-    output_minmax.max = input_minmax.max;
-  }
-
-  CHECK(!output_array.buffer);
-  switch (output_array.data_type) {
-    case ArrayDataType::kFloat:
-      Gather<ArrayDataType::kFloat>(input_array, op->input_rank, coords_array,
-                                    &output_array);
-      break;
-    case ArrayDataType::kUint8:
-      Gather<ArrayDataType::kUint8>(input_array, op->input_rank, coords_array,
-                                    &output_array);
-      break;
-    case ArrayDataType::kInt32:
-      Gather<ArrayDataType::kInt32>(input_array, op->input_rank, coords_array,
-                                    &output_array);
-      break;
-    case ArrayDataType::kInt64:
-      Gather<ArrayDataType::kInt64>(input_array, op->input_rank, coords_array,
-                                    &output_array);
-      break;
-    default:
-      LOG(FATAL) << "Unsupported data type given to Gather op with output \""
-                 << op->outputs[0] << "\"";
-      break;
-  }
-
-  // Erase input arrays if no longer used after we remove the op.
-  DeleteArrayIfUsedOnce(op->inputs[0], model);
-  DeleteArrayIfUsedOnce(op->inputs[1], model);
-
-  // Erase the operator.
-  model->operators.erase(it);
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
deleted file mode 100644
index 1a0ba9e2bc7235720b59210cdd6affa089613077..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_range.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolveConstantRange::Run(Model* model, std::size_t op_index) {
-  const auto it = model->operators.begin() + op_index;
-  auto* base_op = it->get();
-  if (base_op->type != OperatorType::kRange) {
-    return false;
-  }
-  auto* op = static_cast<RangeOperator*>(base_op);
-
-  CHECK_EQ(op->inputs.size(), 3);
-  const auto& start_array = model->GetArray(op->inputs[0]);
-  if (!start_array.has_shape()) {
-    // Yield until all input dims have been resolved.
-    return false;
-  }
-  const auto& limit_array = model->GetArray(op->inputs[1]);
-  if (!limit_array.has_shape()) {
-    // Yield until all input dims have been resolved.
-    return false;
-  }
-  const auto& delta_array = model->GetArray(op->inputs[2]);
-  if (!delta_array.has_shape()) {
-    // Yield until all input dims have been resolved.
-    return false;
-  }
-
-  for (const auto& input : op->inputs) {
-    if (!IsConstantParameterArray(*model, input)) {
-      // yield if any input is mutable
-      return false;
-    }
-  }
-
-  CHECK_EQ(op->outputs.size(), 1);
-  auto& output_array = model->GetArray(op->outputs[0]);
-  if (output_array.data_type == ArrayDataType::kNone) {
-    // Yield until the output type has been set by PropagateArrayDataTypes
-    return false;
-  }
-
-  CHECK_EQ(RequiredBufferSizeForShape(start_array.shape()), 1)
-      << "Range op inputs must be scalar.";
-  CHECK_EQ(RequiredBufferSizeForShape(limit_array.shape()), 1)
-      << "Range op inputs must be scalar.";
-  CHECK_EQ(RequiredBufferSizeForShape(delta_array.shape()), 1)
-      << "Range op inputs must be scalar.";
-
-  CHECK(start_array.data_type == ArrayDataType::kInt32)
-      << "Range op inputs must be int32.";
-  CHECK(limit_array.data_type == ArrayDataType::kInt32)
-      << "Range op inputs must be int32.";
-  CHECK(delta_array.data_type == ArrayDataType::kInt32)
-      << "Range op inputs must be int32.";
-
-  // Compute buffer contents
-  int start = start_array.GetBuffer<ArrayDataType::kInt32>().data[0];
-  int limit = limit_array.GetBuffer<ArrayDataType::kInt32>().data[0];
-  int delta = delta_array.GetBuffer<ArrayDataType::kInt32>().data[0];
-  auto& buffer = output_array.GetMutableBuffer<ArrayDataType::kInt32>();
-  buffer.data.clear();
-  for (int32 val = start; val < limit; val += delta) {
-    buffer.data.push_back(val);
-  }
-  CHECK_EQ(floor((limit - start) / delta), buffer.data.size());
-  CHECK_EQ(buffer.data.size(), output_array.shape().dims()[0]);
-
-  // Delete the input array if no longer used
-  if (IsDiscardableArray(*model, op->inputs[0]) &&
-      CountOpsWithInput(*model, op->inputs[0]) == 1) {
-    model->EraseArray(op->inputs[0]);
-  }
-  if (IsDiscardableArray(*model, op->inputs[1]) &&
-      CountOpsWithInput(*model, op->inputs[1]) == 1) {
-    model->EraseArray(op->inputs[1]);
-  }
-  if (IsDiscardableArray(*model, op->inputs[2]) &&
-      CountOpsWithInput(*model, op->inputs[2]) == 1) {
-    model->EraseArray(op->inputs[2]);
-  }
-
-  // Delete the operator
-  model->operators.erase(it);
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
deleted file mode 100644
index e880a3f44dab376e5e441e3d6c0f747ee8490489..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_select.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-// Resolves a constant Select operation.
-//
-// This implementation is looking strictly for all-or-nothing on the select
-// condition. It's possible to enhance this by looking per-element and possibly
-// producing a Mul op.
-bool ResolveConstantSelect::Run(Model* model, std::size_t op_index) {
-  auto it = model->operators.begin() + op_index;
-  const auto* base_op = it->get();
-  if (base_op->type != OperatorType::kSelect) {
-    return false;
-  }
-  const auto* op = static_cast<const SelectOperator*>(base_op);
-
-  CHECK_GE(op->inputs.size(), 3);
-  CHECK_EQ(op->outputs.size(), 1);
-  auto& output_array = model->GetArray(op->outputs[0]);
-  if (output_array.data_type == ArrayDataType::kNone) {
-    // Yield until the output type has been set by PropagateArrayDataTypes.
-    return false;
-  }
-  if (!output_array.has_shape()) {
-    // Yield until the output shape has been set by PropagateFixedShapes.
-    return false;
-  }
-
-  // We require the cond input to be constant.
-  if (!IsConstantParameterArray(*model, op->inputs[0])) {
-    return false;
-  }
-  const Array& cond_array = model->GetArray(op->inputs[0]);
-  CHECK(cond_array.data_type == ArrayDataType::kBool)
-      << "Only bool conditions are supported";
-  const auto& cond_data = cond_array.GetBuffer<ArrayDataType::kBool>().data;
-  if (cond_data.empty()) {
-    return false;
-  }
-
-  // Check if the condition is the same for all elements.
-  bool cond_value = cond_data[0];
-  for (size_t i = 1; i < cond_data.size(); ++i) {
-    if (cond_data[i] != cond_value) {
-      AddMessageF(
-          "Cannot resolve %s as constant; cond_array has differing "
-          "per-element values",
-          LogName(*op));
-      return false;
-    }
-  }
-
-  // Pass-through the selected input.
-  return RemoveTrivialPassthroughOp(this, model, op_index, cond_value ? 1 : 2);
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
deleted file mode 100644
index c698a9567af17938aa8bf827a1941ac14b068053..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ /dev/null
@@ -1,298 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <string.h>
-#include <algorithm>
-#include <cmath>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool CopyMinMaxFromFirstInput(const Operator& op, Model* model) {
-  auto& output_array = model->GetArray(op.outputs[0]);
-  if (output_array.minmax) {
-    return false;
-  }
-  const auto& input_array = model->GetArray(op.inputs[0]);
-  if (!input_array.minmax) {
-    return false;
-  }
-  const auto& input_minmax = input_array.GetMinMax();
-  CHECK(!output_array.minmax);
-  auto& output_minmax = output_array.GetOrCreateMinMax();
-  output_minmax.min = input_minmax.min;
-  output_minmax.max = input_minmax.max;
-  return true;
-}
-
-bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
-  const auto unary_it = model->operators.begin() + op_index;
-  const auto* unary_op = unary_it->get();
-  // Test for unary ops of types that we know how to resolve.
-  switch (unary_op->type) {
-    case OperatorType::kCast:
-    case OperatorType::kExp:
-    case OperatorType::kLog:
-    case OperatorType::kNeg:
-    case OperatorType::kRsqrt:
-    case OperatorType::kSqrt:
-    case OperatorType::kSquare:
-    case OperatorType::kSum:
-    case OperatorType::kReduceMin:  //  Reduction Min
-    case OperatorType::kReduceMax:  //  Reduction Max
-    case OperatorType::kReshape:
-    case OperatorType::kRelu6:
-    case OperatorType::kRelu1:
-    case OperatorType::kRelu:
-      break;
-    default:
-      return false;
-  }
-
-  // Check if the input is a constant parameter.
-  if (!IsConstantParameterArray(*model, unary_op->inputs[0])) {
-    return false;
-  }
-
-  // if the unary op involves a tensor required by a rnn state, ignore it
-  for (const auto& rnn_state : model->flags.rnn_states()) {
-    if (unary_op->inputs[0] == rnn_state.back_edge_source_array()) {
-      return false;
-    }
-    if (unary_op->inputs[0] == rnn_state.state_array()) {
-      return false;
-    }
-  }
-
-  auto& output_array = model->GetArray(unary_op->outputs[0]);
-  if (!output_array.has_shape()) {
-    // Yield until the output array dims have been resolved.
-    return false;
-  }
-
-  // At the moment we don't want to care about fused activation functions.
-  // The idea is that we should do the present constants-propagation before
-  // activation functions get fused.
-  if (unary_op->fused_activation_function !=
-      FusedActivationFunctionType::kNone) {
-    AddMessageF(
-        "Not resolving constant %s "
-        " because it has a fused activation function",
-        LogName(*unary_op));
-    return false;
-  }
-
-  // The min-max is only copied for ops that copy data without arithmetic.
-  // In future trivial transpose, etc, can be handled here.
-  if (unary_op->type == OperatorType::kReshape) {
-    CopyMinMaxFromFirstInput(*unary_op, model);
-  }
-
-  const auto& input_array = model->GetArray(unary_op->inputs[0]);
-  // We have already tested above for existence of buffers (synonymous to being
-  // a constant param).
-  CHECK(input_array.buffer);
-  std::vector<DataType<ArrayDataType::kFloat>> const* input_float_data;
-  if (unary_op->type == OperatorType::kCast) {
-    CastOperator const* cast_op = static_cast<CastOperator const*>(unary_op);
-    if (cast_op->dst_data_type != ArrayDataType::kFloat) {
-      AddMessageF(
-          "Not resolving constant %s because we currently only support casting "
-          "to float",
-          LogName(*unary_op));
-      return false;
-    }
-    if (cast_op->src_data_type != input_array.buffer->type) {
-      AddMessageF(
-          "Not resolving constant %s because cast op source type does not "
-          "match input type",
-          LogName(*unary_op));
-    }
-  } else {
-    if (input_array.buffer->type != ArrayDataType::kFloat) {
-      return false;
-    }
-    input_float_data = &(input_array.GetBuffer<ArrayDataType::kFloat>().data);
-  }
-
-  // Create a float buffer on the output array, which are always constant.
-  const Shape& output_shape = output_array.shape();
-  const int output_dims_count = output_shape.dimensions_count();
-  const int output_buffer_size = RequiredBufferSizeForShape(output_shape);
-  auto& output_float_data =
-      output_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
-  output_float_data.resize(output_buffer_size);
-
-  const Shape& input_shape = input_array.shape();
-  const int input_buffer_size = RequiredBufferSizeForShape(input_shape);
-  if (unary_op->type == OperatorType::kCast) {
-    for (int i = 0; i < output_buffer_size; i++) {
-      float outval = 0.0f;
-      if (input_array.buffer->type == ArrayDataType::kFloat) {
-        outval = static_cast<float>(
-            input_array.GetBuffer<ArrayDataType::kFloat>().data[i]);
-      } else if (input_array.buffer->type == ArrayDataType::kUint8) {
-        outval = static_cast<float>(
-            input_array.GetBuffer<ArrayDataType::kUint8>().data[i]);
-      } else if (input_array.buffer->type == ArrayDataType::kInt32) {
-        outval = static_cast<float>(
-            input_array.GetBuffer<ArrayDataType::kInt32>().data[i]);
-      } else if (input_array.buffer->type == ArrayDataType::kInt64) {
-        outval = static_cast<float>(
-            input_array.GetBuffer<ArrayDataType::kInt64>().data[i]);
-      } else {
-        LOG(FATAL) << "Unsupported cast op input type";
-      }
-      output_float_data[i] = outval;
-    }
-  } else if (unary_op->type == OperatorType::kReshape) {
-    CHECK(input_buffer_size == output_buffer_size);
-    output_float_data = *input_float_data;
-  } else if (unary_op->type == OperatorType::kSum) {
-    CHECK_EQ(unary_op->inputs.size(), 2) << "Sum needs 2 inputs";
-    if (!IsConstantParameterArray(*model, unary_op->inputs[1])) {
-      AddMessageF("Axis input is non-constant");
-      return false;
-    }
-    auto& axis_array = model->GetArray(unary_op->inputs[1]);
-    CHECK(axis_array.data_type == ArrayDataType::kInt32);
-    int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
-    CHECK_LT(axis, input_shape.dimensions_count()) << "Axis out of bounds";
-
-    // We currently only handle reduction on axis 0.
-    CHECK_EQ(axis, 0) << "Only reduction along axis 0 is supported";
-    // We currently only handle 1-D and 2-D input tensors.
-    CHECK_LE(input_shape.dimensions_count(), 2) << "Rank >2 not yet supported";
-    // We only support keep_dims=true; shape prop will need to change otherwise.
-    auto sum_op = static_cast<const TensorFlowSumOperator*>(unary_op);
-    CHECK(sum_op->keep_dims) << "Only keep_dims=true is supported";
-
-    std::vector<int> indices(input_shape.dimensions_count());
-    for (int i = 0; i < input_shape.dims(1); ++i) {
-      indices[1] = i;
-      float sum = 0.f;
-      for (int j = 0; j < input_shape.dims(0); ++j) {
-        indices[0] = j;
-        sum += (*input_float_data)[Offset(input_shape, indices)];
-      }
-      output_float_data[i] = sum;
-    }
-  } else if (unary_op->type == OperatorType::kReduceMin) {
-    // At the moment only full reduction across all dimensions is supported.
-    // TODO(starka): Output should not be padded.
-    for (int i = 0; i < output_dims_count; i++) {
-      CHECK_EQ(output_shape.dims(i), 1);
-    }
-    float min = (*input_float_data)[0];
-    for (int i = 0; i < input_buffer_size; i++) {
-      min = std::min(min, (*input_float_data)[i]);
-    }
-    output_float_data[0] = min;
-  } else if (unary_op->type == OperatorType::kReduceMax) {
-    // At the moment only full reduction across all dimensions is supported.
-    // TODO(starka): Output should not be padded.
-    for (int i = 0; i < output_dims_count; i++) {
-      CHECK_EQ(output_shape.dims(i), 1);
-    }
-    float max = (*input_float_data)[0];
-    for (int i = 0; i < input_buffer_size; i++) {
-      max = std::max(max, (*input_float_data)[i]);
-    }
-    output_float_data[0] = max;
-  } else if (unary_op->type == OperatorType::kExp ||
-             unary_op->type == OperatorType::kNeg ||
-             unary_op->type == OperatorType::kLog ||
-             unary_op->type == OperatorType::kRsqrt ||
-             unary_op->type == OperatorType::kSqrt ||
-             unary_op->type == OperatorType::kSquare) {
-    // Element-wise ops. Should have perfectly matching sizes here.
-    for (int i = 0; i < output_dims_count; i++) {
-      CHECK_EQ(output_shape.dims(i), input_shape.dims(i));
-    }
-
-    for (int i = 0; i < output_buffer_size; i++) {
-      const float val = (*input_float_data)[i];
-      float outval = 0.f;
-      if (unary_op->type == OperatorType::kExp) {
-        outval = std::exp(val);
-      } else if (unary_op->type == OperatorType::kNeg) {
-        outval = -val;
-      } else if (unary_op->type == OperatorType::kLog) {
-        outval = std::log(val);
-      } else if (unary_op->type == OperatorType::kRsqrt) {
-        outval = 1.0f / std::sqrt(val);
-      } else if (unary_op->type == OperatorType::kSqrt) {
-        outval = std::sqrt(val);
-      } else if (unary_op->type == OperatorType::kSquare) {
-        outval = val * val;
-      } else {
-        LOG(FATAL) << "should not get here.";
-      }
-      output_float_data[i] = outval;
-    }
-  } else if (unary_op->type == OperatorType::kRelu6 ||
-             unary_op->type == OperatorType::kRelu1 ||
-             unary_op->type == OperatorType::kRelu) {
-    for (size_t i = 0; i < output_buffer_size; ++i) {
-      const float value = (*input_float_data)[i];
-      float new_value = 0.0f;
-      switch (unary_op->type) {
-        case OperatorType::kRelu: {
-          static constexpr float kLower = 0;
-          new_value = value < kLower ? kLower : value;
-          break;
-        }
-        case OperatorType::kRelu1: {
-          static constexpr float kUpper = 1;
-          static constexpr float kLower = -1;
-          new_value = value > kUpper ? kUpper : value < kLower ? kLower : value;
-          break;
-        }
-        case OperatorType::kRelu6: {
-          static constexpr float kUpper = 6;
-          static constexpr float kLower = 0;
-          new_value = value > kUpper ? kUpper : value < kLower ? kLower : value;
-          break;
-        }
-        default:
-          LOG(FATAL) << "Unsupported activation function "
-                     << LogName(*unary_op);
-          return false;
-      }
-      output_float_data[i] = new_value;
-    }
-  } else {
-    LOG(FATAL) << "should not get here.";
-  }
-  for (const auto& input : unary_op->inputs) {
-    if (CountOpsWithInput(*model, input) == 1) {
-      model->EraseArray(input);
-    }
-  }
-  AddMessageF("Resolved constant %s to the equivalent constant array",
-              LogName(*unary_op));
-  model->operators.erase(unary_it);
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_gather_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_gather_attributes.cc
deleted file mode 100644
index ce825c91af428c866ca9f83b765399f209606af9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_gather_attributes.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolveGatherAttributes::Run(Model* model, std::size_t op_index) {
-  auto* gather_op = model->operators[op_index].get();
-  if (gather_op->type != OperatorType::kGather) return false;
-  auto* op = static_cast<GatherOperator*>(gather_op);
-
-  if (op->axis) {
-    // Attributes already resolved
-    return false;
-  }
-  if (op->inputs.size() != 3) return false;
-  if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
-
-  const auto& indices_array = model->GetArray(op->inputs[2]);
-  if (!indices_array.has_shape()) return false;
-  const auto& axis_data = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
-  CHECK_EQ(axis_data.size(), 1)
-      << "Multidimensional gather not supported on " << LogName(*op);
-  op->axis = {axis_data[0]};
-
-  // Drop the axis array as we no longer need it.
-  DeleteArrayIfUsedOnce(op->inputs[2], model);
-  op->inputs.resize(2);
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
deleted file mode 100644
index 8a8e723cf7b2d77ec199e3817464a068bf85afdd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolvePadAttributes::Run(Model* model, std::size_t op_index) {
-  const auto pad_it = model->operators.begin() + op_index;
-  auto* pad_op = pad_it->get();
-  if (pad_op->type != OperatorType::kPad) return false;
-
-  auto* op = static_cast<PadOperator*>(pad_op);
-  if (!op->left_padding.empty()) return false;
-
-  CHECK_EQ(op->inputs.size(), 2);
-  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
-
-  const auto& array = model->GetArray(op->inputs[1]);
-  if (!array.has_shape()) return false;
-
-  const std::vector<int>& dims = array.shape().dims();
-  CHECK_EQ(dims.size(), 2);
-
-  std::vector<int> buffer = array.GetBuffer<ArrayDataType::kInt32>().data;
-
-  for (int i = 0; i < dims[0]; ++i) {
-    op->left_padding.push_back(buffer[i * 2]);
-    op->right_padding.push_back(buffer[i * 2 + 1]);
-  }
-
-  // TODO(dkalenichenko): Delete the extra input?
-
-  return true;
-}
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_padv2_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_padv2_attributes.cc
deleted file mode 100644
index ebb023e34223a57a2ad5708662d9c443949fcd0a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_padv2_attributes.cc
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolvePadV2Attributes::Run(Model* model, std::size_t op_index) {
-  const auto pad_it = model->operators.begin() + op_index;
-  auto* pad_op = pad_it->get();
-  if (pad_op->type != OperatorType::kPadV2) return false;
-
-  auto* op = static_cast<PadV2Operator*>(pad_op);
-  if (!op->left_padding.empty()) return false;
-
-  CHECK_EQ(op->inputs.size(), 3);
-  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
-
-  const auto& array = model->GetArray(op->inputs[1]);
-  if (!array.has_shape()) return false;
-
-  const std::vector<int>& dims = array.shape().dims();
-  CHECK_EQ(dims.size(), 2);
-
-  std::vector<int> buffer = array.GetBuffer<ArrayDataType::kInt32>().data;
-
-  for (int i = 0; i < dims[0]; ++i) {
-    op->left_padding.push_back(buffer[i * 2]);
-    op->right_padding.push_back(buffer[i * 2 + 1]);
-  }
-
-  // TODO(dkalenichenko): Delete the extra input?
-
-  return true;
-}
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reduce_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reduce_attributes.cc
deleted file mode 100644
index 73198ac7c032fc67d8ed85259bc779c5c06e1e16..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reduce_attributes.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-template <typename T>
-bool ResolveAttributes(Model* model, T* op) {
-  if (!op->axis.empty()) {
-    // Attributes already resolved
-    return false;
-  }
-  if (op->inputs.size() != 2) return false;
-  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
-
-  const Array& indices_array = model->GetArray(op->inputs[1]);
-  if (!indices_array.has_shape()) return false;
-  op->axis = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
-  return true;
-}
-
-bool ResolveReduceAttributes::Run(Model* model, std::size_t op_index) {
-  Operator* op = model->operators[op_index].get();
-  switch (op->type) {
-    case OperatorType::kMean:
-      return ResolveAttributes(model, static_cast<MeanOperator*>(op));
-    case OperatorType::kSum:
-      return ResolveAttributes(model, static_cast<TensorFlowSumOperator*>(op));
-    case OperatorType::kReduceProd:
-      return ResolveAttributes(model, static_cast<TensorFlowProdOperator*>(op));
-    case OperatorType::kReduceMin:
-      return ResolveAttributes(model, static_cast<TensorFlowMinOperator*>(op));
-    case OperatorType::kReduceMax:
-      return ResolveAttributes(model, static_cast<TensorFlowMaxOperator*>(op));
-    case OperatorType::kAny:
-      return ResolveAttributes(model, static_cast<TensorFlowMaxOperator*>(op));
-    default:
-      return false;
-  }
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
deleted file mode 100644
index b615c9a545695e5d14fa5809e0c38a770f23ea24..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <iterator>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolveReshapeAttributes::Run(Model* model, std::size_t op_index) {
-  const auto reshape_it = model->operators.begin() + op_index;
-  auto* reshape_op = reshape_it->get();
-  if (reshape_op->type != OperatorType::kReshape) {
-    return false;
-  }
-
-  auto* op = static_cast<TensorFlowReshapeOperator*>(reshape_op);
-
-  if (!op->shape.empty()) return false;
-
-  if (IsConstantParameterArray(*model, reshape_op->inputs[1])) {
-    const auto& constant_input_array = model->GetArray(reshape_op->inputs[1]);
-    op->shape = constant_input_array.GetBuffer<ArrayDataType::kInt32>().data;
-  }
-
-  if (op->shape.empty()) return false;
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
deleted file mode 100644
index e760d08e5a6c2f56db6b11fee922b701d33dd1a0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolveSliceAttributes::Run(Model* model, std::size_t op_index) {
-  const auto slice_it = model->operators.begin() + op_index;
-  auto* slice_op = slice_it->get();
-  if (slice_op->type != OperatorType::kSlice) return false;
-
-  auto* op = static_cast<SliceOperator*>(slice_op);
-  if (!op->begin.empty()) return false;
-
-  CHECK_EQ(op->inputs.size(), 3);
-  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
-  if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
-
-  const auto& begin_array = model->GetArray(op->inputs[1]);
-  if (!begin_array.has_shape()) return false;
-
-  const auto& size_array = model->GetArray(op->inputs[2]);
-  if (!size_array.has_shape()) return false;
-
-  op->begin = begin_array.GetBuffer<ArrayDataType::kInt32>().data;
-  op->size = size_array.GetBuffer<ArrayDataType::kInt32>().data;
-
-  // TODO(dkalenichenko): Delete the extra inputs?
-
-  return true;
-}
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc
deleted file mode 100644
index fab50bec1fc5ec50cecba53845457931ed59c0b8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolveSpaceToBatchNDAttributes::Run(Model* model, std::size_t op_index) {
-  const auto op_it = model->operators.begin() + op_index;
-  if (op_it->get()->type != OperatorType::kSpaceToBatchND) return false;
-
-  auto* op = static_cast<SpaceToBatchNDOperator*>(op_it->get());
-
-  // The attributes are resolved only when the 3 attributes (block_shape,
-  // before_paddings, after_paddings) are all constant.
-  if (!op->block_shape.empty()) {
-    return false;
-  }
-
-  const int block_shape_index = 1;
-  const int paddings_index = 2;
-
-  CHECK_EQ(op->inputs.size(), 3);
-  if (!IsConstantParameterArray(*model, op->inputs[block_shape_index]) ||
-      !IsConstantParameterArray(*model, op->inputs[paddings_index]))
-    return false;
-
-  // Handle paddings.
-  const auto& paddings_array = model->GetArray(op->inputs[paddings_index]);
-  if (!paddings_array.has_shape()) return false;
-  const std::vector<int>& paddings_dims = paddings_array.shape().dims();
-  if (paddings_dims.size() != 2) {
-    // Code only handles padding of 2 dimensions. Perhaps another transformation
-    // will delete this op.
-    return false;
-  }
-  const std::vector<int>& paddings_buffer =
-      paddings_array.GetBuffer<ArrayDataType::kInt32>().data;
-  for (int i = 0; i < paddings_dims[0]; ++i) {
-    op->before_paddings.push_back(paddings_buffer[i * 2]);
-    op->after_paddings.push_back(paddings_buffer[i * 2 + 1]);
-  }
-
-  // Handle block_shape.
-  const auto& block_shape_array =
-      model->GetArray(op->inputs[block_shape_index]);
-  if (!block_shape_array.has_shape()) return false;
-  const std::vector<int>& block_shape_dims = block_shape_array.shape().dims();
-  CHECK_EQ(block_shape_dims.size(), 1);
-  const std::vector<int>& block_shape_buffer =
-      block_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
-  for (int i = 0; i < block_shape_dims[0]; ++i) {
-    op->block_shape.push_back(block_shape_buffer[i]);
-  }
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_squeeze_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_squeeze_attributes.cc
deleted file mode 100644
index e8bb85704e1c750300079681b5a12f6a488b6b48..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_squeeze_attributes.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolveSqueezeAttributes::Run(Model* model, std::size_t op_index) {
-  auto* squeeze_op = model->operators[op_index].get();
-  if (squeeze_op->type != OperatorType::kSqueeze) {
-    return false;
-  }
-  DCHECK_EQ(squeeze_op->inputs.size(), 1);
-  DCHECK_EQ(squeeze_op->outputs.size(), 1);
-
-  // If the output is consumed by a reshape op, it's a trivial squeeze.
-  if (CountOpsWithInput(*model, squeeze_op->outputs[0]) == 1) {
-    const auto* next_op = GetOpWithInput(*model, squeeze_op->outputs[0]);
-    if (next_op->type == OperatorType::kReshape) {
-      AddMessageF(
-          "%s is trivial because its output is only consumed by a "
-          "Reshape op",
-          LogName(*squeeze_op));
-
-      return RemoveTrivialPassthroughOp(this, model, op_index);
-    }
-  }
-  return false;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc
deleted file mode 100644
index a657ee00af66bd431f96c361e12d5213e203b3df..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_transpose_attributes.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ResolveTransposeAttributes::Run(Model* model, std::size_t op_index) {
-  const auto op_it = model->operators.begin() + op_index;
-  if (op_it->get()->type != OperatorType::kTranspose) return false;
-
-  auto* op = static_cast<TransposeOperator*>(op_it->get());
-  if (!op->perm.empty()) return false;
-
-  CHECK_EQ(op->inputs.size(), 2);
-  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
-
-  // Handling perm.
-  const auto& perm_array = model->GetArray(op->inputs[1]);
-  if (!perm_array.has_shape()) return false;
-
-  const std::vector<int>& perm_dims = perm_array.shape().dims();
-  CHECK_EQ(perm_dims.size(), 1);
-
-  std::vector<int> perm_buffer =
-      perm_array.GetBuffer<ArrayDataType::kInt32>().data;
-  for (int i = 0; i < perm_dims[0]; ++i) {
-    op->perm.push_back(perm_buffer[i]);
-  }
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
deleted file mode 100644
index acf1e3ede5197e899527f8874831165c7ebbf431..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
+++ /dev/null
@@ -1,32 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-)
-
-tf_cc_test(
-    name = "lstm_utils_test",
-    srcs = ["lstm_utils_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        "//tensorflow/contrib/lite/toco:graph_transformations",
-        "//tensorflow/contrib/lite/toco:model",
-        "//tensorflow/contrib/lite/toco:tooling_util",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-tf_cc_test(
-    name = "resolve_constant_concatenation_test",
-    srcs = ["resolve_constant_concatenation_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        "//tensorflow/contrib/lite/toco:graph_transformations",
-        "//tensorflow/contrib/lite/toco:model",
-        "//tensorflow/contrib/lite/toco:tooling_util",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow_test.cc b/tensorflow/contrib/lite/toco/import_tensorflow_test.cc
deleted file mode 100644
index 8a236d444460d45942f8644ccbec0b5e5fc18048..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/import_tensorflow_test.cc
+++ /dev/null
@@ -1,288 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/toco/import_tensorflow.h"
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/attr_value_util.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/node_def_builder.h"
-#include "tensorflow/core/framework/tensor.pb.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/lib/core/status.h"
-
-namespace toco {
-
-using tensorflow::AttrValue;
-using tensorflow::DT_BOOL;
-using tensorflow::DT_FLOAT;
-using tensorflow::DT_INT32;
-using tensorflow::DT_INT64;
-using tensorflow::DT_QUINT8;
-using tensorflow::DT_STRING;
-using tensorflow::NodeDef;
-using tensorflow::Status;
-
-namespace internal {
-using ConverterType = tensorflow::Status (*)(
-    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
-    Model* model);
-using ConverterMapType = std::unordered_map<std::string, ConverterType>;
-
-ConverterMapType GetTensorFlowNodeConverterMap();
-Status ImportTensorFlowNode(const NodeDef&, const TensorFlowImportFlags&,
-                            Model*, const ConverterMapType&);
-}  // namespace internal
-
-namespace {
-
-Status ImportNode(const NodeDef& node, Model* model) {
-  const auto converter = internal::GetTensorFlowNodeConverterMap();
-  return internal::ImportTensorFlowNode(node, TensorFlowImportFlags(), model,
-                                        converter);
-}
-
-Status ImportNode(const NodeDef& node) {
-  Model model;
-  return ImportNode(node, &model);
-}
-
-NodeDef BuildNode(
-    const std::string& op,
-    const std::vector<std::initializer_list<int>>& output_shapes) {
-  NodeDef node;
-  node.set_op(op);
-  node.set_name("Node1");
-  node.add_input();
-  node.set_input(0, "Node0");
-
-  AttrValue::ListValue* shapes =
-      (*node.mutable_attr())["_output_shapes"].mutable_list();
-  for (const auto& output_shape : output_shapes) {
-    tensorflow::TensorShapeProto* shape = shapes->add_shape();
-    for (int64_t output_shape_dim : output_shape) {
-      auto shape_dim = shape->add_dim();
-      shape_dim->set_size(output_shape_dim);
-    }
-  }
-
-  return node;
-}
-
-class ShapeImportTest : public ::testing::TestWithParam<tensorflow::DataType> {
- protected:
-  ShapeImportTest() {}
-
-  void BuildConstNode(std::initializer_list<int64_t> shape,
-                      tensorflow::DataType dtype, int64_t num_elements,
-                      NodeDef* node) {
-    node->set_op("Const");
-    node->set_name("Node1");
-
-    // An attribute describing the type of this const node.
-    AttrValue dtype_attr;
-    SetAttrValue(dtype, &dtype_attr);
-    (*node->mutable_attr())["dtype"] = dtype_attr;
-
-    // An attribute describing the content of this const node.
-    tensorflow::TensorProto t;
-    t.set_dtype(dtype);
-    auto* s = t.mutable_tensor_shape();
-    for (auto d : shape) {
-      s->add_dim()->set_size(d);
-    }
-
-    // TODO(ahentz): also need to test via tensor_content()
-    switch (dtype) {
-      case DT_FLOAT:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_float_val(i / 10000.0);
-        }
-        break;
-      case DT_INT32:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_int_val(i % std::numeric_limits<int>::max());
-        }
-        break;
-      case DT_QUINT8:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_int_val(i % std::numeric_limits<uint8_t>::max());
-        }
-        break;
-      case DT_INT64:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_int64_val(i);
-        }
-        break;
-      case DT_STRING:
-        break;
-      case DT_BOOL:
-        for (int64_t i = 0; i < num_elements; ++i) {
-          t.add_bool_val(i % 2);
-        }
-        break;
-      default:
-        break;
-    }
-
-    AttrValue value_attr;
-    SetAttrValue(t, &value_attr);
-    (*node->mutable_attr())["value"] = value_attr;
-  }
-};
-
-class TypeImportTest : public ::testing::TestWithParam<
-                           std::pair<tensorflow::DataType, ArrayDataType>> {
- protected:
-  TypeImportTest() {}
-
-  void BuildUnaryNode(const std::string& op_name, tensorflow::DataType dtype,
-                      NodeDef* node) {
-    node->set_op(op_name);
-    node->set_name("Node1");
-
-    node->add_input();
-    node->set_input(0, "Node0");
-
-    AttrValue dtype_attr;
-    SetAttrValue(dtype, &dtype_attr);
-    (*node->mutable_attr())["T"] = dtype_attr;
-  }
-};
-
-std::vector<tensorflow::DataType> TestTypes() {
-  return {DT_FLOAT, DT_INT32, DT_INT64, DT_BOOL, DT_QUINT8};
-}
-
-TEST_P(ShapeImportTest, ShapeElementIsNegative) {
-  NodeDef node;
-  BuildConstNode({1, -2, 10}, GetParam(), 0, &node);
-  auto status = ImportNode(node);
-  EXPECT_EQ(
-      status.error_message(),
-      "Tensor shape should not include negative values\n\t (while processing "
-      "node 'Node1')");
-}
-INSTANTIATE_TEST_CASE_P(ShapeElementIsNegative, ShapeImportTest,
-                        ::testing::ValuesIn(TestTypes()));
-
-TEST_P(ShapeImportTest, ShapeElementTooLarge) {
-  NodeDef node;
-  BuildConstNode({3000000000}, GetParam(), 0, &node);
-  auto status = ImportNode(node);
-  EXPECT_EQ(status.error_message(),
-            "Shape element overflows\n\t (while processing node 'Node1')");
-}
-INSTANTIATE_TEST_CASE_P(ShapeElementTooLarge, ShapeImportTest,
-                        ::testing::ValuesIn(TestTypes()));
-
-TEST_P(ShapeImportTest, ShapeTooLarge) {
-  NodeDef node;
-  BuildConstNode({1000000, 2000000, 2000000, 2000000}, GetParam(), 0, &node);
-  auto status = ImportNode(node);
-  EXPECT_EQ(status.error_message(),
-            "Tensor shape is too large\n\t (while processing node 'Node1')");
-}
-INSTANTIATE_TEST_CASE_P(ShapeTooLarge, ShapeImportTest,
-                        ::testing::ValuesIn(TestTypes()));
-
-TEST_P(ShapeImportTest, ValidShapeButZeroElements) {
-  NodeDef node;
-  BuildConstNode({1, 2, 2, 2}, GetParam(), 0, &node);
-  auto status = ImportNode(node);
-  EXPECT_THAT(status.error_message(),
-              ::testing::MatchesRegex(
-                  "Neither input_content .0. nor .*_val .0. have the right "
-                  "dimensions .8. for this .* tensor\n\t .while processing "
-                  "node 'Node1'."));
-}
-INSTANTIATE_TEST_CASE_P(ValidShapeButZeroElements, ShapeImportTest,
-                        ::testing::ValuesIn(TestTypes()));
-
-std::vector<std::pair<tensorflow::DataType, ArrayDataType>> UnaryTestTypes() {
-  return {{DT_FLOAT, ArrayDataType::kFloat},
-          {DT_INT32, ArrayDataType::kInt32},
-          {DT_INT64, ArrayDataType::kInt64}};
-}
-
-TEST_P(TypeImportTest, BasicTypeInference) {
-  NodeDef node;
-  BuildUnaryNode("Atan", GetParam().first, &node);
-
-  Model model;
-  EXPECT_TRUE(ImportNode(node, &model).ok());
-
-  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
-  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
-  const TensorFlowUnsupportedOperator* op =
-      static_cast<const TensorFlowUnsupportedOperator*>(
-          model.operators[0].get());
-  ASSERT_THAT(op->output_data_types, ::testing::ElementsAre(GetParam().second));
-}
-INSTANTIATE_TEST_CASE_P(BasicTypeInference, TypeImportTest,
-                        ::testing::ValuesIn(UnaryTestTypes()));
-
-TEST(ImportTest, FailedTypeInference) {
-  // Create a unary op with no Type ("T") annotation.
-  NodeDef node;
-  node.set_op("Atan");
-  node.set_name("Node1");
-  node.add_input();
-  node.set_input(0, "Node0");
-
-  Model model;
-  EXPECT_TRUE(ImportNode(node, &model).ok());
-
-  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
-  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
-  const TensorFlowUnsupportedOperator* op =
-      static_cast<const TensorFlowUnsupportedOperator*>(
-          model.operators[0].get());
-  ASSERT_TRUE(op->output_data_types.empty());
-}
-
-TEST(ImportTest, UnsupportedOpWithOutputShapes) {
-  // Create an unsupported op with output shapes.
-  Model model;
-  EXPECT_TRUE(ImportNode(BuildNode("Atan", {{1, 2}, {2, 3}}), &model).ok());
-  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
-  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
-  const TensorFlowUnsupportedOperator* op =
-      static_cast<const TensorFlowUnsupportedOperator*>(
-          model.operators[0].get());
-
-  // The output shapes should be imported.
-  ASSERT_EQ(op->output_shapes.size(), 2);
-  ASSERT_THAT(op->output_shapes[0].dims(), ::testing::ElementsAre(1, 2));
-  ASSERT_THAT(op->output_shapes[1].dims(), ::testing::ElementsAre(2, 3));
-}
-
-TEST(ImportTest, UnsupportedOpWithWildcardOutputShapes) {
-  // Create an unsupported op with wildcard output shapes.
-  Model model;
-  EXPECT_TRUE(ImportNode(BuildNode("Atan", {{-1, 2}}), &model).ok());
-  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
-  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
-  const TensorFlowUnsupportedOperator* op =
-      static_cast<const TensorFlowUnsupportedOperator*>(
-          model.operators[0].get());
-
-  // Wildcard shapes aren't yet supported.
-  ASSERT_TRUE(op->output_shapes.empty());
-}
-
-}  // namespace
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
deleted file mode 100644
index 6e207fdf54d98225f5a553b398c7a0d6452d05b3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/model.h
+++ /dev/null
@@ -1,2120 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
-
-#include <complex>
-#include <functional>
-#include <initializer_list>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "absl/types/optional.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/toco_port.h"
-#include "tensorflow/contrib/lite/toco/toco_types.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-using tflite::QuantizationParams;
-
-enum class OperatorType : uint8 {
-  kNone,
-  // General-purpose neural network operators.
-  kAdd,
-  kAddN,
-  kAveragePool,
-  kBatchMatMul,
-  kBatchNormalization,
-  kConv,
-  kConcatenation,
-  kDepthwiseConv,
-  kDepthToSpace,
-  kSpaceToDepth,
-  kDequantize,
-  kDiv,
-  kExp,
-  kExpandDims,
-  kFill,
-  kFloorDiv,
-  kFloorMod,
-  kFullyConnected,
-  kL2Normalization,
-  kL2Pool,
-  kLstmCell,
-  kLocalResponseNormalization,
-  kLog,
-  kLogistic,
-  kMaxPool,
-  kFakeQuant,
-  kMul,
-  kOneHot,
-  kRandomUniform,
-  kRange,
-  kRank,
-  kRelu,
-  kRelu1,
-  kRelu6,
-  kPRelu,
-  kSoftmax,
-  kLogSoftmax,
-  kSub,
-  kTanh,
-  kTransposeConv,
-  kCast,
-  kFloor,
-  kGather,
-  kResizeBilinear,
-  kSin,
-  kSpaceToBatchND,
-  kPack,
-  kBatchToSpaceND,
-  kPad,
-  kPadV2,
-  kReduceProd,  // Reduction product
-  kStridedSlice,
-  kSlice,
-  kSqueeze,
-  kMean,
-  kArgMax,
-  // The SVDF Op is a decomposition of a densely connected Op into
-  // low rank filters. For details:
-  // https://research.google.com/pubs/pub43813.html
-  kSvdf,
-  // Special operators used for importing TensorFlow nodes.
-  // The general intent is to have some graph transformation either
-  // drop them or rewrite them as general-purpose operators.
-  kAll,
-  kAssert,
-  kConcat,
-  kConcatV2,
-  kGreater,
-  kGreaterEqual,
-  kIdentity,
-  kLess,
-  kLessEqual,
-  kReduceMax,  //  Reduction Max
-  kMaximum,    //  Element-wise Maximum
-  kReduceMin,  //  Reduction Min
-  kMinimum,    //  Element-wise Minimum
-  kMatMul,
-  kMerge,
-  kNeg,
-  kReshape,
-  kRsqrt,
-  kShape,
-  kSplit,
-  kSqrt,
-  kSquare,
-  kSum,
-  kSwitch,
-  kTile,
-  kTranspose,
-  kTopK_V2,
-  kDynamicPartition,
-  kDynamicStitch,
-  // An unsupported TF operation. It's only needed to be able to represent TF
-  // graph internally and is expected to be dropped by graph transformations.
-  kUnsupported,
-  // Finally, TensorFlow uses different conventions for axes ordering,
-  // see AxesOrder, and this cannot always be resolved at the time of importing
-  // nodes, as TensorFlow parameters may be constant-expression subgraphs
-  // instead of being given as plain constant arrays. So we need to insert
-  // special nodes in the graph to shuffle axes.
-  kReorderAxes,
-  kSelect,
-  kSparseToDense,
-  kEqual,
-  kNotEqual,
-  kPow,
-  kArgMin,
-  kAny,
-  kLogicalAnd,
-  kLogicalNot,
-  kLogicalOr,
-  kCTCBeamSearchDecoder,
-  kUnpack,
-  kZerosLike,
-};
-
-// Helper to deal with TensorFlow arrays using a different ordering of
-// dimensions
-// ("axes") than our own.
-// TODO(benoitjacob): Ultimately, we shouldn't have any "ordering" of axes,
-// we should have associative arrays mapping symbolic axes identifiers (like
-// "output_depth") to dimensions. We would then not need this anymore.
-enum class AxesOrder {
-  kOneAxis,  // one-dimensional array, one unique axis.
-  kCR,       // column-major matrix storage order. Our standard.
-  kRC,       // row-major matrix storage order. TensorFlow default.
-  kOHWI,     // Our standard for conv weights
-  kHWIO,     // TensorFlow conv weights
-  k1HWO,     // Our standard for DepthwiseConv weights
-  kHWIM,     // TensorFlow DepthwiseConv weights
-  kNHWC,     // TensorFlow activations
-  kHWOI,     // TensorFlow back-prop conv weights
-};
-
-// The type of the scalars in an array.
-// Note that the type does not by itself tell whether the values in the array
-// are non-quantized (can be accessed directly) or quantized (must be
-// interpreted in conjunction with QuantizationParams).
-//
-// In practice though:
-//   float values are never quantized
-//   uint8 values are always quantized
-//   int32 values are sometimes quantized (depending on whether
-//   QuantizationParams are present).
-//   complex values are never quantized
-//   other types are never quantized at the moment.
-//
-// kNone means that we don't know the data type yet, or that we don't care
-// because we'll be dropping the array anyway (e.g. some exotic array types
-// may be involved only in debug-only subgraphs that we may not be interested
-// in actually supporting).
-enum class ArrayDataType : uint8 {
-  kNone,  // 0
-  kBool,
-  kFloat,
-  kInt8,
-  kUint8,
-  kInt16,  // 5
-  kUint16,
-  kInt32,
-  kUint32,
-  kInt64,
-  kUint64,  // 10
-  kString,
-  kComplex64,
-};
-
-// Compile-time logic to map ArrayDataType to the corresponding C++ scalar type
-template <ArrayDataType A>
-struct DataTypeImpl {};
-template <>
-struct DataTypeImpl<ArrayDataType::kNone> {
-  typedef int Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kBool> {
-  typedef bool Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kFloat> {
-  typedef float Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kInt8> {
-  typedef int8 Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kUint8> {
-  typedef uint8 Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kInt16> {
-  typedef int16 Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kUint16> {
-  typedef uint16 Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kInt32> {
-  typedef int32 Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kUint32> {
-  typedef uint32 Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kInt64> {
-  typedef int64 Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kUint64> {
-  typedef uint64 Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kString> {
-  typedef string Type;
-};
-template <>
-struct DataTypeImpl<ArrayDataType::kComplex64> {
-  typedef std::complex<float> Type;
-};
-
-template <ArrayDataType A>
-using DataType = typename DataTypeImpl<A>::Type;
-
-// Base class for type-specific buffer types.
-struct GenericBuffer {
-  // Non-default-constructible: only ArrayDataType-specific subclass
-  // objects may be constructed.
-  GenericBuffer() = delete;
-  // Non-copyable-or-movable: we should only store pointers-to-Buffer
-  // in containers, not Operators themselves, so there should be no
-  // copy or move.
-  GenericBuffer(const GenericBuffer&) = delete;
-  GenericBuffer(const GenericBuffer&&) = delete;
-
-  // We need a virtual destructor so we can store pointers-to-Buffer
-  // in containers and have the containers call the right subclass destructor.
-  virtual ~GenericBuffer() {}
-
-  virtual int Length() const = 0;
-
-  const ArrayDataType type;
-
- protected:
-  // Constructor used by subclasses for specific ArrayDataType's.
-  explicit GenericBuffer(ArrayDataType t) : type(t) {}
-};
-
-// Type-specific buffer, containing type-specific storage.
-template <ArrayDataType A>
-struct Buffer : GenericBuffer {
-  Buffer() : GenericBuffer(A) {}
-
-  int Length() const override { return data.size(); }
-
-  std::vector<DataType<A>> data;
-};
-
-class Shape {
- public:
-  // For Shape, we stick to half-way encapsulation for now:
-  // we hide the raw dims_ member, but expose it raw by accessors
-  // because from some brainstorming, it's not at all easy to
-  // anticipate which flavor of more hermetic encapsulation would
-  // actually buy us future-proof-ness without being needlessly
-  // cumbersome.
-  Shape() {}
-  Shape(std::initializer_list<int> dim_list) : dims_(dim_list) {}
-
-  void ReplaceDims(std::initializer_list<int> dim_list) {
-    dims_ = std::vector<int>(dim_list);
-  }
-
-  const std::vector<int>& dims() const { return dims_; }
-  std::vector<int>* mutable_dims() { return &dims_; }
-  const int dimensions_count() const { return dims_.size(); }
-
-  // We still have that one convenience accessor to avoid
-  // the awkward double bracket issue:  shape.dims()[i].
-  int dims(int i) const {
-    // Always check for out-of-bounds accesses, even in optimized builds where
-    // standard assertions are disabled. Out-of-bounds access here is a common
-    // occurrence.
-    CHECK_GE(i, 0);
-    CHECK_GT(dims_.size(), i);
-    return dims_[i];
-  }
-
-  bool operator==(const Shape& comp) const {
-    return (this->dims_ == comp.dims());
-  }
-
-  bool operator!=(const Shape& comp) const { return !((*this) == comp); }
-
- private:
-  std::vector<int> dims_;
-};
-
-// Base class for all operator classes.
-struct Operator {
-  // Non-default-constructible: only OperatorType-specific subclass
-  // objects may be constructed.
-  Operator() = delete;
-  // Non-copyable-or-movable: we should only store pointers-to-Operator
-  // in containers, not Operators themselves, so there should be no
-  // copy or move.
-  Operator(const Operator&) = delete;
-  Operator(const Operator&&) = delete;
-
-  // We need a virtual destructor so we can store pointers-to-Operator
-  // in containers and have the containers call the right subclass destructor.
-  virtual ~Operator() {}
-
-  // The specific type of operator. Corresponds 1:1 to subclasses.
-  const OperatorType type;
-
-  // The activation function that may be fused into this operator,
-  // or None if no activation function is fused.
-  FusedActivationFunctionType fused_activation_function;
-
-  // Input arrays: either activation arrays or constant array parameters.
-  // We refer to them by their name, not by their address; the mapping of
-  // names to addresses is given by the Model, which owns both Operator's and
-  // Array's. Thus, an Operator on its own doesn't contain much information,
-  // it is meant to be used in conjunction with the Model that owns it.
-  std::vector<string> inputs;
-
-  // Output activation arrays. Same comments as for inputs apply here too.
-  std::vector<string> outputs;
-
-  // If true, the array has more outputs than are listed in the 'outputs'
-  // member. These need to be resolved by some graph transformation.
-  // This flag is only here to indicate that an operator should not be
-  // discarded as unused, even if from its 'outputs' member alone it
-  // looks unused.
-  bool unresolved_outputs = false;
-
- protected:
-  // Constructor used by subclasses for specific OperatorType's.
-  explicit Operator(OperatorType t)
-      : type(t),
-        fused_activation_function(FusedActivationFunctionType::kNone) {}
-};
-
-// Padding types for Conv-like operators. This is how padding is typically
-// specified in model files. But for inference, we will need to resolve this
-// to a FixedPadding, see below.
-enum class PaddingType { kNone, kSame, kValid };
-
-// Padding as resolved for a specific layer shape, as needed for inference.
-// For a given layer shape, a given padding type will resolve to a choice of
-// a number of padding rows and columns, which we call the padding height and
-// width respectively.
-struct FixedPadding {
-  int width = 0;
-  int height = 0;
-};
-
-// "Universal" padding struct containing both a generic PaddingType (as
-// represented in a model file), and a FixedPadding (as needed for inference).
-// The latter is resolved during the PropagateFixedSizes pass.
-struct Padding {
-  FixedPadding& GetOrCreateFixedPadding() {
-    if (!fixed) {
-      FixedPadding* ptr = new FixedPadding;
-      fixed = std::unique_ptr<FixedPadding>(ptr);
-    }
-    return *fixed;
-  }
-
-  Padding() : type(PaddingType::kNone) {}
-  PaddingType type;
-  std::unique_ptr<FixedPadding> fixed;
-};
-
-// "Convolutional" layer, as represented in model files.
-//
-// Inputs:
-//   inputs[0]: required: the input activations array
-//   inputs[1]: required: the Conv weights
-//   inputs[2]: optional: the bias vector, specifying the biases for each output
-//   channel.
-//
-// Outputs:
-//   outputs[0]: required: the output activations array
-//   outputs[1]: optional: the intermediate array of im2col-replicated input
-//                         activations. Present when targeting implementations
-//                         of Conv layers as Im2col+GEMM.
-//
-// TensorFlow equivalent: Conv2D
-struct ConvOperator : Operator {
-  ConvOperator() : Operator(OperatorType::kConv) {}
-  Padding padding;
-  int stride_width = 0;
-  int stride_height = 0;
-  // A dilation_rate of 0 is invalid and this field is an optional attribute.
-  // Thus initializing it to 1 to allow default conv behavior when the
-  // attribute is not present.
-  int dilation_width_factor = 1;
-  int dilation_height_factor = 1;
-};
-
-// CTCBeamSearchDecoder operator:
-//
-// Inputs:
-//   inputs[0]: required: the logits.
-//   inputs[1]: required: sequence length.
-//   inputs[2]: optional: beam width.
-//   inputs[3]: optional: top paths.
-//   inputs[4]: optional: merge repeated.
-//
-//  Outputs:
-//    outputs[0]: deocoded.
-//    outputs[1]: log probability.
-//
-// TensorFlow equivalent: CTCBeamSearchDecoder
-struct CTCBeamSearchDecoderOperator : Operator {
-  CTCBeamSearchDecoderOperator()
-      : Operator(OperatorType::kCTCBeamSearchDecoder) {}
-  int beam_width;
-  int top_paths;
-  bool merge_repeated = true;
-};
-
-// Depthwise-separable convolution operator.
-//
-// Inputs:
-//   inputs[0]: required: the input activations array
-//   inputs[1]: required: the DepthwiseConv weights
-//   inputs[2]: optional: the bias vector, specifying the biases for each output
-//   channel.
-//
-// TensorFlow equivalent: DepthwiseConv2dNative
-struct DepthwiseConvOperator : Operator {
-  DepthwiseConvOperator() : Operator(OperatorType::kDepthwiseConv) {}
-  Padding padding;
-  int stride_height = 0;
-  int stride_width = 0;
-  int depth_multiplier = 0;
-  // A dilation_rate of 0 is invalid and this field is an optional attribute.
-  // Thus initializing it to 1 to allow default conv behavior when the
-  // attribute is not present.
-  int dilation_width_factor = 1;
-  int dilation_height_factor = 1;
-};
-
-// Depth-to-space transform operator.
-//
-// Inputs:
-//   inputs[0]: required: the input activations array
-//
-// TensorFlow equivalent: DepthToSpace
-struct DepthToSpaceOperator : Operator {
-  DepthToSpaceOperator() : Operator(OperatorType::kDepthToSpace) {}
-  int block_size = 0;
-};
-
-// Space-to-depth transform operator.
-//
-// Inputs:
-//   inputs[0]: required: the input activations array
-//
-// TensorFlow equivalent: SpaceToDepth
-struct SpaceToDepthOperator : Operator {
-  SpaceToDepthOperator() : Operator(OperatorType::kSpaceToDepth) {}
-  int block_size = 0;
-};
-
-// Fully-connected operator.
-//
-// Inputs:
-//   inputs[0]: required: the input activations array
-//   inputs[1]: required: the FullyConnected weights
-//   inputs[2]: optional: the bias vector, specifying the biases for each output
-//   channel.
-//
-// TensorFlow equivalent: a pair consisting of a Reshape node reshaping the
-// input activations as a matrix, followed by a MatMul node.
-struct FullyConnectedOperator : Operator {
-  FullyConnectedOperator() : Operator(OperatorType::kFullyConnected) {}
-  FullyConnectedWeightsFormat weights_format =
-      FullyConnectedWeightsFormat::kDefault;
-};
-
-// Dequantization operator, converting a quantized array of integers with
-// quantization parameters specifying how these integers correspond to real
-// numbers
-// (see QuantizationParams) to an output activations array of floating-point
-// values.
-//
-// In floating-point image models, there is typically a Dequantization operator
-// at the very beginning, converting the input image RGB data, consisting of
-// uint8 integer values, to floating-point input activations. That is where
-// image model parameters such as "mean_value" and "std_value" are typically
-// handled.
-//
-// This is the only operator type that converts from quantized to
-// floating-point,
-// and there is at the moment no operator type at all to convert from
-// floating-point
-// to quantized. Every other operator does either float->float or
-// quantized->quantized.
-//
-// Inputs:
-//   inputs[0]: required: the input quantized activations array
-//
-// TensorFlow equivalent: Dequantize
-struct DequantizeOperator : Operator {
-  DequantizeOperator() : Operator(OperatorType::kDequantize) {}
-};
-
-// Batch-normalization operator.
-//
-// We only support batch-normalization using pre-learned moments, so this is
-// just
-// computing (input - mean) * multiplier + offset. As such, this can be
-// expressed as a combination of Add and Mul nodes, and indeed this is how
-// we break it down during tooling for the purpose of fusing it into
-// other operators.
-//
-// Inputs:
-//   inputs[0]: required: the input activations array
-//   inputs[1]: required: the learned mean array
-//   inputs[2]: required: the learned multiplier array
-//   inputs[3]: required: the learned offset array
-//
-// TensorFlow equivalent: a combination of Add and Mul nodes
-struct BatchNormalizationOperator : Operator {
-  BatchNormalizationOperator()
-      : Operator(OperatorType::kBatchNormalization),
-        global_normalization(false) {}
-  bool global_normalization;
-};
-
-// L2-normalization operator.
-//
-// Inputs:
-//   inputs[0]: required: the input activations array
-//
-// TensorFlow equivalent: none. In TensorFlow, L2 normalization is implemented
-// by a sub-graph of operators implementing L2-normalization
-// from lower-level arithmetic nodes; during tooling, we identify such
-// sub-graphs
-// and replace them by L2NormalizationOperator's. See IdentifyL2Normalization.
-struct L2NormalizationOperator : Operator {
-  L2NormalizationOperator() : Operator(OperatorType::kL2Normalization) {}
-};
-
-// LSTM Cell operator.
-//
-// Inputs:
-//   inputs[0]: required: the input data array
-//   inputs[1]: required: the previous output activations array
-//   inputs[2]: required: the learned weights array
-//   inputs[3]: required: the learned biases array
-//   inputs[4]: required: the previous output state
-//   outputs[0]: required: the output activations array
-//   outputs[1]: required: the new state array
-//
-// TensorFlow equivalent: none. In TensorFlow, an LSTM is implemented
-// with a sub-graph of lower-level arithmetic nodes; during tooling, we identify
-// such sub-graphs and replace them with LstmCells. See IdentifyLstmCell().
-struct LstmCellOperator : Operator {
-  enum Inputs {
-    DATA_INPUT = 0,
-    PREV_ACTIV_INPUT = 1,
-    WEIGHTS_INPUT = 2,
-    BIASES_INPUT = 3,
-    PREV_STATE_INPUT = 4,
-    NUM_INPUTS = 5
-  };
-  enum Outputs {
-    ACTIV_OUTPUT = 0,
-    STATE_OUTPUT = 1,
-    CONCAT_TEMP = 2,
-    ACTIV_TEMP = 3,
-    NUM_OUTPUTS = 4
-  };
-  enum KernelType {
-    KERNEL_BASIC = 0,
-    KERNEL_FULL = 1,
-  };
-
-  LstmCellOperator()
-      : Operator(OperatorType::kLstmCell), kernel_type(KERNEL_BASIC) {}
-
-  KernelType kernel_type;
-};
-
-// Element-wise multiplication operator.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side array
-//   inputs[1]: required: the right-hand side array
-//
-// TensorFlow equivalent: Mul
-struct MulOperator : Operator {
-  MulOperator() : Operator(OperatorType::kMul) {}
-};
-
-// Element-wise Relu operator:
-//   x -> max(0, x)
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Relu
-struct ReluOperator : Operator {
-  ReluOperator() : Operator(OperatorType::kRelu) {}
-};
-
-// Element-wise Relu1 operator:
-//   x -> min(max(x, -1), 1)
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: none. We can construct the operator with Minimum
-// and Maximum operations
-struct Relu1Operator : Operator {
-  Relu1Operator() : Operator(OperatorType::kRelu1) {}
-};
-
-// Element-wise Relu6 operator:
-//   x -> max(0, min(6, x))
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Relu6
-struct Relu6Operator : Operator {
-  Relu6Operator() : Operator(OperatorType::kRelu6) {}
-};
-
-// PRelu
-//   f(x) = alpha * x for x < 0, f(x) = x for x >= 0.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: the alpha array
-//
-// Equivalent to keras.layers.PReLU.
-struct PReluOperator : Operator {
-  PReluOperator() : Operator(OperatorType::kPRelu) {}
-};
-
-// Element-wise Logistic operator:
-//   x -> Logistic(x) = 1 / (1 + exp(-x))
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Sigmoid
-struct LogisticOperator : Operator {
-  LogisticOperator() : Operator(OperatorType::kLogistic) {}
-};
-
-// Element-wise natural log operator:
-//   x -> ln(x)
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Log
-struct LogOperator : Operator {
-  LogOperator() : Operator(OperatorType::kLog) {}
-};
-
-// Element-wise Tanh operator:
-//   x -> Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Tanh
-struct TanhOperator : Operator {
-  TanhOperator() : Operator(OperatorType::kTanh) {}
-};
-
-// Element-wise Sin operator:
-//   x -> Sin(x) = sin(x)
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Sin
-struct SinOperator : Operator {
-  SinOperator() : Operator(OperatorType::kSin) {}
-};
-
-// Element-wise addition operator.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side array
-//   inputs[1]: required: the right-hand side array
-//
-// TensorFlow equivalent: Add
-struct AddOperator : Operator {
-  AddOperator() : Operator(OperatorType::kAdd) {}
-};
-
-// Element-wise addition operator for N inputs.
-//
-// Inputs:
-//   inputs[i]: The i-th array to add together to form the output.
-//
-// TensorFlow equivalent: AddN
-struct AddNOperator : Operator {
-  AddNOperator() : Operator(OperatorType::kAddN) {}
-};
-
-// Concatenation operator: concatenates its inputs
-// along the axis.
-//
-// Inputs: this operator accepts any number >= 1 of inputs.
-//   inputs[i]: the i-th array to concatenate.
-//
-// TensorFlow equivalent: Concat.
-struct ConcatenationOperator : Operator {
-  ConcatenationOperator() : Operator(OperatorType::kConcatenation) {}
-  int axis = 0;
-};
-
-// Reordering dimensions. Used only during tooling to transform graphs from
-// the TensorFlow format.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: none. This is only useful to convert between formats.
-struct ReorderAxesOperator : Operator {
-  ReorderAxesOperator() : Operator(OperatorType::kReorderAxes) {}
-  AxesOrder input_axes_order;
-  AxesOrder output_axes_order;
-};
-
-// Average-pooling operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: AveragePool
-struct AveragePoolOperator : Operator {
-  AveragePoolOperator() : Operator(OperatorType::kAveragePool) {}
-  Padding padding;
-  int stride_height = 0;
-  int stride_width = 0;
-  int kheight = 0;
-  int kwidth = 0;
-};
-
-// Local response normalization operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: LRN
-struct LocalResponseNormalizationOperator : Operator {
-  LocalResponseNormalizationOperator()
-      : Operator(OperatorType::kLocalResponseNormalization) {}
-
-  int range = 0;
-  float bias = 0.f;
-  float alpha = 0.f;
-  float beta = 0.f;
-};
-
-// Max-pooling operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: MaxPool
-struct MaxPoolOperator : Operator {
-  MaxPoolOperator() : Operator(OperatorType::kMaxPool) {}
-  Padding padding;
-  int stride_height = 0;
-  int stride_width = 0;
-  int kheight = 0;
-  int kwidth = 0;
-};
-
-// L2-pooling operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: none. Can be shimmed by squaring+avgpool+sqrt.
-struct L2PoolOperator : Operator {
-  L2PoolOperator() : Operator(OperatorType::kL2Pool) {}
-  Padding padding;
-  int stride_height = 0;
-  int stride_width = 0;
-  int kheight = 0;
-  int kwidth = 0;
-};
-
-// The expected [min, max] range of values in a given array.
-// Used for quantization only.
-// This information typically comes from special nodes found in quantized
-// models, see FakeQuantOperator, and is used during quantization to resolve
-// actual quantization parameters (see QuantizationParams).
-struct MinMax {
-  double min = 0.;
-  double max = 0.;
-};
-
-inline bool operator==(const MinMax& m1, const MinMax& m2) {
-  return m1.min == m2.min && m1.max == m2.max;
-}
-
-// Fake-quantization operator. This does two things:
-//   - Annotate its input and output arrays with MinMax information,
-//   - Arithmetic-wise, this operator rounds incoming activation values
-//     to the nearest representable value on the scale of 256
-//     values from the min to the max value dictated by its MinMax info.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: optional: the 'min' value, if it has not yet been resolved
-//              to a constant.
-//   inputs[2]: optional: the 'max' value, if it has not yet been resolved
-//              to a constant.
-//
-// TensorFlow equivalent: FakeQuantWithMinMaxVars, FakeQuantWithMinMaxArgs.
-struct FakeQuantOperator : Operator {
-  FakeQuantOperator() : Operator(OperatorType::kFakeQuant) {}
-  std::unique_ptr<MinMax> minmax;
-  int num_bits = 8;
-  bool narrow_range = false;
-};
-
-// Element-wise division operator.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side array
-//   inputs[1]: required: the right-hand side array
-//
-// TensorFlow equivalent: Div
-struct DivOperator : Operator {
-  DivOperator() : Operator(OperatorType::kDiv) {}
-};
-
-// Element-wise identity (x->x) operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Identity
-struct TensorFlowIdentityOperator : Operator {
-  TensorFlowIdentityOperator() : Operator(OperatorType::kIdentity) {}
-};
-
-// Batch matrix multiplication operator. This comes from the (deprecated)
-// tf.batch_matmul or a tf.matmul that has rank 3. dims(0) is the batch count
-// and it can be trivially unrolled into a series of matmuls on each element.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side matrix
-//   inputs[1]: required: the right-hand side matrix
-//
-// TensorFlow equivalent: MatMul
-struct BatchMatMulOperator : Operator {
-  BatchMatMulOperator() : Operator(OperatorType::kBatchMatMul) {}
-};
-
-// General matrix multiplication operator. We don't want to support general
-// matrix multiplication at inference time, so we resolve it during tooling
-// to more specific operator types, namely, FullyConnected.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side matrix
-//   inputs[1]: required: the right-hand side matrix
-//
-// TensorFlow equivalent: MatMul
-struct TensorFlowMatMulOperator : Operator {
-  TensorFlowMatMulOperator() : Operator(OperatorType::kMatMul) {}
-  bool transpose_a = false;
-  bool transpose_b = false;
-};
-
-// Padding operator. Pads a tensor with zeros.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: the padding array
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of
-// `input` in that dimension.
-//
-// TensorFlow equivalent: Pad
-struct PadOperator : Operator {
-  PadOperator() : Operator(OperatorType::kPad) {}
-
-  std::vector<int> left_padding;
-  std::vector<int> right_padding;
-};
-
-// PaddingV2 operator. Pads a tensor with the given constant value.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: the padding array
-//   inputs[2]: required: the scalar constant_values
-//
-// This operation pads input according to the paddings and constant_values you
-// specify. paddings is an integer tensor with shape [Dn, 2], where n is the
-// rank of input. For each dimension D of input, paddings[D, 0] indicates how
-// many padding values to add before the contents of input in that dimension,
-// and paddings[D, 1] indicates how many padding values to add after the
-// contents of input in that dimension. constant_values is a scalar tensor of
-// the same type as input that indicates the value to use for padding input.
-//
-// TensorFlow equivalent: PadV2
-struct PadV2Operator : Operator {
-  PadV2Operator() : Operator(OperatorType::kPadV2) {}
-
-  std::vector<int> left_padding;
-  std::vector<int> right_padding;
-};
-
-// Strided slice operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: the begin array
-//   inputs[2]: required: the end array
-//   inputs[3]: optional: the strides array
-//
-// TensorFlow equivalent: StridedSlice
-struct StridedSliceOperator : Operator {
-  StridedSliceOperator() : Operator(OperatorType::kStridedSlice) {}
-
-  std::vector<int> start_indices;
-  std::vector<int> stop_indices;
-  std::vector<int> strides;
-
-  int begin_mask;
-  int ellipsis_mask;
-  int end_mask;
-  int new_axis_mask;
-  int shrink_axis_mask;
-
-  StridedSliceOperator(const StridedSliceOperator& other)
-      : Operator(OperatorType::kStridedSlice) {
-    inputs = other.inputs;
-    outputs = other.outputs;
-
-    start_indices = other.start_indices;
-    stop_indices = other.stop_indices;
-    strides = other.strides;
-
-    begin_mask = other.begin_mask;
-    ellipsis_mask = other.ellipsis_mask;
-    end_mask = other.end_mask;
-    new_axis_mask = other.new_axis_mask;
-    shrink_axis_mask = other.shrink_axis_mask;
-  }
-
-  void PadIndices(int dim_count) {
-    // Add indices and mask bits to fully include extra dimensions
-    CHECK_GE(dim_count, start_indices.size());
-    CHECK_EQ(start_indices.size(), stop_indices.size());
-    CHECK_EQ(stop_indices.size(), strides.size());
-
-    for (int i = start_indices.size(); i < dim_count; i++) {
-      start_indices.push_back(0);
-      stop_indices.push_back(0);
-      strides.push_back(1);
-      begin_mask |= 1 << i;
-      end_mask |= 1 << i;
-    }
-  }
-
-  void ReverseIndices() {
-    CHECK_EQ(start_indices.size(), stop_indices.size());
-    CHECK_EQ(stop_indices.size(), strides.size());
-
-    std::reverse(start_indices.begin(), start_indices.end());
-    std::reverse(stop_indices.begin(), stop_indices.end());
-    std::reverse(strides.begin(), strides.end());
-
-    begin_mask = toco::port::ReverseBits32(static_cast<uint32>(begin_mask)) >>
-                 (32 - start_indices.size());
-    ellipsis_mask =
-        toco::port::ReverseBits32(static_cast<uint32>(ellipsis_mask)) >>
-        (32 - start_indices.size());
-    end_mask = toco::port::ReverseBits32(static_cast<uint32>(end_mask)) >>
-               (32 - start_indices.size());
-    new_axis_mask =
-        toco::port::ReverseBits32(static_cast<uint32>(new_axis_mask)) >>
-        (32 - start_indices.size());
-    shrink_axis_mask =
-        toco::port::ReverseBits32(static_cast<uint32>(shrink_axis_mask)) >>
-        (32 - start_indices.size());
-  }
-};
-
-// Reshaping operator, reshaping its input array to a two-dimensional shape
-// (a "matrix"). This is used in the TensorFlow format, in conjunction with
-// MatMul nodes, to implement fully-connected layers.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Reshape --- except that we only support a special case
-// here, where the output shape is a matrix (2D) shape.
-struct TensorFlowReshapeOperator : Operator {
-  TensorFlowReshapeOperator() : Operator(OperatorType::kReshape) {}
-  std::vector<int> shape;
-};
-
-// Removes dimensions of size 1 from the shape of a tensor.
-// https://www.tensorflow.org/api_docs/python/tf/squeeze
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Squeeze
-struct SqueezeOperator : Operator {
-  SqueezeOperator() : Operator(OperatorType::kSqueeze) {}
-
-  std::vector<int> squeeze_dims;
-};
-
-// Inputs:
-//   inputs[0]: required: the output shape
-//   inputs[1]: required: the weights
-//   inputs[2]: required: the input activations array
-//   NOTE: The input activations is NOT the first input.
-//
-//
-// Outputs:
-//   outputs[0]: required: the output activations array
-//
-// TensorFlow equivalent: Conv2DBackpropInput
-struct TransposeConvOperator : Operator {
-  enum Inputs {
-    OUTPUT_SHAPE = 0,
-    WEIGHTS = 1,
-    DATA_INPUT = 2,
-  };
-
-  TransposeConvOperator() : Operator(OperatorType::kTransposeConv) {}
-  Padding padding;
-  int stride_width = 0;
-  int stride_height = 0;
-  // Dilation is possible with transpose convolution, but Tensorflow does not
-  // currently support it, so we omit it.
-};
-
-// Given a tensor input, this operation calculates element-wise exponential
-// (y = e^x).
-//
-// Inputs:
-//   inputs[0]: required: input tensor
-//
-// TensorFlow equivalent: Exp
-struct ExpOperator : Operator {
-  ExpOperator() : Operator(OperatorType::kExp) {}
-};
-
-// Given a tensor input, this operation inserts a dimension of 1 at the
-// dimension index axis of input's shape. The dimension index axis starts at
-// zero; if you specify a negative number for axis it is counted backward from
-// the end.
-//
-// Inputs:
-//   inputs[0]: required: input tensor
-//   inputs[1]: required: 0-D (scalar). Specifies the dimension index at which
-//   to expand the shape of input
-//
-// TensorFlow equivalent: ExpandDims
-struct ExpandDimsOperator : Operator {
-  ExpandDimsOperator() : Operator(OperatorType::kExpandDims) {}
-};
-
-// Ceates a tensor of shape dims and fills it with the given scalar value.
-// Output type will be the same as the given scalar value.
-//
-// Inputs:
-//   inputs[0]: required: 1-D (int32) - the shape of the output tensor
-//   inputs[1]: required: 0-D (scalar) - value to fill the tensor with
-//
-// TensorFlow equivalent: Fill
-struct FillOperator : Operator {
-  FillOperator() : Operator(OperatorType::kFill) {}
-};
-
-// Element-wise floor division operator.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side array
-//   inputs[1]: required: the right-hand side array
-//
-// TensorFlow equivalent: FloorDiv
-struct FloorDivOperator : Operator {
-  FloorDivOperator() : Operator(OperatorType::kFloorDiv) {}
-};
-
-// Element-wise floor mod operator.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side array
-//   inputs[1]: required: the right-hand side array
-//
-// TensorFlow equivalent: FloorMod
-struct FloorModOperator : Operator {
-  FloorModOperator() : Operator(OperatorType::kFloorMod) {}
-};
-
-struct RandomUniformOperator : Operator {
-  RandomUniformOperator() : Operator(OperatorType::kRandomUniform) {}
-  ArrayDataType dtype = ArrayDataType::kNone;
-  int64 seed;
-  int64 seed2;
-};
-
-// Creates a sequence of numbers that begins at start and extends by increments
-// of delta up to but not including limit.
-//
-// The dtype of the resulting tensor is inferred from the inputs unless it is
-// provided explicitly.
-//
-// Inputs:
-//   inputs[0]: required: the start
-//   inputs[1]: required: the limit
-//   inputs[2]: required: the delta
-//
-// TensorFlow equivalent: Range
-struct RangeOperator : Operator {
-  RangeOperator() : Operator(OperatorType::kRange) {}
-  ArrayDataType dtype = ArrayDataType::kNone;
-};
-
-// Rank operator. Extracts the rank of the tensor.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// This operation outputs a 0-D integer tensor representing the rank of
-// the input.
-//
-// TensorFlow equivalent: Rank.  We currently assume that the output is int32
-// and not int64.  The output type could be stored herein.
-struct RankOperator : Operator {
-  RankOperator() : Operator(OperatorType::kRank) {}
-};
-
-// Element-wise negation (-x) operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Neg
-struct NegOperator : Operator {
-  NegOperator() : Operator(OperatorType::kNeg) {}
-};
-
-// Element-wise select operator choosing elements from inputs[1] or input[2]
-//
-// Inputs:
-//  inputs[0]: required: boolean mask per index
-//  inputs[1]: required: tensor of values if true
-//  inputs[2]: required: tensor of values if false
-//
-//  TensorFlow equivalent: Select
-struct SelectOperator : Operator {
-  SelectOperator() : Operator(OperatorType::kSelect) {}
-};
-
-// Element-wise reciprocal-square-root (x^-0.5) operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Rsqrt
-struct TensorFlowRsqrtOperator : Operator {
-  TensorFlowRsqrtOperator() : Operator(OperatorType::kRsqrt) {}
-};
-
-// Stacks a list of rank-R tensors into one rank-(R+1) tensor.
-//
-// Packs the list of tensors in values into a tensor with rank one higher than
-// each tensor in values, by packing them along the axis dimension. Given a list
-// of length N of tensors of shape (A, B, C);.
-//
-// Inputs: this operator accepts any number >= 1 of inputs.
-//   inputs[i]: the i-th array to merge.
-//
-// TensorFlow equivalent: Pack
-struct PackOperator : Operator {
-  PackOperator() : Operator(OperatorType::kPack) {}
-  int values_count;
-  int axis = 0;
-  ArrayDataType dtype = ArrayDataType::kNone;
-};
-
-// Shape operator. Extracts the shape of the tensor.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// This operation outputs a 1-D integer tensor representing the shape of
-// the input.
-//
-// TensorFlow equivalent: Shape.
-struct TensorFlowShapeOperator : Operator {
-  TensorFlowShapeOperator() : Operator(OperatorType::kShape) {}
-  ArrayDataType output_data_type = ArrayDataType::kInt32;
-};
-
-// Element-wise square-root (x^0.5) operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Sqrt
-struct TensorFlowSqrtOperator : Operator {
-  TensorFlowSqrtOperator() : Operator(OperatorType::kSqrt) {}
-};
-
-// Element-wise square (x*x) operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Square
-struct TensorFlowSquareOperator : Operator {
-  TensorFlowSquareOperator() : Operator(OperatorType::kSquare) {}
-};
-
-// Transposes a tensor.
-//
-// By default, this operation performs a regular matrix transpose on 2-D input
-// tensors.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Transpose
-struct TransposeOperator : Operator {
-  TransposeOperator() : Operator(OperatorType::kTranspose) {}
-  std::vector<int> perm;
-};
-
-// Element-wise subtraction operator.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side array
-//   inputs[1]: required: the right-hand side array
-//
-// TensorFlow equivalent: Sub
-struct SubOperator : Operator {
-  SubOperator() : Operator(OperatorType::kSub) {}
-};
-
-// Sum reduction: computes the sum of all of entries across the axes.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Sum
-struct TensorFlowSumOperator : Operator {
-  TensorFlowSumOperator() : Operator(OperatorType::kSum) {}
-  std::vector<int> axis;
-  bool keep_dims = false;
-};
-
-// Prod reduction: computes the product of all of entries across the axes.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Prod
-struct TensorFlowProdOperator : Operator {
-  TensorFlowProdOperator() : Operator(OperatorType::kReduceProd) {}
-  std::vector<int> axis;
-  bool keep_dims = false;
-};
-
-// TensorFlow Tile equivalent. Refer to TensorFlow documentation for details.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: int array with length of rank(input[0])
-struct TensorFlowTileOperator : Operator {
-  TensorFlowTileOperator() : Operator(OperatorType::kTile) {}
-};
-
-// TensorFlow Slice equivalent. Refer to TensorFlow documentation for details.
-struct SliceOperator : Operator {
-  SliceOperator() : Operator(OperatorType::kSlice) {}
-
-  std::vector<int> begin;
-  std::vector<int> size;
-};
-
-// TensorFlow Split equivalent. Refer to TensorFlow documentation for details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-struct TensorFlowSplitOperator : Operator {
-  TensorFlowSplitOperator() : Operator(OperatorType::kSplit) {}
-  int num_split = 0;
-};
-
-// TensorFlow Concat equivalent. Refer to TensorFlow documentation for details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-// Concretely, once the concat dim becomes known, if it is the depth
-// dimension then we can change this op into a DepthConcatenation op.
-// Otherwise, we hope for some other graph transformation to drop this node.
-struct TensorFlowConcatOperator : Operator {
-  TensorFlowConcatOperator() : Operator(OperatorType::kConcat) {}
-};
-
-// TensorFlow ConcatV2 equivalent. Refer to TensorFlow documentation for
-// details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-// Concretely, once the concat dim becomes known, if it is the depth
-// dimension then we can change this op into a DepthConcatenation op.
-// Otherwise, we hope for some other graph transformation to drop this node.
-struct TensorFlowConcatV2Operator : Operator {
-  TensorFlowConcatV2Operator() : Operator(OperatorType::kConcatV2) {}
-};
-
-// TensorFlow Merge equivalent. Refer to TensorFlow documentation for details.
-//
-// Inputs: this operator accepts any number >= 1 of inputs.
-//   inputs[i]: the i-th array to merge.
-//
-// It is expected that graph transformations will drop all but exactly one
-// of the inputs, at which point the Merge node will be equivalent to an
-// Identity node forwarding the remaining input.
-//
-// Note: We do not currently support runtime control flow: we only support
-// control flow that can be resolved at tooling time (independently of input
-// activations).
-struct TensorFlowMergeOperator : Operator {
-  TensorFlowMergeOperator() : Operator(OperatorType::kMerge) {}
-};
-
-// TensorFlow Switch equivalent. Refer to TensorFlow documentation for details.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: the boolean predicate, given as an array of size 1
-//     and of type kBool, will determine which output gets selected.
-//
-// Outputs: a TensorFlow Switch node always has exactly two outputs. Depending
-// on the boolean value that the input predicate resolves to (see note below),
-// one or the other of the outputs will be 'selected': the input array will be
-// forwarded to the 'selected output' as if by a Identity node, while the other
-// output will be discarded, and any graph edge connecting that discarded output
-// will be dropped. The rule for selecting outputs is as follows:
-//   outputs[0] will be selected if the input predicate resolves to 'true'.
-//   outputs[1] will be selected if the input predicate resolves to 'false'.
-//
-// Note: We do not currently support runtime control flow: we only support
-// control flow that can be resolved at tooling time (independently of input
-// activations).
-struct TensorFlowSwitchOperator : Operator {
-  TensorFlowSwitchOperator() : Operator(OperatorType::kSwitch) {}
-};
-
-// TensorFlow All equivalent. Refer to TensorFlow documentation for details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-// Typically, this is only used as an input to an Assert node, so can be
-// removed as an unused node as we drop Assert nodes.
-struct TensorFlowAllOperator : Operator {
-  TensorFlowAllOperator() : Operator(OperatorType::kAll) {}
-};
-
-// TensorFlow Assert equivalent. Refer to TensorFlow documentation for details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-// Typically, we just drop Assert nodes.
-struct TensorFlowAssertOperator : Operator {
-  TensorFlowAssertOperator() : Operator(OperatorType::kAssert) {}
-};
-
-// TensorFlow Less equivalent. Refer to TensorFlow documentation for details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-// Typically, this is only used as an input to an Assert node, so can be
-// removed as an unused node as we drop Assert nodes.
-struct TensorFlowLessOperator : Operator {
-  TensorFlowLessOperator() : Operator(OperatorType::kLess) {}
-};
-
-// TensorFlow LessEqual equivalent. Refer to TensorFlow documentation for
-// details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-// Typically, this is only used as an input to an Assert node, so can be
-// removed as an unused node as we drop Assert nodes.
-struct TensorFlowLessEqualOperator : Operator {
-  TensorFlowLessEqualOperator() : Operator(OperatorType::kLessEqual) {}
-};
-
-// TensorFlow Less equivalent. Refer to TensorFlow documentation for details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-// Typically, this is only used as an input to an Assert node, so can be
-// removed as an unused node as we drop Assert nodes.
-struct TensorFlowGreaterOperator : Operator {
-  TensorFlowGreaterOperator() : Operator(OperatorType::kGreater) {}
-};
-
-// TensorFlow GreaterEqual equivalent. Refer to TensorFlow documentation for
-// details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-// Typically, this is only used as an input to an Assert node, so can be
-// removed as an unused node as we drop Assert nodes.
-struct TensorFlowGreaterEqualOperator : Operator {
-  TensorFlowGreaterEqualOperator() : Operator(OperatorType::kGreaterEqual) {}
-};
-
-// TensorFlow Equal equivalent. Refer to TensorFlow documentation for
-// details.
-// Not fully supported, just a placeholder to handle TensorFlow graphs and
-// support graph transformations to other operator types by matching sub-graphs.
-// Typically, this is only used as an input to an Assert node, so can be
-// removed as an unused node as we drop Assert nodes.
-struct TensorFlowEqualOperator : Operator {
-  TensorFlowEqualOperator() : Operator(OperatorType::kEqual) {}
-};
-
-// TensorFlow Not Equal equivalent. Refer to TensorFlow documentation for
-// details.
-struct TensorFlowNotEqualOperator : Operator {
-  TensorFlowNotEqualOperator() : Operator(OperatorType::kNotEqual) {}
-};
-
-// Max reduction: computes the max of all of entries across the axes.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Max
-struct TensorFlowMaxOperator : Operator {
-  TensorFlowMaxOperator() : Operator(OperatorType::kReduceMax) {}
-  std::vector<int> axis;
-  bool keep_dims = false;
-};
-
-// Min reduction: computes the min of all of entries across the axes.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Min
-struct TensorFlowMinOperator : Operator {
-  TensorFlowMinOperator() : Operator(OperatorType::kReduceMin) {}
-  std::vector<int> axis;
-  bool keep_dims = false;
-};
-
-// Element-wise maximum operator. Currently it only supports scalar as
-// the second operand.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side array
-//   inputs[1]: required: the right-hand side array
-//
-// TensorFlow equivalent: Maximum
-struct TensorFlowMaximumOperator : Operator {
-  TensorFlowMaximumOperator() : Operator(OperatorType::kMaximum) {}
-};
-
-// Element-wise minimum operator. Currently it only supports scalar as
-// the second operand.
-//
-// Inputs:
-//   inputs[0]: required: the left-hand side array
-//   inputs[1]: required: the right-hand side array
-//
-// TensorFlow equivalent: Minimum
-struct TensorFlowMinimumOperator : Operator {
-  TensorFlowMinimumOperator() : Operator(OperatorType::kMinimum) {}
-};
-
-// General TF operation, unsupported by tf.mini. Expected to be dropped by
-// graph transformations.
-struct TensorFlowUnsupportedOperator : Operator {
-  TensorFlowUnsupportedOperator() : Operator(OperatorType::kUnsupported) {}
-
-  // The original TF operation type. Used for diagnostic purposes.
-  string tensorflow_op;
-  // A serialized tensorflow::NodeDef string.
-  string tensorflow_node_def;
-  // A boolean indicating if the unsupported op should be treated as quantized.
-  bool quantized = false;
-  // A boolean indicating if the unsupported op output should allow float values
-  // in quantized mode.
-  bool support_output_type_float_in_quantized_op = false;
-  // Output data types
-  std::vector<ArrayDataType> output_data_types;
-  // Output shapes.
-  std::vector<Shape> output_shapes;
-};
-
-// Softmax activation function.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Softmax
-struct SoftmaxOperator : Operator {
-  SoftmaxOperator() : Operator(OperatorType::kSoftmax) {}
-  float beta = 0.f;
-};
-
-// LogSoftmax activation function.
-//
-// Inputs:
-//   inputs[0]: required: the logits input array
-//
-// TensorFlow equivalent: LogSoftmax
-struct LogSoftmaxOperator : Operator {
-  LogSoftmaxOperator() : Operator(OperatorType::kLogSoftmax) {}
-
-  // LogSoftmax can in principal have very large negative output, depending on
-  // the input size.  However, input x_i that is less than x_max-10 is
-  // accumulated as exp(x_i-x_max), which is truncated to zero.
-  //
-  // Since we effectively disregard smallish inputs in the normalizing factor,
-  // we also drop them in the output (set to minimum output), and in doing so
-  // make better use of the quantization range / resolution.
-  static constexpr float kOutputRangeMin = -16.0;
-};
-
-// Cast operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Cast
-struct CastOperator : Operator {
-  CastOperator() : Operator(OperatorType::kCast) {}
-  ArrayDataType src_data_type = ArrayDataType::kNone;
-  ArrayDataType dst_data_type = ArrayDataType::kNone;
-};
-
-// Floor operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Floor
-struct FloorOperator : Operator {
-  FloorOperator() : Operator(OperatorType::kFloor) {}
-};
-
-// Gather operator. It gathers slices from params according to indices.
-// Only 1-D indices are supported at the moment.
-//
-// Inputs:
-//   inputs[0]: required: the params array
-//   inputs[1]: required: the indices to gather
-//   inputs[2]: optional: axis
-//
-// TensorFlow equivalent: Gather
-struct GatherOperator : Operator {
-  GatherOperator() : Operator(OperatorType::kGather) {}
-  // Axis is populated explicitly or implicitly from the axis input by
-  // ResolveGatherAttributes. An empty axis indicates that the axis has not yet
-  // be resolved.
-  absl::optional<int> axis;
-  int input_rank = 0;
-};
-
-// ArgMax operator. It returns the index of the maximum value along axis.
-//
-// Inputs:
-//   inputs[0]: required: the input tensor
-//
-// TensorFlow equivalent: ArgMax
-struct ArgMaxOperator : Operator {
-  ArgMaxOperator() : Operator(OperatorType::kArgMax) {}
-  ArrayDataType output_data_type = ArrayDataType::kInt64;
-};
-
-// ArgMin operator. It returns the index of the minimum value along axis.
-//
-// Inputs:
-//   inputs[0]: required: the input tensor
-//
-// TensorFlow equivalent: ArgMin
-struct ArgMinOperator : Operator {
-  ArgMinOperator() : Operator(OperatorType::kArgMin) {}
-  ArrayDataType output_data_type = ArrayDataType::kInt64;
-};
-
-// ResizeBilinear operator. It resizes input images with bilinear interpolation.
-// It does not support align_corners at the moment.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: the new image size
-//
-// TensorFlow equivalent: ResizeBilinear
-struct ResizeBilinearOperator : Operator {
-  ResizeBilinearOperator() : Operator(OperatorType::kResizeBilinear) {}
-
-  bool align_corners = false;
-};
-
-// SpaceToBatchND operator. It divides spatial dimensions into a grid of
-// blocks and interleaves these blocks with the batch dimension. Currently,
-// only 2-d blocks are supported.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: the block shape
-//   inputs[2]: required: the paddings
-//
-// TensorFlow equivalent: SpaceToBatchND
-struct SpaceToBatchNDOperator : Operator {
-  SpaceToBatchNDOperator() : Operator(OperatorType::kSpaceToBatchND) {}
-
-  std::vector<int> block_shape;
-  std::vector<int> before_paddings;
-  std::vector<int> after_paddings;
-};
-
-// BatchToSpaceND operator. Rearranges data from batch into blocks of
-// spatial data. Currently, only 2-d blocks are supported.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: the block shape
-//   inputs[2]: required: the crops
-//
-// TensorFlow equivalent: BatchToSpaceND
-struct BatchToSpaceNDOperator : Operator {
-  BatchToSpaceNDOperator() : Operator(OperatorType::kBatchToSpaceND) {}
-
-  std::vector<int> block_shape;
-  std::vector<int> before_crops;
-  std::vector<int> after_crops;
-};
-
-// Mean operator.
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//
-// TensorFlow equivalent: Mean
-struct MeanOperator : Operator {
-  MeanOperator() : Operator(OperatorType::kMean) {}
-
-  std::vector<int> axis;
-  bool keep_dims = false;
-};
-
-// Svdf operator:
-//
-// Inputs:
-//   inputs[0]: required: the input array
-//   inputs[1]: required: weights_feature
-//   inputs[2]: required: weights_time
-//   inputs[3]: optional: bias
-struct SvdfOperator : Operator {
-  SvdfOperator() : Operator(OperatorType::kSvdf) {}
-  int rank;
-};
-
-// TopKV2 operator.
-//
-// Inputs:
-//    input tensor and top_k scalar.
-struct TopKV2Operator : Operator {
-  TopKV2Operator() : Operator(OperatorType::kTopK_V2) {}
-};
-
-// DynamicPartition operator:
-//
-// Inputs:
-//  inputs[0]: required: data.
-//  inputs[1]: required: partitions.
-//
-// TensorFlow equivalent: DynamicPartition
-struct DynamicPartitionOperator : Operator {
-  DynamicPartitionOperator() : Operator(OperatorType::kDynamicPartition) {}
-  int num_partitions;
-};
-
-// DynamicStitch operator:
-//
-// Inputs:
-//  inputs[0,N): required: indices.
-//  inputs[N,2N): required: data.
-//
-// TensorFlow equivalent: DynamicStitch/ParallelDynamicStitch
-struct DynamicStitchOperator : Operator {
-  DynamicStitchOperator() : Operator(OperatorType::kDynamicStitch) {}
-  int num_partitions;
-};
-
-// SparseToDense operator:
-//
-// Inputs:
-// Inputs[0]: required: sparse_indices.
-// Inputs[1]: required: output_shape.
-// Inputs[2]: required: sparse_values.
-//
-// TensorFlow equivalent: SparseToDense.
-struct SparseToDenseOperator : Operator {
-  SparseToDenseOperator() : Operator(OperatorType::kSparseToDense) {}
-  bool validate_indices;
-};
-
-// Pow operator:
-//
-// Inputs:
-// Inputs[0]: required: A tensor.
-// Inputs[1]: required: A tensor.
-//
-// TensorFlow equivalent: Pow.
-struct PowOperator : Operator {
-  PowOperator() : Operator(OperatorType::kPow) {}
-};
-
-// Any operator:
-//
-// Inputs:
-// Inputs[0]: required: A boolean input tensor.
-// Inputs[1]: required: reduction_indices.
-//
-// TensorFlow equivalent: tf.reduce_any.
-struct TensorFlowAnyOperator : Operator {
-  TensorFlowAnyOperator() : Operator(OperatorType::kAny) {}
-  std::vector<int> axis;
-  bool keep_dims = false;
-};
-
-// LogicalAnd operator:
-//
-// Inputs:
-// Inputs[0]: required: A boolean tensor.
-// Inputs[1]: required: A boolean tensor.
-//
-// TensorFlow equivalent: tf.logical_and.
-struct LogicalAndOperator : Operator {
-  LogicalAndOperator() : Operator(OperatorType::kLogicalAnd) {}
-};
-
-// LogicalNot operator:
-//
-// Inputs:
-// Inputs[0]: required: A boolean tensor.
-//
-// TensorFlow equivalent: tf.logical_not.
-struct LogicalNotOperator : Operator {
-  LogicalNotOperator() : Operator(OperatorType::kLogicalNot) {}
-};
-
-// OneHot operator:
-//
-// Inputs:
-// Inputs[0]: required: indices.
-// Inputs[1]: required: depth.
-// Inputs[2]: required: on_value.
-// Inputs[3]: required: off_value.
-//
-// TensorFlow equivalent: OneHot.
-struct OneHotOperator : Operator {
-  enum Inputs {
-    INDICES_INPUT = 0,
-    DEPTH_INPUT = 1,
-    ON_VALUE_INPUT = 2,
-    OFF_VALUE_INPUT = 3,
-  };
-
-  OneHotOperator() : Operator(OperatorType::kOneHot) {}
-  int axis = -1;
-};
-
-// LogicalOr operator:
-//
-// Inputs:
-// Inputs[0]: required: A Bool tensor.
-// Inputs[1]: required: A Bool tensor.
-//
-// TensorFlow equivalent: LogicalOr.
-struct LogicalOrOperator : Operator {
-  LogicalOrOperator() : Operator(OperatorType::kLogicalOr) {}
-};
-
-// Unpack operator:
-//
-// Inputs:
-// Inputs[0]: required: A boolean input tensor.
-// Inputs[1]: required: reduction_indices.
-//
-// TensorFlow equivalent: tf.unstack.
-struct UnpackOperator : Operator {
-  UnpackOperator() : Operator(OperatorType::kUnpack) {}
-  int num;
-  int axis;
-  ArrayDataType dtype = ArrayDataType::kNone;
-};
-
-// ZerosLike operator:
-//
-// Inputs:
-// inputs[0]: required: the input array
-//
-// TensorFlow equivalent: tf.zeros_like
-struct TensorFlowZerosLikeOperator : Operator {
-  TensorFlowZerosLikeOperator() : Operator(OperatorType::kZerosLike) {}
-};
-
-// Alloc's are used for transient arrays only. An Alloc specifies which interval
-// of the "transient_data" workspace buffer passed to inference functions, is to
-// be used for the transient array at hand. The 'start' and 'end' values are
-// offsets from the start of the workspace buffer, expressed in bytes.
-struct Alloc {
-  int64 start = 0;
-  int64 end = 0;
-};
-
-inline bool operator<(const Alloc& a, const Alloc& b) {
-  return a.start < b.start;
-}
-
-// Array represents an array (either a constant parameter array or an
-// activations array) in a Model.
-struct Array {
-  template <ArrayDataType A>
-  const Buffer<A>& GetBuffer() const {
-    DCHECK(buffer);
-    DCHECK(buffer->type == A);
-    return *static_cast<const Buffer<A>*>(buffer.get());
-  }
-  template <ArrayDataType A>
-  Buffer<A>& GetMutableBuffer() {
-    if (!buffer) {
-      Buffer<A>* ptr = new Buffer<A>;
-      buffer = std::unique_ptr<GenericBuffer>(ptr);
-    }
-    DCHECK(buffer);
-    DCHECK(buffer->type == A);
-    return *static_cast<Buffer<A>*>(buffer.get());
-  }
-  Alloc& GetOrCreateAlloc() {
-    if (!alloc) {
-      alloc = std::unique_ptr<Alloc>(new Alloc);
-    }
-    return *alloc;
-  }
-  MinMax& GetOrCreateMinMax() {
-    if (!minmax) {
-      minmax = std::unique_ptr<MinMax>(new MinMax);
-    }
-    return *minmax;
-  }
-  MinMax& GetMinMax() const {
-    DCHECK(minmax);
-    return *minmax;
-  }
-  QuantizationParams& GetOrCreateQuantizationParams() {
-    if (!quantization_params) {
-      quantization_params =
-          std::unique_ptr<QuantizationParams>(new QuantizationParams);
-    }
-    return *quantization_params;
-  }
-  QuantizationParams& GetQuantizationParams() const {
-    DCHECK(quantization_params);
-    return *quantization_params;
-  }
-
-  // The data type of the actual elements of this array, that is:
-  //  - If there is a buffer (see 'buffer' member), it must be of the same
-  //    type.
-  //  - If there is no buffer, meaning that this is a runtime (i.e. activations)
-  //    array, then this specifies the type of elements that there will be
-  //    at runtime.
-  //
-  // Note that this only specifies the storage type of elements; this does
-  // not specify whether these are to be treated as 'real' or 'quantized'
-  // values.
-  // That is decided by whether the 'quantization_params' member is null.
-  ArrayDataType data_type = ArrayDataType::kNone;
-  // The final value that data_type should have at the end of graph
-  // transformations
-  ArrayDataType final_data_type = ArrayDataType::kNone;
-  // The dimensions of this array --- this specifies both sizes and strides
-  // (the storage layout).
-  //
-  // Issues with shape handling that remain include:
-  //   - No way to distinguish between 0-dimensional dims and missing dims.
-  //   - No way to describe dims that may be runtime-variable.
-  //   - Addressing of dims by integer index differs in different graph formats
-  //     (TensorFlow vs. other frameworks vs. what we have informally grown
-  //     within toco).
-  //     This is currently quite messy; see ReorderAxesOperator which is how we
-  //     bridge some of these discrepancies at the moment. This is overdue for
-  //     a redesign; I'm thinking that it would be nice to have more flexible
-  //     dims that allow mapping 1:1, cleanly, dims as they are in various
-  //     formats,
-  //     then explicitly convert between different conventions.
-
-  // Proto-style accessors
-  bool has_shape() const { return array_shape != nullptr; }
-  const Shape& shape() const {
-    CHECK(has_shape());
-    return *array_shape;
-  }
-  Shape* mutable_shape() {
-    if (!array_shape) {
-      array_shape.reset(new Shape);
-    }
-    return array_shape.get();
-  }
-  void copy_shape(const Shape& src_shape) { *mutable_shape() = src_shape; }
-  void clear_shape() { array_shape = nullptr; }
-
-  // The constant buffer backing this array. This is non-null if and only if
-  // this is a constant parameter array. Conversely, this is null for
-  // activations arrays.
-  //
-  // Note that this buffer is pure storage. In the case of quantized values,
-  // it only stores the quantized values, it does not know by itself about the
-  // quantization parameters necessary to interprete these values, that is
-  // in the separate 'quantization_params' field. In fact, this 'buffer' field
-  // does no even know whether values are quantized. It only has a data_type,
-  // which must equal the 'data_type' member here, and which only describes
-  // the storage type of element, does not tell whether they are quantized i.e.
-  // whether they are to be interpreted with quantization_params.
-  std::unique_ptr<GenericBuffer> buffer;
-  // Only for activation arrays (i.e. when 'buffer' is null).
-  // Only for code generation.
-  //
-  // Describes the allocation of this array within the workspace buffer
-  // allocated
-  // for all transient arrays.
-  std::unique_ptr<Alloc> alloc;
-  // Describes the [min, max] range of values
-  // to be assumed when determining quantization_params.
-  //
-  // Only used for quantization. In fact, only used for determining
-  // quantization_params.
-  //
-  // Used for both constant arrays (those having a 'buffer') and non-constant
-  // arrays (activations). Indeed, it is important to use the same min-max range
-  // as was used during training, even if that min-max range is slightly wrong
-  // w.r.t. actual buffer elements. Doing otherwise would defeat the point of
-  // re-training for quantization.
-  std::unique_ptr<MinMax> minmax;
-  // Quantization parameters. The non-null-ness of this pointer is what
-  // defines whether this array is quantized or not.
-  //
-  // If this is non-null, then these quantization parameters are to be used
-  // to assign a meaning as real numbers to the elements of this array.
-  std::unique_ptr<QuantizationParams> quantization_params;
-  // narrow_range is a detail of how toco handles FakeQuant operators with
-  // narrow_range, see
-  // https://www.tensorflow.org/api_docs/python/tf/fake_quant_with_min_max_vars
-  //
-  // For more context about what that is useful for, see the big comment in
-  // graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
-  //
-  // The narrow_range flag applies only to quantized arrays, and changes
-  // their quantization in the following way when it is set to 'true':
-  // 1. The computation of {zero_point, scale} from {min, max} needs to be
-  //    amended so that the real min value will get quantized to
-  //    (min_quantized_value + 1) instead of just (min_quantized_value).
-  //    E.g. for uint8 quantization, the real min value should get quantized to
-  //    the uint8 value 1, not 0.
-  // 2. Quantized values should get clamped to the interval
-  //    [min_quantized_value + 1, max_value]. Equivalently, the
-  //    min_quantized_value should get nudged to (min_quantized_value + 1).
-  // The reason why 1. does not imply 2. is that real values may not belong to
-  // the stated [min, max] interval. Concretely, weights recorded at the last
-  // learning step may not fall in the [min, max] interval recorded over
-  // previous learning steps, as the values evolve across learning steps.
-  //
-  // Rationale why this is directly a field on Array:
-  // - This can't be just a field on FakeQuantOperator, because
-  //   FakeQuantOperators are gone (DropFakeQuant) before we get to using that
-  //   information (Quantize). We need a place to store that bit in the interim.
-  // - This can't be in QuantizationParams because we need to record this
-  //   ahead of quantization, and QuantizationParams are only created during
-  //   quantization.
-  // - This could be in MinMax, but that would be an abuse of what MinMax is
-  //   about, and would break existing code that assumes that a MinMax is just
-  //   a min and a max. Unlike MinMax which is agnostic as to the quantized
-  //   data type, narrow_range refers to values in the quantized data type.
-  bool narrow_range = false;
-
- private:
-  std::unique_ptr<Shape> array_shape;
-};
-
-// Our Model struct, represents an entire model (our "top-level" struct).
-// Owns everything.
-class Model {
- public:
-  using ArrayMap = std::unordered_map<string, std::unique_ptr<Array>>;
-
-  bool HasArray(const string& name) const { return arrays.count(name) > 0; }
-  Array& GetArray(const string& name) const {
-    DCHECK(HasArray(name)) << "Array not found: " << name;
-    return *arrays.at(name);
-  }
-  Array& GetOrCreateArray(const string& name) {
-    // Make sure name is not used by an optional array
-    DCHECK(!optional_arrays.count(name));
-    if (!HasArray(name)) {
-      Array* ptr = new Array;
-      arrays[name] = std::unique_ptr<Array>(ptr);
-    }
-    Array& result = GetArray(name);
-    return result;
-  }
-  void CreateOptionalArray(const string& name) {
-    DCHECK(!arrays.count(name) && !optional_arrays.count(name));
-    optional_arrays.insert(name);
-  }
-  bool IsOptionalArray(const string& name) const {
-    return optional_arrays.count(name);
-  }
-
-  // Note that this invalidates all array iterators.
-  void EraseArray(const string& name) { arrays.erase(name); }
-  void EraseArrays(std::function<bool(const string&)> discardable) {
-    for (auto it = arrays.begin(); it != arrays.end();) {
-      if (discardable(it->first)) {
-        it = arrays.erase(it);
-      } else {
-        ++it;
-      }
-    }
-  }
-  const ArrayMap& GetArrayMap() const { return arrays; }
-  ArrayMap& GetMutableArrayMap() { return arrays; }
-
-  int64 ArithmeticOpsCount() const { return ops_count; }
-
-  // Optional arrays are used for optional tensors,
-  // these tensors do not have data, but with reserved names as op inputs.
-  std::set<string> optional_arrays;
-
-  // The list of operators. Notice how it's a list of unique_ptr's, implying
-  // that the Model is what owns Operator's and keeps them alive.
-  std::vector<std::unique_ptr<Operator>> operators;
-
-  // Generic flags, a place where we combine information passed to us via
-  // command-line parameters (e.g. --input_width=N) with information that
-  // we may or may not find in the input model file.
-  ModelFlags flags;
-  // For code-generation only: required size of the transient_data buffer
-  std::size_t transient_data_size = 0;
-  // For code-generation only: required alignment of the transient_data buffer
-  std::size_t transient_data_alignment = 0;
-  // Arithmetic operations performed in the model.
-  int64 ops_count = 0;
-
- private:
-  // The associative array mapping names to Array's.
-  // Notice how it's a container of unique_ptr's, implying
-  // that the Model is what owns Array's and keeps them alive.
-  // The Operator's refer to these Array's by their name strings, not by their
-  // addresses. See Operator::inputs, Operator::outputs.
-  std::unordered_map<string, std::unique_ptr<Array>> arrays;
-};
-}  // namespace toco
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
diff --git a/tensorflow/contrib/lite/toco/python/BUILD b/tensorflow/contrib/lite/toco/python/BUILD
deleted file mode 100644
index cf97ba7084d48e55a1874e77d3817aa721de7de9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/python/BUILD
+++ /dev/null
@@ -1,61 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-load("//tensorflow:tensorflow.bzl", "py_binary")
-
-cc_library(
-    name = "toco_python_api",
-    srcs = ["toco_python_api.cc"],
-    hdrs = ["toco_python_api.h"],
-    deps = [
-        "//tensorflow/contrib/lite/toco:model_flags_proto_cc",
-        "//tensorflow/contrib/lite/toco:toco_flags_proto_cc",
-        "//tensorflow/contrib/lite/toco:toco_graphviz_dump_options",
-        "//tensorflow/contrib/lite/toco:toco_port",
-        "//tensorflow/contrib/lite/toco:toco_tooling",
-        "//tensorflow/core:lib",
-        "//third_party/python_runtime:headers",
-    ],
-)
-
-tf_py_wrap_cc(
-    name = "tensorflow_wrap_toco",
-    srcs = ["toco.i"],
-    deps = [
-        ":toco_python_api",
-        "//tensorflow/contrib/lite/toco:model_flags_proto_cc",
-        "//tensorflow/contrib/lite/toco:toco_flags_proto_cc",
-        "//third_party/python_runtime:headers",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
-py_binary(
-    name = "toco_from_protos",
-    srcs = ["toco_from_protos.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":tensorflow_wrap_toco",
-        "//tensorflow/python:platform",
-    ],
-)
-
-tf_py_test(
-    name = "toco_from_protos_test",
-    srcs = ["toco_from_protos_test.py"],
-    additional_deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/lite/toco:model_flags_proto_py",
-        "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
-    ],
-    data = [
-        ":toco_from_protos",
-    ],
-    tags = [
-        "no_oss",
-        "no_pip",
-    ],
-)
diff --git a/tensorflow/contrib/lite/toco/runtime/common.h b/tensorflow/contrib/lite/toco/runtime/common.h
deleted file mode 100644
index 3c6828840c4a963a4a68774ec5d559b7f80baf22..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/runtime/common.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
-
-#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
-#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#endif
-#endif
-
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
diff --git a/tensorflow/contrib/lite/toco/runtime/types.h b/tensorflow/contrib/lite/toco/runtime/types.h
deleted file mode 100644
index 207f2c1706ef4cc12572e381c38f61a504ece232..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/runtime/types.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
-
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-
-namespace toco {
-
-// TODO(ahentz): These are just stopgaps for now, untils we move all
-// the code over to tflite.
-using tflite::Dims;
-using tflite::FullyConnectedWeightsFormat;
-using tflite::FusedActivationFunctionType;
-using tflite::RequiredBufferSizeForDims;
-
-}  // namespace toco
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
deleted file mode 100644
index ea1fc2827ead7e7442bbf7f569e3ea88c3b0de57..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
+++ /dev/null
@@ -1,91 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-)
-
-cc_library(
-    name = "cluster_utils",
-    srcs = [
-        "cluster_utils.cc",
-    ],
-    hdrs = [
-        "cluster_utils.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite/toco:toco_port",
-    ],
-)
-
-cc_library(
-    name = "cluster",
-    srcs = [
-        "cluster.cc",
-    ],
-    hdrs = [
-        "cluster.h",
-    ],
-    deps = [
-        ":cluster_utils",
-        "//tensorflow/contrib/lite/toco:model",
-        "//tensorflow/contrib/lite/toco:tooling_util",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-cc_library(
-    name = "resolve_svdf",
-    srcs = [
-        "resolve_svdf.cc",
-    ],
-    hdrs = [
-        "resolve_svdf.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":cluster",
-        ":cluster_utils",
-        "//tensorflow/contrib/lite/toco:model",
-        "//tensorflow/contrib/lite/toco:toco_port",
-        "//tensorflow/contrib/lite/toco:tooling_util",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "@protobuf_archive//:protobuf_headers",
-    ],
-)
-
-tf_cc_test(
-    name = "resolve_svdf_test",
-    srcs = ["resolve_svdf_test.cc"],
-    tags = ["no_oss"],
-    deps = [
-        ":cluster",
-        ":cluster_utils",
-        ":resolve_cluster",
-        ":resolve_svdf",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "resolve_cluster",
-    srcs = [
-        "resolve_cluster.cc",
-    ],
-    hdrs = [
-        "resolve_cluster.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":cluster",
-        ":cluster_utils",
-        ":resolve_svdf",
-        "//tensorflow/contrib/lite/toco:tooling_util",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
deleted file mode 100644
index 71cdb7703e98a7bb53eaeb189625b8931b327d20..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ /dev/null
@@ -1,147 +0,0 @@
-package(
-    # To suppress build cleaner error about inclusion of schema_generate.h.
-    features = ["-layering_check"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-)
-
-cc_library(
-    name = "operator",
-    srcs = [
-        "operator.cc",
-    ],
-    hdrs = [
-        "builtin_operator.h",
-        "custom_operator.h",
-        "operator.h",
-        "simple_operator.h",
-    ],
-    deps = [
-        ":types",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/contrib/lite/toco:graph_transformations",
-        "//tensorflow/contrib/lite/toco:model",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:ptr_util",
-        "@com_google_absl//absl/memory",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "operator_test",
-    srcs = [
-        "operator_test.cc",
-    ],
-    tags = ["no_oss"],
-    deps = [
-        ":operator",
-        "//tensorflow/contrib/lite/toco:tooling_util",
-        "//tensorflow/core:protos_all_cc",
-        "@com_google_googletest//:gtest_main",
-        "@flatbuffers",
-    ],
-)
-
-cc_library(
-    name = "types",
-    srcs = [
-        "types.cc",
-    ],
-    hdrs = [
-        "types.h",
-    ],
-    deps = [
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/contrib/lite/toco:model",
-    ],
-)
-
-tf_cc_test(
-    name = "types_test",
-    srcs = [
-        "types_test.cc",
-    ],
-    tags = ["no_oss"],
-    deps = [
-        ":types",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "export",
-    srcs = [
-        "export.cc",
-    ],
-    hdrs = [
-        "export.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":operator",
-        ":types",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/contrib/lite/toco:model",
-        "//tensorflow/contrib/lite/toco:tooling_util",
-        "//tensorflow/contrib/lite/tools/optimize:quantize_weights",
-        "@com_google_absl//absl/strings",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "export_test",
-    srcs = [
-        "export_test.cc",
-    ],
-    tags = ["no_oss"],
-    deps = [
-        ":export",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
-cc_library(
-    name = "import",
-    srcs = [
-        "import.cc",
-    ],
-    hdrs = [
-        "import.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":operator",
-        ":types",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/contrib/lite/toco:model",
-        "//tensorflow/contrib/lite/toco:tooling_util",
-        "//tensorflow/contrib/lite/tools:verifier",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "import_test",
-    srcs = [
-        "import_test.cc",
-    ],
-    tags = ["no_oss"],
-    deps = [
-        ":import",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "@com_google_googletest//:gtest_main",
-        "@flatbuffers",
-    ],
-)
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
deleted file mode 100644
index 45ca7f7f0c9b517b4ddb3286e3151338fb700f71..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ /dev/null
@@ -1,500 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tflite/export.h"
-
-#include "flatbuffers/flexbuffers.h"
-#include "absl/strings/str_join.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/toco/tflite/operator.h"
-#include "tensorflow/contrib/lite/toco/tflite/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/contrib/lite/tools/optimize/quantize_weights.h"
-#include "tensorflow/contrib/lite/version.h"
-
-namespace toco {
-
-namespace tflite {
-
-using flatbuffers::FlatBufferBuilder;
-using flatbuffers::Offset;
-using flatbuffers::Vector;
-using ::tflite::Buffer;
-using ::tflite::BuiltinOperator;
-using ::tflite::BuiltinOperator_CUSTOM;
-using ::tflite::BuiltinOperator_MAX;
-using ::tflite::BuiltinOperator_MIN;
-using ::tflite::CreateBuffer;
-using ::tflite::CreateModel;
-using ::tflite::CreateOperator;
-using ::tflite::CreateTensor;
-using ::tflite::Operator;
-using ::tflite::OperatorCode;
-using ::tflite::SubGraph;
-using ::tflite::Tensor;
-
-namespace {
-
-// Check if a TensorFlow Op is a control flow op by its name.
-bool IsControlFlowOp(const string& tensorflow_op) {
-  // Technically this is equalivent to `::tensorflow::Node::IsControlFlow()`.
-  // It requires to construct a `::tensorflow::Graph` to use that helper
-  // function, so we simply hardcode the list of control flow ops here.
-  if (tensorflow_op == "Switch" || tensorflow_op == "RefSwitch" ||
-      tensorflow_op == "Merge" || tensorflow_op == "RefMerge" ||
-      tensorflow_op == "Enter" || tensorflow_op == "RefEnter" ||
-      tensorflow_op == "Exit" || tensorflow_op == "RefExit" ||
-      tensorflow_op == "NextIteration" || tensorflow_op == "RefNextIteration") {
-    return true;
-  }
-  // TODO(ycling): Also check how to handle Variable ops and Assign ops.
-  return false;
-}
-
-details::OperatorKey GetOperatorKey(
-    const ::toco::Operator& op,
-    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
-    bool allow_flex_ops) {
-  string custom_code;
-  if (op.type == OperatorType::kUnsupported) {
-    const TensorFlowUnsupportedOperator& unsupported_op =
-        static_cast<const TensorFlowUnsupportedOperator&>(op);
-    custom_code = unsupported_op.tensorflow_op;
-  }
-  int version = 1;
-  if (ops_by_type.count(op.type) != 0) {
-    version = ops_by_type.at(op.type)->GetVersion(op);
-  }
-  return details::OperatorKey(op.type, custom_code, version, allow_flex_ops);
-}
-
-void WriteModelToString(const flatbuffers::FlatBufferBuilder& builder,
-                        string* file_contents) {
-  const uint8_t* buffer = builder.GetBufferPointer();
-  int size = builder.GetSize();
-  *file_contents = string(reinterpret_cast<const char*>(buffer), size);
-}
-
-}  // Anonymous namespace.
-
-namespace details {
-
-OperatorKey::OperatorKey(OperatorType type, const std::string& custom_code,
-                         int version, bool allow_flex_ops) {
-  this->type = type;
-  this->custom_code = custom_code;
-  this->version = version;
-
-  if (type == OperatorType::kUnsupported) {
-    // TODO(b/113715895): When `allow_flex_ops` is on, for now there's no way
-    // to populate a regular custom op. We need to find a way to fix this.
-    if (allow_flex_ops) {
-      // Memorize the original TensorFlow op name.
-      this->flex_tensorflow_op = custom_code;
-      // Prefix the custom code of the flex op.
-      this->custom_code = string(::tflite::kFlexCustomCodePrefix) + custom_code;
-      this->is_flex_op = true;
-
-      if (IsControlFlowOp(this->flex_tensorflow_op)) {
-        is_unsupported_flex_op = true;
-      }
-    }
-  }
-}
-
-void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
-  // First find a list of unique array names.
-  std::set<string> names;
-  for (const auto& array_pair : model.GetArrayMap()) {
-    names.insert(array_pair.first);
-  }
-
-  // Now assign indices to them and fill in the map.
-  int index = 0;
-  for (const auto& name : names) {
-    (*tensors_map)[name] = index;
-    ++index;
-  }
-}
-
-void LoadOperatorsMap(
-    const Model& model, OperatorsMap* operators_map,
-    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
-    bool allow_flex_ops) {
-  // First find a list of unique operator types.
-  std::set<OperatorKey> keys;
-  for (const auto& op : model.operators) {
-    keys.insert(GetOperatorKey(*op, ops_by_type, allow_flex_ops));
-  }
-  // Now assign indices to them and fill in the map.
-  int index = 0;
-  for (const auto& key : keys) {
-    (*operators_map)[key] = index;
-    ++index;
-  }
-}
-}  // namespace details
-
-Offset<Vector<Offset<Tensor>>> ExportTensors(
-    const Model& model, const details::TensorsMap& tensors_map,
-    FlatBufferBuilder* builder, std::vector<const Array*>* buffers_to_write,
-    const std::set<int32_t>& variable_tensor_indices) {
-  // In the end we will need to produce a vector sorted by the indices of the
-  // tensors in the tensors_map.
-  std::map<int, Offset<Tensor>> ordered_tensors;
-
-  for (const auto& array_pair : model.GetArrayMap()) {
-    const string& tensor_name = array_pair.first;
-    const toco::Array& array = *array_pair.second;
-
-    int buffer_index = buffers_to_write->size();
-    auto type = DataType::Serialize(array.data_type);
-    buffers_to_write->push_back(&array);
-
-    std::vector<int> shape;
-    if (array.has_shape()) {
-      for (int d : array.shape().dims()) {
-        shape.push_back(d);
-      }
-    }
-
-    Offset<Vector<float>> min;
-    Offset<Vector<float>> max;
-    Offset<Vector<float>> scale;
-    Offset<Vector<int64_t>> zero_point;
-    if (array.minmax) {
-      min = builder->CreateVector(
-          std::vector<float>{static_cast<float>(array.minmax->min)});
-      max = builder->CreateVector(
-          std::vector<float>{static_cast<float>(array.minmax->max)});
-    }
-    if (array.quantization_params) {
-      scale = builder->CreateVector(std::vector<float>{
-          static_cast<float>(array.quantization_params->scale)});
-      zero_point = builder->CreateVector(
-          std::vector<int64_t>{array.quantization_params->zero_point});
-    }
-    auto q_param = ::tflite::CreateQuantizationParameters(*builder, min, max,
-                                                          scale, zero_point);
-
-    int index = tensors_map.at(tensor_name);
-    bool is_variable =
-        variable_tensor_indices.find(index) != variable_tensor_indices.end();
-    ordered_tensors[index] =
-        CreateTensor(*builder, builder->CreateVector(shape), type, buffer_index,
-                     builder->CreateString(tensor_name), q_param, is_variable);
-  }
-
-  std::vector<Offset<Tensor>> tensor_vector;
-  tensor_vector.reserve(ordered_tensors.size());
-  for (const auto& tensor : ordered_tensors) {
-    tensor_vector.push_back(tensor.second);
-  }
-
-  return builder->CreateVector(tensor_vector);
-}
-
-Offset<Vector<int32_t>> ExportInputTensors(
-    const Model& model, const details::TensorsMap& tensors_map,
-    FlatBufferBuilder* builder) {
-  std::vector<int32_t> inputs;
-  for (const auto& input : model.flags.input_arrays()) {
-    inputs.push_back(tensors_map.at(input.name()));
-  }
-  return builder->CreateVector<int32_t>(inputs);
-}
-
-Offset<Vector<int32_t>> ExportOutputTensors(
-    const Model& model, const details::TensorsMap& tensors_map,
-    FlatBufferBuilder* builder) {
-  std::vector<int32_t> outputs;
-  for (const string& output : model.flags.output_arrays()) {
-    outputs.push_back(tensors_map.at(output));
-  }
-  return builder->CreateVector<int32_t>(outputs);
-}
-
-Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
-    const Model& model,
-    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
-    const details::OperatorsMap& operators_map, FlatBufferBuilder* builder,
-    std::set<string>* unsupported_ops, const ExportParams& params) {
-  // Map from operator name to TF Lite enum value, for all builtins.
-  std::map<string, BuiltinOperator> builtin_ops;
-  for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
-    BuiltinOperator op = static_cast<BuiltinOperator>(i);
-    string name = EnumNameBuiltinOperator(op);
-    if (op != BuiltinOperator_CUSTOM && !name.empty()) {
-      builtin_ops[name] = op;
-    }
-  }
-
-  // We will need to produce a vector of codes in the same order as they
-  // appear in the operators_map.
-  std::map<int, Offset<OperatorCode>> ordered_opcodes;
-
-  for (const auto& op : model.operators) {
-    const details::OperatorKey operator_key =
-        GetOperatorKey(*op, ops_by_type, params.allow_flex_ops);
-    int op_index = operators_map.at(operator_key);
-    int op_version = operator_key.version;
-
-    string name = HelpfulOperatorTypeName(*op);
-    bool is_builtin = false;
-    if (ops_by_type.count(op->type) != 0) {
-      name = ops_by_type.at(op->type)->name();
-      is_builtin = (builtin_ops.count(name) > 0);
-    }
-
-    if (is_builtin) {
-      ordered_opcodes[op_index] =
-          CreateOperatorCode(*builder, builtin_ops[name], 0, op_version);
-    } else {
-      // This could be a kUnsupported, in which case we should be
-      // able to retrieve the original Tensorflow name from the OperatorKey, or
-      // this could be a proper TOCO operator that is completely unknown to TF
-      // Lite.
-      if (!operator_key.custom_code.empty()) {
-        name = operator_key.custom_code;
-      }
-      // Either way, this is an operator that is not supported by TF Lite,
-      // so we output it as a custom op and add it to the error summary.
-      if (unsupported_ops) {
-        unsupported_ops->insert(name);
-      }
-      ordered_opcodes[op_index] =
-          CreateOperatorCode(*builder, BuiltinOperator_CUSTOM,
-                             builder->CreateString(name), op_version);
-    }
-  }
-
-  std::vector<Offset<OperatorCode>> opcode_vector;
-  opcode_vector.reserve(ordered_opcodes.size());
-  for (const auto& opcode : ordered_opcodes) {
-    opcode_vector.push_back(opcode.second);
-  }
-
-  return builder->CreateVector(opcode_vector);
-}
-
-Offset<Vector<Offset<Operator>>> ExportOperators(
-    const Model& model,
-    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
-    const details::OperatorsMap& operators_map,
-    const details::TensorsMap& tensors_map, FlatBufferBuilder* builder,
-    std::set<int32_t>* variable_tensor_indices, const ExportParams& params) {
-  variable_tensor_indices->clear();
-
-  // The operators are in execution order, so we just follow tf.mini order.
-  std::vector<Offset<Operator>> op_vector;
-  for (const auto& op : model.operators) {
-    std::vector<int32_t> inputs;
-    for (const string& input : op->inputs) {
-      // -1 is the ID for optional tensor in TFLite output
-      int id = model.IsOptionalArray(input) ? -1 : tensors_map.at(input);
-      inputs.push_back(id);
-    }
-    std::vector<int32_t> outputs;
-    for (const string& output : op->outputs) {
-      outputs.push_back(tensors_map.at(output));
-    }
-
-    int op_index = operators_map.at(
-        GetOperatorKey(*op, ops_by_type, params.allow_flex_ops));
-
-    auto tflite_op_it = ops_by_type.find(op->type);
-    BaseOperator* tflite_op = tflite_op_it == ops_by_type.end()
-                                  ? nullptr
-                                  : tflite_op_it->second.get();
-
-    // This is a custom op unless we can find it in ops_by_type, and even then
-    // it could be a custom op (such as kUnsupported).
-    auto options = Options::Custom(0);
-
-    std::vector<bool> mutating_input_variables;
-    if (tflite_op) {
-      options = tflite_op->Serialize(*op, builder);
-      mutating_input_variables = tflite_op->GetMutatingInputVariables(*op);
-
-      if (!mutating_input_variables.empty()) {
-        for (int i = 0; i < op->inputs.size(); ++i) {
-          if (!mutating_input_variables[i]) {
-            continue;
-          }
-          int32_t variable_tensor_index = tensors_map.at(op->inputs[i]);
-          variable_tensor_indices->insert(variable_tensor_index);
-        }
-      }
-    }
-    // The only supported CustomOptionFormat is FLEXBUFFERS now.
-    op_vector.push_back(CreateOperator(
-        *builder, op_index, builder->CreateVector(inputs),
-        builder->CreateVector(outputs), options.type, options.builtin,
-        options.custom, ::tflite::CustomOptionsFormat_FLEXBUFFERS,
-        builder->CreateVector(mutating_input_variables)));
-  }
-
-  return builder->CreateVector(op_vector);
-}
-
-Offset<Vector<Offset<Buffer>>> ExportBuffers(
-    const Model& model, const std::vector<const Array*>& buffers_to_write,
-    FlatBufferBuilder* builder) {
-  std::vector<Offset<Buffer>> buffer_vector;
-  size_t index = 0;
-  for (const Array* array_ptr : buffers_to_write) {
-    const Array& array = *array_ptr;
-    Offset<Vector<uint8_t>> data_buffer = DataBuffer::Serialize(array, builder);
-    buffer_vector.push_back(CreateBuffer(*builder, data_buffer));
-    index++;
-  }
-  return builder->CreateVector(buffer_vector);
-}
-
-void Export(const Model& model, string* output_file_contents,
-            const ExportParams& params) {
-  const auto ops_by_type = BuildOperatorByTypeMap(params.allow_flex_ops);
-  Export(model, output_file_contents, params, ops_by_type);
-}
-
-void Export(
-    const Model& model, string* output_file_contents,
-    const ExportParams& params,
-    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type) {
-  flatbuffers::FlatBufferBuilder builder(/*initial_size=*/10240);
-
-  details::TensorsMap tensors_map;
-  details::LoadTensorsMap(model, &tensors_map);
-
-  details::OperatorsMap operators_map;
-  details::LoadOperatorsMap(model, &operators_map, ops_by_type,
-                            params.allow_flex_ops);
-
-  std::vector<const Array*> buffers_to_write;
-  Array empty_array;
-  buffers_to_write.push_back(&empty_array);
-
-  std::set<string> unsupported_ops;
-  auto op_codes = ExportOperatorCodes(model, ops_by_type, operators_map,
-                                      &builder, &unsupported_ops, params);
-
-  for (const auto& op : model.operators) {
-    if (op->type == OperatorType::kFakeQuant) {
-      LOG(WARNING) << "FAKE_QUANT operation " << LogName(*op)
-                   << " was not converted. If running quantized make sure you "
-                      "are passing --inference_type=QUANTIZED_UINT8 and values "
-                      "for --std_values and --mean_values.";
-    }
-  }
-  if (!unsupported_ops.empty()) {
-    if (!params.allow_custom_ops) {
-      // Remove ExpandDims and ReorderAxes from unimplemented list unless they
-      // compose the list. Both ops are removed during graph transformations.
-      // However, if an op is unimplemented earlier in the model, the graph
-      // transformation is unable to run because the output shape is not
-      // defined. This causes unnecessary confusion during model conversion
-      // time.
-      std::set<string> unsupported_ops_final;
-      for (const auto& op_type : unsupported_ops) {
-        if (op_type != "ReorderAxes" && op_type != "ExpandDims") {
-          unsupported_ops_final.insert(op_type);
-        }
-      }
-      if (unsupported_ops_final.empty()) {
-        unsupported_ops_final = unsupported_ops;
-      }
-
-      LOG(QFATAL)
-          << "Some of the operators in the model are not supported by "
-             "the standard TensorFlow Lite runtime. If you have a custom "
-             "implementation for them you can disable this error with "
-             "--allow_custom_ops, or by setting allow_custom_ops=True "
-             "when calling tf.contrib.lite.TFLiteConverter(). Here is a list "
-             "of operators for which  you will need custom implementations: "
-          << absl::StrJoin(unsupported_ops_final, ", ") << ".";
-    }
-
-    std::set<string> unsupported_control_flow_ops;
-    // Check if unsupported ops contains control flow ops. It's impossible
-    // to implement these ops as custom ops at the moment.
-    for (const auto& op : unsupported_ops) {
-      if (IsControlFlowOp(op)) {
-        unsupported_control_flow_ops.insert(op);
-      }
-    }
-    if (!unsupported_control_flow_ops.empty()) {
-      LOG(QFATAL)
-          << "TensorFlow Lite currently doesn't support control flow ops: "
-          << absl::StrJoin(unsupported_control_flow_ops, ", ") << ".";
-    }
-  }
-
-  std::set<string> unsupported_flex_ops;
-  for (const auto& it : operators_map) {
-    const details::OperatorKey& key = it.first;
-    if (key.is_unsupported_flex_op) {
-      unsupported_flex_ops.insert(key.custom_code);
-    }
-  }
-
-  if (!unsupported_flex_ops.empty()) {
-    LOG(QFATAL) << "Some of the operators in the model are not supported by "
-                   "TensorFlow Flex runtime: "
-                << absl::StrJoin(unsupported_flex_ops, ", ") << ".";
-  }
-
-  std::set<int32_t> variable_tensor_indices;
-  auto ops = ExportOperators(model, ops_by_type, operators_map, tensors_map,
-                             &builder, &variable_tensor_indices, params);
-
-  auto tensors = ExportTensors(model, tensors_map, &builder, &buffers_to_write,
-                               variable_tensor_indices);
-  auto inputs = ExportInputTensors(model, tensors_map, &builder);
-  auto outputs = ExportOutputTensors(model, tensors_map, &builder);
-
-  // TODO(aselle): add support to toco for multiple subgraphs.
-  auto subgraph = CreateSubGraph(builder, tensors, inputs, outputs, ops,
-                                 /* name */ 0);
-  std::vector<flatbuffers::Offset<SubGraph>> subgraphs = {subgraph};
-
-  auto buffers = ExportBuffers(model, buffers_to_write, &builder);
-  auto description = builder.CreateString("TOCO Converted.");
-  auto new_model_location =
-      CreateModel(builder, TFLITE_SCHEMA_VERSION, op_codes,
-                  builder.CreateVector(subgraphs), description, buffers);
-  ::tflite::FinishModelBuffer(builder, new_model_location);
-
-  if (params.quantize_weights) {
-    // Call the quantize_weights tool.
-    LOG(INFO) << "Quantizing TFLite model after conversion to flatbuffer. "
-                 "dump_graphviz will only output the model before this "
-                 "transformation. To visualize the output graph use "
-                 "lite/tools/optimize.py.";
-    flatbuffers::FlatBufferBuilder q_builder(/*initial_size=*/10240);
-    const uint8_t* buffer = builder.GetBufferPointer();
-    const ::tflite::Model* input_model = ::tflite::GetModel(buffer);
-    if (::tflite::optimize::QuantizeWeights(&q_builder, input_model) !=
-        kTfLiteOk) {
-      LOG(QFATAL) << "Quantize weights transformation failed.";
-    }
-    WriteModelToString(q_builder, output_file_contents);
-  } else {
-    WriteModelToString(builder, output_file_contents);
-  }
-}
-
-}  // namespace tflite
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/export.h b/tensorflow/contrib/lite/toco/tflite/export.h
deleted file mode 100644
index 9efb282c6c5d8ccca89d0ff204c53dfd46d19b33..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/tflite/export.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
-
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tflite/operator.h"
-#include "tensorflow/contrib/lite/util.h"
-
-namespace toco {
-
-namespace tflite {
-
-// The parameters for exporting a TFLite model.
-struct ExportParams {
-  bool allow_custom_ops = false;
-  bool allow_flex_ops = false;
-  bool quantize_weights = false;
-};
-
-// Transform the given tf.mini model into a TF Lite flatbuffer and deposit the
-// result in the given string.
-void Export(const Model& model, string* output_file_contents,
-            const ExportParams& params);
-
-// Export API with custom TFLite operator mapping.
-void Export(
-    const Model& model, string* output_file_contents,
-    const ExportParams& params,
-    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type);
-
-// This is for backward-compatibility.
-// TODO(ycling): Remove the deprecated entry functions.
-inline void Export(const Model& model, bool allow_custom_ops,
-                   bool quantize_weights, string* output_file_contents) {
-  ExportParams params;
-  params.allow_custom_ops = allow_custom_ops;
-  params.quantize_weights = quantize_weights;
-  Export(model, output_file_contents, params);
-}
-
-// This is for backward-compatibility.
-// TODO(ycling): Remove the deprecated entry functions.
-inline void Export(
-    const Model& model, bool allow_custom_ops, bool quantize_weights,
-    string* output_file_contents,
-    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type) {
-  ExportParams params;
-  params.allow_custom_ops = allow_custom_ops;
-  params.quantize_weights = quantize_weights;
-  Export(model, output_file_contents, params, ops_by_type);
-}
-
-// This is for backward-compatibility.
-// TODO(ycling): Remove the deprecated entry functions.
-inline void Export(const Model& model, string* output_file_contents) {
-  ExportParams params;
-  params.allow_custom_ops = true;
-  Export(model, output_file_contents, params);
-  Export(model, true, false, output_file_contents);
-}
-
-namespace details {
-
-// A maps from tensor name to its final position in the TF Lite buffer.
-using TensorsMap = std::unordered_map<string, int>;
-
-// A key to identify an operator.
-// Only when `type` is `kUnsupported`, `custom_code` is filled to
-// identify which operation is used.
-struct OperatorKey {
-  OperatorKey(OperatorType type, const std::string& custom_code, int version,
-              bool allow_flex_ops = false);
-
-  // Only `type`, `custom_code` and `version` is used to compute hash and
-  // identity.
-  OperatorType type;
-  std::string custom_code;
-  int version;
-
-  // THe fields below are not used to compute hash and identity.
-  bool is_flex_op = false;
-  bool is_unsupported_flex_op = false;
-  // The original TensorFlow op name for the flex op. Filled only when
-  // `is_flex_op` is true.
-  std::string flex_tensorflow_op;
-
-  bool operator<(const OperatorKey& other) const {
-    if (type < other.type) return true;
-    else if (type > other.type)
-      return false;
-    else if (custom_code < other.custom_code)
-      return true;
-    else if (custom_code > other.custom_code)
-      return false;
-    else
-      return version < other.version;
-  }
-
-  bool operator==(const OperatorKey& other) const {
-    return type == other.type && custom_code == other.custom_code &&
-           version == other.version;
-  }
-
-  struct Hash {
-    size_t operator()(const OperatorKey& key) const {
-      return ::tflite::CombineHashes(
-          {std::hash<size_t>()(static_cast<size_t>(key.type)),
-           std::hash<std::string>()(key.custom_code),
-           std::hash<int>()(key.version)});
-    }
-  };
-};
-
-// A maps from operator type to its final position in the TF Lite buffer.
-using OperatorsMap = std::unordered_map<OperatorKey, int, OperatorKey::Hash>;
-
-void LoadTensorsMap(const Model& model, TensorsMap* tensors_map);
-void LoadOperatorsMap(
-    const Model& model, OperatorsMap* operators_map,
-    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
-    bool allow_flex_ops);
-
-}  // namespace details
-}  // namespace tflite
-}  // namespace toco
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
deleted file mode 100644
index a71a64d56f60bc83f0076b5192c20a762a2c213d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ /dev/null
@@ -1,360 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tflite/export.h"
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/toco/tflite/builtin_operator.h"
-#include "tensorflow/contrib/lite/toco/tflite/operator.h"
-#include "tensorflow/contrib/lite/toco/tflite/types.h"
-
-namespace toco {
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAre;
-
-class ExportTest : public ::testing::Test {
- protected:
-  // This is a very simplistic model. We are not interested in testing all the
-  // details here, since tf.mini's testing framework will be exercising all the
-  // conversions multiple times, and the conversion of operators is tested by
-  // separate unittests.
-  void BuildTestModel() {
-    input_model_.GetOrCreateArray("tensor_one");
-    input_model_.GetOrCreateArray("tensor_two");
-    {
-      auto* op = new ConvOperator;
-      op->padding.type = PaddingType::kSame;
-      input_model_.operators.emplace_back(op);
-    }
-    input_model_.operators.emplace_back(new AddOperator);
-    {
-      auto* op = new TensorFlowUnsupportedOperator;
-      op->tensorflow_op = "MyCrazyOp";
-      input_model_.operators.emplace_back(op);
-    }
-    // Note that Sub is not know to TF Lite, so it gets exported as a custom
-    // op (and no options).
-    input_model_.operators.emplace_back(new SubOperator);
-  }
-
-  void BuildQuantizableTestModel() {
-    input_model_.GetOrCreateArray("inputs");
-    Array& weight_array = input_model_.GetOrCreateArray("weights");
-
-    // Make the buffer large enough for QuantizeWeights transformation to take
-    // effect.
-    int buf_size = 1296;
-    auto weight_buf = absl::make_unique<float[]>(buf_size);
-    for (int i = 0; i < buf_size; i++) {
-      // Fill the array with some garbage values.
-      weight_buf[i] = static_cast<float>(i % 128);
-    }
-
-    weight_array.data_type = ArrayDataType::kFloat;
-
-    // Initialize shape for the input array.
-    Shape* weight_array_shape = weight_array.mutable_shape();
-    std::vector<int>* weight_array_shape_dim =
-        weight_array_shape->mutable_dims();
-    weight_array_shape_dim->resize(4, 6);
-    auto& weight_array_buffer =
-        weight_array.GetMutableBuffer<ArrayDataType::kFloat>();
-    weight_array_buffer.data.resize(buf_size);
-    float* buf_ptr =
-        weight_array.GetMutableBuffer<ArrayDataType::kFloat>().data.data();
-    std::copy(weight_buf.get(), weight_buf.get() + buf_size, buf_ptr);
-
-    {
-      auto* op = new ConvOperator;
-      op->padding.type = PaddingType::kSame;
-      op->inputs = {"inputs", "weights"};
-      input_model_.operators.emplace_back(op);
-    }
-    input_model_.operators.emplace_back(new AddOperator);
-  }
-
-  Model input_model_;
-};
-
-TEST_F(ExportTest, LoadTensorsMap) {
-  BuildTestModel();
-
-  details::TensorsMap tensors;
-  details::LoadTensorsMap(input_model_, &tensors);
-  EXPECT_EQ(0, tensors["tensor_one"]);
-  EXPECT_EQ(1, tensors["tensor_two"]);
-}
-
-TEST_F(ExportTest, LoadOperatorsMap) {
-  BuildTestModel();
-
-  details::OperatorsMap operators;
-  const auto ops_by_type = BuildOperatorByTypeMap();
-  // TODO(ycling): Add a test for allow_flex_ops.
-  details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
-  EXPECT_EQ(0, operators[details::OperatorKey(OperatorType::kAdd, "", 1)]);
-  EXPECT_EQ(1, operators[details::OperatorKey(OperatorType::kConv, "", 1)]);
-  EXPECT_EQ(2, operators[details::OperatorKey(OperatorType::kSub, "", 1)]);
-  EXPECT_EQ(3, operators[details::OperatorKey(OperatorType::kUnsupported,
-                                              "MyCrazyOp", 1)]);
-}
-
-TEST_F(ExportTest, Export) {
-  BuildTestModel();
-
-  string result;
-  Export(input_model_, true, false, &result);
-
-  auto* model = ::tflite::GetModel(result.data());
-
-  std::vector<string> names;
-  for (const ::tflite::OperatorCode* opcode : *model->operator_codes()) {
-    if (opcode->builtin_code() != ::tflite::BuiltinOperator_CUSTOM) {
-      names.push_back(string("builtin:") + ::tflite::EnumNameBuiltinOperator(
-                                               opcode->builtin_code()));
-    } else {
-      names.push_back(string("custom:") + opcode->custom_code()->c_str());
-    }
-  }
-
-  EXPECT_THAT(names, ElementsAre("builtin:ADD", "builtin:CONV_2D",
-                                 "builtin:SUB", "custom:MyCrazyOp"));
-
-  std::vector<uint32_t> indices;
-  auto operators = (*model->subgraphs())[0]->operators();
-  EXPECT_EQ(operators->Length(), 4);
-  for (const auto* op : *operators) {
-    indices.push_back(op->opcode_index());
-  }
-
-  EXPECT_THAT(indices, ElementsAre(1, 0, 3, 2));
-}
-
-TEST_F(ExportTest, QuantizeWeights) {
-  // Sanity check for quantize_weights parameter.
-  BuildQuantizableTestModel();
-  string unquantized_result;
-  Export(input_model_, true, /*quantize_weights*/ false, &unquantized_result);
-
-  BuildQuantizableTestModel();
-  string quantized_result;
-  Export(input_model_, true, /*quantize_weights*/ true, &quantized_result);
-
-  // The quantized models should be smaller.
-  EXPECT_LT(quantized_result.size(), unquantized_result.size());
-}
-
-// This test is based on a hypothetical scenario that dilation is supported
-// only in Conv version 2. So Toco populates version=1 when dialation
-// parameters are all 1, and version=2 otehrwise.
-class FakeConvolutionOperator
-    : public BuiltinOperator<ConvOperator, ::tflite::Conv2DOptions,
-                             ::tflite::BuiltinOptions_Conv2DOptions> {
- public:
-  FakeConvolutionOperator()
-      : BuiltinOperator(::tflite::BuiltinOperator_CONV_2D,
-                        OperatorType::kConv) {}
-
-  // Returning the op version according to the op parameters.
-  int GetVersion(const Operator& op) const override {
-    const TocoOperator& conv_op = static_cast<const TocoOperator&>(op);
-    if (conv_op.dilation_width_factor != 1 ||
-        conv_op.dilation_height_factor != 1) {
-      // Version 2 if dilation is used.
-      return 2;
-    }
-    return 1;
-  }
-
-  // Note: The read / write code doesn't need to be changed if we stick with
-  // the restrictions:
-  // * Only adding parameters at the bottom of the Flatbuffer tables.
-  // * When the default value of parameters are used, the op works consistently
-  //   with the previous version.
-  flatbuffers::Offset<TfLiteOptions> WriteOptions(
-      const TocoOperator& op,
-      flatbuffers::FlatBufferBuilder* builder) const override {
-    auto padding = Padding::Serialize(op.padding.type);
-    auto activation_function =
-        ActivationFunction::Serialize(op.fused_activation_function);
-    return ::tflite::CreateConv2DOptions(*builder, padding, op.stride_width,
-                                         op.stride_height, activation_function,
-                                         op.dilation_width_factor,
-                                         op.dilation_height_factor);
-  }
-
-  void ReadOptions(const TfLiteOptions& options,
-                   TocoOperator* op) const override {
-    op->padding.type = Padding::Deserialize(options.padding());
-    op->stride_width = options.stride_w();
-    op->stride_height = options.stride_h();
-    op->dilation_width_factor = options.dilation_w_factor();
-    op->dilation_height_factor = options.dilation_h_factor();
-    op->fused_activation_function =
-        ActivationFunction::Deserialize(options.fused_activation_function());
-  }
-};
-
-class VersionedOpExportTest : public ::testing::Test {
- protected:
-  void SetUp() override {
-    input_model_.GetOrCreateArray("input");
-    input_model_.GetOrCreateArray("filter");
-    input_model_.GetOrCreateArray("output");
-  }
-  void AddConvOp(bool use_dialation) {
-    {
-      auto* op = new ConvOperator;
-      op->inputs.push_back("input");
-      op->inputs.push_back("filter");
-      op->inputs.push_back("output");
-
-      op->padding.type = PaddingType::kSame;
-      op->stride_width = 1;
-      op->stride_height = 1;
-      if (use_dialation) {
-        op->dilation_width_factor = 2;
-        op->dilation_height_factor = 2;
-      } else {
-        op->dilation_width_factor = 1;
-        op->dilation_height_factor = 1;
-      }
-      input_model_.operators.emplace_back(op);
-    }
-  }
-
-  std::map<OperatorType, std::unique_ptr<BaseOperator>>
-  BuildFakeOperatorByTypeMap() {
-    std::map<OperatorType, std::unique_ptr<BaseOperator>> result;
-    result[OperatorType::kConv] =
-        std::unique_ptr<BaseOperator>(new FakeConvolutionOperator);
-    return result;
-  }
-
-  Model input_model_;
-};
-
-TEST_F(VersionedOpExportTest, LoadOperatorsMapWithOpV1) {
-  AddConvOp(false);
-
-  details::OperatorsMap operators;
-  const auto ops_by_type = BuildFakeOperatorByTypeMap();
-  details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
-
-  EXPECT_EQ(1, operators.size());
-  EXPECT_EQ(0, operators.at(details::OperatorKey(OperatorType::kConv, "", 1)));
-}
-
-TEST_F(VersionedOpExportTest, LoadOperatorsMapWithOpV2) {
-  AddConvOp(true);
-
-  details::OperatorsMap operators;
-  const auto ops_by_type = BuildFakeOperatorByTypeMap();
-  details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
-
-  EXPECT_EQ(1, operators.size());
-  EXPECT_EQ(0, operators.at(details::OperatorKey(OperatorType::kConv, "", 2)));
-}
-
-TEST_F(VersionedOpExportTest, LoadOperatorsMapWithBothVersions) {
-  AddConvOp(false);
-  AddConvOp(true);
-
-  details::OperatorsMap operators;
-  const auto ops_by_type = BuildFakeOperatorByTypeMap();
-  details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
-
-  EXPECT_EQ(2, operators.size());
-  EXPECT_EQ(0, operators.at(details::OperatorKey(OperatorType::kConv, "", 1)));
-  EXPECT_EQ(1, operators.at(details::OperatorKey(OperatorType::kConv, "", 2)));
-}
-
-TEST_F(VersionedOpExportTest, Export) {
-  AddConvOp(false);
-  AddConvOp(true);
-
-  string result;
-  const auto ops_by_type = BuildFakeOperatorByTypeMap();
-  Export(input_model_, true, false, &result, ops_by_type);
-
-  auto* model = ::tflite::GetModel(result.data());
-  auto operator_codes = model->operator_codes();
-
-  // Verify that 2 operator codes are populdated. Both are CONV_2D but with
-  // different versions.
-  EXPECT_EQ(2, operator_codes->size());
-  EXPECT_EQ(::tflite::BuiltinOperator_CONV_2D,
-            (*operator_codes)[0]->builtin_code());
-  EXPECT_EQ(1, (*operator_codes)[0]->version());
-  EXPECT_EQ(::tflite::BuiltinOperator_CONV_2D,
-            (*operator_codes)[1]->builtin_code());
-  EXPECT_EQ(2, (*operator_codes)[1]->version());
-
-  // Verify that the 2 operators points to the correct indices of the operation
-  // codes.
-  auto operators = (*model->subgraphs())[0]->operators();
-  EXPECT_EQ(2, operators->size());
-  EXPECT_EQ(0, (*operators)[0]->opcode_index());
-  EXPECT_EQ(1, (*operators)[1]->opcode_index());
-}
-
-TEST(OperatorKeyTest, TestBuiltinOp) {
-  details::OperatorKey key(OperatorType::kConv, "", 2);
-  EXPECT_EQ(key.type, OperatorType::kConv);
-  EXPECT_EQ(key.custom_code, "");
-  EXPECT_EQ(key.version, 2);
-}
-
-TEST(OperatorKeyTest, TestFlexOp) {
-  {
-    details::OperatorKey key(OperatorType::kUnsupported, "SomeUnsupportedOp", 1,
-                             false);
-    EXPECT_EQ(key.type, OperatorType::kUnsupported);
-    // It shouldn't be converted to Flex op if `allow_flex_op` is false.
-    EXPECT_EQ(key.custom_code, "SomeUnsupportedOp");
-    EXPECT_EQ(key.version, 1);
-    EXPECT_FALSE(key.is_flex_op);
-  }
-
-  {
-    details::OperatorKey key(OperatorType::kUnsupported, "SomeUnsupportedOp", 1,
-                             true);
-    EXPECT_EQ(key.type, OperatorType::kUnsupported);
-    // Verify that the custom op name is prefixed by "Flex" and `is_flex_op`
-    // is true.
-    EXPECT_EQ(key.custom_code, "FlexSomeUnsupportedOp");
-    EXPECT_EQ(key.version, 1);
-    EXPECT_TRUE(key.is_flex_op);
-  }
-}
-
-TEST(OperatorKeyTest, TestFlexWithControlFlowOp) {
-  details::OperatorKey key(OperatorType::kUnsupported, "Merge", 1, true);
-  EXPECT_EQ(key.type, OperatorType::kUnsupported);
-  EXPECT_EQ(key.custom_code, "FlexMerge");
-  EXPECT_EQ(key.version, 1);
-  EXPECT_TRUE(key.is_flex_op);
-  // The control flow ops should be marked as unsupported.
-  EXPECT_TRUE(key.is_unsupported_flex_op);
-}
-
-// TODO(ahentz): tests for tensors, inputs, outputs, opcodes and operators.
-
-}  // namespace
-}  // namespace tflite
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/types.h b/tensorflow/contrib/lite/toco/tflite/types.h
deleted file mode 100644
index 3923756fc94e3175a6505740a96cce8d614c3990..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/tflite/types.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
-
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-
-namespace toco {
-
-namespace tflite {
-
-struct DataType {
-  static ::tflite::TensorType Serialize(ArrayDataType array_data_type);
-  static ArrayDataType Deserialize(int tensor_type);
-};
-
-struct DataBuffer {
-  using FlatBufferOffset = flatbuffers::Offset<flatbuffers::Vector<uint8_t>>;
-
-  // Build the flatbuffer representation of a toco's Array and return the
-  // corresponding offset into the flatbuffer. Note that data from the array
-  // will be copied into the flatbuffer.
-  static FlatBufferOffset Serialize(const Array& array,
-                                    flatbuffers::FlatBufferBuilder* builder);
-  // Copy data from the given tensor into toco's Array.
-  static void Deserialize(const ::tflite::Tensor& tensor,
-                          const ::tflite::Buffer& buffer, Array* array);
-};
-
-struct Padding {
-  static ::tflite::Padding Serialize(PaddingType padding_type);
-  static PaddingType Deserialize(int padding);
-};
-
-struct ActivationFunction {
-  static ::tflite::ActivationFunctionType Serialize(
-      FusedActivationFunctionType faf_type);
-  static FusedActivationFunctionType Deserialize(int activation_function);
-};
-
-}  // namespace tflite
-
-}  // namespace toco
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/toco.cc b/tensorflow/contrib/lite/toco/toco.cc
deleted file mode 100644
index 0b460bd178a49cafefd3438b7ae1c38a07b2ab7c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/toco.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <cstdio>
-#include <memory>
-#include <string>
-
-#include "absl/strings/string_view.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_cmdline_flags.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
-#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/toco_port.h"
-#include "tensorflow/contrib/lite/toco/toco_tooling.h"
-#include "tensorflow/contrib/lite/toco/toco_types.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-namespace {
-
-// Checks the permissions of the output file to ensure it is writeable.
-void CheckOutputFilePermissions(const Arg<string>& output_file) {
-  QCHECK(output_file.specified()) << "Missing required flag --output_file.\n";
-  QCHECK(port::file::Writable(output_file.value()).ok())
-      << "Specified output_file is not writable: " << output_file.value()
-      << ".\n";
-}
-
-// Checks the permissions of the frozen model file.
-void CheckFrozenModelPermissions(const Arg<string>& input_file) {
-  QCHECK(input_file.specified()) << "Missing required flag --input_file.\n";
-  QCHECK(port::file::Exists(input_file.value(), port::file::Defaults()).ok())
-      << "Specified input_file does not exist: " << input_file.value() << ".\n";
-  QCHECK(port::file::Readable(input_file.value(), port::file::Defaults()).ok())
-      << "Specified input_file exists, but is not readable: "
-      << input_file.value() << ".\n";
-}
-
-// Reads the contents of the GraphDef from either the frozen graph file or the
-// SavedModel directory. If it reads the SavedModel directory, it updates the
-// ModelFlags and TocoFlags accordingly.
-void ReadInputData(const ParsedTocoFlags& parsed_toco_flags,
-                   const ParsedModelFlags& parsed_model_flags,
-                   TocoFlags* toco_flags, ModelFlags* model_flags,
-                   string* graph_def_contents) {
-  port::CheckInitGoogleIsDone("InitGoogle is not done yet.\n");
-
-  // Ensure savedmodel_directory is not set.
-  QCHECK(!parsed_toco_flags.savedmodel_directory.specified())
-      << "Use `tensorflow/contrib/lite/python/tflite_convert` script with "
-      << "SavedModel directories.\n";
-
-  // Checks the input file permissions and reads the contents.
-  CheckFrozenModelPermissions(parsed_toco_flags.input_file);
-  CHECK(port::file::GetContents(parsed_toco_flags.input_file.value(),
-                                graph_def_contents, port::file::Defaults())
-            .ok());
-}
-
-void ToolMain(const ParsedTocoFlags& parsed_toco_flags,
-              const ParsedModelFlags& parsed_model_flags) {
-  ModelFlags model_flags;
-  ReadModelFlagsFromCommandLineFlags(parsed_model_flags, &model_flags);
-
-  TocoFlags toco_flags;
-  ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags);
-
-  string graph_def_contents;
-  ReadInputData(parsed_toco_flags, parsed_model_flags, &toco_flags,
-                &model_flags, &graph_def_contents);
-  CheckOutputFilePermissions(parsed_toco_flags.output_file);
-
-  std::unique_ptr<Model> model =
-      Import(toco_flags, model_flags, graph_def_contents);
-  Transform(toco_flags, model.get());
-  string output_file_contents;
-  Export(toco_flags, *model, toco_flags.allow_custom_ops(),
-         &output_file_contents);
-  CHECK(port::file::SetContents(parsed_toco_flags.output_file.value(),
-                                output_file_contents, port::file::Defaults())
-            .ok());
-}
-
-}  // namespace
-}  // namespace toco
-
-int main(int argc, char** argv) {
-  toco::string msg;
-  toco::ParsedTocoFlags parsed_toco_flags;
-  toco::ParsedModelFlags parsed_model_flags;
-
-  // If no args were specified, give a help string to be helpful.
-  int* effective_argc = &argc;
-  char** effective_argv = argv;
-  if (argc == 1) {
-    // No arguments, so manufacture help argv.
-    static int dummy_argc = 2;
-    static char* dummy_argv[] = {argv[0], const_cast<char*>("--help")};
-    effective_argc = &dummy_argc;
-    effective_argv = dummy_argv;
-  }
-
-  // Parse toco flags and command flags in sequence, each one strips off args,
-  // giving InitGoogle a chance to handle all remaining arguments.
-  bool toco_success = toco::ParseTocoFlagsFromCommandLineFlags(
-      effective_argc, effective_argv, &msg, &parsed_toco_flags);
-  bool model_success = toco::ParseModelFlagsFromCommandLineFlags(
-      effective_argc, effective_argv, &msg, &parsed_model_flags);
-  if (!toco_success || !model_success || !msg.empty()) {
-    fprintf(stderr, "%s", msg.c_str());
-    fflush(stderr);
-    return 1;
-  }
-  toco::port::InitGoogle(argv[0], effective_argc, &effective_argv, true);
-  toco::ToolMain(parsed_toco_flags, parsed_model_flags);
-}
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.h b/tensorflow/contrib/lite/toco/toco_tooling.h
deleted file mode 100644
index e731c149eef412d3048a1d5f84145ce6ff87208d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/toco_tooling.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
-
-#include <memory>
-#include <string>
-
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
-
-namespace toco {
-
-// Imports the input file into a Model object.
-std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
-                              const ModelFlags& model_flags,
-                              const string& input_file_contents);
-
-// Transforms a Model. The resulting Model is ready to be passed
-// to Export with the exact same toco_flags.
-void Transform(const TocoFlags& toco_flags, Model* model);
-
-// Exports the Model, which must be of the 'lowered' form returned by
-// Transform, to a file of the format given by
-// toco_flags.output_format().
-void Export(const TocoFlags& toco_flags, const Model& model,
-            bool allow_custom_ops, string* output_file_contents);
-
-// This if for backward-compatibility with internal tools.
-inline void Export(const TocoFlags& toco_flags, const Model& model,
-                   string* output_file_contents) {
-  Export(toco_flags, model, true, output_file_contents);
-}
-
-}  // namespace toco
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
deleted file mode 100644
index 0b268264031f4f1e86b2956a75bde173a945ddf4..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/BUILD
+++ /dev/null
@@ -1,98 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-
-common_copts = ["-Wall"]
-
-py_binary(
-    name = "visualize",
-    srcs = ["visualize.py"],
-    data = [
-        "//tensorflow/contrib/lite/schema:schema.fbs",
-        "//tensorflow/python:platform",
-        "@flatbuffers//:flatc",
-    ],
-    srcs_version = "PY2AND3",
-)
-
-tf_cc_binary(
-    name = "generate_op_registrations",
-    srcs = ["gen_op_registration_main.cc"],
-    deps = [
-        "//tensorflow/contrib/lite/tools:gen_op_registration",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
-cc_library(
-    name = "gen_op_registration",
-    srcs = ["gen_op_registration.cc"],
-    hdrs = ["gen_op_registration.h"],
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string",
-        "@com_googlesource_code_re2//:re2",
-    ],
-)
-
-cc_test(
-    name = "gen_op_registration_test",
-    srcs = ["gen_op_registration_test.cc"],
-    data = [
-        "//tensorflow/contrib/lite:testdata/0_subgraphs.bin",
-        "//tensorflow/contrib/lite:testdata/2_subgraphs.bin",
-        "//tensorflow/contrib/lite:testdata/empty_model.bin",
-        "//tensorflow/contrib/lite:testdata/test_model.bin",
-        "//tensorflow/contrib/lite:testdata/test_model_broken.bin",
-    ],
-    tags = [
-        "no_oss",
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":gen_op_registration",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "verifier",
-    srcs = ["verifier.cc"],
-    hdrs = ["verifier.h"],
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-    ],
-)
-
-cc_test(
-    name = "verifier_test",
-    size = "small",
-    srcs = ["verifier_test.cc"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
-    deps = [
-        ":verifier",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:schema_fbs_version",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/contrib/lite/testing:util",
-        "//tensorflow/core:framework_lite",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/tools/accuracy/BUILD b/tensorflow/contrib/lite/tools/accuracy/BUILD
deleted file mode 100644
index 1b60d6a60d39ccb59613871d1f438b31c16fec7a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/accuracy/BUILD
+++ /dev/null
@@ -1,328 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
-load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
-
-common_linkopts = tflite_linkopts() + select({
-    "//conditions:default": [],
-    "//tensorflow:android": [
-        "-pie",
-        "-llog",
-    ],
-})
-
-cc_library(
-    name = "utils",
-    srcs = ["utils.cc"],
-    hdrs = ["utils.h"],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-            ],
-        },
-    ),
-)
-
-tf_cc_test(
-    name = "utils_test",
-    srcs = ["utils_test.cc"],
-    args = [
-        "--test_model_file=$(location //tensorflow/contrib/lite:testdata/multi_add.bin)",
-    ],
-    data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = [
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":utils",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework_internal",
-                "//tensorflow/core:lib",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "run_tflite_model_op",
-    srcs = ["run_tflite_model_op.cc"],
-    copts = tflite_copts(),
-    deps = [
-        ":utils",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:tensorflow",
-                "//tensorflow/core:protos_all_cc",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:framework",
-                "//tensorflow/core:lib",
-                "//tensorflow/core:ops",
-            ],
-        },
-    ),
-    alwayslink = 1,
-)
-
-cc_library(
-    name = "android_required_build_flags",
-    srcs = ["android_required_build_flags.cc"],
-    copts = tflite_copts(),
-)
-
-tf_cc_test(
-    name = "run_tflite_model_op_test",
-    srcs = ["run_tflite_model_op_test.cc"],
-    args = [
-        "--test_model_file=$(location //tensorflow/contrib/lite:testdata/multi_add.bin)",
-    ],
-    data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = [
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-        ":run_tflite_model_op",
-        ":android_required_build_flags",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:framework",
-                "//tensorflow/core:framework_internal",
-                "//tensorflow/core:lib",
-                "//tensorflow/core:ops",
-                "//tensorflow/core:protos_all_cc",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "stage",
-    hdrs = ["stage.h"],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/cc:scope",
-    ],
-)
-
-cc_library(
-    name = "file_reader_stage",
-    srcs = ["file_reader_stage.cc"],
-    hdrs = ["file_reader_stage.h"],
-    deps = [
-        ":stage",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-    ],
-)
-
-tf_cc_test(
-    name = "file_reader_stage_test",
-    srcs = ["file_reader_stage_test.cc"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":file_reader_stage",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core/kernels:android_whole_file_read_ops",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "run_tflite_model_stage",
-    srcs = ["run_tflite_model_stage.cc"],
-    hdrs = ["run_tflite_model_stage.h"],
-    copts = tflite_copts(),
-    deps = [
-        ":run_tflite_model_op",
-        ":stage",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-    ],
-)
-
-cc_library(
-    name = "accuracy_eval_stage",
-    hdrs = ["accuracy_eval_stage.h"],
-    copts = tflite_copts(),
-    deps = [
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "eval_pipeline",
-    srcs = ["eval_pipeline.cc"],
-    hdrs = ["eval_pipeline.h"],
-    copts = tflite_copts(),
-    deps = [
-        ":accuracy_eval_stage",
-        ":stage",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:core_cpu",
-            ],
-        },
-    ),
-)
-
-tf_cc_test(
-    name = "eval_pipeline_test",
-    srcs = ["eval_pipeline_test.cc"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":eval_pipeline",
-        "//tensorflow/cc:cc_ops",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:ops",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "eval_pipeline_builder",
-    srcs = ["eval_pipeline_builder.cc"],
-    hdrs = ["eval_pipeline_builder.h"],
-    copts = tflite_copts(),
-    deps = [
-        ":eval_pipeline",
-        ":accuracy_eval_stage",
-        ":stage",
-        "@com_google_absl//absl/memory",
-        "//tensorflow/cc:cc_ops",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:ops",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
-tf_cc_test(
-    name = "eval_pipeline_builder_test",
-    srcs = ["eval_pipeline_builder_test.cc"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":eval_pipeline_builder",
-        "//tensorflow/cc:cc_ops",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:ops",
-                "//tensorflow/core:tensorflow",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "csv_writer",
-    hdrs = ["csv_writer.h"],
-    copts = tflite_copts(),
-    deps = select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:lib",
-            ],
-        },
-    ),
-)
-
-tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD
deleted file mode 100644
index 98e2835b2ebd2f7918a939fb89aebec0fd54fb43..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/BUILD
+++ /dev/null
@@ -1,182 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
-load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
-
-common_linkopts = tflite_linkopts() + select({
-    "//conditions:default": [],
-    "//tensorflow:android": [
-        "-pie",
-        "-llog",
-    ],
-})
-
-cc_library(
-    name = "inception_preprocessing",
-    srcs = ["inception_preprocessing.cc"],
-    hdrs = ["inception_preprocessing.h"],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags",
-        "//tensorflow/contrib/lite/tools/accuracy:stage",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core/kernels:android_tensorflow_image_op",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:tensorflow",
-                "//tensorflow/core:protos_all_cc",
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:framework",
-                "//tensorflow/core:lib",
-                "//tensorflow/core:ops",
-            ],
-        },
-    ),
-)
-
-tf_cc_test(
-    name = "inception_preprocessing_test",
-    srcs = ["inception_preprocessing_test.cc"],
-    args = [
-        "--test_image=$(location :testdata/grace_hopper.jpg)",
-    ],
-    data = [":testdata/grace_hopper.jpg"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = [
-        "no_oss",  # b/114307765
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":inception_preprocessing",
-        "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:core_cpu",
-                "//tensorflow/core:framework_internal",
-                "//tensorflow/core:lib",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "imagenet_topk_eval",
-    srcs = ["imagenet_topk_eval.cc"],
-    hdrs = ["imagenet_topk_eval.h"],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/contrib/lite/tools/accuracy:accuracy_eval_stage",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-                "//tensorflow/core:lib",
-            ],
-        },
-    ),
-)
-
-tf_cc_test(
-    name = "imagenet_topk_eval_test",
-    srcs = ["imagenet_topk_eval_test.cc"],
-    linkopts = common_linkopts,
-    linkstatic = 1,
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":imagenet_topk_eval",
-        "@com_google_googletest//:gtest",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core:android_tensorflow_test_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:framework",
-            ],
-        },
-    ),
-)
-
-cc_library(
-    name = "imagenet_model_evaluator",
-    srcs = ["imagenet_model_evaluator.cc"],
-    hdrs = ["imagenet_model_evaluator.h"],
-    copts = tflite_copts(),
-    deps = [
-        ":imagenet_topk_eval",
-        ":inception_preprocessing",
-        "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags",
-        "//tensorflow/contrib/lite/tools/accuracy:eval_pipeline",
-        "//tensorflow/contrib/lite/tools/accuracy:eval_pipeline_builder",
-        "//tensorflow/contrib/lite/tools/accuracy:file_reader_stage",
-        "//tensorflow/contrib/lite/tools/accuracy:run_tflite_model_stage",
-        "//tensorflow/contrib/lite/tools/accuracy:utils",
-        "@com_google_absl//absl/memory",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-                "//tensorflow/core/kernels:android_whole_file_read_ops",
-                "//tensorflow/core/kernels:android_tensorflow_image_op",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:tensorflow",
-                "//tensorflow/core:lib_internal",
-                "//tensorflow/core:framework_internal",
-                "//tensorflow/core:framework",
-                "//tensorflow/core:lib",
-                "//tensorflow/core:core_cpu",
-            ],
-        },
-    ),
-)
-
-tf_cc_binary(
-    name = "imagenet_accuracy_eval",
-    srcs = ["imagenet_accuracy_eval.cc"],
-    copts = tflite_copts(),
-    linkopts = common_linkopts,
-    deps = [
-        ":imagenet_model_evaluator",
-        ":imagenet_topk_eval",
-        "@com_google_absl//absl/memory",
-        "//tensorflow/contrib/lite/tools/accuracy:android_required_build_flags",
-        "//tensorflow/contrib/lite/tools/accuracy:csv_writer",
-    ] + select(
-        {
-            "//tensorflow:android": [
-                "//tensorflow/core:android_tensorflow_lib",
-            ],
-            "//conditions:default": [
-                "//tensorflow/core:lib",
-                "//tensorflow/core:framework_internal",
-            ],
-        },
-    ),
-)
-
-tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md b/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md
deleted file mode 100644
index 362ea3ac34f60a93ec242bf11306c5798b982035..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/README.md
+++ /dev/null
@@ -1,146 +0,0 @@
-## Accuracy evaluation for ILSVRC 2012 (Imagenet Large Scale Visual Recognition Challenge) image classification task
-
-This binary can evaluate the accuracy of TFLite models trained for the [ILSVRC 2012 image classification task]
-(http://www.image-net.org/challenges/LSVRC/2012/).
-The binary takes the path to validation images and labels as inputs. It outputs the accuracy after running the TFLite model on the validation sets.
-
-To run the binary download the ILSVRC 2012 devkit [see instructions](#downloading-ilsvrc) and run the [`generate_validation_ground_truth` script](#ground-truth-label-generation) to generate the ground truth labels.
-
-## Parameters
-The binary takes the following parameters:
-
-*   `model_file` : `string` \
-    Path to the TFlite model file.
-
-*   `ground_truth_images_path`: `string` \
-    The path to the directory containing ground truth images.
-
-*   `ground_truth_labels`: `string` \
-    Path to ground truth labels file. This file should contain the same number of labels as    the number images in the ground truth directory. The labels are assumed to be in the
-    same order as the sorted filename of images. See [ground truth label generation](#ground-truth-label-generation)
-    section for more information about how to generate labels for images.
-
-*    `model_output_labels`: `string` \
-    Path to the file containing labels, that is used to interpret the output of
-    the model. E.g. in case of mobilenets, this is the path to
-    `mobilenet_labels.txt` where each label is in the same order as the output
-    1001 dimension tensor.
-
-*   `output_path`: `string` \
-    This is the path to the output file. The output is a CSV file that has top-10 accuracies in each row. Each line of output file is the cumulative accuracy after processing images in a sorted order. So first line is accuracy after processing the first image, second line is accuracy after procesing first two images. The last line of the file is accuracy after processing the entire validation set.
-
-and the following optional parameters:
-
-*   `blacklist_file_path`: `string` \
-    Path to blacklist file. This file contains the indices of images that are blacklisted for evaluation. 1762 images are blacklisted in ILSVRC dataset. For details please refer to readme.txt of ILSVRC2014 devkit.
-
-*   `num_images`: `int` (default=0) \
-    The number of images to process, if 0, all images in the directory are processed otherwise only num_images will be processed.
-
-*   `num_threads`: `int` (default=4) \
-    The number of threads to use for evaluation.
-
-
-## Downloading ILSVRC
-In order to use this tool to run evaluation on the full 50K ImageNet dataset,
-download the data set from http://image-net.org/request.
-
-## Ground truth label generation
-The ILSVRC 2012 devkit `validation_ground_truth.txt` contains IDs that correspond to synset of the image. 
-The accuracy binary however expects the ground truth labels to contain the actual name of 
-category instead of synset ids. A conversion script has been provided to convert the validation ground truth to
-category labels. The `validation_ground_truth.txt` can be converted by the following steps:
-
-```
-ILSVRC_2012_DEVKIT_DIR=[set to path to ILSVRC 2012 devkit]
-VALIDATION_LABELS=[set to  path to output]
-
-python generate_validation_labels.py -- \
---ilsvrc_devkit_dir=${ILSVRC_2012_DEVKIT_DIR} \
---validation_labels_output=${VALIDATION_LABELS}
-```
-
-## Running the binary
-
-### On Android
-
-(0) Refer to https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android for configuring NDK and SDK.
-
-(1) Build using the following command:
-
-```
-bazel build -c opt \
-  --config=android_arm \
-  --config=monolithic \
-  --cxxopt='--std=c++11' \
-  --copt=-D__ANDROID_TYPES_FULL__ \
-  --copt=-DSUPPORT_SELECTIVE_REGISTRATION \
-  //tensorflow/contrib/lite/tools/accuracy/ilsvrc:imagenet_accuracy_eval
-```
-
-(2) Connect your phone. Push the binary to your phone with adb push
-     (make the directory if required):
-
-```
-adb push bazel-bin/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval /data/local/tmp
-```
-
-(3) Make the binary executable.
-
-```
-adb shell chmod +x /data/local/tmp/imagenet_accuracy_eval
-```
-
-(4) Push the TFLite model  that you need to test. For example:
-
-```
-adb push mobilenet_quant_v1_224.tflite /data/local/tmp
-```
-
-(5) Push the imagenet images to device, make sure device has sufficient storage available before pushing the dataset:
-
-```
-adb shell mkdir /data/local/tmp/ilsvrc_images && \
-adb push ${IMAGENET_IMAGES_DIR} /data/local/tmp/ilsvrc_images
-```
-
-(6) Push the generated validation ground labels to device.
-
-```
-adb push ${VALIDATION_LABELS} /data/local/tmp/ilsvrc_validation_labels.txt
-```
-
-(7) Push the model labels text file to device.
-
-```
-adb push ${MODEL_LABELS_TXT} /data/local/tmp/model_output_labels.txt
-```
-
-(8) Run the binary.
-
-```
-adb shell /data/local/tmp/imagenet_accuracy_eval \
-  --model_file=/data/local/tmp/mobilenet_quant_v1_224.tflite \
-  --ground_truth_images_path=/data/local/tmp/ilsvrc_images \
-  --ground_truth_labels=/data/local/tmp/ilsvrc_validation_labels.txt \
-  --model_output_labels=/data/local/tmp/model_output_labels.txt \
-  --output_file_path=/data/local/tmp/accuracy_output.txt \
-  --num_images=0 # Run on all images.
-```
-
-###  On Desktop
-
-(1) Build and run using the following command:
-
-```
-bazel run -c opt \
-  --cxxopt='--std=c++11' \
-  -- \
-  //tensorflow/contrib/lite/tools/accuracy/ilsvrc:imagenet_accuracy_eval \
-  --model_file=mobilenet_quant_v1_224.tflite \
-  --ground_truth_images_path=${IMAGENET_IMAGES_DIR} \
-  --ground_truth_labels=${VALIDATION_LABELS} \
-  --model_output_labels=${MODEL_LABELS_TXT} \
-  --output_file_path=/tmp/accuracy_output.txt \
-  --num_images=0 # Run on all images.
-```
diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
deleted file mode 100644
index 71bf61657ea165e43099c32d83512e7d8431a346..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ /dev/null
@@ -1,145 +0,0 @@
-package(default_visibility = [
-    "//visibility:public",
-])
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
-
-common_copts = ["-Wall"] + tflite_copts()
-
-cc_library(
-    name = "logging",
-    hdrs = ["logging.h"],
-    copts = common_copts,
-)
-
-cc_binary(
-    name = "benchmark_model",
-    srcs = [
-        "benchmark_main.cc",
-    ],
-    copts = common_copts,
-    linkopts = tflite_linkopts() + select({
-        "//tensorflow:android": [
-            "-pie",  # Android 5.0 and later supports only PIE
-            "-lm",  # some builtin ops, e.g., tanh, need -lm
-        ],
-        "//conditions:default": [],
-    }),
-    deps = [
-        ":benchmark_tflite_model_lib",
-        ":logging",
-    ],
-)
-
-cc_binary(
-    name = "benchmark_model_plus_flex",
-    srcs = [
-        "benchmark_main.cc",
-    ],
-    copts = common_copts,
-    linkopts = tflite_linkopts() + select({
-        "//tensorflow:android": [
-            "-pie",  # Android 5.0 and later supports only PIE
-            "-lm",  # some builtin ops, e.g., tanh, need -lm
-        ],
-        "//conditions:default": [],
-    }),
-    deps = [
-        ":benchmark_tflite_model_lib",
-        ":logging",
-        "//tensorflow/contrib/lite/delegates/flex:delegate",
-    ],
-)
-
-cc_test(
-    name = "benchmark_test",
-    srcs = ["benchmark_test.cc"],
-    args = [
-        "--graph=$(location //tensorflow/contrib/lite:testdata/multi_add.bin)",
-    ],
-    data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
-    tags = [
-        "tflite_not_portable_android",
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":benchmark_tflite_model_lib",
-        ":command_line_flags",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "command_line_flags",
-    srcs = ["command_line_flags.cc"],
-    hdrs = ["command_line_flags.h"],
-    copts = common_copts,
-)
-
-cc_test(
-    name = "command_line_flags_test",
-    srcs = ["command_line_flags_test.cc"],
-    copts = common_copts,
-    visibility = ["//visibility:private"],
-    deps = [
-        ":command_line_flags",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "benchmark_tflite_model_lib",
-    srcs = [
-        "benchmark_tflite_model.cc",
-        "logging.h",
-    ],
-    hdrs = ["benchmark_tflite_model.h"],
-    copts = common_copts,
-    deps = [
-        ":benchmark_model_lib",
-        ":logging",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/profiling:profile_summarizer",
-    ],
-)
-
-cc_library(
-    name = "benchmark_params",
-    srcs = [
-        "benchmark_params.cc",
-    ],
-    hdrs = ["benchmark_params.h"],
-    copts = common_copts,
-    deps = [":logging"],
-)
-
-cc_library(
-    name = "benchmark_model_lib",
-    srcs = [
-        "benchmark_model.cc",
-    ],
-    hdrs = ["benchmark_model.h"],
-    copts = common_copts,
-    deps = [
-        ":benchmark_params",
-        ":command_line_flags",
-        ":logging",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/profiling:profile_summarizer",
-        "//tensorflow/contrib/lite/profiling:profiler",
-        "//tensorflow/contrib/lite/profiling:time",
-        "//tensorflow/core:stats_calculator_portable",
-    ],
-)
-
-tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/tools/benchmark/README.md b/tensorflow/contrib/lite/tools/benchmark/README.md
deleted file mode 100644
index 8d997639fb7a363f911b1183dfb05d8138e4c531..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/benchmark/README.md
+++ /dev/null
@@ -1,197 +0,0 @@
-# TFLite Model Benchmark Tool
-
-## Description
-
-A simple C++ binary to benchmark a TFLite model and its individual operators,
-both on desktop machines and on Android. The binary takes a TFLite model,
-generates random inputs and then repeatedly runs the model for specified number
-of runs. Aggregrate latency statistics are reported after running the benchmark.
-
-The instructions below are for running the binary on Desktop and Android,
-for iOS please use the
-[iOS benchmark app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark/ios).
-
-## Parameters
-
-The binary takes the following required parameters:
-
-*   `graph`: `string` \
-    The path to the TFLite model file.
-
-and the following optional parameters:
-
-*   `num_threads`: `int` (default=1) \
-    The number of threads to use for running TFLite interpreter.
-*   `warmup_runs`: `int` (default=1) \
-    The number of warmup runs to do before starting the benchmark.
-*   `num_runs`: `int` (default=50) \
-    The number of runs. Increase this to reduce variance.
-*   `run_delay`: `float` (default=-1.0) \
-    The delay in seconds between subsequent benchmark runs. Non-positive values
-    mean use no delay.
-*   `use_nnapi`: `bool` (default=false) \
-    Whether to use [Android NNAPI](https://developer.android.com/ndk/guides/neuralnetworks/).
-    This API is available on recent Android devices.
-
-## To build/install/run
-
-### On Android:
-
-(0) Refer to https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android to edit the `WORKSPACE` to configure the android NDK/SDK.
-
-(1) Build for your specific platform, e.g.:
-
-```
-bazel build -c opt \
-  --config=android_arm \
-  --cxxopt='--std=c++11' \
-  tensorflow/contrib/lite/tools/benchmark:benchmark_model
-```
-
-(2) Connect your phone. Push the binary to your phone with adb push
-     (make the directory if required):
-
-```
-adb push bazel-bin/tensorflow/contrib/lite/tools/benchmark/benchmark_model /data/local/tmp
-```
-
-(3) Make the binary executable.
-
-```
-adb shell chmod +x /data/local/tmp/benchmark_model
-```
-
-(4) Push the compute graph that you need to test. For example:
-
-```
-adb push mobilenet_quant_v1_224.tflite /data/local/tmp
-```
-
-(5) Run the benchmark. For example:
-
-```
-adb shell /data/local/tmp/benchmark_model \
-  --graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \
-  --num_threads=4
-```
-
-### On desktop:
-(1) build the binary
-
-```
-bazel build -c opt tensorflow/contrib/lite/tools/benchmark:benchmark_model
-```
-
-(2) Run on your compute graph, similar to the Android case but without the need of adb shell.
-For example:
-
-```
-bazel-bin/tensorflow/contrib/lite/tools/benchmark/benchmark_model \
-  --graph=mobilenet_quant_v1_224.tflite \
-  --num_threads=4
-```
-
-The MobileNet graph used as an example here may be downloaded from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip).
-
-
-## Reducing variance between runs on Android.
-
-Most modern Android phones use [ARM big.LITTLE](https://en.wikipedia.org/wiki/ARM_big.LITTLE)
-architecture where some cores are more power hungry but faster than other cores.
-When running benchmarks on these phones there can be significant variance
-between different runs of the benchmark. One way to reduce variance between runs
-is to set the [CPU affinity](https://en.wikipedia.org/wiki/Processor_affinity)
-before running the benchmark. On Android this can be done using the `taskset`
-command.
-E.g. for running the benchmark on big cores on Pixel 2 with a single thread one
-can use the following command:
-
-```
-adb shell taskset f0 /data/local/tmp/benchmark_model \
-  --graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \
-  --num_threads=1
-```
-
-where `f0` is the affinity mask for big cores on Pixel 2.
-Note: The affinity mask varies with the device.
-
-## Profiling model operators
-The benchmark model binary also allows you to profile operators and give execution times of each operator. To do this,
-compile the binary with a compiler flag that enables profiling to be compiled in. Pass **--copt=-DTFLITE_PROFILING_ENABLED**
-to compile benchmark with profiling support.
-For example, to compile with profiling support on Android, add this flag to the previous command:
-
-```
-bazel build -c opt \
-  --config=android_arm \
-  --cxxopt='--std=c++11' \
-  --copt=-DTFLITE_PROFILING_ENABLED \
-  tensorflow/contrib/lite/tools/benchmark:benchmark_model
-```
-This compiles TFLite with profiling enabled, now you can run the benchmark binary like before. The binary will produce detailed statistics for each operation similar to those shown below:
-
-```
-
-============================== Run Order ==============================
-	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
-	                 CONV_2D	    0.000	    4.269	    4.269	  0.107%	  0.107%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
-	       DEPTHWISE_CONV_2D	    4.270	    2.150	    2.150	  0.054%	  0.161%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6]
-	                 CONV_2D	    6.421	    6.107	    6.107	  0.153%	  0.314%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   12.528	    1.366	    1.366	  0.034%	  0.348%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_depthwise/Relu6]
-	                 CONV_2D	   13.895	    4.195	    4.195	  0.105%	  0.454%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   18.091	    1.260	    1.260	  0.032%	  0.485%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_depthwise/Relu6]
-	                 CONV_2D	   19.352	    6.652	    6.652	  0.167%	  0.652%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   26.005	    0.698	    0.698	  0.018%	  0.670%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_depthwise/Relu6]
-	                 CONV_2D	   26.703	    3.344	    3.344	  0.084%	  0.754%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   30.047	    0.646	    0.646	  0.016%	  0.770%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_depthwise/Relu6]
-	                 CONV_2D	   30.694	    5.800	    5.800	  0.145%	  0.915%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   36.495	    0.331	    0.331	  0.008%	  0.924%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6]
-	                 CONV_2D	   36.826	    2.838	    2.838	  0.071%	  0.995%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   39.665	    0.439	    0.439	  0.011%	  1.006%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6]
-	                 CONV_2D	   40.105	    5.293	    5.293	  0.133%	  1.139%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   45.399	    0.352	    0.352	  0.009%	  1.147%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6]
-	                 CONV_2D	   45.752	    5.322	    5.322	  0.133%	  1.281%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   51.075	    0.357	    0.357	  0.009%	  1.290%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6]
-	                 CONV_2D	   51.432	    5.693	    5.693	  0.143%	  1.433%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   57.126	    0.366	    0.366	  0.009%	  1.442%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_depthwise/Relu6]
-	                 CONV_2D	   57.493	    5.472	    5.472	  0.137%	  1.579%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   62.966	    0.364	    0.364	  0.009%	  1.588%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_depthwise/Relu6]
-	                 CONV_2D	   63.330	    5.404	    5.404	  0.136%	  1.724%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   68.735	    0.155	    0.155	  0.004%	  1.728%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_depthwise/Relu6]
-	                 CONV_2D	   68.891	    2.970	    2.970	  0.074%	  1.802%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   71.862	    0.206	    0.206	  0.005%	  1.807%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_depthwise/Relu6]
-	                 CONV_2D	   72.069	    5.888	    5.888	  0.148%	  1.955%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
-	         AVERAGE_POOL_2D	   77.958	    0.036	    0.036	  0.001%	  1.956%	     0.000	        0	[MobilenetV1/Logits/AvgPool_1a/AvgPool]
-	                 CONV_2D	   77.994	    1.445	    1.445	  0.036%	  1.992%	     0.000	        0	[MobilenetV1/Logits/Conv2d_1c_1x1/BiasAdd]
-	                 RESHAPE	   79.440	    0.002	    0.002	  0.000%	  1.992%	     0.000	        0	[MobilenetV1/Predictions/Reshape]
-	                 SOFTMAX	   79.443	    0.029	    0.029	  0.001%	  1.993%	     0.000	        0	[MobilenetV1/Predictions/Softmax]
-
-============================== Top by Computation Time ==============================
-	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
-	                 CONV_2D	   19.352	    6.652	    6.652	  0.167%	  0.167%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
-	                 CONV_2D	    6.421	    6.107	    6.107	  0.153%	  0.320%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
-	                 CONV_2D	   72.069	    5.888	    5.888	  0.148%	  0.468%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
-	                 CONV_2D	   30.694	    5.800	    5.800	  0.145%	  0.613%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
-	                 CONV_2D	   51.432	    5.693	    5.693	  0.143%	  0.756%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
-	                 CONV_2D	   57.493	    5.472	    5.472	  0.137%	  0.893%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
-	                 CONV_2D	   63.330	    5.404	    5.404	  0.136%	  1.029%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6]
-	                 CONV_2D	   45.752	    5.322	    5.322	  0.133%	  1.162%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
-	                 CONV_2D	   40.105	    5.293	    5.293	  0.133%	  1.295%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
-	                 CONV_2D	    0.000	    4.269	    4.269	  0.107%	  1.402%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
-
-Number of nodes executed: 31
-============================== Summary by node type ==============================
-	             [Node type]	  [count]	  [avg ms]	    [avg %]	    [cdf %]	  [mem KB]	[times called]
-	                 CONV_2D	       15	     1.406	    89.270%	    89.270%	     0.000	        0
-	       DEPTHWISE_CONV_2D	       13	     0.169	    10.730%	   100.000%	     0.000	        0
-	                 SOFTMAX	        1	     0.000	     0.000%	   100.000%	     0.000	        0
-	                 RESHAPE	        1	     0.000	     0.000%	   100.000%	     0.000	        0
-	         AVERAGE_POOL_2D	        1	     0.000	     0.000%	   100.000%	     0.000	        0
-
-Timings (microseconds): count=50 first=79449 curr=81350 min=77385 max=88213 avg=79732 std=1929
-Memory (bytes): count=0
-31 nodes observed
-
-
-Average inference timings in us: Warmup: 83235, Init: 38467, no stats: 79760.9
-```
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
deleted file mode 100644
index f86c0445b0525cd053c733b18bb7f1205d310d43..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/lite/tools/benchmark/benchmark_model.h"
-
-#include <time.h>
-
-#include <iostream>
-#include <sstream>
-
-#include "tensorflow/contrib/lite/profiling/time.h"
-#include "tensorflow/contrib/lite/tools/benchmark/logging.h"
-
-namespace {
-void SleepForSeconds(double sleep_seconds) {
-  if (sleep_seconds <= 0.0) {
-    return;
-  }
-  // Convert the run_delay string into a timespec.
-  timespec req;
-  req.tv_sec = static_cast<time_t>(sleep_seconds);
-  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
-  // If requested, sleep between runs for an arbitrary amount of time.
-  // This can be helpful to determine the effect of mobile processor
-  // scaling and thermal throttling.
-#ifdef PLATFORM_WINDOWS
-  Sleep(sleep_seconds * 1000);
-#else
-  nanosleep(&req, nullptr);
-#endif
-}
-
-}  // namespace
-
-namespace tflite {
-namespace benchmark {
-using tensorflow::Stat;
-
-BenchmarkParams BenchmarkModel::DefaultParams() {
-  BenchmarkParams params;
-  params.AddParam("num_runs", BenchmarkParam::Create<int32_t>(50));
-  params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
-  params.AddParam("num_threads", BenchmarkParam::Create<int32_t>(1));
-  params.AddParam("benchmark_name", BenchmarkParam::Create<std::string>(""));
-  params.AddParam("output_prefix", BenchmarkParam::Create<std::string>(""));
-  params.AddParam("warmup_runs", BenchmarkParam::Create<int32_t>(1));
-  return params;
-}
-
-BenchmarkModel::BenchmarkModel() : params_(DefaultParams()) {}
-
-void BenchmarkLoggingListener::OnBenchmarkEnd(const BenchmarkResults &results) {
-  auto inference_us = results.inference_time_us();
-  auto init_us = results.startup_latency_us();
-  auto warmup_us = results.warmup_time_us();
-  TFLITE_LOG(INFO) << "Average inference timings in us: "
-                   << "Warmup: " << warmup_us.avg() << ", "
-                   << "Init: " << init_us << ", "
-                   << "no stats: " << inference_us.avg();
-}
-
-std::vector<Flag> BenchmarkModel::GetFlags() {
-  return {
-      CreateFlag<int32_t>("num_runs", &params_, "number of runs"),
-      CreateFlag<float>("run_delay", &params_, "delay between runs in seconds"),
-      CreateFlag<int32_t>("num_threads", &params_, "number of threads"),
-      CreateFlag<std::string>("benchmark_name", &params_, "benchmark name"),
-      CreateFlag<std::string>("output_prefix", &params_,
-                              "benchmark output prefix"),
-      CreateFlag<int32_t>("warmup_runs", &params_,
-                          "how many runs to initialize model"),
-  };
-}
-
-void BenchmarkModel::LogParams() {
-  TFLITE_LOG(INFO) << "Num runs: [" << params_.Get<int32_t>("num_runs") << "]";
-  TFLITE_LOG(INFO) << "Inter-run delay (seconds): ["
-                   << params_.Get<float>("run_delay") << "]";
-  TFLITE_LOG(INFO) << "Num threads: [" << params_.Get<int32_t>("num_threads")
-                   << "]";
-  TFLITE_LOG(INFO) << "Benchmark name: ["
-                   << params_.Get<std::string>("benchmark_name") << "]";
-  TFLITE_LOG(INFO) << "Output prefix: ["
-                   << params_.Get<std::string>("output_prefix") << "]";
-  TFLITE_LOG(INFO) << "Warmup runs: [" << params_.Get<int32_t>("warmup_runs")
-                   << "]";
-}
-
-void BenchmarkModel::PrepareInputsAndOutputs() {}
-
-Stat<int64_t> BenchmarkModel::Run(int num_times, RunType run_type) {
-  Stat<int64_t> run_stats;
-  TFLITE_LOG(INFO) << "Running benchmark for " << num_times << " iterations ";
-  for (int run = 0; run < num_times; run++) {
-    PrepareInputsAndOutputs();
-    listeners_.OnSingleRunStart(run_type);
-    int64_t start_us = profiling::time::NowMicros();
-    RunImpl();
-    int64_t end_us = profiling::time::NowMicros();
-    listeners_.OnSingleRunEnd();
-
-    run_stats.UpdateStat(end_us - start_us);
-    SleepForSeconds(params_.Get<float>("run_delay"));
-  }
-
-  std::stringstream stream;
-  run_stats.OutputToStream(&stream);
-  TFLITE_LOG(INFO) << stream.str() << std::endl;
-
-  return run_stats;
-}
-
-bool BenchmarkModel::ValidateParams() { return true; }
-
-void BenchmarkModel::Run(int argc, char **argv) {
-  if (!ParseFlags(argc, argv)) {
-    return;
-  }
-  Run();
-}
-
-void BenchmarkModel::Run() {
-  ValidateParams();
-  LogParams();
-
-  listeners_.OnBenchmarkStart(params_);
-  int64_t initialization_start_us = profiling::time::NowMicros();
-  Init();
-  int64_t initialization_end_us = profiling::time::NowMicros();
-  int64_t startup_latency_us = initialization_end_us - initialization_start_us;
-  TFLITE_LOG(INFO) << "Initialized session in " << startup_latency_us / 1e3
-                   << "ms";
-
-  uint64_t input_bytes = ComputeInputBytes();
-  Stat<int64_t> warmup_time_us =
-      Run(params_.Get<int32_t>("warmup_runs"), WARMUP);
-  Stat<int64_t> inference_time_us =
-      Run(params_.Get<int32_t>("num_runs"), REGULAR);
-  listeners_.OnBenchmarkEnd(
-      {startup_latency_us, input_bytes, warmup_time_us, inference_time_us});
-}
-
-bool BenchmarkModel::ParseFlags(int argc, char **argv) {
-  auto flag_list = GetFlags();
-  const bool parse_result =
-      Flags::Parse(&argc, const_cast<const char **>(argv), flag_list);
-  if (!parse_result) {
-    std::string usage = Flags::Usage(argv[0], flag_list);
-    TFLITE_LOG(ERROR) << usage;
-    return false;
-  }
-  return true;
-}
-
-}  // namespace benchmark
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc
deleted file mode 100644
index b697bb394db9b967dfaaff649517dcc23e85ccb0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/testing/util.h"
-#include "tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h"
-#include "tensorflow/contrib/lite/tools/benchmark/command_line_flags.h"
-
-namespace {
-const std::string* g_model_path = nullptr;
-}
-
-namespace tflite {
-namespace benchmark {
-namespace {
-
-BenchmarkParams CreateParams() {
-  BenchmarkParams params;
-  params.AddParam("num_runs", BenchmarkParam::Create<int32_t>(2));
-  params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
-  params.AddParam("num_threads", BenchmarkParam::Create<int32_t>(1));
-  params.AddParam("benchmark_name", BenchmarkParam::Create<std::string>(""));
-  params.AddParam("output_prefix", BenchmarkParam::Create<std::string>(""));
-  params.AddParam("warmup_runs", BenchmarkParam::Create<int32_t>(1));
-  params.AddParam("graph", BenchmarkParam::Create<std::string>(*g_model_path));
-  params.AddParam("input_layer", BenchmarkParam::Create<std::string>(""));
-  params.AddParam("input_layer_shape", BenchmarkParam::Create<std::string>(""));
-  params.AddParam("use_nnapi", BenchmarkParam::Create<bool>(false));
-  return params;
-}
-
-TEST(BenchmarkTest, DoesntCrash) {
-  ASSERT_THAT(g_model_path, testing::NotNull());
-
-  BenchmarkTfLiteModel benchmark(CreateParams());
-  benchmark.Run();
-}
-
-}  // namespace
-}  // namespace benchmark
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  std::string model_path;
-  std::vector<tflite::Flag> flags = {
-      tflite::Flag::CreateFlag("graph", &model_path, "Path to model file.")};
-  g_model_path = &model_path;
-  const bool parse_result =
-      tflite::Flags::Parse(&argc, const_cast<const char**>(argv), flags);
-  if (!parse_result) {
-    std::cerr << tflite::Flags::Usage(argv[0], flags);
-    return 1;
-  }
-
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
deleted file mode 100644
index 25a302b2aaea400ea66e76dae3e6add71180e1cc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_TFLITE_MODEL_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_TFLITE_MODEL_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/profiling/profile_summarizer.h"
-#include "tensorflow/contrib/lite/tools/benchmark/benchmark_model.h"
-
-namespace tflite {
-namespace benchmark {
-
-// Dumps profiling events if profiling is enabled
-class ProfilingListener : public BenchmarkListener {
- public:
-  explicit ProfilingListener() : interpreter_(nullptr), has_profiles_(false) {}
-
-  void SetInterpreter(Interpreter* interpreter);
-
-  void OnSingleRunStart(RunType run_type) override;
-
-  void OnSingleRunEnd() override;
-
-  void OnBenchmarkEnd(const BenchmarkResults& results) override;
-
- private:
-  Interpreter* interpreter_;
-  profiling::Profiler profiler_;
-  profiling::ProfileSummarizer summarizer_;
-  bool has_profiles_;
-};
-
-// Benchmarks a TFLite model by running tflite interpreter.
-class BenchmarkTfLiteModel : public BenchmarkModel {
- public:
-  BenchmarkTfLiteModel();
-  BenchmarkTfLiteModel(BenchmarkParams params);
-  virtual ~BenchmarkTfLiteModel() {}
-
-  std::vector<Flag> GetFlags() override;
-  void LogParams() override;
-  bool ValidateParams() override;
-  uint64_t ComputeInputBytes() override;
-  void Init() override;
-  void RunImpl() override;
-
-  struct InputLayerInfo {
-    std::string name;
-    std::vector<int> shape;
-  };
-
- protected:
-  void PrepareInputsAndOutputs() override;
-
- private:
-  std::unique_ptr<tflite::FlatBufferModel> model;
-  std::unique_ptr<tflite::Interpreter> interpreter;
-  std::vector<InputLayerInfo> inputs;
-  ProfilingListener profiling_listener_;
-};
-
-}  // namespace benchmark
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_TFLITE_MODEL_H_
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/README.md b/tensorflow/contrib/lite/tools/benchmark/ios/README.md
deleted file mode 100644
index 46144f7bf8e142b960d3fe1068686e366bb6c198..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/benchmark/ios/README.md
+++ /dev/null
@@ -1,43 +0,0 @@
-# TFLite iOS benchmark app.
-
-## Description
-
-An iOS app to benchmark TFLite models.
-
-The app reads benchmark parameters from a JSON file named `benchmark_params.json`
-in its `benchmark_data` directory. Any downloaded models for benchmarking should
-also be placed in `benchmark_data` directory.
-
-The JSON file specifies the name of the model file and other benchmarking
-parameters like inputs to the model, type of inputs, number of iterations,
-number of threads. The default values in the JSON file are for the
-Mobilenet_1.0_224 model
-([paper](https://arxiv.org/pdf/1704.04861.pdf),
-[tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz))
-
-## To build/install/run
-
-- Follow instructions at
-[iOS build for TFLite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/ios.md)
-to build TFLite.
-
-Running
-
-```bash
-tensorflow/contrib/lite/build_ios_universal_lib.sh
-```
-will also build `tensorflow/contrib/lite/gen/lib/benchmark-lib.a` .
-
-- Now copy the downloaded model file to `benchmark_data` directory. 
-
-- Modify `benchmark_params.json` change the `input_layer`, `input_layer_shape`
-and other benchmark parameters.
-
-- Change `Build Phases -> Copy Bundle Resources` and add the model file to the
-resources that need to be copied.
-
-- Ensure that `Build Phases -> Link Binary With Library` contains the 
-`Accelerate framework` and `tensorflow/contrib/lite/gen/lib/benchmark-lib.a`.
-
-- Now try running the app. The app has a single button that runs the benchmark
-  on the model and displays results in a text view below.
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj
deleted file mode 100644
index b908f733d49b56a6b41ebea4185f1fe8c11edc60..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj
+++ /dev/null
@@ -1,381 +0,0 @@
-// !$*UTF8*$!
-{
-	archiveVersion = 1;
-	classes = {
-	};
-	objectVersion = 50;
-	objects = {
-
-/* Begin PBXBuildFile section */
-		6FE7579A20D59CE500F01636 /* benchmark_params.json in Resources */ = {isa = PBXBuildFile; fileRef = 6FE7579920D59CE500F01636 /* benchmark_params.json */; };
-		6FE7579D20D5A5E000F01636 /* benchmark-lib.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 6FE7579C20D5A5E000F01636 /* benchmark-lib.a */; };
-		6FE7579F20D5A6A700F01636 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 6FE7579E20D5A6A700F01636 /* Accelerate.framework */; };
-		6FE757A120D5AB8100F01636 /* mobilenet_v1_1.0_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = 6FE757A020D5AB8000F01636 /* mobilenet_v1_1.0_224.tflite */; };
-		6FE93FFD20D592D8008C9FE4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 6FE93FFC20D592D8008C9FE4 /* AppDelegate.m */; };
-		6FE9400020D592D8008C9FE4 /* BenchmarkViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 6FE93FFF20D592D8008C9FE4 /* BenchmarkViewController.mm */; };
-		6FE9400320D592D8008C9FE4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 6FE9400120D592D8008C9FE4 /* Main.storyboard */; };
-		6FE9400520D592DA008C9FE4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 6FE9400420D592DA008C9FE4 /* Assets.xcassets */; };
-		6FE9400B20D592DA008C9FE4 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6FE9400A20D592DA008C9FE4 /* main.m */; };
-/* End PBXBuildFile section */
-
-/* Begin PBXFileReference section */
-		6FE7579920D59CE500F01636 /* benchmark_params.json */ = {isa = PBXFileReference; lastKnownFileType = text.json; path = benchmark_params.json; sourceTree = "<group>"; };
-		6FE7579C20D5A5E000F01636 /* benchmark-lib.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "benchmark-lib.a"; path = "$SRCROOT/../../../../../../../tensorflow/contrib/lite/gen/lib/benchmark-lib.a"; sourceTree = "<group>"; };
-		6FE7579E20D5A6A700F01636 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
-		6FE757A020D5AB8000F01636 /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
-		6FE93FF820D592D8008C9FE4 /* TFLiteBenchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = TFLiteBenchmark.app; sourceTree = BUILT_PRODUCTS_DIR; };
-		6FE93FFB20D592D8008C9FE4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
-		6FE93FFC20D592D8008C9FE4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
-		6FE93FFE20D592D8008C9FE4 /* BenchmarkViewController.h */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.h; path = BenchmarkViewController.h; sourceTree = "<group>"; };
-		6FE93FFF20D592D8008C9FE4 /* BenchmarkViewController.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = BenchmarkViewController.mm; sourceTree = "<group>"; };
-		6FE9400220D592D8008C9FE4 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
-		6FE9400420D592DA008C9FE4 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
-		6FE9400920D592DA008C9FE4 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
-		6FE9400A20D592DA008C9FE4 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
-/* End PBXFileReference section */
-
-/* Begin PBXFrameworksBuildPhase section */
-		6FE93FF520D592D8008C9FE4 /* Frameworks */ = {
-			isa = PBXFrameworksBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				6FE7579F20D5A6A700F01636 /* Accelerate.framework in Frameworks */,
-				6FE7579D20D5A5E000F01636 /* benchmark-lib.a in Frameworks */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXFrameworksBuildPhase section */
-
-/* Begin PBXGroup section */
-		6FE7579820D59C8B00F01636 /* benchmark_data */ = {
-			isa = PBXGroup;
-			children = (
-				6FE757A020D5AB8000F01636 /* mobilenet_v1_1.0_224.tflite */,
-				6FE7579920D59CE500F01636 /* benchmark_params.json */,
-			);
-			path = benchmark_data;
-			sourceTree = "<group>";
-		};
-		6FE7579B20D5A5E000F01636 /* Frameworks */ = {
-			isa = PBXGroup;
-			children = (
-				6FE7579E20D5A6A700F01636 /* Accelerate.framework */,
-				6FE7579C20D5A5E000F01636 /* benchmark-lib.a */,
-			);
-			name = Frameworks;
-			sourceTree = "<group>";
-		};
-		6FE93FEF20D592D8008C9FE4 = {
-			isa = PBXGroup;
-			children = (
-				6FE93FFA20D592D8008C9FE4 /* TFLiteBenchmark */,
-				6FE93FF920D592D8008C9FE4 /* Products */,
-				6FE7579B20D5A5E000F01636 /* Frameworks */,
-			);
-			sourceTree = "<group>";
-		};
-		6FE93FF920D592D8008C9FE4 /* Products */ = {
-			isa = PBXGroup;
-			children = (
-				6FE93FF820D592D8008C9FE4 /* TFLiteBenchmark.app */,
-			);
-			name = Products;
-			sourceTree = "<group>";
-		};
-		6FE93FFA20D592D8008C9FE4 /* TFLiteBenchmark */ = {
-			isa = PBXGroup;
-			children = (
-				6FE7579820D59C8B00F01636 /* benchmark_data */,
-				6FE93FFB20D592D8008C9FE4 /* AppDelegate.h */,
-				6FE93FFC20D592D8008C9FE4 /* AppDelegate.m */,
-				6FE93FFE20D592D8008C9FE4 /* BenchmarkViewController.h */,
-				6FE93FFF20D592D8008C9FE4 /* BenchmarkViewController.mm */,
-				6FE9400120D592D8008C9FE4 /* Main.storyboard */,
-				6FE9400420D592DA008C9FE4 /* Assets.xcassets */,
-				6FE9400920D592DA008C9FE4 /* Info.plist */,
-				6FE9400A20D592DA008C9FE4 /* main.m */,
-			);
-			path = TFLiteBenchmark;
-			sourceTree = "<group>";
-		};
-/* End PBXGroup section */
-
-/* Begin PBXNativeTarget section */
-		6FE93FF720D592D8008C9FE4 /* TFLiteBenchmark */ = {
-			isa = PBXNativeTarget;
-			buildConfigurationList = 6FE9400E20D592DA008C9FE4 /* Build configuration list for PBXNativeTarget "TFLiteBenchmark" */;
-			buildPhases = (
-				6FE93FF420D592D8008C9FE4 /* Sources */,
-				6FE93FF520D592D8008C9FE4 /* Frameworks */,
-				6FE93FF620D592D8008C9FE4 /* Resources */,
-			);
-			buildRules = (
-			);
-			dependencies = (
-			);
-			name = TFLiteBenchmark;
-			productName = TFLiteBenchmark;
-			productReference = 6FE93FF820D592D8008C9FE4 /* TFLiteBenchmark.app */;
-			productType = "com.apple.product-type.application";
-		};
-/* End PBXNativeTarget section */
-
-/* Begin PBXProject section */
-		6FE93FF020D592D8008C9FE4 /* Project object */ = {
-			isa = PBXProject;
-			attributes = {
-				LastUpgradeCheck = 1000;
-				ORGANIZATIONNAME = Example;
-				TargetAttributes = {
-					6FE93FF720D592D8008C9FE4 = {
-						CreatedOnToolsVersion = 10.0;
-					};
-				};
-			};
-			buildConfigurationList = 6FE93FF320D592D8008C9FE4 /* Build configuration list for PBXProject "TFLiteBenchmark" */;
-			compatibilityVersion = "Xcode 9.3";
-			developmentRegion = en;
-			hasScannedForEncodings = 0;
-			knownRegions = (
-				en,
-				Base,
-			);
-			mainGroup = 6FE93FEF20D592D8008C9FE4;
-			productRefGroup = 6FE93FF920D592D8008C9FE4 /* Products */;
-			projectDirPath = "";
-			projectRoot = "";
-			targets = (
-				6FE93FF720D592D8008C9FE4 /* TFLiteBenchmark */,
-			);
-		};
-/* End PBXProject section */
-
-/* Begin PBXResourcesBuildPhase section */
-		6FE93FF620D592D8008C9FE4 /* Resources */ = {
-			isa = PBXResourcesBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				6FE757A120D5AB8100F01636 /* mobilenet_v1_1.0_224.tflite in Resources */,
-				6FE9400520D592DA008C9FE4 /* Assets.xcassets in Resources */,
-				6FE9400320D592D8008C9FE4 /* Main.storyboard in Resources */,
-				6FE7579A20D59CE500F01636 /* benchmark_params.json in Resources */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXResourcesBuildPhase section */
-
-/* Begin PBXSourcesBuildPhase section */
-		6FE93FF420D592D8008C9FE4 /* Sources */ = {
-			isa = PBXSourcesBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				6FE9400020D592D8008C9FE4 /* BenchmarkViewController.mm in Sources */,
-				6FE9400B20D592DA008C9FE4 /* main.m in Sources */,
-				6FE93FFD20D592D8008C9FE4 /* AppDelegate.m in Sources */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXSourcesBuildPhase section */
-
-/* Begin PBXVariantGroup section */
-		6FE9400120D592D8008C9FE4 /* Main.storyboard */ = {
-			isa = PBXVariantGroup;
-			children = (
-				6FE9400220D592D8008C9FE4 /* Base */,
-			);
-			name = Main.storyboard;
-			sourceTree = "<group>";
-		};
-/* End PBXVariantGroup section */
-
-/* Begin XCBuildConfiguration section */
-		6FE9400C20D592DA008C9FE4 /* Debug */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ALWAYS_SEARCH_USER_PATHS = NO;
-				CLANG_ANALYZER_NONNULL = YES;
-				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
-				CLANG_CXX_LIBRARY = "libc++";
-				CLANG_ENABLE_MODULES = YES;
-				CLANG_ENABLE_OBJC_ARC = YES;
-				CLANG_ENABLE_OBJC_WEAK = YES;
-				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-				CLANG_WARN_BOOL_CONVERSION = YES;
-				CLANG_WARN_COMMA = YES;
-				CLANG_WARN_CONSTANT_CONVERSION = YES;
-				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-				CLANG_WARN_EMPTY_BODY = YES;
-				CLANG_WARN_ENUM_CONVERSION = YES;
-				CLANG_WARN_INFINITE_RECURSION = YES;
-				CLANG_WARN_INT_CONVERSION = YES;
-				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-				CLANG_WARN_STRICT_PROTOTYPES = YES;
-				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-				CLANG_WARN_UNREACHABLE_CODE = YES;
-				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-				CODE_SIGN_IDENTITY = "iPhone Developer";
-				COPY_PHASE_STRIP = NO;
-				DEBUG_INFORMATION_FORMAT = dwarf;
-				ENABLE_STRICT_OBJC_MSGSEND = YES;
-				ENABLE_TESTABILITY = YES;
-				GCC_C_LANGUAGE_STANDARD = gnu11;
-				GCC_DYNAMIC_NO_PIC = NO;
-				GCC_NO_COMMON_BLOCKS = YES;
-				GCC_OPTIMIZATION_LEVEL = 0;
-				GCC_PREPROCESSOR_DEFINITIONS = (
-					"DEBUG=1",
-					"$(inherited)",
-				);
-				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-				GCC_WARN_UNDECLARED_SELECTOR = YES;
-				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-				GCC_WARN_UNUSED_FUNCTION = YES;
-				GCC_WARN_UNUSED_VARIABLE = YES;
-				IPHONEOS_DEPLOYMENT_TARGET = 11.0;
-				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
-				ONLY_ACTIVE_ARCH = YES;
-				OTHER_CFLAGS = "";
-				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
-				SDKROOT = iphoneos;
-			};
-			name = Debug;
-		};
-		6FE9400D20D592DA008C9FE4 /* Release */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ALWAYS_SEARCH_USER_PATHS = NO;
-				CLANG_ANALYZER_NONNULL = YES;
-				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
-				CLANG_CXX_LIBRARY = "libc++";
-				CLANG_ENABLE_MODULES = YES;
-				CLANG_ENABLE_OBJC_ARC = YES;
-				CLANG_ENABLE_OBJC_WEAK = YES;
-				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-				CLANG_WARN_BOOL_CONVERSION = YES;
-				CLANG_WARN_COMMA = YES;
-				CLANG_WARN_CONSTANT_CONVERSION = YES;
-				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-				CLANG_WARN_EMPTY_BODY = YES;
-				CLANG_WARN_ENUM_CONVERSION = YES;
-				CLANG_WARN_INFINITE_RECURSION = YES;
-				CLANG_WARN_INT_CONVERSION = YES;
-				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-				CLANG_WARN_STRICT_PROTOTYPES = YES;
-				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-				CLANG_WARN_UNREACHABLE_CODE = YES;
-				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-				CODE_SIGN_IDENTITY = "iPhone Developer";
-				COPY_PHASE_STRIP = NO;
-				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
-				ENABLE_NS_ASSERTIONS = NO;
-				ENABLE_STRICT_OBJC_MSGSEND = YES;
-				GCC_C_LANGUAGE_STANDARD = gnu11;
-				GCC_NO_COMMON_BLOCKS = YES;
-				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-				GCC_WARN_UNDECLARED_SELECTOR = YES;
-				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-				GCC_WARN_UNUSED_FUNCTION = YES;
-				GCC_WARN_UNUSED_VARIABLE = YES;
-				IPHONEOS_DEPLOYMENT_TARGET = 11.0;
-				MTL_ENABLE_DEBUG_INFO = NO;
-				OTHER_CFLAGS = "";
-				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
-				SDKROOT = iphoneos;
-				VALIDATE_PRODUCT = YES;
-			};
-			name = Release;
-		};
-		6FE9400F20D592DA008C9FE4 /* Debug */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-				CODE_SIGN_STYLE = Automatic;
-				"HEADER_SEARCH_PATHS[arch=*]" = (
-					$SRCROOT/../../../../../../../,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/eigen,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/gemmlowp,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/neon_2_sse,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/farmhash/src,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/flatbuffers/include,
-				);
-				INFOPLIST_FILE = TFLiteBenchmark/Info.plist;
-				LD_RUNPATH_SEARCH_PATHS = (
-					"$(inherited)",
-					"@executable_path/Frameworks",
-				);
-				"LIBRARY_SEARCH_PATHS[arch=*]" = $SRCROOT/../../../../../../../tensorflow/contrib/lite/gen/lib;
-				PRODUCT_BUNDLE_IDENTIFIER = example.TFLiteBenchmark;
-				PRODUCT_NAME = "$(TARGET_NAME)";
-				TARGETED_DEVICE_FAMILY = "1,2";
-				"USER_HEADER_SEARCH_PATHS[arch=*]" = "";
-			};
-			name = Debug;
-		};
-		6FE9401020D592DA008C9FE4 /* Release */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-				CODE_SIGN_STYLE = Automatic;
-				"HEADER_SEARCH_PATHS[arch=*]" = (
-					$SRCROOT/../../../../../../../,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/eigen,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/gemmlowp,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/neon_2_sse,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/farmhash/src,
-					$SRCROOT/../../../../../../../tensorflow/contrib/lite/downloads/flatbuffers/include,
-				);
-				INFOPLIST_FILE = TFLiteBenchmark/Info.plist;
-				LD_RUNPATH_SEARCH_PATHS = (
-					"$(inherited)",
-					"@executable_path/Frameworks",
-				);
-				"LIBRARY_SEARCH_PATHS[arch=*]" = $SRCROOT/../../../../../../../tensorflow/contrib/lite/gen/lib;
-				PRODUCT_BUNDLE_IDENTIFIER = example.TFLiteBenchmark;
-				PRODUCT_NAME = "$(TARGET_NAME)";
-				TARGETED_DEVICE_FAMILY = "1,2";
-			};
-			name = Release;
-		};
-/* End XCBuildConfiguration section */
-
-/* Begin XCConfigurationList section */
-		6FE93FF320D592D8008C9FE4 /* Build configuration list for PBXProject "TFLiteBenchmark" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				6FE9400C20D592DA008C9FE4 /* Debug */,
-				6FE9400D20D592DA008C9FE4 /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-		6FE9400E20D592DA008C9FE4 /* Build configuration list for PBXNativeTarget "TFLiteBenchmark" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				6FE9400F20D592DA008C9FE4 /* Debug */,
-				6FE9401020D592DA008C9FE4 /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-/* End XCConfigurationList section */
-	};
-	rootObject = 6FE93FF020D592D8008C9FE4 /* Project object */;
-}
diff --git a/tensorflow/contrib/lite/tools/make/Makefile b/tensorflow/contrib/lite/tools/make/Makefile
deleted file mode 100644
index 16012a3fb16398003eb6cc934e6fe91318b2849a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/make/Makefile
+++ /dev/null
@@ -1,225 +0,0 @@
-# Find where we're running from, so we can store generated files here.
-ifeq ($(origin MAKEFILE_DIR), undefined)
-	MAKEFILE_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
-endif
-
-# Try to figure out the host system
-HOST_OS :=
-ifeq ($(OS),Windows_NT)
-	HOST_OS = windows
-else
-	UNAME_S := $(shell uname -s)
-	ifeq ($(UNAME_S),Linux)
-		HOST_OS := linux
-	endif
-	ifeq ($(UNAME_S),Darwin)
-		HOST_OS := osx
-	endif
-endif
-
-HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32; else echo $(shell uname -m); fi)
-
-# Override these on the make command line to target a specific architecture. For example:
-# make -f tensorflow/contrib/lite/Makefile TARGET=rpi TARGET_ARCH=armv7l
-TARGET := $(HOST_OS)
-TARGET_ARCH := $(HOST_ARCH)
-
-INCLUDES := \
--I. \
--I$(MAKEFILE_DIR)/../../../../../ \
--I$(MAKEFILE_DIR)/../../../../../../ \
--I$(MAKEFILE_DIR)/downloads/ \
--I$(MAKEFILE_DIR)/downloads/eigen \
--I$(MAKEFILE_DIR)/downloads/absl \
--I$(MAKEFILE_DIR)/downloads/gemmlowp \
--I$(MAKEFILE_DIR)/downloads/neon_2_sse \
--I$(MAKEFILE_DIR)/downloads/farmhash/src \
--I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
--I$(OBJDIR)
-# This is at the end so any globally-installed frameworks like protobuf don't
-# override local versions in the source tree.
-INCLUDES += -I/usr/local/include
-
-# These are the default libraries needed, but they can be added to or
-# overridden by the platform-specific settings in target makefiles.
-LIBS := \
--lstdc++ \
--lpthread \
--lm \
--lz
-
-# There are no rules for compiling objects for the host system (since we don't
-# generate things like the protobuf compiler that require that), so all of
-# these settings are for the target compiler.
-CXXFLAGS := -O3 -DNDEBUG
-CCFLAGS := ${CXXFLAGS}
-CXXFLAGS += --std=c++11
-CFLAGS :=
-LDOPTS := -L/usr/local/lib
-ARFLAGS := -r
-TARGET_TOOLCHAIN_PREFIX :=
-CC_PREFIX :=
-
-# This library is the main target for this makefile. It will contain a minimal
-# runtime that can be linked in to other programs.
-LIB_NAME := libtensorflow-lite.a
-
-# Benchmark static library and binary
-BENCHMARK_LIB_NAME := benchmark-lib.a
-BENCHMARK_BINARY_NAME := benchmark_model
-
-# A small example program that shows how to link against the library.
-MINIMAL_SRCS := \
-tensorflow/contrib/lite/examples/minimal/minimal.cc
-
-# What sources we want to compile, must be kept in sync with the main Bazel
-# build files.
-
-PROFILER_SRCS := \
-	tensorflow/contrib/lite/profiling/time.cc
-PROFILE_SUMMARIZER_SRCS := \
-	tensorflow/contrib/lite/profiling/profile_summarizer.cc \
-	tensorflow/core/util/stats_calculator.cc
-
-CORE_CC_ALL_SRCS := \
-$(wildcard tensorflow/contrib/lite/*.cc) \
-$(wildcard tensorflow/contrib/lite/*.c) \
-$(wildcard tensorflow/contrib/lite/c/*.c) \
-$(wildcard tensorflow/contrib/lite/core/api/*.cc)
-ifneq ($(BUILD_TYPE),micro)
-CORE_CC_ALL_SRCS += \
-$(wildcard tensorflow/contrib/lite/kernels/*.cc) \
-$(wildcard tensorflow/contrib/lite/kernels/internal/*.cc) \
-$(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.cc) \
-$(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.cc) \
-$(PROFILER_SRCS) \
-$(wildcard tensorflow/contrib/lite/kernels/*.c) \
-$(wildcard tensorflow/contrib/lite/kernels/internal/*.c) \
-$(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.c) \
-$(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.c) \
-$(wildcard tensorflow/contrib/lite/tools/make/downloads/farmhash/src/farmhash.cc) \
-$(wildcard tensorflow/contrib/lite/tools/make/downloads/fft2d/fftsg.c)
-endif
-# Remove any duplicates.
-CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
-CORE_CC_EXCLUDE_SRCS := \
-$(wildcard tensorflow/contrib/lite/*test.cc) \
-$(wildcard tensorflow/contrib/lite/*/*test.cc) \
-$(wildcard tensorflow/contrib/lite/*/*/*test.cc) \
-$(wildcard tensorflow/contrib/lite/*/*/*/*test.cc) \
-$(wildcard tensorflow/contrib/lite/kernels/test_util.cc) \
-$(MINIMAL_SRCS)
-ifeq ($(BUILD_TYPE),micro)
-CORE_CC_EXCLUDE_SRCS += \
-tensorflow/contrib/lite/mmap_allocation.cc \
-tensorflow/contrib/lite/nnapi_delegate.cc
-endif
-# Filter out all the excluded files.
-TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
-
-# Benchmark sources
-BENCHMARK_SRCS_DIR := tensorflow/contrib/lite/tools/benchmark
-BENCHMARK_ALL_SRCS := $(TFLITE_CC_SRCS) \
-	$(wildcard $(BENCHMARK_SRCS_DIR)/*.cc) \
-	$(PROFILE_SUMMARIZER_SRCS)
-
-BENCHMARK_SRCS := $(filter-out \
-	$(wildcard $(BENCHMARK_SRCS_DIR)/*_test.cc), \
-    $(BENCHMARK_ALL_SRCS))
-
-# These target-specific makefiles should modify or replace options like
-# CXXFLAGS or LIBS to work for a specific targetted architecture. All logic
-# based on platforms or architectures should happen within these files, to
-# keep this main makefile focused on the sources and dependencies.
-include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
-
-ALL_SRCS := \
-	$(MINIMAL_SRCS) \
-	$(PROFILER_SRCS) \
-	$(PROFILER_SUMMARY_SRCS) \
-	$(TF_LITE_CC_SRCS) \
-	$(BENCHMARK_SRCS)
-
-# Where compiled objects are stored.
-GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
-OBJDIR := $(GENDIR)obj/
-BINDIR := $(GENDIR)bin/
-LIBDIR := $(GENDIR)lib/
-
-LIB_PATH := $(LIBDIR)$(LIB_NAME)
-BENCHMARK_LIB := $(LIBDIR)$(BENCHMARK_LIB_NAME)
-BENCHMARK_BINARY := $(BINDIR)$(BENCHMARK_BINARY_NAME)
-MINIMAL_BINARY := $(BINDIR)minimal
-
-CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
-CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
-AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
-
-MINIMAL_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MINIMAL_SRCS))))
-
-LIB_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(TF_LITE_CC_SRCS))))
-
-BENCHMARK_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(BENCHMARK_SRCS))))
-
-# For normal manually-created TensorFlow C++ source files.
-$(OBJDIR)%.o: %.cc
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
-# For normal manually-created TensorFlow C++ source files.
-$(OBJDIR)%.o: %.c
-	@mkdir -p $(dir $@)
-	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
-
-# The target that's compiled if there's no command-line arguments.
-all: $(LIB_PATH)  $(MINIMAL_BINARY) $(BENCHMARK_BINARY)
-
-# The target that's compiled for micro-controllers
-micro: $(LIB_PATH)
-
-# Hack for generating schema file bypassing flatbuffer parsing
-tensorflow/contrib/lite/schema/schema_generated.h:
-	@cp -u tensorflow/contrib/lite/schema/schema_generated.h.OPENSOURCE tensorflow/contrib/lite/schema/schema_generated.h
-
-# Gathers together all the objects we've compiled into a single '.a' archive.
-$(LIB_PATH): tensorflow/contrib/lite/schema/schema_generated.h $(LIB_OBJS)
-	@mkdir -p $(dir $@)
-	$(AR) $(ARFLAGS) $(LIB_PATH) $(LIB_OBJS)
-
-$(MINIMAL_BINARY): $(MINIMAL_OBJS) $(LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(MINIMAL_BINARY) $(MINIMAL_OBJS) \
-	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
-
-$(BENCHMARK_LIB) : $(LIB_PATH) $(BENCHMARK_OBJS)
-	@mkdir -p $(dir $@)
-	$(AR) $(ARFLAGS) $(BENCHMARK_LIB) $(LIB_OBJS) $(BENCHMARK_OBJS)
-
-benchmark_lib: $(BENCHMARK_LIB)
-
-$(BENCHMARK_BINARY) : $(BENCHMARK_LIB)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(BENCHMARK_BINARY) \
-	$(LIBFLAGS) $(BENCHMARK_LIB) $(LDFLAGS) $(LIBS)
-
-benchmark: $(BENCHMARK_BINARY)
-
-# Gets rid of all generated files.
-clean:
-	rm -rf $(MAKEFILE_DIR)/gen
-
-# Gets rid of target files only, leaving the host alone. Also leaves the lib
-# directory untouched deliberately, so we can persist multiple architectures
-# across builds for iOS and Android.
-cleantarget:
-	rm -rf $(OBJDIR)
-	rm -rf $(BINDIR)
-
-$(DEPDIR)/%.d: ;
-.PRECIOUS: $(DEPDIR)/%.d
-
--include $(patsubst %,$(DEPDIR)/%.d,$(basename $(ALL_SRCS)))
diff --git a/tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh b/tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh
deleted file mode 100755
index fe056945a652b04d078947f58bfe6ab60aa1f387..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash -x
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../../../.."
-
-# Build library for supported architectures and packs them in a fat binary.
-make_library() {
-    for arch in x86_64 armv7 armv7s arm64
-    do
-        make -f tensorflow/contrib/lite/tools/make/Makefile TARGET=ios TARGET_ARCH=${arch} \
-        -j 8
-    done
-    mkdir -p tensorflow/contrib/lite/tools/make/gen/lib
-    lipo \
-    tensorflow/contrib/lite/tools/make/gen/ios_x86_64/lib/${1} \
-    tensorflow/contrib/lite/tools/make/gen/ios_armv7/lib/${1} \
-    tensorflow/contrib/lite/tools/make/gen/ios_armv7s/lib/${1} \
-    tensorflow/contrib/lite/tools/make/gen/ios_arm64/lib/${1} \
-    -create \
-    -output tensorflow/contrib/lite/tools/make/gen/lib/${1}
-}
-
-make_library libtensorflow-lite.a
-make_library benchmark-lib.a
diff --git a/tensorflow/contrib/lite/tools/make/download_dependencies.sh b/tensorflow/contrib/lite/tools/make/download_dependencies.sh
deleted file mode 100755
index 3570f9a38d3fdc435e5c0caeb04da39c422710f1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/make/download_dependencies.sh
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/bin/bash
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-set -e
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../../../.."
-
-DOWNLOADS_DIR=tensorflow/contrib/lite/tools/make/downloads
-BZL_FILE_PATH=tensorflow/workspace.bzl
-
-# Ensure it is being run from repo root
-if [ ! -f $BZL_FILE_PATH ]; then
-  echo "Could not find ${BZL_FILE_PATH}":
-  echo "Likely you are not running this from the root directory of the repository.";
-  exit 1;
-fi
-
-EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
-GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
-ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
-NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
-FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
-FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz"
-FFT2D_URL="https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz"
-
-# TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
-#                   so work around it by patching the source.
-replace_by_sed() {
-  local regex="${1}"
-  shift
-  # Detect the version of sed by the return value of "--version" flag. GNU-sed
-  # supports "--version" while BSD-sed doesn't.
-  if ! sed --version >/dev/null 2>&1; then
-    # BSD-sed.
-    sed -i '' -e "${regex}" "$@"
-  else
-    # GNU-sed.
-    sed -i -e "${regex}" "$@"
-  fi
-}
-
-download_and_extract() {
-  local usage="Usage: download_and_extract URL DIR"
-  local url="${1:?${usage}}"
-  local dir="${2:?${usage}}"
-  echo "downloading ${url}" >&2
-  mkdir -p "${dir}"
-  if [[ "${url}" == *gz ]]; then
-    curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
-  elif [[ "${url}" == *zip ]]; then
-    tempdir=$(mktemp -d)
-    tempdir2=$(mktemp -d)
-
-    curl -L ${url} > ${tempdir}/zipped.zip
-    unzip ${tempdir}/zipped.zip -d ${tempdir2}
-
-    # If the zip file contains nested directories, extract the files from the
-    # inner directory.
-    if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
-      # unzip has no strip components, so unzip to a temp dir, and move the
-      # files we want from the tempdir to destination.
-      cp -R ${tempdir2}/*/* ${dir}/
-    else
-      cp -R ${tempdir2}/* ${dir}/
-    fi
-    rm -rf ${tempdir2} ${tempdir}
-  fi
-
-  # Delete any potential BUILD files, which would interfere with Bazel builds.
-  find "${dir}" -type f -name '*BUILD' -delete
-}
-
-download_and_extract "${EIGEN_URL}" "${DOWNLOADS_DIR}/eigen"
-download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
-download_and_extract "${GOOGLETEST_URL}" "${DOWNLOADS_DIR}/googletest"
-download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
-download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
-download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
-download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
-download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
-
-replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
-  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
-replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#static uint32x2_t p2ui_CONJ_XOR;// = vld1_u32( conj_XOR_DATA ); - Removed by scripts#' \
-  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
-replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \
-  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
-
-echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/lite/tools/optimize/BUILD b/tensorflow/contrib/lite/tools/optimize/BUILD
deleted file mode 100644
index 51ccaedc23d0abfda83295879b007f2479d0c571..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/tools/optimize/BUILD
+++ /dev/null
@@ -1,25 +0,0 @@
-# TODO(suharshs): Write quantize_weights tests that use small exportable files.
-# Then we can remove this file.
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
-
-cc_library(
-    name = "quantize_weights",
-    srcs = ["quantize_weights.cc"],
-    hdrs = ["quantize_weights.h"],
-    deps = [
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
-        "//tensorflow/contrib/lite/schema:schema_fbs",
-        "//tensorflow/core:tflite_portable_logging",
-        "@com_google_absl//absl/memory",
-        "@flatbuffers",
-    ],
-)
diff --git a/tensorflow/contrib/lite/util.cc b/tensorflow/contrib/lite/util.cc
deleted file mode 100644
index 6aa35b52277910aea7ad0ea8753c2bad095b1f1f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/util.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/contrib/lite/util.h"
-
-#include <cstring>
-
-namespace tflite {
-
-bool IsFlexOp(const char* custom_name) {
-  return custom_name && strncmp(custom_name, kFlexCustomCodePrefix,
-                                strlen(kFlexCustomCodePrefix)) == 0;
-}
-
-TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input) {
-  return ConvertArrayToTfLiteIntArray(input.size(), input.data());
-}
-
-TfLiteIntArray* ConvertArrayToTfLiteIntArray(const int rank, const int* dims) {
-  TfLiteIntArray* output = TfLiteIntArrayCreate(rank);
-  for (size_t i = 0; i < rank; i++) {
-    output->data[i] = dims[i];
-  }
-  return output;
-}
-
-bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
-                                 const int* b) {
-  if (!a) return false;
-  if (a->size != b_size) return false;
-  for (int i = 0; i < a->size; ++i) {
-    if (a->data[i] != b[i]) return false;
-  }
-  return true;
-}
-
-size_t CombineHashes(std::initializer_list<size_t> hashes) {
-  size_t result = 0;
-  // Hash combiner used by TensorFlow core.
-  for (size_t hash : hashes) {
-    result = result ^
-             (hash + 0x9e3779b97f4a7800ULL + (result << 10) + (result >> 4));
-  }
-  return result;
-}
-
-}  // namespace tflite
diff --git a/tensorflow/contrib/lite/util.h b/tensorflow/contrib/lite/util.h
deleted file mode 100644
index 31292a6f8131f78f0939b4ebbcb46dfd9c3312df..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/util.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This file provides general C++ utility functions in TFLite.
-// For example: Converting between `TfLiteIntArray`, `std::vector` and
-// Flatbuffer vectors. These functions can't live in `context.h` since it's pure
-// C.
-
-#ifndef TENSORFLOW_CONTRIB_LITE_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_UTIL_H_
-
-#include <vector>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-
-namespace tflite {
-
-// The prefix of Flex op custom code.
-// This will be matched agains the `custom_code` field in `OperatorCode`
-// Flatbuffer Table.
-// WARNING: This is an experimental API and subject to change.
-constexpr char kFlexCustomCodePrefix[] = "Flex";
-
-// Checks whether the prefix of the custom name indicates the operation is an
-// Flex operation.
-bool IsFlexOp(const char* custom_name);
-
-// Converts a `std::vector` to a `TfLiteIntArray`. The caller takes ownership
-// of the returned pointer.
-TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input);
-
-// Converts an array (of the given size) to a `TfLiteIntArray`. The caller
-// takes ownership of the returned pointer, and must make sure 'dims' has at
-// least 'rank' elemnts.
-TfLiteIntArray* ConvertArrayToTfLiteIntArray(const int rank, const int* dims);
-
-// Checks whether a `TfLiteIntArray` and an int array have matching elements.
-// The caller must guarantee that 'b' has at least 'b_size' elements.
-bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
-                                 const int* b);
-
-size_t CombineHashes(std::initializer_list<size_t> hashes);
-
-}  // namespace tflite
-
-#endif  // TENSORFLOW_CONTRIB_LITE_UTIL_H_
diff --git a/tensorflow/contrib/lite/util_test.cc b/tensorflow/contrib/lite/util_test.cc
deleted file mode 100644
index 25f3aded7140ff1075a52d5d30e270ee049c7c88..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/util_test.cc
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <vector>
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/util.h"
-
-namespace tflite {
-namespace {
-
-TEST(ConvertVectorToTfLiteIntArray, TestWithVector) {
-  std::vector<int> input = {1, 2};
-  TfLiteIntArray* output = ConvertVectorToTfLiteIntArray(input);
-  ASSERT_NE(output, nullptr);
-  EXPECT_EQ(output->size, 2);
-  EXPECT_EQ(output->data[0], 1);
-  EXPECT_EQ(output->data[1], 2);
-  TfLiteIntArrayFree(output);
-}
-
-TEST(ConvertVectorToTfLiteIntArray, TestWithEmptyVector) {
-  std::vector<int> input;
-  TfLiteIntArray* output = ConvertVectorToTfLiteIntArray(input);
-  ASSERT_NE(output, nullptr);
-  EXPECT_EQ(output->size, 0);
-  TfLiteIntArrayFree(output);
-}
-
-TEST(UtilTest, IsFlexOp) {
-  EXPECT_TRUE(IsFlexOp("Flex"));
-  EXPECT_TRUE(IsFlexOp("FlexOp"));
-  EXPECT_FALSE(IsFlexOp("flex"));
-  EXPECT_FALSE(IsFlexOp("Fle"));
-  EXPECT_FALSE(IsFlexOp("OpFlex"));
-  EXPECT_FALSE(IsFlexOp(nullptr));
-  EXPECT_FALSE(IsFlexOp(""));
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index f83765a48d8d3adaec84460e32c34aa68a35ab09..e52fb5ab1431e086f99b4033a6216636a83bad79 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -42,7 +42,6 @@ from tensorflow.python.ops.lookup_ops import TextFileIndex
 from tensorflow.python.ops.lookup_ops import TextFileInitializer
 from tensorflow.python.ops.lookup_ops import TextFileStringTableInitializer
 # pylint: enable=unused-import
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saver import BaseSaverBuilder
 from tensorflow.python.util.deprecation import deprecated
 
@@ -92,7 +91,7 @@ def index_table_from_tensor(mapping,
   The bucket ID range is `[mapping size, mapping size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `tf.tables_initializer.run()` or `table.initializer.run()` once.
 
   Elements in `mapping` cannot have duplicates, otherwise when executing the
   table initializer op, it will throw a `FailedPreconditionError`.
@@ -203,7 +202,7 @@ def index_to_string_table_from_tensor(mapping, default_value="UNK", name=None):
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `tf.tables_initializer.run()` or `table.initializer.run()` once.
 
   Elements in `mapping` cannot have duplicates, otherwise when executing the
   table initializer op, it will throw a `FailedPreconditionError`.
@@ -289,11 +288,11 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
   return table.lookup(tensor)
 
 
-class MutableHashTable(LookupInterface, checkpointable.CheckpointableBase):
+class MutableHashTable(LookupInterface):
   """A generic mutable hash table implementation.
 
-  Data can be inserted by calling the insert method. It does not support
-  initialization via the init method.
+  Data can be inserted by calling the insert method and removed by calling the
+  remove method. It does not support initialization via the init method.
 
   Example usage:
 
@@ -339,43 +338,56 @@ class MutableHashTable(LookupInterface, checkpointable.CheckpointableBase):
     self._default_value = ops.convert_to_tensor(default_value,
                                                 dtype=value_dtype)
     self._value_shape = self._default_value.get_shape()
+    self._checkpoint = checkpoint
+    self._key_dtype = key_dtype
+    self._value_dtype = value_dtype
+    self._name = name
 
-    executing_eagerly = context.executing_eagerly()
-    if executing_eagerly and shared_name is None:
+    if context.executing_eagerly() and shared_name is None:
       # TODO(allenl): This will leak memory due to kernel caching by the
       # shared_name attribute value (but is better than the alternative of
       # sharing everything by default when executing eagerly; hopefully creating
       # tables in a loop is uncommon).
       shared_name = "table_%d" % (ops.uid(),)
+    self._shared_name = shared_name
+    super(MutableHashTable, self).__init__(key_dtype, value_dtype)
+
+    self._resource_handle = self.create_resource()
+    if checkpoint:
+      saveable = MutableHashTable._Saveable(self, name)
+      if not context.executing_eagerly():
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+
+  def create_resource(self):
     # The table must be shared if checkpointing is requested for multi-worker
     # training to work correctly. Use the node name if no shared_name has been
     # explicitly specified.
-    use_node_name_sharing = checkpoint and shared_name is None
+    use_node_name_sharing = self._checkpoint and self._shared_name is None
     if self._default_value.get_shape().ndims == 0:
-      self._table_ref = gen_lookup_ops.mutable_hash_table_v2(
-          shared_name=shared_name,
+      table_ref = gen_lookup_ops.mutable_hash_table_v2(
+          shared_name=self._shared_name,
           use_node_name_sharing=use_node_name_sharing,
-          key_dtype=key_dtype,
-          value_dtype=value_dtype,
-          name=name)
+          key_dtype=self._key_dtype,
+          value_dtype=self._value_dtype,
+          name=self._name)
     else:
-      self._table_ref = gen_lookup_ops.mutable_hash_table_of_tensors_v2(
-          shared_name=shared_name,
+      table_ref = gen_lookup_ops.mutable_hash_table_of_tensors_v2(
+          shared_name=self._shared_name,
           use_node_name_sharing=use_node_name_sharing,
-          key_dtype=key_dtype,
-          value_dtype=value_dtype,
+          key_dtype=self._key_dtype,
+          value_dtype=self._value_dtype,
           value_shape=self._default_value.get_shape(),
-          name=name)
-    if executing_eagerly:
-      op_name = None
+          name=self._name)
+
+    if context.executing_eagerly():
+      self._table_name = None
     else:
-      op_name = self._table_ref.op.name.split("/")[-1]
-    super(MutableHashTable, self).__init__(key_dtype, value_dtype,
-                                           op_name)
+      self._table_name = table_ref.op.name.split("/")[-1]
+    return table_ref
 
-    if checkpoint:
-      saveable = MutableHashTable._Saveable(self, name)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+  @property
+  def name(self):
+    return self._table_name
 
   def size(self, name=None):
     """Compute the number of elements in this table.
@@ -386,10 +398,40 @@ class MutableHashTable(LookupInterface, checkpointable.CheckpointableBase):
     Returns:
       A scalar tensor containing the number of elements in this table.
     """
-    with ops.name_scope(name, "%s_Size" % self._name,
-                        [self._table_ref]) as name:
-      with ops.colocate_with(self._table_ref):
-        return gen_lookup_ops.lookup_table_size_v2(self._table_ref, name=name)
+    with ops.name_scope(name, "%s_Size" % self.name,
+                        [self.resource_handle]) as name:
+      with ops.colocate_with(self.resource_handle):
+        return gen_lookup_ops.lookup_table_size_v2(
+            self.resource_handle, name=name)
+
+  def remove(self, keys, name=None):
+    """Removes `keys` and its associated values from the table.
+
+    If a key is not present in the table, it is silently ignored.
+
+    Args:
+      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
+        key type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+
+    with ops.name_scope(
+        name, "%s_lookup_table_remove" % self.name,
+        (self.resource_handle, keys, self._default_value)) as name:
+      # pylint: disable=protected-access
+      op = gen_lookup_ops.lookup_table_remove_v2(
+          self.resource_handle, keys, name=name)
+
+    return op
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -408,12 +450,13 @@ class MutableHashTable(LookupInterface, checkpointable.CheckpointableBase):
     Raises:
       TypeError: when `keys` do not match the table data types.
     """
-    with ops.name_scope(name, "%s_lookup_table_find" % self._name,
-                        (self._table_ref, keys, self._default_value)) as name:
+    with ops.name_scope(
+        name, "%s_lookup_table_find" % self.name,
+        (self.resource_handle, keys, self._default_value)) as name:
       keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
-      with ops.colocate_with(self._table_ref):
+      with ops.colocate_with(self.resource_handle):
         values = gen_lookup_ops.lookup_table_find_v2(
-            self._table_ref, keys, self._default_value, name=name)
+            self.resource_handle, keys, self._default_value, name=name)
     return values
 
   def insert(self, keys, values, name=None):
@@ -433,14 +476,14 @@ class MutableHashTable(LookupInterface, checkpointable.CheckpointableBase):
       TypeError: when `keys` or `values` doesn't match the table data
         types.
     """
-    with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
-                        [self._table_ref, keys, values]) as name:
+    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
+                        [self.resource_handle, keys, values]) as name:
       keys = ops.convert_to_tensor(keys, self._key_dtype, name="keys")
       values = ops.convert_to_tensor(values, self._value_dtype, name="values")
-      with ops.colocate_with(self._table_ref):
+      with ops.colocate_with(self.resource_handle):
         # pylint: disable=protected-access
         op = gen_lookup_ops.lookup_table_insert_v2(
-            self._table_ref, keys, values, name=name)
+            self.resource_handle, keys, values, name=name)
     return op
 
   def export(self, name=None):
@@ -453,11 +496,11 @@ class MutableHashTable(LookupInterface, checkpointable.CheckpointableBase):
       A pair of tensors with the first tensor containing all keys and the
         second tensors containing all values in the table.
     """
-    with ops.name_scope(name, "%s_lookup_table_export_values" % self._name,
-                        [self._table_ref]) as name:
-      with ops.colocate_with(self._table_ref):
+    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
+                        [self.resource_handle]) as name:
+      with ops.colocate_with(self.resource_handle):
         exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self._table_ref, self._key_dtype, self._value_dtype, name=name)
+            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
     return exported_keys, exported_values
 
   def _gather_saveables_for_checkpoint(self):
@@ -479,19 +522,19 @@ class MutableHashTable(LookupInterface, checkpointable.CheckpointableBase):
     def restore(self, restored_tensors, restored_shapes):
       del restored_shapes  # unused
       # pylint: disable=protected-access
-      with ops.colocate_with(self.op._table_ref):
+      with ops.colocate_with(self.op.resource_handle):
         return gen_lookup_ops.lookup_table_import_v2(
-            self.op._table_ref, restored_tensors[0], restored_tensors[1])
+            self.op.resource_handle, restored_tensors[0], restored_tensors[1])
 
 
-class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
+class MutableDenseHashTable(LookupInterface):
   """A generic mutable hash table implementation using tensors as backing store.
 
-  Data can be inserted by calling the insert method. It does not support
-  initialization via the init method.
+  Data can be inserted by calling the insert method and removed by calling the
+  remove method. It does not support initialization via the init method.
 
   It uses "open addressing" with quadratic reprobing to resolve collisions.
-  Compared to `MutableHashTable` the insert and lookup operations in a
+  Compared to `MutableHashTable` the insert, remove and lookup operations in a
   `MutableDenseHashTable` are typically faster, but memory usage can be higher.
   However, `MutableDenseHashTable` does not require additional memory for
   temporary tensors created during checkpointing and restore operations.
@@ -502,7 +545,9 @@ class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
   table = tf.contrib.lookup.MutableDenseHashTable(key_dtype=tf.int64,
                                                   value_dtype=tf.int64,
                                                   default_value=-1,
-                                                  empty_key=0)
+                                                  empty_key=0,
+                                                  deleted_key=-1)
+
   sess.run(table.insert(keys, values))
   out = table.lookup(query_keys)
   print(out.eval())
@@ -516,6 +561,7 @@ class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
                value_dtype,
                default_value,
                empty_key,
+               deleted_key,
                initial_num_buckets=None,
                shared_name=None,
                name="MutableDenseHashTable",
@@ -530,7 +576,7 @@ class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
       value_dtype: the type of the value tensors.
       default_value: The value to use if a key is missing in the table.
       empty_key: the key to use to represent empty buckets internally. Must not
-        be used in insert or lookup operations.
+        be used in insert, remove or lookup operations.
       initial_num_buckets: the initial number of buckets.
       shared_name: If non-empty, this table will be shared under
         the given name across multiple sessions.
@@ -538,48 +584,67 @@ class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
       checkpoint: if True, the contents of the table are saved to and restored
         from checkpoints. If `shared_name` is empty for a checkpointed table, it
         is shared using the table node name.
+      deleted_key: the key to use to represent deleted buckets internally. Must
+        not be used in insert, remove or lookup operations and be different from
+        the empty_key.
 
     Returns:
-      A `MutableHashTable` object.
+      A `MutableDenseHashTable` object.
 
     Raises:
       ValueError: If checkpoint is True and no name was specified.
     """
     self._default_value = ops.convert_to_tensor(
         default_value, dtype=value_dtype, name="default_value")
+    self._key_dtype = key_dtype
+    self._value_dtype = value_dtype
+    self._initial_num_buckets = initial_num_buckets
     self._value_shape = self._default_value.get_shape()
+    self._checkpoint = checkpoint
+    self._name = name
 
-    # The table must be shared if checkpointing is requested for multi-worker
-    # training to work correctly. Use the node name if no shared_name has been
-    # explicitly specified.
-    use_node_name_sharing = checkpoint and shared_name is None
-    empty_key = ops.convert_to_tensor(
+    self._empty_key = ops.convert_to_tensor(
         empty_key, dtype=key_dtype, name="empty_key")
-    executing_eagerly = context.executing_eagerly()
-    if executing_eagerly and shared_name is None:
+    self._deleted_key = ops.convert_to_tensor(
+        deleted_key, dtype=key_dtype, name="deleted_key")
+    if context.executing_eagerly() and shared_name is None:
       # TODO(allenl): This will leak memory due to kernel caching by the
       # shared_name attribute value (but is better than the alternative of
       # sharing everything by default when executing eagerly; hopefully creating
       # tables in a loop is uncommon).
       shared_name = "table_%d" % (ops.uid(),)
-    self._table_ref = gen_lookup_ops.mutable_dense_hash_table_v2(
-        empty_key=empty_key,
-        shared_name=shared_name,
+    self._shared_name = shared_name
+    super(MutableDenseHashTable, self).__init__(key_dtype, value_dtype)
+
+    self._resource_handle = self.create_resource()
+    if checkpoint:
+      saveable = MutableDenseHashTable._Saveable(self, name)
+      if not context.executing_eagerly():
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+
+  def create_resource(self):
+    # The table must be shared if checkpointing is requested for multi-worker
+    # training to work correctly. Use the node name if no shared_name has been
+    # explicitly specified.
+    use_node_name_sharing = self._checkpoint and self._shared_name is None
+    table_ref = gen_lookup_ops.mutable_dense_hash_table_v2(
+        empty_key=self._empty_key,
+        deleted_key=self._deleted_key,
+        shared_name=self._shared_name,
         use_node_name_sharing=use_node_name_sharing,
-        value_dtype=value_dtype,
+        value_dtype=self._value_dtype,
         value_shape=self._value_shape,
-        initial_num_buckets=initial_num_buckets,
-        name=name)
-    if executing_eagerly:
-      op_name = None
+        initial_num_buckets=self._initial_num_buckets,
+        name=self._name)
+    if context.executing_eagerly():
+      self._table_name = None
     else:
-      op_name = self._table_ref.op.name.split("/")[-1]
-    super(MutableDenseHashTable, self).__init__(
-        key_dtype, value_dtype, op_name)
+      self._table_name = table_ref.op.name.split("/")[-1]
+    return table_ref
 
-    if checkpoint:
-      saveable = MutableDenseHashTable._Saveable(self, name)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+  @property
+  def name(self):
+    return self._table_name
 
   def size(self, name=None):
     """Compute the number of elements in this table.
@@ -590,10 +655,11 @@ class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
     Returns:
       A scalar tensor containing the number of elements in this table.
     """
-    with ops.name_scope(name, "%s_Size" % self._name,
-                        [self._table_ref]) as name:
-      with ops.colocate_with(self._table_ref):
-        return gen_lookup_ops.lookup_table_size_v2(self._table_ref, name=name)
+    with ops.name_scope(name, "%s_Size" % self.name,
+                        [self.resource_handle]) as name:
+      with ops.colocate_with(self.resource_handle):
+        return gen_lookup_ops.lookup_table_size_v2(
+            self.resource_handle, name=name)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -612,12 +678,12 @@ class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
     Raises:
       TypeError: when `keys` do not match the table data types.
     """
-    with ops.name_scope(name, "%s_lookup_table_find" % self._name,
-                        [self._table_ref, keys]) as name:
+    with ops.name_scope(name, "%s_lookup_table_find" % self.name,
+                        [self.resource_handle, keys]) as name:
       keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
-      with ops.colocate_with(self._table_ref):
+      with ops.colocate_with(self.resource_handle):
         values = gen_lookup_ops.lookup_table_find_v2(
-            self._table_ref, keys, self._default_value, name=name)
+            self.resource_handle, keys, self._default_value, name=name)
 
     return values
 
@@ -638,16 +704,45 @@ class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
       TypeError: when `keys` or `values` doesn't match the table data
         types.
     """
-    with ops.name_scope(name, "%s_lookup_table_insert" % self._name,
-                        [self._table_ref, keys, values]) as name:
+    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
+                        [self.resource_handle, keys, values]) as name:
       keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
       values = ops.convert_to_tensor(
           values, dtype=self._value_dtype, name="values")
-      with ops.colocate_with(self._table_ref):
+      with ops.colocate_with(self.resource_handle):
         op = gen_lookup_ops.lookup_table_insert_v2(
-            self._table_ref, keys, values, name=name)
+            self.resource_handle, keys, values, name=name)
       return op
 
+  def remove(self, keys, name=None):
+    """Removes `keys` and its associated values from the table.
+
+    If a key is not present in the table, it is silently ignored.
+
+    Args:
+      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
+        key type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+
+    with ops.name_scope(
+        name, "%s_lookup_table_remove" % self.name,
+        (self.resource_handle, keys, self._default_value)) as name:
+      # pylint: disable=protected-access
+      op = gen_lookup_ops.lookup_table_remove_v2(
+          self.resource_handle, keys, name=name)
+
+    return op
+
   def export(self, name=None):
     """Returns tensors of all keys and values in the table.
 
@@ -658,11 +753,11 @@ class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
       A pair of tensors with the first tensor containing all keys and the
         second tensors containing all values in the table.
     """
-    with ops.name_scope(name, "%s_lookup_table_export_values" % self._name,
-                        [self._table_ref]) as name:
-      with ops.colocate_with(self._table_ref):
+    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
+                        [self.resource_handle]) as name:
+      with ops.colocate_with(self.resource_handle):
         exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self._table_ref, self._key_dtype, self._value_dtype, name=name)
+            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
 
     return exported_keys, exported_values
 
@@ -686,6 +781,6 @@ class MutableDenseHashTable(LookupInterface, checkpointable.CheckpointableBase):
     def restore(self, restored_tensors, restored_shapes):
       del restored_shapes  # unused
       # pylint: disable=protected-access
-      with ops.colocate_with(self.op._table_ref):
+      with ops.colocate_with(self.op.resource_handle):
         return gen_lookup_ops.lookup_table_import_v2(
-            self.op._table_ref, restored_tensors[0], restored_tensors[1])
+            self.op.resource_handle, restored_tensors[0], restored_tensors[1])
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 9e9345e875648f1de927e5591a6e2a8094856921..9b2c2dd87cc8a92fbb6b45504939be3788b60839 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -25,6 +25,7 @@ import six
 from tensorflow.contrib import lookup
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import counter
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -50,7 +51,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       self.assertAllEqual(3, table.size().eval())
 
@@ -74,7 +75,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       self.assertAllEqual(3, table.size().eval())
 
@@ -94,7 +95,7 @@ class HashTableOpTest(test.TestCase):
           lookup.KeyValueTensorInitializer(
               keys, values, value_dtype=dtypes.int64),
           default_val)
-      table.init.run()
+      table.initializer.run()
 
       self.assertAllEqual(3, table.size().eval())
 
@@ -111,7 +112,7 @@ class HashTableOpTest(test.TestCase):
       values = np.array([0, 1, 2], dtype=np.int64)
       table = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       self.assertAllEqual(3, table.size().eval())
 
@@ -156,7 +157,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -171,7 +172,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       sp_indices = [[0, 0], [0, 1], [1, 0]]
       sp_shape = [2, 2]
@@ -194,7 +195,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       # Ref types do not produce a lookup signature mismatch.
       input_string_ref = variables.Variable("brain")
@@ -238,10 +239,10 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       with self.assertRaisesOpError("Table already initialized"):
-        table.init.run()
+        table.initializer.run()
 
   def testInitializationWithInvalidDimensions(self):
     with self.cached_session():
@@ -273,13 +274,13 @@ class HashTableOpTest(test.TestCase):
 
     # Init the table in the first session.
     with session1:
-      table.init.run()
+      table.initializer.run()
       self.assertAllEqual(3, table.size().eval())
 
     # Init the table in the second session and verify that we do not get a
     # "Table already initialized" error.
     with session2:
-      table.init.run()
+      table.initializer.run()
       self.assertAllEqual(3, table.size().eval())
 
   def testHashTableInt32String(self):
@@ -289,7 +290,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant(["brain", "salad", "surgery"])
       table = lookup.HashTable(
           lookup.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       input_tensor = constant_op.constant([0, 1, -1])
       output = table.lookup(input_tensor)
@@ -303,13 +304,17 @@ class MutableHashTableOpTest(test.TestCase):
   def testMutableHashTable(self):
     with self.cached_session():
       default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
                                       default_val)
       self.assertAllEqual(0, table.size().eval())
 
       table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+
+      remove_string = constant_op.constant(["tarkus", "tank"])
+      table.remove(remove_string).run()
       self.assertAllEqual(3, table.size().eval())
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
@@ -472,13 +477,18 @@ class MutableHashTableOpTest(test.TestCase):
   def testMutableHashTableOfTensors(self):
     with self.cached_session():
       default_val = constant_op.constant([-1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
+      values = constant_op.constant([[0, 1], [2, 3], [4, 5], [6, 7]],
+                                    dtypes.int64)
       table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
                                       default_val)
       self.assertAllEqual(0, table.size().eval())
 
       table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+
+      remove_string = constant_op.constant(["tarkus", "tank"])
+      table.remove(remove_string).run()
       self.assertAllEqual(3, table.size().eval())
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
@@ -624,6 +634,26 @@ class MutableHashTableOpTest(test.TestCase):
       result = output.eval()
       self.assertAllEqual([0, 1, 3, -1], result)
 
+  def testMutableHashTableRemoveHighRank(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]])
+      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
+      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
+
+      table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+
+      remove_string = constant_op.constant(["salad", "tarkus"])
+      table.remove(remove_string).run()
+      self.assertAllEqual(3, table.size().eval())
+
+      input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"])
+      output = table.lookup(input_string)
+
+      result = output.eval()
+      self.assertAllEqual([0, -1, 3, -1], result)
+
   def testMutableHashTableOfTensorsFindHighRank(self):
     with self.cached_session():
       default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
@@ -645,6 +675,30 @@ class MutableHashTableOpTest(test.TestCase):
       self.assertAllEqual(
           [[[0, 1, 2], [2, 3, 4]], [[-1, -1, -1], [-1, -1, -1]]], result)
 
+  def testMutableHashTableOfTensorsRemoveHighRank(self):
+    with self.test_session():
+      default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
+                                    dtypes.int64)
+      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
+
+      table.insert(keys, values).run()
+      self.assertAllEqual(3, table.size().eval())
+
+      remove_string = constant_op.constant([["brain", "tank"]])
+      table.remove(remove_string).run()
+      self.assertAllEqual(2, table.size().eval())
+
+      input_string = constant_op.constant([["brain", "salad"],
+                                           ["surgery", "tank"]])
+      output = table.lookup(input_string)
+      self.assertAllEqual([2, 2, 3], output.get_shape())
+
+      result = output.eval()
+      self.assertAllEqual(
+          [[[-1, -1, -1], [2, 3, 4]], [[4, 5, 6], [-1, -1, -1]]], result)
+
   def testMultipleMutableHashTables(self):
     with self.cached_session() as sess:
       default_val = -1
@@ -792,13 +846,22 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
   def testBasic(self):
     with self.cached_session():
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
+
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       table = lookup.MutableDenseHashTable(
-          dtypes.int64, dtypes.int64, default_value=-1, empty_key=0)
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1)
       self.assertAllEqual(0, table.size().eval())
 
       table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+
+      remove_string = constant_op.constant([12, 15], dtypes.int64)
+      table.remove(remove_string).run()
       self.assertAllEqual(3, table.size().eval())
 
       input_string = constant_op.constant([11, 12, 15], dtypes.int64)
@@ -806,17 +869,26 @@ class MutableDenseHashTableOpTest(test.TestCase):
       self.assertAllEqual([3], output.get_shape())
 
       result = output.eval()
-      self.assertAllEqual([0, 1, -1], result)
+      self.assertAllEqual([0, -1, -1], result)
 
   def testBasicBool(self):
     with self.cached_session():
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([True, True, True], dtypes.bool)
+
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([True, True, True, True], dtypes.bool)
       table = lookup.MutableDenseHashTable(
-          dtypes.int64, dtypes.bool, default_value=False, empty_key=0)
+          dtypes.int64,
+          dtypes.bool,
+          default_value=False,
+          empty_key=0,
+          deleted_key=-1)
       self.assertAllEqual(0, table.size().eval())
 
       table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+
+      remove_string = constant_op.constant([11, 15], dtypes.int64)
+      table.remove(remove_string).run()
       self.assertAllEqual(3, table.size().eval())
 
       input_string = constant_op.constant([11, 12, 15], dtypes.int64)
@@ -824,14 +896,30 @@ class MutableDenseHashTableOpTest(test.TestCase):
       self.assertAllEqual([3], output.get_shape())
 
       result = output.eval()
-      self.assertAllEqual([True, True, False], result)
+      self.assertAllEqual([False, True, False], result)
+
+  def testSameEmptyAndDeletedKey(self):
+    with self.cached_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "deleted_key"):
+        table = lookup.MutableDenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=42,
+            deleted_key=42)
+        self.assertAllEqual(0, table.size().eval())
 
   def testLookupUnknownShape(self):
     with self.cached_session():
       keys = constant_op.constant([11, 12, 13], dtypes.int64)
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup.MutableDenseHashTable(
-          dtypes.int64, dtypes.int64, default_value=-1, empty_key=0)
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1)
 
       table.insert(keys, values).run()
       self.assertAllEqual(3, table.size().eval())
@@ -844,45 +932,60 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
   def testMapStringToFloat(self):
     with self.cached_session():
-      keys = constant_op.constant(["a", "b", "c"], dtypes.string)
-      values = constant_op.constant([0.0, 1.1, 2.2], dtypes.float32)
+
+      keys = constant_op.constant(["a", "b", "c", "d"], dtypes.string)
+      values = constant_op.constant([0.0, 1.1, 2.2, 3.3], dtypes.float32)
       default_value = constant_op.constant(-1.5, dtypes.float32)
       table = lookup.MutableDenseHashTable(
           dtypes.string,
           dtypes.float32,
           default_value=default_value,
-          empty_key="")
+          empty_key="",
+          deleted_key="$")
       self.assertAllEqual(0, table.size().eval())
 
       table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+
+      remove_string = constant_op.constant(["b", "e"])
+      table.remove(remove_string).run()
       self.assertAllEqual(3, table.size().eval())
 
-      input_string = constant_op.constant(["a", "b", "d"], dtypes.string)
+      input_string = constant_op.constant(["a", "b", "d", "e"], dtypes.string)
       output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
+      self.assertAllEqual([4], output.get_shape())
 
       result = output.eval()
-      self.assertAllClose([0, 1.1, -1.5], result)
+      self.assertAllClose([0, -1.5, 3.3, -1.5], result)
 
   def testMapInt64ToFloat(self):
     for float_dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
-        keys = constant_op.constant([11, 12, 13], dtypes.int64)
-        values = constant_op.constant([0.0, 1.1, 2.2], float_dtype)
+
+        keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+        values = constant_op.constant([0.0, 1.1, 2.2, 3.3], float_dtype)
         default_value = constant_op.constant(-1.5, float_dtype)
         table = lookup.MutableDenseHashTable(
-            dtypes.int64, float_dtype, default_value=default_value, empty_key=0)
+            dtypes.int64,
+            float_dtype,
+            default_value=default_value,
+            empty_key=0,
+            deleted_key=-1)
         self.assertAllEqual(0, table.size().eval())
 
         table.insert(keys, values).run()
+        self.assertAllEqual(4, table.size().eval())
+
+        remove_string = constant_op.constant([12, 15], dtypes.int64)
+        table.remove(remove_string).run()
         self.assertAllEqual(3, table.size().eval())
 
-        input_string = constant_op.constant([11, 12, 15], dtypes.int64)
+        input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
         output = table.lookup(input_string)
-        self.assertAllEqual([3], output.get_shape())
+        self.assertAllEqual([4], output.get_shape())
 
         result = output.eval()
-        self.assertAllClose([0, 1.1, -1.5], result)
+        self.assertAllClose([0, -1.5, 3.3, -1.5], result)
 
   def testVectorValues(self):
     with self.cached_session():
@@ -895,6 +998,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
           dtypes.int64,
           default_value=default_value,
           empty_key=0,
+          deleted_key=-1,
           initial_num_buckets=4)
       self.assertAllEqual(0, table.size().eval())
 
@@ -908,26 +1012,35 @@ class MutableDenseHashTableOpTest(test.TestCase):
       self.assertAllEqual(4, table.size().eval())
       self.assertAllEqual(8, len(table.export()[0].eval()))
 
-      input_string = constant_op.constant([11, 12, 15], dtypes.int64)
+      remove_string = constant_op.constant([12, 16], dtypes.int64)
+      table.remove(remove_string).run()
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(8, len(table.export()[0].eval()))
+
+      input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
       output = table.lookup(input_string)
-      self.assertAllEqual(
-          [3, 4], output.shape, msg="Saw shape: %s" % output.shape)
+      self.assertAllEqual([4, 4],
+                          output.shape,
+                          msg="Saw shape: %s" % output.shape)
 
       result = output.eval()
-      self.assertAllEqual([[0, 1, 2, 3], [3, 4, 5, 6], [-1, -2, -3, -4]],
-                          result)
+      self.assertAllEqual(
+          [[0, 1, 2, 3], [-1, -2, -3, -4], [2, 3, 4, 5], [-1, -2, -3, -4]],
+          result)
 
   def testVectorKeys(self):
     with self.cached_session():
       keys = constant_op.constant([[0, 1], [1, 2], [1, 3]], dtypes.int64)
       values = constant_op.constant([10, 11, 12], dtypes.int64)
       empty_key = constant_op.constant([0, 3], dtypes.int64)
+      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
       default_value = constant_op.constant(-1, dtypes.int64)
       table = lookup.MutableDenseHashTable(
           dtypes.int64,
           dtypes.int64,
           default_value=default_value,
           empty_key=empty_key,
+          deleted_key=deleted_key,
           initial_num_buckets=8)
       self.assertAllEqual(0, table.size().eval())
 
@@ -940,13 +1053,18 @@ class MutableDenseHashTableOpTest(test.TestCase):
       self.assertAllEqual(4, table.size().eval())
       self.assertAllEqual(8, len(table.export()[0].eval()))
 
-      input_string = constant_op.constant([[0, 1], [1, 2], [0, 2]],
+      remove_string = constant_op.constant([[1, 2], [7, 8]], dtypes.int64)
+      table.remove(remove_string).run()
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(8, len(table.export()[0].eval()))
+
+      input_string = constant_op.constant([[0, 1], [1, 2], [1, 3], [0, 2]],
                                           dtypes.int64)
       output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
+      self.assertAllEqual([4], output.get_shape())
 
       result = output.eval()
-      self.assertAllEqual([10, 11, -1], result)
+      self.assertAllEqual([10, -1, 12, -1], result)
 
   def testResize(self):
     with self.cached_session():
@@ -957,6 +1075,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
           dtypes.int64,
           default_value=-1,
           empty_key=0,
+          deleted_key=-1,
           initial_num_buckets=4)
       self.assertAllEqual(0, table.size().eval())
 
@@ -964,31 +1083,42 @@ class MutableDenseHashTableOpTest(test.TestCase):
       self.assertAllEqual(3, table.size().eval())
       self.assertAllEqual(4, len(table.export()[0].eval()))
 
-      keys2 = constant_op.constant([13, 14, 15, 16, 17], dtypes.int64)
-      values2 = constant_op.constant([3, 4, 5, 6, 7], dtypes.int64)
+      keys2 = constant_op.constant([12, 99], dtypes.int64)
+      table.remove(keys2).run()
+      self.assertAllEqual(2, table.size().eval())
+      self.assertAllEqual(4, len(table.export()[0].eval()))
+
+      keys3 = constant_op.constant([13, 14, 15, 16, 17], dtypes.int64)
+      values3 = constant_op.constant([3, 4, 5, 6, 7], dtypes.int64)
 
-      table.insert(keys2, values2).run()
-      self.assertAllEqual(7, table.size().eval())
+      table.insert(keys3, values3).run()
+      self.assertAllEqual(6, table.size().eval())
       self.assertAllEqual(16, len(table.export()[0].eval()))
 
-      keys3 = constant_op.constant([10, 11, 12, 13, 14, 15, 16, 17, 18],
+      keys4 = constant_op.constant([10, 11, 12, 13, 14, 15, 16, 17, 18],
                                    dtypes.int64)
-      output = table.lookup(keys3)
-      self.assertAllEqual([-1, 0, 1, 3, 4, 5, 6, 7, -1], output.eval())
+      output = table.lookup(keys4)
+      self.assertAllEqual([-1, 0, -1, 3, 4, 5, 6, 7, -1], output.eval())
 
   def testExport(self):
     with self.cached_session():
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([1, 2, 3], dtypes.int64)
+
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([1, 2, 3, 4], dtypes.int64)
       table = lookup.MutableDenseHashTable(
           dtypes.int64,
           dtypes.int64,
           default_value=-1,
           empty_key=100,
+          deleted_key=200,
           initial_num_buckets=8)
       self.assertAllEqual(0, table.size().eval())
 
       table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+
+      keys2 = constant_op.constant([12, 15], dtypes.int64)
+      table.remove(keys2).run()
       self.assertAllEqual(3, table.size().eval())
 
       exported_keys, exported_values = table.export()
@@ -1005,8 +1135,8 @@ class MutableDenseHashTableOpTest(test.TestCase):
       pairs = np.dstack((np_keys.flatten(), np_values.flatten()))[0]
       # sort by key
       pairs = pairs[pairs[:, 0].argsort()]
-      self.assertAllEqual([[11, 1], [12, 2], [13, 3], [100, 0], [100, 0],
-                           [100, 0], [100, 0], [100, 0]], pairs)
+      self.assertAllEqual([[11, 1], [13, 3], [14, 4], [100, 0], [100, 0],
+                           [100, 0], [100, 0], [200, 2]], pairs)
 
   def testSaveRestore(self):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
@@ -1015,13 +1145,15 @@ class MutableDenseHashTableOpTest(test.TestCase):
     with self.session(graph=ops.Graph()) as sess:
       default_value = -1
       empty_key = 0
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      deleted_key = -1
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       table = lookup.MutableDenseHashTable(
           dtypes.int64,
           dtypes.int64,
           default_value=default_value,
           empty_key=empty_key,
+          deleted_key=deleted_key,
           name="t1",
           checkpoint=True,
           initial_num_buckets=32)
@@ -1030,6 +1162,11 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
       self.assertAllEqual(0, table.size().eval())
       table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      keys2 = constant_op.constant([12, 15], dtypes.int64)
+      table.remove(keys2).run()
       self.assertAllEqual(3, table.size().eval())
       self.assertAllEqual(32, len(table.export()[0].eval()))
 
@@ -1043,6 +1180,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
           dtypes.int64,
           default_value=default_value,
           empty_key=empty_key,
+          deleted_key=deleted_key,
           name="t1",
           checkpoint=True,
           initial_num_buckets=64)
@@ -1062,7 +1200,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
       input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
       output = table.lookup(input_string)
-      self.assertAllEqual([-1, 0, 1, 2, -1], output.eval())
+      self.assertAllEqual([-1, 0, -1, 2, 3], output.eval())
 
   @test_util.run_in_graph_and_eager_modes
   def testObjectSaveRestore(self):
@@ -1071,6 +1209,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
     default_value = -1
     empty_key = 0
+    deleted_key = -1
     keys = constant_op.constant([11, 12, 13], dtypes.int64)
     values = constant_op.constant([0, 1, 2], dtypes.int64)
     save_table = lookup.MutableDenseHashTable(
@@ -1078,6 +1217,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
         dtypes.int64,
         default_value=default_value,
         empty_key=empty_key,
+        deleted_key=deleted_key,
         name="t1",
         checkpoint=True,
         initial_num_buckets=32)
@@ -1097,6 +1237,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
         dtypes.int64,
         default_value=default_value,
         empty_key=empty_key,
+        deleted_key=deleted_key,
         name="t1",
         checkpoint=True,
         initial_num_buckets=64)
@@ -1124,14 +1265,18 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
     with self.session(graph=ops.Graph()) as sess:
       empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
       default_value = constant_op.constant([-1, -2], dtypes.int64)
-      keys = constant_op.constant([[11, 12], [11, 14], [13, 14]], dtypes.int64)
-      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
+      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
+                                  dtypes.int64)
+      values = constant_op.constant([[0, 1], [2, 3], [2, 4], [4, 5]],
+                                    dtypes.int64)
       table = lookup.MutableDenseHashTable(
           dtypes.int64,
           dtypes.int64,
           default_value=default_value,
           empty_key=empty_key,
+          deleted_key=deleted_key,
           name="t1",
           checkpoint=True,
           initial_num_buckets=32)
@@ -1140,6 +1285,11 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
       self.assertAllEqual(0, table.size().eval())
       table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      keys2 = constant_op.constant([[12, 13], [16, 17]], dtypes.int64)
+      table.remove(keys2).run()
       self.assertAllEqual(3, table.size().eval())
       self.assertAllEqual(32, len(table.export()[0].eval()))
 
@@ -1149,12 +1299,14 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
     with self.session(graph=ops.Graph()) as sess:
       empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
       default_value = constant_op.constant([-1, -2], dtypes.int64)
       table = lookup.MutableDenseHashTable(
           dtypes.int64,
           dtypes.int64,
           default_value=default_value,
           empty_key=empty_key,
+          deleted_key=deleted_key,
           name="t1",
           checkpoint=True,
           initial_num_buckets=64)
@@ -1184,14 +1336,17 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
     with self.session(graph=ops.Graph()) as sess:
       empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
       default_value = constant_op.constant(-1, dtypes.int64)
-      keys = constant_op.constant([[11, 12], [11, 14], [13, 14]], dtypes.int64)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
+                                  dtypes.int64)
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       table = lookup.MutableDenseHashTable(
           dtypes.int64,
           dtypes.int64,
           default_value=default_value,
           empty_key=empty_key,
+          deleted_key=deleted_key,
           name="t2",
           checkpoint=True,
           initial_num_buckets=32)
@@ -1200,6 +1355,11 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
       self.assertAllEqual(0, table.size().eval())
       table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      keys2 = constant_op.constant([[12, 13], [15, 16]], dtypes.int64)
+      table.remove(keys2).run()
       self.assertAllEqual(3, table.size().eval())
       self.assertAllEqual(32, len(table.export()[0].eval()))
 
@@ -1209,12 +1369,14 @@ class MutableDenseHashTableOpTest(test.TestCase):
 
     with self.session(graph=ops.Graph()) as sess:
       empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
       default_value = constant_op.constant(-1, dtypes.int64)
       table = lookup.MutableDenseHashTable(
           dtypes.int64,
           dtypes.int64,
           default_value=default_value,
           empty_key=empty_key,
+          deleted_key=deleted_key,
           name="t2",
           checkpoint=True,
           initial_num_buckets=64)
@@ -1235,7 +1397,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
       input_string = constant_op.constant(
           [[11, 12], [11, 14], [11, 15], [13, 14], [13, 15]], dtypes.int64)
       output = table.lookup(input_string)
-      self.assertAllEqual([0, 1, -1, 2, -1], output.eval())
+      self.assertAllEqual([0, 1, -1, 3, -1], output.eval())
 
   def testReprobe(self):
     with self.cached_session():
@@ -1248,6 +1410,7 @@ class MutableDenseHashTableOpTest(test.TestCase):
           dtypes.int64,
           default_value=-1,
           empty_key=0,
+          deleted_key=-1,
           initial_num_buckets=8)
       self.assertAllEqual(0, table.size().eval())
 
@@ -1267,7 +1430,11 @@ class MutableDenseHashTableOpTest(test.TestCase):
       keys = constant_op.constant([11, 0, 13], dtypes.int64)
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup.MutableDenseHashTable(
-          dtypes.int64, dtypes.int64, default_value=-1, empty_key=12)
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=12,
+          deleted_key=-1)
       self.assertAllEqual(0, table.size().eval())
 
       table.insert(keys, values).run()
@@ -1283,19 +1450,35 @@ class MutableDenseHashTableOpTest(test.TestCase):
   def testErrors(self):
     with self.cached_session():
       table = lookup.MutableDenseHashTable(
-          dtypes.int64, dtypes.int64, default_value=-1, empty_key=0)
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1)
 
       # Inserting the empty key returns an error
-      keys = constant_op.constant([11, 0], dtypes.int64)
-      values = constant_op.constant([0, 1], dtypes.int64)
+      keys1 = constant_op.constant([11, 0], dtypes.int64)
+      values1 = constant_op.constant([0, 1], dtypes.int64)
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "empty_key"):
-        table.insert(keys, values).run()
+        table.insert(keys1, values1).run()
 
       # Looking up the empty key returns an error
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "empty_key"):
-        table.lookup(keys).eval()
+        table.lookup(keys1).eval()
+
+      # Inserting the deleted key returns an error
+      keys2 = constant_op.constant([11, -1], dtypes.int64)
+      values2 = constant_op.constant([0, 1], dtypes.int64)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "deleted_key"):
+        table.insert(keys2, values2).run()
+
+      # Looking up the empty key returns an error
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "deleted_key"):
+        table.lookup(keys2).eval()
 
       # Arbitrary tensors of keys are not supported
       keys = constant_op.constant([[11, 0], [12, 1]], dtypes.int64)
@@ -1312,11 +1495,43 @@ class MutableDenseHashTableOpTest(test.TestCase):
           dtypes.int64,
           default_value=-1,
           empty_key=17,
+          deleted_key=-1,
           initial_num_buckets=12)
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "Number of buckets must be"):
         self.assertAllEqual(0, table2.size().eval())
 
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError,
+          "Empty and deleted keys must have same shape"):
+        table3 = lookup.MutableDenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=42,
+            deleted_key=[1, 2])
+        self.assertAllEqual(0, table3.size().eval())
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Empty and deleted keys cannot be equal"):
+        table4 = lookup.MutableDenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=42,
+            deleted_key=42)
+        self.assertAllEqual(0, table4.size().eval())
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Empty and deleted keys cannot be equal"):
+        table5 = lookup.MutableDenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=[1, 2, 3],
+            deleted_key=[1, 2, 3])
+        self.assertAllEqual(0, table5.size().eval())
+
 
 class IndexTableFromFile(test.TestCase):
 
@@ -1455,7 +1670,7 @@ class IndexTableFromFile(test.TestCase):
       table = lookup.index_table_from_file(
           vocabulary_file=vocabulary_file, vocab_size=4)
       self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                              "Invalid vocab_size", table.init.run)
+                              "Invalid vocab_size", table.initializer.run)
 
   def test_index_table_from_file_with_vocab_size(self):
     vocabulary_file = self._createVocabFile("f2i_vocab8.txt")
@@ -1503,14 +1718,14 @@ class KeyValueTensorInitializerTest(test.TestCase):
       init = lookup.KeyValueTensorInitializer(
           ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
       table = lookup.HashTable(init, default_value=-1)
-      table.init.run()
+      table.initializer.run()
 
   def test_int64(self):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup.KeyValueTensorInitializer(
           (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64)
       table = lookup.HashTable(init, default_value=-1)
-      table.init.run()
+      table.initializer.run()
 
   def test_int32(self):
     with ops.Graph().as_default(), self.cached_session():
@@ -1519,7 +1734,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
       table = lookup.HashTable(init, default_value=-1)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "No OpKernel was registered"):
-        table.init.run()
+        table.initializer.run()
 
 
 class IndexTableFromTensor(test.TestCase):
@@ -1807,7 +2022,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                                    dtypes.int64,
                                    lookup.TextFileIndex.LINE_NUMBER),
         default_value)
-    self.evaluate(table.init)
+    self.evaluate(table.initializer)
 
     output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
 
@@ -1826,7 +2041,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                                      dtypes.int64,
                                      lookup.TextFileIndex.LINE_NUMBER),
           default_value)
-      table.init.run()
+      table.initializer.run()
 
       output = table.lookup(
           constant_op.constant((42, 1, 11), dtype=dtypes.int64))
@@ -1845,7 +2060,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
           lookup.TextFileInitializer(vocabulary_file, dtypes.int64,
                                      key_index, dtypes.string, value_index),
           default_value)
-      table.init.run()
+      table.initializer.run()
 
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       output = table.lookup(input_values)
@@ -1867,7 +2082,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
           lookup.TextFileInitializer(vocabulary_file, dtypes.string,
                                      key_index, dtypes.int64, value_index),
           default_value)
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "surgery"])
       output = table.lookup(input_string)
@@ -1889,7 +2104,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                                      key_index, dtypes.int64, value_index),
           default_value)
       with self.assertRaisesOpError("is not a valid"):
-        table.init.run()
+        table.initializer.run()
 
   def testInvalidDataType(self):
     vocabulary_file = self._createVocabFile("one_column_3.txt")
@@ -1917,7 +2132,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
           default_value)
 
       with self.assertRaisesOpError("Invalid number of columns"):
-        table.init.run()
+        table.initializer.run()
 
   def testInitializeSameTableWithMultipleNodes(self):
     vocabulary_file = self._createVocabFile("one_column_5.txt")
@@ -1986,7 +2201,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
           default_value)
 
       # Initialize from file.
-      table1.init.run()
+      table1.initializer.run()
       self.assertEquals(vocab_size, table1.size().eval())
 
       vocabulary_file2 = self._createVocabFile("one_column7.txt")
@@ -2001,7 +2216,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
               vocab_size=vocab_size),
           default_value)
       with self.assertRaisesOpError("Invalid vocab_size"):
-        table2.init.run()
+        table2.initializer.run()
 
       vocab_size = 1
       vocabulary_file3 = self._createVocabFile("one_column3.txt")
@@ -2016,7 +2231,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
           default_value)
 
       # Smaller vocab size reads only vocab_size records.
-      table3.init.run()
+      table3.initializer.run()
       self.assertEquals(vocab_size, table3.size().eval())
 
   def testFeedVocabularyName(self):
@@ -2034,11 +2249,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       # Initialize with non existing file (old_file.txt) should fail.
       # TODO(yleon): Update message, which might change per FileSystem.
       with self.assertRaisesOpError("old_file.txt"):
-        table.init.run()
+        table.initializer.run()
 
       # Initialize the model feeding the vocabulary file.
       filenames = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
-      table.init.run(feed_dict={filenames[0]: vocabulary_file})
+      table.initializer.run(feed_dict={filenames[0]: vocabulary_file})
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -2080,7 +2295,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
               vocab_file, vocab_size=vocab_size),
           default_value)
 
-      table.init.run()
+      table.initializer.run()
 
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
 
@@ -2097,7 +2312,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
           lookup.TextFileIdTableInitializer(
               vocab_file, vocab_size=vocab_size),
           default_value)
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
@@ -2115,7 +2330,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
           lookup.TextFileIdTableInitializer(
               vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
           default_value)
-      table.init.run()
+      table.initializer.run()
 
       out = table.lookup(
           constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
@@ -2144,7 +2359,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               default_value),
           oov_buckets)
 
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
@@ -2166,7 +2381,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
           oov_buckets,
           key_dtype=dtypes.int32)
 
-      table.init.run()
+      table.initializer.run()
 
       values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int32)
 
@@ -2187,7 +2402,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               default_value),
           oov_buckets)
 
-      table.init.run()
+      table.initializer.run()
 
       values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64)
 
@@ -2202,7 +2417,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       # Set a table that only uses hash buckets, for each input value returns
       # an id calculated by fingerprint("input") mod oov_buckets.
       table = lookup.IdTableWithHashBuckets(None, oov_buckets)
-      table.init.run()
+      table.initializer.run()
 
       values = constant_op.constant(("brain", "salad", "surgery"))
 
@@ -2224,7 +2439,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       # an id calculated by fingerprint("input") mod oov_buckets.
       table = lookup.IdTableWithHashBuckets(
           None, oov_buckets, key_dtype=dtypes.int32)
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant([42, 1, -1000], dtype=dtypes.int32)
 
@@ -2306,7 +2521,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               shared_name=shared_name),
           oov_buckets)
 
-      table1.init.run()
+      table1.initializer.run()
 
       input_string_1 = constant_op.constant(
           ["brain", "salad", "surgery", "UNK"])
@@ -2322,7 +2537,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       oov_buckets = 1
 
       # Underlying lookup table already initialized in previous session.
-      # No need to call table2.init.run()
+      # No need to call table2.initializer.run()
       table2 = lookup.IdTableWithHashBuckets(
           lookup.HashTable(
               lookup.TextFileIdTableInitializer(
@@ -2391,7 +2606,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
                   vocab_file, vocab_size=3),
               -1),
           1)
-      table.init.run()
+      table.initializer.run()
 
       sp_ids = table.lookup(sp_features)
 
@@ -2420,7 +2635,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               -1),
           1,
           key_dtype=dtypes.int32)
-      table.init.run()
+      table.initializer.run()
 
       sp_ids = table.lookup(sp_features)
 
@@ -2449,7 +2664,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               -1),
           1,
           key_dtype=dtypes.int64)
-      table.init.run()
+      table.initializer.run()
 
       sp_ids = table.lookup(sp_features)
 
@@ -2523,7 +2738,7 @@ class MutableHashTableBenchmark(test.Benchmark):
 
   def benchmark_many_repeated_scalar_insert_scalar(self):
     table = self._create_table()
-    c = counter.Counter().make_one_shot_iterator().get_next()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
     value = variables.Variable(1.0)
     insert = table.insert(c, value)
     size = table.size()
@@ -2544,7 +2759,7 @@ class MutableHashTableBenchmark(test.Benchmark):
 
   def benchmark_many_repeated_batch_32_insert_scalar(self):
     table = self._create_table()
-    c = counter.Counter().make_one_shot_iterator().get_next()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
     value = variables.Variable([1.0] * 32)
     insert = table.insert(32 * c + list(range(32)), value)
     size = table.size()
@@ -2558,7 +2773,11 @@ class MutableDenseHashTableBenchmark(MutableHashTableBenchmark):
 
   def _create_table(self):
     return lookup.MutableDenseHashTable(
-        dtypes.int64, dtypes.float32, default_value=0.0, empty_key=-1)
+        dtypes.int64,
+        dtypes.float32,
+        default_value=0.0,
+        empty_key=-1,
+        deleted_key=-2)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 651de4e2f446b2da39b000cde2541872116cbdba..709a042bbcefb89125f7e4cd14a0d7ecd2b53281 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -59,39 +59,12 @@ def _scale_losses(losses, weights):
   """
   # First, compute the sum of the losses over all elements:
   start_index = max(0, weights.get_shape().ndims)
-  reduction_indices = list(range(start_index, losses.get_shape().ndims))
-  reduced_losses = math_ops.reduce_sum(
-      losses, reduction_indices=reduction_indices)
+  axis = list(range(start_index, losses.get_shape().ndims))
+  reduced_losses = math_ops.reduce_sum(losses, axis=axis)
   reduced_losses = math_ops.multiply(reduced_losses, weights)
   return math_ops.reduce_sum(reduced_losses)
 
 
-def _safe_div(numerator, denominator, name="value"):
-  """Computes a safe divide which returns 0 if the denominator is zero.
-
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
-
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: A `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
-    name: An optional name for the returned op.
-
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator,
-                   array_ops.where(
-                       math_ops.equal(denominator, 0),
-                       array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator),
-      name=name)
-
-
 def _safe_mean(losses, num_present):
   """Computes a safe mean of the losses.
 
@@ -104,7 +77,7 @@ def _safe_mean(losses, num_present):
       then zero is returned.
   """
   total_loss = math_ops.reduce_sum(losses)
-  return _safe_div(total_loss, num_present)
+  return math_ops.div_no_nan(total_loss, num_present, name="value")
 
 
 @deprecated("2016-12-30", "Use tf.losses.compute_weighted_loss instead.")
@@ -184,10 +157,9 @@ def _num_present(losses, weights, per_batch=False):
 
   # First, count the number of nonzero weights:
   if weights.get_shape().ndims >= 1:
-    reduction_indices = list(range(1, weights.get_shape().ndims))
+    axis = list(range(1, weights.get_shape().ndims))
     num_nonzero_per_batch = math_ops.reduce_sum(
-        math_ops.to_float(math_ops.not_equal(weights, 0)),
-        reduction_indices=reduction_indices)
+        math_ops.to_float(math_ops.not_equal(weights, 0)), axis=axis)
 
   # Next, determine the number of elements that weights would broadcast to:
   broadcast_dims = array_ops.slice(
@@ -603,17 +575,20 @@ def mean_pairwise_squared_error(predictions,
     if weights.get_shape().ndims is None:
       raise ValueError("weights.get_shape().ndims cannot be None")
 
-    reduction_indices = list(range(1, diffs.get_shape().ndims))
+    axis = list(range(1, diffs.get_shape().ndims))
 
     sum_squares_diff_per_batch = math_ops.reduce_sum(
-        math_ops.square(diffs), reduction_indices=reduction_indices)
+        math_ops.square(diffs), axis=axis)
     num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, num_present_per_batch)
+    term1 = 2.0 * math_ops.div_no_nan(
+        sum_squares_diff_per_batch, num_present_per_batch, name="value")
 
-    sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices)
-    term2 = 2.0 * _safe_div(
-        math_ops.square(sum_diff), math_ops.square(num_present_per_batch))
+    sum_diff = math_ops.reduce_sum(diffs, axis=axis)
+    term2 = 2.0 * math_ops.div_no_nan(
+        math_ops.square(sum_diff),
+        math_ops.square(num_present_per_batch),
+        name="value")
 
     loss = _scale_losses(term1 - term2, weights)
 
@@ -668,7 +643,7 @@ def cosine_distance(predictions,
 
     radial_diffs = math_ops.multiply(predictions, labels)
     losses = 1 - math_ops.reduce_sum(
-        radial_diffs, reduction_indices=[
+        radial_diffs, axis=[
             axis,
         ])
     return compute_weighted_loss(losses, weights, scope=scope)
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 36125c198e01c177f66b78931ac30e38e17fc409..7ea6e34cf50ed8e292f11314550d992c3dde34c0 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -208,6 +208,16 @@ endif
 # override local versions in the source tree.
 INCLUDES += -I/usr/local/include
 
+# If `$(WITH_TFLITE_FLEX)` is `true`, this Makefile will build a library
+# for TensorFlow Lite Flex runtime.
+# Farmhash and Flatbuffer is required for TensorFlow Lite Flex runtime.
+ifeq ($(WITH_TFLITE_FLEX), true)
+	HOST_INCLUDES += -I$(MAKEFILE_DIR)/downloads/farmhash/src
+	HOST_INCLUDES += -I$(MAKEFILE_DIR)/downloads/flatbuffers/include
+	INCLUDES += -I$(MAKEFILE_DIR)/downloads/farmhash/src
+	INCLUDES += -I$(MAKEFILE_DIR)/downloads/flatbuffers/include
+endif
+
 LIBS := \
 $(TARGET_NSYNC_LIB) \
 -lstdc++ \
@@ -283,7 +293,7 @@ ifeq ($(TARGET),ANDROID)
 	else
 		ANDROID_HOST_OS_ARCH := $(ANDROID_HOST_OS_ARCH)-$(HOST_ARCH)
 	endif
-    
+
 	ifndef ANDROID_ARCH
 		ANDROID_ARCH := armeabi-v7a
 	endif
@@ -330,7 +340,7 @@ ifeq ($(TARGET),ANDROID)
 		BIN_PREFIX := x86_64-linux-android
 		MARCH_OPTION :=
 	endif
-    
+
 	ifndef NDK_ROOT
     $(error "NDK_ROOT is not defined.")
 	endif
@@ -717,6 +727,57 @@ tensorflow/core/util/reporter.cc \
 tensorflow/tools/benchmark/benchmark_model.cc \
 tensorflow/tools/benchmark/benchmark_model_main.cc
 
+# If `$(WITH_TFLITE_FLEX)` is `true`, this Makefile will build a library
+# for TensorFlow Lite Flex runtime.
+# Adding the following dependencies>
+# * TensorFlow Eager Runtime.
+# * TensorFlow Lite Runtime.
+# * TensorFlow Lite Flex Delegate.
+ifeq ($(WITH_TFLITE_FLEX), true)
+	EAGER_CC_ALL_SRCS += $(wildcard tensorflow/core/common_runtime/eager/*.cc)
+	EAGER_CC_EXCLUDE_SRCS := $(wildcard tensorflow/core/common_runtime/eager/*test.cc)
+	EAGER_CC_SRCS := $(filter-out $(EAGER_CC_EXCLUDE_SRCS), $(EAGER_CC_ALL_SRCS))
+	TF_CC_SRCS += $(EAGER_CC_SRCS)
+
+	TF_LITE_CORE_CC_ALL_SRCS := \
+	$(wildcard tensorflow/lite/*.cc) \
+	$(wildcard tensorflow/lite/*.c) \
+	$(wildcard tensorflow/lite/c/*.c) \
+	$(wildcard tensorflow/lite/core/api/*.cc)
+
+	TF_LITE_CORE_CC_ALL_SRCS += \
+	$(wildcard tensorflow/lite/kernels/*.cc) \
+	$(wildcard tensorflow/lite/kernels/internal/*.cc) \
+	$(wildcard tensorflow/lite/kernels/internal/optimized/*.cc) \
+	$(wildcard tensorflow/lite/kernels/internal/reference/*.cc) \
+	$(PROFILER_SRCS) \
+	$(wildcard tensorflow/lite/kernels/*.c) \
+	$(wildcard tensorflow/lite/kernels/internal/*.c) \
+	$(wildcard tensorflow/lite/kernels/internal/optimized/*.c) \
+	$(wildcard tensorflow/lite/kernels/internal/reference/*.c) \
+	$(wildcard tensorflow/lite/delegates/flex/*.cc)
+
+	# Hack. This shouldn't be here?
+	TF_LITE_CORE_CC_ALL_SRCS += \
+	$(wildcard tensorflow/contrib/makefile/downloads/farmhash/src/farmhash.cc) \
+
+	# Remove any duplicates.
+	TF_LITE_CORE_CC_ALL_SRCS := $(sort $(TF_LITE_CORE_CC_ALL_SRCS))
+	TF_LITE_CORE_CC_EXCLUDE_SRCS := \
+	$(wildcard tensorflow/lite/*test.cc) \
+	$(wildcard tensorflow/lite/*/*test.cc) \
+	$(wildcard tensorflow/lite/*/*/*test.cc) \
+	$(wildcard tensorflow/lite/*/*/*/*test.cc) \
+	$(wildcard tensorflow/lite/kernels/test_util.cc) \
+	$(wildcard tensorflow/lite/delegates/flex/test_util.cc) \
+	$(wildcard tensorflow/lite/nnapi_delegate.cc) \
+	$(wildcard tensorflow/lite/mmap_allocation_disabled.cc)
+
+	# Filter out all the excluded files.
+	TF_LITE_CC_SRCS := $(filter-out $(TF_LITE_CORE_CC_EXCLUDE_SRCS), $(TF_LITE_CORE_CC_ALL_SRCS))
+	TF_CC_SRCS += $(TF_LITE_CC_SRCS)
+endif
+
 ifdef HEXAGON_LIBS
 	TF_CC_SRCS += \
 tensorflow/cc/framework/scope.cc \
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 6c3b02e12b3082be8bfcc316c4c6122931eb5f76..1293e59cbcba86115e99b505b1f0672a01526462 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -142,7 +142,7 @@ First, download and install JetPack for Android version 3.2 or greater from [Nvi
 git clone https://github.com/tensorflow/tensorflow.git
 cd tensorflow
 JETPACK=$HOME/JetPack_Android_3.2
-TEGRA_LIBS="$JETPACK/cuDNN/aarch64/cuda/lib64/libcudnn.so  $JETPACK/cuda-9.0/extras/CUPTI/lib64/libcupti.so $JETPACK/cuda/targets/aarch64-linux-androideabi/lib64/libcufft.so"
+TEGRA_LIBS="$JETPACK/cuDNN/aarch64/cuda/lib64/libcudnn.so  $JETPACK/cuda/extras/CUPTI/lib64/libcupti.so $JETPACK/cuda/targets/aarch64-linux-androideabi/lib64/libcufft.so"
 ```
 
 #### Building all CUDA-enabled native binaries:
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index fb9e77ae1bcfc3404f1fdf90ab2697a4e79a9836..dc29694449729fe80d072aa06118ba0a8e64ca54 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -34,7 +34,7 @@ echo "********************************************************************"
 echo "TensorFlow Lite is the recommended library for mobile and embedded machine learning inference."
 echo "You are currently using an older version. Please switch over to TensorFlow Lite."
 echo ""
-echo "Link to the code: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite"
+echo "Link to the code: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite"
 echo "********************************************************************"
 echo ""
 
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index 1d4677ef4bd1e8811998d1464e63902544153a49..9a8059ce50041f21d00884896783a02e6285d55c 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -35,11 +35,10 @@ echo "********************************************************************"
 echo "TensorFlow Lite is the recommended library for mobile and embedded machine learning inference."
 echo "You are currently using an older version. Please switch over to TensorFlow Lite."
 echo ""
-echo "Link to the code: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite"
+echo "Link to the code: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite"
 echo "********************************************************************"
 echo ""
 
-DEFAULT_ARCH="i386 x86_64 armv7 armv7s arm64"
 while getopts "a:g:T" opt_name; do
   case "$opt_name" in
     a) BUILD_ARCH="${OPTARG}";;
@@ -138,7 +137,7 @@ if [[ ! -z "${BUILD_ARCH}" ]]; then
 fi
 
 # build the ios tensorflow libraries.
-echo "Building TensorFlow with flags: ${TF_SCRIPT_FLAGS} -f ${TF_CC_FLAGS}"
+echo "Building TensorFlow with command: ${TF_SCRIPT_FLAGS} -f ${TF_CC_FLAGS}"
 tensorflow/contrib/makefile/compile_ios_tensorflow.sh ${TF_SCRIPT_FLAGS} -f "${TF_CC_FLAGS}"
 
 # Creates a static universal library in
diff --git a/tensorflow/contrib/makefile/build_all_ios_with_tflite.sh b/tensorflow/contrib/makefile/build_all_ios_with_tflite.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8d34911f154101d4c0f4a02e69842986056b4b63
--- /dev/null
+++ b/tensorflow/contrib/makefile/build_all_ios_with_tflite.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# This shell script is used to build TensorFlow Lite Flex runtime for iOS.
+# It compiles TensorFlow Lite and TensorFlow codebases together, and enable a
+# route to use TensorFlow kernels in TensorFlow Lite.
+#
+# After the script is executed, the multi-architecture static libraries will be
+# created under: `tensorflow/contrib/makefile/gen/lib/`.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TOP_SRCDIR="${SCRIPT_DIR}/../../../"
+cd ${TOP_SRCDIR}
+
+# Exporting `WITH_TFLITE_FLEX`. The flag will be propagated all the way
+# down to Makefile.
+export WITH_TFLITE_FLEX="true"
+# Execute `build_all_ios.sh` and propagate all parameters.
+tensorflow/contrib/makefile/build_all_ios.sh $*
+
+# Copy all the libraries required for TFLite Flex runtime together.
+cd "${TOP_SRCDIR}/tensorflow/contrib/makefile"
+cp 'downloads/nsync/builds/lipo.ios.c++11/nsync.a' 'gen/lib/'
+cp 'gen/protobuf_ios/lib/libprotobuf.a' 'gen/lib/'
+cp 'gen/lib/libtensorflow-core.a' 'gen/lib/libtensorflow-lite.a'
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index dc9b17a62783817ec9a2998c4d5548c0f05e073b..2a5232b476712a96f84be0f4725beb78bc138297 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -30,11 +30,13 @@ EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE
 GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-# Note: The Protobuf source in `tensorflow/workspace.bzl` in TensorFlow
-# 1.10 branch does not work. `make distclean` fails and blocks the build
-# process. For now we're hardcoding to the version which is used by
-# TensorFlow 1.9.
-PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz"
+
+# Note: The protobuf repo needs to be cloned due to its submodules.
+# These variables contain the GitHub repo and the sha, from `tensorflow/workspace.bzl`,
+# from which to clone it from and checkout to.
+readonly PROTOBUF_REPO="https://github.com/protocolbuffers/protobuf.git"
+readonly PROTOBUF_TAG="$(grep -o 'https://github.com/protocolbuffers/protobuf/archive/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1 | awk '{print substr($0, index($0, "archive") + 8, index($0, "tar") - index($0, "archive") - 9) }')"
+
 # TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' once
 # the archive has been propagated in mirror.bazel.build.
 RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
@@ -43,6 +45,10 @@ DOUBLE_CONVERSION_URL="$(grep -o "https.*google/double-conversion.*\.zip" "${BZL
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 
+# Required for TensorFlow Lite Flex runtime.
+FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
+FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz"
+
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
 replace_by_sed() {
@@ -87,17 +93,46 @@ download_and_extract() {
   find "${dir}" -type f -name '*BUILD' -delete
 }
 
+function clone_repository() {
+  local repo_url="${1}"
+  local destination_directory="${2}"
+  local commit_sha="${3}"
+
+  if [[ -d "${destination_directory}" ]]; then
+    rm -rf "${destination_directory}"
+  fi
+
+  git clone "${repo_url}" "${destination_directory}"
+
+  pushd "$(pwd)" 1>/dev/null
+
+  cd "${destination_directory}"
+
+  if [[ -n "${commit_sha}" ]]; then
+    git checkout "${PROTOBUF_TAG}"
+  fi
+
+  git submodule update --init
+
+  popd 1>/dev/null
+}
+
 download_and_extract "${EIGEN_URL}" "${DOWNLOADS_DIR}/eigen"
 download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
 download_and_extract "${GOOGLETEST_URL}" "${DOWNLOADS_DIR}/googletest"
 download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync"
-download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf"
 download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2"
 download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
 download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion"
 download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive"
 
+# Required for TensorFlow Lite Flex runtime.
+download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
+download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+
+clone_repository "${PROTOBUF_REPO}" "${DOWNLOADS_DIR}/protobuf" "${PROTOBUF_TAG}"
+
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#static uint32x2_t p2ui_CONJ_XOR;// = vld1_u32( conj_XOR_DATA ); - Removed by scripts#' \
diff --git a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
index 0d8df93d1116afc1a56f598a9ea776010ae38fd0..87c73ec1ca610cac6d63468887bc350bada5910b 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
@@ -27,6 +27,7 @@ tensorflow/core/grappler/costs/op_performance_data.pb.cc
 tensorflow/core/lib/core/error_codes.pb.cc
 tensorflow/core/protobuf/cluster.pb.cc
 tensorflow/core/protobuf/config.pb.cc
+tensorflow/core/protobuf/eager_service.pb.cc
 tensorflow/core/protobuf/debug.pb.cc
 tensorflow/core/protobuf/device_properties.pb.cc
 tensorflow/core/protobuf/meta_graph.pb.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
index d982df93193268e1eb68a892592a48fbfbc50155..4120ea52ec5255b1efce7a6ce6890fc79c1e4831 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
@@ -29,6 +29,7 @@ tensorflow/core/protobuf/cluster.pb.h
 tensorflow/core/protobuf/config.pb.h
 tensorflow/core/protobuf/debug.pb.h
 tensorflow/core/protobuf/device_properties.pb.h
+tensorflow/core/protobuf/eager_service.pb.h
 tensorflow/core/protobuf/meta_graph.pb.h
 tensorflow/core/protobuf/named_tensor.pb.h
 tensorflow/core/protobuf/queue_runner.pb.h
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 91af933cfff695c9426fbceebfeb7cbc5eaaf80d..655c7eefcb978d40c8bc16a23685e03ed71bfb63 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -42,6 +42,7 @@ tensorflow/core/kernels/conv_grad_filter_ops.cc
 tensorflow/core/kernels/conv_grad_input_ops.cc
 tensorflow/core/kernels/conv_grad_ops.cc
 tensorflow/core/kernels/conv_ops.cc
+tensorflow/core/kernels/conv_ops_3d.cc
 tensorflow/core/kernels/conv_ops_fused.cc
 tensorflow/core/kernels/conv_ops_using_gemm.cc
 tensorflow/core/kernels/crop_and_resize_op.cc
@@ -52,6 +53,8 @@ tensorflow/core/kernels/cwise_op_add_2.cc
 tensorflow/core/kernels/cwise_op_bitwise_and.cc
 tensorflow/core/kernels/cwise_op_bitwise_or.cc
 tensorflow/core/kernels/cwise_op_bitwise_xor.cc
+tensorflow/core/kernels/cwise_op_cos.cc
+tensorflow/core/kernels/cwise_op_cosh.cc
 tensorflow/core/kernels/cwise_op_div.cc
 tensorflow/core/kernels/cwise_op_equal_to_1.cc
 tensorflow/core/kernels/cwise_op_equal_to_2.cc
@@ -86,10 +89,13 @@ tensorflow/core/kernels/cwise_op_rsqrt.cc
 tensorflow/core/kernels/cwise_op_select.cc
 tensorflow/core/kernels/cwise_op_sigmoid.cc
 tensorflow/core/kernels/cwise_op_sign.cc
+tensorflow/core/kernels/cwise_op_sin.cc
+tensorflow/core/kernels/cwise_op_sinh.cc
 tensorflow/core/kernels/cwise_op_sqrt.cc
 tensorflow/core/kernels/cwise_op_square.cc
 tensorflow/core/kernels/cwise_op_squared_difference.cc
 tensorflow/core/kernels/cwise_op_sub.cc
+tensorflow/core/kernels/cwise_op_tan.cc
 tensorflow/core/kernels/cwise_op_tanh.cc
 tensorflow/core/kernels/cwise_op_xdivy.cc
 tensorflow/core/kernels/cwise_op_xlogy.cc
@@ -151,14 +157,15 @@ tensorflow/core/kernels/mirror_pad_op_cpu_impl_2.cc
 tensorflow/core/kernels/mirror_pad_op_cpu_impl_3.cc
 tensorflow/core/kernels/mirror_pad_op_cpu_impl_4.cc
 tensorflow/core/kernels/mirror_pad_op_cpu_impl_5.cc
+tensorflow/core/kernels/multinomial_op.cc
 tensorflow/core/kernels/no_op.cc
 tensorflow/core/kernels/non_max_suppression_op.cc
 tensorflow/core/kernels/one_hot_op.cc
-tensorflow/core/kernels/ops_util.cc
 tensorflow/core/kernels/pack_op.cc
 tensorflow/core/kernels/pad_op.cc
 tensorflow/core/kernels/padding_fifo_queue.cc
 tensorflow/core/kernels/padding_fifo_queue_op.cc
+tensorflow/core/kernels/pooling_ops_3d.cc
 tensorflow/core/kernels/pooling_ops_common.cc
 tensorflow/core/kernels/population_count_op.cc
 tensorflow/core/kernels/quantization_utils.cc
@@ -244,7 +251,9 @@ tensorflow/core/kernels/spectrogram_op.cc
 tensorflow/core/kernels/split_lib_cpu.cc
 tensorflow/core/kernels/split_op.cc
 tensorflow/core/kernels/split_v_op.cc
+tensorflow/core/kernels/stack.cc
 tensorflow/core/kernels/stack_ops.cc
+tensorflow/core/kernels/stateless_random_ops.cc
 tensorflow/core/kernels/strided_slice_op.cc
 tensorflow/core/kernels/strided_slice_op_inst_0.cc
 tensorflow/core/kernels/strided_slice_op_inst_1.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index 8bec3e3e01f7913944cda8dafc5f1993e96570bd..2712e906d719e72dacb60e213205ad68895f905f 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -34,6 +34,7 @@ tensorflow/core/lib/core/error_codes.proto
 tensorflow/core/protobuf/cluster.proto
 tensorflow/core/protobuf/config.proto
 tensorflow/core/protobuf/debug.proto
+tensorflow/core/protobuf/eager_service.proto
 tensorflow/core/protobuf/device_properties.proto
 tensorflow/core/protobuf/meta_graph.proto
 tensorflow/core/protobuf/named_tensor.proto
diff --git a/tensorflow/contrib/metrics/python/metrics/classification.py b/tensorflow/contrib/metrics/python/metrics/classification.py
index 7053907da05b487df73481e3ced269bb69b8deae..9aabc4bec3053871e3ff6cd3a88fd76d293f48cc 100644
--- a/tensorflow/contrib/metrics/python/metrics/classification.py
+++ b/tensorflow/contrib/metrics/python/metrics/classification.py
@@ -18,13 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics_impl
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribution_strategy_context
 
 # TODO(nsilberman): move into metrics/python/ops/
 
@@ -167,15 +167,15 @@ def f1_score(labels, predictions, weights=None, num_thresholds=200,
           (precision_at_t + recall_at_t + epsilon))
       return math_ops.reduce_max(f1_at_thresholds)
 
-    def f1_across_towers(_, values):
+    def f1_across_replicas(_, values):
       best_f1 = compute_best_f1_score(tp=values['tp'], fp=values['fp'],
                                       fn=values['fn'], name='value')
       if metrics_collections:
         ops.add_to_collections(metrics_collections, best_f1)
       return best_f1
 
-    best_f1 = distribution_strategy_context.get_tower_context().merge_call(
-        f1_across_towers, values)
+    best_f1 = distribution_strategy_context.get_replica_context().merge_call(
+        f1_across_replicas, args=(values,))
 
     update_op = compute_best_f1_score(tp=update_ops['tp'], fp=update_ops['fp'],
                                       fn=update_ops['fn'], name='update')
diff --git a/tensorflow/contrib/metrics/python/metrics/classification_test.py b/tensorflow/contrib/metrics/python/metrics/classification_test.py
index d6a670f97b32a29129cb9ea0cd71c5a2b7597a47..e789d2cb9dfbac7b1e145be48b3f707af3fd4e18 100644
--- a/tensorflow/contrib/metrics/python/metrics/classification_test.py
+++ b/tensorflow/contrib/metrics/python/metrics/classification_test.py
@@ -291,12 +291,11 @@ class F1ScoreTest(test.TestCase):
 
     labels = labels.astype(np.float32)
     predictions = predictions.astype(np.float32)
-    tf_predictions, tf_labels = (dataset_ops.Dataset
-                                 .from_tensor_slices((predictions, labels))
-                                 .repeat()
-                                 .batch(batch_size)
-                                 .make_one_shot_iterator()
-                                 .get_next())
+    tf_predictions, tf_labels = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset
+        .from_tensor_slices((predictions, labels))
+        .repeat()
+        .batch(batch_size)).get_next()
     f1, f1_op = classification.f1_score(tf_labels, tf_predictions,
                                         num_thresholds=3)
 
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index bbf5d3f30c9f7fd0cbe2ad78da15ff3eb34ae2c5..7b432f8bd20989c6d95310bcaca88d44ce3e0d1f 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -45,24 +45,6 @@ from tensorflow.python.util.deprecation import deprecated
 _EPSILON = 1e-7
 
 
-def _safe_div(numerator, denominator, name):
-  """Divides two values, returning 0 if the denominator is <= 0.
-
-  Args:
-    numerator: A real `Tensor`.
-    denominator: A real `Tensor`, with dtype matching `numerator`.
-    name: Name for the returned op.
-
-  Returns:
-    0 if `denominator` <= 0, else `numerator` / `denominator`
-  """
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.truediv(numerator, denominator),
-      0,
-      name=name)
-
-
 @deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the '
             'order of the labels and predictions arguments has been switched.')
 def streaming_true_positives(predictions,
@@ -3238,22 +3220,20 @@ def streaming_covariance(predictions,
 
     # We update the means by Delta=Error*BatchCount/(BatchCount+PrevCount)
     # batch_mean_prediction is E[x_B] in the update equation
-    batch_mean_prediction = _safe_div(
-        math_ops.reduce_sum(weighted_predictions), batch_count,
-        'batch_mean_prediction')
-    delta_mean_prediction = _safe_div(
-        (batch_mean_prediction - mean_prediction) * batch_count, update_count,
-        'delta_mean_prediction')
+    batch_mean_prediction = math_ops.div_no_nan(
+        math_ops.reduce_sum(weighted_predictions), batch_count)
+    delta_mean_prediction = math_ops.div_no_nan(
+        (batch_mean_prediction - mean_prediction) * batch_count, update_count)
     update_mean_prediction = state_ops.assign_add(mean_prediction,
                                                   delta_mean_prediction)
     # prev_mean_prediction is E[x_A] in the update equation
     prev_mean_prediction = update_mean_prediction - delta_mean_prediction
 
     # batch_mean_label is E[y_B] in the update equation
-    batch_mean_label = _safe_div(
-        math_ops.reduce_sum(weighted_labels), batch_count, 'batch_mean_label')
-    delta_mean_label = _safe_div((batch_mean_label - mean_label) * batch_count,
-                                 update_count, 'delta_mean_label')
+    batch_mean_label = math_ops.div_no_nan(
+        math_ops.reduce_sum(weighted_labels), batch_count)
+    delta_mean_label = math_ops.div_no_nan(
+        (batch_mean_label - mean_label) * batch_count, update_count)
     update_mean_label = state_ops.assign_add(mean_label, delta_mean_label)
     # prev_mean_label is E[y_A] in the update equation
     prev_mean_label = update_mean_label - delta_mean_label
@@ -3436,7 +3416,7 @@ def streaming_mean_cosine_distance(predictions,
   predictions.get_shape().assert_is_compatible_with(labels.get_shape())
   radial_diffs = math_ops.multiply(predictions, labels)
   radial_diffs = math_ops.reduce_sum(
-      radial_diffs, reduction_indices=[
+      radial_diffs, axis=[
           dim,
       ], keepdims=True)
   mean_distance, update_op = streaming_mean(radial_diffs, weights, None, None,
@@ -3915,8 +3895,8 @@ def cohen_kappa(labels,
       po_sum = math_ops.reduce_sum(po)
       total = math_ops.reduce_sum(pe_row)
       pe_sum = math_ops.reduce_sum(
-          metrics_impl._safe_div(  # pylint: disable=protected-access
-              pe_row * pe_col, total, None))
+          math_ops.div_no_nan(
+              math_ops.to_double(pe_row * pe_col), math_ops.to_double(total)))
       po_sum, pe_sum, total = (math_ops.to_double(po_sum),
                                math_ops.to_double(pe_sum),
                                math_ops.to_double(total))
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
index 1b0383d24c0c472b4875d15c3650e37dfd2439e1..c922d0cd11fda3c51a51ceccf69798df7ce75f26 100644
--- a/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
@@ -29,7 +29,7 @@ from tensorflow.python.platform import test
 
 def _GetExampleIter(inputs):
   dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
-  return dataset.make_one_shot_iterator()
+  return dataset_ops.make_one_shot_iterator(dataset)
 
 
 class FixedLossScaleManagerTest(test.TestCase):
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
index 9009df0eefec13146090ba5fc2096e71ba6eb89d..33f9a43e803ea845a25bba284e41e5a0e6228dad 100644
--- a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
@@ -132,7 +132,7 @@ class LossScaleOptimizerTest(test.TestCase):
 
     x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1])
-    itr = dataset.make_one_shot_iterator()
+    itr = dataset_ops.make_one_shot_iterator(dataset)
 
     lr = 1
     opt = gd.GradientDescentOptimizer(lr)
@@ -182,7 +182,7 @@ class LossScaleOptimizerTest(test.TestCase):
 
     x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1])
-    itr = dataset.make_one_shot_iterator()
+    itr = dataset_ops.make_one_shot_iterator(dataset)
 
     lr = 1
     init_loss_scale = 8
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index b313024e2852caf2385454771b289ad0162cc463..45a60d79482787df4564ae3360f8252af93c7a26 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -51,7 +51,7 @@ The pruning library allows for specification of the following hyper parameters:
 | begin_pruning_step | integer | 0 | The global step at which to begin pruning |
 | end_pruning_step   | integer | -1 | The global step at which to terminate pruning. Defaults to -1 implying that pruning continues till  the training stops |
 | weight_sparsity_map | list of strings | [""] | list of weight variable name (or layer name):target sparsity pairs. Eg. [conv1:0.9,conv2/kernel:0.8]. For layers/weights not in this list, sparsity as specified by the target_sparsity hyperparameter is used. |
-| threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds |
+| threshold_decay | float | 0.0 | The decay factor to use for exponential decay of the thresholds |
 | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) |
 | nbins | integer | 256 | Number of bins to use for histogram computation. Note: When running on TPUs, a large (>1024) value for `nbins` may adversely affect the training time. |
 | block_height|integer | 1 | Number of rows in a block for block sparse matrices|
diff --git a/tensorflow/contrib/model_pruning/python/layers/core_layers.py b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
index 764ab620bc2227ff5e8e3f473d689e0e133e83d4..1fa5c8cb485704a5fccc486e823bbc4050bf505a 100644
--- a/tensorflow/contrib/model_pruning/python/layers/core_layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
@@ -119,15 +120,15 @@ class _MaskedConv(base.Layer):
     self.bias_initializer = bias_initializer
     self.kernel_regularizer = kernel_regularizer
     self.bias_regularizer = bias_regularizer
-    self.input_spec = base.InputSpec(ndim=self.rank + 2)
+    self.input_spec = input_spec.InputSpec(ndim=self.rank + 2)
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     channel_axis = 1 if self.data_format == 'channels_first' else -1
-    if input_shape[channel_axis].value is None:
+    if tensor_shape.dimension_value(input_shape[channel_axis]) is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
+    input_dim = tensor_shape.dimension_value(input_shape[channel_axis])
     kernel_shape = self.kernel_size + (input_dim, self.filters)
     self.mask = self.add_variable(
         name='mask',
@@ -171,7 +172,7 @@ class _MaskedConv(base.Layer):
           dtype=self.dtype)
     else:
       self.bias = None
-    self.input_spec = base.InputSpec(
+    self.input_spec = input_spec.InputSpec(
         ndim=self.rank + 2, axes={channel_axis: input_dim})
     self.built = True
 
@@ -393,19 +394,19 @@ class MaskedFullyConnected(base.Layer):
     self.bias_initializer = bias_initializer
     self.kernel_regularizer = kernel_regularizer
     self.bias_regularizer = bias_regularizer
-    self.input_spec = base.InputSpec(min_ndim=2)
+    self.input_spec = input_spec.InputSpec(min_ndim=2)
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
-    if input_shape[-1].value is None:
+    if tensor_shape.dimension_value(input_shape[-1]) is None:
       raise ValueError('The last dimension of the inputs to `Dense` '
                        'should be defined. Found `None`.')
-    self.input_spec = base.InputSpec(
-        min_ndim=2, axes={-1: input_shape[-1].value})
+    self.input_spec = input_spec.InputSpec(
+        min_ndim=2, axes={-1: tensor_shape.dimension_value(input_shape[-1])})
 
     self.kernel = self.add_variable(
         'kernel',
-        shape=[input_shape[-1].value, self.units],
+        shape=[tensor_shape.dimension_value(input_shape[-1]), self.units],
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
         dtype=self.dtype,
@@ -413,7 +414,7 @@ class MaskedFullyConnected(base.Layer):
 
     self.mask = self.add_variable(
         name='mask',
-        shape=[input_shape[-1].value, self.units],
+        shape=[tensor_shape.dimension_value(input_shape[-1]), self.units],
         initializer=init_ops.ones_initializer(),
         trainable=False,
         dtype=self.dtype)
@@ -470,7 +471,7 @@ class MaskedFullyConnected(base.Layer):
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     input_shape = input_shape.with_rank_at_least(2)
-    if input_shape[-1].value is None:
+    if tensor_shape.dimension_value(input_shape[-1]) is None:
       raise ValueError(
           'The innermost dimension of input_shape must be defined, but saw: %s'
           % input_shape)
diff --git a/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py b/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py
index 5f6c6aea74f2965ccfe552a58cde290b5506ef12..2959019d6d8eac489e0cc5ece61ea59ce725d604 100644
--- a/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py
+++ b/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py
@@ -94,7 +94,7 @@ class MaskedBasicLSTMCell(tf_rnn.BasicLSTMCell):
 
     self.built = False
 
-    input_depth = inputs_shape[1].value
+    input_depth = inputs_shape.dims[1].value
     h_depth = self._num_units
     self._mask = self.add_variable(
         name="mask",
@@ -243,7 +243,7 @@ class MaskedLSTMCell(tf_rnn.LSTMCell):
 
     self.built = False
 
-    input_depth = inputs_shape[1].value
+    input_depth = inputs_shape.dims[1].value
     h_depth = self._num_units
     self._mask = self.add_variable(
         name="mask",
@@ -304,7 +304,7 @@ class MaskedLSTMCell(tf_rnn.LSTMCell):
       c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
       m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
 
-    input_size = inputs.get_shape().with_rank(2)[1]
+    input_size = inputs.get_shape().with_rank(2).dims[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index 67e58ff15d83888d55364c02d4e04483ab86d096..f6b4373edd0544555dd16a373802d2feb5d674b1 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -204,13 +204,13 @@ def get_pruning_hparams():
       begin_pruning_step=0,
       end_pruning_step=-1,
       weight_sparsity_map=[''],
-      threshold_decay=0.9,
+      threshold_decay=0.0,
       pruning_frequency=10,
       nbins=256,
       block_height=1,
       block_width=1,
       block_pooling_function='AVG',
-      initial_sparsity=0,
+      initial_sparsity=0.0,
       target_sparsity=0.5,
       sparsity_function_begin_step=0,
       sparsity_function_end_step=100,
@@ -456,13 +456,14 @@ class Pruning(object):
 
       pool_window = [self._block_dim[0], self._block_dim[1]]
       pool_fn = pruning_utils.factorized_pool
-
+      squeeze_axis = None
       if not self._spec.use_tpu:
         pool_fn = nn_ops.pool
         abs_weights = array_ops.reshape(
             abs_weights,
             [1, abs_weights.get_shape()[0],
              abs_weights.get_shape()[1], 1])
+        squeeze_axis = [0, 3]
 
       pooled_weights = pool_fn(
           abs_weights,
@@ -473,7 +474,7 @@ class Pruning(object):
           name=weights.op.name + '_pooled')
 
       if pooled_weights.get_shape().ndims != 2:
-        pooled_weights = array_ops.squeeze(pooled_weights)
+        pooled_weights = array_ops.squeeze(pooled_weights, axis=squeeze_axis)
 
       smoothed_threshold, new_mask = self._update_mask(pooled_weights,
                                                        threshold)
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py
index 91b0bb7f6003c047e4dcf342695f433edbc11614..14fc51229ab53a77e8089040e8a8576babd0fafd 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py
@@ -188,7 +188,6 @@ def _histogram(values, value_range, nbins=100, dtype=dtypes.int32, name=None):
   with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
     values = ops.convert_to_tensor(values, name='values')
     values = array_ops.reshape(values, [-1])
-    value_range = ops.convert_to_tensor(value_range, name='value_range')
     nbins_float = np.float32(nbins)
 
     # Map tensor values that fall within value_range to [0, 1].
@@ -250,7 +249,6 @@ def compute_cdf(values, value_range, **kwargs):
   name = kwargs.get('name', None)
   with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
     values = ops.convert_to_tensor(values, name='values')
-    value_range = ops.convert_to_tensor(value_range, name='value_range')
     nbins_float = np.float32(nbins)
 
     # Map tensor values that fall within value_range to [0, 1].
@@ -336,7 +334,7 @@ def factorized_pool(input_tensor,
         padding=padding)
 
   return array_ops.squeeze(
-      array_ops.transpose(width_pooling, perm=[0, 1, 3, 2]))
+      array_ops.transpose(width_pooling, perm=[0, 1, 3, 2]), axis=[0, 1])
 
 
 def determine_partitioned_axis(partitioned_variable):
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
index 0aca843497611552d922715514118cac003c29b2..d6f2bfcb6c2e2beda912eb538d8a4a0a17b486b3 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
@@ -85,8 +85,28 @@ class PruningUtilsTest(test.TestCase):
 
 
 @parameterized.named_parameters(
-    ("1x1", [1, 1]), ("4x4", [4, 4]), ("6x6", [6, 6]), ("1x4", [1, 4]),
-    ("4x1", [4, 1]), ("1x8", [1, 8]), ("8x1", [8, 1]))
+    ("Input_32x32_block_1x1", [32, 32], [1, 1]),
+    # block size 6x6
+    ("Input_3x3_block_6x6", [3, 3], [6, 6]),
+    ("Input_32x32_block_6x6", [32, 32], [6, 6]),
+    ("Input_2x32_block_6x6", [2, 32], [6, 6]),
+    ("Input_32x2_block_6x6", [32, 2], [6, 6]),
+    ("Input_30x30_block_6x6", [30, 30], [6, 6]),
+    # block size 4x4
+    ("Input_32x32_block_4x4", [32, 32], [4, 4]),
+    ("Input_2x32_block_4x4", [2, 32], [4, 4]),
+    ("Input_32x2_block_4x4", [32, 2], [4, 4]),
+    ("Input_30x30_block_4x4", [30, 30], [4, 4]),
+    # block size 1x4
+    ("Input_32x32_block_1x4", [32, 32], [1, 4]),
+    ("Input_2x32_block_1x4", [2, 32], [1, 4]),
+    ("Input_32x2_block_1x4", [32, 2], [1, 4]),
+    ("Input_30x30_block_1x4", [30, 30], [1, 4]),
+    # block size 4x1
+    ("Input_32x32_block_4x1", [32, 32], [4, 1]),
+    ("Input_2x32_block_4x1", [2, 32], [4, 1]),
+    ("Input_32x2_block_4x1", [32, 2], [4, 1]),
+    ("Input_30x30_block_4x1", [30, 30], [4, 1]))
 class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
 
   def _compare_pooling_methods(self, weights, pooling_kwargs):
@@ -97,9 +117,11 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
               array_ops.reshape(
                   weights,
                   [1, weights.get_shape()[0],
-                   weights.get_shape()[1], 1]), **pooling_kwargs))
+                   weights.get_shape()[1], 1]), **pooling_kwargs),
+          axis=[0, 3])
       pooled_weights_factorized_pool = pruning_utils.factorized_pool(
           weights, **pooling_kwargs)
+
       self.assertAllClose(pooled_weights_tf.eval(),
                           pooled_weights_factorized_pool.eval())
 
@@ -113,8 +135,8 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
           [expanded_tensor, kronecker_product])
       self.assertAllEqual(expanded_tensor_val, kronecker_product_val)
 
-  def testFactorizedAvgPool(self, window_shape):
-    weights = variable_scope.get_variable("weights", shape=[1024, 2048])
+  def testFactorizedAvgPool(self, input_shape, window_shape):
+    weights = variable_scope.get_variable("weights", shape=input_shape)
     pooling_kwargs = {
         "window_shape": window_shape,
         "pooling_type": "AVG",
@@ -123,8 +145,8 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
     }
     self._compare_pooling_methods(weights, pooling_kwargs)
 
-  def testFactorizedMaxPool(self, window_shape):
-    weights = variable_scope.get_variable("weights", shape=[1024, 2048])
+  def testFactorizedMaxPool(self, input_shape, window_shape):
+    weights = variable_scope.get_variable("weights", shape=input_shape)
     pooling_kwargs = {
         "window_shape": window_shape,
         "pooling_type": "MAX",
@@ -133,8 +155,8 @@ class PruningUtilsParameterizedTest(test.TestCase, parameterized.TestCase):
     }
     self._compare_pooling_methods(weights, pooling_kwargs)
 
-  def testExpandTensor(self, block_dim):
-    weights = random_ops.random_normal(shape=[1024, 512])
+  def testExpandTensor(self, input_shape, block_dim):
+    weights = random_ops.random_normal(shape=input_shape)
     self._compare_expand_tensor_with_kronecker_product(weights, block_dim)
 
 
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
deleted file mode 100644
index 9a9d4802608dc94bf70082e1585de3890a5dbabf..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/nccl/BUILD
+++ /dev/null
@@ -1,177 +0,0 @@
-# Description:
-#   Wrap NVIDIA (https://github.com/NVIDIA/nccl) NCCL with tensorflow ops.
-#   APIs are meant to change over time.
-
-package(default_visibility = ["//tensorflow:__subpackages__"])
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_custom_op_library",
-    "tf_gen_op_libs",
-    "tf_gen_op_wrapper_py",
-)
-load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
-load("//tensorflow:tensorflow.bzl", "if_not_windows_cuda")
-
-tf_custom_op_library(
-    name = "python/ops/_nccl_ops.so",
-    srcs = [
-        "ops/nccl_ops.cc",
-    ],
-    gpu_srcs = if_not_windows_cuda([
-        "kernels/nccl_manager.cc",
-        "kernels/nccl_manager.h",
-        "kernels/nccl_ops.cc",
-    ]),
-    deps = [] + if_cuda([
-        "@local_config_nccl//:nccl",
-        "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:protos_all_proto_text",
-    ]),
-)
-
-tf_cuda_cc_test(
-    name = "nccl_manager_test",
-    size = "medium",
-    srcs = if_cuda(
-        [
-            "kernels/nccl_manager.cc",
-            "kernels/nccl_manager.h",
-            "kernels/nccl_manager_test.cc",
-        ],
-        [],
-    ),
-    # Disabled on jenkins until errors finding nvmlShutdown are found.
-    tags = [
-        "manual",
-        "multi_gpu",
-        "no_oss",
-        "noguitar",
-        "notap",
-    ],
-    deps =
-        if_cuda([
-            "@local_config_nccl//:nccl",
-            "//tensorflow/core:cuda",
-            "//tensorflow/core:test",
-            "//tensorflow/core:test_main",
-            "//tensorflow/core:testlib",
-        ]),
-)
-
-tf_kernel_library(
-    name = "nccl_kernels",
-    srcs = if_cuda([
-        "kernels/nccl_manager.cc",
-        "kernels/nccl_manager.h",
-        "kernels/nccl_ops.cc",
-        "kernels/nccl_rewrite.cc",
-    ]),
-    deps = if_cuda([
-        "@local_config_nccl//:nccl",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor",
-    ]),
-    alwayslink = 1,
-)
-
-tf_gen_op_libs(
-    op_lib_names = ["nccl_ops"],
-    deps = [
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "nccl_ops",
-    deps = [":nccl_ops_op_lib"],
-)
-
-# Test only nccl ops lib without dso to test behavior when NCCL lib is not
-# installed. See nccl_dependency_test for more details.
-#
-# Users should use the public nccl_py lib that also adds the dso.
-tf_custom_op_py_library(
-    name = "nccl_ops_lib_without_dso",
-    srcs = [
-        "__init__.py",
-        "python/ops/nccl_ops.py",
-    ],
-    kernels = if_cuda([":nccl_kernels"]) + [
-        ":nccl_ops_op_lib",
-    ],
-    deps = [
-        ":nccl_ops",
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:device",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
-tf_custom_op_py_library(
-    name = "nccl_py",
-    dso = [":python/ops/_nccl_ops.so"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":nccl_ops_lib_without_dso",
-    ],
-)
-
-cuda_py_test(
-    name = "nccl_ops_test",
-    size = "small",
-    srcs = ["python/ops/nccl_ops_test.py"],
-    additional_deps = [
-        ":nccl_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-    # Disabled on jenkins until errors finding nvmlShutdown are found.
-    tags = [
-        "manual",
-        "multi_gpu",
-        "no_oss",
-        "noguitar",
-        "notap",
-    ],
-)
-
-cuda_py_test(
-    name = "nccl_dependency_test",
-    size = "small",
-    srcs = ["python/ops/nccl_dependency_test.py"],
-    additional_deps = [
-        ":nccl_ops_lib_without_dso",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:platform_test",
-    ],
-    # Disable this test internally as static linking is used internally and only
-    # run for OSS to verify that NCCL is an optional dynamic dependency.
-    tags = [
-        "manual",
-        "noguitar",
-        "notap",
-    ],
-)
diff --git a/tensorflow/contrib/nccl/__init__.py b/tensorflow/contrib/nccl/__init__.py
deleted file mode 100644
index 4a682cb70369e1ae6edec67730618ab6a1ba6f47..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/nccl/__init__.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functions for using NVIDIA nccl collective ops.
-
-@@all_max
-@@all_min
-@@all_prod
-@@all_sum
-@@reduce_sum
-@@broadcast
-
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.nccl.python.ops.nccl_ops import all_max
-from tensorflow.contrib.nccl.python.ops.nccl_ops import all_min
-from tensorflow.contrib.nccl.python.ops.nccl_ops import all_prod
-from tensorflow.contrib.nccl.python.ops.nccl_ops import all_sum
-from tensorflow.contrib.nccl.python.ops.nccl_ops import broadcast
-from tensorflow.contrib.nccl.python.ops.nccl_ops import reduce_sum
-
-from tensorflow.python.util.all_util import remove_undocumented
-remove_undocumented(__name__)
diff --git a/tensorflow/contrib/nccl/kernels/nccl_ops.cc b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
deleted file mode 100644
index c2b76caef38a4af248387b65701b8f8936e8431f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/nccl/kernels/nccl_ops.cc
+++ /dev/null
@@ -1,246 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if GOOGLE_CUDA
-
-#include <vector>
-
-#include "third_party/nccl/nccl.h"
-#include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
-#include "tensorflow/core/framework/op_kernel.h"
-
-namespace tensorflow {
-namespace {
-
-// Base class for all communicator ops that use nccl.
-//
-// About memory management and stream syncing:
-// 1. The nccl communicator has a stream for each rank.
-// 2. For input tensors to the communicator, the compute stream is passed to the
-//    NcclManager which will do a needed
-//    communicator_stream.ThenWaitFor(input_tensor_stream).
-// 3. The done_callback of the async kernel is not called by the
-//    NcclManager until after the communicator kernel is complete. This
-//    is enough to a) keep the input tensor data valid for the lifetime of the
-//    collective; and b) ensure the data in the output tensor is available
-//    when the async op kernel's done callback is called.
-class NcclAsyncOpBase : public AsyncOpKernel {
- public:
-  explicit NcclAsyncOpBase(OpKernelConstruction* c) : AsyncOpKernel(c) {
-    OP_REQUIRES_OK(c, c->GetAttr("num_devices", &num_devices_));
-    OP_REQUIRES_OK(c, c->GetAttr("shared_name", &collective_prefix_));
-  }
-
-  string GetCollectiveKey(OpKernelContext* c) {
-    return strings::StrCat(collective_prefix_, ";", c->step_id(), ";",
-                           c->frame_iter().frame_id, ":",
-                           c->frame_iter().iter_id);
-  }
-
-  int num_devices() const { return num_devices_; }
-
- private:
-  int num_devices_;
-  string collective_prefix_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(NcclAsyncOpBase);
-};
-
-class NcclReduceOpBase : public NcclAsyncOpBase {
- public:
-  explicit NcclReduceOpBase(OpKernelConstruction* c) : NcclAsyncOpBase(c) {
-    string reduction;
-    OP_REQUIRES_OK(c, c->GetAttr("reduction", &reduction));
-    if (reduction == "min") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "max") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "sum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "prod") {
-      reduction_op_ = ncclProd;
-    } else {
-      OP_REQUIRES_OK(c,
-                     errors::InvalidArgument("Invalid reduction: ", reduction));
-    }
-  }
-
-  ncclRedOp_t reduction_op() const { return reduction_op_; }
-
- private:
-  ncclRedOp_t reduction_op_;
-};
-
-// To execute a single all-reduce, this kernel is called once for each of the
-// <k> devices in the communicator.
-class NcclAllReduceOpKernel : public NcclReduceOpBase {
- public:
-  explicit NcclAllReduceOpKernel(OpKernelConstruction* c)
-      : NcclReduceOpBase(c) {}
-
-  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    const Tensor* in_t = &c->input(0);
-    Tensor* out_t;
-    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, in_t->shape(), &out_t), done);
-
-    auto actual_done = [c, done](Status s) {
-      OP_REQUIRES_OK_ASYNC(c, s, done);
-      done();
-    };
-
-    auto* compute_stream = c->op_device_context()->stream();
-    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
-    NcclManager::instance()->AddToAllReduce(
-        num_devices(), GetCollectiveKey(c), reduction_op(),
-        compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
-        compute_stream, in_t, out_t, std::move(actual_done));
-  }
-};
-REGISTER_KERNEL_BUILDER(Name("NcclAllReduce").Device(DEVICE_GPU),
-                        NcclAllReduceOpKernel);
-
-// To execute a single reduce, this kernel is called once for all but one of the
-// <k> devices in the communicator, and NcclReduceRecvKernel is called once for
-// the remaining device.
-class NcclReduceSendKernel : public NcclReduceOpBase {
- public:
-  explicit NcclReduceSendKernel(OpKernelConstruction* c)
-      : NcclReduceOpBase(c) {}
-
-  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    auto actual_done = [c, done](Status s) {
-      OP_REQUIRES_OK_ASYNC(c, s, done);
-      done();
-    };
-
-    auto* compute_stream = c->op_device_context()->stream();
-    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
-    NcclManager::instance()->AddReduceSend(
-        num_devices(), GetCollectiveKey(c), reduction_op(),
-        compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
-        compute_stream, &c->input(0), std::move(actual_done));
-  }
-};
-REGISTER_KERNEL_BUILDER(Name("_NcclReduceSend").Device(DEVICE_GPU),
-                        NcclReduceSendKernel);
-
-// To execute a single reduce, this kernel is called once for one devices, and
-// NcclReduceSendKernel is called for all other <k-1> devices in the
-// communicator.
-class NcclReduceRecvKernel : public NcclReduceOpBase {
- public:
-  explicit NcclReduceRecvKernel(OpKernelConstruction* c)
-      : NcclReduceOpBase(c) {}
-
-  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    const Tensor& in_t = c->input(0);
-    Tensor* out_t;
-    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, in_t.shape(), &out_t), done);
-
-    auto actual_done = [c, done](Status s) {
-      OP_REQUIRES_OK_ASYNC(c, s, done);
-      done();
-    };
-
-    auto* compute_stream = c->op_device_context()->stream();
-    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
-    NcclManager::instance()->AddReduceRecv(
-        num_devices(), GetCollectiveKey(c), reduction_op(),
-        compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
-        compute_stream, &in_t, out_t, std::move(actual_done));
-  }
-
- private:
-  ncclRedOp_t reduction_op_;
-};
-REGISTER_KERNEL_BUILDER(Name("_NcclReduceRecv").Device(DEVICE_GPU),
-                        NcclReduceRecvKernel);
-
-// To execute a single broadcast, this kernel is called once for one device, and
-// NcclBroadcastRecvKernel is called for all other <k-1> devices in the
-// communicator.
-class NcclBroadcastSendKernel : public NcclAsyncOpBase {
- public:
-  explicit NcclBroadcastSendKernel(OpKernelConstruction* c)
-      : NcclAsyncOpBase(c) {}
-
-  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    auto actual_done = [c, done](Status s) {
-      OP_REQUIRES_OK_ASYNC(c, s, done);
-      done();
-    };
-
-    auto* compute_stream = c->op_device_context()->stream();
-    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
-    NcclManager::instance()->AddBroadcastSend(
-        num_devices(), GetCollectiveKey(c), compute_stream->parent(),
-        gpu_info->gpu_id, gpu_info->event_mgr, compute_stream, &c->input(0),
-        std::move(actual_done));
-  }
-};
-REGISTER_KERNEL_BUILDER(Name("_NcclBroadcastSend").Device(DEVICE_GPU),
-                        NcclBroadcastSendKernel);
-
-// To execute a single broadcast, this kernel is called once for all but one of
-// the <k> devices in the communicator, and NcclBroadcastSendKernel is called
-// once for the remaining device.
-class NcclBroadcastRecvKernel : public NcclAsyncOpBase {
- public:
-  explicit NcclBroadcastRecvKernel(OpKernelConstruction* c)
-      : NcclAsyncOpBase(c) {}
-
-  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    const Tensor& shape_t = c->input(0);
-    TensorShape shape;
-    OP_REQUIRES_OK_ASYNC(
-        c, TensorShapeUtils::MakeShape(shape_t.vec<int32>(), &shape), done);
-    Tensor* out_t;
-    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape, &out_t), done);
-
-    auto actual_done = [c, done](Status s) {
-      OP_REQUIRES_OK_ASYNC(c, s, done);
-      done();
-    };
-
-    auto* compute_stream = c->op_device_context()->stream();
-    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
-    NcclManager::instance()->AddBroadcastRecv(
-        num_devices(), GetCollectiveKey(c), compute_stream->parent(),
-        gpu_info->gpu_id, gpu_info->event_mgr, compute_stream, out_t,
-        std::move(actual_done));
-  }
-};
-REGISTER_KERNEL_BUILDER(
-    Name("_NcclBroadcastRecv").Device(DEVICE_GPU).HostMemory("shape"),
-    NcclBroadcastRecvKernel);
-
-// Define stub kernels for the ops that get replaced post placement.
-class NcclStubKernel : public AsyncOpKernel {
- public:
-  explicit NcclStubKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {}
-  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    c->SetStatus(errors::Unimplemented(
-        "This op should be replaced during graph optimization."));
-    done();
-  }
-};
-REGISTER_KERNEL_BUILDER(Name("NcclBroadcast").Device(DEVICE_GPU),
-                        NcclStubKernel);
-REGISTER_KERNEL_BUILDER(Name("NcclReduce").Device(DEVICE_GPU), NcclStubKernel);
-
-}  // namespace
-}  // namespace tensorflow
-
-#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/nccl/ops/nccl_ops.cc b/tensorflow/contrib/nccl/ops/nccl_ops.cc
deleted file mode 100644
index a353a34b80add119fcdc8bc4230eddf5a77b30e8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/nccl/ops/nccl_ops.cc
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-
-namespace tensorflow {
-
-using shape_inference::InferenceContext;
-using shape_inference::ShapeHandle;
-
-REGISTER_OP("NcclAllReduce")
-    .Input("input: T")
-    .Output("data: T")
-    .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {half, float, float64, int32, int64}")
-    .Attr("num_devices: int")
-    .Attr("shared_name: string")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Outputs a tensor containing the reduction across all input tensors passed to ops
-within the same `shared_name.
-
-The graph should be constructed so if one op runs with shared_name value `c`,
-then `num_devices` ops will run with shared_name value `c`.  Failure to do so
-will cause the graph execution to fail to complete.
-
-input: the input to the reduction
-data: the value of the reduction across all `num_devices` devices.
-reduction: the reduction operation to perform.
-num_devices: The number of devices participating in this reduction.
-shared_name: Identifier that shared between ops of the same reduction.
-)doc");
-
-// Note: This op has no kernel implementation, but is replaced by
-// _NcclReduceSend and _NcclReduceRecv during graph optimization stage.
-REGISTER_OP("NcclReduce")
-    .Input("input: num_devices * T")
-    .Output("data: T")
-    .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {half, float, float64, int32, int64}")
-    .Attr("num_devices: int")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Reduces `input` from `num_devices` using `reduction` to a single device.
-
-The graph should be constructed so that all inputs have a valid device
-assignment, and the op itself is assigned one of these devices.
-
-input: The input to the reduction.
-data: the value of the reduction across all `num_devices` devices.
-reduction: the reduction operation to perform.
-    )doc");
-
-REGISTER_OP("_NcclReduceSend")
-    .Input("input: T")
-    .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {half, float, float64, int32, int64}")
-    .Attr("num_devices: int")
-    .Attr("shared_name: string")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Replacement node for NcclReduce.
-
-Reduces `input` to the NcclReduceRecv op registered in the same `shared_name`.
-The graph should be constructed so that 'num_devices-1' devices run
-`_NcclReduceSend` and one device runs _NcclReduceRecv op with shared_name value
-`c`. Failure to do so will cause the graph execution to fail to complete.
-
-input: The input to the reduction.
-reduction: the reduction operation to perform.
-num_devices: The number of devices participating in this reduction.
-shared_name: Identifier that is shared between ops of the same reduce.
-    )doc");
-
-REGISTER_OP("_NcclReduceRecv")
-    .Input("input: T")
-    .Output("data: T")
-    .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {half, float, float64, int32, int64}")
-    .Attr("num_devices: int")
-    .Attr("shared_name: string")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Replacement node for NcclReduce.
-
-Reduces 'input' from this op and the NcclReduceSend ops registered in the same
-`shared_name`.
-The graph should be constructed so that 'num_devices-1' devices run
-`_NcclReduceSend` and one device runs _NcclReduceRecv op with shared_name value
-`c`. Failure to do so will cause the graph execution to fail to complete.
-
-input: The input to the reduction.
-data: The reduced data received from this op and the NcclReduceSend op.
-reduction: the reduction operation to perform.
-num_devices: The number of devices participating in this reduction.
-shared_name: Identifier that is shared between ops of the same reduce.
-    )doc");
-
-// Note: This op has no kernel implementation, but is replaced by
-// _NcclBroadcastSend and _NcclBroadcastRecv during graph optimization stage.
-REGISTER_OP("NcclBroadcast")
-    .Input("input: T")
-    .Output("output: T")
-    .Attr("T: {half, float, float64, int32, int64}")
-    .Attr("shape: shape")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-Sends `input` to all devices that are connected to the output.
-
-The graph should be constructed so that all ops connected to the output have a
-valid device assignment, and the op itself is assigned one of these devices.
-
-input: The input to the broadcast.
-output: The same as input.
-shape: The shape of the input tensor.
-    )doc");
-
-REGISTER_OP("_NcclBroadcastSend")
-    .Input("input: T")
-    .Attr("T: {half, float, float64, int32, int64}")
-    .Attr("num_devices: int")
-    .Attr("shared_name: string")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-Replacement node for NcclBroadcast.
-
-Sends `input` to the _NcclBroadcastRecv ops registered in the same
-`shared_name`.
-The graph should be constructed so that one device runs `_NcclBroadcastSend` and
-`num_devices-1` devices run _NcclBroadcastRecv ops with shared_name value `c`.
-Failure to do so will cause the graph execution to fail to complete.
-
-input: The input to the broadcast.
-num_devices: The number of devices participating in this reduction.
-shared_name: Identifier that is shared between ops of the same broadcast.
-    )doc");
-
-REGISTER_OP("_NcclBroadcastRecv")
-    .Input("shape: int32")
-    .Output("output: T")
-    .Attr("T: {half, float, float64, int32, int64}")
-    .Attr("num_devices: int")
-    .Attr("shared_name: string")
-    .SetIsStateful()
-    .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle out;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
-      c->set_output(0, out);
-      return Status::OK();
-    })
-    .Doc(R"doc(
-Replacement node for NcclBroadcast.
-
-Sends data of shape `shape` from the _NcclBroadcastSend op registered in the
-same `shared_name`.
-The graph should be constructed so that one device runs `_NcclBroadcastSend` and
-`num_devices-1` devices run _NcclBroadcastRecv ops with shared_name value `c`.
-Failure to do so will cause the graph execution to fail to complete.
-
-shape: The shape of the output.
-output: The broadcast data received from the NcclBroadcastSend op.
-num_devices: The number of devices participating in this reduction.
-shared_name: Identifier that is shared between ops of the same broadcast.
-    )doc");
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py b/tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py
deleted file mode 100644
index c766080dbee7c9a6f4383ef6fa8cade7bba158af..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/nccl/python/ops/nccl_dependency_test.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Dependency test for nccl to test behavior when NCCL is not installed."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib import nccl
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import tf_inspect
-
-
-class NcclDependencyTest(test.TestCase):
-  """Verifies that importing nccl ops lib does not fail even if NCCL is not
-  installed but nccl ops throws an exception on use if NCCL is not installed.
-  """
-
-  def test_nccl_ops(self):
-    """Tests behavior of nccl ops when NCCL is not installed."""
-
-    public_methods = [
-        m[0]
-        for m in tf_inspect.getmembers(nccl, tf_inspect.isfunction)
-        if not m[0].startswith('_')
-    ]
-    for method_name in public_methods:
-      with ops.device('/device:CPU:0'):
-        tensor = constant_op.constant(1)
-
-      if method_name == 'broadcast':
-        arg = tensor
-      else:
-        arg = [tensor]
-
-      nccl_op = getattr(nccl, method_name)
-      with ops.device('/device:CPU:0'):
-        with self.assertRaisesRegexp(errors_impl.NotFoundError,
-                                     r'cannot open shared object file'):
-          nccl_op(arg)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index 61d8b94eca27427754cb2806f33d95e5643c660f..a1e220924f3a7c37aa9c9f3c3c1cc479b9a95bc0 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -129,7 +129,7 @@ class AdaMaxOptimizerTest(test.TestCase):
 
   def testSparseDevicePlacement(self):
     for index_dtype in [dtypes.int32, dtypes.int64]:
-      with self.test_session(force_gpu=test.is_gpu_available()):
+      with self.cached_session(force_gpu=test.is_gpu_available()):
         # If a GPU is available, tests that all optimizer ops can be placed on
         # it (i.e. they have GPU kernels).
         var = variables.Variable([[1.0], [2.0]])
diff --git a/tensorflow/contrib/opt/python/training/addsign_test.py b/tensorflow/contrib/opt/python/training/addsign_test.py
index 6150fa117fa17f1d5bb668a71f1abcd78d1f89b8..2c74acd9fff805aeaeec64e75c38eaa60ede66fd 100644
--- a/tensorflow/contrib/opt/python/training/addsign_test.py
+++ b/tensorflow/contrib/opt/python/training/addsign_test.py
@@ -66,7 +66,7 @@ class AddSignTest(test.TestCase):
                  alpha=1.0,
                  beta=0.9):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         # Initialize variables for numpy implementation.
         m0, m1 = 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -169,7 +169,7 @@ class AddSignTest(test.TestCase):
                   alpha=1.0,
                   beta=0.9):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         # Initialize variables for numpy implementation.
         m0, m1 = 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
index 6c203e5519e6a66d20e2509eca3c74eb66bf32c7..fa1a7aaff0aa59a6a64b1f0bf836a273926d785d 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import saver
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.training.saving import saveable_object_util
 
 LOCAL_VARIABLE_NAME = 'local_center_variable'
 GLOBAL_VARIABLE_NAME = 'global_center_variable'
@@ -424,7 +425,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     if var_list is None:
       var_list = variables.trainable_variables()
     if not isinstance(var_list, dict):
-      var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
+      var_list = saveable_object_util.op_list_to_dict(var_list)
 
     swapped_var_list = {}
     for key, var in var_list.items():
@@ -464,4 +465,4 @@ class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
 
   def after_create_session(self, session, coord):
     """Run initialization ops"""
-    session.run(self._variable_init_op)
\ No newline at end of file
+    session.run(self._variable_init_op)
diff --git a/tensorflow/contrib/opt/python/training/external_optimizer.py b/tensorflow/contrib/opt/python/training/external_optimizer.py
index 82ebca7f20306e5658c8321716e39f9c7f8b8970..e5e52f7dc3a70892322c65ac968c14a9c3115df4 100644
--- a/tensorflow/contrib/opt/python/training/external_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/external_optimizer.py
@@ -429,7 +429,7 @@ def _accumulate(list_):
 
 
 def _get_shape_tuple(tensor):
-  return tuple(dim.value for dim in tensor.get_shape())
+  return tuple(tensor.get_shape().as_list())
 
 
 def _prod(array):
diff --git a/tensorflow/contrib/opt/python/training/ggt.py b/tensorflow/contrib/opt/python/training/ggt.py
index cae952d8f50acbc3a176697fb3989db6c9ac3e9b..6dc17fe5a5210fa1700e1382016e40fa0a792917 100644
--- a/tensorflow/contrib/opt/python/training/ggt.py
+++ b/tensorflow/contrib/opt/python/training/ggt.py
@@ -21,6 +21,7 @@ import collections
 import numpy as np
 from tensorflow.contrib.optimizer_v2 import optimizer_v2
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
@@ -120,7 +121,7 @@ class GGTOptimizer(optimizer_v2.OptimizerV2):
     # Construct ordered dictionary for variable dimensions, sorted by name.
     shape_dict = {}
     for v in var_list:
-      shape_dict[v.name] = np.prod(v.get_shape()).value
+      shape_dict[v.name] = tensor_shape.dimension_value(np.prod(v.get_shape()))
     self.shape_dict = collections.OrderedDict(
         sorted(shape_dict.items(), key=lambda t: t[0]))
 
diff --git a/tensorflow/contrib/opt/python/training/lars_optimizer.py b/tensorflow/contrib/opt/python/training/lars_optimizer.py
index a8dafd9a4cb9c669400f74b545b3c165bd49b2a2..bc18177b6d0b1d3f4fc58236bbc3d445fb73d80d 100644
--- a/tensorflow/contrib/opt/python/training/lars_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/lars_optimizer.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -162,3 +163,14 @@ class LARSOptimizer(optimizer.Optimizer):
         math_ops.cast(self._momentum_tensor, grad.dtype),
         use_locking=self._use_locking,
         use_nesterov=self._use_nesterov)
+
+  def _prepare(self):
+    learning_rate = self._learning_rate
+    if callable(learning_rate):
+      learning_rate = learning_rate()
+    self._learning_rate_tensor = ops.convert_to_tensor(
+        learning_rate, name="learning_rate")
+    momentum = self._momentum
+    if callable(momentum):
+      momentum = momentum()
+    self._momentum_tensor = ops.convert_to_tensor(momentum, name="momentum")
diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer_test.py b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer_test.py
index 089ecf597dfb1890137883d1c05d3c40db3112b5..65ad724b3c3b7f224cab3af398f8e54e0c1721da 100644
--- a/tensorflow/contrib/opt/python/training/lazy_adam_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_optimizer_test.py
@@ -107,7 +107,7 @@ class AdamOptimizerTest(test.TestCase, parameterized.TestCase):
   @parameterized.parameters([False, True])
   def testSparseDevicePlacement(self, use_resource):
     for index_dtype in [dtypes.int32, dtypes.int64]:
-      with self.test_session(force_gpu=test.is_gpu_available()):
+      with self.cached_session(force_gpu=test.is_gpu_available()):
         # If a GPU is available, tests that all optimizer ops can be placed on
         # it (i.e. they have GPU kernels).
         if use_resource:
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
index 9ce50bfe1054072b315adecb87f1ba729dfe0d83..bf3e5c51f78cc3ca3c7c77009c9cf428c4988953 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.training import moving_averages
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import saver
+from tensorflow.python.training.saving import saveable_object_util
 
 
 class MovingAverageOptimizer(optimizer.Optimizer):
@@ -106,6 +107,32 @@ class MovingAverageOptimizer(optimizer.Optimizer):
       self._swapped_variable_name_map[v_avg.op.name] = v.op.name
     return control_flow_ops.group(train_op, ma_op, name='train_with_avg')
 
+  def _find_swapped_variable(self, v_name_to_tensor, v_name, tensor):
+    """Returns name of swapped variable for given tensor.
+
+    Args:
+      v_name_to_tensor: Mapping from variable names to tensors.
+      v_name: name of the variable for which swapped variable should be returned
+      tensor: Tensor which correspond to variable for which swapped variable
+        should be returned.
+
+    Returns:
+      Tensor which correspond to swapped variable.
+
+    Raises:
+      ValueError: If swapped variable could not be found in v_name_to_tensor.
+    """
+    swapped_v_name = self._swapped_variable_name_map.get(v_name, None)
+    if swapped_v_name is None:
+      return tensor
+    else:
+      if swapped_v_name in v_name_to_tensor:
+        return v_name_to_tensor[swapped_v_name]
+      else:
+        raise ValueError(
+            ('Variable to swap %s is not part of variables to save. '
+             'This breaks MovingAverageOptimizer.') % swapped_v_name)
+
   def swapping_saver(self, var_list=None, name='swapping_saver', **kwargs):
     """Create a saver swapping moving averages and variables.
 
@@ -139,35 +166,35 @@ class MovingAverageOptimizer(optimizer.Optimizer):
     if var_list is None:
       var_list = variables.global_variables()
     if not isinstance(var_list, dict):
-      var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
-
-    # OpListToDict converts variables to tensors. We make sure we can get
-    # the unique variable name for normal and resource vaiables.
-    def get_v_name(tensor):
-      if tensor.op.type == 'ReadVariableOp':
-        return tensor.op.inputs[0].op.name
-      else:
-        return tensor.op.name
+      var_list = saveable_object_util.op_list_to_dict(var_list)
 
     v_name_to_tensor = {}
-    for tensor in six.itervalues(var_list):
-      v_name = get_v_name(tensor)
-      v_name_to_tensor[v_name] = tensor
+    for k, tensor_or_list in six.iteritems(var_list):
+      # For each partitioned variable OpListToDict returns list of constituent
+      # parts instead of single tensor.
+      if (isinstance(tensor_or_list, list)
+          or isinstance(tensor_or_list, variables.PartitionedVariable)):
+        for tensor in tensor_or_list:
+          v_name = tensor.op.name
+          v_name_to_tensor[v_name] = tensor
+      else:
+        v_name_to_tensor[k] = tensor_or_list
 
     # Now swap variables and moving averages
     swapped_var_list = {}
-    for k, tensor in six.iteritems(var_list):
-      v_name = get_v_name(tensor)
-      swapped_v_name = self._swapped_variable_name_map.get(v_name, None)
-      tensor_to_save = tensor
-      if swapped_v_name is not None:
-        if swapped_v_name in v_name_to_tensor:
-          tensor_to_save = v_name_to_tensor[swapped_v_name]
-        else:
-          raise ValueError(
-              ('Variable to swap %s is not part of variables to save. '
-               'This breaks MovingAverageOptimizer.') % swapped_v_name)
-      swapped_var_list[k] = tensor_to_save
+    for k, tensor_or_list in six.iteritems(var_list):
+      if isinstance(tensor_or_list, list):
+        tensor_list_to_save = []
+        for tensor in tensor_or_list:
+          v_name = tensor.op.name
+          swapped_variable = self._find_swapped_variable(v_name_to_tensor,
+                                                         v_name,
+                                                         tensor)
+          tensor_list_to_save.append(swapped_variable)
+        swapped_var_list[k] = tensor_list_to_save
+      else:
+        swapped_var_list[k] = self._find_swapped_variable(
+            v_name_to_tensor, k, tensor_or_list)
 
     # Build the swapping saver.
     return saver.Saver(swapped_var_list, name=name, **kwargs)
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index f22e7245285a8b2716645f9789eb5997928a22d2..643403eea6f88bcb33aa96d6539bc9a45a109c6b 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -26,6 +26,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -43,97 +45,171 @@ class MovingAverageOptimizerTest(test.TestCase):
     # Test that MovingAverageOptimizer works with resource variables.
     self._helpTestRun(use_resource=True)
 
-  def _helpTestRun(self, use_resource=False):
+  def testRunUsePartitionedVars(self):
+    # Test that MovingAverageOptimizer works with partitioned variables.
+    self._helpTestRun(use_partitioned_vars=True)
+
+  def testRunUseResourcePartitionedVars(self):
+    # Test that MovingAverageOptimizer works with resource and partitioned
+    # variables.
+    self._helpTestRun(use_partitioned_vars=True, use_resource=True)
+
+  def _helpTestRun(self, use_resource=False, use_partitioned_vars=False):
+    # Partitioned variables are represented as a "collection" of partitions.
+    # To simplify the test and reuse as much code as possible we employ
+    # following test strategy for partitioned variables.
+    #
+    # In the case of non-partitioned variables test runs on variables with
+    # shape [2].
+    #
+    # In the case of partitioned variables we use shape [4] with two partitions,
+    # thus each partition has shape [2].
+    # For partitioned variables the test is run twice (for loop over
+    # variable_part_names), first time on the first partition of each variable,
+    # second time on the second partition of each variable.
+    variable_part_names = ['part_0', 'part_1'] if use_partitioned_vars else ['']
     for sequential_update in [True, False]:
       for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-        with self.session(graph=ops.Graph()) as sess:
-          orig_val0 = [1.0, 2.0]
-          orig_val1 = [3.0, 4.0]
-          var0 = variable_scope.get_variable(
-              'var0',
-              initializer=constant_op.constant(orig_val0, dtype=dtype),
-              use_resource=use_resource)
-          var1 = variable_scope.get_variable(
-              'var1',
-              initializer=constant_op.constant(orig_val1, dtype=dtype),
-              use_resource=use_resource)
-          grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-          grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-
-          opt = moving_average_optimizer.MovingAverageOptimizer(
-              gradient_descent.GradientDescentOptimizer(learning_rate=2.0),
-              average_decay=0.5,
-              sequential_update=sequential_update)
-          save_dir = tempfile.mkdtemp(
-              prefix=os.path.join(self.get_temp_dir(), 'run_1'))
-          save_path = os.path.join(save_dir, 'model')
-          update = opt.apply_gradients(
-              list(six.moves.zip([grads0, grads1], [var0, var1])))
-          global_vars = variables.global_variables()
-          ema_var0 = [
-              v for v in global_vars
-              if v.op.name == 'var0/ExponentialMovingAverage'
-          ][0]
-          ema_var1 = [
-              v for v in global_vars
-              if v.op.name == 'var1/ExponentialMovingAverage'
-          ][0]
-          perturb = control_flow_ops.group([
-              state_ops.assign_add(var0, [1.0, 1.0]),
-              state_ops.assign_add(var1, [2.0, 2.0]),
-              state_ops.assign_add(ema_var0, [3.0, 3.0]),
-              state_ops.assign_add(ema_var1, [4.0, 4.0])
-          ])
-
-          # Test that saver with missing ema variables will fail.
-          with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
-            opt.swapping_saver(var_list=[var0])
-
-          train_saver = opt.swapping_saver()
-          train_saver_subset = opt.swapping_saver(var_list=[var0, ema_var0])
-          inference_saver = saver.Saver()
-          variables.global_variables_initializer().run()
-          # Step 1.
-          update.run()
-          self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
-          self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
-          if sequential_update:
-            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
-            self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
-          # Test that the swapping saver save/restore operation is identity.
-          train_saver.save(sess, save_path)
-          train_saver.restore(sess, save_path)
-          self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
-          self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
-          if sequential_update:
-            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
-            self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
-          # Test that the subset saver saves the EMA variable as well.
-          if sequential_update:
-            subset_save_path = save_path + '_subset'
-            train_saver_subset.save(sess, subset_save_path)
-            perturb.run()
-            self.assertAllCloseAccordingToType([1.8, 2.8], var0.eval())
-            self.assertAllCloseAccordingToType([3.9, 4.9], ema_var0.eval())
-            self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
-            self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
-            # Restoring should only restore var0 and ema_var0.
-            train_saver_subset.restore(sess, subset_save_path)
+        for var_part_name in variable_part_names:
+          with self.session(graph=ops.Graph()) as sess:
+            orig_val0 = [1.0, 2.0]
+            orig_val1 = [3.0, 4.0]
+            grads0 = [0.1, 0.1]
+            grads1 = [0.01, 0.01]
+            if use_partitioned_vars:
+              # Use partitioned variables.
+              # Create partitioned and duplicate each value used as initial
+              # value of variables.
+              partitioner = partitioned_variables.fixed_size_partitioner(
+                  num_shards=2)
+              orig_val0 = orig_val0 * 2
+              orig_val1 = orig_val1 * 2
+              grads0 = grads0 * 2
+              grads1 = grads1 * 2
+            else:
+              # Regular (non-partitioned) variables.
+              partitioner = None
+            var0 = variable_scope.get_variable(
+                'var0',
+                initializer=constant_op.constant(orig_val0, dtype=dtype),
+                use_resource=use_resource,
+                partitioner=partitioner)
+            var1 = variable_scope.get_variable(
+                'var1',
+                initializer=constant_op.constant(orig_val1, dtype=dtype),
+                use_resource=use_resource,
+                partitioner=partitioner)
+            # Make a fake loss, such that gradient(loss, var0) == grads0
+            # and gradient(loss, var1) == grads1
+            grads0 = constant_op.constant(grads0, dtype=dtype)
+            grads1 = constant_op.constant(grads1, dtype=dtype)
+            loss = (math_ops.reduce_sum(grads0 * var0)
+                    + math_ops.reduce_sum(grads1 * var1))
+
+            opt = moving_average_optimizer.MovingAverageOptimizer(
+                gradient_descent.GradientDescentOptimizer(learning_rate=2.0),
+                average_decay=0.5,
+                sequential_update=sequential_update)
+            save_dir = tempfile.mkdtemp(
+                prefix=os.path.join(self.get_temp_dir(), 'run_1'))
+            save_path = os.path.join(save_dir, 'model')
+
+            update = opt.minimize(loss)
+
+            # Get variables and their EMAs. In case of partitioned variables
+            # get proper part of each variable.
+            def _get_variable(var_name, part_name, ema):
+              """Returns variable of it's moving average by name."""
+              matches = [
+                  v for v in variables.global_variables()
+                  if ((var_name in v.op.name)
+                      and (part_name in v.op.name)
+                      and (('ExponentialMovingAverage' in v.op.name) == ema))
+              ]
+              self.assertEqual(len(matches), 1)
+              return matches[0]
+            var0 = _get_variable('var0', var_part_name, ema=False)
+            var1 = _get_variable('var1', var_part_name, ema=False)
+            ema_var0 = _get_variable('var0', var_part_name, ema=True)
+            ema_var1 = _get_variable('var1', var_part_name, ema=True)
+
+            perturb = control_flow_ops.group([
+                state_ops.assign_add(var0, [1.0, 1.0]),
+                state_ops.assign_add(var1, [2.0, 2.0]),
+                state_ops.assign_add(ema_var0, [3.0, 3.0]),
+                state_ops.assign_add(ema_var1, [4.0, 4.0])
+            ])
+
+            # Test that saver with missing ema variables will fail.
+            with self.assertRaisesRegexp(ValueError, r'Variable to swap'):
+              opt.swapping_saver(var_list=[var0])
+
+            train_saver = opt.swapping_saver()
+            train_saver_subset = opt.swapping_saver(var_list=[var0, ema_var0])
+            inference_saver = saver.Saver()
+            variables.global_variables_initializer().run()
+            # Step 1.
+            update.run()
             self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
-            self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
-            self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
-            self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
-            # Restore back to previous state.
+            self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
+            if sequential_update:
+              self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
+              self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
+            # Test that the swapping saver save/restore operation is identity.
+            train_saver.save(sess, save_path)
             train_saver.restore(sess, save_path)
+            self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
+            self.assertAllCloseAccordingToType([2.98, 3.98], var1.eval())
+            if sequential_update:
+              self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
+              self.assertAllCloseAccordingToType([2.99, 3.99], ema_var1.eval())
+            # Test that the subset saver saves the EMA variable as well.
+            if sequential_update:
+              subset_save_path = save_path + '_subset'
+              train_saver_subset.save(sess, subset_save_path)
+              perturb.run()
+              self.assertAllCloseAccordingToType([1.8, 2.8], var0.eval())
+              self.assertAllCloseAccordingToType([3.9, 4.9], ema_var0.eval())
+              self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
+              self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
+              # Restoring should only restore var0 and ema_var0.
+              train_saver_subset.restore(sess, subset_save_path)
+              self.assertAllCloseAccordingToType([0.8, 1.8], var0.eval())
+              self.assertAllCloseAccordingToType([0.9, 1.9], ema_var0.eval())
+              self.assertAllCloseAccordingToType([4.98, 5.98], var1.eval())
+              self.assertAllCloseAccordingToType([6.99, 7.99], ema_var1.eval())
+              # Restore back to previous state.
+              train_saver.restore(sess, save_path)
 
-          # If updates are parallel, this is not always true after the 1st step.
-          if sequential_update:
+            # If updates are parallel,
+            # this is not always true after the 1st step.
+            if sequential_update:
+              # Test that the normal saver will have the averaged variables.
+              # We test that the average values are between the original value
+              # and the most recent variable values (since they are an average
+              # of the two).
+              val0 = var0.eval()
+              val1 = var1.eval()
+              train_saver.save(sess, save_path)
+              inference_saver.restore(sess, save_path)
+              avg_val0 = var0.eval()
+              avg_val1 = var1.eval()
+              for i in six.moves.range(len(val0)):
+                self.assertLess(val0[i], avg_val0[i])
+                self.assertLess(avg_val0[i], orig_val0[i])
+                self.assertLess(val1[i], avg_val1[i])
+                self.assertLess(avg_val1[i], orig_val1[i])
+              train_saver.restore(sess, save_path)
+            # Step 2.
+            update.run()
             # Test that the normal saver will have the averaged variables.
-            # We test that the average values are between the original value
-            # and the most recent variable values (since they are an average
-            # of the two).
+            # We test that the average values are between the original value and
+            # the most recent variable values (since they are an average of the
+            # two).
             val0 = var0.eval()
             val1 = var1.eval()
+            self.assertAllCloseAccordingToType([0.6, 1.6], val0)
+            self.assertAllCloseAccordingToType([2.96, 3.96], val1)
             train_saver.save(sess, save_path)
             inference_saver.restore(sess, save_path)
             avg_val0 = var0.eval()
@@ -143,26 +219,6 @@ class MovingAverageOptimizerTest(test.TestCase):
               self.assertLess(avg_val0[i], orig_val0[i])
               self.assertLess(val1[i], avg_val1[i])
               self.assertLess(avg_val1[i], orig_val1[i])
-            train_saver.restore(sess, save_path)
-          # Step 2.
-          update.run()
-          # Test that the normal saver will have the averaged variables.
-          # We test that the average values are between the original value and
-          # the most recent variable values (since they are an average of the
-          # two).
-          val0 = var0.eval()
-          val1 = var1.eval()
-          self.assertAllCloseAccordingToType([0.6, 1.6], val0)
-          self.assertAllCloseAccordingToType([2.96, 3.96], val1)
-          train_saver.save(sess, save_path)
-          inference_saver.restore(sess, save_path)
-          avg_val0 = var0.eval()
-          avg_val1 = var1.eval()
-          for i in six.moves.range(len(val0)):
-            self.assertLess(val0[i], avg_val0[i])
-            self.assertLess(avg_val0[i], orig_val0[i])
-            self.assertLess(val1[i], avg_val1[i])
-            self.assertLess(avg_val1[i], orig_val1[i])
 
   def testFailWhenSaverCreatedBeforeInitialized(self):
     with self.cached_session():
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
index 44a8890cb107440b79cf8fbbdfcfda503b1c910f..960826407b66b4efa3c2693efb6d2e17c4b47b33 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -83,14 +84,14 @@ class NadamOptimizer(adam.AdamOptimizer):
     with ops.control_dependencies([m_t]):
       m_t = scatter_add(m, indices, m_scaled_g_values)
       # m_bar = (1 - beta1) * g_t + beta1 * m_t
-      m_bar = m_scaled_g_values + beta1_t * m_t
+      m_bar = m_scaled_g_values + beta1_t * array_ops.gather(m_t, indices)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
     v = self.get_slot(var, "v")
     v_scaled_g_values = (grad * grad) * (1 - beta2_t)
     v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
     with ops.control_dependencies([v_t]):
       v_t = scatter_add(v, indices, v_scaled_g_values)
-    v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(
-        var, lr * m_bar / (v_sqrt + epsilon_t), use_locking=self._use_locking)
+    v_t_slice = array_ops.gather(v_t, indices)
+    v_sqrt = math_ops.sqrt(v_t_slice)
+    var_update = scatter_add(var, indices, -lr * m_bar / (v_sqrt + epsilon_t))
     return control_flow_ops.group(*[var_update, m_bar, v_t])
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
index 85e05ce71cec6ef897cadb7d123e630febb3c064..a4372f64874e7591dbceac901fad6c941209bef9 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
@@ -52,14 +52,19 @@ def nadam_update_numpy(param,
 class NadamOptimizerTest(test.TestCase):
 
   def doTestSparse(self, use_resource=False):
+    # need to use a larger value of epsilon here so that
+    # np.sqrt(v_t) + epsilon doesn't get rounded to 0 when
+    # the dtype is half and np.sqrt(v_t) = 0, as is the case
+    # when the gradient is 0
+    sparse_epsilon = 1e-7
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
-        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
 
         if use_resource:
           var0 = resource_variable_ops.ResourceVariable(var0_np)
@@ -67,21 +72,21 @@ class NadamOptimizerTest(test.TestCase):
         else:
           var0 = variables.Variable(var0_np)
           var1 = variables.Variable(var1_np)
-        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
         grads0 = ops.IndexedSlices(
-            constant_op.constant(grads0_np),
-            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
-        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
         grads1 = ops.IndexedSlices(
-            constant_op.constant(grads1_np),
-            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
-        opt = nadam_optimizer.NadamOptimizer()
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = nadam_optimizer.NadamOptimizer(epsilon=sparse_epsilon)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 3.0, 4.0], var1.eval())
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
@@ -91,8 +96,10 @@ class NadamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
           update.run()
 
-          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
-          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0,
+                                               epsilon=sparse_epsilon)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1,
+                                               epsilon=sparse_epsilon)
 
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
diff --git a/tensorflow/contrib/opt/python/training/powersign_test.py b/tensorflow/contrib/opt/python/training/powersign_test.py
index 1cf9901dc07637675bf49abb764b4bc3cf0224ae..f2c87b588390e6f22700caf54c88044e8556e8e2 100644
--- a/tensorflow/contrib/opt/python/training/powersign_test.py
+++ b/tensorflow/contrib/opt/python/training/powersign_test.py
@@ -67,7 +67,7 @@ class PowerSignTest(test.TestCase):
                  base=math.e,
                  beta=0.9):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         # Initialize variables for numpy implementation.
         m0, m1 = 0.0, 0.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
@@ -173,7 +173,7 @@ class PowerSignTest(test.TestCase):
                   py_sign_decay_fn=None,
                   base=math.e,
                   beta=0.9):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
         # Initialize variables for numpy implementation.
         m0, m1 = 0.0, 0.0
diff --git a/tensorflow/contrib/opt/python/training/shampoo.py b/tensorflow/contrib/opt/python/training/shampoo.py
index f161521b979b6107396ce0e001480fa28a462c72..e542f46892a3cea60b758a1a95ce2f20d5f29a67 100644
--- a/tensorflow/contrib/opt/python/training/shampoo.py
+++ b/tensorflow/contrib/opt/python/training/shampoo.py
@@ -108,7 +108,8 @@ class ShampooOptimizer(optimizer.Optimizer):
       precond_update_interval: We should update the preconditioners after
                                this many steps. Default = 1. Usually less than
                                svd_interval.
-      epsilon:  epsilon * I_n is added to each mat_gbar_j for stability
+      epsilon:  epsilon * I_n is added to each mat_gbar_j for stability for
+                non-diagonal version of shampoo.
       alpha:  total power of the preconditioners.
       use_iterative_root: should the optimizer use SVD (faster) or the
                           iterative root method (for TPU) for finding the
@@ -394,15 +395,20 @@ class ShampooOptimizer(optimizer.Optimizer):
           assert self._mat_gbar_decay == 1.0
           mat_g_updated = state_ops.scatter_add(mat_g, indices,
                                                 mat_gbar_weight_t * grad_outer)
-          mat_h = math_ops.pow(
-              array_ops.gather(mat_g_updated, indices) + self._epsilon,
-              neg_alpha)
+          mat_g_updated_slice = array_ops.gather(mat_g_updated, indices)
+          mat_h = array_ops.where(
+              math_ops.greater(mat_g_updated_slice, 0),
+              math_ops.pow(mat_g_updated_slice, neg_alpha),
+              array_ops.zeros_like(mat_g_updated_slice))
         else:
           mat_g_updated = self._weighted_average(mat_g,
                                                  self._mat_gbar_decay,
                                                  mat_gbar_decay_t,
                                                  mat_gbar_weight_t * grad_outer)
-          mat_h = math_ops.pow(mat_g_updated + self._epsilon, neg_alpha)
+          mat_h = array_ops.where(
+              math_ops.greater(mat_g_updated, 0),
+              math_ops.pow(mat_g_updated, neg_alpha),
+              array_ops.zeros_like(mat_g_updated))
 
         # Need to do the transpose to ensure that the tensor becomes
         # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
diff --git a/tensorflow/contrib/opt/python/training/shampoo_test.py b/tensorflow/contrib/opt/python/training/shampoo_test.py
index a2fd8fbd871d2634791d123f833f7a7940862592..e88c8221a0d0ba17d9f691abecc03e6c88968a0a 100644
--- a/tensorflow/contrib/opt/python/training/shampoo_test.py
+++ b/tensorflow/contrib/opt/python/training/shampoo_test.py
@@ -279,7 +279,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
       # Update rule is var = var - lr * gg^{-0.5} * grad
       # lr = 1
       mat_g = (grad_np * grad_np)
-      new_val_np = init_var_np - np.power(mat_g + RIDGE_EPSILON, -0.5) * grad_np
+      new_val_np = init_var_np - np.power(mat_g, -0.5) * grad_np
 
       self.assertAllCloseAccordingToType(
           new_val_np, new_val, atol=TOLERANCE, rtol=TOLERANCE)
@@ -288,7 +288,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
       new_val = sess.run(var)
 
       mat_g += (grad_np_2 * grad_np_2)
-      new_val_np -= np.power(mat_g + RIDGE_EPSILON, -0.5) * grad_np_2
+      new_val_np -= np.power(mat_g, -0.5) * grad_np_2
 
       self.assertAllCloseAccordingToType(
           new_val_np, new_val, atol=TOLERANCE, rtol=TOLERANCE)
@@ -339,7 +339,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
 
       mat_g1 = np.sum(
           grad_np * grad_np, axis=1, keepdims=True) / grad_np.shape[0]
-      mat_left = np.power(mat_g1 + RIDGE_EPSILON, -0.25)
+      mat_left = np.power(mat_g1, -0.25)
       mat_g2 = np.dot(grad_np.transpose(), grad_np) / grad_np.shape[1]
       mat_right = np_power(mat_g2 + RIDGE_EPSILON * np.eye(size[1]), -0.25)
       new_val_np = init_var_np - np.dot(grad_np * mat_left, mat_right)
@@ -353,7 +353,7 @@ class ShampooTest(test.TestCase, parameterized.TestCase):
 
       mat_g1 += np.sum(
           grad_np_2 * grad_np_2, axis=1, keepdims=True) / grad_np_2.shape[0]
-      mat_left = np.power(mat_g1 + RIDGE_EPSILON, -0.25)
+      mat_left = np.power(mat_g1, -0.25)
       mat_g2 += np.dot(grad_np_2.transpose(), grad_np_2) / grad_np_2.shape[1]
       mat_right = np_power(mat_g2 + RIDGE_EPSILON * np.eye(size[1]), -0.25)
       new_val_np -= np.dot(grad_np_2 * mat_left, mat_right)
diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
index 200b0d200826a6212a236680327f4daf7d07831f..8b8065c678e11e8fc237e71cf1d392ced5c22ada 100644
--- a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
+++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
@@ -59,6 +59,23 @@ class DecoupledWeightDecayExtension(object):
   Note that this extension decays weights BEFORE applying the update based
   on the gradient, i.e. this extension only has the desired behaviour for
   optimizers which do not depend on the value of'var' in the update step!
+  
+  Note: when applying a decay to the learning rate, be sure to manually apply
+  the decay to the `weight_decay` as well. For example:
+
+  ```python
+    schedule = tf.train.piecewise_constant(tf.train.get_global_step(), 
+                                           [10000, 15000], [1e-0, 1e-1, 1e-2])
+    lr = 1e-1 * schedule()
+    wd = lambda: 1e-4 * schedule()
+
+    # ...
+
+    optimizer = tf.contrib.opt.MomentumWOptimizer(learning_rate=lr,
+                                                  weight_decay=wd,
+                                                  momentum=0.9,
+                                                  use_nesterov=True)
+  ```
   """
 
   def __init__(self, weight_decay, **kwargs):
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 3ba3ee29ec79687df522eb330665a2ce80061682..6e401406308604970677003aeea0f15c64cc74b6 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -48,7 +48,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:distribute",
         "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
@@ -56,6 +55,8 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:reduce_util",
     ],
 )
 
diff --git a/tensorflow/contrib/optimizer_v2/adagrad.py b/tensorflow/contrib/optimizer_v2/adagrad.py
index dab1e02716a9db23509eeb4a0f74200d0a5ae30e..346c3fbd2c82f5a830ba6b4d52aafdca8e9aff5f 100644
--- a/tensorflow/contrib/optimizer_v2/adagrad.py
+++ b/tensorflow/contrib/optimizer_v2/adagrad.py
@@ -66,15 +66,17 @@ class AdagradOptimizer(optimizer_v2.OptimizerV2):
     for v in var_list:
       dtype = v.dtype.base_dtype
       if v.get_shape().is_fully_defined():
-        init = init_ops.constant_initializer(self._initial_accumulator_value,
-                                             dtype=dtype)
+        init = init_ops.constant_initializer(
+            self._initial_accumulator_value, dtype=dtype)
       else:
+
         def init(v=v, dtype=dtype):
           # Use a Tensor instead of initializer if variable does not have
           # static shape.
-          init_constant = gen_array_ops.fill(array_ops.shape(v),
-                                             self._initial_accumulator_value)
+          init_constant = gen_array_ops.fill(
+              array_ops.shape(v), self._initial_accumulator_value)
           return math_ops.cast(init_constant, dtype)
+
       state.create_slot_with_initializer(v, init, v.get_shape(), dtype,
                                          "accumulator")
 
diff --git a/tensorflow/contrib/optimizer_v2/adagrad_test.py b/tensorflow/contrib/optimizer_v2/adagrad_test.py
index debaaaeeba998e6d41f1d2134b4ba4ce3f6b55c8..320e41567ff2400e857d7ed5f6b838b3d544b7c1 100644
--- a/tensorflow/contrib/optimizer_v2/adagrad_test.py
+++ b/tensorflow/contrib/optimizer_v2/adagrad_test.py
@@ -68,9 +68,6 @@ class AdagradOptimizerTest(test.TestCase):
   def testBasicResource(self):
     self.doTestBasic(use_locking=False, use_resource=True)
 
-  def testBasicLocked(self):
-    self.doTestBasic(use_locking=True)
-
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 04b1552b61ae45cb8370e94a0b8988913600708d..248ffb1f7eb5dc27112ddf9b8670344904065ed0 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -102,10 +102,10 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
   def _create_vars(self, var_list, state):
     # Non-slot variables end up on the same device(s).
-    state.create_non_slot(initial_value=lambda: state.get_hyper("beta1"),
-                          name="beta1_power")
-    state.create_non_slot(initial_value=lambda: state.get_hyper("beta2"),
-                          name="beta2_power")
+    state.create_non_slot(
+        initial_value=lambda: state.get_hyper("beta1"), name="beta1_power")
+    state.create_non_slot(
+        initial_value=lambda: state.get_hyper("beta2"), name="beta2_power")
 
     # Create slots for the first and second moments.
     for v in var_list:
@@ -117,28 +117,34 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
     v = state.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators(state)
     return training_ops.apply_adam(
-        var, m, v,
+        var,
+        m,
+        v,
         math_ops.cast(beta1_power, var.dtype.base_dtype),
         math_ops.cast(beta2_power, var.dtype.base_dtype),
         state.get_hyper("learning_rate", var.dtype.base_dtype),
         state.get_hyper("beta1", var.dtype.base_dtype),
         state.get_hyper("beta2", var.dtype.base_dtype),
         state.get_hyper("epsilon", var.dtype.base_dtype),
-        grad, use_locking=self._use_locking).op
+        grad,
+        use_locking=self._use_locking).op
 
   def _resource_apply_dense(self, grad, var, state):
     m = state.get_slot(var, "m")
     v = state.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators(state)
     return training_ops.resource_apply_adam(
-        var.handle, m.handle, v.handle,
+        var.handle,
+        m.handle,
+        v.handle,
         math_ops.cast(beta1_power, grad.dtype.base_dtype),
         math_ops.cast(beta2_power, grad.dtype.base_dtype),
         state.get_hyper("learning_rate", grad.dtype.base_dtype),
         state.get_hyper("beta1", grad.dtype.base_dtype),
         state.get_hyper("beta2", grad.dtype.base_dtype),
         state.get_hyper("epsilon", grad.dtype.base_dtype),
-        grad, use_locking=self._use_locking)
+        grad,
+        use_locking=self._use_locking)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add, state):
     beta1_power, beta2_power = self._get_beta_accumulators(state)
@@ -152,8 +158,7 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = state.get_slot(var, "m")
     m_scaled_g_values = grad * (1 - beta1_t)
-    m_t = state_ops.assign(m, m * beta1_t,
-                           use_locking=self._use_locking)
+    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
     with ops.control_dependencies([m_t]):
       m_t = scatter_add(m, indices, m_scaled_g_values)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
@@ -163,9 +168,8 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
     with ops.control_dependencies([v_t]):
       v_t = scatter_add(v, indices, v_scaled_g_values)
     v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(var,
-                                      lr * m_t / (v_sqrt + epsilon_t),
-                                      use_locking=self._use_locking)
+    var_update = state_ops.assign_sub(
+        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
   def _apply_sparse(self, grad, var, state):
@@ -177,21 +181,18 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
   def _resource_scatter_add(self, x, i, v):
     with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(
-            x.handle, i, v)]):
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
       return x.value()
 
   def _resource_apply_sparse(self, grad, var, indices, state):
-    return self._apply_sparse_shared(
-        grad, var, indices, self._resource_scatter_add, state)
+    return self._apply_sparse_shared(grad, var, indices,
+                                     self._resource_scatter_add, state)
 
   def _finish(self, state):
     # Update the power accumulators.
     beta1_power, beta2_power = self._get_beta_accumulators(state)
     update_beta1 = beta1_power.assign(
-        beta1_power * state.get_hyper("beta1"),
-        use_locking=self._use_locking)
+        beta1_power * state.get_hyper("beta1"), use_locking=self._use_locking)
     update_beta2 = beta2_power.assign(
-        beta2_power * state.get_hyper("beta2"),
-        use_locking=self._use_locking)
+        beta2_power * state.get_hyper("beta2"), use_locking=self._use_locking)
     return control_flow_ops.group(update_beta1, update_beta2)
diff --git a/tensorflow/contrib/optimizer_v2/adam_test.py b/tensorflow/contrib/optimizer_v2/adam_test.py
index b1ad0ade427df2abd209381a7020374850e19fa5..b55739f788ed81723a84e5534c1da4e281333482 100644
--- a/tensorflow/contrib/optimizer_v2/adam_test.py
+++ b/tensorflow/contrib/optimizer_v2/adam_test.py
@@ -109,7 +109,7 @@ class AdamOptimizerTest(test.TestCase):
 
   def testSparseDevicePlacement(self):
     for index_dtype in [dtypes.int32, dtypes.int64]:
-      with self.test_session(force_gpu=test.is_gpu_available()):
+      with self.cached_session(force_gpu=test.is_gpu_available()):
         # If a GPU is available, tests that all optimizer ops can be placed on
         # it (i.e. they have GPU kernels).
         var = variables.Variable([[1.0], [2.0]])
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index e13b82d1d27b07b6563f509e02901e4bcce4de8b..72019b31540a943582ebb4699013d9dcfc10769f 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -143,10 +143,11 @@ class CheckpointingTests(test.TestCase):
     suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
     expected_checkpoint_names = [
         name + suffix for name in expected_checkpoint_names]
-    # The Dense layers also save get_config() JSON
-    expected_checkpoint_names.extend(
-        ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
-         "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"])
+    # The optimizer and Dense layers also save get_config() JSON
+    expected_checkpoint_names.extend([
+        "model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
+        "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"
+    ])
     named_variables = {v.name: v for v in named_variables}
     six.assertCountEqual(self, expected_checkpoint_names,
                          named_variables.keys())
@@ -171,11 +172,10 @@ class CheckpointingTests(test.TestCase):
                      serialized_graph.nodes[0].children[1].local_name)
     optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
         1].node_id]
-    self.assertEqual("beta1_power",
-                     optimizer_node.children[0].local_name)
-    self.assertEqual("beta1_power",
-                     serialized_graph.nodes[optimizer_node.children[0].node_id]
-                     .attributes[0].full_name)
+    self.assertEqual("beta1_power", optimizer_node.children[0].local_name)
+    self.assertEqual(
+        "beta1_power", serialized_graph.nodes[optimizer_node.children[0]
+                                              .node_id].attributes[0].full_name)
     self.assertEqual(
         "my_model/dense/kernel",
         serialized_graph.nodes[optimizer_node.slot_variables[0]
@@ -241,9 +241,10 @@ class CheckpointingTests(test.TestCase):
     on_create_model = MyModel()
     on_create_optimizer = adam.AdamOptimizer(
         0.001,
-        # Preserve beta1_power and beta2_power when appying gradients so we can
-        # test that they've been restored correctly.
-        beta1=1.0, beta2=1.0)
+        # Preserve beta_1_power and beta_2_power when appying gradients
+        # so we can test that they've been restored correctly.
+        beta1=1.0,
+        beta2=1.0)
     on_create_root = util.Checkpoint(
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
@@ -263,9 +264,9 @@ class CheckpointingTests(test.TestCase):
     dummy_var = resource_variable_ops.ResourceVariable([1.])
     on_create_optimizer.minimize(loss=dummy_var.read_value)
     status.assert_consumed()
-    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
-    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
-    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+    beta_1_power, beta_2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta_1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta_2_power))
 
   # TODO(allenl): Debug garbage created by this test in python3.
   def testDeferredRestorationUsageEager(self):
@@ -556,8 +557,8 @@ class CheckpointingTests(test.TestCase):
         self.evaluate(first_variable.assign([1.]))
         self.evaluate(optimizer.get_slot(
             var=first_variable, name="m").assign([2.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(3.))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta_1_power.assign(3.))
 
       # Save and load in a second graph
       second_graph = ops.Graph()
@@ -571,29 +572,29 @@ class CheckpointingTests(test.TestCase):
         self.evaluate(second_variable.assign([4.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([5.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(6.))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta_1_power.assign(6.))
         save_path = second_root_checkpointable.save(checkpoint_prefix)
         self.evaluate(second_variable.assign([7.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([8.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta_1_power))
         status = second_root_checkpointable.restore(save_path)
         status.assert_consumed().run_restore_ops()
         self.assertAllEqual([4.], self.evaluate(second_variable))
         self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
             var=second_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta_1_power))
 
       # Check that the first graph is unmolested
       with first_graph.as_default(), first_session.as_default():
         self.assertAllEqual([1.], self.evaluate(first_variable))
         self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
             var=first_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(3., self.evaluate(beta1_power))
+        beta_1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta_1_power))
 
 
 class TemplateTests(test.TestCase):
@@ -659,8 +660,8 @@ class CheckpointCompatibilityTests(test.TestCase):
     self.evaluate(model._named_dense.bias.assign([1.]))
     self.evaluate(optimizer.get_slot(
         var=model._named_dense.bias, name="m").assign([2.]))
-    beta1_power, _ = optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(3.))
+    beta_1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta_1_power.assign(3.))
     return root_checkpointable
 
   def _set_sentinels(self, root_checkpointable):
@@ -669,8 +670,8 @@ class CheckpointCompatibilityTests(test.TestCase):
         root_checkpointable.optimizer.get_slot(
             var=root_checkpointable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(103.))
+    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta_1_power.assign(103.))
 
   def _check_sentinels(self, root_checkpointable):
     self.assertAllEqual(
@@ -678,8 +679,8 @@ class CheckpointCompatibilityTests(test.TestCase):
     self.assertAllEqual([2.], self.evaluate(
         root_checkpointable.optimizer.get_slot(
             var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.assertAllEqual(3., self.evaluate(beta1_power))
+    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta_1_power))
 
   def _write_name_based_checkpoint(self):
     checkpoint_directory = self.get_temp_dir()
@@ -750,8 +751,7 @@ class CheckpointCompatibilityTests(test.TestCase):
       save_path = root.save(file_prefix=checkpoint_prefix)
     with context.graph_mode():
       save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
-          graph=save_graph):
+      with save_graph.as_default(), self.test_session(graph=save_graph):
         root = self._initialized_model()
         self._set_sentinels(root)
         root.restore(save_path).assert_consumed().run_restore_ops()
diff --git a/tensorflow/contrib/optimizer_v2/gradient_descent.py b/tensorflow/contrib/optimizer_v2/gradient_descent.py
index 945c8de5595394341077ae13cae3161c71ad4f98..d103a55a3576eb192322bcabde13f55363d4af78 100644
--- a/tensorflow/contrib/optimizer_v2/gradient_descent.py
+++ b/tensorflow/contrib/optimizer_v2/gradient_descent.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""GradientDescent optimizer for TensorFlow."""
+"""Momentum for TensorFlow."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -56,11 +56,11 @@ class GradientDescentOptimizer(optimizer_v2.OptimizerV2):
     return training_ops.resource_apply_gradient_descent(
         handle.handle, lr, grad, use_locking=self._use_locking)
 
-  def _resource_apply_sparse_duplicate_indices(
-      self, grad, handle, indices, state):
+  def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices,
+                                               state):
     lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
-    return resource_variable_ops.resource_scatter_add(
-        handle.handle, indices, -grad * lr)
+    return resource_variable_ops.resource_scatter_add(handle.handle, indices,
+                                                      -grad * lr)
 
   def _apply_sparse_duplicate_indices(self, grad, var, state):
     delta = ops.IndexedSlices(
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 53e27c08c494c3d86d1af54d302459634a1f47b0..7fb23abc38d9dc101204ed83808aebe5a8ef1e78 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -22,6 +22,11 @@ from __future__ import print_function
 
 import abc
 
+import six
+
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -32,14 +37,13 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
 
 
+@six.add_metaclass(abc.ABCMeta)
 class _OptimizableVariable(object):
   """Interface for abstracting over variables in the optimizers."""
 
@@ -181,7 +185,8 @@ def _resolve(value, name):
 
 def _is_dynamic(value):
   """Returns true if __init__ arg `value` should be re-evaluated each step."""
-  if callable(value): return True
+  if callable(value):
+    return True
   # Don't need to do anything special in graph mode, since dynamic values
   # will propagate correctly automatically.
   # TODO(josh11b): Add per-device caching across steps using variables for
@@ -212,9 +217,11 @@ class _OptimizerV2State(object):
     # parameter value converted to a Tensor. Other items have dtype keys
     # with that Tensor cast to that dtype.
     with ops.init_scope():
-      self._hyper = {name: {None: ops.convert_to_tensor(value, name=name)}
-                     for name, (dynamic, value) in sorted(hyper.items())
-                     if not dynamic}
+      self._hyper = {
+          name: {
+              None: ops.convert_to_tensor(value, name=name)
+          } for name, (dynamic, value) in sorted(hyper.items()) if not dynamic
+      }
     self._slots = {}
     self._non_slot_dict = {}
     # Extra state to help Optimizers implement Checkpointable. Holds information
@@ -230,9 +237,11 @@ class _OptimizerV2State(object):
     ret._non_slot_dict = self._non_slot_dict
     ret._deferred_dependencies = self._deferred_dependencies
     ret._deferred_slot_restorations = self._deferred_slot_restorations
-    ret._hyper = {name: {None: _resolve(value, name)}
-                  for name, (dynamic, value) in sorted(hyper.items())
-                  if dynamic}
+    ret._hyper = {
+        name: {
+            None: _resolve(value, name)
+        } for name, (dynamic, value) in sorted(hyper.items()) if dynamic
+    }
     ret._hyper.update(self._hyper)
     ret._non_slot_devices = non_slot_devices
     ret._distribution = distribution
@@ -270,8 +279,8 @@ class _OptimizerV2State(object):
       var: A `Variable` object.
       val: A `Tensor`.  The initial value of the slot.
       slot_name: Name for the slot.
-      optional_op_name: Name to use when scoping the Variable that
-        needs to be created for the slot.
+      optional_op_name: Name to use when scoping the Variable that needs to be
+        created for the slot.
 
     Returns:
       A `Variable` object.
@@ -282,13 +291,17 @@ class _OptimizerV2State(object):
       new_slot_variable = slot_creator.create_slot(
           var, val, optional_op_name or self._op_name)
       self._restore_slot_variable(
-          slot_name=slot_name, variable=var,
-          slot_variable=new_slot_variable)
+          slot_name=slot_name, variable=var, slot_variable=new_slot_variable)
       named_slots[var_key] = new_slot_variable
     return named_slots[var_key]
 
-  def create_slot_with_initializer(self, var, initializer, shape, dtype,
-                                   slot_name, optional_op_name=None):
+  def create_slot_with_initializer(self,
+                                   var,
+                                   initializer,
+                                   shape,
+                                   dtype,
+                                   slot_name,
+                                   optional_op_name=None):
     """Find or create a slot for a variable, using an Initializer.
 
     Args:
@@ -297,8 +310,8 @@ class _OptimizerV2State(object):
       shape: Shape of the initial value of the slot.
       dtype: Type of the value of the slot.
       slot_name: Name for the slot.
-      optional_op_name: Name to use when scoping the Variable that
-        needs to be created for the slot.
+      optional_op_name: Name to use when scoping the Variable that needs to be
+        created for the slot.
 
     Returns:
       A `Variable` object.
@@ -309,8 +322,7 @@ class _OptimizerV2State(object):
       new_slot_variable = slot_creator.create_slot_with_initializer(
           var, initializer, shape, dtype, optional_op_name or self._op_name)
       self._restore_slot_variable(
-          slot_name=slot_name, variable=var,
-          slot_variable=new_slot_variable)
+          slot_name=slot_name, variable=var, slot_variable=new_slot_variable)
       named_slots[var_key] = new_slot_variable
     return named_slots[var_key]
 
@@ -320,8 +332,8 @@ class _OptimizerV2State(object):
     Args:
       var: A `Variable` object.
       slot_name: Name for the slot.
-      optional_op_name: Name to use when scoping the Variable that
-        needs to be created for the slot.
+      optional_op_name: Name to use when scoping the Variable that needs to be
+        created for the slot.
 
     Returns:
       A `Variable` object.
@@ -332,14 +344,15 @@ class _OptimizerV2State(object):
       new_slot_variable = slot_creator.create_zeros_slot(
           var, optional_op_name or self._op_name)
       self._restore_slot_variable(
-          slot_name=slot_name, variable=var,
-          slot_variable=new_slot_variable)
+          slot_name=slot_name, variable=var, slot_variable=new_slot_variable)
       named_slots[var_key] = new_slot_variable
     return named_slots[var_key]
 
-  def _create_or_restore_slot_variable(
-      self, slot_variable_position, slot_name, variable,
-      optional_op_name=None):
+  def _create_or_restore_slot_variable(self,
+                                       slot_variable_position,
+                                       slot_name,
+                                       variable,
+                                       optional_op_name=None):
     """Restore a slot variable's value, possibly creating it.
 
     Called when a variable which has an associated slot variable is created or
@@ -358,8 +371,8 @@ class _OptimizerV2State(object):
         indicating the slot variable `Checkpointable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
-      optional_op_name: Name to use when scoping the Variable that
-        needs to be created for the slot.
+      optional_op_name: Name to use when scoping the Variable that needs to be
+        created for the slot.
     """
     slot_variable = self.get_slot(var=variable, name=slot_name)
     if (slot_variable is None and context.executing_eagerly() and
@@ -393,9 +406,8 @@ class _OptimizerV2State(object):
       # UID in case slot variables have their own dependencies, in which case
       # those could differ between restores.
       variable_key = _var_key_v2(variable)
-      self._deferred_slot_restorations.setdefault(
-          slot_name, {}).setdefault(variable_key, []).append(
-              slot_variable_position)
+      self._deferred_slot_restorations.setdefault(slot_name, {}).setdefault(
+          variable_key, []).append(slot_variable_position)
 
   def get_slot(self, var, name):
     """Return a slot named `name` created for `var` by the Optimizer.
@@ -433,8 +445,9 @@ class _OptimizerV2State(object):
     """Add an extra variable, not associated with a slot."""
     v = self._non_slot_dict.get(name, None)
     if v is None:
-      if colocate_with is None: colocate_with = self._non_slot_devices
-      with self._distribution.colocate_vars_with(colocate_with):
+      if colocate_with is None:
+        colocate_with = self._non_slot_devices
+      with self._distribution.extended.colocate_vars_with(colocate_with):
         # TODO(josh11b): Use get_variable() except for the legacy Adam use case.
         v = variable_scope.variable(initial_value, name=name, trainable=False)
       self._non_slot_dict[name] = v
@@ -453,8 +466,8 @@ class _OptimizerV2State(object):
         slot_name, {}).pop(variable_key, [])
     # Iterate over restores, highest restore UID first to minimize the number
     # of assignments.
-    deferred_restorations.sort(key=lambda position: position.restore_uid,
-                               reverse=True)
+    deferred_restorations.sort(
+        key=lambda position: position.restore_uid, reverse=True)
     for checkpoint_position in deferred_restorations:
       checkpoint_position.restore(slot_variable)
 
@@ -611,9 +624,9 @@ class OptimizerV2(optimizer_v1.Optimizer):
     # Optimizer._create_slots was replaced by _create_vars in OptimizerV2.
     if (self.__class__._create_slots.__code__ is not  # pylint: disable=protected-access
         OptimizerV2._create_slots.__code__):
-      raise RuntimeError("Override _create_vars instead of _create_slots when "
-                         "descending from OptimizerV2 (class %s)" %
-                         self.__class__.__name__)
+      raise RuntimeError(
+          "Override _create_vars instead of _create_slots when "
+          "descending from OptimizerV2 (class %s)" % self.__class__.__name__)
     if not name:
       raise ValueError("Must specify the optimizer name")
 
@@ -622,16 +635,16 @@ class OptimizerV2(optimizer_v1.Optimizer):
     # Map from graph_key to state for that graph. We use the graph_key
     # since it works in both eager and graph mode, and gives the outer
     # graph inside functions.
-    tower_context = distribution_strategy_context.get_tower_context()
-    if tower_context is None:
-      # In a cross-tower context for a DistributionStrategy, which means
-      # only one Optimizer will be created, not one per tower.
+    replica_context = distribute_ctx.get_replica_context()
+    if replica_context is None:
+      # In a cross-replica context for a DistributionStrategy, which means
+      # only one Optimizer will be created, not one per replica.
       self._per_graph_state = {}
     else:
-      # We use get_tower_context().merge_call() to get a single dict
+      # We use get_replica_context().merge_call() to get a single dict
       # shared across all model replicas when running with a
       # DistributionStrategy.
-      self._per_graph_state = tower_context.merge_call(lambda _: {})
+      self._per_graph_state = replica_context.merge_call(lambda _: {})
 
     # Hyper parameters, and whether they should be re-evaluated every step.
     self._hyper = {}
@@ -639,11 +652,16 @@ class OptimizerV2(optimizer_v1.Optimizer):
   def _set_hyper(self, name, value):
     self._hyper[name] = (_is_dynamic(value), value)
 
-  def minimize(self, loss, global_step=None, var_list=None,
-               gate_gradients=GATE_OP, aggregation_method=None,
-               colocate_gradients_with_ops=False, name=None,
-               grad_loss=None, stop_gradients=None,
-               scale_loss_by_num_towers=None):
+  def minimize(self,
+               loss,
+               global_step=None,
+               var_list=None,
+               gate_gradients=GATE_OP,
+               aggregation_method=None,
+               name=None,
+               grad_loss=None,
+               stop_gradients=None,
+               scale_loss_by_num_replicas=None):
     """Add operations to minimize `loss` by updating `var_list`.
 
     This method simply combines calls `compute_gradients()` and
@@ -653,24 +671,22 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     Args:
       loss: A `Tensor` containing the value to minimize.
-      global_step: Optional `Variable` to increment by one after the
-        variables have been updated.
+      global_step: Optional `Variable` to increment by one after the variables
+        have been updated.
       var_list: Optional list or tuple of `Variable` objects to update to
-        minimize `loss`.  Defaults to the list of variables collected in
-        the graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
+        minimize `loss`.  Defaults to the list of variables collected in the
+        graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
       gate_gradients: How to gate the computation of gradients.  Can be
         `GATE_NONE`, `GATE_OP`, or  `GATE_GRAPH`.
       aggregation_method: Specifies the method used to combine gradient terms.
         Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with
-        the corresponding op.
       name: Optional name for the returned operation.
       grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
         through.
-      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
-        down by the number of towers. By default, auto-detects whether this
-        is needed.
+      scale_loss_by_num_replicas: Optional boolean. If true, scale the loss down
+        by the number of replicas. By default, auto-detects whether this is
+        needed.
 
     Returns:
       An Operation that updates the variables in `var_list`.  If `global_step`
@@ -686,16 +702,18 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Minimization (and gradient computation) is done with respect to the
     elements of `var_list` if not None, else with respect to any trainable
     variables created during the execution of the `loss` function.
-    `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
-    `grad_loss` are ignored when eager execution is enabled.
+    `gate_gradients`, `aggregation_method`, and `grad_loss` are ignored when
+    eager execution is enabled.
     @end_compatibility
     """
     grads_and_vars = self.compute_gradients(
-        loss, var_list=var_list, gate_gradients=gate_gradients,
+        loss,
+        var_list=var_list,
+        gate_gradients=gate_gradients,
         aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
-        grad_loss=grad_loss, stop_gradients=stop_gradients,
-        scale_loss_by_num_towers=scale_loss_by_num_towers)
+        grad_loss=grad_loss,
+        stop_gradients=stop_gradients,
+        scale_loss_by_num_replicas=scale_loss_by_num_replicas)
 
     vars_with_grad = [v for g, v in grads_and_vars if g is not None]
     if not vars_with_grad:
@@ -704,15 +722,17 @@ class OptimizerV2(optimizer_v1.Optimizer):
           " that do not support gradients, between variables %s and loss %s." %
           ([str(v) for _, v in grads_and_vars], loss))
 
-    return self.apply_gradients(grads_and_vars, global_step=global_step,
-                                name=name)
+    return self.apply_gradients(
+        grads_and_vars, global_step=global_step, name=name)
 
-  def compute_gradients(self, loss, var_list=None,
+  def compute_gradients(self,
+                        loss,
+                        var_list=None,
                         gate_gradients=GATE_OP,
                         aggregation_method=None,
-                        colocate_gradients_with_ops=False,
-                        grad_loss=None, stop_gradients=None,
-                        scale_loss_by_num_towers=None):
+                        grad_loss=None,
+                        stop_gradients=None,
+                        scale_loss_by_num_replicas=None):
     """Compute gradients of `loss` for the variables in `var_list`.
 
     This is the first part of `minimize()`.  It returns a list
@@ -722,24 +742,22 @@ class OptimizerV2(optimizer_v1.Optimizer):
     given variable.
 
     Args:
-      loss: A Tensor containing the value to minimize or a callable taking
-        no arguments which returns the value to minimize. When eager execution
-        is enabled it must be a callable.
+      loss: A Tensor containing the value to minimize or a callable taking no
+        arguments which returns the value to minimize. When eager execution is
+        enabled it must be a callable.
       var_list: Optional list or tuple of `tf.Variable` to update to minimize
-        `loss`.  Defaults to the list of variables collected in the graph
-        under the key `GraphKeys.TRAINABLE_VARIABLES`.
+        `loss`.  Defaults to the list of variables collected in the graph under
+        the key `GraphKeys.TRAINABLE_VARIABLES`.
       gate_gradients: How to gate the computation of gradients.  Can be
         `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
       aggregation_method: Specifies the method used to combine gradient terms.
         Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with
-        the corresponding op.
       grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
         through.
-      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
-        down by the number of towers. By default, auto-detects whether this
-        is needed.
+      scale_loss_by_num_replicas: Optional boolean. If true, scale the loss down
+        by the number of replicas. By default, auto-detects whether this is
+        needed.
 
     Returns:
       A list of (gradient, variable) pairs. Variable is always present, but
@@ -752,8 +770,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
         not callable.
 
     @compatibility(eager)
-    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
-    and `colocate_gradients_with_ops` are ignored.
+    When eager execution is enabled, `gate_gradients`, and `aggregation_method`
+    are ignored.
     @end_compatibility
     """
     # TODO(josh11b): Test that we handle weight decay in a reasonable way.
@@ -763,52 +781,36 @@ class OptimizerV2(optimizer_v1.Optimizer):
           tape.watch(var_list)
         loss_value = loss()
 
-        # Scale loss for number of towers (callable-loss case). In this case,
+        # Scale loss for number of replicas (callable-loss case). In this case,
         # we have to be careful to call distribute_lib.get_loss_reduction()
         # *after* loss() is evaluated, so we know what loss reduction it uses.
-        if scale_loss_by_num_towers is None:
-          scale_loss_by_num_towers = (
-              distribute_lib.get_loss_reduction() ==
-              variable_scope.VariableAggregation.MEAN)
-        if scale_loss_by_num_towers:
-          num_towers = distribution_strategy_context.get_distribution_strategy(
-          ).num_towers
-          if num_towers > 1:
-            loss_value *= 1. / num_towers
+        loss_value = self._scale_loss(loss_value, scale_loss_by_num_replicas)
 
       if var_list is None:
         var_list = tape.watched_variables()
       grads = tape.gradient(loss_value, var_list, grad_loss)
       return list(zip(grads, var_list))
     if context.executing_eagerly():
-      raise RuntimeError(
-          "`loss` passed to Optimizer.compute_gradients should "
-          "be a function when eager execution is enabled.")
-
-    # Scale loss for number of towers (non-callable-loss case).
-    if scale_loss_by_num_towers is None:
-      scale_loss_by_num_towers = (
-          distribute_lib.get_loss_reduction() ==
-          variable_scope.VariableAggregation.MEAN)
-    if scale_loss_by_num_towers:
-      num_towers = distribution_strategy_context.get_distribution_strategy(
-      ).num_towers
-      if num_towers > 1:
-        loss *= 1. / num_towers
-
-    if gate_gradients not in [optimizer_v1.Optimizer.GATE_NONE,
-                              optimizer_v1.Optimizer.GATE_OP,
-                              optimizer_v1.Optimizer.GATE_GRAPH]:
-      raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
-                       "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
-                       gate_gradients)
+      raise RuntimeError("`loss` passed to Optimizer.compute_gradients should "
+                         "be a function when eager execution is enabled.")
+
+    # Scale loss for number of replicas (non-callable-loss case).
+    loss = self._scale_loss(loss, scale_loss_by_num_replicas)
+
+    if gate_gradients not in [
+        optimizer_v1.Optimizer.GATE_NONE, optimizer_v1.Optimizer.GATE_OP,
+        optimizer_v1.Optimizer.GATE_GRAPH
+    ]:
+      raise ValueError(
+          "gate_gradients must be one of: Optimizer.GATE_NONE, "
+          "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" % gate_gradients)
     self._assert_valid_dtypes([loss])
     if grad_loss is not None:
       self._assert_valid_dtypes([grad_loss])
     if var_list is None:
       var_list = (
-          variables.trainable_variables() +
-          ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+          variables.trainable_variables() + ops.get_collection(
+              ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
     else:
       var_list = nest.flatten(var_list)
     # pylint: disable=protected-access
@@ -819,19 +821,34 @@ class OptimizerV2(optimizer_v1.Optimizer):
       raise ValueError("No variables to optimize.")
     var_refs = [p.target() for p in processors]
     grads = gradients.gradients(
-        loss, var_refs, grad_ys=grad_loss,
+        loss,
+        var_refs,
+        grad_ys=grad_loss,
         gate_gradients=(gate_gradients == optimizer_v1.Optimizer.GATE_OP),
         aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
         stop_gradients=stop_gradients)
     if gate_gradients == optimizer_v1.Optimizer.GATE_GRAPH:
       grads = control_flow_ops.tuple(grads)
     grads_and_vars = list(zip(grads, var_list))
-    self._assert_valid_dtypes(
-        [v for g, v in grads_and_vars
-         if g is not None and v.dtype != dtypes.resource])
+    self._assert_valid_dtypes([
+        v for g, v in grads_and_vars
+        if g is not None and v.dtype != dtypes.resource
+    ])
     return grads_and_vars
 
+  @staticmethod
+  def _scale_loss(loss_value, scale_loss_by_num_replicas):
+    """Scale loss for the number of replicas."""
+    if scale_loss_by_num_replicas is None:
+      scale_loss_by_num_replicas = (
+          distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN)
+    if scale_loss_by_num_replicas:
+      num_replicas = \
+        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      if num_replicas > 1:
+        loss_value *= 1. / num_replicas
+    return loss_value
+
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     """Apply gradients to variables.
 
@@ -841,10 +858,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Args:
       grads_and_vars: List of (gradient, variable) pairs as returned by
         `compute_gradients()`.
-      global_step: Optional `Variable` to increment by one after the
-        variables have been updated.
-      name: Optional name for the returned operation.  Default to the
-        name passed to the `Optimizer` constructor.
+      global_step: Optional `Variable` to increment by one after the variables
+        have been updated.
+      name: Optional name for the returned operation.  Default to the name
+        passed to the `Optimizer` constructor.
 
     Returns:
       An `Operation` that applies the specified gradients. If `global_step`
@@ -866,8 +883,9 @@ class OptimizerV2(optimizer_v1.Optimizer):
     if not filtered:
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, v in grads_and_vars],))
-    return distribution_strategy_context.get_tower_context().merge_call(
-        self._distributed_apply, filtered, global_step=global_step, name=name)
+    return distribute_ctx.get_replica_context().merge_call(
+        self._distributed_apply, args=(filtered,),
+        kwargs={"global_step": global_step, "name": name})
 
   def _get_or_create_state(self, var_list=None):
     """Either looks up or creates `_OptimizerV2State`.
@@ -902,8 +920,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   def _distributed_apply(self, distribution, grads_and_vars, global_step, name):
     """`apply_gradients` for use with a `DistributionStrategy`."""
-    reduced_grads = distribution.batch_reduce(
-        variable_scope.VariableAggregation.SUM, grads_and_vars)
+    reduced_grads = distribution.extended.batch_reduce_to(
+        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
     var_list = [v for _, v in grads_and_vars]
     grads_and_vars = zip(reduced_grads, var_list)
 
@@ -919,7 +937,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
     with ops.name_scope(name, self._name) as name:
       per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list)
       # Include the current value of any dynamic hyper parameters in `state`.
-      non_slot_devices = distribution.non_slot_devices(var_list)
+      non_slot_devices = distribution.extended.non_slot_devices(var_list)
       state = per_graph_state._copy_with_dynamic_hyper(  # pylint: disable=protected-access
           self._hyper, distribution, non_slot_devices)
 
@@ -941,9 +959,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
         try:
           g = ops.convert_to_tensor_or_indexed_slices(g)
         except TypeError:
-          raise TypeError(
-              "Gradient must be convertible to a Tensor"
-              " or IndexedSlices, or None: %s" % g)
+          raise TypeError("Gradient must be convertible to a Tensor"
+                          " or IndexedSlices, or None: %s" % g)
         if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
           raise TypeError(
               "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
@@ -957,14 +974,16 @@ class OptimizerV2(optimizer_v1.Optimizer):
         # `update_op`.
         # TODO(josh11b): Make different state objects for each device to
         # avoid needing to set the device_policy.
-        with ops.name_scope("update_" + scope_name), \
-            context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+        device_policy = context.context().device_policy(
+            context.DEVICE_PLACEMENT_SILENT)
+        with ops.name_scope("update_" + scope_name), device_policy:
           return processor.update_op(self, g, state)
 
       # Use the processors to update the variables.
       update_ops = []
       for grad, var in grads_and_vars:
-        update_ops.extend(distribution.update(var, update, grad, grouped=False))
+        update_ops.extend(distribution.extended.update(
+            var, update, args=(grad,), group=False))
 
       # Give the child class a chance to do something after applying
       # gradients
@@ -976,12 +995,12 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
       update_ops = control_flow_ops.group(update_ops)
       with ops.control_dependencies([update_ops]):
-        finish_updates = distribution.update_non_slot(
-            non_slot_devices, finish, grouped=False)
-      # We said grouped=False, which means finish_updates is always a list.
-      # It will be [None] when finish() returns None.
-      if finish_updates == [None]:
-        finish_updates = [update_ops]
+        finish_updates = distribution.extended.update_non_slot(
+            non_slot_devices, finish, group=False)
+      # We said group=False, which means finish_updates is always a tuple.
+      # It will be (None,) when finish() returns None.
+      if finish_updates == (None,):
+        finish_updates = (update_ops,)
 
       # Update `global_step` (if any).
       if global_step is None:
@@ -992,8 +1011,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
           def update_global_step(global_step, name):
             return global_step.assign_add(1, read_value=False, name=name)
 
-          apply_updates = distribution.update(
-              global_step, update_global_step, name)
+          apply_updates = distribution.extended.update(
+              global_step, update_global_step, args=(name,))
 
       # Add the training op to the TRAIN_OP graph collection in graph mode.
       if not eager_execution:
@@ -1057,13 +1076,11 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     Args:
       var_list: A list of `Variable` objects.
-      state: An object with these methods:
-        `create_slot(var, val, slot_name, optional_op_name)`,
-        `create_slot_with_initializer(`
-            `var, initializer, shape, dtype, slot_name, optional_op_name)`,
-        `zeros_slot(var, slot_name, optional_op_name)`,
-        `create_non_slot_variable(initial_value, name, colocate_with)`,
-        `get_hyper(name)`
+      state: An object with these methods: `create_slot(var, val, slot_name,
+        optional_op_name)`, `create_slot_with_initializer(` `var, initializer,
+        shape, dtype, slot_name, optional_op_name)`, `zeros_slot(var, slot_name,
+        optional_op_name)`, `create_non_slot_variable(initial_value, name,
+        colocate_with)`, `get_hyper(name)`
     """
     # No slots needed by default
     pass
@@ -1101,8 +1118,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     Args:
       grad: a `Tensor` representing the gradient.
-      handle: a `Tensor` of dtype `resource` which points to the variable
-       to be updated.
+      handle: a `Tensor` of dtype `resource` which points to the variable to be
+        updated.
       state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
         and `get_hyper(name)` methods.
 
@@ -1111,8 +1128,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
     """
     raise NotImplementedError()
 
-  def _resource_apply_sparse_duplicate_indices(
-      self, grad, handle, indices, state):
+  def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices,
+                                               state):
     """Add ops to apply sparse gradients to `handle`, with repeated indices.
 
     Optimizers which override this method must deal with repeated indices. See
@@ -1125,10 +1142,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     Args:
       grad: a `Tensor` representing the gradient for the affected indices.
-      handle: a `Tensor` of dtype `resource` which points to the variable
-       to be updated.
-      indices: a `Tensor` of integral type representing the indices for
-       which the gradient is nonzero. Indices may be repeated.
+      handle: a `Tensor` of dtype `resource` which points to the variable to be
+        updated.
+      indices: a `Tensor` of integral type representing the indices for which
+        the gradient is nonzero. Indices may be repeated.
       state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
         and `get_hyper(name)` methods.
 
@@ -1139,8 +1156,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
     summed_grad, unique_indices = optimizer_v1._deduplicate_indexed_slices(
         values=grad, indices=indices)
     # pylint: enable=protected-access
-    return self._resource_apply_sparse(
-        summed_grad, handle, unique_indices, state)
+    return self._resource_apply_sparse(summed_grad, handle, unique_indices,
+                                       state)
 
   def _resource_apply_sparse(self, grad, handle, indices, state):
     """Add ops to apply sparse gradients to the variable `handle`.
@@ -1152,10 +1169,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     Args:
       grad: a `Tensor` representing the gradient for the affected indices.
-      handle: a `Tensor` of dtype `resource` which points to the variable
-       to be updated.
-      indices: a `Tensor` of integral type representing the indices for
-       which the gradient is nonzero. Indices are unique.
+      handle: a `Tensor` of dtype `resource` which points to the variable to be
+        updated.
+      indices: a `Tensor` of integral type representing the indices for which
+        the gradient is nonzero. Indices are unique.
       state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
         and `get_hyper(name)` methods.
 
@@ -1301,8 +1318,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
     state = self._get_or_create_state()
     return state._deferred_dependencies  # pylint: disable=protected-access
 
-  def _create_or_restore_slot_variable(
-      self, slot_variable_position, slot_name, variable):
+  def _create_or_restore_slot_variable(self, slot_variable_position, slot_name,
+                                       variable):
     """Checkpointable: Restore a slot variable's value, possibly creating it.
 
     Called when a variable which has an associated slot variable is created or
@@ -1325,8 +1342,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
   # Unsupported parent methods
   # --------------
   def _slot_dict(self, slot_name):
-    raise NotImplementedError(
-        "_slot_dict() method unsupported in OptimizerV2")
+    raise NotImplementedError("_slot_dict() method unsupported in OptimizerV2")
 
   def _get_or_make_slot(self, var, val, slot_name, op_name):
     raise NotImplementedError(
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop.py b/tensorflow/contrib/optimizer_v2/rmsprop.py
index 3de53405ec16d93f20273ec60f8fc6cfc96e7e39..12175cedd361627581098580099140445e84f33f 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop.py
@@ -21,7 +21,7 @@ A detailed description of rmsprop.
 - maintain a moving (discounted) average of the square of gradients
 - divide gradient by the root of this average
 
-mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mean_square = rho * mean_square{t-1} + (1-rho) * gradient ** 2
 mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square)
 delta = - mom
 
@@ -30,8 +30,8 @@ This implementation of RMSProp uses plain momentum, not Nesterov momentum.
 The centered version additionally maintains a moving (discounted) average of the
 gradients, and uses that average to estimate the variance:
 
-mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
-mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mean_grad = rho * mean_square{t-1} + (1-rho) * gradient
+mean_square = rho * mean_square{t-1} + (1-rho) * gradient ** 2
 mom = momentum * mom{t-1} + learning_rate * g_t /
     sqrt(mean_square - mean_grad**2)
 delta = - mom
@@ -51,7 +51,8 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
   """Optimizer that implements the RMSProp algorithm.
 
   See the
-  [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  [paper]
+  (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
   """
 
   def __init__(self,
@@ -106,8 +107,8 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
 
   def _create_vars(self, var_list, state):
     for v in var_list:
-      init_rms = state.get_hyper(
-          "epsilon", v.dtype.base_dtype) * array_ops.ones_like(v)
+      init_rms = state.get_hyper("epsilon",
+                                 v.dtype.base_dtype) * array_ops.ones_like(v)
       state.create_slot_with_initializer(v, init_rms, v.get_shape(),
                                          v.dtype.base_dtype, "rms")
       if self._centered:
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
index 44301ffe9e5cc9a4ead6462887ec669811f2cc38..202c1e9afc0623a5837aa82480f1b406834007ee 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -89,7 +89,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
   def testDense(self, dtype, param_value):
     (learning_rate, decay, momentum, epsilon, centered, use_resource) = tuple(
         param_value)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # Initialize variables for numpy implementation.
       var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
       grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
@@ -157,8 +157,11 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
         self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
         self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
         self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-        self.assertAllCloseAccordingToType(var0_np, var0.eval())
-        self.assertAllCloseAccordingToType(var1_np, var1.eval())
+        # TODO(b/117393988): Reduce tolerances for float16.
+        self.assertAllCloseAccordingToType(
+            var0_np, var0.eval(), half_rtol=3e-3, half_atol=3e-3)
+        self.assertAllCloseAccordingToType(
+            var1_np, var1.eval(), half_rtol=3e-3, half_atol=3e-3)
 
   @parameterized.parameters([dtypes.float32, dtypes.float64])
   def testMinimizeSparseResourceVariable(self, dtype):
@@ -210,7 +213,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
   def testSparse(self, dtype, param_value):
     (learning_rate, decay, momentum, epsilon, centered, _) = tuple(
         param_value)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # Initialize variables for numpy implementation.
       var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
       grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
@@ -284,7 +287,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.parameters(_DATA_TYPES)
   def testWithoutMomentum(self, dtype):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       var0 = variables.Variable([1.0, 2.0], dtype=dtype)
       var1 = variables.Variable([3.0, 4.0], dtype=dtype)
       grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -350,7 +353,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.parameters(_DATA_TYPES)
   def testWithMomentum(self, dtype):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       var0 = variables.Variable([1.0, 2.0], dtype=dtype)
       var1 = variables.Variable([3.0, 4.0], dtype=dtype)
       grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
index f2171efc959362c1e4392fefbd5842f0883571d7..c980a9342e45ee8818f076e21f260abcdbd5fcfe 100644
--- a/tensorflow/contrib/periodic_resample/BUILD
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -9,10 +9,10 @@ load(
     "tf_cc_test",
     "tf_gen_op_libs",
     "tf_custom_op_library",
-    "tf_custom_op_py_library",
     "tf_gen_op_wrapper_py",
 )
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 cc_library(
     name = "all_ops",
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index d50b52b8ff1ce8188ab52c6968d716378efd9daa..53a3bc63e1d770b451846c45370fdee9ffa72d70 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -42,6 +42,7 @@ py_library(
     name = "saved_model_predictor",
     srcs = ["saved_model_predictor.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//learning/brain/contrib/learn/tpu:__subpackages__"],
     deps = [
         ":base_predictor",
         "//tensorflow/contrib/saved_model:saved_model_py",
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 94a2d9672dba74d19cb0801aa8680e921c238c97..b35c4fde1a2c704880e023a0c3ac1e0766493514 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -205,6 +205,10 @@ py_test(
     size = "large",
     srcs = ["python/quantize_parameterized_test.py"],
     srcs_version = "PY2AND3",
+    # TODO(b/118839526): Re-enable msan test.
+    tags = [
+        "nomsan",
+    ],
     deps = [
         ":fold_batch_norms",
         ":quantize",
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 0ab19c91bb036ad24beee3d99624e788d086a9a5..9085d9fa719520ac84ef6f8e07d7fa335bef5605 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -28,7 +28,7 @@ Since it's difficult to add these fake quantization operations to all the
 required locations in the model, there's a function available that rewrites the
 training graph. To create a fake quantized training graph:
 
-```
+```python
 # Build forward pass of model.
 loss = tf.losses.get_total_loss()
 
@@ -51,7 +51,7 @@ The rewritten *eval graph* is non-trivially different from the *training graph*
 since the quantization ops affect the batch normalization step. Because of this,
 we've added a separate rewrite for the *eval graph*:
 
-```
+```python
 # Build eval model
 logits = tf.nn.softmax_cross_entropy_with_logits_v2(...)
 
@@ -145,7 +145,7 @@ Mobilenet-v2, and Inception-v3) using this tool:
 </figure>
 
 Our pre-trained models are available in the
-<a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/models.md#image-classification-quantized-models" class="external">TensorFlow Lite model repository</a>. The code used to generate
+<a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/models.md#image-classification-quantized-models" class="external">TensorFlow Lite model repository</a>. The code used to generate
 these models <a href="https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1_train.py" class="external">is available</a>.
 
 
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 7575b1b6cd6b0a9315c1b1e5e50293dc54a953b6..e0c6da00d86fe4c5f881bcab7b444182da092b8f 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -145,7 +145,7 @@ def _FindFusedBatchNorms(graph):
   Args:
     graph: Graph to inspect.
 
-  Yields:
+  Returns:
     _FusedBatchNormMatches.
   """
   input_pattern = graph_matcher.OpTypePattern('*')
@@ -169,8 +169,15 @@ def _FindFusedBatchNorms(graph):
           graph_matcher.OpTypePattern('*'),
           graph_matcher.OpTypePattern('*')
       ])
+  # Identity between conv/matmul and bn
+  layer_pattern_with_identity = graph_matcher.OpTypePattern(
+      'Identity',
+      inputs=[
+          graph_matcher.OneofPattern([batch_to_space_pattern, layer_pattern])
+      ])
   layer_output_pattern = graph_matcher.OneofPattern(
-      [layer_pattern, batch_to_space_pattern])
+      [layer_pattern_with_identity, layer_pattern, batch_to_space_pattern])
+
   # MatMul has a Reshape between it and FusedBatchNorm.
   matmul_reshape_pattern = graph_matcher.OpTypePattern(
       'Reshape',
@@ -188,6 +195,11 @@ def _FindFusedBatchNorms(graph):
       'Reshape', inputs=[batch_norm_pattern,
                          graph_matcher.OpTypePattern('*')])
 
+  batch_norm_identity_pattern = graph_matcher.OpTypePattern(
+      'Identity', inputs=[batch_norm_pattern, matmul_bn_output_reshape_pattern])
+
+  bn_identity_matcher = graph_matcher.GraphMatcher(batch_norm_identity_pattern)
+
   bn_matcher = graph_matcher.GraphMatcher(
       graph_matcher.OneofPattern(
           [matmul_bn_output_reshape_pattern, batch_norm_pattern]))
@@ -200,7 +212,17 @@ def _FindFusedBatchNorms(graph):
   moving_avg_mul_matcher = graph_matcher.GraphMatcher(
       moving_average_mul_pattern)
 
-  for match_result in bn_matcher.match_graph(graph):
+  def _GetLayerMatch(match_result):
+    """Populates a layer match object containing ops/tensors for folding BNs.
+
+    Args:
+      match_result: Matched result from graph matcher
+
+    Returns:
+      layer_op: Matching conv/fc op prior to batch norm
+      BatchNormMatch: _BatchNormMatch containing all required batch norm
+      parameters.
+    """
     moving_mean_tensor = None
     moving_variance_tensor = None
     bn_decay_mean_tensor = None
@@ -208,7 +230,11 @@ def _FindFusedBatchNorms(graph):
     batch_to_space_op = None
     layer_op = match_result.get_op(layer_pattern)
     layer_tensor = match_result.get_tensor(layer_pattern)
+    bn_id_op = match_result.get_op(batch_norm_identity_pattern)
     bn_op = match_result.get_op(batch_norm_pattern)
+    if bn_id_op is None:
+      bn_id_op = bn_op
+
     batch_epsilon = bn_op.get_attr('epsilon')
 
     # In the MatMul case, the output of batch norm is reshaped back into a
@@ -219,13 +245,13 @@ def _FindFusedBatchNorms(graph):
       # If the matcher didn't match matmul_bn_output_reshape, there will be
       # another match for this 'MatMul' later, so we can skip this one.
       if output_reshape_op is None:
-        continue
+        return None, None
       output_tensor = output_reshape_op.outputs[0]
 
     # Ensure that the output tensor has consumers, otherwise this is a dangling
     # node and not a match.
     if not output_tensor.consumers():
-      continue
+      return None, None
 
     batch_to_space_op = match_result.get_op(batch_to_space_pattern)
     input_tensor = match_result.get_tensor(input_pattern)
@@ -277,7 +303,7 @@ def _FindFusedBatchNorms(graph):
       mean_tensor = match_result.get_tensor(mean_pattern)
       variance_tensor = match_result.get_tensor(variance_pattern)
 
-    yield _BatchNormMatch(
+    return layer_op, _BatchNormMatch(
         layer_op=layer_op,
         bn_op=bn_op,
         output_tensor=output_tensor,
@@ -294,6 +320,26 @@ def _FindFusedBatchNorms(graph):
         batch_epsilon=batch_epsilon,
         batch_to_space_op=batch_to_space_op)
 
+  layer_matches = []
+  # We use matched_layer_set to ensure that layers aren't matched multiple
+  # times.
+  matched_layer_set = set()
+  for match_result in bn_identity_matcher.match_graph(graph):
+    layer_op, layer_match = _GetLayerMatch(match_result)
+    if layer_op is not None:
+      if layer_op not in matched_layer_set:
+        matched_layer_set.add(layer_op)
+        layer_matches.append(layer_match)
+
+  for match_result in bn_matcher.match_graph(graph):
+    layer_op, layer_match = _GetLayerMatch(match_result)
+    if layer_op is not None:
+      if layer_op not in matched_layer_set:
+        matched_layer_set.add(layer_op)
+        layer_matches.append(layer_match)
+
+  return layer_matches
+
 
 def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay):
   """Computes batch norm correction params.
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
index 3f8063cc022726cb745d42aba3c834c71e876e70..77b3f62e9d62085cad87599c53f8a914ccba3f43 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms_test.py
@@ -48,26 +48,32 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
   def _RunTestOverParameters(self, test_fn):
     parameters_list = [
         # (relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm,
-        # freeze_batch_norm_delay)
-        (nn_ops.relu6, 'Relu6', False, False, False, 100),
-        (nn_ops.relu, 'Relu', False, False, False, None),
-        (nn_ops.relu6, 'Relu6', True, False, False, 100),
-        (nn_ops.relu, 'Relu', True, False, False, None),
-        (nn_ops.relu6, 'Relu6', False, True, False, 100),
-        (nn_ops.relu, 'Relu', False, True, False, None),
-        (nn_ops.relu6, 'Relu6', True, True, False, 100),
-        (nn_ops.relu, 'Relu', True, True, False, None),
+        # freeze_batch_norm_delay, insert identity node)
+        (nn_ops.relu6, 'Relu6', False, False, False, 100, False),
+        (nn_ops.relu, 'Relu', False, False, False, None, False),
+        (nn_ops.relu6, 'Relu6', True, False, False, 100, False),
+        (nn_ops.relu, 'Relu', True, False, False, None, False),
+        (nn_ops.relu6, 'Relu6', False, True, False, 100, False),
+        (nn_ops.relu, 'Relu', False, True, False, None, False),
+        (nn_ops.relu6, 'Relu6', True, True, False, 100, False),
+        (nn_ops.relu, 'Relu', True, True, False, None, False),
         # Fused batch norm always has scaling enabled.
-        (nn_ops.relu6, 'Relu6', False, True, True, None),
-        (nn_ops.relu, 'Relu', False, True, True, 100),
-        (nn_ops.relu6, 'Relu6', True, True, True, None),
-        (nn_ops.relu, 'Relu', True, True, True, 100),
+        (nn_ops.relu6, 'Relu6', False, True, True, None, False),
+        (nn_ops.relu, 'Relu', False, True, True, 100, False),
+        (nn_ops.relu6, 'Relu6', True, True, True, None, False),
+        (nn_ops.relu, 'Relu', True, True, True, 100, False),
+        (nn_ops.relu6, 'Relu6', False, True, True, None, True),
+        (nn_ops.relu, 'Relu', False, True, True, 100, True),
+        (nn_ops.relu6, 'Relu6', True, True, True, None, True),
+        (nn_ops.relu, 'Relu', True, True, True, 100, True),
     ]
     for params in parameters_list:
-      test_fn(params[0], params[1], params[2], params[3], params[4], params[5])
+      test_fn(params[0], params[1], params[2], params[3], params[4], params[5],
+              params[6])
 
   def _TestFoldConv2d(self, relu, relu_op_name, with_bypass, has_scaling,
-                      fused_batch_norm, freeze_batch_norm_delay):
+                      fused_batch_norm, freeze_batch_norm_delay,
+                      insert_identity_node):
     """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*.
 
     Args:
@@ -79,6 +85,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       fused_batch_norm: Bool, when true the batch norm is fused.
       freeze_batch_norm_delay: None or the number of steps after which training
       switches to using frozen mean and variance
+      insert_identity_node: Bool, insert identity node between conv and batch
+      norm
     """
     g = ops.Graph()
     with g.as_default():
@@ -87,18 +95,42 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       out_depth = 3 if with_bypass else 32
       stride = 1 if with_bypass else 2
       activation_fn = None if with_bypass else relu
-      scope = 'test/test2' if with_bypass else 'test'
-      node = conv2d(
-          inputs,
-          out_depth, [5, 5],
-          stride=stride,
-          padding='SAME',
-          weights_initializer=self._WeightInit(0.09),
-          activation_fn=activation_fn,
-          normalizer_fn=batch_norm,
-          normalizer_params=self._BatchNormParams(
-              scale=has_scaling, fused=fused_batch_norm),
-          scope=scope)
+      name = 'test/test2' if with_bypass else 'test'
+      if insert_identity_node:
+        with g.name_scope(name):
+          node = conv2d(
+              inputs,
+              out_depth, [5, 5],
+              stride=stride,
+              padding='SAME',
+              weights_initializer=self._WeightInit(0.09),
+              activation_fn=None,
+              normalizer_fn=None,
+              biases_initializer=None)
+          conv_out = array_ops.identity(node, name='conv_out')
+
+          node = batch_norm(
+              conv_out,
+              center=True,
+              scale=has_scaling,
+              decay=1.0 - 0.003,
+              fused=fused_batch_norm)
+          if activation_fn is not None:
+            node = activation_fn(node)
+          conv_name = name + '/Conv'
+      else:
+        node = conv2d(
+            inputs,
+            out_depth, [5, 5],
+            stride=stride,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=activation_fn,
+            normalizer_fn=batch_norm,
+            normalizer_params=self._BatchNormParams(
+                scale=has_scaling, fused=fused_batch_norm),
+            scope=name)
+        conv_name = name
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
@@ -106,31 +138,30 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       fold_batch_norms.FoldBatchNorms(
           g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
-    folded_mul = g.get_operation_by_name(scope + '/mul_fold')
+    folded_mul = g.get_operation_by_name(conv_name + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/correction_mult',
-        self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
+        conv_name + '/correction_mult',
+        self._BatchNormMultiplierName(conv_name, has_scaling, fused_batch_norm)
     ])
-    self._AssertOutputGoesToOps(folded_mul, g, [scope + '/Conv2D_Fold'])
+    self._AssertOutputGoesToOps(folded_mul, g, [conv_name + '/Conv2D_Fold'])
 
-    folded_conv = g.get_operation_by_name(scope + '/Conv2D_Fold')
+    folded_conv = g.get_operation_by_name(conv_name + '/Conv2D_Fold')
     self.assertEqual(folded_conv.type, 'Conv2D')
     self._AssertInputOpsAre(folded_conv,
-                            [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
+                            [conv_name + '/mul_fold', inputs.op.name])
+    self._AssertOutputGoesToOps(folded_conv, g, [conv_name + '/post_conv_mul'])
 
-    folded_add = g.get_operation_by_name(scope + '/add_fold')
+    folded_add = g.get_operation_by_name(conv_name + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/correction_add',
-        self._BathNormBiasName(scope, fused_batch_norm)
+        conv_name + '/correction_add',
+        self._BathNormBiasName(conv_name, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
     if freeze_batch_norm_delay is not None:
-      self._AssertMovingAveragesAreFrozen(g, scope)
-
+      self._AssertMovingAveragesAreFrozen(g, name)
 
     for op in g.get_operations():
       self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
@@ -143,7 +174,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
                               relu_op_name='Relu',
                               has_scaling=True,
                               fused_batch_norm=False,
-                              freeze_batch_norm_delay=None):
+                              freeze_batch_norm_delay=None,
+                              insert_identity_node=False):
     """Tests folding cases for a network with multiple layers.
 
     Args:
@@ -153,6 +185,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       fused_batch_norm: Bool, when true the batch norm is fused.
       freeze_batch_norm_delay: None or the number of steps after which training
       switches to using frozen mean and variance
+      insert_identity_node: Bool, insert identity node between conv and batch
+      norm
     """
     g = ops.Graph()
     with g.as_default():
@@ -225,9 +259,14 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     for op in g.get_operations():
       self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
 
-  def _TestFoldConv2dUnknownShape(self, relu, relu_op_name, with_bypass,
-                                  has_scaling, fused_batch_norm,
-                                  freeze_batch_norm_delay):
+  def _TestFoldConv2dUnknownShape(self,
+                                  relu,
+                                  relu_op_name,
+                                  with_bypass,
+                                  has_scaling,
+                                  fused_batch_norm,
+                                  freeze_batch_norm_delay,
+                                  insert_identity_node=False):
     """Tests folding cases: inputs -> Conv2d with batch norm -> Relu*.
 
     Tests that folding works even with an input shape where some dimensions are
@@ -242,6 +281,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       fused_batch_norm: Bool, when true the batch norm is fused.
       freeze_batch_norm_delay: None or the number of steps after which training
       switches to using frozen mean and variance
+      insert_identity_node: Bool, insert identity node between conv and batch
+      norm
     """
     g = ops.Graph()
     with g.as_default():
@@ -298,9 +339,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
   def testFoldConv2dUnknownShape(self):
     self._RunTestOverParameters(self._TestFoldConv2dUnknownShape)
 
-  def _TestFoldFullyConnectedLayer(self, relu, relu_op_name, with_bypass,
-                                   has_scaling, fused_batch_norm,
-                                   freeze_batch_norm_delay):
+  def _TestFoldFullyConnectedLayer(
+      self, relu, relu_op_name, with_bypass, has_scaling, fused_batch_norm,
+      freeze_batch_norm_delay, insert_identity_node):
     """Tests folding cases: inputs -> FC with batch norm -> Relu*.
 
     Args:
@@ -312,6 +353,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       fused_batch_norm: Bool, when true the batch norm is fused.
       freeze_batch_norm_delay: None or the number of steps after which training
       switches to using frozen mean and variance
+      insert_identity_node: Bool, insert identity node between conv and batch
+      norm
     """
     g = ops.Graph()
     with g.as_default():
@@ -319,16 +362,40 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       inputs = array_ops.zeros((batch_size, depth))
       out_depth = 256 if with_bypass else 128
       activation_fn = None if with_bypass else relu
-      scope = 'test/test2' if with_bypass else 'test'
-      node = fully_connected(
-          inputs,
-          out_depth,
-          weights_initializer=self._WeightInit(0.03),
-          activation_fn=activation_fn,
-          normalizer_fn=batch_norm,
-          normalizer_params=self._BatchNormParams(
-              scale=has_scaling, fused=fused_batch_norm),
-          scope=scope)
+      name = 'test/test2' if with_bypass else 'test'
+      insert_identity_node = fused_batch_norm
+      if insert_identity_node:
+        with g.name_scope(name):
+          node = fully_connected(
+              inputs,
+              out_depth,
+              weights_initializer=self._WeightInit(0.03),
+              activation_fn=None,
+              normalizer_fn=None,
+              biases_initializer=None)
+          node = array_ops.identity(node, name='fc_out')
+
+          node = batch_norm(
+              node,
+              center=True,
+              scale=has_scaling,
+              decay=1.0 - 0.003,
+              fused=fused_batch_norm)
+          if activation_fn is not None:
+            node = activation_fn(node)
+          fc_name = name + '/fully_connected'
+      else:
+
+        node = fully_connected(
+            inputs,
+            out_depth,
+            weights_initializer=self._WeightInit(0.03),
+            activation_fn=activation_fn,
+            normalizer_fn=batch_norm,
+            normalizer_params=self._BatchNormParams(
+                scale=has_scaling, fused=fused_batch_norm),
+            scope=name)
+        fc_name = name
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
@@ -336,30 +403,30 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       fold_batch_norms.FoldBatchNorms(
           g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
-    folded_mul = g.get_operation_by_name(scope + '/mul_fold')
+    folded_mul = g.get_operation_by_name(fc_name + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     self._AssertInputOpsAre(folded_mul, [
-        scope + '/correction_mult',
-        self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm)
+        fc_name + '/correction_mult',
+        self._BatchNormMultiplierName(fc_name, has_scaling, fused_batch_norm)
     ])
-    self._AssertOutputGoesToOps(folded_mul, g, [scope + '/MatMul_Fold'])
+    self._AssertOutputGoesToOps(folded_mul, g, [fc_name + '/MatMul_Fold'])
 
-    folded_conv = g.get_operation_by_name(scope + '/MatMul_Fold')
+    folded_conv = g.get_operation_by_name(fc_name + '/MatMul_Fold')
     self.assertEqual(folded_conv.type, 'MatMul')
     self._AssertInputOpsAre(folded_conv,
-                            [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
+                            [fc_name + '/mul_fold', inputs.op.name])
+    self._AssertOutputGoesToOps(folded_conv, g, [fc_name + '/post_conv_mul'])
 
-    folded_add = g.get_operation_by_name(scope + '/add_fold')
+    folded_add = g.get_operation_by_name(fc_name + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/correction_add',
-        self._BathNormBiasName(scope, fused_batch_norm)
+        fc_name + '/correction_add',
+        self._BathNormBiasName(fc_name, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
     if freeze_batch_norm_delay is not None:
-      self._AssertMovingAveragesAreFrozen(g, scope)
+      self._AssertMovingAveragesAreFrozen(g, name)
 
     for op in g.get_operations():
       self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
@@ -369,7 +436,7 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
 
   def _TestFoldDepthwiseConv2d(self, relu, relu_op_name, with_bypass,
                                has_scaling, fused_batch_norm,
-                               freeze_batch_norm_delay):
+                               freeze_batch_norm_delay, insert_identity_node):
     """Tests folding: inputs -> DepthwiseConv2d with batch norm -> Relu*.
 
     Args:
@@ -380,7 +447,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
       freeze_batch_norm_delay: None or the number of steps after which training
-      switches to using frozen mean and variance
+      insert_identity_node: Bool, insert identity node between conv and batch
+        norm switches to using frozen mean and variance
     """
     g = ops.Graph()
     with g.as_default():
@@ -388,19 +456,44 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       inputs = array_ops.zeros((batch_size, height, width, 3))
       stride = 1 if with_bypass else 2
       activation_fn = None if with_bypass else relu
-      scope = 'test/test2' if with_bypass else 'test'
-      node = separable_conv2d(
-          inputs,
-          None, [5, 5],
-          stride=stride,
-          depth_multiplier=1.0,
-          padding='SAME',
-          weights_initializer=self._WeightInit(0.09),
-          activation_fn=activation_fn,
-          normalizer_fn=batch_norm,
-          normalizer_params=self._BatchNormParams(
-              scale=has_scaling, fused=fused_batch_norm),
-          scope=scope)
+      name = 'test/test2' if with_bypass else 'test'
+      if insert_identity_node:
+        with g.name_scope(name):
+          node = separable_conv2d(
+              inputs,
+              None, [5, 5],
+              stride=stride,
+              depth_multiplier=1.0,
+              padding='SAME',
+              weights_initializer=self._WeightInit(0.09),
+              activation_fn=None,
+              normalizer_fn=None,
+              biases_initializer=None)
+          node = array_ops.identity(node, name='sep_conv_out')
+
+          node = batch_norm(
+              node,
+              center=True,
+              scale=has_scaling,
+              decay=1.0 - 0.003,
+              fused=fused_batch_norm)
+          if activation_fn is not None:
+            node = activation_fn(node)
+          sep_conv_name = name + '/SeparableConv2d'
+      else:
+        node = separable_conv2d(
+            inputs,
+            None, [5, 5],
+            stride=stride,
+            depth_multiplier=1.0,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=activation_fn,
+            normalizer_fn=batch_norm,
+            normalizer_params=self._BatchNormParams(
+                scale=has_scaling, fused=fused_batch_norm),
+            scope=name)
+        sep_conv_name = name
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
@@ -408,40 +501,43 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       fold_batch_norms.FoldBatchNorms(
           g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
-    folded_mul = g.get_operation_by_name(scope + '/mul_fold')
+    folded_mul = g.get_operation_by_name(sep_conv_name + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     if fused_batch_norm:
-      scale_reshape_op_name = scope + '/BatchNorm_Fold/scale_reshape'
+      scale_reshape_op_name = sep_conv_name + '/BatchNorm_Fold/scale_reshape'
     else:
-      scale_reshape_op_name = scope + '/scale_reshape'
-    self._AssertInputOpsAre(folded_mul,
-                            [scope + '/correction_mult', scale_reshape_op_name])
-    self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold'])
+      scale_reshape_op_name = sep_conv_name + '/scale_reshape'
+    self._AssertInputOpsAre(
+        folded_mul, [sep_conv_name + '/correction_mult', scale_reshape_op_name])
+    self._AssertOutputGoesToOps(folded_mul, g,
+                                [sep_conv_name + '/depthwise_Fold'])
 
     scale_reshape = g.get_operation_by_name(scale_reshape_op_name)
     self.assertEqual(scale_reshape.type, 'Reshape')
     self._AssertInputOpsAre(scale_reshape, [
-        self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm),
+        self._BatchNormMultiplierName(sep_conv_name, has_scaling,
+                                      fused_batch_norm),
         scale_reshape_op_name + '/shape'
     ])
-    self._AssertOutputGoesToOps(scale_reshape, g, [scope + '/mul_fold'])
+    self._AssertOutputGoesToOps(scale_reshape, g, [sep_conv_name + '/mul_fold'])
 
-    folded_conv = g.get_operation_by_name(scope + '/depthwise_Fold')
+    folded_conv = g.get_operation_by_name(sep_conv_name + '/depthwise_Fold')
     self.assertEqual(folded_conv.type, 'DepthwiseConv2dNative')
     self._AssertInputOpsAre(folded_conv,
-                            [scope + '/mul_fold', inputs.op.name])
-    self._AssertOutputGoesToOps(folded_conv, g, [scope + '/post_conv_mul'])
+                            [sep_conv_name + '/mul_fold', inputs.op.name])
+    self._AssertOutputGoesToOps(folded_conv, g,
+                                [sep_conv_name + '/post_conv_mul'])
 
-    folded_add = g.get_operation_by_name(scope + '/add_fold')
+    folded_add = g.get_operation_by_name(sep_conv_name + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/correction_add',
-        self._BathNormBiasName(scope, fused_batch_norm)
+        sep_conv_name + '/correction_add',
+        self._BathNormBiasName(sep_conv_name, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
     if freeze_batch_norm_delay is not None:
-      self._AssertMovingAveragesAreFrozen(g, scope)
+      self._AssertMovingAveragesAreFrozen(g, name)
 
     for op in g.get_operations():
       self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
@@ -450,7 +546,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
     self._RunTestOverParameters(self._TestFoldDepthwiseConv2d)
 
   def _TestFoldAtrousConv2d(self, relu, relu_op_name, with_bypass, has_scaling,
-                            fused_batch_norm, freeze_batch_norm_delay):
+                            fused_batch_norm, freeze_batch_norm_delay,
+                            insert_identity_node):
     """Tests folding: inputs -> AtrousConv2d with batch norm -> Relu*.
 
     Args:
@@ -461,7 +558,9 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       has_scaling: Bool, when true the batch norm has scaling.
       fused_batch_norm: Bool, when true the batch norm is fused.
       freeze_batch_norm_delay: None or the number of steps after which training
-      switches to using frozen mean and variance
+        switches to using frozen mean and variance
+      insert_identity_node: Bool, insert identity node between conv and batch
+        norm
     """
     g = ops.Graph()
     with g.as_default():
@@ -469,19 +568,44 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       inputs = array_ops.zeros((batch_size, height, width, 3))
       dilation_rate = 2
       activation_fn = None if with_bypass else relu
-      scope = 'test/test2' if with_bypass else 'test'
-      node = separable_conv2d(
-          inputs,
-          None, [3, 3],
-          rate=dilation_rate,
-          depth_multiplier=1.0,
-          padding='SAME',
-          weights_initializer=self._WeightInit(0.09),
-          activation_fn=activation_fn,
-          normalizer_fn=batch_norm,
-          normalizer_params=self._BatchNormParams(
-              scale=has_scaling, fused=fused_batch_norm),
-          scope=scope)
+      name = 'test/test2' if with_bypass else 'test'
+      if insert_identity_node:
+        with g.name_scope(name):
+          node = separable_conv2d(
+              inputs,
+              None, [3, 3],
+              rate=dilation_rate,
+              depth_multiplier=1.0,
+              padding='SAME',
+              weights_initializer=self._WeightInit(0.09),
+              activation_fn=None,
+              normalizer_fn=None,
+              biases_initializer=None)
+          node = array_ops.identity(node, name='sep_conv_out')
+
+          node = batch_norm(
+              node,
+              center=True,
+              scale=has_scaling,
+              decay=1.0 - 0.003,
+              fused=fused_batch_norm)
+          if activation_fn is not None:
+            node = activation_fn(node)
+          sep_conv_name = name + '/SeparableConv2d'
+      else:
+        node = separable_conv2d(
+            inputs,
+            None, [3, 3],
+            rate=dilation_rate,
+            depth_multiplier=1.0,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=activation_fn,
+            normalizer_fn=batch_norm,
+            normalizer_params=self._BatchNormParams(
+                scale=has_scaling, fused=fused_batch_norm),
+            scope=name)
+        sep_conv_name = name
       if with_bypass:
         node = math_ops.add(inputs, node, name='test/Add')
         relu(node, name='test/' + relu_op_name)
@@ -489,45 +613,48 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       fold_batch_norms.FoldBatchNorms(
           g, is_training=True, freeze_batch_norm_delay=freeze_batch_norm_delay)
 
-    folded_mul = g.get_operation_by_name(scope + '/mul_fold')
+    folded_mul = g.get_operation_by_name(sep_conv_name + '/mul_fold')
     self.assertEqual(folded_mul.type, 'Mul')
     if fused_batch_norm:
-      scale_reshape_op_name = scope + '/BatchNorm_Fold/scale_reshape'
+      scale_reshape_op_name = sep_conv_name + '/BatchNorm_Fold/scale_reshape'
     else:
-      scale_reshape_op_name = scope + '/scale_reshape'
-    self._AssertInputOpsAre(folded_mul,
-                            [scope + '/correction_mult', scale_reshape_op_name])
-    self._AssertOutputGoesToOps(folded_mul, g, [scope + '/depthwise_Fold'])
+      scale_reshape_op_name = sep_conv_name + '/scale_reshape'
+    self._AssertInputOpsAre(
+        folded_mul, [sep_conv_name + '/correction_mult', scale_reshape_op_name])
+    self._AssertOutputGoesToOps(folded_mul, g,
+                                [sep_conv_name + '/depthwise_Fold'])
 
     scale_reshape = g.get_operation_by_name(scale_reshape_op_name)
     self.assertEqual(scale_reshape.type, 'Reshape')
     self._AssertInputOpsAre(scale_reshape, [
-        self._BatchNormMultiplierName(scope, has_scaling, fused_batch_norm),
+        self._BatchNormMultiplierName(sep_conv_name, has_scaling,
+                                      fused_batch_norm),
         scale_reshape_op_name + '/shape'
     ])
-    self._AssertOutputGoesToOps(scale_reshape, g, [scope + '/mul_fold'])
+    self._AssertOutputGoesToOps(scale_reshape, g, [sep_conv_name + '/mul_fold'])
 
-    folded_conv = g.get_operation_by_name(scope + '/depthwise_Fold')
+    folded_conv = g.get_operation_by_name(sep_conv_name + '/depthwise_Fold')
     self.assertEqual(folded_conv.type, 'DepthwiseConv2dNative')
-    self._AssertInputOpsAre(
-        folded_conv, [scope + '/mul_fold', scope + '/depthwise/SpaceToBatchND'])
+    self._AssertInputOpsAre(folded_conv, [
+        sep_conv_name + '/mul_fold', sep_conv_name + '/depthwise/SpaceToBatchND'
+    ])
     if fused_batch_norm:
       self._AssertOutputGoesToOps(folded_conv, g,
-                                  [scope + '/BatchToSpaceND_Fold'])
+                                  [sep_conv_name + '/BatchToSpaceND_Fold'])
     else:
-      self._AssertOutputGoesToOps(folded_conv, g,
-                                  [scope + '/depthwise/BatchToSpaceND_Fold'])
+      self._AssertOutputGoesToOps(
+          folded_conv, g, [sep_conv_name + '/depthwise/BatchToSpaceND_Fold'])
 
-    folded_add = g.get_operation_by_name(scope + '/add_fold')
+    folded_add = g.get_operation_by_name(sep_conv_name + '/add_fold')
     self.assertEqual(folded_add.type, 'Add')
     self._AssertInputOpsAre(folded_add, [
-        scope + '/correction_add',
-        self._BathNormBiasName(scope, fused_batch_norm)
+        sep_conv_name + '/correction_add',
+        self._BathNormBiasName(sep_conv_name, fused_batch_norm)
     ])
     output_op_names = ['test/Add' if with_bypass else 'test/' + relu_op_name]
     self._AssertOutputGoesToOps(folded_add, g, output_op_names)
     if freeze_batch_norm_delay is not None:
-      self._AssertMovingAveragesAreFrozen(g, scope)
+      self._AssertMovingAveragesAreFrozen(g, name)
 
     for op in g.get_operations():
       self.assertFalse('//' in op.name, 'Double slash in op %s' % op.name)
@@ -535,9 +662,14 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
   def testFoldAtrousConv2d(self):
     self._RunTestOverParameters(self._TestFoldAtrousConv2d)
 
-  def _TestCompareFoldAndUnfolded(self, relu, relu_op_name, with_bypass,
-                                  has_scaling, fused_batch_norm,
-                                  freeze_batch_norm_delay):
+  def _TestCompareFoldAndUnfolded(self,
+                                  relu,
+                                  relu_op_name,
+                                  with_bypass,
+                                  has_scaling,
+                                  fused_batch_norm,
+                                  freeze_batch_norm_delay,
+                                  insert_identity_node=False):
     """Tests that running folded and unfolded BN returns the same results.
 
     Args:
@@ -549,6 +681,8 @@ class FoldBatchNormsTest(test_util.TensorFlowTestCase):
       fused_batch_norm: Bool, when true the batch norm is fused.
       freeze_batch_norm_delay: None or the number of steps after which training
       switches to using frozen mean and variance
+      insert_identity_node: Bool, insert identity node between conv and batch
+      norm
     """
     random_seed.set_random_seed(1234)
     unfolded_g = ops.Graph()
diff --git a/tensorflow/contrib/quantize/python/graph_matcher.py b/tensorflow/contrib/quantize/python/graph_matcher.py
index aa3ca991c060b208ec71ae27e1ddc75df8a2c723..cfbf5bf30f9ba224afdef0c849e33fe7915cf583 100644
--- a/tensorflow/contrib/quantize/python/graph_matcher.py
+++ b/tensorflow/contrib/quantize/python/graph_matcher.py
@@ -21,7 +21,10 @@ from __future__ import print_function
 import abc
 import itertools
 
+import six
 
+
+@six.add_metaclass(abc.ABCMeta)
 class Pattern(object):
   """The parent class of all patterns (e.g. OpTypePattern and OneofPattern)."""
 
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index d9dc7fa62e11b47e639664f23b08bbeb9ff8bde2..8619708cdaecd78bcc7de0e8e0cbf2baa11bf6a2 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -49,7 +49,7 @@ def _ModelVariable(name,
                    collections=None,
                    trainable=None):
   collections = list(collections or [])
-  collections += [ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES]
+  collections += [ops.GraphKeys.GLOBAL_VARIABLES]
   return variable_scope.get_variable(
       name,
       shape=shape,
@@ -62,12 +62,13 @@ def LastValueQuantize(inputs,
                       per_channel=False,
                       init_min=-6.0,
                       init_max=6.0,
-                      vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
+                      vars_collection=None,
                       name_prefix='LastValueQuant',
                       reuse=None,
                       is_training=True,
                       num_bits=8,
-                      narrow_range=False):
+                      narrow_range=False,
+                      symmetric=False):
   """Adds a layer that collects quantization ranges as last input ranges.
 
   LastValueQuantize creates variables called 'min' and 'max', representing the
@@ -88,6 +89,8 @@ def LastValueQuantize(inputs,
     num_bits: Number of bits to use for quantization, must be between 2 and 8.
     narrow_range: Whether to use the narrow quantization range
       [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
+    symmetric: If true, use symmetric quantization limits instead of training
+      the minimum and maximum of each quantization range separately.
   Returns:
     a tensor containing quantized values.
   """
@@ -104,17 +107,18 @@ def LastValueQuantize(inputs,
     else:
       min_max_shape = []
 
+    vars_collections = [vars_collection] if vars_collection else []
     min_var = _ModelVariable(
         'min',
         shape=min_max_shape,
         initializer=init_ops.constant_initializer(init_min),
-        collections=[vars_collection],
+        collections=vars_collections,
         trainable=False)
     max_var = _ModelVariable(
         'max',
         shape=min_max_shape,
         initializer=init_ops.constant_initializer(init_max),
-        collections=[vars_collection],
+        collections=vars_collections,
         trainable=False)
     if not is_training:
       return _FakeQuantWithMinMaxVars(
@@ -134,26 +138,40 @@ def LastValueQuantize(inputs,
     if per_channel:
       if input_dim >= 2:
         batch_min = math_ops.reduce_min(
-            inputs, reduction_indices=reduce_dims, name='BatchMin')
+            inputs, axis=reduce_dims, name='BatchMin')
       else:
         batch_min = inputs
     else:
       batch_min = math_ops.reduce_min(inputs, name='BatchMin')
-    # TFLite requires that 0.0 if always in the [min; max] range.
-    batch_min = math_ops.minimum(batch_min, 0.0)
-    assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast')
 
     if per_channel:
       if input_dim >= 2:
         batch_max = math_ops.reduce_max(
-            inputs, reduction_indices=reduce_dims, name='BatchMax')
+            inputs, axis=reduce_dims, name='BatchMax')
       else:
         batch_max = inputs
     else:
       batch_max = math_ops.reduce_max(inputs, name='BatchMax')
-    # TFLite requires that 0.0 if always in the [min; max] range.
-    batch_max = math_ops.maximum(batch_max, 0.0)
-    assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast')
+
+    if symmetric:
+      if narrow_range:
+        min_max_ratio = -1
+      else:
+        # In two's complement notation, the negative range is slightly larger
+        # than the positive range.
+        min_max_ratio = -((1 << num_bits) - 2) / (1 << num_bits)
+
+      # TFLite requires that 0.0 if always in the [min; max] range. Because
+      # batch_min <= batch_max, it follows that range_min <= 0 <= range_max.
+      range_min = math_ops.minimum(batch_min, batch_max / min_max_ratio)
+      range_max = math_ops.maximum(batch_max, batch_min * min_max_ratio)
+    else:
+      # TFLite requires that 0.0 if always in the [min; max] range.
+      range_min = math_ops.minimum(batch_min, 0.0)
+      range_max = math_ops.maximum(batch_max, 0.0)
+
+    assign_min = state_ops.assign(min_var, range_min, name='AssignMinLast')
+    assign_max = state_ops.assign(max_var, range_max, name='AssignMaxLast')
 
     return _FakeQuantWithMinMaxVars(
         inputs,
@@ -174,7 +192,8 @@ def MovingAvgQuantize(inputs,
                       reuse=None,
                       is_training=True,
                       num_bits=8,
-                      narrow_range=False):
+                      narrow_range=False,
+                      symmetric=False):
   """Adds a layer that collects quantization ranges as EMAs of input ranges.
 
   MovingAvgQuantize creates variables called 'min' and 'max', representing the
@@ -196,6 +215,8 @@ def MovingAvgQuantize(inputs,
     num_bits: Number of bits to use for quantization, must be between 2 and 8.
     narrow_range: Whether to use the narrow quantization range
       [1; 2^num_bits - 1] or wide range [0; 2^num_bits - 1].
+    symmetric: If true, use symmetric quantization limits instead of training
+      the minimum and maximum of each quantization range separately.
   Returns:
     a tensor containing quantized values.
   """
@@ -212,17 +233,18 @@ def MovingAvgQuantize(inputs,
     else:
       min_max_shape = []
 
+    vars_collections = [vars_collection] if vars_collection else []
     min_var = _ModelVariable(
         'min',
         shape=min_max_shape,
         initializer=init_ops.constant_initializer(init_min),
-        collections=[vars_collection],
+        collections=vars_collections,
         trainable=False)
     max_var = _ModelVariable(
         'max',
         shape=min_max_shape,
         initializer=init_ops.constant_initializer(init_max),
-        collections=[vars_collection],
+        collections=vars_collections,
         trainable=False)
     if not is_training:
       return _FakeQuantWithMinMaxVars(
@@ -241,28 +263,42 @@ def MovingAvgQuantize(inputs,
     if per_channel:
       if input_dim >= 2:
         batch_min = math_ops.reduce_min(
-            inputs, reduction_indices=reduce_dims, name='BatchMin')
+            inputs, axis=reduce_dims, name='BatchMin')
       else:
         batch_min = inputs
     else:
       batch_min = math_ops.reduce_min(inputs, name='BatchMin')
-    # B-eng requires that 0.0 if always in the [min; max] range.
-    batch_min = math_ops.minimum(batch_min, 0.0)
-    assign_min = moving_averages.assign_moving_average(
-        min_var, batch_min, ema_decay, name='AssignMinEma')
 
     if per_channel:
       if input_dim >= 2:
         batch_max = math_ops.reduce_max(
-            inputs, reduction_indices=reduce_dims, name='BatchMax')
+            inputs, axis=reduce_dims, name='BatchMax')
       else:
         batch_max = inputs
     else:
       batch_max = math_ops.reduce_max(inputs, name='BatchMax')
-    # B-eng requires that 0.0 if always in the [min; max] range.
-    batch_max = math_ops.maximum(batch_max, 0.0)
+
+    if symmetric:
+      if narrow_range:
+        min_max_ratio = -1
+      else:
+        # In two's complement notation, the negative range is slightly larger
+        # than the positive range.
+        min_max_ratio = -((1 << num_bits) - 2) / (1 << num_bits)
+
+      # TFLite requires that 0.0 if always in the [min; max] range. Because
+      # batch_min <= batch_max, it follows that range_min <= 0 <= range_max.
+      range_min = math_ops.minimum(batch_min, batch_max / min_max_ratio)
+      range_max = math_ops.maximum(batch_max, batch_min * min_max_ratio)
+    else:
+      # TFLite requires that 0.0 if always in the [min; max] range.
+      range_min = math_ops.minimum(batch_min, 0.0)
+      range_max = math_ops.maximum(batch_max, 0.0)
+
+    assign_min = moving_averages.assign_moving_average(
+        min_var, range_min, ema_decay, name='AssignMinEma')
     assign_max = moving_averages.assign_moving_average(
-        max_var, batch_max, ema_decay, name='AssignMaxEma')
+        max_var, range_max, ema_decay, name='AssignMaxEma')
 
     return _FakeQuantWithMinMaxVars(
         inputs,
diff --git a/tensorflow/contrib/quantize/python/quant_ops_test.py b/tensorflow/contrib/quantize/python/quant_ops_test.py
index a45840009b758881c14fb64b2d39af6cd4ec4bc4..36d2af94e059cdc75b758bbf607d26c4e1ee73e9 100644
--- a/tensorflow/contrib/quantize/python/quant_ops_test.py
+++ b/tensorflow/contrib/quantize/python/quant_ops_test.py
@@ -29,51 +29,55 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 _MIN_MAX_VARS = 'min_max_vars'
+_SYMMETRIC_RANGE_RATIO = 0.9921875  # 127 / 128
 
 
 class QuantOpsTest(googletest.TestCase):
 
   def testLastValueQuantizeTrainingAssign(self):
-    g = ops.Graph()
-    with session.Session(graph=g) as sess:
-      x = array_ops.placeholder(dtypes.float32, shape=[2])
-      y = quant_ops.LastValueQuantize(
-          x,
-          init_min=0.0,
-          init_max=0.0,
-          is_training=True,
-          vars_collection=_MIN_MAX_VARS)
+    min_value, max_value = self._GetMinMaxValues(quant_ops.LastValueQuantize,
+                                                 [[-1, 1]])
+    self.assertEqual(min_value, -1.0)
+    self.assertEqual(max_value, 1.0)
 
-      # Run the step.
-      sess.run(variables.global_variables_initializer())
-      sess.run(y, feed_dict={x: [-1.0, 1.0]})
-      # Now check that the min_max_vars were, in fact, updated.
-      min_value, max_value = self._GetMinMaxValues(sess)
-      self.assertEqual(min_value, -1.0)
-      self.assertEqual(max_value, 1.0)
+  def testLastValueSymmetricQuantizeTrainingAssign(self):
+    min_value, max_value = self._GetMinMaxValues(
+        quant_ops.LastValueQuantize,
+        [[-_SYMMETRIC_RANGE_RATIO, _SYMMETRIC_RANGE_RATIO]],
+        symmetric=True,
+        narrow_range=False)
+    self.assertEqual(min_value, -1.0)
+    self.assertEqual(max_value, _SYMMETRIC_RANGE_RATIO)
+
+  def testLastValueSymmetricQuantizeNarrowRangeTrainingAssign(self):
+    min_value, max_value = self._GetMinMaxValues(
+        quant_ops.LastValueQuantize, [[-1, 0.5]],
+        symmetric=True,
+        narrow_range=True)
+    self.assertEqual(min_value, -1.0)
+    self.assertEqual(max_value, 1)
 
   def testMovingAvgQuantizeTrainingAssign(self):
-    g = ops.Graph()
-    with session.Session(graph=g) as sess:
-      x = array_ops.placeholder(dtypes.float32, shape=[2])
-      y = quant_ops.MovingAvgQuantize(
-          x,
-          init_min=0.0,
-          init_max=0.0,
-          is_training=True,
-          vars_collection=_MIN_MAX_VARS)
+    min_value, max_value = self._GetMinMaxValues(quant_ops.MovingAvgQuantize,
+                                                 [[-1, 1], [0, 0]])
+    self.assertAlmostEqual(min_value, -0.5, delta=1e-3)
+    self.assertAlmostEqual(max_value, 0.5, delta=1e-3)
 
-      # Run the step.
-      sess.run(variables.global_variables_initializer())
-      # Do two runs to avoid zero debias.
-      sess.run(y, feed_dict={x: [-1.0, 1.0]})
-      sess.run(y, feed_dict={x: [0.0, 0.0]})
-      # Now check that the min_max_vars were, in fact, updated.
-      min_value, max_value = self._GetMinMaxValues(sess)
-      self.assertGreater(min_value, -1.0)
-      self.assertLess(min_value, 0.0)
-      self.assertGreater(max_value, 0.0)
-      self.assertLess(max_value, 1.0)
+  def testMovingAvgSymmetricQuantizeTrainingAssign(self):
+    min_value, max_value = self._GetMinMaxValues(
+        quant_ops.MovingAvgQuantize, [[-1, 0.5], [0, 0]], symmetric=True)
+    self.assertAlmostEqual(min_value, -0.5, delta=1e-3)
+    self.assertAlmostEqual(max_value, 0.5 * _SYMMETRIC_RANGE_RATIO, delta=1e-3)
+    self.assertAlmostEqual(max_value, min_value * -_SYMMETRIC_RANGE_RATIO)
+
+  def testMovingAvgSymmetricQuantizeNarrowRangeTrainingAssign(self):
+    min_value, max_value = self._GetMinMaxValues(
+        quant_ops.MovingAvgQuantize, [[-1, 0.5], [0, 0]],
+        symmetric=True,
+        narrow_range=True)
+    self.assertAlmostEqual(min_value, -0.5, delta=1e-3)
+    self.assertAlmostEqual(max_value, 0.5, delta=1e-3)
+    self.assertAlmostEqual(max_value, -min_value)
 
   def testVariablesNotPartitioned_LastValue(self):
     # Variables added should not use a default partiioner since they are
@@ -105,14 +109,31 @@ class QuantOpsTest(googletest.TestCase):
             is_training=True,
             vars_collection=_MIN_MAX_VARS)
 
-  def _GetMinMaxValues(self, sess):
-    min_max_vars = ops.get_collection(_MIN_MAX_VARS)
-    self.assertEqual(len(min_max_vars), 2)
-    min_idx = 0 if 'min' in min_max_vars[0].name else 1
-    max_idx = (min_idx + 1) % 2
-    min_var, max_var = min_max_vars[min_idx], min_max_vars[max_idx]
-    min_max_values = sess.run([min_var, max_var])
-    return min_max_values[0], min_max_values[1]
+  def _GetMinMaxValues(self, quantize_fn, input_values, **kwds):
+    g = ops.Graph()
+    with session.Session(graph=g) as sess:
+      x = array_ops.placeholder(dtypes.float32, shape=[2])
+      y = quantize_fn(
+          x,
+          init_min=0.0,
+          init_max=0.0,
+          is_training=True,
+          vars_collection=_MIN_MAX_VARS,
+          **kwds)
+
+      # Run the step.
+      sess.run(variables.global_variables_initializer())
+      for input_elem in input_values:
+        sess.run(y, feed_dict={x: input_elem})
+
+      # Now check that the min_max_vars were, in fact, updated.
+      min_max_vars = ops.get_collection(_MIN_MAX_VARS)
+      self.assertEqual(len(min_max_vars), 2)
+      min_idx = 0 if 'min' in min_max_vars[0].name else 1
+      max_idx = (min_idx + 1) % 2
+      min_var, max_var = min_max_vars[min_idx], min_max_vars[max_idx]
+      min_max_values = sess.run([min_var, max_var])
+      return min_max_values[0], min_max_values[1]
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 5e63d33db85a511f33afa1f22647aba755b85cbf..21d1b1213090273b5abd8e012f8711db98c94347 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -32,13 +32,22 @@ from tensorflow.python.platform import tf_logging as logging
 _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
 
 # Activations that are supported by the quantization rewrite.
-_ACTIVATION_TYPES = {'Relu', 'Relu6'}
+_ACTIVATION_TYPES = {'Relu', 'Relu6', 'Identity'}
+
+_RELU_TYPES = {'Relu', 'Relu6'}
+
+_QUANTIZATION_OP = {'FakeQuantWithMinMaxVars'}
+_VALID_SRC_OP = {'Add', 'Mul'}
+_INTERMEDIATE_OP = {'Add', 'Mul'}
+_PASS_THROUGH_OP = {'Reshape', 'Identity', 'BatchToSpaceND', 'SpaceToBatchND'}
+_VALID_ACTIVATION_OP = {'Relu', 'Relu6'}
 
 
 def Quantize(graph,
              is_training,
              weight_bits=8,
              activation_bits=8,
+             symmetric=False,
              ema_decay=0.999,
              quant_delay=None,
              vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
@@ -56,6 +65,8 @@ def Quantize(graph,
     is_training: Whether quantizing training graph or eval graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
+    symmetric: (Optional) If true, use symmetric quantization limits instead of
+      training the minimum and maximum of each quantization range separately.
     ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
       quantization intervals for quantizing activations (see here about EMA:
       https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
@@ -73,50 +84,57 @@ def Quantize(graph,
     scope += '/'
 
   input_to_ops_map = input_to_ops.InputToOps(graph)
+  quantized_ops = set()
   for layer_match in _FindLayersToQuantize(graph):
     # Quantize the weights.
     context = _GetContextFromOp(layer_match.layer_op)
 
     # If `scope` is given, only quantize it if the consumer of weights
     # (the layer op) is in the right scope.
-    _InsertQuantOp(
-        context,
-        'weights_quant',
-        layer_match.weight_tensor.op, [layer_match.layer_op],
-        is_training,
-        moving_avg=False,
-        ema_decay=ema_decay,
-        quant_delay=quant_delay,
-        narrow_range=True,
-        vars_collection=vars_collection,
-        bits=weight_bits,
-        consumer_scope=scope)
+    if layer_match.weight_tensor is not None:
+      _InsertQuantOp(
+          context,
+          'weights_quant',
+          layer_match.weight_tensor.op,
+          input_to_ops_map.ConsumerOperations(layer_match.weight_tensor.op),
+          is_training,
+          moving_avg=False,
+          ema_decay=ema_decay,
+          quant_delay=quant_delay,
+          narrow_range=True,
+          vars_collection=vars_collection,
+          bits=weight_bits,
+          symmetric=symmetric,
+          consumer_scope=scope)
 
     # Quantize the activations.
-    consumer_ops = input_to_ops_map.ConsumerOperations(
-        layer_match.activation_op)
-    add_context = context
-    if layer_match.bypass_op:
-      pattern_match_result = re.search(r'^(.*)/([^/]+)', context)
-      if pattern_match_result is not None:
-        add_context = pattern_match_result.group(1)
-      else:
-        add_context = ''
-    # If `scope` is given, only quantize it if the producer of weights
-    # (usually it's the layer op) is in the right scope.
-    _InsertQuantOp(
-        add_context,
-        'act_quant',
-        layer_match.activation_op,
-        consumer_ops,
-        is_training,
-        moving_avg=True,
-        ema_decay=ema_decay,
-        quant_delay=quant_delay,
-        vars_collection=vars_collection,
-        bits=activation_bits,
-        init_min=0.0,
-        producer_scope=scope)
+    if layer_match.activation_op is not None:
+      consumer_ops = input_to_ops_map.ConsumerOperations(
+          layer_match.activation_op)
+      add_context = context
+      if layer_match.bypass_op:
+        pattern_match_result = re.search(r'^(.*)/([^/]+)', context)
+        if pattern_match_result is not None:
+          add_context = pattern_match_result.group(1)
+        else:
+          add_context = ''
+      # If `scope` is given, only quantize it if the producer of weights
+      # (usually it's the layer op) is in the right scope.
+      _InsertQuantOp(
+          add_context,
+          'act_quant',
+          layer_match.activation_op,
+          consumer_ops,
+          is_training,
+          moving_avg=True,
+          ema_decay=ema_decay,
+          quant_delay=quant_delay,
+          vars_collection=vars_collection,
+          bits=activation_bits,
+          symmetric=symmetric,
+          init_min=0.0,
+          producer_scope=scope)
+      quantized_ops.add(layer_match.activation_op)
 
     # Quantize the inputs and output to the bypass (if it exists). The input to
     # the bypass is the bias add, and the output is the activation.
@@ -126,20 +144,23 @@ def Quantize(graph,
       _InsertQuantOp(
           context,
           'conv_quant',
-          layer_match.bias_add_op, [layer_match.bypass_op],
+          layer_match.bias_add_op,
+          input_to_ops_map.ConsumerOperations(layer_match.bias_add_op),
           is_training,
           moving_avg=True,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
           bits=activation_bits,
+          symmetric=symmetric,
           producer_scope=scope,
           consumer_scope=scope)
+      quantized_ops.add(layer_match.bias_add_op)
       # Make sure the op following this isn't an activation. In which case, we
       # shouldn't quantize it, since the activation will be Fused into the
       # Add at inference time.
       consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op)
-      if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]):
+      if any(consumer.type in _ACTIVATION_TYPES for consumer in consumers):
         logging.info('Skipping %s, because its followed by an activation.',
                      layer_match.bypass_op.name)
       else:
@@ -154,8 +175,10 @@ def Quantize(graph,
             quant_delay=quant_delay,
             vars_collection=vars_collection,
             bits=activation_bits,
+            symmetric=symmetric,
             producer_scope=scope,
             consumer_scope=scope)
+        quantized_ops.add(layer_match.bypass_op)
 
     # Quantize bypass ops that occur after the activation.
     if layer_match.post_activation_bypass_op is not None:
@@ -172,7 +195,7 @@ def Quantize(graph,
       # Add at inference time.
       consumers = input_to_ops_map.ConsumerOperations(
           layer_match.post_activation_bypass_op)
-      if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]):
+      if any(consumer.type in _RELU_TYPES for consumer in consumers):
         logging.info('Skipping %s, because its followed by an activation.',
                      layer_match.post_activation_bypass_op.name)
       else:
@@ -187,7 +210,117 @@ def Quantize(graph,
             quant_delay=quant_delay,
             vars_collection=vars_collection,
             bits=activation_bits,
+            symmetric=symmetric,
             producer_scope=scope)
+        quantized_ops.add(layer_match.post_activation_bypass_op)
+
+  _QuantizeActivationLayers(
+      quantized_ops,
+      graph,
+      is_training,
+      activation_bits,
+      ema_decay,
+      quant_delay,
+      vars_collection,
+      scope=scope)
+
+
+def _QuantizeActivationLayers(quantized_ops,
+                              graph,
+                              is_training,
+                              activation_bits=8,
+                              ema_decay=0.999,
+                              quant_delay=None,
+                              vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
+                              scope=None):
+  """Quantize intermediate activation tensors after addition and multiplication.
+
+  Args:
+    quantized_ops: Set of previously quantized activation ops.
+    graph: Graph to modify.
+    is_training: Whether quantizing training graph or eval graph.
+    activation_bits: Number of bits to use for quantizing activations.
+    ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
+      quantization intervals for quantizing activations (see here about EMA:
+      https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
+    quant_delay: (Optional, default None) Int, count of global steps for which
+      to delay quantization.  This helps weights stabilize at the start of
+      training.
+    vars_collection: (Optional) Collection where to store the variables for
+      quantization interval ends.
+    scope: The scope to be transformed. If it's not None, only the ops which are
+      in this scope will be transformed.
+
+  Raises:
+    ValueError: When quantization fails.
+  """
+  input_to_ops_map = input_to_ops.InputToOps(graph)
+  for op in (op for op in graph.get_operations()):
+    if _CheckIfQuantizableOp(op, quantized_ops):
+      logging.info('Inserting fake quant op activation_%s_quant after %s',
+                   op.type, op.name)
+      consumers = input_to_ops_map.ConsumerOperations(op)
+      _InsertQuantOp(
+          op.name,
+          'activation_' + op.type + '_quant',
+          op,
+          consumers,
+          is_training,
+          moving_avg=True,
+          ema_decay=ema_decay,
+          quant_delay=quant_delay,
+          vars_collection=vars_collection,
+          bits=activation_bits,
+          producer_scope=scope)
+
+
+def _CheckIfQuantizableOp(src_op, quantized_ops):
+  """Check if the output of an op should be quantized.
+
+  Args:
+    src_op: op to be checked
+    quantized_ops: Set of previously quantized activation ops.
+
+  Returns:
+    Boolean specifying if output should be quantized or not.
+  """
+  src_op_name = set([src_op.type])
+  if src_op in quantized_ops:
+    return False
+  if not src_op_name.intersection(_VALID_SRC_OP):
+    return False
+
+  # If src op is an add or a mul and the output is immediately
+  # followed by an activation skip
+  if len(src_op.outputs) == 1 and len(src_op.outputs[0].consumers()) == 1:
+    op_consumers = src_op.outputs[0].consumers()
+    if set([op_consumers[0].type]).intersection(_VALID_ACTIVATION_OP):
+      logging.info('Skipping quant after %s', src_op.name)
+      return False
+  # Is an Add or a Mul
+  input_ops = src_op.inputs
+
+  for op in input_ops:
+    curr_op = op.op
+    curr_op_type = set([curr_op.type])
+    while curr_op_type.intersection(_PASS_THROUGH_OP):
+      # Walk back through pass through ops
+      curr_op = curr_op.inputs[0].op
+      curr_op_type = set([curr_op.type])
+      # Now at a valid or quantizable op, need to check if
+      # atleast one of the inputs to a valid op is connected
+      # to a quantizable op via pass through ops
+
+    if (curr_op_type.intersection(_QUANTIZATION_OP) or
+        curr_op.name.find('delayed_quant/Merge') > 0):
+      return True
+
+    if curr_op_type.intersection(_INTERMEDIATE_OP):
+      # Check if atleast one input to intermediate_op are quantizable
+      for input_op in curr_op.inputs:
+        if _CheckIfQuantizableOp(input_op.op, quantized_ops):
+          return True
+  return False
 
 
 def _FindLayersToQuantize(graph):
@@ -384,10 +517,11 @@ def _FindLayersToQuantize(graph):
       bias_add_op = match_result.get_op(folded_bias_add_pattern)
     bypass_op = match_result.get_op(bypass_pattern)
     if layer_op not in matched_layer_set:
-      matched_layer_set.add(layer_op)
-      layer_matches.append(
-          _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op, None,
-                      bias_add_op))
+      if not _IsSkipLayer(activation_op):
+        matched_layer_set.add(layer_op)
+        layer_matches.append(
+            _LayerMatch(layer_op, weight_tensor, activation_op, bypass_op, None,
+                        bias_add_op))
 
   # Match the final layer, where there may not be an activation and instead
   # the output of the final BiasAdd must be quantized. So we treat the BiasAdd
@@ -415,6 +549,8 @@ def _FindLayersToQuantize(graph):
   for match_result in sep_conv_matcher.match_graph(graph):
     layer_op = match_result.get_op(layer_pattern)
     weight_tensor = match_result.get_tensor(weight_identity_pattern)
+    if weight_tensor is None:
+      weight_tensor = match_result.get_tensor(weight_resource_var_pattern)
     activation_op = match_result.get_op(layer_pattern)
     if layer_op not in matched_layer_set:
       matched_layer_set.add(layer_op)
@@ -424,6 +560,32 @@ def _FindLayersToQuantize(graph):
   return layer_matches
 
 
+def _IsSkipLayer(activation_op):
+  """Skip quantizing conv->identity->Batch norm layers.
+
+  Args:
+    activation_op: Activation op detected by layer matching pattern
+
+  Returns:
+    skip_layer: boolean, true when conv->identity->batch norm is detected.
+  """
+
+  # Exclude quantization of conv->identity->BN,
+  # After folding, this part corresponds to estimation of mean and variance
+  # and should not be quantized.
+  skip_layer = False
+  if activation_op.type == 'Identity' and len(activation_op.outputs) == 1:
+    if len(activation_op.outputs[0].consumers()) == 1:
+      consumer = activation_op.outputs[0].consumers()[0]
+      if consumer.type == 'FusedBatchNorm':
+        skip_layer = True
+        logging.info(
+            'Skipping quantizing %s, because it is the output of a conv/fc '
+            'followed by a identity, feeding a fused batch norm.',
+            activation_op.name)
+  return skip_layer
+
+
 class _LayerMatch(object):
   """Contains all information related to a matched Layer."""
 
@@ -488,6 +650,7 @@ def _InsertQuantOp(context,
                    init_min=-6.0,
                    init_max=6.0,
                    bits=8,
+                   symmetric=False,
                    ema_decay=0.999,
                    quant_delay=None,
                    vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
@@ -508,6 +671,8 @@ def _InsertQuantOp(context,
     init_min: Starting minimum value for the new quantization op.
     init_max: Starting maximum value for the new quantization op.
     bits: Number of bits to use for quantization, must be between 2 and 8.
+    symmetric: (Optional) If true, use symmetric quantization limits instead of
+      training the minimum and maximum of each quantization range separately.
     ema_decay: (Optional) Float, EMA decay parameter.  EMA is used to update
       quantization intervals for quantizing activations (see here about EMA:
       https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average).
@@ -571,6 +736,7 @@ def _InsertQuantOp(context,
             ema_decay=ema_decay,
             is_training=is_training,
             num_bits=bits,
+            symmetric=symmetric,
             narrow_range=narrow_range,
             vars_collection=vars_collection,
             name_prefix=name_prefix))
@@ -582,6 +748,7 @@ def _InsertQuantOp(context,
             init_max=init_max,
             is_training=is_training,
             num_bits=bits,
+            symmetric=symmetric,
             narrow_range=narrow_range,
             vars_collection=vars_collection,
             name_prefix=name_prefix))
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 484493f1b2a64ae68b16a03ac74e75a5e84bb3de..2a256a3c51cbf91d34c2639bf1adb74deffa2fed 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -27,6 +27,7 @@ def _create_graph(input_graph=None,
                   is_training=True,
                   weight_bits=8,
                   activation_bits=8,
+                  symmetric=False,
                   quant_delay=None,
                   freeze_bn_delay=None,
                   scope=None):
@@ -43,6 +44,8 @@ def _create_graph(input_graph=None,
     is_training: Whether quantizing training or eval graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
+    symmetric: If true, use symmetric quantization limits instead of training
+      the minimum and maximum of each quantization range separately.
     quant_delay: Number of steps after which weights and activations are
       quantized during training.
     freeze_bn_delay: Number of steps after which moving mean and variance are
@@ -74,6 +77,7 @@ def _create_graph(input_graph=None,
         quant_delay=quant_delay,
         weight_bits=weight_bits,
         activation_bits=activation_bits,
+        symmetric=symmetric,
         scope=scope)
 
 
@@ -142,6 +146,7 @@ def create_eval_graph(input_graph=None):
 def experimental_create_training_graph(input_graph=None,
                                        weight_bits=8,
                                        activation_bits=8,
+                                       symmetric=False,
                                        quant_delay=0,
                                        freeze_bn_delay=None,
                                        scope=None):
@@ -173,6 +178,8 @@ def experimental_create_training_graph(input_graph=None,
       default graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
+    symmetric: If true, use symmetric quantization limits instead of training
+      the minimum and maximum of each quantization range separately.
     quant_delay: Number of steps after which weights and activations are
       quantized during training.
     freeze_bn_delay: Number of steps after which moving mean and variance are
@@ -192,6 +199,7 @@ def experimental_create_training_graph(input_graph=None,
       is_training=True,
       weight_bits=weight_bits,
       activation_bits=activation_bits,
+      symmetric=symmetric,
       quant_delay=quant_delay,
       freeze_bn_delay=freeze_bn_delay,
       scope=scope)
@@ -200,6 +208,7 @@ def experimental_create_training_graph(input_graph=None,
 def experimental_create_eval_graph(input_graph=None,
                                    weight_bits=8,
                                    activation_bits=8,
+                                   symmetric=False,
                                    quant_delay=None,
                                    scope=None):
   """Rewrites an eval input_graph in place for simulated quantization.
@@ -219,6 +228,8 @@ def experimental_create_eval_graph(input_graph=None,
       default graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
+    symmetric: If true, use symmetric quantization limits instead of training
+      the minimum and maximum of each quantization range separately.
     quant_delay: Number of steps after which weights and activations are
       quantized during eval.
     scope: The scope to be transformed. If it's not None, only the ops which
@@ -233,6 +244,7 @@ def experimental_create_eval_graph(input_graph=None,
       is_training=False,
       weight_bits=weight_bits,
       activation_bits=activation_bits,
+      symmetric=symmetric,
       quant_delay=quant_delay,
       scope=scope)
 
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index e80d2183a69096f1148160126b025dbaacbcb137..9aa6e2c24d44c9c81f72cc6d1cfc7c9c4e3e15e5 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.quantize.python import quantize_graph
 from tensorflow.python import training
@@ -27,6 +29,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import template
 from tensorflow.python.platform import googletest
 
 
@@ -48,6 +51,8 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     rewrite_fns = [
         quantize_graph.create_training_graph,
         quantize_graph.experimental_create_training_graph,
+        functools.partial(
+            quantize_graph.experimental_create_training_graph, symmetric=True),
     ]
     for fn in rewrite_fns:
       test_fn(fn)
@@ -56,6 +61,8 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     rewrite_fns = [
         quantize_graph.create_eval_graph,
         quantize_graph.experimental_create_eval_graph,
+        functools.partial(
+            quantize_graph.experimental_create_eval_graph, symmetric=True),
     ]
     for fn in rewrite_fns:
       test_fn(fn)
@@ -267,6 +274,51 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       graph_def_after = str(g.as_graph_def())
       self.assertEqual(graph_def_before, graph_def_after)
 
+  def testIdentityNode(self):
+    self._RunTestOverAllRewrites(self._TestIdentityNode)
+
+  def _TestIdentityNode(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      self._LayerWithIdentity()
+
+    rewrite_fn(graph)
+    op_names = [op.name for op in graph.get_operations()]
+    self.assertTrue(any('test/Conv/weights_quant' in name for name in op_names))
+    self.assertTrue(any('test/Conv/act_quant' in name for name in op_names))
+    bn_out_identity = graph.get_operation_by_name('test/bn_out')
+    self._AssertInputOpsAre(bn_out_identity, [
+        'test/Conv/add_fold',
+    ])
+
+    conv_out_identity = graph.get_operation_by_name('test/conv_out')
+    self._AssertOutputGoesToOps(conv_out_identity, graph,
+                                ['test/BatchNorm/FusedBatchNorm'])
+
+  def testActivationQuantization(self):
+    self._RunTestOverAllRewrites(self._TestActivationQuantization)
+
+  def _TestActivationQuantization(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      _ = self._LayerWithActivationProcessing()
+
+    rewrite_fn(graph)
+    # Check if outputs of multipliers and adds are quantized.
+
+    mul_op = graph.get_operation_by_name('test/Mul')
+    self._AssertOutputGoesToOps(
+        mul_op, graph,
+        ['test/Mul/activation_Mul_quant/FakeQuantWithMinMaxVars'])
+    mul_op = graph.get_operation_by_name('test/Mul_1')
+    self._AssertOutputGoesToOps(
+        mul_op, graph,
+        ['test/Mul_1/activation_Mul_quant/FakeQuantWithMinMaxVars'])
+    add_op = graph.get_operation_by_name('test/add')
+    self._AssertOutputGoesToOps(
+        add_op, graph,
+        ['test/add/activation_Add_quant/FakeQuantWithMinMaxVars'])
+
   def testRewriteWithScope(self):
     self._RunTestOverExperimentalRewritesWithScope(
         self._TestRewriteWithScope, 'scope1')
@@ -306,6 +358,82 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     # No ops should be inserted or removed.
     self.assertEqual(op_names_before_rewrite, op_names_after_rewrite)
 
+  def testActivationRewriteWithScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestActivationRewriteWithScope, 'scope1')
+
+  def _TestActivationRewriteWithScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      output = self._LayerWithIdentity(scope='scope1')
+      with ops.name_scope('scope2'):
+        output = nn_ops.relu6(output)
+        scaled_output1 = math_ops.mul(2.0, output)
+        scaled_output2 = math_ops.mul(3.0, output)
+        output = scaled_output1 + scaled_output2
+      rewrite_fn(graph)
+
+      op_names = [op.name for op in graph.get_operations()]
+      # The weights and activation of scope1 is quantized, but not scope2.
+      self.assertTrue(any('scope1/Conv/act_quant' in name for name in op_names))
+      self.assertTrue(
+          any('scope1/Conv/weights_quant' in name for name in op_names))
+
+      for op_name in op_names:
+        if op_name.startswith('scope2'):
+          self.assertTrue('FakeQuant' not in op_name)
+
+  def testActivationRewriteWithNonMatchingScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestActivationRewriteWithNonMatchingScope, 'NonExistingScope')
+
+  def _TestActivationRewriteWithNonMatchingScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      self._LayerWithActivationProcessing()
+
+    rewrite_fn(graph)
+    op_types_after_rewrite = set([op.type for op in graph.get_operations()])
+    self.assertFalse(
+        op_types_after_rewrite.intersection('FakeQuantWithMinMaxVars'))
+    # No fake quant ops should be inserted.
+
+  def testWithSharedWeights(self):
+
+    self._RunTestOverAllRewrites(self._TestWithSharedWeights)
+    self._RunTestOverTrainingRewrites(self._TestRewriteWithSharedWeights)
+
+  def _TestRewriteWithSharedWeights(self, rewrite_fn, quant_delay=1):
+    self._TestWithSharedWeights(rewrite_fn, quant_delay)
+
+  def _TestWithSharedWeights(self, rewrite_fn, quant_delay=None):
+    with ops.Graph().as_default() as g:
+      conv = template.make_template('shared_weights_conv', self._ConvLayer)
+      conv()
+      conv()
+      if quant_delay is None:
+        rewrite_fn()
+      else:
+        rewrite_fn(quant_delay=quant_delay)
+
+    conv_ops = [op for op in g.get_operations() if op.type == 'Conv2D']
+    weights_quants = [
+        op for op in g.get_operations()
+        if 'weights_quant' in op.name and op.type == 'FakeQuantWithMinMaxVars'
+    ]
+    # Check that the shared weights variable is not quantized multiple times
+    self.assertTrue(len(weights_quants) == 1)
+    weights_quant_tensor = weights_quants[0].outputs[0]
+    if quant_delay:
+      delayed_weights_quants = [
+          op for op in g.get_operations()
+          if 'weights_quant' in op.name and op.type == 'Merge'
+      ]
+      self.assertTrue(len(delayed_weights_quants) == 1)
+      weights_quant_tensor = delayed_weights_quants[0].outputs[0]
+    # Check that the Conv2D operations get the quantized weights
+    self.assertTrue(all(weights_quant_tensor in op.inputs for op in conv_ops))
+
   def _ConvLayer(
       self, input_tensor=None, scope='test', pre_activation_bypass=False,
       post_activation_bypass=False):
@@ -328,6 +456,85 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
         output += input_tensor
     return output
 
+  def _LayerWithIdentity(self,
+                         input_tensor=None,
+                         scope='test',
+                         post_activation_bypass=False):
+    """Add a basic conv, identity, batch norm with skip to the default graph."""
+    batch_size, height, width, depth = 5, 128, 128, 3
+    if input_tensor is None:
+      input_tensor = array_ops.zeros((batch_size, height, width, depth))
+    weight_init = init_ops.truncated_normal_initializer
+    with ops.name_scope(scope):
+      output = layers.conv2d(
+          input_tensor,
+          depth, [5, 5],
+          padding='SAME',
+          weights_initializer=weight_init(0.09),
+          activation_fn=None,
+          normalizer_fn=None,
+          biases_initializer=None)
+      output = array_ops.identity(output, name='conv_out')
+
+      output = layers.batch_norm(
+          output, center=True, scale=True, decay=1.0 - 0.003, fused=True)
+
+      output = array_ops.identity(output, name='bn_out')
+      if post_activation_bypass:
+        output += input_tensor
+    return output
+
+  def _LayerWithActivationProcessing(self,
+                                     input_tensor=None,
+                                     scope='test',
+                                     post_activation_bypass=False):
+
+    batch_size, height, width, depth = 5, 128, 128, 3
+    if input_tensor is None:
+      input_tensor = array_ops.zeros((batch_size, height, width, depth))
+    weight_init = init_ops.truncated_normal_initializer
+    with ops.name_scope(scope):
+      output = layers.conv2d(
+          input_tensor,
+          depth, [5, 5],
+          padding='SAME',
+          weights_initializer=weight_init(0.09),
+          activation_fn=None,
+          normalizer_fn=None,
+          biases_initializer=None)
+
+      output = layers.batch_norm(
+          output, center=True, scale=True, decay=1.0 - 0.003, fused=True)
+
+      output = nn_ops.relu6(output)
+      scaled_output1 = math_ops.mul(2.0, output)
+      scaled_output2 = math_ops.mul(3.0, output)
+      output = scaled_output1 + scaled_output2
+    return output
+
+  def _AssertInputOpsAre(self, op, in_op_names):
+    """Asserts that all inputs to op come from in_op_names (disregarding order).
+
+    Args:
+      op: Operation to check inputs for.
+      in_op_names: List of strings, operations where all op's inputs should come
+        from.
+    """
+    expected_inputs = [in_op_name + ':0' for in_op_name in in_op_names]
+    self.assertItemsEqual([t.name for t in op.inputs], expected_inputs)
+
+  def _AssertOutputGoesToOps(self, op, graph, out_op_names):
+    """Asserts that outputs from op go to out_op_names (and perhaps others).
+
+    Args:
+      op: Operation to check outputs for.
+      graph: Graph where output operations are located.
+      out_op_names: List of strings, operations where op's outputs should go.
+    """
+    for out_op_name in out_op_names:
+      out_op = graph.get_operation_by_name(out_op_name)
+      self.assertIn(op.outputs[0].name, [str(t.name) for t in out_op.inputs])
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 212d902a3c64791adb50e7b3fa4a487f41b5bfbd..5681a213fe5eafb0814088ed34cc2253767c1d7e 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.quantize.python import quantize
 from tensorflow.python.framework import ops
@@ -26,6 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
@@ -525,6 +527,43 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       self.assertTrue(
           'FakeQuantWithMinMaxVars' in [i.op.type for i in reshape.op.inputs])
 
+  def testSeparableConvWithResourceVar(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      with variable_scope.variable_scope('', use_resource=True):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        kernel_size, depth_multiplier = 3, 1
+        depthwise_shape = [kernel_size, kernel_size, depth, depth_multiplier]
+        depthwise_weights = variables.model_variable(
+            'depthwise_weights', shape=depthwise_shape)
+        strides = [1, 1, 1, 1]
+        with variable_scope.variable_scope('depthwise_conv_1'):
+          conv1 = nn.depthwise_conv2d(
+              input1, depthwise_weights, strides, padding='SAME')
+        with variable_scope.variable_scope('depthwise_conv_2'):
+          conv2 = nn.depthwise_conv2d(
+              conv1, depthwise_weights, strides, padding='SAME')
+          math_ops.add(conv2, input1, name='add')
+
+    quantize.Quantize(graph, True)
+
+    # Test that the weights and activations of all convs have been quantized.
+    quant_node_name = 'FakeQuantWithMinMaxVars'
+    weights_quant = graph.get_operation_by_name(
+        'depthwise_conv_1/weights_quant/' + quant_node_name)
+    self.assertEqual(weights_quant.type, quant_node_name)
+    act_quant = graph.get_operation_by_name('depthwise_conv_1/act_quant/' +
+                                            quant_node_name)
+    self.assertEqual(act_quant.type, quant_node_name)
+
+    weights_quant = graph.get_operation_by_name(
+        'depthwise_conv_2/weights_quant/' + quant_node_name)
+    self.assertEqual(weights_quant.type, quant_node_name)
+    act_quant = graph.get_operation_by_name('depthwise_conv_2/act_quant/' +
+                                            quant_node_name)
+    self.assertEqual(act_quant.type, quant_node_name)
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
diff --git a/tensorflow/contrib/rate/BUILD b/tensorflow/contrib/rate/BUILD
index c461a7145e27c4238161cec989448be807acd543..76db9aecf615d0a94f65cd7ea799db245828db1c 100644
--- a/tensorflow/contrib/rate/BUILD
+++ b/tensorflow/contrib/rate/BUILD
@@ -34,6 +34,11 @@ py_test(
     name = "rate_test",
     size = "small",
     srcs = ["rate_test.py"],
+    tags = [
+        "manual",  # TODO(b/120555555)
+        "no_oss",  # TODO(b/120555555)
+        "notap",  # TODO(b/120555555)
+    ],
     deps = [
         ":rate",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/rate/rate.py b/tensorflow/contrib/rate/rate.py
index 24d586479a61631461e41bda507f95a3c167f754..d948066b36426e73171d5efa4c4ed0d84c9e3341 100644
--- a/tensorflow/contrib/rate/rate.py
+++ b/tensorflow/contrib/rate/rate.py
@@ -108,13 +108,6 @@ class Rate(object):
   def variables(self):
     return self._vars
 
-  def _safe_div(self, numerator, denominator, name):
-    t = math_ops.truediv(numerator, denominator)
-    zero = array_ops.zeros_like(t, dtype=denominator.dtype)
-    condition = math_ops.greater(denominator, zero)
-    zero = math_ops.cast(zero, t.dtype)
-    return array_ops.where(condition, t, zero, name=name)
-
   def _add_variable(self, name, shape=None, dtype=None):
     """Private method for adding variables to the graph."""
     if self._built:
@@ -148,4 +141,6 @@ class Rate(object):
     state_ops.assign(self.prev_values, values)
     state_ops.assign(self.prev_denominator, denominator)
 
-    return self._safe_div(self.numer, self.denom, name="safe_rate")
+    return math_ops.div_no_nan(self.numer,
+                               math_ops.maximum(self.denom, 0),
+                               name="safe_rate")
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
index 2def4f3f176b8d4d26c2c94168e9698f14649d94..edcef3adeaa22a5b5b1da47ee3f17dc04b737199 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.cc
@@ -30,6 +30,11 @@ using thread::ThreadPool;
 
 namespace functor {
 
+#define Sum(a, b) ((a) + (b))
+#define Prod(a, b) ((a) * (b))
+#define Max(a, b) ((a) > (b) ? (a) : (b))
+#define Min(a, b) ((a) < (b) ? (a) : (b))
+
 #define CPUReduceSliceFunctorReduceop(reduceop, beginning)                    \
   template <typename T, typename Index>                                       \
   struct ReduceSliceFunctor##reduceop<CPUDevice, T, Index> {                  \
@@ -234,6 +239,11 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_GPU_REDUCE_SLICE_KERNELS_ALL);
 #undef REGISTER_GPU_REDUCE_SLICE_KERNELS
 #undef REGISTER_GPU_REDUCE_SLICE_KERNELS_ALL
 
+#undef Sum
+#undef Prod
+#undef Min
+#undef Max
+
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
index 69ef521c0120104e23bdb844539282a3bcea3525..12bff1e9161783d897ea1d5bb3327c5d0f92f652 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops.h
@@ -21,11 +21,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 
-#define Sum(a, b) ((a) + (b))
-#define Prod(a, b) ((a) * (b))
-#define Max(a, b) ((a) > (b) ? (a) : (b))
-#define Min(a, b) ((a) < (b) ? (a) : (b))
-
 namespace tensorflow {
 
 class OpKernelContext;
diff --git a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
index 9f2be03d718364058da6b63add8752c046798c5b..204b83f7f5f118f418815edb6c482b1c06673845 100644
--- a/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
+++ b/tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc
@@ -29,6 +29,11 @@ using GPUDevice = Eigen::GpuDevice;
 
 namespace functor {
 
+#define Sum(a, b) ((a) + (b))
+#define Prod(a, b) ((a) * (b))
+#define Max(a, b) ((a) > (b) ? (a) : (b))
+#define Min(a, b) ((a) < (b) ? (a) : (b))
+
 #define GPUReduceSliceFunctorReduceop(reduceop, beginning)                     \
   template <typename T, typename Index>                                        \
   __global__ void ReduceSliceDeviceKernel##reduceop(                           \
@@ -94,6 +99,11 @@ TF_CALL_REAL_NUMBER_TYPES(DEFINE_GPU_SPECS)
 #undef DEFINE_GPU_REDUCEOP_SPECS_INDEX
 #undef DEFINE_GPU_SPECS
 
+#undef Sum
+#undef Prod
+#undef Min
+#undef Max
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/resampler/BUILD b/tensorflow/contrib/resampler/BUILD
index 48345d7030bea431152bbed934af9f500f2c15c5..bbf109967595a73a0fc4bacaf34859b30c2376fc 100644
--- a/tensorflow/contrib/resampler/BUILD
+++ b/tensorflow/contrib/resampler/BUILD
@@ -7,12 +7,13 @@ package(default_visibility = ["//visibility:public"])
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
-    "tf_custom_op_py_library",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
     "tf_kernel_library",
 )
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 
 tf_custom_op_py_library(
     name = "resampler_py",
@@ -20,7 +21,6 @@ tf_custom_op_py_library(
     dso = [":python/ops/_resampler_ops.so"],
     kernels = [
         ":resampler_ops_kernels",
-        ":resampler_ops_op_lib",
     ],
     visibility = ["//visibility:public"],
     deps = [
@@ -40,11 +40,25 @@ tf_custom_op_py_library(
 
 tf_kernel_library(
     name = "resampler_ops_kernels",
+    srcs = [
+        "kernels/resampler_ops.cc",
+        "kernels/resampler_ops.h",
+    ],
+    gpu_srcs = [
+        "kernels/resampler_ops_gpu.cu.cc",
+        "kernels/resampler_ops.h",
+    ],
     prefix = "resampler_ops",
     deps = [
+        ":resampler_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-    ],
+    ] + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla/kernels:resampler_ops",
+        ],
+        "//conditions:default": [],
+    }),
     alwayslink = 1,
 )
 
@@ -85,3 +99,26 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
     ],
 )
+
+tf_xla_py_test(
+    name = "resampler_ops_xla_test",
+    size = "small",
+    srcs = ["xla/resampler_ops_xla_test.py"],
+    disabled_backends = [
+        # TODO(b/74459949) Support BatchDot in CPU backend.
+        "cpu",
+        "cpu_ondemand",
+    ],
+    # TODO(b/112295522): the OSS build will not likely work in the short to medium term, currently it is blocked by the fact that bazel does not allow py_library to depend on cc_library: https://github.com/bazelbuild/bazel/issues/701 which may not be resolvable.
+    tags = ["no_oss"],
+    deps = [
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/compiler/tf2xla/kernels:resampler_ops",
+        "//tensorflow/contrib/resampler:resampler_ops",
+        "//tensorflow/contrib/resampler:resampler_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/resampler/ops/resampler_ops.cc b/tensorflow/contrib/resampler/ops/resampler_ops.cc
index 5ab212032e50ace9545762bebda5679f68fbf77c..f785d4ee5fcd63212882ccf736bfc61c35d68545 100644
--- a/tensorflow/contrib/resampler/ops/resampler_ops.cc
+++ b/tensorflow/contrib/resampler/ops/resampler_ops.cc
@@ -25,7 +25,7 @@ REGISTER_OP("Resampler")
     .Input("data: T")
     .Input("warp: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle data;
       ShapeHandle warp;
@@ -48,7 +48,7 @@ REGISTER_OP("ResamplerGrad")
     .Input("grad_output: T")
     .Output("grad_data: T")
     .Output("grad_warp: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       c->set_output(0, c->input(0));
       c->set_output(1, c->input(1));
diff --git a/tensorflow/contrib/resampler/python/ops/resampler_ops.py b/tensorflow/contrib/resampler/python/ops/resampler_ops.py
index 8b632527f6b1fc08454c77deac181b4f9c4e5f5f..0ee224a47821b0530304aa448eaca0b1b9f59d02 100644
--- a/tensorflow/contrib/resampler/python/ops/resampler_ops.py
+++ b/tensorflow/contrib/resampler/python/ops/resampler_ops.py
@@ -39,7 +39,9 @@ def resampler(data, warp, name="resampler"):
       data_num_channels]` containing 2D data that will be resampled.
     warp: Tensor of minimum rank 2 containing the coordinates at which
       resampling will be performed. Since only bilinear interpolation is
-      currently supported, the last dimension of the `warp` tensor must be 2.
+      currently supported, the last dimension of the `warp` tensor must be 2,
+      representing the (x, y) coordinate where x is the index for width and y is
+      the index for height.
     name: Optional name of the op.
 
   Returns:
diff --git a/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8ca0eab276b39f025d018edebb78eed7a8433bb
--- /dev/null
+++ b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
@@ -0,0 +1,205 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for resampler ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.contrib import resampler
+from tensorflow.contrib.resampler.ops import gen_resampler_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class ResamplerOpsTest(xla_test.XLATestCase):
+
+  def _assertForwardOpMatchesExpected(self, image_np, warp_np, expected):
+    with self.test_session() as sess, self.test_scope():
+      input_image = array_ops.placeholder(image_np.dtype)
+      warp = array_ops.placeholder(warp_np.dtype)
+      resampled = resampler.resampler(input_image, warp, name='resampler')
+      out = sess.run(resampled, {input_image: image_np, warp: warp_np})
+
+      self.assertAllCloseAccordingToType(
+          expected, out, rtol=5e-3, half_rtol=1e-2, bfloat16_rtol=3e-2)
+
+  def _assertBackwardOpMatchesExpected(self, input_np, warp_np, grad_output_np,
+                                       expected_grad_data, expected_grad_warp):
+    with self.cached_session() as sess, self.test_scope():
+      input_image = array_ops.placeholder(input_np.dtype)
+      warp = array_ops.placeholder(warp_np.dtype)
+      grad_output = array_ops.placeholder(grad_output_np.dtype)
+
+      grad_data, grad_warp = gen_resampler_ops.resampler_grad(
+          input_image, warp, grad_output)
+
+      grad_data_tf, grad_warp_tf = sess.run([grad_data, grad_warp], {
+          input_image: input_np,
+          warp: warp_np,
+          grad_output: grad_output_np
+      })
+
+      self.assertAllCloseAccordingToType(
+          expected_grad_warp, grad_warp_tf, half_rtol=1e-2, bfloat16_rtol=3e-2)
+      self.assertAllCloseAccordingToType(
+          expected_grad_data, grad_data_tf, half_rtol=1e-2, bfloat16_rtol=3e-2)
+
+  def testSimple(self):
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [0, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2]
+      warp_data = [0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[26.42]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+      grad_output = np.ones([1, 1], dtype=dtype)
+
+      expected_grad_data = [[[[0.12], [0.27999997]], [[0.18000001],
+                                                      [0.42000002]]]]
+
+      expected_grad_warp = [[26.60000038, 38.20000076]]
+
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
+  def testMultiChannel(self):
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 3]
+      input_rgb_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+      input_np = np.array(input_rgb_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2]
+      warp_data = [0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[59.58000183, 146.94000244, 107.37999725]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+      grad_output = np.ones([1, 3], dtype=dtype)
+
+      expected_grad_data = [[[[0.12, 0.12, 0.12],
+                              [0.27999997, 0.27999997, 0.27999997]],
+                             [[0.18000001, 0.18000001, 0.18000001],
+                              [0.42000002, 0.42000002, 0.42000002]]]]
+
+      expected_grad_warp = [[199, 30]]
+
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
+  def testBatch2Height3byWidth3RGB(self):
+    for dtype in self.float_types:
+      input_shape = [2, 3, 3, 3]
+      input_rgb_data = [
+          0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1, 30, 105, 2, 40, 115,
+          3, 50, 125, 4, 60, 135, 5, 70, 145, 6, 0, 5, 13, 54, 135, 226, 37, 8,
+          234, 90, 255, 1, 30, 105, 2, 40, 115, 3, 50, 125, 4, 60, 135, 5, 70,
+          145, 6
+      ]
+      input_np = np.array(input_rgb_data, dtype=dtype).reshape(input_shape)
+
+      # 2 batches and 2 samples for each batch.
+      warp_shape = [2, 2, 2]
+      warp_data = [0.7, 0.6, 1, 0.7, 0.9, 1.2, 1.3, 1.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+
+      expected_forward = [[[43.92, 128.4, 65.86], [37.2, 114., 69.2]],
+                          [[40.6, 122.8, 2.5], [51., 126, 4.1]]]
+
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected_forward)
+
+      expected_grad_data = [[[[0.12, 0.12, 0.12],
+                              [0.57999998, 0.57999998, 0.57999998],
+                              [0., 0., 0.]],
+                             [[0.18000001, 0.18000001, 0.18000001],
+                              [1.12, 1.12, 1.12], [0., 0., 0.]],
+                             [[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]]],
+                            [[[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]],
+                             [[0.08000001, 0.08000001, 0.08000001],
+                              [0.99999988, 0.99999988, 0.99999988],
+                              [0.11999997, 0.11999997, 0.11999997]],
+                             [[0.02000001, 0.02000001, 0.02000001],
+                              [0.60000008, 0.60000008, 0.60000008],
+                              [0.17999998, 0.17999998, 0.17999998]]]]
+      expected_grad_warp = [[[33.39999008, -96.20000458], [-26.10000229,
+                                                           -278.]],
+                            [[-162.99998474, 39.99999619], [21., 63.]]]
+
+      grad_output = np.ones([2, 2, 3], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
+  def testOutOfBoundWarps(self):
+    # (x, y) are both less than 0.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [-1, -1, 0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [27.62]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+    # One of (x, y) is less than 0.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [-1, 0.1, 0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [27.62]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+    # Both of (x, y) are greater than image size.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [-0.1, 0.1, 1.2, 2.1]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [0.0]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+    # One of (x, y) is greater than image size.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [0.1, -0.1, 1.2, 0.1]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [0.0]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 1385a9ddc1a50a86fe26d30281317489cdf5dfca..e124867415f94fb5052f34f50363ea718d71053b 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -4,10 +4,10 @@
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
-
 package(default_visibility = ["//visibility:public"])
 
+exports_files(["LICENSE"])
+
 load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 load(
@@ -19,10 +19,6 @@ load(
     "tf_kernel_library",
     "tf_gen_op_wrapper_py",
 )
-load(
-    "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_kernel_tests_linkstatic",
-)
 
 cc_library(
     name = "all_ops",
@@ -200,6 +196,7 @@ cuda_py_tests(
     srcs = ["python/kernel_tests/lstm_ops_test.py"],
     additional_deps = [
         ":rnn_py",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -290,7 +287,7 @@ tf_cc_test(
     name = "ops/gru_ops_test",
     size = "small",
     srcs = ["ops/gru_ops_test.cc"],
-    data = [":python/ops/_gru_ops.so"],
+    kernels = [":gru_ops_kernels"],
     tags = ["noasan"],
     # We must ensure that the dependencies can be dynamically linked since
     # the shared library must be able to use core:framework.
@@ -310,7 +307,9 @@ tf_cc_test(
     name = "ops/lstm_ops_test",
     size = "small",
     srcs = ["ops/lstm_ops_test.cc"],
-    data = [":python/ops/_lstm_ops.so"],
+    kernels = [
+        ":lstm_ops_kernels",
+    ],
     tags = ["noasan"],
     # We must ensure that the dependencies can be dynamically linked since
     # the shared library must be able to use core:framework.
@@ -334,16 +333,29 @@ tf_gen_op_libs(
 )
 
 tf_kernel_library(
-    name = "gru_ops_kernels",
+    name = "blas_gemm",
     srcs = [
         "kernels/blas_gemm.cc",
-        "kernels/blas_gemm.h",
     ],
+    hdrs = ["kernels/blas_gemm.h"],
     gpu_srcs = [
         "kernels/blas_gemm.h",
     ],
+    visibility = ["//visibility:private"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels:eigen_helpers",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "gru_ops_kernels",
     prefix = "kernels/gru_ops",
     deps = [
+        ":blas_gemm",
+        ":gru_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels:eigen_helpers",
@@ -353,15 +365,10 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "lstm_ops_kernels",
-    srcs = [
-        "kernels/blas_gemm.cc",
-        "kernels/blas_gemm.h",
-    ],
-    gpu_srcs = [
-        "kernels/blas_gemm.h",
-    ],
     prefix = "kernels/lstm_ops",
     deps = [
+        ":blas_gemm",
+        ":lstm_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels:eigen_helpers",
diff --git a/tensorflow/contrib/rnn/__init__.py b/tensorflow/contrib/rnn/__init__.py
index 026bf08ced33cf0d663cf0940e8bea3f3f2aca28..cbc8af5350276bf3398cf29a24554fd27e0621ee 100644
--- a/tensorflow/contrib/rnn/__init__.py
+++ b/tensorflow/contrib/rnn/__init__.py
@@ -14,8 +14,6 @@
 # ==============================================================================
 """RNN Cells and additional RNN operations.
 
-See [Contrib RNN](https://tensorflow.org/api_guides/python/contrib.rnn) guide.
-
 <!--From core-->
 @@RNNCell
 @@LayerRNNCell
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.cc b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
index 45d22b739b8c597c7ebda85968aa44cd599a798c..56ec86418de51d7aa5f02e75b17fa88a91d5a2a9 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.cc
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.cc
@@ -38,8 +38,9 @@ namespace functor {
 template <typename T>
 void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx, bool transa,
                                      bool transb, uint64 m, uint64 n, uint64 k,
-                                     T alpha, const T* a, int lda, const T* b,
-                                     int ldb, T beta, T* c, int ldc) {
+                                     float alpha, const T* a, int lda,
+                                     const T* b, int ldb, float beta, T* c,
+                                     int ldc) {
 #if GOOGLE_CUDA
   se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose,
                                  se::blas::Transpose::kTranspose};
@@ -60,8 +61,8 @@ void TensorCuBlasGemm<T>::operator()(OpKernelContext* ctx, bool transa,
 #endif
 }
 
+template struct TensorCuBlasGemm<Eigen::half>;
 template struct TensorCuBlasGemm<float>;
-template struct TensorCuBlasGemm<double>;
 
 }  // end namespace functor
 }  // end namespace tensorflow
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.h b/tensorflow/contrib/rnn/kernels/blas_gemm.h
index a52c934233af3dc63e1a60d70fac6a9eba6a655b..d37210d4b81203287fb633adc309688a35d093bb 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.h
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.h
@@ -28,8 +28,18 @@ namespace functor {
 template <typename T>
 struct TensorCuBlasGemm {
   void operator()(OpKernelContext* ctx, bool transa, bool transb, uint64 m,
-                  uint64 n, uint64 k, T alpha, const T* a, int lda, const T* b,
-                  int ldb, T beta, T* c, int ldc);
+                  uint64 n, uint64 k, float alpha, const T* a, int lda,
+                  const T* b, int ldb, float beta, T* c, int ldc);
+};
+
+template <typename T>
+struct gemm_compute_type {
+  typedef T type;
+};
+
+template <>
+struct gemm_compute_type<Eigen::half> {
+  typedef float type;
 };
 
 template <typename Device, typename T, bool USE_CUBLAS>
@@ -38,8 +48,10 @@ struct TensorBlasGemm;
 template <typename Device, typename T>
 struct TensorBlasGemm<Device, T, true /* USE_CUBLAS */> {
   static void compute(OpKernelContext* ctx, const Device& d, bool transa,
-                      bool transb, T alpha, typename TTypes<T>::ConstMatrix a,
-                      typename TTypes<T>::ConstMatrix b, T beta,
+                      bool transb, typename gemm_compute_type<T>::type alpha,
+                      typename TTypes<T>::ConstMatrix a,
+                      typename TTypes<T>::ConstMatrix b,
+                      typename gemm_compute_type<T>::type beta,
                       typename TTypes<T>::Matrix c) {
     int64 m = c.dimensions()[0];
     int64 n = c.dimensions()[1];
@@ -54,19 +66,23 @@ struct TensorBlasGemm<Device, T, true /* USE_CUBLAS */> {
 template <typename Device, typename T>
 struct TensorBlasGemm<Device, T, false /* USE_CUBLAS */> {
   static void compute(OpKernelContext* ctx, const Device& d, bool transa,
-                      bool transb, T alpha, typename TTypes<T>::ConstMatrix a,
-                      typename TTypes<T>::ConstMatrix b, T beta,
+                      bool transb, typename gemm_compute_type<T>::type alpha,
+                      typename TTypes<T>::ConstMatrix a,
+                      typename TTypes<T>::ConstMatrix b,
+                      typename gemm_compute_type<T>::type beta,
                       typename TTypes<T>::Matrix c) {
     Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> contract_pairs;
     contract_pairs[0] =
         Eigen::IndexPair<Eigen::DenseIndex>(transa == false, transb == true);
-    if (alpha == T(1) && beta == T(0)) {
+    if (alpha == typename gemm_compute_type<T>::type(1.f) &&
+        beta == typename gemm_compute_type<T>::type(0.f)) {
       c.device(d) = a.contract(b, contract_pairs);
-    } else if (alpha == T(1) && beta == T(1)) {
+    } else if (alpha == typename gemm_compute_type<T>::type(1.f) &&
+               beta == typename gemm_compute_type<T>::type(1.f)) {
       c.device(d) += a.contract(b, contract_pairs);
     } else {
-      c.device(d) = c.constant(alpha) * a.contract(b, contract_pairs) +
-                    c.constant(beta) * c;
+      c.device(d) = c.constant(T(alpha)) * a.contract(b, contract_pairs) +
+                    c.constant(T(beta)) * c;
     }
   }
 };
diff --git a/tensorflow/contrib/rnn/kernels/gru_ops.h b/tensorflow/contrib/rnn/kernels/gru_ops.h
index 3e2cb39e64bb3f0b22ea66c5601af36c5fb9b0fd..38be58fa104f8b30e4aede6d18330960fc30dcb5 100644
--- a/tensorflow/contrib/rnn/kernels/gru_ops.h
+++ b/tensorflow/contrib/rnn/kernels/gru_ops.h
@@ -88,7 +88,9 @@ struct GRUBlockCellFprop : public GRUCell {
     typename TTypes<T>::ConstMatrix const_x_h_prev(x_h_prev.data(),
                                                    x_h_prev.dimensions());
     TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
-        ctx, d, false, false, T(1), const_x_h_prev, w_ru, T(0), r_u_bar);
+        ctx, d, false, false, typename gemm_compute_type<T>::type(1.f),
+        const_x_h_prev, w_ru, typename gemm_compute_type<T>::type(0.f),
+        r_u_bar);
 
     // Creating a bias matrix for adding by broadcasting 'b_ru'
     Eigen::array<Eigen::DenseIndex, 2> broadcast_shape({batch_size_, 1});
@@ -107,7 +109,8 @@ struct GRUBlockCellFprop : public GRUCell {
     typename TTypes<T>::ConstMatrix const_x_h_prevr(x_h_prevr.data(),
                                                     x_h_prevr.dimensions());
     TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
-        ctx, d, false, false, T(1), const_x_h_prevr, w_c, T(0), c);
+        ctx, d, false, false, typename gemm_compute_type<T>::type(1.f),
+        const_x_h_prevr, w_c, typename gemm_compute_type<T>::type(0.f), c);
 
     Eigen::array<Eigen::DenseIndex, 2> b_c_shape({1, b_c.dimensions()[0]});
     c.device(d) += (b_c.reshape(b_c_shape).broadcast(broadcast_shape));
@@ -148,9 +151,10 @@ struct GRUBlockCellBprop : public GRUCell {
     // [2nd_component_of_d_x d_h_prevr] = d_c_bar X w_c^T
     typename TTypes<T>::ConstMatrix const_d_c_bar(d_c_bar.data(),
                                                   d_c_bar.dimensions());
-    TensorBlasGemm<Device, T, USE_CUBLAS>::compute(ctx, d, false, true, T(1),
-                                                   const_d_c_bar, w_c, T(0),
-                                                   d_x_comp2_and_h_prevr);
+    TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
+        ctx, d, false, true, typename gemm_compute_type<T>::type(1.f),
+        const_d_c_bar, w_c, typename gemm_compute_type<T>::type(0.f),
+        d_x_comp2_and_h_prevr);
 
     d_hr.device(d) = d_x_comp2_and_h_prevr.slice(h_offsets(), h_extends());
     d_r_bar.device(d) = (d_hr * h_prev * r) * (r.constant(T(1)) - r);
@@ -164,7 +168,8 @@ struct GRUBlockCellBprop : public GRUCell {
     typename TTypes<T>::ConstMatrix const_d_r_bar_u_bar(
         d_r_bar_u_bar.data(), d_r_bar_u_bar.dimensions());
     TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
-        ctx, d, false, true, T(1), const_d_r_bar_u_bar, w_ru, T(0),
+        ctx, d, false, true, typename gemm_compute_type<T>::type(1.f),
+        const_d_r_bar_u_bar, w_ru, typename gemm_compute_type<T>::type(0.f),
         d_x_comp1_and_h_prev_comp1);
 
     // d_x = d_x_comp1 + d_x_comp2
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.cc b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
index 5e7cf0ce84d332bd24088cd78995f7843813328b..d369bc12ae88dafb4e3ca0095a08bcc3ee09bf70 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.cc
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.cc
@@ -44,7 +44,7 @@ namespace functor {
 template <typename T>
 void LSTMBlockCellFpropWithEigen(
     const LSTMBlockCell& cell, OpKernelContext* ctx, const CPUDevice& d,
-    const T forget_bias, const T cell_clip, bool use_peephole,
+    const float forget_bias, const float cell_clip, bool use_peephole,
     typename TTypes<T>::ConstMatrix x, typename TTypes<T>::ConstMatrix cs_prev,
     typename TTypes<T>::ConstMatrix h_prev, typename TTypes<T>::ConstMatrix w,
     typename TTypes<T>::ConstVec wci, typename TTypes<T>::ConstVec wcf,
@@ -61,7 +61,8 @@ void LSTMBlockCellFpropWithEigen(
   // states1 = xh * w + b
   typename TTypes<T>::ConstMatrix const_xh(xh.data(), xh.dimensions());
   TensorBlasGemm<CPUDevice, T, false /* USE_CUBLAS */>::compute(
-      ctx, d, false, false, T(1), const_xh, w, T(0), icfo);
+      ctx, d, false, false, typename gemm_compute_type<T>::type(1.f), const_xh,
+      w, typename gemm_compute_type<T>::type(0.f), icfo);
   Eigen::array<Eigen::DenseIndex, 2> b_shape({1, b.dimensions()[0]});
   Eigen::array<Eigen::DenseIndex, 2> broadcast_shape({cell.batch_size(), 1});
   icfo.device(d) += b.reshape(b_shape).broadcast(broadcast_shape);
@@ -87,11 +88,11 @@ void LSTMBlockCellFpropWithEigen(
   if (use_peephole) {
     auto f_peep = cs_prev * wcf.reshape(p_shape).broadcast(p_broadcast_shape);
     f.device(d) = (icfo.slice(cell.icfo_f_offsets(), cell.cell_extents()) +
-                   f.constant(forget_bias) + f_peep)
+                   f.constant(T(forget_bias)) + f_peep)
                       .sigmoid();
   } else {
     f.device(d) = (icfo.slice(cell.icfo_f_offsets(), cell.cell_extents()) +
-                   f.constant(forget_bias))
+                   f.constant(T(forget_bias)))
                       .sigmoid();
   }
 
@@ -100,7 +101,7 @@ void LSTMBlockCellFpropWithEigen(
 
   if (cell_clip > 0.0f) {
     cs.device(d) =
-        cs.binaryExpr(cs.constant(cell_clip), Eigen::scalar_clip_op<T>());
+        cs.binaryExpr(cs.constant(T(cell_clip)), Eigen::scalar_clip_op<T>());
   }
 
   // co = tanh(cs)
@@ -177,53 +178,55 @@ void LSTMBlockCellBpropWithEigen(
   }
 }
 
-#define DEFINE_CPU_SPECS(T)                                                    \
-  template <>                                                                  \
-  void LSTMBlockCellFprop<CPUDevice, T, false /* USE_CUBLAS */>::operator()(   \
-      OpKernelContext* ctx, const CPUDevice& d, const T forget_bias,           \
-      const T cell_clip, bool use_peephole, typename TTypes<T>::ConstMatrix x, \
-      typename TTypes<T>::ConstMatrix cs_prev,                                 \
-      typename TTypes<T>::ConstMatrix h_prev,                                  \
-      typename TTypes<T>::ConstMatrix w, typename TTypes<T>::ConstVec wci,     \
-      typename TTypes<T>::ConstVec wcf, typename TTypes<T>::ConstVec wco,      \
-      typename TTypes<T>::ConstVec b, typename TTypes<T>::Matrix xh,           \
-      typename TTypes<T>::Matrix i, typename TTypes<T>::Matrix cs,             \
-      typename TTypes<T>::Matrix f, typename TTypes<T>::Matrix o,              \
-      typename TTypes<T>::Matrix ci, typename TTypes<T>::Matrix co,            \
-      typename TTypes<T>::Matrix icfo, typename TTypes<T>::Matrix h) {         \
-    LSTMBlockCellFpropWithEigen<T>(                                            \
-        *this, ctx, d, forget_bias, cell_clip, use_peephole, x, cs_prev,       \
-        h_prev, w, wci, wcf, wco, b, xh, i, cs, f, o, ci, co, icfo, h);        \
-  }                                                                            \
-  template <>                                                                  \
-  void LSTMBlockCellBprop<CPUDevice, T, false /* USE_CUBLAS */>::operator()(   \
-      OpKernelContext* ctx, const CPUDevice& d, bool use_peephole,             \
-      typename TTypes<T>::ConstMatrix x,                                       \
-      typename TTypes<T>::ConstMatrix cs_prev,                                 \
-      typename TTypes<T>::ConstMatrix h_prev,                                  \
-      typename TTypes<T>::ConstMatrix w, typename TTypes<T>::ConstVec wci,     \
-      typename TTypes<T>::ConstVec wcf, typename TTypes<T>::ConstVec wco,      \
-      typename TTypes<T>::ConstVec b, typename TTypes<T>::ConstMatrix i,       \
-      typename TTypes<T>::ConstMatrix cs, typename TTypes<T>::ConstMatrix f,   \
-      typename TTypes<T>::ConstMatrix o, typename TTypes<T>::ConstMatrix ci,   \
-      typename TTypes<T>::ConstMatrix co,                                      \
-      typename TTypes<T>::ConstMatrix cs_grad,                                 \
-      typename TTypes<T>::ConstMatrix h_grad, typename TTypes<T>::Matrix do_,  \
-      typename TTypes<T>::Matrix dcs, typename TTypes<T>::Matrix dci,          \
-      typename TTypes<T>::Matrix df, typename TTypes<T>::Matrix di,            \
-      typename TTypes<T>::Matrix dicfo,                                        \
-      typename TTypes<T>::Matrix cs_prev_grad,                                 \
-      typename TTypes<T>::Vec wci_grad, typename TTypes<T>::Vec wcf_grad,      \
-      typename TTypes<T>::Vec wco_grad) {                                      \
-    LSTMBlockCellBpropWithEigen<CPUDevice, T, false /* USE_CUBLAS */>(         \
-        *this, ctx, d, use_peephole, x, cs_prev, h_prev, w, wci, wcf, wco, b,  \
-        i, cs, f, o, ci, co, cs_grad, h_grad, do_, dcs, dci, df, di, dicfo,    \
-        cs_prev_grad, wci_grad, wcf_grad, wco_grad);                           \
-  }                                                                            \
-  template struct LSTMBlockCellFprop<CPUDevice, T, false /* USE_CUBLAS */>;    \
+#define DEFINE_CPU_SPECS(T)                                                   \
+  template <>                                                                 \
+  void LSTMBlockCellFprop<CPUDevice, T, false /* USE_CUBLAS */>::operator()(  \
+      OpKernelContext* ctx, const CPUDevice& d, const float forget_bias,      \
+      const float cell_clip, bool use_peephole,                               \
+      typename TTypes<T>::ConstMatrix x,                                      \
+      typename TTypes<T>::ConstMatrix cs_prev,                                \
+      typename TTypes<T>::ConstMatrix h_prev,                                 \
+      typename TTypes<T>::ConstMatrix w, typename TTypes<T>::ConstVec wci,    \
+      typename TTypes<T>::ConstVec wcf, typename TTypes<T>::ConstVec wco,     \
+      typename TTypes<T>::ConstVec b, typename TTypes<T>::Matrix xh,          \
+      typename TTypes<T>::Matrix i, typename TTypes<T>::Matrix cs,            \
+      typename TTypes<T>::Matrix f, typename TTypes<T>::Matrix o,             \
+      typename TTypes<T>::Matrix ci, typename TTypes<T>::Matrix co,           \
+      typename TTypes<T>::Matrix icfo, typename TTypes<T>::Matrix h) {        \
+    LSTMBlockCellFpropWithEigen<T>(                                           \
+        *this, ctx, d, forget_bias, cell_clip, use_peephole, x, cs_prev,      \
+        h_prev, w, wci, wcf, wco, b, xh, i, cs, f, o, ci, co, icfo, h);       \
+  }                                                                           \
+  template <>                                                                 \
+  void LSTMBlockCellBprop<CPUDevice, T, false /* USE_CUBLAS */>::operator()(  \
+      OpKernelContext* ctx, const CPUDevice& d, bool use_peephole,            \
+      typename TTypes<T>::ConstMatrix x,                                      \
+      typename TTypes<T>::ConstMatrix cs_prev,                                \
+      typename TTypes<T>::ConstMatrix h_prev,                                 \
+      typename TTypes<T>::ConstMatrix w, typename TTypes<T>::ConstVec wci,    \
+      typename TTypes<T>::ConstVec wcf, typename TTypes<T>::ConstVec wco,     \
+      typename TTypes<T>::ConstVec b, typename TTypes<T>::ConstMatrix i,      \
+      typename TTypes<T>::ConstMatrix cs, typename TTypes<T>::ConstMatrix f,  \
+      typename TTypes<T>::ConstMatrix o, typename TTypes<T>::ConstMatrix ci,  \
+      typename TTypes<T>::ConstMatrix co,                                     \
+      typename TTypes<T>::ConstMatrix cs_grad,                                \
+      typename TTypes<T>::ConstMatrix h_grad, typename TTypes<T>::Matrix do_, \
+      typename TTypes<T>::Matrix dcs, typename TTypes<T>::Matrix dci,         \
+      typename TTypes<T>::Matrix df, typename TTypes<T>::Matrix di,           \
+      typename TTypes<T>::Matrix dicfo,                                       \
+      typename TTypes<T>::Matrix cs_prev_grad,                                \
+      typename TTypes<T>::Vec wci_grad, typename TTypes<T>::Vec wcf_grad,     \
+      typename TTypes<T>::Vec wco_grad) {                                     \
+    LSTMBlockCellBpropWithEigen<CPUDevice, T, false /* USE_CUBLAS */>(        \
+        *this, ctx, d, use_peephole, x, cs_prev, h_prev, w, wci, wcf, wco, b, \
+        i, cs, f, o, ci, co, cs_grad, h_grad, do_, dcs, dci, df, di, dicfo,   \
+        cs_prev_grad, wci_grad, wcf_grad, wco_grad);                          \
+  }                                                                           \
+  template struct LSTMBlockCellFprop<CPUDevice, T, false /* USE_CUBLAS */>;   \
   template struct LSTMBlockCellBprop<CPUDevice, T, false /* USE_CUBLAS */>;
 
 DEFINE_CPU_SPECS(float);
+DEFINE_CPU_SPECS(Eigen::half);
 #undef DEFINE_CPU_SPECS
 
 }  // namespace functor
@@ -372,30 +375,31 @@ class LSTMBlockCellOp : public OpKernel {
       Name("LSTMBlockCell").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       LSTMBlockCellOp<CPUDevice, T, false>);
 REGISTER_KERNEL(float);
-// REGISTER_KERNEL(double);
+REGISTER_KERNEL(Eigen::half);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                                    \
-  template <>                                                                  \
-  void LSTMBlockCellFprop<GPUDevice, T, true>::operator()(                     \
-      OpKernelContext* ctx, const GPUDevice& d, const T forget_bias,           \
-      const T cell_clip, bool use_peephole, typename TTypes<T>::ConstMatrix x, \
-      typename TTypes<T>::ConstMatrix cs_prev,                                 \
-      typename TTypes<T>::ConstMatrix h_prev,                                  \
-      typename TTypes<T>::ConstMatrix w, typename TTypes<T>::ConstVec wci,     \
-      typename TTypes<T>::ConstVec wcf, typename TTypes<T>::ConstVec wco,      \
-      typename TTypes<T>::ConstVec b, typename TTypes<T>::Matrix xh,           \
-      typename TTypes<T>::Matrix i, typename TTypes<T>::Matrix cs,             \
-      typename TTypes<T>::Matrix f, typename TTypes<T>::Matrix o,              \
-      typename TTypes<T>::Matrix ci, typename TTypes<T>::Matrix co,            \
-      typename TTypes<T>::Matrix icfo, typename TTypes<T>::Matrix h);          \
-                                                                               \
+#define DECLARE_GPU_SPEC(T)                                                \
+  template <>                                                              \
+  void LSTMBlockCellFprop<GPUDevice, T, true>::operator()(                 \
+      OpKernelContext* ctx, const GPUDevice& d, const float forget_bias,   \
+      const float cell_clip, bool use_peephole,                            \
+      typename TTypes<T>::ConstMatrix x,                                   \
+      typename TTypes<T>::ConstMatrix cs_prev,                             \
+      typename TTypes<T>::ConstMatrix h_prev,                              \
+      typename TTypes<T>::ConstMatrix w, typename TTypes<T>::ConstVec wci, \
+      typename TTypes<T>::ConstVec wcf, typename TTypes<T>::ConstVec wco,  \
+      typename TTypes<T>::ConstVec b, typename TTypes<T>::Matrix xh,       \
+      typename TTypes<T>::Matrix i, typename TTypes<T>::Matrix cs,         \
+      typename TTypes<T>::Matrix f, typename TTypes<T>::Matrix o,          \
+      typename TTypes<T>::Matrix ci, typename TTypes<T>::Matrix co,        \
+      typename TTypes<T>::Matrix icfo, typename TTypes<T>::Matrix h);      \
+                                                                           \
   extern template struct LSTMBlockCellFprop<GPUDevice, T, true>;
 
 DECLARE_GPU_SPEC(float);
-// DECLARE_GPU_SPEC(double);
+DECLARE_GPU_SPEC(Eigen::half);
 #undef DECLARE_GPU_SPEC
 }  // end namespace functor
 
@@ -405,6 +409,7 @@ DECLARE_GPU_SPEC(float);
       LSTMBlockCellOp<GPUDevice, T, true>);
 
 REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(Eigen::half);
 // REGISTER_GPU_KERNEL(double);
 #undef REGISTER_GPU_KERNEL
 #endif  // GOOGLE_CUDA
@@ -629,9 +634,9 @@ class LSTMBlockCellGradOp : public OpKernel {
 
     const Device& device = ctx->eigen_device<Device>();
 
-    functor::TensorZero<Device, T>()(device, wci_grad_tensor->flat<float>());
-    functor::TensorZero<Device, T>()(device, wcf_grad_tensor->flat<float>());
-    functor::TensorZero<Device, T>()(device, wco_grad_tensor->flat<float>());
+    functor::TensorZero<Device, T>()(device, wci_grad_tensor->flat<T>());
+    functor::TensorZero<Device, T>()(device, wcf_grad_tensor->flat<T>());
+    functor::TensorZero<Device, T>()(device, wco_grad_tensor->flat<T>());
 
     functor::LSTMBlockCellBprop<Device, T, USE_CUBLAS>(batch_size, input_size,
                                                        cell_size)(
@@ -657,7 +662,7 @@ class LSTMBlockCellGradOp : public OpKernel {
       Name("LSTMBlockCellGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       LSTMBlockCellGradOp<CPUDevice, T, false>);
 REGISTER_KERNEL(float);
-// REGISTER_KERNEL(double);
+REGISTER_KERNEL(Eigen::half);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -688,6 +693,7 @@ namespace functor {
                                             true /* USE_CUBLAS */>;
 
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(Eigen::half);
 // DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
@@ -698,6 +704,7 @@ DECLARE_GPU_SPEC(float);
       LSTMBlockCellGradOp<GPUDevice, T, true>);
 
 REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(Eigen::half);
 // REGISTER_GPU_KERNEL(double);
 #undef REGISTER_GPU_KERNEL
 #endif  // GOOGLE_CUDA
@@ -984,10 +991,10 @@ class BlockLSTMOp : public OpKernel {
       Tensor cs_tensor = cs_out->Slice(seq_len_max, timelen);
       Tensor h_tensor = h_out->Slice(seq_len_max, timelen);
 
-      functor::TensorUnalignedZero<Device, T>()(
-          device, cs_tensor.unaligned_flat<float>());
-      functor::TensorUnalignedZero<Device, T>()(
-          device, h_tensor.unaligned_flat<float>());
+      functor::TensorUnalignedZero<Device, T>()(device,
+                                                cs_tensor.unaligned_flat<T>());
+      functor::TensorUnalignedZero<Device, T>()(device,
+                                                h_tensor.unaligned_flat<T>());
     }
   }
 
@@ -1002,7 +1009,7 @@ class BlockLSTMOp : public OpKernel {
       Name("BlockLSTM").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       BlockLSTMOp<CPUDevice, T, false>);
 REGISTER_KERNEL(float);
-// REGISTER_KERNEL(double);
+REGISTER_KERNEL(Eigen::half);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -1021,6 +1028,7 @@ namespace functor {
   extern template struct TensorUnalignedZero<GPUDevice, T>;
 
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(Eigen::half);
 // DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // end namespace functor
@@ -1033,6 +1041,7 @@ DECLARE_GPU_SPEC(float);
                           BlockLSTMOp<GPUDevice, T, true>);
 
 REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(Eigen::half);
 // REGISTER_GPU_KERNEL(double);
 #undef REGISTER_GPU_KERNEL
 #endif  // GOOGLE_CUDA
@@ -1195,16 +1204,15 @@ class BlockLSTMGradOp : public OpKernel {
 
     const Device& device = ctx->eigen_device<Device>();
 
-    functor::TensorZero<Device, T>()(device, cs_grad_tensor.flat<float>());
-    functor::TensorZero<Device, T>()(device,
-                                     cs_prev_grad_tensor->flat<float>());
-    functor::TensorZero<Device, T>()(device, h_grad_tensor.flat<float>());
-    functor::TensorZero<Device, T>()(device, h_prev_grad_tensor->flat<float>());
-    functor::TensorZero<Device, T>()(device, w_grad_tensor->flat<float>());
-    functor::TensorZero<Device, T>()(device, wci_grad_tensor->flat<float>());
-    functor::TensorZero<Device, T>()(device, wcf_grad_tensor->flat<float>());
-    functor::TensorZero<Device, T>()(device, wco_grad_tensor->flat<float>());
-    functor::TensorZero<Device, T>()(device, b_grad_tensor->flat<float>());
+    functor::TensorZero<Device, T>()(device, cs_grad_tensor.flat<T>());
+    functor::TensorZero<Device, T>()(device, cs_prev_grad_tensor->flat<T>());
+    functor::TensorZero<Device, T>()(device, h_grad_tensor.flat<T>());
+    functor::TensorZero<Device, T>()(device, h_prev_grad_tensor->flat<T>());
+    functor::TensorZero<Device, T>()(device, w_grad_tensor->flat<T>());
+    functor::TensorZero<Device, T>()(device, wci_grad_tensor->flat<T>());
+    functor::TensorZero<Device, T>()(device, wcf_grad_tensor->flat<T>());
+    functor::TensorZero<Device, T>()(device, wco_grad_tensor->flat<T>());
+    functor::TensorZero<Device, T>()(device, b_grad_tensor->flat<T>());
 
     const int64 seq_len_max = seq_len_max_tensor->scalar<int64>()();
     SliceHelper<Device, T> slicer(ctx);
@@ -1276,7 +1284,7 @@ class BlockLSTMGradOp : public OpKernel {
       Name("BlockLSTMGrad").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       BlockLSTMGradOp<CPUDevice, T, false>);
 REGISTER_KERNEL(float);
-// REGISTER_KERNEL(double);
+REGISTER_KERNEL(Eigen::half);
 #undef REGISTER_KERNEL
 
 #if GOOGLE_CUDA
@@ -1331,6 +1339,7 @@ namespace functor {
   extern template struct BlockLSTMBprop<GPUDevice, T, true>;
 
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(Eigen::half);
 // DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // end namespace functor
@@ -1343,6 +1352,7 @@ DECLARE_GPU_SPEC(float);
                           BlockLSTMGradOp<GPUDevice, T, true>);
 
 REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(Eigen::half);
 // REGISTER_GPU_KERNEL(double);
 #undef REGISTER_GPU_KERNEL
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops.h b/tensorflow/contrib/rnn/kernels/lstm_ops.h
index d23cedc234b8c0e1a784346f28164ae79b8cbf89..5ca1dad6552cff857d33232d8197fe069036841a 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops.h
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops.h
@@ -77,8 +77,7 @@ template <typename Device, typename T>
 struct TensorZeroPadding {
   void operator()(const Device& d, const int64 time_idx,
                   typename TTypes<int64>::ConstVec seq_len,
-                  typename TTypes<float>::Vec mask,
-                  typename TTypes<float>::Matrix m) {
+                  typename TTypes<T>::Vec mask, typename TTypes<T>::Matrix m) {
     // mask is shape [batch_size].
     mask.device(d) = seq_len.constant(time_idx) < seq_len;
 
@@ -154,18 +153,21 @@ struct LSTMBlockCellFprop : public LSTMBlockCell {
                      const int cell_size)
       : LSTMBlockCell(batch_size, input_size, cell_size) {}
 
-  void operator()(
-      OpKernelContext* ctx, const Device& d, const T forget_bias,
-      const T cell_clip, bool use_peephole, typename TTypes<T>::ConstMatrix x,
-      typename TTypes<T>::ConstMatrix cs_prev,
-      typename TTypes<T>::ConstMatrix h_prev, typename TTypes<T>::ConstMatrix w,
-      typename TTypes<T>::ConstVec wci, typename TTypes<T>::ConstVec wcf,
-      typename TTypes<T>::ConstVec wco, typename TTypes<T>::ConstVec b,
-      typename TTypes<T>::Matrix xh, typename TTypes<T>::Matrix i,
-      typename TTypes<T>::Matrix cs, typename TTypes<T>::Matrix f,
-      typename TTypes<T>::Matrix o, typename TTypes<T>::Matrix ci,
-      typename TTypes<T>::Matrix co, typename TTypes<T>::Matrix icfo,
-      typename TTypes<T>::Matrix h);
+  void operator()(OpKernelContext* ctx, const Device& d,
+                  const float forget_bias, const float cell_clip,
+                  bool use_peephole, typename TTypes<T>::ConstMatrix x,
+                  typename TTypes<T>::ConstMatrix cs_prev,
+                  typename TTypes<T>::ConstMatrix h_prev,
+                  typename TTypes<T>::ConstMatrix w,
+                  typename TTypes<T>::ConstVec wci,
+                  typename TTypes<T>::ConstVec wcf,
+                  typename TTypes<T>::ConstVec wco,
+                  typename TTypes<T>::ConstVec b, typename TTypes<T>::Matrix xh,
+                  typename TTypes<T>::Matrix i, typename TTypes<T>::Matrix cs,
+                  typename TTypes<T>::Matrix f, typename TTypes<T>::Matrix o,
+                  typename TTypes<T>::Matrix ci, typename TTypes<T>::Matrix co,
+                  typename TTypes<T>::Matrix icfo,
+                  typename TTypes<T>::Matrix h);
 };
 
 // See lstm_ops.cc for CPUDevice implementation and lstm_ops_gpu.cu.cc for
@@ -261,7 +263,7 @@ struct BlockLSTMBprop : public LSTMBlockCell {
     typename TTypes<T>::ConstMatrix const_dicfo(dicfo.data(),
                                                 dicfo.dimensions());
     TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
-        ctx, d, false, true, T(1), const_dicfo, w, T(0), xh_grad);
+        ctx, d, false, true, 1.f, const_dicfo, w, 0.f, xh_grad);
 
     // xh.
     xh.slice(xh_x_offsets(), xh_x_extents()).device(d) = x;
@@ -274,7 +276,7 @@ struct BlockLSTMBprop : public LSTMBlockCell {
 
     // w_grad.
     TensorBlasGemm<Device, T, USE_CUBLAS>::compute(
-        ctx, d, true, false, T(1), const_xh, const_dicfo, T(1), w_grad);
+        ctx, d, true, false, 1.f, const_xh, const_dicfo, 1.f, w_grad);
 
     // b_grad.
     b_grad.device(d) += dicfo.sum(Eigen::array<int, 1>({0}));
diff --git a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc
index 6d3758fef15e7130b740a377d8bcd41d31203299..15ae95f13cffa5d1469d737b23f2a83b9e5a694f 100644
--- a/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc
+++ b/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc
@@ -31,6 +31,49 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace {
 
+struct FloatToHalf {
+  __host__ __device__ EIGEN_STRONG_INLINE Eigen::half operator()(
+      const float& x) const {
+    return Eigen::half_impl::float_to_half_rtne(x);
+  }
+};
+
+template <typename U, typename T>
+__host__ __device__ EIGEN_STRONG_INLINE
+    typename std::enable_if<!std::is_same<T, U>::value, U>::type
+    strict_cast(T t);
+
+template <typename U, typename T>
+__host__ __device__ EIGEN_STRONG_INLINE
+    typename std::enable_if<std::is_same<T, U>::value, U>::type
+    strict_cast(T t) {
+  return t;
+}
+
+template <>
+__host__ __device__ EIGEN_STRONG_INLINE Eigen::half
+strict_cast<Eigen::half, float>(float t) {
+  return FloatToHalf()(t);
+}
+
+}  // namespace
+
+template <typename T>
+struct TensorZero<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat t) {
+    t.device(d) = t.constant(strict_cast<T>(0.f));
+  }
+};
+
+template <typename T>
+struct TensorUnalignedZero<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::UnalignedFlat t) {
+    t.device(d) = t.constant(strict_cast<T>(0.f));
+  }
+};
+
+namespace {
+
 // Adds bias, applies non-linearities and gates.
 //
 // Launch with a 2D setup such that there is one thread per (example,
@@ -42,12 +85,15 @@ namespace {
 template <typename T, bool use_peephole>
 __global__ void lstm_gates(const T* icfo, const T* b, const T* cs_prev,
                            const T* wci, const T* wcf, const T* wco, T* o, T* h,
-                           T* ci, T* cs, T* co, T* i, T* f, const T forget_bias,
-                           const T cell_clip, const int batch_size,
-                           const int cell_size) {
+                           T* ci, T* cs, T* co, T* i, T* f,
+                           const float forget_bias, const float cell_clip,
+                           const int batch_size, const int cell_size) {
   const int batch_id = blockIdx.x * blockDim.x + threadIdx.x;
   const int act_id = blockIdx.y * blockDim.y + threadIdx.y;
 
+  T forget_bias_t = strict_cast<T>(forget_bias);
+  T cell_clip_t = strict_cast<T>(cell_clip);
+
   if (batch_id >= batch_size || act_id >= cell_size) return;
 
   // The following code assumes the input arrays are of the following
@@ -95,7 +141,7 @@ __global__ void lstm_gates(const T* icfo, const T* b, const T* cs_prev,
   //
   const int gid = batch_id * cell_size * 4 + act_id;
   const int cid = batch_id * cell_size + act_id;
-  Eigen::internal::scalar_sigmoid_op<T> sigmoid_op;
+  Eigen::internal::scalar_logistic_op<T> sigmoid_op;
   Eigen::internal::scalar_tanh_op<T> tanh_op;
   Eigen::scalar_clip_op<T> clip_op;
 
@@ -115,16 +161,16 @@ __global__ void lstm_gates(const T* icfo, const T* b, const T* cs_prev,
   T f_local;
   if (use_peephole) {
     f_local = sigmoid_op(icfo[2 * cell_size + gid] + b[2 * cell_size + act_id] +
-                         forget_bias + cs_prev[cid] * wcf[act_id]);
+                         forget_bias_t + cs_prev[cid] * wcf[act_id]);
   } else {
     f_local = sigmoid_op(icfo[2 * cell_size + gid] + b[2 * cell_size + act_id] +
-                         forget_bias);
+                         forget_bias_t);
   }
   f[cid] = f_local;
 
   T cs_local = i_local * ci_local + f_local * cs_prev[cid];
-  if (cell_clip > 0.0) {
-    cs_local = clip_op(cs_local, cell_clip);
+  if (cell_clip > 0.0f) {
+    cs_local = clip_op(cs_local, cell_clip_t);
   }
   cs[cid] = cs_local;
 
@@ -174,8 +220,8 @@ __global__ void concat_xh(T* xh, const T* x, const T* h_prev,
 
 template <typename T>
 void LSTMBlockCellFpropWithCUDA(
-    OpKernelContext* ctx, const GPUDevice& d, const T forget_bias,
-    const T cell_clip, bool use_peephole, typename TTypes<T>::ConstMatrix x,
+    OpKernelContext* ctx, const GPUDevice& d, const float forget_bias,
+    const float cell_clip, bool use_peephole, typename TTypes<T>::ConstMatrix x,
     typename TTypes<T>::ConstMatrix cs_prev,
     typename TTypes<T>::ConstMatrix h_prev, typename TTypes<T>::ConstMatrix w,
     typename TTypes<T>::ConstVec wci, typename TTypes<T>::ConstVec wcf,
@@ -202,7 +248,8 @@ void LSTMBlockCellFpropWithCUDA(
   // states1 = xh * w
   typename TTypes<T>::ConstMatrix const_xh(xh.data(), xh.dimensions());
   TensorBlasGemm<GPUDevice, T, true /* USE_CUBLAS */>::compute(
-      ctx, d, false, false, T(1), const_xh, w, T(0), icfo);
+      ctx, d, false, false, typename gemm_compute_type<T>::type(1.f), const_xh,
+      w, typename gemm_compute_type<T>::type(0.f), icfo);
 
   // Add bias, apply non-linearities and gating.
   //
@@ -357,8 +404,9 @@ void LSTMBlockCellBpropWithCUDA(
   template struct TensorAdd<GPUDevice, T>;                                     \
   template <>                                                                  \
   void LSTMBlockCellFprop<GPUDevice, T, true /* USE_CUBLAS */>::operator()(    \
-      OpKernelContext* ctx, const GPUDevice& d, const T forget_bias,           \
-      const T cell_clip, bool use_peephole, typename TTypes<T>::ConstMatrix x, \
+      OpKernelContext* ctx, const GPUDevice& d, const float forget_bias,       \
+      const float cell_clip, bool use_peephole,                                \
+      typename TTypes<T>::ConstMatrix x,                                       \
       typename TTypes<T>::ConstMatrix cs_prev,                                 \
       typename TTypes<T>::ConstMatrix h_prev,                                  \
       typename TTypes<T>::ConstMatrix w, typename TTypes<T>::ConstVec wci,     \
@@ -368,10 +416,10 @@ void LSTMBlockCellBpropWithCUDA(
       typename TTypes<T>::Matrix f, typename TTypes<T>::Matrix o,              \
       typename TTypes<T>::Matrix ci, typename TTypes<T>::Matrix co,            \
       typename TTypes<T>::Matrix icfo, typename TTypes<T>::Matrix h) {         \
-    LSTMBlockCellFpropWithCUDA(ctx, d, forget_bias, cell_clip, use_peephole,   \
-                               x, cs_prev, h_prev, w, wci, wcf, wco, b, xh, i, \
-                               cs, f, o, ci, co, icfo, h, batch_size_,         \
-                               cell_size_, input_size_);                       \
+    LSTMBlockCellFpropWithCUDA<T>(ctx, d, forget_bias, cell_clip,              \
+                                  use_peephole, x, cs_prev, h_prev, w, wci,    \
+                                  wcf, wco, b, xh, i, cs, f, o, ci, co, icfo,  \
+                                  h, batch_size_, cell_size_, input_size_);    \
   }                                                                            \
   template <>                                                                  \
   void LSTMBlockCellBprop<GPUDevice, T, true /* USE_CUBLAS */>::operator()(    \
@@ -403,6 +451,7 @@ void LSTMBlockCellBpropWithCUDA(
   template struct BlockLSTMBprop<GPUDevice, T, true /* USE_CUBLAS */>;
 
 DEFINE_GPU_SPECS(float);
+DEFINE_GPU_SPECS(Eigen::half);
 // DEFINE_GPU_SPECS(double);
 #undef DEFINE_GPU_SPECS
 
diff --git a/tensorflow/contrib/rnn/ops/lstm_ops.cc b/tensorflow/contrib/rnn/ops/lstm_ops.cc
index 699cc6c88a4634334b2621a7f48cbbeae1dc9a45..1679e355184f0622ba5e82dd9334edab3fe4c6f3 100644
--- a/tensorflow/contrib/rnn/ops/lstm_ops.cc
+++ b/tensorflow/contrib/rnn/ops/lstm_ops.cc
@@ -41,7 +41,7 @@ REGISTER_OP("LSTMBlockCell")
     .Attr("forget_bias: float = 1.0")
     .Attr("cell_clip: float = 3.0")
     .Attr("use_peephole: bool = false")
-    .Attr("T: {float}")
+    .Attr("T: {half, float}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle x, cs_prev;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &x));
@@ -128,7 +128,7 @@ REGISTER_OP("LSTMBlockCellGrad")
     .Output("wcf_grad: T")
     .Output("wco_grad: T")
     .Attr("use_peephole: bool")
-    .Attr("T: {float}")
+    .Attr("T: {half, float}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle x, cs_prev;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &x));
@@ -196,7 +196,7 @@ REGISTER_OP("BlockLSTM")
     .Attr("forget_bias: float = 1.0")
     .Attr("cell_clip: float = 3.0")
     .Attr("use_peephole: bool = false")
-    .Attr("T: {float}")
+    .Attr("T: {half, float}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle x, b;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &x));
@@ -288,7 +288,7 @@ REGISTER_OP("BlockLSTMGrad")
     .Output("wco_grad: T")
     .Output("b_grad: T")
     .Attr("use_peephole: bool")
-    .Attr("T: {float}")
+    .Attr("T: {half, float}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle x, cs_prev, h_prev, w, wci, wco, wcf, b;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &x));
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index be0306cb0703765984c810bfdae3ad6f1f8441f0..7d57b0413a3bb51c35e670ce3fdb2cc818f44a58 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -695,7 +695,7 @@ class RNNCellTest(test.TestCase):
       return
 
     gpu_dev = test.gpu_device_name()
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 1, 3])
@@ -757,9 +757,10 @@ class RNNCellTest(test.TestCase):
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
         m = array_ops.zeros([1, 4])
-        _, ml = rnn_cell_impl.MultiRNNCell(
+        multi_rnn_cell = rnn_cell_impl.MultiRNNCell(
             [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-            state_is_tuple=False)(x, m)
+            state_is_tuple=False)
+        _, ml = multi_rnn_cell(x, m)
         sess.run([variables_lib.global_variables_initializer()])
         res = sess.run(ml, {
             x.name: np.array([[1., 1.]]),
@@ -767,6 +768,9 @@ class RNNCellTest(test.TestCase):
         })
         # The numbers in results were not calculated, this is just a smoke test.
         self.assertAllClose(res, [[0.175991, 0.175991, 0.13248, 0.13248]])
+        self.assertEqual(len(multi_rnn_cell.weights), 2 * 4)
+        self.assertTrue(
+            [x.dtype == dtypes.float32 for x in multi_rnn_cell.weights])
 
   def testMultiRNNCellWithStateTuple(self):
     with self.cached_session() as sess:
@@ -902,7 +906,7 @@ class DropoutWrapperTest(test.TestCase):
 
   def testDropoutWrapperKeepNoOutput(self):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-10)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
     res = self._testDropoutWrapper(
         input_keep_prob=keep_all,
         output_keep_prob=keep_none,
@@ -918,7 +922,7 @@ class DropoutWrapperTest(test.TestCase):
 
   def testDropoutWrapperKeepNoStateExceptLSTMCellMemory(self):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-10)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
     # Even though we dropout state, by default DropoutWrapper never
     # drops out the memory ("c") term of an LSTMStateTuple.
     res = self._testDropoutWrapper(
@@ -939,7 +943,7 @@ class DropoutWrapperTest(test.TestCase):
 
   def testDropoutWrapperKeepNoInput(self):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-10)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
     true_full_output = np.array(
         [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
         dtype=np.float32)
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index f31ad53d3c4274ca7a9027ea2e3bdea5424bd567..ef372b947cedf71e9d44423f10cc43375b467cd9 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -211,7 +211,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(out.get_shape(), inp.get_shape())
       self.assertEqual(out.dtype, inp.dtype)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       input_value = np.random.randn(batch_size, input_size)
       values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
 
@@ -227,7 +227,7 @@ class RNNTest(test.TestCase):
   def testDropout(self):
     cell = Plus1RNNCell()
     full_dropout_cell = rnn_cell.DropoutWrapper(
-        cell, input_keep_prob=1e-12, seed=0)
+        cell, input_keep_prob=1e-6, seed=0)
     (name, dep), = full_dropout_cell._checkpoint_dependencies
     self.assertIs(dep, cell)
     self.assertEqual("cell", name)
@@ -247,7 +247,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(out.get_shape().as_list(), inp.get_shape().as_list())
       self.assertEqual(out.dtype, inp.dtype)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       input_value = np.random.randn(batch_size, input_size)
       values = sess.run(outputs + [state], feed_dict={inputs[0]: input_value})
       full_dropout_values = sess.run(
@@ -274,7 +274,7 @@ class RNNTest(test.TestCase):
           cell, inputs, sequence_length=sequence_length, dtype=dtypes.float32)
     self.assertEqual(len(dynamic_outputs), len(inputs))
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       input_value = np.random.randn(batch_size, input_size)
       dynamic_values = sess.run(
           dynamic_outputs,
@@ -310,7 +310,7 @@ class RNNTest(test.TestCase):
                                      1.0 * (2 + 1) * np.ones((input_size)))))
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops_lib.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -372,7 +372,7 @@ class LSTMTest(test.TestCase):
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       cell = rnn_cell.LSTMCell(
@@ -394,7 +394,7 @@ class LSTMTest(test.TestCase):
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       cell = rnn_cell.LSTMCell(
@@ -424,7 +424,7 @@ class LSTMTest(test.TestCase):
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(batch_size, 2 * num_units)
@@ -562,7 +562,7 @@ class LSTMTest(test.TestCase):
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
@@ -659,7 +659,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
 
@@ -692,7 +692,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
           array_ops.placeholder(dtypes.float64, shape=(None, input_size))
@@ -728,7 +728,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
@@ -784,7 +784,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       sequence_length = array_ops.placeholder(dtypes.int64)
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
@@ -1117,7 +1117,7 @@ class LSTMTest(test.TestCase):
           state_is_tuple=False)
 
     ########### Step 1: Run static graph and generate readouts
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       if in_graph_mode:
         concat_inputs = array_ops.placeholder(
             dtypes.float32, shape=(time_steps, batch_size, input_size))
@@ -1177,7 +1177,7 @@ class LSTMTest(test.TestCase):
             static_individual_variable_gradients, feed_dict=feeds)
 
     ########## Step 2: Run dynamic graph and generate readouts
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       if in_graph_mode:
         concat_inputs = array_ops.placeholder(
             dtypes.float32, shape=(time_steps, batch_size, input_size))
@@ -1337,7 +1337,7 @@ class BidirectionalRNNTest(test.TestCase):
     return input_value, inputs, outputs, state_fw, state_bw, sequence_length
 
   def _testBidirectionalRNN(self, use_shape):
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createBidirectionalRNN(use_shape, True))
       variables_lib.global_variables_initializer().run()
@@ -1384,7 +1384,7 @@ class BidirectionalRNNTest(test.TestCase):
       self.assertAllClose(s_fw, s_bw)
 
   def _testBidirectionalRNNWithoutSequenceLength(self, use_shape):
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, _ = (
           self._createBidirectionalRNN(use_shape, False))
       variables_lib.global_variables_initializer().run()
@@ -1472,7 +1472,7 @@ class BidirectionalRNNTest(test.TestCase):
 
   def _testBidirectionalDynamicRNN(self, use_shape, use_state_tuple,
                                    use_time_major, use_sequence_length):
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createBidirectionalDynamicRNN(
               use_shape, use_state_tuple, use_time_major, use_sequence_length))
@@ -1549,7 +1549,7 @@ class BidirectionalRNNTest(test.TestCase):
     # REMARKS: factory(scope) is a function accepting a scope
     #          as an argument, such scope can be None, a string
     #          or a VariableScope instance.
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops_lib.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -1868,7 +1868,7 @@ class StateSaverRNNTest(test.TestCase):
     batch_size = 2
     state_saver = TestStateSaver(batch_size, 2 * num_units)
 
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops_lib.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           self._factory(scope=scope, state_saver=state_saver)
@@ -1945,7 +1945,7 @@ class GRUTest(test.TestCase):
 
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
       concat_inputs = array_ops.placeholder(
           dtypes.float32, shape=(time_steps, batch_size, input_size))
 
@@ -1967,7 +1967,7 @@ class GRUTest(test.TestCase):
       sess.run([outputs_dynamic, state_dynamic], feed_dict=feeds)
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops_lib.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -2253,7 +2253,7 @@ class RawRNNTest(test.TestCase):
           np.ones((max_time, batch_size, 1), np.int64), output_vals[1])
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.test_session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops_lib.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -2370,7 +2370,7 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
           sequence_length=sequence_length,
           dtype=dtypes.float32)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
       run_metadata = config_pb2.RunMetadata()
       variables_lib.global_variables_initializer().run()
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
index b865466cc75aa67fcd192f7726f65141409b896a..50d0da6eaf26bbb621907193f2915eaf3c9eadba 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/gru_ops_test.py
@@ -41,7 +41,7 @@ from tensorflow.python.training import gradient_descent
 class GRUBlockCellTest(test.TestCase):
 
   def testNoneDimsWithDynamicRNN(self):
-    with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       batch_size = 4
       cell_size = 5
       input_size = 6
@@ -58,7 +58,7 @@ class GRUBlockCellTest(test.TestCase):
       sess.run(output, feed)
 
   def testBlockGRUToGRUCellSingleStep(self):
-    with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       batch_size = 4
       cell_size = 5
       input_size = 6
@@ -91,7 +91,7 @@ class GRUBlockCellTest(test.TestCase):
         self.assertAllClose(block, basic)
 
   def testBlockGRUToGRUCellMultiStep(self):
-    with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       batch_size = 2
       cell_size = 3
       input_size = 3
@@ -150,7 +150,7 @@ class GRUBlockCellTest(test.TestCase):
       self.assertAllClose(block_res[1], block_res[1])
 
   def testDerivativeOfBlockGRUToGRUCellSingleStep(self):
-    with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       batch_size = 2
       cell_size = 3
       input_size = 4
@@ -220,7 +220,7 @@ class GRUBlockCellTest(test.TestCase):
     cell_size = 3
     input_size = 4
     time_steps = 2
-    with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       # Random initializers.
       seed = 1994
       initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=seed)
@@ -287,7 +287,7 @@ class GRUBlockCellTest(test.TestCase):
       self.assertAllClose(block, basic)
 
   def testGradient(self):
-    with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       batch_size = 1
       cell_size = 3
       input_size = 2
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
index ffd24218944e150a32b1b915288ab1df90afb45c..d5700d2a200f6cdac06183366c0d11ec3531235b 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/lstm_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.rnn.python.kernel_tests import benchmarking
@@ -27,6 +28,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_bitwise_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import rnn
@@ -38,7 +41,70 @@ from tensorflow.python.platform import test
 block_lstm = lstm_ops._block_lstm  # pylint: disable=protected-access
 
 
-def blocks_match(sess, use_peephole):
+class _MaskedRandomUniformInitializer(init_ops.RandomUniform):
+  """Initializer for uniform dist tensors with trailing bits zeroed-out.
+
+  Allow returning tensors with last few mantissa bits set to 0. This potentially
+  helps avoid getting into precision issues when testing low precision (float16)
+  computation.
+  """
+
+  def __init__(self,
+               minval=0,
+               maxval=None,
+               seed=None,
+               dtype=dtypes.float16,
+               num_valid_mantissa_bits=4):
+    """Constructor.
+
+    Args:
+      minval: A python scalar or a scalar tensor. Lower bound of the range of
+        random values to generate.
+      maxval: A python scalar or a scalar tensor. Upper bound of the range of
+        random values to generate.  Defaults to 1 for float types.
+      seed: A Python integer. Used to create random seeds. See
+        `tf.set_random_seed` for behavior.
+      dtype: The data type. Only supports tf.float16 for now.
+      num_valid_mantissa_bits: number of non-zero mantissa bits, default to 4.
+
+    Raises:
+      ValueError: An error if `dtype` is not tf.float16.
+    """
+    if dtype not in (dtypes.float16,):
+      raise ValueError("dtype: %s not supported" % dtype.name)
+
+    super(_MaskedRandomUniformInitializer, self).__init__(
+        minval=minval, maxval=maxval, seed=seed, dtype=dtype)
+    self._num_mantissa_bits = 10
+    self._num_valid_mantissa_bits = num_valid_mantissa_bits
+
+  def __call__(self, shape, dtype=dtypes.float16, partition_info=None):
+    if dtype and dtype != dtypes.float16:
+      raise ValueError("dtype: %s not supported" % dtype.name)
+    res = super(_MaskedRandomUniformInitializer, self).__call__(
+        shape, dtype, partition_info)
+    # get uint16 view of the underlying buffer.
+    res = gen_array_ops.bitcast(res, dtypes.uint16)
+
+    # mask the last `shift` mantissa bits.
+    shift = self._num_mantissa_bits - self._num_valid_mantissa_bits
+    mask = (0xffff >> shift) << shift
+    res = gen_bitwise_ops.bitwise_and(res, mask)
+
+    # restore float16 view.
+    return gen_array_ops.bitcast(res, dtype)
+
+
+def _get_initializer(init_bound, dtype, seed):
+  if dtype == dtypes.float16:
+    return _MaskedRandomUniformInitializer(
+        -init_bound, init_bound, dtype=dtype, seed=seed)
+  else:
+    return init_ops.random_uniform_initializer(
+        -init_bound, init_bound, dtype=dtype, seed=seed)
+
+
+def blocks_match(sess, use_peephole, dtype=dtypes.float32, cell_clip=None):
   batch_size = 2
   input_size = 3
   cell_size = 4
@@ -47,36 +113,42 @@ def blocks_match(sess, use_peephole):
   inputs = []
   for _ in range(sequence_length):
     inp = ops.convert_to_tensor(
-        np.random.randn(batch_size, input_size), dtype=dtypes.float32)
+        np.random.randn(batch_size, input_size), dtype=dtype)
     inputs.append(inp)
   stacked_inputs = array_ops.stack(inputs)
 
-  initializer = init_ops.random_uniform_initializer(-0.01, 0.01, seed=19890212)
+  init_bound = 1e-1 if dtype == dtypes.float16 else 1e-2
+  initializer = _get_initializer(init_bound, dtype=dtype, seed=19890212)
 
   with variable_scope.variable_scope("test", initializer=initializer):
     # magic naming so that the cells pick up these variables and reuse them
     if use_peephole:
       wci = variable_scope.get_variable(
-          "rnn/lstm_cell/w_i_diag", shape=[cell_size], dtype=dtypes.float32)
+          "rnn/lstm_cell/w_i_diag", shape=[cell_size], dtype=dtype)
       wcf = variable_scope.get_variable(
-          "rnn/lstm_cell/w_f_diag", shape=[cell_size], dtype=dtypes.float32)
+          "rnn/lstm_cell/w_f_diag", shape=[cell_size], dtype=dtype)
       wco = variable_scope.get_variable(
-          "rnn/lstm_cell/w_o_diag", shape=[cell_size], dtype=dtypes.float32)
+          "rnn/lstm_cell/w_o_diag", shape=[cell_size], dtype=dtype)
 
     w = variable_scope.get_variable(
         "rnn/lstm_cell/kernel",
         shape=[input_size + cell_size, cell_size * 4],
-        dtype=dtypes.float32)
+        dtype=dtype)
     b = variable_scope.get_variable(
         "rnn/lstm_cell/bias",
         shape=[cell_size * 4],
-        dtype=dtypes.float32,
+        dtype=dtype,
         initializer=init_ops.zeros_initializer())
 
     basic_cell = rnn_cell.LSTMCell(
-        cell_size, use_peepholes=use_peephole, state_is_tuple=True, reuse=True)
+        cell_size,
+        use_peepholes=use_peephole,
+        cell_clip=cell_clip,
+        dtype=dtype,
+        state_is_tuple=True,
+        reuse=True)
     basic_outputs_op, basic_state_op = rnn.static_rnn(
-        basic_cell, inputs, dtype=dtypes.float32)
+        basic_cell, inputs, dtype=dtype)
 
     if use_peephole:
       _, _, _, _, _, _, block_outputs_op = block_lstm(
@@ -87,7 +159,7 @@ def blocks_match(sess, use_peephole):
           wci=wci,
           wcf=wcf,
           wco=wco,
-          cell_clip=0,
+          cell_clip=cell_clip,
           use_peephole=True)
     else:
       _, _, _, _, _, _, block_outputs_op = block_lstm(
@@ -95,13 +167,15 @@ def blocks_match(sess, use_peephole):
           inputs,
           w,
           b,
-          cell_clip=0)
+          cell_clip=cell_clip)
 
     fused_cell = lstm_ops.LSTMBlockFusedCell(
-        cell_size, cell_clip=0, use_peephole=use_peephole, reuse=True,
+        cell_size,
+        cell_clip=cell_clip,
+        use_peephole=use_peephole,
+        reuse=True,
         name="rnn/lstm_cell")
-    fused_outputs_op, fused_state_op = fused_cell(
-        stacked_inputs, dtype=dtypes.float32)
+    fused_outputs_op, fused_state_op = fused_cell(stacked_inputs, dtype=dtype)
 
     sess.run([variables.global_variables_initializer()])
     basic_outputs, basic_state = sess.run([basic_outputs_op, basic_state_op[0]])
@@ -127,10 +201,22 @@ def blocks_match(sess, use_peephole):
             block_wgrads, fused_wgrads)
 
 
-class LSTMBlockCellTest(test.TestCase):
+class LSTMBlockCellTest(test.TestCase, parameterized.TestCase):
+
+  TEST_CASES = ({
+      "testcase_name": "Fp32",
+      "dtype": dtypes.float32,
+      "rtol": 1e-6,
+      "atol": 1e-6
+  }, {
+      "testcase_name": "Fp16",
+      "dtype": dtypes.float16,
+      "rtol": 8e-3,
+      "atol": 8e-4
+  })
 
   def testNoneDimsWithDynamicRNN(self):
-    with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       batch_size = 4
       num_steps = 5
       input_dim = 6
@@ -147,7 +233,7 @@ class LSTMBlockCellTest(test.TestCase):
       sess.run(output, feed)
 
   def testLSTMBlockCell(self):
-    with self.test_session(use_gpu=True, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
         x = array_ops.zeros([1, 2])
@@ -175,7 +261,7 @@ class LSTMBlockCellTest(test.TestCase):
         self.assertAllClose(res[4], [[0.24024698, 0.24024698]])
 
   def testCompatibleNames(self):
-    with self.test_session(use_gpu=True, graph=ops.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       cell = rnn_cell.LSTMCell(10)
       pcell = rnn_cell.LSTMCell(10, use_peepholes=True)
       inputs = [array_ops.zeros([4, 5])] * 6
@@ -186,7 +272,7 @@ class LSTMBlockCellTest(test.TestCase):
           for v in variables.trainable_variables()
       }
 
-    with self.test_session(use_gpu=True, graph=ops.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       cell = lstm_ops.LSTMBlockCell(10)
       pcell = lstm_ops.LSTMBlockCell(10, use_peephole=True)
       inputs = [array_ops.zeros([4, 5])] * 6
@@ -197,7 +283,7 @@ class LSTMBlockCellTest(test.TestCase):
           for v in variables.trainable_variables()
       }
 
-    with self.test_session(use_gpu=True, graph=ops.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       cell = lstm_ops.LSTMBlockFusedCell(10)
       pcell = lstm_ops.LSTMBlockFusedCell(10, use_peephole=True)
       inputs = array_ops.stack([array_ops.zeros([4, 5])] * 6)
@@ -212,7 +298,7 @@ class LSTMBlockCellTest(test.TestCase):
     self.assertEqual(basic_names, fused_names)
 
   def testLSTMBasicToBlockCell(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       x = array_ops.zeros([1, 2])
       x_values = np.random.randn(1, 2)
 
@@ -262,7 +348,7 @@ class LSTMBlockCellTest(test.TestCase):
         self.assertAllClose(basic, block)
 
   def testLSTMBasicToBlockCellPeeping(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       x = array_ops.zeros([1, 2])
       x_values = np.random.randn(1, 2)
 
@@ -314,45 +400,47 @@ class LSTMBlockCellTest(test.TestCase):
       for basic, block in zip(basic_res, block_res):
         self.assertAllClose(basic, block)
 
-  def testLSTMBasicToBlock(self):
-    with self.test_session(use_gpu=True) as sess:
+  def LSTMBasicToBlockTestHelper(self,
+                                 dtype=dtypes.float32,
+                                 use_peephole=False,
+                                 cell_clip=None,
+                                 rtol=1e-6,
+                                 atol=1e-6):
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       (basic_state, fused_state, basic_outputs, block_outputs, fused_outputs,
        basic_grads, block_grads, fused_grads, basic_wgrads, block_wgrads,
        fused_wgrads) = blocks_match(
-           sess, use_peephole=False)
+           sess, use_peephole=use_peephole, dtype=dtype, cell_clip=cell_clip)
 
-      self.assertAllClose(basic_outputs, block_outputs)
-      self.assertAllClose(basic_grads, block_grads)
+      self.assertAllClose(basic_outputs, block_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(basic_grads, block_grads, rtol=rtol, atol=atol)
       for basic, block in zip(basic_wgrads, block_wgrads):
-        self.assertAllClose(basic, block, rtol=1e-6, atol=1e-6)
+        self.assertAllClose(basic, block, rtol=rtol, atol=atol)
 
-      self.assertAllClose(basic_outputs, fused_outputs)
-      self.assertAllClose(basic_state, fused_state)
-      self.assertAllClose(basic_grads, fused_grads)
-      for basic, fused in zip(block_wgrads, fused_wgrads):
-        self.assertAllClose(basic, fused, rtol=1e-6, atol=1e-6)
+      self.assertAllClose(basic_outputs, fused_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(basic_state, fused_state, rtol=rtol, atol=atol)
+      self.assertAllClose(basic_grads, fused_grads, rtol=rtol, atol=atol)
+      for basic, fused in zip(basic_wgrads, fused_wgrads):
+        self.assertAllClose(basic, fused, rtol=rtol, atol=atol)
 
-  def testLSTMBasicToBlockPeeping(self):
-    with self.test_session(use_gpu=True) as sess:
-      (basic_state, fused_state, basic_outputs, block_outputs, fused_outputs,
-       basic_grads, block_grads, fused_grads, basic_wgrads, block_wgrads,
-       fused_wgrads) = blocks_match(
-           sess, use_peephole=True)
+  @parameterized.named_parameters(*TEST_CASES)
+  def testLSTMBasicToBlock(self, dtype, rtol, atol):
+    self.LSTMBasicToBlockTestHelper(
+        dtype, use_peephole=False, rtol=rtol, atol=atol)
 
-      self.assertAllClose(basic_outputs, block_outputs)
-      self.assertAllClose(basic_grads, block_grads)
-      for basic, block in zip(basic_wgrads, block_wgrads):
-        self.assertAllClose(basic, block, rtol=1e-6, atol=1e-6)
+  @parameterized.named_parameters(*TEST_CASES)
+  def testLSTMBasicToBlockPeeping(self, dtype, rtol, atol):
+    self.LSTMBasicToBlockTestHelper(
+        dtype, use_peephole=True, rtol=rtol, atol=atol)
 
-      self.assertAllClose(basic_outputs, fused_outputs)
-      self.assertAllClose(basic_state, fused_state)
-      self.assertAllClose(basic_grads, fused_grads)
-      for basic, fused in zip(block_wgrads, fused_wgrads):
-        self.assertAllClose(basic, fused, rtol=1e-6, atol=1e-6)
+  @parameterized.named_parameters(*TEST_CASES)
+  def testLSTMBasicToBlockCellClip(self, dtype, rtol, atol):
+    self.LSTMBasicToBlockTestHelper(
+        dtype, use_peephole=True, cell_clip=0.5, rtol=rtol, atol=atol)
 
   def testLSTMFusedSequenceLengths(self):
     """Verify proper support for sequence lengths in LSTMBlockFusedCell."""
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       batch_size = 3
       input_size = 4
       cell_size = 5
@@ -444,16 +532,21 @@ class BenchmarkLSTMBlock(test.Benchmark):
         "batch_size": [1, 8, 13, 32, 67, 128],
         "cell_size": [128, 250, 512, 650, 1024, 1350],
         "time_steps": [40],
-        "use_gpu": [True, False]
+        "use_gpu": [True, False],
+        "dtype": ["float32", "float16"],
     }):
+      dtype = dtypes.float32 if config["dtype"] == "float32" else dtypes.float16
       with ops.Graph().as_default():
         with benchmarking.device(use_gpu=config["use_gpu"]):
           inputs = variable_scope.get_variable(
               "x",
-              [config["time_steps"], config["batch_size"], config["cell_size"]])
-          cell = lstm_ops.LSTMBlockCell(config["cell_size"])
-          outputs = rnn.dynamic_rnn(
-              cell, inputs, time_major=True, dtype=dtypes.float32)
+              dtype=dtype,
+              shape=[
+                  config["time_steps"], config["batch_size"],
+                  config["cell_size"]
+              ])
+          cell = lstm_ops.LSTMBlockCell(config["cell_size"], dtype=dtype)
+          outputs = rnn.dynamic_rnn(cell, inputs, time_major=True, dtype=dtype)
           init_op = variables.global_variables_initializer()
 
         with session.Session() as sess:
@@ -464,12 +557,14 @@ class BenchmarkLSTMBlock(test.Benchmark):
         # is set, this will produce a copy-paste-able CSV file.
         print(",".join(
             map(str, [
-                config["batch_size"], config["cell_size"], config["cell_size"],
-                config["time_steps"], config["use_gpu"], wall_time
+                config["dtype"], config["batch_size"], config["cell_size"],
+                config["cell_size"], config["time_steps"], config["use_gpu"],
+                wall_time
             ])))
         benchmark_name_template = "_".join([
-            "LSTMBlockCell_fprop", "BS%(batch_size)i", "CS%(cell_size)i",
-            "IS%(cell_size)i", "TS%(time_steps)i", "gpu_%(use_gpu)s"
+            "LSTMBlockCell_fprop", "DT_%(dtype)s", "BS%(batch_size)i",
+            "CS%(cell_size)i", "IS%(cell_size)i", "TS%(time_steps)i",
+            "gpu_%(use_gpu)s"
         ])
 
         self.report_benchmark(
@@ -488,8 +583,10 @@ class BenchmarkLSTMBlock(test.Benchmark):
         "batch_size": [1, 8, 13, 32, 67, 128],
         "cell_size": [128, 250, 512, 650, 1024, 1350],
         "time_steps": [40],
-        "use_gpu": [True, False]
+        "use_gpu": [True, False],
+        "dtype": ["float32", "float16"],
     }):
+      dtype = dtypes.float32 if config["dtype"] == "float32" else dtypes.float16
       with ops.Graph().as_default():
         with benchmarking.device(use_gpu=config["use_gpu"]):
           time_steps = config["time_steps"]
@@ -498,21 +595,21 @@ class BenchmarkLSTMBlock(test.Benchmark):
           inputs = variable_scope.get_variable(
               "x", [time_steps, batch_size, cell_size],
               trainable=False,
-              dtype=dtypes.float32)
+              dtype=dtype)
           with variable_scope.variable_scope(
               "rnn", reuse=variable_scope.AUTO_REUSE):
             w = variable_scope.get_variable(
                 "rnn/lstm_cell/kernel",
                 shape=[input_size + cell_size, cell_size * 4],
-                dtype=dtypes.float32)
+                dtype=dtype)
             b = variable_scope.get_variable(
                 "rnn/lstm_cell/bias",
                 shape=[cell_size * 4],
-                dtype=dtypes.float32,
+                dtype=dtype,
                 initializer=init_ops.zeros_initializer())
-            cell = lstm_ops.LSTMBlockCell(cell_size)
+            cell = lstm_ops.LSTMBlockCell(cell_size, dtype=dtype)
             outputs = rnn.dynamic_rnn(
-                cell, inputs, time_major=True, dtype=dtypes.float32)
+                cell, inputs, time_major=True, dtype=dtype)
           grads = gradients_impl.gradients(outputs, [inputs, w, b])
           init_op = variables.global_variables_initializer()
 
@@ -524,12 +621,13 @@ class BenchmarkLSTMBlock(test.Benchmark):
         # is set, this will produce a copy-paste-able CSV file.
         print(",".join(
             map(str, [
-                batch_size, cell_size, cell_size, time_steps, config["use_gpu"],
-                wall_time
+                config["dtype"], batch_size, cell_size, cell_size, time_steps,
+                config["use_gpu"], wall_time
             ])))
         benchmark_name_template = "_".join([
-            "LSTMBlockCell_bprop", "BS%(batch_size)i", "CS%(cell_size)i",
-            "IS%(cell_size)i", "TS%(time_steps)i", "gpu_%(use_gpu)s"
+            "LSTMBlockCell_bprop", "DT_%(dtype)s", "BS%(batch_size)i",
+            "CS%(cell_size)i", "IS%(cell_size)i", "TS%(time_steps)i",
+            "gpu_%(use_gpu)s"
         ])
 
         self.report_benchmark(
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 0a272000156fef20765ce57b23a8f1afe0b13f89..aa1d7d2b01b4595bbb03ba8e867e93db759cbd52 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -1120,6 +1120,71 @@ class RNNCellTest(test.TestCase):
             r"input size \(3\) must be divisible by number_of_groups \(2\)"):
           gcell(glstm_input, gcell_zero_state)
 
+  def testCFNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope("root"):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = contrib_rnn_cell.CFNCell(
+            units=2,
+            kernel_initializer=initializers.Constant(0.5))
+        g, _ = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.17188203, 0.17188203]])
+      with variable_scope.variable_scope("other"):
+        # Test CFN with input_size != num_units.
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        cell = contrib_rnn_cell.CFNCell(
+            units=2,
+            kernel_initializer=initializers.Constant(0.5))
+        g, _ = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.15535763, 0.15535763]])
+
+  def testCFNCellEndToEnd(self):
+    with self.cached_session() as sess:
+      input_shape = 10
+      output_shape = 5
+      timestep = 4
+      batch = 100
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = utils.to_categorical(y_train)
+      cell = contrib_rnn_cell.CFNCell(output_shape)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape))
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape))
+
+      outputs, state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32)
+      self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape])
+      self.assertEqual(state.shape.as_list(), [None, output_shape])
+      loss = losses.softmax_cross_entropy(predict, state)
+      train_op = training.GradientDescentOptimizer(0.001).minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      _, outputs, state = sess.run(
+          [train_op, outputs, state], {inputs: x_train, predict: y_train})
+
+      self.assertEqual(len(outputs), batch)
+      self.assertEqual(len(state), batch)
+
   def testMinimalRNNCell(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py
index eda8cb3c12f7b7e3dac9b21347854bda4dcfe729..32df1db964a9467a921e5bf9747d4a7ebc17bb9e 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_test.py
@@ -99,7 +99,7 @@ class StackBidirectionalRNNTest(test.TestCase):
     return input_value, inputs, outputs, state_fw, state_bw, sequence_length
 
   def _testStackBidirectionalRNN(self, use_gpu, use_shape):
-    with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createStackBidirectionalRNN(use_gpu, use_shape, True))
       variables.global_variables_initializer().run()
@@ -159,7 +159,7 @@ class StackBidirectionalRNNTest(test.TestCase):
     # - Check that the state_5 and state_5' (forward and backward) are the
     #   same for the first layer (it does not apply for the second layer since
     #   it has forward-backward dependencies).
-    with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
       batch_size = 2
       # Create states placeholders.
       initial_states_fw = [
@@ -281,7 +281,7 @@ class StackBidirectionalRNNTest(test.TestCase):
 
   def _testStackBidirectionalDynamicRNN(self, use_gpu, use_shape,
                                         use_state_tuple):
-    with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createStackBidirectionalDynamicRNN(use_gpu, use_shape,
                                                    use_state_tuple))
@@ -343,7 +343,7 @@ class StackBidirectionalRNNTest(test.TestCase):
     # - Check that the state_5 and state_5' (forward and backward) are the
     #   same for the first layer (it does not apply for the second layer since
     #   it has forward-backward dependencies).
-    with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
       batch_size = 2
       # Create states placeholders.
       initial_states_fw = [
@@ -414,7 +414,7 @@ class StackBidirectionalRNNTest(test.TestCase):
     # REMARKS: factory(scope) is a function accepting a scope
     #          as an argument, such scope can be None, a string
     #          or a VariableScope instance.
-    with self.test_session(use_gpu=True, graph=ops.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
diff --git a/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py b/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py
index 645f82624bf67b96ffc8520289b293b45f0e69e2..6da58747e8905fae3b85b085991487d74f80b734 100644
--- a/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py
@@ -83,11 +83,11 @@ class _Linear(object):
     for shape in shapes:
       if shape.ndims != 2:
         raise ValueError("linear is expecting 2D arguments: %s" % shapes)
-      if shape[1].value is None:
+      if shape.dims[1].value is None:
         raise ValueError("linear expects shape[1] to be provided for shape %s, "
                          "but saw %s" % (shape, shape[1]))
       else:
-        total_arg_size += shape[1].value
+        total_arg_size += shape.dims[1].value
 
     dtype = [a.dtype for a in args][0]
 
@@ -156,11 +156,11 @@ def _linear(args,
   for shape in shapes:
     if shape.ndims != 2:
       raise ValueError("linear is expecting 2D arguments: %s" % shapes)
-    if shape[1].value is None:
+    if shape.dims[1].value is None:
       raise ValueError("linear expects shape[1] to be provided for shape %s, "
                        "but saw %s" % (shape, shape[1]))
     else:
-      total_arg_size += shape[1].value
+      total_arg_size += shape.dims[1].value
 
   dtype = [a.dtype for a in args][0]
 
diff --git a/tensorflow/contrib/rnn/python/ops/fused_rnn_cell.py b/tensorflow/contrib/rnn/python/ops/fused_rnn_cell.py
index b7393d8b9880715cb381e1050b5ea757e36f2372..f90fd40990a32de18d5650dde9ff361ace77821e 100644
--- a/tensorflow/contrib/rnn/python/ops/fused_rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/fused_rnn_cell.py
@@ -20,10 +20,13 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import rnn
 
 
+@six.add_metaclass(abc.ABCMeta)
 class FusedRNNCell(object):
   """Abstract object representing a fused RNN cell.
 
@@ -38,8 +41,6 @@ class FusedRNNCell(object):
   Every `FusedRNNCell` must implement `__call__` with the following signature.
   """
 
-  __metaclass__ = abc.ABCMeta
-
   @abc.abstractmethod
   def __call__(self,
                inputs,
diff --git a/tensorflow/contrib/rnn/python/ops/gru_ops.py b/tensorflow/contrib/rnn/python/ops/gru_ops.py
index 81ca12317be484ba420b7bbfac822e91d6d38bff..251a933eaec826b08266123245d9aef8573d3e06 100644
--- a/tensorflow/contrib/rnn/python/ops/gru_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/gru_ops.py
@@ -20,7 +20,8 @@ from __future__ import print_function
 from tensorflow.contrib.rnn.ops import gen_gru_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import ops
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -164,7 +165,7 @@ class GRUBlockCell(LayerRNNCell):
       num_units = cell_size
     self._cell_size = num_units
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -176,7 +177,7 @@ class GRUBlockCell(LayerRNNCell):
 
   def build(self, input_shape):
     # Check if the input size exist.
-    input_size = input_shape[1].value
+    input_size = tensor_shape.dimension_value(input_shape[1])
     if input_size is None:
       raise ValueError("Expecting input_size to be set.")
 
@@ -221,7 +222,7 @@ class GRUBlockCellV2(GRUBlockCell):
 
   def build(self, input_shape):
     """GRU cell."""
-    input_size = input_shape[1].value
+    input_size = tensor_shape.dimension_value(input_shape[1])
     if input_size is None:
       raise ValueError("Expecting input_size to be set.")
 
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index 9e61fc54d10c1b75786450060e428c73974760a7..b043026bc556a8879b15b432829baf8136250c0e 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -19,10 +19,13 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 from tensorflow.contrib.rnn.ops import gen_lstm_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -113,7 +116,7 @@ def _lstm_block_cell(x,
     ValueError: If cell_size is None.
   """
   if wci is None:
-    cell_size = cs_prev.get_shape().with_rank(2)[1].value
+    cell_size = cs_prev.get_shape().with_rank(2).dims[1].value
     if cell_size is None:
       raise ValueError("cell_size from `cs_prev` should not be None.")
     wci = array_ops.constant(0, dtype=dtypes.float32, shape=[cell_size])
@@ -154,7 +157,7 @@ def _block_lstm(seq_len_max,
 
   Args:
     seq_len_max: A `Tensor` of type `int64`.
-    x: A list of at least 1 `Tensor` objects of the same type in: `float32`.
+    x: A list of at least 1 `Tensor` objects of the same type.
     w: A `Tensor`. Must have the same type as `x`.
     b: A `Tensor`. Must have the same type as `x`.
     cs_prev: A `Tensor`. Must have the same type as `x`.
@@ -187,21 +190,22 @@ def _block_lstm(seq_len_max,
   Raises:
     ValueError: If `b` does not have a valid shape.
   """
-  batch_size = x[0].get_shape().with_rank(2)[0].value
-  cell_size4 = b.get_shape().with_rank(1)[0].value
+  dtype = x[0].dtype
+  batch_size = x[0].get_shape().with_rank(2).dims[0].value
+  cell_size4 = b.get_shape().with_rank(1).dims[0].value
   if cell_size4 is None:
     raise ValueError("`b` shape must not be None.")
   cell_size = cell_size4 / 4
   zero_state = None
   if cs_prev is None or h_prev is None:
     zero_state = array_ops.constant(
-        0, dtype=dtypes.float32, shape=[batch_size, cell_size])
+        0, dtype=dtype, shape=[batch_size, cell_size])
   if cs_prev is None:
     cs_prev = zero_state
   if h_prev is None:
     h_prev = zero_state
   if wci is None:
-    wci = array_ops.constant(0, dtype=dtypes.float32, shape=[cell_size])
+    wci = array_ops.constant(0, dtype=dtype, shape=[cell_size])
     wcf = wci
     wco = wci
 
@@ -238,13 +242,13 @@ def _LSTMBlockCellGrad(op, *grad):
   (i, cs, f, o, ci, co, _) = op.outputs
   (_, cs_grad, _, _, _, _, h_grad) = grad
 
-  batch_size = x.get_shape().with_rank(2)[0].value
+  batch_size = x.get_shape().with_rank(2).dims[0].value
   if batch_size is None:
     batch_size = -1
-  input_size = x.get_shape().with_rank(2)[1].value
+  input_size = x.get_shape().with_rank(2).dims[1].value
   if input_size is None:
     raise ValueError("input_size from `x` should not be None.")
-  cell_size = cs_prev.get_shape().with_rank(2)[1].value
+  cell_size = cs_prev.get_shape().with_rank(2).dims[1].value
   if cell_size is None:
     raise ValueError("cell_size from `cs_prev` should not be None.")
 
@@ -346,6 +350,7 @@ class LSTMBlockCell(LayerRNNCell):
                forget_bias=1.0,
                cell_clip=None,
                use_peephole=False,
+               dtype=None,
                reuse=None,
                name="lstm_cell"):
     """Initialize the basic LSTM cell.
@@ -355,6 +360,7 @@ class LSTMBlockCell(LayerRNNCell):
       forget_bias: float, The bias added to forget gates (see above).
       cell_clip: An optional `float`. Defaults to `-1` (no clipping).
       use_peephole: Whether to use peephole connections or not.
+      dtype: the variable dtype of this layer. Default to tf.float32.
       reuse: (optional) boolean describing whether to reuse variables in an
         existing scope.  If not `True`, and the existing scope already has the
         given variables, an error is raised.
@@ -366,7 +372,7 @@ class LSTMBlockCell(LayerRNNCell):
       When restoring from CudnnLSTM-trained checkpoints, must use
       CudnnCompatibleLSTMBlockCell instead.
     """
-    super(LSTMBlockCell, self).__init__(_reuse=reuse, name=name)
+    super(LSTMBlockCell, self).__init__(_reuse=reuse, dtype=dtype, name=name)
     self._num_units = num_units
     self._forget_bias = forget_bias
     self._use_peephole = use_peephole
@@ -380,7 +386,7 @@ class LSTMBlockCell(LayerRNNCell):
         "scope": "lstm_cell"
     }
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -391,10 +397,10 @@ class LSTMBlockCell(LayerRNNCell):
     return self._num_units
 
   def build(self, inputs_shape):
-    if not inputs_shape[1].value:
+    if not inputs_shape.dims[1].value:
       raise ValueError(
           "Expecting inputs_shape[1] to be set: %s" % str(inputs_shape))
-    input_size = inputs_shape[1].value
+    input_size = inputs_shape.dims[1].value
     self._kernel = self.add_variable(
         self._names["W"], [input_size + self._num_units, self._num_units * 4])
     self._bias = self.add_variable(
@@ -417,7 +423,7 @@ class LSTMBlockCell(LayerRNNCell):
       wcf = self._w_f_diag
       wco = self._w_o_diag
     else:
-      wci = wcf = wco = array_ops.zeros([self._num_units])
+      wci = wcf = wco = array_ops.zeros([self._num_units], dtype=self.dtype)
 
     (cs_prev, h_prev) = state
     (_, cs, _, _, _, _, h) = _lstm_block_cell(
@@ -437,6 +443,7 @@ class LSTMBlockCell(LayerRNNCell):
     return h, new_state
 
 
+@six.add_metaclass(abc.ABCMeta)
 class LSTMBlockWrapper(base_layer.Layer):
   """This is a helper class that provides housekeeping for LSTM cells.
 
@@ -509,10 +516,10 @@ class LSTMBlockWrapper(base_layer.Layer):
     inputs_shape = inputs.get_shape().with_rank(3)
     if not inputs_shape[2]:
       raise ValueError("Expecting inputs_shape[2] to be set: %s" % inputs_shape)
-    batch_size = inputs_shape[1].value
+    batch_size = inputs_shape.dims[1].value
     if batch_size is None:
       batch_size = array_ops.shape(inputs)[1]
-    time_len = inputs_shape[0].value
+    time_len = inputs_shape.dims[0].value
     if time_len is None:
       time_len = array_ops.shape(inputs)[0]
 
@@ -596,30 +603,33 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
                cell_clip=None,
                use_peephole=False,
                reuse=None,
+               dtype=None,
                name="lstm_fused_cell"):
     """Initialize the LSTM cell.
 
     Args:
       num_units: int, The number of units in the LSTM cell.
       forget_bias: float, The bias added to forget gates (see above).
-      cell_clip: clip the cell to this value. Default is no cell clipping.
+      cell_clip: clip the cell to this value. Defaults is no cell clipping.
       use_peephole: Whether to use peephole connections or not.
       reuse: (optional) boolean describing whether to reuse variables in an
         existing scope.  If not `True`, and the existing scope already has the
         given variables, an error is raised.
+      dtype: the dtype of variables of this layer.
       name: String, the name of the layer. Layers with the same name will
         share weights, but to avoid mistakes we require reuse=True in such
         cases.  By default this is "lstm_cell", for variable-name compatibility
         with `tf.nn.rnn_cell.LSTMCell`.
     """
-    super(LSTMBlockFusedCell, self).__init__(_reuse=reuse, name=name)
+    super(LSTMBlockFusedCell, self).__init__(
+        _reuse=reuse, name=name, dtype=dtype)
     self._num_units = num_units
     self._forget_bias = forget_bias
     self._cell_clip = cell_clip if cell_clip is not None else -1
     self._use_peephole = use_peephole
 
     # Inputs must be 3-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=3)
+    self.input_spec = input_spec.InputSpec(ndim=3)
 
   @property
   def num_units(self):
@@ -627,7 +637,7 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
     return self._num_units
 
   def build(self, input_shape):
-    input_size = input_shape[2].value
+    input_size = input_shape.dims[2].value
     self._kernel = self.add_variable(
         "kernel", [input_size + self._num_units, self._num_units * 4])
     self._bias = self.add_variable(
@@ -669,7 +679,7 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
     """
 
     inputs_shape = inputs.get_shape().with_rank(3)
-    time_len = inputs_shape[0].value
+    time_len = inputs_shape.dims[0].value
     if time_len is None:
       time_len = array_ops.shape(inputs)[0]
 
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 59a61af7b3d2d7f2f537c015754b6b0098749d17..8a1c09f171e6108174671e3122d5ff4c0b236003 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -30,7 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_array_ops
@@ -251,11 +251,13 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
       m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
 
     dtype = inputs.dtype
-    input_size = inputs.get_shape().with_rank(2)[1]
+    input_size = inputs.get_shape().with_rank(2).dims[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
     concat_w = _get_concat_variable(
-        "W", [input_size.value + num_proj, 3 * self._num_units], dtype,
+        "W",
+        [input_size.value + num_proj, 3 * self._num_units],
+        dtype,
         self._num_unit_shards)
 
     b = vs.get_variable(
@@ -429,7 +431,7 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
 
     # initialize the first freq state to be zero
     m_prev_freq = array_ops.zeros(
-        [inputs.shape[0].value or inputs.get_shape()[0], self._num_units],
+        [inputs.shape.dims[0].value or inputs.get_shape()[0], self._num_units],
         dtype)
     for fq in range(len(freq_inputs)):
       c_prev = array_ops.slice(state, [0, 2 * fq * self._num_units],
@@ -480,7 +482,7 @@ class TimeFreqLSTMCell(rnn_cell_impl.RNNCell):
     Raises:
       ValueError: if input_size cannot be inferred from static shape inference.
     """
-    input_size = input_feat.get_shape().with_rank(2)[-1].value
+    input_size = input_feat.get_shape().with_rank(2).dims[-1].value
     if input_size is None:
       raise ValueError("Cannot infer input_size from static shape inference.")
     num_feats = int(
@@ -636,7 +638,8 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
       ValueError: if an input_size was specified and the provided inputs have
         a different dimension.
     """
-    batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
+    batch_size = tensor_shape.dimension_value(
+        inputs.shape[0]) or array_ops.shape(inputs)[0]
     freq_inputs = self._make_tf_features(inputs)
     m_out_lst = []
     state_out_lst = []
@@ -886,7 +889,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
     Raises:
       ValueError: if input_size cannot be inferred from static shape inference.
     """
-    input_size = input_feat.get_shape().with_rank(2)[-1].value
+    input_size = input_feat.get_shape().with_rank(2).dims[-1].value
     if input_size is None:
       raise ValueError("Cannot infer input_size from static shape inference.")
     if slice_offset > 0:
@@ -910,7 +913,7 @@ class GridLSTMCell(rnn_cell_impl.RNNCell):
     if not self._start_freqindex_list:
       if len(self._num_frequency_blocks) != 1:
         raise ValueError("Length of num_frequency_blocks"
-                         " is not 1, but instead is %d",
+                         " is not 1, but instead is %d" %
                          len(self._num_frequency_blocks))
       num_feats = int(
           (input_size - self._feature_size) / (self._frequency_skip)) + 1
@@ -1058,7 +1061,8 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
       ValueError: if an input_size was specified and the provided inputs have
         a different dimension.
     """
-    batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
+    batch_size = tensor_shape.dimension_value(
+        inputs.shape[0]) or array_ops.shape(inputs)[0]
     fwd_inputs = self._make_tf_features(inputs)
     if self._backward_slice_offset:
       bwd_inputs = self._make_tf_features(inputs, self._backward_slice_offset)
@@ -1110,7 +1114,7 @@ _Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
 class AttentionCellWrapper(rnn_cell_impl.RNNCell):
   """Basic attention cell wrapper.
 
-  Implementation based on https://arxiv.org/abs/1409.0473.
+  Implementation based on https://arxiv.org/abs/1601.06733.
   """
 
   def __init__(self,
@@ -1289,7 +1293,7 @@ class HighwayWrapper(rnn_cell_impl.RNNCell):
       return self._cell.zero_state(batch_size, dtype)
 
   def _highway(self, inp, out):
-    input_size = inp.get_shape().with_rank(2)[1].value
+    input_size = inp.get_shape().with_rank(2).dims[1].value
     carry_weight = vs.get_variable("carry_w", [input_size, input_size])
     carry_bias = vs.get_variable(
         "carry_b", [input_size],
@@ -1536,7 +1540,7 @@ class NASCell(rnn_cell_impl.RNNCell):
     (c_prev, m_prev) = state
 
     dtype = inputs.dtype
-    input_size = inputs.get_shape().with_rank(2)[1]
+    input_size = inputs.get_shape().with_rank(2).dims[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
     # Variables for the NAS cell. W_m is all matrices multiplying the
@@ -1674,7 +1678,7 @@ class UGRNNCell(rnn_cell_impl.RNNCell):
     """
     sigmoid = math_ops.sigmoid
 
-    input_size = inputs.get_shape().with_rank(2)[1]
+    input_size = inputs.get_shape().with_rank(2).dims[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
@@ -1785,7 +1789,7 @@ class IntersectionRNNCell(rnn_cell_impl.RNNCell):
     sigmoid = math_ops.sigmoid
     tanh = math_ops.tanh
 
-    input_size = inputs.get_shape().with_rank(2)[1]
+    input_size = inputs.get_shape().with_rank(2).dims[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
@@ -2362,11 +2366,12 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
     """
     (c_prev, m_prev) = state
 
-    self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
+    self._batch_size = tensor_shape.dimension_value(
+        inputs.shape[0]) or array_ops.shape(inputs)[0]
 
     # If the input size is statically-known, calculate and validate its group
     # size.  Otherwise, use the output group size.
-    input_size = inputs.shape[1].value
+    input_size = tensor_shape.dimension_value(inputs.shape[1])
     if input_size is None:
       raise ValueError("input size must be statically known")
     if input_size % self._number_of_groups != 0:
@@ -2587,11 +2592,11 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
     for shape in shapes:
       if shape.ndims != 2:
         raise ValueError("linear is expecting 2D arguments: %s" % shapes)
-      if shape[1].value is None:
+      if tensor_shape.dimension_value(shape[1]) is None:
         raise ValueError("linear expects shape[1] to be provided for shape %s, "
                          "but saw %s" % (shape, shape[1]))
       else:
-        total_arg_size += shape[1].value
+        total_arg_size += tensor_shape.dimension_value(shape[1])
 
     dtype = [a.dtype for a in args][0]
 
@@ -2649,7 +2654,7 @@ class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
     (c_prev, m_prev) = state
 
     dtype = inputs.dtype
-    input_size = inputs.get_shape().with_rank(2)[1]
+    input_size = inputs.get_shape().with_rank(2).dims[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
     scope = vs.get_variable_scope()
@@ -2747,7 +2752,7 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
     self._activation = activation or math_ops.tanh
 
     # Restrict inputs to be 2-dimensional matrices
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -2758,11 +2763,11 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
     return self._num_units
 
   def build(self, inputs_shape):
-    if inputs_shape[1].value is None:
+    if tensor_shape.dimension_value(inputs_shape[1]) is None:
       raise ValueError(
           "Expected inputs.shape[-1] to be known, saw shape: %s" % inputs_shape)
 
-    input_depth = inputs_shape[1].value
+    input_depth = tensor_shape.dimension_value(inputs_shape[1])
 
     # pylint: disable=protected-access
     self._kernel = self.add_variable(
@@ -2935,11 +2940,11 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
     for shape in shapes:
       if shape.ndims != 2:
         raise ValueError("linear is expecting 2D arguments: %s" % shapes)
-      if shape[1].value is None:
+      if tensor_shape.dimension_value(shape[1]) is None:
         raise ValueError("linear expects shape[1] to be provided for shape %s, "
                          "but saw %s" % (shape, shape[1]))
       else:
-        total_arg_size += shape[1].value
+        total_arg_size += tensor_shape.dimension_value(shape[1])
 
     dtype = [a.dtype for a in args][0]
 
@@ -2955,7 +2960,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
         st = 0
         with ops.control_dependencies(None):
           for i in range(len(args)):
-            en = st + shapes[i][1].value
+            en = st + tensor_shape.dimension_value(shapes[i][1])
             wn.append(
                 self._normalize(weights[st:en, :], name="norm_{}".format(i)))
             st = en
@@ -3009,7 +3014,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
     sigmoid = math_ops.sigmoid
     c, h = state
 
-    input_size = inputs.get_shape().with_rank(2)[1]
+    input_size = inputs.get_shape().with_rank(2).dims[1]
     if input_size.value is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
@@ -3084,7 +3089,7 @@ class IndRNNCell(rnn_cell_impl.LayerRNNCell):
     super(IndRNNCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
@@ -3098,11 +3103,11 @@ class IndRNNCell(rnn_cell_impl.LayerRNNCell):
     return self._num_units
 
   def build(self, inputs_shape):
-    if inputs_shape[1].value is None:
+    if tensor_shape.dimension_value(inputs_shape[1]) is None:
       raise ValueError(
           "Expected inputs.shape[-1] to be known, saw shape: %s" % inputs_shape)
 
-    input_depth = inputs_shape[1].value
+    input_depth = tensor_shape.dimension_value(inputs_shape[1])
     # pylint: disable=protected-access
     self._kernel_w = self.add_variable(
         "%s_w" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
@@ -3178,7 +3183,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     super(IndyGRUCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
@@ -3194,11 +3199,11 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     return self._num_units
 
   def build(self, inputs_shape):
-    if inputs_shape[1].value is None:
+    if tensor_shape.dimension_value(inputs_shape[1]) is None:
       raise ValueError(
           "Expected inputs.shape[-1] to be known, saw shape: %s" % inputs_shape)
 
-    input_depth = inputs_shape[1].value
+    input_depth = tensor_shape.dimension_value(inputs_shape[1])
     # pylint: disable=protected-access
     self._gate_kernel_w = self.add_variable(
         "gates/%s_w" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
@@ -3318,7 +3323,7 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
     super(IndyLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._forget_bias = forget_bias
@@ -3335,11 +3340,11 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
     return self._num_units
 
   def build(self, inputs_shape):
-    if inputs_shape[1].value is None:
+    if tensor_shape.dimension_value(inputs_shape[1]) is None:
       raise ValueError(
           "Expected inputs.shape[-1] to be known, saw shape: %s" % inputs_shape)
 
-    input_depth = inputs_shape[1].value
+    input_depth = tensor_shape.dimension_value(inputs_shape[1])
     # pylint: disable=protected-access
     self._kernel_w = self.add_variable(
         "%s_w" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
@@ -3439,7 +3444,7 @@ class MinimalRNNCell(rnn_cell_impl.LayerRNNCell):
     super(MinimalRNNCell, self).__init__(name=name, dtype=dtype, **kwargs)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self.units = units
     self.activation = activations.get(activation)
@@ -3492,12 +3497,13 @@ class MinimalRNNCell(rnn_cell_impl.LayerRNNCell):
         static shape inference.
     """
     input_size = inputs.get_shape()[1]
-    if input_size.value is None:
+    if tensor_shape.dimension_value(input_size) is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
     feedforward_weight, gate_weight = array_ops.split(
         value=self.kernel,
-        num_or_size_splits=[input_size.value, 2 * self.units],
+        num_or_size_splits=[tensor_shape.dimension_value(input_size),
+                            2 * self.units],
         axis=0)
 
     feedforward = math_ops.matmul(inputs, feedforward_weight)
@@ -3510,3 +3516,132 @@ class MinimalRNNCell(rnn_cell_impl.LayerRNNCell):
 
     new_h = u * state + (1 - u) * feedforward
     return new_h, new_h
+
+
+class CFNCell(rnn_cell_impl.LayerRNNCell):
+  """Chaos Free Network cell.
+
+  The implementation is based on:
+
+    https://openreview.net/pdf?id=S1dIzvclg
+
+  Thomas Laurent, James von Brecht.
+  "A recurrent neural network without chaos." ICLR, 2017.
+
+  A CFN cell first projects the input to the hidden space. The hidden state
+  goes through a contractive mapping. The new hidden state is then calcuated
+  as a linear combination of the projected input and the contracted previous
+  hidden state, using decoupled input and forget gates.
+  """
+
+  def __init__(self,
+               units,
+               activation="tanh",
+               kernel_initializer="glorot_uniform",
+               bias_initializer="ones",
+               name=None,
+               dtype=None,
+               **kwargs):
+    """Initialize the parameters for a CFN cell.
+
+    Args:
+      units: int, The number of units in the CFN cell.
+      activation: Nonlinearity to use. Default: `tanh`.
+      kernel_initializer: Initializer for the `kernel` weights
+        matrix. Default: `glorot_uniform`.
+      bias_initializer: The initializer to use for the bias in the
+        gates. Default: `ones`.
+      name: String, the name of the cell.
+      dtype: Default dtype of the cell.
+      **kwargs: Dict, keyword named properties for common cell attributes.
+    """
+    super(CFNCell, self).__init__(name=name, dtype=dtype, **kwargs)
+
+    # Inputs must be 2-dimensional.
+    self.input_spec = input_spec.InputSpec(ndim=2)
+
+    self.units = units
+    self.activation = activations.get(activation)
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+  @property
+  def state_size(self):
+    return self.units
+
+  @property
+  def output_size(self):
+    return self.units
+
+  def build(self, inputs_shape):
+    if inputs_shape[-1] is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
+                       % str(inputs_shape))
+
+    input_size = inputs_shape[-1]
+    # pylint: disable=protected-access
+    # `self.kernel` contains V_{\theta}, V_{\eta}, W.
+    # `self.recurrent_kernel` contains U_{\theta}, U_{\eta}.
+    # `self.bias` contains b_{\theta}, b_{\eta}.
+    self.kernel = self.add_weight(
+        shape=[input_size, 3 * self.units],
+        name=rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+        initializer=self.kernel_initializer)
+    self.recurrent_kernel = self.add_weight(
+        shape=[self.units, 2 * self.units],
+        name="recurrent_%s" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+        initializer=self.kernel_initializer)
+    self.bias = self.add_weight(
+        shape=[2 * self.units],
+        name=rnn_cell_impl._BIAS_VARIABLE_NAME,
+        initializer=self.bias_initializer)
+    # pylint: enable=protected-access
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Run one step of CFN.
+
+    Args:
+      inputs: input Tensor, must be 2-D, `[batch, input_size]`.
+      state: state Tensor, must be 2-D, `[batch, state_size]`.
+
+    Returns:
+      A tuple containing:
+
+      - Output: A `2-D` tensor with shape `[batch_size, state_size]`.
+      - New state: A `2-D` tensor with shape `[batch_size, state_size]`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    input_size = inputs.get_shape()[-1]
+    if tensor_shape.dimension_value(input_size) is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+    # The variable names u, v, w, b are consistent with the notations in the
+    # original paper.
+    v, w = array_ops.split(
+        value=self.kernel,
+        num_or_size_splits=[2 * self.units, self.units],
+        axis=1)
+    u = self.recurrent_kernel
+    b = self.bias
+
+    gates = math_ops.matmul(state, u) + math_ops.matmul(inputs, v)
+    gates = nn_ops.bias_add(gates, b)
+    gates = math_ops.sigmoid(gates)
+    theta, eta = array_ops.split(value=gates,
+                                 num_or_size_splits=2,
+                                 axis=1)
+
+    proj_input = math_ops.matmul(inputs, w)
+
+    # The input gate is (1 - eta), which is different from the original paper.
+    # This is for the propose of initialization. With the default
+    # bias_initializer `ones`, the input gate is initialized to a small number.
+    new_h = theta * self.activation(state) + (1 - eta) * self.activation(
+        proj_input)
+
+    return new_h, new_h
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 291ff83791c7cded2dccc4719bb12e84f00afa42..269443b2c6508bb618d30f64487b1a6a84e8646f 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -82,7 +82,6 @@ py_library(
     name = "keras_saved_model",
     srcs = ["python/saved_model/keras_saved_model.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/python:array_ops",
@@ -103,7 +102,10 @@ py_test(
     size = "medium",
     srcs = ["python/saved_model/keras_saved_model_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "no_oss",  # TODO(b/119349471): Re-enable
+        "no_windows",
+    ],
     deps = [
         ":keras_saved_model",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
index 2c5c8c4afdc5778e3bb182d0a492d20e758baf14..ffba514bb96f5ce8d963cb0a0482738eafe88355 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -19,16 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import six
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import keras as estimator_keras_util
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.export import export as export_helpers
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import models as models_lib
 from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.metrics import Metric
 from tensorflow.python.keras.models import model_from_json
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
@@ -72,6 +74,25 @@ def save_keras_model(
   share variables. To use the train graph with evaluation or prediction graphs,
   create a new checkpoint if variable values have been updated.
 
+  Example:
+
+  ```python
+  import tensorflow as tf
+
+  # Create a tf.keras model.
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
+  model.summary()
+
+  # Save the tf.keras model in the SavedModel format.
+  saved_to_path = tf.contrib.saved_model.save_keras_model(
+        model, '/tmp/my_simple_tf_keras_saved_model')
+
+  # Load the saved keras model back.
+  model_prime = tf.contrib.saved_model.load_keras_model(saved_to_path)
+  model_prime.summary()
+  ```
+
   Args:
     model: A `tf.keras.Model` to be saved.
     saved_model_path: a string specifying the path to the SavedModel directory.
@@ -85,15 +106,26 @@ def save_keras_model(
     String path to the SavedModel folder, a subdirectory of `saved_model_path`.
 
   Raises:
-    NotImplementedError: If the passed in model is a subclassed model.
+    NotImplementedError: If the model is a subclassed model.
+    ValueError: If a Sequential model does not have input shapes defined by the
+      user, and is not built.
   """
   if not model._is_graph_network:
-    raise NotImplementedError
+    if isinstance(model, sequential.Sequential):
+      # If input shape is not directly set in the model, the exported model
+      # will assume that the inputs have the same shape as the shape the model
+      # was built model with.
+      if not model.built:
+        raise ValueError(
+            'Sequential model must be built before it can be exported.')
+    else:
+      raise NotImplementedError(
+          'Exporting subclassed models is not yet supported.')
 
   export_dir = export_helpers.get_timestamped_export_dir(saved_model_path)
   temp_export_dir = export_helpers.get_temp_export_dir(export_dir)
 
-  builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
+  builder = saved_model_builder._SavedModelBuilder(temp_export_dir)
 
   # Manually save variables to export them in an object-based checkpoint. This
   # skips the `builder.add_meta_graph_and_variables()` step, which saves a
@@ -195,9 +227,10 @@ def _export_mode(
       g.add_to_collection(ops.GraphKeys.GLOBAL_STEP, clone.optimizer.iterations)
 
     # Extract update and train ops from train/test/predict functions.
+    train_op = None
     if mode == model_fn_lib.ModeKeys.TRAIN:
       clone._make_train_function()
-      builder._add_train_op(clone.train_function.updates_op)
+      train_op = clone.train_function.updates_op
     elif mode == model_fn_lib.ModeKeys.EVAL:
       clone._make_test_function()
     else:
@@ -232,7 +265,8 @@ def _export_mode(
         model_fn_lib.EXPORT_TAG_MAP[mode],
         signature_def_map=_create_signature_def_map(clone, mode),
         saver=saver_lib.Saver(clone_var_list),
-        main_op=variables.local_variables_initializer())
+        init_op=variables.local_variables_initializer(),
+        train_op=train_op)
     return None
 
 
@@ -245,42 +279,40 @@ def _create_signature_def_map(model, mode):
     inputs_dict.update(targets_dict)
   outputs_dict = {name: x
                   for name, x in zip(model.output_names, model.outputs)}
+  metrics = estimator_keras_util._convert_keras_metrics_to_estimator(model)
+
+  # Add metric variables to the `LOCAL_VARIABLES` collection. Metric variables
+  # are by default not added to any collections. We are doing this here, so
+  # that metric variables get initialized.
+  local_vars = set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
+  vars_to_add = set()
+  if metrics is not None:
+    for key, value in six.iteritems(metrics):
+      if isinstance(value, Metric):
+        vars_to_add.update(value.variables)
+        # Convert Metric instances to (value_tensor, update_op) tuple.
+        metrics[key] = (value.result(), value.updates[0])
+  # Remove variables that are in the local variables collection already.
+  vars_to_add = vars_to_add.difference(local_vars)
+  for v in vars_to_add:
+    ops.add_to_collection(ops.GraphKeys.LOCAL_VARIABLES, v)
+
   export_outputs = model_fn_lib.export_outputs_for_mode(
       mode,
       predictions=outputs_dict,
       loss=model.total_loss if model.optimizer else None,
-      metrics=estimator_keras_util._convert_keras_metrics_to_estimator(model))
+      metrics=metrics)
   return export_helpers.build_all_signature_defs(
       inputs_dict,
       export_outputs=export_outputs,
       serving_only=(mode == model_fn_lib.ModeKeys.PREDICT))
 
 
-def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):
+def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
   """Assert model and clone contain the same checkpointable objects."""
 
-  def get_non_optimizer_objects(m, g):
-    """Gather set of model and optimizer checkpointable objects."""
-    # Set default graph because optimizer.variables() returns optimizer
-    # variables defined in the default graph.
-    with g.as_default():
-      all_objects = set(checkpointable_utils.list_objects(m))
-      optimizer_and_variables = set()
-      for obj in all_objects:
-        if isinstance(obj, optimizers.TFOptimizer):
-          optimizer_and_variables.update(checkpointable_utils.list_objects(obj))
-          optimizer_and_variables.update(set(obj.optimizer.variables()))
-      return all_objects - optimizer_and_variables
-
-  model_objects = get_non_optimizer_objects(model, model_graph)
-  clone_objects = get_non_optimizer_objects(clone, clone_graph)
-
-  if len(model_objects) != len(clone_objects):
-    raise errors.InternalError(
-        None, None,
-        'Model and clone must use the same variables.'
-        '\n\tModel variables: %s\n\t Clone variables: %s'
-        % (model_objects, clone_objects))
+  # TODO(fchollet, kathywu): make sure this works in eager mode.
+  return True
 
 
 def load_keras_model(saved_model_path):
@@ -291,6 +323,25 @@ def load_keras_model(saved_model_path):
      from metagraph).
   2) loading model weights from checkpoint.
 
+  Example:
+
+  ```python
+  import tensorflow as tf
+
+  # Create a tf.keras model.
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
+  model.summary()
+
+  # Save the tf.keras model in the SavedModel format.
+  saved_to_path = tf.contrib.saved_model.save_keras_model(
+        model, '/tmp/my_simple_tf_keras_saved_model')
+
+  # Load the saved keras model back.
+  model_prime = tf.contrib.saved_model.load_keras_model(saved_to_path)
+  model_prime.summary()
+  ```
+
   Args:
     saved_model_path: a string specifying the path to an existing SavedModel.
 
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
index 060c5045235ced50adf38222a0152a1700a252e8..93d73e1b484ed810fb347b13e95022dfca3584c2 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
@@ -29,14 +29,12 @@ from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import training as training_module
@@ -150,8 +148,6 @@ class TestModelSavingandLoading(test.TestCase):
       x = np.random.random((1, 3))
       y = np.random.random((1, 3))
       model.train_on_batch(x, y)
-      model.train_on_batch(x, y)
-
       ref_y = model.predict(x)
 
       temp_saved_model = self._save_model_dir()
@@ -237,6 +233,15 @@ def sequential_model(uses_learning_phase):
   return model
 
 
+def sequential_model_without_input_shape(uses_learning_phase):
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(2))
+  model.add(keras.layers.Dense(3))
+  if uses_learning_phase:
+    model.add(LayerWithLearningPhase())
+  return model
+
+
 def load_model(sess, path, mode):
   tags = model_fn_lib.EXPORT_TAG_MAP[mode]
   sig_def_key = (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
@@ -248,7 +253,7 @@ def load_model(sess, path, mode):
   outputs = {
       k: sess.graph.get_tensor_by_name(v.name)
       for k, v in meta_graph_def.signature_def[sig_def_key].outputs.items()}
-  return inputs, outputs
+  return inputs, outputs, meta_graph_def
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -260,16 +265,46 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
     return os.path.join(temp_dir, dirname)
 
   @parameterized.parameters(
-      (functional_model, True, training_module.AdadeltaOptimizer(), True),
-      (functional_model, True, training_module.AdadeltaOptimizer(), False),
-      (functional_model, False, None, False),
-      (sequential_model, True, training_module.AdadeltaOptimizer(), True),
-      (sequential_model, True, training_module.AdadeltaOptimizer(), False),
-      (sequential_model, False, None, False))
+      {
+          'model_builder': functional_model,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': True},
+      {
+          'model_builder': functional_model,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': False},
+      {
+          'model_builder': functional_model,
+          'uses_learning_phase': False,
+          'optimizer': None,
+          'train_before_export': False},
+      {
+          'model_builder': sequential_model,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': True},
+      {
+          'model_builder': sequential_model,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': False},
+      {
+          'model_builder': sequential_model,
+          'uses_learning_phase': False,
+          'optimizer': None,
+          'train_before_export': False},
+      {
+          'model_builder': sequential_model_without_input_shape,
+          'uses_learning_phase': True,
+          'optimizer': training_module.AdadeltaOptimizer(),
+          'train_before_export': False})
   def testSaveAndLoadSavedModelExport(
       self, model_builder, uses_learning_phase, optimizer, train_before_export):
     saved_model_path = self._save_model_dir()
     with self.session(graph=ops.Graph()):
+      np.random.seed(130)
       input_arr = np.random.random((1, 3))
       target_arr = np.random.random((1, 3))
 
@@ -295,8 +330,8 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
 
     # Load predict graph, and test predictions
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs = load_model(sess, output_path,
-                                   model_fn_lib.ModeKeys.PREDICT)
+      inputs, outputs, _ = load_model(sess, output_path,
+                                      model_fn_lib.ModeKeys.PREDICT)
 
       predictions = sess.run(outputs[output_name],
                              {inputs[input_name]: input_arr})
@@ -305,33 +340,41 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
     if optimizer:
       # Load eval graph, and test predictions, loss and metric values
       with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs = load_model(sess, output_path,
-                                     model_fn_lib.ModeKeys.EVAL)
-
-        eval_results = sess.run(outputs, {inputs[input_name]: input_arr,
-                                          inputs[target_name]: target_arr})
+        inputs, outputs, _ = load_model(sess, output_path,
+                                        model_fn_lib.ModeKeys.EVAL)
+
+        # First obtain the loss and predictions, and run the metric update op by
+        # feeding in the inputs and targets.
+        loss, predictions, _ = sess.run(
+            (outputs['loss'], outputs['predictions/' + output_name],
+             outputs['metrics/mean_absolute_error/update_op']), {
+                 inputs[input_name]: input_arr,
+                 inputs[target_name]: target_arr
+             })
+
+        # The metric value should be run after the update op, to ensure that it
+        # reflects the correct value.
+        metric_value = sess.run(outputs['metrics/mean_absolute_error/value'])
 
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
-        self.assertAllClose(ref_loss, eval_results['loss'], atol=1e-05)
-        self.assertAllClose(
-            ref_mae, eval_results['metrics/mae/update_op'], atol=1e-05)
-        self.assertAllClose(
-            ref_predict, eval_results['predictions/' + output_name], atol=1e-05)
+        self.assertAllClose(ref_loss, loss, atol=1e-05)
+        self.assertAllClose(ref_mae, metric_value, atol=1e-05)
+        self.assertAllClose(ref_predict, predictions, atol=1e-05)
 
       # Load train graph, and check for the train op, and prediction values
       with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs = load_model(sess, output_path,
-                                     model_fn_lib.ModeKeys.TRAIN)
+        inputs, outputs, meta_graph_def = load_model(
+            sess, output_path, model_fn_lib.ModeKeys.TRAIN)
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
         self.assertIn('loss', outputs)
-        self.assertIn('metrics/mae/update_op', outputs)
-        self.assertIn('metrics/mae/value', outputs)
+        self.assertIn('metrics/mean_absolute_error/update_op', outputs)
+        self.assertIn('metrics/mean_absolute_error/value', outputs)
         self.assertIn('predictions/' + output_name, outputs)
 
         # Train for a step
-        train_op = ops.get_collection(constants.TRAIN_OP_KEY)
+        train_op = loader_impl.get_train_op(meta_graph_def)
         train_outputs, _ = sess.run(
             [outputs, train_op], {inputs[input_name]: input_arr,
                                   inputs[target_name]: target_arr})
@@ -358,8 +401,8 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       output_path = keras_saved_model.save_keras_model(
           model, saved_model_path, custom_objects={'relu6': relu6})
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs = load_model(sess, output_path,
-                                   model_fn_lib.ModeKeys.PREDICT)
+      inputs, outputs, _ = load_model(sess, output_path,
+                                      model_fn_lib.ModeKeys.PREDICT)
       input_name = model.input_names[0]
       output_name = model.output_names[0]
       predictions = sess.run(
@@ -420,10 +463,12 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
       clone.train_on_batch(input_arr, target_arr)
 
+  def testSaveSeqModelWithoutInputShapesRaisesError(self):
+    """A Sequential model that hasn't been built should raise an error."""
+    model = sequential_model_without_input_shape(True)
     with self.assertRaisesRegexp(
-        errors.InternalError, 'Model and clone must use the same variables.'):
-      keras_saved_model._assert_same_non_optimizer_objects(
-          model, model_graph, clone, clone_graph)
+        ValueError, 'must be built'):
+      keras_saved_model.save_keras_model(model, '')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 1f3b533de9592d1ed3b27980e25911fe94a829f6..922f21b98b35dfff19c8c605a25e89c5d2da8d98 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -154,13 +154,13 @@ class AttentionWrapperTest(test.TestCase):
 
     if attention_layer_sizes is not None:
       # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
-      attention_depth = sum([attention_layer_size or encoder_output_depth
-                             for attention_layer_size in attention_layer_sizes])
+      attention_depth = sum(attention_layer_size or encoder_output_depth
+                            for attention_layer_size in attention_layer_sizes)
     elif attention_layers is not None:
       # Compute sum of attention_layers output depth.
       attention_depth = sum(
           attention_layer.compute_output_shape(
-              [batch_size, cell_depth + encoder_output_depth])[-1].value
+              [batch_size, cell_depth + encoder_output_depth]).dims[-1].value
           for attention_layer in attention_layers)
     else:
       attention_depth = encoder_output_depth * len(create_attention_mechanisms)
@@ -181,7 +181,7 @@ class AttentionWrapperTest(test.TestCase):
         for creator, depth in zip(create_attention_mechanisms,
                                   attention_mechanism_depths)]
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with vs.variable_scope(
           'root',
           initializer=init_ops.random_normal_initializer(stddev=0.01, seed=3)):
@@ -724,7 +724,7 @@ class AttentionWrapperTest(test.TestCase):
   def testBahdanauMonotonicHard(self):
     # Run attention mechanism with mode='hard', make sure probabilities are hard
     b, t, u, d = 10, 20, 30, 40
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       a = wrapper.BahdanauMonotonicAttention(
           d,
           random_ops.random_normal((b, t, u)),
@@ -898,7 +898,7 @@ class AttentionWrapperTest(test.TestCase):
   def testLuongMonotonicHard(self):
     # Run attention mechanism with mode='hard', make sure probabilities are hard
     b, t, u, d = 10, 20, 30, 40
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       a = wrapper.LuongMonotonicAttention(
           d,
           random_ops.random_normal((b, t, u)),
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index fa3f074c67ce44e592146f451e7c2fac0617c63a..b7f9f3fb090356a1c8d2bfb5044712ff93e267ce 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -50,7 +50,7 @@ class BasicDecoderTest(test.TestCase):
     cell_depth = 10
     output_layer_depth = 3
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       inputs = np.random.randn(batch_size, max_time,
                                input_depth).astype(np.float32)
       cell = rnn_cell.LSTMCell(cell_depth)
@@ -136,7 +136,7 @@ class BasicDecoderTest(test.TestCase):
     start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
     end_token = 1
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       embeddings = np.random.randn(vocabulary_size,
                                    input_depth).astype(np.float32)
       cell = rnn_cell.LSTMCell(vocabulary_size)
@@ -209,7 +209,7 @@ class BasicDecoderTest(test.TestCase):
     start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
     end_token = 1
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with variable_scope.variable_scope(
           "testStepWithSampleEmbeddingHelper",
           initializer=init_ops.constant_initializer(0.01)):
@@ -278,7 +278,7 @@ class BasicDecoderTest(test.TestCase):
     input_depth = 7
     vocabulary_size = 10
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       inputs = np.random.randn(
           batch_size, max_time, input_depth).astype(np.float32)
       embeddings = np.random.randn(
@@ -371,7 +371,7 @@ class BasicDecoderTest(test.TestCase):
     else:
       auxiliary_inputs = None
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       inputs = np.random.randn(batch_size, max_time,
                                input_depth).astype(np.float32)
       cell = rnn_cell.LSTMCell(cell_depth)
@@ -523,7 +523,7 @@ class BasicDecoderTest(test.TestCase):
         lambda x: array_ops.one_hot(x, vocabulary_size, dtype=dtypes.float32))
     end_fn = lambda sample_ids: math_ops.equal(sample_ids, end_token)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with variable_scope.variable_scope(
           "testStepWithInferenceHelper",
           initializer=init_ops.constant_initializer(0.01)):
@@ -604,7 +604,7 @@ class BasicDecoderTest(test.TestCase):
     next_inputs_fn = math_ops.to_float
     end_fn = lambda sample_ids: sample_ids[:, end_token]
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with variable_scope.variable_scope(
           "testStepWithInferenceHelper",
           initializer=init_ops.constant_initializer(0.01)):
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
index 9662a5780a083f41060cfa6624f249ed328d8112..b41734d214e98cd24be0c98ee67f7cb5e58b7a61 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_ops_test.py
@@ -49,7 +49,7 @@ class GatherTreeTest(test.TestCase):
         parent_ids=parent_ids,
         max_sequence_lengths=max_sequence_lengths,
         end_token=end_token)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(expected_result, beams.eval())
 
   def testBadParentValuesOnCPU(self):
@@ -93,7 +93,7 @@ class GatherTreeTest(test.TestCase):
           parent_ids=parent_ids,
           max_sequence_lengths=max_sequence_lengths,
           end_token=end_token)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(expected_result, beams.eval())
 
   def testGatherTreeBatch(self):
@@ -103,7 +103,7 @@ class GatherTreeTest(test.TestCase):
     max_sequence_lengths = [0, 1, 2, 4, 7, 8, 9, 10, 11, 0]
     end_token = 5
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       step_ids = np.random.randint(
           0, high=end_token + 1, size=(max_time, batch_size, beam_width))
       parent_ids = np.random.randint(
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
index b549cbf568f254cbf18456145af751a8245dd379..4c25489fade320f2f2218354343021a71af01baf 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_test.py
@@ -44,7 +44,7 @@ class DynamicDecodeRNNTest(test.TestCase):
     cell_depth = 10
     max_out = max(sequence_length)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       if time_major:
         inputs = np.random.randn(max_time, batch_size,
                                  input_depth).astype(np.float32)
@@ -126,7 +126,7 @@ class DynamicDecodeRNNTest(test.TestCase):
     cell_depth = 10
     max_out = max(sequence_length)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       inputs = np.random.randn(batch_size, max_time,
                                input_depth).astype(np.float32)
 
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
index 35c601a4bcf795ab951218851a3699b3288a69b1..5aa32b532ffcf5772f6ace26662f5e5471cf6923 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
@@ -34,7 +34,7 @@ from tensorflow.python.platform import test
 class LossTest(test.TestCase):
 
   def testSequenceLoss(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with variable_scope.variable_scope(
           'root', initializer=init_ops.constant_initializer(0.5)):
         batch_size = 2
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 0ba32cd3bf8a374f5f55bdc6b2325b03443cd545..77e9f848b137911b53e1b4df5dd740fe38af55bb 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -108,13 +108,14 @@ def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
         maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
         dtype=nest.flatten(memory)[0].dtype)
     seq_len_batch_size = (
-        memory_sequence_length.shape[0].value
+        tensor_shape.dimension_value(memory_sequence_length.shape[0])
         or array_ops.shape(memory_sequence_length)[0])
   def _maybe_mask(m, seq_len_mask):
     rank = m.get_shape().ndims
     rank = rank if rank is not None else array_ops.rank(m)
     extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
-    m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
+    m_batch_size = tensor_shape.dimension_value(
+        m.shape[0]) or array_ops.shape(m)[0]
     if memory_sequence_length is not None:
       message = ("memory_sequence_length and memory tensor batch sizes do not "
                  "match.")
@@ -215,9 +216,10 @@ class _BaseAttentionMechanism(AttentionMechanism):
           self.memory_layer(self._values) if self.memory_layer  # pylint: disable=not-callable
           else self._values)
       self._batch_size = (
-          self._keys.shape[0].value or array_ops.shape(self._keys)[0])
-      self._alignments_size = (self._keys.shape[1].value or
-                               array_ops.shape(self._keys)[1])
+          tensor_shape.dimension_value(self._keys.shape[0]) or
+          array_ops.shape(self._keys)[0])
+      self._alignments_size = (tensor_shape.dimension_value(self._keys.shape[1])
+                               or array_ops.shape(self._keys)[1])
 
   @property
   def memory_layer(self):
@@ -463,7 +465,8 @@ def _bahdanau_score(processed_query, keys, normalize):
   """
   dtype = processed_query.dtype
   # Get the number of hidden units from the trailing dimension of keys
-  num_units = keys.shape[2].value or array_ops.shape(keys)[2]
+  num_units = tensor_shape.dimension_value(
+      keys.shape[2]) or array_ops.shape(keys)[2]
   # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
   processed_query = array_ops.expand_dims(processed_query, 1)
   v = variable_scope.get_variable(
@@ -649,8 +652,9 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
   previous_attention = ops.convert_to_tensor(
       previous_attention, name="previous_attention")
   if mode == "recursive":
-    # Use .shape[0].value when it's not None, or fall back on symbolic shape
-    batch_size = p_choose_i.shape[0].value or array_ops.shape(p_choose_i)[0]
+    # Use .shape[0] when it's not None, or fall back on symbolic shape
+    batch_size = tensor_shape.dimension_value(
+        p_choose_i.shape[0]) or array_ops.shape(p_choose_i)[0]
     # Compute [1, 1 - p_choose_i[0], 1 - p_choose_i[1], ..., 1 - p_choose_i[-2]]
     shifted_1mp_choose_i = array_ops.concat(
         [array_ops.ones((batch_size, 1)), 1 - p_choose_i[:, :-1]], 1)
@@ -1035,8 +1039,8 @@ def hardmax(logits, name=None):
   """
   with ops.name_scope(name, "Hardmax", [logits]):
     logits = ops.convert_to_tensor(logits, name="logits")
-    if logits.get_shape()[-1].value is not None:
-      depth = logits.get_shape()[-1].value
+    if tensor_shape.dimension_value(logits.get_shape()[-1]) is not None:
+      depth = tensor_shape.dimension_value(logits.get_shape()[-1])
     else:
       depth = array_ops.shape(logits)[-1]
     return array_ops.one_hot(
@@ -1224,15 +1228,16 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             "layer per attention_mechanism, saw: %d vs %d"
             % (len(self._attention_layers), len(attention_mechanisms)))
       self._attention_layer_size = sum(
-          layer.compute_output_shape(
+          tensor_shape.dimension_value(layer.compute_output_shape(
               [None,
-               cell.output_size + mechanism.values.shape[-1].value])[-1].value
+               cell.output_size + tensor_shape.dimension_value(
+                   mechanism.values.shape[-1])])[-1])
           for layer, mechanism in zip(
               self._attention_layers, attention_mechanisms))
     else:
       self._attention_layers = None
       self._attention_layer_size = sum(
-          attention_mechanism.values.get_shape()[-1].value
+          tensor_shape.dimension_value(attention_mechanism.values.shape[-1])
           for attention_mechanism in attention_mechanisms)
 
     self._cell = cell
@@ -1246,7 +1251,7 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
       else:
         final_state_tensor = nest.flatten(initial_cell_state)[-1]
         state_batch_size = (
-            final_state_tensor.shape[0].value
+            tensor_shape.dimension_value(final_state_tensor.shape[0])
             or array_ops.shape(final_state_tensor)[0])
         error_message = (
             "When constructing AttentionWrapper %s: " % self._base_name +
@@ -1412,7 +1417,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
     cell_output, next_cell_state = self._cell(cell_inputs, cell_state)
 
     cell_batch_size = (
-        cell_output.shape[0].value or array_ops.shape(cell_output)[0])
+        tensor_shape.dimension_value(cell_output.shape[0]) or
+        array_ops.shape(cell_output)[0])
     error_message = (
         "When applying AttentionWrapper %s: " % self.name +
         "Non-matching batch sizes between the memory "
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index 605e3143fd2459d098ee967568e9f2fa0073d0c5..ab36848f13ab3078cd232c18f140188e12db703b 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -85,7 +85,8 @@ def _tile_batch(t, multiplier):
   tiling = [1] * (t.shape.ndims + 1)
   tiling[1] = multiplier
   tiled_static_batch_size = (
-      t.shape[0].value * multiplier if t.shape[0].value is not None else None)
+      t.shape.dims[0].value * multiplier
+      if t.shape.dims[0].value is not None else None)
   tiled = array_ops.tile(array_ops.expand_dims(t, 1), tiling)
   tiled = array_ops.reshape(tiled,
                             array_ops.concat(
@@ -138,9 +139,9 @@ def gather_tree_from_array(t, parent_ids, sequence_length):
     A `Tensor` which is a stacked `TensorArray` of the same size and type as
     `t` and where beams are sorted in each `Tensor` according to `parent_ids`.
   """
-  max_time = parent_ids.shape[0].value or array_ops.shape(parent_ids)[0]
-  batch_size = parent_ids.shape[1].value or array_ops.shape(parent_ids)[1]
-  beam_width = parent_ids.shape[2].value or array_ops.shape(parent_ids)[2]
+  max_time = parent_ids.shape.dims[0].value or array_ops.shape(parent_ids)[0]
+  batch_size = parent_ids.shape.dims[1].value or array_ops.shape(parent_ids)[1]
+  beam_width = parent_ids.shape.dims[2].value or array_ops.shape(parent_ids)[2]
 
   # Generate beam ids that will be reordered by gather_tree.
   beam_ids = array_ops.expand_dims(
@@ -191,9 +192,9 @@ def _check_static_batch_beam_maybe(shape, batch_size, beam_width):
   reshaped to [batch_size, beam_size, -1].
   """
   reshaped_shape = tensor_shape.TensorShape([batch_size, beam_width, None])
-  if (batch_size is not None and shape[0].value is not None
+  if (batch_size is not None and shape.dims[0].value is not None
       and (shape[0] != batch_size * beam_width
-           or (shape.ndims >= 2 and shape[1].value is not None
+           or (shape.ndims >= 2 and shape.dims[1].value is not None
                and (shape[0] != batch_size or shape[1] != beam_width)))):
     tf_logging.warn("TensorArray reordering expects elements to be "
                     "reshapable to %s which is incompatible with the "
@@ -722,7 +723,7 @@ def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
   total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs
 
   # Calculate the continuation lengths by adding to all continuing beams.
-  vocab_size = logits.shape[-1].value or array_ops.shape(logits)[-1]
+  vocab_size = logits.shape.dims[-1].value or array_ops.shape(logits)[-1]
   lengths_to_add = array_ops.one_hot(
       indices=array_ops.fill([batch_size, beam_width], end_token),
       depth=vocab_size,
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
index 6bd58c4d322c04d4d14d04678e24a05c0f876208..5e4f130b31483204a111e2f778fa5d0fc4526fea 100644
--- a/tensorflow/contrib/signal/BUILD
+++ b/tensorflow/contrib/signal/BUILD
@@ -4,129 +4,11 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
-load("//tensorflow:tensorflow.bzl", "py_test")  # @unused
-
 py_library(
     name = "signal_py",
-    srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:spectral_ops",
-        "//tensorflow/python:tensor_util",
-        "//tensorflow/python:util",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_library(
-    name = "test_util",
-    srcs = ["python/kernel_tests/test_util.py"],
+    srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:tf_optimizer",
-        "//tensorflow/python:training",
-    ],
-)
-
-cuda_py_tests(
-    name = "mel_ops_test",
-    srcs = ["python/kernel_tests/mel_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        ":test_util",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
-cuda_py_tests(
-    name = "mfcc_ops_test",
-    srcs = ["python/kernel_tests/mfcc_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-)
-
-cuda_py_tests(
-    name = "reconstruction_ops_test",
-    srcs = ["python/kernel_tests/reconstruction_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_tests(
-    name = "shape_ops_test",
-    srcs = ["python/kernel_tests/shape_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        ":test_util",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_tests(
-    name = "spectral_ops_test",
-    size = "large",
-    srcs = ["python/kernel_tests/spectral_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-    tags = ["nomac"],
-)
-
-cuda_py_tests(
-    name = "window_ops_test",
-    srcs = ["python/kernel_tests/window_ops_test.py"],
-    additional_deps = [
-        ":signal_py",
-        ":test_util",
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python/ops/signal",
     ],
 )
diff --git a/tensorflow/contrib/signal/__init__.py b/tensorflow/contrib/signal/__init__.py
index d088e744346aac0aa8675b95d7b792379fc7b019..d01f5ccf51c132082a419ec7db49045ef8bab725 100644
--- a/tensorflow/contrib/signal/__init__.py
+++ b/tensorflow/contrib/signal/__init__.py
@@ -14,6 +14,9 @@
 # ==============================================================================
 """Signal processing operations.
 
+`tf.contrib.signal` has been renamed to `tf.signal`. `tf.contrib.signal` will be
+removed in TensorFlow 2.0.
+
 See the
 [Contrib Signal](https://tensorflow.org/api_guides/python/contrib.signal)
 guide.
@@ -39,18 +42,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.signal.python.ops.mel_ops import linear_to_mel_weight_matrix
-from tensorflow.contrib.signal.python.ops.mfcc_ops import mfccs_from_log_mel_spectrograms
-from tensorflow.contrib.signal.python.ops.reconstruction_ops import overlap_and_add
-from tensorflow.contrib.signal.python.ops.shape_ops import frame
+from tensorflow.python.ops.signal.mel_ops import linear_to_mel_weight_matrix
+from tensorflow.python.ops.signal.mfcc_ops import mfccs_from_log_mel_spectrograms
+from tensorflow.python.ops.signal.reconstruction_ops import overlap_and_add
+from tensorflow.python.ops.signal.shape_ops import frame
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft_window_fn
+from tensorflow.python.ops.signal.spectral_ops import stft
+from tensorflow.python.ops.signal.window_ops import hamming_window
+from tensorflow.python.ops.signal.window_ops import hann_window
+
+from tensorflow.python.util.all_util import remove_undocumented
+
 # `frame` used to be named `frames`, which is a noun and not a verb.
 # Keep an alias to `frames` for backwards compatibility.
-from tensorflow.contrib.signal.python.ops.shape_ops import frame as frames
-from tensorflow.contrib.signal.python.ops.spectral_ops import inverse_stft
-from tensorflow.contrib.signal.python.ops.spectral_ops import inverse_stft_window_fn
-from tensorflow.contrib.signal.python.ops.spectral_ops import stft
-from tensorflow.contrib.signal.python.ops.window_ops import hamming_window
-from tensorflow.contrib.signal.python.ops.window_ops import hann_window
+frames = frame
 
-from tensorflow.python.util.all_util import remove_undocumented
 remove_undocumented(__name__)
diff --git a/tensorflow/contrib/signal/python/__init__.py b/tensorflow/contrib/signal/python/__init__.py
deleted file mode 100644
index e672d1146c53a813613c9076c0cb6056f7081441..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/signal/python/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Signal ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
diff --git a/tensorflow/contrib/signal/python/kernel_tests/reconstruction_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/reconstruction_ops_test.py
deleted file mode 100644
index 5c9b2ac51811b02d7519f796d5bff340b35863ec..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/signal/python/kernel_tests/reconstruction_ops_test.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for reconstruction_ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.signal.python.ops import reconstruction_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class ReconstructionOpsTest(test.TestCase):
-
-  def __init__(self, *args, **kwargs):
-    super(ReconstructionOpsTest, self).__init__(*args, **kwargs)
-    self.batch_size = 3
-    self.frames = 3
-    self.samples = 5
-
-    self.bases = np.array(range(2, 5))
-    exponents = np.array(range(self.frames * self.samples))
-    powers = np.power(self.bases[:, np.newaxis], exponents[np.newaxis, :])
-
-    self.powers = np.reshape(powers, [self.batch_size, self.frames,
-                                      self.samples])
-    self.frame_hop = 2
-
-    # Hand computed example using powers of unique numbers: this is easily
-    # verified.
-    self.expected_string = ["1", "10", "100100", "1001000", "10010010000",
-                            "100100000000", "1001000000000", "10000000000000",
-                            "100000000000000"]
-
-  def test_all_ones(self):
-    signal = constant_op.constant(np.ones((3, 5)), dtype=dtypes.int64)
-    reconstruction = reconstruction_ops.overlap_and_add(signal, 2)
-
-    with self.test_session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
-
-      expected_output = np.array([1, 1, 2, 2, 3, 2, 2, 1, 1])
-
-      self.assertAllClose(output, expected_output)
-
-  def test_simple(self):
-    def make_input(frame_length, num_frames=3):
-      """Generate a tensor of num_frames frames of frame_length."""
-      return np.reshape(np.arange(1, num_frames * frame_length + 1),
-                        (-1, frame_length))
-
-    # List of (signal, expected_result, frame_hop).
-    configurations = [
-        # All hop lengths on a frame length of 2.
-        (make_input(2), [1, 5, 9, 6], 1),
-        (make_input(2), [1, 2, 3, 4, 5, 6], 2),
-
-        # All hop lengths on a frame length of 3.
-        (make_input(3), [1, 6, 15, 14, 9], 1),
-        (make_input(3), [1, 2, 7, 5, 13, 8, 9], 2),
-        (make_input(3), [1, 2, 3, 4, 5, 6, 7, 8, 9], 3),
-
-        # All hop lengths on a frame length of 4.
-        (make_input(4), [1, 7, 18, 21, 19, 12], 1),
-        (make_input(4), [1, 2, 8, 10, 16, 18, 11, 12], 2),
-        (make_input(4), [1, 2, 3, 9, 6, 7, 17, 10, 11, 12], 3),
-        (make_input(4), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 4),
-    ]
-
-    with self.test_session(use_gpu=True):
-      for signal, expected, frame_hop in configurations:
-        reconstruction = reconstruction_ops.overlap_and_add(
-            np.array(signal), frame_hop).eval()
-        expected_output = np.array(expected)
-        self.assertAllClose(reconstruction, expected_output)
-
-  def test_powers(self):
-    signal = constant_op.constant(np.squeeze(self.powers[0, :, :]),
-                                  dtype=dtypes.int64)
-    reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
-
-    with self.test_session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
-      string_output = [np.base_repr(x, self.bases[0]) for x in output]
-
-      self.assertEqual(string_output, self.expected_string)
-
-  def test_batch(self):
-    signal = constant_op.constant(self.powers, dtype=dtypes.int64)
-    reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
-
-    with self.test_session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
-
-      accumulator = True
-      for i in range(self.batch_size):
-        string_output = [np.base_repr(x, self.bases[i]) for x in output[i, :]]
-        accumulator = accumulator and (string_output == self.expected_string)
-
-      self.assertTrue(accumulator)
-
-  def test_one_element_batch(self):
-    input_matrix = np.squeeze(self.powers[0, :, :])
-    input_matrix = input_matrix[np.newaxis, :, :].astype(float)
-    signal = constant_op.constant(input_matrix, dtype=dtypes.float32)
-    reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
-
-    with self.test_session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
-
-      string_output = [np.base_repr(int(x), self.bases[0]) for x in
-                       np.squeeze(output)]
-
-      self.assertEqual(output.shape, (1, 9))
-      self.assertEqual(string_output, self.expected_string)
-
-  def test_gradient(self):
-    configurations = [
-        ((1, 128), 1),
-        ((5, 35), 17),
-        ((10, 128), 128),
-        ((2, 10, 128), 127),
-        ((2, 2, 10, 128), 126),
-        ((2, 2, 2, 10, 128), 125),
-    ]
-
-    for shape, frame_hop in configurations:
-      with self.test_session(use_gpu=True) as sess:
-        signal = array_ops.zeros(shape)
-        reconstruction = reconstruction_ops.overlap_and_add(signal, frame_hop)
-        loss = math_ops.reduce_sum(reconstruction)
-        # Increasing any sample in the input frames by one will increase the sum
-        # of all the samples in the reconstruction by 1, so the gradient should
-        # be all ones, no matter the shape or hop.
-        gradient = sess.run(gradients_impl.gradients([loss], [signal])[0])
-        self.assertTrue((gradient == 1.0).all())
-
-  def test_gradient_batch(self):
-    with self.test_session(use_gpu=True) as sess:
-      signal = array_ops.zeros((2, 10, 10))
-      frame_hop = 10
-      reconstruction = reconstruction_ops.overlap_and_add(signal, frame_hop)
-
-      # Multiply the first batch-item's reconstruction by zeros. This will block
-      # gradient from flowing into the first batch item from the loss. Multiply
-      # the second batch item by the integers from 0 to 99. Since there is zero
-      # overlap, the gradient for this batch item will be 0-99 shaped as (10,
-      # 10).
-      reconstruction *= array_ops.stack(
-          [array_ops.zeros((100,)), math_ops.to_float(math_ops.range(100))])
-      loss = math_ops.reduce_sum(reconstruction)
-
-      # Verify that only the second batch item receives gradient.
-      gradient = sess.run(gradients_impl.gradients([loss], [signal])[0])
-      expected_gradient = np.stack([
-          np.zeros((10, 10)),
-          np.reshape(np.arange(100).astype(np.float32), (10, 10))])
-      self.assertAllEqual(expected_gradient, gradient)
-
-  def test_gradient_numerical(self):
-    with self.test_session(use_gpu=True):
-      shape = (2, 10, 10)
-      framed_signal = array_ops.zeros(shape)
-      frame_hop = 10
-      reconstruction = reconstruction_ops.overlap_and_add(
-          framed_signal, frame_hop)
-      error = test.compute_gradient_error(
-          framed_signal, shape, reconstruction, [2, 100])
-      self.assertLess(error, 2e-5)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/signal/python/ops/reconstruction_ops.py b/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
deleted file mode 100644
index 4db8dc2ca090534f2cda66bd55c30dfa389b860a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Signal reconstruction via overlapped addition of frames."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.signal.python.ops import shape_ops
-from tensorflow.contrib.signal.python.ops import util_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-
-
-def _shuffle_to_front(input_tensor, k):
-  """Shuffles the last `k` indices of `input_tensor` to the front.
-
-  Transposes `input_tensor` to have the last `k` indices at the front. The input
-  may have arbitrary rank and unknown shape.
-
-  Args:
-    input_tensor: A `Tensor` of arbitrary rank and unknown shape.
-    k: A scalar `Tensor` specifying how many indices to shuffle.
-
-  Returns:
-    A transposed version of `input_tensor` with `k` indices shuffled to the
-    front.
-
-  Raises:
-    ValueError: If `input_tensor` is not at least rank `k` or `k` is not scalar.
-  """
-  k = ops.convert_to_tensor(k, name="k")
-  k.shape.with_rank(0)
-  k_static = tensor_util.constant_value(k)
-  if k_static is not None:
-    input_tensor.shape.with_rank_at_least(k_static)
-
-  rank = array_ops.rank(input_tensor)
-  outer_indices, inner_indices = array_ops.split(math_ops.range(rank),
-                                                 [rank - k, k])
-  permutation = array_ops.concat([inner_indices, outer_indices], 0)
-
-  return array_ops.transpose(input_tensor, perm=permutation)
-
-
-def overlap_and_add(signal, frame_step, name=None):
-  """Reconstructs a signal from a framed representation.
-
-  Adds potentially overlapping frames of a signal with shape
-  `[..., frames, frame_length]`, offsetting subsequent frames by `frame_step`.
-  The resulting tensor has shape `[..., output_size]` where
-
-      output_size = (frames - 1) * frame_step + frame_length
-
-  Args:
-    signal: A [..., frames, frame_length] `Tensor`. All dimensions may be
-      unknown, and rank must be at least 2.
-    frame_step: An integer or scalar `Tensor` denoting overlap offsets. Must be
-      less than or equal to `frame_length`.
-    name: An optional name for the operation.
-
-  Returns:
-    A `Tensor` with shape `[..., output_size]` containing the overlap-added
-    frames of `signal`'s inner-most two dimensions.
-
-  Raises:
-    ValueError: If `signal`'s rank is less than 2, `frame_step` is not a scalar
-      integer or `frame_step` is greater than `frame_length`.
-  """
-  with ops.name_scope(name, "overlap_and_add", [signal, frame_step]):
-    signal = ops.convert_to_tensor(signal, name="signal")
-    signal.shape.with_rank_at_least(2)
-    frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
-    frame_step.shape.assert_has_rank(0)
-    if not frame_step.dtype.is_integer:
-      raise ValueError("frame_step must be an integer. Got %s" %
-                       frame_step.dtype)
-
-    signal_shape = array_ops.shape(signal)
-
-    # All dimensions that are not part of the overlap-and-add. Can be empty for
-    # rank 2 inputs.
-    outer_dimensions = signal_shape[:-2]
-
-    # If frame_length and frame_step are known at graph construction time, check
-    # frame_step is less than or equal to frame_length.
-    frame_step_static = tensor_util.constant_value(frame_step)
-    if (frame_step_static is not None and signal.shape.ndims is not None and
-        signal.shape[-1].value is not None):
-      if frame_step_static > signal.shape[-1].value:
-        raise ValueError(
-            "frame_step (%d) must be less than or equal to "
-            "frame_length (%d)" % (
-                frame_step_static, signal.shape[-1].value))
-      # If frame_length is equal to frame_step, there's no overlap so just
-      # reshape the tensor.
-      if frame_step_static == signal.shape[-1].value:
-        return array_ops.reshape(signal, array_ops.concat(
-            [outer_dimensions, [-1]], 0))
-
-    signal_rank = array_ops.rank(signal)
-    frames = signal_shape[-2]
-    frame_length = signal_shape[-1]
-
-    subframe_length = util_ops.gcd(frame_length, frame_step)
-    subframe_step = frame_step // subframe_length
-    subframes_per_frame = frame_length // subframe_length
-    output_size = frame_step * (frames - 1) + frame_length
-    output_subframes = output_size // subframe_length
-
-    # To avoid overlap-adding sample-by-sample, we overlap-add at the "subframe"
-    # level, where a subframe is gcd(frame_length, frame_step). Reshape signal
-    # from [..., frames, frame_length] into [..., subframes, subframe_length].
-    subframe_shape = array_ops.concat(
-        [outer_dimensions, [-1, subframe_length]], 0)
-    subframe_signal = array_ops.reshape(signal, subframe_shape)
-
-    # Now we shuffle the last [subframes, subframe_length] dimensions to the
-    # front.
-    # TODO(rjryan): Add an axis argument to unsorted_segment_sum so we can
-    # avoid this pair of transposes.
-    subframe_signal = _shuffle_to_front(subframe_signal, 2)
-
-    # Use unsorted_segment_sum to add overlapping subframes together.
-    segment_ids = array_ops.reshape(shape_ops.frame(
-        math_ops.range(output_subframes), subframes_per_frame, subframe_step,
-        pad_end=False), [-1])
-    result = math_ops.unsorted_segment_sum(subframe_signal, segment_ids,
-                                           num_segments=output_subframes)
-
-    # result is a [subframes, subframe_length, ...outer_dimensions] tensor. We
-    # return a [...outer_dimensions, output_size] tensor with a transpose and
-    # reshape.
-    result_shape = array_ops.concat([outer_dimensions, [output_size]], 0)
-    return array_ops.reshape(_shuffle_to_front(result, signal_rank - 2),
-                             result_shape)
diff --git a/tensorflow/contrib/signal/python/ops/spectral_ops.py b/tensorflow/contrib/signal/python/ops/spectral_ops.py
deleted file mode 100644
index a8b5deff6ca3a4a756d31b904e577f08f6155fd7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/signal/python/ops/spectral_ops.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Spectral operations (e.g. Short-time Fourier Transform)."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-
-import numpy as np
-
-from tensorflow.contrib.signal.python.ops import reconstruction_ops
-from tensorflow.contrib.signal.python.ops import shape_ops
-from tensorflow.contrib.signal.python.ops import window_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
-
-
-def stft(signals, frame_length, frame_step, fft_length=None,
-         window_fn=functools.partial(window_ops.hann_window, periodic=True),
-         pad_end=False, name=None):
-  """Computes the [Short-time Fourier Transform][stft] of `signals`.
-
-  Implemented with GPU-compatible ops and supports gradients.
-
-  Args:
-    signals: A `[..., samples]` `float32` `Tensor` of real-valued signals.
-    frame_length: An integer scalar `Tensor`. The window length in samples.
-    frame_step: An integer scalar `Tensor`. The number of samples to step.
-    fft_length: An integer scalar `Tensor`. The size of the FFT to apply.
-      If not provided, uses the smallest power of 2 enclosing `frame_length`.
-    window_fn: A callable that takes a window length and a `dtype` keyword
-      argument and returns a `[window_length]` `Tensor` of samples in the
-      provided datatype. If set to `None`, no windowing is used.
-    pad_end: Whether to pad the end of `signals` with zeros when the provided
-      frame length and step produces a frame that lies partially past its end.
-    name: An optional name for the operation.
-
-  Returns:
-    A `[..., frames, fft_unique_bins]` `Tensor` of `complex64` STFT values where
-    `fft_unique_bins` is `fft_length // 2 + 1` (the unique components of the
-    FFT).
-
-  Raises:
-    ValueError: If `signals` is not at least rank 1, `frame_length` is
-      not scalar, or `frame_step` is not scalar.
-
-  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
-  """
-  with ops.name_scope(name, 'stft', [signals, frame_length,
-                                     frame_step]):
-    signals = ops.convert_to_tensor(signals, name='signals')
-    signals.shape.with_rank_at_least(1)
-    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
-    frame_length.shape.assert_has_rank(0)
-    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
-    frame_step.shape.assert_has_rank(0)
-
-    if fft_length is None:
-      fft_length = _enclosing_power_of_two(frame_length)
-    else:
-      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
-
-    framed_signals = shape_ops.frame(
-        signals, frame_length, frame_step, pad_end=pad_end)
-
-    # Optionally window the framed signals.
-    if window_fn is not None:
-      window = window_fn(frame_length, dtype=framed_signals.dtype)
-      framed_signals *= window
-
-    # spectral_ops.rfft produces the (fft_length/2 + 1) unique components of the
-    # FFT of the real windowed signals in framed_signals.
-    return spectral_ops.rfft(framed_signals, [fft_length])
-
-
-def inverse_stft_window_fn(frame_step,
-                           forward_window_fn=functools.partial(
-                               window_ops.hann_window, periodic=True),
-                           name=None):
-  """Generates a window function that can be used in `inverse_stft`.
-
-  Constructs a window that is equal to the forward window with a further
-  pointwise amplitude correction.  `inverse_stft_window_fn` is equivalent to
-  `forward_window_fn` in the case where it would produce an exact inverse.
-
-  See examples in `inverse_stft` documentation for usage.
-
-  Args:
-    frame_step: An integer scalar `Tensor`. The number of samples to step.
-    forward_window_fn: window_fn used in the forward transform, `stft`.
-    name: An optional name for the operation.
-
-  Returns:
-    A callable that takes a window length and a `dtype` keyword argument and
-      returns a `[window_length]` `Tensor` of samples in the provided datatype.
-      The returned window is suitable for reconstructing original waveform in
-      inverse_stft.
-  """
-  with ops.name_scope(name, 'inverse_stft_window_fn', [forward_window_fn]):
-    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
-    frame_step.shape.assert_has_rank(0)
-
-  def inverse_stft_window_fn_inner(frame_length, dtype):
-    """Computes a window that can be used in `inverse_stft`.
-
-    Args:
-      frame_length: An integer scalar `Tensor`. The window length in samples.
-      dtype: Data type of waveform passed to `stft`.
-
-    Returns:
-      A window suitable for reconstructing original waveform in `inverse_stft`.
-
-    Raises:
-      ValueError: If `frame_length` is not scalar, `forward_window_fn` is not a
-      callable that takes a window length and a `dtype` keyword argument and
-      returns a `[window_length]` `Tensor` of samples in the provided datatype
-      `frame_step` is not scalar, or `frame_step` is not scalar.
-    """
-    with ops.name_scope(name, 'inverse_stft_window_fn', [forward_window_fn]):
-      frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
-      frame_length.shape.assert_has_rank(0)
-
-      # Use equation 7 from Griffin + Lim.
-      forward_window = forward_window_fn(frame_length, dtype=dtype)
-      denom = math_ops.square(forward_window)
-      overlaps = -(-frame_length // frame_step)  # Ceiling division.
-      denom = array_ops.pad(denom, [(0, overlaps * frame_step - frame_length)])
-      denom = array_ops.reshape(denom, [overlaps, frame_step])
-      denom = math_ops.reduce_sum(denom, 0, keepdims=True)
-      denom = array_ops.tile(denom, [overlaps, 1])
-      denom = array_ops.reshape(denom, [overlaps * frame_step])
-
-      return forward_window / denom[:frame_length]
-  return inverse_stft_window_fn_inner
-
-
-def inverse_stft(stfts,
-                 frame_length,
-                 frame_step,
-                 fft_length=None,
-                 window_fn=functools.partial(window_ops.hann_window,
-                                             periodic=True),
-                 name=None):
-  """Computes the inverse [Short-time Fourier Transform][stft] of `stfts`.
-
-  To reconstruct an original waveform, a complimentary window function should
-  be used in inverse_stft. Such a window function can be constructed with
-  tf.contrib.signal.inverse_stft_window_fn.
-
-  Example:
-
-  ```python
-  frame_length = 400
-  frame_step = 160
-  waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
-  stft = tf.contrib.signal.stft(waveform, frame_length, frame_step)
-  inverse_stft = tf.contrib.signal.inverse_stft(
-      stft, frame_length, frame_step,
-      window_fn=tf.contrib.signal.inverse_stft_window_fn(frame_step))
-  ```
-
-  if a custom window_fn is used in stft, it must be passed to
-  inverse_stft_window_fn:
-
-  ```python
-  frame_length = 400
-  frame_step = 160
-  window_fn = functools.partial(window_ops.hamming_window, periodic=True),
-  waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
-  stft = tf.contrib.signal.stft(
-      waveform, frame_length, frame_step, window_fn=window_fn)
-  inverse_stft = tf.contrib.signal.inverse_stft(
-      stft, frame_length, frame_step,
-      window_fn=tf.contrib.signal.inverse_stft_window_fn(
-         frame_step, forward_window_fn=window_fn))
-  ```
-
-  Implemented with GPU-compatible ops and supports gradients.
-
-  Args:
-    stfts: A `complex64` `[..., frames, fft_unique_bins]` `Tensor` of STFT bins
-      representing a batch of `fft_length`-point STFTs where `fft_unique_bins`
-      is `fft_length // 2 + 1`
-    frame_length: An integer scalar `Tensor`. The window length in samples.
-    frame_step: An integer scalar `Tensor`. The number of samples to step.
-    fft_length: An integer scalar `Tensor`. The size of the FFT that produced
-      `stfts`. If not provided, uses the smallest power of 2 enclosing
-      `frame_length`.
-    window_fn: A callable that takes a window length and a `dtype` keyword
-      argument and returns a `[window_length]` `Tensor` of samples in the
-      provided datatype. If set to `None`, no windowing is used.
-    name: An optional name for the operation.
-
-  Returns:
-    A `[..., samples]` `Tensor` of `float32` signals representing the inverse
-    STFT for each input STFT in `stfts`.
-
-  Raises:
-    ValueError: If `stfts` is not at least rank 2, `frame_length` is not scalar,
-      `frame_step` is not scalar, or `fft_length` is not scalar.
-
-  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
-  """
-  with ops.name_scope(name, 'inverse_stft', [stfts]):
-    stfts = ops.convert_to_tensor(stfts, name='stfts')
-    stfts.shape.with_rank_at_least(2)
-    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
-    frame_length.shape.assert_has_rank(0)
-    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
-    frame_step.shape.assert_has_rank(0)
-    if fft_length is None:
-      fft_length = _enclosing_power_of_two(frame_length)
-    else:
-      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
-      fft_length.shape.assert_has_rank(0)
-
-    real_frames = spectral_ops.irfft(stfts, [fft_length])
-
-    # frame_length may be larger or smaller than fft_length, so we pad or
-    # truncate real_frames to frame_length.
-    frame_length_static = tensor_util.constant_value(frame_length)
-    # If we don't know the shape of real_frames's inner dimension, pad and
-    # truncate to frame_length.
-    if (frame_length_static is None or
-        real_frames.shape.ndims is None or
-        real_frames.shape[-1].value is None):
-      real_frames = real_frames[..., :frame_length]
-      real_frames_rank = array_ops.rank(real_frames)
-      real_frames_shape = array_ops.shape(real_frames)
-      paddings = array_ops.concat(
-          [array_ops.zeros([real_frames_rank - 1, 2],
-                           dtype=frame_length.dtype),
-           [[0, math_ops.maximum(0, frame_length - real_frames_shape[-1])]]], 0)
-      real_frames = array_ops.pad(real_frames, paddings)
-    # We know real_frames's last dimension and frame_length statically. If they
-    # are different, then pad or truncate real_frames to frame_length.
-    elif real_frames.shape[-1].value > frame_length_static:
-      real_frames = real_frames[..., :frame_length_static]
-    elif real_frames.shape[-1].value < frame_length_static:
-      pad_amount = frame_length_static - real_frames.shape[-1].value
-      real_frames = array_ops.pad(real_frames,
-                                  [[0, 0]] * (real_frames.shape.ndims - 1) +
-                                  [[0, pad_amount]])
-
-    # The above code pads the inner dimension of real_frames to frame_length,
-    # but it does so in a way that may not be shape-inference friendly.
-    # Restore shape information if we are able to.
-    if frame_length_static is not None and real_frames.shape.ndims is not None:
-      real_frames.set_shape([None] * (real_frames.shape.ndims - 1) +
-                            [frame_length_static])
-
-    # Optionally window and overlap-add the inner 2 dimensions of real_frames
-    # into a single [samples] dimension.
-    if window_fn is not None:
-      window = window_fn(frame_length, dtype=stfts.dtype.real_dtype)
-      real_frames *= window
-    return reconstruction_ops.overlap_and_add(real_frames, frame_step)
-
-
-def _enclosing_power_of_two(value):
-  """Return 2**N for integer N such that 2**N >= value."""
-  value_static = tensor_util.constant_value(value)
-  if value_static is not None:
-    return constant_op.constant(
-        int(2**np.ceil(np.log(value_static) / np.log(2.0))), value.dtype)
-  return math_ops.cast(
-      math_ops.pow(2.0, math_ops.ceil(
-          math_ops.log(math_ops.to_float(value)) / math_ops.log(2.0))),
-      value.dtype)
diff --git a/tensorflow/contrib/slim/python/slim/data/data_decoder.py b/tensorflow/contrib/slim/python/slim/data/data_decoder.py
index 5a32be6c5a329068a65655b0b7be020fcd22ea18..46d33597e429123f37bfa0f3c3b8b2dc5098fe7d 100644
--- a/tensorflow/contrib/slim/python/slim/data/data_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/data_decoder.py
@@ -39,12 +39,13 @@ from __future__ import print_function
 
 import abc
 
+import six
 
+
+@six.add_metaclass(abc.ABCMeta)
 class DataDecoder(object):
   """An abstract class which is used to decode data for a provider."""
 
-  __metaclass__ = abc.ABCMeta
-
   @abc.abstractmethod
   def decode(self, data, items):
     """Decodes the data to returns the tensors specified by the list of items.
diff --git a/tensorflow/contrib/slim/python/slim/data/data_provider.py b/tensorflow/contrib/slim/python/slim/data/data_provider.py
index a49c0969d96bf7eef0200269e168941f9b8433a5..3252b4fe8470f5f9733c67c2cf44ad888ae3d2c7 100644
--- a/tensorflow/contrib/slim/python/slim/data/data_provider.py
+++ b/tensorflow/contrib/slim/python/slim/data/data_provider.py
@@ -38,7 +38,10 @@ from __future__ import print_function
 
 import abc
 
+import six
 
+
+@six.add_metaclass(abc.ABCMeta)
 class DataProvider(object):
   """Maps a list of requested data items to tensors from a data source.
 
@@ -46,7 +49,6 @@ class DataProvider(object):
   method which returns arbitrary types of data. No assumption is made about the
   source of the data nor the mechanism for providing it.
   """
-  __metaclass__ = abc.ABCMeta
 
   def __init__(self, items_to_tensors, num_samples):
     """Constructs the Data Provider.
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index a6ce45c20365d9893895101476c9711065bfc511..1b2b6acacca838f95cb758ae88f79263993ca69e 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -25,6 +25,8 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 from tensorflow.contrib.slim.python.slim.data import data_decoder
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
@@ -37,6 +39,7 @@ from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import sparse_ops
 
 
+@six.add_metaclass(abc.ABCMeta)
 class ItemHandler(object):
   """Specifies the item-to-Features mapping for tf.parse_example.
 
@@ -45,8 +48,6 @@ class ItemHandler(object):
   parsing.
   """
 
-  __metaclass__ = abc.ABCMeta
-
   def __init__(self, keys):
     """Constructs the handler with the name of the tf.Feature keys to use.
 
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py b/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py
index 8fcd7aeef6a6964902666a4f3c17e05b0c7b52ee..f31bdbd399c9de4f2f5d557b75b1ece6d64a765e 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.contrib.solvers.python.ops import lanczos
 from tensorflow.contrib.solvers.python.ops import util
 from tensorflow.python.framework import constant_op
@@ -80,7 +81,8 @@ if __name__ == "__main__":
     for shape in [[4, 4], [7, 4], [5, 8]]:
       for orthogonalize in True, False:
         for steps in range(1, min(shape) + 1):
-          for use_static_shape in True, False:
+          # TF2 does not support placeholders so we skip it
+          for use_static_shape in set([True, tf2.enabled()]):
             arg_string = "%s_%s_%s_%s_staticshape_%s" % (
                 dtype.__name__, "_".join(map(str, shape)), orthogonalize, steps,
                 use_static_shape)
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py b/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py
index 2a9100903aae5689919a6b25fcb18ff192f250b3..841a41a2339824ab8ca15f4bdd74be697cd6fe9f 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.contrib.solvers.python.ops import least_squares
 from tensorflow.contrib.solvers.python.ops import util
 from tensorflow.python.framework import constant_op
@@ -76,7 +77,8 @@ def _get_least_squares_tests(dtype_, use_static_shape_, shape_):
 if __name__ == "__main__":
   for dtype in np.float32, np.float64:
     for shape in [[4, 4], [8, 5], [3, 7]]:
-      for use_static_shape in True, False:
+      # TF2 does not support placeholders under eager so we skip it
+      for use_static_shape in set([True, tf2.enabled()]):
         arg_string = "%s_%s_staticshape_%s" % (dtype.__name__,
                                                "_".join(map(str, shape)),
                                                use_static_shape)
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
index a0e6eb87bc06fb1303a7eb86fa6760458f20a9b9..10807f7a80617e56abeb6d13ce419a49a2269aac 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.contrib.solvers.python.ops import linear_equations
 from tensorflow.contrib.solvers.python.ops import util
 from tensorflow.python.framework import constant_op
@@ -113,7 +114,8 @@ def _get_linear_equations_tests(dtype_, use_static_shape_, shape_):
 if __name__ == "__main__":
   for dtype in np.float32, np.float64:
     for size in 1, 4, 10:
-      for use_static_shape in True, False:
+      # TF2 does not support placeholders under eager so we skip it
+      for use_static_shape in set([True, tf2.enabled()]):
         shape = [size, size]
         arg_string = "%s_%s_staticshape_%s" % (dtype.__name__, size,
                                                use_static_shape)
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
index 360e7dbe75f595ff61fb83379089294371203813..7743f5b4a7fbd2d80ff77a130ee50b9ea7e261ee 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_loss_test.py
@@ -109,6 +109,42 @@ class SparsemaxLossTest(test.TestCase):
         np_loss, tf_loss_out, half_atol=1e-2, half_rtol=5e-3)
     self.assertShapeEqual(np_loss, tf_loss_op)
 
+  def _test_sparsemax_loss_of_nan(self, dtype, random, use_gpu):
+    """check sparsemax-loss transfers nan"""
+    q = np.asarray([[0, 0, 1], [0, 0, 1], [0, 0, 1]])
+    z_nan = np.asarray([[0, np.nan, 0], [0, np.nan, np.nan],
+                        [np.nan, np.nan, np.nan]]).astype(dtype)
+
+    _, tf_loss_nan = self._tf_sparsemax_loss(z_nan, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([np.nan, np.nan, np.nan], tf_loss_nan)
+
+  def _test_sparsemax_loss_of_inf(self, dtype, random, use_gpu):
+    """check sparsemax-loss is infinity safe"""
+    q = np.asarray([[0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]])
+    z_neg = np.asarray([
+        [0, -np.inf, 0],
+        [0, -np.inf, -np.inf],
+        [-np.inf, -np.inf, 0],
+        [-np.inf, -np.inf, -np.inf],
+    ]).astype(dtype)
+    z_pos = np.asarray([[0, np.inf, 0], [0, np.inf,
+                                         np.inf], [np.inf, np.inf, 0],
+                        [np.inf, np.inf, np.inf]]).astype(dtype)
+    z_mix = np.asarray([[0, np.inf, 0], [0, np.inf, -np.inf],
+                        [-np.inf, np.inf, 0], [-np.inf, np.inf,
+                                               -np.inf]]).astype(dtype)
+
+    _, tf_loss_neg = self._tf_sparsemax_loss(z_neg, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([0.25, np.inf, 0, np.nan], tf_loss_neg)
+
+    _, tf_loss_pos = self._tf_sparsemax_loss(z_pos, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([np.nan, np.nan, np.nan, np.nan],
+                                       tf_loss_pos)
+
+    _, tf_loss_mix = self._tf_sparsemax_loss(z_mix, q, dtype, use_gpu)
+    self.assertAllCloseAccordingToType([np.nan, np.nan, np.nan, np.nan],
+                                       tf_loss_mix)
+
   def _test_constant_add(self, dtype, random, use_gpu):
     """check sparsemax-loss proposition 3"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10))
@@ -198,6 +234,10 @@ class SparsemaxLossTest(test.TestCase):
 
     self._test_sparsemax_loss_against_numpy(dtype, random, use_gpu=False)
 
+    self._test_sparsemax_loss_of_nan(dtype, random, use_gpu=False)
+
+    self._test_sparsemax_loss_of_inf(dtype, random, use_gpu=False)
+
     self._test_constant_add(dtype, random, use_gpu=False)
 
     self._test_sparsemax_loss_positive(dtype, random, use_gpu=False)
diff --git a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
index 259e62bd864fba3cc7d9aa387e02c8319438d658..c95b9da1e4c2c144e2b79a88028874c27a7f8b96 100644
--- a/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
+++ b/tensorflow/contrib/sparsemax/python/kernel_tests/sparsemax_test.py
@@ -87,6 +87,46 @@ class SparsemaxTest(test.TestCase):
         p_sparemax, tf_sparsemax_out, half_atol=5e-3)
     self.assertShapeEqual(p_sparemax, tf_sparsemax_op)
 
+  def _test_sparsemax_of_nan(self, dtype, random, use_gpu):
+    """check sparsemax transfers nan"""
+    z_nan = np.asarray([
+        [0, np.nan, 0],
+        [0, np.nan, np.nan],
+        [np.nan, np.nan, np.nan],
+    ]).astype(dtype)
+
+    _, tf_sparsemax_nan = self._tf_sparsemax(z_nan, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [[np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan],
+         [np.nan, np.nan, np.nan]], tf_sparsemax_nan)
+
+  def _test_sparsemax_of_inf(self, dtype, random, use_gpu):
+    """check sparsemax is infinity safe"""
+    z_neg = np.asarray([
+        [0, -np.inf, 0],
+        [0, -np.inf, -np.inf],
+        [-np.inf, -np.inf, -np.inf],
+    ]).astype(dtype)
+    z_pos = np.asarray([[0, np.inf, 0], [0, np.inf, np.inf],
+                        [np.inf, np.inf, np.inf]]).astype(dtype)
+    z_mix = np.asarray([[0, np.inf, 0], [0, np.inf, -np.inf],
+                        [-np.inf, np.inf, -np.inf]]).astype(dtype)
+
+    _, tf_sparsemax_neg = self._tf_sparsemax(z_neg, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [[0.5, 0, 0.5], [1, 0, 0], [np.nan, np.nan, np.nan]], tf_sparsemax_neg)
+
+    _, tf_sparsemax_pos = self._tf_sparsemax(z_pos, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [[np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan],
+         [np.nan, np.nan, np.nan]], tf_sparsemax_pos)
+
+    _, tf_sparsemax_mix = self._tf_sparsemax(z_mix, dtype, use_gpu)
+    self.assertAllCloseAccordingToType(
+        [[np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan],
+         [np.nan, np.nan, np.nan]], tf_sparsemax_mix)
+
+
   def _test_sparsemax_of_zero(self, dtype, random, use_gpu):
     """check sparsemax proposition 1, part 1"""
     z = np.zeros((1, 10))
@@ -97,7 +137,7 @@ class SparsemaxTest(test.TestCase):
     self.assertAllCloseAccordingToType(p_sparemax, tf_sparsemax_out)
     self.assertShapeEqual(p_sparemax, tf_sparsemax_op)
 
-  def _test_sparsemax_of_inf(self, dtype, random, use_gpu):
+  def _test_sparsemax_of_to_inf(self, dtype, random, use_gpu):
     """check sparsemax proposition 1, part 2"""
     z = random.uniform(low=-3, high=3, size=(test_obs, 10))
 
@@ -210,10 +250,14 @@ class SparsemaxTest(test.TestCase):
 
     self._test_sparsemax_against_numpy(dtype, random, use_gpu=False)
 
-    self._test_sparsemax_of_zero(dtype, random, use_gpu=False)
+    self._test_sparsemax_of_nan(dtype, random, use_gpu=False)
 
     self._test_sparsemax_of_inf(dtype, random, use_gpu=False)
 
+    self._test_sparsemax_of_zero(dtype, random, use_gpu=False)
+
+    self._test_sparsemax_of_to_inf(dtype, random, use_gpu=False)
+
     self._test_constant_add(dtype, random, use_gpu=False)
 
     self._test_permutation(dtype, random, use_gpu=False)
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index e617af2ff1b731eddb5b72469a1cd67e7cfd163f..f79c93f34750e96ec6bd7cdf4d0dad9981d6a2b3 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -49,7 +49,14 @@ def sparsemax(logits, name=None):
     obs = array_ops.shape(logits)[0]
     dims = array_ops.shape(logits)[1]
 
-    z = logits - math_ops.reduce_mean(logits, axis=1)[:, array_ops.newaxis]
+    # In the paper, they call the logits z.
+    # The mean(logits) can be substracted from logits to make the algorithm
+    # more numerically stable. the instability in this algorithm comes mostly
+    # from the z_cumsum. Substacting the mean will cause z_cumsum to be close
+    # to zero. However, in practise the numerical instability issues are very
+    # minor and substacting the mean causes extra issues with inf and nan
+    # input.
+    z = logits
 
     # sort z
     z_sorted, _ = nn.top_k(z, k=dims)
@@ -64,10 +71,24 @@ def sparsemax(logits, name=None):
     k_z = math_ops.reduce_sum(math_ops.cast(z_check, dtypes.int32), axis=1)
 
     # calculate tau(z)
-    indices = array_ops.stack([math_ops.range(0, obs), k_z - 1], axis=1)
+    # If there are inf values or all values are -inf, the k_z will be zero,
+    # this is mathematically invalid and will also cause the gather_nd to fail.
+    # Prevent this issue for now by setting k_z = 1 if k_z = 0, this is then
+    # fixed later (see p_safe) by returning p = nan. This results in the same
+    # behavior as softmax.
+    k_z_safe = math_ops.maximum(k_z, 1)
+    indices = array_ops.stack([math_ops.range(0, obs), k_z_safe - 1], axis=1)
     tau_sum = array_ops.gather_nd(z_cumsum, indices)
     tau_z = (tau_sum - 1) / math_ops.cast(k_z, logits.dtype)
 
     # calculate p
-    return math_ops.maximum(
+    p = math_ops.maximum(
         math_ops.cast(0, logits.dtype), z - tau_z[:, array_ops.newaxis])
+    # If k_z = 0 or if z = nan, then the input is invalid
+    p_safe = array_ops.where(
+        math_ops.logical_or(
+            math_ops.equal(k_z, 0), math_ops.is_nan(z_cumsum[:, -1])),
+        array_ops.fill([obs, dims], math_ops.cast(float("nan"), logits.dtype)),
+        p)
+
+    return p_safe
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
index 582d1e6136df4d3ad3c8108ae9607d5fef519145..c0438f16bc857cbda248eb2791883ae39e1c5eb1 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax_loss.py
@@ -47,14 +47,30 @@ def sparsemax_loss(logits, sparsemax, labels, name=None):
     sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
     labels = ops.convert_to_tensor(labels, name="labels")
 
-    shifted_logits = logits - \
-        math_ops.reduce_mean(logits, axis=1)[:, array_ops.newaxis]
+    # In the paper, they call the logits z.
+    # A constant can be substracted from logits to make the algorithm
+    # more numerically stable in theory. However, there are really no major
+    # source numerical instability in this algorithm.
+    z = logits
 
     # sum over support
-    support = math_ops.cast(sparsemax > 0, sparsemax.dtype)
-    sum_s = support * sparsemax * (shifted_logits - 0.5 * sparsemax)
+    # Use a conditional where instead of a multiplication to support z = -inf.
+    # If z = -inf, and there is no support (sparsemax = 0), a multiplication
+    # would cause 0 * -inf = nan, which is not correct in this case.
+    sum_s = array_ops.where(
+        math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)),
+        sparsemax * (z - 0.5 * sparsemax), array_ops.zeros_like(sparsemax))
 
     # - z_k + ||q||^2
-    q_part = labels * (0.5 * labels - shifted_logits)
+    q_part = labels * (0.5 * labels - z)
+    # Fix the case where labels = 0 and z = -inf, where q_part would
+    # otherwise be 0 * -inf = nan. But since the lables = 0, no cost for
+    # z = -inf should be consideredself.
+    # The code below also coveres the case where z = inf. Howeverm in this
+    # caose the sparsemax will be nan, which means the sum_s will also be nan,
+    # therefor this case doesn't need addtional special treatment.
+    q_part_safe = array_ops.where(
+        math_ops.logical_and(math_ops.equal(labels, 0), math_ops.is_inf(z)),
+        array_ops.zeros_like(z), q_part)
 
-    return math_ops.reduce_sum(sum_s + q_part, axis=1)
+    return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD
index a217397c1a219508c74382bf0df4bf9aa0a15aff..f16d99f64c1cfeab23871d2ab17649a8744c88cc 100644
--- a/tensorflow/contrib/stateless/BUILD
+++ b/tensorflow/contrib/stateless/BUILD
@@ -11,11 +11,13 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 
 py_library(
     name = "stateless",
-    srcs = ["__init__.py"],
+    srcs = [
+        "__init__.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:stateless_random_ops_gen",
+        "//tensorflow/python:stateless_random_ops",
         "//tensorflow/python:util",
     ],
 )
@@ -25,10 +27,6 @@ cuda_py_test(
     srcs = ["python/kernel_tests/stateless_random_ops_test.py"],
     additional_deps = [
         ":stateless",
-        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:random_ops",
     ],
 )
diff --git a/tensorflow/contrib/stateless/__init__.py b/tensorflow/contrib/stateless/__init__.py
index fe23fe0dd86d9e8267a1dba945e6dd1daec3f4af..1a3a78bb5bead8c140743a053b41db850aaad0c1 100644
--- a/tensorflow/contrib/stateless/__init__.py
+++ b/tensorflow/contrib/stateless/__init__.py
@@ -14,6 +14,10 @@
 # ==============================================================================
 """Stateless random ops which take seed as a tensor input.
 
+DEPRECATED: Use `tf.random.stateless_uniform` rather than
+`tf.contrib.stateless.stateless_random_uniform`, and similarly for the other
+routines.
+
 Instead of taking `seed` as an attr which initializes a mutable state within
 the op, these random ops take `seed` as an input, and the random numbers are
 a deterministic function of `shape` and `seed`.
@@ -32,16 +36,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
-
-# pylint: disable=wildcard-import
-from tensorflow.python.ops.gen_stateless_random_ops import *
+from tensorflow.python.ops.stateless_random_ops import stateless_random_uniform
+from tensorflow.python.ops.stateless_random_ops import stateless_random_normal
+from tensorflow.python.ops.stateless_random_ops import stateless_truncated_normal
+from tensorflow.python.ops.stateless_random_ops import stateless_multinomial
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-ops.NotDifferentiable("StatelessMultinomial")
-ops.NotDifferentiable("StatelessRandomNormal")
-ops.NotDifferentiable("StatelessRandomUniform")
-ops.NotDifferentiable("StatelessTruncatedNormal")
-
 remove_undocumented(__name__)
diff --git a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
index d724a5c014d2f9f5f6e3a6704341bcb8c429ae06..8373cf62dcb1b2405658f097c16a476a81822bf6 100644
--- a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
+++ b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,135 +12,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for stateless random ops."""
+"""Tests for tf.contrib.stateless API.
+
+The real tests are in python/kernel_tests/random/stateless_random_ops_test.py.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 from tensorflow.contrib import stateless
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import stateless_random_ops
 from tensorflow.python.platform import test
 
-CASES = [(stateless.stateless_random_uniform, random_ops.random_uniform),
-         (stateless.stateless_random_normal, random_ops.random_normal),
-         (stateless.stateless_truncated_normal, random_ops.truncated_normal)]
-
-
-def invert_philox(key, value):
-  """Invert the Philox bijection."""
-  key = np.array(key, dtype=np.uint32)
-  value = np.array(value, dtype=np.uint32)
-  step = np.array([0x9E3779B9, 0xBB67AE85], dtype=np.uint32)
-  for n in range(10)[::-1]:
-    key0, key1 = key + n * step
-    v0 = value[3] * 0x991a7cdb & 0xffffffff
-    v2 = value[1] * 0x6d7cae67 & 0xffffffff
-    hi0 = v0 * 0xD2511F53 >> 32
-    hi1 = v2 * 0xCD9E8D57 >> 32
-    v1 = hi1 ^ value[0] ^ key0
-    v3 = hi0 ^ value[2] ^ key1
-    value = v0, v1, v2, v3
-  return np.array(value)
-
 
 class StatelessOpsTest(test.TestCase):
 
-  def testMatchStateful(self):
-    # Stateless ops should be the same as stateful ops on the first call
-    # after seed scrambling.
-    key = 0x3ec8f720, 0x02461e29
-    for seed in (7, 17), (11, 5), (2, 3):
-      preseed = invert_philox(key, (seed[0], 0, seed[1], 0)).astype(np.uint64)
-      preseed = preseed[::2] | preseed[1::2] << 32
-      random_seed.set_random_seed(seed[0])
-      with self.test_session(use_gpu=True):
-        for stateless_op, stateful_op in CASES:
-          for shape in (), (3,), (2, 5):
-            stateful = stateful_op(shape, seed=seed[1])
-            pure = stateless_op(shape, seed=preseed)
-            self.assertAllEqual(stateful.eval(), pure.eval())
-
-  def testDeterminism(self):
-    # Stateless values should be equal iff the seeds are equal (roughly)
-    with self.test_session(use_gpu=True):
-      for seed_type in [dtypes.int32, dtypes.int64]:
-        seed_t = array_ops.placeholder(seed_type, shape=[2])
-        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
-        for stateless_op, _ in CASES:
-          for shape in (), (3,), (2, 5):
-            pure = stateless_op(shape, seed=seed_t)
-            values = [(seed, pure.eval(feed_dict={seed_t: seed}))
-                      for seed in seeds]
-            for s0, v0 in values:
-              for s1, v1 in values:
-                self.assertEqual(s0 == s1, np.all(v0 == v1))
-
-  def testShapeType(self):
-    with self.test_session(use_gpu=True):
-      for shape_dtype in [dtypes.int32, dtypes.int64]:
-        seed_t = array_ops.placeholder(dtypes.int64, shape=[2])
-        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
-        for stateless_op, _ in CASES:
-          for shape in (), (3,), (2, 5):
-            pure = stateless_op(constant_op.constant(shape, dtype=shape_dtype),
-                                seed=seed_t)
-            values = [(seed, pure.eval(feed_dict={seed_t: seed}))
-                      for seed in seeds]
-            for s0, v0 in values:
-              for s1, v1 in values:
-                self.assertEqual(s0 == s1, np.all(v0 == v1))
-
-  def testMatchStatefulMultinomial(self):
-    # Stateless ops should be the same as stateful ops on the first call
-    # after seed scrambling.
-    key = 0x3ec8f720, 0x02461e29
-    num_samples = 4
-    for logits_dtype in np.float16, np.float32, np.float64:
-      for output_dtype in dtypes.int32, dtypes.int64:
-        for seed in (7, 17), (11, 5), (2, 3):
-          preseed = invert_philox(key,
-                                  (seed[0], 0, seed[1], 0)).astype(np.uint64)
-          preseed = preseed[::2] | preseed[1::2] << 32
-          random_seed.set_random_seed(seed[0])
-          with self.test_session(use_gpu=True):
-            for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
-                                                      [0.25, 0.75]]):
-              logits_t = constant_op.constant(logits, dtype=logits_dtype)
-              stateful = random_ops.multinomial(
-                  logits_t,
-                  num_samples,
-                  seed=seed[1],
-                  output_dtype=output_dtype)
-              pure = stateless.stateless_multinomial(
-                  logits_t,
-                  num_samples,
-                  seed=preseed,
-                  output_dtype=output_dtype)
-              self.assertAllEqual(stateful.eval(), pure.eval())
-
-  def testDeterminismMultinomial(self):
-    # Stateless values should be equal iff the seeds are equal (roughly)
-    num_samples = 10
-    with self.test_session(use_gpu=True):
-      for seed_type in [dtypes.int32, dtypes.int64]:
-        seed_t = array_ops.placeholder(seed_type, shape=[2])
-        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
-        for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
-                                                  [0.25, 0.75]]):
-          pure = stateless.stateless_multinomial(
-              logits, num_samples, seed=seed_t)
-          values = [
-              (seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds
-          ]
-          for s0, v0 in values:
-            for s1, v1 in values:
-              self.assertEqual(s0 == s1, np.all(v0 == v1))
+  def testAPI(self):
+    self.assertIs(stateless.stateless_random_uniform,
+                  stateless_random_ops.stateless_random_uniform)
+    self.assertIs(stateless.stateless_random_normal,
+                  stateless_random_ops.stateless_random_normal)
+    self.assertIs(stateless.stateless_truncated_normal,
+                  stateless_random_ops.stateless_truncated_normal)
+    self.assertIs(stateless.stateless_multinomial,
+                  stateless_random_ops.stateless_multinomial)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index 4d1807130c57039976dfa57c27bb0d4807e75212..10e4556dacbc17ec02c2bd698389b04d517d7076 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -152,6 +152,27 @@ class EagerFileTest(test_util.TensorFlowTestCase):
       self.assertEqual(len(events), 2)
       self.assertEqual(events[1].summary.value[0].tag, 'scalar')
 
+  def testRecordEveryNGlobalSteps(self):
+    step = training_util.get_or_create_global_step()
+    logdir = tempfile.mkdtemp()
+
+    def run_step():
+      summary_ops.scalar('scalar', i, step=step)
+      step.assign_add(1)
+
+    with summary_ops.create_file_writer(
+        logdir).as_default(), summary_ops.record_summaries_every_n_global_steps(
+            2, step):
+      for i in range(10):
+        run_step()
+      # And another 10 steps as a graph function.
+      run_step_fn = function.defun(run_step)
+      for i in range(10):
+        run_step_fn()
+
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(len(events), 11)
+
   def testMaxQueue(self):
     logs = tempfile.mkdtemp()
     with summary_ops.create_file_writer(
@@ -279,12 +300,9 @@ class EagerDbTest(summary_test_util.SummaryDbTest):
 
   def testDbURIOpen(self):
     tmpdb_path = os.path.join(self.get_temp_dir(), 'tmpDbURITest.sqlite')
-    tmpdb_uri = six.moves.urllib_parse.urljoin("file:", tmpdb_path)
-    tmpdb_writer = summary_ops.create_db_writer(
-        tmpdb_uri,
-        "experimentA",
-        "run1",
-        "user1")
+    tmpdb_uri = six.moves.urllib_parse.urljoin('file:', tmpdb_path)
+    tmpdb_writer = summary_ops.create_db_writer(tmpdb_uri, 'experimentA',
+                                                'run1', 'user1')
     with summary_ops.always_record_summaries():
       with tmpdb_writer.as_default():
         summary_ops.scalar('t1', 2.0)
diff --git a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
index 596c59ead3460aa63eeff44d5a11a4a8c5cde0da..290c16fe3966791ea78986539750caf938a37322 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.contrib.tensor_forest.python.ops import gen_model_ops
 
 # pylint: disable=unused-import
@@ -28,10 +30,12 @@ from tensorflow.contrib.tensor_forest.python.ops.gen_model_ops import update_mod
 # pylint: enable=unused-import
 
 from tensorflow.contrib.util import loader
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 
 _model_ops = loader.load_op_library(
@@ -88,6 +92,59 @@ class TreeVariableSavable(saver.BaseSaverBuilder.SaveableObject):
           params=self.params.serialized_params_proto)
 
 
+class TreeVariable(tracking.TrackableResource):
+  """A tree model."""
+
+  def __init__(self, params, tree_config, stats_handle, name, container=None):
+    self._params = params
+    self._tree_config = tree_config
+    self._stats_handle = stats_handle
+    self._name = name
+    self._container = container
+    self._init_op = None
+    super(TreeVariable, self).__init__()
+    self._resource_handle = self.create_resource()
+
+  def create_resource(self):
+    if context.executing_eagerly():
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      shared_name = "tree_variable_%d" % (ops.uid(),)
+    else:
+      shared_name = self._name
+    return gen_model_ops.decision_tree_resource_handle_op(
+        self._container, shared_name=shared_name, name=self._name)
+
+  def initialize(self):
+    return gen_model_ops.create_tree_variable(
+        self.resource_handle,
+        self._tree_config,
+        params=self._params.serialized_params_proto)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_model_ops.tree_is_initialized_op(self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {
+        "tree_variable":
+            functools.partial(
+                TreeVariableSavable,
+                params=self._params,
+                tree_handle=self.resource_handle,
+                stats_handle=self._stats_handle,
+                create_op=self._init_op)
+    }
+
+
 def tree_variable(params, tree_config, stats_handle, name, container=None):
   r"""Creates a tree model and returns a handle to it.
 
@@ -102,18 +159,13 @@ def tree_variable(params, tree_config, stats_handle, name, container=None):
     A `Tensor` of type mutable `string`. The handle to the tree.
   """
   with ops.name_scope(name, "TreeVariable") as name:
-    resource_handle = gen_model_ops.decision_tree_resource_handle_op(
-        container, shared_name=name, name=name)
-
-    create_op = gen_model_ops.create_tree_variable(
-        resource_handle,
-        tree_config,
-        params=params.serialized_params_proto)
-    is_initialized_op = gen_model_ops.tree_is_initialized_op(resource_handle)
+    tree_var = TreeVariable(params, tree_config, stats_handle, name, container)
+    resource_handle = tree_var.resource_handle
+    create_op = tree_var.initializer
+    is_initialized_op = tree_var.is_initialized()
     # Adds the variable to the savable list.
-    saveable = TreeVariableSavable(params, resource_handle, stats_handle,
-                                   create_op,
-                                   resource_handle.name)
+    saveable = tree_var._gather_saveables_for_checkpoint()["tree_variable"](  # pylint: disable=protected-access
+        name=resource_handle.name)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
     resources.register_resource(resource_handle, create_op, is_initialized_op)
     return resource_handle
diff --git a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
index 44d486edecc4e4f7ba8a9b6d680178298813621b..9184198cd4c8fd2a7609714d094d5ef2b6868658 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.contrib.tensor_forest.python.ops import gen_stats_ops
 # pylint: disable=unused-import
 from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import finalize_tree
@@ -25,10 +27,12 @@ from tensorflow.contrib.tensor_forest.python.ops.gen_stats_ops import process_in
 # pylint: enable=unused-import
 
 from tensorflow.contrib.util import loader
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 
 _stats_ops = loader.load_op_library(
@@ -84,8 +88,58 @@ class FertileStatsVariableSavable(saver.BaseSaverBuilder.SaveableObject):
           params=self.params.serialized_params_proto)
 
 
-def fertile_stats_variable(params, stats_config, name,
-                           container=None):
+class FertileStatsVariable(tracking.TrackableResource):
+  """A Fertile stats variable."""
+
+  def __init__(self, params, stats_config, name, container=None):
+    self._params = params
+    self._stats_config = stats_config
+    self._name = name
+    self._container = container
+    self._init_op = None
+    super(FertileStatsVariable, self).__init__()
+    self._resource_handle = self.create_resource()
+
+  def create_resource(self):
+    if context.executing_eagerly():
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      shared_name = "fertile_stats_variable_%d" % (ops.uid(),)
+    else:
+      shared_name = self._name
+    return gen_stats_ops.fertile_stats_resource_handle_op(
+        self._container, shared_name=shared_name, name=self._name)
+
+  def initialize(self):
+    return gen_stats_ops.create_fertile_stats_variable(
+        self.resource_handle,
+        self._stats_config,
+        params=self._params.serialized_params_proto)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_stats_ops.fertile_stats_is_initialized_op(self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {
+        "fertile_stats_variable":
+            functools.partial(
+                FertileStatsVariableSavable,
+                params=self._params,
+                stats_handle=self.resource_handle,
+                create_op=self.initializer)
+    }
+
+
+def fertile_stats_variable(params, stats_config, name, container=None):
   r"""Creates a stats object and returns a handle to it.
 
   Args:
@@ -98,17 +152,15 @@ def fertile_stats_variable(params, stats_config, name,
     A `Tensor` of type mutable `string`. The handle to the stats.
   """
   with ops.name_scope(name, "FertileStatsVariable") as name:
-    resource_handle = gen_stats_ops.fertile_stats_resource_handle_op(
-        container, shared_name=name, name=name)
-
-    create_op = gen_stats_ops.create_fertile_stats_variable(
-        resource_handle, stats_config,
-        params=params.serialized_params_proto)
-    is_initialized_op = gen_stats_ops.fertile_stats_is_initialized_op(
-        resource_handle)
+    fertile_stats_var = FertileStatsVariable(params, stats_config, name,
+                                             container)
+    resource_handle = fertile_stats_var.resource_handle
+    create_op = fertile_stats_var.initializer
+    is_initialized_op = fertile_stats_var.is_initialized()
     # Adds the variable to the savable list.
-    saveable = FertileStatsVariableSavable(params, resource_handle, create_op,
-                                           resource_handle.name)
+    saveable = (
+        fertile_stats_var._gather_saveables_for_checkpoint()[  # pylint: disable=protected-access
+            "fertile_stats_variable"](name=resource_handle.name))
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
     resources.register_resource(resource_handle, create_op, is_initialized_op)
     return resource_handle
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
index 3f24f58f03aac2ba6d368d7eccf8731f611a81b4..22b6f09d0cd88068f7bedabe7687920420a3028f 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
@@ -73,7 +73,16 @@ class SummaryFileWriter : public SummaryWriterInterface {
     e->set_step(global_step);
     e->set_wall_time(GetWallTime());
     Summary::Value* v = e->mutable_summary()->add_value();
-    t.AsProtoTensorContent(v->mutable_tensor());
+
+    if (t.dtype() == DT_STRING) {
+      // Treat DT_STRING specially, so that tensor_util.MakeNdarray in Python
+      // can convert the TensorProto to string-type numpy array. MakeNdarray
+      // does not work with strings encoded by AsProtoTensorContent() in
+      // tensor_content.
+      t.AsProtoField(v->mutable_tensor());
+    } else {
+      t.AsProtoTensorContent(v->mutable_tensor());
+    }
     v->set_tag(tag);
     if (!serialized_metadata.empty()) {
       v->mutable_metadata()->ParseFromString(serialized_metadata);
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
index cd3f712256f2293ed725745f8cbe48109856ef86..ffbfb9533e887e54b0f5bdfde11dadce21073a94 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
 
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -104,6 +105,23 @@ TEST_F(SummaryFileWriterTest, WriteTensor) {
                                   CHECK_EQ(e.summary().value_size(), 1);
                                   EXPECT_EQ(e.summary().value(0).tag(), "name");
                                 }));
+  TF_CHECK_OK(SummaryTestHelper(
+      "string_tensor_test",
+      [](SummaryWriterInterface* writer) {
+        Tensor hello(DT_STRING, TensorShape({}));
+        hello.scalar<string>()() = "hello";
+        TF_RETURN_IF_ERROR(writer->WriteTensor(
+            2, hello, "name", SummaryMetadata().SerializeAsString()));
+        TF_RETURN_IF_ERROR(writer->Flush());
+        return Status::OK();
+      },
+      [](const Event& e) {
+        EXPECT_EQ(e.step(), 2);
+        CHECK_EQ(e.summary().value_size(), 1);
+        EXPECT_EQ(e.summary().value(0).tag(), "name");
+        EXPECT_EQ(e.summary().value(0).tensor().dtype(), DT_STRING);
+        EXPECT_EQ(e.summary().value(0).tensor().string_val()[0], "hello");
+      }));
 }
 
 TEST_F(SummaryFileWriterTest, WriteScalar) {
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 9e8979bce49847fb41c6a3e2080971c042636106..784acce444a8d0c066f1b7ae6c1b5d7d65405549 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -29,6 +29,10 @@ load(
     "if_tensorrt",
 )
 
+exports_files(glob([
+    "test/testdata/*",
+]))
+
 tf_cuda_cc_test(
     name = "tensorrt_test_cc",
     size = "small",
@@ -312,16 +316,53 @@ tf_cuda_cc_test(
     ],
     deps = [
         ":trt_conversion",
+        "@com_google_googletest//:gtest",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:direct_session",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "convert_nodes_test",
+    size = "medium",
+    srcs = ["convert/convert_nodes_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_logging",
+        ":trt_conversion",
+        ":trt_plugins",
+        "@com_google_googletest//:gtest",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
         "@local_config_tensorrt//:nv_infer",
     ]),
 )
@@ -454,8 +495,9 @@ cuda_py_tests(
         "test/memory_alignment_test.py",
         "test/multi_connection_neighbor_engine_test.py",
         "test/neighboring_engine_test.py",
+        "test/quantization_test.py",
         "test/rank_two_test.py",
-        "test/unary_test.py",
+        "test/reshape_transpose_test.py",
         "test/vgg_block_nchw_test.py",
         "test/vgg_block_test.py",
     ],
@@ -471,6 +513,49 @@ cuda_py_tests(
     ],
 )
 
+cuda_py_tests(
+    name = "tf_trt_integration_test_no_oss",
+    srcs = [
+        "test/unary_test.py",
+    ],
+    additional_deps = [
+        ":tf_trt_integration_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_oss",  # TODO(b/117274186): re-enable in OSS after crash fixed
+        "no_pip",  # TODO(b/117274186): re-enable in OSS after crash fixed
+        "no_windows",
+        "nomac",
+    ],
+)
+
+cuda_py_test(
+    name = "quantization_mnist_test",
+    srcs = ["test/quantization_mnist_test.py"],
+    additional_deps = [
+        ":tf_trt_integration_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/keras:keras",
+        "//tensorflow/python/estimator:estimator",
+    ],
+    data = [
+        "test/testdata/checkpoint",
+        "test/testdata/model.ckpt-46900.data-00000-of-00001",
+        "test/testdata/model.ckpt-46900.index",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_pip",
+        "no_tap",  # It is not able to download the mnist data.
+        "no_windows",
+        "nomac",
+    ],
+)
+
 cc_library(
     name = "utils",
     srcs = ["convert/utils.cc"],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 7ad9bf22d352b7bdf7c7bc0fc839eef03101f2ab..ae211a93c3279ff1d6de2f9c9a4b849fc8cd578d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -81,59 +81,95 @@ std::vector<int> GetLoadedTensorRTVersion() {
   return {ver_major, ver_minor, ver_patch};
 }
 
-namespace {
+TrtCandidateSelector::TrtCandidateSelector(
+    const grappler::GraphProperties& graph_properties, int precision_mode)
+    : graph_properties_(graph_properties), precision_mode_(precision_mode) {}
 
-bool IsTensorRTCandidate(const tensorflow::Node* node) {
+Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
+  // TODO(laigd): move this set to TrtNodeValidator where it should belong.
   // LINT.IfChange
-  // TODO(jie): Segmentation shouldn't associated with op name.
-  //            Split it into a registration for each kernel.
   static const std::set<string> candidate_ops = {
-    "Identity",
-    "Snapshot",
-    "Const",
-    "Conv2D",
-    "MaxPool",
-    "BiasAdd",
-    "Relu",
-    "Add",
-    "Mul",
-    "Sub",
-    "Rsqrt",
-    "Pad",
-    "Mean",
-    "AvgPool",
-    "ConcatV2",
-    "DepthwiseConv2dNative",
-    "FusedBatchNorm",
-    "FusedBatchNormV2",
-    "Div",
-    "RealDiv",
-    "Rsqrt",
-    "Reciprocal",
-    "Exp",
-    "Log",
-    "Sqrt",
-    "Abs",
-    "Neg",
-#if NV_TENSORRT_MAJOR > 3
-    "MatMul",
-    "BatchMatMul",
-    "Softmax",
-    "Minimum",
-    "Maximum",
-    "TopKV2",
-    "Sum",
-    "Prod",
-    "Max",
-    "Min",
-#endif
-    // TODO(ben,jie): ...
+      "Identity",
+      "Snapshot",
+      "Const",
+      "Conv2D",
+      "MaxPool",
+      "BiasAdd",
+      "Relu",
+      "Sigmoid",
+      "Tanh",
+      "Add",
+      "Mul",
+      "Sub",
+      "Rsqrt",
+      "Pad",
+      "Mean",
+      "AvgPool",
+      "ConcatV2",
+      "DepthwiseConv2dNative",
+      "FusedBatchNorm",
+      "FusedBatchNormV2",
+      "Div",
+      "RealDiv",
+      "Rsqrt",
+      "Reciprocal",
+      "Exp",
+      "Log",
+      "Sqrt",
+      "Abs",
+      "Neg",
+      "Transpose",
+      "Reshape",
+      "MatMul",
+      "BatchMatMul",
+      "Softmax",
+      "Minimum",
+      "Maximum",
+      "TopKV2",
+      "Sum",
+      "Prod",
+      "Max",
+      "Min",
+      "Relu6",
+      "Square",
+      "ExpandDims",
+      "Squeeze",
   };
+  bool is_supported_op_type =
+      (candidate_ops.count(node->type_string()) ||
+       PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
+  static const std::set<string> quantize_ops = {
+      "QuantizeAndDequantizeV2",
+      "QuantizeAndDequantizeV3",
+      "FakeQuantWithMinMaxVars",
+      "FakeQuantWithMinMaxArgs",
+  };
+  // In INT8 mode, we will always apply the quantization ranges provided by
+  // these ops to the relevant tensors. This happens regardless of the value of
+  // use_calibration.
+  if (precision_mode_ == INT8MODE && quantize_ops.count(node->type_string())) {
+    is_supported_op_type = true;
+  }
   // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
-  return (candidate_ops.count(node->type_string()) ||
-          PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
+  if (!is_supported_op_type) {
+    return errors::Unimplemented("Op type ", node->type_string(),
+                                 " is not supported");
+  }
+
+  std::vector<const Edge*> input_edges;
+  TF_RETURN_IF_ERROR(node->input_edges(&input_edges));
+  std::vector<std::pair<const NodeDef*, int>> input_node_and_ports;
+  input_node_and_ports.reserve(input_edges.size());
+  for (const Edge* input_edge : input_edges) {
+    input_node_and_ports.emplace_back(&input_edge->src()->def(),
+                                      input_edge->src_output());
+  }
+  return validator_.ValidateNode(node->def(), input_node_and_ports,
+                                 graph_properties_);
 }
 
+namespace {
+
 tensorflow::Status BuildNodeMap(
     const tensorflow::Graph& graph,
     std::unordered_map<string, tensorflow::Node*>* node_map) {
@@ -152,7 +188,7 @@ tensorflow::Status BuildNodeMap(
 tensorflow::Status ConvertCalibGraphToInferGraph(
     const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph,
     bool is_dyn_op) {
-  VLOG(0) << "Starting Calib Conversion";
+  LOG(INFO) << "Starting Calib Conversion";
   infer_graph->CopyFrom(graph_def);
   auto trt_rm = TRTResourceManager::instance();
   auto calib_rm = trt_rm->getManager("TRTCalibration");
@@ -202,18 +238,19 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     const std::vector<string>& output_names, size_t max_batch_size,
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
     int precision_mode, int minimum_segment_size, bool is_dyn_op,
-    int max_cached_engines, std::vector<int> cached_engine_batches) {
+    int max_cached_engines, std::vector<int> cached_engine_batches,
+    bool use_calibration) {
   // Create GrapplerItem.
   tensorflow::grappler::GrapplerItem item;
   item.fetch = output_names;
   item.graph = graph_def;
 
-  // TODO(aaroey): we should have used single machine cluster like the
-  // following, but the problem is then wrap_conversion will depend on
-  // direct_session and cause double linking problems. To fix this we need to
-  // fix or get rid of the swig dependency. Here we use VirtualCluster
-  // as a work around, and we need to create a session to initialize the
-  // underlying device before calling this method.
+// TODO(aaroey): we should have used single machine cluster like the
+// following, but the problem is then wrap_conversion will depend on
+// direct_session and cause double linking problems. To fix this we need to
+// fix or get rid of the swig dependency. Here we use VirtualCluster
+// as a work around, and we need to create a session to initialize the
+// underlying device before calling this method.
 #if 0
   // Create single machine cluster. Note that this will create a session and
   // initialize the gpu devices.
@@ -246,7 +283,9 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 #endif
 
   // Create RewriterConfig.
-  tensorflow::RewriterConfig rw_cfg;
+  tensorflow::ConfigProto config_proto;
+  auto& rw_cfg =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   // TODO(aaroey): use only const folding and layout for the time being since
   // new optimizers break the graph for trt.
   rw_cfg.add_optimizers("constfold");
@@ -267,9 +306,10 @@ tensorflow::Status ConvertGraphDefToTensorRT(
       list->add_i(batch);
     }
   }
+  parameters["use_calibration"].set_b(use_calibration);
 
   // Run optimizer.
-  tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
+  tensorflow::grappler::MetaOptimizer meta_opt(nullptr, config_proto);
   TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, new_graph_def));
 
   if (VLOG_IS_ON(5)) {
@@ -415,7 +455,8 @@ tensorflow::Status GetEngineInfo(
                  << "but this shouldn't have happened";
     info->device = *segment_devices.begin();
   } else {
-    LOG(ERROR) << "Can't find a device placement for the op!";
+    VLOG(1) << "No device is assigned to the segment. "
+            << "A device will be assigned during graph execution (inference).";
   }
   return Status::OK();
 }
@@ -546,27 +587,38 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
       }
     }
   }
+  // We don't support segments with no inputs. Fall back to native TF here to
+  // avoid crash later. Constant folding should've folded the ops that make up
+  // these segments.
+  if (inputs.empty()) {
+    return tensorflow::errors::Internal(
+        "Segment has no inputs (possible "
+        "constfold failure)");
+  }
+
+  const bool calibrate_int8 =
+      (info.precision_mode == INT8MODE && info.use_calibration);
+  // Build the engine and get its serialized representation.
   string segment_string;
-  if (info.engine_type == EngineInfo::EngineType::TRTStatic ||
-      info.precision_mode == INT8MODE) {
+  if (info.engine_type == EngineInfo::EngineType::TRTStatic || calibrate_int8) {
     // Create static engine for fp32/fp16 mode, and test validity of the engine
-    // for int8 mode. We don't want engine to fail at the calibration time.
-    // So we are constructing a FP32 engine here to check its validity, and if
-    // it is a valid engine then we put the serialized graphdef to the op.
-    // Otherwise we skip node creation for this engine.
+    // for int8 calibration mode. We don't want engine to fail at the
+    // calibration time. So we are constructing a FP32 engine here to check its
+    // validity, and if it is a valid engine then we put the serialized graphdef
+    // to the op. Otherwise we skip node creation for this engine.
     Logger trt_logger;
     TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
     // TODO(sami): What happens if 1st dim is not batch?
     TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
-        info.segment_graph_def,
-        info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode,
+        info.segment_graph_def, calibrate_int8 ? FP32MODE : info.precision_mode,
         max_batch_size, info.max_workspace_size_bytes, input_shapes,
         &trt_logger, alloc, /*calibrator=*/nullptr, &engine,
+        info.use_calibration,
         /*convert_successfully=*/nullptr));
     TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
     segment_string =
         string((const char*)engine_data->data(), engine_data->size());
-    if (info.precision_mode == INT8MODE) {
+    if (calibrate_int8) {
       // See above comment about why not putting this inside the 'else' branch.
       segment_string = info.segment_graph_def.SerializeAsString();
     }
@@ -578,7 +630,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
   // conversion.
   string prec_string;
   TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string));
-  if (info.precision_mode == INT8MODE &&
+  if (info.precision_mode == INT8MODE && calibrate_int8 &&
       !TRTResourceManager::instance()->getManager("TRTCalibration")) {
     LOG(ERROR) << "Failed to construct calibration storage";
   }
@@ -614,6 +666,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
           .Attr("cached_engine_batches", {max_batch_size})
           .Attr("workspace_size_bytes", info.max_workspace_size_bytes)
           .Attr("precision_mode", prec_string)
+          .Attr("use_calibration", info.use_calibration)
           .Attr("OutT", out_types)
           .Finalize(&trt_node);
   if (!status.ok()) {
@@ -846,13 +899,18 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
   }
   segment_options.minimum_segment_size = params.minimum_segment_size;
   tensorflow::tensorrt::segment::SegmentNodesVector initial_segments;
+  TrtCandidateSelector candidate_selector(*params.graph_properties,
+                                          params.precision_mode);
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
-      &graph, IsTensorRTCandidate, InputEdgeValidator(*params.graph_properties),
-      OutputEdgeValidator(), segment_options, &initial_segments));
-  if (initial_segments.size() > 1) {
-    VLOG(0) << "MULTIPLE tensorrt candidate conversion: "
+      &graph,
+      std::bind(&TrtCandidateSelector::IsTensorRTCandidate, &candidate_selector,
+                std::placeholders::_1),
+      // Input validation is already done by TrtCandidateSelector, so we don't
+      // need to check the input edges.
+      [](const Edge* edge) { return true; }, OutputEdgeValidator(),
+      segment_options, &initial_segments));
+  LOG(INFO) << "Number of TensorRT candidate segments: "
             << initial_segments.size();
-  }
 
   // Get the EngineInfo for each segment.
   std::unordered_map<string, tensorflow::Node*> node_map;
@@ -878,13 +936,17 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
       continue;
     }
     curr_engine.precision_mode = params.precision_mode;
-    curr_engine.engine_type =
-        (params.is_dyn_op || params.precision_mode == INT8MODE
-             ? EngineInfo::EngineType::TRTDynamic
-             : EngineInfo::EngineType::TRTStatic);
+    if (params.use_calibration && params.precision_mode != INT8MODE) {
+      return errors::InvalidArgument(
+          "Calibration with FP32 or FP16 is not supported.");
+    }
+    curr_engine.engine_type = ((params.is_dyn_op || params.use_calibration)
+                                   ? EngineInfo::EngineType::TRTDynamic
+                                   : EngineInfo::EngineType::TRTStatic);
+    curr_engine.use_calibration = params.use_calibration;
     curr_engine.cached_engine_batches = params.cached_engine_batches;
     curr_engine.maximum_cached_engines = params.max_cached_engines;
-    StrAppend(&curr_engine.engine_name, "my_trt_op_", t);
+    StrAppend(&curr_engine.engine_name, "TRTEngineOp_", t);
     status = RegisterSegmentFunctionToFunctionLibrary(
         &graph, curr_engine.segment_graph_def, curr_engine.engine_name);
     if (!status.ok()) {
@@ -900,7 +962,7 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
     converted_segments.push_back(std::move(curr_segment));
 
     if (VLOG_IS_ON(8)) {
-      string fname = curr_engine.engine_name;
+      string fname = engine_segments.back().engine_name;
       StrAppend(&fname, ".pb");
       std::fstream f;
       f.open(fname.c_str(), std::fstream::out | std::fstream::binary);
@@ -945,9 +1007,9 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
                                 &graph, alloc.get(), &engine_nodes);
     // If status is ok, we successfully added the node to the graph and can
     // remove segment ops. Otherwise graph is not modified.
-    const string msg = StrCat("Engine ", engine.engine_name,
-                              " creation for segment ", i, ", composed of ",
-                              converted_segments.at(i).first.size(), " nodes");
+    string msg = StrCat("TensorRT node ", engine.engine_name,
+                        " added for segment ", i, " consisting of ",
+                        converted_segments.at(i).first.size(), " nodes");
     if (status.ok()) {
       LOG(INFO) << msg << " succeeded.";
       for (auto node_name : converted_segments.at(i).first) {
@@ -955,7 +1017,14 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
       }
     } else {
       // Graph is not modified.
-      LOG(WARNING) << msg << " failed: " << status << ". Skipping...";
+      LOG(WARNING) << msg << " failed: " << status << ". Fallback to TF...";
+    }
+    if (VLOG_IS_ON(1)) {
+      msg = "Segment consists of nodes: ";
+      for (const string& node_name : converted_segments.at(i).first) {
+        StrAppend(&msg, node_name, ", ");
+      }
+      VLOG(1) << msg;
     }
   }
   cudaSetDevice(old_cuda_device);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index 3525202369841fd0b76583cdd26de2247fcdfff3..1f39f56f6392ba33af3d74fec12c326ed4451cb6 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -31,6 +31,30 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
+// Helper class for the segmenter to determine whether given TF node is
+// supported by TRT.
+class TrtCandidateSelector {
+ public:
+  TrtCandidateSelector(const grappler::GraphProperties& graph_properties,
+                       int precision_mode);
+
+  // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added
+  // to TRT subgraph and later converted into TRT engine.
+  Status IsTensorRTCandidate(const tensorflow::Node* node);
+
+ private:
+  // The TF-TRT node converter used to verify whether individual node is
+  // supported. It will operate in validation-only mode.
+  TrtNodeValidator validator_;
+
+  // GraphProperties of the graph whose nodes are to be validated by
+  // IsTensorRTCandidate().
+  const grappler::GraphProperties& graph_properties_;
+
+  // Quantization ops are only converted when using quantized precisions.
+  const int precision_mode_;
+};
+
 struct ConversionParams {
   ConversionParams()
       : input_graph_def(nullptr),
@@ -43,6 +67,7 @@ struct ConversionParams {
         cluster(nullptr),
         is_dyn_op(false),
         fixed_input_size(true),
+        use_calibration(true),
         max_cached_engines(1) {}
   const tensorflow::GraphDef* input_graph_def;
   const std::vector<string>* output_names;
@@ -56,6 +81,7 @@ struct ConversionParams {
   bool is_dyn_op;  //  Whether to create engine on conversion or execution time
   bool fixed_input_size;   // Assume non-batch ranks of input tensors are fixed
   int max_cached_engines;  // maximum number of cached engines
+  bool use_calibration;
   std::vector<int> cached_engine_batches;  // list of cached engines
 };
 
@@ -75,7 +101,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
     int precision_mode = 1, int minimum_segment_size = 3,
     bool is_dyn_op = false, int max_cached_engines = 1,
-    std::vector<int> cached_engine_batches = {});
+    std::vector<int> cached_engine_batches = {}, bool use_calibration = true);
 
 // Method to call from optimization pass
 tensorflow::Status ConvertAfterShapes(ConversionParams& params);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
index 8146bed4b0541ca86fee5f9402f2d606cd012047..2d2bfeb192c1893824c7b30bfad593c62c203392 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
@@ -15,9 +15,14 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
 
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -33,6 +38,91 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
+// TODO(laigd): put this into some test utils file.
+void ExpectStatus(Status status, error::Code code = error::OK,
+                  const char* substr = nullptr) {
+  EXPECT_EQ(code, status.code())
+      << status << " vs expected error code \"" << error::Code_Name(code)
+      << "\" and message \"" << substr << "\"";
+  if (substr) {
+    EXPECT_THAT(status.error_message(), ::testing::HasSubstr(substr)) << status;
+  }
+}
+
+TEST(TrtCandidateSelector, Basics) {
+  // Create a graph containing both TRT-compatible and TRT-incompatible nodes
+  // and use it to test TrtCandidateSelector::IsTensorRTCandidate().
+  const std::vector<int32> input_shape_array{2, 2};
+  TensorShape input_shape;
+  TF_EXPECT_OK(TensorShapeUtils::MakeShape(input_shape_array, &input_shape));
+
+  Scope s = Scope::NewRootScope();
+  ops::Placeholder::Attrs feed_attrs;
+  TF_EXPECT_OK(
+      TensorShapeUtils::MakeShape(input_shape_array, &feed_attrs.shape_));
+
+  // Compatible input.
+  auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT, feed_attrs);
+  auto const_1 = ops::Const(s.WithOpName("const_1"), 1.0f, input_shape);
+
+  // Compatible MatMul.
+  auto matmul = ops::MatMul(s.WithOpName("matmul"), feed, const_1);
+
+  // Incompatible MatMul.
+  ops::MatMul::Attrs matmul_attrs;
+  matmul_attrs.transpose_a_ = true;
+  auto incompatible_matmul = ops::MatMul(s.WithOpName("incompatible_matmul"),
+                                         feed, const_1, matmul_attrs);
+
+  // Unsupported op.
+  auto unsupported_op = ops::Sin(s.WithOpName("sin"), feed);
+
+  // Incompatible input.
+  auto incompatible_feed = ops::Placeholder(s.WithOpName("feed"), DT_DOUBLE);
+  auto const_2 = ops::Const(s.WithOpName("const_2"), 1.0, input_shape);
+  // Compatible op with incompatible input.
+  auto matmul_with_incompatible_input =
+      ops::MatMul(s.WithOpName("matmul_with_incompatible_input"),
+                  incompatible_feed, const_2);
+
+  // Quantize ops.
+  auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f);
+  auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("quantize"), feed,
+                                               quantize_attrs);
+
+  // Get GrapplerItem and GraphProperties.
+  grappler::GrapplerItem item;
+  TF_EXPECT_OK(s.ToGraphDef(&item.graph));
+  Tensor feed_tensor(DT_FLOAT, input_shape);
+  item.feed.push_back(std::make_pair("feed", feed_tensor));
+  grappler::GraphProperties graph_properties(item);
+  TF_EXPECT_OK(graph_properties.InferStatically(true));
+
+  for (const int precision_mode : {FP32MODE, INT8MODE}) {
+    TrtCandidateSelector selector(graph_properties, precision_mode);
+    TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node()));
+    ExpectStatus(
+        selector.IsTensorRTCandidate(incompatible_matmul.operation.node()),
+        error::INVALID_ARGUMENT,
+        "transpose_a is not supported for TensorRT FullyConnected "
+        "(op: MatMul), at: incompatible_matmul");
+    ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()),
+                 error::UNIMPLEMENTED, "Op type Sin is not supported");
+    ExpectStatus(
+        selector.IsTensorRTCandidate(
+            matmul_with_incompatible_input.operation.node()),
+        error::INTERNAL,
+        "Failed to convert input with index 0 to a TRT_TensorOrWeights");
+    if (precision_mode == INT8MODE) {
+      TF_EXPECT_OK(selector.IsTensorRTCandidate(quantize.operation.node()));
+    } else {
+      ExpectStatus(selector.IsTensorRTCandidate(quantize.operation.node()),
+                   error::UNIMPLEMENTED,
+                   "Op type FakeQuantWithMinMaxArgs is not supported");
+    }
+  }
+}
+
 class FakeCluster : public grappler::Cluster {
  public:
   FakeCluster() : Cluster(0) {}
@@ -48,8 +138,7 @@ class FakeCluster : public grappler::Cluster {
   }
   Status Run(const GraphDef& graph_def,
              const std::vector<std::pair<string, Tensor>>& feed,
-             const std::vector<string>& fetch,
-             RunMetadata* metadata) override {
+             const std::vector<string>& fetch, RunMetadata* metadata) override {
     return Status::OK();
   }
 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 0ce891782e571200aad0db86d81a03b0072d3a3b..777a80bbc4da7a260cf85d0a7bc5ec16f4cd3cab 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -17,12 +17,10 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstring>
-#include <list>
 #include <map>
 #include <memory>
 #include <set>
 #include <unordered_map>
-#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -56,10 +54,10 @@ limitations under the License.
 // would work!
 #define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
 
-#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                               \
-  do {                                                                   \
-    return tensorflow::errors::Internal(                                 \
-        "TFTRT::", __FUNCTION__, "failed to add TRT layer, at: ", node); \
+#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                                \
+  do {                                                                    \
+    return tensorflow::errors::Internal(                                  \
+        "TFTRT::", __FUNCTION__, " failed to add TRT layer, at: ", node); \
   } while (0)
 
 #define TFTRT_RETURN_ERROR_IF_FALSE(status, node) \
@@ -87,25 +85,22 @@ using ::tensorflow::str_util::Split;
 using ::tensorflow::strings::StrAppend;
 using ::tensorflow::strings::StrCat;
 
-namespace {
-
 inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
                                        nvinfer1::DataType* trt_dtype) {
   switch (tf_dtype) {
     case tensorflow::DataType::DT_FLOAT:
       *trt_dtype = nvinfer1::DataType::kFLOAT;
       break;
+    // TODO(aaroey): this should be DT_QINT8 which is not a well supported type.
     case tensorflow::DataType::DT_INT8:
       *trt_dtype = nvinfer1::DataType::kINT8;
       break;
     case tensorflow::DataType::DT_HALF:
       *trt_dtype = nvinfer1::DataType::kHALF;
       break;
-#if NV_TENSORRT_MAJOR > 3
     case tensorflow::DataType::DT_INT32:
       *trt_dtype = nvinfer1::DataType::kINT32;
       break;
-#endif
     default:
       return tensorflow::errors::InvalidArgument(
           "Unsupported data type ", tensorflow::DataTypeString(tf_dtype));
@@ -113,76 +108,153 @@ inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
   return tensorflow::Status::OK();
 }
 
-void GetInputProperties(const grappler::GraphProperties& graph_properties,
-                        const Node* outside_node, const int out_port,
-                        PartialTensorShape* shape,
-                        tensorflow::DataType* dtype) {
-  if (graph_properties.HasOutputProperties(outside_node->name())) {
-    auto output_params =
-        graph_properties.GetOutputProperties(outside_node->name());
+template <typename TensorShapeType>
+inline nvinfer1::Dims TensorShapeToTrtDims(const TensorShapeType& shape,
+                                           bool ignore_first_dim) {
+  nvinfer1::Dims trt_dims;
+  const int offset = (ignore_first_dim ? 1 : 0);
+  for (int i = offset; i < shape.dims(); i++) {
+    trt_dims.d[i - offset] = shape.dim_size(i);
+  }
+  trt_dims.nbDims = shape.dims() - offset;
+  return trt_dims;
+}
+
+Status TensorShapeArrayToTrtDims(const std::vector<int>& shape,
+                                 nvinfer1::Dims* out,
+                                 bool ignore_first_dim = false) {
+  PartialTensorShape tensor_shape;
+  TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(shape, &tensor_shape));
+  *out = TensorShapeToTrtDims(tensor_shape, ignore_first_dim);
+  return tensorflow::Status::OK();
+}
+
+void GetOutputProperties(const grappler::GraphProperties& graph_properties,
+                         const Node* node, const int out_port,
+                         PartialTensorShape* shape,
+                         tensorflow::DataType* dtype) {
+  if (graph_properties.HasOutputProperties(node->name())) {
+    auto output_params = graph_properties.GetOutputProperties(node->name());
     auto out_shape = output_params.at(out_port);
     *dtype = out_shape.dtype();
     *shape = out_shape.shape();
   } else {
-    VLOG(0) << "Unknown output shape" << outside_node->name();
-    *dtype = outside_node->output_type(out_port);
+    LOG(INFO) << "Unknown output shape" << node->name();
+    *dtype = node->output_type(out_port);
   }
 }
 
-void GetOutputProperties(const grappler::GraphProperties& graph_properties,
-                         const Node* outside_node, const int in_port,
-                         PartialTensorShape* shape,
-                         tensorflow::DataType* dtype) {
-  if (graph_properties.HasInputProperties(outside_node->name())) {
-    auto input_params =
-        graph_properties.GetInputProperties(outside_node->name());
+void GetInputProperties(const grappler::GraphProperties& graph_properties,
+                        const Node* node, const int in_port,
+                        PartialTensorShape* shape,
+                        tensorflow::DataType* dtype) {
+  if (graph_properties.HasInputProperties(node->name())) {
+    auto input_params = graph_properties.GetInputProperties(node->name());
     auto in_shape = input_params.at(in_port);
     *dtype = in_shape.dtype();
     *shape = in_shape.shape();
   } else {
-    *dtype = outside_node->input_type(in_port);
+    *dtype = node->input_type(in_port);
   }
 }
 
-tensorflow::Status ValidateInputProperties(const PartialTensorShape& shape,
-                                           const tensorflow::DataType dtype,
-                                           nvinfer1::DataType* trt_dtype) {
-  // TODO(aaroey): some of these checks also apply to IsTensorRTCandidate(), so
-  // put them there instead.
+Status ValidateTensorProperties(const string& producer_node_type,
+                                const tensorflow::DataType dtype,
+                                const PartialTensorShape& shape,
+                                bool validation_only,
+                                nvinfer1::DataType* trt_dtype,
+                                nvinfer1::Dims* trt_dims, int* batch_size) {
+  // Convert data type.
   TF_RETURN_IF_ERROR(ConvertDType(dtype, trt_dtype));
+
+  // Convert shape.
   if (shape.dims() < 0) {
-    return tensorflow::errors::InvalidArgument("Input tensor rank is unknown.");
+    return errors::InvalidArgument("Input tensor rank is unknown.");
+  }
+  if (shape.dims() > nvinfer1::Dims::MAX_DIMS + 1) {  // +1 for batch dim
+    return errors::OutOfRange("Input tensor rank is greater than ",
+                              nvinfer1::Dims::MAX_DIMS + 1);
   }
-  if (shape.dims() > 9) {
-    return tensorflow::errors::OutOfRange(
-        "Input tensor rank is greater than 8.");
+  if (producer_node_type != "Const" && shape.dims() < 2) {
+    return errors::InvalidArgument(
+        "Input tensor with rank<2 is not supported since the first dimension "
+        "is treated as batch dimension by TRT");
   }
+  *trt_dims = TensorShapeToTrtDims(shape, /*ignore_first_dim=*/true);
+  *batch_size = shape.dim_size(0);
+
+  if (validation_only) return Status::OK();
+  // Following are validations at runtime.
+
   for (int d = 1; d < shape.dims(); ++d) {
     if (shape.dim_size(d) < 0) {
-      return tensorflow::errors::InvalidArgument(
+      return errors::InvalidArgument(
           "Input tensor with shape ", shape.DebugString(),
-          " has an unknown non-batch dimemension at dim ", d);
+          " has an unknown non-batch dimension at dim ", d);
     }
   }
   return Status::OK();
 }
 
+string DebugString(const nvinfer1::DimensionType type) {
+  switch (type) {
+    case nvinfer1::DimensionType::kSPATIAL:
+      return "kSPATIAL";
+    case nvinfer1::DimensionType::kCHANNEL:
+      return "kCHANNEL";
+    case nvinfer1::DimensionType::kINDEX:
+      return "kINDEX";
+    case nvinfer1::DimensionType::kSEQUENCE:
+      return "kSEQUENCE";
+    default:
+      return StrCat(static_cast<int>(type), "=unknown");
+  }
+}
+
+string DebugString(const nvinfer1::DataType trt_dtype) {
+  switch (trt_dtype) {
+    case nvinfer1::DataType::kFLOAT:
+      return "kFLOAT";
+    case nvinfer1::DataType::kHALF:
+      return "kHALF";
+    case nvinfer1::DataType::kINT8:
+      return "kINT8";
+    case nvinfer1::DataType::kINT32:
+      return "kINT32";
+    default:
+      return "Invalid TRT data type";
+  }
+}
+
 string DebugString(const nvinfer1::Dims& dims) {
   string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d=");
-  for (int i = 0; i < nvinfer1::Dims::MAX_DIMS; ++i) {
-    StrAppend(&out, dims.d[i], ",");
+  for (int i = 0; i < dims.nbDims; ++i) {
+    StrAppend(&out, dims.d[i], "[", DebugString(dims.type[i]), "],");
+  }
+  StrAppend(&out, ")");
+  return out;
+}
+
+string DebugString(const nvinfer1::Permutation& permutation, int len) {
+  string out = "nvinfer1::Permutation(";
+  for (int i = 0; i < len; ++i) {
+    StrAppend(&out, permutation.order[i], ",");
   }
   StrAppend(&out, ")");
   return out;
 }
 
-// Return whether or not the broadcast is feasible;
-bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l,
-                               const bool operand_l_is_tensor,
-                               const nvinfer1::Dims& operand_r,
-                               const bool operand_r_is_tensor,
-                               nvinfer1::Dims* operand_l_new_shape,
-                               nvinfer1::Dims* operand_r_new_shape) {
+string DebugString(const nvinfer1::ITensor& tensor) {
+  return StrCat("nvinfer1::ITensor(@", reinterpret_cast<uintptr_t>(&tensor),
+                ", name=", tensor.getName(),
+                ", dtype=", DebugString(tensor.getType()),
+                ", dims=", DebugString(tensor.getDimensions()), ")");
+}
+
+Status Converter::GetTrtBroadcastShape(
+    const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r,
+    nvinfer1::Dims* operand_l_new_dims,
+    nvinfer1::Dims* operand_r_new_dims) const {
   // ***************************************************************************
   // TensorRT Elementwise op supports broadcast but requires both tensor to be
   // of Identical rank
@@ -207,52 +279,59 @@ bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l,
   // -> T: 1 1 1 -1 3 5 1
   // -> W: 1 1 1  1 3 5 1
   // ***************************************************************************
-  const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
-  const size_t element_size = sizeof(operand_l.d[0]);
-
-  // fill in dimensions
-  int l_s[max_nb_dims];
-  std::fill(l_s, l_s + max_nb_dims, 1);
-  int l_d = operand_l_is_tensor ? operand_l.nbDims + 1 : operand_l.nbDims;
-  int r_s[max_nb_dims];
-  std::fill(r_s, r_s + max_nb_dims, 1);
-  int r_d = operand_r_is_tensor ? operand_r.nbDims + 1 : operand_r.nbDims;
-
-  int max_d = std::max(l_d, r_d);
-  std::memcpy(l_s + max_d - operand_l.nbDims, operand_l.d,
-              operand_l.nbDims * element_size);
-  std::memcpy(r_s + max_d - operand_r.nbDims, operand_r.d,
-              operand_r.nbDims * element_size);
-
-  // set -1 for batch dimension, since batch size is not supposed to be
-  // broadcasted
-  if (operand_l_is_tensor) {
-    if (max_d != l_d) {  // if broadcast beyond batch dimension, fail
-      return false;
-    }
-    l_s[0] = -1;
-  }
-  if (operand_r_is_tensor) {
-    if (max_d != r_d) {  // if broadcast beyond batch dimension, fail
-      return false;
-    }
-    r_s[0] = -1;
+  if (!operand_l.is_tensor() && !operand_r.is_tensor()) {
+    return errors::InvalidArgument(
+        "Broadcasting requires at least one of the operands be tensors");
   }
 
-  // compare broadcast feasibility
-  for (int i = max_d - 1; i >= 0; i--) {
-    if ((l_s[i] != r_s[i]) && (l_s[i] != 1) && (r_s[i] != 1)) {
-      return false;
+  const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
+  auto compute_output_dims =
+      [max_nb_dims](const TRT_TensorOrWeights& input, int broadcast_num_dims,
+                    int* output_dims_array, nvinfer1::Dims* output_dims) {
+        const nvinfer1::Dims input_dims = input.GetTrtDims();
+        std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
+        std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
+                  output_dims_array + broadcast_num_dims - input_dims.nbDims);
+        if (input.is_tensor()) {
+          const int true_input_dims = input_dims.nbDims + 1;
+          if (true_input_dims < broadcast_num_dims) {
+            return errors::InvalidArgument(
+                "Broadcasting beyond batch dimension is not supported ",
+                "(tensor #dims ", true_input_dims, " vs broadcast #dims ",
+                broadcast_num_dims, ")");
+          }
+          // Set the batch dimension to -1, since batch size is not supposed to
+          // be broadcasted.
+          output_dims_array[0] = -1;
+        }
+        // Copy to output dimensions (stripping the batch dimension).
+        output_dims->nbDims = broadcast_num_dims - 1;
+        std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims,
+                  output_dims->d);
+        return Status::OK();
+      };
+
+  // Compute the output dimensions.
+  const int broadcast_num_dims =
+      std::max(operand_l.GetTrtDims().nbDims + (operand_l.is_tensor() ? 1 : 0),
+               operand_r.GetTrtDims().nbDims + (operand_r.is_tensor() ? 1 : 0));
+  int output_l[max_nb_dims], output_r[max_nb_dims];
+  TF_RETURN_IF_ERROR(compute_output_dims(operand_l, broadcast_num_dims,
+                                         output_l, operand_l_new_dims));
+  TF_RETURN_IF_ERROR(compute_output_dims(operand_r, broadcast_num_dims,
+                                         output_r, operand_r_new_dims));
+
+  // Compare broadcast feasibility
+  for (int i = 0; i < broadcast_num_dims; ++i) {
+    if ((output_l[i] != output_r[i]) && (output_l[i] != 1) &&
+        (output_r[i] != 1)) {
+      return errors::InvalidArgument(
+          "Infeasible broadcast scheme (", "batch_dim: ", output_l[0], ", ",
+          DebugString(*operand_l_new_dims), " vs ", "batch_dim: ", output_r[0],
+          ", ", DebugString(*operand_r_new_dims), ")");
     }
   }
-
-  // output new TensorRT Dimension (stripping the batch dimension)
-  operand_l_new_shape->nbDims = max_d - 1;
-  std::memcpy(operand_l_new_shape->d, l_s + 1, (max_d - 1) * element_size);
-  operand_r_new_shape->nbDims = max_d - 1;
-  std::memcpy(operand_r_new_shape->d, r_s + 1, (max_d - 1) * element_size);
-
-  return true;
+  return Status::OK();
 }
 
 inline bool DimsEqual(const nvinfer1::Dims& dim_l,
@@ -268,7 +347,7 @@ inline bool DimsEqual(const nvinfer1::Dims& dim_l,
   return true;
 }
 
-inline nvinfer1::Dims GetTensorShape(const tensorflow::Tensor& tensor) {
+inline nvinfer1::Dims GetTrtDimsForTensor(const tensorflow::Tensor& tensor) {
   nvinfer1::Dims dims;
   dims.nbDims = tensor.dims();
   for (int i = 0; i < dims.nbDims; i++) {
@@ -277,11 +356,22 @@ inline nvinfer1::Dims GetTensorShape(const tensorflow::Tensor& tensor) {
   return dims;
 }
 
-inline int64_t GetShapeSize(const nvinfer1::Dims& shape) {
-  // Returns total number of elements in shape
+inline bool HasStaticShape(const nvinfer1::Dims& dims) {
+  if (dims.nbDims < 0) return false;
+  for (int d = 0; d < dims.nbDims; ++d) {
+    if (dims.d[d] < 0) return false;
+  }
+  return true;
+}
+
+// Returns total number of elements in dims. Returning 0 means either some dim
+// is 0 or the number of dims is 0.
+// Note that for TF scalar constant, we always convert to dims [1].
+int64_t TrtDimsNumElements(const nvinfer1::Dims& dims) {
+  if (dims.nbDims == 0) return 0;
   int64_t count = 1;
-  for (int d = 0; d < shape.nbDims; ++d) {
-    count *= shape.d[d];
+  for (int d = 0; d < dims.nbDims; ++d) {
+    count *= dims.d[d];
   }
   return count;
 }
@@ -320,133 +410,149 @@ string GetCommonNameScope(const string& op_name_a, const string& op_name_b) {
   return op_name_a.substr(0, last_scope_separator);
 }
 
-// Class to convert TF weight to TRT weight.
-class TRT_ShapedWeights {
- public:
-  TRT_ShapedWeights(tensorflow::DataType type, const void* values,
-                    nvinfer1::Dims shape)
-      : shape_(shape), type_(type), values_(values), empty_weight_flag_(false) {
-    // Note: this->shape.type[] is not used
-  }
+TRT_ShapedWeights::TRT_ShapedWeights(DataType type) : type_(type) {
+  shape_.nbDims = 0;
+}
 
-  explicit TRT_ShapedWeights(tensorflow::DataType type)
-      : shape_(), type_(type), values_(nullptr), empty_weight_flag_(true) {}
+TRT_ShapedWeights::TRT_ShapedWeights(DataType type, nvinfer1::Dims dims,
+                                     Tensor tensor)
+    : shape_(dims), type_(type), tensor_(tensor) {}
 
-  // TODO(aaroey): use rvalue reference.
-  TRT_ShapedWeights(const TRT_ShapedWeights& rhs)
-      : shape_(rhs.shape_),
-        type_(rhs.type_),
-        values_(rhs.values_),
-        empty_weight_flag_(rhs.empty_weight_flag_) {}
+TRT_ShapedWeights::TRT_ShapedWeights(const TRT_ShapedWeights& rhs)
+    : shape_(rhs.shape_), type_(rhs.type_), tensor_(rhs.tensor_) {}
 
-  // TODO(aaroey): use GetShapeSize() instead.
-  int64_t count() const {
-    int64_t c = 1;
-    for (int i = 0; i < shape_.nbDims; i++) c *= shape_.d[i];
-    return c;
-  }
+int64_t TRT_ShapedWeights::count() const { return TrtDimsNumElements(shape_); }
 
-  nvinfer1::Weights GetWeightsForTRT() const {
-    nvinfer1::DataType trt_type(nvinfer1::DataType::kFLOAT);
-    TF_CHECK_OK(ConvertDType(type_, &trt_type));
-    if (empty_weight_flag_) return nvinfer1::Weights{trt_type, nullptr, 0};
+nvinfer1::Weights TRT_ShapedWeights::GetTrtWeights() const {
+  nvinfer1::DataType trt_type(nvinfer1::DataType::kFLOAT);
+  TF_CHECK_OK(ConvertDType(type_, &trt_type));
+  return nvinfer1::Weights{trt_type, GetValues(), count()};
+}
 
-    // Note: this->shape.type[] is not used
-    return nvinfer1::Weights{trt_type, GetValues(), GetShapeSize(shape_)};
-  }
+size_t TRT_ShapedWeights::size_bytes() const {
+  return this->count() * tensorflow::DataTypeSize(this->type_);
+}
 
-  const void* GetValues() const { return values_; }
+string TRT_ShapedWeights::DebugString() const {
+  return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_),
+                ", type=", DataTypeString(type_),
+                ", values=", reinterpret_cast<uintptr_t>(GetValues()), ")");
+}
 
-  // TODO(aaroey): get rid of this method.
-  void SetValues(const void* values) { values_ = values; }
+// A fake ITensor implementation used to check whether the TF-TRT converter can
+// handle specific node. We only need shape and type information, and the
+// converter won't (and shouldn't) use this to build the TRT network.
+class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor {
+ public:
+  SimpleITensor(nvinfer1::DataType trt_dtype, const nvinfer1::Dims& trt_dims)
+      : trt_dtype_(trt_dtype), trt_dims_(trt_dims) {}
 
-  size_t size_bytes() const {
-    int type_size = tensorflow::DataTypeSize(this->type_);
-    return this->count() * type_size;
-  }
+  void setName(const char* name) override {}
 
-  // Default converter
-  operator nvinfer1::Weights() const { return GetWeightsForTRT(); }
+  const char* getName() const override { return ""; }
 
-  string DebugString() const {
-    return StrCat(
-        "TRT_ShapedWeights(shape=", convert::DebugString(shape_), ", type=",
-        type_, ", values=", reinterpret_cast<uintptr_t>(values_),
-        ", empty_weight_flag=", empty_weight_flag_, ")");
+  void setDimensions(nvinfer1::Dims dimensions) override {
+    trt_dims_ = dimensions;
   }
 
-  // TODO(aaroey): make these private.
-  nvinfer1::Dims shape_;
-  tensorflow::DataType type_;
+  nvinfer1::Dims getDimensions() const override { return trt_dims_; }
 
- private:
-  // TODO(aaroey): this should not be const as it's always from TRTWeightStore.
-  const void* values_;
-  bool empty_weight_flag_;
-};
+  void setType(nvinfer1::DataType trt_dtype) override {
+    trt_dtype_ = trt_dtype;
+  }
 
-class TRT_TensorOrWeights {
- public:
-  explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor)
-      : tensor_(tensor), weights_(DT_FLOAT), variant_(TRT_NODE_TENSOR) {}
+  nvinfer1::DataType getType() const override { return trt_dtype_; }
 
-  explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
-      : tensor_(nullptr), weights_(weights), variant_(TRT_NODE_WEIGHTS) {}
+  bool isNetworkInput() const override { return false; }
 
-  // TODO(aaroey): use rvalue reference.
-  TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs)
-      : tensor_(rhs.tensor_), weights_(rhs.weights_), variant_(rhs.variant_) {}
+  bool isNetworkOutput() const override { return false; }
 
-  ~TRT_TensorOrWeights() {}
+  void setBroadcastAcrossBatch(bool broadcastAcrossBatch) override {}
 
-  bool is_tensor() const { return variant_ == TRT_NODE_TENSOR; }
-  bool is_weights() const { return variant_ == TRT_NODE_WEIGHTS; }
+  bool getBroadcastAcrossBatch() const override { return false; }
 
-  nvinfer1::ITensor* tensor() {
-    CHECK(is_tensor());
-    return tensor_;
+  nvinfer1::TensorLocation getLocation() const override {
+    // This is arbitrary, since we don't use it.
+    return nvinfer1::TensorLocation::kDEVICE;
   }
 
-  const nvinfer1::ITensor* tensor() const {
-    CHECK(is_tensor());
-    return tensor_;
-  }
+  void setLocation(nvinfer1::TensorLocation location) override {}
 
-  TRT_ShapedWeights& weights() {
-    CHECK(is_weights());
-    return weights_;
-  }
+#if NV_TENSORRT_MAJOR >= 5
+  bool setDynamicRange(float min, float max) override { return true; }
 
-  const TRT_ShapedWeights& weights() const {
-    CHECK(is_weights());
-    return weights_;
-  }
+  float getDynamicRange() const override { return 0; }
+#endif
 
-  nvinfer1::Dims shape() const {
-    if (is_tensor()) {
-      return tensor()->getDimensions();
-    } else {
-      return weights().shape_;
-    }
-  }
+ private:
+  nvinfer1::DataType trt_dtype_;
+  nvinfer1::Dims trt_dims_;
+};
 
-  string DebugString() const {
-    string output = "TRT_TensorOrWeights(type=";
-    if (is_tensor()) {
-      StrAppend(&output, "tensor @", reinterpret_cast<uintptr_t>(tensor_),
-                ", shape=", convert::DebugString(tensor_->getDimensions()));
-    } else {
-      StrAppend(&output, "weights=", weights_.DebugString());
-    }
-    StrAppend(&output, ")");
-    return output;
+TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::ITensor* tensor,
+                                         int batch_size)
+    : tensor_(tensor),
+      batch_size_(batch_size),
+      initialized_(true),
+      is_tensor_(true) {}
+
+TRT_TensorOrWeights::TRT_TensorOrWeights(nvinfer1::DataType trt_dtype,
+                                         const nvinfer1::Dims& trt_dims,
+                                         int batch_size)
+    : simple_itensor_(new SimpleITensor(trt_dtype, trt_dims)),
+      batch_size_(batch_size),
+      initialized_(true),
+      is_tensor_(true) {}
+
+TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_ShapedWeights& weights)
+    : weights_(weights), initialized_(true), is_tensor_(false) {}
+
+TRT_TensorOrWeights::TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs)
+    : tensor_(rhs.tensor_),
+      simple_itensor_(rhs.simple_itensor_),
+      batch_size_(rhs.batch_size_),
+      weights_(rhs.weights_),
+      initialized_(rhs.initialized_),
+      is_tensor_(rhs.is_tensor_) {}
+
+void TRT_TensorOrWeights::operator=(const TRT_TensorOrWeights& rhs) {
+  tensor_ = rhs.tensor_;
+  simple_itensor_ = rhs.simple_itensor_;
+  batch_size_ = rhs.batch_size_;
+  weights_ = rhs.weights_;
+  initialized_ = rhs.initialized_;
+  is_tensor_ = rhs.is_tensor_;
+}
+
+nvinfer1::ITensor* TRT_TensorOrWeights::tensor() {
+  CHECK(is_tensor());
+  return tensor_ == nullptr ? simple_itensor_.get() : tensor_;
+}
+
+const nvinfer1::ITensor* TRT_TensorOrWeights::tensor() const {
+  CHECK(is_tensor());
+  return tensor_ == nullptr ? simple_itensor_.get() : tensor_;
+}
+
+nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const {
+  if (is_tensor()) {
+    return tensor()->getDimensions();
+  } else {
+    return weights().shape_;
   }
+}
 
- private:
-  nvinfer1::ITensor* tensor_;
-  TRT_ShapedWeights weights_;
-  enum { TRT_NODE_TENSOR, TRT_NODE_WEIGHTS } variant_;
-};
+string TRT_TensorOrWeights::DebugString() const {
+  string output = "TRT_TensorOrWeights(type=";
+  if (is_tensor()) {
+    StrAppend(&output, "tensor=", convert::DebugString(*tensor()),
+              ", batch_size=", batch_size_);
+  } else {
+    StrAppend(&output, "weights=", weights_.DebugString());
+  }
+  StrAppend(&output, ")");
+  return output;
+}
 
 class TFAttrs {
  public:
@@ -503,12 +609,6 @@ std::vector<float> TFAttrs::get<std::vector<float>>(const string& key) const {
   return std::vector<float>(attr.begin(), attr.end());
 }
 
-template <>
-std::vector<string> TFAttrs::get<std::vector<string>>(const string& key) const {
-  auto attr = this->at(key)->list().s();
-  return std::vector<string>(attr.begin(), attr.end());
-}
-
 template <>
 nvinfer1::DataType TFAttrs::get<nvinfer1::DataType>(const string& key) const {
   nvinfer1::DataType trt_dtype(nvinfer1::DataType::kFLOAT);
@@ -606,11 +706,10 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
   // TRT requires GKcRS, while TF depthwise has RSCK where c=1, C=G
   const int c = iweights.shape_.d[2] / num_groups;
   const int k = iweights.shape_.d[3] * num_groups;
-  VLOG(2) << "num_groups: " << num_groups
-          << "c" << iweights.shape_.d[2] << " then " << c
-          << "k" << iweights.shape_.d[3] << " then " << k
-          << "r" << iweights.shape_.d[0] << " then " << r
-          << "s" << iweights.shape_.d[1] << " then " << s;
+  VLOG(2) << "num_groups: " << num_groups << "c" << iweights.shape_.d[2]
+          << " then " << c << "k" << iweights.shape_.d[3] << " then " << k
+          << "r" << iweights.shape_.d[0] << " then " << r << "s"
+          << iweights.shape_.d[1] << " then " << s;
   oweights->shape_.d[0] = k / num_groups;
   oweights->shape_.d[1] = c * num_groups;
   oweights->shape_.d[2] = r;
@@ -640,182 +739,515 @@ void ReorderRSCKToKCRS(const TRT_ShapedWeights& iweights,
   }
 }
 
-class Converter;
-
-using OpConverter =
-    std::function<tensorflow::Status(Converter&, const tensorflow::NodeDef&,
-                                     const std::vector<TRT_TensorOrWeights>&,
-                                     std::vector<TRT_TensorOrWeights>*)>;
+TRT_ShapedWeights TrtWeightStore::GetTempWeights(tensorflow::DataType type,
+                                                 const nvinfer1::Dims& dims) {
+  TensorShape shape;
+  // TODO(laigd): make it return a status.
+  TF_CHECK_OK(TensorShapeUtils::MakeShape(dims.d, dims.nbDims, &shape));
+  // TODO(jie): check weights size_bytes. 0 means type error
+  Tensor tensor(type, shape);
+  TRT_ShapedWeights weights(type, dims, tensor);
+  store_.emplace_back(std::move(tensor));
+  return weights;
+}
 
-class Converter {
- public:
-  explicit Converter(nvinfer1::INetworkDefinition* trt_network,
-                     TRTWeightStore* ws, bool fp16)
-      : trt_network_(trt_network), weight_store_(ws), fp16_(fp16) {
-    this->register_op_converters();
+TrtNodeValidator::TrtNodeValidator() { RegisterOpValidators(); }
+
+Status TrtNodeValidator::ConvertToTensorOrWeights(
+    const NodeDef& node_def, int output_port,
+    const grappler::GraphProperties& graph_properties,
+    TRT_TensorOrWeights* tensor_or_weights) {
+  if (node_def.op() == "Const") {
+    if (output_port != 0) {
+      return errors::InvalidArgument("Const node should only have one output.");
+    }
+    // The output of the conversion will be used as input to other nodes to
+    // determine whether TRT supports those nodes. If it cannot convert the
+    // Const, it's very likely we cannot treat it as a tensor and make it an
+    // input to the TRT network, since TRT removes the first dimension and
+    // treats it as batch size. Also, it's not likely that the converter can
+    // support the op, and performance may suffer even if it can, so we just
+    // simply return error if the conversion fails.
+    std::vector<TRT_TensorOrWeights> inputs;
+    return ConvertConstToWeights(node_def, inputs, tensor_or_weights);
+  }
+  if (!graph_properties.HasOutputProperties(node_def.name())) {
+    return errors::InvalidArgument("Shape and data type are unknown");
   }
 
-  TRTWeightStore* weight_store() { return weight_store_; }
+  // Validate and convert shape and dtype.
+  const auto& output_params =
+      graph_properties.GetOutputProperties(node_def.name());
+  const auto& tensor_properties = output_params.at(output_port);
+  const DataType dtype = tensor_properties.dtype();
+  const PartialTensorShape shape = tensor_properties.shape();
+  nvinfer1::DataType trt_dtype;
+  nvinfer1::Dims trt_dims;
+  int batch_size = -1;
+  TF_RETURN_IF_ERROR(ValidateTensorProperties(
+      node_def.op(), dtype, shape, /*validation_only_=*/true, &trt_dtype,
+      &trt_dims, &batch_size));
+
+  // Adds a fake ITensor. This is fine since op converter operates in
+  // validation-only mode and it won't (and shouldn't) use the tensor to do
+  // any TRT network operations.
+  *tensor_or_weights = TRT_TensorOrWeights(trt_dtype, trt_dims, batch_size);
+  return Status::OK();
+}
 
-  TRT_ShapedWeights get_temp_weights(tensorflow::DataType type,
-                                     nvinfer1::Dims shape) {
-    TRT_ShapedWeights weights(type, nullptr, shape);
-    // TODO(jie): check weights size_bytes. 0 means type error
-    weight_store_->store_.push_back(std::vector<uint8_t>(weights.size_bytes()));
-    weights.SetValues(weight_store_->store_.back().data());
-    return weights;
-  }
+Status TrtNodeValidator::ValidateNode(
+    const tensorflow::NodeDef& node_def,
+    const std::vector<std::pair<const NodeDef*, int>>& input_node_and_ports,
+    const grappler::GraphProperties& graph_properties) {
+  // Convert input NodeDef and corresponding output ports to
+  // TRT_TensorOrWeights.
+  std::vector<TRT_TensorOrWeights> inputs;
+  for (int i = 0; i < input_node_and_ports.size(); ++i) {
+    const auto& pair = input_node_and_ports[i];
+    TRT_TensorOrWeights tensor_or_weights;
+    Status status = ConvertToTensorOrWeights(
+        *pair.first, pair.second, graph_properties, &tensor_or_weights);
+    if (!status.ok()) {
+      return errors::Internal(
+          "Failed to convert input with index ", i,
+          " to a TRT_TensorOrWeights: ", status.error_message());
+    }
+    inputs.push_back(tensor_or_weights);
+  }
+
+  // Validate the node.
+  const auto iter = op_validators_.find(node_def.op());
+  if (iter == op_validators_.end()) {
+    // If validator is not registered, it means no validation is needed.
+    return Status::OK();
+  }
+
+  OpConverter validator = iter->second;
+  OpConverterParams params(
+      /*arg_converter=*/nullptr, node_def, inputs, /*arg_outputs=*/nullptr,
+      /*arg_validation_only=*/true, &weight_store_);
+  return validator(&params);
+}
 
-  // TODO(aaroey): fix all the namings.
-  bool isFP16() { return fp16_; }
+Status TrtNodeValidator::ConvertConstToWeights(
+    const NodeDef& const_node_def,
+    const std::vector<TRT_TensorOrWeights>& inputs,
+    TRT_TensorOrWeights* output) {
+  std::vector<TRT_TensorOrWeights> outputs;
+  OpConverterParams params(
+      /*arg_converter=*/nullptr, const_node_def, inputs, &outputs,
+      /*arg_validation_only=*/true, &weight_store_);
+  Status status = op_validators_["Const"](&params);
+  if (status.ok() && output) *output = outputs[0];
+  return status;
+}
 
-  TRT_ShapedWeights get_temp_weights_like(const TRT_ShapedWeights& weights) {
-    return this->get_temp_weights(weights.type_, weights.shape_);
-  }
+Converter::Converter(nvinfer1::INetworkDefinition* trt_network,
+                     int precision_mode, bool use_calibration)
+    : trt_network_(trt_network),
+      precision_mode_(precision_mode),
+      use_calibration_(use_calibration) {
+  this->RegisterOpConverters();
+}
 
-  tensorflow::Status convert_node(const tensorflow::NodeDef& node_def) {
-    std::vector<TRT_TensorOrWeights> inputs;
-    TF_RETURN_IF_ERROR(this->get_inputs(node_def, &inputs));
-    const string& op = node_def.op();
-    std::vector<TRT_TensorOrWeights> outputs;
-    if (PluginFactoryTensorRT::GetInstance()->IsPlugin(op)) {
-      TF_RETURN_IF_ERROR(plugin_converter_(*this, node_def, inputs, &outputs));
-    } else {
-      if (!op_registry_.count(op)) {
-        return tensorflow::errors::Unimplemented(
-            "No converter registered for op: " + op);
-      }
-      OpConverter op_converter = op_registry_.at(op);
-      TF_RETURN_IF_ERROR(op_converter(*this, node_def, inputs, &outputs));
-    }
-    for (size_t i = 0; i < outputs.size(); ++i) {
-      TRT_TensorOrWeights& output = outputs[i];
-      // TODO(jie): tf protobuf seems to be omitting the :0 suffix
-      string output_name = node_def.name();
-      if (i != 0) output_name = StrCat(output_name, ":", i);
-      // We need to check the name before setting it. For Identity op where the
-      // output is the input, if its input is one of the engine input, setting
-      // the name here will overwrite engine input bindings which will cause
-      // runtime error.
-      if (output.is_tensor()) {
-        const char* tensor_name = output.tensor()->getName();
-        if (tensor_name == nullptr || std::strlen(tensor_name) == 0) {
-          output.tensor()->setName(output_name.c_str());
-        }
-      }
-      VLOG(2) << "Adding out tensor " << output_name << ": "
-              << output.DebugString();
-      if (!trt_tensors_.insert({output_name, output}).second) {
-        return tensorflow::errors::AlreadyExists(
-            "Output tensor already exists for op: " + op);
+Status Converter::ConvertNode(const NodeDef& node_def) {
+  std::vector<TRT_TensorOrWeights> inputs, outputs;
+  TF_RETURN_IF_ERROR(this->GetInputs(node_def, &inputs));
+
+  OpConverterParams params(this, node_def, inputs, &outputs,
+                           /*arg_validation_only=*/false, &weight_store_);
+  const string& op = node_def.op();
+  if (PluginFactoryTensorRT::GetInstance()->IsPlugin(op)) {
+    TF_RETURN_IF_ERROR(plugin_converter_(&params));
+  } else {
+    if (!op_registry_.count(op)) {
+      return errors::Unimplemented("No converter registered for op: " + op);
+    }
+    OpConverter op_converter = op_registry_.at(op);
+    TF_RETURN_IF_ERROR(op_converter(&params));
+  }
+
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    TRT_TensorOrWeights& output = outputs[i];
+    string output_name = node_def.name();
+    if (i != 0) output_name = StrCat(output_name, ":", i);
+    // We need to check the name before setting it. If the input is one of the
+    // engine input, setting the name here will overwrite engine input
+    // bindings which will cause runtime error.
+    if (output.is_tensor()) {
+      const char* tensor_name = output.tensor()->getName();
+      if (!tensorflow::str_util::StartsWith(tensor_name, kInputPHName)) {
+        // TRT initializes tensor names as "(Unnamed ITensor* N)". We rename
+        // them to match their corresponding TensorFlow name.
+        // Note: ITensors that we create internally within TF-TRT which are
+        // not inputs or outputs of a node will not be renamed. This is a
+        // potential cause of confusion if an error message or warning
+        // mentions the unnamed tensor.
+        output.tensor()->setName(output_name.c_str());
       }
     }
-    return tensorflow::Status::OK();
+    VLOG(2) << "Adding out tensor " << output_name << ": "
+            << output.DebugString();
+    Status status = AddTensorOrWeights(output_name, output);
+    if (!status.ok()) {
+      return Status(status.code(),
+                    StrCat("Failed to add output for node ", node_def.name(),
+                           ": ", status.error_message()));
+    }
+  }
+  return Status::OK();
+}
+
+Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype,
+                                 const nvinfer1::Dims& dims, int batch_size) {
+  // We verify the batch size only for the input nodes, and rely on individual
+  // op converter to ensure the batch size of the outputs is not changed.
+  // TODO(laigd): we need to test this properties.
+  Status status = MaybeUpdateBatchSize(batch_size);
+  if (!status.ok()) {
+    return Status(status.code(), StrCat("Batch size doesn't match for tensor ",
+                                        name, ": ", status.error_message()));
+  }
+  nvinfer1::ITensor* tensor = network()->addInput(name.c_str(), dtype, dims);
+  if (tensor == nullptr) {
+    return errors::InvalidArgument("Failed to create Input layer tensor ", name,
+                                   " rank=", dims.nbDims);
+  }
+  status = AddTensorOrWeights(name, TRT_TensorOrWeights(tensor));
+  if (!status.ok()) {
+    return Status(status.code(), StrCat("Failed to add input tensor ", name,
+                                        ": ", status.error_message()));
   }
+  return Status::OK();
+}
 
-  nvinfer1::INetworkDefinition* network() { return trt_network_; }
+Status Converter::RenameAndMarkOutputTensors(
+    const std::vector<std::pair<string, string>>& output_tensors) {
+  for (const auto& output : output_tensors) {
+    TRT_TensorOrWeights tensor_or_weights;
+    TF_RETURN_IF_ERROR(GetTensorOrWeights(output.first, &tensor_or_weights));
+    if (!tensor_or_weights.is_tensor()) {
+      return errors::InvalidArgument("Output ", output.first,
+                                     " is weights not tensor");
+    }
+    nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
+    if (tensor == nullptr) {
+      return errors::NotFound("Output tensor not found: ", output.first);
+    }
+    tensor->setName(output.second.c_str());
+    VLOG(1) << "Marking output tensor " << output.first << ", as output tensor "
+            << output.second;
+    network()->markOutput(*tensor);
+  }
+  return Status::OK();
+}
 
-  TRT_TensorOrWeights get_tensor(const string& name) {
-    if (!trt_tensors_.count(name)) {
-      return TRT_TensorOrWeights(nullptr);
+Status Converter::MaybeUpdateBatchSize(int batch_size) {
+  // OK iff either is unknown or they equal to each other.
+  if (this->batch_size_ < 0 || batch_size < 0 ||
+      this->batch_size_ == batch_size) {
+    if (this->batch_size_ < 0 && batch_size >= 0) {
+      this->batch_size_ = batch_size;
     }
-    return trt_tensors_.at(name);
+    return Status::OK();
   }
+  return errors::InvalidArgument(
+      "Provided batch size does not match converter batch size: ", batch_size,
+      " vs ", batch_size_);
+}
+
+Status Converter::AddTensorOrWeights(const string& name,
+                                     TRT_TensorOrWeights input) {
+  // Set the batch size of the tensor, using batch size collected from the
+  // input tensors to the TRT subgraph at the beginning of the conversion.
+  // We rely on the individual op converter to understand the semantics of the
+  // TF node, and make sure it doesn't change the batch size nor introduce
+  // intra-element dependency inside the batch.
+  if (input.is_tensor()) input.set_batch_size(batch_size_);
+  if (trt_tensors_.insert({name, std::move(input)}).second) return Status::OK();
+  return errors::AlreadyExists("tensor/weights ", name, " already exist.");
+}
 
-  bool insert_input_tensor(const string& name, nvinfer1::ITensor* tensor) {
-    return trt_tensors_.insert({name, TRT_TensorOrWeights(tensor)}).second;
+Status Converter::GetTensorOrWeights(const string& name,
+                                     TRT_TensorOrWeights* output) {
+  if (!trt_tensors_.count(name)) {
+    return errors::NotFound("Tensor or weights with name ", name,
+                            " could not be found.");
   }
+  *output = trt_tensors_.at(name);
+  return Status::OK();
+}
 
-  nvinfer1::ITensor* TransposeTensor(nvinfer1::ITensor* input_tensor,
-                                     const std::vector<int>& order) {
-    const auto dims = input_tensor->getDimensions();
+Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor,
+                                  const std::vector<int>& order_with_batch_dim,
+                                  const nvinfer1::ITensor** output_tensor) {
+  const auto dims = input_tensor->getDimensions();
 
-    // TODO(jie): change the return to status and properly exit
-    if (order.size() - 1 != size_t(dims.nbDims))
-      LOG(ERROR) << "Dimension does not match, fail gracefully";
+  if (order_with_batch_dim.size() - 1 != size_t(dims.nbDims)) {
+    return tensorflow::errors::InvalidArgument(
+        "Rank of perm for transpose does not match with that of the input.");
+  }
+  if (order_with_batch_dim[0] != 0) {
+    return tensorflow::errors::Unimplemented(
+        "Transpose at batch dimension is not supported.");
+  }
 
-    nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor);
-    if (layer == nullptr) {
-      return nullptr;
+  nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose");
+  MarkQuantizationRangesAsInferrable(input_tensor, layer->getOutput(0));
+
+  nvinfer1::Permutation permutation;
+  for (int32_t i = 0; i < dims.nbDims; ++i) {
+    permutation.order[i] = order_with_batch_dim[i + 1] - 1;
+  }
+  VLOG(1) << "TransposeTensor permutation: "
+          << DebugString(permutation, dims.nbDims);
+  layer->setFirstTranspose(permutation);
+
+  nvinfer1::Dims reshape_dims;
+  reshape_dims.nbDims = dims.nbDims;
+  for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
+    reshape_dims.d[i] = 0;
+    // TODO(aaroey): why not transposing the types as well?
+    reshape_dims.type[i] = dims.type[i];
+  }
+  layer->setReshapeDimensions(reshape_dims);
+
+  *output_tensor = layer->getOutput(0);
+  return tensorflow::Status::OK();
+}
+
+Status Converter::GetWeightRange(const TRT_ShapedWeights& weights,
+                                 float* out_min, float* out_max) const {
+  switch (weights.type_) {
+    case DataType::DT_FLOAT: {
+      auto inp = static_cast<float const*>(weights.GetValues());
+      auto result = std::minmax_element(inp, inp + weights.count());
+      *out_min = *result.first;
+      *out_max = *result.second;
+      break;
     }
-    nvinfer1::Permutation permutation;
-    for (int32_t i = 0; i < dims.nbDims; ++i) {
-      permutation.order[i] = order[i + 1] - 1;
+    case DataType::DT_HALF: {
+      auto inp = static_cast<Eigen::half const*>(weights.GetValues());
+      auto result = std::minmax_element(inp, inp + weights.count());
+      *out_min = Eigen::half_impl::half_to_float(*result.first);
+      *out_max = Eigen::half_impl::half_to_float(*result.second);
+      break;
+    }
+    case DataType::DT_INT32: {
+      auto inp = static_cast<int const*>(weights.GetValues());
+      auto result = std::minmax_element(inp, inp + weights.count());
+      *out_min = static_cast<float>(*result.first);
+      *out_max = static_cast<float>(*result.second);
+      break;
     }
-    layer->setFirstTranspose(permutation);
+    default:
+      return errors::Unimplemented(
+          "Data type not supported for GetWeightRange: ",
+          DataTypeString(weights.type_));
+  }
+  return Status::OK();
+}
 
-    nvinfer1::Dims reshape_dims;
-    reshape_dims.nbDims = dims.nbDims;
-    for (int32_t i = 0; i < reshape_dims.nbDims; ++i) {
-      reshape_dims.d[i] = 0;
-      reshape_dims.type[i] = dims.type[i];
+Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
+                                        const nvinfer1::Dims& dims,
+                                        const nvinfer1::ITensor** tensor) {
+  // If -1 is not used for one of the dims, we can check if the shapes are
+  // compatible.
+  bool can_check_shapes = true;
+  for (int i = 0; i < dims.nbDims; i++) {
+    if (dims.d[i] == -1) {
+      can_check_shapes = false;
+      break;
     }
-    layer->setReshapeDimensions(reshape_dims);
-    return layer->getOutput(0);
+  }
+  if (can_check_shapes &&
+      TrtDimsNumElements(input.GetTrtDims()) != TrtDimsNumElements(dims)) {
+    return errors::InvalidArgument("Reshape shapes are not compatible (",
+                                   DebugString(input.GetTrtDims()), " vs ",
+                                   DebugString(dims), ")");
   }
 
- private:
-  std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
-  std::unordered_map<string, OpConverter> op_registry_;
-  OpConverter plugin_converter_;
-  nvinfer1::INetworkDefinition* trt_network_;
-  std::list<std::vector<uint8_t>> temp_bufs_;
-
-  // TODO(aaroey): inline the definition of TRTWeightStore here, and add APIs to
-  // operate the stored weights instead of operating it directly.
-  TRTWeightStore* weight_store_;
-
-  bool fp16_;
-
-  void register_op_converters();
-
-  tensorflow::Status get_inputs(const tensorflow::NodeDef& node_def,
-                                std::vector<TRT_TensorOrWeights>* inputs) {
-    for (auto const& input_name : node_def.input()) {
-      /*************************************************************************
-       * TODO(jie): handle case 1) here.
-       * Normalizes the inputs and extracts associated metadata:
-       * 1) Inputs can contain a colon followed by a suffix of characters.
-       *    That suffix may be a single number (e.g. inputName:1) or several
-       *    word characters separated from a number by a colon
-       *    (e.g. inputName:foo:1). The
-       *    latter case is used to denote inputs and outputs of functions.
-       * 2) Control dependency inputs contain caret at the beginning and we
-       *    remove this and annotate the edge as a control dependency.
-       ************************************************************************/
-      // skip control nodes
-      if (input_name[0] == '^') continue;
-      string name = input_name;
-      auto last = name.find_last_of(':');
-      // TODO(aaroey): use TensorId
-      if (last != string::npos && last + 2 == name.size() &&
-          name[last + 1] == '0') {
-        name.erase(last);
+  if (input.is_tensor()) {
+    if (DimsEqual(input.GetTrtDims(), dims)) {
+      *tensor = input.tensor();
+    } else {
+      nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(
+          *const_cast<nvinfer1::ITensor*>(input.tensor()));
+      TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape");
+      layer->setReshapeDimensions(dims);
+      MarkQuantizationRangesAsInferrable(
+          const_cast<nvinfer1::ITensor*>(input.tensor()), layer->getOutput(0));
+      *tensor = layer->getOutput(0);
+    }
+  } else {
+    nvinfer1::IConstantLayer* layer =
+        this->network()->addConstant(dims, input.weights().GetTrtWeights());
+    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape");
+    *tensor = layer->getOutput(0);
+    if (precision_mode() == INT8MODE && !use_calibration()) {
+      // If we are in int8 mode and not calibrating, we need to explicitly set a
+      // quantization range for the output tensor of the IConstantLayer. Here we
+      // set the range to [min(weights), max(weights)].
+      float min_range = 0.0f;
+      float max_range = 0.0f;
+      TF_RETURN_IF_ERROR(
+          GetWeightRange(input.weights(), &min_range, &max_range));
+      // Avoid setting range to 0 because TRT will throw an error. If the
+      // weights are zero then the range doesn't matter: using 127.0f should
+      // ensure the quantized weight will be exactly zero.
+      if (min_range == 0.0f && max_range == 0.0f) {
+        min_range = -127.0f;
+        max_range = 127.0f;
       }
+      ProvideQuantizationRange(const_cast<nvinfer1::ITensor*>(*tensor),
+                               min_range, max_range);
+    }
+  }
+  return tensorflow::Status::OK();
+}
 
-      if (trt_tensors_.count(name)) {
-        TRT_TensorOrWeights& input = trt_tensors_.at(name);
-        inputs->push_back(input);
-        VLOG(2) << "Retrieved input " << name << ": " << input.DebugString();
+void Converter::MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input,
+                                                   nvinfer1::ITensor* output) {
+  quantization_infer_.push_back({input, output});
+  quantization_infer_.push_back({output, input});
+}
+
+void Converter::ProvideQuantizationRange(nvinfer1::ITensor* tensor,
+                                         float min_range, float max_range) {
+  float symmetric_range = std::max(std::abs(min_range), std::abs(max_range));
+  quantization_ranges_[tensor] = symmetric_range;
+}
+
+void Converter::MaybeApplyQuantizationRanges() {
+  if (precision_mode() != INT8MODE) return;
+
+  // Infer ranges across marked ops.
+  PropagateQuantizationRanges();
+  // Apply ranges.
+#if NV_TENSORRT_MAJOR >= 5
+  for (auto pair : quantization_ranges_) {
+    nvinfer1::ITensor* tensor = pair.first;
+    const float range = pair.second;
+    VLOG(1) << "Setting range for: " << tensor->getName() << ": " << range;
+    // TODO(laigd): if 'tensor' already has a range set which doesn't match
+    // 'range', it should report error.
+    tensor->setDynamicRange(-range, range);
+  }
+#endif
+
+  // Warn user about tensors that are missing ranges. If TRT fuses some layers
+  // then these tensors may not actually be required, which is why this is
+  // just a warning. If we are still missing ranges even after fusion,
+  // Builder::buildCudaEngine() will return nullptr and we will catch the
+  // error at that point.
+  if (!use_calibration()) {
+    // Get all tensors from network
+    std::set<nvinfer1::ITensor*> all_tensors;
+    for (int i = 0; i < this->network()->getNbLayers(); i++) {
+      nvinfer1::ILayer* layer = this->network()->getLayer(i);
+      for (int j = 0; j < layer->getNbInputs(); j++) {
+        all_tensors.insert(layer->getInput(j));
+      }
+      for (int j = 0; j < layer->getNbOutputs(); j++) {
+        all_tensors.insert(layer->getOutput(j));
+      }
+    }
+    // Find tensors with no ranges
+    for (auto tensor : all_tensors) {
+      if (!quantization_ranges_.count(tensor)) {
+        // Note: there may be some warnings for "(Unnamed ITensor* N)". These
+        // are tensors which are created internally by TF-TRT. The ranges for
+        // these unnamed ITensors are always inferred from user provided ranges,
+        // thus there will also be a warning for the range(s) the user missed.
+        LOG(WARNING) << "Quantization range was not found for "
+                     << tensor->getName() << ". "
+                     << "This is okay if TensorRT does not need the range "
+                     << "(e.g. due to node fusion).";
+      }
+    }
+  }
+}
+
+void Converter::PropagateQuantizationRanges() {
+  // Propagate ranges across edges in quantization_infer_ until no new
+  // information is added.
+  // Note: this function modifies quantization_infer_, it might be better to
+  // modify a copy instead if we for some reason need quantization_infer_
+  // later.
+  bool information_added = true;
+  while (information_added) {
+    information_added = false;
+    for (auto it = quantization_infer_.begin();
+         it != quantization_infer_.end();) {
+      auto input_tensor_range = quantization_ranges_.find(it->first);
+      auto output_tensor_range = quantization_ranges_.find(it->second);
+      if (input_tensor_range != quantization_ranges_.end() &&
+          output_tensor_range == quantization_ranges_.end()) {
+        // Input has range but output doesn't: copy range
+        // TODO(laigd): consider reporting error if it a different range is
+        // already set.
+        quantization_ranges_[it->second] = input_tensor_range->second;
+        information_added = true;
+        VLOG(1) << "Copy quantization range: " << it->first->getName() << " -> "
+                << it->second->getName();
+      }
+      // We can remove edges when the output range is known
+      if (quantization_ranges_.find(it->second) != quantization_ranges_.end()) {
+        it = quantization_infer_.erase(it);
       } else {
-        // TODO(aaroey): this should not happen, make it a CHECK.
-        // TODO(aaroey): use StrCat for pattern like this.
-        string msg("Node ");
-        StrAppend(&msg, node_def.name(), " should have an input named '", name,
-                  "' but it is not available");
-        LOG(ERROR) << msg;
-        return tensorflow::errors::InvalidArgument(msg);
+        ++it;
       }
     }
-    return tensorflow::Status::OK();
   }
-};
+}
+
+Status Converter::GetInputs(const tensorflow::NodeDef& node_def,
+                            std::vector<TRT_TensorOrWeights>* inputs) const {
+  for (auto const& input_name : node_def.input()) {
+    /*************************************************************************
+     * TODO(jie): handle case 1) here.
+     * Normalizes the inputs and extracts associated metadata:
+     * 1) Inputs can contain a colon followed by a suffix of characters.
+     *    That suffix may be a single number (e.g. inputName:1) or several
+     *    word characters separated from a number by a colon
+     *    (e.g. inputName:foo:1). The
+     *    latter case is used to denote inputs and outputs of functions.
+     * 2) Control dependency inputs contain caret at the beginning and we
+     *    remove this and annotate the edge as a control dependency.
+     ************************************************************************/
+    // skip control nodes
+    if (input_name[0] == '^') continue;
+    string name = input_name;
+    auto last = name.find_last_of(':');
+    // TODO(aaroey): use TensorId
+    if (last != string::npos && last + 2 == name.size() &&
+        name[last + 1] == '0') {
+      name.erase(last);
+    }
+
+    if (trt_tensors_.count(name)) {
+      TRT_TensorOrWeights input = trt_tensors_.at(name);
+      inputs->push_back(input);
+      VLOG(2) << "Retrieved input " << name << ": " << input.DebugString();
+    } else {
+      // TODO(aaroey): this should not happen, make it a CHECK.
+      // TODO(aaroey): use StrCat for pattern like this.
+      string msg("Node ");
+      StrAppend(&msg, node_def.name(), " should have an input named '", name,
+                "' but it is not available");
+      LOG(ERROR) << msg;
+      return tensorflow::errors::InvalidArgument(msg);
+    }
+  }
+  return tensorflow::Status::OK();
+}
 
-TRT_ShapedWeights ConvertFP32ToFP16(Converter& ctx,
+TRT_ShapedWeights ConvertFP32ToFP16(TrtWeightStore* store,
                                     const TRT_ShapedWeights& weights_src) {
   auto dtype_new = tensorflow::DataType::DT_HALF;
   TRT_ShapedWeights weights =
-      ctx.get_temp_weights(dtype_new, weights_src.shape_);
+      store->GetTempWeights(dtype_new, weights_src.shape_);
   const float* src = static_cast<const float*>(weights_src.GetValues());
   Eigen::half* dst = const_cast<Eigen::half*>(
       static_cast<Eigen::half const*>(weights.GetValues()));
@@ -826,12 +1258,11 @@ TRT_ShapedWeights ConvertFP32ToFP16(Converter& ctx,
 }
 
 // ****************************************************************************
-// Constant folding functions
-// TODO(jie): once optimizer kicks in, we should have done constant folding
-// there.
+// Constant folding functions for weights.
+// TODO(laigd): we should probably use eigen directly.
 // *****************************************************************************
 struct LambdaFactory {
-  enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB, RECIP };
+  enum class OP_CATEGORY : int { RSQRT = 0, NEG, RECIP };
   OP_CATEGORY op;
 
   template <typename T>
@@ -846,84 +1277,10 @@ struct LambdaFactory {
       case OP_CATEGORY::RECIP:
         return [](T t) -> T { return 1.0 / t; };
       default:
-        VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
+        LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
         return nullptr;
     }
   }
-
-  template <typename T>
-  std::function<T(T, T)> binary() {
-    switch (op) {
-      case OP_CATEGORY::ADD:
-        return [](T l, T r) -> T { return l + r; };
-      case OP_CATEGORY::SUB:
-        return [](T l, T r) -> T { return l - r; };
-      case OP_CATEGORY::MUL:
-        return [](T l, T r) -> T { return l * r; };
-      default:
-        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
-    }
-    return [](T l, T r) -> T {
-      LOG(FATAL) << "Unsupported op type ";
-      return l;
-    };
-  }
-
-  template <typename T>
-  std::function<T(T)> broadcast_r(T val) {
-    VLOG(2) << "LAMBDA VAL : " << val;
-    switch (op) {
-      case OP_CATEGORY::ADD:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return l + val;
-        };
-      case OP_CATEGORY::SUB:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return l - val;
-        };
-      case OP_CATEGORY::MUL:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return l * val;
-        };
-      default:
-        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
-    }
-    return [val](T l) -> T {
-      LOG(FATAL) << "Unsupported op type ";
-      return l;
-    };
-  }
-
-  template <typename T>
-  std::function<T(T)> broadcast_l(T val) {
-    VLOG(2) << "LAMBDA VAL : " << val;
-    switch (op) {
-      case OP_CATEGORY::ADD:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return val + l;
-        };
-      case OP_CATEGORY::SUB:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return val - l;
-        };
-      case OP_CATEGORY::MUL:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return val * l;
-        };
-      default:
-        LOG(ERROR) << "Not supported op for binary: " << static_cast<int>(op);
-    }
-    return [val](T l) -> T {
-      LOG(FATAL) << "Unsupported op type ";
-      return l;
-    };
-  }
 };
 
 template <>
@@ -931,15 +1288,18 @@ std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
   switch (op) {
     case OP_CATEGORY::RSQRT: {
       VLOG(2) << "RSQRT GETS DONE";
-      return [](Eigen::half t) -> Eigen::half {
+      return [](Eigen::half t) {
         return Eigen::half(1.0 / sqrt(static_cast<float>(t)));
       };
     }
     case OP_CATEGORY::NEG:
-      return [](Eigen::half t) -> Eigen::half { return -t; };
-    // TODO(aaroey): can we support RECIP?
+      return [](Eigen::half t) { return -t; };
+    case OP_CATEGORY::RECIP:
+      return [](Eigen::half t) {
+        return Eigen::half(1.0 / static_cast<float>(t));
+      };
     default:
-      VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
+      LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
       return nullptr;
   }
 }
@@ -971,120 +1331,48 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status BinaryCompute(const TRT_ShapedWeights& iweights_l,
-                                 const TRT_ShapedWeights& iweights_r,
-                                 TRT_ShapedWeights* oweights,
-                                 LambdaFactory binary_op) {
-  // Assume iweights_l.type == iweight_r.type
-  CHECK_EQ(iweights_l.type_, oweights->type_);
-  CHECK_EQ(iweights_r.type_, oweights->type_);
-  VLOG(2) << "SANITY CHECK!";
-
-  switch (iweights_l.type_) {
-    case tensorflow::DataType::DT_FLOAT: {
-      auto inp_l = static_cast<const float*>(iweights_l.GetValues());
-      auto inp_r = static_cast<const float*>(iweights_r.GetValues());
-      auto oup = static_cast<float*>(const_cast<void*>(oweights->GetValues()));
-
-      if (iweights_l.count() != iweights_r.count()) {
-        // We only supports broadcast of RankZero
-        if (iweights_l.count() == 1) {
-          // TODO(aaroey): Remove loggings like this.
-          VLOG(2) << "I bet it is not working!" << (*inp_l);
-          std::transform(inp_r, inp_r + iweights_r.count(), oup,
-                         binary_op.broadcast_l<float>(*inp_l));
-        } else if (iweights_r.count() == 1) {
-          VLOG(2) << "I bet it is not working!" << (*inp_r);
-          std::transform(inp_l, inp_l + iweights_l.count(), oup,
-                         binary_op.broadcast_r<float>(*inp_r));
-        } else {
-          return tensorflow::errors::Unimplemented(
-              "Binary op with non-rankZero broadcast not supported");
-        }
-      } else {
-        std::transform(inp_l, inp_l + iweights_l.count(), inp_r, oup,
-                       binary_op.binary<float>());
-      }
-      break;
-    }
-    case tensorflow::DataType::DT_HALF: {
-      auto inp_l = static_cast<const Eigen::half*>(iweights_l.GetValues());
-      auto inp_r = static_cast<const Eigen::half*>(iweights_r.GetValues());
-      auto oup =
-          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues()));
-
-      if (iweights_l.count() != iweights_r.count()) {
-        // We only supports broadcast of RankZero
-        if (iweights_l.count() == 1) {
-          VLOG(2) << "I bet it is not working!" << (*inp_l);
-          std::transform(inp_r, inp_r + iweights_r.count(), oup,
-                         binary_op.broadcast_l<Eigen::half>(*inp_l));
-        } else if (iweights_r.count() == 1) {
-          VLOG(2) << "I bet it is not working!" << (*inp_r);
-          std::transform(inp_l, inp_l + iweights_l.count(), oup,
-                         binary_op.broadcast_r<Eigen::half>(*inp_r));
-        } else {
-          return tensorflow::errors::Unimplemented(
-              "Binary op with non-rankZero broadcast not supported");
-        }
-      } else {
-        std::transform(inp_l, inp_l + iweights_l.count(), inp_r, oup,
-                       binary_op.binary<Eigen::half>());
-      }
-      break;
-    }
-    default:
-      return tensorflow::errors::Unimplemented(
-          "Data type not supported: " +
-          tensorflow::DataTypeString(iweights_l.type_));
-  }
-
-  return tensorflow::Status::OK();
-}
-
+// If swapped_inputs is false, 'tensor' is the left operand and 'weights' is the
+// right operand. If swapped_inputs is true, those two are swapped.
+//
 // TODO(jie): broadcast is needed yet not implemented.
-// Only implemented channel wise for the time being
-tensorflow::Status BinaryTensorOpWeight(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const nvinfer1::ITensor* tensor, TRT_ShapedWeights weights,
-    bool swapped_inputs, std::vector<TRT_TensorOrWeights>* outputs) {
-  // tensor is the left operand while weights is the right operand;
-  // when swapped_inputs set to true, those two are swapped.
-  // TODO(aaroey): use a set.
-  if (node_def.op() != "Sub" && node_def.op() != "Add" &&
-      node_def.op() != "Mul" && node_def.op() != "Div" &&
-      node_def.op() != "RealDiv") {
-    return tensorflow::errors::Unimplemented(
-        "op not supported: " + node_def.op() + ", at: " + node_def.name());
-  }
-
-  // Check type consistency
-  nvinfer1::DataType ttype;
-  TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &ttype));
+// Only implemented channel wise for the time being.
+Status BinaryTensorOpWeight(OpConverterParams* params,
+                            const nvinfer1::ITensor* tensor,
+                            TRT_ShapedWeights weights, bool swapped_inputs) {
+  static const std::unordered_set<string> supported_ops = {"Sub", "Add", "Mul",
+                                                           "Div", "RealDiv"};
+  const auto& node_def = params->node_def;
+  if (!supported_ops.count(node_def.op())) {
+    return errors::Unimplemented(node_def.op(), " is not supported, at ",
+                                 node_def.name());
+  }
+
+  // Check type consistency.
+  nvinfer1::DataType trt_dtype;
+  TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &trt_dtype));
 
-  // Check scale mode
+  // Check scale mode.
   auto dims_w = weights.shape_;
-  auto dims_t = tensor->getDimensions();
+  const auto dims_t = tensor->getDimensions();
 
   // TODO(jie): addScale checks for input tensor dimension
   if (dims_t.nbDims != 3) {
-    return tensorflow::errors::InvalidArgument(
-        "addScale requires tensor with rank 3, " + node_def.name());
+    return errors::InvalidArgument("addScale requires tensor with rank 3, at ",
+                                   node_def.name());
   }
 
-  // default to element-wise
+  // Default to element-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
   // TODO(jie): maybe use a permutation instead to support more cases;
-  bool permutation_flag = false;
+  bool need_to_permute = false;
 
   if (weights.count() == 1) {
-    VLOG(2) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
-    // no broadcasting on Batch dimension;
-    VLOG(2) << "WEIGHTS DIM: " << dims_w.nbDims
-            << " tensor DIM: " << dims_t.nbDims;
+    VLOG(2) << "weights dims: " << DebugString(dims_w)
+            << "; tensor dims: " << DebugString(dims_t);
+    // Make sure no broadcasting on batch dimension.
     if (dims_w.nbDims == dims_t.nbDims + 1) {
       if (dims_w.d[0] == 1) {
         for (int i = 1; i < dims_w.nbDims; i++) {
@@ -1092,83 +1380,85 @@ tensorflow::Status BinaryTensorOpWeight(
         }
         dims_w.nbDims--;
       } else {
-        return tensorflow::errors::InvalidArgument(
-            "Binary op cannot operate on batch, " + node_def.name());
+        return errors::InvalidArgument("Binary op cannot operate on batch, at ",
+                                       node_def.name());
       }
     }
 
     if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) {
       scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      // default is element;
+      // Default is element-wise
       for (int i = 1; i < dims_w.nbDims; i++) {
         if (dims_w.d[i] != dims_t.d[i]) {
-          // if dimension does not match, switch back to channel;
-          VLOG(2) << "channel";
+          // If dimension does not match, switch back to per-channel
           scale_mode = nvinfer1::ScaleMode::kCHANNEL;
           break;
         }
       }
-      // if channel as candidate, validate it
+      // If the mode is per-channel, since channel dimension is assumed to be
+      // the third to last dimension, we need to make sure all other dimensions
+      // have size 1.
       if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
         for (int i = 1; i < dims_w.nbDims; i++) {
           if (dims_w.d[i] != 1)
-            return tensorflow::errors::InvalidArgument(
-                "Weight shape not compatible at, " + node_def.name());
+            return errors::InvalidArgument(
+                "Weight dims not compatible for channel-wise broadcast at ",
+                node_def.name());
         }
-      } else {
-        VLOG(2) << "elementwise";
       }
     } else if (dims_w.nbDims == 1 &&
                dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) {
-      // channel wise and broadcast required;
-      permutation_flag = true;
+      // Channel wise and broadcast required. We compare the last dimension of
+      // the tensor shape because of tensorflow default broadcasting rules.
+      need_to_permute = true;
       scale_mode = nvinfer1::ScaleMode::kCHANNEL;
     } else {
-      return tensorflow::errors::InvalidArgument(
-          "Weight shape not compatible at, " + node_def.name());
+      return errors::InvalidArgument("Weight dims not compatible at ",
+                                     node_def.name());
     }
   }
+  // TODO(laigd): we should add validation_only support in TransposeTensor() and
+  // PrepareTensorForShape().
+  if (params->validation_only) return Status::OK();
 
-  // transpose last dimension
+  // Transpose last dimension.
   std::vector<int> permutation(dims_t.nbDims + 1);
-  if (permutation_flag) {
-    if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) {
-      // we swap the last dimension into channel for trt.
-      // because of tensorflow default broadcasting rules.
-      for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
-        permutation[i] = i;
-      }
-      permutation[1] = dims_t.nbDims;
-      permutation[dims_t.nbDims] = 1;
-      tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
-                                   permutation);
-      TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name());
-    } else {
-      return tensorflow::errors::InvalidArgument(
-          "Transpose cannot be applied, " + node_def.name());
+  if (need_to_permute) {
+    // We swap the last dimension into channel for trt, because of tensorflow
+    // default broadcasting rules.
+    for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
+      permutation[i] = i;
     }
+    permutation[1] = dims_t.nbDims;
+    permutation[dims_t.nbDims] = 1;
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(tensor), permutation, &tensor));
   }
 
-  if (ctx.isFP16()) {
-    weights = ConvertFP32ToFP16(ctx, weights);
+  if (params->converter->precision_mode() == FP16MODE) {
+    weights = ConvertFP32ToFP16(params->weight_store, weights);
   }
 
-  // prepare weights
+  // Prepare weights
   TRT_ShapedWeights shift_weights(weights.type_);
   TRT_ShapedWeights scale_weights(weights.type_);
   TRT_ShapedWeights power_weights(weights.type_);
 
-  // Maybe I should do a switch
   if (node_def.op() == "Sub") {
     if (swapped_inputs) {
       shift_weights = weights;
-      nvinfer1::IUnaryLayer* layer =
-          ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor),
-                                  nvinfer1::UnaryOperation::kNEG);
+      nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(
+          *const_cast<nvinfer1::ITensor*>(tensor),
+          nvinfer1::UnaryOperation::kNEG);
       TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+      // Since quantization ranges are symmetric, the same range as the input
+      // will work for the negation of the input.
+      params->converter->MarkQuantizationRangesAsInferrable(
+          const_cast<nvinfer1::ITensor*>(tensor), layer->getOutput(0));
       tensor = layer->getOutput(0);
     } else {
-      TRT_ShapedWeights neg_weights = ctx.get_temp_weights_like(weights);
+      TRT_ShapedWeights neg_weights =
+          params->weight_store->GetTempWeights(weights);
       LambdaFactory unary_op;
       unary_op.op = LambdaFactory::OP_CATEGORY::NEG;
       TF_RETURN_IF_ERROR(UnaryCompute(weights, &neg_weights, unary_op));
@@ -1176,14 +1466,34 @@ tensorflow::Status BinaryTensorOpWeight(
     }
   } else if (node_def.op() == "Div" || node_def.op() == "RealDiv") {
     if (swapped_inputs) {
+      // We need to infer the quantization range for this intermediate tensor.
+      //
+      //   x -> [Recip] -> 1/x -> [Scale] -> s/x
+      //                    ^
+      //            need range for this
+      //
+      // We have the quantization scales for x and s/x - can we divide the scale
+      // for s/x by s? Only if it is a scalar.
+      //
+      // Because of this issue, fall back to BinaryTensorOpTensor if we are
+      // doing INT8 with no calibration. There is most likely no performance
+      // penalty by falling back here.
+      if (params->converter->precision_mode() == INT8MODE &&
+          !params->converter->use_calibration()) {
+        return errors::Unimplemented(
+            "Intermediate quantization range cannot be determined without"
+            " calibration. Falling back to BinaryTensorOpTensor for ",
+            node_def.op(), ", at ", node_def.name());
+      }
       scale_weights = weights;
-      nvinfer1::IUnaryLayer* layer =
-          ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor),
-                                  nvinfer1::UnaryOperation::kRECIP);
+      nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(
+          *const_cast<nvinfer1::ITensor*>(tensor),
+          nvinfer1::UnaryOperation::kRECIP);
       TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
       tensor = layer->getOutput(0);
     } else {
-      TRT_ShapedWeights recip_weights = ctx.get_temp_weights_like(weights);
+      TRT_ShapedWeights recip_weights =
+          params->weight_store->GetTempWeights(weights);
       LambdaFactory unary_op;
       unary_op.op = LambdaFactory::OP_CATEGORY::RECIP;
       TF_RETURN_IF_ERROR(UnaryCompute(weights, &recip_weights, unary_op));
@@ -1194,44 +1504,44 @@ tensorflow::Status BinaryTensorOpWeight(
   } else if (node_def.op() == "Add") {
     shift_weights = weights;
   } else {
-    return tensorflow::errors::Unimplemented("Binary op not supported: " +
-                                             node_def.op());
+    // This should not happen.
+    return errors::Unimplemented("Binary op not supported at ", node_def.op());
   }
 
-  nvinfer1::IScaleLayer* layer = ctx.network()->addScale(
-      *const_cast<nvinfer1::ITensor*>(tensor), scale_mode, shift_weights,
-      scale_weights, power_weights);
+  nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
+      *const_cast<nvinfer1::ITensor*>(tensor), scale_mode,
+      shift_weights.GetTrtWeights(), scale_weights.GetTrtWeights(),
+      power_weights.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  // transpose back dimension
-  if (permutation_flag) {
-    output_tensor = ctx.TransposeTensor(output_tensor, permutation);
-    TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
+  const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  // Transpose back dimension
+  if (need_to_permute) {
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(output_tensor), permutation,
+        &output_tensor));
   }
 
   // Pass the output
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
   return tensorflow::Status::OK();
 }
 
 enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
 
-tensorflow::Status ConvertConv2DHelper(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs, int group) {
+tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-
   TFAttrs attrs(node_def);
 
   int h_index = 2;
   int w_index = 3;
   auto data_format = attrs.get<string>("data_format");
   if (data_format == "NHWC") {
-    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
-                                 {0, 3, 1, 2});
-    TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name());
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}, &tensor));
     h_index = 1;
     w_index = 2;
     // TODO(jie): transpose it
@@ -1250,11 +1560,13 @@ tensorflow::Status ConvertConv2DHelper(
     return tensorflow::errors::Internal(
         "Conv2D expects kernel of dimension 4, at: " + node_def.name());
   }
-  if (ctx.isFP16()) {
-    weights_rsck = ConvertFP32ToFP16(ctx, inputs.at(1).weights());
+  if (params->converter->precision_mode() == FP16MODE) {
+    weights_rsck =
+        ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights());
   }
 
-  TRT_ShapedWeights weights = ctx.get_temp_weights_like(weights_rsck);
+  TRT_ShapedWeights weights =
+      params->weight_store->GetTempWeights(weights_rsck);
   ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
   TRT_ShapedWeights biases(weights.type_);
   const int noutput = weights.shape_.d[0] * num_groups;
@@ -1290,88 +1602,58 @@ tensorflow::Status ConvertConv2DHelper(
     VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
             << padding[1].first << padding[1].second;
     VLOG(2) << "TENSOR before: " << DebugString(tensor->getDimensions());
-    auto pad_layer = ctx.network()->addPadding(
+    auto pad_layer = params->converter->network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
         nvinfer1::DimsHW(padding[0].second, padding[1].second));
     TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name());
+    params->converter->MarkQuantizationRangesAsInferrable(
+        const_cast<nvinfer1::ITensor*>(tensor), pad_layer->getOutput(0));
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
     VLOG(2) << "TENSOR after: " << DebugString(tensor->getDimensions());
   }
 
   nvinfer1::IConvolutionLayer* layer =
-      ctx.network()->addConvolution(*const_cast<nvinfer1::ITensor*>(tensor),
-                                    noutput, kernel_size, weights, biases);
+      params->converter->network()->addConvolution(
+          *const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size,
+          weights.GetTrtWeights(), biases.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   layer->setStride(stride);
   layer->setPadding({padding[0].first, padding[1].first});
   layer->setName(node_def.name().c_str());
   layer->setNbGroups(num_groups);
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
   VLOG(2) << "TENSOR out: " << DebugString(output_tensor->getDimensions());
   VLOG(2) << "data_format: " << data_format;
   if (data_format == "NHWC") {
     // TODO(jie): transpose it back!
-    output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
-    TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(output_tensor), {0, 2, 3, 1},
+        &output_tensor));
   }
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertConv2DHelper(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs, ConvolutionType type) {
+tensorflow::Status ConvertConv2DHelper(OpConverterParams* params,
+                                       ConvolutionType type) {
   switch (type) {
     case ConvolutionType::DEFAULT:
-      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 1);
+      return ConvertConv2DHelper(params, 1);
     case ConvolutionType::DEPTHWISE_CONV:
-      return ConvertConv2DHelper(ctx, node_def, inputs, outputs, 0);
+      return ConvertConv2DHelper(params, 0);
   }
   return tensorflow::errors::Unimplemented("unsupported convolution type at, " +
-                                           node_def.name());
+                                           params->node_def.name());
 }
 
-// Helper function converts input into tensor with shape specified by dims.
-bool PrepareTensorForShape(Converter& ctx, const TRT_TensorOrWeights& input,
-                           const nvinfer1::Dims& dims,
-                           const nvinfer1::ITensor** tensor) {
-  if (input.is_tensor()) {
-    if (DimsEqual(input.shape(), dims)) {
-      *tensor = input.tensor();
-    } else {
-      nvinfer1::IShuffleLayer* layer = ctx.network()->addShuffle(
-          *const_cast<nvinfer1::ITensor*>(input.tensor()));
-      if (layer != nullptr) {
-        layer->setReshapeDimensions(dims);
-        *tensor = layer->getOutput(0);
-      } else {
-        return false;
-      }
-    }
-  } else {
-#if NV_TENSORRT_MAJOR > 3
-    nvinfer1::IConstantLayer* layer =
-        ctx.network()->addConstant(dims, input.weights());
-    if (layer != nullptr) {
-      *tensor = layer->getOutput(0);
-    } else {
-      return false;
-    }
-#else
-    return false;
-#endif
-  }
-  return true;
-}
-
-tensorflow::Status BinaryTensorOpTensor(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r,
-    std::vector<TRT_TensorOrWeights>* outputs) {
+Status BinaryTensorOpTensor(OpConverterParams* params,
+                            const TRT_TensorOrWeights& operand_l,
+                            const TRT_TensorOrWeights& operand_r) {
+  const auto& node_def = params->node_def;
   static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
       {"Add", nvinfer1::ElementWiseOperation::kSUM},
       {"Mul", nvinfer1::ElementWiseOperation::kPROD},
@@ -1381,59 +1663,62 @@ tensorflow::Status BinaryTensorOpTensor(
       {"Minimum", nvinfer1::ElementWiseOperation::kMIN},
       {"Maximum", nvinfer1::ElementWiseOperation::kMAX},
   };
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end()) {
+    return errors::Unimplemented("Binary op ", node_def.op(),
+                                 " not supported at: ", node_def.name());
+  }
 
-  const nvinfer1::ITensor* tensor_l;
-  const nvinfer1::ITensor* tensor_r;
-
-  nvinfer1::Dims dim_l;
-  nvinfer1::Dims dim_r;
-
-  if (!TensorRTGetBroadcastShape(operand_l.shape(), operand_l.is_tensor(),
-                                 operand_r.shape(), operand_r.is_tensor(),
-                                 &dim_l, &dim_r)) {
-    return tensorflow::errors::InvalidArgument(
-        "Binary op broadcast scheme not supported by TensorRT op: " +
-        node_def.op() + ", at: " + node_def.name());
+  nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r;
+  Status status = params->converter->GetTrtBroadcastShape(
+      operand_l, operand_r, &broadcasted_dims_l, &broadcasted_dims_r);
+  if (!status.ok()) {
+    return errors::InvalidArgument(
+        "Unsupported binary op broadcast scheme for op ", node_def.name(), ": ",
+        status.error_message());
   }
+  if (params->validation_only) return Status::OK();
 
-  TFTRT_RETURN_ERROR_IF_FALSE(
-      PrepareTensorForShape(ctx, operand_l, dim_l, &tensor_l), node_def.name());
-  TFTRT_RETURN_ERROR_IF_FALSE(
-      PrepareTensorForShape(ctx, operand_r, dim_r, &tensor_r), node_def.name());
+  const nvinfer1::ITensor* tensor_l = nullptr;
+  const nvinfer1::ITensor* tensor_r = nullptr;
+  status = params->converter->PrepareTensorForShape(
+      operand_l, broadcasted_dims_l, &tensor_l);
+  if (status.ok()) {
+    status = params->converter->PrepareTensorForShape(
+        operand_r, broadcasted_dims_r, &tensor_r);
+  }
+  if (!status.ok()) {
+    return errors::Internal("Failed to convert binary op ", node_def.name(),
+                            ": ", status.error_message());
+  }
 
-  // get trt type & shape
+  // Check type consistency.
   TFAttrs attrs(node_def);
-  // maybe this part has to be moved into the block of rsqrt later
   nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
-
-  // check type consistency
-  TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype);
-  TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype);
-  auto op_pair = ops.find(node_def.op());
-  if (op_pair == ops.end()) {
-    return tensorflow::errors::Unimplemented(
-        "binary op: ", node_def.op(), " not supported at: ", node_def.name());
-  }
-
-  nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
-      // TODO(aaroey): will tensor_l/tensor_r get modified?
-      *const_cast<nvinfer1::ITensor*>(tensor_l),
-      *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
+  TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype)
+      << DebugString(tensor_l->getType()) << " vs " << DebugString(dtype);
+  TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype)
+      << DebugString(tensor_r->getType()) << " vs " << DebugString(dtype);
+
+  // Add ElementWise layer.
+  nvinfer1::IElementWiseLayer* layer =
+      params->converter->network()->addElementWise(
+          *const_cast<nvinfer1::ITensor*>(tensor_l),
+          *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
-  // pass the output
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  // Pass the output
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertPlugin(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertPlugin(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   // prepare input
   std::vector<nvinfer1::ITensor*> all_inputs;
+  all_inputs.reserve(inputs.size());
   for (auto input : inputs) {
     all_inputs.emplace_back(const_cast<nvinfer1::ITensor*>(input.tensor()));
   }
@@ -1457,36 +1742,291 @@ tensorflow::Status ConvertPlugin(Converter& ctx,
     }
   }
 
-  nvinfer1::IPluginLayer* layer = ctx.network()->addPlugin(
+  nvinfer1::IPluginLayer* layer = params->converter->network()->addPlugin(
       &all_inputs[0], static_cast<int>(inputs.size()), *plugin);
 
   for (int i = 0; i < layer->getNbOutputs(); i++) {
     nvinfer1::ITensor* output_tensor = layer->getOutput(i);
-    outputs->push_back(TRT_TensorOrWeights(output_tensor));
+    params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  }
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertTranspose(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights()) {
+    return tensorflow::errors::InvalidArgument(
+        "Input expects tensor and weights, at ", params->node_def.name());
+  }
+
+  // Get the permutation from weights.
+  TRT_ShapedWeights weights = inputs.at(1).weights();
+  const int* weights_ptr =
+      static_cast<int*>(const_cast<void*>(weights.GetValues()));
+  std::vector<int> perm(weights_ptr, weights_ptr + weights.count());
+
+  // Verify the permutation.
+  nvinfer1::ITensor* input_tensor =
+      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
+  if (perm.size() - 1 != size_t(input_tensor->getDimensions().nbDims)) {
+    return errors::InvalidArgument(
+        "Rank of perm for transpose does not match with that of the input.");
+  }
+  if (perm[0] != 0) {
+    return errors::Unimplemented(
+        "Transpose at batch dimension is not supported.");
+  }
+
+  if (params->validation_only) return Status::OK();
+
+  // Start conversion.
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_RETURN_IF_ERROR(
+      params->converter->TransposeTensor(input_tensor, perm, &output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertReshape(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 2 || !inputs.at(1).is_weights()) {
+    return tensorflow::errors::InvalidArgument(
+        "Input expects weights for shape, at ", node_def.name());
   }
+
+  TRT_TensorOrWeights input_tensor = inputs.at(0);
+  TRT_ShapedWeights weights = inputs.at(1).weights();
+  if (weights.count() == 0) {
+    return tensorflow::errors::Unimplemented(
+        "Reshape to shape=[] is not supported, at ", node_def.name());
+  }
+
+  const int* weights_ptr =
+      static_cast<int*>(const_cast<void*>(weights.GetValues()));
+
+  // Check that it doesn't change the batch dimension. This check is
+  // conservative, for example, when the first dim of the shape is -1 and input
+  // tensor shape is not fixed, it is still possible that the reshape doesn't
+  // change the batch dim, but as long as there is a possibility that it could
+  // change the batch dim, it reject the conversion. The parameters are:
+  //
+  // * reshape_batch_dim: the value of the first dim of the input shape constant
+  // * reshape_dims: all other dims of the input shape constant
+  // * input_batch_dim: the value of the first dim of the input tensor to
+  //   reshape
+  // * input_dims: all other dims of the input tensor to reshape
+  //
+  // The validation logic is:
+  //
+  // if input_batch_dim is fixed:
+  //   if reshape_batch_dim == input_batch_dim:
+  //     ok
+  //   elif reshape_batch_dim == -1 (meaning reshape_dims are fixed) and
+  //        input_dims are fixed and
+  //        prod(input_dims) == prod(reshape_dims)
+  //     ok
+  //   else:
+  //     not ok
+  // elif input_dims are fixed:
+  //   if reshape_dims are fixed and
+  //      prod(input_dims) == prod(reshape_dims):
+  //     ok
+  //   else:
+  //     not ok
+  // else:
+  //   not ok
+
+  const int input_batch_dim = input_tensor.batch_size();
+  const int reshape_batch_dim = weights_ptr[0];
+  const nvinfer1::Dims input_dims = input_tensor.GetTrtDims();
+
+  nvinfer1::Dims reshape_dims;
+  reshape_dims.nbDims = weights.count() - 1;
+  for (int i = 1; i < weights.count(); i++) {
+    reshape_dims.d[i - 1] = weights_ptr[i];
+  }
+
+  // Check that it doesn't change the batch dimension according to the logic
+  // mentioned above.
+  bool reshape_may_change_batch_dim = false;
+  if (input_batch_dim > 0) {        // Batch size is fixed.
+    if (reshape_batch_dim == -1) {  // Other dims of the shape must be fixed.
+      if (!HasStaticShape(input_dims) ||
+          TrtDimsNumElements(reshape_dims) != TrtDimsNumElements(input_dims)) {
+        reshape_may_change_batch_dim = true;
+      }
+    } else if (reshape_batch_dim != input_batch_dim) {
+      reshape_may_change_batch_dim = true;
+    }
+  } else if (HasStaticShape(input_dims)) {
+    if (!HasStaticShape(reshape_dims) ||
+        TrtDimsNumElements(reshape_dims) != TrtDimsNumElements(input_dims)) {
+      reshape_may_change_batch_dim = true;
+    }
+  } else {
+    reshape_may_change_batch_dim = true;
+  }
+  VLOG(1) << "input_batch_dim=" << input_batch_dim
+          << ", input_dims=" << DebugString(input_dims)
+          << "\nreshape_batch_dim=" << reshape_batch_dim
+          << ", reshape_dims=" << DebugString(reshape_dims);
+  if (reshape_may_change_batch_dim) {
+    const string msg = StrCat(
+        "Reshape on batch dimension is not supported, at ", node_def.name());
+    return errors::Unimplemented(msg);
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Start conversion.
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      input_tensor, reshape_dims, &output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertExpandDims(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 2) {
+    return tensorflow::errors::InvalidArgument(
+        "Two inputs expected for ExpandDims, at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        "ExpandDims expects tensor for input, at ", node_def.name());
+  }
+  if (!inputs.at(1).is_weights()) {
+    return tensorflow::errors::InvalidArgument(
+        "ExpandDims expects weights for axis, at ", node_def.name());
+  }
+  // Get input shape as vector.
+  TRT_TensorOrWeights input_tensor = inputs.at(0);
+  const nvinfer1::Dims dims = input_tensor.GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dim back.
+  input_dims.insert(input_dims.begin(), -1);
+  const int input_rank = input_dims.size();
+  // Get axis to expand on.
+  TRT_ShapedWeights weights = inputs.at(1).weights();
+  if (weights.count() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "ExpandDims axis must be a scalar, at ", node_def.name());
+  }
+  const int* weights_ptr =
+      static_cast<int*>(const_cast<void*>(weights.GetValues()));
+  int axis = weights_ptr[0];
+  // Make sure axis is valid.
+  if ((axis < (-input_rank - 1)) || (axis > input_rank)) {
+    return tensorflow::errors::InvalidArgument(
+        "Axis for ExpandDims is invalid, must be in the range "
+        "[-rank(input) - 1, rank(input)], at ",
+        node_def.name());
+  }
+  // Convert negative axis to corresponding positive axis.
+  if (axis < 0) axis += input_rank + 1;
+  if (axis == 0) {
+    return tensorflow::errors::Unimplemented(
+        "Modifying batch dimension is not supported for ExpandDims, at ",
+        node_def.name());
+  }
+  if (params->validation_only) return Status::OK();
+
+  // ExpandDims: Insert new dim of size 1.
+  input_dims.insert(input_dims.begin() + axis, 1);
+  // Reshape tensor.
+  nvinfer1::Dims new_dims;
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims,
+                                               /*ignore_first_dim=*/true));
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      input_tensor, new_dims, &output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "One input expected for Squeeze, at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        "Squeeze expects tensor for input, at ", node_def.name());
+  }
+  // Get input shape.
+  TRT_TensorOrWeights input_tensor = inputs.at(0);
+  const nvinfer1::Dims dims = input_tensor.GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dim back.
+  input_dims.insert(input_dims.begin(), -1);
+  const int input_rank = input_dims.size();
+  // Mark axes to remove by setting them to 0.
+  TFAttrs attrs(node_def);
+  auto squeeze_dims = attrs.get<std::vector<int>>("squeeze_dims");
+  if (squeeze_dims.size() == 0) {
+    return tensorflow::errors::Unimplemented(
+        "Squeeze is only implemented for explicit dims, at ", node_def.name());
+  }
+  for (int axis : squeeze_dims) {
+    // Make sure axis is valid.
+    if ((axis < -input_rank) || (axis >= input_rank)) {
+      return tensorflow::errors::InvalidArgument(
+          "Axis for Squeeze is invalid, must be in the range "
+          "[-rank(input), rank(input)), at ",
+          node_def.name());
+    }
+    // Convert negative axis to corresponding positive axis.
+    if (axis < 0) axis += input_rank;
+    // Don't squeeze batch dim.
+    if (axis == 0) {
+      return tensorflow::errors::Unimplemented(
+          "Cannot squeeze batch dimension, at ", node_def.name());
+    }
+    // Make sure target dimension is size 1.
+    if (input_dims[axis] != 1) {
+      return tensorflow::errors::InvalidArgument(
+          "Cannot squeeze a dimension which isn't size 1, at ",
+          node_def.name());
+    }
+    // Mark dim for removal by setting to 0.
+    input_dims[axis] = 0;
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Remove all dims which are equal to 0.
+  input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0),
+                   input_dims.end());
+  // Reshape tensor.
+  nvinfer1::Dims new_dims;
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims,
+                                               /*ignore_first_dim=*/true));
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      input_tensor, new_dims, &output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertConv2D(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
-                             ConvolutionType::DEFAULT);
+tensorflow::Status ConvertConv2D(OpConverterParams* params) {
+  return ConvertConv2DHelper(params, ConvolutionType::DEFAULT);
 }
 
-tensorflow::Status ConvertConv2DDepthwise(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  return ConvertConv2DHelper(ctx, node_def, inputs, outputs,
-                             ConvolutionType::DEPTHWISE_CONV);
+tensorflow::Status ConvertConv2DDepthwise(OpConverterParams* params) {
+  return ConvertConv2DHelper(params, ConvolutionType::DEPTHWISE_CONV);
 }
 
-tensorflow::Status ConvertPool(Converter& ctx,
-                               const tensorflow::NodeDef& node_def,
-                               const std::vector<TRT_TensorOrWeights>& inputs,
-                               std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertPool(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   TFAttrs attrs(node_def);
 
@@ -1496,9 +2036,8 @@ tensorflow::Status ConvertPool(Converter& ctx,
   if (data_format == "NHWC") {
     h_index = 1;
     w_index = 2;
-    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
-                                 {0, 3, 1, 2});
-    TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name());
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}, &tensor));
   }
 
   nvinfer1::PoolingType type;
@@ -1538,330 +2077,533 @@ tensorflow::Status ConvertPool(Converter& ctx,
       padding[1].first != padding[1].second) {
     VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
             << padding[1].first << padding[1].second;
-    auto pad_layer = ctx.network()->addPadding(
+    auto pad_layer = params->converter->network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
         nvinfer1::DimsHW(padding[0].second, padding[1].second));
     TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name());
+    params->converter->MarkQuantizationRangesAsInferrable(
+        const_cast<nvinfer1::ITensor*>(tensor), pad_layer->getOutput(0));
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
   }
 
-  nvinfer1::IPoolingLayer* layer = ctx.network()->addPooling(
+  nvinfer1::IPoolingLayer* layer = params->converter->network()->addPooling(
       *const_cast<nvinfer1::ITensor*>(tensor), type, ksize);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  // TODO(tmorris): Average pooling may not be entirely safe to infer
+  // quantization range through (at least forwards - backwards should be fine).
+  // Max pooling is okay.
+  params->converter->MarkQuantizationRangesAsInferrable(
+      const_cast<nvinfer1::ITensor*>(tensor), layer->getOutput(0));
 
   layer->setStride(stride);
   layer->setPadding({padding[0].first, padding[1].first});
   layer->setName(node_def.name().c_str());
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   if (data_format == "NHWC") {
-    output_tensor = ctx.TransposeTensor(output_tensor, {0, 2, 3, 1});
-    TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(output_tensor), {0, 2, 3, 1},
+        &output_tensor));
   }
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertActivation(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertActivation(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        node_def.op(), " expects one input, at ", node_def.name());
+  }
+  if (!inputs.at(0).is_tensor()) {
+    return tensorflow::errors::Unimplemented(
+        node_def.op(), " is only implemented for tensors, at ",
+        node_def.name());
+  }
+  static const std::unordered_map<string, nvinfer1::ActivationType> ops{
+      {"Relu", nvinfer1::ActivationType::kRELU},
+      {"Sigmoid", nvinfer1::ActivationType::kSIGMOID},
+      {"Tanh", nvinfer1::ActivationType::kTANH},
+  };
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end()) {
+    return tensorflow::errors::Unimplemented(
+        "Activation op: ", node_def.op(),
+        " not supported at: ", node_def.name());
+  }
+  if (params->validation_only) return tensorflow::Status::OK();
+
+  // Start conversion.
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  nvinfer1::IActivationLayer* layer = ctx.network()->addActivation(
-      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::ActivationType::kRELU);
+  nvinfer1::IActivationLayer* layer =
+      params->converter->network()->addActivation(
+          *const_cast<nvinfer1::ITensor*>(tensor), op_pair->second);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  // Set quantization range for output of Sigmoid, Tanh.
+  if (node_def.op() == "Sigmoid") {
+    params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 1.0f);
+  } else if (node_def.op() == "Tanh") {
+    params->converter->ProvideQuantizationRange(output_tensor, -1.0f, 1.0f);
+  }
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertScale(Converter& ctx,
-                                const tensorflow::NodeDef& node_def,
-                                const std::vector<TRT_TensorOrWeights>& inputs,
-                                std::vector<TRT_TensorOrWeights>* outputs) {
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
+Status ConvertQuantize(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if ((inputs.size() == 0) ||
+      (node_def.op() == "FakeQuantWithMinMaxArgs" && inputs.size() != 1) ||
+      (node_def.op() == "FakeQuantWithMinMaxVars" && inputs.size() != 3) ||
+      (node_def.op() == "QuantizeAndDequantizeV2" && inputs.size() != 3) ||
+      (node_def.op() == "QuantizeAndDequantizeV3" && inputs.size() != 4)) {
+    return errors::InvalidArgument("Invalid number of inputs for ",
+                                   node_def.op(), ", at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    // TensorRT will automatically quantize weights, so we will ignore ranges
+    // for weights.
+    params->outputs->push_back(inputs.at(0));
+    return Status::OK();
+  }
+  float min_range = 0.0f;
+  float max_range = 0.0f;
+  if (node_def.op() == "FakeQuantWithMinMaxArgs") {
+    // Get ranges via node attributes.
+    TFAttrs attrs(node_def);
+    if (attrs.count("min") == 0 || attrs.count("max") == 0) {
+      return errors::InvalidArgument("Min or max attribute not found for ",
+                                     node_def.op(), " at ", node_def.name());
+    }
+    min_range = attrs.get<float>("min");
+    max_range = attrs.get<float>("max");
+  } else if (node_def.op() == "FakeQuantWithMinMaxVars" ||
+             node_def.op() == "QuantizeAndDequantizeV2" ||
+             node_def.op() == "QuantizeAndDequantizeV3") {
+    // Get ranges via inputs.
+    if (!inputs.at(1).is_weights() || !inputs.at(2).is_weights()) {
+      return errors::InvalidArgument("Min and max inputs for ", node_def.op(),
+                                     " must be weights not tensors, at ",
+                                     node_def.name());
+    }
+    auto get_weights_value = [&inputs](int index) {
+      auto raw_weights = static_cast<float*>(
+          const_cast<void*>(inputs.at(index).weights().GetValues()));
+      return raw_weights[0];
+    };
+    min_range = get_weights_value(1);
+    max_range = get_weights_value(2);
+  } else {
+    return errors::InvalidArgument("Unknown quantization op ", node_def.op(),
+                                   ", at ", node_def.name());
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Store ranges for tensor
+  params->converter->ProvideQuantizationRange(
+      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor()), min_range,
+      max_range);
+  // Sometimes, TRT may not quantize a tensor, either because it chooses to
+  // execute a higher precision kernel or because of op fusion. In these cases,
+  // accuracy will suffer if the model was trained to expect quantization at
+  // that tensor. We should consider adding a clip(tensor, min_range, max_range)
+  // operation here to ensure that any arbitrarily placed quantize node will
+  // execute as expected. However, this will negatively affect performance. If
+  // users train their models in a way which models inference as close as
+  // possible (i.e. not quantizing in place where fusion will occur), then there
+  // is no problem with the current implementation.
+  params->outputs->push_back(inputs.at(0));
+  return Status::OK();
+}
+
+// TODO(pdavoodi): we should update relu6 implementation once TensorRT supports
+// Relu6 natively.
+tensorflow::Status ConvertRelu6(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "Invalid number of inputs for Relu6, at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
     return tensorflow::errors::Unimplemented(
-        "ConvertScale only supports tensor<op>weight: ", node_def.name());
+        "Relu6 is only implemented for tensors, not weights, at ",
+        node_def.name());
   }
+  if (params->validation_only) return Status::OK();
+  // ***************************************************************************
+  // TensorRT does not implement Relu6 natively. This function converts Relu6 op
+  // to available TensorRT ops: Relu6(x) = min(Relu(x), 6)
+  // ***************************************************************************
 
+  // Input Tensor
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  TRT_ShapedWeights weights = inputs.at(1).weights();
-  if (ctx.isFP16()) {
-    weights = ConvertFP32ToFP16(ctx, inputs.at(1).weights());
-  }
 
-  TRT_ShapedWeights empty_weights(weights.type_);
-  TFAttrs attrs(node_def);
+  // Relu operation i.e. Relu(x) = max(0, x)
+  nvinfer1::IActivationLayer* relu_layer =
+      params->converter->network()->addActivation(
+          *const_cast<nvinfer1::ITensor*>(tensor),
+          nvinfer1::ActivationType::kRELU);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(relu_layer, node_def.name());
+
+  // Large range of relu is problematic during quantization in INT8 precision
+  // mode. Setting dynamic range of relu = [0.f, 6.0f] helps with quantization.
+  // TRT only uses dynamic ranges in INT8 precision mode,
+  // and this does not affect the FP32 path.
+  params->converter->ProvideQuantizationRange(relu_layer->getOutput(0), 0.0f,
+                                              6.0f);
+
+  // Create a constant layer to store the floating point weight i.e. 6.0f This
+  // tensor will be broadcasted uniformly during elementwise `min` operation.
+  // The constant has to have the same rank as the input in order for TRT to
+  // broadcast
+  nvinfer1::Dims dims;
+  dims.nbDims = relu_layer->getOutput(0)->getDimensions().nbDims;
+  for (int i = 0; i < dims.nbDims; i++) {
+    dims.d[i] = 1;
+  }
+  TRT_ShapedWeights weights = params->weight_store->GetTempWeights(
+      tensorflow::DataType::DT_FLOAT, dims);
+  auto weights_ptr =
+      static_cast<float*>(const_cast<void*>(weights.GetValues()));
+  weights_ptr[0] = 6.0f;
+  nvinfer1::IConstantLayer* const6_layer =
+      params->converter->network()->addConstant(dims, weights.GetTrtWeights());
+  TFTRT_RETURN_ERROR_IF_NULLPTR(const6_layer, node_def.name());
+  params->converter->ProvideQuantizationRange(const6_layer->getOutput(0), 0.0f,
+                                              6.0f);
+
+  // ElementWise Min Operation
+  // Min op is a nop for INT8 execution path, as the input tensor
+  // to this layer will only have values in range [0.f, 6.0f].
+  const nvinfer1::ITensor* tensor_l = relu_layer->getOutput(0);
+  const nvinfer1::ITensor* tensor_r = const6_layer->getOutput(0);
+  nvinfer1::IElementWiseLayer* relu6_layer =
+      params->converter->network()->addElementWise(
+          *const_cast<nvinfer1::ITensor*>(tensor_l),
+          *const_cast<nvinfer1::ITensor*>(tensor_r),
+          nvinfer1::ElementWiseOperation::kMIN);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name());
+  nvinfer1::ITensor* output_tensor = relu6_layer->getOutput(0);
+  params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f);
+
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return Status::OK();
+}
 
-  const auto data_format = attrs.get<string>("data_format");
-  int channel_index;
-  const auto dims = tensor->getDimensions();
-  if (data_format == "NHWC") {
-    //  1). NHWC is really N+C
-    channel_index = dims.nbDims - 1;  // batch dimension is implicit here!
-  } else {
-    //  2). NCHW is really N+CHW
-    channel_index = dims.nbDims - 3;  // batch dimension is implicit here!
+tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights()) {
+    return errors::InvalidArgument("Input expects tensor and weights, at ",
+                                   node_def.name());
   }
+  if (params->validation_only) return Status::OK();
 
-  nvinfer1::Permutation permutation;
-  for (int32_t i = 0; i < dims.nbDims; ++i) {
-    permutation.order[i] = i;
-  }
+  nvinfer1::ITensor* tensor =
+      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
+  const nvinfer1::Dims original_dims = tensor->getDimensions();
+  TFAttrs attrs(node_def);
+  const string data_format = attrs.get<string>("data_format");
+  const int channel_index =
+      (data_format == "NHWC" ? original_dims.nbDims - 1 : 0);
 
-  if (channel_index >= 0) {
+  nvinfer1::Permutation permutation;
+  if (channel_index != 0) {
+    // Permute the dimensions so that the channel dimension is the first
+    // dimension.
+    for (int i = 0; i < original_dims.nbDims; ++i) {
+      permutation.order[i] = i;
+    }
     permutation.order[0] = channel_index;
     permutation.order[channel_index] = 0;
-  } else {
-    return tensorflow::errors::Unimplemented(
-        "TFTRT::BiasAdd cannot apply on batch dimension, at ", node_def.name());
+    VLOG(1) << "ConvertBiasAdd permutation: "
+            << DebugString(permutation, original_dims.nbDims);
   }
 
   // TensorRT addScale requires input to be of rank 3, we need to apply
-  // transpose as well as reshape
-  if (channel_index != 0 || dims.nbDims != 3) {
+  // transpose as well as reshape.
+  // TODO(laigd): this doesn't match what the TRT doc says, fix the doc?
+  if (channel_index != 0 || original_dims.nbDims != 3) {
     nvinfer1::IShuffleLayer* shuffle_layer =
-        ctx.network()->addShuffle(*const_cast<nvinfer1::ITensor*>(tensor));
+        params->converter->network()->addShuffle(*tensor);
     TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
+    params->converter->MarkQuantizationRangesAsInferrable(
+        tensor, shuffle_layer->getOutput(0));
+
+    // NOTE(laigd): for some reason we need to apply the reshape
+    // unconditionally. The default shape has nbDims==-1 and it seems the
+    // behavior is undefined in some cases.
     nvinfer1::Dims reshape_dims;
     reshape_dims.nbDims = 3;
-    reshape_dims.d[0] = 0;                          // 0 copy from the input
-    reshape_dims.d[1] = dims.nbDims >= 2 ? 0 : 1;   // 0 copy from the input
-    reshape_dims.d[2] = dims.nbDims >= 3 ? -1 : 1;  // -1 infer from the rest
+    // 0 means copying from input; -1 means inferring from the rest.
+    reshape_dims.d[0] = 0;
+    reshape_dims.d[1] = original_dims.nbDims >= 2 ? 0 : 1;
+    reshape_dims.d[2] = original_dims.nbDims >= 3 ? -1 : 1;
+    shuffle_layer->setReshapeDimensions(reshape_dims);
+
     if (channel_index != 0) {
-      // maybe we do not need this check. concerned about TRT optimization
       shuffle_layer->setFirstTranspose(permutation);
     }
-    shuffle_layer->setReshapeDimensions(reshape_dims);
     tensor = shuffle_layer->getOutput(0);
   }
 
+  TRT_ShapedWeights weights = inputs.at(1).weights();
+  if (params->converter->precision_mode() == FP16MODE) {
+    weights = ConvertFP32ToFP16(params->weight_store, weights);
+  }
   nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
   if (weights.shape_.d[0] == 1) {
     mode = nvinfer1::ScaleMode::kUNIFORM;
   }
 
-  nvinfer1::IScaleLayer* layer =
-      ctx.network()->addScale(*const_cast<nvinfer1::ITensor*>(tensor), mode,
-                              weights, empty_weights, empty_weights);
+  TRT_ShapedWeights empty_weights(weights.type_);
+  nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
+      *tensor, mode, weights.GetTrtWeights(), empty_weights.GetTrtWeights(),
+      empty_weights.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
-  // restore transpose & reshape
-  if (channel_index != 0 || dims.nbDims != 3) {
-    nvinfer1::IShuffleLayer* shuffle_layer = ctx.network()->addShuffle(
-        *const_cast<nvinfer1::ITensor*>(output_tensor));
+  // Restore transpose & reshape.
+  if (channel_index != 0 || original_dims.nbDims != 3) {
+    nvinfer1::IShuffleLayer* shuffle_layer =
+        params->converter->network()->addShuffle(*output_tensor);
     TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
-    nvinfer1::Dims reshape_dims = dims;
-    int tmp = reshape_dims.d[channel_index];
-    reshape_dims.d[channel_index] = reshape_dims.d[0];
-    reshape_dims.d[0] = tmp;
+    // NOTE: for same reason as mentioned above we need to apply the reshape
+    // unconditionally.
+    nvinfer1::Dims reshape_dims = original_dims;
+    if (channel_index != 0) {
+      // NOTE: according to NVIDIA dimension types are deprecated, so we don't
+      // need to copy them back.
+      reshape_dims.d[channel_index] = original_dims.d[0];
+      reshape_dims.d[0] = original_dims.d[channel_index];
+    }
     shuffle_layer->setReshapeDimensions(reshape_dims);
+
     if (channel_index != 0) {
       shuffle_layer->setSecondTranspose(permutation);
     }
+    params->converter->MarkQuantizationRangesAsInferrable(
+        output_tensor, shuffle_layer->getOutput(0));
     output_tensor = shuffle_layer->getOutput(0);
   }
 
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return Status::OK();
+}
+
+Status GetTensorDimsWithProtoShape(const Tensor& tensor,
+                                   int tensor_proto_array_len,
+                                   nvinfer1::Dims* dims) {
+  if (tensor.dims() > 0) {
+    *dims = GetTrtDimsForTensor(tensor);
+    if (TrtDimsNumElements(*dims) != tensor_proto_array_len &&
+        tensor_proto_array_len != 1) {
+      return errors::InvalidArgument(
+          "Broadcast on weights only supports kCHANNEL and kUNIFORM");
+    }
+  } else {
+    dims->nbDims = 1;
+    // No dimension provided. Flatten it.
+    dims->d[0] = tensor_proto_array_len;
+    dims->type[0] = nvinfer1::DimensionType::kSPATIAL;
+    for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; ++i) {
+      dims->d[i] = 0;
+    }
+  }
+  return Status::OK();
 }
 
-tensorflow::Status ConvertConst(Converter& ctx,
-                                const tensorflow::NodeDef& node_def,
-                                const std::vector<TRT_TensorOrWeights>& inputs,
-                                std::vector<TRT_TensorOrWeights>* outputs) {
-  const auto& weights_tensor = node_def.attr().at("value").tensor();
+template <typename CType>
+Status TfTensorToTrtWeights(const DataType dtype, const Tensor& tensor,
+                            const CType* tensor_proto_array,
+                            int tensor_proto_array_len, TrtWeightStore* store,
+                            TRT_ShapedWeights* weights) {
+  nvinfer1::Dims weight_dims;
+  TF_RETURN_IF_ERROR(GetTensorDimsWithProtoShape(tensor, tensor_proto_array_len,
+                                                 &weight_dims));
+  *weights = store->GetTempWeights(dtype, weight_dims);
+  void* dst = const_cast<void*>(weights->GetValues());
+  if (tensor_proto_array_len == 1) {
+    std::fill_n((CType*)dst, TrtDimsNumElements(weight_dims),
+                *tensor_proto_array);
+  } else {
+    memcpy(dst, tensor_proto_array, weights->size_bytes());
+  }
+  return Status::OK();
+}
 
-  // Get trt type & shape
+// Convert a Const NodeDef to TRT_ShapedWeights. This is a special converter, it
+// always ignores the params->validation_only parameter but adds the converted
+// weights to params->outputs. We did this since TrtNodeValidator needs the
+// weights as input to other nodes, and use it to determine whether those nodes
+// are supported by TRT.
+tensorflow::Status ConvertConst(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (!inputs.empty()) {
+    return errors::InvalidArgument(
+        "Constant node is expected to have empty input list: ",
+        node_def.name());
+  }
   TFAttrs attrs(node_def);
-  const tensorflow::DataType dtype = attrs.get<tensorflow::DataType>("dtype");
+  const DataType dtype = attrs.get<tensorflow::DataType>("dtype");
+  // We always convert the integer constants to kINT32, since TRT kINT8 is for
+  // quantized inference.
+  const DataType converted_dtype =
+      (dtype == DT_INT16 || dtype == DT_INT8 || dtype == DT_UINT8 ? DT_INT32
+                                                                  : dtype);
+  nvinfer1::DataType trt_dtype;
+  TF_RETURN_IF_ERROR(ConvertDType(converted_dtype, &trt_dtype));
 
   // Create shaped weights as output
+  const auto& tensor_proto = node_def.attr().at("value").tensor();
   tensorflow::Tensor tensor;
-  if (!tensor.FromProto(weights_tensor)) {
+  if (!tensor.FromProto(tensor_proto)) {
     return tensorflow::errors::Internal("Cannot parse weight tensor proto: ",
                                         node_def.name());
   }
 
-  TRT_ShapedWeights weights(dtype);
-  // TODO(aaroey): we should choose the array using dtype and shape.
-  if (!weights_tensor.float_val().empty()) {
-    VLOG(2) << "SCALAR!!!" << node_def.name();
-    nvinfer1::Dims scalar_shape;
-    if (tensor.dims() > 0) {
-      VLOG(2) << "dimensions: " << tensor.dims();
-      VLOG(2) << "size: " << weights_tensor.float_val_size();
-      scalar_shape = GetTensorShape(tensor);
-      VLOG(2) << "details: ";
-      for (int i = 0; i < scalar_shape.nbDims; i++)
-        VLOG(2) << scalar_shape.d[i];
-      if (GetShapeSize(scalar_shape) != weights_tensor.float_val_size() &&
-          weights_tensor.float_val_size() != 1) {
-        LOG(ERROR) << "Broadcast on weights only supports kCHANNEL and"
-                   << " kUNIFORM, at: " << node_def.name();
-        string err_str("Broadcast method is not supported for '");
-        StrAppend(&err_str, node_def.name(), "' of type ", node_def.op());
-        return tensorflow::errors::InvalidArgument(err_str);
-      }
-    } else {
-      VLOG(2) << "Dimensions: " << tensor.dims();
-      scalar_shape.nbDims = 1;
-      // no dimension provided. flatten it
-      scalar_shape.d[0] = weights_tensor.float_val_size();
-      scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
-      for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
-        scalar_shape.d[i] = 0;
+  TRT_ShapedWeights weights(converted_dtype);
+  if (tensor.NumElements() == 0) {
+    // Do nothing.
+  } else if (!tensor_proto.float_val().empty()) {
+    TF_RETURN_IF_ERROR(TfTensorToTrtWeights(
+        converted_dtype, tensor, tensor_proto.float_val().begin(),
+        tensor_proto.float_val_size(), params->weight_store, &weights));
+  } else if (!tensor_proto.int_val().empty()) {
+    TF_RETURN_IF_ERROR(TfTensorToTrtWeights(
+        converted_dtype, tensor, tensor_proto.int_val().begin(),
+        tensor_proto.int_val_size(), params->weight_store, &weights));
+  } else if (!tensor_proto.half_val().empty()) {
+    // TODO(aaroey): implement fp16 conversion.
+    return errors::Unimplemented("fp16 constant is not supported yet.");
+  } else if (!tensor_proto.tensor_content().empty()) {
+    // TODO(aaroey): fp16 will remain in half format and is not converted to
+    // fp32, but the converter currently uses all float weights as fp32. Fix
+    // this.
+    const auto& content = tensor_proto.tensor_content();
+    if (content.size() > 0) {
+      const int dtype_size = tensorflow::DataTypeSize(dtype);
+      if (content.size() % dtype_size != 0) {
+        return errors::FailedPrecondition("Tensor content size ",
+                                          content.size(),
+                                          " is not a multiple of ", dtype_size);
       }
-    }
-    // TODO(aaroey): use GetShapeSize().
-    size_t len_data = tensorflow::DataTypeSize(dtype);
-    for (int i = 0; i < scalar_shape.nbDims; i++) len_data *= scalar_shape.d[i];
-    ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-    void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-    if (weights_tensor.float_val_size() == 1) {
-      std::fill_n((float*)dst, GetShapeSize(scalar_shape),
-                  *weights_tensor.float_val().begin());
-    } else {
-      // TODO(aaroey): get rid of this copy as RepeatedField is always
-      // contiguous make a local copy first to flatten doesn't have to be
-      // contiguous
-      std::vector<float> tensor_data(weights_tensor.float_val().begin(),
-                                     weights_tensor.float_val().end());
-      memcpy(dst, tensor_data.data(), len_data);  // store into weight store
-    }
-    VLOG(2) << "create shape details: ";
-    for (int i = 0; i < scalar_shape.nbDims; i++) VLOG(2) << scalar_shape.d[i];
-    weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
-  } else if (!weights_tensor.int_val().empty()) {
-    // TODO(aaroey): this is very similar to the above code for float, merge
-    // them.
-    VLOG(2) << "int!!!" << node_def.name();
-    nvinfer1::Dims scalar_shape;
-    if (tensor.dims() > 0) {
-      VLOG(2) << "dimensions: " << tensor.dims();
-      scalar_shape = GetTensorShape(tensor);
-      if (GetShapeSize(scalar_shape) != weights_tensor.int_val_size() &&
-          weights_tensor.int_val_size() != 1) {
-        LOG(WARNING) << "Broadcast on weights only supports kCHANNEL and"
-                     << " kUNIFORM, at: " << node_def.name();
-        string err_str("Broadcast method is not supported for '");
-        StrAppend(&err_str, node_def.name(), "' of type ", node_def.op());
-        return tensorflow::errors::InvalidArgument(err_str);
+      nvinfer1::Dims weights_dim;
+      TF_RETURN_IF_ERROR(GetTensorDimsWithProtoShape(
+          tensor, content.size() / dtype_size, &weights_dim));
+      const int64_t size_bytes = TrtDimsNumElements(weights_dim) * dtype_size;
+      if (content.size() != size_bytes) {
+        return errors::FailedPrecondition(
+            "Tensor size and TensorProto content size mismatch: ", size_bytes,
+            " vs ", content.size());
+      } else if (tensor.NumElements() != content.size() / dtype_size) {
+        return errors::FailedPrecondition(
+            "Tensor elements count and TensorProto content size mismatch: ",
+            tensor.NumElements(), " vs ", content.size() / dtype_size);
       }
-    } else {
-      VLOG(2) << "dimensions: " << tensor.dims();
-      scalar_shape.nbDims = 1;
-      // no dimension provided. flatten it
-      scalar_shape.d[0] = weights_tensor.int_val_size();
-      scalar_shape.type[0] = nvinfer1::DimensionType::kSPATIAL;
-      for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; i++) {
-        scalar_shape.d[i] = 0;
-        scalar_shape.type[i] = nvinfer1::DimensionType::kSPATIAL;
+      weights =
+          params->weight_store->GetTempWeights(converted_dtype, weights_dim);
+      if (dtype_size == tensorflow::DataTypeSize(converted_dtype)) {
+        port::CopyToArray(content, static_cast<char*>(
+                                       const_cast<void*>(weights.GetValues())));
+      } else {
+        // Copy out the weights as original data type.
+        std::vector<uint8_t> temp_weights(content.size());
+        port::CopyToArray(content,
+                          reinterpret_cast<char*>(temp_weights.data()));
+        int32* dst =
+            static_cast<int32*>(const_cast<void*>(weights.GetValues()));
+        // Copy to the weight store as converted data type.
+        if (dtype == DT_INT16) {
+          int16* data = reinterpret_cast<int16*>(temp_weights.data());
+          std::copy(data, data + tensor.NumElements(), dst);
+        } else if (dtype == DT_INT8) {
+          int8* data = reinterpret_cast<int8*>(temp_weights.data());
+          std::copy(data, data + tensor.NumElements(), dst);
+        } else if (dtype == DT_UINT8) {
+          uint8* data = reinterpret_cast<uint8*>(temp_weights.data());
+          std::copy(data, data + tensor.NumElements(), dst);
+        } else {
+          return errors::FailedPrecondition(
+              "Unexpected data type: ", DataTypeString(dtype),
+              " at: ", node_def.name());
+        }
       }
     }
-    // we should not have converted
-    size_t len_data = tensorflow::DataTypeSize(dtype);
-    for (int i = 0; i < scalar_shape.nbDims; i++) len_data *= scalar_shape.d[i];
-    size_t len_tensor = weights_tensor.int_val_size() * sizeof(int32);
-    len_data = std::max(len_data, len_tensor);
-    ctx.weight_store()->store_.push_back(std::vector<uint8_t>(len_data));
-    void* dst = static_cast<void*>(&(ctx.weight_store()->store_.back()[0]));
-    if (weights_tensor.int_val_size() == 1) {
-      std::fill_n((int*)dst, GetShapeSize(scalar_shape),
-                  *weights_tensor.int_val().begin());
-    } else {
-      // TODO(aaroey): get rid of this copy as RepeatedField is always
-      // contiguous make a local copy first to flatten doesn't have to be
-      // contiguous
-      std::vector<int32> tensor_data(weights_tensor.int_val().begin(),
-                                     weights_tensor.int_val().end());
-      memcpy(dst, tensor_data.data(), len_tensor);  // store into weight store
-    }
-    weights = TRT_ShapedWeights(dtype, dst, scalar_shape);
-  } else if (!weights_tensor.tensor_content().empty()) {
-    // obsolete method.
-    // After optimization path, we do not see weights in this format.
-    // TODO(aaroey): why?
-    // fp16 conversion technically should be needed here.
-    VLOG(2) << "TENSOR!!!" << node_def.name();
-    const auto& content = weights_tensor.tensor_content();
-
-    weights = ctx.get_temp_weights(dtype, GetTensorShape(tensor));
-    if (content.size() > 0) {
-      const int dtype_size = tensorflow::DataTypeSize(dtype);
-      CHECK_EQ(0, content.size() % dtype_size)
-          << "Tensor content size (" << content.size()
-          << ") is not a multiple of " << dtype_size;
-      port::CopyToArray(
-          content, static_cast<char*>(const_cast<void*>(weights.GetValues())));
-    }
   } else {
-    return tensorflow::errors::Unimplemented("Not supported constant type, at ",
-                                             node_def.name());
+    return errors::Unimplemented("Not supported constant type, at ",
+                                 node_def.name());
   }
-  // Pass the output
-  outputs->push_back(TRT_TensorOrWeights(weights));
-  return tensorflow::Status::OK();
+  if (params->outputs != nullptr) {
+    params->outputs->push_back(TRT_TensorOrWeights(weights));
+  }
+  return Status::OK();
 }
 
-tensorflow::Status ConvertIdentity(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  outputs->push_back(inputs.at(0));
+tensorflow::Status ConvertIdentity(OpConverterParams* params) {
+  // TODO(tmorris): TRT's Identity layer does not get optimized away as of TRT
+  // 5.0, however once we know that it does it would be nice to use that
+  // instead.
+  params->outputs->push_back(params->inputs.at(0));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertBinary(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
+Status ConvertBinary(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   if (inputs.size() != 2) {
-    return tensorflow::errors::FailedPrecondition(
-        "Binary ops require two tensor input, at ", node_def.name());
+    return errors::InvalidArgument("Binary ops require two inputs, at ",
+                                   node_def.name());
   }
 
   // Constant folding should have been done by TensorFlow
-
   if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) {
-    return tensorflow::errors::Unimplemented(
+    return errors::Unimplemented(
         "Constant folding is falled back to TensorFlow, binary op received "
         "both input as constant at: ",
         node_def.name());
   }
 
-  // Try to convert into Scale layer first (for better performance)
+  // TODO(tmorris): TRT plans to deprecate IScaleLayer and will replace it with
+  // IElementwiseLayer. At that point, we can remove BinaryTensorOpWeight. For
+  // now, the performance will be slightly better with IScaleLayer because it
+  // can be fused in more situations. However, most of the benefits of
+  // IScaleLayer are when the layer performs both a shift and a scale, which we
+  // don't do except for convolutions.
+  //
+  // Try to convert into Scale layer first (for better performance).
   // Since scale layer supports restricted broadcast policy and op types, we
   // allow failure and try to handle it through Elementwise op
-  // (BinaryTensorOpTensor)
-  Status status = tensorflow::Status::OK();
+  // (BinaryTensorOpTensor).
+  Status status = Status::OK();
   if (inputs.at(0).is_tensor() && inputs.at(1).is_weights()) {
-    status = BinaryTensorOpWeight(ctx, node_def, inputs.at(0).tensor(),
-                                  inputs.at(1).weights(), false, outputs);
+    status = BinaryTensorOpWeight(params, inputs.at(0).tensor(),
+                                  inputs.at(1).weights(), false);
   } else if (inputs.at(0).is_weights() && inputs.at(1).is_tensor()) {
-    status = BinaryTensorOpWeight(ctx, node_def, inputs.at(1).tensor(),
-                                  inputs.at(0).weights(), true, outputs);
-#if NV_TENSORRT_MAJOR == 3
-  } else {
-#else
+    status = BinaryTensorOpWeight(params, inputs.at(1).tensor(),
+                                  inputs.at(0).weights(), true);
   }
+  // If both input are tensors, or one of them is weights but the conversion
+  // above failed, try the conversion using BinaryTensorOpTensor.
   if ((inputs.at(0).is_tensor() && inputs.at(1).is_tensor()) || !status.ok()) {
-#endif
-    status = BinaryTensorOpTensor(ctx, node_def, inputs.at(0), inputs.at(1),
-                                  outputs);
+    if (!status.ok()) VLOG(1) << status;
+    status = BinaryTensorOpTensor(params, inputs.at(0), inputs.at(1));
   }
   return status;
 }
 
-tensorflow::Status ConvertUnary(Converter& ctx,
-                                const tensorflow::NodeDef& node_def,
-                                const std::vector<TRT_TensorOrWeights>& inputs,
-                                std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertUnary(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   static const std::unordered_map<string, nvinfer1::UnaryOperation> ops{
       {"Neg", nvinfer1::UnaryOperation::kNEG},
       {"Exp", nvinfer1::UnaryOperation::kEXP},
@@ -1876,30 +2618,38 @@ tensorflow::Status ConvertUnary(Converter& ctx,
         "Unary ops require single tensor input, at ", node_def.name());
   }
 
-#if NV_TENSORRT_MAJOR == 3
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "Constant folding for unary op is not supported", node_def.name());
-  }
-#endif
-
   // TODO(jie): check type
-  const nvinfer1::ITensor* tensor;
-  TFTRT_RETURN_ERROR_IF_FALSE(
-      PrepareTensorForShape(ctx, inputs.at(0), inputs.at(0).shape(), &tensor),
-      node_def.name());
+  const nvinfer1::ITensor* tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      inputs.at(0), inputs.at(0).GetTrtDims(), &tensor));
 
   nvinfer1::IUnaryLayer* layer;
   if (node_def.op() == "Rsqrt") {
-    layer = ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor),
-                                    nvinfer1::UnaryOperation::kSQRT);
+    // We will need a quantization range for intermediate tensor if not using
+    // calibration.
+    //
+    //   x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
+    //                     ^
+    //               need range here
+    if (params->converter->precision_mode() == INT8MODE &&
+        !params->converter->use_calibration()) {
+      return errors::Unimplemented(
+          "Intermediate quantization range cannot be determined without"
+          " calibration for Rsqrt, consider replacing with "
+          "Sqrt -> FakeQuant -> Reciprocal ops, at ",
+          node_def.name());
+    }
+    layer = params->converter->network()->addUnary(
+        *const_cast<nvinfer1::ITensor*>(tensor),
+        nvinfer1::UnaryOperation::kSQRT);
     TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
     tensor = layer->getOutput(0);
-    layer = ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor),
-                                    nvinfer1::UnaryOperation::kRECIP);
+    layer = params->converter->network()->addUnary(
+        *const_cast<nvinfer1::ITensor*>(tensor),
+        nvinfer1::UnaryOperation::kRECIP);
   } else if (ops.count(node_def.op()) != 0) {
-    layer = ctx.network()->addUnary(*const_cast<nvinfer1::ITensor*>(tensor),
-                                    ops.at(node_def.op()));
+    layer = params->converter->network()->addUnary(
+        *const_cast<nvinfer1::ITensor*>(tensor), ops.at(node_def.op()));
   } else {
     return tensorflow::errors::InvalidArgument(
         "Binary op: ", node_def.op(), " not supported, at ", node_def.name());
@@ -1907,111 +2657,56 @@ tensorflow::Status ConvertUnary(Converter& ctx,
 
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
   return tensorflow::Status::OK();
 }
 
-#if NV_TENSORRT_MAJOR == 3
-tensorflow::Status ConvertReducePool(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at", node_def.name());
-  }
-
-  // Implement tensor binaryOp weight [channel wise] for now;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  const auto dims = tensor->getDimensions();
-  // Restore implicit batch dimension
-  const int nb_dims = dims.nbDims + 1;
-
-  TRT_ShapedWeights index_list = inputs.at(1).weights();
-  TFAttrs attrs(node_def);
-  auto index_type = attrs.get<tensorflow::DataType>("Tidx");
-
-  // Only expect to handle INT32 as attributes for now
-  if (index_type != tensorflow::DataType::DT_INT32) {
-    return tensorflow::errors::Unimplemented("Tidx supports only DT_INT32");
-  }
-  const auto index_list_data =
-      static_cast<int*>(const_cast<void*>(index_list.GetValues()));
-
-  if (nb_dims != 4) {
-    return tensorflow::errors::InvalidArgument(
-        "TRT only support reduce on 4 dimensional tensors, at",
-        node_def.name());
-  }
-  if (index_list.count() > 2) {
-    return tensorflow::errors::InvalidArgument(
-        "TRT cannot support reduce on more than 2 dimensions, at",
-        node_def.name());
-  }
-
-  std::set<int> idx_set;
-  // We cannot operate on Channel. permutation flag used to transpose tensor
-  int permuted_index = -1;
-  for (int i = 0; i < index_list.count(); i++) {
-    if (index_list_data[i] == 0) {
-      return tensorflow::errors::InvalidArgument("TRT cannot reduce at 0, at",
-                                                 node_def.name());
-    }
-    if (index_list_data[i] == 1) permuted_index = 1;
-    idx_set.emplace(index_list_data[i]);
+tensorflow::Status ConvertSquare(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 1) {
+    return tensorflow::errors::InvalidArgument("Square expects one input, at ",
+                                               node_def.name());
   }
-
-  std::vector<int> permutation_order(nb_dims);
-  nvinfer1::DimsHW pool_kernel;
-  if (permuted_index == 1) {
-    for (int i = 2; i < nb_dims; i++) {
-      if (idx_set.count(i) == 0) {
-        permuted_index = i;
-        break;
-      }
-    }
-    for (int i = 0; i < nb_dims; i++) permutation_order[i] = i;
-
-    permutation_order[permuted_index] = 1;
-    permutation_order[1] = permuted_index;
-
-    // Apply permutation before extracting dimension for pool_kernel
-    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
-                                 permutation_order);
-    TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name());
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        "Square is only implemented for tensors, at ", node_def.name());
   }
+  if (params->validation_only) return Status::OK();
 
-  // Apply permutation before extracting dimension for pool_kernel
-  pool_kernel.d[0] = (idx_set.count(2) || permuted_index == 2) ? dims.d[1] : 1;
-  pool_kernel.d[1] = (idx_set.count(3) || permuted_index == 3) ? dims.d[2] : 1;
-
-  nvinfer1::ITensor* output_tensor;
+  // Constant 2 with same rank as input
+  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  for (int i = 0; i < dims.nbDims; i++) {
+    dims.d[i] = 1;
+  }
+  TRT_ShapedWeights weights = params->weight_store->GetTempWeights(
+      tensorflow::DataType::DT_FLOAT, dims);
+  auto weights_ptr =
+      static_cast<float*>(const_cast<void*>(weights.GetValues()));
+  weights_ptr[0] = 2.f;
+  nvinfer1::IConstantLayer* const2_layer =
+      params->converter->network()->addConstant(dims, weights.GetTrtWeights());
+  TFTRT_RETURN_ERROR_IF_NULLPTR(const2_layer, node_def.name());
+
+  // ElementWise Pow Operation
+  const nvinfer1::ITensor* tensor_l = inputs.at(0).tensor();
+  const nvinfer1::ITensor* tensor_r = const2_layer->getOutput(0);
+  nvinfer1::IElementWiseLayer* layer =
+      params->converter->network()->addElementWise(
+          *const_cast<nvinfer1::ITensor*>(tensor_l),
+          *const_cast<nvinfer1::ITensor*>(tensor_r),
+          nvinfer1::ElementWiseOperation::kPOW);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
-  if (node_def.op() == "Mean") {
-    nvinfer1::IPoolingLayer* layer =
-        ctx.network()->addPooling(*const_cast<nvinfer1::ITensor*>(tensor),
-                                  nvinfer1::PoolingType::kAVERAGE, pool_kernel);
-    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-    output_tensor = layer->getOutput(0);
-  } else {
-    return tensorflow::errors::Unimplemented("Op not supported ", node_def.op(),
-                                             " , at ", node_def.name());
-  }
-  if (permuted_index != -1) {
-    // Apply permutation before extracting dimension for pool_kernel
-    output_tensor = ctx.TransposeTensor(
-        const_cast<nvinfer1::ITensor*>(output_tensor), permutation_order);
-    TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
-  }
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
-#elif NV_TENSORRT_MAJOR > 3
-tensorflow::Status ConvertReduce(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
+
+tensorflow::Status ConvertReduce(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights()) {
     return tensorflow::errors::InvalidArgument(
@@ -2065,20 +2760,18 @@ tensorflow::Status ConvertReduce(Converter& ctx,
   }
 
   const auto keep_dims = attrs.get<bool>("keep_dims");
-  nvinfer1::ILayer* layer =
-      ctx.network()->addReduce(*const_cast<nvinfer1::ITensor*>(tensor),
-                               reduce_operation, axes, keep_dims);
+  nvinfer1::ILayer* layer = params->converter->network()->addReduce(
+      *const_cast<nvinfer1::ITensor*>(tensor), reduce_operation, axes,
+      keep_dims);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
-  outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+  params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
   return tensorflow::Status::OK();
 }
-#endif
 
-tensorflow::Status ConvertPad(Converter& ctx,
-                              const tensorflow::NodeDef& node_def,
-                              const std::vector<TRT_TensorOrWeights>& inputs,
-                              std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertPad(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   // TODO(aaroey): make a routine for this check and reuse it.
   if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
       !inputs.at(1).is_weights()) {
@@ -2122,7 +2815,7 @@ tensorflow::Status ConvertPad(Converter& ctx,
 
   // No padding at all, we should exit
   if (pad_index.size() == 0) {
-    outputs->push_back(inputs.at(0));
+    params->outputs->push_back(inputs.at(0));
     return tensorflow::Status::OK();
   }
 
@@ -2152,9 +2845,8 @@ tensorflow::Status ConvertPad(Converter& ctx,
   std::vector<int32_t> permuted_pad_index(pad_index);
   if (pad_index[0] == 1) {
     legit_pad = false;
-    tensor = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor),
-                                 {0, 3, 2, 1});
-    TFTRT_RETURN_ERROR_IF_NULLPTR(tensor, node_def.name());
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 2, 1}, &tensor));
     permuted_pad_index[0] = 3;
   }
 
@@ -2169,25 +2861,25 @@ tensorflow::Status ConvertPad(Converter& ctx,
     }
   }
 
-  nvinfer1::IPaddingLayer* layer = ctx.network()->addPadding(
+  nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding(
       *const_cast<nvinfer1::ITensor*>(tensor), pre_padding, post_padding);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
   if (!legit_pad) {
-    output_tensor = ctx.TransposeTensor(
-        const_cast<nvinfer1::ITensor*>(output_tensor), {0, 3, 2, 1});
-    TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(output_tensor), {0, 3, 2, 1},
+        &output_tensor));
   }
 
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertConcat(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertConcat(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   // not including the last input (axis) here
   int input_size = static_cast<int>(inputs.size()) - 1;
 
@@ -2226,18 +2918,6 @@ tensorflow::Status ConvertConcat(Converter& ctx,
     index = dim.nbDims + index + 1;
   }
 
-#if NV_TENSORRT_MAJOR == 3
-  // incase we need permutation;
-  std::vector<int> permutation_order(dim.nbDims + 1);
-
-  for (int i = 0; i < dim.nbDims + 1; i++) permutation_order[i] = i;
-
-  if (index != 1) {
-    permutation_order[1] = index;
-    permutation_order[index] = 1;
-  }
-#endif
-
   std::vector<nvinfer1::ITensor const*> inputs_vec;
   // Shap chack (all input tensor should have same shape)
   // starting from 0 since we are probably also doing transpose here;
@@ -2258,41 +2938,24 @@ tensorflow::Status ConvertConcat(Converter& ctx,
       }
     }
 
-#if NV_TENSORRT_MAJOR == 3
-    // TRT3 does concatenation only on channel!
-    if (index != 1) {
-      tensor_i = ctx.TransposeTensor(const_cast<nvinfer1::ITensor*>(tensor_i),
-                                     permutation_order);
-      TFTRT_RETURN_ERROR_IF_NULLPTR(tensor_i, node_def.name());
-    }
-#endif
     inputs_vec.push_back(tensor_i);
   }
 
   // nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
-  nvinfer1::IConcatenationLayer* layer = ctx.network()->addConcatenation(
-      const_cast<nvinfer1::ITensor* const*>(inputs_vec.data()),
-      inputs_vec.size());
+  nvinfer1::IConcatenationLayer* layer =
+      params->converter->network()->addConcatenation(
+          const_cast<nvinfer1::ITensor* const*>(inputs_vec.data()),
+          inputs_vec.size());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-#if NV_TENSORRT_MAJOR > 3
   layer->setAxis(index - 1);
-#endif
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-
-#if NV_TENSORRT_MAJOR == 3
-  if (index != 1) {
-    output_tensor = ctx.TransposeTensor(output_tensor, permutation_order);
-    TFTRT_RETURN_ERROR_IF_NULLPTR(output_tensor, node_def.name());
-  }
-#endif
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertFusedBatchNorm(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   TFAttrs attrs(node_def);
   float epsilon = attrs.get<float>("epsilon");
   auto data_format = attrs.get<string>("data_format");
@@ -2341,9 +3004,9 @@ tensorflow::Status ConvertFusedBatchNorm(
   //  We could technically have two weights with different shape.
   //  that requires two addScale op, arguably less performant
   TRT_ShapedWeights combined_scale_weights =
-      ctx.get_temp_weights_like(*ptr_shape_weights);
+      params->weight_store->GetTempWeights(*ptr_shape_weights);
   TRT_ShapedWeights combined_offset_weights =
-      ctx.get_temp_weights_like(*ptr_shape_weights);
+      params->weight_store->GetTempWeights(*ptr_shape_weights);
 
   const Eigen::half* cast_vals_array[4];
   const float* vals_array[4];
@@ -2398,22 +3061,22 @@ tensorflow::Status ConvertFusedBatchNorm(
 
   nvinfer1::ScaleMode mode = nweight == 1 ? nvinfer1::ScaleMode::kUNIFORM
                                           : nvinfer1::ScaleMode::kCHANNEL;
-  nvinfer1::IScaleLayer* layer =
-      ctx.network()->addScale(*const_cast<nvinfer1::ITensor*>(tensor), mode,
-                              combined_offset_weights.GetWeightsForTRT(),
-                              combined_scale_weights.GetWeightsForTRT(),
-                              dummy_power_weights.GetWeightsForTRT());
+  nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
+      *const_cast<nvinfer1::ITensor*>(tensor), mode,
+      combined_offset_weights.GetTrtWeights(),
+      combined_scale_weights.GetTrtWeights(),
+      dummy_power_weights.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
-#if NV_TENSORRT_MAJOR > 3
-tensorflow::Status ConvertMatMulHelper(
-    Converter& ctx, TRT_TensorOrWeights tensor_input,
-    TRT_ShapedWeights weights_raw, bool transpose_weight, string node_name,
-    std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertMatMulHelper(OpConverterParams* params,
+                                       TRT_TensorOrWeights tensor_input,
+                                       TRT_ShapedWeights weights_raw,
+                                       bool transpose_weight,
+                                       string node_name) {
   nvinfer1::ITensor* output_tensor;
   if (!tensor_input.is_tensor()) {
     return tensorflow::errors::InvalidArgument("Input 0 expects tensor");
@@ -2424,8 +3087,7 @@ tensorflow::Status ConvertMatMulHelper(
   if (transpose_weight) {
     weights = weights_raw;
   } else {
-    TRT_ShapedWeights weights_ck = weights_raw;
-    weights = ctx.get_temp_weights_like(weights_ck);
+    weights = params->weight_store->GetTempWeights(weights_raw);
     ReorderCKtoKC(weights_raw, &weights);
   }
   TRT_ShapedWeights biases(weights.type_);
@@ -2436,67 +3098,61 @@ tensorflow::Status ConvertMatMulHelper(
   while (input_dim.nbDims != 3) {
     input_dim.d[input_dim.nbDims++] = 1;
   }
-  TFTRT_RETURN_ERROR_IF_FALSE(
-      PrepareTensorForShape(ctx, tensor_input, input_dim, &tensor), node_name);
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      tensor_input, input_dim, &tensor));
 
-  nvinfer1::IFullyConnectedLayer* layer = ctx.network()->addFullyConnected(
-      *const_cast<nvinfer1::ITensor*>(tensor), noutput, weights, biases);
+  nvinfer1::IFullyConnectedLayer* layer =
+      params->converter->network()->addFullyConnected(
+          *const_cast<nvinfer1::ITensor*>(tensor), noutput,
+          weights.GetTrtWeights(), biases.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name);
   output_tensor = layer->getOutput(0);
 
-  const nvinfer1::ITensor* temp_tensor;
+  const nvinfer1::ITensor* temp_tensor = nullptr;
   auto output_dim = output_tensor->getDimensions();
   output_dim.nbDims = 1;
-  TFTRT_RETURN_ERROR_IF_FALSE(
-      PrepareTensorForShape(ctx, TRT_TensorOrWeights(output_tensor), output_dim,
-                            &temp_tensor),
-      node_name);
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      TRT_TensorOrWeights(output_tensor), output_dim, &temp_tensor));
   output_tensor = const_cast<nvinfer1::ITensor*>(temp_tensor);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
 // inputs are both two dimensional (tensorflow::ops::MatMul)
-tensorflow::Status ConvertMatMul(Converter& ctx,
-                                 const tensorflow::NodeDef& node_def,
-                                 const std::vector<TRT_TensorOrWeights>& inputs,
-                                 std::vector<TRT_TensorOrWeights>* outputs) {
-  if (!inputs.at(0).is_tensor()) {
-    return tensorflow::errors::InvalidArgument("Input 0 expects tensor, at" +
-                                               node_def.name());
+tensorflow::Status ConvertMatMul(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights()) {
+    return errors::InvalidArgument("Input expects tensor and weights, at ",
+                                   node_def.name());
   }
 
   TFAttrs attrs(node_def);
   // TODO(jie): INT32 should be converted?
   tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
-  if (tf_dtype != tensorflow::DataType::DT_FLOAT &&
-      tf_dtype != tensorflow::DataType::DT_HALF) {
-    return tensorflow::errors::Unimplemented(
-        "data type is not supported, for node " + node_def.name() + " got " +
-        tensorflow::DataTypeString(tf_dtype));
+  if (tf_dtype != DataType::DT_FLOAT && tf_dtype != DataType::DT_HALF) {
+    return errors::Unimplemented("Data type is not supported, for node ",
+                                 node_def.name(), " got ",
+                                 DataTypeString(tf_dtype));
   }
   bool transpose_a = attrs.get<bool>("transpose_a");
   bool transpose_b = attrs.get<bool>("transpose_b");
 
   // FullyConnected:
   if (transpose_a) {
-    return tensorflow::errors::Internal(
-        "Transpose_a is not supported for TensorRT FullyConnected (op: " +
-        node_def.op() + "), at: " + node_def.name());
-  }
-  if (inputs.at(1).is_tensor()) {
-    return tensorflow::errors::Internal(
-        "Operand 1 must be constant for TensorRT FullyConnected (op: " +
-        node_def.op() + "), at: " + node_def.name());
+    return errors::InvalidArgument(
+        "transpose_a is not supported for TensorRT FullyConnected (op: ",
+        node_def.op(), "), at: ", node_def.name());
   }
-  return ConvertMatMulHelper(ctx, inputs.at(0), inputs.at(1).weights(),
-                             transpose_b, node_def.name(), outputs);
+  if (params->validation_only) return Status::OK();
+  return ConvertMatMulHelper(params, inputs.at(0), inputs.at(1).weights(),
+                             transpose_b, node_def.name());
 }
 
-tensorflow::Status ConvertBatchMatMul(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertBatchMatMul(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   TFAttrs attrs(node_def);
 
   // TODO(jie): INT32 should be converted?
@@ -2511,12 +3167,12 @@ tensorflow::Status ConvertBatchMatMul(
   bool transpose_a = attrs.get<bool>("adj_x");
   bool transpose_b = attrs.get<bool>("adj_y");
 
-  auto dims = inputs.at(0).shape();
+  auto dims = inputs.at(0).GetTrtDims();
   if (dims.nbDims == 1) {  // NC * CK is only supported through fully connected
     if (transpose_a == false && inputs.at(0).is_tensor() &&
         inputs.at(1).is_weights()) {
-      return ConvertMatMulHelper(ctx, inputs.at(0), inputs.at(1).weights(),
-                                 transpose_b, node_def.name(), outputs);
+      return ConvertMatMulHelper(params, inputs.at(0), inputs.at(1).weights(),
+                                 transpose_b, node_def.name());
     } else {
       return tensorflow::errors::InvalidArgument(
           "Invalid configuration for MatMul, at: " + node_def.name());
@@ -2525,10 +3181,10 @@ tensorflow::Status ConvertBatchMatMul(
 
   const nvinfer1::ITensor* tensor_l;
   const nvinfer1::ITensor* tensor_r;
-  auto dims_l = inputs.at(0).shape();
-  auto dims_r = inputs.at(1).shape();
+  auto dims_l = inputs.at(0).GetTrtDims();
+  auto dims_r = inputs.at(1).GetTrtDims();
   if (inputs.at(0).is_weights()) {
-    if (inputs.at(0).shape().d[0] != 1) {
+    if (inputs.at(0).GetTrtDims().d[0] != 1) {
       return tensorflow::errors::InvalidArgument(
           "Input 0 as weight assumes broadcast across batch for MatMul, at: " +
           node_def.name());
@@ -2540,7 +3196,7 @@ tensorflow::Status ConvertBatchMatMul(
     }
   }
   if (inputs.at(1).is_weights()) {
-    if (inputs.at(1).shape().d[0] != 1) {
+    if (inputs.at(1).GetTrtDims().d[0] != 1) {
       return tensorflow::errors::InvalidArgument(
           "Input 1 as weight assumes broadcast across batch for MatMul, at: " +
           node_def.name());
@@ -2551,29 +3207,24 @@ tensorflow::Status ConvertBatchMatMul(
       dims_r.nbDims--;
     }
   }
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      inputs.at(0), dims_l, &tensor_l));
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      inputs.at(1), dims_r, &tensor_r));
 
-  TFTRT_RETURN_ERROR_IF_FALSE(
-      PrepareTensorForShape(ctx, inputs.at(0), dims_l, &tensor_l),
-      node_def.name());
-  TFTRT_RETURN_ERROR_IF_FALSE(
-      PrepareTensorForShape(ctx, inputs.at(1), dims_r, &tensor_r),
-      node_def.name());
-
-  nvinfer1::IMatrixMultiplyLayer* layer = ctx.network()->addMatrixMultiply(
-      *const_cast<nvinfer1::ITensor*>(tensor_l), transpose_a,
-      *const_cast<nvinfer1::ITensor*>(tensor_r), transpose_b);
+  nvinfer1::IMatrixMultiplyLayer* layer =
+      params->converter->network()->addMatrixMultiply(
+          *const_cast<nvinfer1::ITensor*>(tensor_l), transpose_a,
+          *const_cast<nvinfer1::ITensor*>(tensor_r), transpose_b);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
-#endif
 
-#if NV_TENSORRT_MAJOR > 3
-tensorflow::Status ConvertSoftmax(
-    Converter& ctx, const tensorflow::NodeDef& node_def,
-    const std::vector<TRT_TensorOrWeights>& inputs,
-    std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertSoftmax(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
   int nbDims = tensor->getDimensions().nbDims;
@@ -2582,23 +3233,22 @@ tensorflow::Status ConvertSoftmax(
         "TensorRT Softmax cannot apply on batch dimension, at" +
         node_def.name());
   }
-  nvinfer1::ISoftMaxLayer* layer =
-      ctx.network()->addSoftMax(*const_cast<nvinfer1::ITensor*>(tensor));
+  nvinfer1::ISoftMaxLayer* layer = params->converter->network()->addSoftMax(
+      *const_cast<nvinfer1::ITensor*>(tensor));
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   // Tensorflow SoftMax assumes applying softmax on the last dimension.
   layer->setAxes(1 << (nbDims - 1));
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  // Quantization range for SoftMax is always (0, 1)
+  params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 1.0f);
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
-#endif
 
-#if NV_TENSORRT_MAJOR > 3
-tensorflow::Status ConvertTopK(Converter& ctx,
-                               const tensorflow::NodeDef& node_def,
-                               const std::vector<TRT_TensorOrWeights>& inputs,
-                               std::vector<TRT_TensorOrWeights>* outputs) {
+tensorflow::Status ConvertTopK(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
   int nbDims = tensor->getDimensions().nbDims;
@@ -2621,44 +3271,65 @@ tensorflow::Status ConvertTopK(Converter& ctx,
         " not implemented, at: " + node_def.name());
   }
 
-  nvinfer1::ITopKLayer* layer = ctx.network()->addTopK(
+  nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK(
       *const_cast<nvinfer1::ITensor*>(tensor), op, k, reducedAxes);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   nvinfer1::ITensor* output_value_tensor = layer->getOutput(0);
   nvinfer1::ITensor* output_indices_tensor = layer->getOutput(1);
-  outputs->push_back(TRT_TensorOrWeights(output_value_tensor));
-  outputs->push_back(TRT_TensorOrWeights(output_indices_tensor));
+  params->outputs->push_back(TRT_TensorOrWeights(output_value_tensor));
+  params->outputs->push_back(TRT_TensorOrWeights(output_indices_tensor));
   return tensorflow::Status::OK();
 }
-#endif
 
-void Converter::register_op_converters() {
-  // vgg_16 slim implementation
+static void RegisterValidatableOpConverters(
+    std::unordered_map<string, OpConverter>* registration) {
+  // TODO(laigd): support all op types.
+  (*registration)["BiasAdd"] = ConvertBiasAdd;
+  (*registration)["Const"] = ConvertConst;
+  (*registration)["Transpose"] = ConvertTranspose;
+  (*registration)["Reshape"] = ConvertReshape;
+  (*registration)["MatMul"] = ConvertMatMul;
+  (*registration)["Relu6"] = ConvertRelu6;
+  (*registration)["Square"] = ConvertSquare;
+  (*registration)["ExpandDims"] = ConvertExpandDims;
+  (*registration)["Squeeze"] = ConvertSqueeze;
+
+  for (auto quantization_op_type :
+       {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3",
+        "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs"}) {
+    (*registration)[quantization_op_type] = ConvertQuantize;
+  }
+  for (auto binary_op_type :
+       {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum"}) {
+    (*registration)[binary_op_type] = ConvertBinary;
+  }
+  for (auto activation_op_type : {"Relu", "Sigmoid", "Tanh"}) {
+    (*registration)[activation_op_type] = ConvertActivation;
+  }
+}
+
+void TrtNodeValidator::RegisterOpValidators() {
+  RegisterValidatableOpConverters(&op_validators_);
+}
+
+void Converter::RegisterOpConverters() {
+  RegisterValidatableOpConverters(&op_registry_);
+
   op_registry_["Conv2D"] = ConvertConv2D;
   op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
-  op_registry_["Relu"] = ConvertActivation;
   op_registry_["MaxPool"] = ConvertPool;
   op_registry_["AvgPool"] = ConvertPool;
-  op_registry_["BiasAdd"] = ConvertScale;
-  op_registry_["Const"] = ConvertConst;
   // TODO(ben,jie): this is a temp hack.
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
   op_registry_["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
 
-  // resnet_50_v1 slim implementation
-  op_registry_["Add"] = ConvertBinary;
-  op_registry_["Mul"] = ConvertBinary;
-  op_registry_["Sub"] = ConvertBinary;
   op_registry_["Pad"] = ConvertPad;
 
   op_registry_["ConcatV2"] = ConvertConcat;
   op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
   op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
 
-  op_registry_["Div"] = ConvertBinary;
-  op_registry_["RealDiv"] = ConvertBinary;
-
   op_registry_["Rsqrt"] = ConvertUnary;
   op_registry_["Reciprocal"] = ConvertUnary;
   op_registry_["Exp"] = ConvertUnary;
@@ -2666,35 +3337,26 @@ void Converter::register_op_converters() {
   op_registry_["Sqrt"] = ConvertUnary;
   op_registry_["Abs"] = ConvertUnary;
   op_registry_["Neg"] = ConvertUnary;
-#if NV_TENSORRT_MAJOR == 3
-  op_registry_["Mean"] = ConvertReducePool;
-#endif
-#if NV_TENSORRT_MAJOR > 3
+
   op_registry_["Sum"] = ConvertReduce;
   op_registry_["Prod"] = ConvertReduce;
   op_registry_["Max"] = ConvertReduce;
   op_registry_["Min"] = ConvertReduce;
   op_registry_["Mean"] = ConvertReduce;
-  op_registry_["Maximum"] = ConvertBinary;
-  op_registry_["Minimum"] = ConvertBinary;
   op_registry_["Softmax"] = ConvertSoftmax;
-  op_registry_["MatMul"] = ConvertMatMul;
   op_registry_["BatchMatMul"] = ConvertBatchMatMul;
   op_registry_["TopKV2"] = ConvertTopK;
-#endif
 
   plugin_converter_ = ConvertPlugin;
 }
 
-}  // namespace
-
 tensorflow::Status ConvertGraphDefToEngine(
     const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size,
     size_t max_workspace_size_bytes,
     const std::vector<tensorflow::PartialTensorShape>& input_shapes,
     Logger* logger, nvinfer1::IGpuAllocator* allocator,
     TRTInt8Calibrator* calibrator,
-    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
+    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
     bool* convert_successfully) {
   engine->reset();
   if (convert_successfully) *convert_successfully = false;
@@ -2704,14 +3366,16 @@ tensorflow::Status ConvertGraphDefToEngine(
       nvinfer1::createInferBuilder(*logger));
   builder->setMaxBatchSize(max_batch_size);
   builder->setMaxWorkspaceSize(max_workspace_size_bytes);
-#if NV_TENSORRT_MAJOR > 3
   builder->setGpuAllocator(allocator);
-#endif
   if (precision_mode == FP16MODE) {
     builder->setHalf2Mode(true);
   } else if (precision_mode == INT8MODE) {
     builder->setInt8Mode(true);
-    builder->setInt8Calibrator(calibrator);
+    if (use_calibration) {
+      builder->setInt8Calibrator(calibrator);
+    } else {
+      builder->setInt8Calibrator(nullptr);
+    }
   }
 
   // Create the network.
@@ -2721,11 +3385,10 @@ tensorflow::Status ConvertGraphDefToEngine(
     return tensorflow::errors::Internal(
         "Failed to create TensorRT network object");
   }
-  auto ws = std::unique_ptr<TRTWeightStore>(new TRTWeightStore());
 
   // Build the network
   VLOG(1) << "Starting engine conversion ";
-  Converter converter(trt_network.get(), ws.get(), precision_mode == FP16MODE);
+  Converter converter(trt_network.get(), precision_mode, use_calibration);
   std::vector<std::pair<string, string>> output_tensors;
   // Graph nodes are already topologically sorted during construction
   for (const auto& node_def : gdef.node()) {
@@ -2739,10 +3402,13 @@ tensorflow::Status ConvertGraphDefToEngine(
         return tensorflow::errors::InvalidArgument(
             "Failed to parse slot number from ", node_name);
       }
-      nvinfer1::DataType dtype;
+      nvinfer1::DataType trt_dtype;
+      nvinfer1::Dims trt_dims;
+      int batch_size = -1;
       auto shape = input_shapes.at(slot_number);
-      auto status = ValidateInputProperties(
-          shape, node_def.attr().at("dtype").type(), &dtype);
+      auto status = ValidateTensorProperties(
+          node_def.op(), node_def.attr().at("dtype").type(), shape,
+          /*validation_only=*/false, &trt_dtype, &trt_dims, &batch_size);
       if (!status.ok()) {
         const string error_message =
             StrCat("Validation failed for ", node_name, " and input slot ",
@@ -2750,29 +3416,13 @@ tensorflow::Status ConvertGraphDefToEngine(
         LOG(WARNING) << error_message;
         return Status(status.code(), error_message);
       }
-
-#if NV_TENSORRT_MAJOR == 3
-      nvinfer1::DimsCHW input_dim;
-#elif NV_TENSORRT_MAJOR > 3
-      nvinfer1::Dims input_dim;
-#endif
-      for (int i = 1; i < shape.dims(); i++) {
-        input_dim.d[i - 1] = shape.dim_size(i);
-      }
-      input_dim.nbDims = shape.dims() - 1;
-      nvinfer1::ITensor* input_tensor =
-          converter.network()->addInput(node_name.c_str(), dtype, input_dim);
-      if (!input_tensor) {
-        return tensorflow::errors::InvalidArgument(
-            "Failed to create Input layer tensor ", node_name,
-            " rank=", shape.dims() - 1);
-      }
       VLOG(2) << "Adding engine input tensor " << node_name << " with shape "
-              << DebugString(input_dim);
-      if (!converter.insert_input_tensor(node_name, input_tensor)) {
-        return tensorflow::errors::AlreadyExists(
-            "Output tensor already exists for op: " + node_name);
-      }
+              << DebugString(trt_dims);
+      // TODO(laigd): the conversion should always happen at runtime where all
+      // the shapes are known, and we can provide a mode to generate the
+      // engines offline, by calling sess.run() and cache/serialize the engines.
+      TF_RETURN_IF_ERROR(
+          converter.AddInputTensor(node_name, trt_dtype, trt_dims, batch_size));
     } else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) &&
                (node_def.op() == "Identity")) {
       int32 slot_number = -1;
@@ -2788,28 +3438,15 @@ tensorflow::Status ConvertGraphDefToEngine(
     } else {
       VLOG(2) << "Converting node: " << node_def.name() << " , "
               << node_def.op();
-      TF_RETURN_IF_ERROR(converter.convert_node(node_def));
-    }
-  }
-  for (const auto& output : output_tensors) {
-    auto tensor_or_weights = converter.get_tensor(output.first);
-    if (!tensor_or_weights.is_tensor()) {
-      return tensorflow::errors::InvalidArgument(
-          "Output node '" + output.first + "' is weights not tensor");
-    }
-    nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
-    tensor->setName(output.second.c_str());
-    if (!tensor) {
-      return tensorflow::errors::NotFound("Output tensor not found: " +
-                                          output.first);
+      TF_RETURN_IF_ERROR(converter.ConvertNode(node_def));
     }
-    VLOG(1) << "Marking output tensor " << output.first << ", as output tensor "
-            << output.second;
-
-    converter.network()->markOutput(*tensor);
   }
+  TF_RETURN_IF_ERROR(converter.RenameAndMarkOutputTensors(output_tensors));
   if (convert_successfully) *convert_successfully = true;
 
+  // Apply user provided quantization ranges to tensors
+  converter.MaybeApplyQuantizationRanges();
+
   // Build the engine.
   VLOG(1) << "Starting engine creation";
   engine->reset(builder->buildCudaEngine(*converter.network()));
@@ -2843,14 +3480,14 @@ tensorflow::Status ConvertSegmentToGraphDef(
     tensorflow::DataType dtype;
     tensorflow::PartialTensorShape partial_shape;
     if (connection.is_input_edge) {
-      GetInputProperties(graph_properties,
-                         graph->FindNodeId(connection.outside_id),
-                         connection.outside_port, &partial_shape, &dtype);
-      connection.outside_shape = partial_shape;
-    } else {
       GetOutputProperties(graph_properties,
                           graph->FindNodeId(connection.outside_id),
                           connection.outside_port, &partial_shape, &dtype);
+      connection.outside_shape = partial_shape;
+    } else {
+      GetInputProperties(graph_properties,
+                         graph->FindNodeId(connection.outside_id),
+                         connection.outside_port, &partial_shape, &dtype);
       connection.inside_shape = partial_shape;
     }
     connection.connection_type = dtype;
@@ -2954,47 +3591,11 @@ tensorflow::Status ConvertSegmentToGraphDef(
     }
   }
   *common_scope = local_scope;
-  VLOG(0) << "Segment @scope '" << local_scope << "', converted to graph";
+  VLOG(1) << "Converted TensorRT candidate segment @scope '" << local_scope
+          << "' to a GraphDef";
   return tensorflow::Status::OK();
 }
 
-bool InputEdgeValidator::operator()(const tensorflow::Edge* in_edge) const {
-  if (in_edge->IsControlEdge()) return true;
-  PartialTensorShape shape;
-  tensorflow::DataType dtype;
-  GetInputProperties(graph_properties_, in_edge->src(), in_edge->src_output(),
-                     &shape, &dtype);
-  nvinfer1::DataType trt_dtype;
-  Status status = ValidateInputProperties(shape, dtype, &trt_dtype);
-  if (!status.ok()) {
-    VLOG(1) << "--> Need to remove input node " << in_edge->dst()->name()
-            << ": " << status;
-    return false;
-  }
-
-
-  if (in_edge->src()->type_string() != "Const" &&
-#if NV_TENSORRT_MAJOR == 3
-      // TRT 3.x only support 4 dimensional input tensor.
-      shape.dims() != 4) {
-#else
-      // Single dimensional input tensor is not supported since the first
-      // dimension is treated as batch dimension.
-      shape.dims() < 2) {
-#endif
-    VLOG(1) << "--> Need to remove input node " << in_edge->dst()->name()
-            << " which has an input at port " << in_edge->dst_input() << " with"
-#if NV_TENSORRT_MAJOR == 3
-            << " #dim!=4"
-#else
-            << " #dim<2"
-#endif
-            << " and is not a const: " << shape;
-    return false;
-  }
-  return true;
-}
-
 bool OutputEdgeValidator::operator()(const tensorflow::Edge* out_edge) const {
   if (out_edge->IsControlEdge()) return true;
   if (out_edge->src()->type_string() == "Const") {
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 9274027e6327dbb29f30f5353fe449b57449d0fa..54e19b73957bccdae2b23bd3556de9ad00b864e5 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
 #include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
@@ -33,6 +34,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
@@ -90,7 +92,8 @@ struct EngineInfo {
   EngineInfo()
       : engine_type(EngineType::TRTStatic),
         max_workspace_size_bytes(0),
-        precision_mode(FP32MODE) {}
+        precision_mode(FP32MODE),
+        use_calibration(true) {}
 
   string engine_name;
   string device;
@@ -107,6 +110,7 @@ struct EngineInfo {
   int maximum_cached_engines;
   std::vector<int> cached_engine_batches;
   int precision_mode;
+  bool use_calibration;
 };
 
 // Constructs a graphdef from the segment in the given graph. Adds placeholder
@@ -143,24 +147,9 @@ tensorflow::Status ConvertGraphDefToEngine(
     const std::vector<tensorflow::PartialTensorShape>& input_shapes,
     Logger* logger, nvinfer1::IGpuAllocator* allocator,
     TRTInt8Calibrator* calibrator,
-    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
+    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
     bool* convert_successfully);
 
-// Helper class for the segmenter to determine whether an input edge to the TRT
-// segment is valid.
-class InputEdgeValidator {
- public:
-  InputEdgeValidator(const grappler::GraphProperties& graph_properties)
-      : graph_properties_(graph_properties) {}
-
-  // Return true if the specified edge is eligible to be an input edge of the
-  // TRT segment.
-  bool operator()(const tensorflow::Edge* in_edge) const;
-
- private:
-  const grappler::GraphProperties& graph_properties_;
-};
-
 // Helper class for the segmenter to determine whether an output edge from the
 // TRT segment is valid.
 class OutputEdgeValidator {
@@ -170,6 +159,384 @@ class OutputEdgeValidator {
   bool operator()(const tensorflow::Edge* out_edge) const;
 };
 
+string DebugString(const nvinfer1::Dims& dims);
+string DebugString(const nvinfer1::ITensor& tensor);
+int64_t TrtDimsNumElements(const nvinfer1::Dims& dims);
+
+// Class to convert TF compile-time constants (e.g. Const nodes) to TRT weight.
+class TRT_ShapedWeights {
+ public:
+  explicit TRT_ShapedWeights(DataType type = DT_FLOAT);
+
+  // Copy from another weights.
+  //
+  // NOTE: this does not copy the underlying buffer but only increase its
+  // reference count.
+  TRT_ShapedWeights(const TRT_ShapedWeights& rhs);
+
+  nvinfer1::Weights GetTrtWeights() const;
+
+  void* GetValues() const {
+    return const_cast<char*>(tensor_.tensor_data().data());
+  }
+
+  int64_t count() const;
+
+  size_t size_bytes() const;
+
+  string DebugString() const;
+
+  // TODO(aaroey): make these private.
+  nvinfer1::Dims shape_;  // Note: shape.type[] is not used.
+  tensorflow::DataType type_;
+
+ private:
+  // This constructor is only used by TrtWeightStore, which creates the
+  // underlying buffer.
+  TRT_ShapedWeights(DataType type, nvinfer1::Dims dims, Tensor tensor);
+
+  Tensor tensor_;
+
+  friend class TrtWeightStore;
+};
+
+// Container for TRT_ShapedWeights. We need this container because, TRT doesn't
+// manage the lifetime of the weights buffer, it only keeps a pointer to it and
+// requires that the data referenced by the pointer be available until the
+// building of engine is complete. For more information see
+// https://docs.nvidia.com/deeplearning/sdk/tensorrt-api/c_api/classnvinfer1_1_1_weights.html
+//
+// TODO(laigd): consider adding garbage collection to the unused weights.
+class TrtWeightStore {
+ public:
+  // Get a TRT_ShapedWeights with 'type' and 'dims'.
+  TRT_ShapedWeights GetTempWeights(tensorflow::DataType type,
+                                   const nvinfer1::Dims& dims);
+
+  // Get a TRT_ShapedWeights with the same data type and dimensions as
+  // 'weights'.
+  TRT_ShapedWeights GetTempWeights(const TRT_ShapedWeights& weights) {
+    return GetTempWeights(weights.type_, weights.shape_);
+  }
+
+ private:
+  // The backend storage of the TRT_ShapedWeights.
+  std::vector<Tensor> store_;
+};
+
+// Represents a TRT-style input to a TF node, it can be either a
+// nvinfer1::ITensor, or TRT_ShapedWeights which is compile-time constant.
+//
+// TODO(laigd): maybe rename it to TrtArgument, or mimic XlaCompiler::Argument.
+class TRT_TensorOrWeights {
+ public:
+  TRT_TensorOrWeights() {}
+
+  // Constructor that makes it an ITensor, doesn't take ownership of 'tensor'.
+  // This is used by Converter when building the TRT network, where the ITensor
+  // is owned by the TRT network being built. See comment for 'tensor_' below.
+  explicit TRT_TensorOrWeights(nvinfer1::ITensor* tensor, int batch_size = -1);
+
+  // Constructor that makes it an ITensor by creating one using provided data
+  // type and shape, and takes ownership of the created ITensor. This is used by
+  // TrtNodeValidator to encapsulate the type and shape information for
+  // validation of graph nodes, and the created ITensor is fake and temporary,
+  // and should not be used to build any TRT network. See comment for
+  // 'simple_itensor_' below.
+  explicit TRT_TensorOrWeights(nvinfer1::DataType trt_dtype,
+                               const nvinfer1::Dims& trt_dims, int batch_size);
+
+  // Constructor that makes it a TRT_TensorOrWeights.
+  explicit TRT_TensorOrWeights(const TRT_ShapedWeights& weights);
+
+  TRT_TensorOrWeights(const TRT_TensorOrWeights& rhs);
+
+  void operator=(const TRT_TensorOrWeights& rhs);
+
+  bool is_tensor() const { return initialized_ && is_tensor_; }
+  bool is_weights() const { return initialized_ && !is_tensor_; }
+
+  nvinfer1::ITensor* tensor();
+
+  const nvinfer1::ITensor* tensor() const;
+
+  TRT_ShapedWeights& weights() {
+    CHECK(is_weights());
+    return weights_;
+  }
+
+  const TRT_ShapedWeights& weights() const {
+    CHECK(is_weights());
+    return weights_;
+  }
+
+  nvinfer1::Dims GetTrtDims() const;
+
+  int batch_size() const { return batch_size_; }
+
+  string DebugString() const;
+
+ private:
+  class SimpleITensor;
+
+  void set_batch_size(int batch_size) { batch_size_ = batch_size; }
+
+  // When it represents an ITensor, the ITensor can be either passed by the
+  // caller via the constructor that takes an ITensor* as parameter, or be
+  // created as a SimpleITensor.
+  //
+  // In the first case, the ITensor pointer is stored in 'tensor_' below, and
+  // the ITensor itself is not owned by this class. This method is used by
+  // Converter (e.g. AddInputTensor) and op converters during TRT network
+  // construction, where the TRT network owns the ITensor.
+  //
+  // In the second case, the created SimpleITensor is stored in
+  // 'simple_itensor_' below and is owned by this class. SimpleITensor is a fake
+  // implementation of ITensor and is used only by TrtNodeValidator to validate
+  // the graph nodes.
+  nvinfer1::ITensor* tensor_ = nullptr;  // Not owned.
+  std::shared_ptr<SimpleITensor> simple_itensor_ = nullptr;
+
+  // First dimension of the TF tensor (NOT tensor_) that is represented by
+  // tensor_ is treated as the "batch dimension" by TRT, and tensor_'s
+  // dimensions (obtained via tensor_->getDimensions()) do not contain the batch
+  // dimension. For example, when a TF tensor with shape (A,B,C) is represented
+  // in TRT, tensor_->getDimensions() will be (B,C) and batch_size_ will be A.
+  //
+  // This requires that all tensors in the subgraph that is converted to a TRT
+  // engine have the same batch size are represented by the first dimension of
+  // their shape, and Converter will verify this during conversion. The drawback
+  // is that currently it cannot convert a graph that doesn't have the batch
+  // size represented in the shapes or the batch sizes are different. See
+  // b/118387490 for more details.
+  int batch_size_ = -1;
+
+  TRT_ShapedWeights weights_;
+  bool initialized_ = false;
+  bool is_tensor_ = false;
+
+  friend class Converter;
+};
+
+class Converter;
+
+// Parameters for each op converter.
+struct OpConverterParams {
+  OpConverterParams(Converter* arg_converter,
+                    const tensorflow::NodeDef& arg_node_def,
+                    const std::vector<TRT_TensorOrWeights>& arg_inputs,
+                    std::vector<TRT_TensorOrWeights>* arg_outputs,
+                    bool arg_validation_only, TrtWeightStore* arg_weight_store)
+      : converter(arg_converter),
+        node_def(arg_node_def),
+        inputs(arg_inputs),
+        outputs(arg_outputs),
+        validation_only(arg_validation_only),
+        weight_store(arg_weight_store) {}
+
+  Converter* converter;
+  const tensorflow::NodeDef& node_def;
+  const std::vector<TRT_TensorOrWeights>& inputs;
+  std::vector<TRT_TensorOrWeights>* outputs;
+  const bool validation_only;
+  TrtWeightStore* weight_store;
+};
+
+using OpConverter = std::function<Status(OpConverterParams*)>;
+
+// Class to verify if specific TF node is supported by TRT.
+class TrtNodeValidator {
+ public:
+  TrtNodeValidator();
+
+  // Validate the node, and return ok if it's supported by TRT.
+  //
+  // - 'node_def' is the node to validate.
+  // - 'input_node_and_ports' are the input NodeDefs and their output ports that
+  //   are connected to 'node_def' in the TF graph.
+  // - 'graph_properties' is the GraphProperties of the graph where 'node_def'
+  //   belongs. It is used to get the shape and data type information of a
+  //   tensor for validation purpose.
+  Status ValidateNode(
+      const NodeDef& node_def,
+      const std::vector<std::pair<const NodeDef*, int>>& input_node_and_ports,
+      const grappler::GraphProperties& graph_properties);
+
+ private:
+  void RegisterOpValidators();
+
+  // Convert a Const node to a TRT_TensorOrWeights.
+  Status ConvertConstToWeights(const NodeDef& const_node_def,
+                               const std::vector<TRT_TensorOrWeights>& inputs,
+                               TRT_TensorOrWeights* output);
+
+  // Convert the output tensor at 'output_port' of 'node_def' to a
+  // TRT_TensorOrWeights which will be later used as an input to other nodes and
+  // passed to ValidateNode() below.
+  Status ConvertToTensorOrWeights(
+      const NodeDef& node_def, int output_port,
+      const grappler::GraphProperties& graph_properties,
+      TRT_TensorOrWeights* tensor_or_weights);
+
+  // Stores all the validators by op type. If no validator is registered for
+  // specific op, it means no validation is needed and ValidateNode() will
+  // return OK.
+  std::unordered_map<string, OpConverter> op_validators_;
+
+  // Store the weights added during validation. Some validations (e.g.
+  // validation for Const node) may produce weights.
+  TrtWeightStore weight_store_;
+
+  friend class ValidatorTest;
+  friend class OpConverterTest;
+};
+
+// Class to convert TF nodes to TRT network.
+class Converter {
+ public:
+  Converter(nvinfer1::INetworkDefinition* trt_network, int precision_mode,
+            bool use_calibration);
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Methods used by the TRT engine builder to build a TRT network from a TF
+  // function/subgraph.
+
+  // Convert the node to TRT network.
+  Status ConvertNode(const tensorflow::NodeDef& node_def);
+
+  // Add input tensor to the TRT network with given 'name', 'dtype', 'dims' and
+  // 'batch_size'.
+  Status AddInputTensor(const string& name, nvinfer1::DataType dtype,
+                        const nvinfer1::Dims& dims, int batch_size);
+
+  // Mark the tensors with names specified by output_tensors[i].first as output
+  // of the TRT network, and set their names in the TRT network as
+  // output_tensors[i].second. The tensor names (output_tensors[i].first) are
+  // standard TF tensor names, i.e. node names followed by output slot number
+  // (or just the node name if the tensor is the first output of the node).
+  Status RenameAndMarkOutputTensors(
+      const std::vector<std::pair<string, string>>& output_tensors);
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Methods used by op converters to convert individual TF node and add layers
+  // to the TRT network.
+
+  // Op converters (e.g. ConvertReshape) need to access the TRT network in order
+  // to add TRT layers.
+  nvinfer1::INetworkDefinition* network() { return trt_network_; }
+
+  // What precision are we targeting?
+  int precision_mode() const { return precision_mode_; }
+
+  // Calibration will be or was previously performed on this network?
+  bool use_calibration() const { return use_calibration_; }
+
+  // This should be called on the inputs and outputs of any layer we create
+  // where we know that the quantization range does not change during that
+  // operation. (e.g. Reshape, Transpose, Identity, MaxPool).
+  void MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input,
+                                          nvinfer1::ITensor* output);
+
+  // This function should be called when we know the quantization range of a
+  // tensor, either from a quantize/dequantize node or when the output is a
+  // fixed range (e.g. SoftMax, Relu6, Sigmoid).
+  void ProvideQuantizationRange(nvinfer1::ITensor* tensor, float min_range,
+                                float max_range);
+
+  // Should be called when full TRT network has been constructed and before
+  // building the engine.
+  void MaybeApplyQuantizationRanges();
+
+  // Below are helper methods for op converters to add different layers to the
+  // TRT network.
+
+  // Transpose 'input_tensor' with given permutation 'order_with_batch_dim' to
+  // 'output_tensor'. The permutation 'order_with_batch_dim' contains the batch
+  // dimension which should always be 0.
+  Status TransposeTensor(nvinfer1::ITensor* input_tensor,
+                         const std::vector<int>& order_with_batch_dim,
+                         const nvinfer1::ITensor** output_tensor);
+
+  // Converts 'input' into 'tensor' with shape specified by 'dims'.
+  Status PrepareTensorForShape(const TRT_TensorOrWeights& input,
+                               const nvinfer1::Dims& dims,
+                               const nvinfer1::ITensor** tensor);
+
+  // Return OK if the broadcast scheme is supported and compute the shapes after
+  // broadcasting.
+  Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l,
+                              const TRT_TensorOrWeights& operand_r,
+                              nvinfer1::Dims* operand_l_new_dims,
+                              nvinfer1::Dims* operand_r_new_dims) const;
+
+ private:
+  // Verify the provided batch_size is consistent with batch_size_ and update it
+  // if necessary.
+  Status MaybeUpdateBatchSize(int batch_size);
+
+  // Add the provided tensor/weights to the map trt_tensors_.
+  Status AddTensorOrWeights(const string& name, TRT_TensorOrWeights input);
+
+  // Get the tensor/weights from trt_tensors_ by 'name'.
+  Status GetTensorOrWeights(const string& name, TRT_TensorOrWeights* output);
+
+  // Get the inputs of 'node_def' from trt_tensors_.
+  Status GetInputs(const tensorflow::NodeDef& node_def,
+                   std::vector<TRT_TensorOrWeights>* inputs) const;
+
+  void RegisterOpConverters();
+
+  void PropagateQuantizationRanges();
+
+  // Gets the min and max value in a TRT_ShapedWeights
+  Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min,
+                        float* out_max) const;
+
+  // Registered op converters by op type.
+  std::unordered_map<string, OpConverter> op_registry_;
+
+  // Tensors/weights added during construction of trt_network_.
+  std::unordered_map<string, TRT_TensorOrWeights> trt_tensors_;
+
+  // Special op converter for custom plugins.
+  OpConverter plugin_converter_;
+
+  // The TRT networking being built.
+  nvinfer1::INetworkDefinition* trt_network_;
+
+  // Store the weights added during construction of trt_network_.
+  TrtWeightStore weight_store_;
+
+  // During conversion, this table is populated with quantization ranges per
+  // tensor. MaybeApplyQuantizationRanges() will use this table to set the TRT
+  // quantization ranges. Since TRT only supports symmetric ranges, we will
+  // store the range as a single float = max(abs(min_range), abs(max_range)).
+  // Range refers to the floating point values, e.g. min_range = 0.0f, max_range
+  // = 6.0f for Relu6.
+  std::unordered_map<nvinfer1::ITensor*, float> quantization_ranges_;
+
+  // Edges where quantization ranges can be inferred (copied) across ops - from
+  // first tensor to second tensor. PropagateQuantizationRanges() will propagate
+  // known ranges from quantization_ranges_ across these edges, adding the new
+  // ranges to quantization_ranges_ so that they can be applied in
+  // MaybeApplyQuantizationRanges().
+  std::vector<std::pair<nvinfer1::ITensor*, nvinfer1::ITensor*>>
+      quantization_infer_;
+
+  const int precision_mode_;
+
+  const bool use_calibration_;
+
+  // Batch size of inputs to trt_network_ added by AddInputTensor(). During
+  // network construction it will update this, use it to verify the batch
+  // size of all inputs are compatible, and make sure individual TF node is
+  // acceptable by TRT.
+  int batch_size_ = -1;
+
+  friend class ConverterTest;
+  friend class OpConverterTest;
+};
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c37a43dd5def9daf3c5d70720c6db2aab20db077
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
@@ -0,0 +1,2357 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"  // NOLINT
+#include "tensorflow/core/public/session.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+using ::tensorflow::strings::StrCat;
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+// TODO(laigd): put this into some test utils file.
+void ExpectStatus(Status status, error::Code code = error::OK,
+                  const char* substr = nullptr) {
+  EXPECT_EQ(code, status.code())
+      << status << " vs expected error code \"" << error::Code_Name(code)
+      << "\" and message \"" << substr << "\"";
+  if (substr) {
+    EXPECT_THAT(status.error_message(), ::testing::HasSubstr(substr)) << status;
+  }
+}
+
+nvinfer1::Dims GetTestDims(const std::vector<int>& d) {
+  nvinfer1::Dims dims;
+  dims.nbDims = d.size();
+  for (int i = 0; i < d.size(); ++i) {
+    dims.d[i] = d[i];
+  }
+  return dims;
+}
+
+nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) {
+  switch (tf_dtype) {
+    case DT_FLOAT:
+      return nvinfer1::DataType::kFLOAT;
+    case DT_HALF:
+      return nvinfer1::DataType::kHALF;
+    case DT_INT32:
+      return nvinfer1::DataType::kINT32;
+    default:
+      QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype);
+  }
+}
+
+DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) {
+  switch (trt_dtype) {
+    case nvinfer1::DataType::kFLOAT:
+      return DT_FLOAT;
+    case nvinfer1::DataType::kHALF:
+      return DT_HALF;
+    case nvinfer1::DataType::kINT32:
+      return DT_INT32;
+    default:
+      QCHECK(false) << "Unexpected data type " << static_cast<int>(trt_dtype);
+  }
+}
+
+NodeDef MakeNodeDef(const string& name, const string& op,
+                    const std::vector<string>& inputs) {
+  NodeDef node_def;
+  node_def.set_name(name);
+  node_def.set_op(op);
+  for (const string& input : inputs) {
+    node_def.add_input(input);
+  }
+  return node_def;
+}
+
+template <typename T>
+NodeDef MakeConstNodeDef(const string& name, const std::vector<T>& vals,
+                         const TensorShape& shape) {
+  Scope s = Scope::NewRootScope();
+  Tensor t = ::tensorflow::test::AsTensor<T>(vals, shape);
+  auto const_op = ops::Const(s.WithOpName(name), t);
+  return const_op.node()->def();
+}
+
+template <typename T>
+NodeDef MakeConstNodeDef(const string& name, const std::vector<T>& vals) {
+  TensorShape shape;
+  const std::vector<int32> shape_dims = {static_cast<int32>(vals.size())};
+  TF_EXPECT_OK(TensorShapeUtils::MakeShape(shape_dims, &shape));
+  return MakeConstNodeDef(name, vals, shape);
+}
+
+bool TrtDimsEquals(const nvinfer1::Dims& lhs, const nvinfer1::Dims& rhs) {
+  if (lhs.nbDims != rhs.nbDims) return false;
+  for (int i = 0; i < lhs.nbDims; ++i) {
+    if (lhs.d[i] != rhs.d[i]) return false;
+    // We don't check the types in the tests.
+  }
+  return true;
+}
+
+bool TrtDimsEqualsArray(const std::vector<int>& lhs,
+                        const nvinfer1::Dims& rhs) {
+  return TrtDimsEquals(GetTestDims(lhs), rhs);
+}
+
+// TODO(laigd): define a parameterized matcher that can compare against the
+// vector.
+void ExpectTrtDimsEqualsArray(const std::vector<int>& lhs,
+                              const nvinfer1::Dims& rhs) {
+  EXPECT_TRUE(TrtDimsEqualsArray(lhs, rhs))
+      << "expected: " << DebugString(GetTestDims(lhs)) << "\n"
+      << "  actual: " << DebugString(rhs);
+}
+
+template <typename T>
+void ExpectArrayNear(const std::vector<T>& lhs, const std::vector<T>& rhs) {
+  ASSERT_EQ(lhs.size(), rhs.size());
+  for (int i = 0; i < lhs.size(); i++) {
+    EXPECT_FLOAT_EQ(lhs[i], rhs[i]);
+  }
+}
+
+// Eigen::half cannot implicitly convert to float which is required for
+// EXPECT_FLOAT_EQ.
+template <>
+void ExpectArrayNear(const std::vector<Eigen::half>& lhs,
+                     const std::vector<Eigen::half>& rhs) {
+  ASSERT_EQ(lhs.size(), rhs.size());
+  for (int i = 0; i < lhs.size(); i++) {
+    EXPECT_FLOAT_EQ(Eigen::half_impl::half_to_float(lhs[i]),
+                    Eigen::half_impl::half_to_float(rhs[i]));
+  }
+}
+
+bool TrtShapedWeightsEquals(const TRT_ShapedWeights& lhs,
+                            const TRT_ShapedWeights& rhs) {
+  return TrtDimsEquals(lhs.shape_, rhs.shape_) && lhs.type_ == rhs.type_ &&
+         lhs.GetValues() == rhs.GetValues();
+}
+
+template <typename T>
+void ValidateWeights(const TRT_ShapedWeights& weights,
+                     const std::vector<int>& expected_dims,
+                     const std::vector<T>& expected_value) {
+  ExpectTrtDimsEqualsArray(expected_dims, weights.shape_);
+  ASSERT_EQ(expected_value.size(), weights.count()) << weights.DebugString();
+  const T* actual_values = static_cast<const T*>(weights.GetValues());
+  for (int i = 0; i < expected_value.size(); ++i) {
+    EXPECT_EQ(expected_value[i], actual_values[i]);
+  }
+}
+
+// Fake ITensor implementation for testing purposes.
+class FakeITensor : public nvinfer1::ITensor {
+ public:
+  FakeITensor() : dynamic_range_(0.0f) {}
+
+  FakeITensor(const nvinfer1::Dims& dims) : dims_(dims), dynamic_range_(0.0f) {}
+
+  FakeITensor(const std::vector<int>& dims)
+      : dims_(GetTestDims(dims)), dynamic_range_(0.0f) {}
+
+  void setName(const char* name) override { name_ = name; }
+
+  const char* getName() const override { return name_.c_str(); }
+
+  void setDimensions(nvinfer1::Dims dimensions) override { dims_ = dimensions; }
+
+  nvinfer1::Dims getDimensions() const override { return dims_; }
+
+  void setType(nvinfer1::DataType type) override { type_ = type; }
+
+  nvinfer1::DataType getType() const override { return type_; }
+
+  bool isNetworkInput() const override { return false; }
+
+  bool isNetworkOutput() const override { return false; }
+
+  void setBroadcastAcrossBatch(bool broadcastAcrossBatch) override {}
+
+  bool getBroadcastAcrossBatch() const override { return false; }
+
+  nvinfer1::TensorLocation getLocation() const override { return location_; }
+
+  void setLocation(nvinfer1::TensorLocation location) override {
+    location_ = location;
+  }
+
+#if NV_TENSORRT_MAJOR >= 5
+  bool setDynamicRange(float min, float max) override {
+    dynamic_range_ = std::max(std::abs(min), std::abs(max));
+    return true;
+  }
+
+  float getDynamicRange() const override { return dynamic_range_; }
+#endif
+
+ private:
+  string name_;
+  nvinfer1::Dims dims_;
+  nvinfer1::DataType type_;
+  nvinfer1::TensorLocation location_;
+  float dynamic_range_;
+};
+
+TEST(TRT_ShapedWeights_Test, Basic) {
+  // Test constructor with no arguments.
+  {
+    TRT_ShapedWeights weights;
+    TRT_ShapedWeights copy(weights);
+    for (auto ptr : {&weights, &copy}) {
+      nvinfer1::Weights trt_weights = ptr->GetTrtWeights();
+      EXPECT_EQ(nvinfer1::DataType::kFLOAT, trt_weights.type);
+      EXPECT_EQ(nullptr, trt_weights.values);
+      EXPECT_EQ(0, trt_weights.count);
+
+      EXPECT_EQ(nullptr, ptr->GetValues());
+      EXPECT_EQ(0, ptr->count());
+      EXPECT_EQ(0, ptr->size_bytes());
+    }
+  }
+  // Test constructor with DataType argument.
+  {
+    TRT_ShapedWeights weights(DT_FLOAT);
+    TRT_ShapedWeights copy(weights);
+    for (auto ptr : {&weights, &copy}) {
+      nvinfer1::Weights trt_weights = ptr->GetTrtWeights();
+      EXPECT_EQ(nvinfer1::DataType::kFLOAT, trt_weights.type);
+      EXPECT_EQ(nullptr, trt_weights.values);
+      EXPECT_EQ(0, trt_weights.count);
+
+      EXPECT_EQ(nullptr, ptr->GetValues());
+      EXPECT_EQ(0, ptr->count());
+      EXPECT_EQ(0, ptr->size_bytes());
+    }
+  }
+  // Test constructor with DataType and nvinfer1::Dims arguments.
+  {
+    TrtWeightStore store;
+    TRT_ShapedWeights weights =
+        store.GetTempWeights(DT_FLOAT, GetTestDims({2, 5}));
+    TRT_ShapedWeights copy(weights);
+    for (auto ptr : {&weights, &copy}) {
+      nvinfer1::Weights trt_weights = ptr->GetTrtWeights();
+      EXPECT_EQ(nvinfer1::DataType::kFLOAT, trt_weights.type);
+      EXPECT_NE(nullptr, trt_weights.values);
+      EXPECT_EQ(10, trt_weights.count);
+
+      EXPECT_EQ(trt_weights.values, ptr->GetValues());
+      EXPECT_EQ(10, ptr->count());
+      EXPECT_EQ(40, ptr->size_bytes());
+    }
+    // Test that it doesn't copy the underlying buffer.
+    EXPECT_EQ(weights.GetValues(), copy.GetValues());
+  }
+}
+
+TEST(TRT_TensorOrWeights_Test, Basic) {
+  // Test constructor with no arguments.
+  {
+    TRT_TensorOrWeights tw;
+    TRT_TensorOrWeights copy(tw);
+    TRT_TensorOrWeights assigned;
+    assigned = tw;
+    for (auto ptr : {&tw, &copy, &assigned}) {
+      EXPECT_EQ(false, ptr->is_tensor());
+      EXPECT_EQ(false, ptr->is_weights());
+      EXPECT_EQ(-1, ptr->batch_size());
+    }
+  }
+
+  // Test constructor with ITensor and batch size argument.
+  {
+    nvinfer1::Dims dims;
+    dims.nbDims = 1;
+    dims.d[0] = 1;
+    FakeITensor itensor(dims);
+    TRT_TensorOrWeights tw(&itensor);
+    TRT_TensorOrWeights tw1(&itensor, /*batch_size=*/1);
+
+    for (auto original_ptr : {&tw, &tw1}) {
+      TRT_TensorOrWeights copy(*original_ptr);
+      TRT_TensorOrWeights assigned;
+      assigned = *original_ptr;
+
+      for (auto ptr : {original_ptr, &copy, &assigned}) {
+        EXPECT_EQ(true, ptr->is_tensor());
+        EXPECT_EQ(false, ptr->is_weights());
+        if (original_ptr == &tw) {
+          EXPECT_EQ(-1, ptr->batch_size());
+        } else {
+          EXPECT_EQ(1, ptr->batch_size());
+        }
+        EXPECT_EQ(&itensor, ptr->tensor());
+        ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims());
+      }
+    }
+  }
+  // Test constructor which creates and owns an ITensor.
+  {
+    nvinfer1::Dims dims;
+    dims.nbDims = 1;
+    dims.d[0] = 1;
+    TRT_TensorOrWeights tw(nvinfer1::DataType::kFLOAT, dims, /*batch_size=*/1);
+    TRT_TensorOrWeights copy(tw);
+    TRT_TensorOrWeights assigned;
+    assigned = tw;
+
+    for (auto ptr : {&tw, &copy, &assigned}) {
+      EXPECT_EQ(true, ptr->is_tensor());
+      EXPECT_EQ(false, ptr->is_weights());
+      EXPECT_EQ(1, ptr->batch_size());
+      EXPECT_NE(nullptr, ptr->tensor());
+      ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims());
+    }
+  }
+  // Test constructor with TRT_ShapedWeights argument.
+  {
+    TRT_ShapedWeights weights;
+    TRT_TensorOrWeights tw(weights);
+    TRT_TensorOrWeights copy(tw);
+    TRT_TensorOrWeights assigned;
+    assigned = tw;
+    for (auto ptr : {&tw, &copy, &assigned}) {
+      EXPECT_EQ(false, ptr->is_tensor());
+      EXPECT_EQ(true, ptr->is_weights());
+      EXPECT_TRUE(TrtShapedWeightsEquals(weights, ptr->weights()));
+
+      nvinfer1::Dims dims;
+      dims.nbDims = 0;
+      ExpectTrtDimsEqualsArray({}, ptr->GetTrtDims());
+    }
+  }
+}
+
+class ValidatorTest : public ::testing::Test {
+ public:
+  void AddOpValidator(const string& op_name, OpConverter op_validator) {
+    validator_.op_validators_[op_name] = op_validator;
+  }
+
+  Status ConvertToTensorOrWeights(
+      const NodeDef& node_def, int output_port,
+      const grappler::GraphProperties& graph_properties,
+      TRT_TensorOrWeights* tensor_or_weights) {
+    return validator_.ConvertToTensorOrWeights(
+        node_def, output_port, graph_properties, tensor_or_weights);
+  }
+
+ protected:
+  TrtNodeValidator validator_;
+};
+
+TEST_F(ValidatorTest, ConvertToTensorOrWeights) {
+  // Convert Const.
+  {
+    NodeDef node_def = MakeConstNodeDef<float>("my_const", {1.0f, 2.0f});
+    TRT_TensorOrWeights output;
+    grappler::GrapplerItem item;
+    grappler::GraphProperties graph_properties(item);
+    ExpectStatus(ConvertToTensorOrWeights(node_def, /*output_port=*/0,
+                                          graph_properties, &output));
+    ValidateWeights<float>(output.weights(), {2}, {1.0, 2.0});
+  }
+
+  // Helper method to run ConvertToTensorOrWeights() with predefined parameters.
+  auto convert_to_tensor_or_weights = [this](const std::vector<int64>& dims,
+                                             TRT_TensorOrWeights* output) {
+    Scope s = Scope::NewRootScope();
+    const auto attrs = ops::Placeholder::Shape(PartialTensorShape{dims});
+    auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT, attrs);
+    auto add = ops::Add(s.WithOpName("add"), feed, feed);
+
+    grappler::GrapplerItem item;
+    TF_EXPECT_OK(s.ToGraphDef(&item.graph));
+    grappler::GraphProperties graph_properties(item);
+    TF_EXPECT_OK(graph_properties.InferStatically(true));
+    const NodeDef& node_def = add.operation.node()->def();
+    return this->ConvertToTensorOrWeights(node_def, /*output_port=*/0,
+                                          graph_properties, output);
+  };
+  // Convert non-Const with #dims > nvinfer1::Dims::MAX_DIMS+1.
+  {
+    TRT_TensorOrWeights output;
+    ExpectStatus(
+        convert_to_tensor_or_weights(
+            std::vector<int64>(nvinfer1::Dims::MAX_DIMS + 2, 1), &output),
+        error::OUT_OF_RANGE, "Input tensor rank is greater than 9");
+  }
+  // Convert non-Const with #dims < 2.
+  {
+    TRT_TensorOrWeights output;
+    ExpectStatus(
+        convert_to_tensor_or_weights({1}, &output), error::INVALID_ARGUMENT,
+        "Input tensor with rank<2 is not supported since the first dimension "
+        "is treated as batch dimension by TRT");
+  }
+  // Convert non-Const. We test the case where the non-batch dimemsion is
+  // unknown as well, to make sure the validator allows that.
+  for (const int32 non_batch_dim : {-1, 2}) {
+    const int32 batch_size = 12;
+    TRT_TensorOrWeights output;
+    ExpectStatus(
+        convert_to_tensor_or_weights({batch_size, non_batch_dim}, &output));
+    EXPECT_EQ(true, output.is_tensor());
+    EXPECT_EQ(batch_size, output.batch_size());
+    EXPECT_NE(nullptr, output.tensor());
+    ExpectTrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims());
+  }
+}
+
+TEST_F(ValidatorTest, ValidateNode) {
+  grappler::GrapplerItem item;
+  grappler::GraphProperties graph_properties(item);
+
+  bool start_conversion = false;
+  bool should_fail = false;
+  auto op_converter = [&start_conversion,
+                       &should_fail](OpConverterParams* params) -> Status {
+    if (should_fail) return errors::InvalidArgument("");
+    if (!params->validation_only) start_conversion = true;
+    return Status::OK();
+  };
+  NodeDef node_def = MakeNodeDef("my_op", "MyOp", {});
+
+  // Validator not registered, validation should pass.
+  TF_EXPECT_OK(validator_.ValidateNode(node_def, {}, graph_properties));
+
+  // Register validator.
+  AddOpValidator("MyOp", op_converter);
+  TF_EXPECT_OK(validator_.ValidateNode(node_def, {}, graph_properties));
+  EXPECT_EQ(false, start_conversion);
+
+  // Let the converter return error.
+  should_fail = true;
+  ExpectStatus(validator_.ValidateNode(node_def, {}, graph_properties),
+               error::INVALID_ARGUMENT);
+}
+
+class ConverterTest : public ::testing::Test {
+ public:
+  ConverterTest() {
+    builder_.reset(nvinfer1::createInferBuilder(logger_));
+    network_.reset(builder_->createNetwork());
+    converter_.reset(new Converter(network_.get(),
+                                   /*precision_mode=*/FP32MODE,
+                                   /*use_calibration=*/false));
+    weight_store_ = &converter_->weight_store_;
+  }
+
+  void AddOpConverter(const string& op_name, OpConverter op_converter) {
+    converter_->op_registry_[op_name] = op_converter;
+  }
+
+  // Below we expose private methods of Converter for testing.
+
+  Status MaybeUpdateBatchSize(int batch_size) {
+    return converter_->MaybeUpdateBatchSize(batch_size);
+  }
+
+  Status AddTensorOrWeights(const string& name, TRT_TensorOrWeights input) {
+    return converter_->AddTensorOrWeights(name, input);
+  }
+
+  Status GetTensorOrWeights(const string& name, TRT_TensorOrWeights* output) {
+    return converter_->GetTensorOrWeights(name, output);
+  }
+
+  Status GetInputs(const NodeDef& node_def,
+                   std::vector<TRT_TensorOrWeights>* inputs) const {
+    return converter_->GetInputs(node_def, inputs);
+  }
+
+  Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min,
+                        float* out_max) const {
+    return converter_->GetWeightRange(weights, out_min, out_max);
+  }
+
+  void PropagateQuantizationRanges() {
+    converter_->PropagateQuantizationRanges();
+  }
+
+  int batch_size() const { return converter_->batch_size_; }
+
+  std::unordered_map<nvinfer1::ITensor*, float>& quantization_ranges() {
+    return converter_->quantization_ranges_;
+  }
+
+ private:
+  Logger logger_;
+  // These members are ordered in a way such that the destruction order is:
+  // converter_ -> network_ -> builder_
+  TrtUniquePtrType<nvinfer1::IBuilder> builder_;
+  TrtUniquePtrType<nvinfer1::INetworkDefinition> network_;
+
+ protected:
+  std::unique_ptr<Converter> converter_;
+  TrtWeightStore* weight_store_;
+};
+
+TEST_F(ConverterTest, ConvertNode) {
+  FakeITensor output_tensors[2];
+  auto op_converter = [&output_tensors](OpConverterParams* params) -> Status {
+    nvinfer1::Dims dims = params->inputs[0].tensor()->getDimensions();
+    for (int i = 0; i < 2; ++i) {
+      dims.d[0] += 1;
+      output_tensors[i].setDimensions(dims);
+      params->outputs->push_back(TRT_TensorOrWeights(&output_tensors[i]));
+    }
+    return Status::OK();
+  };
+  NodeDef node_def = MakeNodeDef("my_op", "MyOp", {"my_input"});
+  TF_EXPECT_OK(converter_->AddInputTensor(
+      "my_input", nvinfer1::DataType::kFLOAT, GetTestDims({123}), 1));
+
+  // Converter not registered.
+  ExpectStatus(converter_->ConvertNode(node_def), error::UNIMPLEMENTED,
+               "No converter registered for op: MyOp");
+
+  // Register the converter and retry.
+  AddOpConverter("MyOp", op_converter);
+  TF_EXPECT_OK(converter_->ConvertNode(node_def));
+
+  TRT_TensorOrWeights actual_output_1;
+  TF_EXPECT_OK(GetTensorOrWeights("my_op", &actual_output_1));
+  EXPECT_EQ(&output_tensors[0], actual_output_1.tensor());
+  EXPECT_EQ(124, actual_output_1.tensor()->getDimensions().d[0]);
+
+  TRT_TensorOrWeights actual_output_2;
+  TF_EXPECT_OK(GetTensorOrWeights("my_op:1", &actual_output_2));
+  EXPECT_EQ(&output_tensors[1], actual_output_2.tensor());
+  EXPECT_EQ(125, actual_output_2.tensor()->getDimensions().d[0]);
+}
+
+TEST_F(ConverterTest, AddAndGetInputs) {
+  NodeDef node_def;
+  node_def.add_input("^control_input");
+  node_def.add_input("input");
+  node_def.add_input("input:0");
+  node_def.add_input("input:1");
+  node_def.add_input("weird_input:2:3:4:0");
+
+  TF_EXPECT_OK(converter_->AddInputTensor("input", nvinfer1::DataType::kFLOAT,
+                                          GetTestDims({1}), 1));
+  TF_EXPECT_OK(converter_->AddInputTensor("input:1", nvinfer1::DataType::kINT32,
+                                          GetTestDims({2, 3}), 1));
+  TF_EXPECT_OK(converter_->AddInputTensor(
+      "weird_input:2:3:4", nvinfer1::DataType::kHALF, GetTestDims({5, 3}), 1));
+
+  std::vector<TRT_TensorOrWeights> inputs;
+  TF_EXPECT_OK(GetInputs(node_def, &inputs));
+
+  EXPECT_EQ(4, inputs.size());
+  EXPECT_EQ(inputs[0].tensor(), inputs[1].tensor());
+
+  EXPECT_EQ(nvinfer1::DataType::kFLOAT, inputs[0].tensor()->getType());
+  EXPECT_EQ(nvinfer1::DataType::kINT32, inputs[2].tensor()->getType());
+  EXPECT_EQ(nvinfer1::DataType::kHALF, inputs[3].tensor()->getType());
+  ExpectTrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions());
+  ExpectTrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions());
+  ExpectTrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions());
+}
+
+TEST_F(ConverterTest, RenameAndMarkOutputTensors) {
+  // Test that the tensor are actually named and marked as output after
+  // Converter::RenameAndMarkOutputTensors() is called.
+
+  // Register a custom converter which shuffles the input. We use it to build a
+  // TRT network whose output will be later marked.
+  std::vector<nvinfer1::ITensor*> output_tensors;
+  auto op_converter = [&output_tensors](OpConverterParams* params) -> Status {
+    nvinfer1::Permutation perm;
+    perm.order[0] = 1;
+    perm.order[1] = 0;
+    for (int i = 0; i < 2; ++i) {
+      nvinfer1::ITensor* input_tensor =
+          const_cast<nvinfer1::ITensor*>(params->inputs[0].tensor());
+      nvinfer1::IShuffleLayer* layer =
+          params->converter->network()->addShuffle(*input_tensor);
+      layer->setFirstTranspose(perm);
+      nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+      params->outputs->emplace_back(output_tensor);
+      output_tensors.push_back(output_tensor);
+    }
+    TRT_ShapedWeights output_weights(DT_FLOAT);
+    params->outputs->emplace_back(output_weights);
+    return Status::OK();
+  };
+  AddOpConverter("MyOp", op_converter);
+
+  // Run the conversion.
+  NodeDef node_def = MakeNodeDef("my_op", "MyOp", {"my_input"});
+  TF_EXPECT_OK(converter_->AddInputTensor(
+      "my_input", nvinfer1::DataType::kFLOAT, GetTestDims({1, 2}), 1));
+  TF_EXPECT_OK(converter_->ConvertNode(node_def));
+
+  // Mark a weight as output, should fail.
+  ExpectStatus(
+      converter_->RenameAndMarkOutputTensors({{"my_op:2", "my_output"}}),
+      error::INVALID_ARGUMENT, "Output my_op:2 is weights not tensor");
+
+  // Mark tensors as output, should pass.
+  TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(
+      {{"my_op", "my_output"}, {"my_op:1", "my_output_1"}}));
+  EXPECT_EQ(2, output_tensors.size());
+  for (auto output_tensor : output_tensors) {
+    ExpectTrtDimsEqualsArray({2, 1}, output_tensor->getDimensions());
+  }
+  EXPECT_EQ("my_output", string(output_tensors[0]->getName()));
+  EXPECT_EQ("my_output_1", string(output_tensors[1]->getName()));
+}
+
+TEST_F(ConverterTest, TransposeTensor) {
+  nvinfer1::ITensor* input_tensor = converter_->network()->addInput(
+      "", nvinfer1::DataType::kFLOAT, GetTestDims({2, 3, 5}));
+  const nvinfer1::ITensor* output_tensor = nullptr;
+
+  // Rank doesn't match.
+  ExpectStatus(
+      converter_->TransposeTensor(input_tensor, {0, 1}, &output_tensor),
+      error::INVALID_ARGUMENT,
+      "Rank of perm for transpose does not match with that of the input");
+
+  // Transpose at batch dimension.
+  ExpectStatus(
+      converter_->TransposeTensor(input_tensor, {1, 0, 2, 3}, &output_tensor),
+      error::UNIMPLEMENTED, "Transpose at batch dimension is not supported.");
+
+  // OK.
+  TF_EXPECT_OK(
+      converter_->TransposeTensor(input_tensor, {0, 3, 1, 2}, &output_tensor));
+  ExpectTrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions());
+}
+
+TEST_F(ConverterTest, PrepareTensorForShape_Tensor) {
+  nvinfer1::ITensor* input_tensor = converter_->network()->addInput(
+      "", nvinfer1::DataType::kFLOAT, GetTestDims({2, 3, 5}));
+  TRT_TensorOrWeights tw(input_tensor);
+  const nvinfer1::ITensor* output_tensor = nullptr;
+
+  // Shape size doesn't match.
+  ExpectStatus(converter_->PrepareTensorForShape(tw, GetTestDims({2, 3, 6}),
+                                                 &output_tensor),
+               error::INVALID_ARGUMENT, "Reshape shapes are not compatible");
+
+  // TODO(aaroey): we should check the case where uninferred dimensions are not
+  // an exact divisor of input dim ensions, e.g. for dims {-1, 7}.
+
+  // Infer shape, ok.
+  TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({-1, 2}),
+                                                 &output_tensor));
+  ExpectTrtDimsEqualsArray({15, 2}, output_tensor->getDimensions());
+
+  // Regular shape.
+  TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}),
+                                                 &output_tensor));
+  ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
+}
+
+TEST_F(ConverterTest, PrepareTensorForShape_Weights) {
+  TRT_ShapedWeights weights =
+      weight_store_->GetTempWeights(DT_FLOAT, GetTestDims({2, 3, 5}));
+  TRT_TensorOrWeights tw(weights);
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}),
+                                                 &output_tensor));
+  ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
+}
+
+TEST_F(ConverterTest, MaybeUpdateBatchSize) {
+  EXPECT_EQ(-1, batch_size());
+
+  TF_EXPECT_OK(MaybeUpdateBatchSize(-1));
+  EXPECT_EQ(-1, batch_size());
+
+  TF_EXPECT_OK(MaybeUpdateBatchSize(123));
+  EXPECT_EQ(123, batch_size());
+
+  TF_EXPECT_OK(MaybeUpdateBatchSize(123));
+  EXPECT_EQ(123, batch_size());
+
+  TF_EXPECT_OK(MaybeUpdateBatchSize(-1));
+  EXPECT_EQ(123, batch_size());
+
+  ExpectStatus(MaybeUpdateBatchSize(124), error::INVALID_ARGUMENT,
+               "Provided batch size does not match converter batch size");
+}
+
+TEST_F(ConverterTest, AddAndGetTensorOrWeights) {
+  // Add a tensor.
+  FakeITensor fake_tensor;
+  TRT_TensorOrWeights tensor(&fake_tensor);
+  EXPECT_EQ(-1, tensor.batch_size());
+  TF_EXPECT_OK(MaybeUpdateBatchSize(123));
+  TF_EXPECT_OK(AddTensorOrWeights("my_tensor", tensor));
+
+  // Get the added tensor.
+  TRT_TensorOrWeights added_tensor;
+  TF_EXPECT_OK(GetTensorOrWeights("my_tensor", &added_tensor));
+  EXPECT_EQ(123, added_tensor.batch_size());
+
+  // Add the same tensor again.
+  ExpectStatus(AddTensorOrWeights("my_tensor", tensor), error::ALREADY_EXISTS,
+               "tensor/weights my_tensor already exist");
+}
+
+template <typename T>
+void TestGetWeightRange(ConverterTest* test, TrtWeightStore* weight_store) {
+  TRT_ShapedWeights weights =
+      weight_store->GetTempWeights(DataTypeToEnum<T>::v(), GetTestDims({2, 3}));
+  const std::vector<T> values = {T(3), T(1), T(2), T(6), T(5), T(4)};
+  memcpy(const_cast<void*>(weights.GetValues()), values.data(),
+         weights.size_bytes());
+
+  float out_min = 0.0f;
+  float out_max = 0.0f;
+  TF_EXPECT_OK(test->GetWeightRange(weights, &out_min, &out_max));
+  EXPECT_EQ(1.0f, out_min);
+  EXPECT_EQ(6.0f, out_max);
+}
+
+TEST_F(ConverterTest, GetWeightRange) {
+  TestGetWeightRange<float>(this, weight_store_);
+  TestGetWeightRange<Eigen::half>(this, weight_store_);
+  TestGetWeightRange<int32>(this, weight_store_);
+}
+
+TEST_F(ConverterTest, ProvideQuantizationRange) {
+  FakeITensor fake_tensor;
+  // Assymetric range
+  converter_->ProvideQuantizationRange(&fake_tensor, 0.0f, 6.0f);
+  EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]);
+  converter_->ProvideQuantizationRange(&fake_tensor, 1.0f, 6.0f);
+  EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]);
+  converter_->ProvideQuantizationRange(&fake_tensor, -8.0f, 6.0f);
+  EXPECT_EQ(8.0f, quantization_ranges()[&fake_tensor]);
+  converter_->ProvideQuantizationRange(&fake_tensor, -8.123f, -6.123f);
+  EXPECT_EQ(8.123f, quantization_ranges()[&fake_tensor]);
+  // Symmetric range
+  converter_->ProvideQuantizationRange(&fake_tensor, -6.123f, 6.123f);
+  EXPECT_EQ(6.123f, quantization_ranges()[&fake_tensor]);
+}
+
+TEST_F(ConverterTest, MaybeApplyQuantizationRanges) {
+  // input -> infer1 -> infer2 -> infer3
+  FakeITensor input, infer_1, infer_2, infer_3;
+  FakeITensor not_infer;
+  Converter int8_converter(/*trt_network=*/nullptr, INT8MODE,
+                           /*use_calibration=*/true);
+  int8_converter.ProvideQuantizationRange(&input, -5.0f, 5.0f);
+  int8_converter.ProvideQuantizationRange(&not_infer, -100.0f, 100.0f);
+  int8_converter.MarkQuantizationRangesAsInferrable(&input, &infer_1);
+  int8_converter.MarkQuantizationRangesAsInferrable(&infer_1, &infer_2);
+  int8_converter.MarkQuantizationRangesAsInferrable(&infer_2, &infer_3);
+
+  // Input range should be inferred along the chain and applied to tensors.
+  int8_converter.MaybeApplyQuantizationRanges();
+#if NV_TENSORRT_MAJOR >= 5
+  EXPECT_EQ(input.getDynamicRange(), 5.0f);
+  EXPECT_EQ(infer_1.getDynamicRange(), 5.0f);
+  EXPECT_EQ(infer_2.getDynamicRange(), 5.0f);
+  EXPECT_EQ(infer_3.getDynamicRange(), 5.0f);
+  EXPECT_EQ(not_infer.getDynamicRange(), 100.0f);
+#endif
+}
+
+TEST_F(ConverterTest, PropagateQuantizationRanges) {
+  // infer0 <-> infer1 <-> infer2 <-> infer3
+  //              |
+  //            infer4 <-> infer5
+  FakeITensor infer[6];
+  FakeITensor not_infer;
+  converter_->ProvideQuantizationRange(&infer[4], -5.0f, 5.0f);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[0], &infer[1]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[1], &infer[2]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[3], &infer[2]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[1]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[5]);
+
+  // Input range should be inferred along the chain.
+  PropagateQuantizationRanges();
+  auto ranges = quantization_ranges();
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(5.0f, ranges[&infer[i]]);
+  }
+  EXPECT_EQ(ranges.count(&not_infer), 0);
+}
+
+TEST_F(ConverterTest, GetTrtBroadcastShape) {
+  const bool kIsTensor = true;
+  const bool kIsNotTensor = false;
+  auto symmetric_test = [this](const std::vector<int>& operand_1_shape,
+                               const std::vector<int>& operand_2_shape,
+                               const bool operand_1_is_tensor,
+                               const bool operand_2_is_tensor,
+                               const std::vector<int>& expected_operand_1_shape,
+                               const std::vector<int>& expected_operand_2_shape,
+                               error::Code expected_code = error::OK,
+                               const char* expected_error_msg_substr = nullptr,
+                               const int operand_1_batch_size = -1,
+                               const int operand_2_batch_size = -1) {
+    auto create_tensor_or_weights = [](const std::vector<int>& shape,
+                                       bool is_tensor, int batch_size = -1) {
+      if (is_tensor) {
+        return TRT_TensorOrWeights{nvinfer1::DataType::kFLOAT,
+                                   GetTestDims(shape), batch_size};
+      }
+      TRT_ShapedWeights weights;
+      weights.shape_ = GetTestDims(shape);
+      return TRT_TensorOrWeights(weights);
+    };
+
+    nvinfer1::Dims operand_1_new_dims, operand_2_new_dims;
+    TRT_TensorOrWeights operand_1 = create_tensor_or_weights(
+        operand_1_shape, operand_1_is_tensor, operand_1_batch_size);
+    TRT_TensorOrWeights operand_2 = create_tensor_or_weights(
+        operand_2_shape, operand_2_is_tensor, operand_2_batch_size);
+
+    // operand_1 broadcast operand_2
+    ExpectStatus(
+        this->converter_->GetTrtBroadcastShape(
+            operand_1, operand_2, &operand_1_new_dims, &operand_2_new_dims),
+        expected_code, expected_error_msg_substr);
+    if (expected_code == error::OK) {
+      ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims);
+      ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims);
+    }
+    // operand_2 broadcast operand_1
+    ExpectStatus(
+        this->converter_->GetTrtBroadcastShape(
+            operand_2, operand_1, &operand_2_new_dims, &operand_1_new_dims),
+        expected_code, expected_error_msg_substr);
+    if (expected_code == error::OK) {
+      ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims);
+      ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims);
+    }
+  };
+
+  // Both inputs are weights.
+  symmetric_test(
+      {1}, {1}, kIsNotTensor, kIsNotTensor, {}, {}, error::INVALID_ARGUMENT,
+      "Broadcasting requires at least one of the operands be tensors");
+
+  // One tensor and one weights.
+  symmetric_test({1, 1, 1}, {2}, kIsTensor, kIsNotTensor, {1, 1, 1}, {1, 1, 2});
+  symmetric_test({1, 1, 2}, {2}, kIsTensor, kIsNotTensor, {1, 1, 2}, {1, 1, 2});
+  symmetric_test({1, 3, 2}, {1}, kIsTensor, kIsNotTensor, {1, 3, 2}, {1, 1, 1});
+  symmetric_test({1, 1, 1}, {2, 3}, kIsTensor, kIsNotTensor, {1, 1, 1},
+                 {1, 2, 3});
+  symmetric_test({1, 1, 1}, {2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1},
+                 {2, 3, 4});
+  symmetric_test({1, 1, 1}, {1, 2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1},
+                 {2, 3, 4});
+  symmetric_test({1, 3, 4}, {1, 2, 1, 4}, kIsTensor, kIsNotTensor, {1, 3, 4},
+                 {2, 1, 4});
+  symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {},
+                 error::INVALID_ARGUMENT, "Infeasible broadcast scheme");
+  symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {},
+                 error::INVALID_ARGUMENT, "Infeasible broadcast scheme",
+                 /*operand_1_batch_size=*/2);
+  symmetric_test({1, 1, 1}, {1, 1, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {},
+                 error::INVALID_ARGUMENT,
+                 "Broadcasting beyond batch dimension is not supported "
+                 "(tensor #dims 4 vs broadcast #dims 5)");
+
+  // Both inputs are tensors.
+  symmetric_test({1, 1, 1}, {1, 1}, kIsTensor, kIsTensor, {}, {},
+                 error::INVALID_ARGUMENT,
+                 "Broadcasting beyond batch dimension is not supported "
+                 "(tensor #dims 3 vs broadcast #dims 4)");
+  symmetric_test({1, 3, 4}, {2, 1, 4}, kIsTensor, kIsTensor, {1, 3, 4},
+                 {2, 1, 4});
+  symmetric_test({1, 1, 1}, {1, 1, 1, 1}, kIsTensor, kIsTensor, {}, {},
+                 error::INVALID_ARGUMENT,
+                 "Broadcasting beyond batch dimension is not supported "
+                 "(tensor #dims 4 vs broadcast #dims 5)");
+}
+
+// Class to test various op converters, using both a TrtNodeValidator and
+// Converter.
+class OpConverterTest : public ::testing::Test {
+ public:
+  OpConverterTest() : scope_(Scope::NewRootScope()) {
+    QCHECK_EQ(0, cudaStreamCreate(&stream_));
+    Reset();
+  }
+
+  ~OpConverterTest() override { QCHECK_EQ(0, cudaStreamDestroy(stream_)); }
+
+  Status GetTensorOrWeights(const string& name, TRT_TensorOrWeights* output) {
+    return converter_->GetTensorOrWeights(name, output);
+  }
+
+  void Reset() {
+    validator_.reset(nullptr);
+    converter_.reset(nullptr);
+
+    // Reset the INetworkDefinition.
+    engine_.reset(nullptr);
+    network_.reset(nullptr);
+    builder_.reset(nvinfer1::createInferBuilder(logger_));
+    network_.reset(builder_->createNetwork());
+    builder_->setMaxBatchSize(1);
+
+    // Reset the validator and converter.
+    validator_.reset(new TrtNodeValidator);
+    converter_.reset(new Converter(network_.get(),
+                                   /*precision_mode=*/FP32MODE,
+                                   /*use_calibration=*/false));
+
+    // Reset other related artifacts.
+    scope_ = Scope::NewRootScope();
+    validator_inputs_.clear();
+  }
+
+  // TODO(laigd): test fp16 and int8 support.
+  template <typename T>
+  void BuildAndRun(
+      const std::vector<std::pair<const char*, const std::vector<T>>>&
+          input_data,
+      const char* output_name, std::vector<T>* output_data) {
+    // Mark the output tensor as TRT engine output.
+    TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(
+        {{string(output_name), string(output_name)}}));
+
+    // Build the TRT engine.
+    ASSERT_EQ(nullptr, engine_.get());
+    engine_.reset(builder_->buildCudaEngine(*converter_->network()));
+    CHECK_NOTNULL(engine_.get());
+
+    // Execute the TRT engine.
+    ASSERT_LE(input_data.size() + 1, 3);
+    void* buffers[3];
+    for (const auto name_and_data : input_data) {
+      const int input_size = name_and_data.second.size() * sizeof(T);
+      const int input_index = engine_->getBindingIndex(name_and_data.first);
+      ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size));
+      ASSERT_EQ(
+          0, cudaMemcpyAsync(buffers[input_index], name_and_data.second.data(),
+                             input_size, cudaMemcpyHostToDevice, stream_));
+    }
+
+    const int output_size = output_data->size() * sizeof(T);
+    const int output_index = engine_->getBindingIndex(output_name);
+    ASSERT_EQ(0, cudaMalloc(&buffers[output_index], output_size));
+
+    ASSERT_EQ(engine_->getNbBindings(), input_data.size() + 1);
+
+    TrtUniquePtrType<nvinfer1::IExecutionContext> execution_context(
+        engine_->createExecutionContext());
+    execution_context->enqueue(/*batchSize=*/1, buffers, stream_, nullptr);
+    ASSERT_EQ(0, cudaMemcpyAsync(output_data->data(), buffers[output_index],
+                                 output_size, cudaMemcpyDeviceToHost, stream_));
+    cudaStreamSynchronize(stream_);
+
+    for (int i = 0; i < input_data.size() + 1; ++i) {
+      ASSERT_EQ(0, cudaFree(buffers[i]));
+    }
+  }
+
+  bool HasStaticShape(const nvinfer1::Dims& dims) const {
+    if (dims.nbDims < 0) return false;
+    for (int i = 0; i < dims.nbDims; ++i) {
+      if (dims.d[i] < 0) return false;
+    }
+    return true;
+  }
+
+  // Add ITensor for both validation and conversion.
+  void AddTestTensor(
+      const char* name, const std::vector<int32>& dims, int batch_size = 1,
+      nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) {
+    DataType tf_dtype = TrtDataTypeToTf(trt_dtype);
+    ops::Placeholder::Attrs attrs;
+    TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_));
+    attrs.shape_.InsertDim(0, batch_size);
+    auto input = ops::Placeholder(scope_.WithOpName(name), tf_dtype, attrs);
+    validator_inputs_[name] = input.operation.node()->def();
+
+    // Add a real ITensor for conversion conditionally.
+    const nvinfer1::Dims trt_dims = GetTestDims(dims);
+    if (HasStaticShape(trt_dims)) {
+      TF_EXPECT_OK(
+          converter_->AddInputTensor(name, trt_dtype, trt_dims, batch_size));
+      ASSERT_EQ(batch_size, converter_->batch_size_);
+    }
+  }
+
+  // Add weights for both validation and conversion.
+  template <typename T>
+  void AddTestWeights(const char* name, const std::vector<int>& dims,
+                      const std::vector<T>& values) {
+    const DataType dtype = DataTypeToEnum<T>::v();
+    const nvinfer1::Dims trt_dims = GetTestDims(dims);
+    const int64_t num_elements = TrtDimsNumElements(trt_dims);
+    QCHECK_EQ(num_elements, values.size())
+        << num_elements << " vs " << values.size();
+    TRT_ShapedWeights weights(dtype);
+    if (num_elements) {
+      weights = converter_->weight_store_.GetTempWeights(dtype, trt_dims);
+      QCHECK_EQ(weights.size_bytes(), sizeof(T) * values.size())
+          << weights.size_bytes() << " vs " << sizeof(T) * values.size();
+      memcpy(const_cast<void*>(weights.GetValues()), values.data(),
+             weights.size_bytes());
+    }
+    // Add weights for validation.
+    TensorShape shape;
+    TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &shape));
+    validator_inputs_[name] = MakeConstNodeDef<T>(name, values, shape);
+    // Add weights for conversion.
+    TF_EXPECT_OK(
+        converter_->AddTensorOrWeights(name, TRT_TensorOrWeights{weights}));
+  }
+
+  // Test validation in validation-only mode.
+  void RunValidation(const NodeDef& node_def,
+                     error::Code expected_code = error::OK,
+                     const char* expected_msg_substr = nullptr) {
+    std::vector<std::pair<const NodeDef*, int>> input_node_and_ports;
+    for (const string& input : node_def.input()) {
+      input_node_and_ports.emplace_back(&validator_inputs_[input], 0);
+    }
+    grappler::GrapplerItem item;
+    TF_EXPECT_OK(scope_.ToGraphDef(&item.graph));
+    grappler::GraphProperties graph_properties(item);
+    TF_EXPECT_OK(graph_properties.InferStatically(true));
+
+    ExpectStatus(validator_->ValidateNode(node_def, input_node_and_ports,
+                                          graph_properties),
+                 expected_code, expected_msg_substr);
+  }
+
+  void RunConversion(const NodeDef& node_def,
+                     error::Code expected_code = error::OK,
+                     const char* expected_msg_substr = nullptr) {
+    ExpectStatus(converter_->ConvertNode(node_def), expected_code,
+                 expected_msg_substr);
+  }
+
+  // Helper method to run both validation and conversion, when the expected
+  // output are same.
+  void RunValidationAndConversion(const NodeDef& node_def,
+                                  error::Code expected_code = error::OK,
+                                  const char* expected_msg_substr = nullptr,
+                                  bool should_run_conversion = true) {
+    RunValidation(node_def, expected_code, expected_msg_substr);
+    if (should_run_conversion) {
+      RunConversion(node_def, expected_code, expected_msg_substr);
+    }
+  }
+
+  // Expose quantization_ranges_ for tests
+  std::unordered_map<nvinfer1::ITensor*, float>& quantization_ranges() {
+    return converter_->quantization_ranges_;
+  }
+
+  std::unique_ptr<Converter> converter_;
+  std::unique_ptr<TrtNodeValidator> validator_;
+
+ private:
+  Logger logger_;
+  TrtUniquePtrType<nvinfer1::IBuilder> builder_;
+  TrtUniquePtrType<nvinfer1::INetworkDefinition> network_;
+  TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
+  cudaStream_t stream_;
+  // Used to create placeholders with shape and data type information. The
+  // created placeholders will be used as inputs to the node to be verified,
+  // thus we need the shape and data type information to get a non-empty
+  // GraphProperties.
+  // TODO(laigd): consider use this Scope to create the NodeDef to verify.
+  Scope scope_;
+  std::unordered_map<string, NodeDef> validator_inputs_;
+};
+
+template <DataType dtype, typename InputCType, typename OutputCType>
+void TestConvertConst(OpConverterTest* test) {
+  NodeDef node_def;
+  node_def.set_name("my_const");
+  node_def.set_op("Const");
+
+  auto reset_and_test = [&node_def, test](
+                            const Tensor& tensor, const bool as_tensor_content,
+                            const std::vector<int>& expected_dims,
+                            const std::vector<OutputCType>& expected_value) {
+    test->Reset();
+
+    auto& attr = *node_def.mutable_attr();
+    if (as_tensor_content) {
+      tensor.AsProtoTensorContent(attr["value"].mutable_tensor());
+    } else {
+      tensor.AsProtoField(attr["value"].mutable_tensor());
+    }
+    test->RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_const", &output));
+    ValidateWeights(output.weights(), expected_dims, expected_value);
+  };
+
+  auto& attr = *node_def.mutable_attr();
+  attr["dtype"].set_type(dtype);
+  {
+    // By default empty tensor will pick DT_FLOAT as data type and we fix it
+    // here.
+    attr["value"].mutable_tensor()->set_dtype(dtype);
+    Tensor t;  // Empty tensor.
+    reset_and_test(t, false, {}, {});
+  }
+  {
+    Tensor t = ::tensorflow::test::AsScalar<InputCType>(12);
+    reset_and_test(t, false, {1}, {12});
+    reset_and_test(t, true, {1}, {12});
+  }
+  {
+    Tensor t = ::tensorflow::test::AsTensor<InputCType>({1, 2});
+    reset_and_test(t, false, {2}, {1, 2});
+    reset_and_test(t, true, {2}, {1, 2});
+  }
+  {
+    Tensor t = ::tensorflow::test::AsTensor<InputCType>({1, 2, 3, 4, 5, 6},
+                                                        TensorShape({2, 3}));
+    reset_and_test(t, false, {2, 3}, {1, 2, 3, 4, 5, 6});
+    reset_and_test(t, true, {2, 3}, {1, 2, 3, 4, 5, 6});
+  }
+}
+
+TEST_F(OpConverterTest, ConvertConst) {
+  {
+    Reset();
+    NodeDef node_def = MakeNodeDef("my_const", "Const", {"input"});
+    AddTestTensor("input", {1});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Constant node is expected to have empty input list: my_const");
+  }
+  {
+    Reset();
+    NodeDef node_def = MakeConstNodeDef<double>("my_const", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Unsupported data type double");
+  }
+
+  TestConvertConst<DT_FLOAT, float, float>(this);
+  TestConvertConst<DT_INT8, int8, int32>(this);
+  TestConvertConst<DT_INT32, int32, int32>(this);
+}
+
+TEST_F(OpConverterTest, ConvertTranspose) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_transpose", "Transpose", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Input expects tensor and weights, at my_transpose");
+  }
+
+  // Get the NodeDef for Transpose.
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+  auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32);
+  auto transpose = ops::Transpose(s.WithOpName("my_transpose"), input, weights);
+  const NodeDef& node_def = transpose.operation.node()->def();
+
+  {
+    // Permutation is a tensor, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights", {3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Input expects tensor and weights, at my_transpose");
+  }
+  {
+    // Transpose at batch dimension, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("weights", {4}, {1, 0, 2, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Transpose at batch dimension is not supported");
+  }
+  {
+    // Permutation rank doesn't match, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("weights", {3}, {0, 1, 2});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Rank of perm for transpose does not match with that of the input.");
+  }
+  {
+    // Ok.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("weights", {4}, {0, 3, 1, 2});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_transpose", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions());
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_transpose",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(1, 4, 2, 5, 3, 6));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertReshape) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_reshape", "Reshape", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Input expects weights for shape, at my_reshape");
+  }
+
+  // Get the NodeDef for Reshape.
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+  auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32);
+  auto reshape = ops::Reshape(s.WithOpName("my_reshape"), input, weights);
+  const NodeDef& node_def = reshape.operation.node()->def();
+
+  {
+    // Shape is a tensor, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights", {3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Input expects weights for shape, at my_reshape");
+  }
+  {
+    // Reshape to scalar, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("weights", {0}, {});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Reshape to shape=[] is not supported, at my_reshape");
+  }
+
+  struct TestParams {
+    TestParams(int input_batch_size, const std::vector<int>& input_tensor_dims,
+               const std::vector<int>& input_shape)
+        : batch_size(input_batch_size),
+          tensor_dims(input_tensor_dims),
+          shape(input_shape) {}
+    int batch_size;
+    std::vector<int> tensor_dims;
+    std::vector<int> shape;
+  };
+
+  // Reshape at batch dimension, should fail.
+  const int kReshapeBatchDimsCases = 5;
+  TestParams params[kReshapeBatchDimsCases] = {
+      TestParams{1, {1, 2, 3}, {3, 1, 1, 2}},
+      TestParams{1, {1, 2, -1}, {-1, 1, 1, 2}},
+      TestParams{1, {1, 2, 3}, {-1, 1, 1, 2}},
+      TestParams{-1, {1, 2, 3}, {1, 1, 1, 2}},
+      TestParams{-1, {-1, 2, 3}, {1, 1, 1, 6}},  // TODO(laigd): it should pass.
+  };
+  for (int i = 0; i < kReshapeBatchDimsCases; ++i) {
+    Reset();
+    const std::vector<int>& dims = params[i].tensor_dims;
+    AddTestTensor("input", dims, params[i].batch_size);
+    AddTestWeights<int32>("weights", {4}, params[i].shape);
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Reshape on batch dimension is not supported, at my_reshape",
+        /*should_run_conversion=*/(dims[0] > 0 && dims[1] > 0 && dims[2] > 0));
+  }
+
+  // Reshape on non batch dimensions, ok.
+  const int kReshapeOKCases = 3;
+  TestParams ok_params[kReshapeOKCases] = {
+      TestParams{-1, {1, 2, 3}, {-1, 1, 3, 2}},
+      TestParams{1, {1, 2, 3}, {-1, 1, 3, 2}},
+      TestParams{1, {1, 2, 3}, {1, 1, 3, 2}},
+  };
+  for (int i = 0; i < kReshapeOKCases; ++i) {
+    Reset();
+    AddTestTensor("input", ok_params[i].tensor_dims, ok_params[i].batch_size);
+    AddTestWeights<int32>("weights", {4}, ok_params[i].shape);
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_reshape", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions());
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_reshape",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertMatMul) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_matmul", "MatMul", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Input expects tensor and weights, at my_matmul");
+  }
+
+  // Get the NodeDef for MatMul.
+  auto get_matmul_nodedef = [](DataType dtype, bool transpose_a,
+                               bool transpose_b) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), dtype);
+    auto weights = ops::Placeholder(s.WithOpName("weights"), dtype);
+    const auto matmul_attrs =
+        ops::MatMul::TransposeA(transpose_a).TransposeB(transpose_b);
+    auto matmul =
+        ops::MatMul(s.WithOpName("my_matmul"), input, weights, matmul_attrs);
+    return matmul.operation.node()->def();
+  };
+
+  {
+    // Unsupported data type.
+    Reset();
+    NodeDef node_def = get_matmul_nodedef(DT_INT32, false, false);
+    AddTestTensor("input", {2}, /*batch_size=*/1, nvinfer1::DataType::kINT32);
+    AddTestWeights<int32>("weights", {2, 1}, {3, 5});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Data type is not supported, for node my_matmul got int32");
+  }
+  // transpose_a is set.
+  for (bool transpose_b : {false, true}) {
+    Reset();
+    NodeDef node_def =
+        get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/true, transpose_b);
+    AddTestTensor("input", {2}, /*batch_size=*/1);
+    AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "transpose_a is not supported for TensorRT FullyConnected");
+  }
+  // OK.
+  for (bool transpose_b : {false, true}) {
+    Reset();
+    NodeDef node_def =
+        get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/false, transpose_b);
+    AddTestTensor("input", {2}, /*batch_size=*/1);
+    AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions());
+
+    std::vector<float> output_data(2);
+    BuildAndRun<float>({{"input", {0, 1}}}, "my_matmul", &output_data);
+    if (transpose_b) {
+      EXPECT_THAT(output_data, ElementsAre(1, 3));
+    } else {
+      EXPECT_THAT(output_data, ElementsAre(2, 3));
+    }
+  }
+}
+
+template <DataType dtype>
+void TestConvertBiasAdd(OpConverterTest* test) {
+  // Get the NodeDef for BiasAdd.
+  auto get_biasadd_nodedef = [](const string& data_format) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), dtype);
+    auto weights = ops::Placeholder(s.WithOpName("weights"), dtype);
+    const auto biasadd_attrs = ops::BiasAdd::DataFormat(data_format);
+    auto biasadd =
+        ops::BiasAdd(s.WithOpName("my_biasadd"), input, weights, biasadd_attrs);
+    return biasadd.operation.node()->def();
+  };
+
+  typedef typename EnumToDataType<dtype>::Type CType;
+  for (const string& data_format : {"NHWC", "NCHW"}) {
+    for (const int trt_input_rank : {1, 2, 3, 4}) {
+      test->Reset();
+      NodeDef node_def = get_biasadd_nodedef(data_format);
+
+      // Add input, dims_array will be like {2, 1, ..., 1, 3}
+      std::vector<int32> dims_array(trt_input_rank, 1);
+      if (trt_input_rank == 1) {
+        dims_array[0] = (data_format == "NHWC" ? 3 : 2);
+      } else {
+        dims_array[0] = 2;
+        dims_array[trt_input_rank - 1] = 3;
+      }
+      test->AddTestTensor("input", dims_array, /*batch_size=*/1,
+                          TfDataTypeToTrt(dtype));
+
+      // Add bias weights.
+      const int channel_size = (data_format == "NHWC" ? 3 : 2);
+      std::vector<CType> bias(channel_size);
+      for (int i = 0; i < channel_size; ++i) {
+        bias[i] = CType(i + 1);  // bias will be {1, 2, 3, ...}
+      }
+      test->AddTestWeights<CType>("weights", {channel_size}, bias);
+
+      // Run the conversion.
+      test->RunValidationAndConversion(node_def);
+      TRT_TensorOrWeights output;
+      TF_EXPECT_OK(test->GetTensorOrWeights("my_biasadd", &output));
+      EXPECT_TRUE(output.is_tensor());
+      ExpectTrtDimsEqualsArray(dims_array, output.tensor()->getDimensions());
+
+      // Build and run the engine.
+      const int num_input = TrtDimsNumElements(GetTestDims(dims_array));
+      ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2),
+                num_input);
+      std::vector<CType> output_data(num_input);
+      test->BuildAndRun<CType>(
+          {{"input", std::vector<CType>(num_input, CType(0))}}, "my_biasadd",
+          &output_data);
+      if (trt_input_rank == 1) {
+        if (data_format == "NHWC") {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3)));
+        } else {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2)));
+        }
+      } else {
+        if (data_format == "NHWC") {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3),
+                                               CType(1), CType(2), CType(3)));
+        } else {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(1), CType(1),
+                                               CType(2), CType(2), CType(2)));
+        }
+      }
+    }
+  }
+}
+
+TEST_F(OpConverterTest, ConvertBiasAdd) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_biasadd", "BiasAdd", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Input expects tensor and weights, at my_biasadd");
+  }
+
+  // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test
+  // DT_INT32 type here.
+  TestConvertBiasAdd<DT_FLOAT>(this);
+  TestConvertBiasAdd<DT_HALF>(this);
+}
+
+template <typename OpType>
+NodeDef GetBinaryOpNodeDef(const string& input_name_l,
+                           const string& input_name_r, DataType dtype) {
+  Scope s = Scope::NewRootScope();
+  auto input_l = ops::Placeholder(s.WithOpName(input_name_l), dtype);
+  auto input_r = ops::Placeholder(s.WithOpName(input_name_r), dtype);
+  auto op = OpType(s.WithOpName("my_binary"), input_l, input_r);
+  return op.operation.node()->def();
+}
+
+void CheckAddedLayers(OpConverterTest* test, bool expect_scale_layer) {
+  bool element_wise_layer_found = false;
+  bool scale_layer_found = false;
+  for (int i = 0; i < test->converter_->network()->getNbLayers(); i++) {
+    nvinfer1::ILayer* layer = test->converter_->network()->getLayer(i);
+    if (dynamic_cast<nvinfer1::IScaleLayer*>(layer)) {
+      scale_layer_found = true;
+    } else if (dynamic_cast<nvinfer1::IElementWiseLayer*>(layer)) {
+      element_wise_layer_found = true;
+    }
+  }
+  EXPECT_EQ(expect_scale_layer, scale_layer_found);
+  EXPECT_NE(expect_scale_layer, element_wise_layer_found);
+}
+
+template <typename OpType, DataType dtype>
+void TestBinaryTensorOpWeightNoBroadcast(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  for (auto swap_inputs : {false, true}) {
+    test->Reset();
+    NodeDef node_def;
+    if (swap_inputs) {
+      node_def = GetBinaryOpNodeDef<OpType>("weights", "input", dtype);
+    } else {
+      node_def = GetBinaryOpNodeDef<OpType>("input", "weights", dtype);
+    }
+
+    const std::vector<CType> operand1{CType(3), CType(7.5)};
+    const std::vector<CType> operand2{CType(2), CType(3)};
+
+    // It requires the dims to be at least of rank 3 to apply an IScaleLayer.
+    test->AddTestTensor("input", /*dims=*/{1, 1, 2}, /*batch_size=*/1,
+                        TfDataTypeToTrt(dtype));
+    test->AddTestWeights<CType>("weights", /*dims=*/{1, 1, 2},
+                                /*values=*/swap_inputs ? operand1 : operand2);
+    test->RunValidationAndConversion(node_def);
+
+    // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor.
+    CheckAddedLayers(test, /*expect_scale_layer=*/true);
+
+    // Check the dims of the output ITensor.
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({1, 1, 2}, output.tensor()->getDimensions());
+
+    std::vector<CType> output_data(2);
+    test->BuildAndRun<CType>(
+        {{"input",
+          /*input_data=*/swap_inputs ? operand2 : operand1}},
+        "my_binary", &output_data);
+    if (node_def.op() == "Add") {
+      EXPECT_THAT(output_data, ElementsAre(CType(5), CType(10.5)));
+    } else if (node_def.op() == "Sub") {
+      EXPECT_THAT(output_data, ElementsAre(CType(1), CType(4.5)));
+    } else if (node_def.op() == "Mul") {
+      EXPECT_THAT(output_data, ElementsAre(CType(6), CType(22.5)));
+    } else if (node_def.op() == "Div") {
+      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+    } else if (node_def.op() == "RealDiv") {
+      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+    } else {
+      ASSERT_TRUE(false);
+    }
+  }
+}
+
+template <DataType dtype>
+void TestBinaryTensorOpWeightWithChannelWiseBroadcast(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<ops::Add>("input", "weights", dtype);
+  const std::vector<CType> input{CType(1), CType(2), CType(3), CType(4)};
+  const std::vector<CType> weights{CType(10), CType(20)};
+  // There are two types of valid dim pairs which requires channel-wise
+  // broadcasting:
+  // - input dims (X Y Z) vs weights dims (X 1 1)
+  // - input dims (X Y Z) vs weights dims (Z)
+  // Here X=Z=2 and Y=1.
+  for (auto weights_dims : std::vector<std::vector<int>>{{2, 1, 1}, {2}}) {
+    test->Reset();
+    test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1,
+                        TfDataTypeToTrt(dtype));
+    test->AddTestWeights<CType>("weights", weights_dims, weights);
+    test->RunValidationAndConversion(node_def);
+
+    // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor.
+    CheckAddedLayers(test, /*expect_scale_layer=*/true);
+
+    // Check the dims of the output ITensor.
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
+
+    std::vector<CType> output_data(4);
+    test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
+    if (weights_dims.size() == 1) {
+      EXPECT_THAT(output_data,
+                  ElementsAre(CType(11), CType(22), CType(13), CType(24)));
+    } else {
+      EXPECT_THAT(output_data,
+                  ElementsAre(CType(11), CType(12), CType(23), CType(24)));
+    }
+  }
+}
+
+template <DataType dtype>
+void TestBinaryTensorOpWeightWithUniformlyBroadcast(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<ops::Add>("input", "weights", dtype);
+  const std::vector<CType> input{CType(1), CType(2), CType(3), CType(4)};
+  const std::vector<CType> weights{CType(10)};
+  test->Reset();
+  test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
+  test->AddTestWeights<CType>("weights", {1, 1, 1, 1}, weights);
+  test->RunValidationAndConversion(node_def);
+
+  // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor.
+  CheckAddedLayers(test, /*expect_scale_layer=*/true);
+
+  // Check the dims of the output ITensor.
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+  EXPECT_TRUE(output.is_tensor());
+  ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
+
+  std::vector<CType> output_data(4);
+  test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
+  EXPECT_THAT(output_data,
+              ElementsAre(CType(11), CType(12), CType(13), CType(14)));
+}
+
+template <typename OpType>
+void TestBinaryTensorOpWeightFallback(OpConverterTest* test,
+                                      const std::vector<int32>& input_dims,
+                                      const std::vector<int>& weights_dims,
+                                      error::Code code = error::OK,
+                                      const char* error_msg_substr = nullptr,
+                                      const int input_batch_size = 1) {
+  const DataType dtype = DT_FLOAT;
+  typedef typename EnumToDataType<dtype>::Type CType;
+  const size_t num_inputs = TrtDimsNumElements(GetTestDims(input_dims));
+  const size_t num_weights = TrtDimsNumElements(GetTestDims(weights_dims));
+
+  test->Reset();
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<OpType>("input", "weights", dtype);
+  test->AddTestTensor("input", /*dims=*/input_dims, input_batch_size,
+                      TfDataTypeToTrt(dtype));
+  test->AddTestWeights<CType>(
+      "weights", /*dims=*/weights_dims,
+      /*values=*/std::vector<CType>(num_weights, CType(1)));
+  test->RunValidationAndConversion(node_def, code, error_msg_substr);
+  if (code != error::OK) return;
+
+  // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight.
+  CheckAddedLayers(test, /*expect_scale_layer=*/false);
+
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+  EXPECT_TRUE(output.is_tensor());
+
+  // Check the dims of the output ITensor.
+  std::vector<int> expected_output_dims = input_dims;
+  for (int i = expected_output_dims.size() - 1, j = weights_dims.size() - 1;
+       i >= 0 && j >= 0; --i, --j) {
+    if (expected_output_dims[i] == 1) {
+      expected_output_dims[i] = weights_dims[j];
+    }
+  }
+  ExpectTrtDimsEqualsArray(expected_output_dims,
+                           output.tensor()->getDimensions());
+
+  // Check the result of running the engine.
+  const int expected_num_outputs =
+      TrtDimsNumElements(GetTestDims(expected_output_dims));
+  std::vector<CType> output_data(expected_num_outputs);
+  test->BuildAndRun<CType>(
+      {{"input",
+        /*input_data=*/std::vector<CType>(num_inputs, CType(2))}},
+      "my_binary", &output_data);
+  if (node_def.op() == "Add") {
+    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
+                                 expected_num_outputs, CType(3))));
+  } else if (node_def.op() == "Minimum") {
+    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
+                                 expected_num_outputs, CType(1))));
+  } else {
+    ASSERT_TRUE(false);
+  }
+}
+
+template <typename OpType, DataType dtype>
+void TestBinaryTensorOpTensor(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  test->Reset();
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<OpType>("input1", "input2", dtype);
+  test->AddTestTensor("input1", /*dims=*/{1, 2}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
+  test->AddTestTensor("input2", /*dims=*/{2, 1}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
+  test->RunValidationAndConversion(node_def);
+
+  // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight.
+  CheckAddedLayers(test, /*expect_scale_layer=*/false);
+
+  // Check output dims.
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+  EXPECT_TRUE(output.is_tensor());
+  ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions());
+
+  std::vector<CType> output_data(4);
+  // After broadcasting first input becomes {3, 6, 3, 6} and second input
+  // becomes {2, 3, 2, 3}.
+  test->BuildAndRun<CType>(
+      {{"input1", {CType(3), CType(6)}}, {"input2", {CType(2), CType(3)}}},
+      "my_binary", &output_data);
+  if (node_def.op() == "Add") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(5), CType(8), CType(6), CType(9)));
+  } else if (node_def.op() == "Sub") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(1), CType(4), CType(0), CType(3)));
+  } else if (node_def.op() == "Mul") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(6), CType(12), CType(9), CType(18)));
+  } else if (node_def.op() == "Div") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
+  } else if (node_def.op() == "RealDiv") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
+  } else if (node_def.op() == "Minimum") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(2), CType(2), CType(3), CType(3)));
+  } else if (node_def.op() == "Maximum") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(3), CType(6), CType(3), CType(6)));
+  } else {
+    ASSERT_TRUE(false);
+  }
+}
+
+TEST_F(OpConverterTest, ConvertBinary) {
+  // Input size doesn't match, should fail.
+  for (size_t num_inputs = 0; num_inputs < 2; ++num_inputs) {
+    Reset();
+    NodeDef node_def = MakeNodeDef("my_add", "Add", {num_inputs, "input"});
+    AddTestTensor("input", {1}, /*batch_size=*/1, nvinfer1::DataType::kFLOAT);
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Binary ops require two inputs, at my_add");
+  }
+  {
+    // Both inputs are weights.
+    Reset();
+    NodeDef node_def = MakeNodeDef("my_add", "Add", {"weights1", "weights2"});
+    AddTestWeights<float>("weights1", {1}, {1});
+    AddTestWeights<float>("weights2", {1}, {1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Constant folding is falled back to TensorFlow, binary op received "
+        "both input as constant at: my_add");
+  }
+
+  // Test BinaryTensorOpWeight() without broadcasting.
+  TestBinaryTensorOpWeightNoBroadcast<ops::Add, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Sub, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Mul, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Div, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::RealDiv, DT_FLOAT>(this);
+#if 0
+  // TODO(b/119560144): it doesn't support FP16 constants and the following test
+  // will fail.
+  TestBinaryTensorOpWeightNoBroadcast<ops::Add, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Sub, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Mul, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Div, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::RealDiv, DT_HALF>(this);
+#endif
+
+  // Test BinaryTensorOpWeight() with channel-wise broadcasting.
+  TestBinaryTensorOpWeightWithChannelWiseBroadcast<DT_FLOAT>(this);
+
+  // Test BinaryTensorOpWeight() with uniformly broadcasting.
+  TestBinaryTensorOpWeightWithUniformlyBroadcast<DT_FLOAT>(this);
+
+  // Test BinaryTensorOpWeight() falling back to BinaryTensorOpTensor().
+  // Unsupported op.
+  TestBinaryTensorOpWeightFallback<ops::Minimum>(this, {1, 1, 1}, {1});
+  // Rank of input tensor dimension <3.
+  TestBinaryTensorOpWeightFallback<ops::Add>(this, {1, 1}, {1});
+  // Broadcast on batch dimension, should fail.
+  TestBinaryTensorOpWeightFallback<ops::Add>(
+      this, {1, 1, 1}, {2, 1, 1, 1}, error::INVALID_ARGUMENT,
+      "Unsupported binary op broadcast scheme for op my_binary",
+      /*input_batch_size=*/2);
+  // Incompatible dims with per-channel mode.
+  TestBinaryTensorOpWeightFallback<ops::Add>(this, {1, 1, 1}, {1, 2, 1});
+  // Incompatible dims.
+  TestBinaryTensorOpWeightFallback<ops::Add>(this, {1, 2, 1}, {2});
+
+  // Test BinaryTensorOpTensor() with broadcasting.
+  TestBinaryTensorOpTensor<ops::Add, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Sub, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Mul, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Div, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::RealDiv, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Minimum, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Maximum, DT_FLOAT>(this);
+
+  TestBinaryTensorOpTensor<ops::Add, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Sub, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Mul, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Div, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::RealDiv, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Minimum, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Maximum, DT_HALF>(this);
+}
+
+TEST_F(OpConverterTest, ConvertQuantize) {
+  for (const string& op :
+       {"FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars",
+        "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3"}) {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_quantize", op, {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        StrCat("Invalid number of inputs for ", op, ", at my_quantize")
+            .c_str());
+  }
+  {
+    // FakeQuantWithMinMaxArgs attributes are empty, should fail.
+    NodeDef node_def =
+        MakeNodeDef("my_quantize", "FakeQuantWithMinMaxArgs", {"input"});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Min or max attribute not found for FakeQuantWithMinMaxArgs "
+        "at my_quantize");
+  }
+  {
+    // FakeQuantWithMinMaxArgs ranges set via attributes, ok.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f);
+    auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("my_quantize"),
+                                                 input, quantize_attrs);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
+  }
+  {
+    // FakeQuantWithMinMaxVars ranges set via inputs, ok.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
+    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
+    auto quantize = ops::FakeQuantWithMinMaxVars(
+        s.WithOpName("my_quantize"), input, weights_min, weights_max);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights_min", {1}, {-6.0f});
+    AddTestWeights<float>("weights_max", {1}, {6.0f});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
+  }
+  {
+    // QuantizeAndDequantizeV2 ranges set via inputs, ok.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
+    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
+    auto quantize = ops::QuantizeAndDequantizeV2(
+        s.WithOpName("my_quantize"), input, weights_min, weights_max);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights_min", {1}, {-6.0f});
+    AddTestWeights<float>("weights_max", {1}, {6.0f});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
+  }
+  {
+    // QuantizeAndDequantizeV2 Range inputs are tensors, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
+    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
+    auto quantize = ops::QuantizeAndDequantizeV2(
+        s.WithOpName("my_quantize"), input, weights_min, weights_max);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights_min", {1});
+    AddTestTensor("weights_max", {1});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Min and max inputs for QuantizeAndDequantizeV2 must be weights not "
+        "tensors, at my_quantize");
+  }
+  {
+    // QuantizeAndDequantizeV3 ranges set via inputs, ok.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
+    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
+    auto num_bits = ops::Placeholder(s.WithOpName("num_bits"), DT_INT32);
+    auto quantize = ops::QuantizeAndDequantizeV3(
+        s.WithOpName("my_quantize"), input, weights_min, weights_max, num_bits);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights_min", {1}, {-6.0f});
+    AddTestWeights<float>("weights_max", {1}, {6.0f});
+    AddTestWeights<int>("num_bits", {1}, {8});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
+  }
+}
+
+TEST_F(OpConverterTest, ConvertRelu6) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_relu6", "Relu6", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Invalid number of inputs for Relu6, at my_relu6");
+  }
+
+  // Get the NodeDef for Relu6.
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+  auto relu6 = ops::Relu6(s.WithOpName("my_relu6"), input);
+  const NodeDef node_def = relu6.operation.node()->def();
+  {
+    // Input is weights, should fail.
+    Reset();
+    AddTestWeights<float>("input", {1}, {1.0f});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Relu6 is only implemented for tensors, not weights, at my_relu6");
+  }
+  {
+    // Clip tensor values and set quantization ranges, ok.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_relu6", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(ranges[output.tensor()], 6.0f);
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {-100, -1, 0, 3, 5, 9}}}, "my_relu6",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(0, 0, 0, 3, 5, 6));
+  }
+}
+
+template <DataType dtype>
+void TestConvertSquare(OpConverterTest* test) {
+  test->Reset();
+  typedef typename EnumToDataType<dtype>::Type CType;
+
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName("input"), dtype);
+  auto square = ops::Square(s.WithOpName("my_square"), input);
+  NodeDef node_def = square.operation.node()->def();
+
+  test->AddTestTensor("input", {1, 20});
+  test->RunValidationAndConversion(node_def);
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_square", &output));
+  EXPECT_TRUE(output.is_tensor());
+  ExpectTrtDimsEqualsArray({1, 20}, output.tensor()->getDimensions());
+
+  const int num_inputs = 20;
+  std::vector<CType> input_data(num_inputs);
+  std::vector<CType> expected_output_data(num_inputs);
+  for (int i = 0; i < 20; i++) {
+    const CType value = CType(i - 9);
+    input_data[i] = value;
+    expected_output_data[i] = value * value;
+  }
+  std::vector<CType> output_data(num_inputs);
+  test->BuildAndRun<CType>({{"input", input_data}}, "my_square", &output_data);
+  ExpectArrayNear(expected_output_data, output_data);
+}
+
+TEST_F(OpConverterTest, ConvertSquare) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_square", "Square", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Square expects one input, at my_square");
+  }
+  {
+    // Input is weights, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto square = ops::Square(s.WithOpName("my_square"), input);
+    NodeDef node_def = square.operation.node()->def();
+    AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Square is only implemented for tensors, at my_square");
+  }
+
+  // OK. Note that kINT32 is not supported by IElementWiseLayer, so we don't
+  // test DT_INT32 type here.
+  TestConvertSquare<DT_FLOAT>(this);
+  // TODO(tmorris): Looks like there may be a bug with this layer for FP16
+  // inputs. Disabling for now.
+  // TestConvertSquare<DT_HALF>(this);
+}
+
+TEST_F(OpConverterTest, ConvertActivation) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_act", "Relu", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Relu expects one input, at my_act");
+  }
+  {
+    // Input is weights, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto relu = ops::Relu(s.WithOpName("my_act"), input);
+    const NodeDef& node_def = relu.operation.node()->def();
+    AddTestWeights<int32>("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Relu is only implemented for tensors, at my_act");
+  }
+
+  // Get nodedef for activation layer.
+  auto get_act_nodedef = [](string op_name) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    if (op_name == "Relu") {
+      auto act = ops::Relu(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
+    } else if (op_name == "Sigmoid") {
+      auto act = ops::Sigmoid(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
+    } else if (op_name == "Tanh") {
+      auto act = ops::Tanh(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
+    }
+    EXPECT_TRUE(false);
+    return NodeDef();
+  };
+  // Get expected output for activation layer.
+  auto get_act_output = [](string op_name, float input) -> float {
+    if (op_name == "Relu") {
+      return (input > 0.0f) ? input : 0.0f;
+    } else if (op_name == "Sigmoid") {
+      return 1.0f / (1.0f + std::exp(-input));
+    } else if (op_name == "Tanh") {
+      return std::tanh(input);
+    }
+    EXPECT_TRUE(false);
+    return 0;
+  };
+
+  // Ok.
+  for (string op_name : {"Relu", "Sigmoid", "Tanh"}) {
+    Reset();
+    NodeDef node_def = get_act_nodedef(op_name);
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_act", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions());
+
+    const std::vector<float> input_data = {-100, -2, -1, 0, 1, 100};
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", input_data}}, "my_act", &output_data);
+    for (int i = 0; i < input_data.size(); i++) {
+      const float expected_output = get_act_output(op_name, input_data[i]);
+      EXPECT_FLOAT_EQ(output_data[i], expected_output);
+    }
+  }
+}
+
+TEST_F(OpConverterTest, ConvertExpandDims) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_expanddims", "ExpandDims", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Two inputs expected for ExpandDims, at my_expanddims");
+  }
+
+  // Get the NodeDef for ExpandDims.
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+  auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32);
+  auto expanddims =
+      ops::ExpandDims(s.WithOpName("my_expanddims"), input, weights);
+  const NodeDef& node_def = expanddims.operation.node()->def();
+
+  {
+    // Input is weights, should fail.
+    Reset();
+    AddTestWeights<int32>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestWeights<int32>("weights", {1}, {1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "ExpandDims expects tensor for input, at my_expanddims");
+  }
+  {
+    // Axis is a tensor, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights", {3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "ExpandDims expects weights for axis, at my_expanddims");
+  }
+  {
+    // Add dim at batch dimension, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("weights", {1}, {0});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Modifying batch dimension is not supported for ExpandDims, at "
+        "my_expanddims");
+  }
+  {
+    // Add dim at batch dimension via negative axis, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    // Input is rank 4 (batch dim included)
+    AddTestWeights<int32>("weights", {1}, {-5});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Modifying batch dimension is not supported for ExpandDims, at "
+        "my_expanddims");
+  }
+  {
+    // Axis > rank(input), should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    // Input is rank 4 (batch dim included)
+    AddTestWeights<int32>("weights", {1}, {5});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for ExpandDims is invalid, must be in the range "
+        "[-rank(input) - 1, rank(input)], at my_expanddims");
+  }
+  {
+    // Axis < -rank(input)-1, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    // Input is rank 4 (batch dim included)
+    AddTestWeights<int32>("weights", {1}, {-6});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for ExpandDims is invalid, must be in the range "
+        "[-rank(input) - 1, rank(input)], at my_expanddims");
+  }
+
+  struct TestParams {
+    TestParams(const std::vector<int>& input_dims, int axis,
+               const std::vector<int>& expected_output_dims)
+        : input_dims(input_dims),
+          axis(axis),
+          expected_output_dims(expected_output_dims) {}
+    std::vector<int> input_dims;
+    int axis;
+    std::vector<int> expected_output_dims;
+  };
+
+  // Ok.
+  const int kExpandDimsOKCases = 8;
+  TestParams ok_params[kExpandDimsOKCases] = {
+      TestParams{{2, 3}, 1, {1, 2, 3}}, TestParams{{2, 3}, -3, {1, 2, 3}},
+      TestParams{{2, 3}, 3, {2, 3, 1}}, TestParams{{2, 3}, -1, {2, 3, 1}},
+      TestParams{{2, 3}, 2, {2, 1, 3}}, TestParams{{2, 3}, -2, {2, 1, 3}},
+      TestParams{{6}, 1, {1, 6}},       TestParams{{6}, -1, {6, 1}},
+  };
+  for (int i = 0; i < kExpandDimsOKCases; ++i) {
+    Reset();
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<int32>("weights", {1}, {ok_params[i].axis});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_expanddims", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_expanddims",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertSqueeze) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_squeeze", "Squeeze", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "One input expected for Squeeze, at my_squeeze");
+  }
+  {
+    // No attrs, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto squeeze = ops::Squeeze(s.WithOpName("my_squeeze"), input);
+    const NodeDef& node_def = squeeze.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Squeeze is only implemented for explicit dims, at my_squeeze");
+  }
+
+  // Get the NodeDef for Squeeze.
+  auto get_squeeze_nodedef = [](std::vector<int> axis) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    ops::Squeeze::Attrs squeeze_attrs;
+    squeeze_attrs.axis_ = gtl::ArraySlice<int>(axis);
+    auto squeeze =
+        ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs);
+    return squeeze.operation.node()->def();
+  };
+
+  {
+    // Input is weights, should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({0});
+    AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Squeeze expects tensor for input, at my_squeeze");
+  }
+  {
+    // Squeeze batch dim, should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({0});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Cannot squeeze batch dimension, at my_squeeze");
+  }
+  {
+    // Squeeze batch dim via negative axis, should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({-4});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Cannot squeeze batch dimension, at my_squeeze");
+  }
+  {
+    // Squeeze >= rank(input), should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({4});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for Squeeze is invalid, must be in the range "
+        "[-rank(input), rank(input)), at my_squeeze");
+  }
+  {
+    // Squeeze < -rank(input), should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({-5});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for Squeeze is invalid, must be in the range "
+        "[-rank(input), rank(input)), at my_squeeze");
+  }
+
+  struct TestParams {
+    TestParams(const std::vector<int>& input_dims, const std::vector<int>& axis,
+               const std::vector<int>& expected_output_dims)
+        : input_dims(input_dims),
+          axis(axis),
+          expected_output_dims(expected_output_dims) {}
+    std::vector<int> input_dims;
+    std::vector<int> axis;
+    std::vector<int> expected_output_dims;
+  };
+
+  // Ok.
+  const int kSqueezeOKCases = 10;
+  TestParams ok_params[kSqueezeOKCases] = {
+      TestParams{{1, 2, 3}, {1}, {2, 3}},
+      TestParams{{1, 2, 3}, {-3}, {2, 3}},
+      TestParams{{2, 3, 1}, {3}, {2, 3}},
+      TestParams{{2, 3, 1}, {-1}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {1, 3, 5}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {3, 1, 5}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {-1, -3, -5}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {1, -3, 5}, {2, 3}},
+      TestParams{{1, 6}, {1}, {6}},
+      TestParams{{6, 1}, {2}, {6}},
+  };
+  for (int i = 0; i < kSqueezeOKCases; ++i) {
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef(ok_params[i].axis);
+    AddTestTensor("input", ok_params[i].input_dims);
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_squeeze", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_squeeze",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+  }
+}
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
index ff4fba58bfccd7d9c4d744daa3646c3ee14190ad..c1688d4db88a270dcd202989f89a677ed10576d9 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
@@ -67,109 +67,111 @@ tensorflow::Status TRTOptimizationPass::Init(
     TF_RETURN_IF_ERROR(GetPrecisionMode(
         Uppercase(params.at("precision_mode").s()), &precision_mode_));
   }
+  if (params.count("use_calibration")) {
+    use_calibration_ = params.at("use_calibration").b();
+  }
   return tensorflow::Status::OK();
 }
 
 void TRTOptimizationPass::PrintDebugInfo(
     tensorflow::grappler::Cluster* cluster,
     const tensorflow::grappler::GrapplerItem& item) {
-  VLOG(1) << "Cluster = " << cluster;
+  LOG(INFO) << "Cluster = " << cluster;
   string offset("  ");
   string offset2 = StrCat(offset, offset);
   string offset3 = StrCat(offset2, offset);
   string offset4 = StrCat(offset2, offset2);
   if (cluster) {
-    VLOG(1) << offset << "type             = " << cluster->type();
-    VLOG(1) << offset << "num warmup steps = " << cluster->NumWarmupSteps();
+    LOG(INFO) << offset << "type             = " << cluster->type();
+    LOG(INFO) << offset << "num warmup steps = " << cluster->NumWarmupSteps();
     const auto dev_names = cluster->GetDeviceNames();
     if (dev_names.size()) {
-      VLOG(1) << offset << " Device names:";
+      LOG(INFO) << offset << " Device names:";
       for (const auto s : dev_names) {
-        VLOG(1) << offset2 << s;
+        LOG(INFO) << offset2 << s;
       }
     }
     std::unordered_map<string, uint64> peak_mem;
     auto status = cluster->GetPeakMemoryUsage(&peak_mem);
     if (status == tensorflow::Status::OK()) {
-      VLOG(1) << offset << "Peak Memory Usage :";
+      LOG(INFO) << offset << "Peak Memory Usage :";
       for (auto s : peak_mem) {
-        VLOG(1) << offset2 << s.first << " = " << s.second;
+        LOG(INFO) << offset2 << s.first << " = " << s.second;
       }
     }
 
     const auto dev_props = cluster->GetDevices();
     if (dev_props.size()) {
-      VLOG(1) << offset << "Device properties:";
+      LOG(INFO) << offset << "Device properties:";
       for (auto k : dev_props) {
-        VLOG(1) << offset2 << k.first;
+        LOG(INFO) << offset2 << k.first;
         const auto& dt = k.second;
-        VLOG(1) << offset3 << "type          = " << dt.type();
-        VLOG(1) << offset3 << "vendor        = " << dt.vendor();
-        VLOG(1) << offset3 << "model         = " << dt.model();
-        VLOG(1) << offset3 << "frequency     = " << dt.frequency();
-        VLOG(1) << offset3 << "num cores     = " << dt.num_cores();
-        VLOG(1) << offset3 << "num registers = " << dt.num_registers();
-        VLOG(1) << offset3 << "L1 cache size = " << dt.l1_cache_size();
-        VLOG(1) << offset3 << "L2 cache size = " << dt.l2_cache_size();
-        VLOG(1) << offset3 << "L3 cache size = " << dt.l3_cache_size();
-        VLOG(1) << offset3 << "SHMem per SMP = "
-                << dt.shared_memory_size_per_multiprocessor();
-        VLOG(1) << offset3 << "memory size   = " << dt.memory_size();
-        VLOG(1) << offset3 << "bandwidth     = " << dt.bandwidth();
+        LOG(INFO) << offset3 << "type          = " << dt.type();
+        LOG(INFO) << offset3 << "vendor        = " << dt.vendor();
+        LOG(INFO) << offset3 << "model         = " << dt.model();
+        LOG(INFO) << offset3 << "frequency     = " << dt.frequency();
+        LOG(INFO) << offset3 << "num cores     = " << dt.num_cores();
+        LOG(INFO) << offset3 << "num registers = " << dt.num_registers();
+        LOG(INFO) << offset3 << "L1 cache size = " << dt.l1_cache_size();
+        LOG(INFO) << offset3 << "L2 cache size = " << dt.l2_cache_size();
+        LOG(INFO) << offset3 << "L3 cache size = " << dt.l3_cache_size();
+        LOG(INFO) << offset3 << "SHMem per SMP = "
+                  << dt.shared_memory_size_per_multiprocessor();
+        LOG(INFO) << offset3 << "memory size   = " << dt.memory_size();
+        LOG(INFO) << offset3 << "bandwidth     = " << dt.bandwidth();
         if (dt.environment_size()) {
-          VLOG(1) << offset3 << "environment   :";
+          LOG(INFO) << offset3 << "environment   :";
           for (const auto e : dt.environment()) {
-            VLOG(1) << offset4 << e.first << " = " << e.second;
+            LOG(INFO) << offset4 << e.first << " = " << e.second;
           }
         }
       }
     }
   }
-  VLOG(1) << "item: " << item.id;
+  LOG(INFO) << "item: " << item.id;
   if (item.feed.size()) {
-    VLOG(1) << offset << "Feeds  :";
+    LOG(INFO) << offset << "Feeds  :";
     for (const auto& f : item.feed) {
       const auto& shape = f.second.shape();
-      VLOG(1) << offset2 << f.first << " = shaped " << shape.DebugString();
+      LOG(INFO) << offset2 << f.first << " = shaped " << shape.DebugString();
     }
   } else {
-    VLOG(1) << offset << "No Feeds";
+    LOG(INFO) << offset << "No Feeds";
   }
   if (item.fetch.size()) {
-    VLOG(1) << offset << "Fetches  :";
+    LOG(INFO) << offset << "Fetches  :";
     for (const auto& f : item.fetch) {
-      VLOG(1) << offset2 << f;
+      LOG(INFO) << offset2 << f;
     }
   } else {
-    VLOG(1) << offset << "No Fetches";
+    LOG(INFO) << offset << "No Fetches";
   }
 
   if (item.init_ops.size()) {
-    VLOG(1) << offset << "init ops  :";
+    LOG(INFO) << offset << "init ops  :";
     for (const auto& f : item.init_ops) {
-      VLOG(1) << offset2 << f;
+      LOG(INFO) << offset2 << f;
     }
   } else {
-    VLOG(1) << offset << "No init ops";
+    LOG(INFO) << offset << "No init ops";
   }
-  VLOG(1) << "Save Op = " << item.save_op;
-  VLOG(1) << "Restore Op = " << item.restore_op;
-  VLOG(1) << "save_restore_loc_tensor = " << item.save_restore_loc_tensor;
+  LOG(INFO) << "Save Op = " << item.save_op;
+  LOG(INFO) << "Restore Op = " << item.restore_op;
+  LOG(INFO) << "save_restore_loc_tensor = " << item.save_restore_loc_tensor;
   if (item.keep_ops.size()) {
-    VLOG(1) << offset << "keep ops  :";
+    LOG(INFO) << offset << "keep ops  :";
     for (const auto& f : item.keep_ops) {
-      VLOG(1) << offset2 << f;
+      LOG(INFO) << offset2 << f;
     }
   } else {
-    VLOG(1) << offset << "No keep ops";
+    LOG(INFO) << offset << "No keep ops";
   }
-  VLOG(3) << item.graph.DebugString();
   for (const auto dev : cluster->GetDeviceSet()->devices()) {
     const auto& pname = dev->parsed_name();
-    VLOG(1) << "Device name= " << dev->name()
-            << " parsedname job= " << pname.job << " id= " << pname.id
-            << " has_id: " << pname.has_id << " has_job: " << pname.has_job
-            << "has_type: " << pname.has_type << " type =" << pname.type;
+    LOG(INFO) << "Device name= " << dev->name()
+              << " parsedname job= " << pname.job << " id= " << pname.id
+              << " has_id: " << pname.has_id << " has_job: " << pname.has_job
+              << "has_type: " << pname.has_type << " type =" << pname.type;
   }
 }
 
@@ -188,8 +190,8 @@ tensorflow::Status TRTOptimizationPass::Optimize(
     *optimized_graph = item.graph;
     return tensorflow::Status::OK();
   }
-  if (VLOG_IS_ON(1)) {
-    VLOG(2) << CurrentStackTrace();
+  if (VLOG_IS_ON(3)) {
+    LOG(INFO) << CurrentStackTrace();
     PrintDebugInfo(cluster, item);
   }
   int max_dim = -1;
@@ -223,6 +225,12 @@ tensorflow::Status TRTOptimizationPass::Optimize(
   TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
   tensorflow::tensorrt::convert::ConversionParams cp;
 
+  if (use_calibration_ && precision_mode_ != INT8MODE) {
+    LOG(ERROR) << "Calibration with FP32 or FP16 is not implemented. "
+               << "Falling back to use_calibration = False.";
+    use_calibration_ = false;
+  }
+
   std::vector<string> nodes_to_preserve;
   for (const auto& n : item.NodesToPreserve()) {
     auto tokens = str_util::Split(n, ":");
@@ -251,8 +259,8 @@ tensorflow::Status TRTOptimizationPass::Optimize(
   cp.is_dyn_op = is_dynamic_op_;
   cp.cached_engine_batches = batches_;
   cp.max_cached_engines = max_cached_batches_;
+  cp.use_calibration = use_calibration_;
   auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(cp);
-  VLOG(2) << optimized_graph->DebugString();
   VLOG(1) << "Returning from " << name_;
   return status;
 }
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
index 71b51d13681cb3f75dad034f3fb0f73dea2bacc1..3e8dc0978e43e2e9ba07aaa09f74acfe8e59b9a7 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
@@ -38,7 +38,8 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
         maximum_batch_size_(-1),
         is_dynamic_op_(false),
         max_cached_batches_(1),
-        max_workspace_size_bytes_(256LL << 20) {
+        max_workspace_size_bytes_(256LL << 20),
+        use_calibration_(true) {
     VLOG(1) << "Constructing " << name_;
   }
 
@@ -67,6 +68,7 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
   std::vector<int> batches_;
   int max_cached_batches_;
   int64_t max_workspace_size_bytes_;
+  bool use_calibration_;
 };
 
 }  // namespace convert
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 88cf8d5980304f61736f22189ec3cae3bb1e47eb..bad568644bb1f8d01d4cb0a7c853ec47d6f19e45 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -124,8 +124,10 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
   OP_REQUIRES_OK(context,
                  context->GetAttr("segment_funcdef_name", &funcdef_name_));
   OP_REQUIRES_OK(context, GetPrecisionMode(precision_string, &precision_mode_));
-  calibration_mode_ =
-      (precision_mode_ == INT8MODE && calibration_data.size() == 0);
+  OP_REQUIRES_OK(context,
+                 context->GetAttr("use_calibration", &use_calibration_));
+  calibration_mode_ = (use_calibration_ && precision_mode_ == INT8MODE &&
+                       calibration_data.size() == 0);
   if (calibration_data.size()) {
     calibrator_.reset(new TRTInt8Calibrator(calibration_data));
     calibration_data.resize(0);
@@ -149,9 +151,6 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
 
 void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
                                        AsyncHelper* helper) {
-  if (!calibration_mode_) {
-    VLOG(1) << "Executing native engine";
-  }
   std::vector<Tensor> inputs;
   std::vector<Tensor>* outputs = new std::vector<Tensor>();
   if (native_func_ == tensorflow::kInvalidHandle) {
@@ -172,7 +171,7 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
     inputs.push_back(ctx->input(i));
   }
   helper->Ref();  // Increment count for calculating native graph
-  VLOG(1) << "Executing native segment " << name();
+  VLOG(1) << "Executing native segment: " << name();
   lib->Run(opts, native_func_, inputs, outputs,
            [this, ctx, outputs, helper](const tensorflow::Status& s) {
              tensorflow::core::ScopedUnref sc(helper);
@@ -192,6 +191,7 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
 
 void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
                                      AsyncHelper* helper) {
+  VLOG(1) << "Executing TRT calibration: " << name();
   helper->Ref();
   tensorflow::core::ScopedUnref sc(helper);
   // TODO(aaroey): remove the ResourceMgr singleton.
@@ -303,12 +303,13 @@ bool TRTEngineOp::ExecuteTrtEngine(
     OpKernelContext* ctx, const int num_batch,
     nvinfer1::ICudaEngine* trt_engine_ptr,
     nvinfer1::IExecutionContext* trt_execution_context_ptr) {
+  VLOG(1) << "Executing TRT engine: " << name();
   const bool kRetry = true;
   const int num_binding = ctx->num_inputs() + ctx->num_outputs();
   std::vector<void*> buffers(num_binding);
   for (int i = 0; i < ctx->num_inputs(); i++) {
     const string input_name = StrCat(kInputPHName, i);
-    const size_t binding_index =
+    const int binding_index =
         trt_engine_ptr->getBindingIndex(input_name.c_str());
     if (binding_index == -1) {
       LOG(ERROR) << "Input node not found, at " << input_name;
@@ -333,11 +334,9 @@ bool TRTEngineOp::ExecuteTrtEngine(
       case nvinfer1::DataType::kINT8:
         LOG(ERROR) << "INT8 inputs are not supported yet!";
         return kRetry;
-#if NV_TENSORRT_MAJOR > 3
       case nvinfer1::DataType::kINT32:
         buffers[binding_index] = (void*)(input_tensor.flat<int32>().data());
         break;
-#endif
       default:
         LOG(ERROR) << "Unknown TRT data type: " << int(dtype);
         return kRetry;
@@ -347,7 +346,7 @@ bool TRTEngineOp::ExecuteTrtEngine(
   for (int i = 0; i < ctx->num_outputs(); i++) {
     // Create an output tensor
     const string output_name = StrCat(kOutputPHName, i);
-    const size_t binding_index =
+    const int binding_index =
         trt_engine_ptr->getBindingIndex(output_name.c_str());
     Tensor* output_tensor = nullptr;
 
@@ -387,12 +386,10 @@ bool TRTEngineOp::ExecuteTrtEngine(
       case nvinfer1::DataType::kINT8:
         LOG(WARNING) << "int8 is not supported yet!";
         return kRetry;
-#if NV_TENSORRT_MAJOR > 3
       case nvinfer1::DataType::kINT32:
         buffers[binding_index] =
             reinterpret_cast<void*>(output_tensor->flat<int32>().data());
         break;
-#endif
       default:
         LOG(WARNING) << "Unknown TRT data type: " << static_cast<int>(dtype);
         return kRetry;
@@ -457,13 +454,11 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
       return null_pair;
     }
     TrtUniquePtrType<IRuntime> infer(nvinfer1::createInferRuntime(logger));
-#if NV_TENSORRT_MAJOR > 3
     auto allocator = GetAllocator(ctx);
     if (allocator == nullptr) {
       return null_pair;
     }
     infer->setGpuAllocator(allocator);
-#endif
     TrtUniquePtrType<nvinfer1::ICudaEngine> static_engine(
         infer->deserializeCudaEngine(serialized_segment_.c_str(),
                                      serialized_segment_.size(),
@@ -487,25 +482,24 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
   if (engine_it == engine_map_.end() &&
       engine_map_.size() < (size_t)max_cached_engines_) {
     nvinfer1::IGpuAllocator* allocator = nullptr;
-#if NV_TENSORRT_MAJOR > 3
     allocator = GetAllocator(ctx);
     if (allocator == nullptr) {
       return null_pair;
     }
-#endif
     std::vector<tensorflow::PartialTensorShape> shapes;
     for (int i = 0; i < ctx->num_inputs(); ++i) {
       shapes.emplace_back(ctx->input(i).shape());
     }
     TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
     bool convert_successfully = false;
-    VLOG(0) << name() << " Constructing a new engine with batch size "
-            << batch_size;
+    LOG(INFO) << "Building a new TensorRT engine for " << name()
+              << " with batch size " << batch_size;
     // Up to this point, calibrator_ can never be empty, since otherwise it
     // means calibration_mode_ is true and this path won't get executed.
     auto status = convert::ConvertGraphDefToEngine(
         segment_graph_, precision_mode_, batch_size, workspace_size_, shapes,
-        &logger, allocator, calibrator_.get(), &engine, &convert_successfully);
+        &logger, allocator, calibrator_.get(), &engine, use_calibration_,
+        &convert_successfully);
     if (!status.ok()) {
       if (convert_successfully) {
         // This means it fail to build the engine even when the network is built
@@ -575,8 +569,8 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
   const int64 workspace_size_bytes = workspace_size_;
   cres->thr_.reset(new std::thread([cres, label, segment_graph, shapes,
                                     platform_gpu_id, workspace_size_bytes]() {
-    VLOG(0) << "Starting calibration thread on device " << platform_gpu_id
-            << ", Calibration Resource @ " << cres;
+    LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id
+              << ", Calibration Resource @ " << cres;
     auto err = cudaSetDevice(platform_gpu_id);
     if (err != cudaSuccess) {
       // TODO(aaroey): should return error here.
@@ -594,6 +588,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
         *segment_graph, INT8MODE, cres->calibrator_->getBatchSize(),
         workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(),
         cres->calibrator_.get(), &cres->engine_,
+        /*use_calibration=*/true,
         /*convert_successfully=*/nullptr);
     if (!s.ok()) {
       LOG(ERROR) << "Calibration failed: " << s;
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 8fe06758914261035c90a6fda3f114a63a8ac93a..b545f497f32d5a1a6960b748467ca189b7debf6c 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -130,6 +130,10 @@ class TRTEngineOp : public AsyncOpKernel {
 
   // The finalized calibrator for inference.
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
+
+  // If true, create calibration graph for INT8 mode. Otherwise, we are using
+  // user-provided quantization ranges.
+  bool use_calibration_;
 };
 
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
index e0c7b6272379a20e3dacb6cd7c3b39de735d844d..92405906eb76b043bc08b68e25e16ab40197dddf 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -39,18 +40,19 @@ REGISTER_OP("TRTEngineOp")
     .Attr("cached_engine_batches: list(int) = []")
     .Attr("max_cached_engines_count: int = 1")
     .Attr("workspace_size_bytes: int")
-    .Attr("precision_mode: {'FP32', 'FP16', 'INT8', 'INT8CALIB'}")
+    .Attr("precision_mode: {'FP32', 'FP16', 'INT8'}")
     .Attr("calibration_data: string = ''")
+    .Attr("use_calibration: bool = true")
     .Input("in_tensor: InT")
-    .Output("out_tensor: OutT");
-// TODO(jie): TF requires concrete output shape for concrete input shapes.
-// This is tricky for batch dimension, since we cannot ensure which input
-// would carry the correct batch dimension (for the current stage of the
-// implementation, we do require all input tensor to carry the same batch
-// size, but this could change in the future). Hence we disable shape
-// inference function as a workaround.
-// .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
-
+    .Output("out_tensor: OutT")
+    // TODO(jie): TF requires concrete output shape for concrete input shapes.
+    // This is tricky for batch dimension, since we cannot ensure which input
+    // would carry the correct batch dimension (for the current stage of the
+    // implementation, we do require all input tensor to carry the same batch
+    // size, but this could change in the future). Hence we disable shape
+    // inference function as a workaround.
+    // .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
+    .SetShapeFn(shape_inference::UnknownShape);
 }  // namespace tensorflow
 
 #endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 369e73b5a6edc53ddbaf82d50660fa50aba5c324..203b2697babe32b45523109708cbf062dceee33b 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -63,16 +63,20 @@ class TrtPrecisionMode(object):
     return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8]
 
 
-def tensorrt_rewriter_config(max_batch_size=1,
-                             max_workspace_size_bytes=2 << 20,
-                             precision_mode=TrtPrecisionMode.FP32,
-                             minimum_segment_size=3,
-                             is_dynamic_op=False,
-                             maximum_cached_engines=1,
-                             cached_engine_batch_sizes=None):
+def get_tensorrt_rewriter_config(rewriter_config=None,
+                                 max_batch_size=1,
+                                 max_workspace_size_bytes=2 << 20,
+                                 precision_mode=TrtPrecisionMode.FP32,
+                                 minimum_segment_size=3,
+                                 is_dynamic_op=False,
+                                 maximum_cached_engines=1,
+                                 cached_engine_batch_sizes=None,
+                                 use_calibration=True):
   """Returns a RewriterConfig proto for TRT transformation.
 
   Args:
+    rewriter_config: a template RewriterConfig proto used to create a
+      TRT-enabled RewriterConfig. If None, it will use a default one.
     max_batch_size: max size for the input batch
     max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
       engine can use at execution time. This corresponds to the 'workspaceSize'
@@ -92,23 +96,45 @@ def tensorrt_rewriter_config(max_batch_size=1,
       use this list to determine the batch sizes of the cached engines, instead
       of making the decision on the fly. This is useful when we know the most
       common batch size(s) the application is going to generate.
+    use_calibration: this argument is ignored if precision_mode is not INT8. If
+      set to True, a calibration graph will be created to calibrate the missing
+      ranges. The calibration graph must be converted to an inference graph
+      using calib_graph_to_infer_graph() after running calibration. if set to
+      False, quantization nodes will be expected for every tensor in the graph
+      (exlcuding those which will be fused). If a range is missing, an error
+      will occur. Please note that accuracy may be negatively affected if there
+      is a mismatch between which tensors TRT quantizes and which tensors were
+      trained with fake quantization.
 
   Returns:
     A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
 
   Raises:
-    TypeError: if the provided precision mode is invalid.
-    ValueError: if len(cached_engine_batch_sizes) exceed maximum_cached_engines.
+    TypeError: if any of the parameters are of unexpected type.
+    ValueError: if any of the parameters are of unexpected value.
   """
+  if rewriter_config is not None and not isinstance(
+      rewriter_config, rewriter_config_pb2.RewriterConfig):
+    raise TypeError("rewriter_config should be a RewriterConfig proto.")
+
+  rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
+  if rewriter_config is None:
+    # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
+    # need to run constant folding again.
+    rewriter_config_with_trt.optimizers.extend(
+        ["constfold", "layout", "constfold"])
+    rewriter_config_with_trt.meta_optimizer_iterations = (
+        rewriter_config_pb2.RewriterConfig.ONE)
+  else:
+    rewriter_config_with_trt.CopyFrom(rewriter_config)
+
   if precision_mode.upper() not in TrtPrecisionMode.supported_precision_modes():
     raise ValueError(("precision mode '{}' is not supported."
                       "It should be one of {}").format(
                           precision_mode,
                           TrtPrecisionMode.supported_precision_modes))
 
-  rewriter_cfg = rewriter_config_pb2.RewriterConfig()
-  rewriter_cfg.optimizers.extend(["constfold", "layout"])
-  optimizer = rewriter_cfg.custom_optimizers.add()
+  optimizer = rewriter_config_with_trt.custom_optimizers.add()
   optimizer.name = "TensorRTOptimizer"
   optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
   optimizer.parameter_map["max_batch_size"].i = max_batch_size
@@ -125,7 +151,8 @@ def tensorrt_rewriter_config(max_batch_size=1,
                        "maximum_cached_engines items.")
     optimizer.parameter_map["cached_engine_batches"].list.i.extend(
         cached_engine_batch_sizes)
-  return rewriter_cfg
+  optimizer.parameter_map["use_calibration"].b = use_calibration
+  return rewriter_config_with_trt
 
 
 def create_inference_graph(input_graph_def,
@@ -137,6 +164,7 @@ def create_inference_graph(input_graph_def,
                            is_dynamic_op=False,
                            maximum_cached_engines=1,
                            cached_engine_batch_sizes=None,
+                           use_calibration=True,
                            input_saved_model_dir=None,
                            input_saved_model_tags=None,
                            output_saved_model_dir=None,
@@ -168,6 +196,15 @@ def create_inference_graph(input_graph_def,
       use this list to determine the batch sizes of the cached engines, instead
       of making the decision on the fly. This is useful when we know the most
       common batch size(s) the application is going to generate.
+    use_calibration: this argument is ignored if precision_mode is not INT8. If
+      set to True, a calibration graph will be created to calibrate the missing
+      ranges. The calibration graph must be converted to an inference graph
+      using calib_graph_to_infer_graph() after running calibration. if set to
+      False, quantization nodes will be expected for every tensor in the graph
+      (exlcuding those which will be fused). If a range is missing, an error
+      will occur. Please note that accuracy may be negatively affected if there
+      is a mismatch between which tensors TRT quantizes and which tensors were
+      trained with fake quantization.
     input_saved_model_dir: the directory to load the SavedModel which contains
       the input graph to transforms. Used only when input_graph_def is None.
     input_saved_model_tags: list of tags to load the SavedModel.
@@ -175,8 +212,9 @@ def create_inference_graph(input_graph_def,
       returned GraphDef and save it to the specified directory. This option only
       works when the input graph is loaded from a SavedModel, i.e. when
       input_saved_model_dir is specified and input_graph_def is None.
-    session_config: the ConfigProto used to create a Session. If not specified,
-      a default ConfigProto will be used.
+    session_config: the ConfigProto used to create a Session. It's also used as
+      a template to create a TRT-enabled ConfigProto for conversion. If not
+      specified, a default ConfigProto will be used.
 
   Returns:
     A GraphDef transformed from input_graph_def (or the SavedModel graph def
@@ -306,21 +344,30 @@ def create_inference_graph(input_graph_def,
       grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
           output_collection)
 
-  # Create RewriterConfig.
-  rewriter_cfg = tensorrt_rewriter_config(
-      max_batch_size, max_workspace_size_bytes, precision_mode,
+  # Create TRT-enabled ConfigProto.
+  session_config_with_trt = config_pb2.ConfigProto()
+  session_config_with_trt.CopyFrom(session_config)
+  rewriter_config = None
+  if (session_config_with_trt.HasField("graph_options") and
+      session_config_with_trt.graph_options.HasField("rewrite_options")):
+    rewriter_config = session_config_with_trt.graph_options.rewrite_options
+  rewriter_config_with_trt = get_tensorrt_rewriter_config(
+      rewriter_config, max_batch_size, max_workspace_size_bytes, precision_mode,
       minimum_segment_size, is_dynamic_op, maximum_cached_engines,
-      cached_engine_batch_sizes)
+      cached_engine_batch_sizes, use_calibration)
+  session_config_with_trt.graph_options.rewrite_options.CopyFrom(
+      rewriter_config_with_trt)
 
   # Run Grappler.
   transformed_graph_def = tf_optimizer.OptimizeGraph(
-      rewriter_cfg, grappler_meta_graph_def, graph_id=b"tf_graph")
+      session_config_with_trt, grappler_meta_graph_def, graph_id=b"tf_graph")
 
   # Optionally write the transformed graphdef as SavedModel.
   if output_saved_model_dir is not None:
     saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
     with ops.Graph().as_default():
       importer.import_graph_def(transformed_graph_def, name="")
+      # We don't use TRT here.
       with session.Session(config=session_config) as sess:
         saved_model_builder.add_meta_graph_and_variables(
             sess,
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert_test.py b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
index 52cb0bd9f9ec0a43f74068bc19b5dd05e66ee5af..a7b2d2ea50543ba85c5a13dd6ca320e794ca47f1 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert_test.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
 # pylint: enable=unused-import
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import importer
@@ -46,9 +47,10 @@ from tensorflow.python.tools import saved_model_utils
 class TrtConvertTest(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration python API."""
 
-  def testTensorrtRewriterConfig(self):
-    """Test case for trt_convert.tensorrt_rewriter_config()."""
-    rewriter_cfg = trt_convert.tensorrt_rewriter_config(
+  def testGetTensorrtRewriterConfig(self):
+    """Test case for trt_convert.get_tensorrt_rewriter_config()."""
+    rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
+        rewriter_config=None,
         max_batch_size=128,
         max_workspace_size_bytes=1234,
         precision_mode="INT8",
@@ -56,6 +58,10 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
         is_dynamic_op=True,
         maximum_cached_engines=2,
         cached_engine_batch_sizes=[1, 128])
+    self.assertEqual(["constfold", "layout", "constfold"],
+                     rewriter_cfg.optimizers)
+    self.assertEqual(rewriter_config_pb2.RewriterConfig.ONE,
+                     rewriter_cfg.meta_optimizer_iterations)
     trt_optimizer = None
     for optimizer in rewriter_cfg.custom_optimizers:
       if optimizer.name == "TensorRTOptimizer":
@@ -156,7 +162,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
       node_name_to_op = {node.name: node.op for node in graph_def.node}
       self.assertEqual({
           "input": "Placeholder",
-          "my_trt_op_0": "TRTEngineOp",
+          "TRTEngineOp_0": "TRTEngineOp",
           "output": "Identity"
       }, node_name_to_op)
 
@@ -182,11 +188,12 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
     self.assertAllEqual([[[4.0]]] * batch_size, result)
     execute_engine_test_value = ("done" if expect_engine_is_run else "")
     execute_native_segment_test_value = ("" if expect_engine_is_run else "done")
-    self.assertEqual(execute_engine_test_value,
-                     trt_convert.get_test_value("my_trt_op_0:ExecuteTrtEngine"))
+    self.assertEqual(
+        execute_engine_test_value,
+        trt_convert.get_test_value("TRTEngineOp_0:ExecuteTrtEngine"))
     self.assertEqual(
         execute_native_segment_test_value,
-        trt_convert.get_test_value("my_trt_op_0:ExecuteNativeSegment"))
+        trt_convert.get_test_value("TRTEngineOp_0:ExecuteNativeSegment"))
 
   def testCreateInferenceGraph_MinimumSegmentSize(self):
     if not trt_convert.is_tensorrt_enabled():
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
index a9425864dd377fd70825b1b139b4e0e907a89fb4..7a2e93414aed56525eaeac876cdac20404bcf6ab 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
@@ -54,7 +54,6 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space) {
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#if NV_TENSORRT_MAJOR > 2
 
 namespace tensorflow {
 namespace tensorrt {
@@ -116,6 +115,5 @@ void TRTDeviceAllocator::free(void* memory) {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
index dc9862b16ca380c20531b49e6ccbc62b2375cad6..f857a9de055ee7668f0bf9bc97e030354505081b 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
@@ -35,16 +35,6 @@ void* Align(uint64_t alignment, uint64_t size, void*& ptr, uint64_t& space);
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#if NV_TENSORRT_MAJOR == 3
-// Define interface here temporarily until TRT 4.0 is released
-namespace nvinfer1 {
-class IGpuAllocator {
- public:
-  virtual void* allocate(uint64_t size, uint64_t alignment, uint32_t flags) = 0;
-  virtual void free(void* memory) = 0;
-};
-}  // namespace nvinfer1
-#endif
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
index d7d56cb95e033ea55bd3aa385a707e7a7cfc557b..aac9e5c7bd725fc10bcaa04536ebc7be071b4d4c 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -39,7 +39,8 @@ namespace tensorrt {
 class TRTCalibrationResource : public tensorflow::ResourceBase {
  public:
   ~TRTCalibrationResource() {
-    VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
+    LOG(INFO) << "Destroying Calibration Resource " << std::endl
+              << DebugString();
     builder_.reset();
     engine_.reset();
     // We need to manually destroy the builder and engine before the allocator
@@ -70,28 +71,6 @@ class TRTCalibrationResource : public tensorflow::ResourceBase {
   std::unique_ptr<std::thread> thr_;
 };
 
-class TRTWeightStore {
- public:
-  TRTWeightStore() {}
-
-  virtual ~TRTWeightStore() { VLOG(1) << "Destroying store" << DebugString(); }
-
-  string DebugString() {
-    std::stringstream oss;
-    size_t len_bytes = 0;
-    for (const auto& v : store_) {
-      len_bytes += v.size() * sizeof(uint8_t);
-    }
-    oss << " Number of entries     = " << store_.size() << std::endl
-        << " Total number of bytes = "
-        << store_.size() * sizeof(std::vector<uint8_t>) + len_bytes
-        << std::endl;
-    return oss.str();
-  }
-
-  std::list<std::vector<uint8_t>> store_;
-};
-
 }  // namespace tensorrt
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index c82d4a018392be19a0bae5893158c7180f15acc3..6abc5226ccf96e472df77269bee6186726e5768d 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -33,6 +33,7 @@ namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
 
 // A simple graph representation to mirror tensorflow::Graph. This structure
 // helps saving memory since segmenter modifies the graph in place, preventing
@@ -389,7 +390,7 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
 
 tensorflow::Status SegmentGraph(
     const tensorflow::Graph* tf_graph,
-    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+    const std::function<Status(const tensorflow::Node*)>& candidate_fn,
     const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn,
     const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments) {
@@ -406,15 +407,42 @@ tensorflow::Status SegmentGraph(
   // Use a union-find to collect the nodes that belong to the same
   // segment. A node value of nullptr indicates that the node is not a candidate
   // for TRT.
+  std::unordered_set<string> unsupported_ops;
+  int num_unsupported_ops = 0;
   std::vector<UnionFind<SimpleNode*>> node_segments;
   for (int i = 0; i < graph->num_node_ids(); ++i) {
     SimpleNode* node = graph->FindNodeId(i);
-    if (options.exclude_node_list.count(node->name()) != 0 ||
-        !candidate_fn(node->tf_node())) {
+    if (options.exclude_node_list.count(node->name()) != 0) {
+      VLOG(1) << "Not a TF-TRT candidate, "
+              << "(Op type: " << node->tf_node()->type_string() << "), "
+              << "(Op name: " << node->name() << "), "
+              << "(Reason: excluded by segmenter option)";
+      unsupported_ops.emplace(node->tf_node()->type_string());
+      num_unsupported_ops++;
       node = nullptr;
+    } else {
+      const Status status = candidate_fn(node->tf_node());
+      if (!status.ok()) {
+        VLOG(1) << "Not a TF-TRT candidate, "
+                << "(Op type: " << node->tf_node()->type_string() << "), "
+                << "(Op name: " << node->name() << "), "
+                << "(Reason: " << status << ")";
+        unsupported_ops.emplace(node->tf_node()->type_string());
+        num_unsupported_ops++;
+        node = nullptr;
+      }
     }
     node_segments.emplace_back(node);
   }
+  string msg = StrCat(
+      "There are ", num_unsupported_ops, " ops of ", unsupported_ops.size(),
+      " different types in the graph that", " are not converted to TensorRT: ");
+  for (const auto& elem : unsupported_ops) {
+    StrAppend(&msg, elem, ", ");
+  }
+  LOG(INFO) << msg << "(For more information see "
+            << "https://docs.nvidia.com/deeplearning"
+            << "/dgx/integrate-tf-trt/index.html#support-ops).";
 
   // The segmentation algorithm below visits nodes in reverse topological order
   // and attempts to merge nodes along output edges. That means that subgraphs
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index 8c44eb782aa37052680d0e06023f29dc65e327c6..b9693aad1b764515459db6833b05221ea5b3a2d1 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -43,7 +43,7 @@ struct SegmentOptions {
 // Get the subgraphs of a graph that can be handled by TensorRT.
 //
 // @param graph tensorflow::Graph of the network
-// @param candidate_fn A function that returns true for a Node* if
+// @param candidate_fn A function that returns OK for a Node* if
 // that node can be handled by TensorRT.
 // @param segments Returns the TensorRT segments/subgraphs. Each entry
 // in the vector describes a subgraph by giving a set of the names of
@@ -51,7 +51,7 @@ struct SegmentOptions {
 // @return the status.
 tensorflow::Status SegmentGraph(
     const tensorflow::Graph* tf_graph,
-    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+    const std::function<Status(const tensorflow::Node*)>& candidate_fn,
     const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn,
     const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments);
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 5937fa8259a39339e92b150862d195ee1f23f70a..4805ef9c61a7784a1c08cf5eaf504691bc9dbedc 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -34,10 +34,13 @@ namespace ops = ::tensorflow::ops;
 
 class SegmentTest : public ::testing::Test {
  protected:
-  std::function<bool(const tensorflow::Node*)> MakeCandidateFn(
+  std::function<Status(const tensorflow::Node*)> MakeCandidateFn(
       const std::set<string>& node_names) {
-    return [node_names](const tensorflow::Node* node) -> bool {
-      return node_names.find(node->name()) != node_names.end();
+    return [node_names](const tensorflow::Node* node) -> Status {
+      if (node_names.find(node->name()) != node_names.end()) {
+        return Status::OK();
+      }
+      return errors::NotFound("");
     };
   }
 
diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py
index 7e9ffb05ab0741de7daa43c509897cf88e7be8c0..ff317e43e1e6ff1c0b869ae8dc6d1fda8f0ce126 100644
--- a/tensorflow/contrib/tensorrt/test/base_test.py
+++ b/tensorflow/contrib/tensorrt/test/base_test.py
@@ -56,8 +56,9 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
             strides=[1, 2, 2, 1],
             padding="SAME",
             name="conv")
-        bias = constant_op.constant(
-            [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtype)
+        bias = constant_op.constant([4., 1.5, 2., 3., 5., 7.],
+                                    name="bias",
+                                    dtype=dtype)
         added = nn.bias_add(conv, bias, name="bias_add")
         relu = nn.relu(added, "relu")
         identity = array_ops.identity(relu, "identity")
@@ -73,11 +74,12 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
-    # breaks the connection check, fix it.
-    # - my_trt_op_0 should have ["weights", "conv", "bias", "bias_add",
-    #   "relu", "identity", "max_pool"]
-    return ["my_trt_op_0"]
+    return {
+        "TRTEngineOp_0": [
+            "weights", "conv", "bias", "bias_add", "relu", "identity",
+            "max_pool"
+        ]
+    }
 
 
 class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
@@ -92,7 +94,7 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
-          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+          dtype=dtype, shape=input_dims, name=input_name)
       with g.device("/GPU:0"):
         conv_filter = constant_op.constant(
             [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
@@ -105,10 +107,10 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
             padding="SAME",
             name="conv")
         c1 = constant_op.constant(
-            np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c1")
+            np.random.randn(12, 12, 6), dtype=dtype, name="c1")
         p = math_ops.mul(conv, c1, name="mul")
         c2 = constant_op.constant(
-            np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c2")
+            np.random.randn(12, 12, 6), dtype=dtype, name="c2")
         q = math_ops.div(conv, c2, name="div")
 
         edge = self.trt_incompatible_op(q, name="incompatible")
@@ -129,12 +131,21 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
-    # breaks the connection check, fix it.
-    # - my_trt_op_0 should have ["mul", "sub", "div1", "mul1", "add1",
-    #   "add", "sub1"];
-    # - my_trt_op_1 should have ["weights","conv", "div"]
-    return ["my_trt_op_0", "my_trt_op_1"]
+    return {
+        "TRTEngineOp_0": [
+            "add", "add1", "c1", "div1", "mul", "mul1", "sub", "sub1"
+        ],
+        "TRTEngineOp_1": ["c2", "conv", "div", "weights"]
+    }
+
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    return super(
+        SimpleMultiEnginesTest, self
+    ).GetConversionParams(run_params)._replace(
+        # Disable layout optimizer, since it'll add Transpose(Const, Const) to
+        # the graph and breaks the conversion check.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
 
 
 class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
@@ -143,7 +154,7 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     """Setup method."""
     super(PartiallyConvertedTestA, self).setUp()
     # Let it fail to build the second engine.
-    trt_convert.add_test_value("my_trt_op_1:CreateTRTNode", "fail")
+    trt_convert.add_test_value("TRTEngineOp_1:CreateTRTNode", "fail")
 
   def GetParams(self):
     """Create a graph containing two segment."""
@@ -180,14 +191,16 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     """Return the expected engines to build."""
     return {
         # Only the first engine is built.
-        "my_trt_op_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"]
+        "TRTEngineOp_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"]
     }
 
   def ShouldRunTest(self, run_params):
     """Whether to run the test."""
     # Disable the test in fp16 mode since multiple matmul and add ops together
     # can cause overflow.
-    return run_params.precision_mode != "FP16"
+    return ((run_params.precision_mode != "FP16") and
+            not (trt_test.IsQuantizationMode(run_params.precision_mode) and
+                 not run_params.use_calibration))
 
 
 class PartiallyConvertedTestB(PartiallyConvertedTestA):
@@ -197,13 +210,13 @@ class PartiallyConvertedTestB(PartiallyConvertedTestA):
     super(PartiallyConvertedTestB, self).setUp()
     # Let it fail to build the first engine.
     trt_convert.clear_test_values("")
-    trt_convert.add_test_value("my_trt_op_0:CreateTRTNode", "fail")
+    trt_convert.add_test_value("TRTEngineOp_0:CreateTRTNode", "fail")
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
         # Only the second engine is built.
-        "my_trt_op_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"]
+        "TRTEngineOp_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"]
     }
 
 
@@ -247,8 +260,8 @@ class ConstInputTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": ["add", "add1", "mul"],
-        "my_trt_op_1": ["add2", "add3", "mul1"]
+        "TRTEngineOp_0": ["add", "add1", "mul"],
+        "TRTEngineOp_1": ["add2", "add3", "mul1"]
     }
 
 
@@ -279,7 +292,7 @@ class ConstDataInputSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return {"my_trt_op_0": ["c", "add", "add1", "mul"]}
+    return {"TRTEngineOp_0": ["c", "add", "add1", "mul"]}
 
 
 class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase):
@@ -314,12 +327,12 @@ class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": ["add2", "add3", "mul1"],
+        "TRTEngineOp_0": ["add2", "add3", "mul1"],
         # Why segment ["add", "add1", "mul"] was assigned segment id 1
         # instead of 0: the parent node of this segment is actually const
         # node 'c', but it's removed later since it's const output of the
         # segment which is not allowed.
-        "my_trt_op_1": ["add", "add1", "mul"]
+        "TRTEngineOp_1": ["add", "add1", "mul"]
     }
 
 
@@ -363,8 +376,8 @@ class ControlDependencyTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": ["c1", "add", "add1", "mul"],
-        "my_trt_op_1": ["c2", "add2", "add3", "mul1"]
+        "TRTEngineOp_0": ["c1", "add", "add1", "mul"],
+        "TRTEngineOp_1": ["c2", "add2", "add3", "mul1"]
     }
 
 
diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
index 2f153c6f2fc588e28676ac640c7a613ec0117c58..f42308ecb7c8f8a107e78008abd3f470ddc85975 100644
--- a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
+++ b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
@@ -50,17 +50,22 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
       w2 = array_ops.placeholder(dtype=dtype, shape=w2_dims, name=w2_name)
       with g.device("/GPU:0"):
         b = constant_op.constant(np.random.randn(12, 5, 12, 7), dtype=dtype)
-        c = constant_op.constant(np.random.randn(5, 1, 1), dtype=dtype)
-        d = constant_op.constant(np.random.randn(5, 1, 1), dtype=dtype)
         x1 = math_ops.matmul(inp, b)
+        c = constant_op.constant(np.random.randn(5, 1, 1), dtype=dtype)
         x1 = x1 + c
+
         x2 = math_ops.matmul(inp, w1)
+        d = constant_op.constant(np.random.randn(5, 1, 1), dtype=dtype)
         x2 = x2 * d
-        e = gen_array_ops.reshape(inp, [12, 40, 12])
+
+        e = self.trt_incompatible_op(inp)
+        e = gen_array_ops.reshape(e, [12, 40, 12])
         x3 = math_ops.matmul(e, w2)
         f = constant_op.constant(np.random.randn(40, 1), dtype=dtype)
         x3 = x3 + f
         x3 = gen_array_ops.reshape(x3, [12, 5, 8, 7])
+        x3 = self.trt_incompatible_op(x3)
+
         out = x1 + x2 + x3
       array_ops.squeeze(out, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
@@ -74,12 +79,12 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
     """Return the expected engines to build."""
     if (run_params.dynamic_engine and
         not trt_test.IsQuantizationMode(run_params.precision_mode)):
-      return ["my_trt_op_0", "my_trt_op_1"]
-    return ["my_trt_op_1"]
+      return ["TRTEngineOp_0", "TRTEngineOp_1"]
+    return ["TRTEngineOp_1"]
 
   def ExpectedEnginesToRun(self, run_params):
     """Return the expected engines to run."""
-    return ["my_trt_op_1"]
+    return ["TRTEngineOp_1"]
 
   def ShouldRunTest(self, run_params):
     """Whether to run the test."""
diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
index d2f65344dafa19597bfced04650bffcb2f3c4dc6..053b38ff1c0578c58f39dd6dc0630d1401a105af 100644
--- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
+++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
@@ -33,110 +33,102 @@ from tensorflow.python.platform import test
 
 class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
 
+  def _ConstOp(self, shape):
+    return constant_op.constant(np.random.randn(*shape), dtype=dtypes.float32)
+
   def GetParams(self):
     """Testing conversion of BiasAdd MatMul in TF-TRT conversion."""
-    dtype = dtypes.float32
     input_name = "input"
-    input_dims = [48, 12]
+    input_matrix_rows = 4
+    input_matrix_columns = 144
+    # Note that tf.nn.bias_add supports up to 5 dimensions.
+    input_dims = [input_matrix_rows, input_matrix_columns]
     output_name = "output"
     g = ops.Graph()
     with g.as_default():
-      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      x = array_ops.placeholder(
+          dtype=dtypes.float32, shape=input_dims, name=input_name)
 
-      b = constant_op.constant(np.random.randn(12, 4), dtype=dtype)
+      b = self._ConstOp((input_matrix_columns, 4))
       x1 = math_ops.matmul(x, b)
-      b = constant_op.constant(np.random.randn(1, 4), dtype=dtype)
+      b = self._ConstOp((1, 4))
       x1 = x1 + b
 
-      b = constant_op.constant(np.random.randn(48, 4), dtype=dtype)
-      x2 = math_ops.matmul(x, b, transpose_a=True)
-      x2 = gen_array_ops.reshape(x2, [48, 1])
+      b = self._ConstOp((input_matrix_rows, 144))
+      x2 = self.trt_incompatible_op(x)
+      x2 = math_ops.matmul(x2, b, transpose_a=True)
+      x2 = gen_array_ops.reshape(x2, [4, -1])
+      x2 = self.trt_incompatible_op(x2)
 
-      b = constant_op.constant(np.random.randn(4, 12), dtype=dtype)
+      b = self._ConstOp((4, input_matrix_columns))
       x3 = math_ops.matmul(x, b, transpose_b=True)
 
-      b = constant_op.constant(np.random.randn(16, 48), dtype=dtype)
-      x4 = math_ops.matmul(x, b, transpose_b=True, transpose_a=True)
-      x4 = gen_array_ops.reshape(x4, [48, 4])
+      b = self._ConstOp((16, input_matrix_rows))
+      x4 = self.trt_incompatible_op(x)
+      x4 = math_ops.matmul(x4, b, transpose_b=True, transpose_a=True)
+      x4 = gen_array_ops.reshape(x4, [4, -1])
+      x4 = self.trt_incompatible_op(x4)
 
-      x5 = gen_array_ops.reshape(x, [4, 144])
-      b = constant_op.constant(np.random.randn(144, 48), dtype=dtype)
-      x5 = math_ops.matmul(x5, b)
-      b = constant_op.constant(np.random.randn(48), dtype=dtype)
+      b = self._ConstOp((input_matrix_columns, 48))
+      x5 = math_ops.matmul(x, b)
+      b = self._ConstOp((48,))
       x5 = nn.bias_add(x5, b)
-      x5 = gen_array_ops.reshape(x5, [48, 4])
+      x5 = gen_array_ops.reshape(x5, [4, -1])
 
-      x6 = gen_array_ops.reshape(x, [4, 12, 12])
-      b = constant_op.constant(np.random.randn(12), dtype=dtype)
+      x6 = gen_array_ops.reshape(x, [4, 24, 6])
+      b = self._ConstOp((6,))
       x6 = nn.bias_add(x6, b, data_format="NHWC")
-      x6 = gen_array_ops.reshape(x6, [48, -1])
+      x6 = gen_array_ops.reshape(x6, [4, -1])
 
-      x7 = gen_array_ops.reshape(x, [4, 12, 3, 4])
-      b = constant_op.constant(np.random.randn(4), dtype=dtype)
+      x7 = gen_array_ops.reshape(x, [4, 12, 4, 3])
+      b = self._ConstOp((3,))
       x7 = nn.bias_add(x7, b, data_format="NHWC")
-      x7 = gen_array_ops.reshape(x7, [48, -1])
+      x7 = gen_array_ops.reshape(x7, [4, -1])
 
-      x8 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2])
-      b = constant_op.constant(np.random.randn(2), dtype=dtype)
+      x8 = gen_array_ops.reshape(x, [4, 4, 3, 2, 6])
+      b = self._ConstOp((6,))
       x8 = nn.bias_add(x8, b, data_format="NHWC")
-      x8 = gen_array_ops.reshape(x8, [48, -1])
+      x8 = gen_array_ops.reshape(x8, [4, -1])
 
       x9 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2])
-      b = constant_op.constant(np.random.randn(3), dtype=dtype)
+      b = self._ConstOp((12,))
       x9 = nn.bias_add(x9, b, data_format="NCHW")
-      x9 = gen_array_ops.reshape(x9, [48, -1])
+      x9 = gen_array_ops.reshape(x9, [4, -1])
 
-      x10 = gen_array_ops.reshape(x, [4, 12, 3, 4])
-      b = constant_op.constant(np.random.randn(12), dtype=dtype)
+      x10 = gen_array_ops.reshape(x, [4, 3, 4, 12])
+      b = self._ConstOp((3,))
       x10 = nn.bias_add(x10, b, data_format="NCHW")
-      x10 = gen_array_ops.reshape(x10, [48, -1])
+      x10 = gen_array_ops.reshape(x10, [4, -1])
 
-      x11 = gen_array_ops.reshape(x, [4, 12, 12])
-      b = constant_op.constant(np.random.randn(4), dtype=dtype)
+      x11 = gen_array_ops.reshape(x, [4, 6, 24])
+      b = self._ConstOp((6,))
       x11 = nn.bias_add(x11, b, data_format="NCHW")
-      x11 = gen_array_ops.reshape(x11, [48, -1])
+      x11 = gen_array_ops.reshape(x11, [4, -1])
 
-      out = array_ops.concat(
-          [x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11], axis=-1)
+      out = array_ops.concat([x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11],
+                             axis=-1)
       out = array_ops.squeeze(out, name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
         input_dims=[input_dims],
         output_names=[output_name],
-        expected_output_dims=[(48, 89)])
+        expected_output_dims=[(4, 6680)])
 
   def GetConversionParams(self, run_params):
     """Return a ConversionParams for test."""
-    return super(BiasaddMatMulTest,
-                 self).GetConversionParams(run_params)._replace(
-                     max_batch_size=48, maximum_cached_engines=2)
-
-  def _ValidEngines(self):
-    """Engines expected to build and run."""
-    return [
-        "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_6",
-        "my_trt_op_7", "my_trt_op_8", "my_trt_op_9"
-    ]
-
-  def _InvalidEngines(self):
-    """Engines that will cause conversion error at building time."""
-    return ["my_trt_op_3", "my_trt_op_4", "my_trt_op_5"]
+    conversion_params = super(BiasaddMatMulTest,
+                              self).GetConversionParams(run_params)
+    return conversion_params._replace(
+        max_batch_size=4,
+        maximum_cached_engines=1,
+        # Disable layout optimizer, since it will convert BiasAdd with NHWC
+        # format to NCHW format under four dimentional input.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    # In dynamic engine mode the engines are built in execution time, not in
-    # conversion time, so build errors occurs later. Here three of the engines
-    # will be failed to built but the corresponding engine op are still created.
-    # TODO(aaroey, jjsjann123): fix this.
-    if (run_params.dynamic_engine and
-        not trt_test.IsQuantizationMode(run_params.precision_mode)):
-      return self._ValidEngines() + self._InvalidEngines()
-    return self._ValidEngines()
-
-  def ExpectedEnginesToRun(self, run_params):
-    """Return the expected engines to run."""
-    return self._ValidEngines()
+    return ["TRTEngineOp_0"]
 
   def ShouldRunTest(self, run_params):
     """Whether to run the test."""
diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
index f126ed4238c4ba360a191947e237bba5bfb4be01..169835956c046dd675e967daa05fd81405662e38 100644
--- a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
+++ b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
@@ -26,85 +26,39 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
 
+  def _ConstOp(self, shape):
+    return constant_op.constant(np.random.randn(*shape), dtype=dtypes.float32)
+
   def GetParams(self):
     """Tests for scale & elementwise layers in TF-TRT."""
-    dtype = dtypes.float32
     input_name = "input"
     input_dims = [10, 24, 24, 20]
     output_name = "output"
     g = ops.Graph()
     with g.as_default():
-      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
-      # scale
-      a = constant_op.constant(np.random.randn(1), dtype=dtype)
-      f = x + a
-      x = math_ops.sigmoid(f)
-      # scale
-      a = constant_op.constant(np.random.randn(1), dtype=dtype)
-      f = a + x
-      x = math_ops.sigmoid(f)
-      # scale
-      a = constant_op.constant(np.random.randn(24, 1, 1), dtype=dtype)
-      f = x + a
-      x = math_ops.sigmoid(f)
-      # scale
-      a = constant_op.constant(np.random.randn(24, 1, 1), dtype=dtype)
-      f = a + x
-      x = math_ops.sigmoid(f)
-      # scale
-      a = constant_op.constant(np.random.randn(24, 24, 20), dtype=dtype)
-      f = a + x
-      x = math_ops.sigmoid(f)
-      # scale
-      a = constant_op.constant(np.random.randn(24, 24, 20), dtype=dtype)
-      f = x + a
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(20), dtype=dtype)
-      f = x + a
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(20), dtype=dtype)
-      f = a + x
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(1, 24, 1, 1), dtype=dtype)
-      f = a + x
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(1, 24, 1, 1), dtype=dtype)
-      f = x + a
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(1, 24, 24, 1), dtype=dtype)
-      f = a + x
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(1, 24, 24, 1), dtype=dtype)
-      f = x + a
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(1, 24, 24, 20), dtype=dtype)
-      f = a + x
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(1, 24, 24, 20), dtype=dtype)
-      f = x + a
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(24, 20), dtype=dtype)
-      f = a + x
-      x = math_ops.sigmoid(f)
-      # elementwise
-      a = constant_op.constant(np.random.randn(24, 20), dtype=dtype)
-      f = x + a
-      x = math_ops.sigmoid(f)
+      x = array_ops.placeholder(
+          dtype=dtypes.float32, shape=input_dims, name=input_name)
+      for weights_shape in [
+          (1,),  # scale
+          (24, 1, 1),  # scale
+          (24, 24, 20),  # scale
+          (20,),  # elementwise
+          (1, 24, 1, 1),  # elementwise
+          (1, 24, 24, 1),  # elementwise
+          (1, 24, 24, 20),  # elementwise
+          (24, 20),  # elementwise
+      ]:
+        a = self._ConstOp(weights_shape)
+        f = x + a
+        x = self.trt_incompatible_op(f)
+        a = self._ConstOp(weights_shape)
+        f = a + x
+        x = self.trt_incompatible_op(f)
       gen_array_ops.reshape(x, [5, -1], name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
@@ -115,24 +69,7 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return [
-        "my_trt_op_0",
-        "my_trt_op_1",
-        "my_trt_op_2",
-        "my_trt_op_3",
-        "my_trt_op_4",
-        "my_trt_op_5",
-        "my_trt_op_6",
-        "my_trt_op_7",
-        "my_trt_op_8",
-        "my_trt_op_9",
-        "my_trt_op_10",
-        "my_trt_op_11",
-        "my_trt_op_12",
-        "my_trt_op_13",
-        "my_trt_op_14",
-        "my_trt_op_15",
-    ]
+    return ["TRTEngineOp_%d" % i for i in range(16)]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/contrib/tensorrt/test/concatenation_test.py
index 465cb022964df046bf03a481bb1c6b65750aa883..c3576f81d97afe7e0e42cd10413971911e97774c 100644
--- a/tensorflow/contrib/tensorrt/test/concatenation_test.py
+++ b/tensorflow/contrib/tensorrt/test/concatenation_test.py
@@ -79,7 +79,7 @@ class ConcatenationTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0"]
+    return ["TRTEngineOp_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
index e32f0478661caaab5386339c819b524656baf066..c1c883312d867b60b88ac14318041f9750ca41e6 100644
--- a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
+++ b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
@@ -64,7 +64,7 @@ class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ['my_trt_op_0']
+    return ['TRTEngineOp_0']
 
   def ExpectedAbsoluteTolerance(self, run_params):
     """The absolute tolerance to compare floating point results."""
diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
index bc7c90081ff38a832b523948db10c02de7acefc2..104bac43a0b1166dcddee9920991582f33e93316 100644
--- a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
+++ b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
@@ -68,7 +68,7 @@ class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0"]
+    return ["TRTEngineOp_0"]
 
   def ExpectedAbsoluteTolerance(self, run_params):
     """The absolute tolerance to compare floating point results."""
diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
index 11be4feaf7bf8ce6c8bd16f1546dc17450c342f1..293f93d8a78bc8ab06002d6fc01cb8d6a0738698 100644
--- a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
+++ b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
@@ -25,8 +25,6 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.platform import test
 
@@ -60,14 +58,14 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
       b = constant_op.constant(
           np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
       q = conv - b
-      edge = math_ops.sigmoid(q)
+      edge = self.trt_incompatible_op(q)
 
       b = constant_op.constant(
           np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
       d = b + conv
-      edge3 = math_ops.sigmoid(d)
+      edge3 = self.trt_incompatible_op(d)
 
-      edge1 = gen_math_ops.tan(conv)
+      edge1 = self.trt_incompatible_op(conv)
       t = t - edge1
       q = q + edge
       t = t + q
@@ -83,7 +81,7 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0", "my_trt_op_1"]
+    return ["TRTEngineOp_0", "TRTEngineOp_1"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
index eddeafa38bc71743ac6c9d8e5e8db76f28ca7bf4..3e1e4b088ba200db2184dd64092cbc642a17cb3a 100644
--- a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
+++ b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
@@ -66,8 +66,8 @@ class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": ["bias", "mul", "sub"],
-        "my_trt_op_1": ["weights", "conv"]
+        "TRTEngineOp_0": ["bias", "mul", "sub"],
+        "TRTEngineOp_1": ["weights", "conv"]
     }
 
 
diff --git a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31cbef89e23949ba5ceaab34e0f683fd906bf0ce
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
@@ -0,0 +1,290 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TRT INT8 conversion without calibration on Mnist model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tensorrt.python import trt_convert
+# pylint: disable=unused-import
+from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+# pylint: enable=unused-import
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python import data
+from tensorflow.python import keras
+from tensorflow.python.estimator.estimator import Estimator
+from tensorflow.python.estimator.model_fn import EstimatorSpec
+from tensorflow.python.estimator.model_fn import ModeKeys
+from tensorflow.python.estimator.run_config import RunConfig
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import graph_util
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.datasets import mnist
+from tensorflow.python.layers import layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary
+from tensorflow.python.training import saver
+from tensorflow.python.training.adam import AdamOptimizer
+from tensorflow.python.training.checkpoint_management import latest_checkpoint
+from tensorflow.python.training.training_util import get_global_step
+
+INPUT_NODE_NAME = 'input'
+OUTPUT_NODE_NAME = 'output'
+
+
+class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
+
+  def _BuildGraph(self, x):
+
+    def _Quantize(x, r):
+      x = gen_array_ops.quantize_and_dequantize_v2(x, -r, r)
+      return x
+
+    def _DenseLayer(x, num_inputs, num_outputs, quantization_range, name):
+      """Dense layer with quantized outputs.
+
+      Args:
+        x: input to the dense layer
+        num_inputs: number of input columns of x
+        num_outputs: number of output columns
+        quantization_range: the min/max range for quantization
+        name: name of the variable scope
+
+      Returns:
+        The output of the layer.
+      """
+      with variable_scope.variable_scope(name):
+        kernel = variable_scope.get_variable(
+            'kernel',
+            shape=[num_inputs, num_outputs],
+            dtype=dtypes.float32,
+            initializer=keras.initializers.glorot_uniform())
+        bias = variable_scope.get_variable(
+            'bias',
+            shape=[num_outputs],
+            dtype=dtypes.float32,
+            initializer=keras.initializers.zeros())
+        x = math_ops.matmul(x, kernel)
+        x = _Quantize(x, quantization_range)
+        x = nn.bias_add(x, bias)
+        x = _Quantize(x, quantization_range)
+      return x
+
+    x = _Quantize(x, 1)
+    # Conv + Bias + Relu6
+    x = layers.conv2d(x, filters=32, kernel_size=3, use_bias=True)
+    x = nn.relu6(x)
+    # Conv + Bias + Relu6
+    x = layers.conv2d(x, filters=64, kernel_size=3, use_bias=True)
+    x = nn.relu6(x)
+    # Reduce
+    x = math_ops.reduce_mean(x, [1, 2])
+    x = _Quantize(x, 6)
+    # FC1
+    x = _DenseLayer(x, 64, 512, 6, name='dense')
+    x = nn.relu6(x)
+    # FC2
+    x = _DenseLayer(x, 512, 10, 25, name='dense_1')
+    x = array_ops.identity(x, name=OUTPUT_NODE_NAME)
+    return x
+
+  def _GetGraphDef(self, use_trt, max_batch_size, model_dir):
+    """Get the frozen mnist GraphDef.
+
+    Args:
+      use_trt: whether use TF-TRT to convert the graph.
+      max_batch_size: the max batch size to apply during TF-TRT conversion.
+      model_dir: the model directory to load the checkpoints.
+
+    Returns:
+      The frozen mnist GraphDef.
+    """
+    graph = ops.Graph()
+    with self.session(graph=graph) as sess:
+      with graph.device('/GPU:0'):
+        x = array_ops.placeholder(
+            shape=(None, 28, 28, 1), dtype=dtypes.float32, name=INPUT_NODE_NAME)
+        self._BuildGraph(x)
+      # Load weights
+      mnist_saver = saver.Saver()
+      checkpoint_file = latest_checkpoint(model_dir)
+      mnist_saver.restore(sess, checkpoint_file)
+      # Freeze
+      graph_def = graph_util.convert_variables_to_constants(
+          sess, sess.graph_def, output_node_names=[OUTPUT_NODE_NAME])
+    # Convert with TF-TRT
+    if use_trt:
+      logging.info('Number of nodes before TF-TRT conversion: %d',
+                   len(graph_def.node))
+      graph_def = trt_convert.create_inference_graph(
+          graph_def,
+          outputs=[OUTPUT_NODE_NAME],
+          max_batch_size=max_batch_size,
+          precision_mode='INT8',
+          max_workspace_size_bytes=4096 << 19,
+          minimum_segment_size=2,
+          use_calibration=False,
+      )
+      logging.info('Number of nodes after TF-TRT conversion: %d',
+                   len(graph_def.node))
+      num_engines = len(
+          [1 for n in graph_def.node if str(n.op) == 'TRTEngineOp'])
+      self.assertEqual(1, num_engines)
+    return graph_def
+
+  def _Run(self, is_training, use_trt, batch_size, num_epochs, model_dir):
+    """Train or evaluate the model.
+
+    Args:
+      is_training: whether to train or evaluate the model. In training mode,
+        quantization will be simulated where the quantize_and_dequantize_v2 are
+        placed.
+      use_trt: if true, use TRT INT8 mode for evaluation, which will perform
+        real quantization. Otherwise use native TensorFlow which will perform
+        simulated quantization. Ignored if is_training is True.
+      batch_size: batch size.
+      num_epochs: how many epochs to train. Ignored if is_training is False.
+      model_dir: where to save or load checkpoint.
+
+    Returns:
+      The Estimator evaluation result.
+    """
+    # Get dataset
+    train_data, test_data = mnist.load_data()
+
+    def _PreprocessFn(x, y):
+      x = math_ops.cast(x, dtypes.float32)
+      x = array_ops.expand_dims(x, axis=2)
+      x = 2.0 * (x / 255.0) - 1.0
+      y = math_ops.cast(y, dtypes.int32)
+      return x, y
+
+    def _EvalInputFn():
+      mnist_x, mnist_y = test_data
+      dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y))
+      dataset = dataset.apply(
+          data.experimental.map_and_batch(
+              map_func=_PreprocessFn,
+              batch_size=batch_size,
+              num_parallel_calls=8))
+      dataset = dataset.repeat(count=1)
+      iterator = data.make_one_shot_iterator(dataset)
+      features, labels = iterator.get_next()
+      return features, labels
+
+    def _TrainInputFn():
+      mnist_x, mnist_y = train_data
+      dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y))
+      dataset = dataset.shuffle(2 * len(mnist_x))
+      dataset = dataset.apply(
+          data.experimental.map_and_batch(
+              map_func=_PreprocessFn,
+              batch_size=batch_size,
+              num_parallel_calls=8))
+      dataset = dataset.repeat(count=num_epochs)
+      iterator = data.make_one_shot_iterator(dataset)
+      features, labels = iterator.get_next()
+      return features, labels
+
+    def _ModelFn(features, labels, mode):
+      if is_training:
+        logits_out = self._BuildGraph(features)
+      else:
+        graph_def = self._GetGraphDef(use_trt, batch_size, model_dir)
+        logits_out = importer.import_graph_def(
+            graph_def,
+            input_map={INPUT_NODE_NAME: features},
+            return_elements=[OUTPUT_NODE_NAME + ':0'],
+            name='')[0]
+
+      loss = losses.sparse_softmax_cross_entropy(
+          labels=labels, logits=logits_out)
+      summary.scalar('loss', loss)
+
+      classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out')
+      accuracy = metrics.accuracy(
+          labels=labels, predictions=classes_out, name='acc_op')
+      summary.scalar('accuracy', accuracy[1])
+
+      if mode == ModeKeys.EVAL:
+        return EstimatorSpec(
+            mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
+      elif mode == ModeKeys.TRAIN:
+        optimizer = AdamOptimizer(learning_rate=1e-2)
+        train_op = optimizer.minimize(loss, global_step=get_global_step())
+        return EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+    config_proto = config_pb2.ConfigProto()
+    config_proto.gpu_options.allow_growth = True
+    estimator = Estimator(
+        model_fn=_ModelFn,
+        model_dir=model_dir if is_training else None,
+        config=RunConfig(session_config=config_proto))
+
+    if is_training:
+      estimator.train(_TrainInputFn)
+    results = estimator.evaluate(_EvalInputFn)
+    logging.info('accuracy: %s', str(results['accuracy']))
+    return results
+
+  # To generate the checkpoint, set a different model_dir and call self._Run()
+  # by setting is_training=True and num_epochs=1000, e.g.:
+  # model_dir = '/tmp/quantization_mnist'
+  # self._Run(
+  #     is_training=True,
+  #     use_trt=False,
+  #     batch_size=128,
+  #     num_epochs=100,
+  #     model_dir=model_dir)
+  def testEval(self):
+    if not trt_convert.is_tensorrt_enabled():
+      return
+    model_dir = test.test_src_dir_path('contrib/tensorrt/test/testdata')
+
+    accuracy_tf_native = self._Run(
+        is_training=False,
+        use_trt=False,
+        batch_size=128,
+        num_epochs=None,
+        model_dir=model_dir)['accuracy']
+    logging.info('accuracy_tf_native: %f', accuracy_tf_native)
+    self.assertAllClose(accuracy_tf_native, 0.9662)
+
+    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+      return
+
+    accuracy_tf_trt = self._Run(
+        is_training=False,
+        use_trt=True,
+        batch_size=128,
+        num_epochs=None,
+        model_dir=model_dir)['accuracy']
+    logging.info('accuracy_tf_trt: %f', accuracy_tf_trt)
+    self.assertAllClose(accuracy_tf_trt, 0.9677)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/quantization_test.py b/tensorflow/contrib/tensorrt/test/quantization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e425a3674635650d7292ab072178e98932e6b824
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/quantization_test.py
@@ -0,0 +1,144 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.python import trt_convert
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def _GetParams(add_quantization_nodes, dtype=dtypes.float32):
+  input_name = "input"
+  input_dims = [8, 8]
+  output_name = "output"
+
+  def _Quantize(x, r):
+    if add_quantization_nodes:
+      x = gen_array_ops.fake_quant_with_min_max_vars(x, -r, r)
+    return x
+
+  g = ops.Graph()
+  with g.as_default():
+    x = array_ops.placeholder(
+        dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+    x = _Quantize(x, 10.0)
+    x = x + 5
+    x = _Quantize(x, 15.0)
+    x = x - 5
+    x = _Quantize(x, 10.0)
+    x = x * 0.1
+    x = _Quantize(x, 1.0)
+    w = constant_op.constant(np.ones((8, 1)), dtype=dtypes.float32)
+    x = math_ops.matmul(x, w)
+    x = _Quantize(x, 10.0)
+    x = array_ops.identity(x, name=output_name)
+
+  return trt_test.TfTrtIntegrationTestParams(
+      gdef=g.as_graph_def(),
+      input_names=[input_name],
+      input_dims=[input_dims],
+      output_names=[output_name],
+      expected_output_dims=[(8, 1)])
+
+
+class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Create a graph containing single segment with no quantization ranges."""
+    return _GetParams(add_quantization_nodes=False)
+
+  def ShouldRunTest(self, run_params):
+    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+      return False
+    # Only test static engine mode, with or without calibration.
+    return (trt_test.IsQuantizationMode(run_params.precision_mode) and
+            not run_params.use_optimizer and not run_params.dynamic_engine)
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    if run_params.use_calibration:
+      # In static engine mode with calibration, it should build a calibration
+      # engine.
+      return ["TRTEngineOp_0"]
+    # In static engine mode without calibration, the engine building will fail
+    # since no quantization ranges are set, which results in no TRT nodes.
+    return []
+
+
+class QuantizationWithRangesTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Create a graph containing single segment with no quantization ranges."""
+    return _GetParams(add_quantization_nodes=True)
+
+  def ShouldRunTest(self, run_params):
+    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+      return False
+    # Test static/dynamic engine with/without calibration.
+    return (trt_test.IsQuantizationMode(run_params.precision_mode) and
+            not run_params.use_optimizer)
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
+
+
+class NonQuantizedPrecisionsWithRangesTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Create a graph containing single segment with no quantization ranges."""
+    return _GetParams(add_quantization_nodes=True)
+
+  def ShouldRunTest(self, run_params):
+    # Only test FP32/FP16 mode.
+    return not trt_test.IsQuantizationMode(run_params.precision_mode)
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    # The fake quant ops are not supported in FP32/FP16 mode, and will split the
+    # graph into three TRT segments.
+    return ["TRTEngineOp_0", "TRTEngineOp_1", "TRTEngineOp_2", "TRTEngineOp_3"]
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/rank_two_test.py b/tensorflow/contrib/tensorrt/test/rank_two_test.py
index 74a4a059257ffde4c86df1f18b3ce35c3790ec7a..563232fc12675d9e1b32b7ab461591af57beadb9 100644
--- a/tensorflow/contrib/tensorrt/test/rank_two_test.py
+++ b/tensorflow/contrib/tensorrt/test/rank_two_test.py
@@ -51,8 +51,10 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
         c = constant_op.constant(3.0, name="c%d_3" % i)
         q = math_ops.add(q, c, name="add%d_3" % i)
         if i == 0:
+          axis = constant_op.constant(-1, dtype=dtypes.int32, name="axis")
           for j in range(2):
-            q = array_ops.expand_dims(q, -1, name="expand%d_%d" % (i, j))
+            q = array_ops.expand_dims(q, axis, name="expand%d_%d" % (i, j))
+          q = self.trt_incompatible_op(q)
         q = gen_math_ops.reciprocal(q, name="reciprocal%d" % i)
         outputs.append(q)
       # Combine both paths
@@ -68,11 +70,11 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": [
+        "TRTEngineOp_0": [
             "add0_1", "add0_2", "add0_3", "c0_1", "c0_2", "c0_3", "abs0_1",
-            "abs0_2"
+            "abs0_2", "expand0_0", "expand0_1", "axis"
         ],
-        "my_trt_op_1": [
+        "TRTEngineOp_1": [
             "add", "add1_1", "add1_2", "add1_3", "c1_1", "c1_2", "c1_3",
             "abs1_1", "abs1_2", "reciprocal0", "reciprocal1"
         ],
diff --git a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py b/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..207944468ab0b038abfe01f0096d7dc220d064ed
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
@@ -0,0 +1,152 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Basic tests for TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ReshapeTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [100, 24, 24, 2]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      outputs = []
+      # Here we test two types of reshapes, one changes the batch dimension and
+      # the other does not. Note that we're not able to test reshaping to
+      # scalar, since TRT requires input tensor to be of rank at least 2, so a
+      # reshape with scalar input will be filtered out of the segment before
+      # conversion.
+      with g.device("/GPU:0"):
+        # These reshapes happen at batch dimension, thus conversion should fail.
+        for shape in [[2, 50, 24, 24, 2], [-1, 50, 24, 24, 2],
+                      [2, 50, -1, 24, 2]]:
+          incompatible_reshape = array_ops.reshape(inp, shape)
+          reshape_back = array_ops.reshape(incompatible_reshape,
+                                           [-1, 24, 24, 2])
+          outputs.append(self.trt_incompatible_op(reshape_back))
+        # Add another block with many reshapes that don't change the batch
+        # dimension.
+        compatible_reshape = array_ops.reshape(
+            inp, [-1, 24 * 24, 2], name="reshape-0")
+        compatible_reshape = array_ops.reshape(
+            compatible_reshape, [100, 24, -1], name="reshape-1")
+        compatible_reshape = array_ops.reshape(
+            compatible_reshape, [100, 24 * 2, 24], name="reshape-2")
+        compatible_reshape = array_ops.reshape(
+            compatible_reshape, [-1, 24, 24 * 2], name="reshape-3")
+        compatible_reshape = array_ops.reshape(
+            compatible_reshape, [-1, 6, 4, 24, 2], name="reshape-4")
+        compatible_reshape = array_ops.reshape(
+            compatible_reshape, [-1, 6, 4, 6, 4, 2, 1], name="reshape-5")
+        compatible_reshape = array_ops.reshape(
+            compatible_reshape, [-1, 24, 24, 2], name="reshape-6")
+        outputs.append(self.trt_incompatible_op(compatible_reshape))
+      math_ops.add_n(outputs, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        output_names=[output_name],
+        expected_output_dims=[tuple(input_dims)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {
+        "TRTEngineOp_0": ["reshape-%d" % i for i in range(7)] +
+                         ["reshape-%d/shape" % i for i in range(7)]
+    }
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    return (not trt_test.IsQuantizationMode(run_params.precision_mode) and
+            not run_params.dynamic_engine)
+
+
+class TransposeTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Create a graph containing single segment."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [100, 24, 24, 2]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      with g.device("/GPU:0"):
+        # Add a block with compatible transposes.
+        compatible_transpose = array_ops.transpose(
+            inp, [0, 3, 1, 2], name="transpose-1")
+        compatible_transpose = array_ops.transpose(
+            compatible_transpose, [0, 2, 3, 1], name="transposeback")
+
+        # Add an incompatible op so the first block will not be in the same
+        # subgraph where the following block belongs.
+        bridge = self.trt_incompatible_op(compatible_transpose)
+
+        # Add a block with incompatible transposes.
+        #
+        # Note: by default Grappler will run the TRT optimizer twice. At the
+        # first time it will group the two transpose ops below to same segment
+        # then fail the conversion due to the expected batch dimension problem.
+        # At the second time, since the input of bridge op is TRTEngineOp_0, it
+        # will fail to do shape inference which then cause conversion to fail.
+        # TODO(laigd): support shape inference, make TRT optimizer run only
+        # once, and fix this.
+        incompatible_transpose = array_ops.transpose(
+            bridge, [2, 1, 0, 3], name="transpose-2")
+        excluded_transpose = array_ops.transpose(
+            incompatible_transpose, [0, 2, 3, 1], name="transpose-3")
+      array_ops.identity(excluded_transpose, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        output_names=[output_name],
+        expected_output_dims=[(24, 100, 2, 24)])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {
+        "TRTEngineOp_0": [
+            "transpose-1", "transpose-1/perm", "transposeback",
+            "transposeback/perm"
+        ]
+    }
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    return (not trt_test.IsQuantizationMode(run_params.precision_mode) and
+            not run_params.dynamic_engine)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/testdata/checkpoint b/tensorflow/contrib/tensorrt/test/testdata/checkpoint
new file mode 100644
index 0000000000000000000000000000000000000000..a603e1aec91adab04fd9801ba05a2ee9adfbb6e8
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/testdata/checkpoint
@@ -0,0 +1,3 @@
+model_checkpoint_path: "model.ckpt-46900"
+all_model_checkpoint_paths: "model.ckpt-0"
+all_model_checkpoint_paths: "model.ckpt-46900"
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..88a998f184b275121e1e76eb51d2310da149f10a
Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 differ
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index
new file mode 100644
index 0000000000000000000000000000000000000000..537976571337508ab1798d33646c51d62a146ecc
Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index differ
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
index 4f935a76650d216f51bf95bdf8763262b6d19e54..495a9391a1e818a6078988161c9bf72f6143737f 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.tensorrt.python import trt_convert
 from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
 # pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
@@ -42,14 +43,15 @@ TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [
     "gdef", "input_names", "input_dims", "output_names", "expected_output_dims"
 ])
 
-RunParams = namedtuple(
-    "RunParams",
-    ["use_optimizer", "precision_mode", "dynamic_engine", "test_name"])
+RunParams = namedtuple("RunParams", [
+    "use_optimizer", "precision_mode", "dynamic_engine", "test_name",
+    "use_calibration"
+])
 
 ConversionParams = namedtuple("ConversionParams", [
     "max_batch_size", "max_workspace_size_bytes", "precision_mode",
     "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines",
-    "cached_engine_batch_sizes"
+    "cached_engine_batch_sizes", "rewriter_config", "use_calibration"
 ])
 
 PRECISION_MODES = ["FP32", "FP16", "INT8"]
@@ -65,6 +67,34 @@ class GraphState(object):
   INFERENCE = 2
 
 
+def OptimizerDisabledRewriterConfig():
+  """Returns a RewriterConfig with all default Grappler optimizers disabled."""
+  rewriter_config = rewriter_config_pb2.RewriterConfig()
+
+  # Turn off all default Grappler optimizers.
+  off = rewriter_config_pb2.RewriterConfig.OFF
+  rewriter_config.layout_optimizer = off
+  rewriter_config.constant_folding = off
+  rewriter_config.shape_optimization = off
+  rewriter_config.remapping = off
+  rewriter_config.arithmetic_optimization = off
+  rewriter_config.dependency_optimization = off
+  rewriter_config.loop_optimization = off
+  rewriter_config.function_optimization = off
+  rewriter_config.debug_stripper = off
+  rewriter_config.disable_model_pruning = True
+  rewriter_config.scoped_allocator_optimization = off
+  rewriter_config.memory_optimization = (
+      rewriter_config_pb2.RewriterConfig.NO_MEM_OPT)
+  rewriter_config.pin_to_host_optimization = off
+  rewriter_config.auto_parallel.enable = False
+
+  # Run only once for each enabled optimizer.
+  rewriter_config.meta_optimizer_iterations = (
+      rewriter_config_pb2.RewriterConfig.ONE)
+  return rewriter_config
+
+
 class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration."""
 
@@ -138,11 +168,16 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         minimum_segment_size=2,
         is_dynamic_op=run_params.dynamic_engine,
         maximum_cached_engines=1,
-        cached_engine_batch_sizes=None)
+        cached_engine_batch_sizes=None,
+        rewriter_config=None,
+        use_calibration=run_params.use_calibration)
 
   def ShouldRunTest(self, run_params):
     """Whether to run the test."""
-    return True
+    # This setting combination requires quantization nodes to be present in
+    # order to build the engine.
+    return not (IsQuantizationMode(run_params.precision_mode) and
+                not run_params.use_calibration)
 
   def VerifyRunForEngine(self, engine_name, graph_state, expect_run=True):
     """Verify the state of a particular engine after sess.run()."""
@@ -193,31 +228,35 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _PrepareRun(self, graph_state):
     """Set up necessary testing environment before calling sess.run()."""
     # Clear test values added by TRTEngineOp.
-    trt_convert.clear_test_values("my_trt_op_.*:ExecuteTrtEngine")
-    trt_convert.clear_test_values("my_trt_op_.*:ExecuteCalibration")
-    trt_convert.clear_test_values("my_trt_op_.*:ExecuteNativeSegment")
+    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteTrtEngine")
+    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteCalibration")
+    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteNativeSegment")
+
+  def _GetGPUOptions(self):
+    gpu_options = config_pb2.GPUOptions()
+    gpu_options.allow_growth = True
+    return gpu_options
 
   def _GetConfigProto(self, run_params, graph_state):
     """Get config proto based on specific settings."""
     if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
-      trt_params = self.GetConversionParams(run_params)
-      rewriter_cfg = trt_convert.tensorrt_rewriter_config(
-          trt_params.max_batch_size, trt_params.max_workspace_size_bytes,
-          trt_params.precision_mode, trt_params.minimum_segment_size,
-          trt_params.is_dynamic_op, trt_params.maximum_cached_engines,
-          trt_params.cached_engine_batch_sizes)
+      conversion_params = self.GetConversionParams(run_params)
+      rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
+          conversion_params.rewriter_config, conversion_params.max_batch_size,
+          conversion_params.max_workspace_size_bytes,
+          conversion_params.precision_mode,
+          conversion_params.minimum_segment_size,
+          conversion_params.is_dynamic_op,
+          conversion_params.maximum_cached_engines,
+          conversion_params.cached_engine_batch_sizes,
+          conversion_params.use_calibration)
 
       graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
     else:
       graph_options = config_pb2.GraphOptions()
 
-    gpu_options = config_pb2.GPUOptions()
-    gpu_options.allow_growth = True
-    if trt_convert.get_linked_tensorrt_version()[0] == 3:
-      gpu_options.per_process_gpu_memory_fraction = 0.50
-
     config = config_pb2.ConfigProto(
-        gpu_options=gpu_options, graph_options=graph_options)
+        gpu_options=self._GetGPUOptions(), graph_options=graph_options)
     return config
 
   def _ExpectTestValue(self, engine_name, method, expected_value):
@@ -285,18 +324,25 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _GetTrtGraphDef(self, run_params, gdef):
     """Return trt converted graphdef."""
     params = self._GetParamsCached()
-    trt_params = self.GetConversionParams(run_params)
-    logging.info(trt_params)
+    conversion_params = self.GetConversionParams(run_params)
+    logging.info(conversion_params)
+
+    config_for_trt = config_pb2.ConfigProto(gpu_options=self._GetGPUOptions())
+    if conversion_params.rewriter_config is not None:
+      config_for_trt.graph_options.rewrite_options.CopyFrom(
+          conversion_params.rewriter_config)
     return trt_convert.create_inference_graph(
         input_graph_def=gdef,
         outputs=params.input_names + params.output_names,
-        max_batch_size=trt_params.max_batch_size,
-        max_workspace_size_bytes=trt_params.max_workspace_size_bytes,
-        precision_mode=trt_params.precision_mode,
-        minimum_segment_size=trt_params.minimum_segment_size,
-        is_dynamic_op=trt_params.is_dynamic_op,
-        maximum_cached_engines=trt_params.maximum_cached_engines,
-        cached_engine_batch_sizes=trt_params.cached_engine_batch_sizes)
+        max_batch_size=conversion_params.max_batch_size,
+        max_workspace_size_bytes=conversion_params.max_workspace_size_bytes,
+        precision_mode=conversion_params.precision_mode,
+        minimum_segment_size=conversion_params.minimum_segment_size,
+        is_dynamic_op=conversion_params.is_dynamic_op,
+        maximum_cached_engines=conversion_params.maximum_cached_engines,
+        cached_engine_batch_sizes=conversion_params.cached_engine_batch_sizes,
+        use_calibration=conversion_params.use_calibration,
+        session_config=config_for_trt)
 
   def _WriteGraph(self, run_params, gdef, graph_state):
     if graph_state == GraphState.ORIGINAL:
@@ -395,10 +441,12 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         is_dynamic_engine = not node.attr["static_engine"].b
         self.assertEqual(run_params.dynamic_engine, is_dynamic_engine,
                          node.name)
+        self.assertEqual(node.attr["use_calibration"].b,
+                         run_params.use_calibration, node.name)
 
         has_calibration_data = len(node.attr["calibration_data"].s)
         if (IsQuantizationMode(run_params.precision_mode) and
-            graph_state == GraphState.INFERENCE):
+            run_params.use_calibration and graph_state == GraphState.INFERENCE):
           self.assertTrue(has_calibration_data, node.name)
         else:
           self.assertFalse(has_calibration_data, node.name)
@@ -433,6 +481,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
       # types.
       scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0
       dims = params.input_dims[i]
+      # TODO(laigd): add debug options. E.g. we can set the input data to be
+      # continuous natural numbers:
+      # seq = np.arange(np.prod(dims))
+      # seq.resize(dims)
+      # input_data.append(scale * seq.astype(dtype))
       input_data.append((scale * np.random.random_sample(dims)).astype(dtype))
     self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL)
 
@@ -444,7 +497,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
                                 config_no_trt, GraphState.ORIGINAL)
 
     # Run calibration if necessary.
-    if IsQuantizationMode(run_params.precision_mode):
+    if (IsQuantizationMode(run_params.precision_mode) and
+        run_params.use_calibration):
 
       calib_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
       logging.info("Running calibration graph, config:\n%s", str(calib_config))
@@ -514,27 +568,38 @@ def _AddTests(test_class):
 
   use_optimizer_options = [False, True]
   dynamic_engine_options = [False, True]
-  for (use_optimizer, precision_mode, dynamic_engine) in itertools.product(
-      use_optimizer_options, PRECISION_MODES, dynamic_engine_options):
+  use_calibration_options = [False, True]
+  opts = itertools.product(use_optimizer_options, PRECISION_MODES,
+                           dynamic_engine_options, use_calibration_options)
+  for (use_optimizer, precision_mode, dynamic_engine, use_calibration) in opts:
     if IsQuantizationMode(precision_mode):
       if use_optimizer:
         # TODO(aaroey): if use_optimizer is True we need to get the inference
         # graphdef using custom python wrapper class, which is not currently
         # supported yet.
         continue
-      if not dynamic_engine:
+      if use_calibration and not dynamic_engine:
+        # Static engine with use_calibration=False will be static, so we want to
+        # test that. If use_calibration=True, only dynamic op is supported.
         # TODO(aaroey): construction of static calibration engine is not
         # supported yet.
         continue
+    else:
+      if use_calibration:
+        # Don't calibrate in FP32 or FP16 mode
+        continue
 
     conversion = "OptimizerConversion" if use_optimizer else "ToolConversion"
-    engine_type = ("DynamicEngine" if dynamic_engine else "StaticEngine")
-    test_name = "%s_%s_%s" % (conversion, precision_mode, engine_type)
+    engine_type = "DynamicEngine" if dynamic_engine else "StaticEngine"
+    calibration_type = "UseCalibration" if use_calibration else "NoCalibration"
+    test_name = "%s_%s_%s_%s" % (conversion, engine_type, precision_mode,
+                                 calibration_type)
     run_params = RunParams(
         use_optimizer=use_optimizer,
         precision_mode=precision_mode,
         dynamic_engine=dynamic_engine,
-        test_name=test_name)
+        test_name=test_name,
+        use_calibration=use_calibration)
     setattr(test_class, "testTfTrt_" + test_name, _GetTest(run_params))
 
 
diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/contrib/tensorrt/test/unary_test.py
index 8736bfb6449b3c25a411ec081ad58b1f8be84617..b6e5e32db1236684a06c2d44298b9a3d39667152 100644
--- a/tensorflow/contrib/tensorrt/test/unary_test.py
+++ b/tensorflow/contrib/tensorrt/test/unary_test.py
@@ -106,10 +106,7 @@ class UnaryTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return [
-        "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3",
-        "my_trt_op_4"
-    ]
+    return ["TRTEngineOp_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
index b0271a04b364864b841c2ec9fe53aac74611b2c3..b29626d2c28b4def716aef9e2703b669b5e46374 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
@@ -76,7 +76,7 @@ class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0"]
+    return ["TRTEngineOp_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
index d7c165784bfe14bb5faffd266770328237a3eb80..9b0b189626050f678c71e9abbf7eb5296440d879 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_test.py
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
@@ -67,7 +67,7 @@ class VGGBlockTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0"]
+    return ["TRTEngineOp_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/text/BUILD b/tensorflow/contrib/text/BUILD
index 38d91f7e496d47ac74415da3bae91bad7f431dce..a434c120393fa7c149c645cf80438f53fddc7bd1 100644
--- a/tensorflow/contrib/text/BUILD
+++ b/tensorflow/contrib/text/BUILD
@@ -13,11 +13,11 @@ load(
     "//tensorflow:tensorflow.bzl",
     "py_test",
     "tf_custom_op_library",
-    "tf_custom_op_py_library",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
     "tf_kernel_library",
 )
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 tf_custom_op_py_library(
     name = "text_py",
diff --git a/tensorflow/contrib/text/python/ops/skip_gram_ops_test.py b/tensorflow/contrib/text/python/ops/skip_gram_ops_test.py
index 832d34d60d0553ae54a54d2f41eb2e27370535f6..49260f272eeb27bcaeb7cf314969d067811f2582 100644
--- a/tensorflow/contrib/text/python/ops/skip_gram_ops_test.py
+++ b/tensorflow/contrib/text/python/ops/skip_gram_ops_test.py
@@ -339,7 +339,7 @@ class SkipGramOpsTest(test.TestCase):
         lookup.KeyValueTensorInitializer(keys, values), -1)
 
     with self.cached_session():
-      vocab_freq_table.init.run()
+      vocab_freq_table.initializer.run()
 
       # No vocab_freq_table specified - output should be the same as input.
       no_table_output = skip_gram_ops._filter_input(
@@ -396,7 +396,7 @@ class SkipGramOpsTest(test.TestCase):
         lookup.KeyValueTensorInitializer(keys, values), -1)
 
     with self.cached_session():
-      vocab_freq_table.init.run()
+      vocab_freq_table.initializer.run()
       output = skip_gram_ops._filter_input(
           input_tensor=input_tensor,
           vocab_freq_table=vocab_freq_table,
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
index 7faf2b9b24acfd71f0ffa6d4a8477a34ff3ed321..f40e76f554e8815aac96344d8cb0b911bafdd712 100644
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@@ -1,24 +1,23 @@
 # tfprof: TensorFlow Profiler and Beyond
 
-<h1>Please use `tf.profiler.xxx` instead of `tf.contrib.tfprof.xxx`</h1>
-<h1>Full Document in <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/README.md">tensorflow/core/profiler/README.md</a><h1>
+<h1>Full Document in
+<a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/README.md">tensorflow/core/profiler/README.md</a><h1>
 
 ### Features
 
-* Profile model architectures
-  * parameters, tensor shapes, float operations, device placement, etc.
-* Profile model performance
-  * execution time, memory consumption
-  * Profile multiple steps.
-* Auto profile and advise.
-  * accelerator utilization check
-  * expensive operation check
-  * operation configuration check
-  * distributed runtime check (Not OSS)
+*   Profile model architectures
+    *   parameters, tensor shapes, float operations, device placement, etc.
+*   Profile model performance
+    *   execution time, memory consumption
+    *   Profile multiple steps.
+*   Auto profile and advise.
+    *   accelerator utilization check
+    *   expensive operation check
+    *   operation configuration check
+    *   distributed runtime check (Not OSS)
 
 ### Interfaces
 
-* Python API
-* Command Line
-* Visualization
-* C++ API (Not public, contact us if needed.)
+*   Python API
+*   Command Line
+*   Visualization
diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index b1c7475442c58b9a190c818b752760a4fb4fe6f0..a7c7230dd2fed6cf14bb272df4e9bfb8f3699493 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -254,8 +254,8 @@ def train_and_predict(
   if export_directory is None:
     export_directory = tempfile.mkdtemp()
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
-  export_location = estimator.export_savedmodel(
-      export_directory, input_receiver_fn)
+  export_location = estimator.export_saved_model(export_directory,
+                                                 input_receiver_fn)
   # Warm up and predict using the SavedModel
   with tf.Graph().as_default():
     with tf.Session() as session:
diff --git a/tensorflow/contrib/timeseries/examples/multivariate.py b/tensorflow/contrib/timeseries/examples/multivariate.py
index e81cb18ad7b928a6fd2a748ea6b258c49cf722ae..6b60542e2200615ca004722627fa743ca9729b3b 100644
--- a/tensorflow/contrib/timeseries/examples/multivariate.py
+++ b/tensorflow/contrib/timeseries/examples/multivariate.py
@@ -66,8 +66,8 @@ def multivariate_train_and_sample(
   if export_directory is None:
     export_directory = tempfile.mkdtemp()
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
-  export_location = estimator.export_savedmodel(
-      export_directory, input_receiver_fn)
+  export_location = estimator.export_saved_model(export_directory,
+                                                 input_receiver_fn)
   with tf.Graph().as_default():
     numpy.random.seed(1)  # Make the example a bit more deterministic
     with tf.Session() as session:
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index c230919168b937b26c68e141e15f0762ad70f3e6..4b90b596b28efec83aa349782c4874d79b6817c7 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -104,8 +104,10 @@ py_test(
     srcs = [
         "estimators_test.py",
     ],
+    shard_count = 3,
     srcs_version = "PY2AND3",
     tags = [
+        "no_mac",
         "no_pip_gpu",  # b/63391119
         "nomsan",  # Takes too long to run.
         "notsan",  # b/67865658
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index 9bbe87e301678c7acb57846555e3f97273c8d806..bcadf4094e1e79fff1685515f2bde0b88f717cac 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -102,12 +102,12 @@ class FlatPredictionModel(training.Model):
       [batch size, output window size, num_features], where num_features is the
       same as the constructor argument.
     """
-    if input_window_features.shape[1].value == 0:
+    if input_window_features.shape.dims[1].value == 0:
       # TODO(allenl): Make reshape()'s static shape information work on
       # zero-size Tensors? Currently this special case is required because
       # otherwise the Dense layers get unknown last dimensions.
       activation = self._output_flatten(output_window_features)
-    elif output_window_features.shape[2].value == 0:
+    elif output_window_features.shape.dims[2].value == 0:
       activation = self._input_flatten(input_window_features)
     else:
       activation = array_ops.concat(
@@ -438,7 +438,7 @@ class ARModel(model.TimeSeriesModel):
       output_window_features = array_ops.zeros(
           [batch_size, self.output_window_size, 0],
           dtype=self.dtype)
-    static_batch_size = times.get_shape()[0].value
+    static_batch_size = times.get_shape().dims[0].value
     input_window_features.set_shape(
         [static_batch_size, self.input_window_size, input_feature_size])
     output_window_features.set_shape(
@@ -772,7 +772,7 @@ class ARModel(model.TimeSeriesModel):
       # windows matching self.window_size (as with training), but this looping
       # allows easy plotting of "in-sample" predictions.
       times.get_shape().assert_has_rank(2)
-      static_window_size = times.get_shape()[1].value
+      static_window_size = times.get_shape().dims[1].value
       if (static_window_size is not None
           and static_window_size < self.window_size):
         raise ValueError(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index af68aa03cf6583dc474eda6cda2e648fa1c3d08d..146ed9f27134e3e2a6c74627b6b78e53d65155f0 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -32,7 +32,7 @@ from tensorflow.contrib.timeseries.python.timeseries.state_space_models.filterin
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.estimator.export import export_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index 6ec7184c6839ebc4375432fa54c9b880729b9550..7d780559f976516823611f3fe0ded056e4be088c 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -30,7 +30,7 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
@@ -98,8 +98,8 @@ class TimeSeriesRegressorTest(test.TestCase):
     ) = list(second_estimator.predict(input_fn=predict_input_fn))
     self.assertAllEqual([10, 1], estimator_predictions["mean"].shape)
     input_receiver_fn = first_estimator.build_raw_serving_input_receiver_fn()
-    export_location = first_estimator.export_savedmodel(self.get_temp_dir(),
-                                                        input_receiver_fn)
+    export_location = first_estimator.export_saved_model(
+        self.get_temp_dir(), input_receiver_fn)
     with ops.Graph().as_default():
       with session.Session() as sess:
         signatures = loader.load(sess, [tag_constants.SERVING], export_location)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index 04d17bc123eae5128315657948dc29e59cd2c941..8f692d94da45bfaed6c72cf75d525346865aea34 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -38,7 +38,7 @@ from tensorflow.core.example import example_pb2
 
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.estimator import estimator_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -402,8 +402,8 @@ class OneShotTests(parameterized.TestCase):
     self.assertIn("average_loss", result)
     self.assertNotIn(feature_keys.State.STATE_TUPLE, result)
     input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
-    export_location = estimator.export_savedmodel(_new_temp_dir(),
-                                                  input_receiver_fn)
+    export_location = estimator.export_saved_model(_new_temp_dir(),
+                                                   input_receiver_fn)
     graph = ops.Graph()
     with graph.as_default():
       with session_lib.Session() as session:
@@ -438,7 +438,7 @@ class OneShotTests(parameterized.TestCase):
         output = session.run(fetches, feed_dict=feeds)
         self.assertEqual((2, 15, 5), output["mean"].shape)
     # Build a parsing input function, then make a tf.Example for it to parse.
-    export_location = estimator.export_savedmodel(
+    export_location = estimator.export_saved_model(
         _new_temp_dir(),
         estimator.build_one_shot_parsing_serving_input_receiver_fn(
             filtering_length=20, prediction_length=15))
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index 03da2b82e5f8810d2a83a3d2bc03d8d285703861..aab330643862c1ccf073d2a0e34e1c475b1ec15f 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -262,8 +262,8 @@ def batch_times_matrix(batch, matrix, adj_x=False, adj_y=False):
   assert matrix.get_shape().ndims == 2
   if adj_x:
     batch = array_ops.transpose(batch, [0, 2, 1])
-  batch_dimension = batch.get_shape()[0].value
-  first_dimension = batch.get_shape()[1].value
+  batch_dimension = batch.get_shape().dims[0].value
+  first_dimension = batch.get_shape().dims[1].value
   tensor_batch_shape = array_ops.shape(batch)
   if batch_dimension is None:
     batch_dimension = tensor_batch_shape[0]
@@ -543,20 +543,25 @@ class TupleOfTensorsLookup(lookup.LookupInterface):
   overhead.
   """
 
-  def __init__(
-      self, key_dtype, default_values, empty_key, name, checkpoint=True):
+  def __init__(self,
+               key_dtype,
+               default_values,
+               empty_key,
+               deleted_key,
+               name,
+               checkpoint=True):
     default_values_flat = nest.flatten(default_values)
-    self._hash_tables = nest.pack_sequence_as(
-        default_values,
-        [TensorValuedMutableDenseHashTable(
+    self._hash_tables = nest.pack_sequence_as(default_values, [
+        TensorValuedMutableDenseHashTable(
             key_dtype=key_dtype,
             value_dtype=default_value.dtype.base_dtype,
             default_value=default_value,
             empty_key=empty_key,
+            deleted_key=deleted_key,
             name=name + "_{}".format(table_number),
             checkpoint=checkpoint)
-         for table_number, default_value
-         in enumerate(default_values_flat)])
+        for table_number, default_value in enumerate(default_values_flat)
+    ])
     self._name = name
 
   def lookup(self, keys):
@@ -797,7 +802,7 @@ class InputStatisticsFromMiniBatch(object):
             array_ops.shape(times)[1] - 1, self._dtype))
     # Co-locate updates with their variables to minimize race conditions when
     # updating statistics.
-    with ops.colocate_with(auxiliary_variables.max_time_seen):
+    with ops.device(auxiliary_variables.max_time_seen.device):
       # There is a race condition if this value is being updated from multiple
       # workers. However, it should eventually reach the correct value if the
       # last chunk is presented enough times.
@@ -805,16 +810,16 @@ class InputStatisticsFromMiniBatch(object):
           auxiliary_variables.max_time_seen,
           gen_math_ops.maximum(auxiliary_variables.max_time_seen,
                                math_ops.reduce_max(times)))
-    with ops.colocate_with(auxiliary_variables.chunk_count):
+    with ops.device(auxiliary_variables.chunk_count.device):
       chunk_count_assign = state_ops.assign_add(auxiliary_variables.chunk_count,
                                                 array_ops.shape(
                                                     times,
                                                     out_type=dtypes.int64)[0])
-    with ops.colocate_with(auxiliary_variables.inter_observation_duration_sum):
+    with ops.device(auxiliary_variables.inter_observation_duration_sum.device):
       inter_observation_duration_assign = state_ops.assign_add(
           auxiliary_variables.inter_observation_duration_sum,
           math_ops.reduce_sum(batch_inter_observation_duration))
-    with ops.colocate_with(auxiliary_variables.example_count):
+    with ops.device(auxiliary_variables.example_count.device):
       example_count_assign = state_ops.assign_add(
           auxiliary_variables.example_count,
           array_ops.size(times, out_type=dtypes.int64))
@@ -824,11 +829,11 @@ class InputStatisticsFromMiniBatch(object):
     # the series are then members of fewer chunks. For series which are much
     # longer than the chunk size (the usual/expected case), this effect becomes
     # irrelevant.
-    with ops.colocate_with(auxiliary_variables.overall_feature_sum):
+    with ops.device(auxiliary_variables.overall_feature_sum.device):
       overall_feature_sum_assign = state_ops.assign_add(
           auxiliary_variables.overall_feature_sum,
           math_ops.reduce_sum(values, axis=[0, 1]))
-    with ops.colocate_with(auxiliary_variables.overall_feature_sum_of_squares):
+    with ops.device(auxiliary_variables.overall_feature_sum_of_squares.device):
       overall_feature_sum_of_squares_assign = state_ops.assign_add(
           auxiliary_variables.overall_feature_sum_of_squares,
           math_ops.reduce_sum(values**2, axis=[0, 1]))
@@ -864,7 +869,7 @@ class InputStatisticsFromMiniBatch(object):
             state_ops.assign(statistics.series_start_moments.mean, mean),
             state_ops.assign(statistics.series_start_moments.variance,
                              variance))
-      with ops.colocate_with(statistics.start_time):
+      with ops.device(statistics.start_time.device):
         series_start_update = control_flow_ops.cond(
             # Update moments whenever we even match the lowest time seen so far,
             # to ensure that series start statistics are eventually updated to
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py
index c0de42b15b3367aea4c076f4058641ae99b9219e..91265b9b2e6dad09dcca19cbd9f2f25763f3dd43 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils_test.py
@@ -223,10 +223,12 @@ class TestLookupTable(test.TestCase):
     hash_table = math_utils.TupleOfTensorsLookup(
         key_dtype=dtypes.int64,
         default_values=[[
-            array_ops.ones([3, 2], dtype=dtypes.float32), array_ops.zeros(
-                [5], dtype=dtypes.float64)
-        ], array_ops.ones([7, 7], dtype=dtypes.int64)],
+            array_ops.ones([3, 2], dtype=dtypes.float32),
+            array_ops.zeros([5], dtype=dtypes.float64)
+        ],
+                        array_ops.ones([7, 7], dtype=dtypes.int64)],
         empty_key=-1,
+        deleted_key=-2,
         name="test_lookup")
     def stack_tensor(base_tensor):
       return array_ops.stack([base_tensor + 1, base_tensor + 2])
diff --git a/tensorflow/contrib/timeseries/python/timeseries/model.py b/tensorflow/contrib/timeseries/python/timeseries/model.py
index 7644764a7459db3951fe9a2790389713dd412a8f..a8cd4287e0003de300b7114cf3f88d21d3239e6e 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/model.py
@@ -21,13 +21,16 @@ from __future__ import print_function
 import abc
 import collections
 
+import six
+
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
 
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -52,11 +55,10 @@ ModelOutputs = collections.namedtuple(  # pylint: disable=invalid-name
     ])
 
 
+@six.add_metaclass(abc.ABCMeta)
 class TimeSeriesModel(object):
   """Base class for creating generative time series models."""
 
-  __metaclass__ = abc.ABCMeta
-
   def __init__(self,
                num_features,
                exogenous_feature_columns=None,
@@ -712,7 +714,7 @@ class SequentialTimeSeriesModel(TimeSeriesModel):
         `outputs` and computed in state_update_fn.
     """
     times = ops.convert_to_tensor(times, dtype=dtypes.int64)
-    window_static_shape = times.get_shape()[1].value
+    window_static_shape = tensor_shape.dimension_value(times.shape[1])
     if self._static_unrolling_window_size_threshold is None:
       static_unroll = False
     else:
@@ -789,7 +791,7 @@ class SequentialTimeSeriesModel(TimeSeriesModel):
         [_window_size_tensor_array(self.dtype) for _ in outputs]]
     if static_unroll:
       arguments = initial_loop_arguments
-      for step_number in range(times.get_shape()[1].value):
+      for step_number in range(tensor_shape.dimension_value(times.shape[1])):
         arguments = _state_update_step(
             array_ops.constant(step_number, dtypes.int32), *arguments[1:],
             reuse=(step_number > 0))  # Variable sharing between steps
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_management.py b/tensorflow/contrib/timeseries/python/timeseries/state_management.py
index 13eecd4d822faaeb9553c7723c6842cdcb38aa3f..138406c6168f48339f6b96102d26e868f36083d3 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_management.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_management.py
@@ -149,6 +149,7 @@ class ChainingStateManager(_OverridableStateManager):
         key_dtype=dtypes.int64,
         default_values=self._start_state,
         empty_key=-1,
+        deleted_key=-2,
         name="cached_states",
         checkpoint=self._checkpoint_state)
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index 3c07a74ed8af9e3ab70408f9b43cb62b6bd4c7f2..125750e7639ad40c481472a93353e6fb7055be96 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -40,7 +40,10 @@ py_test(
     timeout = "long",  # Moderate but for asan
     srcs = ["state_space_model_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_mac",
+        "no_windows",  # TODO: needs investigation on Windows
+    ],
     deps = [
         ":state_space_model",
         "//tensorflow/contrib/layers:layers_py",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
index 7fa538a16ecd7dcf39beeb001992fd7927cee70b..e9e2ac0aaf4c4d6c41f5007662f261af3de9bbd1 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 from tensorflow.contrib import distributions
 
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
@@ -32,11 +34,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.util import nest
 
 
+@six.add_metaclass(abc.ABCMeta)
 class FilteringStepPostprocessor(object):
   """Base class for processors that are applied after each filter step."""
 
-  __metaclass__ = abc.ABCMeta
-
   @abc.abstractmethod
   def process_filtering_step(self, current_times, current_values,
                              predicted_state, filtered_state, outputs):
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py
index d04c72100749fd0c96ac74cccbaeb93b7fcd5db4..2ecc7eafdaf1e3dc3a76f99f995e39261e333da7 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model.py
@@ -35,6 +35,7 @@ from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -510,7 +511,7 @@ class StateSpaceModel(model.SequentialTimeSeriesModel):
     estimated_state, estimated_state_covariance, previous_times = state
     state_transition = ops.convert_to_tensor(
         self.get_state_transition(), dtype=self.dtype)
-    state_dimension = state_transition.get_shape()[0].value
+    state_dimension = tensor_shape.dimension_value(state_transition.shape[0])
     # Learning the observation model would be redundant since we transform
     # `exogenous_values` to the state space via a linear transformation anyway.
     observation_model = linalg_ops.eye(
@@ -572,8 +573,9 @@ class StateSpaceModel(model.SequentialTimeSeriesModel):
     start_mean, start_covariance, previous_times = state
     with variable_scope.variable_scope("exogenous_noise_increasing_mean"):
       mean_addition = layers.fully_connected(
-          exogenous_values, start_mean.get_shape()[1].value, activation_fn=None)
-    state_dimension = start_covariance.get_shape()[1].value
+          exogenous_values,
+          tensor_shape.dimension_value(start_mean.shape[1]), activation_fn=None)
+    state_dimension = tensor_shape.dimension_value(start_covariance.shape[1])
     with variable_scope.variable_scope("exogenous_noise_increasing_covariance"):
       covariance_addition = (
           math_utils.transform_to_covariance_matrices(
@@ -712,7 +714,7 @@ class StateSpaceModel(model.SequentialTimeSeriesModel):
     """
     with variable_scope.variable_scope(self._variable_scope):
       state_dimension = ops.convert_to_tensor(
-          self.get_state_transition()).get_shape()[0].value
+          self.get_state_transition()).get_shape().dims[0].value
       if self._configuration.trainable_start_state:
         base_covariance = math_utils.variable_covariance_matrix(
             state_dimension, "prior_state_var",
@@ -742,7 +744,7 @@ class StateSpaceModel(model.SequentialTimeSeriesModel):
     with variable_scope.variable_scope(self._variable_scope):
       state_transition = ops.convert_to_tensor(
           self.get_state_transition(), dtype=self.dtype)
-      state_dimension = state_transition.get_shape()[0].value
+      state_dimension = state_transition.get_shape().dims[0].value
       return variable_scope.get_variable(
           name="prior_state_mean",
           shape=[state_dimension],
@@ -920,7 +922,7 @@ class StateSpaceModel(model.SequentialTimeSeriesModel):
       self, minimum_initial_variance=1e-5):
     state_noise_transform = ops.convert_to_tensor(
         self.get_noise_transform(), dtype=self.dtype)
-    state_noise_dimension = state_noise_transform.get_shape()[1].value
+    state_noise_dimension = state_noise_transform.get_shape().dims[1].value
     if self._input_statistics is not None:
       feature_variance = self._scale_variance(
           self._input_statistics.series_start_moments.variance)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
index 80126ac786e7fc41d334076fc19bea7d091e19ab..26857ba235e4fe13904ca4f1334f4662a795f8a8 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/state_space_model_test.py
@@ -185,9 +185,8 @@ class StateSpaceEquivalenceTests(test.TestCase):
             "exogenous": [-1., -2., -3., -4.]
         }))
     estimator.train(combined_input_fn, steps=1)
-    export_location = estimator.export_savedmodel(
-        self.get_temp_dir(),
-        estimator.build_raw_serving_input_receiver_fn())
+    export_location = estimator.export_saved_model(
+        self.get_temp_dir(), estimator.build_raw_serving_input_receiver_fn())
     with ops.Graph().as_default() as graph:
       random_model.initialize_graph()
       with self.session(graph=graph) as session:
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/test_utils.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/test_utils.py
index 5f127700d99f1a9cf2549e2fdb57ce6090440ac7..f7f5024b34218ceb04d13bb351f6d2d302069bce 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/test_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/test_utils.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.timeseries.python.timeseries import math_utils
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -35,7 +36,7 @@ def transition_power_test_template(test_case, model, num_steps):
   transition_matrix = ops.convert_to_tensor(
       model.get_state_transition(), dtype=model.dtype)
   step_number = array_ops.placeholder(shape=[], dtype=dtypes.int64)
-  state_dimension = transition_matrix.get_shape()[0].value
+  state_dimension = tensor_shape.dimension_value(transition_matrix.shape[0])
   previous_matrix = array_ops.placeholder(
       shape=[state_dimension, state_dimension], dtype=transition_matrix.dtype)
   true_single_step_update = math_ops.matmul(previous_matrix,
@@ -63,8 +64,8 @@ def noise_accumulator_test_template(test_case, model, num_steps):
       model.get_state_transition(), dtype=model.dtype)
   noise_transform = ops.convert_to_tensor(
       model.get_noise_transform(), dtype=model.dtype)
-  state_dimension = transition_matrix.get_shape()[0].value
-  state_noise_dimension = noise_transform.get_shape()[1].value
+  state_dimension = tensor_shape.dimension_value(transition_matrix.shape[0])
+  state_noise_dimension = tensor_shape.dimension_value(noise_transform.shape[1])
   gen_noise_addition = math_utils.sign_magnitude_positive_definite(
       raw=random_ops.random_normal(
           shape=[state_noise_dimension, state_noise_dimension],
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
index 6746dd7b433466c473402e0e8374377093a73492..ddee749b498121ee62c13bde59680269bc497d23 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/varma.py
@@ -52,6 +52,7 @@ from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries.state_space_models import state_space_model
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
@@ -191,7 +192,8 @@ class VARMA(state_space_model.StateSpaceModel):
       initial_transition_noise_scale = 0.
     state_noise_transform = ops.convert_to_tensor(
         self.get_noise_transform(), dtype=self.dtype)
-    state_noise_dimension = state_noise_transform.get_shape()[1].value
+    state_noise_dimension = tensor_shape.dimension_value(
+        state_noise_transform.shape[1])
     return math_utils.variable_covariance_matrix(
         state_noise_dimension, "state_transition_noise",
         dtype=self.dtype,
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 10ed1c289109610b25b998eee332a3ddbf940dc3..4bf3a0463d9046eea2f60e9154fca1357e728215 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -14,10 +14,12 @@ load("//tensorflow:tensorflow.bzl", "tf_py_test")
 package(
     default_visibility = [
         "//cloud/vmm/testing/tests/tpu:__subpackages__",
+        "//knowledge/cerebra/sense/im2query:__subpackages__",
         "//learning/brain:__subpackages__",
         "//learning/deepmind:__subpackages__",
         "//medical/pathology:__subpackages__",
         "//tensorflow:__subpackages__",
+        "//vr/perception:__subpackages__",
     ],
 )
 
@@ -69,8 +71,6 @@ py_library(
     deps = [
         ":async_checkpoint",
         ":tpu_lib",
-        "//tensorflow/compiler/xla/experimental/xla_sharding",
-        "//tensorflow/compiler/xla/python_api:xla_shape",
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -79,6 +79,7 @@ py_library(
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:summary_ops_v2",
@@ -86,6 +87,7 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:util",
         "@six_archive//:six",
     ],
 )
@@ -174,6 +176,7 @@ tf_custom_op_py_library(
     deps = [
         ":profiler",
         ":tpu_ops",
+        "//tensorflow/contrib/compiler:xla",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -191,6 +194,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":keras_support",  # split out to avoid cycle with tpu_strategy
+        ":tpu_embedding",
         ":tpu_estimator",
         ":tpu_lib",
     ],
@@ -207,11 +211,13 @@ py_library(
         "//cloud/vmm/testing/tests/tpu:__subpackages__",
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
-        "//third_party/cloud_tpu/models/keras:__subpackages__",
+        "//third_party/cloud_tpu/models/keras_colab:__subpackages__",
+        "//third_party/cloud_tpu/models/mnist_keras:__subpackages__",
+        "//third_party/cloud_tpu/models/resnet50_keras:__subpackages__",
     ],
     deps = [
         ":tpu_lib",
-        "//tensorflow/contrib/cluster_resolver:tpu_cluster_resolver_py",
+        "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/distribute",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
@@ -242,6 +248,7 @@ py_library(
         "python/tpu/bfloat16.py",
         "python/tpu/device_assignment.py",
         "python/tpu/session_support.py",
+        "python/tpu/tensor_tracer.py",
         "python/tpu/topology.py",
         "python/tpu/tpu.py",
         "python/tpu/tpu_feed.py",
@@ -256,7 +263,10 @@ py_library(
         ":datasets",
         ":profiler",
         ":tpu_py",
-        "//tensorflow/contrib/cluster_resolver:tpu_cluster_resolver_py",
+        "//tensorflow/compiler/xla/experimental/xla_sharding",
+        "//tensorflow/compiler/xla/python_api:xla_shape",
+        "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
+        "//tensorflow/contrib/compiler:xla",
         "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
         "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_py",
         "//tensorflow/contrib/tpu/proto:topology_proto_py",
@@ -302,6 +312,7 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
         ":datasets",
     ],
+    flaky = 1,  # TODO(b/117363808): fails 1/1000 OSS runs
     grpc_enabled = True,
 )
 
@@ -352,17 +363,6 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
-    name = "tpu_function_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_function_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
 tf_py_test(
     name = "tpu_config_test",
     size = "small",
@@ -394,3 +394,21 @@ tf_py_test(
         "//tensorflow/python:framework_test_lib",
     ],
 )
+
+py_library(
+    name = "tpu_embedding",
+    srcs = ["python/tpu/tpu_embedding.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/tpu:tpu_ops",
+        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 38d1c3049ef7185f2f9f448361029d066678cdae..541fbf33a302a4d850422885fdbbc438bd6b9b7b 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -94,13 +94,6 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
-tf_proto_library(
-    name = "tf_op_stats_proto",
-    srcs = ["tf_op_stats.proto"],
-    cc_api_version = 2,
-    visibility = ["//visibility:public"],
-)
-
 tf_proto_library(
     name = "tpu_profiler_analysis_proto",
     srcs = ["tpu_profiler_analysis.proto"],
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index b4b06a40a2c8aaa97ff82baf93c8f2d55a587e37..ef35e84ba5205fb76e5afe77e670d87197ca8405 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -98,7 +98,7 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
   if (!status.ok()) {
     return errors::Internal(
         "Failed to convert op profile to json. Skipping... ",
-        string(status.error_message()));
+        string(status.message()));
   }
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, json));
   if (os) {
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 63641e00c5dbf4b4e635ecfea8bef98c7d0b7075..a081c4354a779d37140338793e66844c3fcf7a12 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -90,12 +90,12 @@ def main(unused_argv=None):
   tf_version = tf.__version__
   print('TensorFlow version %s detected' % tf_version)
 
-  if FLAGS.service_addr is None and FLAGS.tpu is None:
+  if not FLAGS.service_addr and not FLAGS.tpu:
     sys.exit('You must specify either --service_addr or --tpu.')
 
   tpu_cluster_resolver = None
-  if FLAGS.service_addr is not None:
-    if FLAGS.tpu is not None:
+  if FLAGS.service_addr:
+    if FLAGS.tpu:
       tf.logging.warn('Both --service_addr and --tpu are set. Ignoring '
                       '--tpu and using --service_addr.')
     service_addr = FLAGS.service_addr
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
deleted file mode 100644
index f88dc516361b90dd4b7e42e75a11d8d8c6651a8d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ /dev/null
@@ -1,255 +0,0 @@
-// This proto describes the format of tensorflow operation level stats for
-// profiling (in tensorboard) purpose.
-
-syntax = "proto2";
-
-package tensorflow.tpu;
-
-// Result proto for OpMetrics.
-message OpMetricsResult {
-  // True if this OP is executed on the device; False if it is executed on the
-  // host.
-  optional bool on_device = 1;
-  reserved 2;  // was uint32 id.
-  // Name of this OP.
-  optional string name = 3;
-  // Rank of this OP.
-  optional uint64 rank = 4;
-  // The starting time in cycles of the last instance of this OP executed.
-  optional double last_starttime_in_cycles = 5;
-  // The ending time in cycles of the last instance of this OP executed.
-  optional double last_endtime_in_cycles = 6;
-  // If this OP (say A), is an immediate child of another OP (say B), this field
-  // stores the sum of duration in microseconds of A inside B. If A appears more
-  // than once in B, the duration of all A's appearances will be added together.
-  // This sum will be reset after the self-time of B is calculated so that it
-  // can be reused for a new parent OP.
-  optional double sum_of_duration_in_us_as_children = 7;
-  // Number of instances that this OP occurred.
-  optional uint64 occurrences = 8;
-  // Total time in microseconds spent in this OP (accumulated
-  // over all of its occurrences).
-  optional double total_time_in_us = 9;
-  // Total self time in microseconds spent in this OP
-  // (accumulated over all of its occurrences).
-  optional double total_self_time_in_us = 10;
-  // The total self time as a fraction of sum of all OP's
-  // total self time on the host.
-  optional double host_total_self_time_as_fraction_of_all_op_time = 11;
-  // Cumulative total self time in fraction on the host.
-  optional double host_cumulative_total_self_time_as_fraction_of_all_op_time =
-      12;
-  // The total self time as a fraction of sum of all OP's
-  // total self time on the device.
-  optional double device_total_self_time_as_fraction_of_all_op_time = 13;
-  // Cumulative total self time in fraction on the device.
-  optional double device_cumulative_total_self_time_as_fraction_of_all_op_time =
-      14;
-  // Total number of FLOPs incurred by this OP.
-  optional double total_flops = 15;
-  // Total number of bytes accessed by this OP.
-  optional double total_bytes_accessed = 16;
-  // Total time in microseconds that special hw unit 1 is occupied by this OP.
-  optional double unit1_occupancy_in_us = 17;
-  // Total time in microseconds that special hw unit 2 is occupied by this OP.
-  optional double unit2_occupancy_in_us = 18;
-  // Total memory stall time in microseconds.
-  optional double total_memory_stall_in_us = 19;
-}
-
-// Result proto for OpMetricsDb.
-message OpMetricsDbResult {
-  // A bunch of OpMetricsResults.
-  repeated OpMetricsResult metrics_db = 1;
-  // The total host infeed-enqueue duration in picoseconds.
-  optional uint64 total_host_infeed_enq_duration_ps = 2;
-  // The total of the difference between the start times of two
-  // consecutive infeed-enqueues (per host) in picoseconds.
-  optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
-  // The total device time in microseconds.
-  optional double total_device_time_in_us = 4;
-  // The total host time in microseconds.
-  optional double total_host_time_in_us = 5;
-}
-
-// Result proto for StepInfo.
-message StepInfoResult {
-  // The (micro) step number.
-  optional uint32 step_num = 1;
-  // The step duration in picoseconds.
-  optional uint64 duration_ps = 2;
-  // The infeed duration in picoseconds.
-  optional uint64 infeed_duration_ps = 3;
-  // The outfeed duration in picoseconds.
-  optional uint64 host_outfeed_ps = 8;
-  // The start time of this step in picoseconds.
-  optional uint64 begin_ps = 4;
-  // The waiting time within this step in picoseconds.
-  optional uint64 wait_duration_ps = 5;
-  // The unit b outfeed duration in picoseconds.
-  optional uint64 unit_b_outfeed_ps = 9;
-  // The time spent on cross-replica-sum in picoseconds.
-  optional uint64 crs_duration_ps = 6;
-  // Percentage of unit b time spent on infeed.
-  optional double unit_b_infeed_percent = 7;
-}
-
-// Result proto for a sequence of steps.
-message StepSequenceResult {
-  // A sequence of StepInfoResults.
-  repeated StepInfoResult step_sequence = 1;
-}
-
-// Result proto for a StepDatabase.
-message StepDatabaseResult {
-  // A map from core_id to StepSequenceResult.
-  map<uint32, StepSequenceResult> step_sequence_per_core = 1;
-}
-
-// Result proto for looping-related metrics.
-message LoopingResult {
-  // The total iteration time in nanoseconds.
-  optional double iteration_time_ns = 1;
-  // The total number of iterations.
-  optional int32 num_iterations = 2;
-  // The total computation time in nanoseconds.
-  optional double computation_time_ns = 3;
-  // The total number of computations.
-  optional int32 num_computations = 4;
-}
-
-// Result proto for HloExtraInfo.
-message HloExtraInfoResult {
-  // Category of the HLO op given by the compiler.
-  optional string category = 1;
-  // The long name of the HLO that includes the dimensions.
-  optional string long_name = 2;
-  // The per-TPU-core batch size inferred from this HLO.
-  optional int64 per_core_batch_size = 3;
-}
-
-// Result proto for HloExtraInfoMap.
-message HloExtraInfoMapResult {
-  // A map from HLO name to HloExtraInfo.
-  map<string, HloExtraInfoResult> hlo_extrainfo_map = 1;
-}
-
-// Result proto for host-independent job information.
-message HostIndependentJobInfoResult {
-  // The change-list number of this build.
-  optional int64 change_list = 1;
-  // The time of this build.
-  optional int64 build_time = 2;
-  // The target of this build.
-  optional string build_target = 3;
-}
-
-// Result proto for host-dependent job information.
-message HostDependentJobInfoResult {
-  // This ID of the host where the job was run on.
-  optional string host_id = 1;
-  // The command line used to run the job.
-  optional string command_line = 2;
-  // The start time of the job on this host.
-  optional int64 start_time = 3;
-}
-
-// Result proto for RunEnvironment (the run environment of a profiling session).
-message RunEnvironmentResult {
-  // Number of hosts used.
-  optional int32 host_count = 1;
-  // The type of TPU used.
-  optional string tpu_type = 2;
-  // The number of TPU cores used.
-  optional int32 tpu_core_count = 3;
-  // The per-TPU-core batch size.
-  optional int32 per_core_batch_size = 4;
-  // Host-independent job information.
-  optional HostIndependentJobInfoResult host_independent_job_info = 5;
-  // Host-dependent job information.
-  repeated HostDependentJobInfoResult host_dependent_job_info = 6;
-}
-
-// The types of host operations that are tracked.
-enum HostOp {
-  // Invalid host op.
-  kINVALIDHostOp = 0;
-  // Each of host op type has two parts:
-  // (1) the stage where the op happens and (2) the op name.
-  // stage = Input Data Producer, op = Get Next Batch.
-  kInputDataProducerGetNextBatch = 1;
-  // stage = Input Data Producer, op = Session Run.
-  kInputDataProducerSessionRun = 2;
-  // stage = Input Data Producer, op = Forward Batch.
-  kInputDataProducerForwardBatch = 3;
-  // stage = Infeed Thread, op = Get Next Batch.
-  kInfeedThreadGetNextBatch = 4;
-  // stage = Infeed Thread, op = Session Run.
-  kInfeedThreadSessionRun = 5;
-  // stage = Infeed Thread, op = Forward Batch.
-  kInfeedThreadForwardBatch = 6;
-  // stage = Outfeed Thread, op = Get Next Batch.
-  kOutfeedThreadGetNextBatch = 7;
-  // stage = Outfeed Thread, op = Session Run.
-  kOutfeedThreadSessionRun = 8;
-  // stage = Outfeed Thread, op = Forward Batch.
-  kOutfeedThreadForwardBatch = 9;
-}
-
-// Result proto for the host ops per TPU step.
-message HostOpsPerTpuStep {
-  // Whether the data in this message is valid.
-  optional bool valid = 1 [default = false];
-  // The current TPU step number.
-  optional uint32 tpu_step_num = 2;
-  // The beginning time of the current TPU step on the device in picoseconds.
-  optional uint64 tpu_step_begin_ps = 3;
-  // The ending time of the current TPU step on the device in picoseconds.
-  optional uint64 tpu_step_end_ps = 4;
-  // For each possible host operation, maps to the difference between the TPU
-  // step number that the host op targets and the current TPU step number.
-  // The key is HostOp, value is the step difference.
-  map<int32, int32> step_diffs = 5;
-}
-
-message HostOpsDetailsPerCore {
-  // Map from core id to HostOpsPerTpuStep.
-  map<int32, HostOpsPerTpuStep> core_map = 1;
-}
-
-message HostOpsDetailsPerHost {
-  // Map from hostname to a map from core id to HostOpsPerTpuStep.
-  map<string, HostOpsDetailsPerCore> host_map = 1;
-}
-
-// Result proto for the host ops for all TPU steps.
-message HostOpsResult {
-  reserved 1;  // (was repeated HostOpsPerTpuStep host_op_sequence)
-  // A sequence of records with one for each TPU step. Each record
-  // is a map from hostname to a map from core id to HostOpsPerTpuStep.
-  repeated HostOpsDetailsPerHost hostops_details = 2;
-}
-
-// Result proto for TfStatsHelper.
-message TfOpStats {
-  // The result for the TF-metric database.
-  optional OpMetricsDbResult tf_metrics_db = 1;
-  // The result for the HLO-metric database.
-  optional OpMetricsDbResult hlo_metrics_db = 2;
-  // The result for the step database.
-  optional StepDatabaseResult step_db = 3;
-  // The result for the looping-related metrics.
-  optional LoopingResult looping = 4;
-  // The result for the HloExtraInfoMap.
-  optional HloExtraInfoMapResult hlo_extrainfo_map = 5;
-  // Overall matrix unit utilization in percentage.
-  optional double matrix_unit_utilization_percent = 6;
-  // The run environment of this profiling session.
-  optional RunEnvironmentResult run_environment = 7;
-  // The result for the host operations.
-  optional HostOpsResult host_ops = 8;
-  // A map from core ID to name.
-  map<uint32, string> core_id_to_name_map = 9;
-  // The result for hw unit b stats.
-  optional bytes unit_b_stats = 10;
-}
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index 8529b48c15fa749a7c2978003f47aafca17da82f..aae1ab1d37a166303883e3a07a7a01efe2feab51 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -62,9 +62,9 @@ message FtrlParameters {
 // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
 // use_non_lazy_adam is enabled, use_gradient_accumulation is also required in
 // order to get correct results; a warning will be printed otherwise (which may
-// change to an error in the future). If use_max_with_epsilon is set, the Adam
+// change to an error in the future). If use_sum_inside_sqrt is set, the Adam
 // variable update formula will be changed from m / (sqrt(v) + epsilon) to
-// m / max(sqrt(v), abs(epsilon)); this option improves the performance of TPU
+// m / sqrt(v + epsilon**2); this option improves the performance of TPU
 // training and is not expected to harm model quality.
 message AdamParameters {
   float beta1 = 3;
@@ -73,7 +73,7 @@ message AdamParameters {
   float initial_m = 6;
   float initial_v = 7;
   bool use_non_lazy_adam = 8;
-  bool use_max_with_epsilon = 9;
+  bool use_sum_inside_sqrt = 10;
 }
 
 // https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer
@@ -154,6 +154,14 @@ message OptimizationParameters {
   // updates; not present means no limits are applied.
   ClippingLimits gradient_clipping_limits = 7;
 
+  // Amount of weight decay to apply; see weight_decay_optimizers.py for
+  // details. Almost all optimizers are supported with this option (MDL Adagrad
+  // Light does not work, and SGD does not behave as expected if it is enabled).
+  // Although there is no check, users who want weight decay will probably also
+  // want to enable gradient accumulation as well so that the decay will happen
+  // once per minibatch.
+  float weight_decay_factor = 16;
+
   // Whether to use gradient accumulation (do two passes over the input
   // gradients: one to accumulate them into a temporary array and another to
   // apply them using the actual optimization algorithm). This feature is
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index 968adccf2b82e80bd008e54d3af614bd74852795..6a6eba282a12d68cc3cd4e46a46a1b4190fb737b 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -137,10 +137,18 @@ if platform.system() != "Windows":
     """
     return gen_tpu_ops.collective_permute(x, source_target_pairs, name=name)
 
+  @ops.RegisterGradient("CollectivePermute")
+  def _collective_permute_grad(op, grad):
+    # The gradient of a collective permute operation is also a collective
+    # permute, but with source/target pairs reversed. The gradient with respect
+    # to input argument `source_target_pairs` is `None`.
+    source_target_pairs = op.inputs[1][:, ::-1]
+    return [gen_tpu_ops.collective_permute(grad, source_target_pairs), None]
+
   @ops.RegisterGradient("CrossReplicaSum")
   def _cross_replica_sum_grad(op, grad):
     # The gradient of a cross replica sum is also a cross-replica sum.
-    # The graident with respect to group_assignment is None.
+    # The gradient with respect to group_assignment is None.
     return [gen_tpu_ops.cross_replica_sum(grad, op.inputs[1]), None]
 
   # This extra type checking exists to give a more helpful error message in
diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
index 20b7ba09976f39a94414f985631ce997cca5236a..1b09ce173a64ba3f93ec019c8fd65dc4710f0fcf 100644
--- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
+++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
@@ -69,6 +69,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
       raise ValueError("You cannot provide both saver and scaffold.")
     self._saver = saver
     self._save_thread = None
+    self._write_graph_thread = None
     self._checkpoint_dir = checkpoint_dir
     self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
     self._scaffold = scaffold
@@ -79,6 +80,8 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
     self._summary_writer = None
     self._global_step_tensor = None
 
+    self._last_checkpoint_step = None
+
   def _set_steps_per_run(self, steps_per_run):
     self._steps_per_run = steps_per_run
 
@@ -97,9 +100,14 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
     # We do write graph and saver_def at the first call of before_run.
     # We cannot do this in begin, since we let other hooks to change graph and
     # add variables in begin. Graph is finalized after all begin calls.
-    training_util.write_graph(
-        ops.get_default_graph().as_graph_def(add_shapes=True),
-        self._checkpoint_dir, "graph.pbtxt")
+    def _write_graph_fn(self):
+      training_util.write_graph(
+          ops.get_default_graph().as_graph_def(add_shapes=True),
+          self._checkpoint_dir, "graph.pbtxt")
+    self._write_graph_thread = threading.Thread(target=_write_graph_fn,
+                                                args=[self])
+    self._write_graph_thread.start()
+
     saver_def = self._get_saver().saver_def if self._get_saver() else None
     graph = ops.get_default_graph()
     meta_graph_def = meta_graph.create_meta_graph_def(
@@ -114,25 +122,24 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
     return SessionRunArgs(self._global_step_tensor)
 
   def after_run(self, run_context, run_values):
-    stale_global_step = run_values.results
-    if self._timer.should_trigger_for_step(stale_global_step +
-                                           self._steps_per_run):
-      # get the real value after train op.
-      global_step = run_context.session.run(self._global_step_tensor)
-      if self._timer.should_trigger_for_step(global_step):
-        self._timer.update_last_triggered_step(global_step)
-        if self._save(run_context.session, global_step):
-          run_context.request_stop()
+    global_step = run_context.session.run(self._global_step_tensor)
+    if self._timer.should_trigger_for_step(global_step):
+      self._timer.update_last_triggered_step(global_step)
+      logging.info("Triggering checkpoint. %s", global_step)
+      if self._save(run_context.session, global_step):
+        run_context.request_stop()
 
   def end(self, session):
     if self._save_thread:
       logging.info("Waiting for any pending checkpoints to finish.")
       self._save_thread.join()
+    if self._write_graph_thread:
+      logging.info("Waiting for any pending write_graph to finish.")
+      self._write_graph_thread.join()
 
     last_step = session.run(self._global_step_tensor)
 
-    # Save the last checkpoint synchronously if needed.
-    if last_step != self._timer.last_triggered_step():
+    if self._last_checkpoint_step != last_step:
       self._save(session, last_step, asynchronous=False)
 
     for l in self._listeners:
@@ -158,15 +165,17 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
           SessionLog(
               status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
           step)
+
+      for l in self._listeners:
+        l.after_save(session, step)
+
       end_time = time.time()
       logging.info("Checkpoint actual writing time: (%.3f sec)",
                    end_time - start_time)
       logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
 
-    for l in self._listeners:
-      l.before_save(session, step)
-
     if not asynchronous:
+      self._last_checkpoint_step = step
       _save_fn()
       return
 
@@ -176,6 +185,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
         logging.info("Saver thread still in progress, skipping checkpoint.")
         return
 
+    self._last_checkpoint_step = step
     self._save_thread = threading.Thread(target=_save_fn)
     self._save_thread.start()
 
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index c694e9c1bca10d9930492c29dd1c3cbc7f7f5d04..8d6245390fc3fa005c92d01bc9b64ddb47583582 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -133,7 +133,7 @@ def StreamingFilesDataset(files,
   with ops.device('/job:%s' % file_reader_job):
     if isinstance(files, str):
       source_dataset = dataset_ops.Dataset.list_files(files)
-    elif isinstance(files, dataset_ops.Dataset):
+    elif isinstance(files, dataset_ops.DatasetV2):
       source_dataset = files
     else:
       raise ValueError('files was not a string or a dataset: %s' % files)
@@ -156,7 +156,7 @@ def StreamingFilesDataset(files,
 
     source_dataset = source_dataset.prefetch(1)
 
-    source_iterator = source_dataset.make_one_shot_iterator()
+    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
     source_handle = source_iterator.string_handle()
 
   @function.Defun(dtypes.string)
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
index b58d05eac56f3586e183333f7c1a3867ee57456c..52d87b800401c3e584da9843916cfc7a767c082a 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
@@ -70,7 +70,7 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         os.path.join(self.get_temp_dir(), 'text_line.*.txt'), filetype='text')
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -94,7 +94,7 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         os.path.join(self.get_temp_dir(), 'tf_record*'), filetype='tfrecord')
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -121,7 +121,7 @@ class DatasetsTest(test.TestCase):
 
     dataset = datasets.StreamingFilesDataset(filenames, filetype='tfrecord')
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -154,7 +154,7 @@ class DatasetsTest(test.TestCase):
         os.path.join(self.get_temp_dir(), 'fixed_length*'),
         filetype=FixedLengthFile)
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -177,7 +177,7 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         dataset_ops.Dataset.range(10), filetype=gen_dataset)
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
index b9e2a4287a97d2f2d8cb2fb73a12d3f24f090007..6906501ecf90c8e577aa0becf2dba818deb19df4 100644
--- a/tensorflow/contrib/tpu/python/tpu/device_assignment.py
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -25,20 +25,29 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib.tpu.python.tpu.topology import Topology
 
 
-def _tpu_device_name(job, task, device):
-  """Returns the device name for the TPU `device` on `task` of `job`."""
-  if job is None:
-    return "/task:%d/device:TPU:%d" % (task, device)
-  else:
-    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
+def _compute_task_and_cores_to_replicas(core_assignment, topology):
+  """Computes a nested dict which maps task and logical core to replicas."""
+  task_and_cores_to_replicas = {}
+  for replica in xrange(core_assignment.shape[0]):
+    for logical_core in xrange(core_assignment.shape[1]):
+      coordinates = core_assignment[replica, logical_core, :]
+      task_id = topology.task_ordinal_at_coordinates(coordinates)
+      if task_id not in task_and_cores_to_replicas:
+        task_and_cores_to_replicas[task_id] = {}
+      if logical_core not in task_and_cores_to_replicas[task_id]:
+        task_and_cores_to_replicas[task_id][logical_core] = set()
 
+      task_and_cores_to_replicas[task_id][logical_core].add(replica)
 
-def _tpu_host_device_name(job, task):
-  """Returns the device name for the CPU device on `task` of `job`."""
-  if job is None:
-    return "/task:%d/device:CPU:0" % task
-  else:
-    return "/job:%s/task:%d/device:CPU:0" % (job, task)
+  task_to_sorted_replica_id = {}
+
+  for task, core_to_replicas in task_and_cores_to_replicas.items():
+    core_to_sorted_replicas = {}
+    for core, replicas in core_to_replicas.items():
+      core_to_sorted_replicas[core] = sorted(replicas)
+
+    task_to_sorted_replica_id[task] = core_to_sorted_replicas
+  return task_to_sorted_replica_id
 
 
 class DeviceAssignment(object):
@@ -68,10 +77,7 @@ class DeviceAssignment(object):
     core_assignment = np.asarray(core_assignment, dtype=np.int32)
 
     self._topology = topology
-    self._topology_tasks, self._topology_devices = (
-        self._invert_topology(topology))
 
-    topology_rank = self._topology_tasks.ndim
     if core_assignment.ndim != 3:
       raise ValueError("core_assignment must be a rank 3 numpy array, "
                        "got shape {}".format(core_assignment.shape))
@@ -79,52 +85,15 @@ class DeviceAssignment(object):
     self._num_replicas = core_assignment.shape[0]
     self._num_cores_per_replica = core_assignment.shape[1]
 
-    if core_assignment.shape[-1] != topology_rank:
+    if core_assignment.shape[-1] != topology.mesh_rank:
       raise ValueError(
           "minor dimension of core_assignment must have size equal to topology "
-          "rank ({}), got shape {}".format(topology_rank,
+          "rank ({}), got shape {}".format(topology.mesh_rank,
                                            core_assignment.shape))
 
     self._core_assignment = core_assignment
-    self._task_and_cores_to_replicas = self._compute_task_and_cores_to_replicas(
-        self._core_assignment, self._topology_tasks)
-
-  def _invert_topology(self, topology):
-    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
-    mesh_shape = topology.mesh_shape
-    tasks = np.full(list(mesh_shape), -1, dtype=np.int32)
-    devices = np.full(list(mesh_shape), -1, dtype=np.int32)
-    for task in xrange(topology.device_coordinates.shape[0]):
-      for device in xrange(topology.device_coordinates.shape[1]):
-        x, y, z = topology.device_coordinates[task, device, :]
-        tasks[x, y, z] = task
-        devices[x, y, z] = device
-    return tasks, devices
-
-  def _compute_task_and_cores_to_replicas(self, core_assignment,
-                                          topology_tasks):
-    """Computes a nested dict which maps task and logical core to replicas."""
-    task_and_cores_to_replicas = {}
-    for replica in xrange(core_assignment.shape[0]):
-      for logical_core in xrange(core_assignment.shape[1]):
-        x, y, z = core_assignment[replica, logical_core, :]
-        task_id = topology_tasks[x, y, z]
-        if task_id not in task_and_cores_to_replicas:
-          task_and_cores_to_replicas[task_id] = {}
-        if logical_core not in task_and_cores_to_replicas[task_id]:
-          task_and_cores_to_replicas[task_id][logical_core] = set()
-
-        task_and_cores_to_replicas[task_id][logical_core].add(replica)
-
-    task_to_sorted_replica_id = {}
-
-    for task, core_to_replicas in task_and_cores_to_replicas.items():
-      core_to_sorted_replicas = {}
-      for core, replicas in core_to_replicas.items():
-        core_to_sorted_replicas[core] = sorted(replicas)
-
-      task_to_sorted_replica_id[task] = core_to_sorted_replicas
-    return task_to_sorted_replica_id
+    self._task_and_cores_to_replicas = _compute_task_and_cores_to_replicas(
+        self._core_assignment, topology)
 
   @property
   def topology(self):
@@ -179,18 +148,17 @@ class DeviceAssignment(object):
   def tpu_ordinal(self, replica=0, logical_core=0):
     """Returns the ordinal of the TPU device assigned to a logical core."""
     coordinates = self._coordinates(replica, logical_core)
-    return self._topology_devices[coordinates]
+    return self._topology.tpu_device_ordinal_at_coordinates(coordinates)
 
   def host_device(self, replica=0, logical_core=0, job=None):
     """Returns the CPU device attached to a logical core."""
     coordinates = self._coordinates(replica, logical_core)
-    return _tpu_host_device_name(job, self._topology_tasks[coordinates])
+    return self._topology.cpu_device_name_at_coordinates(coordinates, job=job)
 
   def tpu_device(self, replica=0, logical_core=0, job=None):
     """Returns the name of the TPU device assigned to a logical core."""
     coordinates = self._coordinates(replica, logical_core)
-    return _tpu_device_name(job, self._topology_tasks[coordinates],
-                            self._topology_devices[coordinates])
+    return self._topology.tpu_device_name_at_coordinates(coordinates, job=job)
 
 
 def device_assignment(topology,
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index a3a7fd8bb0b02ccd53964cb1960052571a50943e..4ce194590342555a7c4e9e119bf51e516a37a715 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -81,6 +81,7 @@ from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import models
 from tensorflow.python.keras import optimizers as keras_optimizers
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.engine import training_arrays
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.layers import embeddings
@@ -97,14 +98,25 @@ from tensorflow.python.platform import tf_logging as logging
 
 # TODO(b/114775106): temporary shim to optionally initialize the TPU
 # This increases the odds our session is initialized, but shouldn't be needed.
+_TEST_REWRITE_OP = None
+
+
 def _maybe_initialize_tpu(session):
   """Initialize the TPU if it has not already been initialized."""
+  global _TEST_REWRITE_OP
   try:
+    # Try to use cached version to avoid another ground of graph optimization.
+    test_rewrite_op = _TEST_REWRITE_OP
+    if (test_rewrite_op is None or
+        test_rewrite_op[0].graph != ops.get_default_graph()):
+
+      def test_op():
+        return constant_op.constant(1) + constant_op.constant(1)
 
-    def test_op():
-      return constant_op.constant(1) + constant_op.constant(1)
+      test_rewrite_op = tpu.rewrite(test_op)
+      _TEST_REWRITE_OP = test_rewrite_op
 
-    session.run(tpu.rewrite(test_op))
+    session.run(test_rewrite_op)
   except errors.FailedPreconditionError as _:
     session.run(tpu.initialize_system())
 
@@ -121,7 +133,7 @@ def _tpu_session_context():
 An error occurred connecting or initializing your TPU.
 
 The session has been reset. re-run keras_to_tpu_model to create a new session.
-""" + e)
+""" + str(e))
 
 
 def setup_tpu_session(cluster_resolver):
@@ -280,13 +292,14 @@ def _cross_replica_concat(tensor, core_id, num_cores, name):
   """
 
   input_dtype = tensor.dtype
-  if input_dtype not in [dtypes.float32, dtypes.int32]:
-    raise TypeError('For model replication, only (float32 and int32) is '
-                    'supported for model outputs and targets. Got {} for '
+  if input_dtype not in [dtypes.bfloat16, dtypes.float32, dtypes.int32]:
+    raise TypeError('For model replication, only (bfloat16, float32 and int32) '
+                    'is supported for model outputs and targets. Got {} for '
                     '{}.'.format(input_dtype, name))
 
   batch_size = tensor.shape[0]
-  mask = math_ops.to_float(math_ops.equal(range(num_cores), core_id))
+  mask = math_ops.to_float(
+      math_ops.equal(np.arange(num_cores, dtype=np.int32), core_id))
   mask = array_ops.reshape(mask, [num_cores] + [1] * tensor.shape.ndims)
   result = mask * math_ops.to_float(tensor)
   local_tensor_with_holes = array_ops.reshape(result,
@@ -361,7 +374,7 @@ def _replicated_optimizer(opt):
     return KerasCrossShardOptimizer(opt)
 
 
-def _clone_optimizer(optimizer, config=None):
+def _clone_optimizer(optimizer, config=None, worker_name=None):
   """Returns a cloned optimizer with the provided optimizer.config or config."""
   if not isinstance(optimizer, keras_optimizers.Optimizer):
     # In the first call to tpu_model(model), Keras may not have wrapped the TF
@@ -376,7 +389,10 @@ def _clone_optimizer(optimizer, config=None):
   if config is None:
     config = optimizer.get_config()
   logging.info('Cloning %s %s', optimizer.__class__.__name__, config)
-  return optimizer.__class__.from_config(config)
+  with ops.device(
+      '%s/device:CPU:0' % ('/job:%s' % worker_name if worker_name else '')):
+    # Explicitly put optimizer parameter variables on TPU worker.
+    return optimizer.__class__.from_config(config)
 
 
 class TPURewriteContext(object):
@@ -423,7 +439,7 @@ class TPURewriteContext(object):
 
     self._default_placeholder = array_ops.placeholder
     self._default_name_scope = ops.name_scope
-    self._default_make_variable = base_layer.make_variable
+    self._default_make_variable = base_layer_utils.make_variable
     self._default_random_normal = random_ops.random_normal
     self._default_qr = gen_linalg_ops.qr
 
@@ -471,14 +487,14 @@ class TPURewriteContext(object):
     gen_linalg_ops.qr = qr
 
     ops.name_scope = _name_scope
-    base_layer.make_variable = variable_scope.get_variable
+    base_layer_utils.make_variable = variable_scope.get_variable
     logging.info('Overriding default placeholder.')
     return
 
   def __exit__(self, exc_type, exc_val, exc_tb):
     array_ops.placeholder = self._default_placeholder
     ops.name_scope = self._default_name_scope
-    base_layer.make_variable = self._default_make_variable
+    base_layer_utils.make_variable = self._default_make_variable
     random_ops.random_normal = self._default_random_normal
     gen_linalg_ops.qr = self._default_qr
 
@@ -528,6 +544,7 @@ class TPUInfeedInstance(object):
     pass
 
 
+@six.add_metaclass(abc.ABCMeta)
 class TPUInfeedManager(object):
   """TPUInfeedManager manages the data infeeding of data to a TPU computation.
 
@@ -712,7 +729,7 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
     dummy_x_shape[0] *= tpu_assignment.num_towers
     dummy_y_shape = dataset.output_shapes[1].as_list()
     dummy_y_shape[0] *= tpu_assignment.num_towers
-    self._iterator = dataset.make_initializable_iterator()
+    self._iterator = dataset_ops.make_initializable_iterator(dataset)
     K.get_session().run(self._iterator.initializer)
 
     self._get_next_ops = []
@@ -753,7 +770,7 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
 
   def _verify_dataset_shape(self, dataset):
     """Verifies a dataset is of an appropriate shape for TPUs."""
-    if not isinstance(dataset, dataset_ops.Dataset):
+    if not isinstance(dataset, dataset_ops.DatasetV2):
       raise ValueError('The function passed as the `x` parameter did not '
                        'return a `tf.data.Dataset`.')
     if not isinstance(dataset.output_classes, tuple):
@@ -955,13 +972,14 @@ class TPUFunction(object):
               self._tpu_assignment.num_towers):
             if not self._cloned_optimizer:
               self._cloned_optimizer = _clone_optimizer(
-                  self.model.cpu_optimizer)
+                  self.model.cpu_optimizer,
+                  worker_name=self._tpu_assignment.worker_name)
 
             self._cloned_model = models.clone_model(self.model)
 
             # When running on more than one core, concatenate outputs at the end
             # of processing. In backprop stage, the gradients will be
-            # calculdated according to the local inputs as gradient of
+            # calculated according to the local inputs as gradient of
             # cross-replica-concat being zero for any outputs other than those
             # from mlocal core so the loss calculation is identical.
             num_towers = self.model._tpu_assignment.num_towers
@@ -972,6 +990,12 @@ class TPUFunction(object):
                       name='model output ({})'.format(o.name))
                   for o in self._cloned_model.outputs
               ]
+              # Recast all low precision outputs back to float32 since we only
+              # casted the inputs to bfloat16 and not targets. This is done so
+              # that we can preserve precision when calculating the loss value.
+              if new_outputs and new_outputs[0].dtype == dtypes.bfloat16:
+                new_outputs = [
+                    math_ops.cast(o, dtypes.float32) for o in new_outputs]
               self._cloned_model.outputs = new_outputs
               tpu_targets = [
                   _cross_replica_concat(
@@ -982,14 +1006,17 @@ class TPUFunction(object):
                   for tensor in tpu_targets
               ]
 
-            if is_training or is_test:
+          if is_training or is_test:
+            with variable_scope.variable_scope(
+                'metrics', reuse=variable_scope.AUTO_REUSE):
               self._cloned_model.compile(
                   optimizer=_replicated_optimizer(self._cloned_optimizer),
                   loss=self.model.loss,
                   loss_weights=self.model.loss_weights,
-                  metrics=metrics_module.clone_metrics(self.model.metrics),
+                  metrics=metrics_module.clone_metrics(
+                      self.model._compile_metrics),
                   weighted_metrics=metrics_module.clone_metrics(
-                      self.model.weighted_metrics),
+                      self.model._compile_weighted_metrics),
                   target_tensors=tpu_targets,
               )
 
@@ -1001,29 +1028,29 @@ class TPUFunction(object):
           # the Momentum optimizer) when _make_train_function is invoked.
           with keras_tpu_variables.replicated_variable_for_optimizer(
               self._tpu_assignment.num_towers):
-            self._cloned_model._make_train_function()
+            self._cloned_model._make_fit_function()
         else:
-          self._cloned_model._make_train_function()
+          self._cloned_model._make_fit_function()
 
         self._outfeed_spec = [
             tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
-            for tensor in self._cloned_model.train_function.outputs
+            for tensor in self._cloned_model._fit_function.outputs
         ]
         return [
-            self._cloned_model.train_function.updates_op,
+            self._cloned_model._fit_function.updates_op,
             tpu_ops.outfeed_enqueue_tuple(
-                self._cloned_model.train_function.outputs,
+                self._cloned_model._fit_function.outputs,
                 name='outfeed-enqueue-train')
         ]
       elif is_test:
-        self._cloned_model._make_test_function()
+        self._cloned_model._make_eval_function()
         self._outfeed_spec = [
             tensor_spec.TensorSpec(tensor.shape, tensor.dtype, tensor.name)
-            for tensor in self._cloned_model.test_function.outputs
+            for tensor in self._cloned_model._eval_function.outputs
         ]
         return [
             tpu_ops.outfeed_enqueue_tuple(
-                self._cloned_model.test_function.outputs,
+                self._cloned_model._eval_function.outputs,
                 name='outfeed-enqueue-test')
         ]
       elif is_predict:
@@ -1049,7 +1076,7 @@ class TPUFunction(object):
     # `execute op` replicates `_model_fn` `num_replicas` times, with each shard
     # running on a different logical core.
     compile_op, execute_op = tpu.split_compile_and_replicate(
-        _model_fn, inputs=[[]] * self._tpu_assignment.num_towers)
+        _model_fn, inputs=[[] for _ in range(self._tpu_assignment.num_towers)])
 
     # Generate CPU side operations to enqueue features/labels and dequeue
     # outputs from the model call.
@@ -1159,13 +1186,9 @@ class TPUFunction(object):
       # pipelined loop.
       return None, None
 
-    if (self.model.uses_learning_phase and
-        not isinstance(K.learning_phase(), int)):
+    if isinstance(inputs[-1], int):
       # Remove the learning_phase flag at the end. We currently hard code the
       # learning_phase in TPUFunction.
-      assert isinstance(inputs[-1], int), (
-          'Expect the final element be learning_phase flag. Got {}'.format(
-              inputs[-1]))
       inputs = inputs[:-1]
 
     if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or
@@ -1194,7 +1217,7 @@ class TPUFunction(object):
     """
     # TODO(xiejw): Decide how to reduce outputs, or discard all but first.
     if self.execution_mode == model_fn_lib.ModeKeys.PREDICT:
-      outputs = [[]] * len(self._outfeed_spec)
+      outputs = [[] for _ in range(len(self._outfeed_spec))]
       outputs_per_replica = len(self._outfeed_spec)
 
       for i in range(self._tpu_assignment.num_towers):
@@ -1353,6 +1376,9 @@ class KerasTPUModel(models.Model):
     self.predict_function = None
     self.test_function = None
     self.train_function = None
+    self._fit_function = None
+    self._eval_function = None
+    self._stateful_metric_functions = []
 
     cluster_resolver = strategy._tpu_cluster_resolver
     self._tpu_name_or_address = cluster_resolver.get_master()
@@ -1367,13 +1393,22 @@ class KerasTPUModel(models.Model):
       self.compile(
           self._cpu_model.optimizer,
           self._cpu_model.loss,
-          self._cpu_model.metrics,
+          self._cpu_model._compile_metrics,
           self._cpu_model.loss_weights,
           self._cpu_model.sample_weight_mode,
-          self._cpu_model.weighted_metrics,
+          self._cpu_model._compile_weighted_metrics,
           self._cpu_model.target_tensors,
       )
 
+    # This flag must be disabled upon model mutation, such as changing the model
+    # layers or recompiling the model to use a different optimizer. New function
+    # definitions are generated whenever this flag is disabled, ensuring that
+    # internal graph functions are always using the current model structure.
+    #
+    # Requires declaration here because this constructor skips the
+    # Model constructor.
+    self._built_graph_functions = False
+
   def get_config(self):
     return {
         'cpu_model': self._cpu_model,
@@ -1431,7 +1466,7 @@ class KerasTPUModel(models.Model):
       assert not self._numpy_to_infeed_manager_list  # Ensure empty.
 
       infeed_managers = []  # Managers to clean up at the end of the fit call.
-      if isinstance(x, dataset_ops.Dataset):
+      if isinstance(x, dataset_ops.DatasetV2):
         # TODO(b/111413240): Support taking a tf.data.Dataset directly.
         raise ValueError(
             'Taking a Dataset directly is not yet supported. Please '
@@ -1457,7 +1492,7 @@ class KerasTPUModel(models.Model):
           y = infeed_manager.dummy_y
           infeed_managers.append((x, infeed_manager))
 
-      if isinstance(validation_data, dataset_ops.Dataset):
+      if isinstance(validation_data, dataset_ops.DatasetV2):
         # TODO(b/111413240): Support taking a tf.data.Dataset directly.
         raise ValueError(
             'Taking a Dataset directly is not yet supported. Please '
@@ -1481,10 +1516,12 @@ class KerasTPUModel(models.Model):
 
       self._numpy_to_infeed_manager_list = infeed_managers
       try:
-        if not kwargs.get('_pipeline', True):
-          logging.info('Running non-pipelined training loop (`_pipeline=%s`).',
-                       kwargs['_pipeline'])
+        pipeline = kwargs.get('_pipeline', True)
+        if '_pipeline' in kwargs:
           kwargs.pop('_pipeline')
+        if not pipeline:
+          logging.info('Running non-pipelined training loop (`_pipeline=%s`).',
+                       pipeline)
           return super(KerasTPUModel, self).fit(
               x, y, batch_size, epochs, verbose, callbacks, validation_split,
               validation_data, shuffle, class_weight, sample_weight,
@@ -1503,11 +1540,18 @@ class KerasTPUModel(models.Model):
                verbose=1,
                sample_weight=None,
                steps=None):
-    assert not self._numpy_to_infeed_manager_list  # Ensure empty.
+    original_numpy_to_infeed_manager_list = []
+    if self._numpy_to_infeed_manager_list:
+      # evaluate call may be executed as callbacks during the training. In this
+      # case, _numpy_to_infeed_manager_list is not empty, so save it for
+      # recovery at the end of evaluate call.
+      original_numpy_to_infeed_manager_list = self._numpy_to_infeed_manager_list
+      self._numpy_to_infeed_manager_list = []
 
     with _tpu_session_context():
-      infeed_managers = []  # Managers to clean up at the end of the fit call.
-      if isinstance(x, dataset_ops.Dataset):
+      # Managers to clean up at the end of the evaluate call.
+      infeed_managers = []
+      if isinstance(x, dataset_ops.DatasetV2):
         # TODO(b/111413240): Support taking a tf.data.Dataset directly.
         raise ValueError(
             'Taking a Dataset directly is not yet supported. Please '
@@ -1536,7 +1580,8 @@ class KerasTPUModel(models.Model):
         return super(KerasTPUModel, self).evaluate(x, y, batch_size, verbose,
                                                    sample_weight, steps)
       finally:
-        self._numpy_to_infeed_manager_list = []
+        self._numpy_to_infeed_manager_list = (
+            original_numpy_to_infeed_manager_list)
 
   def _pipeline_fit(self, x, y, batch_size, epochs, verbose, callbacks,
                     validation_split, validation_data, shuffle, class_weight,
@@ -1605,7 +1650,7 @@ class KerasTPUModel(models.Model):
     self._make_train_function()
     sample_weights = sample_weights or []
     val_sample_weights = val_sample_weights or []
-    if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
+    if not isinstance(K.learning_phase(), int):
       ins = inputs + targets + sample_weights + [1]
     else:
       ins = inputs + targets + sample_weights
@@ -1631,14 +1676,10 @@ class KerasTPUModel(models.Model):
         callbacks,
         self,
         do_validation=do_validation,
-        val_inputs=val_inputs,
-        val_targets=val_targets,
-        val_sample_weights=val_sample_weights,
         batch_size=batch_size,
         epochs=epochs,
         steps_per_epoch=steps_per_epoch,
         samples=num_training_samples,
-        validation_steps=validation_steps,
         verbose=verbose,
         count_mode=count_mode)
 
@@ -1655,7 +1696,7 @@ class KerasTPUModel(models.Model):
     callbacks.on_train_begin()
     for epoch in range(initial_epoch, epochs):
       # Reset stateful metrics
-      for m in self.stateful_metric_functions:
+      for m in self.metrics:
         m.reset_states()
       # Update callbacks
       callbacks.on_epoch_begin(epoch)
@@ -1878,7 +1919,7 @@ class KerasTPUModel(models.Model):
     if validation_data:
       if (isinstance(validation_data, iterator_ops.Iterator) or
           isinstance(validation_data, iterator_ops.EagerIterator) or
-          isinstance(validation_data, dataset_ops.Dataset)):
+          isinstance(validation_data, dataset_ops.DatasetV2)):
         raise ValueError('KerasTPUModel cannot handle a Dataset or Iterator '
                          'for validation_data. Please instead pass a function '
                          'that returns a `tf.data.Dataset`.')
@@ -1952,10 +1993,21 @@ class KerasTPUModel(models.Model):
   def optimizer(self, optimizer):
     self._optimizer = optimizer
 
+  @property
+  def metrics(self):
+    if self._tpu_model:
+      return self._tpu_model.metrics
+    return self._stateful_metric_functions
+
+  @metrics.setter
+  def metrics(self, metrics):
+    self._stateful_metric_functions = metrics
+
   def _make_train_function(self):
     if not self.train_function:
       self.train_function = TPUFunction(
-          self, model_fn_lib.ModeKeys.TRAIN,
+          self,
+          model_fn_lib.ModeKeys.TRAIN,
           tpu_assignment=self._tpu_assignment)
 
     return self.train_function
@@ -1966,6 +2018,21 @@ class KerasTPUModel(models.Model):
           self, model_fn_lib.ModeKeys.EVAL, tpu_assignment=self._tpu_assignment)
     return self.test_function
 
+  def _make_fit_function(self):
+    if not self._fit_function:
+      self._fit_function = TPUFunction(
+          self,
+          model_fn_lib.ModeKeys.TRAIN,
+          tpu_assignment=self._tpu_assignment)
+
+    return self._fit_function
+
+  def _make_eval_function(self):
+    if not self._eval_function:
+      self._eval_function = TPUFunction(
+          self, model_fn_lib.ModeKeys.EVAL, tpu_assignment=self._tpu_assignment)
+    return self._eval_function
+
   def _make_predict_function(self):
     if not self.predict_function:
       self.predict_function = TPUFunction(
@@ -1998,6 +2065,9 @@ class KerasTPUModel(models.Model):
 
     logging.info('Setting weights on TPU model.')
     cloned_model.set_weights(weights)
+    if self._tpu_model.optimizer is None:
+      # tpu_model may not be compiled, e.g., loading weights and then predict.
+      return
     for k, v in six.iteritems(cpu_optimizer_config):
       opt_var = getattr(self._tpu_model.optimizer, k)
       if isinstance(opt_var, variables.Variable):
@@ -2052,6 +2122,10 @@ class KerasTPUModel(models.Model):
     self._cpu_model.set_weights(weights)
     self._tpu_weights_initialized = False
 
+  def load_weights(self, filepath, by_name=False):
+    self._cpu_model.load_weights(filepath, by_name)
+    self._tpu_weights_initialized = False
+
 
 # pylint: disable=bad-continuation
 def _validate_shapes(model):
@@ -2152,10 +2226,10 @@ def tpu_model(model, strategy=None):
     cpu_model.compile(
         _clone_optimizer(model.optimizer, optimizer_config),
         model.loss,
-        metrics_module.clone_metrics(model.metrics),
+        metrics_module.clone_metrics(model._compile_metrics),
         model.loss_weights,
         model.sample_weight_mode,
-        metrics_module.clone_metrics(model.weighted_metrics),
+        metrics_module.clone_metrics(model._compile_weighted_metrics),
     )
 
   if model_weights:
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
index 004b1012e55515ea7b3e463998c78bf66cac9f56..8b0b240dc7302c203a22349d583323327fc4480b 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 
@@ -216,6 +217,10 @@ class ReplicatedVariable(object):
   def get(self):
     return self._primary_var
 
+  @property
+  def _in_graph_mode(self):
+    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
+
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
     pass
@@ -227,7 +232,7 @@ class ReplicatedVariable(object):
       return self._primary_var._dense_var_to_tensor(dtype, name, as_ref)
     # pylint: enable=protected-access
     if dtype is not None and dtype != self.dtype:
-      return NotImplemented
+      return math_ops.cast(self._read_variable_op(), dtype)
     if as_ref:
       return self.handle
     else:
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index 05264f5a46fcd404a3185306c00ed6a4cdf06f77..3e463823c820a3ef8628324f77e1a9caf8d385d5 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -35,18 +35,27 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 
+_WATCHDOG = None
+
 
 class CoordinatorShutdownException(Exception):
   """Raised when the coordinator needs to shutdown."""
   pass
 
 
+def _clone_session(session, graph=None):
+  return session_lib.Session(
+      target=session.sess_str,
+      config=session._config,  # pylint: disable=protected-access
+      graph=graph if graph else session.graph)
+
+
 def _make_heartbeat_op(session, device, request_ph):
   """Return a heartbeat op or None if heartbeats are not supported by device."""
   try:
     # Test if we can connect in a isolated graph + session
     with ops.Graph().as_default():
-      with session_lib.Session(target=session.sess_str) as temp_session:
+      with _clone_session(session) as temp_session:
         with ops.device(device):
           heartbeat_op = tpu_ops.worker_heartbeat('')
           options = config_pb2.RunOptions(timeout_in_ms=5000)
@@ -175,7 +184,10 @@ class WorkerHeartbeatManager(object):
 def all_worker_devices(session):
   """Return a list of devices for each worker in the system."""
   devices = session.list_devices()
-  return [device.name for device in devices if ':CPU:' in device.name]
+  return [
+      device.name for device in devices
+      if ':CPU:' in device.name and 'coordinator' not in device.name
+  ]
 
 
 class WatchdogManager(threading.Thread):
@@ -215,45 +227,84 @@ class WatchdogManager(threading.Thread):
     self.ping_interval = ping_interval
     self.shutdown_timeout = shutdown_timeout
     self.daemon = True
+    self._config = session._config  # pylint: disable=protected-access
+    self._target = session.sess_str
     self._running = False
+    self._devices = devices
+
+    self._graph = None
+    self._session = None
+    self._worker_manager = None
+
+  def _reset_manager(self):
+    """Reset the graph, session and worker manager."""
     self._graph = ops.Graph()
     self._session = session_lib.Session(
-        target=session.sess_str,
+        target=self._target,
         graph=self._graph,
+        config=self._config,
     )
 
+    if self._devices is None:
+      self._devices = all_worker_devices(self._session)
+
     with self._graph.as_default():
-      if devices is None:
-        devices = all_worker_devices(self._session)
       self._worker_manager = WorkerHeartbeatManager.from_devices(
-          self._session, devices)
+          self._session, self._devices)
 
-  def configure_and_run(self):
-    logging.info('Enabling worker watchdog.')
-    self._running = True
     self._worker_manager.configure(
         event_pb2.WorkerHeartbeatRequest(
             watchdog_config=event_pb2.WatchdogConfig(
                 timeout_ms=self.shutdown_timeout * 1000,)))
 
+  def configure_and_run(self):
+    logging.info('Enabling watchdog timer with %d second timeout '
+                 'and %d second ping interval.',
+                 self.shutdown_timeout, self.ping_interval)
+    self._reset_manager()
+    self._running = True
     self.start()
 
-  def __enter__(self):
-    self.configure_and_run()
-
-  def __exit__(self, exc_type, exc_val, exc_tb):
-    logging.info('Disabling worker watchdog.')
+  def stop(self):
+    logging.info('Stopping worker watchdog.')
     self._worker_manager.configure(
         event_pb2.WorkerHeartbeatRequest(
             watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,)))
     self._running = False
     self.join()
 
+  def __enter__(self):
+    self.configure_and_run()
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    self.stop()
+
   def run(self):
     # Don't fetch logs or adjust timing: just ping the watchdog.
+    #
+    # If we hit an exception, reset our session as it is likely broken.
     while self._running:
-      self._worker_manager.ping(request=None)
-      time.sleep(self.ping_interval)
+      try:
+        self._worker_manager.ping(request=None)
+        time.sleep(self.ping_interval)
+      except errors.OpError as e:
+        # Catch any TF errors that occur so we don't stop sending heartbeats
+        logging.debug('Caught error while sending heartbeat: %s', e)
+        self._reset_manager()
+
+
+def start_worker_watchdog(session,
+                          devices=None,
+                          ping_interval=60,
+                          shutdown_timeout=3600):
+  """Start global worker watchdog to shutdown workers on coordinator exit."""
+  global _WATCHDOG
+  if _WATCHDOG is None:
+    # Ensure we can send a few pings before we timeout!
+    ping_interval = min(shutdown_timeout / 10., ping_interval)
+    _WATCHDOG = WatchdogManager(session, devices, ping_interval,
+                                shutdown_timeout)
+    _WATCHDOG.configure_and_run()
 
 
 class GracefulShutdownHook(session_run_hook.SessionRunHook):
@@ -292,8 +343,7 @@ class GracefulShutdownHook(session_run_hook.SessionRunHook):
 
     with self._graph.as_default():
       logging.info('Installing graceful shutdown hook.')
-      self._session = session_lib.Session(
-          target=training_session.sess_str, graph=self._graph)
+      self._session = _clone_session(training_session, self._graph)
       self._workers = WorkerHeartbeatManager.from_devices(
           self._session, all_worker_devices(self._session))
       self._heartbeat_supported = self._workers.num_workers() > 0
diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
new file mode 100644
index 0000000000000000000000000000000000000000..70baea203cc6174bebc7d90646045efae5f2391d
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
@@ -0,0 +1,553 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========================================================================
+"""A utility to trace tensor values on TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path
+import re
+
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import tpu
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+
+_TRACER_LOG_PREFIX = ' [>>>TT>>>]'
+_DEVICE_TYPE_TPU = 'tpu'
+_DEVICE_TYPE_CPU = 'cpu'
+_GLOBAL_STEP_OP_NAME = 'GLOBAL-STEP'
+_TRACE_MODE_NAN_INF = 'nan-inf'
+_TRACE_MODE_PART_TENSOR = 'part-tensor'
+_TRACE_MODE_PART_TENSOR_SIZE = 3
+_TRACE_MODE_FULL_TENSOR = 'full-tensor'
+_RECORD_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
+_RECORD_SHOULD_NOT_TRACE = 'not-traced-should-not-trace'
+_RECORD_FILTERED_OUT = 'not-traced-filtered-out'
+_RECORD_SCALAR = 'not-traced-scalar'
+_RECORD_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
+_RECORD_GET_TRACED = 'get-traced'
+_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
+_MARKER_SECTION_END = '!!!!!!! section-end:'
+_SECTION_NAME_CONFIG = 'configuration'
+_SECTION_NAME_REASON = 'reason'
+_SECTION_NAME_OP_LIST = 'op-list'
+_SECTION_NAME_GRAPH = 'graph'
+_FIELD_NAME_VERSION = 'version:'
+_FIELD_NAME_DEVICE = 'device:'
+_FIELD_NAME_TRACE_MODE = 'trace-mode:'
+_FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
+_FIELD_NAME_NUM_OPS = 'number-of-ops:'
+_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:'
+_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
+_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
+_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
+_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
+_FLAG_NAME_ENABLE = 'enable'
+_FLAG_NAME_TRACE_MODE = 'trace_mode'
+_FLAG_NAME_INTERESTING_OPS = 'interesting_ops'
+_FLAG_NAME_TRACE_FILE = 'trace_file_path'
+_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir'
+_FLAG_NAME_OP_RANGE = 'op_range'
+_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)')
+_OUTPUT_STREAM_ESCAPE = 'file://'
+_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR'
+
+
+class TensorTracer(object):
+  """A software construct for tracing tensor values in a TF graph on TPU.
+
+  This utility is disabled by default. It can be enabled by setting
+  the TENSOR_TRACER_FLAGS env variable as:
+    export TENSOR_TRACER_FLAGS="--enable=1"
+  If it is enabled, it will trace the output tensor values of
+  selected Ops in the graph. It has two outputs: (1) the traces and (2)
+  a report. The traces are dumped to a specified local file on the TPU
+  host. The report is printed to the log.info of the TPU job.
+  By passing options via the env variable, users can change:
+     (1) the trace mode (e.g., detecting NaN/Inf, printing partial or
+         full tensor values)
+     (2) which Ops to be traced (via op.name or op.type)
+     (3) output trace file path.
+  """
+
+  @staticmethod
+  def _match_next_flag(flags, pos):
+    """Returns the match for the next TensorTracer flag."""
+
+    match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match
+    match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match
+    match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
+    return match
+
+  @staticmethod
+  def print_flag_values():
+    """Prints all TensorTracer flags passed via environment variables."""
+
+    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR
+    result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR,
+                                                   tensor_tracer_flags)
+    result += 'Individual flag value:\n'
+    pos = 0
+    while True:
+      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      if not match:
+        break
+      flag_name = match.group(1)
+      flag_value = match.group(2)
+      result += '  %s: %s\n'%(flag_name, flag_value)
+      pos = match.end()
+    result += '\n'
+    return result
+
+  @staticmethod
+  def get_flag_value(wanted_flag_name):
+    """Returns the value of a TensorTracer flags."""
+
+    tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return ''
+    pos = 0
+    while True:
+      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      if not match:
+        return ''
+      flag_name = match.group(1)
+      flag_value = match.group(2)
+      if flag_name == wanted_flag_name:
+        return flag_value
+      pos = match.end()
+    return ''
+
+  @staticmethod
+  def is_enabled():
+    """Returns True if TensorTracer is enabled."""
+
+    flag_value = TensorTracer.get_flag_value(_FLAG_NAME_ENABLE)
+    flag_value = flag_value.lower()
+    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
+    return enabled
+
+  @staticmethod
+  def use_test_undeclared_outputs_dir():
+    """Decides the output directory of the trace file.
+
+    Args:
+       None.
+
+    Returns:
+       True if the output trace file should be written to the
+       test-undeclared-outputs-directory defined via an
+       env variable.
+    """
+
+    flag_value = TensorTracer.get_flag_value(
+        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR)
+    flag_value = flag_value.lower()
+    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
+    return enabled
+
+  @staticmethod
+  def check_device_type(device_type):
+    """Checks if the given device type is valid."""
+
+    if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]:
+      raise ValueError('Invalid device_type "%s"'%device_type)
+
+  @staticmethod
+  def check_trace_mode(trace_mode):
+    """Checks if the given trace mode is valid."""
+
+    valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR,
+                         _TRACE_MODE_FULL_TENSOR]
+    if trace_mode not in valid_trace_modes:
+      raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.'
+                       'Valid trace modes are: %s'%(trace_mode,
+                                                    valid_trace_modes))
+
+  @staticmethod
+  def should_trace(device_type, op):
+    """Returns True if the given Op should be traced."""
+
+    if device_type != _DEVICE_TYPE_TPU:
+      raise ValueError('Non TPU device type is not supported')
+    if control_flow_util.IsInCond(op):
+      return False
+    if op.type in ['Reshape', 'ArgMin', 'ArgMax']:
+      return False
+    # pylint: disable=protected-access
+    return tpu._TPU_REPLICATE_ATTR in op.node_def.attr
+    # pylint: enable=protected-access
+
+  @staticmethod
+  def reason(op_idx, details):
+    """Returns why the Op at op_idx is traced or not."""
+    return '%d %s'%(op_idx, details)
+
+  @staticmethod
+  def topological_sort(g):
+    """Performs topological sort on the given graph.
+
+    Args:
+       g: the graph.
+
+    Returns:
+       A pair where the first element indicates if the topological
+       sort succeeded (True if there is no cycle found; False if a
+       cycle is found) and the second element is either the sorted
+       list of nodes or the cycle of nodes found.
+    """
+
+    def visit(op, cycle, permanently_marked_ops,
+              temporarily_marked_ops, sorted_ops):
+      """Recursively visits all Ops in a graph.
+
+      Args:
+         op: the current Op being visited.
+         cycle: a cycle of Ops found.
+         permanently_marked_ops: the set of Ops that were already visited.
+         temporarily_marked_ops: the set of Ops that we have visited during
+                                 the current descent.
+         sorted_ops: the list of Ops sorted in topological order.
+      """
+
+      if cycle:
+        return
+      if op in permanently_marked_ops:
+        return
+      if op in temporarily_marked_ops:
+        cycle = temporarily_marked_ops
+        return
+      temporarily_marked_ops.add(op)
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        for consumer_op in out_tensor.consumers():
+          visit(consumer_op, cycle, permanently_marked_ops,
+                temporarily_marked_ops, sorted_ops)
+      # pylint: disable=protected-access
+      for ctrl_output_op in op._control_outputs:
+      # pylint: enable=protected-access
+        visit(ctrl_output_op, cycle, permanently_marked_ops,
+              temporarily_marked_ops, sorted_ops)
+      temporarily_marked_ops.remove(op)
+      permanently_marked_ops.add(op)
+      sorted_ops.insert(0, op)
+
+    graph_cycle = set([])
+    sorted_ops = []
+    permanently_marked_ops = set([])
+    temporarily_marked_ops = set([])
+    unsorted_ops = g.get_operations()
+    for op in unsorted_ops:
+      visit(op, graph_cycle, permanently_marked_ops,
+            temporarily_marked_ops, sorted_ops)
+    if graph_cycle:
+      return (False, graph_cycle)
+    else:
+      assert len(unsorted_ops) == len(sorted_ops)
+      return (True, sorted_ops)
+
+  def __init__(self):
+    """Initializes a TensorTracer.
+
+    Sets the various member fields from the flags (if given) or the defaults.
+    """
+    self._version = 'use-outside-compilation'
+    self._device_type = None
+    self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
+    if not self._trace_mode:
+      self._trace_mode = _TRACE_MODE_NAN_INF
+    TensorTracer.check_trace_mode(self._trace_mode)
+    self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
+    self._instrument_records = {}
+    interesting_ops = TensorTracer.get_flag_value(_FLAG_NAME_INTERESTING_OPS)
+    self._selected_ops = interesting_ops.split()
+    self._set_trace_file_path()
+    self._set_op_range()
+    self._num_replicas = None
+    self._replica_id = None
+
+  def _add_replica_id_to_graph(self, num_replicas, result_tensor):
+    """Adds nodes for computing the replica ID to the graph."""
+
+    if not num_replicas:
+      self._replica_id = 'unknown'
+      return result_tensor
+
+    self._num_replicas = num_replicas
+
+    with ops.control_dependencies(None):
+      # Uses None as dependency to run outside of TPU graph rewrites.
+      self._replica_id = tpu_ops.tpu_replicated_input(
+          list(range(self._num_replicas)),
+          name='tt_replica_id')
+    use_replica_id = array_ops.identity(self._replica_id).op
+    with ops.control_dependencies([use_replica_id]):
+      # Adds a control dependency from the result_tensor to
+      # the replica_id to ensure that replica_id will be added to the graph.
+      return array_ops.identity(result_tensor)
+
+  def _set_trace_file_path(self):
+    """Sets the path of the output trace file."""
+
+    self._trace_file_path = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_FILE)
+    if not self._trace_file_path:
+      raise ValueError('--%s is not set in the environment variable %s'
+                       %(_FLAG_NAME_TRACE_FILE, _FLAGS_ENV_VAR))
+    elif TensorTracer.use_test_undeclared_outputs_dir():
+      if os.path.isabs(self._trace_file_path):
+        raise ValueError('If use_test_undeclared_outputs_dir is set,'
+                         'trace_file_path cannot be an absolute path (%s)'
+                         %self._trace_file_path)
+      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
+      self._trace_file_path = os.path.join(outputs_dir,
+                                           self._trace_file_path)
+
+  def _set_op_range(self):
+    """Sets the index range of the Ops that we will consider tracing."""
+
+    op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
+    if not op_range:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    match = _OP_RANGE_PAT.match(op_range)
+    if not match:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    self._op_range = (int(match.group(1)), int(match.group(2)))
+
+  def _inside_op_range(self, idx):
+    """Return True if the given index is inside the selected range."""
+
+    if idx < self._op_range[0]:
+      return False
+    return self._op_range[1] < 0 or idx <= self._op_range[1]
+
+  def _write_report(self, content):
+    """Writes the given content to the report."""
+
+    logging.info('%s %s'%(_TRACER_LOG_PREFIX, content))
+
+  def _is_selected_op(self, op_name):
+    """Returns True if the Op with op_name is selected to be traced."""
+
+    if not self._selected_ops:
+      return True
+    if op_name in self._selected_ops:
+      return True
+    return False
+
+  def _write_config_section(self):
+    """Writes the config section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG))
+    self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version))
+    self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG))
+
+  def _write_reason_section(self):
+    """Writes the reason section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON))
+    for key in sorted(self._instrument_records):
+      self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
+
+  def _write_op_list_section(self, op_list):
+    """Writes the Op-list section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list)))
+    for i in range(0, len(op_list)):
+      self._write_report('%d "%s" %s\n'%(i, op_list[i].name, op_list[i].type))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
+
+  def _write_graph_section(self, succeed, sorted_or_cycle):
+    """Writes the graph section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED,
+                                  succeed))
+    l = list(sorted_or_cycle)
+    for i in range(0, len(l)):
+      self._write_report('%d "%s"\n'%(i, l[i].name))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
+
+  def _make_tensor_trace_fun(self, op_name, output_idx):
+    """Makes the tensor tracing function called by outside compilation.
+
+    Args:
+      op_name: the name of the Op that outputs the tensor to be traced.
+      output_idx: which output of the Op it is (0 means the first output).
+
+    Returns:
+      A function to be passed as the first argument to outside compilation.
+
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _print_tensor(op_name, output_idx, num_elements, tensor, output_tensor):
+      """Prints a tensor value to a file.
+
+      Args:
+        op_name: the name of the Op that outputs the tensor to be printed.
+        output_idx: which output of the Op it is (0 means the first output).
+        num_elements: number of elements to print.
+        tensor: the tensor needs to be returned.
+        output_tensor: the tensor needs to be printed.
+
+      Returns:
+        The same tensor passed via the "tensor" argument.
+      """
+      msg = '"%s:%d" '%(op_name, output_idx)
+      output_stream = _OUTPUT_STREAM_ESCAPE + self._trace_file_path
+      print_op = logging_ops.print_v2(msg, array_ops.shape(output_tensor),
+                                      ' @', self._replica_id,
+                                      '\n', output_tensor,
+                                      summarize=num_elements,
+                                      output_stream=output_stream)
+      with ops.control_dependencies([print_op]):
+        return array_ops.identity(tensor).op
+
+    def _detect_nan_inf(tensor):
+      """Trace function for detecting any NaN/Inf in the tensor."""
+
+      if tensor.dtype.is_floating:
+        # Since host can't handle bf16, always convert tensor to f32.
+        tensor = math_ops.cast(tensor, dtypes.float32)
+        output_tensor = math_ops.reduce_any(
+            gen_math_ops.logical_or(gen_math_ops.is_nan(tensor),
+                                    gen_math_ops.is_inf(tensor)))
+      else:
+        output_tensor = constant_op.constant(0)
+      return _print_tensor(op_name, output_idx, 1, tensor, output_tensor)
+
+    def _show_global_step(tensor):
+      """Trace function for printing the global step count."""
+
+      return _print_tensor(op_name, output_idx, 1, tensor, tensor)
+
+    def _show_part_tensor(tensor):
+      """Trace function for printing part of the tensor."""
+
+      return _print_tensor(op_name, output_idx, self._part_tensor_size,
+                           tensor, tensor)
+
+    def _show_full_tensor(tensor):
+      """Trace function for printing the entire tensor."""
+
+      return _print_tensor(op_name, output_idx, -1, tensor, tensor)
+
+    if op_name == _GLOBAL_STEP_OP_NAME:
+      return _show_global_step
+    if self._trace_mode == _TRACE_MODE_NAN_INF:
+      return _detect_nan_inf
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return _show_part_tensor
+    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
+      return _show_full_tensor
+
+    raise RuntimeError('Tensor trace fun for %s is not yet implemented'
+                       %self._trace_mode)
+
+  def trace_tpu(self, graph, result_tensor, num_replicas=None):
+    """Traces the tensors generated by TPU Ops in a TF graph.
+
+    Args:
+      graph: the graph of Ops.
+      result_tensor: a result tensor of evaluating the graph.
+      num_replicas: number of replicas used on the TPU.
+
+    Returns:
+      A tuple (result_tensor_copy, tracing_ops), where:
+        result_tensor_copy: an exact copy of result_tensor
+        tracing_ops: a list of tracing ops. If this list
+                     is non empty, the caller of this function
+                     should pose control dependencies upon these
+                     Ops so that they will be executed when the
+                     graph is evaluated.
+    """
+
+    self._device_type = _DEVICE_TYPE_TPU
+    TensorTracer.check_device_type(self._device_type)
+    result_tensor_copy = self._add_replica_id_to_graph(num_replicas,
+                                                       result_tensor)
+    self._write_config_section()
+    tracing_ops = []
+    operations = graph.get_operations()
+    self._write_op_list_section(operations)
+    # Does the topological sort before adding any nodes to the graph.
+    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
+    for op_id, op in enumerate(operations):
+      if not self._inside_op_range(op_id):
+        self._instrument_records[op.name] = TensorTracer.reason(
+            op_id, _RECORD_OUTSIDE_OP_RANGE)
+        continue
+      if not TensorTracer.should_trace(self._device_type, op):
+        self._instrument_records[op.name] = TensorTracer.reason(
+            op_id, _RECORD_SHOULD_NOT_TRACE)
+        continue
+      if not self._is_selected_op(op.name):
+        self._instrument_records[op.name] = TensorTracer.reason(
+            op_id, _RECORD_FILTERED_OUT)
+        continue
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        if not out_tensor.get_shape().is_fully_defined():
+          self._instrument_records[out_tensor.name] = TensorTracer.reason(
+              op_id, _RECORD_DYNAMIC_SHAPE)
+          continue  # cannot trace tensors with dynamic shape.
+        rank = len(out_tensor.shape)
+        if rank < 1:
+          self._instrument_records[out_tensor.name] = TensorTracer.reason(
+              op_id, _RECORD_SCALAR)
+          continue  # cannot trace scalar.
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _RECORD_GET_TRACED)
+        consumers = out_tensor.consumers()
+        trace_op = tpu.outside_compilation(
+            self._make_tensor_trace_fun(op.name, i), out_tensor)
+        if consumers:
+          for consumer_op in consumers:
+            # pylint: disable=protected-access
+            consumer_op._add_control_input(trace_op)
+            # pylint: enable=protected-access
+        else:
+          # if there is no consumer, we will add the control dependence later
+          # when we add the control dependency to the output operations.
+          tracing_ops.append(trace_op)
+
+    self._write_reason_section()
+    self._write_graph_section(succeed, sorted_or_cycle)
+
+    return (result_tensor_copy, tracing_ops)
diff --git a/tensorflow/contrib/tpu/python/tpu/topology.py b/tensorflow/contrib/tpu/python/tpu/topology.py
index ab89c6aa8ca3a5e62bdfab991ee1a9335397c070..6ae718cc2c9716587849aeee8abcd0a1de82a9ae 100644
--- a/tensorflow/contrib/tpu/python/tpu/topology.py
+++ b/tensorflow/contrib/tpu/python/tpu/topology.py
@@ -19,10 +19,27 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.tpu.proto import topology_pb2
 
 
+def _tpu_device_name(job, task, device):
+  """Returns the device name for the TPU `device` on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:TPU:%d" % (task, device)
+  else:
+    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
+
+
+def _tpu_host_device_name(job, task):
+  """Returns the device name for the CPU device on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:CPU:0" % task
+  else:
+    return "/job:%s/task:%d/device:CPU:0" % (job, task)
+
+
 class Topology(object):
   """Describes a set of TPU devices.
 
@@ -71,6 +88,8 @@ class Topology(object):
         raise ValueError("`device_coordinates` must be a rank 3 int32 array "
                          "with minor dimension equal to the mesh shape rank")
 
+    self._topology_tasks, self._topology_devices = self._invert_topology()
+
   def _parse_topology(self, serialized):
     """Parses a serialized `TopologyProto` into `self`."""
     proto = topology_pb2.TopologyProto()
@@ -106,6 +125,17 @@ class Topology(object):
                              len(proto.mesh_shape)))
     self._device_coordinates = coords
 
+  def _invert_topology(self):
+    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
+    tasks = np.full(list(self.mesh_shape), -1, dtype=np.int32)
+    devices = np.full(list(self.mesh_shape), -1, dtype=np.int32)
+    for task in xrange(self.device_coordinates.shape[0]):
+      for device in xrange(self.device_coordinates.shape[1]):
+        x, y, z = self.device_coordinates[task, device, :]
+        tasks[x, y, z] = task
+        devices[x, y, z] = device
+    return tasks, devices
+
   @property
   def mesh_shape(self):
     """A rank 1 int32 array describing the shape of the TPU topology."""
@@ -130,6 +160,43 @@ class Topology(object):
     """
     return self._device_coordinates
 
+  def task_ordinal_at_coordinates(self, device_coordinates):
+    """Returns the TensorFlow task number attached to `device_coordinates`.
+
+    Args:
+      device_coordinates: An integer sequence describing a device's physical
+        coordinates in the TPU fabric.
+
+    Returns:
+      Returns the TensorFlow task number that contains the TPU device with those
+      physical coordinates.
+    """
+    return self._topology_tasks[tuple(device_coordinates)]
+
+  def tpu_device_ordinal_at_coordinates(self, device_coordinates):
+    """Returns the TensorFlow device number at `device_coordinates`.
+
+    Args:
+      device_coordinates: An integer sequence describing a device's physical
+        coordinates in the TPU fabric.
+
+    Returns:
+      Returns the TensorFlow device number within the task corresponding to
+      attached to the device with those physical coordinates.
+    """
+    return self._topology_devices[tuple(device_coordinates)]
+
+  def cpu_device_name_at_coordinates(self, device_coordinates, job=None):
+    """Returns the CPU device attached to a logical core."""
+    return _tpu_host_device_name(
+        job, self._topology_tasks[tuple(device_coordinates)])
+
+  def tpu_device_name_at_coordinates(self, device_coordinates, job=None):
+    """Returns the name of the TPU device assigned to a logical core."""
+    return _tpu_device_name(job,
+                            self._topology_tasks[tuple(device_coordinates)],
+                            self._topology_devices[tuple(device_coordinates)])
+
   @property
   def num_tasks(self):
     """Returns the number of TensorFlow tasks in the TPU slice."""
diff --git a/tensorflow/contrib/tpu/python/tpu/topology_test.py b/tensorflow/contrib/tpu/python/tpu/topology_test.py
index e67fdb263aa48a37f65c3623365ebcf8f98bebd4..fafe3254d84551d3d7ed8a9d3346849411714f97 100644
--- a/tensorflow/contrib/tpu/python/tpu/topology_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/topology_test.py
@@ -27,7 +27,7 @@ from tensorflow.python.platform import test
 class TopologyTest(test.TestCase):
 
   def testSerialization(self):
-    """Test if the class is able to generate serialzied string."""
+    """Tests if the class is able to generate serialized strings."""
     original_topology = topology.Topology(
         mesh_shape=[1, 1, 2],
         device_coordinates=[[[0, 0, 0], [0, 0, 1]]],
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 11aaa1c66a82c7f062e4e0d3702cd0c2e3304b44..def57da20d6018dcf27ccb7a9d04592f38ce2f7c 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.contrib.compiler import xla
 from tensorflow.contrib.framework.python.framework import experimental
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
@@ -371,14 +372,11 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     if external_control_inputs:
       # Use an identity to pull control inputs as data inputs. Note that we
       # ignore ops which don't have outputs. TODO(phawkins): fix that.
-      with ops.control_dependencies(None):
-        self.Enter()
-        external_control_inputs = [
-            array_ops.identity(x.outputs[0]).op
-            for x in external_control_inputs
-            if x.outputs
-        ]
-        self.Exit()
+      external_control_inputs = [
+          array_ops.identity(x.outputs[0]).op
+          for x in external_control_inputs
+          if x.outputs
+      ]
       # pylint: disable=protected-access
       op._add_control_inputs(external_control_inputs)
       # pylint: enable=protected-access
@@ -601,7 +599,7 @@ def split_compile_and_replicate(computation,
           "input types {}, replica {} had input types {}".format(
               input_types, i, types))
 
-  arg_error = tpu_function.check_function_argument_count(
+  arg_error = xla.check_function_argument_count(
       computation, input_arity, infeed_queue)
   if arg_error is not None:
     if infeed_queue is None:
@@ -1003,8 +1001,8 @@ def rewrite(computation,
       `rewrite` is a list of tensors corresponding to the tensors from the
       output of `computation`.
 
-      All `Operation`s returned from `computation` will be executed when
-      evaluating any of the returned output tensors.
+      All `Operation`s constructed during `computation` will be executed when
+      evaluating any of the returned output tensors, not just the ones returned.
     inputs: A list of input tensors or `None` (equivalent to an empty list).
     infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to `computation`.
@@ -1113,7 +1111,7 @@ def validate_inference_rewrite_for_variables(graph):
   Raises:
     RuntimeError: if validation failed.
   """
-  if not any([x.type == "GuaranteeConst" for x in graph.get_operations()]):
+  if not any(x.type == "GuaranteeConst" for x in graph.get_operations()):
     raise RuntimeError(
         "No GuaranteeConst ops found in the graph after running "
         "tpu.rewrite_for_inference(...). Please check that you are using "
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index da6bdf67d686fba09d66386de982b57aa28d4dd4..672462447944b777375331d49727c4d5366cf295 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -41,7 +41,7 @@ _NUM_CORES_TO_COMPUTATION_SHAPE = {
 
 
 class TPUContext(object):
-  """The context of current input_fn invocation."""
+  """A context that holds the current configuration of the TPU computation."""
 
   def __init__(self,
                internal_ctx,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccba8a46c7cad0337119672e02314684f4451479
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
@@ -0,0 +1,1110 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU embedding APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import math
+import re
+import six
+
+from tensorflow.contrib.framework.python.framework import experimental
+from tensorflow.contrib.tpu.ops import gen_tpu_ops
+from tensorflow.contrib.tpu.proto import tpu_embedding_configuration_pb2 as elc
+from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+
+TRAINING = elc.TPUEmbeddingConfiguration.TRAINING
+INFERENCE = elc.TPUEmbeddingConfiguration.INFERENCE
+
+# TODO(shizhiw): A better interface is to make `num_hosts` and
+# `num_cores_per_host` optional parameters for `TPUEmbedding`
+# constructor. Usually they can be automatically detected, but
+# user can also specify them for debugging (b/112112496).
+# Auto-detection can be done with `tpu_system_metadata.py`.
+_MASTER_JOB = 'tpu_worker'
+_HOST_PATTERN = '/job:tpu_worker/task:{}/device:CPU:0'
+_NUM_CORES_PER_HOST = 8
+
+_TEST_MASTER_JOB = None
+_TEST_HOST = '/replica:0/task:0/device:CPU:0'
+_TEST_NUM_CORES_PER_HOST = 2
+
+
+class TableConfig(
+    collections.namedtuple(
+        'TableConfig',
+        ['vocabulary_size', 'dimension', 'initializer', 'combiner'])):
+  """Embedding table configuration."""
+
+  @experimental
+  def __new__(cls,
+              vocabulary_size,
+              dimension,
+              initializer=None,
+              combiner='mean'):
+    """Embedding table configuration.
+
+    Args:
+      vocabulary_size: Number of vocabulary (/rows) in the table.
+      dimension: The embedding dimension.
+      initializer: A variable initializer function to be used in embedding
+        variable initialization. If not specified, defaults to
+        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+        `1/sqrt(dimension)`.
+      combiner: A string specifying how to reduce if there are multiple entries
+        in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+        'mean' the default. 'sqrtn' often achieves good accuracy, in particular
+        with bag-of-words columns. For more information, see
+        `tf.nn.embedding_lookup_sparse`.
+
+    Returns:
+      `TableConfig`.
+
+    Raises:
+      ValueError: if `vocabulary_size` is not positive integer.
+      ValueError: if `dimension` is not positive integer.
+      ValueError: if `initializer` is specified and is not callable.
+      ValueError: if `combiner` is not supported.
+    """
+    if not isinstance(vocabulary_size, int) or vocabulary_size < 1:
+      raise ValueError('Invalid vocabulary_size {}.'.format(vocabulary_size))
+
+    if not isinstance(dimension, int) or dimension < 1:
+      raise ValueError('Invalid dimension {}.'.format(dimension))
+
+    if (initializer is not None) and (not callable(initializer)):
+      raise ValueError('initializer must be callable if specified.')
+    if initializer is None:
+      initializer = init_ops.truncated_normal_initializer(
+          mean=0.0, stddev=1 / math.sqrt(dimension))
+
+    if combiner not in ('mean', 'sum', 'sqrtn'):
+      raise ValueError('Invalid combiner {}'.format(combiner))
+
+    return super(TableConfig, cls).__new__(cls, vocabulary_size, dimension,
+                                           initializer, combiner)
+
+
+# TODO(shizhiw): Factor `use_gradient_accumulation` and
+# `pipeline_execution_with_tensor_core` out of `_OptimizationParameters`.
+class _OptimizationParameters(object):
+  """Parameters common to all optimizations."""
+
+  def __init__(self, learning_rate, use_gradient_accumulation,
+               pipeline_execution_with_tensor_core):
+    self.learning_rate = learning_rate
+    self.use_gradient_accumulation = use_gradient_accumulation
+    self.pipeline_execution_with_tensor_core = (
+        pipeline_execution_with_tensor_core)
+
+
+class AdagradParameters(_OptimizationParameters):
+  """Optimization parameters for Adagrad."""
+
+  def __init__(self, learning_rate, initial_accumulator,
+               use_gradient_accumulation=False,
+               pipeline_execution_with_tensor_core=True):
+    """Optimization parameters for Adagrad.
+
+    Args:
+      learning_rate: used for updating embedding table.
+      initial_accumulator: initial accumulator for Adagrad.
+      use_gradient_accumulation: setting this to `True` makes embedding
+         gradients calculation more accurate but slower. Please see
+         `optimization_parameters.proto` for details.
+         for details.
+      pipeline_execution_with_tensor_core: setting this to `True` makes training
+        faster, but trained model will be different if step N and step N+1
+        involve the same set of embedding ID. Please see
+        `tpu_embedding_configuration.proto` for details.
+    """
+    super(AdagradParameters, self).__init__(learning_rate,
+                                            use_gradient_accumulation,
+                                            pipeline_execution_with_tensor_core)
+    self.initial_accumulator = initial_accumulator
+
+
+class AdamParameters(_OptimizationParameters):
+  """Optimization parameters for Adam."""
+
+  def __init__(self, learning_rate,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-08,
+               lazy_adam=True,
+               sum_inside_sqrt=True,
+               use_gradient_accumulation=False,
+               pipeline_execution_with_tensor_core=True):
+    """Optimization parameters for Adam.
+
+    Args:
+      learning_rate: a floating point value. The learning rate.
+      beta1: A float value.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value.
+        The exponential decay rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability.
+      lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster.
+        Please see `optimization_parameters.proto` for details.
+      sum_inside_sqrt: This improves training speed. Please see
+        `optimization_parameters.proto` for details.
+      use_gradient_accumulation: setting this to `True` makes embedding
+        gradients calculation more accurate but slower. Please see
+        `optimization_parameters.proto` for details.
+        for details.
+      pipeline_execution_with_tensor_core: setting this to `True` makes training
+        faster, but trained model will be different if step N and step N+1
+        involve the same set of embedding ID. Please see
+        `tpu_embedding_configuration.proto` for details.
+    """
+    super(AdamParameters, self).__init__(learning_rate,
+                                         use_gradient_accumulation,
+                                         pipeline_execution_with_tensor_core)
+    self.beta1 = beta1
+    self.beta2 = beta2
+    self.epsilon = epsilon
+    self.lazy_adam = lazy_adam
+    self.sum_inside_sqrt = sum_inside_sqrt
+
+
+class StochasticGradientDescentParameters(_OptimizationParameters):
+  """Optimization parameters for stochastic gradient descent.
+
+  Args:
+    learning_rate: a floating point value. The learning rate.
+    use_gradient_accumulation: setting this to `True` makes embedding
+      gradients calculation more accurate but slower. Please see
+         `optimization_parameters.proto` for details.
+    pipeline_execution_with_tensor_core: setting this to `True` makes training
+      faster, but trained model will be different if step N and step N+1
+      involve the same set of embedding ID. Please see
+      `tpu_embedding_configuration.proto` for details.
+    """
+
+  def __init__(self, learning_rate, use_gradient_accumulation=False,
+               pipeline_execution_with_tensor_core=True):
+    super(StochasticGradientDescentParameters, self).__init__(
+        learning_rate, use_gradient_accumulation,
+        pipeline_execution_with_tensor_core)
+
+
+class TPUEmbedding(object):
+  """API for using TPU for embedding.
+
+    Example:
+    ```
+    table_config_user = tpu_embedding.TableConfig(
+        vocabulary_size=4, dimension=2,
+        initializer=initializer, combiner='mean')
+    table_to_config_dict = {'video': table_config_video,
+                          'user': table_config_user}
+    feature_to_table_dict = {'watched': 'video',
+                             'favorited': 'video',
+                             'friends': 'user'}
+    batch_size = 4
+    num_hosts = 1
+    optimization_parameters = tpu_embedding.AdagradParameters(1., 1.)
+    mode = tpu_embedding.TRAINING
+    embedding = tpu_embedding.TPUEmbedding(
+        table_to_config_dict, feature_to_table_dict,
+        batch_size, num_hosts, mode, optimization_parameters)
+
+    batch_size_per_core = embedding.batch_size_per_core
+    sparse_features_list = []
+    for host in hosts:
+      with ops.device(host):
+        for _ in range(embedding.num_cores_per_host):
+          sparse_features = {}
+          sparse_features['watched'] = sparse_tensor.SparseTensor(...)
+          sparse_features['favorited'] = sparse_tensor.SparseTensor(...)
+          sparse_features['friends'] = sparse_tensor.SparseTensor(...)
+          sparse_features_list.append(sparse_features)
+
+    enqueue_ops = embedding.generate_enqueue_ops(sparse_features_list)
+
+    def computation():
+      activations = embedding.get_activations()
+      loss = compute_loss(activations)
+
+      base_optimizer = gradient_descent.GradientDescentOptimizer(
+          learning_rate=1)
+      cross_shard_optimizer = tpu_optimizer.CrossShardOptimizer(
+          base_optimizer)
+
+      train_op = cross_shard_optimizer.minimize(loss)
+      # `train_op` and `send_gradients_op` must happen in order.
+      with ops.control_dependencies([train_op]):
+        send_gradients_op = embedding.generate_send_gradients_op()
+      with ops.control_dependencies([send_gradients_op]):
+        loss = array_ops.identity(loss)
+
+    loss = tpu.shard(computation,
+                     num_shards=embedding.num_cores)
+
+    with self.test_session() as sess:
+      sess.run(tpu.initialize_system(embedding_config=
+                                     embedding.config_proto))
+      sess.run(variables.global_variables_initializer())
+      sess.run(embedding.init_ops)
+      sess.run(enqueue_ops)
+      loss_val = sess.run(loss)
+    ```
+  """
+
+  # TODO(shizhiw): Instead of `feature_to_table_dict` which maps to table
+  # name, consider `feature_to_config_dict` which maps to `FeatureConfig`.
+  # `FeatureConfig` could have fields other than table name. For example, it
+  # could have a field to indicate that the feature should not be used to
+  # update embedding table (cr/204852758, cr/204940540). Also, this can support
+  # different combiners for different features within the same table.
+  # TODO(shizhiw, b/118512626): Remove `batch_size` from `__init__` and move it
+  # to `FeatureConfig`?
+
+  # TODO(shizhiw): will it be cleaner to make `table_to_config_dict` and
+  # `feature_to_table_dict` lists of `TableSpec` and `FeatureSpec` respectively?
+
+  # TODO(shizhiw): Consider adding `input_fn` as an option to remove boilerplate
+  # for-loops around construction of inputs.
+
+  # `optimization_parameter` applies to all tables. If the need arises,
+  # we can add `optimization_parameters` to `TableConfig` to override this
+  # global setting.
+  @experimental
+  def __init__(self,
+               table_to_config_dict,
+               feature_to_table_dict,
+               batch_size,
+               num_hosts,
+               mode,
+               optimization_parameters=None,
+               tpu_embedding_test=False):
+    """API for using TPU for embedding lookups.
+
+    Args:
+      table_to_config_dict: A dictionary mapping from string of table name to
+        `TableConfig`. Table refers to an embedding table, e.g. `params`
+        argument to `tf.nn.embedding_lookup_sparse()`.
+      feature_to_table_dict: A dictionary mapping from string of feature name
+        to string of table name. Feature refers to ids to lookup in embedding
+        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
+      batch_size: An `int` representing the global batch size.
+      num_hosts: An `int` representing the number of TPU hosts.
+      mode: `TRAINING` or `INFERENCE`.
+      optimization_parameters: `AdagradParameters`, `AdamParameters`,
+        `Stochasticgradientdescentparameters`. Must be set in training and must
+        be `None` in inference.
+      tpu_embedding_test: A `bool`. Only used for testing.
+
+    Raises:
+      ValueError: if any input is invalid.
+    """
+    _validate_table_to_config_dict(table_to_config_dict)
+    # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
+    self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
+    self._combiners = _create_combiners(self._table_to_config_dict)
+
+    _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
+    self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
+    self._table_to_features_dict = _create_table_to_features_dict(
+        self._feature_to_table_dict)
+
+    self._batch_size = batch_size
+
+    if tpu_embedding_test:
+      self._num_hosts = 1
+      self._hosts = [_TEST_HOST]
+      self._num_cores_per_host = _TEST_NUM_CORES_PER_HOST
+    else:
+      self._num_hosts = num_hosts
+      self._hosts = [_HOST_PATTERN.format(i) for i in range(self._num_hosts)]
+      self._num_cores_per_host = _NUM_CORES_PER_HOST
+    self._num_cores = self._num_cores_per_host * self._num_hosts
+
+    _validate_batch_size(self._batch_size, self._num_cores)
+    self._batch_size_per_core = self._batch_size // self._num_cores
+
+    self._init_ops = []
+
+    # TODO(shizhiw): remove `mode`?
+    if mode == TRAINING:
+      _validate_optimization_parameters(optimization_parameters)
+      self._optimization_parameters = optimization_parameters
+    elif mode == INFERENCE:
+      if optimization_parameters is not None:
+        raise ValueError('`optimization_parameters` should be `None` '
+                         'for inference mode.')
+      self._optimization_parameters = (
+          StochasticGradientDescentParameters(1.))
+    else:
+      raise ValueError('`mode` only supports {} and {}; got {}.'
+                       .format(TRAINING, INFERENCE, mode))
+    self._mode = mode
+
+    # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
+    # and create special handler for inference that inherits from
+    # StochasticGradientDescentHandler with more user-friendly error message
+    # on get_slot().
+    self._optimizer_handler = _get_optimization_handler(
+        self._optimization_parameters)
+
+    dummy_table_variables_init_op = self._create_dummy_table_variables()
+    self._init_ops.append(dummy_table_variables_init_op)
+
+    self._config_proto = self._create_config_proto()
+
+    self._create_variables_and_ops()
+    self._init_ops.extend(self._load_parameters_ops)
+
+  @property
+  def hosts(self):
+    """A list of device names for CPU hosts.
+
+    Returns:
+      A list of device names for CPU hosts.
+    """
+    return self._hosts
+
+  # TODO(shizhiw): change to num_tensor_cores_per_host to be more explicit and
+  # to be consistent with `tpu_embedding_configuration.proto`.
+  @property
+  def num_cores_per_host(self):
+    """Number of TPU cores on a CPU host.
+
+    Returns:
+      Number of TPU cores on a CPU host.
+    """
+    return self._num_cores_per_host
+
+  @property
+  def num_cores(self):
+    """Total number of TPU cores on all hosts.
+
+    Returns:
+      Total number of TPU cores on all hosts.
+    """
+    return self._num_cores
+
+  @property
+  def batch_size_per_core(self):
+    """Batch size for each TPU core.
+
+    The sparse tensors in `sparse_features_list` to `generate_enqueue_ops`
+       must have batch dimension equal to this.
+
+    Returns:
+      Batch size for each TPU core.
+    """
+    return self._batch_size_per_core
+
+  @property
+  def config_proto(self):
+    """Create embedding config proto for `tpu.initialize_system()`.
+
+    Returns:
+      an `TPUEmbeddingConfiguration` proto describing the desired
+         configuration of the hardware embedding lookup tables, which
+         is passed to `tpu.initialize_system()`.
+    """
+    return self._config_proto
+
+  @property
+  def init_ops(self):
+    """Initialization ops for TPU embedding.
+
+    It must be called after all global variables have been initialized,
+    i.e. after `global_variables_initializer()`, as it loads embedding
+    tables into TPU.
+
+    Returns:
+      A list of ops.
+    """
+    return self._init_ops
+
+  # TODO(shizhiw): get table variables the same way as getting slot variables.
+  @property
+  def table_to_table_variables_dict(self):
+    return copy.copy(self._table_to_table_variables_dict)
+
+  def get_slot_names(self):
+    """Return a list of the names of slots created by `TPUEmbedding`."""
+    return self._optimizer_handler.get_slot_names()
+
+  def get_slot(self, table, name):
+    """Return a slot named `name` create for `table` by `TPUEmbedding`."""
+    return self._optimizer_handler.get_slot(table, name)
+
+  # TODO(shizhiw): expose load to user too?
+  @property
+  def retrieve_parameters_ops(self):
+    return self._retrieve_parameters_ops
+
+  def _create_config_proto(self):
+    """Create `TPUEmbeddingConfiguration`."""
+    config_proto = elc.TPUEmbeddingConfiguration()
+    for table in self._table_to_config_dict:
+      table_descriptor = config_proto.table_descriptor.add()
+      table_descriptor.name = table
+
+      table_config = self._table_to_config_dict[table]
+      table_descriptor.vocabulary_size = table_config.vocabulary_size
+      table_descriptor.dimension = table_config.dimension
+
+      features_for_table = self._table_to_features_dict[table]
+      table_descriptor.num_features = len(features_for_table)
+
+      table_descriptor.optimization_parameters.learning_rate.constant = (
+          self._optimization_parameters.learning_rate)
+      table_descriptor.optimization_parameters.use_gradient_accumulation = (
+          self._optimization_parameters.use_gradient_accumulation)
+      self._optimizer_handler.set_optimization_parameters(table_descriptor)
+
+    config_proto.mode = self._mode
+    config_proto.batch_size_per_tensor_core = self._batch_size_per_core
+    config_proto.num_hosts = self._num_hosts
+    config_proto.num_tensor_cores = self._num_cores
+    config_proto.sharding_strategy = elc.TPUEmbeddingConfiguration.DIV_DEFAULT
+    config_proto.pipeline_execution_with_tensor_core = (
+        self._optimization_parameters.pipeline_execution_with_tensor_core)
+
+    return config_proto
+
+  def _create_variables_and_ops(self):
+    """Create embedding variables and return ops to load them into TPU."""
+    self._load_parameters_ops = []
+    self._retrieve_parameters_ops = []
+    self._table_to_table_variables_dict = {}
+    for table in self._table_to_config_dict:
+      device_fn = _create_device_fn(self._hosts)
+      with ops.device(device_fn):
+        # TODO(shizhiw): allow user to specify variable name so that
+        # they could make the name consistent with CPU etc.
+        variable_name = table
+        table_variables = _create_partitioned_variables(
+            name=variable_name,
+            num_hosts=self._num_hosts,
+            vocabulary_size=self._table_to_config_dict[table].vocabulary_size,
+            embedding_dimension=self._table_to_config_dict[table].dimension,
+            initializer=self._table_to_config_dict[table].initializer,
+            collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+        self._table_to_table_variables_dict[table] = table_variables
+
+        self._optimizer_handler.create_variables_and_ops(
+            table, variable_name, self._num_hosts,
+            self._table_to_config_dict[table], table_variables,
+            self._load_parameters_ops, self._retrieve_parameters_ops)
+
+  def _create_dummy_table_variables(self):
+    """Create dummy embedding table variables.
+
+    The sole purpose of these dummy variables are to trigger gradient
+    calcuation wrt them so that the gradients wrt activation can be captured
+    and later sent to TPU embedding.
+
+    Returns:
+      Initializer for these variables.
+
+    Raises:
+      RuntimeError: if collection to store gradients already exists and is not
+      empty.
+    """
+    self._dummy_table_variables = []
+    # TODO(shizhiw): remove table id.
+    for table_id, table in enumerate(self._table_to_features_dict):
+      self._dummy_table_variables.append(
+          variable_scope.get_variable(
+              'tpu_embedding_dummy_table_variable_%s' % table,
+              dtype=dtypes.float32,
+              shape=[1],
+              use_resource=True,
+              trainable=True,
+              # TODO(shizhiw): Remove these dummy variables as
+              # tensorflow optimizer creates slot variable for them which
+              # is undesirable.
+              # e.g. tpu_embedding_dummy_table_variable_mlp_user/Adam{_1}.
+              # Explicitly specifying collections prevents this variable from
+              # being added to the GLOBAL_VARIABLES collection, so that Saver()
+              # ignores it.
+              collections=['tpu_embedding_dummy_table_variables']))
+
+      g = ops.get_default_graph()
+      table_gradients = g.get_collection_ref(
+          'tpu_embedding_gradients_table_%d' % table_id)
+      if table_gradients:
+        raise RuntimeError(
+            'tpu_embedding_gradients_table_%d is not empty.' % table_id)
+      table_gradients.extend([None] * len(self._table_to_features_dict[table]))
+
+    return variables.variables_initializer(
+        self._dummy_table_variables,
+        name='tpu_embedding_dummy_table_variables_init')
+
+  def generate_enqueue_ops(self, sparse_features_list):
+    """Generate enqueue ops.
+
+    Args:
+      sparse_features_list: a list of dictionary mapping from string
+        of feature names to sparse tensor. Each dictionary is for one
+        TPU core. Dictionaries for the same core should be contiguous
+        on the list.
+
+    Returns:
+      Ops to enqueue to TPU for embedding.
+    """
+    self._validate_generate_enqueue_ops_sparse_features_list(
+        sparse_features_list)
+    return [
+        self._generate_enqueue_op(
+            sparse_features, device_ordinal=i % self._num_cores_per_host)
+        for i, sparse_features in enumerate(sparse_features_list)
+    ]
+
+  def _validate_generate_enqueue_ops_sparse_features_list(
+      self, sparse_features_list):
+    """Validate `sparse_features_list`."""
+    if len(sparse_features_list) != self._num_cores:
+      raise ValueError('Length of `sparse_features_list` should match the '
+                       'number of cores; '
+                       '`len(sparse_features_list)` is {}, '
+                       'number of cores is {}.'.format(
+                           len(sparse_features_list), self._num_cores))
+
+    feature_set = set(self._feature_to_table_dict.keys())
+    contiguous_device = None
+    for i, sparse_features in enumerate(sparse_features_list):
+      used_feature_set = set(sparse_features.keys())
+
+      # Check features are valid.
+      missing_feature_set = feature_set - used_feature_set
+      if missing_feature_set:
+        raise ValueError('`sparse_features_list[{}]` misses a feature that is '
+                         'in `feature_to_config_dict`: {}.'.format(
+                             i, missing_feature_set))
+
+      extra_feature_set = used_feature_set - feature_set
+      if extra_feature_set:
+        raise ValueError('`sparse_features_list[{}]` has a feature that is not '
+                         'in `feature_to_config_dict`: {}.'.format(
+                             i, extra_feature_set))
+
+      device = None
+      device_feature = None
+      for feature, tensor in six.iteritems(sparse_features):
+        if not isinstance(tensor, sparse_tensor.SparseTensor):
+          raise ValueError('`sparse_features_list[{}]` has a feature that is '
+                           'not mapped to `SparseTensor`. '
+                           '`feature`: {}, type: {}'.format(
+                               i, feature, type(tensor)))
+
+        # Check all features are on the same device.
+        if device is None:
+          device = tensor.op.device
+          device_feature = feature
+        else:
+          if device != tensor.op.device:
+            raise ValueError('Devices are different between features in '
+                             '`sparse_features_list[{}]`; '
+                             'devices: {}, {}; features: {}, {}.'.format(
+                                 i, device, tensor.op.device, feature,
+                                 device_feature))
+
+      if i % self._num_cores_per_host:
+        if device != contiguous_device:
+          raise ValueError('We expect the `sparse_features` which are on the '
+                           'same host to be contiguous in '
+                           '`sparse_features_list`, '
+                           '`sparse_features_list[{}]` is on device {}, '
+                           'but is expected to be on device {}.'.format(
+                               i, device, contiguous_device))
+      else:
+        contiguous_device = device
+
+  def _generate_enqueue_op(self, sparse_features, device_ordinal):
+    with ops.colocate_with(list(sparse_features.values())[0]):
+      sample_idcs, embedding_idcs, aggregation_weights = (
+          self._format_for_tpu_embedding_sparse_batch(sparse_features))
+      return tpu_ops.enqueue_tpu_embedding_sparse_batch(
+          sample_idcs,
+          embedding_idcs,
+          aggregation_weights,
+          combiners=self._combiners,
+          device_ordinal=device_ordinal)
+
+  def _format_for_tpu_embedding_sparse_batch(self, sparse_features):
+    """Format sparse features for `enqueue_tpu_embedding_sparse_batch()`.
+
+    Args:
+      sparse_features: a `Dict` of `SparseTensor`s for embedding.
+
+    Returns:
+      Arguments for `enqueue_tpu_embedding_sparse_batch()`.
+    """
+
+    sample_idcs, embedding_idcs, aggregation_weights = list(), list(), list()
+    for table in self._table_to_features_dict:
+      sample_t, indices_t, weights_t = list(), list(), list()
+
+      features = self._table_to_features_dict[table]
+      for i, feature in enumerate(features):
+        tensor = sparse_features[feature]
+        sample_indices = tensor.indices[:, 0]
+        embedding_indices = tensor.values
+        weights = array_ops.ones_like(embedding_indices)
+        sample_t.append(i * self._batch_size_per_core + sample_indices)
+        indices_t.append(embedding_indices)
+        weights_t.append(weights)
+
+      sample_idcs.append(
+          math_ops.cast(array_ops.concat(sample_t, axis=0), dtype=dtypes.int32))
+      embedding_idcs.append(
+          math_ops.cast(
+              array_ops.concat(indices_t, axis=0), dtype=dtypes.int32))
+      aggregation_weights.append(
+          math_ops.cast(
+              array_ops.concat(weights_t, axis=0), dtype=dtypes.float32))
+
+    return sample_idcs, embedding_idcs, aggregation_weights
+
+  def get_activations(self):
+    """Get activations for features.
+
+    This should be called within `computation` that is passed to
+      `tpu.replicate` and friends.
+
+    Returns:
+      A dictionary mapping from `String` of feature name to `Tensor`
+        of activation.
+    """
+    recv_activations = tpu_ops.recv_tpu_embedding_activations(
+        num_outputs=len(self._table_to_config_dict),
+        config=self._config_proto.SerializeToString())
+
+    activations = collections.OrderedDict()
+    for table_id, table in enumerate(self._table_to_features_dict):
+      features = self._table_to_features_dict[table]
+      for lookup_id, feature in enumerate(features):
+        start_row = lookup_id * self._batch_size_per_core
+        end_row = start_row + self._batch_size_per_core
+        activations[feature] = gen_tpu_ops.tpu_embedding_activations(
+            self._dummy_table_variables[table_id],
+            recv_activations[table_id][start_row:end_row, :],
+            table_id=table_id,
+            lookup_id=lookup_id)
+    return activations
+
+  # TODO(shizhiw): Make `gradient_multiplier` per feature. Setting it to 0 would
+  # have the effect of `tf.stop_gradients()`.
+  # TODO(shizhiw): Consider alternative ways to capture gradients wrt embedding
+  # layer outputs to remove `_dummy_table_variables`,
+  # `_embedding_activation_grad` and `tpu_embedding_gradients_table_%d'.
+  def generate_send_gradients_op(self, gradient_multipliers=None):
+    """Retrieve gradients from collections and send them to TPU embedding.
+
+    Args:
+      gradient_multipliers: None, or dict mapping table names to gradient
+        multiplier Tensors.
+
+    Returns:
+      SendTPUEmbeddingGradients Op.
+
+    Raises:
+      ValueError: If required gradients have not been defined.
+      RuntimeError: If `mode` is not `TRAINING`.
+    """
+    if self._mode != TRAINING:
+      raise RuntimeError('Only in training mode gradients need to '
+                         'be sent to TPU embedding; got mode {}.'
+                         .format(self._mode))
+
+    g = ops.get_default_graph()
+    gradients = list()
+    for table_id, table in enumerate(self._table_to_config_dict):
+      table_gradients = g.get_collection(
+          'tpu_embedding_gradients_table_%d' % table_id)
+      if any(gradient is None for gradient in table_gradients):
+        raise ValueError(
+            'Table {}/{} has undefined gradients: this is probably because the '
+            'model asked TPUEmbedding to compute activations that were not '
+            'used.'.format(table_id, table))
+      concat_table_grads = array_ops.concat(table_gradients, axis=0)
+      if gradient_multipliers is not None:
+        concat_table_grads *= gradient_multipliers[table.name]
+      gradients.append(concat_table_grads)
+
+    return tpu_ops.send_tpu_embedding_gradients(
+        inputs=gradients, config=self.config_proto.SerializeToString())
+
+
+def _validate_table_to_config_dict(table_to_config_dict):
+  """Validate `table_to_config_dict`."""
+  for k, v in six.iteritems(table_to_config_dict):
+    if not isinstance(v, TableConfig):
+      raise ValueError('Value of `table_to_config_dict` must be of type '
+                       '`TableConfig`, got {} for {}.'.format(type(v), k))
+
+
+def _validate_feature_to_table_dict(table_to_config_dict,
+                                    feature_to_table_dict):
+  """Validate `feature_to_table_dict`."""
+  used_table_set = set(feature_to_table_dict.values())
+  table_set = set(table_to_config_dict.keys())
+
+  unused_table_set = table_set - used_table_set
+  if unused_table_set:
+    raise ValueError('`table_to_config_dict` specifies table that is not '
+                     'used in `feature_to_table_dict`: {}.'
+                     .format(unused_table_set))
+
+  extra_table_set = used_table_set - table_set
+  if extra_table_set:
+    raise ValueError('`feature_to_table_dict` refers to a table that is not '
+                     'specified in `table_to_config_dict`: {}.'
+                     .format(extra_table_set))
+
+
+def _validate_batch_size(batch_size, num_cores):
+  if batch_size % num_cores:
+    raise ValueError('`batch_size` is not a multiple of number of '
+                     'cores. `batch_size`={}, `_num_cores`={}.'.format(
+                         batch_size, num_cores))
+
+
+def _validate_optimization_parameters(optimization_parameters):
+  if not isinstance(optimization_parameters, _OptimizationParameters):
+    raise ValueError('`optimization_parameters` must inherit from '
+                     '`_OptimizationPramaters`. '
+                     '`type(optimization_parameters)`={}'.format(
+                         type(optimization_parameters)))
+
+
+class _OptimizerHandler(object):
+  """Interface class for handling optimizer specific logic."""
+
+  def __init__(self, optimization_parameters):
+    self._optimization_parameters = optimization_parameters
+
+  def set_optimization_parameters(self, table_descriptor):
+    raise NotImplementedError()
+
+  def create_variables_and_ops(self, table, variable_name):
+    raise NotImplementedError()
+
+  def get_slot_names(self):
+    raise NotImplementedError()
+
+  def get_slot(self, table, name):
+    raise NotImplementedError()
+
+
+class _AdagradHandler(_OptimizerHandler):
+  """Handles Adagrad specific logic."""
+
+  def __init__(self, optimization_parameters):
+    super(_AdagradHandler, self).__init__(optimization_parameters)
+    self._table_to_accumulator_variables_dict = {}
+
+  def set_optimization_parameters(self, table_descriptor):
+    table_descriptor.optimization_parameters.adagrad.SetInParent()
+
+  def create_variables_and_ops(self, table, variable_name, num_hosts,
+                               table_config, table_variables,
+                               load_parameters_ops, retrieve_parameters_ops):
+    optimizer_name = 'Adagrad'
+    accumulator_initializer = init_ops.constant_initializer(
+        self._optimization_parameters.initial_accumulator)
+    accumulator_variables = _create_partitioned_variables(
+        name='%s/%s' % (variable_name, optimizer_name),
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=accumulator_initializer)
+
+    self._table_to_accumulator_variables_dict[table] = accumulator_variables
+    for host_id, table_variable, accumulator_variable in (zip(
+        range(num_hosts), table_variables, accumulator_variables)):
+      with ops.colocate_with(table_variable):
+        load_parameters_op = (
+            tpu_ops.load_tpu_embedding_adagrad_parameters(
+                parameters=table_variable,
+                accumulators=accumulator_variable,
+                table_name=table,
+                num_shards=num_hosts,
+                shard_id=host_id))
+        retrieved_table, retrieved_accumulator = (
+            tpu_ops.retrieve_tpu_embedding_adagrad_parameters(
+                table_name=table,
+                num_shards=num_hosts,
+                shard_id=host_id))
+        retrieve_parameters_op = control_flow_ops.group(
+            state_ops.assign(table_variable, retrieved_table),
+            state_ops.assign(accumulator_variable, retrieved_accumulator))
+
+      load_parameters_ops.append(load_parameters_op)
+      retrieve_parameters_ops.append(retrieve_parameters_op)
+
+  def get_slot_names(self):
+    return ['accumulator']
+
+  def get_slot(self, table, name):
+    if name not in self.get_slot_names():
+      raise ValueError('Adagrad has {} as slot names; got {}.'
+                       .format(self.get_slot_names(), name))
+    return self._table_to_accumulator_variables_dict[table]
+
+
+class _AdamHandler(_OptimizerHandler):
+  """Handles Adam specific logic."""
+
+  def __init__(self, optimization_parameters):
+    super(_AdamHandler, self).__init__(optimization_parameters)
+    self._table_to_m_variables_dict = {}
+    self._table_to_v_variables_dict = {}
+
+  def set_optimization_parameters(self, table_descriptor):
+    table_descriptor.optimization_parameters.adam.beta1 = (
+        self._optimization_parameters.beta1)
+    table_descriptor.optimization_parameters.adam.beta2 = (
+        self._optimization_parameters.beta2)
+    table_descriptor.optimization_parameters.adam.epsilon = (
+        self._optimization_parameters.epsilon)
+    table_descriptor.optimization_parameters.adam.use_non_lazy_adam = (
+        not self._optimization_parameters.lazy_adam)
+    table_descriptor.optimization_parameters.adam.use_sum_inside_sqrt = (
+        self._optimization_parameters.sum_inside_sqrt)
+
+  def create_variables_and_ops(self, table, variable_name, num_hosts,
+                               table_config, table_variables,
+                               load_parameters_ops, retrieve_parameters_ops):
+    optimizer_name = 'Adam'
+    m_initializer = init_ops.zeros_initializer()
+    m_variables = _create_partitioned_variables(
+        name='%s/%s/m' % (variable_name, optimizer_name),
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=m_initializer)
+    v_initializer = init_ops.zeros_initializer()
+    v_variables = _create_partitioned_variables(
+        name='%s/%s/v' % (variable_name, optimizer_name),
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=v_initializer)
+
+    self._table_to_m_variables_dict[table] = m_variables
+    self._table_to_v_variables_dict[table] = v_variables
+
+    for host_id, table_variable, m_variable, v_variable in (zip(
+        range(num_hosts), table_variables,
+        m_variables, v_variables)):
+      with ops.colocate_with(table_variable):
+        load_parameters_op = (
+            tpu_ops.load_tpu_embedding_adam_parameters(
+                parameters=table_variable,
+                momenta=m_variable,
+                velocities=v_variable,
+                table_name=table,
+                num_shards=num_hosts,
+                shard_id=host_id))
+        retrieved_table, retrieved_m, retrieved_v = (
+            tpu_ops.retrieve_tpu_embedding_adam_parameters(
+                table_name=table,
+                num_shards=num_hosts,
+                shard_id=host_id))
+        retrieve_parameters_op = control_flow_ops.group(
+            state_ops.assign(table_variable, retrieved_table),
+            state_ops.assign(m_variable, retrieved_m),
+            state_ops.assign(v_variable, retrieved_v))
+
+      load_parameters_ops.append(load_parameters_op)
+      retrieve_parameters_ops.append(retrieve_parameters_op)
+
+  def get_slot_names(self):
+    return ['m', 'v']
+
+  def get_slot(self, table, name):
+    if name == 'm':
+      return self._table_to_m_variables_dict[table]
+    elif name == 'v':
+      return self._table_to_v_variables_dict[table]
+    else:
+      raise ValueError('Adam has {} as slot names; got {}.'
+                       .format(self.get_slot_names(), name))
+
+
+class _StochasticGradientDescentHandler(_OptimizerHandler):
+  """Handles stochastic gradient descent specific logic."""
+
+  def set_optimization_parameters(self, table_descriptor):
+    (table_descriptor.optimization_parameters.stochastic_gradient_descent
+     .SetInParent())
+
+  def create_variables_and_ops(self, table, variable_name, num_hosts,
+                               table_config, table_variables,
+                               load_parameters_ops, retrieve_parameters_ops):
+    del table_config
+
+    for host_id, table_variable in (zip(
+        range(num_hosts), table_variables)):
+      with ops.colocate_with(table_variable):
+        load_parameters_op = (
+            tpu_ops
+            .load_tpu_embedding_stochastic_gradient_descent_parameters(
+                parameters=table_variable,
+                table_name=table,
+                num_shards=num_hosts,
+                shard_id=host_id))
+        retrieved_table = (
+            tpu_ops
+            .retrieve_tpu_embedding_stochastic_gradient_descent_parameters(
+                table_name=table,
+                num_shards=num_hosts,
+                shard_id=host_id))
+        retrieve_parameters_op = control_flow_ops.group(
+            state_ops.assign(table_variable, retrieved_table))
+
+      load_parameters_ops.append(load_parameters_op)
+      retrieve_parameters_ops.append(retrieve_parameters_op)
+
+  def get_slot_names(self):
+    return []
+
+  def get_slot(self, table, name):
+    raise ValueError('Stochastic gradient descent does not have slot variable.')
+
+
+def _get_optimization_handler(optimization_parameters):
+  if isinstance(optimization_parameters, AdagradParameters):
+    return _AdagradHandler(optimization_parameters)
+  elif isinstance(optimization_parameters, AdamParameters):
+    return _AdamHandler(optimization_parameters)
+  elif isinstance(optimization_parameters, StochasticGradientDescentParameters):
+    return _StochasticGradientDescentHandler(optimization_parameters)
+  else:
+    return NotImplementedError()
+
+
+def _create_ordered_dict(d):
+  """Create an OrderedDict from Dict."""
+  return collections.OrderedDict((k, d[k]) for k in sorted(d))
+
+
+def _create_combiners(table_to_config_dict):
+  return [table_to_config_dict[t].combiner for t in table_to_config_dict]
+
+
+def _create_table_to_features_dict(feature_to_table_dict):
+  """Create mapping from table to a list of its features."""
+  table_to_features_dict_tmp = {}
+  for feature, table in six.iteritems(feature_to_table_dict):
+    if table in table_to_features_dict_tmp:
+      table_to_features_dict_tmp[table].append(feature)
+    else:
+      table_to_features_dict_tmp[table] = [feature]
+
+  table_to_features_dict = collections.OrderedDict()
+  for table in sorted(table_to_features_dict_tmp):
+    table_to_features_dict[table] = sorted(table_to_features_dict_tmp[table])
+  return table_to_features_dict
+
+
+def _create_device_fn(hosts):
+  """Create device_fn() to use with _create_partitioned_variables()."""
+
+  def device_fn(op):
+    """Returns the `device` for `op`."""
+    part_match = re.match(r'.*/part_(\d+)(/|$)', op.name)
+
+    if part_match:
+      idx = int(part_match.group(1))
+    else:
+      raise RuntimeError('Internal Error: '
+                         'Expected %s to contain /part_*.' % op.name)
+
+    device = hosts[idx]
+    return device
+
+  return device_fn
+
+
+def _create_partitioned_variables(name,
+                                  num_hosts,
+                                  vocabulary_size,
+                                  embedding_dimension,
+                                  initializer,
+                                  collections=None):  # pylint: disable=redefined-outer-name
+  """Creates ParitionedVariables based on `num_hosts` for `table`."""
+  # TODO(shizhiw): automatically place embedding lookup elsewhere?
+  if vocabulary_size < num_hosts:
+    raise ValueError('`vocabulary_size`({}) is smaller than `num_hosts`({}). '
+                     'As TPU embedding is not optimized for small tables, '
+                     'please consider other ways for this embedding lookup.')
+
+  return list(variable_scope.get_variable(
+      name,
+      shape=(vocabulary_size, embedding_dimension),
+      partitioner=partitioned_variables.fixed_size_partitioner(num_hosts),
+      dtype=dtypes.float32,
+      initializer=initializer,
+      collections=collections,
+      trainable=False))
+
+
+@ops.RegisterGradient('TPUEmbeddingActivations')
+def _embedding_activations_grad(activations_op, grad_wrt_activations):
+  """Saves the gradient of embedding activations ops in a graph collection."""
+  g = ops.get_default_graph()
+  table_id = activations_op.get_attr('table_id')
+  lookup_id = activations_op.get_attr('lookup_id')
+  table_gradients = g.get_collection_ref(
+      'tpu_embedding_gradients_table_%d' % table_id)
+
+  if not table_gradients:
+    raise RuntimeError(
+        'Gradients for TPUEmbedding have been generated in non-training mode. '
+        'This is not expected. Consider putting your Optimizer.minimize code '
+        'behind the training mode condition check. For Estimator, you can '
+        'do \n\n'
+        '    if mode == tf.estimator.ModeKeys.TRAIN:\n'
+        '        train_op = opt.minimize(loss)\n'
+        '\n')
+
+  table_gradients[lookup_id] = array_ops.identity(grad_wrt_activations)
+  return [
+      # RegisterGradient requires that value be returned for all inputs. Since
+      # the first argument (tpu_gradient_variable_{table_name}) has shape [1],
+      # we will return zeros(shape=[1]). The actual gradient w.r.t. the
+      # embedding activations (grad_wrt_activations) has the same shape as the
+      # activations returned by  embedding_activations.
+      array_ops.zeros(arg.shape, dtype=dtypes.float32)
+      for arg in activations_op.inputs
+  ]
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 8d15c857f8a05e99b0f1106cba2a2d36824e5cac..96b9556e137effcaaa5916b9723142f737a6dc33 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -31,6 +31,7 @@ import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.contrib.tpu.python.tpu import tensor_tracer
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import error_handling
 from tensorflow.contrib.tpu.python.tpu import session_support
@@ -44,11 +45,11 @@ from tensorflow.contrib.training.python.training import hparam
 from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest as data_nest
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.estimator.export import export_output as export_output_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -76,7 +77,6 @@ from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
-
 _INITIAL_LOSS = 1e7
 _ZERO_LOSS = 0.
 _TPU_ESTIMATOR = 'tpu_estimator'
@@ -96,7 +96,6 @@ _REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference'
 # off of using _USE_TPU_KEY.
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
 
-
 # TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
 # only used for per-core based deployments. For per-host based pipelines, if a
 # user returns a Dataset instance it will be automatically wrapped in a
@@ -104,7 +103,6 @@ _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
 # explicitly).
 _WRAP_INPUT_FN_INTO_WHILE_LOOP = False
 
-
 ops.register_proto_function(
     '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
     proto_type=variable_pb2.VariableDef,
@@ -112,6 +110,15 @@ ops.register_proto_function(
     from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
 
 
+def _is_iterable(obj):
+  """A Python 2 and 3 compatible util to check whether `obj` is iterable."""
+  try:
+    iter(obj)
+    return True
+  except TypeError:
+    return False
+
+
 def _create_global_step(graph):
   graph = graph or ops.get_default_graph()
   if training.get_global_step(graph) is not None:
@@ -206,8 +213,8 @@ def _increase_eval_step_op(iterations_per_loop):
   """Returns an op to increase the eval step for TPU evaluation.
 
   Args:
-    iterations_per_loop: Tensor. The number of eval steps running in TPU
-        system before returning to CPU host for each `Session.run`.
+    iterations_per_loop: Tensor. The number of eval steps running in TPU system
+      before returning to CPU host for each `Session.run`.
 
   Returns:
     An operation
@@ -292,15 +299,14 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
       host_calls['host_call'] = host_call
     _OutfeedHostCall.validate(host_calls)
 
-    training_hooks = list(training_hooks or [])
-    evaluation_hooks = list(evaluation_hooks or [])
-    prediction_hooks = list(prediction_hooks or [])
+    training_hooks = tuple(training_hooks or [])
+    evaluation_hooks = tuple(evaluation_hooks or [])
+    prediction_hooks = tuple(prediction_hooks or [])
 
     for hook in training_hooks + evaluation_hooks + prediction_hooks:
       if not isinstance(hook, session_run_hook.SessionRunHook):
-        raise TypeError(
-            'All hooks must be SessionRunHook instances, given: {}'.format(
-                hook))
+        raise TypeError('All hooks must be SessionRunHook instances, given: {}'
+                        .format(hook))
 
     return super(TPUEstimatorSpec, cls).__new__(
         cls,
@@ -330,7 +336,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
     hooks = None
     if self.host_call is not None:
       hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
-    hooks = list(hooks or [])
+    hooks = tuple(hooks or [])
     scaffold = self.scaffold_fn() if self.scaffold_fn else None
     return model_fn_lib.EstimatorSpec(
         mode=self.mode,
@@ -372,7 +378,7 @@ class _OpQueueContext(object):
       yield iterations
 
   def join(self):
-    logging.info('Shutting down %s thread.' % self._name)
+    logging.info('Shutting down %s thread.', self._name)
     self.stop()
     self._thread.join()
 
@@ -407,12 +413,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
                enqueue_ops,
                dequeue_ops,
                run_infeed_loop_on_coordinator=True,
-               rendezvous=None):
+               rendezvous=None,
+               master=None,
+               session_config=None):
     self._master_job = ctx.master_job
     self._enqueue_ops = enqueue_ops
     self._dequeue_ops = dequeue_ops
     self._rendezvous = rendezvous
-
+    self._master = master
+    self._session_config = session_config
     self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
     self._initial_infeed_sleep_secs = (
         ctx.config.tpu_config.initial_infeed_sleep_secs)
@@ -424,11 +433,10 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
   def begin(self):
     logging.info('TPU job name %s', self._master_job)
     self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+    self._init_ops = []
     if self._should_initialize_tpu:
-      self._init_ops = [tpu.initialize_system(job=self._master_job)]
       self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
     else:
-      self._init_ops = []
       self._finalize_ops = []
 
     summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
@@ -470,7 +478,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     return _OpQueueContext(name=name, target=target, args=args)
 
   def after_create_session(self, session, coord):
-    logging.info('Init TPU system')
+    if self._should_initialize_tpu:
+      logging.info('Init TPU system')
+      start = time.time()
+      with ops.Graph().as_default():
+        with tf_session.Session(
+            self._master, config=self._session_config) as sess:
+          sess.run(tpu.initialize_system(job=self._master_job))
+      logging.info('Initialized TPU in %d seconds', time.time() - start)
+
     session.run(self._init_ops,
                 options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
 
@@ -480,6 +496,12 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     self._outfeed_controller = _OpQueueContext(
         name='OutfeedController', target=self._run_outfeed, args=(session,))
 
+    # Enable the worker watchdog to terminate workers on coordinator exit.
+    watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
+    if watchdog_timeout > 0:
+      session_support.start_worker_watchdog(session,
+                                            shutdown_timeout=watchdog_timeout)
+
   def before_run(self, run_context):
     self._feed_error = None
 
@@ -508,10 +530,16 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
 
 class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops, rendezvous=None):
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, rendezvous=None,
+               master=None, session_config=None):
     super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
-        ctx, enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=False,
-        rendezvous=rendezvous)
+        ctx,
+        enqueue_ops,
+        dequeue_ops,
+        run_infeed_loop_on_coordinator=False,
+        rendezvous=rendezvous,
+        master=master,
+        session_config=session_config)
 
   def _create_infeed_controller(self, name, target, args):
     return _OpSignalOnceQueueContext(name=name, target=target, args=args)
@@ -660,8 +688,7 @@ def generate_per_core_enqueue_ops_fn_for_host(
         user_context = tpu_context.TPUContext(
             internal_ctx=ctx,
             input_device=host_device,
-            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal
-        )
+            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal)
         inputs = _Inputs.from_input_fn(input_fn(user_context))
         if inputs.is_dataset:
           raise TypeError(
@@ -694,13 +721,11 @@ def generate_per_host_enqueue_ops_fn_for_host(
   """Generates infeed enqueue ops for per-host input_fn on a single host."""
   captured_infeed_queue = _CapturedObject()
 
-  hooks = []
+  dataset_initializer = None
 
   with ops.device(device):
     user_context = tpu_context.TPUContext(
-        internal_ctx=ctx,
-        input_device=device,
-        invocation_index=host_id)
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
     inputs = _Inputs.from_input_fn(input_fn(user_context))
 
     is_dataset = inputs.is_dataset
@@ -712,11 +737,12 @@ def generate_per_host_enqueue_ops_fn_for_host(
       if batch_axis is not None:
         raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
       inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset, batch_size=ctx.batch_size_for_input_fn,
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
           add_padding=True)
 
     if is_dataset:
-      hooks.append(inputs.dataset_initializer_hook())
+      dataset_initializer = inputs.dataset_initializer()
 
     tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
 
@@ -762,20 +788,18 @@ def generate_per_host_enqueue_ops_fn_for_host(
             'signals': signals,
         }
 
-  return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
 
 
 def generate_per_host_v2_enqueue_ops_fn_for_host(
     ctx, input_fn, inputs_structure_recorder, device, host_id):
   """Generates infeed enqueue ops for per-host input_fn on a single host."""
   captured_infeed_queue = _CapturedObject()
-  hooks = []
+  dataset_initializer = None
 
   with ops.device(device):
     user_context = tpu_context.TPUContext(
-        internal_ctx=ctx,
-        input_device=device,
-        invocation_index=host_id)
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
     inputs = _Inputs.from_input_fn(input_fn(user_context))
 
     is_dataset = inputs.is_dataset
@@ -790,7 +814,7 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
           add_padding=True,
           num_invocations_per_step=ctx.num_of_replicas_per_host)
 
-    hooks.append(inputs.dataset_initializer_hook())
+    dataset_initializer = inputs.dataset_initializer()
     tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
 
   def enqueue_ops_fn():
@@ -851,14 +875,14 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
           'signals': signals,
       }
 
-  return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
 
 
 def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
                                       num_hosts):
   """Generates infeed enqueue ops for one input_fn on all the hosts."""
   captured_infeed_queue = _CapturedObject()
-  hooks = []
+  dataset_initializer = None
   device_0 = ctx.tpu_host_placement_function(host_id=0)
   with ops.device(device_0):
     user_context = tpu_context.TPUContext(
@@ -878,7 +902,7 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
           add_padding=True)
 
     if is_dataset:
-      hooks.append(inputs.dataset_initializer_hook())
+      dataset_initializer = inputs.dataset_initializer()
     num_replicas_per_host = ctx.num_of_replicas_per_host
 
   def tpu_ordinal_function_impl(replica_id):
@@ -929,7 +953,7 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
           'signals': signals,
       }
 
-  return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
 
 
 class _InputPipeline(object):
@@ -956,12 +980,12 @@ class _InputPipeline(object):
   may expect multiple `features` and `labels` tuples one for each core.
 
   TPUEstimator allows various different structures for inputs (namely `features`
-  and `labels`).  `features` can be `Tensor`, dict of string name to `Tensor`,
-  or nested tuples and `labels` could be `None`, `Tensor`, or dict of string
-  name to `Tensor`. TPU infeed/outfeed library expects flattened tensor list.
-  So, `features` and `labels` need to be flattened, before infeed enqueue, and
-  the structure of them needs to be recorded, in order to restore them after
-  infeed dequeue.
+  and `labels`).  Both `features` and `labels` can be any nested sturcture
+  supported by TF nest (namely, dict, tuples, namedtuples or any nested
+  structure of such of Tensors).  `labels` could be `None` as well.
+
+  These are flattened before they are passed to the infeed/outfeed library
+  as that expectes flattend lists.
   """
 
   class InputsStructureRecorder(object):
@@ -1133,7 +1157,7 @@ class _InputPipeline(object):
     """Deploys the input pipeline and record input structure."""
     enqueue_ops = []
     infeed_queues = []
-    all_hooks = []
+    all_dataset_initializers = []
     num_hosts = self._ctx.num_hosts
     tpu_host_placement_fn = self._ctx.tpu_host_placement_function
 
@@ -1165,12 +1189,12 @@ class _InputPipeline(object):
     elif self._ctx.is_input_broadcast_with_iterators():
       # Only calls input_fn in host 0.
       host_device = tpu_host_placement_fn(host_id=0)
-      enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
+      enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
           generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn,
                                             self._inputs_structure_recorder,
                                             num_hosts))
-      all_hooks.extend(hooks)
-      if is_dataset:
+      if dataset_initializer:
+        all_dataset_initializers.append(dataset_initializer)
         run_infeed_loop_on_coordinator = False
         wrap_fn = (
             _wrap_computation_in_while_loop
@@ -1186,17 +1210,16 @@ class _InputPipeline(object):
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
             if self._ctx.is_input_per_host_with_iterators():
-              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
                   generate_per_host_v2_enqueue_ops_fn_for_host(
                       self._ctx, self._input_fn,
                       self._inputs_structure_recorder, host_device, host_id))
             else:
-              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
                   generate_per_host_enqueue_ops_fn_for_host(
                       self._ctx, self._input_fn,
                       self._inputs_structure_recorder, self._batch_axis,
                       host_device, host_id))
-            all_hooks.extend(hooks)
 
             # NOTE(xiejw): We dispatch here based on the return type of the
             # users `input_fn`.
@@ -1210,7 +1233,8 @@ class _InputPipeline(object):
             # handled in TF control flow properly. In this case, we will use
             # python loop to enqueue the data into TPU system.  This may be
             # slow compared to the previous case.
-            if is_dataset:
+            if dataset_initializer:
+              all_dataset_initializers.append(dataset_initializer)
               run_infeed_loop_on_coordinator = False
               wrap_fn = (
                   _wrap_computation_in_while_loop
@@ -1225,7 +1249,9 @@ class _InputPipeline(object):
     # dequeue is dtypes and types. So, any one can be used. Here, grab the
     # first one.
     self._infeed_queue = infeed_queues[0]
-    return enqueue_ops, all_hooks, run_infeed_loop_on_coordinator
+    return enqueue_ops, [
+        util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers)
+    ], run_infeed_loop_on_coordinator
 
   def _validate_input_pipeline(self):
     """Validates the input pipeline.
@@ -1313,9 +1339,15 @@ class _ModelFnWrapper(object):
 
       captured_training_hooks.capture(estimator_spec.training_hooks)
 
+      tracing_ops = []
+      if tensor_tracer.TensorTracer.is_enabled():
+        tt = tensor_tracer.TensorTracer()
+        loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(), loss,
+                                         self._ctx.num_replicas)
+
       # We must run train_op to update the variables prior to running the
       # outfeed.
-      with ops.control_dependencies([train_op]):
+      with ops.control_dependencies([train_op]+tracing_ops):
         host_call_outfeed_ops = []
         if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
             and estimator_spec.host_call is not None):
@@ -1443,21 +1475,19 @@ class _ModelFnWrapper(object):
       raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
 
     for (key, tensor) in predictions.items():
-      if tensor.shape[0].value is None:
+      if tensor.shape.dims[0].value is None:
         raise ValueError(
             'The tensor with key ({}) in TPUEstimatorSpec.predictions has '
-            'dynamic shape (should be static). Tensor: {}'.format(
-                key, tensor))
+            'dynamic shape (should be static). Tensor: {}'.format(key, tensor))
     return predictions
 
-  def _validate_model_features_and_labels(self,
-                                          features,
-                                          labels,
+  def _validate_model_features_and_labels(self, features, labels,
                                           is_export_mode):
     """Validates that the features and labels for the model function are valid.
 
     A valid features/labels object is the one with:
-    - Type: Tensor or a dictionary of Tensors
+    - Type: A tensor or any nested structure of tensors supported by TF nest,
+        namely nested dictionary, tuple, namedtuple, or sequence of tensors.
     - Static shape if is_export_mode is False.
 
     Args:
@@ -1472,11 +1502,6 @@ class _ModelFnWrapper(object):
 
     def validate(obj, obj_name):
       """Helper validate function."""
-      if not isinstance(obj, ops.Tensor) and not isinstance(obj, dict):
-        raise TypeError(
-            'The {} to the model returned by input_fn must be either a Tensor '
-            'or a dictionary of Tensors. {}: {}'.format(obj_name, obj_name,
-                                                        obj))
       if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode):
         return
       if isinstance(obj, ops.Tensor):
@@ -1485,14 +1510,11 @@ class _ModelFnWrapper(object):
               'The {} to the model returned by input_fn must have static shape.'
               ' Tensor: {}'.format(obj_name, obj))
       else:
-        for (key, value) in obj.items():
-          flattened_tensors = data_nest.flatten(value)
-          for tensor in flattened_tensors:
-            if not tensor.get_shape().is_fully_defined():
-              raise ValueError(
-                  'The {} to the model returned by input_fn must have static '
-                  'shape. Key: \'{}\', Tensor: {}'.format(
-                      obj_name, key, tensor))
+        for tensor in data_nest.flatten(obj):
+          if not tensor.get_shape().is_fully_defined():
+            raise ValueError(
+                ('The {} to the model returned by input_fn must have static '
+                 'shape. Tensor: {}').format(obj_name, tensor))
 
     validate(features, 'features')
     if labels is not None:
@@ -1713,7 +1735,8 @@ class _OutfeedHostCall(object):
     dequeue_ops_by_name = {}
     pos = 0
     for name in self._names:
-      dequeue_ops_by_name[name] = dequeue_ops[pos:pos+len(self._tensors[name])]
+      dequeue_ops_by_name[name] = dequeue_ops[pos:pos +
+                                              len(self._tensors[name])]
       pos += len(self._tensors[name])
 
     # It is assumed evaluation always happens on single host TPU system. So,
@@ -1794,19 +1817,18 @@ class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
         summary_writer=summary_writer)
 
   def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
-    global_steps_per_sec = elapsed_steps / elapsed_time
-    examples_per_sec = self._batch_size * global_steps_per_sec
+    global_step_per_sec = elapsed_steps / elapsed_time
+    examples_per_sec = self._batch_size * global_step_per_sec
     if self._summary_writer is not None:
       global_step_summary = Summary(value=[
-          Summary.Value(tag='global_steps/sec',
-                        simple_value=global_steps_per_sec)
+          Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec)
       ])
       example_summary = Summary(value=[
           Summary.Value(tag='examples/sec', simple_value=examples_per_sec)
       ])
       self._summary_writer.add_summary(global_step_summary, global_step)
       self._summary_writer.add_summary(example_summary, global_step)
-    logging.info('global_steps/sec: %g', global_steps_per_sec)
+    logging.info('global_step/sec: %g', global_step_per_sec)
     logging.info('examples/sec: %g', examples_per_sec)
 
 
@@ -1858,7 +1880,7 @@ class TPUEstimator(estimator_lib.Estimator):
   the following discussion on TPU evaluation does not apply.
 
   `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
-  `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. (See
+  `tensors` could be a list of any nested structure of `Tensor`s (See
   `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
   a dict from metric string name to the result of calling a metric function,
   namely a `(metric_tensor, update_op)` tuple.
@@ -2043,8 +2065,9 @@ class TPUEstimator(estimator_lib.Estimator):
 
     Args:
       model_fn: Model function as required by `Estimator` which returns
-      EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
-      and `prediction_hooks` must not capure any TPU Tensor inside the model_fn.
+        EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
+        and `prediction_hooks` must not capure any TPU Tensor inside the
+        model_fn.
       model_dir: Directory to save model parameters, graph and etc. This can
         also be used to load checkpoints from the directory into a estimator to
         continue training a previously saved model. If `None`, the model_dir in
@@ -2055,19 +2078,18 @@ class TPUEstimator(estimator_lib.Estimator):
         `input_fn` and `model_fn`.  Keys are names of parameters, values are
         basic python types. There are reserved keys for `TPUEstimator`,
         including 'batch_size'.
-      use_tpu: A bool indicating whether TPU support is enabled. Currently,
-        - TPU training and evaluation respect this bit, but eval_on_tpu can
-          override execution of eval. See below.
-        - Predict still happens on CPU.
+      use_tpu: A bool indicating whether TPU support is enabled. Currently, -
+        TPU training and evaluation respect this bit, but eval_on_tpu can
+        override execution of eval. See below. - Predict still happens on CPU.
       train_batch_size: An int representing the global training batch size.
         TPUEstimator transforms this global batch size to a per-shard batch
         size, as params['batch_size'], when calling `input_fn` and `model_fn`.
-        Cannot be `None` if `use_tpu` is `True`.
-        Must be divisible by total number of replicas.
-      eval_batch_size: An int representing evaluation batch size.
-        Must be divisible by total number of replicas.
-      predict_batch_size: An int representing the prediction batch size.
-        Must be divisible by total number of replicas.
+        Cannot be `None` if `use_tpu` is `True`. Must be divisible by total
+        number of replicas.
+      eval_batch_size: An int representing evaluation batch size. Must be
+        divisible by total number of replicas.
+      predict_batch_size: An int representing the prediction batch size. Must be
+        divisible by total number of replicas.
       batch_axis: A python tuple of int values describing how each tensor
         produced by the Estimator `input_fn` should be split across the TPU
         compute shards. For example, if your input_fn produced (images, labels)
@@ -2083,11 +2105,10 @@ class TPUEstimator(estimator_lib.Estimator):
       export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
         serving on TPU besides the one on CPU.
       warm_start_from: Optional string filepath to a checkpoint or SavedModel to
-                       warm-start from, or a `tf.estimator.WarmStartSettings`
-                       object to fully configure warm-starting.  If the string
-                       filepath is provided instead of a `WarmStartSettings`,
-                       then all variables are warm-started, and it is assumed
-                       that vocabularies and Tensor names are unchanged.
+        warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
+        configure warm-starting.  If the string filepath is provided instead of
+        a `WarmStartSettings`, then all variables are warm-started, and it is
+        assumed that vocabularies and Tensor names are unchanged.
 
     Raises:
       ValueError: `params` has reserved keys already.
@@ -2148,10 +2169,8 @@ class TPUEstimator(estimator_lib.Estimator):
     # All properties passed to _InternalTPUContext are immutable.
     # pylint: disable=protected-access
     self._ctx = tpu_context._get_tpu_context(
-        self._config, train_batch_size,
-        eval_batch_size, predict_batch_size,
-        use_tpu,
-        eval_on_tpu)
+        self._config, train_batch_size, eval_batch_size, predict_batch_size,
+        use_tpu, eval_on_tpu)
 
     self._export_to_tpu = export_to_tpu
 
@@ -2162,7 +2181,6 @@ class TPUEstimator(estimator_lib.Estimator):
                                builder,
                                input_receiver_fn_map,
                                checkpoint_path,
-                               strip_default_attrs,
                                save_variables=True,
                                mode=model_fn_lib.ModeKeys.PREDICT,
                                export_tags=None,
@@ -2173,38 +2191,37 @@ class TPUEstimator(estimator_lib.Estimator):
           'when `export_to_tpu` is `True`; '
           'got {}.'.format(mode))
 
-    (super(TPUEstimator, self).
-     _add_meta_graph_for_mode(builder,
-                              input_receiver_fn_map,
-                              checkpoint_path,
-                              strip_default_attrs,
-                              save_variables,
-                              mode=mode,
-                              export_tags=export_tags,
-                              check_variables=check_variables))
+    (super(TPUEstimator, self)._add_meta_graph_for_mode(
+        builder,
+        input_receiver_fn_map,
+        checkpoint_path,
+        save_variables,
+        mode=mode,
+        export_tags=export_tags,
+        check_variables=check_variables))
 
     if self._export_to_tpu:
-      input_receiver_fn_map = {_REWRITE_FOR_INFERENCE_MODE:
-                               input_receiver_fn_map[mode]}
+      input_receiver_fn_map = {
+          _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode]
+      }
       export_tags = [tag_constants.SERVING, tag_constants.TPU]
       mode = _REWRITE_FOR_INFERENCE_MODE
       # See b/110052256 for why `check_variables` is `False`.
-      (super(TPUEstimator, self).
-       _add_meta_graph_for_mode(builder,
-                                input_receiver_fn_map,
-                                checkpoint_path,
-                                strip_default_attrs,
-                                save_variables=False,
-                                mode=mode,
-                                export_tags=export_tags,
-                                check_variables=False))
+      (super(TPUEstimator, self)._add_meta_graph_for_mode(
+          builder,
+          input_receiver_fn_map,
+          checkpoint_path,
+          save_variables=False,
+          mode=mode,
+          export_tags=export_tags,
+          check_variables=False))
 
   def _call_model_fn(self, features, labels, mode, config):
     if mode == _REWRITE_FOR_INFERENCE_MODE:
       return self._call_model_fn_for_inference(features, labels, mode, config)
     else:
-      return super(TPUEstimator, self)._call_model_fn(
-          features, labels, mode, config)
+      return super(TPUEstimator, self)._call_model_fn(features, labels, mode,
+                                                      config)
 
   def _call_model_fn_for_inference(self, features, labels, mode, config):
     """Wraps `_call_model_fn` for `export_savedmodel`."""
@@ -2217,7 +2234,7 @@ class TPUEstimator(estimator_lib.Estimator):
     def computation():
       """Compute tpu tensors used in export_outputs.
 
-      Passed to rewrite_for_inference so that model_fn will be called under
+      Passed to rewrite so that model_fn will be called under
       the rewriting contexts. Only tpu tensors are returned, but export_outputs
       and scaffold are captured.
 
@@ -2226,7 +2243,7 @@ class TPUEstimator(estimator_lib.Estimator):
          outside_compilation.
       """
       # We should only call model fn once and it should be inside `computation`
-      # so that building the graph will happen under `rewrite_for_inference`.
+      # so that building the graph will happen under `rewrite`.
       mode = model_fn_lib.ModeKeys.PREDICT
       estimator_spec = self._call_model_fn(features, labels, mode, config)
 
@@ -2234,8 +2251,7 @@ class TPUEstimator(estimator_lib.Estimator):
       # from `computation` for rewriting.
       tensors_dict = collections.OrderedDict(
           (k, _export_output_to_tensors(v))
-          for k, v in six.iteritems(estimator_spec.export_outputs)
-      )
+          for k, v in six.iteritems(estimator_spec.export_outputs))
       tensors = nest.flatten(tensors_dict)
       tpu_tensors = [t for t in tensors if _is_tpu_tensor(t)]
 
@@ -2244,7 +2260,7 @@ class TPUEstimator(estimator_lib.Estimator):
       capture.capture((estimator_spec, tensors_dict, tensors))
       return tpu_tensors
 
-    tpu_tensors_on_cpu = tpu.rewrite_for_inference(computation)
+    tpu_tensors_on_cpu = tpu.rewrite(computation)
     estimator_spec, tensors_dict, tensors = capture.get()
 
     # Reconstruct `tensors`, but with `tpu_tensors` replaced with
@@ -2258,9 +2274,9 @@ class TPUEstimator(estimator_lib.Estimator):
       else:
         # Only fetching `tpu_tensors_on_cpu` does not trigger
         # TPU computation and blocks, so we add the control dependency here.
-        control_inputs = (tpu_tensors_on_cpu
-                          if isinstance(tpu_tensors_on_cpu, (list, tuple))
-                          else (tpu_tensors_on_cpu,))
+        control_inputs = (
+            tpu_tensors_on_cpu if _is_iterable(tpu_tensors_on_cpu) else
+            (tpu_tensors_on_cpu,))
         with ops.control_dependencies(control_inputs):
           new_tensors.append(array_ops.identity(t))
 
@@ -2270,8 +2286,7 @@ class TPUEstimator(estimator_lib.Estimator):
     export_outputs = estimator_spec.export_outputs
     new_export_outputs = collections.OrderedDict(
         (k, _clone_export_output_with_tensors(export_outputs[k], v))
-        for k, v in six.iteritems(new_tensors_dict)
-    )
+        for k, v in six.iteritems(new_tensors_dict))
 
     return estimator_spec._replace(export_outputs=new_export_outputs)
 
@@ -2335,9 +2350,9 @@ class TPUEstimator(estimator_lib.Estimator):
       mode: ModeKeys
 
     Returns:
-      Either features or (features, labels) where features and labels are:
-        features - `Tensor` or dictionary of string feature name to `Tensor`.
-        labels - `Tensor` or dictionary of `Tensor` with labels.
+      In TPU mode, returns an input_fn to be called later in model_fn.
+      Otherwise, calls the input_fn and returns either fatures or
+        (features, labels).
 
     Raises:
       ValueError: if input_fn takes invalid arguments or does not have `params`.
@@ -2365,8 +2380,8 @@ class TPUEstimator(estimator_lib.Estimator):
       # input_fn for use_tpu=True/False.
       batch_size_for_input_fn = ctx.batch_size_for_input_fn
       if batch_size_for_input_fn is not None:
-        _add_item_to_params(kwargs['params'],
-                            _BATCH_SIZE_KEY, batch_size_for_input_fn)
+        _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY,
+                            batch_size_for_input_fn)
 
       # For export_savedmodel, input_fn is never passed to Estimator. So,
       # `is_export_mode` must be False.
@@ -2411,24 +2426,32 @@ class TPUEstimator(estimator_lib.Estimator):
     self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
     try:
       return super(TPUEstimator, self).train(
-          input_fn=input_fn, hooks=hooks, steps=steps, max_steps=max_steps,
-          saving_listeners=saving_listeners
-      )
+          input_fn=input_fn,
+          hooks=hooks,
+          steps=steps,
+          max_steps=max_steps,
+          saving_listeners=saving_listeners)
     except Exception:  # pylint: disable=broad-except
       rendezvous.record_error('training_loop', sys.exc_info())
     finally:
       rendezvous.record_done('training_loop')
       rendezvous.raise_errors()
 
-  def evaluate(self, input_fn, steps=None, hooks=None, checkpoint_path=None,
+  def evaluate(self,
+               input_fn,
+               steps=None,
+               hooks=None,
+               checkpoint_path=None,
                name=None):
     rendezvous = error_handling.ErrorRendezvous(num_sources=3)
     self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
     try:
       return super(TPUEstimator, self).evaluate(
-          input_fn, steps=steps, hooks=hooks, checkpoint_path=checkpoint_path,
-          name=name
-      )
+          input_fn,
+          steps=steps,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          name=name)
     except Exception:  # pylint: disable=broad-except
       rendezvous.record_error('evaluation_loop', sys.exc_info())
     finally:
@@ -2480,17 +2503,19 @@ class TPUEstimator(estimator_lib.Estimator):
 
         # examples_hook is added to training_hooks for both CPU and TPU
         # execution.
-        examples_hook = ExamplesPerSecondHook(
-            ctx.global_batch_size,
-            output_dir=self.model_dir,
-            every_n_steps=self._log_every_n_steps)
+        if self._log_every_n_steps is not None:
+          examples_hook = ExamplesPerSecondHook(
+              ctx.global_batch_size,
+              output_dir=self.model_dir,
+              every_n_steps=self._log_every_n_steps)
 
         if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
           logging.info('Running %s on CPU', mode)
           estimator_spec = model_fn_wrapper.call_without_tpu(
               features, labels, is_export_mode=is_export_mode)
-          estimator_spec = estimator_spec._replace(
-              training_hooks=estimator_spec.training_hooks + (examples_hook,))
+          if self._log_every_n_steps is not None:
+            estimator_spec = estimator_spec._replace(
+                training_hooks=estimator_spec.training_hooks + (examples_hook,))
           return estimator_spec
 
         assert labels is None, '`labels` passed to `model_fn` must be `None`.'
@@ -2522,28 +2547,24 @@ class TPUEstimator(estimator_lib.Estimator):
           if shutdown_mode:
             if shutdown_mode == 'shutdown_worker':
               finalizer_hooks = [
-                  session_support.ShutdownLameWorkers(timeout_ms=60*1000),
+                  session_support.ShutdownLameWorkers(timeout_ms=60 * 1000),
               ]
             elif shutdown_mode == 'shutdown_computation':
               finalizer_hooks = [
-                  session_support.RestartComputation(timeout_ms=60*1000),
+                  session_support.RestartComputation(timeout_ms=60 * 1000),
               ]
             else:
-              raise ValueError('Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' %
-                               shutdown_mode)
+              raise ValueError(
+                  'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode)
 
-            shutdown_hooks.append(session_support.GracefulShutdownHook(
-                checkpoint_prefix=self.model_dir + '/model.ckpt',
-                on_shutdown_hooks=finalizer_hooks
-            ))
+            shutdown_hooks.append(
+                session_support.GracefulShutdownHook(
+                    checkpoint_prefix=self.model_dir + '/model.ckpt',
+                    on_shutdown_hooks=finalizer_hooks))
 
           with ops.control_dependencies([loss]):
             global_step = array_ops.identity(training.get_global_step())
           hooks = input_hooks + shutdown_hooks
-          logging_hook_frequency = (    # Divide and round up
-              (self._log_every_n_steps +
-               self._config.tpu_config.iterations_per_loop - 1) //
-              self._config.tpu_config.iterations_per_loop)
           hooks.extend([
               TPUInfeedOutfeedSessionHook(
                   ctx,
@@ -2552,18 +2573,25 @@ class TPUEstimator(estimator_lib.Estimator):
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator),
                   rendezvous=self._rendezvous[mode],
+                  master=self._config.master,
+                  session_config=self._session_config,
               ),
-              InstallSignalHandlerHook(),
-              training.LoggingTensorHook(
-                  {
-                      'loss': array_ops.identity(loss),
-                      'step': global_step,
-                  },
-                  every_n_iter=logging_hook_frequency)
+              InstallSignalHandlerHook()
           ])
-          examples_hook._set_steps_per_run(   # pylint: disable=protected-access
-              self._config.tpu_config.iterations_per_loop)
-          hooks.append(examples_hook)
+          if self._log_every_n_steps is not None:
+            logging_hook_frequency = (  # Divide and round up
+                (self._log_every_n_steps +
+                 self._config.tpu_config.iterations_per_loop - 1) //
+                self._config.tpu_config.iterations_per_loop)
+            hooks.append(
+                training.LoggingTensorHook({
+                    'loss': array_ops.identity(loss),
+                    'step': global_step,
+                },
+                                           every_n_iter=logging_hook_frequency))
+            examples_hook._set_steps_per_run(  # pylint: disable=protected-access
+                self._config.tpu_config.iterations_per_loop)
+            hooks.append(examples_hook)
 
           if training_hooks:
             hooks.extend(training_hooks)
@@ -2576,7 +2604,7 @@ class TPUEstimator(estimator_lib.Estimator):
                 save_secs=self._config.save_checkpoints_secs,
                 save_steps=self._config.save_checkpoints_steps,
                 scaffold=scaffold)
-            checkpoint_hook._set_steps_per_run(   # pylint: disable=protected-access
+            checkpoint_hook._set_steps_per_run(  # pylint: disable=protected-access
                 self._config.tpu_config.iterations_per_loop)
             chief_hooks.append(checkpoint_hook)
 
@@ -2602,15 +2630,10 @@ class TPUEstimator(estimator_lib.Estimator):
           total_loss, host_calls, scaffold, eval_hooks = _eval_on_tpu_system(
               ctx, model_fn_wrapper, dequeue_fn)
           iterations_per_loop_var = _create_or_get_iterations_per_loop()
-          mean_loss = math_ops.div(total_loss,
-                                   math_ops.cast(
-                                       iterations_per_loop_var,
-                                       dtype=total_loss.dtype))
-
-          # Creates a dummy metric update_op for all metrics. Estimator expects
-          # all metrics in eval_metric_ops have update_op and calls them one by
-          # one. The real metric update_ops are invoked in a separated thread.
-          # So, here give Estimator the dummy op for all metrics.
+          mean_loss = math_ops.div(
+              total_loss,
+              math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
+
           with ops.control_dependencies([mean_loss]):
             # After TPU evaluation computation is done (the mean_loss tensor),
             # reads all variables back from TPU and updates the eval step
@@ -2618,16 +2641,30 @@ class TPUEstimator(estimator_lib.Estimator):
             internal_ops_to_run = _sync_variables_ops(ctx)
             internal_ops_to_run.append(
                 _increase_eval_step_op(iterations_per_loop_var))
-            with ops.control_dependencies(internal_ops_to_run):
-              dummy_update_op = control_flow_ops.no_op()
 
           host_call_ret = host_calls.create_tpu_hostcall()
           eval_metric_ops = {}
           eval_update_ops = []
 
-          for k, v in host_call_ret.get('eval_metrics', {}).items():
-            eval_metric_ops[k] = (v[0], dummy_update_op)
-            eval_update_ops.append(v[1])
+          eval_metrics = host_call_ret.get('eval_metrics', {})
+          if eval_metrics:
+            # Creates a dummy metric update_op for all metrics. Estimator
+            # expects all metrics in `eval_metric_ops` have update_op and calls
+            # them one by one. The real metric update_ops are invoked in a
+            # separated thread. So, here give Estimator the dummy op for all
+            # metrics.
+            with ops.control_dependencies(internal_ops_to_run):
+              dummy_update_op = control_flow_ops.no_op()
+
+            for k, v in eval_metrics.items():
+              eval_metric_ops[k] = (v[0], dummy_update_op)
+              eval_update_ops.append(v[1])
+          else:
+            # If no eval metrics are passed, create an identity node for the
+            # loss and add `internal_ops_to_run` to its dependencies. So
+            # `internal_ops_to_run` can be executed.
+            with ops.control_dependencies(internal_ops_to_run):
+              mean_loss = array_ops.identity(mean_loss)
 
           if 'host_call' not in host_call_ret:
             host_ops = []
@@ -2640,8 +2677,10 @@ class TPUEstimator(estimator_lib.Estimator):
                   eval_update_ops + host_ops,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode]),
-          ] + input_hooks
+                  rendezvous=self._rendezvous[mode],
+                  master=self._config.evaluation_master,
+                  session_config=self._session_config,
+              )] + input_hooks
 
           if eval_hooks:
             hooks.extend(eval_hooks)
@@ -2696,7 +2735,8 @@ class TPUEstimator(estimator_lib.Estimator):
 
         predictions = host_call_ret['predictions']
         _verify_cross_hosts_transfer_size(
-            predictions, message=(
+            predictions,
+            message=(
                 'The estimated size for TPUEstimatorSpec.predictions is too '
                 'large.'))
         signals = host_call_ret['signals']
@@ -2711,7 +2751,9 @@ class TPUEstimator(estimator_lib.Estimator):
         hooks = [
             _StoppingPredictHook(scalar_stopping_signal),
             TPUInfeedOutfeedSessionHookForPrediction(
-                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode]),
+                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
+                master=self._config.master,
+                session_config=self._session_config),
         ] + input_hooks
 
         if prediction_hooks:
@@ -2742,7 +2784,8 @@ def _export_output_to_tensors(export_output):
 
   Args:
     export_output: an `ExportOutput` object such as `ClassificationOutput`,
-            `RegressionOutput`, or `PredictOutput`.
+      `RegressionOutput`, or `PredictOutput`.
+
   Returns:
     a list of tensors used in export_output.
 
@@ -2755,7 +2798,7 @@ def _export_output_to_tensors(export_output):
   elif isinstance(export_output, export_output_lib.RegressionOutput):
     return [export_output.value]
   elif isinstance(export_output, export_output_lib.PredictOutput):
-    return export_output.outputs.values()
+    return list(export_output.outputs.values())
   else:
     raise ValueError(
         '`export_output` must be have type `ClassificationOutput`, '
@@ -2767,7 +2810,7 @@ def _clone_export_output_with_tensors(export_output, tensors):
 
   Args:
     export_output: an `ExportOutput` object such as `ClassificationOutput`,
-            `RegressionOutput`, or `PredictOutput`.
+      `RegressionOutput`, or `PredictOutput`.
     tensors: a list of `Tensors` used to construct a new `export_output`.
 
   Returns:
@@ -2804,9 +2847,8 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
 
   def multi_tpu_eval_steps_on_single_shard():
-    return training_loop.repeat(
-        iterations_per_loop_var,
-        single_tpu_eval_step, [_ZERO_LOSS])
+    return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step,
+                                [_ZERO_LOSS])
 
   (loss,) = tpu.shard(
       multi_tpu_eval_steps_on_single_shard,
@@ -2828,9 +2870,8 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
        model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
 
   def multi_tpu_train_steps_on_single_shard():
-    return training_loop.repeat(
-        iterations_per_loop_var,
-        single_tpu_train_step, [_INITIAL_LOSS])
+    return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step,
+                                [_INITIAL_LOSS])
 
   (loss,) = tpu.shard(
       multi_tpu_train_steps_on_single_shard,
@@ -2987,6 +3028,12 @@ class _CapturingContext(control_flow_ops.ControlFlowContext):
     control_flow_ops.ControlFlowContext.__init__(self)
     self._message = message
 
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    # pylint: disable=useless-super-delegation
+    # NOTE(slebedev): the method is required by `ControlFlowContext`.
+    super(_CapturingContext, self).to_control_flow_context_def(
+        context_def, export_scope)
+
   def AddOp(self, op):  # pylint: disable=invalid-name
     for c in op.inputs:
       if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
@@ -3027,7 +3074,7 @@ class _Inputs(object):
   @staticmethod
   def from_input_fn(return_values):
     """Returns an `_Inputs` instance according to `input_fn` return value."""
-    if isinstance(return_values, dataset_ops.Dataset):
+    if isinstance(return_values, dataset_ops.DatasetV2):
       dataset = return_values
       return _Inputs(dataset=dataset)
 
@@ -3047,23 +3094,19 @@ class _Inputs(object):
     """Returns True if the return value from input_fn is Dataset."""
     return self._dataset is not None
 
-  def dataset_initializer_hook(self):
-    """Returns a `SessionRunHook` to initialize this dataset.
+  def dataset_initializer(self):
+    """Returns the dataset's initializer.
 
-    This must be called before `features_and_labels`.
+    The initializer must be run before calling `features_and_labels`.
     """
-    iterator = self._dataset.make_initializable_iterator()
-    # pylint: disable=protected-access
-    hook = estimator_util._DatasetInitializerHook(iterator)
-    # pylint: enable=protected-access
-    self._iterator = iterator
-    return hook
+    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
+    return self._iterator.initializer
 
   def features_and_labels(self):
     """Gets `features` and `labels`."""
     if self.is_dataset:
       if self._iterator is None:
-        raise RuntimeError('Internal error: Must call dataset_initializer_hook '
+        raise RuntimeError('Internal error: Must run dataset_initializer '
                            'before calling features_and_labels(). Please file '
                            'a bug!')
       return _Inputs._parse_inputs(self._iterator.get_next())
@@ -3180,8 +3223,8 @@ class _InputsWithStoppingSignals(_Inputs):
 
       if add_padding:
         padding_mask, features, labels = (
-            _PaddingSignals.pad_features_and_labels(
-                features, labels, batch_size))
+            _PaddingSignals.pad_features_and_labels(features, labels,
+                                                    batch_size))
 
         new_input_dict['features'] = features
         if labels is not None:
@@ -3194,7 +3237,8 @@ class _InputsWithStoppingSignals(_Inputs):
         padding_mask = None
 
       new_input_dict['signals'] = _StopSignals(
-          stop=stop, batch_size=batch_size, padding_mask=padding_mask).as_dict()
+          stop=stop, batch_size=batch_size,
+          padding_mask=padding_mask).as_dict()
 
       return new_input_dict
 
@@ -3237,8 +3281,8 @@ class _StopSignals(object):
     if isinstance(scalar_stopping_signal, ops.Tensor):
       # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
       # way to express the bool check whether scalar_stopping_signal is True.
-      return math_ops.logical_and(
-          scalar_stopping_signal, _StopSignals.STOPPING_SIGNAL)
+      return math_ops.logical_and(scalar_stopping_signal,
+                                  _StopSignals.STOPPING_SIGNAL)
     else:
       # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
       # the graph anymore. Here, we use pure Python.
@@ -3257,7 +3301,8 @@ class _PaddingSignals(object):
     batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
 
     check_greater = check_ops.assert_greater_equal(
-        batch_size_tensor, real_batch_size,
+        batch_size_tensor,
+        real_batch_size,
         data=(batch_size_tensor, real_batch_size),
         message='The real batch size should not be greater than batch_size.')
 
@@ -3281,8 +3326,8 @@ class _PaddingSignals(object):
     if labels is not None:
       labels = nest_pad(labels)
 
-    padding_mask = _PaddingSignals._padding_mask(
-        real_batch_size, missing_count, batch_size)
+    padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count,
+                                                 batch_size)
 
     return padding_mask, features, labels
 
@@ -3330,20 +3375,20 @@ class _PaddingSignals(object):
 
   @staticmethod
   def _find_any_tensor(batch_features):
-    tensors = [x for x in nest.flatten(batch_features)
-               if isinstance(x, ops.Tensor)]
+    tensors = [
+        x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor)
+    ]
     if not tensors:
       raise ValueError('Cannot find any Tensor in features dict.')
     return tensors[0]
 
   @staticmethod
   def _padding_mask(real_batch_size, missing_count, batch_size):
-    padding_mask = array_ops.concat(
-        [
-            array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
-            array_ops.ones((missing_count,), dtype=dtypes.int32)
-        ],
-        axis=0)
+    padding_mask = array_ops.concat([
+        array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
+        array_ops.ones((missing_count,), dtype=dtypes.int32)
+    ],
+                                    axis=0)
     padding_mask.set_shape((batch_size,))
     return padding_mask
 
@@ -3361,9 +3406,11 @@ def _verify_cross_hosts_transfer_size(tensor_dict, message):
         '{} The transfer size is larger than the protobuf limit. Please '
         'consider to use Tensors with smaller shapes or reduce batch '
         'size. Given:\n'
-        '{}'.format(message, '\n'.join([
-            ' -- Key: {}, Shape: {}'.format(k, v)
-            for k, v in tensor_structure.items()])))
+        '{}'.format(
+            message, '\n'.join([
+                ' -- Key: {}, Shape: {}'.format(k, v)
+                for k, v in tensor_structure.items()
+            ])))
 
 
 def _add_item_to_params(params, key, value):
@@ -3392,8 +3439,8 @@ def export_estimator_savedmodel(estimator,
     estimator: `Estimator` with which model has been trained.
     export_dir_base: A string containing a directory in which to create
       timestamped subdirectories containing exported SavedModels.
-    serving_input_receiver_fn: A function that takes no argument and
-      returns a `ServingInputReceiver` or `TensorServingInputReceiver`.
+    serving_input_receiver_fn: A function that takes no argument and returns a
+      `ServingInputReceiver` or `TensorServingInputReceiver`.
     assets_extra: A dict specifying how to populate the assets.extra directory
       within the exported SavedModel, or `None` if no extra assets are needed.
     as_text: whether to write the SavedModel proto in text format.
@@ -3417,7 +3464,5 @@ def export_estimator_savedmodel(estimator,
       eval_batch_size=2048,  # Does not matter.
   )
   return est.export_savedmodel(export_dir_base, serving_input_receiver_fn,
-                               assets_extra,
-                               as_text,
-                               checkpoint_path,
+                               assets_extra, as_text, checkpoint_path,
                                strip_default_attrs)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
index bd530fdc3aaf585680ac94e1535051ae4156a925..e3ea983abfd24d03c964fbc647b56262e15e0a96 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.tpu.python.tpu import tpu_estimator
-from tensorflow.python import data as dataset_lib
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
@@ -34,10 +34,10 @@ def make_input_fn(num_samples):
 
   def input_fn(params):
     batch_size = params['batch_size']
-    da1 = dataset_lib.Dataset.from_tensor_slices(a)
-    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+    da1 = dataset_ops.Dataset.from_tensor_slices(a)
+    da2 = dataset_ops.Dataset.from_tensor_slices(b)
 
-    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset_ops.Dataset.zip((da1, da2))
     dataset = dataset.map(lambda fa, fb: {'a': fa, 'b': fb})
     dataset = dataset.batch(batch_size)
     return dataset
@@ -50,10 +50,10 @@ def make_input_fn_with_labels(num_samples):
 
   def input_fn(params):
     batch_size = params['batch_size']
-    da1 = dataset_lib.Dataset.from_tensor_slices(a)
-    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+    da1 = dataset_ops.Dataset.from_tensor_slices(a)
+    da2 = dataset_ops.Dataset.from_tensor_slices(b)
 
-    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset_ops.Dataset.zip((da1, da2))
     dataset = dataset.map(lambda fa, fb: ({'a': fa}, fb))
     dataset = dataset.batch(batch_size)
     return dataset
@@ -71,7 +71,7 @@ class TPUEstimatorStoppingSignalsTest(test.TestCase):
 
     with ops.Graph().as_default():
       dataset = input_fn(params)
-      features = dataset.make_one_shot_iterator().get_next()
+      features = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
       # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape.
       self.assertIsNone(features['a'].shape.as_list()[0])
@@ -100,7 +100,7 @@ class TPUEstimatorStoppingSignalsTest(test.TestCase):
     with ops.Graph().as_default():
       dataset = input_fn(params)
       inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size)
-      hook = inputs.dataset_initializer_hook()
+      dataset_initializer = inputs.dataset_initializer()
       features, _ = inputs.features_and_labels()
       signals = inputs.signals()
 
@@ -108,8 +108,7 @@ class TPUEstimatorStoppingSignalsTest(test.TestCase):
       self.assertIsNone(features['a'].shape.as_list()[0])
 
       with session.Session() as sess:
-        hook.begin()
-        hook.after_create_session(sess, coord=None)
+        sess.run(dataset_initializer)
 
         result, evaluated_signals = sess.run([features, signals])
         self.assertAllEqual(a[:batch_size], result['a'])
@@ -143,7 +142,7 @@ class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
       dataset = input_fn(params)
       inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
                                                         add_padding=True)
-      hook = inputs.dataset_initializer_hook()
+      dataset_initializer = inputs.dataset_initializer()
       features, _ = inputs.features_and_labels()
       signals = inputs.signals()
 
@@ -151,8 +150,7 @@ class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
       self.assertEqual(batch_size, features['a'].shape.as_list()[0])
 
       with session.Session() as sess:
-        hook.begin()
-        hook.after_create_session(sess, coord=None)
+        sess.run(dataset_initializer)
 
         result, evaluated_signals = sess.run([features, signals])
         self.assertAllEqual(a[:batch_size], result['a'])
@@ -187,7 +185,7 @@ class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
       dataset = input_fn(params)
       inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
                                                         add_padding=True)
-      hook = inputs.dataset_initializer_hook()
+      dataset_initializer = inputs.dataset_initializer()
       features, labels = inputs.features_and_labels()
       signals = inputs.signals()
 
@@ -195,8 +193,7 @@ class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
       self.assertEqual(batch_size, features['a'].shape.as_list()[0])
 
       with session.Session() as sess:
-        hook.begin()
-        hook.after_create_session(sess, coord=None)
+        sess.run(dataset_initializer)
 
         evaluated_features, evaluated_labels, evaluated_signals = (
             sess.run([features, labels, signals]))
@@ -255,7 +252,7 @@ class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
       dataset = input_fn(params)
       inputs = tpu_estimator._InputsWithStoppingSignals(dataset, batch_size,
                                                         add_padding=True)
-      hook = inputs.dataset_initializer_hook()
+      dataset_initializer = inputs.dataset_initializer()
       features, _ = inputs.features_and_labels()
       signals = inputs.signals()
 
@@ -264,8 +261,7 @@ class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
               features, signals))
 
       with session.Session() as sess:
-        hook.begin()
-        hook.after_create_session(sess, coord=None)
+        sess.run(dataset_initializer)
 
         result, evaluated_signals = sess.run([sliced_features, signals])
         self.assertAllEqual(a[:batch_size], result['a'])
@@ -297,7 +293,7 @@ class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
       dataset = input_fn(params)
       inputs = tpu_estimator._InputsWithStoppingSignals(
           dataset, batch_size, add_padding=True, num_invocations_per_step=2)
-      hook = inputs.dataset_initializer_hook()
+      dataset_initializer = inputs.dataset_initializer()
       features, _ = inputs.features_and_labels()
       signals = inputs.signals()
 
@@ -305,8 +301,7 @@ class TPUEstimatorStoppingSignalsWithPaddingTest(test.TestCase):
           tpu_estimator._PaddingSignals.slice_tensor_or_dict(features, signals))
 
       with session.Session() as sess:
-        hook.begin()
-        hook.after_create_session(sess, coord=None)
+        sess.run(dataset_initializer)
 
         result, evaluated_signals = sess.run([sliced_features, signals])
         self.assertAllEqual(a[:batch_size], result['a'])
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index e75a09492ec12b95bad32b221a8e78a1b79f3a6b..d5957b7e8ec40b40c7af8822378cee6134ef0d0f 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -26,7 +26,6 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding
-from tensorflow.compiler.xla.python_api import xla_shape
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_sharding
@@ -92,8 +91,7 @@ class InfeedQueue(object):
       else:
         raise ValueError(
             "number of tuple elements cannot be inferred from InfeedQueue "
-            "constructor"
-        )
+            "constructor")
     if number_of_tuple_elements <= 0:
       raise ValueError("number_of_tuple_elements %d must be > 0" %
                        number_of_tuple_elements)
@@ -293,9 +291,8 @@ class InfeedQueue(object):
         self.number_of_tuple_elements
     """
     if len(input_tensors) != self.number_of_tuple_elements:
-      raise ValueError(
-          "input_tensors is %s, but should be a list of %d Tensors", (
-              str(input_tensors), self.number_of_tuple_elements))
+      raise ValueError("input_tensors is %s, but should be a list of %d Tensors"
+                       % (str(input_tensors), self.number_of_tuple_elements))
     self.set_tuple_shapes([t.shape for t in input_tensors])
     self.set_tuple_types([t.dtype for t in input_tensors])
 
@@ -451,8 +448,8 @@ class InfeedQueue(object):
       for i in xrange(1, self.number_of_tuple_elements):
         if devices[0] != devices[i]:
           raise ValueError(
-              "input devices for shard %d are %s, but should all be the same",
-              index, str(devices))
+              "input devices for shard %d are %s, but should all be the same" %
+              (index, str(devices)))
       with ops.colocate_with(inputs[0]):
         return tpu_ops.infeed_enqueue_tuple(
             inputs=inputs,
@@ -792,18 +789,14 @@ class _PartitionedInfeedQueue(InfeedQueue):
 
     Args:
       tensor: Input tensor for partitioning.
-      dims: A list of integer describes how to partition the input tensor.
+      dims: 1-D np.array of the list of integer describes how to partition the
+        input tensor.
 
     Raises:
       ValueError: If the tensor can't be partitioned by dims or the
         num_cores_per_replica doesn't match the number of
         partitions(dims.prod()).
     """
-    if dims is None:
-      return
-
-    dims = np.array(dims)
-
     if (dims < 1).any():
       raise ValueError("All input partition dims must be >= 1.")
 
@@ -823,11 +816,6 @@ class _PartitionedInfeedQueue(InfeedQueue):
           "partition dims = {}).".format(tensor.shape.as_list(), dims))
 
     tensor.shape.assert_is_fully_defined()
-    if (np.array(tensor.shape.as_list()) % dims != 0).any():
-      raise ValueError(
-          "All input partition dims must divide exactly into the `Tensor` "
-          "shape (tensor shape = {}, input partition dims = {}).".format(
-              tensor.shape.as_list(), dims))
 
   def _partition_or_replicate_on_host(self, tensor, dims):
     """Partitions or replicates the input tensor.
@@ -840,16 +828,39 @@ class _PartitionedInfeedQueue(InfeedQueue):
     Returns:
       An iterator of `Tensor`s or a list of partioned tensors.
     """
-    self._check_input_partition_dims(tensor, dims)
     if dims is None:
       return itertools.repeat(tensor)
-    else:
-      output = [tensor]
-      for axis, dim in enumerate(dims):
-        if dim > 1:
-          output = [array_ops.split(x, dim, axis=axis) for x in output]
-          output = nest.flatten(output)
-      return output
+    dims = np.array(dims)
+    self._check_input_partition_dims(tensor, dims)
+    output = [tensor]
+    shape_list = np.array(tensor.shape.as_list())
+    quotients, remainders = np.divmod(shape_list, dims)
+    for axis, (quotient, remainder, dim, original_size) in enumerate(
+        zip(quotients, remainders, dims, shape_list)):
+      if dim <= 1:
+        continue
+      if remainder > 0:
+        # For each dimension, when it cannot be evenly partitioned, XLA assumes
+        # tensors are partitioned in a greedy manner by using
+        # ceil_ratio(size/dim) first. E.g. 2D tensor with shape (5, 14) and dims
+        # are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14] =>
+        # [[(3, 4), (3, 4), (2, 4), (2, 2)],
+        # [(2, 4), (2, 4), (2, 4), (2, 2)]]
+        ceil_ratio = quotient + 1
+        num_full_slots, left_over = np.divmod(original_size, ceil_ratio)
+        num_or_size_splits = [ceil_ratio] * num_full_slots + [left_over]
+        if len(num_or_size_splits) < dim:
+          num_or_size_splits += [0] * (dim - len(num_or_size_splits))
+        new_output = []
+        for x in output:
+          new_output.append(
+              array_ops.split(
+                  x, num_or_size_splits=num_or_size_splits, axis=axis))
+        output = new_output
+      else:
+        output = [array_ops.split(x, dim, axis=axis) for x in output]
+      output = nest.flatten(output)
+    return output
 
   def _tag_sharding_attribute_for_dequeued_tensor(self, tensor, dims):
     """Tags appropriate XLA sharding attribute to the dequeued tensor.
@@ -866,13 +877,9 @@ class _PartitionedInfeedQueue(InfeedQueue):
     elif np.prod(dims) == 1:
       return xla_sharding.assign_device(tensor, 0)
     else:
-      tile_shape = np.array(tensor.shape.as_list()) // dims
       tile_assignment = np.arange(np.prod(dims)).reshape(dims)
       return xla_sharding.tile(
           tensor=tensor,
-          tile_shape=xla_shape.CreateShapeFromDtypeAndTuple(
-              dtype=np.dtype(tensor.dtype.as_numpy_dtype),
-              shape_tuple=tile_shape),
           tile_assignment=tile_assignment)
 
   def _tag_sharding_attribute_for_dequeued_tensors(self, dequeues, dims):
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_function.py b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
index 0c7a38dbbbdb08a57ee3a25e07c53df09479c831..84d5967ea547f0c036f7c9aa936ac0c99c141304 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_function.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 
 import contextlib
 
-from tensorflow.python.util import tf_inspect
-
 
 class TpuContext(object):
   """A context object holding state about the TPU computation being built."""
@@ -57,48 +55,3 @@ def tpu_shard_context(number_of_shards):
 
 def get_tpu_context():
   return _current_tpu_context
-
-
-def check_function_argument_count(func, input_arity, infeed_queue):
-  """Validate the number of input arguments to a tpu function.
-
-  Args:
-    func: the Python function that will be called to generate the body of an XLA
-      computation graph.
-    input_arity: the number of explicit arguments supplied by the caller.
-    infeed_queue: if not None, the infeed queue that will supply
-      additional arguments to the function.
-
-  Returns:
-    None if function can be called with the supplied number of
-      arguments, or an error string if it cannot.
-  """
-  def format_error(complaint, quantity):
-    return "%s %d argument%s" % (complaint, quantity, ""
-                                 if quantity == 1 else "s")
-
-  number_of_arguments_needed = input_arity
-  if infeed_queue is not None:
-    number_of_arguments_needed += infeed_queue.number_of_tuple_elements
-  arg_spec = tf_inspect.getargspec(func)
-  number_of_args = len(arg_spec.args)
-  if arg_spec.defaults is None:
-    number_of_defaults = 0
-  else:
-    number_of_defaults = len(arg_spec.defaults)
-  min_required_arguments = number_of_args - number_of_defaults
-  if number_of_arguments_needed < min_required_arguments:
-    # The required number of arguments is not enough to call the function.
-    if number_of_defaults == 0 and arg_spec.varargs is None:
-      return format_error("exactly", number_of_args)
-    else:
-      return format_error("at least", min_required_arguments)
-  if arg_spec.varargs is None and number_of_arguments_needed > number_of_args:
-    # The required number of arguments is too many to call the function.
-    if number_of_defaults == 0:
-      return format_error("exactly", number_of_args)
-    else:
-      return format_error("at most", number_of_args)
-  # Since there are varargs, func can accept any number of arguments
-  # greater than the minimum.
-  return None
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_function_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_function_test.py
deleted file mode 100644
index 463c249a95c8a07745b6603636f8f799384f2845..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tpu/python/tpu/tpu_function_test.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Tests for tpu_function helpers."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-
-from tensorflow.python.platform import test
-
-
-class FunctionArgCheckTest(test.TestCase):
-
-  def testSimple(self):
-    """Tests that arg checker works for functions with no varargs or defaults.
-    """
-
-    def func(x, y, z):
-      return x + y + z
-
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 3, None))
-    self.assertEqual("exactly 3 arguments",
-                     tpu_function.check_function_argument_count(func, 2, None))
-    queue = tpu_feed.InfeedQueue(2)
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 1, queue))
-    self.assertEqual("exactly 3 arguments",
-                     tpu_function.check_function_argument_count(func, 2, queue))
-
-  def testDefaultArgs(self):
-    """Tests that arg checker works for a function with no varargs."""
-
-    def func(x, y, z=17):
-      return x + y + z
-
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 3, None))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 2, None))
-    self.assertEqual("at least 2 arguments",
-                     tpu_function.check_function_argument_count(func, 1, None))
-    self.assertEqual("at most 3 arguments",
-                     tpu_function.check_function_argument_count(func, 4, None))
-    queue = tpu_feed.InfeedQueue(1)
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 2, queue))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 1, queue))
-    self.assertEqual("at least 2 arguments",
-                     tpu_function.check_function_argument_count(func, 0, queue))
-    self.assertEqual("at most 3 arguments",
-                     tpu_function.check_function_argument_count(func, 4, queue))
-
-  def testVarArgs(self):
-    """Tests that arg checker works for a function with varargs."""
-
-    def func(x, y, *z):
-      return x + y + len(z)
-
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 2, None))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 3, None))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 4, None))
-    self.assertEqual("at least 2 arguments",
-                     tpu_function.check_function_argument_count(func, 1, None))
-    queue = tpu_feed.InfeedQueue(1)
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 1, queue))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 2, queue))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 3, queue))
-    self.assertEqual("at least 2 arguments",
-                     tpu_function.check_function_argument_count(func, 0, queue))
-
-  def testVarArgsAndDefaults(self):
-    """Tests that arg checker works for a function with varargs and defaults."""
-
-    def func(x, y, z=17, *q):
-      return x + y + z + len(q)
-
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 2, None))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 3, None))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 4, None))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 5, None))
-    self.assertEqual("at least 2 arguments",
-                     tpu_function.check_function_argument_count(func, 1, None))
-    queue = tpu_feed.InfeedQueue(1)
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 1, queue))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 2, queue))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 3, queue))
-    self.assertEqual(None,
-                     tpu_function.check_function_argument_count(func, 4, queue))
-    self.assertEqual("at least 2 arguments",
-                     tpu_function.check_function_argument_count(func, 0, queue))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index ec682e5829c4df536a043334b74200f0b6259df3..d66ecfcf4a56b8da1c2d2f518bebe4baa76b315e 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -52,6 +52,7 @@ def _query_tpu_system_metadata(master_address, cluster_def=None,
   devices = []
   device_dict = collections.defaultdict(list)
 
+  # TODO(b/120564445): Replace with standard library for retries.
   retry_count = 1
   while True:
     logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
diff --git a/tensorflow/contrib/tpu/python/tpu/training_loop.py b/tensorflow/contrib/tpu/python/tpu/training_loop.py
index 10a8bccf3b23add75188e16eb3591c32eb8621ee..0187b4bec6ecc55943bf48b9268a74e18ea5b488 100644
--- a/tensorflow/contrib/tpu/python/tpu/training_loop.py
+++ b/tensorflow/contrib/tpu/python/tpu/training_loop.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.compiler import xla
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 
 from tensorflow.python.framework import ops
@@ -59,7 +60,7 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
   input_types = [x.dtype for x in inputs]
   input_arity = len(inputs)
 
-  body_arg_error = tpu_function.check_function_argument_count(
+  body_arg_error = xla.check_function_argument_count(
       body, input_arity, infeed_queue)
   if body_arg_error is not None:
     if infeed_queue is None:
@@ -74,7 +75,7 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
           "infeed, but the computation needs %s" % (input_arity, str(
               [i.name for i in inputs]), infeed_queue.number_of_tuple_elements,
                                                     body_arg_error))
-  condition_arg_error = tpu_function.check_function_argument_count(
+  condition_arg_error = xla.check_function_argument_count(
       condition, input_arity, None)
   if condition_arg_error is not None:
     if infeed_queue is None:
@@ -165,8 +166,8 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
   # control dependencies from any side-effecting operations.
   if input_arity == 0:
     inputs = [array_ops.constant(0)]
-  return control_flow_ops.while_loop(condition_wrapper, body_wrapper, inputs,
-                                     name="")
+  return control_flow_ops.while_loop(
+      condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
 
 
 def repeat(n, body, inputs=None, infeed_queue=None, name=None):
diff --git a/tensorflow/contrib/tpu/python/tpu/util.py b/tensorflow/contrib/tpu/python/tpu/util.py
index b8ea307d8900cf1b6d1e6e808d0b9ede26f86490..dfb8ce1d1821da05c853bb0d10b1db3a857ccb1b 100644
--- a/tensorflow/contrib/tpu/python/tpu/util.py
+++ b/tensorflow/contrib/tpu/python/tpu/util.py
@@ -19,8 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import time
 import six
 
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import training
 
 def check_positive_integer(value, name):
   """Checks whether `value` is a positive integer."""
@@ -29,3 +32,20 @@ def check_positive_integer(value, name):
 
   if value <= 0:
     raise ValueError('{} must be positive, got {}'.format(name, value))
+
+
+# TODO(b/118302029) Remove this copy of MultiHostDatasetInitializerHook after we
+# release a tensorflow_estimator with MultiHostDatasetInitializerHook in
+# python/estimator/util.py.
+class MultiHostDatasetInitializerHook(training.SessionRunHook):
+  """Creates a SessionRunHook that initializes all passed iterators."""
+
+  def __init__(self, dataset_initializers):
+    self._initializers = dataset_initializers
+
+  def after_create_session(self, session, coord):
+    del coord
+    start = time.time()
+    session.run(self._initializers)
+    logging.info('Initialized dataset iterators in %d seconds',
+                 time.time() - start)
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
index b6514e19dc92fe4c7cdcdb6582a7c0ad5ad573d5..552febd80bd35b37a95cdaaf8d5923278311ac8e 100644
--- a/tensorflow/contrib/tpu/tpu_estimator.md
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -89,12 +89,9 @@ handle training:
 
         dataset = tf.data.TFRecordDataset(
             filename, buffer_size=FLAGS.dataset_reader_buffer_size)
-        dataset = dataset.map(parser).cache().repeat().batch(batch_size)
-        images, labels = dataset.make_one_shot_iterator().get_next()
-        # set_shape to give inputs statically known shapes.
-        images.set_shape([batch_size, 28 * 28])
-        labels.set_shape([batch_size])
-        return images, labels
+        dataset = dataset.map(parser).cache().repeat().batch(
+            batch_size, drop_remainder=True)
+        return dataset
       return input_fn
 
 
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 00295f57f60858db5234ce28cc643ea9eee44daa..f6427ae05a20f253edf030eff0f860361616042b 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -26,7 +26,6 @@ py_library(
         "python/training/resample.py",
         "python/training/sampling_ops.py",
         "python/training/sequence_queueing_state_saver.py",
-        "python/training/tensor_queue_dataset.py",
         "python/training/training.py",
         "python/training/tuner.py",
     ],
@@ -287,28 +286,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "tensor_queue_dataset_test",
-    size = "large",
-    srcs = ["python/training/tensor_queue_dataset_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
-        ":training_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data",
-        "//tensorflow/python/data/experimental/kernel_tests/serialization:dataset_serialization_test_base",
-        "//third_party/py/numpy",
-    ],
-)
-
 tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py
index 3547e71184ec2b99163ea4247c01d24487811b47..87ce57ef060a0eb9383248255713421c14988416 100644
--- a/tensorflow/contrib/training/__init__.py
+++ b/tensorflow/contrib/training/__init__.py
@@ -59,8 +59,6 @@ from tensorflow.contrib.training.python.training.hparam import *
 from tensorflow.contrib.training.python.training.resample import *
 from tensorflow.contrib.training.python.training.sampling_ops import *
 from tensorflow.contrib.training.python.training.sequence_queueing_state_saver import *
-from tensorflow.contrib.training.python.training.tensor_queue_dataset import enqueue_in_queue_dataset
-from tensorflow.contrib.training.python.training.tensor_queue_dataset import prepend_from_queue_and_padded_batch_dataset
 from tensorflow.contrib.training.python.training.training import add_gradients_summaries
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms_fn
@@ -79,7 +77,6 @@ _allowed_symbols = [
     'FeedingQueueRunner', 'get_or_create_eval_step', 'StopAfterNEvalsHook',
     'SummaryAtEndHook', 'wait_for_new_checkpoint', 'add_gradients_summaries',
     'clip_gradient_norms', 'clip_gradient_norms_fn', 'create_train_op',
-    'multiply_gradients', 'enqueue_in_queue_dataset',
-    'prepend_from_queue_and_padded_batch_dataset', 'train']
+    'multiply_gradients', 'train']
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/training/python/training/sampling_ops.py b/tensorflow/contrib/training/python/training/sampling_ops.py
index 7140f2a46d57f0f3b76ff4f1ea9d0d73808405c8..849b77d60954dc91726e261a0523943d704e5d21 100644
--- a/tensorflow/contrib/training/python/training/sampling_ops.py
+++ b/tensorflow/contrib/training/python/training/sampling_ops.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -300,10 +301,11 @@ def _verify_data_inputs(tensor_list):
   """Verify that batched data inputs are well-formed."""
   for tensor in tensor_list:
     # Data tensor should have a batch dimension.
-    tensor_shape = tensor.get_shape().with_rank_at_least(1)
+    shape = tensor.get_shape().with_rank_at_least(1)
 
     # Data batch dimensions must be compatible.
-    tensor_shape[0].assert_is_compatible_with(tensor_list[0].get_shape()[0])
+    tensor_shape.dimension_at_index(shape, 0).assert_is_compatible_with(
+        tensor_list[0].get_shape()[0])
 
   return tensor_list
 
@@ -340,10 +342,11 @@ def _verify_input(tensor_list, labels, probs_list):
 
   for tensor in tensor_list:
     # Data tensor should have a batch dimension.
-    tensor_shape = tensor.get_shape().with_rank_at_least(1)
+    shape = tensor.get_shape().with_rank_at_least(1)
 
     # Data and label batch dimensions must be compatible.
-    tensor_shape[0].assert_is_compatible_with(labels.get_shape()[0])
+    tensor_shape.dimension_at_index(shape, 0).assert_is_compatible_with(
+        labels.get_shape()[0])
 
   # Data and labels must have the same, strictly positive batch size. Since we
   # can't assume we know the batch size at graph creation, add runtime checks.
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
deleted file mode 100644
index 8896a95327a4cb609a9a78412afa68b316a3131e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Python wrappers for Datasets and Iterators."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import convert
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.util import nest as tf_nest
-
-
-class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that prepends a queue to another `Dataset`.
-
-  A vector of handles to the queue is returned as the first component of
-  the associated iterator.  This vector can be passed to
-  `enqueue_in_queue_dataset` to add new elements to the queue.
-  """
-
-  def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
-    """Initialize `PrependFromQueueAndPaddedBatchDataset`."""
-    super(_PrependFromQueueAndPaddedBatchDataset, self).__init__(input_dataset)
-    if sparse.any_sparse(input_dataset.output_classes):
-      raise TypeError(
-          "Batching of padded sparse tensors is not currently supported")
-    self._input_dataset = input_dataset
-    self._batch_size = ops.convert_to_tensor(
-        batch_size, dtype=dtypes.int64, name="batch_size")
-    if padded_shapes is None:
-      self._padded_shapes = nest.map_structure(
-          convert.partial_shape_to_tensor, input_dataset.output_shapes)
-    else:
-      self._padded_shapes = nest.map_structure_up_to(
-          input_dataset.output_shapes, convert.partial_shape_to_tensor,
-          padded_shapes)
-    # pylint: disable=protected-access
-    padding_values = (
-        padding_values if padding_values is not None else
-        dataset_ops._default_padding(input_dataset))
-    self._padding_values = nest.map_structure_up_to(
-        input_dataset.output_shapes, dataset_ops._padding_value_to_tensor,
-        padding_values, input_dataset.output_types)
-    # pylint: enable=protected-access
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return gen_dataset_ops.prepend_from_queue_and_padded_batch_dataset(
-        self._input_dataset._as_variant_tensor(),
-        batch_size=self._batch_size,
-        padded_shapes=[
-            ops.convert_to_tensor(s, dtype=dtypes.int64)
-            for s in nest.flatten(self._padded_shapes)
-        ],
-        padding_values=nest.flatten(self._padding_values),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-    # pylint: enable=protected-access
-
-  @property
-  def output_classes(self):
-    return (ops.Tensor, self._input_dataset.output_classes)
-
-  def _as_batch_shape(self, shape_like):
-    return tensor_shape.vector(None).concatenate(
-        tensor_util.constant_value_as_shape(shape_like))
-
-  @property
-  def output_shapes(self):
-    # First output is a variant representing the Queue
-    return (tensor_shape.vector(None),
-            nest.map_structure(self._as_batch_shape, self._padded_shapes))
-
-  @property
-  def output_types(self):
-    # First output is a variant representing the Queue
-    return (dtypes.variant, self._input_dataset.output_types)
-
-
-def prepend_from_queue_and_padded_batch_dataset(batch_size,
-                                                padding_values=None,
-                                                padded_shapes=None):
-  """A transformation that prepends a queue to a `Dataset` and batches results.
-
-  A vector of handles to the queue is returned as the first component of the
-  associated iterator.  This vector can be passed to `enqueue_in_queue_dataset`
-  to add new elements to the queue.
-
-  Below is an example of how this dataset might be used to split incoming
-  variable-length sequences into "head" and "rest" parts, where "rest" parts
-  are re-enqueued back into the dataset.  A more realistic example would
-  perform some calculation on the "head" and modify some components of "rest"
-  with the result (before re-enqueueing).
-
-  ```python
-  dataset = tf.data.Dataset.from_tensor_slices([2*x for x in range(10)])
-  # Make a dataset of variable-length vectors and their lengths.
-  dataset = dataset.map(lambda count: (count, tf.ones((count,))))
-  # Emit a queue we can prepend to, and counts/values as padded batch.
-  dataset = dataset.apply(
-      tf.contrib.training.prepend_from_queue_and_padded_batch_dataset(
-        batch_size=10))
-  dataset = dataset.prefetch(1)
-
-  iterator = dataset.make_one_shot_iterator()
-  queue, (count, padded_value) = iterator.get_next()
-
-  # Split the padded_value into two pieces: head and rest
-  rest_indices = tf.squeeze(tf.where(count > 3), axis=1)
-  bound = tf.minimum(3, tf.reduce_max(count))
-  value_head = padded_value[:, :bound]
-  count_rest = tf.gather(count - 3, rest_indices)
-  value_rest = tf.gather(padded_value[:, bound:], rest_indices)
-  queue_rest = tf.gather(queue, rest_indices)
-  enqueue_rest_op = tf.contrib.training.enqueue_in_queue_dataset(
-    queue_rest, (count_rest, value_rest))
-  with tf.control_dependencies([enqueue_rest_op]):
-    calculation = fn(value_head)
-
-  while True:  # Will raise OutOfRange when finished with all pieces.
-    session.run(calculation)
-  ```
-
-  Args:
-    batch_size: `int64` scalar tensor.  The batch size to use when performing
-      padded batching.
-    padding_values: (optional) Nested tuple of scalar tensors.  If provided,
-      the structure and dtypes of padding_values should match that of
-      incoming dataset's `output_types`.
-    padded_shapes: (optional) Nested tuple of `int64` vector tensors.
-      If provided, the structure must match that of the incoming dataset's
-      `output_types`.  If not provided, the incoming dataset's `output_shapes`
-      is used.  Any unknown (`None` or `-1`) dimensions in the shapes are
-      treated as being unique per-batch: for each batch time, an unknown
-      dimension is replaced with the maximum given value of this dimension
-      across all tensors for the given component in the batch.
-
-  Returns:
-    A `Dataset` transformation function, which can be passed to
-    `tf.data.Dataset.apply`.
-  """
-
-  def _apply_fn(dataset):
-    return _PrependFromQueueAndPaddedBatchDataset(
-        dataset,
-        batch_size=batch_size,
-        padding_values=padding_values,
-        padded_shapes=padded_shapes)
-
-  return _apply_fn
-
-
-def enqueue_in_queue_dataset(queue, components):
-  """Enqueue components into queue from `PrependFromQueueAndPaddedBatchDataset`.
-
-  The components' dtypes and shapes must be compatible with the `output_shapes`
-  attribute of the `dataset` created by
-  `prepend_from_queue_and_padded_batch_dataset`.  This operation supports both
-  non-batched and batched modes.
-
-  For more details, see the example in the docstring for
-  `prepend_from_queue_and_padded_batch_dataset`.
-
-  Args:
-    queue: `variant` scalar or vector tensor.
-      The tensor emitted by the first component of the iterator associated with
-      `prepend_from_queue_and_padded_batch_dataset`.  If this is a scalar,
-      then the `components` input tensors should not have a prepended batch
-      dimension.
-    components: Nested tuple of tensors, each with a leading batch dimension
-      if `queue` is a vector.  The structure, dtypes, and shapes
-      (excluding batch dimension) must match the nested tuples
-      `dataset.output_types[1]` and `dataset.output_shapes[1]` (the non-queue
-      output types and shapes) of the `dataset` emitted by
-      the original `prepend_from_queue_and_padded_batch_dataset` call.
-
-  Returns:
-    An `Operation` that enqueues `components` into the dataset(s) associated
-    with entries of `queue`.
-  """
-  return gen_dataset_ops.enqueue_in_queue_dataset(
-      queue=queue, components=tf_nest.flatten(components))
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
deleted file mode 100644
index c1657fec7bbe4a3227c3ea273b72176ac4066c50..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
+++ /dev/null
@@ -1,355 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for TensorQueueDataset."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd
-from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-
-
-class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
-
-  def testNoEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    self.assertEqual((dtypes.variant, dtypes.int32), dataset.output_types)
-    self.assertAllEqual(([None],) * 2,
-                        [x.as_list() for x in dataset.output_shapes])
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    self.assertEqual([0], self.evaluate(value))
-    self.assertEqual([1], self.evaluate(value))
-    self.assertEqual([2], self.evaluate(value))
-    with self.assertRaisesOpError("End of sequence"):
-      self.evaluate(value)
-
-  def testBatchedNoEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    self.assertAllEqual([0, 1], self.evaluate(value))
-    self.assertAllEqual([2], self.evaluate(value))
-    with self.assertRaisesOpError("End of sequence"):
-      self.evaluate(value)
-
-  def testBatchedWithBiggerPaddingNoEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=2, padded_shapes=[3]))
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    self.assertAllEqual([[0, 0, 0], [1, 0, 0]], self.evaluate(value))
-    self.assertAllEqual([[2, 0, 0]], self.evaluate(value))
-    with self.assertRaisesOpError("End of sequence"):
-      self.evaluate(value)
-
-  def testBatchedWithBiggerPaddingOneEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=1, padded_shapes=[3]))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0, 0, 0]], sess.run(value))
-      value_1, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([[1, 0, 0]], value_1)
-      value_2, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([[-1, 0, 0]], value_2)
-      value_3 = sess.run(value)
-      self.assertAllEqual([[1, 0, 0]], value_3)
-      value_4, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([[2, 0, 0]], value_4)
-      value_5 = sess.run(value)
-      self.assertAllEqual([[-2, 0, 0]], value_5)
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testOneEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    with self.cached_session() as sess:
-      self.assertEqual([0], sess.run(value))
-      value_1, _ = sess.run([value, enqueue_negative])
-      self.assertEqual([1], value_1)
-      value_2, _ = sess.run([value, enqueue_negative])
-      self.assertEqual([-1], value_2)
-      value_3 = sess.run(value)
-      self.assertEqual([1], value_3)
-      value_4, _ = sess.run([value, enqueue_negative])
-      self.assertEqual([2], value_4)
-      value_5 = sess.run(value)
-      self.assertEqual([-2], value_5)
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testBatchedOneEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]],
-                                                  array_ops.expand_dims(
-                                                      value[0], axis=0))
-    with self.cached_session() as sess:
-      value_0, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([0, 1], value_0)
-      value_1, _ = sess.run([value, enqueue_zeroth])
-      self.assertAllEqual([0, -1], value_1)
-      value_2, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([0, 2], value_2)
-      self.assertAllEqual([0, -2], sess.run(value))
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testManyEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_many_more = [
-        tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i)
-        for i in range(1000)
-    ]
-    with self.cached_session() as sess:
-      value_0, _ = sess.run((value, enqueue_many_more))
-      self.assertEqual([0], value_0)
-      rest = []
-      for _ in range(1000):
-        rest.append(sess.run(value))
-      self.assertEquals([[100 + i] for i in range(1000)], sorted(rest))
-      # Going back to the original input.
-      value_1, _ = sess.run((value, enqueue_many_more))
-      self.assertEqual(1, value_1)
-      rest = []
-      for _ in range(1000):
-        rest.append(sess.run(value))
-      self.assertEquals([[100 + i + 1] for i in range(1000)], sorted(rest))
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testEnqueueWithPrefetch(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    # Prefetching will request additional values before they are
-    # available to the queue.
-    dataset = dataset.prefetch(buffer_size=3)
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1)
-    with self.cached_session() as sess:
-      i = 0
-      while i < 4:
-        received, _ = sess.run((value, enqueue))
-        if received.size > 0:
-          self.assertAllEqual([i], received)
-          i += 1
-      received_last = False
-      while True:
-        try:
-          received = sess.run(value)
-          if received.size > 0:
-            self.assertAllEqual([4], received)
-            received_last = True
-        except errors.OutOfRangeError:
-          break
-      self.assertTrue(received_last)
-
-  def testDatasetWithPaddedShapeSmallerThanInputFails(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[0, 0, 0]]).repeat(None)
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=1, padded_shapes=[2]))
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    with self.cached_session() as sess:
-      with self.assertRaisesOpError(
-          r"Incompatible input shapes at component 0 between "
-          r"input dataset this dataset: \[3\] vs. \[2\]"):
-        sess.run(value)
-
-  def testEnqueueWithIncompatibleInputsFailsWithInformativeError(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0]).repeat(None)
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-
-    enqueue_bad_structure = tqd.enqueue_in_queue_dataset(
-        queue_handle, (value, value))
-    enqueue_bad_dtype = tqd.enqueue_in_queue_dataset(queue_handle,
-                                                     np.array(
-                                                         [1.0],
-                                                         dtype=np.float32))
-    enqueue_bad_shape_no_batch_dim = tqd.enqueue_in_queue_dataset(
-        queue_handle, ([1],))
-    enqueue_bad_shape = tqd.enqueue_in_queue_dataset(queue_handle,
-                                                     np.array(
-                                                         [[1]], dtype=np.int32))
-
-    with self.cached_session() as sess:
-      with self.assertRaisesOpError(
-          "mismatched number of tensors.  Queue expects 1 tensors but "
-          "tried to insert 2"):
-        sess.run(enqueue_bad_structure)
-      with self.assertRaisesOpError(r"Expected component 0 to have batched "
-                                    r"shape \[1,...\], but saw shape: \[\]"):
-        sess.run(enqueue_bad_shape_no_batch_dim)
-      with self.assertRaisesOpError(
-          r"mismatched shapes at component 0.  Attempted to insert tensor "
-          r"with shape \[1\] but queue expected shape: \[\]"):
-        sess.run(enqueue_bad_shape)
-      with self.assertRaisesOpError(
-          r"mismatched dtypes at component 0.  Attempted to insert tensor "
-          r"of type float but queue expected type: int32"):
-        sess.run(enqueue_bad_dtype)
-
-  def testEnqueueWithPaddedBatchFailsWithInformativeError(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    with self.assertRaisesRegexp(
-        TypeError, r"Unable to create padding for field of type 'variant'"):
-      dataset.padded_batch(batch_size=10, padded_shapes=[1])
-
-  def testOneEnqueueWithPadding(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
-    # Make a dataset of variable-length vectors and their lengths.
-    dataset = dataset.map(
-        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
-    # Emit a queue we can prepend to, and counts/values as padded
-    # batch.
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3))
-
-    iterator = dataset.make_one_shot_iterator()
-    queue, (count, padded_value) = iterator.get_next()
-
-    # Split the padded_value into two pieces: head and rest
-    rest_indices = array_ops.squeeze(array_ops.where(count > 2), axis=1)
-    bound = math_ops.minimum(2, math_ops.reduce_max(count))
-    value_head = padded_value[:, :bound]
-    count_rest = array_ops.gather(count - 2, rest_indices)
-    value_rest = array_ops.gather(padded_value, rest_indices)[:, bound:]
-    queue_rest = array_ops.gather(queue, rest_indices)
-    enqueue_rest_op = tqd.enqueue_in_queue_dataset(queue_rest,
-                                                   (count_rest, value_rest))
-    with ops.control_dependencies([enqueue_rest_op]):
-      calc = array_ops.identity(value_head)
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc))
-      self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc))
-      self.assertAllEqual([[6, 6]], sess.run(calc))
-      self.assertAllEqual([[6, 6]], sess.run(calc))
-      # Get some final batches due to prefetching.
-      for _ in range(3):
-        try:
-          self.assertAllEqual(
-              np.empty(shape=(0, 0), dtype=np.int32), sess.run(calc))
-        except errors.OutOfRangeError as e:
-          self.assertTrue(str(e).startswith("End of sequence"))
-
-  def testNonstandardPadding(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
-    # Make a dataset of variable-length vectors and their lengths.
-    dataset = dataset.map(
-        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
-    # Emit a queue we can prepend to, and counts/values as padded
-    # batch.
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=3, padding_values=(
-                0,
-                -1,
-            )))
-
-    iterator = dataset.make_one_shot_iterator()
-    _, (unused_count, padded_value) = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]],
-                          sess.run(padded_value))
-      self.assertAllEqual([[6] * 6], sess.run(padded_value))
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(padded_value)
-
-
-# TODO(ebrevdo): Figure out how to use run_core_tests to test state
-# saving of an iterator that's had some tensors enqueued into its queue.
-class PrependFromQueueAndPaddedBatchDatasetSerializationTest(
-    dataset_serialization_test_base.DatasetSerializationTestBase):
-
-  def testPrependFromQueueAndPaddedBatch(self):
-
-    def build_dataset(seq_lens):
-      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
-          lambda x: array_ops.fill([x], x)).apply(
-              tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=4))
-
-    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
-    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
-    self.run_core_tests(lambda: build_dataset(seq_lens1),
-                        lambda: build_dataset(seq_lens2), 8)
-
-  def testPrependFromQueueAndPaddedBatchNonDefaultPadding(self):
-
-    def build_dataset(seq_lens):
-
-      def fill_tuple(x):
-        filled = array_ops.fill([x], x)
-        return (filled, string_ops.as_string(filled))
-
-      padded_shape = [-1]
-      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
-          fill_tuple).apply(
-              tqd.prepend_from_queue_and_padded_batch_dataset(
-                  batch_size=4,
-                  padded_shapes=(padded_shape, padded_shape),
-                  padding_values=(-1, "<end>")))
-
-    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
-    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
-    self.run_core_tests(lambda: build_dataset(seq_lens1),
-                        lambda: build_dataset(seq_lens2), 8)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/training/python/training/tuner.py b/tensorflow/contrib/training/python/training/tuner.py
index 8843632619f0881f888ca76c9de484f081786b19..ad647a61da7adb549ec21a940bf8682b722353b5 100644
--- a/tensorflow/contrib/training/python/training/tuner.py
+++ b/tensorflow/contrib/training/python/training/tuner.py
@@ -21,9 +21,12 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 from tensorflow.contrib.framework.python.framework import experimental
 
 
+@six.add_metaclass(abc.ABCMeta)
 class Tuner(object):
   """Tuner class is the interface for Experiment hyper-parameters tuning.
 
@@ -42,8 +45,6 @@ class Tuner(object):
     learn_runner.tune(experiment_fn=_create_my_experiment, tuner)
   """
 
-  __metaclass__ = abc.ABCMeta
-
   @experimental
   @abc.abstractmethod
   def next_trial(self):
diff --git a/tensorflow/contrib/util/__init__.py b/tensorflow/contrib/util/__init__.py
index 338acef63f244613cbd14a2da04c7ec4d811a0af..acc5a049aa87649e4f8bf3a00be605616ea7b630 100644
--- a/tensorflow/contrib/util/__init__.py
+++ b/tensorflow/contrib/util/__init__.py
@@ -15,8 +15,6 @@
 
 """Utilities for dealing with Tensors.
 
-See [Contrib Util](https://tensorflow.org/api_guides/python/contrib.util) guide.
-
 @@constant_value
 @@make_tensor_proto
 @@make_ndarray
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index f7c979e86320d59ad033e2b8d7fcdff89ce0d133..9db80f6b5736d849d88e1e41ea467a5ff11844f5 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
-#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -1028,7 +1027,10 @@ Status RdmaTensorResponse::PrepareRecvTensor(
     return errors::Aborted(
         "RecvTensor expects a different device incarnation: ",
         parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
-        ". Your worker job was probably restarted. Check your "
+        ". Your worker job (\"",
+        channel_->adapter_->worker_env_->session_mgr->LegacySession()
+            ->worker_name,
+        "\") was probably restarted. Check your "
         "worker job for the reason why it was restarted.");
   }
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 6a3ee3c1cbcb549e22ab62c71f15ab44f92183ff..66714235b535c14a8f13c40bb2a4df8d7494dc05 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -73,12 +73,10 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "cc_header_only_library",
-    "full_path",
     "if_android",
     "if_ios",
     "if_linux_x86_64",
     "if_mobile",
-    "if_not_mobile",
     "if_not_windows",
     "if_windows",
     "tf_cc_test",
@@ -97,6 +95,8 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu")
 load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_version_info_genrule")
+load("//tensorflow:tensorflow.bzl", "if_nccl")
+load("//tensorflow:tensorflow.bzl", "tensorflow_opensource_extra_deps")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test")
 
 # For platform specific build config
@@ -110,8 +110,10 @@ load(
     "tf_additional_device_tracer_cuda_deps",
     "tf_additional_device_tracer_deps",
     "tf_additional_device_tracer_srcs",
+    "tf_additional_device_tracer_test_flags",
     "tf_additional_gdr_lib_defines",
     "tf_additional_human_readable_json_deps",
+    "tf_additional_logger_deps",
     "tf_additional_lib_defines",
     "tf_additional_lib_deps",
     "tf_additional_lib_hdrs",
@@ -131,7 +133,6 @@ load(
     "tf_kernel_tests_linkstatic",
     "tf_lib_proto_compiler_deps",
     "tf_lib_proto_parsing_deps",
-    "tf_nano_proto_library",
     "tf_platform_hdrs",
     "tf_platform_srcs",
     "tf_proto_library",
@@ -239,7 +240,6 @@ tf_proto_library(
     srcs = [],
     cc_api_version = 2,
     default_header = True,
-    js_api_version = 2,
     protodeps = [
         ":protos_all_proto",
         ":error_codes_proto",
@@ -253,15 +253,6 @@ tf_jspb_proto_library(
     deps = [":protos_all_cc"],
 )
 
-tf_nano_proto_library(
-    name = "protos_all_nano_proto",
-    field_style = "accessors",
-    generate_equals = 1,
-    generate_intdefs = 1,
-    visibility = ["//visibility:public"],
-    deps = [":protos_all_cc"],
-)
-
 proto_library(
     name = "example_protos",
     srcs = [
@@ -311,6 +302,7 @@ filegroup(
         "platform/env_time.h",
         "platform/logging.h",
         "platform/macros.h",
+        "platform/platform_strings.h",
         "platform/types.h",
     ],
     visibility = ["//visibility:private"],
@@ -337,6 +329,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "framework_bounds_check",
+    hdrs = ["framework/bounds_check.h"],
+    visibility = ["//tensorflow/core/kernels:friends"],
+    deps = [
+        "//tensorflow/core:platform_base",
+        "//third_party/eigen3",
+    ],
+)
+
 filegroup(
     name = "platform_port_hdrs",
     srcs = [
@@ -443,6 +445,18 @@ cc_library(
     ] + tf_additional_human_readable_json_deps(),
 )
 
+cc_library(
+    name = "logger",
+    srcs = tf_platform_srcs(["logger.cc"]),
+    hdrs = ["platform/logger.h"] + tf_platform_hdrs(["logger.h"]),
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+    ] + tf_additional_logger_deps(),
+)
+
 filegroup(
     name = "platform_env_hdrs",
     srcs = [
@@ -478,7 +492,10 @@ cc_library(
         ":platform_env_internal_hdrs",
     ],
     copts = tf_copts(),
-    visibility = ["//tensorflow/core:__subpackages__"],
+    visibility = [
+        "//tensorflow/c:__subpackages__",
+        "//tensorflow/core:__subpackages__",
+    ],
     deps = [
         ":error_codes_proto_cc",
         ":lib",
@@ -520,6 +537,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "platform_strings",
+    srcs = tf_platform_srcs([
+        "platform/platform_strings.cc",
+        "platform/platform_strings_computed.h",
+    ]),
+    hdrs = [
+        "platform/platform_strings.h",
+    ],
+    visibility = ["//tensorflow/core:__subpackages__"],
+    deps = [":lib"],
+)
+
 filegroup(
     name = "platform_other_hdrs",
     srcs = [
@@ -648,7 +678,6 @@ cc_library(
         "lib/core/arena.h",
         "lib/core/bitmap.h",
         "lib/core/bits.h",
-        "lib/core/casts.h",
         "lib/core/coding.h",
         "lib/core/errors.h",
         "lib/core/notification.h",
@@ -787,10 +816,13 @@ cc_library(
     }),
     visibility = ["//visibility:public"],
     deps = [
+        ":function_ops_op_lib",
+        ":functional_ops_op_lib",
         ":lib",
         ":lib_internal",
         ":protos_all_cc",
         "//tensorflow/core/platform/default/build_config:gtest",
+        "//tensorflow/core/kernels:required",
     ] + tf_additional_test_deps(),
 )
 
@@ -824,6 +856,7 @@ tf_cuda_library(
     hdrs = [
         "example/feature_util.h",
         "framework/allocator.h",
+        "framework/bounds_check.h",
         "framework/variant.h",
         "framework/variant_encode_decode.h",
         "framework/variant_op_registry.h",
@@ -839,6 +872,7 @@ tf_cuda_library(
         "framework/dataset_stateful_op_whitelist.h",
         "framework/device_base.h",
         "framework/function.h",
+        "framework/function_handle_cache.h",
         "framework/graph_def_util.h",
         "framework/graph_to_functiondef.h",
         "framework/kernel_def_builder.h",
@@ -854,6 +888,7 @@ tf_cuda_library(
         "framework/op_def_builder.h",
         "framework/op_def_util.h",
         "framework/op_kernel.h",
+        "framework/ops_util.h",
         "framework/partial_tensor_shape.h",
         "framework/queue_interface.h",
         "framework/reader_interface.h",
@@ -881,6 +916,7 @@ tf_cuda_library(
         "util/bcast.h",
         "util/cuda_kernel_helper.h",
         "util/device_name_utils.h",
+        "util/dump_graph.h",
         "util/events_writer.h",
         "util/example_proto_fast_parsing.h",
         "util/example_proto_helper.h",
@@ -898,6 +934,7 @@ tf_cuda_library(
         "util/stream_executor_util.h",
         "util/strided_slice_op.h",
         "util/tensor_format.h",
+        "util/tensor_ops_util.h",
         "util/tensor_slice_reader.h",
         "util/tensor_slice_reader_cache.h",
         "util/tensor_slice_writer.h",
@@ -1035,6 +1072,7 @@ tf_gen_op_libs(
         "batch_ops",
         "bitwise_ops",
         "boosted_trees_ops",
+        "tensor_forest_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
         "collective_ops",
@@ -1055,6 +1093,8 @@ tf_gen_op_libs(
         "logging_ops",
         "manip_ops",
         "math_ops",
+        "mkl_nn_ops",
+        "nccl_ops",
         "nn_ops",
         "no_op",
         "parsing_ops",
@@ -1080,7 +1120,11 @@ tf_gen_op_libs(
     op_lib_names = [
         "string_ops",
     ],
-    deps = ["@com_google_absl//absl/strings"],
+    deps = [
+        ":lib_internal",
+        ":lib_proto_parsing",
+        "@com_google_absl//absl/strings",
+    ],
 )
 
 tf_gen_op_libs(
@@ -1156,6 +1200,23 @@ tf_gen_op_libs(
     ],
 )
 
+cc_library(
+    name = "ragged_ops",
+    deps = [
+        ":ragged_array_ops_op_lib",
+        ":ragged_conversion_ops_op_lib",
+        ":ragged_math_ops_op_lib",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "ragged_array_ops",
+        "ragged_conversion_ops",
+        "ragged_math_ops",
+    ],
+)
+
 cc_library(
     name = "ops",
     visibility = ["//visibility:public"],
@@ -1165,6 +1226,7 @@ cc_library(
         ":batch_ops_op_lib",
         ":bitwise_ops_op_lib",
         ":boosted_trees_ops_op_lib",
+        ":tensor_forest_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
         ":collective_ops_op_lib",
@@ -1186,9 +1248,11 @@ cc_library(
         ":lookup_ops_op_lib",
         ":manip_ops_op_lib",
         ":math_ops_op_lib",
+        ":nccl_ops_op_lib",
         ":nn_ops_op_lib",
         ":no_op_op_lib",
         ":parsing_ops_op_lib",
+        ":ragged_ops",
         ":random_ops_op_lib",
         ":remote_fused_graph_ops_op_lib",
         ":resource_variable_ops_op_lib",
@@ -1207,7 +1271,7 @@ cc_library(
         ":training_ops_op_lib",
         ":user_ops_op_lib",
         ":word2vec_ops",
-    ] + tf_additional_cloud_op_deps(),
+    ] + if_mkl([":mkl_nn_ops_op_lib"]) + tf_additional_cloud_op_deps(),
     alwayslink = 1,
 )
 
@@ -1242,6 +1306,7 @@ cc_library(
     srcs = [
         "ops/math_grad.cc",
         "ops/random_grad.cc",
+        "ops/stateless_random_grad.cc",
     ],
     linkstatic = 1,  # Needed since alwayslink is broken in bazel b/27630669
     visibility = ["//visibility:public"],
@@ -1262,7 +1327,9 @@ cc_library(
         ":framework",
         ":lib",
         ":nn_ops_op_lib",
-    ],
+    ] + if_mkl([
+        ":mkl_nn_ops_op_lib",
+    ]),
     alwayslink = 1,
 )
 
@@ -1313,6 +1380,7 @@ cc_library(
         "//tensorflow/core/kernels:batch_kernels",
         "//tensorflow/core/kernels:bincount_op",
         "//tensorflow/core/kernels:boosted_trees_ops",
+        "//tensorflow/core/kernels:tensor_forest_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
         "//tensorflow/core/kernels:collective_ops",
@@ -1341,6 +1409,7 @@ cc_library(
         "//tensorflow/core/kernels:parameterized_truncated_normal_op",
         "//tensorflow/core/kernels:parsing",
         "//tensorflow/core/kernels:partitioned_function_ops",
+        "//tensorflow/core/kernels:ragged_ops",
         "//tensorflow/core/kernels:random_ops",
         "//tensorflow/core/kernels:random_poisson_op",
         "//tensorflow/core/kernels:remote_fused_graph_ops",
@@ -1383,6 +1452,8 @@ cc_library(
     ]) + if_cuda([
         "//tensorflow/core/grappler/optimizers:gpu_swapping_kernels",
         "//tensorflow/core/grappler/optimizers:gpu_swapping_ops",
+    ]) + if_nccl([
+        "//tensorflow/core/kernels:nccl_kernels",
     ]),
 )
 
@@ -1406,7 +1477,8 @@ tf_cuda_library(
         ":example_parser_configuration",
         ":gpu_runtime",
         ":lib",
-    ],
+        ":ops",
+    ] + tensorflow_opensource_extra_deps(),
 )
 
 cc_library(
@@ -1458,12 +1530,16 @@ cc_library(
         ":test",
         ":testlib_ops",
         "//tensorflow/cc:scope",
-        "//tensorflow/core/kernels:cast_op",
-        "//tensorflow/core/kernels:constant_op",
         "//tensorflow/core/kernels:ops_testutil",
         "//tensorflow/core/kernels:ops_util",
-        "//tensorflow/core/kernels:random_ops",
-    ],
+    ] + if_dynamic_kernels(
+        [],
+        otherwise = [
+            "//tensorflow/core/kernels:cast_op",
+            "//tensorflow/core/kernels:constant_op",
+            "//tensorflow/core/kernels:random_ops",
+        ],
+    ),
 )
 
 cc_library(
@@ -1542,6 +1618,8 @@ filegroup(
             "util/stats_calculator.*",
             "util/reporter.*",
             "platform/**/cuda_libdevice_path.*",
+            "platform/**/logger.cc",
+            "platform/**/logger.h",
             "platform/default/test_benchmark.*",
             "platform/cuda.h",
             "platform/google/**/*",
@@ -1604,6 +1682,9 @@ filegroup(
 # operators, use :android_tensorflow_lib if you want full operator
 # support.
 #
+# If you just need TensorFlow types, e.g. Tensors, use
+# :android_tensorflow_lib_lite_no_runtime.
+#
 # Compiles to a trivial library on non-Android to prevent irrelevant
 # build errors. If not building this as part of an android_binary,
 # a command such as the following must be used:
@@ -1614,7 +1695,33 @@ filegroup(
 cc_library(
     name = "android_tensorflow_lib_lite",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts(android_optimization_level_override = None),
+    copts = tf_copts(android_optimization_level_override = None) + [
+        "-DSUPPORT_SELECTIVE_REGISTRATION",
+    ],
+    linkopts = ["-lz"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":mobile_additional_lib_deps",
+        ":protos_all_cc_impl",
+        ":stats_calculator_portable",
+        "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
+        "@nsync//:nsync_cpp",
+        "@protobuf_archive//:protobuf",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "android_tensorflow_lib_lite_nortti",
+    srcs = if_android(["//tensorflow/core:android_srcs"]),
+    copts = tf_copts(android_optimization_level_override = None) + [
+        "-DSUPPORT_SELECTIVE_REGISTRATION",
+    ] + tf_opts_nortti_if_android(),
     linkopts = ["-lz"],
     tags = [
         "manual",
@@ -1636,6 +1743,7 @@ cc_library(
 cc_library(
     name = "mobile_additional_lib_deps",
     deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -1725,50 +1833,21 @@ cc_library(
 # Does not contain operators. In contrast to android_tensorflow_lib_lite,
 # this links in framework support for all types, relying on selective
 # registration of ops to prune code size.
-cc_library(
+#
+# TODO(gonnet): Move all users of these aliases to the corresponding
+#     :android_tensorflow_lib_lite* targets and remove.
+alias(
     name = "android_tensorflow_lib_selective_registration",
-    srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts(android_optimization_level_override = None) + [
-        "-DSUPPORT_SELECTIVE_REGISTRATION",
-    ],
-    linkopts = if_android(["-lz"]),
-    tags = [
-        "manual",
-        "notap",
-    ],
+    actual = ":android_tensorflow_lib_lite",
     visibility = ["//visibility:public"],
-    deps = [
-        ":protos_all_cc_impl",
-        "//third_party/eigen3",
-        "@double_conversion//:double-conversion",
-        "@nsync//:nsync_cpp",
-        "@protobuf_archive//:protobuf",
-    ],
-    alwayslink = 1,
 )
 
 # Android library for use with the SELECTIVE_REGISTRATION feature with
 # no proto_rtti.
-cc_library(
+alias(
     name = "android_tensorflow_lib_selective_registration_nortti",
-    srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [
-        "-DSUPPORT_SELECTIVE_REGISTRATION",
-    ],
-    linkopts = if_android(["-lz"]),
-    tags = [
-        "manual",
-        "notap",
-    ],
+    actual = ":android_tensorflow_lib_lite_nortti",
     visibility = ["//visibility:public"],
-    deps = [
-        ":protos_all_cc_impl",
-        "//third_party/eigen3",
-        "@double_conversion//:double-conversion",
-        "@nsync//:nsync_cpp",
-        "@protobuf_archive//:protobuf",
-    ],
-    alwayslink = 1,
 )
 
 filegroup(
@@ -2009,9 +2088,7 @@ tf_proto_library_cc(
     srcs = ["protobuf/master.proto"],
     cc_api_version = 2,
     protodeps = tf_additional_all_protos(),
-    visibility = [
-        "//tensorflow:internal",
-    ],
+    visibility = ["//tensorflow:internal"],
 )
 
 tf_proto_library_cc(
@@ -2137,6 +2214,7 @@ cc_library(
             "lib/**/*.cc",
             "platform/*.cc",
             "platform/profile_utils/**/*.cc",
+        ] + [
             "framework/resource_handle.cc",
             "util/env_var.cc",
         ],
@@ -2150,6 +2228,7 @@ cc_library(
             "platform/**/env_time.cc",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/device_tracer.cc",
+            "platform/**/logger.cc",
             "platform/**/logging.cc",
             "platform/**/human_readable_json.cc",
             "platform/abi.cc",
@@ -2162,6 +2241,7 @@ cc_library(
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
             "platform/**/device_tracer.cc",
+            "platform/**/logger.cc",
             "platform/**/logging.cc",
             "platform/**/human_readable_json.cc",
             "platform/abi.cc",
@@ -2244,7 +2324,6 @@ cc_library(
     srcs = ["lib/png/png_io.cc"],
     hdrs = [
         "lib/bfloat16/bfloat16.h",
-        "lib/core/casts.h",
         "lib/core/stringpiece.h",
         "lib/png/png_io.h",
         "platform/byte_order.h",
@@ -2267,6 +2346,7 @@ cc_library(
         ":lib",
         ":lib_internal",
         "//tensorflow/core/platform/default/build_config:png",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/strings",
         "@zlib_archive//:zlib",
     ],
@@ -2364,7 +2444,6 @@ cc_library(
     ]),
     hdrs = [
         "lib/bfloat16/bfloat16.h",
-        "lib/core/casts.h",
         "lib/core/stringpiece.h",
         "lib/png/png_io.h",
         "platform/byte_order.h",
@@ -2390,7 +2469,6 @@ tf_proto_library(
     srcs = ERROR_CODES_PROTO_SRCS,
     cc_api_version = 2,
     default_header = True,
-    js_api_version = 2,
     provide_cc_alias = True,
 )
 
@@ -2410,7 +2488,6 @@ tf_proto_library(
     srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS,
     cc_api_version = 2,
     default_header = True,
-    js_api_version = 2,
     protodeps = [
         ":error_codes_proto",
     ],
@@ -2486,6 +2563,7 @@ FRAMEWORK_INTERNAL_PRIVATE_HEADERS = [
 })
 
 FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
+    "framework/model.h",  # only needed for tests
     "framework/op_segment.h",
     "framework/rendezvous.h",  # only needed for tests
     "framework/resource_var.h",
@@ -2605,6 +2683,9 @@ tf_cuda_library(
         ":protos_all_cc",
         ":stats_calculator_portable",
         ":version_lib",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core/kernels:bounds_check",
         "//third_party/eigen3",
@@ -2781,7 +2862,6 @@ tf_cuda_library(
         ":functional_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:required",
-        ":core_cpu_impl",
     ]),
     alwayslink = 1,
 )
@@ -2809,6 +2889,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/graph_optimizer.h",
     "common_runtime/local_device.h",
     "common_runtime/lower_if_op.h",
+    "common_runtime/lower_if_while.h",
     "common_runtime/lower_while_op.h",
     "common_runtime/memory_types.h",
     "common_runtime/mkl_cpu_allocator.h",
@@ -2864,6 +2945,7 @@ tf_cuda_library(
         "common_runtime/hierarchical_tree_broadcaster.cc",
         "common_runtime/local_device.cc",
         "common_runtime/lower_if_op.cc",
+        "common_runtime/lower_if_while.cc",
         "common_runtime/lower_while_op.cc",
         "common_runtime/memory_types.cc",
         "common_runtime/mkl_cpu_allocator.cc",
@@ -2906,6 +2988,7 @@ tf_cuda_library(
         ":lib_internal",
         ":proto_text",
         ":protos_all_cc",
+        "@com_google_absl//absl/memory",
         "//third_party/eigen3",
         "//tensorflow/core/grappler:grappler_item",
     ] + mkl_deps(),
@@ -2933,13 +3016,8 @@ tf_cuda_library(
     copts = tf_copts(),
     deps = [
         ":framework",
-        ":framework_internal",
-        ":function_ops_op_lib",
-        ":functional_grad",
-        ":functional_ops_op_lib",
         ":graph",
         ":lib",
-        ":lib_internal",
         ":proto_text",
         ":protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
@@ -2947,8 +3025,13 @@ tf_cuda_library(
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//third_party/eigen3",
+    ] + mkl_deps() + tf_additional_core_deps() + if_static([
+        ":core_cpu_impl",
+        ":function_ops_op_lib",
+        ":functional_grad",
+        ":functional_ops_op_lib",
         "//tensorflow/core/kernels:required",
-    ] + mkl_deps() + tf_additional_core_deps() + if_static([":core_cpu_impl"]),
+    ]),
     alwayslink = 1,
 )
 
@@ -2965,6 +3048,15 @@ cc_library(
     deps = [":lib_internal"],
 )
 
+tf_cuda_library(
+    name = "metrics",
+    srcs = ["common_runtime/metrics.cc"],
+    hdrs = ["common_runtime/metrics.h"],
+    deps = [
+        ":lib",
+    ],
+)
+
 tf_cuda_library(
     name = "direct_session_internal",
     srcs = ["common_runtime/direct_session.cc"],
@@ -2981,6 +3073,7 @@ tf_cuda_library(
         ":graph",
         ":lib",
         ":lib_internal",
+        ":metrics",
         ":proto_text",
         ":protos_all_cc",
         "//tensorflow/core/debug:debug_graph_utils",
@@ -3016,7 +3109,9 @@ tf_cuda_library(
     ],
     copts = tf_copts(),
     cuda_deps = if_cuda_is_configured(tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps()),
-    visibility = ["//visibility:private"],
+    visibility = [
+        "//tensorflow:internal",
+    ],
     deps = [
         ":core_cpu_internal",
         ":lib",
@@ -3285,7 +3380,6 @@ tf_cc_tests(
     size = "small",
     srcs = [
         "lib/core/arena_test.cc",
-        "lib/core/bit_cast_test.cc",
         "lib/core/bitmap_test.cc",
         "lib/core/blocking_counter_test.cc",
         "lib/core/coding_test.cc",
@@ -3343,6 +3437,7 @@ tf_cc_tests(
         "platform/profile_utils/cpu_utils_test.cc",
         "platform/stacktrace_handler_test.cc",
         "platform/subprocess_test.cc",
+        "platform/vmodule_benchmark_test.cc",
     ],
     deps = [
         ":lib",
@@ -3356,6 +3451,20 @@ tf_cc_tests(
     ],
 )
 
+tf_cc_test(
+    name = "vmodule_test",
+    srcs = ["platform/vmodule_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":protos_all_cc",
+        ":test",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_cc_test(
     name = "lib_random_random_distributions_test",
     srcs = ["lib/random/random_distributions_test.cc"],
@@ -3371,6 +3480,16 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "platform_strings_test",
+    size = "small",
+    srcs = ["platform/platform_strings_test.cc"],
+    deps = [
+        ":lib",
+        ":platform_strings",
+    ],
+)
+
 tf_cc_test(
     name = "platform_env_test",
     size = "small",
@@ -3491,6 +3610,7 @@ tf_cc_test(
         ":lib_internal",
         ":test",
         ":test_main",
+        "@com_google_absl//absl/base",
     ],
 )
 
@@ -3579,6 +3699,7 @@ tf_cc_tests(
         "framework/kernel_def_builder_test.cc",
         "framework/kernel_def_util_test.cc",
         "framework/memory_types_test.cc",
+        "framework/model_test.cc",
         "framework/node_def_builder_test.cc",
         "framework/node_def_util_test.cc",
         "framework/op_compatibility_test.cc",
@@ -3616,6 +3737,7 @@ tf_cc_tests(
         "util/bcast_test.cc",
         "util/command_line_flags_test.cc",
         "util/device_name_utils_test.cc",
+        "util/dump_graph_test.cc",
         "util/equal_graph_def_test.cc",
         "util/events_writer_test.cc",
         "util/example_proto_fast_parsing_test.cc",
@@ -3664,6 +3786,7 @@ tf_cc_tests(
         "//tensorflow/cc:while_loop",
         "//tensorflow/core/kernels:ops_util",
         "//third_party/eigen3",
+        "@com_google_absl//absl/base",
     ],
 )
 
@@ -3745,6 +3868,7 @@ tf_cc_tests_gpu(
         ":test",
         ":test_main",
         ":testlib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -3773,6 +3897,7 @@ tf_cc_tests_gpu(
         ":test",
         ":test_main",
         ":testlib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -4046,6 +4171,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:immutable_constant_op",
         "//tensorflow/core/kernels:matmul_op",
+        "//tensorflow/core/kernels:topk_op",
         "//third_party/eigen3",
     ],
 )
@@ -4330,13 +4456,17 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:functional_ops",
+        "//tensorflow/cc:sendrecv_ops",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:matmul_op",
+        "//tensorflow/core/kernels:partitioned_function_ops",
         "//tensorflow/core/kernels:random_ops",
         "//tensorflow/core/kernels:shape_ops",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -4639,7 +4769,8 @@ tf_cc_test_gpu(
     name = "device_tracer_test",
     size = "small",
     srcs = ["platform/device_tracer_test.cc"],
-    args = ["--heap_check=local"],
+    args =
+        ["--heap_check=local"] + tf_additional_device_tracer_test_flags(),
     linkstatic = tf_kernel_tests_linkstatic(),
     tags = tf_cuda_tests_tags() + ["nomac"],
     deps = [
@@ -4709,6 +4840,29 @@ tf_cc_tests(
     ],
 )
 
+tf_cc_tests(
+    name = "common_runtime_lower_if_while_test",
+    size = "small",
+    srcs = ["common_runtime/lower_if_while_test.cc"],
+    deps = [
+        ":all_kernels",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session",
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
+    ],
+)
+
 # Test data
 filegroup(
     name = "image_testdata",
@@ -4791,6 +4945,7 @@ transitive_hdrs(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:platform_strings",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor",
     ],
diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD
index 06b797e32edc046bab498f8d775040d57ef62ce9..f610facd75d8cfa42845714f87498bd7afff58e2 100644
--- a/tensorflow/core/api_def/BUILD
+++ b/tensorflow/core/api_def/BUILD
@@ -17,6 +17,10 @@ load(
     "tf_cc_binary",
     "tf_cc_test",
 )
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
 
 filegroup(
     name = "base_api_def",
@@ -40,6 +44,7 @@ cc_library(
     name = "excluded_ops_lib",
     srcs = ["excluded_ops.cc"],
     hdrs = ["excluded_ops.h"],
+    copts = if_mkl(["-DINTEL_MKL=1"]),
 )
 
 cc_library(
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index 51812caeb2979270c913adee4fba2ce02f9c4d0e..7405e2ace72d1c08cf87cc0040e617379e18149b 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace tensorflow {
 namespace {
@@ -176,6 +175,22 @@ void TestDeprecatedAttributesSetCorrectly(
     }
   }
 }
+
+void TestDeprecationVersionSetCorrectly(
+    const std::unordered_map<string, ApiDef>& api_defs_map) {
+  for (const auto& name_and_api_def : api_defs_map) {
+    const auto& name = name_and_api_def.first;
+    const auto& api_def = name_and_api_def.second;
+    if (api_def.deprecation_version() != 0) {
+      ASSERT_TRUE(api_def.deprecation_version() > 0)
+          << "Found ApiDef with negative deprecation_version";
+      ASSERT_FALSE(api_def.deprecation_message().empty())
+          << "ApiDef that includes deprecation_version > 0 must also specify "
+          << "a deprecation_message. Op " << name
+          << " has deprecation_version > 0 but deprecation_message is not set.";
+    }
+  }
+}
 }  // namespace
 
 class BaseApiTest : public ::testing::Test {
@@ -268,6 +283,12 @@ TEST_F(BaseApiTest, DeprecationSetCorrectly) {
   TestDeprecatedAttributesSetCorrectly(api_defs_map_);
 }
 
+// Checks that deprecation_version is set for entire op only if
+// deprecation_message is set.
+TEST_F(BaseApiTest, DeprecationVersionSetCorrectly) {
+  TestDeprecationVersionSetCorrectly(api_defs_map_);
+}
+
 class PythonApiTest : public ::testing::Test {
  protected:
   PythonApiTest() {
@@ -309,4 +330,10 @@ TEST_F(PythonApiTest, DeprecationSetCorrectly) {
   TestDeprecatedAttributesSetCorrectly(api_defs_map_);
 }
 
+// Checks that deprecation_version is set for entire op only if
+// deprecation_message is set.
+TEST_F(PythonApiTest, DeprecationVersionSetCorrectly) {
+  TestDeprecationVersionSetCorrectly(api_defs_map_);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
index 639d962874d083472e6df13550e107026fd2d0a1..32def912f83e420eab58a3071f573ae81139a298 100644
--- a/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BatchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "batch_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt
index cdaeb5091c7b407addec2811bbf0cb79e61db2d2..bfaf3d2ea5912bf5fde34a91ec51ad42f66b6adb 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt
@@ -4,7 +4,7 @@ op {
   in_arg {
     name: "float_values"
     description: <<END
-float; List of Rank 2 Tensor each containing float values for a single feature.
+float; List of Rank 1 Tensor each containing float values for a single feature.
 END
   }
   in_arg {
@@ -17,7 +17,7 @@ END
   out_arg {
     name: "buckets"
     description: <<END
-int; List of Rank 2 Tensors each containing the bucketized values for a single feature.
+int; List of Rank 1 Tensors each containing the bucketized values for a single feature.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
index ca111af312daa6e7696203762cdd979345dc9bcf..e7a3ca3d9fd051a0fc08ef2a02a72bf3f9dcfaca 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
@@ -4,7 +4,7 @@ op {
   in_arg {
     name: "float_values"
     description: <<END
-float; List of Rank 2 Tensors each containing values for a single feature.
+float; List of Rank 1 Tensors each containing values for a single feature.
 END
   }
   in_arg {
@@ -22,8 +22,8 @@ END
   out_arg {
     name: "summaries"
     description: <<END
-float; List of Rank 2 Tensors each containing the quantile summary (value, weight,
-min_rank, max_rank) of a single feature.
+float; List of Rank 2 Tensors each containing the quantile summary
+(value, weight, min_rank, max_rank) of a single feature.
 END
   }
   attr {
@@ -35,6 +35,7 @@ END
   }
   summary: "Makes the summary of quantiles for the batch."
   description: <<END
-An op that takes a list of tensors and outputs the quantile summaries for each tensor.
+An op that takes a list of tensors (one tensor per feature) and outputs the
+quantile summaries for each tensor.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e61e5fa93aae47924dc7d4306f478e2adcfe9d6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceDeserialize"
+  visibility: HIDDEN
+  in_arg {
+    name: "quantile_stream_resource_handle"
+    description: <<END
+resource handle referring to a QuantileStreamResource.
+END
+  }
+  in_arg {
+    name: "bucket_boundaries"
+    description: <<END
+float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
+END
+  }
+  attr {
+    name: "num_streams"
+    description: <<END
+inferred int; number of features to get bucket boundaries for.
+END
+  }
+  summary: "Deserialize bucket boundaries and ready flag into current QuantileAccumulator."
+  description: <<END
+An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
deleted file mode 100644
index 73df11b2f75f82fad174fb7e77eccbef35c2c7d1..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "BytesProducedStatsDataset"
-  summary: "Records the bytes size of each element of `input_dataset` in a StatsAggregator."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
index 6889b8ea148b57da847964c062bd52b1027b8d22..9f7088b90077544ca11fff08dae526140ca1aa6e 100644
--- a/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "CacheDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filename"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
index 67281f9547ac6bb9df5b19e9f31da891454993bd..7997d8daaf91e47044f0729fb8a3c80d69d13acc 100644
--- a/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "ConcatenateDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that concatenates `input_dataset` with `another_dataset`."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
index 2b9dffd883250fd5631444252e7b236116e2e822..27d7d6b98684e10853f2f73373a756f0006daa0e 100644
--- a/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "DatasetToSingleElement"
+  visibility: HIDDEN
   in_arg {
     name: "dataset"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
deleted file mode 100644
index e1b8a9abdd2bec0fda690f96d266569b2fb2fcab..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
+++ /dev/null
@@ -1,24 +0,0 @@
-op {
-  graph_op_name: "DatasetToTFRecord"
-  visibility: HIDDEN
-  in_arg {
-    name: "input_dataset"
-    description: <<END
-A variant tensor representing the dataset to write.
-END
-  }
-  in_arg {
-    name: "filename"
-    description: <<END
-A scalar string tensor representing the filename to use.
-END
-  }
-  in_arg {
-    name: "compression_type"
-    description: <<END
-A scalar string tensor containing either (i) the empty string (no
-compression), (ii) "ZLIB", or (iii) "GZIP".
-END
-  }
-  summary: "Writes the given dataset to the given file using the TFRecord format."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
deleted file mode 100644
index e275cfdd3de5de36979967b1d85d1ae9cd0582a8..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
+++ /dev/null
@@ -1,25 +0,0 @@
-op {
-  graph_op_name: "DenseToSparseBatchDataset"
-  in_arg {
-    name: "input_dataset"
-    description: <<END
-A handle to an input dataset. Must have a single component.
-END
-  }
-  in_arg {
-    name: "batch_size"
-    description: <<END
-A scalar representing the number of elements to accumulate in a
-batch.
-END
-  }
-  in_arg {
-    name: "row_shape"
-    description: <<END
-A vector representing the dense shape of each row in the produced
-SparseTensor. The shape may be partially specified, using `-1` to indicate
-that a particular dimension should use the maximum size of all batch elements.
-END
-  }
-  summary: "Creates a dataset that batches input elements into a SparseTensor."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_Dequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_Dequantize.pbtxt
index 40c00ef58f8d9e6262023d6a3299fa5f6fbd8f2a..cd4cc5c906b2b2416d06d39c6176c4de99c979f5 100644
--- a/tensorflow/core/api_def/base_api/api_def_Dequantize.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Dequantize.pbtxt
@@ -21,7 +21,7 @@ used to convert the float values to their quantized equivalents.
 In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 
 ```
-if T == qint8, in[i] += (range(T) + 1)/ 2.0
+if T == qint8: in[i] += (range(T) + 1)/ 2.0
 out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
 ```
 here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
deleted file mode 100644
index 9722f5ede30cb0b893171bfc36a0eb8c1ab3c7e2..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
+++ /dev/null
@@ -1,3 +0,0 @@
-op {
-  graph_op_name: "EnqueueInQueueDataset"
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalBytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalBytesProducedStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc296162ae83117d349147c2655756c59384c051
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalBytesProducedStatsDataset.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalBytesProducedStatsDataset"
+  visibility: HIDDEN
+  summary: "Records the bytes size of each element of `input_dataset` in a StatsAggregator."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetCardinality.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetCardinality.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac014bcc5e6ae48cdecd6acefca267da3f2fe4f1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetCardinality.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "ExperimentalDatasetCardinality"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the dataset to return cardinality for.
+END
+  }
+  out_arg {
+    name: "cardinality"
+    description: <<END
+The cardinality of `input_dataset`. Named constants are used to represent
+infinite and unknown cardinality.
+END
+  }
+  summary: "Returns the cardinality of `input_dataset`."
+  description: <<END
+Returns the cardinality of `input_dataset`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetToTFRecord.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetToTFRecord.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..085d20d7bf1882accfa3380465568774d1459afb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetToTFRecord.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "ExperimentalDatasetToTFRecord"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the dataset to write.
+END
+  }
+  in_arg {
+    name: "filename"
+    description: <<END
+A scalar string tensor representing the filename to use.
+END
+  }
+  in_arg {
+    name: "compression_type"
+    description: <<END
+A scalar string tensor containing either (i) the empty string (no
+compression), (ii) "ZLIB", or (iii) "GZIP".
+END
+  }
+  summary: "Writes the given dataset to the given file using the TFRecord format."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalDenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDenseToSparseBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ebd6d88a8b9ff9e0a855215a0167f043d083bad
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDenseToSparseBatchDataset.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "ExperimentalDenseToSparseBatchDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A handle to an input dataset. Must have a single component.
+END
+  }
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a
+batch.
+END
+  }
+  in_arg {
+    name: "row_shape"
+    description: <<END
+A vector representing the dense shape of each row in the produced
+SparseTensor. The shape may be partially specified, using `-1` to indicate
+that a particular dimension should use the maximum size of all batch elements.
+END
+  }
+  summary: "Creates a dataset that batches input elements into a SparseTensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
deleted file mode 100644
index 66511eff60b900ab061c96d310ead3dfb7b3eba4..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-op {
-  graph_op_name: "ExperimentalFunctionBufferingResource"
-  in_arg {
-    name: "string_arg"
-    description: <<END
-String argument to the function call.
-END
-  }
-  in_arg {
-    name: "target_device"
-    description: <<END
-Target device to execute the function on.
-END
-  }
-  out_arg {
-    name: "resource"
-    description: <<END
-Handle to the resource created.
-END
-  }
-  attr {
-    name: "shared_name"
-    description: <<END
-If non-empty, this resource will be shared under the given name across
-multiple sessions.
-END
-  }
-  attr {
-    name: "container"
-    description: <<END
-If non-empty, this resource is placed in the given container.
-Otherwise, a default container is used.
-END
-  }
-  attr {
-    name: "f"
-    description: <<END
-Function to be executed.
-END
-  }
-  attr {
-    name: "buffer_size"
-    description: <<END
-Size of the buffer.
-END
-  }
-  attr {
-    name: "output_types"
-    description: <<END
-The type list for the return values.
-END
-  }
-  summary: <<END
-Creates a resource that fills up a buffer by making function calls.
-END
-  visibility: HIDDEN
-}
-
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
deleted file mode 100644
index bf4b66b22bfe23312ddfcb86ef0084d1d2fa71ea..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
+++ /dev/null
@@ -1,25 +0,0 @@
-op {
-  graph_op_name: "ExperimentalFunctionBufferingResourceGetNext"
-  in_arg {
-    name: "function_buffer_resource"
-    description: <<END
-The FunctionBufferingResource handle.
-END
-  }
-  out_arg {
-    name: "output"
-    description: <<END
-A list of return values.
-END
-  }
-  attr {
-    name: "output_types"
-    description: <<END
-The type list for the return values.
-END
-  }
-  summary: <<END
-Gets the next element from a FunctionBufferingResource.
-END
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
deleted file mode 100644
index 729718ddb3d4480f10f395f34e76d47a8b0f8b28..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
+++ /dev/null
@@ -1,13 +0,0 @@
-op {
-  graph_op_name: "ExperimentalFunctionBufferingResourceReset"
-  in_arg {
-    name: "function_buffer_resource"
-    description: <<END
-The FunctionBufferingResource handle.
-END
-  }
-  summary: <<END
-Resets the FunctionBufferingResource.
-END
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByReducerDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByReducerDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd132802fac8cbbd06872cd50415d3a5d29abc38
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByReducerDataset.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "ExperimentalGroupByReducerDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+    name: "key_func_other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when
+building a closure for `key_func`.
+END
+  }
+  attr {
+    name: "key_func"
+    description: <<END
+A function mapping an element of `input_dataset`, concatenated
+with `key_func_other_arguments` to a scalar value of type DT_INT64.
+END
+  }
+  in_arg {
+    name: "init_func_other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when
+building a closure for `init_func`.
+END
+  }
+  attr {
+    name: "init_func"
+    description: <<END
+A function mapping a key of type DT_INT64, concatenated with
+`init_func_other_arguments` to the initial reducer state.
+END
+  }
+  in_arg {
+    name: "reduce_func_other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when
+building a closure for `reduce_func`.
+END
+  }
+  attr {
+    name: "reduce_func"
+    description: <<END
+A function mapping the current reducer state and an element of `input_dataset`,
+concatenated with `reduce_func_other_arguments` to a new reducer state.
+END
+  }
+  in_arg {
+    name: "finalize_func_other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when
+building a closure for `finalize_func`.
+END
+  }
+  attr {
+    name: "finalize_func"
+    description: <<END
+A function mapping the final reducer state to an output element.
+END
+  }
+  summary: "Creates a dataset that computes a group-by on `input_dataset`."
+  description: <<END
+Creates a dataset that computes a group-by on `input_dataset`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByWindowDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByWindowDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e4c12ed815d8119999852056a473b76e2d4ab90
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByWindowDataset.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "ExperimentalGroupByWindowDataset"
+  visibility: HIDDEN
+  attr {
+    name: "key_func"
+    description: <<END
+A function mapping an element of `input_dataset`, concatenated
+with `key_func_other_arguments` to a scalar value of type DT_INT64.
+END
+  }
+  summary: "Creates a dataset that computes a windowed group-by on `input_dataset`."
+  description: <<END
+// TODO(mrry): Support non-int64 keys.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalLatencyStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalLatencyStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7351b9d70a75285351534d474209339b6bcbce4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalLatencyStatsDataset.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalLatencyStatsDataset"
+  visibility: HIDDEN
+  summary: "Records the latency of producing `input_dataset` elements in a StatsAggregator."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalMapAndBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapAndBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc4270670c5369d6d7440b50dae98f367453b3d9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapAndBatchDataset.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "ExperimentalMapAndBatchDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+    name: "other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when building a closure
+for `f`.
+END
+  }
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a
+batch. It determines the number of concurrent invocations of `f` that process
+elements from `input_dataset` in parallel.
+END
+  }
+  in_arg {
+    name: "num_parallel_calls"
+    description: <<END
+A scalar representing the maximum number of parallel invocations of the `map_fn`
+function. Applying the `map_fn` on consecutive input elements in parallel has
+the potential to improve input pipeline throughput.
+END
+  }
+  in_arg {
+    name: "drop_remainder"
+    description: <<END
+A scalar representing whether the last batch should be dropped in case its size
+is smaller than desired.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+A function to apply to the outputs of `input_dataset`.
+END
+  }
+  summary: "Creates a dataset that fuses mapping with batching."
+  description: <<END
+Creates a dataset that applies `f` to the outputs of `input_dataset` and then
+batches `batch_size` of them.
+
+Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
+to `batch_size * num_parallel_batches` copies of `f` in parallel.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9619edcac1cce1bf8ab73ab271b647f902539bb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapDataset.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalMapDataset"
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalMatchingFilesDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMatchingFilesDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..993a79814907a0d11c639ce60a785f740ec665c7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMatchingFilesDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalMatchingFilesDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalMaxIntraOpParallelismDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMaxIntraOpParallelismDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a18aa378ffa1e6f8a1d857760b30d81f9afa15b1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMaxIntraOpParallelismDataset.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "ExperimentalMaxIntraOpParallelismDataset"
+  in_arg {
+    name: "max_intra_op_parallelism"
+    description: <<END
+Identifies the maximum intra-op parallelism to use.
+END
+  }
+  summary: <<END
+Creates a dataset that overrides the maximum intra-op parallelism.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalNonSerializableDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalNonSerializableDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08632aa262a35b0f33bd4bdb82783dc7643c5c6d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalNonSerializableDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalNonSerializableDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalNumaMapAndBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalNumaMapAndBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..243922d96943d2c10a32d41eca1908124f92c3ce
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalNumaMapAndBatchDataset.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "ExperimentalNumaMapAndBatchDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+    name: "other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when building a closure
+for `f`.
+END
+  }
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a
+batch. It determines the number of concurrent invocations of `f` that process
+elements from `input_dataset` in parallel.
+END
+  }
+  in_arg {
+    name: "num_parallel_calls"
+    description: <<END
+A scalar representing the maximum number of parallel invocations of the `map_fn`
+function. Applying the `map_fn` on consecutive input elements in parallel has
+the potential to improve input pipeline throughput.
+END
+  }
+  in_arg {
+    name: "drop_remainder"
+    description: <<END
+A scalar representing whether the last batch should be dropped in case its size
+is smaller than desired.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+A function to apply to the outputs of `input_dataset`.
+END
+  }
+  summary: "Creates a dataset that fuses mapping with batching."
+  description: <<END
+Creates a dataset that applies `f` to the outputs of `input_dataset` and then
+batches `batch_size` of them.
+
+Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
+to `batch_size * num_parallel_batches` copies of `f` in parallel.
+
+Unlike "MapAndBatchDatasetV2", this dataset uses a NUMA-aware thread scheduling
+policy. Because it uses the single-threaded executor, it only supports the
+function-based control flow ops.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalParallelInterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd70e3328493825b268fc1a2f6e1c85207a426bf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalParallelInterleaveDataset.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "ExperimentalParallelInterleaveDataset"
+  visibility: HIDDEN
+  attr {
+    name: "f"
+    description: <<END
+A function mapping elements of `input_dataset`, concatenated with
+`other_arguments`, to a Dataset variant that contains elements matching
+`output_types` and `output_shapes`.
+END
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  description: <<END
+The resulting dataset is similar to the `InterleaveDataset`, with the exception
+that if retrieving the next value from a dataset would cause the requester to
+block, it will skip that input dataset. This dataset is especially useful
+when loading data from a variable-latency datastores (e.g. HDFS, GCS), as it
+allows the training step to proceed so long as some data is available.
+
+!! WARNING !! This dataset is not deterministic!
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalParseExampleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalParseExampleDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2de13c5ceef4eced73f6e0984e70921926ece7f2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalParseExampleDataset.pbtxt
@@ -0,0 +1,70 @@
+op {
+  graph_op_name: "ExperimentalParseExampleDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "dense_defaults"
+    description: <<END
+A dict mapping string keys to `Tensor`s.
+The keys of the dict must match the dense_keys of the feature.
+END
+  }
+  attr {
+    name: "sparse_keys"
+    description: <<END
+A list of string keys in the examples features.
+The results for these keys will be returned as `SparseTensor` objects.
+END
+  }
+  attr {
+    name: "dense_keys"
+    description: <<END
+A list of Ndense string Tensors (scalars).
+The keys expected in the Examples features associated with dense values.
+END
+  }
+  attr {
+    name: "sparse_types"
+    description: <<END
+A list of `DTypes` of the same length as `sparse_keys`.
+Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+and `tf.string` (`BytesList`) are supported.
+END
+  }
+    attr {
+    name: "Tdense"
+    description: <<END
+A list of DTypes of the same length as `dense_keys`.
+Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+and `tf.string` (`BytesList`) are supported.
+
+END
+  }
+  attr {
+    name: "dense_shapes"
+    description: <<END
+List of tuples with the same length as `dense_keys`.
+The shape of the data for each dense feature referenced by `dense_keys`.
+Required for any input tensors identified by `dense_keys`.  Must be
+either fully defined, or may contain an unknown first dimension.
+An unknown first dimension means the feature is treated as having
+a variable number of blocks, and the output shape along this dimension
+is considered unknown at graph build time.  Padding is applied for
+minibatch elements smaller than the maximum number of blocks for the
+given feature along this dimension.
+END
+  }
+    attr {
+    name: "output_types"
+    description: <<END
+The type list for the return values.
+END
+  }
+    attr {
+    name: "output_shapes"
+    description: <<END
+The list of shapes being produced.
+END
+  }
+   summary: "Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features."
+}
+
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalPrivateThreadPoolDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalPrivateThreadPoolDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eaa49b7fa5e9f98f02586d9922b00f0bda3af908
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalPrivateThreadPoolDataset.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "ExperimentalPrivateThreadPoolDataset"
+  in_arg {
+    name: "num_threads"
+    description: <<END
+Identifies the number of threads to use for the private threadpool.
+END
+  }
+  summary: <<END
+Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalRandomDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalRandomDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5d7bc4adb79ac63aaf41f03063b26257ebee429
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalRandomDataset.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "ExperimentalRandomDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "seed"
+    description: <<END
+A scalar seed for the random number generator. If either seed or
+seed2 is set to be non-zero, the random number generator is seeded
+by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  in_arg {
+    name: "seed2"
+    description: <<END
+A second scalar seed to avoid seed collision.
+END
+  }
+  summary: "Creates a Dataset that returns pseudorandom numbers."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalScanDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalScanDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4742cf4d57ff471178f0d59d9fd8a99a1e6f2166
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalScanDataset.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalScanDataset"
+  visibility: HIDDEN
+  summary: "Creates a dataset successively reduces `f` over the elements of `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalSetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e6b2f81b333899e3cdc2723edb537507f541a64
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalSetStatsAggregatorDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalSleepDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSleepDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9caff5439436df1d6c06a250f62542a7a6091737
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSleepDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalSleepDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalSlidingWindowDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSlidingWindowDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc62750b66a996d1429fcd8477bcd57b7b488dda
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSlidingWindowDataset.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "ExperimentalSlidingWindowDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "window_size"
+    description: <<END
+A scalar representing the number of elements in the
+sliding window.
+END
+  }
+  in_arg {
+    name: "window_shift"
+    description: <<END
+A scalar representing the steps moving the sliding window
+forward in one iteration. It must be positive.
+END
+  }
+  in_arg {
+    name: "window_stride"
+    description: <<END
+A scalar representing the stride of the input elements of the sliding window.
+It must be positive.
+END
+  }
+  summary: "Creates a dataset that passes a sliding window over `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalSqlDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSqlDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35cddbd061917e397aa7b10e7fee43033adfc2e2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSqlDataset.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "ExperimentalSqlDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "driver_name"
+    description: <<END
+The database type. Currently, the only supported type is 'sqlite'.
+END
+  }
+  in_arg {
+    name: "data_source_name"
+    description: <<END
+A connection string to connect to the database.
+END
+  }
+  in_arg {
+    name: "query"
+    description: <<END
+A SQL query to execute.
+END
+  }
+  summary: "Creates a dataset that executes a SQL query and emits rows of the result set."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a770d462d54230340ac278f755b997d7c9144a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorHandle.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalStatsAggregatorHandle"
+  visibility: HIDDEN
+  summary: "Creates a statistics manager resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ffe010368918a2134fa70d3bc6d6fb30a7dbc2c5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorSummary.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalStatsAggregatorSummary"
+  visibility: HIDDEN
+  summary: "Produces a summary of any statistics recorded by the given statistics manager."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalUnbatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalUnbatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c89e1fd0bdd6ef594797233170b41cb86521c84f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalUnbatchDataset.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalUnbatchDataset"
+  visibility: HIDDEN
+  summary: "A dataset that splits the elements of its input into multiple elements."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractVolumePatches.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractVolumePatches.pbtxt
index 3c8a455983775fda8eef329603c761fbae8bcf58..9c4015eaa4cb1c112096773f6ba30edd3ec395ca 100644
--- a/tensorflow/core/api_def/base_api/api_def_ExtractVolumePatches.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExtractVolumePatches.pbtxt
@@ -42,8 +42,5 @@ We specify the size-related attributes as:
 ```
 END
   }
-  summary: <<END
-Extract `patches` from `input` and put them in the "depth" output
-dimension. 3D extension of `extract_image_patches`.
-END
+  summary: "Extract `patches` from `input` and put them in the \"depth\" output dimension. 3D extension of `extract_image_patches`."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
index 4e48d6c169b6641ece5f11d5add478ce25611ee8..0ba2327371a4ba0f5f553815fc9e8c991f62b424 100644
--- a/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most
+A complex tensor of the same shape as `input`. The inner-most
   dimension of `input` is replaced with its 1D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
index 555f8e60673d71e43dbb5d4dc17ae345606a2089..c7b780a56f04298bc7906955cb17bc335ec4e8d5 100644
--- a/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most 2
+A complex tensor of the same shape as `input`. The inner-most 2
   dimensions of `input` are replaced with their 2D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
index fd60c0f3785a22f456c63285bf59381e6a2a5d66..776529bc593b10915c6be8c4a3bdac6e6b131c32 100644
--- a/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "FilterDataset"
+  visibility: HIDDEN
   in_arg {
     name: "other_arguments"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
index 651b84d0d660a0bfc0ef45dd841dfc51ee1e3340..3b142432582146fcc0534d36d1aa063b71f11338 100644
--- a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "FixedLengthRecordDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filenames"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..def9f85e02d9d34412ed42d7774d77e8b6a328e0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedLengthRecordDatasetV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
index 1936119c50f5323e69465a79cda784afc68c3aca..1e20e853254ccb5086b3b52f473a4a823fefefe8 100644
--- a/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "FlatMapDataset"
+  visibility: HIDDEN
   attr {
     name: "f"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
index 4f1cf3e6867a06df1f39774bc389fbe35a994ab4..06e9a6463e76dbf43caae878b62afcba55e6995d 100644
--- a/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "GeneratorDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that invokes a function to generate elements."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_GroupByReducerDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_GroupByReducerDataset.pbtxt
deleted file mode 100644
index 067ad4018b09d4909325dbc152e30a0afcf29235..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_GroupByReducerDataset.pbtxt
+++ /dev/null
@@ -1,69 +0,0 @@
-op {
-  graph_op_name: "GroupByReducerDataset"
-  visibility: HIDDEN
-  in_arg {
-    name: "input_dataset"
-    description: <<END
-A variant tensor representing the input dataset.
-END
-  }
-  in_arg {
-    name: "key_func_other_arguments"
-    description: <<END
-A list of tensors, typically values that were captured when
-building a closure for `key_func`.
-END
-  }
-  attr {
-    name: "key_func"
-    description: <<END
-A function mapping an element of `input_dataset`, concatenated
-with `key_func_other_arguments` to a scalar value of type DT_INT64.
-END
-  }
-  in_arg {
-    name: "init_func_other_arguments"
-    description: <<END
-A list of tensors, typically values that were captured when
-building a closure for `init_func`.
-END
-  }
-  attr {
-    name: "init_func"
-    description: <<END
-A function mapping a key of type DT_INT64, concatenated with
-`init_func_other_arguments` to the initial reducer state.
-END
-  }
-  in_arg {
-    name: "reduce_func_other_arguments"
-    description: <<END
-A list of tensors, typically values that were captured when
-building a closure for `reduce_func`.
-END
-  }
-  attr {
-    name: "reduce_func"
-    description: <<END
-A function mapping the current reducer state and an element of `input_dataset`,
-concatenated with `reduce_func_other_arguments` to a new reducer state.
-END
-  }
-  in_arg {
-    name: "finalize_func_other_arguments"
-    description: <<END
-A list of tensors, typically values that were captured when
-building a closure for `finalize_func`.
-END
-  }
-  attr {
-    name: "finalize_func"
-    description: <<END
-A function mapping the final reducer state to an output element.
-END
-  }
-  summary: "Creates a dataset that computes a group-by on `input_dataset`."
-  description: <<END
-Creates a dataset that computes a group-by on `input_dataset`.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt
deleted file mode 100644
index ea6bcd469577d02e39afbeb2ba0c8b467e312ba9..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-op {
-  graph_op_name: "GroupByWindowDataset"
-  attr {
-    name: "key_func"
-    description: <<END
-A function mapping an element of `input_dataset`, concatenated
-with `key_func_other_arguments` to a scalar value of type DT_INT64.
-END
-  }
-  summary: "Creates a dataset that computes a windowed group-by on `input_dataset`."
-  description: <<END
-// TODO(mrry): Support non-int64 keys.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
index b793c99cf74408305b48dbbf1c9df7b03d09b2f3..c17a84000560e9e14e10326e42e84dd49d924bf2 100644
--- a/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most
+A complex tensor of the same shape as `input`. The inner-most
   dimension of `input` is replaced with its inverse 1D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
index 7f38f14308de70fb0ebc229064d010762055c458..7458d233ec8bd385e7976095d0cf89dfa0b36ace 100644
--- a/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most 2
+A complex tensor of the same shape as `input`. The inner-most 2
   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
index bec2828e2462227b962bc045d796484a10365452..597edf5fb2b2d1c1f9d5a97992ec074385407f47 100644
--- a/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "InterleaveDataset"
+  visibility: HIDDEN
   attr {
     name: "f"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
deleted file mode 100644
index 78d946b0b47044855ff145e9492fdb3721ff0044..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "LatencyStatsDataset"
-  summary: "Records the latency of producing `input_dataset` elements in a StatsAggregator."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..280148e032c1c9fb498edf131a874d4b134fd0f0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LeakyRelu.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "LeakyRelu"
+  visibility: HIDDEN
+  summary: "Computes rectified linear: `max(features, features * alpha)`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LeakyReluGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_LeakyReluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e4275266020e210000d455700164ebe410cb2f35
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LeakyReluGrad.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "LeakyReluGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "gradients"
+    description: <<END
+The backpropagated gradients to the corresponding LeakyRelu operation.
+END
+  }
+  in_arg {
+    name: "features"
+    description: <<END
+The features passed as input to the corresponding LeakyRelu operation,
+OR the outputs of that operation (both work equivalently).
+END
+  }
+  out_arg {
+    name: "backprops"
+    description: <<END
+`gradients * (features > 0) + alpha * gradients * (featurs <= 0)`.
+END
+  }
+  summary: "Computes rectified linear gradients for a LeakyRelu operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableRemoveV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableRemoveV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..333fe6f4b2739541e4cd8bbf6e263d601e44a752
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableRemoveV2.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "LookupTableRemoveV2"
+  visibility: HIDDEN
+  endpoint {
+    name: "LookupTableRemove"
+  }
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+Any shape.  Keys of the elements to remove.
+END
+  }
+  summary: "Removes keys and its associated values from a table."
+  description: <<END
+The tensor `keys` must of the same type as the keys of the table. Keys not
+already in the table are silently ignored.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Lu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Lu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35dbee8364ec596ee18cf8892361ee3112a7764a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Lu.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "Lu"
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
+size `[M, M]`.
+END
+  }
+  out_arg {
+    name: "lu"
+    description: <<END
+A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
+lower triangular factor `L` with unit diagonal, and whose upper triangular part
+denotes the upper triangular factor `U`.
+END
+  }
+  out_arg {
+    name: "p"
+    description: <<END
+Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
+`[..., M]`.
+@compatibility(scipy)
+Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
+packed into a single tensor, the permutation is applied to `input` instead of
+the right hand side and the permutation `P` is returned as a list of indices
+instead of a permutation matrix.
+@end_compatibility
+END
+  }
+  summary: "Computes the LU decomposition of one or more square matrices."
+  description: <<END
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices.
+
+The input has to be invertible.
+
+The output consists of two tensors LU and P containing the LU decomposition
+of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
+upper triangular factors.
+
+For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
+shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
+triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
+entries correspond to the upper triangular part, including the diagonal, of LU.
+
+P represents a permutation matrix encoded as a list of indices each between `0`
+and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
+P, then the L, U and P satisfies P_mat * input = L * U.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
deleted file mode 100644
index e230c51edfe9355b556812b0946b3a4879f160bc..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
+++ /dev/null
@@ -1,53 +0,0 @@
-op {
-  graph_op_name: "MapAndBatchDataset"
-  visibility: HIDDEN
-  in_arg {
-    name: "input_dataset"
-    description: <<END
-A variant tensor representing the input dataset.
-END
-  }
-  in_arg {
-    name: "other_arguments"
-    description: <<END
-A list of tensors, typically values that were captured when building a closure
-for `f`.
-END
-  }
-  in_arg {
-    name: "batch_size"
-    description: <<END
-A scalar representing the number of elements to accumulate in a
-batch. It determines the number of concurrent invocations of `f` that process
-elements from `input_dataset` in parallel.
-END
-  }
-  in_arg {
-    name: "num_parallel_batches"
-    description: <<END
-A scalar representing the number of batches to create in parallel. Processing
-multiple batches in parallel benefits workloads prone to stragglers.
-END
-  }
-  in_arg {
-    name: "drop_remainder"
-    description: <<END
-A scalar representing whether the last batch should be dropped in case its size
-is smaller than desired.
-END
-  }
-  attr {
-    name: "f"
-    description: <<END
-A function to apply to the outputs of `input_dataset`.
-END
-  }
-  summary: "Creates a dataset that fuses mapping with batching."
-  description: <<END
-Creates a dataset that applies `f` to the outputs of `input_dataset` and then
-batches `batch_size` of them.
-
-Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
-to `batch_size * num_parallel_batches` copies of `f` in parallel.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt
deleted file mode 100644
index 81ef92cae0c95c765a82c993f58f261509c47d71..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt
+++ /dev/null
@@ -1,54 +0,0 @@
-op {
-  graph_op_name: "MapAndBatchDatasetV2"
-  visibility: HIDDEN
-  in_arg {
-    name: "input_dataset"
-    description: <<END
-A variant tensor representing the input dataset.
-END
-  }
-  in_arg {
-    name: "other_arguments"
-    description: <<END
-A list of tensors, typically values that were captured when building a closure
-for `f`.
-END
-  }
-  in_arg {
-    name: "batch_size"
-    description: <<END
-A scalar representing the number of elements to accumulate in a
-batch. It determines the number of concurrent invocations of `f` that process
-elements from `input_dataset` in parallel.
-END
-  }
-  in_arg {
-    name: "num_parallel_calls"
-    description: <<END
-A scalar representing the maximum number of parallel invocations of the `map_fn`
-function. Applying the `map_fn` on consecutive input elements in parallel has
-the potential to improve input pipeline throughput.
-END
-  }
-  in_arg {
-    name: "drop_remainder"
-    description: <<END
-A scalar representing whether the last batch should be dropped in case its size
-is smaller than desired.
-END
-  }
-  attr {
-    name: "f"
-    description: <<END
-A function to apply to the outputs of `input_dataset`.
-END
-  }
-  summary: "Creates a dataset that fuses mapping with batching."
-  description: <<END
-Creates a dataset that applies `f` to the outputs of `input_dataset` and then
-batches `batch_size` of them.
-
-Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
-to `batch_size * num_parallel_batches` copies of `f` in parallel.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
index 76d63ec2478e07d5af09754dc63994841119fa56..4f235f49461465931c6b863b2007c512511c873c 100644
--- a/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "MapDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixSquareRoot.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixSquareRoot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9f1e593ccb99a21102a6857ecbe3dbc91a19abf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixSquareRoot.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "MatrixSquareRoot"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, M]`.
+
+@compatibility(scipy)
+Equivalent to scipy.linalg.sqrtm
+@end_compatibility
+END
+  }
+  summary: "Computes the matrix square root of one or more square matrices:"
+  description: <<END
+matmul(sqrtm(A), sqrtm(A)) = A
+
+The input matrix should be invertible. If the input matrix is real, it should
+have no eigenvalues which are real and negative (pairs of complex conjugate
+eigenvalues are allowed).
+
+The matrix square root is computed by first reducing the matrix to 
+quasi-triangular form with the real Schur decomposition. The square root 
+of the quasi-triangular matrix is then computed directly. Details of 
+the algorithm can be found in: Nicholas J. Higham, "Computing real 
+square roots of a real matrix", Linear Algebra Appl., 1987.
+
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices. The output is a tensor of the same shape as the input
+containing the matrix square root for all input submatrices `[..., :, :]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NcclAllReduce.pbtxt b/tensorflow/core/api_def/base_api/api_def_NcclAllReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6f81482eb58343a4da66bbdb3a249ba9678d52a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NcclAllReduce.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "NcclAllReduce"
+  summary: "Outputs a tensor containing the reduction across all input tensors."
+  description: <<END
+Outputs a tensor containing the reduction across all input tensors passed to ops
+within the same `shared_name.
+
+The graph should be constructed so if one op runs with shared_name value `c`,
+then `num_devices` ops will run with shared_name value `c`.  Failure to do so
+will cause the graph execution to fail to complete.
+
+input: the input to the reduction
+data: the value of the reduction across all `num_devices` devices.
+reduction: the reduction operation to perform.
+num_devices: The number of devices participating in this reduction.
+shared_name: Identifier that shared between ops of the same reduction.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NcclBroadcast.pbtxt b/tensorflow/core/api_def/base_api/api_def_NcclBroadcast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38db37c4b542b447ce9dacba0bb28a160bb55e0f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NcclBroadcast.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "NcclBroadcast"
+  summary: "Sends `input` to all devices that are connected to the output."
+  description: <<END
+Sends `input` to all devices that are connected to the output.
+
+The graph should be constructed so that all ops connected to the output have a
+valid device assignment, and the op itself is assigned one of these devices.
+
+input: The input to the broadcast.
+output: The same as input.
+shape: The shape of the input tensor.
+
+END
+
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NcclReduce.pbtxt b/tensorflow/core/api_def/base_api/api_def_NcclReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7098fd4cce83afca5b0abc02821e60a4ea15c95
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NcclReduce.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "NcclReduce"
+  summary: "Reduces `input` from `num_devices` using `reduction` to a single device."
+  description: <<END
+Reduces `input` from `num_devices` using `reduction` to a single device.
+
+The graph should be constructed so that all inputs have a valid device
+assignment, and the op itself is assigned one of these devices.
+
+input: The input to the reduction.
+data: the value of the reduction across all `num_devices` devices.
+reduction: the reduction operation to perform.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
index d243dfe8b67bc14e9c5e22d5e68e3faf5d4684a8..53f4d94ecc8810a38aaafac29438d8186636684a 100644
--- a/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "PaddedBatchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "batch_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt
deleted file mode 100644
index d6889b54a032bb20896dc7b03af5621f45d365d9..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-op {
-  graph_op_name: "ParallelInterleaveDataset"
-  attr {
-    name: "f"
-    description: <<END
-A function mapping elements of `input_dataset`, concatenated with
-`other_arguments`, to a Dataset variant that contains elements matching
-`output_types` and `output_shapes`.
-END
-  }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-  description: <<END
-The resulting dataset is similar to the `InterleaveDataset`, with the exception
-that if retrieving the next value from a dataset would cause the requester to
-block, it will skip that input dataset. This dataset is especially useful
-when loading data from a variable-latency datastores (e.g. HDFS, GCS), as it
-allows the training step to proceed so long as some data is available.
-
-!! WARNING !! This dataset is not deterministic!
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
index 313494dd738b02d09807ec78fc8e0802e719e116..5343605edd5859d2cafa656f3821a318e24d0b09 100644
--- a/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ParallelMapDataset"
+  visibility: HIDDEN
   in_arg {
     name: "num_parallel_calls"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt
deleted file mode 100644
index 3de2f18fc28b57171b478f43c64a88d72069a89f..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt
+++ /dev/null
@@ -1,69 +0,0 @@
-op {
-  graph_op_name: "ParseExampleDataset"
-  in_arg {
-    name: "dense_defaults"
-    description: <<END
-A dict mapping string keys to `Tensor`s.
-The keys of the dict must match the dense_keys of the feature.
-END
-  }
-  attr {
-    name: "sparse_keys"
-    description: <<END
-A list of string keys in the examples features.
-The results for these keys will be returned as `SparseTensor` objects.
-END
-  }
-  attr {
-    name: "dense_keys"
-    description: <<END
-A list of Ndense string Tensors (scalars).
-The keys expected in the Examples features associated with dense values.
-END
-  }
-  attr {
-    name: "sparse_types"
-    description: <<END
-A list of `DTypes` of the same length as `sparse_keys`.
-Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
-and `tf.string` (`BytesList`) are supported.
-END
-  }
-    attr {
-    name: "Tdense"
-    description: <<END
-A list of DTypes of the same length as `dense_keys`.
-Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
-and `tf.string` (`BytesList`) are supported.
-
-END
-  }
-  attr {
-    name: "dense_shapes"
-    description: <<END
-List of tuples with the same length as `dense_keys`.
-The shape of the data for each dense feature referenced by `dense_keys`.
-Required for any input tensors identified by `dense_keys`.  Must be
-either fully defined, or may contain an unknown first dimension.
-An unknown first dimension means the feature is treated as having
-a variable number of blocks, and the output shape along this dimension
-is considered unknown at graph build time.  Padding is applied for
-minibatch elements smaller than the maximum number of blocks for the
-given feature along this dimension.
-END
-  }
-    attr {
-    name: "output_types"
-    description: <<END
-The type list for the return values.
-END
-  }
-    attr {
-    name: "output_shapes"
-    description: <<END
-The list of shapes being produced.
-END
-  }
-   summary: "Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features."
-}
-
diff --git a/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
index e158eedc6f0ef11de3c8979d65dd69d8bece1eb4..a71336a285542bc4bdf095fb2ac477ea975725c0 100644
--- a/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "PrefetchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "buffer_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
deleted file mode 100644
index d4549340fac6d59cc994050e65f5a0016f2d52ab..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
+++ /dev/null
@@ -1,3 +0,0 @@
-op {
-  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
index 9b500d0b58d2dad182a7069824a55ee953fbda05..dff7c8754f90026c69f22a3a1eea097b946a8c1f 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
@@ -39,6 +39,19 @@ END
     name: "range_given"
     description: <<END
 Whether the range is given or should be determined from the `input` tensor.
+END
+  }
+  attr {
+    name: "round_mode"
+    description: <<END
+The 'round_mode' attribute controls which rounding tie-breaking algorithm is
+used when rounding float values to their quantized equivalents. The following
+rounding modes are currently supported:
+
+*   HALF_TO_EVEN: this is the default round_mode.
+*   HALF_UP: round towards positive. In this mode 7.5 rounds up to 8 and -7.5
+    rounds up to -7.
+
 END
   }
   summary: "Quantizes then dequantizes a tensor."
@@ -93,5 +106,7 @@ following to each value in the 'input' tensor.
 
 output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
 
+The above round function rounds the value based on the given round_mode.
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
index 37ac10dddb7fc583d270fd6a75af6ab82f9206a5..b7311153f459d9f426efbbfaddcf3fa0307188eb 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
@@ -42,7 +42,7 @@ In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 
 ```
 out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-if T == qint8, out[i] -= (range(T) + 1) / 2.0
+if T == qint8: out[i] -= (range(T) + 1) / 2.0
 ```
 
 here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c40332ea28421e0b6a8ab771f6d19fdaa75a63a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt
@@ -0,0 +1,81 @@
+op {
+  graph_op_name: "RaggedGather"
+  visibility: HIDDEN
+  in_arg {
+    name: "params_nested_splits"
+    description: <<END
+The `nested_row_splits` tensors that define the row-partitioning for the
+`params` RaggedTensor input.
+END
+  }
+  in_arg {
+    name: "params_dense_values"
+    description: <<END
+The `flat_values` for the `params` RaggedTensor. There was a terminology change
+at the python level from dense_values to flat_values, so dense_values is the
+deprecated name.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+Indices in the outermost dimension of `params` of the values that should be
+gathered.
+END
+  }
+  out_arg {
+    name: "output_nested_splits"
+    description: <<END
+The `nested_row_splits` tensors that define the row-partitioning for the
+returned RaggedTensor.
+END
+  }
+  out_arg {
+    name: "output_dense_values"
+    description: "The `flat_values` for the returned RaggedTensor."
+  }
+  attr {
+    name: "PARAMS_RAGGED_RANK"
+    description: <<END
+The ragged rank of the `params` RaggedTensor. `params_nested_splits` should
+contain this number of `row_splits` tensors. This value should equal
+`params.ragged_rank`.
+END
+  }
+  attr {
+    name: "OUTPUT_RAGGED_RANK"
+    description: <<END
+The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
+this number of `row_splits` tensors. This value should equal
+`indices.shape.ndims + params.ragged_rank - 1`.
+END
+  }
+  summary: <<END
+Gather ragged slices from `params` axis `0` according to `indices`.
+END
+  description: <<END
+Outputs a `RaggedTensor` output composed from `output_dense_values` and
+`output_nested_splits`, such that:
+
+```python
+output.shape = indices.shape + params.shape[1:]
+output.ragged_rank = indices.shape.ndims + params.ragged_rank
+output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+```
+
+where
+
+* `params =
+   ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
+   provides the values that should be gathered.
+* `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
+   values should be gathered.
+* `output =
+   ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
+   is the output tensor.
+
+(Note: This c++ op is used to implement the higher-level python
+`tf.ragged.gather` op, which also supports ragged indices.)
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a9b2af804483df8eafd3306fc4f68cb9de55f2b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "RaggedRange"
+  visibility: HIDDEN
+  in_arg{
+    name: "starts"
+    description: "The starts of each range."
+  }
+  in_arg{
+    name: "limits"
+    description: "The limits of each range."
+  }
+  in_arg{
+    name: "deltas"
+    description: "The deltas of each range."
+  }
+  out_arg{
+    name: "rt_nested_splits"
+    description: "The `row_splits` for the returned `RaggedTensor`."
+  }
+  out_arg{
+    name: "rt_dense_values"
+    description: "The `flat_values` for the returned `RaggedTensor`."
+  }
+  summary: <<END
+Returns a `RaggedTensor` containing the specified sequences of numbers.
+END
+  description: <<END
+
+Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
+`rt_nested_splits`, such that
+`result[i] = range(starts[i], limits[i], deltas[i])`.
+
+```python
+>>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
+...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
+>>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+>>> print result.eval().tolist()
+[[2],               # result[0] = range(2, 3)
+ [],                # result[1] = range(5, 5)
+ [8, 9, 10, 11]]    # result[2] = range(8, 12)
+```
+
+The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
+The vector inputs must all have the same size.  Scalar inputs are broadcast
+to match the size of the vector inputs.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..958c71185e4b9f2f876ca66f9cfaeabcbe2050cc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "RaggedTensorToSparse"
+  visibility: HIDDEN
+  in_arg {
+    name: "rt_nested_splits"
+    description: "The `row_splits` for the `RaggedTensor`."
+  }
+  in_arg {
+    name: "rt_dense_values"
+    description: "The `flat_values` for the `RaggedTensor`."
+  }
+  out_arg {
+    name: "sparse_indices"
+    description: "The indices for the `SparseTensor`."
+  }
+  out_arg {
+    name: "sparse_values"
+    description: "The values of the `SparseTensor`."
+  }
+  out_arg {
+    name: "sparse_dense_shape"
+    description: <<END
+`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
+END
+  }
+  attr {
+    name: "RAGGED_RANK"
+    description: <<END
+The ragged rank of the input RaggedTensor.  `rt_nested_splits` should contain
+this number of ragged-splits tensors.  This value should equal
+`input.ragged_rank`.
+END
+  }
+  summary: <<END
+Converts a `RaggedTensor` into a `SparseTensor` with the same values.
+END
+  description: <<END
+input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+output=SparseTensor(indices=sparse_indices, values=sparse_values,
+                    dense_shape=sparse_dense_shape)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
deleted file mode 100644
index 0466b40f85eb118c94404e2f0d7670392bc7afdf..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-op {
-  graph_op_name: "RandomDataset"
-  in_arg {
-    name: "seed"
-    description: <<END
-A scalar seed for the random number generator. If either seed or
-seed2 is set to be non-zero, the random number generator is seeded
-by the given seed.  Otherwise, a random seed is used.
-END
-  }
-  in_arg {
-    name: "seed2"
-    description: <<END
-A second scalar seed to avoid seed collision.
-END
-  }
-  summary: "Creates a Dataset that returns pseudorandom numbers."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
index a9e14b8a052e416dd78f1abdc25c9b024a778107..4ac5050040c22ff6ffc5d0bb7c69453cd9e12f5c 100644
--- a/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "RangeDataset"
+  visibility: HIDDEN
   in_arg {
     name: "start"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
index fc6169cd32f1671000a9cb96209059d062c00db8..b2fcab15384d0cc7354699d15a25bdf8879fbac6 100644
--- a/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "RepeatDataset"
+  visibility: HIDDEN
   in_arg {
     name: "count"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ee16ef1baa86f31dfa78bb75aeea81e4b983972
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
@@ -0,0 +1,85 @@
+op {
+  graph_op_name: "ResourceApplyAdamWithAmsgrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "vhat"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the Adam algorithm."
+  description: <<END
+$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$vhat_t := max{vhat_{t-1}, v_t}$$
+$$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..830391a32baa48a358c5cd12d73bfc26b852fe6d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "ResourceApplyKerasMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var + momentum * accum, so in the end, the var you get is actually
+var + momentum * accum.
+END
+  }
+  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
+  description: <<END
+want to use Nesterov momentum.
+
+accum = accum * momentum - lr * grad
+var += accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b10b1bc2a9bb7a28f9f96fdb0328ab23952f7e56
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
@@ -0,0 +1,64 @@
+op {
+  graph_op_name: "ResourceSparseApplyKerasMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var + momentum * accum, so in the end, the var you get is actually
+var + momentum * accum.
+END
+  }
+  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
+  description: <<END
+Set use_nesterov = True if you want to use Nesterov momentum.
+
+That is for rows we have grad for, we update var and accum as follows:
+
+accum = accum * momentum - lr * grad
+var += accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt
deleted file mode 100644
index e83d4a9e967f959b19adc5fad38a7141f8936cc4..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ScanDataset"
-  summary: "Creates a dataset successively reduces `f` over the elements of `input_dataset`."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 0b5917d428c5a2d8438294760020fa61efbe2b7a..41955cfbfa44a97659df26cfc6abb3a7a8c72582 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -32,6 +32,10 @@ slices within a tensor (initially zero for numeric, empty for string) of
 the given `shape` according to indices.  This operator is the inverse of the
 `tf.gather_nd` operator which extracts values or slices from a given tensor.
 
+This operation is similar to tensor_scatter_add, except that the tensor is
+zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical
+to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)`
+
 If `indices` contains duplicates, then their updates are accumulated (summed).
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
diff --git a/tensorflow/core/api_def/base_api/api_def_SdcaOptimizerV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SdcaOptimizerV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c615dee8c7dd6ac9050d83aa9d1e5b06c2375a8c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SdcaOptimizerV2.pbtxt
@@ -0,0 +1,171 @@
+op {
+  graph_op_name: "SdcaOptimizerV2"
+  visibility: HIDDEN
+  in_arg {
+    name: "sparse_example_indices"
+    description: <<END
+a list of vectors which contain example indices.
+END
+  }
+  in_arg {
+    name: "sparse_feature_indices"
+    description: <<END
+a list of vectors which contain feature indices.
+END
+  }
+  in_arg {
+    name: "sparse_feature_values"
+    description: <<END
+a list of vectors which contains feature value
+associated with each feature group.
+END
+  }
+  in_arg {
+    name: "dense_features"
+    description: <<END
+a list of matrices which contains the dense feature values.
+END
+  }
+  in_arg {
+    name: "example_weights"
+    description: <<END
+a vector which contains the weight associated with each
+example.
+END
+  }
+  in_arg {
+    name: "example_labels"
+    description: <<END
+a vector which contains the label/target associated with each
+example.
+END
+  }
+  in_arg {
+    name: "sparse_indices"
+    description: <<END
+a list of vectors where each value is the indices which has
+corresponding weights in sparse_weights. This field maybe omitted for the
+dense approach.
+END
+  }
+  in_arg {
+    name: "sparse_weights"
+    description: <<END
+a list of vectors where each value is the weight associated with
+a sparse feature group.
+END
+  }
+  in_arg {
+    name: "dense_weights"
+    description: <<END
+a list of vectors where the values are the weights associated
+with a dense feature group.
+END
+  }
+  in_arg {
+    name: "example_state_data"
+    description: <<END
+a list of vectors containing the example state data.
+END
+  }
+  out_arg {
+    name: "out_example_state_data"
+    description: <<END
+a list of vectors containing the updated example state
+data.
+END
+  }
+  out_arg {
+    name: "out_delta_sparse_weights"
+    description: <<END
+a list of vectors where each value is the delta
+weights associated with a sparse feature group.
+END
+  }
+  out_arg {
+    name: "out_delta_dense_weights"
+    description: <<END
+a list of vectors where the values are the delta
+weights associated with a dense feature group.
+END
+  }
+  attr {
+    name: "loss_type"
+    description: <<END
+Type of the primal loss. Currently SdcaSolver supports logistic,
+squared and hinge losses.
+END
+  }
+  attr {
+    name: "adaptive"
+    default_value {
+      b: True
+    }
+    description: <<END
+Whether to use Adaptive SDCA for the inner loop.
+END
+  }
+  attr {
+    name: "num_sparse_features"
+    description: <<END
+Number of sparse feature groups to train on.
+END
+  }
+  attr {
+    name: "num_sparse_features_with_values"
+    description: <<END
+Number of sparse feature groups with values
+associated with it, otherwise implicitly treats values as 1.0.
+END
+  }
+  attr {
+    name: "num_dense_features"
+    description: <<END
+Number of dense feature groups to train on.
+END
+  }
+  attr {
+    name: "l1"
+    description: <<END
+Symmetric l1 regularization strength.
+END
+  }
+  attr {
+    name: "l2"
+    description: <<END
+Symmetric l2 regularization strength.
+END
+  }
+  attr {
+    name: "num_loss_partitions"
+    description: <<END
+Number of partitions of the global loss function.
+END
+  }
+  attr {
+    name: "num_inner_iterations"
+    description: <<END
+Number of iterations per mini-batch.
+END
+  }
+  summary: "Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for"
+  description: <<END
+linear models with L1 + L2 regularization. As global optimization objective is
+strongly-convex, the optimizer optimizes the dual objective at each step. The
+optimizer applies each update one example at a time. Examples are sampled
+uniformly, and the optimizer is learning rate free and enjoys linear convergence
+rate.
+
+[Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+Shai Shalev-Shwartz, Tong Zhang. 2012
+
+$$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+
+[Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+Peter Richtarik, Martin Takac. 2015
+
+[Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
deleted file mode 100644
index 77123e143b200fc079879bc0e891a771a7cb67e7..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
+++ /dev/null
@@ -1,3 +0,0 @@
-op {
-  graph_op_name: "SetStatsAggregatorDataset"
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
index fb425b24a4134366df1129df63dc0361537dd746..9ea1cc8babe8832d0553b942901c1c391f1b2709 100644
--- a/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ShuffleAndRepeatDataset"
+  visibility: HIDDEN
   in_arg {
     name: "buffer_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
index ea5c52c0ee3826076b855ca243f03cb940b8e0b2..c7f4836a3ad32011f4903973f9400362c795c841 100644
--- a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ShuffleDataset"
+  visibility: HIDDEN
   in_arg {
     name: "buffer_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_SinkDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SinkDataset.pbtxt
deleted file mode 100644
index b5758ddbfb0542cbbdf85ff278ae8e3ce833403a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_SinkDataset.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-op {
-  graph_op_name: "SinkDataset"
-  visibility: HIDDEN
-  in_arg {
-    name: "input_dataset"
-    description: <<END
-A variant tensor representing the input dataset.
-END
-  }
-  summary: "A placeholder for input pipeline graph optimizations."
-  description: <<END
-A placeholder for input pipeline graph optimizations.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
index 44e5bac79b8cdfb703d8679b66d79ab9e9e7509a..f830049d053b50257d343306c9726adcf10aabd7 100644
--- a/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "SkipDataset"
+  visibility: HIDDEN
   in_arg {
     name: "count"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
deleted file mode 100644
index ddde3ee5b4ef1d82cc244563d4835e319a9dc50a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
+++ /dev/null
@@ -1,25 +0,0 @@
-op {
-  graph_op_name: "SlideDataset"
-  in_arg {
-    name: "window_size"
-    description: <<END
-A scalar representing the number of elements in the
-sliding window.
-END
-  }
-  in_arg {
-    name: "window_shift"
-    description: <<END
-A scalar representing the steps moving the sliding window
-forward in one iteration. It must be positive.
-END
-  }
-  in_arg {
-    name: "window_stride"
-    description: <<END
-A scalar representing the stride of the input elements of the sliding window.
-It must be positive.
-END
-  }
-  summary: "Creates a dataset that passes a sliding window over `input_dataset`."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
index ffb805834908103865e5fcb8d98fb080d60a44ab..4203eca73a5f954a3f407f2a5ad9b1193b044ec5 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "SparseTensorSliceDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that splits a SparseTensor into elements row-wise."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt
deleted file mode 100644
index 7570d5da5662b8eab90e7dd00f8cb225a963d373..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-op {
-  graph_op_name: "SqlDataset"
-  in_arg {
-    name: "driver_name"
-    description: <<END
-The database type. Currently, the only supported type is 'sqlite'.
-END
-  }
-  in_arg {
-    name: "data_source_name"
-    description: <<END
-A connection string to connect to the database.
-END
-  }
-  in_arg {
-    name: "query"
-    description: <<END
-A SQL query to execute.
-END
-  }
-  summary: "Creates a dataset that executes a SQL query and emits rows of the result set."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessRandomUniformInt.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessRandomUniformInt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6a6dbdf54a6c0ba7763f50f662051708c8776f7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessRandomUniformInt.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "StatelessRandomUniformInt"
+  visibility: HIDDEN
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+2 seeds (shape [2]).
+END
+  }
+  in_arg {
+    name: "minval"
+    description: <<END
+Minimum value (inclusive, scalar).
+END
+  }
+  in_arg {
+    name: "maxval"
+    description: <<END
+Maximum value (exclusive, scalar).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Random values with specified shape.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs deterministic pseudorandom random integers from a uniform distribution."
+  description: <<END
+The generated values follow a uniform distribution in the range `[minval, maxval)`.
+
+The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
deleted file mode 100644
index 9b30d64afe18a71fbbe73b397979796b8b844faa..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorHandle"
-  summary: "Creates a statistics manager resource."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
deleted file mode 100644
index bcaf9fea1af5123848b2d6267b3ef0f7279a7230..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorSummary"
-  summary: "Produces a summary of any statistics recorded by the given statistics manager."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
index 80f64cebb1bef262146afdadd5c37b0a30277db0..30e425794b358f9a99efae1c116d7b35753f6bff 100644
--- a/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "TFRecordDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filenames"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
index 8808dc6b1f0d0ae3a0e83f376eab245beaad2de1..eadcb6cd051bc306ba98d8a4318135e1fd7ccfb2 100644
--- a/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "TakeDataset"
+  visibility: HIDDEN
   in_arg {
     name: "count"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
index 050e174aacb12b415357437e7f989b09faf40621..c086d7420c27055d374b1924148c868cc9d6dfcc 100644
--- a/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "TensorDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that emits `components` as a tuple of tensors once."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestCreateTreeVariable.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestCreateTreeVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe2ccd9da62db86c2204cad8be7ed0d7588eb47a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestCreateTreeVariable.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestCreateTreeVariable"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource to be created.
+END
+  }
+  in_arg {
+    name: "tree_config"
+    description: <<END
+Serialized proto string of the boosted_trees.Tree.
+END
+  }
+  summary: "Creates a tree resource and returns a handle to it."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeDeserialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeDeserialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43dbcb7b42d3bc72077292a765fe71d6393286ae
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeDeserialize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeDeserialize"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource to be restored.
+END
+  }
+  in_arg {
+    name: "tree_config"
+    description: <<END
+Serialied proto string of the boosted_trees.Tree proto.
+END
+  }
+  summary: "Deserializes a proto into the tree handle"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeIsInitializedOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9c7a67888e21cbc025750bce66a8b85da5f2519
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeIsInitializedOp"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree.
+END
+  }
+  out_arg {
+    name: "is_initialized"
+    description: <<END
+Whether the tree is initialized.
+END
+  }
+  summary: "Checks whether a tree has been initialized."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreePredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreePredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8d92702748299dbf38b187f412ad72920374dfb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreePredict.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "TensorForestTreePredict"
+  visibility: HIDDEN
+  attr {
+    name: "logits_dimension"
+    description: <<END
+Scalar, dimension of the logits.
+END
+  }
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource.
+END
+  }
+  in_arg {
+    name: "dense_features"
+    description: <<END
+Rank 2 dense features tensor.
+END
+  }
+  out_arg {
+    name: "logits"
+    description: <<END
+The logits predictions from the tree for each instance in the batch.
+END
+  }
+  summary: "Output the logits for the given input data"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeResourceHandleOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbf5c51d647ca76e6af49af66c4e732a70d76472
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorForestTreeResourceHandleOp"
+  visibility: HIDDEN
+  summary: "Creates a handle to a TensorForestTreeResource"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSerialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSerialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aac2afa0f85958012abb336d0c853cc2ad6d2c90
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSerialize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeSerialize"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource to be serialized.
+END
+  }
+  out_arg {
+    name: "tree_config"
+    description: <<END
+Serialied proto string of the tree resource.
+END
+  }
+  summary: "Serializes the tree handle to a proto"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b85b0ed6cf59bf69d9e48583ad39666aa21d6c5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeSize"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource.
+END
+  }
+  out_arg {
+    name: "tree_size"
+    description: <<END
+The size of the tree.
+END
+  }
+  summary: "Get the number of nodes in a tree"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListConcat.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..909c09aa12bd715d4ec6b6d19a9cd6b4b72f804a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListConcat.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "TensorListConcat"
+  summary: "Concats all tensors in the list along the 0th dimension."
+  description: <<END
+Requires that all tensors have the same shape except the first dimension.
+
+input_handle: The input list.
+tensor: The concated result.
+lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListSplit.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24156cb8c47fab5af34bff3be3975b7a7959e542
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListSplit.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "TensorListSplit"
+  summary: "Splits a tensor into a list."
+  description: <<END
+list[i] corresponds to lengths[i] tensors from the input tensor.
+The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
+
+tensor: The input tensor.
+element_shape: A shape compatible with that of elements in the tensor.
+lengths: Vector of sizes of the 0th dimension of tensors in the list.
+output_handle: The list.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorScatterAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1634e51c3cb6f009a2578b145f968af815da988f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorScatterAdd.pbtxt
@@ -0,0 +1,94 @@
+op {
+  graph_op_name: "TensorScatterAdd"
+  in_arg {
+    name: "tensor"
+    description: <<END
+Tensor to copy/update.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+Index tensor.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Updates to scatter into output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A new tensor copied from tensor and updates added according to the indices.
+END
+  }
+  summary: "Adds sparse `updates` to an existing tensor according to `indices`."
+  description: <<END
+This operation creates a new tensor by adding sparse `updates` to the passed
+in `tensor`.
+This operation is very similar to `tf.scatter_nd_add`, except that the updates
+are added onto an existing tensor (as opposed to a variable). If the memory
+for the existing tensor cannot be re-used, a copy is made and updated.
+
+`indices` is an integer tensor containing indices into a new tensor of shape
+`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+
+    indices.shape[-1] <= shape.rank
+
+The last dimension of `indices` corresponds to indices into elements
+(if `indices.shape[-1] = shape.rank`) or slices
+(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+`shape`.  `updates` is a tensor with shape
+
+    indices.shape[:-1] + shape[indices.shape[-1]:]
+
+The simplest form of tensor_scatter_add is to add individual elements to a
+tensor by index. For example, say we want to add 4 elements in a rank-1
+tensor with 8 elements.
+
+In Python, this scatter add operation would look like this:
+
+```python
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    tensor = tf.ones([8], dtype=tf.int32)
+    updated = tf.tensor_scatter_add(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [1, 12, 1, 11, 10, 1, 1, 13]
+
+We can also, insert entire slices of a higher rank tensor all at once. For
+example, if we wanted to insert two slices in the first dimension of a
+rank-3 tensor with two matrices of new values.
+
+In Python, this scatter add operation would look like this:
+
+```python
+    indices = tf.constant([[0], [2]])
+    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]],
+                           [[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]]])
+    tensor = tf.ones([4, 4, 4])
+    updated = tf.tensor_scatter_add(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+     [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, the index is ignored.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorScatterSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..851628bce155874b164336f56a7e4c6f3a424d90
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorScatterSub.pbtxt
@@ -0,0 +1,94 @@
+op {
+  graph_op_name: "TensorScatterSub"
+  in_arg {
+    name: "tensor"
+    description: <<END
+Tensor to copy/update.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+Index tensor.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Updates to scatter into output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A new tensor copied from tensor and updates subtracted according to the indices.
+END
+  }
+  summary: "Subtracts sparse `updates` from an existing tensor according to `indices`."
+  description: <<END
+This operation creates a new tensor by subtracting sparse `updates` from the
+passed in `tensor`.
+This operation is very similar to `tf.scatter_nd_sub`, except that the updates
+are subtracted from an existing tensor (as opposed to a variable). If the memory
+for the existing tensor cannot be re-used, a copy is made and updated.
+
+`indices` is an integer tensor containing indices into a new tensor of shape
+`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+
+    indices.shape[-1] <= shape.rank
+
+The last dimension of `indices` corresponds to indices into elements
+(if `indices.shape[-1] = shape.rank`) or slices
+(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+`shape`.  `updates` is a tensor with shape
+
+    indices.shape[:-1] + shape[indices.shape[-1]:]
+
+The simplest form of tensor_scatter_sub is to subtract individual elements
+from a tensor by index. For example, say we want to insert 4 scattered elements
+in a rank-1 tensor with 8 elements.
+
+In Python, this scatter subtract operation would look like this:
+
+```python
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    tensor = tf.ones([8], dtype=tf.int32)
+    updated = tf.tensor_scatter_sub(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [1, -10, 1, -9, -8, 1, 1, -11]
+
+We can also, insert entire slices of a higher rank tensor all at once. For
+example, if we wanted to insert two slices in the first dimension of a
+rank-3 tensor with two matrices of new values.
+
+In Python, this scatter add operation would look like this:
+
+```python
+    indices = tf.constant([[0], [2]])
+    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]],
+                           [[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]]])
+    tensor = tf.ones([4, 4, 4])
+    updated = tf.tensor_scatter_sub(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [[[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+     [[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, the index is ignored.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a6ed1e1ce49891e98ec42c8f28f27c27d3669e7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt
@@ -0,0 +1,106 @@
+op {
+  graph_op_name: "TensorScatterUpdate"
+  in_arg {
+    name: "tensor"
+    description: <<END
+Tensor to copy/update.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+Index tensor.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Updates to scatter into output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A new tensor with the given shape and updates applied according
+to the indices.
+END
+  }
+  summary: "Scatter `updates` into an existing tensor according to `indices`."
+  description: <<END
+This operation creates a new tensor by applying sparse `updates` to the passed
+in `tensor`.
+This operation is very similar to `tf.scatter_nd`, except that the updates are
+scattered onto an existing tensor (as opposed to a zero-tensor). If the memory
+for the existing tensor cannot be re-used, a copy is made and updated.
+
+If `indices` contains duplicates, then their updates are accumulated (summed).
+
+**WARNING**: The order in which updates are applied is nondeterministic, so the
+output will be nondeterministic if `indices` contains duplicates -- because
+of some numerical approximation issues, numbers summed in different order
+may yield different results.
+
+`indices` is an integer tensor containing indices into a new tensor of shape
+`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+
+    indices.shape[-1] <= shape.rank
+
+The last dimension of `indices` corresponds to indices into elements
+(if `indices.shape[-1] = shape.rank`) or slices
+(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+`shape`.  `updates` is a tensor with shape
+
+    indices.shape[:-1] + shape[indices.shape[-1]:]
+
+The simplest form of scatter is to insert individual elements in a tensor by
+index. For example, say we want to insert 4 scattered elements in a rank-1
+tensor with 8 elements.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+</div>
+
+In Python, this scatter operation would look like this:
+
+```python
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    tensor = tf.ones([8], dtype=tf.int32)
+    updated = tf.tensor_scatter_update(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [1, 11, 1, 10, 9, 1, 1, 12]
+
+We can also, insert entire slices of a higher rank tensor all at once. For
+example, if we wanted to insert two slices in the first dimension of a
+rank-3 tensor with two matrices of new values.
+
+In Python, this scatter operation would look like this:
+
+```python
+    indices = tf.constant([[0], [2]])
+    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]],
+                           [[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]]])
+    tensor = tf.ones([4, 4, 4])
+    updated = tf.tensor_scatter_update(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, the index is ignored.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
index a26a98fd7f3a6564309efd28dff8c2bc93d7a67f..30cb803b26bf836a7b02cc3fb6875175046eab94 100644
--- a/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "TensorSliceDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that emits each dim-0 slice of `components` once."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
index 6b630509964ed56ecaf401b10a46c5e53cd46528..31ef3e3335e2812156fc3d1af2c5c1724fa52310 100644
--- a/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "TextLineDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filenames"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
deleted file mode 100644
index 324fadac0af5088e86e61beaaa27f2111cfd4b82..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "UnbatchDataset"
-  summary: "A dataset that splits the elements of its input into multiple elements."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeDecodeWithOffsets.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeDecodeWithOffsets.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15fc8747af14b4ee139fd5a6781ff6126ab95a64
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnicodeDecodeWithOffsets.pbtxt
@@ -0,0 +1,87 @@
+op {
+  graph_op_name: "UnicodeDecodeWithOffsets"
+  in_arg {
+    name: "input"
+    description: <<END
+The text to be decoded. Can have any shape. Note that the output is flattened
+to a vector of char values.
+END
+  }
+  out_arg {
+    name: "row_splits"
+    description: <<END
+A 1D int32 tensor containing the row splits.
+END
+  }
+  out_arg {
+    name: "char_values"
+    description: <<END
+A 1D int32 Tensor containing the decoded codepoints.
+END
+  }
+  out_arg {
+    name: "char_to_byte_starts"
+    description: <<END
+A 1D int32 Tensor containing the byte index in the input string where each
+character in `char_values` starts.
+END
+  }
+  attr {
+    name: "input_encoding"
+    description: <<END
+Text encoding of the input strings. This is any of the encodings supported
+by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+END
+  }
+  attr {
+    name: "errors"
+    description: <<END
+Error handling policy when there is invalid formatting found in the input.
+The value of 'strict' will cause the operation to produce a InvalidArgument
+error on any invalid input formatting. A value of 'replace' (the default) will
+cause the operation to replace any invalid formatting in the input with the
+`replacement_char` codepoint. A value of 'ignore' will cause the operation to
+skip any invalid formatting in the input and produce no corresponding output
+character.
+END
+  }
+  attr {
+    name: "replacement_char"
+    description: <<END
+The replacement character codepoint to be used in place of any invalid
+formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+be used. The default value is the default unicode replacement character is
+0xFFFD or U+65533.)
+END
+  }
+  attr {
+    name: "replace_control_characters"
+    description: <<END
+Whether to replace the C0 control characters (00-1F) with the
+`replacement_char`. Default is false.
+END
+  }
+  summary: <<END
+Decodes each string in `input` into a sequence of Unicode code points.
+END
+  description: <<END
+The character codepoints for all strings are returned using a single vector
+`char_values`, with strings expanded to characters in row-major order.
+Similarly, the character start byte offsets are returned using a single vector
+`char_to_byte_starts`, with strings expanded in row-major order.
+
+The `row_splits` tensor indicates where the codepoints and start offsets for
+each input string begin and end within the `char_values` and
+`char_to_byte_starts` tensors.  In particular, the values for the `i`th
+string (in row-major order) are stored in the slice
+`[row_splits[i]:row_splits[i+1]]`. Thus:
+
+* `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+  character in the `i`th string (in row-major order).
+* `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
+  character in the `i`th string (in row-major order).
+* `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+  string (in row-major order).
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeEncode.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeEncode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26f786586073f10d5ab93a3edaa928e868735878
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnicodeEncode.pbtxt
@@ -0,0 +1,73 @@
+op {
+  graph_op_name: "UnicodeEncode"
+  visibility: HIDDEN
+  endpoint {
+    name: "UnicodeEncode"
+  }
+  in_arg {
+    name: "input_values"
+    description: <<END
+A 1D tensor containing the unicode codepoints that should be encoded.
+END
+  }
+  in_arg {
+    name: "input_splits"
+    description: <<END
+A 1D tensor specifying how the unicode codepoints should be split into strings.
+In particular, `output[i]` is constructed by encoding the codepoints in the
+slice `input_values[input_splits[i]:input_splits[i+1]]`.
+END
+  }
+  attr {
+    name: "output_encoding"
+    description: <<END
+Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
+"UTF-16-BE", and "UTF-32-BE"`.
+END
+  }
+  attr {
+    name: "errors"
+    description: <<END
+Error handling policy when there is invalid formatting found in the input.
+The value of 'strict' will cause the operation to produce a InvalidArgument
+error on any invalid input formatting. A value of 'replace' (the default) will
+cause the operation to replace any invalid formatting in the input with the
+`replacement_char` codepoint. A value of 'ignore' will cause the operation to
+skip any invalid formatting in the input and produce no corresponding output
+character.
+END
+  }
+  attr {
+    name: "replacement_char"
+    description: <<END
+The replacement character codepoint to be used in place of any invalid
+formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+be used. The default value is the default unicode replacement character is
+0xFFFD (U+65533).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The 1-D Tensor of strings encoded from the provided unicode codepoints.
+END
+  }
+  summary: "Encode a tensor of ints into unicode strings."
+  description: <<END
+Returns a vector of strings, where `output[i]` is constructed by encoding the
+Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
+using `output_encoding`.
+
+---
+
+Example:
+
+```
+input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
+input_splits = [0, 5, 10]
+output_encoding = 'UTF-8'
+
+output = ['Hello', 'World']
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeTranscode.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeTranscode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b6e3de62a405ac75435d29cc6c42a0493a159a3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnicodeTranscode.pbtxt
@@ -0,0 +1,91 @@
+op {
+  graph_op_name: "UnicodeTranscode"
+  in_arg {
+    name: "input"
+    description: <<END
+The text to be processed. Can have any shape.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A string tensor containing unicode text encoded using `output_encoding`.
+END
+  }
+  attr {
+    name: "input_encoding"
+    description: <<END
+Text encoding of the input strings. This is any of the encodings supported
+by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+END
+  }
+  attr {
+    name: "output_encoding"
+    description: <<END
+The unicode encoding to use in the output. Must be one of
+`"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
+END
+  }
+  attr {
+    name: "errors"
+    description: <<END
+Error handling policy when there is invalid formatting found in the input.
+The value of 'strict' will cause the operation to produce a InvalidArgument
+error on any invalid input formatting. A value of 'replace' (the default) will
+cause the operation to replace any invalid formatting in the input with the
+`replacement_char` codepoint. A value of 'ignore' will cause the operation to
+skip any invalid formatting in the input and produce no corresponding output
+character.
+END
+  }
+  attr {
+    name: "replacement_char"
+    description: <<END
+The replacement character codepoint to be used in place of any invalid
+formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+be used. The default value is the default unicode replacement character is
+0xFFFD or U+65533.)
+
+Note that for UTF-8, passing a replacement character expressible in 1 byte, such
+as ' ', will preserve string alignment to the source since invalid bytes will be
+replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
+replacement character will preserve byte alignment to the source.
+END
+  }
+  attr {
+    name: "replace_control_characters"
+    description: <<END
+Whether to replace the C0 control characters (00-1F) with the
+`replacement_char`. Default is false.
+END
+  }
+  summary: <<END
+Transcode the input text from a source encoding to a destination encoding.
+END
+  description: <<END
+The input is a string tensor of any shape. The output is a string tensor of
+the same shape containing the transcoded strings. Output strings are always
+valid unicode. If the input contains invalid encoding positions, the
+`errors` attribute sets the policy for how to deal with them. If the default
+error-handling policy is used, invalid formatting will be substituted in the
+output by the `replacement_char`. If the errors policy is to `ignore`, any
+invalid encoding positions in the input are skipped and not included in the
+output. If it set to `strict` then any invalid formatting will result in an
+InvalidArgument error.
+
+This operation can be used with `output_encoding = input_encoding` to enforce
+correct formatting for inputs even if they are already in the desired encoding.
+
+If the input is prefixed by a Byte Order Mark needed to determine encoding
+(e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
+BOM will be consumed and not emitted into the output. If the input encoding
+is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
+interpreted as a non-breaking-space and is preserved in the output (including
+always for UTF-8).
+
+The end result is that if the input is marked as an explicit endianness the
+transcoding is faithful to all codepoints in the source. If it is not marked
+with an explicit endianness, the BOM is not considered part of the string itself
+but as metadata, and so is not preserved in the output.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnwrapDatasetVariant.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnwrapDatasetVariant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f88a1dac378b5fd8a3347df90b987d21644a3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnwrapDatasetVariant.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnwrapDatasetVariant"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WrapDatasetVariant.pbtxt b/tensorflow/core/api_def/base_api/api_def_WrapDatasetVariant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40f5c7a0d212fb74e67ea6dde58bca191a153231
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WrapDatasetVariant.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WrapDatasetVariant"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
index 7495693ccc50fede4a359d13aa710a1fd2fd9402..3c819963590f8f4ca05fd137ee70183c7d688aa2 100644
--- a/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "ZipDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that zips together `input_datasets`."
 }
diff --git a/tensorflow/core/api_def/excluded_ops.cc b/tensorflow/core/api_def/excluded_ops.cc
index 931c943dbc803c120d1beddbd4c2a67831834a6a..02026e94abc5b3284578859e157279b27ba84446 100644
--- a/tensorflow/core/api_def/excluded_ops.cc
+++ b/tensorflow/core/api_def/excluded_ops.cc
@@ -21,7 +21,19 @@ const std::unordered_set<std::string>* GetExcludedOps() {
   static std::unordered_set<std::string>* excluded_ops =
       new std::unordered_set<std::string>(
           {"BigQueryReader", "GenerateBigQueryReaderPartitions",
-           "GcsConfigureBlockCache", "GcsConfigureCredentials"});
+           "GcsConfigureBlockCache", "GcsConfigureCredentials",
+#ifdef INTEL_MKL
+           // QuantizedFusedOps for Intel CPU
+           "QuantizedConv2DAndRequantize", "QuantizedConv2DWithBias",
+           "QuantizedConv2DWithBiasAndRequantize", "QuantizedConv2DAndRelu",
+           "QuantizedConv2DAndReluAndRequantize",
+           "QuantizedConv2DWithBiasAndRelu",
+           "QuantizedConv2DWithBiasAndReluAndRequantize",
+           "QuantizedConv2DWithBiasSumAndRelu",
+           "QuantizedConv2DWithBiasSumAndReluAndRequantize",
+           "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+#endif  // INTEL_MKL
+          });
   return excluded_ops;
 }
 }  // namespace tensorflow
diff --git a/tensorflow/core/api_def/python_api/api_def_Acos.pbtxt b/tensorflow/core/api_def/python_api/api_def_Acos.pbtxt
index 1fd8baf05f56888fdd04cc6ed7b0b808df3e82e9..f4d7f498b2591f3a1e109c29d332f695b52f928e 100644
--- a/tensorflow/core/api_def/python_api/api_def_Acos.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Acos.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "acos"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Acosh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Acosh.pbtxt
index f7946652ef848bb579f6f6f8946b09283b1925fe..e921f26d1ed62083a77cdab535b45850a767beed 100644
--- a/tensorflow/core/api_def/python_api/api_def_Acosh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Acosh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "acosh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Add.pbtxt b/tensorflow/core/api_def/python_api/api_def_Add.pbtxt
index fb505a91ac3da82e07e4c04e25e6cc5ac3fe3e9d..4c6f387ebd2ef482affcaa4a3827d50a6edaf3cf 100644
--- a/tensorflow/core/api_def/python_api/api_def_Add.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Add.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "add"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
index ea65543a768074653a999ab2f86a084917345ac3..d51defc376ff9a0961ed5bd43b848ea3f6df288d 100644
--- a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "as_string"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Asin.pbtxt b/tensorflow/core/api_def/python_api/api_def_Asin.pbtxt
index eedf4553c6a8781aaa27bb6aa7efc29300f81df2..b13f5c398f92df56d52d174bf256fdfffc942c5f 100644
--- a/tensorflow/core/api_def/python_api/api_def_Asin.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Asin.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "asin"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Asinh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Asinh.pbtxt
index 10c2fb356ef258b5884024cbc67ceaa034522e45..89a3f9da44656808628a4bff99aeabc722bc7f9a 100644
--- a/tensorflow/core/api_def/python_api/api_def_Asinh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Asinh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "asinh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Atan.pbtxt b/tensorflow/core/api_def/python_api/api_def_Atan.pbtxt
index 03dd5dc848eab4c175004d243ce90e39bb33091c..4403a2379cfc9565f2e730284bb865a512caf52d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Atan.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Atan.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "atan"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Atan2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Atan2.pbtxt
index 85b27bd881dd1aa153cc27a773191f2743a00b4f..56eed0f0fbb219381b764ac6a0e137498e96b2c7 100644
--- a/tensorflow/core/api_def/python_api/api_def_Atan2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Atan2.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "atan2"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Atanh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Atanh.pbtxt
index ee7c0600d6b23cbb9ee28c14c5a3ac5b71449f8b..a8f5e792f06640ca36f756ba2d55148dde7ba1f6 100644
--- a/tensorflow/core/api_def/python_api/api_def_Atanh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Atanh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "atanh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt
deleted file mode 100644
index 4289c1daf96583943b8dfad84aeca3351657bee4..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "BatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchToSpaceND.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchToSpaceND.pbtxt
index e395e333bf510402b36db94f939abdceafc11a76..94ffc7c068edd961ced8879fde3482076376010f 100644
--- a/tensorflow/core/api_def/python_api/api_def_BatchToSpaceND.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_BatchToSpaceND.pbtxt
@@ -1,10 +1,12 @@
 op {
   graph_op_name: "BatchToSpaceND"
+  deprecation_message: "use batch_to_space"
   endpoint {
     name: "batch_to_space_nd"
+    deprecation_version: 2
   }
   endpoint {
     name: "manip.batch_to_space_nd"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt b/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
index 7965af4916e7b8f590bd22452459410075c37cf8..fdbe5282bc136fa7cb59e9e638e6f1952b3ed5ce 100644
--- a/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "BesselI0e"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.bessel_i0e"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt b/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
index dffd296f6d8288356add56f8fbff01bfc4c9213a..3f08cd766d8cb0698c62fbb488ce71ea8018d9e2 100644
--- a/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "BesselI1e"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.bessel_i1e"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Betainc.pbtxt b/tensorflow/core/api_def/python_api/api_def_Betainc.pbtxt
index 7ad7cbcba9a90643dac7d39e0185ac57c1b0107b..1c90c56f5e73f97148132c4ad4db5a31aa33a82d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Betainc.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Betainc.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "betainc"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt
deleted file mode 100644
index fcf541f9036baaef1590f06da0d7471b0558b4c7..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "BytesProducedStatsDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt
deleted file mode 100644
index 2bbb4ff9e3b08d0dd11c7444e5d00feb514e81c0..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "CacheDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Ceil.pbtxt b/tensorflow/core/api_def/python_api/api_def_Ceil.pbtxt
index f2265bad56cd8cb19ac5f4b45f0a5b62c6ffa257..331bb9cbf5581ce7a75d2377f5364b3629c66901 100644
--- a/tensorflow/core/api_def/python_api/api_def_Ceil.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Ceil.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "ceil"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt b/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt
index 541b09a591fcddd6398a195f25b444be732e778e..cf7a56ec782360076a18aa9ab7959e0de4a20987 100644
--- a/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt
@@ -1,10 +1,12 @@
 op {
   graph_op_name: "CheckNumerics"
+  deprecation_version: 2
+  deprecation_message: "Use debugging.assert_all_finite instead"
   endpoint {
     name: "debugging.check_numerics"
   }
   endpoint {
     name: "check_numerics"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Cholesky.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cholesky.pbtxt
index 942f4e6ed8da2bba2450a192e4a2b5fdc97dba1e..5db2667262686f04d9f12f32d6855bac76041b92 100644
--- a/tensorflow/core/api_def/python_api/api_def_Cholesky.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Cholesky.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "cholesky"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt
deleted file mode 100644
index c005a4da0f866c1d1106effabbaa22f1abecf422..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ConcatenateDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
index 2ae75d6da222d84245bb2a912942522eb52047bc..1f4bc6d22e3e9aa6e5923bd4fccf6caec322921d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv2D"
-  endpoint {
-    name: "nn.conv2d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
index 6f21d8c8802f9a18c9357dbe68d3c65407bff923..1a9d96f3ab184d22ee999f727cb0f8f33e86841d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv2DBackpropFilter"
-  endpoint {
-    name: "nn.conv2d_backprop_filter"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
index ea976799cbc73bc9164a15e781a051f03e14275b..1505a307658786b2c9d68263d7b50e87348d5027 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv2DBackpropInput"
-  endpoint {
-    name: "nn.conv2d_backprop_input"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
index ba8d178263c94574c0aaac8f1f24fb1424a50275..cb463dd0d8d725ca4851d93e37d1f6b63e4117c8 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv3D"
-  endpoint {
-    name: "nn.conv3d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
index 1da8ee3a25f36a0b44f6458a351854190fe7830f..590b37c95fb2a43e49d5c5ae4dcfe8cc499a4c6d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -1,6 +1,10 @@
 op {
   graph_op_name: "Conv3DBackpropFilterV2"
+  endpoint {
+    name: "nn.conv3d_backprop_filter"
+  }
   endpoint {
     name: "nn.conv3d_backprop_filter_v2"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Cos.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cos.pbtxt
index 1af8c0c2c9f4b88da4c315427455ac4d46bb101a..db52d25ff2d9a8afef759b77eee03274c19256cf 100644
--- a/tensorflow/core/api_def/python_api/api_def_Cos.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Cos.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "cos"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Cosh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cosh.pbtxt
index 2de87df40d726ea6022a5a85583fcf327f7ce800..74bf57356596488dddca1e208b7ed86e831a256b 100644
--- a/tensorflow/core/api_def/python_api/api_def_Cosh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Cosh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "cosh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt b/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
index ce65f8172ddfea2ae08750cf37bba8e3e012f5f5..2559a6c80b812475ef5b6ca5d0a0cc35bffc4d4b 100644
--- a/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "CropAndResize"
-  endpoint {
-    name: "image.crop_and_resize"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Cross.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cross.pbtxt
index e8a871cae6b101b883ca25fe812bdc12b4aa64c7..51394dda4e97979a439e87c11b4763ac7ba2a0fd 100644
--- a/tensorflow/core/api_def/python_api/api_def_Cross.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Cross.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "cross"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt
deleted file mode 100644
index e3d34cc15be752b466aa03f6805cd687698f74fa..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "DatasetToSingleElement"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt
index fbe9c882538776abb35b7c654ede0fffbfaa078c..2c3857cc539df8cfc9085d0a44628ebbb6a36e34 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodeAndCropJpeg"
-  endpoint {
-    name: "image.decode_and_crop_jpeg"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeBase64.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeBase64.pbtxt
index 8b96eee6311e4ab22e5faa3e42229af850b678ec..e4a61e122ceddb4d8902f49852577b115aef7688 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeBase64.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeBase64.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "decode_base64"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt
index 573d83f3739a86d00550c519cb19aef452813927..ffe19ca8dc3a91857b6c5473209670c3b0f1240a 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodeBmp"
-  endpoint {
-    name: "image.decode_bmp"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt
index 829608fc8f9ae9f2859fcf2a50c881557069538d..a85a76a8dc6669514f47953cff638003bacea3c8 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeCompressed.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "decode_compressed"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt
index eed64df79cf7837c1cc0580dd2cb0f06acf289cc..ff68b997e14c043b1d1af8b22ba99607106bb302 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodeGif"
-  endpoint {
-    name: "image.decode_gif"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeJSONExample.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeJSONExample.pbtxt
index 9f28bc5f59bdc1c99351a6d13eaed84c200ccfb8..13ffbcce7c71cce1fc9950bf708e0bd5318af020 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeJSONExample.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeJSONExample.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "decode_json_example"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt
index 994bc4e1f4fd1707579ac2bda4fae5ed327430ab..97d262abe578df1ca357b6288d415ed180df3392 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodeJpeg"
-  endpoint {
-    name: "image.decode_jpeg"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt
index 309eec5ac368297563af7e6e752921fd270186ef..3b9290a2c5b8ee1e10de6dad1eeafbbe450d99d7 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodePng"
-  endpoint {
-    name: "image.decode_png"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeRaw.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeRaw.pbtxt
index 0010a59ca40adb889119309d84b26d42fb002a01..dab7a5e0094721a7d79bbcb05b8946882d777446 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeRaw.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeRaw.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "decode_raw"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt
deleted file mode 100644
index 0a8e068afb744ce8b472111d19cf743d39ac44ef..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "DenseToSparseBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
index 1bb17e548d1cd0ca77d6415b7fa165b1a6b7cae3..e26d029212e3bc421987f6d203b2e6ce5a95c7ac 100644
--- a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
@@ -1,6 +1,8 @@
 op {
   graph_op_name: "DepthwiseConv2dNative"
+  deprecation_message: "Use nn.depthwise_conv2d instead"
   endpoint {
     name: "nn.depthwise_conv2d_native"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
index 6f9df4b1a11459c252f2961fb1caacaad64021ae..01c4a50ca6fa31f65feb9d5a65fbf105525772e8 100644
--- a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
@@ -2,5 +2,10 @@ op {
   graph_op_name: "DepthwiseConv2dNativeBackpropFilter"
   endpoint {
     name: "nn.depthwise_conv2d_native_backprop_filter"
+    deprecated: true
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "nn.depthwise_conv2d_backprop_filter"
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
index 0bd72539e932f597e86f63ef52519652f0e8efd7..f32aa8a69f24db4abc3f8e1aef514ee84d73c23f 100644
--- a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
@@ -2,5 +2,10 @@ op {
   graph_op_name: "DepthwiseConv2dNativeBackpropInput"
   endpoint {
     name: "nn.depthwise_conv2d_native_backprop_input"
+    deprecated: true
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "nn.depthwise_conv2d_backprop_input"
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Dequantize.pbtxt b/tensorflow/core/api_def/python_api/api_def_Dequantize.pbtxt
index 5edd0c216ba4edb034f322f55fb8bc12647c7abe..96844a65b510cb57c804f5d8288acfa7e5bf7eaa 100644
--- a/tensorflow/core/api_def/python_api/api_def_Dequantize.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Dequantize.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "dequantize"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Diag.pbtxt b/tensorflow/core/api_def/python_api/api_def_Diag.pbtxt
index cba30e63e892cf73ad99e6ea5f7afad846f66549..43e7af891c510cc317569ac1835c4cbd87a7d158 100644
--- a/tensorflow/core/api_def/python_api/api_def_Diag.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Diag.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "diag"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DiagPart.pbtxt b/tensorflow/core/api_def/python_api/api_def_DiagPart.pbtxt
index 54e1f34e82b3c5dddad338cfeb7eecb0bac12fdd..6a149848f69e6383f2b40db808df9e2b8082ba13 100644
--- a/tensorflow/core/api_def/python_api/api_def_DiagPart.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DiagPart.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "diag_part"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Digamma.pbtxt b/tensorflow/core/api_def/python_api/api_def_Digamma.pbtxt
index 91b4dfead77664bb792428f0ca5283addbaed2d4..e6e9375ecd954bd74cf11c93735b08cb3c0615eb 100644
--- a/tensorflow/core/api_def/python_api/api_def_Digamma.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Digamma.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "digamma"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
index 6d73ecf1bb06895017b2d2ac2a16c702681eb217..1bd83d906152d2e5792fecd5e80e339e0c67e7a5 100644
--- a/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
@@ -2,5 +2,6 @@ op {
   graph_op_name: "Dilation2D"
   endpoint {
     name: "nn.dilation2d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_EncodeBase64.pbtxt b/tensorflow/core/api_def/python_api/api_def_EncodeBase64.pbtxt
index 71bb73cfb24ee8e644bf54d0077a1f2c5b8a0e77..534b5d8152c1497e6480de0691bd8555e781a62b 100644
--- a/tensorflow/core/api_def/python_api/api_def_EncodeBase64.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_EncodeBase64.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "encode_base64"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt b/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt
index 5c31e9d0f32e6e13ba7d87d8a234e238c048a8b9..054ffb997b3def412f50b12216794d53d3add41c 100644
--- a/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "EncodeJpeg"
-  endpoint {
-    name: "image.encode_jpeg"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt
deleted file mode 100644
index 051cf14c0ec2b32779be8b9c297b93abd1bc1318..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "EnqueueInQueueDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Equal.pbtxt b/tensorflow/core/api_def/python_api/api_def_Equal.pbtxt
index 78aa1b3bc53b424822142b5fd66eeabbf445a499..34717e74bcd06012bc256f8e3cbbe5cbb6d6f661 100644
--- a/tensorflow/core/api_def/python_api/api_def_Equal.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Equal.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "equal"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
index 391167254edb69725c778e6319bf8a9f6038589f..21ae77e9ed71cae895b5e3f62adb2607704b5858 100644
--- a/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
@@ -1,4 +1,10 @@
 op {
   graph_op_name: "Erf"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.erf"
+  }
+  endpoint {
+    name: "erf"
+    deprecation_version: 2
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Erfc.pbtxt b/tensorflow/core/api_def/python_api/api_def_Erfc.pbtxt
index e96df0c596ab19986eef6d3d2bb449c6dee4606a..fccda9dfca5967a73396a2b3ff32b7e85984aedf 100644
--- a/tensorflow/core/api_def/python_api/api_def_Erfc.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Erfc.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "erfc"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt
index 70323fe5b478a56f1d81ac13b81e0f49b745673c..38a9078d9f6b53d63a4682ef076e651f061a44e9 100644
--- a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "exp"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Expm1.pbtxt b/tensorflow/core/api_def/python_api/api_def_Expm1.pbtxt
index 8ddf9d4d70f491e36258d33f7e6e8aebd27b0296..d8bdaeadc88ca05a2d9224d86ae6b9c09ccbbdcf 100644
--- a/tensorflow/core/api_def/python_api/api_def_Expm1.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Expm1.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "expm1"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt
index f008b1222deeca5374107bcfb939df098b70b7eb..17921dea4d5e19ef960100a72709a2311da66f3d 100644
--- a/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "ExtractImagePatches"
-  endpoint {
-    name: "image.extract_image_patches"
-  }
-  endpoint {
-    name: "extract_image_patches"
-    deprecated: true
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt
index 6849a6d3fa5f37b0d4f92829c8b07754b922a319..a57955c8a74af58cafee4719a86d649efbcb504b 100644
--- a/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ExtractJpegShape"
-  endpoint {
-    name: "image.extract_jpeg_shape"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt
index d79e936b7195dd0ae547b436582d3144a35e0ad1..33f87caa38c38a7522e43104276b033a6ea5609a 100644
--- a/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt
@@ -1,10 +1,14 @@
 op {
   graph_op_name: "FFT"
+  endpoint {
+    name: "signal.fft"
+  }
   endpoint {
     name: "spectral.fft"
+    deprecation_version: 2
   }
   endpoint {
     name: "fft"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt
index 9ed1341dfe2d0c4f57e0fa3c2d14378bce452be3..2273a757898bcd4c3b10fbee3bac272396bfb092 100644
--- a/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FFT2D.pbtxt
@@ -1,9 +1,14 @@
 op {
   graph_op_name: "FFT2D"
+  endpoint {
+    name: "signal.fft2d"
+  }
   endpoint {
     name: "spectral.fft2d"
+    deprecation_version: 2
   }
   endpoint {
     name: "fft2d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt
index 5a4e1d6adf9b9c2bf68c6375de6aebfdfcf5bfb3..6a43b86e3d388c3aca752d7d61413bce1d2f4989 100644
--- a/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FFT3D.pbtxt
@@ -1,9 +1,14 @@
 op {
   graph_op_name: "FFT3D"
+  endpoint {
+    name: "signal.fft3d"
+  }
   endpoint {
     name: "spectral.fft3d"
+    deprecation_version: 2
   }
   endpoint {
     name: "fft3d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxArgs.pbtxt b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
index d8db83331f916c48f1cb2afef9b3d8bc2e291107..97ab3ff7efc1f413b80c3987f891ce70434ae20d 100644
--- a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "fake_quant_with_min_max_args"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt
index 74f01d1a0c56918128a069861c7d9eecdb89a708..a30bdc3534341098696d41820ff79ba8a606a363 100644
--- a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "fake_quant_with_min_max_args_gradient"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVars.pbtxt b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVars.pbtxt
index e14fb6d118ada932e536d5408619ca4eda75a348..fc64d0e15acea3c3ae22397bbd3c51ee81a915ff 100644
--- a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVars.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVars.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "fake_quant_with_min_max_vars"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt
index 4611ebdfb8286070c739703bffeb38fe64582713..66fcfbb846667538210beb527d99504517b3c5ca 100644
--- a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "fake_quant_with_min_max_vars_gradient"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
index 0936e513c3ff0996d7fdebf09fb6943f74349469..132ecc1ac49f576bb290c4c3f8a661bacff5aafa 100644
--- a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "fake_quant_with_min_max_vars_per_channel"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
index 0d9968248c5397001604c790b22131581b48d636..66c811b6a26ca19ae352f401fd1062eac3fb77df 100644
--- a/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "fake_quant_with_min_max_vars_per_channel_gradient"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt
deleted file mode 100644
index 6f91b842181c769d0a2f921f1d7566c4d8522541..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FilterDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt
deleted file mode 100644
index d0703471d38c94a8c37da6f0a65ebd165c23a820..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FixedLengthRecordDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt
deleted file mode 100644
index 9de61ac263cd82a0893aa2e27b9d7532490ca441..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FlatMapDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Floor.pbtxt b/tensorflow/core/api_def/python_api/api_def_Floor.pbtxt
index 9b93caa0b1cb9b142b9bcf1993da9cebedf019e6..14accd2b20d7b9a1ae7c907b91f8207ae67d2ee8 100644
--- a/tensorflow/core/api_def/python_api/api_def_Floor.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Floor.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "floor"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
index 26598ab1fb918e251d4c4da7b14810ebf4c44779..efd42b888d21fad6c369ae63182ed8846bf9f0b1 100644
--- a/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "FloorDiv"
-  visibility: HIDDEN
+  endpoint {
+    name: "floor_div"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
index ef562e93a0dee0a3f24716719cb24232302626dc..e5db6d49b29e46c9f19c43767c16a5e5296304e8 100644
--- a/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "FloorMod"
-  visibility: HIDDEN
+  endpoint {
+    name: "floormod"
+  }
+  endpoint {
+    name: "mod"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
index 16ed9b56f2b662b6cca44f5c955e579c2f9d7971..cbe87777a7fec7557b5153df8cd7689f22aa961e 100644
--- a/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "FractionalAvgPool"
-  endpoint {
-    name: "nn.fractional_avg_pool"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
index 695559520805abd02e0575f7f85937d00f0dc5fd..02470b43454cdcb44ee624ecab4486fa36caa7da 100644
--- a/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "FractionalMaxPool"
-  endpoint {
-    name: "nn.fractional_max_pool"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_GatherNd.pbtxt b/tensorflow/core/api_def/python_api/api_def_GatherNd.pbtxt
index 598f23bde3c3caf9875fa8f4e3606c631d8d6f28..1c3b9d5571e325fd0ea2bfa3dfc2b06d651e4087 100644
--- a/tensorflow/core/api_def/python_api/api_def_GatherNd.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_GatherNd.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "manip.gather_nd"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt
deleted file mode 100644
index 9dcfa0f7d210012aa5c2d43349239a953ea3739e..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "GeneratorDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Greater.pbtxt b/tensorflow/core/api_def/python_api/api_def_Greater.pbtxt
index 7de60d44c40efdf8a99e266a58065387940c0b32..7926deaa3b01e3cd817cf3fb45c6dbaad4d130d8 100644
--- a/tensorflow/core/api_def/python_api/api_def_Greater.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Greater.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "greater"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_GreaterEqual.pbtxt b/tensorflow/core/api_def/python_api/api_def_GreaterEqual.pbtxt
index 9c8975c2a978341964fe79c203ab7619ed0f42dd..21bbb1b094ca7b353d2a62f8a32e74f3c18953da 100644
--- a/tensorflow/core/api_def/python_api/api_def_GreaterEqual.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_GreaterEqual.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "greater_equal"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt
deleted file mode 100644
index 8d40208e613e6b7ee1522c2990afea1345cc5de1..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "GroupByWindowDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt
index 17fbd8ace4333f2b83e936b70091073d3c39e3bf..68ef4833949f37384ff24662fe204903608fefd6 100644
--- a/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt
@@ -1,10 +1,14 @@
 op {
   graph_op_name: "IFFT"
+  endpoint {
+    name: "signal.ifft"
+  }
   endpoint {
     name: "spectral.ifft"
+    deprecation_version: 2
   }
   endpoint {
     name: "ifft"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt
index d6b36a314b8d8a197651ee3c68b1376a9bbed669..47fb6fa191f68f75e09846b6b26479cf46505eac 100644
--- a/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IFFT2D.pbtxt
@@ -1,9 +1,14 @@
 op {
   graph_op_name: "IFFT2D"
+  endpoint {
+    name: "signal.ifft2d"
+  }
   endpoint {
     name: "spectral.ifft2d"
+    deprecation_version: 2
   }
   endpoint {
     name: "ifft2d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt
index 6def5b36da17766c5342703fcefe2b377028f330..aff598314b21bba23883d4fffdeecdc2096099eb 100644
--- a/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IFFT3D.pbtxt
@@ -1,9 +1,14 @@
 op {
   graph_op_name: "IFFT3D"
+  endpoint {
+    name: "signal.ifft3d"
+  }
   endpoint {
     name: "spectral.ifft3d"
+    deprecation_version: 2
   }
   endpoint {
     name: "ifft3d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Igamma.pbtxt b/tensorflow/core/api_def/python_api/api_def_Igamma.pbtxt
index 8c4815c26eeabc446cfb37c082c9f5fd7d1fbcbb..c07932a1a7ad0be4486b102084175848bdf3d6e4 100644
--- a/tensorflow/core/api_def/python_api/api_def_Igamma.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Igamma.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "igamma"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Igammac.pbtxt b/tensorflow/core/api_def/python_api/api_def_Igammac.pbtxt
index b43b54391b7d8f3bd4a07e93880efebfc1929395..8031a51db96e185a34243623af0afdc552329177 100644
--- a/tensorflow/core/api_def/python_api/api_def_Igammac.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Igammac.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "igammac"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt
deleted file mode 100644
index ef1b06b19cc6a0c62f6e9f451aceed8aeabed553..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "InterleaveDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_InvertPermutation.pbtxt b/tensorflow/core/api_def/python_api/api_def_InvertPermutation.pbtxt
index d75fcd63e3baeb6ae04745ab3ffd3a49867fa054..d75cef5fac3e150205c253e1a2cfcdf2d420ecee 100644
--- a/tensorflow/core/api_def/python_api/api_def_InvertPermutation.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_InvertPermutation.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "invert_permutation"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt
index 27142644bf098b003528f858640aed6b9e08764f..ccd736a483ef3e927e270a33639f6f38856312b8 100644
--- a/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt
@@ -1,10 +1,14 @@
 op {
   graph_op_name: "IsFinite"
+  endpoint {
+    name: "math.is_finite"
+  }
   endpoint {
     name: "debugging.is_finite"
+    deprecation_version: 2
   }
   endpoint {
     name: "is_finite"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt
index 4cd92f1cb78f223b7dffaaeecc0149754b58aa41..3cbfb7317c1383db74317080d1dfe93628aab3b4 100644
--- a/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt
@@ -1,10 +1,14 @@
 op {
   graph_op_name: "IsInf"
+  endpoint {
+    name: "math.is_inf"
+  }
   endpoint {
     name: "debugging.is_inf"
+    deprecation_version: 2
   }
   endpoint {
     name: "is_inf"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt
index 07d49f9436ea262d78d708b7fa94b4fd78deabfa..b01536664e5111217c7d1e5fb415c8e791cbaa34 100644
--- a/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt
@@ -1,10 +1,14 @@
 op {
   graph_op_name: "IsNan"
+  endpoint {
+    name: "math.is_nan"
+  }
   endpoint {
     name: "debugging.is_nan"
+    deprecation_version: 2
   }
   endpoint {
     name: "is_nan"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt
deleted file mode 100644
index 94bf6106ad8459767d31a345a17483b255dfc02b..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "LatencyStatsDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Less.pbtxt b/tensorflow/core/api_def/python_api/api_def_Less.pbtxt
index 055df2922ac4ca023490fdcff02d9279e6037948..0b5f06e99f08ef43f2a9ede83611a70b99e7b2a6 100644
--- a/tensorflow/core/api_def/python_api/api_def_Less.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Less.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "less"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LessEqual.pbtxt b/tensorflow/core/api_def/python_api/api_def_LessEqual.pbtxt
index d2803ddb69264589174b708317ba7cd028fc9bd5..afc4f2a8c93ebc5c701119d8fb6ed345235fd569 100644
--- a/tensorflow/core/api_def/python_api/api_def_LessEqual.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LessEqual.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "less_equal"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Lgamma.pbtxt b/tensorflow/core/api_def/python_api/api_def_Lgamma.pbtxt
index 0262b838caa0e36123bb30f209e66119e214aa32..ee339967a489627c91cec7c04fb4e7db74b2b618 100644
--- a/tensorflow/core/api_def/python_api/api_def_Lgamma.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Lgamma.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "lgamma"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt b/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt
index b1de2cb207d221593b41d82d43b759e49d411710..3835661be57ce0ca829db231fcd1d5b0bec8215a 100644
--- a/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt
@@ -2,6 +2,7 @@ op {
   graph_op_name: "LinSpace"
   endpoint {
     name: "lin_space"
+    deprecation_version: 2
   }
   endpoint {
     name: "linspace"
diff --git a/tensorflow/core/api_def/python_api/api_def_Log.pbtxt b/tensorflow/core/api_def/python_api/api_def_Log.pbtxt
index 26d2473b9c6bde835dfb8665bef3eecef1accf4a..b6d2da6d32a270f5e99bf551fc786e7a98b54cf7 100644
--- a/tensorflow/core/api_def/python_api/api_def_Log.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Log.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "log"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt b/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt
index d85b6dccece9d6fc83155ea357bb96091b56de70..e3da451de3fc7ceb544779a0a4445620f0d0af1f 100644
--- a/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "log1p"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt
index 80bd98b740a03bc6f9f190bef9150b23a4aee0cb..d4e6a7a380ea7f842fc715fc6347dca75ebbb176 100644
--- a/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "logical_and"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LogicalNot.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogicalNot.pbtxt
index b2244c44b1d28769b1c30a1da576068a4a15fbd0..49068738a415ebceaeaa85abb1f4bdf6d1098c09 100644
--- a/tensorflow/core/api_def/python_api/api_def_LogicalNot.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LogicalNot.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "logical_not"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_LogicalOr.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogicalOr.pbtxt
index cf78b52e077b7cc33c3453bfade86a52be4f7b84..a5133962dcd4e0f163242f2f456eb61d2db383fd 100644
--- a/tensorflow/core/api_def/python_api/api_def_LogicalOr.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LogicalOr.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "logical_or"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Lu.pbtxt b/tensorflow/core/api_def/python_api/api_def_Lu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0b6b53da50e474c3bfe2065a607a19baf06bc80
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Lu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Lu"
+  endpoint {
+    name: "linalg.lu"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt
deleted file mode 100644
index cffd2910fb404bc7f75e55e42b9ebba1635db134..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "MapAndBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt
deleted file mode 100644
index 0b1d2f2c730ff8b8b928fcd97c4fe3bdc704e470..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "MapDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatchingFiles.pbtxt
index 74145670a8f95603c178690bc9a6054c111be19c..2dc5916f60f3f02ebca14c15610981b9a934be91 100644
--- a/tensorflow/core/api_def/python_api/api_def_MatchingFiles.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MatchingFiles.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "matching_files"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixBandPart.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixBandPart.pbtxt
index 1122c52ab404230244660da905407852b4cb0492..c8aaf44b0d458c3cac456e53e5d196a3ec3ed064 100644
--- a/tensorflow/core/api_def/python_api/api_def_MatrixBandPart.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixBandPart.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "matrix_band_part"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixDeterminant.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixDeterminant.pbtxt
index 9563bf0354598a55c6ad14c2e6d0acea27bb2467..64a5950e56a28728a44b4e480db152c2aa0d0e0b 100644
--- a/tensorflow/core/api_def/python_api/api_def_MatrixDeterminant.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixDeterminant.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "matrix_determinant"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixDiag.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixDiag.pbtxt
index 8ab0bf75ebc5a4b1f8b8128046b9c4f06bd21786..57dc182474e242006f7c1bf201f981fab68412b3 100644
--- a/tensorflow/core/api_def/python_api/api_def_MatrixDiag.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixDiag.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "matrix_diag"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixDiagPart.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixDiagPart.pbtxt
index 82ce67853c9507736e4597791f6cb8bb05ca3932..142763f44bdc0a6b8c42f30dfcb6820e8a1da19b 100644
--- a/tensorflow/core/api_def/python_api/api_def_MatrixDiagPart.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixDiagPart.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "matrix_diag_part"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixInverse.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixInverse.pbtxt
index 85862f6eb570963317176616edc8d42c524c08ef..13df986ac17620c01d7e14acacbb2bb6d5658bd7 100644
--- a/tensorflow/core/api_def/python_api/api_def_MatrixInverse.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixInverse.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "matrix_inverse"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixSetDiag.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixSetDiag.pbtxt
index 6325e4f0e6e0210abebde9f123ea5434a2cc2862..fc97a29cf216311c1aeff878e26daf2ec5634d31 100644
--- a/tensorflow/core/api_def/python_api/api_def_MatrixSetDiag.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixSetDiag.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "matrix_set_diag"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixSolve.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixSolve.pbtxt
index 6325dff407af716e32bd8d7daa97606b8a74089d..0bbc9891590efaf5a74fea57d15fa54fb5d20964 100644
--- a/tensorflow/core/api_def/python_api/api_def_MatrixSolve.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixSolve.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "matrix_solve"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixSquareRoot.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixSquareRoot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..efbea6d2d19eb3e1d29f36f95672375046a977df
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixSquareRoot.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "MatrixSquareRoot"
+  endpoint {
+    name: "linalg.sqrtm"
+  }
+  endpoint {
+    name: "matrix_square_root"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixTriangularSolve.pbtxt
index 7f865e23b2ab908f3aa53e3c163f48d44c799ddb..17dc57335ae47e2d877eefa80b1c2d8d0c23ebbe 100644
--- a/tensorflow/core/api_def/python_api/api_def_MatrixTriangularSolve.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixTriangularSolve.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "matrix_triangular_solve"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
index 7d8abca5f1ad76df62e78f9d7228b586dce31bf6..13a1a0b5df4d73884d267777ccf5ad6a44fcdbd4 100644
--- a/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
@@ -2,5 +2,6 @@ op {
   graph_op_name: "MaxPoolWithArgmax"
   endpoint {
     name: "nn.max_pool_with_argmax"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt b/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt
index bcff379b719337c5c4512e57edf2b06be8a46587..130729ece172f4bbf55da5ae6d7a5f1c8e322434 100644
--- a/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "maximum"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt b/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt
index 9aae74226a27b8ff0ac665ba1d5f8494111b49ed..8aded1f154d7dc2f204317c69a411c527b01144b 100644
--- a/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "minimum"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt b/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
index 0e2bb9b950d933f2e73272b403fba2c29110b3cb..ac166561ee9b1ab5fcee6fad776971172b0ee5ba 100644
--- a/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Neg"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.negative"
+  }
+  endpoint {
+    name: "negative"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_NotEqual.pbtxt b/tensorflow/core/api_def/python_api/api_def_NotEqual.pbtxt
index f37317854fa7a553a4701a2e83982a43b9be8169..07fe3b6af1912bbeb235f4664f2787f67066ffe3 100644
--- a/tensorflow/core/api_def/python_api/api_def_NotEqual.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_NotEqual.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "not_equal"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt
deleted file mode 100644
index c6223b3132ed0d6878995d3c5e657275fac0cc4f..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "PaddedBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt
deleted file mode 100644
index 93cd5719feb613cd3de2e422e23cc3d690bdef08..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ParallelInterleaveDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt
deleted file mode 100644
index 09d200dd24c828af85d1505bb17086dbfa688ee8..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ParallelMapDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt
deleted file mode 100644
index 45826b6fdcc582ac7fd84d45b079b7f4994bc370..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ParseExampleDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParseTensor.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParseTensor.pbtxt
index 10b3aab0c771ec91a298375cc893af8e446b9020..6ea8094565b54cd6254dd47439e4226635ff99d8 100644
--- a/tensorflow/core/api_def/python_api/api_def_ParseTensor.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ParseTensor.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "parse_tensor"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_PlaceholderWithDefault.pbtxt b/tensorflow/core/api_def/python_api/api_def_PlaceholderWithDefault.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1a722c1036db085968c911ebcb697b0aeed8d55
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PlaceholderWithDefault.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PlaceholderWithDefault"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Polygamma.pbtxt b/tensorflow/core/api_def/python_api/api_def_Polygamma.pbtxt
index 9df81402d55242da9b911faa8233ddc22ca22093..33c96505ba4d234892f56ed82dfb60e66f3e2cc4 100644
--- a/tensorflow/core/api_def/python_api/api_def_Polygamma.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Polygamma.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "polygamma"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt
deleted file mode 100644
index ec4e214eb5e082c8f732cbef9db69524c48d80a4..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "PrefetchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
deleted file mode 100644
index 228c4047d2e0b7ddfec1d8cd4fad478aa6c4c1a7..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Qr.pbtxt b/tensorflow/core/api_def/python_api/api_def_Qr.pbtxt
index 0260eecc9172f2be6928394043b6c6848955be8b..e3a0e9d45a596cb5a0002de0687a4f33459432fd 100644
--- a/tensorflow/core/api_def/python_api/api_def_Qr.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Qr.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "qr"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV2.pbtxt
index 15e181be20948128a7f970f024e6cc8dfe28c96c..a30b42010a9eacd642e6528771866f01c95b9810 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV2.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "QuantizeAndDequantizeV2"
-  visibility: HIDDEN
+  endpoint {
+    name: "quantization.quantize_and_dequantize"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
index dfa793a16e18ab30891bcb9a997d7bed02410e54..6aceba3b1188919d4b0318f560ed32921e823343 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedAvgPool"
   endpoint {
     name: "nn.quantized_avg_pool"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedConcat.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedConcat.pbtxt
index 69404b947257d2d6000cdd43a2497ec4883bc8b6..937a1a813d484544db6002f41406f6ddcd7f8563 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedConcat.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedConcat.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "quantized_concat"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
index 2409d12abeff922cca92f9ae609764a27f651356..4b5a04f45ef014ad328fea26e613f227d1821e71 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedConv2D"
   endpoint {
     name: "nn.quantized_conv2d"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
index 3a58590f5773a3d886ace95108ee63a659362de2..cd1c7fdbf22ec746a080566b20daa7b100e5cb65 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedMaxPool"
   endpoint {
     name: "nn.quantized_max_pool"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
index 926ec98eeb468e7fa4846ae013a112cc865bb82c..d83d71c65cabf7a00d65c9dc87c6465f7c1ae9f5 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedReluX"
   endpoint {
     name: "nn.quantized_relu_x"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt
deleted file mode 100644
index a5f6f8c6f1db344c480e2bd452362d977dc15000..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "RandomDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt
deleted file mode 100644
index 4cd8296b2233ac58c12e6573d2194f7d976d9137..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "RangeDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReadFile.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReadFile.pbtxt
index 9d479be45ff483cdf5c4d03468bb033f663aa070..a671bc3ed14910e4e16064b990907189a99b0545 100644
--- a/tensorflow/core/api_def/python_api/api_def_ReadFile.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ReadFile.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "read_file"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
index bd87eef8240532c158b7604d8c5576e6d0b8b24b..f9e01eb56744cefddb41bad1a54d539ab3e0c548 100644
--- a/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "RealDiv"
-  visibility: HIDDEN
+  endpoint {
+    name: "realdiv"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Reciprocal.pbtxt b/tensorflow/core/api_def/python_api/api_def_Reciprocal.pbtxt
index c4d4c27722266f70c5d72e609e25d88838ab7a23..d10b87b6a7be8cfe33b17d2fa6f900e85c778c9a 100644
--- a/tensorflow/core/api_def/python_api/api_def_Reciprocal.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Reciprocal.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "reciprocal"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt
deleted file mode 100644
index be301da8386af0fbd98c9b02d2cfc0fe79178990..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "RepeatDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Reshape.pbtxt b/tensorflow/core/api_def/python_api/api_def_Reshape.pbtxt
index b3d596de7aaede578a36f37d6146648646b7aa90..ee20249094cb7dce6ea7b0c172bea07538e49710 100644
--- a/tensorflow/core/api_def/python_api/api_def_Reshape.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Reshape.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "manip.reshape"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
index 2f1b4aee00d90221d659daa34a7eb3462f42fa0c..e1a1f883d8ba6850f429ca5ebc8ab89789a2df90 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeArea"
-  endpoint {
-    name: "image.resize_area"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
index 3ec8e0ad6359307eab1b166801474817d8c5282b..e0bec8c116db961f873e1aa961d32d9422311696 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeBicubic"
-  endpoint {
-    name: "image.resize_bicubic"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
index eb3b8d6f458fff6163932457ef6c73a8fbbd721e..6121c1128c9060914723beb9d056d51a212b54bc 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeBilinear"
-  endpoint {
-    name: "image.resize_bilinear"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
index 25c5d5701feefd6f8270236f29e1c187fa3cf06a..0e86e4ce3ea33515947eae08705d5ea6c6860faa 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeNearestNeighbor"
-  endpoint {
-    name: "image.resize_nearest_neighbor"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1eef1b69b979bfeaaaaec81f47a6e62c8ecd8284
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdamWithAmsgrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c39242b3101449ed08c7b132502f7a9eea1228e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyKerasMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..180793521352a3d9ba3b75b709c3f9d2d37c8f93
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyKerasMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReverseV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReverseV2.pbtxt
index 51478b7c3434d40f6ce930f7cb38b357470c2e99..9ff0506c4e7f15b2ebe2f73aa32bf14eb22d7b76 100644
--- a/tensorflow/core/api_def/python_api/api_def_ReverseV2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ReverseV2.pbtxt
@@ -5,10 +5,10 @@ op {
   }
   endpoint {
     name: "manip.reverse"
-    deprecated: true
+    deprecation_version: 2
   }
   endpoint {
     name: "reverse_v2"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Rint.pbtxt b/tensorflow/core/api_def/python_api/api_def_Rint.pbtxt
index ec37a231273cf4ab124ec6399dc551b6a67d23d5..06e02f354c93d8783f541ee2bc4eab1207cda1d5 100644
--- a/tensorflow/core/api_def/python_api/api_def_Rint.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Rint.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "rint"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt b/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt
index 4fc2b8142108e0ec41f17eb8ba904e1b1bcbf07c..3cfbfc1106e68f20473ae6f4bae3dc7cb67f0483 100644
--- a/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "rsqrt"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt
deleted file mode 100644
index e71b655c22fbcbf1524433fc65a392e4d80c5c43..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ScanDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterNd.pbtxt
index 85888da45a2296f2a1206dd06ad961b537dba982..b76497d2661ea3d339cc2f626b358358a84cd9d2 100644
--- a/tensorflow/core/api_def/python_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterNd.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "manip.scatter_nd"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_SegmentMax.pbtxt
index 2e22c375c071db9ecf7bc3023ac50fd92696b0df..5f40b94b81eca96ca199cd1832cfd2954c6d27ff 100644
--- a/tensorflow/core/api_def/python_api/api_def_SegmentMax.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_SegmentMax.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "segment_max"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/python_api/api_def_SegmentMean.pbtxt
index 646348072f08c2ebd2aa2a9253567d0ee5a52645..a7da724f1dcae7b1daef2d846fd5ffa5b7381ada 100644
--- a/tensorflow/core/api_def/python_api/api_def_SegmentMean.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_SegmentMean.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "segment_mean"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/python_api/api_def_SegmentMin.pbtxt
index 1a77019a2dca9db2369cf6646b71a762da24116c..d4ccfe7457b74ce85f20c2155d2201a9ccff5cc1 100644
--- a/tensorflow/core/api_def/python_api/api_def_SegmentMin.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_SegmentMin.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "segment_min"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/python_api/api_def_SegmentProd.pbtxt
index cf4d6f0237dc9d1615225feebd78e3faf1deb3e1..8bbd6ce105fd2d4ef6918c835495091be1a42254 100644
--- a/tensorflow/core/api_def/python_api/api_def_SegmentProd.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_SegmentProd.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "segment_prod"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/python_api/api_def_SegmentSum.pbtxt
index c6d7999455039fa07c2c4205474334326f1c19eb..b40b5237a28350651c33dfad45b214ab378e0664 100644
--- a/tensorflow/core/api_def/python_api/api_def_SegmentSum.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_SegmentSum.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "segment_sum"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SerializeTensor.pbtxt b/tensorflow/core/api_def/python_api/api_def_SerializeTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..219b125da3ddf97bd5b1eca0adb5660362f08a03
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SerializeTensor.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "SerializeTensor"
+  endpoint {
+    name: "io.serialize_tensor"
+  }
+  endpoint {
+    name: "serialize_tensor"
+    deprecation_version: 2
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
deleted file mode 100644
index 3a8c1036ca34233b245a92110dc6e81ac348942d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SetStatsAggregatorDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt
deleted file mode 100644
index 7b0d2994f0711f440fb6623aa2322c86bd3859f8..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ShuffleAndRepeatDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt
deleted file mode 100644
index 8f0be9197adeb23b2d5047c5d69916df0e2c1eda..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ShuffleDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
index c2ee91dd12ed16ba27a9c4ae45b48194bc5a8b03..fb427cdb191d4976cf50d214e7f58695e7c41490 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Sign"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.sign"
+  }
+  endpoint {
+    name: "sign"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Sin.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sin.pbtxt
index 9c19a1a177bceb1681e613cf90e7f9086ce711f5..a2b776ee0c8d8f4cfa9833ff90819fd5d9222320 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sin.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sin.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "sin"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Sinh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sinh.pbtxt
index 155e58e6d5f209d2b9862410c22a709366eefe62..38c7c729bfa982ac8aa490e6d95cb8a9978d7b4d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sinh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sinh.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "sinh"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt
deleted file mode 100644
index 96a551c5b6669a8d019e3c705507aba768ab9d21..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SkipDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
deleted file mode 100644
index 867116c5da718f66205132d70a93c39464096df6..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SlideDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SpaceToBatchND.pbtxt b/tensorflow/core/api_def/python_api/api_def_SpaceToBatchND.pbtxt
index 146b97f444a85a7da34a6bab837b7a947d950aa7..9069a3e7a2f76afd1b7dbc5b9f94cbc3d44c3b63 100644
--- a/tensorflow/core/api_def/python_api/api_def_SpaceToBatchND.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_SpaceToBatchND.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "manip.space_to_batch_nd"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt
deleted file mode 100644
index 19c0c7f199dfd24d24a56c3766733f9e55957c12..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SparseTensorSliceDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt
deleted file mode 100644
index 2ab4c3e441dd51f50a2796ef9d6fa0d21b727ffa..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SqlDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
index 59e2dfe8366813242337c9490d74ca317e525636..16a4d9a7bcc0058aa0baf46ed0b932d4c26a23e2 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Sqrt"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.sqrt"
+  }
+  endpoint {
+    name: "sqrt"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
index 7b39ae25fa062b4271dcc2aee6523847c97b1e4d..0bd2f1bf41b80b1a21d50a9b9f437da33e36584c 100644
--- a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Square"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.square"
+  }
+  endpoint {
+    name: "square"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SquaredDifference.pbtxt b/tensorflow/core/api_def/python_api/api_def_SquaredDifference.pbtxt
index 4bab8cf00c34bacbb13ccc6a64426ab3231ff691..c2ef8d3b34c4865da0cef73c52de5201a9b8ea3f 100644
--- a/tensorflow/core/api_def/python_api/api_def_SquaredDifference.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_SquaredDifference.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "squared_difference"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt
deleted file mode 100644
index f7bed36602f40602313157c20677acbbf592d7be..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorHandle"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt
deleted file mode 100644
index 8b1bab2440f1934f1fd0194b76b7907fb0fb142d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorSummary"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringJoin.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringJoin.pbtxt
index 46a7c0361e21a8a72d506c1a3280e7f27dce0fa2..a54cdb46c1f04ad31f5405ca6542c0a560820120 100644
--- a/tensorflow/core/api_def/python_api/api_def_StringJoin.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_StringJoin.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "string_join"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StringStrip.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringStrip.pbtxt
index fbcdeaad6d3be27c49658e70b65ffe853aa58c51..fedc03a19da68c854edfd5ac3b7b48ac8982819b 100644
--- a/tensorflow/core/api_def/python_api/api_def_StringStrip.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_StringStrip.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "string_strip"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StringToHashBucket.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringToHashBucket.pbtxt
index d122e79b39466c7ea311145f4767bcdc69d0ca3a..dc4493c841062adfbdfccdc501ecdfd6228e7aae 100644
--- a/tensorflow/core/api_def/python_api/api_def_StringToHashBucket.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_StringToHashBucket.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "StringToHashBucket"
-  endpoint {
-    name: "strings.to_hash_bucket"
-  }
-  endpoint {
-    name: "string_to_hash_bucket"
-    deprecated: true
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StringToHashBucketFast.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringToHashBucketFast.pbtxt
index aef9dffefe5f495813c8192304c0fd765da14331..06451a9ad57b4229c6628cb1a35b62f30128f7a4 100644
--- a/tensorflow/core/api_def/python_api/api_def_StringToHashBucketFast.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_StringToHashBucketFast.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "string_to_hash_bucket_fast"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StringToHashBucketStrong.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringToHashBucketStrong.pbtxt
index 385b9fd02ac214be2074f1cfe7a9615343259e94..8e103c8e2d3016de4d6d73b2ff3786e6166c7203 100644
--- a/tensorflow/core/api_def/python_api/api_def_StringToHashBucketStrong.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_StringToHashBucketStrong.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "string_to_hash_bucket_strong"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StringToNumber.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringToNumber.pbtxt
index f740b9849df4d2e2c4125556fd87df3dd07491a7..9c89d02fb762c50eb2379d35b3d238797caa41ef 100644
--- a/tensorflow/core/api_def/python_api/api_def_StringToNumber.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_StringToNumber.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "StringToNumber"
-  endpoint {
-    name: "strings.to_number"
-  }
-  endpoint {
-    name: "string_to_number"
-    deprecated: true
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt
deleted file mode 100644
index 3c270ada3c219b03715e0cd651a4b56fe5ebc227..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TFRecordDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt
deleted file mode 100644
index 711b335dc1926d32071637b3c986727c339736a3..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TakeDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Tan.pbtxt b/tensorflow/core/api_def/python_api/api_def_Tan.pbtxt
index ffa92f55800f7837d648cba9f93788c0b6a5a0bc..20cfac05fdf4e0bc257550e200aa9e2a693a9ef0 100644
--- a/tensorflow/core/api_def/python_api/api_def_Tan.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Tan.pbtxt
@@ -5,6 +5,5 @@ op {
   }
   endpoint {
     name: "tan"
-    deprecated: true
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
index c946e0a794a77fe6f40613824e6d614e9667ccf9..80d11d27853d89b17fc86fca4fc9219452cd1aca 100644
--- a/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
@@ -1,4 +1,12 @@
 op {
   graph_op_name: "Tanh"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.tanh"
+  }
+  endpoint {
+    name: "nn.tanh"
+  }
+  endpoint {
+    name: "tanh"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt
deleted file mode 100644
index 5bc3920c56360f2348805db1db79ab2b630f379d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TensorDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListConcat.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7b6fd106ce304f1e75913614c54f12a3efe5e38
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListConcat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListConcat"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListPushBackBatch.pbtxt
similarity index 100%
rename from tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt
rename to tensorflow/core/api_def/python_api/api_def_TensorListPushBackBatch.pbtxt
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListSplit.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..091297db07174a3925ed2a09b879d013580b606e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListSplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListSplit"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt
deleted file mode 100644
index 89ad016483fa392a302915d588d32201237c717a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TensorSliceDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt
deleted file mode 100644
index 08d785191b6a4bddce2ac43fd4c0188b4d74548e..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TextLineDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Tile.pbtxt b/tensorflow/core/api_def/python_api/api_def_Tile.pbtxt
index 1d8695f1fdfdf7cbac9a316275a5e43fa80cf8e1..3ffbe8cf526edbcc67d667059019038f519cce6d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Tile.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Tile.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "manip.tile"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
index 2a547f771cfb3d4f3d9496ea24196e1a8a1f1879..8e46c5e663a3fca40a6c2e4890a6ab9388645ad9 100644
--- a/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "TruncateDiv"
-  visibility: HIDDEN
+  endpoint {
+    name: "truncatediv"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt b/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
index 0731e8810e25cad2cca02522aba55d032b1765b2..97fb816a7ad395a4ad67d0296d87cf6264c76ac2 100644
--- a/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "TruncateMod"
-  visibility: HIDDEN
+  endpoint {
+    name: "truncatemod"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
deleted file mode 100644
index 1e5415749f0d3abad8f6f5c632a0bc59b11e8de2..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "UnbatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_UnicodeTranscode.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnicodeTranscode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ddadd9426bc898541a6890a4867f5d4921bfdd9e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_UnicodeTranscode.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnicodeTranscode"
+  endpoint {
+    name: "strings.unicode_transcode"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentMax.pbtxt
index cf8184324160bd46701a0a60ea93531aec393a3f..32044fd90edf9f5cd97ea49f89563df7e4ddc0b4 100644
--- a/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentMax.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "unsorted_segment_max"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentMin.pbtxt
index 475361c85a26f98c92a9ba2c5f72b8753794ca29..177e840e4272d9793e833d082f9fbb5004a8ab78 100644
--- a/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentMin.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentMin.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "unsorted_segment_min"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentProd.pbtxt
index a9d741bbc33a0ba10c072201b1db184b9abc91d6..f3aa8e8a515ed01b7984bbd1c49b2e531cac33f9 100644
--- a/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentProd.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentProd.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "unsorted_segment_prod"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentSum.pbtxt
index 337678dcffe12da62672ec7ed19e466f1fac119d..1542bb039e0c0d28f52cb0214aef0a2aeb6d01bb 100644
--- a/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_UnsortedSegmentSum.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "unsorted_segment_sum"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_WriteFile.pbtxt b/tensorflow/core/api_def/python_api/api_def_WriteFile.pbtxt
index 1a58ae19e54195d67bc7504fc31b04dc6feab20d..d065027e9320d2240c81db2385bec08810432c7f 100644
--- a/tensorflow/core/api_def/python_api/api_def_WriteFile.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_WriteFile.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "write_file"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Zeta.pbtxt b/tensorflow/core/api_def/python_api/api_def_Zeta.pbtxt
index 4684a9d6242c5ed5f02ac941605e37004ae46438..69bf4eb51d269882def813b76825596b20b9b8bf 100644
--- a/tensorflow/core/api_def/python_api/api_def_Zeta.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Zeta.pbtxt
@@ -5,6 +5,6 @@ op {
   }
   endpoint {
     name: "zeta"
-    deprecated: true
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt
deleted file mode 100644
index dd1459521ff70fc4b3adce7fbb1251b45106b439..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ZipDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index 822d0065b6713dbc6692ed11b7a938a784b0d597..c4bc1a684cb3ffaa30cdaece041fc51c266a3782 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -74,8 +74,7 @@ class AccumulateNV2RemovePass : public GraphOptimizationPass {
 
   Status rewriteNode(Node* n, Graph* g) {
     AttrSlice n_attrs = n->attrs();
-    auto base_make_node = [n, g, &n_attrs](const string& op,
-                                           const string& name) {
+    auto base_make_node = [n, &n_attrs](const string& op, const string& name) {
       NodeBuilder node_builder(name, op);
 
       // The pieces of AccumulateNV2 should all be on the same node.
@@ -86,7 +85,7 @@ class AccumulateNV2RemovePass : public GraphOptimizationPass {
       }
       return node_builder;
     };
-    auto make_node = [n, g, &n_attrs, &base_make_node](string op) {
+    auto make_node = [n, g, &base_make_node](string op) {
       return base_make_node(
           op, g->NewName(strings::StrCat(n->name(), "/Internal")));
     };
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
index 5b01f7fa037f4a67be4bff455c847ddfdabef682..92e56df18105218fc8a5112a880b6c999f1a2649 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.cc
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -261,6 +261,13 @@ void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx,
   });
 }
 
+void BaseCollectiveExecutor::CompleteParamsAsync(
+    const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
+    StatusCallback done) {
+  cp->instance.gpu_ring_order = *gpu_ring_order_;
+  cem_->GetParamResolver()->CompleteParamsAsync(device, cp, cancel_mgr, done);
+}
+
 Status BaseCollectiveExecutor::CreateCollective(
     const CollectiveParams& col_params,
     CollectiveImplementationInterface** col_impl) {
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
index 360ce4db7bdab16d38872722540f2fe08a1b143f..09826a8814511cb46c907b983f240fe17df70e3d 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.h
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -89,11 +89,13 @@ class BaseCollectiveExecutor : public CollectiveExecutor {
  public:
   BaseCollectiveExecutor(CollectiveExecutorMgrInterface* cem,
                          PerStepCollectiveRemoteAccess* remote_access,
-                         int64 step_id, const DeviceMgr* dev_mgr)
+                         int64 step_id, const DeviceMgr* dev_mgr,
+                         const string* gpu_ring_order)
       : CollectiveExecutor(cem),
         step_id_(step_id),
         dev_mgr_(dev_mgr),
-        remote_access_(remote_access) {}
+        remote_access_(remote_access),
+        gpu_ring_order_(gpu_ring_order) {}
 
   ~BaseCollectiveExecutor() override;
 
@@ -102,6 +104,10 @@ class BaseCollectiveExecutor : public CollectiveExecutor {
   void ExecuteAsync(OpKernelContext* ctx, const CollectiveParams& col_params,
                     const string& exec_key, StatusCallback done) override;
 
+  void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                           CancellationManager* cancel_mgr,
+                           StatusCallback done) override;
+
   PerStepCollectiveRemoteAccess* remote_access() override {
     return remote_access_.get();
   }
@@ -133,6 +139,7 @@ class BaseCollectiveExecutor : public CollectiveExecutor {
   const int64 step_id_;
   const DeviceMgr* dev_mgr_;  // Not owned.
   std::unique_ptr<PerStepCollectiveRemoteAccess> remote_access_;
+  const string* gpu_ring_order_;  // Not owned.
 
  private:
   Status CreateCollective(const CollectiveParams& col_params,
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.cc b/tensorflow/core/common_runtime/collective_executor_mgr.cc
index 4f03a5e13ad59b59c7675ac344b2894b19c3be22..7bbc7ca06c56084cea934c097f6720ef4397edbe 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.cc
@@ -29,7 +29,9 @@ CollectiveExecutorMgr::CollectiveExecutorMgr(
     std::unique_ptr<ParamResolverInterface> param_resolver)
     : dev_mgr_(dev_mgr),
       dev_resolver_(std::move(dev_resolver)),
-      param_resolver_(std::move(param_resolver)) {}
+      param_resolver_(std::move(param_resolver)),
+      gpu_ring_order_(
+          config.gpu_options().experimental().collective_ring_order()) {}
 
 CollectiveExecutorMgr::~CollectiveExecutorMgr() {
   for (auto iter : executor_table_) {
@@ -56,7 +58,8 @@ CollectiveExecutor* CollectiveExecutorMgr::FindOrCreate(int64 step_id) {
 CollectiveExecutor* CollectiveExecutorMgr::Create(int64 step_id) {
   CollectiveRemoteAccessLocal* rma =
       new CollectiveRemoteAccessLocal(dev_mgr_, dev_resolver_.get(), step_id);
-  return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_);
+  return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_,
+                                    &gpu_ring_order_);
 }
 
 void CollectiveExecutorMgr::Cleanup(int64 step_id) {
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.h b/tensorflow/core/common_runtime/collective_executor_mgr.h
index d53aca85b967c1a5f635192268b2ef7597431b96..4db121a4d6d024872c32b8e85524d7ff1bfdc018 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr.h
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.h
@@ -62,8 +62,7 @@ class CollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
   const DeviceMgr* dev_mgr_;
   std::unique_ptr<DeviceResolverInterface> dev_resolver_;
   std::unique_ptr<ParamResolverInterface> param_resolver_;
-  CollectiveRemoteAccess* remote_access_;
-  string task_name_;
+  string gpu_ring_order_;
 
  private:
   mutex exec_mu_;
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
index 91994c57311f95a669949a38c161f7d3acf5f54d..f3d86aa633938042b862613162d1c2a94b0fe35a 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
@@ -38,8 +38,9 @@ class CollectiveExecutorMgrTest : public ::testing::Test {
     auto* device_count = options.config.mutable_device_count();
     string task_name = "/job:localhost/replica:0/task:0";
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     std::unique_ptr<DeviceResolverInterface> drl(
         new DeviceResolverLocal(device_mgr_.get()));
     std::unique_ptr<ParamResolverInterface> prl(
@@ -50,7 +51,6 @@ class CollectiveExecutorMgrTest : public ::testing::Test {
   }
 
   std::unique_ptr<CollectiveExecutorMgr> cme_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
 };
 
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index 7cb90de3c791f4916990b235c3684f9955cd12f2..a8e3f4c881afc9c37ce4b5196c32ec591be5506d 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -170,8 +171,43 @@ GlobalDeviceMap BuildDevRecs(const CollInstanceParams& ip,
   return gdm;
 }
 
-void OrderTaskDeviceMap(TaskDeviceMap* tdm) {
+bool ParseRingOrder(const string& gpu_ring_order_str, TaskDeviceMap* tdm) {
+  std::vector<int32> gpu_ring_order_vec;
+  if (!str_util::SplitAndParseAsInts(gpu_ring_order_str, ',',
+                                     &gpu_ring_order_vec)) {
+    return false;
+  }
+  if (gpu_ring_order_vec.size() != tdm->size()) return false;
+  // gpu id -> local rank
+  gtl::FlatMap<int32, int32> gpu_ranks;
+  for (int32 rank = 0; rank < static_cast<int32>(gpu_ring_order_vec.size());
+       ++rank) {
+    gpu_ranks[gpu_ring_order_vec[rank]] = rank;
+  }
+
+  for (auto& tdm_it : *tdm) {
+    DeviceNameUtils::ParsedName parsed_name;
+    DevRec* dr = &tdm_it.second;
+    if (!DeviceNameUtils::ParseFullName(dr->device, &parsed_name)) {
+      return false;
+    }
+    auto rank_it = gpu_ranks.find(parsed_name.id);
+    if (rank_it == gpu_ranks.end()) return false;
+    dr->local_rank = rank_it->second;
+  }
+  VLOG(2) << "Assigned local ranks based on ring order " << gpu_ring_order_str;
+  return true;
+}
+
+void OrderTaskDeviceMap(const string& gpu_ring_order, TaskDeviceMap* tdm) {
   CHECK_GT(tdm->size(), 0);  // Should never be called with 0 devices
+
+  // If a valid ring order has been passed in via ConfigProto, use that.
+  if (ParseRingOrder(gpu_ring_order, tdm)) return;
+
+  // Either no ring order was passed in, or the format was unexpected.
+  // We now assign a ring order based on link strengths.  Note that this
+  // algorithm is not optimal and may not always find the best ring order.
   int least_rank = -1;
   string next_device;
   std::set<string> selected;
@@ -256,7 +292,7 @@ GlobalDeviceMap EstablishGlobalRank(
   GlobalDeviceMap gdm = BuildDevRecs(cp->instance, localities);
   for (auto& iter : gdm) {
     TaskDeviceMap& tdm = iter.second;
-    OrderTaskDeviceMap(&tdm);
+    OrderTaskDeviceMap(cp->instance.gpu_ring_order, &tdm);
   }
   // Connect the global rank order by the order in which tasks first appear.
   std::set<string> ordered_tasks;
@@ -445,8 +481,7 @@ void CollectiveParamResolverLocal::CompleteDefaultRanking(
   ir->shared.instance.task_names = new_task_names;
   if (VLOG_IS_ON(2)) {
     string buf;
-    for (const auto& d : cp->instance.device_names)
-      strings::StrAppend(&buf, "\n", d);
+    for (const auto& d : new_device_names) strings::StrAppend(&buf, "\n", d);
     VLOG(2) << "Optimized device order for " << ir->shared.name << ": " << buf;
   }
 }
@@ -476,7 +511,7 @@ void CollectiveParamResolverLocal::FindInstanceRec(
         if (irec->is_init) {
           exit_outside_locks = true;
         } else {
-          irec->init_waiters.push_back([this, gr, cp, done](InstanceRec* irec) {
+          irec->init_waiters.push_back([this, done](InstanceRec* irec) {
             CallbackWithStatus(done, irec);
           });
           return;
@@ -661,7 +696,7 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
         if (ir->source_rank >= 0) {
           ir->status = errors::Internal("Instance ", cp->instance.instance_key,
                                         " already has source ", ir->source_rank,
-                                        ", recevied second claim from ",
+                                        ", received second claim from ",
                                         cp->default_rank);
         } else {
           ir->source_rank = cp->default_rank;
@@ -673,7 +708,16 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
       return;
     }
     CHECK_EQ(ir->known_count, ir->shared.group.group_size);
-    CHECK_GE(ir->source_rank, 0);
+    if (ir->source_rank < 0) {
+      // NOTE(ayushd): changing the error message below would also require
+      // updating CompleteParamsBroadcastForgotSend test in
+      // CollectiveParamResolverLocalTest.
+      ir->status =
+          errors::Internal("Instance ", cp->instance.instance_key,
+                           " found no source for broadcast.  This "
+                           "could mean that there were group_size=",
+                           ir->known_count, " BcastRecvs but no BcastSend.");
+    }
     if (!ir->known_waiters.empty()) {
       ready_waiters = std::move(ir->known_waiters);
     }
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index c5c3497e28cc9c7a7254c7f15a4bdfa5bf261980..365bddc787a7ba3d97f2df29b4ebd2a3c7118ef7 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -57,6 +57,9 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
                              const StatusCallback& done) override;
 
  protected:
+  // For access to InstanceRec and CompleteDefaultRanking.
+  friend class CollectiveParamResolverLocalTest;
+
   // Used to complete/verify CollGroup.
   struct GroupRec {
     CollGroupParams group;
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
index 9e1e2e8d5b24b3cc0bd17fd493f7429c4a547ef0..94d889c40dff89204ccfc43478f8732815a4ead4 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -37,19 +37,123 @@ class CollectiveParamResolverLocalTest : public ::testing::Test {
     string task_name = "/job:localhost/replica:0/task:0";
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
     prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
                                                 task_name));
   }
 
-  std::vector<Device*> devices_;
+  void RunCompleteDefaultRanking(
+      const CollectiveParams& shared_cp,
+      const std::vector<DeviceLocality>& localities,
+      const std::vector<int32>& gpu_ring_order,
+      const std::vector<string>& expected_device_order) {
+    CollectiveParams cp;
+    cp.instance.device_names = shared_cp.instance.device_names;
+    CollectiveParamResolverLocal::InstanceRec ir;
+    {
+      mutex_lock l(ir.out_mu);
+      ir.shared.name = shared_cp.name;
+      ir.shared.group = shared_cp.group;
+      ir.shared.instance = shared_cp.instance;
+      if (!gpu_ring_order.empty()) {
+        ir.shared.instance.gpu_ring_order = "";
+        for (int i = 0; i < static_cast<int32>(gpu_ring_order.size() - 1);
+             ++i) {
+          ir.shared.instance.gpu_ring_order = strings::StrCat(
+              ir.shared.instance.gpu_ring_order, gpu_ring_order[i], ",");
+        }
+        ir.shared.instance.gpu_ring_order = strings::StrCat(
+            ir.shared.instance.gpu_ring_order, gpu_ring_order.back());
+      }
+      VLOG(2) << "gpu_ring_order " << ir.shared.instance.gpu_ring_order;
+      prl_->CompleteDefaultRanking(nullptr, &cp, &ir, localities);
+      EXPECT_EQ(ir.shared.instance.device_names, expected_device_order);
+    }
+  }
+
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<DeviceResolverLocal> drl_;
   std::unique_ptr<CollectiveParamResolverLocal> prl_;
 };
 
+TEST_F(CollectiveParamResolverLocalTest, CompleteDefaultRanking) {
+  constexpr int kNumGpus = 8;
+  CollectiveParams cp;
+  std::vector<DeviceLocality> localities(kNumGpus);
+  cp.name = "PRLTest";
+  cp.group.device_type = DeviceType("GPU");
+  cp.group.num_tasks = 1;
+  cp.group.group_size = kNumGpus;
+  cp.instance.instance_key = 5;
+  cp.instance.type = REDUCTION_COLLECTIVE;
+  cp.instance.data_type = DataType(DT_FLOAT);
+  std::unordered_set<int> clique1 = {0, 1, 6, 7};
+  for (int gpu_idx = 0; gpu_idx < kNumGpus; ++gpu_idx) {
+    cp.instance.task_names.push_back("/job:localhost/replica:0/task:0");
+    cp.instance.device_names.push_back(strings::StrCat(
+        "/job:localhost/replica:0/task:0/device:GPU:", gpu_idx));
+    DeviceLocality* locality = &localities[gpu_idx];
+    // Build localities so that 0,1,6,7 and 2,3,4,5 form 2 strongly connected
+    // components.  Across components, connect 3 and 7.
+    for (int link_idx = 0; link_idx < kNumGpus; ++link_idx) {
+      if (gpu_idx == link_idx) continue;
+      bool gpu_in_clique1 = clique1.find(gpu_idx) != clique1.end();
+      bool link_in_clique1 = clique1.find(link_idx) != clique1.end();
+      if ((gpu_in_clique1 && link_in_clique1) ||
+          (!gpu_in_clique1 && !link_in_clique1)) {
+        LocalLinks* links = locality->mutable_links();
+        InterconnectLink* ilink = links->add_link();
+        ilink->set_device_id(link_idx);
+        ilink->set_strength(2);
+      } else if ((gpu_idx == 3 && link_idx == 7) ||
+                 (gpu_idx == 7 && link_idx == 3)) {
+        LocalLinks* links = locality->mutable_links();
+        InterconnectLink* ilink = links->add_link();
+        ilink->set_device_id(link_idx);
+        ilink->set_strength(1);
+      }
+    }
+  }
+  RunCompleteDefaultRanking(cp, localities, {1, 3, 5, 7, 6, 4, 2, 0},
+                            {
+                                "/job:localhost/replica:0/task:0/device:GPU:1",
+                                "/job:localhost/replica:0/task:0/device:GPU:3",
+                                "/job:localhost/replica:0/task:0/device:GPU:5",
+                                "/job:localhost/replica:0/task:0/device:GPU:7",
+                                "/job:localhost/replica:0/task:0/device:GPU:6",
+                                "/job:localhost/replica:0/task:0/device:GPU:4",
+                                "/job:localhost/replica:0/task:0/device:GPU:2",
+                                "/job:localhost/replica:0/task:0/device:GPU:0",
+                            });
+  RunCompleteDefaultRanking(cp, localities, {7, 6, 5, 4, 3, 2, 1, 0},
+                            {
+                                "/job:localhost/replica:0/task:0/device:GPU:7",
+                                "/job:localhost/replica:0/task:0/device:GPU:6",
+                                "/job:localhost/replica:0/task:0/device:GPU:5",
+                                "/job:localhost/replica:0/task:0/device:GPU:4",
+                                "/job:localhost/replica:0/task:0/device:GPU:3",
+                                "/job:localhost/replica:0/task:0/device:GPU:2",
+                                "/job:localhost/replica:0/task:0/device:GPU:1",
+                                "/job:localhost/replica:0/task:0/device:GPU:0",
+                            });
+  // With no gpu_ring_order passed, automatic link detection should kick in.
+  // Starting at dev 0, the best order would be: 0,1,6,7,3,2,4,5
+  RunCompleteDefaultRanking(cp, localities, {},
+                            {
+                                "/job:localhost/replica:0/task:0/device:GPU:0",
+                                "/job:localhost/replica:0/task:0/device:GPU:1",
+                                "/job:localhost/replica:0/task:0/device:GPU:6",
+                                "/job:localhost/replica:0/task:0/device:GPU:7",
+                                "/job:localhost/replica:0/task:0/device:GPU:3",
+                                "/job:localhost/replica:0/task:0/device:GPU:2",
+                                "/job:localhost/replica:0/task:0/device:GPU:4",
+                                "/job:localhost/replica:0/task:0/device:GPU:5",
+                            });
+}
+
 TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
   CollectiveParams cps[NUM_DEVS];
   Status statuses[NUM_DEVS];
@@ -96,28 +200,35 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
   }
 }
 
+void InitializeCollectiveParamsForBroadcast(int instance_key, int device_idx,
+                                            bool is_source,
+                                            CollectiveParams* cp) {
+  cp->group.group_key = 1;
+  cp->group.group_size = 3;
+  cp->group.device_type = DeviceType("CPU");
+  cp->group.num_tasks = 1;
+  cp->instance.instance_key = instance_key;
+  cp->instance.type = BROADCAST_COLLECTIVE;
+  cp->instance.data_type = DataType(DT_FLOAT);
+  cp->instance.shape = TensorShape({5});
+  cp->instance.device_names.push_back(strings::StrCat(
+      "/job:localhost/replica:0/task:0/device:CPU:", device_idx));
+  cp->instance.impl_details.subdiv_offsets.push_back(0);
+  cp->is_source = is_source;
+}
+
 TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) {
+  constexpr int kInstanceKey = 5;
   CollectiveParams cps[NUM_DEVS];
   Status statuses[NUM_DEVS];
   Notification note[NUM_DEVS];
   for (int i = 0; i < NUM_DEVS; ++i) {
     CollectiveParams* cp = &cps[i];
-    cp->group.group_key = 1;
-    cp->group.group_size = 3;
-    cp->group.device_type = DeviceType("CPU");
-    cp->group.num_tasks = 1;
-    cp->instance.instance_key = 3;
-    cp->instance.type = BROADCAST_COLLECTIVE;
-    cp->instance.data_type = DataType(DT_FLOAT);
-    cp->instance.shape = TensorShape({5});
-    cp->instance.device_names.push_back(
-        strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i));
-    cp->instance.impl_details.subdiv_offsets.push_back(0);
-    cp->is_source = (i == 1);
+    InitializeCollectiveParamsForBroadcast(kInstanceKey, i, i == 1, cp);
     Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
       prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
                                 nullptr /*CancellationManager*/,
-                                [this, &statuses, &note, i](const Status& s) {
+                                [&statuses, &note, i](const Status& s) {
                                   statuses[i] = s;
                                   note[i].Notify();
                                 });
@@ -141,4 +252,38 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) {
   }
 }
 
+// If we don't mark any participant in a broadcast as the source, we essentially
+// create a collective group with only broadcast recvs.  In that case, we should
+// get an internal error from param resolution.
+TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcastForgotSender) {
+  constexpr int kInstanceKey = 8;
+  CollectiveParams cps[NUM_DEVS];
+  Status statuses[NUM_DEVS];
+  Notification note[NUM_DEVS];
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    CollectiveParams* cp = &cps[i];
+    InitializeCollectiveParamsForBroadcast(kInstanceKey, i, false, cp);
+    Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
+      prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
+                                nullptr /*CancellationManager*/,
+                                [&statuses, &note, i](const Status& s) {
+                                  statuses[i] = s;
+                                  note[i].Notify();
+                                });
+    });
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    note[i].WaitForNotification();
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    EXPECT_EQ(statuses[i].code(), error::INTERNAL);
+    EXPECT_EQ(statuses[i].error_message(),
+              strings::StrCat(
+                  "Instance ", kInstanceKey,
+                  " found no source for broadcast.  This could mean that there"
+                  " were group_size=",
+                  NUM_DEVS, " BcastRecvs but no BcastSend."));
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_rma_local.cc b/tensorflow/core/common_runtime/collective_rma_local.cc
index 288ae9d794a2547d7837e1311e71c4681236704a..d99565b49abde95ca2fa28293771970b19620dd5 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local.cc
@@ -38,7 +38,7 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
     return;
   }
   buf_rendezvous_.ConsumeBuf(
-      key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr,
+      key, [to_tensor, to_device_ctx, to_device, to_alloc_attr,
             dev_to_dev_stream_index,
             done](const Status& s, BufRendezvous::Hook* hook) {
         if (!s.ok()) {
diff --git a/tensorflow/core/common_runtime/collective_rma_local_test.cc b/tensorflow/core/common_runtime/collective_rma_local_test.cc
index a931fe64bd13c57e2b9d55c5c1bf46862b3cb524..4263f3a4add524bf59e7c08cfb5d927ac9e23e06 100644
--- a/tensorflow/core/common_runtime/collective_rma_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local_test.cc
@@ -42,8 +42,9 @@ class CollectiveRemoteAccessLocalTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
     prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
                                                 kTaskName));
@@ -51,7 +52,6 @@ class CollectiveRemoteAccessLocalTest : public ::testing::Test {
                                                kStepId));
   }
 
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<DeviceResolverLocal> drl_;
   std::unique_ptr<CollectiveParamResolverLocal> prl_;
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index db137f1a195a0568e84146d22144d8679a834cf3..5c226ec56e13fbb398d852ff6287910d2347785e 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -245,17 +245,17 @@ bool IsConstantFoldable(
   if (n->IsSink()) {
     return false;
   }
+  if (n->IsFakeParam()) {
+    return false;
+  }
   // Since constant-folding runs on the CPU, do not attempt to constant-fold
   // operators that have no CPU kernel. Also implies that we will not
   // constant-fold functions.
   // TODO(phawkins): allow constant-folding for functions; functions may
   // be arbitrarily expensive to execute.
-  if (!FindKernelDef(DeviceType(DEVICE_CPU), n->def(), /*def=*/nullptr,
-                     /*kernel_class_name=*/nullptr)
-           .ok()) {
+  if (!KernelDefAvailable(DeviceType(DEVICE_CPU), n->def())) {
     return false;
   }
-
   return true;
 }
 
@@ -466,23 +466,23 @@ Graph* GetConstantGraph(
 bool ReplaceTensorWithConstant(
     Graph* graph, Device* partition_device, NodeAndOutput tensor,
     const Tensor& constant, const gtl::FlatSet<Node*>& control_deps,
-    int64 max_constant_size_in_bytes, bool disable_memory_output_type_check,
+    int64 max_constant_size_in_bytes,
     const ConstantFoldNameGenerator& generate_new_name) {
   // Be conservative when replacing a tensor with a constant, when not
   // running on CPU.
   // 1) Do not replace another constant.
-  // 2) If the destination tensor is not an int32 tensor, and has HOST_MEMORY
-  // constraint, do not replace it.
-  // 3) If the size of the constant in bytes is too large (>
+  // 2) If the destination tensor or any other tensor from the same node is not
+  // an int32 tensor, and has HOST_MEMORY constraint, do not replace it.
+  // 3) If the destination tensor or any other tensor from the same node is an
+  // int32 tensor, and has DEVICE_MEMORY constraint, do not replace it.
+  // 4) If the size of the constant in bytes is too large (>
   // max_constant_in_bytes), do not replace it. This prevents the size of the
   // Graph from growing too large.
-  // 4) If the constant op created does not have a kernel implementation
+  // 5) If the constant op created does not have a kernel implementation
   // for the device, do not use it.
   // TODO(keveman): Consider adding a new constant op that has a kernel
   // implementation for all types, but with HostMemory constraint on it's
   // output.
-  // 5) If the constant op for the device has different output memory type
-  // from the original op output memory type, do not replace it.
   if (tensor.first->IsConstant()) {
     return false;
   }
@@ -490,15 +490,20 @@ bool ReplaceTensorWithConstant(
                                ? DeviceType{partition_device->device_type()}
                                : DEVICE_CPU;
   if (partition_device && device_type != DEVICE_CPU) {
-    MemoryType memory_type;
-    if (!MemoryTypeForOutput(device_type, graph, tensor.first, tensor.second,
-                             &memory_type)
+    MemoryTypeVector input_mvec;
+    MemoryTypeVector output_mvec;
+    if (!MemoryTypesForNode(graph->op_registry(), device_type,
+                            tensor.first->def(), &input_mvec, &output_mvec)
              .ok()) {
       return false;
     }
-    bool is_int32 = tensor.first->output_type(tensor.second) == DT_INT32;
-    if (memory_type == HOST_MEMORY && !is_int32) {
-      return false;
+    for (int i = 0; i < output_mvec.size(); i++) {
+      MemoryType memory_type = output_mvec[i];
+      bool is_int32 = tensor.first->output_type(i) == DT_INT32;
+      if ((memory_type == HOST_MEMORY && !is_int32) ||
+          (memory_type == DEVICE_MEMORY && is_int32)) {
+        return false;
+      }
     }
   }
   if (constant.TotalBytes() > max_constant_size_in_bytes) {
@@ -535,25 +540,6 @@ bool ReplaceTensorWithConstant(
   if (!NodeBuilder(builder).Finalize(graph, &constant_node).ok()) {
     return false;
   }
-  if (!disable_memory_output_type_check) {
-    if (partition_device && device_type != DEVICE_CPU) {
-      MemoryType original_output_memory_type;
-      if (!MemoryTypeForOutput(device_type, graph, tensor.first, tensor.second,
-                               &original_output_memory_type)
-               .ok()) {
-        return false;
-      }
-      MemoryType const_output_memory_type;
-      if (!MemoryTypeForOutput(device_type, graph, constant_node, 0,
-                               &const_output_memory_type)
-               .ok()) {
-        return false;
-      }
-      if (original_output_memory_type != const_output_memory_type) {
-        return false;
-      }
-    }
-  }
   for (auto edge : edges_to_remove) {
     graph->AddEdge(constant_node, 0, edge->dst(), edge->dst_input());
     graph->RemoveEdge(edge);
@@ -660,8 +646,7 @@ Status ConstantFold(const ConstantFoldingOptions& opts,
         constant_control_deps[tensors_to_replace[c].first];
     if (ReplaceTensorWithConstant(
             graph, partition_device, tensors_to_replace[c], outputs[c],
-            control_deps, opts.max_constant_size_in_bytes,
-            opts.disable_memory_output_type_check, generate_new_name)) {
+            control_deps, opts.max_constant_size_in_bytes, generate_new_name)) {
       ++num_nodes_replaced;
     }
   }
diff --git a/tensorflow/core/common_runtime/constant_folding.h b/tensorflow/core/common_runtime/constant_folding.h
index 4c71b7bd27ecaca62c340c0bfdd7ee0e77c87925..a9a84f761b678c1c5de69908e0323ed9910a4a02 100644
--- a/tensorflow/core/common_runtime/constant_folding.h
+++ b/tensorflow/core/common_runtime/constant_folding.h
@@ -45,10 +45,6 @@ struct ConstantFoldingOptions {
   // optimization.
   int64 max_constant_size_in_bytes = 10 * 1024 * 1024;
 
-  // If disable_memory_output_type_check is true, we will disable output memory
-  // type check for constant node replacement.
-  bool disable_memory_output_type_check = false;
-
   // A generator for the name suffix of constant folded nodes. A
   // default id generator that monotonically increases is used if nullptr is
   // passed.
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 16b61315f29322565492da8c168c6fbc89d6daf1..1d4586f3da84f0beabe440dca51105826feb197c 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -18,13 +18,16 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/cc/ops/nn_ops.h"
 #include "tensorflow/core/common_runtime/constant_folding.h"
 
 #include "tensorflow/cc/ops/array_ops_internal.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -70,15 +73,6 @@ class ConstantFoldingTest : public ::testing::Test {
     test::ExpectTensorEqual<T>(t, test::AsTensor(values, shape));
   }
 
-  // Builds a map from node name to Node* for `graph`.
-  std::unordered_map<string, Node*> NodeNameIndex(const Graph& graph) {
-    std::unordered_map<string, Node*> index;
-    for (Node* node : graph.nodes()) {
-      index[node->name()] = node;
-    }
-    return index;
-  }
-
   // Constructs the following graph.
   /*
         s1  s2
@@ -99,6 +93,24 @@ class ConstantFoldingTest : public ::testing::Test {
   }
 };
 
+class FakeDevice : public Device {
+ private:
+  explicit FakeDevice(const DeviceAttributes& device_attributes)
+      : Device(nullptr, device_attributes) {}
+
+ public:
+  Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; }
+
+  static std::unique_ptr<Device> Make(const string& name, const string& type) {
+    DeviceAttributes device_attributes;
+    device_attributes.set_name(name);
+    device_attributes.set_device_type(DeviceType(type).type());
+    return std::unique_ptr<Device>(new FakeDevice(device_attributes));
+  }
+};
+
 TEST_F(ConstantFoldingTest, Basic) {
   Scope s = Scope::NewRootScope();
   BuildSimpleGraph(&s);
@@ -110,7 +122,7 @@ TEST_F(ConstantFoldingTest, Basic) {
                             nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
 
-  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> index = g.BuildNodeNameIndex();
   Node* s1 = index.at("s1");
   Node* s2 = index.at("s2");
   // Nodes s1 and s2 now should now have a constant input
@@ -165,7 +177,7 @@ TEST_F(ConstantFoldingTest, DeterministicFolding) {
   Graph g2(OpRegistry::Global());
   TF_ASSERT_OK(build_graph_and_constant_folding(g2, true));
   EXPECT_EQ(g1.num_nodes(), g2.num_nodes());
-  auto index = NodeNameIndex(g2);
+  auto index = g2.BuildNodeNameIndex();
 
   // All the nodes in g1 are expected to be present in g2.
   for (int64 i = 0; i < g1.num_nodes(); ++i) {
@@ -188,7 +200,7 @@ TEST_F(ConstantFoldingTest, ConsiderFunction) {
       ConstantFold(opts, nullptr, Env::Default(), nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
 
-  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> index = g.BuildNodeNameIndex();
   Node* s1 = index.at("s1");
   Node* s2 = index.at("s2");
   Node* m2 = index.at("m2");
@@ -217,7 +229,7 @@ TEST_F(ConstantFoldingTest, TestNoReplaceAnotherConstant) {
                             nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
 
-  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> index = g.BuildNodeNameIndex();
   Node* d = index.at("d");
   Node* s3 = index.at("s3");
 
@@ -245,7 +257,7 @@ TEST_F(ConstantFoldingTest, TwoOutputs) {
                             nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
 
-  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> index = g.BuildNodeNameIndex();
   Node* b0 = index.at("b0");
   Node* b1 = index.at("b1");
 
@@ -277,7 +289,7 @@ TEST_F(ConstantFoldingTest, TwoOutputsFoldOneOutput) {
       ConstantFold(opts, nullptr, Env::Default(), nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
 
-  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> index = g.BuildNodeNameIndex();
   Node* b0 = index.at("b0");
   Node* b1 = index.at("b1");
   Node* b1_ident = index.at("b1_ident");
@@ -412,7 +424,7 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
                             nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
 
-  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> index = g.BuildNodeNameIndex();
   Node* recv1 = index.at("recv1");
   Node* recv2 = index.at("recv2");
   Node* send = index.at("send");
@@ -454,7 +466,7 @@ TEST_F(ConstantFoldingTest, SimpleShapeKnown) {
                             "receiver");
     TF_ASSERT_OK(s.ToGraph(&g));
   }
-  std::unordered_map<string, Node*> orig_index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> orig_index = g.BuildNodeNameIndex();
   Node* recv0 = orig_index.at("recv0");
   Node* recv1 = orig_index.at("recv1");
   PartialTensorShape ps0;
@@ -473,7 +485,7 @@ TEST_F(ConstantFoldingTest, SimpleShapeKnown) {
       ConstantFold(opts, nullptr, Env::Default(), nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
 
-  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> index = g.BuildNodeNameIndex();
   Node* recv2 = index.at("recv2");
   Node* send0 = index.at("send0");
   Node* send1 = index.at("send1");
@@ -533,7 +545,7 @@ TEST_F(ConstantFoldingTest, PartialShape) {
                             "receiver");
     TF_ASSERT_OK(s.ToGraph(&g));
   }
-  std::unordered_map<string, Node*> orig_index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> orig_index = g.BuildNodeNameIndex();
   Node* recv0 = orig_index.at("recv0");
   Node* recv1 = orig_index.at("recv1");
   PartialTensorShape ps0;
@@ -550,7 +562,7 @@ TEST_F(ConstantFoldingTest, PartialShape) {
       ConstantFold(opts, nullptr, Env::Default(), nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
 
-  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> index = g.BuildNodeNameIndex();
   Node* shape = index.at("shape");
   Node* size = index.at("size");
   Node* rank1 = index.at("rank1");
@@ -590,7 +602,7 @@ TEST_F(ConstantFoldingTest, ConstShapeKnown) {
                             "receiver");
     TF_ASSERT_OK(s.ToGraph(&g));
   }
-  std::unordered_map<string, Node*> orig_index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> orig_index = g.BuildNodeNameIndex();
   Node* c0 = orig_index.at("c0");
   PartialTensorShape ps0;
   int c0_dims[] = {};
@@ -604,7 +616,7 @@ TEST_F(ConstantFoldingTest, ConstShapeKnown) {
       ConstantFold(opts, nullptr, Env::Default(), nullptr, &g, &was_mutated));
   EXPECT_TRUE(was_mutated);
 
-  std::unordered_map<string, Node*> index = NodeNameIndex(g);
+  std::unordered_map<string, Node*> index = g.BuildNodeNameIndex();
   Node* recv0 = index.at("recv0");
   Node* send0 = index.at("send0");
 
@@ -619,6 +631,31 @@ TEST_F(ConstantFoldingTest, ConstShapeKnown) {
   }
 }
 
+TEST_F(ConstantFoldingTest, NoReplacePartialOutput) {
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope().ExitOnError().WithAssignedDevice("/gpu:0");
+
+    auto c0 = ops::Const<float>(s.WithOpName("c0"), {5.0, 2.0, 8.0, 1.0}, {4});
+    auto k = ops::Const<int>(s.WithOpName("k"), 3);
+    auto topK =
+        ops::TopK(s.WithOpName("topK"), c0, k, ops::TopK::Sorted(false));
+    auto send_values = ops::_Send(s.WithOpName("send_values"), topK.values,
+                                  "send_values", "sender", 0, "receiver");
+    auto send_indices = ops::_Send(s.WithOpName("send_indices"), topK.indices,
+                                   "send_indices", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
+  bool was_mutated;
+  TF_EXPECT_OK(ConstantFold(
+      ConstantFoldingOptions{}, nullptr, Env::Default(),
+      FakeDevice::Make("/job:tpu_worker/replica:0/task:0/device:GPU:0",
+                       DEVICE_GPU)
+          .get(),
+      &g, &was_mutated));
+  EXPECT_FALSE(was_mutated);
+}
+
 namespace {
 
 const char kTestMemRegionName[] = "test://test";
diff --git a/tensorflow/core/common_runtime/device.cc b/tensorflow/core/common_runtime/device.cc
index 8fc64fff69a6252ed9860f8dcb75814cfd0785ff..9925814a48acf19162a39f07666a909db56e39e4 100644
--- a/tensorflow/core/common_runtime/device.cc
+++ b/tensorflow/core/common_runtime/device.cc
@@ -36,6 +36,8 @@ Device::~Device() {
   }
 }
 
+void Device::Sync(const DoneCallback& done) { done(Sync()); }
+
 // static
 DeviceAttributes Device::BuildDeviceAttributes(
     const string& name, DeviceType device, Bytes memory_limit,
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 2ef1547cd9a56de0750eac1583568a06720acb99..8dfbb21eda641ff9f70c58f1f4bf150ba4cceef3 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -55,6 +55,9 @@ class DeviceMgr;
 
 class Device : public DeviceBase {
  public:
+  // Callback type that takes a Status and returns void.
+  typedef std::function<void(const Status&)> DoneCallback;
+
   Device(Env* env, const DeviceAttributes& device_attributes);
   ~Device() override;
 
@@ -112,6 +115,13 @@ class Device : public DeviceBase {
   // at completion.
   virtual Status Sync() = 0;
 
+  // Calls the given callback when all operations queued on the device at the
+  // time of the call have completed. The callback is passed any error pending
+  // on the device at completion.
+  // TODO(b/112409994): Consolidate these two APIs, removing the synchronous
+  // version.
+  virtual void Sync(const DoneCallback& done);
+
   // Override this to return true for devices that require a Sync() call before
   // session completion.
   virtual bool RequiresSyncOnCompletion() const { return false; }
diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc
index b43c718817558f0e44eff5f5e5d5ec3a81d25ddd..0fad13fe1e747e219c40c5262877dce6a7544b8a 100644
--- a/tensorflow/core/common_runtime/device_factory.cc
+++ b/tensorflow/core/common_runtime/device_factory.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -89,9 +90,9 @@ DeviceFactory* DeviceFactory::GetFactory(const string& device_type) {
   return it->second.factory.get();
 }
 
-Status DeviceFactory::AddDevices(const SessionOptions& options,
-                                 const string& name_prefix,
-                                 std::vector<Device*>* devices) {
+Status DeviceFactory::AddDevices(
+    const SessionOptions& options, const string& name_prefix,
+    std::vector<std::unique_ptr<Device>>* devices) {
   // CPU first. A CPU device is required.
   auto cpu_factory = GetFactory("CPU");
   if (!cpu_factory) {
@@ -116,19 +117,24 @@ Status DeviceFactory::AddDevices(const SessionOptions& options,
   return Status::OK();
 }
 
-Device* DeviceFactory::NewDevice(const string& type,
-                                 const SessionOptions& options,
-                                 const string& name_prefix) {
+std::unique_ptr<Device> DeviceFactory::NewDevice(const string& type,
+                                                 const SessionOptions& options,
+                                                 const string& name_prefix) {
   auto device_factory = GetFactory(type);
   if (!device_factory) {
     return nullptr;
   }
   SessionOptions opt = options;
   (*opt.config.mutable_device_count())[type] = 1;
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(device_factory->CreateDevices(opt, name_prefix, &devices));
-  CHECK_EQ(devices.size(), size_t{1});
-  return devices[0];
+  int expected_num_devices = 1;
+  auto iter = options.config.device_count().find(type);
+  if (iter != options.config.device_count().end()) {
+    expected_num_devices = iter->second;
+  }
+  DCHECK_EQ(devices.size(), static_cast<size_t>(expected_num_devices));
+  return std::move(devices[0]);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/device_factory.h b/tensorflow/core/common_runtime/device_factory.h
index db50226fe895963778eafe8a49289889eae16b1f..b3cd7adca9c638d43400cfa04ec63db1437ed62c 100644
--- a/tensorflow/core/common_runtime/device_factory.h
+++ b/tensorflow/core/common_runtime/device_factory.h
@@ -40,18 +40,19 @@ class DeviceFactory {
   // CPU devices are added first.
   static Status AddDevices(const SessionOptions& options,
                            const string& name_prefix,
-                           std::vector<Device*>* devices);
+                           std::vector<std::unique_ptr<Device>>* devices);
 
   // Helper for tests.  Create a single device of type "type".  The
   // returned device is always numbered zero, so if creating multiple
   // devices of the same type, supply distinct name_prefix arguments.
-  static Device* NewDevice(const string& type, const SessionOptions& options,
-                           const string& name_prefix);
+  static std::unique_ptr<Device> NewDevice(const string& type,
+                                           const SessionOptions& options,
+                                           const string& name_prefix);
 
   // Most clients should call AddDevices() instead.
-  virtual Status CreateDevices(const SessionOptions& options,
-                               const string& name_prefix,
-                               std::vector<Device*>* devices) = 0;
+  virtual Status CreateDevices(
+      const SessionOptions& options, const string& name_prefix,
+      std::vector<std::unique_ptr<Device>>* devices) = 0;
 
   // Return the device priority number for a "device_type" string.
   //
diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc
index 470abc1431292820dec747110a60c08246470c3c..1f7d7c4699872e55a73ebab919936435684405fe 100644
--- a/tensorflow/core/common_runtime/device_mgr.cc
+++ b/tensorflow/core/common_runtime/device_mgr.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device_mgr.h"
 
+#include <memory>
 #include <vector>
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
@@ -24,32 +25,32 @@ limitations under the License.
 
 namespace tensorflow {
 
-DeviceMgr::DeviceMgr(const std::vector<Device*>& devices)
-    : name_backing_store_(128) {
-  for (Device* d : devices) {
+DeviceMgr::DeviceMgr(std::vector<std::unique_ptr<Device>> devices)
+    : devices_(std::move(devices)), name_backing_store_(128) {
+  for (auto& d : devices_) {
     CHECK(d->device_mgr_ == nullptr);
     d->device_mgr_ = this;
 
-    devices_.push_back(d);
-
     // Register under the (1) full name and (2) canonical name.
     for (const string& name :
          DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
-      device_map_[CopyToBackingStore(name)] = d;
+      device_map_[CopyToBackingStore(name)] = d.get();
     }
     // Register under the (3) local name and (4) legacy local name.
     for (const string& name :
          DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
-      device_map_[CopyToBackingStore(name)] = d;
+      device_map_[CopyToBackingStore(name)] = d.get();
     }
     device_type_counts_[d->device_type()]++;
   }
 }
 
-DeviceMgr::~DeviceMgr() {
-  // TODO(b/37437134): Remove destructor after converting to std::unique_ptr.
-  for (Device* p : devices_) delete p;
-}
+DeviceMgr::DeviceMgr(std::unique_ptr<Device> device)
+    : DeviceMgr([&device] {
+        std::vector<std::unique_ptr<Device>> vector;
+        vector.push_back(std::move(device));
+        return vector;
+      }()) {}
 
 StringPiece DeviceMgr::CopyToBackingStore(StringPiece s) {
   size_t n = s.size();
@@ -61,18 +62,22 @@ StringPiece DeviceMgr::CopyToBackingStore(StringPiece s) {
 void DeviceMgr::ListDeviceAttributes(
     std::vector<DeviceAttributes>* devices) const {
   devices->reserve(devices_.size());
-  for (Device* dev : devices_) {
+  for (const auto& dev : devices_) {
     devices->emplace_back(dev->attributes());
   }
 }
 
 std::vector<Device*> DeviceMgr::ListDevices() const {
-  return std::vector<Device*>(devices_.begin(), devices_.end());
+  std::vector<Device*> devices(devices_.size());
+  for (size_t i = 0; i < devices_.size(); ++i) {
+    devices[i] = devices_[i].get();
+  }
+  return devices;
 }
 
 string DeviceMgr::DebugString() const {
   string out;
-  for (Device* dev : devices_) {
+  for (const auto& dev : devices_) {
     strings::StrAppend(&out, dev->name(), "\n");
   }
   return out;
@@ -80,7 +85,7 @@ string DeviceMgr::DebugString() const {
 
 string DeviceMgr::DeviceMappingString() const {
   string out;
-  for (Device* dev : devices_) {
+  for (const auto& dev : devices_) {
     if (!dev->attributes().physical_device_desc().empty()) {
       strings::StrAppend(&out, dev->name(), " -> ",
                          dev->attributes().physical_device_desc(), "\n");
@@ -107,7 +112,7 @@ Status DeviceMgr::LookupDevice(StringPiece name, Device** device) const {
 
 void DeviceMgr::ClearContainers(gtl::ArraySlice<string> containers) const {
   Status s;
-  for (Device* dev : devices_) {
+  for (const auto& dev : devices_) {
     if (containers.empty()) {
       s.Update(dev->resource_manager()->Cleanup(
           dev->resource_manager()->default_container()));
diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h
index c1ff10d9b59cbba59bb89c7585a3b1c27111aaf6..bf8694655ae06fab590e4111488e3212e3e87ef7 100644
--- a/tensorflow/core/common_runtime/device_mgr.h
+++ b/tensorflow/core/common_runtime/device_mgr.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_
 
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -34,15 +35,17 @@ class DeviceAttributes;
 
 class DeviceMgr {
  public:
-  // Takes ownership of each device in 'devices'.
+  // Constructs a DeviceMgr from a list of devices.
   // TODO(zhifengc): Other initialization information.
-  // TODO(b/37437134): Use std::unique_ptr's to track ownership.
-  explicit DeviceMgr(const std::vector<Device*>& devices);
-  ~DeviceMgr();
+  explicit DeviceMgr(std::vector<std::unique_ptr<Device>> devices);
+
+  // Constructs a DeviceMgr managing a single device.
+  explicit DeviceMgr(std::unique_ptr<Device> device);
 
   // Returns attributes of all devices.
   void ListDeviceAttributes(std::vector<DeviceAttributes>* devices) const;
 
+  // Returns raw pointers to the underlying devices.
   std::vector<Device*> ListDevices() const;
 
   // Returns a string listing all devices.
@@ -62,9 +65,7 @@ class DeviceMgr {
   int NumDeviceType(const string& type) const;
 
  private:
-  // TODO(b/37437134): Use std::unique_ptr's to track ownership.
-  typedef gtl::InlinedVector<Device*, 8> DeviceVec;
-  DeviceVec devices_;
+  const std::vector<std::unique_ptr<Device>> devices_;
 
   StringPiece CopyToBackingStore(StringPiece s);
 
diff --git a/tensorflow/core/common_runtime/device_resolver_local_test.cc b/tensorflow/core/common_runtime/device_resolver_local_test.cc
index f5a6471ff731578d377ccfc9ad146847ae3f221c..54f1119e139886096cb7c2007e584003992d86c2 100644
--- a/tensorflow/core/common_runtime/device_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/device_resolver_local_test.cc
@@ -36,12 +36,12 @@ class DeviceResolverLocalTest : public ::testing::Test {
     string task_name = "/job:localhost/replica:0/task:0";
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
   }
 
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<DeviceResolverLocal> drl_;
 };
diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc
index fd9c4222a7afd4914415c9c62e1ced118ea75d1f..6a8c3d14e543a74354bae77518e9f88502813463 100644
--- a/tensorflow/core/common_runtime/device_set_test.cc
+++ b/tensorflow/core/common_runtime/device_set_test.cc
@@ -57,7 +57,7 @@ class DeviceSetTest : public ::testing::Test {
 class DummyFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override {
+                       std::vector<std::unique_ptr<Device>>* devices) override {
     return Status::OK();
   }
 };
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 458e133b682d1bb00eec4f29b3b49f1ce932cc1e..0434ca47b68f28ff65cb3d5e165bc5545ebe96f0 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
@@ -64,6 +65,7 @@ limitations under the License.
 #include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/env_var.h"
@@ -154,12 +156,12 @@ class DirectSessionFactory : public SessionFactory {
     if (options.config.graph_options().build_cost_model() > 0) {
       EnableCPUAllocatorFullStats(true);
     }
-    std::vector<Device*> devices;
+    std::vector<std::unique_ptr<Device>> devices;
     TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
         options, "/job:localhost/replica:0/task:0", &devices));
 
     DirectSession* session =
-        new DirectSession(options, new DeviceMgr(devices), this);
+        new DirectSession(options, new DeviceMgr(std::move(devices)), this);
     {
       mutex_lock l(sessions_lock_);
       sessions_.push_back(session);
@@ -252,11 +254,19 @@ static RunHandlerPool* GetOrCreateRunHandlerPool(
   return pool;
 }
 
-bool DirectSession::ShouldUseRunHandlerPool() const {
-  if (options_.config.session_inter_op_thread_pool_size() > 0 ||
-      options_.config.use_per_session_threads()) {
+bool DirectSession::ShouldUseRunHandlerPool(
+    const RunOptions& run_options) const {
+  if (options_.config.use_per_session_threads()) return false;
+  if (options_.config.session_inter_op_thread_pool_size() > 0 &&
+      run_options.inter_op_thread_pool() > 0)
     return false;
-  }
+  // Only use RunHandlerPool when:
+  // a. Single global thread pool is used for inter-op parallelism.
+  // b. When multiple inter_op_thread_pool(s) are created, use it only while
+  // running sessions on the default inter_op_thread_pool=0. Typically,
+  // servo-team uses inter_op_thread_pool > 0 for model loading.
+  // TODO(crk): Revisit whether we'd want to create one (static) RunHandlerPool
+  // per entry in session_inter_op_thread_pool() in the future.
   return true;
 }
 
@@ -453,6 +463,10 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
                                   CallFrameInterface* call_frame,
                                   ExecutorsAndKeys* executors_and_keys,
                                   RunMetadata* run_metadata) {
+  const uint64 start_time_usecs = Env::Default()->NowMicros();
+  string session_id_meta = strings::StrCat("SessionRun #id=", step_id, "#");
+  tracing::ScopedActivity activity(session_id_meta);
+
   const int64 executor_step_count = executors_and_keys->step_count.fetch_add(1);
 
   std::unique_ptr<DebuggerStateInterface> debugger_state;
@@ -599,9 +613,8 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
   }
 
   std::unique_ptr<RunHandler> handler;
-  if (ShouldUseRunHandlerPool() &&
+  if (ShouldUseRunHandlerPool(run_options) &&
       run_options.experimental().use_run_handler_pool()) {
-    // Non-null only when a global inter-op pool is used.
     VLOG(1) << "Using RunHandler to scheduler inter-op closures.";
     handler = GetOrCreateRunHandlerPool(options_)->Get();
   }
@@ -705,6 +718,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
       exec_and_lib.graph->ToGraphDef(partition_graph_def);
     }
   }
+  UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
 
   return Status::OK();
 }
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 3a168bbe3fcb08167465ab75a155e2d2b4038046..6754e9cfb71700090049107cf4dd122175527ffe 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -247,8 +247,10 @@ class DirectSession : public Session {
                                    ExecutorsAndKeys* executors_and_keys,
                                    RunMetadata* run_metadata);
 
-  // Returns whether inter-op execution uses a global pool.
-  bool ShouldUseRunHandlerPool() const;
+  // Returns whether inter-op execution uses a global pool or the input
+  // `run_options` requests being run on inter_op_thread_pool = 0 in case
+  // multiple pools are configured.
+  bool ShouldUseRunHandlerPool(const RunOptions& run_options) const;
 
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
       EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_);
diff --git a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
index 2c63b8704ee1d08d643c9e90940c3897fbb1358b..6a265c468c1fe617d38e539fac20fb0cba294afe 100644
--- a/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc
@@ -107,26 +107,20 @@ TEST(DirectSessionWithTrackingAllocTest, CostModelTest) {
         EXPECT_EQ(2, shape.dim_size());
         EXPECT_EQ(2, shape.dim(0).size());
         EXPECT_EQ(1, shape.dim(1).size());
+        // if MKL is used, it goes through additional
+        // graph rewrite pass on top of Tensorflow.
+        // In TF, every time a graph pass
+        // happens, "constant" nodes are allocated
+        // and deallocated. Each allocation calls the
+        // (FindChunkPtr of BFCAllocator),
+        // which increments the value of AllocationId.
+        // Thus AllocationId of MKL can differ with TF if
+        // someone changes the relevant codes in BFCAllocator.
+        // Currently they are the same.
         if (node->name() == y->name()) {
-#if defined(INTEL_MKL) && defined(ENABLE_MKL)
-          // if MKL is used, it goes through various additional
-          // graph rewrite pass. In TF, everytime a graph pass
-          // happens, "constant" nodes are allocated
-          // and deallocated. Each allocation calls the
-          // (FindChunkPtr of BFCAllocator),
-          // which increments the value of AllocationId.
-          // Thus AllocationId becomes more than TF if MKL
-          // is used. Now IDs for MKL are 8 more than TF.
-          EXPECT_EQ(21, cm->AllocationId(node, 0));
-#else
           EXPECT_EQ(13, cm->AllocationId(node, 0));
-#endif  // INTEL_MKL && ENABLE_MKL
         } else {
-#if defined(INTEL_MKL) && defined(ENABLE_MKL)
-          EXPECT_EQ(22, cm->AllocationId(node, 0));
-#else
           EXPECT_EQ(14, cm->AllocationId(node, 0));
-#endif  // INTEL_MKL && ENABLE_MKL
         }
       }
       EXPECT_LE(0, cm->MaxExecutionTime(node));
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 7b74c67c85865944332e7d628d299c0c11f1ae0f..86890ba07d8b9a4320c47ffde1b3b8d78d15ac5a 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -148,6 +148,7 @@ tf_cuda_library(
     ],
     visibility = ["//tensorflow:internal"],
     deps = [
+        ":attr_builder",
         "@farmhash_archive//:farmhash",
     ] + select({
         "//tensorflow:android": [
@@ -180,6 +181,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -219,7 +221,6 @@ tf_cuda_library(
     hdrs = ["attr_builder.h"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":kernel_and_device",
         "@farmhash_archive//:farmhash",
         # Only the TF_AttrType enum is required, so pull in just the C headers.
         # TODO(b/113535673): Break this dependency and avoid the C header completely.
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index cf1cd4134e94fcf3486ffb89a1e1972100887b9d..a750f8cbba4de4abd33d6ec395b6b0a5fb76cc67 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/attr_builder.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
-#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -40,6 +39,18 @@ std::unordered_map<string, const AttrTypeMap*>* OpNameToAttrTypeMap() {
 
 const uint32 kIsList = 1U << 31;
 
+AttrTypeMap* DefaultFunctionAttrTypeMap() {
+  AttrTypeMap* map = new AttrTypeMap();
+  (*map)["executor_type"] = TF_ATTR_STRING;
+  (*map)["config"] = TF_ATTR_STRING;
+  return map;
+}
+
+const AttrTypeMap* GetDefaultFunctionAttrTypeMap() {
+  static const AttrTypeMap* map = DefaultFunctionAttrTypeMap();
+  return map;
+}
+
 }  // namespace
 
 Status OpDefForOp(const char* op_name, const OpDef** op_def) {
@@ -51,13 +62,27 @@ Status OpDefForOp(const char* op_name, const OpDef** op_def) {
   return s;
 }
 
-Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
+Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
+                        bool* is_function) {
   mutex_lock l(g_op_name_to_attr_type_map_lock);
+  *is_function = false;
   *out = gtl::FindPtrOrNull(*OpNameToAttrTypeMap(), op_name);
   if (*out != nullptr) return Status::OK();
   const OpDef* op_def = nullptr;
   Status s = OpDefForOp(op_name, &op_def);
-  if (!s.ok()) return s;
+  if (errors::IsNotFound(s)) {
+    // If we did not find the op def, we assume `op_name` is a function.
+    // If it is actually a misspelled op, user will get another error when
+    // trying to run it.
+    // TODO(iga): If we ever have a use case for different attribute specs
+    // in different functions, we will need to look at the OpDef in the
+    // function def to retrieve their types.
+    *out = GetDefaultFunctionAttrTypeMap();
+    *is_function = true;
+    return Status::OK();
+  } else if (!s.ok()) {
+    return s;
+  }
   std::unique_ptr<AttrTypeMap> m(new AttrTypeMap);
   // TODO(agarwal): Avoid having to create this "registry" at runtime,
   // perhaps can be done at op registration time?
@@ -99,7 +124,7 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
 #define DEFINE_SET_ATTR(value_type, value_field)                             \
   template <>                                                                \
   AttrBuilder& AttrBuilder::Set(StringPiece attr_name, value_type&& value) { \
-    value_field.push_back(std::make_pair(attr_name, value));                 \
+    value_field.push_back(std::make_pair(string(attr_name), value));         \
     return *this;                                                            \
   }
 
@@ -136,6 +161,22 @@ void AttrBuilder::FillAttrValueMap(AttrValueMap* m,
       m->insert(*it);
     }
   }
+  // For any attr-value pairs that exist in the op def (from op registry) but
+  // not `m`, fill them into `m`, so that we can run a TFE_Op without having to
+  // specify all the default attr values (e.g. for matmul, the `transpose_a`
+  // attr defaults to false).
+  const OpDef* op_def = nullptr;
+  Status s = OpDefForOp(op_name_.c_str(), &op_def);
+  // This is expected, if this op is a custom function, and is therefore not
+  // present in the op registry.
+  if (!s.ok()) return;
+
+  DCHECK(op_def);
+  for (const auto& attr_def : op_def->attr()) {
+    if (attr_def.has_default_value() && !m->count(attr_def.name())) {
+      SetInAttrValueMap(m, attr_def.name(), attr_def.default_value());
+    }
+  }
 }
 
 const NodeDef& AttrBuilder::BuildNodeDef() {
@@ -169,7 +210,7 @@ namespace {
 inline tensorflow::Fprint128 FingerprintCat128(const tensorflow::Fprint128& a,
                                                const tensorflow::Fprint128& b) {
   return {tensorflow::FingerprintCat64(a.low64, b.low64),
-          tensorflow::FingerprintCat64(a.low64, b.low64)};
+          tensorflow::FingerprintCat64(a.high64, b.high64)};
 }
 
 void CombineUnordered(const tensorflow::Fprint128& a,
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index cbe6a1cb50ebaee85972c69c8c03ff8e1c3f70e7..5e0172dfd328dbd4f16abdce879be1d1338e692c 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -23,7 +23,6 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/common_runtime/device.h"
-#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -44,7 +43,11 @@ typedef std::unordered_map<string, uint32> AttrTypeMap;
 Status OpDefForOp(const char* op_name, const OpDef** op_def);
 
 // Returns the AttrTypeMap for the TensorFlow operation named op_name.
-Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out);
+// If op_name is not registered in global op registry, AttrTypeMapForOp assumes
+// the op to be a function and returns the default attributes for a function.
+// `is_function` is set to true in this case.
+Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
+                        bool* is_function);
 
 // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
 Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
@@ -96,7 +99,7 @@ class AttrBuilder {
   template <class T>
   AttrBuilder& Set(StringPiece attr_name, T&& value) {
     MayBeInitializeNodeDef();
-    SetInAttrValueMap(node_def_->mutable_attr(), attr_name, value);
+    SetInAttrValueMap(node_def_->mutable_attr(), string(attr_name), value);
     return *this;
   }
 
@@ -107,13 +110,19 @@ class AttrBuilder {
 
  private:
   template <class T>
-  using AttrVec = tensorflow::gtl::InlinedVector<std::pair<StringPiece, T>, 2>;
+  using AttrVec = tensorflow::gtl::InlinedVector<std::pair<string, T>, 2>;
 
   void MayBeInitializeNodeDef();
+  // Fill `m` with the attr-value pairs set via AttrBuilder::Set() so far, as
+  // well as any default attr-value pairs from the associated op_def, if there
+  // is one.
+  //
+  // If `include_those_in_node_def` is true, also include any attr-value pairs
+  // from `node_def_`.
   void FillAttrValueMap(AttrValueMap* m, bool include_those_in_node_def) const;
 
   template <class T>
-  void SetInAttrValueMap(AttrValueMap* m, StringPiece attr_name,
+  void SetInAttrValueMap(AttrValueMap* m, const string& attr_name,
                          T&& value) const {
     DCHECK(!node_def_finalized_)
         << "Calling SetInAttrValueMap after BuildNodeDef.";
@@ -122,12 +131,12 @@ class AttrBuilder {
     AttrValue attr_value;
     if (found == nullptr) {
       SetAttrValue(value, &attr_value);
-      m->insert(AttrValueMap::value_type(string(attr_name), attr_value));
+      m->insert(AttrValueMap::value_type(attr_name, attr_value));
     } else {
       // TODO(ashankar): Do what is done in
       // NodeDefBuilder::CheckInconsistency(attr_name, *found, attr_value);
       SetAttrValue(std::forward<T>(value), &attr_value);
-      (*m)[string(attr_name)] = attr_value;
+      (*m)[attr_name] = attr_value;
     }
   }
 
@@ -151,7 +160,6 @@ template <>
 AttrBuilder& AttrBuilder::Set(StringPiece attr_name,
                               tensorflow::DataType&& value);
 
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_ATTR_BUILDER_H_
diff --git a/tensorflow/core/common_runtime/eager/attr_builder_test.cc b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
index 79b094f2e008786661b0236bc7bcdb3f37a23946..220cc6f5ce0bff32cfdc8d4e837c6900c773728e 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder_test.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
@@ -35,9 +35,18 @@ namespace {
 
 TEST(AttrTypeMap, Lookup) {
   const AttrTypeMap* m = nullptr;
-  Status s = AttrTypeMapForOp("ThisOpCannotPossiblyExist", &m);
-  EXPECT_FALSE(s.ok());
-  s = AttrTypeMapForOp("MatMul", &m);
+  // Unknown ops are assumed to be functions.
+  // Their maps are filled with default attributes.
+  bool is_function = false;
+  Status s = AttrTypeMapForOp("SomeFunctionName", &m, &is_function);
+  EXPECT_TRUE(s.ok());
+  EXPECT_TRUE(is_function);
+  EXPECT_EQ(TF_ATTR_STRING, m->find("executor_type")->second);
+  EXPECT_EQ(TF_ATTR_STRING, m->find("config")->second);
+
+  is_function = true;
+  s = AttrTypeMapForOp("MatMul", &m, &is_function);
+  EXPECT_FALSE(is_function);
   ASSERT_TRUE(s.ok()) << s;
 
   TF_AttrType t;
@@ -50,7 +59,7 @@ TEST(AttrTypeMap, Lookup) {
   EXPECT_EQ(TF_ATTR_BOOL, t);
   EXPECT_EQ(is_list, 0);
 
-  s = AttrTypeMapForOp("Squeeze", &m);
+  s = AttrTypeMapForOp("Squeeze", &m, &is_function);
   ASSERT_TRUE(s.ok()) << s;
   s = AttrTypeByName(*m, "squeeze_dims", &t, &is_list);
   ASSERT_TRUE(s.ok()) << s;
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 18420b60fd238fb30f47011e93a44681b2654f41..1727c045604bd19e038857fa34780f34cbb05d44 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/eager/context.h"
 
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -32,18 +35,6 @@ bool ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val) {
   return default_val;
 }
 
-std::unique_ptr<thread::ThreadPool> EagerThreadPool(
-    const SessionOptions& opts) {
-  SessionOptions opts_copy(opts);
-  if (opts_copy.config.inter_op_parallelism_threads() == 0) {
-    // Eager defaults to a single thread when no threads are specified.
-    opts_copy.config.set_inter_op_parallelism_threads(1);
-  }
-
-  return std::unique_ptr<thread::ThreadPool>(
-      NewThreadPoolFromSessionOptions(opts_copy));
-}
-
 }  // namespace
 
 EagerContext::EagerContext(const SessionOptions& opts,
@@ -61,7 +52,7 @@ EagerContext::EagerContext(const SessionOptions& opts,
     : policy_(default_policy),
       devices_(device_mgr->ListDevices()),
       rendezvous_(rendezvous),
-      thread_pool_(EagerThreadPool(opts)),
+      thread_pool_(NewThreadPoolFromSessionOptions(opts)),
       pflr_(new ProcessFunctionLibraryRuntime(
           device_mgr, opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_, {},
           thread_pool_.get())),
@@ -70,7 +61,9 @@ EagerContext::EagerContext(const SessionOptions& opts,
       async_default_(async),
       log_memory_(LogMemory::IsEnabled()),
       env_(opts.env),
-      use_send_tensor_rpc_(false) {
+      use_send_tensor_rpc_(false),
+      pin_small_ops_to_cpu_(ReadBoolFromEnvVar(
+          "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", true)) {
   if (device_mgr_owned) {
     local_device_manager_.reset(device_mgr);
     local_unowned_device_manager_ = nullptr;
@@ -81,6 +74,13 @@ EagerContext::EagerContext(const SessionOptions& opts,
   runner_ = [this](std::function<void()> closure) {
     this->thread_pool_->Schedule(std::move(closure));
   };
+
+  std::unique_ptr<DeviceResolverInterface> drl(
+      new DeviceResolverLocal(local_device_mgr()));
+  std::unique_ptr<ParamResolverInterface> cprl(new CollectiveParamResolverLocal(
+      local_device_mgr(), drl.get(), "/job:localhost/replica:0/task:0"));
+  collective_executor_mgr_.reset(new CollectiveExecutorMgr(
+      opts.config, local_device_mgr(), std::move(drl), std::move(cprl)));
 }
 
 void EagerContext::InitDeviceMapAndAsync() {
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 5ed6057ec6d55b70ce4710410364b3eb226747c9..cdef94789337550fdaa760638f098ba47af5dfdb 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #endif
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -131,6 +132,8 @@ class EagerContext {
 
   Device* HostCPU() { return devices_[0]; }
 
+  GraphCollector* GetGraphCollector() { return &graph_collector_; }
+
   uint64 NextId() { return executor_.NextId(); }
 
   void ExecutorAdd(EagerNode* node) { executor_.Add(node); }
@@ -145,6 +148,11 @@ class EagerContext {
   bool LogMemory() { return log_memory_; }
 
   Rendezvous* GetRendezvous() { return rendezvous_; }
+  std::unique_ptr<CollectiveExecutor::Handle> GetCollectiveExecutorHandle() {
+    return std::unique_ptr<CollectiveExecutor::Handle>(
+        new CollectiveExecutor::Handle(
+            collective_executor_mgr_->FindOrCreate(0), true /*inherit_ref*/));
+  }
 
   const tensorflow::DeviceMgr* local_device_mgr() const {
     return (local_device_manager_ != nullptr) ? local_device_manager_.get()
@@ -202,6 +210,9 @@ class EagerContext {
   // EagerService.SendTensor RPC. If false, _Send/_Recv ops should be used
   // instead (which in-turn use WorkerService.RecvTensor RPCs).
   bool UseSendTensorRPC() { return use_send_tensor_rpc_; }
+  bool PinSmallOpsToCPU() { return pin_small_ops_to_cpu_; }
+
+  tensorflow::Env* TFEnv() const { return env_; }
 
  private:
   void InitDeviceMapAndAsync();
@@ -248,6 +259,7 @@ class EagerContext {
   std::atomic<bool> should_store_metadata_{false};
   mutex metadata_mu_;
   RunMetadata run_metadata_ GUARDED_BY(metadata_mu_);
+  GraphCollector graph_collector_;
   const bool log_device_placement_;
   // EagerExecutor for async execution.
   EagerExecutor executor_;
@@ -267,6 +279,8 @@ class EagerContext {
 
   Env* const env_;
 
+  std::unique_ptr<CollectiveExecutorMgrInterface> collective_executor_mgr_;
+
 #ifndef __ANDROID__
   void CloseRemoteContexts();
 
@@ -293,6 +307,7 @@ class EagerContext {
 #endif
 
   bool use_send_tensor_rpc_;
+  const bool pin_small_ops_to_cpu_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
index 8a887540b066055fc1f59e64e0cead9f2512178e..5bc3bb689e076467672af85d28bb340b56e7ee79 100644
--- a/tensorflow/core/common_runtime/eager/copy_to_device_node.h
+++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
@@ -30,7 +30,7 @@ class CopyToDeviceNode : public EagerNode {
         src_(src),
         dstd_(dstd),
         ctx_(ctx),
-        dst_(new TensorHandle(id, src_->dtype, ctx)) {
+        dst_(new TensorHandle(id, dstd_, dstd_, src->dtype, ctx)) {
     src_->Ref();
     dst_->Ref();
   }
@@ -44,13 +44,11 @@ class CopyToDeviceNode : public EagerNode {
     TensorHandle* temp = nullptr;
     TF_RETURN_IF_ERROR(src_->CopyToDevice(ctx_, dstd_, &temp));
     const Tensor* tensor = nullptr;
-    Device* device = nullptr;
-    Device* op_device = nullptr;
-    Status status = temp->TensorAndDevice(&tensor, &device, &op_device);
+    Status status = temp->Tensor(&tensor);
     // `temp` is a ready handle. So the following call should return OK.
     TF_DCHECK_OK(status) << status.error_message();
     DCHECK(tensor);
-    dst_->SetTensorAndDevice(*tensor, device, op_device);
+    dst_->SetTensor(*tensor);
     temp->Unref();
     return Status::OK();
   }
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
index fcf62c7715320466a49c707e31cf7a5045f16b8e..935ca7f9aa766a69582b4c94fec6c508e3f5a369 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.h
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -22,11 +22,14 @@ limitations under the License.
 namespace tensorflow {
 class EagerOperation {
  public:
-  // t is NULL iff the EagerOperation corresponds to a TensorFlow function
-  // instead of a primitive operation.
   EagerOperation(tensorflow::EagerContext* ctx, const char* op,
-                 const tensorflow::AttrTypeMap* t)
-      : ctx_(ctx), name_(op), attrs_(op), attr_types_(t), device_(nullptr) {}
+                 bool is_function, const tensorflow::AttrTypeMap* t)
+      : ctx_(ctx),
+        name_(op),
+        attrs_(op),
+        attr_types_(t),
+        device_(nullptr),
+        is_function_(is_function) {}
 
   ~EagerOperation() {
     for (tensorflow::TensorHandle* h : inputs_) {
@@ -34,7 +37,7 @@ class EagerOperation {
     }
   }
 
-  bool is_function() const { return attr_types_ == nullptr; }
+  bool is_function() const { return is_function_; }
 
   tensorflow::EagerContext* EagerContext() { return ctx_; }
 
@@ -68,6 +71,7 @@ class EagerOperation {
   tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs_;
   tensorflow::Device* device_;
   bool use_xla_ = false;
+  const bool is_function_;
 };
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 1bc63616d0ae7f01a27d526ef81b584f4a20706d..783baa96c92f224e45404e5f6586011599f02292 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/lib/core/errors.h"
 #ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/eager/remote_execute_node.h"
@@ -32,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/env.h"
@@ -84,8 +86,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
                                       RunMetadata* run_metadata,
                                       TensorHandle** handle) {
   EagerContext* ctx = op->EagerContext();
-  Device* handle_device = nullptr;
-  TF_RETURN_IF_ERROR((*handle)->Device(&handle_device));
+  Device* handle_device = (*handle)->device();
   const Device* actual_device =
       handle_device == nullptr ? ctx->HostCPU() : handle_device;
   const Device* op_device =
@@ -192,23 +193,23 @@ Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
 }
 
 Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
-  DeviceTypeVector final_devices;
+  PrioritizedDeviceTypeVector final_devices;
   TF_RETURN_IF_ERROR(SupportedDeviceTypesForNode(
       ctx->prioritized_device_type_list(), ndef, &final_devices));
   if (final_devices.empty()) {
-    return errors::Internal(
-        "Could not find valid device for node.\nNode: ", SummarizeNodeDef(ndef),
-        "\nAll kernels registered for op ", ndef.op(), " :\n",
-        KernelsRegisteredForOp(ndef.op()));
+    return errors::Internal("Could not find valid device for node.\nNode: ",
+                            FormatNodeDefForError(ndef),
+                            "\nAll kernels registered for op ", ndef.op(),
+                            " :\n", KernelsRegisteredForOp(ndef.op()));
   }
   for (Device* d : *ctx->devices()) {
-    if (d->device_type() == final_devices[0].type_string()) {
+    if (d->device_type() == final_devices[0].first.type_string()) {
       *device = d;
       return Status::OK();
     }
   }
   return errors::Unknown("Could not find a device for node ",
-                         SummarizeNodeDef(ndef));
+                         FormatNodeDefForError(ndef));
 }
 
 Status GetOutputDTypes(EagerOperation* op, DataTypeVector* output_dtypes) {
@@ -262,7 +263,8 @@ Status EagerLocalExecute(EagerOperation* op,
     // Note that it is not ideal, but currently ok, to set this
     // attribute after computing the kernel cache key above.
     if (op->is_function() && device != nullptr &&
-        device->device_type() == "TPU") {
+        (device->device_type() == "TPU" || device->device_type() == "XLA_GPU" ||
+         device->device_type() == "XLA_CPU")) {
       op->MutableAttrs()->Set(kXlaCompileAttr, true);
     }
 
@@ -276,33 +278,21 @@ Status EagerLocalExecute(EagerOperation* op,
       LOG(INFO) << "Executing op " << ndef.op() << " in device "
                 << device->name();
     }
-    kernel = new KernelAndDevice(ctx->GetRendezvous(), ctx->LogMemory());
-    auto* flr = ctx->func_lib(device);
 
+    auto* flr = ctx->func_lib(device);
     if (flr == nullptr) {
       return errors::Unavailable(
           "Unable to find a FunctionLibraryRuntime corresponding to device ",
           device->name());
     }
+    kernel = new KernelAndDevice(ctx->GetRendezvous(), ctx->LogMemory(),
+                                 ctx->GetCollectiveExecutorHandle());
     status = KernelAndDevice::Init(ndef, flr, ctx->runner(), kernel);
     if (!status.ok()) {
       delete kernel;
       return status;
     }
-    // Update output_dtypes inside `kernel`.
-    const OpDef* op_def = nullptr;
-    const FunctionDef* function_def = ctx->FuncLibDef()->Find(ndef.op());
-    if (function_def != nullptr) {
-      op_def = &(function_def->signature());
-    }
-    if (op_def == nullptr) {
-      status = OpDefForOp(ndef.op().c_str(), &op_def);
-      if (!status.ok()) return status;
-    }
-    DataTypeVector input_dtypes;
-    status = InOutTypesForNode(ndef, *op_def, &input_dtypes,
-                               kernel->mutable_output_dtypes());
-    if (!status.ok()) return status;
+
     ctx->AddKernelToCache(cache_key, kernel);
   }
   const DataTypeVector& output_dtypes = kernel->output_dtypes();
@@ -323,7 +313,11 @@ Status EagerLocalExecute(EagerOperation* op,
       ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
   if (!status.ok()) return status;
   std::unique_ptr<NodeExecStats> maybe_stats;
+  StepStats* maybe_step_stats = nullptr;
+  GraphCollector* graph_collector = nullptr;
   if (ctx->ShouldStoreMetadata()) {
+    graph_collector = ctx->GetGraphCollector();
+    maybe_step_stats = ctx->RunMetadataProto()->mutable_step_stats();
     int64 now_nanos = Env::Default()->NowNanos();
     maybe_stats.reset(new NodeExecStats);
     maybe_stats->set_node_name(op->Name());
@@ -342,17 +336,20 @@ Status EagerLocalExecute(EagerOperation* op,
     // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
     tensorflow::uint64 id = ctx->NextId();
     for (int i = 0; i < *num_retvals; ++i) {
-      (*retvals)[i] = new TensorHandle(id, output_dtypes[i], ctx);
+      (*retvals)[i] = new TensorHandle(id, /* d= */ kernel->OutputDevice(i),
+                                       /* op_device= */ kernel->device(),
+                                       output_dtypes[i], ctx);
     }
-    EagerNode* node =
-        new ExecuteNode(id, ctx, op->Device(), op->Inputs(), kernel,
-                        maybe_stats.release(), output_dtypes, *retvals);
+    EagerNode* node = new ExecuteNode(
+        id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
+        maybe_step_stats, graph_collector, output_dtypes, *retvals);
     ctx->ExecutorAdd(node);
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
     status = EagerExecute(ctx, op->Device(), op->Inputs(), kernel,
-                          maybe_stats.get(), retvals->data(), *num_retvals);
+                          maybe_stats.get(), maybe_step_stats, graph_collector,
+                          retvals->data(), *num_retvals);
   }
 
   return status;
@@ -424,8 +421,23 @@ Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
   request.set_op_id(ctx->NextId());
   request.set_device_name(recv_device->name());
 
+  Device* tensor_handle_device = h->device();
+
+  // AsProtoTensorContent doesn't work when the tensor is on the GPU, hence copy
+  // it to the CPU before copying it out.
+  // TODO(nareshmodi): this is currently slow, but can be fixed by making tensor
+  // handles aware of more than one device.
+  TensorHandle* actual_handle;
+  if (tensor_handle_device != nullptr &&
+      tensor_handle_device->device_type() != "CPU") {
+    TF_RETURN_IF_ERROR(h->CopyToDevice(ctx, ctx->HostCPU(), &actual_handle));
+  } else {
+    actual_handle = h;
+    actual_handle->Ref();
+  }
+
   const Tensor* tensor;
-  TF_RETURN_IF_ERROR(h->Tensor(&tensor));
+  TF_RETURN_IF_ERROR(actual_handle->Tensor(&tensor));
   tensor->AsProtoTensorContent(request.add_tensors());
 
   const tensorflow::uint64 id = request.op_id();
@@ -449,6 +461,8 @@ Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
                              recv_device, recv_device, ctx);
   (*result)->SetRemoteShape(MakeUnique<TensorShape>(tensor->shape()));
 
+  actual_handle->Unref();
+
   return Status::OK();
 #endif
 }
@@ -474,8 +488,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   auto* remote_op = request->add_queue()->mutable_operation();
 
   for (int i = 0; i < op->Inputs().size(); i++) {
-    tensorflow::Device* input_device;
-    TF_RETURN_IF_ERROR(op->Inputs()[i]->Device(&input_device));
+    tensorflow::Device* input_device = op->Inputs()[i]->device();
     if (op->Device() != input_device &&
         // If the expected and actual devices are on the same task, don't
         // explicitly copy, and instead depend on the copy to happen locally
@@ -579,19 +592,39 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   return Status::OK();
 #endif
 }
-}  // namespace
 
-Status EagerExecute(EagerOperation* op,
-                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
-                    int* num_retvals) {
-  // Ensure all resource-touching ops run in the device the resource is,
-  // regardless of anything else that has been specified. This is identical to
-  // the graph mode behavior.
+// These ops are not pinnable since they generate data. It can be slower to
+// generate and then copy the data instead of just generating the data on the
+// device directly.
+bool IsPinnableOp(const string& op_type) {
+  static const gtl::FlatSet<string>* unpinnable_ops = new gtl::FlatSet<string>({
+      "RandomUniform",
+      "RandomUniformInt",
+      "RandomNormal",
+      "StatelessRandomUniform",
+      "StatelessRandomUniformInt",
+      "StatelessRandomNormal",
+  });
+
+  return unpinnable_ops->find(op_type) == unpinnable_ops->end();
+}
+
+// The Op device may be updated if:
+// - A resource touching input is specified: all resource-touching ops run in
+// the device the resource is, regardless of anything else that has been
+// specified. This is identical to the graph mode behavior.
+//
+// - All op inputs are on the CPU, small (<64 elements) and integers
+// (int32/int64). This can be disabled by setting the environment variable
+// "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false".
+Status MaybeUpdateOpDevice(EagerOperation* op) {
   EagerContext* ctx = op->EagerContext();
+  bool device_set_for_resource_variable = false;
+  bool all_inputs_eligible_for_cpu_pinning =
+      ctx->PinSmallOpsToCPU() && IsPinnableOp(op->Name());
+
   for (int i = 0; i < op->Inputs().size(); ++i) {
-    Device* input_op_device = nullptr;
-    auto status = op->Inputs()[i]->OpDevice(&input_op_device);
-    if (!status.ok()) return status;
+    Device* input_op_device = op->Inputs()[i]->op_device();
     VLOG(2) << "for op " << op->Name() << " input " << i << " "
             << DataTypeString(op->Inputs()[i]->dtype) << " "
             << (input_op_device == nullptr ? "cpu" : input_op_device->name())
@@ -603,8 +636,53 @@ Status EagerExecute(EagerOperation* op,
               << d->name() << " because input #" << i
               << " is a resource in this device.";
       op->SetDevice(d);
+
+      device_set_for_resource_variable = true;
+      all_inputs_eligible_for_cpu_pinning = false;
+    } else if (all_inputs_eligible_for_cpu_pinning) {
+      TensorHandle* handle = op->Inputs()[i];
+
+      // Input is on CPU.
+      if (input_op_device != nullptr && input_op_device != ctx->HostCPU()) {
+        all_inputs_eligible_for_cpu_pinning = false;
+        continue;
+      }
+
+      if (handle->dtype != DataType::DT_INT32 &&
+          handle->dtype != DataType::DT_INT64) {
+        all_inputs_eligible_for_cpu_pinning = false;
+        continue;
+      }
+
+      int64 num_elements;
+      TF_RETURN_IF_ERROR(handle->NumElements(&num_elements));
+      if (num_elements > 64) {
+        all_inputs_eligible_for_cpu_pinning = false;
+      }
     }
   }
+
+  // Ops without inputs are usually ops that generate a tensor in some way and
+  // usually require being present on whatever device they are scheduled on
+  // - for e.g. VarHandleOp or _Recv).
+  // TODO(nareshmodi): Is it possible there is no int32/int64 CPU kernel for
+  // an op, but there is a GPU kernel?
+  if (!op->Inputs().empty() && all_inputs_eligible_for_cpu_pinning) {
+    VLOG(1) << "Forcing op " << op->Name()
+            << " to be on the CPU since all input tensors have an "
+               "int32/int64 dtype, and are small (less than 64 elements).";
+    op->SetDevice(ctx->HostCPU());
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status EagerExecute(EagerOperation* op,
+                    gtl::InlinedVector<TensorHandle*, 2>* retvals,
+                    int* num_retvals) {
+  TF_RETURN_IF_ERROR(MaybeUpdateOpDevice(op));
+
   bool op_is_local = IsLocal(op->EagerContext(), op->Device());
 
   if (op_is_local) {
@@ -622,7 +700,9 @@ Status EagerExecute(EagerOperation* op,
 Status EagerExecute(EagerContext* ctx, Device* device,
                     const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                     KernelAndDevice* kernel, NodeExecStats* maybe_stats,
-                    TensorHandle** retvals, int num_retvals) {
+                    StepStats* maybe_step_stats,
+                    GraphCollector* graph_collector, TensorHandle** retvals,
+                    int num_retvals) {
   if (device == nullptr) {
     // TODO(apassos) debug how the assignment below might return a different
     // device from the one requested above.
@@ -643,9 +723,11 @@ Status EagerExecute(EagerContext* ctx, Device* device,
   // TODO(agarwal): change Run to take vector of handles ?
   ScopedStepContainer* container = ctx->StepContainer();
   if (container == nullptr) {
-    TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats));
+    TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats,
+                                   maybe_step_stats, graph_collector));
   } else {
-    TF_RETURN_IF_ERROR(kernel->Run(container, &inputs, &outputs, maybe_stats));
+    TF_RETURN_IF_ERROR(kernel->Run(container, &inputs, &outputs, maybe_stats,
+                                   maybe_step_stats, graph_collector));
   }
   if (maybe_stats != nullptr) {
     int64 nanos = Env::Default()->NowNanos();
@@ -657,6 +739,14 @@ Status EagerExecute(EagerContext* ctx, Device* device,
     maybe_stats->set_all_end_rel_nanos(nanos - maybe_stats->all_start_nanos());
     mutex_lock ml(*ctx->MetadataMu());
     if (ctx->ShouldStoreMetadata()) {
+      {
+        GraphCollector* collector = ctx->GetGraphCollector();
+        mutex_lock mll(collector->mu);
+        for (const auto& graph : collector->graphs) {
+          *ctx->RunMetadataProto()->add_partition_graphs() = graph;
+        }
+        collector->graphs.clear();
+      }
       auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
       // Lazily initialize the RunMetadata with information about all devices if
       // this is the first call.
@@ -678,17 +768,19 @@ Status EagerExecute(EagerContext* ctx, Device* device,
     }
   }
   DCHECK_EQ(num_retvals, outputs.size());
-  Device* op_device = device;
   for (int i = 0; i < num_retvals; ++i) {
-    Device* d = op_device;
-    if (d != nullptr && output_memory_types != nullptr &&
-        (*output_memory_types)[i] == HOST_MEMORY) {
-      d = nullptr;
-    }
     if (retvals[i] == nullptr) {
-      retvals[i] = new TensorHandle(outputs[i], d, op_device, ctx);
+      retvals[i] =
+          new TensorHandle(outputs[i], /* d= */ kernel->OutputDevice(i),
+                           /* op_device= */ device, ctx);
     } else {
-      retvals[i]->SetTensorAndDevice(outputs[i], d, op_device);
+      // In the async case, the retval is not a nullptr, and its device is
+      // already set since all TensorHandles always have their device set during
+      // construction.
+      DCHECK_EQ(device, retvals[i]->op_device());
+      DCHECK_EQ(kernel->OutputDevice(i), retvals[i]->device());
+
+      retvals[i]->SetTensor(outputs[i]);
     }
   }
   return Status::OK();
@@ -738,8 +830,11 @@ Status ExecuteSend(EagerContext* ctx, tensorflow::Device* device,
                    TensorHandle* h, StringPiece wire_id,
                    const string& recv_device) {
   const tensorflow::AttrTypeMap* types;
-  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp("_Send", &types));
-  tensorflow::EagerOperation op(ctx, "_Send", types);
+  bool is_function = false;
+  TF_RETURN_IF_ERROR(
+      tensorflow::AttrTypeMapForOp("_Send", &types, &is_function));
+  DCHECK(!is_function);
+  tensorflow::EagerOperation op(ctx, "_Send", /*is_function=*/false, types);
 
   op.AddInput(h);
 
@@ -766,8 +861,11 @@ Status ExecuteRecv(EagerContext* ctx, tensorflow::Device* device,
                    const string& send_device, int64 send_device_incarnation,
                    TensorHandle** result) {
   const tensorflow::AttrTypeMap* types;
-  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp("_Recv", &types));
-  tensorflow::EagerOperation op(ctx, "_Recv", types);
+  bool is_function = false;
+  TF_RETURN_IF_ERROR(
+      tensorflow::AttrTypeMapForOp("_Recv", &types, &is_function));
+  DCHECK(!is_function);
+  tensorflow::EagerOperation op(ctx, "_Recv", /*is_function=*/false, types);
 
   op.SetDevice(device);
 
@@ -803,8 +901,7 @@ string GetUniqueWireID() {
 
 Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
                          const char* device_name, TensorHandle** result) {
-  tensorflow::Device* send_device;
-  TF_RETURN_IF_ERROR(h->Device(&send_device));
+  tensorflow::Device* send_device = h->device();
 
   if (send_device == nullptr) {
     send_device = ctx->HostCPU();
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index f4f84980fb90a02f031f9651ef02d2c796a30934..6143a52d4b9c83444eb98567decf26dbfca58504 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -46,7 +46,9 @@ Status EagerExecute(
 Status EagerExecute(EagerContext* ctx, Device* device,
                     const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
                     KernelAndDevice* kernel, NodeExecStats* maybe_stats,
-                    TensorHandle** retvals, int num_retvals);
+                    StepStats* maybe_step_stats,
+                    GraphCollector* graph_collector, TensorHandle** retvals,
+                    int num_retvals);
 
 // Low-level utility to copy a tensor handle from one device to another.
 Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h
index 93018dd96914c0d091c7242a9c053fabce434e78..a99d509dd60c4ad50b67ef237423570d7b595234 100644
--- a/tensorflow/core/common_runtime/eager/execute_node.h
+++ b/tensorflow/core/common_runtime/eager/execute_node.h
@@ -34,6 +34,7 @@ class ExecuteNode : public EagerNode {
   ExecuteNode(uint64 id, EagerContext* ctx, Device* op_device,
               const tensorflow::gtl::InlinedVector<TensorHandle*, 4>& inputs,
               KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+              StepStats* maybe_step_stats, GraphCollector* graph_collector,
               const DataTypeVector& output_dtypes,
               const tensorflow::gtl::InlinedVector<TensorHandle*, 2>& retvals)
       : EagerNode(id),
@@ -42,6 +43,8 @@ class ExecuteNode : public EagerNode {
         inputs_(inputs),
         kernel_(kernel),
         maybe_stats_(maybe_stats),
+        maybe_step_stats_(maybe_step_stats),
+        graph_collector_(graph_collector),
         retvals_(retvals) {
     for (auto handle : inputs_) {
       handle->Ref();
@@ -61,9 +64,9 @@ class ExecuteNode : public EagerNode {
   }
 
   tensorflow::Status Run() override {
-    const Status status =
-        EagerExecute(ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
-                     retvals_.begin(), retvals_.size());
+    const Status status = EagerExecute(
+        ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
+        maybe_step_stats_, graph_collector_, retvals_.begin(), retvals_.size());
     if (status.ok()) {
       return status;
     } else {
@@ -80,6 +83,8 @@ class ExecuteNode : public EagerNode {
   tensorflow::gtl::InlinedVector<TensorHandle*, 4> inputs_;
   tensorflow::KernelAndDevice* kernel_;
   std::unique_ptr<NodeExecStats> maybe_stats_;
+  StepStats* maybe_step_stats_;
+  tensorflow::GraphCollector* graph_collector_;
   tensorflow::gtl::InlinedVector<TensorHandle*, 2> retvals_;
 };
 
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 83d8425477fe30570a6ed987756749b1156f8632..317e9a16074b37ef6ecaf1d7f8c1a2daa412f75e 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -32,32 +34,45 @@ limitations under the License.
 namespace tensorflow {
 
 // static
-Status KernelAndDevice::Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
+Status KernelAndDevice::Init(const NodeDef& ndef, FunctionLibraryRuntime* flr,
                              std::function<void(std::function<void()>)>* runner,
                              KernelAndDevice* out) {
   OpKernel* k = nullptr;
-  Status s = flib->CreateKernel(ndef, &k);
-  out->device_ = flib->device();
+  TF_RETURN_IF_ERROR(flr->CreateKernel(ndef, &k));
+  out->device_ = flr->device();
   out->kernel_.reset(k);
-  out->flib_ = flib;
+  out->flr_ = flr;
   out->runner_ = runner;
   out->default_runner_ = [](std::function<void()> f) { f(); };
-  return s;
+
+  // Update output_dtypes_.
+  const OpDef* op_def = nullptr;
+  const FunctionDef* function_def =
+      flr->GetFunctionLibraryDefinition()->Find(ndef.op());
+  if (function_def != nullptr) {
+    op_def = &(function_def->signature());
+  } else {
+    TF_RETURN_IF_ERROR(OpDefForOp(ndef.op().c_str(), &op_def));
+  }
+  return OutputTypesForNode(ndef, *op_def, &out->output_dtypes_);
 }
 
 Status KernelAndDevice::Run(std::vector<Tensor>* inputs,
-                            std::vector<Tensor>* outputs,
-                            NodeExecStats* stats) {
+                            std::vector<Tensor>* outputs, NodeExecStats* stats,
+                            StepStats* step_stats,
+                            GraphCollector* graph_collector) {
   ScopedStepContainer step_container(0, [this](const string& name) {
     device_->resource_manager()->Cleanup(name).IgnoreError();
   });
-  return this->Run(&step_container, inputs, outputs, stats);
+  return this->Run(&step_container, inputs, outputs, stats, step_stats,
+                   graph_collector);
 }
 
 Status KernelAndDevice::Run(ScopedStepContainer* step_container,
                             std::vector<Tensor>* inputs,
-                            std::vector<Tensor>* outputs,
-                            NodeExecStats* stats) {
+                            std::vector<Tensor>* outputs, NodeExecStats* stats,
+                            StepStats* step_stats,
+                            GraphCollector* graph_collector) {
   gtl::InlinedVector<TensorValue, 4> input_vector;
   for (Tensor& t : *inputs) {
     input_vector.push_back(TensorValue(&t));
@@ -69,6 +84,15 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
                              tensorflow::HOST_MEMORY);
   }
 
+  gtl::InlinedVector<DeviceContext*, 4> input_device_contexts;
+  for (int i = 0; i < inputs->size(); i++) {
+    DeviceContext* device_context = nullptr;
+    if (device_->tensorflow_gpu_device_info() != nullptr) {
+      device_context = device_->tensorflow_gpu_device_info()->default_context;
+    }
+    input_device_contexts.push_back(device_context);
+  }
+
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
@@ -76,13 +100,17 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   params.op_kernel = kernel_.get();
   params.resource_manager = device_->resource_manager();
   params.output_attr_array = gtl::vector_as_array(&out_attrs);
-  params.function_library = flib_;
+  params.function_library = flr_;
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendez_;
   params.cancellation_manager = &cm_;
   params.log_memory = log_memory_;
+  std::unique_ptr<StepStatsCollector> step_stats_collector;
   if (stats != nullptr) {
+    step_stats_collector.reset(new StepStatsCollector(step_stats));
     params.track_allocations = true;
+    params.stats_collector = step_stats_collector.get();
+    params.graph_collector = graph_collector;
   }
   if (runner_ == nullptr) {
     params.runner = &default_runner_;
@@ -91,6 +119,9 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   }
 
   params.step_container = step_container;
+  params.collective_executor =
+      collective_executor_ ? collective_executor_->get() : nullptr;
+  params.input_device_contexts = &input_device_contexts;
 
   OpKernelContext context(&params);
 
@@ -112,7 +143,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
     outputs->push_back(Tensor(*context.mutable_output(i)));
   }
   if (stats != nullptr) {
-    for (const auto& allocator_pair : context.wrapped_allocators()) {
+    for (const auto& allocator_pair : context.ConsumeWrappedAllocators()) {
       AllocatorMemoryUsed* memory = stats->add_memory();
       memory->set_allocator_name(allocator_pair.first->Name());
       auto sizes = allocator_pair.second->GetSizes();
@@ -132,8 +163,17 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
     }
 
     ms->set_persistent_memory_size(context.persistent_memory_allocated());
+    step_stats_collector->Finalize();
   }
   return Status::OK();
 }
 
+tensorflow::Device* KernelAndDevice::OutputDevice(int idx) const {
+  if (device_ != nullptr &&
+      kernel_->output_memory_types()[idx] == HOST_MEMORY) {
+    return nullptr;
+  }
+  return device_;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index 04151a11713b1a927a219266d5eda26088fad0dc..ee430b7fc70e1f4e5256e9dd28f4240ce57de86a 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -36,6 +37,7 @@ namespace tensorflow {
 // Forward declaration for proto class NodeExecStats so we do not need to
 // include the proto header
 class NodeExecStats;
+class StepStats;
 
 // KernelAndDevice encapsulates an instantiated kernel and the device it is on.
 //
@@ -49,28 +51,37 @@ class KernelAndDevice {
   //
   // The provided FunctionLibraryRuntime MUST outlive all calls to
   // Run() on the returned KernelAndDevice.
-  static Status Init(const NodeDef& ndef, FunctionLibraryRuntime* flib,
+  static Status Init(const NodeDef& ndef, FunctionLibraryRuntime* flr,
                      std::function<void(std::function<void()>)>* runner,
                      KernelAndDevice* out);
 
   KernelAndDevice(tensorflow::Rendezvous* rendez, bool log_memory)
+      : KernelAndDevice(rendez, log_memory, nullptr) {}
+
+  KernelAndDevice(
+      tensorflow::Rendezvous* rendez, bool log_memory,
+      std::unique_ptr<CollectiveExecutor::Handle> collective_executor)
       : device_(nullptr),
-        flib_(nullptr),
+        flr_(nullptr),
         rendez_(rendez),
-        log_memory_(log_memory) {}
+        log_memory_(log_memory),
+        collective_executor_(std::move(collective_executor)) {}
 
   // TODO(ashankar): Handle list-valued inputs.
   Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
-             NodeExecStats* stats);
+             NodeExecStats* stats, StepStats* step_stats,
+             GraphCollector* graph_collector);
 
   Status Run(ScopedStepContainer* step_container, std::vector<Tensor>* inputs,
-             std::vector<Tensor>* outputs, NodeExecStats* stats);
+             std::vector<Tensor>* outputs, NodeExecStats* stats,
+             StepStats* step_stats, GraphCollector* graph_collector);
+
+  Device* OutputDevice(int idx) const;
 
   const OpKernel* kernel() const { return kernel_.get(); }
 
   Device* device() const { return device_; }
 
-  DataTypeVector* mutable_output_dtypes() { return &output_dtypes_; }
   const DataTypeVector& output_dtypes() { return output_dtypes_; }
 
  private:
@@ -81,13 +92,14 @@ class KernelAndDevice {
   CancellationManager cm_;
   std::unique_ptr<OpKernel> kernel_;
   Device* device_;
-  FunctionLibraryRuntime* flib_;
+  FunctionLibraryRuntime* flr_;
   checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_;
   Rendezvous* rendez_;
   DataTypeVector output_dtypes_;
   std::function<void(std::function<void()>)>* runner_;
   std::function<void(std::function<void()>)> default_runner_;
   const bool log_memory_;
+  const std::unique_ptr<CollectiveExecutor::Handle> collective_executor_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
index da280b23174f1e8f5b5d859e30befc25db8dbabf..3ffed3ce321e79d021c302acf444f93cc9ccce53 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/cc/client/client_session.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -37,12 +38,13 @@ namespace {
 class TestEnv {
  public:
   TestEnv() : flib_def_(OpRegistry::Global(), {}) {
-    Device* device =
-        DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0");
-    device_mgr_.reset(new DeviceMgr({device}));
-    flib_runtime_ = NewFunctionLibraryRuntime(device_mgr_.get(), Env::Default(),
-                                              device, TF_GRAPH_DEF_VERSION,
-                                              &flib_def_, nullptr, {}, nullptr);
+    std::vector<std::unique_ptr<Device>> devices;
+    devices.push_back(
+        DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
+    flib_runtime_ = NewFunctionLibraryRuntime(
+        device_mgr_.get(), Env::Default(), device_mgr_->ListDevices()[0],
+        TF_GRAPH_DEF_VERSION, &flib_def_, nullptr, {}, nullptr);
   }
 
   FunctionLibraryRuntime* function_library_runtime() const {
@@ -132,7 +134,7 @@ void BM_KernelAndDeviceRun(int iters) {
                                     nullptr, &kernel));
   tensorflow::testing::StartTiming();
   for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(kernel.Run(&inputs, &outputs, nullptr));
+    TF_CHECK_OK(kernel.Run(&inputs, &outputs, nullptr, nullptr, nullptr));
   }
 }
 BENCHMARK(BM_KernelAndDeviceRun);
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index d58724cbfacf6fec6b097dead53d9bd373cd2e7f..0acd1609361453a0901e346f3b9d76e6e3a7b872 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -79,20 +79,6 @@ Status TensorHandle::Tensor(const tensorflow::Tensor** t) {
   return Status::OK();
 }
 
-Status TensorHandle::Device(tensorflow::Device** d) {
-  TF_RETURN_IF_ERROR(WaitReady());
-  DCHECK(IsReady());
-  *d = device_;
-  return Status::OK();
-}
-
-Status TensorHandle::OpDevice(tensorflow::Device** d) {
-  TF_RETURN_IF_ERROR(WaitReady());
-  DCHECK(IsReady());
-  *d = op_device_;
-  return Status::OK();
-}
-
 Status TensorHandle::TensorAndDevice(const tensorflow::Tensor** tensor,
                                      tensorflow::Device** device,
                                      tensorflow::Device** op_device) {
@@ -178,17 +164,12 @@ Status TensorHandle::RemoteAddress(int64* op_id, int32* output_num) {
   return Status::OK();
 }
 
-void TensorHandle::SetTensorAndDevice(const tensorflow::Tensor& tensor,
-                                      tensorflow::Device* device,
-                                      tensorflow::Device* op_device) {
+void TensorHandle::SetTensor(const tensorflow::Tensor& tensor) {
   mutex_lock l(ctx_mutex_);
-  DCHECK(node_id_ > 0 && !is_ready_)
-      << "SetTensorAndDevice should be only called  "
-      << "on non-ready handles.";
+  DCHECK(node_id_ > 0 && !is_ready_) << "SetTensor should be only called  "
+                                     << "on non-ready handles.";
   is_ready_ = true;
   tensor_ = tensor;
-  device_ = device;
-  op_device_ = op_device;
 }
 
 Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
@@ -203,10 +184,7 @@ Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
   bool is_same_device = (srcd == dstd) || (srcd->name() == dstd->name());
   const bool dst_cpu = dstd->tensorflow_gpu_device_info() == nullptr;
   const bool src_cpu = srcd->tensorflow_gpu_device_info() == nullptr;
-  // both_on_cpu can be true and yet is_same_device is false, if one of src/dst
-  // has device type XLA_CPU, and the other CPU.
-  const bool both_on_cpu = src_cpu && dst_cpu;
-  if (is_same_device || both_on_cpu) {
+  if (is_same_device) {
     *output = new tensorflow::TensorHandle(*src, dstd, dstd, ctx);
     return tensorflow::Status::OK();
   }
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index e55f1a03385f2dffa4d55961b6df502e17a1b474..0fdd31ab5fcfe99c92074fc69d831d17f46d607e 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -61,12 +61,13 @@ class TensorHandle : public core::RefCounted {
         ctx_(ctx),
         is_ready_(true) {}
 
-  TensorHandle(uint64 node_id, DataType dtype, EagerContext* ctx)
+  TensorHandle(uint64 node_id, Device* d, Device* op_device, DataType dtype,
+               EagerContext* ctx)
       : dtype(dtype),
         node_id_(node_id),
         tensor_(dtype),
-        device_(nullptr),
-        op_device_(nullptr),
+        device_(d),
+        op_device_(op_device),
         remote_op_id_(-1),
         remote_output_num_(-1),
         remote_shape_node_id_(-1),
@@ -101,9 +102,9 @@ class TensorHandle : public core::RefCounted {
 
   Status Tensor(const tensorflow::Tensor** t);
 
-  Status Device(tensorflow::Device** d);
+  tensorflow::Device* device() const { return device_; }
 
-  Status OpDevice(tensorflow::Device** d);
+  tensorflow::Device* op_device() const { return op_device_; }
 
   Status TensorAndDevice(const tensorflow::Tensor** tensor,
                          tensorflow::Device** device,
@@ -120,9 +121,7 @@ class TensorHandle : public core::RefCounted {
 
   // Note that this can be called at most once, and only on non-ready handles,
   // and makes them ready.
-  void SetTensorAndDevice(const tensorflow::Tensor& tensor,
-                          tensorflow::Device* device,
-                          tensorflow::Device* op_device);
+  void SetTensor(const tensorflow::Tensor& tensor);
 
   Status CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
                       TensorHandle** output);
@@ -172,11 +171,11 @@ class TensorHandle : public core::RefCounted {
   //
   // TODO(ashankar): Reference count TFE_Context to ensure that 'device_' of a
   // TFE_TensorHandle does not outlive the TFE_Context from which it came?
-  tensorflow::Device* device_;
+  tensorflow::Device* const device_;
 
   // Device in which the op producing this tensor was executed. Equals to
   // device_ for constant tensors.
-  tensorflow::Device* op_device_;
+  tensorflow::Device* const op_device_;
 
   // IDs required when this class is representing a remote tensor handle.
   const int64 remote_op_id_;
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 2c48084cab32d32928724f7d8777b62f6ea3e804..6b3284b84a0d2741f315c3f91db35eebc68f9e98 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -54,6 +54,7 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/context.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -1238,8 +1239,8 @@ class ExecutorState {
   // Step-local container.
   ScopedStepContainer* step_container_;
   StepStatsCollectorInterface* const stats_collector_;
-  const tracing::TraceCollector* const trace_collector_;
   const tracing::EventCollector* const event_collector_;
+  Context context_;
 
   // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
   // instead of a pointer?  (avoids having to delete).
@@ -1364,9 +1365,9 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
       tensor_store_(args.tensor_store),
       step_container_(args.step_container),
       stats_collector_(args.stats_collector),
-      trace_collector_(tracing::GetTraceCollector()),
       event_collector_(
           tracing::GetEventCollector(tracing::EventCategory::kCompute)),
+      context_(ContextKind::kThread),
       slice_reader_cache_(new checkpoint::TensorSliceReaderCacheWrapper),
       call_frame_(args.call_frame),
       impl_(impl),
@@ -1562,7 +1563,6 @@ struct ExecutorState::AsyncState {
 // Returns true if `item` might be traced by the given trace and event
 // collectors. Returns false only if `item` definitely will not be traced.
 bool MightTrace(const NodeItem& item,
-                const tracing::TraceCollector* trace_collector,
                 const tracing::EventCollector* event_collector,
                 bool using_annotations) {
   // Tracing will only be enabled if either `event_collector` is non null,
@@ -1575,6 +1575,7 @@ bool MightTrace(const NodeItem& item,
   if (event_collector != nullptr) {
     return true;
   }
+  auto* trace_collector = tracing::GetTraceCollector();
   if (trace_collector) {
     if (using_annotations) {
       return trace_collector->IsEnabledForAnnotations();
@@ -1586,6 +1587,7 @@ bool MightTrace(const NodeItem& item,
 }
 
 void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
+  WithContext wc(context_);
   const GraphView& gview = impl_->gview_;
   TaggedNodeSeq ready;
   TaggedNodeReadyQueue inline_ready;
@@ -1647,9 +1649,10 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
     params.track_allocations = false;
     stats = nullptr;
     if (stats_collector_ && !tagged_node.is_dead) {
-      // track allocations if and only if we are collecting statistics
-      params.track_allocations = true;
       stats = stats_collector_->CreateNodeExecStats(node);
+      // Track allocations if and only if we are collecting statistics, and
+      // `stats` object is expecting allocations to be tracked.
+      params.track_allocations = stats ? stats->TrackAllocations() : false;
       nodestats::SetScheduled(stats, scheduled_nsec);
       nodestats::SetAllStart(stats);
     }
@@ -1708,7 +1711,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
         auto done = [this, state]() {
           Device* device = impl_->params_.device;
           NodeExecStatsInterface* stats = state->stats;  // Shorthand
-          Entry* first_input = state->first_input;     // Shorthand
+          Entry* first_input = state->first_input;       // Shorthand
 
           nodestats::SetOpEnd(stats);
           EntryVector outputs;
@@ -1757,9 +1760,8 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
         OpKernelContext ctx(&params, item.num_outputs);
         nodestats::SetOpStart(stats);
 
-        if (TF_PREDICT_FALSE(MightTrace(item, trace_collector_,
-                                        event_collector_,
-                                        trace_using_annotations_))) {
+        if (TF_PREDICT_FALSE(
+                MightTrace(item, event_collector_, trace_using_annotations_))) {
           const string& op_name = op_kernel->name();
           tracing::ScopedRegion region(tracing::EventCategory::kCompute,
                                        op_name);
@@ -1767,14 +1769,18 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
             // The OpKernel may create child activities (such as GPU kernel
             // launches), so use a `ScopedAnnotation` to relate these activities
             // in the trace.
-            tracing::ScopedAnnotation activity(op_name,
-                                               op_kernel->type_string());
+            tracing::ScopedAnnotation activity(
+                op_name, strings::StrCat(op_kernel->type_string(),
+                                         "#id=", step_id_, "#"));
             device->Compute(op_kernel, &ctx);
           } else {
             // Use the cheaper `ScopedActivity` to trace just the OpKernel
             // execution.
-            tracing::ScopedActivity activity(op_name, op_kernel->type_string(),
-                                             item.kernel_is_expensive);
+            tracing::ScopedActivity activity(
+                op_name,
+                strings::StrCat(op_kernel->type_string(), "#id=", step_id_,
+                                "#"),
+                item.kernel_is_expensive);
             device->Compute(op_kernel, &ctx);
           }
         } else {
@@ -1968,7 +1974,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
       // tensor value at i-th output.
       if (!IsSwitch(node) && !IsRecv(node)) {
         s.Update(errors::Internal("Missing ", i, "-th output from ",
-                                  SummarizeNode(*node)));
+                                  FormatNodeForError(*node)));
       }
     } else {
       Entry* out = &((*outputs)[i]);
@@ -2022,7 +2028,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
                                   DataTypeString(dtype),
                                   " does not match declared output type ",
                                   DataTypeString(item.output_type(i)),
-                                  " for node ", SummarizeNode(*node)));
+                                  " for node ", FormatNodeForError(*node)));
       }
     }
     if (!val.is_ref()) {
@@ -2037,6 +2043,24 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
 void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node,
                                      const NodeItem* item, EntryVector* outputs,
                                      TaggedNodeSeq* ready) {
+  auto activity_handle =
+      [&]() -> std::unique_ptr<tracing::TraceCollector::Handle> {
+    auto* trace_collector = tracing::GetTraceCollector();
+    if (TF_PREDICT_FALSE(trace_collector != nullptr &&
+                         trace_collector->IsEnabledForActivities(
+                             false /* is_expensive */))) {
+      const string& op_name = item->kernel->name();
+      // Intentionally using ExecutorPropagateOutputs as the first key so that
+      // users are aware that it's not the op invocation.
+      return trace_collector->CreateActivityHandle(
+          "ExecutorPropagateOutputs",
+          strings::StrCat(op_name, "#id=", step_id_, "#"),
+          false /* is_expensive */);
+    } else {
+      return nullptr;
+    }
+  }();
+
   const Node* node = tagged_node.node;
   FrameState* input_frame = tagged_node.input_frame;
   const int64 input_iter = tagged_node.input_iter;
@@ -2368,18 +2392,23 @@ void ExecutorState::Finish() {
   auto done_cb = std::move(done_cb_);
   auto runner = std::move(runner_);
   mu_.unlock();
+  CHECK(done_cb != nullptr);
   Device* device = impl_->params_.device;
+
   if ((sync_on_finish_ && status.ok()) || device->RequiresSyncOnCompletion()) {
     // Block until the device has finished all queued operations. For
     // devices like GPUs that continue to execute Ops after their Compute
     // methods have completed, this ensures that control is not returned to
     // the user until the step (and its side-effects) has actually completed.
-    status.Update(device->Sync());
+    device->Sync([=](Status new_status) mutable {
+      status.Update(new_status);
+      delete this;
+      runner([=]() { done_cb(status); });
+    });
+  } else {
+    delete this;
+    runner([=]() { done_cb(status); });
   }
-
-  delete this;
-  CHECK(done_cb != nullptr);
-  runner([=]() { done_cb(status); });
 }
 
 void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter,
diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc
index 7697103faf9bfa7a3fdbdbc0c3286d07d257d817..c311b2533eaa0bf08494a71b51922b1b886ac549 100644
--- a/tensorflow/core/common_runtime/executor_test.cc
+++ b/tensorflow/core/common_runtime/executor_test.cc
@@ -53,17 +53,17 @@ class ExecutorTest : public ::testing::Test {
     // when the test completes.
     CHECK(rendez_->Unref());
     delete exec_;
-    delete device_;
   }
 
   // Resets executor_ with a new executor based on a graph 'gdef'.
   void Create(std::unique_ptr<const Graph> graph) {
     const int version = graph->versions().producer();
     LocalExecutorParams params;
-    params.device = device_;
+    params.device = device_.get();
     params.create_kernel = [this, version](const NodeDef& ndef,
                                            OpKernel** kernel) {
-      return CreateNonCachedKernel(device_, nullptr, ndef, version, kernel);
+      return CreateNonCachedKernel(device_.get(), nullptr, ndef, version,
+                                   kernel);
     };
     params.delete_kernel = [](OpKernel* kernel) {
       DeleteNonCachedKernel(kernel);
@@ -83,7 +83,7 @@ class ExecutorTest : public ::testing::Test {
   }
 
   thread::ThreadPool* thread_pool_ = nullptr;
-  Device* device_ = nullptr;
+  std::unique_ptr<Device> device_;
   Executor* exec_ = nullptr;
   StepStatsCollector step_stats_collector_;
   StepStats step_stats_;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 472865ca43f5ccc129e52192a9e64c39a80a0611..7eb622dc117f40a68079e6cea1a829227acfed7a 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -46,7 +46,11 @@ namespace tensorflow {
 
 // A few string constant used throughout this module.
 static constexpr const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
+static constexpr const char* const kDeviceArgOp =
+    FunctionLibraryDefinition::kDeviceArgOp;
 static constexpr const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
+static constexpr const char* const kDeviceRetOp =
+    FunctionLibraryDefinition::kDeviceRetOp;
 static constexpr const char* const kGradientOp =
     FunctionLibraryDefinition::kGradientOp;
 static constexpr const char* const kNodeLabel = "Func";
@@ -139,6 +143,153 @@ static Node* AddRet(Graph* g, Endpoint input, int index) {
   return ret;
 }
 
+// FunctionLibraryRuntime implementation that forwards all the function calls to
+// the base runtime implementation, and only overrides overlay lib in calls to
+// Instantiate (if caller doesn't provide its own overlay lib).
+//
+// When function library runtime (FunctionLibraryRuntimeImpl specifically)
+// instantiates function into a Graph object, it also creates an Executor for
+// it. That executor has a pointer to the function library runtime instance,
+// that is used to instantiate all nested function calls.
+//
+// If the original function was instantiated using overlay lib, we must preserve
+// that overlay lib in the executor's function library runtime.
+//
+// IMPORTANT: This runtime is intended for use only in executors created for
+// functions instantiated into a graph in FunctionLibraryRuntimeImpl.
+class FunctionLibraryRuntimeOverlay : public FunctionLibraryRuntime {
+ public:
+  FunctionLibraryRuntimeOverlay(
+      FunctionLibraryRuntime* base_flr,
+      const FunctionLibraryDefinition* overlay_lib_def)
+      : base_flr_(base_flr), overlay_lib_def_(overlay_lib_def) {}
+  ~FunctionLibraryRuntimeOverlay() override;
+
+  Status Instantiate(const string& function_name, AttrSlice attrs,
+                     const InstantiateOptions& options,
+                     Handle* handle) override;
+
+  Status ReleaseHandle(Handle handle) override;
+
+  const FunctionBody* GetFunctionBody(Handle h) override;
+
+  void Run(const Options& opts, Handle handle, gtl::ArraySlice<Tensor> args,
+           std::vector<Tensor>* rets, DoneCallback done) override;
+
+  void Run(const Options& opts, Handle handle, CallFrameInterface* call_frame,
+           DoneCallback done) override;
+
+  Status CreateKernel(const NodeDef& ndef, OpKernel** kernel) override;
+
+  bool IsStateful(const string& function_name) override;
+
+  const FunctionLibraryDefinition* GetFunctionLibraryDefinition()
+      const override;
+
+  Env* env() override;
+  Device* device() override;
+  const DeviceMgr* device_mgr() const override;
+
+  string DebugString(Handle handle) override;
+  int graph_def_version() override;
+
+  Status Clone(std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr,
+               FunctionLibraryRuntime** out_flr) override;
+
+ private:
+  FunctionLibraryRuntime* base_flr_;                  // not owned
+  const FunctionLibraryDefinition* overlay_lib_def_;  // not owned
+};
+
+FunctionLibraryRuntimeOverlay::~FunctionLibraryRuntimeOverlay() = default;
+
+Status FunctionLibraryRuntimeOverlay::Instantiate(
+    const string& function_name, AttrSlice attrs,
+    const InstantiateOptions& options, Handle* handle) {
+  // We automatically add overlay lib to all instantiations, if the caller
+  // doesn't provide its own override.
+  if (!options.overlay_lib && overlay_lib_def_) {
+    InstantiateOptions options_copy = options;
+    options_copy.overlay_lib = overlay_lib_def_;
+    return base_flr_->Instantiate(function_name, attrs, options_copy, handle);
+  } else {
+    return base_flr_->Instantiate(function_name, attrs, options, handle);
+  }
+}
+
+Status FunctionLibraryRuntimeOverlay::ReleaseHandle(Handle handle) {
+  return base_flr_->ReleaseHandle(handle);
+}
+
+const FunctionBody* FunctionLibraryRuntimeOverlay::GetFunctionBody(Handle h) {
+  return base_flr_->GetFunctionBody(h);
+}
+
+void FunctionLibraryRuntimeOverlay::Run(const Options& opts, Handle handle,
+                                        gtl::ArraySlice<Tensor> args,
+                                        std::vector<Tensor>* rets,
+                                        DoneCallback done) {
+  base_flr_->Run(opts, handle, args, rets, std::move(done));
+}
+
+void FunctionLibraryRuntimeOverlay::Run(const Options& opts, Handle handle,
+                                        CallFrameInterface* call_frame,
+                                        DoneCallback done) {
+  base_flr_->Run(opts, handle, call_frame, std::move(done));
+}
+
+Status FunctionLibraryRuntimeOverlay::CreateKernel(const NodeDef&, OpKernel**) {
+  // We don't have access base_lib_def_ in base function library runtime (aka
+  // FunctionLibraryRuntimeImpl), so to make sure we do not create kernel with
+  // wrong lib_def we just disable creation of new kernels through overlays.
+  //
+  // When we call Instantiate from the base runtime with overlay lib override,
+  // the base runtime implementation is responsible for correctly passing custom
+  // overlay lib to all kernel constructions.
+  return errors::Internal(
+      "Overlay function library runtime doesn't support kernel creation.");
+}
+
+bool FunctionLibraryRuntimeOverlay::IsStateful(const string& function_name) {
+  // Important: we do not forward lookup to the base FLR.
+  const OpDef* op_def;
+  const Status s = overlay_lib_def_->LookUpOpDef(function_name, &op_def);
+  return s.ok() && op_def->is_stateful();
+}
+
+Env* FunctionLibraryRuntimeOverlay::env() { return base_flr_->env(); }
+
+Device* FunctionLibraryRuntimeOverlay::device() { return base_flr_->device(); }
+
+const DeviceMgr* FunctionLibraryRuntimeOverlay::device_mgr() const {
+  return base_flr_->device_mgr();
+}
+
+const FunctionLibraryDefinition*
+FunctionLibraryRuntimeOverlay::GetFunctionLibraryDefinition() const {
+  return overlay_lib_def_ ? overlay_lib_def_
+                          : base_flr_->GetFunctionLibraryDefinition();
+}
+
+string FunctionLibraryRuntimeOverlay::DebugString(Handle handle) {
+  return base_flr_->DebugString(handle);
+}
+
+int FunctionLibraryRuntimeOverlay::graph_def_version() {
+  return base_flr_->graph_def_version();
+}
+
+Status FunctionLibraryRuntimeOverlay::Clone(
+    std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
+    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr,
+    FunctionLibraryRuntime** out_flr) {
+  // NOTE(ezhulenev): Cloned FunctionLibraryRuntime will be missing overlay lib,
+  // but that's ok because we anyway do not copy/clone instantiated items from
+  // the base FLR.
+  return base_flr_->Clone(out_lib_def, out_pflr, out_flr);
+}
+
 class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
  public:
   FunctionLibraryRuntimeImpl(const DeviceMgr* dmgr, Env* env, Device* device,
@@ -216,11 +367,13 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
     const FunctionLibraryDefinition* overlay_lib = nullptr;  // Not owned.
     FunctionBody* func_graph = nullptr;
     Executor* exec = nullptr;
+    FunctionLibraryRuntimeOverlay* overlay_flr = nullptr;
     string executor_type;
 
     ~Item() {
       delete this->func_graph;
       delete this->exec;
+      delete this->overlay_flr;
     }
   };
   std::unordered_map<Handle, std::unique_ptr<Item>> items_ GUARDED_BY(mu_);
@@ -233,8 +386,8 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   Status FunctionDefToBody(const FunctionDef& fdef, AttrSlice attrs,
                            const FunctionLibraryDefinition* lib_def,
                            FunctionBody** fbody);
-  Status CreateItem(Handle handle, Item** item);
-  Status GetOrCreateItem(Handle handle, Item** item);
+  Status CreateItem(Item** item);
+  Status GetOrCreateItem(LocalHandle local_handle, Item** item);
   Status InstantiateSymbolicGradient(const NameAttrList& func,
                                      const FunctionLibraryDefinition* lib_def,
                                      FunctionBody** g_body);
@@ -242,7 +395,11 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   AttrValueMap FixAttrs(const AttrSlice& attrs);
   void RunRemote(const Options& opts, Handle handle,
                  gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
-                 Executor::Args* exec_args, Item* item, DoneCallback done);
+                 Item* item, DoneCallback done);
+
+  void ExecutorArgsFromOptions(const FunctionLibraryRuntime::Options& run_opts,
+                               CallFrameInterface* frame,
+                               Executor::Args* exec_args);
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionLibraryRuntimeImpl);
 };
@@ -538,40 +695,43 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
     TF_RETURN_IF_ERROR(FunctionDefToBody(*fdef, attrs, lib_def, &fbody));
   }
 
+  LocalHandle local_handle;
   {
     mutex_lock l(mu_);
     *handle = parent_->GetHandle(key);
     if (*handle != kInvalidHandle) {
       delete fbody;
-      ++items_[parent_->GetHandleOnDevice(device_name_, *handle)]
-            ->instantiation_counter;
+      local_handle = parent_->GetHandleOnDevice(device_name_, *handle);
+      ++items_[local_handle]->instantiation_counter;
     } else {
       *handle = parent_->AddHandle(key, device_name_, next_handle_);
       Item* item = new Item;
       item->func_graph = fbody;
       item->overlay_lib = options.overlay_lib;
       item->instantiation_counter = 1;
-      item->executor_type = options.executor_type;
-      items_.emplace(next_handle_, std::unique_ptr<Item>(item));
-      next_handle_++;
+      item->executor_type = ExecutorType(options, attrs);
+      if (options.overlay_lib) {
+        item->overlay_flr =
+            new FunctionLibraryRuntimeOverlay(this, options.overlay_lib);
+      }
+      local_handle = next_handle_++;
+      items_.emplace(local_handle, std::unique_ptr<Item>(item));
     }
   }
 
   if (options.create_kernels_eagerly) {
     Item* item;
-    TF_RETURN_IF_ERROR(GetOrCreateItem(*handle, &item));
+    TF_RETURN_IF_ERROR(GetOrCreateItem(local_handle, &item));
   }
 
   return Status::OK();
 }
 
 Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
-  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
+  LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
+  if (h == kInvalidLocalHandle) {
     return parent_->ReleaseHandle(handle);
   }
-
-  LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
-  CHECK_NE(h, kInvalidLocalHandle);
   mutex_lock l(mu_);
   CHECK_EQ(1, items_.count(h));
   std::unique_ptr<Item>& item = items_[h];
@@ -632,7 +792,7 @@ void PruneFunctionBody(Graph* g) {
 }
 }  // namespace
 
-Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
+Status FunctionLibraryRuntimeImpl::CreateItem(Item** item) {
   const FunctionBody* fbody;
   const FunctionLibraryDefinition* lib_def;
   string executor_type;
@@ -653,11 +813,14 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
   TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device()->device_type()),
                                        device()->name(), g.get()));
 
-  // Creates an executor based on the g.  This must be done without
+  // Creates an executor based on the g. This must be done without
   // holding mu_ because create_kernel_ calls back into the library.
   LocalExecutorParams params;
   params.device = device_;
-  params.function_library = this;
+  params.function_library =
+      (*item)->overlay_flr
+          ? static_cast<FunctionLibraryRuntime*>((*item)->overlay_flr)
+          : static_cast<FunctionLibraryRuntime*>(this);
   if (lib_def == base_lib_def_) {
     params.create_kernel = create_kernel_;
   } else {
@@ -683,13 +846,13 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
   return Status::OK();
 }
 
-Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) {
-  LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
+Status FunctionLibraryRuntimeImpl::GetOrCreateItem(LocalHandle local_handle,
+                                                   Item** item) {
   {
     tf_shared_lock l(mu_);
     auto iter = items_.find(local_handle);
     if (iter == items_.end()) {
-      return errors::NotFound("Function handle ", handle,
+      return errors::Internal("Local function handle ", local_handle,
                               " is not valid. Likely an internal error.");
     }
     *item = iter->second.get();
@@ -699,22 +862,37 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) {
   }
   // NOTE: We need to call CreateItem out of mu_ because creating an
   // executor needs to call CreateKernel.
-  return CreateItem(handle, item);
+  return CreateItem(item);
+}
+
+void FunctionLibraryRuntimeImpl::ExecutorArgsFromOptions(
+    const FunctionLibraryRuntime::Options& run_opts, CallFrameInterface* frame,
+    Executor::Args* exec_args) {
+  // Inherit the step_id from the caller.
+  exec_args->step_id = run_opts.step_id;
+  exec_args->rendezvous = run_opts.rendezvous;
+  exec_args->stats_collector = run_opts.stats_collector;
+  exec_args->cancellation_manager = run_opts.cancellation_manager;
+  exec_args->step_container = run_opts.step_container;
+  if (run_opts.runner) {
+    exec_args->runner = *run_opts.runner;
+  } else {
+    exec_args->runner = default_runner_;
+  }
+  exec_args->collective_executor = run_opts.collective_executor;
+  exec_args->call_frame = frame;
 }
 
 void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
                                            gtl::ArraySlice<Tensor> args,
                                            std::vector<Tensor>* rets,
-                                           Executor::Args* exec_args,
                                            Item* item, DoneCallback done) {
-  DCHECK(exec_args->call_frame == nullptr);
   string target_device = parent_->GetDeviceName(handle);
   string source_device = opts.source_device;
   Rendezvous* rendezvous = opts.rendezvous;
   DeviceContext* device_context;
   Status s = parent_->GetDeviceContext(target_device, &device_context);
   if (!s.ok()) {
-    delete exec_args;
     done(s);
     return;
   }
@@ -722,7 +900,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   s = parent_->GetDeviceIncarnation(source_device, &src_incarnation);
   s.Update(parent_->GetDeviceIncarnation(target_device, &target_incarnation));
   if (!s.ok()) {
-    delete exec_args;
     done(s);
     return;
   }
@@ -730,13 +907,8 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   const FunctionBody* fbody = GetFunctionBody(handle);
   FunctionCallFrame* frame =
       new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
-  exec_args->call_frame = frame;
-  if (!s.ok()) {
-    delete frame;
-    delete exec_args;
-    done(s);
-    return;
-  }
+  Executor::Args* exec_args = new Executor::Args;
+  ExecutorArgsFromOptions(opts, frame, exec_args);
 
   std::vector<AllocatorAttributes> args_alloc_attrs, rets_alloc_attrs;
   args_alloc_attrs.reserve(fbody->arg_types.size());
@@ -782,10 +954,10 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
           return;
         }
         item->exec->RunAsync(
-            *exec_args, [frame, rets, done, source_device, target_device,
-                         target_incarnation, rendezvous, device_context,
-                         remote_args, exec_args, rets_alloc_attrs,
-                         allow_dead_tensors](const Status& status) {
+            *exec_args,
+            [frame, rets, done, source_device, target_device,
+             target_incarnation, rendezvous, device_context, remote_args,
+             rets_alloc_attrs, allow_dead_tensors](const Status& status) {
               Status s = status;
               if (s.ok()) {
                 s = frame->ConsumeRetvals(rets, allow_dead_tensors);
@@ -793,7 +965,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
               delete frame;
               if (!s.ok()) {
                 delete remote_args;
-                delete exec_args;
                 done(s);
                 return;
               }
@@ -801,9 +972,9 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
                   target_device, source_device, "ret_", target_incarnation,
                   *rets, device_context, rets_alloc_attrs, rendezvous);
               delete remote_args;
-              delete exec_args;
               done(s);
             });
+        delete exec_args;
       });
 }
 
@@ -826,7 +997,8 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     };
   }
 
-  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
+  LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
+  if (local_handle == kInvalidLocalHandle) {
     parent_->Run(run_opts, handle, args, rets, done);
     return;
   }
@@ -836,54 +1008,43 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   }
   DCHECK(run_opts.runner != nullptr);
 
-  Executor::Args* exec_args = new Executor::Args;
-  // Inherit the step_id from the caller.
-  exec_args->step_id = run_opts.step_id;
-  exec_args->rendezvous = run_opts.rendezvous;
-  exec_args->stats_collector = run_opts.stats_collector;
-  exec_args->cancellation_manager = run_opts.cancellation_manager;
-  exec_args->step_container = run_opts.step_container;
-  exec_args->runner = *run_opts.runner;
-  exec_args->collective_executor = run_opts.collective_executor;
-
   Item* item = nullptr;
-  Status s = GetOrCreateItem(handle, &item);
+  Status s = GetOrCreateItem(local_handle, &item);
   if (!s.ok()) {
-    delete exec_args;
     done(s);
     return;
   }
 
   if (run_opts.remote_execution) {
     // NOTE(mrry): `RunRemote()` will set `exec_args->call_frame` for us.
-    RunRemote(run_opts, handle, args, rets, exec_args, item, done);
+    RunRemote(run_opts, handle, args, rets, item, done);
     return;
   }
 
   const FunctionBody* fbody = GetFunctionBody(handle);
   FunctionCallFrame* frame =
       new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
-  exec_args->call_frame = frame;
   s = frame->SetArgs(args);
   if (!s.ok()) {
     delete frame;
-    delete exec_args;
     done(s);
     return;
   }
 
-  bool allow_dead_tensors = opts.allow_dead_tensors;
+  Executor::Args exec_args;
+  ExecutorArgsFromOptions(run_opts, frame, &exec_args);
+
+  bool allow_dead_tensors = run_opts.allow_dead_tensors;
   item->exec->RunAsync(
       // Executor args
-      *exec_args,
+      exec_args,
       // Done callback.
-      [frame, rets, done, exec_args, allow_dead_tensors](const Status& status) {
+      [frame, rets, done, allow_dead_tensors](const Status& status) {
         Status s = status;
         if (s.ok()) {
           s = frame->ConsumeRetvals(rets, allow_dead_tensors);
         }
         delete frame;
-        delete exec_args;
         done(s);
       });
 }
@@ -895,8 +1056,8 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     done(errors::Cancelled(""));
     return;
   }
-  if (!parent_->IsInstantiatedOnDevice(device_name_, handle) ||
-      opts.remote_execution) {
+  LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
+  if (local_handle == kInvalidLocalHandle || opts.remote_execution) {
     done(errors::Unimplemented("Remote calling with CallFrameInterface"));
     return;
   }
@@ -917,7 +1078,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   }
 
   Item* item = nullptr;
-  Status s = GetOrCreateItem(handle, &item);
+  Status s = GetOrCreateItem(local_handle, &item);
   if (!s.ok()) {
     done(s);
     return;
@@ -928,16 +1089,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   DCHECK(run_opts.runner != nullptr);
 
   Executor::Args exec_args;
-  // Inherit the step_id from the caller.
-  exec_args.step_id = run_opts.step_id;
-  exec_args.rendezvous = run_opts.rendezvous;
-  exec_args.stats_collector = run_opts.stats_collector;
-  exec_args.cancellation_manager = run_opts.cancellation_manager;
-  exec_args.collective_executor = run_opts.collective_executor;
-  exec_args.step_container = run_opts.step_container;
-  exec_args.runner = *run_opts.runner;
-  exec_args.call_frame = frame;
-
+  ExecutorArgsFromOptions(run_opts, frame, &exec_args);
   item->exec->RunAsync(exec_args, std::move(done));
 }
 
@@ -949,7 +1101,8 @@ bool FunctionLibraryRuntimeImpl::IsStateful(const string& func) {
 
 string FunctionLibraryRuntimeImpl::DebugString(Handle handle) {
   Item* item = nullptr;
-  Status s = GetOrCreateItem(handle, &item);
+  LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
+  Status s = GetOrCreateItem(local_handle, &item);
   if (s.ok()) {
     return tensorflow::DebugString(item->graph);
   } else {
@@ -1484,9 +1637,9 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
   this->ret_nodes.resize(ret_types.size());
   for (Node* n : this->graph->op_nodes()) {
     gtl::InlinedVector<Node*, 4>* node_vec;
-    if (n->type_string() == kRetOp) {
+    if (n->type_string() == kRetOp || n->type_string() == kDeviceRetOp) {
       node_vec = &this->ret_nodes;
-    } else if (n->type_string() == kArgOp) {
+    } else if (n->type_string() == kArgOp || n->type_string() == kDeviceArgOp) {
       node_vec = &this->arg_nodes;
     } else {
       continue;
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 7bab9be9a622965fcbc583ef4bf196d0735a02ee..cab95cb596858f99285c3cfc5673f87b70368a32 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -18,10 +18,15 @@ limitations under the License.
 #include <atomic>
 #include <utility>
 
+#include "absl/memory/memory.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/cc/ops/array_ops_internal.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/functional_ops.h"
+#include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/constant_folding.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/executor.h"
@@ -143,14 +148,15 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 3});
+    std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(
-        options, "/job:localhost/replica:0/task:0", &devices_));
+        options, "/job:localhost/replica:0/task:0", &devices));
 
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
-    device_mgr_.reset(new DeviceMgr(devices_));
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
     pflr_.reset(new ProcessFunctionLibraryRuntime(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
         opts, default_thread_pool, nullptr /* cluster_flr */));
@@ -354,7 +360,6 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
   FunctionLibraryRuntime* flr0_;
   FunctionLibraryRuntime* flr1_;
   FunctionLibraryRuntime* flr2_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
@@ -432,6 +437,57 @@ TEST_F(FunctionLibraryRuntimeTest, XTimesNInOverlayLib) {
            "Not found: Function XTimesTwo is not defined.");
 }
 
+TEST_F(FunctionLibraryRuntimeTest, XTimesNInOverlayLibAndDelayedInstantiation) {
+  using FDH = ::tensorflow::FunctionDefHelper;
+
+  Init({});
+
+  FunctionDef xt4_override = test::function::XTimesTwo();
+  xt4_override.mutable_signature()->set_name("XTimesFour");
+
+  // Call XTimesFour via PartitionedCall which delays functions instantiation
+  // to the first call to Compute/ComputeAsync.
+  FunctionDef my_xt4 = FunctionDefHelper::Create(
+      "MyXTimesFour", {"x:float"}, {"z:float"}, {},
+      {{{"x_times_four"},
+        "PartitionedCall",
+        {"x"},
+        {{"Tin", DataTypeSlice({DT_FLOAT})},
+         {"Tout", DataTypeSlice({DT_FLOAT})},
+         {"f", FDH::FunctionRef("XTimesFour", {{"T", DT_FLOAT}})}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "x_times_four:output:0"}});
+
+  FunctionDefLibrary lib;
+  *lib.add_function() = test::function::XTimesTwo();
+  *lib.add_function() = test::function::XTimesFour();
+  *lib.add_function() = my_xt4;
+  std::unique_ptr<FunctionLibraryDefinition> overlay_lib(
+      new FunctionLibraryDefinition(OpRegistry::Global(), lib));
+
+  FunctionLibraryRuntime::InstantiateOptions options;
+  options.overlay_lib = overlay_lib.get();
+
+  auto x = test::AsTensor<float>({1, 2, 3, 4});
+  Tensor y;
+
+  // When we instantiate with default library overlay we should get x*4.
+  TF_CHECK_OK(InstantiateAndRun(flr0_, "MyXTimesFour", {}, options, {x}, {&y}));
+  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({4, 8, 12, 16}));
+
+  // Overlay library that overrides default XTimesFour with XTimesTwo body.
+  FunctionDefLibrary lib_override;
+  *lib_override.add_function() = xt4_override;
+  *lib_override.add_function() = my_xt4;
+  std::unique_ptr<FunctionLibraryDefinition> overlay_lib_override(
+      new FunctionLibraryDefinition(OpRegistry::Global(), lib_override));
+
+  // We should call the XTimesFour override which is actually x*2.
+  options.overlay_lib = overlay_lib_override.get();
+  TF_CHECK_OK(InstantiateAndRun(flr0_, "MyXTimesFour", {}, options, {x}, {&y}));
+  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+}
+
 TEST_F(FunctionLibraryRuntimeTest, StateHandle) {
   auto T = DT_INT32;
 
@@ -584,7 +640,28 @@ TEST_F(FunctionLibraryRuntimeTest, ExecutorFactory) {
              "Internal: This is a dummy.");
   }
 
-  // Test that non-existent exector types trigger an error.
+  // Test that a non-default executor factory can be invoked via an attr.
+  {
+    FunctionLibraryRuntime::InstantiateOptions options;
+    HasError(InstantiateAndRun(flr0_, "XTimesTwo",
+                               {{"T", DT_FLOAT}, {"_executor", "DUMMY"}},
+                               options, {x}, {&y}),
+             "Internal: This is a dummy.");
+  }
+
+  // Test that a non-default executor factory specified via an
+  // `InstantiateOptions` supersedes the attr when both are present.
+  {
+    FunctionLibraryRuntime::InstantiateOptions options;
+    options.executor_type = "DUMMY";
+    HasError(
+        InstantiateAndRun(flr0_, "XTimesTwo",
+                          {{"T", DT_FLOAT}, {"_executor", "UNKNOWN_EXECUTOR"}},
+                          options, {x}, {&y}),
+        "Internal: This is a dummy.");
+  }
+
+  // Test that non-existent executor types trigger an error.
   {
     FunctionLibraryRuntime::InstantiateOptions options;
     options.executor_type = "UNKNOWN_EXECUTOR";
@@ -593,6 +670,15 @@ TEST_F(FunctionLibraryRuntimeTest, ExecutorFactory) {
              "Not found: No executor factory registered for the given executor "
              "type: UNKNOWN_EXECUTOR");
   }
+  {
+    FunctionLibraryRuntime::InstantiateOptions options;
+    HasError(
+        InstantiateAndRun(flr0_, "XTimesTwo",
+                          {{"T", DT_FLOAT}, {"_executor", "UNKNOWN_EXECUTOR"}},
+                          options, {x}, {&y}),
+        "Not found: No executor factory registered for the given executor "
+        "type: UNKNOWN_EXECUTOR");
+  }
 }
 
 TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
@@ -858,18 +944,51 @@ TEST_F(FunctionLibraryRuntimeTest, PruneBody) {
   EXPECT_EQ(expected_node_names, executed_node_names);
 }
 
+// Constant folding generates names using a global counter.
+// This function invokes constant folding and parses the counter
+// from the generated node name.
+int GetConstantFoldingCounter() {
+  Graph g(OpRegistry::Global());
+  Scope s = Scope::NewRootScope();
+  auto a = ops::Const<float>(s, {1.0}, {});
+  auto b = ops::Const<float>(s, {2.0}, {});
+
+  auto add = ops::Add(s.WithOpName("add"), a, b);
+  auto send =
+      ops::_Send(s.WithOpName("s1"), add, "add", "sender", 0, "receiver");
+
+  TF_CHECK_OK(s.ToGraph(&g));
+  bool was_mutated;
+  ConstantFoldingOptions opt{};
+  TF_CHECK_OK(
+      ConstantFold(opt, nullptr, Env::Default(), nullptr, &g, &was_mutated));
+  GraphDef def;
+  g.ToGraphDef(&def);
+  for (const NodeDef& node : def.node()) {
+    if (absl::StartsWith(node.name(), "add/")) {
+      std::vector<std::string> v = absl::StrSplit(node.name(), "__cf__");
+      CHECK_GT(v.size(), 1);
+      int counter;
+      CHECK(absl::SimpleAtoi(v[v.size() - 1], &counter));
+      return counter;
+    }
+  }
+  LOG(FATAL) << "Should have found a node that replcaed add";
+}
+
 TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
   std::unique_ptr<Graph> g = GetFuncBody(flr0_, "XTimes16", {{"T", DT_FLOAT}});
   ASSERT_TRUE(g != nullptr);
   ExpandInlineFunctions(flr0_, g.get());
+  int cf_counter = GetConstantFoldingCounter();
   OptimizeGraph(flr0_, &g);
   {
     Scope s = Scope::NewRootScope();
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto x4_x2_scale = ops::Const<float>(
-        s.WithOpName("x4/x2/scale/_12__cf__10")
+        s.WithOpName("x4/x2/scale/_12__cf__" + std::to_string(cf_counter + 1))
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
@@ -1069,20 +1188,20 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
     TF_EXPECT_GRAPH_EQ(expected, actual);
   }
 
+  int cf_counter = GetConstantFoldingCounter();
   OptimizeGraph(flr0_, &g);
-
   {
     Scope s = Scope::NewRootScope();
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
     auto scale = ops::Const(
-        s.WithOpName("scale/_6__cf__15")
+        s.WithOpName("scale/_6__cf__" + std::to_string(cf_counter + 2))
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale);
     auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x);
     auto const0 = ops::Const(
-        s.WithOpName("Func/_1/sy/_5__cf__14")
+        s.WithOpName("Func/_1/sy/_5__cf__" + std::to_string(cf_counter + 1))
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         0, {0});
     auto func1_rx = ops::internal::BroadcastGradientArgs(
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 655a68cfc936c739fd9d90d0e39b46afb2bb1f45..1b803736fb881c8f133198ab39e5801a357c5659 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -54,21 +54,19 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 3});
+    std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(
-        options, "/job:localhost/replica:0/task:0", &devices_));
+        options, "/job:localhost/replica:0/task:0", &devices));
 
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
-    device_mgr_.reset(new DeviceMgr(devices_));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     pflr_.reset(new ProcessFunctionLibraryRuntime(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
         opts, default_thread_pool, nullptr /* cluster_flr */));
     flr0_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
-    flr1_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:1");
-    flr2_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:2");
-    fdef_lib_ = lib_def_->ToProto();
   }
 
   Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
@@ -192,13 +190,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
   }
 
   FunctionLibraryRuntime* flr0_;
-  FunctionLibraryRuntime* flr1_;
-  FunctionLibraryRuntime* flr2_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  FunctionDefLibrary fdef_lib_;
 };
 
 TEST_F(FunctionLibraryRuntimeTest, DefaultThreadpool) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index d8ebdeff5d21c9efcd5cc30d1e4324f11a81d4b7..5152d97fdefed688ba05043072ff6df635471ed9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -84,13 +84,13 @@ namespace tensorflow {
 // corresponding stream have completed.  The following two classes
 // serve this purpose in two different compilation environments.
 
-class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
+class EigenGpuStreamDevice : public ::Eigen::StreamInterface {
  public:
-  EigenCudaStreamDevice()
+  EigenGpuStreamDevice()
       : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
     Eigen::initializeDeviceProp();
   }
-  ~EigenCudaStreamDevice() override {}
+  ~EigenGpuStreamDevice() override {}
   void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
                     TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
                     char* scratch) {
@@ -101,7 +101,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
     context_ = context;
     scratch_ = scratch;
     semaphore_ =
-        reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
+        reinterpret_cast<unsigned int*>(scratch + Eigen::kGpuScratchSize);
     stream_ = cuda_stream;
     allocator_ = alloc;
     PlatformGpuId platform_gpu_id;
@@ -185,7 +185,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
   mutable unsigned int* semaphore_;
   OpKernelContext* context_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
+  TF_DISALLOW_COPY_AND_ASSIGN(EigenGpuStreamDevice);
 };
 
 // This factory helps to ensure that different GPU device objects that refer to
@@ -292,7 +292,7 @@ Status BaseGPUDevice::InitScratchBuffers() {
       DCHECK(streams_[i]);
       if (scratch_.size() > i && scratch_[i]) continue;
       size_t scratch_buffer_size =
-          Eigen::kCudaScratchSize + sizeof(unsigned int);
+          Eigen::kGpuScratchSize + sizeof(unsigned int);
       void* scratch_buffer = gpu_allocator_->AllocateRaw(
           Allocator::kAllocatorAlignment, scratch_buffer_size);
       if (scratch_buffer == nullptr) {
@@ -304,7 +304,7 @@ Status BaseGPUDevice::InitScratchBuffers() {
           se::DeviceMemoryBase(scratch_buffer, scratch_buffer_size));
 
       bool ok = executor_->SynchronousMemZero(
-          &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
+          &mem, Eigen::kGpuScratchSize + sizeof(unsigned int));
       if (!ok) {
         return errors::FailedPrecondition(
             "Failed to memcopy into scratch buffer for device ",
@@ -692,7 +692,7 @@ class ConcretePerOpGpuDevice : public PerOpGpuDevice {
   const Eigen::GpuDevice& device() const override { return device_; }
 
  private:
-  EigenCudaStreamDevice stream_device_;
+  EigenGpuStreamDevice stream_device_;
   Eigen::GpuDevice device_;
 };
 
@@ -907,9 +907,9 @@ Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
 const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
 const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;
 
-Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
-                                           const string& name_prefix,
-                                           std::vector<Device*>* devices) {
+Status BaseGPUDeviceFactory::CreateDevices(
+    const SessionOptions& options, const string& name_prefix,
+    std::vector<std::unique_ptr<Device>>* devices) {
   TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
   se::Platform* gpu_manager = GPUMachineManager();
   if (gpu_manager == nullptr) {
@@ -1073,12 +1073,10 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
   // LINT.ThenChange(//tensorflow/python/platform/test.py)
 }
 
-Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
-                                             const string& name_prefix,
-                                             TfGpuId tf_gpu_id,
-                                             int64 memory_limit,
-                                             const DeviceLocality& dev_locality,
-                                             std::vector<Device*>* devices) {
+Status BaseGPUDeviceFactory::CreateGPUDevice(
+    const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
+    int64 memory_limit, const DeviceLocality& dev_locality,
+    std::vector<std::unique_ptr<Device>>* devices) {
   CHECK_GE(tf_gpu_id.value(), 0);
   const string device_name =
       strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
@@ -1108,7 +1106,7 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
   // different (which should be an error).
   //
   // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
-  BaseGPUDevice* gpu_device = CreateGPUDevice(
+  std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
       options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
       tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc),
       gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
@@ -1116,7 +1114,7 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
             << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
             << GetShortDeviceDescription(platform_gpu_id, desc) << ")";
   TF_RETURN_IF_ERROR(gpu_device->Init(options));
-  devices->push_back(gpu_device);
+  devices->push_back(std::move(gpu_device));
 
   return Status::OK();
 }
@@ -1169,6 +1167,7 @@ Status BaseGPUDeviceFactory::GetDeviceLocalities(
     int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
     LocalityMap* localities) {
   std::vector<TfGpuId> all_tf_gpu_ids;
+  all_tf_gpu_ids.reserve(num_tf_gpus);
   for (int i = 0; i < num_tf_gpus; ++i) {
     all_tf_gpu_ids.push_back(TfGpuId(i));
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 674e8384d5eaac9946a3882ca7c1c7655229b8c4..d002d02c51d073ef3019fa1659d555b5d092d883 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -166,7 +166,7 @@ class BaseGPUDevice : public LocalDevice {
 class BaseGPUDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override;
+                       std::vector<std::unique_ptr<Device>>* devices) override;
 
   struct InterconnectMap {
     // Name of interconnect technology, if known.
@@ -207,15 +207,13 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   Status CreateGPUDevice(const SessionOptions& options,
                          const string& name_prefix, TfGpuId tf_gpu_id,
                          int64 memory_limit, const DeviceLocality& dev_locality,
-                         std::vector<Device*>* devices);
-
-  virtual BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
-                                         const string& name, Bytes memory_limit,
-                                         const DeviceLocality& dev_locality,
-                                         TfGpuId tf_gpu_id,
-                                         const string& physical_device_desc,
-                                         Allocator* gpu_allocator,
-                                         Allocator* cpu_allocator) = 0;
+                         std::vector<std::unique_ptr<Device>>* devices);
+
+  virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
+      const SessionOptions& options, const string& name, Bytes memory_limit,
+      const DeviceLocality& dev_locality, TfGpuId tf_gpu_id,
+      const string& physical_device_desc, Allocator* gpu_allocator,
+      Allocator* cpu_allocator) = 0;
 
   // Returns into 'ids' the list of valid platform GPU ids, in the order that
   // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index e1aaf95df6de07c8d12f2c443f0b6bfd6a99a968..8dc719732927880e6ebb628962160c4a90b1f25c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -59,15 +59,14 @@ class GPUDevice : public BaseGPUDevice {
 
 class GPUDeviceFactory : public BaseGPUDeviceFactory {
  private:
-  BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
-                                 const string& name, Bytes memory_limit,
-                                 const DeviceLocality& locality,
-                                 TfGpuId tf_gpu_id,
-                                 const string& physical_device_desc,
-                                 Allocator* gpu_allocator,
-                                 Allocator* cpu_allocator) override {
-    return new GPUDevice(options, name, memory_limit, locality, tf_gpu_id,
-                         physical_device_desc, gpu_allocator, cpu_allocator);
+  std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
+      const SessionOptions& options, const string& name, Bytes memory_limit,
+      const DeviceLocality& locality, TfGpuId tf_gpu_id,
+      const string& physical_device_desc, Allocator* gpu_allocator,
+      Allocator* cpu_allocator) override {
+    return absl::make_unique<GPUDevice>(options, name, memory_limit, locality,
+                                        tf_gpu_id, physical_device_desc,
+                                        gpu_allocator, cpu_allocator);
   }
 };
 
@@ -108,7 +107,7 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
 class GPUCompatibleCPUDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override {
+                       std::vector<std::unique_ptr<Device>>* devices) override {
     int n = 1;
     auto iter = options.config.device_count().find("CPU");
     if (iter != options.config.device_count().end()) {
@@ -116,7 +115,7 @@ class GPUCompatibleCPUDeviceFactory : public DeviceFactory {
     }
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:CPU:", i);
-      devices->push_back(new GPUCompatibleCPUDevice(
+      devices->push_back(absl::make_unique<GPUCompatibleCPUDevice>(
           options, name, Bytes(256 << 20), DeviceLocality(), cpu_allocator()));
     }
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
index 75be6d60b86af101fb9de7497490e72c523d632b..58656ec7576ef92122f2855acf2b544a30d00573 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
@@ -33,7 +33,7 @@ namespace {
 
 TEST(GPUDeviceOnNonGPUMachineTest, CreateGPUDevicesOnNonGPUMachine) {
   SessionOptions opts;
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, "/job:localhost/replica:0/task:0", &devices));
   EXPECT_TRUE(devices.empty());
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index 36294094e9ad88fb45832c0295d07c9c1dbf5c6b..ae623b2adbe152de6cbad248db234ac5469f83e1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -88,7 +88,7 @@ class GPUDeviceTest : public ::testing::Test {
 
 TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
   SessionOptions opts = MakeSessionOptions("0,abc");
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -97,7 +97,7 @@ TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
 
 TEST_F(GPUDeviceTest, InvalidGpuId) {
   SessionOptions opts = MakeSessionOptions("100");
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -107,7 +107,7 @@ TEST_F(GPUDeviceTest, InvalidGpuId) {
 
 TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
   SessionOptions opts = MakeSessionOptions("0,0");
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -117,7 +117,7 @@ TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
 
 TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
   SessionOptions opts = MakeSessionOptions("0", 0.1, 1, {{}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -129,7 +129,7 @@ TEST_F(GPUDeviceTest, GpuDeviceCountTooSmall) {
   // device_count is 0, but with one entry in visible_device_list and one
   // (empty) VirtualDevices messages.
   SessionOptions opts = MakeSessionOptions("0", 0, 0, {{}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::UNKNOWN);
@@ -141,7 +141,7 @@ TEST_F(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
   // Single entry in visible_device_list with two (empty) VirtualDevices
   // messages.
   SessionOptions opts = MakeSessionOptions("0", 0, 8, {{}, {}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::UNKNOWN);
@@ -155,7 +155,7 @@ TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
   // Three entries in visible_device_list with two (empty) VirtualDevices
   // messages.
   SessionOptions opts = MakeSessionOptions("0,1", 0, 8, {{}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -169,39 +169,36 @@ TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
 TEST_F(GPUDeviceTest, EmptyVirtualDeviceConfig) {
   // It'll create single virtual device when the virtual device config is empty.
   SessionOptions opts = MakeSessionOptions("0");
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
-  gtl::STLDeleteElements(&devices);
 }
 
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
   // It'll create single virtual device for the gpu in question when
   // memory_limit_mb is unset.
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
-  gtl::STLDeleteElements(&devices);
 }
 
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
-  gtl::STLDeleteElements(&devices);
 }
 
 TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(2, devices.size());
@@ -219,7 +216,6 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
             devices[1]->attributes().locality().links().link(0).type());
   EXPECT_EQ(BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength,
             devices[1]->attributes().locality().links().link(0).strength());
-  gtl::STLDeleteElements(&devices);
 }
 
 // Enabling unified memory on pre-Pascal GPUs results in an initialization
@@ -236,7 +232,7 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
   opts.config.mutable_gpu_options()
       ->mutable_experimental()
       ->set_use_unified_memory(true);
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INTERNAL);
@@ -259,7 +255,7 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
   }
 
   SessionOptions opts = MakeSessionOptions("0", kGpuMemoryFraction);
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_ASSERT_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   ASSERT_EQ(1, devices.size());
@@ -278,8 +274,6 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
                                      (memory_limit >> 20) << 20);
   EXPECT_NE(ptr, nullptr);
   allocator->DeallocateRaw(ptr);
-
-  gtl::STLDeleteElements(&devices);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index d2adf699f524ef6771da6b0a41e7fc552d2bbdfa..fe3214755715a896b472835652be68c5ef65a6e9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -78,7 +78,8 @@ static std::atomic_int_fast64_t live_tensor_bytes(0);
 // A TensorBuffer that counts live memory usage for testing
 class TestTensorBuffer : public TensorBuffer {
  public:
-  explicit TestTensorBuffer(size_t bytes) : bytes_(bytes) {
+  explicit TestTensorBuffer(size_t bytes)
+      : TensorBuffer(nullptr), bytes_(bytes) {
     live_tensor_bytes += bytes_;
   }
   ~TestTensorBuffer() override { live_tensor_bytes -= bytes_; }
@@ -86,7 +87,6 @@ class TestTensorBuffer : public TensorBuffer {
   size_t size() const override { return bytes_; }
 
   // Not used in this test
-  void* data() const override { return nullptr; }
   TensorBuffer* root_buffer() override { return nullptr; }
   void FillAllocationDescription(AllocationDescription* arg) const override {}
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 3e95374fda89cd14660fa6974789c17be522bb03..8167cfb9d7dc6cd91a17323b3083d1823cbaa5e0 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -55,35 +55,25 @@ bool useCudaMemoryGuardAllocator() {
 
 }  // namespace
 
-GPUProcessState* GPUProcessState::instance_ = nullptr;
-
-/*static*/ GPUProcessState* GPUProcessState::singleton() {
-  if (instance_ == nullptr) {
-    instance_ = new GPUProcessState;
-  }
-  CHECK(instance_->process_state_);
-
-  return instance_;
+/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
+  static GPUProcessState* instance = ps ? ps : new GPUProcessState;
+  DCHECK((!ps) || (ps == instance))
+      << "Multiple calls to GPUProcessState with non-null ps";
+  return instance;
 }
 
 GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
-  CHECK(instance_ == nullptr);
-  instance_ = this;
   process_state_ = ProcessState::singleton();
 }
 
-// Normally the GPUProcessState singleton is never explicitly deleted.
-// This function is defined for debugging problems with the allocators.
-GPUProcessState::~GPUProcessState() {
-  CHECK_EQ(this, instance_);
-  instance_ = nullptr;
-}
-
 int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
   // Return the NUMA node associated with the GPU's StreamExecutor.
   se::StreamExecutor* se =
       GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
-  return se->GetDeviceDescription().numa_node();
+  int numa_node = se->GetDeviceDescription().numa_node();
+  // bus_id must be non-negative.  If the numa_node is not known,
+  // use 0.
+  return numa_node >= 0 ? numa_node : 0;
 }
 
 Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
@@ -110,6 +100,7 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
     PlatformGpuId platform_gpu_id;
     TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
     int bus_id = BusIdForGPU(tf_gpu_id);
+    DCHECK_GE(bus_id, 0);
     while (bus_id >= gpu_visitors_.size()) {
       gpu_visitors_.push_back({});
     }
@@ -166,7 +157,9 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
       !process_state_->ProcessState::FLAGS_brain_mem_reg_cuda_dma) {
     return process_state_->GetCPUAllocator(numa_node);
   }
-  CHECK_GE(numa_node, 0);
+  if (numa_node == port::kNUMANoAffinity) {
+    numa_node = 0;
+  }
   {
     // Here we optimize the most common use case where cuda_host_allocators_
     // and cuda_al_ have already been populated and since we're only reading
@@ -260,6 +253,7 @@ void GPUProcessState::AddGPUAllocVisitor(int bus_id,
   CHECK(gpu_allocators_.empty())  // Crash OK
       << "AddGPUAllocVisitor must be called before "
          "first call to GetGPUAllocator.";
+  DCHECK_GE(bus_id, 0);
   while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
     gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index 43e9a316604006bb20f5ff171730f4b2ddc7e3d6..df51c10c8065fa94d736c8f4dfa76faebdc8bc62 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -37,7 +37,19 @@ class PoolAllocator;
 // Singleton that manages per-process state when GPUs are present.
 class GPUProcessState {
  public:
-  static GPUProcessState* singleton();
+  // If ps == nullptr, returns pointer to the single instance of this class to
+  // be used within this process.
+  //
+  // If ps != nullptrs, accepts a value to be returned by all subsequent calls.
+  // A non-null ps may ONLY be provided during program static storage
+  // initialization.  Must not be called more than once with a non-null ps.
+  //
+  // If a derived class of GPUProcessState is ever used in a process, it must
+  // always be used in place of this class.  In order to ensure that existing
+  // calls to GPUProcessState::singleton() all resolve to the derived instance
+  // instead, this function must be called once during startup, supplying the
+  // derived instance value, prior to any accessor call to this function.
+  static GPUProcessState* singleton(GPUProcessState* ps = nullptr);
 
   // Query whether any GPU device has been created so far.
   // Disable thread safety analysis since a race is benign here.
@@ -97,7 +109,11 @@ class GPUProcessState {
   virtual int BusIdForGPU(TfGpuId tf_gpu_id);
 
  protected:
+  // GPUProcessState is a singleton that should not normally be deleted except
+  // at process shutdown.
   GPUProcessState();
+  virtual ~GPUProcessState() {}
+  friend class GPUDeviceTest;
 
   // Helper method for unit tests to reset the ProcessState singleton by
   // cleaning up everything. Never use in production.
@@ -127,10 +143,6 @@ class GPUProcessState {
       GUARDED_BY(mu_);
   std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_
       GUARDED_BY(mu_);
-
-  virtual ~GPUProcessState();
-
-  friend class GPUDeviceTest;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 4475fa979eb39f3da7e6f95caf8174c30d0f4ce2..880806f120d010a812bbced62409a1ff5ed8e9d7 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -25,9 +25,11 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
@@ -37,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/graph/validate.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -76,7 +79,7 @@ GraphExecutionState::~GraphExecutionState() {
     GraphDef* graph_def, const GraphExecutionStateOptions& options,
     std::unique_ptr<GraphExecutionState>* out_state) {
 #ifndef __ANDROID__
-  VLOG(4) << "Graph proto is " << graph_def->DebugString();
+  VLOG(4) << "Graph proto is \n" << graph_def->DebugString();
 #endif  // __ANDROID__
 
   std::unique_ptr<GraphExecutionState> ret(
@@ -393,6 +396,42 @@ Status ValidateFeedAndFetchDevices(
   }
   return Status::OK();
 }
+
+Status GetFeedShapeAndTypeFromAttribute(const NodeDef& node,
+                                        PartialTensorShape* shape,
+                                        DataType* type) {
+  static const gtl::FlatSet<string>* const kHasExplicitShapeAttribute =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
+          "Placeholder", "PlaceholderV2", "PlaceholderWithDefault",
+          "ParallelConcat", "ImmutableConst", "_ParallelConcatStart",
+          "InfeedDequeue", "OutfeedDequeue", "CollectiveBcastSend",
+          "CollectiveBcastRecv", "AccumulateNV2", "VariableV2", "Variable",
+          "TemporaryVariable", "NcclBroadcast", "_ScopedAllocator",
+          "_ScopedAllocatorConcat"}));
+
+  // All the node types handled here have their output datatype set in
+  // either attribute 'dtype' or 'T'.
+  if (!GetNodeAttr(node, "dtype", type).ok() &&
+      !GetNodeAttr(node, "T", type).ok()) {
+    return errors::InvalidArgument(
+        "Could not determine output type for feed node: ", node.name(),
+        " of type ", node.op());
+  }
+
+  // First handle the case of feeding a const node.
+  if (node.op() == "Const" && HasNodeAttr(node, "value")) {
+    *shape =
+        PartialTensorShape(node.attr().at("value").tensor().tensor_shape());
+  } else if (kHasExplicitShapeAttribute->find(node.op()) !=
+             kHasExplicitShapeAttribute->end()) {
+    TF_RETURN_IF_ERROR(GetNodeAttr(node, "shape", shape));
+  } else {
+    return errors::InvalidArgument("Could not determine shape for feed node: ",
+                                   node.name(), " of type ", node.op());
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 Status GraphExecutionState::PruneGraph(
@@ -531,7 +570,8 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
       OptimizationPassRegistry::PRE_PLACEMENT, optimization_options));
 
-  Placer placer(new_graph.get(), device_set_, session_options_);
+  Placer placer(new_graph.get(), device_set_, session_options_,
+                /* default_device= */ nullptr);
   // TODO(mrry): Consider making the Placer cancelable.
   TF_RETURN_IF_ERROR(placer.Run());
 
@@ -551,16 +591,17 @@ Status GraphExecutionState::OptimizeGraph(
     return errors::InvalidArgument("Can't optimize a pruned graph");
   }
 
-  const RewriterConfig& rewrite_options =
-      session_options_->config.graph_options().rewrite_options();
-
-  if (grappler::MetaOptimizerEnabled(rewrite_options)) {
-    // Adding this functionality in steps. The first step is to make sure
-    // we don't break dependencies. The second step will be to turn the
-    // functionality on by default.
+  if (grappler::MetaOptimizerEnabled(session_options_->config)) {
     grappler::GrapplerItem item;
     item.id = "tf_graph";
     graph_->ToGraphDef(&item.graph);
+
+    // It's ok to skip invalid device annotations in Grappler.
+    Status inferred_devices = item.InferDevicesFromGraph();
+    if (!inferred_devices.ok()) {
+      VLOG(3) << inferred_devices.error_message();
+    }
+
     // TODO(b/114748242): Add a unit test to test this bug fix.
     if (flib_def_) {
       *item.graph.mutable_library() = flib_def_->ToProto();
@@ -601,26 +642,30 @@ Status GraphExecutionState::OptimizeGraph(
         if (feeds.find(node.name()) == feeds.end()) {
           continue;
         }
-        if (node.attr().count("dtype") == 0 ||
-            node.attr().count("shape") == 0) {
-          return errors::InvalidArgument("Missing node shape or type");
-        }
-        TensorShapeProto shape_proto(node.attr().at("shape").shape());
-        // If the shape of the placeholder value is only partially known,
-        // we're free to use any dimension we want to feed the placeholder. We
-        // choose 1 to minimize the memory impact. Note that this only matters
-        // if an optimizer choose to run the graph to build its cost model,
-        // which doesn't happen (yet)
-        if (shape_proto.unknown_rank()) {
-          shape_proto.set_unknown_rank(false);
-        }
-        for (auto& dim : *shape_proto.mutable_dim()) {
-          if (dim.size() < 0) {
-            dim.set_size(1);
+        // Get the type and shape of the feed node.
+        PartialTensorShape partial_shape;
+        DataType type;
+        TF_RETURN_IF_ERROR(
+            GetFeedShapeAndTypeFromAttribute(node, &partial_shape, &type));
+        // If the shape of the placeholder is only partially known, we are free
+        // to set unknown dimensions of its shape to any value we desire. We
+        // choose 0 to minimize the memory impact. Note that this only matters
+        // if an optimizer chooses to run the graph.
+        TensorShape shape;
+        if (partial_shape.unknown_rank()) {
+          shape = TensorShape({0});
+        } else {
+          for (int i = 0; i < partial_shape.dims(); ++i) {
+            if (partial_shape.dim_size(i) < 0) {
+              partial_shape.set_dim(i, 0);
+            }
+          }
+          if (!partial_shape.AsTensorShape(&shape)) {
+            return errors::InvalidArgument(
+                "Could not derive shape for feed node: ", node.DebugString());
           }
         }
-        TensorShape shape(shape_proto);
-        DataType type = node.attr().at("dtype").type();
+
         Tensor fake_input(type, shape);
         item.feed.emplace_back(node.name(), fake_input);
       }
@@ -637,7 +682,7 @@ Status GraphExecutionState::OptimizeGraph(
     grappler::VirtualCluster cluster(device_set_);
     GraphDef new_graph;
     TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer(
-        item, rewrite_options, cpu_device, &cluster, &new_graph));
+        item, session_options_->config, cpu_device, &cluster, &new_graph));
 
     // Merge optimized graph function library with an original library.
     // Optimized graph might have new functions specialized for it's
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index 91194bc86f023b6dfe5c08038839649c79f571a2..37a979a8f1929ed6312dc79354a3c206f7c4c5f4 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -39,8 +39,7 @@ void GraphOptimizer::Optimize(
     const std::unordered_map<string, std::vector<PartialTensorShape>>*
         shape_map,
     const std::function<bool(const Node*)>& cse_consider_fn,
-    const std::function<bool(const Node*)>& cf_consider_fn,
-    bool cf_disable_memory_output_type_check) {
+    const std::function<bool(const Node*)>& cf_consider_fn) {
   Graph* g = graph->get();
   DumpGraph("Initial", g);
 
@@ -65,8 +64,6 @@ void GraphOptimizer::Optimize(
       ConstantFoldingOptions cf_opts;
       cf_opts.shape_map = shape_map;
       cf_opts.consider = cf_consider_fn;
-      cf_opts.disable_memory_output_type_check =
-          cf_disable_memory_output_type_check;
       if (opts_.max_folded_constant_in_bytes() > 0) {
         cf_opts.max_constant_size_in_bytes =
             opts_.max_folded_constant_in_bytes();
diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h
index 8954e9612df6a269f4fb2fead5d928a08db1ff8e..789cc5694219e1386bde0fb1821dfdc9928523f1 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.h
+++ b/tensorflow/core/common_runtime/graph_optimizer.h
@@ -47,16 +47,13 @@ class GraphOptimizer {
   // returns true will be considered for CSE.
   // If cf_consider_fn is not null then only nodes for which cf_consider_fn
   // returns true will be considered for CF.
-  // If cf_disable_memory_output_type_check is true, CF will discard output
-  // memory type check for constant node replacement.
   void Optimize(
       FunctionLibraryRuntime* runtime, Env* env, Device* device,
       std::unique_ptr<Graph>* graph,
       const std::unordered_map<string, std::vector<PartialTensorShape>>*
           shape_map,
       const std::function<bool(const Node*)>& cse_consider_fn = nullptr,
-      const std::function<bool(const Node*)>& cf_consider_fn = nullptr,
-      bool cf_disable_memory_output_type_check = false);
+      const std::function<bool(const Node*)>& cf_consider_fn = nullptr);
 
   const OptimizerOptions& options() { return opts_; }
 
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
index da0e359cf8abdd93dc05256c6edd94d613ef7355..f0656ff53332d7dd4f21d9d874846c16fb669681 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h"
 
 #include <algorithm>
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
 #include "tensorflow/core/common_runtime/collective_rma_local.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -217,7 +218,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
             << " num_devices_per_worker=" << num_devices_per_worker;
     int total_num_devices = num_workers * num_devices_per_worker;
     device_type_ = device_type;
-    std::vector<Device*> local_devices;
+    std::vector<std::unique_ptr<Device>> local_devices;
     SessionOptions sess_opts;
     sess_opts.env = Env::Default();
     Bytes mem_limit(4 << 20);
@@ -227,7 +228,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
         if (device_type == DEVICE_CPU) {
           string dev_name = strings::StrCat("/job:worker/replica:0/task:", wi,
                                             "/device:CPU:", di);
-          local_devices.push_back(new ThreadPoolDevice(
+          local_devices.push_back(absl::make_unique<ThreadPoolDevice>(
               sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
         } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
           int dev_idx = (wi * num_devices_per_worker) + di;
@@ -235,7 +236,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
             LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
                          "than one ring node.";
           } else {
-            local_devices.push_back(gpu_devices_[dev_idx]);
+            local_devices.push_back(std::move(gpu_devices_[dev_idx]));
           }
         } else {
           LOG(FATAL) << "Unsupported device_type " << device_type;
@@ -243,13 +244,14 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
       }
     }
     if (!dev_mgr_ || device_type == DEVICE_CPU) {
-      dev_mgr_.reset(new DeviceMgr(local_devices));
+      dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
     }
+    if (!gpu_ring_order_) gpu_ring_order_.reset(new string());
     dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
     rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
                            fail_after);
-    col_exec_ = new BaseCollectiveExecutor(&col_exec_mgr_, rma_, kStepId,
-                                           dev_mgr_.get());
+    col_exec_ = new BaseCollectiveExecutor(
+        &col_exec_mgr_, rma_, kStepId, dev_mgr_.get(), gpu_ring_order_.get());
     col_params_.name = "test_collective";
     col_params_.instance.data_type = dtype;
     static const int kGroupKey = 6;
@@ -713,8 +715,9 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
   std::unique_ptr<DeviceResolverLocal> dev_resolver_;
   std::vector<DeviceInstance*> instances_;
   CollectiveParams col_params_;
-  std::vector<tensorflow::Device*> gpu_devices_;
+  std::vector<std::unique_ptr<tensorflow::Device>> gpu_devices_;
   std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  std::unique_ptr<string> gpu_ring_order_;
   mutex mu_;
   int bcast_recv_counter_ GUARDED_BY(mu_) = 0;
   int bcast_send_counter_ GUARDED_BY(mu_) = 0;
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 1f585a8c24801e9139cab5cc650fce19dd97e05e..bdd6c0e87d4443873fa43789afad993399b23fd5 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -75,12 +75,12 @@ Benchmark::Benchmark(const string& device, Graph* g,
   const int graph_def_version = g->versions().producer();
 
   LocalExecutorParams params;
-  params.device = device_;
+  params.device = device_.get();
   params.function_library = nullptr;
   params.create_kernel = [this, graph_def_version](const NodeDef& ndef,
                                                    OpKernel** kernel) {
-    return CreateNonCachedKernel(device_, nullptr, ndef, graph_def_version,
-                                 kernel);
+    return CreateNonCachedKernel(device_.get(), nullptr, ndef,
+                                 graph_def_version, kernel);
   };
   params.delete_kernel = [](OpKernel* kernel) {
     DeleteNonCachedKernel(kernel);
@@ -107,7 +107,7 @@ Benchmark::~Benchmark() {
     // run kernel destructors that may attempt to access state borrowed from
     // `device_`, such as the resource manager.
     exec_.reset();
-    delete device_;
+    device_.reset();
     delete pool_;
   }
 }
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
index 555b43f655b49c76a0a01dd35d099248b4681300..b1557c50b0371d627e93c358073c3c17b681c80b 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -55,7 +55,7 @@ class Benchmark {
 
  private:
   thread::ThreadPool* pool_ = nullptr;
-  Device* device_ = nullptr;
+  std::unique_ptr<Device> device_ = nullptr;
   Rendezvous* rendez_ = nullptr;
   std::unique_ptr<Executor> exec_;
 
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index 873182371e097cf0929cd6886b3ec70dfb9b3ab2..f1fcca194e9ef56bf7b96e6c73717db7620b9812 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -18,11 +18,13 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/common_runtime/process_state.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -30,23 +32,52 @@ namespace tensorflow {
 
 /* static */
 bool LocalDevice::use_global_threadpool_ = true;
+mutex LocalDevice::global_tp_mu_;
+gtl::InlinedVector<LocalDevice::EigenThreadPoolInfo*, 4>
+    LocalDevice::global_tp_info_;
 
 struct LocalDevice::EigenThreadPoolInfo {
-  explicit EigenThreadPoolInfo(const SessionOptions& options) {
+  // Wrapper so we can provide the CPUAllocator to Eigen for use
+  // when ops need extra tmp memory.
+  class EigenAllocator : public Eigen::Allocator {
+   public:
+    explicit EigenAllocator(tensorflow::Allocator* a) : allocator_(a) {}
+    void* allocate(size_t num_bytes) const override {
+      return allocator_->AllocateRaw(64, num_bytes);
+    }
+    void deallocate(void* buffer) const override {
+      allocator_->DeallocateRaw(buffer);
+    }
+    tensorflow::Allocator* allocator_;
+  };
+
+  explicit EigenThreadPoolInfo(const SessionOptions& options, int numa_node,
+                               Allocator* allocator) {
     int32 intra_op_parallelism_threads =
         options.config.intra_op_parallelism_threads();
     if (intra_op_parallelism_threads == 0) {
       intra_op_parallelism_threads = port::NumSchedulableCPUs();
+      if (numa_node != port::kNUMANoAffinity) {
+        // Assume that CPUs are equally distributed over available NUMA nodes.
+        // This may not be true, but there isn't currently a better way of
+        // determining the number of CPUs specific to the requested node.
+        intra_op_parallelism_threads /= port::NUMANumNodes();
+      }
     }
-    VLOG(1) << "Local device intra op parallelism threads: "
-            << intra_op_parallelism_threads;
+    ThreadOptions thread_opts;
+    thread_opts.numa_node = numa_node;
     eigen_worker_threads_.num_threads = intra_op_parallelism_threads;
     eigen_worker_threads_.workers = new thread::ThreadPool(
-        options.env, "Eigen", intra_op_parallelism_threads);
+        options.env, thread_opts, strings::StrCat("numa_", numa_node, "_Eigen"),
+        intra_op_parallelism_threads);
     eigen_threadpool_wrapper_.reset(
         new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
+    if (allocator) {
+      eigen_allocator_.reset(new EigenAllocator(allocator));
+    }
     eigen_device_.reset(new Eigen::ThreadPoolDevice(
-        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
+        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads,
+        eigen_allocator_.get()));
   }
 
   ~EigenThreadPoolInfo() {
@@ -58,6 +89,7 @@ struct LocalDevice::EigenThreadPoolInfo {
   DeviceBase::CpuWorkerThreads eigen_worker_threads_;
   std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
   std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
+  std::unique_ptr<EigenAllocator> eigen_allocator_;
 };
 
 LocalDevice::LocalDevice(const SessionOptions& options,
@@ -68,15 +100,34 @@ LocalDevice::LocalDevice(const SessionOptions& options,
   port::InfoAboutUnusedCPUFeatures();
   LocalDevice::EigenThreadPoolInfo* tp_info;
   if (use_global_threadpool_) {
-    // All ThreadPoolDevices in the process will use this single fixed
-    // sized threadpool for numerical computations.
-    static LocalDevice::EigenThreadPoolInfo* global_tp_info =
-        new LocalDevice::EigenThreadPoolInfo(options);
-    tp_info = global_tp_info;
+    mutex_lock l(global_tp_mu_);
+    if (options.config.experimental().use_numa_affinity()) {
+      int numa_node = attributes.locality().numa_node();
+      int num_numa_nodes = port::NUMANumNodes();
+      DCHECK_LT(numa_node, num_numa_nodes);
+      Allocator* numa_allocator =
+          ProcessState::singleton()->GetCPUAllocator(numa_node);
+      while (numa_node >= global_tp_info_.size()) {
+        global_tp_info_.push_back(nullptr);
+      }
+      if (!global_tp_info_[numa_node]) {
+        global_tp_info_[numa_node] = new LocalDevice::EigenThreadPoolInfo(
+            options, numa_node, numa_allocator);
+      }
+      tp_info = global_tp_info_[numa_node];
+    } else {
+      if (global_tp_info_.empty()) {
+        global_tp_info_.push_back(new LocalDevice::EigenThreadPoolInfo(
+            options, port::kNUMANoAffinity, nullptr));
+      }
+      tp_info = global_tp_info_[0];
+    }
   } else {
     // Each LocalDevice owns a separate ThreadPoolDevice for numerical
     // computations.
-    owned_tp_info_.reset(new LocalDevice::EigenThreadPoolInfo(options));
+    // TODO(tucker): NUMA for these too?
+    owned_tp_info_.reset(new LocalDevice::EigenThreadPoolInfo(
+        options, port::kNUMANoAffinity, nullptr));
     tp_info = owned_tp_info_.get();
   }
   set_tensorflow_cpu_worker_threads(&tp_info->eigen_worker_threads_);
diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h
index 226f121bf32e0259d13dca633627174d5cdab917..f305c212c5a331be7992188d2b2e4c323ab6d403 100644
--- a/tensorflow/core/common_runtime/local_device.h
+++ b/tensorflow/core/common_runtime/local_device.h
@@ -47,6 +47,13 @@ class LocalDevice : public Device {
   struct EigenThreadPoolInfo;
   std::unique_ptr<EigenThreadPoolInfo> owned_tp_info_;
 
+  // All ThreadPoolDevices in the process associated with the same
+  // NUMA node will share a single fixed sized threadpool for numerical
+  // computations.
+  static mutex global_tp_mu_;
+  static gtl::InlinedVector<EigenThreadPoolInfo*, 4> global_tp_info_
+      GUARDED_BY(global_tp_mu_);
+
   friend class test::Benchmark;
 
   TF_DISALLOW_COPY_AND_ASSIGN(LocalDevice);
diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
index a02084f223f9e14e4d0ce0d27437f1e71c726bec..9738006f5ca9eb821439a9ad507aec3db434946c 100644
--- a/tensorflow/core/common_runtime/lower_if_op.cc
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -22,10 +22,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-// TODO(jpienaar): Consider making it a public attribute.
-const char* const LowerIfOpPass::kLowerUsingSwitchMergeAttr =
-    "_lower_using_switch_merge";
-
 namespace {
 
 using NodeOut = NodeBuilder::NodeOut;
@@ -80,7 +76,7 @@ class CondBuilder {
   // The identity node with the same outputs as the original If op.
   Node* lowered_if_output_;
   // The predicate of the conditional.
-  Node* pred_;
+  OutputTensor pred_;
   // Node corresponding to pivot_f branch of predicate switch which is
   // the pivot node that dominates all nodes in the false/else branch.
   Node* pivot_f_;
@@ -106,7 +102,9 @@ CondBuilder::CondBuilder(Node* if_op, const string& then_fn_name,
       name_(if_op->name()),
       then_call_builder_(NewName("then"), then_fn_name, graph->op_registry()),
       else_call_builder_(NewName("else"), else_fn_name, graph->op_registry()) {
-  TF_CHECK_OK(if_op_->input_node(0, &pred_));
+  TF_CHECK_OK(if_op_->input_tensor(0, &pred_));
+  then_call_builder_.Device(if_op_->requested_device());
+  else_call_builder_.Device(if_op_->requested_device());
 }
 
 Status CondBuilder::CreatePivotNodes() {
@@ -115,17 +113,20 @@ Status CondBuilder::CreatePivotNodes() {
   Node* switch_pred;
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName("switch_pred"), "Switch", graph_->op_registry())
-          .Input(NodeOut(pred_, 0))
-          .Input(NodeOut(pred_, 0))
+          .Input(NodeOut(pred_))
+          .Input(NodeOut(pred_))
+          .Device(if_op_->requested_device())
           .Finalize(graph_, &switch_pred));
   control_predecessor_ = switch_pred;
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName("pivot_f"), "Identity", graph_->op_registry())
           .Input(switch_pred, kElseBranch)
+          .Device(if_op_->requested_device())
           .Finalize(graph_, &pivot_f_));
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName("pivot_t"), "Identity", graph_->op_registry())
           .Input(switch_pred, kThenBranch)
+          .Device(if_op_->requested_device())
           .Finalize(graph_, &pivot_t_));
   return Status::OK();
 }
@@ -139,7 +140,8 @@ Status CondBuilder::AddInput(Node* src, int src_output) {
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName(src->name()), "Switch", graph_->op_registry())
           .Input(src, src_output)
-          .Input(pred_, 0)
+          .Input(pred_)
+          .Device(if_op_->requested_device())
           .Finalize(graph_, &input));
   then_call_builder_.Input(input, kThenBranch);
   else_call_builder_.Input(input, kElseBranch);
@@ -178,6 +180,7 @@ Status CondBuilder::AddOutputs() {
     TF_RETURN_IF_ERROR(
         NodeBuilder(graph_->NewName("merge"), "Merge", graph_->op_registry())
             .Input({NodeOut(then_call_node_, i), NodeOut(else_call_node_, i)})
+            .Device(if_op_->requested_device())
             .Finalize(graph_, &merges[i]));
     outputs_[i] = NodeOut(merges[i], 0);
   }
@@ -218,7 +221,7 @@ Status InlineCallInGraph(Node* n, const FunctionLibraryDefinition& flib,
 Status CondBuilder::BuildLoweredIfOutput() {
   // Build the identity node output.
   NodeBuilder ib(name_, "IdentityN");
-  ib.Input(outputs_);
+  ib.Input(outputs_).Device(if_op_->requested_device());
   return ib.Finalize(graph_, &lowered_if_output_);
 }
 
@@ -230,45 +233,7 @@ Status CondBuilder::InlineCallNodes() {
 
 }  // namespace
 
-Status LowerIfOpPass::Run(const GraphOptimizationPassOptions& options) {
-  if (options.partition_graphs != nullptr) {
-    return errors::Internal(
-        "Lowering If op should happen before partitioning.");
-  }
-  if (options.graph == nullptr) {
-    return Status::OK();
-  }
-
-  Graph* g = options.graph->get();
-  if (g == nullptr) {
-    return errors::Internal("Lowering If op requires a graph to be available.");
-  }
-
-  FunctionLibraryDefinition* flib = options.flib_def;
-  if (flib == nullptr) {
-    return errors::Internal(
-        "Lowering If op requires a FunctionLibraryDefinition to be available.");
-  }
-
-  // Match all the nodes that need to be rewritten.
-  gtl::InlinedVector<Node*, 2> matches;
-  for (Node* n : g->op_nodes()) {
-    if (n->type_string() == "If") {
-      // Only rewrite if the If op is marked as needing to be lowered.
-      bool match;
-      Status s = GetNodeAttr(n->attrs(), kLowerUsingSwitchMergeAttr, &match);
-      if (s.ok() && match) matches.push_back(n);
-    }
-  }
-  for (Node* n : matches) {
-    TF_RETURN_IF_ERROR(RewriteNode(n, *flib, g));
-  }
-  return Status::OK();
-}
-
-Status LowerIfOpPass::RewriteNode(Node* n,
-                                  const FunctionLibraryDefinition& flib,
-                                  Graph* g) {
+Status RewriteIfNode(Node* n, Graph* g, const FunctionLibraryDefinition& flib) {
   const AttrValue* then_attr = n->attrs().Find("then_branch");
   if (then_attr == nullptr) {
     return errors::InvalidArgument("Then branch function missing");
@@ -289,7 +254,4 @@ Status LowerIfOpPass::RewriteNode(Node* n,
   return Status::OK();
 }
 
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 0,
-                      LowerIfOpPass);
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/lower_if_op.h b/tensorflow/core/common_runtime/lower_if_op.h
index 5ab1123e3f061c7f49d62849e1a465e627a1fb5f..fc52e597be1387c35b082c04d3f71547327a66fd 100644
--- a/tensorflow/core/common_runtime/lower_if_op.h
+++ b/tensorflow/core/common_runtime/lower_if_op.h
@@ -21,18 +21,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Rewrite If ops to use switch and merge nodes instead.
-class LowerIfOpPass : public GraphOptimizationPass {
- public:
-  static const char* const kLowerUsingSwitchMergeAttr;
-
-  Status Run(const GraphOptimizationPassOptions& options) override;
-
- private:
-  // Rewrite the given If node `n` in graph `g` to use the switch-merge
-  // form. `flib` should contain the branch functions referenced by `n`.
-  Status RewriteNode(Node* n, const FunctionLibraryDefinition& flib, Graph* g);
-};
+// Replaces If node `n` with its lowered form that uses Switch and Merge nodes.
+Status RewriteIfNode(Node* n, Graph* g, const FunctionLibraryDefinition& flib);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/lower_if_op_test.cc b/tensorflow/core/common_runtime/lower_if_op_test.cc
index 044a355d0657a6653dc6e7538a8ac91e8da55313..5765e3e367f3e0bb61f087e36ea84d4e9c8b4f15 100644
--- a/tensorflow/core/common_runtime/lower_if_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_if_op_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/common_runtime/lower_if_op.h"
+#include "tensorflow/core/common_runtime/lower_if_while.h"
 
 #include "tensorflow/cc/client/client_session.h"
 #include "tensorflow/cc/framework/ops.h"
@@ -40,7 +40,7 @@ Status Rewrite(std::unique_ptr<Graph>* graph) {
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = graph;
   opt_options.flib_def = &flib_def;
-  LowerIfOpPass pass;
+  LowerIfWhilePass pass;
   return pass.Run(opt_options);
 }
 
@@ -51,7 +51,6 @@ TEST(LowerIfOpTest, Simple) {
   FunctionDefLibrary f_lib_proto;
   *(f_lib_proto.add_function()) = test::function::XTimesTwo();
   *(f_lib_proto.add_function()) = test::function::XTimesFour();
-  FunctionLibraryDefinition f_lib(OpRegistry::Global(), f_lib_proto);
 
   // Construct simple conditional that switches on `pred` and operates only on
   // single input `A`.
@@ -65,12 +64,12 @@ TEST(LowerIfOpTest, Simple) {
   tb.mutable_func()->set_name("XTimesTwo");
   AttrValue eb;
   eb.mutable_func()->set_name("XTimesFour");
-  TF_ASSERT_OK(NodeBuilder("if", "If", &f_lib)
+  TF_ASSERT_OK(NodeBuilder("if", "If", &root.graph()->flib_def())
                    .Input(pred.node())
                    .Input(inputs)
                    .Attr("then_branch", tb)
                    .Attr("else_branch", eb)
-                   .Attr(LowerIfOpPass::kLowerUsingSwitchMergeAttr, true)
+                   .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
                    .Attr("Tout", {DT_INT32})
                    .Finalize(root.graph(), &written_if));
   TF_ASSERT_OK(root.DoShapeInference(written_if));
diff --git a/tensorflow/core/common_runtime/lower_if_while.cc b/tensorflow/core/common_runtime/lower_if_while.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce8d99d6f1e9939d98c7f33f6778eadbb4e352a8
--- /dev/null
+++ b/tensorflow/core/common_runtime/lower_if_while.cc
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/lower_if_while.h"
+#include "tensorflow/core/common_runtime/lower_if_op.h"
+#include "tensorflow/core/common_runtime/lower_while_op.h"
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+
+namespace tensorflow {
+
+#if defined(_MSC_VER)
+constexpr char* LowerIfWhilePass::kLowerUsingSwitchMergeAttr;
+#else
+constexpr char LowerIfWhilePass::kLowerUsingSwitchMergeAttr[];
+#endif
+
+namespace {
+
+bool HasLoweringAttr(const AttrSlice& attrs) {
+  bool match;
+  Status s =
+      GetNodeAttr(attrs, LowerIfWhilePass::kLowerUsingSwitchMergeAttr, &match);
+  return s.ok() && match;
+}
+
+}  // namespace
+
+Status LowerIfWhilePass::Run(const GraphOptimizationPassOptions& options) {
+  if (options.partition_graphs != nullptr) {
+    return errors::Internal(
+        "Lowering If/While ops should happen before partitioning.");
+  }
+  if (options.graph == nullptr) {
+    return Status::OK();
+  }
+
+  Graph* g = options.graph->get();
+  if (g == nullptr) {
+    return errors::Internal(
+        "Lowering While op requires a graph to be available.");
+  }
+
+  FunctionLibraryDefinition* flib = options.flib_def;
+  if (flib == nullptr) {
+    return errors::Internal(
+        "Lowering If op requires a FunctionLibraryDefinition to be available.");
+  }
+
+  // Lower all If and While ops that have the `kLowerUsingSwitchMergeAttr` attr
+  // set.
+  // We start at `i` = 2 to skip the source and sink nodes.
+  // Note that `g->num_node_ids()` may change in the for body if a matching If
+  // or While node is lowered. Since new graph nodes are always added to the
+  // end of the list of nodes it is ensured that nested If/While nodes will be
+  // lowered as well.
+  for (int i = 2; i < g->num_node_ids(); ++i) {
+    Node* n = g->FindNodeId(i);
+    if (n == nullptr) continue;  // deleted node
+    if (HasLoweringAttr(n->attrs())) {
+      if (n->type_string() == "If") {
+        TF_RETURN_IF_ERROR(RewriteIfNode(n, g, *flib));
+      } else if (n->type_string() == "While") {
+        TF_RETURN_IF_ERROR(RewriteWhileNode(n, g, *flib));
+      } else {
+        return errors::Internal(
+            "Node ", FormatNodeForError(*n), " of type ", n->type_string(),
+            " has '", LowerIfWhilePass::kLowerUsingSwitchMergeAttr,
+            "' attr set but it does not support lowering.\n");
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 0,
+                      LowerIfWhilePass);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/lower_if_while.h b/tensorflow/core/common_runtime/lower_if_while.h
new file mode 100644
index 0000000000000000000000000000000000000000..efa3945bca4b90373e223f50200fc2366b897985
--- /dev/null
+++ b/tensorflow/core/common_runtime/lower_if_while.h
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_IF_WHILE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_IF_WHILE_H_
+
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Rewrite If and While ops to use lower level control flow primitives instead.
+class LowerIfWhilePass : public GraphOptimizationPass {
+ public:
+  Status Run(const GraphOptimizationPassOptions& options) override;
+#if defined(_MSC_VER)
+  static constexpr char* kLowerUsingSwitchMergeAttr =
+#else
+  static constexpr char kLowerUsingSwitchMergeAttr[] =
+#endif
+      "_lower_using_switch_merge";
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_LOWER_IF_WHILE_H_
diff --git a/tensorflow/core/common_runtime/lower_if_while_test.cc b/tensorflow/core/common_runtime/lower_if_while_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07bcecf168856b4222d95900e06a91bea78a0bd6
--- /dev/null
+++ b/tensorflow/core/common_runtime/lower_if_while_test.cc
@@ -0,0 +1,337 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/lower_if_while.h"
+
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+typedef FunctionDefHelper FDH;
+
+static void AssertHasSubstr(StringPiece s, StringPiece expected) {
+  ASSERT_TRUE(str_util::StrContains(s, expected))
+      << "'" << s << "' does not contain '" << expected << "'";
+}
+
+Status Rewrite(std::unique_ptr<Graph>* graph) {
+  FunctionLibraryDefinition flib_def((*graph)->flib_def());
+  GraphOptimizationPassOptions opt_options;
+  opt_options.graph = graph;
+  opt_options.flib_def = &flib_def;
+  LowerIfWhilePass pass;
+  return pass.Run(opt_options);
+}
+
+// (counter:int32, pred:bool, x:int32) -> counter < N
+FunctionDef WhileWithIfCond(int32 N) {
+  const Tensor kN = test::AsScalar<int32>(N);
+  return FDH::Define(
+      // Name
+      "WhileWithIfCond",
+      // Args
+      {"counter: int32", "pred: bool", "x: int32"},
+      // Return values
+      {"z: bool"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"N"}, "Const", {}, {{"value", kN}, {"dtype", DT_INT32}}},
+          {{"z"}, "Less", {"counter", "N"}, {{"T", DT_INT32}}},
+      });
+}
+
+// (counter:int32, pred:bool, x:int32) ->
+//   (counter+1, pred, if pred: x * 2 else: x * 4)
+FunctionDef WhileWithIfBody() {
+  NameAttrList then_func;
+  then_func.set_name("XTimesTwo");
+  NameAttrList else_func;
+  else_func.set_name("XTimesFour");
+  const Tensor kOne = test::AsScalar<int32>(1);
+  std::vector<DataType> input_types = {DT_INT32};
+  std::vector<DataType> output_types = {DT_INT32};
+  return FDH::Define(
+      // Name
+      "WhileWithIfBody",
+      // Args
+      {"counter: int32", "pred: bool", "x: int32"},
+      // Return values
+      {"updated_counter: int32", "pred: bool", "if: int32"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"if"},
+           "If",
+           {"pred", "x"},
+           {{"then_branch", then_func},
+            {"else_branch", else_func},
+            {"Tcond", DT_BOOL},
+            {"Tin", input_types},
+            {"Tout", output_types},
+            {LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true}}},
+          {{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_INT32}}},
+          {{"updated_counter"}, "Add", {"counter", "one"}, {{"T", DT_INT32}}},
+      });
+}
+
+TEST(LowerIfWhileTest, CondInWhile) {
+  // Tests the value of `a` for different values of args after the following
+  // program:
+  //
+  // Args:
+  // counter = Arg(type = int32)
+  // pred = Arg(type = bool)
+  // a = Arg(type = int32)
+  // N = 3
+  // while (counter < N) {
+  //   counter += 1;
+  //   if (pred) {
+  //     a *= 2;
+  //   } else {
+  //     a *= 4;
+  //   }
+  // }
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  FunctionDefLibrary f_lib_proto;
+  // Cond functions.
+  *f_lib_proto.add_function() = test::function::XTimesTwo();
+  *f_lib_proto.add_function() = test::function::XTimesFour();
+  // While functions.
+  *f_lib_proto.add_function() = WhileWithIfCond(3);
+  *f_lib_proto.add_function() = WhileWithIfBody();
+
+  Scope root = Scope::NewRootScope().ExitOnError();
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto));
+  auto counter = ops::_Arg(root.WithOpName("counter"), DT_INT32, 0);
+  auto pred = ops::_Arg(root.WithOpName("pred"), DT_BOOL, 0);
+  auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+  std::vector<NodeBuilder::NodeOut> inputs(
+      {NodeBuilder::NodeOut(counter.node()), NodeBuilder::NodeOut(pred.node()),
+       NodeBuilder::NodeOut(a.node())});
+  Node* while_node;
+  AttrValue cond_func;
+  cond_func.mutable_func()->set_name("WhileWithIfCond");
+  AttrValue body_func;
+  body_func.mutable_func()->set_name("WhileWithIfBody");
+  TF_ASSERT_OK(NodeBuilder("while", "While", &root.graph()->flib_def())
+                   .Input(inputs)
+                   .Attr("T", {DT_INT32, DT_BOOL, DT_INT32})
+                   .Attr("cond", cond_func)
+                   .Attr("body", body_func)
+                   .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
+                   .Finalize(root.graph(), &while_node));
+  TF_ASSERT_OK(root.DoShapeInference(while_node));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(Rewrite(&graph));
+
+  // Lowered graph has no While and If ops.
+  for (const auto* op : graph->op_nodes()) {
+    ASSERT_NE(op->type_string(), "While");
+    ASSERT_NE(op->type_string(), "If");
+  }
+
+  // Verify execution.
+  ClientSession session(root);
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(counter.node()), Input::Initializer(0));
+    feeds.emplace(Output(pred.node()), Input::Initializer(true));
+    feeds.emplace(Output(a.node()), Input::Initializer(1));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(while_node, 2)}, &out_tensors));
+    ASSERT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 8);
+  }
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(counter.node()), Input::Initializer(0));
+    feeds.emplace(Output(pred.node()), Input::Initializer(false));
+    feeds.emplace(Output(a.node()), Input::Initializer(1));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(while_node, 2)}, &out_tensors));
+    ASSERT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 64);  // a
+  }
+}
+
+// x:int32 ->
+//    while x <= N:
+//      x*=2;
+//    return x;
+FunctionDef IfWithWhileThen() {
+  NameAttrList cond_func;
+  cond_func.set_name("LessThanOrEqualToN");
+  NameAttrList body_func;
+  body_func.set_name("XTimesTwo");
+  std::vector<DataType> input_and_output_types = {DT_INT32};
+  std::vector<TensorShape> output_shapes = {TensorShape()};
+  return FDH::Define(
+      // Name
+      "IfWithWhileThen",
+      // Args
+      {"x: int32"},
+      // Return values
+      {"while: int32"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"while"},
+           "While",
+           {"x"},
+           {{"cond", cond_func},
+            {"body", body_func},
+            {"T", input_and_output_types},
+            {"output_shapes", output_shapes},
+            {LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true}}},
+      });
+}
+
+TEST(LowerIfWhileTest, WhileInCond) {
+  // Tests the value of `a` for different values of args after the following
+  // program:
+  //
+  // Args:
+  // pred = Arg(type = bool)
+  // a = Arg(type = int32)
+  // N = 8
+  // if (pred) {
+  //   while (a <= N) {
+  //     a *= 2;
+  //   }
+  // }
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  // Add test functions for cond and body.
+  FunctionDefLibrary f_lib_proto;
+  *f_lib_proto.add_function() = test::function::XTimesTwo();
+  *f_lib_proto.add_function() = test::function::LessThanOrEqualToN(8);
+  *f_lib_proto.add_function() = IfWithWhileThen();
+
+  Scope root = Scope::NewRootScope().ExitOnError();
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto));
+  auto pred = ops::_Arg(root.WithOpName("pred"), DT_BOOL, 0);
+  auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 1);
+  std::vector<NodeBuilder::NodeOut> inputs({NodeBuilder::NodeOut(a.node())});
+  AttrValue then_func;
+  then_func.mutable_func()->set_name("IfWithWhileThen");
+  AttrValue else_func;
+  else_func.mutable_func()->set_name("XTimesTwo");
+  Node* if_node;
+  TF_ASSERT_OK(NodeBuilder("if", "If", &root.graph()->flib_def())
+                   .Input(pred.node())
+                   .Input(inputs)
+                   .Attr("then_branch", then_func)
+                   .Attr("else_branch", else_func)
+                   .Attr("Tout", {DT_INT32})
+                   .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
+                   .Finalize(root.graph(), &if_node));
+  TF_ASSERT_OK(root.DoShapeInference(if_node));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  // The input graph has no lower level control flow primitives.
+  int node_called_if_count = 0;
+  for (const auto* op : graph->op_nodes()) {
+    ASSERT_FALSE(op->IsEnter());
+    ASSERT_FALSE(op->IsExit());
+    ASSERT_FALSE(op->IsSwitch());
+    ASSERT_FALSE(op->IsMerge());
+    ASSERT_FALSE(op->IsNextIteration());
+    ASSERT_FALSE(op->IsLoopCond());
+    if (op->name() == "if") {
+      node_called_if_count++;
+    }
+  }
+  ASSERT_EQ(node_called_if_count, 1);
+
+  TF_ASSERT_OK(Rewrite(&graph));
+
+  node_called_if_count = 0;
+  for (const auto* op : graph->op_nodes()) {
+    if (op->name() == "if") {
+      node_called_if_count++;
+    }
+    ASSERT_NE(op->type_string(), "While");
+    ASSERT_NE(op->type_string(), "If");
+  }
+  // One node per loop input.
+  ASSERT_EQ(node_called_if_count, 1);
+
+  // Verify execution.
+  ClientSession session(root);
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(pred.node()), Input::Initializer(true));
+    feeds.emplace(Output(a.node()), Input::Initializer(1));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(if_node)}, &out_tensors));
+    ASSERT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 16);
+  }
+  {
+    ClientSession::FeedType feeds;
+    feeds.emplace(Output(pred.node()), Input::Initializer(false));
+    feeds.emplace(Output(a.node()), Input::Initializer(1));
+    std::vector<Tensor> out_tensors;
+    TF_ASSERT_OK(session.Run(feeds, {Output(if_node)}, &out_tensors));
+    ASSERT_EQ(out_tensors.size(), 1);
+    EXPECT_EQ(out_tensors[0].scalar<int>()(), 2);
+  }
+}
+
+TEST(LowerIfWhileTest, RaisesWhenLoweringUnhandledOpType) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Node* const_node;
+  Tensor const_val(DT_INT32, TensorShape({}));
+  const_val.scalar<int32>()() = 1;
+  TF_ASSERT_OK(NodeBuilder("const", "Const")
+                   .Attr("value", const_val)
+                   .Attr("dtype", const_val.dtype())
+                   .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
+                   .Finalize(root.graph(), &const_node));
+  TF_ASSERT_OK(root.DoShapeInference(const_node));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  Status s = Rewrite(&graph);
+  ASSERT_EQ(s.code(), error::INTERNAL);
+  AssertHasSubstr(s.error_message(), "does not support lowering");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc
index 1f5da133e9ebba4b8487865c1c48b570a02d81b2..6f9921a7968b9cad4bc96b21600fdb026636bc2a 100644
--- a/tensorflow/core/common_runtime/lower_while_op.cc
+++ b/tensorflow/core/common_runtime/lower_while_op.cc
@@ -53,8 +53,9 @@ using NodeOut = NodeBuilder::NodeOut;
 class LowerWhileHelper {
  public:
   static Status Run(Node* while_op, const string& cond_fn_name,
-                    const string& body_fn_name, Graph* graph) {
-    LowerWhileHelper helper(while_op, cond_fn_name, body_fn_name, graph);
+                    const string& body_fn_name, Graph* graph,
+                    const FunctionLibraryDefinition& flib) {
+    LowerWhileHelper helper(while_op, cond_fn_name, body_fn_name, graph, flib);
     return helper.RunInternal();
   }
 
@@ -63,7 +64,8 @@ class LowerWhileHelper {
   // and body functions named `cond_fn_name` and `body_fn_name` respectively in
   // the given graph.
   LowerWhileHelper(Node* while_op, const string& cond_fn_name,
-                   const string& body_fn_name, Graph* graph);
+                   const string& body_fn_name, Graph* graph,
+                   const FunctionLibraryDefinition& flib);
 
   Status RunInternal();
 
@@ -127,6 +129,7 @@ class LowerWhileHelper {
   // The IdentityN node with the same outputs as the original While op.
   Node* lowered_while_output_;
   Graph* graph_;
+  const FunctionLibraryDefinition& flib_;
   // Name of the `while_op_`.
   string name_;
 
@@ -143,9 +146,11 @@ class LowerWhileHelper {
 };
 
 LowerWhileHelper::LowerWhileHelper(Node* while_op, const string& cond_fn_name,
-                                   const string& body_fn_name, Graph* graph)
+                                   const string& body_fn_name, Graph* graph,
+                                   const FunctionLibraryDefinition& flib)
     : while_op_(while_op),
       graph_(graph),
+      flib_(flib),
       name_(while_op->name()),
       cond_call_builder_(NewName("cond"), cond_fn_name, graph->op_registry()),
       body_call_builder_(NewName("body"), body_fn_name, graph->op_registry()),
@@ -346,8 +351,8 @@ string LowerWhileHelper::NewName(const string& infix) {
   return graph_->NewName(strings::StrCat(name_, "/", infix));
 }
 
-Status InlineCallInGraph(Node* n, Graph* g) {
-  const auto& lib = g->flib_def();
+Status InlineCallInGraph(Node* n, Graph* g,
+                         const FunctionLibraryDefinition& lib) {
   const FunctionDef* fdef = lib.Find(n->type_string());
   CHECK(fdef != nullptr);
   FunctionBody* fbody;
@@ -365,46 +370,15 @@ Status InlineCallInGraph(Node* n, Graph* g) {
 }
 
 Status LowerWhileHelper::InlineCallNodes() {
-  TF_RETURN_IF_ERROR(InlineCallInGraph(cond_call_node_, graph_));
-  TF_RETURN_IF_ERROR(InlineCallInGraph(body_call_node_, graph_));
+  TF_RETURN_IF_ERROR(InlineCallInGraph(cond_call_node_, graph_, flib_));
+  TF_RETURN_IF_ERROR(InlineCallInGraph(body_call_node_, graph_, flib_));
   return Status::OK();
 }
 
 }  // namespace
 
-Status LowerWhileOpPass::Run(const GraphOptimizationPassOptions& options) {
-  if (options.partition_graphs != nullptr) {
-    return errors::Internal(
-        "Lowering While op should happen before partitioning.");
-  }
-  if (options.graph == nullptr) {
-    return Status::OK();
-  }
-
-  Graph* g = options.graph->get();
-  if (g == nullptr) {
-    return errors::Internal(
-        "Lowering While op requires a graph to be available.");
-  }
-
-  // Match all the nodes that need to be rewritten.
-  gtl::InlinedVector<Node*, 2> matches;
-  for (Node* n : g->op_nodes()) {
-    if (n->type_string() == "While") {
-      // Only rewrite if the While op is marked as needing to be lowered.
-      bool match;
-      Status s = GetNodeAttr(n->attrs(),
-                             LowerIfOpPass::kLowerUsingSwitchMergeAttr, &match);
-      if (s.ok() && match) matches.push_back(n);
-    }
-  }
-  for (Node* n : matches) {
-    TF_RETURN_IF_ERROR(RewriteNode(n, g));
-  }
-  return Status::OK();
-}
-
-Status LowerWhileOpPass::RewriteNode(Node* n, Graph* g) {
+Status RewriteWhileNode(Node* n, Graph* g,
+                        const FunctionLibraryDefinition& flib) {
   const AttrValue* cond_attr = n->attrs().Find("cond");
   if (cond_attr == nullptr) {
     return errors::InvalidArgument("While cond function missing");
@@ -415,13 +389,10 @@ Status LowerWhileOpPass::RewriteNode(Node* n, Graph* g) {
   }
 
   TF_RETURN_IF_ERROR(LowerWhileHelper::Run(n, cond_attr->func().name(),
-                                           body_attr->func().name(), g));
+                                           body_attr->func().name(), g, flib));
   g->RemoveNode(n);
 
   return Status::OK();
 }
 
-REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 0,
-                      LowerWhileOpPass);
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/lower_while_op.h b/tensorflow/core/common_runtime/lower_while_op.h
index eadafbeb9105ca906533c5e741a897ee2b9377ff..4b640bafba57636518e29e4ccc970fd841ee41ce 100644
--- a/tensorflow/core/common_runtime/lower_while_op.h
+++ b/tensorflow/core/common_runtime/lower_while_op.h
@@ -21,16 +21,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Rewrite While ops to use lower level control flow primitives instead.
-class LowerWhileOpPass : public GraphOptimizationPass {
- public:
-  Status Run(const GraphOptimizationPassOptions& options) override;
-
- private:
-  // Rewrite the given While node `n` in graph `g` to use the lower level
-  // primitives Enter, Exit, Switch, Merge and NextIteration.
-  Status RewriteNode(Node* n, Graph* g);
-};
+// Replaces While node `n` with its lowered form that uses Enter, Exit, Switch,
+// Merge, NextIteration and LoopCond nodes.
+Status RewriteWhileNode(Node* n, Graph* g,
+                        const FunctionLibraryDefinition& flib);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/lower_while_op_test.cc b/tensorflow/core/common_runtime/lower_while_op_test.cc
index 27cbada004e646c1165398188c4df75ca010cf93..24fd4ed5bb5939e066fa5b8d75b9b9c3aaf5895a 100644
--- a/tensorflow/core/common_runtime/lower_while_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_while_op_test.cc
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/common_runtime/lower_while_op.h"
-#include "tensorflow/core/common_runtime/lower_if_op.h"
+#include "tensorflow/core/common_runtime/lower_if_while.h"
 
 #include "tensorflow/cc/client/client_session.h"
 #include "tensorflow/cc/framework/ops.h"
@@ -37,13 +36,11 @@ namespace tensorflow {
 namespace {
 
 Status Rewrite(std::unique_ptr<Graph>* graph) {
-  FunctionDefLibrary flib;
-  FunctionLibraryDefinition flib_def((*graph)->op_registry(), flib);
-
+  FunctionLibraryDefinition flib_def((*graph)->flib_def());
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = graph;
   opt_options.flib_def = &flib_def;
-  LowerWhileOpPass pass;
+  LowerIfWhilePass pass;
   return pass.Run(opt_options);
 }
 
@@ -54,7 +51,6 @@ TEST(LowerWhileOpTest, Simple) {
   FunctionDefLibrary f_lib_proto;
   *f_lib_proto.add_function() = test::function::XTimesTwo();
   *f_lib_proto.add_function() = test::function::LessThanOrEqualToN(8);
-  FunctionLibraryDefinition f_lib(OpRegistry::Global(), f_lib_proto);
 
   Scope root = Scope::NewRootScope().ExitOnError();
   TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto));
@@ -65,12 +61,12 @@ TEST(LowerWhileOpTest, Simple) {
   cond_func.mutable_func()->set_name("LessThanOrEqualToN");
   AttrValue body_func;
   body_func.mutable_func()->set_name("XTimesTwo");
-  TF_ASSERT_OK(NodeBuilder("while", "While", &f_lib)
+  TF_ASSERT_OK(NodeBuilder("while", "While", &root.graph()->flib_def())
                    .Input(inputs)
                    .Attr("T", {DT_INT32})
                    .Attr("cond", cond_func)
                    .Attr("body", body_func)
-                   .Attr(LowerIfOpPass::kLowerUsingSwitchMergeAttr, true)
+                   .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
                    .Finalize(root.graph(), &while_node));
   TF_ASSERT_OK(root.DoShapeInference(while_node));
   TF_ASSERT_OK(root.ToGraph(graph.get()));
@@ -154,7 +150,6 @@ TEST(LowerWhileOpTest, MultipleInputs) {
   FunctionDefLibrary f_lib_proto;
   *(f_lib_proto.add_function()) = test::function::XPlusOneXTimesY();
   *(f_lib_proto.add_function()) = test::function::XYXLessThanOrEqualToN(4);
-  FunctionLibraryDefinition f_lib(OpRegistry::Global(), f_lib_proto);
 
   Scope root = Scope::NewRootScope().ExitOnError();
   TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto));
@@ -167,12 +162,12 @@ TEST(LowerWhileOpTest, MultipleInputs) {
   cond_func.mutable_func()->set_name("XYXLessThanOrEqualToN");
   AttrValue body_func;
   body_func.mutable_func()->set_name("XPlusOneXTimesY");
-  TF_ASSERT_OK(NodeBuilder("while", "While", &f_lib)
+  TF_ASSERT_OK(NodeBuilder("while", "While", &root.graph()->flib_def())
                    .Input(inputs)
                    .Attr("T", {DT_INT32, DT_INT32})
                    .Attr("cond", cond_func)
                    .Attr("body", body_func)
-                   .Attr(LowerIfOpPass::kLowerUsingSwitchMergeAttr, true)
+                   .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
                    .Finalize(root.graph(), &while_node));
   TF_ASSERT_OK(root.DoShapeInference(while_node));
   TF_ASSERT_OK(root.ToGraph(graph.get()));
diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc
index 116750fbfd60f74ff49390de56f659308aa50f5c..f2534b7dc3bbfeda29682ec990bc8816056c1edc 100644
--- a/tensorflow/core/common_runtime/memory_types.cc
+++ b/tensorflow/core/common_runtime/memory_types.cc
@@ -96,11 +96,12 @@ Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g) {
         if (sm == dm) {
           return Status::OK();
         }
-        return errors::Internal(
-            "Memory type mismatch (", sm, " ", dm,
-            ") between :", e->src()->id(), ":", e->src_output(), " and ",
-            e->dst()->id(), ":", e->dst_input(), " : from ",
-            e->src()->DebugString(), " to ", e->dst()->DebugString());
+        return errors::Internal("Memory type mismatch (", sm, " ", dm,
+                                ") between :", e->src()->id(), ":",
+                                e->src_output(), " and ", e->dst()->id(), ":",
+                                e->dst_input(), " : from ",
+                                FormatNodeForError(*e->src()), " to ",
+                                FormatNodeForError(*e->dst()));
       });
 }
 
@@ -209,7 +210,7 @@ Status MemoryTypeForOutput(const DeviceType& device_type, const Graph* g,
                                         &inp_mvec, &out_mvec));
   if (out_mvec.size() <= index) {
     return errors::Internal("Trying to get the memory type for ", index,
-                            "'th output of node ", n->DebugString(),
+                            "'th output of node ", FormatNodeForError(*n),
                             " that has only ", out_mvec.size(), " outputs");
   }
   *memory_type = out_mvec[index];
diff --git a/tensorflow/core/common_runtime/metrics.cc b/tensorflow/core/common_runtime/metrics.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4c94ed7ec0cb1c5e8b341b75f1d075d30d6125a
--- /dev/null
+++ b/tensorflow/core/common_runtime/metrics.cc
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/metrics.h"
+#include "tensorflow/core/lib/monitoring/counter.h"
+
+namespace tensorflow {
+
+namespace {
+
+auto* graph_runs = monitoring::Counter<0>::New(
+    "/tensorflow/core/graph_runs",
+    "The number of graph executions used to collect "
+    "/tensorflow/core/graph_run_time_usecs");
+
+auto* graph_run_time_usecs = monitoring::Counter<0>::New(
+    "/tensorflow/core/graph_run_time_usecs",
+    "The total time spent on executing graphs in microseconds.");
+}  // namespace
+
+void UpdateGraphExecTime(const uint64 running_time_usecs) {
+  if (running_time_usecs > 0) {
+    graph_runs->GetCell()->IncrementBy(1);
+    graph_run_time_usecs->GetCell()->IncrementBy(running_time_usecs);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/metrics.h b/tensorflow/core/common_runtime/metrics.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3430c9f030998f118c1626e6bbed93dd316a525
--- /dev/null
+++ b/tensorflow/core/common_runtime/metrics.h
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+void UpdateGraphExecTime(const uint64 running_time_usecs);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
diff --git a/tensorflow/core/common_runtime/optimization_registry.cc b/tensorflow/core/common_runtime/optimization_registry.cc
index 6ac047295dce8f78016d8ce65ddebeb20c372531..9be540b0192416b6dfa636b054bd174bb8376eec 100644
--- a/tensorflow/core/common_runtime/optimization_registry.cc
+++ b/tensorflow/core/common_runtime/optimization_registry.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -39,6 +40,19 @@ Status OptimizationPassRegistry::RunGrouping(
         VLOG(1) << "Running optimization pass: " << pass->name();
         Status s = pass->Run(options);
         if (!s.ok()) return s;
+        if (VLOG_IS_ON(1)) {
+          DumpGraphToFile(
+              strings::StrCat("after_phase_", phase.first, "_", pass->name()),
+              **options.graph);
+          if (options.partition_graphs) {
+            for (auto& part : *options.partition_graphs) {
+              DumpGraphToFile(
+                  strings::StrCat("after_phase_", phase.first, "_",
+                                  pass->name(), "_partition_", part.first),
+                  *part.second);
+            }
+          }
+        }
       }
     }
   }
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 3b599954334fefa8c68d7bcb95c4cfc4d062c152..515c1971d9d5cb179b7b9764ff3462579e742dfc 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -30,6 +31,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/port.h"
 
 namespace tensorflow {
 
@@ -44,27 +47,51 @@ const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
 // returned list is sorted by preferred type (higher numeric type is preferred).
 std::vector<Device*> FilterSupportedDevices(
     const std::vector<Device*>& devices,
-    const DeviceTypeVector& supported_device_types) {
-  std::vector<Device*> filtered_devices;
-  for (const DeviceType& d : supported_device_types) {
+    const PrioritizedDeviceTypeVector& supported_device_types,
+    const Device* default_device) {
+  Device* filtered_default_device = nullptr;
+  std::vector<std::pair<Device*, int32>> prioritized_filtered_devices;
+  for (const auto& supported_device_type : supported_device_types) {
     for (Device* device : devices) {
-      if (DeviceType(device->attributes().device_type()) == d) {
-        filtered_devices.emplace_back(device);
+      if (DeviceType(device->attributes().device_type()) ==
+          supported_device_type.first) {
+        if (device == default_device) {
+          filtered_default_device = device;
+        } else {
+          prioritized_filtered_devices.emplace_back(
+              device, supported_device_type.second);
+        }
       }
     }
   }
 
-  auto device_sort = [](const Device* a, const Device* b) {
-    auto a_priority = DeviceSet::DeviceTypeOrder(DeviceType(a->device_type()));
-    auto b_priority = DeviceSet::DeviceTypeOrder(DeviceType(b->device_type()));
+  auto device_sort = [](const std::pair<Device*, int32>& a,
+                        const std::pair<Device*, int32>& b) {
+    if (a.second != b.second) {
+      return a.second > b.second;
+    }
+
+    auto a_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(a.first->device_type()));
+    auto b_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(b.first->device_type()));
     // First sort by prioritized device type (higher is preferred) and
     // then by device name (lexicographically).
     if (a_priority != b_priority) {
       return a_priority > b_priority;
     }
-    return StringPiece(a->name()) < StringPiece(b->name());
+    return StringPiece(a.first->name()) < StringPiece(b.first->name());
   };
-  std::sort(filtered_devices.begin(), filtered_devices.end(), device_sort);
+  std::sort(prioritized_filtered_devices.begin(),
+            prioritized_filtered_devices.end(), device_sort);
+
+  std::vector<Device*> filtered_devices;
+  if (filtered_default_device != nullptr) {
+    filtered_devices.emplace_back(filtered_default_device);
+  }
+  for (const auto& prioritized_filtered_device : prioritized_filtered_devices) {
+    filtered_devices.push_back(prioritized_filtered_device.first);
+  }
   return filtered_devices;
 }
 
@@ -99,11 +126,12 @@ std::vector<Device*> FilterSupportedDevices(
 class ColocationGraph {
  public:
   ColocationGraph(Graph* graph, const DeviceSet* device_set,
-                  bool allow_soft_placement)
+                  bool allow_soft_placement, const Device* default_device)
       : graph_(graph),
         device_set_(device_set),
         device_types_(device_set->PrioritizedDeviceTypeList()),
-        allow_soft_placement_(allow_soft_placement) {
+        allow_soft_placement_(allow_soft_placement),
+        default_device_(default_device) {
     members_.resize(graph->num_node_ids());
   }
 
@@ -314,7 +342,8 @@ class ColocationGraph {
         // Filter devices into those that are compatible with the root
         // node (and its children).
         devices = FilterSupportedDevices(
-            devices, members_[node_root].supported_device_types);
+            devices, members_[node_root].supported_device_types,
+            default_device_);
       }
 
       // Perform soft placement if allow_soft_placement_ is set.
@@ -329,7 +358,8 @@ class ColocationGraph {
         device_set_->FindMatchingDevices(soft_device_name, &devices);
         if (!devices.empty()) {
           devices = FilterSupportedDevices(
-              devices, members_[node_root].supported_device_types);
+              devices, members_[node_root].supported_device_types,
+              default_device_);
         }
       }
 
@@ -358,11 +388,20 @@ class ColocationGraph {
             }
             std::sort(device_names.begin(), device_names.end());
 
+            string gpu_msg = "";
+            if (!IsGoogleCudaEnabled() &&
+                str_util::Lowercase(specified_device_name.type) == "gpu") {
+              gpu_msg =
+                  " The requested device appears to be a GPU, but CUDA is not "
+                  "enabled.";
+            }
+
             return errors::InvalidArgument(
-                "Operation was explicitly assigned to ",
-                node->requested_device(), " but available devices are [ ",
+                errors::FormatNodeNameForError(node->name()),
+                "was explicitly assigned to ", node->requested_device(),
+                " but available devices are [ ",
                 str_util::Join(device_names, ", "), " ]. Make sure ",
-                "the device specification refers to a valid device.");
+                "the device specification refers to a valid device.", gpu_msg);
           } else if (specified_device_name.has_type) {
             return errors::InvalidArgument(
                 "Could not satisfy explicit device specification '",
@@ -396,7 +435,8 @@ class ColocationGraph {
         return errors::Internal("No devices are registered");
       }
       devices = FilterSupportedDevices(
-          device_set_->devices(), members_[node_root].supported_device_types);
+          device_set_->devices(), members_[node_root].supported_device_types,
+          default_device_);
 
       if (devices.empty()) {
         return errors::InvalidArgument(
@@ -441,7 +481,7 @@ class ColocationGraph {
     // The intersection of all device types supported by this node,
     // and those of all of its children, in priority order
     // of the preferred device.
-    DeviceTypeVector supported_device_types;
+    PrioritizedDeviceTypeVector supported_device_types;
 
     // The merged form of the device requested for this node, with
     // those of all of its children.
@@ -480,8 +520,8 @@ class ColocationGraph {
       const string& op_type = node->type_string();
       string devices_registered;
       for (const auto& device_type : members_[id].supported_device_types) {
-        strings::StrAppend(&devices_registered, DeviceTypeString(device_type),
-                           " ");
+        strings::StrAppend(&devices_registered,
+                           DeviceTypeString(device_type.first), " ");
       }
 
       type_to_devices[op_type] = std::move(devices_registered);
@@ -534,8 +574,9 @@ class ColocationGraph {
                                 "' does not match any device");
       }
 
-      for (const DeviceType& d : member->supported_device_types) {
-        if (DeviceType(assigned_device->attributes().device_type()) == d) {
+      for (const auto& d : member->supported_device_types) {
+        if (DeviceType(assigned_device->attributes().device_type()) ==
+            d.first) {
           return Status::OK();
         }
       }
@@ -556,11 +597,21 @@ class ColocationGraph {
         for (Device* d : device_set_->devices()) {
           registered_device_types.insert(d->device_type());
         }
+        std::vector<string> attr_key_vals;
+        for (const auto& it : node.attrs()) {
+          const string& name = it.first;
+          const AttrValue& attr_value = it.second;
+          attr_key_vals.push_back(
+              strings::StrCat(name, "=", SummarizeAttrValue(attr_value)));
+        }
         return errors::InvalidArgument(
             "No OpKernel was registered to support Op '", node.type_string(),
-            "' with these attrs.  Registered devices: [",
-            str_util::Join(registered_device_types, ","),
-            "], Registered kernels:\n",
+            "' used by ", errors::FormatNodeNameForError(node.name()),
+            "with these attrs: [", str_util::Join(attr_key_vals, ", "),
+            "]\n"
+            "Registered devices: [",
+            str_util::Join(registered_device_types, ", "), "]\n",
+            "Registered kernels:\n",
             KernelsRegisteredForOp(node.type_string()));
       }
 
@@ -582,24 +633,102 @@ class ColocationGraph {
     return Status::OK();
   }
 
+  static bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
+    for (const auto& prioritized_device_type : device_types) {
+      if (prioritized_device_type.second != 0) return true;
+    }
+    return false;
+  }
+
+  static bool ArePrioritiesSame(const PrioritizedDeviceTypeVector& a_types,
+                                const PrioritizedDeviceTypeVector& b_types) {
+    if (a_types.size() != b_types.size()) {
+      return false;
+    }
+    for (int i = 0; i < a_types.size(); ++i) {
+      if (a_types[i].first != b_types[i].first) {
+        return false;
+      }
+    }
+    return true;
+  }
+
   // Updates target to contain the intersection of the device types in
   // "target" and "other".
-  static void MergeSupportedDevices(DeviceTypeVector* target,
-                                    const DeviceTypeVector& other) {
-    DeviceTypeVector temp = *target;
+  static void MergeSupportedDevices(PrioritizedDeviceTypeVector* target,
+                                    const PrioritizedDeviceTypeVector& other) {
+    PrioritizedDeviceTypeVector temp = *target;
     target->clear();
 
-    // Iterate in priority order.
-    for (const DeviceType& device_type : temp) {
+    // Generate intersection with priorities.
+    PrioritizedDeviceTypeVector target_intersection;
+    PrioritizedDeviceTypeVector other_intersection;
+    for (const auto& prioritized_device_type : temp) {
       bool found = false;
-      for (const DeviceType& other_device_type : other) {
-        if (device_type == other_device_type) {
+      for (const auto& other_prioritized_device_type : other) {
+        if (prioritized_device_type.first ==
+            other_prioritized_device_type.first) {
           found = true;
+          other_intersection.push_back(other_prioritized_device_type);
           break;
         }
       }
       if (found) {
-        target->push_back(device_type);
+        target_intersection.push_back(prioritized_device_type);
+      }
+    }
+
+    // Sort the devices by priority order.
+    auto device_sort = [](const std::pair<DeviceType, int32>& a,
+                          const std::pair<DeviceType, int32>& b) {
+      // First look at set priorities.
+      if (a.second != b.second) {
+        return a.second > b.second;
+      }
+      // Then fallback to default priorities.
+      auto a_priority = DeviceSet::DeviceTypeOrder(a.first);
+      auto b_priority = DeviceSet::DeviceTypeOrder(b.first);
+      if (a_priority != b_priority) {
+        return a_priority > b_priority;
+      }
+      // Finally just look at the Device type strings.
+      return a.first.type_string() < b.first.type_string();
+    };
+
+    std::sort(target_intersection.begin(), target_intersection.end(),
+              device_sort);
+    std::sort(other_intersection.begin(), other_intersection.end(),
+              device_sort);
+
+    bool is_target_prioritized = HasPriorities(target_intersection);
+    bool is_other_prioritized = HasPriorities(other_intersection);
+    // If neither are prioritized then we just return the original i.e. target
+    // prioritization.
+    if (!is_target_prioritized && !is_other_prioritized) {
+      *target = target_intersection;
+    }
+    // If only one is prioritized, then we respect priorities of that in the
+    // intersection.
+    if (is_target_prioritized && !is_other_prioritized) {
+      *target = target_intersection;
+    }
+    if (!is_target_prioritized && is_other_prioritized) {
+      *target = other_intersection;
+    }
+    // If both have priorities and agree then we go with that. If the
+    // prioritization order is different, then we just fallback to the default
+    // i.e. what the DeviceTypeOrder suggests. In that case, we also set the
+    // merged priorities to 0, so that downstream merges work correctly as well.
+    if (is_target_prioritized && is_other_prioritized) {
+      bool priorities_agree =
+          ArePrioritiesSame(target_intersection, other_intersection);
+      if (priorities_agree) {
+        *target = target_intersection;
+      } else {
+        for (const auto& prioritized_device : target_intersection) {
+          target->push_back(std::make_pair(prioritized_device.first, 0));
+        }
+        std::sort(target->begin(), target->end(), device_sort);
       }
     }
   }
@@ -659,6 +788,7 @@ class ColocationGraph {
   const DeviceSet* device_set_;  // Not owned.
   const std::vector<DeviceType> device_types_;
   const bool allow_soft_placement_;
+  const Device* default_device_;
 };
 
 // Returns true if the node has no inputs and produces outputs
@@ -684,15 +814,16 @@ bool IsExemptFromResourceInputColocation(const Node* node) {
 }  // namespace
 
 Placer::Placer(Graph* graph, const DeviceSet* devices,
-               const SessionOptions* options)
+               const SessionOptions* options, const Device* default_device)
     : graph_(graph),
       devices_(devices),
       options_(options),
       log_device_placement_(options != nullptr &&
-                            options->config.log_device_placement()) {}
+                            options->config.log_device_placement()),
+      default_device_(default_device) {}
 
 Placer::Placer(Graph* graph, const DeviceSet* devices)
-    : Placer(graph, devices, nullptr) {}
+    : Placer(graph, devices, nullptr, nullptr) {}
 
 Placer::~Placer() {}
 
@@ -703,7 +834,8 @@ Status Placer::Run() {
 
   ColocationGraph colocation_graph(
       graph_, devices_,
-      options_ == nullptr || options_->config.allow_soft_placement());
+      options_ == nullptr || options_->config.allow_soft_placement(),
+      default_device_);
 
   TF_RETURN_IF_ERROR(colocation_graph.InitializeMembers());
 
@@ -870,7 +1002,7 @@ Status Placer::Run() {
     int assigned_device = -1;
 
     // Heuristic A application.
-    if (IsGeneratorNode(node)) {
+    if (IsGeneratorNode(node) && !node->out_edges().empty()) {
       const Node* output = (*node->out_edges().begin())->dst();
       int output_device_name = output->assigned_device_name_index();
 
diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h
index f97ffe7372e63029841d00d7b72d3a6e471fc74f..e3e8f3790c5fc1d6223a9e6ba1d3aa79eca0d3e3 100644
--- a/tensorflow/core/common_runtime/placer.h
+++ b/tensorflow/core/common_runtime/placer.h
@@ -62,9 +62,14 @@ class Placer {
   // Graph "graph" (nodes in which may or may not be assigned) on the
   // given DeviceSet "devices".
   //
-  // The "graph", and "devices" pointer arguments
-  // are borrowed by this Placer, and must outlive it.
-  Placer(Graph* graph, const DeviceSet* devices, const SessionOptions* options);
+  // If non-null, default_device is used where possible as a placement for nodes
+  // which do not have a device specified, ahead of other devices which would
+  // otherwise be higher priority.
+  //
+  // The "graph", "devices", and "default_device" pointer arguments are borrowed
+  // by this Placer, and must outlive it.
+  Placer(Graph* graph, const DeviceSet* devices, const SessionOptions* options,
+         const Device* default_device);
 
   Placer(Graph* graph, const DeviceSet* devices);
 
@@ -92,6 +97,7 @@ class Placer {
   const DeviceSet* const devices_;  // Not owned.
   const SessionOptions* options_;   // Not owned.
   const bool log_device_placement_;
+  const Device* default_device_;  // Not owned.
 
   TF_DISALLOW_COPY_AND_ASSIGN(Placer);
 };
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 9b8a95e3b6975d2c7ebfc0317d8e45a569412c3a..04e77e55f62e1bd9345c8e9113407bbf0a375774 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -92,7 +92,7 @@ class FakeDevice : public Device {
 class DummyFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override {
+                       std::vector<std::unique_ptr<Device>>* devices) override {
     return Status::OK();
   }
 };
@@ -164,6 +164,13 @@ REGISTER_KERNEL_BUILDER(Name("TestDeviceEnforce").Device("FakeGPU"), DummyOp);
 REGISTER_KERNEL_BUILDER(Name("Shape").Device("FakeCPU"), DummyOp);
 REGISTER_KERNEL_BUILDER(Name("Shape").Device("FakeGPU"), DummyOp);
 
+// Op that has kernels with device priorities specified.
+REGISTER_OP("TestDatasetOp").Input("a: float").Output("b: float");
+REGISTER_KERNEL_BUILDER(Name("TestDatasetOp").Device("FakeCPU").Priority(2),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("TestDatasetOp").Device("FakeGPU").Priority(1),
+                        DummyOp);
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // A PlacerTest method has three phases:
@@ -208,7 +215,7 @@ class PlacerTest : public ::testing::Test {
   //
   // REQUIRES: "*graph" was produced by the most recent call to BuildGraph.
   Status Place(Graph* graph, DeviceSet* devices, SessionOptions* options) {
-    Placer placer(graph, devices, options);
+    Placer placer(graph, devices, options, nullptr);
     return placer.Run();
   }
 
@@ -285,6 +292,251 @@ TEST_F(PlacerTest, TestNoConstraints) {
   EXPECT_DEVICE_TYPE(g, "n2", "FakeGPU");
 }
 
+// Test that a graph with no constraints but using kernels that have a specified
+// device priority will successfully assign nodes to the device with higher
+// priority
+TEST_F(PlacerTest, TestNoConstraintsWithPrioritizedKernels) {
+  Graph g(OpRegistry::Global());
+  {  // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestInput", b.opts().WithName("in"));
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n1"));
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 1),
+                 b.opts().WithName("n2"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "in", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "n1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "n2", "FakeCPU");
+}
+
+TEST_F(PlacerTest, TestGPUInputIntoPrioritizedKernel) {
+  Graph g(OpRegistry::Global());
+  {
+    // Scope for temp variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestGPUOutput", b.opts().WithName("in"));
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n1"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "in", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "n1", "FakeCPU");
+}
+
+// Tests that a GPU kernel colocated with prioritized kernel respects it.
+TEST_F(PlacerTest, TestGPUInputColocatedWithPrioritizedKernel) {
+  Graph g(OpRegistry::Global());
+  {
+    // Scope for temp variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestGPUOutput", b.opts().WithName("in"));
+    // We colocate n1 with in.
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n1").WithAttr("_class", {"loc:@in"}));
+    // We don't colocate n2 with in.
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n2"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "in", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "n1", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "n2", "FakeCPU");
+}
+
+REGISTER_OP("CreateDatasetCPU").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetCPU").Device("FakeCPU"), DummyOp);
+
+REGISTER_OP("CreateDatasetSP").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetSP").Device("FakeCPU").Priority(2),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetSP").Device("FakeGPU").Priority(1),
+                        DummyOp);
+
+REGISTER_OP("CreateDatasetRP").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetRP").Device("FakeCPU").Priority(1),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetRP").Device("FakeGPU").Priority(2),
+                        DummyOp);
+
+REGISTER_OP("CreateDatasetNP").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetNP").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetNP").Device("FakeGPU"), DummyOp);
+
+REGISTER_OP("IteratorNP").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorNP").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorNP").Device("FakeGPU"), DummyOp);
+
+REGISTER_OP("IteratorSP").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorSP").Device("FakeCPU").Priority(2),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorSP").Device("FakeGPU").Priority(1),
+                        DummyOp);
+
+REGISTER_OP("IteratorRP").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorRP").Device("FakeCPU").Priority(1),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorRP").Device("FakeGPU").Priority(2),
+                        DummyOp);
+
+REGISTER_OP("IteratorGPU").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorGPU").Device("FakeGPU"), DummyOp);
+
+// Test reference edges with one node having prioritized kernels and the other
+// has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestDSWithPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorNP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test reference edges with one node having kernels with regular priority and
+// the other has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestDSWithGPUPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetRP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorNP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test reference edges with one node having prioritized kernels and the other
+// has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestITWithPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetNP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorSP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test reference edges with one node having kernels with regular priority and
+// the other has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestITWithGPUPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetNP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test reference edges with one node having prioritized kernels and other node
+// can only be placed on GPU. We should respect the constraint then.
+TEST_F(PlacerTest, TestITGPU) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorGPU", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test reference edges with one node having prioritized kernels and other node
+// can only be placed on CPU. We should respect the constraint then.
+TEST_F(PlacerTest, TestSimpleIteratorOnlyGPU) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetCPU", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test constraints with agreeing priorities.
+TEST_F(PlacerTest, TestAgreeingPriorities) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorSP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test constraints with agreeing regular priorities.
+TEST_F(PlacerTest, TestAgreeingRegularPriorities) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetRP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test constraints with different priorities. In this case, we should bail
+// and just revert to default.
+TEST_F(PlacerTest, TestConflictingPriorities) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test constraints with different priorities. In this case, we should bail
+// and just revert to default.
+TEST_F(PlacerTest, TestConflictingPrioritiesReversed) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetRP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorSP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
 // Test that a graph with device type and reference constraints on
 // some of the ops will successfully assign nodes to the constrained
 // device, and colocate nodes with reference connections.
@@ -1028,9 +1280,10 @@ TEST_F(PlacerTest, TestNoKernelsRegistered) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(),
-      "No OpKernel was registered to support Op 'VariableNoKernels'"));
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(),
+                            "No OpKernel was registered to support Op "
+                            "'VariableNoKernels' used by {{node var}}"));
   EXPECT_TRUE(
       str_util::StrContains(s.error_message(), "<no registered kernels>"));
 }
@@ -1052,9 +1305,9 @@ TEST_F(PlacerTest, TestNoDevicesRegistered) {
 
   Status s = Place(&g, &cpu_only);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(),
-      "No OpKernel was registered to support Op 'VariableGPU'"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "No OpKernel was registered to support Op "
+                                    "'VariableGPU' used by {{node var}}"));
   EXPECT_TRUE(str_util::StrContains(s.error_message(), "device='FakeGPU'"));
 }
 
@@ -1193,14 +1446,37 @@ TEST_F(PlacerTest, TestNonExistentDevice) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
+  SessionOptions options;
+  Status s = Place(&g, &options);
+  EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+  LOG(WARNING) << s.error_message();
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "was explicitly assigned to /job:foo/replica:17"));
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(), "but available devices"));
+}
+
+#if !GOOGLE_CUDA
+// Test that we inform the user if they appear to be explicitly placing nodes
+// on a GPU when CUDA is not available
+TEST_F(PlacerTest, TestUseGpuWithNoCuda) {
+  Graph g(OpRegistry::Global());
+  {  // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    ops::SourceOp("VariableGPU",
+                  b.opts().WithName("var").WithDevice("/device:gpu:0"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
   EXPECT_TRUE(str_util::StrContains(
       s.error_message(),
-      "was explicitly assigned to /job:foo/replica:17 but available devices"));
+      "The requested device appears to be a GPU, but CUDA is not enabled."));
 }
+#endif
 
 TEST_F(PlacerTest, TestUnsupportedDeviceAllowSoftPlacement) {
   Graph g(OpRegistry::Global());
diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc
index 66dc8f332217c30a3b3a1745a7c90a1880e3e068..6b40fcc4c70f50ba5bc643855a8035d73b92bfb0 100644
--- a/tensorflow/core/common_runtime/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/pool_allocator.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -258,7 +259,12 @@ void PoolAllocator::EvictOne() {
 void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
   void* ptr = nullptr;
   if (num_bytes > 0) {
-    ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+    if (numa_node_ == port::kNUMANoAffinity) {
+      ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+    } else {
+      ptr =
+          port::NUMAMalloc(numa_node_, num_bytes, static_cast<int>(alignment));
+    }
     VisitAlloc(ptr, numa_node_, num_bytes);
   }
   return ptr;
@@ -267,7 +273,11 @@ void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
 void BasicCPUAllocator::Free(void* ptr, size_t num_bytes) {
   if (num_bytes > 0) {
     VisitFree(ptr, numa_node_, num_bytes);
-    port::AlignedFree(ptr);
+    if (numa_node_ == port::kNUMANoAffinity) {
+      port::AlignedFree(ptr);
+    } else {
+      port::NUMAFree(ptr, num_bytes);
+    }
   }
 }
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 5b4623ba10fe684f6399a244e30ecafd55003c95..8be9c7b678e2bbe7659c9e22e31cb595ce704307 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -150,7 +150,6 @@ class Pow2Rounder : public RoundUpInterface {
 
 class BasicCPUAllocator : public SubAllocator {
  public:
-  // Argument numa_node is currently ignored.
   BasicCPUAllocator(int numa_node, const std::vector<Visitor>& alloc_visitors,
                     const std::vector<Visitor>& free_visitors)
       : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {}
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index cce230801183591371268bf2827d153c9c19b840..21cb62118aebafa8a03903296b65f0617510f080 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -62,9 +62,12 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 2});
+    std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(options, "/job:a/replica:0/task:0",
-                                          &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+                                          &devices));
+    device0_ = devices[0].get();
+    device1_ = devices[1].get();
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
@@ -138,8 +141,9 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     return Status::OK();
   }
 
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
+  Device* device0_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  Device* device1_ = nullptr;  // Not owned. (Owned by device_mgr_.)
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<TestClusterFLR> cluster_flr_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr_;
@@ -165,16 +169,16 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, Basic) {
   FunctionLibraryRuntime* flr =
       proc_flr_->GetFLR("/job:a/replica:0/task:0/cpu:0");
   EXPECT_NE(flr, nullptr);
-  EXPECT_EQ(flr->device(), devices_[0]);
+  EXPECT_EQ(flr->device(), device0_);
   flr = proc_flr_->GetFLR("/job:a/replica:0/task:0/device:CPU:0");
   EXPECT_NE(flr, nullptr);
-  EXPECT_EQ(flr->device(), devices_[0]);
+  EXPECT_EQ(flr->device(), device0_);
   flr = proc_flr_->GetFLR("/device:CPU:0");
   EXPECT_NE(flr, nullptr);
-  EXPECT_EQ(flr->device(), devices_[0]);
+  EXPECT_EQ(flr->device(), device0_);
   flr = proc_flr_->GetFLR("/job:a/replica:0/task:0/cpu:1");
   EXPECT_NE(flr, nullptr);
-  EXPECT_EQ(flr->device(), devices_[1]);
+  EXPECT_EQ(flr->device(), device1_);
   flr = proc_flr_->GetFLR("abc");
   EXPECT_EQ(flr, nullptr);
   rendezvous_->Unref();
diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc
index bcaa37fc8a156a63fcc76f9b8bb39ac8fd75f15a..3d8ac9b1344d8f2ca210451194adf4607dd52b7d 100644
--- a/tensorflow/core/common_runtime/process_state.cc
+++ b/tensorflow/core/common_runtime/process_state.cc
@@ -32,28 +32,12 @@ limitations under the License.
 
 namespace tensorflow {
 
-ProcessState* ProcessState::instance_ = nullptr;
-
 /*static*/ ProcessState* ProcessState::singleton() {
-  if (instance_ == nullptr) {
-    instance_ = new ProcessState;
-  }
-
-  return instance_;
+  static ProcessState* instance = new ProcessState;
+  return instance;
 }
 
 ProcessState::ProcessState() : numa_enabled_(false) {
-  CHECK(instance_ == nullptr);
-}
-
-// Normally the ProcessState singleton is never explicitly deleted.
-// This function is defined for debugging problems with the allocators.
-ProcessState::~ProcessState() {
-  CHECK_EQ(this, instance_);
-  instance_ = nullptr;
-  for (Allocator* a : cpu_allocators_) {
-    delete a;
-  }
 }
 
 string ProcessState::MemDesc::DebugString() {
@@ -72,8 +56,7 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
 }
 
 Allocator* ProcessState::GetCPUAllocator(int numa_node) {
-  CHECK_GE(numa_node, 0);
-  if (!numa_enabled_) numa_node = 0;
+  if (!numa_enabled_ || numa_node == port::kNUMANoAffinity) numa_node = 0;
   mutex_lock lock(mu_);
   while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
     // If visitors have been defined we need an Allocator built from
@@ -90,8 +73,9 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
     Allocator* allocator = nullptr;
     SubAllocator* sub_allocator =
         (alloc_visitors_defined || use_bfc_allocator)
-            ? new BasicCPUAllocator(numa_enabled_ ? numa_node : -1,
-                                    cpu_alloc_visitors_, cpu_free_visitors_)
+            ? new BasicCPUAllocator(
+                  numa_enabled_ ? numa_node : port::kNUMANoAffinity,
+                  cpu_alloc_visitors_, cpu_free_visitors_)
             : nullptr;
     if (use_bfc_allocator) {
       // TODO(reedwm): evaluate whether 64GB by default is the best choice.
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index cac312d8496d3d4e454291405bcd16c432af8852..6849d305b3c5577485e83ed7d2e9521dce20a452 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -63,7 +63,7 @@ class ProcessState {
   MemDesc PtrType(const void* ptr);
 
   // Returns the one CPUAllocator used for the given numa_node.
-  // TEMPORARY: ignores numa_node.
+  // Treats numa_node == kNUMANoAffinity as numa_node == 0.
   Allocator* GetCPUAllocator(int numa_node);
 
   // Registers alloc visitor for the CPU allocator(s).
@@ -87,19 +87,19 @@ class ProcessState {
 
   // Helper method for unit tests to reset the ProcessState singleton by
   // cleaning up everything. Never use in production.
-  virtual void TestOnlyReset();
+  void TestOnlyReset();
 
   static ProcessState* instance_;
   bool numa_enabled_;
 
   mutex mu_;
 
+  // Indexed by numa_node.  If we want numa-specific allocators AND a
+  // non-specific allocator, maybe should index by numa_node+1.
   std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_);
   std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_);
   std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_);
 
-  virtual ~ProcessState();
-
   // Optional RecordingAllocators that wrap the corresponding
   // Allocators for runtime attribute use analysis.
   MDMap mem_desc_map_;
diff --git a/tensorflow/core/common_runtime/renamed_device.cc b/tensorflow/core/common_runtime/renamed_device.cc
index 56766a8df4526cb2d6fb20c5dcd461a65d2a994b..45541c35fe9b7bd7886b0c0928a77e2359a9aaa3 100644
--- a/tensorflow/core/common_runtime/renamed_device.cc
+++ b/tensorflow/core/common_runtime/renamed_device.cc
@@ -14,15 +14,14 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/common_runtime/renamed_device.h"
+#include "absl/memory/memory.h"
 
 namespace tensorflow {
 
-// TODO(saeta): Convert to returning a std::unique_ptr?
 /* static */
-Device* RenamedDevice::NewRenamedDevice(const string& new_base,
-                                        Device* underlying,
-                                        bool owns_underlying,
-                                        bool isolate_session_state) {
+std::unique_ptr<Device> RenamedDevice::NewRenamedDevice(
+    const string& new_base, Device* underlying, bool owns_underlying,
+    bool isolate_session_state) {
   DeviceNameUtils::ParsedName parsed_name;
   CHECK(DeviceNameUtils::ParseFullName(new_base, &parsed_name));
   DeviceNameUtils::ParsedName underlying_parsed_name =
@@ -36,8 +35,9 @@ Device* RenamedDevice::NewRenamedDevice(const string& new_base,
                                           parsed_name.id);
   DeviceAttributes attributes(underlying->attributes());
   attributes.set_name(name);
-  return new RenamedDevice(underlying, attributes, owns_underlying,
-                           isolate_session_state);
+  // Call absl::WrapUnique to access private constructor.
+  return absl::WrapUnique(new RenamedDevice(
+      underlying, attributes, owns_underlying, isolate_session_state));
 }
 
 RenamedDevice::RenamedDevice(Device* underlying,
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index c00789a55631aad1dbd79ee3cbe588b0436a853f..6d24f496ffb3c78c4f7e38564bba11ebabfbc39e 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -28,9 +28,10 @@ namespace tensorflow {
 // session.
 class RenamedDevice : public Device {
  public:
-  static Device* NewRenamedDevice(const string& new_base, Device* underlying,
-                                  bool owns_underlying,
-                                  bool isolate_session_state);
+  static std::unique_ptr<Device> NewRenamedDevice(const string& new_base,
+                                                  Device* underlying,
+                                                  bool owns_underlying,
+                                                  bool isolate_session_state);
 
   ~RenamedDevice() override;
 
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index b1fe928ba7d4d2184b5d28344fa7dea0cb3c160b..092f15e49e330de21452e0f7b4d8cc51607a44ed 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -290,7 +290,7 @@ void RingReducer::Run(StatusCallback done) {
         col_ctx_->device, col_ctx_->op_ctx->input_alloc_attr(0),
         col_ctx_->op_ctx->output_alloc_attr(0), col_ctx_->input,
         col_ctx_->output, 0 /*dev_to_dev_stream_index*/,
-        [this, &note, &status](const Status& s) {
+        [&note, &status](const Status& s) {
           status.Update(s);
           note.Notify();
         });
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index 75aba435726bbd4976ab4837a9752b915b845374..7feb29a6dbbb17d73967344ad07db9d234411840 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/ring_reducer.h"
 
 #include <algorithm>
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
 #include "tensorflow/core/common_runtime/collective_rma_local.h"
 #include "tensorflow/core/common_runtime/device.h"
@@ -157,7 +158,7 @@ class RingReducerTest : public ::testing::Test {
     InitGPUDevices();
 #endif
     device_type_ = device_type;
-    std::vector<Device*> local_devices;
+    std::vector<std::unique_ptr<Device>> local_devices;
     SessionOptions sess_opts;
     sess_opts.env = Env::Default();
     Bytes mem_limit(4 << 20);
@@ -167,7 +168,7 @@ class RingReducerTest : public ::testing::Test {
         if (device_type == DEVICE_CPU) {
           string dev_name =
               strings::StrCat("/job:worker/replica:0/task:", wi, "/cpu:", di);
-          local_devices.push_back(new ThreadPoolDevice(
+          local_devices.push_back(absl::make_unique<ThreadPoolDevice>(
               sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
         } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
           int dev_idx = (wi * num_devices) + di;
@@ -175,7 +176,7 @@ class RingReducerTest : public ::testing::Test {
             LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
                          "than one ring node.";
           } else {
-            local_devices.push_back(gpu_devices_[dev_idx]);
+            local_devices.push_back(std::move(gpu_devices_[dev_idx]));
           }
         } else {
           LOG(FATAL) << "Unsupported device_type " << device_type;
@@ -185,13 +186,14 @@ class RingReducerTest : public ::testing::Test {
     if (!dev_mgr_ || device_type == DEVICE_CPU) {
       LOG(ERROR) << "resetting dev_mgr for " << local_devices.size()
                  << " devices: ";
-      dev_mgr_.reset(new DeviceMgr(local_devices));
+      dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
     }
+    if (!gpu_ring_order_) gpu_ring_order_.reset(new string());
     dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
     rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
                            fail_after);
-    col_exec_ = new BaseCollectiveExecutor(&col_exec_mgr_, rma_, kStepId,
-                                           dev_mgr_.get());
+    col_exec_ = new BaseCollectiveExecutor(
+        &col_exec_mgr_, rma_, kStepId, dev_mgr_.get(), gpu_ring_order_.get());
     col_params_.name = "test_collective";
     static const int kGroupKey = 5;
     col_params_.group.group_key = kGroupKey;
@@ -543,8 +545,9 @@ class RingReducerTest : public ::testing::Test {
   std::unique_ptr<DeviceResolverLocal> dev_resolver_;
   std::vector<DeviceInstance*> instances_;
   CollectiveParams col_params_;
-  std::vector<tensorflow::Device*> gpu_devices_;
+  std::vector<std::unique_ptr<tensorflow::Device>> gpu_devices_;
   std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  std::unique_ptr<string> gpu_ring_order_;
   mutex mu_;
   int32 reduce_counter_ GUARDED_BY(mu_) = 0;
 };
diff --git a/tensorflow/core/common_runtime/session_options.cc b/tensorflow/core/common_runtime/session_options.cc
index aacd57000cfb143a99bc79fa9767a228ed31ef0b..57c3b605575b925e6f4a131f076cfe6f25c92fc1 100644
--- a/tensorflow/core/common_runtime/session_options.cc
+++ b/tensorflow/core/common_runtime/session_options.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index fa4d1eda625fa74c59ca3597d9d4f8f6654c7cfb..9488a447789e67f3a9e73af43a0f3a849457e51f 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -288,6 +288,11 @@ Status ShapeRefiner::SetShape(const Node* node, int output_port,
         "output_port '", output_port, "' is out of range, ", "node '",
         node->name(), "' has ", node->num_outputs(), " outputs");
   }
+  // Note: it's possible, if the node's been updated, that the shape inference
+  // context doesn't have the right number of outputs.
+  if (node->num_outputs() > c->num_outputs()) {
+    TF_RETURN_IF_ERROR(c->ExpandOutputs(node->num_outputs()));
+  }
 
   // Check compatibility, and merge the shapes.
   ShapeHandle existing_shape = c->output(output_port);
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index a70ab93d4ad7f7aac1221560c7fc124f2e5a29ed..49265445659ff1daa30b632f60c03845d4a6a7f7 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -139,7 +139,7 @@ void NodeExecStatsWrapper::SetScheduled(int64 nanos) {
 }
 
 void NodeExecStatsWrapper::SetMemory(OpKernelContext* ctx) {
-  for (const auto& allocator_pair : ctx->wrapped_allocators()) {
+  for (const auto& allocator_pair : ctx->ConsumeWrappedAllocators()) {
     AddAllocation(allocator_pair.first, allocator_pair.second);
   }
   auto* ms = stats_->mutable_memory_stats();
diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h
index 4365b11b19e1b1593985602398a3836ecbf4bceb..7d34383ce8209c9f4b889410a96bce02f6702a64 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.h
+++ b/tensorflow/core/common_runtime/step_stats_collector.h
@@ -68,8 +68,13 @@ class NodeExecStatsInterface {
   // Called immediately after this executor finishes processing this node.
   virtual void RecordExecutorEnded() = 0;
 
+  // Returns `true` if this object should track memory allocations.
+  virtual bool TrackAllocations() const = 0;
+
   // Records information about the memory allocated during the execution of this
   // node.
+  //
+  // Takes ownership of any `TrackingAllocator` objects stored in `ctx`.
   virtual void SetMemory(OpKernelContext* ctx) = 0;
 
   // Records information about the tensor produced by this node at the given
@@ -104,6 +109,7 @@ class NodeExecStatsWrapper : public NodeExecStatsInterface {
   void RecordComputeStarted() override;
   void RecordComputeEnded() override;
   void RecordExecutorEnded() override;
+  bool TrackAllocations() const override { return true; }
   void SetMemory(OpKernelContext* ctx) override;
   void SetOutput(int slot, const Tensor* tensor) override;
   void SetReferencedTensors(const TensorReferenceVector& tensors) override;
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 6404d8bc6a209997afbe33c547679ebb2cb5cbf5..ca7ca5443c954a6cdcb5d25324ea84163bb4291e 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -93,7 +93,7 @@ Status ThreadPoolDevice::MakeTensorFromProto(
     Tensor* tensor) {
   if (tensor_proto.dtype() > 0 && tensor_proto.dtype() <= DataType_MAX) {
     Tensor parsed(tensor_proto.dtype());
-    if (parsed.FromProto(cpu_allocator(), tensor_proto)) {
+    if (parsed.FromProto(allocator_, tensor_proto)) {
       *tensor = std::move(parsed);
       return Status::OK();
     }
diff --git a/tensorflow/core/common_runtime/threadpool_device_factory.cc b/tensorflow/core/common_runtime/threadpool_device_factory.cc
index 6a900c02c00e976fdef2e4b5f6673f27affb3069..f9cbb817499ef5c35a91e5c7f2e51f9bd5267180 100644
--- a/tensorflow/core/common_runtime/threadpool_device_factory.cc
+++ b/tensorflow/core/common_runtime/threadpool_device_factory.cc
@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Register a factory that provides CPU devices.
-#include "tensorflow/core/common_runtime/threadpool_device.h"
-
 #include <vector>
+
+// Register a factory that provides CPU devices.
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -27,9 +30,8 @@ namespace tensorflow {
 class ThreadPoolDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override {
-    // TODO(zhifengc/tucker): Figure out the number of available CPUs
-    // and/or NUMA configuration.
+                       std::vector<std::unique_ptr<Device>>* devices) override {
+    int num_numa_nodes = port::NUMANumNodes();
     int n = 1;
     auto iter = options.config.device_count().find("CPU");
     if (iter != options.config.device_count().end()) {
@@ -37,8 +39,26 @@ class ThreadPoolDeviceFactory : public DeviceFactory {
     }
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:CPU:", i);
-      devices->push_back(new ThreadPoolDevice(
-          options, name, Bytes(256 << 20), DeviceLocality(), cpu_allocator()));
+      std::unique_ptr<ThreadPoolDevice> tpd;
+      if (options.config.experimental().use_numa_affinity()) {
+        int numa_node = i % num_numa_nodes;
+        if (numa_node != i) {
+          LOG(INFO) << "Only " << num_numa_nodes
+                    << " NUMA nodes visible in system, "
+                    << " assigning device " << name << " to NUMA node "
+                    << numa_node;
+        }
+        DeviceLocality dev_locality;
+        dev_locality.set_numa_node(numa_node);
+        tpd = absl::make_unique<ThreadPoolDevice>(
+            options, name, Bytes(256 << 20), dev_locality,
+            ProcessState::singleton()->GetCPUAllocator(numa_node));
+      } else {
+        tpd = absl::make_unique<ThreadPoolDevice>(
+            options, name, Bytes(256 << 20), DeviceLocality(),
+            ProcessState::singleton()->GetCPUAllocator(port::kNUMANoAffinity));
+      }
+      devices->push_back(std::move(tpd));
     }
 
     return Status::OK();
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 37029f3f1a797f8879a5475acc53d17840768a4e..e388d3e6f0f5636c044c36ee03c826f1872cac9f 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -15,7 +15,7 @@ filegroup(
     ]),
 )
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
@@ -189,7 +189,7 @@ cc_library(
     ],
 )
 
-cc_library(
+tf_cuda_library(
     name = "worker",
     srcs = ["worker.cc"],
     hdrs = [
@@ -204,6 +204,7 @@ cc_library(
         ":worker_interface",
         ":worker_session",
         "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:device_tracer",
         "//tensorflow/core:lib_internal",
     ],
 )
@@ -424,6 +425,7 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/debug",
@@ -466,6 +468,17 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "server_lib_test",
+    srcs = ["server_lib_test.cc"],
+    deps = [
+        ":server_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "rpc_collective_executor_mgr",
     srcs = ["rpc_collective_executor_mgr.cc"],
@@ -612,6 +625,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
index 4eed856759ae7ea2a982e1604ecbc0237e304731..40b18d321a1cb3fafeaa4b864e737f6d86695842 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
@@ -29,7 +29,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-static Device* NewDevice(const string& type, const string& name) {
+static std::unique_ptr<Device> NewDevice(const string& type,
+                                         const string& name) {
   class FakeDevice : public Device {
    public:
     explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
@@ -40,7 +41,7 @@ static Device* NewDevice(const string& type, const string& name) {
   attr.set_name(name);
   attr.set_device_type(type);
   attr.mutable_locality()->set_numa_node(3);  // a non-default value
-  return new FakeDevice(attr);
+  return absl::make_unique<FakeDevice>(attr);
 }
 
 class FakeWorker : public TestWorkerInterface {
@@ -156,16 +157,16 @@ class DeviceResDistTest : public ::testing::Test {
 
   void DefineWorker(const ConfigProto& config, const string& worker_name,
                     const string& device_type, int num_devices) {
-    std::vector<Device*> devices;
+    std::vector<std::unique_ptr<Device>> devices;
     for (int i = 0; i < num_devices; ++i) {
       devices.push_back(NewDevice(
           device_type,
           strings::StrCat(worker_name, "/device:", device_type, ":", i)));
     }
-    DeviceMgr* dev_mgr = new DeviceMgr(devices);
+    DeviceMgr* dev_mgr = new DeviceMgr(std::move(devices));
     device_mgrs_.push_back(dev_mgr);
     std::vector<string>* dv = &dev_by_task_[worker_name];
-    for (auto d : devices) {
+    for (auto* d : dev_mgr->ListDevices()) {
       dv->push_back(d->name());
     }
     DeviceResolverDistributed* dev_res =
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
index 805e023b0f3c86f02e301d61cf88029065fe248b..9087703cb5524dd7a66c5d1552b3bcc07977e488 100644
--- a/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
@@ -61,6 +61,15 @@ class RecvBufCall : public CancellableCall {
   RecvBufResponse resp_;
 };
 
+void PopulateTensorFromExtra(const RecvBufRespExtra& extra,
+                             Tensor* cpu_tensor) {
+  char* head = reinterpret_cast<char*>(DMAHelper::base(cpu_tensor));
+  for (const auto& tensor_content_chunk : extra.tensor_content()) {
+    memcpy(head, tensor_content_chunk.data(),
+           tensor_content_chunk.size());
+    head += tensor_content_chunk.size();
+  }
+}
 }  // namespace
 
 void CollectiveRemoteAccessDistributed::RecvFromPeer(
@@ -95,7 +104,10 @@ void CollectiveRemoteAccessDistributed::RecvFromPeer(
       // them into the destination tensor here.
       RecvBufRespExtra extra;
       state->call->resp_.transport_options().UnpackTo(&extra);
-      int64 num_bytes = extra.tensor_content().size();
+      int64 num_bytes = 0;
+      for (const auto& chunk : extra.tensor_content()) {
+        num_bytes += chunk.size();
+      }
       if (num_bytes != to_tensor->TotalBytes()) {
         done(errors::Internal("RecvBufResponse returned ", num_bytes,
                               " bytes where to_tensor expected ",
@@ -118,8 +130,7 @@ void CollectiveRemoteAccessDistributed::RecvFromPeer(
         cpu_attr.set_gpu_compatible(true);
         Tensor* cpu_tensor = new Tensor(cpu_dev->GetAllocator(cpu_attr),
                                         to_tensor->dtype(), to_tensor->shape());
-        memcpy(DMAHelper::base(cpu_tensor), extra.tensor_content().data(),
-               num_bytes);
+        PopulateTensorFromExtra(extra, cpu_tensor);
         // Then copy it to the GPU.
         CopyTensor::ViaDMA("",  // edge name (non-existent)
                            nullptr /*send_dev_ctx*/, to_device_ctx, cpu_dev,
@@ -135,8 +146,7 @@ void CollectiveRemoteAccessDistributed::RecvFromPeer(
         return;
       } else {
         // CPU device
-        memcpy(DMAHelper::base(to_tensor), extra.tensor_content().data(),
-               num_bytes);
+        PopulateTensorFromExtra(extra, to_tensor);
       }
     }
     if (!s.ok() && errors::IsFailedPrecondition(s)) {
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
index bfd312410cb18f7545ffae5555027a68ebd54734..26f722a6bd4104b2dc264c2946bc5b5656b0fb32 100644
--- a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
@@ -41,7 +41,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-static Device* NewDevice(const string& type, const string& name) {
+static std::unique_ptr<Device> NewDevice(const string& type,
+                                         const string& name) {
   class FakeDevice : public Device {
    public:
     explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
@@ -52,7 +53,7 @@ static Device* NewDevice(const string& type, const string& name) {
   attr.set_name(name);
   attr.set_device_type(type);
   attr.mutable_locality()->set_numa_node(3);  // a non-default value
-  return new FakeDevice(attr);
+  return absl::make_unique<FakeDevice>(attr);
 }
 
 static int64 kStepId = 123;
@@ -104,7 +105,7 @@ class FakeWorker : public TestWorkerInterface {
             // bytes in the response.
             RecvBufRespExtra extra;
             int64 num_bytes = h->prod_value->TotalBytes();
-            extra.set_tensor_content(string(
+            extra.add_tensor_content(string(
                 reinterpret_cast<const char*>(DMAHelper::base(h->prod_value)),
                 num_bytes));
             response->mutable_transport_options()->PackFrom(extra);
@@ -211,16 +212,16 @@ class CollRMADistTest : public ::testing::Test {
 
   void DefineWorker(const ConfigProto& config, const string& worker_name,
                     const string& device_type, int num_devices) {
-    std::vector<Device*> devices;
+    std::vector<std::unique_ptr<Device>> devices;
     for (int i = 0; i < num_devices; ++i) {
       devices.push_back(NewDevice(
           device_type,
           strings::StrCat(worker_name, "/device:", device_type, ":", i)));
     }
-    DeviceMgr* dev_mgr = new DeviceMgr(devices);
+    DeviceMgr* dev_mgr = new DeviceMgr(std::move(devices));
     device_mgrs_.push_back(dev_mgr);
     std::vector<string>* dv = &dev_by_task_[worker_name];
-    for (auto d : devices) {
+    for (auto d : dev_mgr->ListDevices()) {
       dv->push_back(d->name());
     }
     DeviceResolverDistributed* dev_res =
diff --git a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
index ae44b98bd52d6dcc32919ca9d850fcf13aac89db..842a2b3b058b8c55bec0c07816c1305ed9a2f305 100644
--- a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/distributed_runtime/test_utils.h"
 #include "tensorflow/core/lib/core/notification.h"
@@ -41,8 +42,8 @@ class TestableDeviceResolverDistributed : public DeviceResolverDistributed {
 
 // Create a fake 'Device' whose only interesting attribute is a non-default
 // DeviceLocality.
-static Device* NewDevice(const string& type, const string& name,
-                         int numa_node) {
+static std::unique_ptr<Device> NewDevice(const string& type, const string& name,
+                                         int numa_node) {
   class FakeDevice : public Device {
    public:
     explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
@@ -53,7 +54,7 @@ static Device* NewDevice(const string& type, const string& name,
   attr.set_name(name);
   attr.set_device_type(type);
   attr.mutable_locality()->set_numa_node(numa_node);
-  return new FakeDevice(attr);
+  return absl::make_unique<FakeDevice>(attr);
 }
 
 // Create a fake WorkerInterface that responds to requests without RPCs,
@@ -151,19 +152,19 @@ class DeviceResDistTest : public ::testing::Test {
 
   void DefineWorker(const string& worker_name, const string& device_type,
                     int num_devices) {
-    std::vector<Device*> devices;
+    std::vector<std::unique_ptr<Device>> devices;
     for (int i = 0; i < num_devices; ++i) {
       devices.push_back(NewDevice(
           device_type,
           strings::StrCat(worker_name, "/device:", device_type, ":", i), i));
     }
-    DeviceMgr* dev_mgr = new DeviceMgr(devices);
+    DeviceMgr* dev_mgr = new DeviceMgr(std::move(devices));
     TestableDeviceResolverDistributed* dev_res =
         new TestableDeviceResolverDistributed(dev_mgr, &wc_, worker_name);
     resolvers_[worker_name] = dev_res;
     device_mgrs_.push_back(dev_mgr);
     std::vector<string>* dv = &dev_by_task_[worker_name];
-    for (auto d : devices) {
+    for (auto* d : dev_mgr->ListDevices()) {
       dv->push_back(d->name());
     }
     FakeWorker* fw = new FakeWorker(worker_name, dev_mgr, dev_res);
diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD
index 055e5dfcedaea0bb2209132f2ffd60cd5a4dbae0..55b2657e74ef5c2be8c1b0f11d4a00186e063e31 100644
--- a/tensorflow/core/distributed_runtime/eager/BUILD
+++ b/tensorflow/core/distributed_runtime/eager/BUILD
@@ -69,6 +69,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime/eager:remote_tensor_handle",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
index b8af63724aa1dbe1a20dbc18bd6115c9aab78a0c..13c959d8506868a3d9d8dbba59a7d092e6d4fd94 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/eager/eager_service_impl.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -36,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/host_info.h"
 
 namespace tensorflow {
 namespace eager {
@@ -86,7 +88,7 @@ Status EagerServiceImpl::CreateContext(const CreateContextRequest* request,
     return tensorflow::errors::Internal(
         "invalid eager env_ or env_->rendezvous_mgr.");
   }
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
 
   TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
       // TODO(nareshmodi): Correctly set the SessionOptions.
@@ -96,12 +98,12 @@ Status EagerServiceImpl::CreateContext(const CreateContextRequest* request,
                       request->server_def().task_index()),
       &devices));
   response->mutable_device_attributes()->Reserve(devices.size());
-  for (auto& d : devices) {
+  for (const auto& d : devices) {
     *response->add_device_attributes() = d->attributes();
   }
 
-  std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
-      new tensorflow::DeviceMgr(devices));
+  std::unique_ptr<tensorflow::DeviceMgr> device_mgr =
+      absl::make_unique<DeviceMgr>(std::move(devices));
 
   auto* r = env_->rendezvous_mgr->Find(request->rendezvous_id());
   auto session_name = strings::StrCat("eager_", request->rendezvous_id());
@@ -152,20 +154,19 @@ Status EagerServiceImpl::ExecuteOp(const Operation& operation,
   std::unique_ptr<tensorflow::EagerOperation> op;
   const char* name = operation.name().c_str();  // Shorthand
   const tensorflow::AttrTypeMap* types;
-  auto status = tensorflow::AttrTypeMapForOp(name, &types);
-  if (status.ok()) {
-    op.reset(
-        new tensorflow::EagerOperation(server_context->Context(), name, types));
-  } else if (errors::IsNotFound(status)) {
-    if (server_context->Context()->FindFunctionByName(name)) {
-      op.reset(new tensorflow::EagerOperation(server_context->Context(), name,
-                                              nullptr));
-    } else {
-      return status;
-    }
-  } else {
-    return status;
+  bool is_function = false;
+  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp(name, &types, &is_function));
+  if (is_function && !server_context->Context()->FindFunctionByName(name)) {
+    return errors::NotFound(
+        "'", name,
+        "' is neither a type of a primitive operation nor a name "
+        "of a function registered in binary running on ",
+        port::Hostname(),
+        ". Make sure the operation or function is "
+        "registered in the binary running in this process.");
   }
+  op.reset(new tensorflow::EagerOperation(server_context->Context(), name,
+                                          is_function, types));
 
   TF_RETURN_IF_ERROR(op->SetDevice(operation.device().c_str()));
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
index 5c9b33b345b8b3f8efec8ac14720a11867e1d5cd..7a1463e8f047040b28dbb951e6db2b7af75294f2 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
@@ -68,12 +68,9 @@ class EagerServiceImplTest : public ::testing::Test {
     worker_env_.rendezvous_mgr = &rendezvous_mgr_;
     worker_env_.session_mgr = session_mgr_.get();
 
-    Device* device = DeviceFactory::NewDevice(
-        "CPU", {}, "/job:localhost/replica:0/task:0/device:CPU:0");
-
-    worker_env_.local_devices = {device};
-
-    device_mgr_.reset(new DeviceMgr(worker_env_.local_devices));
+    device_mgr_ = absl::make_unique<DeviceMgr>(DeviceFactory::NewDevice(
+        "CPU", {}, "/job:localhost/replica:0/task:0/device:CPU:0"));
+    worker_env_.local_devices = device_mgr_->ListDevices();
     worker_env_.device_mgr = device_mgr_.get();
   }
 
@@ -345,8 +342,7 @@ TEST_F(EagerServiceImplTest, SendTensorTest) {
       response.context_id(), RemoteTensorHandleInternal(2, 0), &tensor_handle));
   TF_ASSERT_OK(tensor_handle->Tensor(&t));
 
-  Device* device = nullptr;
-  TF_ASSERT_OK(tensor_handle->Device(&device));
+  Device* device = tensor_handle->device();
   EXPECT_NE(device, nullptr);
   EXPECT_EQ(device->name(), "/job:localhost/replica:0/task:0/device:CPU:0");
 
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 3361819e4318bb3d97c4689e42d4942a5d473b34..ee5823e314f777f758a6c0d8ef7129c4bbd2916c 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
 
+#include <chrono>  // NOLINT(build/c++11)
 #include <vector>
 
 #include "tensorflow/core/common_runtime/build_graph_options.h"
@@ -25,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/rendezvous_util.h"
@@ -90,7 +92,7 @@ static Status ValidateGraphDefForDevices(const GraphDef& gdef) {
   for (const auto& ndef : gdef.node()) {
     if (!DeviceNameUtils::ParseFullName(ndef.device(), &parsed)) {
       return errors::InvalidArgument("Missing device name in: ",
-                                     SummarizeNodeDef(ndef));
+                                     FormatNodeDefForError(ndef));
     }
   }
   return Status::OK();
@@ -386,6 +388,7 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
                             MutableRunGraphResponseWrapper* response,
                             CancellationManager* cancellation_manager,
                             const NamedTensors& in, StatusCallback done) {
+  const uint64 start_time_usecs = Env::Default()->NowMicros();
   // Lookup an item. Holds one ref while executing.
   Item* item = nullptr;
   {
@@ -443,14 +446,16 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
     return;
   }
 
-  StartParallelExecutors(handle, step_id, item, rendezvous, ce_handle,
-                         collector, cost_graph, cancellation_manager,
-                         [item, rendezvous, ce_handle, done](const Status& s) {
-                           done(s);
-                           rendezvous->Unref();
-                           item->Unref();
-                           delete ce_handle;
-                         });
+  StartParallelExecutors(
+      handle, step_id, item, rendezvous, ce_handle, collector, cost_graph,
+      cancellation_manager,
+      [item, rendezvous, ce_handle, done, start_time_usecs](const Status& s) {
+        done(s);
+        UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
+        rendezvous->Unref();
+        item->Unref();
+        delete ce_handle;
+      });
 }
 
 void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id,
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 8e9eec1ed926fb72887ec50e58ae8e505abad807..bc8ba6e47d5c66eab72eacd1f4d9a65a4b9cae6c 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/master_session.h"
 
+#include <memory>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
@@ -64,27 +65,33 @@ namespace tensorflow {
 class MasterSession::ReffedClientGraph : public core::RefCounted {
  public:
   ReffedClientGraph(const string& handle, const BuildGraphOptions& bopts,
-                    std::unique_ptr<ClientGraph> cg,
+                    std::unique_ptr<ClientGraph> client_graph,
                     const SessionOptions& session_opts,
                     const StatsPublisherFactory& stats_publisher_factory,
                     bool is_partial, WorkerCacheInterface* worker_cache,
                     bool should_deregister)
       : session_handle_(handle),
         bg_opts_(bopts),
-        client_graph_(std::move(cg)),
+        client_graph_before_register_(std::move(client_graph)),
         session_opts_(session_opts),
         is_partial_(is_partial),
         callable_opts_(bopts.callable_options),
         worker_cache_(worker_cache),
-        should_deregister_(should_deregister) {
+        should_deregister_(should_deregister),
+        collective_graph_key_(
+            client_graph_before_register_->collective_graph_key) {
     VLOG(1) << "Created ReffedClientGraph for node with "
-            << client_graph()->graph.num_node_ids();
+            << client_graph_before_register_->graph.num_node_ids();
 
     stats_publisher_ = stats_publisher_factory(handle, bopts, session_opts);
 
     // Initialize a name to node map for processing device stats.
-    for (Node* n : client_graph_->graph.nodes()) {
-      name_to_node_.insert({n->name(), n});
+    for (Node* n : client_graph_before_register_->graph.nodes()) {
+      name_to_node_details_.emplace(
+          n->name(),
+          NodeDetails(n->type_string(),
+                      strings::StrCat(
+                          "(", str_util::Join(n->requested_inputs(), ", "))));
     }
   }
 
@@ -98,12 +105,12 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
     }
   }
 
-  const ClientGraph* client_graph() { return client_graph_.get(); }
-
   const CallableOptions& callable_options() { return callable_opts_; }
 
   const BuildGraphOptions& build_graph_options() { return bg_opts_; }
 
+  int64 collective_graph_key() { return collective_graph_key_; }
+
   std::unique_ptr<ProfileHandler> GetProfileHandler(uint64 step,
                                                     int64 execution_count,
                                                     const RunOptions& ropts) {
@@ -187,7 +194,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   // Partitions the graph into subgraphs and registers them on
   // workers.
-  Status RegisterPartitions(const PartitionOptions& popts);
+  Status RegisterPartitions(PartitionOptions popts);
 
   // Runs one step of all partitions.
   Status RunPartitions(const MasterEnv* env, int64 step_id,
@@ -214,29 +221,28 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                       const RunState* run_state,
                       GraphExecutionState* execution_state);
 
-  string DetailText(const Node& node, const NodeExecStats& ns) {
-    int64 tot = 0;
-    for (auto& no : ns.output()) {
-      tot += no.tensor_description().allocation_description().requested_bytes();
-    }
-    string bytes;
-    if (tot >= 0.1 * 1048576.0) {
-      bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0);
-    }
-    return strings::StrCat(bytes, node.name(), " = ", node.type_string(), "(",
-                           str_util::Join(node.requested_inputs(), ", "), ")");
-  }
-
  private:
   const string session_handle_;
   const BuildGraphOptions bg_opts_;
-  const std::unique_ptr<ClientGraph> client_graph_;
+
+  // NOTE(mrry): This pointer will be null after `RegisterPartitions()` returns.
+  std::unique_ptr<ClientGraph> client_graph_before_register_ GUARDED_BY(mu_);
   const SessionOptions session_opts_;
   const bool is_partial_;
   const CallableOptions callable_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
-  std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node_;
+
+  struct NodeDetails {
+    explicit NodeDetails(string type_string, string detail_text)
+        : type_string(std::move(type_string)),
+          detail_text(std::move(detail_text)) {}
+    const string type_string;
+    const string detail_text;
+  };
+  std::unordered_map<string, NodeDetails> name_to_node_details_;
+
   const bool should_deregister_;
+  const int64 collective_graph_key_;
   std::atomic<int64> execution_count_ = {0};
 
   // Graph partitioned into per-location subgraphs.
@@ -268,9 +274,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   mutable mutex mu_;
 
   // Partition initialization and registration only needs to happen
-  // once. init_started_ && !init_done_ indicates the initialization
-  // is on going.
-  bool init_started_ GUARDED_BY(mu_) = false;
+  // once. `!client_graph_before_register_ && !init_done_.HasBeenNotified()`
+  // indicates the initialization is ongoing.
   Notification init_done_;
 
   // init_result_ remembers the initialization error if any.
@@ -278,6 +283,19 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   std::unique_ptr<StatsPublisherInterface> stats_publisher_;
 
+  string DetailText(const NodeDetails& details, const NodeExecStats& stats) {
+    int64 tot = 0;
+    for (auto& no : stats.output()) {
+      tot += no.tensor_description().allocation_description().requested_bytes();
+    }
+    string bytes;
+    if (tot >= 0.1 * 1048576.0) {
+      bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0);
+    }
+    return strings::StrCat(bytes, stats.node_name(), " = ",
+                           details.type_string, details.detail_text);
+  }
+
   // Send/Recv nodes that are the result of client-added
   // feeds and fetches must be tracked so that the tensors
   // can be added to the local rendezvous.
@@ -286,7 +304,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   // The actual graph partitioning and registration implementation.
   Status DoBuildPartitions(
-      PartitionOptions pots,
+      PartitionOptions popts, ClientGraph* client_graph,
       std::unordered_map<string, GraphDef>* out_partitions);
   Status DoRegisterPartitions(
       const PartitionOptions& popts,
@@ -311,14 +329,20 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 };
 
 Status MasterSession::ReffedClientGraph::RegisterPartitions(
-    const PartitionOptions& popts) {
+    PartitionOptions popts) {
   {  // Ensure register once.
     mu_.lock();
-    if (!init_started_) {
-      init_started_ = true;
+    if (client_graph_before_register_) {
+      // The `ClientGraph` is no longer needed after partitions are registered.
+      // Since it can account for a large amount of memory, we consume it here,
+      // and it will be freed after concluding with registration.
+
+      std::unique_ptr<ClientGraph> client_graph;
+      std::swap(client_graph_before_register_, client_graph);
       mu_.unlock();
       std::unordered_map<string, GraphDef> graph_defs;
-      Status s = DoBuildPartitions(popts, &graph_defs);
+      popts.flib_def = client_graph->flib_def.get();
+      Status s = DoBuildPartitions(popts, client_graph.get(), &graph_defs);
       if (s.ok()) {
         // NOTE(mrry): The pointers in `graph_defs_for_publishing` do not remain
         // valid after the call to DoRegisterPartitions begins, so
@@ -394,19 +418,19 @@ void MasterSession::ReffedClientGraph::TrackFeedsAndFetches(
 }
 
 Status MasterSession::ReffedClientGraph::DoBuildPartitions(
-    PartitionOptions popts,
+    PartitionOptions popts, ClientGraph* client_graph,
     std::unordered_map<string, GraphDef>* out_partitions) {
   if (popts.need_to_record_start_times) {
     CostModel cost_model(true);
-    cost_model.InitFromGraph(client_graph()->graph);
+    cost_model.InitFromGraph(client_graph->graph);
     // TODO(yuanbyu): Use the real cost model.
     // execution_state_->MergeFromGlobal(&cost_model);
-    SlackAnalysis sa(&client_graph()->graph, &cost_model);
+    SlackAnalysis sa(&client_graph->graph, &cost_model);
     sa.ComputeAsap(&popts.start_times);
   }
 
   // Partition the graph.
-  return Partition(popts, &client_graph_->graph, out_partitions);
+  return Partition(popts, &client_graph->graph, out_partitions);
 }
 
 Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
@@ -415,7 +439,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
   partitions_.reserve(graph_partitions.size());
   Status s;
   for (auto& name_def : graph_partitions) {
-    partitions_.resize(partitions_.size() + 1);
+    partitions_.emplace_back();
     Part* part = &partitions_.back();
     part->name = name_def.first;
     TrackFeedsAndFetches(part, name_def.second, popts);
@@ -449,7 +473,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
     *c->req.mutable_debug_options() =
         callable_opts_.run_options().debug_options();
-    c->req.set_collective_graph_key(client_graph()->collective_graph_key);
+    c->req.set_collective_graph_key(collective_graph_key_);
     VLOG(2) << "Register " << c->req.graph_def().DebugString();
     auto cb = [c, &done](const Status& s) {
       c->status = s;
@@ -915,8 +939,8 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
       ph->RecordOneOp(dev_name, ns, true /*is_copy*/, "", ns.node_name(),
                       ns.timeline_label());
     } else {
-      const Node* node = name_to_node_[ns.node_name()];
-      const bool found_node_in_graph = node != nullptr;
+      auto iter = name_to_node_details_.find(ns.node_name());
+      const bool found_node_in_graph = iter != name_to_node_details_.end();
       if (!found_node_in_graph && ns.timeline_label().empty()) {
         // The counter incrementing is not thread-safe. But we don't really
         // care.
@@ -930,13 +954,13 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
         }
         continue;
       }
-      string optype =
-          found_node_in_graph ? node->type_string() : ns.node_name();
+      const string& optype =
+          found_node_in_graph ? iter->second.type_string : ns.node_name();
       string details;
       if (!ns.timeline_label().empty()) {
         details = ns.timeline_label();
       } else if (found_node_in_graph) {
-        details = DetailText(*node, ns);
+        details = DetailText(iter->second, ns);
       } else {
         // Leave details string empty
       }
@@ -1545,14 +1569,13 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
   // Registers subgraphs if haven't done so.
   PartitionOptions popts;
   popts.node_to_loc = SplitByWorker;
-  // The closures potps.{new_name,get_incarnation} are called synchronously in
+  // The closures popts.{new_name,get_incarnation} are called synchronously in
   // RegisterPartitions() below, so do not need a Ref()/Unref() pair to keep
   // "this" alive during the closure.
   popts.new_name = [this](const string& prefix) {
     mutex_lock l(mu_);
     return strings::StrCat(prefix, "_S", next_node_id_++);
   };
-  popts.flib_def = rcg->client_graph()->flib_def.get();
   popts.get_incarnation = [this](const string& name) -> int64 {
     Device* d = devices_->FindDeviceByName(name);
     if (d == nullptr) {
@@ -1580,7 +1603,7 @@ Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
     popts.need_to_record_start_times = true;
   }
 
-  TF_RETURN_IF_ERROR(rcg->RegisterPartitions(popts));
+  TF_RETURN_IF_ERROR(rcg->RegisterPartitions(std::move(popts)));
 
   return Status::OK();
 }
@@ -1784,10 +1807,10 @@ Status MasterSession::PostRunCleanup(MasterSession::ReffedClientGraph* rcg,
   Status s = run_status;
   if (s.ok()) {
     pss->end_micros = Env::Default()->NowMicros();
-    if (rcg->client_graph()->collective_graph_key !=
+    if (rcg->collective_graph_key() !=
         BuildGraphOptions::kNoCollectiveGraphKey) {
-      env_->collective_executor_mgr->RetireStepId(
-          rcg->client_graph()->collective_graph_key, step_id);
+      env_->collective_executor_mgr->RetireStepId(rcg->collective_graph_key(),
+                                                  step_id);
     }
     // Schedule post-processing and cleanup to be done asynchronously.
     rcg->ProcessStats(step_id, pss, ph.get(), run_options, out_run_metadata);
@@ -1846,7 +1869,7 @@ Status MasterSession::DoRunWithLocalExecution(
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  uint64 step_id = NewStepId(rcg->client_graph()->collective_graph_key);
+  uint64 step_id = NewStepId(rcg->collective_graph_key());
   TRACEPRINTF("stepid %llu", step_id);
 
   std::unique_ptr<ProfileHandler> ph;
@@ -1854,6 +1877,7 @@ Status MasterSession::DoRunWithLocalExecution(
 
   Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
                                 &cancellation_manager_, false);
+
   cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
   return PostRunCleanup(rcg, step_id, req.options(), &pss, ph, s,
                         resp->mutable_metadata());
@@ -1910,7 +1934,7 @@ Status MasterSession::DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
   // Prepare.
   int64 count = rcg->get_and_increment_execution_count();
 
-  const uint64 step_id = NewStepId(rcg->client_graph()->collective_graph_key);
+  const uint64 step_id = NewStepId(rcg->collective_graph_key());
   TRACEPRINTF("stepid %llu", step_id);
 
   const RunOptions& run_options = rcg->callable_options().run_options();
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 4a10d99a6070d18acc127a519e0b1b852bc82497..273709a01fd799f7f4aa8afc80d3bdfc48d36322 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -87,6 +87,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_cache_logger",
@@ -104,6 +105,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
index 181422118cd9f01658c1601a1779355f127c6fac..3626a48171e0b628b2630c35a17826b8713dc9d1 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
@@ -40,7 +40,7 @@ class GrpcEagerClient : public EagerClient {
       override {                                                          \
     new RPCState<protobuf::Message>(                                      \
         &stub_, cq_, "/tensorflow.eager.EagerService/" #method, *request, \
-        response, std::move(done), nullptr);                              \
+        response, std::move(done), nullptr, nullptr);                     \
   }
 
   CLIENT_METHOD(CreateContext);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 456c30ecf499016493e220ebdd2008ae48ce52df..781b7d65cdd184363d7c7650305bd62f3129c271 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -53,30 +53,58 @@ Status ValidateHostPortPair(const string& host_port) {
   }
   return Status::OK();
 }
-}  // namespace
 
-Status NewHostPortGrpcChannel(const string& target,
-                              SharedGrpcChannelPtr* channel_pointer) {
-  // Minimally ensure that the target is valid
-  TF_RETURN_IF_ERROR(ValidateHostPortPair(target));
+}  // namespace
 
+::grpc::ChannelArguments GetChannelArguments(const RPCOptions* rpc_options) {
   // TODO(mrry): Implement secure channels.
   ::grpc::ChannelArguments args;
   args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
   // NOTE(mrry): Some versions of gRPC use a 20-second minimum backoff
   // on connection failure, which makes our tests time out.
   args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  if (rpc_options != nullptr) {
+    if (rpc_options->compression_algorithm() == "deflate") {
+      args.SetCompressionAlgorithm(GRPC_COMPRESS_DEFLATE);
+      args.SetInt(GRPC_COMPRESSION_CHANNEL_DEFAULT_LEVEL,
+                  rpc_options->compression_level());
+      VLOG(5) << "Setting GRPC compression : algo='"
+              << rpc_options->compression_algorithm()
+              << "' level=" << rpc_options->compression_level();
+    } else if (rpc_options->compression_algorithm() == "gzip") {
+      args.SetCompressionAlgorithm(GRPC_COMPRESS_GZIP);
+      args.SetInt(GRPC_COMPRESSION_CHANNEL_DEFAULT_LEVEL,
+                  rpc_options->compression_level());
+      VLOG(5) << "Setting GRPC compression : algo='"
+              << rpc_options->compression_algorithm()
+              << "' level=" << rpc_options->compression_level();
+    } else if (!rpc_options->compression_algorithm().empty()) {
+      LOG(ERROR) << "Invalid compression algorithm: "
+                 << rpc_options->compression_algorithm();
+    }
+  }
+  return args;
+}
+
+Status NewHostPortGrpcChannel(const string& target,
+                              const RPCOptions* rpc_options,
+                              SharedGrpcChannelPtr* channel_pointer) {
+  // Minimally ensure that the target is valid
+  TF_RETURN_IF_ERROR(ValidateHostPortPair(target));
+
+  ::grpc::ChannelArguments args = GetChannelArguments(rpc_options);
   *channel_pointer = ::grpc::CreateCustomChannel(
       "dns:///" + target, ::grpc::InsecureChannelCredentials(), args);
   return Status::OK();
 }
 
 ChannelCreationFunction ConvertToChannelCreationFunction(
-    const std::function<Status(string, SharedGrpcChannelPtr*)>&
-        new_channel_func_ptr) {
+    const std::function<Status(string, const RPCOptions*,
+                               SharedGrpcChannelPtr*)>& new_channel_func_ptr) {
   return [new_channel_func_ptr](const string& target) -> SharedGrpcChannelPtr {
     SharedGrpcChannelPtr channel_ptr;
-    if (new_channel_func_ptr(target, &channel_ptr).ok()) {
+    if (new_channel_func_ptr(target, /*rpc_options=*/nullptr, &channel_ptr)
+            .ok()) {
       return channel_ptr;
     } else {
       return nullptr;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
index 6fa99d7b148c010dede55a8cdcbdfca081c5e96a..57d16218e8f6a64c5030e075ebc770fc5566a106 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "grpcpp/grpcpp.h"
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 
@@ -86,11 +87,14 @@ GrpcChannelCache* NewGrpcChannelCache(const GrpcChannelSpec& channel_spec,
 
 // Below here are internal-only functions.
 
+::grpc::ChannelArguments GetChannelArguments(const RPCOptions* rpc_options);
+
 ChannelCreationFunction ConvertToChannelCreationFunction(
-    const std::function<Status(string, SharedGrpcChannelPtr*)>&
-        new_channel_func_ptr);
+    const std::function<Status(string, const RPCOptions*,
+                               SharedGrpcChannelPtr*)>& new_channel_func_ptr);
 
 Status NewHostPortGrpcChannel(const string& target,
+                              const RPCOptions* rpc_options,
                               SharedGrpcChannelPtr* channel_pointer);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
index a814ef85e2091ef46c466a012ac7c093981a1165..a6fae2286f5957f7aa0b45479ad262647e81ce74 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
@@ -184,18 +184,39 @@ TEST(GrpcChannelTest, SparseHostPorts) {
 TEST(GrpcChannelTest, NewHostPortGrpcChannelValidation) {
   SharedGrpcChannelPtr mock_ptr;
 
-  EXPECT_TRUE(NewHostPortGrpcChannel("127.0.0.1:2222", &mock_ptr).ok());
-  EXPECT_TRUE(NewHostPortGrpcChannel("example.com:2222", &mock_ptr).ok());
-  EXPECT_TRUE(NewHostPortGrpcChannel("fqdn.example.com.:2222", &mock_ptr).ok());
-  EXPECT_TRUE(NewHostPortGrpcChannel("[2002:a9c:258e::]:2222", &mock_ptr).ok());
-  EXPECT_TRUE(NewHostPortGrpcChannel("[::]:2222", &mock_ptr).ok());
-
-  EXPECT_FALSE(NewHostPortGrpcChannel("example.com/abc:2222", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("127.0.0.1:2222/", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("example.com/abc:", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("[::]/:2222", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("[::]:2222/", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("[::]:", &mock_ptr).ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("127.0.0.1:2222", /*rpc_options=*/nullptr,
+                                     &mock_ptr)
+                  .ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("example.com:2222",
+                                     /*rpc_options=*/nullptr, &mock_ptr)
+                  .ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("fqdn.example.com.:2222",
+                                     /*rpc_options=*/nullptr, &mock_ptr)
+                  .ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("[2002:a9c:258e::]:2222",
+                                     /*rpc_options=*/nullptr, &mock_ptr)
+                  .ok());
+  EXPECT_TRUE(
+      NewHostPortGrpcChannel("[::]:2222", /*rpc_options=*/nullptr, &mock_ptr)
+          .ok());
+
+  EXPECT_FALSE(NewHostPortGrpcChannel("example.com/abc:2222",
+                                      /*rpc_options=*/nullptr, &mock_ptr)
+                   .ok());
+  EXPECT_FALSE(NewHostPortGrpcChannel("127.0.0.1:2222/",
+                                      /*rpc_options=*/nullptr, &mock_ptr)
+                   .ok());
+  EXPECT_FALSE(NewHostPortGrpcChannel(
+                   "example.com/abc:", /*rpc_options=*/nullptr, &mock_ptr)
+                   .ok());
+  EXPECT_FALSE(
+      NewHostPortGrpcChannel("[::]/:2222", /*rpc_options=*/nullptr, &mock_ptr)
+          .ok());
+  EXPECT_FALSE(
+      NewHostPortGrpcChannel("[::]:2222/", /*rpc_options=*/nullptr, &mock_ptr)
+          .ok());
+  EXPECT_FALSE(
+      NewHostPortGrpcChannel("[::]:", /*rpc_options=*/nullptr, &mock_ptr).ok());
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 6008462d0448130ed05393dd438d01002d243167..2daefcb399c79324f80278340967b679be5c6574 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -30,9 +30,11 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/protobuf/transport_options.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
 namespace tensorflow {
@@ -41,10 +43,12 @@ class GrpcRemoteWorker : public WorkerInterface {
  public:
   explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
                             ::grpc::CompletionQueue* completion_queue,
+                            thread::ThreadPool* callback_threadpool,
                             WorkerCacheLogger* logger)
       : channel_(std::move(channel)),
         stub_(channel_),
         cq_(completion_queue),
+        callback_threadpool_(callback_threadpool),
         getstatus_(Method(GrpcWorkerMethod::kGetStatus)),
         createworkersession_(Method(GrpcWorkerMethod::kCreateWorkerSession)),
         deleteworkersession_(Method(GrpcWorkerMethod::kDeleteWorkerSession)),
@@ -121,7 +125,44 @@ class GrpcRemoteWorker : public WorkerInterface {
 
   void RecvBufAsync(CallOptions* call_opts, const RecvBufRequest* request,
                     RecvBufResponse* response, StatusCallback done) override {
-    IssueRequest(request, response, recvbuf_, std::move(done), call_opts);
+    int64 start_usec = Env::Default()->NowMicros();
+    // Type-specialized logging for this method.
+    bool logging_active = logger_->LoggingActive() || VLOG_IS_ON(2);
+    StatusCallback wrapper_done;
+    const StatusCallback* cb_to_use;
+    if (!logging_active) {
+      cb_to_use = &done;  // No additional work to do, so just use done directly
+    } else {
+      wrapper_done = [this, request, response, done, start_usec](Status s) {
+        if (logger_->LoggingActive()) {
+          int64 end_usec = Env::Default()->NowMicros();
+          int64 step_id = request->step_id();
+          RecvBufRespExtra extra;
+          response->transport_options().UnpackTo(&extra);
+          int64 num_bytes = 0;
+          for (const auto& chunk : extra.tensor_content()) {
+            num_bytes += chunk.size();
+          }
+          int64 send_start_usec = start_usec;
+          // Prefer start time reported by the sender, if available.
+          if (response->send_start_micros()) {
+            send_start_usec = std::max(
+                start_usec, static_cast<int64>(response->send_start_micros()));
+            send_start_usec = std::min(send_start_usec, end_usec - 1);
+          }
+          const string& key = request->buf_rendezvous_key();
+          logger_->RecordDataTransfer(
+              step_id, send_start_usec, end_usec, key, request->src_device(),
+              request->dst_device(), num_bytes, "", "RecvBuf");
+        }
+        VLOG(2) << "done callback, req: " << request->DebugString()
+                << " response " << response->DebugString();
+        done(s);
+      };
+      cb_to_use = &wrapper_done;
+    }
+
+    IssueRequest(request, response, recvbuf_, *cb_to_use, call_opts);
   }
 
   void CompleteGroupAsync(CallOptions* call_opts,
@@ -220,13 +261,15 @@ class GrpcRemoteWorker : public WorkerInterface {
                     protobuf::Message* response, const ::grpc::string& method,
                     StatusCallback done, CallOptions* call_opts = nullptr) {
     new RPCState<protobuf::Message>(&stub_, cq_, method, *request, response,
-                                    std::move(done), call_opts);
+                                    std::move(done), call_opts,
+                                    callback_threadpool_);
   }
   void IssueRequest(const protobuf::Message* request, TensorResponse* response,
                     const ::grpc::string& method, StatusCallback done,
                     CallOptions* call_opts = nullptr) {
     new RPCState<TensorResponse>(&stub_, cq_, method, *request, response,
-                                 std::move(done), call_opts);
+                                 std::move(done), call_opts,
+                                 callback_threadpool_);
   }
 
   // Helper function for initializing the RpcMethod objects below.
@@ -235,6 +278,7 @@ class GrpcRemoteWorker : public WorkerInterface {
   SharedGrpcChannelPtr channel_;
   ::grpc::GenericStub stub_;
   ::grpc::CompletionQueue* cq_;
+  thread::ThreadPool* callback_threadpool_;
 
   const ::grpc::string getstatus_;
   const ::grpc::string createworkersession_;
@@ -260,8 +304,10 @@ class GrpcRemoteWorker : public WorkerInterface {
 
 WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
+                                     thread::ThreadPool* callback_threadpool,
                                      WorkerCacheLogger* logger) {
-  return new GrpcRemoteWorker(std::move(channel), completion_queue, logger);
+  return new GrpcRemoteWorker(std::move(channel), completion_queue,
+                              callback_threadpool, logger);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
index b85c1dc5b4e592e621ee96853dd724440ad9b4bd..d1f0e94ba52d81451a1085804cf01375f4d2fb57 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
@@ -19,18 +19,19 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 
 namespace grpc {
 class CompletionQueue;
 }
 
 namespace tensorflow {
-
 class WorkerCacheLogger;
 class WorkerInterface;
 
 WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
+                                     thread::ThreadPool* callback_threadpool,
                                      WorkerCacheLogger* logger);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
index cde6b785dc6e351ba0d51bef9b23d6bd05742320..4f5975bbc11a6217355c1fcf368996a0fca45969 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -206,11 +206,11 @@ void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t,
 
   int index = call->index();
   // This object will delete itself when done.
-  new RPCState<string>(get_stub(index), &completion_queue_,
-                       *get_method_ptr(index), call->request(),
-                       call->response(),
-                       /*done=*/[call](const Status& s) { call->Done(s); },
-                       call->call_opts(), fail_fast_, timeout_in_ms_);
+  new RPCState<string>(
+      get_stub(index), &completion_queue_, *get_method_ptr(index),
+      call->request(), call->response(),
+      /*done=*/[call](const Status& s) { call->Done(s); }, call->call_opts(),
+      nullptr /*threadpool*/, fail_fast_, timeout_in_ms_);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index c4f2247145c20b5c49ed227ed0b52abe44ebc43d..cbd5cd927e7d73fd0ed28a910c89eef1f73b0d91 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <cstring>
 #include <limits>
 #include <memory>
+#include <vector>
 
 #include "grpc/support/alloc.h"
 #include "grpcpp/grpcpp.h"
@@ -156,10 +157,12 @@ Status GrpcServer::Init(
   string name_prefix =
       strings::StrCat("/job:", server_def_.job_name(), "/replica:0",
                       "/task:", server_def_.task_index());
-  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(sess_opts, name_prefix,
-                                               &master_env_.local_devices));
-  worker_env_.local_devices = master_env_.local_devices;
-  worker_env_.device_mgr = new DeviceMgr(worker_env_.local_devices);
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_RETURN_IF_ERROR(
+      DeviceFactory::AddDevices(sess_opts, name_prefix, &devices));
+  worker_env_.device_mgr = new DeviceMgr(std::move(devices));
+  master_env_.local_devices = worker_env_.device_mgr->ListDevices();
+  worker_env_.local_devices = worker_env_.device_mgr->ListDevices();
   worker_env_.rendezvous_mgr = rendezvous_mgr_func == nullptr
                                    ? new RpcRendezvousMgr(&worker_env_)
                                    : rendezvous_mgr_func(&worker_env_);
@@ -194,8 +197,8 @@ Status GrpcServer::Init(
   MaybeMutateBuilder(&builder);
   master_impl_ = CreateMaster(&master_env_);
   master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
-  worker_impl_ =
-      worker_func ? worker_func(&worker_env_) : NewGrpcWorker(&worker_env_);
+  worker_impl_ = worker_func ? worker_func(&worker_env_, config)
+                             : NewGrpcWorker(&worker_env_, config);
   worker_service_ =
       NewGrpcWorkerService(worker_impl_.get(), &builder).release();
   eager_service_ = new eager::GrpcEagerServiceImpl(&worker_env_, &builder);
@@ -451,7 +454,11 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
   ServiceInitFunction service_func = nullptr;
-  TF_RETURN_IF_ERROR(ret->Init(service_func, NewRpcRendezvousMgr, nullptr));
+  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  if (!s.ok()) {
+    LOG(ERROR) << s;
+    return s;
+  }
   *out_server = std::move(ret);
   return Status::OK();
 }
@@ -462,7 +469,11 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
   ServiceInitFunction service_func = nullptr;
-  TF_RETURN_IF_ERROR(ret->Init(service_func, NewRpcRendezvousMgr, nullptr));
+  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  if (!s.ok()) {
+    LOG(ERROR) << s;
+    return s;
+  }
   *out_server = std::move(ret);
   return Status::OK();
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 7979e96d3edbf955eb93eb27b30e435b875bcfc7..c1395abddebd1af780ade4884b3f5af239c5fb0e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -53,7 +53,8 @@ typedef std::function<void(const WorkerEnv*, ::grpc::ServerBuilder*)>
     ServiceInitFunction;
 
 // function that creates a grpc based worker implementation.
-typedef std::function<std::unique_ptr<GrpcWorker>(WorkerEnv*)>
+typedef std::function<std::unique_ptr<GrpcWorker>(WorkerEnv*,
+                                                  const ConfigProto& config)>
     WorkerCreationFunction;
 
 class GrpcServer : public ServerInterface {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index fdce1b10e0a8ade6f96b280e3c6dc33ec69d504b..32063fecbbef4347bcdbfbdfda32f008015b5975 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -52,8 +52,9 @@ Status GrpcSession::Create(const SessionOptions& options,
   }
   if (!master) {
     SharedGrpcChannelPtr master_channel;
-    TF_RETURN_IF_ERROR(NewHostPortGrpcChannel(
-        options.target.substr(kSchemePrefixLength), &master_channel));
+    TF_RETURN_IF_ERROR(
+        NewHostPortGrpcChannel(options.target.substr(kSchemePrefixLength),
+                               &options.config.rpc_options(), &master_channel));
     master.reset(NewGrpcMaster(master_channel));
   }
   session->SetRemoteMaster(std::move(master));
@@ -91,6 +92,12 @@ void ReEncodeConsts(GraphDef* gdef) {
 }
 }  // namespace
 
+void GrpcSession::SetHandleAndGraphVersion(string handle, int64 graph_version) {
+  mutex_lock l(mu_);
+  handle_ = std::move(handle);
+  current_graph_version_ = graph_version;
+}
+
 Status GrpcSession::Handle(string* out_handle) {
   mutex_lock l(mu_);
   if (handle_.empty()) {
@@ -116,9 +123,7 @@ Status GrpcSession::CreateImpl(CallOptions* call_options,
   CreateSessionResponse resp;
   Status s = master_->CreateSession(call_options, &req, &resp);
   if (s.ok()) {
-    mutex_lock l(mu_);
-    swap(handle_, *(resp.mutable_session_handle()));
-    current_graph_version_ = resp.graph_version();
+    SetHandleAndGraphVersion(resp.session_handle(), resp.graph_version());
   }
   return s;
 }
@@ -384,8 +389,9 @@ void GrpcSession::SetRemoteMaster(std::unique_ptr<MasterInterface> master) {
 Status GrpcSession::Reset(const SessionOptions& options,
                           const std::vector<string>& containers) {
   SharedGrpcChannelPtr master_channel;
-  TF_RETURN_IF_ERROR(NewHostPortGrpcChannel(
-      options.target.substr(kSchemePrefixLength), &master_channel));
+  TF_RETURN_IF_ERROR(
+      NewHostPortGrpcChannel(options.target.substr(kSchemePrefixLength),
+                             /*rpc_options=*/nullptr, &master_channel));
   auto master = NewGrpcMaster(master_channel);
   ResetRequest req;
   for (const auto& c : containers) req.add_container(c);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index 63795117f9763434f5ff331d3d2d3bdb99413e81..a3ed3ec73669a0844c27af90e974131574174e88 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -106,9 +106,12 @@ class GrpcSession : public Session {
  protected:
   // Takes ownership of `*master`.
   void SetRemoteMaster(std::unique_ptr<MasterInterface> master);
+  // Allows subclasses to customize Session creation.
+  void SetHandleAndGraphVersion(string handle, int64 graph_version)
+      LOCKS_EXCLUDED(mu_);
 
  private:
-  SessionOptions options_;
+  const SessionOptions options_;
   std::unique_ptr<MasterInterface> master_;
   mutex mu_;
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index fc601991a24d5718d58bc70da370b952622fd5c8..ad0f8e5e2fcec011812b69082bc1747bd51fd7d3 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -1066,4 +1066,31 @@ TEST(SessionTest, RunTimeoutWithRunOptions) {
               error::INTERNAL == status.code());
 }
 
+TEST(SessionTest, TestCompression) {
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 1, &cluster));
+  SessionOptions options = Options(cluster->targets()[0], 100);
+  RPCOptions* rpc_options = options.config.mutable_rpc_options();
+  rpc_options->set_compression_algorithm("deflate");
+  rpc_options->set_compression_level(GRPC_COMPRESS_LEVEL_HIGH);
+
+  std::unique_ptr<Session> session(NewRemote(options));
+
+  static const float kTestValue = 409.1934f;
+  Graph graph(OpRegistry::Global());
+  Tensor tensor(DT_FLOAT, TensorShape({1, 1}));
+  tensor.flat<float>()(0) = kTestValue;
+  Node* b = test::graph::Constant(&graph, tensor);
+  GraphDef gdef;
+  graph.ToGraphDef(&gdef);
+  RunOptions run_options;
+  TF_CHECK_OK(session->Create(run_options, gdef));
+
+  std::vector<std::pair<string, Tensor>> inputs;
+  std::vector<Tensor> outputs;
+  TF_CHECK_OK(session->Run(inputs, {b->name()}, {}, &outputs));
+  ASSERT_EQ(1, outputs.size());
+  IsSingleFloatValue(outputs[0], kTestValue);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 61c5bc285f2f2e38a39737408a446a84b8442690..b67f3c4563107882a556e83c07ee20ca69b3f3b4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/notification.h"
 
 namespace tensorflow {
@@ -36,16 +37,18 @@ class RPCState : public GrpcClientCQTag {
   // Default behavior is to set fail_fast = False and handle timeouts manually.
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const protobuf::Message& request,
-           Response* response, StatusCallback done, CallOptions* call_opts)
+           Response* response, StatusCallback done, CallOptions* call_opts,
+           thread::ThreadPool* threadpool)
       : RPCState(stub, cq, method, request, response, std::move(done),
-                 call_opts, /*fail_fast=*/false, /*timeout_in_ms=*/0) {}
+                 call_opts, threadpool, /*fail_fast=*/false,
+                 /*timeout_in_ms=*/0) {}
 
   template <typename Request>
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const Request& request,
            Response* response, StatusCallback done, CallOptions* call_opts,
-           bool fail_fast, int64 timeout_in_ms)
-      : call_opts_(call_opts), done_(std::move(done)) {
+           thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms)
+      : call_opts_(call_opts), threadpool_(threadpool), done_(std::move(done)) {
     context_.set_fail_fast(fail_fast);
     if (timeout_in_ms > 0) {
       context_.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
@@ -77,11 +80,27 @@ class RPCState : public GrpcClientCQTag {
       // to Finish for client-side unary calls, ok should never be false
       s.Update(errors::Internal("unexpected ok value at rpc completion"));
     }
-    if (s.ok() && !GrpcMaybeParseProto(&response_buf_, response_)) {
-      s.Update(errors::Internal("could not parse rpc response"));
-    }
-    if (!s.ok()) {
+
+    if (s.ok()) {
+      if (threadpool_) {
+        // Run parse and callback in another thread, returning this
+        // one to service more RPCs.
+        threadpool_->Schedule([this]() { ParseAndCallDone(); });
+      } else {
+        ParseAndCallDone();
+        return;
+      }
+    } else {
       VLOG(2) << "Call returned with non-ok status: " << s;
+      done_(s);
+      delete this;
+    }
+  }
+
+  void ParseAndCallDone() {
+    Status s;
+    if (!GrpcMaybeParseProto(&response_buf_, response_)) {
+      s.Update(errors::Internal("could not parse rpc response"));
     }
     done_(s);
     delete this;
@@ -90,6 +109,7 @@ class RPCState : public GrpcClientCQTag {
  private:
   CallOptions* call_opts_;
   ::grpc::ClientContext context_;
+  thread::ThreadPool* threadpool_;
   std::unique_ptr<::grpc::GenericClientAsyncResponseReader> call_;
   Response* response_;
   ::grpc::ByteBuffer request_buf_;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index e1541db69bfc2471ff1241a0154f442c1fd5511c..60d5881d4ca75a7ea201d592d8668bce7438592e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -43,7 +43,17 @@ class GrpcWorkerCache : public WorkerCachePartial {
         local_worker_(local_worker),
         channel_cache_(channel_cache),
         threads_(kGrpcWorkerCacheThreadCount),
-        next_round_robin_assignment_(0) {}
+        next_round_robin_assignment_(0) {
+    // NOTE: We don't yet have any reason to assign NUMA affinity to this
+    // ThreadPool.  If there's only a single NIC it shouldn't make any
+    // difference since presumably it is handling memory from all nodes.
+    ThreadOptions options;
+    options.numa_node = port::kNUMANoAffinity;
+    const int kNumCallbackThreads = 10;
+    callback_threadpool_.reset(new thread::ThreadPool(
+        Env::Default(), options, "grpc_wcache_callback", kNumCallbackThreads,
+        false /*low_latency_hint*/, nullptr /*allocator*/));
+  }
 
   // Explicit destructor to control destruction order.
   ~GrpcWorkerCache() override {
@@ -67,7 +77,7 @@ class GrpcWorkerCache : public WorkerCachePartial {
       if (!channel) return nullptr;
       return NewGrpcRemoteWorker(
           channel, threads_[AssignWorkerToThread(target)].completion_queue(),
-          &logger_);
+          callback_threadpool_.get(), &logger_);
     }
   }
 
@@ -138,6 +148,8 @@ class GrpcWorkerCache : public WorkerCachePartial {
   WorkerCacheLogger logger_;
   std::vector<GrpcWorkerCacheThread> threads_;
 
+  std::unique_ptr<thread::ThreadPool> callback_threadpool_;
+
   mutex assignment_mu_;
   std::unordered_map<std::string, size_t> target_assignments_
       GUARDED_BY(assignment_mu_);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 1b6d796bd4331a2558572f91324abdabaec45356..de80992095d13fa38172b3a30c5fdd6c177994e1 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -418,8 +418,13 @@ class GrpcWorkerService : public AsyncServiceInterface {
 
 }  // namespace
 
-GrpcWorker::GrpcWorker(WorkerEnv* worker_env)
-    : Worker(worker_env), recent_request_ids_(100000) {}
+GrpcWorker::GrpcWorker(WorkerEnv* worker_env, const ConfigProto& config)
+    : Worker(worker_env),
+      recent_request_ids_(100000),
+      recv_buf_max_chunk_(
+          config.experimental().recv_buf_max_chunk() > 0
+              ? config.experimental().recv_buf_max_chunk()
+              : (config.experimental().recv_buf_max_chunk() < 0 ? 0 : 4096)) {}
 
 // GrpcRecvTensorAsync: unlike the other Worker methods, which use protocol
 // buffers for a response object, to avoid extra protocol buffer serialization
@@ -505,6 +510,33 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
       });
 }
 
+namespace {
+// If RecvBufRespExtra.tensor_content is a single large string, then gRPC
+// can stall on the recv side when the string buffer needs to be enlarged,
+// since the size is not sent in advance.  Changing this field to a sequence
+// of small strings costs some extra time on the send side, since we do
+// some otherwise unnecessary copies, but it improves runtime overall by
+// improving flow control.  Best performance is likely achieved with a
+// max_chunk_bytes equal to the memory page size.
+//
+// TODO(tucker): When proto3 supports [ctype=CORD] then change
+// RecvBufRespExtra.tensor_content to a cord instead of a repeated string,
+// and remove this function.
+void SetTensorInRecvBufResp(int64 max_chunk_bytes, const Tensor* tensor,
+                            int64 num_bytes, RecvBufResponse* response) {
+  RecvBufRespExtra extra;
+  const char* head = reinterpret_cast<const char*>(DMAHelper::base(tensor));
+  while (num_bytes > 0) {
+    int64 bytes =
+        max_chunk_bytes > 0 ? std::min(num_bytes, max_chunk_bytes) : num_bytes;
+    extra.add_tensor_content(std::string(head, bytes));
+    head += bytes;
+    num_bytes -= bytes;
+  }
+  response->mutable_transport_options()->PackFrom(extra);
+}
+}  // namespace
+
 void GrpcWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
                               RecvBufResponse* response, StatusCallback done) {
   // This is a generic, low performance implementation appropriate for grpc.
@@ -551,11 +583,8 @@ void GrpcWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
                   [this, num_bytes, response, done, hook,
                    cpu_tensor](const Status& s) {
                     if (s.ok()) {
-                      RecvBufRespExtra extra;
-                      extra.set_tensor_content(reinterpret_cast<const char*>(
-                                                   DMAHelper::base(cpu_tensor)),
-                                               num_bytes);
-                      response->mutable_transport_options()->PackFrom(extra);
+                      SetTensorInRecvBufResp(recv_buf_max_chunk_, cpu_tensor,
+                                             num_bytes, response);
                     }
                     response->set_send_start_micros(env_->env->NowMicros());
                     done(s);
@@ -566,11 +595,8 @@ void GrpcWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
             }
           } else {
             // Tensor is on CPU.
-            RecvBufRespExtra extra;
-            extra.set_tensor_content(reinterpret_cast<const char*>(
-                                         DMAHelper::base(hook->prod_value)),
-                                     num_bytes);
-            response->mutable_transport_options()->PackFrom(extra);
+            SetTensorInRecvBufResp(recv_buf_max_chunk_, hook->prod_value,
+                                   num_bytes, response);
           }
         }
         response->set_send_start_micros(env_->env->NowMicros());
@@ -608,8 +634,9 @@ void GrpcWorker::LoggingAsync(const LoggingRequest* request,
 
 WorkerEnv* GrpcWorker::env() { return env_; }
 
-std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env) {
-  return std::unique_ptr<GrpcWorker>(new GrpcWorker(env));
+std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env,
+                                          const ConfigProto& config) {
+  return std::unique_ptr<GrpcWorker>(new GrpcWorker(env, config));
 }
 
 std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index d9e48524dea0f265a7ee4b9a16ee12fd007d17ff..996617d385d1c0e397c30eeceb4f737690fb9490 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -27,12 +27,13 @@ class ServerBuilder;
 namespace tensorflow {
 
 class AsyncServiceInterface;
+class ConfigProto;
 struct WorkerEnv;
 struct WorkerSession;
 
 class GrpcWorker : public Worker {
  public:
-  GrpcWorker(WorkerEnv* env);
+  GrpcWorker(WorkerEnv* env, const ConfigProto& config);
 
   // Specialized version of RecvTensor for gRPC, which avoids a copy.
   virtual void GrpcRecvTensorAsync(CallOptions* opts,
@@ -50,9 +51,11 @@ class GrpcWorker : public Worker {
 
  private:
   RecentRequestIds recent_request_ids_;
+  const int32 recv_buf_max_chunk_;
 };
 
-std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* worker_env);
+std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* worker_env,
+                                          const ConfigProto& config);
 
 // Returns an implementation of WorkerService rpc service.
 std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index b8cb5385038ed2c01d15cb5a571cd2d5ec6505c8..9fb920404f987d6b5b324cce4155da40c7e753b4 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -244,6 +244,15 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
   // Record "call" in active_ so that it can be aborted cleanly.
   RegisterCall(call);
 
+  // RendezvousMgr already aborted, shouldn't send RPC call any more
+  if (!call->status().ok()) {
+    call->done()(call->status(), Args(), Args(), Tensor(), false);
+    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
+    call->wi_ = nullptr;
+    get_call_freelist()->Release(call, session()->worker_cache.get());
+    return;
+  }
+
   // Start "call".
   Ref();
   call->Start([this, call]() {
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc
index 45b989f6e226761b8b1af068f6a60796d3b4d3c4..054bed7781b8f43e2294c8b8c00ecd119aa20f76 100644
--- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc
@@ -49,7 +49,8 @@ CollectiveExecutor* RpcCollectiveExecutorMgr::Create(int64 step_id) {
   CollectiveRemoteAccessDistributed* rma =
       new CollectiveRemoteAccessDistributed(dev_mgr_, dev_resolver_.get(),
                                             worker_cache_, step_id);
-  return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_);
+  return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_,
+                                    &gpu_ring_order_);
 }
 
 namespace {
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
index 0323300fdde0734d3da216ed69958556b27a49b5..1c87fe9d928f65008d0a87af58873bffb5f9aa18 100644
--- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
@@ -42,8 +42,9 @@ class RpcCollectiveExecutorMgrTest : public ::testing::Test {
     WorkerCacheInterface* worker_cache = nullptr;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     std::unique_ptr<DeviceResolverDistributed> dr(new DeviceResolverDistributed(
         device_mgr_.get(), worker_cache, task_name));
     std::unique_ptr<CollectiveParamResolverDistributed> cpr(
@@ -57,7 +58,6 @@ class RpcCollectiveExecutorMgrTest : public ::testing::Test {
   }
 
   std::unique_ptr<RpcCollectiveExecutorMgr> cme_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
 };
 
diff --git a/tensorflow/core/distributed_runtime/server_lib.cc b/tensorflow/core/distributed_runtime/server_lib.cc
index 7d308bb723a71e23482b6f52fa6d8fa53f89dda8..fe9369e884b8e24b31622b82487712ae6f96a6dd 100644
--- a/tensorflow/core/distributed_runtime/server_lib.cc
+++ b/tensorflow/core/distributed_runtime/server_lib.cc
@@ -49,16 +49,22 @@ void ServerFactory::Register(const string& server_type,
 Status ServerFactory::GetFactory(const ServerDef& server_def,
                                  ServerFactory** out_factory) {
   mutex_lock l(*get_server_factory_lock());
-  // TODO(mrry): Improve the error reporting here.
   for (const auto& server_factory : *server_factories()) {
     if (server_factory.second->AcceptsOptions(server_def)) {
       *out_factory = server_factory.second;
       return Status::OK();
     }
   }
+
+  std::vector<string> server_names;
+  for (const auto& server_factory : *server_factories()) {
+    server_names.push_back(server_factory.first);
+  }
+
   return errors::NotFound(
       "No server factory registered for the given ServerDef: ",
-      server_def.DebugString());
+      server_def.DebugString(), "\nThe available server factories are: [ ",
+      str_util::Join(server_names, ", "), " ]");
 }
 
 // Creates a server based on the given `server_def`, and stores it in
diff --git a/tensorflow/core/distributed_runtime/server_lib_test.cc b/tensorflow/core/distributed_runtime/server_lib_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..460372523c98c4e5a1e83be7a025e5911e9b4a8c
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/server_lib_test.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class TestServerFactory : public ServerFactory {
+ public:
+  bool AcceptsOptions(const ServerDef& server_def) override {
+    return server_def.protocol() == "test_protocol";
+  }
+
+  Status NewServer(const ServerDef& server_def,
+                   std::unique_ptr<ServerInterface>* out_server) override {
+    return Status::OK();
+  }
+};
+
+TEST(ServerLibTest, NewServerFactoryAccepts) {
+  ServerFactory::Register("TEST_SERVER", new TestServerFactory());
+  ServerDef server_def;
+  server_def.set_protocol("test_protocol");
+  std::unique_ptr<ServerInterface> server;
+  TF_EXPECT_OK(NewServer(server_def, &server));
+}
+
+TEST(ServerLibTest, NewServerNoFactoriesAccept) {
+  ServerDef server_def;
+  server_def.set_protocol("fake_protocol");
+  std::unique_ptr<ServerInterface> server;
+  Status s = NewServer(server_def, &server);
+  ASSERT_NE(s, Status::OK());
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No server factory registered for the given ServerDef"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "The available server factories are: ["));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 95b31c6991f6344c1b15b1fd28225aef37359818..29fe767e42a8dcec873f2a03dfe3247841da38c1 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -78,13 +78,13 @@ Status SessionMgr::CreateSession(const string& session,
 
   if (isolate_session_state) {
     // Create a private copy of the DeviceMgr for the WorkerSession.
-    std::vector<Device*> renamed_devices;
+    std::vector<std::unique_ptr<Device>> renamed_devices;
     for (Device* d : worker_env_->local_devices) {
       renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
           worker_name, d, false, isolate_session_state));
     }
 
-    auto device_mgr = MakeUnique<DeviceMgr>(renamed_devices);
+    auto device_mgr = MakeUnique<DeviceMgr>(std::move(renamed_devices));
     auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, device_mgr.get());
     worker_session.reset(
         new WorkerSession(session, worker_name,
@@ -122,7 +122,9 @@ Status SessionMgr::WorkerSessionForSessionLocked(
     auto it = sessions_.find(session_handle);
     if (it == sessions_.end()) {
       return errors::Aborted("Session handle is not found: ", session_handle,
-                             ". Possibly this worker just restarted.");
+                             ". Possibly this worker (\"",
+                             legacy_session_->worker_name,
+                             "\") just restarted.");
     } else {
       *out_session = it->second;
     }
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 99192119a63e2553bc107eff3f79a436c455b9e3..1ab0d20f0b53798ea63e69d25f41c47bcaef17d4 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -46,11 +46,9 @@ class SessionMgrTest : public ::testing::Test {
   SessionMgrTest()
       : mgr_(&env_, "/job:mnist/replica:0/task:0",
              std::unique_ptr<WorkerCacheInterface>(), factory_) {
-    Device* device =
-        FakeDevice::MakeCPU("/job:mnist/replica:0/task:0/device:fakecpu:0")
-            .release();
-    env_.local_devices = {device};
-    device_mgr_.reset(new DeviceMgr(env_.local_devices));
+    device_mgr_ = absl::make_unique<DeviceMgr>(
+        FakeDevice::MakeCPU("/job:mnist/replica:0/task:0/device:fakecpu:0"));
+    env_.local_devices = device_mgr_->ListDevices();
     env_.device_mgr = device_mgr_.get();
   }
 
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 1ea19c48f09170e6044eb9c72b5090dfc2feb703..f42143e5824827e35a97ac25cb80b0e2c82e716e 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/distributed_runtime/worker_session.h"
+#include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/tracing.h"
 
 namespace tensorflow {
@@ -179,7 +180,28 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
       request->exec_opts().record_timeline() ||
       request->exec_opts().record_costs()) {
     collector = new StepStatsCollector(response->mutable_step_stats());
-    // TODO(mrry,pbar): GPU tracing for distributed steps.
+  }
+  DeviceTracer* tracer = nullptr;
+  if (collector && request->exec_opts().record_timeline()) {
+    // If timeline was requested, assume we want hardware level tracing.
+    std::unique_ptr<DeviceTracer> trptr = CreateDeviceTracer();
+    if (trptr) {
+      tracer = trptr.release();
+      Status s = tracer->Start();
+      if (!s.ok()) {
+        delete tracer;
+        if (errors::IsUnavailable(s)) {
+          LOG(WARNING)
+              << "Hardware tracing unavailable, continuing without it. " << s;
+          tracer = nullptr;
+        } else {
+          delete collector;
+          delete out;
+          done(s);
+          return;
+        }
+      }
+    }
   }
   CancellationManager* cm = new CancellationManager;
   opts->SetCancelCallback([this, cm, step_id]() {
@@ -194,6 +216,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
     opts->ClearCancelCallback();
     delete cm;
     delete collector;
+    delete tracer;
     delete out;
     done(errors::Aborted("Call was aborted"));
     return;
@@ -201,8 +224,8 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
   session->graph_mgr->ExecuteAsync(
       request->graph_handle(), step_id, session.get(), request->exec_opts(),
       collector, response, cm, in,
-      [this, step_id, response, session, cm, out, token, collector, opts,
-       done](Status s) {
+      [this, step_id, response, session, cm, out, token, collector, tracer,
+       opts, done](Status s) {
         if (s.ok()) {
           s = session->graph_mgr->RecvOutputs(step_id, out);
         }
@@ -210,6 +233,15 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
         cancellation_manager_.DeregisterCallback(token);
         delete cm;
 
+        if (tracer) {
+          Status tracer_status = tracer->Stop();
+          if (tracer_status.ok()) {
+            tracer_status = tracer->Collect(collector);
+          }
+          if (!tracer_status.ok()) {
+            LOG(ERROR) << "Bad status from tracer: " << tracer_status;
+          }
+        }
         if (s.ok()) {
           for (const auto& p : *out) {
             const string& key = p.first;
@@ -219,6 +251,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
         }
         if (collector) collector->Finalize();
         delete collector;
+        delete tracer;
         delete out;
         done(s);
       });
@@ -405,7 +438,9 @@ Status Worker::PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
     return errors::Aborted(
         "RecvTensor expects a different device incarnation: ",
         parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
-        ". Your worker job was probably restarted. Check your "
+        ". Your worker job (\"",
+        env_->session_mgr->LegacySession()->worker_name,
+        "\") was probably restarted. Check your "
         "worker job for the reason why it was restarted.");
   }
 
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
index 95ca3c3b4d11fac0d103eb52f19d5b0b2f4ad3ea..e0a1734087061c4c736ff93918fd82945b3742c1 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
@@ -101,13 +101,18 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs,
                                            const string& transfer_method_name) {
   NodeExecStats* ns = new NodeExecStats;
   ns->set_node_name(transfer_method_name);
+  int64 elapsed_usecs = end_usecs - start_usecs;
   if (details.empty()) {
     auto byte_string = strings::StrCat("[", bytes, "B] ");
     if (bytes >= 0.1 * 1048576.0) {
       byte_string = strings::Printf("[%.1fMB] ", bytes / 1048576.0);
     }
-    auto label = strings::StrCat(byte_string, tensor_name, " from ", src_device,
-                                 " to ", dst_device);
+    float mbs_rate = (8.0 * static_cast<float>(bytes)) / elapsed_usecs;
+    auto rate_string = (mbs_rate >= 1000.0)
+                           ? strings::Printf("[%.1fGb/s] ", mbs_rate / 1000.0)
+                           : strings::Printf("[%fMb/s] ", mbs_rate);
+    auto label = strings::StrCat(byte_string, rate_string, tensor_name,
+                                 " from ", src_device, " to ", dst_device);
     ns->set_timeline_label(label);
   } else {
     ns->set_timeline_label(details);
@@ -115,13 +120,10 @@ void WorkerCacheLogger::RecordDataTransfer(int64 step_id, int64 start_usecs,
 
   ns->set_all_start_micros(start_usecs);
   ns->set_op_start_rel_micros(0);
-  int64 elapsed = end_usecs - start_usecs;
-  ns->set_op_end_rel_micros(elapsed);
-  ns->set_all_end_rel_micros(elapsed);
+  ns->set_op_end_rel_micros(elapsed_usecs);
+  ns->set_all_end_rel_micros(elapsed_usecs);
   NodeOutput* no = ns->add_output();
   no->set_slot(0);
-  // TODO(tucker): Maybe set the dimensions too, but then they'll
-  // need to be passed in.
   no->mutable_tensor_description()
       ->mutable_allocation_description()
       ->set_requested_bytes(bytes);
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 84cee5569c4ac2c0083e4d4970b48460d9bd95ca..89c49a2ad050bfe067e9557aabd2916fba812fb0 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -96,9 +96,11 @@ static int64_t TotalAllocationWarningBytes() {
 void EnableCPUAllocatorStats(bool enable) {
   cpu_allocator_collect_stats = enable;
 }
+bool CPUAllocatorStatsEnabled() { return cpu_allocator_collect_stats; }
 void EnableCPUAllocatorFullStats(bool enable) {
   cpu_allocator_collect_full_stats = enable;
 }
+bool CPUAllocatorFullStatsEnabled() { return cpu_allocator_collect_full_stats; }
 
 namespace {
 // A default Allocator for CPU devices.  ProcessState::GetCPUAllocator() will
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 8c23604625ba77a4ca4fa42f96059735ed525f5d..531ea73e89277c83cfede50fce0de08b65c5e5a5 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -383,10 +383,12 @@ Allocator* cpu_allocator();
 // If 'enable' is true, the default CPU allocator implementation will collect
 // AllocatorStats. By default, it's disabled.
 void EnableCPUAllocatorStats(bool enable);
+bool CPUAllocatorStatsEnabled();
 
 // If 'enable' is true, the default CPU allocator implementation will collect
 // full statistics. By default, it's disabled.
 void EnableCPUAllocatorFullStats(bool enable);
+bool CPUAllocatorFullStatsEnabled();
 
 // An object that does the underlying suballoc/free of memory for a higher-level
 // allocator.  The expectation is that the higher-level allocator is doing some
diff --git a/tensorflow/core/framework/api_def.proto b/tensorflow/core/framework/api_def.proto
index f8553cf5bbb690a664513c783795e75a4625e5f9..b0f852170b159a38f3e7f8c8c0ff18cfffa068af 100644
--- a/tensorflow/core/framework/api_def.proto
+++ b/tensorflow/core/framework/api_def.proto
@@ -34,6 +34,10 @@ message ApiDef {
   // that should be logged when this op is used.
   // The message should indicate alternative op to use, if any.
   string deprecation_message = 12;
+  // Major version when the op will be deleted. For e.g. set this
+  // value to 2 if op API should be removed in TensorFlow 2.0 and
+  // deprecated in versions before that.
+  int32 deprecation_version = 13;
 
   enum Visibility {
     // Normally this is "VISIBLE" unless you are inheriting a
@@ -64,6 +68,11 @@ message ApiDef {
     // to use a non-deprecated endpoint instead will be printed. If all
     // endpoints are deprecated, set deprecation_message in ApiDef instead.
     bool deprecated = 3;
+
+    // Major version when an endpoint will be deleted. For e.g. set this
+    // value to 2 if endpoint should be removed in TensorFlow 2.0 and
+    // deprecated in versions before that.
+    int32 deprecation_version = 4;
   }
   repeated Endpoint endpoint = 3;
 
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index 0a1b5e1975580984c8f245f0889d0cb00ef4dba6..ce97085494175e57b41215779b32234c1c1d5f3c 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -15,14 +15,28 @@ limitations under the License.
 
 #include "tensorflow/core/framework/bfloat16.h"
 
+#include "absl/base/casts.h"
 #include "tensorflow/core/framework/numeric_types.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 namespace {
 
+TEST(Bfloat16Test, DefaultValueIsZero) {
+  EXPECT_EQ(0.0f, static_cast<float>(bfloat16()));
+}
+
+TEST(Bfloat16Test, RepresentableFloatsRoundTripViaBfloat16) {
+  const std::vector<float> values = {
+      -std::numeric_limits<float>::infinity(), -1.0, -0.5, -0.0, 0.0, 0.5, 1.0,
+      std::numeric_limits<float>::infinity(),
+  };
+  for (float v : values) {
+    EXPECT_EQ(v, static_cast<float>(static_cast<bfloat16>(v)));
+  }
+}
+
 TEST(Bfloat16Test, Simple) {
   bfloat16 a(12);
   // Floating point representation of 12: 0x41400000
@@ -31,8 +45,8 @@ TEST(Bfloat16Test, Simple) {
 
 float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
                     uint32_t low_mantissa) {
-  return bit_cast<float>((sign << 31) + (exponent << 23) +
-                         (high_mantissa << 16) + low_mantissa);
+  return absl::bit_cast<float>((sign << 31) + (exponent << 23) +
+                               (high_mantissa << 16) + low_mantissa);
 }
 
 struct Bfloat16TestParam {
diff --git a/tensorflow/core/framework/bounds_check.h b/tensorflow/core/framework/bounds_check.h
new file mode 100644
index 0000000000000000000000000000000000000000..6fd071c0b9a395a90bd2dd7bfaefdb728a951a0b
--- /dev/null
+++ b/tensorflow/core/framework/bounds_check.h
@@ -0,0 +1,54 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_BOUNDS_CHECK_H_
+#define TENSORFLOW_CORE_FRAMEWORK_BOUNDS_CHECK_H_
+
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+
+// Check that 0 <= index < limit using a single comparison, assuming
+// that 0 <= limit if Index is signed.  Intended for use in performance
+// critical contexts where 0 <= index < limit is almost always true.
+template <typename Ta, typename Tb>
+EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool FastBoundsCheck(const Ta index,
+                                                           const Tb limit) {
+  static_assert(std::is_integral<Ta>::value && std::is_integral<Tb>::value,
+                "FastBoundsCheck can only be used on integer types.");
+  typedef typename std::make_unsigned<decltype(index + limit)>::type UIndex;
+  return TF_PREDICT_TRUE(static_cast<UIndex>(index) <
+                         static_cast<UIndex>(limit));
+}
+
+namespace internal {
+// Ensure that the compiler cannot elide a copy into a local, for
+// bounds checking on source tensors that might be updated asynchronously.
+// This function may only be used on primitive integral types (int32, int64,
+// etc).  It does not guarantee any atomicity or barriers.
+template <typename T>
+EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
+  static_assert(std::is_integral<T>::value,
+                "SubtleMustCopy can only be used on integer types.");
+  auto *to_x = reinterpret_cast<const volatile T *>(&x);
+  return *to_x;
+}
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_BOUNDS_CHECK_H_
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
index 4cb277d5a886a4d1b5560b7c18a6ff1f429502f5..7fa58347f258acf327e112f4c9cd58c37134ceee 100644
--- a/tensorflow/core/framework/collective.cc
+++ b/tensorflow/core/framework/collective.cc
@@ -64,6 +64,7 @@ CollInstanceParams& CollInstanceParams::operator=(
     device_names.assign(other.device_names.begin(), other.device_names.end());
     task_names.assign(other.task_names.begin(), other.task_names.end());
     same_num_devices_per_task = other.same_num_devices_per_task;
+    gpu_ring_order = other.gpu_ring_order;
     impl_details.subdiv_offsets.assign(
         other.impl_details.subdiv_offsets.begin(),
         other.impl_details.subdiv_offsets.end());
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index e35edb09d0c1cab98202b45c4cd52d256bcc963b..0321429702af74dfb18ca631b0314c705150ec06 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -85,6 +85,9 @@ struct CollInstanceParams {
   std::vector<string> task_names;
   // True if every task has the same number of devices.
   bool same_num_devices_per_task = false;
+  // If passed in to GPUOptions in ConfigProto, defines a good ring order for
+  // GPUs.  Assumes same GPU configuration at each worker.
+  string gpu_ring_order = "";
   CollImplDetails impl_details;
   string ToString() const;
   CollInstanceParams& operator=(const struct CollInstanceParams& other);
@@ -259,7 +262,9 @@ class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
   virtual void CompleteParamsAsync(const string& device, CollectiveParams* cp,
                                    CancellationManager* cancel_mgr,
                                    StatusCallback done) {
-    cem_->GetParamResolver()->CompleteParamsAsync(device, cp, cancel_mgr, done);
+    done(errors::Internal(
+        "A collective Op has been called in a context in which "
+        "a CollectiveExecutor has not been provided."));
   }
 
   virtual PerStepCollectiveRemoteAccess* remote_access() { return nullptr; }
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 50403b4004d77f94759d183a7657426583f35cb4..bf2d902af41c690be25a170da6fc22a4902e2d50 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -228,12 +228,12 @@ Status BiasAddShape(shape_inference::InferenceContext* c) {
   if (s.ok() && data_format == "NCHW") {
     // Merge the length of bias_shape into the third to last dimension
     ShapeHandle first;
-    TF_RETURN_IF_ERROR(c->Subshape(input_shape, 0, -3, &first));
+    TF_RETURN_IF_ERROR(c->Subshape(input_shape, 0, 1, &first));
 
     ShapeHandle last;
-    TF_RETURN_IF_ERROR(c->Subshape(input_shape, -2, &last));
+    TF_RETURN_IF_ERROR(c->Subshape(input_shape, 2, &last));
 
-    DimensionHandle input_bias_dim = c->Dim(input_shape, -3);
+    DimensionHandle input_bias_dim = c->Dim(input_shape, 1);
     DimensionHandle merged_bias_dim;
     TF_RETURN_IF_ERROR(c->Merge(input_bias_dim, bias_dim, &merged_bias_dim));
     ShapeHandle merged_bias = c->Vector(merged_bias_dim);
@@ -266,7 +266,7 @@ Status BiasAddGradShape(shape_inference::InferenceContext* c) {
 
   if (s.ok() && data_format == "NCHW") {
     TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 3, &input_shape));
-    c->set_output(0, c->Vector(c->Dim(input_shape, -3)));
+    c->set_output(0, c->Vector(c->Dim(input_shape, 1)));
   } else {
     TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input_shape));
     c->set_output(0, c->Vector(c->Dim(input_shape, -1)));
@@ -1059,7 +1059,7 @@ Status UnknownShape(shape_inference::InferenceContext* c) {
 template <typename T>
 Status ReductionShapeHelper(const Tensor* reduction_indices_t,
                             const int32 input_rank,
-                            std::set<int64>& true_indices) {
+                            std::set<int64>* true_indices) {
   auto reduction_indices = reduction_indices_t->flat<T>();
   for (int i = 0; i < reduction_indices_t->NumElements(); ++i) {
     const T reduction_index = reduction_indices(i);
@@ -1074,7 +1074,7 @@ Status ReductionShapeHelper(const Tensor* reduction_indices_t,
       wrapped_index += input_rank;
     }
 
-    true_indices.insert(wrapped_index);
+    true_indices->insert(wrapped_index);
   }
   return Status::OK();
 }
@@ -1112,10 +1112,10 @@ Status ReductionShape(InferenceContext* c) {
   std::set<int64> true_indices;
   if (reduction_indices_t->dtype() == DataType::DT_INT32) {
     TF_RETURN_IF_ERROR(ReductionShapeHelper<int32>(reduction_indices_t,
-                                                   input_rank, true_indices));
+                                                   input_rank, &true_indices));
   } else if (reduction_indices_t->dtype() == DataType::DT_INT64) {
     TF_RETURN_IF_ERROR(ReductionShapeHelper<int64>(reduction_indices_t,
-                                                   input_rank, true_indices));
+                                                   input_rank, &true_indices));
   } else {
     return errors::InvalidArgument(
         "reduction_indices can only be int32 or int64");
@@ -1457,7 +1457,11 @@ Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
 Status ScatterNdUpdateShape(InferenceContext* c) {
   ShapeHandle input_shape = c->input(0);
   if (c->input_handle_shapes_and_types(0) != nullptr) {
-    input_shape = (*c->input_handle_shapes_and_types(0))[0].shape;
+    // This is called for tf.scatter_nd_update; input is a Variable handle.
+    const auto& shape_and_type = *(c->input_handle_shapes_and_types(0));
+    if (shape_and_type.size() == 1) {
+      input_shape = shape_and_type[0].shape;
+    }
   }
   ShapeHandle indices_shape;
   TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &indices_shape));
@@ -1514,7 +1518,8 @@ Status ScatterNdUpdateShape(InferenceContext* c) {
     }
   }
 
-  if (c->input_handle_shapes_and_types(0) == nullptr) {
+  if (c->input_handle_shapes_and_types(0) == nullptr && c->num_outputs() > 0) {
+    // This is called for tf.scatter_nd; output is a tensor with this shape.
     c->set_output(0, input_shape);
   }
   return Status::OK();
@@ -1544,6 +1549,51 @@ Status ExplicitShapes(InferenceContext* c) {
   return Status::OK();
 }
 
+Status SparseReduceShapeFn(InferenceContext* c) {
+  // Input 0: input_indices
+  // Input 1: input_values
+  // Input 2: input_shape
+  // Input 3: reduction_axes
+  // Attr: keep_dims
+  bool keep_dims = false;
+  TF_RETURN_IF_ERROR(c->GetAttr("keep_dims", &keep_dims));
+
+  const Tensor* shape_tensor = c->input_tensor(2);
+  const Tensor* axes_tensor = c->input_tensor(3);
+  if (shape_tensor != nullptr && axes_tensor != nullptr) {
+    auto shape_vec = shape_tensor->flat<int64>();
+    auto axes_vec = axes_tensor->flat<int32>();
+
+    int64 ndims = shape_vec.size();
+    std::unordered_set<int64> axes;
+    for (int i = 0; i < axes_vec.size(); i++) {
+      axes.insert((axes_vec(i) + ndims) % ndims);
+    }
+
+    std::vector<DimensionHandle> dims;
+    if (keep_dims) {
+      dims.reserve(ndims);
+      for (int d = 0; d < ndims; ++d) {
+        if (axes.find(d) == axes.end()) {
+          dims.push_back(c->MakeDim(shape_vec(d)));
+        } else {
+          dims.push_back(c->MakeDim(1));
+        }
+      }
+    } else {
+      for (int d = 0; d < ndims; ++d) {
+        if (axes.find(d) == axes.end()) {
+          dims.push_back(c->MakeDim(shape_vec(d)));
+        }
+      }
+    }
+
+    c->set_output(0, c->MakeShape(dims));
+    return Status::OK();
+  }
+  return UnknownShape(c);
+}
+
 }  // namespace shape_inference
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 3a496e06aeb5c28d4e7c8ea306151fee16d3eba4..362899b947b1fd479d227ac5421a5f458405f3c6 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -310,6 +310,9 @@ Status ExplicitShape(InferenceContext* c);
 // Shape function for multiple-output ops with an explicit "shapes" attribute.
 Status ExplicitShapes(InferenceContext* c);
 
+// Shape function for SparseReduceMax and SparseReduceSum.
+Status SparseReduceShapeFn(InferenceContext* c);
+
 }  // namespace shape_inference
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 919e0967c03a24eda9a22e931c2708a412159420..7c395679d304ffab1dfeff6804eede0d09b63734 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -278,9 +278,7 @@ TEST(CommonShapeFnsTest, BiasAddShapeTest) {
                     .Finalize(&def));
     InferenceContext c(TF_GRAPH_DEF_VERSION, &def, op_def,
                        {S({8, 6, 4, 2, 3, 4, 5}), S({3})}, {}, {}, {});
-    TF_EXPECT_OK(BiasAddShape(&c));
-    ShapeHandle output = c.output(0);
-    EXPECT_EQ("[8,6,4,2,3,4,5]", c.DebugString(output));
+    EXPECT_FALSE(BiasAddShape(&c).ok());
   }
 
   {
@@ -291,7 +289,7 @@ TEST(CommonShapeFnsTest, BiasAddShapeTest) {
                     .Attr("data_format", "NCHW")
                     .Finalize(&def));
     InferenceContext c(TF_GRAPH_DEF_VERSION, &def, op_def,
-                       {S({10, 11, 12}), S({10})}, {}, {}, {});
+                       {S({10, 11, 12}), S({11})}, {}, {}, {});
     TF_EXPECT_OK(BiasAddShape(&c));
     ShapeHandle output = c.output(0);
     EXPECT_EQ("[10,11,12]", c.DebugString(output));
@@ -371,7 +369,7 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) {
                        {S({8, 6, 4, 2, 3, 4, 5})}, {}, {}, {});
     TF_EXPECT_OK(BiasAddGradShape(&c));
     ShapeHandle output = c.output(0);
-    EXPECT_EQ(3, c.Value(c.Dim(output, 0)));
+    EXPECT_EQ(6, c.Value(c.Dim(output, 0)));
   }
 
   {
@@ -384,7 +382,7 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) {
                        {}, {}, {});
     TF_EXPECT_OK(BiasAddGradShape(&c));
     ShapeHandle output = c.output(0);
-    EXPECT_EQ(10, c.Value(c.Dim(output, 0)));
+    EXPECT_EQ(11, c.Value(c.Dim(output, 0)));
   }
 
   {
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index 284dafb886e6dcdb55da7496b048718cfb190862..6e214332710c9f2e854db99ec588424c8df81145 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -13,10 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
+#include <unordered_map>
 
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 namespace data {
@@ -71,6 +76,113 @@ class DatasetVariantWrapper {
   DatasetBase* const dataset_;  // Owns one reference.
 };
 
+const char kWrappedDatasetVariantTypeName[] =
+    "tensorflow::data::WrappedDatasetVariant";
+
+class WrappedDatasetVariantWrapper {
+ public:
+  WrappedDatasetVariantWrapper() {}
+
+  explicit WrappedDatasetVariantWrapper(const Tensor& ds_tensor)
+      : ds_tensor_(ds_tensor) {}
+
+  Tensor get() const { return ds_tensor_; }
+
+  string TypeName() const { return "tensorflow::WrappedDatasetVariantWrapper"; }
+
+  string DebugString() const {
+    return "tensorflow::WrappedDatasetVariantWrapper::DebugString";
+  }
+
+  void Encode(VariantTensorData* data) const {
+    *(data->add_tensors()) = ds_tensor_;
+  }
+
+  bool Decode(const VariantTensorData& data) {
+    ds_tensor_ = data.tensors(0);
+    return true;
+  }
+
+ private:
+  Tensor ds_tensor_;
+};
+
+class WrapDatasetVariantOp : public OpKernel {
+ public:
+  explicit WrapDatasetVariantOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& tensor = ctx->input(0);
+    OP_REQUIRES(ctx,
+                tensor.dtype() == DT_VARIANT &&
+                    TensorShapeUtils::IsScalar(tensor.shape()),
+                errors::InvalidArgument(
+                    "Dataset tensor must be a scalar of dtype DT_VARIANT."));
+    DatasetBase* unused;
+    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(tensor, &unused));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    output->scalar<Variant>()() = WrappedDatasetVariantWrapper(tensor);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("WrapDatasetVariant").Device(DEVICE_CPU),
+                        WrapDatasetVariantOp);
+REGISTER_KERNEL_BUILDER(Name("WrapDatasetVariant")
+                            .HostMemory("input_handle")
+                            .HostMemory("output_handle")
+                            .Device(DEVICE_GPU),
+                        WrapDatasetVariantOp);
+
+class UnwrapDatasetVariantOp : public OpKernel {
+ public:
+  explicit UnwrapDatasetVariantOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& tensor = ctx->input(0);
+    OP_REQUIRES(ctx,
+                tensor.dtype() == DT_VARIANT &&
+                    TensorShapeUtils::IsScalar(tensor.shape()),
+                errors::InvalidArgument(
+                    "Dataset tensor must be a scalar of dtype DT_VARIANT."));
+    Variant variant = tensor.scalar<Variant>()();
+    const WrappedDatasetVariantWrapper* wrapper =
+        variant.get<WrappedDatasetVariantWrapper>();
+    OP_REQUIRES(ctx, wrapper != nullptr,
+                errors::InvalidArgument(
+                    "Tensor must be a WrappedDataset variant object."));
+    Tensor ds_tensor = wrapper->get();
+    OP_REQUIRES_OK(ctx, ctx->set_output("output_handle", ds_tensor));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnwrapDatasetVariant").Device(DEVICE_CPU),
+                        UnwrapDatasetVariantOp);
+REGISTER_KERNEL_BUILDER(Name("UnwrapDatasetVariant")
+                            .HostMemory("input_handle")
+                            .HostMemory("output_handle")
+                            .Device(DEVICE_GPU),
+                        UnwrapDatasetVariantOp);
+
+static Status WrappedDatasetVariantDeviceCopy(
+    const WrappedDatasetVariantWrapper& from, WrappedDatasetVariantWrapper* to,
+    const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) {
+  *to = WrappedDatasetVariantWrapper(from);
+  return Status::OK();
+}
+
+#define REGISTER_OPTIONAL_COPY(DIRECTION)               \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      WrappedDatasetVariantWrapper, DIRECTION,          \
+      WrappedDatasetVariantDeviceCopy)
+
+REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::HOST_TO_DEVICE);
+REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::DEVICE_TO_HOST);
+REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::DEVICE_TO_DEVICE);
+
+REGISTER_UNARY_VARIANT_DECODE_FUNCTION(WrappedDatasetVariantWrapper,
+                                       kWrappedDatasetVariantTypeName);
+
 }  // namespace
 
 Status GraphDefBuilderWrapper::AddDataset(
@@ -140,7 +252,7 @@ Status GraphDefBuilderWrapper::AddFunction(SerializationContext* ctx,
             << " the graph. It will not be added again.";
     return Status::OK();
   }
-  if (!ctx->allow_stateful_functions()) {
+  if (!ctx->optimization_only()) {
     TF_RETURN_IF_ERROR(
         EnsureFunctionIsStateless(ctx->flib_def(), function_name));
   }
@@ -203,28 +315,23 @@ bool GraphDefBuilderWrapper::HasAttr(const string& name,
   return HasAttr(op_def, attr_name);
 }
 
-Status DatasetBase::Save(SerializationContext* ctx,
-                         IteratorStateWriter* writer) const {
-  string serialized_graph_def;
-  string output_node;
-  GraphDefBuilder b;
-  DatasetGraphDefBuilder db(&b);
-  Node* node = nullptr;
-  TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
-  output_node = node->name();
-  GraphDef graph_def;
-  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
-  graph_def.SerializeToString(&serialized_graph_def);
-  TF_RETURN_IF_ERROR(
-      writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
-  TF_RETURN_IF_ERROR(
-      writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
-  return Status::OK();
+int64 GetAllocatedBytes(const std::vector<Tensor>& element) {
+  int64 allocated_bytes = 0;
+  DatasetBase* dataset;
+  for (auto& tensor : element) {
+    if (tensor.dtype() == DT_VARIANT &&
+        GetDatasetFromVariantTensor(tensor, &dataset).ok()) {
+      allocated_bytes += dataset->AllocatedBytes();
+    } else {
+      allocated_bytes += tensor.AllocatedBytes();
+    }
+  }
+  return allocated_bytes;
 }
 
 Status GetDatasetFromVariantTensor(const Tensor& tensor,
                                    DatasetBase** out_dataset) {
-  if (!(tensor.dtype() == DT_VARIANT ||
+  if (!(tensor.dtype() == DT_VARIANT &&
         TensorShapeUtils::IsScalar(tensor.shape()))) {
     return errors::InvalidArgument(
         "Dataset tensor must be a scalar of dtype DT_VARIANT.");
@@ -251,6 +358,47 @@ Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor) {
   return Status::OK();
 }
 
+Status DatasetBase::Save(SerializationContext* ctx,
+                         IteratorStateWriter* writer) const {
+  string serialized_graph_def;
+  string output_node;
+  GraphDefBuilder b;
+  DatasetGraphDefBuilder db(&b);
+  Node* node = nullptr;
+  TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
+  output_node = node->name();
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
+  graph_def.SerializeToString(&serialized_graph_def);
+  TF_RETURN_IF_ERROR(
+      writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
+  TF_RETURN_IF_ERROR(
+      writer->WriteScalar(kDatasetGraphOutputNodeKey, output_node));
+  return Status::OK();
+}
+
+Status DatasetBase::DatasetGraphDefBuilder::AddInputDataset(
+    SerializationContext* ctx, const DatasetBase* dataset, Node** output) {
+  Status status = dataset->AsGraphDefInternal(ctx, this, output);
+  if (ctx->optimization_only() && errors::IsUnimplemented(status)) {
+    Tensor t(DT_VARIANT, TensorShape({}));
+    // `StoreDatasetInVariantTensor` will transfer ownership of `dataset`. We
+    // increment the refcount of `dataset` here to retain ownership.
+    dataset->Ref();
+    TF_RETURN_IF_ERROR(
+        StoreDatasetInVariantTensor(const_cast<DatasetBase*>(dataset), &t));
+    TF_RETURN_IF_ERROR(AddPlaceholder(t, output));
+    DCHECK_NE(ctx->input_list(), nullptr);
+    ctx->input_list()->emplace_back((*output)->name(), std::move(t));
+    LOG(WARNING)
+        << "Input of " << dataset->DebugString()
+        << " will not be optimized because the dataset does not implement the "
+           "AsGraphDefInternal() method needed to apply optimizations.";
+    return Status::OK();
+  }
+  return status;
+}
+
 void DatasetOpKernel::Compute(OpKernelContext* ctx) {
   DatasetBase* dataset = nullptr;
   MakeDataset(ctx, &dataset);
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 964a7d5f8c20c9a4c76143c3643282e2e6bf7325..7d3776a6ec92b5ab6befbab3162c3d4937c4fe70 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <deque>
 #include <memory>
+#include <unordered_map>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -30,8 +31,10 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
 
 // Polymorphic datasets should support all primitive TensorFlow
@@ -50,6 +53,9 @@ namespace data {
 // A constant that can be used to enable auto-tuning.
 constexpr int kAutoTune = -1;
 
+constexpr int kInfiniteCardinality = -1;
+constexpr int kUnknownCardinality = -2;
+
 class DatasetBase;
 class SerializationContext;
 
@@ -160,7 +166,7 @@ class GraphDefBuilderWrapper {
                     const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
                     Node** output) {
     std::vector<std::pair<size_t, Node*>> enumerated_inputs(inputs.size());
-    for (int i = 0; i < inputs.size(); i++) {
+    for (size_t i = 0; i < inputs.size(); i++) {
       enumerated_inputs[i] = std::make_pair(i, inputs[i]);
     }
     return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
@@ -257,6 +263,7 @@ class GraphDefBuilderWrapper {
 };
 
 class StatsAggregator;
+class FunctionHandleCache;
 
 // A cut-down version of `OpKernelContext` for running computations in
 // iterators. Note that we cannot simply use `OpKernelContext` here because we
@@ -272,47 +279,84 @@ class StatsAggregator;
 class IteratorContext {
  public:
   struct Params {
-    // Interface to operating system functionality.
-    Env* env;
+    explicit Params(IteratorContext* ctx)
+        : allocator_getter(ctx->allocator_getter()),
+          env(ctx->env()),
+          function_library(ctx->function_library()),
+          lib(ctx->lib()),
+          function_handle_cache(ctx->function_handle_cache()),
+          resource_mgr(ctx->resource_mgr()),
+          model(ctx->model()),
+          runner(*(ctx->runner())),
+          runner_threadpool_size(ctx->runner_threadpool_size()),
+          stats_aggregator(ctx->stats_aggregator()) {}
+
+    explicit Params(OpKernelContext* ctx)
+        : env(ctx->env()),
+          lib(ctx->function_library()),
+          runner(*(ctx->runner())) {
+      // NOTE: need reinterpret_cast because function.h forward-declares Device.
+      DeviceBase* device =
+          reinterpret_cast<DeviceBase*>(ctx->function_library()->device());
+      allocator_getter = [device](AllocatorAttributes attrs) {
+        return device->GetAllocator(attrs);
+      };
+      thread::ThreadPool* thread_pool =
+          ctx->device()->tensorflow_device_thread_pool();
+      if (thread_pool) {
+        runner_threadpool_size = thread_pool->NumThreads();
+      } else {
+        runner_threadpool_size = port::NumSchedulableCPUs();
+      }
+    }
 
-    // Function call support.
-    std::function<void(std::function<void()>)> runner = nullptr;
+    // The Allocator to be used to allocate the output of an iterator.
+    std::function<Allocator*(AllocatorAttributes)> allocator_getter = nullptr;
 
-    // The `StatsAggregator` object to record statistics about the iterator.
-    std::shared_ptr<StatsAggregator> stats_aggregator = nullptr;
+    // Interface to operating system functionality.
+    Env* env = nullptr;
+
+    // The FunctionLibraryDefinition used to look up user-defined functions.
+    std::shared_ptr<const FunctionLibraryDefinition> function_library = nullptr;
 
     // The FunctionLibraryRuntime object to be used to make function calls.
     FunctionLibraryRuntime* lib = nullptr;
-    std::shared_ptr<const FunctionLibraryDefinition> function_library = nullptr;
 
-    // The Allocator to be used to allocate the output of an iterator.
-    std::function<Allocator*(AllocatorAttributes)> allocator_getter = nullptr;
+    // A FunctionHandleCache that owns all the function handles. Not owned.
+    FunctionHandleCache* function_handle_cache = nullptr;
+
+    // A resource manager for storing dataset-related state, e.g. random
+    // seeds or cached tensors. Not owned.
+    ResourceMgr* resource_mgr = nullptr;
 
     // If non-null, identifies the object used for performance modeling.
     std::shared_ptr<model::Model> model = nullptr;
+
+    // Function call support.
+    std::function<void(std::function<void()>)> runner = nullptr;
+
+    // Number of threads used for executing user-defined functions.
+    int32 runner_threadpool_size = 0;
+
+    // The `StatsAggregator` object to record statistics about the iterator.
+    std::shared_ptr<StatsAggregator> stats_aggregator = nullptr;
   };
 
+  explicit IteratorContext(IteratorContext* ctx) : params_(Params{ctx}) {}
+
+  explicit IteratorContext(OpKernelContext* ctx) : params_(Params{ctx}) {}
+
   explicit IteratorContext(Params params) : params_(std::move(params)) {}
 
-  explicit IteratorContext(OpKernelContext* ctx) {
-    params_.env = ctx->env();
-    params_.runner = *(ctx->runner());
-    params_.lib = ctx->function_library();
-    // NOTE: must use reinterpret_cast because function.h forward-declares
-    // Device.
-    DeviceBase* device =
-        reinterpret_cast<DeviceBase*>(ctx->function_library()->device());
-    params_.allocator_getter = [device](AllocatorAttributes attrs) {
-      return device->GetAllocator(attrs);
-    };
+  Allocator* allocator(AllocatorAttributes attrs) {
+    return params_.allocator_getter(attrs);
   }
 
-  Env* env() const { return params_.env; }
-
-  std::function<void(std::function<void()>)>* runner() {
-    return &params_.runner;
+  std::function<Allocator*(AllocatorAttributes)> allocator_getter() {
+    return params_.allocator_getter;
   }
 
+  Env* env() const { return params_.env; }
 
   std::shared_ptr<const FunctionLibraryDefinition> function_library() {
     return params_.function_library;
@@ -320,22 +364,24 @@ class IteratorContext {
 
   FunctionLibraryRuntime* lib() { return params_.lib; }
 
-  void set_lib(FunctionLibraryRuntime* lib) { params_.lib = lib; }
-
-  Allocator* allocator(AllocatorAttributes attrs) {
-    return params_.allocator_getter(attrs);
+  FunctionHandleCache* function_handle_cache() {
+    return params_.function_handle_cache;
   }
 
-  std::function<Allocator*(AllocatorAttributes)> allocator_getter() {
-    return params_.allocator_getter;
+  ResourceMgr* resource_mgr() { return params_.resource_mgr; }
+
+  const std::shared_ptr<model::Model>& model() { return params_.model; }
+
+  std::function<void(std::function<void()>)>* runner() {
+    return &params_.runner;
   }
 
+  int32 runner_threadpool_size() { return params_.runner_threadpool_size; }
+
   std::shared_ptr<StatsAggregator> stats_aggregator() {
     return params_.stats_aggregator;
   }
 
-  std::shared_ptr<model::Model> model() { return params_.model; }
-
   Params params() { return params_; }
 
  private:
@@ -346,21 +392,21 @@ class IteratorContext {
 class SerializationContext {
  public:
   struct Params {
-    bool allow_stateful_functions = false;
     const FunctionLibraryDefinition* flib_def = nullptr;           // Not owned.
     std::vector<std::pair<string, Tensor>>* input_list = nullptr;  // Not owned.
+    bool optimization_only = false;
   };
 
   explicit SerializationContext(Params params) : params_(std::move(params)) {}
 
-  bool allow_stateful_functions() { return params_.allow_stateful_functions; }
-
   const FunctionLibraryDefinition& flib_def() { return *params_.flib_def; }
 
   std::vector<std::pair<string, Tensor>>* input_list() {
     return params_.input_list;
   }
 
+  bool optimization_only() { return params_.optimization_only; }
+
  private:
   Params params_;
 
@@ -429,6 +475,10 @@ class IteratorBase {
   }
 
  protected:
+  // Returns a node that models this iterator.
+  virtual std::shared_ptr<model::Node> CreateNode(
+      IteratorContext* ctx, model::Node::Args args) const = 0;
+
   // This is needed so that sub-classes of IteratorBase can call
   // `SaveInternal` on their input iterators.
   Status SaveInput(IteratorStateWriter* writer,
@@ -456,6 +506,7 @@ class IteratorBase {
 
  private:
   friend class DatasetBase;  // for access to `AddCleanupFunction`
+  friend class DatasetBaseIterator;  // for access to `node_`
 
   // Registers a cleanup function to be called upon object destruction.
   //
@@ -464,7 +515,11 @@ class IteratorBase {
     cleanup_fns_.push_back(std::move(cleanup_fn));
   }
 
+  // Associates the given performance modeling `Node` with this iterator.
+  void SetNode(std::shared_ptr<model::Node> node) { node_ = node.get(); }
+
   std::vector<std::function<void()>> cleanup_fns_;
+  model::Node* node_ = nullptr;  // Not owned.
 };
 
 // Represents runtime information needed to construct a dataset.
@@ -486,6 +541,25 @@ class DatasetContext {
   Params params_;
 };
 
+// Returns the number of bytes allocated for the given tensor.
+int64 GetAllocatedBytes(const std::vector<Tensor>& element);
+
+// Validates and extracts a `DatasetBase` object from `tensor`.
+//
+// `tensor` must have been written by a call to SetVariantTensorToDataset().
+//
+// The retrieved pointer is a borrowed reference to the dataset, which is owned
+// by the tensor. The consumer must either acquire its own reference to the
+// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
+// destroyed or mutated while the retrieved pointer is in use.
+Status GetDatasetFromVariantTensor(const Tensor& tensor,
+                                   DatasetBase** out_dataset);
+
+// Stores a `DatasetBase` object in `tensor`.
+//
+// The ownership of `dataset` is transferred to `tensor`.
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
+
 // Represents a (potentially infinite) range of outputs, where each
 // output is a tuple of tensors.
 class DatasetBase : public core::RefCounted {
@@ -511,22 +585,22 @@ class DatasetBase : public core::RefCounted {
   //
   // The prefix identifies the sequence of iterators leading up to the newly
   // created iterator.
-  Status MakeIterator(IteratorContext* ctx, const string& prefix,
+  Status MakeIterator(IteratorContext* ctx, const string& output_prefix,
                       std::unique_ptr<IteratorBase>* iterator) const {
-    *iterator = MakeIteratorInternal(prefix);
-    if (ctx->model()) {
-      ctx->model()->AddNode((*iterator)->prefix(), prefix);
-      std::shared_ptr<model::Model> model = ctx->model();
+    *iterator = MakeIteratorInternal(output_prefix);
+    if (const auto& model = ctx->model()) {
       const string& prefix = (*iterator)->prefix();
+      (*iterator)->SetNode(model->AddNode(MakeNodeFactory(ctx, iterator->get()),
+                                          prefix, output_prefix));
       (*iterator)->AddCleanupFunction(
           [model, prefix]() { model->RemoveNode(prefix); });
     }
     return (*iterator)->Initialize(ctx);
   }
 
-  Status MakeIterator(IteratorContext&& ctx, const string& prefix,
+  Status MakeIterator(IteratorContext&& ctx, const string& output_prefix,
                       std::unique_ptr<IteratorBase>* iterator) const {
-    return MakeIterator(&ctx, prefix, iterator);
+    return MakeIterator(&ctx, output_prefix, iterator);
   }
 
   // Returns a vector of DataType values, representing the respective
@@ -539,6 +613,12 @@ class DatasetBase : public core::RefCounted {
   // in the outputs of this dataset.
   virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
 
+  // Returns the number of bytes allocated for tensors of this dataset.
+  virtual int64 AllocatedBytes() const { return 0; }
+
+  // Returns the cardinality of this dataset.
+  virtual int64 Cardinality() const { return kUnknownCardinality; }
+
   // A human-readable debug string for this dataset.
   virtual string DebugString() const = 0;
 
@@ -553,12 +633,9 @@ class DatasetBase : public core::RefCounted {
    public:
     DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
     Status AddInputDataset(SerializationContext* ctx,
-                           const DatasetBase* dataset, Node** output) {
-      return dataset->AsGraphDefInternal(ctx, this, output);
-    }
+                           const DatasetBase* dataset, Node** output);
   };
 
-  // TODO(jsimsa): Consolidate overloading into a single method.
   virtual Status AsGraphDefInternal(SerializationContext* ctx,
                                     DatasetGraphDefBuilder* b,
                                     Node** node) const = 0;
@@ -567,6 +644,14 @@ class DatasetBase : public core::RefCounted {
       const string& prefix) const = 0;
 
  private:
+  // Returns a factory for nodes that represent the given iterator.
+  static model::Node::Factory MakeNodeFactory(IteratorContext* ctx,
+                                              IteratorBase* iterator) {
+    return [ctx, iterator](model::Node::Args args) {
+      return iterator->CreateNode(ctx, std::move(args));
+    };
+  }
+
   const string name_;
 };
 
@@ -631,51 +716,60 @@ class DatasetBaseIterator : public IteratorBase {
     return strings::StrCat(params_.prefix, ":", name);
   }
 
-  // When performance modeling is enabled, this method adds a constant parameter
-  // to the model node corresponding to this iterator.
-  void AddConstantParameter(IteratorContext* ctx, const string& name,
-                            int64 value) {
-    if (ctx->model()) {
-      ctx->model()->AddConstantParameter(prefix(), name, value);
+  // By default we model iterators using an unknown node, which acts as
+  // pass-through with respect to performance modeling.
+  std::shared_ptr<model::Node> CreateNode(
+      IteratorContext* ctx, model::Node::Args args) const override {
+    return model::MakeUnknownNode(std::move(args));
+  }
+
+  // When performance modeling is enabled, this method records the fact that
+  // this iterator has dequeued a element from an internal buffer.
+  void RecordBufferDequeue(IteratorContext* ctx,
+                           const std::vector<Tensor>& element) {
+    if (node_) {
+      node_->add_buffered_bytes(-GetAllocatedBytes(element));
     }
   }
 
-  // When performance modeling is enabled, this method adds a tunable parameter
-  // to the model node corresponding to this iterator.
-  //
-  // The performance modeling logic may use `state` to set the value of the
-  // tunable parameter at any point during the lifetime of this iterator. When
-  // it does, it acquires `state->mu` and notifies `state->cond_var`.
-  void AddTunableParameter(IteratorContext* ctx, const string& name,
-                           std::shared_ptr<model::SharedState> state, int64 min,
-                           int64 max) {
-    if (ctx->model()) {
-      ctx->model()->AddTunableParameter(prefix(), name, std::move(state), min,
-                                        max);
+  // When performance modeling is enabled, this method records the fact that
+  // this iterator has enqueued a element in an internal buffer.
+  void RecordBufferEnqueue(IteratorContext* ctx,
+                           const std::vector<Tensor>& element) {
+    if (node_) {
+      node_->add_buffered_bytes(GetAllocatedBytes(element));
     }
   }
 
   // When performance modeling is enabled, this method records the fact that
   // this iterator has produced an element.
   void RecordElement(IteratorContext* ctx) {
-    if (ctx->model()) {
-      ctx->model()->RecordElement(prefix());
+    if (node_) {
+      node_->record_element();
     }
   }
 
   // When performance modeling is enabled, this method records the fact that
   // a thread of this iterator has started work.
   void RecordStart(IteratorContext* ctx, bool stop_output = false) {
-    if (ctx->model()) {
-      ctx->model()->RecordStart(prefix(), stop_output);
+    if (node_) {
+      int64 now_nanos = Env::Default()->NowNanos();
+      if (stop_output && node_->output()) {
+        node_->output()->record_stop(now_nanos);
+      }
+      node_->record_start(now_nanos);
     }
   }
 
   // When performance modeling is enabled, this method records the fact that
   // a thread of this iterator has stopped work.
   void RecordStop(IteratorContext* ctx, bool start_output = false) {
-    if (ctx->model()) {
-      ctx->model()->RecordStop(prefix(), start_output);
+    if (node_) {
+      int64 now_nanos = Env::Default()->NowNanos();
+      node_->record_stop(now_nanos);
+      if (start_output && node_->output()) {
+        node_->output()->record_start(now_nanos);
+      }
     }
   }
 
@@ -779,22 +873,6 @@ class BinaryDatasetOpKernel : public DatasetOpKernel {
                            DatasetBase** output) = 0;
 };
 
-// Validates and extracts a `DatasetBase` object from `tensor`.
-//
-// `tensor` must have been written by a call to SetVariantTensorToDataset().
-//
-// The retrieved pointer is a borrowed reference to the dataset, which is owned
-// by the tensor. The consumer must either acquire its own reference to the
-// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
-// destroyed or mutated while the retrieved pointer is in use.
-Status GetDatasetFromVariantTensor(const Tensor& tensor,
-                                   DatasetBase** out_dataset);
-
-// Stores a `DatasetBase` object in `tensor`.
-//
-// The ownership of `dataset` is transferred to `tensor`.
-Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
-
 // A simple background worker that executes closures asynchronously and without
 // blocking.
 //
diff --git a/tensorflow/core/framework/device_base.cc b/tensorflow/core/framework/device_base.cc
index 9108c32942ad65616b246227f2ad84a56ea9eb93..78ace480c4bad66b06f27ca90a1bc5c482c3f00c 100644
--- a/tensorflow/core/framework/device_base.cc
+++ b/tensorflow/core/framework/device_base.cc
@@ -34,14 +34,14 @@ const string& DeviceBase::name() const {
 }
 
 void DeviceBase::set_eigen_cpu_device(Eigen::ThreadPoolDevice* d) {
-  // Eigen::ThreadPoolDevice is a very cheap struct (one pointer and
+  // Eigen::ThreadPoolDevice is a very cheap struct (two pointers and
   // an int).  Therefore, we can afford a pre-allocated array of
   // Eigen::ThreadPoolDevice.  Here, we ensure that
   // Eigen::ThreadPoolDevices in eigen_cpu_devices_ has increasingly
   // larger numThreads.
   for (int i = 1; i <= d->numThreads(); ++i) {
-    eigen_cpu_devices_.push_back(
-        new Eigen::ThreadPoolDevice(d->getPool(), i /* numThreads() */));
+    eigen_cpu_devices_.push_back(new Eigen::ThreadPoolDevice(
+        d->getPool(), i /* numThreads() */, d->allocator()));
   }
 }
 
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 20f957190bed129986b1da661f2b7b320b354046..b69a40f3128905960cc054ddea7cc20b5d4583a3 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.pb_text.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -31,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
@@ -149,8 +152,8 @@ class FunctionInstantiationHelper {
   }
 
   // Builds index for nodes that can be used as node's input arguments.
-  Status BuildInputArgIndex(const OpDef::ArgDef& arg_def,
-                            AttrSlice attr_values) {
+  Status BuildInputArgIndex(const OpDef::ArgDef& arg_def, AttrSlice attr_values,
+                            bool ints_on_device) {
     bool is_type_list;
     DataTypeVector dtypes;
     TF_RETURN_IF_ERROR(
@@ -169,7 +172,11 @@ class FunctionInstantiationHelper {
         strings::StrAppend(&name, "_", i);
       }
       NodeDef* gnode = AddNode(name);
-      gnode->set_op(FunctionLibraryDefinition::kArgOp);
+      if (ints_on_device && dtypes[i] == DataType::DT_INT32) {
+        gnode->set_op(FunctionLibraryDefinition::kDeviceArgOp);
+      } else {
+        gnode->set_op(FunctionLibraryDefinition::kArgOp);
+      }
       AddAttr("T", dtypes[i], gnode);
       AddAttr("index", arg_index, gnode);
       result_.arg_types.push_back(dtypes[i]);
@@ -238,7 +245,8 @@ class FunctionInstantiationHelper {
         const auto* item = GetItemOrNull(input_name);
         if (item == nullptr) {
           return errors::InvalidArgument(
-              "input ", input_name, " is not found: ", SummarizeNodeDef(fnode));
+              "input ", input_name,
+              " is not found: ", FormatNodeDefForError(fnode));
         }
         if (item->dtypes.size() > dtypes.size() - j) {
           return errors::InvalidArgument("Input ", input_name, " too long for ",
@@ -303,7 +311,7 @@ class FunctionInstantiationHelper {
   Status AddReturnNode(
       const OpDef::ArgDef& ret_def, AttrSlice attrs,
       const ::tensorflow::protobuf::Map<string, string>& ret_map,
-      int* ret_index) {
+      bool ints_on_device, int* ret_index) {
     auto ret_iter = ret_map.find(ret_def.name());
     if (ret_iter == ret_map.end()) {
       return errors::InvalidArgument("Return ", ret_def.name(), " missing.");
@@ -329,7 +337,11 @@ class FunctionInstantiationHelper {
         strings::StrAppend(&name, "_", i);
       }
       NodeDef* gnode = AddNode(name);
-      gnode->set_op(FunctionLibraryDefinition::kRetOp);
+      if (ints_on_device && dtypes[i] == DataType::DT_INT32) {
+        gnode->set_op(FunctionLibraryDefinition::kDeviceRetOp);
+      } else {
+        gnode->set_op(FunctionLibraryDefinition::kRetOp);
+      }
       AddInput(nodes_.size() - 1, item->nid, item->idx + i);
       AddAttr("T", dtypes[i], gnode);
       AddAttr("index", (*ret_index)++, gnode);
@@ -497,6 +509,16 @@ string Print(const NodeDef& n) {
       entries.push_back(strings::StrCat(a.first, "=", Print(a.second)));
     }
     std::sort(entries.begin(), entries.end());
+    // Add a short device string at the end of all attributes.
+    if (!n.device().empty()) {
+      DeviceNameUtils::ParsedName parsed;
+      if (DeviceNameUtils::ParseFullName(n.device(), &parsed)) {
+        entries.push_back(
+            strings::StrCat("device=", parsed.type, ":", parsed.id));
+      } else {
+        entries.push_back("device=<FAILED_TO_PARSE>");
+      }
+    }
     strings::StrAppend(&out, "[", str_util::Join(entries, ", "), "]");
   }
   strings::StrAppend(&out, "(");
@@ -559,9 +581,11 @@ string Print(gtl::ArraySlice<const NodeDef*> nodes) {
   std::vector<const NodeDef*> ret;
   std::vector<const NodeDef*> body;
   for (const NodeDef* n : nodes) {
-    if (n->op() == FunctionLibraryDefinition::kArgOp) {
+    if (n->op() == FunctionLibraryDefinition::kArgOp ||
+        n->op() == FunctionLibraryDefinition::kDeviceArgOp) {
       arg.push_back(n);
-    } else if (n->op() == FunctionLibraryDefinition::kRetOp) {
+    } else if (n->op() == FunctionLibraryDefinition::kRetOp ||
+               n->op() == FunctionLibraryDefinition::kDeviceRetOp) {
       ret.push_back(n);
     } else {
       body.push_back(n);
@@ -578,26 +602,50 @@ string Print(gtl::ArraySlice<const NodeDef*> nodes) {
   std::sort(ret.begin(), ret.end(), comp);
   string out;
   strings::StrAppend(&out, "\n(");
-  auto get_type = [](const NodeDef& n) {
+  auto get_type_and_device = [](const NodeDef& n) {
     DataType dt;
     if (!GetNodeAttr(n, "T", &dt).ok()) {
       dt = DT_INVALID;
     }
+    if (!n.device().empty()) {
+      DeviceNameUtils::ParsedName parsed;
+      if (DeviceNameUtils::ParseFullName(n.device(), &parsed)) {
+        return strings::StrCat(DataTypeString(dt), "@", parsed.type, ":",
+                               parsed.id);
+      } else {
+        return strings::StrCat(DataTypeString(dt), "@",
+                               "<FAILED_TO_PARSE_DEVICE>");
+      }
+    }
     return DataTypeString(dt);
   };
   for (size_t i = 0; i < arg.size(); ++i) {
     const NodeDef* n = arg[i];
     if (i > 0) strings::StrAppend(&out, ", ");
     CHECK_GE(n->attr_size(), 2);
-    strings::StrAppend(&out, n->name(), ":", get_type(*n));
+    strings::StrAppend(&out, n->name(), ":", get_type_and_device(*n));
   }
   strings::StrAppend(&out, ") -> (");
   for (size_t i = 0; i < ret.size(); ++i) {
     const NodeDef* n = ret[i];
     if (i > 0) strings::StrAppend(&out, ", ");
     CHECK_LE(2, n->attr_size());
-    CHECK_EQ(1, n->input_size());
-    strings::StrAppend(&out, n->input(0), ":", get_type(*n));
+
+    // The _RetVal op should have a unique non-control input. We assert that
+    // here and add it to the output.
+    bool found_non_control_input = false;
+    for (const string& input : n->input()) {
+      if (!input.empty() && input[0] != '^') {
+        DCHECK_EQ(found_non_control_input, false)
+            << "RetVal node has more than one non-control input: "
+            << absl::StrJoin(n->input(), ", ");
+        strings::StrAppend(&out, n->input(0), ":", get_type_and_device(*n));
+        found_non_control_input = true;
+      }
+    }
+    DCHECK_EQ(found_non_control_input, true)
+        << "RetVal did not have any non-control inputs: "
+        << absl::StrJoin(n->input(), ", ");
   }
   strings::StrAppend(&out, ") {\n");
   for (size_t i = 0; i < body.size(); ++i) {
@@ -633,10 +681,13 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
   const OpDef& sig = fdef.signature();
   TF_RETURN_IF_ERROR(ValidateSignatureWithAttrs(sig, attr_values));
 
+  bool ints_on_device = fdef.attr().count("experimental_ints_on_device") != 0 &&
+                        fdef.attr().at("experimental_ints_on_device").b();
+
   FunctionInstantiationHelper helper(get_function, result);
   Status s;
   for (const OpDef::ArgDef& arg_def : sig.input_arg()) {
-    s = helper.BuildInputArgIndex(arg_def, attr_values);
+    s = helper.BuildInputArgIndex(arg_def, attr_values, ints_on_device);
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In ", Print(arg_def));
       return s;
@@ -673,7 +724,8 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
     s = helper.BuildNodeOutputIndex(fdef.node_def(i), AttrSlice(&node_attrs[i]),
                                     result->nodes.size() + i);
     if (!s.ok()) {
-      errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i)));
+      errors::AppendToMessage(&s, "In ",
+                              FormatNodeDefForError(fdef.node_def(i)));
       return s;
     }
   }
@@ -681,7 +733,8 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
   for (int i = 0; i < fdef.node_def_size(); ++i) {
     s = helper.InstantiateNode(fdef.node_def(i), AttrSlice(&node_attrs[i]));
     if (!s.ok()) {
-      errors::AppendToMessage(&s, "In ", SummarizeNodeDef(fdef.node_def(i)));
+      errors::AppendToMessage(&s, "In ",
+                              FormatNodeDefForError(fdef.node_def(i)));
       return s;
     }
   }
@@ -689,7 +742,8 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
   // Emits nodes for the function's return values.
   int ret_index = 0;
   for (const OpDef::ArgDef& ret_def : sig.output_arg()) {
-    s = helper.AddReturnNode(ret_def, attr_values, fdef.ret(), &ret_index);
+    s = helper.AddReturnNode(ret_def, attr_values, fdef.ret(), ints_on_device,
+                             &ret_index);
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In function output ", Print(ret_def));
       return s;
@@ -796,12 +850,28 @@ uint64 FunctionDefHash(const FunctionDef& fdef) {
   return h;
 }
 
+static constexpr const char* const kExecutorAttr = "_executor";
+
+/* static */
+string FunctionLibraryRuntime::ExecutorType(const InstantiateOptions& options,
+                                            AttrSlice attrs) {
+  if (!options.executor_type.empty()) {
+    return options.executor_type;
+  } else if (const AttrValue* executor_attr = attrs.Find(kExecutorAttr)) {
+    return executor_attr->s();
+  } else {
+    return string();
+  }
+}
+
 string Canonicalize(const string& funcname, AttrSlice attrs,
                     const FunctionLibraryRuntime::InstantiateOptions& options) {
   std::vector<string> entries;
   entries.reserve(options.target.empty() ? attrs.size() : (attrs.size() + 1));
   for (auto p : attrs) {
-    entries.push_back(strings::StrCat(p.first, "=", Print(p.second)));
+    if (p.first != kExecutorAttr) {
+      entries.push_back(strings::StrCat(p.first, "=", Print(p.second)));
+    }
   }
   if (!options.target.empty()) {
     entries.push_back(
@@ -815,9 +885,9 @@ string Canonicalize(const string& funcname, AttrSlice attrs,
     entries.push_back(
         strings::StrCat("_state_handle", "=", options.state_handle));
   }
-  if (!options.executor_type.empty()) {
-    entries.push_back(
-        strings::StrCat("_executor_type", "=", options.executor_type));
+  string executor_type = FunctionLibraryRuntime::ExecutorType(options, attrs);
+  if (!executor_type.empty()) {
+    entries.push_back(strings::StrCat(kExecutorAttr, "=", executor_type));
   }
   std::sort(entries.begin(), entries.end());
   return strings::StrCat(funcname, "[", str_util::Join(entries, ","), "]");
@@ -1096,7 +1166,7 @@ Status FunctionLibraryDefinition::ReplaceFunction(const string& func,
                                                   const FunctionDef& fdef) {
   mutex_lock l(mu_);
   bool added;
-  TF_RETURN_IF_ERROR(RemoveFunction(func));
+  TF_RETURN_IF_ERROR(RemoveFunctionHelper(func));
   TF_RETURN_IF_ERROR(AddFunctionDefHelper(fdef, &added));
   return Status::OK();
 }
@@ -1110,6 +1180,12 @@ Status FunctionLibraryDefinition::ReplaceGradient(const GradientDef& grad) {
 }
 
 Status FunctionLibraryDefinition::RemoveFunction(const string& func) {
+  mutex_lock l(mu_);
+  TF_RETURN_IF_ERROR(RemoveFunctionHelper(func));
+  return Status::OK();
+}
+
+Status FunctionLibraryDefinition::RemoveFunctionHelper(const string& func) {
   const auto& i = function_defs_.find(func);
   if (i == function_defs_.end()) {
     return errors::InvalidArgument("Tried to remove non-existent function ",
@@ -1133,7 +1209,7 @@ void FunctionLibraryDefinition::Remove(
     const std::vector<string>& funcs,
     const std::vector<string>& funcs_with_grads) {
   for (const string& f : funcs) {
-    Status s = RemoveFunction(f);
+    Status s = RemoveFunctionHelper(f);
     DCHECK(s.ok());
   }
   for (const string& f : funcs_with_grads) {
@@ -1201,6 +1277,16 @@ const FunctionDef* FunctionLibraryDefinition::GetAttrImpl(
   }
 }
 
+std::vector<string> FunctionLibraryDefinition::ListFunctionNames() const {
+  std::vector<string> function_names;
+  tf_shared_lock l(mu_);
+  function_names.reserve(function_defs_.size());
+  for (const auto& it : function_defs_) {
+    function_names.emplace_back(it.first);
+  }
+  return function_names;
+}
+
 FunctionDefLibrary FunctionLibraryDefinition::ToProto() const {
   FunctionDefLibrary lib;
   tf_shared_lock l(mu_);
@@ -1240,6 +1326,138 @@ GET_ATTR(string)
 GET_ATTR(bool)
 #undef GET_ATTR
 
+namespace {
+
+constexpr char kExperimentalApiImplements[] = "experimental_api_implements";
+
+absl::flat_hash_set<string> ReachableFunctions(
+    const FunctionLibraryDefinition& flib,
+    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
+  // Functions that are reachable from the graph.
+  absl::flat_hash_set<string> reachable_funcs;
+
+  // For any functions, if it has attribute "experimental_api_implements" =
+  // "some_interface" and it is reachable, then it means any other
+  // function with same attribute name and value could also be potentially
+  // reachable, eg via experimental_implementation_selector swapping the
+  // nodedef.
+  absl::flat_hash_set<string> reachable_api_interface;
+
+  // Functions might be reachable from the nested function calls, so we keep a
+  // queue of functions that we have to check.
+  gtl::InlinedVector<const FunctionDef*, 4> func_queue;
+
+  // Add reachable and not already processed functions to the functions queue.
+  const auto add_to_func_queue = [&](const string& func_name) {
+    const FunctionDef* func = flib.Find(func_name);
+    if (func && reachable_funcs.find(func_name) == reachable_funcs.end()) {
+      func_queue.push_back(func);
+    }
+  };
+
+  // Add all the functions that are reachable from the given node to the queue.
+  const auto process_node = [&](const NodeDef& node) {
+    // Node itself can be a call to the function.
+    add_to_func_queue(node.op());
+
+    // Or node can have an attribute referencing a function.
+    for (const auto& attr : node.attr()) {
+      const auto& attr_value = attr.second;
+
+      // 1. AttrValue.func
+      if (attr_value.has_func()) {
+        add_to_func_queue(attr_value.func().name());
+      }
+
+      // 2. AttrValue.ListValue.func
+      if (attr_value.has_list()) {
+        for (const auto& func : attr_value.list().func()) {
+          add_to_func_queue(func.name());
+        }
+      }
+    }
+  };
+
+  // Add all functions that are directly called from the optimized graph.
+  std::for_each(nodes.begin(), nodes.end(), process_node);
+
+  // Process all reachable functions.
+  while (!func_queue.empty()) {
+    const FunctionDef* func = func_queue.back();
+    func_queue.pop_back();
+
+    const string& func_name = func->signature().name();
+    reachable_funcs.insert(func_name);
+
+    const auto attr_it = func->attr().find(kExperimentalApiImplements);
+    if (attr_it != func->attr().end()) {
+      reachable_api_interface.insert(attr_it->second.s());
+    }
+
+    // Find all the functions called from the function body.
+    const auto& func_body = func->node_def();
+    std::for_each(func_body.begin(), func_body.end(), process_node);
+
+    // Check if the function has a registered gradient.
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
+  }
+
+  for (const auto& func_name : flib.ListFunctionNames()) {
+    const auto& func_def = flib.Find(func_name);
+    const auto attr_it = func_def->attr().find(kExperimentalApiImplements);
+    if (attr_it != func_def->attr().end()) {
+      if (reachable_api_interface.contains(attr_it->second.s())) {
+        reachable_funcs.insert(func_name);
+      }
+    }
+  }
+
+  return reachable_funcs;
+}
+
+FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
+    const FunctionLibraryDefinition& flib,
+    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
+  absl::flat_hash_set<string> reachable_funcs = ReachableFunctions(flib, nodes);
+
+  FunctionLibraryDefinition reachable_flib(flib.default_registry(),
+                                           FunctionDefLibrary());
+
+  for (const string& func_name : reachable_funcs) {
+    const FunctionDef* func = flib.Find(func_name);
+    DCHECK_NE(func, nullptr);
+    // That should never fail, because we copy functions from valid flib and use
+    // the same default registry.
+    const Status added = reachable_flib.AddFunctionDef(*func);
+    DCHECK(added.ok());
+
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) {
+      GradientDef grad;
+      grad.set_function_name(func_name);
+      grad.set_gradient_func(grad_func_name);
+      // It can only fail if function already has a gradient function.
+      const Status added_grad = reachable_flib.AddGradientDef(grad);
+      DCHECK(added_grad.ok());
+    }
+  }
+
+  return reachable_flib;
+}
+
+}  // namespace
+
+FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions(
+    const GraphDef& graph) const {
+  return ReachableFunctionLibraryDefinition(*this, graph.node());
+}
+
+FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions(
+    const FunctionDef& func) const {
+  return ReachableFunctionLibraryDefinition(*this, func.node_def());
+}
+
 void FunctionDefHelper::AttrValueWrapper::InitFromString(StringPiece val) {
   if (val.size() >= 2 && val[0] == '$') {
     proto.set_placeholder(val.data() + 1, val.size() - 1);
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 4d6d68e214e794256c20f121611ad97c407310f3..9cf4b0f4cdf1d4c3604eebcf33bb51274578d73c 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -294,7 +294,7 @@ class FunctionCallFrame : public CallFrameInterface {
 class FunctionLibraryDefinition : public OpRegistryInterface {
  public:
   // Note: This constructor grabs `lib_def`'s lock in shared mode.
-  explicit FunctionLibraryDefinition(const FunctionLibraryDefinition& lib_def);
+  FunctionLibraryDefinition(const FunctionLibraryDefinition& lib_def);
   FunctionLibraryDefinition(const OpRegistryInterface* default_registry,
                             const FunctionDefLibrary& lib_def);
   ~FunctionLibraryDefinition() override;
@@ -329,6 +329,8 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
 
   // Replaces the function corresponding to `func` with `fdef`. Returns
   // a non-OK status if "func" was not found in the library, OK otherwise.
+  // Please be careful when replacing function: make sure all previous pointers
+  // returned by `Find()` are no longer in use.
   Status ReplaceFunction(const string& func, const FunctionDef& fdef);
 
   // Replaces the gradient corresponding to `grad.function_name()`. Returns
@@ -336,6 +338,13 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // otherwise.
   Status ReplaceGradient(const GradientDef& grad);
 
+  // Removes the function corresponding to 'func'. Returns a non-OK status if
+  // 'func' was not found in the library, OK otherwise.
+  // Please be careful when removing function: make sure there are no other
+  // nodes using the function, and all previous pointers returned by `Find()`
+  // are no longer in use.
+  Status RemoveFunction(const string& func);
+
   // Adds the functions and gradients in 'other' to this function library.
   // Duplicate functions and gradients are ignored.
   // This operation is atomic.
@@ -370,7 +379,9 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // Ops created for function arguments bear the name given by `kArgOp`; those
   // created for return values bear the name given by `kRetOp`.
   static constexpr const char* const kArgOp = "_Arg";
+  static constexpr const char* const kDeviceArgOp = "_DeviceArg";
   static constexpr const char* const kRetOp = "_Retval";
+  static constexpr const char* const kDeviceRetOp = "_DeviceRetval";
 
   static constexpr const char* const kGradientOp = "SymbolicGradient";
   static constexpr const char* const kFuncAttr = "f";
@@ -396,10 +407,18 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
     return function_defs_.size();
   }
 
+  // Returns all the function names in the FunctionLibraryDefinition.
+  std::vector<string> ListFunctionNames() const LOCKS_EXCLUDED(mu_);
+
   const OpRegistryInterface* default_registry() const {
     return default_registry_;
   }
 
+  // Returns a copy of `*this` with only the subset of functions that are
+  // reachable from the nodes of `graph` or `func`.
+  FunctionLibraryDefinition ReachableDefinitions(const GraphDef& graph) const;
+  FunctionLibraryDefinition ReachableDefinitions(const FunctionDef& func) const;
+
  private:
   // Shape inference for functions is handled separately by ShapeRefiner.
 
@@ -441,7 +460,7 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // Remove `func` from the library. Returns non-OK Status unless `func` is in
   // the library. This should only be called when there is a guarantee that the
   // function being removed hasn't been retrieved with `Find`.
-  Status RemoveFunction(const string& func) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  Status RemoveFunctionHelper(const string& func) EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   // Remove gradient of function `func` from the library. Returns non-OK Status
   // unless `func` has a gradient.
@@ -609,6 +628,13 @@ class FunctionLibraryRuntime {
   virtual Status Clone(std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
                        std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr,
                        FunctionLibraryRuntime** out_flr) = 0;
+
+  // Returns the name of the executor class (in the sense of
+  // `ExecutorFactory::GetFactory()`) that will be used based on the given
+  // dynamic `options` and static `attrs`. If none is specified, this method
+  // will return an empty string, which leaves the decision up to the runtime.
+  static string ExecutorType(const InstantiateOptions& options,
+                             AttrSlice attrs);
 };
 
 // Returns a canonicalized string for the instantiation of the
diff --git a/tensorflow/core/framework/function_handle_cache.cc b/tensorflow/core/framework/function_handle_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b93b6b2f8702ccfbc191072054278f7f732dc5a
--- /dev/null
+++ b/tensorflow/core/framework/function_handle_cache.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/function_handle_cache.h"
+
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace data {
+
+FunctionHandleCache::FunctionHandleCache(FunctionLibraryRuntime* lib)
+    : lib_(lib), state_handle_(strings::Printf("%lld", random::New64())) {}
+
+FunctionHandleCache::~FunctionHandleCache() {
+  Status s = Clear();
+  if (!s.ok()) {
+    LOG(ERROR) << "Failed to clear function handle cache: " << s.ToString();
+  }
+}
+
+Status FunctionHandleCache::Instantiate(
+    const string& function_name, AttrSlice attrs,
+    FunctionLibraryRuntime::InstantiateOptions options,
+    FunctionLibraryRuntime::Handle* handle) {
+  string key = Canonicalize(function_name, attrs, options);
+  FunctionLibraryRuntime::Handle h;
+  {
+    tf_shared_lock l(mu_);
+    h = gtl::FindWithDefault(handles_, key, kInvalidHandle);
+  }
+  if (h == kInvalidHandle) {
+    options.state_handle = state_handle_;
+    TF_RETURN_IF_ERROR(
+        lib_->Instantiate(function_name, attrs, options, handle));
+    mutex_lock l(mu_);
+    handles_[key] = *handle;
+  } else {
+    *handle = h;
+  }
+  return Status::OK();
+}
+
+Status FunctionHandleCache::Clear() {
+  mutex_lock l(mu_);
+  for (auto entry : handles_) {
+    TF_RETURN_IF_ERROR(lib_->ReleaseHandle(entry.second));
+  }
+  handles_.clear();
+  return Status::OK();
+}
+
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/function_handle_cache.h b/tensorflow/core/framework/function_handle_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..2800a598e09dc305dc65abd0283545f5493b150a
--- /dev/null
+++ b/tensorflow/core/framework/function_handle_cache.h
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_FRAMEWORK_FUNCTION_HANDLE_CACHE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_FUNCTION_HANDLE_CACHE_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/function.h"
+
+namespace tensorflow {
+namespace data {
+
+class FunctionHandleCache {
+ public:
+  explicit FunctionHandleCache(FunctionLibraryRuntime* lib);
+
+  ~FunctionHandleCache();
+
+  // Looks up the function to be instantiated in the cache first. If present,
+  // returns handle from there. Otherwise, instantiates a new function
+  // and stores handle in the cache.
+  Status Instantiate(const string& function_name, AttrSlice attrs,
+                     FunctionLibraryRuntime::InstantiateOptions options,
+                     FunctionLibraryRuntime::Handle* handle);
+
+  // Releases all the handles in the cache, clearing out the state for all
+  // functions involved.
+  Status Clear();
+
+ private:
+  mutex mu_;
+  FunctionLibraryRuntime* lib_ = nullptr;  // not owned
+  const string state_handle_;
+  std::unordered_map<string, FunctionLibraryRuntime::Handle> handles_
+      GUARDED_BY(mu_);
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_FUNCTION_HANDLE_CACHE_H_
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 44e1383719c9c903f956fca0b1ba93ec5df4adb4..75d45fa2c84ebc340dfb79b76f7b406d7a099c1f 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -495,6 +495,23 @@ MySelect(x:float) -> (z:float) {
   EXPECT_EQ(DebugString(result.nodes), e2);
 }
 
+TEST(TFunc, IntsOnDeviceArgNotSet) {
+  auto fdef = test::function::XTimesTwoInt32();
+  InstantiationResult result;
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
+  EXPECT_EQ(5, result.nodes.size());
+  EXPECT_EQ("_Retval", result.nodes[4].op());
+}
+
+TEST(TFunc, IntsOnDeviceArgSet) {
+  auto fdef = test::function::XTimesTwoInt32();
+  (*fdef.mutable_attr())["experimental_ints_on_device"].set_b(true);
+  InstantiationResult result;
+  TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
+  EXPECT_EQ(5, result.nodes.size());
+  EXPECT_EQ("_DeviceRetval", result.nodes[4].op());
+}
+
 static void HasError(const Status& s, const string& substr) {
   EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << ">>" << s << "<<, expected substring >>" << substr << "<<";
@@ -1196,6 +1213,17 @@ TEST(FunctionLibraryDefinitionTest, ToProto) {
   EXPECT_EQ(f3->DebugString(), f4->DebugString());
 }
 
+TEST(FunctionLibraryDefinitionTest, FunctionNames) {
+  FunctionDefLibrary proto;
+  *proto.add_function() = test::function::XTimesTwo();
+  *proto.add_function() = test::function::WXPlusB();
+  const FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto);
+
+  const std::vector<string> function_names = lib_def.ListFunctionNames();
+  const std::vector<string> expected = {"XTimesTwo", "WXPlusB"};
+  EXPECT_EQ(function_names, expected);
+}
+
 TEST(FunctionLibraryDefinitionTest, GetAttr_FuncNoAttr) {
   FunctionDefLibrary proto;
   *proto.add_function() = test::function::XTimesTwo();
@@ -1276,6 +1304,79 @@ TEST(FunctionLibraryDefinitionTest, GetAttr_Gradient) {
   EXPECT_EQ(annotation, false);  // WXPlusB has no custom gradient.
 }
 
+TEST(FunctionLibraryDefinitionTest, ReachableDefinitions) {
+  using ::tensorflow::test::function::GDef;
+  using ::tensorflow::test::function::NDef;
+  using FDH = ::tensorflow::FunctionDefHelper;
+
+  const auto make_simple_fdef = [](const string& name,
+                                   const string& interface_name) {
+    auto func_def = FDH::Create(
+        name, {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+        {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+        /* Mapping between function returns and function node outputs. */
+        {{"z", "output:z:0"}});
+
+    if (!interface_name.empty()) {
+      auto* attr = func_def.mutable_attr();
+      (*attr)["experimental_api_implements"].set_s(interface_name);
+    }
+    return func_def;
+  };
+
+  FunctionDef func_1 = make_simple_fdef("Func1", "");
+  FunctionDef func_2 = make_simple_fdef("Func2", "");
+  FunctionDef func_3 = make_simple_fdef("Func3", "");
+  FunctionDef func_4 = make_simple_fdef("Func4", "api_1");
+  FunctionDef func_5 = make_simple_fdef("Func5", "api_1");
+  FunctionDef func_6 = make_simple_fdef("Func6", "api_2");
+
+  FunctionDef func_2_grad = make_simple_fdef("Func2_grad", "");
+
+  constexpr char kDevice[] = "/device:CPU:0";
+
+  GraphDef graph = GDef(
+      {
+          NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+          NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+          NDef("x", "Func1", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+                {"Tout", DataTypeSlice{DT_FLOAT}},
+                {"f", FDH::FunctionRef("Func2", {{"T", DT_FLOAT}})}},
+               kDevice),
+          NDef("z", "Func4", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+      },
+      // FunctionLib
+      {func_1, func_2, func_3, func_2_grad, func_4, func_5, func_6});
+
+  // Register custom function gradient after the graph was constructed.
+  GradientDef* func3_grad_def = graph.mutable_library()->add_gradient();
+  func3_grad_def->set_function_name("Func2");
+  func3_grad_def->set_gradient_func("Func2_grad");
+
+  FunctionLibraryDefinition flib(OpRegistry::Global(), graph.library());
+
+  // - 'Func1' is called directly from the graph.
+  // - 'Func2' is called indirectly via a PartitionedCall attribute, and it also
+  //   has a custom gradient ('Func2_grad') that must remain in the library.
+  // - 'Func3' is unreachable and has to be removed from the library
+  // - 'Func4' is called directly from the graph
+  // - 'Func5' is not called directly, but it implements same interface as Func4
+  //   which is directly called.
+  // - 'Func6' is not called directly, and the interface it implements has not
+  //   not been called by another nodes in the graph.
+  FunctionLibraryDefinition reachable_flib = flib.ReachableDefinitions(graph);
+  EXPECT_EQ(reachable_flib.num_functions(), 5);
+  EXPECT_TRUE(reachable_flib.Contains("Func1"));
+  EXPECT_TRUE(reachable_flib.Contains("Func2"));
+  EXPECT_TRUE(reachable_flib.Contains("Func2_grad"));
+  EXPECT_FALSE(reachable_flib.Contains("Func3"));
+  EXPECT_TRUE(reachable_flib.Contains("Func4"));
+  EXPECT_TRUE(reachable_flib.Contains("Func5"));
+  EXPECT_FALSE(reachable_flib.Contains("Func6"));
+}
+
 // TODO(skyewm): this could be more thorough
 TEST(FunctionDefsEqualTest, TestFunctionDefsEqual) {
   // Equal functions
diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc
index f7539d37be08ce1235f35dcc0a8fd0bfcb12b434..56102db30ee96ac39023cb9508901f35df201617 100644
--- a/tensorflow/core/framework/graph_def_util.cc
+++ b/tensorflow/core/framework/graph_def_util.cc
@@ -103,7 +103,7 @@ static Status RemoveNewDefaultAttrsFromNodeDef(
         return errors::InvalidArgument(
             "Attr '", attr.first,
             "' missing in producer's OpDef: ", SummarizeOpDef(*producer_op_def),
-            " but found in node: ", SummarizeNodeDef(*node_def));
+            " but found in node: ", FormatNodeDefForError(*node_def));
       }
       // ...and it has the same value as the default in producer,
       if (producer_attr_def->has_default_value() &&
diff --git a/tensorflow/core/framework/kernel_def.proto b/tensorflow/core/framework/kernel_def.proto
index e16c2ae73bd5fb559daa0f1b8ec141479ce3d67a..358621dc0f5cc19d4687d75e97a76b9fafe3325f 100644
--- a/tensorflow/core/framework/kernel_def.proto
+++ b/tensorflow/core/framework/kernel_def.proto
@@ -33,6 +33,11 @@ message KernelDef {
   // won't be used unless the user specifies a "_kernel" attr with
   // value matching this.
   string label = 5;
+
+  // Prioritization of kernel amongst different devices. By default we assume
+  // priority is 0. The higher the priority the better. By default (i.e. if
+  // this is not set), we prefer GPU kernels over CPU.
+  int32 priority = 6;
 }
 
 // A collection of KernelDefs
diff --git a/tensorflow/core/framework/kernel_def_builder.cc b/tensorflow/core/framework/kernel_def_builder.cc
index eb86f18ff06c38860e0c24e60b42326317ddecfb..fcacc3bebbab66449f81e5fa4f3aba2565f3f18e 100644
--- a/tensorflow/core/framework/kernel_def_builder.cc
+++ b/tensorflow/core/framework/kernel_def_builder.cc
@@ -66,6 +66,11 @@ KernelDefBuilder& KernelDefBuilder::Label(const char* label) {
   return *this;
 }
 
+KernelDefBuilder& KernelDefBuilder::Priority(int32 priority) {
+  kernel_def_->set_priority(priority);
+  return *this;
+}
+
 const KernelDef* KernelDefBuilder::Build() {
   KernelDef* r = kernel_def_;
   kernel_def_ = nullptr;
diff --git a/tensorflow/core/framework/kernel_def_builder.h b/tensorflow/core/framework/kernel_def_builder.h
index 32dd21f94e0edf8b48cd2f710d1cd99038cba122..d74453cf60678d0f07e53190adba4903c120c69a 100644
--- a/tensorflow/core/framework/kernel_def_builder.h
+++ b/tensorflow/core/framework/kernel_def_builder.h
@@ -64,6 +64,9 @@ class KernelDefBuilder {
   // "_kernel" attr.  May only be specified once.  Returns *this.
   KernelDefBuilder& Label(const char* label);
 
+  // Specify a priority number for this kernel.
+  KernelDefBuilder& Priority(int32 priority);
+
   // Returns a pointer to a KernelDef with fields set based on the
   // above calls to this instance.
   // Caller takes ownership of the result.
diff --git a/tensorflow/core/framework/lookup_interface.cc b/tensorflow/core/framework/lookup_interface.cc
index bf3204ea6e283d90390be46d5753a830705619ee..117adbf65c42cd695e5e007d9249e4346b3e91b0 100644
--- a/tensorflow/core/framework/lookup_interface.cc
+++ b/tensorflow/core/framework/lookup_interface.cc
@@ -71,6 +71,14 @@ Status LookupInterface::CheckKeyAndValueTensorsForImport(const Tensor& keys,
   return CheckKeyAndValueTensorsHelper(keys, values);
 }
 
+Status LookupInterface::CheckKeyTensorForRemove(const Tensor& keys) {
+  if (keys.dtype() != key_dtype()) {
+    return errors::InvalidArgument("Key must be type ", key_dtype(),
+                                   " but got ", keys.dtype());
+  }
+  return CheckKeyShape(keys.shape());
+}
+
 Status LookupInterface::CheckFindArguments(const Tensor& key,
                                            const Tensor& default_value) {
   TF_RETURN_IF_ERROR(CheckKeyAndValueTypes(key, default_value));
diff --git a/tensorflow/core/framework/lookup_interface.h b/tensorflow/core/framework/lookup_interface.h
index 0622dd06cba9d416ed5a9c664c07007706307c8b..d33945fd1b0c44264855ed518714eb35faf4b29f 100644
--- a/tensorflow/core/framework/lookup_interface.h
+++ b/tensorflow/core/framework/lookup_interface.h
@@ -64,6 +64,17 @@ class LookupInterface : public ResourceBase {
   virtual Status Insert(OpKernelContext* ctx, const Tensor& keys,
                         const Tensor& values) = 0;
 
+  // Removes elements from the table.
+  // This method is only implemented in mutable tables that can be updated over
+  // the execution of the graph. It returns Status::NotImplemented for read-only
+  // tables that are initialized once before they can be looked up.
+
+  // Returns the following statuses:
+  // - OK: when the remove finishes successfully.
+  // - InvalidArgument: if any of the preconditions on the lookup key fails.
+  // - Unimplemented: if the table does not support removals.
+  virtual Status Remove(OpKernelContext* ctx, const Tensor& keys) = 0;
+
   // Returns the number of elements in the table.
   virtual size_t size() const = 0;
 
@@ -107,6 +118,12 @@ class LookupInterface : public ResourceBase {
   virtual Status CheckKeyAndValueTensorsForImport(const Tensor& keys,
                                                   const Tensor& values);
 
+  // Check format of the key tensor for the Remove function.
+  // Returns OK if all the following requirements are satisfied, otherwise it
+  // returns InvalidArgument:
+  // - DataType of the tensor keys equals to the table key_dtype
+  virtual Status CheckKeyTensorForRemove(const Tensor& keys);
+
   // Check the arguments of a find operation. Returns OK if all the following
   // requirements are satisfied, otherwise it returns InvalidArgument:
   // - DataType of the tensor keys equals to the table key_dtype
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index bfdb3a66583d9ea920e172cb4d5ab0a6ba21ea00..3bd5b725b860ff522dba5be86ef7ab64b387b03e 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -21,244 +21,317 @@ namespace tensorflow {
 namespace data {
 namespace model {
 
-// TODO(jsimsa): Use `Node` subclassing instead of types and node statements.
-void Model::Node::CollectTunables(
-    std::vector<std::shared_ptr<Node::Tunable>>* tunables) {
-  tf_shared_lock l(mu_);
-  for (auto input : inputs_) {
-    input->CollectTunables(tunables);
-  }
-  switch (type_) {
-    case Type::MAP_AND_BATCH:
-    case Type::PARALLEL_INTERLEAVE_V2:
-    case Type::PARALLEL_MAP: {
-      if (auto* tunable_param =
-              gtl::FindOrNull(tunable_params_, "parallelism")) {
-        tunables->push_back(*tunable_param);
-      }
-      return;
-    }
-    default:
-      return;
-  }
+std::shared_ptr<Parameter> MakeParameter(const string& name,
+                                         std::shared_ptr<SharedState> state,
+                                         int64 min, int64 max) {
+  return std::make_shared<Parameter>(name, state, min, max);
 }
 
-int64 Model::Node::GetParameterValue(const string& name) {
-  if (auto* tunable_param = gtl::FindOrNull(tunable_params_, name)) {
-    return (*tunable_param)->value;
+namespace {
+
+// The first input of InterleaveMany corresponds to the input dataset whose
+// elements are used to create the (derived) input datasets whose elements are
+// interleaved as output.
+//
+// TODO(jsimsa): model the first input
+class InterleaveMany : public Node {
+ public:
+  using Node::Node;
+
+  virtual ~InterleaveMany() {}
+
+ protected:
+  std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return std::make_shared<InterleaveMany>(
+        Args{id_, name_, std::move(output)});
   }
-  return constant_params_[name];
-}
 
-int64 Model::Node::ProcessingTimeLocked() {
-  switch (type_) {
-    case Type::BATCH:
-    case Type::MAP_AND_BATCH:
-    case Type::PADDED_BATCH: {
-      int64 batch_size = GetParameterValue("batch_size");
-      return NanosPerElementLocked() + batch_size * ProcessingTimeForInputs();
+  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    if (inputs_.size() <= 1) {
+      return NanosPerElementLocked();
     }
-    case Type::FILTER: {
-      std::shared_ptr<Node> input = inputs_.front();
-      double ratio = static_cast<double>(input->num_elements()) /
-                     static_cast<double>(num_elements_);
-      return NanosPerElementLocked() +
-             static_cast<int64>(ratio *
-                                static_cast<double>(ProcessingTimeForInputs()));
+    int64 delta = NanosPerElementLocked() * (inputs_.size() - 1);
+    input_times->back() += delta;
+    auto cleanup = gtl::MakeCleanup(
+        [input_times, delta]() { input_times->back() -= delta; });
+    int64 output_time =
+        static_cast<double>(OutputTimeForInputs(input_times) -
+                            inputs_.front()->OutputTime(input_times)) /
+        static_cast<double>(inputs_.size() - 1);
+    return NanosPerElementLocked() + output_time;
+  }
+
+  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    if (inputs_.size() <= 1) {
+      return NanosPerElementLocked();
     }
-    case Type::FLAT_MAP:
-    case Type::INTERLEAVE:
-    case Type::PARALLEL_INTERLEAVE:
-    case Type::PARALLEL_INTERLEAVE_V2: {
-      // TODO(jsimsa): model the first input
-      // TODO(jsimsa): use processing time history as a prior for future inputs
-      if (inputs_.size() <= 1) {
-        return NanosPerElementLocked();
-      }
-      int64 processing_time =
-          ProcessingTimeForInputs() - inputs_.front()->ProcessingTime();
-      return NanosPerElementLocked() +
-             static_cast<double>(processing_time) /
-                 static_cast<double>(inputs_.size() - 1);
+    int64 processing_time =
+        static_cast<double>(ProcessingTimeForInputs() -
+                            inputs_.front()->ProcessingTime()) /
+        static_cast<double>(inputs_.size() - 1);
+    return NanosPerElementLocked() + processing_time;
+  }
+};
+
+// TODO(jsimsa): model the first input
+class AsyncInterleaveMany : public Node {
+ public:
+  AsyncInterleaveMany(Node::Args args,
+                      std::vector<std::shared_ptr<Parameter>> parameters)
+      : Node(args) {
+    for (auto& parameter : parameters) {
+      parameters_[parameter->name] = std::move(parameter);
     }
-    case Type::CACHE:
-    case Type::CONCATENATE:
-    case Type::MAP:
-    case Type::PARALLEL_MAP:
-    case Type::PREFETCH:
-      // TODO(jsimsa): use processing time history as a prior for future inputs
-    case Type::REPEAT:
-    case Type::SHUFFLE:
-    case Type::SKIP:
-    case Type::TAKE:
-    case Type::ZIP: {
-      return NanosPerElementLocked() + ProcessingTimeForInputs();
+  }
+
+  virtual ~AsyncInterleaveMany() {}
+
+ protected:
+  std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    std::vector<std::shared_ptr<Parameter>> parameters;
+    for (auto& pair : parameters_) {
+      parameters.push_back(pair.second);
     }
-    default:
-      return NanosPerElementLocked();
+    return std::make_shared<AsyncInterleaveMany>(
+        Args{id_, name_, std::move(output)}, parameters);
   }
-}
 
-int64 Model::Node::OutputTimeLocked(std::vector<int64>* input_times) {
-  switch (type_) {
-    case Type::BATCH:
-    case Type::PADDED_BATCH: {
-      double batch_size = GetParameterValue("batch_size");
-      int64 old_value = (*input_times)[input_times->size() - 1];
-      (*input_times)[input_times->size() - 1] = static_cast<int64>(
-          static_cast<double>(old_value + NanosPerElementLocked()) /
-          batch_size);
-      auto cleanup = gtl::MakeCleanup([input_times, old_value]() {
-        (*input_times)[input_times->size() - 1] = old_value;
-      });
-      return NanosPerElementLocked() +
-             batch_size * OutputTimeForInputs(input_times);
+  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    if (inputs_.size() <= 1) {
+      return NanosPerElementLocked();
     }
-    case Type::FILTER: {
-      std::shared_ptr<Node> input = inputs_.front();
-      int64 old_value = (*input_times)[input_times->size() - 1];
-      double ratio = static_cast<double>(input->num_elements()) /
-                     static_cast<double>(num_elements_);
-      (*input_times)[input_times->size() - 1] = static_cast<int64>(
-          static_cast<double>(old_value + NanosPerElementLocked()) / ratio);
-      auto cleanup = gtl::MakeCleanup([input_times, old_value]() {
-        (*input_times)[input_times->size() - 1] = old_value;
-      });
-      return NanosPerElementLocked() +
-             static_cast<int64>(
-                 static_cast<double>(OutputTimeForInputs(input_times)) * ratio);
+    int64 old_input_time = input_times->back();
+    int64 new_input_time = static_cast<double>(NanosPerElementLocked()) *
+                           static_cast<double>(inputs_.size() - 1);
+    input_times->push_back(new_input_time);
+    auto cleanup =
+        gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
+    double parallelism = inputs_.size() - 1;  // default to cycle length
+    if (auto* parameter = gtl::FindOrNull(parameters_, "parallelism")) {
+      parallelism = std::min(static_cast<int>(parallelism),
+                             static_cast<int>((*parameter)->value));
     }
-    case Type::FLAT_MAP:
-    case Type::INTERLEAVE: {
-      // TODO(jsimsa): model the first input
-      // TODO(jsimsa): use cycle length metadata instead of `inputs_.size() - 1`
-      if (inputs_.size() <= 1) {
-        return NanosPerElementLocked();
-      }
-      int64 delta =
-          static_cast<int64>(static_cast<double>(NanosPerElementLocked()) *
-                             static_cast<double>(inputs_.size() - 1));
-      (*input_times)[input_times->size() - 1] += delta;
-      auto cleanup = gtl::MakeCleanup([input_times, delta]() {
-        (*input_times)[input_times->size() - 1] -= delta;
-      });
-      int64 output_time = OutputTimeForInputs(input_times) -
-                          inputs_.front()->OutputTime(input_times);
-      return NanosPerElementLocked() +
-             static_cast<double>(output_time) /
-                 static_cast<double>(inputs_.size() - 1);
+    int64 output_time =
+        static_cast<double>(OutputTimeForInputs(input_times) -
+                            inputs_.front()->OutputTime(input_times)) /
+        static_cast<double>(inputs_.size() - 1) / parallelism;
+    return std::max(0LL,
+                    NanosPerElementLocked() + output_time - old_input_time);
+  }
+
+  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    if (inputs_.size() <= 1) {
+      return NanosPerElementLocked();
     }
-    case Type::MAP_AND_BATCH: {
-      double batch_size = GetParameterValue("batch_size");
-      double parallelism = GetParameterValue("parallelism");
-      int64 delta =
-          static_cast<int64>(static_cast<double>(NanosPerElementLocked()) /
-                             (batch_size * parallelism));
-      input_times->push_back(delta);
-      auto cleanup =
-          gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
-      int64 output_time = static_cast<int64>(
-          static_cast<double>(NanosPerElementLocked()) / parallelism +
-          batch_size * OutputTimeForInputs(input_times));
-      return std::max(0LL,
-                      output_time - input_times->at(input_times->size() - 2));
+    int64 processing_time =
+        ProcessingTimeForInputs() - inputs_.front()->ProcessingTime();
+    return NanosPerElementLocked() +
+           static_cast<double>(processing_time) /
+               static_cast<double>(inputs_.size() - 1);
+  }
+};
+
+class KnownRatio : public Node {
+ public:
+  KnownRatio(Node::Args args, int64 ratio) : Node(args), ratio_(ratio) {}
+
+  virtual ~KnownRatio() {}
+
+ protected:
+  std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return std::make_shared<KnownRatio>(Args{id_, name_, std::move(output)},
+                                        ratio_);
+  }
+
+  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    if (ratio_ == 0) {
+      return NanosPerElementLocked();
     }
-    case Type::PARALLEL_INTERLEAVE: {
-      // TODO(jsimsa): model the first input
-      if (inputs_.size() <= 1) {
-        return NanosPerElementLocked();
-      }
-      int64 delta = static_cast<double>(NanosPerElementLocked()) *
-                    static_cast<double>(inputs_.size() - 1);
-      input_times->push_back(delta);
-      auto cleanup =
-          gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
-      int64 inputs_output_time = OutputTimeForInputs(input_times) -
-                                 inputs_.front()->OutputTime(input_times);
-      double parallelism = GetParameterValue("parallelism");
-      int64 output_time =
-          NanosPerElementLocked() + ((static_cast<double>(inputs_output_time) /
-                                      static_cast<double>(inputs_.size() - 1)) /
-                                     parallelism);
-      return std::max(0LL,
-                      output_time - input_times->at(input_times->size() - 2));
+    int64 old_input_time = input_times->back();
+    input_times->back() += static_cast<int64>(
+        static_cast<double>(old_input_time + NanosPerElementLocked()) / ratio_);
+    auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() {
+      input_times->back() = old_input_time;
+    });
+    return NanosPerElementLocked() + ratio_ * OutputTimeForInputs(input_times);
+  }
+
+  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    return NanosPerElementLocked() + ratio_ * ProcessingTimeForInputs();
+  }
+
+ private:
+  const double ratio_;
+};
+
+class AsyncKnownRatio : public Node {
+ public:
+  AsyncKnownRatio(Node::Args args, double ratio,
+                  std::vector<std::shared_ptr<Parameter>> parameters)
+      : Node(args), ratio_(ratio) {
+    for (auto& parameter : parameters) {
+      parameters_[parameter->name] = std::move(parameter);
     }
-    case Type::PARALLEL_INTERLEAVE_V2: {
-      // TODO(jsimsa): model the first input
-      if (inputs_.size() <= 1) {
-        return NanosPerElementLocked();
-      }
-      int64 delta = static_cast<double>(NanosPerElementLocked()) *
-                    static_cast<double>(inputs_.size() - 1);
-      input_times->push_back(delta);
-      auto cleanup =
-          gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
-      int64 inputs_output_time = OutputTimeForInputs(input_times) -
-                                 inputs_.front()->OutputTime(input_times);
-      double parallelism =
-          std::min(static_cast<int>(GetParameterValue("cycle_length")),
-                   static_cast<int>(GetParameterValue("parallelism")));
-      int64 output_time =
-          NanosPerElementLocked() + ((static_cast<double>(inputs_output_time) /
-                                      static_cast<double>(inputs_.size() - 1)) /
-                                     parallelism);
-      return std::max(0LL,
-                      output_time - input_times->at(input_times->size() - 2));
+  }
+
+  virtual ~AsyncKnownRatio() {}
+
+ protected:
+  std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    std::vector<std::shared_ptr<Parameter>> parameters;
+    for (auto& pair : parameters_) {
+      parameters.push_back(pair.second);
     }
-    case Type::PARALLEL_MAP: {
-      double parallelism =
-          std::min(port::NumSchedulableCPUs(),
-                   static_cast<int>(GetParameterValue("parallelism")));
-      int64 delta = static_cast<int64>(
-          static_cast<double>(NanosPerElementLocked()) / parallelism);
-      input_times->push_back(delta);
-      auto cleanup =
-          gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
-      int64 output_time =
-          static_cast<double>(NanosPerElementLocked()) / parallelism +
-          OutputTimeForInputs(input_times);
-      return std::max(0LL,
-                      output_time - input_times->at(input_times->size() - 2));
+    return std::make_shared<AsyncKnownRatio>(
+        Args{id_, name_, std::move(output)}, ratio_, parameters);
+  }
+
+  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    double parallelism = 1.0;
+    if (auto* parameter = gtl::FindOrNull(parameters_, "parallelism")) {
+      parallelism = (*parameter)->value;
     }
-    case Type::PREFETCH: {
-      int64 delta = NanosPerElementLocked();
-      input_times->push_back(delta);
-      auto cleanup =
-          gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
-      return std::max(0LL, NanosPerElementLocked() +
-                               OutputTimeForInputs(input_times) -
-                               input_times->at(input_times->size() - 2));
+    if (ratio_ == 0.0) {
+      int64 output_time =
+          static_cast<double>(NanosPerElementLocked()) / parallelism;
+      return std::max(0LL, output_time - input_times->back());
     }
-    case Type::CACHE:
-    case Type::CONCATENATE:
-    case Type::MAP:
-    case Type::REPEAT:
-    case Type::SHUFFLE:
-    case Type::SKIP:
-    case Type::TAKE:
-    case Type::ZIP: {
-      int64 delta = NanosPerElementLocked();
-      (*input_times)[input_times->size() - 1] += delta;
-      auto cleanup = gtl::MakeCleanup([input_times, delta]() {
-        (*input_times)[input_times->size() - 1] -= delta;
-      });
-      return NanosPerElementLocked() + OutputTimeForInputs(input_times);
+    int64 old_input_time = input_times->back();
+    int64 new_input_time = static_cast<int64>(
+        static_cast<double>(NanosPerElementLocked()) / ratio_ / parallelism);
+    input_times->push_back(new_input_time);
+    auto cleanup =
+        gtl::MakeCleanup([input_times]() { input_times->pop_back(); });
+    int64 output_time = static_cast<int64>(
+        static_cast<double>(NanosPerElementLocked()) / parallelism +
+        ratio_ * OutputTimeForInputs(input_times));
+    return std::max(0LL, output_time - old_input_time);
+  }
+
+  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    return NanosPerElementLocked() + ratio_ * ProcessingTimeForInputs();
+  }
+
+ private:
+  const double ratio_;
+};
+
+class UnknownRatio : public Node {
+ public:
+  using Node::Node;
+
+  virtual ~UnknownRatio() {}
+
+ protected:
+  std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return std::make_shared<UnknownRatio>(Args{id_, name_, std::move(output)});
+  }
+
+  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    if (num_elements_ == 0 || inputs_.empty() ||
+        inputs_.front()->num_elements() == 0) {
+      return NanosPerElementLocked();
     }
-    default:
+    // TODO(jsimsa): The current implementation assumes that the number of input
+    // elements consumed per output is the same across all inputs.
+    std::shared_ptr<Node> input = inputs_.front();
+    double ratio = static_cast<double>(input->num_elements()) /
+                   static_cast<double>(num_elements_);
+    int64 old_input_time = input_times->back();
+    input_times->back() =
+        static_cast<double>(old_input_time + NanosPerElementLocked()) / ratio;
+    auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() {
+      input_times->back() = old_input_time;
+    });
+    return NanosPerElementLocked() +
+           static_cast<int64>(
+               ratio * static_cast<double>(OutputTimeForInputs(input_times)));
+  }
+
+  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    if (inputs_.empty() || num_elements_ == 0) {
       return NanosPerElementLocked();
+    }
+    // TODO(jsimsa): The current implementation that the number of input
+    // elements consumed per output is the same across all inputs.
+    std::shared_ptr<Node> input = inputs_.front();
+    double ratio = static_cast<double>(input->num_elements()) /
+                   static_cast<double>(num_elements_);
+    return NanosPerElementLocked() +
+           static_cast<int64>(ratio *
+                              static_cast<double>(ProcessingTimeForInputs()));
   }
-}
+};
 
-void Model::AddConstantParameter(const string& node_name,
-                                 const string& parameter_name, int64 value) {
-  tf_shared_lock l(mu_);
-  auto node = gtl::FindOrNull(lookup_table_, node_name);
-  if (node) {
-    (*node)->add_constant_param(parameter_name, value);
+class Unknown : public Node {
+ public:
+  using Node::Node;
+
+  virtual ~Unknown() {}
+
+ protected:
+  std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return std::make_shared<Unknown>(Args{id_, name_, std::move(output)});
+  }
+
+  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return OutputTimeForInputs(input_times);
+  }
+
+  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    return ProcessingTimeForInputs();
   }
+};
+
+}  // namespace
+
+std::shared_ptr<Node> MakeInterleaveManyNode(Node::Args args) {
+  return std::make_shared<InterleaveMany>(std::move(args));
 }
 
-void Model::AddNode(const string& name, const string& output_name) {
+std::shared_ptr<Node> MakeAsyncInterleaveManyNode(
+    Node::Args args, std::vector<std::shared_ptr<Parameter>> parameters) {
+  return std::make_shared<AsyncInterleaveMany>(std::move(args),
+                                               std::move(parameters));
+}
+
+std::shared_ptr<Node> MakeKnownRatioNode(Node::Args args, double ratio) {
+  return std::make_shared<KnownRatio>(std::move(args), ratio);
+}
+
+std::shared_ptr<Node> MakeAsyncKnownRatioNode(
+    Node::Args args, double ratio,
+    std::vector<std::shared_ptr<Parameter>> parameters) {
+  return std::make_shared<AsyncKnownRatio>(std::move(args), ratio,
+                                           std::move(parameters));
+}
+
+std::shared_ptr<Node> MakeSourceNode(Node::Args args) {
+  return MakeKnownRatioNode(std::move(args), 0);
+}
+
+std::shared_ptr<Node> MakeUnknownRatioNode(Node::Args args) {
+  return std::make_shared<UnknownRatio>(std::move(args));
+}
+
+std::shared_ptr<Node> MakeUnknownNode(Node::Args args) {
+  return std::make_shared<Unknown>(std::move(args));
+}
+
+std::shared_ptr<Node> Model::AddNode(Node::Factory factory, const string& name,
+                                     const string& output_name) {
   // The name captures the sequence of iterators joined by `::`. We use the full
   // sequence as the key in the lookup table, but only the last element of the
   // sequence as the name node.
@@ -276,7 +349,7 @@ void Model::AddNode(const string& name, const string& output_name) {
   if (it != lookup_table_.end()) {
     output = it->second;
   }
-  std::shared_ptr<Node> node(new Node(id_counter_++, tokens.back(), output));
+  std::shared_ptr<Node> node = factory({id_counter_++, tokens.back(), output});
   if (!output_) {
     output_ = node;
   }
@@ -284,6 +357,7 @@ void Model::AddNode(const string& name, const string& output_name) {
     output->add_input(node);
   }
   lookup_table_.insert(std::make_pair(name, node));
+  return node;
 }
 
 void Model::AddProcessingTime(const string& name, int64 delta) {
@@ -294,16 +368,6 @@ void Model::AddProcessingTime(const string& name, int64 delta) {
   }
 }
 
-void Model::AddTunableParameter(const string& node_name,
-                                const string& parameter_name,
-                                std::shared_ptr<SharedState> state, int64 min,
-                                int64 max) {
-  tf_shared_lock l(mu_);
-  auto node = *gtl::FindOrNull(lookup_table_, node_name);
-  DCHECK(node);
-  node->add_tunable_param(parameter_name, std::move(state), min, max);
-}
-
 // The optimization algorithm starts by setting all tunable parallelism
 // parameters to 1. It then repeatedly identifies the parameter whose increase
 // in parallelism decreases the output time the most. This process is repeated
@@ -311,55 +375,58 @@ void Model::AddTunableParameter(const string& node_name,
 // is less than or equal to the processing time needed to produce an element
 // divided by CPU budget.
 void Model::Optimize(int64 cpu_budget) {
-  std::vector<std::shared_ptr<Model::Node::Tunable>> tunables;
+  std::shared_ptr<Node> snapshot;
   {
     tf_shared_lock lock(mu_);
-    const int64 processing_time = ProcessingTime();
-    tunables = CollectTunables();
-    for (auto tunable : tunables) {
-      tunable->value = 1;
-    }
-    while (true) {
-      const int64 output_time = OutputTime();
-      bool all_tunables = true;
-      for (auto& tunable : tunables) {
-        if (tunable->value < tunable->max) {
-          all_tunables = false;
-          break;
-        }
-      }
-      if (output_time < processing_time / cpu_budget || all_tunables) {
+    snapshot = output_->Snapshot(nullptr);
+  }
+  const int64 processing_time = ProcessingTime(snapshot);
+  auto parameters = CollectTunableParameters(snapshot);
+  for (auto& parameter : parameters) {
+    parameter->value = 1;
+  }
+  while (true) {
+    const int64 output_time = OutputTime(snapshot);
+    bool all_max = true;
+    for (auto& parameter : parameters) {
+      if (parameter->value < parameter->max) {
+        all_max = false;
         break;
       }
-      int64 best_delta = -1;
-      Model::Node::Tunable* best_tunable = nullptr;
-      for (auto& tunable : tunables) {
-        if (tunable->value == tunable->max) {
-          continue;
-        }
-        tunable->value++;
-        int64 delta = output_time - OutputTime();
-        if (delta > best_delta) {
-          best_delta = delta;
-          best_tunable = tunable.get();
-        }
-        tunable->value--;
+    }
+    if (output_time < processing_time / cpu_budget || all_max) {
+      break;
+    }
+    int64 best_delta = -1;
+    Parameter* best_parameter = nullptr;
+    for (auto& parameter : parameters) {
+      if (parameter->value == parameter->max) {
+        continue;
       }
-      if (!best_tunable) {
-        // NOTE: This can happen because we are performing the optimization
-        // while the model data is changing. If this becomes an issue, we should
-        // look into performing the optimization using a model snapshot.
-        break;
+      parameter->value++;
+      int64 delta = output_time - OutputTime(snapshot);
+      if (delta > best_delta) {
+        best_delta = delta;
+        best_parameter = parameter.get();
       }
-      best_tunable->value++;
+      parameter->value--;
     }
+    if (!best_parameter) {
+      // This should never happen because we are using a model snapshot and
+      // the output time is monotonically decreasing w.r.t. parallelism.
+      LOG(WARNING) << "Failed to find a tunable parameter that would "
+                      "decrease the output time, aborting the current "
+                      "optimization attempt.";
+      return;
+    }
+    best_parameter->value++;
   }
-  VLOG(2) << "Number of knobs: " << tunables.size();
-  for (auto& tunable : tunables) {
-    VLOG(2) << "Setting tunable parameter: " << tunable->value;
-    mutex_lock l(*tunable->state->mu);
-    tunable->state->value = tunable->value;
-    tunable->state->cond_var->notify_all();
+  VLOG(2) << "Number of tunable parameters: " << parameters.size();
+  for (auto& parameter : parameters) {
+    VLOG(2) << "Setting tunable parameter: " << parameter->value;
+    mutex_lock l(*parameter->state->mu);
+    parameter->state->value = parameter->value;
+    parameter->state->cond_var->notify_all();
   }
 }
 
@@ -375,10 +442,11 @@ void Model::RecordStart(const string& name, bool stop_output) {
   tf_shared_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
   if (node) {
+    int64 now_nanos = Env::Default()->NowNanos();
     if (stop_output && (*node)->output()) {
-      (*node)->output()->record_stop();
+      (*node)->output()->record_stop(now_nanos);
     }
-    (*node)->record_start();
+    (*node)->record_start(now_nanos);
   }
 }
 
@@ -386,9 +454,10 @@ void Model::RecordStop(const string& name, bool start_output) {
   tf_shared_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
   if (node) {
-    (*node)->record_stop();
+    int64 now_nanos = Env::Default()->NowNanos();
+    (*node)->record_stop(now_nanos);
     if (start_output && (*node)->output()) {
-      (*node)->output()->record_start();
+      (*node)->output()->record_start(now_nanos);
     }
   }
 }
@@ -402,18 +471,21 @@ void Model::RemoveNode(const string& name) {
   lookup_table_.erase(name);
 }
 
-std::vector<std::shared_ptr<Model::Node::Tunable>> Model::CollectTunables() {
-  std::vector<std::shared_ptr<Model::Node::Tunable>> tunables;
-  output_->CollectTunables(&tunables);
-  return tunables;
+std::vector<std::shared_ptr<Parameter>> Model::CollectTunableParameters(
+    std::shared_ptr<Node> node) {
+  std::vector<std::shared_ptr<Parameter>> parameters;
+  node->CollectTunableParameters(&parameters);
+  return parameters;
 }
 
-int64 Model::OutputTime() {
+int64 Model::OutputTime(std::shared_ptr<Node> node) {
   std::vector<int64> input_times(1, 0);
-  return output_->OutputTime(&input_times);
+  return node->OutputTime(&input_times);
 }
 
-int64 Model::ProcessingTime() { return output_->ProcessingTime(); }
+int64 Model::ProcessingTime(std::shared_ptr<Node> node) {
+  return node->ProcessingTime();
+}
 
 }  // namespace model
 }  // namespace data
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index eae0fa70e806a2dd381e29d35bdf9e675b34c4a0..10059bbfd5a89a3b24ce3daf981408564a5351b2 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -18,7 +18,8 @@ limitations under the License.
 #include <list>
 #include <memory>
 #include <string>
-#include <thread>  // (b/114492873): move this include into core/platform
+// TODO(b/114492873): Move this include into core/platform.
+#include <thread>  // NOLINT
 #include <utility>
 #include <vector>
 
@@ -37,15 +38,300 @@ namespace model {
 // the performance model.
 struct SharedState {
  public:
-  explicit SharedState(int64 value, std::shared_ptr<mutex> mu,
-                       std::shared_ptr<condition_variable> cond_var)
+  SharedState(int64 value, std::shared_ptr<mutex> mu,
+              std::shared_ptr<condition_variable> cond_var)
       : value(value), mu(std::move(mu)), cond_var(std::move(cond_var)) {}
 
+  int64 value;
   std::shared_ptr<mutex> mu;
   std::shared_ptr<condition_variable> cond_var;
+  bool tunable = false;
+};
+
+// Represents a parameter.
+struct Parameter {
+  Parameter(const string& name, std::shared_ptr<SharedState> state, int64 min,
+            int64 max)
+      : name(name),
+        value(state->value),
+        min(min),
+        max(max),
+        state(std::move(state)) {}
+
+  // Human-readable name of the parameter.
+  string name;
+
+  // Identifies the model value of the parameter. This can be different from
+  // the actual value (e.g. during optimization search).
   int64 value;
+
+  // Identifies the minimum value of the parameter.
+  int64 min;
+
+  // Identifies the maximum value of the parameter.
+  int64 max;
+
+  // Shared state of the parameter.
+  std::shared_ptr<SharedState> state;
+};
+
+std::shared_ptr<Parameter> MakeParameter(const string& name,
+                                         std::shared_ptr<SharedState> state,
+                                         int64 min, int64 max);
+
+// Abstract representation of a TensorFlow input pipeline node. It collects
+// information about inputs to this node, processing time spent executing the
+// node logic, number of elements produced by the node, various other
+// information (e.g. batch size or execution parallelism).
+//
+// Developers of tf.data transformations are not expected to interact with
+// this class directly. Boiler plate code for creating the abstract
+// representation of the input pipeline and collecting common information has
+// been added to the implementation of `DatasetBase` and `DatasetBaseIterator`
+// respectively.
+//
+// In addition, `DatasetBaseIterator` provides wrappers that can be used for
+// transformation-specific information collection. The `SetMetadata` wrapper
+// can be used to pass arbitrary metadata to the modeling framework, while the
+// `StartWork` and `StopWork` wrappers should be used to correctly account for
+// processing time of multi-threaded transformation that yield the CPU; such
+// transformations should invoke `StartWork()` when a transformation thread
+// starts executing (e.g. when created or woken up) and `StopWork()` when a
+// transformation thread stops executing (e.g. when returning or waiting).
+class Node {
+ public:
+  // Arguments for `Node` constructor.
+  struct Args {
+    int64 id;
+    string name;
+    std::shared_ptr<Node> output;
+  };
+
+  using Factory = std::function<std::shared_ptr<Node>(Args)>;
+
+  explicit Node(Args args)
+      : id_(args.id), name_(args.name), output_(args.output.get()) {}
+
+  // Increments the bytes buffered by the given delta.
+  void add_buffered_bytes(int64 delta) LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    buffered_bytes_ += delta;
+  }
+
+  // Adds an input.
+  void add_input(std::shared_ptr<Node> node) LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    inputs_.push_back(node);
+  }
+
+  // Increments the aggregate processing time by the given delta.
+  void add_processing_time(int64 delta) LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    processing_time_ += delta;
+  }
+
+  // Returns the number of bytes stored in this node's buffer.
+  int64 buffered_bytes() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    return buffered_bytes_;
+  }
+
+  // Returns the unique node ID.
+  int64 id() const LOCKS_EXCLUDED(mu_) { return id_; }
+
+  // Returns the node inputs.
+  std::list<std::shared_ptr<Node>> inputs() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    return inputs_;
+  }
+
+  // Returns the node name.
+  const string& name() const { return name_; }
+
+  // Returns the number of elements produced by the node.
+  int64 num_elements() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    return num_elements_;
+  }
+
+  // Returns the node output.
+  Node* output() const { return output_; }
+
+  // Returns the aggregate processing time.
+  int64 processing_time() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    return processing_time_;
+  }
+
+  // Records that the node produced an element.
+  void record_element() LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    num_elements_++;
+  }
+
+  // Records that a node thread has started executing.
+  void record_start(int64 time_nanos) LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    work_start_[std::this_thread::get_id()] = time_nanos;
+  }
+
+  // Records that a node thread has stopped executing.
+  void record_stop(int64 time_nanos) LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    std::thread::id tid = std::this_thread::get_id();
+    auto iter = work_start_.find(tid);
+    if (iter != work_start_.end()) {
+      processing_time_ += time_nanos - iter->second;
+      work_start_.erase(iter);
+    } else {
+      LOG(WARNING)
+          << "Encountered a stop event that was not preceded by a start event.";
+    }
+  }
+
+  // Removes an input.
+  void remove_input(std::shared_ptr<Node> input) LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    inputs_.remove(input);
+  }
+
+  // Collects tunable parameters in the subtree rooted in this node.
+  void CollectTunableParameters(
+      std::vector<std::shared_ptr<Parameter>>* parameters) const
+      LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    for (auto& pair : parameters_) {
+      if (pair.second->state->tunable) {
+        parameters->push_back(pair.second);
+      }
+    }
+    for (auto& input : inputs_) {
+      input->CollectTunableParameters(parameters);
+    }
+  }
+
+  // Returns the per-element output time for this node.
+  int64 OutputTime(std::vector<int64>* input_times) const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    return OutputTimeLocked(input_times);
+  }
+
+  // Returns the per-element processing time spent in the subtree rooted in
+  // this node.
+  int64 ProcessingTime() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    return ProcessingTimeLocked();
+  }
+
+  // Returns a copy of this node, making a deep copy of its inputs and a
+  // shallow copy of its tunable parameters.
+  //
+  // The purpose for this method is to allow the model optimization logic to
+  // operate over immutable state while allowing concurrent model updates.
+  std::shared_ptr<Node> Snapshot(std::shared_ptr<Node> output)
+      LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    std::shared_ptr<Node> result = Clone(output);
+    result->buffered_bytes_ = buffered_bytes_;
+    result->processing_time_ = processing_time_;
+    result->num_elements_ = num_elements_;
+    result->parameters_ = parameters_;
+    for (auto& input : inputs_) {
+      result->add_input(input->Snapshot(result));
+    }
+    return result;
+  }
+
+ protected:
+  // Creates a clone of this node.
+  virtual std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const
+      SHARED_LOCKS_REQUIRED(mu_) = 0;
+
+  // Returns the per-element processing time spent in this node.
+  int64 NanosPerElementLocked() const SHARED_LOCKS_REQUIRED(mu_) {
+    if (num_elements_ == 0) {
+      return 0;
+    }
+    return static_cast<int64>(static_cast<double>(processing_time_) /
+                              static_cast<double>(num_elements_));
+  }
+
+  // Returns the sum of per-element output time for the inputs of this node.
+  int64 OutputTimeForInputs(std::vector<int64>* input_times) const
+      SHARED_LOCKS_REQUIRED(mu_) {
+    int64 sum = 0;
+    for (auto& input : inputs_) {
+      sum += input->OutputTime(input_times);
+    }
+    return sum;
+  }
+
+  // Returns the per-element output time for this node.
+  virtual int64 OutputTimeLocked(std::vector<int64>* input_times) const
+      SHARED_LOCKS_REQUIRED(mu_) = 0;
+
+  // Returns the sum of per-element processing time for the inputs of this node.
+  //
+  // TODO(jsimsa): use processing time history as a prior for future inputs
+  int64 ProcessingTimeForInputs() const SHARED_LOCKS_REQUIRED(mu_) {
+    int64 sum = 0;
+    for (auto& input : inputs_) {
+      sum += input->ProcessingTime();
+    }
+    return sum;
+  }
+
+  // Returns the per-element processing time spent in the subtree rooted in
+  // this node.
+  virtual int64 ProcessingTimeLocked() const SHARED_LOCKS_REQUIRED(mu_) = 0;
+
+  mutable mutex mu_;
+  const int64 id_;
+  const string name_;
+  int64 buffered_bytes_ GUARDED_BY(mu_) = 0;
+  int64 processing_time_ GUARDED_BY(mu_) = 0;
+  int64 num_elements_ GUARDED_BY(mu_) = 0;
+  std::map<std::thread::id, int64> work_start_ GUARDED_BY(mu_);
+  std::map<string, std::shared_ptr<Parameter>> parameters_ GUARDED_BY(mu_);
+  std::list<std::shared_ptr<Node>> inputs_ GUARDED_BY(mu_);
+
+  // The reference to the output node is not owned so that that deletion of a
+  // node results in recursive deletion of the subtree rooted in the node.
+  Node* const output_;
 };
 
+// InterleaveMany is used to model datasets whose inputs are used to create
+// datasets whose elements are then interleaved.
+std::shared_ptr<Node> MakeInterleaveManyNode(Node::Args args);
+
+// AsyncInterleaveMany nodes are the asynchronous version of InterleaveMany
+// nodes.
+std::shared_ptr<Node> MakeAsyncInterleaveManyNode(
+    Node::Args args, std::vector<std::shared_ptr<Parameter>> parameters);
+
+// KnownMany nodes model datasets that synchronously consume known number of
+// input element per output element.
+std::shared_ptr<Node> MakeKnownRatioNode(Node::Args args, double ratio);
+
+// AsyncKnownRatio nodes are the asynchronous version of KnownRate nodes.
+std::shared_ptr<Node> MakeAsyncKnownRatioNode(
+    Node::Args args, double ratio,
+    std::vector<std::shared_ptr<Parameter>> parameters);
+
+// Source nodes represent data sources.
+std::shared_ptr<Node> MakeSourceNode(Node::Args args);
+
+// UnknownMany nodes represent datasets that synchronously consume an
+// unknown number of input elements per output.
+//
+// Unlike KnownRatio nodes which expect the ratio between inputs and outputs is
+// specified as a parameter, UnknownRatio estimates the ratio empirically.
+std::shared_ptr<Node> MakeUnknownRatioNode(Node::Args args);
+
+// Unknown nodes represent datasets for which we do not have a model. It acts
+// as pass-through between inputs and output.
+std::shared_ptr<Node> MakeUnknownNode(Node::Args args);
+
 // Abstract representation of a TensorFlow input pipeline that can be used
 // for collecting runtime information and optimizing performance. It collects
 // runtime information about execution of the input pipeline that is used to
@@ -60,24 +346,13 @@ class Model {
  public:
   Model() = default;
 
-  // Adds a constant parameter for the given node.
-  void AddConstantParameter(const string& node_name,
-                            const string& parameter_name, int64 value)
-      LOCKS_EXCLUDED(mu_);
-
-  // Adds a node with the given name and given output (identified by name).
-  void AddNode(const string& name, const string& output_name)
-      LOCKS_EXCLUDED(mu_);
+  // Adds a node with the given name and given output.
+  std::shared_ptr<Node> AddNode(Node::Factory factory, const string& name,
+                                const string& output_name) LOCKS_EXCLUDED(mu_);
 
   // Increments the processing time for the given node..
   void AddProcessingTime(const string& name, int64 delta) LOCKS_EXCLUDED(mu_);
 
-  // Adds a tunable parameter for the given node.
-  void AddTunableParameter(const string& node_name,
-                           const string& parameter_name,
-                           std::shared_ptr<SharedState> value, int64 min,
-                           int64 max) LOCKS_EXCLUDED(mu_);
-
   // Runs optimization.
   void Optimize(int64 cpu_budget) LOCKS_EXCLUDED(mu_);
 
@@ -96,301 +371,15 @@ class Model {
   void RemoveNode(const string& name) LOCKS_EXCLUDED(mu_);
 
  private:
-  // Abstract representation of a TensorFlow input pipeline node. It collects
-  // information about inputs to this node, processing time spent executing the
-  // node logic, number of elements produced by the node, various other
-  // information (e.g. batch size or execution parallelism).
-  //
-  // Developers of tf.data transformations are not expected to interact with
-  // this class directly. Boiler plate code for creating the abstract
-  // representation of the input pipeline and collecting common information has
-  // been added to the implementation of `DatasetBase` and `DatasetBaseIterator`
-  // respectively.
-  //
-  // In addition, `DatasetBaseIterator` provides wrappers that can be used for
-  // transformation-specific information collection. The `SetMetadata` wrapper
-  // can be used to pass arbitrary metadata to the modeling framework, while the
-  // `StartWork` and `StopWork` wrappers should be used to correctly account for
-  // processing time of multi-threaded transformation that yield the CPU; such
-  // transformations should invoke `StartWork()` when a transformation thread
-  // starts executing (e.g. when created or woken up) and `StopWork()` when a
-  // transformation thread stops executing (e.g. when returning or waiting).
-  //
-  // TODO(jsimsa): Create an API to capture the abstract semantics of each
-  // tf.data transformation and replace switch-case blocks with inheritance.
-  class Node {
-   public:
-    // Represents a tunable parameter.
-    struct Tunable {
-      Tunable(std::shared_ptr<SharedState> state, int64 min, int64 max)
-          : value(state->value), min(min), max(max), state(std::move(state)) {}
-
-      // Identifies the model value of the parameter. This can be different from
-      // the actual value (e.g. during optimization search).
-      int64 value;
-
-      // Identifies the minimum value of the parameter.
-      int64 min;
-
-      // Identifies the maximum value of the parameter.
-      int64 max;
-
-      // Shared state of the parameter.
-      std::shared_ptr<SharedState> state;
-    };
-
-    Node(int64 id, const string& name, std::shared_ptr<Node> output)
-        : id_(id), name_(name), type_(TypeFromName(name)), output_(output) {}
-
-    // Adds a constant parameter.
-    void add_constant_param(const string& name, int64 value)
-        LOCKS_EXCLUDED(mu_) {
-      mutex_lock l(mu_);
-      constant_params_[name] = value;
-    }
-
-    // Adds an input.
-    void add_input(std::shared_ptr<Node> node) LOCKS_EXCLUDED(mu_) {
-      mutex_lock l(mu_);
-      inputs_.push_back(node);
-    }
-
-    // Increments the aggregate processing time by the given delta.
-    void add_processing_time(int64 delta) LOCKS_EXCLUDED(mu_) {
-      mutex_lock l(mu_);
-      processing_time_ += delta;
-    }
-
-    // Adds a tunable parameter.
-    void add_tunable_param(const string& name,
-                           std::shared_ptr<SharedState> state, int64 min,
-                           int64 max) LOCKS_EXCLUDED(mu_) {
-      mutex_lock l(mu_);
-      tunable_params_[name] =
-          std::make_shared<Tunable>(std::move(state), min, max);
-    }
-
-    // Returns the unique node ID.
-    int64 id() LOCKS_EXCLUDED(mu_) { return id_; }
-
-    // Returns the node inputs.
-    std::list<std::shared_ptr<Node>> inputs() LOCKS_EXCLUDED(mu_) {
-      tf_shared_lock l(mu_);
-      return inputs_;
-    }
-
-    // Returns the node name.
-    const string& name() LOCKS_EXCLUDED(mu_) {
-      tf_shared_lock l(mu_);
-      return name_;
-    }
-
-    // Returns the number of elements produced by the node.
-    int64 num_elements() LOCKS_EXCLUDED(mu_) {
-      tf_shared_lock l(mu_);
-      return num_elements_;
-    }
-
-    // Returns the node output.
-    std::shared_ptr<Node> output() LOCKS_EXCLUDED(mu_) {
-      tf_shared_lock l(mu_);
-      return output_;
-    }
-
-    // Records that the node produced an element.
-    void record_element() LOCKS_EXCLUDED(mu_) {
-      mutex_lock l(mu_);
-      num_elements_++;
-    }
-
-    // Records that a node thread has started executing.
-    void record_start() LOCKS_EXCLUDED(mu_) {
-      mutex_lock l(mu_);
-      work_start_[std::this_thread::get_id()] = Env::Default()->NowNanos();
-    }
-
-    // Records that a node thread has stopped executing.
-    void record_stop() LOCKS_EXCLUDED(mu_) {
-      mutex_lock l(mu_);
-      std::thread::id tid = std::this_thread::get_id();
-      auto start_time = gtl::FindOrNull(work_start_, tid);
-      DCHECK(start_time)
-          << "Encountered a stop event that was not preceded by a start event.";
-      if (start_time) {
-        processing_time_ += Env::Default()->NowNanos() - *start_time;
-        work_start_.erase(tid);
-      }
-    }
-
-    // Removes an input.
-    void remove_input(std::shared_ptr<Node> input) LOCKS_EXCLUDED(mu_) {
-      mutex_lock l(mu_);
-      inputs_.remove(input);
-    }
-
-    // Set the node output.
-    void set_output(std::shared_ptr<Node> output) LOCKS_EXCLUDED(mu_) {
-      mutex_lock l(mu_);
-      output_ = output;
-    }
-
-    // Collects tunable parameters in the subtree rooted in this node.
-    void CollectTunables(std::vector<std::shared_ptr<Tunable>>* tunables)
-        LOCKS_EXCLUDED(mu_);
-
-    // Returns the per-element output time for this node.
-    int64 OutputTime(std::vector<int64>* input_times) LOCKS_EXCLUDED(mu_) {
-      tf_shared_lock l(mu_);
-      return OutputTimeLocked(input_times);
-    }
-
-    // Returns the per-element processing time spent in the subtree rooted in
-    // this node.
-    int64 ProcessingTime() LOCKS_EXCLUDED(mu_) {
-      tf_shared_lock l(mu_);
-      return ProcessingTimeLocked();
-    }
-
-   private:
-    enum class Type {
-      BATCH = 0,
-      CACHE,
-      CONCATENATE,
-      FILTER,
-      FLAT_MAP,
-      INTERLEAVE,
-      MAP,
-      MAP_AND_BATCH,
-      PADDED_BATCH,
-      PARALLEL_INTERLEAVE,
-      PARALLEL_INTERLEAVE_V2,
-      PARALLEL_MAP,
-      PREFETCH,
-      REPEAT,
-      SHUFFLE,
-      SKIP,
-      TAKE,
-      ZIP,
-      UNKNOWN,
-    };
-
-    // Gets a value of the given parameter (tunable or constant).
-    int64 GetParameterValue(const string& name) SHARED_LOCKS_REQUIRED(mu_);
-
-    // Returns the per-element processing time spent in this node.
-    int64 NanosPerElement() LOCKS_EXCLUDED(mu_) {
-      tf_shared_lock l(mu_);
-      return NanosPerElementLocked();
-    }
-
-    int64 NanosPerElementLocked() SHARED_LOCKS_REQUIRED(mu_) {
-      if (num_elements_ == 0) {
-        return 0;
-      }
-      return (int64)((double)processing_time_ / (double)num_elements_);
-    }
-
-    int64 OutputTimeLocked(std::vector<int64>* input_times)
-        SHARED_LOCKS_REQUIRED(mu_);
-
-    int64 OutputTimeForInputs(std::vector<int64>* input_times)
-        SHARED_LOCKS_REQUIRED(mu_) {
-      int64 sum = 0;
-      for (auto input : inputs_) {
-        sum += input->OutputTime(input_times);
-      }
-      return sum;
-    }
-
-    int64 ProcessingTimeLocked() SHARED_LOCKS_REQUIRED(mu_);
-
-    // Returns the per-element processing time spent in the inputs of this node.
-    int64 ProcessingTimeForInputs() SHARED_LOCKS_REQUIRED(mu_) {
-      int64 sum = 0;
-      for (auto input : inputs_) {
-        sum += input->ProcessingTime();
-      }
-      return sum;
-    }
-
-    Type TypeFromName(const string& name) SHARED_LOCKS_REQUIRED(mu_) {
-      if (name_ == "Batch") {
-        return Type::BATCH;
-      }
-      if (str_util::EndsWith(name_, "Cache")) {
-        return Type::CACHE;
-      }
-      if (name_ == "Concatenate") {
-        return Type::CONCATENATE;
-      }
-      if (name_ == "Filter") {
-        return Type::FILTER;
-      }
-      if (name_ == "FlatMap") {
-        return Type::FLAT_MAP;
-      }
-      if (name_ == "Interleave") {
-        return Type::INTERLEAVE;
-      }
-      if (name_ == "Map") {
-        return Type::MAP;
-      }
-      if (name_ == "MapAndBatch") {
-        return Type::MAP_AND_BATCH;
-      }
-      if (name_ == "PaddedBatch") {
-        return Type::PADDED_BATCH;
-      }
-      if (name_ == "ParallelInterleave") {
-        return Type::PARALLEL_INTERLEAVE;
-      }
-      if (name_ == "ParallelInterleaveV2") {
-        return Type::PARALLEL_INTERLEAVE_V2;
-      }
-      if (name_ == "ParallelMap") {
-        return Type::PARALLEL_MAP;
-      }
-      if (name_ == "Prefetch") {
-        return Type::PREFETCH;
-      }
-      if (str_util::EndsWith(name_, "Repeat")) {
-        return Type::REPEAT;
-      }
-      if (name_ == "Shuffle") {
-        return Type::SHUFFLE;
-      }
-      if (str_util::EndsWith(name_, "Skip")) {
-        return Type::SKIP;
-      }
-      if (str_util::EndsWith(name_, "Take")) {
-        return Type::TAKE;
-      }
-      if (name_ == "Zip") {
-        return Type::ZIP;
-      }
-      return Type::UNKNOWN;
-    }
-
-    mutex mu_;
-    const int64 id_;
-    const string name_;
-    const Type type_;
-    int64 processing_time_ GUARDED_BY(mu_) = 0;
-    int64 num_elements_ GUARDED_BY(mu_) = 0;
-    std::map<std::thread::id, int64> work_start_ GUARDED_BY(mu_);
-    std::map<string, int64> constant_params_ GUARDED_BY(mu_);
-    // Tunables are shared with the model during optimization.
-    std::map<string, std::shared_ptr<Tunable>> tunable_params_ GUARDED_BY(mu_);
-    std::list<std::shared_ptr<Node>> inputs_ GUARDED_BY(mu_);
-    std::shared_ptr<Node> output_ GUARDED_BY(mu_);
-  };
-
-  std::vector<std::shared_ptr<Node::Tunable>> CollectTunables()
-      SHARED_LOCKS_REQUIRED(mu_);
+  // Collects tunable parameters in the tree rooted in the given node.
+  std::vector<std::shared_ptr<Parameter>> CollectTunableParameters(
+      std::shared_ptr<Node> node);
 
-  int64 OutputTime() SHARED_LOCKS_REQUIRED(mu_);
+  // Collects the output time for the given node.
+  int64 OutputTime(std::shared_ptr<Node> node);
 
-  int64 ProcessingTime() SHARED_LOCKS_REQUIRED(mu_);
+  // Collects the processing time for the given node.
+  int64 ProcessingTime(std::shared_ptr<Node> node);
 
   // Used for coordination between different input pipeline threads. Exclusive
   // access is required only when adding or removing nodes. Concurrent access to
diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90bd570f90cdab2182f3d46e009b2cd972667ef9
--- /dev/null
+++ b/tensorflow/core/framework/model_test.cc
@@ -0,0 +1,392 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/model.h"
+#include <memory>
+
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace data {
+namespace model {
+namespace {
+
+class AsyncInterleaveManyTest
+    : public ::testing::TestWithParam<std::tuple<int64, int64>> {};
+
+TEST_P(AsyncInterleaveManyTest, Model) {
+  const int64 parallelism = std::get<0>(GetParam());
+  const int64 input_time = std::get<1>(GetParam());
+  std::shared_ptr<Node> async_interleave_many =
+      model::MakeAsyncInterleaveManyNode(
+          {0, "async_interleave_many", nullptr},
+          {model::MakeParameter(
+              "parallelism",
+              std::make_shared<SharedState>(parallelism, nullptr, nullptr), 1,
+              parallelism)});
+  std::shared_ptr<Node> meta_source =
+      model::MakeSourceNode({1, "meta_source", async_interleave_many});
+  async_interleave_many->add_input(meta_source);
+  auto cleanup_meta = gtl::MakeCleanup([async_interleave_many, meta_source]() {
+    async_interleave_many->remove_input(meta_source);
+  });
+  std::shared_ptr<Node> source1 =
+      model::MakeSourceNode({1, "source1", async_interleave_many});
+  async_interleave_many->add_input(source1);
+  auto cleanup1 = gtl::MakeCleanup([async_interleave_many, source1]() {
+    async_interleave_many->remove_input(source1);
+  });
+  std::shared_ptr<Node> source2 =
+      model::MakeSourceNode({2, "source2", async_interleave_many});
+  async_interleave_many->add_input(source2);
+  auto cleanup2 = gtl::MakeCleanup([async_interleave_many, source2]() {
+    async_interleave_many->remove_input(source2);
+  });
+  std::vector<int64> input_times(1, input_time);
+  async_interleave_many->add_processing_time(100);
+  EXPECT_EQ(100, async_interleave_many->processing_time());
+  EXPECT_EQ(0, async_interleave_many->ProcessingTime());
+  EXPECT_EQ(0, async_interleave_many->OutputTime(&input_times));
+  async_interleave_many->record_element();
+  EXPECT_EQ(1, async_interleave_many->num_elements());
+  EXPECT_EQ(100, async_interleave_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, 100 - input_time),
+            async_interleave_many->OutputTime(&input_times));
+  source1->add_processing_time(200);
+  source2->add_processing_time(300);
+  EXPECT_EQ(100, async_interleave_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, 100 - input_time),
+            async_interleave_many->OutputTime(&input_times));
+  source1->record_element();
+  source2->record_element();
+  EXPECT_EQ(100 + 250, async_interleave_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, 100 + 250 / parallelism - input_time),
+            async_interleave_many->OutputTime(&input_times));
+  async_interleave_many->record_element();
+  EXPECT_EQ(50 + 250, async_interleave_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, 50 + 250 / parallelism - input_time),
+            async_interleave_many->OutputTime(&input_times));
+}
+
+INSTANTIATE_TEST_CASE_P(Test, AsyncInterleaveManyTest,
+                        ::testing::Combine(::testing::Values(1, 2),
+                                           ::testing::Values(0, 50, 100, 200)));
+
+class AsyncKnownRatioTest
+    : public ::testing::TestWithParam<std::tuple<int64, int64, int64>> {};
+
+TEST_P(AsyncKnownRatioTest, Model) {
+  const int64 parallelism = std::get<0>(GetParam());
+  const int64 input_time = std::get<1>(GetParam());
+  const int64 num_inputs_per_output = std::get<2>(GetParam());
+  std::shared_ptr<Node> async_known_many = model::MakeAsyncKnownRatioNode(
+      {0, "async_known_many", nullptr}, num_inputs_per_output,
+      {model::MakeParameter(
+          "parallelism",
+          std::make_shared<SharedState>(parallelism, nullptr, nullptr), 1,
+          parallelism)});
+  std::shared_ptr<Node> source1 =
+      model::MakeSourceNode({1, "source1", async_known_many});
+  async_known_many->add_input(source1);
+  std::shared_ptr<Node> source2 =
+      model::MakeSourceNode({2, "source2", async_known_many});
+  async_known_many->add_input(source2);
+  std::vector<int64> input_times(1, input_time);
+  source1->add_processing_time(100);
+  EXPECT_EQ(0, async_known_many->ProcessingTime());
+  EXPECT_EQ(0, async_known_many->OutputTime(&input_times));
+  source2->add_processing_time(200);
+  EXPECT_EQ(0, async_known_many->ProcessingTime());
+  EXPECT_EQ(0, async_known_many->OutputTime(&input_times));
+  source1->record_element();
+  EXPECT_EQ(num_inputs_per_output * 100, async_known_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, num_inputs_per_output * 100 - input_time),
+            async_known_many->OutputTime(&input_times));
+  source2->record_element();
+  EXPECT_EQ(num_inputs_per_output * (100 + 200),
+            async_known_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (100 + 200) - input_time),
+            async_known_many->OutputTime(&input_times));
+  source1->record_element();
+  EXPECT_EQ(num_inputs_per_output * (50 + 200),
+            async_known_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 200) - input_time),
+            async_known_many->OutputTime(&input_times));
+  source2->record_element();
+  EXPECT_EQ(num_inputs_per_output * (50 + 100),
+            async_known_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) - input_time),
+            async_known_many->OutputTime(&input_times));
+  async_known_many->add_processing_time(128);
+  EXPECT_EQ(num_inputs_per_output * (50 + 100),
+            async_known_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) - input_time),
+            async_known_many->OutputTime(&input_times));
+  async_known_many->record_element();
+  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 128,
+            async_known_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) +
+                              128 / parallelism - input_time),
+            async_known_many->OutputTime(&input_times));
+  async_known_many->record_element();
+  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 64,
+            async_known_many->ProcessingTime());
+  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) +
+                              64 / parallelism - input_time),
+            async_known_many->OutputTime(&input_times));
+}
+
+INSTANTIATE_TEST_CASE_P(Test, AsyncKnownRatioTest,
+                        ::testing::Combine(::testing::Values(1, 2, 4, 8),
+                                           ::testing::Values(0, 50, 100, 200),
+                                           ::testing::Values(0, 1, 2, 4)));
+
+TEST(InterleaveManyTest, Model) {
+  std::shared_ptr<Node> interleave_many =
+      model::MakeInterleaveManyNode({0, "interleave_many", nullptr});
+  std::shared_ptr<Node> meta_source =
+      model::MakeSourceNode({1, "meta_source", interleave_many});
+  interleave_many->add_input(meta_source);
+  std::shared_ptr<Node> source1 =
+      model::MakeSourceNode({1, "source1", interleave_many});
+  interleave_many->add_input(source1);
+  std::shared_ptr<Node> source2 =
+      model::MakeSourceNode({2, "source2", interleave_many});
+  interleave_many->add_input(source2);
+  std::vector<int64> input_times(1, 0);
+  interleave_many->add_processing_time(100);
+  EXPECT_EQ(100, interleave_many->processing_time());
+  EXPECT_EQ(0, interleave_many->ProcessingTime());
+  EXPECT_EQ(0, interleave_many->OutputTime(&input_times));
+  interleave_many->record_element();
+  EXPECT_EQ(1, interleave_many->num_elements());
+  EXPECT_EQ(100, interleave_many->ProcessingTime());
+  EXPECT_EQ(100, interleave_many->OutputTime(&input_times));
+  source1->add_processing_time(200);
+  source2->add_processing_time(300);
+  EXPECT_EQ(100, interleave_many->ProcessingTime());
+  EXPECT_EQ(100, interleave_many->OutputTime(&input_times));
+  source1->record_element();
+  source2->record_element();
+  EXPECT_EQ(350, interleave_many->ProcessingTime());
+  EXPECT_EQ(350, interleave_many->OutputTime(&input_times));
+  interleave_many->record_element();
+  EXPECT_EQ(300, interleave_many->ProcessingTime());
+  EXPECT_EQ(300, interleave_many->OutputTime(&input_times));
+}
+
+class KnownRatioTest : public ::testing::TestWithParam<int64> {};
+
+TEST_P(KnownRatioTest, Model) {
+  const int64 num_inputs_per_output = GetParam();
+  std::shared_ptr<Node> known_many = model::MakeKnownRatioNode(
+      {0, "known_many", nullptr}, num_inputs_per_output);
+  std::shared_ptr<Node> source1 =
+      model::MakeSourceNode({1, "source1", known_many});
+  known_many->add_input(source1);
+  std::shared_ptr<Node> source2 =
+      model::MakeSourceNode({2, "source2", known_many});
+  known_many->add_input(source2);
+  std::vector<int64> input_times(1, 0);
+  source1->add_processing_time(100);
+  EXPECT_EQ(0, known_many->ProcessingTime());
+  EXPECT_EQ(0, known_many->OutputTime(&input_times));
+  source2->add_processing_time(200);
+  EXPECT_EQ(0, known_many->ProcessingTime());
+  EXPECT_EQ(0, known_many->OutputTime(&input_times));
+  source1->record_element();
+  EXPECT_EQ(num_inputs_per_output * 100, known_many->ProcessingTime());
+  EXPECT_EQ(num_inputs_per_output * 100, known_many->OutputTime(&input_times));
+  source2->record_element();
+  EXPECT_EQ(num_inputs_per_output * (100 + 200), known_many->ProcessingTime());
+  EXPECT_EQ(num_inputs_per_output * (100 + 200),
+            known_many->OutputTime(&input_times));
+  source1->record_element();
+  EXPECT_EQ(num_inputs_per_output * (50 + 200), known_many->ProcessingTime());
+  EXPECT_EQ(num_inputs_per_output * (50 + 200),
+            known_many->OutputTime(&input_times));
+  source2->record_element();
+  EXPECT_EQ(num_inputs_per_output * (50 + 100), known_many->ProcessingTime());
+  EXPECT_EQ(num_inputs_per_output * (50 + 100),
+            known_many->OutputTime(&input_times));
+  known_many->add_processing_time(128);
+  EXPECT_EQ(num_inputs_per_output * (50 + 100), known_many->ProcessingTime());
+  EXPECT_EQ(num_inputs_per_output * (50 + 100),
+            known_many->OutputTime(&input_times));
+  known_many->record_element();
+  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 128,
+            known_many->ProcessingTime());
+  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 128,
+            known_many->OutputTime(&input_times));
+  known_many->record_element();
+  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 64,
+            known_many->ProcessingTime());
+  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 64,
+            known_many->OutputTime(&input_times));
+}
+
+INSTANTIATE_TEST_CASE_P(Test, KnownRatioTest, ::testing::Values(0, 1, 2, 4));
+
+TEST(SourceTest, Model) {
+  std::shared_ptr<Node> source = model::MakeSourceNode({0, "source", nullptr});
+  std::vector<int64> input_times(1, 0);
+  source->add_processing_time(100);
+  EXPECT_EQ(100, source->processing_time());
+  EXPECT_EQ(0, source->ProcessingTime());
+  EXPECT_EQ(0, source->OutputTime(&input_times));
+  source->record_element();
+  EXPECT_EQ(1, source->num_elements());
+  EXPECT_EQ(100, source->ProcessingTime());
+  EXPECT_EQ(100, source->OutputTime(&input_times));
+  source->record_element();
+  EXPECT_EQ(2, source->num_elements());
+  EXPECT_EQ(50, source->ProcessingTime());
+  EXPECT_EQ(50, source->OutputTime(&input_times));
+}
+
+TEST(UnknownRatioTest, Model) {
+  std::shared_ptr<Node> unknown_many =
+      model::MakeUnknownRatioNode({0, "unknown_many", nullptr});
+  std::shared_ptr<Node> source1 =
+      model::MakeSourceNode({1, "source1", unknown_many});
+  unknown_many->add_input(source1);
+  std::shared_ptr<Node> source2 =
+      model::MakeSourceNode({2, "source2", unknown_many});
+  unknown_many->add_input(source2);
+  std::vector<int64> input_times(1, 0);
+  unknown_many->add_processing_time(100);
+  EXPECT_EQ(100, unknown_many->processing_time());
+  EXPECT_EQ(0, unknown_many->ProcessingTime());
+  EXPECT_EQ(0, unknown_many->OutputTime(&input_times));
+  unknown_many->record_element();
+  EXPECT_EQ(1, unknown_many->num_elements());
+  EXPECT_EQ(100, unknown_many->ProcessingTime());
+  EXPECT_EQ(100, unknown_many->OutputTime(&input_times));
+  source1->add_processing_time(100);
+  source2->add_processing_time(200);
+  EXPECT_EQ(100, unknown_many->ProcessingTime());
+  EXPECT_EQ(100, unknown_many->OutputTime(&input_times));
+  source1->record_element();
+  source2->record_element();
+  EXPECT_EQ(400, unknown_many->ProcessingTime());
+  EXPECT_EQ(400, unknown_many->OutputTime(&input_times));
+  unknown_many->record_element();
+  EXPECT_EQ(200, unknown_many->ProcessingTime());
+  EXPECT_EQ(200, unknown_many->OutputTime(&input_times));
+}
+
+TEST(UnknownTest, Model) {
+  std::shared_ptr<Node> unknown =
+      model::MakeUnknownNode({0, "unknown", nullptr});
+  std::shared_ptr<Node> source1 =
+      model::MakeSourceNode({1, "source1", unknown});
+  unknown->add_input(source1);
+  std::shared_ptr<Node> source2 =
+      model::MakeSourceNode({2, "source2", unknown});
+  unknown->add_input(source2);
+  std::vector<int64> input_times(1, 0);
+  source1->add_processing_time(100);
+  EXPECT_EQ(0, unknown->ProcessingTime());
+  EXPECT_EQ(0, unknown->OutputTime(&input_times));
+  source2->add_processing_time(100);
+  EXPECT_EQ(0, unknown->ProcessingTime());
+  EXPECT_EQ(0, unknown->OutputTime(&input_times));
+  source1->record_element();
+  EXPECT_EQ(100, unknown->ProcessingTime());
+  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  source2->record_element();
+  EXPECT_EQ(200, unknown->ProcessingTime());
+  EXPECT_EQ(200, unknown->OutputTime(&input_times));
+  source1->record_element();
+  EXPECT_EQ(150, unknown->ProcessingTime());
+  EXPECT_EQ(150, unknown->OutputTime(&input_times));
+  source2->record_element();
+  EXPECT_EQ(100, unknown->ProcessingTime());
+  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  // Unknown node processing time should not affect its ProcessingTime() or
+  // OutputTime().
+  unknown->add_processing_time(100);
+  EXPECT_EQ(100, unknown->processing_time());
+  EXPECT_EQ(100, unknown->ProcessingTime());
+  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  // Unknown node number of elements should not affect its ProcessingTime() or
+  // OutputTime().
+  unknown->record_element();
+  EXPECT_EQ(1, unknown->num_elements());
+  EXPECT_EQ(100, unknown->ProcessingTime());
+  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+}
+
+class TestNode : public model::Node {
+ public:
+  using model::Node::Node;
+
+  virtual ~TestNode() {}
+
+ protected:
+  std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return nullptr;
+  }
+
+  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return 0;
+  }
+
+  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    return 0;
+  }
+};
+
+TEST(SetterGetterTest, Node) {
+  std::shared_ptr<TestNode> node =
+      std::make_shared<TestNode>(model::Node::Args{-1, "TestNode", nullptr});
+  EXPECT_EQ(-1, node->id());
+  EXPECT_EQ("TestNode", node->name());
+  EXPECT_EQ(nullptr, node->output());
+
+  EXPECT_EQ(0, node->buffered_bytes());
+  node->add_buffered_bytes(42);
+  EXPECT_EQ(42, node->buffered_bytes());
+
+  EXPECT_EQ(0, node->processing_time());
+  node->record_start(1);
+  EXPECT_EQ(0, node->processing_time());
+  node->record_stop(41);
+  EXPECT_EQ(40, node->processing_time());
+  node->add_processing_time(2);
+  EXPECT_EQ(42, node->processing_time());
+
+  std::shared_ptr<TestNode> input =
+      std::make_shared<TestNode>(model::Node::Args{-1, "TestInput", node});
+  EXPECT_EQ(node.get(), input->output());
+  EXPECT_EQ(0, node->inputs().size());
+  node->add_input(input);
+  EXPECT_EQ(1, node->inputs().size());
+  EXPECT_EQ(input, node->inputs().front());
+  node->remove_input(input);
+  EXPECT_EQ(0, node->inputs().size());
+
+  EXPECT_EQ(0, node->num_elements());
+  node->record_element();
+  EXPECT_EQ(1, node->num_elements());
+}
+
+}  // namespace
+}  // namespace model
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 43ac1d0ada38ebbb64b8a52d66fdfbc54f3de260..95a787b2df02d48f316653ee5059b4f7e80f73e1 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -86,8 +86,8 @@ string AttrSlice::SummarizeNode() const {
 string SummarizeNode(const Node& node) { return SummarizeNodeDef(node.def()); }
 
 string SummarizeNodeDef(const NodeDef& node_def) {
-  string ret = strings::StrCat(FormatNodeDefForError(node_def), " = ",
-                               node_def.op(), "[");
+  string ret = strings::StrCat(errors::FormatNodeNameForError(node_def.name()),
+                               " = ", node_def.op(), "[");
   strings::StrAppend(&ret, SummarizeAttrsHelper(node_def, node_def.device()));
   strings::StrAppend(&ret, "](");
 
@@ -102,11 +102,16 @@ string SummarizeNodeDef(const NodeDef& node_def) {
   return ret;
 }
 
+string SummarizeAttrs(const NodeDef& node_def) {
+  return SummarizeAttrsHelper(node_def, node_def.device());
+}
+
 string FormatNodeForError(const Node& node) {
   return FormatNodeDefForError(node.def());
 }
 
 string FormatNodeDefForError(const NodeDef& node_def) {
+  VLOG(1) << "Error in the node: " << SummarizeNodeDef(node_def);
   return errors::FormatNodeNameForError(node_def.name());
 }
 
@@ -419,9 +424,9 @@ Status NumOutputsForNode(const NodeDef& node_def, const OpDef& op_def,
 
 Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   if (node_def.op() != op_def.name()) {
-    return errors::InvalidArgument("NodeDef op '", node_def.op(),
-                                   "' does not match ", SummarizeOpDef(op_def),
-                                   "; NodeDef: ", SummarizeNodeDef(node_def));
+    return errors::InvalidArgument(
+        "NodeDef op '", node_def.op(), "' does not match ",
+        SummarizeOpDef(op_def), "; NodeDef: ", FormatNodeDefForError(node_def));
   }
 
   bool seen_control = false;
@@ -431,14 +436,14 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
     if (str_util::StartsWith(input, "^")) {
       seen_control = true;
       if (input.find(':') != string::npos) {
-        return errors::InvalidArgument(
-            "Control input '", input,
-            "' must not have ':' in NodeDef: ", SummarizeNodeDef(node_def));
+        return errors::InvalidArgument("Control input '", input,
+                                       "' must not have ':' in NodeDef: ",
+                                       FormatNodeDefForError(node_def));
       }
     } else if (seen_control) {
-      return errors::InvalidArgument(
-          "Non-control input '", input,
-          "' after control input in NodeDef: ", SummarizeNodeDef(node_def));
+      return errors::InvalidArgument("Non-control input '", input,
+                                     "' after control input in NodeDef: ",
+                                     FormatNodeDefForError(node_def));
     } else {
       ++num_inputs;
     }
@@ -468,13 +473,14 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
       // the binary producing it.
       return errors::InvalidArgument(
           "NodeDef mentions attr '", attr.first, "' not in ",
-          SummarizeOpDef(op_def), "; NodeDef: ", SummarizeNodeDef(node_def),
+          SummarizeOpDef(op_def),
+          "; NodeDef: ", FormatNodeDefForError(node_def),
           ". (Check whether your GraphDef-interpreting binary is up to date "
           "with your GraphDef-generating binary.).");
     }
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
         ValidateAttrValue(attr.second, *iter->second),
-        "; NodeDef: ", SummarizeNodeDef(node_def), "; ",
+        "; NodeDef: ", FormatNodeDefForError(node_def), "; ",
         SummarizeOpDef(op_def));
     // Keep track of which attr names have (not) been found in the NodeDef.
     op_attrs.erase(iter);
@@ -487,10 +493,10 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
       if (!attrs.empty()) strings::StrAppend(&attrs, "', '");
       strings::StrAppend(&attrs, attr_pair.first);
     }
-    return errors::InvalidArgument("NodeDef missing attr",
-                                   op_attrs.size() == 1 ? " '" : "s '", attrs,
-                                   "' from ", SummarizeOpDef(op_def),
-                                   "; NodeDef: ", SummarizeNodeDef(node_def));
+    return errors::InvalidArgument(
+        "NodeDef missing attr", op_attrs.size() == 1 ? " '" : "s '", attrs,
+        "' from ", SummarizeOpDef(op_def),
+        "; NodeDef: ", FormatNodeDefForError(node_def));
   }
 
   // Validate the number of inputs.
@@ -501,7 +507,7 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
     return errors::InvalidArgument(
         "NodeDef expected inputs '", DataTypeVectorString(inputs),
         "' do not match ", num_inputs, " inputs specified; ",
-        SummarizeOpDef(op_def), "; NodeDef: ", SummarizeNodeDef(node_def));
+        SummarizeOpDef(op_def), "; NodeDef: ", FormatNodeDefForError(node_def));
   }
 
   return Status::OK();
@@ -657,7 +663,7 @@ Status ValidateExternalNodeDefSyntax(const NodeDef& node_def) {
 Status AttachDef(const Status& status, const NodeDef& node_def) {
   Status ret = status;
   errors::AppendToMessage(
-      &ret, strings::StrCat(" [[", SummarizeNodeDef(node_def), "]]"));
+      &ret, strings::StrCat(" [[", FormatNodeDefForError(node_def), "]]"));
   return ret;
 }
 
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 0ff67554eb3d2b4713c6c329dec2dc814ce28395..f682bb15355550622e8bbe384df790f1022bd630 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -48,6 +48,7 @@ extern const char* const kColocationGroupPrefix;
 // than a text-format proto.
 string SummarizeNode(const Node& node);
 string SummarizeNodeDef(const NodeDef& node_def);
+string SummarizeAttrs(const NodeDef& node_def);
 
 // Produces a formatted string pattern from the node which can uniquely identify
 // this node upstream to produce an informative error message. The pattern
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 3e34bf041882f640c059ea16e83b9244e679f81c..e3cb4a40ec5503307813d292f4f538fb8577a25b 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op_kernel.h"
 
+#include <mutex>  // NOLINT
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -38,8 +39,11 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/platform_strings.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/ptr_util.h"
 
@@ -253,6 +257,9 @@ Status OpKernelConstruction::allocate_persistent(
 
 // OpKernelContext -----------------------------------------------------------
 
+const int OpKernelContext::Params::kNeverForward;
+const int OpKernelContext::Params::kNoReservation;
+
 OpKernelContext::OpKernelContext(Params* params)
     : OpKernelContext(
           params, static_cast<int>(params->op_kernel->output_types().size())) {}
@@ -284,6 +291,13 @@ OpKernelContext::~OpKernelContext() {
     }
   }
   if (params_->record_tensor_accesses) referenced_tensors_.Destroy();
+  if (params_->track_allocations && !wrapped_allocators_.empty()) {
+    LOG(WARNING) << "OpKernelContext is tracking allocations but they are not "
+                 << "being consumed by the StepStatsCollector.";
+    for (auto& wrapped_alloator : wrapped_allocators_) {
+      wrapped_alloator.second->GetRecordsAndUnRef();
+    }
+  }
 }
 
 Allocator* OpKernelContext::get_allocator(AllocatorAttributes attr) {
@@ -702,10 +716,10 @@ Status OpKernelContext::allocate_output(int index, const TensorShape& shape,
   const DataType type = params_->op_kernel->output_type(index);
   DCHECK(!IsRefType(type));
   DCHECK(mutable_output(index) == nullptr);
-  Tensor* output_tensor = new Tensor();
-  Status s = allocate_tensor(type, shape, output_tensor, attr);
+  auto output_tensor = MakeUnique<Tensor>();
+  Status s = allocate_tensor(type, shape, output_tensor.get(), attr);
   if (s.ok()) {
-    outputs_[index] = TensorValue(output_tensor);
+    outputs_[index] = TensorValue(output_tensor.release());
     *output = outputs_[index].tensor;
   }
   return s;
@@ -912,11 +926,12 @@ void OpKernelContext::clear_recorded_memory() {
 
 struct KernelRegistration {
   KernelRegistration(const KernelDef& d, StringPiece c,
-                     kernel_factory::OpKernelRegistrar::Factory f)
-      : def(d), kernel_class_name(c), factory(f) {}
+                     std::unique_ptr<kernel_factory::OpKernelFactory> f)
+      : def(d), kernel_class_name(c), factory(std::move(f)) {}
+
   const KernelDef def;
   const string kernel_class_name;
-  const kernel_factory::OpKernelRegistrar::Factory factory;
+  std::unique_ptr<kernel_factory::OpKernelFactory> factory;
 };
 
 // This maps from 'op_type' + DeviceType to the set of KernelDefs and
@@ -924,12 +939,96 @@ struct KernelRegistration {
 // KernelDef.
 typedef std::unordered_multimap<string, KernelRegistration> KernelRegistry;
 
+#if defined(_WIN32)
+static const char kKernelLibPattern[] = "libtfkernel*.dll";
+#elif defined(__APPLE__)
+static const char kKernelLibPattern[] = "libtfkernel*.dylib";
+#else
+static const char kKernelLibPattern[] = "libtfkernel*.so";
+#endif
+
+#define FEATURE(x) \
+  { x, #x }
+
+// Returns Status::OK if the dynamic library at the given path is safe to
+// load with some level of confidence.
+static Status IsProbablySafeToLoad(const string& path) {
+  // A map of platform string to required CPU feature.
+  using port::CPUFeature;
+  static const auto* feature_map =
+      new std::map<string, std::pair<CPUFeature, string>>{
+          {"__AVX512VL__=1", FEATURE(CPUFeature::AVX512VL)},
+      };
+
+  std::vector<std::string> platform_strings;
+  int result = GetPlatformStrings(path, &platform_strings);
+  if (result) {
+    return Status(error::Code::UNKNOWN, strerror(result));
+  }
+  if (platform_strings.empty()) {
+    return Status(error::Code::FAILED_PRECONDITION,
+                  "Didn't find any platform strings");
+  }
+  std::vector<std::string> missing_features;
+  for (const auto& platform_string : platform_strings) {
+    const auto& entry = feature_map->find(platform_string);
+    if (entry != feature_map->end() &&
+        !port::TestCPUFeature(entry->second.first)) {
+      missing_features.emplace_back(entry->second.second);
+    }
+  }
+  if (!missing_features.empty()) {
+    string errmsg = "Missing CPU features: ";
+    errmsg.append(str_util::Join(missing_features, ", "));
+    return Status(errors::Code::FAILED_PRECONDITION, errmsg);
+  }
+  return Status::OK();
+}
+
+void LoadDynamicKernelsInternal() {
+  Env* env = Env::Default();
+  string bazel_kernel_dir = io::JoinPath(env->GetRunfilesDir(),
+                                         "tensorflow",
+                                         "core",
+                                         "kernels");
+  std::vector<string> files;
+  Status s_kernel_dir = env->GetChildren(bazel_kernel_dir, &files);
+  if (s_kernel_dir.ok()) {
+    string dll_spec = io::JoinPath(bazel_kernel_dir, kKernelLibPattern);
+    for (const auto& file : files) {
+      string fullpath = io::JoinPath(bazel_kernel_dir, file);
+      if (env->MatchPath(fullpath, dll_spec)) {
+        Status s = IsProbablySafeToLoad(fullpath);
+        if (s.ok()) {
+          // TODO(gunan): Store the handles to the opened files.
+          void* unused_filehandle;
+          TF_CHECK_OK(env->LoadLibrary(fullpath.c_str(), &unused_filehandle));
+        } else {
+          LOG(WARNING) << "Not loading plugin library " << fullpath << ": "
+                       << s.error_message();
+        }
+      }
+    }
+  }
+}
+
+// Mechanism for loading existing kernel libraries.
+void LoadDynamicKernels() {
+  // TODO(gunan): As more features are available, add intelligent kernel
+  // selection, and dropping unsuitable kernel logic here.
+  static std::once_flag dll_loader_flag;
+  std::call_once(dll_loader_flag, LoadDynamicKernelsInternal);
+}
+
 void* GlobalKernelRegistry() {
   static KernelRegistry* global_kernel_registry = new KernelRegistry;
   return global_kernel_registry;
 }
 
 static KernelRegistry* GlobalKernelRegistryTyped() {
+#ifdef AUTOLOAD_DYNAMIC_KERNELS
+  LoadDynamicKernels();
+#endif  // AUTOLOAD_DYNAMIC_KERNELS
   return reinterpret_cast<KernelRegistry*>(GlobalKernelRegistry());
 }
 
@@ -943,14 +1042,23 @@ namespace kernel_factory {
 
 void OpKernelRegistrar::InitInternal(const KernelDef* kernel_def,
                                      StringPiece kernel_class_name,
-                                     Factory factory) {
+                                     std::unique_ptr<OpKernelFactory> factory) {
   // See comments in register_kernel::Name in header for info on _no_register.
   if (kernel_def->op() != "_no_register") {
     const string key =
         Key(kernel_def->op(), DeviceType(kernel_def->device_type()),
             kernel_def->label());
-    GlobalKernelRegistryTyped()->insert(std::make_pair(
-        key, KernelRegistration(*kernel_def, kernel_class_name, factory)));
+
+    // To avoid calling LoadDynamicKernels DO NOT CALL GlobalKernelRegistryTyped
+    // here.
+    // InitInternal gets called by static initializers, so it ends up executing
+    // before main. This causes LoadKernelLibraries function to get called
+    // before some file libraries can initialize, which in turn crashes the
+    // program flakily. Until we get rid of static initializers in kernel
+    // registration mechanism, we have this workaround here.
+    reinterpret_cast<KernelRegistry*>(GlobalKernelRegistry())
+        ->emplace(key, KernelRegistration(*kernel_def, kernel_class_name,
+                                          std::move(factory)));
   }
   delete kernel_def;
 }
@@ -982,7 +1090,7 @@ Status FindKernelRegistration(const DeviceType& device_type,
       if (*reg != nullptr) {
         return errors::InvalidArgument(
             "Multiple OpKernel registrations match NodeDef '",
-            SummarizeNodeDef(node_def), "': '",
+            FormatNodeDefForError(node_def), "': '",
             ProtoShortDebugString((*reg)->def), "' and '",
             ProtoShortDebugString(iter->second.def), "'");
       }
@@ -996,6 +1104,15 @@ Status FindKernelRegistration(const DeviceType& device_type,
 
 }  // namespace
 
+bool KernelDefAvailable(const DeviceType& device_type,
+                        const NodeDef& node_def) {
+  const KernelRegistration* reg = nullptr;
+  bool was_attr_mismatch;
+  Status result =
+      FindKernelRegistration(device_type, node_def, &reg, &was_attr_mismatch);
+  return result.ok() && reg != nullptr;
+}
+
 // TODO(irving): Change const NodeDef& to const Node&
 Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
                      const KernelDef** def, string* kernel_class_name) {
@@ -1007,10 +1124,11 @@ Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
     Status s = errors::NotFound(
         "No registered '", node_def.op(), "' OpKernel for ",
         DeviceTypeString(device_type), " devices compatible with node ",
-        SummarizeNodeDef(node_def));
+        FormatNodeDefForError(node_def));
     if (was_attr_mismatch) {
       errors::AppendToMessage(
-          &s, " (OpKernel was found, but attributes didn't match)");
+          &s, " (OpKernel was found, but attributes didn't match) ",
+          "Requested Attributes: ", SummarizeAttrs(node_def));
     }
     errors::AppendToMessage(
         &s, ".  Registered:", KernelsRegisteredForOp(node_def.op()));
@@ -1023,7 +1141,7 @@ Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
 
 Status SupportedDeviceTypesForNode(
     const std::vector<DeviceType>& prioritized_types, const NodeDef& def,
-    DeviceTypeVector* device_types) {
+    PrioritizedDeviceTypeVector* prioritized_device_types) {
   // TODO(zhifengc): Changes the callers (SimplePlacer and
   // DynamicPlacer) to consider the possibility that 'def' is call to
   // a user-defined function and only calls this
@@ -1036,12 +1154,21 @@ Status SupportedDeviceTypesForNode(
       bool was_attr_mismatch;
       TF_RETURN_IF_ERROR(
           FindKernelRegistration(device_type, def, &reg, &was_attr_mismatch));
-      if (reg != nullptr) device_types->push_back(device_type);
+      if (reg != nullptr) {
+        int32 priority = reg->def.priority();
+        prioritized_device_types->emplace_back(device_type, priority);
+      }
     }
+    std::sort(prioritized_device_types->begin(),
+              prioritized_device_types->end(),
+              [](const std::pair<DeviceType, int32>& a,
+                 const std::pair<DeviceType, int32>& b) {
+                return a.second > b.second;
+              });
   } else {
     // Assumes that all device types support this node.
     for (const DeviceType& device_type : prioritized_types) {
-      device_types->push_back(device_type);
+      prioritized_device_types->push_back(std::make_pair(device_type, 0));
     }
   }
   return Status::OK();
@@ -1133,10 +1260,11 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
     s.Update(errors::NotFound("No registered '", node_def.op(),
                               "' OpKernel for ", DeviceTypeString(device_type),
                               " devices compatible with node ",
-                              SummarizeNodeDef(node_def)));
+                              FormatNodeDefForError(node_def)));
     if (was_attr_mismatch) {
       errors::AppendToMessage(
-          &s, " (OpKernel was found, but attributes didn't match)");
+          &s, " (OpKernel was found, but attributes didn't match) ",
+          "Requested Attributes: ", SummarizeAttrs(node_def));
     }
     errors::AppendToMessage(
         &s, ".  Registered:", KernelsRegisteredForOp(node_def.op()));
@@ -1148,7 +1276,7 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
   DataTypeVector outputs;
   s.Update(InOutTypesForNode(node_def, *op_def, &inputs, &outputs));
   if (!s.ok()) {
-    errors::AppendToMessage(&s, " for node: ", SummarizeNodeDef(node_def));
+    errors::AppendToMessage(&s, " for node: ", FormatNodeDefForError(node_def));
     return s;
   }
 
@@ -1165,7 +1293,7 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
   OpKernelConstruction context(
       device_type, device, allocator, &node_def, op_def, flib, inputs,
       input_memory_types, outputs, output_memory_types, graph_def_version, &s);
-  *kernel = (*registration->factory)(&context);
+  *kernel = registration->factory->Create(&context);
   if (!s.ok()) {
     delete *kernel;
     *kernel = nullptr;
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 4bbd6c3d7d21dfac6a8813464684d2220c56ebff..19a0c5e5be2e8cbb16d55db21d4d425d9add2974 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/control_flow.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -489,6 +490,17 @@ struct TensorValue {
   Tensor* tensor;
 };
 
+// Used to store partitioned graphs from function-calling ops.
+struct GraphCollector {
+  mutex mu;
+  std::vector<GraphDef> graphs GUARDED_BY(mu);
+
+  void CollectGraph(const GraphDef& graph) {
+    mutex_lock ml(mu);
+    graphs.push_back(graph);
+  }
+};
+
 class OpKernelContext {
  public:
   // The first element of a WrappedAllocator is a "base" Allocator and
@@ -589,6 +601,7 @@ class OpKernelContext {
     FunctionLibraryRuntime* function_library = nullptr;
     std::function<void(std::function<void()>)>* runner = nullptr;
     StepStatsCollectorInterface* stats_collector = nullptr;
+    GraphCollector* graph_collector = nullptr;
 
     // TensorSliceReaderCache support.
     checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache = nullptr;
@@ -711,6 +724,9 @@ class OpKernelContext {
   // Usage: if (!context->ValidateInputsAreSameShape(this)) return;
   bool ValidateInputsAreSameShape(OpKernel* op);
 
+  // If non-null, kernels should populate with any partition subgraphs created.
+  GraphCollector* graph_collector() { return params_->graph_collector; }
+
   // Input to output forwarding.
 
   // Set the output Ref Tensor at output_index to be an alias of the
@@ -966,9 +982,10 @@ class OpKernelContext {
     return params_->output_attr_array[index];
   }
 
-  gtl::InlinedVector<WrappedAllocator, 4> wrapped_allocators() const {
+  gtl::InlinedVector<WrappedAllocator, 4> ConsumeWrappedAllocators() {
     mutex_lock lock(mu_);
-    gtl::InlinedVector<WrappedAllocator, 4> retrieved = wrapped_allocators_;
+    gtl::InlinedVector<WrappedAllocator, 4> retrieved;
+    retrieved.swap(wrapped_allocators_);
     return retrieved;
   }
 
@@ -1220,7 +1237,7 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
 //           * def has all attrs specified (e.g. using AddDefaultsToNodeDef()).
 Status SupportedDeviceTypesForNode(
     const std::vector<DeviceType>& prioritized_types, const NodeDef& def,
-    DeviceTypeVector* device_types);
+    PrioritizedDeviceTypeVector* device_types);
 
 // Returns a message with a description of the kernels registered for op
 // `op_name`.
@@ -1304,7 +1321,8 @@ class Name : public KernelDefBuilder {
             return new __VA_ARGS__(context);                             \
           });
 
-void* GlobalKernelRegistry();
+// Checks whether a given kernel is registered on device_type.
+bool KernelDefAvailable(const DeviceType& device_type, const NodeDef& node_def);
 
 // If node_def has a corresponding kernel registered on device_type,
 // returns OK and fill in the kernel def and kernel_class_name. <def> and
@@ -1328,22 +1346,55 @@ KernelList GetRegisteredKernelsForOp(StringPiece op_name);
 
 namespace kernel_factory {
 
+// OpKernelFactory is responsible for creating OpKernels when TensorFlow needs
+// them. You register factories with the TensorFlow core by constructing an
+// OpKernelRegistrar and passing the factory as a constructor parameter.
+class OpKernelFactory {
+ public:
+  virtual OpKernel* Create(OpKernelConstruction* context) = 0;
+  virtual ~OpKernelFactory() = default;
+};
+
 class OpKernelRegistrar {
  public:
-  typedef OpKernel* (*Factory)(OpKernelConstruction*);
+  // Registers the given kernel factory with TensorFlow. TF will call the
+  // factory Create() method when it determines that a kernel matching the given
+  // KernelDef is required.
+  OpKernelRegistrar(const KernelDef* kernel_def, StringPiece kernel_class_name,
+                    std::unique_ptr<OpKernelFactory> factory) {
+    // Perform the check in the header to allow compile-time optimization
+    // to a no-op, allowing the linker to remove the kernel symbols.
+    if (kernel_def != nullptr) {
+      InitInternal(kernel_def, kernel_class_name, std::move(factory));
+    }
+  }
 
+  // Registers the given factory function with TensorFlow. This is equivalent
+  // to registering a factory whose Create function invokes `create_fn`.
   OpKernelRegistrar(const KernelDef* kernel_def, StringPiece kernel_class_name,
-                    Factory factory) {
+                    OpKernel* (*create_fn)(OpKernelConstruction*)) {
     // Perform the check in the header to allow compile-time optimization
     // to a no-op, allowing the linker to remove the kernel symbols.
     if (kernel_def != nullptr) {
-      InitInternal(kernel_def, kernel_class_name, factory);
+      struct PtrOpKernelFactory : public OpKernelFactory {
+        explicit PtrOpKernelFactory(
+            OpKernel* (*create_func)(OpKernelConstruction*))
+            : create_func_(create_func) {}
+
+        OpKernel* Create(OpKernelConstruction* context) override {
+          return (*create_func_)(context);
+        }
+
+        OpKernel* (*create_func_)(OpKernelConstruction*);
+      };
+      InitInternal(kernel_def, kernel_class_name,
+                   absl::make_unique<PtrOpKernelFactory>(create_fn));
     }
   }
 
  private:
   void InitInternal(const KernelDef* kernel_def, StringPiece kernel_class_name,
-                    Factory factory);
+                    std::unique_ptr<OpKernelFactory> factory);
 };
 
 }  // namespace kernel_factory
@@ -1476,6 +1527,7 @@ T* OpKernelContext::op_device_context() {
 
 template <typename T>
 T* OpKernelContext::input_device_context(int index) {
+  DCHECK_NE(params_->input_device_contexts, nullptr);
   DCHECK_GE(index, 0);
   DCHECK_LT(index, params_->input_device_contexts->size());
   static_assert(std::is_base_of<DeviceContext, T>::value,
@@ -1484,6 +1536,7 @@ T* OpKernelContext::input_device_context(int index) {
 }
 
 inline DeviceContext* OpKernelContext::input_device_context(int index) {
+  DCHECK_NE(params_->input_device_contexts, nullptr);
   DCHECK_GE(index, 0);
   DCHECK_LT(index, params_->input_device_contexts->size());
   return (*params_->input_device_contexts)[index];
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index 83dda6579b784be538f45d9c95be57d412f49668..d8001cd07103f01c57480b62f3d40ff40514af88 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -102,6 +102,27 @@ REGISTER_OP("Test4").Input("i: float").Output("o: float");
 REGISTER_KERNEL_BUILDER(Name("Test4").Device(DEVICE_CPU), DummyKernel);
 REGISTER_KERNEL_BUILDER(Name("Test4").Device(DEVICE_GPU), DummyKernel);
 
+// Kernels with different priorities.
+REGISTER_OP("Test5").Input("a: T").Input("b: T").Attr("T: type");
+
+class TestOp5Cpu : public tensorflow::OpKernel {
+ public:
+  explicit TestOp5Cpu(OpKernelConstruction* context) : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {}
+};
+
+REGISTER_KERNEL_BUILDER(Name("Test5").Device(DEVICE_CPU).Priority(2),
+                        TestOp5Cpu);
+
+class TestOp5Gpu : public tensorflow::OpKernel {
+ public:
+  explicit TestOp5Gpu(OpKernelConstruction* context) : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {}
+};
+
+REGISTER_KERNEL_BUILDER(Name("Test5").Device(DEVICE_GPU).Priority(1),
+                        TestOp5Gpu);
+
 static std::vector<DeviceType> DeviceTypes() {
   return {DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)};
 }
@@ -185,10 +206,10 @@ TEST_F(OpKernelTest, SuccessBothCpuAndGpu) {
 
 TEST_F(OpKernelTest, CpuTypeRegistered) {
   NodeDef ndef = CreateNodeDef("Test1", {DT_FLOAT, DT_INT32});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
   EXPECT_EQ(1, devs.size());
-  EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0]);
+  EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0].first);
 }
 
 TEST_F(OpKernelTest, CpuAndGpuTypeRegistered) {
@@ -196,24 +217,24 @@ TEST_F(OpKernelTest, CpuAndGpuTypeRegistered) {
     // Try a node def of an op that is registered for a specific type
     // only on CPU.
     NodeDef ndef = CreateNodeDef("Test3", {DT_INT8, DT_INT8});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(1, devs.size());
-    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0]);
+    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0].first);
   }
   {
     // Try a node def of an op that is registered for a specific type
     // only on GPU.
     NodeDef ndef = CreateNodeDef("Test3", {DT_FLOAT, DT_FLOAT});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(1, devs.size());
-    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0]);
+    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0].first);
   }
   {
     // Try a node def of an op that is only registered for other types.
     NodeDef ndef = CreateNodeDef("Test3", {DT_STRING, DT_STRING});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(0, devs.size());
   }
@@ -221,11 +242,23 @@ TEST_F(OpKernelTest, CpuAndGpuTypeRegistered) {
   {
     // Try a node def of an op that is registered for both.
     NodeDef ndef = CreateNodeDef("Test4", {DT_FLOAT});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
+    TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
+    EXPECT_EQ(2, devs.size());
+    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0].first);
+    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[1].first);
+  }
+
+  {
+    // Try a node def of an op where kernels have priorities.
+    NodeDef ndef = CreateNodeDef("Test5", {DT_STRING, DT_STRING});
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(2, devs.size());
-    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0]);
-    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[1]);
+    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0].first);
+    EXPECT_EQ(2, devs[0].second);
+    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[1].first);
+    EXPECT_EQ(1, devs[1].second);
   }
 }
 
@@ -412,11 +445,11 @@ class OpKernelBuilderTest : public ::testing::Test {
     }
 
     // Test SupportedDeviceTypesForNode()
-    DeviceTypeVector devices;
+    PrioritizedDeviceTypeVector devices;
     TF_EXPECT_OK(SupportedDeviceTypesForNode(DeviceTypes(), def, &devices));
     bool found = false;
-    for (const DeviceType& dt : devices) {
-      if (dt == device_type) {
+    for (const auto& dt : devices) {
+      if (dt.first == device_type) {
         found = true;
       }
     }
@@ -445,11 +478,11 @@ class OpKernelBuilderTest : public ::testing::Test {
       EXPECT_EQ(code, status.code());
 
       // Test SupportedDeviceTypesForNode().
-      DeviceTypeVector devices;
+      PrioritizedDeviceTypeVector devices;
       if (errors::IsNotFound(status)) {
         TF_EXPECT_OK(SupportedDeviceTypesForNode(DeviceTypes(), def, &devices));
-        for (const DeviceType& dt : devices) {
-          EXPECT_NE(dt, device_type);
+        for (const auto& dt : devices) {
+          EXPECT_NE(dt.first, device_type);
         }
       } else {
         Status status2 =
@@ -562,7 +595,7 @@ REGISTER_KERNEL_BUILDER(Name("DuplicateKernel").Device(DEVICE_CPU),
 
 TEST_F(OpKernelBuilderTest, DuplicateKernel) {
   const NodeDef ndef = CreateNodeDef("DuplicateKernel", {});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(str_util::StrContains(
@@ -582,7 +615,7 @@ REGISTER_KERNEL_BUILDER(
 TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   const NodeDef ndef =
       CreateNodeDef("DuplicateKernelForT", {"T|type|DT_FLOAT"});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(str_util::StrContains(
@@ -603,7 +636,7 @@ REGISTER_KERNEL_BUILDER(Name("BadConstraint")
 
 TEST_F(OpKernelBuilderTest, BadConstraint) {
   const NodeDef ndef = CreateNodeDef("BadConstraint", {});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(
diff --git a/tensorflow/core/kernels/ops_util.cc b/tensorflow/core/framework/ops_util.cc
similarity index 98%
rename from tensorflow/core/kernels/ops_util.cc
rename to tensorflow/core/framework/ops_util.cc
index efacd05dd39cceb33397d647bbbc4c71228f1029..e8cf014ca03457e4673a14765cee5a05746b901a 100644
--- a/tensorflow/core/kernels/ops_util.cc
+++ b/tensorflow/core/framework/ops_util.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include <cmath>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/framework/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/util/padding.h"
diff --git a/tensorflow/core/framework/ops_util.h b/tensorflow/core/framework/ops_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..feaab10b366994be53b64518361d9f8d22c707c2
--- /dev/null
+++ b/tensorflow/core/framework/ops_util.h
@@ -0,0 +1,116 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_OPS_UTIL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_OPS_UTIL_H_
+
+// This file contains utilities for various operations.
+
+#include <array>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/padding.h"
+
+namespace tensorflow {
+
+// Calculates broadcast starting index and size.  For SAME padding, addition
+// padding could be applied to right, left, top and bottom.  Depending on the
+// current index, input size, kernel size, stride, padding size, the starting
+// index and size for broadcast for that dimension are different from the
+// current index and kernel size.
+// This is mainly used by gradient algorithms for pooling operations.
+Status GetBroadcastSize(const int index, const int in_size, const int ksize,
+                        const int stride, const int pad_size, int* bindex,
+                        int* bsize);
+
+// Converts Brain's Padding to Eigen's PaddingType.
+Eigen::PaddingType BrainPadding2EigenPadding(Padding padding);
+
+// Given a shape 's' of a tensor of type T. Returns true iff the
+// number of bytes occupied by each dim 0 (i.e., &tensor(i + 1, ...) -
+// &tensor(i, ...)) is multiple of EIGEN_MAX_ALIGN_BYTES.
+template <typename T>
+bool IsInnerDimsSizeAligned(const TensorShape& s) {
+  if (s.dims() == 0) return false;
+  const int64 dim0_size = s.dim_size(0);
+  if (dim0_size == 0) return false;
+#if EIGEN_MAX_ALIGN_BYTES == 0
+  return true;
+#else
+  const int64 bytes_per_dim0 = (s.num_elements() / dim0_size) * sizeof(T);
+  return bytes_per_dim0 % EIGEN_MAX_ALIGN_BYTES == 0;
+#endif
+}
+
+// Given a shape 's' of a tensor of type T and the `start` and `end` index of a
+// dim 0 slice, returns true iff slice is aligned with respect to original
+// tensor. Here aligned implies the address is a multiple of
+// EIGEN_MAX_ALIGN_BYTES.
+template <typename T>
+bool IsDim0SliceAligned(const TensorShape& s, int64 start, int64 end_or_size) {
+  if (s.dims() == 1) {
+#if EIGEN_MAX_ALIGN_BYTES == 0
+    return true;
+#else
+    bool start_aligned = (start * sizeof(T)) % EIGEN_MAX_ALIGN_BYTES == 0;
+    // End is aligned if either the explicit end index is passed and is a
+    // a multiple of EIGEN_MAX_ALIGN_BYTES, or the start index is aligned and
+    // the size is aligned. So for convenience we can either pass start and
+    // index, or start and size.
+    bool end_aligned = (end_or_size * sizeof(T)) % EIGEN_MAX_ALIGN_BYTES == 0;
+    return start_aligned && end_aligned;
+#endif
+  } else {
+    return IsInnerDimsSizeAligned<T>(s);
+  }
+}
+
+// Returns <suffix> sanitized to have only [a-zA-Z0-9-_].
+string SanitizeThreadSuffix(string suffix);
+
+// Helper to compute 'strides' given a tensor 'shape'. I.e.,
+// strides[i] = prod(shape.dim_size[(i+1):])
+template <typename T>
+gtl::InlinedVector<T, 8> ComputeStride(const TensorShape& shape) {
+  const int ndims = shape.dims();
+  gtl::InlinedVector<T, 8> strides(ndims);
+  T stride = 1;
+  for (int i = ndims - 1; i >= 0; --i) {
+    strides[i] = stride;
+    stride *= static_cast<T>(shape.dim_size(i));
+  }
+  return strides;
+}
+
+// Helper to compute 'strides' given an Eigen TensorDimensions
+template <typename T, typename EigenDimensions>
+gtl::InlinedVector<T, 8> ComputeEigenStrides(const EigenDimensions& shape) {
+  const int ndims = shape.rank();
+  gtl::InlinedVector<T, 8> strides(ndims);
+  T stride = 1;
+  for (int i = ndims - 1; i >= 0; --i) {
+    strides[i] = stride;
+    stride *= static_cast<T>(shape[i]);
+  }
+  return strides;
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_OPS_UTIL_H_
diff --git a/tensorflow/core/framework/register_types_traits.h b/tensorflow/core/framework/register_types_traits.h
index d475a1972d494635c5ebe455415c062553470752..660021759dec1609018b4fca96523983a84d234f 100644
--- a/tensorflow/core/framework/register_types_traits.h
+++ b/tensorflow/core/framework/register_types_traits.h
@@ -69,6 +69,10 @@ template <>
 struct proxy_type_pod<GPUDevice, 2> {
   typedef Eigen::half type;
 };
+template <>
+struct proxy_type_pod<GPUDevice, 1> {
+  typedef ::tensorflow::int8 type;
+};
 
 #ifdef TENSORFLOW_USE_SYCL
 template <>
@@ -81,7 +85,7 @@ struct proxy_type_pod<SYCLDevice, 4> {
 };
 #endif  // TENSORFLOW_USE_SYCL
 
-/// If POD we use proxy_type_pod, otherwise this maps to identiy.
+/// If POD we use proxy_type_pod, otherwise this maps to identity.
 template <typename Device, typename T>
 struct proxy_type {
   typedef typename std::conditional<
@@ -94,8 +98,9 @@ struct proxy_type {
 #define TF_CALL_CPU_PROXY_TYPES(m)                                     \
   TF_CALL_int64(m) TF_CALL_int32(m) TF_CALL_uint16(m) TF_CALL_int16(m) \
       TF_CALL_int8(m) TF_CALL_complex128(m)
-#define TF_CALL_GPU_PROXY_TYPES(m) \
-  TF_CALL_double(m) TF_CALL_float(m) TF_CALL_half(m) TF_CALL_int32(m)
+#define TF_CALL_GPU_PROXY_TYPES(m)                                    \
+  TF_CALL_double(m) TF_CALL_float(m) TF_CALL_half(m) TF_CALL_int32(m) \
+      TF_CALL_int8(m)
 #ifdef TENSORFLOW_USE_SYCL
 #define TF_CALL_SYCL_PROXY_TYPES(m) \
   TF_CALL_double(m) TF_CALL_float(m) TF_CALL_int32(m)
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index 508a8d3149b9f614afc900b528ae5777d0d2f5fc..9f3204ab96050a1cc06ab3052741f0044369b83e 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -204,12 +204,19 @@ Status ResourceMgr::Delete(const ResourceHandle& handle) {
 }
 
 Status ResourceMgr::Cleanup(const string& container) {
+  {
+    tf_shared_lock l(mu_);
+    if (!gtl::FindOrNull(containers_, container)) {
+      // Nothing to cleanup.
+      return Status::OK();
+    }
+  }
   Container* b = nullptr;
   {
     mutex_lock l(mu_);
     auto iter = containers_.find(container);
     if (iter == containers_.end()) {
-      // Nothing to cleanup, it's OK.
+      // Nothing to cleanup, it's OK (concurrent cleanup).
       return Status::OK();
     }
     b = iter->second;
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 4a531648d96f643d070a78e9f1b3a25eb8a74986..3195cd2e9dccaaf26ac6111a78acdb7278ea92e7 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -129,12 +129,13 @@ class ResourceMgr {
                 T** resource) const TF_MUST_USE_RESULT;
 
   // Similar to Lookup, but looks up multiple resources at once, with only a
-  // single lock acquisition.
+  // single lock acquisition.  If containers_and_names[i] is uninitialized
+  // then this function does not modify resources[i].
   template <typename T>
   Status LookupMany(absl::Span<std::pair<const string*, const string*> const>
                         containers_and_names,
                     std::vector<std::unique_ptr<T, core::RefCountDeleter>>*
-                        resource) const TF_MUST_USE_RESULT;
+                        resources) const TF_MUST_USE_RESULT;
 
   // If "container" has a resource "name", returns it in
   // "*resource". Otherwise, invokes creator() to create the resource.
@@ -261,7 +262,8 @@ Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, T* value);
 template <typename T>
 Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, T** value);
 
-// Looks up multiple resources pointed by a sequence of resource handles.
+// Looks up multiple resources pointed by a sequence of resource handles.  If
+// p[i] is uninitialized then values[i] is unmodified.
 template <typename T>
 Status LookupResources(
     OpKernelContext* ctx, absl::Span<ResourceHandle const> p,
@@ -445,10 +447,11 @@ Status ResourceMgr::LookupMany(
   resources->resize(containers_and_names.size());
   for (size_t i = 0; i < containers_and_names.size(); ++i) {
     T* resource;
-    TF_RETURN_IF_ERROR(LookupInternal(*containers_and_names[i].first,
-                                      *containers_and_names[i].second,
-                                      &resource));
-    (*resources)[i].reset(resource);
+    Status s = LookupInternal(*containers_and_names[i].first,
+                              *containers_and_names[i].second, &resource);
+    if (s.ok()) {
+      (*resources)[i].reset(resource);
+    }
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/framework/run_handler.cc b/tensorflow/core/framework/run_handler.cc
index 0c4007eafcee22d280747cf9b21630a12a63e961..55790b6e526ea38c45c79bd7e8345bafed90c5d0 100644
--- a/tensorflow/core/framework/run_handler.cc
+++ b/tensorflow/core/framework/run_handler.cc
@@ -92,6 +92,27 @@ class RunHandlerPool::Impl {
       handlers_.emplace_back(new RunHandler::Impl(this));
       free_handlers_.push_back(handlers_.back().get());
     }
+    // Set steal partitions to a fixed size steal domain of size 6 = 2 *
+    // kMinThreadsPerRequest.
+    std::vector<std::pair<unsigned, unsigned>> steal_partitions(
+        num_inter_op_threads);
+    int kStealDomainSize = std::min(6, num_inter_op_threads);
+    unsigned steal_start = 0, steal_end = kStealDomainSize;
+    for (int i = 0; i < num_inter_op_threads; ++i) {
+      if (i > steal_start) {
+        if (steal_end + kStealDomainSize < num_inter_op_threads) {
+          steal_start = steal_end;
+          steal_end += kStealDomainSize;
+        } else {
+          steal_end = num_inter_op_threads;
+          steal_start = steal_end - kStealDomainSize;
+        }
+      }
+      steal_partitions[i] = std::make_pair(steal_start, steal_end);
+      VLOG(1) << "Steal partition i: " << i << " steal_start: " << steal_start
+              << " steal_end: " << steal_end;
+    }
+    inter_op_thread_pool_->SetStealPartitions(steal_partitions);
   }
 
   ~Impl() {
@@ -223,7 +244,9 @@ void RunHandlerPool::Impl::RecomputePoolStatsLocked() {
 void RunHandler::Impl::ScheduleInterOpClosure(std::function<void()> fn) {
   std::uint_fast32_t start = 0, limit = 0;
   DecodePartition(inter_op_scheduling_range(), &start, &limit);
-  pool_impl_->inter_op_thread_pool()->Schedule(std::move(fn));
+  DCHECK_LT(start, limit);
+  pool_impl_->inter_op_thread_pool()->ScheduleWithHint(std::move(fn), start,
+                                                       limit);
 }
 
 void RunHandler::Impl::Reset() {
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 3e77028a5f1dcf3d35fbcfbc15be99cf957b36db..4dcc80680ff7c62b31fb266c0f5cd80a9325fe81 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -239,6 +239,15 @@ void InferenceContext::PreInputInit(
   output_handle_shapes_and_types_.resize(num_outputs);
 }
 
+Status InferenceContext::ExpandOutputs(int new_output_size) {
+  if (new_output_size < outputs_.size()) {
+    return errors::InvalidArgument("Trying to reduce number of outputs of op.");
+  }
+  outputs_.resize(new_output_size, nullptr);
+  output_handle_shapes_and_types_.resize(new_output_size);
+  return Status::OK();
+}
+
 void InferenceContext::PostInputInit(
     std::vector<std::unique_ptr<std::vector<ShapeAndType>>> input_handle_data) {
   int num_inputs_from_node_def = 0;
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index 81258b55b392e25efe7ed117c09645faab067e30..e3885b7d9e8a3f746d0cc2121dad71221d4ec06b 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -323,13 +323,13 @@ class InferenceContext {
     return input_tensors_as_shapes_;
   }
 
-  ShapeHandle output(int64 idx) const { return outputs_[idx]; }
-  void set_output(int idx, ShapeHandle shape) { outputs_[idx] = shape; }
+  ShapeHandle output(int64 idx) const { return outputs_.at(idx); }
+  void set_output(int idx, ShapeHandle shape) { outputs_.at(idx) = shape; }
   Status set_output(StringPiece output_name,
                     const std::vector<ShapeHandle>& shapes);
 
   int num_outputs() const { return outputs_.size(); }
-  ShapeHandle output(int idx) const { return outputs_[idx]; }
+  ShapeHandle output(int idx) const { return outputs_.at(idx); }
   Status output(StringPiece output_name,
                 std::vector<ShapeHandle>* output) const;
 
@@ -645,6 +645,9 @@ class InferenceContext {
     return merged_dims_;
   }
 
+  // Adds new outputs; useful when mutating the graph.
+  Status ExpandOutputs(int new_output_size);
+
  private:
   // Creates and stores shapes for use in InferenceContext.
   class ShapeManager {
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 1dea6da9113bab15848eb6be9004bd9f180e518d..7e841489eb35d4ec3d18fe255472107ef9d60efe 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -68,7 +68,8 @@ namespace {
 // An un-templated base class for Buffer.
 class BufferBase : public TensorBuffer {
  public:
-  explicit BufferBase(Allocator* alloc) : alloc_(alloc) {}
+  explicit BufferBase(Allocator* alloc, void* data_ptr)
+      : TensorBuffer(data_ptr), alloc_(alloc) {}
 
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
@@ -106,7 +107,6 @@ class Buffer : public BufferBase {
   Buffer(Allocator* a, int64 n);
   Buffer(Allocator* a, int64 n, const AllocationAttributes& allocation_attr);
 
-  void* data() const override { return data_; }
   size_t size() const override { return sizeof(T) * elem_; }
 
  private:
@@ -442,20 +442,20 @@ struct ProtoHelper<Eigen::half> {
 
 template <typename T>
 Buffer<T>::Buffer(Allocator* a, int64 n)
-    : BufferBase(a), data_(a->Allocate<T>(n)), elem_(n) {}
+    : BufferBase(a, a->Allocate<T>(n)), elem_(n) {}
 
 template <typename T>
 Buffer<T>::Buffer(Allocator* a, int64 n,
                   const AllocationAttributes& allocation_attr)
-    : BufferBase(a), data_(a->Allocate<T>(n, allocation_attr)), elem_(n) {}
+    : BufferBase(a, a->Allocate<T>(n, allocation_attr)), elem_(n) {}
 
 template <typename T>
 Buffer<T>::~Buffer() {
-  if (data_) {
+  if (data()) {
     if (LogMemory::IsEnabled()) {
       RecordDeallocation();
     }
-    alloc_->Deallocate<T>(data_, elem_);
+    alloc_->Deallocate<T>(static_cast<T*>(data()), elem_);
   }
 }
 
@@ -752,12 +752,21 @@ Tensor::Tensor(Allocator* a, DataType type, const TensorShape& shape,
 Tensor::Tensor(DataType type, const TensorShape& shape)
     : Tensor(cpu_allocator(), type, shape) {}
 
+void Tensor::HostScalarTensorBufferBase::FillAllocationDescription(
+    AllocationDescription* proto) const {
+  proto->set_requested_bytes(size());
+  proto->set_allocator_name("HostScalarTensorBuffer");
+  proto->set_ptr(reinterpret_cast<uintptr_t>(data()));
+}
+
 template <typename T>
 class SubBuffer : public TensorBuffer {
  public:
   // This buffer is an alias to buf[delta, delta + n).
   SubBuffer(TensorBuffer* buf, int64 delta, int64 n)
-      : root_(buf->root_buffer()), data_(buf->base<T>() + delta), elem_(n) {
+      : TensorBuffer(buf->base<T>() + delta),
+        root_(buf->root_buffer()),
+        elem_(n) {
     // Sanity check. The caller should ensure the sub buffer is valid.
     CHECK_LE(root_->base<T>(), this->base<T>());
     T* root_limit = root_->base<T>() + root_->size() / sizeof(T);
@@ -768,7 +777,6 @@ class SubBuffer : public TensorBuffer {
     root_->Ref();
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return sizeof(T) * elem_; }
   TensorBuffer* root_buffer() override { return root_; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index d0f9eb56e236aac2b04174330450e4b4d87b24b3..6e03cf9f6f47c89289ffaec507f56d8c734e52a9 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_
 #define TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_
 
+#include <cstdint>
+#include <type_traits>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -28,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -110,6 +113,76 @@ class Tensor {
   /// for details.
   explicit Tensor(DataType type);
 
+ private:
+  // A tag type for selecting the `Tensor` constructor overload that creates a
+  // scalar tensor in host memory.
+  struct host_scalar_tag {};
+
+  class HostScalarTensorBufferBase;
+  template <typename T>
+  struct ValueAndTensorBuffer;
+
+  // Creates a tensor with the given scalar `value` in CPU memory.
+  template <typename T>
+  Tensor(T value, host_scalar_tag tag);
+
+ public:
+  // A series of specialized constructors for scalar tensors in host memory.
+  //
+  // NOTE: The `Variant` host-scalar constructor is not defined, because Variant
+  // is implicitly constructible from many different types, and this causes
+  // ambiguities with some compilers.
+  explicit Tensor(float scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(double scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(int32 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(uint32 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(uint16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(uint8 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(int16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(int8 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(string scalar_value)
+      : Tensor(std::move(scalar_value), host_scalar_tag{}) {}
+  explicit Tensor(complex64 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(complex128 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(int64 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(uint64 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(bool scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(qint8 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(quint8 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(qint16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(quint16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(qint32 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(bfloat16 scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(Eigen::half scalar_value)
+      : Tensor(scalar_value, host_scalar_tag{}) {}
+  explicit Tensor(ResourceHandle scalar_value)
+      : Tensor(std::move(scalar_value), host_scalar_tag{}) {}
+
+  // NOTE: The `const char*` host-scalar constructor is provided as a
+  // convenience because otherwise passing a string literal would surprisingly
+  // construct a DT_BOOL tensor.
+  explicit Tensor(const char* scalar_value)
+      : Tensor(string(scalar_value), host_scalar_tag{}) {}
+
   /// Copy constructor.
   Tensor(const Tensor& other);
 
@@ -563,10 +636,15 @@ class Tensor {
 // Interface to access the raw ref-counted data buffer.
 class TensorBuffer : public core::RefCounted {
  public:
+  explicit TensorBuffer(void* data_ptr) : data_(data_ptr) {}
   ~TensorBuffer() override {}
 
   // data() points to a memory region of size() bytes.
-  virtual void* data() const = 0;
+  //
+  // NOTE(mrry): The `data()` method is not virtual for performance reasons.
+  // It can be called multiple times when the contents of a `Tensor` are
+  // accessed, and so making it non-virtual allows the body to be inlined.
+  void* data() const { return data_; }
   virtual size_t size() const = 0;
 
   // If this TensorBuffer is sub-buffer of another TensorBuffer,
@@ -584,6 +662,9 @@ class TensorBuffer : public core::RefCounted {
 
   // Whether this TensorBuffer owns the underlying memory.
   virtual bool OwnsMemory() const { return true; }
+
+ private:
+  void* const data_;
 };
 
 template <typename T>
@@ -799,6 +880,80 @@ inline Tensor::Tensor(Tensor&& other)
   other.buf_ = nullptr;
 }
 
+class Tensor::HostScalarTensorBufferBase : public TensorBuffer {
+ public:
+  using TensorBuffer::TensorBuffer;
+  void FillAllocationDescription(AllocationDescription* proto) const final;
+};
+
+// A packed representation for a single scalar value of type `T`, and a
+// `TensorBuffer` implementation that describes (and manages the lifetime of)
+// that value.
+template <typename T>
+struct Tensor::ValueAndTensorBuffer {
+  class HostScalarTensorBuffer : public Tensor::HostScalarTensorBufferBase {
+   public:
+    HostScalarTensorBuffer(void* data) : HostScalarTensorBufferBase(data) {}
+    size_t size() const final { return sizeof(T); }
+    TensorBuffer* root_buffer() final { return this; }
+
+    // Override `operator delete` so that calling `delete this` in
+    // `core::Refcounted::Unref()` for an object of this type will free
+    // the enclosing `ValueAndTensorBuffer` for the tensor buffer.
+    //
+    // NOTE(mrry): The definition of this method must be outside the class
+    // definition in order to satisfy some compilers.
+    static void operator delete(void* ptr);
+
+    static void operator delete(void*, void*) {
+      // Some compilers require an overridden class-specific deallocation
+      // function, which will be called if placement `new` throws an
+      // exception.
+    }
+
+   private:
+    ~HostScalarTensorBuffer() override { static_cast<T*>(data())->~T(); }
+  };
+
+  T value;
+  HostScalarTensorBuffer tensor_buffer;
+};
+
+/* static */
+template <typename T>
+void Tensor::ValueAndTensorBuffer<T>::HostScalarTensorBuffer::operator delete(
+    void* ptr) {
+  // Use a dummy object to compute to offset of
+  // `ValueAndTensorBuffer::tensor_buffer`, because `offsetof()` is not
+  // necessarily defined on this non-POD type (until C++17).
+  //
+  // NOTE(mrry): Using `sizeof(Tensor::ValueAndTensorBuffer<T>)` here requires
+  // us to define this method outside the class definition, so that it is not
+  // considered an incomplete type.
+  typename std::aligned_storage<sizeof(Tensor::ValueAndTensorBuffer<T>),
+                                alignof(Tensor::ValueAndTensorBuffer<T>)>::type
+      dummy_storage_;
+  Tensor::ValueAndTensorBuffer<T>* dummy_object =
+      reinterpret_cast<Tensor::ValueAndTensorBuffer<T>*>(&dummy_storage_);
+  intptr_t offset = reinterpret_cast<intptr_t>(&dummy_object->tensor_buffer) -
+                    reinterpret_cast<intptr_t>(dummy_object);
+
+  port::AlignedFree(static_cast<char*>(ptr) - offset);
+}
+
+template <typename T>
+Tensor::Tensor(T value, host_scalar_tag tag) {
+  auto* value_and_buf = static_cast<Tensor::ValueAndTensorBuffer<T>*>(
+      port::AlignedMalloc(sizeof(typename Tensor::ValueAndTensorBuffer<T>),
+                          EIGEN_MAX_ALIGN_BYTES));
+  new (&value_and_buf->value) T(std::move(value));
+  new (&value_and_buf->tensor_buffer)
+      typename Tensor::ValueAndTensorBuffer<T>::HostScalarTensorBuffer(
+          value_and_buf);
+  buf_ = &value_and_buf->tensor_buffer;
+  set_dtype(DataTypeToEnum<T>::value);
+}
+
 inline Tensor& Tensor::operator=(Tensor&& other) {
   // Avoid self-assignment, since we might destroy our underlying buffer.
   if (&other != this) {
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index be7e740c335ced3ec6826e804090927962d57285..625d88ec1bdcdd9765dd64b09a1bad51f7fa3370 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -300,13 +300,17 @@ class TensorShape : public TensorShapeBase<TensorShape> {
   bool operator!=(const TensorShape& b) const { return !IsSameSize(b); }
 
   /// Fill `*dsizes` from `*this`.
-  template <int NDIMS>
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS> AsEigenDSizes() const;
+  /// Notice: Using IndexType=int32 in combination with To32Bit() can
+  /// significantly improve performance on GPU.
+  template <int NDIMS, typename IndexType = Eigen::DenseIndex>
+  Eigen::DSizes<IndexType, NDIMS> AsEigenDSizes() const;
 
   /// Same as `AsEigenDSizes()` but allows for `NDIMS > dims()` -- in
   /// which case we pad the rest of the sizes with 1.
-  template <int NDIMS>
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS> AsEigenDSizesWithPadding() const;
+  /// Notice: Using IndexType=int32 in combination with To32Bit() can
+  /// significantly improve performance on GPU.
+  template <int NDIMS, typename IndexType = Eigen::DenseIndex>
+  Eigen::DSizes<IndexType, NDIMS> AsEigenDSizesWithPadding() const;
 
  private:
   // These CHECK fail to ease debugging.
@@ -458,20 +462,19 @@ class PartialTensorShapeUtils {
 // Template method implementation details below
 // ----------------------------------------------------------------------------
 
-template <int NDIMS>
-Eigen::DSizes<Eigen::DenseIndex, NDIMS> TensorShape::AsEigenDSizes() const {
+template <int NDIMS, typename IndexType>
+Eigen::DSizes<IndexType, NDIMS> TensorShape::AsEigenDSizes() const {
   CheckDimsEqual(NDIMS);
-  return AsEigenDSizesWithPadding<NDIMS>();
+  return AsEigenDSizesWithPadding<NDIMS, IndexType>();
 }
 
-template <int NDIMS>
-Eigen::DSizes<Eigen::DenseIndex, NDIMS> TensorShape::AsEigenDSizesWithPadding()
-    const {
+template <int NDIMS, typename IndexType>
+Eigen::DSizes<IndexType, NDIMS> TensorShape::AsEigenDSizesWithPadding() const {
   CheckDimsAtLeast(NDIMS);
   static_assert(NDIMS <= TensorShape::MaxDimensions(), "Too many dimensions");
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
+  Eigen::DSizes<IndexType, NDIMS> dsizes;
   for (int d = 0; d < dims(); d++) {
-    dsizes[d] = dim_size(d);
+    dsizes[d] = static_cast<IndexType>(dim_size(d));
   }
   for (int d = dims(); d < NDIMS; d++) {
     dsizes[d] = 1;
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index c5966041435b240a821fc510fa3479a06ca457e9..713f91fe04c6fe498209d88193f6fbb1729ec57c 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -830,6 +830,45 @@ TEST(Tensor_Scalar, Basics) {
   }
 }
 
+TEST(Tensor_HostScalar, Basics) {
+  {
+    Tensor t(true);
+    EXPECT_EQ(DT_BOOL, t.dtype());
+    EXPECT_EQ(1, t.NumElements());
+    auto Tt = t.scalar<bool>();
+    EXPECT_EQ(1, Tt.size());
+    EXPECT_EQ(0, Tt.rank());
+    EXPECT_TRUE(Tt());
+    Tt() = false;
+    EXPECT_FALSE(Tt());
+  }
+  {
+    Tensor t(123.45f);
+    EXPECT_EQ(DT_FLOAT, t.dtype());
+    EXPECT_EQ(1, t.NumElements());
+    auto Tt = t.scalar<float>();
+    EXPECT_EQ(1, Tt.size());
+    EXPECT_EQ(0, Tt.rank());
+    EXPECT_FLOAT_EQ(123.45f, Tt());
+    Tt() = 42.0f;
+    EXPECT_FLOAT_EQ(42.0f, Tt());
+  }
+  {
+    // NOTE(mrry): Use long enough strings so that the contents are dynamically
+    // allocated, and the absence of a call to the string destructor would
+    // cause a memory leak.
+    Tensor t("fooooooooooooooooooooooooooooooooooooo");
+    EXPECT_EQ(DT_STRING, t.dtype());
+    EXPECT_EQ(1, t.NumElements());
+    auto Tt = t.scalar<string>();
+    EXPECT_EQ(1, Tt.size());
+    EXPECT_EQ(0, Tt.rank());
+    EXPECT_EQ("fooooooooooooooooooooooooooooooooooooo", Tt());
+    Tt() = "baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaar";
+    EXPECT_EQ("baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaar", Tt());
+  }
+}
+
 TEST(Tensor_Float, Reshape_And_Slice_Assignment) {
   // A test to experiment with a way to assign to a subset of a tensor
   Tensor t(DT_FLOAT, TensorShape({10, 4, 3, 2}));
@@ -1452,5 +1491,26 @@ void BM_CreateAndMoveCtrWithBuf(int iters) {
 }
 BENCHMARK(BM_CreateAndMoveCtrWithBuf);
 
+// Benchmark creating and destroy a host-scalar tensor, using the allocator
+// interface.
+void BM_CreateAndDestroyHostScalarNonOptimized(int iters) {
+  TensorShape shape({});
+  Allocator* allocator = cpu_allocator();
+  while (--iters) {
+    Tensor a(allocator, DT_FLOAT, shape);
+    a.scalar<float>()() = 37.0;
+  }
+}
+BENCHMARK(BM_CreateAndDestroyHostScalarNonOptimized);
+
+// Benchmark creating and destroy a host-scalar tensor, using the specialized
+// constructor.
+void BM_CreateAndDestroyHostScalarOptimized(int iters) {
+  while (--iters) {
+    Tensor a(37.0);
+  }
+}
+BENCHMARK(BM_CreateAndDestroyHostScalarOptimized);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index 2e96b0578792aa5d48a9384696d23b9728f45e22..c0df19334210bb0830371d3d5c2fc4edd0d297bc 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -104,6 +104,8 @@ typedef gtl::InlinedVector<DataType, 4> DataTypeVector;
 typedef gtl::ArraySlice<DataType> DataTypeSlice;
 
 typedef gtl::InlinedVector<DeviceType, 4> DeviceTypeVector;
+typedef gtl::InlinedVector<std::pair<DeviceType, int32>, 4>
+    PrioritizedDeviceTypeVector;
 
 // Convert the enums to strings for errors:
 string DataTypeString(DataType dtype);
@@ -295,7 +297,7 @@ inline const DataTypeSet& QuantizedTypes() { return kQuantizedTypes; }
 // Types that support '<' and '>', including quantized types.
 const DataTypeSet kRealAndQuantizedTypes =
     ToSet(DT_FLOAT) | ToSet(DT_DOUBLE) | ToSet(DT_INT32) | ToSet(DT_INT64) |
-    ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_UINT16) | ToSet(DT_INT8) |
+    ToSet(DT_UINT8) | ToSet(DT_UINT16) | ToSet(DT_INT16) | ToSet(DT_INT8) |
     ToSet(DT_QINT8) | ToSet(DT_QUINT8) | ToSet(DT_QINT16) | ToSet(DT_QUINT16) |
     ToSet(DT_QINT32) | ToSet(DT_HALF) | ToSet(DT_BFLOAT16);
 inline const DataTypeSet& RealAndQuantizedTypes() {
diff --git a/tensorflow/core/framework/variant_tensor_data.h b/tensorflow/core/framework/variant_tensor_data.h
index 8a240ee1e35fc4e4a0b96515b33819b47919349c..8c69c870345a68a2c5fc5f1f33015c7bb97c123e 100644
--- a/tensorflow/core/framework/variant_tensor_data.h
+++ b/tensorflow/core/framework/variant_tensor_data.h
@@ -38,6 +38,8 @@ class VariantTensorDataProto;
 class VariantTensorData {
  public:
   VariantTensorData();
+  // TODO(b/118823936): This silently returns if the proto is invalid.
+  // Consider calling FromProto explicitly instead.
   VariantTensorData(VariantTensorDataProto proto);
   ~VariantTensorData();
 
diff --git a/tensorflow/core/graph/edgeset_test.cc b/tensorflow/core/graph/edgeset_test.cc
index b4cef8f336550fd6eb32e189179af3f50f8c2030..c5d2d6c70f0266966694c6af682975ce4719a203 100644
--- a/tensorflow/core/graph/edgeset_test.cc
+++ b/tensorflow/core/graph/edgeset_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/graph/edgeset.h"
 
+#include <set>
 #include <vector>
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/platform/test.h"
@@ -22,30 +23,27 @@ limitations under the License.
 namespace tensorflow {
 class EdgeSetTest : public ::testing::Test {
  public:
-  EdgeSetTest() : edges_(nullptr), eset_(nullptr) {}
-
-  ~EdgeSetTest() override {
-    delete eset_;
-    delete[] edges_;
-  }
+  EdgeSetTest() : edges_(nullptr) {}
+  ~EdgeSetTest() override { delete[] edges_; }
 
   void MakeEdgeSet(int n) {
-    delete eset_;
-    delete[] edges_;
+    if (edges_) {
+      delete[] edges_;
+    }
     edges_ = new Edge[n];
-    eset_ = new EdgeSet;
+    eset_.clear();
     model_.clear();
     for (int i = 0; i < n; i++) {
-      eset_->insert(&edges_[i]);
+      eset_.insert(&edges_[i]);
       model_.insert(&edges_[i]);
     }
   }
 
   void CheckSame() {
-    EXPECT_EQ(model_.size(), eset_->size());
-    EXPECT_EQ(model_.empty(), eset_->empty());
+    EXPECT_EQ(model_.size(), eset_.size());
+    EXPECT_EQ(model_.empty(), eset_.empty());
     std::vector<const Edge*> modelv(model_.begin(), model_.end());
-    std::vector<const Edge*> esetv(eset_->begin(), eset_->end());
+    std::vector<const Edge*> esetv(eset_.begin(), eset_.end());
     std::sort(modelv.begin(), modelv.end());
     std::sort(esetv.begin(), esetv.end());
     EXPECT_EQ(modelv.size(), esetv.size());
@@ -54,26 +52,27 @@ class EdgeSetTest : public ::testing::Test {
     }
   }
 
+  static constexpr int kInline = 64 / sizeof(const void*);
   Edge nonexistent_;
   Edge* edges_;
-  EdgeSet* eset_;
+  EdgeSet eset_;
   std::set<const Edge*> model_;
 };
 
 namespace {
 
 TEST_F(EdgeSetTest, Ops) {
-  for (int n : {0, 1, 2, 3, 4, 10}) {
+  for (int n : {0, 1, 2, kInline + 1}) {
     MakeEdgeSet(n);
     CheckSame();
-    EXPECT_EQ((n == 0), eset_->empty());
-    EXPECT_EQ(n, eset_->size());
+    EXPECT_EQ((n == 0), eset_.empty());
+    EXPECT_EQ(n, eset_.size());
 
-    eset_->clear();
+    eset_.clear();
     model_.clear();
     CheckSame();
 
-    eset_->insert(&edges_[0]);
+    eset_.insert(&edges_[0]);
     model_.insert(&edges_[0]);
     CheckSame();
   }
@@ -81,15 +80,14 @@ TEST_F(EdgeSetTest, Ops) {
 
 // Try insert/erase of existing elements at different positions.
 TEST_F(EdgeSetTest, Exists) {
-  for (int n : {0, 1, 2, 3, 4, 10}) {
+  for (int n : {0, 1, 2, kInline + 1}) {
     MakeEdgeSet(n);
     for (int pos = 0; pos < n; pos++) {
-      MakeEdgeSet(n);
-      auto p = eset_->insert(&edges_[pos]);
+      auto p = eset_.insert(&edges_[pos]);
       EXPECT_FALSE(p.second);
       EXPECT_EQ(&edges_[pos], *p.first);
 
-      EXPECT_EQ(1, eset_->erase(&edges_[pos]));
+      EXPECT_EQ(1, eset_.erase(&edges_[pos]));
       model_.erase(&edges_[pos]);
       CheckSame();
     }
@@ -98,10 +96,10 @@ TEST_F(EdgeSetTest, Exists) {
 
 // Try insert/erase of non-existent element.
 TEST_F(EdgeSetTest, DoesNotExist) {
-  for (int n : {0, 1, 2, 3, 4, 10}) {
+  for (int n : {0, 1, 2, kInline + 1}) {
     MakeEdgeSet(n);
-    EXPECT_EQ(0, eset_->erase(&nonexistent_));
-    auto p = eset_->insert(&nonexistent_);
+    EXPECT_EQ(0, eset_.erase(&nonexistent_));
+    auto p = eset_.insert(&nonexistent_);
     EXPECT_TRUE(p.second);
     EXPECT_EQ(&nonexistent_, *p.first);
   }
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 7a4a0096fa1cad77c68cef17d43cd2899c2b1b24..550e3ef915290c499c904c14e2ca8c5fa7e4a981 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -34,7 +34,7 @@ namespace tensorflow {
 
 const int Graph::kControlSlot = -1;
 
-class NodeProperties {
+struct NodeProperties {
  public:
   NodeProperties(const OpDef* op_def, const NodeDef& node_def,
                  const DataTypeSlice inputs, const DataTypeSlice outputs)
@@ -84,6 +84,7 @@ const std::unordered_map<string, Node::NodeClass>& Node::kNodeClassTable =
         {"CollectiveReduce", NC_COLLECTIVE},
         {"CollectiveBcastSend", NC_COLLECTIVE},
         {"CollectiveBcastRecv", NC_COLLECTIVE},
+        {"FakeParam", NC_FAKE_PARAM},
     });
 
 #undef REF_CLASS
@@ -142,6 +143,19 @@ void Node::Clear() {
   assigned_device_name_index_ = 0;
 }
 
+void Node::UpdateProperties() {
+  DataTypeVector inputs;
+  DataTypeVector outputs;
+  Status status =
+      InOutTypesForNode(props_->node_def, *(props_->op_def), &inputs, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << "Failed at updating node: " << status;
+    return;
+  }
+  props_ = std::make_shared<NodeProperties>(props_->op_def, props_->node_def,
+                                            inputs, outputs);
+}
+
 const string& Node::name() const { return props_->node_def.name(); }
 const string& Node::type_string() const { return props_->node_def.op(); }
 const NodeDef& Node::def() const { return props_->node_def; }
@@ -271,6 +285,14 @@ Status Node::input_node(int idx, const Node** const_n) const {
   return Status::OK();
 }
 
+Status Node::input_tensor(int idx, OutputTensor* t) const {
+  const Edge* e;
+  TF_RETURN_IF_ERROR(input_edge(idx, &e));
+  DCHECK(e != nullptr);
+  *t = OutputTensor(e->src(), e->src_output());
+  return Status::OK();
+}
+
 // InputTensor
 
 bool InputTensor::operator==(const InputTensor& other) const {
@@ -516,7 +538,7 @@ Status Graph::UpdateEdge(Node* new_src, int new_src_index, Node* dst,
   const Edge* e = FindEdge(dst, dst_index);
   if (e == nullptr) {
     return errors::InvalidArgument("Couldn't find edge to ",
-                                   dst->DebugString());
+                                   FormatNodeForError(*dst));
   }
   RemoveEdge(e);
   AddEdge(new_src, new_src_index, dst, dst_index);
@@ -526,6 +548,22 @@ Status Graph::UpdateEdge(Node* new_src, int new_src_index, Node* dst,
   return Status::OK();
 }
 
+Status Graph::AddWhileInputHack(Node* new_src, int new_src_index, Node* dst) {
+  if (dst->type_string() != "While") {
+    return errors::Internal(
+        "dst argument to AddWhileEdgeHack should be a While op, got: ",
+        dst->DebugString());
+  }
+  TF_RETURN_IF_ERROR(IsValidOutputTensor(new_src, new_src_index));
+  int dst_index = dst->in_edges().size();
+  TF_RETURN_IF_ERROR(IsValidInputTensor(dst, dst_index));
+  AddEdge(new_src, new_src_index, dst, dst_index);
+  dst->MaybeCopyOnWrite();
+  dst->props_->node_def.add_input(
+      strings::StrCat(new_src->name(), ":", new_src_index));
+  return Status::OK();
+}
+
 Status Graph::AddFunctionLibrary(const FunctionDefLibrary& fdef_lib) {
   // Need a new-enough consumer to support the functions we add to the graph.
   if (fdef_lib.function_size() > 0 && versions_->min_consumer() < 12) {
@@ -737,6 +775,14 @@ Status Graph::AddWhileContext(StringPiece frame_name,
   return Status::OK();
 }
 
+std::unordered_map<string, Node*> Graph::BuildNodeNameIndex() const {
+  std::unordered_map<string, Node*> result;
+  for (Node* n : nodes()) {
+    result[n->name()] = n;
+  }
+  return result;
+}
+
 string Edge::DebugString() const {
   return strings::Printf("[id=%d %s:%d -> %s:%d]", id_, src_->name().c_str(),
                          src_output_, dst_->name().c_str(), dst_input_);
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 2944951f826ab8f90559250259ce4c8e06da884f..667eaba24c3341cbafc68c92ac5e9fa23dbe669d 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -59,12 +59,13 @@ class EdgeSetTest;
 class Graph;
 class GraphDef;
 class Node;
+struct OutputTensor;
 class VersionDef;
 class WhileContext;
 
 class NeighborIter;    // Declared below
 class NodeIter;        // Declared below
-class NodeProperties;  // Defined in .cc
+struct NodeProperties;  // Defined in .cc
 
 class Node {
  public:
@@ -167,10 +168,12 @@ class Node {
   bool IsCollective() const { return class_ == NC_COLLECTIVE; }
 
   bool IsMetadata() const { return class_ == NC_METADATA; }
+  bool IsFakeParam() const { return class_ == NC_FAKE_PARAM; }
 
   template <typename T>
   void AddAttr(const string& name, const T& val) {
     SetAttrValue(val, AddAttrHelper(name));
+    UpdateProperties();
   }
 
   void ClearAttr(const string& name);
@@ -187,6 +190,10 @@ class Node {
   Status input_node(int idx, const Node** n) const;
   Status input_node(int idx, Node** n) const;
 
+  // Returns into '*t' the idx-th input tensor of this node, represented as the
+  // output tensor of input_node(idx).
+  Status input_tensor(int idx, OutputTensor* t) const;
+
   WhileContext* while_ctx() const { return while_ctx_; }
   void set_while_ctx(WhileContext* while_ctx) {
     DCHECK(IsExit());
@@ -211,6 +218,10 @@ class Node {
   // e.g. in AddAttr.
   void MaybeCopyOnWrite();
 
+  // Called after an attr has changed. Decides whether we need to update some
+  // property of the node (stored in props_).
+  void UpdateProperties();
+
   AttrValue* AddAttrHelper(const string& name);
 
   // A set of mutually exclusive classes for different kinds of nodes,
@@ -238,6 +249,7 @@ class Node {
     NC_METADATA,
     NC_SCOPED_ALLOCATOR,
     NC_COLLECTIVE,
+    NC_FAKE_PARAM,
     NC_OTHER  // Not a special kind of node
   };
 
@@ -280,10 +292,10 @@ class Node {
 
 // Represents an input of a node, i.e., the `index`-th input to `node`.
 struct InputTensor {
-  const Node* node;
+  Node* node;
   int index;
 
-  InputTensor(const Node* n, int i) : node(n), index(i) {}
+  InputTensor(Node* n, int i) : node(n), index(i) {}
   InputTensor() : node(nullptr), index(0) {}
 
   // Returns true if this InputTensor is identical to 'other'. Nodes are
@@ -301,10 +313,10 @@ struct InputTensor {
 // that a single `OutputTensor` can correspond to multiple `Edge`s if the output
 // is consumed by multiple destination nodes.
 struct OutputTensor {
-  const Node* node;
+  Node* node;
   int index;
 
-  OutputTensor(const Node* n, int i) : node(n), index(i) {}
+  OutputTensor(Node* n, int i) : node(n), index(i) {}
   OutputTensor() : node(nullptr), index(0) {}
 
   // Returns true if this OutputTensor is identical to 'other'. Nodes are
@@ -418,9 +430,9 @@ class Graph {
   // Constructs a graph with a single SOURCE (always id kSourceId) and a
   // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK.
   //
-  // The graph can hold ops found in registry. `registry`s lifetime must be at
+  // The graph can hold ops found in the registry. `ops`s lifetime must be at
   // least that of the constructed graph's.
-  explicit Graph(const OpRegistryInterface* registry);
+  explicit Graph(const OpRegistryInterface* ops);
 
   // Constructs a graph with a single SOURCE (always id kSourceId) and a
   // single SINK (always id kSinkId) node, and an edge from SOURCE->SINK.
@@ -481,11 +493,17 @@ class Graph {
   // the corresponding NodeDef to reflect the change.
   // REQUIRES: The control edge must exist.
   void RemoveControlEdge(const Edge* e);
+
   // Updates the input to a node.  The existing edge to `dst` is removed and an
   // edge from `new_src` to `dst` is created. The NodeDef associated with `dst`
   // is also updated.
   Status UpdateEdge(Node* new_src, int new_src_index, Node* dst, int dst_index);
 
+  // Like AddEdge but updates dst's NodeDef. Used to add an input edge to a
+  // "While" op during gradient construction, see AddInputWhileHack in
+  // python_api.h for more details.
+  Status AddWhileInputHack(Node* new_src, int new_src_index, Node* dst);
+
   // Adds the function and gradient definitions in `fdef_lib` to this graph's op
   // registry. Ignores duplicate functions, and returns a bad status if an
   // imported function differs from an existing function or op with the same
@@ -609,6 +627,9 @@ class Graph {
                          std::vector<OutputTensor> body_outputs,
                          WhileContext** result);
 
+  // Builds a node name to node pointer index for all nodes in the graph.
+  std::unordered_map<string, Node*> BuildNodeNameIndex() const;
+
   // TODO(josh11b): uint64 hash() const;
 
  private:
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index eeb5c14eaa5f43b8e05c4ee6573cfe9d4c54af02..f6d83d5f6fff9be372e512e2ff7b8366201bdd81 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -86,7 +86,8 @@ class GraphConstructor {
           return_nodes(in.return_nodes),
           importing(true),
           validate_colocation_constraints(in.validate_colocation_constraints),
-          validate_shape(in.validate_shape) {}
+          validate_shape(in.validate_shape),
+          default_device(in.default_device) {}
 
     bool allow_internal_ops;
     bool expect_device_spec;
@@ -111,6 +112,8 @@ class GraphConstructor {
     bool importing;
     bool validate_colocation_constraints;
     bool validate_shape = true;
+
+    string default_device;
   };
 
   typedef gtl::ArraySlice<const NodeDef*> NodeDefSlice;
@@ -963,6 +966,10 @@ Status GraphConstructor::Convert() {
         // Note that input_already_exists can grow here
         AddControlDependencies(&imported_node_def, &input_already_exists);
       }
+      if (!opts_.default_device.empty() && imported_node_def.device().empty()) {
+        imported_node_def.set_device(opts_.default_device);
+      }
+
       node_def = &imported_node_def;
     } else {
       node_def = &original_node_def;
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index f6e41faf9c6b49485e54e1a1bdb33c33f30aa386..7af824a6f52db0da8f83c0cc9ee0b435202cdd0d 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -138,6 +138,9 @@ struct ImportGraphDefOptions {
   // with ops that are not defined in the binary calling ImportGraphDef.
   // Similar to the producer_op_list argument to import_graph_def in the
   // python API.
+
+  // Try to set default execution device for this grapth.
+  string default_device;
 };
 
 // Optional results that may be returned by ImportGraphDef.
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 3eef6bd2bd28d7f359f8893005db1fe76a26451c..1912f2fc96a4e214a283fc4c93f0bd7bf30b9437 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -3212,6 +3212,33 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ValidateColationConstraints) {
   TF_EXPECT_OK(ImportGraphDef(options, def, &graph_, nullptr));
 }
 
+TEST_F(GraphConstructorTest, ImportGraphDef_ValidateDefaultDevice) {
+  std::string gdef_ascii(
+      R"EOF(
+      node { name: 'test_input' op: 'TestInput' }
+      node { name: 'test_input_with_dev' op: 'TestInput' device: 'some dev'}
+      node { name: 'test_op' op: 'TestMul' input: [ 'test_input:0', 'test_input:1' ] }
+      node { name: 'test_op_with_dev' op: 'TestMul' input: [ 'test_input:0', 'test_input:1' ] device: 'some dev'}
+      )EOF");
+
+  GraphDef gdef;
+  ASSERT_TRUE(protobuf::TextFormat::ParseFromString(gdef_ascii, &gdef));
+
+  ImportGraphDefOptions options;
+  options.default_device = "/gpu:13";
+  ImportGraphDefResults res;
+
+  TF_ASSERT_OK(ImportGraphDef(options, gdef, &graph_, NULL, &res));
+  std::map<string, string> node2dev;
+  for (Node* n : graph_.nodes()) {
+    node2dev[n->name()] = n->requested_device();
+  }
+  EXPECT_EQ(node2dev["test_input"], "/gpu:13");
+  EXPECT_EQ(node2dev["test_op"], "/gpu:13");
+  EXPECT_EQ(node2dev["test_input_with_dev"], "some dev");
+  EXPECT_EQ(node2dev["test_op_with_dev"], "some dev");
+}
+
 TEST_F(GraphConstructorTest, ImportGraphDef_UnknownOps) {
   const string pb_ascii = "node { name: 'op_from_contrib' op: 'OpFromContrib'}";
   // Try load twice to check for two parts of the error message. We cannot check
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 1dbcebab598c7230008ab61e1094229bde76b757..9c640c42a5891b632e18517c848cc9a0c76a0f45 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"
@@ -1186,7 +1187,8 @@ Status Partition(const PartitionOptions& opts, Graph* g,
   for (auto& it : *partitions) {
     GraphDef* gdef = &it.second;
     *gdef->mutable_versions() = g->versions();
-    *gdef->mutable_library() = flib_def->ToProto();
+    // Prune unreachable functions from `flib_def` before adding them to `gdef`.
+    *gdef->mutable_library() = flib_def->ReachableDefinitions(*gdef).ToProto();
 
     // Traverse the graph to fill every send/recv op's incarnation
     // information.
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index f44ed47a6e94acdce66c36902cbcf2fdfb041447..29d8034d2a14b6fa2c49b5fa65cb409209b29944 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -470,13 +470,19 @@ TEST_F(GraphPartitionTest, Functions) {
   ConstructOp(in_.WithOpName("A2"), "XTimesTwo", {a1});
   ConstructOp(in_.WithOpName("B2"), "XTimesFour", {b1});
 
+  // The `Partition()` helper function uses the first letter of the op name ('A'
+  // or 'B') to choose a device for each node.
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
-  // Test that partition graphs inherit function library from original graph
+  // Test that partition graphs inherit function library from original graph.
   string a = "/job:a/replica:0/task:0/cpu:0";
   string b = "/job:a/replica:0/task:0/cpu:1";
-  ExpectFunctions(partitions_[a].library(), {"XTimesTwo", "XTimesFour"});
+
+  // Node "A2" is placed in part `a`, and uses only "XTimesTwo".
+  ExpectFunctions(partitions_[a].library(), {"XTimesTwo"});
+  // Node "B2" is placed in part `b`, and uses both "XTimesFour" directly,
+  // and "XTimesTwo" in the body of "XTimesFour".
   ExpectFunctions(partitions_[b].library(), {"XTimesTwo", "XTimesFour"});
 }
 
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index c8c2b225fea721bd19683fbdb805601bb9be494b..333c32567fc9b922951b558c86f29087da770894 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 
 #include <set>
+#include <unordered_map>
 #include <vector>
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
@@ -643,32 +644,92 @@ TEST_F(GraphTest, AddFunctionLibrary) {
             "because it already has gradient function 'Undefined'");
 }
 
-REGISTER_OP("Input").Output("o: float");
-REGISTER_OP("In2Out1").Input("a: float").Input("b: float").Output("o: float");
+TEST_F(GraphTest, BuildNodeNameIndex) {
+  FromGraphDef(
+      "node { name: 'A' op: 'OneOutput' }"
+      "node { name: 'B' op: 'OneInputTwoOutputs' input: [ 'A:0' ] }"
+      "node { name: 'C' op: 'NoOp' } ");
 
-static void BM_InEdgeIteration(int iters, int num_nodes) {
-  testing::StopTiming();
+  auto node_name_index = graph_.BuildNodeNameIndex();
+  EXPECT_EQ(node_name_index.size(), 5);
+
+  std::vector<string> node_names{"_SOURCE", "_SINK", "A", "B", "C"};
+  for (const string& node_name : node_names) {
+    EXPECT_NE(node_name_index.find(node_name), node_name_index.end());
+    EXPECT_EQ(node_name_index[node_name], FindNode(node_name));
+  }
+}
+
+REGISTER_OP("Input").Output("y: float");
+REGISTER_OP("In2Out1").Input("a: float").Input("b: float").Output("y: float");
+REGISTER_OP("In4Out1")
+    .Input("a: float")
+    .Input("b: float")
+    .Input("c: float")
+    .Input("d: float")
+    .Output("y: float");
+REGISTER_OP("In8Out1")
+    .Input("a: float")
+    .Input("b: float")
+    .Input("c: float")
+    .Input("d: float")
+    .Input("e: float")
+    .Input("f: float")
+    .Input("g: float")
+    .Input("h: float")
+    .Output("y: float");
+REGISTER_OP("In16Out1")
+    .Input("a: float")
+    .Input("b: float")
+    .Input("c: float")
+    .Input("d: float")
+    .Input("e: float")
+    .Input("f: float")
+    .Input("g: float")
+    .Input("h: float")
+    .Input("i: float")
+    .Input("j: float")
+    .Input("k: float")
+    .Input("l: float")
+    .Input("m: float")
+    .Input("n: float")
+    .Input("o: float")
+    .Input("p: float")
+    .Output("y: float");
+
+GraphDef CreateGraphDef(int num_nodes, int num_edges_per_node) {
+  const int kNumInNodes = 10 * num_edges_per_node;
   string s;
-  for (int in = 0; in < 10; in++) {
+  for (int in = 0; in < kNumInNodes; in++) {
     s += strings::Printf("node { name: 'in%04d' op: 'Input' }", in);
   }
   random::PhiloxRandom philox(301, 17);
   random::SimplePhilox rnd(&philox);
   for (int op = 0; op < num_nodes; op++) {
-    s += strings::Printf(
-        "node { name: 'op%04d' op: 'In2Out1' input: ['in%04d', 'in%04d' ] }",
-        op, rnd.Uniform(10), rnd.Uniform(10));
+    s += strings::Printf("node { name: 'op%05d' op: 'In%dOut1' input: [ ", op,
+                         num_edges_per_node);
+    for (int edge = 0; edge < num_edges_per_node - 1; ++edge) {
+      s += strings::Printf("'in%04d', ", rnd.Uniform(kNumInNodes));
+    }
+    s += strings::Printf("'in%04d' ] } ", rnd.Uniform(kNumInNodes));
   }
 
-  Graph graph(OpRegistry::Global());
   GraphDef graph_def;
   CHECK(protobuf::TextFormat::ParseFromString(s, &graph_def));
+  return graph_def;
+}
+
+static void BM_InEdgeIteration(int iters, int num_nodes,
+                               int num_edges_per_node) {
+  testing::StopTiming();
+  const GraphDef graph_def = CreateGraphDef(num_nodes, num_edges_per_node);
+  Graph graph(OpRegistry::Global());
   GraphConstructorOptions opts;
   TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, &graph));
 
   int64 sum = 0;
   testing::StartTiming();
-  for (int i = 0; i < iters; i += graph.num_node_ids()) {
+  for (int i = 0; i < iters; ++i) {
     for (const Node* node : graph.nodes()) {
       for (auto e : node->in_edges()) {
         sum += e->id();
@@ -676,8 +737,67 @@ static void BM_InEdgeIteration(int iters, int num_nodes) {
     }
   }
   VLOG(1) << sum;
+  testing::StopTiming();
+}
+BENCHMARK(BM_InEdgeIteration)->ArgPair(10, 2);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 6, 2);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 9, 2);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 12, 2);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 15, 2);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(10, 4);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 6, 4);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 9, 4);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 12, 4);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 15, 4);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(10, 8);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 6, 8);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 9, 8);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 12, 8);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 15, 8);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(10, 16);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 6, 16);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 9, 16);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 12, 16);
+BENCHMARK(BM_InEdgeIteration)->ArgPair(1 << 15, 16);
+
+static void BM_GraphCreation(int iters, int num_nodes, int num_edges_per_node) {
+  testing::StopTiming();
+  const GraphDef graph_def = CreateGraphDef(num_nodes, num_edges_per_node);
+  const auto registry = OpRegistry::Global();
+  GraphConstructorOptions opts;
+  // Warmup step.
+  Graph graph(registry);
+  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, &graph));
+  int64 sum = 0;
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    Graph graph(registry);
+    TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, &graph));
+    sum += graph.num_node_ids();
+  }
+  VLOG(1) << sum;
+  testing::StopTiming();
 }
-BENCHMARK(BM_InEdgeIteration)->Range(10, 100000);
+BENCHMARK(BM_GraphCreation)->ArgPair(10, 2);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 6, 2);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 9, 2);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 12, 2);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 15, 2);
+BENCHMARK(BM_GraphCreation)->ArgPair(10, 4);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 6, 4);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 9, 4);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 12, 4);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 15, 4);
+BENCHMARK(BM_GraphCreation)->ArgPair(10, 8);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 6, 8);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 9, 8);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 12, 8);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 15, 8);
+BENCHMARK(BM_GraphCreation)->ArgPair(10, 16);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 6, 16);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 9, 16);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 12, 16);
+BENCHMARK(BM_GraphCreation)->ArgPair(1 << 15, 16);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index bab1df87a4d3c62b8377363e1ea7a0af33434dc3..990b2fe9b04770dc875b949ec3e17c321fe018be 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -75,6 +75,8 @@ int inline GetTensorMetaDataIndex(int n, int total_tensors) {
 namespace mkl_op_registry {
 static const char* kMklOpLabel = "MklOp";
 static const char* kMklOpLabelPattern = "label='MklOp'";
+static const char* kMklQuantizedOpLabel = "QuantizedMklOp";
+static const char* kMklQuantizedOpLabelPattern = "label='QuantizedMklOp'";
 // Prefix that we add to Tensorflow op name to construct Mkl op name.
 static const char* const kMklOpPrefix = "_Mkl";
 
@@ -91,9 +93,30 @@ inline string GetMklOpName(const string& name) {
 // @return: true if opname is registered as Mkl op; false otherwise
 static inline bool IsMklOp(const string& op_name, DataType T) {
   string kernel = KernelsRegisteredForOp(op_name);
-  bool result =
-      kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
-  return result;
+
+  // Restrict quantized ops to QUINT8 and QINT8 for now
+  if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) {
+    return (T == DT_QUINT8 || T == DT_QINT8);
+  }
+  // Restrict regular ops to FLOAT
+  if (kernel.find(kMklOpLabelPattern) != string::npos) {
+    return (T == DT_FLOAT);
+  }
+  return false;
+}
+
+// TODO(mdfaijul): QuantizedConv2D is registered with input: QUINT8
+// filter:QINT8 for mkldnn integration. First a dummy kernel is created
+// and then it is replaced by an actual kernel.
+static inline bool IsMklOp(const string& op_name, DataType Tinput,
+                           DataType Tfilter) {
+  string kernel = KernelsRegisteredForOp(op_name);
+
+  // Restrict quantized ops to QUINT8 and QINT8 for now
+  if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) {
+    return (Tinput == DT_QUINT8 && Tfilter == DT_QINT8);
+  }
+  return false;
 }
 
 // Check whether opname with type T is registered as MKL-compliant and
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 7394b1cddfbc56c758e7853dea548429d8c25608..52b46600943b31f4d0205d0eb120cc282c78240f 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -22,2203 +22,31 @@ limitations under the License.
 #include <memory>
 #include <queue>
 #include <set>
+#include <stack>
+#include <tuple>
 #include <unordered_set>
 #include <utility>
 #include <vector>
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"
-#include "tensorflow/core/framework/node_def_util.h"
-#include "tensorflow/core/framework/tensor.pb.h"
-#include "tensorflow/core/graph/algorithm.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/node_builder.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
-#include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/util/util.h"
-
-#include "tensorflow/core/graph/mkl_graph_util.h"
-#include "tensorflow/core/graph/mkl_layout_pass.h"
-
-namespace tensorflow {
-
-#ifdef INTEL_MKL_ML_ONLY
-
-// This pass implements rewriting of graph to support following scenarios:
-// (A) Merging nodes in the graph
-// (B) Rewriting a node in the graph to a new node
-//     Rewrite happens under following 2 scenarios:
-//     1) Propagating Mkl layout as an additional output tensor
-//        (we will loosely call a tensor that carries Mkl layout as Mkl tensor
-//         henceforth.) from every Mkl supported NN layer.
-//     2) Context-based rewrite: This is needed in order to optimize
-//        gradient ops of Conv2D+AddBias. Gradient op of both the Conv2D and
-//        MatMul is BiasAddGrad, and we need to rewrite BiasAddGrad into
-//        Conv2D-specific BiasAddGrad, and MatMul-specific BiasAddGrad.
-//        This is context-specific optimization, where the context is the
-//        forward operator that the BiasAddGrad corresponds to.
-//
-// Example of A : Merging nodes in the graph
-// -----------------------------------------
-// Currently, we merge Conv2D+AddBias together. Consider Conv2D and BiasAdd as:
-//
-//           O = Conv2D(A, B)
-//           P = BiasAdd(O, C)
-//
-// We merge them into Conv2DWithBias as:
-//           P = _MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
-//
-// The meaning of A_m, B_m and C_m is explained in B.1.
-//
-// Merge rules:
-//  - The merge for Conv2D and BiasAdd happens when the output of Conv2D _only_
-//    goes to BiasAdd.
-//  - Also, the intersection of attributes of both the nodes must have same
-//    values.
-//  - Both the nodes must have been assigned to same device (if any).
-//
-// Example of B.1 : Rewriting nodes to Mkl nodes
-// ---------------------------------------------
-// Consider a Relu node. Current definition of Relu node looks like:
-//
-//           O = Relu(A)
-//
-// Relu has 1 input (A), and 1 output (O).
-//
-// This rewrite pass will generate a new graph node for Relu (new node is
-// called MklRelu) as:
-//
-//          O, O_m = MklRelu(A, A_m)
-//
-// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here input A is
-// same as input A of Relu; output O is same as output O of Relu. O_m is the
-// additional output tensor that will be set by MklRelu, and it represents
-// Mkl tensor corresponding to O -- in other words, O_m is some kind of
-// metadata for O. A_m is additional input of Relu, and it represents metadata
-// for A - as O_m is metadata for O, A_m is metadata for A. MklRelu receives
-// this metadata from previous node in the graph.
-//
-// When a previous node in the graph is an Mkl node, A_m will represent a valid
-// Mkl tensor. But when a previous node is not an Mkl node, A_m will represent
-// a dummy Mkl tensor.
-//
-// Rewriting rules:
-//  - Selection of a node for rewriting happens by registering the op type of
-//    the node with the rewriting pass. If the op type is not registered, then
-//    all nodes of this op type will not be rewritten.
-//  - Number of inputs after rewriting:
-//      Since for every input Tensorflow tensor, the rewritten node gets Mkl
-//      tensor(s), rewritten node gets 2*N inputs, where N is the number of
-//      inputs for the original node.
-//  - Number of outputs after rewriting:
-//      Since for every output Tensorflow tensor, the rewritten node generates
-//      Mkl tensor(s), the rewritten node generates 2*N outputs, where N is the
-//      number of outputs of the original node.
-//  - Ordering of Tensorflow tensors and Mkl tensors:
-//      Since every rewritten node generates twice the number of inputs and
-//      outputs, one could imagine various orderings among Tensorflow tensors
-//      and Mkl tensors. E.g., assume an op 'Conv2D' that takes (A, B) as
-//      inputs, then the new op '_MklConv2D' can take inputs A, B, A_m and B_m
-//      in A, A_m, B, B_m order or it can also take them in A, B, A_m, B_m
-//      order. Among N inputs one can get N! permutations.
-//
-//      So the question is: which order do we follow? We support 2 types of
-//      orderings: (1) interleaved, and (2) contiguous. Interleaved ordering
-//      follows an intuitive order where an Mkl tensor follows the
-//      corresponding Tensorflow tensor immediately. In the context of the
-//      above example, it will be: A, A_m, B, B_m. Note that the ordering rule
-//      applies to both the inputs and outputs. Contiguous ordering means
-//      all the Tensorflow tensors are contiguous followed by all the Mkl
-//      tensors. We use contiguous ordering as default.
-//
-// Graph rewrite algorithm:
-//      Algorithm: Graph Rewrite
-//      Input: Graph G, Names of the nodes to rewrite and their new names
-//      Output: Modified Graph G' if the nodes are modified, G otherwise.
-//      Start:
-//        N = Topological_Sort(G) // N is a set of nodes in toposort order.
-//        foreach node n in N
-//        do
-//          if (Is_MKL_Op(n))  // Can this node accept an Mkl layout as input.
-//          then
-//            E = set of <incoming edge and its src_output slot> of n
-//            E' = {}   // a new set of edges for rewritten node
-//            foreach <e,s> in E
-//            do
-//              E' U {<e,s>}  // First copy edge which generates Tensorflow
-//                            // tensor as it is
-//              m = Source node of edge e
-//              if Is_Rewritten(m)  // Did we rewrite this node in this pass?
-//              then
-//                E' U {<m,s+1>}    // If yes, then m will generate an Mkl
-//                                  // tensor as an additional output.
-//              else
-//                d = Generate_Dummy_Mkl_Tensor()  // If not, generate a dummy
-//                                                 // Mkl tensor.
-//                E' U {<d,0>}  // The dummy Mkl tensor has only 1 output slot.
-//              fi
-//            done
-//            n' = Build_New_Node(G,new_name,E')
-//            Mark_Rewritten(n')  // Mark the new node as being rewritten.
-//          fi
-//        done
-//
-//      Explanation:
-//        For graph rewrite, we visit nodes of the input graph in the
-//        topological sort order. With this ordering, we visit nodes in the
-//        top-to-bottom fashion. We need this order because while visiting a
-//        node we want that all of its input nodes are visited and rewritten if
-//        applicable. This is because if we need to rewrite a given node
-//        then all of its input nodes need to be fixed (in other words they
-//        cannot be deleted later.)
-//
-//        While visiting a node, we first check if the op type of the node is
-//        an Mkl op. If it is, then we rewrite that node after constructing
-//        new inputs to the node. If the op type of the node is not Mkl op,
-//        then we do not rewrite that node.
-//
-// Handling workspace propagation for certain ops:
-//
-//        Certain backward ops in MKL (MaxPool, LRN and BatchNorm) require
-//        passing of a workspace from their respective forward ops. Workspace
-//        tensors provide memory for storing results of intermediate operations
-//        which are helpful in backward propagation. TensorFlow does not have
-//        a notion of a workspace and as a result does not allow producing
-//        additional outputs from these forward ops. For these ops, we need
-//        to add 2 extra edges between forward ops and their corresponding
-//        backward ops - the first extra edge carries a workspace tensor and
-//        the second one carries an Mkl tensor for the workspace tensor.
-//
-//        Example:
-//
-//        Typical graph for MaxPool and its gradient looks like:
-//
-//        A = MaxPool(T)
-//        B = MaxPoolGrad(X, A, Y)
-//
-//        We will transform this graph to propagate the workspace as:
-//        (with the contiguous ordering)
-//
-//        A, W, A_m, W_m = MklMaxPool(T, T_m)
-//        B, B_m = MklMaxPoolGrad(X, A, Y, W, X_m, A_m, Y_m, W_m)
-//
-//        Here W is the workspace tensor. Transformed tensor names with the
-//        suffix _m are Mkl tensors, and this transformation has been done
-//        using the algorithm discussed earlier. The transformation for
-//        workspace propagation only adds extra outputs (W, W_m) for a forward
-//        op and connects them to the corresponding backward ops.
-//
-//        Terms:
-//
-//        Forward op name = name of the op in the forward pass
-//          where a workspace tensor originates (MaxPool in this example)
-//        Backward op name = name of the op in the backward pass that receives
-//          a workspace tensor from the forward op (MaxPoolGrad in the example)
-//        Slot = Position of the output or input slot that will be
-//               used by the workspace tensor (1 for MklMaxPool as W is the 2nd
-//               output of MaxPool (0 is 1st); 3 for MklMaxPoolGrad)
-//
-//        Question:
-//
-//        How do we associate a backward op to a forward op? There can be more
-//        than one op with the exact same name.
-//
-//        In this example, we associate MaxPoolGrad with MaxPool. But there
-//        could be more than one MaxPool ops. To solve this problem, we look
-//        for _direct_ edge between a forward op and a backward op (tensor A is
-//        flowing along this edge in the example).
-//
-//        How do we transform forward and backward ops when there is no direct
-//        edge between them? In such a case, we generate dummy tensors for
-//        workspace tensors. For the example, transformation of MaxPool will
-//        be exactly same as it would be when there is a direct edge between
-//        the forward and the backward op --- it is just that MaxPool won't
-//        generate any workspace tensor. For MaxPoolGrad, the transformation
-//        will also be same, but instead of connecting W and W_m with the
-//        outputs of MaxPool, we will produce dummy tensors for them, and we
-//        will set workspace_enabled attribute to false.
-//
-// Example of B.2 : Context-based node rewrite
-// -------------------------------------------
-// Consider BiasAddGrad op as:
-//
-//           O = _MklConv2D(A, B, C, A_m, B_m, C_m)
-//           P = BiasAddGrad(O)
-//
-// Then we rewrite it as:
-//
-//           P = Conv2DWithBiasBackpropBias(O, O_m)
-//
-// Rewrite of BiasAddGrad into Conv2DWithBiasBackpropBias takes place depending
-// on the matching 'context'. The term context is loosely related to which
-// forward op is _associated_ to BiasAddGrad. If it is _MklConv2DWithBias then
-// we consider it Conv2D context; if it is MatMul, then it is MatMul context.
-
-class MklLayoutRewritePass : public GraphOptimizationPass {
- public:
-  MklLayoutRewritePass() {
-    // NOTE: names are alphabetically sorted.
-    csinfo_.addn = "AddN";
-    csinfo_.avg_pool = "AvgPool";
-    csinfo_.avg_pool_grad = "AvgPoolGrad";
-    csinfo_.bias_add = "BiasAdd";
-    csinfo_.bias_add_grad = "BiasAddGrad";
-    csinfo_.concat = "Concat";
-    csinfo_.concatv2 = "ConcatV2";
-    csinfo_.conv2d = "Conv2D";
-    csinfo_.conv2d_grad_input = "Conv2DBackpropInput";
-    csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
-    csinfo_.fused_batch_norm = "FusedBatchNorm";
-    csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
-    csinfo_.identity = "Identity";
-    csinfo_.lrn = "LRN";
-    csinfo_.lrn_grad = "LRNGrad";
-    csinfo_.matmul = "MatMul";
-    csinfo_.max_pool = "MaxPool";
-    csinfo_.max_pool_grad = "MaxPoolGrad";
-    csinfo_.mkl_conv2d = "_MklConv2D";
-    csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput";
-    csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
-    csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
-    csinfo_.mkl_conv2d_with_bias_backprop_bias =
-        "_MklConv2DWithBiasBackpropBias";
-    csinfo_.relu = "Relu";
-    csinfo_.relu_grad = "ReluGrad";
-    csinfo_.reshape = "Reshape";
-    csinfo_.split = "Split";
-    // Element-wise ops. Ensure you also add any new ops to IsOpElementWise
-    // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
-    // MklInputConversion op is added before it.
-    csinfo_.add = "Add";
-    csinfo_.maximum = "Maximum";
-    csinfo_.mul = "Mul";
-    csinfo_.squared_difference = "SquaredDifference";
-    csinfo_.sub = "Sub";
-    // End - element-wise ops. See note above.
-
-    // NOTE: names are alphabetically sorted.
-    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
-                      CopyAttrsAddN, AddNRewrite, nullptr});
-    rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.avg_pool,
-                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
-                      CopyAttrsPooling, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.avg_pool_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad),
-                      CopyAttrsPooling, AlwaysRewrite, nullptr});
-    // BiasAddGrad gets written into Conv2DWithBiasBackpropBias depending
-    // on if context contains Conv2D.
-    rinfo_.push_back({csinfo_.bias_add_grad,
-                      csinfo_.mkl_conv2d_with_bias_backprop_bias,
-                      CopyAttrsBiasAddGrad, ContextMatchRewrite,
-                      &biasaddgrad_conv2dwithbias_context_});
-    // BiasAddGrad gets written into BiasAddGrad depending on if context
-    // contains MatMul.
-    rinfo_.push_back({csinfo_.bias_add_grad, csinfo_.matmul,
-                      CopyAttrsBiasAddGrad, ContextMatchRewrite,
-                      &biasaddgrad_matmul_context_});
-    rinfo_.push_back({csinfo_.concat,
-                      mkl_op_registry::GetMklOpName(csinfo_.concat),
-                      CopyAttrsConcat, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.concatv2,
-                      mkl_op_registry::GetMklOpName(csinfo_.concatv2),
-                      CopyAttrsConcatV2, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.conv2d,
-                      mkl_op_registry::GetMklOpName(csinfo_.conv2d),
-                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.conv2d_grad_filter,
-                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
-                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.conv2d_grad_input,
-                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
-                      CopyAttrsConv2D, AlwaysRewrite, nullptr});
-
-    rinfo_.push_back({csinfo_.fused_batch_norm,
-                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
-                      CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
-    rinfo_.push_back(
-        {csinfo_.fused_batch_norm_grad,
-         mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
-         CopyAttrsFusedBatchNorm, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.identity,
-                      mkl_op_registry::GetMklOpName(csinfo_.identity),
-                      CopyAttrsIdentity, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn),
-                      CopyAttrsLRN, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.lrn_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
-                      CopyAttrsLRN, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.max_pool,
-                      mkl_op_registry::GetMklOpName(csinfo_.max_pool),
-                      CopyAttrsPooling, NonDepthBatchWisePoolRewrite, nullptr});
-    rinfo_.push_back({csinfo_.max_pool_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
-                      CopyAttrsPooling, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.maximum,
-                      mkl_op_registry::GetMklOpName(csinfo_.maximum),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.relu_grad,
-                      mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.reshape,
-                      mkl_op_registry::GetMklOpName(csinfo_.reshape),
-                      CopyAttrsReshape, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.squared_difference,
-                      mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-    rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub),
-                      CopyAttrsDataType, AlwaysRewrite, nullptr});
-
-    // Add info about which ops to add workspace edge to and the slots.
-    wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
-    wsinfo_.push_back({csinfo_.max_pool, csinfo_.max_pool_grad, 0, 1, 1, 3});
-
-    // Add a rule for merging nodes
-    minfo_.push_back({csinfo_.mkl_conv2d, csinfo_.bias_add, 0,
-                      csinfo_.mkl_conv2d_with_bias});
-
-    biasaddgrad_matmul_context_ = {csinfo_.bias_add_grad, csinfo_.matmul,
-                                   IsBiasAddGradInMatMulContext};
-
-    biasaddgrad_conv2dwithbias_context_ = {
-        csinfo_.bias_add_grad, csinfo_.mkl_conv2d_with_bias,
-        IsBiasAddGradInConv2DWithBiasContext};
-
-    cinfo_.push_back(&biasaddgrad_matmul_context_);
-    cinfo_.push_back(&biasaddgrad_conv2dwithbias_context_);
-  }
-
-  // Standard interface to run pass
-  Status Run(const GraphOptimizationPassOptions& options);
-
-  // Helper function which does most of heavy lifting for rewriting
-  // Mkl nodes to propagate Mkl tensor as additional output
-  //
-  // Extracts common functionality between Run public interface and
-  // test interface.
-  //
-  // @return true, if and only if graph is mutated; false otherwise.
-  bool RunPass(std::unique_ptr<Graph>* g);
-
-  /// Structure to specify the context information used in a node rewrite rule
-  typedef struct {
-    string node;  // Name of the node to be rewritten
-    string fwd;   // Name of the node in the forward pass that this node
-                  // corresponds to
-    std::function<bool(const Node*, const Node**, void* c)> context_match_fn;
-  } ContextInfo;
-
-  /// Structure to specify the name of an original node, its new name after
-  /// rewrite, the number of inputs to the original node, the function to
-  /// be used to copy attributes for the op, and the rule (if any) which
-  /// must hold for rewriting the node
-  typedef struct {
-    string name;      // Original name of op of the node in the graph
-    string new_name;  // New name of the op of the node in the graph
-    // A function handler to copy attributes from an old node to a new node.
-    std::function<void(const Node*, NodeBuilder*)> copy_attrs;
-    // A rule under which to rewrite this node
-    std::function<bool(const Node*, const ContextInfo* c)> rewrite_rule;
-    // ContextInfo, if any, to be used for rewrite
-    ContextInfo* context;
-  } RewriteInfo;
-
-  /// Structure to specify a forward op, a backward op, and the slot numbers
-  /// in the forward and backward ops where we will add a workspace edge.
-  typedef struct {
-    string fwd_op;    // Name of a forward op in the graph
-    string bwd_op;    // Name of a backward op in the graph
-    int fwd_slot;     // Output slot in the forward op node where actual
-                      // output tensor resides
-    int bwd_slot;     // Input slot in the backward op node where actual
-                      // input tensor resides
-    int ws_fwd_slot;  // Output slot in the forward op node where workspace
-                      // edge is added
-    int ws_bwd_slot;  // Input slot in the backward op node where workspace
-                      // edge is added
-  } WorkSpaceInfo;
-
-  /// Structure to specify information used in node merge
-  typedef struct {
-    string pred;      // Predecessor node string
-    string succ;      // Successor node string
-    int op;           // The operand no the predecessor node corresponds
-                      // to the successor node
-    string new_node;  // Name of the node after merge
-  } MergeInfo;
-
-  /// Structure to store all constant strings
-  /// NOTE: names are alphabetically sorted.
-  typedef struct {
-    string addn;
-    string add;
-    string avg_pool;
-    string avg_pool_grad;
-    string bias_add;
-    string bias_add_grad;
-    string concat;
-    string concatv2;
-    string conv2d;
-    string conv2d_grad_input;
-    string conv2d_grad_filter;
-    string fused_batch_norm;
-    string fused_batch_norm_grad;
-    string identity;
-    string lrn;
-    string lrn_grad;
-    string matmul;
-    string max_pool;
-    string max_pool_grad;
-    string maximum;
-    string mkl_conv2d;
-    string mkl_conv2d_grad_input;
-    string mkl_conv2d_grad_filter;
-    string mkl_conv2d_with_bias;
-    string mkl_conv2d_with_bias_backprop_bias;
-    string mul;
-    string relu;
-    string relu_grad;
-    string reshape;
-    string split;
-    string squared_difference;
-    string sub;
-  } ConstStringsInfo;
-
- private:
-  /// Maintain info about nodes to rewrite
-  std::vector<RewriteInfo> rinfo_;
-
-  /// Maintain info about nodes to add workspace edge
-  std::vector<WorkSpaceInfo> wsinfo_;
-
-  /// Maintain info about nodes to be merged
-  std::vector<MergeInfo> minfo_;
-
-  /// Maintain info about nodes to rewrite
-  static std::vector<ContextInfo*> cinfo_;
-
-  /// Maintain structure of constant strings
-  static ConstStringsInfo csinfo_;
-
-  /// Context variables used in referencing rules
-  static ContextInfo biasaddgrad_matmul_context_;
-  static ContextInfo biasaddgrad_conv2dwithbias_context_;
-
- private:
-  // Is OpDef::ArgDef a list type? It could be N * T or list(type).
-  // Refer to opdef.proto for details of list type.
-  inline bool ArgIsList(const OpDef::ArgDef& arg) const {
-    return !arg.type_list_attr().empty() || !arg.number_attr().empty();
-  }
-
-  // Get length of a list in 'n' if 'arg' is of list type. Refer to
-  // description of ArgIsList for definition of list type.
-  inline int GetTensorListLength(const OpDef::ArgDef& arg, Node* n) {
-    CHECK_EQ(ArgIsList(arg), true);
-    int N = 0;
-    const string attr_name = !arg.type_list_attr().empty()
-                                 ? arg.type_list_attr()
-                                 : arg.number_attr();
-    if (!arg.type_list_attr().empty()) {
-      std::vector<DataType> value;
-      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &value));
-      N = value.size();
-    } else {
-      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &N));
-    }
-    return N;
-  }
-
-  // Can op represented by node 'n' run on DEVICE_CPU?
-  // Op can run on CPU with MKL if the runtime assigned device or the
-  // user requested device contains device CPU, or both are empty.
-  bool CanOpRunOnCPUDevice(const Node* n) {
-    bool result = true;
-    string reason;
-
-    // Substring that should be checked for in device name for CPU device.
-    const char* const kCPUDeviceSubStr = "CPU";
-
-    // If Op has been specifically assigned to a non-CPU device, then No.
-    if (!n->assigned_device_name().empty() &&
-       !str_util::StrContains(n->assigned_device_name(), kCPUDeviceSubStr)) {
-      result = false;
-      reason = "Op has been assigned a runtime device that is not CPU.";
-    }
-
-    // If user has specifically assigned this op to a non-CPU device, then No.
-    if (!n->def().device().empty() &&
-       !str_util::StrContains(n->def().device(), kCPUDeviceSubStr)) {
-      result = false;
-      reason = "User has assigned a device that is not CPU.";
-    }
-
-    if (result == false) {
-      VLOG(1) << "MklLayoutRewritePass: Skipping rewriting of the node "
-              << n->type_string() << ", reason: " << reason;
-    }
-
-    // Otherwise Yes.
-    return result;
-  }
-
-  // Return a node that can be merged with input node 'n'
-  //
-  // @return pointer to the node if we can find such a
-  // node. Otherwise, it returns nullptr.
-  Node* CheckForNodeMerge(const Node* n) const;
-
-  // Merge predecessor node with its successor.
-  // Currently, we merge Conv2D with BiasAdd only.
-  //
-  // Input nodes succ and pred may be deleted if the call to
-  // this function is successful. Attempt to use the pointers
-  // after the call to function may result in undefined behaviors.
-  //
-  // @input g - input graph, succ - successor node, pred - predecessor node
-  // @return Status::OK(), if merging is successful and supported.
-  //         Returns appropriate Status error code otherwise.
-  //         Graph is updated in case nodes are merged. Otherwise, it is
-  //         not updated.
-  Status MergeNode(std::unique_ptr<Graph>* g, Node* succ, Node* pred);
-
-  // Check if the node 'n' has any applicable rewrite rule
-  // We check for 2 scenarios for rewrite.
-  //
-  // @return RewriteInfo* for the applicable rewrite rule
-  const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
-
-  // Default rewrite rule to be used in scenario 1 for rewrite.
-  // @return - true (since we want to always rewrite)
-  static bool AlwaysRewrite(const Node* n, const ContextInfo* c = nullptr) {
-    return true;
-  }
-
-  // Check if we are performing pooling on depth or batch. If it is, then we
-  // do not rewrite MaxPool node to Mkl version.
-  // @return - true (if it is not a depth/batch wise pooling case);
-  //           false otherwise.
-  static bool NonDepthBatchWisePoolRewrite(const Node* n,
-                                           const ContextInfo* c) {
-    CHECK_NOTNULL(n);
-
-    string data_format_str;
-    TensorFormat data_format;
-    std::vector<int32> ksize, strides;
-    CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
-    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(), true);
-    CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
-
-    // Condition that specifies non-batch-wise and non-depth-wise pooling.
-    if (GetTensorDim(ksize, data_format, 'N') == 1 &&
-        GetTensorDim(strides, data_format, 'N') == 1 &&
-        GetTensorDim(ksize, data_format, 'C') == 1 &&
-        GetTensorDim(strides, data_format, 'C') == 1) {
-      return true;
-    }
-
-    return false;
-  }
-
-  static bool AddNRewrite(const Node* n, const ContextInfo* c) {
-    CHECK_NOTNULL(n);
-
-    int num;
-    CHECK_EQ(GetNodeAttr(n->def(), "N", &num).ok(), true);
-
-    // Condition that specifies non-batch-wise and non-depth-wise pooling.
-    if (num == 2) {
-      return true;
-    }
-
-    return false;
-  }
-  // Is BiasAddGrad node in 'n' is associated with Conv2DWithBias node
-  // specified in contextinfo 'ci'. Function updates fwd_node to point
-  // to Conv2DWithBias node if 'n' is associated with Conv2DWithBias.
-  //
-  // Association checks for one of the following graphs:
-  //
-  // Graph A:
-  //
-  // _ = Conv2DWithBias(F, I, _)
-  // ..
-  // _ = Conv2DBackpropFilter(F, _, G)
-  // _ = Conv2DBackpropInput(_, I, G)
-  // _ = BiasAddGrad(G)
-  //
-  // OR
-  //
-  // Graph B:
-  //
-  // _ = Conv2DWithBias(F, _, _)
-  // ..
-  // _ = Conv2DBackpropFilter(F, _, G)
-  // _ = BiasAddGrad(G)
-  //
-  // Here F, G, and I are graph nodes; _ represents graph nodes that we
-  // don't care here.
-  //
-  // @return - true (if BiasAddGrad is associated with Conv2DWithBias);
-  //           false otherwise.
-  static bool IsBiasAddGradInConv2DWithBiasContext(const Node* n,
-                                                   const Node** fwd_node,
-                                                   void* ci) {
-    CHECK_NOTNULL(n);
-    CHECK_NOTNULL(fwd_node);
-    CHECK_NOTNULL(ci);
-    *fwd_node = nullptr;
-
-    CHECK_EQ(n->type_string(), csinfo_.bias_add_grad);
-
-    // Get the only 1 input of BiasAddGrad.
-    CHECK_EQ(n->num_inputs(), 1);
-    const Node* bias_add_grad_inp = nullptr;
-    TF_CHECK_OK(n->input_node(0, &bias_add_grad_inp));
-    CHECK_NOTNULL(bias_add_grad_inp);
-
-    // Check if this input also goes to BackpropFilter and BackpropInput
-    // as 3rd input.
-    bool found_backprop_input = false;
-    bool found_backprop_filter = false;
-    Node* backprop_filter_node = nullptr;
-    Node* backprop_input_node = nullptr;
-
-    for (const Edge* e : bias_add_grad_inp->out_edges()) {
-      Node* third_input = nullptr;
-      if (e->dst()->type_string() == csinfo_.conv2d_grad_input ||
-          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_input) {
-        // Third input (index 2) of BackpropInput
-        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
-        // Third input (index 2) of BackpropInput must be same as the input
-        // of BiasAddGrad.
-        if (third_input == bias_add_grad_inp) {
-          found_backprop_input = true;
-          backprop_input_node = e->dst();
-        }
-      }
-
-      if (e->dst()->type_string() == csinfo_.conv2d_grad_filter ||
-          e->dst()->type_string() == csinfo_.mkl_conv2d_grad_filter) {
-        // Third input (index 2) of BackpropFilter
-        TF_CHECK_OK(e->dst()->input_node(2, &third_input));
-        // Third input (index 2) of BackpropFilter must be same as the input
-        // of BiasAddGrad.
-        if (third_input == bias_add_grad_inp) {
-          found_backprop_filter = true;
-          backprop_filter_node = e->dst();
-        }
-      }
-
-      // If we found both the nodes, then we can stop the search.
-      if (found_backprop_input && found_backprop_filter) {
-        break;
-      }
-    }
-
-    // If BackpropFilter node is not found, then this is not
-    // Conv2DWithBias context. For 2nd graph in the example above, only
-    // BackpropFilter would be present.
-    if (!found_backprop_filter) {
-      return false;
-    }
-
-    // Otherwise, we found the nodes.
-    CHECK_NOTNULL(backprop_filter_node);
-    if (found_backprop_input) {
-      CHECK_NOTNULL(backprop_input_node);
-    }
-
-    // Now that we confirmed that this is Conv2DWithBias context, we need to
-    // get access to the forward node (Conv2DWithBias). 2nd input of
-    // Conv2DWithBias is same as the 2nd input of Conv2DBackpropInput; 1st
-    // input of Conv2DWithBias is same as the 1st input of Conv2DBackpropFilter
-    // (This comes from definition of gradient computation for Conv2D).
-    if (found_backprop_input) {
-      // Graph A in the example.
-      Node* second_inp_of_input = nullptr;
-      Node* first_inp_of_filter = nullptr;
-      TF_CHECK_OK(backprop_input_node->input_node(1, &second_inp_of_input));
-      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
-      CHECK_NOTNULL(second_inp_of_input);
-      CHECK_NOTNULL(first_inp_of_filter);
-
-      // Now we need to find out Conv2DWithBias node from these input nodes.
-      // Conv2DWithBias node is the node that accepts both the nodes
-      // second_inp_of_input and first_inp_of_filter in 2nd and 1st input slots.
-      for (const Edge* fe : first_inp_of_filter->out_edges()) {
-        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
-            fe->dst_input() == 0) {
-          for (const Edge* ie : second_inp_of_input->out_edges()) {
-            if (ie->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
-                ie->dst_input() == 1 && fe->dst() == ie->dst()) {
-              VLOG(1) << "MklLayoutRewritePass: found "
-                      << fe->dst()->DebugString()
-                      << " as the forward node for matching context, backward"
-                      << " node is: " << n->DebugString();
-              *fwd_node = fe->dst();
-              return true;
-            }
-          }
-        }
-      }
-    } else {
-      // We did not find BackpropInput, so we work with BackpropFilter only.
-      // Graph B in the example.
-      Node* first_inp_of_filter = nullptr;
-      TF_CHECK_OK(backprop_filter_node->input_node(0, &first_inp_of_filter));
-      CHECK_NOTNULL(first_inp_of_filter);
-
-      // Now we need to find out Conv2DWithBias node from first input of
-      // BackpropFIlter. Conv2DWithBias node is the node that accepts
-      // first_inp_of_filter in 1st input slot.
-      for (const Edge* fe : first_inp_of_filter->out_edges()) {
-        if (fe->dst()->type_string() == csinfo_.mkl_conv2d_with_bias &&
-            fe->dst_input() == 0) {
-          VLOG(1) << "MklLayoutRewritePass: found " << fe->dst()->DebugString()
-                  << " as the forward node for matching context, backward"
-                  << " node is: " << n->DebugString();
-          *fwd_node = fe->dst();
-          return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
-  // Is BiasAddGrad node in 'n' is associated with MatMul node
-  // specified in contextinfo 'ci'. Function does not update fwd_node.
-  //
-  // @return - true (if BiasAddGrad is associated with MatMul);
-  //           false otherwise.
-  static bool IsBiasAddGradInMatMulContext(const Node* n, const Node** fwd_node,
-                                           void* ci) {
-    return (!IsBiasAddGradInConv2DWithBiasContext(n, fwd_node, ci));
-  }
-
-  // Rewrite rule that uses context-information for matching,
-  // used in scenario 2.
-  //
-  // @input - Node 'n' for which to search for matching context
-  // @input - The context 'c' under which to rewrite
-  // @return - true if we can rewrite node under context 'c';
-  //           false otherwise.
-  static bool ContextMatchRewrite(const Node* n, const ContextInfo* c);
-
-  // Helper function that searches the matching contextinfo for the node.
-  //
-  // @input n - Node (gradient op) whose contextinfo is to be searched,
-  //        fwd_node - pointer to node from the forward pass that this node
-  //        belongs to. fwd_node cannot be NULL.
-  // @return Matching contextinfo in case a match is found; null otherwise.
-  //         Also updates *fwd_node with pointer to forward node that this
-  //         context matches.
-  static const ContextInfo* SearchMatchingContext(const Node* n,
-                                                  const Node** fwd_node);
-
-  // Rewrites input node to a new node specified by its matching rewrite info.
-  //
-  // Method first searches matching rewrite info for input node and then
-  // uses that info to rewrite.
-  //
-  // Input node may be deleted in case of rewrite. Attempt to use the node
-  // after the call can result in undefined behaviors.
-  //
-  // @input  g - input graph, n - Node to be rewritten,
-  //         ri - matching rewriteinfo
-  // @return Status::OK(), if the input node is rewritten;
-  //         Returns appropriate Status error code otherwise.
-  //         Graph is updated in case the input node is rewritten.
-  //         Otherwise, it is not updated.
-  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const RewriteInfo* ri);
-
-  // Get nodes that will feed a list of TF tensors to the new
-  // node that we are constructing.
-  //
-  // @input g - input graph,
-  // @input inputs - inputs to old node that we are using for constructing
-  //                 new inputs,
-  // @input input_idx - the index in the 'inputs' vector pointing to the
-  //                    current input that we have processed so far
-  // @output input_idx - index will be incremented by the number of nodes
-  //                     from 'inputs' that are processed
-  // @input list_length - The expected length of list of TF tensors
-  // @output output_nodes - the list of new nodes creating TF tensors
-  //
-  // @return None
-  void GetNodesProducingTFTensorList(
-      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-      int* input_idx, int list_length,
-      std::vector<NodeBuilder::NodeOut>* output_nodes);
-
-  // Get nodes that will feed a list of Mkl tensors to the new
-  // node that we are constructing.
-  //
-  // @input g - input graph,
-  // @input orig_node - Original node that we are rewriting
-  // @input inputs - inputs to old node that we are using for constructing
-  //                 new inputs,
-  // @input input_idx - the index in the 'inputs' vector pointing to the
-  //                    current input that we have processed so far
-  // @output input_idx - index will be incremented by the number of nodes
-  //                     from 'inputs' that are processed
-  // @input list_length - The expected length of list of Mkl tensors
-  // @output output_nodes - the list of new nodes creating Mkl tensors
-  //
-  // @return None
-  void GetNodesProducingMklTensorList(
-      std::unique_ptr<Graph>* g, Node* orig_node,
-      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-      int* input_idx, int list_length,
-      std::vector<NodeBuilder::NodeOut>* output_nodes);
-
-  // Get a node that will feed an Mkl tensor to the new
-  // node that we are constructing. The output node could be (1) 'n'
-  // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
-  // if 'n' is not an Mkl layer.
-  //
-  // @input g - input graph,
-  // @input orig_node - Original node that we are rewriting,
-  // @input n - Node based on which we are creating Mkl node,
-  // @input n_output_slot - the output slot of node 'n'
-  //            which is feeding to the node that we are constructing
-  // @output mkl_node - the new node that will feed Mkl tensor
-  // @output mkl_node_output_slot - the slot number of mkl_node that
-  //                                will feed the tensor
-  // @return None
-  void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
-                                 Node* n, int n_output_slot, Node** mkl_node,
-                                 int* mkl_node_output_slot);
-
-  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
-  // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
-  // set up in contiguous fashion. 'workspace_tensors' carry graph nodes
-  // producing workspace edges if 'are_workspace_tensors_available' is true.
-  // Otherwise, 'workspace_tensors' is empty vector.
-  //
-  // For details, refer to 'Ordering of inputs after rewriting' section in the
-  // documentation above.
-  //
-  // Returns Status::OK() if setting up inputs is successful, otherwise
-  // returns appropriate status code.
-  int SetUpContiguousInputs(
-      std::unique_ptr<Graph>* g,
-      const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
-      NodeBuilder* nb, Node* old_node,
-      std::vector<NodeBuilder::NodeOut>* workspace_tensors,
-      bool are_workspace_tensors_available);
-
-  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
-  // in graph 'g'. Original node is input in 'orig_node'.
-  //
-  // For details, refer to 'Ordering of Tensorflow tensors and Mkl tensors'
-  // section in the documentation above.
-  //
-  // Returns Status::OK() if setting up inputs is successful, otherwise
-  // returns appropriate status code.
-  Status SetUpInputs(std::unique_ptr<Graph>* g,
-                     const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
-                     NodeBuilder* nb, Node* orig_node);
-
-  // Add workspace edge on the input or output side of Node 'orig_node' by using
-  // NodeBuilder 'nb' for the new node provided. If 'orig_node' does not dictate
-  // adding workspace edge then do not add it. Workspace Tensorflow and Mkl
-  // tensors, if they need to be added, will be set into these tensors.
-  // If we set workspace tensors, then are_ws_tensors_added should be true.
-  void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orig_node,
-                                NodeBuilder* nb,
-                                std::vector<NodeBuilder::NodeOut>* ws_tensors,
-                                bool* are_ws_tensors_added);
-
-  // Functions specific to operators to copy attributes
-  // We need operator-specific function to copy attributes because the framework
-  // does not provide any generic function for it.
-  // NOTE: names are alphabetically sorted.
-  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsIdentity(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
-
-  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
-  // using node for original node 'orig_node' and return it in '*out'.
-  // TODO(nhasabni) We should move this to mkl_util.h
-  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
-                             Node* orig_node);
-  void GetDummyWorkspaceTensorNode(std::unique_ptr<Graph>* g, Node** out,
-                                   Node* orig_node);
-};
-
-MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
-MklLayoutRewritePass::ContextInfo
-    MklLayoutRewritePass::biasaddgrad_conv2dwithbias_context_;
-MklLayoutRewritePass::ContextInfo
-    MklLayoutRewritePass::biasaddgrad_matmul_context_;
-std::vector<MklLayoutRewritePass::ContextInfo*> MklLayoutRewritePass::cinfo_;
-
-// We register Mkl rewrite pass for phase 1 in post partitioning group.
-// We register it here so that we get a complete picture of all users of Mkl
-// nodes. Do not change the ordering of the Mkl passes.
-const OptimizationPassRegistry::Grouping kMklLayoutRewritePassGroup =
-    OptimizationPassRegistry::POST_PARTITIONING;
-#ifdef ENABLE_MKL
-REGISTER_OPTIMIZATION(kMklLayoutRewritePassGroup, 1, MklLayoutRewritePass);
-#endif  // ENABLE_MKL
-
-//////////////////////////////////////////////////////////////////////////
-//           Helper functions for creating new node
-//////////////////////////////////////////////////////////////////////////
-
-static void FillInputs(const Node* n,
-                       gtl::InlinedVector<Node*, 4>* control_edges,
-                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
-  control_edges->clear();
-  for (const Edge* e : n->in_edges()) {
-    if (e->IsControlEdge()) {
-      control_edges->push_back(e->src());
-    } else {
-      (*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output());
-    }
-  }
-  std::sort(control_edges->begin(), control_edges->end());
-  if (n->op_def().is_commutative()) {
-    // For commutative inputs, we sort the input by the input Node*
-    // to get a canonical ordering (so that add(a,b) and add(b, a) will
-    // hash to the same value if is_commutative is true for 'add').
-    std::sort(in->begin(), in->end());
-  }
-}
-
-void MklLayoutRewritePass::GetNodesProducingTFTensorList(
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
-    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
-  CHECK_LT(*input_idx, inputs.size());
-  CHECK_GT(list_length, 0);
-  CHECK_NOTNULL(output_nodes);
-  output_nodes->reserve(list_length);
-
-  while (list_length != 0) {
-    CHECK_GT(list_length, 0);
-    CHECK_LT(*input_idx, inputs.size());
-    Node* n = inputs[*input_idx].first;
-    int slot = inputs[*input_idx].second;
-    // If input node 'n' is just producing a single tensor at
-    // output slot 'slot' then we just add that single node.
-    output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
-    (*input_idx)++;
-    list_length--;
-  }
-}
-
-// TODO(nhasabni) We should move this to mkl_util.h.
-void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
-                                                 Node** out, Node* orig_node) {
-  // We use a tensor of shape {8} and value 0,0,0,0,0,0,0,0 to represent
-  // dummy Mkl tensor. 8 = 2*size_t.
-  const DataType dt = DataTypeToEnum<uint8>::v();
-  TensorProto proto;
-  proto.set_dtype(dt);
-  uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
-  proto.set_tensor_content(string(reinterpret_cast<const char*>(zero), 8));
-  TensorShape dummy_shape({8});
-  dummy_shape.AsProto(proto.mutable_tensor_shape());
-  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                  .Attr("value", proto)
-                  .Attr("dtype", dt)
-                  .Device(orig_node->def().device())  // We place this node on
-                                                      // the same device as the
-                                                      // device of the original
-                                                      // node.
-                  .Finalize(&**g, out));
-  CHECK_NOTNULL(*out); // Make sure we got a valid object before using it
-
-  // If number of inputs to the original node is > 0, then we add
-  // control dependency between 1st input (index 0) of the original node and
-  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
-  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
-  // rewritten node. Adding control edge between 1st input of the original node
-  // and the dummy Mkl node ensures that the dummy node is in the same frame
-  // as the original node. Choosing 1st input is not necessary - any input of
-  // the original node is fine because all the inputs of a node are always in
-  // the same frame.
-  if (orig_node->num_inputs() > 0) {
-    Node* orig_input0 = nullptr;
-    TF_CHECK_OK(
-        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
-    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
-  }
-
-  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
-}
-
-void MklLayoutRewritePass::GetNodesProducingMklTensorList(
-    std::unique_ptr<Graph>* g, Node* orig_node,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
-    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
-  CHECK_LT(*input_idx, inputs.size());
-  CHECK_GT(list_length, 0);
-  CHECK_NOTNULL(output_nodes);
-  output_nodes->reserve(list_length);
-
-  while (list_length != 0) {
-    CHECK_GT(list_length, 0);
-    CHECK_LT(*input_idx, inputs.size());
-    Node* n = inputs[*input_idx].first;
-    int slot = inputs[*input_idx].second;
-    // If 'n' is producing a single tensor, then create a single Mkl tensor
-    // node.
-    Node* mkl_node = nullptr;
-    int mkl_node_output_slot = 0;
-    GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
-                              &mkl_node_output_slot);
-    output_nodes->push_back(
-        NodeBuilder::NodeOut(mkl_node, mkl_node_output_slot));
-    (*input_idx)++;
-    list_length--;
-  }
-}
-
-// Get an input node that will feed Mkl tensor to the new
-// node that we are constructing. An input node could be (1) 'n'
-// if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
-// if 'n' is not an Mkl layer.
-void MklLayoutRewritePass::GetNodeProducingMklTensor(
-    std::unique_ptr<Graph>* g, Node* orig_node, Node* n, int n_output_slot,
-    Node** mkl_node, int* mkl_node_output_slot) {
-  CHECK_NOTNULL(n);
-  CHECK_NOTNULL(mkl_node);
-  CHECK_NOTNULL(mkl_node_output_slot);
-
-  // If this is an MKL op, then it will create extra output for MKL layout.
-  DataType T;
-  if (GetNodeAttr(n->def(), "T", &T).ok() &&
-      mkl_op_registry::IsMklOp(n->type_string(), T)) {
-    // If this is an MKL op, then it will generate an edge that will receive
-    // Mkl tensor from a node.
-    // output slot number for Mkl tensor would be N+slot number of TensorFlow
-    // tensor, where N is total number of TensorFlow tensors.
-    *mkl_node = n;
-    *mkl_node_output_slot =
-        GetTensorMetaDataIndex(n_output_slot, n->num_outputs());
-  } else {
-    // If we have not visited the node and rewritten it, then we need
-    // to create a dummy node that will feed a dummy Mkl tensor to this node.
-    // DummyMklTensor node has no input and generates only 1 output
-    // (dummy Mkl tensor) as output slot number 0.
-    GetDummyMklTensorNode(g, mkl_node, orig_node);
-    CHECK_NOTNULL(*mkl_node);
-    *mkl_node_output_slot = 0;
-  }
-}
-
-int MklLayoutRewritePass::SetUpContiguousInputs(
-    std::unique_ptr<Graph>* g,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
-    NodeBuilder* nb, Node* old_node,
-    std::vector<NodeBuilder::NodeOut>* workspace_tensors,
-    bool are_workspace_tensors_available) {
-  CHECK_NOTNULL(workspace_tensors);
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-
-  // TODO(nhasabni): Temporary solution to connect filter input of
-  // BackpropInput with the converted filter from Conv2D.
-  bool do_connect_conv2d_backprop_input_filter = false;
-  Node* conv2d_node = nullptr;
-  // Filter node is 2nd input (slot index 1) of Conv2D.
-  int kConv2DFilterInputSlotIdx = 1;
-  int kConv2DBackpropInputFilterInputSlotIdx = 1;
-  int kConv2DFilterOutputSlotIdx = 1;
-  if (old_node->type_string() == csinfo_.conv2d_grad_input) {
-    // We need to find Conv2D node from Conv2DBackpropInput.
-    // For that let's first find filter node that is 2nd input (slot 1)
-    // of BackpropInput.
-    Node* filter_node = nullptr;
-    TF_CHECK_OK(old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx,
-                                     &filter_node));
-    CHECK_NOTNULL(filter_node);
-
-    // Now check which nodes receive from filter_node. Filter feeds as
-    // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
-    for (const Edge* e : filter_node->out_edges()) {
-      if (e->dst()->type_string() == csinfo_.mkl_conv2d &&
-          e->dst_input() == kConv2DFilterInputSlotIdx
-          /* filter is 2nd input of Conv2D and _MklConv2D. */) {
-        if (conv2d_node != nullptr) {
-          VLOG(1) << "MklLayoutRewritePass: unusual case of same filter"
-                  << " feeding multiple Conv2D nodes: "
-                  << filter_node->DebugString();
-          // We will not connect filter input of Conv2DBackpropInput
-          // to be safe here.
-          do_connect_conv2d_backprop_input_filter = false;
-          break;
-        } else {
-          conv2d_node = e->dst();
-          do_connect_conv2d_backprop_input_filter = true;
-        }
-      }
-    }
-  }
-
-  // Number of input slots to original op
-  // Input slots are represented by .Input() calls in REGISTER_OP.
-  int old_node_input_slots = old_node->op_def().input_arg_size();
-  // Actual number of inputs can be greater than or equal to number
-  // of Input slots because inputs of type list could be unfolded.
-  CHECK_GE(old_node_inputs.size(), old_node_input_slots);
-  int nn_slot_idx = 0;  // slot index for inputs of new node
-
-  // Let's copy all inputs (TF tensors) of original node to new node.
-  int iidx = 0;
-  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
-    // An input slot could be a single tensor or a list. We need
-    // to handle this case accordingly.
-    CHECK_LT(iidx, old_node_inputs.size());
-    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
-    if (ArgIsList(arg)) {
-      std::vector<NodeBuilder::NodeOut> new_node_inputs;
-      int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingTFTensorList(old_node_inputs, &iidx, N,
-                                    &new_node_inputs);
-      nb->Input(new_node_inputs);
-      nn_slot_idx++;
-    } else {
-      // Special case for connecting filter input of Conv2DBackpropInput
-      if (do_connect_conv2d_backprop_input_filter &&
-          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
-        nb->Input(conv2d_node, kConv2DFilterOutputSlotIdx);
-      } else {
-        nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second);
-      }
-      iidx++;
-      nn_slot_idx++;
-    }
-  }
-
-  // If workspace tensors are available for this op and we are using
-  // contiguous ordering then we need to add Tensorflow tensor for
-  // workspace here because Tensorflow tensor for workspace is the
-  // last tensor in the list of Tensorflow tensors.
-  if (are_workspace_tensors_available) {
-    CHECK_EQ(workspace_tensors->size(), 2);
-    // Tensorflow tensor
-    nb->Input((*workspace_tensors)[0].node, (*workspace_tensors)[0].index);
-    nn_slot_idx++;
-  }
-
-  // Let's now setup all Mkl inputs to new node.
-  // Number of Mkl inputs must be same as number of TF inputs.
-  iidx = 0;
-  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
-    // An input slot could be a single tensor or a list. We need
-    // to handle this case accordingly.
-    CHECK_LT(iidx, old_node_inputs.size());
-    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
-    if (ArgIsList(arg)) {
-      std::vector<NodeBuilder::NodeOut> new_node_inputs;
-      int N = GetTensorListLength(arg, old_node);
-      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N,
-                                     &new_node_inputs);
-      nb->Input(new_node_inputs);
-      nn_slot_idx++;
-    } else {
-      Node* mkl_node = nullptr;
-      int mkl_node_output_slot = 0;
-      // Special case for connecting filter input of Conv2DBackpropInput
-      if (do_connect_conv2d_backprop_input_filter &&
-          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
-        GetNodeProducingMklTensor(g, old_node, conv2d_node,
-                                  kConv2DFilterOutputSlotIdx, &mkl_node,
-                                  &mkl_node_output_slot);
-      } else {
-        GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first,
-                                  old_node_inputs[iidx].second, &mkl_node,
-                                  &mkl_node_output_slot);
-      }
-      nb->Input(mkl_node, mkl_node_output_slot);
-      iidx++;
-      nn_slot_idx++;
-    }
-  }
-
-  // If workspace tensors are available for this op and we are using
-  // contiguous ordering then we need to add Mkl tensor for
-  // workspace here because Mkl tensor for workspace is the
-  // last tensor in the list of Mkl tensors.
-  if (are_workspace_tensors_available) {
-    CHECK_EQ(workspace_tensors->size(), 2);
-    // Mkl tensor
-    nb->Input((*workspace_tensors)[1].node, (*workspace_tensors)[1].index);
-    nn_slot_idx++;
-  }
-
-  return nn_slot_idx;
-}
-
-Status MklLayoutRewritePass::SetUpInputs(
-    std::unique_ptr<Graph>* g,
-    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
-    NodeBuilder* nb, Node* old_node) {
-  // Let's check if we need to add workspace tensors for this node.
-  // We add workspace edge only for MaxPool, LRN and BatchNorm.
-  std::vector<NodeBuilder::NodeOut> workspace_tensors;
-  bool are_workspace_tensors_available = false;
-  AddWorkSpaceEdgeIfNeeded(g, old_node, nb, &workspace_tensors,
-                           &are_workspace_tensors_available);
-
-  int new_node_input_slots = 0;
-  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-    // TODO(nhasabni): implement this function just for same of completion.
-    // We do not use interleaved ordering right now.
-    return Status(
-        error::Code::UNIMPLEMENTED,
-        "Interleaved ordering of tensors is currently not supported.");
-  } else {
-    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-    new_node_input_slots = SetUpContiguousInputs(
-        g, old_node_inputs, nb, old_node, &workspace_tensors,
-        are_workspace_tensors_available);
-  }
-
-  // Sanity check
-  int old_node_input_slots = old_node->op_def().input_arg_size();
-  if (!are_workspace_tensors_available) {
-    // If we are not adding workspace tensors for this op, then the total
-    // number of input slots to the new node _must_ be 2 times the number
-    // of input slots to the original node: N original Tensorflow tensors and
-    // N for Mkl tensors corresponding to each Tensorflow tensors.
-    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2);
-  } else {
-    // If we are adding workspace tensors for this op, then the total
-    // The total number of input slots to new node _must_ be 2 times the number
-    // of input slots to the original node: N original Tensorflow tensors and
-    // N for Mkl tensors corresponding to each Tensorflow tensors plus 2
-    // (for workspace Tensorflow tensor and workspace Mkl tensor).
-    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2 + 2);
-  }
-
-  return Status::OK();
-}
-
-//////////////////////////////////////////////////////////////////////////
-//           Helper functions related to workspace pass
-//////////////////////////////////////////////////////////////////////////
-
-// TODO(nhasabni) We should move this to mkl_util.h.
-void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
-    std::unique_ptr<Graph>* g, Node** out, Node* orig_node) {
-  // We use a tensor of shape {1} and value 0 to represent
-  // dummy float tensor. We need this as a dummy workspace tensor.
-  // Workspace tensor has type float.
-  const DataType dt = DataTypeToEnum<float>::v();
-  TensorProto proto;
-  proto.set_dtype(dt);
-  float zero[1] = {0};
-  proto.set_tensor_content(string(reinterpret_cast<char*>(&zero), 4));
-  TensorShape dummy_shape({1});
-  dummy_shape.AsProto(proto.mutable_tensor_shape());
-  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
-                  .Attr("value", proto)
-                  .Attr("dtype", dt)
-                  .Device(orig_node->def().device())  // We place this node on
-                                                      // same the device as the
-                                                      // device of the original
-                                                      // node.
-                  .Finalize(&**g, out));
-  CHECK_NOTNULL(*out); // Make sure we got a valid object before using it
-
-  // If number of inputs to the original node is > 0, then we add
-  // control dependency between 1st input (index 0) of the original node and
-  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
-  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
-  // rewritten node. Adding control edge between 1st input of the original node
-  // and the dummy Mkl node ensures that the dummy node is in the same frame
-  // as the original node. Choosing 1st input is not necessary - any input of
-  // the original node is fine because all the inputs of a node are always in
-  // the same frame.
-  if (orig_node->num_inputs() > 0) {
-    Node* orig_input0 = nullptr;
-    TF_CHECK_OK(
-        orig_node->input_node(0, const_cast<const Node**>(&orig_input0)));
-    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
-  }
-
-  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
-}
-
-void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
-    std::unique_ptr<Graph>* g, Node* orig_node, NodeBuilder* nb,
-    std::vector<NodeBuilder::NodeOut>* ws_tensors, bool* are_ws_tensors_added) {
-  bool workspace_edge_added = false;  // Default initializer
-  CHECK_NOTNULL(are_ws_tensors_added);
-  *are_ws_tensors_added = false;  // Default initializer
-
-  DataType T;
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  for (auto ws : wsinfo_) {
-    if (orig_node->type_string() == ws.fwd_op &&
-        mkl_op_registry::IsMklOp(
-            mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) {
-      // If this op is a fwd op, then we need to check if there is an
-      // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
-      // an edge, then we just add an attribute on this node for setting
-      // workspace_passed to true. We don't add actual workspace edge
-      // in this node. Actual workspace edge gets added in the backward
-      // op for this node.
-      for (const Edge* e : orig_node->out_edges()) {
-        if (e->src_output() == ws.fwd_slot &&
-            e->dst()->type_string() == ws.bwd_op &&
-            e->dst_input() == ws.bwd_slot) {
-          nb->Attr("workspace_enabled", true);
-          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
-                  << orig_node->type_string();
-          workspace_edge_added = true;
-          // We found the edge that we were looking for, so break.
-          break;
-        }
-      }
-
-      if (!workspace_edge_added) {
-        // If we are here, then we did not find backward operator for this
-        // node.
-        nb->Attr("workspace_enabled", false);
-      }
-    } else if (orig_node->type_string() == ws.bwd_op &&
-               mkl_op_registry::IsMklOp(
-                   mkl_op_registry::GetMklOpName(orig_node->type_string()),
-                   T)) {
-      // If this op is a bwd op, then we need to add workspace edge and
-      // it's Mkl tensor edge between its corresponding fwd op and this
-      // op. Corresponding fwd op is specified in 'fwd_op' field of
-      // workspace info. fwd_slot and bwd_slot in workspace info specify
-      // an edge between which slots connect forward and backward op.
-      // Once all these criteria match, we add a workspace edge between
-      // ws_fwd_slot and ws_bwd_slot. Its corresponding Mkl tensor is
-      // determined by interleaved/contiguous ordering. Function
-      // DataIndexToMetaDataIndex tells us the location of Mkl tensor
-      // from the location of the Tensorflow tensor.
-      for (const Edge* e : orig_node->in_edges()) {
-        if (e->src_output() == ws.fwd_slot &&
-            // We would have rewritten the forward op, so we need to use
-            // GetMklOpName call to get its Mkl name.
-            e->src()->type_string() ==
-                mkl_op_registry::GetMklOpName(ws.fwd_op) &&
-            e->dst_input() == ws.bwd_slot) {
-          nb->Attr("workspace_enabled", true);
-          CHECK_NOTNULL(ws_tensors);
-          // Add workspace edge between fwd op and bwd op.
-          ws_tensors->push_back(NodeBuilder::NodeOut(e->src(), ws.ws_fwd_slot));
-          // Add Mkl tensor edge for workspace edge between fwd op and bwd op.
-          ws_tensors->push_back(NodeBuilder::NodeOut(
-              e->src(), DataIndexToMetaDataIndex(ws.ws_fwd_slot,
-                                                 e->src()->num_outputs())));
-          *are_ws_tensors_added = true;
-          // In terms of input ordering, we add these calls to add Input
-          // here because workspace edge (and its Mkl tensor) is the last
-          // edge in the fwdop and bwdop. So all inputs before workspace
-          // tensor have been added by SetUpInputs function.
-          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
-                  << orig_node->type_string();
-          workspace_edge_added = true;
-          // We found the edge that we were looking for, so break.
-          break;
-        }
-      }
-
-      // If we are here means we did not find fwd op that feeds to this
-      // bwd op. So in this case, we need to generate dummy tensors for
-      // workspace input and Mkl tensor for workspace, and set
-      // workspace_enabled to false.
-      if (!workspace_edge_added) {
-        nb->Attr("workspace_enabled", false);
-        Node* dmt_ws = nullptr;      // Dummy tensor for workspace
-        Node* dmt_mkl_ws = nullptr;  // Dummy Mkl tensor for workspace
-        GetDummyWorkspaceTensorNode(g, &dmt_ws, orig_node);
-        GetDummyMklTensorNode(g, &dmt_mkl_ws, orig_node);
-        CHECK_NOTNULL(dmt_ws);
-        CHECK_NOTNULL(dmt_mkl_ws);
-        CHECK_NOTNULL(ws_tensors);
-        // We add dummy tensor as workspace tensor.
-        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_ws, 0));
-        // We add dummy tensor as Mkl tensor for workspace tensor.
-        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_mkl_ws, 0));
-        *are_ws_tensors_added = true;
-        VLOG(1) << "MklLayoutRewritePass: dummy workspace_enabled for "
-                << orig_node->type_string();
-      }
-    } else {
-      // If this node does not match any workspace info, then we do not
-      // do anything special for workspace propagation for it.
-    }
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Op-specific functions to copy attributes from old node to new node
-//////////////////////////////////////////////////////////////////////////
-
-void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
-                                           NodeBuilder* nb) {
-  DataType T;
-  string data_format;
-  string padding;
-  std::vector<int32> strides;
-  bool use_cudnn_on_gpu;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-  TF_CHECK_OK(
-      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("strides", strides);
-  nb->Attr("padding", padding);
-  nb->Attr("data_format", data_format);
-  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
-}
-
-void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
-                                         NodeBuilder* nb) {
-  DataType T;
-  int N;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("N", N);
-}
-
-void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
-                                                NodeBuilder* nb) {
-  DataType T;
-  string data_format;
-  std::vector<int32> strides;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("strides", strides);
-  nb->Attr("data_format", data_format);
-}
-
-void MklLayoutRewritePass::CopyAttrsIdentity(const Node* orig_node,
-                                             NodeBuilder* nb) {
-  DataType T;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  // Add attributes to new node.
-  nb->Attr("T", T);
-}
-
-void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
-                                        NodeBuilder* nb) {
-  DataType T;
-  int depth_radius;
-  float bias;
-  float alpha;
-  float beta;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "depth_radius", &depth_radius));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "bias", &bias));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "beta", &beta));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("depth_radius", depth_radius);
-  nb->Attr("bias", bias);
-  nb->Attr("alpha", alpha);
-  nb->Attr("beta", beta);
-}
-
-void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
-                                            NodeBuilder* nb) {
-  DataType T;
-  string data_format;
-  string padding;
-  std::vector<int32> ksize, strides;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("ksize", ksize);
-  nb->Attr("strides", strides);
-  nb->Attr("padding", padding);
-  nb->Attr("data_format", data_format);
-}
-
-void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
-                                             NodeBuilder* nb) {
-  DataType T;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-}
-
-void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                            NodeBuilder* nb) {
-  DataType T;
-  DataType Tshape;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("Tshape", Tshape);
-}
-
-void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
-                                          NodeBuilder* nb) {
-  DataType T;
-  string data_format;
-  int num_split;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_split", &num_split));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("num_split", num_split);
-  nb->Attr("data_format", data_format);
-}
-
-void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
-                                           NodeBuilder* nb) {
-  DataType T;
-  int N;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("N", N);
-}
-
-void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
-                                             NodeBuilder* nb) {
-  DataType T;
-  int N;
-  DataType tidx;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tidx", &tidx));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("N", N);
-  nb->Attr("Tidx", tidx);
-}
-
-void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
-                                                   NodeBuilder* nb) {
-  DataType T;
-  float epsilon;
-  string data_format;
-  bool is_training;
-
-  // Get all attributes from old node.
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "is_training", &is_training));
-
-  // Add attributes to new node.
-  nb->Attr("T", T);
-  nb->Attr("epsilon", epsilon);
-  nb->Attr("data_format", data_format);
-  nb->Attr("is_training", is_training);
-}
-
-//////////////////////////////////////////////////////////////////////////
-//           Helper functions related to node merge pass
-//////////////////////////////////////////////////////////////////////////
-
-Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
-  // TODO(nhasabni) Add check for type of node similar to CheckForNodeRewrite
-  // once we support BiasAddGrad as Mkl layer.
-
-  // Search for all matching mergeinfo.
-  // We allow more than one match for extensibility.
-  std::vector<const MergeInfo*> matching_mi;
-  for (auto mi = minfo_.cbegin(); mi != minfo_.cend(); ++mi) {
-    if (a->type_string() == mi->succ) {
-      matching_mi.push_back(&*mi);
-    }
-  }
-
-  for (const MergeInfo* mi : matching_mi) {
-    const int N_in = a->num_inputs();
-    if (mi->op >= N_in) {
-      continue;
-    }
-
-    // Get the control edges and input of node
-    gtl::InlinedVector<Node*, 4> a_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
-    FillInputs(a, &a_control_edges, &a_in);
-
-    // Get operand op of the operator
-    Node* b = nullptr;
-    b = a_in[mi->op].first;
-    if (b == nullptr || (b->type_string() != mi->pred)) {
-      // NOTE: Should the first check be assert?
-      continue;
-    }
-
-    const int B_in = b->num_inputs();
-    gtl::InlinedVector<Node*, 4> b_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(B_in);
-    FillInputs(b, &b_control_edges, &b_in);
-
-    // Shouldn't merge if a and b have different control edges.
-    if (a_control_edges != b_control_edges) {
-      continue;
-    } else {
-      // We found a match.
-      return b;
-    }
-  }
-
-  return nullptr;
-}
-
-Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* succ,
-                                       Node* pred) {
-  CHECK_NOTNULL(succ);
-  CHECK_NOTNULL(pred);
-
-  if (succ->type_string() == csinfo_.bias_add &&
-      pred->type_string() == csinfo_.mkl_conv2d) {
-    // 1. Get all attributes from input nodes.
-    DataType T_pred, T_succ;
-    string padding;
-    std::vector<int32> strides;
-    string data_format_pred, data_format_succ;
-    bool use_cudnn_on_gnu;
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
-    TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
-    TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
-    TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-    TF_CHECK_OK(
-        GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
-    // We check to ensure that data formats of both succ and pred are same.
-    // We expect them to be same, so we can enforce this as assert.
-    // But assert can be too strict, so we enforce this as a check.
-    // If the check fails, then we do not merge two nodes.
-    // We also do same check for devices.
-    if (data_format_pred != data_format_succ || T_pred != T_succ ||
-        pred->assigned_device_name() != succ->assigned_device_name() ||
-        pred->def().device() != succ->def().device()) {
-      return Status(error::Code::INVALID_ARGUMENT,
-                    "data_format or T attribute or devices of Conv2D and "
-                    "BiasAdd do not match. Will skip node merge optimization");
-    }
-
-    const int succ_num = succ->num_inputs();
-    gtl::InlinedVector<Node*, 4> succ_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
-    FillInputs(succ, &succ_control_edges, &succ_in);
-
-    const int pred_num = pred->num_inputs();
-    gtl::InlinedVector<Node*, 4> pred_control_edges;
-    gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
-    FillInputs(pred, &pred_control_edges, &pred_in);
-
-    // We need to ensure that there is only 1 edge between Conv2D and AddBias.
-    // Otherwise, merging is semantically incorrect.
-    if (pred->out_edges().size() != 1) {
-      return Status(error::Code::INVALID_ARGUMENT,
-                    "Conv2D has multiple outputs."
-                    "Will skip node merge optimization");
-    }
-
-    for (const Edge* e : pred->out_edges()) {
-      if (e->dst() != succ) {
-        return Status(error::Code::INVALID_ARGUMENT,
-                      "Conv2D does not feed to BiasAdd."
-                      "Will skip node merge optimization");
-      }
-    }
-
-    // 2. Get inputs from both the nodes.
-    // Find the 2 inputs from the conv and the bias from the add Bias.
-    // Get operand 0, 1 of conv2D and their Mkl tensors.
-    CHECK_EQ(pred->in_edges().size(), 4);  // _MklConv2D must have 4 inputs.
-    // Get operand 1 of add_bias
-    // BiasAdd must have 2 inputs: Conv, bias
-    CHECK_EQ(succ->in_edges().size(), 2);
-    Node* oper3_mkl = nullptr;  // Mkl tensor corresponding to oper3
-    int oper3_mkl_slot = 0;     // For dummy MKL tensor node, output slot is 0.
-    GetDummyMklTensorNode(g, &oper3_mkl, pred);  // Get dummy Mkl tensor node
-    // as BiasAdd does not have Mkl tensor as input.
-    CHECK_NOTNULL(oper3_mkl);
-
-    // We will use the node name of BiasAdd as the name of new node
-    // Build new node. We use same name as original node, but change the op
-    // name.
-    NodeBuilder nb(succ->name(), csinfo_.mkl_conv2d_with_bias);
-    if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-      nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
-      // pred_in[1] will be Mkl tensor for In1 if we follow interleaved
-      // ordering, and it will be 2nd Tensorflow tensor for Conv2D if
-      // we follow contiguous ordering.
-      nb.Input(pred_in[1].first, pred_in[1].second);  // Mkl for In1
-      nb.Input(pred_in[2].first, pred_in[2].second);  // In2 of Conv2D
-      nb.Input(pred_in[3].first, pred_in[3].second);  // Mkl for In2
-      nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
-      nb.Input(oper3_mkl, oper3_mkl_slot);            // Mkl for In2 of BiasAdd
-    } else {
-      CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-      nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
-      // pred_in[1] will be Mkl tensor for In1 if we follow interleaved
-      // ordering, and it will be 2nd Tensorflow tensor for Conv2D if
-      // we follow contiguous ordering.
-      nb.Input(pred_in[1].first, pred_in[1].second);  // In2 of Conv2D
-      nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
-      nb.Input(pred_in[2].first, pred_in[2].second);  // Mkl for In1 of Conv2D
-      nb.Input(pred_in[3].first, pred_in[3].second);  // Mkl for In2 of Conv2D
-      nb.Input(oper3_mkl, oper3_mkl_slot);            // Mkl for In2 of BiasAdd
-    }
-
-    // Copy attributes from Conv2D to Conv2DWithBias.
-    CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
-
-    // Copy the device assigned to old node to new node.
-    nb.Device(succ->def().device());
-
-    // Create node.
-    Node* new_node;
-    TF_CHECK_OK(nb.Finalize(&**g, &new_node));
-    CHECK_NOTNULL(new_node);
-
-    // Set the Mkl layer label for this op.
-    new_node->AddAttr("_kernel", mkl_op_registry::kMklOpLabel);
-
-    // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
-    // node are already copied in BuildNode. We handle control edges now.
-    for (const Edge* e : pred->in_edges()) {
-      if (e->IsControlEdge()) {
-        CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
-      }
-    }
-    for (const Edge* e : succ->in_edges()) {
-      if (e->IsControlEdge()) {
-        CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
-      }
-    }
-
-    // Incoming edges are fixed, we will fix the outgoing edges now.
-    // First, we will fix outgoing control edges from 'pred' node.
-    // We don't need to handle outgoing data edges from 'pred' node
-    // because pred has only 1 output going to succ node (we enforced
-    // this check for merge already).
-    for (const Edge* e : pred->out_edges()) {
-      if (e->IsControlEdge()) {
-        CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
-      }
-    }
-
-    // Second, we will fix outgoing control and data edges from 'succ' node.
-    for (const Edge* e : succ->out_edges()) {
-      if (e->IsControlEdge()) {
-        CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
-      } else {
-        CHECK_NOTNULL(
-            (*g)->AddEdge(new_node, e->src_output(), e->dst(), e->dst_input()));
-      }
-    }
-
-    // Copy device assigned to old node to new node.
-    // It's ok to use pred or succ as we have enforced a check that
-    // both have same device assigned.
-    new_node->set_assigned_device_name(pred->assigned_device_name());
-
-    VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
-            << ", and node: " << succ->DebugString()
-            << ", into node:" << new_node->DebugString();
-
-    (*g)->RemoveNode(succ);
-    (*g)->RemoveNode(pred);
-
-    return Status::OK();
-  }
-
-  return Status(error::Code::UNIMPLEMENTED,
-                "Unimplemented case for node merge optimization.");
-}
-
-//////////////////////////////////////////////////////////////////////////
-//           Helper functions for node rewrite
-//////////////////////////////////////////////////////////////////////////
-
-Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
-                                         Node* orig_node,
-                                         const RewriteInfo* ri) {
-  CHECK_NOTNULL(ri);
-  CHECK_NOTNULL(orig_node);
-
-  VLOG(1) << "MklLayoutRewritePass: Original node:" << orig_node->DebugString();
-
-  // Check if this is scenario 2 (context-based rewrite).
-  // Get the matching ContextInfo if it is.
-  const Node* fwd_node = nullptr;
-  const ContextInfo* ci = nullptr;
-  bool is_context_based_rewrite = false;
-  if ((ci = SearchMatchingContext(orig_node, &fwd_node)) != nullptr) {
-    is_context_based_rewrite = true;
-
-    // Sanity checks for context-based rewrite (if any)
-    if (orig_node->type_string() == csinfo_.bias_add_grad &&
-        ri->new_name == csinfo_.mkl_conv2d_with_bias_backprop_bias) {
-      CHECK_NOTNULL(fwd_node);
-      DataType orig_T, ctx_T;
-      string orig_data_format, ctx_data_format;
-      TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &orig_T));
-      TF_CHECK_OK(
-          GetNodeAttr(orig_node->def(), "data_format", &orig_data_format));
-      TF_CHECK_OK(GetNodeAttr(fwd_node->def(), "T", &ctx_T));
-      TF_CHECK_OK(
-          GetNodeAttr(fwd_node->def(), "data_format", &ctx_data_format));
-
-      if (orig_data_format != ctx_data_format || orig_T != ctx_T ||
-          orig_node->assigned_device_name() !=
-              fwd_node->assigned_device_name() ||
-          orig_node->def().device() != fwd_node->def().device()) {
-        return Status(
-            error::Code::INVALID_ARGUMENT,
-            "data_format or T attribute or devices of BiasAddGrad and "
-            "Conv2D do not match. Will skip node rewrite optimization");
-      }
-    } else if (orig_node->type_string() == csinfo_.bias_add_grad &&
-               ri->new_name == csinfo_.matmul) {
-      // When BiasAddGrad has MatMul in context, we do not do any rewrite
-      // and leave BiasAddGrad as it is. But we check for this condition
-      // when we check for node rewrite rule. So we should not even come
-      // here for MatMul. So we will fail now.
-      return Status(
-          error::Code::INVALID_ARGUMENT,
-          "No rewrite is required for BiasAddGrad for MatMul context.");
-    }
-  }
-
-  // Get all inputs.
-  int num_inputs = orig_node->in_edges().size();
-
-  // Drop count for control edges from inputs
-  for (const Edge* e : orig_node->in_edges()) {
-    if (e->IsControlEdge()) {
-      num_inputs--;
-    }
-  }
-
-  gtl::InlinedVector<Node*, 4> control_edges;
-  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs);
-  FillInputs(orig_node, &control_edges, &inputs);
-
-  // Build new node. We use same name as original node, but change the op name.
-  NodeBuilder nb(orig_node->name().c_str(), ri->new_name.c_str());
-  // Copy user-specified device assigned to original node to new node.
-  nb.Device(orig_node->def().device());
-  // Set up new inputs to the rewritten node.
-  Status s = SetUpInputs(g, inputs, &nb, orig_node);
-  if (s != Status::OK()) {
-    return s;
-  }
-
-  // Copy attributes from original node to new node (for scenario 1).
-  // For context-based rewrite, we use context to copy the attributes.
-  if (is_context_based_rewrite) {
-    if (orig_node->type_string() == csinfo_.bias_add_grad &&
-        ri->new_name == csinfo_.mkl_conv2d_with_bias_backprop_bias) {
-      CHECK_NOTNULL(fwd_node);
-      ri->copy_attrs(fwd_node, &nb);
-    } else {
-      return Status(error::Code::UNIMPLEMENTED,
-                    "Unimplemented case for node rewrite optimization.");
-    }
-  } else {
-    ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
-  }
-  // Set the Mkl layer label for this op.
-  nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
-
-  // Finalize graph and get new node.
-  Node* new_node = nullptr;
-  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
-  CHECK_NOTNULL(new_node);
-
-  // Incoming data edges from 'orig_node' node to new 'new_node' node are
-  // already copied in BuildNode. We need to handle control edges now.
-  for (const Edge* e : orig_node->in_edges()) {
-    if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
-    }
-  }
-
-  // Copy outgoing edges from 'orig_node' node to new
-  // 'new_node' node, since the output also follows same ordering among
-  // Tensorflow tensors and Mkl tensors. We need to connect Tensorflow
-  // tensors appropriately. Specifically, nth output of the original node
-  // will become 2*nth output of the Mkl node for the interleaved ordering
-  // of the tensors. For the contiguous ordering of the tensors, it will be n.
-  // GetTensorDataIndex provides this mapping function.
-  for (const Edge* e : orig_node->out_edges()) {
-    if (e->IsControlEdge()) {
-      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
-    } else {
-      CHECK_NOTNULL((*g)->AddEdge(
-          new_node,
-          GetTensorDataIndex(e->src_output(), e->src()->num_outputs()),
-          e->dst(), e->dst_input()));
-    }
-  }
-
-  // Copy the runtime device assigned from original code to new node.
-  new_node->set_assigned_device_name(orig_node->assigned_device_name());
-
-  // Delete original node and mark new node as rewritten.
-  (*g)->RemoveNode(orig_node);
-
-  VLOG(1) << "MklLayoutRewritePass: New node:" << new_node->DebugString();
-  return Status::OK();
-}
-
-const MklLayoutRewritePass::ContextInfo*
-MklLayoutRewritePass::SearchMatchingContext(const Node* n,
-                                            const Node** fwd_node) {
-  CHECK_NOTNULL(n);
-  CHECK_NOTNULL(fwd_node);
-  *fwd_node = nullptr;
-
-  // Search for matching contextinfo based on node name and call
-  // callback function using matching contextinfo.
-  // There could be more than one matching contextinfos but whichever
-  // matches first is returned.
-  for (auto ci = cinfo_.cbegin(); ci != cinfo_.cend(); ++ci) {
-    if (n->type_string() == (*ci)->node &&
-        (*ci)->context_match_fn(n, fwd_node, *ci)) {
-      VLOG(1) << "Found context as matching: " << (*ci)->fwd;
-      return *ci;
-    }
-  }
-  return nullptr;
-}
-
-bool MklLayoutRewritePass::ContextMatchRewrite(const Node* n,
-                                               const ContextInfo* c) {
-  const Node* fwd_node = nullptr;
-  return SearchMatchingContext(n, &fwd_node) == c;
-}
-
-const MklLayoutRewritePass::RewriteInfo*
-MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
-  CHECK_NOTNULL(n);
-
-  // First check if node along with its type is supported by MKL layer.
-  // We do not want to rewrite an op into Mkl op if types are not supported.
-  // E.g., MklRelu does not support INT32. So we cannot rewrite Relu to
-  // MklRelu if type is INT32.
-  DataType T;
-  if (!GetNodeAttr(n->def(), "T", &T).ok()) {
-    return nullptr;
-  }
-
-  // BiasAddGrad is not an Mkl layer, so we make an exception for it.
-  if (n->type_string() != csinfo_.bias_add_grad) {
-    if (!mkl_op_registry::IsMklOp(
-            mkl_op_registry::GetMklOpName(n->type_string()), T)) {
-      return nullptr;
-    }
-  }
-
-  // For elementwise node, we reuse the Eigen implementation and pass the MKL
-  // metadata tensor through so we can avoid conversions. However, if all
-  // incoming edges are in TF format, we don't need all this overhead, so
-  // replace the elementwise node only if at least one of its parents is a MKL
-  // node.
-  //
-  // TODO(vrane): Add implementation for element-wise ops that doesn't reuse
-  // eigen code to reduce cross-library dependency.
-  if (mkl_op_registry::IsMklElementWiseOp(
-          mkl_op_registry::GetMklOpName(n->type_string()), T)) {
-    bool incoming_mkl_edge = false;
-    for (auto parent : n->in_edges()) {
-      if (mkl_op_registry::IsMklOp(
-              mkl_op_registry::GetMklOpName(parent->src()->type_string()), T)) {
-        incoming_mkl_edge = true;
-        break;
-      } else {
-        VLOG(1) << "Non-MKL parent is: " << parent->src()->type_string();
-      }
-    }
-    if (incoming_mkl_edge == false) {
-      VLOG(1) << "Skipping replacement of elementwise node which has no MKL "
-                 "parents.";
-      return nullptr;
-    }
-  }
-
-  // We support 2 types of node rewrites:
-  // 1. Rewriting BiasAddGrad depending on its MklConv2DWithBias context.
-  // 2. Rewriting an op to Mkl op always
-  // We return true if any of these 2 conditions is met.
-
-  // Find matching RewriteInfo and then check that rewrite rule applies.
-  for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
-    if (n->type_string().compare(ri->name) == 0 &&
-        ri->rewrite_rule(n, ri->context)) {
-      // If we are rewriting BiasAddGrad into BiasAddGrad for MatMul context,
-      // then we just return directly.
-      if (n->type_string() == csinfo_.bias_add_grad &&
-          ri->context->fwd == csinfo_.matmul &&
-          ri->new_name == csinfo_.bias_add_grad) {
-        return nullptr;
-      }
-      return &*ri;
-    }
-  }
-
-  // Else return not found.
-  return nullptr;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-//              Run function for the pass
-///////////////////////////////////////////////////////////////////////////////
-
-bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
-  bool result = false;
-  CHECK_NOTNULL(g);
-
-  DumpGraph("Before running MklLayoutRewritePass", &**g);
-
-  std::vector<Node*> order;
-  GetReversePostOrder(**g, &order);  // This will give us topological sort.
-
-  for (Node* n : order) {
-    // If node is not an op or it cannot run on CPU device, then skip.
-    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
-      continue;
-    }
-
-    const RewriteInfo* ri = nullptr;
-    Node* predn = nullptr;
-    // We will first search if node is to be rewritten
-    if ((ri = CheckForNodeRewrite(n)) != nullptr) {
-      string node_name = n->name();
-      string op_name = n->type_string();
-
-      VLOG(1) << "MklLayoutRewritePass: Scheduled node " << node_name
-              << " with op " << op_name << " for rewrite using"
-              << " layout optimization.";
-
-      if (RewriteNode(g, n, ri) == Status::OK()) {
-        VLOG(1) << "MklLayoutRewritePass: rewrote node " << node_name
-                << " with op " << op_name << " for Mkl layout optimization.";
-        result = true;
-      }
-    } else if ((predn = CheckForNodeMerge(n)) != nullptr) {
-      // Otherwise, we will check if the node is to be merged.
-      string n1_name = n->name();
-      string n2_name = predn->name();
-
-      VLOG(1) << "MklLayoutRewritePass: Scheduled nodes " << n1_name << " and "
-              << n2_name << " for merging";
 
-      if (MergeNode(g, n, predn) == Status::OK()) {
-        VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name << " and "
-                << n2_name;
-        result = true;
-      }
-    }
-  }
-
-  DumpGraph("After running MklLayoutRewritePass", &**g);
-
-  return result;
-}
-
-bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
-  return MklLayoutRewritePass().RunPass(g);
-}
-
-Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
-  if (options.graph == nullptr && options.partition_graphs == nullptr) {
-    return Status::OK();
-  }
-
-  auto process_graph = [&](std::unique_ptr<Graph>* g) {
-    // Get the ownership of a graph
-    std::unique_ptr<Graph>* ng = std::move(g);
-    RunPass(ng);
-    // Return the ownership of a graph back
-    g->reset(ng->release());
-  };
-
-  if (kMklLayoutRewritePassGroup !=
-      OptimizationPassRegistry::POST_PARTITIONING) {
-    // For any pre-partitioning phase, a graph is stored in options.graph.
-    process_graph(options.graph);
-  } else {
-    // For post partitioning phase, graphs are stored in
-    // options.partition_graphs.
-    for (auto& pg : *options.partition_graphs) {
-      process_graph(&pg.second);
-    }
-  }
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/util.h"
 
-  return Status::OK();
-}
+#include "tensorflow/core/graph/mkl_graph_util.h"
+#include "tensorflow/core/graph/mkl_layout_pass.h"
 
-#else   // INTEL_MKL_ML_ONLY
+namespace tensorflow {
 
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
@@ -2446,14 +274,49 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_grad_filter_with_bias =
         "_MklConv2DBackpropFilterWithBias";
+    csinfo_.mkl_pad_with_conv2d = "_MklPadWithConv2D";
+    csinfo_.pad = "Pad";
+    csinfo_.pad_with_conv2d = "__MklDummyPadWithConv2D";
+// Temporarily don't convert quantized operators into MKL versions for now.
+// TODO(Intel-tf) Once all the relevant PRs have been merged then remove
+// the ifdef.
+#ifdef INTEL_MKL_QUANTIZED
+    csinfo_.quantized_avg_pool = "QuantizedAvgPool";
+    csinfo_.quantized_concatv2 = "QuantizedConcatV2";
+    csinfo_.quantized_conv2d = "QuantizedConv2D";
+    csinfo_.quantized_conv2d_with_requantize = "QuantizedConv2DAndRequantize";
+    csinfo_.quantized_conv2d_with_bias = "QuantizedConv2DWithBias";
+    csinfo_.quantized_conv2d_with_bias_and_requantize =
+        "QuantizedConv2DWithBiasAndRequantize";
+    csinfo_.quantized_conv2d_and_relu = "QuantizedConv2DAndRelu";
+    csinfo_.quantized_conv2d_and_relu_and_requantize =
+        "QuantizedConv2DAndReluAndRequantize";
+    csinfo_.quantized_conv2d_with_bias_and_relu =
+        "QuantizedConv2DWithBiasAndRelu";
+    csinfo_.quantized_conv2d_with_bias_and_relu_and_requantize =
+        "QuantizedConv2DWithBiasAndReluAndRequantize";
+    csinfo_.quantized_max_pool = "QuantizedMaxPool";
+    csinfo_.quantized_conv2d_with_bias_sum_and_relu =
+        "QuantizedConv2DWithBiasSumAndRelu";
+    csinfo_.quantized_conv2d_with_bias_sum_and_relu_and_requantize =
+        "QuantizedConv2DWithBiasSumAndReluAndRequantize";
+    csinfo_.quant_conv2d_with_bias_signed_sum_and_relu_and_requantize =
+        "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize";
+#endif
     csinfo_.relu = "Relu";
     csinfo_.relu_grad = "ReluGrad";
+    csinfo_.relu6 = "Relu6";
+    csinfo_.relu6_grad = "Relu6Grad";
+#ifdef INTEL_MKL_QUANTIZED
+    csinfo_.requantize = "Requantize";
+#endif
     csinfo_.tanh = "Tanh";
     csinfo_.tanh_grad = "TanhGrad";
     csinfo_.reshape = "Reshape";
     csinfo_.slice = "Slice";
     csinfo_.softmax = "Softmax";
     csinfo_.split = "Split";
+    csinfo_.transpose = "Transpose";
     // Element-wise ops. Ensure you also add any new ops to IsOpElementWise
     // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
     // MklInputConversion op is added before it.
@@ -2540,14 +403,83 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.maximum,
                       mkl_op_registry::GetMklOpName(csinfo_.maximum),
                       CopyAttrsDataType, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.mul,
-                      mkl_op_registry::GetMklOpName(csinfo_.mul),
+    rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.pad_with_conv2d, csinfo_.mkl_pad_with_conv2d,
+                      CopyAttrsPadWithConv2D, AlwaysRewrite});
+#ifdef INTEL_MKL_QUANTIZED
+    rinfo_.push_back({csinfo_.quantized_avg_pool,
+                      mkl_op_registry::GetMklOpName(csinfo_.quantized_avg_pool),
+                      CopyAttrsQuantizedPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.quantized_concatv2,
+                      mkl_op_registry::GetMklOpName(csinfo_.quantized_concatv2),
+                      CopyAttrsConcatV2, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.quantized_conv2d,
+                      mkl_op_registry::GetMklOpName(csinfo_.quantized_conv2d),
+                      CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.quantized_conv2d_with_requantize,
+                      mkl_op_registry::GetMklOpName(
+                          csinfo_.quantized_conv2d_with_requantize),
+                      CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.quantized_conv2d_with_bias,
+         mkl_op_registry::GetMklOpName(csinfo_.quantized_conv2d_with_bias),
+         CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.quantized_conv2d_with_bias_and_requantize,
+                      mkl_op_registry::GetMklOpName(
+                          csinfo_.quantized_conv2d_with_bias_and_requantize),
+                      CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.quantized_conv2d_and_relu,
+         mkl_op_registry::GetMklOpName(csinfo_.quantized_conv2d_and_relu),
+         CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.quantized_conv2d_and_relu_and_requantize,
+                      mkl_op_registry::GetMklOpName(
+                          csinfo_.quantized_conv2d_and_relu_and_requantize),
+                      CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.quantized_conv2d_with_bias_and_relu,
+                      mkl_op_registry::GetMklOpName(
+                          csinfo_.quantized_conv2d_with_bias_and_relu),
+                      CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.quantized_conv2d_with_bias_and_relu_and_requantize,
+         mkl_op_registry::GetMklOpName(
+             csinfo_.quantized_conv2d_with_bias_and_relu_and_requantize),
+         CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.quantized_max_pool,
+                      mkl_op_registry::GetMklOpName(csinfo_.quantized_max_pool),
+                      CopyAttrsQuantizedPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.quantized_conv2d_with_bias_sum_and_relu,
+                      mkl_op_registry::GetMklOpName(
+                          csinfo_.quantized_conv2d_with_bias_sum_and_relu),
+                      CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.quantized_conv2d_with_bias_sum_and_relu_and_requantize,
+         mkl_op_registry::GetMklOpName(
+             csinfo_.quantized_conv2d_with_bias_sum_and_relu_and_requantize),
+         CopyAttrsQuantizedConv2D, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.quant_conv2d_with_bias_signed_sum_and_relu_and_requantize,
+         mkl_op_registry::GetMklOpName(
+             csinfo_.quant_conv2d_with_bias_signed_sum_and_relu_and_requantize),
+         CopyAttrsQuantizedConv2D, AlwaysRewrite});
+#endif
     rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.relu_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
                       CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.relu6,
+                      mkl_op_registry::GetMklOpName(csinfo_.relu6),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.relu6_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.relu6_grad),
+                      CopyAttrsDataType, AlwaysRewrite});
+#ifdef INTEL_MKL_QUANTIZED
+    rinfo_.push_back({csinfo_.requantize,
+                      mkl_op_registry::GetMklOpName(csinfo_.requantize),
+                      CopyAttrsRequantize, AlwaysRewrite});
+#endif
     /*
     rinfo_.push_back({csinfo_.tanh,
                       mkl_op_registry::GetMklOpName(csinfo_.tanh),
@@ -2569,15 +501,14 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.squared_difference,
                       mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
                       CopyAttrsDataType, AlwaysRewrite});
-    rinfo_.push_back({csinfo_.sub,
-                      mkl_op_registry::GetMklOpName(csinfo_.sub),
+    rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub),
                       CopyAttrsDataType, AlwaysRewrite});
 
     // Add info about which ops to add workspace edge to and the slots.
     wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
     wsinfo_.push_back({csinfo_.max_pool, csinfo_.max_pool_grad, 0, 1, 1, 3});
-    wsinfo_.push_back
-        ({csinfo_.max_pool3d, csinfo_.max_pool3d_grad, 0, 1, 1, 3});
+    wsinfo_.push_back(
+        {csinfo_.max_pool3d, csinfo_.max_pool3d_grad, 0, 1, 1, 3});
 
     // Add a rule for merging nodes
     minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add,
@@ -2586,6 +517,44 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
                       csinfo_.conv2d_grad_filter_with_bias,
                       GetConv2DBackpropFilterOrBiasAddGrad});
+    minfo_.push_back(
+        {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D});
+    // Merge Pad and Conv2d, only if the pad op is "Pad"
+    // Doesn't merge if pad op is "PadV2" or "MirrorPad"
+
+    // The fusion patterns in "finfo_" that show up first will get applied
+    // first, for example, graph "A->B->C-D" and finfo_ is {A->B->C to ABC,
+    // A->B->C->D to ABCD}, since the first gets applied first, the final
+    // graph will be ABC->D.
+
+    //
+    // Add rules to fuse sequences such as "Transpose (NCHW -> NHWC) + Conv2D
+    // (NHWC) + Transpose (NHWC->
+    // NCHW)" into "Conv2D (NCHW)". Such patterns occur frequently in Keras.
+    // Note: we use the term "merge" to combine (exactly) 2 nodes into one,
+    // while "fusion" is for 3+ nodes situation.
+    //
+
+    // Transpose + Conv2d + Transpose:
+    std::vector<int> transpose_to_nhwc = {NCHW::dim::N, NCHW::dim::H,
+                                          NCHW::dim::W, NCHW::dim::C};
+    std::vector<int> transpose_to_nchw = {NHWC::dim::N, NHWC::dim::C,
+                                          NHWC::dim::H, NHWC::dim::W};
+    auto CheckForTransposeToNHWC =
+        std::bind(CheckForTranspose, std::placeholders::_1, transpose_to_nhwc);
+    auto CheckForConv2dOp =
+        std::bind(CheckForMklOp, std::placeholders::_1, csinfo_.conv2d);
+    auto CheckForTransposeToNCHW =
+        std::bind(CheckForTranspose, std::placeholders::_1, transpose_to_nchw);
+    auto FuseConv2D =
+        std::bind(FuseTransposeMklOpTranspose, std::placeholders::_1,
+                  std::placeholders::_2, std::placeholders::_3, "NCHW");
+    finfo_.push_back(
+        {"transpose-elimination for Conv2D",
+         {CheckForTransposeToNHWC, CheckForConv2dOp, CheckForTransposeToNCHW},
+         // CheckForMklOp
+         FuseConv2D,
+         CopyAttrsConv});
   }
 
   // Standard interface to run pass
@@ -2608,7 +577,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string name;      // Original name of op of the node in the graph
     string new_name;  // New name of the op of the node in the graph
     // A function handler to copy attributes from an old node to a new node.
-    std::function<void(const Node*, NodeBuilder*)> copy_attrs;
+    std::function<void(const Node*, NodeBuilder*, bool)> copy_attrs;
     // A rule under which to rewrite this node
     std::function<bool(const Node*)> rewrite_rule;
   } RewriteInfo;
@@ -2638,6 +607,41 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     std::function<Node*(const Node*)> get_node_to_be_merged;
   } MergeInfo;
 
+  // Structure to specify information used in node fusion of 3+ operators
+  typedef struct {
+    std::string pattern_name;  // Name to describe this pattern, such as
+                               // "Transpose_Mklop_Transpose".
+    std::vector<std::function<bool(const Node*)> >
+        node_checkers;  // Extra restriction checker for these ops
+    std::function<Status(
+        std::unique_ptr<Graph>*, std::vector<Node*>&,
+        std::function<void(const Node*, NodeBuilder* nb, bool)>)>
+        fuse_func;
+    std::function<void(const Node*, NodeBuilder* nb, bool)> copy_attrs;
+  } FusionInfo;
+
+  //
+  // Dimension indices for 2D tensor.
+  //
+  struct NCHW {
+    enum dim { N = 0, C = 1, H = 2, W = 3 };
+  };
+
+  struct NHWC {
+    enum dim { N = 0, H = 1, W = 2, C = 3 };
+  };
+
+  //
+  // dimension indices for 3D tensor.
+  //
+  struct NCDHW {
+    enum dim { N = 0, C = 1, D = 2, H = 3, W = 4 };
+  };
+
+  struct NDHWC {
+    enum dim { N = 0, D = 1, H = 2, W = 3, C = 4 };
+  };
+
   /// Structure to store all constant strings
   /// NOTE: names are alphabetically sorted.
   typedef struct {
@@ -2675,11 +679,32 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string mkl_conv2d_grad_filter;
     string mkl_conv2d_grad_filter_with_bias;
     string mkl_conv2d_with_bias;
+    string mkl_pad_with_conv2d;
     string mul;
+    string pad;
+    string pad_with_conv2d;
+    string quantized_avg_pool;
+    string quantized_conv2d;
+    string quantized_conv2d_with_requantize;
+    string quantized_conv2d_with_bias;
+    string quantized_conv2d_with_bias_and_requantize;
+    string quantized_conv2d_and_relu;
+    string quantized_conv2d_and_relu_and_requantize;
+    string quantized_conv2d_with_bias_and_relu;
+    string quantized_conv2d_with_bias_and_relu_and_requantize;
+    string quantized_concatv2;
+    string quantized_max_pool;
+    string quantized_conv2d_with_bias_sum_and_relu;
+    string quantized_conv2d_with_bias_sum_and_relu_and_requantize;
+    string quant_conv2d_with_bias_signed_sum_and_relu_and_requantize;
     string relu;
     string relu_grad;
+    string relu6;
+    string relu6_grad;
+    string requantize;
     string tanh;
     string tanh_grad;
+    string transpose;
     string reshape;
     string slice;
     string softmax;
@@ -2698,6 +723,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   /// Maintain info about nodes to be merged
   std::vector<MergeInfo> minfo_;
 
+  /// Maintain info about nodes to be fused
+  std::vector<FusionInfo> finfo_;
+
   /// Maintain structure of constant strings
   static ConstStringsInfo csinfo_;
 
@@ -2782,6 +810,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   // Helper function to merge different nodes
   Status MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g, Node* m, Node* n);
+  Status MergePadWithConv2D(std::unique_ptr<Graph>* g, Node* m, Node* n);
   Status MergeConv2DBackpropFilterWithBiasAddGrad(std::unique_ptr<Graph>* g,
                                                   Node* m, Node* n);
 
@@ -2819,6 +848,54 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return n;
   }
 
+  // Find Pad or Conv2D node that can be merged with input node 'm'.
+  // If input 'm' is Pad, then check if there exists Conv2D node that can be
+  // merged with 'm'. If input 'm' is Conv2D, then check if there exists Pad
+  // node that can be merged with 'm'.
+  static Node* GetPadOrConv2D(const Node* m) {
+    DCHECK(m);
+    Node* n = nullptr;
+
+    const Node* conv_node;
+    if (m->type_string() == csinfo_.pad) {
+      // If m is Pad, then Conv2D is the output of Pad.
+      for (const Edge* e : m->out_edges()) {
+        if (!e->IsControlEdge() && e->dst()->type_string() == csinfo_.conv2d) {
+          n = e->dst();
+          conv_node = n;
+          break;
+        }
+      }
+    } else {
+      DCHECK_EQ(m->type_string(), csinfo_.conv2d);
+      // If m is conv2D, Go over all input edges
+      // and search for Pad  Node.
+      for (const Edge* e : m->in_edges()) {
+        if (!e->IsControlEdge() && e->src()->type_string() == csinfo_.pad) {
+          n = e->src();
+          conv_node = m;
+          break;
+        }
+      }
+    }
+    // Check if only VALID type of padding is used
+    // or not.
+    if (n != nullptr) {
+      string padding;
+      TF_CHECK_OK(GetNodeAttr(conv_node->def(), "padding", &padding));
+      if (padding != "VALID")
+        // Then do not merge.
+        // Only VALID type of padding in conv op can be
+        // merged with Pad op.
+        n = nullptr;
+    } else {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Pad and Conv2D node for merging. Input node: "
+              << m->DebugString();
+    }
+
+    return n;
+  }
   // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
   // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
   // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
@@ -2876,11 +953,125 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return n;
   }
 
+  // Return a node that can be fused with input node 'n'
+  //
+  // @return tuple. If we can find such nodes, the first
+  // element of the tuple is a true. Otherwise, it's false.
+  std::tuple<bool, std::vector<Node*>, const MklLayoutRewritePass::FusionInfo>
+  CheckForNodeFusion(Node* n) const;
+
+  // Fuse nodes in the vector "nodes"
+  Status FuseNode(std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+                  const MklLayoutRewritePass::FusionInfo fi);
+
+  // Fuse tranpose(to "NHWC") + mklop("NHWC") + transpose(to "NCHW") into
+  // mklop("NCHW").
+  // Here "mklop" can be any MKL-DNN supported op, such as Conv2D.
+  static Status FuseTransposeMklOpTranspose(
+      std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+      std::function<void(const Node*, NodeBuilder* nb, bool)> copy_attrs,
+      string data_format);
+
+  static bool CheckForTranspose(const Node* node, std::vector<int> perm) {
+    // Check if node's type is "Transpose"
+    if (node->type_string() != "Transpose") return false;
+
+    // If "Transpose" has multiple output data edges, also don't fuse it.
+    if (node->num_outputs() > 1 || node->out_edges().size() > 1) return false;
+
+    // Check if has out control edge. If true, this is a training graph.
+    // Currently we focus on inference and do no fusion in training.
+    // Note: this constraint will eventually be removed, if we enabled this
+    // fusion for training
+    // in the future.
+    for (const Edge* e : node->out_edges()) {
+      if (e->IsControlEdge()) {
+        return false;
+      }
+    }
+
+    // If "Transpose" has input control edges, don't fuse on it.
+    for (const Edge* e : node->in_edges()) {
+      if (e->IsControlEdge()) {
+        return false;
+      }
+    }
+
+    // We compared the tensor containing the permutation order ("perm_node")
+    // with our desired order ("perm"). If they're exactly match, this check
+    // succeed and returns true.
+    for (const Edge* e : node->in_edges()) {
+      if (!e->IsControlEdge()) {
+        const Node* perm_node = e->src();
+
+        const int kPermTensorIndex = 1;
+        if (perm_node->type_string() == "Const" &&
+            e->dst_input() == kPermTensorIndex) {
+          // we find the "perm" node, now try to retrieve its value.
+          const TensorProto* proto = nullptr;
+          DCHECK(GetNodeAttr(perm_node->def(), "value", &proto).ok());
+
+          DataType type;
+          GetNodeAttr(perm_node->def(), "dtype", &type);
+
+          // Here we directly access to the "tensor_content", rather than
+          // "int_val". This is because we find "int_val" is
+          // not set properly under some circumstances.
+          if (type == DT_INT32) {
+            const int type_size = 4;
+            const int* tensor_content =
+                reinterpret_cast<const int*>(proto->tensor_content().c_str());
+            const int tensor_content_size =
+                proto->tensor_content().size() / type_size;
+
+            std::vector<int> perm_value(tensor_content,
+                                        tensor_content + tensor_content_size);
+
+            return perm_value == perm;
+          } else if (type == DT_INT64) {
+            const int type_size = 8;
+            const long* tensor_content =
+                reinterpret_cast<const long*>(proto->tensor_content().c_str());
+            const int tensor_content_size =
+                proto->tensor_content().size() / type_size;
+
+            std::vector<long> perm_value(tensor_content,
+                                         tensor_content + tensor_content_size);
+            std::vector<long> long_perm(perm.cbegin(), perm.cend());
+
+            return perm_value == long_perm;
+          }
+          return false;
+        }
+      }
+    }
+    return false;
+  }
+
+  static bool CheckForMklOp(const Node* node, string name = "") {
+    if (node == nullptr) return false;
+
+    if (!name.empty() && node->type_string() != name) {
+      return false;
+    }
+
+    // if mklop has multiple outputs, don't fuse it.
+    if (node->num_outputs() > 1) return false;
+
+    if (node->out_edges().size() > 1) return false;
+
+    DataType T;
+    TF_CHECK_OK(GetNodeAttr(node->def(), "T", &T));
+    return mkl_op_registry::IsMklOp(
+        mkl_op_registry::GetMklOpName(node->type_string()), T);
+  }
+
   // Check if the node 'n' has any applicable rewrite rule
   // We check for 2 scenarios for rewrite.
   //
   // @return RewriteInfo* for the applicable rewrite rule
   const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
+  const RewriteInfo* CheckForQuantizedNodeRewrite(const Node* n) const;
 
   // Default rewrite rule to be used in scenario 1 for rewrite.
   // @return - true (since we want to always rewrite)
@@ -3100,8 +1291,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // Helper function used by FixMklMetaDataEdges. Fixes the metadata edge
   // pointed by 'e_metadata' corresponding to the data edge 'e_data' in graph
   // 'g'. Returns true is fixup was done; otherwise, it returns false.
-  bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g,
-    const Edge* e_data, const Edge* e_metadata);
+  bool FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g, const Edge* e_data,
+                                  const Edge* e_metadata);
 
   // Are the input Mkl metadata edges for node 'n' in graph 'g' correctly
   // connected? If not, then fix them. This is needed because a graph may have
@@ -3130,18 +1321,43 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // We need operator-specific function to copy attributes because the framework
   // does not provide any generic function for it.
   // NOTE: names are alphabetically sorted.
-  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsSlice(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb,
+                            bool change_format = false);
+  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb,
+                                   bool change_format = false);
+  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb,
+                              bool change_format = false);
+  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb,
+                                bool change_format = false);
+  static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                            bool change_format = false);
+  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb,
+                                bool change_format = false);
+  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb,
+                                      bool change_format = false);
+  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb,
+                           bool change_format = false);
+  static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb,
+                                     bool change_format = false);
+  static void CopyAttrsFromPadAndConv2D(const Node* orig_node1,
+                                        const Node* orig_node2, NodeBuilder* nb,
+                                        bool change_format = false);
+  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb,
+                               bool change_format = false);
+  static void CopyAttrsQuantizedPooling(const Node* orig_node, NodeBuilder* nb,
+                                        bool change_format = false);
+  static void CopyAttrsQuantizedConv2D(const Node* orig_node, NodeBuilder* nb,
+                                       bool change_format = false);
+  static void CopyAttrsQuantizedConcat(const Node* orig_node, NodeBuilder* nb,
+                                       bool change_format = false);
+  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb,
+                               bool change_format = false);
+  static void CopyAttrsRequantize(const Node* orig_node, NodeBuilder* nb,
+                                  bool change_format = false);
+  static void CopyAttrsSlice(const Node* orig_node, NodeBuilder* nb,
+                             bool change_format = false);
+  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb,
+                             bool change_format = false);
 
   // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
   // using node for original node 'orig_node' and return it in '*out'.
@@ -3228,7 +1444,7 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
                                                       // device of the original
                                                       // node.
                   .Finalize(&**g, out));
-  CHECK_NOTNULL(*out); // Make sure we got a valid object before using it
+  CHECK_NOTNULL(*out);  // Make sure we got a valid object before using it
 
   // If number of inputs to the original node is > 0, then we add
   // control dependency between 1st input (index 0) of the original node and
@@ -3341,6 +1557,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
     for (const Edge* e : filter_node->out_edges()) {
       if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
+           // add check for mkl_pad_with_conv2d
+           e->dst()->type_string() == csinfo_.mkl_pad_with_conv2d ||
            e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) &&
           e->dst_input() == kConv2DFilterInputSlotIdx
           /* filter is 2nd input of Conv2D and _MklConv2D. */) {
@@ -3463,8 +1681,27 @@ Status MklLayoutRewritePass::SetUpInputs(
   // We add workspace edge only for MaxPool, LRN and BatchNorm.
   std::vector<NodeBuilder::NodeOut> workspace_tensors;
   bool are_workspace_tensors_available = false;
-  AddWorkSpaceEdgeIfNeeded(g, old_node, nb, &workspace_tensors,
-                           &are_workspace_tensors_available);
+
+  // Avoid workspace check for QuantizedConv2D and the fused
+  // Ops as they don't have attribute: "T".
+  std::vector<string> quant_ops{
+      "QuantizedConv2D",
+      "QuantizedConv2DWithBias",
+      "QuantizedConv2DAndRelu",
+      "QuantizedConv2DWithBiasAndRelu",
+      "QuantizedConv2DWithBiasSumAndRelu",
+      "QuantizedConv2DAndRequantize",
+      "QuantizedConv2DWithBiasAndRequantize",
+      "QuantizedConv2DAndReluAndRequantize",
+      "QuantizedConv2DWithBiasAndReluAndRequantize",
+      "QuantizedConv2DWithBiasSumAndReluAndRequantize",
+      "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"};
+  bool should_check_workspace =
+      std::find(std::begin(quant_ops), std::end(quant_ops),
+                old_node->type_string()) == std::end(quant_ops);
+  if (should_check_workspace)
+    AddWorkSpaceEdgeIfNeeded(g, old_node, nb, &workspace_tensors,
+                             &are_workspace_tensors_available);
 
   int new_node_input_slots = 0;
   if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
@@ -3623,13 +1860,71 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
 // Op-specific functions to copy attributes from old node to new node
 //////////////////////////////////////////////////////////////////////////
 
-void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
-                                         NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                                         bool change_format) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("padding", padding);
+
+  if (!change_format) {
+    nb->Attr("strides", strides);
+    nb->Attr("dilations", dilations);
+
+    TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+    nb->Attr("data_format", data_format);
+  } else {
+    std::vector<int32> new_strides;
+    std::vector<int32> new_dilations;
+    if (strides.size() == 5) {
+      // "strides" and "dilations" also need to be changed according to
+      // "data_format",
+      // in this case, is "NDHWC" to "NCDHW".
+      new_strides = {strides[NDHWC::dim::N], strides[NDHWC::dim::C],
+                     strides[NDHWC::dim::D], strides[NDHWC::dim::H],
+                     strides[NDHWC::dim::W]};
+
+      new_dilations = {dilations[NDHWC::dim::N], dilations[NDHWC::dim::C],
+                       dilations[NDHWC::dim::D], dilations[NDHWC::dim::H],
+                       dilations[NDHWC::dim::W]};
+    } else {
+      // "strides" and "dilations" also need to be changed according to
+      // "data_format",
+      // in this case, is "NHWC" to "NCHW".
+
+      new_strides = {strides[NHWC::dim::N], strides[NHWC::dim::C],
+                     strides[NHWC::dim::H], strides[NHWC::dim::W]};
+
+      new_dilations = {dilations[NHWC::dim::N], dilations[NHWC::dim::C],
+                       dilations[NHWC::dim::H], dilations[NHWC::dim::W]};
+    }
+    nb->Attr("strides", new_strides);
+    nb->Attr("dilations", new_dilations);
+  }
+}
+
+// Used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D
+void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node,
+                                                  NodeBuilder* nb,
+                                                  bool change_format) {
+  DataType Tpaddings;
   DataType T;
   string data_format;
   string padding;
   std::vector<int32> strides;
   std::vector<int32> dilations;
+  bool use_cudnn_on_gpu;
 
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
@@ -3637,6 +1932,9 @@ void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -3644,10 +1942,46 @@ void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
   nb->Attr("dilations", dilations);
   nb->Attr("padding", padding);
   nb->Attr("data_format", data_format);
+  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+  nb->Attr("Tpaddings", Tpaddings);
+}
+
+// Used with MergePadWithConv2D
+void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1,
+                                                     const Node* orig_node2,
+                                                     NodeBuilder* nb,
+                                                     bool change_format) {
+  DataType Tpaddings;
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  bool use_cudnn_on_gpu;
+
+  // Get all attributes from old node 1.
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node1->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+  // Get all attributes from old node 2.
+  TF_CHECK_OK(GetNodeAttr(orig_node2->def(), "Tpaddings", &Tpaddings));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("dilations", dilations);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+  nb->Attr("Tpaddings", Tpaddings);
 }
 
-void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
-                                         NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb,
+                                         bool change_format) {
   DataType T;
   int N;
 
@@ -3661,7 +1995,8 @@ void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
-                                                NodeBuilder* nb) {
+                                                NodeBuilder* nb,
+                                                bool change_format) {
   DataType T;
   string data_format;
   std::vector<int32> strides;
@@ -3677,8 +2012,8 @@ void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
   nb->Attr("data_format", data_format);
 }
 
-void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
-                                        NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb,
+                                        bool change_format) {
   DataType T;
   int depth_radius;
   float bias;
@@ -3701,7 +2036,8 @@ void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
-                                            NodeBuilder* nb) {
+                                            NodeBuilder* nb,
+                                            bool change_format) {
   DataType T;
   string data_format;
   string padding;
@@ -3723,18 +2059,86 @@ void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
-                                             NodeBuilder* nb) {
+                                             NodeBuilder* nb,
+                                             bool change_format) {
+  DataType T;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+}
+
+void MklLayoutRewritePass::CopyAttrsQuantizedPooling(const Node* orig_node,
+                                                     NodeBuilder* nb,
+                                                     bool change_format) {
   DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> ksize, strides;
 
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
 
   // Add attributes to new node.
   nb->Attr("T", T);
+  nb->Attr("ksize", ksize);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+}
+
+void MklLayoutRewritePass::CopyAttrsQuantizedConv2D(const Node* orig_node,
+                                                    NodeBuilder* nb,
+                                                    bool change_format) {
+  DataType Tinput, Tfilter, out_type;
+  string padding;
+  string data_format("NHWC");
+  std::vector<int32> strides, dilations;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tinput", &Tinput));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tfilter", &Tfilter));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "out_type", &out_type));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+
+  // Add attributes to new node.
+  nb->Attr("Tinput", Tinput);
+  nb->Attr("Tfilter", Tfilter);
+  nb->Attr("out_type", out_type);
+  nb->Attr("padding", padding);
+  nb->Attr("strides", strides);
+  nb->Attr("dilations", dilations);
+  nb->Attr("T", out_type);  // added "T" for facilitating MklToTf conversion.
+  nb->Attr("data_format", data_format);
+  // Requantization attr Tbias
+  DataType Tbias;
+  Status bias_status = GetNodeAttr(orig_node->def(), "Tbias", &Tbias);
+  if (bias_status.ToString() == "OK") nb->Attr("Tbias", Tbias);
+}
+
+void MklLayoutRewritePass::CopyAttrsRequantize(const Node* orig_node,
+                                               NodeBuilder* nb,
+                                               bool change_format) {
+  DataType Tinput, out_type;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tinput", &Tinput));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "out_type", &out_type));
+
+  // Add attributes to new node.
+  nb->Attr("Tinput", Tinput);
+  nb->Attr("out_type", out_type);
 }
 
 void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                            NodeBuilder* nb) {
+                                            NodeBuilder* nb,
+                                            bool change_format) {
   DataType T;
   DataType Tshape;
 
@@ -3747,7 +2151,7 @@ void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsSlice(const Node* orig_node,
-                                          NodeBuilder* nb) {
+                                          NodeBuilder* nb, bool change_format) {
   DataType T;
   DataType Index;
 
@@ -3760,7 +2164,7 @@ void MklLayoutRewritePass::CopyAttrsSlice(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
-                                          NodeBuilder* nb) {
+                                          NodeBuilder* nb, bool change_format) {
   DataType T;
   string data_format;
   int num_split;
@@ -3777,7 +2181,8 @@ void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
-                                           NodeBuilder* nb) {
+                                           NodeBuilder* nb,
+                                           bool change_format) {
   DataType T;
   int N;
 
@@ -3791,7 +2196,8 @@ void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
-                                             NodeBuilder* nb) {
+                                             NodeBuilder* nb,
+                                             bool change_format) {
   DataType T;
   int N;
   DataType tidx;
@@ -3808,7 +2214,8 @@ void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
-                                                   NodeBuilder* nb) {
+                                                   NodeBuilder* nb,
+                                                   bool change_format) {
   DataType T;
   float epsilon;
   string data_format;
@@ -4024,6 +2431,165 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   return Status::OK();
 }
 
+Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
+                                                Node* m, Node* n) {
+  DCHECK(((m->type_string() == csinfo_.pad &&
+           n->type_string() == csinfo_.conv2d)) ||
+         ((n->type_string() == csinfo_.pad &&
+           m->type_string() == csinfo_.conv2d)));
+
+  // Conv2D is successor node, and Pad predecessor node.
+  Node* pred = m->type_string() == csinfo_.pad ? m : n;
+  Node* succ = m->type_string() == csinfo_.pad ? n : m;
+
+  // 1. Get all attributes from input nodes.
+  DataType T_pred, T_succ;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  string data_format_pred, data_format_succ;
+  bool use_cudnn_on_gnu;
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "dilations", &dilations));
+  // Data format for pad is not available and not necessary, thus
+  // dont need to match data format for Pad
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  // Check if the data types and devices of both succ and pred are the same.
+  // Assert is not used,  because it can be too strict.
+  // Don't need to check for data formats because it is not available in Pad.
+  if (T_pred != T_succ ||
+      pred->assigned_device_name() != succ->assigned_device_name() ||
+      pred->def().device() != succ->def().device()) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "T attribute or devices of Conv2D and "
+                  "Pad do not match. Will skip node merge optimization");
+  }
+
+  const int succ_num = succ->num_inputs();
+  gtl::InlinedVector<Node*, 4> succ_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
+  FillInputs(succ, &succ_control_edges, &succ_in);
+
+  const int pred_num = pred->num_inputs();
+  gtl::InlinedVector<Node*, 4> pred_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
+  FillInputs(pred, &pred_control_edges, &pred_in);
+
+  // We need to ensure that Pad only feeds to Conv2D (some other operator is
+  // not expecting output of Pad). If this is not the case, then we cannot
+  // merge Conv2D with Pad.
+  const int kFirstOutputSlot = 0;
+  for (const Edge* e : pred->out_edges()) {
+    if (e->src_output() == kFirstOutputSlot && e->dst() != succ) {
+      return Status(error::Code::INVALID_ARGUMENT,
+                    "Pad does not feed to Conv2D, or "
+                    "it feeds Conv2D but has multiple outputs. "
+                    "Will skip node merge optimization");
+    }
+  }
+
+  // 2. Get inputs from both the nodes.
+
+  // Pad must have 2 data inputs: "input" and paddings.
+  int PadDataInputEdges = 0;
+  for (const Edge* e : pred->in_edges()) {
+    if (!e->IsControlEdge()) {
+      PadDataInputEdges++;
+    }
+  }
+  DCHECK_EQ(PadDataInputEdges, 2);
+
+  // Conv2D must have 2 data inputs: pad output and Filter
+  int ConvDataInputEdges = 0;
+  for (const Edge* e : succ->in_edges()) {
+    if (!e->IsControlEdge()) {
+      ConvDataInputEdges++;
+    }
+  }
+  DCHECK_EQ(ConvDataInputEdges, 2);
+
+  // We will use the node name of Conv2D as the name of new node
+  // Build new node. We use same name as original node, but change the op
+  // name.
+  NodeBuilder nb(succ->name(), csinfo_.pad_with_conv2d);
+  nb.Input(pred_in[0].first, pred_in[0].second);  // In1 (input data)  of Pad
+  // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
+  nb.Input(succ_in[1].first, succ_in[1].second);  // In2 (filter) of conv2d
+  // In1 of Conv2D is same as output of Pad.
+  // Thus, only need to add In2 of Conv2D
+  nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
+
+  // Copy attributes from Pad and conv2D to PadWithConv2D.
+  CopyAttrsFromPadAndConv2D(const_cast<const Node*>(succ),
+                            const_cast<const Node*>(pred), &nb);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(succ->def().device());
+
+  // Create node.
+  Node* new_node;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  DCHECK(new_node);
+
+  // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
+  // node are already copied in BuildNode.
+  // We handle control edges now.
+  for (const Edge* e : pred->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Don't allow duplicate edge
+      (*g)->AddControlEdge(e->src(), new_node, false);
+    }
+  }
+  for (const Edge* e : succ->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Don't allow duplicate edge
+      (*g)->AddControlEdge(e->src(), new_node, false);
+    }
+  }
+
+  // Incoming edges are fixed, we will fix the outgoing edges now.
+  // First, we will fix outgoing control edges from 'pred' node.
+  for (const Edge* e : pred->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Don't allow duplicate edge
+      (*g)->AddControlEdge(new_node, e->dst(), false);
+    }
+  }
+
+  // Second, we will fix outgoing control and data edges from 'succ' node.
+  for (const Edge* e : succ->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      (*g)->AddControlEdge(new_node, e->dst(), false);
+    } else {
+      // Conv2D has only 1 output (at slot 0) and merged node also has only 1
+      // output (at slot 0).
+      const int kPadWithConv2DOutputSlot = 0;
+      (*g)->AddEdge(new_node, kPadWithConv2DOutputSlot, e->dst(),
+                    e->dst_input());
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  // It's ok to use pred or succ as we have enforced a check that
+  // both have same device assigned.
+  new_node->set_assigned_device_name(pred->assigned_device_name());
+
+  VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
+          << ", and node: " << succ->DebugString()
+          << ", into node:" << new_node->DebugString();
+
+  (*g)->RemoveNode(succ);
+  (*g)->RemoveNode(pred);
+
+  return Status::OK();
+}
+
 Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
     std::unique_ptr<Graph>* g, Node* m, Node* n) {
   CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad &&
@@ -4157,6 +2723,12 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
         m->type_string() == csinfo_.conv2d))) {
     return this->MergeConv2DWithBiasAdd(g, m, n);
   }
+  if (((m->type_string() == csinfo_.pad &&
+        n->type_string() == csinfo_.conv2d)) ||
+      ((n->type_string() == csinfo_.pad &&
+        m->type_string() == csinfo_.conv2d))) {
+    return this->MergePadWithConv2D(g, m, n);
+  }
 
   if (((m->type_string() == csinfo_.bias_add_grad &&
         n->type_string() == csinfo_.conv2d_grad_filter)) ||
@@ -4205,10 +2777,18 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
     return s;
   }
 
-  ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
-  // Set the Mkl layer label for this op.
-  nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
+  const bool kPartialCopyAttrs = false;
+  ri->copy_attrs(const_cast<const Node*>(orig_node), &nb, kPartialCopyAttrs);
 
+  // Set the Mkl layer label for this op.
+  if (DataTypeIsQuantized(orig_node->input_type(0)) ||
+      DataTypeIsQuantized(orig_node->output_type(0))) {
+#ifdef INTEL_MKL_QUANTIZED
+    nb.Attr("_kernel", mkl_op_registry::kMklQuantizedOpLabel);
+#endif
+  } else {
+    nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
+  }
   // Finalize graph and get new node.
   Node* new_node = nullptr;
   TF_CHECK_OK(nb.Finalize(&**g, &new_node));
@@ -4254,10 +2834,38 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   return Status::OK();
 }
 
+// TODO(mdfaijul): Is there any other elegent way to check for quantized ops
+// having attributes other than "T"?
+// Current implementation reflects only QuantizedConv2D and its fused Ops.
+const MklLayoutRewritePass::RewriteInfo*
+MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const {
+#ifdef INTEL_MKL_QUANTIZED
+  DataType Tinput, Tfilter;
+  if (!(GetNodeAttr(n->def(), "Tinput", &Tinput).ok() &&
+        GetNodeAttr(n->def(), "Tfilter", &Tfilter).ok())) {
+    return nullptr;
+  }
+  if (mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
+                               Tinput, Tfilter)) {
+    for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
+      if (n->type_string().compare(ri->name) == 0 && ri->rewrite_rule(n)) {
+        return &*ri;
+      }
+    }
+  }
+#endif
+  return nullptr;
+}
+
 const MklLayoutRewritePass::RewriteInfo*
 MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   CHECK_NOTNULL(n);
 
+  // QuntizedOps may have attributes other than "T", so decoupled the check
+  // with a function, CheckForQuantizedNodeRewrite(const Node*).
+  const RewriteInfo* ri = CheckForQuantizedNodeRewrite(n);
+  if (ri != nullptr) return ri;
+
   // First check if node along with its type is supported by MKL layer.
   // We do not want to rewrite an op into Mkl op if types are not supported.
   // E.g., MklRelu does not support INT32. So we cannot rewrite Relu to
@@ -4267,10 +2875,11 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
     return nullptr;
   }
 
-  // We make an exception for __MklDummyConv2DWithBias and
-  // __MklConv2DBackpropFilterWithBias since their names do not match Mkl node
-  // names.
+  // We make an exception for __MklDummyConv2DWithBias,
+  // __MklConv2DBackpropFilterWithBias, and __MklDummyPadWithConv2D since their
+  // names do not match Mkl node names.
   if (n->type_string() != csinfo_.conv2d_with_bias &&
+      n->type_string() != csinfo_.pad_with_conv2d &&
       n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
       !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
                                 T)) {
@@ -4330,27 +2939,165 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   return nullptr;
 }
 
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions for node fusion
+//////////////////////////////////////////////////////////////////////////
+Status MklLayoutRewritePass::FuseTransposeMklOpTranspose(
+    std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+    std::function<void(const Node*, NodeBuilder* nb, bool)> copy_attrs,
+    string data_format) {
+  Node* transpose_to_nhwc = nodes[0];
+  Node* mklop = nodes[1];
+  Node* transpose_to_nchw = nodes[2];
+
+  const int transpose_nhwc_num_inputs = transpose_to_nhwc->num_inputs();
+  gtl::InlinedVector<Node*, 4> transpose_nhwc_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> transpose_nhwc_in(
+      transpose_nhwc_num_inputs);
+  FillInputs(transpose_to_nhwc, &transpose_nhwc_control_edges,
+             &transpose_nhwc_in);
+
+  const int mklop_num_inputs = mklop->num_inputs();
+  gtl::InlinedVector<Node*, 4> mklop_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> mklop_in(mklop_num_inputs);
+  FillInputs(mklop, &mklop_control_edges, &mklop_in);
+
+  const int transpose_nchw_num_inputs = transpose_to_nchw->num_inputs();
+  gtl::InlinedVector<Node*, 4> transpose_nchw_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> transpose_nchw_in(
+      transpose_nchw_num_inputs);
+  FillInputs(transpose_to_nchw, &transpose_nchw_control_edges,
+             &transpose_nchw_in);
+
+  // We use same name as original node, but change the op
+  // type.
+  NodeBuilder nb(mklop->name(), mklop->type_string());
+
+  // Storing the output slots of the input nodes.
+  for (int i = 0; i < mklop_num_inputs; i++) {
+    if (mklop_in[i].first == transpose_to_nhwc) {
+      // Fill "x":
+      nb.Input(transpose_nhwc_in[0].first, transpose_nhwc_in[0].second);
+    } else {
+      // Fill inputs other than "x":
+      nb.Input(mklop_in[i].first, mklop_in[i].second);
+    }
+  }
+
+  copy_attrs(const_cast<const Node*>(mklop), &nb, true);
+  nb.Attr("data_format", data_format);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(mklop->def().device());
+
+  // Create node.
+  Node* new_node;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  DCHECK(new_node);
+
+  // Fill outputs.
+  for (const Edge* e : transpose_to_nchw->out_edges()) {
+    if (!e->IsControlEdge()) {
+      const int kTransposeWithMklOpOutputSlot = 0;
+      DCHECK((*g)->AddEdge(new_node, kTransposeWithMklOpOutputSlot, e->dst(),
+                           e->dst_input()));
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  new_node->set_assigned_device_name(mklop->assigned_device_name());
+
+  // Copy requested_device and assigned_device_name_index
+  new_node->set_requested_device(mklop->requested_device());
+  new_node->set_assigned_device_name_index(mklop->assigned_device_name_index());
+
+  (*g)->RemoveNode(transpose_to_nhwc);
+  (*g)->RemoveNode(mklop);
+  (*g)->RemoveNode(transpose_to_nchw);
+
+  return Status::OK();
+}
+
+Status MklLayoutRewritePass::FuseNode(
+    std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+    const MklLayoutRewritePass::FusionInfo fi) {
+  return fi.fuse_func(g, nodes, fi.copy_attrs);
+}
+
+std::tuple<bool, std::vector<Node*>, const MklLayoutRewritePass::FusionInfo>
+MklLayoutRewritePass::CheckForNodeFusion(Node* a) const {
+  // Stores matched nodes, in the same order as node_checkers.
+  std::vector<Node*> nodes;
+
+  for (auto fi = finfo_.begin(); fi != finfo_.end(); ++fi) {
+    //
+    // Make sure node "a" and its succeding nodes (b, c ...), match the pattern
+    // defined in fusion info (ops[0], ops[1], ...),
+    // a.k.a. "a->b->c" matches "op1->op2->op3"
+    //
+
+    // Stores the first unvisted outgoing edge of each matched node in "nodes".
+    std::stack<EdgeSet::const_iterator> current_neighbor_stack;
+    nodes.clear();
+
+    auto node_checker = fi->node_checkers.begin();
+    if (a != nullptr && (*node_checker)(a)) {
+      nodes.push_back(a);
+      current_neighbor_stack.push(a->out_edges().begin());
+      ++node_checker;
+    }
+
+    while (!nodes.empty()) {
+      auto& current_neighbor_iter = current_neighbor_stack.top();
+
+      if (current_neighbor_iter != nodes.back()->out_edges().end()) {
+        // Found an unvisited edge. Goes through the edge to get the neighbor.
+        Node* neighbor_node = (*current_neighbor_iter)->dst();
+        ++current_neighbor_stack.top();  // Retrieves the next unvisited edge.
+
+        if ((*node_checker)(neighbor_node)) {
+          // Found a match. Stores the node and moves to the next checker.
+          nodes.push_back(neighbor_node);
+          current_neighbor_stack.push(neighbor_node->out_edges().begin());
+          if (++node_checker == fi->node_checkers.end()) {
+            return make_tuple(true, nodes, *fi);
+          }
+        }
+      } else {
+        // Removes the current node since none of its neighbor leads to a
+        // further match.
+        nodes.pop_back();
+        current_neighbor_stack.pop();
+        --node_checker;
+      }
+    }
+  }
+
+  return make_tuple(false, std::vector<Node*>(), FusionInfo());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 //              Post-rewrite Mkl metadata fixup pass
 ///////////////////////////////////////////////////////////////////////////////
 bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g,
-    const Edge* e_data, const Edge* e_metadata) {
+                                                      const Edge* e_data,
+                                                      const Edge* e_metadata) {
   if (g == nullptr || e_data == nullptr || e_metadata == nullptr) {
     return false;
   }
 
   Node* n_data = e_data->src();
   int n_data_op_slot = e_data->src_output();
-  int n_metadata_op_slot = GetTensorMetaDataIndex(n_data_op_slot,
-                                                  n_data->num_outputs());
+  int n_metadata_op_slot =
+      GetTensorMetaDataIndex(n_data_op_slot, n_data->num_outputs());
 
   // If the source of meta edge is a constant node (producing dummy Mkl metadata
   // tensor), then we will need to fix.
   if (IsConstant(e_metadata->src())) {
     Node* e_metadata_dst = e_metadata->dst();
     int e_metadata_in_slot = e_metadata->dst_input();
-    CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot,
-                  e_metadata_dst, e_metadata_in_slot));
+    CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot, e_metadata_dst,
+                                e_metadata_in_slot));
 
     (*g)->RemoveEdge(e_metadata);
     return true;
@@ -4360,7 +3107,7 @@ bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr<Graph>* g,
 }
 
 bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr<Graph>* g,
-    Node* n) {
+                                               Node* n) {
   bool result = false;
 
   // If graph node is not Mkl node, then return.
@@ -4401,8 +3148,8 @@ bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr<Graph>* g,
       // Let's get edge that carries Mkl metadata corresponding to Mkl data edge
       // 'e'. For that, let's first get the input slot of 'n' where the meta
       // edge will feed the value.
-      int e_meta_in_slot = GetTensorMetaDataIndex(e->dst_input(),
-                                                  n->num_inputs());
+      int e_meta_in_slot =
+          GetTensorMetaDataIndex(e->dst_input(), n->num_inputs());
       const Edge* e_meta = nullptr;
       TF_CHECK_OK(n->input_edge(e_meta_in_slot, &e_meta));
 
@@ -4454,6 +3201,30 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
 
   DumpGraph("After running MklLayoutRewritePass(NodeMerge)", &**g);
 
+#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
+  order.clear();
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+
+    auto check_result = CheckForNodeFusion(n);
+    bool found_pattern = std::get<0>(check_result);
+    std::vector<Node*> nodes = std::get<1>(check_result);
+    const FusionInfo fi = std::get<2>(check_result);
+
+    // if "found_pattern" is true, we can do the fusion.
+    if (found_pattern) {
+      if (FuseNode(g, nodes, fi) == Status::OK()) {
+        result = true;
+      }
+    }
+  }
+  DumpGraph("After running MklLayoutRewritePass(NodeFusion)", &**g);
+#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
+
   order.clear();
   GetReversePostOrder(**g, &order);  // This will give us topological sort.
   for (Node* n : order) {
@@ -4539,7 +3310,7 @@ Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) {
 
   return Status::OK();
 }
-#endif  // INTEL_MKL_ML_ONLY
+
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 77640e287cf75ecbb73f5ae03e7493fa7e3de002..04c4b85d64d63f275a08abb86d7bf3393398dc67 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -37,1869 +37,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-#ifdef INTEL_MKL_ML_ONLY
-
-namespace {
-
-const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
-const char kGPUDevice[] = "/job:a/replica:0/task:0/device:GPU:0";
-
-static void InitGraph(const string& s, Graph* graph,
-                      const string& device = kCPUDevice) {
-  GraphDef graph_def;
-
-  auto parser = protobuf::TextFormat::Parser();
-  //  parser.AllowRelaxedWhitespace(true);
-  CHECK(parser.MergeFromString(s, &graph_def)) << s;
-  GraphConstructorOptions opts;
-  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
-
-  for (Node* node : graph->nodes()) {
-    node->set_assigned_device_name(device);
-  }
-}
-
-class MklLayoutPassTest : public ::testing::Test {
- public:
-  MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
-
-  void InitGraph(const string& s, const string& device = kCPUDevice) {
-    ::tensorflow::InitGraph(s, &graph_, device);
-    original_ = CanonicalGraphString(&graph_);
-  }
-
-  static bool IncludeNode(const Node* n) { return n->IsOp(); }
-
-  static string EdgeId(const Node* n, int index) {
-    if (index == 0) {
-      return n->name();
-    } else if (index == Graph::kControlSlot) {
-      return strings::StrCat(n->name(), ":control");
-    } else {
-      return strings::StrCat(n->name(), ":", index);
-    }
-  }
-
-  string CanonicalGraphString(Graph* g) {
-    std::vector<string> nodes;
-    std::vector<string> edges;
-    for (const Node* n : g->nodes()) {
-      if (IncludeNode(n)) {
-        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
-      }
-    }
-    for (const Edge* e : g->edges()) {
-      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
-        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
-                                        EdgeId(e->dst(), e->dst_input())));
-      }
-    }
-    // Canonicalize
-    std::sort(nodes.begin(), nodes.end());
-    std::sort(edges.begin(), edges.end());
-    return strings::StrCat(str_util::Join(nodes, ";"), "|",
-                           str_util::Join(edges, ";"));
-  }
-
-  string DoMklLayoutOptimizationPass() {
-    string before = CanonicalGraphString(&graph_);
-    LOG(ERROR) << "Before MKL layout rewrite pass: " << before;
-
-    std::unique_ptr<Graph>* ug = new std::unique_ptr<Graph>(&graph_);
-    RunMklLayoutRewritePass(ug);
-
-    string result = CanonicalGraphString(&graph_);
-    LOG(ERROR) << "After MKL layout rewrite pass:  " << result;
-    return result;
-  }
-
-  const string& OriginalGraph() const { return original_; }
-
-  Graph graph_;
-  string original_;
-};
-
-REGISTER_OP("Input").Output("o: float").SetIsStateful();
-REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
-REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
-REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
-REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
-REGISTER_OP("_MklInput2")
-    .Output("o: uint8")
-    .Output("o1: uint8")
-    .SetIsStateful();
-
-/////////////////////////////////////////////////////////////////////
-//  Unit tests related to node merge optiimization
-/////////////////////////////////////////////////////////////////////
-
-TEST_F(MklLayoutPassTest, Basic) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Zeta);D(Zeta)|"
-            "A->C;A->D;B->C:1;B->D:1");
-}
-
-// Test set 1: Conv2D + AddBias
-
-// C=_MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Zeta(E,Y) (for interleaved ordering)
-// C=_MklConv2D(A,B,M,N); E=BiasAdd(C,D); Z=Zeta(E,Y) (for contiguous ordering)
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
-            "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->E;"
-            "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;M->E:3;"
-            "N->E:4;Y->Z:1");
-}
-
-// C=_MklConv2D(A,M:1,B,N:1); E=BiasAdd(C,D); Z=Zeta(E,Y) (for interleaved)
-// C=_MklConv2D(A,B,M:1,N:1); E=BiasAdd(C,D); Z=Zeta(E,Y) (for contiguous)
-// Test for correct output slots selected
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive1) {
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput2'}"
-      "node { name: 'N' op: '_MklInput2'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M:1', 'N:1']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);D(Input);DMT/_0(Const);E(_MklConv2DWithBias);"
-            "M(_MklInput2);N(_MklInput2);Y(Input);Z(Zeta)|A->E;"
-            "A:control->DMT/_0:control;B->E:1;D->E:2;DMT/_0->E:5;E->Z;"
-            "M:1->E:3;N:1->E:4;Y->Z:1");
-}
-
-// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y);
-// This is a case of node rewrite followed by node merge.
-// We will first rewrite Conv2D to _MklConv2D, and then merge _MklConv2D
-// with BiasAdd to produce _MklConv2DWithBias.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive2) {
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Zeta)|"
-            "A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;"
-            "DMT/_2->E:5;E->Z;Y->Z:1");
-}
-
-// Graph contains only _MklConv2D, no AddBias.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);M(_MklInput);N(_MklInput)|"
-            "A->C;B->C:1;M->C:2;N->C:3");
-}
-
-// _MklConv2D output does not go to BiasAdd.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D', 'E'] }");  // Output of _MklConv2D does not go to BiasAdd.
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Input);E(Input);F(BiasAdd);"
-            "M(_MklInput);N(_MklInput)|A->C;B->C:1;D->F;E->F:1;M->C:2;N->C:3");
-}
-
-// _MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Zeta).
-// Merge should not be done in such case.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D', 'E'] }"  // Conv2D has two outputs.
-                              // No merge should happen.
-      "node { name: 'G' op: 'Zeta'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['C', 'E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Input);E(Input);F(BiasAdd);"
-            "G(Zeta);M(_MklInput);N(_MklInput)|A->C;B->C:1;C->G;D->F;"
-            "E->F:1;E->G:1;M->C:2;N->C:3");
-}
-
-// data_format attribute value mismatch. Merge should not be done
-// in such case.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NHCW' } }"
-      " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);M(_MklInput);"
-            "N(_MklInput)|A->C;B->C:1;C->E;D->E:1;M->C:2;N->C:3");
-}
-
-// Test set 2: _MklConv2D..BiasAddGrad -> _MklConv2DWithBiasBackpropBias
-// rewrite tests
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
-// and BackpropInput
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'Int32Input'}"
-      "node { name: 'I' op: '_MklConv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
-      "node { name: 'J' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
-            "I(_MklConv2DBackpropInput);J(_MklConv2DWithBiasBackpropBias);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G;B->D:1;"
-            "B->I:1;C->D:2;D->E;DMT/_0->J:1;E->G:2;E->I:2;E->J;"
-            "E:control->DMT/_0:control;F->G:1;H->I;M->D:3;M->G:3;M->I:3;"
-            "N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
-// and BackpropInput. But nodes do not match criteria for rewrite. So
-// rewrite should not happen.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'Int32Input'}"
-      "node { name: 'I' op: '_MklConv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
-      "node { name: 'J' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
-            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
-            "B->I:1;C->D:2;D->E;E->G;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
-            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter
-// and BackpropInput. But nodes do not match criteria for rewrite. So
-// rewrite should not happen.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'Int32Input'}"
-      "node { name: 'I' op: '_MklConv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['H', 'B', 'E', 'M', 'N', 'O']}"
-      "node { name: 'J' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(Int32Input);"
-            "I(_MklConv2DBackpropInput);J(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
-            "B->I:1;C->D:2;D->E;E->G:2;E->I:2;E->J;F->G:1;H->I;M->D:3;M->G:3;"
-            "M->I:3;N->D:4;N->G:4;N->I:4;O->D:5;O->G:5;O->I:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);DMT/_0(Const);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);"
-            "H(_MklConv2DWithBiasBackpropBias);M(_MklInput);N(_MklInput);"
-            "O(_MklInput)|A->D;A->E:1;A->G;B->D:1;C->D:2;D->E;DMT/_0->H:1;"
-            "E->G:2;E->H;E:control->DMT/_0:control;F->G:1;M->D:3;M->G:3;"
-            "N->D:4;N->G:4;O->D:5;O->G:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
-// But BackpropFilter node inputs do not satisfy criteria for rewrite.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
-            "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
-            "O->G:5");
-}
-
-// BiasAddGrad rewrite to BackpropBias in the presence of BackpropFilter only
-// But BackpropFilter node inputs do not satisfy criteria for rewrite.
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_BpropFilter_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['B', 'A', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'Int32Input'}"
-      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'F', 'E', 'M', 'N', 'O'] }"
-      "node { name: 'H' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D:1;A->E:1;A->G;B->D;"
-            "C->D:2;D->E;E->G:2;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
-            "O->G:5");
-}
-
-// No _MklConv2DWithBias in context, but _MklConv2D in context.
-// No rewrite for BiasAddGrad should happen.
-// C=_MklConv2D(A,M,B,N); D=Zeta(C,A); E=BiasAddGrad(D) (for interleaved)
-// C=_MklConv2D(A,B,M,N); D=Zeta(C,A); E=BiasAddGrad(D) (for contiguous)
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Neg_NoMklConv2DWithBias) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
-            "M->C:2;N->C:3");
-}
-
-// No Conv2D in the context for BiasAddGrad. No rewrite should happen.
-// C=Polygamma(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Polygamma'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// No Conv2D in the context for BiasAddGrad, but MatMul in context.
-// Rewrite should happen, but name of BiasAddGrad does not change.
-// C=MatMul(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_Negative_NoConv2D_MatMul) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'MatMul'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'transpose_a'      value { b: false } }"
-      " attr { key: 'transpose_b'      value { b: false } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests
-// C=MatMul(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
-TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'MatMul'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'transpose_a'      value { b: false } }"
-      " attr { key: 'transpose_b'      value { b: false } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-// No MatMul in the context for BiasAddGrad. No rewrite should happen.
-// C=Polygamma(A,B); D=Zeta(C,A); E=BiasAddGrad(D)
-TEST_F(MklLayoutPassTest, NodeMerge_MatMulBiasAddGrad_Negative_NoMatMul) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Polygamma'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
-}
-
-/////////////////////////////////////////////////////////////////////
-//  Unit tests related to rewriting node to Mkl node
-/////////////////////////////////////////////////////////////////////
-
-// Single Conv2D Op; No Mkl layer on the input and on the output.
-// We will generate dummy Mkl tensor as 2nd input of Conv2D.
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
-            "DMT/_1->C:3");
-}
-
-// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
-// have 2 outputs, both of which will be inputs to next Conv2D.
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'C']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->C;A->D;"
-            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;"
-            "C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2");
-}
-
-// Conv2D with INT32 which is not supported by Mkl
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
-  InitGraph(
-      "node { name: 'A' op: 'HalfInput'}"
-      "node { name: 'B' op: 'HalfInput'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_HALF } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
-      " input: ['B', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(HalfInput);B(HalfInput);C(Conv2D);D(Zeta)|"
-            "A->C;B->C:1;B->D;C->D:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Int32Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Conv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
-            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
-            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
-            "DMT/_1->D:4;DMT/_2->D:5");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Int32Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Conv2DBackpropInput'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['B', 'A', 'C']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
-            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
-            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
-            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
-            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
-}
-
-// Concat Op test: Concat with no Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
-  InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'B' op: 'InputList'"
-      " attr { key: 'N'                value { i: 2 } }}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Concat'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['A', 'B:0', 'B:1']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }");
-  EXPECT_EQ(
-      DoMklLayoutOptimizationPass(),
-      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
-      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
-      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
-}
-
-// Concat with 2 Mkl layers feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'Concat'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['G', 'E', 'F']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
-            "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
-            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
-            "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;"
-            "G:control->DMT/_4:control;H->I:1");
-}
-
-// Concat with 1 Mkl and 1 non-Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'Concat'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['G', 'E', 'F']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
-            "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
-            "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;"
-            "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
-}
-
-// ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
-  InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'B' op: 'InputList'"
-      " attr { key: 'N'                value { i: 2 } }}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'ConcatV2'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['B:0', 'B:1', 'A']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D:2;B->D;B:1->D:1;"
-            "B:control->DMT/_0:control;B:control->DMT/_1:control;"
-            "B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;"
-            "DMT/_1->D:4;DMT/_2->D:5");
-}
-
-// ConcatV2 with 2 Mkl layers feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'ConcatV2'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['E', 'F', 'G']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
-            "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
-            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
-            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
-            "DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;"
-            "F:2->H:4;G->H:2;H->I:1");
-}
-
-// ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'ConcatV2'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['E', 'F', 'G']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
-            "H(_MklConcatV2);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
-            "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;"
-            "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;"
-            "G->H:2;H->I:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Relu'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;"
-            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'ReluGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Relu'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'ReluGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
-            "DMT/_1->C:2");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'AvgPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklAvgPool);C(Zeta);DMT/_0(Const)|A->B;A->C;"
-            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Int32Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'AvgPoolGrad' "
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
-            "DMT/_1->C:3");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'I' op: 'Int32Input'}"
-      "node { name: 'B' op: 'AvgPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'AvgPoolGrad' "
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['I', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;"
-            "B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;"
-            "I:control->DMT/_1:control");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'FusedBatchNormGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'epsilon'      value { f: 0.0001 } }"
-      " attr { key: 'is_training'  value { b: true } }"
-      " input: ['A', 'B', 'C', 'D', 'E'] }"
-      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
-            "F(_MklFusedBatchNormGrad);G(Zeta)|A->F;A->G;"
-            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
-            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
-            "E->F:4;F->G:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'FusedBatchNorm'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'epsilon'      value { f: 0.0001 } }"
-      " attr { key: 'is_training'  value { b: true } }"
-      " input: ['A', 'B', 'C', 'D', 'E'] }"
-      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
-            "F(_MklFusedBatchNorm);G(Zeta)|A->F;A->G;"
-            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
-            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
-            "E->F:4;F->G:1");
-}
-
-/////////////////////////////////////////////////////////////////////
-//  Unit tests related to rewriting node for workspace edges
-/////////////////////////////////////////////////////////////////////
-
-/* Test LRN->MaxPool->MaxPoolGrad->LRNGrad replacement by workspace nodes. */
-TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'LRN'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['B'] }"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'MaxPoolGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['B', 'C', 'D'] }"
-      "node { name: 'F' op: 'Input'}"
-      "node { name: 'G' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['E', 'F', 'B'] }"
-      "node { name: 'H' op: 'Input'}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['H', 'G'] }");
-  EXPECT_EQ(
-      DoMklLayoutOptimizationPass(),
-      "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
-      "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
-      "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
-      "B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;"
-      "C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;"
-      "E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I");
-}
-
-/* Test LRN->LRNGrad replacement by workspace nodes. */
-TEST_F(MklLayoutPassTest, LRN_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'LRN'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['C', 'D', 'B'] }"
-      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);E(_MklLRNGrad);F(Zeta)|"
-            "A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;"
-            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
-            "D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
-}
-
-/* Test LRN->LRNGrad replacement when only one of them is present. */
-TEST_F(MklLayoutPassTest, LRN_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'LRN'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklLRN);C(Zeta);DMT/_0(Const)|"
-            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
-}
-
-/* Test LRN->LRNGrad replacement when only one of them is present. */
-TEST_F(MklLayoutPassTest, LRN_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A', 'B', 'C'] }"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
-            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
-            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
-}
-
-/* Test LRN->LRNGrad negative case, where single LRN feeds
-   2 LRNGrad nodes at different slots. */
-TEST_F(MklLayoutPassTest, LRN_Negative3) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'LRN'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['C', 'D', 'B'] }"
-      "node { name: 'F' op: 'LRNGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'alpha'        value { f: 0.001 } }"
-      " attr { key: 'beta'         value { f: 0.75 } }"
-      " attr { key: 'bias'         value { f: 1.0 } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'depth_radius' value { i: 2 } }"
-      " input: ['C', 'B', 'D'] }"
-      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['E', 'F'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);"
-            "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Zeta)|A->B;"
-            "A:control->DMT/_0:control;B->E:2;"
-            "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;"
-            "C:control->DMT/_1:control;C:control->DMT/_2:control;"
-            "C:control->DMT/_3:control;C:control->DMT/_4:control;"
-            "C:control->DMT/_5:control;C:control->DMT/_6:control;"
-            "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
-            "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1");
-}
-
-/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'MaxPoolGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['C', 'B', 'D'] }"
-      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'E'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Zeta)|"
-            "A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;"
-            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
-            "D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
-}
-
-// Test MaxPool>MaxPoolGrad replacement when only one of them is present.
-// In this case, we will rewrite MaxPool node but workspace edges will not
-// be present.
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklMaxPool);C(Zeta);DMT/_0(Const)|"
-            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
-}
-
-// Test MaxPoolGrad replacement when only one of them is present.
-// In this case, we will rewrite MaxPoolGrad and for workspace tensor and
-// its Mkl part, we will generate dummy tensor.
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'MaxPoolGrad'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
-      " input: ['A', 'B', 'C'] }"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);"
-            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
-            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
-            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
-            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
-}
-
-// Test MaxPool handling for batch-wise pooling (NCHW)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for batch-wise pooling (NCHW)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for depth-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:2, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for depth-wise pooling (NCHW)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:2, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for batch-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for batch-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for depth-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:2} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Test MaxPool handling for depth-wise pooling (NHWC)
-// No rewrite should take place in such case
-TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:2} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-/////////////////////////////////////////////////////////////////////
-
-// Single Conv2D Op on GPU device
-// No rewrite should happen
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Conv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['B', 'C'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'O' op: '_MklInput'}"
-      "node { name: 'D' op: '_MklConv2DWithBias'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
-      "node { name: 'E' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['D', 'A']}"
-      "node { name: 'F' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['E'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
-            "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
-            "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;"
-            "M->D:3;N->D:4;O->D:5");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Int32Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Conv2DBackpropFilter'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'C']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
-            "A->D;A->E;B->D:1;C->D:2;D->E:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Relu'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'ReluGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }"
-      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'C'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'MaxPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'AvgPool'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NHWC' } }"
-      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'      value { s: 'VALID' } }"
-      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A'] }"
-      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'B'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
-}
-
-// Concat Op test: Concat with no Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'B' op: 'InputList'"
-      " attr { key: 'N'                value { i: 2 } }}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Concat'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['A', 'B:0', 'B:1']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
-            "B->D:1;B:1->D:2;C->E;D->E:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'B' op: 'InputList'"
-      " attr { key: 'N'                value { i: 2 } }}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'ConcatV2'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['B:0', 'B:1', 'A']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
-            "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Input'}"
-      "node { name: 'F' op: 'FusedBatchNorm'"
-      " attr { key: 'T'            value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'  value { s: 'NCHW' } }"
-      " attr { key: 'epsilon'      value { f: 0.0001 } }"
-      " attr { key: 'is_training'  value { b: true } }"
-      " input: ['A', 'B', 'C', 'D', 'E'] }"
-      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'F'] }",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);E(Input);"
-            "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
-            "E->F:4;F->G:1");
-}
-
-TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
-  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-  InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
-      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'BiasAdd'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['C', 'D'] }"
-      "node { name: 'Y' op: 'Input'}"
-      "node { name: 'Z' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['E', 'Y']}",
-      kGPUDevice);
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
-            "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
-            "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
-}
-
-/////////////////////////////////////////////////////////////////////
-
-static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
-  testing::StopTiming();
-  string s;
-  for (int in = 0; in < 10; in++) {
-    s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
-  }
-  random::PhiloxRandom philox(301, 17);
-  random::SimplePhilox rnd(&philox);
-  for (int op = 0; op < op_nodes; op++) {
-    s += strings::Printf(
-        "node { name: 'op%04d' op: 'Zeta' attr { key: 'T' value { "
-        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
-        op, rnd.Uniform(10), rnd.Uniform(10));
-  }
-
-  bool first = true;
-  while (iters > 0) {
-    Graph* graph = new Graph(OpRegistry::Global());
-    InitGraph(s, graph);
-    int N = graph->num_node_ids();
-    if (first) {
-      testing::SetLabel(strings::StrCat("Per graph node.  Nodes: ", N));
-      first = false;
-    }
-    {
-      testing::StartTiming();
-      std::unique_ptr<Graph> ug(graph);
-      RunMklLayoutRewritePass(&ug);
-      testing::StopTiming();
-    }
-    iters -= N;  // Our benchmark units are individual graph nodes,
-                 // not whole graphs
-    // delete graph;
-  }
-}
-BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
-
-}  // namespace
-
-#else  // INTEL_MKL_ML_ONLY
-
 // NOTE: Unit tests in this file rely on a topological sorted graph for
 // printing. But since sibling nodes of a node in the topologically sorted graph
 // can be printed in different orders, tests may fail if the order in which
@@ -1928,6 +65,13 @@ static void InitGraph(const string& s, Graph* graph,
 class MklLayoutPassTest : public ::testing::Test {
  public:
   MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
+  // Ashraf added
+  Node* FindNode(const string& name) {
+    for (Node* node : graph_.nodes()) {
+      if (node->name() == name) return node;
+    }
+    LOG(FATAL) << name;
+  }
 
   void InitGraph(const string& s, const string& device = kCPUDevice) {
     ::tensorflow::InitGraph(s, &graph_, device);
@@ -1994,6 +138,8 @@ REGISTER_OP("_MklInput2")
     .Output("o: uint8")
     .Output("o1: uint8")
     .SetIsStateful();
+REGISTER_OP("Output2").Input("i: float").Input("i1: float").SetIsStateful();
+REGISTER_OP("Output").Input("i: float").SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to node merge optiimization
@@ -2318,6 +464,559 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
             "E:3->G:4;F->G;F:control->DMT/_3:control;G->Z;X->Y:1;X->Z:1");
 }
 
+// Test set 3: Pad + Conv2D fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// After layout pass
+// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+// Test if input control edges do not duplicate after merge.
+// If both the merging ops have input control edge from a common op
+// then, the merged op will have only one control edge from that
+// common op.
+// padding is VALID type
+// A = input(image), A1 = input, B = input(paddings),
+// C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// A1:control->C:control
+// A1:control->E:control
+// After layout pass:
+// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2)
+// A1:control->E:control (only one control edge)
+TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A1' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  Node* a1 = FindNode("A1");
+  Node* c = FindNode("C");
+  Node* e = FindNode("E");
+  const Edge* edge = graph_.AddControlEdge(a1, c);
+  const Edge* edge_1 = graph_.AddControlEdge(a1, e);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;"
+      "A1:control->E:control;A:control->DMT/_0:control;A:control->DMT/"
+      "_1:control;"
+      "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;"
+      "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+// Test if output control edges does not duplicate after merge.
+// If both the merging ops have output control edge to a common op,
+// then after merge, the merged op will have only one control edge
+// to that commom op.
+// padding is VALID type
+// A = input(image), B = input(paddings), C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// C:control->A1:control
+// E:control->A1:control
+// After layout pass:
+// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2)
+// E:control->A1:control (only one control edge)
+TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A1' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  Node* a1 = FindNode("A1");
+  Node* c = FindNode("C");
+  Node* e = FindNode("E");
+  const Edge* edge = graph_.AddControlEdge(c, a1);
+  const Edge* edge_1 = graph_.AddControlEdge(e, a1);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;"
+      "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+      "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;"
+      "DMT/_2->E:5;E->Z;E:control->A1:control;Y->Z:1");
+}
+// Pad + Conv2D fusion with padding is VALID,
+// Input node pointing to both Pad and Conv2D
+// A = input(image), B = input(paddings), C= Pad
+// E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,A); Z=Zeta(E,Y)
+// After layout pass
+// _MklPadWithConv2D(A, A, B, DMT/_0, DMT/_1, DMT/_2)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_Input) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'A'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;A->E:1;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:2;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+// Pad + Conv2D with padding is VALID,
+// Input node pointing to both Pad and Conv2D
+// Output of both Pad and Conv2D feeds one node (Z as Output2)
+// A = input(as image), B = input(as paddings), C= Pad
+// E = Conv2D, Z = Output2
+// C=Pad(A,B); E=Conv2D(C,A); Z=Output(C,E)
+// After layout pass - No merging, since Pad and Conv2D both
+// feed to the same node (Z)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_InOutput) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'A'] }"
+      "node { name: 'Z' op: 'Output2'"
+      " input: ['C', 'E']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);DMT/_0(Const);DMT/_1(Const);"
+            "E(_MklConv2D);Z(Output2)|A->C;A->E:1;B->C:1;C->E;C->Z;"
+            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "DMT/_0->E:2;DMT/_1->E:3;E->Z:1");
+}
+// Pad + Conv2D; padding is SAME
+// A = input(image), B = input(paddings), C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "E(_MklConv2D);Y(Input);Z(Zeta)|A->C;B->C:1;C->E;"
+      "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+      "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1");
+}
+#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
+TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Positive) {
+  InitGraph(
+      "node { name: 'Input0' op: 'Input'}"
+      "node { name: 'Input1' op: 'Input'}"
+      "node { name: 'Const0' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\002\\000\\000\\000\\003\\000\\000\\000\\001\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node { name: 'Const1' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\003\\000\\000\\000\\001\\000\\000\\000\\002\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node {              \
+      name: 'Transpose0' \
+      op: 'Transpose'    \
+      input: 'Input0'    \
+      input: 'Const0'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node {                 \
+      name: 'Conv2D'        \
+      op: 'Conv2D'          \
+      input: 'Transpose0'   \
+      input: 'Input1'       \
+      attr {                \
+        key: 'T'            \
+        value {             \
+          type: DT_FLOAT    \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'data_format'  \
+        value {             \
+          s: 'NHWC'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'dilations'    \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'padding'      \
+        value {             \
+          s: 'SAME'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'strides'      \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'use_cudnn_on_gpu' \
+        value {                 \
+          b: true               \
+        }                       \
+      }                         \
+    }"
+      "node {              \
+      name: 'Transpose1' \
+      op: 'Transpose'    \
+      input: 'Conv2D'    \
+      input: 'Const1'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node { name: 'Relu' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['Transpose1'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "Const0(Const);Const1(Const);"
+            "Conv2D(_MklConv2D);DMT/_0(Const);DMT/_1(Const);Input0(Input);"
+            "Input1(Input);Relu(_MklRelu)|Conv2D->Relu;Conv2D:2->Relu:1;DMT/"
+            "_0->Conv2D:2;DMT/_1->Conv2D:3;Input0->Conv2D;"
+            "Input0:control->DMT/_0:control;Input0:control->DMT/"
+            "_1:control;Input1->Conv2D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Negative) {
+  InitGraph(
+      "node { name: 'Input0' op: 'Input'}"
+      "node { name: 'Input1' op: 'Input'}"
+      "node { name: 'Const0' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\002\\000\\000\\000\\003\\000\\000\\000\\001\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node { name: 'Const1' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\002\\000\\000\\000\\003\\000\\000\\000\\001\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node {              \
+      name: 'Transpose0' \
+      op: 'Transpose'    \
+      input: 'Input0'    \
+      input: 'Const0'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node {                 \
+      name: 'Conv2D'        \
+      op: 'Conv2D'          \
+      input: 'Transpose0'   \
+      input: 'Input1'       \
+      attr {                \
+        key: 'T'            \
+        value {             \
+          type: DT_FLOAT    \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'data_format'  \
+        value {             \
+          s: 'NHWC'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'dilations'    \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'padding'      \
+        value {             \
+          s: 'SAME'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'strides'      \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'use_cudnn_on_gpu' \
+        value {                 \
+          b: true               \
+        }                       \
+      }                         \
+    }"
+      "node {              \
+      name: 'Transpose1' \
+      op: 'Transpose'    \
+      input: 'Conv2D'    \
+      input: 'Const1'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node { name: 'Relu' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['Transpose1'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "Const0(Const);Const1(Const);"
+      "Conv2D(_MklConv2D);DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);"
+      "Input0(Input);Input1(Input);Relu(_MklRelu);"
+      "Transpose0(Transpose);Transpose1(Transpose)|Const0->Transpose0:1;Const1-"
+      ">Transpose1:1;"
+      "Conv2D->Transpose1;DMT/_0->Conv2D:2;DMT/_1->Conv2D:3;DMT/"
+      "_2->Relu:1;Input0->Transpose0;"
+      "Input1->Conv2D:1;Transpose0->Conv2D;Transpose0:control->DMT/_0:control;"
+      "Transpose0:control->DMT/"
+      "_1:control;Transpose1->Relu;Transpose1:control->DMT/_2:control");
+}
+#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
+
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to rewriting node to Mkl node
 /////////////////////////////////////////////////////////////////////
@@ -2777,6 +1476,52 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
             "DMT/_1->C:2");
 }
 
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu6_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu6'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu6);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu6Grad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Relu6Grad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklRelu6Grad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu6Relu6Grad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu6'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Relu6Grad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu6);C(_MklRelu6Grad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+            "DMT/_1->C:2");
+}
+
 TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -3378,6 +2123,33 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
             "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
 }
 
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu6_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu6'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Relu6);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu6Grad_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Relu6Grad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Relu6Grad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
+}
+
 TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -3602,8 +2374,6 @@ BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
 
-#endif  // INTEL_MKL_ML_ONLY
-
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL && ENABLE_MKL
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index 6804ab84ce3260fa1cbb0b23cc2dff90baed8855..66467699454c4677138c98200983b8b8588b8d08 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -149,22 +149,20 @@ Status MklToTfConversionPass::InsertConversionNodeOnEdge(
   CHECK_NOTNULL(dst);
 
   Node* conversion_node = nullptr;
-  DataType src_datatype = DT_INVALID;
-  DataType dst_datatype = DT_INVALID;
+  DataType src_datatype = src->output_type(e->src_output());
+  DataType dst_datatype = dst->input_type(e->dst_input());
   string data_format;
 
-  TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype));
-  bool dst_dtype_found =
-      GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK();
   // We compare source and destination datatypes only when both are found.
-  if (dst_dtype_found && (src_datatype != dst_datatype)) {
-    string err_msg = "T attribute of " + src->name() + " and " + dst->name() +
-                     " do not match. Will not insert" +
-                     " MklToTf node in such case.";
+  if (src_datatype != dst_datatype) {
+    string err_msg = "T attribute of " + src->name() + ":" +
+                     std::to_string(e->src_output()) + " and " + dst->name() +
+                     ":" + std::to_string(e->dst_input()) +
+                     " do not"
+                     " match. Will not insert MklToTf node in such case.";
     return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str());
   }
 
-  // Build the conversion node and specify src as input.
   TF_CHECK_OK(
       NodeBuilder((*g)->NewName("Mkl2Tf"), "_MklToTf")
           .Input(src, e->src_output())
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index d92874909f9eba3be1b4dcf57f2e3f8188a7d0ab..a91e6dd05738ae8242c812970e8bbc4a10c7675a 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -29,6 +29,8 @@ NodeBuilder::NodeOut::NodeOut(Node* n, int32 i)  // NOLINT(runtime/explicit)
       index(i),
       dt(SafeGetOutput(node, i, &error)) {}
 
+NodeBuilder::NodeOut::NodeOut(OutputTensor t) : NodeOut(t.node, t.index) {}
+
 NodeBuilder::NodeOut::NodeOut(StringPiece n, int32 i, DataType t)
     : node(nullptr), error(false), name(n), index(i), dt(t) {}
 
@@ -104,6 +106,11 @@ NodeBuilder& NodeBuilder::AssignedDevice(StringPiece device) {
   return *this;
 }
 
+NodeBuilder& NodeBuilder::XlaCluster(StringPiece xla_cluster) {
+  def_builder_.Attr("_XlaCluster", xla_cluster);
+  return *this;
+}
+
 Status NodeBuilder::Finalize(Graph* graph, Node** created_node) const {
   // In case of error, set *created_node to nullptr.
   if (created_node != nullptr) *created_node = nullptr;
@@ -140,10 +147,10 @@ void NodeBuilder::AddIndexError(const Node* node, int i) {
         strings::StrCat("Attempt to add nullptr Node to node with type ",
                         def_builder_.op_def().name()));
   } else {
-    errors_.emplace_back(
-        strings::StrCat("Attempt to add output ", i, " of ", node->name(),
-                        " not in range [0, ", node->num_outputs(),
-                        ") to node with type ", def_builder_.op_def().name()));
+    errors_.emplace_back(strings::StrCat(
+        "Attempt to add output ", i, " of ", node->name(), " not in range [0, ",
+        node->num_outputs(), ") to node with type ",
+        def_builder_.op_def().name(), ". Node: ", FormatNodeForError(*node)));
   }
 }
 
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index d576985a232da5f402b2e2d26bac1d0e1306f82f..b1dc2ae92f14ba4519d98a4c556c1d06e14b6b5d 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -50,6 +50,7 @@ class NodeBuilder {
   struct NodeOut {
     // For referencing an existing Node.
     NodeOut(Node* n, int32 i = 0);
+    NodeOut(OutputTensor t);
 
     // For referencing Nodes not in the graph being built. It is
     // useful when preparing a graph for ExtendSession or creating a
@@ -103,6 +104,9 @@ class NodeBuilder {
   // Sets the device name in the "assigned device" field in tensorflow::Node.
   NodeBuilder& AssignedDevice(StringPiece device);
 
+  // Sets the _XlaCluster attribute in created node to `xla_cluster`.
+  NodeBuilder& XlaCluster(StringPiece xla_cluster);
+
   // Set the value of an attr.  attr_name must match the name of one of
   // attrs defined by the Op, and value must have the corresponding type
   // (see SetAttrValue() in ../framework/attr_value_util.h for legal
diff --git a/tensorflow/core/graph/tensor_id.h b/tensorflow/core/graph/tensor_id.h
index 0ba39426184e2c8b2e6f5abad2378c31a4c76f9a..b0f621fa6c4abced21df6e00bf852ff9642facb0 100644
--- a/tensorflow/core/graph/tensor_id.h
+++ b/tensorflow/core/graph/tensor_id.h
@@ -41,6 +41,9 @@ struct TensorId : public std::pair<StringPiece, int> {
   TensorId() : Base() {}
   TensorId(const SafeTensorId& id);
 
+  const StringPiece node() const { return first; }
+  int index() const { return second; }
+
   string ToString() const {
     if (second == Graph::kControlSlot) return strings::StrCat("^", first);
     return strings::StrCat(first, ":", second);
@@ -68,6 +71,9 @@ struct SafeTensorId : public std::pair<string, int> {
   SafeTensorId(const string& str, int idx) : Base(str, idx) {}
   SafeTensorId(const TensorId& id);
 
+  const string& node() const { return first; }
+  int index() const { return second; }
+
   string ToString() const {
     if (second == Graph::kControlSlot) return strings::StrCat("^", first);
     return strings::StrCat(first, ":", second);
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 0a38aa1c9192a6f2628c1ca916bd75a8cb51d2e8..0e74a30c7a92ebd46a933f1056ccb093fa095128 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -123,6 +123,17 @@ Node* Assign(Graph* g, Node* var, Node* val) {
   return ret;
 }
 
+Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive, bool reverse) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Cumsum")
+                  .Input(data)
+                  .Input(axes)
+                  .Attr("exclusive", exclusive)
+                  .Attr("reverse", reverse)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
 Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes,
              bool keep_dims) {
   Node* ret;
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index b00196f58735f938f562b5cabcd2985274b34f56..0c7233161f4128c1da0d8761b0b49fc2f4cf2524 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -68,6 +68,10 @@ Node* Recv(Graph* g, const string& tensor, const string& type,
            const string& sender, const uint64 sender_incarnation,
            const string& receiver);
 
+// Adds a cumsum "node" in "g" doing cumsum(data, axes).
+Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive = false,
+             bool reverse = false);
+
 // Adds a reduction "node" in "g" doing sum(data, axes).  "reduce" is
 // a reduction, e.g., Sum, Max, Min, Mean, etc.
 Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes,
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 7c6fe56e1f2f743bf74e3968eda01e58742ab008..f353d789d47030afda5d9680cca8094d48b827f1 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -23,9 +23,11 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -39,6 +41,7 @@ tf_cc_test(
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
@@ -66,8 +69,14 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":utils",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/hash",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -81,6 +90,8 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -96,6 +107,8 @@ cc_library(
         ":utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -130,6 +143,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
@@ -164,8 +178,10 @@ cc_library(
         ":graph_view",
         ":grappler_item",
         ":utils",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -178,6 +194,7 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index f543dca49ecb23018bccd562ece5148836dfb720..dbd8f26c286f07107a63e9c745c442b171f29aaa 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -34,8 +34,11 @@ VirtualCluster::VirtualCluster(
 
 VirtualCluster::VirtualCluster(
     const std::unordered_map<string, DeviceProperties>& devices,
-    OpLevelCostEstimator* node_estimator, ReadyNodeManager* node_manager)
-    : Cluster(0), node_estimator_(node_estimator), node_manager_(node_manager) {
+    std::unique_ptr<OpLevelCostEstimator> node_estimator,
+    std::unique_ptr<ReadyNodeManager> node_manager)
+    : Cluster(0),
+      node_estimator_(std::move(node_estimator)),
+      node_manager_(std::move(node_manager)) {
   devices_ = devices;
 }
 
@@ -70,8 +73,8 @@ Status VirtualCluster::Run(const GraphDef& graph,
   item.graph = graph;
   item.feed = feed;
   item.fetch = fetch;
-  VirtualScheduler scheduler(&item, true, this, node_manager_.get());
-  TF_RETURN_IF_ERROR(scheduler.Init());
+  VirtualScheduler scheduler(true, this, node_manager_.get());
+  TF_RETURN_IF_ERROR(scheduler.Init(&item));
 
   if (metadata) {
     metadata->clear_step_stats();
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.h b/tensorflow/core/grappler/clusters/virtual_cluster.h
index 6adb0b99bc913a3522373eee8154991b8450d041..d19e39cd29204c98d1edea03756649e61c2c4129 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.h
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.h
@@ -34,8 +34,8 @@ class VirtualCluster : public Cluster {
  public:
   VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices);
   VirtualCluster(const std::unordered_map<string, DeviceProperties>& devices,
-                 OpLevelCostEstimator* node_estimator,
-                 ReadyNodeManager* node_manager);
+                 std::unique_ptr<OpLevelCostEstimator> node_estimator,
+                 std::unique_ptr<ReadyNodeManager> node_manager);
   VirtualCluster(const DeviceSet* device_set);
 
   ~VirtualCluster() override;
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index f3dc2c2091781035fa1eae8c2575b82bc4f47c8e..5090e62b2ccfb00241e2b9c87d1922320646632e 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -41,9 +41,11 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":utils",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
-        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -236,6 +238,7 @@ tf_cc_test(
     name = "virtual_scheduler_test",
     srcs = ["virtual_scheduler_test.cc"],
     deps = [
+        ":utils",
         ":virtual_placer",
         ":virtual_scheduler",
         "//tensorflow/cc:cc_ops",
@@ -311,6 +314,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
     ] + tf_protos_grappler(),
 )
 
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
index 0690640ffa4b6578d2f98e7c0cde8fae69c8f8ee..b7804ffaa5378c67028b39819a07fc00719c9896 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -27,90 +27,150 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/virtual_placer.h"
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+
+// Helper function in PredictCosts() to add cost node to cost_graph.
+void AddCostNode(ReadyNodeManager* node_manager, const OpContext& op_context,
+                 int node_id, const Costs& node_costs,
+                 gtl::FlatMap<string, CostGraphDef::Node*>* name_to_cost_node,
+                 gtl::FlatMap<string, int>* name_to_id,
+                 CostGraphDef* cost_graph) {
+  const string& op_name = op_context.name;
+  auto it = name_to_cost_node->find(op_name);
+  CostGraphDef::Node* node;
+  if (it != name_to_cost_node->end()) {
+    node = it->second;
+    node->clear_input_info();
+    node->clear_output_info();
+  } else {
+    node = cost_graph->add_node();
+    (*name_to_cost_node)[op_name] = node;
+    node->set_name(op_name);
+    node->set_id(node_id);
+    (*name_to_id)[node->name()] = node->id();
+  }
+  // For nodes we have seen before (e.g. Merge nodes are executed twice by
+  // VirtualScheduler), the following fields will be overwritten/updated
+  node->set_device(op_context.device_name);
+  node->set_compute_cost(node_costs.execution_time.asMicroSeconds().count());
+  node->set_compute_time(node_costs.compute_time.asMicroSeconds().count());
+  node->set_memory_time(node_costs.memory_time.asMicroSeconds().count());
+  node->set_inaccurate(node_costs.inaccurate);
+
+  for (const string& input : node_manager->GetCurrNode()->input()) {
+    int input_port;
+    string input_name = ParseNodeName(input, &input_port);
+
+    // All inputs should have been seen already unless this is a Merge node
+    if (name_to_id->find(input_name) == name_to_id->end()) {
+      if (!IsMerge(*node_manager->GetCurrNode()))
+        LOG(ERROR) << "input: " << input
+                   << " not found for non-Merge node: " << op_name;
+
+      // For Merge node, some of inputs may not be seen before
+      // For example, for a typical while loop in tensorflow, Merge node
+      // will be executed twice by VirtualScheduler (one for Enter, the
+      // other for NextIteration), so eventually both inputs will be added
+      continue;
+    }
+
+    if (IsControlInput(input)) {
+      node->add_control_input(name_to_id->at(input_name));
+    } else {
+      auto* input_info = node->add_input_info();
+      input_info->set_preceding_node(name_to_id->at(input_name));
+      input_info->set_preceding_port(input_port);
+    }
+  }
+
+  for (const auto& output : op_context.op_info.outputs()) {
+    auto output_info = node->add_output_info();
+    output_info->set_alias_input_port(-1);
+    output_info->set_dtype(output.dtype());
+    auto shape = output_info->mutable_shape();
+    *shape = output.shape();
+  }
+}
+
+}  // namespace
+
 AnalyticalCostEstimator::AnalyticalCostEstimator(Cluster* cluster,
                                                  bool use_static_shapes)
-    : cluster_(cluster),
-      node_estimator_(new OpLevelCostEstimator()),
-      node_manager_(VirtualScheduler::ReadyNodeManagerFactory("FirstReady")),
-      use_static_shapes_(use_static_shapes) {}
+    : AnalyticalCostEstimator(
+          cluster, absl::make_unique<OpLevelCostEstimator>(),
+          ReadyNodeManagerFactory("FirstReady"), use_static_shapes, nullptr) {}
 
 AnalyticalCostEstimator::AnalyticalCostEstimator(
-    Cluster* cluster, OpLevelCostEstimator* node_estimator,
-    ReadyNodeManager* node_manager, bool use_static_shapes)
+    Cluster* cluster, std::unique_ptr<OpLevelCostEstimator> node_estimator,
+    std::unique_ptr<ReadyNodeManager> node_manager, bool use_static_shapes,
+    RunMetadata* run_metadata)
     : cluster_(cluster),
-      node_estimator_(node_estimator),
-      node_manager_(node_manager),
-      use_static_shapes_(use_static_shapes) {}
+      node_estimator_(std::move(node_estimator)),
+      node_manager_(std::move(node_manager)),
+      use_static_shapes_(use_static_shapes),
+      run_metadata_(run_metadata) {
+  scheduler_ = absl::make_unique<VirtualScheduler>(use_static_shapes_, cluster_,
+                                                   node_manager_.get());
+}
 
 Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
   item_ = item;
   return Status::OK();
 }
 
+// TODO(b/67607683): unify logic with VirtualCluster logic
 Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
                                              CostGraphDef* cost_graph,
                                              Costs* costs) const {
   GrapplerItem item = item_;
   item.graph = optimized_graph;
 
-  std::unordered_map<string, CostGraphDef::Node*> name_to_cost;
+  auto status = scheduler_->Init(&item);
+  if (!status.ok()) {
+    costs->execution_time = Costs::Duration::max();
+    return status;
+  }
+
+  gtl::FlatMap<string, CostGraphDef::Node*> name_to_cost_node;
   if (cost_graph) {
+    // TODO(pcma): Clear nodes in cost_graph after we make sure we always pass
+    // in an empty cost_graph (a non-empty but incomplete cost_graph will cause
+    // problems, e.g., no node_id in cost_graph)
     for (auto& node : *cost_graph->mutable_node()) {
-      name_to_cost[node.name()] = &node;
+      name_to_cost_node[node.name()] = &node;
     }
   }
   std::vector<string> inaccurate_nodes;
   int nodes_executed = 0;
-  VirtualScheduler scheduler(&item, use_static_shapes_, cluster_,
-                             node_manager_.get());
-  auto status = scheduler.Init();
-  if (!status.ok()) {
-    costs->execution_time = Costs::Duration::max();
-    return status;
-  }
+  int node_id = 0;
+  gtl::FlatMap<string, int> name_to_id;
 
   Costs node_costs;
   do {
     ++nodes_executed;
-    OpContext op_context = scheduler.GetCurrNode();
-    const string& op_name = op_context.name;
-
+    OpContext op_context = scheduler_->GetCurrNode();
     node_costs = node_estimator_->PredictCosts(op_context);
+
     if (node_costs.inaccurate) {
-      inaccurate_nodes.push_back(op_name);
+      inaccurate_nodes.push_back(op_context.name);
+      if (node_costs.num_ops_with_unknown_shapes > 0)
+        VLOG(4) << op_context.name << " has "
+                << node_costs.num_ops_with_unknown_shapes << " unknown shapes";
     }
+
+    // TODO(pcma): Add unit tests for generating CostGraphDef
     if (cost_graph) {
-      auto it = name_to_cost.find(op_name);
-      CostGraphDef::Node* cost_node;
-      if (it != name_to_cost.end()) {
-        cost_node = it->second;
-      } else {
-        cost_node = cost_graph->add_node();
-        cost_node->set_name(op_name);
-      }
-      cost_node->set_device(op_context.device_name);
-      cost_node->set_compute_cost(
-          node_costs.execution_time.asMicroSeconds().count());
-      cost_node->set_compute_time(
-          node_costs.compute_time.asMicroSeconds().count());
-      cost_node->set_memory_time(
-          node_costs.memory_time.asMicroSeconds().count());
-      cost_node->set_inaccurate(node_costs.inaccurate);
-      for (const auto& output : op_context.op_info.outputs()) {
-        auto output_info = cost_node->add_output_info();
-        output_info->set_dtype(output.dtype());
-        auto shape = output_info->mutable_shape();
-        *shape = output.shape();
-      }
+      AddCostNode(node_manager_.get(), op_context, node_id++, node_costs,
+                  &name_to_cost_node, &name_to_id, cost_graph);
     }
-  } while (scheduler.MarkCurrNodeExecuted(node_costs));
+  } while (scheduler_->MarkCurrNodeExecuted(node_costs));
 
-  RunMetadata run_metadata;
-  *costs = scheduler.Summary(&run_metadata);
   VLOG(1) << inaccurate_nodes.size() << " out of " << nodes_executed
           << " nodes have inaccurate time estimation";
   if (VLOG_IS_ON(3)) {
@@ -119,10 +179,25 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     }
   }
 
+  *costs = scheduler_->Summary(run_metadata_);
+  // run_metadata_ gets step_stats and parition_graphs from Summary.
+  // Note that cost_graph could already point to the cost_graph field of
+  // run_metadata_, since both are set by the caller.
+  if (run_metadata_ && cost_graph &&
+      run_metadata_->mutable_cost_graph() != cost_graph)
+    *run_metadata_->mutable_cost_graph() = *cost_graph;
+
   if (VLOG_IS_ON(1)) {
-    bool verbosity = VLOG_IS_ON(2);
-    VLOG(1) << GetStatsStringFromRunMetadata(run_metadata, verbosity);
+    bool verbose = VLOG_IS_ON(2);
+    if (run_metadata_) {
+      VLOG(1) << GetStatsStringFromRunMetadata(*run_metadata_, verbose);
+    } else {
+      RunMetadata run_metadata;
+      scheduler_->GenerateRunMetadata(&run_metadata);
+      VLOG(1) << GetStatsStringFromRunMetadata(run_metadata, verbose);
+    }
   }
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.h b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
index dd2738e088023ae387f269152c3ad9d33bcfd645..2629672459c512a22a861bd5c0dfe0207afc38a0 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
@@ -39,12 +39,16 @@ class AnalyticalCostEstimator : public CostEstimator {
  public:
   // Does not take ownership of cluster.
   AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes);
-  // Does not take ownership of the cluster, but takes ownership of the
-  // node_estimator and the node_manager
+  // Does not take ownership of cluster or run_metadata
+  //
+  // When metadata is provided, step_stats and partition_graphs fields will
+  // always be filled during PredictCosts, and the cost_graph field of metadata
+  // will be filled only when cost_graph is not nullptr when invoking
+  // PredictCosts.
   AnalyticalCostEstimator(Cluster* cluster,
-                          OpLevelCostEstimator* node_estimator,
-                          ReadyNodeManager* node_manager,
-                          bool use_static_shapes);
+                          std::unique_ptr<OpLevelCostEstimator> node_estimator,
+                          std::unique_ptr<ReadyNodeManager> node_manager,
+                          bool use_static_shapes, RunMetadata* run_metadata);
   ~AnalyticalCostEstimator() override {}
 
   // Initializes the estimator for the specified grappler item.
@@ -53,16 +57,21 @@ class AnalyticalCostEstimator : public CostEstimator {
 
   // Predict the performance of each node of the optimized graph and annotate
   // the CostGraphDef with the corresponding estimates. Also returns the
-  // expected latency for the whole graph.
+  // expected cost for the whole graph.
   Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
-                      Costs* overall_latency) const override;
+                      Costs* cost) const override;
+
+  const VirtualScheduler* GetScheduler() const { return scheduler_.get(); }
 
  private:
-  Cluster* cluster_;  // Not owned.
+  Cluster* cluster_;
   GrapplerItem item_;
   std::unique_ptr<OpLevelCostEstimator> node_estimator_;
   std::unique_ptr<ReadyNodeManager> node_manager_;
   bool use_static_shapes_;
+  std::unique_ptr<VirtualScheduler> scheduler_;
+
+  RunMetadata* run_metadata_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index e91f0cc9dacedbd6850c94722d82c18b1c298bd2..e3b3a36b096da807d05bee50d52a7a5c37884b52 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_COSTS_COST_ESTIMATOR_H_
 
 #include <chrono>
+#include <cmath>
 #include <unordered_map>
 #include "tensorflow/core/lib/core/status.h"
 
@@ -30,6 +31,40 @@ struct GrapplerItem;
 constexpr int64 kMemoryUnknown = -1ll;
 constexpr int64 kZeroMemory = 0ll;
 
+struct DeviceInfo {
+  // Billions of operations executed per second.
+  double gigaops;
+
+  // Bandwidth to main memory in GB per second.
+  double gb_per_sec;
+
+  // Read bandwidth to intermediate memory in GB per second.
+  double intermediate_read_gb_per_sec;
+
+  // Read bandwidth to intermediate memory in GB per second.
+  double intermediate_write_gb_per_sec;
+
+  DeviceInfo()
+      : gigaops(INFINITY),
+        gb_per_sec(INFINITY),
+        intermediate_read_gb_per_sec(INFINITY),
+        intermediate_write_gb_per_sec(INFINITY) {}
+
+  DeviceInfo(const DeviceInfo& input)
+      : gigaops(input.gigaops),
+        gb_per_sec(input.gb_per_sec),
+        intermediate_read_gb_per_sec(input.intermediate_read_gb_per_sec),
+        intermediate_write_gb_per_sec(input.intermediate_write_gb_per_sec) {}
+
+  DeviceInfo(double gigaops, double gb_per_sec,
+             double intermediate_read_gb_per_sec = INFINITY,
+             double intermediate_write_gb_per_sec = INFINITY)
+      : gigaops(gigaops),
+        gb_per_sec(gb_per_sec),
+        intermediate_read_gb_per_sec(intermediate_read_gb_per_sec),
+        intermediate_write_gb_per_sec(intermediate_write_gb_per_sec) {}
+};
+
 // Holds the set of things we might want to estimate or measure in Grappler.
 // Always produce execution time. Other fields are optional depending on the
 // estimator being used.
@@ -96,6 +131,9 @@ struct Costs {
   // Memory access cost of running the graph.
   Duration memory_time;
 
+  // Intermediate memory access cost of running the graph
+  Duration intermediate_memory_time;
+
   // This field can be a very pessimistic estimate of the main memory
   // requirements of a graph. For example, it might assume that all activations
   // are live for all of a graph's execution.
@@ -141,6 +179,7 @@ Costs::Costs() {
   execution_time = Duration::zero();
   compute_time = Duration::zero();
   memory_time = Duration::zero();
+  intermediate_memory_time = Duration::zero();
   max_memory = kMemoryUnknown;
   persistent_memory = kMemoryUnknown;
   temporary_memory = kMemoryUnknown;
@@ -153,6 +192,7 @@ Costs Costs::ZeroCosts() {
   costs.execution_time = Duration::zero();
   costs.compute_time = Duration::zero();
   costs.memory_time = Duration::zero();
+  costs.intermediate_memory_time = Duration::zero();
   costs.max_memory = kZeroMemory;
   costs.persistent_memory = kZeroMemory;
   costs.temporary_memory = kZeroMemory;
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 56c8339d57105b37aad178ce7743bea3ea6e467b..1df26d94d1fe1ed35765291da6c7d2eae513e713 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -15,18 +15,23 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 
+#include <limits>
+#include <list>
 #include <queue>
 #include <unordered_map>
 #include <unordered_set>
+#include "absl/memory/memory.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/costs/utils.h"
-#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
@@ -259,6 +264,8 @@ typename DisjointSet<Handle>::Rep* DisjointSet<Handle>::Find(Handle value) {
   return root;
 }
 
+// TODO(dyoon): Move many helper functions in this file (including those within
+// SymbolicShapeRefiner class) to shared utils.
 bool IsEnqueue(const NodeDef& n) {
   return (n.op().find("Enqueue") != string::npos &&
           n.op().find("EnqueueMany") == string::npos);
@@ -284,9 +291,9 @@ bool HasAnyUnknownDimensions(const TensorShapeProto& proto) {
 // This really should be done in an external debugging tool
 void VerboseLogUnknownDimensionSources(
     const GraphDef& graph,
-    const std::map<string, std::vector<OpInfo::TensorProperties>>&
+    const std::unordered_map<string, std::vector<OpInfo::TensorProperties>>&
         input_properties_map,
-    const std::map<string, std::vector<OpInfo::TensorProperties>>&
+    const std::unordered_map<string, std::vector<OpInfo::TensorProperties>>&
         output_properties_map) {
   if (!VLOG_IS_ON(2)) {
     return;
@@ -380,21 +387,29 @@ TensorProto MakeTensorProtoFromShape(InferenceContext* ic,
   return tensor_proto;
 }
 
-// Returns a Const NodeDef with shape = `shape`, values = `tensor_as_shape`,
-// and dtype = `dtype`.
-NodeDef MakeConstNodeDefFromShape(InferenceContext* ic,
-                                  const ShapeHandle& shape,
-                                  const ShapeHandle& tensor_as_shape,
-                                  const DataType& dtype) {
+// Returns a Const NodeDef with tensor `tensor_proto` and dtype = `dtype`.
+NodeDef MakeConstNodeDefFromTensorProto(InferenceContext* ic,
+                                        const TensorProto& tensor_proto,
+                                        const DataType& dtype) {
   NodeDef const_node;
   const_node.set_name("const_from_shape");
   const_node.set_op("Const");
   auto* attr = const_node.mutable_attr();
   (*attr)["dtype"].set_type(dtype);
   auto* tensor = (*attr)["value"].mutable_tensor();
-  *tensor = MakeTensorProtoFromShape(ic, shape, tensor_as_shape, dtype);
+  *tensor = tensor_proto;
   return const_node;
 }
+
+// Returns a Const NodeDef with shape = `shape`, values = `tensor_as_shape`,
+// and dtype = `dtype`.
+NodeDef MakeConstNodeDefFromShape(InferenceContext* ic,
+                                  const ShapeHandle& shape,
+                                  const ShapeHandle& tensor_as_shape,
+                                  const DataType& dtype) {
+  return MakeConstNodeDefFromTensorProto(
+      ic, MakeTensorProtoFromShape(ic, shape, tensor_as_shape, dtype), dtype);
+}
 }  // namespace
 
 // Queue of nodes to process. Nodes can be enqueued in any order, but will be
@@ -442,10 +457,10 @@ class SymbolicShapeRefiner {
       const GraphView& graph,
       const std::unordered_map<string, std::unordered_set<int>>& fed_ports)
       : graph_(graph),
-        function_library_(OpRegistry::Global(), graph.GetGraph()->library()),
+        function_library_(OpRegistry::Global(), graph.graph()->library()),
         fed_ports_(fed_ports) {
-    graph_def_version_ = graph.GetGraph()->versions().producer();
-    node_to_context_.reserve(graph.GetGraph()->node_size());
+    graph_def_version_ = graph.graph()->versions().producer();
+    node_to_context_.reserve(graph.graph()->node_size());
   }
 
   const GraphView& graph() const { return graph_; }
@@ -455,6 +470,9 @@ class SymbolicShapeRefiner {
     DataTypeVector input_types;
     DataTypeVector output_types;
     std::unique_ptr<InferenceContext> inference_context;
+    // Additional info for propagating tensor values and tensor shapes.
+    std::vector<const TensorProto*> input_tensor_protos;
+    std::vector<const TensorProto*> output_tensor_protos;
     std::vector<ShapeHandle> output_tensors_as_shapes;
   };
 
@@ -495,7 +513,7 @@ class SymbolicShapeRefiner {
     // Placeholder with Const) don't affect one in
     // fun_to_grappler_function_item_.
     GrapplerFunctionItem grappler_function_item = it->second;
-    GraphView gv(&grappler_function_item.graph);
+    MutableGraphView gv(&grappler_function_item.graph);
 
     // Forward shapes from function input nodes to argument nodes.
     for (int i = 0; i < grappler_function_item.inputs().size(); ++i) {
@@ -507,27 +525,26 @@ class SymbolicShapeRefiner {
             "supported.");
       }
       NodeDef* fun_node = gv.GetNode(fun_input.input_name);
-      const string& input = function_node->input(i);
-      const string& node_name = NodeName(input);
+      const TensorId input_tensor = ParseTensorName(function_node->input(i));
 
-      if (IsControlInput(input)) {
+      if (IsControlInput(input_tensor)) {
         return errors::FailedPrecondition(
             "Function inputs should not contain control nodes.");
       }
 
-      NodeDef* input_node = graph_.GetNode(node_name);
+      const NodeDef* input_node = graph_.GetNode(input_tensor.node());
       if (input_node == nullptr) {
-        return errors::FailedPrecondition(node_name,
+        return errors::FailedPrecondition(input_tensor.node(),
                                           " was not found in the graph.");
       }
 
       InferenceContext* input_inference_context = GetContext(input_node);
       if (input_inference_context == nullptr) {
         return errors::FailedPrecondition(
-            "Inference context has not been created for ", node_name);
+            "Inference context has not been created for ", input_tensor.node());
       }
 
-      int output_port_num = NodePosition(input);
+      int output_port_num = input_tensor.index();
       AttrValue attr_output_shape;
       TensorShapeProto proto;
       const auto& handle = input_inference_context->output(output_port_num);
@@ -549,10 +566,16 @@ class SymbolicShapeRefiner {
     for (int i = grappler_function_item.inputs().size() - 1; i >= 0; --i) {
       const string& input = function_node->input(i);
       const string& node_name = NodeName(input);
-      NodeDef* input_node = graph_.GetNode(node_name);
+      const NodeDef* input_node = graph_.GetNode(node_name);
       if (IsConstant(*input_node)) {
         TF_CHECK_OK(
             ReplaceInputWithConst(*input_node, i, &grappler_function_item));
+      } else if (ctx->input_tensor_protos.size() > i &&
+                 ctx->input_tensor_protos[i] != nullptr) {
+        NodeDef const_input_node = MakeConstNodeDefFromTensorProto(
+            ic, *ctx->input_tensor_protos[i], ctx->input_types[i]);
+        TF_CHECK_OK(ReplaceInputWithConst(const_input_node, i,
+                                          &grappler_function_item));
       } else if (ic->input_tensors_as_shapes().size() > i &&
                  IsShapeFullyDefinedIntegerVectorOrScalar(
                      ic, ic->input(i), ic->input_tensors_as_shapes()[i],
@@ -574,6 +597,8 @@ class SymbolicShapeRefiner {
     // Add return nodes for output shapes.
     int output = 0;
     ctx->output_tensors_as_shapes.resize(grappler_function_item.output_size());
+    ctx->output_tensor_protos.resize(grappler_function_item.output_size(),
+                                     nullptr);
     for (auto const& out_arg : grappler_function_item.outputs()) {
       if (out_arg.output_tensors.size() > 1) {
         // TODO(jmdecker): Handle case of multiple output tensors
@@ -584,24 +609,22 @@ class SymbolicShapeRefiner {
 
       // It is guaranteed that output_tensors does not contain any control
       // inputs, so port_id >= 0.
-      string out_tensor = out_arg.output_tensors[0];
-      int port_id;
-      string node_name = ParseNodeName(out_tensor, &port_id);
+      TensorId out_tensor = ParseTensorName(out_arg.output_tensors[0]);
 
-      const NodeDef* retnode = gv.GetNode(node_name);
+      const NodeDef* retnode = gv.GetNode(out_tensor.node());
       if (retnode == nullptr) {
         return errors::FailedPrecondition(
-            "Unable to find return function_node ", node_name, " for ",
+            "Unable to find return function_node ", out_tensor.node(), " for ",
             function_node->name());
       }
 
       auto output_properties = gp.GetOutputProperties(retnode->name());
-      if (port_id >= output_properties.size()) {
+      if (out_tensor.index() >= output_properties.size()) {
         return errors::InvalidArgument(
-            out_tensor, " has invalid position ", port_id,
+            out_tensor.ToString(), " has invalid position ", out_tensor.index(),
             " (output_properties.size() = ", output_properties.size(), ").");
       }
-      auto const& outprop = output_properties[port_id];
+      auto const& outprop = output_properties[out_tensor.index()];
       const TensorShapeProto& shape = outprop.shape();
       ShapeHandle out;
       TF_RETURN_IF_ERROR(ic->MakeShapeFromShapeProto(shape, &out));
@@ -610,8 +633,11 @@ class SymbolicShapeRefiner {
         // Forward tensor value to output_tensors_as_shape.
         Tensor tensor;
         if (tensor.FromProto(outprop.value())) {
-          MaybeSetTensorValueToShape(ic, tensor,
-                                     &ctx->output_tensors_as_shapes[output]);
+          MaybeTensorValueToShape(ic, tensor,
+                                  &ctx->output_tensors_as_shapes[output]);
+          const_tensors_to_propagate_.push_back(outprop.value());
+          ctx->output_tensor_protos[output] =
+              &const_tensors_to_propagate_.back();
         }
       }
       output++;
@@ -636,6 +662,8 @@ class SymbolicShapeRefiner {
                                              nullptr);
     std::vector<ShapeHandle> input_tensors_as_shapes(
         inference_context->num_inputs());
+    node_context->input_tensor_protos.resize(inference_context->num_inputs(),
+                                             nullptr);
 
     for (int dst_input = 0; dst_input < inference_context->num_inputs();
          ++dst_input) {
@@ -651,55 +679,64 @@ class SymbolicShapeRefiner {
               "' was not previously added to SymbolicShapeRefiner.");
         }
 
-        if (IsConstant(*input)) {
-          // Convert constant value into tensors.
-          if (const_values[dst_input].FromProto(
-                  input->attr().at("value").tensor())) {
-            input_tensors[dst_input] = &const_values[dst_input];
-            MaybeSetTensorValueToShape(inference_context,
-                                       const_values[dst_input],
-                                       &input_tensors_as_shapes[dst_input]);
-          }
-        } else if (IsRank(*input)) {
-          if (c->inference_context->RankKnown(c->inference_context->input(0))) {
-            int32 rank =
-                c->inference_context->Rank(c->inference_context->input(0));
-            Tensor t(DT_INT32, {});
-            t.flat<int32>()(0) = rank;
-            const_values[dst_input] = t;
-            input_tensors[dst_input] = &const_values[dst_input];
-          }
-        } else if (IsSize(*input)) {
-          DimensionHandle size =
-              c->inference_context->NumElements(c->inference_context->input(0));
-          if (c->inference_context->ValueKnown(size)) {
-            int64 sz = c->inference_context->Value(size);
-            bool valid = false;
-            if (input->attr().at("T").type() == DT_INT32) {
-              if (sz < std::numeric_limits<int32>::max()) {
-                Tensor t(DT_INT32, {});
-                t.flat<int32>()(0) = sz;
-                const_values[dst_input] = t;
-                valid = true;
-              }
-            } else {
-              Tensor t(DT_INT64, {});
-              t.flat<int64>()(0) = sz;
-              const_values[dst_input] = t;
-              valid = true;
-            }
-            if (valid) {
-              input_tensors[dst_input] = &const_values[dst_input];
-            }
-          }
-        }
+        if (src_output >= c->inference_context->num_outputs())
+          return errors::OutOfRange("src_output = ", src_output,
+                                    ", but num_outputs is only ",
+                                    c->inference_context->num_outputs());
+
+        // Propagate input node's NodeContext info to the current node's
+        // NodeContext:
+        // output_tensor_protos to input_tensor_protos and input_tensors, and
+        // output_tensors_as_shapes to input_tensors_as_shapes.
 
         if (c->output_tensors_as_shapes.size() > src_output) {
           input_tensors_as_shapes[dst_input] =
               c->output_tensors_as_shapes[src_output];
         }
 
+        if (c->output_tensor_protos.size() > src_output) {
+          auto* tensor_proto = c->output_tensor_protos[src_output];
+          if (tensor_proto != nullptr &&
+              const_values[dst_input].FromProto(*tensor_proto)) {
+            input_tensors[dst_input] = &const_values[dst_input];
+            node_context->input_tensor_protos[dst_input] = tensor_proto;
+
+            if (!inference_context->FullyDefined(
+                    input_tensors_as_shapes[dst_input])) {
+              // Shape from a Const is not fully defined when the Const has
+              // value -1 (e.g., Reshape(x, Const(-1)) to reshape an arbitrary
+              // tensor x to a vector).
+              // It's possible that the same Const with -1 is used in many
+              // places, but that doesn't mean the resultant shapes are
+              // identical. e.g., x1 = Reshape(x, c) and y1 = Reshape(y, c),
+              // where c is -1. In this case, shape inference yields both x1 and
+              // y1 as rank 1, size unknown, but still the shapes of x1 and y1
+              // can be different. (even if we use different Const(-1) for x1
+              // and x2, graph optimzier may merge them to single Const through
+              // duplicate removal.)
+              // If we reuse output_tensors_as_shapes to input_tensors_as_shapes
+              // by copying ShapeHandle, they share the same Shape object, and
+              // SymbolicShapeManager, later in InferStatically(), assigns the
+              // same symbolic dim value (unique value < -1); in the above
+              // Reshape example, the shapes of x1 and y1 become, for example,
+              // [-278] and graph optimizer may yield incorrect output 'cause it
+              // assumes x1 and y1 have the same shape.
+              // To prevent this, we re-create a ShapeHandle from the Const
+              // tensor, instead of reusing output_tensors_as_shapes (so that
+              // ShapeHandles of the const fanouts have the same values,
+              // but different Shape objects -- SymbolicShapeManager assigns
+              // different symbol id to each fanout shape).
+              // TODO(dyoon): clean up the way values are propagated.
+              MaybeTensorValueToShape(inference_context,
+                                      const_values[dst_input],
+                                      &input_tensors_as_shapes[dst_input]);
+            }
+          }
+        }
+
         DCHECK_GE(dst_input, 0);
+        // NOTE: we check only shape is refined; we do not (yet) check whether
+        // tensor value is refined.
         if (!*refined && !inference_context->input(dst_input).SameHandle(
                              c->inference_context->output(src_output))) {
           *refined = true;
@@ -974,17 +1011,53 @@ class SymbolicShapeRefiner {
     return dim;
   }
 
-  Status InferShapes(const NodeDef& node, NodeContext* c) {
-    InferenceContext* ic = c->inference_context.get();
-
-    auto it = fed_ports_.find(node.name());
-    const bool is_fed = it != fed_ports_.end();
-
-    // Propagate shape tensors unless the node is fed.
+  Status MaybeUpdateNodeContextOutput(const NodeDef& node, const bool is_fed,
+                                      NodeContext* c) {
+    // Propagate tensors and shape tensors unless the node is fed.
     // TODO(bsteiner) We should still propagate the shapes to the ports that
     // aren't fed in the case of a ShapeN node.
+
+    InferenceContext* ic = c->inference_context.get();
     if (!is_fed) {
-      if (IsShape(node)) {
+      if (IsConstant(node)) {
+        c->output_tensor_protos.resize(1);
+        const TensorProto& tensor_proto = node.attr().at("value").tensor();
+        c->output_tensor_protos[0] = &tensor_proto;
+        c->output_tensors_as_shapes.resize(1);
+        MaybeTensorProtoToShape(ic, tensor_proto,
+                                &c->output_tensors_as_shapes[0]);
+      } else if (IsRank(node)) {
+        if (ic->RankKnown(ic->input(0))) {
+          // Propagate rank value.
+          int32 rank = ic->Rank(ic->input(0));
+          const_tensors_to_propagate_.push_back(
+              MakeIntegerScalarTensorProto(DT_INT32, rank));
+          c->output_tensor_protos.resize(1);
+          c->output_tensor_protos[0] = &const_tensors_to_propagate_.back();
+        }
+      } else if (IsSize(node)) {
+        DimensionHandle size = ic->NumElements(ic->input(0));
+        if (ic->ValueKnown(size)) {
+          // Propagate size value.
+          int64 sz = ic->Value(size);
+          bool valid = false;
+          if (node.attr().at("T").type() == DT_INT32) {
+            if (sz < std::numeric_limits<int32>::max()) {
+              const_tensors_to_propagate_.push_back(
+                  MakeIntegerScalarTensorProto(DT_INT32, sz));
+              valid = true;
+            }
+          } else {
+            const_tensors_to_propagate_.push_back(
+                MakeIntegerScalarTensorProto(DT_INT64, sz));
+            valid = true;
+          }
+          if (valid) {
+            c->output_tensor_protos.resize(1);
+            c->output_tensor_protos[0] = &const_tensors_to_propagate_.back();
+          }
+        }
+      } else if (IsShape(node)) {
         c->output_tensors_as_shapes.resize(1);
         c->output_tensors_as_shapes[0] = c->inference_context->input(0);
       } else if (IsShapeN(node)) {
@@ -1041,10 +1114,13 @@ class SymbolicShapeRefiner {
           c->output_tensors_as_shapes.resize(1);
           c->output_tensors_as_shapes[0] = ic->MakeShape(dims);
         }
-      } else if (IsIdentity(node)) {
-        // Pass input_tensors_as_shapes to output_tensors_as_shapes.
+      } else if (IsIdentity(node) || IsIdentityNSingleInput(node)) {
         c->output_tensors_as_shapes.resize(1);
         c->output_tensors_as_shapes[0] = ic->input_tensors_as_shapes()[0];
+        if (c->input_tensor_protos[0] != nullptr) {
+          c->output_tensor_protos.resize(1);
+          c->output_tensor_protos[0] = c->input_tensor_protos[0];
+        }
       } else if (IsSlice(node)) {
         ShapeHandle input = ic->input_tensors_as_shapes()[0];
         bool valid = ic->RankKnown(input);
@@ -1125,7 +1201,10 @@ class SymbolicShapeRefiner {
         }
       }
     }
+    return Status::OK();
+  }
 
+  Status InferShapes(const NodeDef& node, NodeContext* c) {
     // Infer the shapes of output tensors.
     if (!c->op_data || c->op_data->shape_inference_fn == nullptr) {
       // There is nothing more we can infer, annotate outputs with unknown
@@ -1137,6 +1216,8 @@ class SymbolicShapeRefiner {
         c->inference_context->Run(c->op_data->shape_inference_fn));
 
     Status status = Status::OK();
+    auto it = fed_ports_.find(node.name());
+    const bool is_fed = it != fed_ports_.end();
     if (is_fed) {
       // It is possible to feed node output ports with tensors of any shape: as
       // a result, the shape of a fed port is completely unknown.
@@ -1145,6 +1226,9 @@ class SymbolicShapeRefiner {
       }
     }
 
+    // Update NodeContext output fields after shape inference function runs.
+    status.Update(MaybeUpdateNodeContextOutput(node, is_fed, c));
+
     return status;
   }
 
@@ -1166,16 +1250,54 @@ class SymbolicShapeRefiner {
     return false;
   }
 
-  void MaybeSetTensorValueToShape(InferenceContext* ic, const Tensor& tensor,
-                                  ShapeHandle* tensors_as_shapes) {
+  TensorProto MakeIntegerScalarTensorProto(const DataType dtype,
+                                           const int64 val) {
+    TensorProto tensor_proto;
+    tensor_proto.set_dtype(dtype);
+    // Scalar TensorProto has an empty tensor_shape; no dim, no dim.size.
+    tensor_proto.mutable_tensor_shape();
+    if (dtype == DT_INT32) {
+      tensor_proto.add_int_val(val);
+    } else if (dtype == DT_INT64) {
+      tensor_proto.add_int64_val(val);
+    }
+    return tensor_proto;
+  }
+
+  bool MaybeTensorProtoToShape(InferenceContext* ic,
+                               const TensorProto& tensor_proto,
+                               ShapeHandle* tensors_as_shapes) {
+    // Skip if dtype is not integer.
+    if (tensor_proto.dtype() != DT_INT32 && tensor_proto.dtype() != DT_INT64) {
+      return false;
+    }
+    // Skip if shape is neither scalar nor vector.
+    if (tensor_proto.tensor_shape().unknown_rank() ||
+        tensor_proto.tensor_shape().dim_size() > 1) {
+      return false;
+    }
+    Tensor tensor;
+    if (!tensor.FromProto(tensor_proto)) {
+      return false;
+    }
+    return MaybeTensorValueToShape(ic, tensor, tensors_as_shapes);
+  }
+
+  bool MaybeTensorValueToShape(InferenceContext* ic, const Tensor& tensor,
+                               ShapeHandle* tensors_as_shapes) {
     // Integer tensors of rank one can also be interpreted as a shape
     // provided all their values are >= -1.
     if (IsIntegerVector(tensor)) {
-      ShapeHandle tensor_shape = ic->Vector(tensor.NumElements());
-      ShapeHandle shp;
-      // Note that MakeShapeFromTensor filters out invalid values (e.g., < -1).
-      if (ic->MakeShapeFromTensor(&tensor, tensor_shape, &shp).ok()) {
-        *tensors_as_shapes = shp;
+      bool has_values_smaller_than_minus_1 = false;
+      std::vector<DimensionHandle> dims;
+      for (int i = 0; i < tensor.NumElements(); i++) {
+        int64 value = tensor.dtype() == DT_INT32 ? tensor.flat<int32>()(i)
+                                                 : tensor.flat<int64>()(i);
+        has_values_smaller_than_minus_1 |= (value < -1);
+        dims.push_back(value < 0 ? ic->UnknownDim() : ic->MakeDim(value));
+      }
+      if (!has_values_smaller_than_minus_1) {
+        *tensors_as_shapes = ic->MakeShape(dims);
       }
     } else if (IsIntegerScalar(tensor)) {
       // Scalar constant.
@@ -1185,8 +1307,10 @@ class SymbolicShapeRefiner {
       // It's a limitation as we use ShapeHandle as a means to pass values.
       if (value >= -1) {
         *tensors_as_shapes = ic->MakeShape({ic->MakeDim(value)});
+        return true;
       }
     }
+    return false;
   }
 
   const GraphView& graph_;
@@ -1198,6 +1322,11 @@ class SymbolicShapeRefiner {
       fun_to_grappler_function_item_;
   FunctionLibraryDefinition function_library_;
   const std::unordered_map<string, std::unordered_set<int>>& fed_ports_;
+  // Store TensorProtos for tensor value propagation. Note that we use list, not
+  // vector, as we use pointers to the TensorProtos in this container. Vector
+  // may resize and copy the objects into a new buffer, then the existing
+  // pointers become dangling pointers.
+  std::list<TensorProto> const_tensors_to_propagate_;
 };
 
 // Keep track of shapes and dimensions in a graph.
@@ -1301,8 +1430,8 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
       continue;
     }
     ShapeHandle input = in->output(fanin.src.port_id);
-    CHECK_EQ(fanin.tgt.node, node);
-    c->SetInput(fanin.tgt.port_id, input);
+    CHECK_EQ(fanin.dst.node, node);
+    c->SetInput(fanin.dst.port_id, input);
     if (!out_initialized) {
       out_initialized = true;
       out = input;
@@ -1527,13 +1656,12 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   std::unordered_map<string, std::unordered_set<int>> fed_ports;
   if (!assume_valid_feeds) {
     for (const auto& feed : item_.feed) {
-      int port_index = 0;
-      string node_name = ParseNodeName(feed.first, &port_index);
-      fed_ports[node_name].insert(port_index);
+      SafeTensorId tensor_id = ParseTensorName(feed.first);
+      fed_ports[tensor_id.node()].insert(tensor_id.index());
     }
   }
 
-  GraphView graph_view(const_cast<GraphDef*>(&item_.graph));
+  GraphView graph_view(&item_.graph);
 
   // List the resources and the nodes using them. Also collect the Merge nodes,
   // fed nodes, and primary inputs.
@@ -1585,10 +1713,10 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   for (const auto& resource : resources) {
     for (const NodeDef* src : resource.second.first) {
       resource_handles[src] = resource.first;
-      for (const NodeDef* tgt : resource.second.second) {
+      for (const NodeDef* dst : resource.second.second) {
         // Add control edges from enqueue to dequeue nodes to ensure they are
         // processed in their logical order.
-        extra_deps.emplace_back(src, tgt);
+        extra_deps.emplace_back(src, dst);
       }
     }
   }
@@ -1624,7 +1752,8 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
       PropagateShapes(&refiner, &new_shapes, resource_handles, num_loops));
 
   // Track shapes globally across the graph.
-  SymbolicShapeManager shape_manager;
+  std::unique_ptr<SymbolicShapeManager> shape_manager =
+      absl::make_unique<SymbolicShapeManager>();
   bool found_error = false;
   for (const NodeDef& node : item_.graph.node()) {
     auto node_ctx = refiner.GetContext(&node);
@@ -1637,14 +1766,14 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
       continue;
     }
     for (const auto& merged_shapes : node_ctx->MergedShapes()) {
-      if (!shape_manager.Merge(merged_shapes.first, merged_shapes.second)
+      if (!shape_manager->Merge(merged_shapes.first, merged_shapes.second)
                .ok()) {
         found_error = true;
         break;
       }
     }
     for (const auto& merged_dims : node_ctx->MergedDims()) {
-      if (!shape_manager.Merge(merged_dims.first, merged_dims.second).ok()) {
+      if (!shape_manager->Merge(merged_dims.first, merged_dims.second).ok()) {
         found_error = true;
         break;
       }
@@ -1652,7 +1781,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     if (found_error) {
       // The shapes aren't consistent, we can't infer safely: discard all the
       // information discovered so far.
-      shape_manager = SymbolicShapeManager();
+      shape_manager = absl::make_unique<SymbolicShapeManager>();
       break;
     }
   }
@@ -1676,15 +1805,17 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
       input_properties.resize(ic->num_inputs());
       GraphView::InputPort input(&node, -1);
       for (int i = 0; i < ic->num_inputs(); ++i) {
-        shape_manager.AsTensorProperties(ic->input(i), ctx->input_types[i],
-                                         &input_properties[i]);
+        shape_manager->AsTensorProperties(ic->input(i), ctx->input_types[i],
+                                          &input_properties[i]);
         input.port_id = i;
         GraphView::OutputPort fanin = graph_view.GetRegularFanin(input);
-        // Export tensor value (either const tensor or input_tensors_as_shapes)
-        // to input_properties.value.
+        // Export tensor value to input_properties.value.
         if (IsConstant(*fanin.node)) {
           const TensorProto& raw_val = fanin.node->attr().at("value").tensor();
           *input_properties[i].mutable_value() = raw_val;
+        } else if (ctx->input_tensor_protos.size() > i &&
+                   ctx->input_tensor_protos[i] != nullptr) {
+          *input_properties[i].mutable_value() = *ctx->input_tensor_protos[i];
         } else if (ic->input_tensors_as_shapes().size() > i &&
                    IsShapeFullyDefinedIntegerVectorOrScalar(
                        ic, ic->input(i), ic->input_tensors_as_shapes()[i],
@@ -1705,13 +1836,15 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
 
       output_properties.resize(ic->num_outputs());
       for (int i = 0; i < ic->num_outputs(); ++i) {
-        shape_manager.AsTensorProperties(ic->output(i), ctx->output_types[i],
-                                         &output_properties[i]);
-        // Export tensor value (either const tensor or input_tensors_as_shapes)
-        // to output_properties.value.
+        shape_manager->AsTensorProperties(ic->output(i), ctx->output_types[i],
+                                          &output_properties[i]);
+        // Export tensor value to output_properties.value.
         if (IsConstant(node)) {
           const TensorProto& raw_val = node.attr().at("value").tensor();
           *output_properties[i].mutable_value() = raw_val;
+        } else if (ctx->output_tensor_protos.size() > i &&
+                   ctx->output_tensor_protos[i] != nullptr) {
+          *output_properties[i].mutable_value() = *ctx->output_tensor_protos[i];
         } else if (ctx->output_tensors_as_shapes.size() > i &&
                    IsShapeFullyDefinedIntegerVectorOrScalar(
                        ic, ic->output(i), ctx->output_tensors_as_shapes[i],
@@ -1792,12 +1925,12 @@ Status GraphProperties::InferFromCostGraph(const CostGraphDef& cost_graph) {
   return Status::OK();
 }
 
-bool GraphProperties::HasInputProperties(const string& name) const {
-  return input_properties_.find(name) != input_properties_.end();
+bool GraphProperties::HasInputProperties(const string& node_name) const {
+  return input_properties_.find(node_name) != input_properties_.end();
 }
 
-bool GraphProperties::HasOutputProperties(const string& name) const {
-  return output_properties_.find(name) != output_properties_.end();
+bool GraphProperties::HasOutputProperties(const string& node_name) const {
+  return output_properties_.find(node_name) != output_properties_.end();
 }
 
 const std::vector<OpInfo::TensorProperties>&
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 28fd7565ccf5ba816a38e148a2cec82862139004..fbae1ca5b437c1d73c38da3ef580a9e49e8c84c5 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -63,8 +63,8 @@ class GraphProperties {
   // values strictly less than -1 to encode symbolic dimensions: although we
   // don't know the actual value of the symbolic dimension, we know that all the
   // dimensions denoted by the same negative value are the equal.
-  bool HasInputProperties(const string& name) const;
-  bool HasOutputProperties(const string& name) const;
+  bool HasInputProperties(const string& node_name) const;
+  bool HasOutputProperties(const string& node_name) const;
   const std::vector<OpInfo::TensorProperties>& GetInputProperties(
       const string& node_name) const;
   const std::vector<OpInfo::TensorProperties>& GetOutputProperties(
@@ -123,8 +123,10 @@ class GraphProperties {
 
   // Data members
   const GrapplerItem& item_;
-  std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
-  std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
+  std::unordered_map<string, std::vector<OpInfo::TensorProperties>>
+      input_properties_;
+  std::unordered_map<string, std::vector<OpInfo::TensorProperties>>
+      output_properties_;
   const std::vector<OpInfo::TensorProperties> missing_properties_;
 };
 
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 362092a6cfa915b0427abcd939dad31b117ba9a8..5aae773994c3136b3f41b2ae7934073cbb1daf98 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -285,6 +286,37 @@ TEST_F(GraphPropertiesTest, Variables) {
   }
 }
 
+TEST_F(GraphPropertiesTest, ReadVariableOpAfterEnter) {
+  GrapplerItem item;
+  TF_CHECK_OK(NodeDefBuilder("Var", "VarHandleOp")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("shape", TensorShape({3, 7}))
+                  .Finalize(item.graph.add_node()));
+  TF_CHECK_OK(NodeDefBuilder("Enter", "Enter")
+                  .Attr("T", DT_RESOURCE)
+                  .Attr("frame_name", "while_context")
+                  .Attr("is_constant", true)
+                  .Attr("parallel_iterations", 10)
+                  .Input("Var", 0, DT_RESOURCE)
+                  .Finalize(item.graph.add_node()));
+  TF_CHECK_OK(NodeDefBuilder("ReadVariableOpAfterEnter", "ReadVariableOp")
+                  .Attr("dtype", DT_FLOAT)
+                  .Input("Enter", 0, DT_RESOURCE)
+                  .Finalize(item.graph.add_node()));
+
+  // LOG(INFO) << item.graph.DebugString();
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+  const auto props = properties.GetOutputProperties("ReadVariableOpAfterEnter");
+  EXPECT_EQ(1, props.size());
+  const OpInfo::TensorProperties& prop = props[0];
+  EXPECT_EQ(DT_FLOAT, prop.dtype());
+  EXPECT_FALSE(prop.shape().unknown_rank());
+  EXPECT_EQ(2, prop.shape().dim_size());
+  EXPECT_EQ(3, prop.shape().dim(0).size());
+  EXPECT_EQ(7, prop.shape().dim(1).size());
+}
+
 TEST_F(GraphPropertiesTest, VarHandles) {
   GrapplerItem item;
   TF_CHECK_OK(NodeDefBuilder("Var", "VarHandleOp")
@@ -865,8 +897,8 @@ TEST_F(GraphPropertiesTest, TensorAsShapesPropagation) {
   EXPECT_TRUE(properties.GetOutputProperties("b1")[0].has_value());
   EXPECT_TRUE(properties.GetOutputProperties("c")[0].has_value());
   EXPECT_TRUE(properties.GetInputProperties("c1")[0].has_value());
-  // Note that we propagate tensro value of only 1D vector and scalar.
-  EXPECT_FALSE(properties.GetOutputProperties("c1")[0].has_value());
+  // Note that we propagate tensor value of only 1D vector and scalar.
+  EXPECT_TRUE(properties.GetOutputProperties("c1")[0].has_value());
 
   // Check values.
   ExpectTensorValues({5, 7}, properties.GetOutputProperties("a")[0].value());
@@ -883,7 +915,8 @@ TEST_F(GraphPropertiesTest, TensorAsShapesPropagation) {
                      properties.GetOutputProperties("c")[0].value());
   ExpectTensorValues({c_values},
                      properties.GetInputProperties("c1")[0].value());
-  // No output value for c1, as it's neither 1D vector nor scalar.
+  ExpectTensorValues({c_values},
+                     properties.GetOutputProperties("c1")[0].value());
 }
 
 TEST_F(GraphPropertiesTest, IdentityPassingShape) {
@@ -928,6 +961,50 @@ TEST_F(GraphPropertiesTest, PackWithConstInput) {
   EXPECT_EQ("float: [1,2,3,4]", PropToString(out_prop0));
 }
 
+TEST_F(GraphPropertiesTest, RankOp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("Const"), 1, {4, 4, 4});
+  Output r = ops::Rank(s.WithOpName("Rank"), c);
+  Output i = ops::Identity(s.WithOpName("Identity"), r);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+  const auto rank_props = properties.GetOutputProperties("Rank");
+  const OpInfo::TensorProperties rank_prop0 = rank_props[0];
+  EXPECT_EQ("int32: []", PropToString(rank_prop0));
+  EXPECT_TRUE(rank_prop0.has_value());
+  ExpectTensorValues({3}, rank_prop0.value());
+  const auto identity_props = properties.GetOutputProperties("Identity");
+  const OpInfo::TensorProperties identity_props0 = identity_props[0];
+  EXPECT_EQ("int32: []", PropToString(identity_props0));
+  EXPECT_TRUE(identity_props0.has_value());
+  ExpectTensorValues({3}, identity_props0.value());
+}
+
+TEST_F(GraphPropertiesTest, SizeOp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("Const"), 1, {1, 2, 3, 4});
+  Output r = ops::Size(s.WithOpName("Size"), c);
+  Output i = ops::Identity(s.WithOpName("Identity"), r);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+  const auto size_props = properties.GetOutputProperties("Size");
+  const OpInfo::TensorProperties size_props0 = size_props[0];
+  EXPECT_EQ("int32: []", PropToString(size_props0));
+  EXPECT_TRUE(size_props0.has_value());
+  ExpectTensorValues({24}, size_props0.value());
+  const auto identity_props = properties.GetOutputProperties("Identity");
+  const OpInfo::TensorProperties identity_props0 = identity_props[0];
+  EXPECT_EQ("int32: []", PropToString(identity_props0));
+  EXPECT_TRUE(identity_props0.has_value());
+  ExpectTensorValues({24}, identity_props0.value());
+}
+
 TEST_F(GraphPropertiesTest, PackWithIdentityInput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   // Same to PackWithConstInput test case, but a, b, c, and d are Identity ops
@@ -1340,6 +1417,8 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
   Output zero = ops::Const(s.WithOpName("zero"), 0.0f, {});
   Output g = ops::Shape(s.WithOpName("g"), c);
   Output h = ops::Fill(s.WithOpName("h"), g, zero);
+  Output zero_idx = ops::Const(s.WithOpName("zero_idx"), {0}, {1});
+  Output j = ops::Sum(s.WithOpName("j"), a, zero_idx);
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -1382,6 +1461,10 @@ TEST_F(GraphPropertiesTest, SymbolicShapes) {
   ASSERT_EQ(2, shape_f.dim_size());
   EXPECT_EQ(shape_h.dim(0).size(), shape_c.dim(0).size());
   EXPECT_EQ(shape_h.dim(1).size(), shape_c.dim(1).size());
+
+  const auto shape_j = properties.GetOutputProperties("j").at(0).shape();
+  ASSERT_EQ(1, shape_j.dim_size());
+  EXPECT_EQ(shape_j.dim(0).size(), shape_a.dim(1).size());
 }
 
 TEST_F(GraphPropertiesTest, DoNotValidateColocationConstraints) {
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 71f4d9fd05cd15581b7631d403f52823e4310f1e..0e55209238555deb88d69ba97fc4df8cb11d3677 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -311,8 +311,8 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {"Square", EIGEN_COST(scalar_square_op<float>)},
       {"Tanh", EIGEN_COST(scalar_tanh_op<float>)},
       {"Relu", EIGEN_COST(scalar_max_op<float>)},
-      {"Sigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
-      {"QuantizedSigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
+      {"Sigmoid", EIGEN_COST(scalar_logistic_op<float>)},
+      {"QuantizedSigmoid", EIGEN_COST(scalar_logistic_op<float>)},
       {"Sign", EIGEN_COST(scalar_sign_op<float>)},
       {"Sin", EIGEN_COST(scalar_sin_op<float>)},
       {"Tan", EIGEN_COST(scalar_tan_op<float>)},
@@ -372,7 +372,7 @@ Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
   return costs;
 }
 
-OpLevelCostEstimator::DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
+DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
     const DeviceProperties& device) const {
   double gflops = -1;
   double gb_per_sec = -1;
@@ -420,7 +420,7 @@ OpLevelCostEstimator::DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
   DCHECK_LT(0, gflops) << device.DebugString();
   DCHECK_LT(0, gb_per_sec) << device.DebugString();
 
-  return {gflops, gb_per_sec};
+  return DeviceInfo(gflops, gb_per_sec);
 }
 
 Costs OpLevelCostEstimator::PredictCwiseOp(const OpContext& op_context) const {
@@ -478,8 +478,8 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
   bool unknown_shapes = false;
   const double input_size = CalculateInputSize(op_info, &unknown_shapes);
   const double output_size = CalculateOutputSize(op_info, &unknown_shapes);
-  const double total_io_bytes = input_size + output_size;
-  Costs costs = PredictOpCountBasedCost(operations, total_io_bytes, op_info);
+  Costs costs =
+      PredictOpCountBasedCost(operations, input_size, output_size, op_info);
   costs.inaccurate = unknown_shapes;
   costs.num_ops_with_unknown_shapes = unknown_shapes;
   costs.max_memory = output_size;
@@ -487,9 +487,13 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
 }
 
 Costs OpLevelCostEstimator::PredictOpCountBasedCost(
-    double operations, double total_io_bytes, const OpInfo& op_info) const {
+    double operations, double input_io_bytes, double output_io_bytes,
+    const OpInfo& op_info) const {
+  double total_io_bytes = input_io_bytes + output_io_bytes;
   const DeviceInfo device_info = GetDeviceInfo(op_info.device());
-  if (device_info.gigaops <= 0 || device_info.gb_per_sec <= 0) {
+  if (device_info.gigaops <= 0 || device_info.gb_per_sec <= 0 ||
+      device_info.intermediate_read_gb_per_sec <= 0 ||
+      device_info.intermediate_write_gb_per_sec <= 0) {
     VLOG(1) << "BAD DEVICE. Op:" << op_info.op()
             << " device type:" << op_info.device().type()
             << " device model:" << op_info.device().model();
@@ -504,9 +508,29 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
   VLOG(1) << "Op:" << op_info.op() << " Size (KB):" << (total_io_bytes) / 1e3
           << " Memory Time (ns):" << memory_cost.count();
 
+  // Check if bytes > 0.  If it's not and the bandwidth is set to infinity
+  // then the result would be undefined.
+  double intermediate_read_time =
+      (input_io_bytes > 0)
+          ? std::ceil(input_io_bytes / device_info.intermediate_read_gb_per_sec)
+          : 0;
+
+  double intermediate_write_time =
+      (output_io_bytes > 0)
+          ? std::ceil(output_io_bytes /
+                      device_info.intermediate_write_gb_per_sec)
+          : 0;
+
+  Costs::NanoSeconds intermediate_memory_cost(intermediate_read_time +
+                                              intermediate_write_time);
+  VLOG(1) << "Op:" << op_info.op() << " Size (KB):" << (total_io_bytes) / 1e3
+          << " Intermediate Memory Time (ns):"
+          << intermediate_memory_cost.count();
+
   Costs costs;
   costs.compute_time = compute_cost;
   costs.memory_time = memory_cost;
+  costs.intermediate_memory_time = intermediate_memory_cost;
   CombineCostsAndUpdateExecutionTime(&costs);
   return costs;
 }
@@ -1273,8 +1297,8 @@ Costs OpLevelCostEstimator::PredictGatherOrSlice(
         CalculateTensorElementCount(op_info.inputs(1), &unknown_shapes);
   }
 
-  const double total_io = input_size + output_size;
-  Costs costs = PredictOpCountBasedCost(op_count, total_io, op_info);
+  Costs costs =
+      PredictOpCountBasedCost(op_count, input_size, output_size, op_info);
   costs.inaccurate = unknown_shapes;
   costs.num_ops_with_unknown_shapes = unknown_shapes;
   costs.max_memory = output_size;
@@ -1291,12 +1315,15 @@ Costs OpLevelCostEstimator::PredictFusedOp(
   // operations here; so we simply add the compute times of each component
   // operation, then update the execution time.
   Costs fused_cost = PredictOpCountBasedCost(0, op_context.op_info);
+
   fused_cost.compute_time = 0;
   fused_cost.inaccurate = false;
   for (auto& fused_op : fused_op_contexts) {
     auto op_cost = PredictCosts(fused_op);
+
     fused_cost.compute_time += op_cost.compute_time;
     fused_cost.inaccurate |= op_cost.inaccurate;
+    fused_cost.intermediate_memory_time += op_cost.intermediate_memory_time;
   }
 
   CombineCostsAndUpdateExecutionTime(&fused_cost);
@@ -1415,8 +1442,8 @@ Costs OpLevelCostEstimator::PredictMaxPool(const OpContext& op_context) const {
   const double total_output_size =
       CalculateOutputSize(op_info, &found_unknown_shapes);
 
-  Costs costs = PredictOpCountBasedCost(
-      ops, total_input_size + total_output_size, op_info);
+  Costs costs = PredictOpCountBasedCost(ops, total_input_size,
+                                        total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
@@ -1458,8 +1485,8 @@ Costs OpLevelCostEstimator::PredictMaxPoolGrad(
   const double total_output_size =
       CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
 
-  Costs costs = PredictOpCountBasedCost(
-      ops, total_input_size + total_output_size, op_info);
+  Costs costs = PredictOpCountBasedCost(ops, total_input_size,
+                                        total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
@@ -1491,8 +1518,8 @@ Costs OpLevelCostEstimator::PredictAvgPool(const OpContext& op_context) const {
   const double total_output_size =
       CalculateOutputSize(op_info, &found_unknown_shapes);
 
-  Costs costs = PredictOpCountBasedCost(
-      ops, total_input_size + total_output_size, op_info);
+  Costs costs = PredictOpCountBasedCost(ops, total_input_size,
+                                        total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
@@ -1544,8 +1571,8 @@ Costs OpLevelCostEstimator::PredictAvgPoolGrad(
   const double total_output_size =
       CalculateOutputSize(op_info, &found_unknown_shapes);
 
-  Costs costs = PredictOpCountBasedCost(
-      ops, total_input_size + total_output_size, op_info);
+  Costs costs = PredictOpCountBasedCost(ops, total_input_size,
+                                        total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
@@ -1590,9 +1617,9 @@ Costs OpLevelCostEstimator::PredictFusedBatchNorm(
     total_output_size = size_nhwc;
   }
 
-  Costs costs = PredictOpCountBasedCost(
-      ops, total_input_size + total_output_size + total_internal_read_size,
-      op_info);
+  Costs costs =
+      PredictOpCountBasedCost(ops, total_input_size + total_internal_read_size,
+                              total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
@@ -1624,9 +1651,9 @@ Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
   double total_internal_read_size = size_nhwc;
   double total_output_size = size_nhwc * 1 + size_c * 2;
 
-  Costs costs = PredictOpCountBasedCost(
-      ops, total_input_size + total_output_size + total_internal_read_size,
-      op_info);
+  Costs costs =
+      PredictOpCountBasedCost(ops, total_input_size + total_internal_read_size,
+                              total_output_size, op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   costs.max_memory = total_output_size;
@@ -1637,9 +1664,12 @@ Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
 void OpLevelCostEstimator::CombineCostsAndUpdateExecutionTime(
     Costs* costs) const {
   if (compute_memory_overlap_) {
-    costs->execution_time = std::max(costs->compute_time, costs->memory_time);
+    costs->execution_time =
+        std::max(costs->intermediate_memory_time,
+                 std::max(costs->compute_time, costs->memory_time));
   } else {
-    costs->execution_time = costs->compute_time + costs->memory_time;
+    costs->execution_time = costs->compute_time + costs->memory_time +
+                            costs->intermediate_memory_time;
   }
 }
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index a277dfdf65dfc7604c79332a32293ce14c4378f7..84dd9213f773b538db71f0999c7ffd0b34e1881c 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -40,12 +40,6 @@ class OpLevelCostEstimator {
 
   virtual Costs PredictCosts(const OpContext& op_context) const;
 
-  // Basic device performance info, sufficient for roofline estimate.
-  struct DeviceInfo {
-    double gigaops;     // Billions of operations executed per second.
-    double gb_per_sec;  // Bandwidth to main memory in GB per second.
-  };
-
   // Returns basic device performance info.
   virtual DeviceInfo GetDeviceInfo(const DeviceProperties& device) const;
 
@@ -60,7 +54,8 @@ class OpLevelCostEstimator {
   // Naive cost estimate based on the given operations count and the given total
   // io size in bytes. Sizes of op_info inputs and outputs are not taken into
   // consideration.
-  Costs PredictOpCountBasedCost(double operations, double total_io_bytes,
+  Costs PredictOpCountBasedCost(double operations, double input_io_bytes,
+                                double output_io_bytes,
                                 const OpInfo& op_info) const;
 
   // This family of routines counts the number of operations to perform the
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 998bd59dce37e320b847852fe0c5529c5bccebc4..c9ce63a8ef2aa301f690cec16fcd03fb83309c7c 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -832,7 +832,7 @@ TEST_F(OpLevelCostEstimatorTest, GetTensorShapeProtoFromTensorProto) {
   EXPECT_FALSE(
       GetTensorShapeProtoFromTensorProto(tensor_proto, &tensor_shape_proto));
 
-  // Check GetTensorShapeProtoFromTensorProto() resturns correct values.
+  // Check GetTensorShapeProtoFromTensorProto() returns correct values.
   {
     std::vector<int64> shape_expected = {10, 20, 30, 40};
     GetTensorProto(DT_INT32, {4}, shape_expected,
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 5415324b4862ce975a8f17f87ec4d9dd914ba1fb..7d868a3679e5b3d5759fdd951e726cfe7af3babf 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -74,7 +74,8 @@ static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
       }
       break;
     }
-    default: {}
+    default: {
+    }
   }
   return tensors;
 }
@@ -201,6 +202,43 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
   return inputs;
 }
 
+int64 CalculateTensorSize(const OpInfo::TensorProperties& prop) {
+  int64 size = DataTypeSize(BaseType(prop.dtype()));
+  TensorShapeProto shape = prop.shape();
+
+  // Can't infer the size if the rank is unknown. It has to be at least a
+  // scalar though.
+  if (shape.unknown_rank()) {
+    VLOG(2) << "CalculateTensorSize() -- unknown rank";
+    return size;
+  }
+
+  // If one of the dimensions is unknown statically, assume it's at least one.
+  for (int i = 0; i < shape.dim_size(); ++i) {
+    if (shape.dim(i).size() < 0) {
+      shape.mutable_dim(i)->set_size(1);
+      VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i;
+    }
+  }
+
+  int64 num_elems = TensorShape(shape).num_elements();
+  return num_elems * size;
+}
+
+int64 CalculateOutputSize(
+    const std::vector<OpInfo::TensorProperties>& output_properties,
+    const int port_num) {
+  if (port_num < 0) return 4;  // 4B for control dependency.
+
+  if (port_num >= output_properties.size()) {
+    LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num
+               << " >= output_properties.size(): " << output_properties.size();
+    return 0;
+  }
+
+  return CalculateTensorSize(output_properties[port_num]);
+}
+
 DeviceProperties GetDeviceInfo(const string& device_str) {
   DeviceProperties unknown;
   unknown.set_type("UNKNOWN");
diff --git a/tensorflow/core/grappler/costs/utils.h b/tensorflow/core/grappler/costs/utils.h
index 5fd67177121be28d8ed727986ffba406dd0450d0..ea64e5a41dff54e455d71d2841807a088e3c44ce 100644
--- a/tensorflow/core/grappler/costs/utils.h
+++ b/tensorflow/core/grappler/costs/utils.h
@@ -43,6 +43,17 @@ std::vector<OpInfo::TensorProperties> FindInputFeatures(
     const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
     const std::unordered_map<string, const NodeDef*>& name_to_node);
 
+// Returns the size of tensor (unit: bytes). For tensor shape with unknown rank,
+// it assumes the tensor to be scalar. For any unknown dimension, it assumes
+// size one.
+int64 CalculateTensorSize(const OpInfo::TensorProperties& prop);
+
+// Returns the size of output at port_num (unit: bytes). A special case is
+// port_num -1, which is for control dependency and assumed to be 4 bytes.
+int64 CalculateOutputSize(
+    const std::vector<OpInfo::TensorProperties>& output_properties,
+    int port_num);
+
 // Returns the DeviceProperties of the device on which 'node' runs.
 DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node);
 DeviceProperties GetDeviceInfo(const string& device_str);
diff --git a/tensorflow/core/grappler/costs/utils_test.cc b/tensorflow/core/grappler/costs/utils_test.cc
index baa654f475776b1b734ab06c859b4bad11b447e5..db5c11f0fe102d86c639ddbfc78e74d75b7f0353 100644
--- a/tensorflow/core/grappler/costs/utils_test.cc
+++ b/tensorflow/core/grappler/costs/utils_test.cc
@@ -26,36 +26,42 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-class UtilsTest : public ::testing::Test {
- public:
-  void CreateConstOp(const string& name, std::initializer_list<int64> dims,
-                     NodeDef* node) {
-    Tensor tensor(DT_FLOAT, TensorShape(dims));
-    for (int64 i = 0; i < tensor.NumElements(); ++i) {
-      tensor.flat<float>()(i) = i / 10.0f;
-    }
-    TF_CHECK_OK(NodeDefBuilder(name, "Const")
-                    .Attr("dtype", DT_FLOAT)
-                    .Attr("value", tensor)
-                    .Finalize(node));
-  }
+namespace {
 
-  void CreateConstSizesOp(const string& name, const std::vector<int32>& sizes,
-                          NodeDef* node) {
-    TensorShape shape;
-    shape.AddDim(sizes.size());
-    Tensor tensor(DT_INT32, shape);
-    for (int64 i = 0; i < tensor.NumElements(); ++i) {
-      tensor.flat<int32>()(i) = sizes[i];
-    }
-    TF_CHECK_OK(NodeDefBuilder(name, "Const")
-                    .Attr("dtype", DT_INT32)
-                    .Attr("value", tensor)
-                    .Finalize(node));
-  }
-};
+void CreateConstOp(const string& name, std::initializer_list<int64> dims,
+                   NodeDef* node) {
+  Tensor tensor(DT_FLOAT, TensorShape(dims));
+  for (int64 i = 0; i < tensor.NumElements(); ++i)
+    tensor.flat<float>()(i) = i / 10.0f;
+  TF_CHECK_OK(NodeDefBuilder(name, "Const")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("value", tensor)
+                  .Finalize(node));
+}
 
-TEST_F(UtilsTest, ConvOpInfo) {
+void CreateConstSizesOp(const string& name, const std::vector<int32>& sizes,
+                        NodeDef* node) {
+  TensorShape shape;
+  shape.AddDim(sizes.size());
+  Tensor tensor(DT_INT32, shape);
+  for (int64 i = 0; i < tensor.NumElements(); ++i)
+    tensor.flat<int32>()(i) = sizes[i];
+  TF_CHECK_OK(NodeDefBuilder(name, "Const")
+                  .Attr("dtype", DT_INT32)
+                  .Attr("value", tensor)
+                  .Finalize(node));
+}
+
+// Helper method for converting shapes vector to TensorProperty.
+OpInfo::TensorProperties ShapeToTensorProperty(const std::vector<int>& shapes,
+                                               const DataType& data_type) {
+  OpInfo::TensorProperties prop;
+  prop.set_dtype(data_type);
+  for (int shape : shapes) prop.mutable_shape()->add_dim()->set_size(shape);
+  return prop;
+}
+
+TEST(UtilsTest, ConvOpInfo) {
   int batch = 32;
   int rows = 7;
   int cols = 9;
@@ -146,7 +152,7 @@ TEST_F(UtilsTest, ConvOpInfo) {
   }
 }
 
-TEST_F(UtilsTest, TestSkipControlInput) {
+TEST(UtilsTest, TestSkipControlInput) {
   GraphDef graph;
   TF_CHECK_OK(NodeDefBuilder("constant", "Const")
                   .Attr("dtype", DT_INT32)
@@ -172,6 +178,52 @@ TEST_F(UtilsTest, TestSkipControlInput) {
   EXPECT_TRUE(node_found);
 }
 
+TEST(UtilsTest, CalculateTensorSize) {
+  // Test normal usage.
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 1,
+            CalculateTensorSize(ShapeToTensorProperty({1}, DT_FLOAT)));
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 4 * 4,
+            CalculateTensorSize(ShapeToTensorProperty({4, 4}, DT_FLOAT)));
+  EXPECT_EQ(DataTypeSize(DT_HALF) * 10 * 10 * 10,
+            CalculateTensorSize(ShapeToTensorProperty({10, 10, 10}, DT_HALF)));
+  EXPECT_EQ(
+      DataTypeSize(DT_FLOAT) * 100 * 7 * 8 * 99,
+      CalculateTensorSize(ShapeToTensorProperty({100, 7, 8, 99}, DT_FLOAT)));
+
+  // Test unknown rank: assumes the tensor to be a scalar.
+  OpInfo::TensorProperties t = ShapeToTensorProperty({100, 7, 8, 99}, DT_FLOAT);
+  t.mutable_shape()->set_unknown_rank(true);
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 1, CalculateTensorSize(t));
+
+  // Test unknown shape: assumes unknown shape (-1) to have size 1.
+  EXPECT_EQ(
+      DataTypeSize(DT_FLOAT) * 1 * 7 * 8 * 99,
+      CalculateTensorSize(ShapeToTensorProperty({-1, 7, 8, 99}, DT_FLOAT)));
+  EXPECT_EQ(
+      DataTypeSize(DT_FLOAT) * 1 * 7 * 1 * 99,
+      CalculateTensorSize(ShapeToTensorProperty({-1, 7, -1, 99}, DT_FLOAT)));
+}
+
+TEST(UtilsTest, CalculateOutputSize) {
+  // Create a set of tensor properties.
+  std::vector<OpInfo::TensorProperties> output = {
+      ShapeToTensorProperty({4, 4}, DT_FLOAT),          // 0
+      ShapeToTensorProperty({-1, 7, -1, 99}, DT_FLOAT)  // 1
+  };
+
+  // Test valid outputs.
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 4 * 4, CalculateOutputSize(output, 0));
+  EXPECT_EQ(DataTypeSize(DT_FLOAT) * 1 * 7 * 1 * 99,
+            CalculateOutputSize(output, 1));
+
+  // port_num -1 is for control dependency: hard coded 4B.
+  EXPECT_EQ(4, CalculateOutputSize(output, -1));
+
+  // Invalid port_num (though it may be an error) shall yield zero
+  // output size.
+  EXPECT_EQ(0, CalculateOutputSize(output, 2));
+}
+
 // Class for testing TensorSizeHistogram.
 class TestTensorSizeHistogram : public TensorSizeHistogram {
  public:
@@ -285,5 +337,7 @@ TEST(DeviceClassTest, GetDeviceClassForNonChannelDevice) {
   EXPECT_EQ("//GPU", GetDeviceClassForNonChannelDevice("/device:GPU:7"));
 }
 
+}  // namespace
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 037a823096ce23f64cdbdfcf684acb8d8ad8fe08..ae5200b359232153f96c9ffa21a505d2a056d55d 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -47,6 +47,7 @@ Costs CombineCosts(const Costs& left, const Costs& right) {
   result.execution_time += right.execution_time;
   result.compute_time += right.compute_time;
   result.memory_time += right.memory_time;
+  result.intermediate_memory_time += right.intermediate_memory_time;
 
   result.num_ops_total += right.num_ops_total;
   if (right.inaccurate) result.inaccurate = true;
@@ -274,21 +275,23 @@ bool CompositeNodeManager::Empty() const {
   return empty && send_manager_.Empty() && recv_manager_.Empty();
 }
 
-VirtualScheduler::VirtualScheduler(const GrapplerItem* grappler_item,
-                                   const bool use_static_shapes,
-                                   Cluster* cluster,
-                                   ReadyNodeManager* ready_nodes)
-    : ready_nodes_(ready_nodes),
-      graph_costs_(Costs::ZeroCosts()),
-      graph_properties_(*grappler_item),
-      cluster_(cluster),
-      grappler_item_(grappler_item),
-      use_static_shapes_(use_static_shapes),
-      placer_(cluster) {
-  graph_costs_.num_ops_total = 0;
-  initialized_ = false;
+std::unique_ptr<ReadyNodeManager> ReadyNodeManagerFactory(
+    const string& ready_node_manager) {
+  if (ready_node_manager == "FIFO") {
+    return absl::make_unique<FIFOManager>();
+  } else if (ready_node_manager == "LIFO") {
+    return absl::make_unique<LIFOManager>();
+  } else if (ready_node_manager == "FirstReady") {
+    return absl::make_unique<FirstReadyManager>();
+  } else if (ready_node_manager == "Composite") {
+    return absl::make_unique<CompositeNodeManager>();
+  }
+  LOG(FATAL) << "Not a valid ready node manager: " << ready_node_manager;
+  return nullptr;
 }
 
+// TODO(pcma): Delete this deprecated API after power_analyzer.cc is modeified
+// to use the new factory API
 ReadyNodeManager* VirtualScheduler::ReadyNodeManagerFactory(
     const string& ready_node_manager) {
   if (ready_node_manager == "FIFO") {
@@ -303,21 +306,69 @@ ReadyNodeManager* VirtualScheduler::ReadyNodeManagerFactory(
   LOG(FATAL) << "Not a valid ready node manager: " << ready_node_manager;
 }
 
+VirtualScheduler::VirtualScheduler(const GrapplerItem* grappler_item,
+                                   const bool use_static_shapes,
+                                   Cluster* cluster,
+                                   ReadyNodeManager* ready_nodes)
+    : ready_nodes_(ready_nodes),
+      graph_costs_(Costs::ZeroCosts()),
+      graph_properties_(new GraphProperties(*grappler_item)),
+      cluster_(cluster),
+      grappler_item_(grappler_item),
+      use_static_shapes_(use_static_shapes),
+      placer_(cluster) {
+  graph_costs_.num_ops_total = 0;
+  initialized_ = false;
+}
+
+VirtualScheduler::VirtualScheduler(const bool use_static_shapes,
+                                   Cluster* cluster,
+                                   ReadyNodeManager* ready_nodes)
+    : ready_nodes_(ready_nodes),
+      graph_costs_(Costs::ZeroCosts()),
+      cluster_(cluster),
+      use_static_shapes_(use_static_shapes),
+      placer_(cluster) {
+  graph_costs_.num_ops_total = 0;
+  initialized_ = false;
+}
+
+Status VirtualScheduler::Init(const GrapplerItem* item) {
+  grappler_item_ = item;
+  graph_properties_ = absl::make_unique<GraphProperties>(*item);
+
+  return Init();
+}
+
+// TODO(pcma): Merge with Init(const GrapplerItem* item) when this
+// deprecated API is deleted
 Status VirtualScheduler::Init() {
+  initialized_ = false;
+
+  // Clear all internal states so that the VirtualScheduler is reusable for
+  // different GrapplerItems
+  node_map_.clear();
+  device_.clear();
+  additional_nodes_.clear();
+
+  graph_costs_ = Costs::ZeroCosts();
+  graph_costs_.num_ops_total = 0;
+  op_to_cost_.clear();
+
+  op_counts_.clear();
+  op_costs_.clear();
+
   // Init() preprocesses the input grappler_item and graph_properties to extract
   // necessary information for emulating tensorflow op scheduling and
   // construct internal data structures (NodeState and DeviceState) for virtual
   // scheduling.
   ready_nodes_->Init(GetNodeStates());
+
   // Construct graph properties.
-  Status status;
   if (use_static_shapes_) {
-    status = graph_properties_.InferStatically(true);
+    TF_RETURN_IF_ERROR(graph_properties_->InferStatically(true));
   } else {
-    status = graph_properties_.InferDynamically(cluster_);
-  }
-  if (!status.ok()) {
-    return status;
+    TF_RETURN_IF_ERROR(graph_properties_->InferDynamically(cluster_));
   }
 
   const auto& graph = grappler_item_->graph;
@@ -418,8 +469,8 @@ Status VirtualScheduler::Init() {
         } else {
           // Different device, no cached copy; transfer input_node to the
           // curr_node's device.
-          auto send_and_recv =
-              CreateSendRecv(input_node, curr_node, input_node_name);
+          auto send_and_recv = CreateSendRecv(input_node, curr_node, input_node,
+                                              input_node_name);
           // Note that CreateSendRecv() already connected input/output between
           // _Send and _Recv ops.
           const auto* send = send_and_recv.first;
@@ -473,6 +524,7 @@ Status VirtualScheduler::Init() {
     VLOG(1) << "Some feed nodes were not consumed by the fetch fanin: "
             << str_util::Join(feed_nodes, ",");
   }
+
   initialized_ = true;
   return Status::OK();
 }
@@ -511,7 +563,7 @@ void VirtualScheduler::MaybeUpdateInputOutput(const NodeDef* node) {
       outputs.push_back(control_message);
     } else {
       auto output_properties =
-          graph_properties_.GetOutputProperties(NodeName(input_source_name));
+          graph_properties_->GetOutputProperties(NodeName(input_source_name));
       // Like with HasInputProperties, if a node does not have output
       // properties, it's likely it was pruned during the shape inference run.
       if (!output_properties.empty()) {
@@ -556,7 +608,8 @@ string VirtualScheduler::ChannelDeviceName(const NodeDef* from,
 }
 
 std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
-    const NodeDef* from, const NodeDef* to, const string& input_name) {
+    const NodeDef* from, const NodeDef* to, const NodeDef* input_node,
+    const string& input_name) {
   CHECK(!initialized_) << "CreateSendRecv is called after Init().";
 
   // Connect "from" node to "to" node with _Send and _Recv such that
@@ -589,6 +642,12 @@ std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
   send_attr[kAttrInputSrc].set_s(input_name);
   send_attr[kAttrSrcDevice].set_s(DeviceName(from));
   send_attr[kAttrDstDevice].set_s(DeviceName(to));
+  // GraphDef generated by AutoGrappler has tensor_name field when removing
+  // _Send/_Recv nodes.
+  if (input_node->attr().count(kAttrTensorName)) {
+    send_attr[kAttrTensorName].set_s(
+        input_node->attr().at(kAttrTensorName).s());
+  }
 
   // _Recv op.
   auto* recv = new NodeDef();
@@ -598,6 +657,10 @@ std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
   recv->set_device(DeviceName(to));
   auto& recv_attr = *(recv->mutable_attr());
   recv_attr[kAttrInputSrc].set_s(input_name);
+  if (input_node->attr().count(kAttrTensorName)) {
+    recv_attr[kAttrTensorName].set_s(
+        input_node->attr().at(kAttrTensorName).s());
+  }
 
   // NodeState for _Send op.
   auto& send_node_state = GetNodeStateOrCreateIt(send);
@@ -664,9 +727,9 @@ NodeState& VirtualScheduler::GetNodeStateOrCreateIt(const NodeDef* node) {
   it = node_map_.emplace(node, NodeState()).first;
   auto& node_state = it->second;
   node_state.input_properties =
-      graph_properties_.GetInputProperties(node->name());
+      graph_properties_->GetInputProperties(node->name());
   node_state.output_properties =
-      graph_properties_.GetOutputProperties(node->name());
+      graph_properties_->GetOutputProperties(node->name());
 
   // Some ops may need further processing to the input / output properties:
   // _Send and _Recv.
@@ -695,38 +758,6 @@ NodeState& VirtualScheduler::GetNodeStateOrCreateIt(const NodeDef* node) {
   return it->second;
 }
 
-int64 VirtualScheduler::CalculateOutputSize(
-    const std::vector<OpInfo::TensorProperties>& output_properties,
-    const int port_num) const {
-  if (port_num < 0) {
-    return 4;  // 4B for control dependency.
-  }
-
-  if (port_num >= output_properties.size()) {
-    VLOG(3) << "VirtualScheduler::CalculateOutputSize() -- "
-            << "port_num: " << port_num
-            << " >= output_properties.size(): " << output_properties.size();
-    return 0;
-  }
-
-  const auto& output = output_properties[port_num];
-  int64 output_size = DataTypeSize(BaseType(output.dtype()));
-
-  for (const auto& dim : output.shape().dim()) {
-    auto dim_size = dim.size();
-    if (dim_size < 0) {
-      // Zero output size if there's any unknown dim.
-      output_size = 0;
-      VLOG(3) << "VirtualScheduler::CalculateOutputSize() -- "
-              << "unknown dim: " << output_size;
-      break;
-    }
-    output_size *= dim_size;
-  }
-
-  return output_size;
-}
-
 Costs& VirtualScheduler::FindOrCreateZero(const string& op_name,
                                           std::map<string, Costs>* op_cost) {
   auto it = op_cost->find(op_name);
@@ -744,7 +775,10 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   const NodeDef* node = ready_nodes_->GetCurrNode();
   const string& op_name = node->op();
 
-  // Also keep track of op counts and times per op (with their shapes).
+  auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
+  op_cost = CombineCosts(op_cost, node_costs);
+
+  // Also keep track of op counts and costs per op (with their shapes).
   OpContext op_context = GetCurrNode();
   string node_description = GetOpDescription(op_context.op_info);
   op_counts_[node_description] += 1;
@@ -752,9 +786,6 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
       std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
                      !node_costs.inaccurate);
 
-  auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
-  op_cost = CombineCosts(op_cost, node_costs);
-
   // Update node and device states.
   auto& node_state = node_map_[node];
   auto& device = device_[node_state.device_name];
@@ -795,7 +826,7 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
           << ", scheduled: " << node_state.time_scheduled.count()
           << ", finished: " << node_state.time_finished.count();
 
-  // Increment num_inputs_ready of the output nodes
+  // Increment num_inputs_ready of the output nodes and maybe add to ready nodes
   for (const auto& port_num_output_pair : node_state.outputs) {
     for (auto* output_node : port_num_output_pair.second) {
       auto& output_state = node_map_[output_node];
@@ -812,7 +843,7 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
     }
   }
 
-  // Increment num_outputs_executed of the input nodes.
+  // Increment num_outputs_executed of the input nodes and maybe update memory.
   for (const auto& input_port : node_state.inputs) {
     auto* input = input_port.first;
     auto port = input_port.second;
@@ -841,7 +872,6 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
     }
   }
 
-  // Remove the current node; assume FIFO.
   ready_nodes_->RemoveCurrNode();
 
   return !ready_nodes_->Empty();
@@ -857,23 +887,29 @@ Costs VirtualScheduler::Summary() const {
   VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
   VLOG(1) << "Expected compute time: " << graph_costs_.compute_time.count();
   VLOG(1) << "Expected memory time: " << graph_costs_.memory_time.count();
+  VLOG(1) << "Expected intermediate memory time: "
+          << graph_costs_.intermediate_memory_time.count();
   VLOG(1) << "Expected max memory: " << graph_costs_.max_memory;
   VLOG(1) << "Expected max per-op buffers: " << graph_costs_.max_per_op_buffers;
   VLOG(1) << "Expected max per-op streaming buffers: "
           << graph_costs_.max_per_op_streaming;
 
-  VLOG(1) << "Per-op execution time / compute time / memory time:";
+  VLOG(1) << "Per-op execution time / compute time / memory time"
+          << " / intermediate memory time:";
   for (const auto& op_cost_pair : op_to_cost_) {
     const auto& op = op_cost_pair.first;
     const auto& cost = op_cost_pair.second.execution_time.count();
     const auto& compute_cost = op_cost_pair.second.compute_time.count();
     const auto& memory_cost = op_cost_pair.second.memory_time.count();
+    const auto& intermediate_memory_cost =
+        op_cost_pair.second.intermediate_memory_time.count();
     const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
     if (cost) {  // Skip printing out zero-cost ops.
       VLOG(1) << strings::Printf(
-          " + %30s : %c %10lld / %10lld / %10lld", op.c_str(),
+          " + %30s : %c %10lld / %10lld / %10lld / %10lld", op.c_str(),
           (is_op_cost_accurate ? ' ' : '~'), static_cast<int64>(cost),
-          static_cast<int64>(compute_cost), static_cast<int64>(memory_cost));
+          static_cast<int64>(compute_cost), static_cast<int64>(memory_cost),
+          static_cast<int64>(intermediate_memory_cost));
     }
   }
 
@@ -926,7 +962,8 @@ Costs VirtualScheduler::Summary() const {
             << " having unknown shapes";
 
     VLOG(1) << "Per-op execution time / compute time / memory time "
-               "(and memory usage at peak memory usage):";
+            << " / intermediate memory time"
+            << " (and memory usage at peak memory usage):";
 
     // Profile non-persistent op memory usage.
     for (const auto& node_port : state.mem_usage_snapshot_at_peak) {
@@ -942,6 +979,8 @@ Costs VirtualScheduler::Summary() const {
       const auto& cost = op_cost_pair.second.execution_time.count();
       const auto& compute_cost = op_cost_pair.second.compute_time.count();
       const auto& memory_cost = op_cost_pair.second.memory_time.count();
+      const auto& intermediate_memory_cost =
+          op_cost_pair.second.intermediate_memory_time.count();
       total_compute_time_ns += op_cost_pair.second.execution_time;
       const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
       if (!is_op_cost_accurate) {
@@ -959,12 +998,13 @@ Costs VirtualScheduler::Summary() const {
                                : 0.0;
       if (cost || mem_usage_percent > 1.0) {
         // Print out only non-zero cost ops or ops with > 1% memory usage.
-        VLOG(1) << strings::Printf(" + %30s : %c %10lld / %10lld / %10lld",
-                                   op.c_str(),
-                                   (is_op_cost_accurate ? ' ' : '~'),
-                                   static_cast<int64>(cost),
-                                   static_cast<int64>(compute_cost),
-                                   static_cast<int64>(memory_cost))
+        VLOG(1) << strings::Printf(
+                       " + %30s : %c %10lld / %10lld / %10lld / %10lld",
+                       op.c_str(), (is_op_cost_accurate ? ' ' : '~'),
+                       static_cast<int64>(cost),
+                       static_cast<int64>(compute_cost),
+                       static_cast<int64>(memory_cost),
+                       static_cast<int64>(intermediate_memory_cost))
                 << " (" << strings::HumanReadableNumBytes(op_mem_usage) << " ["
                 << mem_usage_percent << "%] "
                 << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")");
@@ -993,7 +1033,8 @@ Costs VirtualScheduler::Summary() const {
       bool is_cost_accurate;
       std::tie(cost, is_cost_accurate) = op_costs_.at(item.first);
       VLOG(2) << "Node: " << item.first << ", Count: " << item.second
-              << ", Individual Cost: " << (is_cost_accurate ? "" : "~") << cost;
+              << ", Individual Cost: " << (is_cost_accurate ? "" : "~") << cost
+              << " us";
     }
   }
 
@@ -1003,11 +1044,12 @@ Costs VirtualScheduler::Summary() const {
 }
 
 Costs VirtualScheduler::Summary(RunMetadata* metadata) {
-  if (!metadata) {
-    return Summary();
-  }
+  if (metadata) GenerateRunMetadata(metadata);
+  return Summary();
+}
 
-  // Fill RunMetadata.
+void VirtualScheduler::GenerateRunMetadata(RunMetadata* metadata) {
+  // Fill RunMetadata's step_stats and partition_graphs fields.
   StepStats* stepstats = metadata->mutable_step_stats();
   for (const auto& device : device_) {
     GraphDef* device_partition_graph = metadata->add_partition_graphs();
@@ -1055,8 +1097,6 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) {
       *device_partition_graph->add_node() = *node_def;
     }
   }
-
-  return Summary();
 }
 
 const std::unordered_map<string, int64> VirtualScheduler::GetPeakMemoryUsage()
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 0e66e8a463f910b4e86a2aec17fef6ccfe7a2c8c..6a835f32d16d0850c06891f656b2bec910e26b78 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -107,10 +107,10 @@ struct DeviceState {
       mem_usage_snapshot_at_peak;
 
   Costs device_costs;
-  std::map<string, Costs> op_to_cost;    // Per-op cost.
-  std::map<string, int64> op_to_memory;  // Per-op memory usage at peak usage.
-  int64 memory_usage;
-  int64 max_memory_usage;
+  std::map<string, Costs> op_to_cost;  // Per-op cost.
+
+  int64 memory_usage;      // Current temporary memory usage
+  int64 max_memory_usage;  // Max temporary memory usage
 
   DeviceState() {
     device_costs = Costs::ZeroCosts();
@@ -248,17 +248,35 @@ class CompositeNodeManager : public ReadyNodeManager {
   const NodeDef* curr_node_;
 };
 
+// Constructs a ready node manager from the given string.
+std::unique_ptr<ReadyNodeManager> ReadyNodeManagerFactory(
+    const string& ready_node_manager);
+
 // The virtual scheduler emulates execution of nodes in a graph, considering
 // dependencies, device, etc.
 class VirtualScheduler {
  public:
+  // TODO(pcma): Modify power_analyzer.cc to use new API's.
+  // DEPRECATED
   VirtualScheduler(const GrapplerItem* grappler_item,
                    const bool use_static_shapes, Cluster* cluster,
                    ReadyNodeManager* ready_nodes);
-  // Initializes NodeState and DeviceState from grappler_item_ and
-  // graph_properties_.
+  // DEPRECATED
   Status Init();
 
+  // Does not take ownership of cluster or ready_nodes.
+  VirtualScheduler(bool use_static_shapes, Cluster* cluster,
+                   ReadyNodeManager* ready_nodes);
+  // Initializes the scheduler for the specific grappler item.
+  // Should be called immediately after the c'tor or when the scheduler will be
+  // reused for a new grappler item. All internal states of the scheduler
+  // related to the previous grappler item will be reset/cleared.
+  //
+  // This function should be called at least once after the scheduler is
+  // constructed. An uninitialized or failed-to-initialize scheduler will cause
+  // undefined behavior.
+  Status Init(const GrapplerItem* item);
+
   OpContext GetCurrNode() const;
 
   // Returns true if there is any node to be scheduled.
@@ -269,7 +287,11 @@ class VirtualScheduler {
   // Like the above, but writes detailed stats to RunMetadata.
   // If metadata is nullptr, then just calls and return Summary().
   Costs Summary(RunMetadata* metadata);
-  // Methods called from constructor.
+  // Generate RunMetadata's step_stats and partition_graphs fields from results
+  // of the virtual execution of the graph.
+  void GenerateRunMetadata(RunMetadata* metadata);
+
+  // DEPRECATED
   static ReadyNodeManager* ReadyNodeManagerFactory(
       const string& ready_node_manager);
 
@@ -283,25 +305,20 @@ class VirtualScheduler {
     return &node_map_;
   }
 
- protected:
-  // Returns the size of output at port_num (unit: bytes). A special case is
-  // port_num -1, which is for control dependency and assumed to be 4 bytes.
-  int64 CalculateOutputSize(
-      const std::vector<OpInfo::TensorProperties>& output_properties,
-      const int port_num) const;
-
  private:
   // Constants.
   const string kAttrInputSrc = "input_source_";
-  const string kAttrSrcDevice = "src_device_";
-  const string kAttrDstDevice = "dst_device_";
+  const string kAttrSrcDevice = "send_device";
+  const string kAttrDstDevice = "recv_device";
+  const string kAttrTensorName = "tensor_name";
   const string kChannelDevice = "Channel";
 
   // Methods called from Init(). Fails if initialize_ is set.
   void MaybeUpdateInputOutput(const NodeDef* node);
   NodeState& GetNodeStateOrCreateIt(const NodeDef* node);
   std::pair<const NodeDef*, const NodeDef*> CreateSendRecv(
-      const NodeDef* from, const NodeDef* to, const string& input_name);
+      const NodeDef* from, const NodeDef* to, const NodeDef* input_node,
+      const string& input_name);
   string DeviceName(const NodeDef* node) const;
   string SanitizedDeviceName(const NodeDef* node) const;
   string ChannelDeviceName(const NodeDef* from, const NodeDef* to) const;
@@ -321,8 +338,11 @@ class VirtualScheduler {
   std::vector<std::unique_ptr<NodeDef>> additional_nodes_;
 
   // Stats:
-  std::map<string, int> op_counts_;  // Op counts with key with input shape.
-  // Individual op costs (with input shapes).
+  // Op counts with key with input shape.
+  // Example key: "[Op=AssignSub, input_shapes=[[7,1,160,160][7,1,160,160]]"
+  std::map<string, int> op_counts_;
+  // Individual op costs with key with input shape.
+  // Integer field for execution time in micro seconds.
   // Boolean field for whether the cost is accurate.
   std::map<string, std::pair<int, bool>> op_costs_;
 
@@ -330,8 +350,8 @@ class VirtualScheduler {
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
 
   // Auxiliary data structures for constructing NodeState and DeviceState.
-  GraphProperties graph_properties_;
-  Cluster* cluster_;  // Not owned.
+  std::unique_ptr<GraphProperties> graph_properties_;  // Initialized in Init().
+  Cluster* cluster_;                                   // Not owned.
 
   const GrapplerItem* grappler_item_;  // Not owned.
   bool use_static_shapes_;
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 80889afc86968c2a9bc96bcdb33968bc8559cc94..0a695458e17a576ecda631b576d4ace4aa947dbc 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -19,21 +19,20 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_description.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/costs/virtual_placer.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace grappler {
+
 // Class for testing virtual scheduler.
 class TestVirtualScheduler : public VirtualScheduler {
  public:
-  TestVirtualScheduler(const GrapplerItem* grappler_item,
-                       const bool use_static_shapes, Cluster* cluster)
-      : VirtualScheduler(grappler_item, use_static_shapes, cluster,
-                         &ready_node_manager_) {}
+  TestVirtualScheduler(const bool use_static_shapes, Cluster* cluster)
+      : VirtualScheduler(use_static_shapes, cluster, &ready_node_manager_) {}
 
-  FRIEND_TEST(VirtualSchedulerTest, CalculateOutputSize);
   FRIEND_TEST(VirtualSchedulerTest, MemoryUsage);
   FRIEND_TEST(VirtualSchedulerTest, ControlDependency);
   FRIEND_TEST(VirtualSchedulerTest, ComplexDependency);
@@ -46,6 +45,30 @@ class TestVirtualScheduler : public VirtualScheduler {
 
 class VirtualSchedulerTest : public ::testing::Test {
  protected:
+  VirtualSchedulerTest() {
+    // node1_ to node6_ on kCPU0, with time_ready in reverse_order.
+    NodeSetUp("Node1", kConv2D, kCPU0, 6000, &node1_);
+    NodeSetUp("Node2", kConv2D, kCPU0, 5000, &node2_);
+    NodeSetUp("Node3", kConv2D, kCPU0, 4000, &node3_);
+    NodeSetUp("Node4", kConv2D, kCPU0, 3000, &node4_);
+    NodeSetUp("Node5", kConv2D, kCPU0, 2000, &node5_);
+    NodeSetUp("Node6", kConv2D, kCPU0, 1000, &node6_);
+
+    // Initializes cluster_ and scheduler_.
+    std::unordered_map<string, DeviceProperties> devices;
+
+    // Set some dummy CPU properties
+    DeviceProperties cpu_device = GetDummyCPUDevice();
+
+    // IMPORTANT: Device is not actually ever used in the test case since
+    // force_cpu_type is defaulted to "Haswell"
+    devices[kCPU0] = cpu_device;
+    devices[kCPU1] = cpu_device;
+    cluster_ = absl::make_unique<VirtualCluster>(devices);
+    scheduler_ = absl::make_unique<TestVirtualScheduler>(
+        /* use_static_shapes = */ true, cluster_.get());
+  }
+
   NodeDef node1_, node2_, node3_, node4_, node5_, node6_;
   std::unordered_map<const NodeDef*, NodeState> node_states_;
 
@@ -83,29 +106,6 @@ class VirtualSchedulerTest : public ::testing::Test {
     node_states_[node].device_name = device_name;
   }
 
-  void SetUp() override {
-    // node1_ to node6_ on kCPU0, with time_ready in reverse_order.
-    NodeSetUp("Node1", kConv2D, kCPU0, 6000, &node1_);
-    NodeSetUp("Node2", kConv2D, kCPU0, 5000, &node2_);
-    NodeSetUp("Node3", kConv2D, kCPU0, 4000, &node3_);
-    NodeSetUp("Node4", kConv2D, kCPU0, 3000, &node4_);
-    NodeSetUp("Node5", kConv2D, kCPU0, 2000, &node5_);
-    NodeSetUp("Node6", kConv2D, kCPU0, 1000, &node6_);
-
-    // Initializes cluster_ and placer_.
-    std::unordered_map<string, DeviceProperties> devices;
-
-    // Set some dummy CPU properties
-    DeviceProperties cpu_device = GetDummyCPUDevice();
-
-    // IMPORTANT: Device is not actually ever used in the test case since
-    // force_cpu_type is defaulted to "Haswell"
-    devices[kCPU0] = cpu_device;
-    devices[kCPU1] = cpu_device;
-    cluster_.reset(new VirtualCluster(devices));
-    placer_.reset(new VirtualPlacer(cluster_.get()));
-  }
-
   // Three Conv2Ds with only two in fetch nodes.
   void CreateGrapplerItemWithConv2Ds() {
     Scope s = Scope::NewRootScope().WithDevice(kCPU0);
@@ -918,11 +918,7 @@ versions {
   }
 
   // Call this after creating grappler_item_ and setting up dependency_.
-  void InitScheduler() {
-    scheduler_.reset(new TestVirtualScheduler(
-        grappler_item_.get(), true /* use_static_shapes */, cluster_.get()));
-    TF_CHECK_OK(scheduler_->Init());
-  }
+  void InitScheduler() { TF_ASSERT_OK(scheduler_->Init(grappler_item_.get())); }
 
   // Returns cost based on op.
   Costs SimplePredictCosts(const OpContext& op_context) const {
@@ -1034,25 +1030,12 @@ versions {
     }
   }
 
-  // Helper method for converting shape vector to TensorProperty.
-  OpInfo::TensorProperties ShapeToTensorProperty(
-      const std::vector<int> shape, const DataType& data_type) const {
-    OpInfo::TensorProperties tensor_property;
-    tensor_property.set_dtype(data_type);
-    for (const auto& x : shape) {
-      tensor_property.mutable_shape()->add_dim()->set_size(x);
-    }
-    return tensor_property;
-  }
-
-  // SetUp() inits cluster_ and placer_.
+  // cluster_ and scheduler_ are initialized in the c'tor.
   std::unique_ptr<VirtualCluster> cluster_;
-  std::unique_ptr<VirtualPlacer> placer_;
+  std::unique_ptr<TestVirtualScheduler> scheduler_;
 
-  // grappler_item_ and scheduler_ will be initialized differently for each test
-  // case.
+  // grappler_item_ will be initialized differently for each test case.
   std::unique_ptr<GrapplerItem> grappler_item_;
-  std::unique_ptr<TestVirtualScheduler> scheduler_;
   // Node name -> its preceding nodes map for testing scheduling order.
   std::unordered_map<string, std::vector<string>> dependency_;
 
@@ -1729,38 +1712,6 @@ TEST_F(VirtualSchedulerTest, InitAndBasicScheduling) {
   EXPECT_EQ(2, ops_executed["c1"].op_info.inputs_size());
 }
 
-TEST_F(VirtualSchedulerTest, CalculateOutputSize) {
-  // Init.
-  CreateGrapplerItemWithAddN();
-  InitScheduler();
-
-  // Create a set of tensor properties.
-  std::vector<OpInfo::TensorProperties> output;
-  output.push_back(ShapeToTensorProperty({4, 4}, DT_FLOAT));           // 0
-  output.push_back(ShapeToTensorProperty({1}, DT_FLOAT));              // 1
-  output.push_back(ShapeToTensorProperty({10, 10, 10}, DT_HALF));      // 2
-  output.push_back(ShapeToTensorProperty({100, 7, 8, 99}, DT_FLOAT));  // 3
-  output.push_back(ShapeToTensorProperty({-1, 7, 8, 99}, DT_FLOAT));   // 4
-  output.push_back(ShapeToTensorProperty({-1, 7, -1, 99}, DT_FLOAT));  // 4
-
-  // port_num -1 is for control dependency: hard coded 4B.
-  EXPECT_EQ(4, scheduler_->CalculateOutputSize(output, -1));
-
-  // Test valid outputs.
-  EXPECT_EQ(4 * 4 * 4, scheduler_->CalculateOutputSize(output, 0));
-  EXPECT_EQ(4 * 1, scheduler_->CalculateOutputSize(output, 1));
-  EXPECT_EQ(2 * 10 * 10 * 10, scheduler_->CalculateOutputSize(output, 2));
-  EXPECT_EQ(4 * 100 * 7 * 8 * 99, scheduler_->CalculateOutputSize(output, 3));
-
-  // Any unknown shape (-1) shall yield zero output size.
-  EXPECT_EQ(0, scheduler_->CalculateOutputSize(output, 4));
-  EXPECT_EQ(0, scheduler_->CalculateOutputSize(output, 5));
-
-  // Invalid port_num (though it may be an error) shall yield zero
-  // output size.
-  EXPECT_EQ(0, scheduler_->CalculateOutputSize(output, 6));
-}
-
 TEST_F(VirtualSchedulerTest, MemoryUsage) {
   // Init.
   CreateGrapplerItemWithAddN();
@@ -2041,7 +1992,7 @@ TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
     for (const auto& output_property : output_properties_) {
       output_properties.push_back(output_property);
     }
-    return scheduler_->CalculateOutputSize(output_properties, 0);
+    return CalculateOutputSize(output_properties, 0);
   };
 
   // Validate transfer size.
@@ -2119,5 +2070,6 @@ TEST_F(VirtualSchedulerTest, GraphWihtOnlyRecv) {
   // Recv without Send will be treated as initially ready node.
   EXPECT_GT(ops_executed.count("Recv"), 0);
 }
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node.h b/tensorflow/core/grappler/graph_analyzer/sig_node.h
index 45c0ed31626ec99d1c443313f9b4d6ef9a6fa43a..66d290d88e4a4b4ef8fef7574444d47f57cc237a 100644
--- a/tensorflow/core/grappler/graph_analyzer/sig_node.h
+++ b/tensorflow/core/grappler/graph_analyzer/sig_node.h
@@ -178,7 +178,7 @@ class SigNode {
   // computed.
   size_t GetTopoHash(int distance) const;
 
-  // The the hash value for the highest computed distance. It must be previously
+  // The hash value for the highest computed distance. It must be previously
   // computed.
   size_t GetHighTopoHash() const {
     CHECK(!topo_hash_.empty());
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index de0a63fc4e39dee306a85bfbe2624b12358c94cb..ba9d2eb32181940bc430771db281c6cea8cb48c4 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -63,216 +63,31 @@ int OpInputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id) {
   return OpPortIdToArgId(node, op.input_arg(), port_id);
 }
 
-GraphView::GraphView(GraphDef* graph) : graph_(graph) {
-  for (int i = 0; i < graph_->node_size(); i++) {
-    auto node = graph_->mutable_node(i);
-    AddUniqueNodeOrDie(node);
-  }
-
-  for (NodeDef& node : *graph_->mutable_node()) {
-    AddFanouts(&node);
-  }
-}
-
-void GraphView::AddUniqueNodeOrDie(NodeDef* node) {
-  auto result = nodes_.emplace(node->name(), node);
-  // Check that the graph doesn't contain multiple nodes with the same name.
-  CHECK(result.second) << "Non unique node name detected: " << node->name();
-}
-
-void GraphView::AddFanouts(NodeDef* node) {
-  for (int i = 0; i < node->input_size(); ++i) {
-    OutputPort fanin;
-    const string fanin_name = ParseNodeName(node->input(i), &fanin.port_id);
-    fanin.node = nodes_[fanin_name];
-
-    InputPort input;
-    input.node = node;
-    if (fanin.port_id < 0) {
-      input.port_id = -1;
-    } else {
-      input.port_id = i;
-      num_regular_outputs_[fanin.node] =
-          std::max(num_regular_outputs_[fanin.node], fanin.port_id);
-    }
-
-    fanouts_[fanin].insert(input);
-  }
-}
-
-NodeDef* GraphView::GetNode(const string& node_name) const {
-  auto it = nodes_.find(node_name);
-  if (it == nodes_.end()) {
-    return nullptr;
-  }
-  return it->second;
-}
-
-GraphView::InputPort GraphView::GetInputPort(const string& node_name,
-                                             int port_id) const {
-  InputPort result;
-  result.node = GetNode(node_name);
-  // TODO(bsteiner): verify that the node has at least port_id input ports
-  result.port_id = port_id;
-  return result;
-}
-
-GraphView::OutputPort GraphView::GetOutputPort(const string& node_name,
-                                               int port_id) const {
-  OutputPort result;
-  result.node = GetNode(node_name);
-  // TODO(bsteiner): verify that the node has at least port_id output ports
-  result.port_id = port_id;
-  return result;
-}
-
-const std::unordered_set<GraphView::InputPort, GraphView::HashPort>&
-GraphView::GetFanout(const GraphView::OutputPort& port) const {
-  auto it = fanouts_.find(port);
-  if (it == fanouts_.end()) {
-    return empty_set_;
-  }
-  return it->second;
-}
-
-std::unordered_set<GraphView::OutputPort, GraphView::HashPort>
-GraphView::GetFanin(const GraphView::InputPort& port) const {
-  std::unordered_set<GraphView::OutputPort, GraphView::HashPort> result;
-  if (port.port_id >= 0) {
-    result.insert(GetRegularFanin(port));
-  } else {
-    for (int i = port.node->input_size() - 1; i >= 0; --i) {
-      OutputPort fanin;
-      string fanin_name = ParseNodeName(port.node->input(i), &fanin.port_id);
-      if (fanin.port_id < 0) {
-        auto it = nodes_.find(fanin_name);
-        if (it != nodes_.end()) {
-          fanin.node = it->second;
-          result.insert(fanin);
-        }
-      } else {
-        break;
-      }
-    }
-  }
-  return result;
-}
-
-const GraphView::OutputPort GraphView::GetRegularFanin(
-    const GraphView::InputPort& port) const {
-  CHECK_LE(0, port.port_id);
-  OutputPort fanin;
-  string fanin_name =
-      ParseNodeName(port.node->input(port.port_id), &fanin.port_id);
-  auto it = nodes_.find(fanin_name);
-  if (it == nodes_.end()) {
-    fanin.node = nullptr;
-  } else {
-    fanin.node = it->second;
-  }
-  return fanin;
-}
-
-std::unordered_set<GraphView::InputPort, GraphView::HashPort>
-GraphView::GetFanouts(const NodeDef& node,
-                      bool include_controlled_nodes) const {
-  std::unordered_set<InputPort, HashPort> result;
-  OutputPort port;
-  port.node = const_cast<NodeDef*>(&node);
-  const int first_port_id = include_controlled_nodes ? -1 : 0;
-  auto it = num_regular_outputs_.find(&node);
-  const int last_port_id = (it != num_regular_outputs_.end()) ? it->second : -1;
-
-  for (int i = first_port_id; i <= last_port_id; ++i) {
-    port.port_id = i;
-    auto it = fanouts_.find(port);
-    if (it != fanouts_.end()) {
-      result.insert(it->second.begin(), it->second.end());
-    }
-  }
-  return result;
+bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
+                         int port) {
+  const auto output = GraphView::OutputPort(node, port);
+  const auto fanout = graph_view.GetFanout(output);
+  return fanout.size() <= 1;
 }
 
-std::unordered_set<GraphView::OutputPort, GraphView::HashPort>
-GraphView::GetFanins(const NodeDef& node,
-                     bool include_controlling_nodes) const {
-  std::unordered_set<OutputPort, HashPort> result;
-  for (int i = 0; i < node.input_size(); ++i) {
-    OutputPort fanin;
-    string fanin_name = ParseNodeName(node.input(i), &fanin.port_id);
-    if (fanin.port_id < 0) {
-      if (!include_controlling_nodes) {
-        break;
-      }
-    }
-    auto it = nodes_.find(fanin_name);
-    if (it != nodes_.end()) {
-      fanin.node = it->second;
-      result.insert(fanin);
-    }
-  }
-  return result;
+bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port) {
+  const auto output = GraphView::OutputPort(node, port);
+  const auto fanout = graph_view.GetFanout(output);
+  return !fanout.empty();
 }
 
-int GraphView::NumFanins(const NodeDef& node,
-                         bool include_controlling_nodes) const {
-  int count = 0;
-  for (const string& input : node.input()) {
-    if (!include_controlling_nodes && IsControlInput(input)) {
-      break;
-    }
-    count += 1;
-  }
-  return count;
+bool NoControlFanin(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::InputPort(node, -1);
+  return graph_view.GetFanin(control_port).empty();
 }
 
-std::unordered_set<GraphView::Edge, GraphView::HashEdge>
-GraphView::GetFanoutEdges(const NodeDef& node,
-                          bool include_controlled_edges) const {
-  std::unordered_set<Edge, HashEdge> result;
-  OutputPort port;
-  port.node = const_cast<NodeDef*>(&node);
-  const int first_port_id = include_controlled_edges ? -1 : 0;
-  auto it = num_regular_outputs_.find(&node);
-  const int last_port_id = (it != num_regular_outputs_.end()) ? it->second : -1;
-
-  for (int i = first_port_id; i <= last_port_id; ++i) {
-    port.port_id = i;
-    auto it = fanouts_.find(port);
-    if (it != fanouts_.end()) {
-      Edge fanout;
-      fanout.src.node = const_cast<NodeDef*>(&node);
-      fanout.src.port_id = i;
-      for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) {
-        fanout.tgt = *itr;
-        result.insert(fanout);
-      }
-    }
-  }
-  return result;
+bool NoControlFanout(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::OutputPort(node, -1);
+  return graph_view.GetFanout(control_port).empty();
 }
 
-std::unordered_set<GraphView::Edge, GraphView::HashEdge>
-GraphView::GetFaninEdges(const NodeDef& node,
-                         bool include_controlling_edges) const {
-  std::unordered_set<Edge, HashEdge> result;
-  for (int i = 0; i < node.input_size(); ++i) {
-    Edge fanin;
-    fanin.tgt.node = const_cast<NodeDef*>(&node);
-    fanin.tgt.port_id = i;
-    string fanin_name = ParseNodeName(node.input(i), &fanin.src.port_id);
-    if (fanin.src.port_id < 0) {
-      if (!include_controlling_edges) {
-        break;
-      }
-    }
-    auto it = nodes_.find(fanin_name);
-    if (it != nodes_.end()) {
-      fanin.src.node = it->second;
-      result.insert(fanin);
-    }
-  }
-  return result;
+bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node) {
+  return NoControlFanin(graph_view, node) && NoControlFanout(graph_view, node);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 09c36a136834cf838e1ccdc440a71fbdc2ba9558..0a47b2256583f35e6ef413b50fdc8eea2bdc978d 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -18,9 +18,16 @@ limitations under the License.
 
 #include <unordered_map>
 #include <unordered_set>
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/hash/hash.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -36,116 +43,316 @@ namespace grappler {
 int OpOutputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id);
 int OpInputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id);
 
-// A utility class to simplify the traversal of a GraphDef.
-class GraphView {
+namespace internal {
+
+// GraphViewInternal is a helper class to simplify graph traversal. It creates
+// an immutable view of the nodes and edges represented by a GraphDef protocol
+// buffer.
+//
+// There are two public classes implementing GraphViewInternal:
+//
+// - GraphView: constructed from the `const GraphDef` and doesn't allow
+//   to mutate underlying graph via input/output ports lookup functions (ports
+//   have const pointers to nodes).
+//
+// - MutableGraphView: constructed from the 'GraphDef` and allows to mutate
+//   the graph via input/output ports lookup functions (ports have non-const
+//   pointers to nodes), and also have couple additional functions to
+//   add/remove/replace nodes in the graph.
+//
+// --------------------------- !!! WARNING !!! ---------------------------------
+//     Removing nodes from the graph outside of MutableGraphView will
+//     lead to segfaults! Guaranteed by absl::string_view!
+// -----------------------------------------------------------------------------
+//
+template <typename GraphDefT, typename NodeDefT>
+class GraphViewInternal {
  public:
   struct Port {
-    Port() = default;
-    Port(NodeDef* n, int port) : node(n), port_id(port) {}
-
-    // TODO(prazek): ports should keep the constness of GraphView.  The only way
-    // to modify graph through the view should be using MutableGraphView.
-    NodeDef* node = nullptr;
-    int port_id = -1;
+    Port() : node(nullptr), port_id(0) {}
+    Port(NodeDefT* n, int port) : node(n), port_id(port) {}
 
     bool operator==(const Port& other) const {
       return node == other.node && port_id == other.port_id;
     }
+
+    template <typename H>
+    friend H AbslHashValue(H h, const Port& p) {
+      return H::combine(std::move(h), p.node, p.port_id);
+    }
+
+    NodeDefT* node;
+    int port_id;
   };
+
   struct InputPort : public Port {
-    InputPort() = default;
-    InputPort(NodeDef* n, int port_id) : Port(n, port_id) {}
-    InputPort(const NodeDef* n, int port_id)
-        : Port(const_cast<NodeDef*>(n), port_id) {}
-  };
-  struct OutputPort : public Port {
-    OutputPort() = default;
-    OutputPort(NodeDef* n, int port_id) : Port(n, port_id) {}
+    using Port::Port;
   };
 
-  struct HashPort {
-    std::size_t operator()(const Port& port) const {
-      return reinterpret_cast<std::size_t>(port.node) + port.port_id;
-    }
+  struct OutputPort : public Port {
+    using Port::Port;
   };
 
   struct Edge {
-    OutputPort src;
-    InputPort tgt;
+    Edge(OutputPort s, InputPort d) : src(s), dst(d) {}
 
     bool operator==(const Edge& other) const {
-      return src == other.src && tgt == other.tgt;
+      return src == other.src && dst == other.dst;
     }
-  };
-  struct HashEdge {
-    std::size_t operator()(const Edge& edge) const {
-      return HashPort()(edge.src) + HashPort()(edge.tgt);
+
+    template <typename H>
+    friend H AbslHashValue(H h, const Edge& e) {
+      return H::combine(std::move(h), e.src, e.dst);
     }
+
+    OutputPort src;
+    InputPort dst;
   };
 
-  explicit GraphView(GraphDef* graph);
-  GraphDef* GetGraph() const { return graph_; }
-  NodeDef* GetNode(const string& node_name) const;
+  GraphDefT* graph() const { return graph_; }
+
+  // Find a node by name or return `nullptr` if it's not in a graph view.
+  NodeDefT* GetNode(absl::string_view node_name) const {
+    return gtl::FindWithDefault(nodes_, node_name, nullptr);
+  }
+
   // Get the specified input port. Note that the special '-1' port_id can be
   // used to access the controlling nodes (i.e. the nodes connected to node_name
   // through an incoming control dependency).
-  InputPort GetInputPort(const string& node_name, int port_id) const;
+  InputPort GetInputPort(absl::string_view node_name, int port_id) const {
+    return InputPort(GetNode(node_name), port_id);
+  }
+
   // Get the specified output port. Note that the special '-1' port_id can be
   // used to access the controlled nodes (i.e. the nodes connected to node_name
   // through an outgoing control dependency).
-  OutputPort GetOutputPort(const string& node_name, int port_id) const;
+  OutputPort GetOutputPort(absl::string_view node_name, int port_id) const {
+    return OutputPort(GetNode(node_name), port_id);
+  }
 
   // Get the input (resp. output) port(s) in the immediate fanout (resp. fanin)
   // of an output (resp. input) port.
-  const std::unordered_set<InputPort, HashPort>& GetFanout(
-      const OutputPort& port) const;
-  std::unordered_set<OutputPort, HashPort> GetFanin(
-      const InputPort& port) const;
+  const absl::flat_hash_set<InputPort>& GetFanout(
+      const OutputPort& port) const {
+    return gtl::FindWithDefault(fanouts_, port, fanout_not_found_value_);
+  }
+
+  absl::flat_hash_set<OutputPort> GetFanin(const InputPort& port) const {
+    if (port.port_id >= 0) return {GetRegularFanin(port)};
+
+    // Collect fanin for the control input.
+    absl::flat_hash_set<OutputPort> result;
+    for (int i = port.node->input_size() - 1; i >= 0; --i) {
+      TensorId tensor_id = ParseTensorName(port.node->input(i));
+      if (tensor_id.index() >= 0) break;  // we reached regular inputs
+
+      auto it = nodes_.find(tensor_id.node());
+      if (it != nodes_.end()) result.emplace(it->second, tensor_id.index());
+    }
+    return result;
+  }
 
   // Special case: regular (i.e. non-control) input ports can only have one
   // fanin.
-  const OutputPort GetRegularFanin(const InputPort& port) const;
+  const OutputPort GetRegularFanin(const InputPort& port) const {
+    DCHECK_GE(port.port_id, 0);
+    if (port.port_id < 0) return OutputPort();
+
+    TensorId tensor_id = ParseTensorName(port.node->input(port.port_id));
+    return GetOutputPort(tensor_id.node(), tensor_id.index());
+  }
+
+  // Get all the input (resp. output) ports in the immediate fanout (resp
+  // fanin) of a node. Include the controlling nodes iff
+  // include_controlling_nodes is true.
+  absl::flat_hash_set<InputPort> GetFanouts(
+      const NodeDef& node, bool include_controlled_nodes) const {
+    absl::flat_hash_set<InputPort> result;
 
-  // Get all the input (resp. output) ports in the immediate fanout (resp fanin)
-  // of a node. Include the controlling nodes iff include_controlling_nodes is
-  // true.
-  std::unordered_set<InputPort, HashPort> GetFanouts(
-      const NodeDef& node, bool include_controlled_nodes) const;
-  std::unordered_set<OutputPort, HashPort> GetFanins(
-      const NodeDef& node, bool include_controlling_nodes) const;
+    OutputPort port;
+    port.node = const_cast<NodeDefT*>(&node);
+    const int first_port_id = include_controlled_nodes ? -1 : 0;
+    const int last_port_id =
+        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
+
+    for (int i = first_port_id; i <= last_port_id; ++i) {
+      port.port_id = i;
+      auto it = fanouts_.find(port);
+      if (it != fanouts_.end()) {
+        result.insert(it->second.begin(), it->second.end());
+      }
+    }
+    return result;
+  }
+
+  absl::flat_hash_set<OutputPort> GetFanins(
+      const NodeDef& node, bool include_controlling_nodes) const {
+    absl::flat_hash_set<OutputPort> result;
+    for (int i = 0; i < node.input_size(); ++i) {
+      TensorId tensor_id = ParseTensorName(node.input(i));
+      if (tensor_id.index() < 0 && !include_controlling_nodes) break;
+
+      auto it = nodes_.find(tensor_id.node());
+      if (it != nodes_.end()) result.emplace(it->second, tensor_id.index());
+    }
+    return result;
+  }
 
   // Get the number of ports in the immediate fanin of a node. Count the
   // controlling nodes iff include_controlling_nodes is true.
-  int NumFanins(const NodeDef& node, bool include_controlling_nodes) const;
+  int NumFanins(const NodeDef& node, bool include_controlling_nodes) const {
+    int count = 0;
+    for (const string& input : node.input()) {
+      if (!include_controlling_nodes && IsControlInput(input)) {
+        break;
+      }
+      count += 1;
+    }
+    return count;
+  }
+
+  // Get the number of ports in the immediate fanout of a node. Count the
+  // controlling nodes iff include_controlling_nodes is true.
+  int NumFanouts(const NodeDef& node, bool include_controlling_nodes) const {
+    int count = 0;
+
+    OutputPort port;
+    port.node = const_cast<NodeDefT*>(&node);
+    const int first_port_id = include_controlling_nodes ? -1 : 0;
+    const int last_port_id =
+        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
+
+    for (int i = first_port_id; i <= last_port_id; ++i) {
+      port.port_id = i;
+      auto it = fanouts_.find(port);
+      if (it != fanouts_.end()) count += it->second.size();
+    }
+
+    return count;
+  }
 
-  // Get all the edge in the immediate fanout (resp fanin) of a node. Include
-  // the control edges iff include_controlling_edges is true.
-  std::unordered_set<Edge, HashEdge> GetFanoutEdges(
-      const NodeDef& node, bool include_controlled_edges) const;
-  std::unordered_set<Edge, HashEdge> GetFaninEdges(
-      const NodeDef& node, bool include_controlling_edges) const;
+  // Get all the edges in the immediate fanout (resp fanin) of a node.
+  // Include the control edges iff include_controlling_edges is true.
+  absl::flat_hash_set<Edge> GetFanoutEdges(
+      const NodeDef& node, bool include_controlled_edges) const {
+    absl::flat_hash_set<Edge> result;
+
+    OutputPort port;
+    port.node = const_cast<NodeDefT*>(&node);
+    const int first_port_id = include_controlled_edges ? -1 : 0;
+    const int last_port_id =
+        gtl::FindWithDefault(max_regular_output_port_, &node, -1);
+
+    for (int i = first_port_id; i <= last_port_id; ++i) {
+      port.port_id = i;
+      auto it = fanouts_.find(port);
+      if (it != fanouts_.end()) {
+        for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) {
+          result.emplace(/*src*/ OutputPort(const_cast<NodeDefT*>(&node), i),
+                         /*dst*/ *itr);
+        }
+      }
+    }
+    return result;
+  }
+
+  absl::flat_hash_set<Edge> GetFaninEdges(
+      const NodeDef& node, bool include_controlling_edges) const {
+    absl::flat_hash_set<Edge> result;
+    for (int i = 0; i < node.input_size(); ++i) {
+      TensorId tensor_id = ParseTensorName(node.input(i));
+      if (tensor_id.index() < 0 && !include_controlling_edges) break;
+
+      auto it = nodes_.find(tensor_id.node());
+      if (it != nodes_.end()) {
+        result.emplace(/*src*/ OutputPort(it->second, tensor_id.index()),
+                       /*dst*/ InputPort(const_cast<NodeDefT*>(&node), i));
+      }
+    }
+    return result;
+  }
 
  protected:
-  // Add a new `node` to the graph.
-  void AddUniqueNodeOrDie(NodeDef* node);
-  // Add fanout to every `node` input.
-  void AddFanouts(NodeDef* node);
-  std::unordered_map<string, NodeDef*>* MutableNodes() { return &nodes_; }
-  GraphDef* MutableGraph() { return graph_; }
-
-  using FanoutsMapType =
-      std::unordered_map<OutputPort, std::unordered_set<InputPort, HashPort>,
-                         HashPort>;
-  FanoutsMapType* MutableFanouts() { return &fanouts_; }
+  explicit GraphViewInternal(GraphDefT* graph) : graph_(graph) {}
+
+  void AddUniqueNodeOrDie(NodeDefT* node) {
+    auto result = nodes_.emplace(node->name(), node);
+    // TODO(ezhulenev): Replace CHECK with factory method returning
+    // absl::StatusOr (when available).
+    CHECK(result.second) << "Non unique node name detected: " << node->name();
+  }
+
+  void AddFanouts(NodeDefT* node) {
+    for (int i = 0; i < node->input_size(); ++i) {
+      TensorId tensor_id = ParseTensorName(node->input(i));
+      OutputPort output(nodes_[tensor_id.node()], tensor_id.index());
+
+      if (output.port_id < 0) {
+        fanouts_[output].emplace(node, -1);
+      } else {
+        max_regular_output_port_[output.node] =
+            std::max(max_regular_output_port_[output.node], output.port_id);
+        fanouts_[output].emplace(node, i);
+      }
+    }
+  }
+
+  // Access to the mutable internal state for MutableGraphView.
+  absl::flat_hash_map<absl::string_view, NodeDefT*>& nodes() { return nodes_; }
+
+  absl::flat_hash_map<OutputPort, absl::flat_hash_set<InputPort>>& fanouts() {
+    return fanouts_;
+  }
+
+  absl::flat_hash_map<const NodeDef*, int>& max_regular_output_port() {
+    return max_regular_output_port_;
+  }
 
  private:
-  GraphDef* graph_;
-  std::unordered_map<string, NodeDef*> nodes_;
-  std::unordered_set<InputPort, HashPort> empty_set_;
-  FanoutsMapType fanouts_;
-  std::unordered_map<const NodeDef*, int> num_regular_outputs_;
+  GraphDefT* graph_;  // must outlive the graph view
+
+  // A mapping from the node name to the node itself.
+  absl::flat_hash_map<absl::string_view, NodeDefT*> nodes_;
+
+  // A mapping from the output port to all inputs that read from it.
+  absl::flat_hash_map<OutputPort, absl::flat_hash_set<InputPort>> fanouts_;
+
+  // Keep a maximum index of tensor fetched from the node. It doesn't guarantee
+  // that all tensors in the [0, max_regular_output_port] range are actually
+  // fetched by other nodes.
+  absl::flat_hash_map<const NodeDef*, int> max_regular_output_port_;
+
+  // If the node has no fanouts at given output port (output tensor consumers)
+  // we return a reference to this set from `GetFanout` (we can't construct new
+  // empty set every time, because we need a non-dangling reference).
+  absl::flat_hash_set<InputPort> fanout_not_found_value_;
+};
+
+}  // namespace internal
+
+// Immutable GraphView that keeps the constness of the GraphDef. If you need to
+// mutate the graph or the nodes via the graph view lookup functions, see
+// MutableGraphView.
+class GraphView
+    : public internal::GraphViewInternal<const GraphDef, const NodeDef> {
+ public:
+  explicit GraphView(const GraphDef* graph) : GraphViewInternal(graph) {
+    for (const NodeDef& node : graph->node()) AddUniqueNodeOrDie(&node);
+    for (const NodeDef& node : graph->node()) AddFanouts(&node);
+  }
 };
 
+// Returns true if node has one (or zero) fanout nodes at given output port.
+bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
+                         int port = 0);
+
+// Returns true if node has at least one fanout node at given output port.
+bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port = 0);
+
+bool NoControlFanin(const GraphView& graph_view, const NodeDef* node);
+bool NoControlFanout(const GraphView& graph_view, const NodeDef* node);
+bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node);
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/graph_view_test.cc b/tensorflow/core/grappler/graph_view_test.cc
index f90e2c8cfcd765850bf741faaaa59525b711d2a5..cbf859a4a99d7c434a4a65185c8962ea539c1aed 100644
--- a/tensorflow/core/grappler/graph_view_test.cc
+++ b/tensorflow/core/grappler/graph_view_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/graph_view.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/cc/ops/parsing_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -158,19 +160,22 @@ TEST_F(GraphViewTest, BasicGraph) {
 
   const NodeDef* add_node = graph.GetNode("AddN");
   EXPECT_NE(nullptr, add_node);
-  string fanouts;
+
+  absl::flat_hash_set<string> fanouts;
+  absl::flat_hash_set<string> expected_fanouts = {"AddN_2:0", "AddN_3:0"};
   for (const auto& fo : graph.GetFanouts(*add_node, false)) {
-    strings::StrAppend(&fanouts,
-                       strings::StrCat(fo.node->name(), ":", fo.port_id, " "));
+    fanouts.insert(absl::StrCat(fo.node->name(), ":", fo.port_id));
   }
-  EXPECT_EQ("AddN_2:0 AddN_3:0 ", fanouts);
+  EXPECT_EQ(graph.NumFanouts(*add_node, false), 2);
+  EXPECT_EQ(fanouts, expected_fanouts);
 
-  string fanins;
+  absl::flat_hash_set<string> fanins;
+  absl::flat_hash_set<string> expected_fanins = {"Square_1:0", "Square:0"};
   for (const auto& fi : graph.GetFanins(*add_node, false)) {
-    strings::StrAppend(&fanins,
-                       strings::StrCat(fi.node->name(), ":", fi.port_id, " "));
+    fanins.insert(absl::StrCat(fi.node->name(), ":", fi.port_id));
   }
-  EXPECT_EQ("Square_1:0 Square:0 ", fanins);
+  EXPECT_EQ(graph.NumFanins(*add_node, false), 2);
+  EXPECT_EQ(fanins, expected_fanins);
 }
 
 TEST_F(GraphViewTest, ControlDependencies) {
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 2c490f3966cb45f61a22ba0a858a928f46e9db1b..74bde67f198f8c6d31273861cf9b35537909447c 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -19,27 +19,33 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
-GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef* graph_def) {
-  id = other.id;
-  feed = other.feed;
-  fetch = other.fetch;
-  init_ops = other.init_ops;
-  keep_ops = other.keep_ops;
-  expected_init_time = other.expected_init_time;
-  save_op = other.save_op;
-  restore_op = other.restore_op;
-  save_restore_loc_tensor = other.save_restore_loc_tensor;
-  queue_runners = other.queue_runners;
-  allowed_optimizations = other.allowed_optimizations;
-  graph.Swap(graph_def);
+GrapplerItem GrapplerItem::WithGraph(GraphDef&& graph_def) const {
+  GrapplerItem item;
+  item.id = id;
+  item.feed = feed;
+  item.fetch = fetch;
+  item.init_ops = init_ops;
+  item.keep_ops = keep_ops;
+  item.expected_init_time = expected_init_time;
+  item.save_op = save_op;
+  item.restore_op = restore_op;
+  item.save_restore_loc_tensor = save_restore_loc_tensor;
+  item.queue_runners = queue_runners;
+  item.devices_ = devices_;
+  item.allowed_optimizations_ = allowed_optimizations_;
+  item.graph.Swap(&graph_def);
+  return item;
 }
 
 std::vector<const NodeDef*> GrapplerItem::MainOpsFanin() const {
@@ -111,6 +117,64 @@ std::unordered_set<string> GrapplerItem::NodesToPreserve() const {
   return result;
 }
 
+const std::unordered_set<string>& GrapplerItem::devices() const {
+  return devices_;
+}
+
+Status GrapplerItem::AddDevice(const string& device) {
+  DeviceNameUtils::ParsedName name;
+
+  if (!DeviceNameUtils::ParseFullName(device, &name)) {
+    return errors::InvalidArgument("Invalid device name: device=", device);
+
+  } else if (!name.has_job || !name.has_replica || !name.has_task ||
+             !name.has_type || !name.has_id) {
+    return errors::InvalidArgument("Not a fully defined device name: device=",
+                                   device);
+  }
+
+  devices_.insert(DeviceNameUtils::ParsedNameToString(name));
+  return Status::OK();
+}
+
+Status GrapplerItem::AddDevices(const GrapplerItem& other) {
+  std::vector<absl::string_view> invalid_devices;
+  for (const string& device : other.devices()) {
+    Status added = AddDevice(device);
+    if (!added.ok()) invalid_devices.emplace_back(device);
+  }
+  return invalid_devices.empty()
+             ? Status::OK()
+             : errors::InvalidArgument("Skipped invalid devices: [",
+                                       absl::StrJoin(invalid_devices, ", "),
+                                       "]");
+}
+
+Status GrapplerItem::InferDevicesFromGraph() {
+  absl::flat_hash_set<absl::string_view> invalid_devices;
+  for (const NodeDef& node : graph.node()) {
+    Status added = AddDevice(node.device());
+    if (!added.ok()) invalid_devices.insert(node.device());
+  }
+  VLOG(2) << "Inferred device set: [" << absl::StrJoin(devices_, ", ") << "]";
+  return invalid_devices.empty()
+             ? Status::OK()
+             : errors::InvalidArgument("Skipped invalid devices: [",
+                                       absl::StrJoin(invalid_devices, ", "),
+                                       "]");
+}
+
+void GrapplerItem::ClearDevices() { devices_.clear(); }
+
+const GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations()
+    const {
+  return allowed_optimizations_;
+}
+
+GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations() {
+  return allowed_optimizations_;
+}
+
 std::vector<const NodeDef*> ComputeTransitiveFanin(
     const GraphDef& graph, const std::vector<string>& terminal_nodes) {
   bool ill_formed = false;
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index a0748abfe691334c6dc838c05e0d3f1cee2e2ecb..9051542988c4261aacb5fc25c8e6e2f1d35adfa0 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -35,12 +35,15 @@ namespace grappler {
 // nodes, and potentially a set of nodes to feed.
 struct GrapplerItem {
   GrapplerItem() = default;
-  GrapplerItem(const GrapplerItem& other, GraphDef&& graph_def)
-      : GrapplerItem(other, &graph_def) {}
-  // Swaps *graph_def with an empty GraphDef.
-  GrapplerItem(const GrapplerItem& other, GraphDef* graph_def);
+  GrapplerItem(const GrapplerItem& other) = default;
+  GrapplerItem(GrapplerItem&& other) = default;
+  GrapplerItem& operator=(const GrapplerItem& other) = default;
+  GrapplerItem& operator=(GrapplerItem&& other) = default;
   virtual ~GrapplerItem() = default;
 
+  // Create a copy of this GrapplerItem with graph swapped with the argument.
+  GrapplerItem WithGraph(GraphDef&& graph) const;
+
   string id;  // A unique id for this item
 
   // Inputs
@@ -83,9 +86,42 @@ struct GrapplerItem {
     // Is it allowed to add nodes to the graph that do not have registered
     // gradient function.
     bool non_differentiable_rewrites = true;
+    // By default we are not allowed to inline ops with side effects into the
+    // main graph, because we can't guarantee that after pruning these ops will
+    // be executed. However if we are optimizing a function library (see
+    // meta_optimizer.cc) and a graph was instantiated by a function definition,
+    // we can do that, because functions guarantee that all side effects will be
+    // executed (see function_optimizer.cc for details).
+    bool inline_ops_with_side_effects = false;
   };
 
-  AllowedOptimizations allowed_optimizations;
+  const std::unordered_set<string>& devices() const;
+  // Adds a device to a set of available devices, only if it's a valid fully
+  // defined device name. Returns `Status::OK()` if successfully added a device,
+  // and an error otherwise.
+  Status AddDevice(const string& device);
+  // Adds all valid devices from the other Grappler item to the device set.
+  Status AddDevices(const GrapplerItem& other);
+  // Adds all valid devices from the nodes of the graph to the device set.
+  // Returns `Status::OK()` if all device annotations found in a graph are valid
+  // fully defined device names, and an error otherwise.
+  Status InferDevicesFromGraph();
+  // Clears a set of available devices.
+  void ClearDevices();
+
+  const AllowedOptimizations& allowed_optimizations() const;
+  AllowedOptimizations& allowed_optimizations();
+
+ private:
+  // TODO(ezhulenev) Make GrapplerItem a class and hide all public data members.
+  // TODO(ezhulenev): Migrate all unordered collections to absl.
+
+  // A set of fully defined device names that can be used to place the nodes of
+  // the `graph`.
+  // Example of a fully defined name: "/job:work/replica:1/task:1/device:CPU:0"
+  std::unordered_set<string> devices_;
+
+  AllowedOptimizations allowed_optimizations_;
 };
 
 // Return the transitive fanin of a set of terminal nodes.
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 369046666db8fb537d515952b56311afbba440f2..9224ee7849211f849c3655d6faea18dcc32b8e17 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -65,7 +65,11 @@ void InitializeTensor(DataType type, Tensor* tensor) {
     for (int i = 0; i < flat.size(); i++) {
       flat(i) = i % period;
     }
-  } else {
+  } else if (type != DT_STRING && type != DT_RESOURCE && type != DT_VARIANT) {
+    // DT_STRING, DT_RESOURCE and DT_VARIANT are not simple types according to
+    // is_simple_type<> in tensorflow/core/framework/type_traits.h, and
+    // Allocator will run non-trivial constructor/destructor for a Tensor with
+    // one of these types, so we should not memset its buffer.
     memset(const_cast<char*>(tensor->tensor_data().data()), 0,
            tensor->tensor_data().size());
   }
@@ -98,10 +102,11 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   }
 
   // Instantiate all variables for function library runtime creation.
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
       options, "/job:localhost/replica:0/task:0", &devices));
-  std::unique_ptr<DeviceMgr> dvc_mgr(new DeviceMgr(devices));
+  Device* cpu_device = devices[0].get();
+  std::unique_ptr<DeviceMgr> dvc_mgr(new DeviceMgr(std::move(devices)));
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              graph_def.library());
   Env* env = Env::Default();
@@ -120,7 +125,7 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
       new ProcessFunctionLibraryRuntime(dvc_mgr.get(), env,
                                         graph_def.versions().producer(),
                                         &function_library, *optimizer_opts));
-  FunctionLibraryRuntime* flr = pflr->GetFLR(devices[0]->name());
+  FunctionLibraryRuntime* flr = pflr->GetFLR(cpu_device->name());
 
   // Create the GraphOptimizer to optimize the graph def.
   GraphConstructorOptions graph_ctor_opts;
@@ -133,7 +138,7 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
 
   // Optimize the graph.
   ::tensorflow::GraphOptimizer optimizer(*optimizer_opts);
-  optimizer.Optimize(flr, env, devices[0], &graphptr, /*shape_map=*/nullptr);
+  optimizer.Optimize(flr, env, cpu_device, &graphptr, /*shape_map=*/nullptr);
   graphptr->ToGraphDef(output_graph_def);
 
   // The default values of attributes might have been stripped by the optimizer.
@@ -515,7 +520,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
         }
         if (!iter->second.has_tensor() ||
             iter->second.tensor().string_val_size() != 1) {
-          LOG(INFO) << "Unexected AttrValue proto: "
+          LOG(INFO) << "Unexpected AttrValue proto: "
                     << iter->second.DebugString();
           return nullptr;
         }
@@ -630,5 +635,14 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
   return new_item;
 }
 
+std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDefFile(
+    const string& id, const string& meta_graph_file, const ItemConfig& cfg) {
+  MetaGraphDef meta_graph;
+  if (!ReadMetaGraphDefFromFile(meta_graph_file, &meta_graph).ok()) {
+    return nullptr;
+  }
+  return GrapplerItemFromMetaGraphDef(id, meta_graph, cfg);
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/grappler_item_builder.h b/tensorflow/core/grappler/grappler_item_builder.h
index 1698587f8c06ac0e3d5d676d095621a9221df9b9..7102cf94c647a0bfa68d25b56a9913e2be51c784 100644
--- a/tensorflow/core/grappler/grappler_item_builder.h
+++ b/tensorflow/core/grappler/grappler_item_builder.h
@@ -58,6 +58,12 @@ struct ItemConfig {
 std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
     const string& id, const MetaGraphDef& meta_graph, const ItemConfig& cfg);
 
+// Factory method for creating a GrapplerItem from a file
+// containing a MetaGraphDef in either binary or text format.
+// Returns nullptr if the given meta_graph cannot be converted.
+std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDefFile(
+    const string& id, const string& meta_graph_file, const ItemConfig& cfg);
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/grappler_item_test.cc b/tensorflow/core/grappler/grappler_item_test.cc
index 72a9f481cab6cc5dfdc5994459e149739e427ce6..a8fbe356829409ac3b472267cd22d4b5b54cd1f5 100644
--- a/tensorflow/core/grappler/grappler_item_test.cc
+++ b/tensorflow/core/grappler/grappler_item_test.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -44,6 +46,32 @@ TEST_F(GrapplerItemTest, Basic) {
   EXPECT_EQ(main_ops, graph_nodes);
 }
 
+TEST_F(GrapplerItemTest, InferDevices) {
+  using test::function::NDef;
+
+  const string cpu0 = "/job:work/replica:1/task:1/device:CPU:0";
+  const string cpu1 = "/job:work/replica:1/task:1/device:CPU:1";
+  const string cpu2 = "/device:CPU:2";
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {
+          NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
+          NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
+          NDef("c", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu2),
+      },
+      {} /* Empty function library */);
+
+  ASSERT_FALSE(item.InferDevicesFromGraph().ok());
+
+  EXPECT_EQ(item.devices().size(), 2);
+  EXPECT_NE(item.devices().find(cpu0), item.devices().end());
+  EXPECT_NE(item.devices().find(cpu1), item.devices().end());
+
+  item.ClearDevices();
+  EXPECT_EQ(item.devices().size(), 0);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/inputs/utils.cc b/tensorflow/core/grappler/inputs/utils.cc
index def9198a69903a9920ce04d3c4d2cc0e1c23a6f2..03f59701cefd61e927857a2f88983fe8402e850c 100644
--- a/tensorflow/core/grappler/inputs/utils.cc
+++ b/tensorflow/core/grappler/inputs/utils.cc
@@ -35,11 +35,19 @@ bool FileExists(const string& file, Status* status) {
   return status->ok();
 }
 
-Status ReadGraphDefFromFile(const string& graph_def_pbtxt_path,
-                            GraphDef* result) {
+Status ReadGraphDefFromFile(const string& graph_def_path, GraphDef* result) {
   Status status;
-  if (FileExists(graph_def_pbtxt_path, &status)) {
-    return ReadTextProto(Env::Default(), graph_def_pbtxt_path, result);
+  if (!ReadBinaryProto(Env::Default(), graph_def_path, result).ok()) {
+    return ReadTextProto(Env::Default(), graph_def_path, result);
+  }
+  return status;
+}
+
+Status ReadMetaGraphDefFromFile(const string& graph_def_path,
+                                MetaGraphDef* result) {
+  Status status;
+  if (!ReadBinaryProto(Env::Default(), graph_def_path, result).ok()) {
+    return ReadTextProto(Env::Default(), graph_def_path, result);
   }
   return status;
 }
diff --git a/tensorflow/core/grappler/inputs/utils.h b/tensorflow/core/grappler/inputs/utils.h
index 4b9cb0a9adb1b0dedd2c791aed21ebb6fb76d24a..2588e380fed231067ea75cdafb2217543f0e3812 100644
--- a/tensorflow/core/grappler/inputs/utils.h
+++ b/tensorflow/core/grappler/inputs/utils.h
@@ -20,7 +20,9 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -31,8 +33,12 @@ bool FilesExist(const std::set<string>& files);
 
 bool FileExists(const string& file, Status* status);
 
-Status ReadGraphDefFromFile(const string& graph_def_pbtxt_path,
-                            GraphDef* result);
+// Reads GraphDef from file in either text or raw serialized format.
+Status ReadGraphDefFromFile(const string& graph_def_path, GraphDef* result);
+
+// Reads MetaGraphDef from file in either text or raw serialized format.
+Status ReadMetaGraphDefFromFile(const string& meta_graph_def_path,
+                                MetaGraphDef* result);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/inputs/utils_test.cc b/tensorflow/core/grappler/inputs/utils_test.cc
index 694a85528091c28ddd290367c0cb4c1c1c4262fa..c8af2aa7383774e2fb6ee9b999bcf30b21aafb8a 100644
--- a/tensorflow/core/grappler/inputs/utils_test.cc
+++ b/tensorflow/core/grappler/inputs/utils_test.cc
@@ -31,6 +31,25 @@ class UtilsTest : public ::testing::Test {
     non_existent_file_ = io::JoinPath(BaseDir(), "non_existent_file.txt");
     actual_file_ = io::JoinPath(BaseDir(), "test_file.txt");
     TF_CHECK_OK(WriteStringToFile(env_, actual_file_, "Some test data"));
+
+    text_graph_def_file_ = io::JoinPath(BaseDir(), "text_graph_def_file.txt");
+    binary_graph_def_file_ =
+        io::JoinPath(BaseDir(), "binary_graph_def_file.txt");
+    text_meta_graph_def_file_ =
+        io::JoinPath(BaseDir(), "text_meta_graph_def_file.txt");
+    binary_meta_graph_def_file_ =
+        io::JoinPath(BaseDir(), "binary_meta_graph_def_file.txt");
+
+    auto node = graph_def_.add_node();
+    node->set_name("foo");
+    node->set_op("bar");
+    TF_CHECK_OK(WriteTextProto(env_, text_graph_def_file_, graph_def_));
+    TF_CHECK_OK(WriteBinaryProto(env_, binary_graph_def_file_, graph_def_));
+    *meta_graph_def_.mutable_graph_def() = graph_def_;
+    TF_CHECK_OK(
+        WriteTextProto(env_, text_meta_graph_def_file_, meta_graph_def_));
+    TF_CHECK_OK(
+        WriteBinaryProto(env_, binary_meta_graph_def_file_, meta_graph_def_));
   }
 
   void TearDown() override {
@@ -39,8 +58,14 @@ class UtilsTest : public ::testing::Test {
         env_->DeleteRecursively(BaseDir(), &undeleted_files, &undeleted_dirs));
   }
 
+  GraphDef graph_def_;
+  MetaGraphDef meta_graph_def_;
   string non_existent_file_;
   string actual_file_;
+  string text_graph_def_file_;
+  string binary_graph_def_file_;
+  string text_meta_graph_def_file_;
+  string binary_meta_graph_def_file_;
   Env* env_ = Env::Default();
 };
 
@@ -58,6 +83,30 @@ TEST_F(UtilsTest, FilesExist) {
   EXPECT_TRUE(status[1].ok());
 }
 
+TEST_F(UtilsTest, ReadGraphDefFromFile_Text) {
+  GraphDef result;
+  TF_CHECK_OK(ReadGraphDefFromFile(text_graph_def_file_, &result));
+  EXPECT_EQ(result.DebugString(), graph_def_.DebugString());
+}
+
+TEST_F(UtilsTest, ReadGraphDefFromFile_Binary) {
+  GraphDef result;
+  TF_CHECK_OK(ReadGraphDefFromFile(binary_graph_def_file_, &result));
+  EXPECT_EQ(result.DebugString(), graph_def_.DebugString());
+}
+
+TEST_F(UtilsTest, ReadMetaGraphDefFromFile_Text) {
+  MetaGraphDef result;
+  TF_CHECK_OK(ReadMetaGraphDefFromFile(text_meta_graph_def_file_, &result));
+  EXPECT_EQ(result.DebugString(), meta_graph_def_.DebugString());
+}
+
+TEST_F(UtilsTest, ReadReadMetaGraphDefFromFile_Binary) {
+  MetaGraphDef result;
+  TF_CHECK_OK(ReadMetaGraphDefFromFile(binary_meta_graph_def_file_, &result));
+  EXPECT_EQ(result.DebugString(), meta_graph_def_.DebugString());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
index f0aff90c6c237c0097451c5153568808cf46728a..1a4754153bca9bb7ee019b9b9ea67e6ce3cb5f89 100644
--- a/tensorflow/core/grappler/mutable_graph_view.cc
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -14,13 +14,34 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/substitute.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
+const absl::flat_hash_set<MutableGraphView::InputPort>&
+MutableGraphView::GetFanout(const GraphView::OutputPort& port) const {
+  return GetFanout(MutableGraphView::OutputPort(const_cast<NodeDef*>(port.node),
+                                                port.port_id));
+}
+
+absl::flat_hash_set<MutableGraphView::OutputPort> MutableGraphView::GetFanin(
+    const GraphView::InputPort& port) const {
+  return GetFanin(MutableGraphView::InputPort(const_cast<NodeDef*>(port.node),
+                                              port.port_id));
+}
+
+const MutableGraphView::OutputPort MutableGraphView::GetRegularFanin(
+    const GraphView::InputPort& port) const {
+  return GetRegularFanin(MutableGraphView::InputPort(
+      const_cast<NodeDef*>(port.node), port.port_id));
+}
+
 NodeDef* MutableGraphView::AddNode(NodeDef&& node) {
-  auto* node_in_graph = GetGraph()->add_node();
+  auto* node_in_graph = graph()->add_node();
   *node_in_graph = std::move(node);
 
   AddUniqueNodeOrDie(node_in_graph);
@@ -29,54 +50,137 @@ NodeDef* MutableGraphView::AddNode(NodeDef&& node) {
   return node_in_graph;
 }
 
-NodeDef* MutableGraphView::InsertNode(const NodeDef& input_node, NodeDef&& node,
-                                      const int output_port_id) {
-  auto* node_in_graph = GetGraph()->add_node();
-  *node_in_graph = std::move(node);
+void MutableGraphView::UpdateFanouts(absl::string_view from_node,
+                                     absl::string_view to_node) {
+  NodeDef* from_node_ptr = GetNode(from_node);
+  NodeDef* to_node_ptr = GetNode(to_node);
+  if (from_node_ptr && to_node_ptr) {
+    UpdateFanouts(from_node_ptr, to_node_ptr);
+  } else if (!from_node_ptr) {
+    LOG(WARNING) << absl::Substitute(
+        "Can't update fanouts from '$0' to '$1', from node was not found.",
+        from_node, to_node);
+  } else {
+    LOG(WARNING) << absl::Substitute(
+        "Can't update fanouts from '$0' to '$1', to node was not found.",
+        from_node, to_node);
+  }
+}
 
-  AddUniqueNodeOrDie(node_in_graph);
+void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
+  VLOG(0) << absl::Substitute("Update fanouts from '$0' to '$1'.",
+                              from_node->name(), to_node->name());
+
+  // Update internal state with the new output_port->input_port edge.
+  const auto add_edge = [this](const OutputPort& output_port,
+                               const InputPort& input_port) {
+    fanouts()[output_port].insert(input_port);
+  };
+
+  // Remove invalidated edge from the internal state.
+  const auto remove_edge = [this](const OutputPort& output_port,
+                                  const InputPort& input_port) {
+    fanouts()[output_port].erase(input_port);
+  };
+
+  // First we update regular fanouts. For the regular fanouts
+  // `input_port:port_id` is the input index in NodeDef.
+
+  auto regular_edges =
+      GetFanoutEdges(*from_node, /*include_controlled_edges=*/false);
+
+  // Maximum index of the `from_node` output tensor that is still used as an
+  // input to some other node.
+  int keep_max_regular_output_port = -1;
+
+  for (const Edge& edge : regular_edges) {
+    const OutputPort output_port = edge.src;
+    const InputPort input_port = edge.dst;
+
+    // If the `to_node` reads from the `from_node`, skip this edge (see
+    // AddAndUpdateFanoutsWithoutSelfLoops test for an example).
+    if (input_port.node == to_node) {
+      keep_max_regular_output_port =
+          std::max(keep_max_regular_output_port, input_port.port_id);
+      continue;
+    }
+
+    // Update input at destination node.
+    input_port.node->set_input(
+        input_port.port_id,
+        output_port.port_id == 0
+            ? to_node->name()
+            : absl::StrCat(to_node->name(), ":", output_port.port_id));
+
+    // Remove old edge between the `from_node` and the fanout node.
+    remove_edge(output_port, input_port);
+    // Add an edge between the `to_node` and new fanout node.
+    add_edge(OutputPort(to_node, output_port.port_id), input_port);
+  }
 
-  // replace input for the output nodes of `input_node` with `node`
-  ReplaceInput(input_node, *node_in_graph, output_port_id);
+  // For the control fanouts we do not know the input index in a NodeDef,
+  // so we have to traverse all control inputs.
+
+  auto control_fanouts =
+      GetFanout(GraphView::OutputPort(from_node, Graph::kControlSlot));
+  if (control_fanouts.empty()) return;
+
+  const string from_control_input = absl::StrCat("^", from_node->name());
+  const string to_control_input = absl::StrCat("^", to_node->name());
+
+  for (const InputPort& control_port : control_fanouts) {
+    // Node can't be control dependency of itself.
+    if (control_port.node == to_node) continue;
+
+    // Find and update input corresponding to control dependency.
+    NodeDef* node = control_port.node;
+    for (int i = node->input_size() - 1; i >= 0; --i) {
+      const string& input = node->input(i);
+      if (!IsControlInput(input)) break;  // we reached regular inputs
+      if (input == from_control_input) {
+        node->set_input(i, to_control_input);
+      }
+    }
+
+    // Remove old edge between the `from_node` and the fanout node.
+    remove_edge(OutputPort(from_node, Graph::kControlSlot), control_port);
+    // Add an edge between the `to_node` and new fanout node.
+    add_edge(OutputPort(to_node, Graph::kControlSlot), control_port);
+  }
 
-  AddFanouts(node_in_graph);
-  return node_in_graph;
-}
+  // Because we update all regular fanouts of `from_node`, we can just copy
+  // the value `num_regular_outputs`.
+  max_regular_output_port()[to_node] = max_regular_output_port()[from_node];
 
-void MutableGraphView::ReplaceInput(const NodeDef& old_input,
-                                    const NodeDef& new_input,
-                                    const int output_port_id) {
-  GraphView::OutputPort output_port =
-      GetOutputPort(old_input.name(), output_port_id);
-  auto fanout = GetFanout(output_port);
-  for (auto& input_port : fanout) {
-    input_port.node->set_input(input_port.port_id, new_input.name());
-    AddFanouts(input_port.node);
+  // Check if all fanouts were updated to read from the `to_node`.
+  if (keep_max_regular_output_port >= 0) {
+    max_regular_output_port()[from_node] = keep_max_regular_output_port;
+  } else {
+    max_regular_output_port().erase(from_node);
   }
 }
 
 void MutableGraphView::DeleteNodes(const std::set<string>& nodes_to_delete) {
   for (const string& node_name_to_delete : nodes_to_delete)
-    RemoveFanouts(MutableNodes()->at(node_name_to_delete));
+    RemoveFanouts(nodes().at(node_name_to_delete));
   for (const string& node_name_to_delete : nodes_to_delete)
-    MutableNodes()->erase(node_name_to_delete);
-  EraseNodesFromGraph(nodes_to_delete, GetGraph());
+    nodes().erase(node_name_to_delete);
+  EraseNodesFromGraph(nodes_to_delete, graph());
 }
 
-void MutableGraphView::RemoveFanouts(NodeDef* node) {
-  for (int i = 0; i < node->input_size(); ++i) {
-    OutputPort fanin;
-    string fanin_name = ParseNodeName(node->input(i), &fanin.port_id);
-    fanin.node = (*MutableNodes())[fanin_name];
+void MutableGraphView::RemoveFanouts(NodeDef* deleted_node) {
+  for (int i = 0; i < deleted_node->input_size(); ++i) {
+    TensorId tensor_id = ParseTensorName(deleted_node->input(i));
+    OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
 
     InputPort input;
-    input.node = node;
-    if (fanin.port_id < 0)
-      input.port_id = -1;
+    input.node = deleted_node;
+    if (tensor_id.index() < 0)
+      input.port_id = Graph::kControlSlot;
     else
       input.port_id = i;
 
-    (*MutableFanouts())[fanin].erase(input);
+    fanouts()[fanin].erase(input);
   }
 }
 
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
index 971e5503d4ce908dbb86a4f127ac4da6bea95874..355dd6c491763e96b509ce42977e2cf0f5db2eb5 100644
--- a/tensorflow/core/grappler/mutable_graph_view.h
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -24,37 +24,64 @@ namespace grappler {
 // A utility class to simplify the traversal of a GraphDef that, unlike
 // GraphView, supports updating the graph.  Note that you should not modify the
 // graph separately, because the view will get out of sync.
-class MutableGraphView : public GraphView {
+
+class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
  public:
-  using GraphView::GraphView;
+  explicit MutableGraphView(GraphDef* graph) : GraphViewInternal(graph) {
+    for (NodeDef& node : *graph->mutable_node()) AddUniqueNodeOrDie(&node);
+    for (NodeDef& node : *graph->mutable_node()) AddFanouts(&node);
+  }
 
-  GraphDef* GetGraph() { return MutableGraph(); }
+  // Lookup fanouts/fanins using immutable ports.
+  using GraphViewInternal::GetFanout;
+  const absl::flat_hash_set<InputPort>& GetFanout(
+      const GraphView::OutputPort& port) const;
 
-  // Adds a new node to graph and updates the view.
-  NodeDef* AddNode(NodeDef&& node);
+  using GraphViewInternal::GetFanin;
+  absl::flat_hash_set<OutputPort> GetFanin(
+      const GraphView::InputPort& port) const;
 
-  // Inserts a new node to the graph after `input` node and updates the view.
-  // This adds `node` to the graph and replaces the input for the output
-  // nodes of `input` with a port `output_port_id` with the new node.
-  NodeDef* InsertNode(const NodeDef& input, NodeDef&& node,
-                      int output_port_id = 0);
+  using GraphViewInternal::GetRegularFanin;
+  const OutputPort GetRegularFanin(const GraphView::InputPort& port) const;
 
-  // Replaces the input for the output nodes of 'old_input' with a port
-  // `output_port_id` with 'new_input'.
+  // Adds a new node to graph and updates the view. Returns a pointer to the
+  // node in graph.
+  NodeDef* AddNode(NodeDef&& node);
+
+  // Updates all fanouts (input ports fetching output tensors) from `from_node`
+  // to the `to_node`, including control dependencies.
+  //
+  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0, ^bar)
+  //   2. foo2(bar:1, other:1)
   //
-  // E.g: We have 2 nodes that use 'bar' node outputs as inputs:
-  // foo(bar:0, bar:1),  foo2(other:0, bar:0)
-  // Calling ReplaceInput(bar, new, 0) changes every occurrence of bar:0 for
-  // new:0.  Result:
-  // foo(new:0, bar:1),  foo2(other:0, new:0)
-  void ReplaceInput(const NodeDef& old_input, const NodeDef& new_input,
-                    int output_port_id = 0);
+  // After calling ForwardOutputs(bar, new_bar):
+  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
+  //   2. foo2(new_bar:1, other:1)
+  void UpdateFanouts(absl::string_view from_node, absl::string_view to_node);
 
   // Deletes nodes from the graph.
   void DeleteNodes(const std::set<string>& nodes_to_delete);
 
  private:
-  void RemoveFanouts(NodeDef* node);
+  // Updates all fanouts (input ports fetching output tensors) from `from_node`
+  // to the `to_node`, including control dependencies.
+  //
+  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0, ^bar)
+  //   2. foo2(bar:1, other:1)
+  //
+  // After calling ForwardOutputs(bar, new_bar):
+  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
+  //   2. foo2(new_bar:1, other:1)
+  //
+  // IMPORTANT: If `from_node` or `to_node` is not in the underlying graph, the
+  // behavior is undefined.
+  void UpdateFanouts(NodeDef* from_node, NodeDef* to_node);
+
+  // Remove fanouts of the deleted node from internal state (including control
+  // dependencies).
+  void RemoveFanouts(NodeDef* deleted_node);
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
index 2536bec35ddcf7f45eb6dd5a7899059a7e67e418..c1b3f8c01cf3dbb570d64845fb7097d1b309fc30 100644
--- a/tensorflow/core/grappler/mutable_graph_view_test.cc
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/platform/test.h"
@@ -23,103 +24,122 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-bool FindChildWithName(const MutableGraphView& graph,
-                       const string& output_port_name,
-                       const string& input_name) {
-  GraphView::OutputPort output_port = graph.GetOutputPort(output_port_name, 0);
-  auto fanout = graph.GetFanout(output_port);
-  for (auto& input_port : fanout) {
-    if (input_port.node->name() == input_name) return true;
-  }
-  return false;
+using ::tensorflow::test::function::NDef;
+
+TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar", "NotImportant", {}, {}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar", "other", "bar:1", "^bar"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"})},
+      /* empty function library */ {});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NotImportant", {}, {}));
+  NodeDef* bar = graph.GetNode("bar");
+
+  graph.UpdateFanouts(bar->name(), new_bar->name());
+
+  // Fanout nodes must have their inputs updated.
+  NodeDef* foo_1 = graph.GetNode("foo_1");
+  ASSERT_NE(foo_1, nullptr);
+  ASSERT_EQ(foo_1->input_size(), 4);
+  EXPECT_EQ(foo_1->input(0), "new_bar");
+  EXPECT_EQ(foo_1->input(1), "other");
+  EXPECT_EQ(foo_1->input(2), "new_bar:1");
+  EXPECT_EQ(foo_1->input(3), "^new_bar");
+
+  NodeDef* foo_2 = graph.GetNode("foo_2");
+  ASSERT_NE(foo_2, nullptr);
+  ASSERT_EQ(foo_2->input_size(), 3);
+  EXPECT_EQ(foo_2->input(0), "other:1");
+  EXPECT_EQ(foo_2->input(1), "new_bar:2");
+  EXPECT_EQ(foo_2->input(2), "^new_bar");
+
+  // And fanouts mapping must be also updated for both nodes.
+  bool include_control_fanouts = true;
+  auto old_node_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
+  auto new_node_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
+
+  EXPECT_TRUE(old_node_fanouts.empty());
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 0)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 2)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, -1)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
+  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
 }
 
-TrivialTestGraphInputYielder SimpleGraph() {
-  // This outputs simple graph like:
-  //        x
-  //       / \
-  // Square   Square_1
-  //   |   \  /    |
-  //   |    \/     |
-  //   |    /\     |
-  //   |   /  \    |
-  //  AddN     AddN_1
-  //      \   /
-  //        y
-  TrivialTestGraphInputYielder simple_graph(2, 2, 2, false,
-                                            {"/CPU:0", "/GPU:0"});
-  return simple_graph;
-}
-
-TEST(MutableGraphViewTest, AddAndReplaceInput) {
-  TrivialTestGraphInputYielder fake_input = SimpleGraph();
-  GrapplerItem item;
-  CHECK(fake_input.NextItem(&item));
+TEST(MutableGraphViewTest, AddAndUpdateFanoutsWithoutSelfLoops) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def =
+      test::function::GDef({NDef("bar", "NotImportant", {}, {}),
+                            NDef("foo", "NotImportant", {"bar", "^bar"})},
+                           /* empty function library */ {});
 
-  GraphDef new_graph = item.graph;
-  MutableGraphView graph(&new_graph);
+  MutableGraphView graph(&graph_def);
 
-  GraphView::InputPort input = graph.GetInputPort("AddN", 0);
-  EXPECT_EQ("AddN", input.node->name());
-  EXPECT_EQ(0, input.port_id);
-  GraphView::OutputPort fanin = graph.GetRegularFanin(input);
-  EXPECT_EQ("Square", fanin.node->name());
-  EXPECT_EQ(0, fanin.port_id);
+  // `new_bar` reads the output of an original `bar` node.
+  NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NewBar", {"bar"}, {}));
+  NodeDef* bar = graph.GetNode("bar");
 
-  EXPECT_FALSE(FindChildWithName(graph, "Square", "new_node"));
+  graph.UpdateFanouts("bar", new_bar->name());
 
-  NodeDef new_node = *input.node;
-  new_node.set_name("new_node");
+  // Foo node must read from `new_bar`.
+  NodeDef* foo = graph.GetNode("foo");
+  ASSERT_NE(foo, nullptr);
+  ASSERT_EQ(foo->input_size(), 2);
+  EXPECT_EQ(foo->input(0), "new_bar");
+  EXPECT_EQ(foo->input(1), "^new_bar");
 
-  EXPECT_EQ(graph.GetNode("new_node"), nullptr);
-  NodeDef* node_in_graph = graph.AddNode(std::move(new_node));
-  EXPECT_NE(graph.GetNode("new_node"), nullptr);
+  // And the `new_bar` should read from the original `bar`.
+  ASSERT_EQ(new_bar->input_size(), 1);
+  ASSERT_EQ(new_bar->input(0), "bar");
 
-  graph.ReplaceInput(*input.node, *node_in_graph);
-  EXPECT_TRUE(FindChildWithName(graph, "Square", "new_node"));
-  EXPECT_TRUE(FindChildWithName(graph, "new_node", "y"));
-}
+  // And fanouts mapping must be also updated for both nodes.
+  bool include_control_fanouts = true;
+  auto bar_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
+  auto new_bar_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
 
-TEST(MutableGraphViewTest, InsertNodes) {
-  TrivialTestGraphInputYielder fake_input = SimpleGraph();
+  EXPECT_EQ(bar_fanouts.size(), 1);
+  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(new_bar, 0)), 1);
 
-  GrapplerItem item;
-  CHECK(fake_input.NextItem(&item));
+  EXPECT_EQ(new_bar_fanouts.size(), 2);
+  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, 0)), 1);
+  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, -1)), 1);
+}
 
-  GraphDef new_graph = item.graph;
-  MutableGraphView graph(&new_graph);
+TEST(MutableGraphViewTest, DeleteNodes) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar", "NotImportant", {}, {}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar", "other", "bar:1", "^bar"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"})},
+      /* empty function library */ {});
 
-  GraphView::InputPort input = graph.GetInputPort("AddN", 0);
+  MutableGraphView graph(&graph_def);
 
-  NodeDef new_node = *input.node;
-  new_node.set_name("new_node");
-  new_node.set_input(0, input.node->name());
+  EXPECT_NE(graph.GetNode("foo_1"), nullptr);
+  graph.DeleteNodes({"foo_1"});
 
-  EXPECT_EQ(graph.GetNode("new_node"), nullptr);
-  graph.InsertNode(*input.node, std::move(new_node));
-  EXPECT_NE(graph.GetNode("new_node"), nullptr);
-  EXPECT_TRUE(FindChildWithName(graph, "Square", "AddN"));
-  EXPECT_TRUE(FindChildWithName(graph, "Square", "AddN_1"));
-  EXPECT_TRUE(FindChildWithName(graph, "Square_1", "AddN"));
-  EXPECT_TRUE(FindChildWithName(graph, "Square_1", "AddN_1"));
-  EXPECT_TRUE(FindChildWithName(graph, "AddN", "new_node"));
-  EXPECT_TRUE(FindChildWithName(graph, "AddN_1", "y"));
-  EXPECT_TRUE(FindChildWithName(graph, "new_node", "y"));
-}
+  EXPECT_EQ(graph.GetNode("foo_1"), nullptr);
 
-TEST(MutableGraphViewTest, DeleteNodes) {
-  // Outputs simple graph as described in first test.
-  TrivialTestGraphInputYielder fake_input = SimpleGraph();
-  GrapplerItem item;
-  CHECK(fake_input.NextItem(&item));
+  NodeDef* bar = graph.GetNode("bar");
+  NodeDef* other = graph.GetNode("other");
+  NodeDef* foo_2 = graph.GetNode("foo_2");
 
-  GraphDef new_graph = item.graph;
-  MutableGraphView graph(&new_graph);
+  bool include_control_fanouts = true;
+  auto bar_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
+  auto other_fanouts = graph.GetFanouts(*other, include_control_fanouts);
 
-  EXPECT_NE(graph.GetNode("AddN"), nullptr);
-  graph.DeleteNodes({"AddN"});
+  EXPECT_EQ(bar_fanouts.size(), 2);
+  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
+  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
 
-  EXPECT_EQ(graph.GetNode("AddN"), nullptr);
+  EXPECT_EQ(other_fanouts.size(), 1);
+  EXPECT_EQ(other_fanouts.count(MutableGraphView::InputPort(foo_2, 0)), 1);
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 1b5a215987f085cef51d43e08757518054016d65..38fc1fff329eda5b80bb771442f2c543bd27e85d 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -73,6 +73,17 @@ bool IsBitcast(const NodeDef& node) { return node.op() == "Bitcast"; }
 
 bool IsCast(const NodeDef& node) { return node.op() == "Cast"; }
 
+bool IsCastLike(const NodeDef& node) {
+  static const gtl::FlatSet<string>* const kCastLikeOps =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
+          "Angle", "Bucketize", "Cast", "CompareAndBitpack", "Dequantize",
+          "HistogramFixedWidth", "Imag", "IsFinite", "IsInf", "IsNan",
+          "Quantize", "QuantizeDownAndShrinkRange", "QuantizeV2",
+          "QuantizedInstanceNorm", "QuantizedRelu", "QuantizedRelu6",
+          "QuantizedReluX", "Real", "Requantize"}));
+  return kCastLikeOps->count(node.op()) > 0;
+}
+
 bool IsCheckNumerics(const NodeDef& node) {
   return node.op() == "CheckNumerics";
 }
@@ -195,12 +206,19 @@ bool IsExit(const NodeDef& node) {
 
 bool IsExp(const NodeDef& node) { return node.op() == "Exp"; }
 
+bool IsFakeParam(const NodeDef& node) { return node.op() == "FakeParam"; }
+
 bool IsFill(const NodeDef& node) { return node.op() == "Fill"; }
 
 bool IsFloorDiv(const NodeDef& node) { return node.op() == "FloorDiv"; }
 
 bool IsFloorMod(const NodeDef& node) { return node.op() == "FloorMod"; }
 
+bool IsFusedBatchNorm(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "FusedBatchNorm" || op == "FusedBatchNormV2";
+}
+
 bool IsFusedBatchNormGrad(const NodeDef& node) {
   const auto& op = node.op();
   return op == "FusedBatchNormGrad" || op == "FusedBatchNormGradV2";
@@ -216,9 +234,6 @@ bool IsHistogramSummary(const NodeDef& node) {
 
 bool IsIdentity(const NodeDef& node) {
   const auto& op = node.op();
-  if (op == "IdentityN" && node.attr().at("T").list().type_size() == 1) {
-    return true;
-  }
   return op == "Identity" || op == "RefIdentity";
 }
 
@@ -227,12 +242,21 @@ bool IsIdentityN(const NodeDef& node) {
   return op == "IdentityN";
 }
 
+bool IsIdentityNSingleInput(const NodeDef& node) {
+  return IsIdentityN(node) && node.attr().count("T") != 0 &&
+         node.attr().at("T").list().type_size() == 1;
+}
+
 bool IsIgamma(const NodeDef& node) { return node.op() == "Igamma"; }
 
 bool IsIgammac(const NodeDef& node) { return node.op() == "Igammac"; }
 
 bool IsImag(const NodeDef& node) { return node.op() == "Imag"; }
 
+bool IsImmutableConst(const NodeDef& node) {
+  return node.op() == "ImmutableConst";
+}
+
 bool IsInvGrad(const NodeDef& node) { return node.op() == "InvGrad"; }
 
 bool IsLess(const NodeDef& node) { return node.op() == "Less"; }
@@ -298,6 +322,10 @@ bool IsPad(const NodeDef& node) {
   return op == "Pad" || op == "PadV2";
 }
 
+bool IsPartitionedCall(const NodeDef& node) {
+  return node.op() == "PartitionedCall";
+}
+
 bool IsPlaceholder(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Placeholder" || op == "PlaceholderV2" ||
@@ -340,6 +368,8 @@ bool IsReduction(const NodeDef& node) {
          op == "Mean" || op == "Any" || op == "All";
 }
 
+bool IsRelu(const NodeDef& node) { return node.op() == "Relu"; }
+
 bool IsReluGrad(const NodeDef& node) { return node.op() == "ReluGrad"; }
 
 bool IsRelu6Grad(const NodeDef& node) { return node.op() == "Relu6Grad"; }
@@ -416,6 +446,10 @@ bool IsStackPopOp(const NodeDef& node) {
   return node.op() == "StackPop" || node.op() == "StackPopV2";
 }
 
+bool IsStatefulPartitionedCall(const NodeDef& node) {
+  return node.op() == "StatefulPartitionedCall";
+}
+
 bool IsStopGradient(const NodeDef& node) {
   const auto& op = node.op();
   return op == "StopGradient" || op == "PreventGradient";
@@ -517,14 +551,15 @@ bool MaybeHasRefInput(const NodeDef& node) {
   return false;
 }
 
-bool IsFreeOfSideEffect(const NodeDef& node) {
+bool IsFreeOfSideEffect(const NodeDef& node,
+                        const OpRegistryInterface* op_registry) {
   // Placeholders must be preserved to keep the graph feedable.
   if (IsPlaceholder(node)) {
     return false;
   }
   const OpDef* op_def = nullptr;
   const string& op_name = node.op();
-  Status status = OpRegistry::Global()->LookUpOpDef(op_name, &op_def);
+  Status status = op_registry->LookUpOpDef(op_name, &op_def);
   if (!status.ok()) {
     return false;
   }
@@ -541,9 +576,17 @@ bool IsFreeOfSideEffect(const NodeDef& node) {
   if (node.op().find("Queue") != string::npos) {
     return false;
   }
+  // Sending a tensor via a network is a side effect.
+  if (IsSend(node)) {
+    return false;
+  }
   return !ModifiesInputsInPlace(node);
 }
 
+bool IsFreeOfSideEffect(const NodeDef& node) {
+  return IsFreeOfSideEffect(node, OpRegistry::Global());
+}
+
 bool ModifiesInputsInPlace(const NodeDef& node) {
   // Some nodes do in-place updates on regular tensor inputs.
   string op_name = node.op();
@@ -630,8 +673,15 @@ bool IsValuePreserving(const NodeDef& node) {
       CHECK_NOTNULL((new gtl::FlatSet<string>{
           "InvertPermutation",
           "Reverse",
+          "ReverseV2",
           "Roll",
           "Transpose",
+          "DepthToSpace",
+          "SpaceToDepth",
+          "BatchToSpace",
+          "BatchToSpaceND",
+          "SpaceToBatch",
+          "SpaceToBatchND",
       }));
   return IsValueAndOrderPreserving(node) ||
          kValuePreservingOps->count(node.op()) > 0;
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index d4e0159e814a721699cc5148be194178d55c0a4f..67897e8512d7dc6e4774c066297674629dd4f714 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_OP_TYPES_H_
 
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -62,18 +63,22 @@ bool IsEnter(const NodeDef& node);
 bool IsEqual(const NodeDef& node);
 bool IsExit(const NodeDef& node);
 bool IsExp(const NodeDef& node);
+bool IsFakeParam(const NodeDef& node);
 bool IsFill(const NodeDef& node);
 bool IsFloorDiv(const NodeDef& node);
 bool IsFloorMod(const NodeDef& node);
+bool IsFusedBatchNorm(const NodeDef& node);
 bool IsFusedBatchNormGrad(const NodeDef& node);
 bool IsGreater(const NodeDef& node);
 bool IsGreaterEqual(const NodeDef& node);
 bool IsHistogramSummary(const NodeDef& node);
 bool IsIdentity(const NodeDef& node);
 bool IsIdentityN(const NodeDef& node);
+bool IsIdentityNSingleInput(const NodeDef& node);
 bool IsIgamma(const NodeDef& node);
 bool IsIgammac(const NodeDef& node);
 bool IsImag(const NodeDef& node);
+bool IsImmutableConst(const NodeDef& node);
 bool IsInvGrad(const NodeDef& node);
 bool IsLess(const NodeDef& node);
 bool IsLessEqual(const NodeDef& node);
@@ -97,6 +102,7 @@ bool IsNextIteration(const NodeDef& node);
 bool IsPack(const NodeDef& node);
 bool IsPad(const NodeDef& node);
 bool IsPack(const NodeDef& node);
+bool IsPartitionedCall(const NodeDef& node);
 bool IsNeg(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
 bool IsNotEqual(const NodeDef& node);
@@ -110,6 +116,7 @@ bool IsRandomShuffle(const NodeDef& node);
 bool IsRank(const NodeDef& node);
 bool IsReal(const NodeDef& node);
 bool IsRealDiv(const NodeDef& node);
+bool IsRelu(const NodeDef& node);
 bool IsRelu6Grad(const NodeDef& node);
 bool IsReluGrad(const NodeDef& node);
 bool IsReciprocalGrad(const NodeDef& node);
@@ -144,6 +151,7 @@ bool IsStackOp(const NodeDef& node);
 bool IsStackCloseOp(const NodeDef& node);
 bool IsStackPushOp(const NodeDef& node);
 bool IsStackPopOp(const NodeDef& node);
+bool IsStatefulPartitionedCall(const NodeDef& node);
 bool IsStopGradient(const NodeDef& node);
 bool IsStridedSlice(const NodeDef& node);
 bool IsStridedSliceGrad(const NodeDef& node);
@@ -173,7 +181,9 @@ bool IsCommutative(const NodeDef& node);
 // value.
 bool IsPersistent(const NodeDef& node);
 
-bool IsFreeOfSideEffect(const NodeDef& node);
+bool IsFreeOfSideEffect(const NodeDef& node,
+                        const OpRegistryInterface* op_registry);
+bool IsFreeOfSideEffect(const NodeDef& node);  // use OpRegistry::Global()
 
 // Returns true if the takes a tensor reference as input, or if looking up its
 // OpDef failed.
@@ -210,6 +220,10 @@ bool IsUnaryElementWise(const NodeDef& node);
 // Returns true if we can find an opdef corresponding to the op of the node.
 bool HasOpDef(const NodeDef& node);
 
+// Returns true if the op changes the scalar type of its first input elements
+// and preserves the number of elements.
+bool IsCastLike(const NodeDef& node);
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index c708f849488c724dd98ba888654c564e93c9b963..79578cb3ce0733bcfce1a382414c20881879e3e3 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -3,7 +3,6 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
 # Platform specific build config
@@ -103,6 +102,7 @@ cc_library(
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:symbolic_shapes",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -113,6 +113,7 @@ tf_cc_test(
     deps = [
         ":constant_folding",
         ":dependency_optimizer",
+        "//tensorflow/cc:array_ops_internal",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/core:all_kernels",
@@ -140,14 +141,18 @@ cc_library(
     deps = [
         ":graph_optimizer",
         "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:functions",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -175,22 +180,6 @@ tf_cuda_cc_test(
     ],
 )
 
-cc_library(
-    name = "graph_rewriter",
-    srcs = ["graph_rewriter.cc"],
-    hdrs = [
-        "graph_rewriter.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
-    ],
-)
-
 cc_library(
     name = "graph_optimizer",
     hdrs = [
@@ -209,6 +198,8 @@ cc_library(
     hdrs = ["graph_optimizer_stage.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
@@ -347,11 +338,14 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
-        ":graph_rewriter",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -391,7 +385,7 @@ cc_library(
     srcs = [
         "gpu_swapping_ops.cc",
     ],
-    visibility = ["//tensorflow:__subpackages__"],
+    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -411,24 +405,25 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
-        ":graph_rewriter",
         ":static_schedule",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:graph_memory",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/costs:utils",
         "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler/utils:traversal",
     ],
 )
 
-tf_cuda_only_cc_test(
+tf_cuda_cc_test(
     name = "memory_optimizer_test",
     srcs = ["memory_optimizer_test.cc"],
     tags = ["no_cuda_on_cpu_tap"],  # Do not re-enable again without actually testing.
@@ -437,6 +432,8 @@ tf_cuda_only_cc_test(
         ":gpu_swapping_ops",
         ":memory_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensor_testutil",
@@ -464,7 +461,6 @@ cc_library(
         "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/costs:virtual_placer",
@@ -531,6 +527,8 @@ cc_library(
         "//tensorflow/core/grappler/utils:colocation",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -552,6 +550,7 @@ tf_cuda_cc_test(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
         "//tensorflow/core/grappler/utils:grappler_test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -612,12 +611,12 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -650,8 +649,8 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
@@ -692,6 +691,8 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:topological_sort",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -764,7 +765,6 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
     ],
@@ -831,6 +831,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -857,11 +858,10 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 7d5014ee0ada1f0345e108804f5170e01f5b4354..d35c00f29ecb1c1acedb41c29f08d20decf6476e 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -46,6 +46,7 @@ limitations under the License.
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/saved_tensor_slice_util.h"
+#include "tensorflow/core/util/strided_slice_op.h"
 
 using tensorflow::strings::StrCat;
 
@@ -66,7 +67,8 @@ bool ValuesFromConstNode(const NodeDef& node, std::vector<T>* values) {
     return false;
   }
 
-  if (node.attr().at("dtype").type() != DataTypeToEnum<T>::value) {
+  if (node.attr().count("dtype") == 0 || node.attr().count("value") == 0 ||
+      node.attr().at("dtype").type() != DataTypeToEnum<T>::value) {
     return false;
   }
 
@@ -124,39 +126,6 @@ void SetDataTypeToAttr(DataType dtype, const string& attr_name, NodeDef* node) {
   (*node->mutable_attr())[attr_name].set_type(dtype);
 }
 
-string SourceDataTypeAttrName(const NodeDef& node) {
-  if (node.op() == "Bitcast") {
-    return "T";
-  } else if (node.op() == "Cast") {
-    return "SrcT";
-  } else {
-    LOG(FATAL) << "SourceDataTypeAttrName not implemented for op " << node.op();
-  }
-}
-
-string DestinationDataTypeAttrName(const NodeDef& node) {
-  if (node.op() == "Bitcast") {
-    return "type";
-  } else if (node.op() == "Cast") {
-    return "DstT";
-  } else {
-    LOG(FATAL) << "DestinationDataTypeAttrName not implemented for op "
-               << node.op();
-  }
-}
-
-DataType GetSourceDataType(const NodeDef& node) {
-  return GetDataTypeFromAttr(node, SourceDataTypeAttrName(node));
-}
-
-DataType GetDestinationDataType(const NodeDef& node) {
-  return GetDataTypeFromAttr(node, DestinationDataTypeAttrName(node));
-}
-
-void SetSourceDataType(DataType dtype, NodeDef* node) {
-  SetDataTypeToAttr(dtype, SourceDataTypeAttrName(*node), node);
-}
-
 NodeDef* GetTailOfValuePreservingChain(
     const NodeDef& node, const NodeMap& node_map,
     const std::unordered_set<string>& nodes_to_preserve) {
@@ -266,18 +235,17 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
 
   // TODO(ezhulenev): move to GraphOptimizerStage?
   bool IsDrivenByControlDependency(const NodeDef& node) const {
-    return std::any_of(node.input().begin(), node.input().end(),
-                       IsControlInput);
+    return std::any_of(
+        node.input().begin(), node.input().end(),
+        [](const string& input) { return IsControlInput(input); });
   }
 
   // TODO(ezhulenev): move to GraphOptimizerStage?
   bool DrivesControlDependency(const NodeDef& node) const {
-    int position;
     for (const NodeDef* output : ctx().node_map->GetOutputs(node.name())) {
       for (int i = 0; i < output->input_size(); ++i) {
-        auto input = output->input(i);
-        StringPiece name = ParseNodeNameAsStringPiece(input, &position);
-        if (name == node.name() && /*control input*/ position < 0) {
+        const TensorId tensor = ParseTensorName(output->input(i));
+        if (tensor.node() == node.name() && tensor.index() < 0) {
           return true;
         }
       }
@@ -632,7 +600,7 @@ class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
     CHECK(!inputs.empty()) << "Inputs must be non-empty";
 
     // Do not create redundant AddN nodes
-    if (inputs.size() == 1) {
+    if (inputs.size() == 1 || root_node.attr().count("T") == 0) {
       return inputs[0];
     }
 
@@ -1268,7 +1236,12 @@ class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
     TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
     // Bypass Bitcast whose source type and destination type are equal.
-    if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
+    AttrSlice attrs(*node);
+    DataType input_type;
+    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "T", &input_type));
+    DataType output_type;
+    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "type", &output_type));
+    if (input_type == output_type) {
       *simplified_node_name = node->input(0);
       return Status::OK();
     }
@@ -1279,9 +1252,12 @@ class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
     TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &operand));
 
     if (IsBitcast(*operand)) {
+      AttrSlice operand_attrs(*operand);
+      DataType operand_input_type;
+      TF_RETURN_IF_ERROR(GetNodeAttr(operand_attrs, "T", &operand_input_type));
       // Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2)
       bitcast->set_input(0, operand->input(0));
-      SetSourceDataType(GetSourceDataType(*operand), bitcast);
+      SetDataTypeToAttr(operand_input_type, "T", bitcast);
       ctx().node_map->UpdateInput(bitcast->name(), bitcast->input(0),
                                   operand->input(0));
       AddToOptimizationQueue(bitcast);
@@ -1306,7 +1282,12 @@ class RemoveRedundantCastStage : public ArithmeticOptimizerStage {
     TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
     // Bypass Cast whose source type and destination type are equal.
-    if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
+    AttrSlice attrs(*node);
+    DataType input_type;
+    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "SrcT", &input_type));
+    DataType output_type;
+    TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "DstT", &output_type));
+    if (input_type == output_type) {
       *simplified_node_name = node->input(0);
     }
     return Status::OK();
@@ -1441,10 +1422,11 @@ class HoistCWiseUnaryChainsStage : public ArithmeticOptimizerStage {
 
   bool IsSupported(const NodeDef* node) const override {
     if (IsInPreserveSet(*node)) return false;
-    if (IsConcat(*node)) {
+    if (IsConcat(*node) && node->attr().count("N") != 0) {
       const int n = node->attr().at("N").i();
       return n > 1;
-    } else if (IsSplit(*node) || IsSplitV(*node)) {
+    } else if ((IsSplit(*node) || IsSplitV(*node)) &&
+               node->attr().count("num_split") != 0) {
       const int num_split = node->attr().at("num_split").i();
       if (NumNonControlOutputs(*node, *ctx().node_map) > num_split) {
         // TODO(rmlarsen): Remove this constraint when we have optimizations
@@ -1547,6 +1529,7 @@ class HoistCWiseUnaryChainsStage : public ArithmeticOptimizerStage {
   Status InitializeChains(const NodeDef& node, ChainLinkSet* tails) const {
     if (node_is_concat_) {
       // Handle concat nodes by looking backwards in the graph.
+      TF_RETURN_IF_ERROR(CheckAttrExists(node, "N"));
       const int n = node.attr().at("N").i();
       const int start = node.op() == "Concat" ? 1 : 0;
       const int end = start + n;
@@ -1567,11 +1550,9 @@ class HoistCWiseUnaryChainsStage : public ArithmeticOptimizerStage {
       const auto& outputs = ctx().node_map->GetOutputs(node.name());
       for (NodeDef* output : outputs) {
         if (IsControlInput(output->input(0))) continue;
-        int port;
-        const StringPiece node_name =
-            ParseNodeNameAsStringPiece(output->input(0), &port);
-        if (node_name == node.name()) {
-          tails->insert(ChainLink(output, port));
+        TensorId tensor_id = ParseTensorName(output->input(0));
+        if (tensor_id.node() == node.name()) {
+          tails->insert(ChainLink(output, tensor_id.index()));
         } else {
           // This output node has a non-control input other than the split node,
           // abort.
@@ -1618,14 +1599,12 @@ class HoistCWiseUnaryChainsStage : public ArithmeticOptimizerStage {
         new_tails->insert(ChainLink(new_tail, link.port_origin));
       } else {
         for (NodeDef* new_tail : ctx().node_map->GetOutputs(tail->name())) {
-          int port;
-          const StringPiece node_name =
-              ParseNodeNameAsStringPiece(new_tail->input(0), &port);
-          if (node_name != tail->name()) {
+          const TensorId tensor = ParseTensorName(new_tail->input(0));
+          if (tensor.node() != tail->name()) {
             return Status::OK();
           }
           // Skip control outputs.
-          if (port >= 0) {
+          if (tensor.index() >= 0) {
             // Remember original port.
             new_tails->insert(ChainLink(new_tail, link.port_origin));
           }
@@ -1779,7 +1758,8 @@ class SqrtDivToRsqrtMulStage : public ArithmeticOptimizerStage {
     TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &y));
     // Optimize only if divisor is a Sqrt whose output is not being consumed
     // elsewhere.
-    if (IsSqrt(*y) && (NumNonControlOutputs(*y, *ctx().node_map) == 1)) {
+    if (IsSqrt(*y) && !IsInPreserveSet(*y) &&
+        (NumNonControlOutputs(*y, *ctx().node_map) == 1)) {
       // a / sqrt(b) = a * rsqrt(b)
       node->set_op("Mul");
       y->set_op("Rsqrt");
@@ -1849,87 +1829,136 @@ class RemoveRedundantReshape : public ArithmeticOptimizerStage {
   }
 };
 
-// Reorder Cast and Transpose if beneficial.
+// Reorder casting and value-preserving ops if beneficial.
 //
-// A common pattern after the layout optimizer is casting an uint8 NHWC
-// image to float before transposing it to NCHW. It is beneficial to reorder
-// the cast and the transpose to make the transpose process smaller amount
-// of data. This optimization converts
-//   Transpose(Cast(image, dst_type), perm)
+// Original motivation: A common pattern after the layout optimizer is
+// casting an uint8 NHWC image to float before transposing it to NCHW. It
+// is beneficial to reorder the cast and the transpose to make the transpose
+// process smaller amount of data. More generally, this optimization converts
+//   Op(Cast(tensor, dst_type))
+// to
+//   Cast(Op(tensor), dst_type)
+// when sizeof(tensor.type) < sizeof(dst_type), and Op is any value-preserving
+// Op, i.e. an op that only reorders the elements in its first input. Similarly,
+// this optimization converts
+//   Cast(Op(tensor), dst_type)
 // to
-//   Cast(Transpose(image, perm), dst_type)
-// when sizeof(image.type) < sizeof(dst_type).
+//   Op(Cast(tensor, dst_type))
+// when sizeof(tensor.type) > sizeof(dst_type)
 //
-// TODO(jingyue): This optimization can be generalized to a cast followed by
-// a chain of ops that merely reorder elements (e.g. Reshape and
-// DepthToSpace).
-class ReorderCastAndTranspose : public ArithmeticOptimizerStage {
+class ReorderCastLikeAndValuePreserving : public ArithmeticOptimizerStage {
  public:
-  explicit ReorderCastAndTranspose(const GraphOptimizerContext& ctx,
-                                   const ArithmeticOptimizerContext& ctx_ext)
-      : ArithmeticOptimizerStage("ReorderCastAndTranspose", ctx, ctx_ext) {}
-  ~ReorderCastAndTranspose() override = default;
+  explicit ReorderCastLikeAndValuePreserving(
+      const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("ReorderCastLikeAndValuePreserving", ctx,
+                                 ctx_ext) {}
+  ~ReorderCastLikeAndValuePreserving() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
-    return IsTranspose(*node) && NodeIsOnCpuOrGpu(node);
-  }
+    return (IsValuePreserving(*node) || IsCastLike(*node)) &&
+           !IsCheckNumerics(*node) && NodeIsOnCpuOrGpu(node) &&
+           !IsControlFlow(*node) && !IsInPreserveSet(*node);
+  }
+
+  Status TrySimplify(NodeDef* consumer, string* simplified_node_name) override {
+    NodeDef* producer;
+    TF_RETURN_IF_ERROR(GetInputNode(consumer->input(0), &producer));
+    const bool producer_is_cast = IsCastLike(*producer);
+    const bool can_optimize =
+        !IsCheckNumerics(*producer) &&
+        ((producer_is_cast && IsValuePreserving(*consumer)) ||
+         (IsValuePreserving(*producer) && IsCastLike(*consumer)));
+    if (!can_optimize || IsControlFlow(*producer) ||
+        producer->device() != consumer->device()) {
+      return Status::OK();
+    }
 
-  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    const NodeDef* transpose = node;
+    const NodeDef* cast_like_node = producer_is_cast ? producer : consumer;
+    const OpDef* cast_like_op_def = nullptr;
+    TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(cast_like_node->op(),
+                                                         &cast_like_op_def));
+    DataType cast_src_type;
+    TF_RETURN_IF_ERROR(InputTypeForNode(*cast_like_node, *cast_like_op_def, 0,
+                                        &cast_src_type));
+    DataType cast_dst_type;
+    TF_RETURN_IF_ERROR(OutputTypeForNode(*cast_like_node, *cast_like_op_def, 0,
+                                         &cast_dst_type));
+    if (!IsFixedSizeType(cast_src_type) || !IsFixedSizeType(cast_dst_type)) {
+      return Status::OK();
+    } else if (producer_is_cast &&
+               DataTypeSize(cast_dst_type) <= DataTypeSize(cast_src_type)) {
+      return Status::OK();
+    } else if (!producer_is_cast &&
+               DataTypeSize(cast_dst_type) >= DataTypeSize(cast_src_type)) {
+      return Status::OK();
+    }
 
-    // Verify that input to Transpose is the Cast op.
-    NodeDef* cast;
-    TF_RETURN_IF_ERROR(GetInputNode(transpose->input(0), &cast));
-    if (!IsCast(*cast)) return Status::OK();
+    // Check that nodes were not already optimized.
+    const string optimized_producer_name = OptimizedNodeName(
+        ParseNodeScopeAndName(producer->name()), DataTypeString(cast_dst_type));
+    const string optimized_consumer_name = OptimizedNodeName(
+        ParseNodeScopeAndName(consumer->name()), DataTypeString(cast_src_type));
+    const bool is_already_optimized =
+        ctx().node_map->NodeExists(optimized_consumer_name) ||
+        ctx().node_map->NodeExists(optimized_producer_name);
+    if (is_already_optimized) {
+      return Status::OK();
+    }
 
-    // Input to the Cast-Transpose chain.
+    // Add copies of consumer and producer in reverse order.
     NodeDef* input;
-    TF_RETURN_IF_ERROR(GetInputNode(cast->input(0), &input));
-
-    const DataType src_type = GetSourceDataType(*cast);
-    const DataType dst_type = GetDestinationDataType(*cast);
-
-    const string src_type_name = DataTypeString(src_type);
-    const string dst_type_name = DataTypeString(dst_type);
-
-    // Check if nodes were not already optimized.
-    const string optimized_cast_name =
-        OptimizedNodeName(ParseNodeScopeAndName(cast->name()), dst_type_name);
-    const string optimized_transpose_name = OptimizedNodeName(
-        ParseNodeScopeAndName(transpose->name()), src_type_name);
-
-    bool is_already_optimized =
-        ctx().node_map->NodeExists(optimized_transpose_name) ||
-        ctx().node_map->NodeExists(optimized_cast_name);
-
-    if (IsNumberType(src_type) && IsNumberType(dst_type) &&
-        DataTypeSize(src_type) < DataTypeSize(dst_type) &&
-        !is_already_optimized) {
-      NodeDef* new_transpose = AddCopyNode(optimized_transpose_name, transpose);
-      (*new_transpose->mutable_attr())["T"].set_type(src_type);
-      new_transpose->set_input(0, cast->input(0));
-
-      ctx().node_map->AddOutput(input->name(), new_transpose->name());
-      ctx().node_map->AddOutput(NodeName(new_transpose->input(1)),
-                                new_transpose->name());
-
-      NodeDef* new_cast = AddCopyNode(optimized_cast_name, cast);
-      new_cast->set_input(0, new_transpose->name());
-      ctx().node_map->AddOutput(new_transpose->name(), new_cast->name());
-
-      AddToOptimizationQueue(new_transpose);
-      ForwardControlDependencies(new_transpose, {cast, node});
-
-      *simplified_node_name = new_cast->name();
-    }
+    TF_RETURN_IF_ERROR(GetInputNode(producer->input(0), &input));
+    // Create new producer node.
+    NodeDef* new_producer = AddCopyNode(optimized_consumer_name, consumer);
+    new_producer->set_input(0, producer->input(0));
+    ctx().node_map->AddOutput(input->name(), new_producer->name());
+
+    // Create new consumer node.
+    NodeDef* new_consumer = AddCopyNode(optimized_producer_name, producer);
+    new_consumer->set_input(0, new_producer->name());
+
+    NodeDef* new_value_preserving =
+        producer_is_cast ? new_producer : new_consumer;
+    const DataType new_input_type =
+        producer_is_cast ? cast_src_type : cast_dst_type;
+    // Update the input type of the value-preserving node. The input and
+    // output types of the cast-like nodes remain the same.
+    TF_RETURN_IF_ERROR(SetInputType(new_input_type, new_value_preserving));
+    // Make sure there is a kernel registered for the value preserving op
+    // with the new input type.
+    TF_RETURN_IF_ERROR(IsKernelRegisteredForNode(*new_value_preserving));
+    ctx().node_map->AddOutput(new_producer->name(), new_consumer->name());
+
+    AddToOptimizationQueue(new_producer);
+    *simplified_node_name = new_consumer->name();
 
     return Status::OK();
   }
 
  private:
+  // Sets the type of the first input to dtype.
+  Status SetInputType(DataType dtype, NodeDef* node) {
+    const OpDef* op_def = nullptr;
+    TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(node->op(), &op_def));
+    const OpDef::ArgDef& input_arg = op_def->input_arg(0);
+    const string& type_attr_name = input_arg.type_attr();
+    if (type_attr_name.empty()) {
+      if (input_arg.type() == DT_INVALID || input_arg.type() != dtype) {
+        return errors::InvalidArgument("Could not set input type of ",
+                                       node->op(), " op to ",
+                                       DataTypeString(dtype));
+      } else {
+        // Op has fixed input type that already matches dtype.
+        return Status::OK();
+      }
+    }
+    SetDataTypeToAttr(dtype, type_attr_name, node);
+    return Status::OK();
+  }
   // This optimization can be dangerous on devices other than CPU and
   // GPU. The transpose might not be implemented for image.type, or
-  // might be slower with image.type than with dst_type.
+  // might be slower with image.type than with cast_dst_type.
   bool NodeIsOnCpuOrGpu(const NodeDef* node) const {
     using str_util::StrContains;
 
@@ -1940,7 +1969,10 @@ class ReorderCastAndTranspose : public ArithmeticOptimizerStage {
            (StrContains(device, DEVICE_CPU) || StrContains(device, DEVICE_GPU));
   }
 
-  bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
+  bool IsFixedSizeType(DataType dtype) {
+    return dtype != DT_STRING && dtype != DT_VARIANT && dtype != DT_RESOURCE &&
+           !kQuantizedTypes.Contains(dtype);
+  }
 };
 
 // Fold a multiply of a scalar into the following convolution. This folding
@@ -2020,6 +2052,8 @@ class FoldMultiplyIntoConv : public ArithmeticOptimizerStage {
 
     // Check that 'scale * weight' can be const folded.
     TF_RETURN_IF_TRUE(!IsConstant(*scale));
+    TF_RETURN_IF_ERROR(CheckAttrsExist(*scale, {"dtype", "value"}));
+    TF_RETURN_IF_ERROR(CheckAttrExists(*weights, "dtype"));
     TF_RETURN_IF_TRUE(scale->attr().at("dtype").type() !=
                       weights->attr().at("dtype").type());
 
@@ -2275,7 +2309,9 @@ class SimplifyAggregation : public ArithmeticOptimizerStage {
   ~SimplifyAggregation() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
-    return IsAggregate(*node) && NumNonControlInputs(*node) > 0;
+    return IsAggregate(*node) && NumNonControlInputs(*node) > 0 &&
+           GetDataTypeFromAttr(*node, "T") !=
+               DT_VARIANT;  // TODO(b/119787146): Enable for variants.
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
@@ -2371,11 +2407,10 @@ class ConvertPowStage : public ArithmeticOptimizerStage {
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
     const auto& pow_props =
         ctx().graph_properties->GetInputProperties(node->name())[1];
-    for (int i = 0; i < pow_props.shape().dim_size(); ++i) {
-      if (pow_props.shape().dim(i).size() < 0) {
-        // skip if p is is not fully defined.
-        return Status::OK();
-      }
+    PartialTensorShape shape(pow_props.shape());
+    if (!shape.IsFullyDefined()) {
+      // skip if p is not fully defined.
+      return Status::OK();
     }
     if (TensorShape::IsValid(pow_props.shape()) && pow_props.has_value()) {
       Tensor pow(pow_props.dtype(), pow_props.shape());
@@ -2423,11 +2458,10 @@ class ConvertPowStage : public ArithmeticOptimizerStage {
         AddToOptimizationQueue(y);
       } else if (curr == complex128(0, 0) &&
                  ShapesSymbolicallyEqual(value_props.shape(), output_shape)) {
-        for (int i = 0; i < value_props.shape().dim_size(); ++i) {
-          if (value_props.shape().dim(i).size() < 0) {
-            // skip if b is is not fully defined.
-            return Status::OK();
-          }
+        PartialTensorShape shape(value_props.shape());
+        if (!shape.IsFullyDefined()) {
+          // skip if b is not fully defined.
+          return Status::OK();
         }
         if (TensorShape::IsValid(value_props.shape()) &&
             value_props.has_value()) {
@@ -2794,6 +2828,7 @@ class UnaryOpsComposition : public ArithmeticOptimizerStage {
   }
 
   Status TrySimplify(NodeDef* root, string* simplified_node_name) override {
+    TF_RETURN_IF_ERROR(CheckAttrExists(*root, "T"));
     DataType dtype = root->attr().at("T").type();
 
     // Keep a trace of all supported input nodes that can be fused together.
@@ -2902,6 +2937,284 @@ class UnaryOpsComposition : public ArithmeticOptimizerStage {
   std::unordered_set<string> fused_nodes_;
 };
 
+// Replace operations of the form:
+//    x = stack((a_0, a_1, ..., a_{n-1}), axis=k)[:,...,i,...]
+// with
+//    a_i
+// when the strided slice index `i` is applied in the k'th axis.
+//
+// Similarly, replace operations of the form:
+//    x = stack((a_0, a_1, ..., a_{n-1}), axis=k)[:,...,i:i+1,...]
+// with
+//    expand_dims(a_i, axis=k)
+//
+// TODO(ebrevdo): Extend to also replace operations of the form
+//    concat((a_0, a_1, ..., ), axis=k)[:, ..., s_i:s_{i+1}, ...]
+// with
+//    a_i,
+// when
+//    s_i = cumsum(shape(a)[k] for a in (a_0, ...,))[i]
+// and slicing is in the k'th axis.
+class RemoveStackStridedSliceSameAxis : public ArithmeticOptimizerStage {
+ public:
+  explicit RemoveStackStridedSliceSameAxis(
+      const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveStackStridedSliceSameAxis", ctx,
+                                 ctx_ext) {}
+  ~RemoveStackStridedSliceSameAxis() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsStridedSlice(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    // *node is a StridedSlice NodeDef.
+    NodeDef* pack;
+
+    // Get the input and see if it's a Pack op.
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &pack));
+    if (!IsPack(*pack)) return Status::OK();
+
+    bool return_early;
+    PartialTensorShape pack_output_shape;
+    int pack_axis;
+    TF_RETURN_IF_ERROR(
+        CheckInputs(node, pack, &pack_output_shape, &pack_axis, &return_early));
+    if (return_early) return Status::OK();
+
+    int slice_start_value;
+    bool found;
+    TF_RETURN_IF_ERROR(GetSliceAxis(node, pack, pack_output_shape, pack_axis,
+                                    &slice_start_value, &found));
+    if (!found) return Status::OK();
+
+    return RewriteGraph(node, pack, slice_start_value, pack_axis,
+                        simplified_node_name);
+  }
+
+ protected:
+  bool IsReallyConstant(const NodeDef& node) const {
+    if (!IsConstant(node)) {
+      return false;
+    }
+    // If the node is fed it's not constant anymore.
+    return ctx().feed_nodes->find(node.name()) == ctx().feed_nodes->end();
+  }
+
+  bool GetConstantAsInt64(const NodeDef& node, DataType dtype,
+                          std::vector<int64>* values) {
+    if (dtype == DT_INT32) {
+      std::vector<int32> values_int32;
+      if (!ValuesFromConstNode(node, &values_int32)) {
+        return false;
+      }
+      std::copy(values_int32.begin(), values_int32.end(),
+                std::inserter(*values, values->begin()));
+      return true;
+    } else {
+      return ValuesFromConstNode(node, values);
+    }
+  }
+
+  Status CheckInputs(const NodeDef* node, const NodeDef* pack,
+                     PartialTensorShape* pack_output_shape, int* pack_axis,
+                     bool* return_early) {
+    *return_early = true;
+    TF_RETURN_IF_ERROR(CheckAttrExists(*pack, "axis"));
+
+    *pack_axis = pack->attr().at("axis").i();
+    auto slice_properties =
+        ctx().graph_properties->GetInputProperties(node->name());
+    if (slice_properties.empty() ||
+        slice_properties[0].shape().unknown_rank()) {
+      return Status::OK();
+    }
+    *pack_output_shape = slice_properties[0].shape();
+    const int pack_input_rank = pack_output_shape->dims() - 1;
+    if (*pack_axis < 0) {
+      // The ndims of any input into Pack op is its output ndims - 1.
+      *pack_axis += pack_input_rank;
+    }
+    if (*pack_axis < 0 || *pack_axis >= pack_input_rank) {
+      return errors::InvalidArgument(
+          "Pack node (", pack->name(),
+          ") axis attribute is out of bounds: ", pack->attr().at("axis").i());
+    }
+    *return_early = false;
+    return Status::OK();
+  }
+
+  Status GetSliceAxis(const NodeDef* node, const NodeDef* pack,
+                      const PartialTensorShape& pack_output_shape,
+                      int pack_axis, int* slice_start_value, bool* found) {
+    *found = false;
+    TF_RETURN_IF_ERROR(
+        CheckAttrsExist(*node, {"begin_mask", "end_mask", "ellipsis_mask",
+                                "new_axis_mask", "shrink_axis_mask"}));
+
+    const int begin_mask = node->attr().at("begin_mask").i();
+    const int end_mask = node->attr().at("end_mask").i();
+    const int ellipsis_mask = node->attr().at("ellipsis_mask").i();
+    const int new_axis_mask = node->attr().at("new_axis_mask").i();
+    const int shrink_axis_mask = node->attr().at("shrink_axis_mask").i();
+
+    // Check that the StridedSlice is one of these at pack_axis:
+    //   [..., i, ...]
+    //   [..., i:i+1, ...]
+    //   [..., :1, ...]
+    //   [..., -1:, ...]
+    ///  [..., s_{pack_axis}-1:, ...]
+    NodeDef* slice_begin;
+    NodeDef* slice_end;
+    NodeDef* slice_strides;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &slice_begin));
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(2), &slice_end));
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(3), &slice_strides));
+
+    for (const auto* n : {slice_begin, slice_end, slice_strides}) {
+      if (!IsReallyConstant(*n)) return Status::OK();
+    }
+
+    Tensor slice_begin_t;
+    Tensor slice_end_t;
+    Tensor slice_strides_t;
+
+    TF_RETURN_IF_ERROR(CheckAttrExists(*slice_begin, "value"));
+    if (!slice_begin_t.FromProto(slice_begin->attr().at("value").tensor())) {
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(CheckAttrExists(*slice_end, "value"));
+    if (!slice_end_t.FromProto(slice_end->attr().at("value").tensor())) {
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(CheckAttrExists(*slice_strides, "value"));
+    if (!slice_strides_t.FromProto(
+            slice_strides->attr().at("value").tensor())) {
+      return Status::OK();
+    }
+    TensorShape processing_shape;
+    TensorShape final_shape;
+    bool is_identity;
+    bool is_simple_slice;
+    bool slice_dim0;
+    gtl::InlinedVector<int64, 4> slice_begin_vec;
+    gtl::InlinedVector<int64, 4> slice_end_vec;
+    gtl::InlinedVector<int64, 4> slice_strides_vec;
+    TF_RETURN_IF_ERROR(ValidateStridedSliceOp(
+        &slice_begin_t, &slice_end_t, slice_strides_t, pack_output_shape,
+        begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask,
+        &processing_shape, &final_shape, &is_identity, &is_simple_slice,
+        &slice_dim0, &slice_begin_vec, &slice_end_vec, &slice_strides_vec));
+
+    if (!is_simple_slice) return Status::OK();
+
+    int begin_index = -1;
+    int64 begin_value = 0;
+    for (int i = 0; i < slice_begin_vec.size(); ++i) {
+      const int64 v = slice_begin_vec[i];
+      if (v != 0) {
+        if (begin_index != -1) {
+          // At least two start values that are nonzero.
+          return Status::OK();
+        }
+        begin_index = i;
+        begin_value = v;
+      }
+    }
+
+    int end_index = -1;
+    int64 end_value = 0;
+    for (int i = 0; i < slice_end_vec.size(); ++i) {
+      const int64 v = slice_end_vec[i];
+      if (v != pack_output_shape.dim_size(i)) {
+        if (end_index != -1) {
+          // At least two end values that are nonzero.
+          return Status::OK();
+        }
+        end_index = i;
+        end_value = v;
+      }
+    }
+
+    if (begin_index == -1 && end_index == -1) return Status::OK();
+    if (begin_index != -1 && end_index != -1 && begin_index != end_index) {
+      // Somehow received different axes for begin/end slicing
+      return Status::OK();
+    }
+    const int slice_axis = (begin_index == -1) ? end_index : begin_index;
+    if (slice_axis != pack_axis) {
+      // Not slicing on the same axis as the Pack op.
+      return Status::OK();
+    }
+    *slice_start_value = (begin_index == -1) ? 0 : begin_value;
+    const int64 slice_end_value =
+        (end_index == -1) ? pack_output_shape.dim_size(slice_axis) : end_value;
+    if (slice_end_value != *slice_start_value + 1) {
+      // Not slicing a single value out.
+      return Status::OK();
+    }
+
+    if (*slice_start_value < 0 || *slice_start_value >= pack->input_size()) {
+      return errors::InvalidArgument(
+          "Node ", node->name(), " requested invalid slice index ",
+          *slice_start_value, " on axis ", slice_axis,
+          " from tensor of shape: ", pack_output_shape.DebugString());
+    }
+
+    *found = true;  // slice_start_value is valid.
+    return Status::OK();
+  }
+
+  Status RewriteGraph(const NodeDef* node, const NodeDef* pack,
+                      int slice_start_value, int pack_axis,
+                      string* simplified_node_name) {
+    OpInfo::TensorProperties input_slice_properties;
+    NodeDef* input_slice;
+    TF_RETURN_IF_ERROR(
+        GetInputNode(pack->input(slice_start_value), &input_slice));
+    TF_RETURN_IF_ERROR(GetTensorProperties(pack->input(slice_start_value),
+                                           &input_slice_properties));
+    PartialTensorShape input_slice_shape(input_slice_properties.shape());
+
+    OpInfo::TensorProperties output_properties;
+    TF_RETURN_IF_ERROR(GetTensorProperties(
+        strings::StrCat(node->name(), ":", 0), &output_properties));
+    PartialTensorShape output_shape(output_properties.shape());
+    NodeDef* output =
+        AddEmptyNode(OptimizedNodeName(ParseNodeScopeAndName(node->name())));
+    if (input_slice_shape.IsCompatibleWith(output_shape)) {
+      output->set_op("Identity");
+      output->set_device(node->device());
+      SetDataTypeToAttr(output_properties.dtype(), "T", output);
+      output->add_input(input_slice->name());
+    } else {
+      NodeDef* axis = AddEmptyNode(
+          OptimizedNodeName(ParseNodeScopeAndName(node->name()), "Axis"));
+      axis->set_op("Const");
+      axis->set_device(node->device());
+      auto axis_attr = axis->mutable_attr();
+      SetDataTypeToAttr(DT_INT32, "dtype", axis);
+      auto* axis_t = (*axis_attr)["value"].mutable_tensor();
+      axis_t->set_dtype(DT_INT32);
+      axis_t->add_int_val(pack_axis);
+      AddToOptimizationQueue(axis);
+      output->set_op("ExpandDims");
+      output->set_device(node->device());
+      SetDataTypeToAttr(output_properties.dtype(), "T", output);
+      output->add_input(input_slice->name());
+      output->add_input(axis->name());
+    }
+
+    // Copy dependencies over.
+    ForwardControlDependencies(output, {node, pack});
+    AddToOptimizationQueue(output);
+    *simplified_node_name = output->name();
+
+    return Status::OK();
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -2930,10 +3243,10 @@ uint64 UniqueNodes::ComputeSignature(const NodeDef& node) const {
   h = Hash64Combine(Hash64(node.device()), h);
 
   for (const auto& input : node.input()) {
-    int pos;
-    const StringPiece node_name = ParseNodeNameAsStringPiece(input, &pos);
-    h = Hash64CombineUnordered(Hash64(node_name.data(), node_name.size()), h);
-    h = Hash64CombineUnordered(std::hash<int>()(pos), h);
+    const TensorId input_tensor = ParseTensorName(input);
+    h = Hash64CombineUnordered(
+        Hash64(input_tensor.node().data(), input_tensor.node().size()), h);
+    h = Hash64CombineUnordered(std::hash<int>()(input_tensor.index()), h);
   }
   for (const auto& attr : node.attr()) {
     h = Hash64CombineUnordered(Hash64(attr.first), h);
@@ -3076,7 +3389,8 @@ void ArithmeticOptimizer::DedupComputations() {
       }
       VLOG(3) << "Remove duplicated node: node=" << node->name()
               << " representative=" << rep->name();
-      const std::set<NodeDef*>& fanouts = node_map_->GetOutputs(node->name());
+      const std::set<NodeDef*>& tmp = node_map_->GetOutputs(node->name());
+      std::vector<NodeDef*> fanouts(tmp.begin(), tmp.end());
       for (NodeDef* fanout : fanouts) {
         for (int i = 0; i < fanout->input_size(); ++i) {
           string* fanout_input = fanout->mutable_input(i);
@@ -3132,7 +3446,7 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
 
   const GraphOptimizerContext ctx(&nodes_to_preserve_, optimized_graph_,
                                   graph_properties_.get(), node_map_.get(),
-                                  opt_level_);
+                                  &feed_nodes_, opt_level_);
   const ArithmeticOptimizerContext ctx_ext(&nodes_to_simplify);
 
   // Stop pipeline after first stage returning non-empty simplified tensor name.
@@ -3167,8 +3481,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<ReplaceMulWithSquare>(ctx, ctx_ext);
   if (options_.remove_logical_not)
     pipeline.AddStage<RemoveLogicalNotStage>(ctx, ctx_ext);
-  if (options_.reorder_cast_and_transpose)
-    pipeline.AddStage<ReorderCastAndTranspose>(ctx, ctx_ext);
+  if (options_.reorder_cast_like_and_value_preserving)
+    pipeline.AddStage<ReorderCastLikeAndValuePreserving>(ctx, ctx_ext);
   if (options_.simplify_aggregation)
     pipeline.AddStage<SimplifyAggregation>(ctx, ctx_ext);
   if (options_.hoist_cwise_unary_chains)
@@ -3186,11 +3500,14 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<ConvertExpm1Stage>(ctx, ctx_ext);
   if (options_.unary_ops_composition)
     pipeline.AddStage<UnaryOpsComposition>(ctx, ctx_ext);
+  if (options_.remove_stack_strided_slice_same_axis)
+    pipeline.AddStage<RemoveStackStridedSliceSameAxis>(ctx, ctx_ext);
 
   VLOG(1) << "Run " << pipeline.NumStages() << " arithmetic optimizer stages: "
           << str_util::Join(pipeline.StageNames(), ", ");
 
   while (!nodes_to_simplify.Empty()) {
+    GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
     NodeDef* node = nodes_to_simplify.PopBack();
 
     string simplified_tensor = "";
@@ -3244,18 +3561,22 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
-  *optimized_graph = item.graph;
-  GrapplerItem optimized_item(item, optimized_graph);
+  GrapplerItem optimized_item(item);
   optimized_graph_ = &optimized_item.graph;
   node_map_.reset(new NodeMap(optimized_graph_));
 
+  for (const auto& feed : item.feed) {
+    feed_nodes_.insert(NodeName(feed.first));
+  }
+
   // Disable restricted graph rewrites.
   options_.unary_ops_composition &=
-      item.allowed_optimizations.non_differentiable_rewrites;
+      item.allowed_optimizations().non_differentiable_rewrites;
 
   if (options_.dedup_computations) {
     DedupComputations();
   }
+  GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
 
   // Perform topological sort on the graph in order to help AddOpsRewrite to
   // optimize larger subgraphs starting from the roots with more inputs.
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index d457eb6d21ef969042634351db4b4147ea05fe37..e1395d75426314afe049be3bc3bd68e3126d4915 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -72,13 +73,14 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
     bool remove_redundant_reshape = true;
-    bool reorder_cast_and_transpose = true;
+    bool reorder_cast_like_and_value_preserving = true;
     bool replace_mul_with_square = true;
     bool simplify_aggregation = true;
     bool convert_pow = true;
     bool convert_log1p = true;
     bool convert_expm1 = true;
     bool unary_ops_composition = true;
+    bool remove_stack_strided_slice_same_axis = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
@@ -128,6 +130,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
   std::unique_ptr<NodeMap> node_map_;
   std::unique_ptr<GraphProperties> graph_properties_;
   GraphDef* optimized_graph_ = nullptr;  // Not owned.
+  gtl::FlatSet<string> feed_nodes_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 77f3c64c659a8a0ffd7ccd1b9b3b44c42b74ab0d..35d22898f6c15afd63df8b6136fad1f346172cd5 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -158,7 +159,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.remove_redundant_reshape = false;
     options.remove_negation = false;
     options.remove_logical_not = false;
-    options.reorder_cast_and_transpose = false;
+    options.reorder_cast_like_and_value_preserving = false;
     options.replace_mul_with_square = false;
     options.simplify_aggregation = false;
     options.unary_ops_composition = false;
@@ -231,7 +232,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
 
   void EnableOnlyReorderCastAndTranspose(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
-    optimizer->options_.reorder_cast_and_transpose = true;
+    optimizer->options_.reorder_cast_like_and_value_preserving = true;
   }
 
   void EnableOnlyReplaceMulWithSquare(ArithmeticOptimizer* optimizer) {
@@ -288,6 +289,12 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     DisableAllStages(optimizer);
     optimizer->options_.unary_ops_composition = true;
   }
+
+  void EnableOnlyRemoveStackStridedSliceSameAxis(
+      ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_stack_strided_slice_same_axis = true;
+  }
 };
 
 TEST_F(ArithmeticOptimizerTest, NoOp) {
@@ -1314,8 +1321,8 @@ TEST_F(ArithmeticOptimizerTest, RemoveRedundantReshape_CombineReshapes) {
   test::ExpectTensorEqual<int8>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/gpu:0");
+TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast_ProducerIsCast) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/CPU:0");
   Output nhwc_uint8 =
       ops::Placeholder(s, DT_UINT8, ops::Placeholder::Shape({8, 28, 28, 3}));
   Output nhwc_fp32 = ops::Cast(s, nhwc_uint8, DT_FLOAT);
@@ -1327,11 +1334,14 @@ TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast) {
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+  auto input_t = GenerateRandomTensor<DT_UINT8>(TensorShape({8, 28, 28, 3}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", input_t}});
+  EXPECT_EQ(1, tensors_expected.size());
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  OptimizeAndPrune(&optimizer, &item, &output);
 
   const NodeDef* transpose_node = nullptr;
   for (const NodeDef& node : output.node()) {
@@ -1348,36 +1358,210 @@ TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast) {
       EXPECT_EQ(NodeName(node.input(0)), transpose_node->name());
     }
   }
+
+  auto tensors =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", input_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(ArithmeticOptimizerTest, NoReorderTransposeCast) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/gpu:0");
+TEST_F(ArithmeticOptimizerTest, ReorderS2DCast_ProducerIsCast) {
+  // TODO(jingyue): Evaluate S2D+Cast on GPU as well. We can't simply put nodes
+  // under a /GPU:0 scope, because this test would fail if the testing machine
+  // doesn't have a GPU. Maybe EvaluateNodes should allow soft placement?
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/CPU:0");
+  Output outputs =
+      ops::Placeholder(s, DT_UINT8, ops::Placeholder::Shape({8, 28, 28, 3}));
+  outputs = ops::Cast(s, outputs, DT_FLOAT);
+  outputs = ops::SpaceToDepth(s, outputs, 2);
+  outputs = ops::Identity(s.WithOpName("outputs"), outputs);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  auto input_t = GenerateRandomTensor<DT_UINT8>(TensorShape({8, 28, 28, 3}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", input_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  const NodeDef* s2d_node = nullptr;
+  for (const NodeDef& node : output.node()) {
+    if (node.op() == "SpaceToDepth") {
+      EXPECT_EQ(s2d_node, nullptr);
+      EXPECT_EQ(DT_UINT8, node.attr().at("T").type());
+      s2d_node = &node;
+    }
+  }
+  EXPECT_NE(s2d_node, nullptr);
+
+  for (const NodeDef& node : output.node()) {
+    if (node.op() == "Cast") {
+      EXPECT_EQ(NodeName(node.input(0)), s2d_node->name());
+    }
+  }
+
+  auto tensors =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", input_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast_ProducerIsTranspose) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/CPU:0");
   Output nhwc_fp32 =
       ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({8, 28, 28, 3}));
-  Output nhwc_uint8 = ops::Cast(s, nhwc_fp32, DT_UINT8);
-  Output nchw_uint8 =
-      ops::Transpose(s, nhwc_uint8, ops::Const(s, {0, 3, 1, 2}, {4}));
+  Output nchw_fp32 =
+      ops::Transpose(s, nhwc_fp32, ops::Const(s, {0, 3, 1, 2}, {4}));
+  Output nchw_uint8 = ops::Cast(s, nchw_fp32, DT_UINT8);
   Output outputs = ops::Identity(s.WithOpName("outputs"), nchw_uint8);
 
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  auto input_t =
+      GenerateConstantTensor<DT_FLOAT>(TensorShape({8, 28, 28, 3}), 42.0f);
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", input_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  OptimizeAndPrune(&optimizer, &item, &output);
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  const NodeDef* cast_node = nullptr;
+  for (const NodeDef& node : output.node()) {
+    if (node.op() == "Cast") {
+      EXPECT_EQ(cast_node, nullptr);
+      cast_node = &node;
+      EXPECT_EQ(NodeName(node.input(0)), "Placeholder");
+    }
+  }
+  EXPECT_NE(cast_node, nullptr);
+
+  for (const NodeDef& node : output.node()) {
+    if (node.op() == "Transpose") {
+      EXPECT_EQ(DT_UINT8, node.attr().at("T").type());
+      EXPECT_EQ(NodeName(node.input(0)), cast_node->name());
+    }
+  }
+
+  auto tensors =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", input_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<uint8>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(ArithmeticOptimizerTest, ReorderTransposeReverseCast) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/CPU:0");
+  Output nhwc_uint8 =
+      ops::Placeholder(s, DT_UINT8, ops::Placeholder::Shape({8, 28, 28, 3}));
+  Output nhwc_fp32 = ops::Cast(s, nhwc_uint8, DT_FLOAT);
+  Output nhwc_fp32_reversed =
+      ops::Reverse(s, nhwc_fp32, ops::Const(s, {0}, {1}));
+  Output nchw_fp32_reversed =
+      ops::Transpose(s, nhwc_fp32_reversed, ops::Const(s, {0, 3, 1, 2}, {4}));
+
+  Output outputs = ops::Identity(s.WithOpName("outputs"), nchw_fp32_reversed);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  int num_transposes = 0;
+  auto input_t = GenerateRandomTensor<DT_UINT8>(TensorShape({8, 28, 28, 3}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", input_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  const NodeDef* reverse_node = nullptr;
+  const NodeDef* transpose_node = nullptr;
+  const NodeDef* cast_node = nullptr;
   for (const NodeDef& node : output.node()) {
     if (node.op() == "Transpose") {
+      EXPECT_EQ(transpose_node, nullptr);
+      EXPECT_EQ(DT_UINT8, node.attr().at("T").type());
+      transpose_node = &node;
+    } else if (node.op() == "ReverseV2") {
+      EXPECT_EQ(reverse_node, nullptr);
       EXPECT_EQ(DT_UINT8, node.attr().at("T").type());
-      EXPECT_EQ(node.input(0), "Cast");
-      ++num_transposes;
+      reverse_node = &node;
+    } else if (node.op() == "Cast") {
+      cast_node = &node;
     }
   }
-  EXPECT_EQ(1, num_transposes);
+  EXPECT_NE(cast_node, nullptr);
+  EXPECT_NE(reverse_node, nullptr);
+  EXPECT_NE(transpose_node, nullptr);
+  EXPECT_EQ(NodeName(reverse_node->input(0)), "Placeholder");
+  EXPECT_EQ(NodeName(transpose_node->input(0)), reverse_node->name());
+  EXPECT_EQ(NodeName(cast_node->input(0)), transpose_node->name());
+
+  auto tensors =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", input_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast_CheckNumericsToIdentity) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/CPU:0");
+  Output nhwc_uint8 =
+      ops::Placeholder(s, DT_UINT8, ops::Placeholder::Shape({8, 28, 28, 3}));
+  Output nhwc_fp32 = ops::Cast(s, nhwc_uint8, DT_FLOAT);
+  Output nchw_fp32 = ops::CheckNumerics(s, nhwc_fp32, "foo");
+  Output outputs = ops::Identity(s.WithOpName("outputs"), nchw_fp32);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+  CompareGraphs(item.graph, output);
+}
+
+TEST_F(ArithmeticOptimizerTest, NoReorderTransposeCast_ProducerIsCast) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/CPU:0");
+  Output nhwc_fp32 =
+      ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({8, 28, 28, 3}));
+  Output nhwc_uint8 = ops::Cast(s, nhwc_fp32, DT_UINT8);
+  Output nchw_uint8 =
+      ops::Transpose(s, nhwc_uint8, ops::Const(s, {0, 3, 1, 2}, {4}));
+  Output outputs = ops::Identity(s.WithOpName("outputs"), nchw_uint8);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+  CompareGraphs(item.graph, output);
+}
+
+TEST_F(ArithmeticOptimizerTest, NoReorderTransposeCast_ProducerIsTranspose) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/CPU:0");
+  Output nhwc_uint8 =
+      ops::Placeholder(s, DT_UINT8, ops::Placeholder::Shape({8, 28, 28, 3}));
+  Output nchw_uint8 =
+      ops::Transpose(s, nhwc_uint8, ops::Const(s, {0, 3, 1, 2}, {4}));
+  Output nchw_fp32 = ops::Cast(s, nchw_uint8, DT_FLOAT);
+  Output outputs = ops::Identity(s.WithOpName("outputs"), nchw_fp32);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+  CompareGraphs(item.graph, output);
 }
 
 TEST_F(ArithmeticOptimizerTest, RemoveIdentityTransposes) {
@@ -1677,7 +1861,7 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   //   Conv2D(Transpose(Cast(I)), W*S)
   //     =>
   //   Conv2D(Cast(Transpose(I)), W*S)
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/gpu:0");
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/cpu:0");
 
   Output inputs =
       ops::Placeholder(s, DT_UINT8, ops::Placeholder::Shape({8, 28, 28, 3}));
@@ -1698,11 +1882,10 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   GraphDef output;
   ArithmeticOptimizer optimizer;  // all optimization stages are on
   OptimizeTwiceAndPrune(&optimizer, &item, &output, /*const_folding=*/true);
-
   NodeMap node_map(&output);
 
   // Expected names for reordered cast and transpose.
-  const string p = "ArithmeticOptimizer/ReorderCastAndTranspose_";
+  const string p = "ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_";
   const string optimized_cast_name = strings::StrCat(p, "float_Cast");
   const string optimized_transpose_name = strings::StrCat(p, "uint8_Transpose");
 
@@ -1733,7 +1916,7 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
 TEST_F(ArithmeticOptimizerTest, OptimizeMultipleMulTransposeConv) {
   // This unit test exercises optimization of folding mul into conv for
   // multiple nodes in the graph.
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/gpu:0");
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/cpu:0");
 
   GrapplerItem item;
   Output conv[2];
@@ -2464,6 +2647,48 @@ TEST_F(ArithmeticOptimizerTest, ConvertSqrtDivToRsqrtMul) {
   }
 }
 
+TEST_F(ArithmeticOptimizerTest, DoNotConvertSqrtDivToRsqrtMulDivisorFetchNode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output floats = ops::Const(s.WithOpName("floats"),
+                             {0.7423212f, 0.19757693f, 0.53124744f}, {1, 3});
+  Output output0 = ops::Sqrt(s.WithOpName("output0"), floats);
+  Output const1 = ops::Const(s.WithOpName("const1"), 1.0f, {3});
+  Output mul1 = ops::Multiply(s.WithOpName("mul1"), const1, 0.5f);
+  Output grad = ops::Div(s.WithOpName("grad"), mul1, output0);
+
+  GrapplerItem item;
+  item.fetch = {"grad", "output0"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(2, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlySqrtDivToRsqrtMul(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  auto tensors = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(2, tensors.size());
+
+  for (int i = 0; i < tensors.size(); i++) {
+    EXPECT_EQ(tensors[i].NumElements(), tensors_expected[i].NumElements());
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
+  }
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "grad") {
+      EXPECT_EQ("Div", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("mul1", node.input(0));
+      EXPECT_EQ("output0", node.input(1));
+    } else if (node.name() == "output0") {
+      EXPECT_EQ("Sqrt", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("floats", node.input(0));
+    }
+  }
+}
+
 TEST_F(ArithmeticOptimizerTest, ConvertPow) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
@@ -3364,5 +3589,236 @@ TEST_F(ArithmeticOptimizerTest, UnaryOpsComposition) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
+TEST_F(ArithmeticOptimizerTest, RemoveStackStridedSliceSameAxis) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto a_in =
+      ops::Const(s.WithOpName("a_in"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  auto b_in =
+      ops::Const(s.WithOpName("b_in"), {-1.0f, -2.0f, -3.0f, -4.0f}, {2, 2});
+  auto c_in =
+      ops::Const(s.WithOpName("c_in"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
+  auto a = ops::PlaceholderWithDefault(s.WithOpName("a"), a_in,
+                                       PartialTensorShape({-1, -1}));
+  auto b = ops::PlaceholderWithDefault(s.WithOpName("b"), b_in,
+                                       PartialTensorShape({-1, -1}));
+  auto c = ops::PlaceholderWithDefault(s.WithOpName("c"), c_in,
+                                       PartialTensorShape({-1, -1}));
+  // stacked = tf.stack((a, b, c), axis=1).
+  // stacked.shape == [2, 3, 2] (a, b, c are stacked along new axis 1)
+  auto stacked =
+      ops::Stack(s.WithOpName("stacked"), {a.output, b.output, c.output},
+                 ops::Stack::Axis(1));
+  auto expanded_a = ops::ExpandDims(s.WithOpName("expanded_a"), a, {1});
+  auto expanded_b = ops::ExpandDims(s.WithOpName("expanded_b"), b, {1});
+  auto expanded_c = ops::ExpandDims(s.WithOpName("expanded_c"), c, {1});
+  auto begin_a = ops::Const(s.WithOpName("begin_a"), {0, 0, 0}, {3});
+  auto end_a = ops::Const(s.WithOpName("end_a"), {0, 1, 0}, {3});
+  auto begin_b = ops::Const(s.WithOpName("begin_b"), {0, 1, 0}, {3});
+  auto end_b = ops::Const(s.WithOpName("end_b"), {0, 2, 0}, {3});
+  auto begin_c = ops::Const(s.WithOpName("begin_c"), {0, 2, 0}, {3});
+  auto end_c = ops::Const(s.WithOpName("end_c"), {0, 3, 0}, {3});
+  auto end_c_1to = ops::Const(s.WithOpName("begin_c_2to"), {0, 0, 0}, {3});
+  auto strides = ops::Const(s.WithOpName("strides"), {1, 1, 1}, {3});
+
+  // stacked[:, 0]
+  using SS = ops::StridedSlice;
+  auto pa_slice = ops::Identity(
+      s.WithOpName("pa_slice_out"),
+      SS(s.WithOpName("pa_slice"), stacked, begin_a, end_a, strides,
+         SS::BeginMask(0b0101)  // 5
+             .EllipsisMask(0)
+             .EndMask(0b0101)  // 5
+             .NewAxisMask(0)
+             .ShrinkAxisMask(0b0010)));  // 2
+
+  // stacked[:, 1]
+  auto pb_slice = ops::Identity(
+      s.WithOpName("pb_slice_out"),
+      SS(s.WithOpName("pb_slice"), stacked, begin_b, end_b, strides,
+         SS::BeginMask(0b0101)  // 5
+             .EllipsisMask(0)
+             .EndMask(0b0101)  // 5
+             .NewAxisMask(0)
+             .ShrinkAxisMask(0b0010)));  // 2
+
+  // stacked[:, 2]
+  auto pc_slice = ops::Identity(
+      s.WithOpName("pc_slice_out"),
+      SS(s.WithOpName("pc_slice"), stacked, begin_c, end_c, strides,
+         SS::BeginMask(0b0101)  // 5
+             .EllipsisMask(0)
+             .EndMask(0b0101)  // 5
+             .NewAxisMask(0)
+             .ShrinkAxisMask(0b0010)));  // 2
+
+  // stacked[:, 0:1, :]
+  auto pa_slice_01 = ops::Identity(
+      s.WithOpName("pa_slice_01_out"),
+      SS(s.WithOpName("pa_slice_01"), stacked, begin_a, end_a, strides,
+         SS::BeginMask(0b0101)  // 5
+             .EllipsisMask(0)
+             .EndMask(0b0101)  // 5
+             .NewAxisMask(0)
+             .ShrinkAxisMask(0)));
+
+  // stacked[:, :1, :]
+  auto pa_slice_to1 = ops::Identity(
+      s.WithOpName("pa_slice_to1_out"),
+      SS(s.WithOpName("pa_slice_to1"), stacked, begin_a, end_a, strides,
+         SS::BeginMask(0b0111)  // 7
+             .EllipsisMask(0)
+             .EndMask(0b0101)  // 5
+             .NewAxisMask(0)
+             .ShrinkAxisMask(0)));
+
+  // stacked[:, 1:2, :]
+  auto pb_slice_12 = ops::Identity(
+      s.WithOpName("pb_slice_12_out"),
+      SS(s.WithOpName("pb_slice_12"), stacked, begin_b, end_b, strides,
+         SS::BeginMask(0b0101)  // 5
+             .EllipsisMask(0)
+             .EndMask(0b0101)  // 5
+             .NewAxisMask(0)
+             .ShrinkAxisMask(0)));
+
+  // stacked[:, 2:, :].
+  auto pc_slice_2to = ops::Identity(
+      s.WithOpName("pc_slice_2to_out"),
+      SS(s.WithOpName("pc_slice_2to"), stacked, begin_c, end_c_1to, strides,
+         SS::BeginMask(0b0101)  // 5
+             .EllipsisMask(0)
+             .EndMask(0b0111)  // 7
+             .NewAxisMask(0)
+             .ShrinkAxisMask(0)));
+
+  GrapplerItem item;
+  item.fetch = {"a",
+                "b",
+                "c",
+                "pa_slice_out",
+                "pb_slice_out",
+                "pc_slice_out",
+                "expanded_a",
+                "expanded_b",
+                "expanded_c",
+                "pa_slice_01_out",
+                "pa_slice_to1_out",
+                "pb_slice_12_out",
+                "pc_slice_2to_out"};
+  enum FetchItem {
+    fA,
+    fB,
+    fC,
+    fASliceOut,
+    fBSliceOut,
+    fCSliceOut,
+    fExpandedA,
+    fExpandedB,
+    fExpandedC,
+    fASlice01Out,
+    fASliceTo1Out,
+    fBSlice12Out,
+    fCSlice2ToOut,
+  };
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+
+  // stacked[:, 0, :] == a.
+  test::ExpectTensorEqual<float>(tensors_expected[fA],
+                                 tensors_expected[fASliceOut]);
+  // stacked[:, 1, :] == b.
+  test::ExpectTensorEqual<float>(tensors_expected[fB],
+                                 tensors_expected[fBSliceOut]);
+  // stacked[:, 2, :] == c.
+  test::ExpectTensorEqual<float>(tensors_expected[fC],
+                                 tensors_expected[fCSliceOut]);
+
+  // stacked[:, 0:1, :] == expand_dims(a, 1).
+  test::ExpectTensorEqual<float>(tensors_expected[fExpandedA],
+                                 tensors_expected[fASlice01Out]);
+
+  // stacked[:, :1, :] == expand_dims(a, 1).
+  test::ExpectTensorEqual<float>(tensors_expected[fExpandedA],
+                                 tensors_expected[fASliceTo1Out]);
+
+  // stacked[:, 1:2, :] == expand_dims(b, 1).
+  test::ExpectTensorEqual<float>(tensors_expected[fExpandedB],
+                                 tensors_expected[fBSlice12Out]);
+  // stacked[:, 2:, :] == expand_dims(c, 1).
+  test::ExpectTensorEqual<float>(tensors_expected[fExpandedC],
+                                 tensors_expected[fCSlice2ToOut]);
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveStackStridedSliceSameAxis(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  for (const auto& node : output.node()) {
+    if (node.name() == "pa_slice_out") {
+      EXPECT_EQ(node.input(0), "a");
+    } else if (node.name() == "pb_slice_out") {
+      EXPECT_EQ(node.input(0), "b");
+    } else if (node.name() == "pc_slice_out") {
+      EXPECT_EQ(node.input(0), "c");
+    } else if (str_util::EndsWith(node.name(), "_out")) {
+      EXPECT_EQ(strings::StrCat(node.input(0), "_out"),
+                strings::StrCat(
+                    "ArithmeticOptimizer/RemoveStackStridedSliceSameAxis_",
+                    node.name()));
+    }
+  }
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+
+  // stacked[:, 0, :] == a.
+  test::ExpectTensorEqual<float>(tensors_expected[fA], tensors[fASliceOut]);
+
+  // stacked[:, 1, :] == b.
+  test::ExpectTensorEqual<float>(tensors_expected[fB], tensors[fBSliceOut]);
+  // stacked[:, 2, :] == c.
+  test::ExpectTensorEqual<float>(tensors_expected[fC], tensors[fCSliceOut]);
+
+  // stacked[:, 0:1, :] == expand_dims(a, 1).
+  test::ExpectTensorEqual<float>(tensors_expected[fExpandedA],
+                                 tensors[fASlice01Out]);
+
+  // stacked[:, :1, :] == expand_dims(a, 1).
+  test::ExpectTensorEqual<float>(tensors_expected[fExpandedA],
+                                 tensors[fASliceTo1Out]);
+
+  // stacked[:, 1:2, :] == expand_dims(b, 1).
+  test::ExpectTensorEqual<float>(tensors_expected[fExpandedB],
+                                 tensors[fBSlice12Out]);
+  // stacked[:, 2:, :] == expand_dims(c, 1).
+  test::ExpectTensorEqual<float>(tensors_expected[fExpandedC],
+                                 tensors[fCSlice2ToOut]);
+}
+
+TEST_F(ArithmeticOptimizerTest, SimplifyAggregationBFloat16) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  Output cast = ops::Cast(s.WithOpName("cast"), x, DT_BFLOAT16);
+  Output add = ops::AddN(s.WithOpName("add"), {cast, cast});
+  Output id = ops::Identity(s.WithOpName("id"), add);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlySimplifyAggregation(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // Extra node created for multiplier.
+  EXPECT_EQ(5, output.node_size());
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bfloat16>(tensors_expected[0], tensors[0]);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index ca5d3a6dfdaafe29b3ac013b06a8743251f14b40..5e3e5d6af9a7dd435a15f83e94434de0c25ed7aa 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -157,6 +157,16 @@ bool GetConcatAxis(const GraphProperties& properties, NodeDef* node,
   return true;
 }
 
+bool HasTPUAttributes(const NodeDef& node) {
+  AttrSlice attrs(node);
+  for (auto attr : attrs) {
+    if (attr.first.find("_tpu_") != attr.first.npos) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace
 
 ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level,
@@ -187,7 +197,7 @@ string ConstantFolding::AddControlDependency(const string& input_name,
     // switch node, and use it to anchor the control dependency.
     auto outputs = node_map->GetOutputs(node->name());
     for (const NodeDef* output : outputs) {
-      if (IsIdentity(*output)) {
+      if (IsIdentity(*output) || IsIdentityNSingleInput(*output)) {
         if (IsSameInput(node->input(0), input_name)) {
           return AsControlDependency(*output);
         }
@@ -349,6 +359,9 @@ Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
       if (IsReallyConstant(*array_size)) {
         // Don't materialize 0 sizes to avoid triggering incorrect static
         // checks. A 0 sized array that can't grow isn't useful anyway.
+        if (array_size->attr().count("value") == 0) {
+          continue;
+        }
         const TensorProto& raw_val = array_size->attr().at("value").tensor();
         if (raw_val.dtype() != DT_INT32) {
           continue;
@@ -454,6 +467,9 @@ bool ExtractShape(const NodeDef& shape_node, const GraphProperties& properties,
       *min_id = std::min<int64>(*min_id, dim.size());
     }
   } else {
+    if (shape_node.attr().count("value") == 0) {
+      return false;
+    }
     const TensorProto& raw_val = shape_node.attr().at("value").tensor();
     if (raw_val.dtype() != DT_INT64 && raw_val.dtype() != DT_INT32) {
       return false;
@@ -552,6 +568,7 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
   reduce_dims[0] = bcast.grad_x_reduce_idx();
   reduce_dims[1] = bcast.grad_y_reduce_idx();
 
+  TF_RETURN_IF_ERROR(CheckAttrExists(node, "T"));
   const DataType type = node.attr().at("T").type();
   NodeDef* out[2];
   for (int j = 0; j < 2; ++j) {
@@ -616,28 +633,37 @@ Status ConstantFolding::MaterializeReductionIndices(
     // We can't do anything if we don't know the rank of the input.
     return Status::OK();
   }
-  const int rank = input_prop.shape().dim_size();
-  if (rank == 0) {
+  const int input_rank = input_prop.shape().dim_size();
+  if (input_rank < 1) {
     // Unexpected graph, don't try to change it.
     return Status::OK();
   }
+  const OpInfo::TensorProperties& reduction_indices_prop = input_props[1];
+  DataType dtype = reduction_indices_prop.dtype();
+  if (dtype != DT_INT32 && dtype != DT_INT64) {
+    return Status::OK();
+  }
+  PartialTensorShape reduction_indices_shape(reduction_indices_prop.shape());
+  const int num_reduction_indices = reduction_indices_shape.num_elements();
+
   const std::vector<OpInfo::TensorProperties>& output_props =
       properties.GetOutputProperties(node->name());
   if (output_props.size() != 1) {
     return Status::OK();
   }
-  const bool keep_dims =
-      node->attr().count("keep_dims") && node->attr().at("keep_dims").b();
   const OpInfo::TensorProperties& output_prop = output_props[0];
-  PartialTensorShape output_shape(output_prop.shape());
-  if (output_shape.num_elements() != 1) {
-    bool full_reduction = false;
+  const int output_rank =
+      output_prop.shape().unknown_rank() ? -1 : output_prop.shape().dim_size();
+
+  bool full_reduction = output_rank == 0 || num_reduction_indices == input_rank;
+  if (!full_reduction) {
+    // A full reduction will generate a tensor of one of the shapes
+    // [], [1], [1, 1], [1, 1, ...]. Even if we do not know the number of
+    // elements in the output of the reduction, we may deduce it from reshape
+    // nodes following it.
     for (const NodeDef* fanout : node_map_->GetOutputs(node->name())) {
-      if (!IsReshape(*fanout) && !keep_dims) {
-        // Depending on how it's setup, a full reduction will generate a tensor
-        // of shape [], [1], [1, 1], [1, 1, ...]. If keep_dims isn't true, we
-        // rely on the existence of a reshape node following the reduction to
-        // ensure that the fanout is fed a scalar of the right shape.
+      full_reduction = false;
+      if (!IsReshape(*fanout)) {
         return Status::OK();
       }
       const std::vector<OpInfo::TensorProperties>& reshape_props =
@@ -658,20 +684,15 @@ Status ConstantFolding::MaterializeReductionIndices(
     }
   }
 
-  const OpInfo::TensorProperties& reduction_prop = input_props[1];
-  DataType dtype = reduction_prop.dtype();
-  if (dtype != DT_INT32 && dtype != DT_INT64) {
-    return Status::OK();
-  }
-  // We know it's a full reduction. We can generate the set of indices to
-  // reduce.
+  // We know it's a full reduction. We can generate the full set of indices to
+  // reduce as a constant node.
   string const_name = OptimizedNodeName(*node, "-reduction_indices");
   if (node_map_->GetNode(const_name)) {
     return Status::OK();
   }
   NodeDef* reduction_indices = graph_->add_node();
-  Tensor value(dtype, TensorShape({rank}));
-  for (int i = 0; i < rank; ++i) {
+  Tensor value(dtype, TensorShape({input_rank}));
+  for (int i = 0; i < input_rank; ++i) {
     if (dtype == DT_INT32) {
       value.vec<int32>()(i) = i;
     } else {
@@ -680,6 +701,7 @@ Status ConstantFolding::MaterializeReductionIndices(
   }
   TF_RETURN_IF_ERROR(
       CreateNodeDef(const_name, TensorValue(&value), reduction_indices));
+
   reduction_indices->set_device(node->device());
   string ctrl_dep =
       AddControlDependency(node->input(1), graph_, node_map_.get());
@@ -719,6 +741,12 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
       nodes_whitelist_.find(node.name()) == nodes_whitelist_.end()) {
     return false;
   }
+  // `FakeParam` op is used as a placeholder in If branch function. It doesn't
+  // have a valid output when executed.
+  if (IsFakeParam(node)) {
+    return false;
+  }
+
   // Skip control flow nodes, they can't be folded.
   if (ModifiesFrameInfo(node)) {
     return false;
@@ -746,6 +774,13 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     return false;
   }
 
+  // Don't fold nodes that contain TPU attributes.
+  // TODO(rmlarsen): We should be able to fold many of these nodes as long as we
+  // properly forward custom attributes, b/119051778.
+  if (HasTPUAttributes(node)) {
+    return false;
+  }
+
   const OpDef* op_def = nullptr;
   Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
   if (!status.ok()) {
@@ -785,7 +820,8 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     if (is_const) {
       // Don't fold strings constants for now since this causes problems with
       // checkpointing.
-      if (input_node->attr().at("dtype").type() == DT_STRING) {
+      if (input_node->attr().count("dtype") == 0 ||
+          input_node->attr().at("dtype").type() == DT_STRING) {
         return false;
       }
       // Special case: If a Merge node has at least one constant input that
@@ -953,7 +989,8 @@ Status ConstantFolding::EvaluateNode(const NodeDef& node,
 }
 
 Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
-                                            std::vector<NodeDef>* outputs) {
+                                            std::vector<NodeDef>* outputs,
+                                            bool* result_too_large) {
   TensorVector inputs;
   TensorVector output_tensors;
   auto inputs_cleanup = gtl::MakeCleanup([&inputs, &output_tensors] {
@@ -968,9 +1005,8 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
   });
 
   for (const auto& input : node.input()) {
-    int port = 0;
-    ParseNodeNameAsStringPiece(input, &port);
-    if (port < 0) {
+    const TensorId input_tensor = ParseTensorName(input);
+    if (input_tensor.index() < 0) {
       // Control dependency
       break;
     }
@@ -980,6 +1016,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
                     strings::StrCat("Can't fold ", node.name(), ", its ", input,
                                     " isn't constant"));
     }
+    TF_RETURN_IF_ERROR(CheckAttrExists(*input_node, "value"));
     const TensorProto& raw_val = input_node->attr().at("value").tensor();
     Tensor* value = new Tensor(raw_val.dtype(), raw_val.tensor_shape());
     CHECK(value->FromProto(raw_val));
@@ -998,8 +1035,11 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
       node_name = strings::StrCat(node_name, "-", i);
     }
     if (output_tensors[i].tensor) {
-      TF_RETURN_IF_ERROR(
-          CreateNodeDef(node_name, output_tensors[i], &outputs->at(i)));
+      Status s = CreateNodeDef(node_name, output_tensors[i], &outputs->at(i));
+      if (!s.ok()) {
+        *result_too_large = true;
+        return s;
+      }
     } else {
       // Create an empty NodeDef to identify dead outputs (e.g. the output of a
       // switch that's not selected by the switch predicate).
@@ -1009,7 +1049,8 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
   return Status::OK();
 }
 
-Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
+Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph,
+                                 bool* result_too_large) {
   if (IsMerge(*node)) {
     // Merge nodes are special, in the sense that they execute as soon as one of
     // their input is ready. We can therefore fold a merge node iff it has at
@@ -1102,10 +1143,14 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
   }
 
   std::vector<NodeDef> const_nodes;
-  TF_RETURN_IF_ERROR(EvaluateOneFoldable(*node, &const_nodes));
+  TF_RETURN_IF_ERROR(
+      EvaluateOneFoldable(*node, &const_nodes, result_too_large));
+  VLOG(1) << "Folded node:\n" << node->DebugString();
+
   NodeDef* constant_output = nullptr;
   for (int i = 0; i < const_nodes.size(); i++) {
     NodeDef* const_node = &const_nodes[i];
+    VLOG(1) << "Generated constant node:\n" << const_node->DebugString();
     if (const_node->name().empty()) {
       // Dead output: we can't create a constant to encode its value, so we'll
       // just skip it. We'll preserve the edges that originate from that
@@ -1207,7 +1252,8 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
   return Status::OK();
 }
 
-Status ConstantFolding::FoldGraph(GraphDef* output) {
+Status ConstantFolding::FoldGraph(
+    GraphDef* output, absl::flat_hash_set<string>* nodes_to_not_simplify) {
   std::unordered_set<string> processed_nodes;
   std::deque<NodeDef*> queue;
   for (int i = 0; i < graph_->node_size(); i++) {
@@ -1230,11 +1276,15 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
                 return n1->name() < n2->name();
               });
 
-    Status s = FoldNode(node, output);
+    bool result_too_large = false;
+    Status s = FoldNode(node, output, &result_too_large);
     processed_nodes.insert(node->name());
     if (!s.ok()) {
       VLOG(1) << "Failed to fold node " << node->DebugString()
               << "\nError message: " << s;
+      if (result_too_large) {
+        nodes_to_not_simplify->emplace(node->name());
+      }
     } else {
       for (auto& output : fanout) {
         if (IsFoldable(*output)) {
@@ -1266,64 +1316,6 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
   return Status::OK();
 }
 
-// Returns true iff this reduction can be reduced to an identity (i.e if the set
-// of dimensions to reduce along is empty). This happens often in the gradient
-// graphs.
-bool ConstantFolding::IsSimplifiableReduction(
-    const NodeDef& node, const GraphProperties& properties) const {
-  if (IsReduction(node)) {
-    CHECK_LE(2, node.input_size());
-    const NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
-    if (IsReallyConstant(*reductions_indices)) {
-      TensorVector output;
-      auto outputs_cleanup = gtl::MakeCleanup([&output] {
-        for (const auto& out : output) {
-          delete out.tensor;
-        }
-      });
-      Status s = EvaluateNode(*reductions_indices, TensorVector(), &output);
-      if (!s.ok()) {
-        return false;
-      }
-      CHECK_EQ(1, output.size());
-      int output_size = output[0]->NumElements();
-      if (output_size == 0) {
-        return true;
-      }
-      if (node.attr().count("keep_dims") > 0 &&
-          node.attr().at("keep_dims").b()) {
-        const auto& props = properties.GetInputProperties(node.name());
-        if (!props.empty()) {
-          const TensorShapeProto& input_shape = props[0].shape();
-          if (!input_shape.unknown_rank()) {
-            bool simplifiable = true;
-            for (int i = 0; i < output[0]->NumElements(); ++i) {
-              int64 dim;
-              if (output[0]->dtype() == DT_INT32) {
-                dim = output[0]->flat<int32>()(i);
-              } else {
-                dim = output[0]->flat<int64>()(i);
-              }
-              if (dim < 0) {
-                dim += input_shape.dim_size();
-              }
-              if (dim < 0 || dim >= input_shape.dim_size() ||
-                  input_shape.dim(dim).size() != 1) {
-                simplifiable = false;
-                break;
-              }
-            }
-            if (simplifiable) {
-              return true;
-            }
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
-
 bool ConstantFolding::IsSimplifiableReshape(
     const NodeDef& node, const GraphProperties& properties) const {
   if (!IsReshape(node)) {
@@ -1393,16 +1385,13 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
   if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
     return false;
   }
-  if (node.op() == "OnesLike") {
-    return true;
-  }
+  if (node.op() == "OnesLike") return true;
   if (node.op() == "Fill") {
     NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
     return values != nullptr && IsOnes(*values);
   }
-  if (node.op() != "Const") {
-    return false;
-  }
+  if (node.op() != "Const") return false;
+  if (node.attr().count("dtype") == 0) return false;
   const auto dtype = node.attr().at("dtype").type();
   switch (dtype) {
     IS_ONES_CASE(DT_BOOL);
@@ -1429,16 +1418,13 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
   if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
     return false;
   }
-  if (node.op() == "ZerosLike") {
-    return true;
-  }
+  if (node.op() == "ZerosLike") return true;
   if (node.op() == "Fill") {
     NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
     return values != nullptr && IsZeros(*values);
   }
-  if (!IsConstant(node)) {
-    return false;
-  }
+  if (!IsConstant(node)) return false;
+  if (node.attr().count("dtype") == 0) return false;
   const auto dtype = node.attr().at("dtype").type();
   switch (dtype) {
     IS_ZEROS_CASE(DT_BOOL);
@@ -1569,13 +1555,22 @@ Status ConstantFolding::ReplaceOperationWithConstant(
   return Status::OK();
 }
 
-Status ConstantFolding::SimplifyGraph(bool use_shape_info,
-                                      GraphDef* optimized_graph,
-                                      GraphProperties* properties) {
+Status ConstantFolding::SimplifyGraph(
+    bool use_shape_info, GraphDef* optimized_graph, GraphProperties* properties,
+    absl::flat_hash_set<string>* nodes_to_not_simplify) {
   for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    TF_RETURN_IF_ERROR(SimplifyNode(use_shape_info,
-                                    optimized_graph->mutable_node(i),
-                                    optimized_graph, properties));
+    NodeDef* node = optimized_graph->mutable_node(i);
+    // TODO(lyandy): Move nodes to not simplify check into SimplifyNode and
+    // generalize to only restrict certain simplifications.
+    if (nodes_to_not_simplify->find(node->name()) ==
+        nodes_to_not_simplify->end()) {
+      if (HasTPUAttributes(optimized_graph->node(i))) {
+        nodes_to_not_simplify->insert(node->name());
+        continue;
+      }
+      TF_RETURN_IF_ERROR(
+          SimplifyNode(use_shape_info, node, optimized_graph, properties));
+    }
   }
   return Status::OK();
 }
@@ -1670,7 +1665,7 @@ Status ConstantFolding::SimplifyNode(bool use_shape_info, NodeDef* node,
     return Status::OK();
   }
 
-  if (SimplifyReduction(*properties, node)) {
+  if (SimplifyReduction(optimized_graph, *properties, node)) {
     graph_modified_ = true;
     return Status::OK();
   }
@@ -1732,11 +1727,11 @@ Status ConstantFolding::SimplifyNode(bool use_shape_info, NodeDef* node,
 bool ConstantFolding::RemoveSplitOrSplitV(const GraphProperties& properties,
                                           GraphDef* optimized_graph,
                                           NodeDef* node) {
+  if (node->attr().count("num_split") == 0) return false;
   if (IsSplit(*node) && node->attr().at("num_split").i() == 1) {
     ReplaceOperationWithIdentity(1, properties, node, optimized_graph);
     return true;
   }
-
   if (IsSplitV(*node) && node->attr().at("num_split").i() == 1) {
     ReplaceOperationWithIdentity(0, properties, node, optimized_graph);
     return true;
@@ -1913,6 +1908,8 @@ Status ConstantFolding::SimplifyStridedSlice(const GraphProperties& properties,
                                              NodeDef* node, bool* success) {
   if (use_shape_info && IsStridedSlice(*node) &&
       properties.GetInputProperties(node->name()).size() == 4) {
+    TF_RETURN_IF_ERROR(
+        CheckAttrsExist(*node, {"new_axis_mask", "shrink_axis_mask"}));
     if (node->attr().at("new_axis_mask").i() != 0 ||
         node->attr().at("shrink_axis_mask").i() != 0) {
       // Skip nodes with new/shrink axis mask, since they involve dimension
@@ -1947,6 +1944,8 @@ Status ConstantFolding::SimplifyStridedSlice(const GraphProperties& properties,
         return errors::InvalidArgument("Cannot parse tensor from proto: ",
                                        s.value().DebugString());
       }
+      TF_RETURN_IF_ERROR(
+          CheckAttrsExist(*node, {"begin_mask", "end_mask", "ellipsis_mask"}));
       int begin_mask = node->attr().at("begin_mask").i();
       int end_mask = node->attr().at("end_mask").i();
       std::set<int> expanded_ellipsis_indices;
@@ -2195,7 +2194,7 @@ bool ConstantFolding::SimplifySwitch(GraphDef* optimized_graph, NodeDef* node) {
     auto fanouts = node_map_->GetOutputs(node->name());
     if (fanouts.size() == 2) {
       for (NodeDef* fanout : fanouts) {
-        if (!IsIdentity(*fanout) ||
+        if ((!IsIdentity(*fanout) && !IsIdentityNSingleInput(*fanout)) ||
             NumNonControlOutputs(*fanout, *node_map_) > 0) {
           already_optimized = false;
           break;
@@ -2269,13 +2268,152 @@ bool ConstantFolding::SimplifySwitch(GraphDef* optimized_graph, NodeDef* node) {
   return false;
 }
 
-bool ConstantFolding::SimplifyReduction(const GraphProperties& properties,
+bool ConstantFolding::IsReductionCandidateForSimplification(
+    const NodeDef& node, const GraphProperties& properties,
+    TensorShapeProto* input_tensor_shape, TensorShapeProto* output_tensor_shape,
+    bool* is_single_element_op) const {
+  // Ensure its an appropriate Reduce node.
+  if (!IsReduction(node) || node.input_size() < 2) {
+    return false;
+  }
+  // Ensure that the axes to reduce by are constant.
+  NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
+  if (!IsReallyConstant(*reductions_indices)) {
+    return false;
+  }
+
+  // Get the properties of the input & output tensors and check if they both
+  // contain a single element.
+  if (!properties.HasInputProperties(node.name()) ||
+      !properties.HasOutputProperties(node.name())) {
+    return false;
+  }
+  const auto& input_props = properties.GetInputProperties(node.name())[0];
+  const auto& output_props = properties.GetOutputProperties(node.name())[0];
+  if (!input_props.has_shape() || input_props.shape().unknown_rank() ||
+      !output_props.has_shape() || output_props.shape().unknown_rank()) {
+    return false;
+  }
+  *input_tensor_shape = input_props.shape();
+  *output_tensor_shape = output_props.shape();
+  for (int i = 0; i < input_tensor_shape->dim_size(); ++i) {
+    if (input_tensor_shape->dim(i).size() < 0) {
+      return false;
+    }
+  }
+  for (int i = 0; i < output_tensor_shape->dim_size(); ++i) {
+    if (output_tensor_shape->dim(i).size() < 0) {
+      return false;
+    }
+  }
+  const int input_num_elements =
+      TensorShape(*input_tensor_shape).num_elements();
+  const int output_num_elements =
+      TensorShape(*output_tensor_shape).num_elements();
+  *is_single_element_op = input_num_elements == 1 && output_num_elements == 1;
+
+  return true;
+}
+
+bool ConstantFolding::IsReductionSimplifiableToIdentity(
+    const NodeDef& node, const TensorShapeProto& input_shape, bool keep_dims,
+    const TensorVector& reduction_indices_vector) const {
+  int output_size = reduction_indices_vector[0]->NumElements();
+  if (output_size == 0) {
+    return true;
+  }
+
+  if (!keep_dims) {
+    return false;
+  }
+  bool simplifiable = true;
+  for (int i = 0; i < output_size; ++i) {
+    int64 dim;
+    if (reduction_indices_vector[0]->dtype() == DT_INT32) {
+      dim = reduction_indices_vector[0]->flat<int32>()(i);
+    } else {
+      dim = reduction_indices_vector[0]->flat<int64>()(i);
+    }
+    if (dim < 0) {
+      dim += input_shape.dim_size();
+    }
+    if (dim < 0 || dim >= input_shape.dim_size() ||
+        input_shape.dim(dim).size() != 1) {
+      simplifiable = false;
+      break;
+    }
+  }
+  return simplifiable;
+}
+
+bool ConstantFolding::SimplifyReduction(GraphDef* optimized_graph,
+                                        const GraphProperties& properties,
                                         NodeDef* node) {
-  if (IsSimplifiableReduction(*node, properties)) {
+  bool is_single_element_op = false;
+  TensorShapeProto input_tensor_shape, output_tensor_shape;
+  if (!IsReductionCandidateForSimplification(
+          *node, properties, &input_tensor_shape, &output_tensor_shape,
+          &is_single_element_op)) {
+    return false;
+  }
+
+  // Get the reduction indices.
+  string reduction_indices_input = node->input(1);
+  NodeDef* reduction_indices = node_map_->GetNode(reduction_indices_input);
+  TensorVector reduction_indices_vector;
+  auto outputs_cleanup = gtl::MakeCleanup([&reduction_indices_vector] {
+    for (const auto& out : reduction_indices_vector) {
+      delete out.tensor;
+    }
+  });
+  if (!EvaluateNode(*reduction_indices, TensorVector(),
+                    &reduction_indices_vector)
+           .ok() ||
+      reduction_indices_vector.size() != 1) {
+    return false;
+  }
+
+  bool keep_dims =
+      node->attr().count("keep_dims") > 0 && node->attr().at("keep_dims").b();
+  bool simplifiable_to_reshape =
+      is_single_element_op && !keep_dims && (node->attr().count("T") > 0);
+  bool simplifiable_to_identity = IsReductionSimplifiableToIdentity(
+      *node, input_tensor_shape, keep_dims, reduction_indices_vector);
+
+  if (simplifiable_to_reshape) {
+    // Const node to output shape.
+    const int new_num_dimensions = output_tensor_shape.dim_size();
+    Tensor tensor(DT_INT32, TensorShape({new_num_dimensions}));
+    for (int i = 0; i < new_num_dimensions; i++) {
+      tensor.flat<int>()(i) = 1;
+    }
+    TensorValue shape_value(&tensor);
+    NodeDef* shape_node = optimized_graph->add_node();
+    if (!CreateNodeDef(OptimizedNodeName(*node, "_shape_const"), shape_value,
+                       shape_node)
+             .ok()) {
+      return false;
+    }
+    shape_node->set_device(node->device());
+    node_map_->AddNode(shape_node->name(), shape_node);
+    // Control dependency to ensure shape_node is in the correct frame.
+    shape_node->add_input(AsControlDependency(reduction_indices_input));
+    node_map_->AddOutput(NodeName(reduction_indices_input), shape_node->name());
+    // Optimize node to Reshape.
+    node->set_op("Reshape");
+    node_map_->UpdateInput(node->name(), node->input(1), shape_node->name());
+    node->set_input(1, shape_node->name());
+    node->mutable_attr()->erase("keep_dims");
+    node->mutable_attr()->erase("Tidx");
+    AttrValue attr_type_indices;
+    attr_type_indices.set_type(DT_INT32);
+    (*node->mutable_attr())["Tshape"] = attr_type_indices;
+    return true;
+  } else if (simplifiable_to_identity) {
     // Replace the reduction node with an identity node, that can be further
     // optimized by the model pruner.
     DataType output_type;
-    if (node->attr().count("T") > 0) {
+    if (node->attr().count("T") != 0) {
       output_type = node->attr().at("T").type();
     } else {
       // This is an 'any' or 'all' reduction. The output is always boolean.
@@ -2292,8 +2430,10 @@ bool ConstantFolding::SimplifyReduction(const GraphProperties& properties,
 
 bool ConstantFolding::SimplifyReshape(const GraphProperties& properties,
                                       bool use_shape_info, NodeDef* node) {
-  if (!use_shape_info) return false;
-  if (!IsSimplifiableReshape(*node, properties)) return false;
+  if (!use_shape_info || node->attr().count("T") == 0 ||
+      !IsSimplifiableReshape(*node, properties)) {
+    return false;
+  }
   DataType output_type = node->attr().at("T").type();
   node->set_op("Identity");
   node->clear_attr();
@@ -2305,6 +2445,7 @@ bool ConstantFolding::SimplifyReshape(const GraphProperties& properties,
 Status ConstantFolding::SimplifyArithmeticOperations(
     const GraphProperties& properties, bool use_shape_info,
     GraphDef* optimized_graph, NodeDef* node, bool* success) {
+  *success = false;
   const bool is_mul = IsMul(*node) || IsLogicalAnd(*node);
   const bool is_matmul = IsMatMul(*node);
   const bool is_add = IsAdd(*node) || IsBiasAdd(*node) || IsLogicalOr(*node);
@@ -2349,6 +2490,7 @@ Status ConstantFolding::SimplifyArithmeticOperations(
 
     // Replace 1 / y with Reciprocal op.
     if (y_matches_output_shape && is_any_div && x_is_one) {
+      TF_RETURN_IF_ERROR(CheckAttrExists(*node, "T"));
       DataType type = node->attr().at("T").type();
       if (DataTypeIsFloating(type) || DataTypeIsComplex(type)) {
         ReplaceDivisionOfOnesByReciprocal(node, optimized_graph);
@@ -2674,7 +2816,8 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node,
 
 bool ConstantFolding::PartialConstPropThroughIdentityN(NodeDef* node) {
   // Partial constant propagation through IdentityN.
-  if (IsIdentityN(*node) && NumNonControlInputs(*node) > 0) {
+  if ((IsIdentityN(*node) || IsIdentityNSingleInput(*node)) &&
+      NumNonControlInputs(*node) > 0) {
     const std::set<NodeDef*>& tmp = node_map_->GetOutputs(node->name());
     const std::vector<NodeDef*> consumers(tmp.begin(), tmp.end());
     bool updated_graph = false;
@@ -3000,10 +3143,11 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
     TF_RETURN_IF_ERROR(MaterializeShapes(properties));
     TF_RETURN_IF_ERROR(MaterializeConstants(properties));
   }
-  TF_RETURN_IF_ERROR(FoldGraph(optimized_graph));
+  absl::flat_hash_set<string> nodes_to_not_simplify;
+  TF_RETURN_IF_ERROR(FoldGraph(optimized_graph, &nodes_to_not_simplify));
   node_map_.reset(new NodeMap(optimized_graph));
-  TF_RETURN_IF_ERROR(
-      SimplifyGraph(can_use_shape_info, optimized_graph, &properties));
+  TF_RETURN_IF_ERROR(SimplifyGraph(can_use_shape_info, optimized_graph,
+                                   &properties, &nodes_to_not_simplify));
 
   return Status::OK();
 }
@@ -3037,6 +3181,7 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
   *optimized_graph = item.graph;
   int64 node_count;
   do {
+    GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
     graph_modified_ = false;
     item_to_optimize.graph.Swap(optimized_graph);
     graph_ = &item_to_optimize.graph;
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index 8593b3e0b878e50c75bab8fa8b3e377aabf8d257..0b778882d7d4d89d83de5d6bd5a6f9c827cf5bf8 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -16,9 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_CONSTANT_FOLDING_H_
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -71,10 +73,11 @@ class ConstantFolding : public GraphOptimizer {
                       const gtl::InlinedVector<TensorValue, 4>& inputs,
                       gtl::InlinedVector<TensorValue, 4>* output) const;
 
-  Status EvaluateOneFoldable(const NodeDef& node,
-                             std::vector<NodeDef>* outputs);
+  Status EvaluateOneFoldable(const NodeDef& node, std::vector<NodeDef>* outputs,
+                             bool* result_too_large);
 
-  Status FoldNode(NodeDef* node, GraphDef* output_graph);
+  Status FoldNode(NodeDef* node, GraphDef* output_graph,
+                  bool* result_too_large);
 
   bool IsOnes(const NodeDef& node) const;
   bool IsZeros(const NodeDef& node) const;
@@ -91,14 +94,14 @@ class ConstantFolding : public GraphOptimizer {
                                       NodeDef* node, GraphDef* graph,
                                       bool* success);
   void ReplaceDivisionOfOnesByReciprocal(NodeDef* node, GraphDef* graph);
-  Status FoldGraph(GraphDef* output);
+  Status FoldGraph(GraphDef* output,
+                   absl::flat_hash_set<string>* nodes_to_not_simplify);
 
-  bool IsSimplifiableReduction(const NodeDef& node,
-                               const GraphProperties& properties) const;
   bool IsSimplifiableReshape(const NodeDef& node,
                              const GraphProperties& properties) const;
   Status SimplifyGraph(bool use_shape_info, GraphDef* optimized_graph,
-                       GraphProperties* properties);
+                       GraphProperties* properties,
+                       absl::flat_hash_set<string>* nodes_to_not_simplify);
   Status SimplifyNode(bool use_shape_info, NodeDef* node,
                       GraphDef* optimized_graph, GraphProperties* properties);
 
@@ -143,8 +146,22 @@ class ConstantFolding : public GraphOptimizer {
   bool SimplifyReshape(const GraphProperties& properties, bool use_shape_info,
                        NodeDef* node);
 
-  // Simplifies a Reduction operation to an Identity operation if applicable.
-  bool SimplifyReduction(const GraphProperties& properties, NodeDef* node);
+  // Returns true if theres a possibility that a Reduce node could be simplified
+  // to an Identity/Reshape.
+  bool IsReductionCandidateForSimplification(
+      const NodeDef& node, const GraphProperties& properties,
+      TensorShapeProto* input_tensor_shape,
+      TensorShapeProto* output_tensor_shape, bool* is_single_element_op) const;
+  // Returns true iff this reduction can be reduced to an identity (i.e if the
+  // set of dimensions to reduce along is empty). This happens often in the
+  // gradient graphs.
+  bool IsReductionSimplifiableToIdentity(
+      const NodeDef& node, const TensorShapeProto& input_shape, bool keep_dims,
+      const gtl::InlinedVector<TensorValue, 4>& reduction_indices_vector) const;
+  // Simplifies a Reduction operation to an Identity/Reshape operation if
+  // applicable.
+  bool SimplifyReduction(GraphDef* optimized_graph,
+                         const GraphProperties& properties, NodeDef* node);
 
   // Switch(x, x) will always feed false to its false branch and true to
   // its true branch. By rewriting the graph a bit, we can propagate these
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index b09360a2c26b74f8f708b807d93c04d7273b56f5..192f48272f9ed08b2b6424f3c8e33d1afafdb56d 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -14,10 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/array_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
@@ -72,9 +74,9 @@ class ConstantFoldingTest : public GrapplerTest {
       GrapplerItem item;
       TF_CHECK_OK(s.ToGraphDef(&item.graph));
       item.fetch = {"mul1", "mul2", "add1", "add2"};
-      ConstantFolding optimizer(nullptr /* cpu_device */);
+      ConstantFolding optimizer(/*cpu_device=*/nullptr);
       GraphDef output;
-      Status status = optimizer.Optimize(nullptr, item, &output);
+      Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
       TF_EXPECT_OK(status);
 
       EXPECT_EQ(7, output.node_size());
@@ -132,9 +134,9 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   item.fetch.push_back("d");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(1, output.node_size());
@@ -178,9 +180,9 @@ TEST_F(ConstantFoldingTest, AddTree) {
   item.fetch = {"add_parent", "mul_parent", "addmul_parent"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // We expect the following rewrite(s) to occur:
@@ -276,13 +278,11 @@ TEST_F(ConstantFoldingTest, ConvPushDownTest) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::cout << output.DebugString() << std::endl;
-
   EXPECT_EQ(5, output.node_size());
   int found = 0;
   for (const auto& node : output.node()) {
@@ -366,9 +366,9 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
     item.fetch = {"stack", "matmul3", "matmul4"};
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
     TF_EXPECT_OK(status);
 
     const string suffix =
@@ -521,9 +521,9 @@ TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div_f", "div_i", "realdiv"};
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(8, output.node_size());
@@ -611,9 +611,9 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(15, output.node_size());
@@ -683,9 +683,9 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(10, output.node_size());
@@ -741,9 +741,9 @@ TEST_F(ConstantFoldingTest, CreateConstNodes) {
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(24, output.node_size());
@@ -790,9 +790,9 @@ TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
   item.fetch.push_back("f");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(2, output.node_size());
@@ -831,9 +831,9 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
   item.fetch.push_back("e");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "e"};
@@ -874,9 +874,9 @@ TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "c",
@@ -932,9 +932,9 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) {
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "i2"};
@@ -1009,9 +1009,9 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
   }
 
   item.fetch = outputs;
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int constant_folded = 0;
@@ -1047,9 +1047,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterialization) {
   item.fetch.push_back("p2");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1097,9 +1097,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1163,9 +1163,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   int found = 0;
   for (const auto& node : output.node()) {
@@ -1235,9 +1235,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
   item.fetch.push_back("ia");
   item.fetch.push_back("ib");
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1307,9 +1307,9 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::set<string> present_nodes = {"v_in",     "v_ctrl",
@@ -1409,9 +1409,9 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
 
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   std::set<string> present_nodes = {"v_in",     "v_ctrl",
                                     "switch",   "i",
@@ -1505,9 +1505,9 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3", "out4", "idx4"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(19, output.node_size());
@@ -1590,9 +1590,9 @@ TEST_F(ConstantFoldingTest, SplitRemoval) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1636,9 +1636,9 @@ TEST_F(ConstantFoldingTest, SplitVRemoval) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1686,9 +1686,9 @@ TEST_F(ConstantFoldingTest, TransposeOnSize1DimsRemoval) {
   item.fetch = {"out1"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1723,9 +1723,9 @@ TEST_F(ConstantFoldingTest, RandomShuffleOnScalarRemoval) {
   item.fetch = {"out1", "out2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1769,9 +1769,9 @@ TEST_F(ConstantFoldingTest, ReverseOnSize1DimsRemoval) {
   item.fetch = {"out1"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1805,9 +1805,9 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef got;
-    Status status = optimizer.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
@@ -1852,9 +1852,9 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef got;
-    Status status = optimizer.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
@@ -1901,9 +1901,9 @@ TEST_F(ConstantFoldingTest, StridedSliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef got;
-    Status status = optimizer.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
@@ -1959,9 +1959,9 @@ TEST_F(ConstantFoldingTest, StridedSliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef got;
-    Status status = optimizer.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
@@ -2012,9 +2012,9 @@ TEST_F(ConstantFoldingTest, TileWithMultipliesBeingOne) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2045,9 +2045,9 @@ TEST_F(ConstantFoldingTest, MergeConcat) {
   item.fetch = {"c2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2075,9 +2075,9 @@ TEST_F(ConstantFoldingTest, MergeConcat_SameInput) {
   item.fetch = {"c2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2106,9 +2106,9 @@ TEST_F(ConstantFoldingTest, MergeConcat_ConcatWithConst) {
   item.fetch = {"c2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2137,9 +2137,9 @@ TEST_F(ConstantFoldingTest, MergeConcat_AxisMismatch) {
   item.fetch = {"c2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2175,9 +2175,9 @@ TEST_F(ConstantFoldingTest, PaddingWithZeroSize) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2221,9 +2221,9 @@ TEST_F(ConstantFoldingTest, SqueezeWithAllDimesionsGreaterThanOne) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2269,9 +2269,9 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
   item.fetch = {"s", "p2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -2303,6 +2303,95 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
   test::ExpectTensorNear<float>(tensors_expected[1], tensors[1], 1e-5);
 }
 
+TEST_F(ConstantFoldingTest, SingleElementEmptyAxisReduction) {
+  // Build a simple graph with reductions that involve single-element input and
+  // no axes to reduce along.
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output input_var_three_dim = ops::Variable(
+      scope.WithOpName("input_var_three_dim"), {1, 1, 1}, DT_FLOAT);
+  Output input_var_one_dim =
+      ops::Variable(scope.WithOpName("input_var_one_dim"), {1}, DT_FLOAT);
+  Output one_axis = ops::Const(scope.WithOpName("one_axis"), {0}, {1});
+  Output multiple_axes =
+      ops::Const(scope.WithOpName("multiple_axes"), {1, 0}, {2});
+  Output variable_axis =
+      ops::Variable(scope.WithOpName("input_var_axis"), {1}, DT_INT32);
+  ops::Mean::Attrs attr;
+  attr = attr.KeepDims(false);
+  // Should be optimized to Reshape.
+  Output mean_1 = ops::Mean(scope.WithOpName("mean_1"), input_var_three_dim,
+                            one_axis, attr.KeepDims(false));
+  Output mean_2 = ops::Mean(scope.WithOpName("mean_2"), input_var_three_dim,
+                            multiple_axes, attr.KeepDims(false));
+  // Should remain as-is, since OutputProperties will not be known this node.
+  Output mean_3 = ops::Mean(scope.WithOpName("mean_3"), input_var_one_dim,
+                            one_axis, attr.KeepDims(false));
+  // Should remain as-is.
+  Output mean_4 = ops::Mean(scope.WithOpName("mean_4"), input_var_three_dim,
+                            variable_axis, attr.KeepDims(false));
+  // Should be optimized to Identity, since KeepDims=true.
+  Output mean_5 = ops::Mean(scope.WithOpName("mean_5"), input_var_three_dim,
+                            multiple_axes, attr.KeepDims(true));
+
+  GrapplerItem item;
+  item.fetch = {"mean_1", "mean_2", "mean_3", "mean_4", "mean_5"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // Ensure Mean node is optimized to Reshape.
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "mean_1" || node.name() == "mean_2") {
+      found++;
+      EXPECT_EQ("Reshape", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("input_var_three_dim", node.input(0));
+    } else if (node.name() == "mean_3") {
+      found++;
+      EXPECT_EQ("Mean", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("input_var_one_dim", node.input(0));
+    } else if (node.name() == "mean_4") {
+      found++;
+      EXPECT_EQ("Mean", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("input_var_three_dim", node.input(0));
+    } else if (node.name() == "mean_5") {
+      found++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("^multiple_axes", node.input(1));
+    }
+  }
+  EXPECT_EQ(5, found);
+
+  // Ensure resultant values from Mean and Reshape are the same.
+  auto input_var_three_dim_t =
+      GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 1, 1}));
+  auto input_var_one_dim_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
+  Tensor input_var_axis_t(DT_INT32, TensorShape({1}));
+  input_var_axis_t.flat<int32>()(0) = 0;
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch,
+                    {{"input_var_three_dim", input_var_three_dim_t},
+                     {"input_var_one_dim", input_var_one_dim_t},
+                     {"input_var_axis", input_var_axis_t}});
+  EXPECT_EQ(5, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"input_var_three_dim", input_var_three_dim_t},
+                                {"input_var_one_dim", input_var_one_dim_t},
+                                {"input_var_axis", input_var_axis_t}});
+  EXPECT_EQ(5, tensors.size());
+  for (int i = 0; i < 5; ++i) {
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+  }
+}
+
 TEST_F(ConstantFoldingTest, NoOpReshape) {
   // Build a simple graph with a reshape that can be reduced to the identity.
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
@@ -2344,9 +2433,9 @@ TEST_F(ConstantFoldingTest, NoOpReshape) {
   item.fetch = {"s1", "s2", "s3", "s4"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -2406,9 +2495,9 @@ TEST_F(ConstantFoldingTest, Packing) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   const std::vector<string> fetch_nodes = {"i1", "i2"};
@@ -2449,9 +2538,9 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
@@ -2463,7 +2552,7 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
 
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -2530,14 +2619,14 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
   auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
   EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(11, output.node_size());
@@ -2591,58 +2680,100 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
 }
 
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output input =
-      ops::Placeholder(s.WithOpName("input"), DT_FLOAT,
-                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
-  Output indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32);
-  Output sum = ops::Sum(s.WithOpName("sum"), input, indices);
-  Output size = ops::Const(s.WithOpName("size"), 1, {1});
-  Output reshape = ops::Reshape(s.WithOpName("reshape"), sum, size);
+  for (bool use_reshape : {true, false}) {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output input =
+        ops::Placeholder(s.WithOpName("input"), DT_FLOAT,
+                         ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+    // If use_reshape is false, we need to now the number of indices to apply
+    // the rewrite.
+    Output indices = ops::Placeholder(
+        s.WithOpName("indices"), DT_INT32,
+        ops::Placeholder::Shape(PartialTensorShape({use_reshape ? -1 : 2})));
+    Output sum = ops::Sum(s.WithOpName("sum"), input, indices);
+    if (use_reshape) {
+      Output size = ops::Const(s.WithOpName("size"), 1, {1});
+      Output reshape = ops::Reshape(s.WithOpName("reshape"), sum, size);
+    }
 
-  GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  item.fetch.push_back("reshape");
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch.push_back(use_reshape ? "reshape" : "sum");
 
-  auto input_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
-  Tensor indices_t(DT_INT32, TensorShape({2}));
-  indices_t.flat<int>()(0) = 0;
-  indices_t.flat<int>()(1) = 1;
-  auto tensors_expected = EvaluateNodes(
-      item.graph, item.fetch, {{"input", input_t}, {"indices", indices_t}});
-  EXPECT_EQ(1, tensors_expected.size());
+    auto input_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+    Tensor indices_t(DT_INT32, TensorShape({2}));
+    indices_t.flat<int>()(0) = 0;
+    indices_t.flat<int>()(1) = 1;
+    auto tensors_expected = EvaluateNodes(
+        item.graph, item.fetch, {{"input", input_t}, {"indices", indices_t}});
+    EXPECT_EQ(1, tensors_expected.size());
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
-  GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+    // Use aggressive mode to force the shape inference to propagate placeholder
+    // shapes.
+    ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                              /*cpu_device=*/nullptr);
+    GraphDef output;
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+    TF_EXPECT_OK(status);
 
-  // Run a second time to make sure the optimization is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+    // Run a second time to make sure the optimization is idempotent.
+    item.graph.Swap(&output);
+    status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+    TF_EXPECT_OK(status);
 
-  int found = 0;
-  for (const auto& node : output.node()) {
-    if (node.name() == "ConstantFolding/sum-reduction_indices") {
-      ++found;
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ("^indices", node.input(0));
-      EXPECT_EQ(2, TensorShape(node.attr().at("value").tensor().tensor_shape())
-                       .num_elements());
-    } else if (node.name() == "sum") {
-      ++found;
-      EXPECT_EQ("ConstantFolding/sum-reduction_indices", node.input(1));
-    } else if (node.name() == "indices") {
-      ++found;
+    int found = 0;
+    for (const auto& node : output.node()) {
+      if (node.name() == "ConstantFolding/sum-reduction_indices") {
+        ++found;
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^indices", node.input(0));
+        EXPECT_EQ(2,
+                  TensorShape(node.attr().at("value").tensor().tensor_shape())
+                      .num_elements());
+      } else if (node.name() == "sum") {
+        ++found;
+        EXPECT_EQ("ConstantFolding/sum-reduction_indices", node.input(1));
+      } else if (node.name() == "indices") {
+        ++found;
+      }
     }
+    EXPECT_EQ(3, found);
+
+    auto tensors = EvaluateNodes(output, item.fetch,
+                                 {{"input", input_t}, {"indices", indices_t}});
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
   }
-  EXPECT_EQ(3, found);
+}
 
-  auto tensors = EvaluateNodes(output, item.fetch,
-                               {{"input", input_t}, {"indices", indices_t}});
-  EXPECT_EQ(1, tensors.size());
-  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
+TEST_F(ConstantFoldingTest, MaterializeReductionIndices_NotFullReduction) {
+  for (bool input_rank_known : {true, false}) {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output input =
+        (input_rank_known ? ops::Placeholder(s.WithOpName("input"), DT_FLOAT,
+                                             ops::Placeholder::Shape(
+                                                 PartialTensorShape({-1, -1})))
+                          : ops::Placeholder(s.WithOpName("input"), DT_FLOAT));
+    Output indices =
+        ops::Placeholder(s.WithOpName("indices"), DT_INT32,
+                         ops::Placeholder::Shape(
+                             PartialTensorShape({input_rank_known ? 1 : 2})));
+    Output sum = ops::Sum(s.WithOpName("sum"), input, indices);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch.push_back("sum");
+
+    // Use aggressive mode to force the shape inference to propagate placeholder
+    // shapes.
+    ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                              /*cpu_device=*/nullptr);
+    GraphDef output;
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    CompareGraphs(item.graph, output);
+  }
 }
 
 TEST_F(ConstantFoldingTest, LargeConstant) {
@@ -2657,9 +2788,9 @@ TEST_F(ConstantFoldingTest, LargeConstant) {
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch.push_back("out");
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Make sure the diag node hasn't been folded, since it would use too much
@@ -2702,9 +2833,9 @@ TEST_F(ConstantFoldingTest, SwitchIdenticalInputs) {
   item.fetch.push_back("id_true");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(6, output.node_size());
@@ -2794,9 +2925,9 @@ TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
     item.fetch = {"stack"};
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
     TF_EXPECT_OK(status);
 
     EXPECT_EQ(16, output.node_size());
@@ -2886,13 +3017,13 @@ TEST_F(ConstantFoldingTest, PartialFolding_Concat) {
 
   auto tensors_expected = EvaluateNodes(item.graph, {"concat0"});
   EXPECT_EQ(1, tensors_expected.size());
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   // Run the optimizer twice to make sure the rewrite is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(21, output.node_size());
@@ -2959,9 +3090,9 @@ TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
   item.fetch.push_back("add0");
   item.fetch.push_back("add1");
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   EXPECT_EQ(8, output.node_size());
   for (const auto& node : output.node()) {
@@ -3021,9 +3152,9 @@ TEST_F(ConstantFoldingTest, TrivialPack) {
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch = {"stack", "stack_no_axis"};
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   EXPECT_EQ(7, output.node_size());
   int found = 0;
@@ -3103,13 +3234,13 @@ TEST_F(ConstantFoldingTest, Enter) {
   item.fetch.push_back("id3");
   item.fetch.push_back("id4");
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   // Run the optimizer twice to make sure the rewrite is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(9, output.node_size());
@@ -3158,13 +3289,13 @@ TEST_F(ConstantFoldingTest, TensorArraySize) {
   auto tensors_expected =
       EvaluateNodes(item.graph, {"dynamic_sz", "static_sz"});
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   // Run the optimizer twice to make sure the rewrite is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(8, output.node_size());
@@ -3196,9 +3327,9 @@ TEST_F(ConstantFoldingTest, FoldingPreservesDenormalFlushing) {
   item.fetch.push_back("c");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(1, output.node_size());
@@ -3215,6 +3346,126 @@ TEST_F(ConstantFoldingTest, FoldingPreservesDenormalFlushing) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
+TEST_F(ConstantFoldingTest, EvaluatingLargeConstantNoFoldingMergingLoop) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  int size = 10 * 1024 * 1024 / 4 / 2;
+  Output nonconst =
+      ops::RandomUniform(s.WithOpName("nonconst"), {size, 1}, DT_FLOAT);
+  Output const1 = ops::Const(s.WithOpName("const1"), 0.0f, {size, 1});
+  Output const2 = ops::Const(s.WithOpName("const2"), 1.0f, {size, 1});
+  Output axis = ops::Const(s.WithOpName("axis"), -1, {});
+  Output concat1 =
+      ops::Concat(s.WithOpName("concat1"), {nonconst, const1}, axis);
+  Output result = ops::Concat(s.WithOpName("result"), {concat1, const2}, axis);
+
+  GrapplerItem item;
+  item.fetch.push_back("result");
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  std::vector<string> fetch = {"result"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  EXPECT_EQ(tensors_expected[0].shape(), tensors[0].shape());
+}
+
+class ConstantFoldingCastConstTest : public GrapplerTest {
+ protected:
+  void ConstantFoldingCastConst(bool fetch_const, bool fetch_cast,
+                                bool fetch_const_child, bool fetch_cast_child) {
+    if (!fetch_const && !fetch_cast && !fetch_const_child &&
+        !fetch_cast_child) {
+      return;
+    }
+
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    CreateCastConstGraph(s);
+    GrapplerItem item;
+    int expected_output_size = SetFetch(&item, fetch_const, fetch_cast,
+                                        fetch_const_child, fetch_cast_child);
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+    GraphDef output = ConstantFoldingOptimize(item);
+    EXPECT_EQ(expected_output_size, output.node_size());
+
+    EvaluateAndCompareUnoptimized(item.graph, output, item.fetch);
+  }
+
+ private:
+  void CreateCastConstGraph(const tensorflow::Scope& s) {
+    Output const1 = ops::Const(s.WithOpName("const1"), 2, {5, 5});
+    Output cast = ops::Cast(s.WithOpName("cast"), const1, DT_FLOAT);
+    Output const1_child = ops::Identity(s.WithOpName("const1_child"), const1);
+    Output cast_child = ops::Identity(s.WithOpName("cast_child"), cast);
+  }
+
+  int SetFetch(GrapplerItem* item, bool fetch_const, bool fetch_cast,
+               bool fetch_const_child, bool fetch_cast_child) {
+    int expected_output_size = 0;
+    if (fetch_const) {
+      item->fetch.push_back("const1");
+      expected_output_size++;
+    }
+    if (fetch_cast) {
+      item->fetch.push_back("cast");
+      expected_output_size++;
+    }
+    if (fetch_const_child) {
+      item->fetch.push_back("const1_child");
+      expected_output_size++;
+    }
+    if (fetch_cast_child) {
+      item->fetch.push_back("cast_child");
+      expected_output_size++;
+    }
+    return expected_output_size;
+  }
+
+  GraphDef ConstantFoldingOptimize(const GrapplerItem& item) {
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
+    GraphDef output;
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+    TF_EXPECT_OK(status);
+    return output;
+  }
+
+  void EvaluateAndCompareUnoptimized(const GraphDef& unoptimized_graph,
+                                     const GraphDef& optimized_graph,
+                                     const std::vector<string>& fetch_nodes) {
+    auto tensors_expected = EvaluateNodes(unoptimized_graph, fetch_nodes);
+    auto tensors = EvaluateNodes(optimized_graph, fetch_nodes);
+    ASSERT_EQ(fetch_nodes.size(), tensors_expected.size());
+    ASSERT_EQ(fetch_nodes.size(), tensors.size());
+    for (int i = 0; i < fetch_nodes.size(); i++) {
+      if (fetch_nodes[i] == "const1" || fetch_nodes[i] == "const1_child") {
+        test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
+      } else {
+        test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+      }
+    }
+  }
+};
+
+TEST_F(ConstantFoldingCastConstTest, CastConstFolding) {
+  for (bool fetch_const : {false, true}) {
+    for (bool fetch_cast : {false, true}) {
+      for (bool fetch_const_child : {false, true}) {
+        for (bool fetch_cast_child : {false, true}) {
+          ConstantFoldingCastConst(fetch_const, fetch_cast, fetch_const_child,
+                                   fetch_cast_child);
+        }
+      }
+    }
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index ee7c14e3ab39315cde5bd08fa04213e90eb42f0b..7593023ff4d649c623db9be98ac52ef6b799219f 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -227,35 +227,62 @@ cc_library(
 )
 
 cc_library(
-    name = "map_vectorization",
-    srcs = ["map_vectorization.cc"],
-    hdrs = [
-        "map_vectorization.h",
-    ],
+    name = "make_numa_aware",
+    srcs = ["make_numa_aware.cc"],
+    hdrs = ["make_numa_aware.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":function_utils",
         ":graph_utils",
-        ":vectorization_utils",
-        "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
-        "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
 )
 
 tf_cc_test(
-    name = "map_vectorization_test",
-    srcs = ["map_vectorization_test.cc"],
+    name = "make_numa_aware_test",
+    srcs = ["make_numa_aware_test.cc"],
     visibility = ["//visibility:public"],
     deps = [
+        ":graph_test_utils",
         ":graph_utils",
-        ":map_vectorization",
+        ":make_numa_aware",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
+)
+
+cc_library(
+    name = "make_sloppy",
+    srcs = ["make_sloppy.cc"],
+    hdrs = ["make_sloppy.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+    ] + tf_protos_all(),
+)
+
+tf_cc_test(
+    name = "make_sloppy_test",
+    srcs = ["make_sloppy_test.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_test_utils",
+        ":graph_utils",
+        ":make_sloppy",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -414,6 +441,44 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "map_vectorization",
+    srcs = ["map_vectorization.cc"],
+    hdrs = [
+        "map_vectorization.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":function_utils",
+        ":graph_utils",
+        ":vectorization_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core:lib_internal",
+    ] + tf_protos_all(),
+)
+
+tf_cc_test(
+    name = "map_vectorization_test",
+    srcs = ["map_vectorization_test.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_utils",
+        ":map_vectorization",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
+)
+
 cc_library(
     name = "noop_elimination",
     srcs = ["noop_elimination.cc"],
@@ -489,6 +554,8 @@ cc_library(
         ":filter_fusion",
         ":hoist_random_uniform",
         ":latency_all_edges",
+        ":make_numa_aware",
+        ":make_sloppy",
         ":map_and_batch_fusion",
         ":map_and_filter_fusion",
         ":map_fusion",
@@ -524,6 +591,7 @@ cc_library(
     deps = [
         ":function_utils",
         ":graph_utils",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/cc:ops",
         "@com_google_absl//absl/strings",
         "//tensorflow/core:core_cpu",
@@ -545,7 +613,9 @@ tf_cc_test(
         ":graph_utils",
         ":function_utils",
         ":vectorization_utils",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -555,7 +625,11 @@ tf_cc_test(
         # For ops we need registered
         "//tensorflow/core/kernels/data:dataset_ops",
         "//tensorflow/core/kernels:cast_op",
+        "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:logging_ops",
+        "//tensorflow/core/kernels:math",
+        "//tensorflow/core/kernels:nn",
+        "//tensorflow/core/kernels:parsing",
         "//tensorflow/tools/graph_transforms:transform_utils",
     ] + tf_protos_all(),
 )
diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
index 1ad495bbad023b6c9f3461648d83c044ee0dc1f5..89b568ecf161cda08f1b71b369c3edb1d43f2a7f 100644
--- a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
@@ -37,7 +37,7 @@ NodeDef MakeFusedFilterNode(const NodeDef& first_filter_node,
                             const FunctionDef& fused_function,
                             MutableGraphView* graph) {
   NodeDef fused_node;
-  graph_utils::SetUniqueGraphNodeName("fused_filter", graph->GetGraph(),
+  graph_utils::SetUniqueGraphNodeName("fused_filter", graph->graph(),
                                       &fused_node);
 
   fused_node.set_op("FilterDataset");
@@ -109,7 +109,7 @@ Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* fused_filter_node = graph.AddNode(MakeFusedFilterNode(
         *first_filter_node, *second_filter_node, *fused_predicate, &graph));
 
-    graph.ReplaceInput(*second_filter_node, *fused_filter_node);
+    graph.UpdateFanouts(second_filter_node->name(), fused_filter_node->name());
 
     // TODO(prazek): we should run some optimizations on the fused filter
     // functions, or make sure that optimization passes run after filter
diff --git a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
index b2eec7220e3753c03e9f8caef7f47cd89c0bd14a..9d8b388a3a8bca1fb560e5acc94d50f3d82ed30d 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
@@ -19,11 +19,37 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace graph_tests_utils {
 
+NodeDef MakeFilterNode(StringPiece name, StringPiece input_node_name,
+                       StringPiece function_name) {
+  return test::function::NDef(
+      name, "FilterDataset", {string(input_node_name)},
+      {{"predicate", FunctionDefHelper::FunctionRef(string(function_name))},
+       {"Targuments", {}},
+       {"output_shapes", gtl::ArraySlice<TensorShape>{}},
+       {"output_types", gtl::ArraySlice<TensorShape>{}}});
+}
+
+NodeDef MakeMapAndBatchNode(StringPiece name, StringPiece input_node_name,
+                            StringPiece batch_size_node_name,
+                            StringPiece num_parallel_calls_node_name,
+                            StringPiece drop_remainder_node_name,
+                            StringPiece function_name) {
+  return test::function::NDef(
+      name, "ExperimentalMapAndBatchDataset",
+      {string(input_node_name), "", string(batch_size_node_name),
+       string(num_parallel_calls_node_name), string(drop_remainder_node_name)},
+      {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
+       {"Targuments", {}},
+       {"output_shapes", gtl::ArraySlice<TensorShape>{}},
+       {"output_types", gtl::ArraySlice<TensorShape>{}}});
+}
+
 NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name,
                     StringPiece function_name) {
   return test::function::NDef(
@@ -34,14 +60,51 @@ NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name,
        {"output_types", gtl::ArraySlice<DataType>{}}});
 }
 
-NodeDef MakeFilterNode(StringPiece name, StringPiece input_node_name,
-                       StringPiece function_name) {
+NodeDef MakeParallelInterleaveNode(StringPiece name,
+                                   StringPiece input_node_name,
+                                   StringPiece cycle_length_node_name,
+                                   StringPiece block_length_node_name,
+                                   StringPiece num_parallel_calls_node_name,
+                                   StringPiece function_name, bool sloppy) {
   return test::function::NDef(
-      name, "FilterDataset", {string(input_node_name)},
-      {{"predicate", FunctionDefHelper::FunctionRef(string(function_name))},
-       {"Targuments", {}},
-       {"output_shapes", gtl::ArraySlice<TensorShape>{}},
-       {"output_types", gtl::ArraySlice<TensorShape>{}}});
+      name, "ParallelInterleaveDatasetV2",
+      {string(input_node_name), "", string(cycle_length_node_name),
+       string(block_length_node_name), string(num_parallel_calls_node_name)},
+      {
+          {"f", FunctionDefHelper::FunctionRef(string(function_name))},
+          {"Targuments", {}},
+          {"output_shapes", gtl::ArraySlice<TensorShape>{}},
+          {"output_types", gtl::ArraySlice<TensorShape>{}},
+          {"sloppy", sloppy},
+      });
+}
+
+NodeDef MakeParallelMapNode(StringPiece name, StringPiece input_node_name,
+                            StringPiece num_parallel_calls_node_name,
+                            StringPiece function_name, bool sloppy) {
+  return test::function::NDef(
+      name, "ParallelMapDataset",
+      {string(input_node_name), string(num_parallel_calls_node_name)},
+      {
+          {"f", FunctionDefHelper::FunctionRef(string(function_name))},
+          {"Targuments", {}},
+          {"output_shapes", gtl::ArraySlice<TensorShape>{}},
+          {"output_types", gtl::ArraySlice<DataType>{}},
+          {"sloppy", sloppy},
+      });
+}
+
+NodeDef MakeParseExampleNode(StringPiece name, StringPiece input_node_name,
+                             StringPiece num_parallel_calls_node_name,
+                             bool sloppy) {
+  return test::function::NDef(
+      name, "ParseExampleDataset",
+      {string(input_node_name), string(num_parallel_calls_node_name)},
+      {
+          {"output_shapes", gtl::ArraySlice<TensorShape>{}},
+          {"output_types", gtl::ArraySlice<DataType>{}},
+          {"sloppy", sloppy},
+      });
 }
 
 }  // end namespace graph_tests_utils
diff --git a/tensorflow/core/grappler/optimizers/data/graph_test_utils.h b/tensorflow/core/grappler/optimizers/data/graph_test_utils.h
index ca0fde997daf30899fce9679cf3b8a15ae9709b1..a2707ee7b7f3888212f2402617d2063f1feb9c8d 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_test_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_test_utils.h
@@ -23,11 +23,38 @@ namespace tensorflow {
 namespace grappler {
 namespace graph_tests_utils {
 
+// Creates a test NodeDef for FilterDataset.
+NodeDef MakeFilterNode(StringPiece name, StringPiece input_node_name,
+                       StringPiece function_name = "IsZero");
+
+// Creates a test NodeDef for MapDataset.
 NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name,
                     StringPiece function_name = "XTimesTwo");
 
-NodeDef MakeFilterNode(StringPiece name, StringPiece input_node_name,
-                       StringPiece function_name = "IsZero");
+// Creates a test NodeDef for MapAndBatchDataset.
+NodeDef MakeMapAndBatchNode(StringPiece name, StringPiece input_node_name,
+                            StringPiece batch_size_node_name,
+                            StringPiece num_parallel_calls_node_name,
+                            StringPiece drop_remainder_node_name,
+                            StringPiece function_name = "XTimesTwo");
+
+// Creates a test NodeDef for ParallelInterleaveDataset.
+NodeDef MakeParallelInterleaveNode(StringPiece name,
+                                   StringPiece input_node_name,
+                                   StringPiece cycle_length_node_name,
+                                   StringPiece block_length_node_name,
+                                   StringPiece num_parallel_calls_node_name,
+                                   StringPiece function_name, bool sloppy);
+
+// Creates a test NodeDef for ParallelMapDataset.
+NodeDef MakeParallelMapNode(StringPiece name, StringPiece input_node_name,
+                            StringPiece num_parallel_calls_node_name,
+                            StringPiece function_name, bool sloppy);
+
+// Creates a test NodeDef for ParseExampleDataset.
+NodeDef MakeParseExampleNode(StringPiece name, StringPiece input_node_name,
+                             StringPiece num_parallel_calls_node_name,
+                             bool sloppy);
 
 }  // end namespace graph_tests_utils
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index b863a25dc5f699f63d4d5f99f7f29d1a62004a53..90208c1fba6b089f57b303827cf1327ad43bf736 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -72,7 +72,7 @@ NodeDef* AddScalarConstNodeHelper(
     MutableGraphView* graph) {
   NodeDef node;
   node.set_op(kConstOpName);
-  SetUniqueGraphNodeName(kConstOpName, graph->GetGraph(), &node);
+  SetUniqueGraphNodeName(kConstOpName, graph->graph(), &node);
 
   (*node.mutable_attr())["dtype"].set_type(dtype);
   std::unique_ptr<tensorflow::TensorProto> tensor =
@@ -92,7 +92,7 @@ NodeDef* AddScalarConstNodeHelper(
 NodeDef* AddScalarPlaceholder(DataType dtype, MutableGraphView* graph) {
   NodeDef node;
   node.set_op("Placeholder");
-  SetUniqueGraphNodeName(node.op(), graph->GetGraph(), &node);
+  SetUniqueGraphNodeName(node.op(), graph->graph(), &node);
   (*node.mutable_attr())["dtype"].set_type(dtype);
   TensorShapeProto* shape = (*node.mutable_attr())["shape"].mutable_shape();
   shape->set_unknown_rank(false);
@@ -107,7 +107,7 @@ NodeDef* AddNode(StringPiece name, StringPiece op,
   if (!name.empty()) {
     node.set_name(string(name));
   } else {
-    SetUniqueGraphNodeName(op, graph->GetGraph(), &node);
+    SetUniqueGraphNodeName(op, graph->graph(), &node);
   }
   node.set_op(string(op));
   for (const string& input : inputs) {
@@ -228,7 +228,7 @@ std::vector<int> FindAllGraphNodesWithOp(const string& op,
 
 NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph) {
   if (node.input_size() == 0) return nullptr;
-  GraphView::InputPort input_port = graph.GetInputPort(node.name(), 0);
+  MutableGraphView::InputPort input_port = graph.GetInputPort(node.name(), 0);
   return graph.GetRegularFanin(input_port).node;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index 4ab6d71532ce005a1eddf664f3e3ccd8d646b60e..5c0f03dca8774d64395c8bc0f2c1334a45bfe9dc 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -41,7 +41,7 @@ TEST(GraphUtilsTest, AddScalarConstNodeBool) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
   NodeDef* bool_node = AddScalarConstNode<bool>(true, &graph);
-  EXPECT_TRUE(ContainsGraphNodeWithName(bool_node->name(), *graph.GetGraph()));
+  EXPECT_TRUE(ContainsGraphNodeWithName(bool_node->name(), *graph.graph()));
   EXPECT_EQ(bool_node->attr().at("value").tensor().bool_val(0), true);
 }
 
@@ -49,8 +49,7 @@ TEST(GraphUtilsTest, AddScalarConstNodeDouble) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
   NodeDef* double_node = AddScalarConstNode<double>(3.14, &graph);
-  EXPECT_TRUE(
-      ContainsGraphNodeWithName(double_node->name(), *graph.GetGraph()));
+  EXPECT_TRUE(ContainsGraphNodeWithName(double_node->name(), *graph.graph()));
   EXPECT_FLOAT_EQ(double_node->attr().at("value").tensor().double_val(0), 3.14);
 }
 
@@ -58,7 +57,7 @@ TEST(GraphUtilsTest, AddScalarConstNodeFloat) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
   NodeDef* float_node = AddScalarConstNode<float>(3.14, &graph);
-  EXPECT_TRUE(ContainsGraphNodeWithName(float_node->name(), *graph.GetGraph()));
+  EXPECT_TRUE(ContainsGraphNodeWithName(float_node->name(), *graph.graph()));
   EXPECT_FLOAT_EQ(float_node->attr().at("value").tensor().float_val(0), 3.14);
 }
 
@@ -66,7 +65,7 @@ TEST(GraphUtilsTest, AddScalarConstNodeInt) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
   NodeDef* int_node = AddScalarConstNode<int>(42, &graph);
-  EXPECT_TRUE(ContainsGraphNodeWithName(int_node->name(), *graph.GetGraph()));
+  EXPECT_TRUE(ContainsGraphNodeWithName(int_node->name(), *graph.graph()));
   EXPECT_EQ(int_node->attr().at("value").tensor().int_val(0), 42);
 }
 
@@ -74,7 +73,7 @@ TEST(GraphUtilsTest, AddScalarConstNodeInt64) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
   NodeDef* int64_node = AddScalarConstNode<int64>(42, &graph);
-  EXPECT_TRUE(ContainsGraphNodeWithName(int64_node->name(), *graph.GetGraph()));
+  EXPECT_TRUE(ContainsGraphNodeWithName(int64_node->name(), *graph.graph()));
   EXPECT_EQ(int64_node->attr().at("value").tensor().int64_val(0), 42);
 }
 
@@ -82,8 +81,7 @@ TEST(GraphUtilsTest, AddScalarConstNodeString) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
   NodeDef* string_node = AddScalarConstNode<StringPiece>("hello", &graph);
-  EXPECT_TRUE(
-      ContainsGraphNodeWithName(string_node->name(), *graph.GetGraph()));
+  EXPECT_TRUE(ContainsGraphNodeWithName(string_node->name(), *graph.graph()));
   EXPECT_EQ(string_node->attr().at("value").tensor().string_val(0), "hello");
 }
 
@@ -106,13 +104,13 @@ TEST(GraphUtilsTest, Compare) {
 TEST(GraphUtilsTest, ContainsGraphNodeWithName) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
-  EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.GetGraph()));
+  EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.graph()));
 
   AddNode("A", "OpA", {}, {}, &graph);
-  EXPECT_TRUE(ContainsGraphNodeWithName("A", *graph.GetGraph()));
+  EXPECT_TRUE(ContainsGraphNodeWithName("A", *graph.graph()));
 
   graph.DeleteNodes({"A"});
-  EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.GetGraph()));
+  EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.graph()));
 }
 
 TEST(GraphUtilsTest, ContainsGraphFunctionWithName) {
@@ -128,25 +126,25 @@ TEST(GraphUtilsTest, ContainsGraphFunctionWithName) {
 TEST(GraphUtilsTest, ContainsNodeWithOp) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
-  EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.GetGraph()));
+  EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.graph()));
 
   AddNode("A", "OpA", {}, {}, &graph);
-  EXPECT_TRUE(ContainsNodeWithOp("OpA", *graph.GetGraph()));
+  EXPECT_TRUE(ContainsNodeWithOp("OpA", *graph.graph()));
 
   graph.DeleteNodes({"A"});
-  EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.GetGraph()));
+  EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.graph()));
 }
 
 TEST(GraphUtilsTest, FindGraphNodeWithName) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
-  EXPECT_EQ(FindGraphNodeWithName("A", *graph.GetGraph()), -1);
+  EXPECT_EQ(FindGraphNodeWithName("A", *graph.graph()), -1);
 
   AddNode("A", "OpA", {}, {}, &graph);
-  EXPECT_NE(FindGraphNodeWithName("A", *graph.GetGraph()), -1);
+  EXPECT_NE(FindGraphNodeWithName("A", *graph.graph()), -1);
 
   graph.DeleteNodes({"A"});
-  EXPECT_EQ(FindGraphNodeWithName("A", *graph.GetGraph()), -1);
+  EXPECT_EQ(FindGraphNodeWithName("A", *graph.graph()), -1);
 }
 
 TEST(GraphUtilsTest, FindGraphFunctionWithName) {
@@ -162,35 +160,35 @@ TEST(GraphUtilsTest, FindGraphFunctionWithName) {
 TEST(GraphUtilsTest, FindGraphNodeWithOp) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
-  EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.GetGraph()), -1);
+  EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.graph()), -1);
 
   AddNode("A", "OpA", {}, {}, &graph);
   AddNode("B", "OpB", {"A"}, {}, &graph);
   AddNode("A2", "OpA", {"B"}, {}, &graph);
-  EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.GetGraph()), 0);
+  EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.graph()), 0);
 
   graph.DeleteNodes({"B"});
-  EXPECT_EQ(FindGraphNodeWithOp("OpB", *graph.GetGraph()), -1);
-  EXPECT_EQ(FindGraphNodeWithName("A2", *graph.GetGraph()), 1);
+  EXPECT_EQ(FindGraphNodeWithOp("OpB", *graph.graph()), -1);
+  EXPECT_EQ(FindGraphNodeWithName("A2", *graph.graph()), 1);
 }
 
 TEST(GraphUtilsTest, FindAllGraphNodesWithOp) {
   GraphDef graph_def;
   MutableGraphView graph(&graph_def);
-  EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.GetGraph()), -1);
+  EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.graph()), -1);
 
   AddNode("A", "OpA", {}, {}, &graph);
   AddNode("B", "OpB", {"A"}, {}, &graph);
   AddNode("A2", "OpA", {"B"}, {}, &graph);
   std::vector<int> result_indices =
-      FindAllGraphNodesWithOp("OpA", *graph.GetGraph());
+      FindAllGraphNodesWithOp("OpA", *graph.graph());
   EXPECT_EQ(result_indices.size(), 2);
   EXPECT_EQ(result_indices.at(0), 0);
   EXPECT_EQ(result_indices.at(1), 2);
 
   graph.DeleteNodes({"A2"});
   std::vector<int> result_indices_new =
-      FindAllGraphNodesWithOp("OpA", *graph.GetGraph());
+      FindAllGraphNodesWithOp("OpA", *graph.graph());
   EXPECT_EQ(result_indices_new.size(), 1);
   EXPECT_EQ(result_indices_new.at(0), 0);
 }
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
index ce0b2db03963b2f1174866a255eaa8b2b8463f89..60755256d83d74287748125e18ccd8a63a1b4759 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
@@ -39,7 +39,7 @@ NodeDef MakeStatelessMap(const NodeDef& map_node, const NodeDef& zip_node,
                          const FunctionDef& stateless_function,
                          MutableGraphView* graph) {
   NodeDef stateless_map;
-  graph_utils::SetUniqueGraphNodeName("stateless_map", graph->GetGraph(),
+  graph_utils::SetUniqueGraphNodeName("stateless_map", graph->graph(),
                                       &stateless_map);
 
   stateless_map.set_op("MapDataset");
@@ -67,8 +67,8 @@ NodeDef MakeStatelessMap(const NodeDef& map_node, const NodeDef& zip_node,
 NodeDef MakeRandomDataset(const NodeDef& random_uniform_node,
                           MutableGraphView* graph) {
   NodeDef random_dataset;
-  random_dataset.set_op("RandomDataset");
-  graph_utils::SetUniqueGraphNodeName("RandomDataset", graph->GetGraph(),
+  random_dataset.set_op("ExperimentalRandomDataset");
+  graph_utils::SetUniqueGraphNodeName("RandomDataset", graph->graph(),
                                       &random_dataset);
 
   const auto* seed = graph_utils::AddScalarConstNode<int64>(
@@ -89,7 +89,7 @@ NodeDef MakeRandomDataset(const NodeDef& random_uniform_node,
 NodeDef MakeBatchTwo(const NodeDef& random_dataset, MutableGraphView* graph) {
   NodeDef batch_dataset;
   batch_dataset.set_op("BatchDatasetV2");
-  graph_utils::SetUniqueGraphNodeName("pair_of_random", graph->GetGraph(),
+  graph_utils::SetUniqueGraphNodeName("pair_of_random", graph->graph(),
                                       &batch_dataset);
   const auto* batch_size = graph_utils::AddScalarConstNode<int64>(2, graph);
   const auto* drop_reminder = graph_utils::AddScalarConstNode(false, graph);
@@ -112,7 +112,7 @@ NodeDef MakeBatchTwo(const NodeDef& random_dataset, MutableGraphView* graph) {
 NodeDef MakeZipNode(const NodeDef& first_node, const NodeDef& second_node,
                     MutableGraphView* graph) {
   NodeDef zip_node;
-  graph_utils::SetUniqueGraphNodeName("zip_with_random", graph->GetGraph(),
+  graph_utils::SetUniqueGraphNodeName("zip_with_random", graph->graph(),
                                       &zip_node);
 
   zip_node.set_op("ZipDataset");
@@ -266,7 +266,7 @@ Status HoistRandomUniform::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* stateless_map = graph.AddNode(
         MakeStatelessMap(*map_node, *zip_node, *stateless_func, &graph));
 
-    graph.ReplaceInput(*map_node, *stateless_map);
+    graph.UpdateFanouts(map_node->name(), stateless_map->name());
 
     // TODO(b/116285210): we could also remove map functions from library if
     // they are not used anymore.
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc
index 455459e3f67d9cb51bf24af24e2c73f30447b24f..b6a29a442ea3a3e62eeec8d1f571fef5225c3c80 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc
@@ -55,7 +55,7 @@ TEST(HoistRandomUniform, SimpleHoisting) {
   const int zip_dataset_id =
       graph_utils::FindGraphNodeWithOp("ZipDataset", output);
   const int random_dataset_id =
-      graph_utils::FindGraphNodeWithOp("RandomDataset", output);
+      graph_utils::FindGraphNodeWithOp("ExperimentalRandomDataset", output);
   const int batch_random_id =
       graph_utils::FindGraphNodeWithOp("BatchDatasetV2", output);
   ASSERT_NE(random_dataset_id, -1);
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
index 9e382aeef9c257ea5523658c9d3087200f99bed9..52b4b785a3d09ca7f3bec3373d9dd1c8de444a87 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
@@ -31,14 +31,13 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-constexpr char kInsertOpName[] = "LatencyStatsDataset";
+constexpr char kInsertOpName[] = "ExperimentalLatencyStatsDataset";
 
 NodeDef MakeLatencyNode(const NodeDef& node, MutableGraphView* graph) {
   NodeDef new_node;
   new_node.set_op(kInsertOpName);
   graph_utils::SetUniqueGraphNodeName(
-      strings::StrCat(kInsertOpName, "_generated"), graph->GetGraph(),
-      &new_node);
+      strings::StrCat(kInsertOpName, "_generated"), graph->graph(), &new_node);
   // Set the input of LatencyDataset node as `node`
   new_node.add_input(node.name());
 
@@ -81,7 +80,8 @@ Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item,
       // node corresponds to a `Dataset` op.
       continue;
     }
-    GraphView::OutputPort output_port = graph.GetOutputPort(node.name(), 0);
+    MutableGraphView::OutputPort output_port =
+        graph.GetOutputPort(node.name(), 0);
     auto fanout = graph.GetFanout(output_port);
     if (fanout.size() > 1) {
       LOG(WARNING) << node.name() << " has fanout size " << fanout.size();
@@ -96,7 +96,8 @@ Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item,
       }
     }
 
-    graph.InsertNode(node, MakeLatencyNode(node, &graph));
+    NodeDef* latency_node = graph.AddNode(MakeLatencyNode(node, &graph));
+    graph.UpdateFanouts(node.name(), latency_node->name());
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
index 6789cf5bd669cfa61e161397f792700098923e75..d428d04a66659cd3b961428e3762ea3ab81ad69e 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
@@ -57,9 +57,10 @@ TEST(LatencyAllEdgesTest, AddLatenciesAfterTensorMapPrefetch) {
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("LatencyStatsDataset", output));
-  std::vector<int> latency_node_indices =
-      graph_utils::FindAllGraphNodesWithOp("LatencyStatsDataset", output);
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalLatencyStatsDataset",
+                                              output));
+  std::vector<int> latency_node_indices = graph_utils::FindAllGraphNodesWithOp(
+      "ExperimentalLatencyStatsDataset", output);
   EXPECT_EQ(latency_node_indices.size(), 3);
   std::vector<NodeDef> dataset_nodes = {std::move(from_tensor_node),
                                         std::move(map_node),
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72c27a1d4afb8f3766a1f7c56ade37b1e161a039
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/make_numa_aware.h"
+
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+NodeDef MakeNumaAwareNode(const NodeDef& node, MutableGraphView* graph) {
+  NodeDef numa_aware_node = node;
+  graph_utils::SetUniqueGraphNodeName("make_numa_aware", graph->graph(),
+                                      &numa_aware_node);
+  numa_aware_node.set_op("ExperimentalNumaMapAndBatchDataset");
+  return numa_aware_node;
+}
+
+}  // namespace
+
+Status MakeNumaAware::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* output) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+  std::set<string> nodes_to_delete;
+
+  for (const NodeDef& node : item.graph.node()) {
+    if (node.op() != "ExperimentalMapAndBatchDataset") continue;
+
+    auto* numa_node = graph.AddNode(MakeNumaAwareNode(node, &graph));
+    graph.UpdateFanouts(node.name(), numa_node->name());
+    nodes_to_delete.insert(node.name());
+  }
+  graph.DeleteNodes(nodes_to_delete);
+  return Status::OK();
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(MakeNumaAware, "make_numa_aware");
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.h b/tensorflow/core/grappler/optimizers/data/make_numa_aware.h
new file mode 100644
index 0000000000000000000000000000000000000000..48a7d8145f0157c6cea1633edb68d9ee3ee08de1
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class MakeNumaAware : public CustomGraphOptimizer {
+ public:
+  MakeNumaAware() = default;
+  ~MakeNumaAware() override = default;
+
+  string name() const override { return "make_numa_aware"; }
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override {}
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b83fb6ef19f8ee241dd4f7b635c9672ef01bcc0
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
@@ -0,0 +1,114 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/make_numa_aware.h"
+
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_test_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+TEST(MakeNumaAwareTest, ReplaceSimple) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {
+          NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+          NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+          NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+          NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+          NDef("batch_size", "Const", {}, {{"value", 3}, {"dtype", DT_INT32}}),
+          NDef("num_parallel_calls", "Const", {},
+               {{"value", 5}, {"dtype", DT_INT32}}),
+          NDef("drop_remainder", "Const", {},
+               {{"value", 0}, {"dtype", DT_BOOL}}),
+          graph_tests_utils::MakeMapAndBatchNode(
+              "map_and_batch", "range", "batch_size", "num_parallel_calls",
+              "drop_remainder"),
+      },
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+
+  MakeNumaAware optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map_and_batch", output));
+  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                               output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp(
+      "ExperimentalNumaMapAndBatchDataset", output));
+}
+
+TEST(MapAndBatchNumaAawareReplacementTest, ReplaceWithExtraChild) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {
+          NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+          NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+          NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+          NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+          NDef("batch_size", "Const", {}, {{"value", 3}, {"dtype", DT_INT32}}),
+          NDef("num_parallel_calls", "Const", {},
+               {{"value", 5}, {"dtype", DT_INT32}}),
+          NDef("drop_remainder", "Const", {},
+               {{"value", 0}, {"dtype", DT_BOOL}}),
+          graph_tests_utils::MakeMapAndBatchNode(
+              "map_and_batch", "range", "batch_size", "num_parallel_calls",
+              "drop_remainder"),
+          NDef("cache", "CacheDataset", {"map_and_batch"}, {}),
+      },
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+
+  MakeNumaAware optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map_and_batch", output));
+  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                               output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp(
+      "ExperimentalNumaMapAndBatchDataset", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("CacheDataset", output));
+
+  int numa_map_and_batch_component_id = graph_utils::FindGraphNodeWithOp(
+      "ExperimentalNumaMapAndBatchDataset", output);
+  auto& numa_map_and_batch_component =
+      output.node(numa_map_and_batch_component_id);
+  EXPECT_EQ(numa_map_and_batch_component.input(0), "range");
+
+  int cache_id = graph_utils::FindGraphNodeWithOp("CacheDataset", output);
+  auto& cache_node = output.node(cache_id);
+  EXPECT_EQ(cache_node.input(0), numa_map_and_batch_component.name());
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/make_sloppy.cc b/tensorflow/core/grappler/optimizers/data/make_sloppy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1cfaef3ffb270cc338aaaef601f5f6037740112e
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/make_sloppy.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/make_sloppy.h"
+
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status MakeSloppy::Optimize(Cluster* cluster, const GrapplerItem& item,
+                            GraphDef* output) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+
+  for (NodeDef& node : *output->mutable_node()) {
+    if (node.op() == "ParallelInterleaveDatasetV2" ||
+        node.op() == "ParallelMapDataset" ||
+        node.op() == "ParseExampleDataset") {
+      (*node.mutable_attr())["sloppy"].set_b(true);
+    }
+  }
+  return Status::OK();
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(MakeSloppy, "make_sloppy");
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/make_sloppy.h b/tensorflow/core/grappler/optimizers/data/make_sloppy.h
new file mode 100644
index 0000000000000000000000000000000000000000..9dcab1038de3f6c39c4db4954903465bc0a6146d
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/make_sloppy.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class MakeSloppy : public CustomGraphOptimizer {
+ public:
+  MakeSloppy() = default;
+  ~MakeSloppy() override = default;
+
+  string name() const override { return "make_sloppy"; }
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override {}
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_
diff --git a/tensorflow/core/grappler/optimizers/data/make_sloppy_test.cc b/tensorflow/core/grappler/optimizers/data/make_sloppy_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..24431f47441369b27b742105cfe21f505df89ec8
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/make_sloppy_test.cc
@@ -0,0 +1,114 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/make_sloppy.h"
+
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_test_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+using graph_tests_utils::MakeParallelInterleaveNode;
+using graph_tests_utils::MakeParallelMapNode;
+using graph_tests_utils::MakeParseExampleNode;
+
+TEST(MakeSloppy, ParallelInterleave) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       NDef("cycle_length", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("block_length", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("num_parallel_calls", "Const", {},
+            {{"value", 1}, {"dtype", DT_INT32}}),
+       MakeParallelInterleaveNode("interleave", "range", "cycle_length",
+                                  "block_length", "num_parallel_calls",
+                                  "XTimesTwo", /*sloppy=*/false)},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+
+  MakeSloppy optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  EXPECT_TRUE(graph_utils::ContainsGraphNodeWithName("interleave", output));
+  int index = graph_utils::FindGraphNodeWithName("interleave", output);
+  EXPECT_TRUE(output.node(index).attr().at("sloppy").b());
+}
+
+TEST(MakeSloppy, ParallelMap) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       NDef("num_parallel_calls", "Const", {},
+            {{"value", 1}, {"dtype", DT_INT32}}),
+       MakeParallelMapNode("map", "range", "num_parallel_calls", "XTimesTwo",
+                           /*sloppy=*/false)},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+
+  MakeSloppy optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  EXPECT_TRUE(graph_utils::ContainsGraphNodeWithName("map", output));
+  int index = graph_utils::FindGraphNodeWithName("map", output);
+  EXPECT_TRUE(output.node(index).attr().at("sloppy").b());
+}
+
+TEST(MakeSloppy, ParseExampleDataset) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       NDef("num_parallel_calls", "Const", {},
+            {{"value", 1}, {"dtype", DT_INT32}}),
+       MakeParseExampleNode("parse_example", "range", "num_parallel_calls",
+                            /*sloppy=*/false)},
+      // FunctionLib
+      {});
+
+  MakeSloppy optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  EXPECT_TRUE(graph_utils::ContainsGraphNodeWithName("parse_example", output));
+  int index = graph_utils::FindGraphNodeWithName("parse_example", output);
+  EXPECT_TRUE(output.node(index).attr().at("sloppy").b());
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index e66766eb23bd53872c559a51e91cc27bbe4f7f47..84c4d82f6a38dd81e88374c6ce6a7a6082451a38 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -30,14 +30,13 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-constexpr char kFusedOpName[] = "MapAndBatchDatasetV2";
+constexpr char kFusedOpName[] = "ExperimentalMapAndBatchDataset";
 
 NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node,
                             MutableGraphView* graph) {
   NodeDef new_node;
   new_node.set_op(kFusedOpName);
-  graph_utils::SetUniqueGraphNodeName(kFusedOpName, graph->GetGraph(),
-                                      &new_node);
+  graph_utils::SetUniqueGraphNodeName(kFusedOpName, graph->graph(), &new_node);
 
   // Set the `input` input argument.
   new_node.add_input(map_node.input(0));
@@ -78,15 +77,22 @@ NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node,
     new_node.add_input(tmp->name());
   }
 
-  // Set `f` and `Targuments` attributes.
+  // Required attributes.
   for (auto key : {"f", "Targuments"}) {
     graph_utils::CopyAttribute(key, map_node, &new_node);
   }
-
-  // Set `output_types` and `output_shapes` attributes.
   for (auto key : {"output_shapes", "output_types"}) {
     graph_utils::CopyAttribute(key, batch_node, &new_node);
   }
+
+  // Optional attributes.
+  // TODO(jsimsa): Support `use_inter_op_parallelism` and `sloppy`.
+  for (auto key : {"preserve_cardinality"}) {
+    if (gtl::FindOrNull(map_node.attr(), key)) {
+      graph_utils::CopyAttribute(key, map_node, &new_node);
+    }
+  }
+
   return new_node;
 }
 
@@ -114,7 +120,7 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     auto* new_node =
         graph.AddNode(MakeMapAndBatchNode(*map_node, batch_node, &graph));
-    graph.ReplaceInput(batch_node, *new_node);
+    graph.UpdateFanouts(batch_node.name(), new_node->name());
 
     // Mark the `Map` and `Batch` nodes for removal.
     nodes_to_delete.insert(map_node->name());
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
index b676246b318d5ba0997722f12f38a61347607873..ef4e64826f030ae404a0a523ad5f09bbf7e325a4 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
@@ -84,9 +84,10 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
       graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node = output.node(
-      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                              output));
+  NodeDef map_and_batch_node = output.node(graph_utils::FindGraphNodeWithOp(
+      "ExperimentalMapAndBatchDataset", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
@@ -169,9 +170,10 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
       graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node = output.node(
-      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                              output));
+  NodeDef map_and_batch_node = output.node(graph_utils::FindGraphNodeWithOp(
+      "ExperimentalMapAndBatchDataset", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
@@ -252,9 +254,10 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
       graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node = output.node(
-      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                              output));
+  NodeDef map_and_batch_node = output.node(graph_utils::FindGraphNodeWithOp(
+      "ExperimentalMapAndBatchDataset", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
@@ -309,7 +312,7 @@ TEST(MapAndBatchFusionTest, NoChange) {
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_TRUE(graph_utils::Compare(*graph.GetGraph(), output));
+  EXPECT_TRUE(graph_utils::Compare(*graph.graph(), output));
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
index c4868eacbbf6d4ca0d99b631c1d0cf1f304a3e94..233d7968c8965a5ec2389aa297da72a9708b9257 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
@@ -37,23 +37,30 @@ NodeDef MakeFusedNode(const NodeDef& map_node,
                       const FunctionDef& fused_function,
                       MutableGraphView* graph) {
   NodeDef fused_node;
-  graph_utils::SetUniqueGraphNodeName("fused_map", graph->GetGraph(),
-                                      &fused_node);
-  fused_node.set_op("MapDataset");
-  fused_node.add_input(map_node.input(0));
+  graph_utils::SetUniqueGraphNodeName("fused_map", graph->graph(), &fused_node);
+  fused_node.set_op(map_node.op());
+
+  // Copy over inputs.
+  for (int i = 0; i < map_node.input_size(); ++i) {
+    fused_node.add_input(map_node.input(i));
+  }
 
   auto attr = map_node.attr().at("f");
   attr.mutable_func()->set_name(fused_function.signature().name());
   (*fused_node.mutable_attr())["f"] = std::move(attr);
 
-  graph_utils::CopyAttribute("Targuments", map_node, &fused_node);
-
-  for (auto key : {"output_shapes", "output_types"})
+  // Required attrs.
+  for (auto key : {"Targuments", "output_shapes", "output_types"}) {
     graph_utils::CopyAttribute(key, map_node, &fused_node);
+  }
 
-  if (const auto* attr =
-          gtl::FindOrNull(map_node.attr(), "use_inter_op_parallelism"))
-    (*fused_node.mutable_attr())["use_inter_op_parallelism"] = *attr;
+  // Optional attrs.
+  for (auto key :
+       {"use_inter_op_parallelism", "sloppy", "preserve_cardinality"}) {
+    if (gtl::FindOrNull(map_node.attr(), key)) {
+      graph_utils::CopyAttribute(key, map_node, &fused_node);
+    }
+  }
 
   // Add the predicate output attributes.
   (*fused_node.mutable_attr())["output_types"]
@@ -72,8 +79,8 @@ NodeDef MakeFilterByLastComponentNode(const NodeDef& fused_map_node,
                                       const NodeDef& filter_node,
                                       MutableGraphView* graph) {
   NodeDef filter_by_component;
-  graph_utils::SetUniqueGraphNodeName("FilterByLastComponent",
-                                      graph->GetGraph(), &filter_by_component);
+  graph_utils::SetUniqueGraphNodeName("FilterByLastComponent", graph->graph(),
+                                      &filter_by_component);
   filter_by_component.set_op("FilterByLastComponentDataset");
   filter_by_component.add_input(fused_map_node.name());
 
@@ -98,7 +105,9 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
   auto get_map_node = [](const NodeDef& node) -> const NodeDef* {
-    if (node.op() == "MapDataset") return &node;
+    if (node.op() == "MapDataset" || node.op() == "ParallelMapDataset") {
+      return &node;
+    }
     return nullptr;
   };
 
@@ -146,7 +155,7 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* filter_by_component = graph.AddNode(
         MakeFilterByLastComponentNode(*fused_maps, *filter_node, &graph));
 
-    graph.ReplaceInput(*filter_node, *filter_by_component);
+    graph.UpdateFanouts(filter_node->name(), filter_by_component->name());
     TF_RETURN_IF_ERROR(function_library.AddFunctionDef(*fused_function));
 
     // TODO(prazek): we could also remove functions from library if they are not
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc
index 6e6da37d7c20dee92bfe3676fa838ce82dd9222f..c5a5e22aba6cd2af4b2de9fa516e49b00e6e0c12 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion_test.cc
@@ -30,6 +30,7 @@ namespace grappler {
 namespace {
 using graph_tests_utils::MakeFilterNode;
 using graph_tests_utils::MakeMapNode;
+using graph_tests_utils::MakeParallelMapNode;
 
 TEST(MapAndFilterFusionTest, FuseMapAndFilter) {
   using test::function::NDef;
@@ -58,6 +59,41 @@ TEST(MapAndFilterFusionTest, FuseMapAndFilter) {
       graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output));
 }
 
+TEST(MapAndFilterFusionTest, FuseParallelMapAndFilter) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       NDef("num_parallel_calls", "Const", {},
+            {{"value", 3}, {"dtype", "DT_INT32"}}),
+       MakeParallelMapNode("map", "range", "num_parallel_calls", "XTimesTwo",
+                           /*sloppy=*/false),
+       MakeFilterNode("filter", "map")},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+          test::function::IsZero(),
+      });
+
+  MapAndFilterFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ParallelMapDataset", output))
+      << output.DebugString();
+  auto& map_node = output.node(
+      graph_utils::FindGraphNodeWithOp("ParallelMapDataset", output));
+  EXPECT_FALSE(map_node.attr().at("sloppy").b()) << map_node.DebugString();
+  EXPECT_TRUE(
+      graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output))
+      << output.DebugString();
+}
+
 TEST(MapAndFilterFusionTest, FuseMapAndFilterWithExtraChild) {
   using test::function::NDef;
   GrapplerItem item;
@@ -103,6 +139,56 @@ TEST(MapAndFilterFusionTest, FuseMapAndFilterWithExtraChild) {
   EXPECT_EQ(cache_node.input(0), filter_by_component.name());
 }
 
+TEST(MapAndFilterFusionTest, FuseParallelMapAndFilterWithExtraChild) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("filename", "Const", {}, {{"value", ""}, {"dtype", DT_STRING}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       NDef("num_parallel_calls", "Const", {},
+            {{"value", 3}, {"dtype", "DT_INT32"}}),
+       MakeParallelMapNode("map", "range", "num_parallel_calls", "XTimesTwo",
+                           /*sloppy=*/true),
+       MakeFilterNode("filter", "map"),
+       NDef("cache", "CacheDataset", {"filter", "filename"}, {})},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+          test::function::IsZero(),
+      });
+
+  MapAndFilterFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("filter", output));
+  ASSERT_TRUE(graph_utils::ContainsNodeWithOp("ParallelMapDataset", output));
+  ASSERT_TRUE(
+      graph_utils::ContainsNodeWithOp("FilterByLastComponentDataset", output));
+  ASSERT_TRUE(graph_utils::ContainsNodeWithOp("CacheDataset", output));
+
+  int map_id = graph_utils::FindGraphNodeWithOp("ParallelMapDataset", output);
+  auto& map_node = output.node(map_id);
+  ASSERT_EQ(map_node.input_size(), 2);
+  EXPECT_EQ(map_node.input(0), "range");
+  EXPECT_EQ(map_node.input(1), "num_parallel_calls");
+
+  int filter_by_component_id =
+      graph_utils::FindGraphNodeWithOp("FilterByLastComponentDataset", output);
+  auto& filter_by_component = output.node(filter_by_component_id);
+  ASSERT_EQ(filter_by_component.input_size(), 1);
+  EXPECT_EQ(filter_by_component.input(0), map_node.name());
+
+  int cache_id = graph_utils::FindGraphNodeWithOp("CacheDataset", output);
+  auto& cache_node = output.node(cache_id);
+  ASSERT_EQ(cache_node.input_size(), 2);
+  EXPECT_EQ(cache_node.input(0), filter_by_component.name());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
index bd943342e8009b643b96886b415ec74dbec2517b..6b8015f96a29ac2fa2de3871a678a1b82efb12ff 100644
--- a/tensorflow/core/grappler/optimizers/data/map_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
@@ -39,8 +39,7 @@ NodeDef MakeFusedNode(const NodeDef& parent_map_node, const NodeDef& map_node,
                       const FunctionDef& fused_function,
                       MutableGraphView* graph) {
   NodeDef fused_node;
-  graph_utils::SetUniqueGraphNodeName("fused_map", graph->GetGraph(),
-                                      &fused_node);
+  graph_utils::SetUniqueGraphNodeName("fused_map", graph->graph(), &fused_node);
   fused_node.set_op("MapDataset");
   fused_node.add_input(parent_map_node.input(0));
 
@@ -63,9 +62,16 @@ NodeDef MakeFusedNode(const NodeDef& parent_map_node, const NodeDef& map_node,
       gtl::FindOrNull(map_node.attr(), "use_inter_op_parallelism");
   // Some graphs cannot execute with use_inter_op_parallelism=False, so we need
   // to set it to true if one of the ops have it set to true.
-  if (value_or_false(first_parallelism) || value_or_false(second_parallelism)) {
-    (*fused_node.mutable_attr())["use_inter_op_parallelism"].set_b(true);
-  }
+  (*fused_node.mutable_attr())["use_inter_op_parallelism"].set_b(
+      value_or_false(first_parallelism) || value_or_false(second_parallelism));
+
+  const auto* first_cardinality =
+      gtl::FindOrNull(parent_map_node.attr(), "preserve_cardinality");
+  const auto* second_cardinality =
+      gtl::FindOrNull(map_node.attr(), "preserve_cardinality");
+  (*fused_node.mutable_attr())["preserve_cardinality"].set_b(
+      value_or_false(first_cardinality) && value_or_false(second_cardinality));
+
   return fused_node;
 }
 
@@ -124,7 +130,7 @@ Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* fused_maps_node = graph.AddNode(
         MakeFusedNode(*parent_map_node, *map_node, *fused_function, &graph));
 
-    graph.ReplaceInput(*map_node, *fused_maps_node);
+    graph.UpdateFanouts(map_node->name(), fused_maps_node->name());
 
     // TODO(prazek): we should run some optimizations on the fused map
     // functions, or make sure that optimization passes run after map
diff --git a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
index 782c9f48b74b94616cfeada505d7cde5edd88ce6..8e49f908a77288c8e99b62706578d86a272ab682 100644
--- a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
@@ -47,7 +47,7 @@ bool CanParallelize(const FunctionDef& function,
 
 NodeDef MakeParallelMap(const NodeDef& map_node, MutableGraphView* graph) {
   NodeDef parallel_map = map_node;
-  graph_utils::SetUniqueGraphNodeName("parallel_map", graph->GetGraph(),
+  graph_utils::SetUniqueGraphNodeName("parallel_map", graph->graph(),
                                       &parallel_map);
   parallel_map.set_op("ParallelMapDataset");
   // TODO(b/114475558): We want to set `num_parallel_calls` to a special value,
@@ -83,7 +83,7 @@ Status MapParallelization::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (!CanParallelize(*function, function_library)) continue;
 
     auto* parallel_map = graph.AddNode(MakeParallelMap(*map_node, &graph));
-    graph.ReplaceInput(*map_node, *parallel_map);
+    graph.UpdateFanouts(map_node->name(), parallel_map->name());
     nodes_to_delete.insert(map_node->name());
   }
 
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
index a9254ed58baff6fe180f88d16d7f48479eef495a..3401dcc6f23bae1b2e77d5ea18a94f382fee4fb8 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
@@ -60,14 +60,24 @@ FunctionDef* CreateMapDefunWrapper(const NodeDef& map_node,
     graph_utils::CopyAttribute(k, map_node, map_defun_node);
   }
 
+  // Note that the inputs to the function are either regular arguments (for
+  // which the function is mapped across their 0th dimension) or captured inputs
+  // (for which the function takes the argument wholesale). We can infer
+  // the split between these arguments from the `map_node`'s attrs.
+  // The Targuments attr on `map_node` corresponds to a list of types of
+  // MapDataset's captured inputs.
+  auto t_captured = map_node.attr().at("Targuments");
+
   // Get types of input arguments from original map function
-  AttrValue t_args;
+  DataTypeVector t_args;  // Regular arguments
   for (const auto& input : vectorized_func->signature().input_arg()) {
-    t_args.mutable_list()->add_type(input.type());
+    t_args.push_back(input.type());
     map_defun_node->add_input(input.name());
   }
-  (*map_defun_node->mutable_attr())["Targuments"] = t_args;
-  AddNodeAttr("Tcaptured", DataTypeVector(), map_defun_node);
+  // Erase the captured arguments from Targuments
+  t_args.erase(t_args.end() - t_captured.list().type_size(), t_args.end());
+  AddNodeAttr("Targuments", t_args, map_defun_node);
+  AddNodeAttr("Tcaptured", t_captured, map_defun_node);
 
   // Set return values to match output names
   string output_prefix = strings::StrCat(map_defun_node->name(), ":output:");
@@ -96,7 +106,9 @@ FunctionDef* AddVectorizedFunction(const NodeDef& map_node,
       *vectorized_func, map_defun_node, library, &result);
 
   if (!s.ok()) {
-    LOG(ERROR) << "VectorizeMapDefun failed: " << s;
+    LOG(WARNING) << "VectorizeMapDefun failed. The function will only be "
+                    "naively vectorized with MapDefun. Reason: "
+                 << s;
     return vectorized_func;
   }
   return result;
@@ -129,17 +141,13 @@ bool IsStatefulFn(const FunctionLibraryDefinition& library,
   return false;
 }
 
-bool HasCapturedInputs(const NodeDef& map_node) {
-  return map_node.attr().at("Targuments").list().type_size() > 0;
-}
-
 NodeDef MakeNewBatchNode(const NodeDef& old_batch_node,
                          const NodeDef& input_node,
                          const FunctionDef& vectorized_func,
                          MutableGraphView* graph) {
   NodeDef batch_node;
   batch_node.set_op(old_batch_node.op());
-  graph_utils::SetUniqueGraphNodeName(batch_node.op(), graph->GetGraph(),
+  graph_utils::SetUniqueGraphNodeName(batch_node.op(), graph->graph(),
                                       &batch_node);
 
   // Set the `input_dataset` input argument
@@ -179,8 +187,7 @@ NodeDef MakeNewMapNode(const NodeDef& old_map_node,
                        MutableGraphView* graph) {
   NodeDef map_node;
   map_node.set_op(old_map_node.op());
-  graph_utils::SetUniqueGraphNodeName(map_node.op(), graph->GetGraph(),
-                                      &map_node);
+  graph_utils::SetUniqueGraphNodeName(map_node.op(), graph->graph(), &map_node);
 
   // Set the `input_dataset` input argument
   map_node.add_input(new_batch_node.name());
@@ -239,15 +246,12 @@ Status MapVectorization::Optimize(Cluster* cluster, const GrapplerItem& item,
     // Check that this is a valid optimization.
     if (!IsOutputShapesFullyDefined(*input_node) ||
         !IsOutputShapesFullyDefined(*map_node) ||
-        IsStatefulFn(function_library, *orig_func) ||
-        HasCapturedInputs(*map_node)) {
+        IsStatefulFn(function_library, *orig_func)) {
       // 1. If any of the inputs have an unknown shape, don't optimize, since
       // inputs might not be batchable.
       // 2. If any of the map func outputs have an unknown shape, don't
       // optimize, so that batching errors surface as before.
       // 3. If the function is stateful, don't vectorize it.
-      // 4. TODO(rachelim): Make this work for MapDataset with captured inputs
-      // by tiling inputs or modifying the signature of MapDefun.
       continue;
     }
 
@@ -260,7 +264,7 @@ Status MapVectorization::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     auto* new_map_node = graph.AddNode(MakeNewMapNode(
         *map_node, batch_node, *new_batch_node, *vectorized_func, &graph));
-    graph.ReplaceInput(batch_node, *new_map_node);
+    graph.UpdateFanouts(batch_node.name(), new_map_node->name());
 
     // Mark the `Map` and `Batch` nodes for removal.
     nodes_to_delete.insert(map_node->name());
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
index cf5a19bab11cee0a166b9967478d703a4a90d130..bd405c8329464793ee42757bc7ee1a3f34826bd9 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
@@ -30,7 +30,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-bool IsTakeAll(const NodeDef& take_node, const GraphView& graph) {
+bool IsTakeAll(const NodeDef& take_node, const MutableGraphView& graph) {
   if (take_node.op() != "TakeDataset") return false;
 
   const auto& count_node = *graph.GetNode(take_node.input(1));
@@ -44,21 +44,28 @@ bool IsConstNodeWithValue(const NodeDef& node, int value) {
   return node.attr().at("value").tensor().int64_val(0) == value;
 }
 
-bool IsSkipNone(const NodeDef& skip_node, const GraphView& graph) {
+bool IsSkipNone(const NodeDef& skip_node, const MutableGraphView& graph) {
   if (skip_node.op() != "SkipDataset") return false;
   // We are looking only for skip(0) nodes.
   return IsConstNodeWithValue(*graph.GetNode(skip_node.input(1)), 0);
 }
 
-bool IsRepeatOne(const NodeDef& repeat_node, const GraphView& graph) {
+bool IsRepeatOne(const NodeDef& repeat_node, const MutableGraphView& graph) {
   if (repeat_node.op() != "RepeatDataset") return false;
   // We are looking only for repeat(1) nodes.
   return IsConstNodeWithValue(*graph.GetNode(repeat_node.input(1)), 1);
 }
 
-bool IsNoOp(const NodeDef& node, const GraphView& graph) {
+bool IsPrefetchZero(const NodeDef& prefetch_node,
+                    const MutableGraphView& graph) {
+  if (prefetch_node.op() != "PrefetchDataset") return false;
+  // We are looking only for prefetch(0) nodes.
+  return IsConstNodeWithValue(*graph.GetNode(prefetch_node.input(1)), 0);
+}
+
+bool IsNoOp(const NodeDef& node, const MutableGraphView& graph) {
   return IsTakeAll(node, graph) || IsSkipNone(node, graph) ||
-         IsRepeatOne(node, graph);
+         IsRepeatOne(node, graph) || IsPrefetchZero(node, graph);
 }
 
 }  // namespace
@@ -72,7 +79,7 @@ Status NoOpElimination::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (!IsNoOp(node, graph)) continue;
 
     NodeDef* const parent = graph_utils::GetInputNode(node, graph);
-    graph.ReplaceInput(node, *parent);
+    graph.UpdateFanouts(node.name(), parent->name());
 
     nodes_to_delete.insert(node.name());
   }
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.h b/tensorflow/core/grappler/optimizers/data/noop_elimination.h
index c67cea49d50ced7c95ccf51b47b678e85701c3af..a65fccd882b782d4c6ead5ef9cb15e2cebd05e6f 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.h
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.h
@@ -22,7 +22,7 @@ namespace tensorflow {
 namespace grappler {
 
 // This class eliminates tf.data transformations such as `take(n)` (for n < 0),
-// `skip(0)`, or `repeat(1)`
+// `skip(0)`, `repeat(1)`, or `prefetch(0)`.
 class NoOpElimination : public CustomGraphOptimizer {
  public:
   NoOpElimination() = default;
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc
index be1a66df75d2cd24c719f931e3884cb462389ff1..323bb1d599f36ed21f5a0ddf8649f53fdf980ef8 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc
@@ -106,6 +106,8 @@ INSTANTIATE_TEST_CASE_P(
                       std::make_tuple("SkipDataset", -1, true),
                       std::make_tuple("SkipDataset", 0, false),
                       std::make_tuple("SkipDataset", 3, true),
+                      std::make_tuple("PrefetchDataset", 0, false),
+                      std::make_tuple("PrefetchDataset", 1, true),
                       std::make_tuple("RepeatDataset", 1, false),
                       std::make_tuple("RepeatDataset", 2, true)));
 
@@ -154,6 +156,8 @@ INSTANTIATE_TEST_CASE_P(
                       std::make_tuple("SkipDataset", -1, true),
                       std::make_tuple("SkipDataset", 0, false),
                       std::make_tuple("SkipDataset", 3, true),
+                      std::make_tuple("PrefetchDataset", 0, false),
+                      std::make_tuple("PrefetchDataset", 1, true),
                       std::make_tuple("RepeatDataset", 1, false),
                       std::make_tuple("RepeatDataset", 2, true)));
 
@@ -206,12 +210,15 @@ TEST_P(NoOpMultipleEliminationTest, EliminateMultipleNoOpNode) {
 const auto *const kTakeNode = new std::pair<string, int>{"TakeDataset", -1};
 const auto *const kSkipNode = new std::pair<string, int>{"SkipDataset", 0};
 const auto *const kRepeatNode = new std::pair<string, int>{"RepeatDataset", 1};
+const auto *const kPrefetchNode =
+    new std::pair<string, int>{"PrefetchDataset", 0};
 
 INSTANTIATE_TEST_CASE_P(
     BasicRemovalTest, NoOpMultipleEliminationTest,
-    ::testing::Combine(::testing::Values(*kTakeNode, *kSkipNode, *kRepeatNode),
-                       ::testing::Values(*kTakeNode, *kSkipNode,
-                                         *kRepeatNode)));
+    ::testing::Combine(::testing::Values(*kTakeNode, *kSkipNode, *kRepeatNode,
+                                         *kPrefetchNode),
+                       ::testing::Values(*kTakeNode, *kSkipNode, *kRepeatNode,
+                                         *kPrefetchNode)));
 
 struct NoOpPlaceholdersTest
     : ::testing::TestWithParam<std::tuple<string, string>> {};
@@ -244,9 +251,10 @@ TEST_P(NoOpPlaceholdersTest, NonConstNoOpNode) {
 
 INSTANTIATE_TEST_CASE_P(
     DoNotRemovePlaceholders, NoOpPlaceholdersTest,
-    ::testing::Combine(
-        ::testing::Values("TakeDataset", "SkipDataset", "RepeatDataset"),
-        ::testing::Values("TakeDataset", "SkipDataset", "RepeatDataset")));
+    ::testing::Combine(::testing::Values("TakeDataset", "SkipDataset",
+                                         "RepeatDataset", "PrefetchDataset"),
+                       ::testing::Values("TakeDataset", "SkipDataset",
+                                         "RepeatDataset", "PrefetchDataset")));
 
 }  // namespace
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
index 99c4afa6340094991ffa9646710a7febc66f7d64..d9af78d38cd590f5eecefe4d70c7e45dd94985c0 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -86,7 +86,7 @@ Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
 
     NodeDef* shuffle_and_repeat_node =
         graph.AddNode(make_shuffle_and_repeat_node(shuffle_node, repeat_node));
-    graph.ReplaceInput(repeat_node, *shuffle_and_repeat_node);
+    graph.UpdateFanouts(repeat_node.name(), shuffle_and_repeat_node->name());
 
     // Mark the `Shuffle` and `Repeat` nodes for removal.
     nodes_to_delete.insert(shuffle_node.name());
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
index f0696eb76d02cc11346da44d70fd86b3ce1a9cbb..556e1d3ab57947f122e383eb32342caf1a0924cb 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
@@ -127,7 +127,7 @@ TEST(ShuffleAndRepeatFusionTest, NoChange) {
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_TRUE(graph_utils::Compare(*graph.GetGraph(), output));
+  EXPECT_TRUE(graph_utils::Compare(*graph.graph(), output));
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
index 37aa24b947199a20cdd6f5f3b56bd6166840433d..541302361fb07066127196166750e6f5324b7d98 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/BUILD
@@ -9,14 +9,31 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
 
 VECTORIZER_DEPS = [
     ":vectorizer_registry",
+    "//tensorflow/cc:ops",
     "//tensorflow/core/grappler/optimizers/data:graph_utils",
+    "//tensorflow/core:core_cpu",
+    "//tensorflow/cc:scope_internal",
+    "//tensorflow/cc:math_ops",
+    "//tensorflow/cc:array_ops",
+    "//tensorflow/cc:const_op",
 ] + tf_protos_all()
 
+cc_library(
+    name = "wrapped_tensor",
+    hdrs = ["wrapped_tensor.h"],
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "vectorizer",
     hdrs = ["vectorizer.h"],
     deps = [
+        ":wrapped_tensor",
         "//tensorflow/core:core_cpu",
+        "//tensorflow/cc:ops",
         "//tensorflow/core:lib",
     ] + tf_protos_all(),
 )
@@ -29,16 +46,61 @@ cc_library(
         ":vectorizer",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
+tf_cc_test(
+    name = "vectorizer_registry_test",
+    srcs = ["vectorizer_registry_test.cc"],
+    deps = [
+        ":vectorizer_registry",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + tf_protos_all(),
+)
+
 cc_library(
-    name = "cast_vectorizer",
-    srcs = ["cast_vectorizer.cc"],
+    name = "cwise_op_vectorizer",
+    srcs = ["cwise_op_vectorizer.cc"],
     deps = VECTORIZER_DEPS,
     alwayslink = 1,
 )
 
+cc_library(
+    name = "decode_csv_vectorizer",
+    srcs = ["decode_csv_vectorizer.cc"],
+    deps = VECTORIZER_DEPS,
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "parse_single_example_vectorizer",
+    srcs = ["parse_single_example_vectorizer.cc"],
+    deps = VECTORIZER_DEPS,
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "reshape_vectorizer",
+    srcs = ["reshape_vectorizer.cc"],
+    deps = VECTORIZER_DEPS,
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "transpose_vectorizer",
+    srcs = ["transpose_vectorizer.cc"],
+    deps = VECTORIZER_DEPS + [
+        ":vectorizer",
+        ":wrapped_tensor",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "unpack_vectorizer",
     srcs = ["unpack_vectorizer.cc"],
@@ -51,20 +113,14 @@ cc_library(
     hdrs = ["vectorizer_registry.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":cast_vectorizer",
+        ":cwise_op_vectorizer",
+        ":decode_csv_vectorizer",
+        ":parse_single_example_vectorizer",
+        ":reshape_vectorizer",
+        ":transpose_vectorizer",
         ":unpack_vectorizer",
         ":vectorizer",
         ":vectorizer_registry",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
-
-tf_cc_test(
-    name = "vectorizer_registry_test",
-    srcs = ["vectorizer_registry_test.cc"],
-    deps = [
-        ":vectorizer_registry",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ] + tf_protos_all(),
-)
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc
deleted file mode 100644
index 3af6bab40906be175d6de6f3115c0206873bb5ee..0000000000000000000000000000000000000000
--- a/tensorflow/core/grappler/optimizers/data/vectorization/cast_vectorizer.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
-#include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
-
-namespace tensorflow {
-namespace grappler {
-namespace vectorization_utils {
-
-class CastVectorizer : public Vectorizer {
- public:
-  Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<Port>* input_ports,
-                   std::vector<Port>* output_ports) override {
-    Status s;
-    if (node.num_inputs() != 1) {
-      return errors::Internal("Cast op should only have one input.");
-    }
-
-    // Add new Cast node with the same op and attrs as the original node
-    auto new_cast_node = outer_scope->AddNode(node.def(), &s);
-    TF_RETURN_IF_ERROR(s);
-
-    // Add input and output mappings
-    input_ports->push_back({new_cast_node, 0});
-    output_ports->push_back({new_cast_node, 0});
-    return Status::OK();
-  }
-};
-
-REGISTER_VECTORIZER("Cast", CastVectorizer);
-
-}  // namespace vectorization_utils
-}  // namespace grappler
-}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9d853f84a8a7bad557452f3cbd14db05bef58bf1
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/cwise_op_vectorizer.cc
@@ -0,0 +1,272 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope_internal.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+const char* const kExpandDimsPrefix = "vectorized/expanddims/";
+
+// Reshapes stacked inputs for broadcast. Stacked inputs have an extra leading
+// dimension, which may cause automatic broadcasting rules to expand the
+// input dimensions wrongly when the unstacked shapes have different ranks.
+// To avoid that, we reshape stacked inputs to the maximum rank they need
+// to be broadcasted to.
+//
+// For example, suppose we have inputs A and B, where A is a stacked tensor with
+// shape [n, 5] (where n is the stack size) and B is an unstacked tensor with
+// shape [12, 7, 5]. If we added them directly, tensorflow broadcasting rules
+// would expand the dimensions of A to [1, n, 5], then (incorrectly) check that
+// the dimensions n and 7 are compatible, and if so, create an output of shape
+// [12, 7, 5]. However, correct addition of these inputs would create an output
+// with shape [n, 12, 7, 5]: we need to manually expand the dimensions of A
+// *after* the leading dimension, i.e. expand A to the shape [n, 1, 1, 5] before
+// broadcasting.
+Status ExpandDimsForBroadcast(VectorizerInput* inputs, Graph* g) {
+  Status status;
+  Scope parent = NewInternalScope(g, &status, nullptr);
+  Scope scope = parent.NewSubScope(kExpandDimsPrefix);
+
+  // TODO(rachelim): We can potentially get rid of all these ops if shapes are
+  // known statically
+
+  // Get the stacked rank of each input
+  auto get_stacked_rank = [&scope](const WrappedTensor& input) {
+    Output rank = ops::Rank(scope, Output(input.node, input.output_index));
+
+    if (!input.stacked) {
+      // If the input is unstacked, add 1
+      rank = ops::Add(scope, rank, ops::Const(scope, 1));
+    }
+
+    return rank;
+  };
+
+  Output rank_0 = get_stacked_rank(inputs->at(0));
+  Output rank_1 = get_stacked_rank(inputs->at(1));
+
+  Output max_rank = ops::Maximum(scope, rank_0, rank_1);
+
+  // For all inputs that are stacked, expand dimensions after dim 0.
+  auto expand_dims_if_unstacked =
+      [&scope, &max_rank](const WrappedTensor& tensor, const Output& rank) {
+        if (!tensor.stacked)
+          return WrappedTensor(tensor.node, tensor.output_index, false);
+
+        Output input(tensor.node, tensor.output_index);
+
+        Output rank_diff = ops::Sub(scope, max_rank, rank);
+
+        // [1] * rank_diff
+        Output ones = ops::Fill(
+            scope, ops::ExpandDims(scope, rank_diff, ops::Const(scope, 0)),
+            ops::Const(scope, 1));
+
+        Output shape = ops::Shape(scope, input);
+
+        Output const_vec_1 = ops::Const(scope, {1});
+        // shape[:1]
+        Output concat_pre = ops::StridedSlice(
+            scope, shape, const_vec_1, const_vec_1, const_vec_1,
+            ops::StridedSlice::Attrs().BeginMask(1));
+
+        // shape[1:]
+        Output concat_post = ops::StridedSlice(
+            scope, shape, const_vec_1, const_vec_1, const_vec_1,
+            ops::StridedSlice::Attrs().EndMask(1));
+
+        // tf.concat([shape[:1], ones, shape[1:]], 0)
+        Output new_shape = ops::Concat(scope, {concat_pre, ones, concat_post},
+                                       ops::Const(scope, 0));
+
+        Output reshaped = ops::Reshape(scope, input, new_shape);
+
+        return WrappedTensor(reshaped.node(), 0, true);
+      };
+
+  *inputs = VectorizerInput({expand_dims_if_unstacked(inputs->at(0), rank_0),
+                             expand_dims_if_unstacked(inputs->at(1), rank_1)});
+  return Status::OK();
+}
+
+// Vectorization helper for component-wise ops. Since these operations act
+// component-wise, the vectorized op is the same as the original.
+Status CwiseVectorizeHelper(const Node& node, Graph* outer_scope,
+                            VectorizerInput&& inputs,
+                            VectorizerOutput* outputs) {
+  // Add new node with the same op type and attrs as the original node
+  Node* new_node;
+  auto node_builder = NodeBuilder(strings::StrCat("vectorized/", node.name()),
+                                  node.type_string());
+  for (const auto& input : inputs) {
+    node_builder = node_builder.Input(input.node, input.output_index);
+  }
+  for (const auto& attr_slice : node.attrs()) {
+    node_builder = node_builder.Attr(attr_slice.first, attr_slice.second);
+  }
+  TF_RETURN_IF_ERROR(node_builder.Finalize(outer_scope, &new_node));
+
+  // Add output mappings
+  outputs->push_back({new_node, 0, true});
+  return Status::OK();
+}
+
+class UnaryCwiseOpVectorizer : public Vectorizer {
+ public:
+  Status Vectorize(const Node& node, Graph* outer_scope,
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    if (inputs.size() != 1) {
+      return errors::Internal("Failed to vectorize ", node.type_string(),
+                              ". The op should have 1 input, but has ",
+                              inputs.size());
+    }
+
+    return CwiseVectorizeHelper(node, outer_scope, std::move(inputs), outputs);
+  }
+};
+
+class BinaryCwiseOpVectorizer : public Vectorizer {
+ public:
+  Status Vectorize(const Node& node, Graph* outer_scope,
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    if (inputs.size() != 2) {
+      return errors::Internal("Failed to vectorize ", node.type_string(),
+                              ". The op should have 2 input, but has ",
+                              inputs.size());
+    }
+    // Binary ops support broadcasting
+    TF_RETURN_IF_ERROR(ExpandDimsForBroadcast(&inputs, outer_scope));
+
+    return CwiseVectorizeHelper(node, outer_scope, std::move(inputs), outputs);
+  }
+};
+
+// Bitwise unary
+REGISTER_VECTORIZER("Invert", UnaryCwiseOpVectorizer);
+
+// Logical unary
+REGISTER_VECTORIZER("LogicalNot", UnaryCwiseOpVectorizer);
+
+// Complex unary
+REGISTER_VECTORIZER("Angle", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("ComplexAbs", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Conj", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Imag", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Real", UnaryCwiseOpVectorizer);
+
+// Real unary
+REGISTER_VECTORIZER("Abs", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Acos", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Acosh", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Asin", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Asinh", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Atan", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Atanh", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("BesselI0e", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("BesselI1e", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Ceil", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Cos", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Cosh", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Digamma", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Elu", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Erf", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Erfc", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Exp", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Expm1", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Floor", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Inv", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("IsFinite", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("IsInf", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Lgamma", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Log", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Log1p", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Neg", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Reciprocal", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Relu", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Relu6", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Rint", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Round", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Rsqrt", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Selu", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Sigmoid", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Sign", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Sin", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Sinh", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Softplus", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Softsign", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Sqrt", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Square", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Tanh", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Tan", UnaryCwiseOpVectorizer);
+
+// Miscellaneous unary
+REGISTER_VECTORIZER("Cast", UnaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Identity", UnaryCwiseOpVectorizer);
+
+// Bitwise binary
+REGISTER_VECTORIZER("BitwiseAnd", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("BitwiseOr", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("BitwiseXor", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("LeftShift", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("RightShift", BinaryCwiseOpVectorizer);
+
+// Logical binary
+REGISTER_VECTORIZER("LogicalAnd", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("LogicalOr", BinaryCwiseOpVectorizer);
+
+// Real binary
+REGISTER_VECTORIZER("Add", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("AddV2", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Atan2", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Complex", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Div", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("DivNoNan", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Equal", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("FloorDiv", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("FloorMod", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Greater", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("GreaterEqual", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Igamma", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Igammac", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("IgammaGradA", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Less", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("LessEqual", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Maximum", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Minimum", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Mod", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Mul", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("NotEqual", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Polygamma", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Pow", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("RealDiv", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("SquaredDifference", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Sub", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("TruncateDiv", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("TruncateMod", BinaryCwiseOpVectorizer);
+REGISTER_VECTORIZER("Zeta", BinaryCwiseOpVectorizer);
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..76c0047747645915456eac7eef887d8eb302ba15
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/decode_csv_vectorizer.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+// DecodeCSV is the vectorized version of itself.
+class DecodeCSVVectorizer : public Vectorizer {
+ public:
+  Status Vectorize(const Node& node, Graph* outer_scope,
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    NodeBuilder::NodeOut records;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &records));
+
+    std::vector<NodeBuilder::NodeOut> defaults;
+    defaults.resize(inputs.size() - 1);
+    for (size_t i = 1; i < inputs.size(); ++i) {
+      TF_RETURN_IF_ERROR(inputs.unstacked(i, &defaults[i - 1]));
+    }
+
+    Node* new_node;
+    auto node_builder = NodeBuilder(node.type_string(), node.type_string())
+                            .Input(records)
+                            .Input(defaults);
+
+    for (const auto& attr : node.attrs()) {
+      node_builder = node_builder.Attr(attr.first, attr.second);
+    }
+    TF_RETURN_IF_ERROR(node_builder.Finalize(outer_scope, &new_node));
+
+    // Add output mappings
+    for (int i = 0; i < node.num_outputs(); ++i) {
+      outputs->emplace_back(new_node, i, true);
+    }
+    return Status::OK();
+  }
+};
+
+REGISTER_VECTORIZER("DecodeCSV", DecodeCSVVectorizer);
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f81b2d01d99452adfb970d1c81b3dd2e6ea3ae1d
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/parse_single_example_vectorizer.cc
@@ -0,0 +1,101 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/framework/scope_internal.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+// ParseExample is the vectorized version of ParseSingleExample.
+class ParseSingleExampleVectorizer : public Vectorizer {
+ public:
+  Status Vectorize(const Node& node, Graph* outer_scope,
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    NodeBuilder::NodeOut serialized;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &serialized));
+
+    std::vector<NodeBuilder::NodeOut> dense_defaults;
+    dense_defaults.resize(inputs.size() - 1);
+    for (size_t i = 1; i < inputs.size(); ++i) {
+      TF_RETURN_IF_ERROR(inputs.unstacked(i, &dense_defaults[i - 1]));
+    }
+
+    Status scope_status;
+    Scope parent = NewInternalScope(outer_scope, &scope_status, nullptr);
+    Scope s = parent.NewSubScope("vectorize/parse_single_example");
+
+    // Empty string vector
+    Node* names = ops::Const(s, std::initializer_list<string>({})).node();
+
+    // sparse_keys and dense_keys are attrs on ParseSingleExample, but are
+    // inputs on ParseExample. We have to add const input nodes for these.
+    auto make_list_input_from_attr =
+        [&s, &node](StringPiece attr_name,
+                    std::vector<NodeBuilder::NodeOut>* result) {
+          std::vector<string> attr_vals;
+          TF_RETURN_IF_ERROR(GetNodeAttr(node.attrs(), attr_name, &attr_vals));
+          result->reserve(attr_vals.size());
+
+          for (const auto& val : attr_vals) {
+            result->push_back(ops::Const(s, val).node());
+          }
+          return Status::OK();
+        };
+
+    std::vector<NodeBuilder::NodeOut> sparse_keys;
+    TF_RETURN_IF_ERROR(make_list_input_from_attr("sparse_keys", &sparse_keys));
+
+    std::vector<NodeBuilder::NodeOut> dense_keys;
+    TF_RETURN_IF_ERROR(make_list_input_from_attr("dense_keys", &dense_keys));
+
+    TF_RETURN_IF_ERROR(scope_status);
+
+    Node* new_node;
+    auto node_builder =
+        NodeBuilder(strings::StrCat("vectorized/", node.name()), "ParseExample")
+            .Input(serialized)
+            .Input(names)
+            .Input(sparse_keys)
+            .Input(dense_keys)
+            .Input(dense_defaults);
+
+    for (const auto& attr : {"sparse_types", "dense_shapes"}) {
+      // Copy attrs if they exist
+      const AttrValue* val;
+      TF_RETURN_IF_ERROR(node.attrs().Find(attr, &val));
+      node_builder = node_builder.Attr(attr, *val);
+    }
+
+    TF_RETURN_IF_ERROR(node_builder.Finalize(outer_scope, &new_node));
+
+    // Add output mappings
+    for (size_t i = 0; i < node.num_outputs(); ++i) {
+      outputs->emplace_back(new_node, i, true);
+    }
+    return Status::OK();
+  }
+};
+
+REGISTER_VECTORIZER("ParseSingleExample", ParseSingleExampleVectorizer);
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a094bfd1de4fe48811584e2dcf93fc67b6bb94da
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/reshape_vectorizer.cc
@@ -0,0 +1,75 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope_internal.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+const char* const kReshapePrefix = "vectorized/reshape";
+
+// The vectorized shape should be the original shape with an additional leading
+// dimension that is the same as the leading dimension of the stacked
+// input tensor.
+Output GetVectorizedShape(Scope* s, Output tensor, Output original_shape) {
+  Output const_vec_1 = ops::Const(*s, {1});
+  Output shape = ops::Shape(*s, tensor);
+
+  // shape[:1]
+  Output dim_0 =
+      ops::StridedSlice(*s, shape, const_vec_1, const_vec_1, const_vec_1,
+                        ops::StridedSlice::Attrs().BeginMask(1));
+
+  // tf.concat([dim_0, original], 0)
+  return ops::Concat(*s, {dim_0, original_shape}, ops::Const(*s, 0));
+}
+
+class ReshapeVectorizer : public Vectorizer {
+ public:
+  Status Vectorize(const Node& node, Graph* outer_scope,
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    Status status;
+    Scope parent = NewInternalScope(outer_scope, &status, nullptr);
+    Scope s = parent.NewSubScope(kReshapePrefix);
+
+    Output tensor, shape;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &tensor));
+    TF_RETURN_IF_ERROR(inputs.unstacked(1, &shape));
+
+    Output vectorized_reshape =
+        ops::Reshape(s, tensor, GetVectorizedShape(&s, tensor, shape));
+
+    TF_RETURN_IF_ERROR(status);
+
+    // Add output mappings
+    outputs->push_back({vectorized_reshape.node(), 0, true});
+    return Status::OK();
+  }
+};
+
+REGISTER_VECTORIZER("Reshape", ReshapeVectorizer);
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..45ad72bb7af4c3b4c73ff7a3ee93fc8e15eb7af4
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/transpose_vectorizer.cc
@@ -0,0 +1,82 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <initializer_list>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/framework/scope_internal.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h"
+#include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+constexpr char kTransposePrefix[] = "vectorized/transpose";
+
+class TransposeVectorizer : public Vectorizer {
+ public:
+  Status Vectorize(const Node& node, Graph* outer_scope,
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    Status status;
+    Scope parent = NewInternalScope(outer_scope, &status, /*refiner=*/nullptr);
+    Scope scope = parent.NewSubScope(kTransposePrefix);
+
+    Output tensor, original_perm;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &tensor));
+    TF_RETURN_IF_ERROR(inputs.unstacked(1, &original_perm));
+    if (original_perm.type() != DT_INT32) {
+      original_perm = ops::Cast(scope, original_perm, DT_INT32);
+    }
+
+    // The vectorized permutation is the original permutation with an additional
+    // leading 0 and all other values incremented by 1.
+    // perm = tf.concat([[0], original_perm + 1], axis=0)
+    Output perm =
+        ops::Concat(scope,
+                    std::initializer_list<Output>(
+                        {ops::Const(scope, {0}),
+                         ops::Add(scope, original_perm, ops::Const(scope, 1))}),
+                    ops::Const(scope, 0));
+
+    Output vectorized_transpose = ops::Transpose(scope, tensor, perm);
+
+    TF_RETURN_IF_ERROR(status);
+
+    // Add output mappings.
+    outputs->push_back({vectorized_transpose.node(), 0, true});
+    return Status::OK();
+  }
+};
+
+REGISTER_VECTORIZER("Transpose", TransposeVectorizer);
+
+}  // namespace
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
index 74ce520ce1f7c49d2c41cc63f76cfc56d14ac915..6e00c0cb05128b2efe2a55b10e1a96060f94266d 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/unpack_vectorizer.cc
@@ -19,33 +19,42 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
+namespace {
 
 class UnpackVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<Port>* input_ports,
-                   std::vector<Port>* output_ports) override {
-    Status s;
-    if (node.num_inputs() != 1) {
-      return errors::Internal("Unpack op should only have one input.");
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
+    NodeBuilder::NodeOut value;
+    TF_RETURN_IF_ERROR(inputs.stacked(0, &value));
+
+    int axis = 0;
+    if (HasNodeAttr(node.def(), "axis")) {
+      TF_RETURN_IF_ERROR(GetNodeAttr(node.attrs(), "axis", &axis));
     }
 
-    // Add new Unpack node with the same op and attrs as the original node
-    auto new_unpack_node = outer_scope->AddNode(node.def(), &s);
-    TF_RETURN_IF_ERROR(s);
+    if (axis >= 0) {
+      // Since the vectorized input has an extra leading dimension, we need
+      // to increment `axis` attr by 1 for non-negative axis values.
+      // Note: negative axis values wrap around.
+      axis += 1;
+    }
 
-    // Increment "axis" attr by 1:
-    int new_axis = node.def().attr().at("axis").i() + 1;
-    new_unpack_node->AddAttr("axis", new_axis);
+    int num;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node.attrs(), "num", &num));
 
-    // Add the input mappings
-    input_ports->push_back({new_unpack_node, 0});
+    Node* new_node;
+    TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("vectorized/", node.name()),
+                                   node.type_string())
+                           .Input(value)
+                           .Attr("axis", axis)
+                           .Attr("num", num)
+                           .Finalize(outer_scope, &new_node));
 
     // Add the output mappings
-    int num = node.def().attr().at("num").i();
     for (int i = 0; i < num; ++i) {
-      output_ports->push_back({new_unpack_node, i});
+      outputs->push_back({new_node, i, true});
     }
 
     return Status::OK();
@@ -54,6 +63,6 @@ class UnpackVectorizer : public Vectorizer {
 
 REGISTER_VECTORIZER("Unpack", UnpackVectorizer);
 
-}  // namespace vectorization_utils
+}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
index 56eb88c95e5f4ec109e5bea4c41e19cb6508ef0c..7c9905f89ad1b6969b95ed708b9dd2dd7da6bb35 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h
@@ -18,15 +18,79 @@ limitations under the License.
 
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
 
-// Describes a tensor with its operation Node and output position
-typedef std::pair<Node*, int> Port;
+// Represents the outputs of a vectorized op. Currently, a simple type alias
+// provided for symmetry with `VectorizerInput`.
+using VectorizerOutput = std::vector<WrappedTensor>;
+
+// Represents the inputs of a vectorized op. Supports iteration, random access,
+// and retrieval of stacked and unstacked tensor inputs.
+class VectorizerInput {
+ public:
+  VectorizerInput(std::vector<WrappedTensor>&& inputs)
+      : inputs_(std::move(inputs)) {}
+
+  // Gets the stacked tensor input at position index. Returns an error if
+  // the tensor at index is unstacked. The type T must have a (Node*, int)
+  // constructor.
+  template <class T>
+  Status stacked(int index, T* result) const {
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, size());
+
+    if (!inputs_[index].stacked) {
+      return errors::InvalidArgument("Expecting input ", index,
+                                     " to be stacked.");
+    }
+    *result = {inputs_[index].node, inputs_[index].output_index};
+    return Status::OK();
+  }
+
+  // Gets the unstacked tensor input at position index. Returns an error if
+  // the tensor at index is stacked. The type T must have a (Node*, int)
+  // constructor.
+  template <class T>
+  Status unstacked(int index, T* result) const {
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, size());
+
+    if (inputs_[index].stacked) {
+      return errors::InvalidArgument("Expecting input ", index,
+                                     " to be unstacked.");
+    }
+    *result = {inputs_[index].node, inputs_[index].output_index};
+    return Status::OK();
+  }
+
+  // Returns a const reference to the element at specified location index.
+  const WrappedTensor& at(int index) const {
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, size());
+    return inputs_.at(index);
+  }
+
+  // Returns a const iterator pointing to the first wrapped tensor input.
+  std::vector<WrappedTensor>::const_iterator begin() const {
+    return inputs_.begin();
+  }
+  // Returns a const iterator pointing to the past-the-end wrapped tensor input.
+  std::vector<WrappedTensor>::const_iterator end() const {
+    return inputs_.end();
+  }
+
+  // Returns the number of input tensors.
+  size_t size() const { return inputs_.size(); }
+
+ private:
+  std::vector<WrappedTensor> inputs_;
+};
 
 // Interface for vectorization of TensorFlow operations. See `CastVectorizer`
 // for an example.
@@ -36,17 +100,17 @@ class Vectorizer {
 
   // Vectorizes an operation, `node`, by adding Node(s) to `outer_scope`
   // that produce the same vector output(s) as executing `node`'s op
-  // on elements of the vector inputs. The new Node(s) collectively have the
+  // on elements of `inputs`. The new Node(s) collectively have the
   // same number of input and output ports as the node being converted.
-  // Adds mappings for the new nodes' input and output ports to `inputs` and
-  // `outputs` respectively, where the i'th Port in inputs/outputs
-  // corresponds to the i'th input/output port of the node to be converted.
+  // Adds edges between the newly created nodes and nodes in `inputs`, and adds
+  // mappings to the new nodes' output ports to `outputs`, where the i'th
+  // value in `outputs` corresponds to the i'th output port of the node
+  // to be converted.
   virtual Status Vectorize(const Node& node, Graph* outer_scope,
-                           std::vector<Port>* input_ports,
-                           std::vector<Port>* output_ports) = 0;
+                           VectorizerInput&& inputs,
+                           VectorizerOutput* outputs) = 0;
 };
 
-}  // namespace vectorization_utils
 }  // namespace grappler
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_VECTORIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.cc
index a6551e36ac3b01064f788bc0d3d926df580ffdcb..e1cf77a7d568e406586125f1a47a7804c4bbaa33 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
 
 VectorizerRegistry* VectorizerRegistry::Global() {
   static VectorizerRegistry* registry = new VectorizerRegistry;
@@ -42,6 +41,5 @@ void VectorizerRegistry::Register(const string& op_type,
   vectorizers_.insert(std::pair<const string&, std::unique_ptr<Vectorizer>>(
       op_type, std::move(vectorizer)));
 }
-}  // namespace vectorization_utils
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h
index 16159d47ca8eee40b0afa63858a84565d021c0b7..2781b9851005c2393e7ced58a9d8e5abe198fa57 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h
@@ -19,11 +19,11 @@ limitations under the License.
 #include <functional>
 #include <map>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer.h"
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
 
 // A global VectorizerRegistry is used to hold all the vectorizers.
 class VectorizerRegistry {
@@ -38,7 +38,7 @@ class VectorizerRegistry {
   void Register(const string& op_type, std::unique_ptr<Vectorizer> vectorizer);
 
  private:
-  std::map<string, std::unique_ptr<Vectorizer>> vectorizers_;
+  absl::flat_hash_map<string, std::unique_ptr<Vectorizer>> vectorizers_;
 };
 
 namespace vectorizer_registration {
@@ -59,16 +59,12 @@ class VectorizerRegistration {
 #define REGISTER_VECTORIZER_UNIQ_HELPER(ctr, op_type, vectorizer) \
   REGISTER_VECTORIZER_UNIQ(ctr, op_type, vectorizer)
 
-#define REGISTER_VECTORIZER_UNIQ(ctr, op_type, vectorizer)                  \
-  static ::tensorflow::grappler::vectorization_utils::                      \
-      vectorizer_registration::VectorizerRegistration                       \
-          vectorizer_registration_##ctr(                                    \
-              op_type,                                                      \
-              ::std::unique_ptr<                                            \
-                  ::tensorflow::grappler::vectorization_utils::Vectorizer>( \
-                  new vectorizer()))
+#define REGISTER_VECTORIZER_UNIQ(ctr, op_type, vectorizer)                \
+  static ::tensorflow::grappler::vectorizer_registration::                \
+      VectorizerRegistration vectorizer_registration_##ctr(               \
+          op_type, ::std::unique_ptr<::tensorflow::grappler::Vectorizer>( \
+                       new vectorizer()))
 
-}  // namespace vectorization_utils
 }  // namespace grappler
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
index 663ceba0279e06e93992371542e7f2f7b281ca06..0eee91f241a8e3c09b93a159c93addb43e749b02 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
@@ -20,13 +20,12 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace vectorization_utils {
 
 class TestVectorizer : public Vectorizer {
  public:
   Status Vectorize(const Node& node, Graph* outer_scope,
-                   std::vector<Port>* inputs,
-                   std::vector<Port>* outputs) override {
+                   VectorizerInput&& inputs,
+                   VectorizerOutput* outputs) override {
     return Status::OK();
   }
 };
@@ -43,10 +42,10 @@ TEST(TestVectorizer, TestTestVectorizer) {
   NodeDef node_def;
   Status s;
   Node* node = g.AddNode(node_def, &s);
-  std::vector<Port> inputs, outputs;
-  EXPECT_TRUE(vectorizer->Vectorize(*node, &g, &inputs, &outputs).ok());
+  std::vector<WrappedTensor> inputs, outputs;
+  EXPECT_TRUE(
+      vectorizer->Vectorize(*node, &g, std::move(inputs), &outputs).ok());
 }
 
-}  // namespace vectorization_utils
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h b/tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..4439b4ab4e6eb72575bc71ef1a4ca7c0138a7f20
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/wrapped_tensor.h
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_WRAPPED_TENSOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_WRAPPED_TENSOR_H_
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Represents a tensor that has been vectorized.
+struct WrappedTensor {
+  Node* const node;
+  const int output_index;
+
+  // Whether the tensor is stacked, i.e. represents the results of applying
+  // the operation on all slices of the input, where each row i of the
+  // tensor corresponds to the op's output on slice i of the input. False
+  // if the tensor is not stacked, i.e. represents the result of the op on
+  // a single slice of the input, where the result does not vary between
+  // slices.
+  bool stacked;
+
+  WrappedTensor(Node* node, int output_index, bool stacked)
+      : node(node), output_index(output_index), stacked(stacked) {}
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_WRAPPED_TENSOR_H_
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
index 344c420902acaa2fc3d1eb639f79dc778a79509e..60c557d557e31173135cf9639efbf345a586faa1 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/data/vectorization_utils.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
 
 #include "absl/strings/str_join.h"
@@ -45,22 +46,6 @@ namespace {
 // Describes a tensor with its operation Node and output position
 typedef std::pair<Node*, int> TensorDesc;
 
-// Equivalent to python Pfor's WrappedTensor struct
-struct WrappedTensor {
-  TensorDesc tensor;
-
-  // Whether the tensor is stacked, i.e. represents the results of applying
-  // the operation on all slices of the input, where each row i of the
-  // tensor corresponds to the op's output on slice i of the input. False
-  // if the tensor is not stacked, i.e. represents the result of the op on
-  // a single slice of the input, where the result does not vary between
-  // slices.
-  bool stacked;
-
-  WrappedTensor(TensorDesc&& tensor, bool stacked)
-      : tensor(std::move(tensor)), stacked(stacked) {}
-};
-
 const char* const kRetValOp = "_Retval";
 
 void ReplaceEdgeSources(const TensorDesc& old_src, const TensorDesc& new_src,
@@ -80,9 +65,18 @@ void ReplaceEdgeSources(const TensorDesc& old_src, const TensorDesc& new_src,
   }
 }
 
+// Update node attrs to keep its properties consistent with the function
+void UpdateMapDefunAttrs(FunctionBody* map_defun_fn, Node* map_defun_node) {
+  map_defun_node->AddAttr("output_types", map_defun_fn->ret_types);
+
+  // TODO(rachelim): Propagate precise shapes if they're known, which may enable
+  // subsequent optimizations.
+  map_defun_node->AddAttr("output_shapes", std::vector<PartialTensorShape>(
+                                               map_defun_fn->ret_types.size()));
+}
+
 Status AddMapDefunOutput(FunctionBody* map_defun_fn, Node* map_defun_node,
                          const TensorDesc& output) {
-  // Note that we don't update MapDefun attrs as we go, only when we are done
   DataType type = output.first->output_type(output.second);
   int index = map_defun_fn->ret_nodes.size();
 
@@ -99,13 +93,13 @@ Status AddMapDefunOutput(FunctionBody* map_defun_fn, Node* map_defun_node,
   map_defun_fn->graph->AddEdge(output.first, output.second, ret_node, 0);
   map_defun_fn->ret_nodes.push_back(ret_node);
   map_defun_fn->ret_types.push_back(type);
+  UpdateMapDefunAttrs(map_defun_fn, map_defun_node);
 
   return s;
 }
 
 void RemoveMapDefunOutput(int output_position, Graph* outer_scope,
                           FunctionBody* map_defun_fn, Node* map_defun_node) {
-  // Note that we don't update MapDefun attrs as we go, only when we are done
   DCHECK_LT(output_position, map_defun_fn->ret_nodes.size())
       << "Trying to remove output that doesn't exist. Output number: "
       << output_position;
@@ -118,6 +112,7 @@ void RemoveMapDefunOutput(int output_position, Graph* outer_scope,
                                 output_position);
   map_defun_fn->ret_types.erase(map_defun_fn->ret_types.begin() +
                                 output_position);
+  UpdateMapDefunAttrs(map_defun_fn, map_defun_node);
 
   // Renumber the nodes and edges that come after
   for (int i = 0; i < num_later_outputs; ++i) {
@@ -135,6 +130,7 @@ void RemoveMapDefunOutput(int output_position, Graph* outer_scope,
 // This class transforms the input FunctionDefs into their corresponding
 // Graph objects and works on the graphs directly, then converts them back
 // to FunctionDefs when GetResult is called.
+// TODO(rachelim): Move this to its own header.
 class Vectorization {
  public:
   explicit Vectorization(FunctionDefLibrary* lib)
@@ -187,18 +183,25 @@ class Vectorization {
   Status StackTensor(WrappedTensor* unstacked, TensorDesc* result);
 
   // Recursively looks for unstacked nodes in the `map_defun_fn_` graph by
-  // doing a depth-first search from the ret nodes. Lifts nodes that are
-  // unstacked (i.e. don't derive from arg nodes) into `outer_scope_` directly
-  // and add mappings to `conversion_map_`.
-  Status AddUnstackedNodeMappings();
-
-  // Recursive helper for `AddUnstackedNodeMappings`, returns true if tensor
-  // is unstacked.
-  bool AddUnstackedNodeMappingsHelper(TensorDesc&& tensor, Status* status);
-
-  // Add mappings from `map_defun_fn_` arg nodes to `map_defun_node_` input
-  // nodes to `conversion_map_`.
-  Status AddArgNodeMappings();
+  // doing a depth-first search from the ret nodes. Lifts tensors that are
+  // unstacked (i.e. don't derive from arg tensors) into `outer_scope_` directly
+  // and adds mappings to `conversion_map_`.
+  // Note that this function may have false negatives, i.e. not
+  // add mappings for some tensors that are unstacked. This may happen in the
+  // following cases: 1) a vectorized op produces unstacked outputs from stacked
+  // inputs (e.g. the vectorized "Shape" op), 2) the tensors are in a cycle, or
+  // 3) the unstacked op could not be lifted into `outer_scope`.
+  Status AddUnstackedTensorMappings();
+
+  // Recursive helper for `AddUnstackedTensorMappings`. If an op node is
+  // unstacked, lifts its output tensors into `outer_scope`, adding the mappings
+  // to `conversion_map`. Returns true if the unstacked mappings were added.
+  bool AddUnstackedTensorMappingsHelper(
+      TensorDesc&& tensor, absl::flat_hash_set<const Edge*>* visited);
+
+  // Add mappings from `map_defun_fn_` arg tensors to `map_defun_node_` input
+  // tensors to `conversion_map_`.
+  Status AddArgTensorMappings();
 
   // Maps a tensor to the corresponding WrappedTensor. For example,
   // {"Cast" Node*, 0} -> WrappedTensor({"Vectorize/Cast" Node*, 0}, true)
@@ -239,34 +242,53 @@ Status Vectorization::AddConversionMapping(Node* op_node) {
     return errors::Unimplemented("No vectorizer registered for op: ",
                                  op_node->type_string());
   }
-  std::vector<Port> input_ports, output_ports;
-  input_ports.reserve(op_node->num_inputs());
-  output_ports.reserve(op_node->num_outputs());
-  TF_RETURN_IF_ERROR(vectorizer->Vectorize(*op_node, outer_scope_.get(),
-                                           &input_ports, &output_ports));
+  std::vector<WrappedTensor> inputs, outputs;
+  inputs.reserve(op_node->num_inputs());
+  outputs.reserve(op_node->num_outputs());
 
   std::vector<const Edge*> input_edges;
   TF_RETURN_IF_ERROR(op_node->input_edges(&input_edges));
 
-  if (op_node->num_outputs() != output_ports.size() ||
-      op_node->num_inputs() != input_ports.size() ||
-      input_edges.size() != input_ports.size()) {
-    return errors::Internal("Vectorizer inputs/outputs don't match.");
-  }
-
-  // Promote the inputs of the op to MapDefun outputs and connect the edges
-  // accordingly.
+  // The inputs for the node to be converted may already have been converted
+  // themselves. For those that are not, we promote them to MapDefun outputs.
   for (size_t i = 0; i < op_node->num_inputs(); ++i) {
     auto edge = input_edges[i];
-    TF_RETURN_IF_ERROR(AddMapDefunOutput(map_defun_fn_.get(), map_defun_node_,
-                                         {edge->src(), edge->src_output()}));
-    outer_scope_->AddEdge(map_defun_node_, map_defun_fn_->ret_nodes.size() - 1,
-                          input_ports[i].first, input_ports[i].second);
+    if (auto found = gtl::FindOrNull(conversion_map_,
+                                     {edge->src(), edge->src_output()})) {
+      inputs.push_back(*found);
+    } else {
+      // TODO(rachelim): Handle the case where unconverted inputs are unstacked.
+      // We assume that all unconverted inputs will be stacked, since we
+      // converted all unstacked nodes in `Initialize`. However, it's actually
+      // possible that yet-unconverted nodes may produce unstacked outputs after
+      // they are vectorized. (For example, see the "Shape" converter in
+      // tensorflow/python/ops/parallel_for/pfor.py). If a vectorizer expects
+      // an unstacked input but receives a stacked one, vectorizer->Vectorize
+      // will return an error.
+      TF_RETURN_IF_ERROR(AddMapDefunOutput(map_defun_fn_.get(), map_defun_node_,
+                                           {edge->src(), edge->src_output()}));
+      int output_index = map_defun_fn_->ret_nodes.size() - 1;
+      inputs.push_back({map_defun_node_, output_index, true});
+    }
+  }
+
+  Status s = vectorizer->Vectorize(*op_node, outer_scope_.get(),
+                                   std::move(inputs), &outputs);
+  if (!s.ok()) {
+    VLOG(2) << "Vectorizer for op \"" << op_node->type_string()
+            << "\" failed with error: " << s;
+    return s;
+  }
+
+  if (op_node->num_outputs() != outputs.size()) {
+    return errors::Internal(
+        "Number of vectorizer outputs does not match. Expected: ",
+        op_node->num_outputs(), " Actual: ", outputs.size());
   }
 
   // Add output mappings.
   for (size_t i = 0; i < op_node->num_outputs(); ++i) {
-    conversion_map_.insert({{op_node, i}, {std::move(output_ports[i]), true}});
+    conversion_map_.insert({{op_node, i}, outputs[i]});
   }
 
   return Status::OK();
@@ -281,25 +303,22 @@ Status Vectorization::ConvertOutput(int output_position) {
 
   TensorDesc output({ret_edge->src(), ret_edge->src_output()});
   TensorDesc converted_output;
-  if (auto found = gtl::FindOrNull(conversion_map_, output)) {
-    // It's possible the output already has a mapping, if it comes from a node
-    // that has already been converted.
-    if (found->stacked) {
-      converted_output = found->tensor;
-    } else {
-      // Some outputs may be unstacked if they don't derive from arg nodes
-      // (for example, if a function returns a constant). For these, we
-      // have to add extra nodes to tile it in the 0th dimension.
-      TF_RETURN_IF_ERROR(StackTensor(found, &converted_output));
-    }
-  } else {
-    // Note: All unstacked nodes are converted ahead of time in `Initialize`,
-    // and here we assume that all op vectorizers create only stacked outputs.
-    // This may not hold in the future, as more vectorizers are added that
-    // may actually create unstacked outputs. For example, see the `Shape`
-    // converter in third_party/tensorflow/python/ops/parallel_for/pfor.py
+
+  // It's possible the output already has a mapping, if it comes from a node
+  // that has already been converted.
+  auto found = gtl::FindOrNull(conversion_map_, output);
+  if (!found) {
     TF_RETURN_IF_ERROR(AddConversionMapping(output.first));
-    converted_output = conversion_map_.at(output).tensor;
+    found = &conversion_map_.at(output);
+  }
+
+  if (found->stacked) {
+    converted_output = {found->node, found->output_index};
+  } else {
+    // Some outputs may be unstacked if they don't derive from arg nodes
+    // (for example, if a function returns a constant). For these, we
+    // have to add extra nodes to tile it in the 0th dimension.
+    TF_RETURN_IF_ERROR(StackTensor(found, &converted_output));
   }
 
   ReplaceEdgeSources({map_defun_node_, output_position}, converted_output,
@@ -342,13 +361,6 @@ void Vectorization::VectorizeHelper() {
   // need the MapDefun node and can delete it.
   if (map_defun_fn_->ret_nodes.empty()) {
     outer_scope_->RemoveNode(map_defun_node_);
-  } else {
-    // Update MapDefun node attrs accordingly
-    DCHECK_EQ(map_defun_fn_->ret_types.size(), map_defun_fn_->ret_nodes.size());
-    map_defun_node_->AddAttr(
-        "output_shapes",
-        std::vector<PartialTensorShape>(map_defun_fn_->ret_types.size()));
-    map_defun_node_->AddAttr("output_types", map_defun_fn_->ret_types);
   }
 }
 
@@ -392,9 +404,8 @@ Status Vectorization::Initialize(const FunctionDef& outer_scope,
   }
   map_defun_node_ = outer_scope_->FindNodeId(node_id);
 
-  TF_RETURN_IF_ERROR(AddArgNodeMappings());
-
-  TF_RETURN_IF_ERROR(AddUnstackedNodeMappings());
+  TF_RETURN_IF_ERROR(AddArgTensorMappings());
+  TF_RETURN_IF_ERROR(AddUnstackedTensorMappings());
   loop_len_node_ = nullptr;
 
   return Status::OK();
@@ -455,7 +466,7 @@ Status Vectorization::StackTensor(WrappedTensor* unstacked,
 
   Node* ones_shape;
   TF_RETURN_IF_ERROR(node_builder("Shape")
-                         .Input(unstacked->tensor.first)  // input
+                         .Input(unstacked->node)  // input
                          .Finalize(g, &ones_shape));
 
   Node* ones;
@@ -473,8 +484,8 @@ Status Vectorization::StackTensor(WrappedTensor* unstacked,
 
   Node* expand_dims;
   TF_RETURN_IF_ERROR(node_builder("ExpandDims")
-                         .Input(unstacked->tensor.first)  // input
-                         .Input(const_0)                  // dim
+                         .Input(unstacked->node)  // input
+                         .Input(const_0)          // dim
                          .Finalize(g, &expand_dims));
 
   TF_RETURN_IF_ERROR(node_builder("Tile")
@@ -485,23 +496,43 @@ Status Vectorization::StackTensor(WrappedTensor* unstacked,
   return Status::OK();
 }
 
-Status Vectorization::AddArgNodeMappings() {
-  for (auto arg_node : map_defun_fn_->arg_nodes) {
+Status Vectorization::AddArgTensorMappings() {
+  // Note that inputs to map_defun_fn_ are either regular arguments (for which
+  // the operations are mapped across their 0th dimension) or captured inputs
+  // (for which the operations apply to the argument wholesale).
+  int num_args =
+      map_defun_node_->attrs().Find("Targuments")->list().type_size();
+
+  auto add_conversion = [this](Node* arg_node, bool stacked) {
     Node* input_node;
     TF_RETURN_IF_ERROR(map_defun_node_->input_node(
         arg_node->attrs().Find("index")->i(), &input_node));
 
-    conversion_map_.insert({{arg_node, 0}, {{input_node, 0}, true}});
+    conversion_map_.insert({{arg_node, 0}, {input_node, 0, stacked}});
 
     // Control inputs
     conversion_map_.insert({{arg_node, Graph::kControlSlot},
-                            {{input_node, Graph::kControlSlot}, true}});
+                            {input_node, Graph::kControlSlot, stacked}});
+
+    return Status::OK();
+  };
+
+  // Regular arguments
+  for (int i = 0; i < num_args; ++i) {
+    TF_RETURN_IF_ERROR(add_conversion(map_defun_fn_->arg_nodes[i], true));
+  }
+
+  // Captured inputs. These are applied (without slicing) to every iteration of
+  // the map function, hence are mapped to unstacked nodes.
+  for (int i = num_args; i < map_defun_fn_->arg_nodes.size(); ++i) {
+    TF_RETURN_IF_ERROR(add_conversion(map_defun_fn_->arg_nodes[i], false));
   }
+
   return Status::OK();
 }
 
-bool Vectorization::AddUnstackedNodeMappingsHelper(TensorDesc&& tensor,
-                                                   Status* status) {
+bool Vectorization::AddUnstackedTensorMappingsHelper(
+    TensorDesc&& tensor, absl::flat_hash_set<const Edge*>* visited) {
   if (auto found = gtl::FindOrNull(conversion_map_, tensor)) {
     return !found->stacked;
   }
@@ -513,14 +544,22 @@ bool Vectorization::AddUnstackedNodeMappingsHelper(TensorDesc&& tensor,
   }
 
   bool is_unstacked = true;
-  for (auto edge : tensor.first->in_edges()) {
+  for (const auto& edge : tensor.first->in_edges()) {
     // Ignore Source nodes. Note that these are also ignored in the
     // GraphToFunctionDef conversion.
     if (edge->src()->IsSource()) continue;
 
+    if (visited->find(edge) != visited->end()) {
+      // If we've visited this edge already, we're in a cycle. In this case, we
+      // are conservative and don't mark the node as unstacked.
+      is_unstacked = false;
+      continue;
+    }
+    visited->insert(edge);
+
     // A node is unstacked if all of its inputs are unstacked
-    is_unstacked &= AddUnstackedNodeMappingsHelper(
-        {edge->src(), edge->src_output()}, status);
+    is_unstacked &= AddUnstackedTensorMappingsHelper(
+        {edge->src(), edge->src_output()}, visited);
   }
 
   if (!is_unstacked) {
@@ -530,45 +569,42 @@ bool Vectorization::AddUnstackedNodeMappingsHelper(TensorDesc&& tensor,
   // If the node is unstacked, we copy it into outer_scope_ and
   // add it to the map. Note that we don't clean up the nodes that are copied
   // in map_defun_fn_, and rely on them being pruned out later.
-  Node* node = outer_scope_->AddNode(tensor.first->def(), status);
-  if (!status->ok()) return true;
+  Status status;
+  Node* node = outer_scope_->AddNode(tensor.first->def(), &status);
+  if (!status.ok()) return false;
 
   // Add input edges to nodes that should already have been lifted.
-  for (auto edge : tensor.first->in_edges()) {
+  for (const auto& edge : tensor.first->in_edges()) {
     // Ignore Source nodes. Note that these are also ignored in the
     // GraphToFunctionDef conversion.
     if (edge->src()->IsSource()) continue;
 
     if (auto found = gtl::FindOrNull(conversion_map_,
                                      {edge->src(), edge->src_output()})) {
-      outer_scope_->AddEdge(found->tensor.first, found->tensor.second, node,
+      outer_scope_->AddEdge(found->node, found->output_index, node,
                             edge->dst_input());
     } else {
-      status->Update(errors::Internal(
-          "Could not find input conversion even though we did depth first "
-          "conversion."));
+      return false;
     }
   }
 
   // Add output mappings
   for (int i = 0; i < tensor.first->num_outputs(); ++i) {
-    conversion_map_.insert(
-        {{tensor.first, i}, WrappedTensor({node, i}, false)});
+    conversion_map_.insert({{tensor.first, i}, WrappedTensor(node, i, false)});
   }
   conversion_map_.insert({{tensor.first, Graph::kControlSlot},
-                          WrappedTensor({node, Graph::kControlSlot}, false)});
+                          WrappedTensor(node, Graph::kControlSlot, false)});
 
   return true;
 }
 
-Status Vectorization::AddUnstackedNodeMappings() {
-  SetVector<Node*> unstacked_nodes;
-  Status s;
+Status Vectorization::AddUnstackedTensorMappings() {
+  absl::flat_hash_set<const Edge*> visited;
   for (const auto& ret_node : map_defun_fn_->ret_nodes) {
     const Edge* in_edge = nullptr;
     TF_RETURN_IF_ERROR(ret_node->input_edge(0, &in_edge));
-    AddUnstackedNodeMappingsHelper({in_edge->src(), in_edge->src_output()}, &s);
-    TF_RETURN_IF_ERROR(s);
+    AddUnstackedTensorMappingsHelper({in_edge->src(), in_edge->src_output()},
+                                     &visited);
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
index a6020e36bb9181f648b566ad8110f9e5d69ce423..f5aa8c888e0daec24d518d64df5fc6c7682929ff 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils_test.cc
@@ -15,11 +15,30 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/vectorization_utils.h"
 
+#include <cstddef>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
@@ -27,39 +46,73 @@ namespace grappler {
 namespace vectorization_utils {
 namespace {
 
-NodeDef* AddCastNode(const string& name, const std::vector<string>& inputs,
-                     DataType src, DataType dst, bool truncate,
-                     FunctionDef* fn) {
-  NodeDef* node = function_utils::AddNode(name, "Cast", inputs, {}, fn);
-  graph_transforms::SetNodeAttr("SrcT", src, node);
-  graph_transforms::SetNodeAttr("DstT", dst, node);
-  graph_transforms::SetNodeAttr("Truncate", truncate, node);
-  return node;
+// Wraps a function in another function with a MapDefun node
+Status WrapFunctionWithMapDefun(const FunctionDef& inner, FunctionDef* result) {
+  Graph graph(OpRegistry::Global());
+  std::vector<NodeBuilder::NodeOut> inputs;
+  inputs.reserve(inner.signature().input_arg_size());
+  for (int i = 0; i < inner.signature().input_arg_size(); ++i) {
+    Node* arg;
+    TF_RETURN_IF_ERROR(
+        NodeBuilder(strings::StrCat("arg", i), /*op_name=*/"_Arg")
+            .Attr("T", inner.signature().input_arg(i).type())
+            .Attr("index", i)
+            .Finalize(&graph, &arg));
+    inputs.push_back(arg);
+  }
+
+  DataTypeVector output_types;
+  output_types.reserve(inner.signature().output_arg_size());
+  for (const auto& output_arg : inner.signature().output_arg()) {
+    output_types.push_back(output_arg.type());
+  }
+
+  Node* map_defun_node;
+  NameAttrList func_attr;
+  func_attr.set_name(inner.signature().name());
+  TF_RETURN_IF_ERROR(
+      NodeBuilder("map_defun", "MapDefun")
+          .Input(inputs)                               // arguments
+          .Input(std::vector<NodeBuilder::NodeOut>())  // captured_inputs
+          .Attr("f", func_attr)
+          .Attr("output_types", output_types)
+          .Attr("output_shapes", std::vector<PartialTensorShape>(
+                                     inner.signature().output_arg_size()))
+          .Finalize(&graph, &map_defun_node));
+
+  for (size_t i = 0; i < map_defun_node->num_outputs(); ++i) {
+    Node* ret;
+    TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("ret", i), "_Retval")
+                           .Input(map_defun_node, i)
+                           .Attr("index", static_cast<int>(i))
+                           .Finalize(&graph, &ret));
+  }
+
+  return GraphToFunctionDef(graph, "outer_function", result);
 }
 
-NodeDef* AddUnstackNode(const string& name, const std::vector<string>& inputs,
-                        DataType t, int axis, int num, FunctionDef* fn) {
-  NodeDef* node = function_utils::AddNode(name, "Unpack", inputs, {}, fn);
-  graph_transforms::SetNodeAttr("T", t, node);
-  graph_transforms::SetNodeAttr("axis", axis, node);
-  graph_transforms::SetNodeAttr("num", num, node);
-  return node;
+// Wraps the function `fn` in another function with a MapDefun node, then
+// vectorizes the wrapper function with VectorizeMapDefun.
+Status WrapAndVectorize(const FunctionDef& fn, FunctionDefLibrary* lib,
+                        FunctionDef** result) {
+  FunctionDef outer;
+  TF_RETURN_IF_ERROR(WrapFunctionWithMapDefun(fn, &outer));
+  const NodeDef& map_defun_node = outer.node_def(0);
+
+  *lib->add_function() = outer;
+  *lib->add_function() = fn;
+
+  TF_RETURN_IF_ERROR(VectorizeMapDefun(outer, map_defun_node, lib, result));
+
+  return Status::OK();
 }
 
-NodeDef* AddMapDefunNode(const string& name, const std::vector<string>& inputs,
-                         const std::vector<DataType>& t_arguments,
-                         const std::vector<DataType>& output_types,
-                         const std::vector<TensorShape>& output_shapes,
-                         const string& function_name, FunctionDef* fn) {
-  NameAttrList func;
-  func.set_name(function_name);
-  NodeDef* node = function_utils::AddNode(name, "MapDefun", inputs, {}, fn);
-  graph_transforms::SetNodeAttr("Targuments", t_arguments, node);
-  graph_transforms::SetNodeAttr("Tcaptured", DataTypeVector(), node);
-  graph_transforms::SetNodeAttr("output_types", output_types, node);
-  graph_transforms::SetNodeAttr("output_shapes", output_shapes, node);
-  graph_transforms::SetNodeAttr("f", func, node);
-  return node;
+FunctionDefHelper::Node Cast(string&& name, std::vector<string>&& inputs,
+                             DataType src, DataType dst) {
+  return {{name},
+          "Cast",
+          inputs,
+          {{"SrcT", src}, {"DstT", dst}, {"Truncate", false}}};
 }
 
 string GetRetval(const FunctionDef& function_def, int index) {
@@ -67,31 +120,9 @@ string GetRetval(const FunctionDef& function_def, int index) {
       function_def.signature().output_arg(index).name());
 }
 
-// TODO(rachelim): Use FunctionDefHelper::Create instead
-FunctionDef CreateFunction(
-    StringPiece name, const std::vector<std::pair<string, DataType>>& inputs,
-    const std::vector<std::pair<string, DataType>>& outputs,
-    const std::map<string, string>& rets) {
-  FunctionDef func;
-  auto* signature = func.mutable_signature();
-  signature->set_name(string(name));
-  for (const auto& x : inputs) {
-    auto* arg_def = signature->add_input_arg();
-    arg_def->set_name(x.first);
-    arg_def->set_type(x.second);
-  }
-  for (const auto& x : outputs) {
-    auto* arg_def = signature->add_output_arg();
-    arg_def->set_name(x.first);
-    arg_def->set_type(x.second);
-  }
-  for (const auto& x : rets) {
-    (*func.mutable_ret())[x.first] = x.second;
-  }
-
-  return func;
-}
-
+///==================================//
+// Tests for vectorization framework //
+///==================================//
 
 // Before:
 //
@@ -124,32 +155,24 @@ FunctionDef CreateFunction(
 // +---------------+ Ret0 +---+ Ret1 +--------+
 //                 +------+   +------+
 //
-TEST(VectorizeMapDefunTest, VectorizeDefunNoOps) {
-  FunctionDef inner =
-      CreateFunction("inner_function", {{"arg0", DT_INT32}, {"arg1", DT_INT32}},
-                     {{"ret0", DT_INT32}, {"ret1", DT_INT32}},
-                     {{"ret0", "arg0"}, {"ret1", "arg1"}});
-  FunctionDef outer = CreateFunction(
-      "outer_function", {{"ret0", DT_INT32}, {"ret1", DT_INT32}},
-      {{"mapdefun", DT_INT32}, {"mapdefun_0", DT_INT32}},
-      {{"mapdefun", "MapDefun:output:0"}, {"mapdefun_0", "MapDefun:output:1"}});
-
-  NodeDef* map_defun = AddMapDefunNode(
-      "MapDefun", {"ret0", "ret1"}, {DT_INT32, DT_INT32}, {DT_INT32, DT_INT32},
-      {{}, {}}, inner.signature().name(), &outer);
-  CHECK_NOTNULL(map_defun);
-
+TEST(VectorizeMapDefunTest, VectorizeWithNoOps) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32", "arg1: int32"},
+      /*out_def=*/{"ret0: int32", "ret1: int32"},
+      /*attr_def=*/{},
+      /*node_def=*/{},
+      /*ret_def=*/{{"ret0", "arg0"}, {"ret1", "arg1"}});
   FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
   FunctionDef* vectorized;
-  Status s = VectorizeMapDefun(outer, *map_defun, &lib, &vectorized);
-  LOG(ERROR) << s;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+
   EXPECT_TRUE(
       !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
-  EXPECT_EQ(GetRetval(*vectorized, 0), "ret0");
-  EXPECT_EQ(GetRetval(*vectorized, 1), "ret1");
+  EXPECT_EQ(GetRetval(*vectorized, 0),
+            vectorized->signature().input_arg(0).name());
+  EXPECT_EQ(GetRetval(*vectorized, 1),
+            vectorized->signature().input_arg(1).name());
 }
 
 // Before:
@@ -178,7 +201,7 @@ TEST(VectorizeMapDefunTest, VectorizeDefunNoOps) {
 // +---------------+ Ret0 +---+ Ret1 +--------+
 //                 +------+   +------+
 //
-//   where XOp1 is not convertible.
+//   where XOp1 does not have a vectorizer defined.
 //
 // After:
 //
@@ -207,42 +230,33 @@ TEST(VectorizeMapDefunTest, VectorizeDefunNoOps) {
 // +---------------+ Ret0 +---+ Ret1 +--------+
 //                 +------+   +------+
 //
-TEST(VectorizeMapDefunTest, VectorizeDefunUnconvertible) {
-  FunctionDef inner =
-      CreateFunction("inner_function", {{"arg0", DT_INT32}, {"arg1", DT_INT32}},
-                     {{"ret0", DT_INT32}, {"ret1", DT_INT32}},
-                     {{"ret0", "MatMul:product:0"}, {"ret1", "Cast:y:0"}});
+TEST(VectorizeMapDefunTest, VectorizeWithUnvectorizableOp) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32", "arg1: int32"},
+      /*out_def=*/{"ret0: int32", "ret1: int32"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {{{"MatMul"}, "MatMul", {"arg0", "arg0"}, {{"T", DT_INT32}}},
+       Cast("Cast", {"arg1"}, DT_INT32, DT_INT32)},  //
+      /*ret_def=*/{{"ret0", "MatMul:product:0"}, {"ret1", "Cast:y:0"}});
   // TODO(rachelim): If we ever write a converter for MatMul, we have to
   // change this test.
-  NodeDef* x_op1 =
-      function_utils::AddNode("MatMul", "MatMul", {"arg0", "arg0"}, {}, &inner);
-  CHECK_NOTNULL(x_op1);
-  graph_transforms::SetNodeAttr("T", DT_INT32, x_op1);
-
-  NodeDef* cast_node =
-      AddCastNode("Cast", {"arg1"}, DT_INT32, DT_INT32, false, &inner);
-  CHECK_NOTNULL(cast_node);
-
-  FunctionDef outer = CreateFunction(
-      "outer_function", {{"x", DT_INT32}, {"y", DT_INT32}},
-      {{"mapdefun", DT_INT32}, {"mapdefun_0", DT_INT32}},
-      {{"mapdefun", "MapDefun:output:0"}, {"mapdefun_0", "MapDefun:output:1"}});
-
-  NodeDef* map_defun = AddMapDefunNode(
-      "MapDefun", {"x", "y"}, {DT_INT32, DT_INT32}, {DT_INT32, DT_INT32},
-      {{}, {}}, inner.signature().name(), &outer);
-  CHECK_NOTNULL(map_defun);
 
   FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
   FunctionDef* vectorized;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
 
+  ASSERT_TRUE(
+      function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
   auto map_defun_node = vectorized->node_def(
       function_utils::FindFunctionNodeWithOp("MapDefun", *vectorized));
+
   // The Cast node should be converted just fine.
-  EXPECT_EQ(GetRetval(*vectorized, 1), "Cast:y:0");
+  ASSERT_TRUE(function_utils::ContainsFunctionNodeWithOp("Cast", *vectorized));
+  auto cast = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
+  EXPECT_EQ(GetRetval(*vectorized, 1), strings::StrCat(cast.name(), ":y:0"));
 
   // The inner function should only have one retval.
   FunctionLibraryDefinition lib_def(OpRegistry::Global(), lib);
@@ -250,78 +264,6 @@ TEST(VectorizeMapDefunTest, VectorizeDefunUnconvertible) {
       lib_def.Find(map_defun_node.attr().at("f").func().name());
   EXPECT_EQ(map_defun_fn->signature().output_arg_size(), 1);
 }
-
-// Before:
-//
-//
-//                 +------+
-// +---------------+ Arg0 +---------+
-// |               +---+--+         |
-// |                   |            |
-// |               +---v--+         |
-// |   +-----------+ Arg0 +-----+   |
-// |   |           +---+--+     |   |
-// |   |               |        |   |
-// |   |               |        |   |
-// |   |           +---v--+     |   |
-// |   |           | Cast |     |   |
-// |   |           +---+--+     |   |
-// |   |               |        |   |
-// |   | MapDefun  +---v--+     |   |
-// |   +-----------+ Ret0 +-----+   |
-// |               +---+--+         |
-// |                   |            |
-// |               +---v--+         |
-// +---------------+ Ret0 +---------+
-//                 +------+
-//
-//
-//  After:
-//
-//                 +------+
-// +---------------+ Arg0 +---------+
-// |               +---+--+         |
-// |                   |            |
-// |               +---v--+         |
-// |               | Cast |         |
-// |               +---+--+         |
-// |                   |            |
-// |               +---v--+         |
-// +---------------+ Ret0 +---------+
-//                 +------+
-//
-TEST(VectorizeMapDefunTest, VectorizeDefunSimpleCast) {
-  FunctionDef inner =
-      CreateFunction("inner_function", {{"arg0", DT_INT32}},
-                     {{"ret0", DT_INT64}}, {{"ret0", "Cast:y:0"}});
-  NodeDef* cast_op =
-      AddCastNode("Cast", {"arg0"}, DT_INT32, DT_INT64, false, &inner);
-  CHECK_NOTNULL(cast_op);
-
-  FunctionDef outer = CreateFunction("outer_function", {{"x", DT_INT32}},
-                                     {{"mapdefun", DT_INT64}},
-                                     {{"mapdefun", "MapDefun:output:0"}});
-
-  NodeDef* map_defun =
-      AddMapDefunNode("MapDefun", {"x"}, {DT_INT32}, {DT_INT64}, {{}},
-                      inner.signature().name(), &outer);
-  CHECK_NOTNULL(map_defun);
-
-  FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
-  FunctionDef* vectorized;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
-  EXPECT_TRUE(
-      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
-  const NodeDef& cast_node = vectorized->node_def(
-      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
-  EXPECT_EQ(cast_node.input(0), "x");
-  EXPECT_EQ(GetRetval(*vectorized, 0),
-            strings::StrCat(cast_node.name(), ":y:0"));
-  EXPECT_EQ(vectorized->node_def_size(), 1);
-}
-
 // Before:
 //
 //                 +------+
@@ -365,134 +307,31 @@ TEST(VectorizeMapDefunTest, VectorizeDefunSimpleCast) {
 // +---------------+ Ret0 +---+ Ret1 +--------+
 //                 +------+   +------+
 //
-TEST(VectorizeMapDefunTest, VectorizeDefunCastUsedTwice) {
+TEST(VectorizeMapDefunTest, VectorizeWithOutputUsedTwice) {
   // Tests that behavior is correct when an output is used more than once.
-  FunctionDef inner =
-      CreateFunction("inner_function", {{"arg0", DT_INT32}},
-                     {{"ret0", DT_INT64}, {"ret1", DT_INT64}},
-                     {{"ret0", "Cast:y:0"}, {"ret1", "Cast:y:0"}});
-  NodeDef* cast_op =
-      AddCastNode("Cast", {"arg0"}, DT_INT32, DT_INT64, false, &inner);
-  CHECK_NOTNULL(cast_op);
-
-  FunctionDef outer = CreateFunction(
-      "outer_function", {{"x", DT_INT32}},
-      {{"mapdefun", DT_INT64}, {"mapdefun_0", DT_INT64}},
-      {{"mapdefun", "MapDefun:output:0"}, {"mapdefun_0", "MapDefun:output:1"}});
-
-  NodeDef* map_defun =
-      AddMapDefunNode("MapDefun", {"x"}, {DT_INT32}, {DT_INT64, DT_INT64},
-                      {{}, {}}, inner.signature().name(), &outer);
-  CHECK_NOTNULL(map_defun);
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int64", "ret1: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{Cast("Cast", {"arg0"}, DT_INT32, DT_INT64)},
+      /*ret_def=*/{{"ret0", "Cast:y:0"}, {"ret1", "Cast:y:0"}});
 
   FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
   FunctionDef* vectorized;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+
   EXPECT_TRUE(
       !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
   const NodeDef& cast_node = vectorized->node_def(
       function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
-  EXPECT_EQ(cast_node.input(0), "x");
+  EXPECT_EQ(cast_node.input(0), vectorized->signature().input_arg(0).name());
   EXPECT_EQ(GetRetval(*vectorized, 0),
             strings::StrCat(cast_node.name(), ":y:0"));
   EXPECT_EQ(GetRetval(*vectorized, 1),
             strings::StrCat(cast_node.name(), ":y:0"));
   EXPECT_EQ(vectorized->node_def_size(), 1);
 }
-
-// Before:
-//
-//                        +------+
-// +----------------------+ Arg0 +----------------------+
-// |                      +---+--+                      |
-// |                          |                         |
-// |                      +---v--+                      |
-// |   +------------------+ Arg0 +------------------+   |
-// |   |                  +---+--+                  |   |
-// |   |                      |                     |   |
-// |   |                      |                     |   |
-// |   |                  +---v---+ num=3           |   |
-// |   |                  |Unstack| axis=0          |   |
-// |   |                  ++--+--++                 |   |
-// |   |                   |  |  |                  |   |
-// |   |              +----+  |  +-------+          |   |
-// |   |              |       |          |          |   |
-// |   | MapDefun +---v--+  +-v----+  +--v---+      |   |
-// |   +----------+ Ret0 +--+ Ret1 +--+ Ret2 +------+   |
-// |              +---+--+  +--+---+  +--+---+          |
-// |                  |        |         |              |
-// |              +---v--+  +--v---+  +--v---+          |
-// +--------------+ Ret0 +--+ Ret1 +--+ Ret2 +----------+
-//                +------+  +------+  +------+
-//
-//
-//  After:
-//
-//                        +------+
-// +----------------------+ Arg0 +----------------------+
-// |                      +---+--+                      |
-// |                          |                         |
-// |                          |                         |
-// |                          |                         |
-// |                      +---v---+ num=3               |
-// |                      |Unstack| axis=1              |
-// |                      ++--+--++                     |
-// |                       |  |  |                      |
-// |                  +----+  |  +-------+              |
-// |                  |       |          |              |
-// |                  |       |          |              |
-// |              +---v--+  +-v----+  +--v---+          |
-// +--------------+ Ret0 +--+ Ret1 +--+ Ret2 +----------+
-//                +------+  +------+  +------+
-//
-TEST(VectorizeMapDefunTest, VectorizeDefunOpWithMultipleOutputs) {
-  FunctionDef inner = CreateFunction(
-      "inner_function", {{"arg0", DT_INT32}},
-      {{"ret0", DT_INT32}, {"ret1", DT_INT32}, {"ret2", DT_INT32}},
-      {{"ret0", "MyUnstack:output:0"},
-       {"ret1", "MyUnstack:output:1"},
-       {"ret2", "MyUnstack:output:2"}});
-  NodeDef* unstack_op =
-      AddUnstackNode("MyUnstack", {"arg0"}, DT_INT32, 0, 3, &inner);
-  CHECK_NOTNULL(unstack_op);
-
-  FunctionDef outer = CreateFunction("outer_function", {{"x", DT_INT32}},
-                                     {{"mapdefun", DT_INT32},
-                                      {"mapdefun_0", DT_INT32},
-                                      {"mapdefun_1", DT_INT32}},
-                                     {{"mapdefun", "MapDefun:output:0"},
-                                      {"mapdefun_0", "MapDefun:output:1"},
-                                      {"mapdefun_1", "MapDefun:output:2"}});
-
-  NodeDef* map_defun = AddMapDefunNode(
-      "MapDefun", {"x"}, {DT_INT32}, {DT_INT32, DT_INT32, DT_INT32},
-      {{1}, {1}, {1}}, inner.signature().name(), &outer);
-  CHECK_NOTNULL(map_defun);
-
-  FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
-  FunctionDef* vectorized;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
-  EXPECT_TRUE(
-      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
-  const NodeDef& unpack_node = vectorized->node_def(
-      function_utils::FindFunctionNodeWithOp("Unpack", *vectorized));
-  EXPECT_EQ(unpack_node.input(0), "x");
-  EXPECT_EQ(unpack_node.attr().at("axis").i(), 1);
-  EXPECT_EQ(unpack_node.attr().at("T").type(), DT_INT32);
-  EXPECT_EQ(unpack_node.attr().at("num").i(), 3);
-  EXPECT_EQ(GetRetval(*vectorized, 0),
-            strings::StrCat(unpack_node.name(), ":output:0"));
-  EXPECT_EQ(GetRetval(*vectorized, 1),
-            strings::StrCat(unpack_node.name(), ":output:1"));
-  EXPECT_EQ(GetRetval(*vectorized, 2),
-            strings::StrCat(unpack_node.name(), ":output:2"));
-  EXPECT_EQ(vectorized->node_def_size(), 1);
-}
-
 // Before:
 //
 //                        +------+
@@ -543,43 +382,31 @@ TEST(VectorizeMapDefunTest, VectorizeDefunOpWithMultipleOutputs) {
 // +--------------+ Ret0 +--+ Ret1 +--+ Ret2 +----------+
 //                +------+  +------+  +------+
 //
-TEST(VectorizeMapDefunTest, VectorizeDefunChainedConvertibleOps) {
-  FunctionDef inner = CreateFunction(
-      "inner_function", {{"arg0", DT_INT32}},
-      {{"ret0", DT_INT32}, {"ret1", DT_INT32}, {"ret2", DT_INT32}},
+TEST(VectorizeMapDefunTest, VectorizeWithChainedConvertibleOps) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int32", "ret1: int32", "ret2: int32"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {Cast("Cast", {"arg0"}, DT_INT32, DT_INT32),
+       {{"MyUnstack"},
+        "Unpack",
+        {"Cast:y:0"},
+        {{"T", DT_INT32}, {"axis", 0}, {"num", 3}}}},
+      /*ret_def=*/
       {{"ret0", "MyUnstack:output:0"},
        {"ret1", "MyUnstack:output:1"},
        {"ret2", "MyUnstack:output:2"}});
-  NodeDef* cast_op =
-      AddCastNode("Cast", {"arg0"}, DT_INT32, DT_INT32, false, &inner);
-  CHECK_NOTNULL(cast_op);
-  NodeDef* unstack_op =
-      AddUnstackNode("MyUnstack", {"Cast:y:0"}, DT_INT32, 0, 3, &inner);
-  CHECK_NOTNULL(unstack_op);
-
-  FunctionDef outer = CreateFunction("outer_function", {{"x", DT_INT32}},
-                                     {{"mapdefun", DT_INT32},
-                                      {"mapdefun_0", DT_INT32},
-                                      {"mapdefun_1", DT_INT32}},
-                                     {{"mapdefun", "MapDefun:output:0"},
-                                      {"mapdefun_0", "MapDefun:output:1"},
-                                      {"mapdefun_1", "MapDefun:output:2"}});
-
-  NodeDef* map_defun = AddMapDefunNode(
-      "MapDefun", {"x"}, {DT_INT32}, {DT_INT32, DT_INT32, DT_INT32},
-      {{1}, {1}, {1}}, inner.signature().name(), &outer);
-  CHECK_NOTNULL(map_defun);
 
   FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
   FunctionDef* vectorized;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
   EXPECT_TRUE(
       !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
   const NodeDef& cast_node = vectorized->node_def(
       function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
-  EXPECT_EQ(cast_node.input(0), "x");
+  EXPECT_EQ(cast_node.input(0), vectorized->signature().input_arg(0).name());
   const NodeDef& unpack_node = vectorized->node_def(
       function_utils::FindFunctionNodeWithOp("Unpack", *vectorized));
   EXPECT_EQ(unpack_node.input(0), strings::StrCat(cast_node.name(), ":y:0"));
@@ -627,34 +454,23 @@ TEST(VectorizeMapDefunTest, VectorizeDefunChainedConvertibleOps) {
 //
 //  No change because we don't deal with control inputs for now.
 //
-TEST(VectorizeMapDefunTest, VectorizeDefunWithControlInputs) {
-  FunctionDef inner =
-      CreateFunction("inner_function", {{"arg0", DT_INT32}},
-                     {{"ret0", DT_INT64}}, {{"ret0", "Cast:y:0"}});
-  NodeDef* print_op = function_utils::AddNode(
-      "Print", "Print", {"arg0", "arg0"}, {/*attrs*/}, &inner);
-  graph_transforms::SetNodeAttr("T", DT_INT32, print_op);
-  graph_transforms::SetNodeAttr("U", gtl::ArraySlice<DataType>({DT_INT32}),
-                                print_op);
-  CHECK_NOTNULL(print_op);
-  NodeDef* cast_op = AddCastNode("Cast", {"arg0", "^Print"}, DT_INT32, DT_INT64,
-                                 false, &inner);
-  CHECK_NOTNULL(cast_op);
-
-  FunctionDef outer = CreateFunction("outer_function", {{"x", DT_INT32}},
-                                     {{"mapdefun", DT_INT64}},
-                                     {{"mapdefun", "MapDefun:output:0"}});
-
-  NodeDef* map_defun =
-      AddMapDefunNode("MapDefun", {"x"}, {DT_INT32}, {DT_INT64}, {{}},
-                      inner.signature().name(), &outer);
-  CHECK_NOTNULL(map_defun);
+TEST(VectorizeMapDefunTest, VectorizeWithControlInputs) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {{{"Print"},
+        "Print",
+        {"arg0", "arg0"},
+        {{"T", DT_INT32}, {"U", gtl::ArraySlice<DataType>({DT_INT32})}}},
+       Cast("Cast", {"arg0", "^Print"}, DT_INT32, DT_INT64)},
+      /*ret_def=*/{{"ret0", "Cast:y:0"}});
 
   FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
   FunctionDef* vectorized;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
   // They should be unchanged
   // We check this somewhat manually as the names of nodes may have changed
   EXPECT_EQ(vectorized->node_def_size(), 1);
@@ -689,6 +505,10 @@ TEST(VectorizeMapDefunTest, VectorizeDefunWithControlInputs) {
 // |   |           |Const |     |   |
 // |   |           +---+--+     |   |
 // |   |               |        |   |
+// |   |           +---v--+     |   |
+// |   |           | Cast |     |   |
+// |   |           +---+--+     |   |
+// |   |               |        |   |
 // |   | MapDefun  +---v--+     |   |
 // |   +-----------+ Ret0 +-----+   |
 // |               +---+--+         |
@@ -708,7 +528,9 @@ TEST(VectorizeMapDefunTest, VectorizeDefunWithControlInputs) {
 // |               |Const |         |
 // |               +---+--+         |
 // |                   |            |
-// |                   |            |
+// |               +---v--+         |
+// |               | Cast |         |
+// |               +---+--+         |
 // |                   |            |
 // |               +---v--+         |
 // |               |Stack*|         |
@@ -721,27 +543,28 @@ TEST(VectorizeMapDefunTest, VectorizeDefunWithControlInputs) {
 //                 +------+
 // *Not actually a Stack node, but does the equivalent.
 //
-TEST(VectorizeMapDefunTest, VectorizeConst) {
+TEST(VectorizeMapDefunTest, VectorizeWithUnstackedOutput) {
   FunctionDef inner = FunctionDefHelper::Create(
-      "inner_function", {"arg0: int32"}, {"ret0: int32"}, {/* attrs */},
-      {/* nodes */ FunctionDefHelper::Const("Const", 2)},
-      {{"ret0", "Const:output:0"}});
-  FunctionDef outer = FunctionDefHelper::Create(
-      "outer_function", {"outer_arg0: int32"}, {"mapdefun: int32"},
-      {/* attrs */}, {/* nodes */}, {{"mapdefun", "MapDefun:output:0"}});
-
-  NodeDef* map_defun =
-      AddMapDefunNode("MapDefun", {"outer_arg0"}, {DT_INT32}, {DT_INT32}, {{}},
-                      inner.signature().name(), &outer);
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {FunctionDefHelper::Const("Const", 2),
+       Cast("Cast", {"Const:output:0"}, DT_INT32, DT_INT64)},
+      /*ret_def=*/{{"ret0", "Cast:y:0"}});
 
   FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
   FunctionDef* vectorized;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
   EXPECT_TRUE(
       !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
-  EXPECT_TRUE(function_utils::ContainsFunctionNodeWithOp("Const", *vectorized));
+  auto const_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Const", *vectorized));
+  auto cast_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
+  EXPECT_EQ(cast_node.input(0).substr(0, cast_node.input(0).find(':')),
+            const_node.name());
 }
 
 // Before:
@@ -755,13 +578,11 @@ TEST(VectorizeMapDefunTest, VectorizeConst) {
 // |   +-----------+ Arg0 +-----+   |
 // |   |           +------+     |   |
 // |   |                        |   |
-// |   |                        |   |
-// |   |           +------+     |   |
-// |   |           |Const |     |   |
-// |   |           +---+--+     |   |
-// |   |               |        |   |
-// |   |           +---v--+     |   |
-// |   |           | Cast |     |   |
+// |   | +------+  +------+     |   |
+// |   | |Const |  |Const |     |   |
+// |   | +---+--+  +---+--+     |   |
+// |   |     :     +---v--+     |   |
+// |   |     ::::::> Cast |     |   |
 // |   |           +---+--+     |   |
 // |   |               |        |   |
 // |   | MapDefun  +---v--+     |   |
@@ -775,57 +596,148 @@ TEST(VectorizeMapDefunTest, VectorizeConst) {
 //
 //  After:
 //
+//
 //                 +------+
 // +---------------+ Arg0 +---------+
 // |               +------+         |
 // |                                |
+// |                                |
 // |               +------+         |
-// |               |Const |         |
-// |               +---+--+         |
-// |                   |            |
-// |               +---v--+         |
-// |               | Cast |         |
+// |     +------+  |Const |         |
+// |     |Const |  +---+--+         |
+// |     +---+--+      |            |
+// |         :     +---v--+         |
+// |         ::::::> Cast |         |
 // |               +---+--+         |
 // |                   |            |
 // |               +---v--+         |
-// |               |Stack*|         |
+// |               +Stack*+         |
 // |               +---+--+         |
 // |                   |            |
-// |                   |            |
-// |                   |            |
 // |               +---v--+         |
 // +---------------+ Ret0 +---------+
 //                 +------+
 // *Not actually a Stack node, but does the equivalent.
 //
-TEST(VectorizeMapDefunTest, VectorizeUnstackedOutput) {
+TEST(VectorizeMapDefunTest, VectorizeWithUnstackedControl) {
   FunctionDef inner = FunctionDefHelper::Create(
-      "inner_function", {"arg0: int32"}, {"ret0: int64"}, {/* attrs */},
-      {/* nodes */ FunctionDefHelper::Const("Const", 2)},
-      {{"ret0", "Cast:y:0"}});
-  AddCastNode("Cast", {"Const:output:0"}, DT_INT32, DT_INT64, false, &inner);
-
-  FunctionDef outer = FunctionDefHelper::Create(
-      "outer_function", {"outer_arg0: int32"}, {"mapdefun: int64"},
-      {/* attrs */}, {/* nodes */}, {{"mapdefun", "MapDefun:output:0"}});
-
-  NodeDef* map_defun =
-      AddMapDefunNode("MapDefun", {"outer_arg0"}, {DT_INT32}, {DT_INT64}, {{}},
-                      inner.signature().name(), &outer);
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {FunctionDefHelper::Const("Const", 2),
+       FunctionDefHelper::Const("ConstDep", 3),
+       Cast("Cast", {"Const:output:0", "^ConstDep"}, DT_INT32, DT_INT64)},
+      /*ret_def=*/{{"ret0", "Cast:y:0"}});
 
   FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
   FunctionDef* vectorized;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+
+  auto find_const = [vectorized](int val) -> const NodeDef* {
+    for (const auto& n : vectorized->node_def()) {
+      if (n.attr().at("value").tensor().int_val(0) == val) {
+        return &n;
+      }
+    }
+    return nullptr;
+  };
+
   EXPECT_TRUE(
       !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
-  auto const_node = vectorized->node_def(
-      function_utils::FindFunctionNodeWithOp("Const", *vectorized));
+  auto const_node = find_const(2);
+  auto const_dep_node = find_const(3);
   auto cast_node = vectorized->node_def(
       function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
   EXPECT_EQ(cast_node.input(0).substr(0, cast_node.input(0).find(':')),
-            const_node.name());
+            const_node->name());
+  EXPECT_EQ(cast_node.input(1), strings::StrCat("^", const_dep_node->name()));
+}
+
+///==================================//
+// Tests for specific op vectorizers //
+///==================================//
+
+// Before:
+//
+//                        +------+
+// +----------------------+ Arg0 +----------------------+
+// |                      +---+--+                      |
+// |                          |                         |
+// |                      +---v--+                      |
+// |   +------------------+ Arg0 +------------------+   |
+// |   |                  +---+--+                  |   |
+// |   |                      |                     |   |
+// |   |                      |                     |   |
+// |   |                  +---v---+ num=3           |   |
+// |   |                  |Unstack| axis=0          |   |
+// |   |                  ++--+--++                 |   |
+// |   |                   |  |  |                  |   |
+// |   |              +----+  |  +-------+          |   |
+// |   |              |       |          |          |   |
+// |   | MapDefun +---v--+  +-v----+  +--v---+      |   |
+// |   +----------+ Ret0 +--+ Ret1 +--+ Ret2 +------+   |
+// |              +---+--+  +--+---+  +--+---+          |
+// |                  |        |         |              |
+// |              +---v--+  +--v---+  +--v---+          |
+// +--------------+ Ret0 +--+ Ret1 +--+ Ret2 +----------+
+//                +------+  +------+  +------+
+//
+//
+//  After:
+//
+//                        +------+
+// +----------------------+ Arg0 +----------------------+
+// |                      +---+--+                      |
+// |                          |                         |
+// |                          |                         |
+// |                          |                         |
+// |                      +---v---+ num=3               |
+// |                      |Unstack| axis=1              |
+// |                      ++--+--++                     |
+// |                       |  |  |                      |
+// |                  +----+  |  +-------+              |
+// |                  |       |          |              |
+// |                  |       |          |              |
+// |              +---v--+  +-v----+  +--v---+          |
+// +--------------+ Ret0 +--+ Ret1 +--+ Ret2 +----------+
+//                +------+  +------+  +------+
+//
+TEST(VectorizerTest, VectorizeUnstack) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int32", "ret1: int32", "ret2: int32"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {{{"MyUnstack"},
+        "Unpack",
+        {"arg0"},
+        {{"T", DT_INT32}, {"axis", 0}, {"num", 3}}}},
+      /*ret_def=*/
+      {{"ret0", "MyUnstack:output:0"},
+       {"ret1", "MyUnstack:output:1"},
+       {"ret2", "MyUnstack:output:2"}});
+
+  FunctionDefLibrary lib;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  const NodeDef& unpack_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Unpack", *vectorized));
+  EXPECT_EQ(unpack_node.input(0), vectorized->signature().input_arg(0).name());
+  EXPECT_EQ(unpack_node.attr().at("axis").i(), 1);
+  EXPECT_EQ(unpack_node.attr().at("T").type(), DT_INT32);
+  EXPECT_EQ(unpack_node.attr().at("num").i(), 3);
+  EXPECT_EQ(GetRetval(*vectorized, 0),
+            strings::StrCat(unpack_node.name(), ":output:0"));
+  EXPECT_EQ(GetRetval(*vectorized, 1),
+            strings::StrCat(unpack_node.name(), ":output:1"));
+  EXPECT_EQ(GetRetval(*vectorized, 2),
+            strings::StrCat(unpack_node.name(), ":output:2"));
+  EXPECT_EQ(vectorized->node_def_size(), 1);
 }
 
 // Before:
@@ -837,13 +749,11 @@ TEST(VectorizeMapDefunTest, VectorizeUnstackedOutput) {
 // |                   |            |
 // |               +---v--+         |
 // |   +-----------+ Arg0 +-----+   |
-// |   |           +------+     |   |
-// |   |                        |   |
-// |   | +------+  +------+     |   |
-// |   | |Const |  |Const |     |   |
-// |   | +---+--+  +---+--+     |   |
-// |   |     :     +---v--+     |   |
-// |   |     ::::::> Cast |     |   |
+// |   |           +---+--+     |   |
+// |   |               |        |   |
+// |   |               |        |   |
+// |   |           +---v--+     |   |
+// |   |           | Cast |     |   |
 // |   |           +---+--+     |   |
 // |   |               |        |   |
 // |   | MapDefun  +---v--+     |   |
@@ -857,81 +767,618 @@ TEST(VectorizeMapDefunTest, VectorizeUnstackedOutput) {
 //
 //  After:
 //
-//
 //                 +------+
 // +---------------+ Arg0 +---------+
-// |               +------+         |
-// |                                |
-// |                                |
-// |               +------+         |
-// |     +------+  |Const |         |
-// |     |Const |  +---+--+         |
-// |     +---+--+      |            |
-// |         :     +---v--+         |
-// |         ::::::> Cast |         |
 // |               +---+--+         |
 // |                   |            |
 // |               +---v--+         |
-// |               +Stack*+         |
+// |               | Cast |         |
 // |               +---+--+         |
 // |                   |            |
 // |               +---v--+         |
 // +---------------+ Ret0 +---------+
 //                 +------+
-// *Not actually a Stack node, but does the equivalent.
 //
-TEST(VectorizeMapDefunTest, VectorizeUnstackedControl) {
+TEST(VectorizerTest, VectorizeCast) {
   FunctionDef inner = FunctionDefHelper::Create(
-      "inner_function", {"arg0: int32"}, {"ret0: int64"}, {/* attrs */},
-      {/* nodes */ FunctionDefHelper::Const("Const", 2),
-       FunctionDefHelper::Const("ConstDep", 3)},
-      {{"ret0", "Cast:y:0"}});
-  AddCastNode("Cast", {"Const:output:0", "^ConstDep"}, DT_INT32, DT_INT64,
-              false, &inner);
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{Cast("Cast", {"arg0"}, DT_INT32, DT_INT64)},
+      /*ret_def=*/{{"ret0", "Cast:y:0"}});
 
-  FunctionDef outer = FunctionDefHelper::Create(
-      "outer_function", {"outer_arg0: int32"}, {"mapdefun: int64"},
-      {/* attrs */}, {/* nodes */}, {{"mapdefun", "MapDefun:output:0"}});
+  FunctionDefLibrary lib;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  const NodeDef& cast_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
+  EXPECT_EQ(cast_node.input(0), vectorized->signature().input_arg(0).name());
+  EXPECT_EQ(GetRetval(*vectorized, 0),
+            strings::StrCat(cast_node.name(), ":y:0"));
+  EXPECT_EQ(vectorized->node_def_size(), 1);
+}
 
-  NodeDef* map_defun =
-      AddMapDefunNode("MapDefun", {"outer_arg0"}, {DT_INT32}, {DT_INT64}, {{}},
-                      inner.signature().name(), &outer);
+// Before:
+//
+//                   +------+
+// +-----------------+ Arg0 +----------------------+
+// |                 +---+--+                      |
+// |                     |                         |
+// |                 +---v--+                      |
+// |   +-------------+ Arg0 +------------------+   |
+// |   |             +---+--+                  |   |
+// |   |                 |                     |   |
+// |   |                 |          +-----+    |   |
+// |   |                 |          |Const|    |   |
+// |   |                 |          +-+---+    |   |
+// |   |                 |            |        |   |
+// |   |                 |   +--------+        |   |
+// |   |                 |   |                 |   |
+// |   |               +-v---v-+               |   |
+// |   |               |  Add  |               |   |
+// |   |               +-+-----+               |   |
+// |   |                 |                     |   |
+// |   |                 |                     |   |
+// |   | MapDefun      +-v----+                |   |
+// |   +---------------| Ret  |----------------+   |
+// |                   +--v---+                    |
+// |                      |                        |
+// |                      |                        |
+// |                   +--v----                    |
+// +-------------------| Ret  |--------------------+
+//                     +------+
+//
+//
+//  After:
+//
+//              +------+
+// +------------+ Arg0 +----------------------+
+// |            +---+--+                      |
+// |                |                         |
+// |                |              +-----+    |
+// |                |              |Const|    |
+// |              +-v---------+    +--+--+    |
+// |              |ExpandDims*|       |       |
+// |              +-----+-----+       |       |
+// |                    |             |       |
+// |                    +-----+ +-----+       |
+// |                          | |             |
+// |                        +-v-v-+           |
+// |                        | Add |           |
+// |                        +--+--+           |
+// |                           |              |
+// |                       +---v--+           |
+// +-----------------------+ Ret  +-----------+
+//                         +------+
+//
+TEST(VectorizerTest, VectorizeAdd) {
+  // Note that this checks that the "Add" vectorizer is successful, but does not
+  // check that the transformed function is correct (i.e. produces the same
+  // output as the unvectorized map defun). For the latter, the tests are in
+  // tensorflow/python/data/experimental/kernel_tests/optimization/
+  // map_vectorization_test.py
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int32"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {FunctionDefHelper::Const("Const", 2),
+       {{"Add"}, "Add", {"arg0", "Const:output:0"}, {{"T", DT_INT32}}}},
+      /*ret_def=*/{{"ret0", "Add:z:0"}});
 
   FunctionDefLibrary lib;
-  *lib.add_function() = outer;
-  *lib.add_function() = inner;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+}
+
+// Tests that a function which applies a cwise op can be vectorized completely.
+Status CwiseTestHelper(DataType input_type, const string& op_type,
+                       size_t arity) {
+  // Note that this checks that the cwise op vectorizer is successful, but does
+  // not check that the transformed function is correct (i.e. produces the same
+  // output as the unvectorized map defun). For the latter, the tests are in
+  // tensorflow/python/data/experimental/kernel_tests/optimization/
+  // map_vectorization_test.py
+
+  FunctionDef inner;
+  // Create inner function with a single operation of type op_type. The output
+  // type attr of the function is inferred by NodeBuilder.
+  Node *op, *retval;
+  Graph graph(OpRegistry::Global());
+
+  auto node_builder = NodeBuilder("op", op_type);
+  for (size_t i = 0; i < arity; ++i) {
+    Node* arg;
+    TF_RETURN_IF_ERROR(NodeBuilder(strings::StrCat("arg", i), "_Arg")
+                           .Attr("T", input_type)
+                           .Attr("index", static_cast<int>(i))
+                           .Finalize(&graph, &arg));
+
+    node_builder = node_builder.Input(arg);
+  }
+  TF_RETURN_IF_ERROR(node_builder.Finalize(&graph, &op));
 
+  TF_RETURN_IF_ERROR(NodeBuilder("ret", "_Retval")
+                         .Input(op)
+                         .Attr("index", 0)
+                         .Finalize(&graph, &retval));
+
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(graph, "inner_function", &inner));
+
+  FunctionDefLibrary lib;
   FunctionDef* vectorized;
-  EXPECT_TRUE(VectorizeMapDefun(outer, *map_defun, &lib, &vectorized).ok());
+  TF_RETURN_IF_ERROR(WrapAndVectorize(inner, &lib, &vectorized));
 
-  auto find_const = [vectorized](int val) -> const NodeDef* {
-    for (const auto& n : vectorized->node_def()) {
-      if (n.attr().at("value").tensor().int_val(0) == val) {
-        return &n;
-      }
-    }
-    return nullptr;
-  };
+  return function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized)
+             ? errors::Internal(
+                   "Test for cwise vectorizer for op \"", op_type,
+                   "\" failed. The function was not fully vectorized.")
+             : Status::OK();
+}
+
+class BitwiseUnaryTest : public ::testing::TestWithParam<const char*> {};
 
+TEST_P(BitwiseUnaryTest, VectorizeCwiseBitwiseUnary) {
+  TF_EXPECT_OK(CwiseTestHelper(DT_INT32, GetParam(), 1));
+}
+
+INSTANTIATE_TEST_CASE_P(Test, BitwiseUnaryTest, ::testing::Values("Invert"));
+
+class LogicalUnaryTest : public ::testing::TestWithParam<const char*> {};
+
+TEST_P(LogicalUnaryTest, VectorizeCwiseLogicalUnary) {
+  TF_EXPECT_OK(CwiseTestHelper(DT_BOOL, GetParam(), 1));
+}
+
+INSTANTIATE_TEST_CASE_P(Test, LogicalUnaryTest,
+                        ::testing::Values("LogicalNot"));
+
+class ComplexUnaryTest : public ::testing::TestWithParam<const char*> {};
+
+TEST_P(ComplexUnaryTest, VectorizeCwiseComplexUnary) {
+  TF_EXPECT_OK(CwiseTestHelper(DT_COMPLEX64, GetParam(), 1));
+}
+
+INSTANTIATE_TEST_CASE_P(Test, ComplexUnaryTest,
+                        ::testing::Values("Angle", "ComplexAbs", "Conj", "Imag",
+                                          "Real"));
+
+class RealUnaryTest : public ::testing::TestWithParam<const char*> {};
+
+TEST_P(RealUnaryTest, VectorizeCwiseRealUnary) {
+  TF_EXPECT_OK(CwiseTestHelper(DT_FLOAT, GetParam(), 1));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    Test, RealUnaryTest,
+    ::testing::Values("Abs", "Acos", "Acosh", "Asin", "Asinh", "Atan", "Atanh",
+                      "BesselI0e", "BesselI1e", "Ceil", "Cos", "Cosh",
+                      "Digamma", "Elu", "Erf", "Erfc", "Exp", "Expm1", "Floor",
+                      "Inv", "IsFinite", "IsInf", "Lgamma", "Log", "Log1p",
+                      "Neg", "Reciprocal", "Relu", "Relu6", "Rint", "Round",
+                      "Rsqrt", "Selu", "Sigmoid", "Sign", "Sin", "Sinh",
+                      "Softplus", "Softsign", "Sqrt", "Square", "Tanh", "Tan"));
+
+class BitwiseBinaryTest : public ::testing::TestWithParam<const char*> {};
+
+TEST_P(BitwiseBinaryTest, VectorizeCwiseBitwiseBinary) {
+  TF_EXPECT_OK(CwiseTestHelper(DT_INT32, GetParam(), 2));
+}
+
+INSTANTIATE_TEST_CASE_P(Test, BitwiseBinaryTest,
+                        ::testing::Values("BitwiseAnd", "BitwiseOr",
+                                          "BitwiseXor", "LeftShift",
+                                          "RightShift"));
+
+class LogicalBinaryTest : public ::testing::TestWithParam<const char*> {};
+
+TEST_P(LogicalBinaryTest, VectorizeCwiseLogicalBinary) {
+  TF_EXPECT_OK(CwiseTestHelper(DT_BOOL, GetParam(), 2));
+}
+
+INSTANTIATE_TEST_CASE_P(Test, LogicalBinaryTest,
+                        ::testing::Values("LogicalAnd", "LogicalOr"));
+
+class RealBinaryTest : public ::testing::TestWithParam<const char*> {};
+
+TEST_P(RealBinaryTest, VectorizeCwiseRealBinary) {
+  TF_EXPECT_OK(CwiseTestHelper(DT_FLOAT, GetParam(), 2));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    Test, RealBinaryTest,
+    ::testing::Values("Add", "AddV2", "Atan2", "Complex", "Div", "DivNoNan",
+                      "Equal", "FloorDiv", "FloorMod", "Greater",
+                      "GreaterEqual", "Igamma", "Igammac", "IgammaGradA",
+                      "Less", "LessEqual", "Maximum", "Minimum", "Mod", "Mul",
+                      "NotEqual", "Polygamma", "Pow", "RealDiv",
+                      "SquaredDifference", "Sub", "TruncateDiv", "TruncateMod",
+                      "Zeta"));
+
+// Before:
+//
+//
+//                 +------+
+// +---------------+ Arg0 +---------------------+
+// |               +---+--+                     |
+// |                   |                        |
+// |               +---v--+                     |
+// |   +-----------+ Arg0 +-----------------+   |
+// |   |           +---+--+                 |   |
+// |   |               |                    |   |
+// |   |               |                    |   |
+// |   |               |   (3,3,3)          |   |
+// |   |               |   +-----+          |   |
+// |   |               |   |Const|          |   |
+// |   |               |   +--+--+          |   |
+// |   |               |      |             |   |
+// |   |               | +----+             |   |
+// |   |           +---v-v-+                |   |
+// |   |           |Reshape|                |   |
+// |   |           +---+---+                |   |
+// |   |               |                    |   |
+// |   | MapDefun  +---v--+                 |   |
+// |   +-----------+ Ret0 +-----------------+   |
+// |               +---+--+                     |
+// |                   |                        |
+// |               +---v--+                     |
+// +---------------+ Ret0 +---------------------+
+//                 +------+
+//
+//
+//  After:
+//
+//           +------+
+// +---------+ Arg0 +------------------------+
+// |         +---+--+                        |
+// |             |                           |
+// |             |                           |
+// |             |     +-----+               |
+// |             |     |Const|               |
+// |             |     +--+--+               |
+// |             |        |                  |
+// |             |    +---v---+              |
+// |             |    |Concat*|              |
+// |             |    +---+---+              |
+// |             |        |                  |
+// |             | +------+                  |
+// |             | |                         |
+// |         +---v-v-+                       |
+// |         |Reshape|                       |
+// |         +---+---+                       |
+// |             |                           |
+// |         +---v--+                        |
+// +---------+ Ret0 +------------------------+
+//           +------+
+//
+// (Where Concat* appends the 0th dim of the input to the new shape)
+//
+TEST(VectorizerTest, VectorizeReshape) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int32"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {FunctionDefHelper::Const("Const", gtl::ArraySlice<int>({3, 3, 3})),
+       {{"Reshape"},
+        "Reshape",
+        {"arg0", "Const:output:0"},
+        {{"T", DT_INT32}, {"Tshape", DT_INT32}}}},
+      /*ret_def=*/{{"ret0", "Reshape:output:0"}});
+
+  FunctionDefLibrary lib;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
   EXPECT_TRUE(
       !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
-  auto const_node = find_const(2);
-  auto const_dep_node = find_const(3);
-  auto cast_node = vectorized->node_def(
-      function_utils::FindFunctionNodeWithOp("Cast", *vectorized));
-  EXPECT_EQ(cast_node.input(0).substr(0, cast_node.input(0).find(':')),
-            const_node->name());
-  EXPECT_EQ(cast_node.input(1), strings::StrCat("^", const_dep_node->name()));
+  EXPECT_TRUE(
+      function_utils::ContainsFunctionNodeWithOp("Reshape", *vectorized));
+  auto reshape_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Reshape", *vectorized));
+  EXPECT_EQ(GetRetval(*vectorized, 0),
+            strings::StrCat(reshape_node.name(), ":output:0"));
+}
+
+// Before:
+//
+//
+//                 +------+
+// +---------------+ Arg0 +---------------------+
+// |               +---+--+                     |
+// |                   |                        |
+// |               +---v--+                     |
+// |   +-----------+ Arg0 +-----------------+   |
+// |   |           +---+--+                 |   |
+// |   |               |                    |   |
+// |   |               |   record_defaults  |   |
+// |   |               |   +-----+  +-----+ |   |
+// |   |               |   |Const|  |Const| |   |
+// |   |               |   +--+--+  +--+--+ |   |
+// |   |               |      |        |    |   |
+// |   |               | +----+        |    |   |
+// |   |               | |             |    |   |
+// |   |               | | +-----------+    |   |
+// |   |               | | |                |   |
+// |   |           +---v-v-v-+              |   |
+// |   |           |DecodeCSV|              |   |
+// |   |           +---+---+-+              |   |
+// |   |               |   |                |   |
+// |   |               |   +------+         |   |
+// |   |               |          |         |   |
+// |   | MapDefun  +---v--+   +---v--+      |   |
+// |   +-----------+ Ret0 +---+ Ret1 +------+   |
+// |               +---+--+   +---+--+          |
+// |                   |          |             |
+// |               +---v--+   +---v--+          |
+// +---------------+ Ret0 +---+ Ret1 +----------+
+//                 +------+   +------+
+//
+//  After:
+//
+//           +------+
+// +---------+ Arg0 +------------------------+
+// |         +---+--+                        |
+// |             |                           |
+// |             |                           |
+// |             |     +-----+ +-----+       |
+// |             |     |Const| |Const|       |
+// |             |     +--+--+ +--+--+       |
+// |             |        |       |          |
+// |             |        |       |          |
+// |             | +------+       |          |
+// |             | | +------------+          |
+// |             | | |                       |
+// |             | | |                       |
+// |         +---v-v-v-+                     |
+// |         |DecodeCSV|                     |
+// |         +---+---+-+                     |
+// |             |   |                       |
+// |             |   +-------+               |
+// |             |           |               |
+// |           +-v----+   +--v---+           |
+// +-----------+ Ret0 +---+ Ret1 +-----------+
+//             +------+   +------+
+//
+//
+TEST(VectorizerTest, VectorizeDecodeCSV) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: string"},
+      /*out_def=*/{"ret0: int32", "ret1: string"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {FunctionDefHelper::Const("Default0", gtl::ArraySlice<int>({2})),
+       FunctionDefHelper::Const("Default1", gtl::ArraySlice<string>({})),
+       {{"DecodeCSV"},
+        "DecodeCSV",
+        {"arg0", "Default0:output:0", "Default1:output:0"},
+        {{"OUT_TYPE", DataTypeVector({DT_INT32, DT_STRING})}}}},
+      /*ret_def=*/
+      {{"ret0", "DecodeCSV:output:0"}, {"ret1", "DecodeCSV:output:1"}});
+
+  FunctionDefLibrary lib;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+}
+
+TEST(VectorizerTest, VectorizeDecodeCSVWithStackedDefaults) {
+  // When the `record_defaults` input to DecodeCSV are stacked,
+  // the node should not be vectorized.
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: string", "arg1: int32", "arg2: string"},
+      /*out_def=*/{"ret0: int32", "ret1: string"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {{{"DecodeCSV"},
+        "DecodeCSV",
+        {"arg0", "arg1", "arg2"},  // Inputs come from args, which are "stacked"
+        {{"OUT_TYPE", DataTypeVector({DT_INT32, DT_STRING})}}}},
+      /*ret_def=*/
+      {{"ret0", "DecodeCSV:output:0"}, {"ret1", "DecodeCSV:output:1"}});
+
+  FunctionDefLibrary lib;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+  EXPECT_TRUE(
+      function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+}
+
+// Before:
+//
+//
+//                 +------+
+// +---------------+ Arg0 +---------------------+
+// |               +---+--+                     |
+// |                   |                        |
+// |               +---v--+                     |
+// |   +-----------+ Arg0 +-----------------+   |
+// |   |           +---+--+                 |   |
+// |   |               |                    |   |
+// |   |               |   dense_defaults   |   |
+// |   |               |   +-----+  +-----+ |   |
+// |   |               |   |Const|  |Const| |   |
+// |   |               |   +--+--+  +--+--+ |   |
+// |   |               |      |        |    |   |
+// |   |               | +----+        |    |   |
+// |   |               | |             |    |   |
+// |   |               | | +-----------+    |   |
+// |   |               | | |                |   |
+// |   |           +---v-v-v----------+     |   |
+// |   |           |ParseSingleExample|     |   |
+// |   |           +---+---+----------+     |   |
+// |   |               |                    |   |
+// |   |             (...)                  |   |
+// |   |               |                    |   |
+// |   | MapDefun  +---v--+                 |   |
+// |   +-----------+ Rets*+-----------------+   |
+// |               +---+--+                     |
+// |                   |                        |
+// |               +---v--+                     |
+// +---------------+ Rets*+---------------------+
+//                 +------+
+//
+//  After:
+//
+//           +------+
+// +---------+ Arg0 +------------------------------------+
+// |         +---+--+                                    |
+// |             |                                       |
+// |             |   names                               |
+// |             |   sparse_types                        |
+// |             |   dense_types   dense_defaults        |
+// |             |  +============+ +-----+ +-----+       |
+// |             |  |  Consts*   | |Const| |Const|       |
+// |             |  +============+ +--+--+ +--+--+       |
+// |             |       |            |       |          |
+// |             |     (...)          |       |          |
+// |             |       |     +------+       |          |
+// |             |       |     | +------------+          |
+// |             |       |     | |                       |
+// |             |       |     | |                       |
+// |         +---v-------v-----v-v-+                     |
+// |         |  ParseExample       |                     |
+// |         +---+-----------------+                     |
+// |             |                                       |
+// |           (...)                                     |
+// |             |                                       |
+// |           +-v----+                                  |
+// +-----------+ Rets*+----------------------------------+
+//             +------+
+//
+// *Multiple nodes. Only one drawn for brevity.
+//
+TEST(VectorizerTest, VectorizeParseSingleExample) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: string"},
+      /*out_def=*/
+      {"si0: int64", "si1: int64", "sv0: int64", "sv1: string", "ss0: int64",
+       "ss1: int64", "dv0: int64", "dv1: string"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {FunctionDefHelper::Const("DenseIntDefault", static_cast<int64>(0)),
+       FunctionDefHelper::Const("DenseStrDefault", string("")),
+       {{"Parse"},
+        "ParseSingleExample",
+        {"arg0", "DenseIntDefault:output:0", "DenseStrDefault:output:0"},
+        {
+            {"Tdense", DataTypeVector({DT_INT64, DT_STRING})},
+            {"dense_keys", gtl::ArraySlice<string>({"dense_int", "dense_str"})},
+            {"dense_shapes", gtl::ArraySlice<TensorShape>({}, {})},
+            {"num_sparse", 2},
+            {"sparse_keys", gtl::ArraySlice<string>({"spar_int", "spar_str"})},
+            {"sparse_types", DataTypeVector({DT_INT64, DT_STRING})},
+        }}},
+      /*ret_def=*/
+      {
+          {"si0", "Parse:sparse_indices:0"},
+          {"si1", "Parse:sparse_indices:1"},
+          {"sv0", "Parse:sparse_values:0"},
+          {"sv1", "Parse:sparse_values:1"},
+          {"ss0", "Parse:sparse_shapes:0"},
+          {"ss1", "Parse:sparse_shapes:1"},
+          {"dv0", "Parse:dense_values:0"},
+          {"dv1", "Parse:dense_values:1"},
+      });
+
+  FunctionDefLibrary lib;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+  EXPECT_TRUE(
+      !function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  EXPECT_TRUE(
+      function_utils::ContainsFunctionNodeWithOp("ParseExample", *vectorized));
 }
 
-// TODO(rachelim): More test cases when we get around to implementing them:
-// [] A badly defined converter, e.g. doesn't produce nodes that have the
-//    same number of outputs/inputs as the nodes to be converted
-// [] Converter where the 'converted' form has multiple nodes.
-// [] Case with dependent nodes, e.g. ops with const inputs that are
-//    broadcasted.
-// [] Python-side tests to actually run the functions to make sure
-//    they work.
+TEST(VectorizerTest, VectorizeParseSingleExampleWithStackedDefaults) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: string", "arg1: string"},
+      /*out_def=*/{"dv0: int64", "dv1: string"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {FunctionDefHelper::Const("DenseIntDefault", static_cast<int64>(0)),
+       {{"Parse"},
+        "ParseSingleExample",
+        {"arg0", "DenseIntDefault:output:0", "arg1"},
+        {
+            {"Tdense", DataTypeVector({DT_INT64, DT_STRING})},
+            {"dense_keys", gtl::ArraySlice<string>({"dense_int", "dense_str"})},
+            {"dense_shapes", gtl::ArraySlice<TensorShape>({}, {})},
+            {"num_sparse", 0},
+            {"sparse_keys", gtl::ArraySlice<string>({})},
+            {"sparse_types", DataTypeVector({})},
+        }}},
+      /*ret_def=*/
+      {
+          {"dv0", "Parse:dense_values:0"},
+          {"dv1", "Parse:dense_values:1"},
+      });
+
+  FunctionDefLibrary lib;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+  EXPECT_TRUE(
+      function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+}
+
+TEST(VectorizerTest, VectorizeTranspose) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"out: int32"},
+      /*attr_def=*/{},
+      /*node_def=*/
+      {FunctionDefHelper::Const("Perm", gtl::ArraySlice<int>({1, 0})),
+       {{"Transpose"},
+        "Transpose",
+        {"arg0", "Perm:output:0"},
+        {{"T", DT_INT32}, {"Tperm", DT_INT32}}}},
+      /*ret_def=*/{{"out", "Transpose:y:0"}});
+
+  FunctionDefLibrary lib;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+  EXPECT_FALSE(
+      function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+}
+
+TEST(VectorizerTest, VectorizeIdentity) {
+  FunctionDef inner = FunctionDefHelper::Create(
+      /*function_name=*/"inner_function",
+      /*in_def=*/{"arg0: int32"},
+      /*out_def=*/{"ret0: int32"},
+      /*attr_def=*/{},
+      /*node_def=*/{{{"Identity"}, "Identity", {"arg0"}, {{"T", DT_INT32}}}},
+      /*ret_def=*/{{"ret0", "Identity:output:0"}});
+
+  FunctionDefLibrary lib;
+  FunctionDef* vectorized;
+  TF_ASSERT_OK(WrapAndVectorize(inner, &lib, &vectorized));
+
+  EXPECT_FALSE(
+      function_utils::ContainsFunctionNodeWithOp("MapDefun", *vectorized));
+  ASSERT_TRUE(
+      function_utils::ContainsFunctionNodeWithOp("Identity", *vectorized));
+  const NodeDef& identity_node = vectorized->node_def(
+      function_utils::FindFunctionNodeWithOp("Identity", *vectorized));
+
+  EXPECT_EQ(identity_node.input(0),
+            vectorized->signature().input_arg(0).name());
+  EXPECT_EQ(GetRetval(*vectorized, 0),
+            strings::StrCat(identity_node.name(), ":output:0"));
+  EXPECT_EQ(vectorized->node_def_size(), 1);
+}
 
 }  // namespace
 }  // namespace vectorization_utils
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index bb14ce310dc151d109b1106e82c424f59b9e6cec..7fee3ae9d51bcdb234945a6000985fb5531000a0 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -57,7 +57,7 @@ bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
 }  // namespace
 
 bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) const {
-  if (!IsIdentity(node)) {
+  if (!IsIdentity(node) && !IsIdentityN(node)) {
     return true;
   }
 
@@ -133,15 +133,53 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) const {
   return true;
 }
 
+int DependencyOptimizer::NumEdgesIfBypassed(
+    const NodeDef& node, const std::vector<NodeDef*>& output_nodes) const {
+  const bool is_multi_input_identity_n =
+      IsIdentityN(node) && !IsIdentityNSingleInput(node);
+  const int num_outputs = output_nodes.size();
+  const int num_inputs = node.input_size();
+
+  if (is_multi_input_identity_n) {
+    // multi-input identity_n with input/output control dependencies will likely
+    // increase number of edges after optimization.
+    int num_edges_if_bypassed(0);
+    for (string input_node_name : node.input()) {
+      if (IsControlInput(input_node_name)) {
+        num_edges_if_bypassed += num_outputs;
+      } else {
+        ++num_edges_if_bypassed;
+      }
+    }
+
+    for (auto consumer : output_nodes) {
+      for (int j = 0; j < consumer->input_size(); ++j) {
+        const TensorId consumer_input = ParseTensorName(consumer->input(j));
+        if (consumer_input.node() == node.name()) {
+          if (IsControlInput(consumer_input)) {
+            num_edges_if_bypassed += num_inputs;
+          } else {
+            ++num_edges_if_bypassed;
+          }
+        }
+      }
+    }
+    return num_edges_if_bypassed;
+  } else {
+    return num_inputs * num_outputs;
+  }
+}
+
 bool DependencyOptimizer::BypassingNodeIsBeneficial(
     const NodeDef& node, const std::vector<NodeDef*>& input_nodes,
     const std::vector<NodeDef*>& output_nodes) const {
-  const bool is_identity = IsIdentity(node);
+  const bool is_identity = IsIdentity(node) || IsIdentityNSingleInput(node);
+  const bool is_multi_input_identity_n =
+      IsIdentityN(node) && !IsIdentityNSingleInput(node);
   const int num_outputs = output_nodes.size();
   const int num_inputs = node.input_size();
 
-  // Don't increase the number of edges in the graph.
-  if (num_inputs * num_outputs > num_inputs + num_outputs) {
+  if (NumEdgesIfBypassed(node, output_nodes) > num_inputs + num_outputs) {
     return false;
   }
 
@@ -166,7 +204,9 @@ bool DependencyOptimizer::BypassingNodeIsBeneficial(
   for (NodeDef* output_node : output_nodes) {
     num_cross_out += static_cast<int>(output_node->device() != node_dev);
   }
-  if (is_identity && num_cross_in > 0 && num_cross_out > 0) {
+
+  if ((is_identity || is_multi_input_identity_n) && num_cross_in > 0 &&
+      num_cross_out > 0) {
     // This identity node follows a device crossing, so it might be
     // following a _Recv node after partioning. Do not remove such nodes,
     // unless they only have consumers on the same device as themselves.
@@ -193,7 +233,9 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
                                        std::set<int>* nodes_to_delete) {
   NodeDef* node = optimized_graph_->mutable_node(node_idx);
   const bool is_noop = IsNoOp(*node);
-  const bool is_identity = IsIdentity(*node);
+  const bool is_identity = IsIdentity(*node) || IsIdentityNSingleInput(*node);
+  const bool is_multi_input_identity =
+      IsIdentityN(*node) && !IsIdentityNSingleInput(*node);
   const string node_name = node->name();
   // Constant nodes with no input control dependency are always executed early,
   // so we can prune all their output control dependencies.
@@ -203,11 +245,9 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       bool optimize_fanout = false;
       bool data_connection = false;
       for (int i = fanout->input_size() - 1; i >= 0; --i) {
-        int pos;
-        StringPiece input_name =
-            ParseNodeNameAsStringPiece(fanout->input(i), &pos);
-        if (input_name == node_name) {
-          if (pos < 0) {
+        const TensorId input_tensor = ParseTensorName(fanout->input(i));
+        if (input_tensor.node() == node_name) {
+          if (input_tensor.index() < 0) {
             fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
             fanout->mutable_input()->RemoveLast();
             optimize_fanout = true;
@@ -315,7 +355,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
   //    y --^> |          | --^> b       /\    +---+
   //           +----------+             y --^> b
 
-  if (is_noop || (is_identity && SafeToRemoveIdentity(*node))) {
+  if (is_noop || ((is_identity || is_multi_input_identity) &&
+                  SafeToRemoveIdentity(*node))) {
     const auto& output_node_set = node_map_->GetOutputs(node_name);
     const std::vector<NodeDef*> output_nodes(output_node_set.begin(),
                                              output_node_set.end());
@@ -343,34 +384,30 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
         const NodeDef* input = input_nodes[i];
         // Forward dependency from input to consumer if it doesn't already
         // depend on it.
-        if (is_identity && i == 0) {
+        if ((is_identity && i == 0) ||
+            (is_multi_input_identity && !IsControlInput(node->input(i)))) {
           // Replace regular input from Identity node.
-          bool found_input = false;
           string new_input;
-          const string& input_to_forward = node->input(0);
+          const string& input_to_forward = node->input(i);
           CHECK(!IsControlInput(input_to_forward));
           for (int j = 0; j < consumer->input_size(); ++j) {
-            const string& old_input = consumer->input(j);
-            int old_input_pos;
-            StringPiece old_input_node_name =
-                ParseNodeNameAsStringPiece(old_input, &old_input_pos);
-            if (old_input_node_name == node_name) {
-              if (old_input_pos >= 0) {
+            const TensorId old_input = ParseTensorName(consumer->input(j));
+            if (old_input.node() == node_name) {
+              if (old_input.index() == i) {
                 // Regular input
                 new_input = input_to_forward;
-                node_map_->UpdateInput(consumer->name(), old_input, new_input);
+                node_map_->UpdateInput(consumer->name(), old_input.ToString(),
+                                       new_input);
                 consumer->set_input(j, new_input);
-                found_input = true;
-              } else {
+              } else if (old_input.index() == -1) {
                 // Control dependency
                 new_input = AsControlDependency(NodeName(input_to_forward));
-                node_map_->UpdateInput(consumer->name(), old_input, new_input);
+                node_map_->UpdateInput(consumer->name(), old_input.ToString(),
+                                       new_input);
                 consumer->set_input(j, new_input);
-                found_input = true;
               }
             }
           }
-          CHECK(found_input);
           updated_consumer = true;
         } else {
           // Forward dependency from input to consumer if it doesn't already
@@ -415,8 +452,8 @@ Status DependencyOptimizer::OptimizeDependencies() {
   std::set<int> nodes_to_delete;
   for (int i = 0; i < optimized_graph_->node_size(); ++i) {
     const NodeDef& node = optimized_graph_->node(i);
-    if (IsNoOp(node) || IsIdentity(node) || IsConstant(node) ||
-        SafeToConvertToNoOp(node)) {
+    if (IsNoOp(node) || IsIdentity(node) || IsIdentityN(node) ||
+        IsConstant(node) || SafeToConvertToNoOp(node)) {
       nodes_to_simplify.PushBack(i);
     }
   }
@@ -652,6 +689,7 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   const int num_iterations = 2;
   for (int iteration = 0; iteration < num_iterations; ++iteration) {
+    GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
     Status topo_sort_status;
     // Perform topological sort to prepare the graph for transitive reduction.
     topo_sort_status = TopologicalSort(optimized_graph_);
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 48cfa236af847ad16b9c5878ac469356080b21ec..7b032673fb3456a724d8021a5dcebc8b4c957ba8 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -48,7 +48,8 @@ class DependencyOptimizer : public GraphOptimizer {
   bool BypassingNodeIsBeneficial(
       const NodeDef& node, const std::vector<NodeDef*>& input_nodes,
       const std::vector<NodeDef*>& output_nodes) const;
-
+  int NumEdgesIfBypassed(const NodeDef& node,
+                         const std::vector<NodeDef*>& output_nodes) const;
   // Returns true if node is not an Identity node or if it is an Identity
   // that is safe to remove.
   bool SafeToRemoveIdentity(const NodeDef& node) const;
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index c0f07562affcde5a811751f7a066cf9db8b1a0e6..8d70d9d5c73690e87d84cf941c749948e47ace26 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -634,7 +634,7 @@ TEST_F(DependencyOptimizerTest, IdentityInputs) {
   EXPECT_EQ("s:1", output.node(5).input(0));
 }
 
-TEST_F(DependencyOptimizerTest, IdentityN) {
+TEST_F(DependencyOptimizerTest, RemoveIdentityN_SwitchInput) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
   Output b = ops::Placeholder(scope.WithOpName("b"), DT_BOOL);
   Output x = ops::RandomUniform(scope.WithOpName("x"), {1, 2}, DT_FLOAT);
@@ -643,8 +643,6 @@ TEST_F(DependencyOptimizerTest, IdentityN) {
   // IdentityN nodes to be removed.
   auto id_f = ops::IdentityN(scope.WithOpName("id_f"), {s.output_false});
   auto id_t = ops::IdentityN(scope.WithOpName("id_t"), {s.output_true});
-
-  // IdentityN node that can't be removed.
   auto id_b =
       ops::IdentityN(scope.WithOpName("id_b"), {s.output_false, s.output_true});
 
@@ -663,22 +661,50 @@ TEST_F(DependencyOptimizerTest, IdentityN) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(9, output.node_size());
-  EXPECT_EQ("out1", output.node(5).name());
-  EXPECT_EQ(1, output.node(5).input_size());
-  EXPECT_EQ("s", output.node(5).input(0));
+  EXPECT_EQ(8, output.node_size());
+
+  auto out1_node = output.node(7);
+  EXPECT_EQ("out1", out1_node.name());
+  EXPECT_EQ(1, out1_node.input_size());
+  EXPECT_EQ("s", out1_node.input(0));
+
+  auto out2_node = output.node(4);
+  EXPECT_EQ("out2", out2_node.name());
+  EXPECT_EQ(1, out2_node.input_size());
+  EXPECT_EQ("s:1", out2_node.input(0));
+
+  auto out3_node = output.node(5);
+  EXPECT_EQ("out3", out3_node.name());
+  EXPECT_EQ(1, out3_node.input_size());
+  EXPECT_EQ("s", out3_node.input(0));
+
+  auto out4_node = output.node(6);
+  EXPECT_EQ("out4", out4_node.name());
+  EXPECT_EQ(1, out4_node.input_size());
+  EXPECT_EQ("s:1", out4_node.input(0));
+}
 
-  EXPECT_EQ("out2", output.node(6).name());
-  EXPECT_EQ(1, output.node(6).input_size());
-  EXPECT_EQ("s:1", output.node(6).input(0));
+TEST_F(DependencyOptimizerTest, DoNotRemoveIdentityNWithControlDependency) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output input1 = ops::Placeholder(scope.WithOpName("input1"), DT_BOOL);
+  Output input2 = ops::Const(scope.WithOpName("input2"), {1, 2});
+
+  auto id_n = ops::IdentityN(scope.WithOpName("id_n"), {input1, input2});
+  Output out1 = ops::Identity(scope.WithOpName("out1"), id_n[0]);
+  Output out2 = ops::Identity(scope.WithOpName("out2"), id_n[1]);
+  auto out3 =
+      ops::NoOp(scope.WithOpName("out3").WithControlDependencies(id_n[1]));
 
-  EXPECT_EQ("out3", output.node(7).name());
-  EXPECT_EQ(1, output.node(7).input_size());
-  EXPECT_EQ("id_b", output.node(7).input(0));
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch = {"out1", "out2", "out3"};
+
+  DependencyOptimizer optimizer;
+  GraphDef optimized_graph_def;
+  Status status = optimizer.Optimize(nullptr, item, &optimized_graph_def);
+  TF_EXPECT_OK(status);
 
-  EXPECT_EQ("out4", output.node(8).name());
-  EXPECT_EQ(1, output.node(8).input_size());
-  EXPECT_EQ("id_b:1", output.node(8).input(0));
+  EXPECT_EQ(6, optimized_graph_def.node_size());
 }
 
 TEST_F(DependencyOptimizerTest,
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc b/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
index 2c36c9b7b314669402108c5f5a864eb731002fcf..75ad8bffefd8aa00bb1ba88c10ed9b1170a0d25f 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
+++ b/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -32,6 +34,73 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+Status UpdateNodeDef(NodeDef* node_def, const string& funcName,
+                     const FunctionApiInfo& apiInfo) {
+  VLOG(3) << "Node def before swap is: " << node_def->DebugString();
+  auto tin = node_def->mutable_attr()->find("Tin");
+  tin->second.mutable_list()->clear_type();
+  for (const auto& tin_dtype : apiInfo.input_arg_dtypes()) {
+    tin->second.mutable_list()->add_type(tin_dtype);
+  }
+
+  auto tout = node_def->mutable_attr()->find("Tout");
+  tout->second.mutable_list()->clear_type();
+  for (const auto& tout_dtype : apiInfo.output_arg_dtypes()) {
+    tout->second.mutable_list()->add_type(tout_dtype);
+  }
+
+  if (apiInfo.function_type() == FunctionApiInfo::BACKWARD) {
+    // Update the inputs since for backward function, it might have different
+    // number of inputs due the different number output from forward function.
+    // The output of forward function are composed by two parts:
+    //   1. Real output tensors from defun.
+    //   2. Internal states that will be used for gradient calculation.
+    // Part 1 will be static, and part 2 could be different based on the
+    // different implementation.
+
+    const int prev_input_size = node_def->input_size();
+    const int diff = prev_input_size - apiInfo.input_arg_dtypes().size();
+    if (diff >= 0) {
+      for (int i = 0; i < diff; ++i) node_def->mutable_input()->RemoveLast();
+    } else {
+      // Adding new inputs for internal states, the name of the internal states
+      // should be in format "{forward_node_name}:{index}", where the newly
+      // added index should start from last index of the state.
+      // Eg:
+      // {
+      //   input: "gradients/unified_lstm/strided_slice_1_grad/StridedSliceGrad"
+      //   input: "gradients/zeros_like_1"
+      //   input: "gradients/zeros_like_2"
+      //   input: "unified_lstm/StatefulPartitionedCall:3"
+      //   input: "unified_lstm/StatefulPartitionedCall:4"
+      //   # New input should be "unified_lstm/StatefulPartitionedCall:5"
+      // }
+      const string last_input = node_def->input(prev_input_size - 1);
+      const std::vector<string> name_index = ::absl::StrSplit(last_input, ':');
+      if (name_index.size() != 2) {
+        return errors::InvalidArgument(
+            "Invalid format of input node name: ", last_input,
+            " Expected: {forward_node_name}:{index}");
+      }
+      const absl::string_view node_name = name_index[0];
+      int last_index;
+      if (!::absl::SimpleAtoi(name_index[1], &last_index)) {
+        return errors::InvalidArgument(
+            "The index of input node is expected to be number, got: ",
+            name_index[1]);
+      }
+      for (int i = 1; i <= -diff; ++i)
+        node_def->add_input(strings::StrCat(node_name, ":", i + last_index));
+    }
+  }
+
+  node_def->mutable_attr()->find("f")->second.mutable_func()->set_name(
+      funcName);
+
+  VLOG(3) << "Node def after swap is: " << node_def->DebugString();
+  return Status::OK();
+}
+
 Status ExperimentalImplementationSelector::LoadFunctions(
     const GraphDef& graph) {
   lib_info_.reset(new FunctionLibraryApiInfo);
@@ -43,8 +112,11 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
     NodeDef* node_def) const {
   // There are two ways of calling functions:
   //  1. By specifying an op name as a function name, or
-  //  2. Via the @defun functional interface, where the real function name
-  //     appear as the attribute with type func.
+  //  2. Via the @defun functional interface, where the real function call
+  //     happens with partitionedcall op, and the function name appear as the
+  //     attribute with name "f" and type func. In this use case, there are more
+  //     attributes need to be taken care, like Tin and Tout which take care of
+  //     the DTYPE of input/output.
   std::vector<string> function_attribute_names;
   for (const auto& attr : node_def->attr()) {
     if (attr.second.has_func() &&
@@ -70,22 +142,29 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
 
   for (const auto& attr_name : function_attribute_names) {
     string function_name = node_def->attr().at(attr_name).func().name();
-    string best_function_name;
-    lib_info_->GetBestImplementation(function_name, parsed_name.type,
-                                     &best_function_name);
-    if (function_name != best_function_name) {
-      node_def->mutable_attr()
-          ->find(attr_name)
-          ->second.mutable_func()
-          ->set_name(best_function_name);
+    std::vector<string> equiv_func_names;
+    TF_RETURN_IF_ERROR(lib_info_->GetEquivalentImplementations(
+        function_name, &equiv_func_names));
+    for (const auto& func_name : equiv_func_names) {
+      const auto& func_api_info = lib_info_->GetApiInfo(func_name);
+      if (func_api_info->preferred_device() == parsed_name.type) {
+        VLOG(2) << "Swapping: " << function_name << " TO: " << func_name;
+        TF_RETURN_IF_ERROR(UpdateNodeDef(node_def, func_name, *func_api_info));
+        break;
+      }
     }
   }
+
   if (lib_info_->GetApiInfo(node_def->op()) != nullptr) {
-    string best_function_name;
-    lib_info_->GetBestImplementation(node_def->op(), parsed_name.type,
-                                     &best_function_name);
-    if (node_def->op() != best_function_name) {
-      node_def->set_op(best_function_name);
+    std::vector<string> equiv_func_names;
+    TF_RETURN_IF_ERROR(lib_info_->GetEquivalentImplementations(
+        node_def->op(), &equiv_func_names));
+    for (const string& func_name : equiv_func_names) {
+      const auto func_api_info = lib_info_->GetApiInfo(func_name);
+      if (func_api_info->preferred_device() == parsed_name.type) {
+        node_def->set_op(func_name);
+        break;
+      }
     }
   }
   return Status::OK();
@@ -93,6 +172,11 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
 
 Status ExperimentalImplementationSelector::SelectImplementation(
     GraphDef* graph) const {
+  if (!graph->has_library()) {
+    VLOG(2) << "Skipping graph since it does not have function def";
+    return Status::OK();
+  }
+
   for (int k = 0; k < graph->node_size(); ++k)
     TF_RETURN_IF_ERROR(MaybeOptimizeFunctionCall(graph->mutable_node(k)));
 
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc b/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
index 3f1ebefac68a1e9b86acea0ddb9dd1c6a638ac6e..e330835e9bc4fea33928e376a3fd98ebe34a74ee 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
+++ b/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
@@ -127,12 +127,107 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
                                  test::AsScalar<float>(4.0f));
 
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   const auto twice_boosted_tensor = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(twice_boosted_tensor[0],
                                  test::AsScalar<float>(2.0f));
 }
 
+TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+  // boost_1 returns the doubled input and a const as the internal state, the
+  // state will be feed to gradient function to mimic the behavior of backward
+  // function of defun that use internal states as extra inputs.
+  FunctionDef boost_1 = FDH::Create(
+      "Boost1", {"x:float"}, {"z:float", "s:float"}, {},
+      {{{"boost"}, "Add", {"x", "x"}, {{"T", DT_FLOAT}}},
+       FDH::Const("one", 1.0f)},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "boost:z:0"}, {"s", "one:output:0"}});
+  auto* boost_1_attr = boost_1.mutable_attr();
+  (*boost_1_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_1_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_attr)["backward_function_name"].set_s("BoostCpuGradient");
+
+  FunctionDef boost_1_gradient = FDH::Create(
+      "Boost1Gradient", {"x:float", "s:float"}, {"dx:float"}, {},
+      {FDH::Const("two", 2.0f),
+       {{"grad"}, "Mul", {"x", "two:output:0"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"dx", "grad:z:0"}});
+  auto* boost_1_grad_attr = boost_1_gradient.mutable_attr();
+  (*boost_1_grad_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_1_grad_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_grad_attr)["forward_function_name"].set_s("BoostCpu");
+
+  // boost_2 return the input * 4, and with two extra internal states.
+  FunctionDef boost_2_func = FDH::Create(
+      "Boost2", {"x:float"}, {"z:float", "s1:float", "s2:float"}, {},
+      {FDH::Const("four", 4.0f),
+       {{"boost"}, "Mul", {"x", "four:output:0"}, {{"T", DT_FLOAT}}},
+       FDH::Const("one", 1.0f),
+       FDH::Const("two", 2.0f)},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "boost:z:0"}, {"s1", "one:output:0"}, {"s2", "two:output:0"}});
+  auto* boost_2_attr = boost_2_func.mutable_attr();
+  (*boost_2_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_attr)["backward_function_name"].set_s("BoostGpuGradient");
+
+  FunctionDef boost_2_gradient = FDH::Create(
+      "Boost2Gradient", {"x:float", "s1:float", "s2:float"}, {"dx:float"}, {},
+      {FDH::Const("four", 4.0f),
+       {{"grad"}, "Mul", {"x", "four:output:0"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"dx", "grad:z:0"}});
+  auto* boost_2_grad_attr = boost_2_gradient.mutable_attr();
+  (*boost_2_grad_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_2_grad_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_grad_attr)["forward_function_name"].set_s("BoostGpu");
+
+  // Define the forward function with f = boost2 function but with CPU device.
+  // Expect the grappler plugin to swap f and attributes to use the boost1.
+  const auto forward =
+      NDef("lstm/StatefulPartitionedCall", "StatefulPartitionedCall", {"input"},
+           {{"Tin", DataTypeSlice{DT_FLOAT}},
+            {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+            {"f", FDH::FunctionRef("Boost2")}},
+           CpuDevice);
+  const auto backward =
+      NDef("gradient/lstm/StatefulPartitionedCall", "StatefulPartitionedCall",
+           {"input", "lstm/StatefulPartitionedCall:1",
+            "lstm/StatefulPartitionedCall:2"},
+           {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+            {"Tout", DataTypeSlice{DT_FLOAT}},
+            {"f", FDH::FunctionRef("Boost2Gradient")}},
+           CpuDevice);
+
+  ExperimentalImplementationSelector optimizer;
+  GraphDef output;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("input", "Placeholder", {}, {{"dtype", DT_FLOAT}}, CpuDevice),
+       forward, backward,
+       NDef("output", "Identity", {"lstm/StatefulPartitionedCall:0"},
+            {{"T", DT_FLOAT}}, CpuDevice)},
+      // FunctionLib
+      {boost_1, boost_1_gradient, boost_2_func, boost_2_gradient});
+
+  const Tensor input = test::AsScalar<float>(1.0f);
+  item.fetch = {"output"};
+  item.feed.emplace_back("input", input);
+
+  const auto four_times_boosted_tensor = EvaluateFetchNodes(item);
+  test::ExpectTensorEqual<float>(four_times_boosted_tensor[0],
+                                 test::AsScalar<float>(4.0f));
+
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  const auto twice_boosted_tensor = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(twice_boosted_tensor[0],
+                                 test::AsScalar<float>(2.0f));
+}
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.cc b/tensorflow/core/grappler/optimizers/function_api_info.cc
index 798e0f6fd55930f437d7a95d1886eb14e07946b5..497ad6032ea80b22e5b5e2b23b2860b7c99fc57b 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info.cc
@@ -27,6 +27,7 @@ FunctionApiInfo::FunctionApiInfo() {}
 FunctionApiInfo::~FunctionApiInfo() {}
 
 Status FunctionApiInfo::Init(const FunctionDef& function_def) {
+  function_type_ = FunctionApiInfo::FunctionType::INFERENCE;
   for (const auto& attr : function_def.attr()) {
     if (attr.first == "experimental_api_preferred_device") {
       preferred_device_ = attr.second.s();
@@ -34,7 +35,25 @@ Status FunctionApiInfo::Init(const FunctionDef& function_def) {
     if (attr.first == "experimental_api_implements") {
       interface_name_ = attr.second.s();
     }
+    if (attr.first == "forward_function_name") {
+      function_type_ = FunctionApiInfo::FunctionType::BACKWARD;
+      pairing_function_name_ = attr.second.s();
+    }
+    if (attr.first == "backward_function_name") {
+      function_type_ = FunctionApiInfo::FunctionType::FORWARD;
+      pairing_function_name_ = attr.second.s();
+    }
+  }
+
+  input_arg_dtypes_.reserve(function_def.signature().input_arg_size());
+  for (const auto& input_arg : function_def.signature().input_arg()) {
+    input_arg_dtypes_.emplace_back(input_arg.type());
   }
+  output_arg_dtypes_.reserve(function_def.signature().output_arg_size());
+  for (const auto& output_arg : function_def.signature().output_arg()) {
+    output_arg_dtypes_.emplace_back(output_arg.type());
+  }
+
   if (interface_name_.empty() && !preferred_device_.empty()) {
     return errors::InvalidArgument(
         "Function '", function_def.signature().name(),
@@ -51,53 +70,94 @@ const string& FunctionApiInfo::interface_name() const {
   return interface_name_;
 }
 
+const FunctionApiInfo::FunctionType FunctionApiInfo::function_type() const {
+  return function_type_;
+}
+
+const string& FunctionApiInfo::pairing_function_name() const {
+  return pairing_function_name_;
+}
+
+const DataTypeVector& FunctionApiInfo::input_arg_dtypes() const {
+  return input_arg_dtypes_;
+}
+
+const DataTypeVector& FunctionApiInfo::output_arg_dtypes() const {
+  return output_arg_dtypes_;
+}
+
 FunctionLibraryApiInfo::FunctionLibraryApiInfo() {}
 FunctionLibraryApiInfo::~FunctionLibraryApiInfo() {}
 
 namespace {
-bool IsSameSignature(const FunctionDef& f1, const FunctionDef& f2) {
-  if (f1.ret().size() != f2.ret().size()) return false;
+bool IsSameArgDef(const OpDef::ArgDef& arg1, const OpDef::ArgDef& arg2) {
+  if (arg1.type() != arg2.type()) return false;
+  if (arg1.type_attr() != arg2.type_attr()) return false;
+  if (arg1.number_attr() != arg2.number_attr()) return false;
+  if (arg1.type_list_attr() != arg2.type_list_attr()) return false;
+  if (arg1.is_ref() != arg2.is_ref()) return false;
+  return true;
+}
+
+bool IsSameSignature(const FunctionDef& f1, const FunctionDef& f2,
+                     const bool check_inputs, const bool check_outputs) {
   const auto& sig1 = f1.signature();
   const auto& sig2 = f2.signature();
   // Functions have positional semantics, so we don't check for names.
-  if (sig1.input_arg_size() != sig2.input_arg_size()) return false;
-  for (int k = 0; k < sig1.input_arg_size(); ++k) {
-    const OpDef::ArgDef& arg1 = sig1.input_arg(k);
-    const OpDef::ArgDef& arg2 = sig2.input_arg(k);
-    if (arg1.type() != arg2.type()) return false;
-    if (arg1.type_attr() != arg2.type_attr()) return false;
-    if (arg1.number_attr() != arg2.number_attr()) return false;
-    if (arg1.type_list_attr() != arg2.type_list_attr()) return false;
-    if (arg1.is_ref() != arg2.is_ref()) return false;
+  if (check_inputs) {
+    if (sig1.input_arg_size() != sig2.input_arg_size()) return false;
+    for (int k = 0; k < sig1.input_arg_size(); ++k) {
+      if (!IsSameArgDef(sig1.input_arg(k), sig2.input_arg(k))) return false;
+    }
+  }
+  if (check_outputs) {
+    if (f1.ret().size() != f2.ret().size()) return false;
+    if (sig1.output_arg_size() != sig2.output_arg_size()) return false;
+    for (int k = 0; k < sig1.output_arg_size(); ++k) {
+      if (!IsSameArgDef(sig1.output_arg(k), sig2.output_arg(k))) return false;
+    }
   }
   return true;
 }
 
 Status ValidateSignature(const string& interface_name,
-                         const std::vector<const FunctionDef*>& equiv_funcs) {
+                         const std::vector<const FunctionDef*>& equiv_funcs,
+                         const FunctionApiInfo::FunctionType function_type) {
   if (equiv_funcs.size() < 2) return Status::OK();
   for (size_t k = 1; k < equiv_funcs.size(); ++k) {
-    if (!IsSameSignature(*equiv_funcs[0], *equiv_funcs[k]))
+    const bool check_input =
+        (function_type == FunctionApiInfo::FunctionType::INFERENCE ||
+         function_type == FunctionApiInfo::FunctionType::FORWARD);
+    const bool check_output =
+        (function_type == FunctionApiInfo::FunctionType::INFERENCE ||
+         function_type == FunctionApiInfo::FunctionType::BACKWARD);
+    if (!IsSameSignature(*equiv_funcs[0], *equiv_funcs[k], check_input,
+                         check_output)) {
       return errors::InvalidArgument(
           "Functions '", equiv_funcs[0]->signature().name(), "' and '",
           equiv_funcs[k]->signature().name(), "' both implement '",
           interface_name, "' but their signatures do not match.");
+    }
   }
   return Status::OK();
 }
 
 Status ValidateSignatures(
     const std::unordered_map<string, std::vector<const FunctionDef*>>&
-        intf_to_func) {
+        intf_to_func,
+    const FunctionApiInfo::FunctionType function_type) {
   for (const auto& item : intf_to_func)
-    TF_RETURN_IF_ERROR(ValidateSignature(item.first, item.second));
+    TF_RETURN_IF_ERROR(
+        ValidateSignature(item.first, item.second, function_type));
   return Status::OK();
 }
 }  // namespace
 
 Status FunctionLibraryApiInfo::Init(
     const FunctionDefLibrary& function_library) {
-  std::unordered_map<string, std::vector<const FunctionDef*>> intf_to_func;
+  std::unordered_map<string, std::vector<const FunctionDef*>> infer_funcs;
+  std::unordered_map<string, std::vector<const FunctionDef*>> fwd_funcs;
+  std::unordered_map<string, std::vector<const FunctionDef*>> bwd_funcs;
   for (const auto& function : function_library.function()) {
     std::unique_ptr<FunctionApiInfo> func_info(new FunctionApiInfo);
     TF_RETURN_IF_ERROR(func_info->Init(function));
@@ -106,54 +166,64 @@ Status FunctionLibraryApiInfo::Init(
 
     const string& function_name = function.signature().name();
     const string& interface_name = func_info->interface_name();
-    func_to_intf_[function_name] = interface_name;
-    intf_to_funcs_[interface_name].emplace_back(function_name);
-    intf_to_func[interface_name].emplace_back(&function);
+    VLOG(3) << "Got " << func_info->function_type()
+            << " function: " << function_name
+            << " with interface: " << interface_name;
+    switch (func_info->function_type()) {
+      case FunctionApiInfo::FunctionType::INFERENCE:
+        intf_to_inference_funcs_[interface_name].emplace_back(function_name);
+        infer_funcs[interface_name].emplace_back(&function);
+        break;
+      case FunctionApiInfo::FunctionType::FORWARD:
+        intf_to_forward_funcs_[interface_name].emplace_back(function_name);
+        fwd_funcs[interface_name].emplace_back(&function);
+        break;
+      case FunctionApiInfo::FunctionType::BACKWARD:
+        intf_to_backward_funcs_[interface_name].emplace_back(function_name);
+        bwd_funcs[interface_name].emplace_back(&function);
+        break;
+      default:
+        return errors::InvalidArgument("Unrecognized function type: ",
+                                       func_info->function_type());
+    }
     func_info_[function_name] = std::move(func_info);
   }
-  TF_RETURN_IF_ERROR(ValidateSignatures(intf_to_func));
+  TF_RETURN_IF_ERROR(ValidateSignatures(
+      infer_funcs, FunctionApiInfo::FunctionType::INFERENCE));
+  TF_RETURN_IF_ERROR(
+      ValidateSignatures(fwd_funcs, FunctionApiInfo::FunctionType::FORWARD));
+  TF_RETURN_IF_ERROR(
+      ValidateSignatures(bwd_funcs, FunctionApiInfo::FunctionType::BACKWARD));
   return Status::OK();
 }
 
-void FunctionLibraryApiInfo::GetEquivalentImplementations(
-    const string& function_name, std::vector<string>* other_names) const {
-  const auto intf_it = func_to_intf_.find(function_name);
-  // The function does not implement any interface.
-  if (intf_it == func_to_intf_.end()) return;
-  CHECK(!intf_it->second.empty()) << "Function " << function_name
-                                  << "should at least implement 1 interface.";
-  const auto it = intf_to_funcs_.find(intf_it->second);
-  CHECK(it != intf_to_funcs_.end())
-      << "Function " << function_name << " maps to " << intf_it->second
-      << " but no reverse mapping was found";
-  CHECK_GE(it->second.size(), 1) << "Class " << it->first << " is empty";
-  other_names->reserve(it->second.size() - 1);
-  for (const auto& other_name : it->second) {
-    if (other_name == function_name) continue;
-    other_names->emplace_back(other_name);
+Status FunctionLibraryApiInfo::GetEquivalentImplementations(
+    const string& function_name, std::vector<string>* other_functions) const {
+  const auto func_it = func_info_.find(function_name);
+  if (func_it == func_info_.end()) return Status::OK();
+  const FunctionApiInfo* func_info = func_it->second.get();
+
+  absl::flat_hash_map<string, std::vector<string>>::const_iterator it;
+  switch (func_info->function_type()) {
+    case FunctionApiInfo::FunctionType::INFERENCE:
+      it = intf_to_inference_funcs_.find(func_info->interface_name());
+      break;
+    case FunctionApiInfo::FunctionType::FORWARD:
+      it = intf_to_forward_funcs_.find(func_info->interface_name());
+      break;
+    case FunctionApiInfo::FunctionType::BACKWARD:
+      it = intf_to_backward_funcs_.find(func_info->interface_name());
+      break;
+    default:
+      return errors::InvalidArgument("Unrecognized function type: ",
+                                     func_info->function_type());
   }
-}
 
-void FunctionLibraryApiInfo::GetBestImplementation(
-    const string& function_name, const string& device,
-    string* best_func_name) const {
-  CHECK(best_func_name != nullptr);
-  const auto func_it = func_to_intf_.find(function_name);
-  if (func_it == func_to_intf_.end()) return;
-
-  const auto it = intf_to_funcs_.find(func_it->second);
-  // No function found for the given interface.
-  if (it == intf_to_funcs_.end()) return;
   for (const auto& func_name : it->second) {
-    const auto func_api_info = func_info_.find(func_name)->second.get();
-    if (func_api_info->preferred_device() == device) {
-      best_func_name->assign(func_name);
-      return;
-    }
+    if (func_name == function_name) continue;
+    other_functions->emplace_back(func_name);
   }
-  // Didn't find a function with the match device name, choose the first one
-  // among all the available functions.
-  best_func_name->assign(it->second.front());
+  return Status::OK();
 }
 
 const FunctionApiInfo* FunctionLibraryApiInfo::GetApiInfo(
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.h b/tensorflow/core/grappler/optimizers/function_api_info.h
index 412687c58c15460a05b2e697afb1f84454462da8..9a5f548951f0931e98fbe4074f7bbd9aacab0c6e 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.h
+++ b/tensorflow/core/grappler/optimizers/function_api_info.h
@@ -20,7 +20,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -30,14 +33,32 @@ class FunctionApiInfo {
   FunctionApiInfo();
   virtual ~FunctionApiInfo();
 
+  enum FunctionType {
+    INFERENCE,  // Default type.
+    FORWARD,
+    BACKWARD,
+  };
+
   Status Init(const FunctionDef& function_def);
 
   const string& interface_name() const;
   const string& preferred_device() const;
+  const FunctionType function_type() const;
+  const string& pairing_function_name() const;
+  const DataTypeVector& input_arg_dtypes() const;
+  const DataTypeVector& output_arg_dtypes() const;
 
  private:
   string interface_name_;
   string preferred_device_;
+  FunctionType function_type_;
+  // The pairing function is used to pair between forward and backward function,
+  // which will be useful during function swapping. Inference function won't
+  // have pairing function.
+  string pairing_function_name_;
+  // The following two attributes are useful for forward and backward functions.
+  DataTypeVector input_arg_dtypes_;
+  DataTypeVector output_arg_dtypes_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionApiInfo);
 };
@@ -55,21 +76,22 @@ class FunctionLibraryApiInfo {
   // Populate the internal field for the functions within the function_library.
   Status Init(const FunctionDefLibrary& function_library);
 
-  void GetEquivalentImplementations(const string& function_name,
-                                    std::vector<string>* other_names) const;
-
-  void GetBestImplementation(const string& function_name, const string& device,
-                             string* best_func_name) const;
+  Status GetEquivalentImplementations(
+      const string& function_name, std::vector<string>* other_functions) const;
 
   const FunctionApiInfo* GetApiInfo(const string& function_name) const;
 
  private:
   // Map between function name to function details.
   std::unordered_map<string, std::unique_ptr<FunctionApiInfo>> func_info_;
-  // Map between function name to interface name.
-  std::unordered_map<string, string> func_to_intf_;
+
   // Map between interface name to function names.
-  std::unordered_map<string, std::vector<string>> intf_to_funcs_;
+  // Forward/backward function pair usually have different signatures between
+  // each other since forward function could produce extra internal state as
+  // output, and backward will take those extra state as inputs.
+  absl::flat_hash_map<string, std::vector<string>> intf_to_inference_funcs_;
+  absl::flat_hash_map<string, std::vector<string>> intf_to_forward_funcs_;
+  absl::flat_hash_map<string, std::vector<string>> intf_to_backward_funcs_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionLibraryApiInfo);
 };
diff --git a/tensorflow/core/grappler/optimizers/function_api_info_test.cc b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
index 582890d3e3bb807552039de4a3ff5e8c6e393ca5..b683d26b32f04759b658e9e0704f1b6b661fe178 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
@@ -36,28 +36,35 @@ void SetArg(const string& name, const string& type_name,
 
 typedef std::pair<string, string> ArgSpec;  // name, type.
 
-void SetArgs(const std::vector<ArgSpec>& args_spec, OpDef* sig) {
-  for (const auto& arg_spec : args_spec)
+void SetArgs(const std::vector<ArgSpec>& input_args_spec,
+             const std::vector<ArgSpec>& output_args_spec, OpDef* sig) {
+  for (const auto& arg_spec : input_args_spec)
     SetArg(arg_spec.first, arg_spec.second, sig->add_input_arg());
-  SetArg("output", "float32", sig->add_output_arg());
+  for (const auto& arg_spec : output_args_spec)
+    SetArg(arg_spec.first, arg_spec.second, sig->add_output_arg());
 }
 
 void PopulateFunction(const string& name, const string& api_interface_name,
                       const string& preferred_device,
                       const std::vector<ArgSpec>& input_args,
+                      const std::vector<ArgSpec>& output_args,
+                      const string& forward_function_name,
+                      const string& backward_function_name,
                       FunctionDef* func_def) {
   OpDef* sig = func_def->mutable_signature();
   sig->set_name(name);
 
-  SetArgs(input_args, sig);
-
-  if (!api_interface_name.empty() || !preferred_device.empty()) {
-    auto* func_attr = func_def->mutable_attr();
-    if (!api_interface_name.empty())
-      (*func_attr)["experimental_api_implements"].set_s(api_interface_name);
-    if (!preferred_device.empty())
-      (*func_attr)["experimental_api_preferred_device"].set_s(preferred_device);
-  }
+  SetArgs(input_args, output_args, sig);
+
+  auto* func_attr = func_def->mutable_attr();
+  if (!api_interface_name.empty())
+    (*func_attr)["experimental_api_implements"].set_s(api_interface_name);
+  if (!preferred_device.empty())
+    (*func_attr)["experimental_api_preferred_device"].set_s(preferred_device);
+  if (!forward_function_name.empty())
+    (*func_attr)["forward_function_name"].set_s(forward_function_name);
+  if (!backward_function_name.empty())
+    (*func_attr)["backward_function_name"].set_s(backward_function_name);
 }
 
 void PopulateSampleLibrary(const bool mismatch_args,
@@ -65,39 +72,50 @@ void PopulateSampleLibrary(const bool mismatch_args,
   const std::vector<ArgSpec> func_args{{"in1", "float32"}, {"in2", "int32"}};
   const std::vector<ArgSpec> func_wrong_args{{"in1", "int32"},
                                              {"in2", "int32"}};
-  PopulateFunction("DoStuffCpu", "DoStuff", "CPU", func_args,
-                   func_lib->add_function());
+  const std::vector<ArgSpec> output_args{{"out", "float32"}};
+  PopulateFunction("DoStuffCpu", "DoStuff", "CPU", func_args, output_args, "",
+                   "", func_lib->add_function());
   PopulateFunction("DoStuffGpu", "DoStuff", "GPU",
-                   mismatch_args ? func_wrong_args : func_args,
+                   mismatch_args ? func_wrong_args : func_args, output_args, "",
+                   "", func_lib->add_function());
+  PopulateFunction("DoThings", "DoThings", "", func_args, output_args, "", "",
                    func_lib->add_function());
-  PopulateFunction("DoThings", "DoThings", "", func_args,
+  PopulateFunction("OneOff", "", "", func_args, output_args, "", "",
                    func_lib->add_function());
-  PopulateFunction("OneOff", "", "", func_args, func_lib->add_function());
-  PopulateFunction("AnotherOneOff", "", "", func_args,
+  PopulateFunction("AnotherOneOff", "", "", func_args, output_args, "", "",
                    func_lib->add_function());
 }
 
+void PopulateComplexLibrary(FunctionDefLibrary* func_lib) {
+  const std::vector<ArgSpec> input_args{{"in1", "float32"}, {"in2", "int32"}};
+  const std::vector<ArgSpec> output_args{{"out", "float32"}};
+  const std::vector<ArgSpec> output_with_state{
+      {"out", "float32"}, {"state1", "int32"}, {"state2", "int32"}};
+
+  PopulateFunction("DoStuffCpu", "DoStuff", "CPU", input_args, output_args, "",
+                   "DoStuffCpu_gradient", func_lib->add_function());
+  PopulateFunction("DoStuffCpu_gradient", "DoStuff", "CPU", output_args,
+                   input_args, "DoStuffCpu", "", func_lib->add_function());
+  PopulateFunction("DoStuffGpu", "DoStuff", "GPU", input_args,
+                   output_with_state, "", "DoStuffGpu_gradient",
+                   func_lib->add_function());
+  PopulateFunction("DoStuffGpu_gradient", "DoStuff", "GPU", output_with_state,
+                   input_args, "DoStuffGpu", "", func_lib->add_function());
+}
+
 bool CheckEquivImpl(const FunctionLibraryApiInfo& lib_api_info,
                     const string& func_name,
                     const std::vector<string>& expected_other) {
   std::vector<string> other_impl;
-  lib_api_info.GetEquivalentImplementations(func_name, &other_impl);
+  Status status =
+      lib_api_info.GetEquivalentImplementations(func_name, &other_impl);
+  EXPECT_EQ(status, Status::OK());
   const std::unordered_set<string> actual(other_impl.begin(), other_impl.end());
   const std::unordered_set<string> expected(expected_other.begin(),
                                             expected_other.end());
   return actual == expected;
 }
 
-bool CheckGetBestImpl(const FunctionLibraryApiInfo& lib_api_info,
-                      const string& function_name, const string& device,
-                      const string& expected_function_name) {
-  string best_function_name;
-  lib_api_info.GetBestImplementation(function_name, device,
-                                     &best_function_name);
-
-  return best_function_name == expected_function_name;
-}
-
 string GetInterfaceName(const FunctionLibraryApiInfo& lib_api_info,
                         const string& func_name) {
   auto* info = lib_api_info.GetApiInfo(func_name);
@@ -117,34 +135,46 @@ TEST(FunctionApiInfoTest, ParseTags) {
   PopulateSampleLibrary(/* mismatch_args */ false, &func_lib);
   FunctionLibraryApiInfo lib_api_info;
   TF_ASSERT_OK(lib_api_info.Init(func_lib));
+
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffGpu"));
+  EXPECT_EQ("DoThings", GetInterfaceName(lib_api_info, "DoThings"));
+
+  EXPECT_EQ("CPU", GetPreferredDevice(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("GPU", GetPreferredDevice(lib_api_info, "DoStuffGpu"));
+  EXPECT_EQ("", GetPreferredDevice(lib_api_info, "DoThings"));
+
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffCpu", {"DoStuffGpu"}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffGpu", {"DoStuffCpu"}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "Undefined", {}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "OneOff", {}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "AnotherOneOff", {}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoThings", {}));
+}
+
+TEST(FunctionApiInfoTest, ComplexFunctionLib) {
+  FunctionDefLibrary func_lib;
+  PopulateComplexLibrary(&func_lib);
+  FunctionLibraryApiInfo lib_api_info;
+  TF_ASSERT_OK(lib_api_info.Init(func_lib));
 
   EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffCpu_gradient"));
   EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffGpu"));
-  EXPECT_EQ("DoThings", GetInterfaceName(lib_api_info, "DoThings"));
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffGpu_gradient"));
 
   EXPECT_EQ("CPU", GetPreferredDevice(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("CPU", GetPreferredDevice(lib_api_info, "DoStuffCpu_gradient"));
   EXPECT_EQ("GPU", GetPreferredDevice(lib_api_info, "DoStuffGpu"));
-  EXPECT_EQ("", GetPreferredDevice(lib_api_info, "DoThings"));
+  EXPECT_EQ("GPU", GetPreferredDevice(lib_api_info, "DoStuffGpu_gradient"));
 
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffCpu", "CPU", "DoStuffCpu"));
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffCpu", "GPU", "DoStuffGpu"));
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffGpu", "CPU", "DoStuffCpu"));
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffGpu", "GPU", "DoStuffGpu"));
-
-  EXPECT_TRUE(CheckGetBestImpl(lib_api_info, "DoThings", "GPU", "DoThings"));
-  // TPU impl is not available, choose the first one available which is the CPU.
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffGpu", "TPU", "DoStuffCpu"));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffCpu", {"DoStuffGpu"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffGpu", {"DoStuffCpu"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffCpu_gradient",
+                             {"DoStuffGpu_gradient"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffGpu_gradient",
+                             {"DoStuffCpu_gradient"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "Undefined", {}));
 }
 
 TEST(FunctionApiInfoTest, MismatchedArguments) {
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 56364f00950b99020ac2a2cbd0651b12179cd6b9..8beebb90496005dea556ec90de24072a6e6fd9b6 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -16,9 +16,17 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
 
 #include <unordered_map>
+#include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_replace.h"
+#include "absl/strings/substitute.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function.h"
@@ -29,7 +37,9 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
@@ -39,9 +49,20 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+// WARNING: Code in this file implicitly assumes that function input and output
+// arguments are plain tensors (tensor lists are not supported). Function inputs
+// and outputs are always expanded to a single placeholder or output tensor.
+// With this assumption, the calling node's input/output ports always match
+// function input/output arguments.
+//
+// This is guaranteed by the implementation of MakeGrapplerFunctionItem.
+
 // Mark functions that were created as a result of function specialization.
 constexpr char kGrapplerSpecializedFuncAttr[] = "_GrapplerSpecializedFunc";
 
+// Name of the attribute that defines the function for indirect function calls.
+constexpr char kFuncAttrName[] = "f";
+
 constexpr char kNoInlineAttr[] = "_noinline";
 
 bool AttrIsTrue(const FunctionDef& func, const string& attr) {
@@ -56,37 +77,101 @@ bool MarkedNoInline(const FunctionDef& func) {
   return AttrIsTrue(func, kNoInlineAttr);
 }
 
-// Find unique name for the specialized function. Collision can happen if
-// specialized function is instantiated for the nodes with the same name (e.g.
-// inside function body of two different functions).
-string UniqueSpecializedFunctionName(const FunctionDef& func,
-                                     const NodeDef& func_node,
-                                     const FunctionLibraryDefinition& flib) {
-  using str_util::StringReplace;
-  using strings::StrCat;
+// There are two ways of calling a Tensorflow function:
+//
+// 1. Direct function call: node.op() is the name of the function.
+//
+// 2. Indirect function call: the function name is passed through a node
+//    attribute, and special Tensorflow kernels are responsible for calling the
+//    function through the FunctionLibraryRuntime. Example: PartitionedCallOp.
+
+// Check if func_node.op() matches the name in FunctionDef signature.
+bool IsDirectFunctionCall(const FunctionDef& func, const NodeDef& func_node) {
+  return func_node.op() == func.signature().name();
+}
 
-  string specialized_name = StrCat(func.signature().name(), "_specialized_for_",
-                                   StringReplace(func_node.name(), "/", "_",
-                                                 /*replace_all*/ true));
-  string unique_name = specialized_name;
+// Check if func_node has function attribute with a function name matching
+// FunctionDef signature.
+bool IsIndirectFunctionCall(const FunctionDef& func, const NodeDef& func_node) {
+  auto* func_attr = AttrSlice(func_node).Find(kFuncAttrName);
+  return func_attr != nullptr && func_attr->has_func() &&
+         func_attr->func().name() == func.signature().name();
+}
 
-  int idx = 0;
-  while (flib.Find(unique_name)) {
-    unique_name = strings::StrCat(specialized_name, "_", ++idx);
+AttrSlice FunctionInstantiationAttributes(const FunctionDef& func,
+                                          const NodeDef& func_node) {
+  if (IsDirectFunctionCall(func, func_node)) {
+    return AttrSlice(func_node);
+
+  } else if (IsIndirectFunctionCall(func, func_node)) {
+    auto* func_attr = AttrSlice(func_node).Find(kFuncAttrName);
+    return AttrSlice(&func_attr->func().attr());
+
+  } else {
+    LOG(WARNING) << "Can't resolve function instantiation attributes: "
+                 << SummarizeNodeDef(func_node);
+    return AttrSlice();
   }
-  return unique_name;
 }
 
+// This is a fake device that should not be used for any op kernel execution,
+// the only purpose of this device is to be passed as a part of DeviceSet to the
+// Placer.
+class FakeDevice : public Device {
+ public:
+  FakeDevice(Env* env, const string& device) : Device(env, attr(device)) {}
+  explicit FakeDevice(const string& device) : FakeDevice(nullptr, device) {}
+  Status Sync() override { return Status::OK(); }
+
+ private:
+  static DeviceAttributes attr(const string& device) {
+    DeviceNameUtils::ParsedName parsed_name;
+    bool parsed = DeviceNameUtils::ParseFullName(device, &parsed_name);
+    DCHECK(parsed) << "Failed to parse full device name: " << device;
+
+    DeviceAttributes attr;
+    attr.set_name(device);
+    attr.set_device_type(parsed_name.type);
+    return attr;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// Function specialization.
+//
+// FunctionDef is somewhat similar to function template in C++, given all the
+// type parameters (and attribute values) it generates a statically defined
+// graph from the type parametrized "graph template" (function body).
+//
+// Function specialization instantiates a parametrized FunctionDef into a
+// statically defined graph, and then converts it back to the fully defined
+// FunctionDef (it doesn't have any unknown type parameters or attribute
+// values, known as placeholders).
+//
+// Given the fully specified graph we can apply all the Grappler optimizers to
+// it (see details in MetaOptimizer). Also we can push known constant inputs
+// into the function body, and remove unused outputs/inputs.
+
 // Specialized function instantiation type parameters, body parameters, and
 // const inputs.
 struct FunctionSpecializationSignature {
+  // Currently we do not support functions with tensor lists as inputs or
+  // outputs, so caller node input/output ports always match function
+  // input/output arguments.
+  using InputPort = int;
+  using OutputPort = int;
+
   string func_name;
+  bool is_in_fetch_set;
+  gtl::FlatSet<OutputPort> active_outputs;
   std::unordered_map<string, DataType> type_parameters;
   std::unordered_map<string, AttrValue> body_parameters;
-  std::unordered_map<int, string> const_inputs;
+  std::unordered_map<InputPort, string> const_inputs;
 
   bool operator==(const FunctionSpecializationSignature& other) const {
     bool equals = func_name == other.func_name &&
+                  is_in_fetch_set == other.is_in_fetch_set &&
+                  active_outputs == other.active_outputs &&
                   type_parameters == other.type_parameters &&
                   const_inputs == other.const_inputs;
 
@@ -104,11 +189,21 @@ struct FunctionSpecializationSignature {
     return true;
   }
 
+  // TODO(ezhulenev): Migrate to AbslHashValue.
+  // TODO(ezhulenev): Optimize performance by computing hashes of unordered
+  // values first, and then compute a hash of sorted hashes.
   struct Hash {
     uint64 operator()(FunctionSpecializationSignature const& s) const {
       uint64 h = Hash64(s.func_name);
+      h = Hash64Combine(std::hash<bool>()(s.is_in_fetch_set), h);
+
+      // Use std::set/std::map for deterministic iteration order.
 
-      // Use std::map for deterministic iteration order.
+      std::set<OutputPort> active_outputs(s.active_outputs.begin(),
+                                          s.active_outputs.end());
+      for (const auto& active_output : active_outputs) {
+        h = Hash64Combine(std::hash<int>()(active_output), h);
+      }
 
       std::map<string, DataType> types(s.type_parameters.begin(),
                                        s.type_parameters.end());
@@ -126,8 +221,8 @@ struct FunctionSpecializationSignature {
         h = Hash64Combine(FastAttrValueHash(pair.second), h);
       }
 
-      std::map<int, string> inputs(s.const_inputs.begin(),
-                                   s.const_inputs.end());
+      std::map<InputPort, string> inputs(s.const_inputs.begin(),
+                                         s.const_inputs.end());
       for (const auto& pair : inputs) {
         h = Hash64Combine(std::hash<int>()(pair.first), h);
         h = Hash64Combine(Hash64(pair.second), h);
@@ -140,24 +235,41 @@ struct FunctionSpecializationSignature {
 
 struct FunctionSpecialization {
   string specialized_func_name;
-  std::unordered_set<string> const_inputs;
-  std::unordered_set<string> control_deps;
-};
-
-class FakeCPUDevice : public Device {
- public:
-  FakeCPUDevice(Env* env, const DeviceAttributes& attr) : Device(env, attr) {}
-  Status Sync() override { return Status::OK(); }
+  // True if the function caller node is in GrapplerItem fetch set.
+  bool is_in_fetch_set;
+  // Names of the tensors that were pushed down into the function body.
+  gtl::FlatSet<string> const_inputs;
+  // Control dependencies of pushed down const inputs have to be attached to
+  // function caller node.
+  gtl::FlatSet<string> control_deps;
+  // Output tensors (ports) that consumed by other nodes in the graph or in a
+  // GrapplerItem fetch set.
+  gtl::FlatSet<int> active_outputs;
+  // Mapping from original function output port to the output port of
+  // specialized function. If function specialization changes the number of
+  // function outputs it's required to update all node consumers.
+  std::vector<std::pair<int, int>> output_mapping;
 };
 
 class FunctionOptimizerContext {
  public:
   explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
                                     const GrapplerItem& item)
-      : graph_version_(item.graph.versions().producer()),
-        function_library_(OpRegistry::Global(), item.graph.library()) {
+      : grappler_item_id_(item.id),
+        graph_version_(item.graph.versions().producer()),
+        opt_level_(opt_level),
+        allowed_optimizations_(item.allowed_optimizations()),
+        function_library_(OpRegistry::Global(), item.graph.library()),
+        available_device_names_(item.devices().begin(), item.devices().end()),
+        graph_view_(&item.graph) {
     InitializeTrulyConstNodes(item);
-    InitializeInlinedFunctions(opt_level, item);
+    InitializeFetchNodes(item);
+  }
+
+  const RewriterConfig::Toggle opt_level() const { return opt_level_; }
+
+  const GrapplerItem::AllowedOptimizations& allowed_optimizations() const {
+    return allowed_optimizations_;
   }
 
   const FunctionLibraryDefinition& function_library() const {
@@ -173,8 +285,35 @@ class FunctionOptimizerContext {
     return flr_;
   }
 
-  bool IsInlinedFunction(const string& name) const {
-    return inlined_functions_.count(name) > 0;
+  const gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>&
+  tensor_mapping() const {
+    return tensor_mapping_;
+  }
+
+  const gtl::FlatMap<string, std::vector<string>>& control_overrides() const {
+    return control_overrides_;
+  }
+
+  const GraphView& graph_view() const { return graph_view_; }
+
+  const string& grappler_item_id() const { return grappler_item_id_; }
+
+  const gtl::FlatSet<string>& fetch_tensors() const { return fetch_tensors_; }
+
+  const DeviceSet* devices() const {
+    // Create fake devices lazily only if we need a DeviceSet.
+    if (available_devices_.empty() && !available_device_names_.empty()) {
+      for (const string& name : available_device_names_) {
+        auto device = absl::make_unique<FakeDevice>(name);
+        available_device_set_.AddDevice(device.get());
+        available_devices_.push_back(std::move(device));
+      }
+    }
+    return &available_device_set_;
+  }
+
+  bool IsFetchNode(const string& node_name) const {
+    return fetch_nodes_.find(node_name) != fetch_nodes_.end();
   }
 
   bool IsTrulyConst(const string& name) const {
@@ -185,11 +324,6 @@ class FunctionOptimizerContext {
     return gtl::FindWithDefault(truly_const_nodes_, name, nullptr);
   }
 
-  // Find inlining candidate by name. Return nullptr if not found.
-  const FunctionDef* FindInlinedFunction(const string& name) const {
-    return gtl::FindWithDefault(inlined_functions_, name, nullptr);
-  }
-
   const FunctionSpecialization* FindFunctionSpecialization(
       const FunctionSpecializationSignature& sig) const {
     return gtl::FindOrNull(specialized_functions_, sig);
@@ -200,9 +334,38 @@ class FunctionOptimizerContext {
     specialized_functions_.emplace(sig, specialized_func);
   }
 
+  void AddTensorMapping(const SafeTensorId& from, const SafeTensorId& to) {
+    auto inserted = tensor_mapping_.insert({from, to});
+    DCHECK(inserted.second)
+        << "Failed to insert duplicated tensor mapping: "
+        << "from=" << from.ToString() << " to=" << to.ToString();
+  }
+
+  void AddTensorMapping(const string& func_node,
+                        const FunctionSpecialization& specialized_func) {
+    for (const auto& pair : specialized_func.output_mapping) {
+      int from_idx = pair.first;
+      int to_idx = pair.second;
+      if (from_idx != to_idx) {
+        SafeTensorId from_tensor(func_node, from_idx);
+        SafeTensorId to_tensor(func_node, to_idx);
+        auto inserted = tensor_mapping_.insert({from_tensor, to_tensor});
+        DCHECK(inserted.second);
+      }
+    }
+  }
+
+  void AddControlOverrides(const NodeDef& func_node,
+                           const std::vector<string>& control_overrides) {
+    control_overrides_[func_node.name()].reserve(control_overrides.size());
+    for (const string& control_override : control_overrides) {
+      control_overrides_[func_node.name()].push_back(control_override);
+    }
+  }
+
  private:
   void InitializeTrulyConstNodes(const GrapplerItem& item) {
-    std::unordered_set<string> feed_nodes;
+    gtl::FlatSet<string> feed_nodes;
     for (const auto& feed : item.feed) {
       feed_nodes.insert(NodeName(feed.first));
     }
@@ -214,44 +377,32 @@ class FunctionOptimizerContext {
     }
   }
 
-  void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level,
-                                  const GrapplerItem& item) {
-    bool aggressive = opt_level == RewriterConfig::AGGRESSIVE;
-
-    for (const FunctionDef& func : item.graph.library().function()) {
-      // Can't create IdentityN nodes with no input or output: skip these
-      // functions for now.
-      if (func.signature().input_arg_size() == 0 ||
-          func.signature().output_arg_size() == 0) {
-        continue;
-      }
-      bool marked_noinline = MarkedNoInline(func);
-      bool marked_specialized = MarkedSpecialized(func);
-
-      if (!marked_specialized && (!marked_noinline || aggressive)) {
-        inlined_functions_[func.signature().name()] = &func;
-      }
+  void InitializeFetchNodes(const GrapplerItem& item) {
+    for (const string& fetch : item.fetch) {
+      fetch_tensors_.insert(fetch);
+      fetch_nodes_.insert(NodeName(fetch));
     }
   }
 
   void InitializeFunctionLibraryRuntime() {
     if (!flr_) {
       Env* env = Env::Default();
-      DeviceAttributes attr;
-      attr.set_name("/device:CPU:0");
-      attr.set_device_type("CPU");
-      Device* device = new FakeCPUDevice(env, attr);
-      device_mgr_.reset(new DeviceMgr({device}));
+      std::vector<std::unique_ptr<Device>> devices;
+      devices.push_back(absl::make_unique<FakeDevice>(env, "/device:CPU:0"));
+      device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
       OptimizerOptions optimizer_opts;
       optimizer_opts.set_do_function_inlining(true);
       process_flr_.reset(new ProcessFunctionLibraryRuntime(
           device_mgr_.get(), env, graph_version_, &function_library_,
           optimizer_opts));
-      flr_ = process_flr_->GetFLR(device->name());
+      flr_ = process_flr_->GetFLR(device_mgr_->ListDevices()[0]->name());
     }
   }
 
+  const string grappler_item_id_;
   const int graph_version_;
+  const RewriterConfig::Toggle opt_level_;
+  const GrapplerItem::AllowedOptimizations allowed_optimizations_;
   FunctionLibraryDefinition function_library_;
 
   // These fields initialized lazily only if needed.
@@ -259,8 +410,16 @@ class FunctionOptimizerContext {
   std::unique_ptr<ProcessFunctionLibraryRuntime> process_flr_;
   FunctionLibraryRuntime* flr_ = nullptr;
 
-  // Functions that can be inlined into optimized graph.
-  std::unordered_map<string, const FunctionDef*> inlined_functions_;
+  // Fully defined names of the devices available to the GrapplerItem.
+  const gtl::FlatSet<string> available_device_names_;
+
+  // List of available `FakedDevices` (lazily initialized, see devices()).
+  mutable std::vector<std::unique_ptr<Device>> available_devices_;
+
+  // DeviceSet of fake devices (`FakeDevice`) constructed from
+  // available_devices_ (lazily initialized).
+  mutable DeviceSet available_device_set_;
+
   // Nodes that are Const and not in feed.
   std::unordered_map<string, const NodeDef*> truly_const_nodes_;
   // Specialized functions.
@@ -269,104 +428,114 @@ class FunctionOptimizerContext {
                      FunctionSpecializationSignature::Hash>
       specialized_functions_;
 
+  // GrapplerItem.fetch is a vector of tensors.
+  gtl::FlatSet<string> fetch_tensors_;  // format: node_name:port
+  gtl::FlatSet<string> fetch_nodes_;    // format: node_name
+
+  // After function inlining and specialization, the optimized graph might be in
+  // invalid state, nodes can read from non-existing function call nodes that
+  // were inlined, or they can read from output index that is no longer valid
+  // after unused outputs pruning.
+  //
+  // Tensor mapping that has to be applied to the graph after all functions
+  // optimizations (invalidated tensor id -> optimized graph tensor id).
+  gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
+      tensor_mapping_;
+
+  // When we inline a function into the optimized graph, we no longer have the
+  // function call node to anchor control dependencies. Instead we must expand
+  // each function call control output edge into multiple control dependencies
+  // to all side-effectful ops inside the function body.
+  //
+  // Invalidated function call node name -> Inlined side-effectful nodes
+  gtl::FlatMap<string, std::vector<string>> control_overrides_;
+
+  // Use graph view to find active outputs of the function caller nodes.
+  GraphView graph_view_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
 };
 
-bool HasTrulyConstInputs(const NodeDef& node,
-                         const FunctionOptimizerContext& ctx) {
-  const auto is_truly_const = [&ctx](const string& input) {
-    return ctx.IsTrulyConst(NodeName(input));
-  };
-  return std::any_of(node.input().begin(), node.input().end(), is_truly_const);
-}
+// Returns a pointer to the called function definition iff the given node is
+// indeed a function call. Otherwise returns nullptr.
+const FunctionDef* FindFunctionCall(const FunctionOptimizerContext& ctx,
+                                    const NodeDef& node) {
+  // Check if a node does indirect function call via PartitionedCallOp.
+  if (IsPartitionedCall(node) || IsStatefulPartitionedCall(node)) {
+    const AttrValue* func_attr = AttrSlice(node).Find("f");
+    return (func_attr != nullptr && func_attr->has_func())
+               ? ctx.function_library().Find(func_attr->func().name())
+               : nullptr;
+  }
 
-// Return trimmed FunctionDefLibrary with functions that are reachable from
-// the optimized graph.
-FunctionDefLibrary TrimFunctionLibrary(const FunctionLibraryDefinition& flib,
-                                       const GraphDef& optimized_graph) {
-  // Functions that are reachable from the optimized graph.
-  std::unordered_set<string> keep_funcs;
-
-  std::vector<const FunctionDef*> func_queue;
-  func_queue.reserve(flib.num_functions());
-
-  // Add registered and not already processed functions to the queue by name.
-  const auto add_to_func_queue = [&](const string& func_name) {
-    const FunctionDef* func = flib.Find(func_name);
-    if (func && keep_funcs.find(func_name) == keep_funcs.end()) {
-      func_queue.push_back(func);
-    }
-  };
+  // Check if the function op itself is a function name.
+  return ctx.function_library().Find(node.op());
+}
 
-  // Find all the functions that are reachable from the given node.
-  const auto add_node_to_func_queue = [&](const NodeDef& node) {
-    // Node itself can be a call to the function.
-    add_to_func_queue(node.op());
+gtl::FlatSet<int> GetActiveOutputs(const NodeDef& node,
+                                   const FunctionOptimizerContext& ctx,
+                                   int size_hint = 0) {
+  gtl::FlatSet<int> active_outputs;
+  active_outputs.reserve(static_cast<size_t>(size_hint));
+
+  // 1. Output can be consumed by the other graph node.
+  const auto node_fanout_edges =
+      ctx.graph_view().GetFanoutEdges(node, /*include_controlled_edges=*/false);
+  for (const GraphView::Edge& edge : node_fanout_edges) {
+    active_outputs.insert(edge.src.port_id);
+  }
 
-    // Or node can have an attribute referencing a function.
-    for (const auto& attr : node.attr()) {
-      const auto& attr_value = attr.second;
+  // 2. Or it can be in a fetch set.
+  for (const string& fetch_tensor : ctx.fetch_tensors()) {
+    int port = NodePositionIfSameNode(fetch_tensor, node.name());
+    if (port >= 0) active_outputs.insert(port);
+  }
 
-      // 1. AttrValue.func
-      if (attr_value.has_func()) {
-        add_to_func_queue(attr_value.func().name());
-      }
+  return active_outputs;
+}
 
-      // 2. AttrValue.ListValue.func
-      if (attr_value.has_list()) {
-        for (const auto& func : attr_value.list().func()) {
-          add_to_func_queue(func.name());
-        }
-      }
-    }
+bool HasTrulyConstInputs(const NodeDef& node,
+                         const FunctionOptimizerContext& ctx) {
+  const auto is_truly_const = [&ctx](const string& input) {
+    return ctx.IsTrulyConst(NodeName(input));
   };
+  return absl::c_any_of(node.input(), is_truly_const);
+}
 
-  // Add all functions that are directly called from the optimized graph.
-  const auto& graph_nodes = optimized_graph.node();
-  std::for_each(graph_nodes.begin(), graph_nodes.end(), add_node_to_func_queue);
-
-  // Process all reachable functions.
-  while (!func_queue.empty()) {
-    const FunctionDef* func = func_queue.back();
-    func_queue.pop_back();
-
-    const string& func_name = func->signature().name();
-    keep_funcs.insert(func_name);
-
-    // Find all the functions called from the function body.
-    const auto& func_body = func->node_def();
-    std::for_each(func_body.begin(), func_body.end(), add_node_to_func_queue);
+bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func,
+                      const FunctionOptimizerContext& ctx) {
+  // Functions with tensor list outputs are not supported right now, so the
+  // number of output args is the same as number of possible function caller
+  // node outputs.
+  int num_outputs = func.signature().output_arg_size();
+  const gtl::FlatSet<int> active_outputs =
+      GetActiveOutputs(func_node, ctx, /*size_hind*/ num_outputs);
 
-    // Check if the function has a registered gradient.
-    const string grad_func_name = flib.FindGradient(func_name);
-    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
-  }
+  return active_outputs.size() != num_outputs;
+}
 
-  FunctionDefLibrary lib;
-  for (const string& func_name : keep_funcs) {
-    const FunctionDef* func = CHECK_NOTNULL(flib.Find(func_name));
-    *lib.add_function() = *func;
+// Return pruned FunctionDefLibrary with functions that are reachable from
+// the optimized graph.
+FunctionDefLibrary PruneFunctionLibrary(const FunctionLibraryDefinition& flib,
+                                        const GraphDef& optimized_graph) {
+  FunctionLibraryDefinition pruned_flib =
+      ReachableFunctionLibraryDefinition(flib, optimized_graph);
 
-    const string grad_func_name = flib.FindGradient(func_name);
-    if (!grad_func_name.empty()) {
-      GradientDef* gd = lib.add_gradient();
-      gd->set_function_name(func_name);
-      gd->set_gradient_func(grad_func_name);
-    }
-  }
+  int pruned_functions = static_cast<int>(pruned_flib.num_functions()) -
+                         static_cast<int>(flib.num_functions());
 
-  VLOG(3) << "Trimmed function library: " << keep_funcs.size() << " functions ("
-          << static_cast<int>(keep_funcs.size() - flib.num_functions()) << ")";
+  VLOG(3) << "Pruned function library: " << pruned_flib.num_functions()
+          << " functions (" << pruned_functions << ")";
 
-  return lib;
+  return pruned_flib.ToProto();
 }
 
 // Push all constant inputs of an instantiating node into the function body.
 Status PushDownConstInputs(const NodeDef& func_node,
                            const FunctionOptimizerContext& ctx,
                            GrapplerFunctionItem* item,
-                           std::unordered_set<string>* const_inputs,
-                           std::unordered_set<string>* control_deps) {
+                           gtl::FlatSet<string>* const_inputs,
+                           gtl::FlatSet<string>* control_deps) {
   // Record node control dependencies in the control_deps set.
   const auto record_control_deps = [&](const NodeDef* const_input) {
     for (int i = const_input->input_size() - 1; i >= 0; --i) {
@@ -397,32 +566,32 @@ Status PushDownConstInputs(const NodeDef& func_node,
 
 // Remove inputs that were pushed into the function body, and attach their
 // control dependencies to the function caller node.
-void RemovePushedDownConstInputs(const std::unordered_set<string>& const_inputs,
-                                 const std::unordered_set<string>& control_deps,
+void RemovePushedDownConstInputs(const FunctionSpecialization& specialization,
                                  NodeDef* specialized_func_node) {
   // Nothing to do if it was no const inputs to the function node.
-  if (const_inputs.empty()) return;
+  if (specialization.const_inputs.empty()) return;
 
   // Keep only non-const inputs.
   std::vector<string> keep_inputs;
   const auto& inputs = specialized_func_node->input();
   std::copy_if(inputs.begin(), inputs.end(), std::back_inserter(keep_inputs),
                [&](const string& input) {
-                 return const_inputs.find(input) == const_inputs.end();
+                 return specialization.const_inputs.find(input) ==
+                        specialization.const_inputs.end();
                });
 
   specialized_func_node->clear_input();
   for (const auto& keep : keep_inputs) specialized_func_node->add_input(keep);
 
   // Attach control dependencies of pushed down const input to the caller node.
-  if (!control_deps.empty()) {
-    std::unordered_set<string> existing_control_deps;
+  if (!specialization.control_deps.empty()) {
+    gtl::FlatSet<string> existing_control_deps;
 
     for (const string& input : keep_inputs) {
       existing_control_deps.insert(AsControlDependency(NodeName(input)));
     }
 
-    for (const string& ctrl : control_deps) {
+    for (const string& ctrl : specialization.control_deps) {
       if (existing_control_deps.find(ctrl) == existing_control_deps.end()) {
         VLOG(3) << "Forward control dependency: input=" << ctrl;
         specialized_func_node->add_input(ctrl);
@@ -431,19 +600,136 @@ void RemovePushedDownConstInputs(const std::unordered_set<string>& const_inputs,
   }
 }
 
+// Remove Tin type parameters for pushed down const inputs.
+void RemovePushedDownConstInputTypes(
+    const FunctionSpecialization& specialization, const NodeDef& func_node,
+    NodeDef* specialized_func_node) {
+  // Nothing to do if it was no const inputs to the function node.
+  if (specialization.const_inputs.empty()) return;
+
+  // Make sure that original function caller has Tin attribute.
+  const AttrValue* tin = AttrSlice(func_node).Find("Tin");
+  if (tin == nullptr || !tin->has_list()) return;
+
+  // Clear input types for the specialized node.
+  auto* attr = specialized_func_node->mutable_attr();
+  (*attr)["Tin"].mutable_list()->clear_type();
+
+  // Keep types of non-const inputs.
+  for (int i = 0; i < func_node.input_size(); ++i) {
+    const string& input = func_node.input(i);
+    if (IsControlInput(input)) break;
+
+    if (specialization.const_inputs.find(input) ==
+        specialization.const_inputs.end()) {
+      DataType dt = tin->list().type(i);
+      (*attr)["Tin"].mutable_list()->add_type(dt);
+    }
+  }
+}
+
+// Remove Tout type parameters for pruned function outputs.
+void RemoveUnusedOutputsTypes(const FunctionSpecialization& specialization,
+                              const NodeDef& func_node,
+                              NodeDef* specialized_func_node) {
+  // Make sure that original function caller has Tout attribute.
+  const AttrValue* tout = AttrSlice(func_node).Find("Tout");
+  if (tout == nullptr || !tout->has_list()) return;
+
+  // Nothing to do if all outputs are active.
+  if (specialization.active_outputs.size() == tout->list().type_size()) return;
+
+  // Clear input types for the specialized node.
+  auto* attr = specialized_func_node->mutable_attr();
+  (*attr)["Tout"].mutable_list()->clear_type();
+
+  // Keep output types of active outputs only.
+  for (int i = 0; i < tout->list().type_size(); ++i) {
+    if (specialization.active_outputs.find(i) !=
+        specialization.active_outputs.end()) {
+      DataType dt = tout->list().type(i);
+      (*attr)["Tout"].mutable_list()->add_type(dt);
+    }
+  }
+}
+
+Status UpdateSpecializedFunctionCallSite(const FunctionDef& func,
+                                         const NodeDef& func_node,
+                                         const string& specialized_func_name,
+                                         NodeDef* specialized_func_node) {
+  if (IsDirectFunctionCall(func, func_node)) {
+    specialized_func_node->set_op(specialized_func_name);
+
+  } else if (IsIndirectFunctionCall(func, func_node)) {
+    auto* attr = specialized_func_node->mutable_attr();
+    (*attr)[kFuncAttrName].mutable_func()->set_name(specialized_func_name);
+
+  } else {
+    return errors::InvalidArgument("Unknown function call site");
+  }
+
+  return Status::OK();
+}
+
+// Update a graph node created from the original function caller node, to the
+// function specialization. Function specialization might change the number of
+// inputs and outputs, so we have to make sure that graph node is updated
+// accordingly.
+Status UpdateSpecializedFunctionNode(
+    const FunctionDef& func, const NodeDef& func_node,
+    const FunctionSpecialization& specialization,
+    NodeDef* specialized_func_node) {
+  // Function called indirectly via custom kernel (e.g. PartitionedCallOp).
+  bool is_indirect_call = IsIndirectFunctionCall(func, func_node);
+
+  // 1. Call the specialized function instead of original one.
+  TF_RETURN_IF_ERROR(UpdateSpecializedFunctionCallSite(
+      func, func_node, specialization.specialized_func_name,
+      specialized_func_node));
+
+  // 2. Remove inputs corresponding to the pushed down consts.
+  RemovePushedDownConstInputs(specialization, specialized_func_node);
+
+  // NOTE: PartitionedCallOp has `Tin` and `Tout` attributes for input/output
+  // types, that must be in sync with updated function signature.
+
+  // 3. Update input types for the indirect function calls.
+  if (is_indirect_call) {
+    RemovePushedDownConstInputTypes(specialization, func_node,
+                                    specialized_func_node);
+  }
+
+  // 4. Update output types for the indirect function call. It's unsafe to
+  // change the number of outputs for the fetch nodes, so we just skip them.
+  if (is_indirect_call && !specialization.is_in_fetch_set) {
+    RemoveUnusedOutputsTypes(specialization, func_node, specialized_func_node);
+  }
+
+  // 5. Remove custom gradient annotation.
+  specialized_func_node->mutable_attr()->erase("_gradient_op_type");
+
+  return Status::OK();
+}
+
 Status InitializeFunctionSpecializationSignature(
     const NodeDef& func_node, const FunctionDef& func,
-    const AttrValueMap& func_attr, const FunctionOptimizerContext& ctx,
-    FunctionSpecializationSignature* sig) {
+    const AttrSlice& func_instantiation_attr,
+    const FunctionOptimizerContext& ctx, FunctionSpecializationSignature* sig) {
+  DCHECK(sig->const_inputs.empty());
+  DCHECK(sig->active_outputs.empty());
+
   sig->func_name = func.signature().name();
+  sig->is_in_fetch_set = ctx.IsFetchNode(func_node.name());
+  sig->active_outputs = GetActiveOutputs(func_node, ctx);
 
-  TF_RETURN_IF_ERROR(
-      InstantiationTypeParameters(func, func_attr, &sig->type_parameters));
-  TF_RETURN_IF_ERROR(
-      InstantiationBodyParameters(func, func_attr, &sig->body_parameters));
+  TF_RETURN_IF_ERROR(InstantiationTypeParameters(func, func_instantiation_attr,
+                                                 &sig->type_parameters));
+  TF_RETURN_IF_ERROR(InstantiationBodyParameters(func, func_instantiation_attr,
+                                                 &sig->body_parameters));
 
   for (int i = 0; i < func_node.input_size(); ++i) {
     const string& input = func_node.input(i);
+    if (IsControlInput(input)) break;
     if (ctx.IsTrulyConst(input)) {
       sig->const_inputs.emplace(i, input);
     }
@@ -452,19 +738,32 @@ Status InitializeFunctionSpecializationSignature(
   return Status::OK();
 }
 
+// Create a name for the function specialization. The name of the function, name
+// of the node instantiating it, and a Grappler item id should generate unique
+// function name. Meta optimizer might create multiple Grappler items for the
+// same graph when optimizing functions, but it's guaranteed that they all will
+// have unique ids.
+string SpecializedFunctionName(const FunctionOptimizerContext& ctx,
+                               const FunctionDef& func,
+                               const NodeDef& func_node) {
+  return absl::Substitute("$0_specialized_for_$1_at_$2",
+                          func.signature().name(),
+                          absl::StrReplaceAll(func_node.name(), {{"/", "_"}}),
+                          ctx.grappler_item_id());
+}
+
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
                           const int graph_def_version,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
-  VLOG(2) << "Specialize function instantiation: "
-          << SummarizeNodeDef(func_node);
+  VLOG(2) << "Specialize function call: " << SummarizeNodeDef(func_node);
 
-  const std::unordered_map<string, AttrValue> func_attr(
-      func_node.attr().begin(), func_node.attr().end());
+  const AttrSlice func_instantiation_attr =
+      FunctionInstantiationAttributes(func, func_node);
 
   FunctionSpecializationSignature signature;
   TF_RETURN_IF_ERROR(InitializeFunctionSpecializationSignature(
-      func_node, func, func_attr, *ctx, &signature));
+      func_node, func, func_instantiation_attr, *ctx, &signature));
 
   // Check if function was already specialized for identical context.
   const FunctionSpecialization* already_specialized =
@@ -478,11 +777,11 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
     // Add a function call node for the specialized function.
     NodeDef* specialized_func_node = optimized_graph->add_node();
     *specialized_func_node = func_node;
-    specialized_func_node->set_op(already_specialized->specialized_func_name);
 
-    RemovePushedDownConstInputs(already_specialized->const_inputs,
-                                already_specialized->control_deps,
-                                specialized_func_node);
+    TF_RETURN_IF_ERROR(UpdateSpecializedFunctionNode(
+        func, func_node, *already_specialized, specialized_func_node));
+
+    ctx->AddTensorMapping(specialized_func_node->name(), *already_specialized);
 
     return Status::OK();
   }
@@ -493,23 +792,36 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // Make a GrapplerFunctionItem and convert it back to FunctionDef after
   // pushing all constant inputs into the function body.
   GrapplerFunctionItem item;
-  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                              graph_def_version, &item));
+  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_instantiation_attr,
+                                              flib, graph_def_version, &item));
 
   // Push const inputs into the function body, and keep track of their control
   // dependencies.
-  std::unordered_set<string> const_inputs;
-  std::unordered_set<string> control_deps;
+  gtl::FlatSet<string> const_inputs;
+  gtl::FlatSet<string> control_deps;
   TF_RETURN_IF_ERROR(PushDownConstInputs(func_node, *ctx, &item, &const_inputs,
                                          &control_deps));
 
+  // Remove function outputs that do not have any consumers. We can't safely
+  // update outputs for the fetch nodes, so we just skip them.
+  std::vector<std::pair<int, int>> output_mapping;
+  if (!signature.is_in_fetch_set) {
+    TF_RETURN_IF_ERROR(
+        RemoveUnusedOutputs(signature.active_outputs, &item, &output_mapping));
+  }
+
   // TODO(ezhulenev): Push down known input shapes.
   FunctionDef specialized_func;
   TF_RETURN_IF_ERROR(MakeFunctionDef(item, flib, &specialized_func));
 
   // Find a name for specialized function.
   const string specialized_func_name =
-      UniqueSpecializedFunctionName(func, func_node, flib);
+      SpecializedFunctionName(*ctx, func, func_node);
+  if (flib.Contains(specialized_func_name)) {
+    // NOTE(ezhulenev): This should never happen. If it happens, it's a sign of
+    // a serious internal error, that must be investigated.
+    return errors::Internal("Created duplicate function specialization");
+  }
 
   specialized_func.mutable_signature()->set_name(specialized_func_name);
   auto* specialized_attr = specialized_func.mutable_attr();
@@ -522,14 +834,107 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // Add a function call node for the specialized function.
   NodeDef* specialized_func_node = optimized_graph->add_node();
   *specialized_func_node = func_node;
-  specialized_func_node->set_op(specialized_func_name);
 
-  // Update specialized node to remove inputs for pushed down consts.
-  RemovePushedDownConstInputs(const_inputs, control_deps,
-                              specialized_func_node);
+  FunctionSpecialization func_specialization = {
+      specialized_func_name, signature.is_in_fetch_set, const_inputs,
+      control_deps,          signature.active_outputs,  output_mapping};
 
-  ctx->AddSpecializedFunction(
-      signature, {specialized_func_name, const_inputs, control_deps});
+  TF_RETURN_IF_ERROR(UpdateSpecializedFunctionNode(
+      func, func_node, func_specialization, specialized_func_node));
+
+  ctx->AddSpecializedFunction(signature, func_specialization);
+  ctx->AddTensorMapping(specialized_func_node->name(), func_specialization);
+
+  return Status::OK();
+}
+
+// -------------------------------------------------------------------------- //
+// Inline direct functions calls.
+//
+// When we inline direct function calls, we instantiate the function body from
+// its FunctionDef and caller node attributes, and embed the instantiated graph
+// into the "main graph". When we do that, we must preserve the function call
+// semantics:
+//
+// 1) All input nodes must be executed before any of function body nodes will
+//    start executing.
+// 2) All function body nodes must be executed before any of the nodes, reading
+//    outputs of the function will start executing.
+// 3) All nodes with side effects inside a function must be executed, this is
+//    different from the nodes with side effects in the main graph, that can be
+//    pruned if they are not in transitive dependency set of any of the fetch
+//    nodes.
+// 4) All nodes of the function body must be execute on the device specified by
+//    the function caller node.
+//
+// To guarantee that function call semantics are preserved after inlining, we
+// insert an IdentityN node before the inlined function body, and hook all
+// inputs into that, and we insert another IdentityN node to hook all function
+// outputs to it.
+
+// Returns `Status::OK()` iff `node` is a direct function call of `func`, and we
+// know how to inline it into the main graph, otherwise returns and error
+// indicating why the function call is not inlinable.
+Status IsInlinableDirectFunctionCall(const FunctionOptimizerContext& ctx,
+                                     const FunctionDef& func,
+                                     const NodeDef& func_node) {
+  // Indirect function calls (PartitionedCallOp) have automatic control
+  // dependencies and inlined separately from direct function calls.
+  if (!IsDirectFunctionCall(func, func_node)) {
+    return errors::InvalidArgument("Unsupported function call type: ",
+                                   SummarizeNodeDef(func_node));
+  }
+
+  // For direct function  calls we insert IdentityN nodes before/after inlined
+  // function body to preserve function call semantics (all inputs evaluated
+  // before function evaluation starts, and all function body nodes finished
+  // before output consumed by other nodes).
+  if (func.signature().input_arg_size() == 0) {
+    return errors::FailedPrecondition(
+        "Can't inline direct function call with empty inputs: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // TODO(ezhulenev): Relax constraint on output args?
+  if (func.signature().output_arg_size() == 0) {
+    return errors::FailedPrecondition(
+        "Can't inline direct function call with empty outputs: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // Function must execute all the nodes in a function body that might have side
+  // effects. After inlining these nodes into the main graph, we can no longer
+  // guarantee that. For now we disable inlining functions with side effects.
+  //
+  // Attaching control dependency to the output IdentityN node is not safe,
+  // because it might be split or pruned in a later optimization pass.
+  //
+  // Indirect function calls (via PartitionedCallOp) have automatic dependency
+  // tracking, and allow us to safely inline functions with side effects.
+  bool has_side_effects =
+      absl::c_any_of(func.node_def(), [&ctx](const NodeDef& node) {
+        return !IsFreeOfSideEffect(node, &ctx.function_library());
+      });
+  if (has_side_effects) {
+    return errors::FailedPrecondition(
+        "Can't inline function with side-effects in the function body: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // We ignore `_noinline` marker in aggressive mode.
+  bool aggressive = ctx.opt_level() == RewriterConfig::AGGRESSIVE;
+  if (MarkedNoInline(func) && !aggressive) {
+    return errors::FailedPrecondition(
+        "Can't inline function marked with '_noinline': ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // Function specialization and inlining must be mutually exclusive.
+  if (MarkedSpecialized(func)) {
+    return errors::FailedPrecondition(
+        "Can't inline function created in Grappler function specialization: ",
+        SummarizeNodeDef(func_node));
+  }
 
   return Status::OK();
 }
@@ -576,17 +981,21 @@ NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
   return outputs;
 }
 
-Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
-                      const FunctionOptimizerContext& ctx,
-                      const int graph_def_version, GraphDef* optimized_graph) {
-  VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node);
+Status InlineDirectFunctionCall(const NodeDef& func_node,
+                                const FunctionDef& func,
+                                const int graph_def_version,
+                                const FunctionOptimizerContext& ctx,
+                                GraphDef* optimized_graph) {
+  VLOG(2) << "Inline direct function call: " << SummarizeNodeDef(func_node);
+  TF_RETURN_IF_ERROR(IsInlinableDirectFunctionCall(ctx, func, func_node));
 
-  const std::unordered_map<string, AttrValue> func_attr(
-      func_node.attr().begin(), func_node.attr().end());
+  const AttrSlice func_instantiation_attr =
+      FunctionInstantiationAttributes(func, func_node);
 
   GrapplerFunctionItem item;
-  Status item_status = MakeGrapplerFunctionItem(
-      func, func_attr, ctx.function_library(), graph_def_version, &item);
+  Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
+                                                ctx.function_library(),
+                                                graph_def_version, &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -641,21 +1050,35 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
     // Make sure the node is placed.
     func_body_node.set_device(func_node.device());
 
-    // Check if a body node is itself a function.
-    const FunctionDef* func_body_node_func =
-        ctx.FindInlinedFunction(func_body_node.op());
-    if (func_body_node_func != nullptr) {
-      // Recursively inline function calls.
-      TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
-                                        ctx, graph_def_version,
-                                        optimized_graph));
-    } else {
+    // Move the function body node to the optimized graph.
+    const auto move_node_to_optimized_graph = [&]() {
       // Annotate the node with the function attributes.
       for (const auto& attr : func.attr()) {
         func_body_node.mutable_attr()->insert(attr);
       }
       // Move the node to the main graph.
       optimized_graph->add_node()->Swap(&func_body_node);
+    };
+
+    // Check if a body node is itself a function call and can be inlined.
+    const FunctionDef* func_body_node_func =
+        FindFunctionCall(ctx, func_body_node);
+
+    if (func_body_node_func != nullptr) {
+      Status inlinable = IsInlinableDirectFunctionCall(
+          ctx, *func_body_node_func, func_body_node);
+      if (inlinable.ok()) {
+        TF_RETURN_IF_ERROR(
+            InlineDirectFunctionCall(func_body_node, *func_body_node_func,
+                                     graph_def_version, ctx, optimized_graph));
+      } else {
+        VLOG(2) << "Can't inline nested direct function call: "
+                << inlinable.error_message();
+        move_node_to_optimized_graph();
+      }
+
+    } else {
+      move_node_to_optimized_graph();
     }
   }
 
@@ -762,15 +1185,339 @@ Status InlineSymbolicGradient(const NodeDef& node,
   return Status::OK();
 }
 
+// -------------------------------------------------------------------------- //
+// Inline indirect functions calls (aka PartitionedCallOp).
+//
+// When we inline indirect function calls, we instantiate the function body from
+// its FunctionDef and caller node attributes, and embed the instantiated graph
+// into the "main graph".
+//
+// In contrast to direct function calls, `PartitionedCallOp` has automatic
+// dependency tracking via input/output control edges, and we relax some of the
+// constraints that we have for direct function call inlining.
+//
+// "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data type)
+// input argument it "captures" the mutable resource.  This is implemented by
+// automatically adding a incoming control edge from the previous side-effectful
+// op touching that resource, and an outgoing control edge to the next
+// side-effectful op using the same resource. This serializes the mutations of
+// the resource to make graph execution deterministic.
+//
+// Function call inlining must preserve side effect visibility:
+//
+// 1) All side effects to the captured resources, that happened before function
+//    call must be visible to the function body nodes using that resources.
+// 2) All side effects to the captured resources, that happened inside function
+//    body, must be visible to every op/function using that resource after the
+//    function call completed.
+
+// To guarantee that these properties are preserved after inlining we do:
+//
+// 1) Forward all input control dependencies from the function call node to the
+//    inlined function inputs (Identity nodes).
+// 2) Each side-effectful op inside function body adds itself as a control
+//    dependency to all the nodes in output control set of function call node.
+//
+// We do not add any other control dependencies to/from function body nodes,
+// because they are pure functions of input tensors, and can be freely
+// reordered.
+
+// Returns `Status::OK()` iff `node` is an indirect function call of `func`, and
+// we know how to inline it into the main graph, otherwise returns and error
+// indicating why the function call is not inlinable.
+Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx,
+                                       const FunctionDef& func,
+                                       const NodeDef& func_node) {
+  // We inline direct function calls above, using different rules.
+  if (!IsIndirectFunctionCall(func, func_node)) {
+    return errors::InvalidArgument("Unsupported function call type: ",
+                                   SummarizeNodeDef(func_node));
+  }
+
+  if (MarkedNoInline(func)) {
+    return errors::FailedPrecondition(
+        "Can't inline function marked with '_noinline': ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // Function specialization and inlining must be mutually exclusive.
+  if (MarkedSpecialized(func)) {
+    return errors::FailedPrecondition(
+        "Can't inline function created in Grappler function specialization: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // We can't inline functions that are in a fetch set, because it would
+  // invalidate fetch tensors (function call node fully inlined and doesn't
+  // exist in the optimized graph).
+  if (ctx.IsFetchNode(func_node.name())) {
+    return errors::FailedPrecondition(
+        "Can't inline function in a Grappler item fetch set: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // We can't inline functions with `Switch` nodes in the function body, because
+  // they might have dead tensors as a function output argument (we need all
+  // intermediate tensors to compute the function gradient). `PartitionedCallOp`
+  // invokes functions with `allow_dead_tensors = true` to reset dead flag,
+  // and return default initialized tensors instead of a dead tensors.
+  // TODO(ezhulenev): Do the liveness analysis and add
+  // `IdentitytWithResurrection` nodes after all potentially dead output
+  // tensors?
+  if (absl::c_any_of(func.node_def(), IsSwitch)) {
+    return errors::FailedPrecondition(
+        "Can't inline function with `Switch` nodes in the function body: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  return Status::OK();
+}
+
+Status InlineIndirectFunctionCall(const NodeDef& func_node,
+                                  const FunctionDef& func,
+                                  const int graph_def_version,
+                                  FunctionOptimizerContext* ctx,
+                                  GraphDef* optimized_graph) {
+  VLOG(2) << "Inline indirect function call: " << SummarizeNodeDef(func_node);
+  TF_RETURN_IF_ERROR(IsInlinableIndirectFunctionCall(*ctx, func, func_node));
+
+  const AttrSlice func_instantiation_attr =
+      FunctionInstantiationAttributes(func, func_node);
+
+  GrapplerFunctionItem item;
+  Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
+                                                ctx->function_library(),
+                                                graph_def_version, &item);
+
+  if (!item_status.ok()) {
+    return errors::InvalidArgument("Failed to inline function ", func_node.op(),
+                                   " instantiated by ", func_node.name(),
+                                   ". Error: ", item_status.error_message());
+  }
+
+  GraphView::InputPort control_input_port =
+      ctx->graph_view().GetInputPort(func_node.name(), Graph::kControlSlot);
+  GraphView::OutputPort control_output_port =
+      ctx->graph_view().GetOutputPort(func_node.name(), Graph::kControlSlot);
+
+  // Nodes that have side effects to the captured resources.
+  std::vector<string> happens_before;
+  absl::c_transform(
+      ctx->graph_view().GetFanin(control_input_port),
+      std::back_inserter(happens_before),
+      [](const GraphView::OutputPort port) { return port.node->name(); });
+
+  VLOG(3) << "Happens before set (size = " << happens_before.size()
+          << "): " << absl::StrJoin(happens_before, ", ");
+
+  // Nodes that must observe side effects to the captured resources.
+  std::vector<string> happens_after;
+  absl::c_transform(
+      ctx->graph_view().GetFanout(control_output_port),
+      std::back_inserter(happens_after),
+      [](const GraphView::InputPort port) { return port.node->name(); });
+
+  VLOG(3) << "Happens after set (size = " << happens_after.size()
+          << "): " << absl::StrJoin(happens_after, ", ");
+
+  // Regular (positional) inputs to the function call.
+  std::vector<SafeTensorId> inputs;
+  for (const string& input : func_node.input()) {
+    SafeTensorId tensor_id = ParseTensorName(input);
+    if (tensor_id.index() == Graph::kControlSlot) break;
+    inputs.push_back(tensor_id);
+  }
+
+  // If we have a node inside the function body without inputs (e.g. Const), we
+  // must attach a control dependency to it, to make sure that if a function
+  // call happens inside a loop, the node will be evaluated in correct frame.
+  //
+  // If the function call node has no inputs and no control dependencies, it
+  // means that it can't be a function call inside a loop, and we can safely
+  // insert that node without inputs into the main graph.
+  //
+  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
+  // the function is called inside a loop.
+  std::vector<string> empty_inputs_hook;
+  if (!item.inputs().empty()) {
+    const InputArgExpansion& arg0 = item.inputs()[0];
+    DCHECK(!arg0.placeholders.empty());
+    empty_inputs_hook.push_back(AsControlDependency(AddPrefixToNodeName(
+        arg0.placeholders[0], /*prefix=*/func_node.name())));
+  } else if (!happens_before.empty()) {
+    empty_inputs_hook.push_back(AsControlDependency(happens_before[0]));
+  }
+
+  // Mapping from input placeholder name to function input position.
+  int idx = 0;
+  absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    for (const string& placeholder : input_arg.placeholders) {
+      input_placeholders_idx[placeholder] = idx++;
+    }
+  }
+
+  const string prefix = strings::StrCat(func_node.name(), "/");
+
+  // ------------------------------------------------------------------------ //
+  // First we need to assign device placements to all function body nodes.
+
+  GraphDef placed_graph_def;
+
+  const DeviceSet* devices = ctx->devices();
+
+  if (devices->devices().empty()) {
+    // If there are no devices available for placer, we just put all nodes to
+    // the same device as a function caller node. This can happen if Grappler is
+    // running "offline", without active runtime session, for example as a part
+    // of a batch job for graph analysis/optimization.
+    VLOG(3) << "Assign function call node device to all function body nodes. "
+            << "Device: " << func_node.device();
+    placed_graph_def = item.mutable_function_body();
+    for (NodeDef& node : *placed_graph_def.mutable_node()) {
+      node.set_device(func_node.device());
+    }
+  } else {
+    // If we are running in an active runtime session, Grappler will get the
+    // graph after initial placing is done, and we should have devices for the
+    // placer.
+    VLOG(3) << "Run placer for instantiated function body. Devices: ["
+            << absl::StrJoin(
+                   devices->devices(), ", ",
+                   [](string* out, const Device* d) { out->append(d->name()); })
+            << "]";
+
+    // Construct a Graph object from the instantiated function body.
+    GraphConstructorOptions opts;
+    Graph graph(ctx->function_library());
+    TF_RETURN_IF_ERROR(
+        ConvertGraphDefToGraph(opts, item.function_body(), &graph));
+
+    // Use function caller node device as a default for placer.
+    const Device* default_device =
+        devices->FindDeviceByName(func_node.device());
+
+    Placer placer(&graph, devices, nullptr, /* No session options */
+                  default_device);
+    TF_RETURN_IF_ERROR(placer.Run());
+
+    // Convert Graph back to the GraphDef.
+    graph.ToGraphDef(&placed_graph_def);
+  }
+
+  // ------------------------------------------------------------------------ //
+  // After all nodes placed we need to prepare them for inlining into the
+  // optimized graph: turn placeholders into identities, update nodes
+  // connectivity, etc...
+
+  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    if (item.IsInputPlaceholder(func_body_node.name())) {
+      // Turn input placeholders into identity node.
+      DCHECK_EQ(0, func_body_node.input_size());
+      func_body_node.set_op("Identity");
+      (*func_body_node.mutable_attr())["T"] = func_body_node.attr().at("dtype");
+      func_body_node.mutable_attr()->erase("dtype");
+      func_body_node.mutable_attr()->erase("shape");
+      int input_idx = input_placeholders_idx[func_body_node.name()];
+      func_body_node.add_input(strings::StrCat(inputs[input_idx].ToString()));
+
+      // All side effects must happen before inputs can start executing.
+      for (const string& hb_node : happens_before) {
+        func_body_node.add_input(AsControlDependency(hb_node));
+      }
+
+    } else {
+      // Update inputs of the regular function body nodes.
+      for (string& input : *func_body_node.mutable_input()) {
+        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
+      }
+      if (func_body_node.input_size() == 0 && !empty_inputs_hook.empty()) {
+        *func_body_node.add_input() = empty_inputs_hook[0];
+      }
+    }
+
+    // Add the function node name as a prefix 1) to node name to avoid
+    // collisions; 2) to frame name to avoid multiple LoopCond nodes in one
+    // frame after inlining.
+    TF_RETURN_IF_ERROR(
+        AddPrefixAndSuffixToNode(prefix, /*suffix=*/"", &func_body_node));
+
+    // After inlining into the optimized graph, NodeDef must have all attributes
+    // defined, which is not required for a node in a FunctionDef.
+    const OpDef* op_def;
+    TF_RETURN_IF_ERROR(
+        ctx->function_library().LookUpOpDef(func_body_node.op(), &op_def));
+    AddDefaultsToNodeDef(*op_def, &func_body_node);
+  }
+
+  // Construct a graph view for the preprocessed function body graph.
+  GraphView placed_graph_view(&placed_graph_def);
+
+  // Keep track of side-effectful ops inside function body. Each outgoing
+  // control edge from the function call node, must be replaced with control
+  // edges from inlined side-effectful ops.
+  std::vector<string> side_effectful_nodes;
+
+  // We have to make sure that all side-effectful nodes inside a function body
+  // will be executed after function inlining.
+  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    if (!IsFreeOfSideEffect(func_body_node, &ctx->function_library())) {
+      int num_fanouts = placed_graph_view.NumFanouts(
+          func_body_node, /*include_controlling_nodes=*/true);
+
+      // If the node doesn't have any outgoing edges and we do not have any
+      // nodes in the `happens_after` set, we can't inline a function and
+      // guarantee that side-effects will be executed. The only exception if we
+      // do function library optimization, and the GrapplerItem was constructed
+      // for the function body, because functions have strict semantics.
+
+      if (num_fanouts == 0 && happens_after.empty() &&
+          !ctx->allowed_optimizations().inline_ops_with_side_effects) {
+        return errors::Internal(
+            "Can't inline a function with a side-effectful op with empty "
+            "fanouts and empty output control edge set. Function body node: ",
+            SummarizeNodeDef(func_body_node));
+      }
+
+      side_effectful_nodes.push_back(func_body_node.name());
+    }
+  }
+
+  // Move all the nodes to the optimized graph after successful preprocessing.
+  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    optimized_graph->add_node()->Swap(&func_body_node);
+  }
+
+  // TODO(ezhulenev): Inline nested indirect function calls.
+
+  // Indirect function call is fully inlined into the optimized graph, and we do
+  // not copy the original function call node, so we have to setup tensor
+  // mapping from old output tensors, to the outputs of inlined nodes.
+  int output_idx = 0;
+  for (const OutputArgExpansion& output : item.outputs()) {
+    for (const string& output_tensor : output.output_tensors) {
+      const SafeTensorId from_tensor(func_node.name(), output_idx++);
+      const SafeTensorId to_tensor = ParseTensorName(
+          AddPrefixToNodeName(output_tensor, /*prefix=*/func_node.name()));
+      ctx->AddTensorMapping(from_tensor, to_tensor);
+    }
+  }
+
+  // After inlining we'll have to forward all control dependencies from function
+  // call node to all side-effectful ops inside function body.
+  ctx->AddControlOverrides(func_node, side_effectful_nodes);
+
+  VLOG(3) << "Successfully inlined indirect function call: "
+          << SummarizeNodeDef(func_node);
+  return Status::OK();
+}
+
 }  // namespace
 
-Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
-  VLOG(1) << "Optimize Grappler item: id=" << item.id;
-
   // Nothing to do here.
   if (item.graph.library().function_size() == 0) {
-    VLOG(3) << "Skip Grappler item with empty function library";
     *optimized_graph = item.graph;
     return Status::OK();
   }
@@ -782,8 +1529,6 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   bool specialize_func = options_.enable_function_specialization;
 
   for (const NodeDef& node : item.graph.node()) {
-    const string func_name = node.op();
-
     // Each node optimization can modify optimized graph only by adding new
     // nodes, we can check node size to make sure that graph was not modified.
     const int num_nodes_before = optimized_graph->node_size();
@@ -808,36 +1553,71 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     }                                                              \
   } while (0)
 
-    // 1. Inline symbolic gradients into the optimized graph.
-    if (func_name == "SymbolicGradient" && inline_gradients) {
-      // Inline symbolic gradients only if the corresponding function is inlined
+    // ---------------------------------------------------------------------- //
+    // 1. Inline symbolic gradients into the optimized graph.                 //
+    // ---------------------------------------------------------------------- //
+
+    if (IsSymbolicGradient(node) && inline_gradients) {
+      // Inline symbolic gradients only if the corresponding function is not
+      // marked as `_noinline`.
       const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
-      string f_name = f_attr != nullptr ? f_attr->func().name() : "";
-      if (ctx.IsInlinedFunction(f_name)) {
+      const string f_name = f_attr != nullptr ? f_attr->func().name() : "";
+      const FunctionDef* func = ctx.function_library().Find(f_name);
+      if (func && !MarkedNoInline(*func)) {
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
             InlineSymbolicGradient(node, &ctx, optimized_graph));
         continue;
       }
     }
 
-    // 2. Check if a node op is a function call.
-    const FunctionDef* func = ctx.function_library().Find(func_name);
+    // ---------------------------------------------------------------------- //
+    // 2. Inline or specialize function calls.                                //
+    // ---------------------------------------------------------------------- //
+
+    // Find if a node is a function call (direct or indirect).
+    const FunctionDef* func = FindFunctionCall(ctx, node);
+
     if (func != nullptr) {
-      // 2a. Inline it if it's allowed to do so.
-      if (inline_func && ctx.IsInlinedFunction(func_name)) {
-        // Inline function body into the optimized graph}
-        TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            InlineFunction(node, *func, ctx, item.graph.versions().producer(),
-                           optimized_graph));
-        continue;
+      const string& func_name = func->signature().name();
+      const int graph_def_version = item.graph.versions().producer();
+
+      const bool is_direct_func = IsDirectFunctionCall(*func, node);
+      const bool is_indirect_func = IsIndirectFunctionCall(*func, node);
+
+      // 2a. Inline direct function call if it's inlinable.
+      if (inline_func && is_direct_func) {
+        Status inlinable = IsInlinableDirectFunctionCall(ctx, *func, node);
+        if (inlinable.ok()) {
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineDirectFunctionCall(
+              node, *func, graph_def_version, ctx, optimized_graph));
+          continue;
+        } else {
+          VLOG(2) << inlinable.error_message();
+        }
       }
 
+      // 2b. Inline indirect function call if it's inlinable.
+      if (inline_func && is_indirect_func) {
+        Status inlinable = IsInlinableIndirectFunctionCall(ctx, *func, node);
+        if (inlinable.ok()) {
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineIndirectFunctionCall(
+              node, *func, graph_def_version, &ctx, optimized_graph));
+          continue;
+        } else {
+          VLOG(2) << inlinable.error_message();
+        }
+      }
+
+      // 2c. Specialize it to its instantiation context if can't be inlined,
+      // and it has something worth specializing.
+      bool specialization_worthy = IsParametrized(*func) ||
+                                   HasTrulyConstInputs(node, ctx) ||
+                                   HasUnusedOutputs(node, *func, ctx);
+
       // Do not specialize if function has custom gradient.
       const string grad_func = ctx.function_library().FindGradient(func_name);
 
-      // 2b. Specialize it to it's instantiation context if can't be inlined.
-      if (specialize_func && grad_func.empty() &&
-          (IsParametrized(*func) || HasTrulyConstInputs(node, ctx))) {
+      if (specialize_func && grad_func.empty() && specialization_worthy) {
         // TODO(ezhulenev): Specialize function call if input has a known shape.
         // Specialize function body for its instantiation attributes and inputs.
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
@@ -847,6 +1627,7 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       }
     }
 
+    // ---------------------------------------------------------------------- //
     // If we reached this point, node was not handled by any of the stages
     // (inline, specialize), simply add a copy to the graph.
     add_node_copy();
@@ -854,10 +1635,80 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 #undef TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED
   }
 
+  // After function specialization and inlining graph might be in invalid
+  // state, and some nodes can read tensors that do not exists anymore in the
+  // optimized graph: function call node was fully inlined into the graph, or
+  // output index was invalidated by the output pruning.
+
+  if (!ctx.tensor_mapping().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      for (int idx = 0; idx < node.input_size(); ++idx) {
+        TensorId input_tensor = ParseTensorName(node.input(idx));
+        if (input_tensor.index() == Graph::kControlSlot) break;
+
+        auto mapping = ctx.tensor_mapping().find(input_tensor);
+        if (mapping != ctx.tensor_mapping().end()) {
+          node.set_input(idx, mapping->second.ToString());
+        }
+      }
+    }
+  }
+
+  // Function inlining instantiates function body directly into the optimized
+  // graph, and we might end up with control dependencies to the nodes that no
+  // longer exist in a graph. We need to apply control overrides to all
+  // invalidated nodes, and rewire control dependencies to the inlined
+  // side-effectful function body nodes.
+
+  // TODO(ezhulenev): With nested function call inlining, single pass over
+  // `control_overrides` might not bring the graph into a valid state,
+  // continue until it converges and all invalidated control dependencies
+  // removed.
+
+  if (!ctx.control_overrides().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      // Keep track of new control inputs to the node.
+      gtl::FlatSet<string> add_ctrl_inputs;
+
+      // Remove all invalidated control inputs.
+      for (int idx = 0; idx < node.input_size(); /* see below */) {
+        // TODO(ezhulenev): Use non-allocating TensorId after migrating
+        // `control_overrides()` to absl::flat_hash_set.
+        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
+
+        auto overrides = ctx.control_overrides().find(input_tensor.node());
+        if (overrides != ctx.control_overrides().end()) {
+          // If this happens it's a bug in the function inlining.
+          if (input_tensor.index() != Graph::kControlSlot) {
+            return errors::Internal(
+                "Illegal input edge from inlined function call node");
+          }
+          // Remove control dependency to the inlined function call node.
+          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
+          node.mutable_input()->RemoveLast();
+
+          // Keep track of all overrides.
+          for (const string& override : overrides->second) {
+            add_ctrl_inputs.insert(AsControlDependency(override));
+          }
+        } else {
+          // Go to the next input only if the current one was not invalidated,
+          // otherwise we need to check the swapped input as well.
+          ++idx;
+        }
+      }
+
+      // Add overrides to the node inputs.
+      for (const string& ctrl_input : add_ctrl_inputs) {
+        node.add_input(ctrl_input);
+      }
+    }
+  }
+
   *optimized_graph->mutable_versions() = item.graph.versions();
   *optimized_graph->mutable_library() =
       options_.enable_trim_function_library
-          ? TrimFunctionLibrary(ctx.function_library(), *optimized_graph)
+          ? PruneFunctionLibrary(ctx.function_library(), *optimized_graph)
           : ctx.function_library().ToProto();
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index fab3f994c1a8bce6653653099f8187e09d0fff40..c971eec3f4dae5cc3457ad802700ee4f3086eb90 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -107,7 +108,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_SimpleFunction) {
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -183,7 +184,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_SkipErrorsIfGraphNotModified) {
   item.fetch = {"z1"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -283,7 +284,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FixedTypeFunction) {
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -367,7 +368,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithOutputMapping) {
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -417,7 +418,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithInputForwarding) {
   item.feed.emplace_back("x4", test::AsScalar<float>(-1.0f));
   item.feed.emplace_back("x3", test::AsScalar<int>(1234));
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
   test::ExpectTensorEqual<float>(tensors_expected[1], tensors[1]);
@@ -493,49 +494,49 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithNestedFunctionCall) {
 
   int count = 0;
   for (const NodeDef& node : output.node()) {
-    if (node.name() == "square/inlined_inputs" && count++) {
+    if (node.name() == "square/inlined_inputs" && ++count) {
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("a", node.input(0));
-    } else if (node.name() == "square/x" && count++) {
+    } else if (node.name() == "square/x" && ++count) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("square/inlined_inputs:0", node.input(0));
-    } else if (node.name() == "square/output/inlined_inputs" && count++) {
+    } else if (node.name() == "square/output/inlined_inputs" && ++count) {
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("square/x", node.input(0));
       EXPECT_EQ("square/x", node.input(1));
-    } else if (node.name() == "square/output/x" && count++) {
+    } else if (node.name() == "square/output/x" && ++count) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("square/output/inlined_inputs:0", node.input(0));
-    } else if (node.name() == "square/output/y" && count++) {
+    } else if (node.name() == "square/output/y" && ++count) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("square/output/inlined_inputs:1", node.input(0));
-    } else if (node.name() == "square/output/output" && count++) {
+    } else if (node.name() == "square/output/output" && ++count) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("square/output/x", node.input(0));
       EXPECT_EQ("square/output/y", node.input(1));
-    } else if (node.name() == "square/output" && count++) {
+    } else if (node.name() == "square/output" && ++count) {
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("square/output/output", node.input(0));
-    } else if (node.name() == "square" && count++) {
+    } else if (node.name() == "square" && ++count) {
       EXPECT_EQ("IdentityN", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("square/output", node.input(0));
-    } else if (node.name() == "outputs" && count++) {
+    } else if (node.name() == "outputs" && ++count) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(kDevice, node.device());
       EXPECT_EQ(1, node.input_size());
@@ -548,7 +549,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithNestedFunctionCall) {
   item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
   auto tensors_expected = EvaluateFetchNodes(item);
 
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
@@ -698,7 +699,332 @@ TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_NoInlineFunc) {
   CompareGraphs(item.graph, output);
 }
 
-TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionSimpleFunction) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Build a graph to compute c = MyMul(a, b)
+  GrapplerItem item;
+  item.fetch = {"d"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("c", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("d", "Identity", {"c"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {mul_func} /* Function library */);
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Function must be inlined and all nodes placed on a valid device.
+       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.feed.emplace_back("a", pi);
+  item.feed.emplace_back("b", pi);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors_expected = EvaluateFetchNodes(item);
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors_expected.size(), 1);
+  ASSERT_EQ(tensors.size(), tensors_expected.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithControlDependencies) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  const Tensor kTwo = test::AsScalar<float>(2.0);
+  const TensorShape scalar = TensorShape({});
+
+  // Compute `x*y` and add `1.0` to the variable.
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T", "v: resource"}, {"z:T"}, {"T: {float, double}"},
+      {{{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_FLOAT}}},
+       {{"add"},
+        "AssignAddVariableOp",
+        {"v", "one:output:0"},
+        {{"dtype", DT_FLOAT}}},
+       {{"mul"}, "Mul", {"x", "y", "^add"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Build a graph to compute:
+  //   a = Placeholder
+  //   b = Placeholder
+  //   v = VarHandleOp(init = a)
+  //   f1 = MyMul(a, b, v)
+  //   f2 = MyMul(f1, f1, v)
+  //   return [f2, v]
+  GrapplerItem item;
+  item.fetch = {"out_1", "out_2"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Initialize variable with one of the placeholders.
+       NDef("v", "VarHandleOp", {}, {{"dtype", DT_FLOAT}, {"shape", scalar}}),
+       NDef("init_v", "AssignVariableOp", {"v", "a"}, {{"dtype", DT_FLOAT}},
+            kDevice),
+
+       // Call function first time.
+       NDef("f1", "PartitionedCall", {"a", "b", "v", "^init_v"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_RESOURCE}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Call function second time.
+       NDef("f2", "PartitionedCall", {"f1", "f1", "v", "^f1"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_RESOURCE}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Return result of multiplication and a current value of the variable.
+       NDef("out_1", "Identity", {"f2"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_2", "ReadVariableOp", {"v", "^f1", "^f2"},
+            {{"dtype", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Initialize variable with one of the placeholders.
+       NDef("v", "VarHandleOp", {}, {{"dtype", DT_FLOAT}, {"shape", scalar}}),
+       NDef("init_v", "AssignVariableOp", {"v", "a"}, {{"dtype", DT_FLOAT}},
+            kDevice),
+
+       // Function body of a first function call inlined into the graph.
+       NDef("f1/x", "Identity", {"a:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/y", "Identity", {"b:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/v", "Identity", {"v:0", "^init_v"}, {{"T", DT_RESOURCE}},
+            kDevice),
+       NDef("f1/one", "Const", {"^f1/x"},
+            {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
+       NDef("f1/add", "AssignAddVariableOp", {"f1/v", "f1/one"},
+            {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("f1/mul", "Mul", {"f1/x", "f1/y", "^f1/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Function body of a second function call also inlined into the graph,
+       // and input nodes read directly from the inlined nodes of the first
+       // function call.
+       NDef("f2/x", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f2/v", "Identity", {"v:0", "^f1/add"}, {{"T", DT_RESOURCE}},
+            kDevice),
+       NDef("f2/one", "Const", {"^f2/x"},
+            {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
+       NDef("f2/add", "AssignAddVariableOp", {"f2/v", "f2/one"},
+            {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("f2/mul", "Mul", {"f2/x", "f2/y", "^f2/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Return values read directly from inlined nodes.
+       NDef("out_1", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_2", "ReadVariableOp", {"v", "^f1/add", "^f2/add"},
+            {{"dtype", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  item.feed.emplace_back("a", kOne);
+  item.feed.emplace_back("b", kTwo);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors_expected.size(), 2);
+  EXPECT_EQ(tensors_expected[0].flat<float>()(0), 4.0);  // mul
+  EXPECT_EQ(tensors_expected[1].flat<float>()(0), 3.0);  // read variable
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors.size(), 2);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorEqual<float>(tensors_expected[1], tensors[1]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithDevicePlacement) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+  // Add device placement spec to the function body node.
+  (*mul_func.mutable_node_def())[0].set_device("/device:CPU:1");
+
+  // We need fully defined device names to run the placer for inlined function.
+  const string cpu0 = "/job:work/replica:1/task:1/device:CPU:0";
+  const string cpu1 = "/job:work/replica:1/task:1/device:CPU:1";
+
+  // Build a graph to compute c = MyMul(a, b)
+  GrapplerItem item;
+  item.fetch = {"d"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
+       NDef("c", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            cpu0),
+       NDef("d", "Identity", {"c"}, {{"T", DT_FLOAT}}, cpu0)},
+      // Function library.
+      {mul_func});
+  ASSERT_TRUE(item.InferDevicesFromGraph().ok());
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
+
+       // Function must be inlined and `mul` node placed on a requested device.
+       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, cpu1),
+
+       NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, cpu0)},
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  const Tensor kTwo = test::AsScalar<float>(2.0);
+  const TensorShape scalar = TensorShape({});
+
+  // MyMul doesn't have any side-effectful nodes in the function body, but the
+  // optimized graph has a control dependency edge `f1->f2`.
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Build a graph to compute:
+  //   a = Placeholder
+  //   b = Placeholder
+  //   f1 = MyMul(a, b)
+  //   f2 = MyMul(a, b, ^f1)  <-- control dependency on inlined function!
+  //   return f2
+  GrapplerItem item;
+  item.fetch = {"out"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Call function first time.
+       NDef("f1", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Call function second time.
+       NDef("f2", "PartitionedCall", {"f1", "f1", "^f1"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Return result of f2.
+       NDef("out", "Identity", {"f2"}, {{"T", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Function body of a first function call inlined into the graph.
+       NDef("f1/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/mul", "Mul", {"f1/x", "f1/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Function body of a second function call also inlined into the graph,
+       // and input nodes read directly from the inlined nodes of the first
+       // function call, and control dependency edge removed.
+       NDef("f2/x", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/mul", "Mul", {"f2/x", "f2/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Return directly from inlined node of f2.
+       NDef("out", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  item.feed.emplace_back("a", kOne);
+  item.feed.emplace_back("b", kTwo);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors_expected.size(), 1);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, SpecializeFunctionXTimesTwo) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
@@ -710,6 +1036,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
 
   // Build a graph to compute y = XTimesTwo(x).
   GrapplerItem item;
+  item.id = "tf_graph";
   item.graph = test::function::GDef(
       {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
        NDef("y", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, kDevice),
@@ -722,14 +1049,14 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   // Make sure that specialized function was added to the library and original
   // function was removed.
   EXPECT_EQ(1, output.library().function_size());
-  EXPECT_EQ("XTimesTwo_specialized_for_y",
+  EXPECT_EQ("XTimesTwo_specialized_for_y_at_tf_graph",
             output.library().function(0).signature().name());
 
   // And 'y' node is calling specialized function.
   int count = 0;
   for (const NodeDef& node : output.node()) {
-    if (node.name() == "y" && count++) {
-      EXPECT_EQ("XTimesTwo_specialized_for_y", node.op());
+    if (node.name() == "y" && ++count) {
+      EXPECT_EQ("XTimesTwo_specialized_for_y_at_tf_graph", node.op());
     }
   }
   EXPECT_EQ(1, count);
@@ -740,12 +1067,76 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_XTimesTwo) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
-TEST_F(FunctionOptimizerTest, SpecializeFunction_PushDownConstInput) {
+TEST_F(FunctionOptimizerTest, SpecializeIndirectFunctionXTimesTwo) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  // Mark XTimesTwo as noinline.
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  (*x_times_two.mutable_attr())["_noinline"].set_b(true);
+  std::vector<FunctionDef> function_library = {x_times_two};
+
+  // Tensorflow graph:
+  //   y = PartitionedCall[f=XTimesTwo, Tin=[DT_FLOAT], Tout=[DT_FLOAT]](x)
+  GrapplerItem item;
+  item.id = "tf_graph";
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y", "PartitionedCall", {"x"},
+            {{"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("XTimesTwo", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Make sure that specialized function was added to the library and original
+  // function was removed.
+  EXPECT_EQ(1, output.library().function_size());
+  EXPECT_EQ("XTimesTwo_specialized_for_y_at_tf_graph",
+            output.library().function(0).signature().name());
+
+  // And 'y' node is calling specialized function.
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "y" && ++count) {
+      EXPECT_EQ("PartitionedCall", node.op());
+      auto& func = AttrSlice(node).Find("f")->func();
+      // Function calls into the specialized function.
+      EXPECT_EQ("XTimesTwo_specialized_for_y_at_tf_graph", func.name());
+      // And input/output types stay the same.
+      auto& tin = AttrSlice(node).Find("Tin")->list();
+      auto& tout = AttrSlice(node).Find("Tout")->list();
+      ASSERT_EQ(1, tin.type_size());
+      ASSERT_EQ(1, tout.type_size());
+      EXPECT_EQ(DT_FLOAT, tin.type(0));
+      EXPECT_EQ(DT_FLOAT, tout.type(0));
+    }
+  }
+  EXPECT_EQ(1, count);
+
+  // And that graph evaluation yields the same result.
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"z"};
+  item.feed.emplace_back("x", pi);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, SpecializeFunctionPushDownConstInput) {
   using test::function::NDef;
 
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
@@ -764,6 +1155,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_PushDownConstInput) {
   const Tensor kTwo = test::AsScalar<float>(2.0);
 
   GrapplerItem item;
+  item.id = "tf_graph";
   item.graph = test::function::GDef(
       {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
        NDef("init", "NoOp", {}, {}, kDevice),
@@ -781,13 +1173,14 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_PushDownConstInput) {
   ASSERT_EQ(1, output.library().function_size());
 
   const FunctionDef& specialized = output.library().function(0);
-  EXPECT_EQ("MyMul_specialized_for_y", specialized.signature().name());
+  EXPECT_EQ("MyMul_specialized_for_y_at_tf_graph",
+            specialized.signature().name());
   EXPECT_EQ(1, specialized.signature().input_arg_size());
 
   // And 'y' node has control dependencies of a pushed down const node.
   int count = 0;
   for (const NodeDef& node : output.node()) {
-    if (node.name() == "y" && count++) {
+    if (node.name() == "y" && ++count) {
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("^init", node.input(1));
@@ -801,7 +1194,87 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_PushDownConstInput) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, SpecializeIndirectFunctionPushDownConstInput) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  // Mark MyMul as noinline.
+  (*mul_func.mutable_attr())["_noinline"].set_b(true);
+  std::vector<FunctionDef> function_library = {mul_func};
+
+  const Tensor kTwo = test::AsScalar<float>(2.0);
+
+  // Tensorflow graph:
+  //   y = PartitionedCall[Tin=[DT_FLOAT], Tout=[DT_FLOAT], f=MyMul](x, two)
+  GrapplerItem item;
+  item.id = "tf_graph";
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("init", "NoOp", {}, {}, kDevice),
+       NDef("two", "Const", {"^init", "^x"},
+            {{"dtype", DT_FLOAT}, {"value", kTwo}}, kDevice),
+       NDef("y", "PartitionedCall", {"x", "two"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("z", "Identity", {"y"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Make sure that specialized function was added to the library and original
+  // function was removed.
+  ASSERT_EQ(1, output.library().function_size());
+
+  const FunctionDef& specialized = output.library().function(0);
+  EXPECT_EQ("MyMul_specialized_for_y_at_tf_graph",
+            specialized.signature().name());
+  EXPECT_EQ(1, specialized.signature().input_arg_size());
+
+  // And 'y' node has control dependencies of a pushed down const node.
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "y" && ++count) {
+      EXPECT_EQ("PartitionedCall", node.op());
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^init", node.input(1));
+      // Function calls into the specialized function.
+      auto& func = AttrSlice(node).Find("f")->func();
+      EXPECT_EQ("MyMul_specialized_for_y_at_tf_graph", func.name());
+      // And input/output type lists were updated.
+      auto& tin = AttrSlice(node).Find("Tin")->list();
+      auto& tout = AttrSlice(node).Find("Tout")->list();
+      ASSERT_EQ(1, tin.type_size());
+      ASSERT_EQ(1, tout.type_size());
+      EXPECT_EQ(DT_FLOAT, tin.type(0));
+      EXPECT_EQ(DT_FLOAT, tout.type(0));
+    }
+  }
+  ASSERT_EQ(1, count);
+
+  // And that graph evaluation yields the same result.
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"z"};
+  item.feed.emplace_back("x", pi);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -824,6 +1297,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_OncePerUniqueContext) {
   const Tensor kThree = test::AsScalar<float>(3.0);
 
   GrapplerItem item;
+  item.id = "tf_graph";
   item.graph = test::function::GDef(
       {NDef("init", "NoOp", {}, {}, kDevice),
 
@@ -856,6 +1330,10 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_OncePerUniqueContext) {
        NDef("mul_6", "MyMul", {"three", "xf"}, {{"T", DT_FLOAT}}, kDevice)},
       function_library);
 
+  // Specify fetch nodes before optimization to prevent pruning unused function
+  // outputs.
+  item.fetch = {"mul_1", "mul_2", "mul_3", "mul_4", "mul_5", "mul_6"};
+
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
@@ -865,39 +1343,40 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_OncePerUniqueContext) {
   // And graph nodes calling specialized functions.
   int count = 0;
   for (const NodeDef& node : output.node()) {
-    if (node.name() == "mul_1" && count++) {
-      EXPECT_EQ("MyMul_specialized_for_mul_1", node.op());
+    if (node.name() == "mul_1" && ++count) {
+      EXPECT_EQ("MyMul_specialized_for_mul_1_at_tf_graph", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("xf", node.input(0));
       EXPECT_EQ("yf", node.input(1));
 
-    } else if (node.name() == "mul_2" && count++) {
-      EXPECT_EQ("MyMul_specialized_for_mul_1", node.op());
+    } else if (node.name() == "mul_2" && ++count) {
+      EXPECT_EQ("MyMul_specialized_for_mul_1_at_tf_graph", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("yf", node.input(0));
       EXPECT_EQ("xf", node.input(1));
 
-    } else if (node.name() == "mul_3" && count++) {
-      EXPECT_EQ("MyMul_specialized_for_mul_3", node.op());
+    } else if (node.name() == "mul_3" && ++count) {
+      EXPECT_EQ("MyMul_specialized_for_mul_3_at_tf_graph", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("xi", node.input(0));
       EXPECT_EQ("yi", node.input(1));
 
-    } else if (node.name() == "mul_4" && count++) {
-      EXPECT_EQ("MyMul_specialized_for_mul_4", node.op());
+    } else if (node.name() == "mul_4" && ++count) {
+      EXPECT_EQ("MyMul_specialized_for_mul_4_at_tf_graph", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("xf", node.input(0));
       EXPECT_EQ("^init", node.input(1));
 
-    } else if (node.name() == "mul_5" && count++) {
-      EXPECT_EQ("MyMul_specialized_for_mul_4", node.op());
+    } else if (node.name() == "mul_5" && ++count) {
+      EXPECT_EQ("MyMul_specialized_for_mul_4_at_tf_graph", node.op());
       ASSERT_EQ(3, node.input_size());
       EXPECT_EQ("yf", node.input(0));
-      EXPECT_EQ("^init", node.input(1));
-      EXPECT_EQ("^xf", node.input(2));
+      gtl::FlatSet<string> expected_ctrl = {"^init", "^xf"};
+      gtl::FlatSet<string> actual_ctrl = {node.input(1), node.input(2)};
+      EXPECT_EQ(expected_ctrl, actual_ctrl);
 
-    } else if (node.name() == "mul_6" && count++) {
-      EXPECT_EQ("MyMul_specialized_for_mul_6", node.op());
+    } else if (node.name() == "mul_6" && ++count) {
+      EXPECT_EQ("MyMul_specialized_for_mul_6_at_tf_graph", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("xf", node.input(0));
       EXPECT_EQ("^init", node.input(1));
@@ -908,11 +1387,10 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_OncePerUniqueContext) {
   // And that graph evaluation yields the same result.
   Tensor pi = test::AsScalar<float>(3.14f);
   Tensor four = test::AsScalar<int32>(4);
-  item.fetch = {"mul_1", "mul_2", "mul_3", "mul_4", "mul_5", "mul_6"};
   item.feed = {{"xf", pi}, {"yf", pi}, {"xi", four}, {"yi", four}};
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
@@ -923,6 +1401,274 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_OncePerUniqueContext) {
   test::ExpectTensorEqual<float>(tensors_expected[5], tensors[5]);
 }
 
+TEST_F(FunctionOptimizerTest, SpecializeFunctionForUsedOutputTensors) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  // MyFunc computes x*y three times and has three output values.
+  FunctionDef my_func = FunctionDefHelper::Create(
+      "MyFunc", {"x:T", "y:T"}, {"z1:T", "z2:T", "z3:T"}, {"T: {float, int32}"},
+      {{{"output1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output2"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output3"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z1", "output1:z:0"}, {"z2", "output2:z:0"}, {"z3", "output3:z:0"}});
+  (*my_func.mutable_attr())["_noinline"].set_b(true);
+  std::vector<FunctionDef> function_library = {my_func};
+
+  GrapplerItem item;
+  item.id = "tf_graph";
+  item.graph = test::function::GDef(
+      {NDef("init", "NoOp", {}, {}, kDevice),
+
+       // Float placeholders.
+       NDef("xf", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("yf", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Specialization #1: DT_FLOAT type parameter. All outputs used.
+       NDef("fn1", "MyFunc", {"xf", "yf"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn1_0", "Identity", {"fn1:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn1_1", "Identity", {"fn1:1"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn1_2", "Identity", {"fn1:2"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #2: DT_FLOAT type parameter. Only first output used.
+       NDef("fn2", "MyFunc", {"xf", "yf"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn2_0", "Identity", {"fn2:0"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #3: DT_FLOAT type parameter. Only second output used.
+       NDef("fn3", "MyFunc", {"xf", "yf"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn3_1", "Identity", {"fn3:1"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #4: DT_FLOAT type parameter. Only last output used.
+       NDef("fn4", "MyFunc", {"xf", "yf"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn4_2", "Identity", {"fn4:2"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #5: DT_FLOAT type parameter. First and last outputs.
+       NDef("fn5", "MyFunc", {"xf", "yf"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn5_0", "Identity", {"fn5:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn5_2", "Identity", {"fn5:2"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #6: DT_FLOAT type parameter. Outputs not used.
+       // Check that function optimizer do not fail. In practice it should be
+       // pruned from the graph before passing to function optimizer.
+       NDef("fn6", "MyFunc", {"xf", "yf"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Make sure that MyFunc was specialized once per unique context.
+  EXPECT_EQ(6, output.library().function_size());
+
+  // And graph nodes calling specialized functions.
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    // All function caller nodes must be specialized.
+    if (node.name() == "fn1" && ++found) {
+      EXPECT_EQ("MyFunc_specialized_for_fn1_at_tf_graph", node.op());
+    } else if (node.name() == "fn2" && ++found) {
+      EXPECT_EQ("MyFunc_specialized_for_fn2_at_tf_graph", node.op());
+    } else if (node.name() == "fn3" && ++found) {
+      EXPECT_EQ("MyFunc_specialized_for_fn3_at_tf_graph", node.op());
+    } else if (node.name() == "fn4" && ++found) {
+      EXPECT_EQ("MyFunc_specialized_for_fn4_at_tf_graph", node.op());
+    } else if (node.name() == "fn5" && ++found) {
+      EXPECT_EQ("MyFunc_specialized_for_fn5_at_tf_graph", node.op());
+    } else if (node.name() == "fn6" && ++found) {
+      EXPECT_EQ("MyFunc_specialized_for_fn6_at_tf_graph", node.op());
+    }
+    // And all consumers of specialized function nodes must be mapped to new
+    // output ports.
+    if (node.name() == "use_fn3_1" && ++found) {
+      EXPECT_EQ("fn3:0", node.input(0));
+    } else if (node.name() == "use_fn4_2" && ++found) {
+      EXPECT_EQ("fn4:0", node.input(0));
+    } else if (node.name() == "use_fn5_0" && ++found) {
+      EXPECT_EQ("fn5:0", node.input(0));
+    } else if (node.name() == "use_fn5_2" && ++found) {
+      EXPECT_EQ("fn5:1", node.input(0));
+    }
+  }
+  EXPECT_EQ(10, found);
+
+  // And that graph evaluation yields the same result.
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"use_fn1_0", "use_fn1_1", "use_fn1_2", "use_fn2_0",
+                "use_fn3_1", "use_fn4_2", "use_fn5_0", "use_fn5_2"};
+  item.feed = {{"xf", pi}, {"yf", pi}};
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  ASSERT_EQ(tensors_expected.size(), tensors.size());
+  for (int i = 0; i < item.fetch.size(); ++i) {
+    test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+  }
+}
+
+TEST_F(FunctionOptimizerTest, SpecializeIndirectFunctionForUsedOutputTensors) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  // MyFunc computes x*y three times and has three output values.
+  FunctionDef my_func = FunctionDefHelper::Create(
+      "MyFunc", {"x:T", "y:T"}, {"z1:T", "z2:T", "z3:T"}, {"T: {float, int32}"},
+      {{{"output1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output2"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output3"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z1", "output1:z:0"}, {"z2", "output2:z:0"}, {"z3", "output3:z:0"}});
+  (*my_func.mutable_attr())["_noinline"].set_b(true);
+  std::vector<FunctionDef> function_library = {my_func};
+
+  GrapplerItem item;
+  item.id = "tf_graph";
+  item.graph = test::function::GDef(
+      {NDef("init", "NoOp", {}, {}, kDevice),
+
+       // Float placeholders.
+       NDef("xf", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("yf", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Specialization #1: DT_FLOAT type parameter. All outputs used.
+       NDef("fn1", "PartitionedCall", {"xf", "yf"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyFunc", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("use_fn1_0", "Identity", {"fn1:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn1_1", "Identity", {"fn1:1"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn1_2", "Identity", {"fn1:2"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #2: DT_FLOAT type parameter. Only first output used.
+       NDef("fn2", "PartitionedCall", {"xf", "yf"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyFunc", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("use_fn2_0", "Identity", {"fn2:0"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #3: DT_FLOAT type parameter. Only second output used.
+       NDef("fn3", "PartitionedCall", {"xf", "yf"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyFunc", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("use_fn3_1", "Identity", {"fn3:1"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #4: DT_FLOAT type parameter. Only last output used.
+       NDef("fn4", "PartitionedCall", {"xf", "yf"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyFunc", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("use_fn4_2", "Identity", {"fn4:2"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #5: DT_FLOAT type parameter. First and last outputs.
+       NDef("fn5", "PartitionedCall", {"xf", "yf"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyFunc", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("use_fn5_0", "Identity", {"fn5:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("use_fn5_2", "Identity", {"fn5:2"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Specialization #6: DT_FLOAT type parameter. Outputs not used.
+       // Check that function optimizer do not fail. In practice it should be
+       // pruned from the graph before passing to function optimizer.
+       NDef("fn6", "PartitionedCall", {"xf", "yf"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyFunc", {{"T", DT_FLOAT}})}},
+            kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Make sure that MyFunc was specialized once per unique context.
+  EXPECT_EQ(6, output.library().function_size());
+
+  // And graph nodes calling specialized functions.
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    // All function caller nodes must be specialized.
+    if (node.name() == "fn1" && ++found) {
+      auto& func = AttrSlice(node).Find("f")->func();
+      auto& tout = AttrSlice(node).Find("Tout")->list();
+      EXPECT_EQ("PartitionedCall", node.op());
+      EXPECT_EQ("MyFunc_specialized_for_fn1_at_tf_graph", func.name());
+      ASSERT_EQ(3, tout.type_size());
+
+    } else if (node.name() == "fn2" && ++found) {
+      auto& func = AttrSlice(node).Find("f")->func();
+      auto& tout = AttrSlice(node).Find("Tout")->list();
+      EXPECT_EQ("PartitionedCall", node.op());
+      EXPECT_EQ("MyFunc_specialized_for_fn2_at_tf_graph", func.name());
+      ASSERT_EQ(1, tout.type_size());
+
+    } else if (node.name() == "fn3" && ++found) {
+      auto& func = AttrSlice(node).Find("f")->func();
+      auto& tout = AttrSlice(node).Find("Tout")->list();
+      EXPECT_EQ("PartitionedCall", node.op());
+      EXPECT_EQ("MyFunc_specialized_for_fn3_at_tf_graph", func.name());
+      ASSERT_EQ(1, tout.type_size());
+
+    } else if (node.name() == "fn4" && ++found) {
+      auto& func = AttrSlice(node).Find("f")->func();
+      auto& tout = AttrSlice(node).Find("Tout")->list();
+      EXPECT_EQ("PartitionedCall", node.op());
+      EXPECT_EQ("MyFunc_specialized_for_fn4_at_tf_graph", func.name());
+      ASSERT_EQ(1, tout.type_size());
+
+    } else if (node.name() == "fn5" && ++found) {
+      auto& func = AttrSlice(node).Find("f")->func();
+      auto& tout = AttrSlice(node).Find("Tout")->list();
+      EXPECT_EQ("PartitionedCall", node.op());
+      EXPECT_EQ("MyFunc_specialized_for_fn5_at_tf_graph", func.name());
+      ASSERT_EQ(2, tout.type_size());
+
+    } else if (node.name() == "fn6" && ++found) {
+      auto& func = AttrSlice(node).Find("f")->func();
+      auto& tout = AttrSlice(node).Find("Tout")->list();
+      EXPECT_EQ("PartitionedCall", node.op());
+      EXPECT_EQ("MyFunc_specialized_for_fn6_at_tf_graph", func.name());
+      ASSERT_EQ(0, tout.type_size());
+    }
+    // And all consumers of specialized function nodes must be mapped to new
+    // output ports.
+    if (node.name() == "use_fn3_1" && ++found) {
+      EXPECT_EQ("fn3:0", node.input(0));
+    } else if (node.name() == "use_fn4_2" && ++found) {
+      EXPECT_EQ("fn4:0", node.input(0));
+    } else if (node.name() == "use_fn5_0" && ++found) {
+      EXPECT_EQ("fn5:0", node.input(0));
+    } else if (node.name() == "use_fn5_2" && ++found) {
+      EXPECT_EQ("fn5:1", node.input(0));
+    }
+  }
+  EXPECT_EQ(10, found);
+
+  // And that graph evaluation yields the same result.
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"use_fn1_0", "use_fn1_1", "use_fn1_2", "use_fn2_0",
+                "use_fn3_1", "use_fn4_2", "use_fn5_0", "use_fn5_2"};
+  item.feed = {{"xf", pi}, {"yf", pi}};
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  ASSERT_EQ(tensors_expected.size(), tensors.size());
+  for (int i = 0; i < item.fetch.size(); ++i) {
+    test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+  }
+}
+
 TEST_F(FunctionOptimizerTest, PruningUselessLibraryFunctions) {
   using test::function::NDef;
   FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer.h b/tensorflow/core/grappler/optimizers/graph_optimizer.h
index 765dd13263f0298a850c8598d499bfeeda2f186c..e587a2b2af74cb417ac58f672a4cc5526335d0a8 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer.h
@@ -16,8 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_OPTIMIZER_H_
 
+#include <string>
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -29,6 +31,7 @@ struct GrapplerItem;
 // optimization of a GrapplerItem for running on a cluster.
 class GraphOptimizer {
  public:
+  GraphOptimizer() : deadline_usec_(0) {}
   virtual ~GraphOptimizer() {}
 
   virtual string name() const = 0;
@@ -45,8 +48,28 @@ class GraphOptimizer {
   // call to Optimize) performed.  Lower "result" scores are better.
   virtual void Feedback(Cluster* cluster, const GrapplerItem& item,
                         const GraphDef& optimized_graph, double result) = 0;
+
+  // Set deadline in microseconds since epoch. A value of zero means no
+  // deadline.
+  void set_deadline_usec(uint64 deadline_usec) {
+    deadline_usec_ = deadline_usec;
+  }
+  uint64 deadline_usec() const { return deadline_usec_; }
+  bool DeadlineExceeded() const {
+    return deadline_usec_ > 0 && Env::Default()->NowMicros() > deadline_usec_;
+  }
+
+ private:
+  uint64 deadline_usec_;
 };
 
+#define GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED()                              \
+  do {                                                                      \
+    if (this->DeadlineExceeded()) {                                         \
+      return errors::DeadlineExceeded(this->name(), " exceeded deadline."); \
+    }                                                                       \
+  } while (0)
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
index 1ea57f7b4f003e8a98fe187f6325e39ebe30e9e7..82c408b521f58bcde685474ba13146d2f56379ba 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+#include "tensorflow/core/graph/tensor_id.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -46,25 +47,27 @@ Status GetTensorProperties(const GraphOptimizerContext& ctx,
     return errors::InvalidArgument("Graph properties are unknown.");
   }
 
-  int port;
-  string tensor_node_name = ParseNodeName(tensor, &port);
-  if (port < 0) {
+  // TODO(ezhulenev): Make it TensorId when graph properties will support
+  // absl::string_view lookup.
+  SafeTensorId tensor_id = ParseTensorName(tensor);
+
+  if (tensor_id.index() < 0) {
     return errors::InvalidArgument(
         "Can't get tensor properties of control dependency ", tensor);
   }
 
   const auto& output_properties =
-      ctx.graph_properties->GetOutputProperties(tensor_node_name);
+      ctx.graph_properties->GetOutputProperties(tensor_id.node());
   auto num_outputs = output_properties.size();
 
-  if (num_outputs == 0 || port > num_outputs - 1) {
+  if (num_outputs == 0 || tensor_id.index() > num_outputs - 1) {
     return errors::InvalidArgument(
-        "Node ", tensor_node_name,
-        " is missing output properties at position :", port,
+        "Node ", tensor_id.node(),
+        " is missing output properties at position :", tensor_id.index(),
         " (num_outputs=", num_outputs, ")");
   }
 
-  properties->CopyFrom(output_properties[port]);
+  properties->CopyFrom(output_properties[tensor_id.index()]);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index 2afb5df4318307259752795e079ce58aeb27802b..99fcb31523800c76b8c413da92576fc16092f588 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -46,17 +47,20 @@ struct GraphOptimizerContext {
   GraphOptimizerContext(const std::unordered_set<string>* nodes_to_preserve,
                         GraphDef* optimized_graph,
                         GraphProperties* graph_properties, NodeMap* node_map,
+                        gtl::FlatSet<string>* feed_nodes,
                         RewriterConfig::Toggle opt_level)
       : nodes_to_preserve(nodes_to_preserve),
         optimized_graph(optimized_graph),
         graph_properties(graph_properties),
         node_map(node_map),
+        feed_nodes(feed_nodes),
         opt_level(opt_level) {}
 
   const std::unordered_set<string>* nodes_to_preserve;
   GraphDef* optimized_graph;
   GraphProperties* graph_properties;
   NodeMap* node_map;
+  gtl::FlatSet<string>* feed_nodes;
   RewriterConfig::Toggle opt_level;
 };
 
@@ -235,7 +239,8 @@ class GraphOptimizerStagePipeline {
         // case of any error it must leave optimized graph unmodified.
         if (!stage_status.ok()) {
           LOG(WARNING) << "Failed to run optimizer " << stage->optimizer_name()
-                       << ", stage " << stage->stage_name()
+                       << ", stage " << stage->stage_name() << " node "
+                       << node->name()
                        << ". Error: " << stage_status.error_message();
         }
         if (break_predicate_(*result)) return true;
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
index 34f28c7c2760445b103346fb57501f86b2d486e3..799c40c67bca0ae4cdac99b59404b2942cb481b4 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
@@ -61,6 +61,7 @@ TEST_F(GraphOptimizerStageTest, OptimizedNodeName) {
                             /*optimized_graph*/ nullptr,
                             /*graph_properties*/ nullptr,
                             /*node_name*/ nullptr,
+                            /*feed_nodes*/ nullptr,
                             /*opt_level*/ RewriterConfig::ON);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
@@ -97,6 +98,7 @@ TEST_F(GraphOptimizerStageTest, GetInputNodeAndProperties) {
                             /*optimized_graph*/ &item.graph,
                             /*graph_properties*/ &properties,
                             /*node_name*/ &node_map,
+                            /*feed_nodes*/ nullptr,
                             /*opt_level*/ RewriterConfig::ON);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
@@ -137,6 +139,7 @@ TEST_F(GraphOptimizerStageTest, AddNodes) {
                             /*optimized_graph*/ &item.graph,
                             /*graph_properties*/ &properties,
                             /*node_name*/ &node_map,
+                            /*feed_nodes*/ nullptr,
                             /*opt_level*/ RewriterConfig::ON);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.cc b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
deleted file mode 100644
index b45ceb12a7972d8e0fb15c0562d0e4ceeeeeef1c..0000000000000000000000000000000000000000
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
-#include <unordered_map>
-#include <unordered_set>
-#include "tensorflow/core/framework/function.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/node_def_util.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_def.pb.h"
-#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/utils.h"
-
-namespace tensorflow {
-namespace grappler {
-
-GraphRewriter::GraphRewriter(const GrapplerItem& item) {
-  OpRegistryInterface* op_registry = OpRegistry::Global();
-  for (auto& node : item.graph.node()) {
-    NodeInfo* info = new NodeInfo();
-    info->def = &node;
-
-    const OpRegistrationData* op_reg_data = nullptr;
-    Status s = op_registry->LookUp(node.op(), &op_reg_data);
-    // TODO(bsteiner): make this not a best-effort lookup and evaluation?
-    if (s.ok()) {
-      DataTypeVector inputs;
-      s = InOutTypesForNode(node, op_reg_data->op_def, &inputs, &info->outputs);
-      if (!s.ok()) {
-        info->outputs.clear();
-      }
-    }
-
-    nodes_[node.name()].reset(info);
-  }
-
-  std::unordered_set<string> function_names;
-  for (const auto& function : item.graph.library().function()) {
-    function_names.insert(function.signature().name());
-  }
-
-  for (auto& node : item.graph.node()) {
-    RecordConnectivity(node, function_names);
-  }
-}
-
-void GraphRewriter::ForwardInputs(
-    const NodeDef& original_node,
-    const std::unordered_set<const NodeDef*>& nodes_to_delete,
-    NodeDef* new_node) {
-  ForwardInputsInternal(original_node, nodes_to_delete, false, new_node);
-  if (!new_node->name().empty()) {
-    optimized_nodes_[new_node->name()] = new_node;
-  }
-  // Reorder inputs such that control inputs come after regular inputs.
-  int pos = 0;
-  for (int i = 0; i < new_node->input_size(); ++i) {
-    if (!IsControlInput(new_node->input(i))) {
-      new_node->mutable_input()->SwapElements(pos, i);
-      ++pos;
-    }
-  }
-  DedupControlInputs(new_node);
-}
-
-bool GraphRewriter::DrivesControlDependency(const NodeDef& node) const {
-  return control_dependency_drivers_.find(&node) !=
-         control_dependency_drivers_.end();
-}
-
-bool GraphRewriter::FeedsMerge(const NodeDef& node) const {
-  return merge_feeders_.find(&node) != merge_feeders_.end();
-}
-
-bool GraphRewriter::IsDrivenByControlDependency(const NodeDef& node) const {
-  for (const auto& input : node.input()) {
-    CHECK(!input.empty());
-    if (input[0] == '^') {
-      return true;
-    }
-  }
-  return false;
-}
-
-bool GraphRewriter::IsConnectedToFunction(const NodeDef& node) const {
-  return function_neighbors_.find(&node) != function_neighbors_.end();
-}
-
-bool GraphRewriter::IsDrivenByAnotherDevice(const NodeDef& node) const {
-  return cross_device_receivers_.find(&node) != cross_device_receivers_.end();
-}
-
-bool GraphRewriter::ReceivesRefValue(const NodeDef& node) const {
-  return ref_receivers_.find(&node) != ref_receivers_.end();
-}
-
-bool GraphRewriter::IsDrivenBySwitch(const NodeDef& node) const {
-  return switch_receivers_.find(&node) != switch_receivers_.end();
-}
-
-bool GraphRewriter::RemovalIncreasesEdgeCount(const NodeDef& node) const {
-  const int in_degree = node.input_size();
-  auto itr = nodes_.find(node.name());
-  if (itr == nodes_.end()) {
-    return true;
-  }
-  const int out_degree = itr->second->out_degree;
-  return in_degree * out_degree > in_degree + out_degree;
-}
-
-void GraphRewriter::RecordConnectivity(
-    const NodeDef& node, const std::unordered_set<string>& function_names) {
-  const bool is_function =
-      function_names.find(node.op()) != function_names.end();
-
-  bool ref_receiver = false;
-  bool switch_receiver = false;
-  for (const auto& input : node.input()) {
-    int position = 0;
-    string input_node_name = ParseNodeName(input, &position);
-    auto itr = nodes_.find(input_node_name);
-    if (itr == nodes_.end()) {
-      continue;
-    }
-
-    NodeInfo* fanin_info = itr->second.get();
-    const NodeDef* fanin = fanin_info->def;
-    if (IsMerge(node)) {
-      merge_feeders_.insert(fanin);
-    }
-    // Update out_degree of fanin.
-    ++fanin_info->out_degree;
-    if (position < 0) {
-      // This is a control edge
-      control_dependency_drivers_.insert(fanin);
-    } else {
-      // This is a regular edge
-      if (function_names.find(fanin->op()) != function_names.end()) {
-        function_neighbors_.insert(&node);
-      }
-      if (is_function) {
-        function_neighbors_.insert(fanin);
-      }
-      if (IsSwitch(*fanin)) {
-        switch_receiver = true;
-      }
-      if (position < fanin_info->outputs.size() &&
-          IsRefType(fanin_info->outputs[position])) {
-        ref_receiver = true;
-      }
-    }
-    if (fanin->device() != node.device()) {
-      cross_device_receivers_.insert(&node);
-    }
-  }
-
-  if (ref_receiver) {
-    ref_receivers_.insert(&node);
-  }
-  if (switch_receiver) {
-    switch_receivers_.insert(&node);
-  }
-}
-
-void GraphRewriter::ForwardInputsInternal(
-    const NodeDef& node,
-    const std::unordered_set<const NodeDef*>& nodes_to_delete,
-    bool add_as_control, NodeDef* new_node) {
-  // To speed things up, use the optimized version of the node if
-  // available.
-  auto itr = optimized_nodes_.find(node.name());
-  if (itr != optimized_nodes_.end()) {
-    for (const string& input : itr->second->input()) {
-      *new_node->add_input() =
-          add_as_control ? AsControlDependency(NodeName(input)) : input;
-    }
-    return;
-  }
-  for (const auto& input : node.input()) {
-    const string input_node_name = NodeName(input);
-    auto itr = nodes_.find(input_node_name);
-    if (itr == nodes_.end()) {
-      // Invalid input, preserve it as is.
-      *new_node->add_input() =
-          add_as_control ? AsControlDependency(NodeName(input)) : input;
-      continue;
-    }
-    const NodeDef* input_node = itr->second->def;
-    if (nodes_to_delete.find(input_node) != nodes_to_delete.end()) {
-      ForwardInputsInternal(*input_node, nodes_to_delete,
-                            add_as_control || IsControlInput(input), new_node);
-    } else {
-      *new_node->add_input() =
-          add_as_control ? AsControlDependency(NodeName(input)) : input;
-    }
-  }
-}
-
-}  // end namespace grappler
-}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.h b/tensorflow/core/grappler/optimizers/graph_rewriter.h
deleted file mode 100644
index 4a5a150dc9234ffdeed9b991c828e8ec30befde8..0000000000000000000000000000000000000000
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
-#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
-
-#include <unordered_map>
-#include <unordered_set>
-#include "tensorflow/core/grappler/grappler_item.h"
-
-namespace tensorflow {
-namespace grappler {
-
-// Tools and utilities to simplify common graph rewrites.
-class GraphRewriter {
- public:
-  GraphRewriter(const GrapplerItem& item);
-
-  // Forward the inputs of original_node as needed to skip over the nodes that
-  // are to be deleted. In other words, if I is an input of 'original_node', and
-  // I doesn't belong to one of the nodes in 'nodes_to_delete', I will be an
-  // input to 'new_node'. On the other hand, if I belong to a node that will be
-  // deleted, I will be replaced with the inputs J of the deleted node (unless J
-  // belong to nodes that will be deleted, in which case we'll look for
-  // preserved inputs further down the graph).
-  void ForwardInputs(const NodeDef& original_node,
-                     const std::unordered_set<const NodeDef*>& nodes_to_delete,
-                     NodeDef* new_node);
-
-  // Returns true if at least one of the edges in the direct fanout of 'node' is
-  // a control dependency edge.
-  bool DrivesControlDependency(const NodeDef& node) const;
-
-  // Returns true if at least one of the incident edges is a control dependency
-  // edge.
-  bool IsDrivenByControlDependency(const NodeDef& node) const;
-
-  // Returns true if at least one of the nodes in the direct fanin or the direct
-  // fanout (excluding control dependencies) of 'node' is a function.
-  bool IsConnectedToFunction(const NodeDef& node) const;
-
-  // Returns true if the node is driven by at least one node placed on another
-  // device.
-  bool IsDrivenByAnotherDevice(const NodeDef& node) const;
-
-  // Returns true if the node has input from a stateful op.
-  bool ReceivesRefValue(const NodeDef& node) const;
-
-  // Returns true if the node is driven by a Switch node.
-  bool IsDrivenBySwitch(const NodeDef& node) const;
-
-  // Returns true if the node feeds a Merge node.
-  bool FeedsMerge(const NodeDef& node) const;
-
-  // Returns true if removal of this degree would increase edge count, i.e. if
-  // in-degree * out-degree > in-degree + out-degree or if the condition could
-  // not be verified.
-  bool RemovalIncreasesEdgeCount(const NodeDef& node) const;
-
- private:
-  void RecordConnectivity(const NodeDef& node,
-                          const std::unordered_set<string>& function_names);
-  void ForwardInputsInternal(
-      const NodeDef& original_node,
-      const std::unordered_set<const NodeDef*>& nodes_to_delete,
-      bool add_as_control, NodeDef* new_node);
-
-  struct NodeInfo {
-    int out_degree = 0;
-    const NodeDef* def;
-
-    // These are filled in when the NodeInfo is built, but not that they
-    // may be empty - if the op could not be loaded from the registry.
-    DataTypeVector outputs;
-  };
-
-  std::unordered_map<string, std::unique_ptr<NodeInfo>> nodes_;
-  std::unordered_map<string, const NodeDef*> optimized_nodes_;
-  std::unordered_set<const NodeDef*> control_dependency_drivers_;
-  std::unordered_set<const NodeDef*> function_neighbors_;
-  std::unordered_set<const NodeDef*> cross_device_receivers_;
-  std::unordered_set<const NodeDef*> ref_receivers_;
-  std::unordered_set<const NodeDef*> switch_receivers_;
-  std::unordered_set<const NodeDef*> merge_feeders_;
-};
-
-}  // end namespace grappler
-}  // end namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 3251e7cb1027a184917218f2a5a4560fa0dee43c..8f25a1c8c1c48281fb44c01a142348863836d5aa 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -119,6 +118,8 @@ std::set<string> GetOpsFormatAgnostic() {
                                           "Exit",
                                           "Exp",
                                           "Expm1",
+                                          "FakeQuantWithMinMaxVars",
+                                          "FakeQuantWithMinMaxArgs",
                                           "Fill",
                                           "Floor",
                                           "FloorDiv",
@@ -161,6 +162,8 @@ std::set<string> GetOpsFormatAgnostic() {
                                           "PreventGradient",
                                           "Prod",
                                           "Polygamma",
+                                          "QuantizeAndDequantizeV2",
+                                          "QuantizeAndDequantizeV3",
                                           "Pow",
                                           "Real",
                                           "RealDiv",
@@ -1965,9 +1968,9 @@ class DataLayoutOptimizer : GraphProcessor {
   // Expand all nodes which is in NHWC, but supports NCHW or is layout agnostic.
   Status Expand() {
     int node_size_original = graph_->node_size();
-    std::unordered_map<const NodeDef*, std::vector<int>> frames;
-    int num_frames;
-    TF_RETURN_IF_ERROR(IdentifyFrames(*graph_, &frames, &num_frames));
+
+    FrameView frame_view;
+    TF_RETURN_IF_ERROR(frame_view.InferFromGraph(*graph_));
 
     // This is the first pass where we expand the nodes which support NCHW.
     std::set<string> ops_format_supported = GetOpsFormatSupported();
@@ -1979,7 +1982,7 @@ class DataLayoutOptimizer : GraphProcessor {
       if (ops_format_supported.find(graph_->node(i).op()) !=
           ops_format_supported.end()) {
         auto node = graph_->mutable_node(i);
-        bool is_in_frame = !frames[node].empty();
+        bool is_in_frame = frame_view.IsInFrame(*node);
         OptimizeContext opt_cxt(graph_, node, node_map_, graph_properties_,
                                 virtual_placer_, nodes_to_preserve_,
                                 is_in_frame);
@@ -2029,7 +2032,7 @@ class DataLayoutOptimizer : GraphProcessor {
         if (ops_format_agnostic.find(graph_->node(i).op()) !=
             ops_format_agnostic.end()) {
           auto node = graph_->mutable_node(i);
-          bool is_in_frame = !frames[node].empty();
+          bool is_in_frame = frame_view.IsInFrame(*node);
           OptimizeContext opt_cxt(graph_, node, node_map_, graph_properties_,
                                   virtual_placer_, nodes_to_preserve_,
                                   is_in_frame);
@@ -2188,6 +2191,7 @@ Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     *output = item.graph;
     return status;
   }
+  GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
 
   TuningConfig config;
   config.no_gemm = true;
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index f3a07be72840c357e50c1b52f303550655eec4fb..36064738408c744db53cb9e95645d6a2968b1746 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -29,12 +30,11 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -379,14 +379,14 @@ Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(
 
 Status LoopInvariantNodeMotionOptimizer::Optimize() {
   node_map_.reset(new NodeMap(optimized_graph_));
-  FrameMap frame_map;
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                               &frame_map, &num_frames));
+  FrameView frame_view;
+  // TODO(ezhulenev): Use GraphView when migrated from NodeMap.
+  TF_RETURN_IF_ERROR(frame_view.InferFromGraph(*optimized_graph_));
+
   std::deque<int> worklist;
-  for (auto iter = frame_map.begin(); iter != frame_map.end(); ++iter) {
-    auto* node = iter->first;
-    auto& frame_ids = iter->second;
+  for (const NodeDef& node : optimized_graph_->node()) {
+    const std::vector<int>& frame_ids = frame_view.Frames(node);
+
     if (frame_ids.size() >= 3) {
       for (unsigned int i = 1; i < frame_ids.size() - 1; ++i) {
         frame_parent_[frame_ids[i]] = frame_ids[i - 1];
@@ -399,18 +399,18 @@ Status LoopInvariantNodeMotionOptimizer::Optimize() {
     }
     if (!frame_ids.empty()) {
       frame_children_.insert(std::make_pair(frame_ids.back(), empty_set_));
-      if (node->op() == "LoopCond") {
+      if (node.op() == "LoopCond") {
         if (loop_cond_.count(frame_ids.back())) {
           return errors::InvalidArgument(
               "Loop ", frame_ids.back(),
-              " has more than one LoopCond node: ", node->name(), " and ",
+              " has more than one LoopCond node: ", node.name(), " and ",
               loop_cond_[frame_ids.back()]->name());
         }
-        loop_cond_[frame_ids.back()] = node;
+        loop_cond_[frame_ids.back()] = &node;
       }
-      if (IsEnter(*node) && node->attr().at("is_constant").b()) {
+      if (IsEnter(node) && node.attr().at("is_constant").b()) {
         invariant_enters_[frame_ids.back()].push_back(
-            const_cast<NodeDef*>(node));
+            const_cast<NodeDef*>(&node));
       }
     }
   }
@@ -565,13 +565,14 @@ Status EvaluateBoolOpForConstantOperands(const NodeDef& op_node,
   return Status::OK();
 }
 
-Status CheckForDeadFanout(const GraphView& view, const NodeDef& switch_node,
-                          const NodeMap& node_map,
+Status CheckForDeadFanout(const MutableGraphView& view,
+                          const NodeDef& switch_node, const NodeMap& node_map,
                           DeviceBase* cpu_device, ResourceMgr* resource_mgr,
                           bool* has_dead_fanout, int* dead_fanout) {
   *has_dead_fanout = false;
   GraphView::InputPort switch_loopcond_port(&switch_node, 1);
-  NodeDef* switch_predicate = view.GetRegularFanin(switch_loopcond_port).node;
+  const NodeDef* switch_predicate =
+      view.GetRegularFanin(switch_loopcond_port).node;
 
   // CASE 1: Control is a constant.
   if (IsConstant(*switch_predicate)) {
@@ -582,7 +583,7 @@ Status CheckForDeadFanout(const GraphView& view, const NodeDef& switch_node,
   }
 
   GraphView::InputPort switch_input_port(&switch_node, 0);
-  NodeDef* switch_input = view.GetRegularFanin(switch_input_port).node;
+  const NodeDef* switch_input = view.GetRegularFanin(switch_input_port).node;
 
   // CASE 2: Zero-iteration while loop.
   // We check if its a while loop such that the condition is a simple binary
@@ -707,10 +708,9 @@ Status LoopOptimizer::RemoveDeadBranches(
   std::unordered_map<NodeDef*, std::set<int>> dead_merge_inputs;
   // TODO(bsteiner): also rewrite switches as identity. For now we just record
   // them
-  std::unordered_set<GraphView::OutputPort, GraphView::HashPort>
-      identity_switches;
+  absl::flat_hash_set<GraphView::OutputPort> identity_switches;
 
-  GraphView view(optimized_graph);
+  MutableGraphView view(optimized_graph);
   for (const NodeDef& node : optimized_graph->node()) {
     if (!IsSwitch(node)) {
       continue;
@@ -727,11 +727,12 @@ Status LoopOptimizer::RemoveDeadBranches(
     if (!has_dead_fanout) {
       continue;
     }
-    GraphView::OutputPort dead(const_cast<NodeDef*>(&node), dead_fanout);
+    GraphView::OutputPort dead(&node, dead_fanout);
     identity_switches.insert(dead);
 
-    SetVector<GraphView::InputPort, GraphView::HashPort> zombie_inputs;
-    for (const GraphView::InputPort& port : view.GetFanout(dead)) {
+    SetVector<MutableGraphView::InputPort, absl::Hash<MutableGraphView::Port>>
+        zombie_inputs;
+    for (const MutableGraphView::InputPort& port : view.GetFanout(dead)) {
       if (dead_nodes.find(port.node) == dead_nodes.end()) {
         zombie_inputs.PushBack(port);
       }
@@ -745,7 +746,7 @@ Status LoopOptimizer::RemoveDeadBranches(
         dead_merge_inputs;
     bool found_node_to_preserve = false;
     while (!found_node_to_preserve && !zombie_inputs.Empty()) {
-      GraphView::InputPort dead = zombie_inputs.PopBack();
+      MutableGraphView::InputPort dead = zombie_inputs.PopBack();
       if (nodes_to_preserve.find(dead.node->name()) !=
           nodes_to_preserve.end()) {
         found_node_to_preserve = true;
@@ -764,9 +765,9 @@ Status LoopOptimizer::RemoveDeadBranches(
           found_node_to_preserve = true;
           break;
         }
-        GraphView::OutputPort value_index(dead.node, 1);
-        const std::unordered_set<GraphView::InputPort, GraphView::HashPort>&
-            index_fanout = view.GetFanout(value_index);
+        MutableGraphView::OutputPort value_index(dead.node, 1);
+        const absl::flat_hash_set<MutableGraphView::InputPort>& index_fanout =
+            view.GetFanout(value_index);
         if (!index_fanout.empty()) {
           // The 2nd output (that indicates which input is propagated) is
           // connected. This never happens in practice, so we'll just skip this
@@ -779,7 +780,6 @@ Status LoopOptimizer::RemoveDeadBranches(
         if (dead.port_id < 0) {
           // If the control dependency never gets triggered the merge will also
           // never get triggered.
-          local_dead_nodes.insert(dead.node);
           fully_dead = true;
         } else {
           local_dead_merge_inputs[dead.node].insert(dead.port_id);
@@ -787,12 +787,12 @@ Status LoopOptimizer::RemoveDeadBranches(
               dead.node->attr().at("N").i()) {
             fully_dead = true;
           }
-          if (fully_dead) {
-            local_dead_nodes.insert(dead.node);
-            for (const GraphView::InputPort& port :
-                 view.GetFanouts(*dead.node, true)) {
-              zombie_inputs.PushBack(port);
-            }
+        }
+        if (fully_dead) {
+          local_dead_nodes.insert(dead.node);
+          for (const MutableGraphView::InputPort& port :
+               view.GetFanouts(*dead.node, true)) {
+            zombie_inputs.PushBack(port);
           }
         }
       } else if (dead.node->op() == "ControlTrigger") {
@@ -801,7 +801,7 @@ Status LoopOptimizer::RemoveDeadBranches(
         break;
       } else {
         if (local_dead_nodes.insert(dead.node).second) {
-          for (const GraphView::InputPort& dead_fanout :
+          for (const MutableGraphView::InputPort& dead_fanout :
                view.GetFanouts(*dead.node, true)) {
             zombie_inputs.PushBack(dead_fanout);
           }
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index 7c04f55381edca8f6a6679edb73479414f4c6f0b..d467237a9a704a81a0ecc1da71531868c7f3a49b 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <unordered_set>
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index 81f40db8f0b7ec0bc79713493940ad24b2f657b3..587767c23c370ca1f747fc5b4e2bfa4cba3ae10d 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -101,27 +101,30 @@ TEST_F(LoopOptimizerTest, Basic) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd")).back(), 0);
+  }
 
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd")).back(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, Const) {
@@ -149,26 +152,29 @@ TEST_F(LoopOptimizerTest, Const) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const")).back(), 0);
+  }
 
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const")).back(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 0);
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const")).size(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, ControlOutput) {
@@ -197,24 +203,27 @@ TEST_F(LoopOptimizerTest, ControlOutput) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+  }
 
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoop1) {
@@ -258,31 +267,34 @@ TEST_F(LoopOptimizerTest, NestedLoop1) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+  }
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoop2) {
@@ -326,27 +338,30 @@ TEST_F(LoopOptimizerTest, NestedLoop2) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+  }
 
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoopConst1) {
@@ -390,28 +405,31 @@ TEST_F(LoopOptimizerTest, NestedLoopConst1) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const2")).back(), 1);
+  }
 
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 0);
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const2")).back(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoopConst2) {
@@ -455,26 +473,29 @@ TEST_F(LoopOptimizerTest, NestedLoopConst2) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const2")).back(), 1);
+  }
 
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 0);
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 0);
+  }
 }
 
 void VerifyGraphsEqual(const GraphDef& original_graph,
@@ -720,6 +741,418 @@ TEST_F(LoopOptimizerTest, RemoveDeadBranches_ConstantCondition) {
   }
 }
 
+TEST_F(LoopOptimizerTest, RemoveDeadBranches_FullyRemoveDeadBranches) {
+  const string gdef_ascii = R"EOF(
+node {
+  name: "episodicreplaybuffer_add_readvariableop_resource"
+  op: "_Arg"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "index"
+    value {
+      i: 0
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/and_1/x"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: true
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/begin_episode"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: false
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Switch"
+  op: "Switch"
+  input: "EpisodicReplayBuffer/add/and_1/x"
+  input: "EpisodicReplayBuffer/add/and_1/x"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/NoOp"
+  op: "NoOp"
+  input: "^EpisodicReplayBuffer/add/and_1/x"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Assert/Switch"
+  op: "Switch"
+  input: "EpisodicReplayBuffer/add/and_1/x"
+  input: "EpisodicReplayBuffer/add/and_1/x"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@EpisodicReplayBuffer/add/assert_equal/All"
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Assert/Switch_1"
+  op: "Switch"
+  input: "EpisodicReplayBuffer/add/begin_episode"
+  input: "EpisodicReplayBuffer/add/and_1/x"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@EpisodicReplayBuffer/add/begin_episode"
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Assert/Switch_2"
+  op: "Switch"
+  input: "EpisodicReplayBuffer/add/begin_episode"
+  input: "EpisodicReplayBuffer/add/and_1/x"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@EpisodicReplayBuffer/add/end_episode"
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/switch_f"
+  op: "Identity"
+  input: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Switch"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/control_dependency"
+  op: "Const"
+  input: "^EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/NoOp"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        tensor_content: "\001"
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Assert"
+  op: "Assert"
+  input: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Assert/Switch"
+  input: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Assert/Switch_1"
+  input: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Assert/Switch_2"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      list {
+        type: DT_BOOL
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    key: "summarize"
+    value {
+      i: 3
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/control_dependency_1"
+  op: "Identity"
+  input: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/switch_f"
+  input: "^EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Assert"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/switch_f"
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Merge"
+  op: "Merge"
+  input: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/control_dependency_1"
+  input: "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/control_dependency"
+  input: "^EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Assert"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/FloorMod/y"
+  op: "Const"
+  input: "^EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Merge"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT64
+        tensor_shape {
+        }
+        int64_val: 5000
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/ReadVariableOp"
+  op: "ReadVariableOp"
+  input: "episodicreplaybuffer_add_readvariableop_resource"
+  input: "^EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Merge"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/Less/y"
+  op: "Const"
+  input: "^EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Merge"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT64
+        tensor_shape {
+        }
+        int64_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/Less"
+  op: "Less"
+  input: "EpisodicReplayBuffer/add/ReadVariableOp"
+  input: "EpisodicReplayBuffer/add/Less/y"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT64
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/or"
+  op: "LogicalOr"
+  input: "EpisodicReplayBuffer/add/begin_episode"
+  input: "EpisodicReplayBuffer/add/Less"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+}
+node {
+  name: "EpisodicReplayBuffer/add/get_episode_id/pred_id"
+  op: "Identity"
+  input: "EpisodicReplayBuffer/add/or"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/get_episode_id/Switch"
+  op: "Switch"
+  input: "EpisodicReplayBuffer/add/or"
+  input: "EpisodicReplayBuffer/add/or"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/get_episode_id/critical_section_execute/AssignVariableOp/Switch"
+  op: "Switch"
+  input: "episodicreplaybuffer_add_readvariableop_resource"
+  input: "EpisodicReplayBuffer/add/get_episode_id/pred_id"
+  input: "^EpisodicReplayBuffer/add/ReadVariableOp"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_RESOURCE
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@EpisodicReplayBuffer/add/ReadVariableOp/resource"
+      }
+    }
+  }
+}
+node {
+  name: "EpisodicReplayBuffer/add/get_episode_id/critical_section_execute/ReadVariableOp_3"
+  op: "ReadVariableOp"
+  input: "^EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Merge"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 27
+}
+  )EOF";
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  item.fetch = {
+      "EpisodicReplayBuffer/add/get_episode_id/critical_section_execute/"
+      "ReadVariableOp_3"};
+
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE, nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_CHECK_OK(status);
+
+  bool found_merge = false;
+  for (const auto& node : output.node()) {
+    if (node.name() ==
+        "EpisodicReplayBuffer/add/assert_equal/Assert/AssertGuard/Merge") {
+      found_merge = true;
+    }
+  }
+
+  EXPECT_TRUE(found_merge)
+      << "Merge node was deleted, but it shouldn't have been.";
+}
+
 TEST_F(LoopOptimizerTest, RemoveDeadBranches_ZeroIterWhile) {
   const string gdef_ascii = R"EOF(
 node {
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index c775a2691431e041e2d6208664bee4ed4cbf2359..227c2bb8b0f3d3e6809f65f3b3716270b0c2c6e5 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -29,20 +29,24 @@ limitations under the License.
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
-#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
 #include "tensorflow/core/grappler/optimizers/static_schedule.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+
 // Prefix added to nodes which are recomputed.
 const char* kRecomputedNodePrefix = "Recomputed";
 const char* kRecomputeTriggerNodePrefix = "RecomputeTrigger";
@@ -492,7 +496,7 @@ void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
 
 bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
   // Look for AddN nodes (and equivalent) and record input names.
-  GraphView view(&item->graph);
+  MutableGraphView view(&item->graph);
 
   std::unordered_map<string, std::unordered_set<NodeDef*>> addn_list;
   for (NodeDef& node : *item->graph.mutable_node()) {
@@ -574,9 +578,9 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     }
 
     // Compute a topological ordering for the node fanin.
-    std::unordered_map<NodeDef*, int> topo_order;
+    std::unordered_map<const NodeDef*, int> topo_order;
     ReverseDfs(view, {node}, nullptr,
-               [&topo_order](NodeDef* n) {
+               [&topo_order](const NodeDef* n) {
                  int topo_index = topo_order.size();
                  topo_order[n] = topo_index;
                },
@@ -587,7 +591,7 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
     for (int i = 0; i < node->input_size(); ++i) {
       const string& input = node->input(i);
       const string node_name = NodeName(input);
-      NodeDef* node = view.GetNode(node_name);
+      const NodeDef* node = view.GetNode(node_name);
       input_topo_index.push_back(topo_order.at(node));
     }
     int min_input_topo_index = INT_MAX;
@@ -744,25 +748,6 @@ Status BuildSwapPair(NodeDef* node, int input_to_swap,
   return Status::OK();
 }
 
-static int64 EstimateSize(const OpInfo::TensorProperties& t) {
-  DataType dtype = t.dtype();
-  int64 size = DataTypeSize(dtype);
-  TensorShapeProto shape = t.shape();
-  if (shape.unknown_rank()) {
-    // Can't infer the size if the rank is unknown. It has to be at least a
-    // scalar though.
-    return size;
-  }
-  // If one of the dimensions is unknown statically, assume it's at least one.
-  for (int i = 0; i < shape.dim_size(); ++i) {
-    if (shape.dim(i).size() < 0) {
-      shape.mutable_dim(i)->set_size(1);
-    }
-  }
-  int64 num_elems = TensorShape(shape).num_elements();
-  return num_elems * size;
-}
-
 struct SwapInfo {
   std::vector<int> inputs_to_swap;
   Costs::NanoSeconds time_to_swap = 0;
@@ -848,7 +833,8 @@ static const NodeDef* FindSwapInTrigger(
   return nullptr;
 }
 
-static bool IsSwappable(const GraphView& graph, GraphView::OutputPort output) {
+static bool IsSwappable(const MutableGraphView& graph,
+                        MutableGraphView::OutputPort output) {
   const NodeDef& node = *output.node;
   // There is no point in swapping out persistent tensors, since the tensor will
   // continue to use memory.
@@ -874,10 +860,10 @@ static bool IsSwappable(const GraphView& graph, GraphView::OutputPort output) {
     // If placed on the same device, these nodes are just forwarding references
     // to their input. Therefore they are swappable iff their fanin is swappable
     // or it resides on a different device.
-    GraphView::InputPort input;
+    MutableGraphView::InputPort input;
     input.node = output.node;
     input.port_id = 0;
-    GraphView::OutputPort fanin = graph.GetRegularFanin(input);
+    MutableGraphView::OutputPort fanin = graph.GetRegularFanin(input);
     if (fanin.node->device() == node.device()) {
       return IsSwappable(graph, fanin);
     }
@@ -886,19 +872,19 @@ static bool IsSwappable(const GraphView& graph, GraphView::OutputPort output) {
 }
 
 static NodeDef* FindSwapOutTrigger(
-    const NodeDef* node, int input_id, const GraphView& view,
+    const NodeDef* node, int input_id, const MutableGraphView& view,
     const std::unordered_map<const NodeDef*, Costs::NanoSeconds>&
         execution_times) {
   // Find the output port that generated the tensor to swap.
-  GraphView::InputPort swap;
+  MutableGraphView::InputPort swap;
   swap.node = const_cast<NodeDef*>(node);
   swap.port_id = input_id;
-  GraphView::OutputPort generator = view.GetRegularFanin(swap);
+  MutableGraphView::OutputPort generator = view.GetRegularFanin(swap);
   if (!generator.node) {
     return nullptr;
   }
 
-  const std::unordered_set<GraphView::InputPort, GraphView::HashPort>& fanout =
+  const absl::flat_hash_set<MutableGraphView::InputPort>& fanout =
       view.GetFanout(generator);
   NodeDef* trigger = nullptr;
   Costs::NanoSeconds earliest_fanout(Costs::NanoSeconds::infinity());
@@ -917,7 +903,7 @@ static NodeDef* FindSwapOutTrigger(
   return trigger;
 }
 
-static bool IsSwappable(GraphView::InputPort input) {
+static bool IsSwappable(MutableGraphView::InputPort input) {
   const NodeDef& node = *input.node;
 
   const OpDef* op_def;
@@ -934,9 +920,9 @@ static bool IsSwappable(GraphView::InputPort input) {
 }
 
 struct MemInfo {
-  GraphView::OutputPort port;
+  MutableGraphView::OutputPort port;
   int64 memory_used;
-  std::vector<GraphView::InputPort> uses_left;
+  std::vector<MutableGraphView::InputPort> uses_left;
   double fitness;
 
   bool operator<(const MemInfo& other) const { return fitness < other.fitness; }
@@ -1007,7 +993,7 @@ static bool IdentifySwappingCandidates(
 
     std::vector<MemInfo> mem_state;
 
-    GraphView graph(&item->graph);
+    MutableGraphView graph(&item->graph);
     for (const auto& live_tensor : mem_usage.live_tensors) {
       if (live_tensor.memory_used <= 1024) {
         // Don't bother with small tensors.
@@ -1023,7 +1009,7 @@ static bool IdentifySwappingCandidates(
       if (skip_list->find(live_tensor.node) != skip_list->end()) {
         continue;
       }
-      GraphView::OutputPort port =
+      MutableGraphView::OutputPort port =
           graph.GetOutputPort(live_tensor.node, live_tensor.output_id);
       if (!IsSwappable(graph, port)) {
         continue;
@@ -1034,7 +1020,7 @@ static bool IdentifySwappingCandidates(
       Costs::Duration allocation_time = live_tensor.allocation_time;
       Costs::Duration earliest_use(Costs::Duration::infinity());
       bool valid = true;
-      for (GraphView::InputPort input : graph.GetFanout(port)) {
+      for (MutableGraphView::InputPort input : graph.GetFanout(port)) {
         // Get execution time.
         auto it = op_completion_times.find(input.node->name());
         if (it == op_completion_times.end()) {
@@ -1076,7 +1062,7 @@ static bool IdentifySwappingCandidates(
         // the values do not fit into any integral type.
         mem_info.fitness =
             MathUtil::IPow<double>((earliest_use - peak_time).count(), 2) /
-            MathUtil::IPow<double>(mem_info.uses_left.size(), 2) +
+                MathUtil::IPow<double>(mem_info.uses_left.size(), 2) +
             MathUtil::IPow<double>((allocation_time - peak_time).count(), 2);
         mem_info.fitness = -mem_info.fitness;
         mem_state.push_back(mem_info);
@@ -1087,7 +1073,8 @@ static bool IdentifySwappingCandidates(
     std::sort(mem_state.begin(), mem_state.end());
 
     for (const MemInfo& mem_info : mem_state) {
-      for (const GraphView::InputPort fanout_to_swap : mem_info.uses_left) {
+      for (const MutableGraphView::InputPort fanout_to_swap :
+           mem_info.uses_left) {
         VLOG(1) << "Will swap fanout " << fanout_to_swap.node->name() << ":"
                 << fanout_to_swap.port_id << " of tensor "
                 << mem_info.port.node->name() << ":" << mem_info.port.port_id
@@ -1149,7 +1136,7 @@ bool SwappingPass(RewriterConfig::MemOptType optimization_level,
     int64 bytes_to_swap = 0;
     for (int64 input_id : swap_info.inputs_to_swap) {
       const OpInfo::TensorProperties& t = props[input_id];
-      bytes_to_swap += EstimateSize(t);
+      bytes_to_swap += CalculateTensorSize(t);
     }
     // Let's assume we're going to swap over PCIe running at 16 GBps.
     swap_info.time_to_swap = bytes_to_swap / 16;
@@ -1164,7 +1151,7 @@ bool SwappingPass(RewriterConfig::MemOptType optimization_level,
   for (const auto& node : item->graph.node()) {
     name_map[node.name()] = &node;
   }
-  GraphView view(&item->graph);
+  MutableGraphView view(&item->graph);
 
   bool updated_graph = false;
 
@@ -1225,6 +1212,20 @@ bool SwappingPass(RewriterConfig::MemOptType optimization_level,
   return updated_graph;
 }
 
+bool CrossesTaskOrCpuGpuBoundary(const NodeDef& node1, const NodeDef& node2) {
+  string task1;
+  string device1;
+  DeviceNameUtils::SplitDeviceName(node1.device(), &task1, &device1);
+  string task2;
+  string device2;
+  DeviceNameUtils::SplitDeviceName(node2.device(), &task2, &device2);
+  return task1 != task2 ||
+         (str_util::StrContains(device1, DEVICE_CPU) &&
+          str_util::StrContains(device2, DEVICE_GPU)) ||
+         (str_util::StrContains(device1, DEVICE_GPU) &&
+          str_util::StrContains(device2, DEVICE_CPU));
+}
+
 // TODO(rmlarsen): Add distributed TF test.
 Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
   std::unordered_set<string> devices;
@@ -1256,22 +1257,23 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
   TF_RETURN_IF_ERROR(graph_view.Initialize(*optimized_graph));
   for (int i : assign_nodes) {
     if (optimized_nodes.find(i) == optimized_nodes.end()) {
-      const NodeDef& node = optimized_graph->node(i);
+      const NodeDef& assign_node = optimized_graph->node(i);
       optimized_nodes.insert(i);
       std::vector<int> assign_nodes_in_fanout;
       assign_nodes_in_fanout.push_back(i);
       std::set<int> transitive_fanout;
       graph_view.DepthFirstSearch(std::unordered_set<string>{}, i,
                                   &transitive_fanout);
-      const string& assign_device = node.device();
       bool relax_constraint = true;
       // If all nodes in the transitive fanout are on the same device as the
       // assign node, there is no need to allocate the output in pinned memory.
       for (int fanout : transitive_fanout) {
         const NodeDef& fanout_node = optimized_graph->node(fanout);
         if (relax_constraint &&
-            (fanout_node.device() != assign_device || IsSend(fanout_node))) {
+            (IsSend(fanout_node) ||
+             CrossesTaskOrCpuGpuBoundary(fanout_node, assign_node))) {
           relax_constraint = false;
+          break;
         }
         if (optimized_nodes.find(fanout) == optimized_nodes.end() &&
             IsAssign(fanout_node)) {
@@ -1279,17 +1281,18 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
         }
       }
 
-      for (int assign_idx : assign_nodes_in_fanout) {
-        if (relax_constraint) {
+      if (relax_constraint) {
+        for (int assign_idx : assign_nodes_in_fanout) {
           // If all devices match in fanout of node(i) then, by transitivity,
           // they must also match in the fanout of other assign nodes
-          // node(assign_idx) in the fanout, so we can process them here,
+          // in the fanout of node(i), so we can process them here,
           // and save computing their transitive fanout later.
           optimized_nodes.insert(assign_idx);
 
           // Set an attribute telling AssignOp to ignore allocator constraints.
-          NodeDef* assign_node = optimized_graph->mutable_node(assign_idx);
-          (*assign_node
+          NodeDef* assign_node_to_relax =
+              optimized_graph->mutable_node(assign_idx);
+          (*assign_node_to_relax
                 ->mutable_attr())["_grappler_relax_allocator_constraints"]
               .set_b(true);
         }
@@ -1299,20 +1302,22 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
   return Status::OK();
 }
 
+}  // namespace
+
 Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* optimized_graph) {
-  *optimized_graph = item.graph;
+  GrapplerItem optimized_item(item);
 
   RecomputationRewritingPass(optimization_level_,
-                             recomputation_targets_name_scope_, optimized_graph,
-                             item);
+                             recomputation_targets_name_scope_,
+                             &optimized_item.graph, item);
 
-  GrapplerItem optimized_item(item, optimized_graph);
   std::unordered_set<string> skip_list;
   // Bound the number of rewrite passes to avoid long processing times on graphs
   // that simply won't fit in memory.
   bool updated_graph = true;
   for (int i = 0; i < 25 && updated_graph; ++i) {
+    GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
     updated_graph = false;
     if ((optimization_level_ == RewriterConfig::DEFAULT_MEM_OPT ||
          optimization_level_ == RewriterConfig::SCHEDULING_HEURISTICS ||
@@ -1321,6 +1326,7 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       updated_graph |= SchedulingPass(cluster, &optimized_item);
     }
 
+    GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
     if ((optimization_level_ == RewriterConfig::DEFAULT_MEM_OPT ||
          optimization_level_ == RewriterConfig::SWAPPING_HEURISTICS ||
          optimization_level_ == RewriterConfig::HEURISTICS ||
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.h b/tensorflow/core/grappler/optimizers/memory_optimizer.h
index 653ffaec4c206cbb925bc40359f3950b17255a75..baaadf64b3b891f6390eeaf7e745a996d3dd04c9 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.h
@@ -16,6 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_MEMORY_OPTIMIZER_H_
 
+#include <string>
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index a3f0e078616efe1c54453bdd8e26c677d52435bd..356b23dec0de7d8648fd92b977413720654f2451 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -15,16 +15,23 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 
+#include <memory>
+#include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -272,7 +279,7 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   EXPECT_EQ("^swap_out_e_0", new_c.input(1));
 
   // Run the optimizer a second time to ensure it's idempotent.
-  GrapplerItem item_copy(item, std::move(output));
+  GrapplerItem item_copy = item.WithGraph(std::move(output));
   status = optimizer.Optimize(cluster.get(), item_copy, &output);
   TF_EXPECT_OK(status);
 
@@ -280,7 +287,7 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   item.fetch = {"e"};
   item.init_ops = {init.name()};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 #endif
@@ -330,7 +337,7 @@ TEST_F(MemoryOptimizerTest, SwappingHeuristics) {
 
 #if GOOGLE_CUDA
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
@@ -379,7 +386,7 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) {
 
 #if GOOGLE_CUDA
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 #endif
@@ -467,7 +474,7 @@ TEST_F(RelaxAllocatorConstraintsTest, SameDevice) {
   item.fetch = {"exp"};
   item.init_ops = {"variable"};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -498,12 +505,37 @@ TEST_F(RelaxAllocatorConstraintsTest, DifferentDevice) {
   item.fetch = {"exp"};
   item.init_ops = {"variable"};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 #endif
 }
 
+TEST_F(RelaxAllocatorConstraintsTest, SameDeviceType) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output constant = ops::Const(s.WithOpName("constant").WithDevice("/cpu:0"),
+                               -3.14f, {128, 128});
+  Output variable = ops::Variable(s.WithOpName("variable").WithDevice("/cpu:0"),
+                                  {128, 128}, DT_FLOAT);
+  Output assign = ops::Assign(s.WithOpName("assign").WithDevice("/cpu:0"),
+                              variable, constant);
+  // Assign and Exp run on different devies, but do not straddle a CPU:GPU
+  // boundary, so we can we do not need to enforce allocation in pinned memory.
+  Output exp = ops::Exp(s.WithOpName("exp").WithDevice("/cpu:1"), assign);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  MemoryOptimizer optimizer(RewriterConfig::MANUAL);
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  auto node = output.node(2);
+  EXPECT_EQ("assign", node.name());
+  EXPECT_EQ(1, node.attr().count("_grappler_relax_allocator_constraints"));
+  EXPECT_TRUE(node.attr().at("_grappler_relax_allocator_constraints").b());
+}
+
 TEST_F(RelaxAllocatorConstraintsTest, SendNode) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output constant = ops::Const(s.WithOpName("constant").WithDevice("/cpu:0"),
@@ -566,7 +598,7 @@ TEST_F(RelaxAllocatorConstraintsTest, AssignNodeInFanout) {
   item.fetch = {"assign0", "assign1"};
   item.init_ops = {"exp1", "variable1"};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   for (int i = 0; i < tensors_expected.size(); ++i) {
     test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index c3d70a1fdf852871460007fe39f3392b66e6a1a5..572cc41d765f5b0e285bbff3ff600c15fbed1431 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
@@ -38,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/util/dump_graph.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
@@ -85,6 +87,18 @@ bool IsTPUGraphDef(const GraphDef& def) {
   return false;
 }
 
+uint64 DeadlineMicroSeconds(const RewriterConfig& cfg) {
+  const uint64 kFiveMinutesInUsec = 5 * 60 * 1000 * 1000;
+  if (cfg.meta_optimizer_timeout_ms() < 0) {
+    return 0;
+  } else {
+    return cfg.meta_optimizer_timeout_ms() == 0
+               ? Env::Default()->NowMicros() + kFiveMinutesInUsec
+               : Env::Default()->NowMicros() +
+                     cfg.meta_optimizer_timeout_ms() * 1000;
+  }
+}
+
 }  // namespace
 
 #define MK_OPT(NAME, VALUE) \
@@ -114,6 +128,14 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
 
 #undef MK_OPT
 
+MetaOptimizer::MetaOptimizer(DeviceBase* cpu_device, const ConfigProto& cfg)
+    : cpu_device_(cpu_device),
+      config_proto_(cfg),
+      cfg_(*config_proto_.mutable_graph_options()->mutable_rewrite_options()) {
+  DCHECK(cpu_device_ == nullptr ||
+         cpu_device_->attributes().device_type() == "CPU");
+}
+
 Status MetaOptimizer::InitializeOptimizers(
     std::vector<std::unique_ptr<GraphOptimizer>>* optimizers) const {
   if (cfg_.disable_meta_optimizer()) {
@@ -260,6 +282,18 @@ MetaOptimizer::GetCustomGraphOptimizerConfig(const string& name) const {
   return nullptr;
 }
 
+#define RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer)                            \
+  {                                                                            \
+    const Status status = RunOptimizer(optimizer, cluster, &optimized_item,    \
+                                       optimized_graph, &optimization_result); \
+    if (status.ok()) {                                                         \
+      is_optimized = true;                                                     \
+    } else if (cfg_.fail_on_optimizer_errors()) {                              \
+      VLOG(2) << "Optimizer '" << optimizer->name() << "' failed: " << status; \
+      TF_RETURN_IF_ERROR(status);                                              \
+    }                                                                          \
+  }
+
 Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
                                     GraphDef* optimized_graph) {
   int min_graph_nodes = cfg_.min_graph_nodes() == 0 ? kDefaultMinGraphNodes
@@ -309,6 +343,7 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
 
     VLOG(4) << "Starting optimization iteration " << iteration;
     for (const auto& optimizer : optimizers) {
+      GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
       // Some optimizers can run only once.
       if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue;
       // Some must run only on the last iteration.
@@ -320,9 +355,7 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
         if (fusion_optimizer == nullptr) fusion_optimizer = optimizer.get();
         continue;
       }
-      Status status = RunOptimizer(optimizer.get(), cluster, &optimized_item,
-                                   optimized_graph, &optimization_result);
-      if (status.ok()) is_optimized = true;
+      RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer.get());
     }
   }
 
@@ -333,16 +366,12 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
   // optimizations from taking place since we don't have shape inference for
   // functions, and we can't optimize across function boundaries.
   if (fusion_optimizer != nullptr) {
-    Status status = RunOptimizer(fusion_optimizer, cluster, &optimized_item,
-                                 optimized_graph, &optimization_result);
-    if (status.ok()) is_optimized = true;
+    RUN_OPTIMIZER_OR_RETURN_IF_ERROR(fusion_optimizer);
   }
 
   // ScopedAllocatorOptimizer must run last.
   if (sa_optimizer != nullptr) {
-    Status status = RunOptimizer(sa_optimizer, cluster, &optimized_item,
-                                 optimized_graph, &optimization_result);
-    if (status.ok()) is_optimized = true;
+    RUN_OPTIMIZER_OR_RETURN_IF_ERROR(sa_optimizer);
   }
 
   // Record graph optimization result.
@@ -359,6 +388,8 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
   return Status::OK();
 }
 
+#undef RUN_OPTIMIZER_OR_RETURN_IF_ERROR
+
 Status MetaOptimizer::RunOptimizer(
     GraphOptimizer* optimizer, Cluster* cluster, GrapplerItem* optimized_item,
     GraphDef* optimized_graph, GraphOptimizationResult* optimization_result) {
@@ -367,6 +398,7 @@ Status MetaOptimizer::RunOptimizer(
   // resets optimized_graph to an empty graph.
   optimized_graph->Swap(&optimized_item->graph);
   *optimized_graph = GraphDef();
+  optimizer->set_deadline_usec(this->deadline_usec());
   Status status =
       optimizer->Optimize(cluster, *optimized_item, optimized_graph);
   uint64 end_us = Env::Default()->NowMicros();
@@ -393,9 +425,33 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   VLOG(1) << "Starting optimization for grappler item: " << item.id;
   optimization_results_.clear();
 
+  // 0. Original graph might contain a huge function library, that is mostly
+  // unused. This library copied over by each individual Grappler optimizer,
+  // which adds a huge overhead. Before starting optimization passes we just
+  // remove all the unreachable functions.
+  // TODO(ezhulenev): Construct reachable function library definition directly
+  // from the proto without constructing temporary FunctionLibraryDefinition.
+  GraphDef trimmed_graph;  // do not copy graph with a potentially huge library
+  *trimmed_graph.mutable_node() = item.graph.node();
+  *trimmed_graph.mutable_versions() = item.graph.versions();
+  *trimmed_graph.mutable_library() =
+      grappler::ReachableFunctionLibraryDefinition(
+          FunctionLibraryDefinition(OpRegistry::Global(), item.graph.library()),
+          item.graph)
+          .ToProto();
+
+  GrapplerItem trimmed_item = item.WithGraph(std::move(trimmed_graph));
+
+  VLOG(1) << absl::Substitute(
+      "Deleted $0 unreachable functions from the graph (library size = $1)",
+      item.graph.library().function_size() -
+          trimmed_item.graph.library().function_size(),
+      trimmed_item.graph.library().function_size());
+
   // 1. Optimize main graph
-  TF_RETURN_IF_ERROR(OptimizeGraph(cluster, item, optimized_graph));
+  TF_RETURN_IF_ERROR(OptimizeGraph(cluster, trimmed_item, optimized_graph));
   VLOG(1) << "Optimized main graph.";
+  GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
 
   // Skip optimizing functions if this is a TPU graph. Currently, Grappler
   // passes do not handle TPU functions correctly in a variety of ways (Note
@@ -407,15 +463,20 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   // optimize TPU functions with Grappler, this check preserves that.
   if (IsTPUGraphDef(*optimized_graph)) {
     VLOG(2) << "Skipping optimizing funcs for TPU graphs";
+    if (VLOG_IS_ON(1)) {
+      DumpGraphDefToFile("after_MetaOptimizer", *optimized_graph);
+    }
     return Status::OK();
   }
 
-  // 2. Optimize function library
-  FunctionLibraryDefinition flib(OpRegistry::Global(),
-                                 optimized_graph->library());
+  // 2. Optimize functions reachable from the optimized graph.
+  FunctionLibraryDefinition flib = ReachableFunctionLibraryDefinition(
+      FunctionLibraryDefinition(OpRegistry::Global(),
+                                optimized_graph->library()),
+      *optimized_graph);
 
   // Find functions for which we might need to compute a gradient at runtime.
-  gtl::FlatSet<string> differentiable_functions;
+  absl::flat_hash_set<string> differentiable_functions;
   for (const NodeDef& node : optimized_graph->node()) {
     if (IsSymbolicGradient(node)) {
       const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
@@ -424,15 +485,20 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   }
 
   // Optimize each function only once.
-  std::unordered_set<string> optimized_funcs;
+  absl::flat_hash_set<string> optimized_funcs;
   bool optimize_function_library = true;
 
   while (optimize_function_library) {
     optimize_function_library = false;
 
     for (const FunctionDef& func : optimized_graph->library().function()) {
+      GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
+
       const string& func_name = func.signature().name();
 
+      // Skip functions that are not reachable from the optimized graph.
+      if (!flib.Contains(func_name)) continue;
+
       // Skip already optimized functions.
       if (optimized_funcs.find(func_name) != optimized_funcs.end()) continue;
 
@@ -452,15 +518,25 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       // Make a GrapplerItem from a FunctionDef.
       GrapplerFunctionItem func_item;
       TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
-          func, flib, item.graph.versions().producer(), &func_item));
+          func, flib, trimmed_item.graph.versions().producer(), &func_item));
 
       // If we need to compute the gradient of optimized function at runtime, we
       // can't perform non-differentiable rewrites.
       if (differentiable_functions.find(func_name) !=
           differentiable_functions.end()) {
-        func_item.allowed_optimizations.non_differentiable_rewrites = false;
+        func_item.allowed_optimizations().non_differentiable_rewrites = false;
       }
 
+      // Function item is allowed to use all devices from the main graph.
+      Status added_devices = func_item.AddDevices(item);
+      if (!added_devices.ok()) {
+        VLOG(3) << added_devices.error_message();
+      }
+
+      // We can safely inline nested function calls with side-effectful ops into
+      // the function body (see function_optimizer.cc for details).
+      func_item.allowed_optimizations().inline_ops_with_side_effects = true;
+
       // Optimize function body graph.
       GraphDef optimized_func_graph;
       TF_RETURN_IF_ERROR(
@@ -493,6 +569,9 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   VLOG(1) << "Optimized " << optimized_funcs.size()
           << " functions: " << str_util::Join(optimized_funcs, ", ");
 
+  if (VLOG_IS_ON(1)) {
+    DumpGraphDefToFile("after_MetaOptimizer", *optimized_graph);
+  }
   return Status::OK();
 }
 
@@ -510,32 +589,40 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
   // Nothing to do for MetaOptimizer.
 }
 
-bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
-  if (cfg.disable_meta_optimizer()) {
+bool MetaOptimizerEnabled(const ConfigProto& cfg) {
+  const auto& rewrite_cfg = cfg.graph_options().rewrite_options();
+  if (rewrite_cfg.disable_meta_optimizer()) {
     return false;
   }
-  return !cfg.disable_model_pruning() ||
-         cfg.layout_optimizer() != RewriterConfig::OFF ||
-         cfg.function_optimization() != RewriterConfig::OFF ||
-         cfg.constant_folding() != RewriterConfig::OFF ||
-         cfg.shape_optimization() != RewriterConfig::OFF ||
-         cfg.remapping() != RewriterConfig::OFF ||
-         cfg.arithmetic_optimization() != RewriterConfig::OFF ||
-         cfg.loop_optimization() != RewriterConfig::OFF ||
-         cfg.dependency_optimization() != RewriterConfig::OFF ||
-         cfg.auto_parallel().enable() ||
-         cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
-         cfg.debug_stripper() == RewriterConfig::ON ||
-         cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
-         cfg.pin_to_host_optimization() == RewriterConfig::ON ||
-         !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
+  return !rewrite_cfg.disable_model_pruning() ||
+         rewrite_cfg.layout_optimizer() != RewriterConfig::OFF ||
+         rewrite_cfg.function_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.constant_folding() != RewriterConfig::OFF ||
+         rewrite_cfg.shape_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.remapping() != RewriterConfig::OFF ||
+         rewrite_cfg.arithmetic_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.loop_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.dependency_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.auto_parallel().enable() ||
+         rewrite_cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
+         rewrite_cfg.debug_stripper() == RewriterConfig::ON ||
+         rewrite_cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
+         rewrite_cfg.pin_to_host_optimization() == RewriterConfig::ON ||
+         !rewrite_cfg.optimizers().empty() ||
+         !rewrite_cfg.custom_optimizers().empty();
 }
 
-Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
+Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
                         DeviceBase* cpu_device, Cluster* cluster,
                         GraphDef* optimized_graph) {
   MetaOptimizer optimizer(cpu_device, cfg);
-  return optimizer.Optimize(cluster, item, optimized_graph);
+  optimizer.set_deadline_usec(
+      DeadlineMicroSeconds(cfg.graph_options().rewrite_options()));
+  Status status = optimizer.Optimize(cluster, item, optimized_graph);
+  if (!status.ok()) {
+    *optimized_graph = item.graph;
+  }
+  return status;
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index 99a0a33ffac8f28e385790fd4b6c7e802b0606a1..a06da4394e4b8a4d8e75855a0a432114f7d7fcb3 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -28,8 +29,7 @@ namespace grappler {
 // Run the other grappler optimizers based on the specified rewriter config.
 class MetaOptimizer : public GraphOptimizer {
  public:
-  MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
-      : cpu_device_(cpu_device), cfg_(cfg) {}
+  MetaOptimizer(DeviceBase* cpu_device, const ConfigProto& cfg);
   ~MetaOptimizer() override = default;
 
   string name() const override { return "meta_optimizer"; };
@@ -66,7 +66,8 @@ class MetaOptimizer : public GraphOptimizer {
                        GraphDef* optimized_graph);
 
   DeviceBase* const cpu_device_;  // may be NULL
-  RewriterConfig cfg_;
+  ConfigProto config_proto_;
+  RewriterConfig& cfg_;
 
   struct OptimizerResult {
     string optimizer_name;
@@ -86,7 +87,7 @@ class MetaOptimizer : public GraphOptimizer {
   std::vector<GraphOptimizationResult> optimization_results_;
 };
 
-bool MetaOptimizerEnabled(const RewriterConfig& cfg);
+bool MetaOptimizerEnabled(const ConfigProto& cfg);
 
 // Run the meta optimizer.
 //
@@ -94,7 +95,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg);
 // during constant folding; if NULL, a new device is created for doing constant
 // folding. For performance, it is recommended to pass in an existing cpu_device
 // when possible.
-Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
+Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
                         DeviceBase* cpu_device, Cluster* cluster,
                         GraphDef* optimized_graph);
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 3f3f43382ff60af8ebf6ac63b2daccc4fbd710b3..12db5d6ca9b001fa04e42e6d228fe6289d87726e 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 
+#include "absl/strings/substitute.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -107,7 +108,7 @@ class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
                   GraphDef* optimized_graph) override {
     *optimized_graph = item.graph;
     if (allowed_optimizations_) {
-      allowed_optimizations_->insert({item.id, item.allowed_optimizations});
+      allowed_optimizations_->insert({item.id, item.allowed_optimizations()});
     }
     return Status::OK();
   }
@@ -133,11 +134,13 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
   CHECK(fake_input.NextItem(&item));
 
   TestOptimizer::SetOptimized(false);
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("TestOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -150,13 +153,15 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizerWithParams) {
   CHECK(fake_input.NextItem(&item));
 
   TestOptimizer::SetOptimized(false);
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("TestOptimizerWithParams");
   auto* custom_config = rewriter_config.add_custom_optimizers();
   custom_config->set_name("TestOptimizerWithParams");
   (*custom_config->mutable_parameter_map())["foo"] = AttrValue();
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -170,13 +175,15 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizerAndCustomGraphOptimizer) {
 
   TestOptimizer::SetOptimized(false);
   TestGraphOptimizer::SetOptimized(false);
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("TestOptimizer");
   auto customGraphOptimizer = rewriter_config.add_custom_optimizers();
   customGraphOptimizer->set_name("TestGraphOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -189,11 +196,13 @@ TEST_F(MetaOptimizerTest, RunOptimizersTwice) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -204,13 +213,15 @@ TEST_F(MetaOptimizerTest, RunToggleOptimizersAndCustomGraphOptimizerTwice) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   auto customGraphOptimizer = rewriter_config.add_custom_optimizers();
   customGraphOptimizer->set_name("TestGraphOptimizer");
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -221,13 +232,16 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   using test::function::NDef;
 
   // Enable ony function optimization.
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_function_optimization(RewriterConfig::ON);
   rewriter_config.add_optimizers("function");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
 
   // Define function library:
   //
@@ -266,6 +280,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   //   square = MySquare(a);        // a^2
   //   quadratic = MyQuadratic(b);  // b^4
   GrapplerItem item;
+  item.id = "tf_graph";
   item.graph = test::function::GDef(
       {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
        NDef("b", "Placeholder", {}, {{"dtype", DT_INT32}}, kDevice),
@@ -285,36 +300,47 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
                                            output.library());
 
   // Specialized and optimized functions should be added to the graph.
-  EXPECT_EQ(5, optimized_flib.num_functions());
+  EXPECT_EQ(6, optimized_flib.num_functions());
+
+  // Get a specialized function name.
+  const auto specialized_name = [](const string& fn, const string& node,
+                                   const string& id) {
+    return absl::Substitute("$0_specialized_for_$1_at_$2", fn, node, id);
+  };
 
   // MyQuadratic should be specialized once:
   //   0. 'quadratic' node in the main graph
-  const string optimized_0 = "MyQuadratic_specialized_for_quadratic";
+  const string optimized_0 =
+      specialized_name("MyQuadratic", "quadratic", "tf_graph");
 
   // MySquare should be specialized and optimized for 3 instantiations:
-  //   1.  'square' node in the main graph
-  //   2.  'square' node in the MyQuadratic specialization
-  //   3*. 'quadratic' node in the MyQuadratic specialization
-  //        has identical instantiation context to #2
+  //   1. 'square' node in the main graph
+  //   2. 'square' node in the MyQuadratic specialization (not in a fetch set)
+  //   3. 'quadratic' node in the MyQuadratic specialization (is in a fetch set)
 
-  const string optimized_1 = "MySquare_specialized_for_square";
-  const string optimized_2 = "MySquare_specialized_for_square_1";
+  const string optimized_1 = specialized_name("MySquare", "square", "tf_graph");
+  const string optimized_2 =
+      specialized_name("MySquare", "square", optimized_0);
+  const string optimized_3 =
+      specialized_name("MySquare", "quadratic", optimized_0);
 
   const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0);
   const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1);
   const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2);
+  const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3);
 
   ASSERT_NE(optimized_func_0, nullptr);
   ASSERT_NE(optimized_func_1, nullptr);
   ASSERT_NE(optimized_func_2, nullptr);
+  ASSERT_NE(optimized_func_3, nullptr);
 
   // Graph should call optimized function.
   int count = 0;
   for (const NodeDef& node : output.node()) {
-    if (node.name() == "square" && count++) {
-      EXPECT_EQ("MySquare_specialized_for_square", node.op());
-    } else if (node.name() == "quadratic" && count++) {
-      EXPECT_EQ("MyQuadratic_specialized_for_quadratic", node.op());
+    if (node.name() == "square" && ++count) {
+      EXPECT_EQ(optimized_1, node.op());
+    } else if (node.name() == "quadratic" && ++count) {
+      EXPECT_EQ(optimized_0, node.op());
     }
   }
   EXPECT_EQ(2, count);
@@ -322,41 +348,40 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   // Specialized MySquare should call specialized functions.
   count = 0;
   for (const NodeDef& node : optimized_func_0->node_def()) {
-    if (node.name() == "square" && count++) {
-      EXPECT_EQ(optimized_2, node.op());
-    } else if (node.name() == "quadratic" && count++) {
-      // Share specialized function with the 'square' node.
+    if (node.name() == "square" && ++count) {
       EXPECT_EQ(optimized_2, node.op());
+    } else if (node.name() == "quadratic" && ++count) {
+      EXPECT_EQ(optimized_3, node.op());
     }
   }
   EXPECT_EQ(2, count);
 
-  const std::vector<const FunctionDef*> optimized_funcs = {optimized_func_1,
-                                                           optimized_func_2};
+  const std::vector<const FunctionDef*> optimized_funcs = {
+      optimized_func_1, optimized_func_2, optimized_func_3};
 
   // MyMul should be inlined into all optimized versions of MySquare.
   for (const FunctionDef* optimized_func : optimized_funcs) {
     count = 0;
     for (const NodeDef& node : optimized_func->node_def()) {
-      if (node.name() == "my_mul/inlined_inputs" && count++) {
+      if (node.name() == "my_mul/inlined_inputs" && ++count) {
         EXPECT_EQ("IdentityN", node.op());
         EXPECT_EQ(2, node.input_size());
         EXPECT_EQ("x:0", node.input(0));
         EXPECT_EQ("x:0", node.input(1));
-      } else if (node.name() == "my_mul/x" && count++) {
+      } else if (node.name() == "my_mul/x" && ++count) {
         EXPECT_EQ("Identity", node.op());
         EXPECT_EQ(1, node.input_size());
         EXPECT_EQ("my_mul/inlined_inputs:output:0", node.input(0));
-      } else if (node.name() == "my_mul/y" && count++) {
+      } else if (node.name() == "my_mul/y" && ++count) {
         EXPECT_EQ("Identity", node.op());
         EXPECT_EQ(1, node.input_size());
         EXPECT_EQ("my_mul/inlined_inputs:output:1", node.input(0));
-      } else if (node.name() == "my_mul/mul" && count++) {
+      } else if (node.name() == "my_mul/mul" && ++count) {
         EXPECT_EQ("Mul", node.op());
         EXPECT_EQ(2, node.input_size());
         EXPECT_EQ("my_mul/x:output:0", node.input(0));
         EXPECT_EQ("my_mul/y:output:0", node.input(1));
-      } else if (node.name() == "my_mul" && count++) {
+      } else if (node.name() == "my_mul" && ++count) {
         EXPECT_EQ("IdentityN", node.op());
         EXPECT_EQ(1, node.input_size());
         EXPECT_EQ("my_mul/mul:z:0", node.input(0));
@@ -371,13 +396,119 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   item.feed.emplace_back("b", test::AsScalar<int>(4));
   auto tensors_expected = EvaluateFetchNodes(item);
 
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
   test::ExpectTensorEqual<int>(tensors_expected[1], tensors[1]);
 }
 
+TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
+  using test::function::NDef;
+
+  // Enable function optimization and pruning.
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.set_function_optimization(RewriterConfig::ON);
+  rewriter_config.add_optimizers("function");
+  rewriter_config.add_optimizers("pruning");
+  rewriter_config.set_min_graph_nodes(-1);
+
+  MetaOptimizer optimizer(nullptr, config_proto);
+
+  // MyFunc defines two Mul nodes inside function body and two corresponding
+  // function outputs.
+  FunctionDef my_func = FunctionDefHelper::Create(
+      "MyFunc", {"x:T", "y:T"}, {"z1:T", "z2:T"}, {"T: {float, double}"},
+      {{{"mul1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"mul2"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z1", "mul1:z:0"}, {"z2", "mul2:z:0"}});
+  (*my_func.mutable_attr())["_noinline"].set_b(true);
+
+  // Tensorflow graph:
+  //
+  //   a = tf.Placeholder(tf.float);
+  //   b = tf.Placeholder(tf.int32);
+  //
+  //   fn1 = MyFunc(a, b);
+  //   fn2 = MyFunc(a, b);
+  //
+  // Fetch: fn1:0 and fn2:1 via Identity nodes.
+  GrapplerItem item;
+  item.id = "tf_graph";
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       // Calls into function library
+       NDef("fn1", "MyFunc", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("fn2", "MyFunc", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+       // Read outputs of function call nodes
+       NDef("out_fn1", "Identity", {"fn1:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_fn2", "Identity", {"fn2:1"}, {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {my_func});
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  FunctionLibraryDefinition optimized_flib(OpRegistry::Global(),
+                                           output.library());
+
+  // Specialized and optimized functions should be added to the graph.
+  EXPECT_EQ(2, optimized_flib.num_functions());
+
+  // Expected names of the specialized and optimized functions.
+  const string optimized_fn1 = "MyFunc_specialized_for_fn1_at_tf_graph";
+  const string optimized_fn2 = "MyFunc_specialized_for_fn2_at_tf_graph";
+
+  const FunctionDef* optimized_func_fn1 = optimized_flib.Find(optimized_fn1);
+  const FunctionDef* optimized_func_fn2 = optimized_flib.Find(optimized_fn2);
+
+  ASSERT_NE(optimized_func_fn1, nullptr);
+  ASSERT_NE(optimized_func_fn2, nullptr);
+
+  // Graph should call optimized function.
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "fn1" && ++count) {
+      EXPECT_EQ(optimized_fn1, node.op());
+    } else if (node.name() == "fn2" && ++count) {
+      EXPECT_EQ(optimized_fn2, node.op());
+    }
+  }
+  EXPECT_EQ(2, count);
+
+  // Specialized MyFuncs should have just one Mul node and single output arg.
+
+  // 1. Specialized for fn1:0.
+  ASSERT_EQ(1, optimized_func_fn1->node_def_size());
+  EXPECT_EQ(1, optimized_func_fn1->signature().output_arg_size());
+  EXPECT_EQ("z1", optimized_func_fn1->signature().output_arg(0).name());
+  EXPECT_EQ("mul1", optimized_func_fn1->node_def(0).name());
+
+  // 2. Specialized for fn2:1.
+  ASSERT_EQ(1, optimized_func_fn2->node_def_size());
+  EXPECT_EQ(1, optimized_func_fn2->signature().output_arg_size());
+  EXPECT_EQ("z2", optimized_func_fn2->signature().output_arg(0).name());
+  EXPECT_EQ("mul2", optimized_func_fn2->node_def(0).name());
+
+  // Verify that output tensors are equal.
+  item.fetch = {"out_fn1", "out_fn2"};
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
+  item.feed.emplace_back("b", test::AsScalar<float>(3.123f));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorEqual<float>(tensors_expected[1], tensors[1]);
+}
+
 TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
   using test::function::NDef;
   using FDH = FunctionDefHelper;
@@ -390,12 +521,15 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
       &allowed_optimizations);
 
   // Just record properties of optimized Grappler items.
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
 
   // Define simple function library with two identical mul functions.
   FunctionDef mul_func_1 = FunctionDefHelper::Create(
@@ -461,6 +595,71 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
   EXPECT_FALSE(allowed_optimizations_my_mul_2->non_differentiable_rewrites);
 }
 
+class SleepingOptimizer : public CustomGraphOptimizer {
+ public:
+  SleepingOptimizer() {}
+  string name() const override { return "test_optimizer"; }
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override {
+    *optimized_graph = item.graph;
+    optimized_graph->add_node();
+    sleep(1);
+    return Status::OK();
+  }
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override {}
+};
+
+REGISTER_GRAPH_OPTIMIZER(SleepingOptimizer);
+
+TEST_F(MetaOptimizerTest, OptimizerTimesOut) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  ConfigProto config;
+  RewriterConfig& rewriter_config =
+      *config.mutable_graph_options()->mutable_rewrite_options();
+  rewriter_config.add_optimizers("SleepingOptimizer");
+  rewriter_config.set_min_graph_nodes(-1);
+  rewriter_config.set_meta_optimizer_timeout_ms(1500);
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+
+  GraphDef output;
+  const Status status =
+      RunMetaOptimizer(item, config, nullptr, nullptr, &output);
+  EXPECT_EQ(status.error_message(), "meta_optimizer exceeded deadline.");
+  // Make sure the graph was reverted to the original regardless of when the
+  // optimizer timed out.
+  CompareGraphs(item.graph, output);
+}
+
+TEST_F(MetaOptimizerTest, OptimizerDoesNotTimeOut) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  ConfigProto config;
+  RewriterConfig& rewriter_config =
+      *config.mutable_graph_options()->mutable_rewrite_options();
+  rewriter_config.add_optimizers("SleepingOptimizer");
+  rewriter_config.set_min_graph_nodes(-1);
+  rewriter_config.set_meta_optimizer_timeout_ms(1500);
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::ONE);
+  GraphDef output;
+  const Status status =
+      RunMetaOptimizer(item, config, nullptr, nullptr, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(item.graph.node_size() + 1, output.node_size());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index 36eab4999d0ff3a608a991eeb920c4159d911ad1..c548c570e07499ae326ca57ec83ea1b5738fdaf6 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -14,38 +14,410 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
+
 #include <unordered_set>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
-bool IsTrivialOp(const NodeDef& node, const GraphRewriter& rewriter) {
+bool IsTrivialIdentity(const NodeDef& node,
+                       const MutableGraphView& graph_view) {
+  for (const auto input :
+       graph_view.GetFanins(node, /*include_controlling_nodes=*/true)) {
+    if (input.port_id == Graph::kControlSlot) {
+      // Node is driven by control dependency.
+      return false;
+    } else if (IsSwitch(*input.node)) {  // Node is driven by switch.
+      return false;
+    }
+  }
+  for (const auto output :
+       graph_view.GetFanouts(node, /*include_controlled_nodes=*/true)) {
+    if (output.port_id == Graph::kControlSlot) {
+      // Node drives control dependency.
+      return false;
+    } else if (IsMerge(*output.node)) {  // Node feeds merge.
+      return false;
+    }
+  }
+  return true;
+}
+
+bool IsTrivialOp(const NodeDef& node, const MutableGraphView& graph_view) {
   // Remove the stop gradient nodes since they serve no purpose once the graph
   // is built. Also remove Identity ops.
   if (IsStopGradient(node)) {
     return true;
   }
-  if (IsIdentity(node)) {
-    if (rewriter.FeedsMerge(node) || rewriter.IsDrivenBySwitch(node) ||
-        rewriter.IsDrivenByControlDependency(node) ||
-        rewriter.DrivesControlDependency(node)) {
+  if (IsIdentity(node) || IsIdentityNSingleInput(node)) {
+    return IsTrivialIdentity(node, graph_view);
+  }
+
+  return IsAddN(node) && NumNonControlInputs(node) <= 1;
+}
+
+bool RemovalIncreasesEdgeCount(const NodeDef& node,
+                               const MutableGraphView& graph_view) {
+  int in_degree =
+      graph_view.NumFanins(node, /*include_controlling_nodes=*/true);
+  int out_degree =
+      graph_view.NumFanouts(node, /*include_controlling_nodes=*/true);
+  return in_degree * out_degree > in_degree + out_degree;
+}
+
+bool IsOutputPortRefValue(const NodeDef& node, int port_id,
+                          const OpRegistryInterface& op_registry) {
+  const OpRegistrationData* op_reg_data = nullptr;
+  Status s = op_registry.LookUp(node.op(), &op_reg_data);
+  if (s.ok()) {
+    DataType output_type;
+    s = OutputTypeForNode(node, op_reg_data->op_def, port_id, &output_type);
+    if (s.ok() && IsRefType(output_type)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool CanRemoveNode(const NodeDef& node, const MutableGraphView& graph_view,
+                   const absl::flat_hash_set<string>& function_names,
+                   const OpRegistryInterface& op_registry) {
+  if (RemovalIncreasesEdgeCount(node, graph_view)) {
+    return false;
+  }
+  for (const auto input :
+       graph_view.GetFanins(node, /*include_controlling_nodes=*/true)) {
+    if (node.device() != input.node->device()) {
+      // Node is driven by a different device.
+      return false;
+    } else if (input.port_id == Graph::kControlSlot) {
+      // Node is driven by control dependency.
+      continue;
+    } else if (function_names.find(input.node->op()) != function_names.end()) {
+      // Node input is a function call.
+      return false;
+    } else if (IsOutputPortRefValue(*input.node, input.port_id, op_registry)) {
+      return false;
+    }
+  }
+  for (const auto output :
+       graph_view.GetFanouts(node, /*include_controlled_nodes=*/false)) {
+    if (function_names.find(output.node->op()) != function_names.end()) {
+      // Node output is a function call.
       return false;
+    }
+  }
+  return true;
+}
+
+void ForwardInputsInternal(
+    const NodeDef& node,
+    const absl::flat_hash_set<const NodeDef*>& nodes_to_delete,
+    bool add_as_control, NodeDef* new_node,
+    const absl::flat_hash_map<string, const NodeDef*>& optimized_nodes,
+    const MutableGraphView& graph_view) {
+  // To speed things up, use the optimized version of the node if
+  // available.
+  auto itr = optimized_nodes.find(node.name());
+  if (itr != optimized_nodes.end()) {
+    for (const string& input : itr->second->input()) {
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
+    }
+    return;
+  }
+  for (const auto& input : node.input()) {
+    const NodeDef* input_node = graph_view.GetNode(NodeName(input));
+    if (input_node == nullptr) {
+      // Invalid input, preserve it as is.
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
+      continue;
+    }
+    if (nodes_to_delete.find(input_node) != nodes_to_delete.end()) {
+      ForwardInputsInternal(*input_node, nodes_to_delete,
+                            add_as_control || IsControlInput(input), new_node,
+                            optimized_nodes, graph_view);
     } else {
-      return true;
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
     }
   }
-  if (IsAddN(node) && NumNonControlInputs(node) <= 1) {
-    return true;
+}
+
+void ForwardInputs(const NodeDef& original_node,
+                   const absl::flat_hash_set<const NodeDef*>& nodes_to_delete,
+                   NodeDef* new_node,
+                   absl::flat_hash_map<string, const NodeDef*>* optimized_nodes,
+                   const MutableGraphView& graph_view) {
+  // Forwards inputs of nodes to be deleted to their respective outputs.
+  ForwardInputsInternal(original_node, nodes_to_delete,
+                        /*add_as_control=*/false, new_node, *optimized_nodes,
+                        graph_view);
+  if (!new_node->name().empty()) {
+    (*optimized_nodes)[new_node->name()] = new_node;
   }
+  // Reorder inputs such that control inputs come after regular inputs.
+  int pos = 0;
+  for (int i = 0; i < new_node->input_size(); ++i) {
+    if (!IsControlInput(new_node->input(i))) {
+      new_node->mutable_input()->SwapElements(pos, i);
+      ++pos;
+    }
+  }
+  DedupControlInputs(new_node);
+}
 
-  return false;
+absl::flat_hash_map<string, absl::flat_hash_set<int>> IdentityNTerminalPorts(
+    const NodeMap& node_map, const std::vector<string>& terminal_nodes,
+    int graph_size) {
+  // Determines which ports for IdentityN nodes (that can be rewritten) lead to
+  // a terminal node.
+  std::vector<string> to_visit;
+  to_visit.reserve(graph_size);
+  // Set terminal nodes as visited so terminal nodes that may be IdentityN don't
+  // get pruned later on.
+  absl::flat_hash_set<string> visited(terminal_nodes.begin(),
+                                      terminal_nodes.end());
+  for (string terminal_node : terminal_nodes) {
+    NodeDef* node = node_map.GetNode(terminal_node);
+    if (node == nullptr) {
+      continue;
+    }
+    for (string input : node->input()) {
+      to_visit.push_back(input);
+    }
+  }
+
+  absl::flat_hash_set<string> identity_n_fanouts;
+  while (!to_visit.empty()) {
+    string curr = to_visit.back();
+    to_visit.pop_back();
+    NodeDef* curr_node = node_map.GetNode(curr);
+    if (curr_node == nullptr ||
+        visited.find(curr_node->name()) != visited.end()) {
+      continue;
+    }
+    // For IdentityN nodes, only traverse up through the port that comes from a
+    // terminal node along with control inputs. The IdentityN node is not marked
+    // as visited so other node input traversals can go through the other ports
+    // of the IdentityN node.
+    if (IsIdentityN(*curr_node)) {
+      if (identity_n_fanouts.find(curr) == identity_n_fanouts.end()) {
+        identity_n_fanouts.emplace(curr);
+        int pos = NodePositionIfSameNode(curr, curr_node->name());
+        if (pos >= 0) {
+          to_visit.push_back(curr_node->input(pos));
+        }
+        for (const string& input : curr_node->input()) {
+          if (IsControlInput(input) &&
+              identity_n_fanouts.find(input) == identity_n_fanouts.end()) {
+            to_visit.push_back(input);
+          }
+        }
+      }
+    } else {
+      for (const string& input : curr_node->input()) {
+        to_visit.push_back(input);
+      }
+      visited.emplace(curr_node->name());
+    }
+  }
+
+  absl::flat_hash_map<string, absl::flat_hash_set<int>> identity_n_ports;
+  for (const auto& fanout : identity_n_fanouts) {
+    int pos;
+    string node_name = ParseNodeName(fanout, &pos);
+    if (node_name.empty() || pos < 0) {  // Exclude control inputs.
+      continue;
+    }
+    if (identity_n_ports.find(node_name) == identity_n_ports.end()) {
+      identity_n_ports[node_name] = {pos};
+    } else {
+      identity_n_ports[node_name].emplace(pos);
+    }
+  }
+
+  return identity_n_ports;
+}
+
+string NewIdentityFromIdentityN(int pos, const NodeDef& identity_n,
+                                GraphDef* graph, NodeMap* node_map) {
+  // TODO(lyandy): Migrate over to GrapplerOptimizerStage and use
+  // OptimizedNodeName for new node name.
+  string new_node_name =
+      strings::StrCat(identity_n.name(), "-", pos, "-grappler-ModelPruner");
+  if (node_map->NodeExists(new_node_name)) {
+    return "";
+  }
+  NodeDef* new_node = graph->add_node();
+  Status status = NodeDefBuilder(new_node_name, "Identity")
+                      .Input(identity_n.input(pos), 0,
+                             identity_n.attr().at("T").list().type(pos))
+                      .Device(identity_n.device())
+                      .Finalize(new_node);
+  if (!status.ok()) {
+    return "";
+  }
+  node_map->AddNode(new_node->name(), new_node);
+  node_map->AddOutput(NodeName(new_node->input(0)), new_node->name());
+  return new_node->name();
+}
+
+Status RewriteIdentityNAndInputsOutputs(
+    NodeDef* node, int num_non_control_inputs,
+    const absl::flat_hash_set<int>& terminal_ports, GraphDef* graph,
+    NodeMap* node_map) {
+  // Rewrite IdentityN node and associated inputs and outputs. For inputs and
+  // outputs that don't lead to a terminal node, a new Identity node is created
+  // and those inputs and outputs are rewritten to use the new Identity node as
+  // their outputs and inputs respectively. For the remaining nodes, the ouputs
+  // have their inputs updated with the adjusted port, from the IdentityN node
+  // having less inputs.
+  struct NodeOutputUpdate {
+    string input;
+    string output;
+  };
+
+  absl::flat_hash_map<int, int> terminal_input_pos;
+  absl::flat_hash_map<int, string> new_identities;
+  int new_idx = 0;
+  for (int i = 0; i < num_non_control_inputs; i++) {
+    if (terminal_ports.find(i) != terminal_ports.end()) {
+      terminal_input_pos[i] = new_idx++;
+    } else {
+      string identity = NewIdentityFromIdentityN(i, *node, graph, node_map);
+      if (identity.empty()) {
+        // Fail early when creating Identity from IdentityN errors.
+        return errors::Internal(
+            "Could not create Identity node from IdentityN node ", node->name(),
+            " at port ", i);
+      }
+      new_identities[i] = identity;
+    }
+  }
+
+  std::vector<NodeOutputUpdate> updates;
+  for (NodeDef* output : node_map->GetOutputs(node->name())) {
+    for (int i = 0; i < output->input_size(); i++) {
+      string input = output->input(i);
+      if (IsControlInput(input)) {
+        continue;
+      }
+      TensorId input_tensor = ParseTensorName(input);
+      if (input_tensor.node() == node->name()) {
+        if (terminal_ports.find(input_tensor.index()) == terminal_ports.end()) {
+          // Replace input that does not lead to a terminal node with newly
+          // created identity.
+          string new_identity = new_identities[input_tensor.index()];
+          output->set_input(i, new_identity);
+          updates.push_back({new_identity, output->name()});
+        } else {
+          // Update input ports that lead to a terminal node from splitting
+          // inputs.
+          int new_pos = terminal_input_pos[input_tensor.index()];
+          string updated_input_name =
+              new_pos > 0 ? strings::StrCat(node->name(), ":", new_pos)
+                          : node->name();
+          output->set_input(i, updated_input_name);
+        }
+      }
+    }
+  }
+
+  for (NodeOutputUpdate update : updates) {
+    node_map->AddOutput(update.input, update.output);
+  }
+
+  // Update inputs and types by removing inputs that were split away from
+  // main IdentityN node.
+  const int num_inputs = node->input_size();
+  int curr_pos = 0;
+  auto mutable_inputs = node->mutable_input();
+  auto mutable_types =
+      node->mutable_attr()->at("T").mutable_list()->mutable_type();
+  for (int i = 0; i < num_non_control_inputs; i++) {
+    if (terminal_input_pos.find(i) != terminal_input_pos.end()) {
+      mutable_inputs->SwapElements(i, curr_pos);
+      mutable_types->SwapElements(i, curr_pos);
+      curr_pos++;
+    }
+  }
+  mutable_types->Truncate(curr_pos);
+  // Control inputs.
+  for (int i = num_non_control_inputs; i < num_inputs; i++) {
+    mutable_inputs->SwapElements(i, curr_pos++);
+  }
+  mutable_inputs->DeleteSubrange(curr_pos, num_inputs - curr_pos);
+
+  return Status::OK();
+}
+
+Status SplitIdentityNInputs(GraphDef* graph,
+                            const std::vector<string>& terminal_nodes,
+                            bool* updated_graph) {
+  // For inputs of IdentityN nodes that do not lead to a terminal node, remove
+  // them from IdentityN and create new individual Identity nodes. This will
+  // allow ModelPruner to possibly remove nodes in the transitive fanin of the
+  // newly created Identity nodes.
+  NodeMap node_map(graph);
+
+  for (auto const& terminal :
+       IdentityNTerminalPorts(node_map, terminal_nodes, graph->node_size())) {
+    NodeDef* node = node_map.GetNode(terminal.first);
+    if (node == nullptr) {
+      continue;
+    }
+
+    const int num_non_control_inputs = NumNonControlInputs(*node);
+    if (node->attr().count("T") == 0 ||
+        node->attr().at("T").list().type_size() != num_non_control_inputs ||
+        terminal.second.size() >= num_non_control_inputs) {
+      continue;
+    }
+
+    TF_RETURN_IF_ERROR(RewriteIdentityNAndInputsOutputs(
+        node, num_non_control_inputs, terminal.second, graph, &node_map));
+    *updated_graph = true;
+  }
+
+  return Status::OK();
+}
+
+Status SetTransitiveFaninGraph(const GraphDef& input_graph,
+                               GraphDef* output_graph,
+                               const std::vector<string>& terminal_nodes) {
+  // Determines transitive fanin nodes from terminal nodes and add them to the
+  // output graph.
+  bool ill_formed = false;
+  std::vector<const NodeDef*> keep =
+      ComputeTransitiveFanin(input_graph, terminal_nodes, &ill_formed);
+  if (ill_formed) {
+    // Some graph edges are invalid, or some of the feeds/fetch don't exist:
+    // let's be conservative and preserve the graph as is.
+    return errors::InvalidArgument("Invalid input graph.");
+  }
+  // Try to keep the nodes ordered somewhat topologically since this helps
+  // further optimizations perform better.
+  output_graph->mutable_node()->Reserve(keep.size());
+  for (int i = keep.size() - 1; i >= 0; --i) {
+    *output_graph->add_node() = *keep[i];
+  }
+
+  return Status::OK();
 }
 
 Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
@@ -60,30 +432,32 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
     std::vector<string> terminal_nodes(nodes_to_preserve.begin(),
                                        nodes_to_preserve.end());
     std::sort(terminal_nodes.begin(), terminal_nodes.end());
-    bool ill_formed = false;
-    std::vector<const NodeDef*> keep =
-        ComputeTransitiveFanin(item.graph, terminal_nodes, &ill_formed);
-    if (ill_formed) {
-      // Some graph edges are invalid, or some of the feeds/fetch don't exist:
-      // let's be conservative and preserve the graph as is.
-      return errors::InvalidArgument("Invalid input graph.");
-    }
-    // Try to keep the nodes ordered somewhat topologically since this helps
-    // further optimizations perform better.
-    runnable_item.graph.mutable_node()->Reserve(keep.size());
-    for (int i = keep.size() - 1; i >= 0; --i) {
-      *runnable_item.graph.add_node() = *keep[i];
+    TF_RETURN_IF_ERROR(SetTransitiveFaninGraph(item.graph, &runnable_item.graph,
+                                               terminal_nodes));
+    bool did_split_identity_n = false;
+    TF_RETURN_IF_ERROR(SplitIdentityNInputs(
+        &runnable_item.graph, terminal_nodes, &did_split_identity_n));
+    if (did_split_identity_n) {
+      GraphDef fanin_split_identity_n_graph;
+      TF_RETURN_IF_ERROR(SetTransitiveFaninGraph(
+          runnable_item.graph, &fanin_split_identity_n_graph, terminal_nodes));
+      runnable_item.graph.Swap(&fanin_split_identity_n_graph);
     }
   } else {
     runnable_item = item;
   }
 
-  GraphRewriter rewriter(runnable_item);
+  MutableGraphView graph_view(&runnable_item.graph);
+  absl::flat_hash_set<string> function_names;
+  for (const auto& function : item.graph.library().function()) {
+    function_names.insert(function.signature().name());
+  }
+  OpRegistryInterface* op_registry = OpRegistry::Global();
 
   // Check if we can further prune the graph, by removing the trivial ops.
-  std::unordered_set<const NodeDef*> nodes_to_delete;
+  absl::flat_hash_set<const NodeDef*> nodes_to_delete;
   for (auto& node : runnable_item.graph.node()) {
-    if (!IsTrivialOp(node, rewriter)) {
+    if (!IsTrivialOp(node, graph_view)) {
       continue;
     }
 
@@ -106,10 +480,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
     //   converting references to non-references. It is important to preserve
     //   these non-references since the partitioner will avoid sending
     //   non-references across partitions more than once.
-    if (!rewriter.RemovalIncreasesEdgeCount(node) &&
-        !rewriter.IsConnectedToFunction(node) &&
-        !rewriter.IsDrivenByAnotherDevice(node) &&
-        !rewriter.ReceivesRefValue(node)) {
+    if (CanRemoveNode(node, graph_view, function_names, *op_registry)) {
       nodes_to_delete.insert(&node);
     }
   }
@@ -125,13 +496,15 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   const bool fetches_are_known = !item.fetch.empty();
   pruned_graph->mutable_node()->Reserve(runnable_item.graph.node_size());
+  absl::flat_hash_map<string, const NodeDef*> optimized_nodes;
   for (auto& node : runnable_item.graph.node()) {
     if (!fetches_are_known ||
         nodes_to_delete.find(&node) == nodes_to_delete.end()) {
       NodeDef* new_node = pruned_graph->add_node();
       *new_node = node;
       new_node->clear_input();
-      rewriter.ForwardInputs(node, nodes_to_delete, new_node);
+      ForwardInputs(node, nodes_to_delete, new_node, &optimized_nodes,
+                    graph_view);
     }
   }
   VLOG(1) << "Pruned " << nodes_to_delete.size()
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index cf5b990377f7c6a1b7206260bf8a11bc7788e30a..b988e1bfee137f63a867f6c6cbfc103396eda1d2 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -143,6 +144,113 @@ TEST_F(ModelPrunerTest, IdentityPruning) {
   test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
+TEST_F(ModelPrunerTest, IdentityNInputPruning) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Const(s.WithOpName("a"), 2.0f, {10, 10});
+  Output b = ops::Sqrt(s.WithOpName("b"), {a});
+  Output c = ops::Const(s.WithOpName("c"), 3.0f, {10, 10});
+  Output d = ops::Const(s.WithOpName("d"), 4.0f, {10, 10});
+  auto e =
+      ops::IdentityN(s.WithOpName("e").WithControlDependencies(d), {a, b, c});
+  auto f = ops::IdentityN(s.WithOpName("f"), {e[2], e[1], e[0]});
+  Output g = ops::Sqrt(s.WithOpName("g"), {f[1]});
+  Output h = ops::Sqrt(s.WithOpName("h"), {f[2]});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  item.fetch = {"g", "h"};
+  ModelPruner pruner;
+  GraphDef output;
+  Status status = pruner.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(7, output.node_size());
+  const NodeDef& new_g = output.node(0);
+  EXPECT_EQ("g", new_g.name());
+  const NodeDef& new_a = output.node(1);
+  EXPECT_EQ("a", new_a.name());
+  const NodeDef& new_b = output.node(2);
+  EXPECT_EQ("b", new_b.name());
+  const NodeDef& new_d = output.node(3);
+  EXPECT_EQ("d", new_d.name());
+  const NodeDef& new_e = output.node(4);
+  EXPECT_EQ("e", new_e.name());
+  const NodeDef& new_f = output.node(5);
+  EXPECT_EQ("f", new_f.name());
+  const NodeDef& new_h = output.node(6);
+  EXPECT_EQ("h", new_h.name());
+
+  // Node "c" is pruned along with inputs leading to "c".
+  EXPECT_EQ(3, new_e.input_size());
+  EXPECT_EQ("a", new_e.input(0));
+  EXPECT_EQ("b", new_e.input(1));
+  EXPECT_EQ("^d", new_e.input(2));
+  EXPECT_EQ(2, new_f.input_size());
+  EXPECT_EQ("e:1", new_f.input(0));
+  EXPECT_EQ("e", new_f.input(1));
+  EXPECT_EQ(1, new_g.input_size());
+  EXPECT_EQ("f", new_g.input(0));
+  EXPECT_EQ(1, new_h.input_size());
+  EXPECT_EQ("f:1", new_h.input(0));
+
+  auto expected_tensors = EvaluateNodes(item.graph, item.fetch);
+  auto actual_tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(2, expected_tensors.size());
+  EXPECT_EQ(2, actual_tensors.size());
+  for (int i = 0; i < expected_tensors.size(); i++) {
+    test::ExpectTensorEqual<float>(expected_tensors[i], actual_tensors[i]);
+  }
+}
+
+TEST_F(ModelPrunerTest, IdentityNInputPruningWithIdentityNInFetch) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a = ops::Const(s.WithOpName("a"), 2.0f, {10, 10});
+  Output b = ops::Sqrt(s.WithOpName("b"), {a});
+  Output c = ops::Const(s.WithOpName("c"), 3.0f, {10, 10});
+  Output d = ops::Const(s.WithOpName("d"), 4.0f, {10, 10});
+  auto e =
+      ops::IdentityN(s.WithOpName("e").WithControlDependencies(d), {a, b, c});
+  auto f = ops::IdentityN(s.WithOpName("f"), {e[0], e[1], e[2]});
+  auto g = ops::IdentityN(s.WithOpName("g"), {f[1]});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  item.fetch = {"g"};
+  ModelPruner pruner;
+  GraphDef output;
+  Status status = pruner.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  const NodeDef& new_a = output.node(0);
+  EXPECT_EQ("a", new_a.name());
+  const NodeDef& new_b = output.node(1);
+  EXPECT_EQ("b", new_b.name());
+  const NodeDef& new_d = output.node(2);
+  EXPECT_EQ("d", new_d.name());
+  const NodeDef& new_e = output.node(3);
+  EXPECT_EQ("e", new_e.name());
+  const NodeDef& new_g = output.node(4);
+  EXPECT_EQ("g", new_g.name());
+
+  EXPECT_EQ(2, new_e.input_size());
+  EXPECT_EQ("b", new_e.input(0));
+  EXPECT_EQ("^d", new_e.input(1));
+  EXPECT_EQ(1, new_g.input_size());
+  // Single output IdentityN (node "f") was pruned.
+  EXPECT_EQ("e", new_g.input(0));
+
+  auto expected_tensors = EvaluateNodes(item.graph, item.fetch);
+  auto actual_tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
+}
+
 TEST_F(ModelPrunerTest, NoOpPruning) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
index 8ed4271fa4eb9198ef8537247f5948f559889fff..9845fb08d5060445b994b1c998ddd600a842e155 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.cc
@@ -113,7 +113,7 @@ Status IsNodeOutputPortHostFriendly(const GraphView& graph,
 
   // These nodes may be optimized away downstream (even if pinned to Host), we
   // should (recusively) check their source.
-  if (IsIdentity(node)) {
+  if (IsIdentity(node) || IsIdentityNSingleInput(node)) {
     for (const auto& fanin : graph.GetFanins(node, false)) {
       bool fanin_candidate = false;
       TF_RETURN_IF_ERROR(IsNodeOutputPortHostFriendly(
@@ -259,6 +259,8 @@ Status IsNodeHostCandidate(const GraphView& graph, GraphProperties* properties,
   return Status::OK();
 }
 
+// Tries to find a Host device from `devices`. Returns empty string if no
+// matching Host device is found.
 string TryFindHostDevice(const gtl::FlatSet<string>& devices,
                          bool has_device_cpu, const string& device) {
   // Force this node onto the CPU.
@@ -280,8 +282,8 @@ string TryFindHostDevice(const gtl::FlatSet<string>& devices,
     }
   }
 
-  // We couldn't find an appropriate Host device, return original device.
-  return device;
+  // We couldn't find an appropriate Host device, return no device.
+  return "";
 }
 
 bool IsTPUGraphDef(const GraphDef& def) {
@@ -325,6 +327,7 @@ Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   std::vector<std::pair<NodeDef*, string>> const_nodes;
 
   for (auto& node : *optimized_graph->mutable_node()) {
+    GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
     bool is_candidate = false;
     TF_RETURN_IF_ERROR(
         internal::IsNodeHostCandidate(graph, &properties, node, &is_candidate));
@@ -332,16 +335,20 @@ Status PinToHostOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       continue;
     }
 
-    if (IsConstant(node)) {
-      const_nodes.emplace_back(&node, node.device());
+    string device =
+        internal::TryFindHostDevice(devices, has_device_cpu, node.device());
+    if (!device.empty()) {
+      // Keep track of all Const nodes that we swapped.
+      if (IsConstant(node)) {
+        const_nodes.emplace_back(&node, node.device());
+      }
+      *node.mutable_device() = std::move(device);
     }
-    // Try and swap the device to Host.
-    node.set_device(
-        internal::TryFindHostDevice(devices, has_device_cpu, node.device()));
   }
 
   // Traverse all `const_nodes`, and map them back to GPU greedily.
   for (auto& it : const_nodes) {
+    GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
     NodeDef* node = it.first;
     const string& device = it.second;
 
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
index 7c64529441816158868801f8702d9969204c4172..3f7ff678ed4ff5e41b3b253be6d23d07ad6dedef 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer_test.cc
@@ -28,30 +28,40 @@ namespace {
 
 class PinToHostOptimizerTest : public GrapplerTest {};
 
-TEST_F(PinToHostOptimizerTest, TryFindHostDevice) {
+TEST_F(PinToHostOptimizerTest, TryFindHostDeviceNoDevices) {
   gtl::FlatSet<string> devices = {};
-  EXPECT_EQ("ABC", internal::TryFindHostDevice(devices, false, "ABC"));
 
-  devices = {"/device:CPU:0", "/device:XLA_GPU:0"};
+  EXPECT_EQ(internal::TryFindHostDevice(devices, false, "ABC"), "");
+}
+
+TEST_F(PinToHostOptimizerTest, TryFindHostDeviceCpuXlaGpu) {
+  gtl::FlatSet<string> devices = {"/device:CPU:0", "/device:XLA_GPU:0"};
+
   EXPECT_EQ(internal::TryFindHostDevice(devices, true, ""), "/device:CPU:0");
   EXPECT_EQ(internal::TryFindHostDevice(devices, true, "/device:XLA_GPU:0"),
             "/device:CPU:0");
   EXPECT_EQ(internal::TryFindHostDevice(devices, true, "/device:XLA_GPU:*"),
             "/device:CPU:0");
+}
+
+TEST_F(PinToHostOptimizerTest, TryFindHostDeviceXlaCpuXlaGpu) {
+  gtl::FlatSet<string> devices = {"/device:XLA_CPU:0", "/device:XLA_GPU:0"};
 
-  devices = {"/device:XLA_CPU:0", "/device:XLA_GPU:0"};
   EXPECT_EQ(internal::TryFindHostDevice(devices, false, ""), "");
   EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:0"),
             "/device:XLA_CPU:0");
   EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:*"),
             "/device:XLA_CPU:0");
+}
+
+TEST_F(PinToHostOptimizerTest, TryFindHostDeviceXlaGpu) {
+  gtl::FlatSet<string> devices = {"/device:XLA_GPU:0"};
 
-  devices = {"/device:XLA_GPU:0"};
   EXPECT_EQ(internal::TryFindHostDevice(devices, false, ""), "");
   EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:0"),
-            "/device:XLA_GPU:0");
+            "");
   EXPECT_EQ(internal::TryFindHostDevice(devices, false, "/device:XLA_GPU:*"),
-            "/device:XLA_GPU:*");
+            "");
 }
 
 TEST_F(PinToHostOptimizerTest, OptimizeSmallOpsToHost) {
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index 9ada8b7ff9546b097a1bd347c31dcfb8470d36c7..3fb3f2b0ec75d1a628445a2f5e4d58e7a498c893 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/remapper.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/graph_view.h"
@@ -22,19 +23,498 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace grappler {
 
-void AddBatchNormNodes(GraphDef* optimized_graph, const NodeDef& fused_node) {
+namespace {
+
+constexpr char kFusedConv2D[] = "_FusedConv2D";
+
+constexpr char kDataFormat[] = "data_format";
+constexpr char kIsTraining[] = "is_training";
+
+// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+// contractions with non-default contraction output kernels.
+bool EigenSupportsContractionOutputKernel() {
+#if defined(EIGEN_USE_LIBXSMM)
+  return false;
+#endif
+  return true;
+}
+
+struct RemapperContext {
+  explicit RemapperContext(const GrapplerItem& item)
+      : nodes_to_preserve(item.NodesToPreserve()),
+        graph_view(&item.graph),
+        graph_properties(item),
+        inferred_graph_properties(false) {}
+
+  std::unordered_set<string> nodes_to_preserve;
+  GraphView graph_view;
+  GraphProperties graph_properties;
+  bool inferred_graph_properties;
+};
+
+// FusedBatchNorm that can be replaced with a cheaper set of primitives.
+struct FusedBatchNorm {
+  const NodeDef* fused_batch_norm = nullptr;
+};
+
+// Conv2D node followed by a BiasAdd.
+struct Conv2DWithBiasAdd {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* bias_add = nullptr;
+};
+
+// Conv2D node followed by a BiasAdd and Relu.
+struct Conv2DWithBiasAddAndRelu {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* bias_add = nullptr;
+  const NodeDef* relu = nullptr;
+};
+
+// Conv2D node followed by a Squeeze and BiasAdd.
+struct Conv2DWithSqueezeAndBiasAdd {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* squeeze = nullptr;
+  const NodeDef* bias_add = nullptr;
+};
+
+// Conv2D node followed by a FusedBatchNorm.
+struct Conv2DWithBatchNorm {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* fused_batch_norm = nullptr;
+  float epsilon = 0.0;
+};
+
+// Conv2D node followed by a FusedBatchNorm and Relu.
+struct Conv2DWithBatchNormAndRelu {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* fused_batch_norm = nullptr;
+  const NodeDef* relu = nullptr;
+  float epsilon = 0.0;
+};
+
+bool IsFloatOrDoubleDataType(const NodeDef* node,
+                             const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == DT_FLOAT || dtype == DT_DOUBLE;
+}
+
+bool HaveSameDataType(const NodeDef* lhs, const NodeDef* rhs,
+                      const string& type_attr = "T") {
+  DataType lhs_attr = GetDataTypeFromAttr(*lhs, type_attr);
+  DataType rhs_attr = GetDataTypeFromAttr(*rhs, type_attr);
+
+  return lhs_attr != DT_INVALID && rhs_attr != DT_INVALID &&
+         lhs_attr == rhs_attr;
+}
+
+bool HasDataType(const NodeDef* node, const DataType& expected,
+                 const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == expected;
+}
+
+bool IsInPreserveSet(const RemapperContext& ctx, const NodeDef* node) {
+  return ctx.nodes_to_preserve.count(node->name()) > 0;
+}
+
+bool FindConv2DWithBias(const RemapperContext& ctx, const NodeDef* node,
+                        Conv2DWithBiasAdd* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a BiasAdd.
+  if (!node) return false;
+  if (!IsBiasAdd(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // Input to the BiasAdd must be a Conv2D in NHWC format.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
+  if (!conv2d.node) return false;
+  if (!IsConv2D(*conv2d.node)) return false;
+  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
+  if (!NodeIsOnCpu(conv2d.node)) return false;
+  if (!HaveSameDataType(node, conv2d.node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
+  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  // We successfully found a Conv2D+BiasAdd pattern.
+  matched->conv2d = conv2d.node;
+  matched->bias_add = node;
+
+  return true;
+}
+
+bool FindConv2DWithBiasAndRelu(const RemapperContext& ctx, const NodeDef* node,
+                               Conv2DWithBiasAddAndRelu* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a Relu.
+  if (!node) return false;
+  if (!IsRelu(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // And input to Relu must match Conv2DWithBiasAdd pattern.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto bias_add = ctx.graph_view.GetRegularFanin(input_port);
+
+  Conv2DWithBiasAdd base;
+  if (!FindConv2DWithBias(ctx, bias_add.node, &base)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, base.bias_add)) return false;
+  if (!HaveSameDataType(node, base.bias_add)) return false;
+  if (IsInPreserveSet(ctx, base.bias_add)) return false;
+
+  // We successfully found a Conv2D+BiasAdd+Relu pattern.
+  matched->conv2d = base.conv2d;
+  matched->bias_add = base.bias_add;
+  matched->relu = node;
+
+  return true;
+}
+
+bool FindConv2DWithSqueezeAndBias(const RemapperContext& ctx,
+                                  const NodeDef* node,
+                                  Conv2DWithSqueezeAndBiasAdd* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a BiasAdd.
+  if (node == nullptr) return false;
+  if (node->op() != "BiasAdd") return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // Input to the BiasAdd must be a Squeeze.
+  const auto bias_input_port = GraphView::InputPort(node, 0);
+  const auto squeeze = ctx.graph_view.GetRegularFanin(bias_input_port);
+  if (squeeze.node == nullptr) return false;
+  if (squeeze.node->op() != "Squeeze") return false;
+  if (!NodeIsOnCpu(squeeze.node)) return false;
+  if (!HaveSameDataType(node, squeeze.node, "T")) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, squeeze.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, squeeze.node)) return false;
+  if (IsInPreserveSet(ctx, squeeze.node)) return false;
+
+  // Squeeze must not squeeze output channel dimension.
+  std::vector<int32> dims;
+  if (!GetNodeAttr(*squeeze.node, "squeeze_dims", &dims).ok()) return false;
+  for (auto dim : dims) {
+    if (dim == 3) return false;
+  }
+
+  // Input to the Squeeze must be a Conv2D in NHWC format.
+  const auto squeeze_input_port = GraphView::InputPort(squeeze.node, 0);
+  const auto conv2d = ctx.graph_view.GetRegularFanin(squeeze_input_port);
+  if (conv2d.node == nullptr) return false;
+  if (conv2d.node->op() != "Conv2D") return false;
+  if (conv2d.node->attr().at("data_format").s() != "NHWC") return false;
+  if (!NodeIsOnCpu(conv2d.node)) return false;
+  if (!HaveSameDataType(node, conv2d.node, "T")) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
+  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  // We successfully found a Conv2D+Squeeze+BiasAdd pattern.
+  matched->conv2d = conv2d.node;
+  matched->squeeze = squeeze.node;
+  matched->bias_add = node;
+
+  return true;
+}
+
+bool FindConv2DWithBatchNorm(const RemapperContext& ctx, const NodeDef* node,
+                             Conv2DWithBatchNorm* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a FusedBatchNorm or a FusedBatchNormV2.
+  if (node == nullptr) return false;
+  if (!IsFusedBatchNorm(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!HasDataType(node, DT_FLOAT)) return false;
+
+  // V2 has a separate data type for the scale/offset/mean/variance inputs.
+  if (node->op() == "FusedBatchNormV2" && !HasDataType(node, DT_FLOAT, "U"))
+    return false;
+
+  // Check that batch normalization is in inference mode.
+  const auto& attr = node->attr();
+  if (attr.count(kIsTraining) > 0 && attr.at(kIsTraining).b()) return false;
+
+  // Check that only 0th output is consumed by other nodes.
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (HasFanouts(ctx.graph_view, node, 1)) return false;  // batch_mean
+  if (HasFanouts(ctx.graph_view, node, 2)) return false;  // batch_variance
+  if (HasFanouts(ctx.graph_view, node, 3)) return false;  // reserve_space_1
+  if (HasFanouts(ctx.graph_view, node, 4)) return false;  // reserve_space_2
+
+  // Input to the FusedBatchNorm must be a Conv2D in NHWC format.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
+  if (conv2d.node == nullptr) return false;
+  if (!IsConv2D(*conv2d.node)) return false;
+  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
+  if (!NodeIsOnCpu(conv2d.node)) return false;
+  if (!HaveSameDataType(node, conv2d.node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
+  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  // We successfully found a Conv2D+FusedBatchNorm pattern.
+  matched->conv2d = conv2d.node;
+  matched->fused_batch_norm = node;
+  if (!GetNodeAttr(*node, "epsilon", &matched->epsilon).ok()) return false;
+
+  return true;
+}
+
+bool FindConv2DWithBatchNormAndRelu(const RemapperContext& ctx,
+                                    const NodeDef* node,
+                                    Conv2DWithBatchNormAndRelu* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a Relu.
+  if (node == nullptr) return false;
+  if (!IsRelu(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // And input to Relu must match Conv2DWithBatchNorm pattern.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto batch_norm = ctx.graph_view.GetRegularFanin(input_port);
+
+  Conv2DWithBatchNorm base;
+  if (!FindConv2DWithBatchNorm(ctx, batch_norm.node, &base)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, base.fused_batch_norm)) return false;
+  if (!HaveSameDataType(node, base.fused_batch_norm)) return false;
+  if (IsInPreserveSet(ctx, base.fused_batch_norm)) return false;
+
+  // We successfully found a Conv2D+FusedBatchNorm+Relu pattern.
+  matched->conv2d = base.conv2d;
+  matched->fused_batch_norm = base.fused_batch_norm;
+  matched->relu = node;
+  matched->epsilon = base.epsilon;
+
+  return true;
+}
+
+// Check that given node meets some basic FusedBatchNorm optimization
+// preconditions. We use this check to lazily infer graph properties which is
+// rather expensive.
+bool IsFusedBatchNormCandidate(const NodeDef& node) {
+  if (!IsFusedBatchNorm(node)) return false;
+  if (GetDataTypeFromAttr(node, "T") != DT_FLOAT) return false;
+
+  // Check that the node is in inference mode.
+  const auto& attr = node.attr();
+  if (attr.count(kIsTraining) > 0 && attr.at(kIsTraining).b()) return false;
+
+  return true;
+}
+
+bool FindFusedBatchNorm(const RemapperContext& ctx, const NodeDef* node,
+                        FusedBatchNorm* matched) {
+  if (!IsFusedBatchNormCandidate(*node)) return false;
+
+  const auto& props = ctx.graph_properties.GetInputProperties(node->name());
+
+  // a. Scaling factor can be const folded:
+  //      scaling_factor = (variance + epsilon).rsqrt() * scale
+  bool const_scaling_factor =
+      props.size() == 5 &&     // [x, scale, offset, mean, variance]
+      props[1].has_value() &&  // scale
+      props[4].has_value();    // variance aka estimated variance
+
+  // b. Or input can be const folded into some other expression.
+  auto const_inputs = std::count_if(
+      props.begin(), props.end(),
+      [](const OpInfo::TensorProperties& props) { return props.has_value(); });
+
+  // TODO(bsteiner): use the cost model to compare the cost of fused batch
+  // norm against that of the optimized form.
+  bool can_remap = const_scaling_factor || const_inputs >= 4;
+  if (!can_remap) return false;
+
+  // The optimized version only generates the first output.
+  for (GraphView::Edge edge : ctx.graph_view.GetFanoutEdges(*node, false)) {
+    if (edge.src.port_id != 0) return false;
+  }
+
+  // We found a fused batch norm node that can be replaced with primitive ops.
+  matched->fused_batch_norm = node;
+  return true;
+}
+
+void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
+                          const std::vector<string>& fused_ops = {},
+                          int num_args = 1, float epsilon = 0.0) {
+  auto* attr = fused_conv2d->mutable_attr();
+  auto src_attr = conv2d->attr();
+
+  (*attr)["T"] = src_attr.at("T");
+  (*attr)["strides"] = src_attr.at("strides");
+  (*attr)["padding"] = src_attr.at("padding");
+  (*attr)["dilations"] = src_attr.at("dilations");
+  (*attr)["data_format"] = src_attr.at("data_format");
+
+  auto* fused_ops_attr = (*attr)["fused_ops"].mutable_list();
+  for (const string& fused_op : fused_ops) {
+    fused_ops_attr->add_s(fused_op);
+  }
+
+  SetAttrValue(num_args, &(*attr)["num_args"]);
+  // Required only for FusedBatchNorm.
+  SetAttrValue(epsilon, &(*attr)["epsilon"]);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBiasAdd& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BiasAdd: bias_add=" << matched.bias_add->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.bias_add->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.bias_add->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
+  fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+
+  invalidated_nodes->insert(matched.bias_add);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBiasAddAndRelu& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BiasAdd and Relu: relu=" << matched.relu->name()
+          << " bias_add=" << matched.bias_add->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.relu->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.relu->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
+  fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd", "Relu"});
+
+  invalidated_nodes->insert(matched.relu);
+  invalidated_nodes->insert(matched.bias_add);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithSqueezeAndBiasAdd& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with Squeeze and BiasAdd: "
+          << " bias_add=" << matched.bias_add->name()
+          << " squeeze=" << matched.squeeze->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  // Replace Conv2D node with a fused Conv2D. Matched pattern guarantees that it
+  // has single consumer (only the squeeze node).
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.conv2d->name());
+  fused_conv2d->set_op("_FusedConv2D");
+  fused_conv2d->set_device(matched.conv2d->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
+  fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+
+  // Replace BiasAdd node with a Squeeze.
+  NodeDef* remapped_squeeze = optimized_graph->add_node();
+  *remapped_squeeze = *matched.squeeze;
+  remapped_squeeze->set_name(matched.bias_add->name());
+  remapped_squeeze->set_input(0, fused_conv2d->name());
+
+  invalidated_nodes->insert(matched.squeeze);
+  invalidated_nodes->insert(matched.bias_add);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBatchNorm& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BatchNorm: batch_norm="
+          << matched.fused_batch_norm->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.fused_batch_norm->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
+  fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
+  fused_conv2d->add_input(matched.fused_batch_norm->input(2));  // 3: offset
+  fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
+  fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm"},
+                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+
+  invalidated_nodes->insert(matched.fused_batch_norm);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBatchNormAndRelu& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BatchNorm and Relu: relu="
+          << matched.relu->name()
+          << " batch_norm=" << matched.fused_batch_norm->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.relu->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
+  fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
+  fused_conv2d->add_input(matched.fused_batch_norm->input(2));  // 3: offset
+  fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
+  fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm", "Relu"},
+                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+
+  invalidated_nodes->insert(matched.relu);
+  invalidated_nodes->insert(matched.fused_batch_norm);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddBatchNormNodes(const FusedBatchNorm& matched,
+                       GraphDef* optimized_graph) {
+  const NodeDef& fused_node = *matched.fused_batch_norm;
+  VLOG(2) << "Optimizing fused batch norm node "
+          << SummarizeNodeDef(fused_node);
+
   const string& x = fused_node.input(0);
   string scale = fused_node.input(1);
   string offset = fused_node.input(2);
   string mean = fused_node.input(3);
   string variance = fused_node.input(4);
 
-  if (fused_node.attr().at("data_format").s() == "NCHW") {
+  if (fused_node.attr().at(kDataFormat).s() == "NCHW") {
     // Need to reshape the last 4 inputs
     NodeDef* new_shape = optimized_graph->add_node();
     new_shape->set_name(AddPrefixToNodeName("NCHWShape", fused_node.name()));
@@ -164,59 +644,94 @@ void AddBatchNormNodes(GraphDef* optimized_graph, const NodeDef& fused_node) {
   *r->add_input() = a->name();
   *r->add_input() = c->name();
 }
+}  // namespace
 
 Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
                           GraphDef* optimized_graph) {
-  GraphProperties properties(item);
-  bool inferred_properties = false;
-  GraphView graph(const_cast<GraphDef*>(&item.graph));
-
-  // During inference, most of the inputs to FusedBatchNorm are constant, and we
-  // can therefore replace the op with a much cheaper set of primitives.
-  optimized_graph->mutable_node()->Reserve(item.graph.node_size());
-  for (const NodeDef& node : item.graph.node()) {
-    if (node.op() == "FusedBatchNorm" || node.op() == "FusedBatchNormV2") {
-      bool optimizable = (node.attr().count("T") == 0 ||
-                          node.attr().at("T").type() == DT_FLOAT);
-      optimizable &= (node.attr().count("is_training") == 0 ||
-                      !node.attr().at("is_training").b());
-      if (optimizable) {
-        int const_inputs = 0;
-        if (!inferred_properties) {
-          // Infer properties lazily in case they are not needed.
-          TF_RETURN_IF_ERROR(properties.InferStatically(false));
-          inferred_properties = true;
-        }
-        const auto& props = properties.GetInputProperties(node.name());
-        for (const auto& prop : props) {
-          if (prop.has_value()) {
-            const_inputs += 1;
-          }
-        }
-        // TODO(bsteiner): use the cost model to compare the cost of fused batch
-        // norm against that of the optimized form.
-        optimizable = (const_inputs >= 4);
-      }
-      if (optimizable) {
-        for (GraphView::Edge edge : graph.GetFanoutEdges(node, false)) {
-          if (edge.src.port_id != 0) {
-            // The optimized version only generates the first output.
-            optimizable = false;
-            break;
-          }
-        }
-      }
-      if (optimizable) {
-        VLOG(1) << "Optimizing fused batch norm node " << node.DebugString();
-        AddBatchNormNodes(optimized_graph, node);
-        continue;
-      }
+  // Supported graph patterns.
+  // clang-format off
+  FusedBatchNorm              fused_batch_norm;
+  Conv2DWithBiasAdd           conv2d_with_bias;
+  Conv2DWithBiasAddAndRelu    conv2d_with_bias_and_relu;
+  Conv2DWithBatchNorm         conv2d_with_batch_norm;
+  Conv2DWithBatchNormAndRelu  conv2d_with_batch_norm_and_relu;
+  Conv2DWithSqueezeAndBiasAdd conv2d_with_squeeze_and_bias;
+  // clang-format on
+
+  // Processing graph in reverse-topological sorted order allows to remap
+  // longer chains of dependent ops in one pass.
+  GraphDef topo_sorted_graph = item.graph;
+  TF_RETURN_IF_ERROR(TopologicalSort(&topo_sorted_graph));
+  std::reverse(topo_sorted_graph.mutable_node()->begin(),
+               topo_sorted_graph.mutable_node()->end());
+
+  GrapplerItem topo_sorted_item = item.WithGraph(std::move(topo_sorted_graph));
+  RemapperContext ctx(topo_sorted_item);
+
+  // Skip nodes that were invalidated by a remapper, e.g. do not process BiasAdd
+  // and Relu nodes that were fused into a Conv2D node.
+  absl::flat_hash_set<const NodeDef*> invalidated_nodes;
+
+  optimized_graph->mutable_node()->Reserve(topo_sorted_item.graph.node_size());
+  for (const NodeDef& node : topo_sorted_item.graph.node()) {
+    // Check if node was invalidated by one of the previous remaps.
+    if (invalidated_nodes.count(&node) > 0) continue;
+
+    // Remap Conv2D+BiasAdd into the _FusedConv2D.
+    if (FindConv2DWithBias(ctx, &node, &conv2d_with_bias)) {
+      AddFusedConv2DNode(conv2d_with_bias, optimized_graph, &invalidated_nodes);
+      continue;
+    }
+
+    // Remap Conv2D+BiasAdd+Relu into the _FusedConv2D.
+    if (FindConv2DWithBiasAndRelu(ctx, &node, &conv2d_with_bias_and_relu)) {
+      AddFusedConv2DNode(conv2d_with_bias_and_relu, optimized_graph,
+                         &invalidated_nodes);
+      continue;
+    }
+
+    // Remap Conv2D+Squeeze+BiasAdd into the _FusedConv2D+Squeeze.
+    if (FindConv2DWithSqueezeAndBias(ctx, &node,
+                                     &conv2d_with_squeeze_and_bias)) {
+      AddFusedConv2DNode(conv2d_with_squeeze_and_bias, optimized_graph,
+                         &invalidated_nodes);
+      continue;
+    }
+
+    // Remap Conv2D+FusedBatchNorm into the _FusedConv2D;
+    if (FindConv2DWithBatchNorm(ctx, &node, &conv2d_with_batch_norm)) {
+      AddFusedConv2DNode(conv2d_with_batch_norm, optimized_graph,
+                         &invalidated_nodes);
+      continue;
     }
+
+    // Remap Conv2D+FusedBatchNorm+Relu into the _FusedConv2D;
+    if (FindConv2DWithBatchNormAndRelu(ctx, &node,
+                                       &conv2d_with_batch_norm_and_relu)) {
+      AddFusedConv2DNode(conv2d_with_batch_norm_and_relu, optimized_graph,
+                         &invalidated_nodes);
+      continue;
+    }
+
+    // Infer properties lazily in case they are not needed.
+    if (!ctx.inferred_graph_properties && IsFusedBatchNormCandidate(node)) {
+      TF_RETURN_IF_ERROR(ctx.graph_properties.InferStatically(false));
+      ctx.inferred_graph_properties = true;
+    }
+
+    // During inference, most of the inputs to FusedBatchNorm are constant, and
+    // we can therefore replace the op with a much cheaper set of primitives.
+    if (FindFusedBatchNorm(ctx, &node, &fused_batch_norm)) {
+      AddBatchNormNodes(fused_batch_norm, optimized_graph);
+      continue;
+    }
+
+    // If we didn't match a node to any pattern copy it to the optimized graph.
     *optimized_graph->add_node() = node;
   }
 
-  *optimized_graph->mutable_library() = item.graph.library();
-  *optimized_graph->mutable_versions() = item.graph.versions();
+  *optimized_graph->mutable_library() = topo_sorted_item.graph.library();
+  *optimized_graph->mutable_versions() = topo_sorted_item.graph.versions();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc
index 4cbf0d8d6f11ea77cfd3973e3ff0c109d48c0273..ffc242decc70e8947547fbe9ca25909625381887 100644
--- a/tensorflow/core/grappler/optimizers/remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/remapper_test.cc
@@ -24,7 +24,17 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-class RemapperTest : public GrapplerTest {};
+class RemapperTest : public GrapplerTest {
+ protected:
+  // TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+  // contractions with non-default contraction output kernels.
+  bool EigenSupportsContractionOutputKernel() {
+#if defined(EIGEN_USE_LIBXSMM)
+    return false;
+#endif
+    return true;
+  }
+};
 
 TEST_F(RemapperTest, FusedBatchNorm) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -91,5 +101,361 @@ TEST_F(RemapperTest, FusedBatchNormNCHW) {
   }
 }
 
+TEST_F(RemapperTest, FuseConv2DWithBias) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ::tensorflow::ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto bias_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), bias_add);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto bias_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "bias_add") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(1, node.attr().at("num_args").i());
+      EXPECT_EQ("bias", node.input(2));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      EXPECT_EQ(1, fused_ops.size());
+      EXPECT_EQ("BiasAdd", fused_ops[0]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithBiasAndRelu) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ::tensorflow::ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = Placeholder::Shape({1, 1, 3, 128});
+  auto bias_shape = Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), conv, bias);
+  auto relu = ops::Relu(s.WithOpName("relu"), bias_add);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), relu);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto bias_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "relu") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(1, node.attr().at("num_args").i());
+      EXPECT_EQ("bias", node.input(2));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      ASSERT_EQ(2, fused_ops.size());
+      EXPECT_EQ("BiasAdd", fused_ops[0]);
+      EXPECT_EQ("Relu", fused_ops[1]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithBatchNorm) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto scale_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT, scale_shape);
+  auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT, scale_shape);
+  auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT, scale_shape);
+  auto variance = Placeholder(s.WithOpName("variance"), DT_FLOAT, scale_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  ops::FusedBatchNorm::Attrs attrs;
+  attrs = attrs.IsTraining(false);
+  auto batch_norm = ops::FusedBatchNorm(s.WithOpName("batch_norm"), conv, scale,
+                                        offset, mean, variance, attrs);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), batch_norm.y);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto scale_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto offset_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto mean_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto variance_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t},
+               {"scale", scale_t}, {"offset", offset_t},
+               {"mean", mean_t},   {"variance", variance_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "batch_norm") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(4, node.attr().at("num_args").i());
+      EXPECT_EQ("scale", node.input(2));
+      EXPECT_EQ("offset", node.input(3));
+      EXPECT_EQ("mean", node.input(4));
+      EXPECT_EQ("variance", node.input(5));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      EXPECT_EQ(1, fused_ops.size());
+      EXPECT_EQ("FusedBatchNorm", fused_ops[0]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithBatchNormAndRelu) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto scale_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT, scale_shape);
+  auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT, scale_shape);
+  auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT, scale_shape);
+  auto variance = Placeholder(s.WithOpName("variance"), DT_FLOAT, scale_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  ops::FusedBatchNorm::Attrs attrs;
+  attrs = attrs.IsTraining(false);
+  auto batch_norm = ops::FusedBatchNorm(s.WithOpName("batch_norm"), conv, scale,
+                                        offset, mean, variance, attrs);
+  auto relu = ops::Relu(s.WithOpName("relu"), batch_norm.y);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), relu);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto scale_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto offset_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto mean_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto variance_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t},
+               {"scale", scale_t}, {"offset", offset_t},
+               {"mean", mean_t},   {"variance", variance_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "relu") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(4, node.attr().at("num_args").i());
+      EXPECT_EQ("scale", node.input(2));
+      EXPECT_EQ("offset", node.input(3));
+      EXPECT_EQ("mean", node.input(4));
+      EXPECT_EQ("variance", node.input(5));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      EXPECT_EQ(2, fused_ops.size());
+      EXPECT_EQ("FusedBatchNorm", fused_ops[0]);
+      EXPECT_EQ("Relu", fused_ops[1]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithSqueezeAndBias) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 1, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto bias_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+
+  ops::Squeeze::Attrs attrs;
+  attrs = attrs.Axis({2});
+  auto squeeze = ops::Squeeze(s.WithOpName("squeeze"), conv, attrs);
+
+  auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), squeeze, bias);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), bias_add);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 1, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto bias_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "conv") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(1, node.attr().at("num_args").i());
+      EXPECT_EQ("bias", node.input(2));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      ASSERT_EQ(1, fused_ops.size());
+      EXPECT_EQ("BiasAdd", fused_ops[0]);
+      found++;
+    } else if (node.name() == "bias_add") {
+      EXPECT_EQ("Squeeze", node.op());
+      EXPECT_EQ("conv", node.input(0));
+      found++;
+    }
+  }
+  EXPECT_EQ(2, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
index 0d4aaf646218f1a784878bd099e68f166dd0340b..e537b3df07deea17b1a53d1abf18be7bad3a6d23 100644
--- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
@@ -790,20 +790,17 @@ Tree* ComputeScopeTree(const string& op_name,
   return root;
 }
 
-void PartitionByLoopStructure(const FrameMap& frame_map,
+void PartitionByLoopStructure(const FrameView& frame_view,
                               std::vector<NodeDef*> nodes,
                               std::vector<std::vector<NodeDef*>>* loop_groups) {
   // It is assumed that two nodes with identical loop containment have
-  // identical integer vectors.  Represent those by 64 bit hashes.
+  // identical integer vectors. Represent those by 64 bit hashes.
   std::unordered_map<uint64, std::vector<NodeDef*>> loop_sets;
   for (NodeDef* nd : nodes) {
     uint64 hash = 0;
-    const auto& it = frame_map.find(nd);
-    if (it != frame_map.end()) {
-      const std::vector<int>& loop_ids = it->second;
-      for (int id : loop_ids) {
-        hash = Hash64Combine(hash, static_cast<uint64>(id));
-      }
+    const std::vector<int>& loop_ids = frame_view.Frames(*nd);
+    for (int id : loop_ids) {
+      hash = Hash64Combine(hash, static_cast<uint64>(id));
     }
     loop_sets[hash].push_back(nd);
   }
@@ -821,10 +818,11 @@ Status ScopedAllocatorOptimizer::ProcessGraphDef(
   GraphOpOccurrences occ;
   FindOpOccurrences(graph, op_name_set_, &occ);
   if (!occ.empty()) {
-    FrameMap frame_map;
-    int num_frames;
-    LOG_WARNING_AND_RETURN_IF_ERROR(
-        IdentifyFramesWithNodeMap(*graph, *node_map_, &frame_map, &num_frames));
+    FrameView frame_view;
+    // TODO(ezhulenev): Pass a GraphView when this optimizer will be migrated
+    // from NodeMap.
+    LOG_WARNING_AND_RETURN_IF_ERROR(frame_view.InferFromGraph(*graph));
+
     for (auto& dt : occ) {
       VLOG(2) << "Processing device " << dt.first;
       const DevOpOccurrences& dev_occ = dt.second;
@@ -841,26 +839,26 @@ Status ScopedAllocatorOptimizer::ProcessGraphDef(
         // Nodes with a common depth and root path are now grouped
         // in the same Tree struct.  Split those groups into subgroups that
         // share identical loop nesting.
-        status = ApplyToAll(
-            root.get(), [this, rewriter, graph, &frame_map, &op_name](Tree* t) {
-              VLOG(2) << "applied to tree node " << t->edge_ << " at depth "
-                      << t->depth_ << " of size " << t->nodes_.size();
-              if (t->nodes_.size() > 1) {
-                std::vector<std::vector<NodeDef*>> loop_groups;
-                PartitionByLoopStructure(frame_map, t->nodes_, &loop_groups);
-                for (auto& lg : loop_groups) {
-                  if (lg.size() > 1) {
-                    bool applied = false;
-                    Status s = OrderNodeSet(&lg);
-                    TF_RETURN_IF_ERROR(s);
-                    VLOG(1) << "Applying Rewriter for " << op_name;
-                    s = rewriter->Rewrite(this, graph, op_name, lg, &applied);
-                    LOG_WARNING_AND_RETURN_IF_ERROR(s);
-                  }
-                }
+        status = ApplyToAll(root.get(), [this, rewriter, graph, &frame_view,
+                                         &op_name](Tree* t) {
+          VLOG(2) << "applied to tree node " << t->edge_ << " at depth "
+                  << t->depth_ << " of size " << t->nodes_.size();
+          if (t->nodes_.size() > 1) {
+            std::vector<std::vector<NodeDef*>> loop_groups;
+            PartitionByLoopStructure(frame_view, t->nodes_, &loop_groups);
+            for (auto& lg : loop_groups) {
+              if (lg.size() > 1) {
+                bool applied = false;
+                Status s = OrderNodeSet(&lg);
+                TF_RETURN_IF_ERROR(s);
+                VLOG(1) << "Applying Rewriter for " << op_name;
+                s = rewriter->Rewrite(this, graph, op_name, lg, &applied);
+                LOG_WARNING_AND_RETURN_IF_ERROR(s);
               }
-              return Status::OK();
-            });
+            }
+          }
+          return Status::OK();
+        });
         if (!status.ok()) {
           break;
         }
diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.cc b/tensorflow/core/grappler/optimizers/shape_optimizer.cc
index 6ccb1cd783d82e54e637241184633b6337f23939..7dae0e3cd9ef8a0b3b16a164fa77aa89353d3989 100644
--- a/tensorflow/core/grappler/optimizers/shape_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/shape_optimizer.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/symbolic_shapes.h"
@@ -34,7 +34,7 @@ Status ShapeOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   GraphProperties properties(item);
   bool inferred_properties = false;
-  GraphView graph(optimized_graph);
+  MutableGraphView graph(optimized_graph);
 
   // The product of all the dimensions in a tensor shape can be expressed more
   // simply as the size of the tensor.
@@ -42,8 +42,8 @@ Status ShapeOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (!IsShape(node)) {
       continue;
     }
-    for (GraphView::InputPort fanout :
-         graph.GetFanout(GraphView::OutputPort(&node, 0))) {
+    for (MutableGraphView::InputPort fanout :
+         graph.GetFanout(MutableGraphView::OutputPort(&node, 0))) {
       if (fanout.node->op() != "Prod") {
         continue;
       }
@@ -53,8 +53,8 @@ Status ShapeOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
         // rewrite the whole expression directly as a Size operation.
         continue;
       }
-      const GraphView::OutputPort reduce_indices =
-          graph.GetRegularFanin(GraphView::InputPort(fanout.node, 1));
+      const MutableGraphView::OutputPort reduce_indices =
+          graph.GetRegularFanin(MutableGraphView::InputPort(fanout.node, 1));
       if (!inferred_properties) {
         // Infer properties lazily in case they are not needed.
         TF_RETURN_IF_ERROR(properties.InferStatically(false));
@@ -90,10 +90,10 @@ Status ShapeOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     // is possible whenever the symbolic dimensions in the numerator and
     // denominator cancel each other.
     if (node.op() == "Div") {
-      const GraphView::OutputPort input1 =
-          graph.GetRegularFanin(GraphView::InputPort(&node, 0));
-      const GraphView::OutputPort input2 =
-          graph.GetRegularFanin(GraphView::InputPort(&node, 1));
+      const MutableGraphView::OutputPort input1 =
+          graph.GetRegularFanin(MutableGraphView::InputPort(&node, 0));
+      const MutableGraphView::OutputPort input2 =
+          graph.GetRegularFanin(MutableGraphView::InputPort(&node, 1));
       if (!IsSize(*input1.node) || !IsSize(*input2.node)) {
         continue;
       }
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 5867d013242f4d7b8a8c2a2223439470ac6f0fa1..29775442629dd5a56776f2d0005f9ba50c2da84b 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -22,14 +22,17 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/notification.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -37,8 +40,8 @@ namespace {
 template <typename T>
 bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
   using RealType = typename Eigen::NumTraits<T>::Real;
-  if (value > static_cast<double>(std::numeric_limits<RealType>::max()) ||
-      value < static_cast<double>(std::numeric_limits<RealType>::min())) {
+  if (value > static_cast<double>(Eigen::NumTraits<RealType>::highest()) ||
+      value < static_cast<double>(Eigen::NumTraits<RealType>::lowest())) {
     return false;
   }
   tensor->flat<T>()(0) = static_cast<T>(value);
@@ -142,20 +145,18 @@ void NodeMap::UpdateOutput(const string& node_name,
 }
 
 bool IsSameInput(const string& name1, const string& name2) {
-  if (name1 == name2) {
-    return true;
-  }
-  int position1;
-  StringPiece node1 = ParseNodeNameAsStringPiece(name1, &position1);
-  int position2;
-  StringPiece node2 = ParseNodeNameAsStringPiece(name2, &position2);
-  return (position1 == position2) && (node1 == node2);
+  if (name1 == name2) return true;
+  TensorId tensor1 = ParseTensorName(name1);
+  TensorId tensor2 = ParseTensorName(name2);
+  return tensor1.node() == tensor2.node() && tensor1.index() == tensor2.index();
 }
 
 bool IsControlInput(const string& name) {
   return !name.empty() && name[0] == '^';
 }
 
+bool IsControlInput(const TensorId& tensor_id) { return tensor_id.index() < 0; }
+
 string AddPrefixToNodeName(const string& name, const string& prefix,
                            const string& delimiter) {
   if (!name.empty()) {
@@ -197,6 +198,12 @@ string AsControlDependency(const string& node_name) {
              : strings::StrCat("^", node_name);
 }
 
+bool NodeIsOnCpu(const NodeDef* node) {
+  string task, device;
+  return DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
+         str_util::StartsWith(device, DEVICE_CPU);
+}
+
 int NumOutputs(const NodeDef& node, GraphDef* graph) {
   int num_outputs = 0;
   const OpDef* op_def = nullptr;
@@ -242,7 +249,6 @@ int NumNonControlInputs(const NodeDef& node) {
 
 int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
   int num_outputs = 0;
-  int pos;
   for (const NodeDef* output : node_map.GetOutputs(node.name())) {
     for (const string& node_as_input : output->input()) {
       if (IsControlInput(node_as_input)) {
@@ -251,9 +257,8 @@ int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
       if (node_as_input == node.name()) {
         ++num_outputs;
       } else {
-        const StringPiece name =
-            ParseNodeNameAsStringPiece(node_as_input, &pos);
-        if (name == node.name()) {
+        const TensorId tensor = ParseTensorName(node_as_input);
+        if (tensor.node() == node.name()) {
           ++num_outputs;
         }
       }
@@ -280,11 +285,11 @@ int NumNonControlDataOutputs(const NodeDef& node, const NodeMap& node_map) {
 
 // Returns the data type in attribute `attr_name` of `node`. If that attribute
 // doesn't exist, returns DT_INVALID.
-DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name) {
-  if (!node.attr().count(attr_name)) {
+DataType GetDataTypeFromAttr(const NodeDef& node, const string& type_attr) {
+  if (!node.attr().count(type_attr)) {
     return DT_INVALID;
   }
-  const auto& attr = node.attr().at(attr_name);
+  const auto& attr = node.attr().at(type_attr);
   if (attr.value_case() != AttrValue::kType) {
     return DT_INVALID;
   }
@@ -547,5 +552,29 @@ Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
 
 #undef HANDLE_CASE
 
+Status CheckAttrExists(const NodeDef& node, const string& key) {
+  if (!HasNodeAttr(node, key)) {
+    return errors::InvalidArgument("Node '", node.name(), "' lacks '", key,
+                                   "' attr: ", node.ShortDebugString());
+  }
+  return Status::OK();
+}
+
+Status CheckAttrsExist(const NodeDef& node, absl::Span<const string> keys) {
+  for (const string& key : keys) {
+    TF_RETURN_IF_ERROR(CheckAttrExists(node, key));
+  }
+  return Status::OK();
+}
+
+Status IsKernelRegisteredForNode(const NodeDef& node) {
+  DeviceNameUtils::ParsedName parsed_name;
+  if (!DeviceNameUtils::ParseFullName(node.device(), &parsed_name)) {
+    return errors::InvalidArgument("Could not parse device name: ",
+                                   node.device());
+  }
+  return FindKernelDef(DeviceType(parsed_name.type), node, nullptr, nullptr);
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 95126d470c6aa3a787614448c722cc8e414f82ed..b1e2d4e9cb5bbe15508695595de4e00f7313c401 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -17,18 +17,24 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_H_
 
 #include <functional>
-#include <unordered_map>
+#include <iterator>
+#include <set>
 #include <unordered_set>
+#include <utility>
 #include <vector>
-
+#include "absl/types/span.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -57,8 +63,8 @@ class NodeMap {
 
  private:
   const std::set<NodeDef*> empty_set_;
-  std::unordered_map<string, NodeDef*> nodes_;
-  std::unordered_map<string, std::set<NodeDef*>> outputs_;
+  gtl::FlatMap<string, NodeDef*> nodes_;
+  gtl::FlatMap<string, std::set<NodeDef*>> outputs_;
 };
 
 // A vector with a set. The set stores the same elements as the vector, and
@@ -90,7 +96,7 @@ class SetVector {
   void Reserve(int64 size) { vector_.reserve(size); }
 
  private:
-  std::unordered_set<T, Hash> set_;
+  gtl::FlatSet<T, Hash> set_;
   std::vector<T> vector_;
 };
 
@@ -98,6 +104,9 @@ class SetVector {
 // the ^ character.
 bool IsControlInput(const string& name);
 
+// True iff tensor index refers to a control input.
+bool IsControlInput(const TensorId& tensor_id);
+
 // True iff 'name1' and 'name2' refer to the same input.
 bool IsSameInput(const string& name1, const string& name2);
 
@@ -160,6 +169,7 @@ inline string NodeName(const string& name) {
 }
 
 // Returns the node name and position in a single call.
+// DEPRECATED(ezhulenev): Use TensorId and ParseTensorName.
 inline StringPiece ParseNodeNameAsStringPiece(const string& name,
                                               int* position) {
   static const string empty;
@@ -190,6 +200,7 @@ inline StringPiece ParseNodeNameAsStringPiece(const string& name,
 }
 
 // Returns the node name and position in a single call.
+// DEPRECATED(ezhulenev): Use SafeTensorId and ParseTensorName.
 inline string ParseNodeName(const string& name, int* position) {
   return string(ParseNodeNameAsStringPiece(name, position));
 }
@@ -224,6 +235,9 @@ string AsControlDependency(const NodeDef& node);
 // for control dependency, given a node name
 string AsControlDependency(const string& node);
 
+// Returns true if the node is assigned to run on CPU device.
+bool NodeIsOnCpu(const NodeDef* node);
+
 // Returns the number of outputs of a node according to its OpDef. Note that
 // some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node, GraphDef* graph);
@@ -244,9 +258,15 @@ int NumNonControlDataOutputs(const NodeDef& node, const NodeMap& node_map);
 // Removes redundant control inputs from node.
 void DedupControlInputs(NodeDef* node);
 
+// Returns an error if an attribute with the given key does not exist in node.
+Status CheckAttrExists(const NodeDef& node, const string& key);
+
+// Returns an error if attributes with the given keys do not exist in node.
+Status CheckAttrsExist(const NodeDef& node, absl::Span<const string> keys);
+
 // Returns the data type in attribute `attr_name` of `node`. If that attribute
 // doesn't exist, returns DT_INVALID.
-DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name);
+DataType GetDataTypeFromAttr(const NodeDef& node, const string& type_attr);
 
 // Returns the last node in the simple chain starting at source and traversing
 // through the input(0) edge from each node as long as the next node satisfies
@@ -265,6 +285,10 @@ NodeDef* GetTailOfChain(const NodeDef& source, const NodeMap& node_map,
 void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
                          bool invert_permutation);
 
+// Returns Status::OK() if a kernel is registered for node.op() on the device
+// type corresponding to node.device().
+Status IsKernelRegisteredForNode(const NodeDef& node);
+
 Status SetTensorValue(DataType dtype, int value, Tensor* tensor);
 
 void EraseNodesFromGraph(const std::set<int>& nodes_to_delete, GraphDef* graph);
@@ -331,7 +355,7 @@ class SimpleGraphView {
  private:
   const GraphDef* graph_;  // Not owned.
   std::vector<string> index_to_name_;
-  std::unordered_map<string, int> name_to_index_;
+  gtl::FlatMap<string, int> name_to_index_;
   std::vector<gtl::InlinedVector<int, 4>> inputs_;
   std::vector<gtl::InlinedVector<int, 2>> outputs_;
 };
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index bdbb8836e17dd9023b94ca236fd5a82ccb7ac87d..c0f19d3828ac1581a937531318ff62875fbf3bc7 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -74,8 +74,9 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -101,6 +102,8 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -170,6 +173,7 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/core/grappler/utils/frame.cc b/tensorflow/core/grappler/utils/frame.cc
index df5f4ff7cf38dbc7ab3038346cd4ea65031c8227..2484b35de06c74659c583c7d34d4881729e00f21 100644
--- a/tensorflow/core/grappler/utils/frame.cc
+++ b/tensorflow/core/grappler/utils/frame.cc
@@ -15,77 +15,128 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/utils/frame.h"
 #include <deque>
-#include <stack>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 namespace grappler {
 
-Status IdentifyFrames(const GraphDef& graph, FrameMap* frame_map,
-                      int* num_frames) {
-  NodeMap node_map(const_cast<GraphDef*>(&graph));
-  return IdentifyFramesWithNodeMap(graph, node_map, frame_map, num_frames);
-}
+namespace {}  // namespace
+
+Status FrameView::InferFromGraphView(const GraphView& graph_view) {
+  if (is_inferred_) {
+    return errors::Internal("FrameView was already inferred from the graph");
+  }
+  is_inferred_ = true;
+
+  std::deque<const NodeDef*> ready_nodes;
 
-Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
-                                 FrameMap* frame_map, int* num_frames) {
-  std::deque<std::pair<const NodeDef*, std::vector<int>>> ready_nodes;
-  for (const NodeDef& node : graph.node()) {
+  // All nodes without inputs are automatically added to the ready queue.
+  for (const NodeDef& node : graph_view.graph()->node()) {
     if (node.input_size() == 0) {
-      std::vector<int> empty;
-      ready_nodes.emplace_back(&node, empty);
-      (*frame_map)[&node] = empty;
+      ready_nodes.push_back(&node);
+      node_to_frames_[&node] = node_has_no_frames_;
     }
   }
-  std::map<string, int> name_to_id;
+
+  // We assign unique int id to each frame, and use this map to track what
+  // frames we've already seen in the graph.
+  absl::flat_hash_map<string, int> frame_name_to_id;
+
   while (!ready_nodes.empty()) {
-    auto ready_node = ready_nodes.front();
-    for (const auto& fanout : node_map.GetOutputs(ready_node.first->name())) {
-      if (frame_map->count(fanout) < 1) {
-        std::vector<int> frame_ids = ready_node.second;
-        if (IsExit(*ready_node.first)) {
+    const NodeDef* ready_node = ready_nodes.front();
+
+    absl::flat_hash_set<GraphView::InputPort> fanouts =
+        graph_view.GetFanouts(*ready_node, /*include_controlled_nodes=*/true);
+
+    for (const GraphView::InputPort& fanout : fanouts) {
+      if (node_to_frames_.count(fanout.node) < 1) {
+        // If we have never seen this node before, we add all frames from the
+        // incoming node (and pop/push frames if coming from Exit/Enter nodes).
+        std::vector<int> frame_ids = node_to_frames_[ready_node];
+
+        if (IsExit(*ready_node)) {
           frame_ids.pop_back();
         }
-        if (IsEnter(*fanout)) {
-          CHECK(fanout->attr().count("frame_name"))
-              << "Missing frame name for the Enter node " << fanout->name();
-          string name = fanout->attr().at("frame_name").s();
-          int id;
-          if (name_to_id.count(name)) {
-            id = name_to_id[name];
+
+        if (IsEnter(*fanout.node)) {
+          const AttrValue* frame_name_attr =
+              AttrSlice(*fanout.node).Find("frame_name");
+
+          if (!frame_name_attr) {
+            return errors::InvalidArgument(
+                "Missing frame name for the Enter node: ",
+                SummarizeNodeDef(*fanout.node));
+          }
+
+          absl::string_view frame_name = frame_name_attr->s();
+          int frame_id;
+
+          if (frame_name_to_id.count(frame_name)) {
+            frame_id = frame_name_to_id[frame_name];
           } else {
-            id = name_to_id.size();
-            name_to_id[name] = id;
+            frame_id = static_cast<int>(frame_name_to_id.size());
+            frame_name_to_id[frame_name] = frame_id;
           }
-          frame_ids.push_back(id);
+
+          frame_ids.push_back(frame_id);
         }
-        ready_nodes.emplace_back(fanout, frame_ids);
-        (*frame_map)[fanout] = frame_ids;
+
+        ready_nodes.push_back(fanout.node);
+        node_to_frames_[fanout.node] = std::move(frame_ids);
+
       } else {
-        auto frame_ids_fanout = (*frame_map)[fanout];
-        auto frame_ids_node = ready_node.second;
-        if (IsEnter(*fanout)) {
+        // If we've already seen this node before, we need to make sure that
+        // graph is correct and same nodes doesn't have incoming edges with
+        // conflicting frames (all inputs must be produces in the same frame).
+
+        std::vector<int> frame_ids_fanout = node_to_frames_[fanout.node];
+        std::vector<int> frame_ids_node = node_to_frames_[ready_node];
+
+        if (IsEnter(*fanout.node)) {
           frame_ids_fanout.pop_back();
         }
-        if (IsExit(*ready_node.first)) {
+        if (IsExit(*ready_node)) {
           frame_ids_node.pop_back();
         }
+
         if (frame_ids_node != frame_ids_fanout) {
           return errors::InvalidArgument(
-              "Invalid graph: Frame ids for node ", ready_node.first->name(),
-              " does not match frame ids for it's fanout.");
+              "Invalid graph: Frame ids for node ", ready_node->name(),
+              " does not match frame ids for it's fanout ",
+              fanout.node->name());
         }
       }
     }
+
     ready_nodes.pop_front();
   }
-  *num_frames = name_to_id.size();
+
+  num_frames_ = static_cast<int>(frame_name_to_id.size());
   return Status::OK();
 }
 
+Status FrameView::InferFromGraph(const GraphDef& graph) {
+  return InferFromGraphView(GraphView(&graph));
+}
+
+const std::vector<int>& FrameView::Frames(const NodeDef& node) const {
+  DCHECK(is_inferred_) << "FrameView is not initialized";
+  auto frames = node_to_frames_.find(&node);
+  if (frames == node_to_frames_.end()) {
+    LOG(WARNING) << "Node doesn't belong to the graph used for initialization";
+    return node_has_no_frames_;
+  } else {
+    return frames->second;
+  }
+}
+
+bool FrameView::IsInFrame(const NodeDef& node) const {
+  return !Frames(node).empty();
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/frame.h b/tensorflow/core/grappler/utils/frame.h
index 95b72748f4e1f13f1c61d64c4a457287e9d7d46b..04c6588275098a0a3f7110be7af4e2e9207b0ac2 100644
--- a/tensorflow/core/grappler/utils/frame.h
+++ b/tensorflow/core/grappler/utils/frame.h
@@ -17,25 +17,52 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
 
 #include <unordered_map>
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
-using FrameMap = std::unordered_map<const NodeDef*, std::vector<int>>;
+// FrameView is a helper class that allows to find in what execution frames (if
+// any) the given node can be running in. It's constructed from an immutable
+// GraphView, and any modification of the underlying graph might invalidate it.
+//
+// All execution frames assigned an unique integer id, but they do not have any
+// meaning whatsoever, it's just a sequence number.
+//
+// See the paper "Dynamic Control Flow in Large-Scale Machine Learning" for
+// detailed explanation of execution frames (https://arxiv.org/abs/1805.01772).
+class FrameView {
+ public:
+  FrameView() : is_inferred_(false), num_frames_(0) {}
 
-// Returns the number of frames present in the graph, and populates
-// the 'frames' argument with the collection of frames (denoted by their
-// frame ids) in the outermost-to-innermost order. Frame ids are arbitrary.
-Status IdentifyFrames(const GraphDef& graph, FrameMap* frame_map,
-                      int* num_frames);
+  // Infers nodes execution frames from the GraphView. Returns an error if
+  // called multiple times.
+  Status InferFromGraphView(const GraphView& graph_view);
+  // Infers nodes execution by constructing temporary GraphView and passing it
+  // to InferFromGraphView.
+  Status InferFromGraph(const GraphDef& graph);
 
-// As above, but use an existing NodeMap for graph instead of building it
-// from scratch.
-Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
-                                 FrameMap* frame_map, int* num_frames);
+  // Returns all frames of the given node (denoted by their frame ids) in
+  // outermost-to-innermost order.
+  const std::vector<int>& Frames(const NodeDef& node) const;
+
+  // Returns true iff the node is at least in one execution frame.
+  bool IsInFrame(const NodeDef& node) const;
+
+  int num_frames() const { return num_frames_; }
+  bool is_inferred() const { return is_inferred_; }
+
+ private:
+  bool is_inferred_;  // true if it was inferred from the graph
+  int num_frames_;    // number of frames present in a graph
+  absl::flat_hash_map<const NodeDef*, std::vector<int>> node_to_frames_;
+
+  // We return a reference to this vector if node has no frames.
+  const std::vector<int> node_has_no_frames_;
+};
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/frame_test.cc b/tensorflow/core/grappler/utils/frame_test.cc
index df76083fc3a0334172ac93998e0b549a2c723431..cc82e0ed3a39dd117e2197fa9a47fe2f3372051d 100644
--- a/tensorflow/core/grappler/utils/frame_test.cc
+++ b/tensorflow/core/grappler/utils/frame_test.cc
@@ -23,7 +23,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class IdentifyFramesTest : public ::testing::Test {
+class FrameViewTest : public ::testing::Test {
  protected:
   static NodeDef CreateNode(const string& name,
                             const std::vector<string>& inputs) {
@@ -53,19 +53,17 @@ class IdentifyFramesTest : public ::testing::Test {
   }
 };
 
-TEST_F(IdentifyFramesTest, NestedLoop) {
+TEST_F(FrameViewTest, NestedLoop) {
   GraphDef graph;
   // Create a two-level nested loop
   *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() =
-      CreateNode("1", "Enter", "map/while/while_context1", {"0"});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context1", {"0"});
   *graph.add_node() = CreateNode("2", {"1"});
   *graph.add_node() = CreateNode("3", "Merge", {"2", "14"});
   *graph.add_node() = CreateNode("4", {"3"});
   *graph.add_node() = CreateNode("5", "Switch", {"4"});
   *graph.add_node() = CreateNode("6", {"5"});
-  *graph.add_node() =
-      CreateNode("7", "Enter", "map/while/while_context2", {"6"});
+  *graph.add_node() = CreateNode("7", "Enter", "while/context2", {"6"});
   *graph.add_node() = CreateNode("8", {"7"});
   *graph.add_node() = CreateNode("9", "Merge", {"8", "12"});
   *graph.add_node() = CreateNode("10", {"9"});
@@ -77,118 +75,106 @@ TEST_F(IdentifyFramesTest, NestedLoop) {
   *graph.add_node() = CreateNode("16", "Exit", {"15"});
   *graph.add_node() = CreateNode("17", {"16"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}},      {"1", {0}},     {"2", {0}},     {"3", {0}},
       {"4", {0}},     {"5", {0}},     {"6", {0}},     {"7", {0, 1}},
       {"8", {0, 1}},  {"9", {0, 1}},  {"10", {0, 1}}, {"11", {0, 1}},
       {"12", {0, 1}}, {"13", {0, 1}}, {"14", {0}},    {"15", {0}},
       {"16", {0}},    {"17", {}}};
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 2);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
-TEST_F(IdentifyFramesTest, MultipleInputsToEnter) {
+TEST_F(FrameViewTest, MultipleInputsToEnter) {
   GraphDef graph;
   *graph.add_node() = CreateNode("0", {});
   *graph.add_node() = CreateNode("1", {});
-  *graph.add_node() =
-      CreateNode("2", "Enter", "map/while/while_context", {"0", "1"});
+  *graph.add_node() = CreateNode("2", "Enter", "while/context", {"0", "1"});
   *graph.add_node() = CreateNode("3", "Exit", {"2"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {}}, {"2", {0}}, {"3", {0}}};
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 1);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
-TEST_F(IdentifyFramesTest, ExitOutput) {
+TEST_F(FrameViewTest, ExitOutput) {
   GraphDef graph;
   *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() =
-      CreateNode("1", "Enter", "map/while/while_context", {"0"});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context", {"0"});
   *graph.add_node() = CreateNode("2", "Exit", {"1"});
   *graph.add_node() = CreateNode("3", {});
   *graph.add_node() = CreateNode("4", {"2", "3"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {0}}, {"2", {0}}, {"3", {}}, {"4", {}}};
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 1);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
-TEST_F(IdentifyFramesTest, MultipleEnterNodes) {
+TEST_F(FrameViewTest, MultipleEnterNodes) {
   GraphDef graph;
   *graph.add_node() = CreateNode("0", {});
-  string frame = "map/while/while_context";
-  *graph.add_node() = CreateNode("1", "Enter", frame, {"0"});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context", {"0"});
   *graph.add_node() = CreateNode("2", {"1"});
   *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", "Enter", frame, {"5"});
+  *graph.add_node() = CreateNode("4", "Enter", "while/context", {"5"});
   *graph.add_node() = CreateNode("3", {"4", "2"});
   *graph.add_node() = CreateNode("6", "Merge", {"3", "8"});
   *graph.add_node() = CreateNode("7", "Switch", {"6"});
   *graph.add_node() = CreateNode("8", "NextIteration", {"7"});
   *graph.add_node() = CreateNode("9", "Exit", {"7"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {0}}, {"2", {0}}, {"3", {0}}, {"4", {0}},
       {"5", {}}, {"6", {0}}, {"7", {0}}, {"8", {0}}, {"9", {0}}};
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 1);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
+TEST_F(FrameViewTest, ConflictingFrames) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("0", {});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context1", {"0"});
+  *graph.add_node() = CreateNode("2", "Enter", "while/context2", {"1"});
+  *graph.add_node() = CreateNode("3", {"1", "2"});
+
+  FrameView frame_view;
+  ASSERT_FALSE(frame_view.InferFromGraph(graph).ok());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 6861fb423c50a5d489a1a8d4eb0f77a64be63f2d..57863a71f35f176e3935e2121f5650a58c72d642 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -57,14 +58,14 @@ Status RegisterFunctionBodyOutputs(const FunctionLibraryDefinition& flib,
 // Replace the placeholder attribute values with the values specified in
 // instantiation attributes.
 Status ResolveFunctionBodyNodeAttrPlaceholders(
-    const AttrValueMap& func_instantiation_attr, NodeDef* node) {
+    const AttrSlice& func_instantiation_attr, NodeDef* node) {
   for (auto& attr : *node->mutable_attr()) {
     const string& placeholder = attr.second.placeholder();
     if (placeholder.empty()) continue;
 
-    auto it = func_instantiation_attr.find(placeholder);
-    if (it != func_instantiation_attr.end()) {
-      attr.second = it->second;
+    const AttrValue* attr_value = func_instantiation_attr.Find(placeholder);
+    if (attr_value) {
+      attr.second = *attr_value;
     } else {
       return errors::InvalidArgument("Can't resolve placeholder: ",
                                      placeholder);
@@ -75,6 +76,16 @@ Status ResolveFunctionBodyNodeAttrPlaceholders(
 
 }  // namespace
 
+FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
+    const FunctionLibraryDefinition& flib, const GraphDef& graph) {
+  return flib.ReachableDefinitions(graph);
+}
+
+FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
+    const FunctionLibraryDefinition& flib, const FunctionDef& func) {
+  return flib.ReachableDefinitions(func);
+}
+
 void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
     InputArgExpansion input_arg_expansion) {
   string input_name = input_arg_expansion.input_name;
@@ -83,7 +94,7 @@ void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
   for (int i = 0; i < placeholders.size(); ++i) {
     const string& placeholder = input_arg_expansion.placeholders[i];
     input_arg_placeholders_.insert(
-        {placeholder, InputArgPlaceholder{input_name, /*position=*/i}});
+        {placeholder, InputArgPlaceholder{input_name, /*input_position=*/i}});
   }
   input_arg_expansions_.insert(
       {std::move(input_name), std::move(input_arg_expansion)});
@@ -237,8 +248,8 @@ Status GrapplerFunctionConnectivity::AsFunctionDefInput(
     const InputArgPlaceholder* placeholder =
         FindOrNull(input_arg_placeholders_, node_name);
     if (placeholder != nullptr) {
-      *func_def_input =
-          strings::StrCat(placeholder->input_name, ":", placeholder->position);
+      *func_def_input = strings::StrCat(placeholder->input_name, ":",
+                                        placeholder->input_position);
       return Status::OK();
     }
   }
@@ -277,15 +288,15 @@ Status GrapplerFunctionConnectivity::AsFunctionDefNode(
 
 Status GrapplerFunctionItemInstantiation::GetTypeAttr(
     const string& type_attr_name, DataType* data_type) const {
-  auto it = func_instantiation_attr_->find(type_attr_name);
-  if (it == func_instantiation_attr_->end()) {
+  const AttrValue* type_attr = func_instantiation_attr_.Find(type_attr_name);
+  if (type_attr == nullptr) {
     return errors::InvalidArgument("Type attribute ", type_attr_name,
                                    " is not defined");
-  } else if (it->second.type() == DT_INVALID) {
+  } else if (type_attr->type() == DT_INVALID) {
     return errors::InvalidArgument("Type attribute ", type_attr_name,
                                    " is not defined with a valid type");
   } else {
-    *data_type = it->second.type();
+    *data_type = type_attr->type();
   }
   return Status::OK();
 }
@@ -307,7 +318,7 @@ Status GrapplerFunctionItemInstantiation::GetArgType(
 }
 
 GrapplerFunctionItem::GrapplerFunctionItem(
-    string func_name, string description, AttrValueMap func_attr,
+    string func_name, string description, AttrSlice func_attr,
     std::vector<InputArgExpansion> input_arg_expansions,
     std::vector<OutputArgExpansion> output_arg_expansions,
     std::vector<string> keep_nodes, const int graph_def_version,
@@ -336,12 +347,6 @@ GrapplerFunctionItem::GrapplerFunctionItem(
       fetch.push_back(output_tensor);
     }
   }
-  // Stateful and Send (it's not stateful) nodes must be preserved in the graph.
-  for (const NodeDef& node : graph.node()) {
-    if (IsSend(node)) {
-      keep_ops.push_back(node.name());
-    }
-  }
 }
 
 const string& GrapplerFunctionItem::description() const { return description_; }
@@ -375,9 +380,7 @@ const std::size_t GrapplerFunctionItem::output_size() const {
   return output_arg_expansions_.size();
 }
 
-const AttrValueMap& GrapplerFunctionItem::func_attr() const {
-  return func_attr_;
-}
+const AttrSlice& GrapplerFunctionItem::func_attr() const { return func_attr_; }
 
 const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
 
@@ -418,13 +421,13 @@ bool IsParametrized(const FunctionDef& func) {
 }
 
 Status InstantiationTypeParameters(
-    const FunctionDef& func, const AttrValueMap& func_instantiation_attr,
+    const FunctionDef& func, const AttrSlice& func_instantiation_attr,
     std::unordered_map<string, DataType>* type_parameters) {
   if (!type_parameters->empty()) {
     return errors::InvalidArgument("Type parameters output map must be empty");
   }
 
-  GrapplerFunctionItemInstantiation instantiation(&func_instantiation_attr);
+  GrapplerFunctionItemInstantiation instantiation(func_instantiation_attr);
 
   const auto resolve_type_attr = [&](const OpDef::ArgDef& arg) {
     // Check if it's unknown and unresolved type.
@@ -446,7 +449,7 @@ Status InstantiationTypeParameters(
 }
 
 Status InstantiationBodyParameters(
-    const FunctionDef& func, const AttrValueMap& func_instantiation_attr,
+    const FunctionDef& func, const AttrSlice& func_instantiation_attr,
     std::unordered_map<string, AttrValue>* body_parameters) {
   if (!body_parameters->empty()) {
     return errors::InvalidArgument("Body parameters output map must be empty");
@@ -461,9 +464,10 @@ Status InstantiationBodyParameters(
         continue;
       }
 
-      auto it = func_instantiation_attr.find(placeholder);
-      if (it != func_instantiation_attr.end()) {
-        body_parameters->insert({placeholder, it->second});
+      const AttrValue* placeholder_value =
+          func_instantiation_attr.Find(placeholder);
+      if (placeholder_value) {
+        body_parameters->insert({placeholder, *placeholder_value});
       } else {
         return errors::InvalidArgument("Can't resolve placeholder: ",
                                        placeholder);
@@ -475,7 +479,7 @@ Status InstantiationBodyParameters(
 }
 
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
-                                const AttrValueMap& func_instantiation_attr,
+                                const AttrSlice& func_instantiation_attr,
                                 const FunctionLibraryDefinition& flib,
                                 const int graph_def_version,
                                 GrapplerFunctionItem* item) {
@@ -495,15 +499,25 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
   }
 
   // Helper methods to lookup function instantiation attributes
-  GrapplerFunctionItemInstantiation instantiation(&func_instantiation_attr);
+  GrapplerFunctionItemInstantiation instantiation(func_instantiation_attr);
 
   // Mapping from FunctionDef input format (name[:output][:position]) to
   // GraphDef input format (name[:position])
   GrapplerFunctionConnectivity connectivity;
 
-  // Function body shares the library with the graph that instantiated it.
+  // Instantiate function body into a statically defined graph def.
   GraphDef function_body;
-  *function_body.mutable_library() = flib.ToProto();
+
+  // Function body shares the library with the graph that instantiated it. We do
+  // not need a full copy of the function library, just the reachable subset.
+  *function_body.mutable_library() =
+      ReachableFunctionLibraryDefinition(flib, func).ToProto();
+
+  VLOG(3) << absl::Substitute(
+      "Deleted $0 unreachable functions from the Grappler function item "
+      "instantiation of $1 (library size = $2)",
+      flib.num_functions() - function_body.library().function_size(),
+      signature.name(), function_body.library().function_size());
 
   // TODO(ezhulenev): support functions with tensor sequence inputs/outputs
 
@@ -541,7 +555,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
     InputArgExpansion input_expansion{/*input_name=*/input.name(),
                                       /*data_type=*/input_data_type,
-                                      /*is_ref*/ input.is_ref(),
+                                      /*is_ref=*/input.is_ref(),
                                       /*placeholders=*/{input.name()}};
     connectivity.RegisterInputArgExpansion(input_expansion);
     inputs.push_back(std::move(input_expansion));
@@ -564,8 +578,8 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     TF_RETURN_IF_ERROR(RegisterFunctionBodyOutputs(*registration, func_def_node,
                                                    &connectivity));
 
-    // Stateful and Send nodes must be preserved in a function body
-    if (registration->op_def.is_stateful() || IsSend(func_def_node)) {
+    // Ops with side effects must be preserved in a function body.
+    if (!IsFreeOfSideEffect(func_def_node)) {
       keep_nodes.push_back(func_def_node.name());
     }
   }
@@ -602,9 +616,9 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   *item = GrapplerFunctionItem(
       /*func_name=*/signature.name(), /*description=*/signature.description(),
-      /*func_attr=*/AttrValueMap(func.attr().begin(), func.attr().end()),
-      std::move(inputs), std::move(outputs), std::move(keep_nodes),
-      graph_def_version, is_stateful, std::move(function_body));
+      /*func_attr=*/AttrSlice(&func.attr()), std::move(inputs),
+      std::move(outputs), std::move(keep_nodes), graph_def_version, is_stateful,
+      std::move(function_body));
   return Status::OK();
 }
 
@@ -612,7 +626,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const FunctionLibraryDefinition& flib,
                                 const int graph_def_version,
                                 GrapplerFunctionItem* item) {
-  return MakeGrapplerFunctionItem(func, AttrValueMap(), flib, graph_def_version,
+  return MakeGrapplerFunctionItem(func, AttrSlice(), flib, graph_def_version,
                                   item);
 }
 
@@ -685,6 +699,47 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   return Status::OK();
 }
 
+Status RemoveUnusedOutputs(const gtl::FlatSet<int>& active_outputs,
+                           GrapplerFunctionItem* item,
+                           std::vector<std::pair<int, int>>* output_mapping) {
+  DCHECK(output_mapping->empty());
+
+  // Do some sanity checking of the active outputs positions.
+  for (int active_output : active_outputs) {
+    if (active_output < 0 || active_output >= item->output_size()) {
+      return errors::InvalidArgument(
+          "Active output position is out of bound: active_output=",
+          active_output, " num_output_args=", item->output_size());
+    }
+  }
+
+  gtl::FlatSet<const OutputArgExpansion*> unused_output_args;
+
+  const auto is_unused_output_arg = [&](const OutputArgExpansion& output) {
+    return unused_output_args.find(&output) != unused_output_args.end();
+  };
+
+  for (int i = 0; i < item->output_size(); ++i) {
+    const OutputArgExpansion& output = item->output(i);
+    DCHECK(output.output_tensors.size() == 1)
+        << "Output arg expansion must have single tensor";
+
+    if (active_outputs.find(i) == active_outputs.end()) {
+      VLOG(3) << "Remove unused output: output_name=" << output.output_name
+              << " output_position=" << i;
+      unused_output_args.insert(&output);
+    } else if (!unused_output_args.empty()) {
+      // Add output mapping only if output position changed.
+      output_mapping->push_back({i, i - unused_output_args.size()});
+    }
+  }
+
+  auto& o = item->output_arg_expansions_;
+  o.erase(std::remove_if(o.begin(), o.end(), is_unused_output_arg), o.end());
+
+  return Status::OK();
+}
+
 Status MakeFunctionDef(const GrapplerFunctionItem& item,
                        const FunctionLibraryDefinition& flib,
                        FunctionDef* func) {
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index ef944ced090ae7a38748706868eb2919d7fa4f83..038cf5f527e0f32cc10e123bb0cab357e5902463 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -25,11 +25,17 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 
 namespace tensorflow {
 namespace grappler {
 
-using AttrValueMap = std::unordered_map<string, AttrValue>;
+// Returns a copy of FunctionLibraryDefinition with subset of functions that are
+// reachable from the nodes of the graph.
+FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
+    const FunctionLibraryDefinition& flib, const GraphDef& graph);
+FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
+    const FunctionLibraryDefinition& flib, const FunctionDef& func);
 
 // Depending on the function instantiation attributes, input argument to the
 // function might be a single tensor, list of tensors of the same type, or a
@@ -105,8 +111,10 @@ class GrapplerFunctionConnectivity {
   std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
 
   struct InputArgPlaceholder {
-    string input_name;
-    int position;
+    string input_name;   // Name of the function input argument.
+    int input_position;  // Index of a tensor in the function input argument
+                         // expansion, it can be greater than `0` if input
+                         // argument is a list of tensors (aka list(type)).
   };
 
   // Mapping from input arg placeholder to the function input tensor.
@@ -117,8 +125,7 @@ class GrapplerFunctionConnectivity {
 // a function.
 class GrapplerFunctionItemInstantiation {
  public:
-  explicit GrapplerFunctionItemInstantiation(
-      const AttrValueMap* func_instantiation_attr)
+  explicit GrapplerFunctionItemInstantiation(AttrSlice func_instantiation_attr)
       : func_instantiation_attr_(func_instantiation_attr) {}
 
   // Get DataType from attributes by name. Return error if attribute is missing,
@@ -130,19 +137,13 @@ class GrapplerFunctionItemInstantiation {
   Status GetArgType(const OpDef::ArgDef& arg, DataType* data_type) const;
 
  private:
-  const AttrValueMap* func_instantiation_attr_;  // do not own
+  const AttrSlice func_instantiation_attr_;  // do not own
 };
 
 // A special case of GrapplerItem, constructed from a TensorFlow Function.
 class GrapplerFunctionItem : public GrapplerItem {
  public:
   GrapplerFunctionItem() = default;
-  GrapplerFunctionItem(string func_name, string description,
-                       AttrValueMap func_attr,
-                       std::vector<InputArgExpansion> input_arg_expansions,
-                       std::vector<OutputArgExpansion> output_arg_expansions,
-                       std::vector<string> keep_nodes, int graph_def_version,
-                       bool is_stateful, GraphDef&& function_body);
 
   const string& description() const;
 
@@ -156,7 +157,7 @@ class GrapplerFunctionItem : public GrapplerItem {
   const OutputArgExpansion& output(int i) const;
   const std::size_t output_size() const;
 
-  const AttrValueMap& func_attr() const;
+  const AttrSlice& func_attr() const;
   const GraphDef& function_body() const;
   GraphDef& mutable_function_body();
 
@@ -165,12 +166,25 @@ class GrapplerFunctionItem : public GrapplerItem {
   GrapplerFunctionItem& SwapFunctionBody(GraphDef&& other);
 
  private:
+  friend Status MakeGrapplerFunctionItem(const FunctionDef&, const AttrSlice&,
+                                         const FunctionLibraryDefinition&, int,
+                                         GrapplerFunctionItem*);
   friend Status ReplaceInputWithConst(const NodeDef&, int,
                                       GrapplerFunctionItem*);
+  friend Status RemoveUnusedOutputs(
+      const gtl::FlatSet<int>& active_outputs, GrapplerFunctionItem* item,
+      std::vector<std::pair<int, int>>* output_mapping);
+
+  GrapplerFunctionItem(string func_name, string description,
+                       AttrSlice func_attr,
+                       std::vector<InputArgExpansion> input_arg_expansions,
+                       std::vector<OutputArgExpansion> output_arg_expansions,
+                       std::vector<string> keep_nodes, int graph_def_version,
+                       bool is_stateful, GraphDef&& function_body);
 
   string description_;
-  AttrValueMap func_attr_;  // Attributes specific to function definition that
-                            // produced this item (FuncDef.attr field).
+  AttrSlice func_attr_;  // Attributes specific to function definition that
+                         // produced this item (FuncDef.attr field).
 
   std::vector<InputArgExpansion> input_arg_expansions_;
   std::vector<OutputArgExpansion> output_arg_expansions_;
@@ -195,14 +209,14 @@ bool IsParametrized(const FunctionDef& func);
 // Resolve function instantiation type parameters from the attributes of the
 // caller node. Return error if type can't be resolved.
 Status InstantiationTypeParameters(
-    const FunctionDef& func, const AttrValueMap& func_instantiation_attr,
+    const FunctionDef& func, const AttrSlice& func_instantiation_attr,
     std::unordered_map<string, DataType>* type_parameters);
 
 // Resolve function instantiation body parameters (values for the function body
 // attr placeholders) from the attributes of the caller node. Return error if
 // type can't be resolved.
 Status InstantiationBodyParameters(
-    const FunctionDef& func, const AttrValueMap& func_instantiation_attr,
+    const FunctionDef& func, const AttrSlice& func_instantiation_attr,
     std::unordered_map<string, AttrValue>* body_parameters);
 
 // Register GrapplerFunctionItem input arg expansion and function body outputs
@@ -216,13 +230,23 @@ Status RegisterGrapplerFunctionConnectivity(
 Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
                              GrapplerFunctionItem* item);
 
+// Remove function output arguments that do not have any active outputs (output
+// tensor connected to other node inputs or in a fetch set). Active outputs uses
+// GraphDef output position encoding, and multiple active outputs could
+// potentially be connected to the same output argument (in case of tensor list
+// outputs). Add output mapping for all active outputs that changed it's output
+// position (std::pair<old position, new position>).
+Status RemoveUnusedOutputs(const gtl::FlatSet<int>& active_outputs,
+                           GrapplerFunctionItem* item,
+                           std::vector<std::pair<int, int>>* output_mapping);
+
 // Make a GrapplerFunctionItem from the function definition and function
 // instantiation attributes (caller node attributes). Returns error if the given
 // function def cannot be converted (e.g. not all attributes are defined).
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
-                                const AttrValueMap& func_instantiation_attr,
+                                const AttrSlice& func_instantiation_attr,
                                 const FunctionLibraryDefinition& flib,
-                                const int graph_def_version,
+                                int graph_def_version,
                                 GrapplerFunctionItem* item);
 
 // Make a GrapplerFunction item from the function definition. Function must be
@@ -232,7 +256,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 // without specializing it to it's instantiation attributes (at least types)?
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const FunctionLibraryDefinition& flib,
-                                const int graph_def_version,
+                                int graph_def_version,
                                 GrapplerFunctionItem* item);
 
 // Make a FunctionDef from the GrapplerFunctionItem. Use function library
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index b51f2781b8e2180067e735ca1b9a8aaf39fc5273..8639dec05a1eb8aa7afcadc20ee9f8949bfeae14 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -28,6 +28,8 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+constexpr char kDevice[] = "/device:CPU:0";
+
 class FunctionsTest : public ::testing::Test {};
 
 TEST_F(FunctionsTest, IsParametrized) {
@@ -69,15 +71,15 @@ TEST_F(FunctionsTest, InstantiationParameters) {
       /* Mapping between function returns and function node outputs. */
       {{"x", "cx:output:0"}, {"y", "cy:output:0"}});
 
-  std::unordered_map<string, AttrValue> func_instantiation_attr;
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
   func_instantiation_attr["key"].set_s("key-value");
   func_instantiation_attr["A"].set_type(DT_FLOAT);
   func_instantiation_attr["B"].set_type(DT_INT32);
   func_instantiation_attr["C"].set_type(DT_DOUBLE);
 
   std::unordered_map<string, DataType> type_parameters;
-  TF_EXPECT_OK(InstantiationTypeParameters(func, func_instantiation_attr,
-                                           &type_parameters));
+  TF_EXPECT_OK(InstantiationTypeParameters(
+      func, AttrSlice(&func_instantiation_attr), &type_parameters));
 
   ASSERT_EQ(3, type_parameters.size());
   EXPECT_EQ(DT_FLOAT, type_parameters["A"]);
@@ -85,8 +87,8 @@ TEST_F(FunctionsTest, InstantiationParameters) {
   EXPECT_EQ(DT_DOUBLE, type_parameters["C"]);
 
   std::unordered_map<string, AttrValue> body_parameters;
-  TF_EXPECT_OK(InstantiationBodyParameters(func, func_instantiation_attr,
-                                           &body_parameters));
+  TF_EXPECT_OK(InstantiationBodyParameters(
+      func, AttrSlice(&func_instantiation_attr), &body_parameters));
 
   ASSERT_EQ(1, body_parameters.size());
   EXPECT_EQ("key-value", body_parameters["key"].s());
@@ -235,13 +237,14 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
           {{"y"}, "Mul", {"x", "scale"}, {{"T", "$T"}}},
       });
 
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  func_instantiation_attr["T"].set_type(DT_FLOAT);
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("XTimesTwo", item.id);
   EXPECT_EQ(4, item.function_body().node_size());
@@ -256,19 +259,19 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
-    if (node.name() == "x" && count++) {
+    if (node.name() == "x" && ++count) {
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "two" && count++) {
+    } else if (node.name() == "two" && ++count) {
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "scale" && count++) {
+    } else if (node.name() == "scale" && ++count) {
       EXPECT_EQ("Cast", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("two", node.input(0));
-    } else if (node.name() == "y" && count++) {
+    } else if (node.name() == "y" && ++count) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("T").type());
       EXPECT_EQ(2, node.input_size());
@@ -311,13 +314,14 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
       // Nodes
       nodes);
 
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  func_instantiation_attr["T"].set_type(DT_FLOAT);
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("SubGrad", item.id);
   EXPECT_EQ(12, item.function_body().node_size());
@@ -338,17 +342,17 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "rx" && count++) {
+    } else if (node.name() == "rx" && ++count) {
       EXPECT_EQ("BroadcastGradientArgs", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("sx", node.input(0));
       EXPECT_EQ("sy", node.input(1));
-    } else if (node.name() == "sum_gx" && count++) {
+    } else if (node.name() == "sum_gx" && ++count) {
       EXPECT_EQ("Sum", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("gx", node.input(0));
       EXPECT_EQ("rx", node.input(1));
-    } else if (node.name() == "sum_gy" && count++) {
+    } else if (node.name() == "sum_gy" && ++count) {
       EXPECT_EQ("Sum", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("gy", node.input(0));
@@ -394,12 +398,13 @@ TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
       // Output Mapping
       {{"o", "o:z:0"}});
 
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  func_instantiation_attr["T"].set_type(DT_FLOAT);
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -408,29 +413,29 @@ TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "a0" && count++) {
+    } else if (node.name() == "a0" && ++count) {
       EXPECT_EQ("Swap", node.op());
       EXPECT_EQ(3, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("y", node.input(1));
       EXPECT_EQ("^x2", node.input(2));
-    } else if (node.name() == "a1" && count++) {
+    } else if (node.name() == "a1" && ++count) {
       EXPECT_EQ("Swap", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("a0", node.input(0));
       EXPECT_EQ("a0:1", node.input(1));
-    } else if (node.name() == "x2" && count++) {
+    } else if (node.name() == "x2" && ++count) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("x", node.input(1));
-    } else if (node.name() == "y2" && count++) {
+    } else if (node.name() == "y2" && ++count) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ(3, node.input_size());
       EXPECT_EQ("y", node.input(0));
       EXPECT_EQ("y", node.input(1));
       EXPECT_EQ("^a1", node.input(2));
-    } else if (node.name() == "o" && count++) {
+    } else if (node.name() == "o" && ++count) {
       EXPECT_EQ("Add", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x2", node.input(0));
@@ -456,27 +461,28 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
       // Mapping
       {{"out", "Exp:y:0"}});
 
-  std::unordered_map<string, AttrValue> func_attr;
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ(1, item.output_size());
   EXPECT_EQ("Exp", item.output(0).output_tensors[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
-    if (node.name() == "in" && count++) {
+    if (node.name() == "in" && ++count) {
       EXPECT_EQ("Placeholder", node.op());
       EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
       EXPECT_EQ(0, node.input_size());
-    } else if (node.name() == "Linear_func" && count++) {
+    } else if (node.name() == "Linear_func" && ++count) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("in", node.input(0));
-    } else if (node.name() == "Exp" && count++) {
+    } else if (node.name() == "Exp" && ++count) {
       EXPECT_EQ("Exp", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("Linear_func", node.input(0));
@@ -500,12 +506,13 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
       // Mapping
       {{"out0", "in0"}});
 
-  std::unordered_map<string, AttrValue> func_attr;
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("ForwardInputs", item.id);
   EXPECT_EQ(5, item.function_body().node_size());
@@ -546,13 +553,14 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
       {{{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
        {{"o"}, "Cast", {"two"}, {{"SrcT", DT_INT64}, {"DstT", "$T"}}}});
 
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  func_instantiation_attr["T"].set_type(DT_FLOAT);
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ(0, item.input_size());
   EXPECT_EQ(1, item.output_size());
@@ -568,6 +576,33 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   EXPECT_EQ("two", cast.input(0));
 }
 
+TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  FunctionDef func = FunctionDefHelper::Define(
+      /* Name */ "SideEffects",
+      /* Args */ {"x: Ref(float)"},
+      /* Return values */ {},
+      /* Attr def */ {},
+      /* Nodes */
+      {{{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_FLOAT}}},
+       {{"update"}, "AssignAdd", {"x", "one"}, {{"T", DT_FLOAT}}}});
+
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
+
+  EXPECT_EQ("SideEffects", item.id);
+  EXPECT_EQ(3, item.function_body().node_size());
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ(0, item.output_size());
+  ASSERT_EQ(1, item.keep_ops.size());
+  EXPECT_EQ("update", item.keep_ops[0]);
+}
+
 TEST_F(FunctionsTest, MakeFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
@@ -586,13 +621,14 @@ TEST_F(FunctionsTest, MakeFunctionDef) {
           {{"y"}, "Mul", {"x", "scale"}, {{"T", "$T"}}},
       });
 
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  func_instantiation_attr["T"].set_type(DT_FLOAT);
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   FunctionDef specialized;
   TF_EXPECT_OK(MakeFunctionDef(item, flib, &specialized));
@@ -606,9 +642,9 @@ TEST_F(FunctionsTest, MakeFunctionDef) {
   // Function body specialized for instantiation types
   int count = 0;
   for (const NodeDef &node : specialized.node_def()) {
-    if (node.name() == "scale" && count++) {
+    if (node.name() == "scale" && ++count) {
       EXPECT_EQ(DT_FLOAT, node.attr().at("DstT").type());
-    } else if (node.name() == "y" && count++) {
+    } else if (node.name() == "y" && ++count) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ("x:0", node.input(0));
       EXPECT_EQ("scale:y:0", node.input(1));
@@ -625,13 +661,14 @@ TEST_F(FunctionsTest, ReplaceInputWithConst) {
       /* Mapping between function returns and function node outputs. */
       {{"z", "output:z:0"}});
 
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  func_instantiation_attr["T"].set_type(DT_FLOAT);
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ(2, item.input_size());
   EXPECT_EQ(1, item.output_size());
@@ -679,13 +716,13 @@ TEST_F(FunctionsTest, ReplaceInputWithConst) {
   // Check that graph has const nodes pushed into function body.
   int count = 0;
   for (const NodeDef &node : specialized.node_def()) {
-    if (node.name() == "x" && count++) {
+    if (node.name() == "x" && ++count) {
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ("const_input_x", node.attr().at("Tag").s());
-    } else if (node.name() == "y" && count++) {
+    } else if (node.name() == "y" && ++count) {
       EXPECT_EQ("Const", node.op());
       EXPECT_EQ("const_input_y", node.attr().at("Tag").s());
-    } else if (node.name() == "output" && count++) {
+    } else if (node.name() == "output" && ++count) {
       EXPECT_EQ("Mul", node.op());
       EXPECT_EQ("x:output:0", node.input(0));
       EXPECT_EQ("y:output:0", node.input(1));
@@ -695,7 +732,7 @@ TEST_F(FunctionsTest, ReplaceInputWithConst) {
 }
 
 TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
-  using test::function::NDef;
+  using ::tensorflow::test::function::NDef;
 
   FunctionDef mul_func = FunctionDefHelper::Create(
       "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
@@ -713,8 +750,8 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
       {/* pass input to output through identity */
        NDef("output", "Identity", {"x"}, {{"T", "float"}})});
 
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_FLOAT);
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  func_instantiation_attr["T"].set_type(DT_FLOAT);
 
   FunctionDefLibrary lib_def;
   *lib_def.add_function() = func;
@@ -722,8 +759,9 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), lib_def);
 
   GrapplerFunctionItem item;
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   // Replace function body with identity function
   item.SwapFunctionBody(std::move(id_func_body));
@@ -733,7 +771,7 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   // Check that graph body was updated.
   int count = 0;
   for (const NodeDef &node : specialized.node_def()) {
-    if (node.name() == "output" && count++) {
+    if (node.name() == "output" && ++count) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ("x:0", node.input(0));
     }
@@ -762,10 +800,11 @@ TEST_F(FunctionsTest, FunctionDefGrapplerFunctionItemRoundTrip) {
   FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
 
   GrapplerFunctionItem item;
-  std::unordered_map<string, AttrValue> func_attr;
-  func_attr["T"].set_type(DT_INT32);
-  TF_EXPECT_OK(MakeGrapplerFunctionItem(func, func_attr, flib,
-                                        TF_GRAPH_DEF_VERSION, &item));
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  func_instantiation_attr["T"].set_type(DT_INT32);
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
 
   FunctionDef func2;
   TF_EXPECT_OK(MakeFunctionDef(item, flib, &func2));
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 6266733f3e6588af9e06a5a279ecabf5adbd009a..576494cad55e22ba8457f30d0ea79b53f6f5de78 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -114,9 +114,13 @@ void GrapplerTest::CompareGraphs(GraphDef want, GraphDef got) const {
   for (int i = 0; i < want.node_size(); ++i) {
     EXPECT_EQ(want.node(i).op(), got.node(i).op());
     EXPECT_EQ(want.node(i).name(), got.node(i).name());
+    EXPECT_EQ(want.node(i).device(), got.node(i).device());
+
     ASSERT_EQ(want.node(i).input_size(), got.node(i).input_size());
     for (int j = 0; j < want.node(i).input_size(); ++j) {
-      EXPECT_TRUE(IsSameInput(want.node(i).input(j), got.node(i).input(j)));
+      const TensorId want_tensor = ParseTensorName(want.node(i).input(j));
+      const TensorId got_tensor = ParseTensorName(got.node(i).input(j));
+      EXPECT_EQ(want_tensor.ToString(), got_tensor.ToString());
     }
   }
 }
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index bd4d7f2a7e89ad502f3b139f62e00dba198bb751..0cfd740dcbe15e0571bc159858c0ed33c2071cb8 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -58,7 +58,7 @@ class GrapplerTest : public ::testing::Test {
   // Count nodes of the given op-type in a graph.
   int CountOpNodes(const GraphDef& graph, const string& op);
 
-  // Get a random tansor with given shape.
+  // Get a random tensor with given shape.
   template <DataType DTYPE>
   Tensor GenerateRandomTensor(const TensorShape& shape) const {
     typedef typename EnumToDataType<DTYPE>::Type T;
@@ -68,6 +68,17 @@ class GrapplerTest : public ::testing::Test {
     return tensor;
   }
 
+  // Get a constant tensor with given shape.
+  template <DataType DTYPE>
+  Tensor GenerateConstantTensor(
+      const TensorShape& shape,
+      typename EnumToDataType<DTYPE>::Type value) const {
+    typedef typename EnumToDataType<DTYPE>::Type T;
+    Tensor tensor(DTYPE, shape);
+    for (auto i = 0; i < tensor.NumElements(); i++) tensor.flat<T>()(i) = value;
+    return tensor;
+  }
+
  private:
   SessionOptions options_;
 };
diff --git a/tensorflow/core/grappler/utils/traversal.cc b/tensorflow/core/grappler/utils/traversal.cc
index f44f53c4e63805544fa480628e805303064edb3d..6952277568676baf5812a20c4c743356eeedd40a 100644
--- a/tensorflow/core/grappler/utils/traversal.cc
+++ b/tensorflow/core/grappler/utils/traversal.cc
@@ -14,30 +14,36 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/traversal.h"
+
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 
 namespace tensorflow {
 namespace grappler {
 
-void ReverseDfs(const GraphView& graph_view, const std::vector<NodeDef*>& from,
-                const std::function<void(NodeDef*)>& pre_order,
-                const std::function<void(NodeDef*)>& post_order,
-                const std::function<void(NodeDef*, NodeDef*)>& on_back_edge) {
+namespace {
+
+template <typename GraphViewType>
+void ReverseDfsInternal(
+    const GraphViewType& graph_view, const std::vector<const NodeDef*>& from,
+    const std::function<void(const NodeDef*)>& pre_order,
+    const std::function<void(const NodeDef*)>& post_order,
+    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
   // Stack of work to do.
   struct StackElem {
-    NodeDef* node;
+    const NodeDef* node;
     bool children_visited;
-    NodeDef* src;
+    const NodeDef* src;
   };
   std::vector<StackElem> stack;
 
   stack.reserve(from.size());
-  for (NodeDef* node : from) {
+  for (const NodeDef* node : from) {
     stack.push_back(StackElem{node, false});
   }
 
   enum NodeState { NOT_VISITED = 0, VISITING = 1, DONE = 2 };
-  std::unordered_map<NodeDef*, NodeState> node_state;
+  absl::flat_hash_map<const NodeDef*, NodeState> node_state;
   while (!stack.empty()) {
     StackElem w = stack.back();
     stack.pop_back();
@@ -69,12 +75,32 @@ void ReverseDfs(const GraphView& graph_view, const std::vector<NodeDef*>& from,
     // Enqueue the node again with the children_visited flag set to true.
     stack.push_back(StackElem{w.node, true, w.src});
 
-    // Now enqueu the node children.
+    // Now enqueue the node children.
     for (const auto fanin : graph_view.GetFanins(*w.node, true)) {
       stack.push_back(StackElem{fanin.node, false, w.node});
     }
   }
 }
 
+}  // namespace
+
+void ReverseDfs(
+    const GraphView& graph_view, const std::vector<const NodeDef*>& from,
+    const std::function<void(const NodeDef*)>& pre_order,
+    const std::function<void(const NodeDef*)>& post_order,
+    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
+  ReverseDfsInternal<GraphView>(graph_view, from, pre_order, post_order,
+                                on_back_edge);
+}
+
+void ReverseDfs(
+    const MutableGraphView& graph_view, const std::vector<const NodeDef*>& from,
+    const std::function<void(const NodeDef*)>& pre_order,
+    const std::function<void(const NodeDef*)>& post_order,
+    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
+  ReverseDfsInternal<MutableGraphView>(graph_view, from, pre_order, post_order,
+                                       on_back_edge);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal.h b/tensorflow/core/grappler/utils/traversal.h
index bb3fa090e8fdaf12ed6dcb18eb1511c55496a125..5b7737f97eb1f8ee56efd599d6216dc4e472febd 100644
--- a/tensorflow/core/grappler/utils/traversal.h
+++ b/tensorflow/core/grappler/utils/traversal.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <functional>
 #include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -28,10 +29,17 @@ namespace grappler {
 // order. If loops are found, the on_back_edge functor will be called on the
 // corresponding back edges. Moreover, the pre and post order will assume that
 // these back edges will be cut.
-void ReverseDfs(const GraphView& graph_view, const std::vector<NodeDef*>& from,
-                const std::function<void(NodeDef*)>& pre_order,
-                const std::function<void(NodeDef*)>& post_order,
-                const std::function<void(NodeDef*, NodeDef*)>& on_back_edge);
+void ReverseDfs(
+    const GraphView& graph_view, const std::vector<const NodeDef*>& from,
+    const std::function<void(const NodeDef*)>& pre_order,
+    const std::function<void(const NodeDef*)>& post_order,
+    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge);
+
+void ReverseDfs(
+    const MutableGraphView& graph_view, const std::vector<const NodeDef*>& from,
+    const std::function<void(const NodeDef*)>& pre_order,
+    const std::function<void(const NodeDef*)>& post_order,
+    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal_test.cc b/tensorflow/core/grappler/utils/traversal_test.cc
index cc68bd1a9637cb6f61955e8fa5d495a34f19cb09..c040477a08970436cb07f6bb87c30e47b6b72525 100644
--- a/tensorflow/core/grappler/utils/traversal_test.cc
+++ b/tensorflow/core/grappler/utils/traversal_test.cc
@@ -14,9 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/traversal.h"
-//#include "tensorflow/core/framework/node_def.pb.h"
-//#include "tensorflow/core/lib/core/status_test_util.h"
-//#include "tensorflow/core/platform/protobuf.h"
+
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -53,19 +51,28 @@ TEST_F(TraversalTest, ReverseDfsNoLoop) {
   *graph.add_node() = CreateNode("5", {});
   *graph.add_node() = CreateNode("4", {});
 
-  std::vector<NodeDef*> start_nodes = {graph.mutable_node(1),
-                                       graph.mutable_node(2)};
+  std::vector<const NodeDef*> start_nodes = {&graph.node(1), &graph.node(2)};
   std::vector<string> pre_order;
   std::vector<string> post_order;
   bool found_back_edge = false;
   ReverseDfs(
       GraphView(&graph), start_nodes,
-      [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); },
-      [&post_order](NodeDef* n) { post_order.push_back(n->name()); },
-      [&found_back_edge](NodeDef*, NodeDef*) { found_back_edge = true; });
+      [&pre_order](const NodeDef* n) { pre_order.push_back(n->name()); },
+      [&post_order](const NodeDef* n) { post_order.push_back(n->name()); },
+      [&found_back_edge](const NodeDef*, const NodeDef*) {
+        found_back_edge = true;
+      });
+
+  // Pre/Post order traversals are non deterministic because a node fanin is an
+  // absl::flat_hash_set with non deterministic traversal order.
+  using ValidTraversal = std::pair<std::vector<string>, std::vector<string>>;
 
-  EXPECT_EQ(std::vector<string>({"1", "4", "3", "2", "5", "0"}), pre_order);
-  EXPECT_EQ(std::vector<string>({"4", "5", "2", "3", "1", "0"}), post_order);
+  std::set<ValidTraversal> valid_traversals = {
+      // pre_order                     post_order
+      {{"1", "4", "3", "2", "5", "0"}, {"4", "5", "2", "3", "1", "0"}},
+      {{"1", "3", "2", "5", "4", "0"}, {"5", "2", "3", "4", "1", "0"}}};
+
+  EXPECT_EQ(valid_traversals.count({pre_order, post_order}), 1);
   EXPECT_FALSE(found_back_edge);
 }
 
@@ -79,20 +86,29 @@ TEST_F(TraversalTest, ReverseDfsWithLoop) {
   *graph.add_node() = CreateNode("1", "Enter", {});
   *graph.add_node() = CreateNode("6", "Exit", {"3"});
 
-  std::vector<NodeDef*> start_nodes = {graph.mutable_node(5)};
+  std::vector<const NodeDef*> start_nodes = {&graph.node(5)};
   std::vector<string> pre_order;
   std::vector<string> post_order;
   std::vector<string> back_edges;
   ReverseDfs(
       GraphView(&graph), start_nodes,
-      [&pre_order](NodeDef* n) { pre_order.push_back(n->name()); },
-      [&post_order](NodeDef* n) { post_order.push_back(n->name()); },
-      [&back_edges](NodeDef* src, NodeDef* dst) {
+      [&pre_order](const NodeDef* n) { pre_order.push_back(n->name()); },
+      [&post_order](const NodeDef* n) { post_order.push_back(n->name()); },
+      [&back_edges](const NodeDef* src, const NodeDef* dst) {
         back_edges.push_back(strings::StrCat(src->name(), "->", dst->name()));
       });
 
-  EXPECT_EQ(std::vector<string>({"6", "3", "2", "1", "5", "4"}), pre_order);
-  EXPECT_EQ(std::vector<string>({"1", "4", "5", "2", "3", "6"}), post_order);
+  // Pre/Post order traversals are non deterministic because a node fanin is an
+  // absl::flat_hash_set with non deterministic traversal order.
+  using ValidTraversal = std::pair<std::vector<string>, std::vector<string>>;
+
+  std::set<ValidTraversal> valid_traversals = {
+      // pre_order                     post_order
+      {{"6", "3", "2", "4", "5", "1"}, {"5", "4", "1", "2", "3", "6"}},
+      {{"6", "3", "2", "1", "5", "4"}, {"1", "4", "5", "2", "3", "6"}},
+      {{"6", "3", "2", "5", "4", "1"}, {"4", "5", "1", "2", "3", "6"}}};
+
+  EXPECT_EQ(valid_traversals.count({pre_order, post_order}), 1);
   EXPECT_EQ(std::vector<string>({"4->3"}), back_edges);
 }
 
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 9b6c1f690b44a204bc2fbc938dd010b01a774e36..e993391b51bfe882a1e662f220ace0542db4ffba 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -14,9 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils.h"
+
+#include <unistd.h>
+#include <limits>
+#include <memory>
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -24,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -350,10 +357,57 @@ TEST_F(UtilsTest, NumNonControlOutputs) {
   EXPECT_EQ(1, NumNonControlDataOutputs(*add_node, node_map));
 }
 
+TEST(CheckAttrExists, All) {
+  NodeDef node;
+  node.set_name("node");
+  (*node.mutable_attr())["apple"].set_i(7);
+  (*node.mutable_attr())["pear"].set_b(true);
+
+  TF_EXPECT_OK(CheckAttrExists(node, "apple"));
+  TF_EXPECT_OK(CheckAttrExists(node, "pear"));
+
+  TF_EXPECT_OK(CheckAttrsExist(node, {}));
+  TF_EXPECT_OK(CheckAttrsExist(node, {"apple"}));
+  TF_EXPECT_OK(CheckAttrsExist(node, {"pear"}));
+  TF_EXPECT_OK(CheckAttrsExist(node, {"apple", "pear"}));
+  TF_EXPECT_OK(CheckAttrsExist(node, {"pear", "apple"}));
+
+  Status status = CheckAttrExists(node, "banana");
+  EXPECT_FALSE(status.ok());
+  EXPECT_EQ(status.ToString(),
+            "Invalid argument: Node 'node' lacks 'banana' attr: name: \"node\" "
+            "attr { key: \"apple\" value { i: 7 } } attr { key: \"pear\" value "
+            "{ b: true } }");
+  EXPECT_FALSE(CheckAttrsExist(node, {""}).ok());
+  EXPECT_FALSE(CheckAttrsExist(node, {"pear", "cherry"}).ok());
+  EXPECT_FALSE(CheckAttrsExist(node, {"banana", "apple"}).ok());
+}
+
 TEST_F(UtilsTest, DeleteNodes) {
   // TODO(rmlarsen): write forgotten test.
 }
 
+TEST(IsKernelRegisteredForNode, All) {
+  NodeDef node;
+  node.set_name("foo");
+  node.set_op("NoOp");
+  node.set_device("/cpu:0");
+  TF_EXPECT_OK(IsKernelRegisteredForNode(node));
+  node.set_device("/gpu:0");
+  TF_EXPECT_OK(IsKernelRegisteredForNode(node));
+
+  // Bad device name.
+  node.set_device("");
+  EXPECT_FALSE(IsKernelRegisteredForNode(node).ok());
+
+  // Check an op that is only defined on CPU.
+  node.set_op("MatchingFiles");
+  node.set_device("/cpu:0");
+  TF_EXPECT_OK(IsKernelRegisteredForNode(node));
+  node.set_device("/gpu:0");
+  EXPECT_FALSE(IsKernelRegisteredForNode(node).ok());
+}
+
 #define BM_NodePositionIfSameNode(I, N, NAME)               \
   static void BM_NodePositionIfSameNode_##NAME(int iters) { \
     string input = I;                                       \
@@ -390,6 +444,26 @@ BM_ParseNodeNameAsStringPiece("foo:123", foo123);
 BM_ParseNodeNameAsStringPiece("foo/bar/baz:123", foo_bar_baz_123);
 BM_ParseNodeNameAsStringPiece("^foo/bar/baz:123", foo_bar_baz_123_ctrl);
 
+TEST_F(UtilsTest, SetTensorValueBFloat16) {
+  Tensor t(DT_BFLOAT16, TensorShape({}));
+  TF_ASSERT_OK(SetTensorValue(t.dtype(), 2, &t));
+  test::ExpectTensorEqual<bfloat16>(Tensor(bfloat16(2)), t);
+}
+
+TEST_F(UtilsTest, SetTensorValueBFloat16IntMax) {
+  Tensor t(DT_BFLOAT16, TensorShape({}));
+  TF_ASSERT_OK(SetTensorValue(t.dtype(), std::numeric_limits<int>::max(), &t));
+  test::ExpectTensorEqual<bfloat16>(
+      Tensor(bfloat16(std::numeric_limits<int>::max())), t);
+}
+
+TEST_F(UtilsTest, SetTensorValueBFloat16IntMin) {
+  Tensor t(DT_BFLOAT16, TensorShape({}));
+  TF_ASSERT_OK(SetTensorValue(t.dtype(), std::numeric_limits<int>::min(), &t));
+  test::ExpectTensorEqual<bfloat16>(
+      Tensor(bfloat16(std::numeric_limits<int>::min())), t);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 3a920f26f3aaa504451e944fa81b67d80c092c60..0e5d8d765a6bfde3a0e187c0b386174d3b20a098 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -22,33 +22,33 @@ package_group(
         "//learning/brain/research/sparse_matrix/...",
         "//learning/faster_training/...",
         "//tensorflow/...",
+        "//tensorflow_text/...",
         "//third_party/car/...",
     ],
 )
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "cc_header_only_library",
     "if_android",
+    "if_not_windows",
+    "tf_cc_binary",
     "tf_cc_test",
     "tf_cc_test_mkl",
     "tf_cc_tests",
-    "tf_cc_binary",
     "tf_copts",
     "tf_cuda_library",
-    "tf_opts_nortti_if_android",
     "tf_kernel_library",
     "tf_mkl_kernel_library",
-    "cc_header_only_library",
-    "if_not_windows",
-    "if_override_eigen_strong_inline",
+    "tf_opts_nortti_if_android",
 )
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library",
     "tf_kernel_tests_linkstatic",
+    "tf_proto_library",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -93,6 +93,17 @@ config_setting(
     },
 )
 
+config_setting(
+    # Add "--define tensorflow_mkldnn_contraction_kernel=1" to your build command to use mkldnn
+    # sgemm in Eigen tensor contractions (matrix multiplications and convolutions). The mkldnn
+    # kernels are generated at runtime and use avx/avx2/fma/avx512 based on cpu status registers
+    # (https://en.wikipedia.org/wiki/CPUID).
+    name = "mkldnn_contraction_kernel",
+    values = {
+        "define": "tensorflow_mkldnn_contraction_kernel=1",
+    },
+)
+
 # Public support libraries ----------------------------------------------------
 
 cc_library(
@@ -184,15 +195,35 @@ cc_library(
     deps = ["//third_party/eigen3"],
 )
 
-cc_library(
+tf_kernel_library(
     name = "conv_2d",
     hdrs = ["conv_2d.h"],
+    gpu_srcs = [
+        "conv_2d.h",
+        "conv_2d_gpu.h",
+        "conv_2d_gpu_double.cu.cc",
+        "conv_2d_gpu_float.cu.cc",
+        "conv_2d_gpu_half.cu.cc",
+        "conv_2d_gpu_int.cu.cc",
+        "conv_2d_gpu_uint16.cu.cc",
+        "conv_2d_gpu_uint32.cu.cc",
+        "conv_2d_gpu_uint64.cu.cc",
+        "conv_2d_gpu_uint8.cu.cc",
+    ],
     deps = [
         ":eigen_helpers",
+        ":fill_functor",
         ":gpu_util_hdrs",
+        ":image_resizer_state",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
+    alwayslink = 1,
 )
 
 cc_library(
@@ -270,6 +301,20 @@ cc_library(
     ],
 )
 
+tf_kernel_library(
+    name = "nccl_kernels",
+    srcs = if_cuda([
+        "nccl_ops.cc",
+    ]),
+    deps = if_cuda([
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core/nccl:nccl_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_headers_lib",
+        "//tensorflow/core:nccl_ops_op_lib",
+    ]),
+)
+
 tf_cuda_library(
     name = "ops_testutil",
     testonly = 1,
@@ -293,14 +338,10 @@ tf_cuda_library(
 
 cc_library(
     name = "ops_util",
-    srcs = ["ops_util.cc"],
     hdrs = ["ops_util.h"],
     copts = if_not_windows(["-Wno-sign-compare"]),
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//third_party/eigen3",
     ],
 )
 
@@ -508,8 +549,7 @@ cc_library(
     hdrs = ["bounds_check.h"],
     visibility = [":friends"],
     deps = [
-        "//tensorflow/core:framework_lite",
-        "//third_party/eigen3",
+        "//tensorflow/core:framework_bounds_check",
     ],
 )
 
@@ -534,6 +574,40 @@ cc_library(
     ],
 )
 
+# Depending on a build configuration this target provides custom kernel for Eigen
+# tensor contractions (small matrix multiplication kernel used to multiple together
+# blocks of the original tensors).
+#
+# 0) Default contraction kernel is Eigen::internal::gebp_kernel.
+#
+# 1) --define tensorflow_mkldnn_contraction_kernel=1
+#    Use Mkldnn single threaded sgemm. The mkldnn kernels are generated at runtime and
+#    use avx/avx2/fma/avx512 based on cpu status registers (https://en.wikipedia.org/wiki/CPUID).
+#
+# If you use `tensor.contract(other_tensor)` in your code, you must include additional header
+# to get the benefit of custom contraction kernel:
+#
+#   #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#   #include "third_party/tensorflow/core/kernels/eigen_contraction_kernel.h"
+#   #endif
+cc_library(
+    name = "eigen_contraction_kernel",
+    hdrs = ["eigen_contraction_kernel.h"],
+    defines = select({
+        ":mkldnn_contraction_kernel": [
+            "TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL",
+            "TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL",
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        "//third_party/eigen3",
+    ] + select({
+        ":mkldnn_contraction_kernel": ["@mkl_dnn//:mkldnn_single_threaded"],
+        "//conditions:default": [],
+    }),
+)
+
 cc_library(
     name = "eigen_helpers",
     hdrs = [
@@ -548,6 +622,7 @@ cc_library(
         "eigen_volume_patch.h",
     ],
     deps = [
+        ":eigen_contraction_kernel",
         "//third_party/eigen3",
     ],
 )
@@ -958,6 +1033,87 @@ tf_kernel_library(
     ]) + ARRAY_DEPS,
 )
 
+cc_library(
+    name = "ragged_ops",
+    deps = [
+        ":ragged_gather_op",
+        ":ragged_range_op",
+        ":ragged_tensor_to_sparse_kernel",
+    ],
+)
+
+tf_kernel_library(
+    name = "ragged_gather_op",
+    srcs = ["ragged_gather_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ragged_array_ops_op_lib",
+    ],
+)
+
+tf_cc_test(
+    name = "ragged_gather_op_test",
+    size = "small",
+    srcs = ["ragged_gather_op_test.cc"],
+    deps = [
+        ":ragged_gather_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ragged_array_ops_op_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
+tf_kernel_library(
+    name = "ragged_range_op",
+    srcs = ["ragged_range_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ragged_math_ops_op_lib",
+    ],
+)
+
+tf_cc_test(
+    name = "ragged_range_op_test",
+    srcs = ["ragged_range_op_test.cc"],
+    deps = [
+        ":ragged_range_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ragged_math_ops_op_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
+tf_kernel_library(
+    name = "ragged_tensor_to_sparse_kernel",
+    srcs = ["ragged_tensor_to_sparse_kernel.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ragged_conversion_ops_op_lib",
+    ],
+)
+
+tf_cc_test(
+    name = "ragged_tensor_to_sparse_kernel_test",
+    size = "small",
+    srcs = ["ragged_tensor_to_sparse_kernel_test.cc"],
+    deps = [
+        ":ragged_tensor_to_sparse_kernel",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ragged_conversion_ops_op_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
 tf_kernel_library(
     name = "cudnn_rnn_kernels",
     srcs = ["cudnn_rnn_ops.cc"],
@@ -1537,14 +1693,14 @@ tf_kernel_library(
     ],
     visibility = [":friends"],
     deps = [
-        ":conv_ops",
+        ":conv_2d",
         ":ops_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//third_party/eigen3",
     ],
-    alwayslink = 0,
+    alwayslink = 1,
 )
 
 tf_cc_test(
@@ -1797,10 +1953,22 @@ tf_kernel_library(
     deps = DATA_FLOW_DEPS,
 )
 
+cc_library(
+    name = "stack",
+    srcs = ["stack.cc"],
+    hdrs = ["stack.h"],
+    deps = [
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "stack_ops",
     prefix = "stack_ops",
-    deps = DATA_FLOW_DEPS,
+    deps = DATA_FLOW_DEPS + [":stack"],
 )
 
 tf_kernel_library(
@@ -2032,6 +2200,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:resource_variable_ops_op_lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2096,6 +2265,11 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/optimizers:meta_optimizer",
+        "//tensorflow/core/grappler/utils:functions",
     ],
 )
 
@@ -2297,12 +2471,25 @@ tf_cc_tests(
     ],
     deps = [
         ":eigen_helpers",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
+    ],
+)
+
+# Conditional test target generation is not supported by the "tf_cc_tests" macro
+# (can't add 'select' to the srcs field, type 'select' is not iterable).
+tf_cc_test(
+    name = "eigen_mkldnn_contraction_kernel_test",
+    size = "small",
+    srcs = select({
+        ":mkldnn_contraction_kernel": ["eigen_mkldnn_contraction_kernel_test.cc"],
+        "//conditions:default": [],
+    }),
+    tags = ["mkldnn_contraction_kernel"],
+    deps = [
+        ":eigen_contraction_kernel",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
     ],
 )
 
@@ -2568,11 +2755,13 @@ cc_library(
         ":cholesky_grad",
         ":cholesky_op",
         ":determinant_op",
+        ":lu_op",
         ":matrix_exponential_op",
         ":matrix_inverse_op",
         ":matrix_logarithm_op",
         ":matrix_solve_ls_op",
         ":matrix_solve_op",
+        ":matrix_square_root_op",
         ":matrix_triangular_solve_op",
         ":qr_op",
         ":self_adjoint_eig_op",
@@ -2682,6 +2871,12 @@ tf_kernel_library(
     deps = LINALG_DEPS,
 )
 
+tf_kernel_library(
+    name = "matrix_square_root_op",
+    prefix = "matrix_square_root_op",
+    deps = LINALG_DEPS,
+)
+
 tf_kernel_library(
     name = "matrix_triangular_solve_op",
     prefix = "matrix_triangular_solve_op",
@@ -2706,6 +2901,19 @@ tf_kernel_library(
     deps = LINALG_DEPS,
 )
 
+tf_kernel_library(
+    name = "lu_op",
+    prefix = "lu_op",
+    deps = if_cuda([
+        ":cuda_solvers",
+        ":transpose_functor",
+    ]) + [
+        "//third_party/eigen3",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "linalg_ops_common",
     srcs = ["linalg_ops_common.cc"],
@@ -2908,11 +3116,8 @@ tf_kernel_library(
     ]),
     # <prefix>*impl.h are excluded by default from the CPU build, add explicitly.
     hdrs = ["batch_matmul_op_impl.h"],
-    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
-    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
-    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     prefix = "batch_matmul_op",
-    deps = MATH_DEPS + if_mkl_ml([
+    deps = MATH_DEPS + [":eigen_contraction_kernel"] + if_mkl_ml([
         "//third_party/mkl:intel_binary_blob",
     ]),
 )
@@ -2978,9 +3183,6 @@ tf_kernel_library(
         "mkl_matmul_op.cc",
     ]),
     hdrs = ["matmul_op.h"],
-    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
-    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
-    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     defines = select({
         ":xsmm": [
             "TENSORFLOW_USE_LIBXSMM",
@@ -2989,11 +3191,10 @@ tf_kernel_library(
         "//conditions:default": [],
     }),
     deps = MATH_DEPS + [
+        ":eigen_contraction_kernel",
         ":gpu_util_hdrs",
     ] + select({
-        ":xsmm": [
-            "@libxsmm_archive//:xsmm_avx",
-        ],
+        ":xsmm": ["@libxsmm_archive//:xsmm_avx"],
         "//conditions:default": [],
     }) + mkl_deps() + if_cuda([
         "//tensorflow/core/platform/default/build_config:cublas_plugin",
@@ -3018,7 +3219,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "scan_ops",
     prefix = "scan_ops",
-    deps = MATH_DEPS,
+    deps = MATH_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
 tf_kernel_library(
@@ -3196,6 +3397,29 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "scan_ops_test",
+    size = "small",
+    srcs = ["scan_ops_test.cc"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":host_constant_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":scan_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "reduction_ops_test",
     size = "small",
@@ -3379,9 +3603,6 @@ tf_kernel_library(
         ":xsmm_convolutions": ["xsmm_conv2d.h"],
         "//conditions:default": [],
     }),
-    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
-    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
-    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     defines = select({
         ":xsmm_convolutions": [
             "TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS",
@@ -3399,6 +3620,7 @@ tf_kernel_library(
         ":bounds_check",
         ":conv_2d",
         ":conv_3d",
+        ":eigen_contraction_kernel",
         ":image_resizer_state",
         ":fill_functor",
         ":ops_util",
@@ -3489,6 +3711,7 @@ cc_library(
 NN_DEPS = [
     ":bounds_check",
     ":conv_2d",
+    ":eigen_contraction_kernel",
     ":fused_batch_norm_util_gpu",
     ":ops_util",
     ":pooling_ops",
@@ -3498,7 +3721,7 @@ NN_DEPS = [
     "//tensorflow/core:nn_grad",
     "//tensorflow/core:nn_ops_op_lib",
     "//third_party/eigen3",
-]
+] + if_mkl(["//tensorflow/core:mkl_nn_ops_op_lib"])
 
 tf_kernel_library(
     name = "batch_norm_op",
@@ -3537,9 +3760,6 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "lrn_op",
-    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
-    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
-    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     prefix = "lrn_op",
     deps = NN_DEPS,
 )
@@ -4383,6 +4603,7 @@ tf_kernel_library(
         ":dense_update_functor",
         ":training_op_helpers",
         ":variable_ops",
+        ":inplace_ops",
     ],
 )
 
@@ -4450,6 +4671,7 @@ cc_library(
         ":string_strip_op",
         ":string_to_hash_bucket_op",
         ":substr_op",
+        ":unicode_ops",
         ":unicode_script_op",
     ],
 )
@@ -4615,6 +4837,22 @@ tf_kernel_library(
     deps = STRING_DEPS,
 )
 
+tf_kernel_library(
+    name = "unicode_ops",
+    prefix = "unicode_ops",
+    deps = [
+        ":bounds_check",
+        ":string_util",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:string_ops_op_lib",
+        "//third_party/eigen3",
+        "//third_party/icu/data:conversion_data",
+        "@icu//:common",
+    ],
+)
+
 tf_kernel_library(
     name = "base64_ops",
     prefix = "base64_ops",
@@ -5002,7 +5240,6 @@ filegroup(
         "eigen_volume_patch.h",
         "fifo_queue.h",
         "maxpooling_op.h",
-        "ops_util.cc",
         "ops_util.h",
         "padding_fifo_queue.h",
         "pooling_ops_common.cc",
@@ -5164,7 +5401,9 @@ filegroup(
         "batch_norm_op.h",
         "control_flow_ops.h",
         "conv_2d.h",
+        "conv_3d.h",
         "conv_ops.h",
+        "conv_ops_gpu.h",
         "data_format_ops.h",
         "depthtospace_op.h",
         "depthwise_conv_op.h",
@@ -5182,7 +5421,9 @@ filegroup(
         "mfcc_mel_filterbank.h",
         "mirror_pad_op.h",
         "mirror_pad_op_cpu_impl.h",
+        "multinomial_op.h",
         "pad_op.h",
+        "pooling_ops_3d.h",
         "random_op.h",
         "reduction_ops.h",
         "reduction_ops_common.h",
@@ -5199,6 +5440,7 @@ filegroup(
         "spacetobatch_functor.h",
         "spacetodepth_op.h",
         "spectrogram.h",
+        "stateless_random_ops.h",
         "string_util.h",
         "tensor_array.h",
         "tile_functor.h",
@@ -5230,6 +5472,7 @@ filegroup(
         "conv_grad_ops.cc",
         "conv_grad_ops.h",
         "conv_ops.cc",
+        "conv_ops_3d.cc",
         "conv_ops_fused.cc",
         "conv_ops_using_gemm.cc",
         "crop_and_resize_op.cc",
@@ -5240,6 +5483,8 @@ filegroup(
         "cwise_op_bitwise_and.cc",
         "cwise_op_bitwise_or.cc",
         "cwise_op_bitwise_xor.cc",
+        "cwise_op_cos.cc",
+        "cwise_op_cosh.cc",
         "cwise_op_div.cc",
         "cwise_op_equal_to_1.cc",
         "cwise_op_equal_to_2.cc",
@@ -5274,10 +5519,13 @@ filegroup(
         "cwise_op_select.cc",
         "cwise_op_sigmoid.cc",
         "cwise_op_sign.cc",
+        "cwise_op_sin.cc",
+        "cwise_op_sinh.cc",
         "cwise_op_sqrt.cc",
         "cwise_op_square.cc",
         "cwise_op_squared_difference.cc",
         "cwise_op_sub.cc",
+        "cwise_op_tan.cc",
         "cwise_op_tanh.cc",
         "cwise_op_xlogy.cc",
         "cwise_op_xdivy.cc",
@@ -5332,9 +5580,11 @@ filegroup(
         "mirror_pad_op_cpu_impl_3.cc",
         "mirror_pad_op_cpu_impl_4.cc",
         "mirror_pad_op_cpu_impl_5.cc",
+        "multinomial_op.cc",
         "pad_op.cc",
         "padding_fifo_queue.cc",
         "padding_fifo_queue_op.cc",
+        "pooling_ops_3d.cc",
         "queue_base.cc",
         "queue_op.cc",
         "queue_ops.cc",
@@ -5368,7 +5618,10 @@ filegroup(
         "sparse_to_dense_op.cc",
         "spectrogram.cc",
         "spectrogram_op.cc",
+        "stack.cc",
+        "stack.h",
         "stack_ops.cc",
+        "stateless_random_ops.cc",
         "string_join_op.cc",
         "string_util.cc",
         "summary_op.cc",
@@ -5455,7 +5708,6 @@ filegroup(
             "*_3d*",
             "*.cu.*",
             # Ops already in android_srcs
-            "ops_util.cc",
             "pooling_ops_common.cc",
             # Ops which we are currently excluding because they are likely
             # not used on Android. Those ops also do not compile if included,
@@ -5496,6 +5748,7 @@ filegroup(
             "batch_kernels.*",
             "regex_full_match_op.cc",
             "regex_replace_op.cc",
+            "unicode_ops.cc",
             "unicode_script_op.cc",
             # Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
             "mkl_*",
@@ -6041,6 +6294,28 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test_mkl(
+    name = "mkl_quantized_pooling_ops_test",
+    size = "small",
+    srcs = ["mkl_quantized_pooling_ops_test.cc"],
+    deps = [
+        ":mkl_input_conversion_op",
+        ":mkl_pooling_ops",
+        ":ops_testutil",
+        ":ops_util",
+        ":quantization_utils",
+        ":quantized_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "quantized_reshape_op_test",
     size = "small",
@@ -6306,6 +6581,10 @@ tf_cc_test(
 
 tf_mkl_kernel_library(
     name = "mkl_conv_op",
+    hdrs = [
+        "mkl_quantized_conv_ops.h",
+        "no_op.h",
+    ],
     prefix = "mkl_conv",
     deps = [
         ":bounds_check",
@@ -6315,6 +6594,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
@@ -6349,6 +6629,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
@@ -6364,6 +6645,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
@@ -6383,6 +6665,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
@@ -6397,6 +6680,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
@@ -6412,6 +6696,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
@@ -6432,7 +6717,7 @@ tf_mkl_kernel_library(
 tf_mkl_kernel_library(
     name = "mkl_concat_op",
     prefix = "mkl_concat_op",
-    deps = ARRAY_DEPS + mkl_deps(),
+    deps = [":quantization_utils"] + ARRAY_DEPS + mkl_deps(),
 )
 
 tf_mkl_kernel_library(
@@ -6470,6 +6755,31 @@ tf_mkl_kernel_library(
     deps = NN_DEPS + mkl_deps() + [":cwise_op"],
 )
 
+tf_cc_test_mkl(
+    name = "mkl_fused_ops_test",
+    size = "small",
+    srcs = ["mkl_fused_ops_test.cc"],
+    linkstatic = 1,
+    deps = [
+        ":conv_ops",
+        ":image",
+        ":mkl_conv_op",
+        ":mkl_tfconv_op",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_mkl_kernel_library(
     name = "mkl_transpose_op",
     srcs = [
@@ -6497,19 +6807,10 @@ tf_kernel_library(
     ],
 )
 
-cc_library(
-    name = "captured_function",
-    hdrs = ["captured_function.h"],
-    deps = [
-        "//tensorflow/core/kernels/data:captured_function",
-    ],
-)
-
-cc_library(
-    name = "dataset",
-    hdrs = ["dataset.h"],
+tf_kernel_library(
+    name = "tensor_forest_ops",
     deps = [
-        "//tensorflow/core/kernels/data:dataset",
+        "//tensorflow/core/kernels/tensor_forest:tensor_forest_ops",
     ],
 )
 
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 72155fd037378fc3d93c02e9b893a6671e9659a6..47e10f56dfa682d97b04b78cd0e5f9a536081025 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -320,13 +320,14 @@ class AdjustContrastOpv2<CPUDevice> : public AdjustContrastOpV2Base {
     int64 batch = outputs.dimension(0);
     int64 image_size = outputs.dimension(1);
     int64 channels = outputs.dimension(2);
-    // Similar to the reduction case, a straighforward implementation of this
+    // Similar to the reduction case, a straightforward implementation of this
     // does not utilize vectorization well because of the small channel size.
     // This algorithm repeatedly increases the area to be copied, and leads to
     // much better vectorinizations in the copy.
     for (int64 i = 0; i < batch; i++) {
       // Copy over the inputs into outputs in this batch. Effectively:
-      // outputs(i, :, k) = inputs(i, k). An example of how this algorith works:
+      // outputs(i, :, k) = inputs(i, k). An example of how this algorithm
+      // works:
       //
       //    x = float[1, 3], y = float[2048, 3]
       //    round 0
diff --git a/tensorflow/core/kernels/adjust_hue_op.cc b/tensorflow/core/kernels/adjust_hue_op.cc
index 6079aa749d52c5a3483ac21cd44feef5a3978fb3..52dec94305d3c8558013861a44524609ad6eed7a 100644
--- a/tensorflow/core/kernels/adjust_hue_op.cc
+++ b/tensorflow/core/kernels/adjust_hue_op.cc
@@ -216,8 +216,8 @@ class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, delta_h](
-              int64 start_channel, int64 end_channel) {
+          [&input_data, &output_data, delta_h](int64 start_channel,
+                                               int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc
index 2f125312d00d71a838a9b1eb834c064da541c1ad..150e8fe6379fd2a41778e94df793ba45ef0d309e 100644
--- a/tensorflow/core/kernels/aggregate_ops.cc
+++ b/tensorflow/core/kernels/aggregate_ops.cc
@@ -227,6 +227,7 @@ REGISTER_ADDN_CPU(Variant);
 #if GOOGLE_CUDA
 #define REGISTER_ADDN_GPU(type) REGISTER_ADDN(type, GPU)
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU);
+TF_CALL_int64(REGISTER_ADDN_GPU);
 TF_CALL_complex64(REGISTER_ADDN_GPU);
 TF_CALL_complex128(REGISTER_ADDN_GPU);
 TF_CALL_variant(REGISTER_ADDN_GPU);
diff --git a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
index 3f449be754492bf9034ee68b2ba2571b12960b6f..8fef84305f0ddbf5151db7690912424f1afe71e3 100644
--- a/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/aggregate_ops_gpu.cu.cc
@@ -154,6 +154,7 @@ struct Add9Functor<GPUDevice, T> {
   template struct functor::Add9Functor<GPUDevice, type>;
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_FUNCTORS);
+TF_CALL_int64(REGISTER_FUNCTORS);
 TF_CALL_complex64(REGISTER_FUNCTORS);
 TF_CALL_complex128(REGISTER_FUNCTORS);
 
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 944564dfba62f257ae45b3c5c25d0de64fa0b773..aa9123582210bdf31993e9d8c58ba90cc02acc5e 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -180,7 +180,7 @@ class Barrier : public ResourceBase {
         // SQSS is closed, nothing is left in the incomplete set,
         // the queue is not already marked as closed, and (most
         // importantly), the queue has entries in it.
-        [this, ctx, callback, component_index]() {
+        [this, ctx, callback]() {
           if (!ctx->status().ok()) {
             callback();
             return;
diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h
index 766713a338caf3f9aa317179902c596de3a25cfd..43539ac908ffdcb49d6f35ad3dc8cdc6ce28bc61 100644
--- a/tensorflow/core/kernels/batch_matmul_op_impl.h
+++ b/tensorflow/core/kernels/batch_matmul_op_impl.h
@@ -34,6 +34,10 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 25ae795d8e76d0917f09f2faa7b1cddd754132d1..7bc43be66b35c7366a463de6b749bca99ee30e44 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -32,6 +32,7 @@ TF_CALL_double(REGISTER_BATCH_MATMUL_CPU);
 
 TF_CALL_half(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_int32(REGISTER_BATCH_MATMUL_CPU);
+TF_CALL_int64(REGISTER_BATCH_MATMUL_CPU);
 
 #if GOOGLE_CUDA
 TF_CALL_float(REGISTER_BATCH_MATMUL_GPU);
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index e15ea82e7d694b34cc23b636ebf1b2dcd5c05850..d4f4b43d63b90c22abbbe82263b09353912010c8 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -43,11 +43,12 @@ typedef Eigen::SyclDevice SYCLDevice;
 namespace {
 
 void GetBiasValueDims(const Tensor& value_tensor, TensorFormat data_format,
-                      int32* batch, int32* height, int32* width,
+                      int32* batch, int32* height, int32* width, int32* depth,
                       int32* channel) {
   *batch = 1;
-  *width = 1;
   *height = 1;
+  *width = 1;
+  *depth = 1;
   *channel = 1;
   if (data_format == FORMAT_NHWC) {
     int32 channel_dim = value_tensor.dims() - 1;
@@ -56,14 +57,14 @@ void GetBiasValueDims(const Tensor& value_tensor, TensorFormat data_format,
       *batch *= static_cast<int32>(value_tensor.dim_size(i));
     }
   } else if (data_format == FORMAT_NCHW) {
-    int32 channel_dim = value_tensor.dims() - 3;
-    int32 height_dim = value_tensor.dims() - 2;
-    int32 width_dim = value_tensor.dims() - 1;
-    *channel = static_cast<int32>(value_tensor.dim_size(channel_dim));
-    *height = static_cast<int32>(value_tensor.dim_size(height_dim));
-    *width = static_cast<int32>(value_tensor.dim_size(width_dim));
-    for (int32 i = 0; i < channel_dim; i++) {
-      *batch *= static_cast<int32>(value_tensor.dim_size(i));
+    *batch = static_cast<int32>(value_tensor.dim_size(0));
+    *channel = static_cast<int32>(value_tensor.dim_size(1));
+    *height = static_cast<int32>(value_tensor.dim_size(2));
+    if (value_tensor.dims() > 3) {
+      *width = static_cast<int32>(value_tensor.dim_size(3));
+    }
+    if (value_tensor.dims() > 4) {
+      *depth = static_cast<int32>(value_tensor.dim_size(4));
     }
   }
 }
@@ -109,10 +110,8 @@ class BiasOp : public BinaryOp<T> {
     // Added by intel_tf to support NCHW on CPU regardless of MKL used or not.
     size_t channel_dim;
     if (data_format_ == FORMAT_NCHW) {
-      OP_REQUIRES(context, input.dims() == 4,
-                  errors::InvalidArgument(
-                      "NCHW format supports only 4D input tensor."));
-      channel_dim = 1;
+      channel_dim = 1;  // NCHW always have channel dim in 1 (with 3, 4, 5
+                        // dimensions data).
     } else {
       channel_dim = input.shape().dims() - 1;  // End of code by intel_tf.
     }
@@ -132,14 +131,41 @@ class BiasOp : public BinaryOp<T> {
 
     // Added by intel_tf to support NCHW on CPU regardless of MKL used or not.
     if (data_format_ == FORMAT_NCHW) {
-      int32 batch, height, width, channel;
-      GetBiasValueDims(input, data_format_, &batch, &height, &width, &channel);
-      Eigen::DSizes<Eigen::Index, 4> four_dims(1, channel, 1, 1);
-      Eigen::DSizes<Eigen::Index, 4> broad_cast_dims(batch, 1, height, width);
-      const Device& d = context->eigen_device<Device>();
-      output->tensor<T, 4>().device(d) =
-          input.tensor<T, 4>() +
-          bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims);
+      int32 batch, height, width, depth, channel;
+      GetBiasValueDims(input, data_format_, &batch, &height, &width, &depth,
+                       &channel);
+      switch (input.shape().dims()) {
+        case 3: {
+          Eigen::DSizes<int32, 3> three_dims(1, channel, 1);
+          Eigen::DSizes<int32, 3> broad_cast_dims(batch, 1, height);
+          const Device& d = context->eigen_device<Device>();
+          output->tensor<T, 3>().device(d) =
+              input.tensor<T, 3>() + bias.tensor<T, 1>()
+                                         .reshape(three_dims)
+                                         .broadcast(broad_cast_dims);
+        } break;
+        case 4: {
+          Eigen::DSizes<int32, 4> four_dims(1, channel, 1, 1);
+          Eigen::DSizes<int32, 4> broad_cast_dims(batch, 1, height, width);
+          const Device& d = context->eigen_device<Device>();
+          output->tensor<T, 4>().device(d) =
+              input.tensor<T, 4>() +
+              bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims);
+        } break;
+        case 5: {
+          Eigen::DSizes<int32, 5> four_dims(1, channel, 1, 1, 1);
+          Eigen::DSizes<int32, 5> broad_cast_dims(batch, 1, height, width,
+                                                  depth);
+          const Device& d = context->eigen_device<Device>();
+          output->tensor<T, 5>().device(d) =
+              input.tensor<T, 5>() +
+              bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims);
+        } break;
+        default:
+          OP_REQUIRES(context, false,
+                      errors::InvalidArgument("Only ranks up to 5 supported: ",
+                                              input.shape().DebugString()));
+      }
       return;
     }  // End of code by intel_tf.
 
@@ -229,9 +255,9 @@ class BiasGradOp : public OpKernel {
                         std::numeric_limits<int32>::max()),
         errors::InvalidArgument("BiasGrad requires tensor size <= int32 max"));
 
-    int32 batch, height, width, channel;
+    int32 batch, height, width, depth, channel;
     GetBiasValueDims(output_backprop, data_format_, &batch, &height, &width,
-                     &channel);
+                     &depth, &channel);
     Tensor* output = nullptr;
     TensorShape output_shape{channel};
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
@@ -243,6 +269,7 @@ class BiasGradOp : public OpKernel {
       output->template flat<T>().setZero();
     } else {
       // Added by intel_tf to support NCHW on CPU regardless of MKL used or not.
+      // TODO(yongtang): Add 3/4/5 dimensional data support for NCHW format.
       if (data_format_ == FORMAT_NCHW) {
         OP_REQUIRES(context, output_backprop.dims() == 4,
                     errors::InvalidArgument(
@@ -330,8 +357,9 @@ class BiasOp<GPUDevice, T> : public BinaryOp<T> {
     OP_REQUIRES(context, TensorShapeUtils::IsVector(bias.shape()),
                 errors::InvalidArgument("Biases must be 1D: ",
                                         bias.shape().DebugString()));
-    int32 batch, height, width, channel;
-    GetBiasValueDims(input, data_format_, &batch, &height, &width, &channel);
+    int32 batch, height, width, depth, channel;
+    GetBiasValueDims(input, data_format_, &batch, &height, &width, &depth,
+                     &channel);
     OP_REQUIRES(context, bias.shape().dim_size(0) == channel,
                 errors::InvalidArgument(
                     "Must provide as many biases as the channel dimension "
@@ -344,7 +372,7 @@ class BiasOp<GPUDevice, T> : public BinaryOp<T> {
     if (input.NumElements() > 0) {
       BiasGPU<T>::compute(context->template eigen_device<Device>(),
                           input.flat<T>().data(), bias.flat<T>().data(),
-                          output->flat<T>().data(), batch, width, height,
+                          output->flat<T>().data(), batch, width, height, depth,
                           channel, data_format_);
     }
   }
@@ -516,9 +544,9 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
                 TensorShapeUtils::IsMatrixOrHigher(output_backprop.shape()),
                 errors::InvalidArgument("Input tensor must be at least 2D: ",
                                         output_backprop.shape().DebugString()));
-    int32 batch, height, width, channel;
+    int32 batch, height, width, depth, channel;
     GetBiasValueDims(output_backprop, data_format_, &batch, &height, &width,
-                     &channel);
+                     &depth, &channel);
     Tensor* output = nullptr;
     TensorShape output_shape{channel};
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 1a7211a7cba8db6b3e57327df4018fb2ea0dbd0a..24fea8a8e6f10cea4f74e743c8aa2c6bfb49313f 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -76,9 +76,9 @@ __global__ void BiasNCHWKernel(int32 nthreads, const T* input, const T* bias,
 template <typename T>
 void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
                          T* output, int32 batch, int32 height, int32 width,
-                         int32 channel, TensorFormat data_format) {
+                         int depth, int32 channel, TensorFormat data_format) {
   const int32 bias_size = channel;
-  const int32 image_size = height * width;
+  const int32 image_size = height * width * depth;
   const int32 total_count = batch * bias_size * image_size;
   if (total_count == 0) {
     return;
diff --git a/tensorflow/core/kernels/bias_op_gpu.h b/tensorflow/core/kernels/bias_op_gpu.h
index c1051f43c9f44ec42f7bb679d521b2bcaae03880..a0b2ce4f9b34b0b343de3d09374b07d554c57d15 100644
--- a/tensorflow/core/kernels/bias_op_gpu.h
+++ b/tensorflow/core/kernels/bias_op_gpu.h
@@ -32,7 +32,7 @@ template <typename T>
 struct BiasGPU {
   static void compute(const GPUDevice& d, const T* input, const T* bias,
                       T* output, int32 batch, int32 height, int32 width,
-                      int32 channel, TensorFormat data_format);
+                      int32 depth, int32 channel, TensorFormat data_format);
 };
 
 template <typename T>
diff --git a/tensorflow/core/kernels/bitcast_op.cc b/tensorflow/core/kernels/bitcast_op.cc
index 90825e6d39a1d65673f9e2a502f6dafcbf7ad090..f602cfa428a555970f35b4057c46641a3ba156dd 100644
--- a/tensorflow/core/kernels/bitcast_op.cc
+++ b/tensorflow/core/kernels/bitcast_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/lib/core/casts.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/bitcast_op.h b/tensorflow/core/kernels/bitcast_op.h
index 900ab6f35c15e908a415849784b612da2b6d7c22..1f3659f303338efc69da56da0a67144e9400844b 100644
--- a/tensorflow/core/kernels/bitcast_op.h
+++ b/tensorflow/core/kernels/bitcast_op.h
@@ -25,6 +25,5 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/lib/core/casts.h"
 
 #endif  // TENSORFLOW_CORE_KERNELS_BITCAST_OP_H_
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
index 4e8bfa02fc3a21329e6495fc4ebccf365d3a02a8..8f2c2dbe8a778353dff5e0b8823ac99de68282df 100644
--- a/tensorflow/core/kernels/boosted_trees/BUILD
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -2,7 +2,10 @@
 #   OpKernels for boosted trees ops.
 
 package(
-    default_visibility = ["//tensorflow:internal"],
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+        "//tensorflow:internal",
+    ],
 )
 
 licenses(["notice"])  # Apache 2.0
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 1ab72af05914bc15148fc4caff7a07493c1ff1e5..4e9bab3e21f9f240d32e78a1a489033a693caa73 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -12,6 +12,7 @@ message Node {
     Leaf leaf = 1;
     BucketizedSplit bucketized_split = 2;
     CategoricalSplit categorical_split = 3;
+    DenseSplit dense_split = 4;
   }
   NodeMetadata metadata = 777;
 }
@@ -70,6 +71,19 @@ message CategoricalSplit {
   int32 right_id = 4;
 }
 
+// TODO(nponomareva): move out of boosted_trees and rename to trees.proto
+message DenseSplit {
+  // Float feature column and split threshold describing
+  // the rule feature <= threshold.
+  int32 feature_id = 1;
+  float threshold = 2;
+
+  // Node children indexing into a contiguous
+  // vector of nodes starting from the root.
+  int32 left_id = 3;
+  int32 right_id = 4;
+}
+
 // Tree describes a list of connected nodes.
 // Node 0 must be the root and can carry any payload including a leaf
 // in the case of representing the bias.
diff --git a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc
index d1840941c1d38f7f299788cd739ab055f036c039..81f04732d331a7eccb825642283cd27d63e35a79 100644
--- a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc
@@ -29,6 +29,7 @@
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
 
@@ -151,8 +152,14 @@ class BoostedTreesMakeQuantileSummariesOp : public OpKernel {
     const Tensor* example_weights_t;
     OP_REQUIRES_OK(context,
                    context->input(kExampleWeightsName, &example_weights_t));
+    DCHECK(float_features_list.size() > 0) << "Got empty feature list";
     auto example_weights = example_weights_t->flat<float>();
-    const int64 batch_size = example_weights.size();
+    const int64 weight_size = example_weights.size();
+    const int64 batch_size = float_features_list[0].flat<float>().size();
+    OP_REQUIRES(
+        context, weight_size == 1 || weight_size == batch_size,
+        errors::InvalidArgument(strings::Printf(
+            "Weights should be a single value or same size as features.")));
     const Tensor* epsilon_t;
     OP_REQUIRES_OK(context, context->input(kEpsilonName, &epsilon_t));
     float epsilon = epsilon_t->scalar<float>()();
@@ -168,7 +175,9 @@ class BoostedTreesMakeQuantileSummariesOp : public OpKernel {
         QuantileStream stream(epsilon, batch_size + 1);
         // Run quantile summary generation.
         for (int64 j = 0; j < batch_size; j++) {
-          stream.PushEntry(feature_values(j), example_weights(j));
+          stream.PushEntry(feature_values(j), (weight_size > 1)
+                                                  ? example_weights(j)
+                                                  : example_weights(0));
         }
         stream.Finalize();
         const auto summary_entry_list = stream.GetFinalSummary().GetEntryList();
@@ -263,6 +272,57 @@ REGISTER_KERNEL_BUILDER(
     Name("BoostedTreesQuantileStreamResourceAddSummaries").Device(DEVICE_CPU),
     BoostedTreesQuantileStreamResourceAddSummariesOp);
 
+class BoostedTreesQuantileStreamResourceDeserializeOp : public OpKernel {
+ public:
+  explicit BoostedTreesQuantileStreamResourceDeserializeOp(
+      OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr(kNumStreamsName, &num_features_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    QuantileStreamResource* streams_resource;
+    // Create a reference to the underlying resource using the handle.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &streams_resource));
+    // Remove the reference at the end of this scope.
+    mutex_lock l(*streams_resource->mutex());
+    core::ScopedUnref unref_me(streams_resource);
+
+    OpInputList bucket_boundaries_list;
+    OP_REQUIRES_OK(context, context->input_list(kBucketBoundariesName,
+                                                &bucket_boundaries_list));
+
+    auto do_quantile_deserialize = [&](const int64 begin, const int64 end) {
+      // Iterating over all streams.
+      for (int64 stream_idx = begin; stream_idx < end; stream_idx++) {
+        const Tensor& bucket_boundaries_t = bucket_boundaries_list[stream_idx];
+        const auto& bucket_boundaries = bucket_boundaries_t.vec<float>();
+        std::vector<float> result;
+        result.reserve(bucket_boundaries.size());
+        for (size_t i = 0; i < bucket_boundaries.size(); ++i) {
+          result.push_back(bucket_boundaries(i));
+        }
+        streams_resource->set_boundaries(result, stream_idx);
+      }
+    };
+
+    // TODO(tanzheny): comment on the magic number.
+    const int64 kCostPerUnit = 500 * num_features_;
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *context->device()->tensorflow_cpu_worker_threads();
+    Shard(worker_threads.num_threads, worker_threads.workers, num_features_,
+          kCostPerUnit, do_quantile_deserialize);
+  }
+
+ private:
+  int64 num_features_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesQuantileStreamResourceDeserialize").Device(DEVICE_CPU),
+    BoostedTreesQuantileStreamResourceDeserializeOp);
+
 class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel {
  public:
   explicit BoostedTreesQuantileStreamResourceFlushOp(
@@ -409,28 +469,29 @@ class BoostedTreesBucketizeOp : public OpKernel {
         const int64 num_values = values_tensor.dim_size(0);
 
         Tensor* output_t = nullptr;
-        OP_REQUIRES_OK(
-            context, buckets_list.allocate(
-                         feature_idx, TensorShape({num_values, 1}), &output_t));
-        auto output = output_t->matrix<int32>();
+        OP_REQUIRES_OK(context,
+                       buckets_list.allocate(
+                           feature_idx, TensorShape({num_values}), &output_t));
+        auto output = output_t->flat<int32>();
 
         const std::vector<float>& bucket_boundaries_vector =
             GetBuckets(feature_idx, bucket_boundaries_list);
-        CHECK(!bucket_boundaries_vector.empty())
-            << "Got empty buckets for feature " << feature_idx;
         auto flat_values = values_tensor.flat<float>();
+        const auto& iter_begin = bucket_boundaries_vector.begin();
+        const auto& iter_end = bucket_boundaries_vector.end();
         for (int64 instance = 0; instance < num_values; instance++) {
+          if (iter_begin == iter_end) {
+            output(instance) = 0;
+            continue;
+          }
           const float value = flat_values(instance);
-          auto bucket_iter =
-              std::lower_bound(bucket_boundaries_vector.begin(),
-                               bucket_boundaries_vector.end(), value);
-          if (bucket_iter == bucket_boundaries_vector.end()) {
+          auto bucket_iter = std::lower_bound(iter_begin, iter_end, value);
+          if (bucket_iter == iter_end) {
             --bucket_iter;
           }
-          const int32 bucket = static_cast<int32>(
-              bucket_iter - bucket_boundaries_vector.begin());
+          const int32 bucket = static_cast<int32>(bucket_iter - iter_begin);
           // Bucket id.
-          output(instance, 0) = bucket;
+          output(instance) = bucket;
         }
       }
     };
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h
index 31d7fe25a477c3a2374d95749c5ff940ac2311d5..5690c3a601466cb525af66ce2e46e9ad7bec9443 100644
--- a/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/weighted_quantiles_summary.h
@@ -39,7 +39,7 @@ class WeightedQuantilesSummary {
       // Explicitly initialize all of memory (including padding from memory
       // alignment) to allow the struct to be msan-resistant "plain old data".
       //
-      // POD = http://en.cppreference.com/w/cpp/concept/PODType
+      // POD = https://en.cppreference.com/w/cpp/named_req/PODType
       memset(this, 0, sizeof(*this));
 
       value = v;
diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/bounds_check.h
index 18727c0db32ba4379ebec0e58bd2a41fe8b058f1..ce6ec1012daacf915fee0ee7bb059306058361d5 100644
--- a/tensorflow/core/kernels/bounds_check.h
+++ b/tensorflow/core/kernels/bounds_check.h
@@ -16,39 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
 #define TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
 
-#include <type_traits>
-
-#include "third_party/eigen3/Eigen/Core"
-#include "tensorflow/core/platform/macros.h"
-
-namespace tensorflow {
-
-// Check that 0 <= index < limit using a single comparison, assuming
-// that 0 <= limit if Index is signed.  Intended for use in performance
-// critical contexts where 0 <= index < limit is almost always true.
-template <typename Ta, typename Tb>
-EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool FastBoundsCheck(const Ta index,
-                                                           const Tb limit) {
-  static_assert(std::is_integral<Ta>::value && std::is_integral<Tb>::value,
-                "FastBoundsCheck can only be used on integer types.");
-  typedef typename std::make_unsigned<decltype(index + limit)>::type UIndex;
-  return TF_PREDICT_TRUE(static_cast<UIndex>(index) <
-                         static_cast<UIndex>(limit));
-}
-
-namespace internal {
-// Ensure that the compiler cannot elide a copy into a local, for
-// bounds checking on source tensors that might be updated asynchronously.
-// This function may only be used on primitive integral types (int32, int64,
-// etc).  It does not guarantee any atomicity or barriers.
-template <typename T>
-EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC const T SubtleMustCopy(const T &x) {
-  static_assert(std::is_integral<T>::value,
-                "SubtleMustCopy can only be used on integer types.");
-  auto *to_x = reinterpret_cast<const volatile T *>(&x);
-  return *to_x;
-}
-}  // namespace internal
-}  // namespace tensorflow
+#include "tensorflow/core/framework/bounds_check.h"
 
 #endif  // TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
diff --git a/tensorflow/core/kernels/broadcast_to_op.h b/tensorflow/core/kernels/broadcast_to_op.h
index a2327a7272e67de450e8133b8ccdff58d67bb64d..bc11c5f914bfcbcbbc4445cace7126717f3d8d2d 100644
--- a/tensorflow/core/kernels/broadcast_to_op.h
+++ b/tensorflow/core/kernels/broadcast_to_op.h
@@ -16,36 +16,66 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_BROADCAST_TO_OP_H_
 #define TENSORFLOW_CORE_KERNELS_BROADCAST_TO_OP_H_
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/fill_functor.h"
 
 namespace tensorflow {
 
 namespace functor {
 
+#define BROADCAST_SHAPE(NDIMS, input_shape, output_shape)                 \
+  auto reshape = AsEigenDSizesWithPrefix<NDIMS>(input_shape);             \
+  auto broadcast = output_shape.AsEigenDSizes<NDIMS>();                   \
+  auto reshape_32bit = AsEigenDSizesWithPrefix<NDIMS, int>(input_shape);  \
+  auto broadcast_32bit = output_shape.AsEigenDSizes<NDIMS, int>();        \
+  if (input_shape.dims() > 0) {                                           \
+    for (int i = 0; i < NDIMS; i++) {                                     \
+      if (reshape[i] != broadcast[i]) {                                   \
+        OP_REQUIRES(                                                      \
+            ctx, ((reshape[i] != 0) && (broadcast[i] % reshape[i] == 0)), \
+            errors::InvalidArgument("invalid shape to broadcast from ",   \
+                                    input_shape.DebugString(), " to ",    \
+                                    output_shape.DebugString()));         \
+        broadcast[i] = broadcast[i] / reshape[i];                         \
+      } else {                                                            \
+        broadcast[i] = 1;                                                 \
+      }                                                                   \
+      if (can_use_32bit) {                                                \
+        broadcast_32bit[i] = static_cast<int>(broadcast[i]);              \
+      }                                                                   \
+    }                                                                     \
+  }
+
+#define HANDLE_BROADCAST_FROM_SCALAR()                              \
+  if (std::is_same<Eigen::GpuDevice, Device>::value) {              \
+    FillFunctor<Device, T>()(d, output_tensor.flat<T>(),            \
+                             input_tensor.scalar<T>());             \
+  } else {                                                          \
+    output.device(d) = output.constant(input_tensor.scalar<T>()()); \
+  }
+
+#define HANDLE_BROADCAST_CASE(dim_i)                                        \
+  case dim_i: {                                                             \
+    if (can_use_32bit) {                                                    \
+      auto input = input_tensor.tensor<T, dim_i>();                         \
+      To32Bit(output).device(d) =                                           \
+          To32Bit(input).reshape(reshape_32bit).broadcast(broadcast_32bit); \
+    } else {                                                                \
+      auto input = input_tensor.tensor<T, dim_i>();                         \
+      output.device(d) = input.reshape(reshape).broadcast(broadcast);       \
+    }                                                                       \
+  } break
+
 template <typename Device, typename T>
 struct BroadcastTo {
   void operator()(const Device &d, OpKernelContext *ctx, Tensor &output_tensor,
                   const TensorShape &output_shape, const Tensor &input_tensor,
                   const TensorShape &input_shape) {
-#define BROADCAST_SHAPE(broadcast, reshape, NDIMS, input_shape, output_shape) \
-  for (int i = 0; i < NDIMS; i++) {                                           \
-    if (reshape[i] != broadcast[i]) {                                         \
-      OP_REQUIRES(ctx,                                                        \
-                  ((reshape[i] != 0) && (broadcast[i] % reshape[i] == 0)),    \
-                  errors::InvalidArgument("invalid shape to broadcast from ", \
-                                          input_shape.DebugString(), " to ",  \
-                                          output_shape.DebugString()));       \
-      broadcast[i] = broadcast[i] / reshape[i];                               \
-    } else {                                                                  \
-      broadcast[i] = 1;                                                       \
-    }                                                                         \
-  }
-
     if (output_shape.num_elements() == 0) {
       return;
     }
@@ -54,6 +84,10 @@ struct BroadcastTo {
       return;
     }
 
+    const bool can_use_32bit = std::is_same<Eigen::GpuDevice, Device>::value &&
+                               output_tensor.NumElements() < kint32max &&
+                               input_tensor.NumElements() < kint32max;
+
     switch (output_shape.dims()) {
       case 0: {
         if (input_shape.dims() > 0) {
@@ -66,20 +100,14 @@ struct BroadcastTo {
         break;
       }
       case 1: {
-        auto reshape = AsEigenDSizesWithPrefix<1>(input_shape);
-        auto broadcast = output_shape.AsEigenDSizes<1>();
-
-        BROADCAST_SHAPE(broadcast, reshape, 1, input_shape, output_shape);
+        BROADCAST_SHAPE(1, input_shape, output_shape);
 
         auto output = output_tensor.tensor<T, 1>();
         switch (input_shape.dims()) {
           case 0: {
-            output.device(d) = output.constant(input_tensor.scalar<T>()());
-          } break;
-          case 1: {
-            auto input = input_tensor.tensor<T, 1>();
-            output.device(d) = input.broadcast(broadcast);
+            HANDLE_BROADCAST_FROM_SCALAR();
           } break;
+            HANDLE_BROADCAST_CASE(1);
           default:
             ctx->CtxFailure(errors::InvalidArgument(
                 "invalid shape to broadcast from ", input_shape.DebugString(),
@@ -88,24 +116,14 @@ struct BroadcastTo {
         }
       } break;
       case 2: {
-        auto reshape = AsEigenDSizesWithPrefix<2>(input_shape);
-        auto broadcast = output_shape.AsEigenDSizes<2>();
-
-        BROADCAST_SHAPE(broadcast, reshape, 2, input_shape, output_shape);
-
+        BROADCAST_SHAPE(2, input_shape, output_shape);
         auto output = output_tensor.tensor<T, 2>();
         switch (input_shape.dims()) {
           case 0: {
-            output.device(d) = output.constant(input_tensor.scalar<T>()());
-          } break;
-          case 1: {
-            auto input = input_tensor.tensor<T, 1>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 2: {
-            auto input = input_tensor.tensor<T, 2>();
-            output.device(d) = input.broadcast(broadcast);
+            HANDLE_BROADCAST_FROM_SCALAR();
           } break;
+            HANDLE_BROADCAST_CASE(1);
+            HANDLE_BROADCAST_CASE(2);
           default:
             ctx->CtxFailure(errors::InvalidArgument(
                 "invalid shape to broadcast from ", input_shape.DebugString(),
@@ -114,28 +132,15 @@ struct BroadcastTo {
         }
       } break;
       case 3: {
-        auto reshape = AsEigenDSizesWithPrefix<3>(input_shape);
-        auto broadcast = output_shape.AsEigenDSizes<3>();
-
-        BROADCAST_SHAPE(broadcast, reshape, 3, input_shape, output_shape);
-
+        BROADCAST_SHAPE(3, input_shape, output_shape);
         auto output = output_tensor.tensor<T, 3>();
         switch (input_shape.dims()) {
           case 0: {
-            output.device(d) = output.constant(input_tensor.scalar<T>()());
-          } break;
-          case 1: {
-            auto input = input_tensor.tensor<T, 1>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 2: {
-            auto input = input_tensor.tensor<T, 2>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 3: {
-            auto input = input_tensor.tensor<T, 3>();
-            output.device(d) = input.broadcast(broadcast);
+            HANDLE_BROADCAST_FROM_SCALAR();
           } break;
+            HANDLE_BROADCAST_CASE(1);
+            HANDLE_BROADCAST_CASE(2);
+            HANDLE_BROADCAST_CASE(3);
           default:
             ctx->CtxFailure(errors::InvalidArgument(
                 "invalid shape to broadcast from ", input_shape.DebugString(),
@@ -144,31 +149,16 @@ struct BroadcastTo {
         }
       } break;
       case 4: {
-        auto reshape = AsEigenDSizesWithPrefix<4>(input_shape);
-        auto broadcast = output_shape.AsEigenDSizes<4>();
-
-        BROADCAST_SHAPE(broadcast, reshape, 4, input_shape, output_shape);
+        BROADCAST_SHAPE(4, input_shape, output_shape);
         auto output = output_tensor.tensor<T, 4>();
         switch (input_shape.dims()) {
           case 0: {
-            output.device(d) = output.constant(input_tensor.scalar<T>()());
-          } break;
-          case 1: {
-            auto input = input_tensor.tensor<T, 1>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 2: {
-            auto input = input_tensor.tensor<T, 2>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 3: {
-            auto input = input_tensor.tensor<T, 3>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 4: {
-            auto input = input_tensor.tensor<T, 4>();
-            output.device(d) = input.broadcast(broadcast);
+            HANDLE_BROADCAST_FROM_SCALAR();
           } break;
+            HANDLE_BROADCAST_CASE(1);
+            HANDLE_BROADCAST_CASE(2);
+            HANDLE_BROADCAST_CASE(3);
+            HANDLE_BROADCAST_CASE(4);
           default:
             ctx->CtxFailure(errors::InvalidArgument(
                 "invalid shape to broadcast from ", input_shape.DebugString(),
@@ -177,35 +167,17 @@ struct BroadcastTo {
         }
       } break;
       case 5: {
-        auto reshape = AsEigenDSizesWithPrefix<5>(input_shape);
-        auto broadcast = output_shape.AsEigenDSizes<5>();
-
-        BROADCAST_SHAPE(broadcast, reshape, 5, input_shape, output_shape);
+        BROADCAST_SHAPE(5, input_shape, output_shape);
         auto output = output_tensor.tensor<T, 5>();
         switch (input_shape.dims()) {
           case 0: {
-            output.device(d) = output.constant(input_tensor.scalar<T>()());
-          } break;
-          case 1: {
-            auto input = input_tensor.tensor<T, 1>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 2: {
-            auto input = input_tensor.tensor<T, 2>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 3: {
-            auto input = input_tensor.tensor<T, 3>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 4: {
-            auto input = input_tensor.tensor<T, 4>();
-            output.device(d) = input.reshape(reshape).broadcast(broadcast);
-          } break;
-          case 5: {
-            auto input = input_tensor.tensor<T, 5>();
-            output.device(d) = input.broadcast(broadcast);
+            HANDLE_BROADCAST_FROM_SCALAR();
           } break;
+            HANDLE_BROADCAST_CASE(1);
+            HANDLE_BROADCAST_CASE(2);
+            HANDLE_BROADCAST_CASE(3);
+            HANDLE_BROADCAST_CASE(4);
+            HANDLE_BROADCAST_CASE(5);
           default:
             ctx->CtxFailure(errors::InvalidArgument(
                 "invalid shape to broadcast from ", input_shape.DebugString(),
@@ -222,20 +194,25 @@ struct BroadcastTo {
   }
 
  private:
-  template <int NDIMS>
-  Eigen::DSizes<Eigen::DenseIndex, NDIMS> AsEigenDSizesWithPrefix(
+  template <int NDIMS, typename DimType = Eigen::DenseIndex>
+  Eigen::DSizes<DimType, NDIMS> AsEigenDSizesWithPrefix(
       const TensorShape &shape) const {
-    Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
+    Eigen::DSizes<DimType, NDIMS> dsizes;
     for (int d = 0; d < NDIMS - shape.dims(); d++) {
       dsizes[d] = 1;
     }
     for (int d = NDIMS - shape.dims(); d < NDIMS; d++) {
-      dsizes[d] = shape.dim_size(d - (NDIMS - shape.dims()));
+      dsizes[d] =
+          static_cast<DimType>(shape.dim_size(d - (NDIMS - shape.dims())));
     }
     return dsizes;
   }
 };
 
+#undef BROADCAST_SHAPE
+#undef HANDLE_BROADCAST_FROM_SCALAR
+#undef HANDLE_BROADCAST_CASE
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/captured_function.h b/tensorflow/core/kernels/captured_function.h
deleted file mode 100644
index 2d2d87134e786139386509c6e5f353bb88882915..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/captured_function.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
-#define TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
-
-#include "tensorflow/core/kernels/data/captured_function.h"
-
-#endif  // TENSORFLOW_CORE_KERNELS_CAPTURED_FUNCTION_H_
diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
index 87bdba14550918b777363dd2077e4199d99d658f..f9f10c1b42f2ed6d2012798c8f720bbb9d211f5c 100644
--- a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
@@ -60,9 +60,9 @@ template <typename T>
 struct CheckNumericsLaunch {
   void Run(const GPUDevice &d, const T *data, int size,
            int abnormal_detected[2]) {
-    const int32 block_size = d.maxCudaThreadsPerBlock();
+    const int32 block_size = d.maxGpuThreadsPerBlock();
     const int32 num_blocks =
-        (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+        (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
         block_size;
 
     CheckNumericsKernel<T><<<num_blocks, block_size, 0, d.stream()>>>(
diff --git a/tensorflow/core/kernels/constant_op_test.cc b/tensorflow/core/kernels/constant_op_test.cc
index 0faad11e4721c9c575ef29591b30135b256bf41c..3988c190e701c8eb0d3163ec26ddefc5aba93541 100644
--- a/tensorflow/core/kernels/constant_op_test.cc
+++ b/tensorflow/core/kernels/constant_op_test.cc
@@ -79,7 +79,7 @@ void ConstantOpTest::PersistentMemoryTrackingTest(bool on_gpu) {
   }
 
   // Remove memory leak errors.
-  for (auto allocator_pair : ctx.wrapped_allocators()) {
+  for (auto allocator_pair : ctx.ConsumeWrappedAllocators()) {
     allocator_pair.second->GetRecordsAndUnRef();
   }
 }
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index fd3a0ad422372f84669d34b33b4931c88c0b6730..36def4a53065e2c6ac68a8b67818096012104753 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -70,9 +70,15 @@ void SwitchOp::Compute(OpKernelContext* context) {
 TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH);
 TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_SWITCH);
+TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_REF_SWITCH);
+REGISTER_CPU_SWITCH(uint64);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_SWITCH);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_SWITCH);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_SWITCH);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_SWITCH);
+REGISTER_GPU_SWITCH(uint64);
+TF_CALL_variant(REGISTER_GPU_SWITCH);
 
 #undef REGISTER_CPU_SWITCH
 #undef REGISTER_CPU_REF_SWITCH
@@ -256,8 +262,11 @@ REGISTER_KERNEL_BUILDER(Name("RefMerge").Device(DEVICE_CPU), MergeOp);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_KERNEL);
+TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
+REGISTER_GPU_KERNEL(uint64);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
@@ -592,6 +601,13 @@ LoopCondOp::LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
 LoopCondOp::~LoopCondOp() = default;
 
 void LoopCondOp::Compute(OpKernelContext* context) {
+  CancellationManager* cm = context->cancellation_manager();
+  if (cm != nullptr) {
+    bool already_cancelled = cm->IsCancelled();
+    OP_REQUIRES(context, !already_cancelled,
+                errors::Cancelled("Loop execution was cancelled."));
+  }
+
   context->set_output(0, context->input(0));
 }
 
diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
index 639c3062cc689f21359914f1848c6dbb21d97c6d..1bac2a18c30c841b7431e6a12063eba508e54d86 100644
--- a/tensorflow/core/kernels/conv_2d.h
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -51,42 +51,47 @@ struct InflatePadAndShuffle {
   }
 };
 
-template <typename Device, typename Input, typename Filter, typename Output>
+template <typename Device, typename Input, typename Filter, typename Output,
+          typename OutputKernel>
 void SpatialConvolutionFunc(const Device& d, Output output, Input input,
                             Filter filter, int row_stride, int col_stride,
                             int row_dilation, int col_dilation,
-                            const Eigen::PaddingType& padding) {
+                            const Eigen::PaddingType& padding,
+                            const OutputKernel& output_kernel) {
   // Need to swap row/col when calling Eigen.
   output.device(d) =
       Eigen::SpatialConvolution(input, filter, col_stride, row_stride, padding,
-                                col_dilation, row_dilation);
+                                col_dilation, row_dilation, output_kernel);
 }
 
-template <typename Device, typename T>
+template <typename Device, typename T,
+          typename OutputKernel = const Eigen::NoOpOutputKernel>
 struct SpatialConvolution {
   void operator()(const Device& d, typename TTypes<T, 4>::Tensor output,
                   typename TTypes<T, 4>::ConstTensor input,
                   typename TTypes<T, 4>::ConstTensor filter, int row_stride,
                   int col_stride, int row_dilation, int col_dilation,
-                  const Eigen::PaddingType& padding) {
+                  const Eigen::PaddingType& padding,
+                  const OutputKernel& output_kernel = OutputKernel()) {
     SpatialConvolutionFunc(d, output, input, filter, row_stride, col_stride,
-                           row_dilation, col_dilation, padding);
+                           row_dilation, col_dilation, padding, output_kernel);
   }
 };
 
-template <typename Device>
-struct SpatialConvolution<Device, Eigen::half> {
+template <typename Device, typename OutputKernel>
+struct SpatialConvolution<Device, Eigen::half, OutputKernel> {
   void operator()(const Device& d,
                   typename TTypes<Eigen::half, 4>::Tensor output,
                   typename TTypes<Eigen::half, 4>::ConstTensor input,
                   typename TTypes<Eigen::half, 4>::ConstTensor filter,
                   int row_stride, int col_stride, int row_dilation,
-                  int col_dilation, const Eigen::PaddingType& padding) {
+                  int col_dilation, const Eigen::PaddingType& padding,
+                  const OutputKernel& output_kernel = OutputKernel()) {
     output.device(d) =
         Eigen::SpatialConvolution(input.cast<float>(), filter.cast<float>(),
                                   col_stride, row_stride, padding, col_dilation,
-                                  row_dilation)
-            .cast<Eigen::half>();
+                                  row_dilation, output_kernel)
+            .template cast<Eigen::half>();
   }
 };
 
@@ -124,7 +129,8 @@ struct SpatialConvolutionBackwardFilter {
 // TODO(vrv): Figure out how to use the MatMulFunctor in matmul_op.h.
 // My initial attempt to do this compiled but failed in the pytest
 // due to a swigdeps error.
-template <typename Device, typename T>
+template <typename Device, typename T,
+          typename OutputKernel = const Eigen::NoOpOutputKernel>
 struct MatMulConvFunctor {
   // Computes on device "d": out = in0 * in1, where * is matrix
   // multiplication.
@@ -132,8 +138,9 @@ struct MatMulConvFunctor {
       const Device& d, typename TTypes<T, 2>::Tensor out,
       typename TTypes<T, 2>::ConstTensor in0,
       typename TTypes<T, 2>::ConstTensor in1,
-      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
-    out.device(d) = in0.contract(in1, dim_pair);
+      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair,
+      const OutputKernel& output_kernel = OutputKernel()) {
+    out.device(d) = in0.contract(in1, dim_pair, output_kernel);
   }
 };
 
@@ -155,7 +162,7 @@ struct TransformFilter {
     merged_dims[1] = in.dimension(NDIMS - 2);  // input filters
     merged_dims[2] = in.dimension(NDIMS - 1);  // output filters
 
-    CHECK(dst_filter_format == FORMAT_OIHW)
+    DCHECK(dst_filter_format == FORMAT_OIHW)
         << "Unsupported destination filter format: "
         << ToString(dst_filter_format);
     // Source filter format is FORMAT_HWIO and spatial dimensions HW are merged
diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d117574284065ff8fcf62d913257b0ccdd497e5
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu.h
@@ -0,0 +1,1007 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "cuda/include/cuda.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename T, bool conjugate>
+struct maybe_conj {
+  __device__ static __inline__ T run(T x) {
+    if (conjugate) {
+      return Eigen::numext::conj(x);
+    } else {
+      return x;
+    }
+  }
+};
+
+// Partial specializations for Cuda types used to store complex numbers.
+template <bool conjugate>
+struct maybe_conj<float2, conjugate> {
+  __device__ static __inline__ float2 run(float2 c) {
+    if (conjugate) {
+      float2 c_conj;
+      c_conj.x = c.x;
+      c_conj.y = -c.y;
+      return c_conj;
+    } else {
+      return c;
+    }
+  }
+};
+
+template <bool conjugate>
+struct maybe_conj<double2, conjugate> {
+  __device__ static __inline__ double2 run(double2 c) {
+    if (conjugate) {
+      double2 c_conj;
+      c_conj.x = c.x;
+      c_conj.y = -c.y;
+      return c_conj;
+    } else {
+      return c;
+    }
+  }
+};
+
+// TODO(mjanusz): Move this to a shared util file.
+// A simple array that contains data that can be passed between CPU and GPU.
+template <typename T, int IndexCount, T DefaultValue>
+struct Array {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[](int index) const {
+    return data[index];
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[](int index) {
+    return data[index];
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array() {
+    for (int i = 0; i < IndexCount; i++) {
+      data[i] = DefaultValue;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(T a0) {
+    data[0] = a0;
+    for (int i = 1; i < IndexCount; i++) {
+      data[i] = DefaultValue;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(T a0, T a1) {
+    data[0] = a0;
+    data[1] = a1;
+    for (int i = 2; i < IndexCount; i++) {
+      data[i] = DefaultValue;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(T a0, T a1, T a2) {
+    data[0] = a0;
+    data[1] = a1;
+    data[2] = a2;
+    for (int i = 3; i < IndexCount; i++) {
+      data[i] = DefaultValue;
+    }
+  }
+  EIGEN_STRONG_INLINE Array(const std::array<T, IndexCount>& array) {
+    for (int i = 0; i < IndexCount; i++) {
+      data[i] = array[i];
+    }
+  }
+  T data[IndexCount];
+};
+
+// A dimension type with compile-time known size.
+template <int IndexCount>
+struct Dimension : Array<int, IndexCount, 1> {
+  typedef Array<int, IndexCount, 1> Base;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension() : Base() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension(int a0) : Base(a0) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension(int a0, int a1)
+      : Base(a0, a1) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension(int a0, int a1, int a2)
+      : Base(a0, a1, a2) {}
+  EIGEN_STRONG_INLINE Dimension(const std::array<int, IndexCount>& array)
+      : Base(array) {}
+};
+
+// An index type with compile-time known size.
+template <int IndexCount>
+struct Index : Array<int, IndexCount, 0> {
+  typedef Array<int, IndexCount, 0> Base;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index() : Base() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index(int a0) : Base(a0) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index(int a0, int a1) : Base(a0, a1) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index(int a0, int a1, int a2)
+      : Base(a0, a1, a2) {}
+};
+
+// A helper function that converts a tensor index into a flat array index.
+template <int IndexCount>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int TensorIndexToFlat(
+    const Index<IndexCount>& index, const Dimension<IndexCount>& dims) {
+  int flat_index = index[0];
+  for (int i = 1; i < IndexCount; i++) {
+    flat_index = flat_index * dims[i] + index[i];
+  }
+  return flat_index;
+}
+
+// A helper function that converts a flat array index into a tensor index.
+template <int IndexCount>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index<IndexCount> FlatToTensorIndex(
+    int index, const Dimension<IndexCount>& dims) {
+  Index<IndexCount> tensor_index;
+  for (int i = IndexCount - 1; i >= 0; i--) {
+    int new_index = index / dims[i];
+    tensor_index[i] = index - dims[i] * new_index;
+    index = new_index;
+  }
+  return tensor_index;
+}
+
+// A simple CUDA custom kernel to shuffle dimensions of a 3D tensor according to
+// the given shuffle permutation in template parameters. Shuffle permutation
+// <sp0, sp1, sp2> shuffles dimensions such that input dimension 0 goes to sp0,
+// 1 goes to sp1 and 2 goes to sp2. For example, shuffle permutation <2, 0, 1>
+// will populate output so that input[x][y][z] is equal to (*output)[y][z][x].
+//
+// Requires that nthreads is equal to the total number of elements in the input
+// tensor.
+template <typename T, int sp0, int sp1, int sp2, bool conjugate = false>
+__global__ void ShuffleInTensor3Simple(int nthreads, const T* input,
+                                       Dimension<3> input_dims, T* output) {
+  Dimension<3> output_dims;
+  output_dims[sp0] = input_dims[0];
+  output_dims[sp1] = input_dims[1];
+  output_dims[sp2] = input_dims[2];
+
+  // Iterate over output as opposed to iterating over input for better
+  // performance. Iterating over output will generate sequential writes and
+  // random reads that performs better compared to sequential reads and random
+  // writes.
+  CUDA_1D_KERNEL_LOOP(output_index, nthreads) {
+    Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims);
+
+    Index<3> input_tensor_index;
+    input_tensor_index[0] = output_tensor_index[sp0];
+    input_tensor_index[1] = output_tensor_index[sp1];
+    input_tensor_index[2] = output_tensor_index[sp2];
+
+    int input_index = TensorIndexToFlat(input_tensor_index, input_dims);
+
+    output[output_index] =
+        maybe_conj<T, conjugate>::run(ldg(input + input_index));
+  }
+}
+
+// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor,
+// where dimensions are zero-based: output[i][j][k] = input[i][k][j].
+//
+// Each thread block operates on a single tile, a rectangle of dimensions
+// TileSizeI x TileSizeJ.
+//
+// In general, for best performance, you should probably set TileSizeI,
+// TileSizeJ equal to the number of threads in a warp (32 in nvidia GPUs).
+// With a TileSizeI, TileSizeJ of 32, NumThreads of 128 or 256 seems to get
+// the best performance on K40 GPUs.
+template <typename T, int NumThreads, int TileSizeI, int TileSizeJ,
+          bool conjugate = false>
+__global__ void SwapDimension1And2InTensor3UsingTiles(
+    const T* __restrict__ input, Dimension<3> input_dims,
+    T* __restrict__ output) {
+  eigen_assert(blockDim.x == NumThreads);
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  constexpr int ReadRowPerPass = NumThreads / TileSizeJ;
+  constexpr int WriteRowPerPass = NumThreads / TileSizeI;
+  // One extra line in the inner dimension to avoid share memory bank conflict.
+  // This is to mimic the following, but no constructor of T can be invoked.
+  //     __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
+  __shared__ __align__(
+      alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
+  typedef T(*SharedMemoryTile)[TileSizeJ + 1];
+  SharedMemoryTile shared_memory_tile =
+      reinterpret_cast<SharedMemoryTile>(shared_mem_raw);
+
+  int x = threadIdx.x;
+
+  Dimension<3> output_dims = {
+      input_dims[0],
+      input_dims[2],
+      input_dims[1],
+  };
+
+  Dimension<3> input_dims_in_tiles = {
+      input_dims[0],
+      (input_dims[1] + TileSizeI - 1) / TileSizeI,
+      (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
+  };
+
+  Index<3> input_tile_index =
+      FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
+
+  Index<3> input_tile_origin = {
+      input_tile_index[0],
+      input_tile_index[1] * TileSizeI,
+      input_tile_index[2] * TileSizeJ,
+  };
+
+  int input_origin_flat_index =
+      TensorIndexToFlat(input_tile_origin, input_dims);
+
+  bool full_tile = true;
+  int tile_width = TileSizeJ;
+
+  // Only the last row or column may not have the full size.
+  if (input_tile_index[2] == input_dims_in_tiles[2] - 1) {
+    tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSizeJ;
+    full_tile &= false;
+  }
+
+  int tile_height = TileSizeI;
+
+  if (input_tile_index[1] == input_dims_in_tiles[1] - 1) {
+    tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSizeI;
+    full_tile &= false;
+  }
+
+  // Calculate effective thread number. This ensures that we use the largest
+  // number of threads available to form a regular thread block with no
+  // trailing incomplete lines.
+  constexpr int in_effective_thread_num = NumThreads / TileSizeJ * TileSizeJ;
+
+  if (x < in_effective_thread_num) {
+    // Orient the logical thread block with respect to the input array.
+    // ie. align the contiguous dimension of thread blocks with the contiguous
+    // dimension of the input array.
+    int ti = x / TileSizeJ;
+    int tj = x % TileSizeJ;
+    int input_index = input_origin_flat_index + ti * input_dims[2] + tj;
+    int input_increment = ReadRowPerPass * input_dims[2];
+
+    if (full_tile) {
+#pragma unroll
+      for (int i_loc = ti; i_loc < (TileSizeI); i_loc += ReadRowPerPass) {
+        shared_memory_tile[i_loc][tj] =
+            maybe_conj<T, conjugate>::run(input[input_index]);
+        input_index += input_increment;
+      }
+    } else {
+      if (tj < tile_width) {
+        for (int i_loc = ti; i_loc < (tile_height); i_loc += ReadRowPerPass) {
+          shared_memory_tile[i_loc][tj] =
+              maybe_conj<T, conjugate>::run(input[input_index]);
+          input_index += input_increment;
+        }
+      }
+    }
+  }
+
+  __syncthreads();
+
+  Index<3> output_tile_index = {
+      input_tile_index[0],
+      input_tile_index[2],
+      input_tile_index[1],
+  };
+
+  Index<3> output_tile_origin = {
+      output_tile_index[0],
+      output_tile_index[1] * TileSizeJ,
+      output_tile_index[2] * TileSizeI,
+  };
+
+  int output_origin_flat_index =
+      TensorIndexToFlat(output_tile_origin, output_dims);
+
+  constexpr int out_effective_thread_num = NumThreads / TileSizeI * TileSizeI;
+
+  if (x < out_effective_thread_num) {
+    // Re-orient the logical thread block with respect to the output array.
+    // ie. align the contiguous dimension of thread blocks with contiguous
+    // dimension of the output array.
+    int ti = x / TileSizeI;
+    int tj = x % TileSizeI;
+    int output_index = output_origin_flat_index + ti * output_dims[2] + tj;
+    int output_increment = WriteRowPerPass * output_dims[2];
+
+    if (full_tile) {
+#pragma unroll
+      for (int i_loc = ti; i_loc < (TileSizeJ); i_loc += WriteRowPerPass) {
+        output[output_index] = shared_memory_tile[tj][i_loc];
+        output_index += output_increment;
+      }
+    } else {
+      if (tj < tile_height) {
+        for (int i_loc = ti; i_loc < (tile_width); i_loc += WriteRowPerPass) {
+          output[output_index] = shared_memory_tile[tj][i_loc];
+          output_index += output_increment;
+        }
+      }
+    }
+  }
+}
+
+// A Cuda custom kernel that convert input to output, given proper padding on
+// the left and the top. The padded value is zero.
+template <typename T, int NDIMS>
+__global__ void PadInputCustomKernelNHWC(int nthreads, const T* input,
+                                         Dimension<NDIMS> input_dims, T* output,
+                                         Dimension<NDIMS> output_dims,
+                                         Dimension<NDIMS - 2> padding_left) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int output_index = index;
+    Index<NDIMS> output_tensor_index =
+        FlatToTensorIndex(output_index, output_dims);
+
+    Index<NDIMS> input_tensor_index;
+    input_tensor_index[0] = output_tensor_index[0];  // batch
+    bool ok = true;
+    for (int i = 1; i < NDIMS - 1; i++) {
+      input_tensor_index[i] = output_tensor_index[i] - padding_left[i - 1];
+      ok &=
+          (input_tensor_index[i] >= 0 && input_tensor_index[i] < input_dims[i]);
+    }
+    input_tensor_index[NDIMS - 1] = output_tensor_index[NDIMS - 1];  // channels
+
+    if (ok) {
+      const int input_index = TensorIndexToFlat(input_tensor_index, input_dims);
+      output[output_index] = input[input_index];
+    } else {
+      output[output_index] = T(0);
+    }
+  }
+}
+
+template <typename T, int NDIMS>
+__global__ void PadInputCustomKernelNCHW(int nthreads, const T* input,
+                                         Dimension<NDIMS> input_dims, T* output,
+                                         Dimension<NDIMS> output_dims,
+                                         Dimension<NDIMS - 2> padding_left) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int output_index = index;
+    Index<NDIMS> output_tensor_index =
+        FlatToTensorIndex(output_index, output_dims);
+
+    Index<NDIMS> input_tensor_index;
+    input_tensor_index[0] = output_tensor_index[0];  // batch
+    input_tensor_index[1] = output_tensor_index[1];  // channels
+    bool ok = true;
+    for (int i = 2; i < NDIMS; i++) {
+      input_tensor_index[i] = output_tensor_index[i] - padding_left[i - 2];
+      ok &=
+          (input_tensor_index[i] >= 0 && input_tensor_index[i] < input_dims[i]);
+    }
+
+    if (ok) {
+      const int input_index = TensorIndexToFlat(input_tensor_index, input_dims);
+      output[output_index] = input[input_index];
+    } else {
+      output[output_index] = T(0);
+    }
+  }
+}
+
+// A GPU helper function that converts TensorFlow filter format to Cudnn filter
+// format.
+template <typename T, int NDIMS>
+struct TransformFilter<GPUDevice, T, int, NDIMS> {
+  typedef GPUDevice Device;
+  void operator()(const Device& d, FilterTensorFormat dst_filter_format,
+                  typename TTypes<T, NDIMS, int>::ConstTensor in,
+                  typename TTypes<T, NDIMS, int>::Tensor out) {
+    Dimension<3> combined_dims;
+    combined_dims[0] = in.dimension(0);  // spatial dimensions
+    for (int i = 1; i < NDIMS - 2; i++) {
+      combined_dims[0] *= in.dimension(i);
+    }
+    combined_dims[1] = in.dimension(NDIMS - 2);  // input filters
+    combined_dims[2] = in.dimension(NDIMS - 1);  // output filters
+    CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
+
+    CHECK(dst_filter_format == FORMAT_OIHW)
+        << "Unsupported output layout: " << ToString(dst_filter_format);
+
+    ShuffleInTensor3Simple<T, 2, 1, 0>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, in.data(), combined_dims, out.data());
+  }
+};
+
+// Converts Cudnn filter format OIHW back to TensorFlow filter format HWIO.
+// TODO(hinsu): Support reverse transformation from filter format OHWI as well.
+template <typename T, int NDIMS>
+struct ReverseTransformFilter<GPUDevice, T, NDIMS> {
+  typedef GPUDevice Device;
+  void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
+                  typename TTypes<T, NDIMS>::Tensor out) {
+    Dimension<3> combined_dims;
+    combined_dims[0] = in.dimension(0);  // output filters
+    combined_dims[1] = in.dimension(1);  // input filters
+    combined_dims[2] = in.dimension(2);  // spatial dimensions
+    for (int i = 3; i < NDIMS; ++i) {
+      combined_dims[2] *= in.dimension(i);
+    }
+    CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
+    ShuffleInTensor3Simple<T, 2, 1, 0>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, in.data(), combined_dims, out.data());
+  }
+};
+
+// A GPU helper function that converts input tensor to a larger output tensor,
+// given proper padding values. The padded value is zero.
+template <typename T, int NDIMS>
+struct PadInput<GPUDevice, T, int, NDIMS> {
+  typedef GPUDevice Device;
+  void operator()(const Device& d,
+                  typename TTypes<T, NDIMS, int>::ConstTensor in,
+                  const std::array<int, NDIMS - 2>& padding_left,
+                  const std::array<int, NDIMS - 2>& padding_right,
+                  typename TTypes<T, NDIMS, int>::Tensor out,
+                  TensorFormat format) {
+    CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
+    Dimension<NDIMS> input_dims;
+    for (int i = 0; i < NDIMS; ++i) {
+      input_dims[i] = in.dimension(i);
+    }
+    Dimension<NDIMS> output_dims;
+    for (int i = 0; i < NDIMS; ++i) {
+      output_dims[i] = out.dimension(i);
+    }
+
+    const Dimension<NDIMS - 2> padding_left_dim(padding_left);
+
+    if (format == FORMAT_NHWC) {
+      PadInputCustomKernelNHWC<T, NDIMS>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              config.virtual_thread_count, in.data(), input_dims, out.data(),
+              output_dims, padding_left_dim);
+    } else if (format == FORMAT_NCHW) {
+      PadInputCustomKernelNCHW<T, NDIMS>
+          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+              config.virtual_thread_count, in.data(), input_dims, out.data(),
+              output_dims, padding_left_dim);
+    } else {
+      LOG(FATAL) << "Invalid data format: " << format;
+    }
+  }
+};
+
+// We want std::equal_to and std::greater, but they're not constexpr until
+// C++14.
+struct EqualTo {
+  constexpr bool operator()(int a, int b) const { return a == b; }
+};
+
+struct GreaterThan {
+  constexpr bool operator()(int a, int b) const { return a > b; }
+};
+
+// For each data type, the tile size possibility frontier denotes the tile size
+// combinations that consume the most computational resources constrained by
+// - number of threads per SM limit,
+// - limit on size of the short dimension (<=15) due to the definition of
+//   narrow matrix,
+// - shared memory limit and
+// - some experimentally determined, type-specific constraint on the product of
+//   two side lengths to increase grid-level parallelism.
+//
+// A tile size combination lies on the frontier if and only if one or more
+// constraint mentioned above is hit. Tile size combinations lying outside this
+// frontier are either not possible, or are slower than the alternatives.
+//
+// It is instrumental to consider, for each data type, two subsets of the
+// corresponding frontier:
+// - long side frontier: the union of the biggest tile size combination for
+//   each legal long side len.
+// - non long side frontier: the frontier set minus the long side frontier.
+//
+// TileSizePossibilityFrontierCheck defines the frontier using only the long
+// side frontier tile size combinations (since one can easily extrapolate
+// the entire frontier from this subset). It serves as a utility function
+// to help us determine where a tile size combination of interest lies with
+// resepect to the frontier.
+template <typename Op>
+constexpr bool TileSizePossibilityFrontierCheck(int TileLongSide,
+                                                int TileShortSide,
+                                                int size_of_t, Op op) {
+  // clang-format off
+
+  return (size_of_t == 16 && ((TileLongSide == 32   && op(TileShortSide, 4))  ||
+                             (TileLongSide == 64   && op(TileShortSide, 4))  ||
+                             (TileLongSide == 128  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 256  && op(TileShortSide, 2)))) ||
+          (size_of_t == 8 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 128  && op(TileShortSide, 8))  ||
+                             (TileLongSide == 256  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 512  && op(TileShortSide, 2)))) ||
+          (size_of_t == 4 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 128  && op(TileShortSide, 15)) ||
+                             (TileLongSide == 256  && op(TileShortSide, 8))  ||
+                             (TileLongSide == 512  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 1024 && op(TileShortSide, 2)))) ||
+          (size_of_t == 2 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 128  && op(TileShortSide, 15)) ||
+                             (TileLongSide == 256  && op(TileShortSide, 8))  ||
+                             (TileLongSide == 512  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 1024 && op(TileShortSide, 2)))) ||
+          (size_of_t == 1 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
+                             (TileLongSide == 128  && op(TileShortSide, 15)) ||
+                             (TileLongSide == 256  && op(TileShortSide, 8))  ||
+                             (TileLongSide == 512  && op(TileShortSide, 4))  ||
+                             (TileLongSide == 1024 && op(TileShortSide, 2))));
+
+  // clang-format on
+}
+
+constexpr bool TileSizeOnLongSideFrontier(int TileLongSide, int TileShortSide,
+                                          int size_of_t) {
+  return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
+                                          size_of_t, EqualTo());
+}
+constexpr bool TileSizeOutsideFrontier(int TileLongSide, int TileShortSide,
+                                       int size_of_t) {
+  return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
+                                          size_of_t, GreaterThan());
+}
+constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide,
+                                             int TileShortSide, int size_of_t) {
+  // For a tile size combination (longside, shortside), lying on the frontier
+  // implies that (longside, shortside) is on or within the frontier but
+  // (longside*2, shortside) or (longside, shortside+1) is not. With the above
+  // criterion, we simply need to use !TileSizeOnLongSideFrontier to ensure that
+  // it is not on the long side frontier.
+  return !TileSizeOutsideFrontier(TileLongSide, TileShortSide, size_of_t) &&
+         (TileSizeOutsideFrontier(TileLongSide * 2, TileShortSide, size_of_t) ||
+          TileSizeOutsideFrontier(TileLongSide, TileShortSide + 1,
+                                  size_of_t)) &&
+         !TileSizeOnLongSideFrontier(TileLongSide, TileShortSide, size_of_t);
+}
+
+// Helper function to launch a batch narrow matirx transpose kernel.
+template <typename T, int TileLongSide, int TileShortSide>
+void LaunchBatchNarrowMatrixTransposeKernel(
+    const GPUDevice& d, int tile_size_i, int tile_size_j, int total_tiles_count,
+    const T* input, const Dimension<3>& input_dims, T* output) {
+  constexpr int NumThreads = TileLongSide;
+  if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) {
+    SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
+                                          TileShortSide>
+        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
+                                                           output);
+  } else {
+    SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
+                                          TileLongSide>
+        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
+                                                           output);
+  }
+}
+
+// Recursive template function to search, in a trial-and-error manner, for the
+// minimum tile size configuration satisfying the requested tile side lengths.
+// An important invariant of this search procedure is that for an unsatisfied
+// request, we always try doubling the long side len first, and only after
+// the request is satisfied for the long side len do we begin incrementing
+// the short side len.
+//
+// We have three specializations of this search function depending on where the
+// current tile size combination lies with respect to the frontier.
+// - It lies within the frontier. If request is not satisfied, for the next tile
+// size combination, we first try doubling the long side len and if that does
+// not work, we then increment the short side len.
+// - It lies on the non long side frontier. If the request is not satisfied, we
+// can only increment the short side len.
+// - It lies on the long side frontier. We launch the kernel without checking if
+// the request is satisfied or not.
+template <typename T, int TileLongSide, int TileShortSide,
+          typename dummy = void>
+struct BatchNarrowMatrixTransposeDispatcher {
+  static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
+                   int total_tiles_count, const T* input,
+                   const Dimension<3>& input_dims, T* output) {
+    static_assert(
+        (TileLongSide & (TileLongSide - 1)) == 0,
+        "The length of the longer side of the tile is always a power of 2.");
+    bool request_satisfied =
+        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
+        std::min(tile_size_i, tile_size_j) <= TileShortSide;
+
+    if (request_satisfied) {
+      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
+          d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
+          output);
+      return;
+    }
+
+    // If the execution reaches here, then the kernel was not launched; we then
+    // determine whether it is the long side or the short side that falls short
+    // of the request and increase that parameter accordingly.
+    const bool long_side_request_not_satisfied =
+        std::max(tile_size_i, tile_size_j) > TileLongSide;
+
+    if (long_side_request_not_satisfied) {
+      BatchNarrowMatrixTransposeDispatcher<
+          T, TileLongSide * 2, TileShortSide>::DoIt(d, tile_size_i, tile_size_j,
+                                                    total_tiles_count, input,
+                                                    input_dims, output);
+    } else {
+      BatchNarrowMatrixTransposeDispatcher<
+          T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
+                                                    total_tiles_count, input,
+                                                    input_dims, output);
+    }
+  }
+};
+
+template <typename T, int TileLongSide, int TileShortSide>
+struct BatchNarrowMatrixTransposeDispatcher<
+    T, TileLongSide, TileShortSide,
+    typename std::enable_if<TileSizeOnNonLongSideFrontier(
+                                TileLongSide, TileShortSide, sizeof(T)),
+                            void>::type> {
+  static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
+                   int total_tiles_count, const T* input,
+                   const Dimension<3>& input_dims, T* output) {
+    static_assert(
+        (TileLongSide & (TileLongSide - 1)) == 0,
+        "The length of the longer side of the tile is always a power of 2.");
+    bool request_satisfied =
+        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
+        std::min(tile_size_i, tile_size_j) <= TileShortSide;
+
+    if (request_satisfied) {
+      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
+          d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
+          output);
+      return;
+    }
+
+    // If the execution reaches here, then the kernel was not launched; since
+    // we are on the non long side frontier, we increment the short dimension
+    // and try again.
+    BatchNarrowMatrixTransposeDispatcher<
+        T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
+                                                  total_tiles_count, input,
+                                                  input_dims, output);
+  }
+};
+
+template <typename T, int TileLongSide, int TileShortSide>
+struct BatchNarrowMatrixTransposeDispatcher<
+    T, TileLongSide, TileShortSide,
+    typename std::enable_if<TileSizeOnLongSideFrontier(
+                                TileLongSide, TileShortSide, sizeof(T)),
+                            void>::type> {
+  static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
+                   int total_tiles_count, const T* input,
+                   const Dimension<3>& input_dims, T* output) {
+    static_assert(
+        (TileLongSide & (TileLongSide - 1)) == 0,
+        "The length of the longer side of the tile is always a power of 2.");
+
+    LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
+        d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
+        output);
+  }
+};
+
+// This function tries to recover, in a brute force way, the frontier defined in
+// TileSizePossibilityFrontierCheck as a vector of tile size combinations lying
+// on the long side frontier. This vector is sufficient to determine the entire
+// frontier.
+//
+// Note that if one changes the frontier definition in
+// TileSizePossibilityFrontierCheck and forgets to set the largest short
+// side len of the largest legal long side len to 2, this function will fail
+// and crash the program.
+template <int SizeOfT>
+const std::vector<std::pair<int, int>>& GetTileSizesFrontier() {
+  static_assert(
+      SizeOfT <= 16,
+      "Currently, only data types of sizes 16 bytes or less are supported.");
+  static_assert((SizeOfT & (SizeOfT - 1)) == 0,
+                "Data types must have sizes that are powers of 2.");
+
+  // Expensive work to populate sizes, lazily run in a thread-safe
+  // manner the first time GetTileSizesFrontier<N> is called.
+  static auto* frontier = [] {
+    auto* frontier = new std::vector<std::pair<int, int>>();
+    const int kMaxLongSideLen = 1024;
+    const int kMaxShortSideLen = 15;
+    for (int long_side = 32; long_side <= kMaxLongSideLen; long_side *= 2) {
+      for (int short_side = 2; short_side <= kMaxShortSideLen;
+           short_side += 1) {
+        if (TileSizeOnLongSideFrontier(long_side, short_side, SizeOfT)) {
+          // The current combination lies on the frontier, thus we
+          // add it to the frontier definition.
+          frontier->push_back(std::make_pair(long_side, short_side));
+
+          // The long side length is the largest one allowed iff its
+          // corresponding short side length is 2.
+          if (short_side == 2) return frontier;
+
+          // We have exhausted all the possibilities in the frontier
+          // with the given long side length.
+          break;
+        }
+      }
+    }
+    LOG(FATAL)
+        << "The corresponding short side length of the largest long side "
+           "length has to be 2.";
+  }();
+  return *frontier;
+}
+
+// Helper structs to help determine which data type to use given the size of
+// the matrix data type. A transpose of elements of size N will use a kernel
+// which operates on an array of TransposeElemType<N>::type.
+template <int ElemBytes>
+struct TransposeElemType;
+template <>
+struct TransposeElemType<1> {
+  using type = uint8;
+};
+template <>
+struct TransposeElemType<2> {
+  using type = uint16;
+};
+template <>
+struct TransposeElemType<4> {
+  using type = uint32;
+};
+template <>
+struct TransposeElemType<8> {
+  using type = uint64;
+};
+template <>
+struct TransposeElemType<16> {
+  using type = float4;
+};
+
+// A helper function to make RunSwapDimension1And2InTensor3 concise. This
+// helper function looks at the data type and input matrix sizes and decides
+// the thread numbers and tile sizes to use.
+template <typename T, bool conjugate = false>
+void SwapDimension1And2InTensor3WithNarrowMatrices(
+    const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
+    T* output, const int kMinDimensionToUseTiles) {
+  // Get available tile sizes here for the data type requested:
+  const auto& tile_spec = GetTileSizesFrontier<sizeof(T)>();
+
+  int tile_long_side_len = 0;
+  int tile_short_side_len = 0;
+  float lowest_cost = std::numeric_limits<float>::max();
+  int data_long_side = std::max(input_dims[1], input_dims[2]);
+
+  for (auto tile_size_pair : tile_spec) {
+    int proposed_tile_long_side_len = tile_size_pair.first;
+
+    // Number of threads that will not be doing anything useful when reading
+    // the matrix because the thread block size is bigger than the data block
+    // size.
+    int num_wasted_threads =
+        data_long_side - MathUtil::FloorOfRatio<int>(
+                             data_long_side, proposed_tile_long_side_len) *
+                             proposed_tile_long_side_len;
+
+    int num_full_tiles = MathUtil::FloorOfRatio<int>(
+        data_long_side, proposed_tile_long_side_len);
+
+    float cost = 0;
+
+    // However, if we can execute two or more full tiles, then we gladly
+    // accept any number of wasted threads and ignore its cost.
+    if (num_full_tiles <= 1) cost = num_wasted_threads;
+
+    // Using less than or equal to here because given the same cost, we
+    // would like to launch as many threads as possible.
+    if (cost <= lowest_cost) {
+      tile_long_side_len = proposed_tile_long_side_len;
+      tile_short_side_len = tile_size_pair.second;
+      lowest_cost = cost;
+    }
+  }
+
+  // Request tile sizes such that the longer side of threadblock aligns with
+  // the longer side of input data block to maximize read throughput.
+  // The ideal tile shape is one where the length of the shorter side of the
+  // tile is equal to the length of the shorter side of the input matrix.
+  int requested_tile_size_i = input_dims[1] >= kMinDimensionToUseTiles
+                                  ? tile_long_side_len
+                                  : input_dims[1];
+  int requested_tile_size_j = input_dims[1] >= kMinDimensionToUseTiles
+                                  ? input_dims[2]
+                                  : tile_long_side_len;
+
+  // Truncate the shorter size requested according to the manual limit set in
+  // tile_spec to make sure that we do not launch configurations violating
+  // hardware limits.
+  requested_tile_size_i =
+      requested_tile_size_i == tile_long_side_len
+          ? tile_long_side_len
+          : std::min(requested_tile_size_i, tile_short_side_len);
+  requested_tile_size_j =
+      requested_tile_size_j == tile_long_side_len
+          ? tile_long_side_len
+          : std::min(requested_tile_size_j, tile_short_side_len);
+
+  Dimension<3> input_dims_in_tiles = {
+      input_dims[0],
+      MathUtil::CeilOfRatio<int>(input_dims[1], requested_tile_size_i),
+      MathUtil::CeilOfRatio<int>(input_dims[2], requested_tile_size_j),
+  };
+
+  int total_tiles_count =
+      input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2];
+
+  using ElemType = typename TransposeElemType<sizeof(T)>::type;
+  static_assert(alignof(T) >= alignof(ElemType), "Unexpected data alignment.");
+  BatchNarrowMatrixTransposeDispatcher<ElemType, 32, 2>::DoIt(
+      d, requested_tile_size_i, requested_tile_size_j, total_tiles_count,
+      reinterpret_cast<const ElemType*>(input), input_dims,
+      reinterpret_cast<ElemType*>(output));
+}
+
+// Launch the GPU kernel that would swap dimension-1 and dimension-2 in a
+// 3D tensor. It looks at the shape of the incoming data, and decides the best
+// strategy to launch.
+template <typename T, bool conjugate = false>
+void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
+                                    const Dimension<3>& input_dims, T* output) {
+  // If both dimensions are not trivial, use tiles for the actual swapping.
+  // If one dimension is trivial, use SmallDim kernel for swapping.
+  // Otherwise, the trivial swapping relying on the ldg cache is more efficient.
+  static const int kMinDimensionToUseTiles = 16;
+  static const int kMinDimensionToUseRectTiles = 96;
+
+  bool large_matrix = input_dims[1] >= kMinDimensionToUseTiles &&
+                      input_dims[2] >= kMinDimensionToUseTiles;
+  bool narrow_matrix = input_dims[1] >= kMinDimensionToUseRectTiles ||
+                       input_dims[2] >= kMinDimensionToUseRectTiles;
+  if (large_matrix) {
+    // We get best performance when kTileSize is the number of threads in a warp
+    // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
+    // threads.
+    constexpr int kTileSize = 32;
+    constexpr int kNumThreads = 256;
+
+    Dimension<3> input_dims_in_tiles = {
+        input_dims[0],
+        MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
+        MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
+    };
+
+    int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
+                            input_dims_in_tiles[2];
+    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
+                                          conjugate>
+        <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
+                                                            output);
+
+  } else if (narrow_matrix) {
+    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
+        d, input, input_dims, output, kMinDimensionToUseTiles);
+  } else {
+    int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
+    CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
+    ShuffleInTensor3Simple<T, 0, 2, 1, conjugate>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, input, input_dims, output);
+  }
+}
+
+// A GPU helper functor that does general dimension 1 and 2 switch for 3D
+// tensor.
+template <typename T, bool conjugate>
+struct SwapDimension1And2InTensor3<GPUDevice, T, conjugate> {
+  typedef GPUDevice Device;
+  void operator()(const Device& d, const T* in,
+                  const gtl::ArraySlice<int64>& combined_dims, T* out) {
+    Dimension<3> input_dims = {static_cast<int>(combined_dims[0]),
+                               static_cast<int>(combined_dims[1]),
+                               static_cast<int>(combined_dims[2])};
+    RunSwapDimension1And2InTensor3<T, conjugate>(d, in, input_dims, out);
+  }
+};
+
+// A GPU helper functor that does general dimension 0 and 2 switch for 3D
+// tensor.
+template <typename T, bool conjugate>
+struct SwapDimension0And2InTensor3<GPUDevice, T, conjugate> {
+  typedef GPUDevice Device;
+  void operator()(const Device& d, const T* in,
+                  const gtl::ArraySlice<int64>& combined_dims, T* out) {
+    Dimension<3> input_dims = {static_cast<int>(combined_dims[0]),
+                               static_cast<int>(combined_dims[1]),
+                               static_cast<int>(combined_dims[2])};
+    size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2];
+    CudaLaunchConfig config = GetCudaLaunchConfig(total_size, d);
+    ShuffleInTensor3Simple<T, 2, 1, 0, conjugate>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            config.virtual_thread_count, in, input_dims, out);
+  }
+};
+
+// A GPU helper functor that converts NHWC TensorFlow data format to
+// NCHW format that is accepted by Cudnn.
+template <typename T, int NDIMS>
+struct NHWCToNCHW<GPUDevice, T, NDIMS> {
+  typedef GPUDevice Device;
+  void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
+                  typename TTypes<T, NDIMS>::Tensor out) {
+    Dimension<3> combined_dims;
+    combined_dims[0] = in.dimension(0);  // N (batch)
+    combined_dims[1] = in.dimension(1);  // spatial dimensions (HW)
+    for (int i = 2; i < NDIMS - 1; ++i) {
+      combined_dims[1] *= in.dimension(i);
+    }
+    combined_dims[2] = in.dimension(NDIMS - 1);  // C (channels)
+    RunSwapDimension1And2InTensor3(d, in.data(), combined_dims, out.data());
+  }
+};
+
+// A GPU helper functor that converts NCHW Cudnn data format to NHWC TensorFlow
+// Format.
+template <typename T, int NDIMS>
+struct NCHWToNHWC<GPUDevice, T, NDIMS> {
+  typedef GPUDevice Device;
+  void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
+                  typename TTypes<T, NDIMS>::Tensor out) {
+    Dimension<3> combined_dims;
+    combined_dims[0] = in.dimension(0);  // N (batch)
+    combined_dims[1] = in.dimension(1);  // C (channel)
+    combined_dims[2] = in.dimension(2);  // spatial dimensions (HW)
+    for (int i = 3; i < NDIMS; ++i) {
+      combined_dims[2] *= in.dimension(i);
+    }
+    RunSwapDimension1And2InTensor3(d, in.data(), combined_dims, out.data());
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
diff --git a/tensorflow/core/kernels/conv_2d_gpu_double.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..353d6d113023a3b970a39bbc097e32d5154de9a7
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_double.cu.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, double2,
+                                            /*conjugate=*/true>;
+
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, double2,
+                                            /*conjugate=*/true>;
+
+// For 2d ops.
+template struct TransformFilter<Eigen::GpuDevice, double, int, 4>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, double, 4>;
+template struct NHWCToNCHW<Eigen::GpuDevice, double, 4>;
+template struct NCHWToNHWC<Eigen::GpuDevice, double, 4>;
+template struct PadInput<Eigen::GpuDevice, double, int, 4>;
+
+// For 3d ops.
+template struct TransformFilter<Eigen::GpuDevice, double, int, 5>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, double, 5>;
+template struct NHWCToNCHW<Eigen::GpuDevice, double, 5>;
+template struct NCHWToNHWC<Eigen::GpuDevice, double, 5>;
+template struct PadInput<Eigen::GpuDevice, double, int, 5>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_float.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21030dd12b3912b8d251c0bf386e7059348bb312
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_float.cu.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct ShuffleAndReverse<Eigen::GpuDevice, float, 4, int>;
+template struct ShuffleAndReverse<Eigen::GpuDevice, float, 4,
+                                  Eigen::DenseIndex>;
+
+template struct TransformDepth<Eigen::GpuDevice, float, int>;
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, float4>;
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, float2,
+                                            /*conjugate=*/true>;
+
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, float4>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, float2,
+                                            /*conjugate=*/true>;
+
+// For 2d ops.
+template struct TransformFilter<Eigen::GpuDevice, float, int, 4>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, float, 4>;
+template struct NHWCToNCHW<Eigen::GpuDevice, float, 4>;
+template struct NCHWToNHWC<Eigen::GpuDevice, float, 4>;
+template struct PadInput<Eigen::GpuDevice, float, int, 4>;
+
+// For 3d ops.
+template struct TransformFilter<Eigen::GpuDevice, float, int, 5>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, float, 5>;
+template struct NHWCToNCHW<Eigen::GpuDevice, float, 5>;
+template struct NCHWToNHWC<Eigen::GpuDevice, float, 5>;
+template struct PadInput<Eigen::GpuDevice, float, int, 5>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_half.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..948308651fbe8a8ed0c88ba33416361d468abc97
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_half.cu.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct ShuffleAndReverse<Eigen::GpuDevice, Eigen::half, 4, int>;
+template struct ShuffleAndReverse<Eigen::GpuDevice, Eigen::half, 4,
+                                  Eigen::DenseIndex>;
+
+template struct TransformDepth<Eigen::GpuDevice, Eigen::half, int>;
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, Eigen::half>;
+
+// For 2d ops.
+template struct TransformFilter<Eigen::GpuDevice, Eigen::half, int, 4>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, Eigen::half, 4>;
+template struct NHWCToNCHW<Eigen::GpuDevice, Eigen::half, 4>;
+template struct NCHWToNHWC<Eigen::GpuDevice, Eigen::half, 4>;
+template struct PadInput<Eigen::GpuDevice, Eigen::half, int, 4>;
+
+// For 3d ops.
+template struct TransformFilter<Eigen::GpuDevice, Eigen::half, int, 5>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, Eigen::half, 5>;
+template struct NHWCToNCHW<Eigen::GpuDevice, Eigen::half, 5>;
+template struct NCHWToNHWC<Eigen::GpuDevice, Eigen::half, 5>;
+template struct PadInput<Eigen::GpuDevice, Eigen::half, int, 5>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_int.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_int.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..901ce3e55d4f42cf399331cbf471835bef5d7097
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_int.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+// For 2d ops.
+template struct PadInput<Eigen::GpuDevice, int, int, 4>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_uint16.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_uint16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e47532a9832f7c77cc5a6d714e870b3c872aa0d5
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_uint16.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint16>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint16>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_uint32.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_uint32.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..56cd5dd218ccc20bc434025bfabbe87c67e0e090
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_uint32.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint32>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint32>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_uint64.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_uint64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..045a664e9653e37ab27ba3a05836ed303b2cf0ea
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_uint64.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint64>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint64>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_uint8.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_uint8.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..215417860afc2b01108fad085133144542791a52
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_uint8.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint8>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint8>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 9e86a16b66d5ba7614effe850f5901a4fb6e8091..bc30da40991b56adc136bbe6115db16c00a04666 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -44,6 +44,10 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 43bb5ea56c97af793cce78b7f6b9f0ae9e224414..e06af15f2fc5558e9810c3da525fbf3cb385e893 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -43,6 +43,10 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index bab91f5e861236f87f12d39e452a37d75467be0d..e4c49efea0bd87fdbaa3fbdad3d5612d6b4f8a82 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -35,6 +35,10 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if GOOGLE_CUDA
 #include "tensorflow/core/platform/stream_executor.h"
 using stream_executor::dnn::DimIndex;
@@ -1070,6 +1074,7 @@ namespace functor {
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
@@ -1859,6 +1864,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
                           Conv3DBackpropFilterOp<GPUDevice, T>);
 TF_CALL_half(REGISTER_GPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
+TF_CALL_double(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 78856c4a9925a5273449376ba18fadd6f33144bb..74857fc2078dc3ee5e17959fc32febcdcb38a689 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -739,11 +739,16 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       To32Bit(transformed_filter.tensor<T, 4>()));
 
   Tensor transformed_output;
-  OP_REQUIRES_OK(
-      ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                              ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows,
-                                              out_cols, out_depths),
-                              &transformed_output));
+  if (data_format == FORMAT_NHWC) {
+    // Only allocate temporary memory when a layout transformation is needed.
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                                ShapeFromFormat(FORMAT_NCHW, out_batch,
+                                                out_rows, out_cols, out_depths),
+                                &transformed_output));
+  } else {
+    transformed_output = *output;
+  }
 
   auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                   input.template flat<T>().size());
@@ -851,47 +856,47 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
         ") filter shape(", filter.shape().DebugString(), ")"));
   }
 
-  // Convert the output tensor back from NHWC to NCHW.
+  // Convert the output tensor back from NCHW to NHWC.
   if (data_format == FORMAT_NHWC) {
     functor::NCHWToNHWC<GPUDevice, T, 4>()(
         ctx->eigen_device<GPUDevice>(),
         const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
         output->tensor<T, 4>());
-  } else {
-    *output = transformed_output;
   }
 }
 
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                                  \
-  template <>                                                                \
-  void SpatialConvolution<GPUDevice, T>::operator()(                         \
-      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,              \
-      typename TTypes<T, 4>::ConstTensor input,                              \
-      typename TTypes<T, 4>::ConstTensor filter, int row_stride,             \
-      int col_stride, int row_dilation, int col_dilation,                    \
-      const Eigen::PaddingType& padding);                                    \
-  extern template struct SpatialConvolution<GPUDevice, T>;                   \
-  template <>                                                                \
-  void MatMulConvFunctor<GPUDevice, T>::operator()(                          \
-      const GPUDevice& d, typename TTypes<T, 2>::Tensor out,                 \
-      typename TTypes<T, 2>::ConstTensor in0,                                \
-      typename TTypes<T, 2>::ConstTensor in1,                                \
-      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); \
-  extern template struct MatMulConvFunctor<GPUDevice, T>;                    \
-  template <>                                                                \
-  void TransformFilter<GPUDevice, T, int, 4>::operator()(                    \
-      const GPUDevice& d, FilterTensorFormat dst_filter_format,              \
-      typename TTypes<T, 4, int>::ConstTensor in,                            \
-      typename TTypes<T, 4, int>::Tensor out);                               \
-  extern template struct TransformFilter<GPUDevice, T, int, 4>;              \
-  template <>                                                                \
-  void PadInput<GPUDevice, T, int, 4>::operator()(                           \
-      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,        \
-      const std::array<int, 2>& padding_left,                                \
-      const std::array<int, 2>& padding_right,                               \
-      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format);     \
+#define DECLARE_GPU_SPEC(T)                                                 \
+  template <>                                                               \
+  void SpatialConvolution<GPUDevice, T>::operator()(                        \
+      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,             \
+      typename TTypes<T, 4>::ConstTensor input,                             \
+      typename TTypes<T, 4>::ConstTensor filter, int row_stride,            \
+      int col_stride, int row_dilation, int col_dilation,                   \
+      const Eigen::PaddingType& padding,                                    \
+      const Eigen::NoOpOutputKernel& output_kernel);                        \
+  extern template struct SpatialConvolution<GPUDevice, T>;                  \
+  template <>                                                               \
+  void MatMulConvFunctor<GPUDevice, T>::operator()(                         \
+      const GPUDevice& d, typename TTypes<T, 2>::Tensor out,                \
+      typename TTypes<T, 2>::ConstTensor in0,                               \
+      typename TTypes<T, 2>::ConstTensor in1,                               \
+      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair, \
+      const Eigen::NoOpOutputKernel& output_kernel);                        \
+  extern template struct MatMulConvFunctor<GPUDevice, T>;                   \
+  template <>                                                               \
+  void TransformFilter<GPUDevice, T, int, 4>::operator()(                   \
+      const GPUDevice& d, FilterTensorFormat dst_filter_format,             \
+      typename TTypes<T, 4, int>::ConstTensor in,                           \
+      typename TTypes<T, 4, int>::Tensor out);                              \
+  extern template struct TransformFilter<GPUDevice, T, int, 4>;             \
+  template <>                                                               \
+  void PadInput<GPUDevice, T, int, 4>::operator()(                          \
+      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,       \
+      const std::array<int, 2>& padding_left,                               \
+      const std::array<int, 2>& padding_right,                              \
+      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format);    \
   extern template struct PadInput<GPUDevice, T, int, 4>
 
 DECLARE_GPU_SPEC(float);
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 83df4dce38e09b09956104c411d3e36f6cfb7657..f20ac93b5a01cf2dbd1c53ce55c832727f49979f 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -533,10 +533,19 @@ namespace functor {
       const GPUDevice& d, typename TTypes<T, 5, int>::ConstTensor in, \
       const std::array<int, 3>& padding_left,                         \
       const std::array<int, 3>& padding_right,                        \
-      typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
+      typename TTypes<T, 5, int>::Tensor out, TensorFormat format);   \
+  template <>                                                         \
+  void NHWCToNCHW<GPUDevice, T, 5>::operator()(                       \
+      const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,      \
+      typename TTypes<T, 5>::Tensor out);                             \
+  template <>                                                         \
+  void NCHWToNHWC<GPUDevice, T, 5>::operator()(                       \
+      const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,      \
+      typename TTypes<T, 5>::Tensor out);
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 
 }  // namespace functor
@@ -548,6 +557,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     Conv3DOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<double>("T"),
+    Conv3DOp<GPUDevice, double>);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index 972100ba77872eb54af75e6f62bda5ac0ecc1774..798a7325cd25494d8b12447c86f4883ca038c8ca 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -14,888 +14,496 @@ limitations under the License.
 ==============================================================================*/
 
 // Implements convolution operations with other kernels baked into the
-// processing, to optimize latency and memory usage.
+// processing, to optimize latency and memory usage:
+//  - Conv2D + BiasAdd + <Activation>
+//  - Conv2D + FusedBatchNorm + <Activation>
+//
+// Activation: Relu, Relu6, Elu, etc...
+//
+// Kernels for convolutions fused with image transformations (resize and mirror
+// padding) defined in `conv_ops_fused_image_transform.cc`.
 
 #define EIGEN_USE_THREADS
 
-#include <string.h>
-#include <map>
+#include <string>
 #include <vector>
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
-#include "tensorflow/core/kernels/gemm_functors.h"
-#include "tensorflow/core/kernels/image_resizer_state.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/util/mirror_pad_mode.h"
-#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
-
 namespace {
 
-// We don't want to allocate a buffer to hold all the patches if the size is
-// going to be extremely large, so break it into chunks if it's bigger than
-// a limit. Each chunk will be processed serially, so we can refill the
-// buffer for the next chunk and reuse it, keeping maximum memory size down.
-// In this case, we've picked 16 megabytes as a reasonable limit for Android and
-// other platforms using Eigen, and 1MB for iOS devices, from experimentation.
-#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
-const size_t kMaxChunkSize = (1 * 1024 * 1024);
-#else
-const size_t kMaxChunkSize = (16 * 1024 * 1024);
-#endif
-const size_t kResizeCacheSize = (8 * 1024 * 1024);
-
-// Lookup method used when resizing.
-enum SamplingMode {
-  BILINEAR = 0,
-  NEAREST = 1,
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+// Type aliases for the unaligned tensors (tensor maps) used in output kernels.
+template <typename T>
+struct OutputTypes {
+  // There is no guarantee that the output block passed to the output kernel
+  // will be aligned.
+
+  using Tensor =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>,
+                       Eigen::Unaligned>;
+
+  using ConstTensor = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, Eigen::DenseIndex>,
+      Eigen::Unaligned>;
 };
 
-// Simple utility function used by FusedConv to multithread basic workloads. To
-// use it, pass begin and end values for the full workload and a std::function
-// that receives a subset of that through the begin and end values for each
-// worker's task. The division of the full workload into worker tasks is handled
-// by the multithreading logic. Here's an example of how to use it:
-// std::vector<float> my_vector(100);
-// ...
-// FusedConvParallelFor(context, 0, 100,
-//   [&my_vector](int64 task_begin, int64 task_end) {
-//     for (int64 current = task_begin; current != task_end; ++current) {
-//       my_vector[current] *= 10.0f;
-//     }
-// });
-void FusedConvParallelFor(
-    OpKernelContext* context, int64 begin, int64 end,
-    const std::function<void(int64, int64)>& task_function) {
-// On iOS, the thread management imposes a very big performance penalty, so
-// just call the function directly with no multithreading.
-#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
-  task_function(begin, end);
-#else
-  auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-  thread::ThreadPool* thread_pool = worker_threads.workers;
-  const int64 total_elements = end - begin;
-  // This is a bit of an arbitrary number, but was found to work well for
-  // typical models we've been profiling on various devices.
-  const int64 element_cost = 10000000;
-  thread_pool->ParallelFor(
-      total_elements, element_cost,
-      [begin, task_function](int64 begin_offset, int64 end_offset) {
-        const int64 task_begin = begin + begin_offset;
-        const int64 task_end = begin + end_offset;
-        task_function(task_begin, task_end);
-      });
-#endif
-}
-
-// Holds the state needed for the resizing subtasks.
-template <class T1>
-struct ResizeTaskParameters {
-  ResizeTaskParameters() : st(false) {}
-
-  int cache_height;
-  T1* resize_cache;
-  int cache_line_width;
-  int input_width;
-  int input_depth;
-  int top_padding;
-  int pad_offset;
-  int64 resized_height;
-  ImageResizerState st;
-  const T1* input_batch_start;
-  int64 cache_start_x;
-  int64 cache_end_x;
-  int left_padding;
-  int64 resized_width;
-  int64 padded_width;
-  int64 padded_height;
+// Type alias for the tensor contraction output mapper.
+template <typename Scalar, typename Index>
+using ContractionOutputMapper =
+    Eigen::internal::blas_data_mapper<Scalar, Index, Eigen::ColMajor>;
+
+// Returns input expression without any transformations.
+struct Identity {
+  template <typename XprType>
+  static auto apply(XprType expr) -> XprType {
+    return expr;
+  };
 };
 
-template <class T1>
-struct PerCacheLineParameters {
-  PerCacheLineParameters() {}
-  PerCacheLineParameters(const PerCacheLineParameters<T1>& other)
-      : cache_line_start(other.cache_line_start),
-        input_top_row_start(other.input_top_row_start),
-        input_bottom_row_start(other.input_bottom_row_start),
-        y_lerp(other.y_lerp) {}
-
-  T1* cache_line_start;
-  const T1* input_top_row_start;
-  const T1* input_bottom_row_start;
-  T1 y_lerp;
+// Applies `Relu` to the passed input expression.
+struct Relu {
+  template <typename XprType>
+  static auto apply(XprType expr)
+      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())) {
+    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0));
+  };
 };
 
-// Helper class to simplify bilinear filtering
-template <class T1>
-struct SampleRect {
-  EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right,
-                                 const T1* in_bottom_left,
-                                 const T1* in_bottom_right)
-      : top_left(in_top_left),
-        top_right(in_top_right),
-        bottom_left(in_bottom_left),
-        bottom_right(in_bottom_right) {}
-
-  EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp,
-                                        T1 y_lerp) const {
-    const T1 top =
-        top_left[channel] + (top_right[channel] - top_left[channel]) * x_lerp;
-    const T1 bottom = bottom_left[channel] +
-                      (bottom_right[channel] - bottom_left[channel]) * x_lerp;
-    return top + (bottom - top) * y_lerp;
+// TensorContraction swaps lhs with rhs, and changes layout from RowMajor
+// (default in Tensorflow) to ColMajor (preferred in Eigen), and computes matmul
+// using these tensors.
+//
+// TensorContraction output matrix (before reshape) has a ColMajor layout, and
+// has dimensions:
+//  - rows: output_channels
+//  - cols: all other dimensions
+//
+// First element in every column is:
+//   [batch ??, height ??, width ??, out_channel = i]
+//
+// We do not know what are the values of the 'batch', 'height', and 'width' here
+// (if we know original dimensions, they can be computed from 'j').
+//
+// Each column of an output block is a continuous slice along the output channel
+// dimension, so we can use it to efficiently compute any transformation that
+// depends only on a channel value (e.g. add channel bias).
+
+// Output kernel that fuses BiasAdd operation into the output of tensor
+// contraction + any other transformation defined by Transform.
+template <typename T, typename Transform = Identity>
+struct BiasAddOutputKernel {
+  explicit BiasAddOutputKernel(const T* bias_data) : bias_data(bias_data) {}
+
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const ContractionOutputMapper<Scalar, Index>& output_mapper,
+      const Eigen::TensorContractionParams& params, Index i, Index j,
+      Index num_rows, Index num_cols) const {
+    DCHECK(params.swapped_arguments);
+
+    const T* bias_base = bias_data + i;
+    typename OutputTypes<T>::ConstTensor bias(bias_base, num_rows);
+
+    for (int col = 0; col < num_cols; ++col) {
+      T* output_base = &output_mapper(0, col);
+      typename OutputTypes<T>::Tensor output(output_base, num_rows);
+      const auto expr = output + bias;
+      output = Transform::template apply<decltype(expr)>(expr);
+    }
   }
 
-  const T1* top_left;
-  const T1* top_right;
-  const T1* bottom_left;
-  const T1* bottom_right;
+ private:
+  const T* bias_data;
 };
 
-// Calculates parameters which remain constant through a resize cache row.
-template <class T1>
-EIGEN_ALWAYS_INLINE PerCacheLineParameters<T1> CalculatePerCacheLineParameters(
-    int64 cache_height, int64 cache_y, T1* resize_cache, int64 cache_line_width,
-    int64 input_width, int64 input_depth, int64 top_padding, int64 pad_offset,
-    int64 resized_height, const ImageResizerState& st,
-    const T1* input_batch_start) {
-  PerCacheLineParameters<T1> result;
-  // The cache is organized so that the real y values of the resized image map
-  // onto the actual cache values through a modulo scheme. This means that as we
-  // progress downwards through the image, we keep reusing a small cache and so
-  // keep memory usage down.
-  int64 cache_index_y;
-  if (cache_y < 0) {
-    cache_index_y = cache_height + (cache_y % cache_height);
-  } else {
-    cache_index_y = cache_y % cache_height;
-  }
-  result.cache_line_start =
-      resize_cache + (cache_index_y * cache_line_width * input_depth);
-  // This part is implementing the mirror padding that happens before resizing.
-  float in_y = (cache_y - top_padding);
-  if (in_y < 0) {
-    in_y = -(in_y + 1.0f - pad_offset);
-  } else if (in_y >= resized_height) {
-    in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset);
+// Output kernel that fuses FusedBatchNorm operation into the output of tensor
+// contraction + any other transformation defined by Transform.
+template <typename T, typename Transform = Identity>
+struct FusedBatchNormOutputKernel {
+  FusedBatchNormOutputKernel(T epsilon, const T* scaling_factor_data,
+                             const T* offset_data, const T* estimated_mean_data)
+      : epsilon(epsilon),
+        scaling_factor_data(scaling_factor_data),
+        offset_data(offset_data),
+        estimated_mean_data(estimated_mean_data) {}
+
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(
+      const ContractionOutputMapper<Scalar, Index>& output_mapper,
+      const Eigen::TensorContractionParams& params, Index i, Index j,
+      Index num_rows, Index num_cols) const {
+    DCHECK(params.swapped_arguments);
+
+    const T* scaling_factor_base = scaling_factor_data + i;
+    const T* offset_base = offset_data + i;
+    const T* mean_base = estimated_mean_data + i;
+
+    typename OutputTypes<T>::ConstTensor scaling_factor(scaling_factor_base,
+                                                        num_rows);
+    typename OutputTypes<T>::ConstTensor offset(offset_base, num_rows);
+    typename OutputTypes<T>::ConstTensor mean(mean_base, num_rows);
+
+    for (int col = 0; col < num_cols; ++col) {
+      T* output_base = &output_mapper(0, col);
+      typename OutputTypes<T>::Tensor output(output_base, num_rows);
+
+      auto scaled = (output - mean) * scaling_factor;
+      auto shifted = scaled + offset;
+
+      output = Transform::template apply<decltype(shifted)>(shifted);
+    }
   }
-  // Here's where do do the actual resize.
-  in_y *= st.height_scale;
-  const int64 top_y_index = static_cast<int64>(std::floor(in_y));
-  const int64 bottom_y_index =
-      std::min(static_cast<int64>(std::ceil(in_y)), (st.in_height - 1));
-  // Lerp is used for bilinear filtering when that's needed.
-  result.y_lerp = static_cast<T1>(in_y - top_y_index);
-  // Which rows of the original input image to pull the values from.
-  result.input_top_row_start =
-      input_batch_start + (top_y_index * input_width * input_depth);
-  result.input_bottom_row_start =
-      input_batch_start + (bottom_y_index * input_width * input_depth);
-  return result;
-}
-
-template <class T1>
-struct PerCachePixelParameters {
-  PerCachePixelParameters() {}
-  PerCachePixelParameters(const PerCachePixelParameters<T1>& other)
-      : cache_line_pixel(other.cache_line_pixel),
-        left_x_index(other.left_x_index),
-        right_x_index(other.right_x_index),
-        x_lerp(other.x_lerp) {}
-
-  T1* cache_line_pixel;
-  int64 left_x_index;
-  int64 right_x_index;
-  T1 x_lerp;
+
+ private:
+  T epsilon;
+  const T* scaling_factor_data;
+  const T* offset_data;
+  const T* estimated_mean_data;
 };
 
-// Pulls out common parameters used for every resized pixel.
-template <class T1>
-EIGEN_ALWAYS_INLINE PerCachePixelParameters<T1>
-CalculatePerCachePixelParameters(int64 cache_x, int64 cache_start_x,
-                                 T1* cache_line_start, int64 input_depth,
-                                 int64 left_padding, int64 pad_offset,
-                                 int64 resized_width,
-                                 const ImageResizerState& st) {
-  PerCachePixelParameters<T1> result;
-  // Figure out where we're going to store the results of our transform.
-  const int cache_index_x = cache_x - cache_start_x;
-  result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth);
-  // Implement mirror padding by flipping in_x if it's off the edge.
-  float in_x = (cache_x - left_padding);
-  if (in_x < 0) {
-    in_x = -(in_x + 1.0f - pad_offset);
-  } else if (in_x >= resized_width) {
-    in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset);
-  }
-  // Resize the x parameters.
-  in_x *= st.width_scale;
-  // Get the x coordinates for the left and right pixels to pull from.
-  result.left_x_index = static_cast<int64>(std::floor(in_x));
-  result.right_x_index =
-      std::min(static_cast<int64>(std::ceil(in_x)), (st.in_width - 1));
-  // This x_lerp is used to blend pixels in bilinear filtering.
-  result.x_lerp = static_cast<T1>(in_x - result.left_x_index);
-  return result;
-}
-
-// Combines bilinear resizing and mirror padding into the im2col transformation
-// stage of convolution.
-template <class T1, class T2, class T3, class TGemmFunctor,
-          SamplingMode SampleMode>
-class FusedResizeAndPadConvFunctor {
+// Type aliases for the output kernels, purely for the sake of better launch
+// dispatching code readability.
+template <typename T>
+using WithBiasAdd = BiasAddOutputKernel<T>;
+template <typename T>
+using WithBiasAddAndRelu = BiasAddOutputKernel<T, Relu>;
+template <typename T>
+using WithFusedBatchNorm = FusedBatchNormOutputKernel<T>;
+template <typename T>
+using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel<T, Relu>;
+
+// Dispatch 2D convolution to the appropriate primitive operation:
+//   (1) MatMul for the case of 1x1 convolution.
+//   (2) MatMul for the case when filter size equals to the input size.
+//   (3) General spatial 2D convolution for all other cases.
+template <typename T>
+class LaunchConv2DWithOutputKernel {
  public:
-  void operator()(OpKernelContext* context, const Tensor& input,
-                  int input_batches, int resized_height, int resized_width,
-                  int padded_height, int padded_width, int input_depth,
-                  const T2* filter_data, int filter_height, int filter_width,
-                  int filter_count, int stride_rows, int stride_cols,
-                  Padding padding, T3* output_data, int output_height,
-                  int output_width, const ImageResizerState& st,
-                  int top_padding, int bottom_padding, int left_padding,
-                  int right_padding, int pad_offset) {
-    if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) ||
-        (input_depth <= 0)) {
-      LOG(WARNING) << "Conv2D was called with bad input dimensions: "
-                   << input_batches << ", " << padded_height << ", "
-                   << padded_width << ", " << input_depth;
-      return;
-    }
-    if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) {
-      LOG(WARNING) << "Conv2D was called with bad filter dimensions: "
-                   << filter_width << ", " << filter_height << ", "
-                   << filter_count;
-      return;
-    }
-    if ((output_width <= 0) || (output_height <= 0)) {
-      LOG(WARNING) << "Conv2D was called with bad output width or height: "
-                   << output_width << ", " << output_height;
-      return;
-    }
-    OP_REQUIRES(
-        context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)),
-        errors::InvalidArgument("Bad sample mode passed in", SampleMode));
-
-    // These calculations define how the patches will be positioned within the
-    // input image. The actual definitions are quite complex, and rely on the
-    // previously-calculated output size.
-    int filter_left_offset;
-    int filter_top_offset;
-    if (padding == VALID) {
-      filter_left_offset =
-          ((output_width - 1) * stride_cols + filter_width - padded_width + 1) /
-          2;
-      filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
-                           padded_height + 1) /
-                          2;
-    } else {
-      filter_left_offset =
-          ((output_width - 1) * stride_cols + filter_width - padded_width) / 2;
-      filter_top_offset =
-          ((output_height - 1) * stride_rows + filter_height - padded_height) /
-          2;
-    }
-
-    ResizeTaskParameters<T1> task_params;
-    task_params.input_depth = input_depth;
-    task_params.top_padding = top_padding;
-    task_params.pad_offset = pad_offset;
-    task_params.resized_height = resized_height;
-    task_params.st = st;
-    task_params.left_padding = left_padding;
-    task_params.resized_width = resized_width;
-    task_params.padded_width = padded_width;
-    task_params.padded_height = padded_height;
-
-    // The im2col buffer has # of patches rows, and # of filters cols.
-    // It's laid out like this, in row major order in memory:
-    //        < filter value count >
-    //   ^   +---------------------+
-    // patch |                     |
-    // count |                     |
-    //   v   +---------------------+
-    // Each patch row contains a filter_width x filter_height patch of the
-    // input, with the depth channel as the most contiguous in memory, followed
-    // by the width, then the height. This is the standard memory order in the
-    // image world if it helps to visualize it.
-    const int filter_value_count = filter_width * filter_height * input_depth;
-
-    OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize,
-                errors::InvalidArgument("Im2Col patch too large for buffer"));
-    const size_t patches_per_chunk =
-        kMaxChunkSize / (filter_value_count * sizeof(T1));
-    // Because memory allocation is very expensive on mobile platforms, try to
-    // allocate a persistent buffer that will be kept around between calls. We
-    // use TensorFlow's resource management to ensure that the memory will be
-    // released when the session is over.
-    Im2ColBufferResource<T1, kMaxChunkSize>* im2col_buffer_resource;
-    std::function<Status(Im2ColBufferResource<T1, kMaxChunkSize>**)> creator =
-        [](Im2ColBufferResource<T1, kMaxChunkSize>** resource) {
-          *resource = new Im2ColBufferResource<T1, kMaxChunkSize>();
-          return Status::OK();
-        };
-    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
-                                "Conv2d", "im2col_buffer",
-                                &im2col_buffer_resource, creator));
-
-    // Create a resize cache memory buffer that will hold the rows of
-    // transformed and mirror padded input pixels, ready to be copied
-    // into filter patches by im2col.
-    // It's laid out like this, in row major order in memory:
-    //         < cache line width >
-    //   ^    +--------------------+
-    // cache  |                    |
-    // height |                    |
-    //   v    +--------------------+
-    // Each cache row contains a cache_line_width number of resized pixels,
-    // each with input_depth channels. The cache height is typically less than
-    // the full height the resized image would be, so it's filled up
-    // incrementally as we progress downwards through the input creating im2col
-    // patches.
-    task_params.cache_start_x = -filter_left_offset;
-    task_params.cache_end_x =
-        (((output_width - 1) * stride_cols) - filter_left_offset) +
-        filter_width;
-    task_params.cache_line_width =
-        task_params.cache_end_x - task_params.cache_start_x;
-    task_params.cache_height =
-        kResizeCacheSize / (task_params.cache_line_width * input_depth);
-    const int needed_resize_cache_count =
-        filter_height * task_params.cache_line_width * input_depth;
-    OP_REQUIRES(context,
-                (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize,
-                errors::InvalidArgument("Input too large for resize cache"));
-    Im2ColBufferResource<T1, kResizeCacheSize>* resize_cache_resource;
-    std::function<Status(Im2ColBufferResource<T1, kResizeCacheSize>**)>
-        resize_creator =
-            [](Im2ColBufferResource<T1, kResizeCacheSize>** resource) {
-              *resource = new Im2ColBufferResource<T1, kResizeCacheSize>();
-              return Status::OK();
-            };
-    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
-                                "Conv2d", "resize_cache",
-                                &resize_cache_resource, resize_creator));
-
-    // This means that multiple ops can't be run simultaneously on different
-    // threads, because we have a single shared resource. The platforms this is
-    // aimed at have intra-op parallelism as their focus though, so it shouldn't
-    // be an issue.
-    mutex_lock lock_buffer(im2col_buffer_resource->mu);
-    core::ScopedUnref unref_buffer(im2col_buffer_resource);
-    T1* im2col_buffer = im2col_buffer_resource->data;
-
-    // This buffer is used as a fairly heavy-weight cache for the resized and
-    // mirrored inputs to the im2col operation. The problem is that we want to
-    // keep the memory usage down by not rendering the fully resized and padded
-    // input tensor to the convolution into an entire buffer. The first approach
-    // to avoid this was to fold the bilinear filtering and padding spatial
-    // transformations into the im2col lookup itself. This successfully reduced
-    // memory usage, but because im2col can access an individual pixel for many
-    // different patches, the extra overhead of doing the same bilinear lookups
-    // repeatedly became too expensive.
-    // The resize cache is designed to avoid this problem by keeping a
-    // horizontal slice of the resized and padded input to the im2col
-    // precalculated, so that repeated accesses to the same pixel from different
-    // filter patches can just be copied from this cache. It's organized as a
-    // horizontal slice stretching across the whole virtual image, and as high
-    // as the filter window, so that as the patch processing moves across all
-    // the pixels are present, and before a new row of patches is started any
-    // previously calculated rows that are needed are maintained, with new rows
-    // calculated as required.
-    mutex_lock resize_lock_buffer(resize_cache_resource->mu);
-    core::ScopedUnref unref_resized_cache(resize_cache_resource);
-    task_params.resize_cache = resize_cache_resource->data;
-
-    const T1* input_data = input.flat<T1>().data();
-    const int64 input_height = input.shape().dim_sizes()[1];
-    task_params.input_width = input.shape().dim_sizes()[2];
-
-    int end_cached_lines = std::numeric_limits<int>::min();
-
-    for (int batch = 0; batch < input_batches; ++batch) {
-      task_params.input_batch_start =
-          input_data +
-          (batch * input_height * task_params.input_width * input_depth);
-      const int in_y_end =
-          ((output_height * stride_rows) - filter_top_offset) + filter_height;
-      for (int out_y = 0; out_y < output_height; ++out_y) {
-        const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
-        const int cache_start_y = std::max(in_y_origin, end_cached_lines);
-        const int cache_end_y = std::min(
-            in_y_end, std::max((in_y_origin + task_params.cache_height),
-                               end_cached_lines));
-        if (end_cached_lines < (in_y_origin + filter_height)) {
-          // This call breaks up the work required for calculating the mirror
-          // padding and resizing across multiple threads.
-          FusedConvParallelFor(
-              context, cache_start_y, cache_end_y,
-              [task_params](int64 task_cache_start_y, int64 task_cache_end_y) {
-                // This is a long and confusing function, but it's been laid out
-                // this way to help with performance on some intensive models.
-                // What it's doing is populating a cache of the original input
-                // image, after it's been bilinear resized and had its edges
-                // mirrored. This allows the following im2col code to access the
-                // transformed pixels from this cache, without having to
-                // repeatedly apply the expensive bilinear calculations as the
-                // same pixels are accessed by different patches.
-                // This is most effective when the stride is small and the
-                // filter size is large, since that's when pixels are reused
-                // most frequently as patches overlap.
-                for (int cache_y = task_cache_start_y;
-                     cache_y < task_cache_end_y; ++cache_y) {
-                  // We organize the cache as a series of rows, each containing
-                  // all the transformed pixels for a given line in the image.
-                  // This cache is big enough to hold at least a filter's height
-                  // worth of rows, but typically more, limited by the size of
-                  // the cache buffer.
-                  // We don't allocate an entire image's worth of rows though,
-                  // because we're trying to keep memory usage down, so as we
-                  // progress downwards through the im2col we periodically
-                  // refresh the cache so that the next lines that are needed
-                  // for that operation are always present.
-                  // Work out the parameters that remain constant across the
-                  // row we're calculating.
-                  PerCacheLineParameters<T1> line_params(
-                      CalculatePerCacheLineParameters<T1>(
-                          task_params.cache_height, cache_y,
-                          task_params.resize_cache,
-                          task_params.cache_line_width, task_params.input_width,
-                          task_params.input_depth, task_params.top_padding,
-                          task_params.pad_offset, task_params.resized_height,
-                          task_params.st, task_params.input_batch_start));
-                  // Iterate through the resize cache row we're filling in.
-                  for (int cache_x = task_params.cache_start_x;
-                       cache_x < task_params.cache_end_x; ++cache_x) {
-                    // Figure out what we need for the cache pixel we're
-                    // populating.
-                    PerCachePixelParameters<T1> pixel_params(
-                        CalculatePerCachePixelParameters<T1>(
-                            cache_x, task_params.cache_start_x,
-                            line_params.cache_line_start,
-                            task_params.input_depth, task_params.left_padding,
-                            task_params.pad_offset, task_params.resized_width,
-                            task_params.st));
-                    // If the access is off the left, right, top, or bottom of
-                    // the resized image, the conv padding means we should set
-                    // it to zero.
-                    if ((cache_x < 0) ||
-                        (cache_x >= task_params.padded_width) ||
-                        (cache_y < 0) ||
-                        (cache_y >= task_params.padded_height)) {
-                      std::fill_n(pixel_params.cache_line_pixel,
-                                  task_params.input_depth, T1(0));
-                    } else {
-                      // There are two different sampling strategies for
-                      // resizing. When using nearest, we can just do a
-                      // straight copy of the pixel closest to our sample point,
-                      // but bilinear requires a more complex calculation.
-                      if (SampleMode == NEAREST) {
-                        const T1* input_top_left_pixel =
-                            line_params.input_top_row_start +
-                            (pixel_params.left_x_index *
-                             task_params.input_depth);
-
-                        std::copy_n(input_top_left_pixel,
-                                    task_params.input_depth,
-                                    pixel_params.cache_line_pixel);
-                      } else {
-                        const SampleRect<T1> rect(
-                            line_params.input_top_row_start +
-                                (pixel_params.left_x_index *
-                                 task_params.input_depth),
-                            line_params.input_top_row_start +
-                                (pixel_params.right_x_index *
-                                 task_params.input_depth),
-                            line_params.input_bottom_row_start +
-                                (pixel_params.left_x_index *
-                                 task_params.input_depth),
-                            line_params.input_bottom_row_start +
-                                (pixel_params.right_x_index *
-                                 task_params.input_depth));
-                        for (int in_channel = 0;
-                             in_channel < task_params.input_depth;
-                             ++in_channel) {
-                          pixel_params.cache_line_pixel[in_channel] =
-                              rect.BilinearSample(in_channel,
-                                                  pixel_params.x_lerp,
-                                                  line_params.y_lerp);
-                        }
-                      }
-                    }
-                  }
-                }
-              });
-          end_cached_lines = cache_end_y;
-        }
-        for (int out_x = 0; out_x < output_width; ++out_x) {
-          const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
-          const int patch_index = (batch * output_width * output_height) +
-                                  (out_y * output_width) + out_x;
-          const int patch_index_within_chunk = patch_index % patches_per_chunk;
-          T1* im2col_patch_start =
-              im2col_buffer + (patch_index_within_chunk * filter_value_count);
-          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-            T1* im2col_row_start =
-                im2col_patch_start +
-                (filter_y * filter_width * task_params.input_depth);
-            const int conv_in_y = in_y_origin + filter_y;
-            int cache_index_y;
-            if (conv_in_y < 0) {
-              cache_index_y = task_params.cache_height +
-                              (conv_in_y % task_params.cache_height);
-            } else {
-              cache_index_y = conv_in_y % task_params.cache_height;
-            }
-            T1* cache_line_start =
-                task_params.resize_cache +
-                (cache_index_y * task_params.cache_line_width *
-                 task_params.input_depth);
-            T1* cache_filter_row_start =
-                cache_line_start + ((in_x_origin - task_params.cache_start_x) *
-                                    task_params.input_depth);
-            std::copy_n(cache_filter_row_start,
-                        (filter_width * task_params.input_depth),
-                        im2col_row_start);
-          }
-          const bool is_last_in_chunk =
-              (patch_index_within_chunk == (patches_per_chunk - 1));
-          const bool is_last_overall =
-              ((batch == (input_batches - 1)) &&
-               (out_y == (output_height - 1)) && (out_x == (output_width - 1)));
-          if (is_last_in_chunk || is_last_overall) {
-            // Now we've assembled a set of image patches into a matrix, apply
-            // a GEMM matrix multiply of the patches as rows, times the filter
-            // weights in columns, to get partial results in the output
-            // matrix.
-            const int how_many_patches = patch_index_within_chunk + 1;
-            const int m = how_many_patches;
-            const int n = filter_count;
-            const int k = filter_value_count;
-            const int lda = filter_value_count;
-            const int ldb = filter_count;
-            const int ldc = filter_count;
-            const size_t start_patch_index =
-                patch_index - (how_many_patches - 1);
-            T3* chunk_output_data =
-                output_data + (start_patch_index * filter_count);
-            TGemmFunctor gemm_functor;
-            gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb,
-                         chunk_output_data, ldc);
-          }
-        }
+  LaunchConv2DWithOutputKernel(int row_stride, int col_stride,      //
+                               int row_dilation, int col_dilation,  //
+                               Padding padding)
+      : row_stride_(row_stride),
+        col_stride_(col_stride),
+        row_dilation_(row_dilation),
+        col_dilation_(col_dilation),
+        padding_(padding) {}
+
+  template <typename OutputKernel>
+  void operator()(const OutputKernel& output_kernel, OpKernelContext* ctx,
+                  const Tensor& input, const Tensor& filter, Tensor* output) {
+    if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 &&
+        row_stride_ == 1 && col_stride_ == 1) {
+      int conv_width = 1;  // Width for the convolution step.
+      for (int i = 0; i < 3; ++i) {
+        conv_width *= output->dim_size(i);
       }
+
+      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+      functor::MatMulConvFunctor<CPUDevice, T, OutputKernel>()(
+          ctx->eigen_device<CPUDevice>(),
+          output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
+          input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
+          filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
+          dim_pair, output_kernel);
+
+    } else if (filter.dim_size(0) == input.dim_size(1) &&
+               filter.dim_size(1) == input.dim_size(2) && row_dilation_ == 1 &&
+               col_dilation_ == 1 && padding_ == VALID) {
+      // If the input data and filter have the same height/width,
+      // reduce the 2D convolution to matrix multiplication.
+      const auto k =  // Length of reduction dimension.
+          filter.dim_size(0) * filter.dim_size(1) * filter.dim_size(2);
+
+      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+      functor::MatMulConvFunctor<CPUDevice, T, OutputKernel>()(
+          ctx->eigen_device<CPUDevice>(),
+          output->shaped<T, 2>({input.dim_size(0), filter.dim_size(3)}),
+          input.shaped<T, 2>({input.dim_size(0), k}),
+          filter.shaped<T, 2>({k, filter.dim_size(3)}), dim_pair,
+          output_kernel);
+
+    } else {
+      functor::SpatialConvolution<CPUDevice, T, OutputKernel>()(
+          ctx->eigen_device<CPUDevice>(), output->tensor<T, 4>(),
+          input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride_, col_stride_,
+          row_dilation_, col_dilation_, BrainPadding2EigenPadding(padding_),
+          output_kernel);
     }
   }
+
+ private:
+  int row_stride_;
+  int col_stride_;
+  int row_dilation_;
+  int col_dilation_;
+  const Padding padding_;
 };
 
 }  // namespace
 
-// Implements a version of convolution with bilinear resizing and mirror padding
-// included.
-template <class T, class TConvFunctor, bool DoResize>
-class FusedResizeConv2DUsingGemmOp : public OpKernel {
+// Conv2D op with fused output kernels. Supports only CPUDevice.
+template <typename T>
+class FusedConv2DOp : public OpKernel {
  public:
-  explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    if (DoResize) {
-      OP_REQUIRES_OK(context,
-                     context->GetAttr("resize_align_corners", &align_corners_));
+  explicit FusedConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));
+
+    // 'fused_ops' and 'num_args' attributes are specified by the Grappler
+    // Remapper optimizer.
+
+    std::vector<string> fused_ops;
+    OP_REQUIRES_OK(context, context->GetAttr("fused_ops", &fused_ops));
+    OP_REQUIRES(context, !fused_ops.empty(),
+                errors::InvalidArgument(
+                    "Fused Conv2D must have at least one fused op."));
+
+    int num_args;
+    OP_REQUIRES_OK(context, context->GetAttr("num_args", &num_args));
+
+    // TODO(ezhulenev): Add support for fusion element-wise op chains defined
+    // at runtime, e.g. Relu+Sqrt+Tanh+etc...
+
+    // Match combination of fused ops to one of the supported fusions.
+    if (FusedOpsMatches(fused_ops, {"BiasAdd"})) {
+      fused_computation_ = FusedComputationType::kBiasAdd;
+    } else if (FusedOpsMatches(fused_ops, {"BiasAdd", "Relu"})) {
+      fused_computation_ = FusedComputationType::kBiasAddWithRelu;
+    } else if (FusedOpsMatches(fused_ops, {"FusedBatchNorm"})) {
+      fused_computation_ = FusedComputationType::kFusedBatchNorm;
+    } else if (FusedOpsMatches(fused_ops, {"FusedBatchNorm", "Relu"})) {
+      fused_computation_ = FusedComputationType::kFusedBatchNormWithRelu;
+    } else {
+      OP_REQUIRES(context, false,
+                  errors::Unimplemented("Fusion is not implemented: [",
+                                        str_util::Join(fused_ops, ","), "]"));
     }
-    MirrorPadMode mode;
-    OP_REQUIRES_OK(context, context->GetAttr("mode", &mode));
 
-    switch (mode) {
-      case MirrorPadMode::SYMMETRIC: {
-        offset_ = 0;
-        break;
-      }
-      case MirrorPadMode::REFLECT: {
-        offset_ = 1;
-        break;
-      }
-      default:
-        OP_REQUIRES(context, false,
-                    errors::InvalidArgument(
-                        "mode must be either REFLECT or SYMMETRIC."));
+    // Depending on a picked fusion type validate fusion-specific arguments.
+
+    if (fused_computation_ == FusedComputationType::kBiasAdd ||
+        fused_computation_ == FusedComputationType::kBiasAddWithRelu) {
+      OP_REQUIRES(context, num_args == 1,
+                  errors::InvalidArgument(
+                      "Fused Conv2D must have one extra argument: bias."));
+    }
+
+    if (fused_computation_ == FusedComputationType::kFusedBatchNorm ||
+        fused_computation_ == FusedComputationType::kFusedBatchNormWithRelu) {
+      OP_REQUIRES(
+          context, num_args == 4,
+          errors::InvalidArgument("Fused FusedBatchNorm must have four extra "
+                                  "arguments: scale, offset, mean, variance."));
+      OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_));
     }
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    OP_REQUIRES(context, strides_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
-    const int64 stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N');
-    const int64 stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C');
-    OP_REQUIRES(
-        context, stride_n == 1 && stride_c == 1,
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
   void Compute(OpKernelContext* context) override {
     // Input tensor is of the following dimensions:
     // [ batch, in_rows, in_cols, in_depth ]
     const Tensor& input = context->input(0);
-    OP_REQUIRES(context, (input.shape().num_elements() > 0),
-                errors::InvalidArgument("Input tensor can't be empty"));
-
-    ImageResizerState st(false);
-    if (DoResize) {
-      st = ImageResizerState(align_corners_);
-      st.ValidateAndCalculateOutputSize(context, input);
-      if (!context->status().ok()) return;
-    } else {
-      // Set up the resize parameters to do no scaling at all.
-      st.batch_size = input.dim_size(0);
-      st.out_height = input.dim_size(1);
-      st.out_width = input.dim_size(2);
-      st.in_height = input.dim_size(1);
-      st.in_width = input.dim_size(2);
-      st.channels = input.dim_size(3);
-      st.height_scale = 1.0f;
-      st.width_scale = 1.0f;
-    }
-    TensorShape resized_shape(
-        {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)});
-    int paddings_index;
-    int filter_index;
-    if (DoResize) {
-      paddings_index = 2;
-      filter_index = 3;
-    } else {
-      paddings_index = 1;
-      filter_index = 2;
-    }
-    const Tensor& paddings = context->input(paddings_index);
-
-    const int dims = resized_shape.dims();
-    OP_REQUIRES(
-        context,
-        TensorShapeUtils::IsMatrix(paddings.shape()) &&
-            paddings.dim_size(1) == 2,
-        errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
-                                paddings.shape().DebugString()));
-    const int fixed_dims =
-        (allow_legacy_scalars() && dims == 0 && paddings.dim_size(0) == 1)
-            ? 1
-            : dims;
-    OP_REQUIRES(
-        context, fixed_dims == paddings.dim_size(0),
-        errors::InvalidArgument(
-            "The first dimension of paddings must be the rank of inputs: ",
-            fixed_dims, " ", paddings.shape().DebugString(), " ",
-            resized_shape.DebugString()));
-    OP_REQUIRES(
-        context, dims == paddings.dim_size(0),
-        errors::InvalidArgument(
-            "The first dimension of paddings must be the rank of inputs: ",
-            dims, " ", paddings.shape().DebugString(), " ",
-            resized_shape.DebugString()));
-
-    OP_REQUIRES(
-        context, dims == 4,
-        errors::InvalidArgument(
-            "Fused mirror padding only supports four-dimensional inputs, but ",
-            dims, " requested"));
-
-    // Compute the shape of the output tensor, and allocate it.
-    TensorShape padded_shape;
-    TTypes<int32>::ConstMatrix paddings_matrix = paddings.matrix<int32>();
-    for (int d = 0; d < dims; ++d) {
-      const int32 before =
-          paddings_matrix(d, 0);  // Pad before existing elements.
-      const int32 after =
-          paddings_matrix(d, 1);  // Pad after existing elements.
-      OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument(
-                      "paddings must be non-negative: ", before, " ", after));
-      if (offset_ == 0) {  // SYMMETRIC mode.
-        OP_REQUIRES(
-            context,
-            before <= resized_shape.dim_size(d) &&
-                after <= resized_shape.dim_size(d),
-            errors::InvalidArgument("paddings must be no greater "
-                                    "than the dimension size: ",
-                                    before, ", ", after, " greater than ",
-                                    resized_shape.dim_size(d)));
-      } else if (offset_ == 1) {  // REFLECT mode.
-        OP_REQUIRES(
-            context,
-            before < resized_shape.dim_size(d) &&
-                after < resized_shape.dim_size(d),
-            errors::InvalidArgument("paddings must be less than"
-                                    " the dimension size: ",
-                                    before, ", ", after, " not less than ",
-                                    resized_shape.dim_size(d)));
-      }
-      padded_shape.AddDim(before + resized_shape.dim_size(d) + after);
-    }
-
-    OP_REQUIRES(
-        context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)),
-        errors::InvalidArgument(
-            "Fused mirror padding only support spatial padding, not batches: ",
-            paddings.DebugString()));
-    OP_REQUIRES(
-        context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)),
-        errors::InvalidArgument(
-            "Fused mirror padding only support spatial padding, not channels: ",
-            paddings.DebugString()));
-    const int32 top_padding = paddings_matrix(1, 0);
-    const int32 bottom_padding = paddings_matrix(1, 1);
-    const int32 left_padding = paddings_matrix(2, 0);
-    const int32 right_padding = paddings_matrix(2, 1);
 
     // Input filter is of the following dimensions:
     // [ filter_rows, filter_cols, in_depth, out_depth]
-    const Tensor& filter = context->input(filter_index);
-
-    // For 2D convolution, there should be 4 dimensions.
-    OP_REQUIRES(context, padded_shape.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        padded_shape.DebugString()));
-    OP_REQUIRES(context, filter.dims() == 4,
-                errors::InvalidArgument("filter must be 4-dimensional: ",
-                                        filter.shape().DebugString()));
-
-    // We only check the first three dims, since the depth is accessed as an
-    // int64 below.
-    for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(
-          context,
-          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
-          errors::InvalidArgument("filter too large"));
-    }
+    const Tensor& filter = context->input(1);
 
-    // The last dimension for input is in_depth. It must be the same as the
-    // filter's in_depth.
-    const int64 in_depth = padded_shape.dim_size(3);
-    OP_REQUIRES(context, in_depth == filter.dim_size(2),
-                errors::InvalidArgument(
-                    "input and filter must have the same depth: ", in_depth,
-                    " vs ", filter.dim_size(2)));
-
-    // The last dimension for filter is out_depth.
-    const int out_depth = static_cast<int>(filter.dim_size(3));
-
-    // The second dimension for input is rows/height.
-    // The first dimension for filter is rows/height.
-    const int64 padded_rows_raw = padded_shape.dim_size(1);
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("Input rows too large"));
-    const int padded_rows = static_cast<int>(padded_rows_raw);
-    const int filter_rows = static_cast<int>(filter.dim_size(0));
-    const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
-
-    // The third dimension for input is columns/width.
-    // The second dimension for filter is columns/width.
-    const int64 padded_cols_raw = padded_shape.dim_size(2);
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("Input cols too large"));
-    const int padded_cols = static_cast<int>(padded_cols_raw);
-    const int filter_cols = static_cast<int>(filter.dim_size(1));
-    const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
-
-    // The first dimension for input is batch.
-    const int64 batch_raw = padded_shape.dim_size(0);
-    OP_REQUIRES(context,
-                FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
-                errors::InvalidArgument("batch is too large"));
-    const int batch = static_cast<int>(batch_raw);
-
-    // For now we take the stride from the second and third dimensions only (we
-    // do not support striding on the batch or depth dimension).
-    const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H');
-    const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W');
-
-    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    Conv2DDimensions dimensions;
     OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(padded_rows, filter_rows, stride_rows,
-                                         padding_, &out_rows, &pad_rows));
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(padded_cols, filter_cols, stride_cols,
-                                         padding_, &out_cols, &pad_cols));
-    TensorShape out_shape =
-        ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth);
-    OP_REQUIRES(context, (out_shape.num_elements() > 0),
-                errors::InvalidArgument("Output tensor can't be empty"));
+                   ComputeConv2DDimension(params_, input, filter, &dimensions));
+
+    TensorShape out_shape = ShapeFromFormat(
+        params_.data_format, dimensions.batch, dimensions.out_rows,
+        dimensions.out_cols, dimensions.out_depth);
 
     // Output tensor is of the following dimensions:
     // [ in_batch, out_rows, out_cols, out_depth ]
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
 
-    VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth
-            << ", padded_cols = " << padded_cols
-            << ", resized_cols = " << resized_cols
-            << ", filter_cols = " << filter_cols
-            << ", padded_rows = " << padded_rows
-            << ", resized_rows = " << resized_rows
-            << ", filter_rows = " << filter_rows
-            << ", stride_rows = " << stride_rows
-            << ", stride_cols = " << stride_cols
-            << ", out_depth = " << out_depth << ", DoResize=" << DoResize;
+    VLOG(2) << "FusedConv2DWithBias: in_depth = " << dimensions.in_depth
+            << ", patch_depth = " << dimensions.patch_depth
+            << ", input_cols = " << dimensions.input_cols
+            << ", filter_cols = " << dimensions.filter_cols
+            << ", input_rows = " << dimensions.input_rows
+            << ", filter_rows = " << dimensions.filter_rows
+            << ", stride_rows = " << dimensions.stride_rows
+            << ", stride_cols = " << dimensions.stride_cols
+            << ", dilation_rows = " << dimensions.dilation_rows
+            << ", dilation_cols = " << dimensions.dilation_cols
+            << ", out_depth = " << dimensions.out_depth;
 
     // If there is nothing to compute, return.
     if (out_shape.num_elements() == 0) {
       return;
     }
-    TConvFunctor conv_functor;
-    conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows,
-                 padded_cols, in_depth, filter.flat<T>().data(), filter_rows,
-                 filter_cols, out_depth, stride_rows, stride_cols, padding_,
-                 output->flat<T>().data(), out_rows, out_cols, st, top_padding,
-                 bottom_padding, left_padding, right_padding, offset_);
+
+    OP_REQUIRES(context, params_.data_format == FORMAT_NHWC,
+                errors::Unimplemented("Fused conv implementation only supports "
+                                      "NHWC tensor format for now."));
+    OP_REQUIRES(context, dimensions.in_depth == filter.dim_size(2),
+                errors::Unimplemented("Fused conv implementation does not "
+                                      "support grouped convolutions for now."));
+
+    BiasAddArgs bias_add;
+    FusedBatchNormArgs fused_batch_norm;
+
+    LaunchConv2DWithOutputKernel<T> conv2d(
+        dimensions.stride_rows, dimensions.stride_cols,
+        dimensions.dilation_rows, dimensions.dilation_cols, params_.padding);
+
+    switch (fused_computation_) {
+      case FusedComputationType::kBiasAdd:
+        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add));
+        conv2d(WithBiasAdd<T>(bias_add.bias_add_data), context, input, filter,
+               output);
+        break;
+
+      case FusedComputationType::kBiasAddWithRelu:
+        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add));
+        conv2d(WithBiasAddAndRelu<T>(bias_add.bias_add_data), context, input,
+               filter, output);
+        break;
+
+      case FusedComputationType::kFusedBatchNorm:
+        OP_REQUIRES_OK(context,
+                       InitFusedBatchNormArgs(context, &fused_batch_norm));
+        conv2d(WithFusedBatchNorm<T>(epsilon_,
+                                     fused_batch_norm.scaling_factor.data(),
+                                     fused_batch_norm.offset_data,
+                                     fused_batch_norm.estimated_mean_data),
+               context, input, filter, output);
+        break;
+
+      case FusedComputationType::kFusedBatchNormWithRelu:
+        OP_REQUIRES_OK(context,
+                       InitFusedBatchNormArgs(context, &fused_batch_norm));
+        conv2d(WithFusedBatchNormAndRelu<T>(
+                   epsilon_, fused_batch_norm.scaling_factor.data(),
+                   fused_batch_norm.offset_data,
+                   fused_batch_norm.estimated_mean_data),
+               context, input, filter, output);
+        break;
+    }
   }
 
  private:
-  std::vector<int32> strides_;
-  Padding padding_;
-  bool align_corners_;
-  int offset_;
+  bool FusedOpsMatches(const std::vector<string>& fused_ops,
+                       const std::vector<string>& expected) const {
+    return fused_ops == expected;
+  }
+
+  struct BiasAddArgs {
+    const T* bias_add_data = nullptr;
+  };
+
+  struct FusedBatchNormArgs {
+    const T* scale_data = nullptr;
+    const T* offset_data = nullptr;
+    const T* estimated_mean_data = nullptr;
+    const T* estimated_variance_data = nullptr;
+
+    // Precomputed expression:
+    //   scaling_factor = (estimated_variance + epsilon).rsqrt() * scale
+    Eigen::Tensor<T, 1, Eigen::RowMajor> scaling_factor;
+  };
+
+#define TF_REQUIRES(EXP, STATUS) \
+  if (!TF_PREDICT_TRUE(EXP)) return (STATUS)
+
+  void InitDataPtr(const Tensor& tensor, const T** ptr) const {
+    *ptr = reinterpret_cast<const T*>(tensor.tensor_data().data());
+  }
+
+  Status InitBiasAddArgs(OpKernelContext* context, BiasAddArgs* args) const {
+    // Bias of the following dimensions: [ output_depth ]
+    const Tensor& bias = context->input(2);
+
+    TF_REQUIRES(bias.dims() == 1,
+                errors::InvalidArgument("bias must be 1-dimensional",
+                                        bias.shape().DebugString()));
+
+    InitDataPtr(bias, &args->bias_add_data);
+
+    return Status::OK();
+  }
+
+  Status InitFusedBatchNormArgs(OpKernelContext* context,
+                                FusedBatchNormArgs* args) const {
+    const Tensor& scale = context->input(2);
+    const Tensor& offset = context->input(3);
+    const Tensor& estimated_mean = context->input(4);
+    const Tensor& estimated_variance = context->input(5);
+
+    TF_REQUIRES(scale.dims() == 1,
+                errors::InvalidArgument("scale must be 1-dimensional",
+                                        scale.shape().DebugString()));
+    TF_REQUIRES(offset.dims() == 1,
+                errors::InvalidArgument("offset must be 1-dimensional",
+                                        offset.shape().DebugString()));
+    TF_REQUIRES(estimated_mean.dims() == 1,
+                errors::InvalidArgument("estimated_mean must be 1-dimensional",
+                                        estimated_mean.shape().DebugString()));
+    TF_REQUIRES(
+        estimated_variance.dims() == 1,
+        errors::InvalidArgument("estimated_variance must be 1-dimensional",
+                                estimated_variance.shape().DebugString()));
+
+    InitDataPtr(scale, &args->scale_data);
+    InitDataPtr(offset, &args->offset_data);
+    InitDataPtr(estimated_mean, &args->estimated_mean_data);
+    InitDataPtr(estimated_variance, &args->estimated_variance_data);
+
+    // Precompute scaling factor once for all output blocks (kernels).
+    args->scaling_factor =
+        (estimated_variance.flat<T>() + static_cast<T>(epsilon_)).rsqrt() *
+        scale.flat<T>();
+
+    return Status::OK();
+  }
+
+#undef TF_REQUIRES
+
+  // Element-wise ops applied to the result of Conv2D.
+  // TODO(ezhulenev): Add support for runtime-defined op chains.
+  enum class FusedComputationType {
+    kBiasAdd,
+    kBiasAddWithRelu,
+    kFusedBatchNorm,
+    kFusedBatchNormWithRelu
+  };
+
+  Conv2DParameters params_;
+  FusedComputationType fused_computation_;
+
+  // FusedBatchNorm attributes.
+  float epsilon_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
+  TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DOp);
 };
 
-#define REGISTER_FUSED(T)                                                 \
-  REGISTER_KERNEL_BUILDER(                                                \
-      Name("FusedResizeAndPadConv2D")                                     \
-          .Device(DEVICE_CPU)                                             \
-          .TypeConstraint<T>("T"),                                        \
-      FusedResizeConv2DUsingGemmOp<                                       \
-          T,                                                              \
-          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                       BILINEAR>,                         \
-          true>);
-
-TF_CALL_half(REGISTER_FUSED);
-TF_CALL_float(REGISTER_FUSED);
-TF_CALL_double(REGISTER_FUSED);
-
-#define REGISTER_PAD_ONLY_FUSED(T)                                        \
-  REGISTER_KERNEL_BUILDER(                                                \
-      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
-      FusedResizeConv2DUsingGemmOp<                                       \
-          T,                                                              \
-          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                       NEAREST>,                          \
-          false>);
-
-TF_CALL_half(REGISTER_PAD_ONLY_FUSED);
-TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
-TF_CALL_double(REGISTER_PAD_ONLY_FUSED);
+#define REGISTER_FUSED_CONV2D(T)                                      \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("_FusedConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      FusedConv2DOp<T>);
+
+// If we're using the alternative GEMM-based implementation of Conv2D for the
+// CPU implementation, don't register this EigenTensor-based version.
+// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+// contractions with non-default contraction output kernels.
+#if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
+TF_CALL_float(REGISTER_FUSED_CONV2D);
+TF_CALL_double(REGISTER_FUSED_CONV2D);
+#endif  // !USE_GEMM_FOR_CONV
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7be1de29c951dca16085e35587d02eeeec01354f
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
@@ -0,0 +1,902 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements convolution operations with image transformations (resize and
+// mirror padding) baked into the processing, to optimize latency and memory
+// usage.
+
+#define EIGEN_USE_THREADS
+
+#include <string>
+#include <vector>
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_ops.h"
+#include "tensorflow/core/kernels/gemm_functors.h"
+#include "tensorflow/core/kernels/image_resizer_state.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/util/mirror_pad_mode.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+namespace {
+
+// We don't want to allocate a buffer to hold all the patches if the size is
+// going to be extremely large, so break it into chunks if it's bigger than
+// a limit. Each chunk will be processed serially, so we can refill the
+// buffer for the next chunk and reuse it, keeping maximum memory size down.
+// In this case, we've picked 16 megabytes as a reasonable limit for Android and
+// other platforms using Eigen, and 1MB for iOS devices, from experimentation.
+#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
+const size_t kMaxChunkSize = (1 * 1024 * 1024);
+#else
+const size_t kMaxChunkSize = (16 * 1024 * 1024);
+#endif
+const size_t kResizeCacheSize = (8 * 1024 * 1024);
+
+// Lookup method used when resizing.
+enum SamplingMode {
+  BILINEAR = 0,
+  NEAREST = 1,
+};
+
+// Simple utility function used by FusedConv to multithread basic workloads. To
+// use it, pass begin and end values for the full workload and a std::function
+// that receives a subset of that through the begin and end values for each
+// worker's task. The division of the full workload into worker tasks is handled
+// by the multithreading logic. Here's an example of how to use it:
+// std::vector<float> my_vector(100);
+// ...
+// FusedConvParallelFor(context, 0, 100,
+//   [&my_vector](int64 task_begin, int64 task_end) {
+//     for (int64 current = task_begin; current != task_end; ++current) {
+//       my_vector[current] *= 10.0f;
+//     }
+// });
+void FusedConvParallelFor(
+    OpKernelContext* context, int64 begin, int64 end,
+    const std::function<void(int64, int64)>& task_function) {
+// On iOS, the thread management imposes a very big performance penalty, so
+// just call the function directly with no multithreading.
+#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
+  task_function(begin, end);
+#else
+  auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+  thread::ThreadPool* thread_pool = worker_threads.workers;
+  const int64 total_elements = end - begin;
+  // This is a bit of an arbitrary number, but was found to work well for
+  // typical models we've been profiling on various devices.
+  const int64 element_cost = 10000000;
+  thread_pool->ParallelFor(
+      total_elements, element_cost,
+      [begin, task_function](int64 begin_offset, int64 end_offset) {
+        const int64 task_begin = begin + begin_offset;
+        const int64 task_end = begin + end_offset;
+        task_function(task_begin, task_end);
+      });
+#endif
+}
+
+// Holds the state needed for the resizing subtasks.
+template <class T1>
+struct ResizeTaskParameters {
+  ResizeTaskParameters() : st(false) {}
+
+  int cache_height;
+  T1* resize_cache;
+  int cache_line_width;
+  int input_width;
+  int input_depth;
+  int top_padding;
+  int pad_offset;
+  int64 resized_height;
+  ImageResizerState st;
+  const T1* input_batch_start;
+  int64 cache_start_x;
+  int64 cache_end_x;
+  int left_padding;
+  int64 resized_width;
+  int64 padded_width;
+  int64 padded_height;
+};
+
+template <class T1>
+struct PerCacheLineParameters {
+  PerCacheLineParameters() {}
+  PerCacheLineParameters(const PerCacheLineParameters<T1>& other)
+      : cache_line_start(other.cache_line_start),
+        input_top_row_start(other.input_top_row_start),
+        input_bottom_row_start(other.input_bottom_row_start),
+        y_lerp(other.y_lerp) {}
+
+  T1* cache_line_start;
+  const T1* input_top_row_start;
+  const T1* input_bottom_row_start;
+  T1 y_lerp;
+};
+
+// Helper class to simplify bilinear filtering
+template <class T1>
+struct SampleRect {
+  EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right,
+                                 const T1* in_bottom_left,
+                                 const T1* in_bottom_right)
+      : top_left(in_top_left),
+        top_right(in_top_right),
+        bottom_left(in_bottom_left),
+        bottom_right(in_bottom_right) {}
+
+  EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp,
+                                        T1 y_lerp) const {
+    const T1 top =
+        top_left[channel] + (top_right[channel] - top_left[channel]) * x_lerp;
+    const T1 bottom = bottom_left[channel] +
+                      (bottom_right[channel] - bottom_left[channel]) * x_lerp;
+    return top + (bottom - top) * y_lerp;
+  }
+
+  const T1* top_left;
+  const T1* top_right;
+  const T1* bottom_left;
+  const T1* bottom_right;
+};
+
+// Calculates parameters which remain constant through a resize cache row.
+template <class T1>
+EIGEN_ALWAYS_INLINE PerCacheLineParameters<T1> CalculatePerCacheLineParameters(
+    int64 cache_height, int64 cache_y, T1* resize_cache, int64 cache_line_width,
+    int64 input_width, int64 input_depth, int64 top_padding, int64 pad_offset,
+    int64 resized_height, const ImageResizerState& st,
+    const T1* input_batch_start) {
+  PerCacheLineParameters<T1> result;
+  // The cache is organized so that the real y values of the resized image map
+  // onto the actual cache values through a modulo scheme. This means that as we
+  // progress downwards through the image, we keep reusing a small cache and so
+  // keep memory usage down.
+  int64 cache_index_y;
+  if (cache_y < 0) {
+    cache_index_y = cache_height + (cache_y % cache_height);
+  } else {
+    cache_index_y = cache_y % cache_height;
+  }
+  result.cache_line_start =
+      resize_cache + (cache_index_y * cache_line_width * input_depth);
+  // This part is implementing the mirror padding that happens before resizing.
+  float in_y = (cache_y - top_padding);
+  if (in_y < 0) {
+    in_y = -(in_y + 1.0f - pad_offset);
+  } else if (in_y >= resized_height) {
+    in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset);
+  }
+  // Here's where do do the actual resize.
+  in_y *= st.height_scale;
+  const int64 top_y_index = static_cast<int64>(std::floor(in_y));
+  const int64 bottom_y_index =
+      std::min(static_cast<int64>(std::ceil(in_y)), (st.in_height - 1));
+  // Lerp is used for bilinear filtering when that's needed.
+  result.y_lerp = static_cast<T1>(in_y - top_y_index);
+  // Which rows of the original input image to pull the values from.
+  result.input_top_row_start =
+      input_batch_start + (top_y_index * input_width * input_depth);
+  result.input_bottom_row_start =
+      input_batch_start + (bottom_y_index * input_width * input_depth);
+  return result;
+}
+
+template <class T1>
+struct PerCachePixelParameters {
+  PerCachePixelParameters() {}
+  PerCachePixelParameters(const PerCachePixelParameters<T1>& other)
+      : cache_line_pixel(other.cache_line_pixel),
+        left_x_index(other.left_x_index),
+        right_x_index(other.right_x_index),
+        x_lerp(other.x_lerp) {}
+
+  T1* cache_line_pixel;
+  int64 left_x_index;
+  int64 right_x_index;
+  T1 x_lerp;
+};
+
+// Pulls out common parameters used for every resized pixel.
+template <class T1>
+EIGEN_ALWAYS_INLINE PerCachePixelParameters<T1>
+CalculatePerCachePixelParameters(int64 cache_x, int64 cache_start_x,
+                                 T1* cache_line_start, int64 input_depth,
+                                 int64 left_padding, int64 pad_offset,
+                                 int64 resized_width,
+                                 const ImageResizerState& st) {
+  PerCachePixelParameters<T1> result;
+  // Figure out where we're going to store the results of our transform.
+  const int cache_index_x = cache_x - cache_start_x;
+  result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth);
+  // Implement mirror padding by flipping in_x if it's off the edge.
+  float in_x = (cache_x - left_padding);
+  if (in_x < 0) {
+    in_x = -(in_x + 1.0f - pad_offset);
+  } else if (in_x >= resized_width) {
+    in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset);
+  }
+  // Resize the x parameters.
+  in_x *= st.width_scale;
+  // Get the x coordinates for the left and right pixels to pull from.
+  result.left_x_index = static_cast<int64>(std::floor(in_x));
+  result.right_x_index =
+      std::min(static_cast<int64>(std::ceil(in_x)), (st.in_width - 1));
+  // This x_lerp is used to blend pixels in bilinear filtering.
+  result.x_lerp = static_cast<T1>(in_x - result.left_x_index);
+  return result;
+}
+
+// Combines bilinear resizing and mirror padding into the im2col transformation
+// stage of convolution.
+template <class T1, class T2, class T3, class TGemmFunctor,
+          SamplingMode SampleMode>
+class FusedResizeAndPadConvFunctor {
+ public:
+  void operator()(OpKernelContext* context, const Tensor& input,
+                  int input_batches, int resized_height, int resized_width,
+                  int padded_height, int padded_width, int input_depth,
+                  const T2* filter_data, int filter_height, int filter_width,
+                  int filter_count, int stride_rows, int stride_cols,
+                  Padding padding, T3* output_data, int output_height,
+                  int output_width, const ImageResizerState& st,
+                  int top_padding, int bottom_padding, int left_padding,
+                  int right_padding, int pad_offset) {
+    if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) ||
+        (input_depth <= 0)) {
+      LOG(WARNING) << "Conv2D was called with bad input dimensions: "
+                   << input_batches << ", " << padded_height << ", "
+                   << padded_width << ", " << input_depth;
+      return;
+    }
+    if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) {
+      LOG(WARNING) << "Conv2D was called with bad filter dimensions: "
+                   << filter_width << ", " << filter_height << ", "
+                   << filter_count;
+      return;
+    }
+    if ((output_width <= 0) || (output_height <= 0)) {
+      LOG(WARNING) << "Conv2D was called with bad output width or height: "
+                   << output_width << ", " << output_height;
+      return;
+    }
+    OP_REQUIRES(
+        context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)),
+        errors::InvalidArgument("Bad sample mode passed in", SampleMode));
+
+    // These calculations define how the patches will be positioned within the
+    // input image. The actual definitions are quite complex, and rely on the
+    // previously-calculated output size.
+    int filter_left_offset;
+    int filter_top_offset;
+    if (padding == VALID) {
+      filter_left_offset =
+          ((output_width - 1) * stride_cols + filter_width - padded_width + 1) /
+          2;
+      filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
+                           padded_height + 1) /
+                          2;
+    } else {
+      filter_left_offset =
+          ((output_width - 1) * stride_cols + filter_width - padded_width) / 2;
+      filter_top_offset =
+          ((output_height - 1) * stride_rows + filter_height - padded_height) /
+          2;
+    }
+
+    ResizeTaskParameters<T1> task_params;
+    task_params.input_depth = input_depth;
+    task_params.top_padding = top_padding;
+    task_params.pad_offset = pad_offset;
+    task_params.resized_height = resized_height;
+    task_params.st = st;
+    task_params.left_padding = left_padding;
+    task_params.resized_width = resized_width;
+    task_params.padded_width = padded_width;
+    task_params.padded_height = padded_height;
+
+    // The im2col buffer has # of patches rows, and # of filters cols.
+    // It's laid out like this, in row major order in memory:
+    //        < filter value count >
+    //   ^   +---------------------+
+    // patch |                     |
+    // count |                     |
+    //   v   +---------------------+
+    // Each patch row contains a filter_width x filter_height patch of the
+    // input, with the depth channel as the most contiguous in memory, followed
+    // by the width, then the height. This is the standard memory order in the
+    // image world if it helps to visualize it.
+    const int filter_value_count = filter_width * filter_height * input_depth;
+
+    OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize,
+                errors::InvalidArgument("Im2Col patch too large for buffer"));
+    const size_t patches_per_chunk =
+        kMaxChunkSize / (filter_value_count * sizeof(T1));
+    // Because memory allocation is very expensive on mobile platforms, try to
+    // allocate a persistent buffer that will be kept around between calls. We
+    // use TensorFlow's resource management to ensure that the memory will be
+    // released when the session is over.
+    Im2ColBufferResource<T1, kMaxChunkSize>* im2col_buffer_resource;
+    std::function<Status(Im2ColBufferResource<T1, kMaxChunkSize>**)> creator =
+        [](Im2ColBufferResource<T1, kMaxChunkSize>** resource) {
+          *resource = new Im2ColBufferResource<T1, kMaxChunkSize>();
+          return Status::OK();
+        };
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "im2col_buffer",
+                                &im2col_buffer_resource, creator));
+
+    // Create a resize cache memory buffer that will hold the rows of
+    // transformed and mirror padded input pixels, ready to be copied
+    // into filter patches by im2col.
+    // It's laid out like this, in row major order in memory:
+    //         < cache line width >
+    //   ^    +--------------------+
+    // cache  |                    |
+    // height |                    |
+    //   v    +--------------------+
+    // Each cache row contains a cache_line_width number of resized pixels,
+    // each with input_depth channels. The cache height is typically less than
+    // the full height the resized image would be, so it's filled up
+    // incrementally as we progress downwards through the input creating im2col
+    // patches.
+    task_params.cache_start_x = -filter_left_offset;
+    task_params.cache_end_x =
+        (((output_width - 1) * stride_cols) - filter_left_offset) +
+        filter_width;
+    task_params.cache_line_width =
+        task_params.cache_end_x - task_params.cache_start_x;
+    task_params.cache_height =
+        kResizeCacheSize / (task_params.cache_line_width * input_depth);
+    const int needed_resize_cache_count =
+        filter_height * task_params.cache_line_width * input_depth;
+    OP_REQUIRES(context,
+                (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize,
+                errors::InvalidArgument("Input too large for resize cache"));
+    Im2ColBufferResource<T1, kResizeCacheSize>* resize_cache_resource;
+    std::function<Status(Im2ColBufferResource<T1, kResizeCacheSize>**)>
+        resize_creator =
+            [](Im2ColBufferResource<T1, kResizeCacheSize>** resource) {
+              *resource = new Im2ColBufferResource<T1, kResizeCacheSize>();
+              return Status::OK();
+            };
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "resize_cache",
+                                &resize_cache_resource, resize_creator));
+
+    // This means that multiple ops can't be run simultaneously on different
+    // threads, because we have a single shared resource. The platforms this is
+    // aimed at have intra-op parallelism as their focus though, so it shouldn't
+    // be an issue.
+    mutex_lock lock_buffer(im2col_buffer_resource->mu);
+    core::ScopedUnref unref_buffer(im2col_buffer_resource);
+    T1* im2col_buffer = im2col_buffer_resource->data;
+
+    // This buffer is used as a fairly heavy-weight cache for the resized and
+    // mirrored inputs to the im2col operation. The problem is that we want to
+    // keep the memory usage down by not rendering the fully resized and padded
+    // input tensor to the convolution into an entire buffer. The first approach
+    // to avoid this was to fold the bilinear filtering and padding spatial
+    // transformations into the im2col lookup itself. This successfully reduced
+    // memory usage, but because im2col can access an individual pixel for many
+    // different patches, the extra overhead of doing the same bilinear lookups
+    // repeatedly became too expensive.
+    // The resize cache is designed to avoid this problem by keeping a
+    // horizontal slice of the resized and padded input to the im2col
+    // precalculated, so that repeated accesses to the same pixel from different
+    // filter patches can just be copied from this cache. It's organized as a
+    // horizontal slice stretching across the whole virtual image, and as high
+    // as the filter window, so that as the patch processing moves across all
+    // the pixels are present, and before a new row of patches is started any
+    // previously calculated rows that are needed are maintained, with new rows
+    // calculated as required.
+    mutex_lock resize_lock_buffer(resize_cache_resource->mu);
+    core::ScopedUnref unref_resized_cache(resize_cache_resource);
+    task_params.resize_cache = resize_cache_resource->data;
+
+    const T1* input_data = input.flat<T1>().data();
+    const int64 input_height = input.shape().dim_sizes()[1];
+    task_params.input_width = input.shape().dim_sizes()[2];
+
+    int end_cached_lines = std::numeric_limits<int>::min();
+
+    for (int batch = 0; batch < input_batches; ++batch) {
+      task_params.input_batch_start =
+          input_data +
+          (batch * input_height * task_params.input_width * input_depth);
+      const int in_y_end =
+          ((output_height * stride_rows) - filter_top_offset) + filter_height;
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
+        const int cache_start_y = std::max(in_y_origin, end_cached_lines);
+        const int cache_end_y = std::min(
+            in_y_end, std::max((in_y_origin + task_params.cache_height),
+                               end_cached_lines));
+        if (end_cached_lines < (in_y_origin + filter_height)) {
+          // This call breaks up the work required for calculating the mirror
+          // padding and resizing across multiple threads.
+          FusedConvParallelFor(
+              context, cache_start_y, cache_end_y,
+              [task_params](int64 task_cache_start_y, int64 task_cache_end_y) {
+                // This is a long and confusing function, but it's been laid out
+                // this way to help with performance on some intensive models.
+                // What it's doing is populating a cache of the original input
+                // image, after it's been bilinear resized and had its edges
+                // mirrored. This allows the following im2col code to access the
+                // transformed pixels from this cache, without having to
+                // repeatedly apply the expensive bilinear calculations as the
+                // same pixels are accessed by different patches.
+                // This is most effective when the stride is small and the
+                // filter size is large, since that's when pixels are reused
+                // most frequently as patches overlap.
+                for (int cache_y = task_cache_start_y;
+                     cache_y < task_cache_end_y; ++cache_y) {
+                  // We organize the cache as a series of rows, each containing
+                  // all the transformed pixels for a given line in the image.
+                  // This cache is big enough to hold at least a filter's height
+                  // worth of rows, but typically more, limited by the size of
+                  // the cache buffer.
+                  // We don't allocate an entire image's worth of rows though,
+                  // because we're trying to keep memory usage down, so as we
+                  // progress downwards through the im2col we periodically
+                  // refresh the cache so that the next lines that are needed
+                  // for that operation are always present.
+                  // Work out the parameters that remain constant across the
+                  // row we're calculating.
+                  PerCacheLineParameters<T1> line_params(
+                      CalculatePerCacheLineParameters<T1>(
+                          task_params.cache_height, cache_y,
+                          task_params.resize_cache,
+                          task_params.cache_line_width, task_params.input_width,
+                          task_params.input_depth, task_params.top_padding,
+                          task_params.pad_offset, task_params.resized_height,
+                          task_params.st, task_params.input_batch_start));
+                  // Iterate through the resize cache row we're filling in.
+                  for (int cache_x = task_params.cache_start_x;
+                       cache_x < task_params.cache_end_x; ++cache_x) {
+                    // Figure out what we need for the cache pixel we're
+                    // populating.
+                    PerCachePixelParameters<T1> pixel_params(
+                        CalculatePerCachePixelParameters<T1>(
+                            cache_x, task_params.cache_start_x,
+                            line_params.cache_line_start,
+                            task_params.input_depth, task_params.left_padding,
+                            task_params.pad_offset, task_params.resized_width,
+                            task_params.st));
+                    // If the access is off the left, right, top, or bottom of
+                    // the resized image, the conv padding means we should set
+                    // it to zero.
+                    if ((cache_x < 0) ||
+                        (cache_x >= task_params.padded_width) ||
+                        (cache_y < 0) ||
+                        (cache_y >= task_params.padded_height)) {
+                      std::fill_n(pixel_params.cache_line_pixel,
+                                  task_params.input_depth, T1(0));
+                    } else {
+                      // There are two different sampling strategies for
+                      // resizing. When using nearest, we can just do a
+                      // straight copy of the pixel closest to our sample point,
+                      // but bilinear requires a more complex calculation.
+                      if (SampleMode == NEAREST) {
+                        const T1* input_top_left_pixel =
+                            line_params.input_top_row_start +
+                            (pixel_params.left_x_index *
+                             task_params.input_depth);
+
+                        std::copy_n(input_top_left_pixel,
+                                    task_params.input_depth,
+                                    pixel_params.cache_line_pixel);
+                      } else {
+                        const SampleRect<T1> rect(
+                            line_params.input_top_row_start +
+                                (pixel_params.left_x_index *
+                                 task_params.input_depth),
+                            line_params.input_top_row_start +
+                                (pixel_params.right_x_index *
+                                 task_params.input_depth),
+                            line_params.input_bottom_row_start +
+                                (pixel_params.left_x_index *
+                                 task_params.input_depth),
+                            line_params.input_bottom_row_start +
+                                (pixel_params.right_x_index *
+                                 task_params.input_depth));
+                        for (int in_channel = 0;
+                             in_channel < task_params.input_depth;
+                             ++in_channel) {
+                          pixel_params.cache_line_pixel[in_channel] =
+                              rect.BilinearSample(in_channel,
+                                                  pixel_params.x_lerp,
+                                                  line_params.y_lerp);
+                        }
+                      }
+                    }
+                  }
+                }
+              });
+          end_cached_lines = cache_end_y;
+        }
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
+          const int patch_index = (batch * output_width * output_height) +
+                                  (out_y * output_width) + out_x;
+          const int patch_index_within_chunk = patch_index % patches_per_chunk;
+          T1* im2col_patch_start =
+              im2col_buffer + (patch_index_within_chunk * filter_value_count);
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            T1* im2col_row_start =
+                im2col_patch_start +
+                (filter_y * filter_width * task_params.input_depth);
+            const int conv_in_y = in_y_origin + filter_y;
+            int cache_index_y;
+            if (conv_in_y < 0) {
+              cache_index_y = task_params.cache_height +
+                              (conv_in_y % task_params.cache_height);
+            } else {
+              cache_index_y = conv_in_y % task_params.cache_height;
+            }
+            T1* cache_line_start =
+                task_params.resize_cache +
+                (cache_index_y * task_params.cache_line_width *
+                 task_params.input_depth);
+            T1* cache_filter_row_start =
+                cache_line_start + ((in_x_origin - task_params.cache_start_x) *
+                                    task_params.input_depth);
+            std::copy_n(cache_filter_row_start,
+                        (filter_width * task_params.input_depth),
+                        im2col_row_start);
+          }
+          const bool is_last_in_chunk =
+              (patch_index_within_chunk == (patches_per_chunk - 1));
+          const bool is_last_overall =
+              ((batch == (input_batches - 1)) &&
+               (out_y == (output_height - 1)) && (out_x == (output_width - 1)));
+          if (is_last_in_chunk || is_last_overall) {
+            // Now we've assembled a set of image patches into a matrix, apply
+            // a GEMM matrix multiply of the patches as rows, times the filter
+            // weights in columns, to get partial results in the output
+            // matrix.
+            const int how_many_patches = patch_index_within_chunk + 1;
+            const int m = how_many_patches;
+            const int n = filter_count;
+            const int k = filter_value_count;
+            const int lda = filter_value_count;
+            const int ldb = filter_count;
+            const int ldc = filter_count;
+            const size_t start_patch_index =
+                patch_index - (how_many_patches - 1);
+            T3* chunk_output_data =
+                output_data + (start_patch_index * filter_count);
+            TGemmFunctor gemm_functor;
+            gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb,
+                         chunk_output_data, ldc);
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace
+
+// Implements a version of convolution with bilinear resizing and mirror padding
+// included.
+template <class T, class TConvFunctor, bool DoResize>
+class FusedResizeConv2DUsingGemmOp : public OpKernel {
+ public:
+  explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    if (DoResize) {
+      OP_REQUIRES_OK(context,
+                     context->GetAttr("resize_align_corners", &align_corners_));
+    }
+    MirrorPadMode mode;
+    OP_REQUIRES_OK(context, context->GetAttr("mode", &mode));
+
+    switch (mode) {
+      case MirrorPadMode::SYMMETRIC: {
+        offset_ = 0;
+        break;
+      }
+      case MirrorPadMode::REFLECT: {
+        offset_ = 1;
+        break;
+      }
+      default:
+        OP_REQUIRES(context, false,
+                    errors::InvalidArgument(
+                        "mode must be either REFLECT or SYMMETRIC."));
+    }
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    OP_REQUIRES(context, strides_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    const int64 stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N');
+    const int64 stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C');
+    OP_REQUIRES(
+        context, stride_n == 1 && stride_c == 1,
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Input tensor is of the following dimensions:
+    // [ batch, in_rows, in_cols, in_depth ]
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(context, (input.shape().num_elements() > 0),
+                errors::InvalidArgument("Input tensor can't be empty"));
+
+    ImageResizerState st(false);
+    if (DoResize) {
+      st = ImageResizerState(align_corners_);
+      st.ValidateAndCalculateOutputSize(context, input);
+      if (!context->status().ok()) return;
+    } else {
+      // Set up the resize parameters to do no scaling at all.
+      st.batch_size = input.dim_size(0);
+      st.out_height = input.dim_size(1);
+      st.out_width = input.dim_size(2);
+      st.in_height = input.dim_size(1);
+      st.in_width = input.dim_size(2);
+      st.channels = input.dim_size(3);
+      st.height_scale = 1.0f;
+      st.width_scale = 1.0f;
+    }
+    TensorShape resized_shape(
+        {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)});
+    int paddings_index;
+    int filter_index;
+    if (DoResize) {
+      paddings_index = 2;
+      filter_index = 3;
+    } else {
+      paddings_index = 1;
+      filter_index = 2;
+    }
+    const Tensor& paddings = context->input(paddings_index);
+
+    const int dims = resized_shape.dims();
+    OP_REQUIRES(
+        context,
+        TensorShapeUtils::IsMatrix(paddings.shape()) &&
+            paddings.dim_size(1) == 2,
+        errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
+                                paddings.shape().DebugString()));
+    const int fixed_dims =
+        (allow_legacy_scalars() && dims == 0 && paddings.dim_size(0) == 1)
+            ? 1
+            : dims;
+    OP_REQUIRES(
+        context, fixed_dims == paddings.dim_size(0),
+        errors::InvalidArgument(
+            "The first dimension of paddings must be the rank of inputs: ",
+            fixed_dims, " ", paddings.shape().DebugString(), " ",
+            resized_shape.DebugString()));
+    OP_REQUIRES(
+        context, dims == paddings.dim_size(0),
+        errors::InvalidArgument(
+            "The first dimension of paddings must be the rank of inputs: ",
+            dims, " ", paddings.shape().DebugString(), " ",
+            resized_shape.DebugString()));
+
+    OP_REQUIRES(
+        context, dims == 4,
+        errors::InvalidArgument(
+            "Fused mirror padding only supports four-dimensional inputs, but ",
+            dims, " requested"));
+
+    // Compute the shape of the output tensor, and allocate it.
+    TensorShape padded_shape;
+    TTypes<int32>::ConstMatrix paddings_matrix = paddings.matrix<int32>();
+    for (int d = 0; d < dims; ++d) {
+      const int32 before =
+          paddings_matrix(d, 0);  // Pad before existing elements.
+      const int32 after =
+          paddings_matrix(d, 1);  // Pad after existing elements.
+      OP_REQUIRES(context, before >= 0 && after >= 0,
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
+      if (offset_ == 0) {  // SYMMETRIC mode.
+        OP_REQUIRES(
+            context,
+            before <= resized_shape.dim_size(d) &&
+                after <= resized_shape.dim_size(d),
+            errors::InvalidArgument("paddings must be no greater "
+                                    "than the dimension size: ",
+                                    before, ", ", after, " greater than ",
+                                    resized_shape.dim_size(d)));
+      } else if (offset_ == 1) {  // REFLECT mode.
+        OP_REQUIRES(
+            context,
+            before < resized_shape.dim_size(d) &&
+                after < resized_shape.dim_size(d),
+            errors::InvalidArgument("paddings must be less than"
+                                    " the dimension size: ",
+                                    before, ", ", after, " not less than ",
+                                    resized_shape.dim_size(d)));
+      }
+      padded_shape.AddDim(before + resized_shape.dim_size(d) + after);
+    }
+
+    OP_REQUIRES(
+        context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)),
+        errors::InvalidArgument(
+            "Fused mirror padding only support spatial padding, not batches: ",
+            paddings.DebugString()));
+    OP_REQUIRES(
+        context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)),
+        errors::InvalidArgument(
+            "Fused mirror padding only support spatial padding, not channels: ",
+            paddings.DebugString()));
+    const int32 top_padding = paddings_matrix(1, 0);
+    const int32 bottom_padding = paddings_matrix(1, 1);
+    const int32 left_padding = paddings_matrix(2, 0);
+    const int32 right_padding = paddings_matrix(2, 1);
+
+    // Input filter is of the following dimensions:
+    // [ filter_rows, filter_cols, in_depth, out_depth]
+    const Tensor& filter = context->input(filter_index);
+
+    // For 2D convolution, there should be 4 dimensions.
+    OP_REQUIRES(context, padded_shape.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        padded_shape.DebugString()));
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().DebugString()));
+
+    // We only check the first three dims, since the depth is accessed as an
+    // int64 below.
+    for (int i = 0; i < 3; i++) {
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
+    }
+
+    // The last dimension for input is in_depth. It must be the same as the
+    // filter's in_depth.
+    const int64 in_depth = padded_shape.dim_size(3);
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
+
+    // The last dimension for filter is out_depth.
+    const int out_depth = static_cast<int>(filter.dim_size(3));
+
+    // The second dimension for input is rows/height.
+    // The first dimension for filter is rows/height.
+    const int64 padded_rows_raw = padded_shape.dim_size(1);
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
+    const int padded_rows = static_cast<int>(padded_rows_raw);
+    const int filter_rows = static_cast<int>(filter.dim_size(0));
+    const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
+
+    // The third dimension for input is columns/width.
+    // The second dimension for filter is columns/width.
+    const int64 padded_cols_raw = padded_shape.dim_size(2);
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
+    const int padded_cols = static_cast<int>(padded_cols_raw);
+    const int filter_cols = static_cast<int>(filter.dim_size(1));
+    const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
+
+    // The first dimension for input is batch.
+    const int64 batch_raw = padded_shape.dim_size(0);
+    OP_REQUIRES(context,
+                FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
+                errors::InvalidArgument("batch is too large"));
+    const int batch = static_cast<int>(batch_raw);
+
+    // For now we take the stride from the second and third dimensions only (we
+    // do not support striding on the batch or depth dimension).
+    const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H');
+    const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W');
+
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(padded_rows, filter_rows, stride_rows,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(padded_cols, filter_cols, stride_cols,
+                                         padding_, &out_cols, &pad_cols));
+    TensorShape out_shape =
+        ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth);
+    OP_REQUIRES(context, (out_shape.num_elements() > 0),
+                errors::InvalidArgument("Output tensor can't be empty"));
+
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth
+            << ", padded_cols = " << padded_cols
+            << ", resized_cols = " << resized_cols
+            << ", filter_cols = " << filter_cols
+            << ", padded_rows = " << padded_rows
+            << ", resized_rows = " << resized_rows
+            << ", filter_rows = " << filter_rows
+            << ", stride_rows = " << stride_rows
+            << ", stride_cols = " << stride_cols
+            << ", out_depth = " << out_depth << ", DoResize=" << DoResize;
+
+    // If there is nothing to compute, return.
+    if (out_shape.num_elements() == 0) {
+      return;
+    }
+    TConvFunctor conv_functor;
+    conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows,
+                 padded_cols, in_depth, filter.flat<T>().data(), filter_rows,
+                 filter_cols, out_depth, stride_rows, stride_cols, padding_,
+                 output->flat<T>().data(), out_rows, out_cols, st, top_padding,
+                 bottom_padding, left_padding, right_padding, offset_);
+  }
+
+ private:
+  std::vector<int32> strides_;
+  Padding padding_;
+  bool align_corners_;
+  int offset_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
+};
+
+#define REGISTER_FUSED(T)                                                 \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedResizeAndPadConv2D")                                     \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<T>("T"),                                        \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       BILINEAR>,                         \
+          true>);
+
+TF_CALL_half(REGISTER_FUSED);
+TF_CALL_float(REGISTER_FUSED);
+TF_CALL_double(REGISTER_FUSED);
+
+#define REGISTER_PAD_ONLY_FUSED(T)                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       NEAREST>,                          \
+          false>);
+
+TF_CALL_half(REGISTER_PAD_ONLY_FUSED);
+TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
+TF_CALL_double(REGISTER_PAD_ONLY_FUSED);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
deleted file mode 100644
index 46167db3a2b44da40a2dc60e90d6b0cd900503ec..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ /dev/null
@@ -1,1075 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#if GOOGLE_CUDA
-
-#define EIGEN_USE_GPU
-
-#include <algorithm>
-#include <array>
-#include <limits>
-#include <utility>
-
-#include "cuda/include/cuda.h"
-#include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/conv_2d.h"
-#include "tensorflow/core/lib/math/math_util.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
-#include "tensorflow/core/util/tensor_format.h"
-
-namespace tensorflow {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-namespace functor {
-namespace {
-template <typename T, bool conjugate>
-struct maybe_conj {
-  __device__ static __inline__ T run(T x) {
-    if (conjugate) {
-      return Eigen::numext::conj(x);
-    } else {
-      return x;
-    }
-  }
-};
-
-// Partial specializations for Cuda types used to store complex numbers.
-template <bool conjugate>
-struct maybe_conj<float2, conjugate> {
-  __device__ static __inline__ float2 run(float2 c) {
-    if (conjugate) {
-      float2 c_conj;
-      c_conj.x = c.x;
-      c_conj.y = -c.y;
-      return c_conj;
-    } else {
-      return c;
-    }
-  }
-};
-
-template <bool conjugate>
-struct maybe_conj<double2, conjugate> {
-  __device__ static __inline__ double2 run(double2 c) {
-    if (conjugate) {
-      double2 c_conj;
-      c_conj.x = c.x;
-      c_conj.y = -c.y;
-      return c_conj;
-    } else {
-      return c;
-    }
-  }
-};
-
-}  // namespace
-
-// TODO(mjanusz): Move this to a shared util file.
-// A simple array that contains data that can be passed between CPU and GPU.
-template <typename T, int IndexCount, T DefaultValue>
-struct Array {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[](int index) const {
-    return data[index];
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[](int index) {
-    return data[index];
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array() {
-    for (int i = 0; i < IndexCount; i++) {
-      data[i] = DefaultValue;
-    }
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(T a0) {
-    data[0] = a0;
-    for (int i = 1; i < IndexCount; i++) {
-      data[i] = DefaultValue;
-    }
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(T a0, T a1) {
-    data[0] = a0;
-    data[1] = a1;
-    for (int i = 2; i < IndexCount; i++) {
-      data[i] = DefaultValue;
-    }
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(T a0, T a1, T a2) {
-    data[0] = a0;
-    data[1] = a1;
-    data[2] = a2;
-    for (int i = 3; i < IndexCount; i++) {
-      data[i] = DefaultValue;
-    }
-  }
-  EIGEN_STRONG_INLINE Array(const std::array<T, IndexCount>& array) {
-    for (int i = 0; i < IndexCount; i++) {
-      data[i] = array[i];
-    }
-  }
-  T data[IndexCount];
-};
-
-// A dimension type with compile-time known size.
-template <int IndexCount>
-struct Dimension : Array<int, IndexCount, 1> {
-  typedef Array<int, IndexCount, 1> Base;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension() : Base() {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension(int a0) : Base(a0) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension(int a0, int a1)
-      : Base(a0, a1) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Dimension(int a0, int a1, int a2)
-      : Base(a0, a1, a2) {}
-  EIGEN_STRONG_INLINE Dimension(const std::array<int, IndexCount>& array)
-      : Base(array) {}
-};
-
-// An index type with compile-time known size.
-template <int IndexCount>
-struct Index : Array<int, IndexCount, 0> {
-  typedef Array<int, IndexCount, 0> Base;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index() : Base() {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index(int a0) : Base(a0) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index(int a0, int a1) : Base(a0, a1) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index(int a0, int a1, int a2)
-      : Base(a0, a1, a2) {}
-};
-
-// A helper function that converts a tensor index into a flat array index.
-template <int IndexCount>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int TensorIndexToFlat(
-    const Index<IndexCount>& index, const Dimension<IndexCount>& dims) {
-  int flat_index = index[0];
-  for (int i = 1; i < IndexCount; i++) {
-    flat_index = flat_index * dims[i] + index[i];
-  }
-  return flat_index;
-}
-
-// A helper function that converts a flat array index into a tensor index.
-template <int IndexCount>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index<IndexCount> FlatToTensorIndex(
-    int index, const Dimension<IndexCount>& dims) {
-  Index<IndexCount> tensor_index;
-  for (int i = IndexCount - 1; i >= 0; i--) {
-    int new_index = index / dims[i];
-    tensor_index[i] = index - dims[i] * new_index;
-    index = new_index;
-  }
-  return tensor_index;
-}
-
-// A simple CUDA custom kernel to shuffle dimensions of a 3D tensor according to
-// the given shuffle permutation in template parameters. Shuffle permutation
-// <sp0, sp1, sp2> shuffles dimensions such that input dimension 0 goes to sp0,
-// 1 goes to sp1 and 2 goes to sp2. For example, shuffle permutation <2, 0, 1>
-// will populate output so that input[x][y][z] is equal to (*output)[y][z][x].
-//
-// Requires that nthreads is equal to the total number of elements in the input
-// tensor.
-template <typename T, int sp0, int sp1, int sp2, bool conjugate = false>
-__global__ void ShuffleInTensor3Simple(int nthreads, const T* input,
-                                       Dimension<3> input_dims, T* output) {
-  Dimension<3> output_dims;
-  output_dims[sp0] = input_dims[0];
-  output_dims[sp1] = input_dims[1];
-  output_dims[sp2] = input_dims[2];
-
-  // Iterate over output as opposed to iterating over input for better
-  // performance. Iterating over output will generate sequential writes and
-  // random reads that performs better compared to sequential reads and random
-  // writes.
-  CUDA_1D_KERNEL_LOOP(output_index, nthreads) {
-    Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims);
-
-    Index<3> input_tensor_index;
-    input_tensor_index[0] = output_tensor_index[sp0];
-    input_tensor_index[1] = output_tensor_index[sp1];
-    input_tensor_index[2] = output_tensor_index[sp2];
-
-    int input_index = TensorIndexToFlat(input_tensor_index, input_dims);
-
-    output[output_index] =
-        maybe_conj<T, conjugate>::run(ldg(input + input_index));
-  }
-}
-
-// Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor,
-// where dimensions are zero-based: output[i][j][k] = input[i][k][j].
-//
-// Each thread block operates on a single tile, a rectangle of dimensions
-// TileSizeI x TileSizeJ.
-//
-// In general, for best performance, you should probably set TileSizeI,
-// TileSizeJ equal to the number of threads in a warp (32 in nvidia GPUs).
-// With a TileSizeI, TileSizeJ of 32, NumThreads of 128 or 256 seems to get
-// the best performance on K40 GPUs.
-template <typename T, int NumThreads, int TileSizeI, int TileSizeJ,
-          bool conjugate = false>
-__global__ void SwapDimension1And2InTensor3UsingTiles(
-    const T* __restrict__ input, Dimension<3> input_dims,
-    T* __restrict__ output) {
-  eigen_assert(blockDim.x == NumThreads);
-  eigen_assert(blockDim.y == 1);
-  eigen_assert(blockDim.z == 1);
-  eigen_assert(gridDim.y == 1);
-  eigen_assert(gridDim.z == 1);
-
-  constexpr int ReadRowPerPass = NumThreads / TileSizeJ;
-  constexpr int WriteRowPerPass = NumThreads / TileSizeI;
-  // One extra line in the inner dimension to avoid share memory bank conflict.
-  // This is to mimic the following, but no constructor of T can be invoked.
-  //     __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
-  __shared__ __align__(
-      alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
-  typedef T(*SharedMemoryTile)[TileSizeJ + 1];
-  SharedMemoryTile shared_memory_tile =
-      reinterpret_cast<SharedMemoryTile>(shared_mem_raw);
-
-  int x = threadIdx.x;
-
-  Dimension<3> output_dims = {
-      input_dims[0],
-      input_dims[2],
-      input_dims[1],
-  };
-
-  Dimension<3> input_dims_in_tiles = {
-      input_dims[0],
-      (input_dims[1] + TileSizeI - 1) / TileSizeI,
-      (input_dims[2] + TileSizeJ - 1) / TileSizeJ,
-  };
-
-  Index<3> input_tile_index =
-      FlatToTensorIndex(blockIdx.x, input_dims_in_tiles);
-
-  Index<3> input_tile_origin = {
-      input_tile_index[0],
-      input_tile_index[1] * TileSizeI,
-      input_tile_index[2] * TileSizeJ,
-  };
-
-  int input_origin_flat_index =
-      TensorIndexToFlat(input_tile_origin, input_dims);
-
-  bool full_tile = true;
-  int tile_width = TileSizeJ;
-
-  // Only the last row or column may not have the full size.
-  if (input_tile_index[2] == input_dims_in_tiles[2] - 1) {
-    tile_width = input_dims[2] - (input_dims_in_tiles[2] - 1) * TileSizeJ;
-    full_tile &= false;
-  }
-
-  int tile_height = TileSizeI;
-
-  if (input_tile_index[1] == input_dims_in_tiles[1] - 1) {
-    tile_height = input_dims[1] - (input_dims_in_tiles[1] - 1) * TileSizeI;
-    full_tile &= false;
-  }
-
-  // Calculate effective thread number. This ensures that we use the largest
-  // number of threads available to form a regular thread block with no
-  // trailing incomplete lines.
-  constexpr int in_effective_thread_num = NumThreads / TileSizeJ * TileSizeJ;
-
-  if (x < in_effective_thread_num) {
-    // Orient the logical thread block with respect to the input array.
-    // ie. align the contiguous dimension of thread blocks with the contiguous
-    // dimension of the input array.
-    int ti = x / TileSizeJ;
-    int tj = x % TileSizeJ;
-    int input_index = input_origin_flat_index + ti * input_dims[2] + tj;
-    int input_increment = ReadRowPerPass * input_dims[2];
-
-    if (full_tile) {
-#pragma unroll
-      for (int i_loc = ti; i_loc < (TileSizeI); i_loc += ReadRowPerPass) {
-        shared_memory_tile[i_loc][tj] =
-            maybe_conj<T, conjugate>::run(input[input_index]);
-        input_index += input_increment;
-      }
-    } else {
-      if (tj < tile_width) {
-        for (int i_loc = ti; i_loc < (tile_height); i_loc += ReadRowPerPass) {
-          shared_memory_tile[i_loc][tj] =
-              maybe_conj<T, conjugate>::run(input[input_index]);
-          input_index += input_increment;
-        }
-      }
-    }
-  }
-
-  __syncthreads();
-
-  Index<3> output_tile_index = {
-      input_tile_index[0],
-      input_tile_index[2],
-      input_tile_index[1],
-  };
-
-  Index<3> output_tile_origin = {
-      output_tile_index[0],
-      output_tile_index[1] * TileSizeJ,
-      output_tile_index[2] * TileSizeI,
-  };
-
-  int output_origin_flat_index =
-      TensorIndexToFlat(output_tile_origin, output_dims);
-
-  constexpr int out_effective_thread_num = NumThreads / TileSizeI * TileSizeI;
-
-  if (x < out_effective_thread_num) {
-    // Re-orient the logical thread block with respect to the output array.
-    // ie. align the contiguous dimension of thread blocks with contiguous
-    // dimension of the output array.
-    int ti = x / TileSizeI;
-    int tj = x % TileSizeI;
-    int output_index = output_origin_flat_index + ti * output_dims[2] + tj;
-    int output_increment = WriteRowPerPass * output_dims[2];
-
-    if (full_tile) {
-#pragma unroll
-      for (int i_loc = ti; i_loc < (TileSizeJ); i_loc += WriteRowPerPass) {
-        output[output_index] = shared_memory_tile[tj][i_loc];
-        output_index += output_increment;
-      }
-    } else {
-      if (tj < tile_height) {
-        for (int i_loc = ti; i_loc < (tile_width); i_loc += WriteRowPerPass) {
-          output[output_index] = shared_memory_tile[tj][i_loc];
-          output_index += output_increment;
-        }
-      }
-    }
-  }
-}
-
-// A Cuda custom kernel that convert input to output, given proper padding on
-// the left and the top. The padded value is zero.
-template <typename T, int NDIMS>
-__global__ void PadInputCustomKernelNHWC(int nthreads, const T* input,
-                                         Dimension<NDIMS> input_dims, T* output,
-                                         Dimension<NDIMS> output_dims,
-                                         Dimension<NDIMS - 2> padding_left) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int output_index = index;
-    Index<NDIMS> output_tensor_index =
-        FlatToTensorIndex(output_index, output_dims);
-
-    Index<NDIMS> input_tensor_index;
-    input_tensor_index[0] = output_tensor_index[0];  // batch
-    bool ok = true;
-    for (int i = 1; i < NDIMS - 1; i++) {
-      input_tensor_index[i] = output_tensor_index[i] - padding_left[i - 1];
-      ok &=
-          (input_tensor_index[i] >= 0 && input_tensor_index[i] < input_dims[i]);
-    }
-    input_tensor_index[NDIMS - 1] = output_tensor_index[NDIMS - 1];  // channels
-
-    if (ok) {
-      const int input_index = TensorIndexToFlat(input_tensor_index, input_dims);
-      output[output_index] = input[input_index];
-    } else {
-      output[output_index] = T(0);
-    }
-  }
-}
-
-template <typename T, int NDIMS>
-__global__ void PadInputCustomKernelNCHW(int nthreads, const T* input,
-                                         Dimension<NDIMS> input_dims, T* output,
-                                         Dimension<NDIMS> output_dims,
-                                         Dimension<NDIMS - 2> padding_left) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int output_index = index;
-    Index<NDIMS> output_tensor_index =
-        FlatToTensorIndex(output_index, output_dims);
-
-    Index<NDIMS> input_tensor_index;
-    input_tensor_index[0] = output_tensor_index[0];  // batch
-    input_tensor_index[1] = output_tensor_index[1];  // channels
-    bool ok = true;
-    for (int i = 2; i < NDIMS; i++) {
-      input_tensor_index[i] = output_tensor_index[i] - padding_left[i - 2];
-      ok &=
-          (input_tensor_index[i] >= 0 && input_tensor_index[i] < input_dims[i]);
-    }
-
-    if (ok) {
-      const int input_index = TensorIndexToFlat(input_tensor_index, input_dims);
-      output[output_index] = input[input_index];
-    } else {
-      output[output_index] = T(0);
-    }
-  }
-}
-
-// A GPU helper function that converts TensorFlow filter format to Cudnn filter
-// format.
-template <typename T, int NDIMS>
-struct TransformFilter<GPUDevice, T, int, NDIMS> {
-  typedef GPUDevice Device;
-  void operator()(const Device& d, FilterTensorFormat dst_filter_format,
-                  typename TTypes<T, NDIMS, int>::ConstTensor in,
-                  typename TTypes<T, NDIMS, int>::Tensor out) {
-    Dimension<3> combined_dims;
-    combined_dims[0] = in.dimension(0);  // spatial dimensions
-    for (int i = 1; i < NDIMS - 2; i++) {
-      combined_dims[0] *= in.dimension(i);
-    }
-    combined_dims[1] = in.dimension(NDIMS - 2);  // input filters
-    combined_dims[2] = in.dimension(NDIMS - 1);  // output filters
-    CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
-
-    CHECK(dst_filter_format == FORMAT_OIHW)
-        << "Unsupported output layout: " << ToString(dst_filter_format);
-
-    ShuffleInTensor3Simple<T, 2, 1, 0>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, in.data(), combined_dims, out.data());
-  }
-};
-
-// Converts Cudnn filter format OIHW back to TensorFlow filter format HWIO.
-// TODO(hinsu): Support reverse transformation from filter format OHWI as well.
-template <typename T, int NDIMS>
-struct ReverseTransformFilter<GPUDevice, T, NDIMS> {
-  typedef GPUDevice Device;
-  void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
-                  typename TTypes<T, NDIMS>::Tensor out) {
-    Dimension<3> combined_dims;
-    combined_dims[0] = in.dimension(0);  // output filters
-    combined_dims[1] = in.dimension(1);  // input filters
-    combined_dims[2] = in.dimension(2);  // spatial dimensions
-    for (int i = 3; i < NDIMS; ++i) {
-      combined_dims[2] *= in.dimension(i);
-    }
-    CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
-    ShuffleInTensor3Simple<T, 2, 1, 0>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, in.data(), combined_dims, out.data());
-  }
-};
-
-// A GPU helper function that converts input tensor to a larger output tensor,
-// given proper padding values. The padded value is zero.
-template <typename T, int NDIMS>
-struct PadInput<GPUDevice, T, int, NDIMS> {
-  typedef GPUDevice Device;
-  void operator()(const Device& d,
-                  typename TTypes<T, NDIMS, int>::ConstTensor in,
-                  const std::array<int, NDIMS - 2>& padding_left,
-                  const std::array<int, NDIMS - 2>& padding_right,
-                  typename TTypes<T, NDIMS, int>::Tensor out,
-                  TensorFormat format) {
-    CudaLaunchConfig config = GetCudaLaunchConfig(out.size(), d);
-    Dimension<NDIMS> input_dims;
-    for (int i = 0; i < NDIMS; ++i) {
-      input_dims[i] = in.dimension(i);
-    }
-    Dimension<NDIMS> output_dims;
-    for (int i = 0; i < NDIMS; ++i) {
-      output_dims[i] = out.dimension(i);
-    }
-
-    const Dimension<NDIMS - 2> padding_left_dim(padding_left);
-
-    if (format == FORMAT_NHWC) {
-      PadInputCustomKernelNHWC<T, NDIMS>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              config.virtual_thread_count, in.data(), input_dims, out.data(),
-              output_dims, padding_left_dim);
-    } else if (format == FORMAT_NCHW) {
-      PadInputCustomKernelNCHW<T, NDIMS>
-          <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-              config.virtual_thread_count, in.data(), input_dims, out.data(),
-              output_dims, padding_left_dim);
-    } else {
-      LOG(FATAL) << "Invalid data format: " << format;
-    }
-  }
-};
-
-// We want std::equal_to and std::greater, but they're not constexpr until
-// C++14.
-struct EqualTo {
-  constexpr bool operator()(int a, int b) const { return a == b; }
-};
-
-struct GreaterThan {
-  constexpr bool operator()(int a, int b) const { return a > b; }
-};
-
-// For each data type, the tile size possibility frontier denotes the tile size
-// combinations that consume the most computational resources constrained by
-// - number of threads per SM limit,
-// - limit on size of the short dimension (<=15) due to the definition of
-//   narrow matrix,
-// - shared memory limit and
-// - some experimentally determined, type-specific constraint on the product of
-//   two side lengths to increase grid-level parallelism.
-//
-// A tile size combination lies on the frontier if and only if one or more
-// constraint mentioned above is hit. Tile size combinations lying outside this
-// frontier are either not possible, or are slower than the alternatives.
-//
-// It is instrumental to consider, for each data type, two subsets of the
-// corresponding frontier:
-// - long side frontier: the union of the biggest tile size combination for
-//   each legal long side len.
-// - non long side frontier: the frontier set minus the long side frontier.
-//
-// TileSizePossibilityFrontierCheck defines the frontier using only the long
-// side frontier tile size combinations (since one can easily extrapolate
-// the entire frontier from this subset). It serves as a utility function
-// to help us determine where a tile size combination of interest lies with
-// resepect to the frontier.
-template <typename Op>
-constexpr bool TileSizePossibilityFrontierCheck(int TileLongSide,
-                                                int TileShortSide,
-                                                int size_of_t, Op op) {
-  // clang-format off
-
-  return (size_of_t == 16 && ((TileLongSide == 32   && op(TileShortSide, 4))  ||
-                             (TileLongSide == 64   && op(TileShortSide, 4))  ||
-                             (TileLongSide == 128  && op(TileShortSide, 4))  ||
-                             (TileLongSide == 256  && op(TileShortSide, 2)))) ||
-          (size_of_t == 8 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
-                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
-                             (TileLongSide == 128  && op(TileShortSide, 8))  ||
-                             (TileLongSide == 256  && op(TileShortSide, 4))  ||
-                             (TileLongSide == 512  && op(TileShortSide, 2)))) ||
-          (size_of_t == 4 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
-                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
-                             (TileLongSide == 128  && op(TileShortSide, 15)) ||
-                             (TileLongSide == 256  && op(TileShortSide, 8))  ||
-                             (TileLongSide == 512  && op(TileShortSide, 4))  ||
-                             (TileLongSide == 1024 && op(TileShortSide, 2)))) ||
-          (size_of_t == 2 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
-                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
-                             (TileLongSide == 128  && op(TileShortSide, 15)) ||
-                             (TileLongSide == 256  && op(TileShortSide, 8))  ||
-                             (TileLongSide == 512  && op(TileShortSide, 4))  ||
-                             (TileLongSide == 1024 && op(TileShortSide, 2)))) ||
-          (size_of_t == 1 && ((TileLongSide == 32   && op(TileShortSide, 15)) ||
-                             (TileLongSide == 64   && op(TileShortSide, 15)) ||
-                             (TileLongSide == 128  && op(TileShortSide, 15)) ||
-                             (TileLongSide == 256  && op(TileShortSide, 8))  ||
-                             (TileLongSide == 512  && op(TileShortSide, 4))  ||
-                             (TileLongSide == 1024 && op(TileShortSide, 2))));
-
-  // clang-format on
-}
-
-constexpr bool TileSizeOnLongSideFrontier(int TileLongSide, int TileShortSide,
-                                          int size_of_t) {
-  return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
-                                          size_of_t, EqualTo());
-}
-constexpr bool TileSizeOutsideFrontier(int TileLongSide, int TileShortSide,
-                                       int size_of_t) {
-  return TileSizePossibilityFrontierCheck(TileLongSide, TileShortSide,
-                                          size_of_t, GreaterThan());
-}
-constexpr bool TileSizeOnNonLongSideFrontier(int TileLongSide,
-                                             int TileShortSide, int size_of_t) {
-  // For a tile size combination (longside, shortside), lying on the frontier
-  // implies that (longside, shortside) is on or within the frontier but
-  // (longside*2, shortside) or (longside, shortside+1) is not. With the above
-  // criterion, we simply need to use !TileSizeOnLongSideFrontier to ensure that
-  // it is not on the long side frontier.
-  return !TileSizeOutsideFrontier(TileLongSide, TileShortSide, size_of_t) &&
-         (TileSizeOutsideFrontier(TileLongSide * 2, TileShortSide, size_of_t) ||
-          TileSizeOutsideFrontier(TileLongSide, TileShortSide + 1,
-                                  size_of_t)) &&
-         !TileSizeOnLongSideFrontier(TileLongSide, TileShortSide, size_of_t);
-}
-
-// Helper function to launch a batch narrow matirx transpose kernel.
-template <typename T, int TileLongSide, int TileShortSide>
-void LaunchBatchNarrowMatrixTransposeKernel(
-    const GPUDevice& d, int tile_size_i, int tile_size_j, int total_tiles_count,
-    const T* input, const Dimension<3>& input_dims, T* output) {
-  constexpr int NumThreads = TileLongSide;
-  if (tile_size_i <= TileLongSide && tile_size_j <= TileShortSide) {
-    SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileLongSide,
-                                          TileShortSide>
-        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
-                                                           output);
-  } else {
-    SwapDimension1And2InTensor3UsingTiles<T, NumThreads, TileShortSide,
-                                          TileLongSide>
-        <<<total_tiles_count, NumThreads, 0, d.stream()>>>(input, input_dims,
-                                                           output);
-  }
-}
-
-// Recursive template function to search, in a trial-and-error manner, for the
-// minimum tile size configuration satisfying the requested tile side lengths.
-// An important invariant of this search procedure is that for an unsatisfied
-// request, we always try doubling the long side len first, and only after
-// the request is satisfied for the long side len do we begin incrementing
-// the short side len.
-//
-// We have three specializations of this search function depending on where the
-// current tile size combination lies with respect to the frontier.
-// - It lies within the frontier. If request is not satisfied, for the next tile
-// size combination, we first try doubling the long side len and if that does
-// not work, we then increment the short side len.
-// - It lies on the non long side frontier. If the request is not satisfied, we
-// can only increment the short side len.
-// - It lies on the long side frontier. We launch the kernel without checking if
-// the request is satisfied or not.
-template <typename T, int TileLongSide, int TileShortSide,
-          typename dummy = void>
-struct BatchNarrowMatrixTransposeDispatcher {
-  static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
-                   int total_tiles_count, const T* input,
-                   const Dimension<3>& input_dims, T* output) {
-    static_assert(
-        (TileLongSide & (TileLongSide - 1)) == 0,
-        "The length of the longer side of the tile is always a power of 2.");
-    bool request_satisfied =
-        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
-        std::min(tile_size_i, tile_size_j) <= TileShortSide;
-
-    if (request_satisfied) {
-      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
-          d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
-          output);
-      return;
-    }
-
-    // If the execution reaches here, then the kernel was not launched; we then
-    // determine whether it is the long side or the short side that falls short
-    // of the request and increase that parameter accordingly.
-    const bool long_side_request_not_satisfied =
-        std::max(tile_size_i, tile_size_j) > TileLongSide;
-
-    if (long_side_request_not_satisfied) {
-      BatchNarrowMatrixTransposeDispatcher<
-          T, TileLongSide * 2, TileShortSide>::DoIt(d, tile_size_i, tile_size_j,
-                                                    total_tiles_count, input,
-                                                    input_dims, output);
-    } else {
-      BatchNarrowMatrixTransposeDispatcher<
-          T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
-                                                    total_tiles_count, input,
-                                                    input_dims, output);
-    }
-  }
-};
-
-template <typename T, int TileLongSide, int TileShortSide>
-struct BatchNarrowMatrixTransposeDispatcher<
-    T, TileLongSide, TileShortSide,
-    typename std::enable_if<TileSizeOnNonLongSideFrontier(
-                                TileLongSide, TileShortSide, sizeof(T)),
-                            void>::type> {
-  static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
-                   int total_tiles_count, const T* input,
-                   const Dimension<3>& input_dims, T* output) {
-    static_assert(
-        (TileLongSide & (TileLongSide - 1)) == 0,
-        "The length of the longer side of the tile is always a power of 2.");
-    bool request_satisfied =
-        std::max(tile_size_i, tile_size_j) <= TileLongSide &&
-        std::min(tile_size_i, tile_size_j) <= TileShortSide;
-
-    if (request_satisfied) {
-      LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
-          d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
-          output);
-      return;
-    }
-
-    // If the execution reaches here, then the kernel was not launched; since
-    // we are on the non long side frontier, we increment the short dimension
-    // and try again.
-    BatchNarrowMatrixTransposeDispatcher<
-        T, TileLongSide, TileShortSide + 1>::DoIt(d, tile_size_i, tile_size_j,
-                                                  total_tiles_count, input,
-                                                  input_dims, output);
-  }
-};
-
-template <typename T, int TileLongSide, int TileShortSide>
-struct BatchNarrowMatrixTransposeDispatcher<
-    T, TileLongSide, TileShortSide,
-    typename std::enable_if<TileSizeOnLongSideFrontier(
-                                TileLongSide, TileShortSide, sizeof(T)),
-                            void>::type> {
-  static void DoIt(const GPUDevice& d, int tile_size_i, int tile_size_j,
-                   int total_tiles_count, const T* input,
-                   const Dimension<3>& input_dims, T* output) {
-    static_assert(
-        (TileLongSide & (TileLongSide - 1)) == 0,
-        "The length of the longer side of the tile is always a power of 2.");
-
-    LaunchBatchNarrowMatrixTransposeKernel<T, TileLongSide, TileShortSide>(
-        d, tile_size_i, tile_size_j, total_tiles_count, input, input_dims,
-        output);
-  }
-};
-
-// This function tries to recover, in a brute force way, the frontier defined in
-// TileSizePossibilityFrontierCheck as a vector of tile size combinations lying
-// on the long side frontier. This vector is sufficient to determine the entire
-// frontier.
-//
-// Note that if one changes the frontier definition in
-// TileSizePossibilityFrontierCheck and forgets to set the largest short
-// side len of the largest legal long side len to 2, this function will fail
-// and crash the program.
-template <int SizeOfT>
-const std::vector<std::pair<int, int>>& GetTileSizesFrontier() {
-  static_assert(
-      SizeOfT <= 16,
-      "Currently, only data types of sizes 16 bytes or less are supported.");
-  static_assert((SizeOfT & (SizeOfT - 1)) == 0,
-                "Data types must have sizes that are powers of 2.");
-
-  // Expensive work to populate sizes, lazily run in a thread-safe
-  // manner the first time GetTileSizesFrontier<N> is called.
-  static auto* frontier = [] {
-    auto* frontier = new std::vector<std::pair<int, int>>();
-    const int kMaxLongSideLen = 1024;
-    const int kMaxShortSideLen = 15;
-    for (int long_side = 32; long_side <= kMaxLongSideLen; long_side *= 2) {
-      for (int short_side = 2; short_side <= kMaxShortSideLen;
-           short_side += 1) {
-        if (TileSizeOnLongSideFrontier(long_side, short_side, SizeOfT)) {
-          // The current combination lies on the frontier, thus we
-          // add it to the frontier definition.
-          frontier->push_back(std::make_pair(long_side, short_side));
-
-          // The long side length is the largest one allowed iff its
-          // corresponding short side length is 2.
-          if (short_side == 2) return frontier;
-
-          // We have exhausted all the possibilities in the frontier
-          // with the given long side length.
-          break;
-        }
-      }
-    }
-    LOG(FATAL)
-        << "The corresponding short side length of the largest long side "
-           "length has to be 2.";
-  }();
-  return *frontier;
-}
-
-// Helper structs to help determine which data type to use given the size of
-// the matrix data type. A transpose of elements of size N will use a kernel
-// which operates on an array of TransposeElemType<N>::type.
-template <int ElemBytes>
-struct TransposeElemType;
-template <>
-struct TransposeElemType<1> {
-  using type = uint8;
-};
-template <>
-struct TransposeElemType<2> {
-  using type = uint16;
-};
-template <>
-struct TransposeElemType<4> {
-  using type = uint32;
-};
-template <>
-struct TransposeElemType<8> {
-  using type = uint64;
-};
-template <>
-struct TransposeElemType<16> {
-  using type = float4;
-};
-
-// A helper function to make RunSwapDimension1And2InTensor3 concise. This
-// helper function looks at the data type and input matrix sizes and decides
-// the thread numbers and tile sizes to use.
-template <typename T, bool conjugate = false>
-void SwapDimension1And2InTensor3WithNarrowMatrices(
-    const GPUDevice& d, const T* input, const Dimension<3>& input_dims,
-    T* output, const int kMinDimensionToUseTiles) {
-  // Get available tile sizes here for the data type requested:
-  const auto& tile_spec = GetTileSizesFrontier<sizeof(T)>();
-
-  int tile_long_side_len = 0;
-  int tile_short_side_len = 0;
-  float lowest_cost = std::numeric_limits<float>::max();
-  int data_long_side = std::max(input_dims[1], input_dims[2]);
-
-  for (auto tile_size_pair : tile_spec) {
-    int proposed_tile_long_side_len = tile_size_pair.first;
-
-    // Number of threads that will not be doing anything useful when reading
-    // the matrix because the thread block size is bigger than the data block
-    // size.
-    int num_wasted_threads =
-        data_long_side - MathUtil::FloorOfRatio<int>(
-                             data_long_side, proposed_tile_long_side_len) *
-                             proposed_tile_long_side_len;
-
-    int num_full_tiles = MathUtil::FloorOfRatio<int>(
-        data_long_side, proposed_tile_long_side_len);
-
-    float cost = 0;
-
-    // However, if we can execute two or more full tiles, then we gladly
-    // accept any number of wasted threads and ignore its cost.
-    if (num_full_tiles <= 1) cost = num_wasted_threads;
-
-    // Using less than or equal to here because given the same cost, we
-    // would like to launch as many threads as possible.
-    if (cost <= lowest_cost) {
-      tile_long_side_len = proposed_tile_long_side_len;
-      tile_short_side_len = tile_size_pair.second;
-      lowest_cost = cost;
-    }
-  }
-
-  // Request tile sizes such that the longer side of threadblock aligns with
-  // the longer side of input data block to maximize read throughput.
-  // The ideal tile shape is one where the length of the shorter side of the
-  // tile is equal to the length of the shorter side of the input matrix.
-  int requested_tile_size_i = input_dims[1] >= kMinDimensionToUseTiles
-                                  ? tile_long_side_len
-                                  : input_dims[1];
-  int requested_tile_size_j = input_dims[1] >= kMinDimensionToUseTiles
-                                  ? input_dims[2]
-                                  : tile_long_side_len;
-
-  // Truncate the shorter size requested according to the manual limit set in
-  // tile_spec to make sure that we do not launch configurations violating
-  // hardware limits.
-  requested_tile_size_i =
-      requested_tile_size_i == tile_long_side_len
-          ? tile_long_side_len
-          : std::min(requested_tile_size_i, tile_short_side_len);
-  requested_tile_size_j =
-      requested_tile_size_j == tile_long_side_len
-          ? tile_long_side_len
-          : std::min(requested_tile_size_j, tile_short_side_len);
-
-  Dimension<3> input_dims_in_tiles = {
-      input_dims[0],
-      MathUtil::CeilOfRatio<int>(input_dims[1], requested_tile_size_i),
-      MathUtil::CeilOfRatio<int>(input_dims[2], requested_tile_size_j),
-  };
-
-  int total_tiles_count =
-      input_dims_in_tiles[0] * input_dims_in_tiles[1] * input_dims_in_tiles[2];
-
-  using ElemType = typename TransposeElemType<sizeof(T)>::type;
-  static_assert(alignof(T) >= alignof(ElemType), "Unexpected data alignment.");
-  BatchNarrowMatrixTransposeDispatcher<ElemType, 32, 2>::DoIt(
-      d, requested_tile_size_i, requested_tile_size_j, total_tiles_count,
-      reinterpret_cast<const ElemType*>(input), input_dims,
-      reinterpret_cast<ElemType*>(output));
-}
-
-// Launch the GPU kernel that would swap dimension-1 and dimension-2 in a
-// 3D tensor. It looks at the shape of the incoming data, and decides the best
-// strategy to launch.
-template <typename T, bool conjugate = false>
-void RunSwapDimension1And2InTensor3(const GPUDevice& d, const T* input,
-                                    const Dimension<3>& input_dims, T* output) {
-  // If both dimensions are not trivial, use tiles for the actual swapping.
-  // If one dimension is trivial, use SmallDim kernel for swapping.
-  // Otherwise, the trivial swapping relying on the ldg cache is more efficient.
-  static const int kMinDimensionToUseTiles = 16;
-  static const int kMinDimensionToUseRectTiles = 96;
-
-  bool large_matrix = input_dims[1] >= kMinDimensionToUseTiles &&
-                      input_dims[2] >= kMinDimensionToUseTiles;
-  bool narrow_matrix = input_dims[1] >= kMinDimensionToUseRectTiles ||
-                       input_dims[2] >= kMinDimensionToUseRectTiles;
-  if (large_matrix) {
-    // We get best performance when kTileSize is the number of threads in a warp
-    // (32 on our GPUs) and NumSubTiles is 8, so our block size is 8 * 32 = 256
-    // threads.
-    constexpr int kTileSize = 32;
-    constexpr int kNumThreads = 256;
-
-    Dimension<3> input_dims_in_tiles = {
-        input_dims[0],
-        MathUtil::CeilOfRatio<int>(input_dims[1], kTileSize),
-        MathUtil::CeilOfRatio<int>(input_dims[2], kTileSize),
-    };
-
-    int total_tiles_count = input_dims_in_tiles[0] * input_dims_in_tiles[1] *
-                            input_dims_in_tiles[2];
-    SwapDimension1And2InTensor3UsingTiles<T, kNumThreads, kTileSize, kTileSize,
-                                          conjugate>
-        <<<total_tiles_count, kNumThreads, 0, d.stream()>>>(input, input_dims,
-                                                            output);
-
-  } else if (narrow_matrix) {
-    SwapDimension1And2InTensor3WithNarrowMatrices<T, conjugate>(
-        d, input, input_dims, output, kMinDimensionToUseTiles);
-  } else {
-    int total_element_count = input_dims[0] * input_dims[1] * input_dims[2];
-    CudaLaunchConfig config = GetCudaLaunchConfig(total_element_count, d);
-    ShuffleInTensor3Simple<T, 0, 2, 1, conjugate>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, input, input_dims, output);
-  }
-}
-
-// A GPU helper functor that does general dimension 1 and 2 switch for 3D
-// tensor.
-template <typename T, bool conjugate>
-struct SwapDimension1And2InTensor3<GPUDevice, T, conjugate> {
-  typedef GPUDevice Device;
-  void operator()(const Device& d, const T* in,
-                  const gtl::ArraySlice<int64>& combined_dims, T* out) {
-    Dimension<3> input_dims = {static_cast<int>(combined_dims[0]),
-                               static_cast<int>(combined_dims[1]),
-                               static_cast<int>(combined_dims[2])};
-    RunSwapDimension1And2InTensor3<T, conjugate>(d, in, input_dims, out);
-  }
-};
-
-// A GPU helper functor that does general dimension 0 and 2 switch for 3D
-// tensor.
-template <typename T, bool conjugate>
-struct SwapDimension0And2InTensor3<GPUDevice, T, conjugate> {
-  typedef GPUDevice Device;
-  void operator()(const Device& d, const T* in,
-                  const gtl::ArraySlice<int64>& combined_dims, T* out) {
-    Dimension<3> input_dims = {static_cast<int>(combined_dims[0]),
-                               static_cast<int>(combined_dims[1]),
-                               static_cast<int>(combined_dims[2])};
-    size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2];
-    CudaLaunchConfig config = GetCudaLaunchConfig(total_size, d);
-    ShuffleInTensor3Simple<T, 2, 1, 0, conjugate>
-        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
-            config.virtual_thread_count, in, input_dims, out);
-  }
-};
-
-// A GPU helper functor that converts NHWC TensorFlow data format to
-// NCHW format that is accepted by Cudnn.
-template <typename T, int NDIMS>
-struct NHWCToNCHW<GPUDevice, T, NDIMS> {
-  typedef GPUDevice Device;
-  void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
-                  typename TTypes<T, NDIMS>::Tensor out) {
-    Dimension<3> combined_dims;
-    combined_dims[0] = in.dimension(0);  // N (batch)
-    combined_dims[1] = in.dimension(1);  // spatial dimensions (HW)
-    for (int i = 2; i < NDIMS - 1; ++i) {
-      combined_dims[1] *= in.dimension(i);
-    }
-    combined_dims[2] = in.dimension(NDIMS - 1);  // C (channels)
-    RunSwapDimension1And2InTensor3(d, in.data(), combined_dims, out.data());
-  }
-};
-
-// A GPU helper functor that converts NCHW Cudnn data format to NHWC TensorFlow
-// Format.
-template <typename T, int NDIMS>
-struct NCHWToNHWC<GPUDevice, T, NDIMS> {
-  typedef GPUDevice Device;
-  void operator()(const Device& d, typename TTypes<T, NDIMS>::ConstTensor in,
-                  typename TTypes<T, NDIMS>::Tensor out) {
-    Dimension<3> combined_dims;
-    combined_dims[0] = in.dimension(0);  // N (batch)
-    combined_dims[1] = in.dimension(1);  // C (channel)
-    combined_dims[2] = in.dimension(2);  // spatial dimensions (HW)
-    for (int i = 3; i < NDIMS; ++i) {
-      combined_dims[2] *= in.dimension(i);
-    }
-    RunSwapDimension1And2InTensor3(d, in.data(), combined_dims, out.data());
-  }
-};
-
-}  // namespace functor
-
-template struct functor::ShuffleAndReverse<GPUDevice, float, 4, int>;
-template struct functor::ShuffleAndReverse<GPUDevice, Eigen::half, 4, int>;
-
-template struct functor::ShuffleAndReverse<GPUDevice, float, 4,
-                                           Eigen::DenseIndex>;
-template struct functor::ShuffleAndReverse<GPUDevice, Eigen::half, 4,
-                                           Eigen::DenseIndex>;
-
-template struct functor::TransformDepth<GPUDevice, float, int>;
-template struct functor::TransformDepth<GPUDevice, Eigen::half, int>;
-
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint8>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint16>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint32>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint64>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, float4>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, float2,
-                                                     /*conjugate=*/true>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, double2,
-                                                     /*conjugate=*/true>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, Eigen::half>;
-
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint8>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint16>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint32>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint64>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, float4>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, float2,
-                                                     /*conjugate=*/true>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, double2,
-                                                     /*conjugate=*/true>;
-
-// For 2d ops.
-template struct functor::TransformFilter<GPUDevice, double, int, 4>;
-template struct functor::TransformFilter<GPUDevice, float, int, 4>;
-template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
-
-template struct functor::ReverseTransformFilter<GPUDevice, double, 4>;
-template struct functor::ReverseTransformFilter<GPUDevice, float, 4>;
-template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 4>;
-
-template struct functor::NHWCToNCHW<GPUDevice, double, 4>;
-template struct functor::NHWCToNCHW<GPUDevice, float, 4>;
-template struct functor::NHWCToNCHW<GPUDevice, Eigen::half, 4>;
-
-template struct functor::NCHWToNHWC<GPUDevice, double, 4>;
-template struct functor::NCHWToNHWC<GPUDevice, float, 4>;
-template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 4>;
-
-template struct functor::PadInput<GPUDevice, int, int, 4>;
-template struct functor::PadInput<GPUDevice, double, int, 4>;
-template struct functor::PadInput<GPUDevice, float, int, 4>;
-template struct functor::PadInput<GPUDevice, Eigen::half, int, 4>;
-
-// For 3d ops.
-template struct functor::TransformFilter<GPUDevice, float, int, 5>;
-template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 5>;
-
-template struct functor::ReverseTransformFilter<GPUDevice, float, 5>;
-template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 5>;
-
-template struct functor::NHWCToNCHW<GPUDevice, float, 5>;
-template struct functor::NHWCToNCHW<GPUDevice, Eigen::half, 5>;
-
-template struct functor::NCHWToNHWC<GPUDevice, float, 5>;
-template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 5>;
-
-template struct functor::PadInput<GPUDevice, float, int, 5>;
-template struct functor::PadInput<GPUDevice, Eigen::half, int, 5>;
-
-}  // namespace tensorflow
-
-#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index 1236f27051898e88f580a139f1d6cbf95dd0411b..bf98acdecfd1a3b8a946648c105f0d313f2296ab 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+#include <vector>
+
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/image_ops.h"
 #include "tensorflow/cc/ops/nn_ops.h"
@@ -27,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
@@ -522,4 +526,933 @@ TEST_F(ConvOpTest, HandwrittenConv) { HandwrittenConv(); }
 
 TEST_F(ConvOpTest, AnisotropicStride) { AnisotropicStrides(); }
 
+template <typename T>
+class FusedConv2DOpTest : public OpsTestBase {
+ protected:
+  static constexpr int kDepth = 3;
+  static constexpr int kImageWidth = 32;
+  static constexpr int kImageHeight = 32;
+  static constexpr int kImageBatchCount = 8;
+
+  using BiasAddGraphRunner =
+      std::function<void(const Tensor& input_data, const Tensor& filter_data,
+                         const Tensor& bias_data, Tensor* out)>;
+
+  using BatchNormGraphRunner = std::function<void(
+      const Tensor& input_data, const Tensor& filter_data,
+      const Tensor& scale_data, const Tensor& offset_data,
+      const Tensor& mean_data, const Tensor& variance_data, Tensor* out)>;
+
+  // Runs a Tensorflow graph defined by the root scope, and fetches the result
+  // of 'fetch' node into the output Tensor.
+  void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
+                   Tensor* output) {
+    tensorflow::GraphDef graph;
+    TF_ASSERT_OK(root.ToGraphDef(&graph));
+
+    // `FusedConv2D` is available only on CPU, and in this test we don't want to
+    // compare GPU vs CPU numbers, so place all nodes on CPU.
+    for (NodeDef& mutable_node : *graph.mutable_node()) {
+      mutable_node.set_device("/device:CPU:0");
+    }
+
+    // Disable Grappler constant folding for the test graphs.
+    tensorflow::SessionOptions session_options;
+    tensorflow::RewriterConfig* cfg =
+        session_options.config.mutable_graph_options()
+            ->mutable_rewrite_options();
+    cfg->set_constant_folding(tensorflow::RewriterConfig::OFF);
+
+    std::unique_ptr<tensorflow::Session> session(
+        tensorflow::NewSession(session_options));
+
+    TF_ASSERT_OK(session->Create(graph));
+
+    std::vector<Tensor> unfused_tensors;
+    TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
+
+    *output = unfused_tensors[0];
+  }
+
+  void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
+                         const Tensor& bias_data, Tensor* output,
+                         int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    RunAndFetch(root, "with_bias", output);
+  }
+
+  void RunConv2DWithBiasAndRelu(const Tensor& input_data,
+                                const Tensor& filter_data,
+                                const Tensor& bias_data, Tensor* output,
+                                int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
+
+    RunAndFetch(root, "with_relu", output);
+  }
+
+  void RunConv2DWithBatchNorm(const Tensor& input_data,
+                              const Tensor& filter_data,
+                              const Tensor& scale_data,
+                              const Tensor& offset_data,
+                              const Tensor& mean_data,
+                              const Tensor& variance_data, Tensor* output,
+                              int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    ops::FusedBatchNorm::Attrs attr;
+    attr = attr.IsTraining(false);
+
+    auto with_fused_batch_norm = ops::FusedBatchNorm(
+        root.WithOpName("with_fused_batch_norm"), conv,
+        ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
+        ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
+        ops::Const(root.WithOpName("mean"), Input::Initializer(mean_data)),
+        ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
+        attr);
+
+    RunAndFetch(root, "with_fused_batch_norm", output);
+  }
+
+  void RunConv2DWithBatchNormAndRelu(const Tensor& input_data,
+                                     const Tensor& filter_data,
+                                     const Tensor& scale_data,
+                                     const Tensor& offset_data,
+                                     const Tensor& mean_data,
+                                     const Tensor& variance_data,
+                                     Tensor* output, int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    ops::FusedBatchNorm::Attrs attr;
+    attr = attr.IsTraining(false);
+
+    auto with_fused_batch_norm = ops::FusedBatchNorm(
+        root.WithOpName("with_fused_batch_norm"), conv,
+        ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
+        ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
+        ops::Const(root.WithOpName("mean"), Input::Initializer(mean_data)),
+        ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
+        attr);
+
+    auto with_relu =
+        ops::Relu(root.WithOpName("with_relu"), with_fused_batch_norm.y);
+
+    RunAndFetch(root, "with_relu", output);
+  }
+
+  void RunFusedConv2DOp(const Tensor& image, const Tensor& filter,
+                        const std::vector<Tensor>& args,
+                        const std::vector<string>& fused_ops, Tensor* output,
+                        int stride = 1) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    int num_args = static_cast<int>(args.size());
+
+    TF_EXPECT_OK(NodeDefBuilder("fused_conv_op", "_FusedConv2D")
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(dtype))
+                     .Attr("num_args", num_args)
+                     .Input(FakeInput(num_args, dtype))
+                     .Attr("T", dtype)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("padding", "SAME")
+                     .Attr("fused_ops", fused_ops)
+                     .Finalize(node_def()));
+
+    TF_EXPECT_OK(InitOp());
+
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    for (const Tensor& arg : args)
+      AddInputFromArray<T>(arg.shape(), arg.flat<T>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    *output = *GetOutput(0);
+  }
+
+  void VerifyBiasAddTensorsNear(int depth, int image_width, int image_height,
+                                int image_batch_count, int filter_size,
+                                int filter_count,
+                                const BiasAddGraphRunner& run_default,
+                                const BiasAddGraphRunner& run_fused) {
+    DataType dtype = DataTypeToEnum<T>::v();
+
+    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
+    image.flat<T>() = image.flat<T>().setRandom();
+
+    // Add some negative values to filter to properly test Relu.
+    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
+    filter.flat<T>() = filter.flat<T>().setRandom();
+    filter.flat<T>() -= filter.flat<T>().constant(static_cast<T>(0.5f));
+
+    const int bias_size = filter_count;
+    Tensor bias(dtype, {bias_size});
+    bias.flat<T>() = bias.flat<T>().setRandom();
+    bias.flat<T>() += bias.flat<T>().constant(static_cast<T>(0.5f));
+
+    Tensor conv_2d;
+    Tensor fused_conv_2d;
+
+    run_default(image, filter, bias, &conv_2d);
+    run_fused(image, filter, bias, &fused_conv_2d);
+
+    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
+    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
+
+    // NOTE(ezhulenev): When filter size is equal to the input image size, we
+    // effectevily do element-wise product and full sum reduction, and these
+    // operations intoroduce higher than "normal" numerical errors.
+    if (image_width == filter_size && image_height == filter_size) {
+      test::ExpectTensorNear<T>(conv_2d, fused_conv_2d, 1e-3);
+    } else {
+      test::ExpectClose(conv_2d, fused_conv_2d);
+    }
+  }
+
+  void VerifyFusedBatchNormTensorsNear(int depth, int image_width,
+                                       int image_height, int image_batch_count,
+                                       int filter_size, int filter_count,
+                                       const BatchNormGraphRunner& run_default,
+                                       const BatchNormGraphRunner& run_fused) {
+    DataType dtype = DataTypeToEnum<T>::v();
+
+    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
+    image.flat<T>() = image.flat<T>().setRandom();
+
+    // Add some negative values to filter to properly test Relu.
+    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
+    filter.flat<T>() = filter.flat<T>().setRandom();
+    filter.flat<T>() -= filter.flat<T>().constant(static_cast<T>(0.5f));
+
+    const int scale_size = filter_count;
+
+    Tensor scale(dtype, {scale_size});
+    scale.flat<T>() = scale.flat<T>().setRandom();
+
+    Tensor offset(dtype, {scale_size});
+    offset.flat<T>() = offset.flat<T>().setRandom();
+
+    Tensor mean(dtype, {scale_size});
+    mean.flat<T>() = mean.flat<T>().setRandom();
+
+    Tensor variance(dtype, {scale_size});
+    variance.flat<T>() = variance.flat<T>().setRandom();
+    variance.flat<T>() += variance.flat<T>().constant(static_cast<T>(0.5f));
+
+    Tensor conv_2d;
+    Tensor fused_conv_2d;
+
+    run_default(image, filter, scale, offset, mean, variance, &conv_2d);
+    run_fused(image, filter, scale, offset, mean, variance, &fused_conv_2d);
+
+    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
+    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
+
+    // NOTE(ezhulenev): When filter size is equal to the input image size, we
+    // effectevily do element-wise product and full sum reduction, and these
+    // operations intoroduce higher than "normal" numerical errors.
+    if (image_width == filter_size && image_height == filter_size) {
+      test::ExpectTensorNear<T>(conv_2d, fused_conv_2d, 1e-3);
+    } else {
+      test::ExpectClose(conv_2d, fused_conv_2d);
+    }
+  }
+
+  // Verifies that computing Conv2D+BiasAdd in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBias(int filter_size, int filter_count,
+                            int depth = kDepth, int image_width = kImageWidth,
+                            int image_height = kImageHeight,
+                            int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunConv2DWithBias(input_data, filter_data, bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused = [this](const Tensor& input_data,
+                                                const Tensor& filter_data,
+                                                const Tensor& bias_data,
+                                                Tensor* out) {
+      RunFusedConv2DOp(input_data, filter_data, {bias_data}, {"BiasAdd"}, out);
+    };
+
+    VerifyBiasAddTensorsNear(depth, image_width, image_height,
+                             image_batch_count, filter_size, filter_count,
+                             run_default, run_fused);
+  }
+
+  // Verifies that computing Conv2D+BiasAdd+Relu in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBiasAndRelu(int filter_size, int filter_count,
+                                   int depth = kDepth,
+                                   int image_width = kImageWidth,
+                                   int image_height = kImageHeight,
+                                   int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunConv2DWithBiasAndRelu(input_data, filter_data, bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data, {bias_data},
+                           {"BiasAdd", "Relu"}, out);
+        };
+
+    VerifyBiasAddTensorsNear(depth, image_width, image_height,
+                             image_batch_count, filter_size, filter_count,
+                             run_default, run_fused);
+  }
+
+  // Verifies that computing Conv2D+FusedBatchNorm in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBatchNorm(int filter_size, int filter_count,
+                                 int depth = kDepth,
+                                 int image_width = kImageWidth,
+                                 int image_height = kImageHeight,
+                                 int image_batch_count = kImageBatchCount) {
+    const BatchNormGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunConv2DWithBatchNorm(input_data, filter_data, scale_data,
+                                 offset_data, mean_data, variance_data, out);
+        };
+
+    const BatchNormGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data,
+                           {scale_data, offset_data, mean_data, variance_data},
+                           {"FusedBatchNorm"}, out);
+        };
+
+    VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
+                                    image_batch_count, filter_size,
+                                    filter_count, run_default, run_fused);
+  }
+
+  // Verifies that computing Conv2D+FusedBatchNorm+Relu in a graph is identical
+  // to FusedConv2D.
+  void VerifyConv2DWithBatchNormAndRelu(
+      int filter_size, int filter_count, int depth = kDepth,
+      int image_width = kImageWidth, int image_height = kImageHeight,
+      int image_batch_count = kImageBatchCount) {
+    const BatchNormGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunConv2DWithBatchNormAndRelu(input_data, filter_data, scale_data,
+                                        offset_data, mean_data, variance_data,
+                                        out);
+        };
+
+    const BatchNormGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data,
+                           {scale_data, offset_data, mean_data, variance_data},
+                           {"FusedBatchNorm", "Relu"}, out);
+        };
+
+    VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
+                                    image_batch_count, filter_size,
+                                    filter_count, run_default, run_fused);
+  }
+};
+
+// Conv2D with BatchNorm can be tested only with `T=float`, because default
+// `FusedBatchNorm` kernel supports only floats for scale, mean and variance.
+
+template <typename T>
+class FusedConv2DWithBiasOpTest : public FusedConv2DOpTest<T> {};
+template <typename T>
+class FusedConv2DWithBatchNormOpTest : public FusedConv2DOpTest<T> {};
+
+TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest);
+TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest);
+
+// -------------------------------------------------------------------------- //
+// Conv2D + BiasAdd + {Relu}                                                  //
+// -------------------------------------------------------------------------- //
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolution) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, ImageSizeConvolution) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolutionAndRelu) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, ImageSizeConvolutionAndRelu) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolutionAndRelu) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+// -------------------------------------------------------------------------- //
+// Conv2D + FusedBatchNorm + {Relu}                                           //
+// -------------------------------------------------------------------------- //
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolution) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ImageSizeConvolution) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolutionAndRelu) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ImageSizeConvolutionAndRelu) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndRelu) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
+}
+
+REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest,    //
+                           OneByOneConvolution,          //
+                           ImageSizeConvolution,         //
+                           SpatialConvolution,           //
+                           OneByOneConvolutionAndRelu,   //
+                           ImageSizeConvolutionAndRelu,  //
+                           SpatialConvolutionAndRelu);
+
+REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest,  //
+                           OneByOneConvolution,             //
+                           ImageSizeConvolution,            //
+                           SpatialConvolution,              //
+                           OneByOneConvolutionAndRelu,      //
+                           ImageSizeConvolutionAndRelu,     //
+                           SpatialConvolutionAndRelu);
+
+using FusedBiasAddDataTypes = ::testing::Types<float, double>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBiasOpTest,
+                              FusedBiasAddDataTypes);
+
+using FusedBatchNormDataTypes = ::testing::Types<float>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBatchNormOpTest,
+                              FusedBatchNormDataTypes);
+
+////////////////////////////////////////////////////////////////////////////////
+// Performance benchmarks for the FusedConv2DWithBiasOp.                      //
+////////////////////////////////////////////////////////////////////////////////
+
+struct Conv2DGraph {
+  Graph* graph;
+  Node* conv2d;
+};
+
+struct Conv2DWithBiasGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* bias;
+};
+
+struct Conv2DWithBiasAndReluGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* bias;
+  Node* relu;
+};
+
+struct Conv2DWithBatchNormGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* batch_norm;
+};
+
+struct Conv2DWithBatchNormAndReluGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* batch_norm;
+  Node* relu;
+};
+
+static Tensor MakeRandomTensor(const TensorShape& shape) {
+  Tensor tensor(DT_FLOAT, TensorShape(shape));
+  tensor.flat<float>() = tensor.flat<float>().setRandom();
+  return tensor;
+}
+
+// Creates a simple Tensorflow graph with single Conv2D node.
+static Conv2DGraph Conv2D(int batch, int height, int width, int in_depth,
+                          int filter_w, int filter_h, int out_depth) {
+  Graph* graph = new Graph(OpRegistry::Global());
+
+  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
+  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
+
+  Node* images = test::graph::Constant(graph, images_t, "images");
+  Node* filter = test::graph::Constant(graph, filter_t, "filter");
+
+  Node* conv2d;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("conv"), "Conv2D")
+                  .Input(images)
+                  .Input(filter)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("strides", {1, 1, 1, 1})
+                  .Attr("padding", "SAME")
+                  .Finalize(graph, &conv2d));
+
+  return {graph, conv2d};
+}
+
+// Creates a Tensorflow graph with a Conv2D node followed by BiasAdd.
+static Conv2DWithBiasGraph Conv2DWithBias(int batch, int height, int width,
+                                          int in_depth, int filter_w,
+                                          int filter_h, int out_depth) {
+  Conv2DGraph conv_graph =
+      Conv2D(batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+
+  Tensor bias_t = MakeRandomTensor({out_depth});
+  Node* bias = test::graph::Constant(graph, bias_t, "bias");
+
+  Node* out;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("bias"), "BiasAdd")
+                  .Input(conv2d)
+                  .Input(bias)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("data_format", "NHWC")
+                  .Finalize(graph, &out));
+
+  return {graph, conv2d, out};
+}
+
+// Creates a Tensorflow graph with a Conv2D node followed by BiasAdd and Relu.
+static Conv2DWithBiasAndReluGraph Conv2DWithBiasAndRelu(int batch, int height,
+                                                        int width, int in_depth,
+                                                        int filter_w,
+                                                        int filter_h,
+                                                        int out_depth) {
+  Conv2DWithBiasGraph conv_graph = Conv2DWithBias(
+      batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+  Node* bias = conv_graph.bias;
+
+  Node* relu;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("relu"), "Relu")
+                  .Input(bias)
+                  .Attr("T", DT_FLOAT)
+                  .Finalize(graph, &relu));
+
+  return {graph, conv2d, bias, relu};
+}
+
+// Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm.
+static Conv2DWithBatchNormGraph Conv2DWithBatchNorm(int batch, int height,
+                                                    int width, int in_depth,
+                                                    int filter_w, int filter_h,
+                                                    int out_depth) {
+  Conv2DGraph conv_graph =
+      Conv2D(batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+
+  Tensor scale_t = MakeRandomTensor({out_depth});
+  Tensor offset_t = MakeRandomTensor({out_depth});
+  Tensor mean_t = MakeRandomTensor({out_depth});
+  Tensor variance_t = MakeRandomTensor({out_depth});
+
+  Node* scale = test::graph::Constant(graph, scale_t, "scale");
+  Node* offset = test::graph::Constant(graph, offset_t, "offset");
+  Node* mean = test::graph::Constant(graph, mean_t, "mean");
+  Node* variance = test::graph::Constant(graph, variance_t, "variance");
+
+  Node* out;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("batch_norm"), "FusedBatchNorm")
+                  .Input(conv2d)
+                  .Input(scale)
+                  .Input(offset)
+                  .Input(mean)
+                  .Input(variance)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("is_training", false)
+                  .Finalize(graph, &out));
+
+  return {graph, conv2d, out};
+}
+
+// Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm and
+// Relu.
+static Conv2DWithBatchNormAndReluGraph Conv2DWithBatchNormAndRelu(
+    int batch, int height, int width, int in_depth, int filter_w, int filter_h,
+    int out_depth) {
+  Conv2DWithBatchNormGraph conv_graph = Conv2DWithBatchNorm(
+      batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+  Node* batch_norm = conv_graph.batch_norm;
+
+  Node* relu;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("relu"), "Relu")
+                  .Input(batch_norm)
+                  .Attr("T", DT_FLOAT)
+                  .Finalize(graph, &relu));
+
+  return {graph, conv2d, batch_norm, relu};
+}
+
+// Creates a tensorflow graph with a single FusedConv2D (with BiasAdd) node and
+// fuses into it additional computations (e.g. Relu).
+static Graph* FusedConv2DWithBias(int batch, int height, int width,
+                                  int in_depth, int filter_w, int filter_h,
+                                  int out_depth,
+                                  const std::vector<string>& fused_ops = {}) {
+  Graph* graph = new Graph(OpRegistry::Global());
+
+  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
+  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
+  Tensor bias_t = MakeRandomTensor({out_depth});
+
+  Node* images = test::graph::Constant(graph, images_t, "images");
+  Node* filter = test::graph::Constant(graph, filter_t, "filter");
+  Node* bias = test::graph::Constant(graph, bias_t, "bias");
+
+  std::vector<NodeBuilder::NodeOut> args = {bias};
+
+  Node* conv;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("conv"), "_FusedConv2D")
+                  .Input(images)
+                  .Input(filter)
+                  .Attr("num_args", 1)
+                  .Input(args)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("strides", {1, 1, 1, 1})
+                  .Attr("padding", "SAME")
+                  .Attr("fused_ops", fused_ops)
+                  .Finalize(graph, &conv));
+
+  return graph;
+}
+
+// Creates a tensorflow graph with a single FusedConv2D (with FusedBatchNorm)
+// node and fuses into it additional computations (e.g. Relu).
+static Graph* FusedConv2DWithBatchNorm(
+    int batch, int height, int width, int in_depth, int filter_w, int filter_h,
+    int out_depth, const std::vector<string>& fused_ops = {}) {
+  Graph* graph = new Graph(OpRegistry::Global());
+
+  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
+  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
+  Tensor scale_t = MakeRandomTensor({out_depth});
+  Tensor offset_t = MakeRandomTensor({out_depth});
+  Tensor mean_t = MakeRandomTensor({out_depth});
+  Tensor variance_t = MakeRandomTensor({out_depth});
+
+  Node* images = test::graph::Constant(graph, images_t, "images");
+  Node* filter = test::graph::Constant(graph, filter_t, "filter");
+  Node* scale = test::graph::Constant(graph, scale_t, "scale");
+  Node* offset = test::graph::Constant(graph, offset_t, "offset");
+  Node* mean = test::graph::Constant(graph, mean_t, "mean");
+  Node* variance = test::graph::Constant(graph, variance_t, "variance");
+
+  std::vector<NodeBuilder::NodeOut> args = {scale, offset, mean, variance};
+
+  Node* conv;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("conv"), "_FusedConv2D")
+                  .Input(images)
+                  .Input(filter)
+                  .Attr("num_args", 4)
+                  .Input(args)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("strides", {1, 1, 1, 1})
+                  .Attr("padding", "SAME")
+                  .Attr("fused_ops", fused_ops)
+                  .Finalize(graph, &conv));
+
+  return graph;
+}
+
+// Macro arguments names: --------------------------------------------------- //
+//    N: batch size
+//    H: height
+//    W: width
+//    C: channels
+//   FC: filter count
+//   FH: filter height
+//   FW: filter width
+
+#define BM_SETUP(N, H, W, C, type, LABEL, NAME)                               \
+  testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * (C)); \
+  testing::SetLabel(LABEL);
+
+#define BM_NAME(name, type, N, H, W, C, FW, FH, FC) \
+  name##_##type##_##N##_##H##_##W##_##C##_##FW##_##FH##_##FC
+
+#define BM_Conv2D(N, H, W, C, FW, FH, FC, type, LABEL)                       \
+  static void BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC)(int iters) {  \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                               \
+    test::Benchmark(#type, Conv2D(N, H, W, C, FW, FH, FC).graph).Run(iters); \
+  }                                                                          \
+  BENCHMARK(BM_NAME(BM_Conv2D, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL)           \
+  static void BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH,       \
+                      FC)(int iters) {                                   \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                           \
+    test::Benchmark(#type, Conv2DWithBias(N, H, W, C, FW, FH, FC).graph) \
+        .Run(iters);                                                     \
+  }                                                                      \
+  BENCHMARK(BM_NAME(BM_Conv2DWithBias, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                    \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                            \
+    test::Benchmark(#type,                                                \
+                    Conv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC).graph)  \
+        .Run(iters);                                                      \
+  }                                                                       \
+  BENCHMARK(BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL)           \
+  static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH,       \
+                      FC)(int iters) {                                        \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
+    test::Benchmark(#type,                                                    \
+                    FusedConv2DWithBias(N, H, W, C, FW, FH, FC, {"BiasAdd"})) \
+        .Run(iters);                                                          \
+  }                                                                           \
+  BENCHMARK(BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                         \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                 \
+    test::Benchmark(#type, FusedConv2DWithBias(N, H, W, C, FW, FH, FC,         \
+                                               {"BiasAdd", "Relu"}))           \
+        .Run(iters);                                                           \
+  }                                                                            \
+  BENCHMARK(                                                                   \
+      BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL)           \
+  static void BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH,       \
+                      FC)(int iters) {                                        \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
+    test::Benchmark(#type, Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC).graph) \
+        .Run(iters);                                                          \
+  }                                                                           \
+  BENCHMARK(BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                         \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                 \
+    test::Benchmark(#type,                                                     \
+                    Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC).graph)  \
+        .Run(iters);                                                           \
+  }                                                                            \
+  BENCHMARK(                                                                   \
+      BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                       \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                               \
+    test::Benchmark(#type, FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC,  \
+                                                    {"FusedBatchNorm"}))     \
+        .Run(iters);                                                         \
+  }                                                                          \
+  BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type,      \
+                                           LABEL)                             \
+  static void BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C,   \
+                      FW, FH, FC)(int iters) {                                \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
+    test::Benchmark(#type,                                                    \
+                    FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC,          \
+                                             {"FusedBatchNorm", "Relu"}))     \
+        .Run(iters);                                                          \
+  }                                                                           \
+  BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, FW, \
+                    FH, FC));
+
+// -------------------------------------------------------------------------- //
+// Pixel CNN convolutions.
+// -------------------------------------------------------------------------- //
+
+// 1x1 Convolution: MatMulFunctor
+
+BM_Conv2D(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2D(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2D(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+// 1) BiasAdd {+ Relu}
+
+BM_Conv2DWithBias(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBias(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBias(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBias(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBias(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBias(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+// 2) FusedBatchNorm {+ Relu}
+
+BM_Conv2DWithBatchNorm(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBatchNorm(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBatchNorm(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_Conv2DWithBatchNormAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBatchNormAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBatchNormAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBatchNorm(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBatchNorm(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBatchNorm(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBatchNormAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBatchNormAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu,
+                                   "1x1 /b 16");
+BM_FusedConv2DWithBatchNormAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu,
+                                   "1x1 /b 32");
+
+// -------------------------------------------------------------------------- //
+// 3x3 Convolution: SpatialConvolution
+// -------------------------------------------------------------------------- //
+
+BM_Conv2D(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_Conv2D(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_Conv2D(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+// 1) BiasAdd {+ Relu}
+
+BM_Conv2DWithBias(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_Conv2DWithBias(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_Conv2DWithBias(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_FusedConv2DWithBias(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_FusedConv2DWithBias(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_FusedConv2DWithBias(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+// 2) FusedBatchNorm {+ Relu}
+
+BM_Conv2DWithBatchNorm(8, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBatchNorm(16, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBatchNorm(32, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 32");
+
+BM_Conv2DWithBatchNormAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_Conv2DWithBatchNormAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_Conv2DWithBatchNormAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_FusedConv2DWithBatchNorm(8, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBatchNorm(16, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBatchNorm(32, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBatchNormAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_FusedConv2DWithBatchNormAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu,
+                                   "3x3 /b 16");
+BM_FusedConv2DWithBatchNormAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu,
+                                   "3x3 /b 32");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index 04959df38d821aa4d321303c13f8547bcc34accd..fbd702ef14ed2b810b9cb08679baf8688ca58d9c 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -515,14 +515,17 @@ struct CudnnRnnModelShapes {
 // key.
 struct CudnnRnnConfigHasher {
   uint64 operator()(
-      const std::pair<CudnnRnnModelShapes, AlgorithmDesc>& to_hash) const {
+      const std::pair<CudnnRnnModelShapes, absl::optional<AlgorithmDesc>>&
+          to_hash) const {
     auto& shapes = to_hash.first;
     auto& algo_desc = to_hash.second;
 
     uint64 hash =
         HashList({shapes.num_layers, shapes.input_size, shapes.num_units,
                   shapes.dir_count, shapes.batch_size});
-    hash = Hash64Combine(hash, algo_desc.hash());
+    if (algo_desc.has_value()) {
+      hash = Hash64Combine(hash, algo_desc->hash());
+    }
     return hash;
   }
 };
@@ -531,8 +534,9 @@ struct CudnnRnnConfigHasher {
 // table key.
 struct CudnnRnnConfigComparator {
   bool operator()(
-      const std::pair<CudnnRnnModelShapes, AlgorithmDesc>& lhs,
-      const std::pair<CudnnRnnModelShapes, AlgorithmDesc>& rhs) const {
+      const std::pair<CudnnRnnModelShapes, absl::optional<AlgorithmDesc>>& lhs,
+      const std::pair<CudnnRnnModelShapes, absl::optional<AlgorithmDesc>>& rhs)
+      const {
     return lhs.first.IsCompatibleWith(rhs.first) && lhs.second == rhs.second;
   }
 };
@@ -887,10 +891,9 @@ class CudnnRNNKernelCommon : public OpKernel {
     return Status::OK();
   }
 
-  using RnnStateCache =
-      gtl::FlatMap<std::pair<CudnnRnnModelShapes, AlgorithmDesc>,
-                   RnnScratchSpace, CudnnRnnConfigHasher,
-                   CudnnRnnConfigComparator>;
+  using RnnStateCache = gtl::FlatMap<
+      std::pair<CudnnRnnModelShapes, absl::optional<AlgorithmDesc>>,
+      RnnScratchSpace, CudnnRnnConfigHasher, CudnnRnnConfigComparator>;
   // Returns a raw rnn descriptor pointer. The cache owns the rnn descriptor and
   // should outlive the returned pointer.
   template <typename T>
@@ -1317,9 +1320,9 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
       OP_REQUIRES_OK(context, context->allocate_output(4, TensorShape({2}),
                                                        &output_host_reserved));
       auto output_host_reserved_int8 = output_host_reserved->vec<int8>();
-      output_host_reserved_int8(0) = best_algo_config.algorithm().algo_id();
+      output_host_reserved_int8(0) = best_algo_config.algorithm()->algo_id();
       output_host_reserved_int8(1) =
-          best_algo_config.algorithm().tensor_ops_enabled();
+          best_algo_config.algorithm()->tensor_ops_enabled();
     } else {
       OP_REQUIRES_OK(context,
                      context->allocate_output(4, {}, &output_host_reserved));
@@ -1357,6 +1360,10 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
         modeltypes.rnn_mode, modeltypes.rnn_input_mode, input->dtype());
 
     if (AutoTuneRnnConfigMap::GetInstance()->Find(rnn_params, algo_config)) {
+      VLOG(1) << "Using existing best Cudnn RNN algorithm "
+              << "(algo, tensor_op_enabled) = ("
+              << algo_config->algorithm()->algo_id() << ", "
+              << algo_config->algorithm()->tensor_ops_enabled() << ").";
       return Status::OK();
     }
 
@@ -1390,6 +1397,8 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
     }
     ProfileResult best_result;
     for (auto& algo : algorithms) {
+      VLOG(1) << "Profile Cudnn RNN algorithm (algo, tensor_op_enabled) =  ("
+              << algo.algo_id() << ", " << algo.tensor_ops_enabled() << ").";
       Status status;
       ProfileResult final_profile_result;
 
@@ -1438,8 +1447,9 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
       }
 
       auto total_time = final_profile_result.elapsed_time_in_ms();
-      VLOG(1) << "Profile Cudnn RNN algo " << algo.algo_id()
-              << " run time: " << total_time << " ms";
+      VLOG(1) << "Cudnn RNN algorithm (algo, tensor_op_enabled) =  ("
+              << algo.algo_id() << ", " << algo.tensor_ops_enabled() << ")"
+              << " run time: " << total_time << " ms.";
       if (total_time < best_result.elapsed_time_in_ms()) {
         best_result.set_elapsed_time_in_ms(total_time);
         best_result.set_algorithm(algo);
@@ -1450,6 +1460,9 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
       return Status(error::Code::INTERNAL, "No algorithm worked!");
     }
     algo_config->set_algorithm(best_result.algorithm());
+    VLOG(1) << "Best Cudnn RNN algorithm (algo, tensor_op_enabled) =  ("
+            << best_result.algorithm().algo_id() << ", "
+            << best_result.algorithm().tensor_ops_enabled() << ").";
     AutoTuneRnnConfigMap::GetInstance()->Insert(rnn_params, *algo_config);
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
index 627ecc8c802a2bbd428f9cc2160bec379d7b654b..303d8e47913ec8aa3cdd4a79d5573228ab8e177c 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_select.cu.cc
@@ -94,6 +94,7 @@ struct BatchSelectFunctor<GPUDevice, T> {
   template struct SelectScalarFunctor<GPUDevice, T>; \
   template struct BatchSelectFunctor<GPUDevice, T>;
 
+SELECT_FUNCTOR(bool);
 SELECT_FUNCTOR(Eigen::half);
 SELECT_FUNCTOR(float);
 SELECT_FUNCTOR(double);
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index d6988a562c6000bf285136ef3d036748c484d7c9..dd4e4ea547e7738b76796c0e8d174602645b83df 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -159,6 +159,7 @@ TF_CALL_ALL_TYPES(REGISTER_SELECT);
       Name("Select").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
       SelectOp<GPUDevice, type>);
 
+REGISTER_SELECT_GPU(bool);
 REGISTER_SELECT_GPU(Eigen::half);
 REGISTER_SELECT_GPU(float);
 REGISTER_SELECT_GPU(double);
diff --git a/tensorflow/core/kernels/cwise_op_squared_difference.cc b/tensorflow/core/kernels/cwise_op_squared_difference.cc
index 78fefc69c776e2f7b7c44c941e0a1afefdbaf143..d0ff271df6ad0475b970b7303292c8f7ea14396e 100644
--- a/tensorflow/core/kernels/cwise_op_squared_difference.cc
+++ b/tensorflow/core/kernels/cwise_op_squared_difference.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "SquaredDifference", functor::squared_difference,
-          float, Eigen::half, double, int32, int64);
+REGISTER7(BinaryOp, CPU, "SquaredDifference", functor::squared_difference,
+          float, Eigen::half, double, int32, int64, complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "SquaredDifference", functor::squared_difference,
           float, Eigen::half, double, int64);
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 66ba827a9015fcf9875ed6c51cfc3c0e1cc7983d..abfb4a039cf85a14d8cfcd5acf96d35175cf8c95 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -296,27 +296,32 @@ struct less_equal : std::binary_function<T, T, bool> {
   }
 };
 
-// Functor that enables composition of multiple Eigen functors.
-template <typename Scalar, typename UnaryFunctor, typename BinaryFunctor>
-struct scalar_compose_op {
+// Functor that enables squared difference functor.
+template <typename Scalar>
+struct scalar_squared_difference_op {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
   operator()(const Scalar& a, const Scalar& b) const {
-    return UnaryFunctor()(BinaryFunctor()(a, b));
+    const Scalar v = scalar_difference_op<Scalar>()(a, b);
+    return scalar_product_op<Scalar>()(v, scalar_conjugate_op<Scalar>()(v));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
   packetOp(const Packet& a, const Packet& b) const {
-    return UnaryFunctor().packetOp(BinaryFunctor().packetOp(a, b));
+    const Packet v = scalar_difference_op<Scalar>().packetOp(a, b);
+    return scalar_product_op<Scalar>().packetOp(
+        v, scalar_conjugate_op<Scalar>().packetOp(v));
   }
 };
 
-template <typename Scalar, typename UnaryFunctor, typename BinaryFunctor>
-struct functor_traits<scalar_compose_op<Scalar, UnaryFunctor, BinaryFunctor>> {
+template <typename Scalar>
+struct functor_traits<scalar_squared_difference_op<Scalar>> {
   enum {
-    Cost = functor_traits<UnaryFunctor>::Cost +
-           functor_traits<BinaryFunctor>::Cost,
-    PacketAccess = functor_traits<UnaryFunctor>::PacketAccess &&
-                   functor_traits<BinaryFunctor>::PacketAccess
+    Cost = functor_traits<scalar_difference_op<Scalar>>::Cost +
+           functor_traits<scalar_conjugate_op<Scalar>>::Cost +
+           functor_traits<scalar_product_op<Scalar>>::Cost,
+    PacketAccess = functor_traits<scalar_difference_op<Scalar>>::PacketAccess &&
+                   functor_traits<scalar_conjugate_op<Scalar>>::PacketAccess &&
+                   functor_traits<scalar_product_op<Scalar>>::PacketAccess
   };
 };
 
@@ -449,6 +454,27 @@ struct functor_traits<scalar_round_op_google<Scalar>> {
   enum { Cost = 4 * NumTraits<Scalar>::AddCost, PacketAccess = false };
 };
 
+template <typename Scalar>
+struct scalar_round_up_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& x) const {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
+                        NUMERIC_TYPE_MUST_BE_REAL)
+
+    Scalar round_val = Eigen::numext::floor(x);
+    const Scalar fraction = x - round_val;
+    if (fraction >= Scalar(.5)) {
+      round_val += Scalar(1.0);
+    }
+    return round_val;
+  }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_round_up_op<Scalar>> {
+  enum { Cost = 4 * NumTraits<Scalar>::AddCost, PacketAccess = false };
+};
+
 #undef ENABLE_FLOAT_EQUALITY_WARNING
 #undef DISABLE_FLOAT_EQUALITY_WARNING
 
@@ -656,7 +682,7 @@ template <typename T>
 struct erfc : base<T, Eigen::internal::scalar_erfc_op<T>> {};
 
 template <typename T>
-struct sigmoid : base<T, Eigen::internal::scalar_sigmoid_op<T>> {};
+struct sigmoid : base<T, Eigen::internal::scalar_logistic_op<T>> {};
 
 template <typename T>
 struct sin : base<T, Eigen::internal::scalar_sin_op<T>> {};
@@ -754,7 +780,7 @@ struct rint : base<T, scalar_rint_op<T>> {};
 // pow(x, y) = x ^ y
 // maximum(x, y) = x > y ? x : y
 // minimum(x, y) = x < y ? x : y
-// squared_difference(x, y) = (x - y) * (x - y)
+// squared_difference(x, y) = conj(x - y) * (x - y)
 
 template <typename T>
 struct add : base<T, Eigen::internal::scalar_sum_op<T>> {
@@ -864,9 +890,7 @@ struct atan2 : base<T, scalar_atan2_op<T>> {};
 
 template <typename T>
 struct squared_difference
-    : base<T, Eigen::internal::scalar_compose_op<
-                  T, Eigen::internal::scalar_square_op<T>,
-                  Eigen::internal::scalar_difference_op<T>>> {};
+    : base<T, Eigen::internal::scalar_squared_difference_op<T>> {};
 
 template <typename T>
 struct xdivy : base<T, Eigen::internal::xdivy_op<T>> {};
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 451f8c1a6c9a8d41374b81e4b0b460faa17c8e96..e2ab77632da4830f63d63c95c6ace5465fb46b9e 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -13,16 +13,6 @@ load(
     "tf_cc_test",
 )
 
-tf_kernel_library(
-    name = "stats_aggregator_ops",
-    srcs = ["stats_aggregator_ops.cc"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 # TODO(mrry): Remove this empty forwarding library.
 cc_library(
     name = "dataset",
@@ -37,7 +27,6 @@ cc_library(
     hdrs = ["dataset_utils.h"],
     deps = [
         ":captured_function",
-        ":dataset",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -45,12 +34,21 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "dataset_utils_test",
+    srcs = ["dataset_utils_test.cc"],
+    deps = [
+        ":dataset_utils",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "captured_function",
     srcs = ["captured_function.cc"],
     hdrs = ["captured_function.h"],
     deps = [
-        ":dataset",
         ":single_threaded_executor",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -102,7 +100,6 @@ cc_library(
     srcs = ["window_dataset.cc"],
     hdrs = ["window_dataset.h"],
     deps = [
-        ":dataset",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -113,7 +110,6 @@ tf_kernel_library(
     name = "batch_dataset_op",
     srcs = ["batch_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -125,7 +121,6 @@ tf_kernel_library(
     name = "window_dataset_op",
     srcs = ["window_dataset_op.cc"],
     deps = [
-        ":dataset",
         ":window_dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -134,64 +129,10 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "slide_dataset_op",
-    srcs = ["slide_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "padded_batch_dataset_op",
     srcs = ["padded_batch_dataset_op.cc"],
     deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "dense_to_sparse_batch_dataset_op",
-    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "group_by_reducer_dataset_op",
-    srcs = ["group_by_reducer_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "group_by_window_dataset_op",
-    srcs = ["group_by_window_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        ":window_dataset",
-        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -204,7 +145,7 @@ tf_kernel_library(
     srcs = ["filter_dataset_op.cc"],
     deps = [
         ":captured_function",
-        ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -217,7 +158,6 @@ tf_kernel_library(
     name = "filter_by_component_dataset_op",
     srcs = ["filter_by_component_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -231,26 +171,11 @@ tf_kernel_library(
     srcs = ["map_dataset_op.cc"],
     deps = [
         ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "map_and_batch_dataset_op",
-    srcs = ["map_and_batch_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:inplace_ops",
     ],
 )
 
@@ -259,7 +184,6 @@ cc_library(
     srcs = ["parallel_map_iterator.cc"],
     hdrs = ["parallel_map_iterator.h"],
     deps = [
-        ":dataset",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -269,22 +193,12 @@ cc_library(
     ],
 )
 
-tf_kernel_library(
-    name = "parse_example_dataset_op",
-    srcs = ["parse_example_dataset_op.cc"],
-    deps = [
-        ":parallel_map_iterator",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-    ],
-)
-
 tf_kernel_library(
     name = "parallel_map_dataset_op",
     srcs = ["parallel_map_dataset_op.cc"],
     deps = [
         ":captured_function",
-        ":dataset",
+        ":dataset_utils",
         ":parallel_map_iterator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
@@ -309,26 +223,11 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "scan_dataset_op",
-    srcs = ["scan_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "flat_map_dataset_op",
     srcs = ["flat_map_dataset_op.cc"],
     deps = [
         ":captured_function",
-        ":dataset",
         ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
@@ -343,7 +242,6 @@ tf_kernel_library(
     srcs = ["interleave_dataset_op.cc"],
     deps = [
         ":captured_function",
-        ":dataset",
         ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
@@ -358,14 +256,12 @@ tf_kernel_library(
     srcs = ["parallel_interleave_dataset_op.cc"],
     deps = [
         ":captured_function",
-        ":dataset",
         ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -393,7 +289,6 @@ tf_kernel_library(
     srcs = ["prefetch_dataset_op.cc"],
     hdrs = ["prefetch_dataset_op.h"],
     deps = [
-        ":dataset",
         ":prefetch_autotuner",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
@@ -408,7 +303,6 @@ tf_kernel_library(
     name = "repeat_dataset_op",
     srcs = ["repeat_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -420,7 +314,6 @@ tf_kernel_library(
     name = "take_dataset_op",
     srcs = ["take_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -432,43 +325,6 @@ tf_kernel_library(
     name = "skip_dataset_op",
     srcs = ["skip_dataset_op.cc"],
     deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "stats_dataset_ops",
-    srcs = ["stats_dataset_ops.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_kernel_library(
-    name = "stats_aggregator_dataset_op",
-    srcs = ["stats_aggregator_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:core_cpu_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "random_dataset_op",
-    srcs = ["random_dataset_op.cc"],
-    deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -480,7 +336,6 @@ tf_kernel_library(
     name = "range_dataset_op",
     srcs = ["range_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -492,7 +347,6 @@ tf_kernel_library(
     name = "shuffle_dataset_op",
     srcs = ["shuffle_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -504,7 +358,6 @@ tf_kernel_library(
     name = "sparse_tensor_slice_dataset_op",
     srcs = ["sparse_tensor_slice_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -516,53 +369,26 @@ tf_kernel_library(
     name = "tensor_dataset_op",
     srcs = ["tensor_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
     ],
 )
 
-tf_kernel_library(
-    name = "tensor_queue_dataset_op",
-    srcs = ["tensor_queue_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "tensor_slice_dataset_op",
     srcs = ["tensor_slice_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
     ],
 )
 
-tf_kernel_library(
-    name = "unbatch_dataset_op",
-    srcs = ["unbatch_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "zip_dataset_op",
     srcs = ["zip_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -574,7 +400,6 @@ tf_kernel_library(
     name = "concatenate_dataset_op",
     srcs = ["concatenate_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -586,7 +411,6 @@ tf_kernel_library(
     name = "reader_dataset_ops",
     srcs = ["reader_dataset_ops.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -594,27 +418,11 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "sql_dataset_ops",
-    srcs = [
-        "sql_dataset_ops.cc",
-    ],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels/data/sql",
-    ],
-)
-
 tf_kernel_library(
     name = "iterator_ops",
     srcs = ["iterator_ops.cc"],
     hdrs = ["iterator_ops.h"],
     deps = [
-        ":dataset",
         ":dataset_utils",
         ":optional_ops",
         "//tensorflow/core:core_cpu_internal",
@@ -625,6 +433,7 @@ tf_kernel_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:session_options",
         "//tensorflow/core/kernels:ops_util",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -632,7 +441,6 @@ tf_kernel_library(
     name = "multi_device_iterator_ops",
     srcs = ["multi_device_iterator_ops.cc"],
     deps = [
-        ":dataset",
         ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -646,6 +454,10 @@ tf_kernel_library(
     name = "optional_ops",
     srcs = ["optional_ops.cc"],
     hdrs = ["optional_ops.h"],
+    gpu_srcs = [
+        "optional_ops.cu.cc",
+        "optional_ops.h",
+    ],
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
@@ -653,6 +465,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//third_party/eigen3",
     ],
 )
 
@@ -660,7 +473,6 @@ tf_kernel_library(
     name = "cache_dataset_ops",
     srcs = ["cache_dataset_ops.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -673,11 +485,9 @@ tf_kernel_library(
     name = "optimize_dataset_op",
     srcs = ["optimize_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_view",
@@ -686,6 +496,8 @@ tf_kernel_library(
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core/grappler/optimizers/data",
+        "//tensorflow/core/grappler/optimizers/data:function_utils",
+        "//tensorflow/core/grappler/optimizers/data:graph_utils",
     ],
 )
 
@@ -693,7 +505,6 @@ tf_kernel_library(
     name = "model_dataset_op",
     srcs = ["model_dataset_op.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -706,7 +517,6 @@ tf_kernel_library(
     name = "dataset_ops",
     srcs = ["dataset_ops.cc"],
     deps = [
-        ":dataset",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -722,18 +532,13 @@ tf_kernel_library(
         ":batch_dataset_op",
         ":cache_dataset_ops",
         ":concatenate_dataset_op",
-        ":dataset",
         ":dataset_ops",
-        ":dense_to_sparse_batch_dataset_op",
         ":filter_by_component_dataset_op",
         ":filter_dataset_op",
         ":flat_map_dataset_op",
         ":generator_dataset_op",
-        ":group_by_reducer_dataset_op",
-        ":group_by_window_dataset_op",
         ":interleave_dataset_op",
         ":iterator_ops",
-        ":map_and_batch_dataset_op",
         ":map_dataset_op",
         ":map_defun_op",
         ":model_dataset_op",
@@ -743,46 +548,22 @@ tf_kernel_library(
         ":padded_batch_dataset_op",
         ":parallel_interleave_dataset_op",
         ":parallel_map_dataset_op",
-        ":parse_example_dataset_op",
         ":prefetch_dataset_op",
-        ":random_dataset_op",
         ":range_dataset_op",
         ":reader_dataset_ops",
         ":repeat_dataset_op",
-        ":scan_dataset_op",
         ":shuffle_dataset_op",
         ":skip_dataset_op",
-        ":slide_dataset_op",
         ":sparse_tensor_slice_dataset_op",
-        ":sql_dataset_ops",
-        ":stats_aggregator_dataset_op",
-        ":stats_aggregator_ops",
-        ":stats_dataset_ops",
         ":take_dataset_op",
         ":tensor_dataset_op",
-        ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
-        ":unbatch_dataset_op",
         ":window_dataset_op",
-        ":writer_ops",
         ":zip_dataset_op",
         "//tensorflow/core/kernels/data/experimental:dataset_kernels",
     ],
 )
 
-tf_kernel_library(
-    name = "writer_ops",
-    srcs = ["writer_ops.cc"],
-    deps = [
-        ":dataset",
-        ":dataset_utils",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:ops_util",
-    ],
-)
-
 tf_kernel_library(
     name = "map_defun_op",
     srcs = ["map_defun_op.cc"],
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index d1db1d7becaa1976b4c6de32efbfefb02635a9b9..1f8d2bdbae897e471113375150935b69e47f6d84 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class BatchDatasetOp : public UnaryDatasetOpKernel {
@@ -95,6 +95,15 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
       return strings::StrCat("BatchDatasetOp(", batch_size_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -117,7 +126,6 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        AddConstantParameter(ctx, "batch_size", dataset()->batch_size_);
         return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
       }
 
@@ -173,8 +181,14 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
           const Tensor& first_element = batch_elements[0][component_index];
           TensorShape batch_component_shape({num_batch_elements});
           batch_component_shape.AppendShape(first_element.shape());
-          Tensor batch_component(ctx->allocator({}), first_element.dtype(),
-                                 batch_component_shape);
+          out_tensors->emplace_back(ctx->allocator({}), first_element.dtype(),
+                                    batch_component_shape);
+          if (!out_tensors->back().IsInitialized()) {
+            return errors::ResourceExhausted(
+                "Failed to allocate memory for the batch of component ",
+                component_index);
+          }
+          Tensor& batch_component = out_tensors->back();
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
           for (size_t i = 0; i < num_batch_elements; ++i) {
@@ -192,13 +206,18 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
                 std::move(batch_elements[i][component_index]), &batch_component,
                 i));
           }
-          out_tensors->emplace_back(std::move(batch_component));
         }
         *end_of_sequence = false;
         return Status::OK();
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         dataset()->batch_size_);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (!input_impl_) {
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index 34c6c8653854b026e74296c697096cf7ebda14c0..f00b38e732a7835896a275d14507e75eade05fa1 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
@@ -23,7 +24,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level description of
+// See documentation in ../../ops/dataset_ops.cc for a high-level description of
 // the following op.
 
 class CacheDatasetOp : public UnaryDatasetOpKernel {
@@ -84,6 +85,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       return "CacheDatasetOp::FileDataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -133,6 +136,12 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("mode"), mode_));
@@ -243,6 +252,12 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
         }
 
        protected:
+        std::shared_ptr<model::Node> CreateNode(
+            IteratorContext* ctx, model::Node::Args args) const override {
+          return model::MakeKnownRatioNode(std::move(args),
+                                           /*ratio=*/1);
+        }
+
         Status SaveInternal(IteratorStateWriter* writer) override {
           mutex_lock l(mu_);
           if (iteration_completed_) {
@@ -468,6 +483,12 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
         }
 
        protected:
+        std::shared_ptr<model::Node> CreateNode(
+            IteratorContext* ctx, model::Node::Args args) const override {
+          return model::MakeKnownRatioNode(std::move(args),
+                                           /*ratio=*/1);
+        }
+
         Status SaveInternal(IteratorStateWriter* writer) override {
           mutex_lock l(mu_);
           TF_RETURN_IF_ERROR(
@@ -516,10 +537,12 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
         // `FileReaderIterator` and seek to the `cur_index`.
         switch (mode_) {
           case Mode::read:
-            iterator_.reset(new FileReaderIterator({dataset(), prefix()}));
+            iterator_.reset(new FileReaderIterator(
+                {dataset(), strings::StrCat(prefix(), "Impl")}));
             break;
           case Mode::write:
-            iterator_.reset(new FileWriterIterator({dataset(), prefix()}));
+            iterator_.reset(new FileWriterIterator(
+                {dataset(), strings::StrCat(prefix(), "Impl")}));
         }
       }
 
@@ -542,9 +565,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
   class MemoryDataset : public DatasetBase {
    public:
     explicit MemoryDataset(OpKernelContext* ctx, const DatasetBase* input)
-        : DatasetBase(DatasetContext(ctx)),
-          input_(input),
-          cache_(new MemoryCache()) {
+        : DatasetBase(DatasetContext(ctx)), input_(input) {
       input->Ref();
     }
 
@@ -552,8 +573,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new MemoryIterator(
-          {this, strings::StrCat(prefix, "::MemoryCache")}, cache_));
+      return std::unique_ptr<IteratorBase>(
+          new MemoryIterator({this, strings::StrCat(prefix, "::MemoryCache")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -568,6 +589,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       return "CacheDatasetOp::MemoryDataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -587,10 +610,12 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
     // The expected use is that a single `MemoryWriterIterator` populates the
     // cache with dataset elements. Once all elements are cached, the cache can
     // be used by one or more `MemoryReaderIterator`s.
-    class MemoryCache {
+    class MemoryCache : public ResourceBase {
      public:
       MemoryCache() = default;
 
+      string DebugString() override { return "CacheDataset::MemoryCache"; }
+
       // Marks the cache as completed.
       void Complete() {
         mutex_lock l(mu_);
@@ -657,15 +682,25 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     class MemoryIterator : public DatasetIterator<MemoryDataset> {
      public:
-      explicit MemoryIterator(const Params& params,
-                              const std::shared_ptr<MemoryCache>& cache)
-          : DatasetIterator<MemoryDataset>(params), cache_(cache) {
-        mode_ = cache->MaybeClaim() ? Mode::write : Mode::read;
-        InitializeIterator();
-      }
+      explicit MemoryIterator(const Params& params)
+          : DatasetIterator<MemoryDataset>(params) {}
+
+      ~MemoryIterator() override { cache_->Unref(); }
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(mu_);
+        // Use the resource manager in the iterator context to get / create
+        // a cache.
+        ResourceMgr* mgr = ctx->resource_mgr();
+        const string name =
+            strings::StrCat(prefix(), "::", dataset()->name(), "::MemoryCache");
+        TF_RETURN_IF_ERROR(mgr->LookupOrCreate<MemoryCache>(
+            "tf_data", name, &cache_, [](MemoryCache** cache) {
+              *cache = new MemoryCache();
+              return Status::OK();
+            }));
+        mode_ = cache_->MaybeClaim() ? Mode::write : Mode::read;
+        InitializeIterator();
         if (mode_ == Mode::read && !cache_->IsCompleted()) {
           return errors::Internal(
               "Cache should only be read after it has been completed.");
@@ -681,6 +716,12 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("mode"), mode_));
@@ -758,8 +799,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
      private:
       class MemoryWriterIterator : public DatasetIterator<MemoryDataset> {
        public:
-        explicit MemoryWriterIterator(const Params& params,
-                                      const std::shared_ptr<MemoryCache>& cache)
+        explicit MemoryWriterIterator(const Params& params, MemoryCache* cache)
             : DatasetIterator<MemoryDataset>(params), cache_(cache) {
           CHECK(cache_);
         }
@@ -792,11 +832,18 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
             cache_->Complete();
             return Status::OK();
           }
+          RecordBufferEnqueue(ctx, *out_tensors);
           cache_->emplace_back(*out_tensors);
           return Status::OK();
         }
 
        protected:
+        std::shared_ptr<model::Node> CreateNode(
+            IteratorContext* ctx, model::Node::Args args) const override {
+          return model::MakeKnownRatioNode(std::move(args),
+                                           /*ratio=*/1);
+        }
+
         Status SaveInternal(IteratorStateWriter* writer) override {
           mutex_lock l(mu_);
           return SaveInput(writer, input_impl_);
@@ -811,31 +858,25 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
        private:
         mutex mu_;
         std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-        std::shared_ptr<MemoryCache> cache_;
+        MemoryCache* const cache_ GUARDED_BY(mu_);  // not owned.
       };  // MemoryWriterIterator
 
       class MemoryReaderIterator : public DatasetIterator<MemoryDataset> {
        public:
-        explicit MemoryReaderIterator(const Params& params,
-                                      const std::shared_ptr<MemoryCache>& cache)
+        explicit MemoryReaderIterator(const Params& params, MemoryCache* cache)
             : DatasetIterator<MemoryDataset>(params), cache_(cache), index_(0) {
           CHECK(cache);
         }
 
-       protected:
-        Status SaveInternal(IteratorStateWriter* writer) override {
-          mutex_lock l(mu_);
-          TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("index"), index_));
-          return Status::OK();
-        }
-
-        Status RestoreInternal(IteratorContext* ctx,
-                               IteratorStateReader* reader) override {
-          mutex_lock l(mu_);
-          {
-            int64 temp;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("index"), &temp));
-            index_ = static_cast<size_t>(temp);
+        Status Initialize(IteratorContext* ctx) override {
+          // The memory allocated for the cache is owned by the parent
+          // dataset but performance modeling uses the iterator abstraction and
+          // thus we record the memory allocated for the cache here. The caveat
+          // is that this is incorrect if there are concurrent instances of this
+          // iterator.
+          tf_shared_lock l(mu_);
+          for (size_t i = 0; i < cache_->size(); ++i) {
+            RecordBufferEnqueue(ctx, cache_->at(i));
           }
           return Status::OK();
         }
@@ -857,33 +898,56 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
           }
         }
 
+       protected:
+        std::shared_ptr<model::Node> CreateNode(
+            IteratorContext* ctx, model::Node::Args args) const override {
+          return model::MakeKnownRatioNode(std::move(args),
+                                           /*ratio=*/1);
+        }
+
+        Status SaveInternal(IteratorStateWriter* writer) override {
+          mutex_lock l(mu_);
+          TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("index"), index_));
+          return Status::OK();
+        }
+
+        Status RestoreInternal(IteratorContext* ctx,
+                               IteratorStateReader* reader) override {
+          mutex_lock l(mu_);
+          {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("index"), &temp));
+            index_ = static_cast<size_t>(temp);
+          }
+          return Status::OK();
+        }
+
        private:
         mutex mu_;
-        const std::shared_ptr<MemoryCache> cache_;
+        MemoryCache* const cache_ GUARDED_BY(mu_);  // not owned.
         size_t index_ GUARDED_BY(mu_);
       };  // MemoryReaderIterator
 
       void InitializeIterator() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         switch (mode_) {
           case Mode::read:
-            iterator_.reset(
-                new MemoryReaderIterator({dataset(), prefix()}, cache_));
+            iterator_.reset(new MemoryReaderIterator(
+                {dataset(), strings::StrCat(prefix(), "Impl")}, cache_));
             break;
           case Mode::write:
-            iterator_.reset(
-                new MemoryWriterIterator({dataset(), prefix()}, cache_));
+            iterator_.reset(new MemoryWriterIterator(
+                {dataset(), strings::StrCat(prefix(), "Impl")}, cache_));
         }
       }
 
       mutex mu_;
-      std::shared_ptr<MemoryCache> cache_;
+      MemoryCache* cache_ GUARDED_BY(mu_);  // not owned.
       enum Mode { read, write };
       Mode mode_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> iterator_ GUARDED_BY(mu_);
     };  // MemoryIterator
 
     const DatasetBase* const input_;
-    const std::shared_ptr<MemoryCache> cache_;
   };  // MemoryDataset
 };    // CacheDatasetOp
 
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index 0bb929b3ce6c685c8a238223d47486d2041a1160..973b6b06048fb715d9fd32791223cda21751b1c8 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -19,8 +19,11 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/util/ptr_util.h"
 
@@ -75,6 +78,8 @@ class SimpleStepStatsCollector : public StepStatsCollectorInterface {
       end_time_ns_ = Env::Default()->NowNanos();
     }
 
+    bool TrackAllocations() const override { return false; }
+
     void SetMemory(OpKernelContext* ctx) override {}
 
     void SetOutput(int slot, const Tensor* tensor) override {}
@@ -114,10 +119,34 @@ Status CapturedFunction::Create(
   return Status::OK();
 }
 
-CapturedFunction::~CapturedFunction() {
-  if (lib_ != nullptr && f_handle_ != kInvalidHandle) {
-    lib_->ReleaseHandle(f_handle_).IgnoreError();
+Status CapturedFunction::Instantiate(
+    IteratorContext* ctx, std::unique_ptr<InstantiatedCapturedFunction>*
+                              instantiated_captured_function) {
+  // The context's runtime will be used for all subsequent calls.
+  FunctionLibraryRuntime* lib = ctx->lib();
+  FunctionLibraryRuntime::InstantiateOptions inst_opts;
+  inst_opts.overlay_lib = ctx->function_library().get();
+  inst_opts.create_kernels_eagerly = true;
+  if (!use_inter_op_parallelism_) {
+    inst_opts.executor_type = "SINGLE_THREADED_EXECUTOR";
+  }
+
+  FunctionLibraryRuntime::Handle f_handle;
+  TF_RETURN_IF_ERROR(ctx->function_handle_cache()->Instantiate(
+      func_.name(), AttrSlice(&func_.attr()), inst_opts, &f_handle));
+  const FunctionBody* fbody = lib->GetFunctionBody(f_handle);
+  if (fbody == nullptr) {
+    return errors::Internal("Failed to instantiate function body.");
+  }
+
+  DataTypeVector ret_types;
+  for (const auto& ret_type : fbody->ret_types) {
+    ret_types.push_back(ret_type);
   }
+
+  instantiated_captured_function->reset(new InstantiatedCapturedFunction(
+      lib, f_handle, std::move(ret_types), *ctx->runner(), this));
+  return Status::OK();
 }
 
 namespace {
@@ -240,35 +269,35 @@ class BorrowedArgsCallFrame : public CallFrameBase {
 
 }  // namespace
 
-Status CapturedFunction::GetHandle(IteratorContext* ctx,
-                                   FunctionLibraryRuntime::Handle* out_handle) {
-  tf_shared_lock l(mu_);
-  if (lib_ == nullptr) {
-    return errors::Internal("Captured function \"", func_.name(),
-                            "\" was called before it was instantiated.");
-  }
-  if (ctx->lib() != lib_) {
-    return errors::Internal("Captured function \"", func_.name(),
-                            "\" was called with a different "
-                            "FunctionLibraryRuntime*, which is not permitted.");
-  }
-  *out_handle = f_handle_;
-  return Status::OK();
-}
-
-Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
-                             std::vector<Tensor>* rets) {
-  FunctionLibraryRuntime::Handle handle;
-  TF_RETURN_IF_ERROR(GetHandle(ctx, &handle));
-
+InstantiatedCapturedFunction::InstantiatedCapturedFunction(
+    FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle,
+    DataTypeVector ret_types, std::function<void(std::function<void()>)> runner,
+    CapturedFunction* captured_func)
+    : lib_(lib),
+      f_handle_(f_handle),
+      ret_types_(std::move(ret_types)),
+      captured_runner_(std::move(runner)),
+      captured_func_(captured_func) {}
+
+// NOTE: We don't release f_handle_ here and instead delegate the function
+// handle releasing to the FunctionHandleCache. This is because in some cases
+// (RepeatDatasetOp in particular), we want to keep the function state (e.g.
+// random number generator) even after the Iterator is reset after going through
+// one epoch.
+InstantiatedCapturedFunction::~InstantiatedCapturedFunction() {}
+
+Status InstantiatedCapturedFunction::Run(IteratorContext* ctx,
+                                         std::vector<Tensor>&& args,
+                                         std::vector<Tensor>* rets) const {
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = CapturedFunction::generate_step_id();
-  ScopedStepContainer step_container(f_opts.step_id, [ctx](const string& name) {
-    ctx->lib()->device()->resource_manager()->Cleanup(name).IgnoreError();
-  });
+  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
+  ScopedStepContainer step_container(
+      f_opts.step_id, [this](const string& name) {
+        lib_->device()->resource_manager()->Cleanup(name).IgnoreError();
+      });
   f_opts.step_container = &step_container;
   f_opts.runner = ctx->runner();
-  if (ctx->lib()->device()->device_type() != DEVICE_CPU) {
+  if (lib_->device()->device_type() != DEVICE_CPU) {
     f_opts.create_rendezvous = true;
   }
   // TODO(mrry): Add cancellation manager support to IteratorContext
@@ -280,10 +309,11 @@ Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
   CancellationManager c_mgr;
   f_opts.cancellation_manager = &c_mgr;
 
-  OwnedArgsCallFrame frame(std::move(args), &captured_inputs_, ret_types_);
+  OwnedArgsCallFrame frame(std::move(args), &captured_func_->captured_inputs(),
+                           ret_types_);
   Notification n;
   Status s;
-  ctx->lib()->Run(f_opts, handle, &frame, [&n, &s](Status func_status) {
+  lib_->Run(f_opts, f_handle_, &frame, [&n, &s](Status func_status) {
     s.Update(func_status);
     n.Notify();
   });
@@ -292,20 +322,18 @@ Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
   return frame.ConsumeRetvals(rets);
 }
 
-Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
-                                             const std::vector<Tensor>& args,
-                                             std::vector<Tensor>* rets) {
-  FunctionLibraryRuntime::Handle handle;
-  TF_RETURN_IF_ERROR(GetHandle(ctx, &handle));
-
+Status InstantiatedCapturedFunction::RunWithBorrowedArgs(
+    IteratorContext* ctx, const std::vector<Tensor>& args,
+    std::vector<Tensor>* rets) const {
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = CapturedFunction::generate_step_id();
-  ScopedStepContainer step_container(f_opts.step_id, [ctx](const string& name) {
-    ctx->lib()->device()->resource_manager()->Cleanup(name).IgnoreError();
-  });
+  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
+  ScopedStepContainer step_container(
+      f_opts.step_id, [this](const string& name) {
+        lib_->device()->resource_manager()->Cleanup(name).IgnoreError();
+      });
   f_opts.step_container = &step_container;
   f_opts.runner = ctx->runner();
-  if (ctx->lib()->device()->device_type() != DEVICE_CPU) {
+  if (lib_->device()->device_type() != DEVICE_CPU) {
     f_opts.create_rendezvous = true;
   }
   // TODO(mrry): Add cancellation manager support to IteratorContext
@@ -317,11 +345,12 @@ Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
   CancellationManager c_mgr;
   f_opts.cancellation_manager = &c_mgr;
 
-  BorrowedArgsCallFrame frame(args, &captured_inputs_, ret_types_);
+  BorrowedArgsCallFrame frame(args, &captured_func_->captured_inputs(),
+                              ret_types_);
   Notification n;
   Status s;
 
-  ctx->lib()->Run(f_opts, handle, &frame, [&n, &s](Status func_status) {
+  lib_->Run(f_opts, f_handle_, &frame, [&n, &s](Status func_status) {
     s.Update(func_status);
     n.Notify();
   });
@@ -330,65 +359,17 @@ Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
   return frame.ConsumeRetvals(rets);
 }
 
-Status CapturedFunction::Instantiate(IteratorContext* ctx) {
-  mutex_lock l(mu_);
-  if (lib_ == nullptr) {
-    // The context's runtime will be used for all subsequent calls.
-    lib_ = ctx->lib();
-    DCHECK(f_handle_ == kInvalidHandle);
-    FunctionLibraryRuntime::InstantiateOptions inst_opts;
-    inst_opts.overlay_lib = ctx->function_library().get();
-    inst_opts.state_handle = std::to_string(random::New64());
-    inst_opts.create_kernels_eagerly = true;
-    if (!use_inter_op_parallelism_) {
-      inst_opts.executor_type = "SINGLE_THREADED_EXECUTOR";
-    }
-    Status s = (lib_->Instantiate(func_.name(), AttrSlice(&func_.attr()),
-                                  inst_opts, &f_handle_));
-    TF_RETURN_IF_ERROR(s);
-    const FunctionBody* fbody = lib_->GetFunctionBody(f_handle_);
-    if (fbody == nullptr) {
-      return errors::Internal("Failed to instantiate function body.");
-    }
-    ret_types_ = fbody->ret_types;
-  } else {
-    if (ctx->lib() != lib_) {
-      return errors::Internal(
-          "Captured function was called with a different "
-          "FunctionLibraryRuntime*, which is not permitted.");
-    }
-  }
-  if (captured_runner_ == nullptr) {
-    captured_runner_ = *ctx->runner();
-  }
-  return Status::OK();
-}
-
-Status CapturedFunction::RunInstantiated(const std::vector<Tensor>& args,
-                                         std::vector<Tensor>* rets) {
-  FunctionLibraryRuntime* lib;
-  FunctionLibraryRuntime::Handle handle;
-  std::function<void(std::function<void()>)>* runner;
-  {
-    tf_shared_lock l(mu_);
-    if (lib_ == nullptr) {
-      return errors::FailedPrecondition(
-          "`CapturedFunction::Instantiate()` must be called before a call to "
-          "`CapturedFunction::RunInstantiated()`.");
-    }
-    lib = lib_;
-    handle = f_handle_;
-    runner = &captured_runner_;
-  }
-
+Status InstantiatedCapturedFunction::RunInstantiated(
+    const std::vector<Tensor>& args, std::vector<Tensor>* rets) {
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = CapturedFunction::generate_step_id();
-  ScopedStepContainer step_container(f_opts.step_id, [lib](const string& name) {
-    lib->device()->resource_manager()->Cleanup(name).IgnoreError();
-  });
+  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
+  ScopedStepContainer step_container(
+      f_opts.step_id, [this](const string& name) {
+        lib_->device()->resource_manager()->Cleanup(name).IgnoreError();
+      });
   f_opts.step_container = &step_container;
-  f_opts.runner = runner;
-  if (lib->device()->device_type() != DEVICE_CPU) {
+  f_opts.runner = &captured_runner_;
+  if (lib_->device()->device_type() != DEVICE_CPU) {
     f_opts.create_rendezvous = true;
   }
   // TODO(mrry): Add cancellation manager support to IteratorContext
@@ -400,11 +381,12 @@ Status CapturedFunction::RunInstantiated(const std::vector<Tensor>& args,
   CancellationManager c_mgr;
   f_opts.cancellation_manager = &c_mgr;
 
-  BorrowedArgsCallFrame frame(args, &captured_inputs_, ret_types_);
+  BorrowedArgsCallFrame frame(args, &captured_func_->captured_inputs(),
+                              ret_types_);
   Notification n;
   Status s;
 
-  lib->Run(f_opts, handle, &frame, [&n, &s](Status func_status) {
+  lib_->Run(f_opts, f_handle_, &frame, [&n, &s](Status func_status) {
     s.Update(func_status);
     n.Notify();
   });
@@ -413,33 +395,25 @@ Status CapturedFunction::RunInstantiated(const std::vector<Tensor>& args,
   return frame.ConsumeRetvals(rets);
 }
 
-void CapturedFunction::RunAsync(IteratorContext* ctx,
-                                std::vector<Tensor>&& args,
-                                std::vector<Tensor>* rets,
-                                FunctionLibraryRuntime::DoneCallback done,
-                                const string& prefix) {
+void InstantiatedCapturedFunction::RunAsync(
+    IteratorContext* ctx, std::vector<Tensor>&& args, std::vector<Tensor>* rets,
+    FunctionLibraryRuntime::DoneCallback done, const string& prefix) const {
   // NOTE(mrry): This method does not transfer ownership of `ctx`, and it may
   // be deleted before `done` is called. Take care not to capture `ctx` in any
   // code that may execute asynchronously in this function.
-  FunctionLibraryRuntime::Handle handle;
-  Status s = GetHandle(ctx, &handle);
-  if (!s.ok()) {
-    done(s);
-    return;
-  }
-  OwnedArgsCallFrame* frame =
-      new OwnedArgsCallFrame(std::move(args), &captured_inputs_, ret_types_);
+  OwnedArgsCallFrame* frame = new OwnedArgsCallFrame(
+      std::move(args), &captured_func_->captured_inputs(), ret_types_);
 
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = CapturedFunction::generate_step_id();
-  ResourceMgr* resource_mgr = ctx->lib()->device()->resource_manager();
+  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
+  ResourceMgr* resource_mgr = lib_->device()->resource_manager();
   ScopedStepContainer* step_container = new ScopedStepContainer(
       f_opts.step_id, [resource_mgr](const string& name) {
         resource_mgr->Cleanup(name).IgnoreError();
       });
   f_opts.step_container = step_container;
   f_opts.runner = ctx->runner();
-  if (ctx->lib()->device()->device_type() != DEVICE_CPU) {
+  if (lib_->device()->device_type() != DEVICE_CPU) {
     f_opts.create_rendezvous = true;
   }
   // TODO(mrry): Add cancellation manager support to IteratorContext
@@ -451,15 +425,17 @@ void CapturedFunction::RunAsync(IteratorContext* ctx,
   CancellationManager* c_mgr = new CancellationManager;
   f_opts.cancellation_manager = c_mgr;
   std::shared_ptr<SimpleStepStatsCollector> stats_collector;
-  if (ctx->model()) {
+  if (ctx->model() || ctx->stats_aggregator()) {
     stats_collector = MakeUnique<SimpleStepStatsCollector>();
   }
   f_opts.stats_collector = stats_collector.get();
 
   auto callback = std::bind(
-      [rets, step_container, c_mgr, frame](
+      [this, rets, step_container, c_mgr, frame](
           const FunctionLibraryRuntime::DoneCallback& done,
-          const std::shared_ptr<model::Model>& model, const string& prefix,
+          const std::shared_ptr<model::Model>& model,
+          const std::shared_ptr<StatsAggregator>& stats_aggregator,
+          const string& prefix,
           const std::shared_ptr<SimpleStepStatsCollector>& stats_collector,
           // Begin unbound arguments.
           Status s) {
@@ -469,6 +445,14 @@ void CapturedFunction::RunAsync(IteratorContext* ctx,
           s = frame->ConsumeRetvals(rets);
         }
         delete frame;
+
+        if (stats_aggregator) {
+          stats_aggregator->AddToHistogram(
+              strings::StrCat(
+                  str_util::Split(prefix, "::", str_util::SkipEmpty()).back(),
+                  "::", captured_func_->func().name(), "::execution_time"),
+              {static_cast<float>(stats_collector->processing_time())});
+        }
         if (model) {
           model->AddProcessingTime(prefix, stats_collector->processing_time());
           model->RecordStart(prefix, false /* stop_output */);
@@ -478,18 +462,16 @@ void CapturedFunction::RunAsync(IteratorContext* ctx,
           model->RecordStop(prefix, false /* start_output */);
         }
       },
-      std::move(done), ctx->model(), prefix, std::move(stats_collector),
-      std::placeholders::_1);
+      std::move(done), ctx->model(), ctx->stats_aggregator(), prefix,
+      std::move(stats_collector), std::placeholders::_1);
 
-  ctx->lib()->Run(f_opts, handle, frame, std::move(callback));
+  lib_->Run(f_opts, f_handle_, frame, std::move(callback));
 }
 
 CapturedFunction::CapturedFunction(const NameAttrList& func,
                                    std::vector<Tensor> captured_inputs,
                                    bool use_inter_op_parallelism)
     : func_(func),
-      lib_(nullptr),
-      f_handle_(kInvalidHandle),
       captured_inputs_(std::move(captured_inputs)),
       use_inter_op_parallelism_(use_inter_op_parallelism) {}
 
diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h
index a10376bf9700e41b50503d661420a60901f48ff7..cffaf405ecbad4302be4e1b6022fda6db3dad359 100644
--- a/tensorflow/core/kernels/data/captured_function.h
+++ b/tensorflow/core/kernels/data/captured_function.h
@@ -18,9 +18,9 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -34,59 +34,41 @@ class ResourceMgr;
 
 namespace data {
 
-// A `CapturedFunction` encapsulates a TensorFlow function and all of
-// the runtime support required to execute it.
+class CapturedFunction;
+
+// An InstantiatedCapturedFunction encapsulates all the runtime support needed
+// to execute a tensorflow function.
 //
-// The `Dataset`-related classes use `CapturedFunction` to execute
-// TensorFlow functions outside a the normal `OpKernel::Compute()`
-// context.
-class CapturedFunction {
+// While CapturedFunction (below) encapsulates the more permanent attributes
+// of the function i.e. name, captured arguments etc.,
+// InstantiatedCapturedFunction encapsulates the more runtime aspects i.e.
+// FunctionLibraryRuntime, function handle etc.
+//
+// The `Iterator-`related classes use `InstantiatedCapturedFunction` to execute
+// functions outside a the normal `OpKernel::Compute()` context.
+class InstantiatedCapturedFunction {
  public:
-  // Creates a new instance using a list of named attributes, fetching captured
-  // inputs from a context argument.
-  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
-                       const string& argument,
-                       std::unique_ptr<CapturedFunction>* out_function);
-
-  // Creates a new instance using a list of named attributes, fetching captured
-  // inputs from a context argument.
-  //
-  // If `use_inter_op_parallelism` is false, the runtime may use an executor
-  // that is optimized for small functions.
-  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
-                       const string& argument, bool use_inter_op_parallelism,
-                       std::unique_ptr<CapturedFunction>* out_function);
-
-  ~CapturedFunction();
+  ~InstantiatedCapturedFunction();
 
-  // Runs the "Captured function" using the given FLR and caches the lib and
-  // handle generated during instantiation. If Run is called with a different
-  // lib afterwards, generates an error. This method takes ownership of the
-  // tensors in `args`, in order to be able to deallocate them as early as
+  // Runs the "Instantiated Captured function". This method takes ownership of
+  // the tensors in `args`, in order to be able to deallocate them as early as
   // possible. Use `RunWithBorrowedArgs()` if the caller needs to retain
   // ownership of the `args`.
   Status Run(IteratorContext* ctx, std::vector<Tensor>&& args,
-             std::vector<Tensor>* rets);
+             std::vector<Tensor>* rets) const;
 
   // Synchronously runs the captured function on the given `args`, and stores
   // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when
   // possible.
   Status RunWithBorrowedArgs(IteratorContext* ctx,
                              const std::vector<Tensor>& args,
-                             std::vector<Tensor>* rets);
-
-  // Explicitly instantiate this function for use in the given
-  // context. This method, and the context-less overload
-  // `RunInstantiated()` below can be useful for calling a captured
-  // function in cases where an `IteratorContext*` is not available
-  // (such as a destructor).
-  Status Instantiate(IteratorContext* ctx);
+                             std::vector<Tensor>* rets) const;
 
   // Synchronously runs the captured function on the given `args`, and stores
   // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when
-  // possible.
-  //
-  // REQUIRES: `this->Instantiate()` must have been called before this method.
+  // possible. This can be useful for calling a captured
+  // function in cases where an `IteratorContext*` is not available
+  // (such as a destructor).
   Status RunInstantiated(const std::vector<Tensor>& args,
                          std::vector<Tensor>* rets);
 
@@ -97,16 +79,9 @@ class CapturedFunction {
   void RunAsync(IteratorContext* ctx, std::vector<Tensor>&& args,
                 std::vector<Tensor>* rets,
                 FunctionLibraryRuntime::DoneCallback done,
-                const string& prefix);
-
-  // Returns the named list of function arguments.
-  const NameAttrList& func() { return func_; }
+                const string& prefix) const;
 
-  // Returns that additional captured inputs that will be passed to the function
-  // when `Run*()` is called.
-  const std::vector<Tensor>& captured_inputs() { return captured_inputs_; }
-
-  // Returns a step ID for use when running a `CapturedFunction`.
+  // Returns a step ID for use when running an `InstantiatedCapturedFunction`.
   static int64 generate_step_id() {
     // Choose a step ID that is guaranteed not to clash with any
     // Session-generated step ID. DirectSession only generates
@@ -116,26 +91,66 @@ class CapturedFunction {
     return -std::abs(static_cast<int64>(random::New64()));
   }
 
+ private:
+  InstantiatedCapturedFunction(
+      FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle,
+      DataTypeVector ret_types,
+      std::function<void(std::function<void()>)> runner,
+      CapturedFunction* captured_func);
+
+  friend class CapturedFunction;
+
+  FunctionLibraryRuntime* const lib_;
+  const FunctionLibraryRuntime::Handle f_handle_;
+  const DataTypeVector ret_types_;
+  std::function<void(std::function<void()>)> captured_runner_;
+  CapturedFunction* const captured_func_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(InstantiatedCapturedFunction);
+};
+
+// A `CapturedFunction` encapsulates a TensorFlow function, plus any "captured"
+// arguments that it closed over in the user program.
+class CapturedFunction {
+ public:
+  // Creates a new instance using a list of named attributes, fetching captured
+  // inputs from a context argument.
+  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
+                       const string& argument,
+                       std::unique_ptr<CapturedFunction>* out_function);
+
+  // Creates a new instance using a list of named attributes, fetching captured
+  // inputs from a context argument.
+  //
+  // If `use_inter_op_parallelism` is false, the runtime may use an executor
+  // that is optimized for small functions.
+  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
+                       const string& argument, bool use_inter_op_parallelism,
+                       std::unique_ptr<CapturedFunction>* out_function);
+
+  // Instantiates this function for use in the given context, providing an
+  // InstantiatedCapturedFunction that can be used to execute functions.
+  Status Instantiate(IteratorContext* ctx,
+                     std::unique_ptr<InstantiatedCapturedFunction>*
+                         instantiated_captured_function);
+
+  // Returns the named list of function arguments.
+  const NameAttrList& func() { return func_; }
+
+  // Returns that additional captured inputs that will be passed to the function
+  const std::vector<Tensor>& captured_inputs() { return captured_inputs_; }
+
  private:
   CapturedFunction(const NameAttrList& func,
                    std::vector<Tensor> captured_inputs,
                    bool use_inter_op_parallelism);
 
-  Status GetHandle(IteratorContext* ctx,
-                   FunctionLibraryRuntime::Handle* out_handle);
-
-  mutex mu_;
   const NameAttrList func_;
-  FunctionLibraryRuntime* lib_ GUARDED_BY(mu_);
-  FunctionLibraryRuntime::Handle f_handle_ GUARDED_BY(mu_);
   const std::vector<Tensor> captured_inputs_;
-  DataTypeSlice ret_types_;
-  std::function<void(std::function<void()>)> captured_runner_ = nullptr;
   const bool use_inter_op_parallelism_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
 };
-
 }  // namespace data
 
 // TODO(b/114112161): Remove these aliases when all users have moved over to the
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index 9607e9444c2aa9ec877e27ffbbcd229153961ae9..066b2c9aef4faaf23981b207e46c301e99360119 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
@@ -79,6 +79,18 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
       return "ConcatenateDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override {
+      int64 n1 = input_->Cardinality();
+      int64 n2 = to_concatenate_->Cardinality();
+      if (n1 == kInfiniteCardinality || n2 == kInfiniteCardinality) {
+        return kInfiniteCardinality;
+      }
+      if (n1 == kUnknownCardinality || n2 == kUnknownCardinality) {
+        return kUnknownCardinality;
+      }
+      return n1 + n2;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -129,6 +141,12 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
diff --git a/tensorflow/core/kernels/data/dataset_ops.cc b/tensorflow/core/kernels/data/dataset_ops.cc
index bd1ccd5b5d97f13d31f33d5114edb5298cfddee8..0abfdbb56b577764bbd48dbe0903148b2cf691d6 100644
--- a/tensorflow/core/kernels/data/dataset_ops.cc
+++ b/tensorflow/core/kernels/data/dataset_ops.cc
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 class DatasetToGraphOp : public OpKernel {
  public:
@@ -46,8 +46,25 @@ class DatasetToGraphOp : public OpKernel {
   }
 };
 
+class DatasetCardinalityOp : public OpKernel {
+ public:
+  explicit DatasetCardinalityOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* dataset;
+    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
+    Tensor* result;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &result));
+    result->scalar<int64>()() = dataset->Cardinality();
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("DatasetToGraph").Device(DEVICE_CPU),
                         DatasetToGraphOp);
 
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDatasetCardinality").Device(DEVICE_CPU),
+    DatasetCardinalityOp);
+
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index e10833f525e9d1f406d73ed4eccb9a256da94c6b..4d92d314d3d207d12310bb744b5601ad922bc570 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -15,18 +15,78 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 
 namespace tensorflow {
 namespace data {
 
+Status ComputeShortCircuitIndices(OpKernelContext* ctx,
+                                  const NameAttrList& func,
+                                  std::vector<int>* indices) {
+  FunctionLibraryRuntime::Handle fn_handle;
+  TF_RETURN_IF_ERROR(ctx->function_library()->Instantiate(
+      func.name(), AttrSlice(&func.attr()), &fn_handle));
+  auto cleanup = gtl::MakeCleanup([ctx, fn_handle]() {
+    Status s = ctx->function_library()->ReleaseHandle(fn_handle);
+    if (!s.ok()) {
+      LOG(WARNING) << "Failed to release handle: " << s.error_message();
+    }
+  });
+
+  // If the function contains any stateful operations, we conservatively execute
+  // the entire function.
+  if (ctx->function_library()->IsStateful(func.name())) {
+    indices->clear();
+    return Status::OK();
+  }
+
+  const FunctionBody* fn_body =
+      ctx->function_library()->GetFunctionBody(fn_handle);
+  indices->resize(fn_body->ret_nodes.size());
+
+  for (size_t i = 0; i < fn_body->ret_nodes.size(); ++i) {
+    Node* ret_node = fn_body->ret_nodes[i];
+    Node* ret_input_node;
+    TF_RETURN_IF_ERROR(ret_node->input_node(0, &ret_input_node));
+
+    while (ret_input_node->def().op() == "Identity") {
+      TF_RETURN_IF_ERROR(ret_input_node->input_node(0, &ret_input_node));
+    }
+
+    if (ret_input_node->def().op() == FunctionLibraryDefinition::kArgOp) {
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(ret_input_node->def(), "index", &((*indices)[i])));
+    } else {
+      indices->clear();
+      break;
+    }
+  }
+  return Status::OK();
+}
+
+std::vector<bool> ComputeMoveVector(const std::vector<int>& indices) {
+  std::map<int, int> last_use;
+  for (size_t i = 0; i < indices.size(); ++i) {
+    last_use[indices[i]] = i;
+  }
+  std::vector<bool> can_move;
+  can_move.resize(indices.size());
+  for (size_t i = 0; i < indices.size(); ++i) {
+    can_move[i] = last_use[indices[i]] == i;
+  }
+  return can_move;
+}
+
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
-    int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
-    std::unique_ptr<IteratorBase>* out_iterator) {
+    int64 thread_index, const InstantiatedCapturedFunction& inst_captured_func,
+    StringPiece prefix, std::unique_ptr<IteratorBase>* out_iterator) {
   std::vector<Tensor> return_values;
 
-  TF_RETURN_IF_ERROR(
-      captured_func->RunWithBorrowedArgs(ctx, input_element, &return_values));
+  TF_RETURN_IF_ERROR(inst_captured_func.RunWithBorrowedArgs(ctx, input_element,
+                                                            &return_values));
 
   if (!(return_values.size() == 1 && return_values[0].dtype() == DT_VARIANT &&
         TensorShapeUtils::IsScalar(return_values[0].shape()))) {
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 6ec1350cd4105d49eb07242b1f0dad407eb20bfc..23a3d93ed160c95099a5c8ddb237b4c055a1845c 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -15,17 +15,37 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
 #define TENSORFLOW_CORE_KERNELS_DATA_DATASET_UTILS_H_
 
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 
+// This method is used to determine whether we can short-circuit the evaluation
+// of the user-defined function `func`. Short-circuting is possible if every
+// function output corresponds to one of its inputs (e.g. `f(x) = x`, `f(x,y) =
+// (y,x)`, or `f(x) = (x,x)`).
+//
+// If short-circuiting is possible, the method stores the mapping from output
+// indices to input indices in `indices`. Otherwise, `indices` will be empty.
+//
+// Returns non-ok status if analysis of the function fails.
+//
+// TODO(jsimsa): Extend this to support constants as well.
+Status ComputeShortCircuitIndices(OpKernelContext* ctx,
+                                  const NameAttrList& func,
+                                  std::vector<int>* indices);
+
+// Given a vector that maps output indices to input indices, return a vector
+// that identifies for which output indices can we move the input (assuming
+// output indices are processed left to right).
+std::vector<bool> ComputeMoveVector(const std::vector<int>& indices);
+
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
-    int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
-    std::unique_ptr<IteratorBase>* out_iterator);
+    int64 thread_index, const InstantiatedCapturedFunction& inst_captured_func,
+    StringPiece prefix, std::unique_ptr<IteratorBase>* out_iterator);
 
 // Returns Status::OK() if `expected` and `received` types match,
 // errors::InvalidArgument otherwise.
diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..43295b8ebb8f9df2acae8e17162f2d307dd4d9c5
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset_utils_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+TEST(DatasetUtils, ComputeMoveVector) {
+  struct TestCase {
+    std::vector<int> indices;
+    std::vector<bool> expected;
+  };
+
+  TestCase test_cases[] = {
+      TestCase{{}, {}},
+      TestCase{{1}, {true}},
+      TestCase{{1, 1}, {false, true}},
+      TestCase{{1, 2}, {true, true}},
+      TestCase{{1, 1, 2}, {false, true, true}},
+      TestCase{{1, 2, 2}, {true, false, true}},
+  };
+
+  for (auto& test_case : test_cases) {
+    EXPECT_EQ(test_case.expected, ComputeMoveVector(test_case.indices));
+  }
+}
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index 43406db3ed8ce625da362a4205b75ba3ce72b20f..7433303f77671cbf67a6365fb1d552edc7b471e0 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -11,35 +11,31 @@ load(
     "tf_kernel_library",
 )
 
-cc_library(
-    name = "indexed_dataset_headers",
-    hdrs = ["indexed_dataset.h"],
+tf_kernel_library(
+    name = "assert_next_dataset_op",
+    srcs = ["assert_next_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//third_party/eigen3",
     ],
 )
 
 tf_kernel_library(
-    name = "indexed_dataset",
-    srcs = [
-        "identity_indexed_dataset.cc",
-        "indexed_dataset.cc",
-    ],
+    name = "csv_dataset_op",
+    srcs = ["csv_dataset_op.cc"],
     deps = [
-        ":indexed_dataset_headers",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//third_party/eigen3",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
 tf_kernel_library(
-    name = "prefetching_kernels",
-    srcs = ["prefetching_kernels.cc"],
+    name = "dense_to_sparse_batch_dataset_op",
+    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -59,13 +55,29 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "csv_dataset_op",
-    srcs = ["csv_dataset_op.cc"],
+    name = "group_by_reducer_dataset_op",
+    srcs = ["group_by_reducer_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+    ],
+)
+
+tf_kernel_library(
+    name = "group_by_window_dataset_op",
+    srcs = ["group_by_window_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:window_dataset",
     ],
 )
 
@@ -79,6 +91,18 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "indexed_dataset_op",
+    srcs = ["indexed_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/data:dataset_utils",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_kernel_library(
     name = "lmdb_dataset_op",
     srcs = ["lmdb_dataset_op.cc"],
@@ -92,19 +116,198 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "threadpool_dataset_op",
-    srcs = ["threadpool_dataset_op.cc"],
+    name = "map_and_batch_dataset_op",
+    srcs = ["map_and_batch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:inplace_ops",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
+tf_kernel_library(
+    name = "matching_files_dataset_op",
+    srcs = ["matching_files_dataset_op.cc"],
     deps = [
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:dataset",
+    ],
+)
+
+tf_kernel_library(
+    name = "non_serializable_dataset_op",
+    srcs = ["non_serializable_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
         "//third_party/eigen3",
     ],
 )
 
 tf_kernel_library(
-    name = "unique_dataset_op",
-    srcs = ["unique_dataset_op.cc"],
+    name = "numa_map_and_batch_dataset_op",
+    srcs = ["numa_map_and_batch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:inplace_ops",
+        "//tensorflow/core/kernels/data:captured_function",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+tf_kernel_library(
+    name = "parallel_interleave_dataset_op",
+    srcs = ["parallel_interleave_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
+tf_kernel_library(
+    name = "parse_example_dataset_op",
+    srcs = ["parse_example_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/kernels/data:parallel_map_iterator",
+    ],
+)
+
+tf_kernel_library(
+    name = "prefetching_kernels",
+    srcs = ["prefetching_kernels.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "random_dataset_op",
+    srcs = ["random_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "scan_dataset_op",
+    srcs = ["scan_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+    ],
+)
+
+tf_kernel_library(
+    name = "set_stats_aggregator_dataset_op",
+    srcs = ["set_stats_aggregator_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "sleep_dataset_op",
+    srcs = ["sleep_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+    ],
+)
+
+tf_kernel_library(
+    name = "sliding_window_dataset_op",
+    srcs = ["sliding_window_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "sql_dataset_op",
+    srcs = [
+        "sql_dataset_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data/experimental/sql",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_aggregator_ops",
+    srcs = ["stats_aggregator_ops.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_dataset_ops",
+    srcs = ["stats_dataset_ops.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "to_tf_record_op",
+    srcs = ["to_tf_record_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:ops_util",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
+tf_kernel_library(
+    name = "threadpool_dataset_op",
+    srcs = ["threadpool_dataset_op.cc"],
     deps = [
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -114,11 +317,23 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "assert_next_dataset_op",
-    srcs = ["assert_next_dataset_op.cc"],
+    name = "unbatch_dataset_op",
+    srcs = ["unbatch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "unique_dataset_op",
+    srcs = ["unique_dataset_op.cc"],
     deps = [
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//third_party/eigen3",
     ],
 )
@@ -128,12 +343,31 @@ tf_kernel_library(
     deps = [
         ":assert_next_dataset_op",
         ":csv_dataset_op",
+        ":dense_to_sparse_batch_dataset_op",
         ":directed_interleave_dataset_op",
+        ":group_by_reducer_dataset_op",
+        ":group_by_window_dataset_op",
         ":ignore_errors_dataset_op",
-        ":indexed_dataset",
+        ":indexed_dataset_op",
         ":lmdb_dataset_op",
+        ":map_and_batch_dataset_op",
+        ":matching_files_dataset_op",
+        ":non_serializable_dataset_op",
+        ":numa_map_and_batch_dataset_op",
+        ":parallel_interleave_dataset_op",
+        ":parse_example_dataset_op",
         ":prefetching_kernels",
+        ":random_dataset_op",
+        ":scan_dataset_op",
+        ":set_stats_aggregator_dataset_op",
+        ":sleep_dataset_op",
+        ":sliding_window_dataset_op",
+        ":sql_dataset_op",
+        ":stats_aggregator_ops",
+        ":stats_dataset_ops",
         ":threadpool_dataset_op",
+        ":to_tf_record_op",
+        ":unbatch_dataset_op",
         ":unique_dataset_op",
     ],
 )
diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
index 3511cca0f522b53d77e8c71dd65324a0e332da83..3e87f484b940b336ed68099df7427250a4304207 100644
--- a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
@@ -76,6 +76,8 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel {
       return "AssertNextDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -122,6 +124,12 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
diff --git a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
index 7451ca4cb160aa3db87d52fa3ae501b801db7fd4..f6f58fc430b41d05bccdc413c00151130bf7d36d 100644
--- a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
@@ -263,6 +263,11 @@ class CSVDatasetOp : public DatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_file_index"),
@@ -640,7 +645,8 @@ class CSVDatasetOp : public DatasetOpKernel {
                                          " fields but have more in record");
         }
         const DataType& dtype = dataset()->out_type_[output_idx];
-        Tensor component(ctx->allocator({}), dtype, {});
+        out_tensors->emplace_back(ctx->allocator({}), dtype, TensorShape({}));
+        Tensor& component = out_tensors->back();
         if ((field.empty() || field == dataset()->na_value_) &&
             dataset()->record_defaults_[output_idx].NumElements() != 1) {
           // If the field is empty or NA value, and default is not given,
@@ -726,7 +732,6 @@ class CSVDatasetOp : public DatasetOpKernel {
                                            " not supported in field ",
                                            output_idx);
         }
-        out_tensors->push_back(std::move(component));
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
similarity index 93%
rename from tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
index 237511a07dc890572626ef5abd09beed20cb9dad..97e64dd7444e93660afa6defa31314c909a31c7b 100644
--- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
@@ -114,6 +114,14 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
                              ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ + (n % batch_size_ == 0 ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -272,6 +280,13 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(
+            std::move(args),
+            DatasetIterator<Dataset<T>>::dataset()->batch_size_);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(Iterator::SaveInput(writer, input_impl_));
@@ -297,8 +312,9 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("DenseToSparseBatchDataset").Device(DEVICE_CPU),
-                        DenseToSparseBatchDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDenseToSparseBatchDataset").Device(DEVICE_CPU),
+    DenseToSparseBatchDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
index c47a9099c4afc3e3d21cde4c4009b5007789804b..d8bb696167a7971ac21db4b449508946a0c7f11b 100644
--- a/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
@@ -202,6 +202,11 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeInterleaveManyNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (selector_input_impl_) {
diff --git a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
similarity index 92%
rename from tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
index e7244ee208cb7069ae80d3ba7affd1c8ee254e05..1c298cfdd6a3a39aabd81cb5226e03b1c3e3de63 100644
--- a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
@@ -15,17 +15,17 @@ limitations under the License.
 #include <map>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
  public:
@@ -191,11 +191,14 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(dataset()->captured_init_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(
-            dataset()->captured_finalize_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(
+            ctx, &instantiated_key_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_init_func_->Instantiate(
+            ctx, &instantiated_init_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(
+            ctx, &instantiated_reduce_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_finalize_func_->Instantiate(
+            ctx, &instantiated_finalize_func_));
         return Status::OK();
       }
 
@@ -213,9 +216,8 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
           if (!end_of_input_) {
             // Run the key function on the input element.
             std::vector<Tensor> key_func_output;
-            TF_RETURN_IF_ERROR(
-                dataset()->captured_key_func_->RunWithBorrowedArgs(
-                    ctx, next_input_element, &key_func_output));
+            TF_RETURN_IF_ERROR(instantiated_key_func_->RunWithBorrowedArgs(
+                ctx, next_input_element, &key_func_output));
 
             if (key_func_output.size() != 1 ||
                 key_func_output[0].dtype() != DT_INT64 ||
@@ -229,7 +231,7 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
             if (states_.find(key) == states_.end()) {
               // Run the init function to create the initial state.
               std::vector<Tensor> init_func_output;
-              TF_RETURN_IF_ERROR(dataset()->captured_init_func_->Run(
+              TF_RETURN_IF_ERROR(instantiated_init_func_->Run(
                   ctx, std::move(key_func_output), &init_func_output));
               states_[key] = init_func_output;
             }
@@ -243,7 +245,7 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
                       std::back_inserter(args));
 
             std::vector<Tensor> reduce_func_output;
-            TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Run(
+            TF_RETURN_IF_ERROR(instantiated_reduce_func_->Run(
                 ctx, std::move(args), &reduce_func_output));
             states_[key] = reduce_func_output;
           } else {
@@ -259,14 +261,18 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
           *end_of_sequence = true;
           return Status::OK();
         }
-        TF_RETURN_IF_ERROR(
-            dataset()->captured_finalize_func_->RunWithBorrowedArgs(
-                ctx, states_[keys_[keys_index_++]], out_tensors));
+        TF_RETURN_IF_ERROR(instantiated_finalize_func_->RunWithBorrowedArgs(
+            ctx, states_[keys_[keys_index_++]], out_tensors));
         *end_of_sequence = false;
         return Status::OK();
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeUnknownRatioNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -379,6 +385,10 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
       std::map<int64, std::vector<Tensor>> states_ GUARDED_BY(mu_);
       std::vector<int64> keys_ GUARDED_BY(mu_);
       int64 keys_index_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_key_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_init_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_reduce_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_finalize_func_;
     };
 
     const NameAttrList& key_func() const { return captured_key_func_->func(); }
@@ -428,8 +438,9 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
   NameAttrList finalize_func_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("GroupByReducerDataset").Device(DEVICE_CPU),
-                        GroupByReducerDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalGroupByReducerDataset").Device(DEVICE_CPU),
+    GroupByReducerDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
similarity index 94%
rename from tensorflow/core/kernels/data/group_by_window_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
index 14aefe5d5445d414d2f81778a491544801cc0727..98603d5a732c8143db61535e6704d6a7b214413c 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
@@ -15,10 +15,10 @@ limitations under the License.
 #include <map>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/data/window_dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
@@ -26,7 +26,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
  public:
@@ -175,10 +175,12 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(
-            dataset()->captured_window_size_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(
+            ctx, &instantiated_key_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(
+            ctx, &instantiated_reduce_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_window_size_func_->Instantiate(
+            ctx, &instantiated_window_size_func_));
         return Status::OK();
       }
 
@@ -215,9 +217,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
               // Run the key function on the input element to identify its
               // group.
               std::vector<Tensor> key_func_output;
-              TF_RETURN_IF_ERROR(
-                  dataset()->captured_key_func_->RunWithBorrowedArgs(
-                      ctx, next_input_element, &key_func_output));
+              TF_RETURN_IF_ERROR(instantiated_key_func_->RunWithBorrowedArgs(
+                  ctx, next_input_element, &key_func_output));
 
               if (key_func_output.size() != 1 ||
                   key_func_output[0].dtype() != DT_INT64 ||
@@ -232,7 +233,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 // Run the window size function on the key to identify its
                 // window size.
                 std::vector<Tensor> window_size_func_output;
-                TF_RETURN_IF_ERROR(dataset()->captured_window_size_func_->Run(
+                TF_RETURN_IF_ERROR(instantiated_window_size_func_->Run(
                     ctx, std::move(key_func_output), &window_size_func_output));
 
                 if (window_size_func_output.size() != 1 ||
@@ -281,6 +282,11 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeUnknownRatioNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -447,8 +453,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
         std::vector<Tensor> args(
             {std::move(key_arg), std::move(group_dataset_arg)});
         std::vector<Tensor> return_values;
-        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Run(
-            ctx, std::move(args), &return_values));
+        TF_RETURN_IF_ERROR(instantiated_reduce_func_->Run(ctx, std::move(args),
+                                                          &return_values));
 
         if (!(return_values.size() == 1 &&
               return_values[0].dtype() == DT_VARIANT &&
@@ -477,6 +483,10 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       std::map<int64, std::vector<std::vector<Tensor>>> groups_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> current_group_iterator_ GUARDED_BY(mu_);
       std::map<int64, int64> window_sizes_ GUARDED_BY(mu_);
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_key_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_reduce_func_;
+      std::unique_ptr<InstantiatedCapturedFunction>
+          instantiated_window_size_func_;
     };
 
     Status OtherArgumentsNodeAndType(
@@ -513,8 +523,9 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
   NameAttrList window_size_func_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU),
-                        GroupByWindowDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalGroupByWindowDataset").Device(DEVICE_CPU),
+    GroupByWindowDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc b/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc
deleted file mode 100644
index 2141f118ca7b90868dee3005e02e3d1e16971b21..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/data/experimental/indexed_dataset.h"
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
- public:
-  using IndexedDatasetOpKernel::IndexedDatasetOpKernel;
-
-  void MakeIndexedDataset(OpKernelContext* ctx,
-                          IndexedDataset** output) override {
-    uint64 size = -1;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<uint64>(ctx, "size", &size));
-    OP_REQUIRES(ctx, size > 0, errors::InvalidArgument("`size` must be > 0"));
-    *output = new Dataset(ctx, size);
-  }
-
-  class Dataset : public IndexedDataset {
-   public:
-    Dataset(OpKernelContext* ctx, uint64 size)
-        : IndexedDataset(DatasetContext(ctx)), size_(size) {}
-
-    Status MaterializeDataset(
-        std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
-      materialized->reset(new Materialized(this));
-      return Status::OK();
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      static DataTypeVector* dtypes = new DataTypeVector({DT_UINT64});
-      return *dtypes;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      static std::vector<PartialTensorShape>* shapes =
-          new std::vector<PartialTensorShape>({{}});
-      return *shapes;
-    }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::IdentityIndexedDataset")}));
-    }
-
-    string DebugString() const override {
-      return "IdentityIndexedDataset::Dataset";
-    }
-
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** node) const override {
-      return errors::Unimplemented(
-          "identity_indexed_dataset.AsGraphDefInternal");
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        if (cur_ < dataset()->size_) {
-          Tensor result_tensor(ctx->allocator({}), DT_UINT64, {});
-          result_tensor.scalar<uint64>()() = cur_++;
-          out_tensors->emplace_back(std::move(result_tensor));
-          *end_of_sequence = false;
-          return Status::OK();
-        }
-        *end_of_sequence = true;
-        return Status::OK();
-      }
-
-     private:
-      mutex mu_;
-      uint64 cur_ GUARDED_BY(mu_);
-    };
-
-    class Materialized : public MaterializedIndexedDataset {
-     public:
-      explicit Materialized(Dataset* dataset) : dataset_(dataset) {
-        dataset->Ref();
-      }
-
-      ~Materialized() override {
-        // TODO(saeta): Pull this into MaterializedIndexedDataset
-        dataset_->Unref();
-      }
-
-      const DataTypeVector& output_dtypes() const override {
-        return dataset_->output_dtypes();
-      }
-
-      const std::vector<PartialTensorShape>& output_shapes() const override {
-        return dataset_->output_shapes();
-      }
-
-      Status Get(IteratorContext&& ctx, uint64 index,
-                 std::vector<Tensor>* out_tensors) const override {
-        LOG(INFO) << "Materialized(" << dataset_->size_ << ")::Get(" << index
-                  << ")";
-        if (index >= dataset_->size_) {
-          // Note: use InvalidArgument instead of OutOfRange error because many
-          // things consider OutOfRange to be a "clean termination" error.
-          return errors::InvalidArgument(
-              "Index ", index,
-              " is out of range for this dataset. (Size is: ", dataset_->size_,
-              ".)");
-        }
-        Tensor result_tensor(ctx.allocator({}), DT_UINT64, {});
-        result_tensor.scalar<uint64>()() = index;
-        out_tensors->emplace_back(std::move(result_tensor));
-        return Status::OK();
-      }
-
-      Status Size(uint64* size) const override {
-        *size = dataset_->size_;
-        return Status::OK();
-      }
-
-     private:
-      const Dataset* const dataset_;  // Not owned.
-    };
-
-    const uint64 size_;
-    std::shared_ptr<Materialized> materialized_;
-  };
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalIdentityIndexedDataset").Device(DEVICE_CPU),
-    IdentityIndexedDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
index b34377c6429837e1d1d03a2c57272ba522af3468..d445d9c8094eec5c9a2bff9c45e2dc28e264d096 100644
--- a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
@@ -60,6 +60,8 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
       return "IgnoreErrorsDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -103,6 +105,12 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_)
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset.cc
deleted file mode 100644
index 75ea462f4020bbf02ab05597a23869f90a90cc30..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset.cc
+++ /dev/null
@@ -1,375 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include "tensorflow/core/kernels/data/experimental/indexed_dataset.h"
-
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/gtl/cleanup.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-Status VerifyTypesMatch(const DataTypeVector& expected,
-                        const DataTypeVector& received) {
-  if (expected.size() != received.size()) {
-    return errors::InvalidArgument(
-        "Number of components does not match: expected ", expected.size(),
-        " types but got ", received.size(), ".");
-  }
-  for (size_t i = 0; i < expected.size(); ++i) {
-    if (expected[i] != received[i]) {
-      return errors::InvalidArgument("Data type mismatch at component ", i,
-                                     ": expected ", DataTypeString(expected[i]),
-                                     " but got ", DataTypeString(received[i]),
-                                     ".");
-    }
-  }
-  return Status::OK();
-}
-
-Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
-                              const std::vector<PartialTensorShape>& received) {
-  if (expected.size() != received.size()) {
-    return errors::InvalidArgument(
-        "Number of components does not match: expected ", expected.size(),
-        " shapes but got ", received.size(), ".");
-  }
-  for (size_t i = 0; i < expected.size(); ++i) {
-    if (!expected[i].IsCompatibleWith(received[i])) {
-      return errors::InvalidArgument("Incompatible shapes at component ", i,
-                                     ": expected ", expected[i].DebugString(),
-                                     " but got ", received[i].DebugString(),
-                                     ".");
-    }
-  }
-
-  return Status::OK();
-}
-
-class MaterializedDatasetResource : public ResourceBase {
- public:
-  MaterializedDatasetResource(
-      const DataTypeVector& output_dtypes,
-      const std::vector<PartialTensorShape>& output_shapes)
-      : output_dtypes_(output_dtypes), output_shapes_(output_shapes) {}
-
-  string DebugString() override {
-    return "Materialized IndexedDataset resource";
-  }
-
-  Status Get(IteratorContext&& ctx, uint64 index,
-             std::vector<Tensor>* out_tensors) {
-    std::shared_ptr<MaterializedIndexedDataset> captured(materialized_);
-    if (captured) {
-      return captured->Get(std::move(ctx), index, out_tensors);
-    } else {
-      return errors::FailedPrecondition(
-          "Get() failed because the MaterializedIndexedDataset has not been "
-          "initialized. Ensure that you have run the materialization operation "
-          "for this MaterializedIndexedDataset before retrieving elements.");
-    }
-  }
-
-  // TODO(saeta): Implement Save and Restore
-
-  const DataTypeVector& output_dtypes() const { return output_dtypes_; }
-  const std::vector<PartialTensorShape>& output_shapes() const {
-    return output_shapes_;
-  }
-
-  Status set_materialized_dataset(
-      const std::shared_ptr<MaterializedIndexedDataset>& dataset) {
-    if (dataset) {
-      TF_RETURN_IF_ERROR(
-          VerifyTypesMatch(output_dtypes_, dataset->output_dtypes()));
-      TF_RETURN_IF_ERROR(
-          VerifyShapesCompatible(output_shapes_, dataset->output_shapes()));
-    }
-    materialized_ = dataset;
-    return Status::OK();
-  }
-
- private:
-  std::shared_ptr<MaterializedIndexedDataset> materialized_;
-  const DataTypeVector output_dtypes_;
-  const std::vector<PartialTensorShape> output_shapes_;
-};
-
-// A wrapper class for storing an `IndexedDataset` instance in a DT_VARIANT
-// tensor. Objects of the wrapper class own a reference on an instance of an
-// `IndexedTensor` and the wrapper's copy constructor and desctructor take care
-// of managing the reference count.
-//
-// NOTE: This is not a feature-complete implementation of the DT_VARIANT
-// specification. In particular, we cannot currently serialize an arbitrary
-// `IndexedDataset` object, so the `Encode()` and `Decode()` methods are not
-// implemented.
-//
-// NOTE(saeta): When `IndexedDataset`s get merged into core, we can instead just
-// use `tensorflow::DatasetVariantWrapper`.
-class IndexedDatasetVariantWrapper {
- public:
-  IndexedDatasetVariantWrapper() : dataset_(nullptr) {}
-
-  // Transfers ownership of `dataset` to `*this`.
-  explicit IndexedDatasetVariantWrapper(IndexedDataset* dataset)
-      : dataset_(dataset) {}
-
-  IndexedDatasetVariantWrapper(const IndexedDatasetVariantWrapper& other)
-      : dataset_(other.dataset_) {
-    if (dataset_) dataset_->Ref();
-  }
-
-  ~IndexedDatasetVariantWrapper() {
-    if (dataset_) dataset_->Unref();
-  }
-
-  IndexedDataset* get() const { return dataset_; }
-
-  string TypeName() const { return "tensorflow::IndexedDatasetVariantWrapper"; }
-  string DebugString() const {
-    if (dataset_) {
-      return dataset_->DebugString();
-    } else {
-      return "<Uninitialized IndexedDatasetVariantWrapper>";
-    }
-  }
-
-  void Encode(VariantTensorData* data) const {
-    LOG(ERROR) << "The Encode() method is not implemented for "
-                  "IndexedDatasetVariantWrapper objects.";
-  }
-
-  bool Decode(const VariantTensorData& data) {
-    LOG(ERROR) << "The Decode() method is not implemented for "
-                  "IndexedDatasetVariantWrapper objects.";
-    return false;
-  }
-
- private:
-  IndexedDataset* const dataset_;  // Owns one reference.
-};
-
-}  // namespace
-
-Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
-                                          IndexedDataset** out_dataset) {
-  if (!(tensor.dtype() == DT_VARIANT ||
-        TensorShapeUtils::IsScalar(tensor.shape()))) {
-    return errors::InvalidArgument(
-        "IndexedDataset tensor must be a scalar of dtype DT_VARIANT.");
-  }
-  const Variant& variant = tensor.scalar<Variant>()();
-  const IndexedDatasetVariantWrapper* wrapper =
-      variant.get<IndexedDatasetVariantWrapper>();
-  if (wrapper == nullptr) {
-    return errors::InvalidArgument("Tensor must be an IndexedDataset object.");
-  }
-  *out_dataset = wrapper->get();
-  if (*out_dataset == nullptr) {
-    return errors::Internal("Read uninitialized IndexedDataset variant.");
-  }
-  return Status::OK();
-}
-
-Status StoreIndexedDatasetInVariantTensor(IndexedDataset* dataset,
-                                          Tensor* tensor) {
-  if (!(tensor->dtype() == DT_VARIANT ||
-        TensorShapeUtils::IsScalar(tensor->shape()))) {
-    return errors::InvalidArgument(
-        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
-  }
-  tensor->scalar<Variant>()() = IndexedDatasetVariantWrapper(dataset);
-  return Status::OK();
-}
-
-void IndexedDatasetOpKernel::Compute(OpKernelContext* ctx) {
-  IndexedDataset* dataset = nullptr;
-  MakeIndexedDataset(ctx, &dataset);
-
-  if (ctx->status().ok()) {
-    OP_REQUIRES(ctx, dataset != nullptr,
-                errors::Internal("MakeIndexedDataset did not correctly "
-                                 "construct the IndexedDataset"));
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
-    OP_REQUIRES_OK(ctx, StoreIndexedDatasetInVariantTensor(dataset, output));
-  }
-}
-
-namespace {
-
-class MaterializedHandleOp : public OpKernel {
- public:
-  explicit MaterializedHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
-
-  ~MaterializedHandleOp() override {
-    if (resource_ != nullptr) {
-      resource_->Unref();
-      if (cinfo_.resource_is_private_to_kernel()) {
-        if (!cinfo_.resource_manager()
-                 ->template Delete<MaterializedDatasetResource>(
-                     cinfo_.container(), cinfo_.name())
-                 .ok()) {
-          // Do nothing; the resource can have been deleted by session resets.
-          // Note: cargo-culted from $tf/core/framework/resource_op_kernel.h
-        }
-      }
-    }
-  }
-
-  void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
-    {
-      mutex_lock l(mu_);
-      if (resource_ == nullptr) {
-        ResourceMgr* mgr = context->resource_manager();
-        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
-
-        MaterializedDatasetResource* resource;
-        OP_REQUIRES_OK(context,
-                       mgr->LookupOrCreate<MaterializedDatasetResource>(
-                           cinfo_.container(), cinfo_.name(), &resource,
-                           [this](MaterializedDatasetResource** ret)
-                               EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                                 *ret = new MaterializedDatasetResource(
-                                     output_dtypes_, output_shapes_);
-                                 return Status::OK();
-                               }));
-        Status s = VerifyResource(resource);
-        if (TF_PREDICT_FALSE(!s.ok())) {
-          resource->Unref();
-          context->SetStatus(s);
-          return;
-        }
-
-        resource_ = resource;
-      }
-    }
-    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
-                                context, 0, cinfo_.container(), cinfo_.name(),
-                                MakeTypeIndex<MaterializedDatasetResource>()));
-  }
-
- private:
-  // During the first Compute(), resource is either created or looked up using
-  // shared_name. In the latter case, the resource found should be verified if
-  // it is compatible with this op's configuration. The verification may fail in
-  // cases such as two graphs asking queues of the same shared name to have
-  // inconsistent capacities.
-  Status VerifyResource(MaterializedDatasetResource* resource) {
-    TF_RETURN_IF_ERROR(
-        VerifyTypesMatch(output_dtypes_, resource->output_dtypes()));
-    TF_RETURN_IF_ERROR(
-        VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
-    return Status::OK();
-  }
-
-  mutex mu_;
-  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
-  MaterializedDatasetResource* resource_ GUARDED_BY(mu_) = nullptr;
-  DataTypeVector output_dtypes_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
-
-// TODO(saeta): Make async.
-class MaterializeDatasetOp : public OpKernel {
- public:
-  explicit MaterializeDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    IndexedDataset* dataset;
-    OP_REQUIRES_OK(ctx,
-                   GetIndexedDatasetFromVariantTensor(ctx->input(0), &dataset));
-
-    MaterializedDatasetResource* materialized_resource;
-    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
-                                       &materialized_resource));
-    core::ScopedUnref unref(materialized_resource);
-    std::shared_ptr<MaterializedIndexedDataset> materialized;
-    OP_REQUIRES_OK(ctx, dataset->MaterializeDataset(&materialized));
-    OP_REQUIRES_OK(
-        ctx, materialized_resource->set_materialized_dataset(materialized));
-  }
-};
-
-// TODO(saeta): Make async
-class IndexedDatasetGet : public OpKernel {
- public:
-  explicit IndexedDatasetGet(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    MaterializedDatasetResource* materialized_resource;
-    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0),
-                                       &materialized_resource));
-    auto cleanup = gtl::MakeCleanup([materialized_resource] {
-      materialized_resource->Unref();  // Note: can't use core::ScopedUnref.
-    });
-
-    const Tensor* index_t;
-    OP_REQUIRES_OK(ctx, ctx->input("index", &index_t));
-    // TODO(saeta): Support batch reads (indexes should be non-scalar!)
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(index_t->shape()),
-                errors::InvalidArgument("index must be a scalar"));
-    const uint64 index = index_t->scalar<uint64>()();
-
-    std::vector<Tensor> out_tensors;
-    Status s =
-        materialized_resource->Get(IteratorContext(ctx), index, &out_tensors);
-
-    // Note: Unref materialized_resource to avoid destruction races. (Important
-    // in a [future] async op implementation.)
-    cleanup.release()();
-
-    if (!s.ok()) {
-      ctx->SetStatus(s);
-    } else {
-      auto expected_shapes = materialized_resource->output_shapes();
-      auto expected_types = materialized_resource->output_dtypes();
-      for (size_t i = 0; i < out_tensors.size(); ++i) {
-        OP_REQUIRES(
-            ctx, expected_shapes[i].IsCompatibleWith(out_tensors[i].shape()),
-            errors::Internal(
-                "Materialized dataset output at index ", i,
-                " is incompatible with the expected shape. (Expected: ",
-                expected_shapes[i], ", got: ", out_tensors[i].shape(), ")"));
-        OP_REQUIRES(ctx, out_tensors[i].dtype() == expected_types[i],
-                    errors::Internal("Materialized dataset output at index ", i,
-                                     " was not the expected dtype. (Expected: ",
-                                     expected_types[i],
-                                     ", got: ", out_tensors[i].dtype(), ")"));
-        ctx->set_output(i, out_tensors[i]);
-      }
-    }
-  }
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalMaterializedIndexDatasetHandle").Device(DEVICE_CPU),
-    MaterializedHandleOp);
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalIndexedDatasetMaterialize").Device(DEVICE_CPU),
-    MaterializeDatasetOp);
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalIndexedDatasetGet").Device(DEVICE_CPU),
-    IndexedDatasetGet);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset.h b/tensorflow/core/kernels/data/experimental/indexed_dataset.h
deleted file mode 100644
index 27a8360cbcffc55c2f4f8ce437e5080e070845df..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
-
-#include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/op_kernel.h"
-
-namespace tensorflow {
-namespace data {
-
-// TODO(saeta): Urgh, this is ugly.
-class MaterializedIndexedDataset {
- public:
-  virtual ~MaterializedIndexedDataset() = default;
-
-  // Retrieve the element at a given index. The output tensors are stored in
-  // out_tensors.
-  //
-  // If `index` is greater than `Size()`, tensorflow::errors::OutOfRangeError is
-  // returned.
-  //
-  // Get is thread-safe.
-  virtual Status Get(IteratorContext&& ctx, uint64 index,
-                     std::vector<Tensor>* out_tensors) const = 0;
-
-  // Size determines the number of elements in this IndexedDataset.
-  //
-  // Size is thread-safe.
-  virtual Status Size(uint64* size) const = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this dataset.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this dataset.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-};
-
-// IndexedDataset represents a dataset that supports random access in addition
-// to iterator-based sequential access.
-//
-// Note: IndexedDatasets are HIGHLY experimental at this time. Expect
-// significant (backwards incompatible) changes!
-class IndexedDataset : public DatasetBase {
- public:
-  IndexedDataset(DatasetContext&& ctx) : DatasetBase(std::move(ctx)) {}
-
-  // Materialize (if necessary) the dataset, and return a pointer.
-  // TODO(saeta): Add in `IteratorContext* ctx` when materializing.
-  virtual Status MaterializeDataset(
-      std::shared_ptr<MaterializedIndexedDataset>* materialized) = 0;
-};
-
-// IndexedDatasetOpKernel abstracts away interfacing IndexedDatasets with the
-// rest of the TensorFlow runtime.
-//
-// Most IndexedDataset's will be private members of classes inheriting from this
-// class.
-class IndexedDatasetOpKernel : public OpKernel {
- public:
-  IndexedDatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) final;
-
- protected:
-  // Subclasses should implement this method. It will be called during Compute
-  // execution.
-  virtual void MakeIndexedDataset(OpKernelContext* ctx,
-                                  IndexedDataset** output) = 0;
-
-  template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
-    const Tensor* argument_t;
-    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
-    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
-      return errors::InvalidArgument(argument_name, " must be a scalar");
-    }
-    *output = argument_t->scalar<T>()();
-    return Status::OK();
-  }
-};
-
-// Validates and extracts an `IndexedDataset` object from `tensor`.
-//
-// `tensor` must have been written by a call to
-// `StoreIndexedDatasetInVariantTensor`
-//
-// The retrieved pointer isa  borrowed reference to the dataset, which is owned
-// by the tensor. The consumer must either acquire its own reference to the
-// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
-// destroyed or mutated while the retrieved pointer is in use.
-Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
-                                          IndexedDataset** out_dataset);
-
-// Stores an `IndexedDataset` object in `tensor.`
-//
-// The ownership of `dataset` is transferred to `tensor`.
-Status StoreIndexedDatasetInVariantTensor(IndexedDataset* dataset,
-                                          Tensor* tensor);
-
-}  // namespace data
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a07eaebdf9d645fba51945d7bd3e79b72b5e5dc2
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
@@ -0,0 +1,548 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// TODO(saeta): Urgh, this is ugly.
+class MaterializedIndexedDataset {
+ public:
+  virtual ~MaterializedIndexedDataset() = default;
+
+  // Retrieve the element at a given index. The output tensors are stored in
+  // out_tensors.
+  //
+  // If `index` is greater than `Size()`, tensorflow::errors::OutOfRangeError is
+  // returned.
+  //
+  // Get is thread-safe.
+  virtual Status Get(IteratorContext&& ctx, uint64 index,
+                     std::vector<Tensor>* out_tensors) const = 0;
+
+  // Size determines the number of elements in this IndexedDataset.
+  //
+  // Size is thread-safe.
+  virtual Status Size(uint64* size) const = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this dataset.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this dataset.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+};
+
+// IndexedDataset represents a dataset that supports random access in addition
+// to iterator-based sequential access.
+//
+// Note: IndexedDatasets are HIGHLY experimental at this time. Expect
+// significant (backwards incompatible) changes!
+class IndexedDataset : public DatasetBase {
+ public:
+  explicit IndexedDataset(DatasetContext&& ctx) : DatasetBase(std::move(ctx)) {}
+
+  // Materialize (if necessary) the dataset, and return a pointer.
+  // TODO(saeta): Add in `IteratorContext* ctx` when materializing.
+  virtual Status MaterializeDataset(
+      std::shared_ptr<MaterializedIndexedDataset>* materialized) = 0;
+};
+
+// IndexedDatasetOpKernel abstracts away interfacing IndexedDatasets with the
+// rest of the TensorFlow runtime.
+//
+// Most IndexedDataset's will be private members of classes inheriting from this
+// class.
+class IndexedDatasetOpKernel : public OpKernel {
+ public:
+  explicit IndexedDatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) final;
+
+ protected:
+  // Subclasses should implement this method. It will be called during Compute
+  // execution.
+  virtual void MakeIndexedDataset(OpKernelContext* ctx,
+                                  IndexedDataset** output) = 0;
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
+    }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
+  }
+};
+
+class MaterializedDatasetResource : public ResourceBase {
+ public:
+  MaterializedDatasetResource(
+      const DataTypeVector& output_dtypes,
+      const std::vector<PartialTensorShape>& output_shapes)
+      : output_dtypes_(output_dtypes), output_shapes_(output_shapes) {}
+
+  string DebugString() override {
+    return "Materialized IndexedDataset resource";
+  }
+
+  Status Get(IteratorContext&& ctx, uint64 index,
+             std::vector<Tensor>* out_tensors) {
+    std::shared_ptr<MaterializedIndexedDataset> captured(materialized_);
+    if (captured) {
+      return captured->Get(std::move(ctx), index, out_tensors);
+    } else {
+      return errors::FailedPrecondition(
+          "Get() failed because the MaterializedIndexedDataset has not been "
+          "initialized. Ensure that you have run the materialization operation "
+          "for this MaterializedIndexedDataset before retrieving elements.");
+    }
+  }
+
+  // TODO(saeta): Implement Save and Restore
+
+  const DataTypeVector& output_dtypes() const { return output_dtypes_; }
+  const std::vector<PartialTensorShape>& output_shapes() const {
+    return output_shapes_;
+  }
+
+  Status set_materialized_dataset(
+      const std::shared_ptr<MaterializedIndexedDataset>& dataset) {
+    if (dataset) {
+      TF_RETURN_IF_ERROR(
+          VerifyTypesMatch(output_dtypes_, dataset->output_dtypes()));
+      TF_RETURN_IF_ERROR(
+          VerifyShapesCompatible(output_shapes_, dataset->output_shapes()));
+    }
+    materialized_ = dataset;
+    return Status::OK();
+  }
+
+ private:
+  std::shared_ptr<MaterializedIndexedDataset> materialized_;
+  const DataTypeVector output_dtypes_;
+  const std::vector<PartialTensorShape> output_shapes_;
+};
+
+// A wrapper class for storing an `IndexedDataset` instance in a DT_VARIANT
+// tensor. Objects of the wrapper class own a reference on an instance of an
+// `IndexedTensor` and the wrapper's copy constructor and desctructor take care
+// of managing the reference count.
+//
+// NOTE: This is not a feature-complete implementation of the DT_VARIANT
+// specification. In particular, we cannot currently serialize an arbitrary
+// `IndexedDataset` object, so the `Encode()` and `Decode()` methods are not
+// implemented.
+//
+// NOTE(saeta): When `IndexedDataset`s get merged into core, we can instead just
+// use `tensorflow::DatasetVariantWrapper`.
+class IndexedDatasetVariantWrapper {
+ public:
+  IndexedDatasetVariantWrapper() : dataset_(nullptr) {}
+
+  // Transfers ownership of `dataset` to `*this`.
+  explicit IndexedDatasetVariantWrapper(IndexedDataset* dataset)
+      : dataset_(dataset) {}
+
+  IndexedDatasetVariantWrapper(const IndexedDatasetVariantWrapper& other)
+      : dataset_(other.dataset_) {
+    if (dataset_) dataset_->Ref();
+  }
+
+  ~IndexedDatasetVariantWrapper() {
+    if (dataset_) dataset_->Unref();
+  }
+
+  IndexedDataset* get() const { return dataset_; }
+
+  string TypeName() const { return "tensorflow::IndexedDatasetVariantWrapper"; }
+  string DebugString() const {
+    if (dataset_) {
+      return dataset_->DebugString();
+    } else {
+      return "<Uninitialized IndexedDatasetVariantWrapper>";
+    }
+  }
+
+  void Encode(VariantTensorData* data) const {
+    LOG(ERROR) << "The Encode() method is not implemented for "
+                  "IndexedDatasetVariantWrapper objects.";
+  }
+
+  bool Decode(const VariantTensorData& data) {
+    LOG(ERROR) << "The Decode() method is not implemented for "
+                  "IndexedDatasetVariantWrapper objects.";
+    return false;
+  }
+
+ private:
+  IndexedDataset* const dataset_;  // Owns one reference.
+};
+
+Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
+                                          IndexedDataset** out_dataset) {
+  if (!(tensor.dtype() == DT_VARIANT ||
+        TensorShapeUtils::IsScalar(tensor.shape()))) {
+    return errors::InvalidArgument(
+        "IndexedDataset tensor must be a scalar of dtype DT_VARIANT.");
+  }
+  const Variant& variant = tensor.scalar<Variant>()();
+  const IndexedDatasetVariantWrapper* wrapper =
+      variant.get<IndexedDatasetVariantWrapper>();
+  if (wrapper == nullptr) {
+    return errors::InvalidArgument("Tensor must be an IndexedDataset object.");
+  }
+  *out_dataset = wrapper->get();
+  if (*out_dataset == nullptr) {
+    return errors::Internal("Read uninitialized IndexedDataset variant.");
+  }
+  return Status::OK();
+}
+
+Status StoreIndexedDatasetInVariantTensor(IndexedDataset* dataset,
+                                          Tensor* tensor) {
+  if (!(tensor->dtype() == DT_VARIANT ||
+        TensorShapeUtils::IsScalar(tensor->shape()))) {
+    return errors::InvalidArgument(
+        "Dataset tensor must be a scalar of dtype DT_VARIANT.");
+  }
+  tensor->scalar<Variant>()() = IndexedDatasetVariantWrapper(dataset);
+  return Status::OK();
+}
+
+void IndexedDatasetOpKernel::Compute(OpKernelContext* ctx) {
+  IndexedDataset* dataset = nullptr;
+  MakeIndexedDataset(ctx, &dataset);
+
+  if (ctx->status().ok()) {
+    OP_REQUIRES(ctx, dataset != nullptr,
+                errors::Internal("MakeIndexedDataset did not correctly "
+                                 "construct the IndexedDataset"));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    OP_REQUIRES_OK(ctx, StoreIndexedDatasetInVariantTensor(dataset, output));
+  }
+}
+
+class MaterializedHandleOp : public OpKernel {
+ public:
+  explicit MaterializedHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  ~MaterializedHandleOp() override {
+    if (resource_ != nullptr) {
+      resource_->Unref();
+      if (cinfo_.resource_is_private_to_kernel()) {
+        if (!cinfo_.resource_manager()
+                 ->template Delete<MaterializedDatasetResource>(
+                     cinfo_.container(), cinfo_.name())
+                 .ok()) {
+          // Do nothing; the resource can have been deleted by session resets.
+          // Note: cargo-culted from $tf/core/framework/resource_op_kernel.h
+        }
+      }
+    }
+  }
+
+  void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
+    {
+      mutex_lock l(mu_);
+      if (resource_ == nullptr) {
+        ResourceMgr* mgr = context->resource_manager();
+        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+
+        MaterializedDatasetResource* resource;
+        OP_REQUIRES_OK(context,
+                       mgr->LookupOrCreate<MaterializedDatasetResource>(
+                           cinfo_.container(), cinfo_.name(), &resource,
+                           [this](MaterializedDatasetResource** ret)
+                               EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                                 *ret = new MaterializedDatasetResource(
+                                     output_dtypes_, output_shapes_);
+                                 return Status::OK();
+                               }));
+        Status s = VerifyResource(resource);
+        if (TF_PREDICT_FALSE(!s.ok())) {
+          resource->Unref();
+          context->SetStatus(s);
+          return;
+        }
+
+        resource_ = resource;
+      }
+    }
+    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                                context, 0, cinfo_.container(), cinfo_.name(),
+                                MakeTypeIndex<MaterializedDatasetResource>()));
+  }
+
+ private:
+  // During the first Compute(), resource is either created or looked up using
+  // shared_name. In the latter case, the resource found should be verified if
+  // it is compatible with this op's configuration. The verification may fail in
+  // cases such as two graphs asking queues of the same shared name to have
+  // inconsistent capacities.
+  Status VerifyResource(MaterializedDatasetResource* resource) {
+    TF_RETURN_IF_ERROR(
+        VerifyTypesMatch(output_dtypes_, resource->output_dtypes()));
+    TF_RETURN_IF_ERROR(
+        VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
+    return Status::OK();
+  }
+
+  mutex mu_;
+  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
+  MaterializedDatasetResource* resource_ GUARDED_BY(mu_) = nullptr;
+  DataTypeVector output_dtypes_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+// TODO(saeta): Make async.
+class MaterializeDatasetOp : public OpKernel {
+ public:
+  explicit MaterializeDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    IndexedDataset* dataset;
+    OP_REQUIRES_OK(ctx,
+                   GetIndexedDatasetFromVariantTensor(ctx->input(0), &dataset));
+
+    MaterializedDatasetResource* materialized_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
+                                       &materialized_resource));
+    core::ScopedUnref unref(materialized_resource);
+    std::shared_ptr<MaterializedIndexedDataset> materialized;
+    OP_REQUIRES_OK(ctx, dataset->MaterializeDataset(&materialized));
+    OP_REQUIRES_OK(
+        ctx, materialized_resource->set_materialized_dataset(materialized));
+  }
+};
+
+// TODO(saeta): Make async
+class IndexedDatasetGet : public OpKernel {
+ public:
+  explicit IndexedDatasetGet(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    MaterializedDatasetResource* materialized_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0),
+                                       &materialized_resource));
+    auto cleanup = gtl::MakeCleanup([materialized_resource] {
+      materialized_resource->Unref();  // Note: can't use core::ScopedUnref.
+    });
+
+    const Tensor* index_t;
+    OP_REQUIRES_OK(ctx, ctx->input("index", &index_t));
+    // TODO(saeta): Support batch reads (indexes should be non-scalar!)
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(index_t->shape()),
+                errors::InvalidArgument("index must be a scalar"));
+    const uint64 index = index_t->scalar<uint64>()();
+
+    std::vector<Tensor> out_tensors;
+    Status s =
+        materialized_resource->Get(IteratorContext(ctx), index, &out_tensors);
+
+    // Note: Unref materialized_resource to avoid destruction races. (Important
+    // in a [future] async op implementation.)
+    cleanup.release()();
+
+    if (!s.ok()) {
+      ctx->SetStatus(s);
+    } else {
+      auto expected_shapes = materialized_resource->output_shapes();
+      auto expected_types = materialized_resource->output_dtypes();
+      for (size_t i = 0; i < out_tensors.size(); ++i) {
+        OP_REQUIRES(
+            ctx, expected_shapes[i].IsCompatibleWith(out_tensors[i].shape()),
+            errors::Internal(
+                "Materialized dataset output at index ", i,
+                " is incompatible with the expected shape. (Expected: ",
+                expected_shapes[i], ", got: ", out_tensors[i].shape(), ")"));
+        OP_REQUIRES(ctx, out_tensors[i].dtype() == expected_types[i],
+                    errors::Internal("Materialized dataset output at index ", i,
+                                     " was not the expected dtype. (Expected: ",
+                                     expected_types[i],
+                                     ", got: ", out_tensors[i].dtype(), ")"));
+        ctx->set_output(i, out_tensors[i]);
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalMaterializedIndexDatasetHandle").Device(DEVICE_CPU),
+    MaterializedHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIndexedDatasetMaterialize").Device(DEVICE_CPU),
+    MaterializeDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIndexedDatasetGet").Device(DEVICE_CPU),
+    IndexedDatasetGet);
+
+class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
+ public:
+  using IndexedDatasetOpKernel::IndexedDatasetOpKernel;
+
+  void MakeIndexedDataset(OpKernelContext* ctx,
+                          IndexedDataset** output) override {
+    uint64 size = -1;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<uint64>(ctx, "size", &size));
+    OP_REQUIRES(ctx, size > 0, errors::InvalidArgument("`size` must be > 0"));
+    *output = new Dataset(ctx, size);
+  }
+
+  class Dataset : public IndexedDataset {
+   public:
+    Dataset(OpKernelContext* ctx, uint64 size)
+        : IndexedDataset(DatasetContext(ctx)), size_(size) {}
+
+    Status MaterializeDataset(
+        std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
+      materialized->reset(new Materialized(this));
+      return Status::OK();
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_UINT64});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::IdentityIndexedDataset")}));
+    }
+
+    string DebugString() const override {
+      return "IdentityIndexedDataset::Dataset";
+    }
+
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** node) const override {
+      return errors::Unimplemented(
+          "identity_indexed_dataset.AsGraphDefInternal");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (cur_ < dataset()->size_) {
+          out_tensors->emplace_back(ctx->allocator({}), DT_UINT64,
+                                    TensorShape({}));
+          out_tensors->back().scalar<uint64>()() = cur_++;
+          *end_of_sequence = false;
+          return Status::OK();
+        }
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+     private:
+      mutex mu_;
+      uint64 cur_ GUARDED_BY(mu_);
+    };
+
+    class Materialized : public MaterializedIndexedDataset {
+     public:
+      explicit Materialized(Dataset* dataset) : dataset_(dataset) {
+        dataset->Ref();
+      }
+
+      ~Materialized() override {
+        // TODO(saeta): Pull this into MaterializedIndexedDataset
+        dataset_->Unref();
+      }
+
+      const DataTypeVector& output_dtypes() const override {
+        return dataset_->output_dtypes();
+      }
+
+      const std::vector<PartialTensorShape>& output_shapes() const override {
+        return dataset_->output_shapes();
+      }
+
+      Status Get(IteratorContext&& ctx, uint64 index,
+                 std::vector<Tensor>* out_tensors) const override {
+        LOG(INFO) << "Materialized(" << dataset_->size_ << ")::Get(" << index
+                  << ")";
+        if (index >= dataset_->size_) {
+          // Note: use InvalidArgument instead of OutOfRange error because many
+          // things consider OutOfRange to be a "clean termination" error.
+          return errors::InvalidArgument(
+              "Index ", index,
+              " is out of range for this dataset. (Size is: ", dataset_->size_,
+              ".)");
+        }
+        out_tensors->emplace_back(ctx.allocator({}), DT_UINT64,
+                                  TensorShape({}));
+        out_tensors->back().scalar<uint64>()() = index;
+        return Status::OK();
+      }
+
+      Status Size(uint64* size) const override {
+        *size = dataset_->size_;
+        return Status::OK();
+      }
+
+     private:
+      const Dataset* const dataset_;  // Not owned.
+    };
+
+    const uint64 size_;
+    std::shared_ptr<Materialized> materialized_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIdentityIndexedDataset").Device(DEVICE_CPU),
+    IdentityIndexedDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc b/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
index 8a88d32f0cb55472ed0534c8fbcb5e7f8ec677ea..6248eb775e481cc5f6940b5c2131d4c963186af5 100644
--- a/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
@@ -92,16 +92,18 @@ class LMDBDatasetOp : public DatasetOpKernel {
         mutex_lock l(mu_);
         do {
           if (mdb_cursor_) {
-            Tensor key_tensor(ctx->allocator({}), DT_STRING, {});
+            out_tensors->emplace_back(ctx->allocator({}), DT_STRING,
+                                      TensorShape({}));
+            Tensor& key_tensor = out_tensors->back();
             key_tensor.scalar<string>()() = string(
                 static_cast<const char*>(mdb_key_.mv_data), mdb_key_.mv_size);
-            out_tensors->emplace_back(std::move(key_tensor));
 
-            Tensor value_tensor(ctx->allocator({}), DT_STRING, {});
+            out_tensors->emplace_back(ctx->allocator({}), DT_STRING,
+                                      TensorShape({}));
+            Tensor& value_tensor = out_tensors->back();
             value_tensor.scalar<string>()() =
                 string(static_cast<const char*>(mdb_value_.mv_data),
                        mdb_value_.mv_size);
-            out_tensors->emplace_back(std::move(value_tensor));
 
             int val;
             val = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT);
@@ -125,6 +127,11 @@ class LMDBDatasetOp : public DatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         return errors::Unimplemented(
             "Checkpointing is currently not supported for LMDBDataset.");
diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d86c3a1a63dff8c9b0c4c1ea9bfbced6e3ddbf7e
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
@@ -0,0 +1,829 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#define EIGEN_USE_THREADS
+
+#include <atomic>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/inplace_ops_functor.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// Maximum number of batch results to buffer.
+const int64 kMaxBatchResults = 16;
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  using MapAndBatchIteratorFunction =
+      std::function<void(IteratorContext*, InstantiatedCapturedFunction*,
+                         const string&, std::vector<Tensor>,
+                         std::shared_ptr<std::vector<Tensor>>, StatusCallback)>;
+
+  explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 batch_size;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "batch_size", &batch_size));
+    OP_REQUIRES(
+        ctx, batch_size > 0,
+        errors::InvalidArgument("batch_size must be greater than zero."));
+
+    int64 num_parallel_calls;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
+                                            &num_parallel_calls));
+    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
+                errors::InvalidArgument(
+                    "num_parallel_calls must be greater than zero."));
+
+    bool drop_remainder;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "drop_remainder", &drop_remainder));
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
+                                                 &captured_func));
+
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+
+    MapAndBatchIteratorFunction map_func;
+    CapturedFunction* raw_captured_func = captured_func.get();
+    if (indices.empty()) {
+      map_func = [](IteratorContext* ctx,
+                    InstantiatedCapturedFunction* instantiated_captured_func,
+                    const string& prefix, std::vector<Tensor> args,
+                    std::shared_ptr<std::vector<Tensor>> out_tensors,
+                    StatusCallback done) {
+        instantiated_captured_func->RunAsync(
+            ctx, std::move(args), out_tensors.get(), std::move(done), prefix);
+      };
+    } else {
+      std::vector<bool> can_move = ComputeMoveVector(indices);
+      map_func = [raw_captured_func, indices, can_move](
+                     IteratorContext* ctx,
+                     InstantiatedCapturedFunction* instantiated_captured_func,
+                     const string& prefix, std::vector<Tensor> args,
+                     std::shared_ptr<std::vector<Tensor>> out_tensors,
+                     StatusCallback done) {
+        const std::vector<Tensor>& captured_inputs =
+            raw_captured_func->captured_inputs();
+        size_t num_args = args.size();
+        for (size_t i = 0; i < indices.size(); ++i) {
+          if (indices[i] < num_args) {
+            if (can_move[i]) {
+              out_tensors->push_back(std::move(args[indices[i]]));
+            } else {
+              out_tensors->push_back(args[indices[i]]);
+            }
+          } else {
+            out_tensors->push_back(captured_inputs[indices[i] - num_args]);
+          }
+        }
+        // Run the `done` callback on a threadpool thread, because it will
+        // potentially do a lot of copying work, and we want to run that
+        // concurrently with the next invocation.
+        (*ctx->runner())(std::bind(std::move(done), Status::OK()));
+      };
+    }
+
+    *output = new Dataset(ctx, input, func_, batch_size, num_parallel_calls,
+                          drop_remainder, output_types_, output_shapes_,
+                          std::move(captured_func), &ctx->eigen_cpu_device(),
+                          std::move(map_func), preserve_cardinality_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func, int64 batch_size,
+            int64 num_parallel_calls, bool drop_remainder,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes,
+            std::unique_ptr<CapturedFunction> captured_func,
+            const Eigen::ThreadPoolDevice* device,
+            MapAndBatchIteratorFunction map_func, bool preserve_cardinality)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          func_(func),
+          batch_size_(batch_size),
+          num_parallel_calls_(num_parallel_calls),
+          drop_remainder_(drop_remainder),
+          output_types_(output_types),
+          output_shapes_(output_shapes),
+          captured_func_(std::move(captured_func)),
+          device_(device),
+          map_func_(std::move(map_func)),
+          preserve_cardinality_(preserve_cardinality) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::MapAndBatch")},
+          map_func_);
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "MapAndBatchDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* batch_size_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
+      Node* num_parallel_calls_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(num_parallel_calls_, &num_parallel_calls_node));
+      Node* drop_remainder_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node));
+
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {std::make_pair(0, input_graph_node),
+           std::make_pair(2, batch_size_node),
+           std::make_pair(3, num_parallel_calls_node),
+           std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
+          {std::make_pair(1, other_arguments)},      // Tensor list inputs.
+          {std::make_pair("f", f),
+           std::make_pair("Targuments", other_arguments_types_attr),
+           std::make_pair("preserve_cardinality",
+                          preserve_cardinality_attr)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params,
+                        MapAndBatchIteratorFunction map_func)
+          : DatasetIterator<Dataset>(params),
+            mu_(std::make_shared<mutex>()),
+            cond_var_(std::make_shared<condition_variable>()),
+            num_parallel_calls_(std::make_shared<model::SharedState>(
+                params.dataset->num_parallel_calls_, mu_, cond_var_)),
+            map_func_(std::move(map_func)),
+            max_batch_results_(std::min(kMaxBatchResults,
+                                        (params.dataset->num_parallel_calls_ +
+                                         params.dataset->batch_size_ - 1) /
+                                            params.dataset->batch_size_)) {
+        std::vector<string> components =
+            str_util::Split(params.prefix, "::", str_util::SkipEmpty());
+        prefix_end_ = components.back();
+      }
+
+      ~Iterator() override {
+        mutex_lock l(*mu_);
+        // Cancel the runner thread.
+        cancelled_ = true;
+        cond_var_->notify_all();
+        // Wait for all in-flight calls to complete.
+        while (num_calls_ > 0) {
+          cond_var_->wait(l);
+        }
+      }
+
+      Status Initialize(IteratorContext* ctx) override {
+        mutex_lock l(*mu_);
+        if (num_parallel_calls_->value == kAutoTune) {
+          num_parallel_calls_->value = ctx->runner_threadpool_size();
+          num_parallel_calls_->tunable = true;
+        }
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        std::shared_ptr<BatchResult> result;
+        {
+          mutex_lock l(*mu_);
+          EnsureRunnerThreadStarted(ctx);
+          while (batch_results_.empty() ||
+                 batch_results_.front()->num_calls > 0) {
+            ++waiting_;
+            RecordStop(ctx);
+            cond_var_->wait(l);
+            RecordStart(ctx);
+            --waiting_;
+          }
+          std::swap(result, batch_results_.front());
+          batch_results_.pop_front();
+          cond_var_->notify_all();
+        }
+        return ProcessResult(ctx, result, out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeAsyncKnownRatioNode(
+            std::move(args), dataset()->batch_size_,
+            {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
+                                  /*max=*/ctx->runner_threadpool_size())});
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(*mu_);
+        // Wait for all in-flight calls to complete.
+        while (num_calls_ > 0) {
+          cond_var_->wait(l);
+        }
+        DCHECK_EQ(num_calls_, 0);
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("call_counter"), call_counter_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("batch_results_size"),
+                                               batch_results_.size()));
+        for (size_t i = 0; i < batch_results_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteBatchResult(writer, i));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(*mu_);
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("call_counter"), &call_counter_));
+        int64 batch_results_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("batch_results_size"),
+                                              &batch_results_size));
+        for (int i = 0; i < batch_results_size; ++i) {
+          TF_RETURN_IF_ERROR(ReadBatchResult(ctx, reader, i));
+        }
+        return Status::OK();
+      }
+
+     private:
+      // BatchResult encapsulates the output batch, as well as anciliary
+      // metadata required to execute the fused map-and-batch operation.
+      struct BatchResult {
+        explicit BatchResult(int64 batch_size) {
+          end_of_input = false;
+          num_calls = batch_size;
+          num_elements = 0;
+          output_allocated = false;
+          status = Status::OK();
+          status_offset = -1;
+        }
+
+        // UpdateStatus updates the batch's aggregate Status.
+        //
+        // In order to ensure that exactly the first non-OK status is returned
+        // (required to make the behavior is observably identical to a
+        // sequential execution of map followed by batch), we must also keep
+        // track of the offset into the batch that produced `s`.
+        void UpdateStatus(const Status& s, int64 offset) {
+          if (TF_PREDICT_FALSE(!s.ok())) {
+            mutex_lock l(mu);
+            if (status.ok() || offset < status_offset) {
+              status = s;
+              status_offset = offset;
+            }
+          }
+        }
+
+        mutex mu;
+        bool end_of_input GUARDED_BY(mu);
+        int64 num_elements GUARDED_BY(mu);
+        std::vector<Tensor> output;
+        bool output_allocated GUARDED_BY(mu);
+        Status status GUARDED_BY(mu);
+        int64 status_offset GUARDED_BY(mu);
+        // Counts the number of outstanding calls for this batch.
+        int64 num_calls;  // access guarded by owner's mutex
+      };
+
+      void CallCompleted(const std::shared_ptr<IteratorContext>& ctx,
+                         const std::shared_ptr<BatchResult>& result)
+          LOCKS_EXCLUDED(*mu_) {
+        mutex_lock l(*mu_);
+        num_calls_--;
+        result->num_calls--;
+        const auto& stats_aggregator = ctx->stats_aggregator();
+        if (stats_aggregator) {
+          stats_aggregator->AddScalar(
+              strings::StrCat(prefix_end_, "::active_parallel_calls"),
+              static_cast<float>(num_calls_));
+        }
+        cond_var_->notify_all();
+      }
+
+      void CallFunction(std::shared_ptr<IteratorContext> ctx,
+                        const std::shared_ptr<BatchResult>& result,
+                        int64 offset) LOCKS_EXCLUDED(*mu_) {
+        // Get the next input element.
+        std::vector<Tensor> input_element;
+        bool end_of_input;
+        Status status =
+            input_impl_->GetNext(ctx.get(), &input_element, &end_of_input);
+        bool return_early;
+        {
+          mutex_lock l(result->mu);
+          result->end_of_input = result->end_of_input || end_of_input;
+          result->status.Update(status);
+          return_early = result->end_of_input || !result->status.ok();
+        }
+        if (return_early) {
+          CallCompleted(ctx, result);
+          return;
+        }
+
+        std::shared_ptr<std::vector<Tensor>> return_values =
+            std::make_shared<std::vector<Tensor>>();
+        auto done = [this, ctx, result, return_values, offset](Status status) {
+          if (dataset()->preserve_cardinality_ &&
+              errors::IsOutOfRange(status)) {
+            // To guarantee that the transformation preserves the cardinality of
+            // the dataset, we convert `OutOfRange` to `InvalidArgument` as the
+            // former may be interpreted by a caller as the end of sequence.
+            status = errors::InvalidArgument(
+                "Function invocation produced OutOfRangeError: ",
+                status.error_message());
+          }
+          result->UpdateStatus(status, offset);
+          if (status.ok()) {
+            Status allocate_status =
+                EnsureOutputAllocated(ctx, result, return_values);
+            if (!allocate_status.ok()) {
+              result->UpdateStatus(allocate_status, offset);
+            } else {
+              for (size_t i = 0; i < return_values->size(); ++i) {
+                const Tensor& tensor = return_values->at(i);
+                Tensor* batch = &(result->output)[i];
+                if (tensor.NumElements() !=
+                    (batch->NumElements() / batch->dim_size(0))) {
+                  TensorShape batch_shape = batch->shape();
+                  batch_shape.RemoveDim(0);
+                  result->UpdateStatus(
+                      errors::InvalidArgument(
+                          "Cannot add tensor to the batch: number of elements "
+                          "does not match. Shapes are: [tensor]: ",
+                          tensor.shape().DebugString(),
+                          ", [batch]: ", batch_shape.DebugString()),
+                      offset);
+                  break;
+                }
+                // TODO(mrry): Add a version of DoParallelConcat that allows us
+                // to move `tensor` where possible, to speed up string tensor
+                // batching.
+                Status copy_status =
+                    batch_util::CopyElementToSlice(tensor, batch, offset);
+                if (!copy_status.ok()) {
+                  result->UpdateStatus(copy_status, offset);
+                  break;
+                }
+              }
+            }
+            {
+              mutex_lock l(result->mu);
+              result->num_elements++;
+            }
+          }
+          CallCompleted(ctx, result);
+        };
+
+        // Apply the map function on `input_element`, storing the result in
+        // `return_values`, and invoking `done` when finished.
+        map_func_(ctx.get(), instantiated_captured_func_.get(), prefix(),
+                  std::move(input_element), std::move(return_values),
+                  std::move(done));
+      }
+
+      Status CopyPartialBatch(Tensor* output, const Tensor& value,
+                              int64 num_elements) {
+        switch (value.dtype()) {
+#define HANDLE_TYPE(type)                                         \
+  case DataTypeToEnum<type>::value: {                             \
+    auto output_t = output->flat_outer_dims<type>();              \
+    auto value_t = value.flat_outer_dims<type>();                 \
+    for (size_t i = 0; i < num_elements; i++) {                   \
+      output_t.template chip<0>(i) = value_t.template chip<0>(i); \
+    }                                                             \
+    return Status::OK();                                          \
+  }
+          TF_CALL_DATASET_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+          default:
+            return errors::InvalidArgument("Unsupported data type: ",
+                                           DataTypeString(value.dtype()));
+        }
+        return Status::OK();
+      }
+
+      void EnsureRunnerThreadStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (!runner_thread_) {
+          auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
+          runner_thread_.reset(ctx->env()->StartThread(
+              {}, "tf_data_map_and_batch",
+              std::bind(&Iterator::RunnerThread, this, ctx_copy)));
+        }
+      }
+
+      Status EnsureOutputAllocated(
+          const std::shared_ptr<IteratorContext>& ctx,
+          const std::shared_ptr<BatchResult>& result,
+          const std::shared_ptr<std::vector<Tensor>>& return_values) {
+        mutex_lock l(result->mu);
+        if (result->output_allocated) {
+          return Status::OK();
+        }
+        const size_t num_components = return_values->size();
+        for (size_t i = 0; i < num_components; ++i) {
+          TensorShape component_shape({dataset()->batch_size_});
+          component_shape.AppendShape(return_values->at(i).shape());
+          AllocatorAttributes attr;
+          attr.set_gpu_compatible(true);
+          result->output.emplace_back(ctx->allocator(attr),
+                                      return_values->at(i).dtype(),
+                                      component_shape);
+          if (!result->output.back().IsInitialized()) {
+            return errors::ResourceExhausted(
+                "Failed to allocate memory for the batch of component ", i);
+          }
+        }
+        result->output_allocated = true;
+        return Status::OK();
+      }
+
+      Status ProcessResult(IteratorContext* ctx,
+                           const std::shared_ptr<BatchResult>& result,
+                           std::vector<Tensor>* out_tensors,
+                           bool* end_of_sequence) {
+        mutex_lock l(result->mu);
+        if (result->num_elements == 0) {
+          if (result->status.ok() || errors::IsOutOfRange(result->status)) {
+            *end_of_sequence = true;
+            return Status::OK();
+          } else {
+            *end_of_sequence = false;
+            return result->status;
+          }
+        }
+        if (!result->status.ok() && !errors::IsOutOfRange(result->status)) {
+          // Deallocate tensors allocated for the output.
+          result->output.clear();
+          *end_of_sequence = false;
+          return result->status;
+        }
+        if (result->num_elements < dataset()->batch_size_) {
+          if (dataset()->drop_remainder_) {
+            // Deallocate tensors allocated for the output.
+            result->output.clear();
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          const std::vector<Tensor>& output = result->output;
+          for (size_t i = 0; i < output.size(); ++i) {
+            TensorShape component_shape(result->output[i].shape());
+            component_shape.set_dim(0, result->num_elements);
+            AllocatorAttributes attr;
+            attr.set_gpu_compatible(true);
+            out_tensors->emplace_back(ctx->allocator(attr), output[i].dtype(),
+                                      component_shape);
+            TF_RETURN_IF_ERROR(CopyPartialBatch(&out_tensors->back(), output[i],
+                                                result->num_elements));
+          }
+          // Deallocate tensors allocated for the output.
+          result->output.clear();
+        } else {
+          *out_tensors = std::move(result->output);
+        }
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+      void RunnerThread(const std::shared_ptr<IteratorContext>& ctx)
+          LOCKS_EXCLUDED(*mu_) {
+        std::vector<std::pair<std::shared_ptr<BatchResult>, int64>> new_calls;
+        RecordStart(ctx.get());
+        auto stop_cleanup =
+            gtl::MakeCleanup([this, &ctx]() { RecordStop(ctx.get()); });
+        {
+          tf_shared_lock l(*mu_);  // mu_ == num_parallel_calls_->mu
+          new_calls.reserve(num_parallel_calls_->value);
+        }
+        auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
+          int64 num_parallel_calls = num_parallel_calls_->value;
+          return num_calls_ >= num_parallel_calls ||
+                 (batch_results_.size() > max_batch_results_ ||
+                  (batch_results_.size() == max_batch_results_ &&
+                   call_counter_ % dataset()->batch_size_ == 0));
+        };
+        while (true) {
+          {
+            mutex_lock l(*mu_);
+            while (!cancelled_ && busy()) {
+              if (waiting_ > 0 && num_calls_ < num_parallel_calls_->value &&
+                  max_batch_results_ < kMaxBatchResults) {
+                // If there is a caller waiting for a batch and the number of
+                // outstanding calls is not maxed out, it means we are out of
+                // `batch_results_` slots. Instead of waiting for a slot to open
+                // up, we create a new one to utilize CPU efficiently.
+                max_batch_results_++;
+                continue;
+              }
+              RecordStop(ctx.get());
+              cond_var_->wait(l);
+              RecordStart(ctx.get());
+            }
+
+            if (cancelled_) {
+              return;
+            }
+
+            while (!busy()) {
+              if (call_counter_ % dataset()->batch_size_ == 0) {
+                batch_results_.push_back(
+                    std::make_shared<BatchResult>(dataset()->batch_size_));
+              }
+              int64 offset = call_counter_++ % dataset()->batch_size_;
+              new_calls.emplace_back(batch_results_.back(), offset);
+              num_calls_++;
+            }
+          }
+          const std::shared_ptr<StatsAggregator>& stats_aggregator =
+              ctx->stats_aggregator();
+          if (stats_aggregator) {
+            mutex_lock l(*mu_);
+            // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
+            // monitoring code or as histogram at fixed time intervals.
+            stats_aggregator->AddScalar(
+                strings::StrCat(prefix_end_, "::active_parallel_calls"),
+                static_cast<float>(num_calls_));
+            stats_aggregator->AddScalar(
+                strings::StrCat(prefix_end_, "::num_parallel_calls"),
+                static_cast<float>(num_parallel_calls_->value));
+          }
+          for (const auto& call : new_calls) {
+            CallFunction(ctx, call.first, call.second);
+          }
+          new_calls.clear();
+        }
+      }
+
+      Status ReadBatchResult(IteratorContext* ctx, IteratorStateReader* reader,
+                             size_t index) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        batch_results_.push_back(
+            std::make_shared<BatchResult>(dataset()->batch_size_));
+        std::shared_ptr<BatchResult> result = batch_results_.back();
+        string prefix = strings::StrCat("batch_results_", index);
+        mutex_lock l(result->mu);
+        result->end_of_input = reader->Contains(
+            full_name(strings::StrCat(prefix, "_end_of_input")));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name(strings::StrCat(prefix, "_num_calls")),
+                               &result->num_calls));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_num_elements")),
+            &result->num_elements));
+        result->output_allocated = reader->Contains(
+            full_name(strings::StrCat(prefix, "_output_allocated")));
+        int64 output_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_output_size")), &output_size));
+        result->output.reserve(output_size);
+        for (int i = 0; i < output_size; i++) {
+          Tensor t;
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(prefix, "_output_", i)), &t));
+          // If the batch was not full, we may have stored only the relevant
+          // slice. Since tensors in `BatchResult.output` are expected to
+          // have the leading dimension of size batch_size, we build a larger
+          // tensor and copy the slice read from the checkpoint into it.
+          if (t.dim_size(0) < dataset()->batch_size_) {
+            TensorShape component_shape(t.shape());
+            component_shape.set_dim(0, dataset()->batch_size_);
+            AllocatorAttributes attr;
+            attr.set_gpu_compatible(true);
+            Tensor new_t(ctx->allocator(attr), t.dtype(), component_shape);
+            TF_RETURN_IF_ERROR(CopyPartialBatch(&new_t, t, t.dim_size(0)));
+            result->output.emplace_back(std::move(new_t));
+          } else {
+            result->output.emplace_back(std::move(t));
+          }
+        }
+        TF_RETURN_IF_ERROR(ReadStatus(
+            reader, strings::StrCat(prefix, "_status"), &result->status));
+        return Status::OK();
+      }
+
+      Status ReadStatus(IteratorStateReader* reader, const string& prefix,
+                        Status* status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_code")), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_msg")), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
+
+      Status WriteBatchResult(IteratorStateWriter* writer, size_t index)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        std::shared_ptr<BatchResult> result = batch_results_[index];
+        string prefix = strings::StrCat("batch_results_", index);
+        mutex_lock l(result->mu);
+        if (result->end_of_input) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_input")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_num_calls")),
+            result->num_calls));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_num_elements")),
+            result->num_elements));
+        if (result->output_allocated) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_output_allocated")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_output_size")),
+            result->output.size()));
+        for (int i = 0; i < result->output.size(); i++) {
+          // If the batch is not full, we only store the first `num_elements`
+          // values. The rest of the batch tensor is *uninitialized* and
+          // accessing that will raise msan errors.
+          if (result->num_elements < dataset()->batch_size_) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result->output[i].Slice(0, result->num_elements)));
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(prefix, "_output_", i)),
+                result->output[i]));
+          }
+        }
+        TF_RETURN_IF_ERROR(WriteStatus(
+            writer, strings::StrCat(prefix, "_status"), result->status));
+        return Status::OK();
+      }
+
+      Status WriteStatus(IteratorStateWriter* writer, const string& prefix,
+                         const Status& status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      // Used for coordination between the main thread, the runner thread, and
+      // the callback threads.
+      const std::shared_ptr<mutex> mu_;
+      // Used for coordination between the main thread, the runner thread, and
+      // the callback threads. In particular, the runner thread should only
+      // schedule new calls when the number of in-flight calls is less than
+      // `num_parallel_calls_->value` and there are slots available in the
+      // `batch_results_` buffer.
+      const std::shared_ptr<condition_variable> cond_var_;
+      // Identifies the maximum number of parallel calls.
+      const std::shared_ptr<model::SharedState> num_parallel_calls_;
+      const MapAndBatchIteratorFunction map_func_;
+
+      // Counts the number of outstanding calls for this batch.
+      int64 num_calls_ GUARDED_BY(*mu_) = 0;
+      // Counts the total number of calls.
+      int64 call_counter_ GUARDED_BY(*mu_) = 0;
+      std::unique_ptr<IteratorBase> input_impl_;
+      // Buffer for storing the (intermediate) batch results.
+      std::deque<std::shared_ptr<BatchResult>> batch_results_ GUARDED_BY(*mu_);
+      // Background thread used for coordinating input processing.
+      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      // Determines whether the transformation has been cancelled.
+      bool cancelled_ GUARDED_BY(*mu_) = false;
+      // Identifies the number of callers currently waiting for a batch result.
+      int64 waiting_ GUARDED_BY(*mu_) = 0;
+      // Identifies the maximum number of batch results to store.
+      int64 max_batch_results_ GUARDED_BY(*mu_);
+      string prefix_end_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
+    };
+
+    const DatasetBase* const input_;
+    const NameAttrList func_;
+    const int64 batch_size_;
+    const int64 num_parallel_calls_;
+    const bool drop_remainder_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const Eigen::ThreadPoolDevice* device_;  // not owned
+    const MapAndBatchIteratorFunction map_func_;
+    const bool preserve_cardinality_;
+  };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList func_;
+  bool preserve_cardinality_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalMapAndBatchDataset").Device(DEVICE_CPU),
+    MapAndBatchDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aa27a13416d093dd19475b97b51ac28489d4d177
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
@@ -0,0 +1,375 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <queue>
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/io/random_inputstream.h"
+#include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/io/zlib_compression_options.h"
+#include "tensorflow/core/lib/io/zlib_inputstream.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+class MatchingFilesDatasetOp : public DatasetOpKernel {
+ public:
+  using DatasetOpKernel::DatasetOpKernel;
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    const Tensor* patterns_t;
+    OP_REQUIRES_OK(ctx, ctx->input("patterns", &patterns_t));
+    const auto patterns = patterns_t->flat<string>();
+    size_t num_patterns = static_cast<size_t>(patterns.size());
+    std::vector<string> pattern_strs;
+    pattern_strs.reserve(num_patterns);
+
+    for (size_t i = 0; i < num_patterns; i++) {
+      pattern_strs.push_back(patterns(i));
+    }
+
+    *output = new Dataset(ctx, std::move(pattern_strs));
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, std::vector<string> patterns)
+        : DatasetBase(DatasetContext(ctx)), patterns_(std::move(patterns)) {}
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::MatchingFiles")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() const override {
+      return "MatchingFilesDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* patterns_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddVector(patterns_, &patterns_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {patterns_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        FileSystem* fs;
+
+        TF_RETURN_IF_ERROR(ctx->env()->GetFileSystemForFile(
+            dataset()->patterns_[(current_pattern_index_ > 0)
+                                     ? current_pattern_index_ - 1
+                                     : 0],
+            &fs));
+
+        while (!filepath_queue_.empty() ||
+               current_pattern_index_ < dataset()->patterns_.size()) {
+          // All the elements in the heap will be the matched filenames or the
+          // potential directories.
+          if (!filepath_queue_.empty()) {
+            PathStatus current_path = filepath_queue_.top();
+            filepath_queue_.pop();
+
+            if (!current_path.second) {
+              Tensor filepath_tensor(ctx->allocator({}), DT_STRING, {});
+
+              // Replace the forward slash with the backslash for Windows path
+              if (isWindows_) {
+                std::replace(current_path.first.begin(),
+                             current_path.first.end(), '/', '\\');
+              }
+
+              filepath_tensor.scalar<string>()() =
+                  std::move(current_path.first);
+              out_tensors->emplace_back(std::move(filepath_tensor));
+              *end_of_sequence = false;
+              hasMatch_ = true;
+              return Status::OK();
+            }
+
+            // In this case, current_path is a directory. Then continue the
+            // search.
+            TF_RETURN_IF_ERROR(
+                UpdateIterator(ctx, fs, current_path.first, current_pattern_));
+          } else {
+            // search a new pattern
+            current_pattern_ = dataset()->patterns_[current_pattern_index_];
+
+            // Windows paths contain backslashes and Windows APIs accept forward
+            // and backslashes equivalently, so we convert the pattern to use
+            // forward slashes exclusively. The backslash is used as the
+            // indicator of Windows paths. Note that this is not ideal, since
+            // the API expects backslash as an escape character, but no code
+            // appears to rely on this behavior
+            if (current_pattern_.find('\\') != std::string::npos) {
+              isWindows_ = true;
+              std::replace(current_pattern_.begin(), current_pattern_.end(),
+                           '\\', '/');
+            } else {
+              isWindows_ = false;
+            }
+
+            StringPiece fixed_prefix =
+                StringPiece(current_pattern_)
+                    .substr(0, current_pattern_.find_first_of("*?[\\"));
+            string current_dir(io::Dirname(fixed_prefix));
+
+            // If current_dir is empty then we need to fix up fixed_prefix and
+            // current_pattern_ to include . as the top level directory.
+            if (current_dir.empty()) {
+              current_dir = ".";
+              current_pattern_ = io::JoinPath(current_dir, current_pattern_);
+            }
+
+            TF_RETURN_IF_ERROR(
+                UpdateIterator(ctx, fs, current_dir, current_pattern_));
+            ++current_pattern_index_;
+          }
+        }
+
+        *end_of_sequence = true;
+        if (hasMatch_) {
+          return Status::OK();
+        } else {
+          return errors::NotFound("Don't find any matched files");
+        }
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("current_pattern_index"), current_pattern_index_));
+
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_pattern"),
+                                               current_pattern_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("hasMatch"), hasMatch_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("isWindows"), isWindows_));
+
+        if (!filepath_queue_.empty()) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("queue_size"),
+                                                 filepath_queue_.size()));
+          int i = 0;
+          while (!filepath_queue_.empty()) {
+            TF_RETURN_IF_ERROR(
+                writer->WriteScalar(full_name(strings::StrCat("path_", i)),
+                                    filepath_queue_.top().first));
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("path_status_", i)),
+                filepath_queue_.top().second));
+            filepath_queue_.pop();
+            i++;
+          }
+        }
+
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        int64 current_pattern_index;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name("current_pattern_index"), &current_pattern_index));
+        current_pattern_index_ = size_t(current_pattern_index);
+
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_pattern"),
+                                              &current_pattern_));
+        int64 hasMatch;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("hasMatch"), &hasMatch));
+        hasMatch_ = static_cast<bool>(hasMatch);
+
+        int64 isWindows;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("isWindows"), &isWindows));
+        isWindows_ = static_cast<bool>(isWindows);
+
+        if (reader->Contains(full_name("queue_size"))) {
+          int64 queue_size;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("queue_size"), &queue_size));
+          for (int i = 0; i < queue_size; i++) {
+            string path;
+            int64 path_status;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("path_", i)), &path));
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("path_status_", i)), &path_status));
+            filepath_queue_.push(
+                PathStatus(path, static_cast<bool>(path_status)));
+          }
+        }
+
+        return Status::OK();
+      }
+
+     private:
+      Status UpdateIterator(IteratorContext* ctx, FileSystem* fs,
+                            const string& dir, const string& eval_pattern)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        StringPiece fixed_prefix =
+            StringPiece(eval_pattern)
+                .substr(0, eval_pattern.find_first_of("*?[\\"));
+
+        filepath_queue_.push(PathStatus(dir, true));
+        Status ret;  // Status to return
+
+        // DFS to find the first element in the iterator.
+        while (!filepath_queue_.empty()) {
+          const PathStatus current_path = filepath_queue_.top();
+
+          // All the files in the heap are matched with the pattern, so finish
+          // the search if current_path is a file.
+          if (!current_path.second) {
+            return Status::OK();
+          }
+
+          filepath_queue_.pop();
+
+          // If current_path is a directory, search its children.
+          const string& current_dir = current_path.first;
+          std::vector<string> children;
+          ret.Update(fs->GetChildren(current_dir, &children));
+
+          // Handle the error cases: 1) continue the search if the status is
+          // NOT_FOUND; 2) return the non-ok status immediately if it is not
+          // NOT_FOUND.
+          if (ret.code() == error::NOT_FOUND) {
+            continue;
+          } else if (!ret.ok()) {
+            return ret;
+          }
+
+          // children_dir_status holds is_dir status for children. It can have
+          // three possible values: OK for true; FAILED_PRECONDITION for false;
+          // CANCELLED if we don't calculate IsDirectory (we might do that
+          // because there isn't any point in exploring that child path).
+          std::vector<Status> children_dir_status;
+          children_dir_status.resize(children.size());
+
+          // This IsDirectory call can be expensive for some FS. Parallelizing
+          // it.
+          auto is_directory_fn = [fs, current_dir, &children, &fixed_prefix,
+                                  &children_dir_status](int i) {
+            const string child_path = io::JoinPath(current_dir, children[i]);
+            // In case the child_path doesn't start with the fixed_prefix, then
+            // we don't need to explore this path.
+            if (!str_util::StartsWith(child_path, fixed_prefix)) {
+              children_dir_status[i] =
+                  errors::Cancelled("Operation not needed");
+            } else {
+              children_dir_status[i] = fs->IsDirectory(child_path);
+            }
+          };
+
+          BlockingCounter counter(children.size());
+          for (int i = 0; i < children.size(); i++) {
+            (*ctx->runner())([&is_directory_fn, &counter, i] {
+              is_directory_fn(i);
+              counter.DecrementCount();
+            });
+          }
+          counter.Wait();
+
+          for (int i = 0; i < children.size(); i++) {
+            const string& child_dir_path =
+                io::JoinPath(current_dir, children[i]);
+            const Status& child_dir_status = children_dir_status[i];
+
+            // If the IsDirectory call was cancelled we bail.
+            if (child_dir_status.code() == tensorflow::error::CANCELLED) {
+              continue;
+            }
+
+            if (child_dir_status.ok()) {
+              // push the child dir for next search
+              filepath_queue_.push(PathStatus(child_dir_path, true));
+            } else {
+              // This case will be a file: if the file matches the pattern, push
+              // it to the heap; otherwise, ignore it.
+              if (ctx->env()->MatchPath(child_dir_path, eval_pattern)) {
+                filepath_queue_.push(PathStatus(child_dir_path, false));
+              }
+            }
+          }
+        }
+        return ret;
+      }
+
+      mutex mu_;
+      // True means the path is a directory; False means the path is a filename.
+      typedef std::pair<string, bool> PathStatus;
+      std::priority_queue<PathStatus, std::vector<PathStatus>,
+                          std::greater<PathStatus>>
+          filepath_queue_ GUARDED_BY(mu_);
+      size_t current_pattern_index_ GUARDED_BY(mu_) = 0;
+      string current_pattern_ GUARDED_BY(mu_);
+      bool hasMatch_ GUARDED_BY(mu_) = false;
+      bool isWindows_ GUARDED_BY(mu_) = false;
+    };
+
+    const std::vector<string> patterns_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalMatchingFilesDataset").Device(DEVICE_CPU),
+    MatchingFilesDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61811ea14eddc9f40987e12ce6343268da24a503
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
@@ -0,0 +1,132 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <map>
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+class NonSerializableDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit NonSerializableDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    *output = new Dataset(ctx, input, output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::NonSerializable")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "NonSerializableDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented(DebugString(), "::AsGraphDefInternal");
+    }
+
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        return input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args), /*ratio=*/1);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* input_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalNonSerializableDataset").Device(DEVICE_CPU),
+    NonSerializableDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..46233942f066de8fe799a958f164f8afa30e49ef
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
@@ -0,0 +1,1155 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#define EIGEN_USE_THREADS
+
+#include <atomic>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/inplace_ops_functor.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/numa.h"
+#include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// kWindowSize is the fixed constant controlling the number of batch outputs
+// each NumaWorkerBlock may be processing at a time. This is currently a
+// constant and not user configurable to enable future performance optimizations
+// in the implementation.
+const int64 kWindowSize = 10;
+
+// Define a helper for more consistent logging.
+#define WORKER_VLOG(verbose_level)                                           \
+  VLOG(verbose_level) << "WorkerThread (" << numa_node << ", " << thread_num \
+                      << "): "
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit NumaMapAndBatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    // TODO(saeta): Implement support for preserve_cardinality logic.
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 batch_size;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "batch_size", &batch_size));
+    OP_REQUIRES(
+        ctx, batch_size > 0,
+        errors::InvalidArgument("batch_size must be greater than zero."));
+
+    int64 num_parallel_calls;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
+                                            &num_parallel_calls));
+    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
+                errors::InvalidArgument(
+                    "num_parallel_calls must be greater than zero."));
+
+    bool drop_remainder;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "drop_remainder", &drop_remainder));
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
+                                      /* use_inter_op_parallelism = */ false,
+                                      &captured_func));
+
+    *output = new Dataset(ctx, input, batch_size, num_parallel_calls,
+                          drop_remainder, output_types_, output_shapes_, func_,
+                          std::move(captured_func));
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
+            int64 num_parallel_calls, bool drop_remainder,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          batch_size_(batch_size),
+          num_parallel_calls_(num_parallel_calls),
+          drop_remainder_(drop_remainder),
+          output_types_(output_types),
+          output_shapes_(output_shapes),
+          func_(func),
+          captured_func_(std::move(captured_func)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::NumaMapAndBatch")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "NumaMapAndBatchDatasetOp::Dataset";
+    }
+
+    // TODO(b/120482302): Note that this is inaccurate until
+    // NumaMapAndBatchMapDataset modified to preserve cardinality.
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* batch_size_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
+      Node* num_parallel_calls_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(num_parallel_calls_, &num_parallel_calls_node));
+      Node* drop_remainder_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node));
+
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {std::make_pair(0, input_graph_node),
+           std::make_pair(2, batch_size_node),
+           std::make_pair(3, num_parallel_calls_node),
+           std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
+          {std::make_pair(1, other_arguments)},      // Tensor list inputs.
+          {std::make_pair("f", f),
+           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            mu_(std::make_shared<mutex>()),
+            autotune_cond_var_(std::make_shared<condition_variable>()),
+            num_parallel_calls_(std::make_shared<model::SharedState>(
+                params.dataset->num_parallel_calls_, mu_, autotune_cond_var_)) {
+      }
+
+      ~Iterator() override {
+        mutex_lock l(*mu_);
+        cancelled_ = true;
+        VLOG(3) << "NumaMapAndBatchIterator::~Iterator: cancelling operations.";
+        for (size_t i = 0; i < workers_.size(); ++i) {
+          workers_[i]->manager.Cancel();
+        }
+        VLOG(3) << "NumaMapAndBatchIterator::~Iterator: waiting for threads to "
+                   "shut down.";
+      }
+
+      Status Initialize(IteratorContext* ctx) override {
+        mutex_lock l(*mu_);
+        if (num_parallel_calls_->value == kAutoTune) {
+          num_parallel_calls_->value = ctx->runner_threadpool_size();
+          num_parallel_calls_->tunable = true;
+        }
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        TF_RETURN_IF_ERROR(dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_));
+        return Status::OK();
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        auto cleanup = gtl::MakeCleanup(
+            [] { VLOG(3) << "GetNextInternal call returning."; });
+        NumaWorkerBlock* worker = nullptr;
+        {
+          mutex_lock l(*mu_);
+          VLOG(3) << "GetNextInternal call; current block: " << cur_block_;
+          if (global_end_of_input_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          TF_RETURN_IF_ERROR(EnsureBackgroundThreadsStarted(ctx));
+          worker = workers_[cur_block_].get();
+          cur_block_ = (cur_block_ + 1) % workers_.size();
+        }
+        bool global_end_of_input_local = false;
+        Status s = worker->manager.GetBatch(ctx, dataset()->drop_remainder_,
+                                            &global_end_of_input_local,
+                                            out_tensors, end_of_sequence);
+        if (global_end_of_input_local) {
+          mutex_lock l(*mu_);
+          global_end_of_input_ = global_end_of_input_local;
+        }
+        return s;
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeAsyncKnownRatioNode(
+            std::move(args), dataset()->batch_size_,
+            {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
+                                  /*max=*/ctx->runner_threadpool_size())});
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(*mu_);
+        for (size_t i = 0; i < workers_.size(); ++i) {
+          if (!workers_[i]->manager.Quiesce()) {
+            return errors::Cancelled(
+                "The iterator was deleted before it could reach a "
+                "checkpointable state.");
+          }
+        }
+
+        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("num_workers"), workers_.size()));
+
+        for (size_t i = 0; i < workers_.size(); ++i) {
+          size_t index = (cur_block_ + i) % workers_.size();
+          TF_RETURN_IF_ERROR(workers_[index]->manager.Save(writer, this, i));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(*mu_);
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        int64 num_workers = -1;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("num_workers"), &num_workers));
+        // Note: num_workers can be 0 if the iterator wasn't started when
+        // first checkpointed.
+        if (num_workers < 0) {
+          return errors::DataLoss(
+              "When restoring from checkpoint, we encountered a data "
+              "consistency error: num_workers has an invalid value: ",
+              num_workers);
+        }
+        if (port::NUMAEnabled()) {
+          int actual_numa_domains = port::NUMANumNodes();
+          if (actual_numa_domains != num_workers && num_workers > 0) {
+            LOG(WARNING) << "# NUMA domains mismatch when restoring from "
+                            "checkpoint: checkpoint has "
+                         << num_workers
+                         << " NUMA domains, while this host has: "
+                         << actual_numa_domains << " NUMA domains.";
+          }
+        }
+        if (num_workers > 1 && !port::NUMAEnabled()) {
+          LOG(WARNING) << "NUMA is not enabled for this process, but restoring "
+                          "a checkpoint that assumes "
+                       << num_workers << " NUMA domains.";
+        }
+        workers_.resize(num_workers);
+        for (size_t i = 0; i < num_workers; ++i) {
+          workers_[i] = MakeUnique<NumaWorkerBlock>(this);
+          TF_RETURN_IF_ERROR(
+              workers_[i]->manager.Restore(ctx, reader, this, i));
+        }
+        cur_block_ = 0;
+        return Status::OK();
+      }
+
+     private:
+      // NumaBlockManager manages all the state for a set of threads pinned to a
+      // single NUMA domain.
+      //
+      // The methods can be divided into 3 categories based on who should call
+      // them:
+      //
+      //  (1) RunnerThread: WaitForInputSpace, PushInputs, SetEndOfInput.
+      //  (2) WorkerThread: RetrieveInput, GetBatchTensors.
+      //      RecordBatchEntryComplete
+      //  (3) Client threads: GetBatch, Cancel, Save, Restore.
+      //
+      // Internally, we manage state in a circular buffer of size `kWindowSize`.
+      // There are 3 pointers into the circular buffer, and must maintain the
+      // following order: (1) next_input_batch_ (corresponding to the next input
+      // batch to be pulled from the input iterator), (2) next_input_
+      // (corresponding to the batch the WorkerThreads should pull from for
+      // their next inputs), and (3) next_output_ corresponding to the next
+      // value to be consumed by the output iterator.
+      //
+      // Methods return errors::Cancelled if the iteration is cancelled before
+      // completing.
+      //
+      // NumaBlockManager is thread safe.
+      class NumaBlockManager {
+       public:
+        explicit NumaBlockManager(Iterator* itr) : itr_(itr) {}
+
+        // WaitForInputSpace blocks until there is space in the circular buffer
+        // to begin processing a new batch of elements.
+        //
+        // Returns true when there is space, false if the Iterator is cancelled.
+        bool WaitForInputSpace(IteratorContext* ctx) {
+          mutex_lock l(mu_);
+
+          size_t next = (next_input_batch_ + 1) % kWindowSize;
+          DCHECK(next < kWindowSize) << next;
+
+          // Wait for space in the circular buffer.
+          while (!cancelled_ && batches_[next].state != BatchState::kEmpty) {
+            VLOG(3) << "Waiting for input space; next: " << next
+                    << ", next_output_: " << next_output_
+                    << ", next_input_batch_: " << next_input_batch_;
+            itr_->RecordStop(ctx);
+            runner_cond_var_.wait(l);
+            itr_->RecordStart(ctx);
+          }
+          if (cancelled_) {
+            VLOG(3) << "WaitForInputSpace cancelled.";
+            return false;
+          }
+
+          DCHECK(batches_[next].state == BatchState::kEmpty);
+
+          next_input_batch_ = next;
+          return true;
+        }
+
+        // PushInputs sets the inputs for the next batch as retrieved from the
+        // input iterator.
+        void PushInputs(const Status& status,
+                        std::vector<std::vector<Tensor>> inputs) {
+          mutex_lock l(mu_);
+
+          DCHECK(next_input_ < kWindowSize) << next_input_;
+          DCHECK(batches_[next_input_batch_].state == BatchState::kEmpty);
+          DCHECK(batches_[next_input_batch_].next_input_to_process == 0)
+              << batches_[next_input_batch_].next_input_to_process;
+          DCHECK(batches_[next_input_batch_].status.ok())
+              << batches_[next_input_batch_].status;
+
+          batches_[next_input_batch_].inputs.swap(inputs);
+          batches_[next_input_batch_].state = BatchState::kInputsFilled;
+          batches_[next_input_batch_].status.Update(status);
+          if (batches_[next_input_batch_].status.ok()) {
+            worker_cond_var_.notify_all();
+          } else {
+            client_cond_var_.notify_all();
+            batches_[next_input_batch_].error_index = 0;
+          }
+        }
+
+        // SetEndOfInput records the fact that we have reached the end of the
+        // input iterator, and that we should return end_of_sequence = true when
+        // we have exhaused all buffered batches.
+        void SetEndOfInput() {
+          mutex_lock l(mu_);
+          reached_eof_ = true;
+          worker_cond_var_.notify_all();
+          client_cond_var_.notify_all();
+        }
+
+        // RetrieveInput gets the next input tuple to be mapped by a worker
+        // thread.
+        //
+        // Returns true if an input was retrieved, false if the iterator has
+        // been cancelled.
+        bool RetrieveInput(IteratorContext* ctx, std::vector<Tensor>* input,
+                           uint64* index, size_t* sequence_number) {
+          mutex_lock l(mu_);
+
+          // Wait for inputs to be ready.
+          while (!cancelled_ &&
+                 batches_[next_input_].state != BatchState::kInputsFilled) {
+            itr_->RecordStop(ctx);
+            worker_cond_var_.wait(l);
+            itr_->RecordStart(ctx);
+          }
+
+          if (cancelled_) {
+            return false;
+          }
+
+          DCHECK(batches_[next_input_].next_input_to_process <
+                 batches_[next_input_].inputs.size())
+              << "next_input_: " << next_input_ << ", next_input_to_process: "
+              << batches_[next_input_].next_input_to_process
+              << ", inputs.size(): " << batches_[next_input_].inputs.size()
+              << ", state: " << static_cast<int32>(batches_[next_input_].state)
+              << ", this: " << this;
+          *index = batches_[next_input_].next_input_to_process;
+          *sequence_number = next_input_;
+          input->swap(batches_[next_input_]
+                          .inputs[batches_[next_input_].next_input_to_process]);
+          // Increment pointers.
+          batches_[next_input_].next_input_to_process++;
+
+          if (batches_[next_input_].next_input_to_process ==
+              batches_[next_input_].inputs.size()) {
+            batches_[next_input_].state = BatchState::kAllMapsStarted;
+            next_input_ = (next_input_ + 1) % kWindowSize;
+          }
+          return true;
+        }
+
+        // GetBatchTensors returns a pointer to the output batch tensors for the
+        // worker thread to copy into.
+        //
+        // allocate_output is a function taking a batch size, and a pointer to
+        // the output tuple of Tensors to allocate them. The allocate_output
+        // function is called at most once per output batch.
+        std::vector<Tensor>* GetBatchTensors(
+            size_t sequence_number,
+            std::function<void(size_t, std::vector<Tensor>*)> allocate_output) {
+          mutex_lock l(mu_);
+          DCHECK(sequence_number < kWindowSize) << sequence_number;
+          DCHECK(batches_[sequence_number].state == BatchState::kInputsFilled ||
+                 batches_[sequence_number].state == BatchState::kAllMapsStarted)
+              << sequence_number;
+
+          if (batches_[sequence_number].outputs.empty()) {
+            allocate_output(batches_[sequence_number].inputs.size(),
+                            &batches_[sequence_number].outputs);
+          }
+          return &batches_[sequence_number].outputs;
+        }
+
+        // RecordBatchEntryComplete records an element of the batch has finished
+        // copying into the output tensors.
+        void RecordBatchEntryComplete(size_t sequence_number, uint64 index,
+                                      Status s) {
+          mutex_lock l(mu_);
+          DCHECK(sequence_number < kWindowSize) << sequence_number;
+          DCHECK(batches_[sequence_number].state == BatchState::kInputsFilled ||
+                 batches_[sequence_number].state == BatchState::kAllMapsStarted)
+              << sequence_number;
+
+          batches_[sequence_number].num_outputs_complete++;
+          if (!s.ok() && batches_[sequence_number].error_index > index) {
+            batches_[sequence_number].status = s;
+            batches_[sequence_number].error_index = index;
+          }
+
+          if (batches_[sequence_number].num_outputs_complete ==
+              batches_[sequence_number].inputs.size()) {
+            DCHECK(batches_[sequence_number].state ==
+                   BatchState::kAllMapsStarted);
+            batches_[sequence_number].state = BatchState::kOutputsComplete;
+            batches_[sequence_number].inputs.clear();  // Eagerly save memory.
+            batches_[sequence_number].inputs.shrink_to_fit();
+            client_cond_var_.notify_all();
+          }
+        }
+
+        // GetBatch retrieves the next output batch tensors.
+        Status GetBatch(IteratorContext* ctx, bool drop_remainder,
+                        bool* global_eof, std::vector<Tensor>* out_tensor,
+                        bool* end_of_sequence) {
+          mutex_lock l(mu_);
+          // Wait until one of 3 conditions occurs:
+          //  (1) we're cancelled.
+          //  (2) the state becomes kOutputsComplete
+          //  (3) state is empty && reached_eof.
+          while (!cancelled_ &&
+                 batches_[next_output_].state != BatchState::kOutputsComplete &&
+                 !(reached_eof_ &&
+                   batches_[next_output_].state == BatchState::kEmpty)) {
+            VLOG(3) << "Waiting in GetBatch.";
+            itr_->RecordStop(ctx);
+            client_cond_var_.wait(l);
+            itr_->RecordStart(ctx);
+          }
+
+          if (cancelled_) {
+            return errors::Cancelled(
+                "Cancelled in NumaMapAndBatch::GetNext call.");
+          }
+
+          if (reached_eof_ &&
+              batches_[next_output_].state == BatchState::kEmpty) {
+            VLOG(4) << "GetBatch returning end of sequence.";
+            *end_of_sequence = true;
+            *global_eof = true;
+            return Status::OK();
+          }
+
+          VLOG(3) << "Returning output index: " << next_output_
+                  << ", this: " << this;
+
+          *end_of_sequence = false;
+          Status s = batches_[next_output_].status;
+          if (s.ok()) {
+            out_tensor->swap(batches_[next_output_].outputs);
+          }
+          // Handle early termination.
+          if (errors::IsOutOfRange(s)) {
+            *global_eof = true;
+            s = Status::OK();
+            if (drop_remainder || batches_[next_output_].error_index == 0) {
+              *end_of_sequence = true;
+            } else {
+              std::vector<Tensor> true_outputs;
+              for (size_t i = 0; i < batches_[next_output_].outputs.size();
+                   ++i) {
+                TensorShape component_shape(
+                    batches_[next_output_].outputs[i].shape());
+                component_shape.set_dim(0, batches_[next_output_].error_index);
+                AllocatorAttributes attr;
+                attr.set_gpu_compatible(true);
+                true_outputs.emplace_back(
+                    ctx->allocator(attr),
+                    batches_[next_output_].outputs[i].dtype(), component_shape);
+                TF_RETURN_IF_ERROR(CopyPartialBatch(
+                    &true_outputs.back(), batches_[next_output_].outputs[i],
+                    batches_[next_output_].error_index));
+              }
+              out_tensor->swap(true_outputs);
+            }
+          }
+
+          batches_[next_output_].Reset();
+          next_output_ = (next_output_ + 1) % kWindowSize;
+          runner_cond_var_.notify_all();
+
+          return s;
+        }
+
+        void Cancel() {
+          mutex_lock l(mu_);
+          VLOG(3) << "Cancelling NUMA block.";
+          cancelled_ = true;
+          runner_cond_var_.notify_all();
+          worker_cond_var_.notify_all();
+          client_cond_var_.notify_all();
+        }
+
+        // Waits until all the worker threads have completed their work and all
+        // internal state has reached a "safe-point" where we can safely
+        // checkpoint.
+        //
+        // Returns true if completed successfully, false if cancelled while
+        // waiting.
+        bool Quiesce() {
+          mutex_lock l(mu_);
+          VLOG(3) << "Waiting until the operations have quiesced.";
+          while (!cancelled_ && !AllMapOperationsFinished()) {
+            client_cond_var_.wait(l);
+          }
+          if (cancelled_) {
+            return false;
+          }
+          return true;
+        }
+
+        Status Save(IteratorStateWriter* writer, Iterator* itr, size_t index) {
+          mutex_lock l(mu_);
+          string prefix = itr->full_name(strings::StrCat("numa_block_", index));
+          if (reached_eof_) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                strings::StrCat(prefix, "_end_of_input"), ""));
+          }
+          for (size_t i = 0; i < kWindowSize; ++i) {
+            size_t index = (next_output_ + i) % kWindowSize;
+            if (batches_[index].state == BatchState::kEmpty) {
+              break;
+            }
+            string batch_prefix = strings::StrCat(prefix, "_batch_", i);
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                strings::StrCat(batch_prefix, "_code"),
+                static_cast<int64>(batches_[index].status.code())));
+            if (!batches_[index].status.ok()) {
+              TF_RETURN_IF_ERROR(
+                  writer->WriteScalar(strings::StrCat(batch_prefix, "_msg"),
+                                      batches_[index].status.error_message()));
+              TF_RETURN_IF_ERROR(writer->WriteScalar(
+                  strings::StrCat(batch_prefix, "_error_index"),
+                  batches_[index].error_index));
+            }
+
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                strings::StrCat(batch_prefix, "_output_size"),
+                batches_[index].outputs.size()));
+            for (size_t j = 0; j < batches_[index].outputs.size(); ++j) {
+              string tensor_prefix =
+                  strings::StrCat(batch_prefix, "_output_", j);
+              if (!batches_[index].status.ok()) {
+                DCHECK(batches_[index].error_index >= 0 &&
+                       batches_[index].error_index <
+                           itr_->dataset()->batch_size_);
+                // If the batch is not full, we only store the first
+                // `error_index` values. The rest of the batch tensor might not
+                // be initialized, and accessing that will raise msan errors.
+                TF_RETURN_IF_ERROR(writer->WriteTensor(
+                    tensor_prefix, batches_[index].outputs[j].Slice(
+                                       0, batches_[index].error_index)));
+              } else {
+                TF_RETURN_IF_ERROR(writer->WriteTensor(
+                    tensor_prefix, batches_[index].outputs[j]));
+              }
+            }
+          }
+          return Status::OK();
+        }
+
+        Status Restore(IteratorContext* ctx, IteratorStateReader* reader,
+                       Iterator* itr, size_t index) {
+          mutex_lock l(mu_);
+          if (reached_eof_) {
+            return errors::FailedPrecondition(
+                "Already reached the end of the sequence.");
+          }
+          string prefix = itr->full_name(strings::StrCat("numa_block_", index));
+          reached_eof_ =
+              reader->Contains(strings::StrCat(prefix, "_end_of_input"));
+          for (size_t i = 0; i < kWindowSize; ++i) {
+            string batch_prefix = strings::StrCat(prefix, "_batch_", i);
+            if (!reader->Contains(strings::StrCat(batch_prefix, "_code"))) {
+              break;
+            }
+            Batch batch;
+            batch.state = BatchState::kOutputsComplete;
+            int64 code_int;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                strings::StrCat(batch_prefix, "_code"), &code_int));
+            error::Code code = static_cast<error::Code>(code_int);
+            if (code != error::Code::OK) {
+              string error_message;
+              TF_RETURN_IF_ERROR(reader->ReadScalar(
+                  strings::StrCat(batch_prefix, "_msg"), &error_message));
+              batch.status = Status(code, error_message);
+              int64 error_index_int = -1;
+              TF_RETURN_IF_ERROR(reader->ReadScalar(
+                  strings::StrCat(batch_prefix, "_error_index"),
+                  &error_index_int));
+              if (error_index_int < 0 ||
+                  error_index_int > itr->dataset()->batch_size_) {
+                return errors::FailedPrecondition(
+                    "Error index out of bounds when restoring from checkpoint; "
+                    "error index: ",
+                    error_index_int);
+              }
+              batch.error_index = static_cast<size_t>(error_index_int);
+            }
+            int64 output_size = -1;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                strings::StrCat(batch_prefix, "_output_size"), &output_size));
+            batch.outputs.reserve(output_size);
+            for (size_t j = 0; j < output_size; ++j) {
+              string tensor_name = strings::StrCat(batch_prefix, "_output_", j);
+              Tensor t;
+              TF_RETURN_IF_ERROR(reader->ReadTensor(tensor_name, &t));
+              batch.outputs.emplace_back(std::move(t));
+            }
+            batches_[i] = std::move(batch);
+          }
+          return Status::OK();
+        }
+
+       private:
+        bool AllMapOperationsFinished() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+          for (size_t i = 0; i < kWindowSize; ++i) {
+            if (batches_[i].state == BatchState::kInputsFilled ||
+                batches_[i].state == BatchState::kAllMapsStarted) {
+              return false;
+            }
+            if (batches_[i].state != BatchState::kOutputsComplete &&
+                !reached_eof_) {
+              return false;
+            }
+          }
+          return true;
+        }
+
+        // Batches begin in the `kEmpty` state. Once the RunnerThread has
+        // filled the `inputs` to a `Batch`, it transitions to the
+        // `kInputsFilled` state. At this point, the Worker threads run the map
+        // function and copy the outputs appropriately. Once all worker threads
+        // have started, it transitions to `kAllMapsStarted`. After the outputs
+        // are complete, the GetNext call can consume the outputs, and return
+        // the batch to the kEmpty state.
+        enum class BatchState {
+          kEmpty,
+          kInputsFilled,
+          kAllMapsStarted,
+          kOutputsComplete,
+        };
+
+        // Batch captures all the state of an output batch as it progresses
+        // through the machinery. Once the RunnerThread fills inputs, it
+        // transitions to `kInputsFilled`. At this point, the worker threads can
+        // work on it, incrementing outputs_complete for every element of the
+        // input set that is copied into the output Tensors. Once all the input
+        // tuples have been processed (i.e. num_outputs_complete ==
+        // inputs.size()), it transitions to the `kOutputsComplete` stage, where
+        // it is ready to be returned by a `GetBatch` call (called from
+        // `GetNextInternal`).
+        struct Batch {
+          BatchState state;
+          // Aggregates the Status of the input iterator's GetNext
+          // calls, in addition to the Status of the map function invocations.
+          //
+          // In the case where multiple non-OK statuses are encountered, we
+          // return the first one encountered.
+          Status status;
+          // In order to return the correct error status, we keep track of the
+          // error_index.
+          size_t error_index;
+          // The batch_size input tuples (or fewer in the case of the last
+          // batch).
+          // TODO(saeta): Avoid re-allocating vectors all the time!
+          std::vector<std::vector<Tensor>> inputs;
+          std::vector<Tensor> outputs;
+          size_t next_input_to_process;
+          size_t num_outputs_complete;
+
+          Batch() { Reset(); }
+
+          // Resets the Batch state (e.g. after consuming the outputs).
+          void Reset() {
+            state = BatchState::kEmpty;
+            status = Status::OK();
+            inputs.clear();
+            inputs.shrink_to_fit();
+            outputs.clear();
+            outputs.shrink_to_fit();
+            next_input_to_process = 0;
+            num_outputs_complete = 0;
+            error_index = -1;
+          }
+        };
+
+        Iterator* itr_;  // Not owned.
+        mutex mu_;
+        Batch batches_[kWindowSize] GUARDED_BY(mu_);
+        size_t next_input_batch_ GUARDED_BY(mu_) = -1;
+        size_t next_input_ GUARDED_BY(mu_) = 0;
+        size_t next_output_ GUARDED_BY(mu_) = 0;
+        bool cancelled_ GUARDED_BY(mu_) = false;
+        bool reached_eof_ GUARDED_BY(mu_) = false;
+
+        // The runner thread waits on this condition variable for space to be
+        // available. When the client thread takes a value out of the circular
+        // buffer, it notifies this condition variable that space is now
+        // available.
+        condition_variable runner_cond_var_ GUARDED_BY(mu_);
+        // The worker threads wait on this condition variable for available
+        // inputs. When the runner thread makes new inputs available, it
+        // notifies this condition variable.
+        condition_variable worker_cond_var_ GUARDED_BY(mu_);
+        // The client threads wait on this condition variable for avaiable
+        // batched outputs. When worker threads complete a batch, they notify
+        // this condition variable.
+        condition_variable client_cond_var_ GUARDED_BY(mu_);
+      };
+      // Mark NumaBlockManager as a friend of Iterator in order to call
+      // protected Iterator methods during checkpointing.
+      friend NumaBlockManager;
+
+      struct NumaWorkerBlock {
+        NumaBlockManager manager;
+        // TODO(saeta): Migrate to BackgroundWorker.
+        std::vector<std::unique_ptr<Thread>> threads;
+
+        explicit NumaWorkerBlock(Iterator* itr) : manager(itr) {}
+      };
+
+      static void CustomNumaWorkerBlockDeleter(NumaWorkerBlock* ptr) {
+        ptr->~NumaWorkerBlock();
+        port::NUMAFree(ptr, sizeof(NumaWorkerBlock));
+      }
+      static void DefaultNumaWorkerBlockDeleter(NumaWorkerBlock* ptr) {
+        delete ptr;
+      }
+
+      static Status CopyPartialBatch(Tensor* output, const Tensor& value,
+                                     int64 num_elements) {
+        switch (value.dtype()) {
+#define HANDLE_TYPE(type)                                         \
+  case DataTypeToEnum<type>::value: {                             \
+    auto output_t = output->flat_outer_dims<type>();              \
+    auto value_t = value.flat_outer_dims<type>();                 \
+    for (size_t i = 0; i < num_elements; i++) {                   \
+      output_t.template chip<0>(i) = value_t.template chip<0>(i); \
+    }                                                             \
+    return Status::OK();                                          \
+  }
+          TF_CALL_DATASET_TYPES(HANDLE_TYPE);
+#undef HANDLE_TYPE
+          default:
+            return errors::InvalidArgument("Unsupported data type: ",
+                                           DataTypeString(value.dtype()));
+        }
+        return Status::OK();
+      }
+
+      Status EnsureBackgroundThreadsStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (curr_num_parallel_calls_ >= num_parallel_calls_->value) {
+          // All necessary threads have been started.
+          curr_num_parallel_calls_ = num_parallel_calls_->value;
+          return Status::OK();
+        }
+
+        VLOG(4) << "Starting workers";
+        bool numa_enabled = port::NUMAEnabled();
+
+        if (!numa_enabled) {
+          LOG(INFO) << "NUMA not enabled on this host.";
+        }
+
+        int num_numa_nodes = port::NUMANumNodes();
+        if (num_numa_nodes < 1) {
+          return errors::Internal("The number of NUMA nodes is invalid: ",
+                                  num_numa_nodes);
+        }
+
+        // Only resize when empty to support restoring from checkpoints.
+        if (workers_.empty()) {
+          VLOG(3) << "# NUMA Nodes: " << num_numa_nodes
+                  << ", # Parallel Calls: " << num_parallel_calls_->value;
+          workers_.resize(num_numa_nodes);
+        } else {
+          num_numa_nodes = workers_.size();
+        }
+
+        // Round up num_parallel_calls, with a minimum of 1.
+        const size_t num_threads_per_block =
+            std::max(1LL, (num_parallel_calls_->value + num_numa_nodes - 1) /
+                              num_numa_nodes);
+
+        VLOG(3) << "Starting " << num_threads_per_block * num_numa_nodes
+                << " worker threads, with " << num_threads_per_block
+                << " threads per block.";
+
+        // Only allocate new_ctx if required.
+        std::shared_ptr<IteratorContext> new_ctx;
+
+        for (int i = 0; i < num_numa_nodes; ++i) {
+          if (!workers_[i]) {
+            if (numa_enabled) {
+              // Allocate in appropriate NUMA domain.
+              // 4k page align.
+              void* ptr = port::NUMAMalloc(i, sizeof(NumaWorkerBlock), 0);
+              if (ptr != nullptr) {
+                NumaWorkerBlock* block = new (ptr) NumaWorkerBlock(this);
+                workers_[i] =
+                    std::unique_ptr<NumaWorkerBlock,
+                                    std::function<void(NumaWorkerBlock*)>>(
+                        block, CustomNumaWorkerBlockDeleter);
+              } else {
+                LOG(ERROR) << "Could not NUMA-allocate worker block: " << i;
+              }
+            }
+            // If the NUMA allocation fails, or NUMA is not enabled.
+            if (!workers_[i]) {
+              workers_[i] =
+                  std::unique_ptr<NumaWorkerBlock,
+                                  std::function<void(NumaWorkerBlock*)>>(
+                      new NumaWorkerBlock(this), DefaultNumaWorkerBlockDeleter);
+            }
+          }
+          // Be sure to start threads if num_parallel_calls_ has changed.
+          for (size_t j = workers_[i]->threads.size();
+               j < num_threads_per_block; ++j) {
+            VLOG(3) << "Starting worker " << i << ", " << j;
+            if (!new_ctx) {
+              new_ctx = std::make_shared<IteratorContext>(*ctx);
+            }
+            workers_[i]->threads.emplace_back(ctx->env()->StartThread(
+                {}, strings::StrCat("tf_data_numa_map_and_batch_", i, "_", j),
+                [this, new_ctx, i, j]() { WorkerThread(new_ctx, i, j); }));
+            VLOG(3) << "Worker " << i << ", " << j << " successfully started.";
+          }
+        }
+        if (!runner_thread_) {
+          if (!new_ctx) {
+            new_ctx = std::make_shared<IteratorContext>(*ctx);
+          }
+          runner_thread_.reset(ctx->env()->StartThread(
+              {}, "tf_data_numa_map_and_batch",
+              [this, new_ctx] { RunnerThread(new_ctx); }));
+        }
+        VLOG(3) << "All workers & runner thread started.";
+        return Status::OK();
+      }
+
+      void AllocateOutput(IteratorContext* ctx, size_t batch_size,
+                          const std::vector<Tensor>& map_fn_outputs,
+                          std::vector<Tensor>* batch_outputs) {
+        DCHECK(dataset()->output_dtypes().size() ==
+               dataset()->output_shapes().size());
+        DCHECK(map_fn_outputs.size() == dataset()->output_dtypes().size());
+        for (size_t i = 0; i < dataset()->output_dtypes().size(); ++i) {
+          TensorShape component_shape({static_cast<uint32>(batch_size)});
+          component_shape.AppendShape(map_fn_outputs.at(i).shape());
+          AllocatorAttributes attr;
+          attr.set_gpu_compatible(true);
+          batch_outputs->emplace_back(ctx->allocator(attr),
+                                      map_fn_outputs.at(i).dtype(),
+                                      component_shape);
+        }
+      }
+
+      void RunnerThread(std::shared_ptr<IteratorContext> ctx)
+          LOCKS_EXCLUDED(mu_) {
+        RecordStart(ctx.get());
+        auto cleanup = gtl::MakeCleanup([this, &ctx] {
+          // Set end of input on all the managers in order to clean up in an
+          // orderly fashion.
+          VLOG(3) << "Setting End of Input on workers_[*]->manager";
+          for (size_t i = 0; i < workers_.size(); ++i) {
+            workers_[i]->manager.SetEndOfInput();
+          }
+          RecordStop(ctx.get());
+        });
+
+        const size_t num_blocks = workers_.size();
+
+        while (true) {
+          for (size_t block = 0; block < num_blocks; ++block) {
+            VLOG(4) << "RunnerThread waiting for input space in block: "
+                    << block;
+            if (TF_PREDICT_FALSE(
+                    !workers_[block]->manager.WaitForInputSpace(ctx.get()))) {
+              VLOG(3) << "RunnerThread exiting due to cancellation.";
+              return;
+            }
+            VLOG(4) << "RunnerThread has space; pulling on upstream for block "
+                    << block;
+
+            Status s;
+            std::vector<std::vector<Tensor>> inputs;
+            bool end_of_sequence = false;
+            for (size_t i = 0; i < dataset()->batch_size_; ++i) {
+              std::vector<Tensor> tuple;
+              s.Update(
+                  input_impl_->GetNext(ctx.get(), &tuple, &end_of_sequence));
+              if (!s.ok()) {
+                break;
+              }
+              if (end_of_sequence) {
+                VLOG(4) << "Runner thread encountered end of sequence.";
+                if (dataset()->drop_remainder_) {
+                  return;
+                }
+                break;
+              }
+              inputs.push_back(std::move(tuple));
+            }
+
+            VLOG(4) << "Moving inputs to block " << block
+                    << ", which has size: " << inputs.size();
+            if (!s.ok() || !inputs.empty()) {
+              workers_[block]->manager.PushInputs(s, std::move(inputs));
+              VLOG(4) << "Inputs moved into block " << block;
+            }
+            if (end_of_sequence) {
+              return;
+            }
+          }
+        }
+      }
+
+      void WorkerThread(std::shared_ptr<IteratorContext> ctx,
+                        const int numa_node, const int thread_num) {
+        RecordStart(ctx.get());
+        WORKER_VLOG(3) << "started.";
+        auto stop_cleanup =
+            gtl::MakeCleanup([this, numa_node, thread_num, &ctx]() {
+              RecordStop(ctx.get());
+              WORKER_VLOG(3) << "exiting.";
+            });
+
+        NumaWorkerBlock* block = workers_[numa_node].get();
+        port::NUMASetThreadNodeAffinity(numa_node);
+        const int num_numa_nodes = port::NUMANumNodes();
+        const int minimum_num_parallel_calls = thread_num * num_numa_nodes;
+
+        while (true) {
+          // Put threads to sleep based on autotuner.
+          {
+            mutex_lock l(*mu_);
+            while (minimum_num_parallel_calls >= num_parallel_calls_->value &&
+                   !cancelled_) {
+              RecordStop(ctx.get());
+              autotune_cond_var_->wait(l);
+              RecordStart(ctx.get());
+            }
+            if (cancelled_) {
+              return;
+            }
+          }
+
+          std::vector<Tensor> input;
+          uint64 index = 0;
+          size_t sequence_number = 0;
+          WORKER_VLOG(4) << "retrieving input.";
+          {
+            tracing::ScopedActivity trace(
+                "NumaMapAndBatch::Iterator::Worker::RetrieveInput");
+            if (!block->manager.RetrieveInput(ctx.get(), &input, &index,
+                                              &sequence_number)) {
+              return;
+            }
+          }
+
+          WORKER_VLOG(4) << "retrieved input; index: " << index
+                         << ", sequence_number: " << sequence_number;
+
+          std::vector<Tensor> return_values;
+          Status s;
+          {
+            tracing::ScopedActivity trace(
+                "NumaMapAndBatch::Iterator::Worker::FunctionExecution");
+            s = instantiated_captured_func_->Run(ctx.get(), std::move(input),
+                                                 &return_values);
+          }
+          WORKER_VLOG(4) << "ran function for index: " << index
+                         << ", sequence_number: " << sequence_number;
+
+          if (s.ok()) {
+            std::vector<Tensor>* output = block->manager.GetBatchTensors(
+                sequence_number,
+                [this, ctx, &return_values](size_t batch_size,
+                                            std::vector<Tensor>* output) {
+                  AllocateOutput(ctx.get(), batch_size, return_values, output);
+                });
+            WORKER_VLOG(4) << "copying tensors to batch output.";
+            {
+              tracing::ScopedActivity trace(
+                  "NumaMapAndBatch::Iterator::Worker::BatchCopy");
+              for (size_t i = 0; i < return_values.size() && s.ok(); ++i) {
+                Tensor& tensor = return_values.at(i);
+                Tensor* batch = &output->at(i);
+                if (tensor.NumElements() !=
+                    (batch->NumElements() / batch->dim_size(0))) {
+                  s.Update(errors::InvalidArgument(
+                      "Cannot add tensor to the batch: number of elements does "
+                      "not match. Shapes are: [tensor]: ",
+                      tensor.shape().DebugString(),
+                      ", [batch]: ", batch->shape().DebugString()));
+                  break;
+                }
+                s.Update(batch_util::CopyElementToSlice(std::move(tensor),
+                                                        batch, index));
+              }
+            }
+          }
+
+          block->manager.RecordBatchEntryComplete(sequence_number, index, s);
+          WORKER_VLOG(4) << "finished index: " << index
+                         << ", sequence_number: " << sequence_number;
+        }
+      }
+
+      // mu_ protects shared internal state and is used to coordinate between
+      // the auto-tuner, client threads, worker threads, and the runner thread.
+      const std::shared_ptr<mutex> mu_;
+      const std::shared_ptr<condition_variable> autotune_cond_var_;
+      // The maximum number of parallel calls (can be auto-tuned).
+      const std::shared_ptr<model::SharedState> num_parallel_calls_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
+
+      // Caches the last-seen value of num_parallel_calls_->value to
+      // short-circuit starting workers.
+      int64 curr_num_parallel_calls_ GUARDED_BY(*mu_) = 0;
+
+      std::unique_ptr<IteratorBase> input_impl_;
+      int64 cur_block_ GUARDED_BY(*mu_) = 0;
+      bool global_end_of_input_ GUARDED_BY(*mu_) = false;
+      bool cancelled_ GUARDED_BY(*mu_) = false;
+      std::vector<std::unique_ptr<NumaWorkerBlock,
+                                  std::function<void(NumaWorkerBlock*)>>>
+          workers_;  // Const after initialization.
+      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+    };
+
+    const DatasetBase* const input_;
+    const int64 batch_size_;
+    const int64 num_parallel_calls_;
+    const bool drop_remainder_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+    const NameAttrList func_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+  };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList func_;
+  bool preserve_cardinality_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalNumaMapAndBatchDataset").Device(DEVICE_CPU),
+    NumaMapAndBatchDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0230f90aba1c849483da5f8d7297c44c8a1174de
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
@@ -0,0 +1,1085 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <atomic>
+#include <deque>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 cycle_length = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "cycle_length", &cycle_length));
+    OP_REQUIRES(ctx, cycle_length > 0,
+                errors::InvalidArgument("`cycle_length` must be > 0"));
+
+    int64 block_length = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "block_length", &block_length));
+    OP_REQUIRES(ctx, block_length > 0,
+                errors::InvalidArgument("`block_length` must be > 0"));
+
+    bool sloppy = false;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "sloppy", &sloppy));
+
+    int64 buffer_output_elements = 0;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "buffer_output_elements",
+                                            &buffer_output_elements));
+    OP_REQUIRES(
+        ctx, buffer_output_elements > 0,
+        errors::InvalidArgument("`buffer_output_elements` must be > 0"));
+
+    int64 prefetch_input_elements = 0;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefetch_input_elements",
+                                            &prefetch_input_elements));
+    OP_REQUIRES(
+        ctx, prefetch_input_elements >= 0,
+        errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
+                                      &captured_func));
+
+    *output =
+        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
+                    cycle_length, block_length, sloppy, buffer_output_elements,
+                    prefetch_input_elements, output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
+            int64 block_length, bool sloppy, int64 buffer_output_elements,
+            int64 prefetch_input_elements, const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          interleave_func_(func),
+          captured_func_(std::move(captured_func)),
+          cycle_length_(cycle_length),
+          block_length_(block_length),
+          sloppy_(sloppy),
+          buffer_output_elements_(buffer_output_elements),
+          prefetch_input_elements_(prefetch_input_elements),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "ParallelInterleaveDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
+      Node* cycle_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
+      Node* block_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
+      Node* sloppy_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(sloppy_, &sloppy_node));
+      Node* buffer_output_elements_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(buffer_output_elements_, &buffer_output_elements_node));
+      Node* prefetch_input_elements_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(prefetch_input_elements_,
+                                      &prefetch_input_elements_node));
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(interleave_func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {{0, input_node},
+           {2, cycle_length_node},
+           {3, block_length_node},
+           {4, sloppy_node},
+           {5, buffer_output_elements_node},
+           {6, prefetch_input_elements_node}},
+          {{1, other_arguments}},
+          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
+      return Status::OK();
+    }
+
+   private:
+    int64 num_threads() const {
+      return cycle_length_ + prefetch_input_elements_;
+    }
+
+    // Parallel interleave's implementation is designed around a few principles:
+    //  1. Thread creation is relatively expensive. (Not reusing
+    //     threads causes a number of indirect costs such as poorer tcmalloc
+    //     performance due to thread-local caches, etc.) We allocate a fixed
+    //     number of threads at the start and never change. This is why we've
+    //     fused functionality that is theoretically orthogonal (i.e.
+    //     .prefetch()) into the implementation.
+    //  2. Drop-in replacement for standard interleave. The goal will be to
+    //     auto-opt people into an optimized implementation without any work
+    //     on the customer's part. We thus go through great pains to maintain
+    //     identical iteration orders, full determinism (disabled only via a
+    //     flag, etc.)
+    //  3. Performance across a variety of environments and I/O envelopes.
+    //
+    // The actual implementation centers around a collection of worker threads
+    // and their corresponding worker state (tracked in the `workers_` vector).
+    // Worker threads repeatedly receive a vector of Tensors that are used as
+    // input to the flat-map function (`captured_func_`). The output of this
+    // function must be a dataset. The worker thread then repeatedly calls
+    // `GetNext()`, maintaining a buffer of elements to minimize the likelihood
+    // that a caller will block waiting for an element to be produced.
+    //
+    // Pointers to these worker states are kept in 2 disjoint data structures:
+    //  1. `interleave_indices_` is a vector containing indices of WorkerStates
+    //     in `workers_` that we are interleaving. Worker threads backing these
+    //     WorkerStates should be regularly producing values.
+    //  2. `staging_indices_` is a deque containing indices of WorkerStates in
+    //     `workers_` that we will move to `interleave_indices_` when an
+    //     iterator in `interleave_indices_` is exhausted.
+    //
+    // The client calls `GetNext[Internal]()` to retrieve an output element. The
+    // internal implementation updates the state of `interleave_indices_` and
+    // `staging_indices_` as output iterators (run by the worker threads) are
+    // exhausted.
+    //
+    // `input_impl_` is the input iterator that generates arguments for the
+    // flat-map function (`captured_func_`). It is set to an iterator at
+    // Iterator construction, and is fixed until we consume all input elements.
+    // Once it is exhausted, we reset the unique_ptr to eagerly deallocate
+    // memory.
+    //
+    // A few invariants are maintained:
+    //  1. No element in interleave_indices_ should be a -1 unless
+    //     `staging_indices_` is empty and `input_impl_` is empty.
+    //  2. Every `worker_` element is pointed to by at most one element of the
+    //     union of `interleave_indices_` and `staging_indices_`.
+    //  3. Unless `input_impl_` is empty, every `worker_` must be pointed to by
+    //     an element in `interleave_indices_` or `staging_indices_`.
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            workers_(dataset()->num_threads()),
+            worker_thread_states_(dataset()->num_threads()) {}
+
+      ~Iterator() override {
+        mutex_lock l(mu_);
+        cancelled_ = true;
+        // Notify all workers in case they are blocked.
+        for (auto& worker : workers_) {
+          worker.cond_var.notify_all();
+        }
+      }
+
+      Status Initialize(IteratorContext* ctx) override {
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
+      }
+
+      // It is implemented so that it matches the deterministic interleave
+      // unless getting the next element would block and we are allowed to be
+      // sloppy.
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(EnsureWorkerThreadsStarted(ctx));
+        while (!cancelled_) {
+          // Wait for an item to become available, blocking if necessary. If we
+          // are allowed to be sloppy, we can skip over input datasets that do
+          // not have an item readily available.
+          bool can_produce_elements = false;
+          bool must_wait_for_input = true;
+          for (int64 i = 0; i < interleave_indices_.size(); ++i) {
+            int64 index = (next_index_ + i) % interleave_indices_.size();
+            int64 current_worker_index = interleave_indices_[index];
+            if (current_worker_index < 0) {
+              continue;  // Empty interleave elements.
+            }
+            WorkerState* current_worker = &workers_[current_worker_index];
+            can_produce_elements |= current_worker->MayHaveElements();
+            if (!current_worker->outputs.empty()) {
+              // We have an element!
+              next_index_ = index;
+              const bool element_acquired_sloppily =
+                  dataset()->sloppy_ && i > 1;
+              if (!element_acquired_sloppily) {
+                // If the element was acquired in the regular (non-sloppy)
+                // order, then advance the current block and cycle pointers to
+                // the next element in the regular order.
+                block_count_++;
+                if (block_count_ == dataset()->block_length_) {
+                  next_index_ = (index + 1) % interleave_indices_.size();
+                  block_count_ = 0;
+                }
+              } else {
+                block_count_ = 0;
+              }
+              *end_of_sequence = false;
+              Status s = current_worker->outputs.front().status;
+              current_worker->outputs.front().output.swap(*out_tensors);
+              current_worker->outputs.pop_front();
+              current_worker->cond_var.notify_one();
+              return s;
+            } else if (current_worker->is_producing && !dataset()->sloppy_) {
+              // current_worker.outputs.empty(), and we must wait for this
+              // iterator.
+              if (next_index_ != index) {
+                // We have advanced to a new iterator; reset block counts.
+                next_index_ = index;
+                block_count_ = 0;
+              }
+              break;
+            } else if (!current_worker->is_producing) {
+              // This iterator has reached end of input.
+              interleave_indices_[index] = -1;
+              if (input_impl_) {
+                // Start prefetching a new iterator.
+                std::vector<Tensor> args;
+                bool end_of_input = false;
+                Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
+                if (end_of_input) {
+                  input_impl_.reset();
+                } else {
+                  current_worker->SetInputs(s, std::move(args));
+                  staging_indices_.emplace_back(current_worker_index);
+                }
+              }
+
+              if (!staging_indices_.empty()) {
+                // Move a worker from `staging_indices_` to
+                // `interleave_indices_`.
+                interleave_indices_[index] = staging_indices_.front();
+                staging_indices_.pop_front();
+
+                next_index_ = (index + 1) % interleave_indices_.size();
+                block_count_ = 0;
+                // Restart the inner [for] loop
+                can_produce_elements = true;
+                must_wait_for_input = false;
+                break;
+              }
+            }
+          }
+
+          if (!can_produce_elements && !input_impl_) {
+            // No potential for future values.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          if (must_wait_for_input) {
+            // Wait for elements to become available.
+            RecordStop(ctx);
+            if (dataset()->sloppy_) {
+              sloppy_cond_var_.wait(l);
+            } else {
+              workers_[interleave_indices_[next_index_]].cond_var.wait(l);
+            }
+            RecordStart(ctx);
+          }
+        }
+        return errors::Cancelled(
+            "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeAsyncInterleaveManyNode(std::move(args),
+                                                  /*parameters=*/{});
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_exhausted"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("next_index"), next_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_count"), block_count_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("workers_size"), workers_.size()));
+        for (int i = 0; i < workers_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerStateLocked(writer, i));
+        }
+        for (int i = 0; i < worker_thread_states_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerThreadStateLocked(writer, i));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("interleave_size"),
+                                               interleave_indices_.size()));
+        for (int i = 0; i < interleave_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("interleave_indices_", i)),
+              interleave_indices_[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("staging_size"),
+                                               staging_indices_.size()));
+        for (int i = 0; i < staging_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("staging_indices_", i)),
+              staging_indices_[i]));
+        }
+        if (!worker_threads_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("worker_threads_running"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (!reader->Contains(full_name("input_exhausted"))) {
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        int64 temp;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("next_index"), &temp));
+        next_index_ = size_t(temp);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("block_count"), &temp));
+        block_count_ = size_t(temp);
+
+        // Restore WorkerStates.
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("workers_size"), &temp));
+        if (temp != dataset()->num_threads()) {
+          return errors::Internal("Expected ", dataset()->num_threads(),
+                                  " worker states but found ", temp, ".");
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerStateLocked(reader, i, ctx));
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerThreadStateLocked(reader, i, ctx));
+        }
+
+        // Restore `interleave_indices_`.
+        std::set<int64> all_indices;
+        {
+          int64 interleave_size;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("interleave_size"),
+                                                &interleave_size));
+          interleave_indices_.reserve(interleave_size);
+          for (int64 i = 0; i < interleave_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("interleave_indices_", i)), &temp));
+            if (temp >= 0 && all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            interleave_indices_.emplace_back(temp);
+          }
+        }
+
+        // Restore `staging_indices_`.
+        {
+          int64 staging_size;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("staging_size"), &staging_size));
+          for (int i = 0; i < staging_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("staging_indices_", i)), &temp));
+            if (all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            staging_indices_.emplace_back(temp);
+          }
+        }
+
+        // Start Worker threads.
+        if (reader->Contains(full_name("worker_threads_running"))) {
+          worker_threads_.reserve(dataset()->num_threads());
+          for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      // OutputElem contains the information from a call to GetNext by an output
+      // iterator.
+      struct OutputElem {
+        // The output iterator sets `status` if getting the output element
+        // fails.
+        Status status;
+        // The buffered data element.
+        std::vector<Tensor> output;
+
+        explicit OutputElem(const Status& s) : status(s) {}
+      };
+
+      // Worker threads operate on their relevant WorkerState structs.
+      //
+      // WorkerState's fields are all protected by mu_;
+      struct WorkerState {
+        // The arguments to be used to construct an output iterator.
+        std::vector<Tensor> input;
+        // The buffered output elements.
+        std::deque<OutputElem> outputs;
+        // Set to true iff the worker thread expects to append more elements to
+        // outputs. is_producing can be false despite !outputs.empty().
+        // Concretely, all output elements will have been consumed only when:
+        // is_producing == false && outputs.empty();
+        bool is_producing = false;
+        // Condition variable used to coordinate between threads. The worker
+        // thread waits on this condition variable when it is either (1) waiting
+        // for the main thread to add arguments to `input`, or (2) waiting for
+        // the main thread to consume an element of `outputs`. The main thread
+        // waits on cond_var if it is waiting for the worker thread to produce
+        // an element into `outputs` (this implies sloppy_==false).
+        condition_variable cond_var;
+
+        inline bool MayHaveElements() const {
+          return is_producing || !outputs.empty();
+        }
+
+        // Sets inputs for a worker thread and notifies it to start processing.
+        void SetInputs(const Status& s, std::vector<Tensor> input_arguments) {
+          if (s.ok()) {
+            DCHECK(!MayHaveElements())
+                << "Tried to start inputs, despite already producing!";
+            input = std::move(input_arguments);
+            is_producing = true;
+            cond_var.notify_one();
+          } else {
+            outputs.emplace_back(s);
+          }
+        }
+      };
+
+      // The internal state of a worker thread that is not already captured
+      // in its `WorkerState`.
+      //
+      // This is needed only for checkpointing purposes. We keep this
+      // separate from `WorkerState` and guard its fields using a separate
+      // lock `ckpt_mu_` so as to not affect the performance of main pipeline.
+      struct WorkerThreadState {
+        // The output element that has been produced from the input iterator
+        // and is waiting to be added to `WorkerState.outputs`.
+        OutputElem output_elem;
+
+        // Whether the input iterator returned an `end_of_sequence`.
+        bool end_of_sequence = false;
+
+        // Status returned from `MakeIteratorFromInputElement`.
+        Status iterator_creation_status;
+
+        // The arguments to be used to construct `iterator`.
+        std::vector<Tensor> input;
+
+        std::unique_ptr<IteratorBase> iterator;
+
+        WorkerThreadState() : output_elem(Status::OK()) {}
+      };
+
+      Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (worker_threads_.empty()) {
+          worker_threads_.reserve(dataset()->num_threads());
+          for (int64 i = 0; i < dataset()->num_threads(); ++i) {
+            std::vector<Tensor> args;
+            bool end_of_input = false;
+            Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
+            if (end_of_input) {
+              input_impl_.reset();
+              return Status::OK();
+            }
+            workers_[i].SetInputs(s, std::move(args));
+            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+            if (i < dataset()->cycle_length_) {
+              interleave_indices_.push_back(i);
+            } else {
+              staging_indices_.push_back(i);
+            }
+          }
+          DCHECK(interleave_indices_.size() == dataset()->cycle_length_);
+          DCHECK(staging_indices_.size() ==
+                 dataset()->prefetch_input_elements_);
+        }
+        return Status::OK();
+      }
+
+      // Produces elements into the worker's output buffers.
+      void WorkerThread(const std::shared_ptr<IteratorContext>& ctx,
+                        const int64 thread_index) {
+        // Notes on checkpointing thread local state, i.e., `WorkerThreadState`:
+        //
+        // 1. Any local state that may need to be checkpointed should be kept
+        //    in `worker_thread_states_[thread_index]`.
+        // 2. `WorkerThreadState` should contain state that is needed only for
+        //    checkpointing, i.e., if we were to remove checkpointing support,
+        //    we could keep that state as local variables in this thread.
+        // 3. This thread should only read/write state at `thread_index`
+        //    and should not access other thread states.
+        // 4. When restoring from checkpoint, threads are started only after
+        //    the restore is complete.
+        // 5. Once restored from a checkpoint, the local state is edited only
+        //    by this thread. 3 & 4 allow making assumptions like temporarily
+        //    caching local state in this thread and using it outside a lock
+        //    e.g. `make_new_iterator`.
+        // 6. `ckpt_mu_` should be wisely used to create *consistent*
+        //    checkpoint markers.
+
+        // std::function arguments are copy-constructable, so we pass raw
+        // pointers, and then immediately wrap them to ensure correct ownership.
+        RecordStart(ctx.get());
+        auto cleanup = gtl::MakeCleanup([this, thread_index, ctx] {
+          mutex_lock l(mu_);
+          workers_[thread_index].cond_var.notify_all();
+          RecordStop(ctx.get());
+        });
+        bool make_new_iterator;
+        {
+          tf_shared_lock l(ckpt_mu_);
+          // Decide whether a new iterator should be built.
+          // 1. If there is an existing iterator, we use it.
+          // 2. If there was an error in iterator creation that could not be
+          //    notified to the client we attempt to send that to the client
+          //    first.
+          make_new_iterator =
+              worker_thread_states_[thread_index].iterator == nullptr &&
+              worker_thread_states_[thread_index].iterator_creation_status.ok();
+        }
+        // Even though `make_new_iterator` has cached values from
+        // `worker_thread_states_[thread_index]` which is guarded by ckpt_mu_,
+        // it is safe to *read* `make_new_iterator`outside of a lock without
+        // worrying about concurrent changes to values in
+        // `worker_thread_states_[thread_index]`. See comment at the start of
+        // this function for details.
+        while (true) {
+          // Whether creation of the iterator succeeded.
+          Status iterator_creation_status;
+          // 1. Build a new iterator or use the existing one.
+          if (make_new_iterator) {
+            // 1a. Get new input tensors or use the exiting ones.
+            bool read_new_input;
+            {
+              tf_shared_lock l(ckpt_mu_);
+              // worker_thread_states_[thread_index].input will be non-empty
+              // if checkpointing happened at CHECKPOINT_MARKER_A.
+              read_new_input =
+                  worker_thread_states_[thread_index].input.empty();
+            }
+
+            if (read_new_input) {
+              mutex_lock l(mu_);
+              while (!cancelled_ && !workers_[thread_index].is_producing) {
+                RecordStop(ctx.get());
+                workers_[thread_index].cond_var.wait(l);
+                RecordStart(ctx.get());
+              }
+              if (cancelled_) return;
+              // Copy the input tensors so that we do not need to block on `mu_`
+              // when building the iterator.
+              // We keep a copy of the input tensors in
+              // `WorkerThreadState.input` till the iterator is in use. This is
+              // used in `RestoreInternal` to re-build the iterator.
+              // TODO(b/78046638): Explore ways to avoid tracking the input
+              // tensors.
+              tf_shared_lock ckpt_l(ckpt_mu_);
+              worker_thread_states_[thread_index].input.swap(
+                  workers_[thread_index].input);
+              // CHECKPOINT_MARKER_A
+              // We have the input tensors but have not built the iterator yet.
+            }
+
+            // 1b. Run the user defined function to produce a new iterator.
+            {
+              tf_shared_lock l(ckpt_mu_);
+              worker_thread_states_[thread_index].iterator_creation_status =
+                  MakeIteratorFromInputElement(
+                      ctx.get(), worker_thread_states_[thread_index].input,
+                      thread_index, *instantiated_captured_func_, prefix(),
+                      &worker_thread_states_[thread_index].iterator);
+              iterator_creation_status =
+                  worker_thread_states_[thread_index].iterator_creation_status;
+              if (!iterator_creation_status.ok()) {
+                worker_thread_states_[thread_index].input.clear();
+              }
+              // CHECKPOINT_MARKER_B
+              // Either an iterator has been successfully built and placed in
+              // `worker_thread_states_[thread_index].iterator` or it failed and
+              // a non-OK status has been put in
+              // `worker_thread_states_[thread_index].iterator_creation_status`.
+            }
+          } else {
+            tf_shared_lock l(ckpt_mu_);
+            iterator_creation_status =
+                worker_thread_states_[thread_index].iterator_creation_status;
+            // Mark that we have used up the restored iterator.
+            make_new_iterator = true;
+          }
+          // 2. Start producing elements or send error state to client if
+          //    iterator creation failed.
+          if (!iterator_creation_status.ok()) {
+            mutex_lock l(mu_);
+            // Wait for space in the prefetch queue.
+            while (!cancelled_ && workers_[thread_index].outputs.size() ==
+                                      dataset()->buffer_output_elements_) {
+              RecordStop(ctx.get());
+              workers_[thread_index].cond_var.wait(l);
+              RecordStart(ctx.get());
+            }
+            if (cancelled_) return;
+            tf_shared_lock ckpt_l(ckpt_mu_);
+            workers_[thread_index].outputs.emplace_back(
+                iterator_creation_status);
+            workers_[thread_index].is_producing = false;
+            worker_thread_states_[thread_index].iterator_creation_status =
+                Status::OK();
+            // CHECKPOINT_MARKER_C
+            // Non-OK iterator creation status has been notified to the
+            // client.
+            workers_[thread_index].cond_var.notify_one();
+          } else {
+            bool end_of_sequence = false;
+            while (!end_of_sequence) {
+              // 3.a Produce an element!
+              {
+                tf_shared_lock ckpt_l(ckpt_mu_);
+                if (worker_thread_states_[thread_index]
+                        .output_elem.status.ok() &&
+                    worker_thread_states_[thread_index]
+                        .output_elem.output.empty() &&
+                    !worker_thread_states_[thread_index].end_of_sequence) {
+                  worker_thread_states_[thread_index].output_elem.status =
+                      worker_thread_states_[thread_index].iterator->GetNext(
+                          ctx.get(),
+                          &worker_thread_states_[thread_index]
+                               .output_elem.output,
+                          &worker_thread_states_[thread_index].end_of_sequence);
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                } else {
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                }
+                // CHECKPOINT_MARKER_D
+                // An element has been read or an error or end_of_sequence has
+                // been received from the input iterator and is waiting to be
+                // sent to client.
+              }
+
+              // 3.b Make it available to the client.
+              {
+                mutex_lock l(mu_);
+
+                // Wait for space in the prefetch queue.
+                while (!cancelled_ && workers_[thread_index].outputs.size() ==
+                                          dataset()->buffer_output_elements_) {
+                  RecordStop(ctx.get());
+                  workers_[thread_index].cond_var.wait(l);
+                  RecordStart(ctx.get());
+                }
+                if (cancelled_) return;
+
+                tf_shared_lock ckpt_l(ckpt_mu_);
+                workers_[thread_index].is_producing = !end_of_sequence;
+
+                // Output the element.
+
+                // Move the temporary state in WorkerThreadState to WorkerState
+                // and mark it as used.
+                if (end_of_sequence) {
+                  worker_thread_states_[thread_index].iterator.reset();
+                  worker_thread_states_[thread_index].input.clear();
+                  worker_thread_states_[thread_index].end_of_sequence = false;
+                } else {
+                  workers_[thread_index].outputs.emplace_back(
+                      worker_thread_states_[thread_index].output_elem.status);
+                  workers_[thread_index].outputs.back().output.swap(
+                      worker_thread_states_[thread_index].output_elem.output);
+                }
+                worker_thread_states_[thread_index].output_elem.status =
+                    Status::OK();
+                if (dataset()->sloppy_) {
+                  sloppy_cond_var_.notify_one();
+                } else {
+                  workers_[thread_index].cond_var.notify_one();
+                }
+                // CHECKPOINT_MARKER_E
+                // Output element or iterator status has been sent to the
+                // client.
+              }
+            }
+          }
+        }
+      }
+
+      Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_", index);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            workers_[index].input.size()));
+        for (int i = 0; i < workers_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              workers_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_outputs_size")),
+            workers_[index].outputs.size()));
+        for (int i = 0; i < workers_[index].outputs.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+              writer, workers_[index].outputs[i],
+              full_name(strings::StrCat(prefix, "_outputs_", i))));
+        }
+        if (workers_[index].is_producing) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_is_producing")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerStateLocked(IteratorStateReader* reader, int index,
+                                   IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        workers_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          workers_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &workers_[index].input.back()));
+        }
+        int64 outputs_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_outputs_size")),
+            &outputs_size));
+        for (int i = 0; i < outputs_size; ++i) {
+          workers_[index].outputs.emplace_back(Status::OK());
+          TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+              reader, &workers_[index].outputs.back(),
+              full_name(strings::StrCat(worker_prefix, "_outputs_", i))));
+        }
+        if (reader->Contains(
+                full_name(strings::StrCat(worker_prefix, "_is_producing")))) {
+          workers_[index].is_producing = true;
+        } else {
+          workers_[index].is_producing = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteWorkerThreadStateLocked(IteratorStateWriter* writer,
+                                          int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_thread_", index);
+        if (worker_thread_states_[index].iterator != nullptr) {
+          TF_RETURN_IF_ERROR(
+              SaveInput(writer, worker_thread_states_[index].iterator));
+        } else {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_iterator_exhausted")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            worker_thread_states_[index].input.size()));
+        for (int i = 0; i < worker_thread_states_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              worker_thread_states_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_iterator_creation_status"),
+            worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+            writer, worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(prefix, "_output"))));
+        if (worker_thread_states_[index].end_of_sequence) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_sequence")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerThreadStateLocked(IteratorStateReader* reader, int index,
+                                         IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_thread_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        worker_thread_states_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          worker_thread_states_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &worker_thread_states_[index].input.back()));
+        }
+        // Restore iterator.
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_iterator_exhausted")))) {
+          worker_thread_states_[index].iterator.reset();
+        } else {
+          std::unique_ptr<IteratorBase> iterator;
+          Status s = MakeIteratorFromInputElement(
+              ctx, worker_thread_states_[index].input, index,
+              *instantiated_captured_func_, prefix(), &iterator);
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, iterator));
+          worker_thread_states_[index].iterator.swap(iterator);
+        }
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(worker_prefix, "_iterator_creation_status"),
+            &worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+            reader, &worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(worker_prefix, "_output"))));
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_end_of_sequence")))) {
+          worker_thread_states_[index].end_of_sequence = true;
+        } else {
+          worker_thread_states_[index].end_of_sequence = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteOutputElemLocked(IteratorStateWriter* writer,
+                                   const OutputElem& output_elem,
+                                   const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_status"), output_elem.status));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(strings::StrCat(prefix, "_output_size"),
+                                output_elem.output.size()));
+        for (int i = 0; i < output_elem.output.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              strings::StrCat(prefix, "_output_", i), output_elem.output[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadOutputElemLocked(IteratorStateReader* reader,
+                                  OutputElem* output_elem, const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(prefix, "_status"), &output_elem->status));
+        int64 output_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            strings::StrCat(prefix, "_output_size"), &output_size));
+        output_elem->output.reserve(output_size);
+        for (int i = 0; i < output_size; ++i) {
+          output_elem->output.emplace_back();
+          TF_RETURN_IF_ERROR(
+              reader->ReadTensor(strings::StrCat(prefix, "_output_", i),
+                                 &output_elem->output.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& prefix, const Status& status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
+                              Status* status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_code")), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_msg")), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
+
+      // Mutex & condition variable to guard mutable iterator internals and
+      // coordinate among worker threads and client thread[s].
+      mutex mu_ ACQUIRED_BEFORE(ckpt_mu_);
+      // The main thread waits on this condition variable if running in sloppy
+      // mode and no values are available.
+      condition_variable sloppy_cond_var_;
+      // Mutex used to wait for a consistent state while checkpointing.
+      // Only Save and Restore require an exclusive lock on this mutex. In
+      // other scenarios we just acquire a shared lock so the pipeline's
+      // performance should not be affected in the absence of checkpointing.
+      // A thread must not wait on any condition variable while holding
+      // `ckpt_mu_` in either shared or exclusive modes.
+      mutex ckpt_mu_;
+
+      // The iterator producing elements which are converted to datasets by
+      // the dataset()->captured_func_ then interleaved together.
+      // input_impl_ is reset when we have exhausted its input.
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
+
+      // The WorkerState structs the worker threads operate on.
+      // workers_ elements are in at most one of interleave_ and staging_.
+      std::vector<WorkerState> workers_ GUARDED_BY(mu_);
+
+      // Stores the temporary state of WorkerThreads which is not stored in
+      // WorkerState. This is used for checkpointing purposes only.
+      std::vector<WorkerThreadState> worker_thread_states_ GUARDED_BY(ckpt_mu_);
+
+      // Indices in `workers_` of iterators to interleave.
+      std::vector<int64> interleave_indices_ GUARDED_BY(mu_);
+      // Indices in `workers_` of prefetched iterators.
+      std::deque<int64> staging_indices_ GUARDED_BY(mu_);
+
+      // The index into output_elements_ for next element to produce.
+      size_t next_index_ GUARDED_BY(mu_) = 0;
+      // The number of items produced so far within the block
+      size_t block_count_ GUARDED_BY(mu_) = 0;
+      // Flag to instruct the worker threads to exit.
+      bool cancelled_ GUARDED_BY(mu_) = false;
+      // The worker threads. This must be last to ensure the
+      // threads have exited before any other members are deallocated.
+      // TODO(b/65178177): Avoid allocating additional threads.
+      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const NameAttrList interleave_func_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const int64 cycle_length_;
+    const int64 block_length_;
+    const bool sloppy_;
+    const int64 buffer_output_elements_;
+    const int64 prefetch_input_elements_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList interleave_func_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalParallelInterleaveDataset").Device(DEVICE_CPU),
+    ParallelInterleaveDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
similarity index 80%
rename from tensorflow/core/kernels/data/parse_example_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
index 1d1a7170629ae71bd56df35d4aea8a4f73301ee8..ea99a8b32c5a945f30945369ef2ed4f4b6725887 100644
--- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
@@ -23,9 +23,8 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
-
 class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ParseExampleDatasetOp(OpKernelConstruction* ctx)
@@ -38,6 +37,7 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("dense_shapes", &dense_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("sloppy", &sloppy_));
     for (int i = 0; i < dense_shapes_.size(); ++i) {
       bool shape_ok = true;
       if (dense_shapes_[i].dims() == -1) {
@@ -142,11 +142,11 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
       it->second = i++;
     }
 
-    *output = new Dataset(ctx, input, std::move(dense_defaults),
-                          std::move(sparse_keys_), std::move(dense_keys_),
-                          std::move(key_to_output_index), std::move(config),
-                          num_parallel_calls, sparse_types_, dense_types_,
-                          dense_shapes_, output_types_, output_shapes_);
+    *output =
+        new Dataset(ctx, input, dense_defaults, sparse_keys_, dense_keys_,
+                    std::move(key_to_output_index), std::move(config),
+                    num_parallel_calls, sparse_types_, dense_types_,
+                    dense_shapes_, output_types_, output_shapes_, sloppy_);
   }
 
  private:
@@ -161,7 +161,7 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
             const DataTypeVector& dense_types,
             const std::vector<PartialTensorShape>& dense_shapes,
             const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
+            const std::vector<PartialTensorShape>& output_shapes, bool sloppy)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           dense_defaults_(std::move(dense_defaults)),
@@ -174,7 +174,8 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
           dense_types_(dense_types),
           dense_shapes_(dense_shapes),
           output_types_(output_types),
-          output_shapes_(output_shapes) {
+          output_shapes_(output_shapes),
+          sloppy_(sloppy) {
       input_->Ref();
     }
 
@@ -182,97 +183,12 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      auto map_fn = [this](IteratorContext* ctx,
-                           std::vector<Tensor> input_element,
-                           std::vector<Tensor>* result, StatusCallback done) {
-        (*ctx->runner())([this, ctx, input_element, result, done]() {
-          thread::ThreadPool* device_threadpool =
-              ctx->lib()->device()->tensorflow_cpu_worker_threads()->workers;
-          std::vector<string> slice_vec;
-          for (Tensor t : input_element) {
-            auto serialized_t = t.flat<string>();
-            gtl::ArraySlice<string> slice(serialized_t.data(),
-                                          serialized_t.size());
-            for (auto it = slice.begin(); it != slice.end(); it++)
-              slice_vec.push_back(*it);
-          }
-          example::FastParseExampleConfig config = config_;
-          // local copy of config_ for modification.
-          auto stats_aggregator = ctx->stats_aggregator();
-          if (stats_aggregator) {
-            config.collect_feature_stats = true;
-          }
-          example::Result example_result;
-          Status s = FastParseExample(config, slice_vec, {}, device_threadpool,
-                                      &example_result);
-          if (s.ok()) {
-            (*result).resize(key_to_output_index_.size());
-            for (int d = 0; d < dense_keys_.size(); ++d) {
-              int output_index = key_to_output_index_.at(dense_keys_[d]);
-              CHECK(example_result.dense_values[d].dtype() ==
-                    output_dtypes()[output_index])
-                  << "Got wrong type for FastParseExample return value " << d
-                  << " (expected "
-                  << DataTypeString(output_dtypes()[output_index]) << ", got "
-                  << DataTypeString(example_result.dense_values[d].dtype())
-                  << ").";
-              CHECK(output_shapes()[output_index].IsCompatibleWith(
-                  example_result.dense_values[d].shape()))
-                  << "Got wrong shape for FastParseExample return value " << d
-                  << " (expected "
-                  << output_shapes()[output_index].DebugString() << ", got "
-                  << example_result.dense_values[d].shape().DebugString()
-                  << ").";
-              (*result)[output_index] = example_result.dense_values[d];
-            }
-            for (int d = 0; d < sparse_keys_.size(); ++d) {
-              Tensor serialized_sparse = Tensor(DT_VARIANT, TensorShape({3}));
-              auto serialized_sparse_t = serialized_sparse.vec<Variant>();
-              serialized_sparse_t(0) = example_result.sparse_indices[d];
-              serialized_sparse_t(1) = example_result.sparse_values[d];
-              serialized_sparse_t(2) = example_result.sparse_shapes[d];
-              int output_index = key_to_output_index_.at(sparse_keys_[d]);
-              CHECK(serialized_sparse.dtype() == output_dtypes()[output_index])
-                  << "Got wrong type for FastParseExample return value " << d
-                  << " (expected "
-                  << DataTypeString(output_dtypes()[output_index]) << ", got "
-                  << DataTypeString(serialized_sparse.dtype()) << ").";
-              CHECK(output_shapes()[output_index].IsCompatibleWith(
-                  serialized_sparse.shape()))
-                  << "Got wrong shape for FastParseExample return value " << d
-                  << " (expected "
-                  << output_shapes()[output_index].DebugString() << ", got "
-                  << serialized_sparse.shape().DebugString() << ").";
-              (*result)[output_index] = serialized_sparse;
-            }
-            // TODO(b/111553342): User provided tags instead of fixed tag.
-            if (stats_aggregator) {
-              stats_aggregator->IncrementCounter(
-                  "examples_count", "trainer",
-                  example_result.feature_stats.size());
-              for (example::PerExampleFeatureStats feature_stats :
-                   example_result.feature_stats) {
-                stats_aggregator->AddToHistogram(
-                    "features",
-                    {static_cast<double>(feature_stats.features_count)});
-                stats_aggregator->IncrementCounter(
-                    "features_count", "trainer", feature_stats.features_count);
-                stats_aggregator->IncrementCounter(
-                    "feature_values_count", "trainer",
-                    feature_stats.feature_values_count);
-                stats_aggregator->AddToHistogram(
-                    "feature-values",
-                    {static_cast<double>(feature_stats.feature_values_count)});
-              }
-            }
-          }
-          done(s);
-        });
-      };
-
+      std::unique_ptr<ParallelMapFunctor> parse_example_functor(
+          new ParseExampleFunctor(this));
       return NewParallelMapIterator(
           {this, strings::StrCat(prefix, "::ParseExample")}, input_,
-          std::move(map_fn), num_parallel_calls_);
+          std::move(parse_example_functor), num_parallel_calls_, sloppy_,
+          /*preserve_cardinality=*/true);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -287,6 +203,8 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
       return "ParseExampleDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -312,12 +230,14 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
       AttrValue sparse_types_attr;
       AttrValue dense_attr;
       AttrValue dense_shapes_attr;
+      AttrValue sloppy_attr;
 
       b->BuildAttrValue(sparse_keys_, &sparse_keys_attr);
       b->BuildAttrValue(dense_keys_, &dense_keys_attr);
       b->BuildAttrValue(sparse_types_, &sparse_types_attr);
       b->BuildAttrValue(dense_types_, &dense_attr);
       b->BuildAttrValue(dense_shapes_, &dense_shapes_attr);
+      b->BuildAttrValue(sloppy_, &sloppy_attr);
 
       TF_RETURN_IF_ERROR(b->AddDataset(this,
                                        {
@@ -329,12 +249,118 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
                                         {"dense_keys", dense_keys_attr},
                                         {"sparse_types", sparse_types_attr},
                                         {"Tdense", dense_attr},
-                                        {"dense_shapes", dense_shapes_attr}},
+                                        {"dense_shapes", dense_shapes_attr},
+                                        {"sloppy", sloppy_attr}},
                                        output));
       return Status::OK();
     }
 
    private:
+    class ParseExampleFunctor : public ParallelMapFunctor {
+     public:
+      explicit ParseExampleFunctor(const Dataset* dataset)
+          : dataset_(dataset) {}
+
+      void MapFunc(IteratorContext* ctx, const string& prefix,
+                   std::vector<Tensor> input, std::vector<Tensor>* output,
+                   StatusCallback callback) override {
+        (*ctx->runner())([this, ctx, input, output, callback]() {
+          thread::ThreadPool* device_threadpool =
+              ctx->lib()->device()->tensorflow_cpu_worker_threads()->workers;
+          std::vector<string> slice_vec;
+          for (const Tensor& t : input) {
+            auto serialized_t = t.flat<string>();
+            gtl::ArraySlice<string> slice(serialized_t.data(),
+                                          serialized_t.size());
+            for (auto it = slice.begin(); it != slice.end(); it++)
+              slice_vec.push_back(*it);
+          }
+          example::FastParseExampleConfig config = dataset_->config_;
+          // local copy of config_ for modification.
+          auto stats_aggregator = ctx->stats_aggregator();
+          if (stats_aggregator) {
+            config.collect_feature_stats = true;
+          }
+          example::Result example_result;
+          Status s = FastParseExample(config, slice_vec, {}, device_threadpool,
+                                      &example_result);
+          if (s.ok()) {
+            (*output).resize(dataset_->key_to_output_index_.size());
+            for (int d = 0; d < dataset_->dense_keys_.size(); ++d) {
+              int output_index =
+                  dataset_->key_to_output_index_.at(dataset_->dense_keys_[d]);
+              DCHECK(example_result.dense_values[d].dtype() ==
+                     dataset_->output_dtypes()[output_index])
+                  << "Got wrong type for FastParseExample return value " << d
+                  << " (expected "
+                  << DataTypeString(dataset_->output_dtypes()[output_index])
+                  << ", got "
+                  << DataTypeString(example_result.dense_values[d].dtype())
+                  << ").";
+              DCHECK(dataset_->output_shapes()[output_index].IsCompatibleWith(
+                  example_result.dense_values[d].shape()))
+                  << "Got wrong shape for FastParseExample return value " << d
+                  << " (expected "
+                  << dataset_->output_shapes()[output_index].DebugString()
+                  << ", got "
+                  << example_result.dense_values[d].shape().DebugString()
+                  << ").";
+              (*output)[output_index] = example_result.dense_values[d];
+            }
+            for (int d = 0; d < dataset_->sparse_keys_.size(); ++d) {
+              int output_index =
+                  dataset_->key_to_output_index_.at(dataset_->sparse_keys_[d]);
+              (*output)[output_index] =
+                  Tensor(ctx->allocator({}), DT_VARIANT, {3});
+              Tensor& serialized_sparse = (*output)[output_index];
+              auto serialized_sparse_t = serialized_sparse.vec<Variant>();
+              serialized_sparse_t(0) = example_result.sparse_indices[d];
+              serialized_sparse_t(1) = example_result.sparse_values[d];
+              serialized_sparse_t(2) = example_result.sparse_shapes[d];
+              DCHECK(serialized_sparse.dtype() ==
+                     dataset_->output_dtypes()[output_index])
+                  << "Got wrong type for FastParseExample return value " << d
+                  << " (expected "
+                  << DataTypeString(dataset_->output_dtypes()[output_index])
+                  << ", got " << DataTypeString(serialized_sparse.dtype())
+                  << ").";
+              DCHECK(dataset_->output_shapes()[output_index].IsCompatibleWith(
+                  serialized_sparse.shape()))
+                  << "Got wrong shape for FastParseExample return value " << d
+                  << " (expected "
+                  << dataset_->output_shapes()[output_index].DebugString()
+                  << ", got " << serialized_sparse.shape().DebugString()
+                  << ").";
+            }
+            // TODO(b/111553342): User provided tags instead of fixed tag.
+            if (stats_aggregator) {
+              stats_aggregator->IncrementCounter(
+                  "examples_count", "trainer",
+                  example_result.feature_stats.size());
+              for (example::PerExampleFeatureStats feature_stats :
+                   example_result.feature_stats) {
+                stats_aggregator->AddToHistogram(
+                    "features",
+                    {static_cast<double>(feature_stats.features_count)});
+                stats_aggregator->IncrementCounter(
+                    "features_count", "trainer", feature_stats.features_count);
+                stats_aggregator->IncrementCounter(
+                    "feature_values_count", "trainer",
+                    feature_stats.feature_values_count);
+                stats_aggregator->AddToHistogram(
+                    "feature-values",
+                    {static_cast<double>(feature_stats.feature_values_count)});
+              }
+            }
+          }
+          callback(s);
+        });
+      }
+
+     private:
+      const Dataset* dataset_;
+    };
+
     const DatasetBase* const input_;
     const std::vector<Tensor> dense_defaults_;
     const std::vector<string> sparse_keys_;
@@ -347,11 +373,13 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
     const std::vector<PartialTensorShape> dense_shapes_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const bool sloppy_;
   };
 
   const int graph_def_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
+  bool sloppy_;
   std::vector<string> sparse_keys_;
   std::vector<string> dense_keys_;
   DataTypeVector sparse_types_;
@@ -361,8 +389,9 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
   std::vector<std::size_t> elements_per_stride_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("ParseExampleDataset").Device(DEVICE_CPU),
-                        ParseExampleDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalParseExampleDataset").Device(DEVICE_CPU),
+    ParseExampleDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc b/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc
index 2c6179d9f5938d5ef413a83f2aad43fb96f67f47..af024520982106aead1b4bf3d09886bcc42d73d1 100644
--- a/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc
+++ b/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc
@@ -27,434 +27,6 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-struct BufferElement {
-  // The producer sets `status` if getting the input element fails.
-  Status status;
-  // The buffered data element.
-  std::vector<Tensor> value;
-};
-
-using FunctionBufferCallback = std::function<void(const BufferElement&)>;
-
-class FunctionBufferingResource : public ResourceBase {
- public:
-  FunctionBufferingResource(FunctionLibraryRuntime* lib,
-                            std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
-                            const NameAttrList& func, int64 buffer_size,
-                            const string& source_device,
-                            const string& target_device,
-                            const std::vector<Tensor>& func_args,
-                            const DataTypeVector& output_types)
-      : lib_(lib),
-        pflr_(std::move(pflr)),
-        func_(func),
-        buffer_size_(buffer_size),
-        source_device_(source_device),
-        target_device_(target_device),
-        func_args_(func_args),
-        output_types_(output_types),
-        handle_(kInvalidHandle),
-        is_buffering_(false),
-        end_of_sequence_(false),
-        cancelled_(false) {}
-
-  ~FunctionBufferingResource() override {
-    Cancel();
-  }
-
-  string DebugString() override {
-    return strings::StrCat("FunctionBufferingResource. Size: ", buffer_size_,
-                           "; target_device: ", target_device_);
-  }
-
-  // Instantiates the function the first time it's called. After that it caches
-  // the handle.
-  Status Instantiate() LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    // Re-use existing handle if it's been set, effectively caching it.
-    if (handle_ != kInvalidHandle) {
-      return Status::OK();
-    }
-    AttrValueMap attr_values = func_.attr();
-    FunctionLibraryRuntime::InstantiateOptions opts;
-    opts.target = target_device_;
-    return lib_->Instantiate(func_.name(), AttrSlice(&attr_values), opts,
-                             &handle_);
-  }
-
-  // Returns true if we've got to the end of the sequence and exhausted the
-  // buffer.
-  bool Finished() LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    return end_of_sequence_ && buffer_.empty();
-  }
-
-  // Cancels any buffering / prefetching going on.
-  void Cancel() LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    cancelled_ = true;
-    while (is_buffering_) {
-      cond_var_.wait(l);
-    }
-  }
-
-  // Cancels all pending operations and then clears out the state.
-  void Reset() LOCKS_EXCLUDED(mu_) {
-    Cancel();
-    mutex_lock l(mu_);
-    buffer_.clear();
-    requests_.clear();
-    is_buffering_ = false;
-    end_of_sequence_ = false;
-    cancelled_ = false;
-  }
-
-  // If the buffer has anything, runs `callback` on the first element in the
-  // buffer, else schedules the `callback` to be called. Requires `args` and
-  // `lib` in case more function calls need to be scheduled.
-  void MaybeGet(FunctionBufferCallback callback) LOCKS_EXCLUDED(mu_) {
-    bool start_buffering = false;
-    bool produced_output = false;
-    BufferElement buffer_element;
-    {
-      mutex_lock l(mu_);
-      if (!is_buffering_ && !end_of_sequence_) {
-        start_buffering = true;
-      }
-      if (!buffer_.empty()) {
-        produced_output = true;
-        std::swap(buffer_element, buffer_.front());
-        buffer_.pop_front();
-      } else {
-        produced_output = false;
-        requests_.push_back(std::move(callback));
-      }
-    }
-    if (produced_output) {
-      callback(buffer_element);
-    }
-    if (start_buffering) {
-      FillBuffer();
-    }
-  }
-
- private:
-  void FillBuffer() LOCKS_EXCLUDED(mu_) {
-    FunctionLibraryRuntime::Handle handle;
-    std::vector<FunctionBufferCallback> cancellation_callbacks;
-    std::vector<BufferElement> cancellation_buffer_elements;
-    bool cancelled = false;
-    {
-      mutex_lock l(mu_);
-      handle = handle_;
-      if (cancelled_) {
-        cancelled = true;
-        // Run through and fulfill all pending requests, if possible.
-        while (!requests_.empty()) {
-          if (!buffer_.empty()) {
-            cancellation_buffer_elements.push_back(std::move(buffer_.front()));
-            buffer_.pop_front();
-            cancellation_callbacks.push_back(std::move(requests_.front()));
-            requests_.pop_front();
-          } else {
-            LOG(ERROR) << "Buffer ran out of elements and we couldn't satisfy: "
-                       << requests_.size() << " requests";
-            break;
-          }
-        }
-        is_buffering_ = false;
-      } else {
-        is_buffering_ = true;
-      }
-    }
-    if (cancelled) {
-      for (int i = 0; i < cancellation_callbacks.size(); ++i) {
-        cancellation_callbacks[i](cancellation_buffer_elements[i]);
-      }
-      cond_var_.notify_all();
-      return;
-    }
-    FunctionLibraryRuntime::Options opts;
-    // Copied from CapturedFunction::generate_step_id();
-    opts.step_id = -std::abs(static_cast<int64>(random::New64()));
-    opts.source_device = source_device_;
-    AllocatorAttributes arg_alloc_attr;
-    arg_alloc_attr.set_on_host(true);
-    opts.args_alloc_attrs.push_back(arg_alloc_attr);
-    for (const auto& dtype : output_types_) {
-      AllocatorAttributes ret_alloc_attrs;
-      if (DataTypeAlwaysOnHost(dtype)) {
-        ret_alloc_attrs.set_on_host(true);
-      }
-      opts.rets_alloc_attrs.push_back(ret_alloc_attrs);
-    }
-    if (opts.source_device != target_device_) {
-      opts.remote_execution = true;
-    }
-    opts.create_rendezvous = true;
-    auto* rets = new std::vector<Tensor>;
-    lib_->Run(opts, handle, func_args_, rets,
-              [this, rets](const Status& status) {
-                FunctionBufferCallback callback = nullptr;
-                BufferElement buffer_front;
-                bool restart_buffering = false;
-                {
-                  mutex_lock l(mu_);
-                  BufferElement buffer_element;
-                  buffer_element.status = status;
-                  if (status.ok()) {
-                    buffer_element.value.swap(*rets);
-                  } else {
-                    end_of_sequence_ = true;
-                    is_buffering_ = false;
-                  }
-                  buffer_.push_back(std::move(buffer_element));
-                  if (!requests_.empty()) {
-                    buffer_front = std::move(buffer_.front());
-                    buffer_.pop_front();
-                    callback = std::move(requests_.front());
-                    requests_.pop_front();
-                  }
-                  if (buffer_.size() < buffer_size_ && !end_of_sequence_) {
-                    restart_buffering = true;
-                  } else {
-                    // When the buffer is full, we don't want to call
-                    // FillBuffer() unless we're in cancellation phase in which
-                    // case FillBuffer() will do the final cleanup post
-                    // cancellation.
-                    if (cancelled_) {
-                      restart_buffering = true;
-                    }
-                    is_buffering_ = false;
-                  }
-                }
-                if (callback != nullptr) {
-                  callback(buffer_front);
-                }
-                if (restart_buffering) {
-                  FillBuffer();
-                }
-              });
-  }
-
-  mutex mu_;
-  FunctionLibraryRuntime* lib_;
-  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  NameAttrList func_;
-  const int64 buffer_size_;
-  const string source_device_;
-  const string target_device_;
-  const std::vector<Tensor> func_args_;
-  const DataTypeVector output_types_;
-  FunctionLibraryRuntime::Handle handle_ GUARDED_BY(mu_);
-  std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
-  std::deque<FunctionBufferCallback> requests_ GUARDED_BY(mu_);
-  bool is_buffering_ GUARDED_BY(mu_);
-  bool end_of_sequence_ GUARDED_BY(mu_);
-  bool cancelled_ GUARDED_BY(mu_);
-  condition_variable cond_var_;
-};
-
-class FunctionBufferResourceHandleOp : public OpKernel {
- public:
-  explicit FunctionBufferResourceHandleOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx), flib_def_(nullptr) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &buffer_size_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-  }
-
-  ~FunctionBufferResourceHandleOp() override {
-    if (cinfo_.resource_is_private_to_kernel()) {
-      if (!cinfo_.resource_manager()
-               ->Delete<FunctionBufferingResource>(cinfo_.container(),
-                                                   cinfo_.name())
-               .ok()) {
-        // Do nothing; the resource can have been deleted by session resets.
-      }
-    }
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor* string_arg;
-    OP_REQUIRES_OK(ctx, ctx->input("string_arg", &string_arg));
-    std::vector<Tensor> func_args;
-    func_args.push_back(*string_arg);
-
-    const string& source_device = ctx->device()->name();
-
-    // Obtain and canonicalize target_device.
-    const Tensor* target_arg;
-    OP_REQUIRES_OK(ctx, ctx->input("target_device", &target_arg));
-    string target_device;
-    OP_REQUIRES_OK(ctx, DeviceNameUtils::CanonicalizeDeviceName(
-                            target_arg->scalar<string>()(), source_device,
-                            &target_device));
-
-    FunctionLibraryRuntime* lib = ctx->function_library();
-    OP_REQUIRES(ctx, lib != nullptr,
-                errors::Internal("No function library is provided."));
-
-    mutex_lock l(mu_);
-    if (!initialized_) {
-      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def()));
-      FunctionLibraryRuntime* clone_lib;
-      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr;
-      OP_REQUIRES_OK(ctx, lib->Clone(&flib_def_, &pflr, &clone_lib));
-      // Create the resource.
-      FunctionBufferingResource* buffer;
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->resource_manager()->LookupOrCreate<FunctionBufferingResource>(
-              cinfo_.container(), cinfo_.name(), &buffer,
-              [clone_lib, &pflr, &source_device, &target_device, func_args,
-               this](FunctionBufferingResource** ptr) {
-                *ptr = new FunctionBufferingResource(
-                    clone_lib, std::move(pflr), func_, buffer_size_,
-                    source_device, target_device, func_args, output_types_);
-                return Status::OK();
-              }));
-      core::ScopedUnref s(buffer);
-      OP_REQUIRES_OK(ctx, buffer->Instantiate());
-      initialized_ = true;
-    }
-
-    OP_REQUIRES_OK(ctx, MakeResourceHandleToOutput(
-                            ctx, 0, cinfo_.container(), cinfo_.name(),
-                            MakeTypeIndex<FunctionBufferingResource>()));
-  }
-
- private:
-  mutex mu_;
-  ContainerInfo cinfo_ GUARDED_BY(mu_);
-  bool initialized_ GUARDED_BY(mu_) = false;
-  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
-  NameAttrList func_;
-  int64 buffer_size_;
-  string container_;
-  string name_;
-  DataTypeVector output_types_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResource")
-                            .Device(DEVICE_CPU)
-                            .HostMemory("resource")
-                            .HostMemory("string_arg")
-                            .HostMemory("target_device"),
-                        FunctionBufferResourceHandleOp);
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResource")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("resource")
-                            .HostMemory("string_arg")
-                            .HostMemory("target_device"),
-                        FunctionBufferResourceHandleOp);
-#if TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResource")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("resource")
-                            .HostMemory("string_arg")
-                            .HostMemory("target_device"),
-                        FunctionBufferResourceHandleOp);
-#endif  // TENSORFLOW_USE_SYCL
-
-// Prefetches and fills up a buffer by calling a function that provides the
-// elements to buffer.
-class FunctionBufferingResourceGetNextOp : public AsyncOpKernel {
- public:
-  explicit FunctionBufferingResourceGetNextOp(OpKernelConstruction* ctx)
-      : AsyncOpKernel(ctx) {}
-
-  ~FunctionBufferingResourceGetNextOp() override {}
-
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    ResourceHandle handle;
-    OP_REQUIRES_OK_ASYNC(
-        ctx, HandleFromInput(ctx, "function_buffer_resource", &handle), done);
-    FunctionBufferingResource* buffer = nullptr;
-    OP_REQUIRES_OK_ASYNC(
-        ctx, LookupResource<FunctionBufferingResource>(ctx, handle, &buffer),
-        done);
-
-    if (buffer->Finished()) {
-      buffer->Unref();
-      ctx->SetStatus(errors::OutOfRange("end_of_sequence"));
-      done();
-      return;
-    }
-
-    FunctionBufferCallback callback =
-        [ctx, buffer, done](const BufferElement& buffer_element) {
-          Status s = buffer_element.status;
-          if (!s.ok()) {
-            ctx->SetStatus(s);
-            buffer->Unref();
-            done();
-            return;
-          }
-          for (size_t i = 0; i < buffer_element.value.size(); ++i) {
-            ctx->set_output(i, buffer_element.value[i]);
-          }
-          buffer->Unref();
-          done();
-        };
-    buffer->MaybeGet(std::move(callback));
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceGetNext")
-                            .Device(DEVICE_CPU)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceGetNextOp);
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceGetNext")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceGetNextOp);
-#if TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceGetNext")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceGetNextOp);
-#endif  // TENSORFLOW_USE_SYCL
-
-// Resets the FunctionBufferingResource, cancelling all pending requests and
-// clearing out the buffer.
-class FunctionBufferingResourceResetOp : public OpKernel {
- public:
-  explicit FunctionBufferingResourceResetOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {}
-
-  ~FunctionBufferingResourceResetOp() override {}
-
-  void Compute(OpKernelContext* ctx) override {
-    ResourceHandle handle;
-    OP_REQUIRES_OK(ctx,
-                   HandleFromInput(ctx, "function_buffer_resource", &handle));
-    FunctionBufferingResource* buffer = nullptr;
-    OP_REQUIRES_OK(
-        ctx, LookupResource<FunctionBufferingResource>(ctx, handle, &buffer));
-    core::ScopedUnref s(buffer);
-
-    buffer->Reset();
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceReset")
-                            .Device(DEVICE_CPU)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceResetOp);
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceReset")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceResetOp);
-#if TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceReset")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceResetOp);
-#endif  // TENSORFLOW_USE_SYCL
-
 class IteratorGetDeviceOp : public OpKernel {
  public:
   using OpKernel::OpKernel;
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
similarity index 89%
rename from tensorflow/core/kernels/data/random_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/random_dataset_op.cc
index 044a791a3fa514efaeefb7a39def1a5907ff9082..6d85cd5c450640a0042add2ead26836433166ade 100644
--- a/tensorflow/core/kernels/data/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
@@ -24,7 +24,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class RandomDatasetOp : public DatasetOpKernel {
@@ -76,6 +76,8 @@ class RandomDatasetOp : public DatasetOpKernel {
                              ")::Dataset");
     }
 
+    int64 Cardinality() const override { return kInfiniteCardinality; }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -100,14 +102,19 @@ class RandomDatasetOp : public DatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
-        Tensor value_tensor(ctx->allocator({}), DT_INT64, {});
-        value_tensor.scalar<int64>()() = Random();
-        out_tensors->emplace_back(std::move(value_tensor));
+        out_tensors->emplace_back(ctx->allocator({}), DT_INT64,
+                                  TensorShape({}));
+        out_tensors->back().scalar<int64>()() = Random();
         *end_of_sequence = false;
         return Status::OK();
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("num_random_samples"),
@@ -147,7 +154,7 @@ class RandomDatasetOp : public DatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("RandomDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("ExperimentalRandomDataset").Device(DEVICE_CPU),
                         RandomDatasetOp);
 
 }  // namespace
diff --git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
similarity index 82%
rename from tensorflow/core/kernels/data/scan_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
index 2a911aa368090b3bcf46204e540bcb8cd0fb1e6c..0d9a629a27f907fca2214a574db1ea0074a9ed2e 100644
--- a/tensorflow/core/kernels/data/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
@@ -16,17 +16,17 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class ScanDatasetOp : public UnaryDatasetOpKernel {
@@ -37,6 +37,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("Tstate", &state_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -53,7 +55,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     *output = new Dataset(ctx, input, func_, std::move(initial_state),
                           std::move(captured_func), state_types_, output_types_,
-                          output_shapes_);
+                          output_shapes_, preserve_cardinality_);
   }
 
  private:
@@ -64,7 +66,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& state_types,
             const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
+            const std::vector<PartialTensorShape>& output_shapes,
+            bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -72,7 +75,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
           captured_func_(std::move(captured_func)),
           state_types_(state_types),
           output_types_(output_types),
-          output_shapes_(output_shapes) {
+          output_shapes_(output_shapes),
+          preserve_cardinality_(preserve_cardinality) {
       input_->Ref();
     }
 
@@ -93,6 +97,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "ScanDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -123,12 +129,15 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
       b->BuildAttrValue(state_types_, &state_types);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
       TF_RETURN_IF_ERROR(
           b->AddDataset(this, {{0, input_node}},
                         {{1, initial_state_nodes}, {2, other_arguments}},
                         {{"f", f},
                          {"Tstate", state_types},
-                         {"Targuments", other_arguments_types_attr}},
+                         {"Targuments", other_arguments_types_attr},
+                         {"preserve_cardinality", preserve_cardinality_attr}},
                         output));
       return Status::OK();
     }
@@ -143,7 +152,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -168,8 +178,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
         state_and_output.reserve(dataset()->state_types_.size() +
                                  output_dtypes().size());
 
-        Status s = dataset()->captured_func_->Run(ctx, std::move(args),
-                                                  &state_and_output);
+        Status s = instantiated_captured_func_->Run(ctx, std::move(args),
+                                                    &state_and_output);
         if (s.ok()) {
           state_.clear();
           size_t i = 0;
@@ -202,15 +212,30 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
             out_tensors->push_back(std::move(state_and_output[i]));
           }
         } else if (errors::IsOutOfRange(s)) {
-          // `f` may deliberately raise `errors::OutOfRange` to indicate
-          // that we should terminate the iteration early.
-          *end_of_sequence = true;
-          return Status::OK();
+          if (dataset()->preserve_cardinality_) {
+            // To guarantee that the transformation preserves the cardinality of
+            // the dataset, we convert `OutOfRange` to `InvalidArgument` as the
+            // former may be interpreted by a caller as the end of sequence.
+            return errors::InvalidArgument(
+                "Function invocation produced OutOfRangeError: ",
+                s.error_message());
+          } else {
+            // `f` may deliberately raise `errors::OutOfRange` to indicate
+            // that we should terminate the iteration early.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
         }
         return s;
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -246,6 +271,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
       mutex mu_;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::vector<Tensor> state_ GUARDED_BY(mu_);
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
@@ -255,15 +281,18 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
     const DataTypeVector state_types_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const bool preserve_cardinality_;
   };
 
   DataTypeVector state_types_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
+  bool preserve_cardinality_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("ScanDataset").Device(DEVICE_CPU), ScanDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalScanDataset").Device(DEVICE_CPU),
+                        ScanDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fe128005faca9bd986e7c85600f7f871ebb97a25
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
@@ -0,0 +1,213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+class StatsAggregatorWithTagAndPrefix : public StatsAggregator {
+ public:
+  StatsAggregatorWithTagAndPrefix(
+      std::shared_ptr<StatsAggregator> stats_aggregator, const string& tag,
+      const string& prefix)
+      : wrapped_(stats_aggregator), tag_(tag), prefix_(prefix) {}
+
+  void AddToHistogram(const string& name,
+                      gtl::ArraySlice<double> values) override {
+    if (!tag_.empty()) {
+      wrapped_->AddToHistogram(strings::StrCat(tag_, "_", name), values);
+    } else {
+      wrapped_->AddToHistogram(name, values);
+    }
+  }
+
+  void AddScalar(const string& name, float value) override {
+    if (!tag_.empty()) {
+      wrapped_->AddScalar(strings::StrCat(tag_, "_", name), value);
+    } else {
+      wrapped_->AddScalar(name, value);
+    }
+  }
+
+  void EncodeToProto(Summary* out_summary) override {
+    wrapped_->EncodeToProto(out_summary);
+  }
+
+  void IncrementCounter(const string& name, const string& label,
+                        int64 val) override {
+    if (!prefix_.empty()) {
+      wrapped_->IncrementCounter(strings::StrCat(prefix_, "/", name), label,
+                                 val);
+    } else {
+      wrapped_->IncrementCounter(strings::StrCat("/tensorflow/", name), label,
+                                 val);
+    }
+  }
+
+ private:
+  std::shared_ptr<StatsAggregator> wrapped_;
+  string tag_;
+  string prefix_;
+  TF_DISALLOW_COPY_AND_ASSIGN(StatsAggregatorWithTagAndPrefix);
+};
+
+class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit SetStatsAggregatorDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    StatsAggregatorResource* stats_aggregator_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
+                                       &stats_aggregator_resource));
+    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
+    string tag;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
+    string prefix;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "counter_prefix", &prefix));
+
+    *output = new Dataset(ctx, input, ctx->input(1), stats_aggregator_resource,
+                          tag, prefix);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input,
+                     const Tensor& resource_handle,
+                     StatsAggregatorResource* stats_aggregator_resource,
+                     const string& tag, const string& prefix)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          resource_handle_(resource_handle),
+          stats_aggregator_resource_(stats_aggregator_resource),
+          tag_(tag),
+          prefix_(prefix) {
+      input_->Ref();
+      stats_aggregator_resource_->Ref();
+    }
+
+    ~Dataset() override {
+      input_->Unref();
+      stats_aggregator_resource_->Unref();
+    }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::SetStatsAggregator")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return "SetStatsAggregatorDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* resource_handle_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddTensor(resource_handle_, &resource_handle_node));
+      Node* tag_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
+      Node* prefix_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(prefix_, &prefix_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, resource_handle_node, tag_node, prefix_node},
+          output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        StatsAggregatorResource* stats_aggregator_resource =
+            dataset()->stats_aggregator_resource_;
+        IteratorContext::Params params(ctx);
+        params.stats_aggregator = std::shared_ptr<StatsAggregator>(
+            new StatsAggregatorWithTagAndPrefix(
+                stats_aggregator_resource->stats_aggregator(), dataset()->tag_,
+                dataset()->prefix_));
+        IteratorContext iter_ctx(std::move(params));
+        return input_impl_->GetNext(&iter_ctx, out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        return errors::Unimplemented(dataset()->DebugString(),
+                                     " does not support checkpointing");
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        return errors::Unimplemented(dataset()->DebugString(),
+                                     " does not support checkpointing");
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const Tensor resource_handle_;
+    StatsAggregatorResource* stats_aggregator_resource_;
+    string tag_;
+    string prefix_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalSetStatsAggregatorDataset").Device(DEVICE_CPU),
+    SetStatsAggregatorDatasetOp);
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2fb8ac4f33b1e844bb39cc70a47ccb15424ace7
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class SleepDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  using UnaryDatasetOpKernel::UnaryDatasetOpKernel;
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 sleep_microseconds;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "sleep_microseconds",
+                                                   &sleep_microseconds));
+
+    OP_REQUIRES(ctx, sleep_microseconds >= 0,
+                errors::InvalidArgument("`sleep_microseconds` must be >= 0"));
+
+    *output = new Dataset(ctx, input, sleep_microseconds);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            int64 sleep_microseconds)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          sleep_microseconds_(sleep_microseconds) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Sleep")});
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override { return "SleepDatasetOp::Dataset"; }
+
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+
+      Node* sleep_microseconds = nullptr;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(sleep_microseconds_, &sleep_microseconds));
+
+      return b->AddDataset(this,
+                           {{0, input_graph_node},
+                            {1, sleep_microseconds}},  // Single tensor inputs.
+                           {},                         // Tensor list inputs.
+                           {},                         // Attrs
+                           output);
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        RecordStop(ctx);
+        ctx->env()->SleepForMicroseconds(dataset()->sleep_microseconds_);
+        RecordStart(ctx);
+        return input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        return SaveInput(writer, input_impl_);
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        return RestoreInput(ctx, reader, input_impl_);
+      }
+
+     private:
+      std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    // TODO(b/117612213): Investigate autotuning for this value.
+    const int64 sleep_microseconds_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("ExperimentalSleepDataset").Device(DEVICE_CPU),
+                        SleepDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1ce4fbd3136d7fbd245fbb920ff658c4eae794c6
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
@@ -0,0 +1,312 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <deque>
+#include <vector>
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/batch_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class SlidingWindowDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit SlidingWindowDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 window_size = 0;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "window_size", &window_size));
+    OP_REQUIRES(
+        ctx, window_size > 0,
+        errors::InvalidArgument("Window size must be greater than zero."));
+    int64 window_shift = 0;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "window_shift", &window_shift));
+    OP_REQUIRES(
+        ctx, window_shift > 0,
+        errors::InvalidArgument("Window shift must be greater than zero."));
+    int64 window_stride = 0;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "window_stride", &window_stride));
+    OP_REQUIRES(
+        ctx, window_stride > 0,
+        errors::InvalidArgument("window_stride must be greater than zero."));
+    if (window_size == window_shift && window_stride == 1) {
+      LOG(WARNING) << "window_shift: " << window_shift
+                   << " is equal to window_size: " << window_size
+                   << " and window_stride is 1, use `batch` instead.";
+    }
+    *output = new Dataset(ctx, window_size, window_shift, window_stride, input);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, int64 window_size, int64 window_shift,
+            int64 window_stride, const DatasetBase* input)
+        : DatasetBase(DatasetContext(ctx)),
+          window_size_(window_size),
+          window_shift_(window_shift),
+          window_stride_(window_stride),
+          input_(input) {
+      input_->Ref();
+
+      const auto& input_shapes = input_->output_shapes();
+      output_shapes_.reserve(input_shapes.size());
+      for (const auto& input_shape : input_shapes) {
+        output_shapes_.emplace_back(
+            PartialTensorShape({-1}).Concatenate(input_shape));
+      }
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          Iterator::Params{this, strings::StrCat(prefix, "::Slide")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return strings::StrCat("SlidingWindowDatasetOp(", window_size_, ", ",
+                             window_shift_, ", ", window_stride_, ")::Dataset");
+    }
+
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / window_shift_;
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* window_size = nullptr;
+      Node* window_shift = nullptr;
+      Node* window_stride = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(window_size_, &window_size));
+      TF_RETURN_IF_ERROR(b->AddScalar(window_shift_, &window_shift));
+      TF_RETURN_IF_ERROR(b->AddScalar(window_stride_, &window_stride));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, window_size, window_shift, window_stride},
+          output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        const int64 window_size = dataset()->window_size_;
+        const int64 window_shift = dataset()->window_shift_;
+        const int64 window_stride = dataset()->window_stride_;
+        std::vector<std::vector<Tensor>> batch_elements;
+        {
+          mutex_lock l(mu_);
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          batch_elements.reserve(window_size);
+
+          // Fill up buffer.
+          size_t target_size = TargetBufferSize(window_size, window_stride);
+          *end_of_sequence = false;
+          for (size_t i = buffer_.size(); i < target_size && !*end_of_sequence;
+               ++i) {
+            std::vector<Tensor> element;
+            TF_RETURN_IF_ERROR(
+                input_impl_->GetNext(ctx, &element, end_of_sequence));
+            if (!*end_of_sequence) {
+              buffer_.push_back(std::move(element));
+            } else {
+              input_impl_.reset();
+            }
+          }
+
+          // Drop the final smaller batch.
+          if (buffer_.size() < target_size) {
+            DCHECK(*end_of_sequence);
+            return Status::OK();
+          }
+
+          for (size_t i = 0; i < window_size; ++i) {
+            batch_elements.emplace_back(buffer_[window_stride * i]);
+          }
+
+          // Drop the data before the next iteration.
+          if (window_shift >= buffer_.size()) {
+            for (size_t i = buffer_.size(); i < window_shift; ++i) {
+              bool end_of_input;
+              std::vector<Tensor> element;
+              TF_RETURN_IF_ERROR(
+                  input_impl_->GetNext(ctx, &element, &end_of_input));
+              if (end_of_input) {
+                input_impl_.reset();
+                break;
+              }
+            }
+            buffer_.clear();
+          } else {
+            buffer_.erase(buffer_.begin(), buffer_.begin() + window_shift);
+          }
+        }
+
+        // Construct output tensors.
+        const size_t num_tuple_components = batch_elements[0].size();
+        const int64 num_batch_elements = batch_elements.size();
+        for (size_t component_index = 0; component_index < num_tuple_components;
+             ++component_index) {
+          const Tensor& first_element = batch_elements[0][component_index];
+          TensorShape batch_component_shape({num_batch_elements});
+          batch_component_shape.AppendShape(first_element.shape());
+          out_tensors->emplace_back(ctx->allocator({}), first_element.dtype(),
+                                    batch_component_shape);
+          Tensor& batch_component = out_tensors->back();
+          // Build the output tuple component by copying one slice
+          // from each input element in the batch.
+          for (size_t i = 0; i < num_batch_elements; ++i) {
+            if (batch_elements[i][component_index].shape() !=
+                first_element.shape()) {
+              return errors::InvalidArgument(
+                  "Cannot batch tensors with different shapes in component ",
+                  component_index, ". First element had shape ",
+                  first_element.shape().DebugString(), " and element ", i,
+                  " had shape ",
+                  batch_elements[i][component_index].shape().DebugString(),
+                  ".");
+            }
+            TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
+                std::move(batch_elements[i][component_index]), &batch_component,
+                i));
+          }
+        }
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         dataset()->window_shift_);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (!input_impl_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        } else {
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        }
+        // Save buffer.
+        TF_RETURN_IF_ERROR(writer->WriteScalar(strings::StrCat("buffer_size"),
+                                               buffer_.size()));
+        for (int64 i = 0; i < buffer_.size(); i++) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              strings::StrCat("buffer[", i, "]_size"), buffer_[i].size()));
+          for (int64 j = 0; j < buffer_[i].size(); j++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                strings::StrCat("buffer[", i, "][", j, "]"), buffer_[i][j]));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        // Restore buffer.
+        int64 buffer_size;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(strings::StrCat("buffer_size"), &buffer_size));
+        buffer_.resize(buffer_size);
+        for (int64 i = 0; i < buffer_size; i++) {
+          int64 vector_size;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              strings::StrCat("buffer[", i, "]_size"), &vector_size));
+          buffer_[i].resize(vector_size);
+          for (int64 j = 0; j < vector_size; j++) {
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                strings::StrCat("buffer[", i, "][", j, "]"), &buffer_[i][j]));
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      size_t TargetBufferSize(int64 window_size, int64 window_stride) {
+        return (window_size - 1) * window_stride + 1;
+      }
+
+      mutex mu_;
+      std::deque<std::vector<Tensor>> buffer_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const int64 window_size_;
+    const int64 window_shift_;
+    const int64 window_stride_;
+    const DatasetBase* const input_;
+    std::vector<PartialTensorShape> output_shapes_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalSlidingWindowDataset").Device(DEVICE_CPU),
+    SlidingWindowDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/sql/BUILD b/tensorflow/core/kernels/data/experimental/sql/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..464410124e970a5bd54b93acb88810672161f5ad
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/sql/BUILD
@@ -0,0 +1,26 @@
+# Description:
+#   SQL library.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "sql",
+    srcs = [
+        "driver_manager.cc",
+        "sqlite_query_connection.cc",
+    ],
+    hdrs = [
+        "driver_manager.h",
+        "query_connection.h",
+        "sqlite_query_connection.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/lib/db:sqlite",
+    ],
+)
diff --git a/tensorflow/core/kernels/data/sql/driver_manager.cc b/tensorflow/core/kernels/data/experimental/sql/driver_manager.cc
similarity index 88%
rename from tensorflow/core/kernels/data/sql/driver_manager.cc
rename to tensorflow/core/kernels/data/experimental/sql/driver_manager.cc
index 783d1e6cb28fdd3f2e42caecc300ba9bd8b22c04..58174f69a44a5e28dd2d4fd018ee45688d407054 100644
--- a/tensorflow/core/kernels/data/sql/driver_manager.cc
+++ b/tensorflow/core/kernels/data/experimental/sql/driver_manager.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/sql/driver_manager.h"
-#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h"
 
 namespace tensorflow {
 namespace data {
diff --git a/tensorflow/core/kernels/data/sql/driver_manager.h b/tensorflow/core/kernels/data/experimental/sql/driver_manager.h
similarity index 81%
rename from tensorflow/core/kernels/data/sql/driver_manager.h
rename to tensorflow/core/kernels/data/experimental/sql/driver_manager.h
index c5428f396b03f03390f53b6a2e50fca3821dac0c..6afadf91a478e5da470897c3aa2977462337b5e5 100644
--- a/tensorflow/core/kernels/data/sql/driver_manager.h
+++ b/tensorflow/core/kernels/data/experimental/sql/driver_manager.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
 
-#include "tensorflow/core/kernels/data/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
 
 namespace tensorflow {
 namespace data {
@@ -38,4 +38,4 @@ class DriverManager {
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
diff --git a/tensorflow/core/kernels/data/sql/query_connection.h b/tensorflow/core/kernels/data/experimental/sql/query_connection.h
similarity index 92%
rename from tensorflow/core/kernels/data/sql/query_connection.h
rename to tensorflow/core/kernels/data/experimental/sql/query_connection.h
index 2fd229a9bfd4dd4f6e49eaa2452dbd9140050523..10c66436792a9794112a38a4a590e2e9fc3c05c5 100644
--- a/tensorflow/core/kernels/data/sql/query_connection.h
+++ b/tensorflow/core/kernels/data/experimental/sql/query_connection.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
 
 #include "tensorflow/core/framework/tensor.h"
 
@@ -67,4 +67,4 @@ class QueryConnection {
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc
similarity index 93%
rename from tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
rename to tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc
index 5108e839761433eaa357e6565f012e567e47c8a7..cadceee8f516c08a45b63702aa321944e8f0a21e 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
+++ b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h"
 
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace tensorflow {
@@ -58,9 +58,8 @@ Status SqliteQueryConnection::GetNext(IteratorContext* ctx,
     for (int i = 0; i < column_count_; i++) {
       DataType dt = output_types_[i];
       // TODO(mrry): Pass in the `IteratorContext::allocator()`.
-      Tensor tensor(ctx->allocator({}), dt, {});
-      FillTensorWithResultSetEntry(dt, i, &tensor);
-      out_tensors->emplace_back(std::move(tensor));
+      out_tensors->emplace_back(ctx->allocator({}), dt, TensorShape({}));
+      FillTensorWithResultSetEntry(dt, i, &out_tensors->back());
     }
   }
   return Status::OK();
@@ -107,7 +106,7 @@ void SqliteQueryConnection::FillTensorWithResultSetEntry(
       break;
     // Error preemptively thrown by SqlDatasetOp::MakeDataset in this case.
     default:
-      LOG(FATAL)
+      LOG(ERROR)
           << "Use of unsupported TensorFlow data type by 'SqlQueryConnection': "
           << DataTypeString(data_type) << ".";
   }
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
similarity index 84%
rename from tensorflow/core/kernels/data/sql/sqlite_query_connection.h
rename to tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
index 175492c49dba512f602c7153f1ab66ba6427aa3d..61df29065e15281067ec0fbcb499d382b0ba73f8 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
+++ b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_
 
 #include <memory>
 
-#include "tensorflow/core/kernels/data/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -53,4 +53,4 @@ class SqliteQueryConnection : public QueryConnection {
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c16d8ed02ccdfb01a41ff9206a003f4a8c04a667
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
@@ -0,0 +1,222 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <utility>
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/experimental/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
+#include "tensorflow/core/lib/io/inputbuffer.h"
+#include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following ops.
+
+class SqlDatasetOp : public DatasetOpKernel {
+ public:
+  explicit SqlDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    for (const DataType& dt : output_types_) {
+      OP_REQUIRES(ctx,
+                  dt == DT_STRING || dt == DT_INT8 || dt == DT_INT16 ||
+                      dt == DT_INT32 || dt == DT_INT64 || dt == DT_UINT8 ||
+                      dt == DT_UINT16 || dt == DT_BOOL || dt == DT_DOUBLE,
+                  errors::InvalidArgument(
+                      "Each element of `output_types_` must be one of: "
+                      "DT_STRING, DT_INT8, DT_INT16, DT_INT32, DT_INT64, "
+                      "DT_UINT8, DT_UINT16, DT_BOOL, DT_DOUBLE "));
+    }
+    for (const PartialTensorShape& pts : output_shapes_) {
+      OP_REQUIRES(ctx, pts.dims() == 0,
+                  errors::InvalidArgument(
+                      "Each element of `output_shapes_` must be a scalar."));
+    }
+  }
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    string driver_name;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<string>(ctx, "driver_name", &driver_name));
+
+    string data_source_name;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "data_source_name",
+                                                    &data_source_name));
+
+    string query;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "query", &query));
+
+    // TODO(b/64276826) Change this check when we add support for other
+    // databases.
+    OP_REQUIRES(ctx, driver_name == "sqlite",
+                errors::InvalidArgument(tensorflow::strings::Printf(
+                    "The database type, %s, is not supported by SqlDataset. "
+                    "The set of supported databases is: {'sqlite'}.",
+                    driver_name.c_str())));
+
+    *output = new Dataset(ctx, driver_name, data_source_name, query,
+                          output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const string& driver_name,
+            const string& data_source_name, const string& query,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : DatasetBase(DatasetContext(ctx)),
+          driver_name_(driver_name),
+          data_source_name_(data_source_name),
+          query_(query),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {}
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Sql")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override { return "SqlDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* driver_name_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(driver_name_, &driver_name_node));
+      Node* data_source_name_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(data_source_name_, &data_source_name_node));
+      Node* query_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(query_, &query_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {driver_name_node, data_source_name_node, query_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+      ~Iterator() override {
+        if (query_connection_initialized_) {
+          Status s = query_connection_->Close();
+          if (!s.ok()) {
+            LOG(WARNING) << "Failed to close query connection: " << s;
+          }
+        }
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (!query_connection_initialized_) {
+          TF_RETURN_IF_ERROR(InitializeQueryConnection());
+        }
+        next_calls_++;
+        return query_connection_->GetNext(ctx, out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (query_connection_initialized_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("next_calls"), next_calls_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("next_calls"))) {
+          TF_RETURN_IF_ERROR(InitializeQueryConnection());
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("next_calls"), &next_calls_));
+          int64 rem_next_calls = next_calls_;
+          std::vector<Tensor> out_tensors;
+          bool end_of_sequence = false;
+          while (rem_next_calls--) {
+            TF_RETURN_IF_ERROR(query_connection_->GetNext(ctx, &out_tensors,
+                                                          &end_of_sequence));
+            out_tensors.clear();
+          }
+        } else {
+          query_connection_initialized_ = false;
+        }
+        return Status::OK();
+      }
+
+     private:
+      Status InitializeQueryConnection() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        query_connection_initialized_ = true;
+        query_connection_ =
+            sql::DriverManager::CreateQueryConnection(dataset()->driver_name_);
+        Status s = query_connection_->Open(dataset()->data_source_name_,
+                                           dataset()->query_,
+                                           dataset()->output_types_);
+        next_calls_ = 0;
+        if (!s.ok()) {
+          LOG(WARNING) << "Failed to connect to database: " << s;
+          return s;
+        }
+        return Status::OK();
+      }
+
+      mutex mu_;
+      // TODO(shivaniagrawal): explore ways to seek into a SQLite databases.
+      int64 next_calls_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<sql::QueryConnection> query_connection_ GUARDED_BY(mu_);
+      bool query_connection_initialized_ GUARDED_BY(mu_) = false;
+    };
+    const string driver_name_;
+    const string data_source_name_;
+    const string query_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ExperimentalSqlDataset").Device(DEVICE_CPU),
+                        SqlDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
similarity index 95%
rename from tensorflow/core/kernels/data/stats_aggregator_ops.cc
rename to tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
index 2d5146761631f8ed28ebcafac9fd670da9e3b47d..894465e1814cf93b02ecbbb053494d4c032fe243 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
@@ -141,10 +141,12 @@ class StatsAggregatorSummaryOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("StatsAggregatorHandle").Device(DEVICE_CPU),
-                        StatsAggregatorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("StatsAggregatorSummary").Device(DEVICE_CPU),
-                        StatsAggregatorSummaryOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalStatsAggregatorHandle").Device(DEVICE_CPU),
+    StatsAggregatorHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalStatsAggregatorSummary").Device(DEVICE_CPU),
+    StatsAggregatorSummaryOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
similarity index 90%
rename from tensorflow/core/kernels/data/stats_dataset_ops.cc
rename to tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
index e9e42f05a193577fd07d1e830726d00ccfe3350d..1961f25df846e8773bf6b0266d089c9d3bac355b 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #include "tensorflow/core/example/example.pb.h"
 #include "tensorflow/core/example/feature.pb.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
@@ -78,6 +78,8 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
       return "LatencyStatsDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -116,6 +118,12 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -180,6 +188,8 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
       return "BytesProducedStatsDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -220,6 +230,12 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -243,10 +259,12 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("LatencyStatsDataset").Device(DEVICE_CPU),
-                        LatencyStatsDatasetOp);
-REGISTER_KERNEL_BUILDER(Name("BytesProducedStatsDataset").Device(DEVICE_CPU),
-                        BytesProducedStatsDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalLatencyStatsDataset").Device(DEVICE_CPU),
+    LatencyStatsDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalBytesProducedStatsDataset").Device(DEVICE_CPU),
+    BytesProducedStatsDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index 8d561ca0e331eeed5712d34a61cfbecbc9505cf5..8ae45ed5c9d9fe199ef392a1430f359172ec5c73 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <memory>
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -47,6 +49,8 @@ class ThreadPoolResource : public ResourceBase {
     }
   }
 
+  int32 NumThreads() { return thread_pool_.NumThreads(); }
+
   string DebugString() override { return "ThreadPoolResource"; }
 
  private:
@@ -127,16 +131,17 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
                                        &threadpool_resource));
     core::ScopedUnref unref_iterator(threadpool_resource);
 
-    *output = new Dataset(ctx, input, threadpool_resource);
+    *output = new Dataset(ctx, input, ctx->input(1), threadpool_resource);
   }
 
  private:
   class Dataset : public DatasetBase {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            ThreadPoolResource* threadpool)
+            const Tensor& resource_handle, ThreadPoolResource* threadpool)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
+          resource_handle_(resource_handle),
           threadpool_(threadpool) {
       input_->Ref();
       threadpool_->Ref();
@@ -164,12 +169,19 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
       return "ThreadPoolDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      return errors::Unimplemented("%s does not support serialization",
-                                   DebugString());
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* resource_handle_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddTensor(resource_handle_, &resource_handle_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, resource_handle_node}, output));
+      return Status::OK();
     }
 
    private:
@@ -179,36 +191,263 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        return dataset()->input_->MakeIterator(
+            IteratorContext(CreateParams(ctx)), prefix(), &input_impl_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
+        return input_impl_->GetNext(IteratorContext(CreateParams(ctx)),
+                                    out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+     private:
+      IteratorContext::Params CreateParams(IteratorContext* ctx) {
         ThreadPoolResource* pool = dataset()->threadpool_;
-        IteratorContext::Params params;
-        params.env = ctx->env();
+        IteratorContext::Params params(ctx);
         params.runner = [pool](std::function<void()> c) {
           pool->Schedule(std::move(c));
         };
-        params.stats_aggregator = ctx->stats_aggregator();
-        params.lib = ctx->lib();
-        params.function_library = ctx->function_library();
-        params.allocator_getter = ctx->allocator_getter();
-        IteratorContext threadpool_ctx(params);
-        return input_impl_->GetNext(&threadpool_ctx, out_tensors,
-                                    end_of_sequence);
+        params.runner_threadpool_size = pool->NumThreads();
+        return params;
       }
 
-     private:
       std::unique_ptr<IteratorBase> input_impl_;
     };
 
     const DatasetBase* const input_;
+    const Tensor resource_handle_;
     ThreadPoolResource* const threadpool_;
   };
 };
 
+class MaxIntraOpParallelismDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit MaxIntraOpParallelismDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 max_intra_op_parallelism;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "max_intra_op_parallelism",
+                                              &max_intra_op_parallelism));
+    OP_REQUIRES(
+        ctx, max_intra_op_parallelism >= 0,
+        errors::InvalidArgument("`max_intra_op_parallelism` must be >= 0"));
+    *output = new Dataset(ctx, input, max_intra_op_parallelism);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            int64 max_intra_op_parallelism)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          max_intra_op_parallelism_(max_intra_op_parallelism) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::MaxIntraOpParallelism")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return "MaxIntraOpParallelismDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* max_intra_op_parallelism_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(max_intra_op_parallelism_,
+                                      &max_intra_op_parallelism_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, max_intra_op_parallelism_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        IteratorContext::Params params(ctx);
+        auto max_parallelism = dataset()->max_intra_op_parallelism_;
+        params.runner = std::bind(
+            [max_parallelism](
+                const std::function<void(std::function<void()>)>& runner,
+                std::function<void()> fn) {
+              std::function<void()> scoped_fn = std::bind(
+                  [max_parallelism](const std::function<void()>& fn) {
+                    ScopedPerThreadMaxParallelism scope(max_parallelism);
+                    fn();
+                  },
+                  std::move(fn));
+              (runner)(std::move(scoped_fn));
+            },
+            std::move(*ctx->runner()), std::placeholders::_1);
+        return input_impl_->GetNext(IteratorContext{std::move(params)},
+                                    out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+     private:
+      std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    const int64 max_intra_op_parallelism_;
+  };
+};
+
+class PrivateThreadPoolDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit PrivateThreadPoolDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 num_threads;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "num_threads", &num_threads));
+    OP_REQUIRES(ctx, num_threads >= 1,
+                errors::InvalidArgument("`num_threads` must be >= 1"));
+    *output = new Dataset(ctx, input, num_threads);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int num_threads)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          num_threads_(num_threads) {
+      thread_pool_ = MakeUnique<thread::ThreadPool>(
+          ctx->env(), ThreadOptions{}, "data_private_threadpool", num_threads,
+          /*low_latency_hint=*/false);
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::PrivateThreadPool")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return "PrivateThreadPoolDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* num_threads_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(num_threads_, &num_threads_node));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, num_threads_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        thread::ThreadPool* pool = dataset()->thread_pool_.get();
+        IteratorContext::Params params(ctx);
+        params.runner = [pool](std::function<void()> c) {
+          pool->Schedule(std::move(c));
+        };
+        params.runner_threadpool_size = dataset()->num_threads_;
+        return input_impl_->GetNext(IteratorContext{std::move(params)},
+                                    out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+     private:
+      std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    const int64 num_threads_;
+    std::unique_ptr<thread::ThreadPool> thread_pool_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalMaxIntraOpParallelismDataset").Device(DEVICE_CPU),
+    MaxIntraOpParallelismDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalPrivateThreadPoolDataset").Device(DEVICE_CPU),
+    PrivateThreadPoolDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("ExperimentalThreadPoolHandle").Device(DEVICE_CPU),
                         ThreadPoolHandleOp);
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7728baf1507c6cec2b44f41561f2ab3d04a80cc8
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
@@ -0,0 +1,109 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/record_writer.h"
+#include "tensorflow/core/platform/file_system.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+class ToTFRecordOp : public AsyncOpKernel {
+ public:
+  explicit ToTFRecordOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        background_worker_(ctx->env(), "tf_data_to_tf_record") {}
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece& argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
+    }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
+  }
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    // The call to `iterator->GetNext()` may block and depend on an inter-op
+    // thread pool thread, so we issue the call using a background thread.
+    background_worker_.Schedule([this, ctx, done]() {
+      string filename;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done);
+      string compression_type;
+      OP_REQUIRES_OK_ASYNC(ctx,
+                           ParseScalarArgument<string>(ctx, "compression_type",
+                                                       &compression_type),
+                           done);
+      std::unique_ptr<WritableFile> file;
+      OP_REQUIRES_OK_ASYNC(ctx, ctx->env()->NewWritableFile(filename, &file),
+                           done);
+      std::unique_ptr<io::RecordWriter> writer;
+      writer.reset(new io::RecordWriter(
+          file.get(), io::RecordWriterOptions::CreateRecordWriterOptions(
+                          compression_type)));
+
+      DatasetBase* dataset;
+      OP_REQUIRES_OK_ASYNC(
+          ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
+      std::unique_ptr<IteratorBase> iterator;
+      IteratorContext::Params params(ctx);
+      std::unique_ptr<FunctionHandleCache> function_handle_cache(
+          new FunctionHandleCache(params.lib));
+      params.function_handle_cache = function_handle_cache.get();
+      IteratorContext iter_ctx(std::move(params));
+
+      OP_REQUIRES_OK_ASYNC(
+          ctx,
+          dataset->MakeIterator(&iter_ctx, "ToTFRecordOpIterator", &iterator),
+          done);
+
+      std::vector<Tensor> components;
+      components.reserve(dataset->output_dtypes().size());
+      bool end_of_sequence;
+      do {
+        OP_REQUIRES_OK_ASYNC(
+            ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+            done);
+
+        if (!end_of_sequence) {
+          OP_REQUIRES_OK_ASYNC(
+              ctx, writer->WriteRecord(components[0].scalar<string>()()), done);
+        }
+        components.clear();
+      } while (!end_of_sequence);
+      done();
+    });
+  }
+
+ private:
+  BackgroundWorker background_worker_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDatasetToTFRecord").Device(DEVICE_CPU), ToTFRecordOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
similarity index 89%
rename from tensorflow/core/kernels/data/unbatch_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
index 74908994b45d76cee8d7898b6b3d71cdda2d73f7..2626ec3ed7250b725650a76b8674e0a76ebc638f 100644
--- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class UnbatchDatasetOp : public UnaryDatasetOpKernel {
@@ -54,6 +54,8 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
       }
     }
 
+    ~Dataset() override { input_->Unref(); }
+
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
@@ -145,6 +147,20 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        // Unbatch assumes that all input components have the same leading
+        // dimension. If it is statically known for any component, we model the
+        // transformation using `KnownRatio`. Otherwise, we use `UnknownRatio`.
+        for (auto& shape : dataset()->input_->output_shapes()) {
+          if (shape.dims() > 0 && shape.dim_size(0) > 0) {
+            return model::MakeKnownRatioNode(
+                std::move(args), 1.0 / static_cast<double>(shape.dim_size(0)));
+          }
+        }
+        return model::MakeUnknownRatioNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_) {
@@ -205,7 +221,7 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnbatchDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("ExperimentalUnbatchDataset").Device(DEVICE_CPU),
                         UnbatchDatasetOp);
 
 }  // namespace
diff --git a/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
index cd612e0eb2536640cdca74740f5d88de780fc61e..23dd9ff612db61829dcbae65eb3566131d032efc 100644
--- a/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
@@ -114,6 +114,11 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeUnknownRatioNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_) {
diff --git a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
index a7e3a5672752d42c038d22a3137e33e253ca3b7c..784f9872860fee0f929dcf4c529c17fbb15e2bc6 100644
--- a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 
@@ -24,7 +24,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 // TODO(prazek): Filter already has a logic of filtering by the given tensor,
 // but it must return both components.  We could introduce kernel like
@@ -142,6 +142,11 @@ class FilterByLastComponentDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeUnknownRatioNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 00884314a9a87369d9d9c6f14b2df9d2740112d6..b8b657d3433422731d10a00ae6498c2f802669dd 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -13,85 +13,105 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class FilterDatasetOp : public UnaryDatasetOpKernel {
  public:
+  using FilterIteratorPredicate =
+      std::function<Status(IteratorContext*, InstantiatedCapturedFunction*,
+                           std::vector<Tensor>, bool*)>;
+
   explicit FilterDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        graph_def_version_(ctx->graph_def_version()) {
+      : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    FunctionLibraryRuntime::Handle pred_handle;
-    OP_REQUIRES_OK(ctx,
-                   ctx->function_library()->Instantiate(
-                       func_.name(), AttrSlice(&func_.attr()), &pred_handle));
-    auto cleanup = gtl::MakeCleanup([ctx, pred_handle]() {
-      OP_REQUIRES_OK(ctx, ctx->function_library()->ReleaseHandle(pred_handle));
-    });
-
-    const FunctionBody* pred_body =
-        ctx->function_library()->GetFunctionBody(pred_handle);
-    OP_REQUIRES(ctx, pred_body->ret_nodes.size() == 1,
-                errors::InvalidArgument(
-                    "predicate function must have a single return value."));
-    Node* ret_node = pred_body->ret_nodes[0];
-    Node* ret_input_node;
-    OP_REQUIRES_OK(ctx, ret_node->input_node(0, &ret_input_node));
-
     std::unique_ptr<CapturedFunction> captured_func;
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
                                                  &captured_func));
 
-    if (ret_input_node->def().op() == "_Arg") {
-      int32 index = -1;
-      OP_REQUIRES_OK(ctx, GetNodeAttr(ret_input_node->def(), "index", &index));
-      *output = new FilterTensorDataset(ctx, input, func_,
-                                        std::move(captured_func), index);
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+    OP_REQUIRES(ctx, indices.size() <= 1,
+                errors::InvalidArgument(
+                    "predicate function has more than one return value."));
+
+    FilterIteratorPredicate filter_pred;
+    if (indices.empty()) {
+      filter_pred = [](IteratorContext* ctx,
+                       InstantiatedCapturedFunction* inst_captured_func,
+                       const std::vector<Tensor>& args, bool* out_matched) {
+        std::vector<Tensor> result;
+        TF_RETURN_IF_ERROR(
+            inst_captured_func->RunWithBorrowedArgs(ctx, args, &result));
+
+        if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+            result[0].NumElements() != 1) {
+          return errors::InvalidArgument(
+              "Filter predicate `f` must return a scalar bool.");
+        }
+        *out_matched = result[0].scalar<bool>()();
+        return Status::OK();
+      };
     } else {
-      *output = new FilterFunctionDataset(ctx, input, func_,
-                                          std::move(captured_func));
+      filter_pred = [indices](IteratorContext* ctx,
+                              InstantiatedCapturedFunction* inst_captured_func,
+                              const std::vector<Tensor>& args,
+                              bool* out_matched) {
+        const Tensor& predicate = args[indices[0]];
+        if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
+          return errors::InvalidArgument(
+              "Filter predicate `f` must return a scalar bool.");
+        }
+        *out_matched = predicate.scalar<bool>()();
+        return Status::OK();
+      };
     }
+
+    *output = new Dataset(ctx, input, func_, std::move(captured_func),
+                          std::move(filter_pred));
   }
 
  private:
-  const int graph_def_version_;
-
-  class FilterDatasetBase : public DatasetBase {
+  class Dataset : public DatasetBase {
    public:
-    FilterDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
-                      const NameAttrList& func,
-                      std::unique_ptr<CapturedFunction> captured_func)
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func,
+            FilterIteratorPredicate filter_pred)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
-          captured_func_(std::move(captured_func)) {
+          captured_func_(std::move(captured_func)),
+          filter_pred_(std::move(filter_pred)) {
       input_->Ref();
     }
 
-    ~FilterDatasetBase() override { input_->Unref(); }
+    ~Dataset() override { input_->Unref(); }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Filter")}));
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Filter")},
+          filter_pred_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -133,17 +153,15 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       return Status::OK();
     }
 
-    virtual Status EvaluatePredicate(IteratorContext* ctx,
-                                     const std::vector<Tensor>& element,
-                                     bool* out_matched) const = 0;
-
    private:
-    class Iterator : public DatasetIterator<FilterDatasetBase> {
+    class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<FilterDatasetBase>(params),
+      explicit Iterator(const Params& params,
+                        FilterIteratorPredicate filter_pred)
+          : DatasetIterator<Dataset>(params),
             filtered_elements_(0),
-            dropped_elements_(0) {
+            dropped_elements_(0),
+            filter_pred_(std::move(filter_pred)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
         prefix_end_ = components.back();
@@ -152,7 +170,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -180,8 +199,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
             return Status::OK();
           }
 
-          TF_RETURN_IF_ERROR(
-              dataset()->EvaluatePredicate(ctx, *out_tensors, &matched));
+          TF_RETURN_IF_ERROR(filter_pred_(
+              ctx, instantiated_captured_func_.get(), *out_tensors, &matched));
           if (!matched) {
             // Clear the output tensor list since it didn't match.
             out_tensors->clear();
@@ -218,6 +237,11 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeUnknownRatioNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_)
@@ -251,64 +275,15 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       int64 filtered_elements_ GUARDED_BY(mu_);
       int64 dropped_elements_ GUARDED_BY(mu_);
+      const FilterIteratorPredicate filter_pred_;
       string prefix_end_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
     const NameAttrList func_;
-
-   protected:
     const std::unique_ptr<CapturedFunction> captured_func_;
-  };
-
-  class FilterFunctionDataset : public FilterDatasetBase {
-   public:
-    using FilterDatasetBase::FilterDatasetBase;
-
-   protected:
-    Status EvaluatePredicate(IteratorContext* ctx,
-                             const std::vector<Tensor>& element,
-                             bool* out_matched) const override {
-      // TODO(mrry): Avoid blocking a threadpool thread. We will need to
-      // stack-rip the iterators and use async kernels.
-      std::vector<Tensor> result;
-      TF_RETURN_IF_ERROR(
-          captured_func_->RunWithBorrowedArgs(ctx, element, &result));
-
-      if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
-          result[0].NumElements() != 1) {
-        return errors::InvalidArgument(
-            "Filter predicate `f` must return a scalar bool.");
-      }
-      *out_matched = result[0].scalar<bool>()();
-      return Status::OK();
-    }
-  };
-
-  class FilterTensorDataset : public FilterDatasetBase {
-   public:
-    FilterTensorDataset(OpKernelContext* ctx, const DatasetBase* input,
-                        const NameAttrList& func,
-                        std::unique_ptr<CapturedFunction> captured_func,
-                        int32 index)
-        : FilterDatasetBase(ctx, input, func, std::move(captured_func)),
-          index_(index) {}
-
-   protected:
-    Status EvaluatePredicate(IteratorContext* ctx,
-                             const std::vector<Tensor>& element,
-                             bool* out_matched) const override {
-      const Tensor& predicate = element[index_];
-      if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
-        return errors::InvalidArgument(
-            "Filter predicate `f` must return a scalar bool.");
-      }
-      *out_matched = predicate.scalar<bool>()();
-      return Status::OK();
-    }
-
-   private:
-    const int32 index_;
+    const FilterIteratorPredicate filter_pred_;
   };
 
  private:
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index 2fada22a211417864011d07230158d41519e2160..3846334622bf48ecb5e62464f22c2fa3e7c4adc4 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
 
@@ -24,7 +24,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class FlatMapDatasetOp : public UnaryDatasetOpKernel {
@@ -122,7 +122,8 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -165,6 +166,11 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeInterleaveManyNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_) {
@@ -238,18 +244,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         return MakeIteratorFromInputElement(
             ctx, captured_func_inputs_, element_index_++,
-            dataset()->captured_func_.get(), prefix(),
-            &current_element_iterator_);
-      }
-
-      Status BuildCurrentElementIteratorLocked(OpKernelContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = *(ctx->runner());
-        params.lib = ctx->function_library();
-        IteratorContext iter_ctx(std::move(params));
-        return BuildCurrentElementIteratorLocked(&iter_ctx);
+            *instantiated_captured_func_, prefix(), &current_element_iterator_);
       }
 
       mutex mu_;
@@ -257,6 +252,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> current_element_iterator_ GUARDED_BY(mu_);
       std::vector<Tensor> captured_func_inputs_ GUARDED_BY(mu_);
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index b4367d5a110a7f2a9f0cf7a3a80bb3078dfd7c5b..48697ec6c8f05c438badedbc3234dbb1110c7088 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class GeneratorDatasetOp::Dataset : public DatasetBase {
@@ -73,7 +73,8 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
     ~Iterator() override {
       if (!finalized_) {
         std::vector<Tensor> ignored;
-        Status s = dataset()->finalize_func_->RunInstantiated(state_, &ignored);
+        Status s =
+            instantiated_finalize_func_->RunInstantiated(state_, &ignored);
         if (!s.ok()) {
           LOG(WARNING)
               << "Error occurred when finalizing GeneratorDataset iterator: "
@@ -83,9 +84,12 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
     }
 
     Status Initialize(IteratorContext* ctx) override {
-      TF_RETURN_IF_ERROR(dataset()->init_func_->Instantiate(ctx));
-      TF_RETURN_IF_ERROR(dataset()->next_func_->Instantiate(ctx));
-      TF_RETURN_IF_ERROR(dataset()->finalize_func_->Instantiate(ctx));
+      TF_RETURN_IF_ERROR(
+          dataset()->init_func_->Instantiate(ctx, &instantiated_init_func_));
+      TF_RETURN_IF_ERROR(
+          dataset()->next_func_->Instantiate(ctx, &instantiated_next_func_));
+      TF_RETURN_IF_ERROR(dataset()->finalize_func_->Instantiate(
+          ctx, &instantiated_finalize_func_));
       return Status::OK();
     }
 
@@ -96,7 +100,7 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
 
       if (!initialized_) {
         TF_RETURN_IF_ERROR(
-            dataset()->init_func_->RunWithBorrowedArgs(ctx, {}, &state_));
+            instantiated_init_func_->RunWithBorrowedArgs(ctx, {}, &state_));
         initialized_ = true;
       }
 
@@ -105,8 +109,8 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
         return Status::OK();
       }
 
-      Status s =
-          dataset()->next_func_->RunWithBorrowedArgs(ctx, state_, out_tensors);
+      Status s = instantiated_next_func_->RunWithBorrowedArgs(ctx, state_,
+                                                              out_tensors);
       if (s.ok()) {
         *end_of_sequence = false;
       } else if (errors::IsOutOfRange(s)) {
@@ -119,17 +123,26 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
         // finalize function.
         std::vector<Tensor> ignored;
         TF_RETURN_IF_ERROR(
-            dataset()->finalize_func_->RunInstantiated(state_, &ignored));
+            instantiated_finalize_func_->RunInstantiated(state_, &ignored));
         finalized_ = true;
       }
       return s;
     }
 
+   protected:
+    std::shared_ptr<model::Node> CreateNode(
+        IteratorContext* ctx, model::Node::Args args) const override {
+      return model::MakeSourceNode(std::move(args));
+    }
+
    private:
     mutex mu_;
     bool initialized_ GUARDED_BY(mu_) = false;
     bool finalized_ GUARDED_BY(mu_) = false;
     std::vector<Tensor> state_ GUARDED_BY(mu_);
+    std::unique_ptr<InstantiatedCapturedFunction> instantiated_init_func_;
+    std::unique_ptr<InstantiatedCapturedFunction> instantiated_next_func_;
+    std::unique_ptr<InstantiatedCapturedFunction> instantiated_finalize_func_;
   };
 
   const std::unique_ptr<CapturedFunction> init_func_;
@@ -169,11 +182,13 @@ void GeneratorDatasetOp::MakeDataset(OpKernelContext* ctx,
 }
 
 namespace {
-REGISTER_KERNEL_BUILDER(Name("GeneratorDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("GeneratorDataset").Device(DEVICE_CPU).Priority(2),
+                        GeneratorDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("GeneratorDataset")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("handle")
+                            .Priority(1),
                         GeneratorDatasetOp);
-REGISTER_KERNEL_BUILDER(
-    Name("GeneratorDataset").Device(DEVICE_GPU).HostMemory("handle"),
-    GeneratorDatasetOp);
 }  // namespace
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
index 0aa802b8745c1237a7a613ac457b6b55ceeac3c5..54e3645612cd3905f1338fe59ab8caf0ca8941eb 100644
--- a/tensorflow/core/kernels/data/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
 
@@ -24,7 +24,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class InterleaveDatasetOp : public UnaryDatasetOpKernel {
@@ -149,7 +149,8 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
@@ -195,7 +196,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
             if (!end_of_input_) {
               TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
                   ctx, args_list_[cycle_index_], cycle_index_,
-                  dataset()->captured_func_.get(), prefix(),
+                  *instantiated_captured_func_, prefix(),
                   &current_elements_[cycle_index_]));
               ++num_open_;
             }
@@ -209,6 +210,11 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeInterleaveManyNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -281,7 +287,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
                   &args_list_[idx][i]));
             }
             TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
-                ctx, args_list_[idx], idx, dataset()->captured_func_.get(),
+                ctx, args_list_[idx], idx, *instantiated_captured_func_,
                 prefix(), &current_elements_[idx]));
             TF_RETURN_IF_ERROR(
                 RestoreInput(ctx, reader, current_elements_[idx]));
@@ -301,6 +307,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
       int64 block_index_ GUARDED_BY(mu_) = 0;
       bool end_of_input_ GUARDED_BY(mu_) = false;
       size_t num_open_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 7a833668aca367425f7307276a5361042dee87a1..d5b4bfa5c5e23cc6948f680ba7f49c23447464a5 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -14,9 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/iterator_ops.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
@@ -33,13 +36,14 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following ops.
 
 const char kIteratorVariantTypeName[] = "tensorflow::Iterator";
@@ -56,20 +60,26 @@ class IteratorResource : public ResourceBase {
                    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
                    FunctionLibraryRuntime* lib)
       : device_mgr_(std::move(device_mgr)),
-        flib_def_(std::move(flib_def)),
-        pflr_(std::move(pflr)),
-        lib_(lib),
-        iterator_(nullptr),
+        iterator_state_(
+            new State(std::move(flib_def), std::move(pflr), lib, nullptr)),
         output_dtypes_(output_dtypes),
         output_shapes_(output_shapes) {}
 
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) {
-    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
-    if (captured_iterator) {
-      CHECK_NOTNULL(lib_);
-      ctx->set_lib(lib_);
-      return captured_iterator->GetNext(ctx, out_tensors, end_of_sequence);
+    std::shared_ptr<State> captured_state;
+    {
+      tf_shared_lock l(mu_);
+      captured_state = iterator_state_;
+    }
+    if (captured_state->iterator) {
+      IteratorContext::Params params(ctx);
+      params.lib = captured_state->lib;
+      params.function_handle_cache =
+          captured_state->function_handle_cache.get();
+      params.resource_mgr = &captured_state->resource_mgr;
+      return captured_state->iterator->GetNext(
+          IteratorContext(std::move(params)), out_tensors, end_of_sequence);
     } else {
       return errors::FailedPrecondition(
           "GetNext() failed because the iterator has not been initialized. "
@@ -78,10 +88,19 @@ class IteratorResource : public ResourceBase {
     }
   }
 
+  Status GetNext(IteratorContext&& ctx, std::vector<Tensor>* out_tensors,
+                 bool* end_of_sequence) {
+    return GetNext(&ctx, out_tensors, end_of_sequence);
+  }
+
   Status Save(SerializationContext* ctx, IteratorStateWriter* writer) {
-    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
-    if (captured_iterator) {
-      return captured_iterator->Save(ctx, writer);
+    std::shared_ptr<State> captured_state;
+    {
+      tf_shared_lock l(mu_);
+      captured_state = iterator_state_;
+    }
+    if (captured_state) {
+      return captured_state->iterator->Save(ctx, writer);
     } else {
       return errors::FailedPrecondition(
           "Save() failed because the iterator has not been initialized. "
@@ -113,66 +132,101 @@ class IteratorResource : public ResourceBase {
     // because some of the OpKernels in the graph might call functions that are
     // only defined in the loaded GraphDef.
     FunctionLibraryRuntime* lib;
-    std::unique_ptr<DeviceMgr> device_mgr(nullptr);
     std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
     TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
     TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
+    std::unique_ptr<State> new_state(new State(
+        std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */));
 
     TF_RETURN_IF_ERROR(
-        graph_runner.Run(&graph, lib, {}, {output_node}, &outputs));
+        graph_runner.Run(&graph, new_state->lib, {}, {output_node}, &outputs));
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
 
-    std::unique_ptr<IteratorBase> iterator;
-    IteratorContext iter_ctx(ctx);
-    iter_ctx.set_lib(lib);
+    IteratorContext::Params params(ctx);
+    params.lib = new_state->lib;
+    params.function_handle_cache = new_state->function_handle_cache.get();
+    params.resource_mgr = &new_state->resource_mgr;
+    TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
+                                             "Iterator", &new_state->iterator));
     TF_RETURN_IF_ERROR(
-        dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
-    TF_RETURN_IF_ERROR(set_iterator(std::move(iterator)));
-    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
-
-    if (captured_iterator) {
-      IteratorContext::Params params;
-      params.env = ctx->env();
-      params.runner = *(ctx->runner());
-      params.lib = lib;
-      DeviceBase* device = lib->device();
+        VerifyTypesMatch(output_dtypes_, new_state->iterator->output_dtypes()));
+    TF_RETURN_IF_ERROR(VerifyShapesCompatible(
+        output_shapes_, new_state->iterator->output_shapes()));
+
+    {
+      IteratorContext::Params params(ctx);
+      params.lib = new_state->lib;
+      params.function_handle_cache = new_state->function_handle_cache.get();
+      params.resource_mgr = &new_state->resource_mgr;
+      DeviceBase* device = new_state->lib->device();
       params.allocator_getter = [device](AllocatorAttributes attrs) {
         return device->GetAllocator(attrs);
       };
       IteratorContext iter_ctx(std::move(params));
-
-      TF_RETURN_IF_ERROR(captured_iterator->Restore(&iter_ctx, reader));
-      mutex_lock l(mu_);
-      device_mgr_ = std::move(device_mgr);
-      lib_def_ = std::move(flib_def);
-      pflr_ = std::move(pflr);
-      lib_ = lib;
-      return Status::OK();
-    } else {
-      return errors::FailedPrecondition(
-          "Failed to restore iterator. Make sure the checkpoint ",
-          "is not corrupt. If the checkpoint does not contain the GraphDef, ",
-          "you will need to initialize your iterator before restoring.");
+      TF_RETURN_IF_ERROR(new_state->iterator->Restore(&iter_ctx, reader));
     }
+
+    mutex_lock l(mu_);
+    iterator_state_ = std::move(new_state);
+    return Status::OK();
   }
 
-  std::shared_ptr<const FunctionLibraryDefinition> function_library() {
-    tf_shared_lock l(mu_);
-    return lib_def_;
+  Status AddLibrary(const FunctionLibraryDefinition& flib_def) {
+    mutex_lock l(mu_);
+    return iterator_state_->flib_def->AddLibrary(flib_def);
   }
 
-  FunctionLibraryRuntime* function_library_runtime() { return lib_; }
+  Status SetIteratorFromDataset(OpKernelContext* ctx, DatasetBase* dataset) {
+    std::shared_ptr<State> new_state;
+    {
+      tf_shared_lock l(mu_);
+      new_state.reset(new State(iterator_state_->flib_def,
+                                iterator_state_->pflr, iterator_state_->lib,
+                                nullptr /* function_handle_cache */,
+                                nullptr /* iterator */));
+    }
 
-  // Transfers ownership of iterator to this. This method is thread-safe.
-  Status set_iterator(std::unique_ptr<IteratorBase> iterator) {
-    if (iterator) {
-      TF_RETURN_IF_ERROR(
-          VerifyTypesMatch(output_dtypes_, iterator->output_dtypes()));
+    // Ensure that the iterator has access to all functions in the current
+    // subgraph, because some functions may have been defined after the resource
+    // was initially created.
+    Status s = new_state->flib_def->AddLibrary(
+        *ctx->function_library()->GetFunctionLibraryDefinition());
+
+    if (!s.ok()) {
+      // Adding functions to `flib_def_` may fail, if there are clashes between
+      // the function names in (e.g.) a restored graph and the currently
+      // executing graph. In that case, we create a new function runtime for
+      // this iterator, based on the current `OpKernelContext`, which will have
+      // the functions we need.
+      FunctionLibraryRuntime* lib;
+      std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
       TF_RETURN_IF_ERROR(
-          VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
+          ctx->function_library()->Clone(&flib_def, &pflr, &lib));
+      new_state->flib_def = std::move(flib_def);
+      new_state->pflr = std::move(pflr);
+      new_state->lib = lib;
     }
-    iterator_.reset(iterator.release());
+
+    new_state->function_handle_cache.reset(
+        new FunctionHandleCache(new_state->lib));
+    // Create new iterator.
+    std::unique_ptr<IteratorBase> iterator;
+    IteratorContext::Params params(ctx);
+    params.lib = new_state->lib;
+    params.function_handle_cache = new_state->function_handle_cache.get();
+    params.resource_mgr = &new_state->resource_mgr;
+    TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
+                                             "Iterator", &iterator));
+    TF_RETURN_IF_ERROR(
+        VerifyTypesMatch(output_dtypes_, iterator->output_dtypes()));
+    TF_RETURN_IF_ERROR(
+        VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
+    std::swap(new_state->iterator, iterator);
+
+    mutex_lock l(mu_);
+    std::swap(iterator_state_, new_state);
     return Status::OK();
   }
 
@@ -185,16 +239,38 @@ class IteratorResource : public ResourceBase {
   }
 
  private:
-  // The following (device_mgr_, flib_def_, pflr_) are only used when the
-  // IteratorResource is shared between sessions and in that case we create
-  // a new FLR. Otherwise these are set to null.
-  std::unique_ptr<DeviceMgr> device_mgr_;
-  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
-  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  FunctionLibraryRuntime* lib_ = nullptr;  // not owned.
-  std::shared_ptr<IteratorBase> iterator_;
+  struct State {
+    State(std::shared_ptr<FunctionLibraryDefinition> flib_def,
+          std::shared_ptr<ProcessFunctionLibraryRuntime> pflr,
+          FunctionLibraryRuntime* lib, std::unique_ptr<IteratorBase> iterator)
+        : flib_def(flib_def),
+          pflr(pflr),
+          lib(lib),
+          function_handle_cache(absl::make_unique<FunctionHandleCache>(lib)),
+          iterator(std::move(iterator)) {}
+
+    State(std::shared_ptr<FunctionLibraryDefinition> flib_def,
+          std::shared_ptr<ProcessFunctionLibraryRuntime> pflr,
+          FunctionLibraryRuntime* lib,
+          std::unique_ptr<FunctionHandleCache> function_handle_cache,
+          std::unique_ptr<IteratorBase> iterator)
+        : flib_def(flib_def),
+          pflr(pflr),
+          lib(lib),
+          function_handle_cache(std::move(function_handle_cache)),
+          iterator(std::move(iterator)) {}
+
+    std::shared_ptr<FunctionLibraryDefinition> flib_def;
+    std::shared_ptr<ProcessFunctionLibraryRuntime> pflr;
+    FunctionLibraryRuntime* lib = nullptr;  // not owned.
+    std::unique_ptr<FunctionHandleCache> function_handle_cache;
+    ResourceMgr resource_mgr;
+    std::unique_ptr<IteratorBase> iterator;
+  };
+
   mutex mu_;
-  std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
+  const std::unique_ptr<DeviceMgr> device_mgr_ GUARDED_BY(mu_);
+  std::shared_ptr<State> iterator_state_ GUARDED_BY(mu_);
   const DataTypeVector output_dtypes_;
   const std::vector<PartialTensorShape> output_shapes_;
 };
@@ -504,10 +580,9 @@ FunctionLibraryRuntime* IteratorHandleOp::CreatePrivateFLR(
   // in its resource manager. The existing device will outlive the
   // IteratorResource, because we are storing the IteratorResource
   // in that device's resource manager.
-  Device* wrapped_device = RenamedDevice::NewRenamedDevice(
+  *device_mgr = absl::make_unique<DeviceMgr>(RenamedDevice::NewRenamedDevice(
       ctx->device()->name(), down_cast<Device*>(ctx->device()),
-      false /* owns_underlying */, false /* isolate_session_state */);
-  device_mgr->reset(new DeviceMgr({wrapped_device}));
+      false /* owns_underlying */, false /* isolate_session_state */));
   flib_def->reset(new FunctionLibraryDefinition(
       *ctx->function_library()->GetFunctionLibraryDefinition()));
   pflr->reset(new ProcessFunctionLibraryRuntime(
@@ -580,13 +655,7 @@ void MakeIteratorOp::Compute(OpKernelContext* ctx) {
   OP_REQUIRES_OK(
       ctx, LookupResource(ctx, HandleFromInput(ctx, 1), &iterator_resource));
   core::ScopedUnref unref(iterator_resource);
-
-  std::unique_ptr<IteratorBase> iterator;
-  IteratorContext iter_ctx(ctx);
-  iter_ctx.set_lib(iterator_resource->function_library_runtime());
-  OP_REQUIRES_OK(
-      ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
-  OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(std::move(iterator)));
+  OP_REQUIRES_OK(ctx, iterator_resource->SetIteratorFromDataset(ctx, dataset));
 }
 
 namespace {
@@ -595,9 +664,7 @@ class ToSingleElementOp : public AsyncOpKernel {
  public:
   explicit ToSingleElementOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        background_worker_(ctx->env(),
-                           strings::StrCat("to_single_element_op_thread_",
-                                           SanitizeThreadSuffix(name()))) {}
+        background_worker_(ctx->env(), "tf_data_to_single_element") {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     // The call to `iterator->GetNext()` may block and depend on an
@@ -608,10 +675,15 @@ class ToSingleElementOp : public AsyncOpKernel {
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       std::unique_ptr<IteratorBase> iterator;
+      IteratorContext::Params params(ctx);
+      std::unique_ptr<FunctionHandleCache> function_handle_cache(
+          new FunctionHandleCache(params.lib));
+      params.function_handle_cache = function_handle_cache.get();
+      IteratorContext iter_ctx(std::move(params));
+
       OP_REQUIRES_OK_ASYNC(
           ctx,
-          dataset->MakeIterator(IteratorContext(ctx), "SingleElementIterator",
-                                &iterator),
+          dataset->MakeIterator(&iter_ctx, "SingleElementIterator", &iterator),
           done);
 
       // NOTE(jsimsa): We must destroy the iterator before calling `done()`, to
@@ -625,8 +697,8 @@ class ToSingleElementOp : public AsyncOpKernel {
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence = false;
 
-      Status s = raw_iterator->GetNext(IteratorContext(ctx), &components,
-                                       &end_of_sequence);
+      Status s =
+          raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
       if (!s.ok()) {
         ctx->SetStatus(s);
         return;
@@ -641,8 +713,8 @@ class ToSingleElementOp : public AsyncOpKernel {
       }
 
       components.clear();
-      Status s2 = raw_iterator->GetNext(IteratorContext(ctx), &components,
-                                        &end_of_sequence);
+      Status s2 =
+          raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
       if (!s2.ok()) {
         ctx->SetStatus(s2);
         return;
@@ -663,9 +735,7 @@ class ReduceDatasetOp : public AsyncOpKernel {
  public:
   explicit ReduceDatasetOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        background_worker_(
-            ctx->env(),
-            strings::StrCat("reduce_thread_", SanitizeThreadSuffix(name()))) {
+        background_worker_(ctx->env(), "tf_data_reduce_dataset") {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &reduce_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
@@ -693,8 +763,16 @@ class ReduceDatasetOp : public AsyncOpKernel {
                                    use_inter_op_parallelism_, &captured_func),
           done);
 
-      IteratorContext iter_ctx(ctx);
-      OP_REQUIRES_OK_ASYNC(ctx, captured_func->Instantiate(&iter_ctx), done);
+      IteratorContext::Params params(ctx);
+      std::unique_ptr<FunctionHandleCache> function_handle_cache(
+          new FunctionHandleCache(params.lib));
+      params.function_handle_cache = function_handle_cache.get();
+      IteratorContext iter_ctx(std::move(params));
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func;
+      OP_REQUIRES_OK_ASYNC(
+          ctx,
+          captured_func->Instantiate(&iter_ctx, &instantiated_captured_func),
+          done);
 
       std::unique_ptr<IteratorBase> iterator;
       OP_REQUIRES_OK_ASYNC(
@@ -728,8 +806,8 @@ class ReduceDatasetOp : public AsyncOpKernel {
                   std::back_inserter(args));
 
         std::vector<Tensor> reduce_func_output;
-        status =
-            captured_func->Run(&iter_ctx, std::move(args), &reduce_func_output);
+        status = instantiated_captured_func->Run(&iter_ctx, std::move(args),
+                                                 &reduce_func_output);
         if (!status.ok()) {
           break;
         }
@@ -772,10 +850,7 @@ class OneShotIteratorOp : public AsyncOpKernel {
  public:
   explicit OneShotIteratorOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        background_worker_(
-            ctx->env(),
-            strings::StrCat("one_shot_iterator_initialization_thread_",
-                            SanitizeThreadSuffix(name()))),
+        background_worker_(ctx->env(), "tf_data_one_shot_iterator"),
         graph_def_version_(ctx->graph_def_version())
 
   {
@@ -919,13 +994,7 @@ class OneShotIteratorOp : public AsyncOpKernel {
     // factory function.
     DatasetBase* dataset;
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(return_values[0], &dataset));
-    std::unique_ptr<IteratorBase> iter;
-    IteratorContext iter_ctx(ctx);
-    iter_ctx.set_lib(lib);
-    TF_RETURN_IF_ERROR(
-        dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iter));
-    TF_RETURN_IF_ERROR((*iterator)->set_iterator(std::move(iter)));
-
+    TF_RETURN_IF_ERROR((*iterator)->SetIteratorFromDataset(ctx, dataset));
     (*iterator)->Ref();
     return Status::OK();
   }
@@ -979,17 +1048,8 @@ void IteratorGetNextOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
         std::vector<Tensor> components;
         bool end_of_sequence = false;
 
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = *(ctx->runner());
-        params.function_library = iterator->function_library();
-        DeviceBase* device = ctx->function_library()->device();
-        params.allocator_getter = [device](AllocatorAttributes attrs) {
-          return device->GetAllocator(attrs);
-        };
-        IteratorContext iter_ctx(std::move(params));
-
-        Status s = iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
+        Status s = iterator->GetNext(IteratorContext(ctx), &components,
+                                     &end_of_sequence);
         // NOTE(mrry): We must unref the iterator before calling `done()`, to
         // avoid destruction races.
         iterator->Unref();
@@ -1013,22 +1073,11 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) {
   IteratorResource* iterator;
   OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator));
   core::ScopedUnref unref_iterator(iterator);
-
   std::vector<Tensor> components;
   bool end_of_sequence = false;
 
-  IteratorContext::Params params;
-  params.env = ctx->env();
-  params.runner = *(ctx->runner());
-  params.function_library = iterator->function_library();
-  DeviceBase* device = ctx->function_library()->device();
-  params.allocator_getter = [device](AllocatorAttributes attrs) {
-    return device->GetAllocator(attrs);
-  };
-  IteratorContext iter_ctx(std::move(params));
-
-  OP_REQUIRES_OK(ctx,
-                 iterator->GetNext(&iter_ctx, &components, &end_of_sequence));
+  OP_REQUIRES_OK(ctx, iterator->GetNext(IteratorContext(ctx), &components,
+                                        &end_of_sequence));
   OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
 
   for (int i = 0; i < components.size(); ++i) {
@@ -1043,9 +1092,8 @@ class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
  public:
   explicit IteratorGetNextAsOptionalOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        background_worker_(
-            ctx->env(), strings::StrCat("iterator_get_next_as_optional_thread_",
-                                        SanitizeThreadSuffix(name()))) {
+        background_worker_(ctx->env(),
+                           "tf_data_iterator_get_next_as_optional") {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
   }
@@ -1062,18 +1110,8 @@ class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
           std::vector<Tensor> components;
           bool end_of_sequence = false;
 
-          IteratorContext::Params params;
-          params.env = ctx->env();
-          params.runner = *(ctx->runner());
-          params.function_library = iterator->function_library();
-          DeviceBase* device = ctx->function_library()->device();
-          params.allocator_getter = [device](AllocatorAttributes attrs) {
-            return device->GetAllocator(attrs);
-          };
-          IteratorContext iter_ctx(std::move(params));
-
-          Status s =
-              iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
+          Status s = iterator->GetNext(IteratorContext(ctx), &components,
+                                       &end_of_sequence);
           // NOTE(mrry): We must unref the iterator before calling `done()`, to
           // avoid destruction races.
           iterator->Unref();
@@ -1240,50 +1278,60 @@ class DeserializeIteratorOp : public OpKernel {
 
 
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_CPU).Priority(2),
                         IteratorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_GPU),
+REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_GPU).Priority(1),
                         IteratorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU).Priority(2),
                         MakeIteratorOp);
 REGISTER_KERNEL_BUILDER(
-    Name("MakeIterator").Device(DEVICE_GPU).HostMemory("dataset"),
+    Name("MakeIterator").Device(DEVICE_GPU).Priority(1).HostMemory("dataset"),
     MakeIteratorOp);
-REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE_CPU),
-                        AnonymousIteratorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE_GPU),
-                        AnonymousIteratorHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("AnonymousIterator").Device(DEVICE_CPU).Priority(2),
+    AnonymousIteratorHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("AnonymousIterator").Device(DEVICE_GPU).Priority(1),
+    AnonymousIteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("DatasetToSingleElement").Device(DEVICE_CPU),
                         ToSingleElementOp);
 REGISTER_KERNEL_BUILDER(Name("ReduceDataset").Device(DEVICE_CPU),
                         ReduceDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("OneShotIterator").Device(DEVICE_CPU),
                         OneShotIteratorOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_CPU).Priority(2),
                         IteratorGetNextOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_GPU),
+REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_GPU).Priority(1),
                         IteratorGetNextOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE_CPU),
-                        IteratorGetNextSyncOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE_GPU),
-                        IteratorGetNextSyncOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE_CPU),
-                        IteratorGetNextAsOptionalOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE_GPU),
-                        IteratorGetNextAsOptionalOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle").Device(DEVICE_CPU),
-                        IteratorToStringHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorGetNextSync").Device(DEVICE_CPU).Priority(2),
+    IteratorGetNextSyncOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorGetNextSync").Device(DEVICE_GPU).Priority(1),
+    IteratorGetNextSyncOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorGetNextAsOptional").Device(DEVICE_CPU).Priority(2),
+    IteratorGetNextAsOptionalOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorGetNextAsOptional").Device(DEVICE_GPU).Priority(1),
+    IteratorGetNextAsOptionalOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorToStringHandle").Device(DEVICE_CPU).Priority(2),
+    IteratorToStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle")
                             .Device(DEVICE_GPU)
-                            .HostMemory("string_handle"),
+                            .HostMemory("string_handle")
+                            .Priority(1),
                         IteratorToStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandle").Device(DEVICE_CPU),
                         IteratorFromStringHandleOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandleV2").Device(DEVICE_CPU),
-                        IteratorFromStringHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorFromStringHandleV2").Device(DEVICE_CPU).Priority(2),
+    IteratorFromStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandleV2")
                             .Device(DEVICE_GPU)
-                            .HostMemory("string_handle"),
+                            .HostMemory("string_handle")
+                            .Priority(1),
                         IteratorFromStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("SerializeIterator").Device(DEVICE_CPU),
                         SerializeIteratorOp);
diff --git a/tensorflow/core/kernels/data/iterator_ops.h b/tensorflow/core/kernels/data/iterator_ops.h
index 8a2b2639a783e4464d98fade025555507500db5f..cd72269859044e6efd97a10ad43bc00c90df7d7d 100644
--- a/tensorflow/core/kernels/data/iterator_ops.h
+++ b/tensorflow/core/kernels/data/iterator_ops.h
@@ -107,9 +107,7 @@ class IteratorGetNextOp : public AsyncOpKernel {
  public:
   explicit IteratorGetNextOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        background_worker_(ctx->env(),
-                           strings::StrCat("iterator_get_next_thread_",
-                                           SanitizeThreadSuffix(name()))) {}
+        background_worker_(ctx->env(), "tf_data_iterator_get_next") {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
 
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
deleted file mode 100644
index bf089705607883f6b3b652f129ede9c4d5ac6f47..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ /dev/null
@@ -1,693 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#define EIGEN_USE_THREADS
-
-#include <atomic>
-#include <utility>
-
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/inplace_ops_functor.h"
-#include "tensorflow/core/lib/core/blocking_counter.h"
-#include "tensorflow/core/lib/gtl/cleanup.h"
-#include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/platform/tracing.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-// See documentation in ../ops/dataset_ops.cc for a high-level
-// description of the following op.
-
-// TODO(b/116852688): Make coordination between the performance model and this
-// transformation more robust.
-class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        op_version_(ctx->def().op() == "MapAndBatchDataset" ? 1 : 2) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
-
- protected:
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 batch_size;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "batch_size", &batch_size));
-    OP_REQUIRES(
-        ctx, batch_size > 0,
-        errors::InvalidArgument("batch_size must be greater than zero."));
-
-    int64 num_parallel_calls;
-    switch (op_version_) {
-      case 1:
-        int64 num_parallel_batches;
-        OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_batches",
-                                                &num_parallel_batches));
-        num_parallel_calls = num_parallel_batches * batch_size;
-        OP_REQUIRES(ctx, num_parallel_batches > 0,
-                    errors::InvalidArgument(
-                        "num_parallel_batches must be greater than zero."));
-        break;
-      case 2:
-        OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
-                                                &num_parallel_calls));
-        OP_REQUIRES(ctx,
-                    num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                    errors::InvalidArgument(
-                        "num_parallel_calls must be greater than zero."));
-        break;
-      default:
-        OP_REQUIRES(ctx, false,
-                    errors::Unimplemented("Unsupported operation version %d.",
-                                          op_version_));
-    }
-
-    bool drop_remainder;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "drop_remainder", &drop_remainder));
-
-    std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
-                                                 &captured_func));
-
-    *output = new Dataset(ctx, input, batch_size, num_parallel_calls,
-                          drop_remainder, output_types_, output_shapes_, func_,
-                          std::move(captured_func), &ctx->eigen_cpu_device());
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 batch_size,
-            int64 num_parallel_calls, bool drop_remainder,
-            const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes,
-            const NameAttrList& func,
-            std::unique_ptr<CapturedFunction> captured_func,
-            const Eigen::ThreadPoolDevice* device)
-        : DatasetBase(DatasetContext(ctx)),
-          input_(input),
-          batch_size_(batch_size),
-          num_parallel_calls_(num_parallel_calls),
-          drop_remainder_(drop_remainder),
-          output_types_(output_types),
-          output_shapes_(output_shapes),
-          map_fn_(func),
-          captured_func_(std::move(captured_func)),
-          device_(device) {
-      input_->Ref();
-    }
-
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::MapAndBatch")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() const override {
-      return "MapAndBatchDatasetOp::Dataset";
-    }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, map_fn_.name()));
-      Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
-      Node* batch_size_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size_node));
-      Node* num_parallel_calls_node;
-      TF_RETURN_IF_ERROR(
-          b->AddScalar(num_parallel_calls_, &num_parallel_calls_node));
-      Node* drop_remainder_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder_node));
-
-      DataTypeVector other_arguments_types;
-      other_arguments_types.reserve(captured_func_->captured_inputs().size());
-      std::vector<Node*> other_arguments;
-      other_arguments.reserve(captured_func_->captured_inputs().size());
-      for (const Tensor& t : captured_func_->captured_inputs()) {
-        Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-        other_arguments.emplace_back(node);
-        other_arguments_types.emplace_back(t.dtype());
-      }
-      AttrValue f;
-      b->BuildAttrValue(map_fn_, &f);
-      AttrValue other_arguments_types_attr;
-      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
-
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this,
-          {std::make_pair(0, input_graph_node),
-           std::make_pair(2, batch_size_node),
-           std::make_pair(3, num_parallel_calls_node),
-           std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
-          {std::make_pair(1, other_arguments)},      // Tensor list inputs.
-          {std::make_pair("f", f),
-           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
-          output));
-      return Status::OK();
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            mu_(std::make_shared<mutex>()),
-            cond_var_(std::make_shared<condition_variable>()),
-            num_parallel_calls_(std::make_shared<model::SharedState>(
-                params.dataset->num_parallel_calls_, mu_, cond_var_)) {}
-
-      ~Iterator() override {
-        mutex_lock l(*mu_);
-        // Cancel the runner thread.
-        cancelled_ = true;
-        cond_var_->notify_all();
-        // Wait for all in-flight calls to complete.
-        while (num_calls_ > 0) {
-          cond_var_->wait(l);
-        }
-      }
-
-      Status Initialize(IteratorContext* ctx) override {
-        mutex_lock l(*mu_);
-        AddConstantParameter(ctx, "batch_size", dataset()->batch_size_);
-        if (num_parallel_calls_->value == kAutoTune) {
-          num_parallel_calls_->value = 1;
-          AddTunableParameter(ctx, "parallelism", num_parallel_calls_, 1,
-                              port::NumSchedulableCPUs());
-        } else {
-          AddConstantParameter(ctx, "parallelism", num_parallel_calls_->value);
-        }
-        TF_RETURN_IF_ERROR(
-            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        std::shared_ptr<BatchResult> result;
-        {
-          mutex_lock l(*mu_);
-          EnsureRunnerThreadStarted(ctx);
-          while (batch_results_.empty() ||
-                 batch_results_.front()->num_calls > 0) {
-            RecordStop(ctx);
-            cond_var_->wait(l);
-            RecordStart(ctx);
-          }
-          std::swap(result, batch_results_.front());
-          batch_results_.pop_front();
-          cond_var_->notify_all();
-        }
-        return ProcessResult(ctx, result, out_tensors, end_of_sequence);
-      }
-
-     protected:
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(*mu_);
-        // Wait for all in-flight calls to complete.
-        while (num_calls_ > 0) {
-          cond_var_->wait(l);
-        }
-        CHECK_EQ(num_calls_, 0);
-        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("call_counter"), call_counter_));
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("batch_results_size"),
-                                               batch_results_.size()));
-        for (size_t i = 0; i < batch_results_.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteBatchResult(writer, i));
-        }
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        mutex_lock l(*mu_);
-        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("call_counter"), &call_counter_));
-        int64 batch_results_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("batch_results_size"),
-                                              &batch_results_size));
-        for (int i = 0; i < batch_results_size; ++i) {
-          TF_RETURN_IF_ERROR(ReadBatchResult(ctx, reader, i));
-        }
-        return Status::OK();
-      }
-
-     private:
-      struct BatchResult {
-        explicit BatchResult(int64 batch_size) {
-          end_of_input = false;
-          num_calls = batch_size;
-          num_elements = 0;
-          output_allocated = false;
-          status = Status::OK();
-        }
-
-        void UpdateStatus(const Status& s) {
-          mutex_lock l(mu);
-          status.Update(s);
-        }
-
-        mutex mu;
-        bool end_of_input GUARDED_BY(mu);
-        int64 num_elements GUARDED_BY(mu);
-        std::vector<Tensor> output;
-        bool output_allocated GUARDED_BY(mu);
-        Status status GUARDED_BY(mu);
-        // Counts the number of outstanding calls for this batch.
-        int64 num_calls;  // access guarded by owner's mutex
-      };
-
-      void Callback(const std::shared_ptr<IteratorContext>& ctx,
-                    const std::shared_ptr<BatchResult>& result,
-                    const std::shared_ptr<std::vector<Tensor>>& return_values,
-                    int64 offset, const Status& status) LOCKS_EXCLUDED(*mu_) {
-        result->UpdateStatus(status);
-        if (status.ok()) {
-          EnsureOutputAllocated(ctx, result, return_values);
-          for (size_t i = 0; i < return_values->size(); ++i) {
-            const Tensor& tensor = return_values->at(i);
-            Tensor* batch = &(result->output)[i];
-            if (tensor.NumElements() !=
-                (batch->NumElements() / batch->dim_size(0))) {
-              TensorShape batch_shape = batch->shape();
-              batch_shape.RemoveDim(0);
-              result->UpdateStatus(errors::InvalidArgument(
-                  "Cannot add tensor to the batch: number of elements does not "
-                  "match. Shapes are: [tensor]: ",
-                  tensor.shape().DebugString(),
-                  ", [batch]: ", batch_shape.DebugString()));
-              break;
-            }
-            // TODO(mrry): Add a version of DoParallelConcat that allows us to
-            // move `tensor` where possible, to speed up string tensor batching.
-            Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                *dataset()->device_, tensor, offset, batch);
-            if (!copy_status.ok()) {
-              result->UpdateStatus(copy_status);
-              break;
-            }
-          }
-          {
-            mutex_lock l(result->mu);
-            result->num_elements++;
-          }
-        }
-        CallCompleted(result);
-      }
-
-      void CallCompleted(const std::shared_ptr<BatchResult>& result)
-          LOCKS_EXCLUDED(*mu_) {
-        mutex_lock l(*mu_);
-        num_calls_--;
-        result->num_calls--;
-        cond_var_->notify_all();
-      }
-
-      void CallFunction(std::shared_ptr<IteratorContext> ctx,
-                        const std::shared_ptr<BatchResult>& result,
-                        int64 offset) LOCKS_EXCLUDED(*mu_) {
-        // Get the next input element.
-        std::vector<Tensor> input_element;
-        bool end_of_input;
-        Status status =
-            input_impl_->GetNext(ctx.get(), &input_element, &end_of_input);
-        bool return_early;
-        {
-          mutex_lock l(result->mu);
-          result->end_of_input = result->end_of_input || end_of_input;
-          result->status.Update(status);
-          return_early = result->end_of_input || !result->status.ok();
-        }
-        if (return_early) {
-          CallCompleted(result);
-          return;
-        }
-
-        // Call `captured_func_(input_element)`, using `Callback` to store the
-        // result in `result`.
-        (*ctx->runner())(std::bind(
-            [this, result, offset](std::shared_ptr<IteratorContext> ctx,
-                                   std::vector<Tensor> input_element) {
-              std::shared_ptr<std::vector<Tensor>> return_values(
-                  new std::vector<Tensor>());
-              dataset()->captured_func_->RunAsync(
-                  ctx.get(), std::move(input_element), return_values.get(),
-                  [this, ctx, result, return_values, offset](Status status) {
-                    Callback(ctx, result, return_values, offset, status);
-                  },
-                  prefix());
-            },
-            ctx, std::move(input_element)));
-      }
-
-      Status CopyPartialBatch(Tensor* output, const Tensor& value,
-                              int64 num_elements) {
-        switch (value.dtype()) {
-#define HANDLE_TYPE(type)                                         \
-  case DataTypeToEnum<type>::value: {                             \
-    auto output_t = output->flat_outer_dims<type>();              \
-    auto value_t = value.flat_outer_dims<type>();                 \
-    for (size_t i = 0; i < num_elements; i++) {                   \
-      output_t.template chip<0>(i) = value_t.template chip<0>(i); \
-    }                                                             \
-    return Status::OK();                                          \
-  }
-          TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-          default:
-            return errors::InvalidArgument("Unsupported data type: ",
-                                           DataTypeString(value.dtype()));
-        }
-        return Status::OK();
-      }
-
-      void EnsureRunnerThreadStarted(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        if (!runner_thread_) {
-          std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "runner_thread",
-              std::bind(&Iterator::RunnerThread, this, ctx_copy)));
-        }
-      }
-
-      void EnsureOutputAllocated(
-          const std::shared_ptr<IteratorContext>& ctx,
-          const std::shared_ptr<BatchResult>& result,
-          const std::shared_ptr<std::vector<Tensor>>& return_values) {
-        mutex_lock l(result->mu);
-        if (result->output_allocated) {
-          return;
-        }
-        const size_t num_components = return_values->size();
-        for (size_t i = 0; i < num_components; ++i) {
-          TensorShape component_shape({dataset()->batch_size_});
-          component_shape.AppendShape(return_values->at(i).shape());
-          AllocatorAttributes attr;
-          attr.set_gpu_compatible(true);
-          Tensor component(ctx->allocator(attr), return_values->at(i).dtype(),
-                           component_shape);
-          result->output.emplace_back(std::move(component));
-        }
-        result->output_allocated = true;
-      }
-
-      Status ProcessResult(IteratorContext* ctx,
-                           const std::shared_ptr<BatchResult>& result,
-                           std::vector<Tensor>* out_tensors,
-                           bool* end_of_sequence) {
-        mutex_lock l(result->mu);
-        if (result->num_elements == 0) {
-          *end_of_sequence = true;
-          return Status::OK();
-        }
-        // `f` may deliberately raise `errors::OutOfRange` to indicate that we
-        // should terminate the iteration early.
-        if (!result->status.ok() && !errors::IsOutOfRange(result->status)) {
-          // Deallocate tensors allocated for the output.
-          result->output.clear();
-          *end_of_sequence = false;
-          return result->status;
-        }
-        if (result->num_elements < dataset()->batch_size_) {
-          if (dataset()->drop_remainder_) {
-            // Deallocate tensors allocated for the output.
-            result->output.clear();
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-          const std::vector<Tensor>& output = result->output;
-          for (size_t i = 0; i < output.size(); ++i) {
-            TensorShape component_shape(result->output[i].shape());
-            component_shape.set_dim(0, result->num_elements);
-            AllocatorAttributes attr;
-            attr.set_gpu_compatible(true);
-            Tensor component(ctx->allocator(attr), output[i].dtype(),
-                             component_shape);
-            TF_RETURN_IF_ERROR(
-                CopyPartialBatch(&component, output[i], result->num_elements));
-            out_tensors->emplace_back(std::move(component));
-          }
-          // Deallocate tensors allocated for the output.
-          result->output.clear();
-        } else {
-          *out_tensors = std::move(result->output);
-        }
-        *end_of_sequence = result->num_elements == 0;
-        return Status::OK();
-      }
-
-      void RunnerThread(const std::shared_ptr<IteratorContext>& ctx)
-          LOCKS_EXCLUDED(*mu_) {
-        std::vector<std::pair<std::shared_ptr<BatchResult>, int64>> new_calls;
-        RecordStart(ctx.get());
-        auto stop_cleanup =
-            gtl::MakeCleanup([this, &ctx]() { RecordStop(ctx.get()); });
-        new_calls.reserve(num_parallel_calls_->value);
-        auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
-          int64 num_parallel_calls = num_parallel_calls_->value;
-          int64 max_batch_results =
-              (num_parallel_calls + dataset()->batch_size_ - 1) /
-              dataset()->batch_size_;
-          return num_calls_ >= num_parallel_calls ||
-                 (batch_results_.size() > max_batch_results ||
-                  (batch_results_.size() == max_batch_results &&
-                   call_counter_ % dataset()->batch_size_ == 0));
-        };
-        while (true) {
-          {
-            mutex_lock l(*mu_);
-            while (!cancelled_ && busy()) {
-              RecordStop(ctx.get());
-              cond_var_->wait(l);
-              RecordStart(ctx.get());
-            }
-
-            if (cancelled_) {
-              return;
-            }
-
-            while (!busy()) {
-              if (call_counter_ % dataset()->batch_size_ == 0) {
-                batch_results_.emplace_back(
-                    new BatchResult(dataset()->batch_size_));
-              }
-              int64 offset = call_counter_++ % dataset()->batch_size_;
-              new_calls.emplace_back(batch_results_.back(), offset);
-              num_calls_++;
-            }
-          }
-
-          for (const auto& call : new_calls) {
-            CallFunction(ctx, call.first, call.second);
-          }
-          new_calls.clear();
-        }
-      }
-
-      Status ReadBatchResult(IteratorContext* ctx, IteratorStateReader* reader,
-                             size_t index) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        batch_results_.emplace_back(new BatchResult(dataset()->batch_size_));
-        std::shared_ptr<BatchResult> result = batch_results_.back();
-        string prefix = strings::StrCat("batch_results_", index);
-        mutex_lock l(result->mu);
-        result->end_of_input = reader->Contains(
-            full_name(strings::StrCat(prefix, "_end_of_input")));
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name(strings::StrCat(prefix, "_num_calls")),
-                               &result->num_calls));
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(prefix, "_num_elements")),
-            &result->num_elements));
-        result->output_allocated = reader->Contains(
-            full_name(strings::StrCat(prefix, "_output_allocated")));
-        int64 output_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(prefix, "_output_size")), &output_size));
-        result->output.reserve(output_size);
-        for (int i = 0; i < output_size; i++) {
-          Tensor t;
-          TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(prefix, "_output_", i)), &t));
-          // If the batch was not full, we may have stored only the relevant
-          // slice. Since tensors in `BatchResult.output` are expected to
-          // have the leading dimension of size batch_size, we build a larger
-          // tensor and copy the slice read from the checkpoint into it.
-          if (t.dim_size(0) < dataset()->batch_size_) {
-            TensorShape component_shape(t.shape());
-            component_shape.set_dim(0, dataset()->batch_size_);
-            AllocatorAttributes attr;
-            attr.set_gpu_compatible(true);
-            Tensor new_t(ctx->allocator(attr), t.dtype(), component_shape);
-            TF_RETURN_IF_ERROR(CopyPartialBatch(&new_t, t, t.dim_size(0)));
-            result->output.emplace_back(std::move(new_t));
-          } else {
-            result->output.emplace_back(std::move(t));
-          }
-        }
-        TF_RETURN_IF_ERROR(ReadStatus(
-            reader, strings::StrCat(prefix, "_status"), &result->status));
-        return Status::OK();
-      }
-
-      Status ReadStatus(IteratorStateReader* reader, const string& prefix,
-                        Status* status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        int64 code_int;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(prefix, "_code")), &code_int));
-        error::Code code = static_cast<error::Code>(code_int);
-
-        if (code != error::Code::OK) {
-          string error_message;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat(prefix, "_msg")), &error_message));
-          *status = Status(code, error_message);
-        } else {
-          *status = Status::OK();
-        }
-        return Status::OK();
-      }
-
-      Status WriteBatchResult(IteratorStateWriter* writer, size_t index)
-          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        std::shared_ptr<BatchResult> result = batch_results_[index];
-        string prefix = strings::StrCat("batch_results_", index);
-        mutex_lock l(result->mu);
-        if (result->end_of_input) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_end_of_input")), ""));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_num_calls")),
-            result->num_calls));
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_num_elements")),
-            result->num_elements));
-        if (result->output_allocated) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_output_allocated")), ""));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_output_size")),
-            result->output.size()));
-        for (int i = 0; i < result->output.size(); i++) {
-          // If the batch is not full, we only store the first `num_elements`
-          // values. The rest of the batch tensor is *uninitialized* and
-          // accessing that will raise msan errors.
-          if (result->num_elements < dataset()->batch_size_) {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                full_name(strings::StrCat(prefix, "_output_", i)),
-                result->output[i].Slice(0, result->num_elements)));
-          } else {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                full_name(strings::StrCat(prefix, "_output_", i)),
-                result->output[i]));
-          }
-        }
-        TF_RETURN_IF_ERROR(WriteStatus(
-            writer, strings::StrCat(prefix, "_status"), result->status));
-        return Status::OK();
-      }
-
-      Status WriteStatus(IteratorStateWriter* writer, const string& prefix,
-                         const Status& status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
-                                static_cast<int64>(status.code())));
-        if (!status.ok()) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
-                                  status.error_message()));
-        }
-        return Status::OK();
-      }
-
-      // Used for coordination between the main thread, the runner thread, and
-      // the callback threads.
-      const std::shared_ptr<mutex> mu_;
-      // Used for coordination between the main thread, the runner thread, and
-      // the callback threads. In particular, the runner thread should only
-      // schedule new calls when the number of in-flight calls is less than
-      // `num_parallel_calls_->value` and there are slots available in the
-      // `batch_results_` buffer.
-      const std::shared_ptr<condition_variable> cond_var_;
-      // Identifies the maximum number of parallel calls.
-      const std::shared_ptr<model::SharedState> num_parallel_calls_;
-      // Counts the number of outstanding calls for this batch.
-      int64 num_calls_ GUARDED_BY(*mu_) = 0;
-      // Counts the total number of calls.
-      int64 call_counter_ GUARDED_BY(*mu_) = 0;
-      std::unique_ptr<IteratorBase> input_impl_;
-      // Buffer for storing the (intermediate) batch results.
-      std::deque<std::shared_ptr<BatchResult>> batch_results_ GUARDED_BY(*mu_);
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
-      bool cancelled_ GUARDED_BY(*mu_) = false;
-    };
-
-    const DatasetBase* const input_;
-    const NameAttrList func_;
-    const int64 batch_size_;
-    const int64 num_parallel_calls_;
-    const bool drop_remainder_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
-    const NameAttrList map_fn_;
-    const std::unique_ptr<CapturedFunction> captured_func_;
-    const Eigen::ThreadPoolDevice* device_;  // not owned
-  };
-
-  const int op_version_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList func_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("MapAndBatchDataset").Device(DEVICE_CPU),
-                        MapAndBatchDatasetOp);
-
-REGISTER_KERNEL_BUILDER(Name("MapAndBatchDatasetV2").Device(DEVICE_CPU),
-                        MapAndBatchDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index f112e1dc431fff9702c40ff99ba37cd5deb9db90..fc6e93a81cb47372fa023a2f793d35008ab830c8 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -13,27 +13,35 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class MapDatasetOp : public UnaryDatasetOpKernel {
  public:
+  using MapIteratorFunction =
+      std::function<Status(IteratorContext*, InstantiatedCapturedFunction*,
+                           std::vector<Tensor>, std::vector<Tensor>*)>;
+
   explicit MapDatasetOp(OpKernelConstruction* ctx) : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
                                      &use_inter_op_parallelism_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -43,8 +51,47 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
-    *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          output_types_, output_shapes_);
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+
+    MapIteratorFunction map_func;
+    CapturedFunction* raw_captured_func = captured_func.get();
+    if (indices.empty()) {
+      map_func = [](IteratorContext* ctx,
+                    InstantiatedCapturedFunction* inst_captured_func,
+                    std::vector<Tensor> args,
+                    std::vector<Tensor>* out_tensors) {
+        return inst_captured_func->Run(ctx, std::move(args), out_tensors);
+      };
+    } else {
+      std::vector<bool> can_move = ComputeMoveVector(indices);
+      map_func = [raw_captured_func, indices, can_move](
+                     IteratorContext* ctx,
+                     InstantiatedCapturedFunction* inst_captured_func,
+                     std::vector<Tensor> args,
+                     std::vector<Tensor>* out_tensors) {
+        const std::vector<Tensor>& captured_inputs =
+            raw_captured_func->captured_inputs();
+        size_t num_args = args.size();
+        for (size_t i = 0; i < indices.size(); ++i) {
+          if (indices[i] < num_args) {
+            if (can_move[i]) {
+              out_tensors->push_back(std::move(args[indices[i]]));
+            } else {
+              out_tensors->push_back(args[indices[i]]);
+            }
+          } else {
+            out_tensors->push_back(captured_inputs[indices[i] - num_args]);
+          }
+        }
+        return Status::OK();
+      };
+    }
+
+    *output =
+        new Dataset(ctx, input, func_, std::move(captured_func), output_types_,
+                    output_shapes_, use_inter_op_parallelism_,
+                    std::move(map_func), preserve_cardinality_);
   }
 
  private:
@@ -54,13 +101,18 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
             const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
+            const std::vector<PartialTensorShape>& output_shapes,
+            bool use_inter_op_parallelism, MapIteratorFunction map_func,
+            bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
+          use_inter_op_parallelism_(use_inter_op_parallelism),
+          preserve_cardinality_(preserve_cardinality),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
-          output_shapes_(output_shapes) {
+          output_shapes_(output_shapes),
+          map_func_(std::move(map_func)) {
       input_->Ref();
     }
 
@@ -68,8 +120,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Map")}));
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Map")}, map_func_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -81,6 +133,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "MapDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -99,16 +153,34 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
-      AttrValue f;
-      b->BuildAttrValue(func_, &f);
+
+      // Attr: f
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      AttrValue f_attr;
+      b->BuildAttrValue(func_, &f_attr);
+
+      // Attr: Targuments
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
 
+      // Attr: use_inter_op_parallelism
+      AttrValue use_inter_op_parallelism_attr;
+      b->BuildAttrValue(use_inter_op_parallelism_,
+                        &use_inter_op_parallelism_attr);
+
+      // Attr: preserve_cardinality
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
+
       TF_RETURN_IF_ERROR(b->AddDataset(
           this, {std::make_pair(0, input_graph_node)},  // Single tensor inputs.
           {std::make_pair(1, other_arguments)},         // Tensor list inputs.
-          {std::make_pair("f", f),
-           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          {std::make_pair("f", f_attr),
+           std::make_pair("Targuments", other_arguments_types_attr),
+           std::make_pair("use_inter_op_parallelism",
+                          use_inter_op_parallelism_attr),
+           std::make_pair("preserve_cardinality",
+                          preserve_cardinality_attr)},  // Attrs
           output));
       return Status::OK();
     }
@@ -116,13 +188,14 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
+      explicit Iterator(const Params& params, MapIteratorFunction map_func)
+          : DatasetIterator<Dataset>(params), map_func_(std::move(map_func)) {}
 
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -139,21 +212,34 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
-        // TODO(mrry): Avoid blocking a threadpool thread. We will need to
-        // stack-rip the iterators and use async kernels.
-        Status s =
-            dataset()->captured_func_->Run(ctx, std::move(args), out_tensors);
+        Status s = map_func_(ctx, instantiated_captured_func_.get(), args,
+                             out_tensors);
         if (errors::IsOutOfRange(s)) {
-          // `f` may deliberately raise `errors::OutOfRange` to indicate
-          // that we should terminate the iteration early.
-          *end_of_sequence = true;
-          return Status::OK();
+          if (dataset()->preserve_cardinality_) {
+            // To guarantee that the transformation preserves the cardinality of
+            // the dataset, we convert `OutOfRange` to `InvalidArgument` as the
+            // former may be interpreted by a caller as the end of sequence.
+            return errors::InvalidArgument(
+                "Function invocation produced OutOfRangeError: ",
+                s.error_message());
+          } else {
+            // `f` may deliberately raise `errors::OutOfRange` to indicate
+            // that we should terminate the iteration early.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
         } else {
           return s;
         }
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
@@ -167,22 +253,33 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
      private:
       std::unique_ptr<IteratorBase> input_impl_;
+      const MapIteratorFunction map_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
     const NameAttrList func_;
+    const bool use_inter_op_parallelism_;
+    const bool preserve_cardinality_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const MapIteratorFunction map_func_;
   };
 
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
   bool use_inter_op_parallelism_;
+  bool preserve_cardinality_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("MapDataset").Device(DEVICE_CPU), MapDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalMapDataset")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("input_dataset")
+                            .HostMemory("handle"),
+                        MapDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 9aa505f4f1526104178be020e7ae1c7c3f929eff..069d61d80d4f00eecdd77356626d7278c0842445 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/cpu_info.h"
 
@@ -60,6 +60,8 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "ModelDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -86,22 +88,32 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       }
 
       Status Initialize(IteratorContext* ctx) override {
-        IteratorContext ctx_with_model(CreateParams(ctx));
-        return dataset()->input_->MakeIterator(&ctx_with_model, prefix(),
-                                               &input_impl_);
+        IteratorContext::Params params(ctx);
+        params.model = model_;
+        return dataset()->input_->MakeIterator(
+            IteratorContext(std::move(params)), prefix(), &input_impl_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(EnsureOptimizeThreadStarted(ctx));
-        IteratorContext ctx_with_model(CreateParams(ctx));
-        return input_impl_->GetNext(&ctx_with_model, out_tensors,
-                                    end_of_sequence);
+        IteratorContext::Params params(ctx);
+        {
+          mutex_lock l(mu_);
+          TF_RETURN_IF_ERROR(EnsureOptimizeThreadStarted(ctx));
+          params.model = model_;
+        }
+        return input_impl_->GetNext(IteratorContext(std::move(params)),
+                                    out_tensors, end_of_sequence);
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
@@ -115,19 +127,13 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      IteratorContext::Params CreateParams(IteratorContext* ctx) {
-        IteratorContext::Params params = ctx->params();
-        params.model = model_;
-        return params;
-      }
-
      private:
       Status EnsureOptimizeThreadStarted(IteratorContext* ctx)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (!optimize_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
           optimize_thread_.reset(ctx->env()->StartThread(
-              {}, "optimize_thread",
+              {}, "tf_data_model",
               [this, new_ctx]() { OptimizeThread(new_ctx); }));
         }
         return Status::OK();
@@ -169,7 +175,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       std::shared_ptr<model::Model> model_;
       std::unique_ptr<Thread> optimize_thread_ GUARDED_BY(mu_);
       bool cancelled_ GUARDED_BY(mu_) = false;
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_;
     };
 
     const DatasetBase* input_;
diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index d909b9e9d374a421efb2c4d30aa3c71aa72711eb..ba2125a66eb98985ebd0ae8f55bfc239997ad6df 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
@@ -40,18 +41,21 @@ using MultiDeviceIteratorCallback =
 
 class MultiDeviceIterator : public ResourceBase {
  public:
-  MultiDeviceIterator(const DataTypeVector& output_types,
-                      const std::vector<PartialTensorShape>& output_shapes,
-                      const std::vector<string>& devices,
-                      std::unique_ptr<FunctionLibraryDefinition> flib_def,
-                      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
-                      FunctionLibraryRuntime* lib)
+  MultiDeviceIterator(
+      const DataTypeVector& output_types,
+      const std::vector<PartialTensorShape>& output_shapes,
+      const std::vector<string>& devices,
+      std::unique_ptr<FunctionLibraryDefinition> flib_def,
+      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
+      FunctionLibraryRuntime* lib,
+      std::unique_ptr<FunctionHandleCache> function_handle_cache)
       : output_types_(output_types),
         output_shapes_(output_shapes),
         devices_(devices),
         flib_def_(std::move(flib_def)),
         pflr_(std::move(pflr)),
-        lib_(lib) {
+        lib_(lib),
+        function_handle_cache_(std::move(function_handle_cache)) {
     DCHECK(lib_ != nullptr);
   }
 
@@ -86,12 +90,20 @@ class MultiDeviceIterator : public ResourceBase {
   void GetNextFromShard(IteratorContext* ctx, int shard_num,
                         int64 incarnation_id,
                         MultiDeviceIteratorCallback callback) {
-    if (lib_ != nullptr) {
-      ctx->set_lib(lib_);
+    if (ctx->lib() == lib_) {
+      tf_shared_lock l(mu_);
+      multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id,
+                                             std::move(callback));
+    } else {
+      IteratorContext::Params params(ctx);
+      params.lib = lib_;
+      params.function_handle_cache = function_handle_cache_.get();
+      params.resource_mgr = &resource_mgr_;
+      IteratorContext iter_ctx(std::move(params));
+      tf_shared_lock l(mu_);
+      multi_device_buffer_->GetNextFromShard(
+          &iter_ctx, shard_num, incarnation_id, std::move(callback));
     }
-    tf_shared_lock l(mu_);
-    multi_device_buffer_->GetNextFromShard(ctx, shard_num, incarnation_id,
-                                           std::move(callback));
   }
 
   const DataTypeVector& output_types() const { return output_types_; }
@@ -110,6 +122,12 @@ class MultiDeviceIterator : public ResourceBase {
     return lib_;
   }
 
+  FunctionHandleCache* function_handle_cache() {
+    return function_handle_cache_.get();
+  }
+
+  ResourceMgr* resource_mgr() { return &resource_mgr_; }
+
  private:
   // A private class that uses a background thread to keep a per device buffer
   // full.
@@ -200,7 +218,7 @@ class MultiDeviceIterator : public ResourceBase {
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!background_thread_) {
         background_thread_.reset(ctx->env()->StartThread(
-            {}, "multi_device_iterator_background_thread",
+            {}, "tf_data_multi_device_iterator",
             std::bind(&MultiDeviceIterator::MultiDeviceBuffer::BackgroundThread,
                       this, new IteratorContext(*ctx))));
       }
@@ -334,6 +352,8 @@ class MultiDeviceIterator : public ResourceBase {
   const std::unique_ptr<FunctionLibraryDefinition> flib_def_;
   const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
   FunctionLibraryRuntime* const lib_ = nullptr;  // not owned.
+  const std::unique_ptr<FunctionHandleCache> function_handle_cache_;
+  ResourceMgr resource_mgr_;
   std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
 
   int64 incarnation_id_ GUARDED_BY(mu_) = 0;
@@ -377,21 +397,24 @@ class MultiDeviceIteratorHandleOp : public OpKernel {
         std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
         OP_REQUIRES_OK(context, context->function_library()->Clone(
                                     &flib_def, &pflr, &lib));
+        std::unique_ptr<FunctionHandleCache> function_handle_cache(
+            new FunctionHandleCache(lib));
         ResourceMgr* mgr = context->resource_manager();
         OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
 
         MultiDeviceIterator* resource;
-        OP_REQUIRES_OK(
-            context,
-            mgr->LookupOrCreate<MultiDeviceIterator>(
-                cinfo_.container(), cinfo_.name(), &resource,
-                [this, lib, &flib_def, &pflr](MultiDeviceIterator** ret)
-                    EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                      *ret = new MultiDeviceIterator(
-                          output_types_, output_shapes_, devices_,
-                          std::move(flib_def), std::move(pflr), lib);
-                      return Status::OK();
-                    }));
+        OP_REQUIRES_OK(context,
+                       mgr->LookupOrCreate<MultiDeviceIterator>(
+                           cinfo_.container(), cinfo_.name(), &resource,
+                           [this, lib, &flib_def, &pflr,
+                            &function_handle_cache](MultiDeviceIterator** ret)
+                               EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                                 *ret = new MultiDeviceIterator(
+                                     output_types_, output_shapes_, devices_,
+                                     std::move(flib_def), std::move(pflr), lib,
+                                     std::move(function_handle_cache));
+                                 return Status::OK();
+                               }));
 
         Status s = VerifyResource(resource);
         if (TF_PREDICT_FALSE(!s.ok())) {
@@ -455,8 +478,11 @@ class MultiDeviceIteratorInitOp : public OpKernel {
     core::ScopedUnref unref(resource);
 
     std::unique_ptr<IteratorBase> iterator;
-    IteratorContext iter_ctx(ctx);
-    iter_ctx.set_lib(resource->lib());
+    IteratorContext::Params params(ctx);
+    params.lib = resource->lib();
+    params.function_handle_cache = resource->function_handle_cache();
+    params.resource_mgr = resource->resource_mgr();
+    IteratorContext iter_ctx(std::move(params));
     OP_REQUIRES_OK(
         ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
     int64 incarnation_id;
@@ -478,11 +504,8 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
  public:
   explicit MultiDeviceIteratorGetNextFromShardOp(OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx),
-        thread_pool_(new thread::ThreadPool(
-            ctx->env(), ThreadOptions(),
-            strings::StrCat("multi_device_iterator_get_next_thread_",
-                            SanitizeThreadSuffix(name())),
-            1 /* num_threads */, false /* low_latency_hint */)) {}
+        background_worker_(ctx->env(),
+                           "tf_data_multi_device_iterator_get_next") {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     const Tensor* tensor_shard_num;
@@ -497,18 +520,8 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
     MultiDeviceIterator* iterator;
     OP_REQUIRES_OK_ASYNC(
         ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
-    thread_pool_->Schedule(std::bind(
+    background_worker_.Schedule(std::bind(
         [ctx, iterator, shard_num, incarnation_id](DoneCallback done) {
-          IteratorContext::Params params;
-          params.env = ctx->env();
-          params.runner = *(ctx->runner());
-          params.function_library = iterator->function_library();
-          DeviceBase* device = ctx->function_library()->device();
-          params.allocator_getter = [device](AllocatorAttributes attrs) {
-            return device->GetAllocator(attrs);
-          };
-          IteratorContext iter_ctx(std::move(params));
-
           MultiDeviceIteratorCallback callback = std::bind(
               [ctx](const HostBufferElement& elem, DoneCallback done) {
                 // iterator->Unref();
@@ -526,6 +539,9 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
               },
               std::placeholders::_1, std::move(done));
 
+          IteratorContext::Params params(ctx);
+          params.function_library = iterator->function_library();
+          IteratorContext iter_ctx(std::move(params));
           iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id,
                                      callback);
           iterator->Unref();
@@ -534,7 +550,7 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
   }
 
  private:
-  std::unique_ptr<thread::ThreadPool> thread_pool_;
+  BackgroundWorker background_worker_;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index 1cb7caa73837b462083b7ceedbe5b719191c9dfc..9c50d8050a82397f1578ab3f577ef5ad77f81767 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -17,7 +17,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -26,8 +28,10 @@ limitations under the License.
 #include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/grappler_item_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
-#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -36,7 +40,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 class OptimizeDatasetOp : public UnaryDatasetOpKernel {
  public:
@@ -55,8 +59,13 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
         ctx, ParseVectorArgument<string>(ctx, "optimizations", &optimizations));
     Dataset* dataset =
         new Dataset(ctx, input, optimizations, output_types_, output_shapes_);
-    OP_REQUIRES_OK(ctx, dataset->Optimize(ctx));
-    *output = dataset;
+    Status s = dataset->Optimize(ctx);
+    if (s.ok()) {
+      *output = dataset;
+    } else {
+      dataset->Unref();
+      OP_REQUIRES_OK(ctx, s);
+    }
   }
 
  private:
@@ -67,6 +76,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
         : DatasetBase(DatasetContext(ctx)),
+          optimized_input_(nullptr),
           input_(input),
           optimizations_(optimizations),
           output_types_(output_types),
@@ -76,7 +86,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override {
       input_->Unref();
-      optimized_input_->Unref();
+      if (optimized_input_) {
+        optimized_input_->Unref();
+      }
     }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
@@ -94,9 +106,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       Node* input_node = nullptr;
       SerializationContext::Params params;
       std::vector<std::pair<string, Tensor>> input_list;
-      params.allow_stateful_functions = true;
       params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
       params.input_list = &input_list;
+      params.optimization_only = true;
       SerializationContext serialization_ctx(params);
       TF_RETURN_IF_ERROR(
           db.AddInputDataset(&serialization_ctx, input_, &input_node));
@@ -113,6 +125,19 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       // using the optimized function library.
       TF_RETURN_IF_ERROR(
           ctx->function_library()->Clone(&flib_def_, &pflr_, &lib_));
+
+      // Create a FunctionHandleCache.
+      function_handle_cache_.reset(new FunctionHandleCache(lib_));
+
+      // Some functions may have been modified without having their names
+      // changed (for example, nested dataset graphs from FlatMap or
+      // Interleave). To avoid name conflicts, we remove these functions from
+      // flib_def_ before adding the optimized function library.
+      for (const FunctionDef& fd : graph_def.library().function()) {
+        if (flib_def_->Find(fd.signature().name()) != nullptr) {
+          TF_RETURN_IF_ERROR(flib_def_->RemoveFunction(fd.signature().name()));
+        }
+      }
       TF_RETURN_IF_ERROR(flib_def_->AddLibrary(graph_def.library()));
 
       Graph graph(OpRegistry::Global());
@@ -137,6 +162,8 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "OptimizeDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -154,22 +181,30 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        IteratorContext::Params params = ctx->params();
+        IteratorContext::Params params(ctx);
         params.lib = dataset()->lib_;
+        params.function_handle_cache = dataset()->function_handle_cache_.get();
         return dataset()->optimized_input_->MakeIterator(
-            IteratorContext(params), prefix(), &input_impl_);
+            IteratorContext(std::move(params)), prefix(), &input_impl_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        IteratorContext::Params params = ctx->params();
+        IteratorContext::Params params(ctx);
         params.lib = dataset()->lib_;
-        return input_impl_->GetNext(IteratorContext(params), out_tensors,
-                                    end_of_sequence);
+        params.function_handle_cache = dataset()->function_handle_cache_.get();
+        return input_impl_->GetNext(IteratorContext(std::move(params)),
+                                    out_tensors, end_of_sequence);
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         return Status::OK();
@@ -185,13 +220,60 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_;
     };
 
+    void AddFakeSinks(FunctionDef* function_def) {
+      int counter = 0;
+      for (const auto& output : function_def->signature().output_arg()) {
+        NodeDef* node = function_def->add_node_def();
+        tensorflow::grappler::function_utils::SetUniqueFunctionNodeName(
+            strings::StrCat("FakeSink", counter++), function_def, node);
+        node->set_op("Identity");
+        node->add_input(function_def->ret().at(output.name()));
+        (*node->mutable_attr())["T"].set_type(output.type());
+
+        (*function_def->mutable_ret())[output.name()] =
+            strings::StrCat(node->name(), ":output:0");
+      }
+    }
+
+    void RemoveFakeSinks(FunctionDef* function_def) {
+      // Map from identity node names to their input tensor strings
+      std::map<string, string> identity_map;
+      for (const auto& node : function_def->node_def()) {
+        if (node.op() == "Identity" && node.input_size() == 1) {
+          identity_map[node.name()] = node.input(0);
+        }
+      }
+      for (const auto& output_arg : function_def->signature().output_arg()) {
+        const string& tensor = function_def->ret().at(output_arg.name());
+        const string& output_node = tensor.substr(0, tensor.find(':'));
+        if (identity_map.find(output_node) != identity_map.end()) {
+          (*function_def->mutable_ret())[output_arg.name()] =
+              identity_map.at(output_node);
+        }
+      }
+    }
+
     Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
                               string* output_node) {
-      // Add a fake sink node to allow rewriting the actual sink node.
+      // Add an identity node as the fetch node, otherwise we might get
+      // 'placeholder is both fed and fetched' errors in some cases when using
+      // input list with placeholder dataset nodes.
       NodeDef* node = graph_def->mutable_node()->Add();
-      node->set_name("FakeSink");
-      node->set_op("SinkDataset");
+      tensorflow::grappler::graph_utils::SetUniqueGraphNodeName(
+          "Sink", graph_def, node);
+      node->set_op("Identity");
       node->add_input(*output_node);
+      (*node->mutable_attr())["T"].set_type(DT_VARIANT);
+      *output_node = node->name();
+
+      // Add fake sink node to graph and functions to allow rewriting the actual
+      // sink nodes.
+      // TODO(b/118820916): When MetaOptimizer adds provisions for function
+      // retvals to be optimizable, we will no longer need this.
+      for (auto& function_def :
+           *graph_def->mutable_library()->mutable_function()) {
+        AddFakeSinks(&function_def);
+      }
 
       // Create metagraph.
       MetaGraphDef meta_graph_def;
@@ -200,11 +282,13 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       // Grappler determines fetch ops from collection 'train_op'.
       CollectionDef collection_def;
       auto node_list = collection_def.mutable_node_list();
-      node_list->add_value("FakeSink");
+      node_list->add_value(*output_node);
       (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
 
       // Create Grappler item.
-      tensorflow::RewriterConfig rewriter_config;
+      tensorflow::ConfigProto config;
+      RewriterConfig& rewriter_config =
+          *config.mutable_graph_options()->mutable_rewrite_options();
       for (const string& optimization : optimizations_) {
         rewriter_config.add_optimizers(optimization);
       }
@@ -214,6 +298,18 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       // moment (e.g. because we have no cost model for dataset ops).
       if (optimizations_.empty()) {
         rewriter_config.add_optimizers("non-existent");
+      } else {
+        // If we apply custom dataset optimizers, explicitly trigger a subset of
+        // standard grappler optimizations to further optimize modified dataset
+        // graphs (e.g. performing constant folding on merged functions,
+        // removing unused graph nodes)
+        // TODO(b/118175421): This should be part of the tf.data optimization
+        // pass manager.
+        // TODO(b/120437209): Apply `constfold` optimization when it is fixed.
+        for (const auto& optimizer :
+             {"pruning", "function", "shape", "arithmetic", "dependency"}) {
+          rewriter_config.add_optimizers(optimizer);
+        }
       }
       tensorflow::grappler::ItemConfig item_config;
       item_config.apply_optimizations = true;
@@ -231,14 +327,14 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
         }
       }
       TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-          *grappler_item, rewriter_config, ctx->device(), &cluster, graph_def));
-
-      // Set `output_node` to the input of the fake sink node.
-      {
-        grappler::GraphView graph(graph_def);
-        grappler::GraphView::InputPort input_port =
-            graph.GetInputPort("FakeSink", 0);
-        *output_node = graph.GetRegularFanin(input_port).node->name();
+          *grappler_item, config, ctx->device(), &cluster, graph_def));
+
+      // Remove fake sinks after optimizations are done.
+      // TODO(b/118820916): When MetaOptimizer adds provisions for function
+      // retvals to be optimizable, we will no longer need this.
+      for (auto& function_def :
+           *graph_def->mutable_library()->mutable_function()) {
+        RemoveFakeSinks(&function_def);
       }
 
       return Status::OK();
@@ -248,6 +344,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
     FunctionLibraryRuntime* lib_ = nullptr;
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_ = nullptr;
     std::unique_ptr<FunctionLibraryDefinition> flib_def_ = nullptr;
+    std::unique_ptr<FunctionHandleCache> function_handle_cache_ = nullptr;
     const DatasetBase* input_;
     const std::vector<string> optimizations_;
     const DataTypeVector output_types_;
diff --git a/tensorflow/core/kernels/data/optional_ops.cc b/tensorflow/core/kernels/data/optional_ops.cc
index 2ab5c83082b3f97f354a4f39db02c1012557b6a4..d8a7f21c5f99c6d99e506847e00cabc6bd49168f 100644
--- a/tensorflow/core/kernels/data/optional_ops.cc
+++ b/tensorflow/core/kernels/data/optional_ops.cc
@@ -22,75 +22,6 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 namespace {
-const char kOptionalVariantTypeName[] = "tensorflow::data::Optional";
-
-// An `OptionalVariant` can represent either an "actual value" (a tuple of
-// tensors) or "none", and may be stored in a DT_VARIANT tensor.
-class OptionalVariant {
- public:
-  // Create an `OptionalVariant` with no actual value.
-  OptionalVariant() : values_(nullptr) {}
-
-  // Create an `OptionalVariant` with the actual value given by the tuple of
-  // tensors in `values`.
-  explicit OptionalVariant(std::vector<Tensor> values)
-      : values_(new std::vector<Tensor>(std::move(values))) {}
-
-  OptionalVariant(const OptionalVariant& other) : values_(other.values_) {}
-
-  // Returns true if `this` represents an actual value.
-  bool has_value() const { return values_ != nullptr; }
-
-  // REQUIRES: `this->has_value()` must be true.
-  const std::vector<Tensor>& get_values() const {
-    CHECK(values_) << "Tried to get values from an empty OptionalVariant";
-    return *values_;
-  }
-
-  // Implementations of the necessary methods for using `OptionalVariant`
-  // objects in DT_VARIANT tensors.
-  string TypeName() const { return kOptionalVariantTypeName; }
-  void Encode(VariantTensorData* data) const {
-    data->set_metadata(values_ != nullptr);
-    if (values_ != nullptr) {
-      for (const auto& t : *values_) {
-        *(data->add_tensors()) = t;
-      }
-    }
-  }
-
-  bool Decode(const VariantTensorData& data) {
-    if (data.type_name() != TypeName()) {
-      return false;
-    }
-    bool has_value = false;
-    if (!data.get_metadata(&has_value)) {
-      return false;
-    }
-    if (has_value) {
-      values_.reset(new std::vector<Tensor>(data.tensors()));
-    } else {
-      values_.reset();
-    }
-    return true;
-  }
-
-  string DebugString() const {
-    if (values_) {
-      return strings::StrCat("OptionalVariant<", "values: (",
-                             str_util::Join(*values_, ", ",
-                                            [](string* s, const Tensor& elem) {
-                                              *s = elem.DebugString();
-                                            }),
-                             ")>");
-    } else {
-      return strings::StrCat("OptionalVariant<None>");
-    }
-  }
-
- private:
-  std::shared_ptr<const std::vector<Tensor>> values_;
-};
 
 class OptionalNoneOp : public OpKernel {
  public:
@@ -143,6 +74,12 @@ class OptionalGetValueOp : public OpKernel {
   explicit OptionalGetValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES(
+        ctx, output_shapes_.size() == output_types_.size(),
+        errors::InvalidArgument(
+            "output_types and output_shapes must be same length, got:\n",
+            "output_types: ", output_types_.size(), "\n",
+            "output_shapes: ", output_shapes_.size()));
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -162,6 +99,10 @@ class OptionalGetValueOp : public OpKernel {
         ctx, optional->has_value(),
         errors::InvalidArgument("The given optional does not have a value."));
     const auto& components = optional->get_values();
+    OP_REQUIRES(ctx, components.size() == output_types_.size(),
+                errors::InvalidArgument(
+                    "The given optional has ", components.size(),
+                    " components, expected ", output_types_.size()));
     for (int i = 0; i < components.size(); ++i) {
       OP_REQUIRES(
           ctx, components[i].dtype() == output_types_[i],
@@ -186,23 +127,27 @@ class OptionalGetValueOp : public OpKernel {
   std::vector<PartialTensorShape> output_shapes_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU).Priority(2),
                         OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU).Priority(1),
                         OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalFromValue").Device(DEVICE_CPU),
-                        OptionalFromValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalFromValue").Device(DEVICE_GPU),
-                        OptionalFromValueOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_CPU).Priority(2),
+    OptionalFromValueOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_GPU).Priority(1),
+    OptionalFromValueOp);
 
-REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU).Priority(2),
                         OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(
-    Name("OptionalHasValue").Device(DEVICE_GPU).HostMemory("has_value"),
-    OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("has_value")
+                            .Priority(1),
+                        OptionalHasValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU).Priority(2),
                         OptionalGetValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU).Priority(1),
                         OptionalGetValueOp);
 
 static Status OptionalDeviceCopy(
@@ -213,15 +158,7 @@ static Status OptionalDeviceCopy(
     std::vector<Tensor> to_values;
     to_values.reserve(from_values.size());
     for (const Tensor& t : from_values) {
-      if (t.dtype() == DT_VARIANT) {
-        // TODO(b/116349787): Implement support for nested variants.
-        return errors::Unimplemented(
-            "Support for copying nested variants to device has not yet been "
-            "implemented.");
-      }
-    }
-    for (const Tensor& t : from_values) {
-      if (DMAHelper::CanUseDMA(&t)) {
+      if (DMAHelper::CanUseDMA(&t) || t.dtype() == DT_VARIANT) {
         Tensor tmp(t.dtype());
         TF_RETURN_IF_ERROR(copy(t, &tmp));
         to_values.push_back(std::move(tmp));
@@ -272,5 +209,20 @@ Status WriteOptionalNoneToOutput(OpKernelContext* ctx, int output_index) {
   return Status::OK();
 }
 
+REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP,
+                                         DEVICE_CPU, OptionalVariant,
+                                         OptionalZerosLike<CPUDevice>);
+
+REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU,
+                                          OptionalVariant,
+                                          OptionalBinaryAdd<CPUDevice>);
+
+Status OptionalShape(const OptionalVariant& x, TensorShape* s) {
+  *s = TensorShape({});
+  return Status::OK();
+}
+
+REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(OptionalVariant, OptionalShape);
+
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/optional_ops.cu.cc b/tensorflow/core/kernels/data/optional_ops.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb4a95a6f2245665ff70922df733909e9ba996df
--- /dev/null
+++ b/tensorflow/core/kernels/data/optional_ops.cu.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#define EIGEN_USE_THREADS
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/data/optional_ops.h"
+
+#include "tensorflow/core/framework/variant_op_registry.h"
+
+namespace tensorflow {
+namespace data {
+
+REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP,
+                                         DEVICE_GPU, OptionalVariant,
+                                         OptionalZerosLike<GPUDevice>);
+
+REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU,
+                                          OptionalVariant,
+                                          OptionalBinaryAdd<GPUDevice>);
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/data/optional_ops.h b/tensorflow/core/kernels/data/optional_ops.h
index 2cbf2933f50a11b01ca19739e6f3318b4816e800..ef14e843115da0c37d79c6be13b8064c78c072d5 100644
--- a/tensorflow/core/kernels/data/optional_ops.h
+++ b/tensorflow/core/kernels/data/optional_ops.h
@@ -19,10 +19,13 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/util/tensor_ops_util.h"
 
 namespace tensorflow {
 namespace data {
 
+const char kOptionalVariantTypeName[] = "tensorflow::data::Optional";
+
 // Stores a DT_VARIANT value representing an Optional with the given value
 // in the `output_index`^th output of the given kernel execution context.
 Status WriteOptionalWithValueToOutput(OpKernelContext* ctx, int output_index,
@@ -32,6 +35,122 @@ Status WriteOptionalWithValueToOutput(OpKernelContext* ctx, int output_index,
 // in the `output_index`^th output of the given kernel execution context.
 Status WriteOptionalNoneToOutput(OpKernelContext* ctx, int output_index);
 
+// An `OptionalVariant` can represent either an "actual value" (a tuple of
+// tensors) or "none", and may be stored in a DT_VARIANT tensor.
+class OptionalVariant {
+ public:
+  // Create an `OptionalVariant` with no actual value.
+  OptionalVariant() : values_(nullptr) {}
+
+  // Create an `OptionalVariant` with the actual value given by the tuple of
+  // tensors in `values`.
+  explicit OptionalVariant(std::vector<Tensor> values)
+      : values_(new std::vector<Tensor>(std::move(values))) {}
+
+  OptionalVariant(const OptionalVariant& other) : values_(other.values_) {}
+
+  // Returns true if `this` represents an actual value.
+  bool has_value() const { return values_ != nullptr; }
+
+  // REQUIRES: `this->has_value()` must be true.
+  const std::vector<Tensor>& get_values() const {
+    DCHECK(values_) << "Tried to get values from an empty OptionalVariant";
+    return *values_;
+  }
+
+  // Implementations of the necessary methods for using `OptionalVariant`
+  // objects in DT_VARIANT tensors.
+  string TypeName() const { return kOptionalVariantTypeName; }
+  void Encode(VariantTensorData* data) const {
+    data->set_metadata(values_ != nullptr);
+    if (values_ != nullptr) {
+      for (const auto& t : *values_) {
+        *(data->add_tensors()) = t;
+      }
+    }
+  }
+
+  bool Decode(const VariantTensorData& data) {
+    if (data.type_name() != TypeName()) {
+      return false;
+    }
+    bool has_value = false;
+    if (!data.get_metadata(&has_value)) {
+      return false;
+    }
+    if (has_value) {
+      values_.reset(new std::vector<Tensor>(data.tensors()));
+    } else {
+      values_.reset();
+    }
+    return true;
+  }
+
+  string DebugString() const {
+    if (values_) {
+      return strings::StrCat("OptionalVariant<", "values: (",
+                             str_util::Join(*values_, ", ",
+                                            [](string* s, const Tensor& elem) {
+                                              *s = elem.DebugString();
+                                            }),
+                             ")>");
+    } else {
+      return strings::StrCat("OptionalVariant<None>");
+    }
+  }
+
+ private:
+  std::shared_ptr<const std::vector<Tensor>> values_;
+};
+
+template <typename Device>
+Status OptionalZerosLike(OpKernelContext* ctx, const OptionalVariant& x,
+                         OptionalVariant* y) {
+  if (!x.has_value()) {
+    *y = x;
+    return Status::OK();
+  }
+  std::vector<Tensor> zero_tensors;
+  for (const Tensor& tensor : x.get_values()) {
+    Tensor zero_t;
+    TF_RETURN_IF_ERROR(ZerosLikeTensor<Device>(ctx, tensor, &zero_t));
+    zero_tensors.push_back(std::move(zero_t));
+  }
+  *y = OptionalVariant(zero_tensors);
+  return Status::OK();
+}
+
+template <typename Device>
+Status OptionalBinaryAdd(OpKernelContext* ctx, const OptionalVariant& a,
+                         const OptionalVariant& b, OptionalVariant* out) {
+  // TODO(skyewm): should adding a value to a non-value be a no-op instead?
+  if (a.has_value() != b.has_value()) {
+    return errors::InvalidArgument(
+        "Cannot add optionals because one has a value and the other doesn't.");
+  }
+  if (!a.has_value()) {
+    *out = a;
+    return Status::OK();
+  }
+  if (a.get_values().size() != b.get_values().size()) {
+    return errors::InvalidArgument(
+        "Cannot add optionals because they have different numbers of "
+        "components (",
+        a.get_values().size(), " vs. ", b.get_values().size(), ").");
+  }
+  std::vector<Tensor> out_tensors;
+  for (int i = 0; i < a.get_values().size(); ++i) {
+    const Tensor& a_tensor = a.get_values()[i];
+    const Tensor& b_tensor = b.get_values()[i];
+    Tensor out_tensor;
+    TF_RETURN_IF_ERROR(
+        BinaryAddTensors<Device>(ctx, a_tensor, b_tensor, &out_tensor));
+    out_tensors.push_back(std::move(out_tensor));
+  }
+  *out = OptionalVariant(out_tensors);
+  return Status::OK();
+}
+
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 7b01c3b4e0d67b40359f1fe63820cecae5dc9ea7..0fff4c53706269538f770889744e21fffcae3601 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_util.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
@@ -152,6 +152,15 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
                              ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -207,7 +216,6 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        AddConstantParameter(ctx, "batch_size", dataset()->batch_size_);
         return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
       }
 
@@ -309,9 +317,10 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
 
           // 2. Copy each batch element to the appropriate location in
           // the output component tensor.
-          Tensor batch_component(ctx->allocator({}),
-                                 output_dtypes()[component_index],
-                                 batch_component_shape);
+          out_tensors->emplace_back(ctx->allocator({}),
+                                    output_dtypes()[component_index],
+                                    batch_component_shape);
+          Tensor& batch_component = out_tensors->back();
           TF_RETURN_IF_ERROR(batch_util::SetElementZero(
               &batch_component, dataset()->padding_values_[component_index]));
 
@@ -331,13 +340,18 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
                   batch_elements[i][component_index], &batch_component, i));
             }
           }
-          out_tensors->push_back(std::move(batch_component));
         }
         *end_of_sequence = false;
         return Status::OK();
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         dataset()->batch_size_);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impl_)
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 6b6b3d6ab9cf21b37fdba46b15174bf9df727e7c..2f6d91e863401ca4cc56187a9423ae406b5f651a 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -17,1059 +17,24 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
-#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 cycle_length = 0;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "cycle_length", &cycle_length));
-    OP_REQUIRES(ctx, cycle_length > 0,
-                errors::InvalidArgument("`cycle_length` must be > 0"));
-
-    int64 block_length = 0;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "block_length", &block_length));
-    OP_REQUIRES(ctx, block_length > 0,
-                errors::InvalidArgument("`block_length` must be > 0"));
-
-    bool sloppy = false;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "sloppy", &sloppy));
-
-    int64 buffer_output_elements = 0;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "buffer_output_elements",
-                                            &buffer_output_elements));
-    OP_REQUIRES(
-        ctx, buffer_output_elements > 0,
-        errors::InvalidArgument("`buffer_output_elements` must be > 0"));
-
-    int64 prefetch_input_elements = 0;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefetch_input_elements",
-                                            &prefetch_input_elements));
-    OP_REQUIRES(
-        ctx, prefetch_input_elements >= 0,
-        errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
-
-    std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
-                                      &captured_func));
-
-    *output =
-        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
-                    cycle_length, block_length, sloppy, buffer_output_elements,
-                    prefetch_input_elements, output_types_, output_shapes_);
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
-            std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
-            int64 block_length, bool sloppy, int64 buffer_output_elements,
-            int64 prefetch_input_elements, const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
-        : DatasetBase(DatasetContext(ctx)),
-          input_(input),
-          interleave_func_(func),
-          captured_func_(std::move(captured_func)),
-          cycle_length_(cycle_length),
-          block_length_(block_length),
-          sloppy_(sloppy),
-          buffer_output_elements_(buffer_output_elements),
-          prefetch_input_elements_(prefetch_input_elements),
-          output_types_(output_types),
-          output_shapes_(output_shapes) {
-      input_->Ref();
-    }
-
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() const override {
-      return "ParallelInterleaveDatasetOp::Dataset";
-    }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name()));
-      Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
-      Node* cycle_length_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
-      Node* block_length_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
-      Node* sloppy_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(sloppy_, &sloppy_node));
-      Node* buffer_output_elements_node;
-      TF_RETURN_IF_ERROR(
-          b->AddScalar(buffer_output_elements_, &buffer_output_elements_node));
-      Node* prefetch_input_elements_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(prefetch_input_elements_,
-                                      &prefetch_input_elements_node));
-      DataTypeVector other_arguments_types;
-      other_arguments_types.reserve(captured_func_->captured_inputs().size());
-      std::vector<Node*> other_arguments;
-      other_arguments.reserve(captured_func_->captured_inputs().size());
-      for (const Tensor& t : captured_func_->captured_inputs()) {
-        Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-        other_arguments.emplace_back(node);
-        other_arguments_types.emplace_back(t.dtype());
-      }
-      AttrValue f;
-      b->BuildAttrValue(interleave_func_, &f);
-      AttrValue other_arguments_types_attr;
-      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
-
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this,
-          {{0, input_node},
-           {2, cycle_length_node},
-           {3, block_length_node},
-           {4, sloppy_node},
-           {5, buffer_output_elements_node},
-           {6, prefetch_input_elements_node}},
-          {{1, other_arguments}},
-          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
-      return Status::OK();
-    }
-
-   private:
-    int64 num_threads() const {
-      return cycle_length_ + prefetch_input_elements_;
-    }
-
-    // Parallel interleave's implementation is designed around a few principles:
-    //  1. Thread creation is relatively expensive. (Not reusing
-    //     threads causes a number of indirect costs such as poorer tcmalloc
-    //     performance due to thread-local caches, etc.) We allocate a fixed
-    //     number of threads at the start and never change. This is why we've
-    //     fused functionality that is theoretically orthogonal (i.e.
-    //     .prefetch()) into the implementation.
-    //  2. Drop-in replacement for standard interleave. The goal will be to
-    //     auto-opt people into an optimized implementation without any work
-    //     on the customer's part. We thus go through great pains to maintain
-    //     identical iteration orders, full determinism (disabled only via a
-    //     flag, etc.)
-    //  3. Performance across a variety of environments and I/O envelopes.
-    //
-    // The actual implementation centers around a collection of worker threads
-    // and their corresponding worker state (tracked in the `workers_` vector).
-    // Worker threads repeatedly receive a vector of Tensors that are used as
-    // input to the flat-map function (`captured_func_`). The output of this
-    // function must be a dataset. The worker thread then repeatedly calls
-    // `GetNext()`, maintaining a buffer of elements to minimize the likelihood
-    // that a caller will block waiting for an element to be produced.
-    //
-    // Pointers to these worker states are kept in 2 disjoint data structures:
-    //  1. `interleave_indices_` is a vector containing indices of WorkerStates
-    //     in `workers_` that we are interleaving. Worker threads backing these
-    //     WorkerStates should be regularly producing values.
-    //  2. `staging_indices_` is a deque containing indices of WorkerStates in
-    //     `workers_` that we will move to `interleave_indices_` when an
-    //     iterator in `interleave_indices_` is exhausted.
-    //
-    // The client calls `GetNext[Internal]()` to retrieve an output element. The
-    // internal implementation updates the state of `interleave_indices_` and
-    // `staging_indices_` as output iterators (run by the worker threads) are
-    // exhausted.
-    //
-    // `input_impl_` is the input iterator that generates arguments for the
-    // flat-map function (`captured_func_`). It is set to an iterator at
-    // Iterator construction, and is fixed until we consume all input elements.
-    // Once it is exhausted, we reset the unique_ptr to eagerly deallocate
-    // memory.
-    //
-    // A few invariants are maintained:
-    //  1. No element in interleave_indices_ should be a -1 unless
-    //     `staging_indices_` is empty and `input_impl_` is empty.
-    //  2. Every `worker_` element is pointed to by at most one element of the
-    //     union of `interleave_indices_` and `staging_indices_`.
-    //  3. Unless `input_impl_` is empty, every `worker_` must be pointed to by
-    //     an element in `interleave_indices_` or `staging_indices_`.
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            workers_(dataset()->num_threads()),
-            worker_thread_states_(dataset()->num_threads()) {}
-
-      ~Iterator() override {
-        mutex_lock l(mu_);
-        cancelled_ = true;
-        // Notify all workers in case they are blocked.
-        for (auto& worker : workers_) {
-          worker.cond_var.notify_all();
-        }
-      }
-
-      Status Initialize(IteratorContext* ctx) override {
-        AddConstantParameter(ctx, "parallelism", dataset()->cycle_length_);
-        TF_RETURN_IF_ERROR(
-            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
-      }
-
-      // It is implemented so that it matches the deterministic interleave
-      // unless getting the next element would block and we are allowed to be
-      // sloppy.
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(EnsureWorkerThreadsStarted(ctx));
-        while (!cancelled_) {
-          // Wait for an item to become available, blocking if necessary. If we
-          // are allowed to be sloppy, we can skip over input datasets that do
-          // not have an item readily available.
-          bool can_produce_elements = false;
-          bool must_wait_for_input = true;
-          for (int64 i = 0; i < interleave_indices_.size(); ++i) {
-            int64 index = (next_index_ + i) % interleave_indices_.size();
-            int64 current_worker_index = interleave_indices_[index];
-            if (current_worker_index < 0) {
-              continue;  // Empty interleave elements.
-            }
-            WorkerState* current_worker = &workers_[current_worker_index];
-            can_produce_elements |= current_worker->MayHaveElements();
-            if (!current_worker->outputs.empty()) {
-              // We have an element!
-              next_index_ = index;
-              const bool element_acquired_sloppily =
-                  dataset()->sloppy_ && i > 1;
-              if (!element_acquired_sloppily) {
-                // If the element was acquired in the regular (non-sloppy)
-                // order, then advance the current block and cycle pointers to
-                // the next element in the regular order.
-                block_count_++;
-                if (block_count_ == dataset()->block_length_) {
-                  next_index_ = (index + 1) % interleave_indices_.size();
-                  block_count_ = 0;
-                }
-              } else {
-                block_count_ = 0;
-              }
-              *end_of_sequence = false;
-              Status s = current_worker->outputs.front().status;
-              current_worker->outputs.front().output.swap(*out_tensors);
-              current_worker->outputs.pop_front();
-              current_worker->cond_var.notify_one();
-              return s;
-            } else if (current_worker->is_producing && !dataset()->sloppy_) {
-              // current_worker.outputs.empty(), and we must wait for this
-              // iterator.
-              if (next_index_ != index) {
-                // We have advanced to a new iterator; reset block counts.
-                next_index_ = index;
-                block_count_ = 0;
-              }
-              break;
-            } else if (!current_worker->is_producing) {
-              // This iterator has reached end of input.
-              interleave_indices_[index] = -1;
-              if (input_impl_) {
-                // Start prefetching a new iterator.
-                std::vector<Tensor> args;
-                bool end_of_input = false;
-                Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
-                if (end_of_input) {
-                  input_impl_.reset();
-                } else {
-                  current_worker->SetInputs(s, std::move(args));
-                  staging_indices_.emplace_back(current_worker_index);
-                }
-              }
-
-              if (!staging_indices_.empty()) {
-                // Move a worker from `staging_indices_` to
-                // `interleave_indices_`.
-                interleave_indices_[index] = staging_indices_.front();
-                staging_indices_.pop_front();
-
-                next_index_ = (index + 1) % interleave_indices_.size();
-                block_count_ = 0;
-                // Restart the inner [for] loop
-                can_produce_elements = true;
-                must_wait_for_input = false;
-                break;
-              }
-            }
-          }
-
-          if (!can_produce_elements && !input_impl_) {
-            // No potential for future values.
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-
-          if (must_wait_for_input) {
-            // Wait for elements to become available.
-            RecordStop(ctx);
-            if (dataset()->sloppy_) {
-              sloppy_cond_var_.wait(l);
-            } else {
-              workers_[interleave_indices_[next_index_]].cond_var.wait(l);
-            }
-            RecordStart(ctx);
-          }
-        }
-        return errors::Cancelled(
-            "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
-      }
-
-     protected:
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        // The order of locking is important here to avoid deadlock.
-        mutex_lock l(mu_);
-        mutex_lock ckpt_l(ckpt_mu_);
-        if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        } else {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("input_exhausted"), ""));
-        }
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("next_index"), next_index_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("block_count"), block_count_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("workers_size"), workers_.size()));
-        for (int i = 0; i < workers_.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteWorkerStateLocked(writer, i));
-        }
-        for (int i = 0; i < worker_thread_states_.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteWorkerThreadStateLocked(writer, i));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("interleave_size"),
-                                               interleave_indices_.size()));
-        for (int i = 0; i < interleave_indices_.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("interleave_indices_", i)),
-              interleave_indices_[i]));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("staging_size"),
-                                               staging_indices_.size()));
-        for (int i = 0; i < staging_indices_.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("staging_indices_", i)),
-              staging_indices_[i]));
-        }
-        if (!worker_threads_.empty()) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("worker_threads_running"), ""));
-        }
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        // The order of locking is important here to avoid deadlock.
-        mutex_lock l(mu_);
-        mutex_lock ckpt_l(ckpt_mu_);
-        if (!reader->Contains(full_name("input_exhausted"))) {
-          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        } else {
-          input_impl_.reset();
-        }
-        int64 temp;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("next_index"), &temp));
-        next_index_ = size_t(temp);
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("block_count"), &temp));
-        block_count_ = size_t(temp);
-
-        // Restore WorkerStates.
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("workers_size"), &temp));
-        if (temp != dataset()->num_threads()) {
-          return errors::Internal("Expected ", dataset()->num_threads(),
-                                  " worker states but found ", temp, ".");
-        }
-        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
-          TF_RETURN_IF_ERROR(ReadWorkerStateLocked(reader, i, ctx));
-        }
-        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
-          TF_RETURN_IF_ERROR(ReadWorkerThreadStateLocked(reader, i, ctx));
-        }
-
-        // Restore `interleave_indices_`.
-        std::set<int64> all_indices;
-        {
-          int64 interleave_size;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("interleave_size"),
-                                                &interleave_size));
-          interleave_indices_.reserve(interleave_size);
-          for (int64 i = 0; i < interleave_size; ++i) {
-            int64 temp;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("interleave_indices_", i)), &temp));
-            if (temp >= 0 && all_indices.find(temp) != all_indices.end()) {
-              return errors::Internal(
-                  "Duplicate entry for ", temp,
-                  " found when reading interleave and staging indices.");
-            }
-            if (temp >= 0) {
-              all_indices.insert(temp);
-            }
-            interleave_indices_.emplace_back(temp);
-          }
-        }
-
-        // Restore `staging_indices_`.
-        {
-          int64 staging_size;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("staging_size"), &staging_size));
-          for (int i = 0; i < staging_size; ++i) {
-            int64 temp;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("staging_indices_", i)), &temp));
-            if (all_indices.find(temp) != all_indices.end()) {
-              return errors::Internal(
-                  "Duplicate entry for ", temp,
-                  " found when reading interleave and staging indices.");
-            }
-            if (temp >= 0) {
-              all_indices.insert(temp);
-            }
-            staging_indices_.emplace_back(temp);
-          }
-        }
-
-        // Start Worker threads.
-        if (reader->Contains(full_name("worker_threads_running"))) {
-          worker_threads_.reserve(dataset()->num_threads());
-          for (size_t i = 0; i < dataset()->num_threads(); ++i) {
-            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, "worker_thread",
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
-          }
-        }
-        return Status::OK();
-      }
-
-     private:
-      // OutputElem contains the information from a call to GetNext by an output
-      // iterator.
-      struct OutputElem {
-        // The output iterator sets `status` if getting the output element
-        // fails.
-        Status status;
-        // The buffered data element.
-        std::vector<Tensor> output;
-
-        explicit OutputElem(const Status& s) : status(s) {}
-      };
-
-      // Worker threads operate on their relevant WorkerState structs.
-      //
-      // WorkerState's fields are all protected by mu_;
-      struct WorkerState {
-        // The arguments to be used to construct an output iterator.
-        std::vector<Tensor> input;
-        // The buffered output elements.
-        std::deque<OutputElem> outputs;
-        // Set to true iff the worker thread expects to append more elements to
-        // outputs. is_producing can be false despite !outputs.empty().
-        // Concretely, all output elements will have been consumed only when:
-        // is_producing == false && outputs.empty();
-        bool is_producing = false;
-        // Condition variable used to coordinate between threads. The worker
-        // thread waits on this condition variable when it is either (1) waiting
-        // for the main thread to add arguments to `input`, or (2) waiting for
-        // the main thread to consume an element of `outputs`. The main thread
-        // waits on cond_var if it is waiting for the worker thread to produce
-        // an element into `outputs` (this implies sloppy_==false).
-        condition_variable cond_var;
-
-        inline bool MayHaveElements() const {
-          return is_producing || !outputs.empty();
-        }
-
-        // Sets inputs for a worker thread and notifies it to start processing.
-        void SetInputs(const Status& s, std::vector<Tensor> input_arguments) {
-          if (s.ok()) {
-            DCHECK(!MayHaveElements())
-                << "Tried to start inputs, despite already producing!";
-            input = std::move(input_arguments);
-            is_producing = true;
-            cond_var.notify_one();
-          } else {
-            outputs.emplace_back(s);
-          }
-        }
-      };
-
-      // The internal state of a worker thread that is not already captured
-      // in its `WorkerState`.
-      //
-      // This is needed only for checkpointing purposes. We keep this
-      // separate from `WorkerState` and guard its fields using a separate
-      // lock `ckpt_mu_` so as to not affect the performance of main pipeline.
-      struct WorkerThreadState {
-        // The output element that has been produced from the input iterator
-        // and is waiting to be added to `WorkerState.outputs`.
-        OutputElem output_elem;
-
-        // Whether the input iterator returned an `end_of_sequence`.
-        bool end_of_sequence = false;
-
-        // Status returned from `MakeIteratorFromInputElement`.
-        Status iterator_creation_status;
-
-        // The arguments to be used to construct `iterator`.
-        std::vector<Tensor> input;
-
-        std::unique_ptr<IteratorBase> iterator;
-
-        WorkerThreadState() : output_elem(Status::OK()) {}
-      };
-
-      Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (worker_threads_.empty()) {
-          worker_threads_.reserve(dataset()->num_threads());
-          for (int64 i = 0; i < dataset()->num_threads(); ++i) {
-            std::vector<Tensor> args;
-            bool end_of_input = false;
-            Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
-            if (end_of_input) {
-              input_impl_.reset();
-              return Status::OK();
-            }
-            workers_[i].SetInputs(s, std::move(args));
-            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, "worker_thread",
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
-            if (i < dataset()->cycle_length_) {
-              interleave_indices_.push_back(i);
-            } else {
-              staging_indices_.push_back(i);
-            }
-          }
-          DCHECK(interleave_indices_.size() == dataset()->cycle_length_);
-          DCHECK(staging_indices_.size() ==
-                 dataset()->prefetch_input_elements_);
-        }
-        return Status::OK();
-      }
-
-      // Produces elements into the worker's output buffers.
-      void WorkerThread(const std::shared_ptr<IteratorContext>& ctx,
-                        const int64 thread_index) {
-        // Notes on checkpointing thread local state, i.e., `WorkerThreadState`:
-        //
-        // 1. Any local state that may need to be checkpointed should be kept
-        //    in `worker_thread_states_[thread_index]`.
-        // 2. `WorkerThreadState` should contain state that is needed only for
-        //    checkpointing, i.e., if we were to remove checkpointing support,
-        //    we could keep that state as local variables in this thread.
-        // 3. This thread should only read/write state at `thread_index`
-        //    and should not access other thread states.
-        // 4. When restoring from checkpoint, threads are started only after
-        //    the restore is complete.
-        // 5. Once restored from a checkpoint, the local state is edited only
-        //    by this thread. 3 & 4 allow making assumptions like temporarily
-        //    caching local state in this thread and using it outside a lock
-        //    e.g. `make_new_iterator`.
-        // 6. `ckpt_mu_` should be wisely used to create *consistent*
-        //    checkpoint markers.
-
-        // std::function arguments are copy-constructable, so we pass raw
-        // pointers, and then immediately wrap them to ensure correct ownership.
-        RecordStart(ctx.get());
-        auto cleanup = gtl::MakeCleanup([this, thread_index, ctx] {
-          mutex_lock l(mu_);
-          workers_[thread_index].cond_var.notify_all();
-          RecordStop(ctx.get());
-        });
-        bool make_new_iterator;
-        {
-          tf_shared_lock l(ckpt_mu_);
-          // Decide whether a new iterator should be built.
-          // 1. If there is an existing iterator, we use it.
-          // 2. If there was an error in iterator creation that could not be
-          //    notified to the client we attempt to send that to the client
-          //    first.
-          make_new_iterator =
-              worker_thread_states_[thread_index].iterator == nullptr &&
-              worker_thread_states_[thread_index].iterator_creation_status.ok();
-        }
-        // Even though `make_new_iterator` has cached values from
-        // `worker_thread_states_[thread_index]` which is guarded by ckpt_mu_,
-        // it is safe to *read* `make_new_iterator`outside of a lock without
-        // worrying about concurrent changes to values in
-        // `worker_thread_states_[thread_index]`. See comment at the start of
-        // this function for details.
-        while (true) {
-          // Whether creation of the iterator succeeded.
-          Status iterator_creation_status;
-          // 1. Build a new iterator or use the existing one.
-          if (make_new_iterator) {
-            // 1a. Get new input tensors or use the exiting ones.
-            bool read_new_input;
-            {
-              tf_shared_lock l(ckpt_mu_);
-              // worker_thread_states_[thread_index].input will be non-empty
-              // if checkpointing happened at CHECKPOINT_MARKER_A.
-              read_new_input =
-                  worker_thread_states_[thread_index].input.empty();
-            }
-
-            if (read_new_input) {
-              mutex_lock l(mu_);
-              while (!cancelled_ && !workers_[thread_index].is_producing) {
-                RecordStop(ctx.get());
-                workers_[thread_index].cond_var.wait(l);
-                RecordStart(ctx.get());
-              }
-              if (cancelled_) return;
-              // Copy the input tensors so that we do not need to block on `mu_`
-              // when building the iterator.
-              // We keep a copy of the input tensors in
-              // `WorkerThreadState.input` till the iterator is in use. This is
-              // used in `RestoreInternal` to re-build the iterator.
-              // TODO(b/78046638): Explore ways to avoid tracking the input
-              // tensors.
-              tf_shared_lock ckpt_l(ckpt_mu_);
-              worker_thread_states_[thread_index].input.swap(
-                  workers_[thread_index].input);
-              // CHECKPOINT_MARKER_A
-              // We have the input tensors but have not built the iterator yet.
-            }
-
-            // 1b. Run the user defined function to produce a new iterator.
-            {
-              tf_shared_lock l(ckpt_mu_);
-              worker_thread_states_[thread_index].iterator_creation_status =
-                  MakeIteratorFromInputElement(
-                      ctx.get(), worker_thread_states_[thread_index].input,
-                      thread_index, dataset()->captured_func_.get(), prefix(),
-                      &worker_thread_states_[thread_index].iterator);
-              iterator_creation_status =
-                  worker_thread_states_[thread_index].iterator_creation_status;
-              if (!iterator_creation_status.ok()) {
-                worker_thread_states_[thread_index].input.clear();
-              }
-              // CHECKPOINT_MARKER_B
-              // Either an iterator has been successfully built and placed in
-              // `worker_thread_states_[thread_index].iterator` or it failed and
-              // a non-OK status has been put in
-              // `worker_thread_states_[thread_index].iterator_creation_status`.
-            }
-          } else {
-            tf_shared_lock l(ckpt_mu_);
-            iterator_creation_status =
-                worker_thread_states_[thread_index].iterator_creation_status;
-            // Mark that we have used up the restored iterator.
-            make_new_iterator = true;
-          }
-          // 2. Start producing elements or send error state to client if
-          //    iterator creation failed.
-          if (!iterator_creation_status.ok()) {
-            mutex_lock l(mu_);
-            // Wait for space in the prefetch queue.
-            while (!cancelled_ && workers_[thread_index].outputs.size() ==
-                                      dataset()->buffer_output_elements_) {
-              RecordStop(ctx.get());
-              workers_[thread_index].cond_var.wait(l);
-              RecordStart(ctx.get());
-            }
-            if (cancelled_) return;
-            tf_shared_lock ckpt_l(ckpt_mu_);
-            workers_[thread_index].outputs.emplace_back(
-                iterator_creation_status);
-            workers_[thread_index].is_producing = false;
-            worker_thread_states_[thread_index].iterator_creation_status =
-                Status::OK();
-            // CHECKPOINT_MARKER_C
-            // Non-OK iterator creation status has been notified to the
-            // client.
-            workers_[thread_index].cond_var.notify_one();
-          } else {
-            bool end_of_sequence = false;
-            while (!end_of_sequence) {
-              // 3.a Produce an element!
-              {
-                tf_shared_lock ckpt_l(ckpt_mu_);
-                if (worker_thread_states_[thread_index]
-                        .output_elem.status.ok() &&
-                    worker_thread_states_[thread_index]
-                        .output_elem.output.empty() &&
-                    !worker_thread_states_[thread_index].end_of_sequence) {
-                  worker_thread_states_[thread_index].output_elem.status =
-                      worker_thread_states_[thread_index].iterator->GetNext(
-                          ctx.get(),
-                          &worker_thread_states_[thread_index]
-                               .output_elem.output,
-                          &worker_thread_states_[thread_index].end_of_sequence);
-                  end_of_sequence =
-                      worker_thread_states_[thread_index].end_of_sequence;
-                } else {
-                  end_of_sequence =
-                      worker_thread_states_[thread_index].end_of_sequence;
-                }
-                // CHECKPOINT_MARKER_D
-                // An element has been read or an error or end_of_sequence has
-                // been received from the input iterator and is waiting to be
-                // sent to client.
-              }
-
-              // 3.b Make it available to the client.
-              {
-                mutex_lock l(mu_);
-
-                // Wait for space in the prefetch queue.
-                while (!cancelled_ && workers_[thread_index].outputs.size() ==
-                                          dataset()->buffer_output_elements_) {
-                  RecordStop(ctx.get());
-                  workers_[thread_index].cond_var.wait(l);
-                  RecordStart(ctx.get());
-                }
-                if (cancelled_) return;
-
-                tf_shared_lock ckpt_l(ckpt_mu_);
-                workers_[thread_index].is_producing = !end_of_sequence;
-
-                // Output the element.
-
-                // Move the temporary state in WorkerThreadState to WorkerState
-                // and mark it as used.
-                if (end_of_sequence) {
-                  worker_thread_states_[thread_index].iterator.reset();
-                  worker_thread_states_[thread_index].input.clear();
-                  worker_thread_states_[thread_index].end_of_sequence = false;
-                } else {
-                  workers_[thread_index].outputs.emplace_back(
-                      worker_thread_states_[thread_index].output_elem.status);
-                  workers_[thread_index].outputs.back().output.swap(
-                      worker_thread_states_[thread_index].output_elem.output);
-                }
-                worker_thread_states_[thread_index].output_elem.status =
-                    Status::OK();
-                if (dataset()->sloppy_) {
-                  sloppy_cond_var_.notify_one();
-                } else {
-                  workers_[thread_index].cond_var.notify_one();
-                }
-                // CHECKPOINT_MARKER_E
-                // Output element or iterator status has been sent to the
-                // client.
-              }
-            }
-          }
-        }
-      }
-
-      Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string prefix = strings::StrCat("worker_", index);
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_input_size")),
-            workers_[index].input.size()));
-        for (int i = 0; i < workers_[index].input.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              full_name(strings::StrCat(prefix, "_input_", i)),
-              workers_[index].input[i]));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_outputs_size")),
-            workers_[index].outputs.size()));
-        for (int i = 0; i < workers_[index].outputs.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteOutputElemLocked(
-              writer, workers_[index].outputs[i],
-              full_name(strings::StrCat(prefix, "_outputs_", i))));
-        }
-        if (workers_[index].is_producing) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_is_producing")), ""));
-        }
-        return Status::OK();
-      }
-
-      Status ReadWorkerStateLocked(IteratorStateReader* reader, int index,
-                                   IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string worker_prefix = strings::StrCat("worker_", index);
-        // Restore inputs.
-        int64 input_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(worker_prefix, "_input_size")),
-            &input_size));
-        workers_[index].input.reserve(input_size);
-        for (int i = 0; i < input_size; ++i) {
-          workers_[index].input.emplace_back();
-          TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(worker_prefix, "_input_", i)),
-              &workers_[index].input.back()));
-        }
-        int64 outputs_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(worker_prefix, "_outputs_size")),
-            &outputs_size));
-        for (int i = 0; i < outputs_size; ++i) {
-          workers_[index].outputs.emplace_back(Status::OK());
-          TF_RETURN_IF_ERROR(ReadOutputElemLocked(
-              reader, &workers_[index].outputs.back(),
-              full_name(strings::StrCat(worker_prefix, "_outputs_", i))));
-        }
-        if (reader->Contains(
-                full_name(strings::StrCat(worker_prefix, "_is_producing")))) {
-          workers_[index].is_producing = true;
-        } else {
-          workers_[index].is_producing = false;
-        }
-        return Status::OK();
-      }
-
-      Status WriteWorkerThreadStateLocked(IteratorStateWriter* writer,
-                                          int index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string prefix = strings::StrCat("worker_thread_", index);
-        if (worker_thread_states_[index].iterator != nullptr) {
-          TF_RETURN_IF_ERROR(
-              SaveInput(writer, worker_thread_states_[index].iterator));
-        } else {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_iterator_exhausted")), ""));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_input_size")),
-            worker_thread_states_[index].input.size()));
-        for (int i = 0; i < worker_thread_states_[index].input.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              full_name(strings::StrCat(prefix, "_input_", i)),
-              worker_thread_states_[index].input[i]));
-        }
-        TF_RETURN_IF_ERROR(WriteStatusLocked(
-            writer, strings::StrCat(prefix, "_iterator_creation_status"),
-            worker_thread_states_[index].iterator_creation_status));
-        TF_RETURN_IF_ERROR(WriteOutputElemLocked(
-            writer, worker_thread_states_[index].output_elem,
-            full_name(strings::StrCat(prefix, "_output"))));
-        if (worker_thread_states_[index].end_of_sequence) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_end_of_sequence")), ""));
-        }
-        return Status::OK();
-      }
-
-      Status ReadWorkerThreadStateLocked(IteratorStateReader* reader, int index,
-                                         IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string worker_prefix = strings::StrCat("worker_thread_", index);
-        // Restore inputs.
-        int64 input_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(worker_prefix, "_input_size")),
-            &input_size));
-        worker_thread_states_[index].input.reserve(input_size);
-        for (int i = 0; i < input_size; ++i) {
-          worker_thread_states_[index].input.emplace_back();
-          TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(worker_prefix, "_input_", i)),
-              &worker_thread_states_[index].input.back()));
-        }
-        // Restore iterator.
-        if (reader->Contains(full_name(
-                strings::StrCat(worker_prefix, "_iterator_exhausted")))) {
-          worker_thread_states_[index].iterator.reset();
-        } else {
-          std::unique_ptr<IteratorBase> iterator;
-          Status s = MakeIteratorFromInputElement(
-              ctx, worker_thread_states_[index].input, index,
-              dataset()->captured_func_.get(), prefix(), &iterator);
-          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, iterator));
-          worker_thread_states_[index].iterator.swap(iterator);
-        }
-        TF_RETURN_IF_ERROR(ReadStatusLocked(
-            reader, strings::StrCat(worker_prefix, "_iterator_creation_status"),
-            &worker_thread_states_[index].iterator_creation_status));
-        TF_RETURN_IF_ERROR(ReadOutputElemLocked(
-            reader, &worker_thread_states_[index].output_elem,
-            full_name(strings::StrCat(worker_prefix, "_output"))));
-        if (reader->Contains(full_name(
-                strings::StrCat(worker_prefix, "_end_of_sequence")))) {
-          worker_thread_states_[index].end_of_sequence = true;
-        } else {
-          worker_thread_states_[index].end_of_sequence = false;
-        }
-        return Status::OK();
-      }
-
-      Status WriteOutputElemLocked(IteratorStateWriter* writer,
-                                   const OutputElem& output_elem,
-                                   const string& prefix)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        TF_RETURN_IF_ERROR(WriteStatusLocked(
-            writer, strings::StrCat(prefix, "_status"), output_elem.status));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(strings::StrCat(prefix, "_output_size"),
-                                output_elem.output.size()));
-        for (int i = 0; i < output_elem.output.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              strings::StrCat(prefix, "_output_", i), output_elem.output[i]));
-        }
-        return Status::OK();
-      }
-
-      Status ReadOutputElemLocked(IteratorStateReader* reader,
-                                  OutputElem* output_elem, const string& prefix)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        TF_RETURN_IF_ERROR(ReadStatusLocked(
-            reader, strings::StrCat(prefix, "_status"), &output_elem->status));
-        int64 output_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            strings::StrCat(prefix, "_output_size"), &output_size));
-        output_elem->output.reserve(output_size);
-        for (int i = 0; i < output_size; ++i) {
-          output_elem->output.emplace_back();
-          TF_RETURN_IF_ERROR(
-              reader->ReadTensor(strings::StrCat(prefix, "_output_", i),
-                                 &output_elem->output.back()));
-        }
-        return Status::OK();
-      }
-
-      Status WriteStatusLocked(IteratorStateWriter* writer,
-                               const string& prefix, const Status& status)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
-                                static_cast<int64>(status.code())));
-        if (!status.ok()) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
-                                  status.error_message()));
-        }
-        return Status::OK();
-      }
-
-      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
-                              Status* status)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        int64 code_int;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(prefix, "_code")), &code_int));
-        error::Code code = static_cast<error::Code>(code_int);
-
-        if (code != error::Code::OK) {
-          string error_message;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat(prefix, "_msg")), &error_message));
-          *status = Status(code, error_message);
-        } else {
-          *status = Status::OK();
-        }
-        return Status::OK();
-      }
-
-      // Mutex & condition variable to guard mutable iterator internals and
-      // coordinate among worker threads and client thread[s].
-      mutex mu_ ACQUIRED_BEFORE(ckpt_mu_);
-      // The main thread waits on this condition variable if running in sloppy
-      // mode and no values are available.
-      condition_variable sloppy_cond_var_;
-      // Mutex used to wait for a consistent state while checkpointing.
-      // Only Save and Restore require an exclusive lock on this mutex. In
-      // other scenarios we just acquire a shared lock so the pipeline's
-      // performance should not be affected in the absence of checkpointing.
-      // A thread must not wait on any condition variable while holding
-      // `ckpt_mu_` in either shared or exclusive modes.
-      mutex ckpt_mu_;
-
-      // The iterator producing elements which are converted to datasets by
-      // the dataset()->captured_func_ then interleaved together.
-      // input_impl_ is reset when we have exhausted its input.
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-
-      // The WorkerState structs the worker threads operate on.
-      // workers_ elements are in at most one of interleave_ and staging_.
-      std::vector<WorkerState> workers_ GUARDED_BY(mu_);
-
-      // Stores the temporary state of WorkerThreads which is not stored in
-      // WorkerState. This is used for checkpointing purposes only.
-      std::vector<WorkerThreadState> worker_thread_states_ GUARDED_BY(ckpt_mu_);
-
-      // Indices in `workers_` of iterators to interleave.
-      std::vector<int64> interleave_indices_ GUARDED_BY(mu_);
-      // Indices in `workers_` of prefetched iterators.
-      std::deque<int64> staging_indices_ GUARDED_BY(mu_);
-
-      // The index into output_elements_ for next element to produce.
-      size_t next_index_ GUARDED_BY(mu_) = 0;
-      // The number of items produced so far within the block
-      size_t block_count_ GUARDED_BY(mu_) = 0;
-      // Flag to instruct the worker threads to exit.
-      bool cancelled_ GUARDED_BY(mu_) = false;
-      // The worker threads. This must be last to ensure the
-      // threads have exited before any other members are deallocated.
-      // TODO(b/65178177): Avoid allocating additional threads.
-      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
-    };
-
-    const DatasetBase* const input_;
-    const NameAttrList interleave_func_;
-    const std::unique_ptr<CapturedFunction> captured_func_;
-    const int64 cycle_length_;
-    const int64 block_length_;
-    const bool sloppy_;
-    const int64 buffer_output_elements_;
-    const int64 prefetch_input_elements_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
-  };
-
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList interleave_func_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
-                        ParallelInterleaveDatasetOp);
-
 // The motivation for creating an alternative implementation of parallel
 // interleave is to decouple the degree of parallelism from the cycle length.
 // This makes it possible to change the degree of parallelism (e.g. through
@@ -1084,16 +49,14 @@ REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
 // The above design choices were made with automated optimizations in mind,
 // isolating the degree of parallelism as the single tunable knob of this
 // implementation.
-//
-// TODO(b/116852688): Make coordination between the performance model and this
-// transformation more robust.
-class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
+class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
  public:
-  explicit ParallelInterleaveDatasetV2Op(OpKernelConstruction* ctx)
+  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("sloppy", &sloppy_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -1126,9 +89,10 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
         ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
                                       &captured_func));
 
-    *output = new Dataset(ctx, input, interleave_func_,
-                          std::move(captured_func), cycle_length, block_length,
-                          num_parallel_calls, output_types_, output_shapes_);
+    *output =
+        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
+                    cycle_length, block_length, num_parallel_calls, sloppy_,
+                    output_types_, output_shapes_);
   }
 
  private:
@@ -1137,7 +101,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
-            int64 block_length, int64 num_parallel_calls,
+            int64 block_length, int64 num_parallel_calls, bool sloppy,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
         : DatasetBase(DatasetContext(ctx)),
@@ -1147,6 +111,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
           cycle_length_(cycle_length),
           block_length_(block_length),
           num_parallel_calls_(num_parallel_calls),
+          sloppy_(sloppy),
           output_types_(output_types),
           output_shapes_(output_shapes) {
       input_->Ref();
@@ -1156,8 +121,10 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::ParallelInterleaveV2")}));
+      return MakeUnique<ParallelInterleaveIterator>(
+          ParallelInterleaveIterator::Params{
+              this, strings::StrCat(prefix, "::ParallelInterleaveV2")},
+          sloppy_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -1200,36 +167,47 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       b->BuildAttrValue(interleave_func_, &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+      AttrValue sloppy_attr;
+      b->BuildAttrValue(sloppy_, &sloppy_attr);
 
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this,
-          {{0, input_node},
-           {2, cycle_length_node},
-           {3, block_length_node},
-           {4, num_parallel_calls_node}},
-          {{1, other_arguments}},
-          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this,
+                        {{0, input_node},
+                         {2, cycle_length_node},
+                         {3, block_length_node},
+                         {4, num_parallel_calls_node}},
+                        {{1, other_arguments}},
+                        {{"f", f},
+                         {"Targuments", other_arguments_types_attr},
+                         {"sloppy", sloppy_attr}},
+                        output));
       return Status::OK();
     }
 
    private:
-    class Iterator : public DatasetIterator<Dataset> {
+    class ParallelInterleaveIterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
+      explicit ParallelInterleaveIterator(const Params& params, bool sloppy)
           : DatasetIterator<Dataset>(params),
             mu_(std::make_shared<mutex>()),
             cond_var_(std::make_shared<condition_variable>()),
             num_parallel_calls_(std::make_shared<model::SharedState>(
                 params.dataset->num_parallel_calls_, mu_, cond_var_)),
+            sloppy_(sloppy),
             args_list_(params.dataset->cycle_length_),
             current_elements_(params.dataset->cycle_length_),
             element_in_use_(params.dataset->cycle_length_, false),
             thread_pool_(new thread::ThreadPool(
-                Env::Default(), ThreadOptions(), "parallel_interleave",
+                Env::Default(), ThreadOptions(),
+                "data_parallel_interleave_worker_pool",
                 dataset()->cycle_length_ /* num_threads */,
-                false /* low_latency_hint */)) {}
+                false /* low_latency_hint */)) {
+        std::vector<string> components =
+            str_util::Split(params.prefix, "::", str_util::SkipEmpty());
+        prefix_end_ = components.back();
+      }
 
-      ~Iterator() override {
+      ~ParallelInterleaveIterator() override {
         mutex_lock l(*mu_);
         // Cancel the runner thread.
         cancelled_ = true;
@@ -1243,16 +221,13 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(*mu_);
         if (num_parallel_calls_->value == kAutoTune) {
-          num_parallel_calls_->value = 1;
-          AddTunableParameter(ctx, "parallelism", num_parallel_calls_, 1,
-                              dataset()->cycle_length_);
-        } else {
-          AddConstantParameter(ctx, "parallelism", num_parallel_calls_->value);
+          num_parallel_calls_->value = dataset()->cycle_length_;
+          num_parallel_calls_->tunable = true;
         }
-        AddConstantParameter(ctx, "cycle_length", dataset()->cycle_length_);
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -1260,23 +235,19 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
                              bool* end_of_sequence) override {
         std::shared_ptr<InvocationResult> result;
         do {
+          result.reset();
           {
             mutex_lock l(*mu_);
             EnsureRunnerThreadStarted(ctx);
-            while (invocation_results_.empty() &&
-                   (!end_of_input_ || num_open_ > 0)) {
+            while (ShouldWait(&result)) {
               RecordStop(ctx);
               cond_var_->wait(l);
               RecordStart(ctx);
             }
-            if (!invocation_results_.empty()) {
-              std::swap(result, invocation_results_.front());
-              invocation_results_.pop_front();
-            } else {
+            if (!result) {
               *end_of_sequence = true;
               return Status::OK();
             }
-            cond_var_->notify_all();
           }
           RecordStop(ctx);
           result->notification.WaitForNotification();
@@ -1285,12 +256,21 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
         if (result->status.ok()) {
           *out_tensors = std::move(result->return_values);
+          RecordBufferDequeue(ctx, *out_tensors);
         }
         *end_of_sequence = false;
         return result->status;
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeAsyncInterleaveManyNode(
+            std::move(args),
+            {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
+                                  /*max=*/dataset()->cycle_length_)});
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(*mu_);
         // Wait for all in-flight calls to complete.
@@ -1390,7 +370,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
         if (!runner_thread_) {
           std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
           runner_thread_.reset(ctx->env()->StartThread(
-              {}, "runner_thread",
+              {}, "tf_data_parallel_interleave_runner",
               [this, new_ctx]() { RunnerThread(new_ctx); }));
         }
       }
@@ -1415,7 +395,12 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
           if (end_of_input) {
             result->skip = true;
           }
-          result->notification.Notify();
+          RecordBufferEnqueue(ctx.get(), result->return_values);
+          {
+            mutex_lock l(*mu_);
+            result->notification.Notify();
+            cond_var_->notify_all();
+          }
           if (!result->status.ok()) {
             break;
           }
@@ -1429,6 +414,12 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
         mutex_lock l(*mu_);
         element_in_use_[cycle_index] = false;
         num_calls_--;
+        const auto& stats_aggregator = ctx->stats_aggregator();
+        if (stats_aggregator) {
+          stats_aggregator->AddScalar(
+              strings::StrCat(prefix_end_, "::active_parallel_calls"),
+              static_cast<float>(num_calls_));
+        }
         if (end_of_input) {
           args_list_[cycle_index].clear();
           num_open_--;
@@ -1481,7 +472,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
               if (!end_of_input_) {
                 Status status = MakeIteratorFromInputElement(
                     ctx.get(), args_list_[cycle_index_], cycle_index_,
-                    dataset()->captured_func_.get(), prefix(),
+                    *instantiated_captured_func_, prefix(),
                     &current_elements_[cycle_index_]);
                 if (!status.ok()) {
                   invocation_results_.emplace_back(new InvocationResult());
@@ -1505,16 +496,55 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
               }
               num_calls_++;
               element_in_use_[cycle_index_] = true;
-              thread_pool_->Schedule(std::bind(&Iterator::FetchOutputs, this,
-                                               ctx, cycle_index_,
-                                               std::move(results)));
+              thread_pool_->Schedule(
+                  std::bind(&ParallelInterleaveIterator::FetchOutputs, this,
+                            ctx, cycle_index_, std::move(results)));
             }
             cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
           }
+          const auto& stats_aggregator = ctx->stats_aggregator();
+          if (stats_aggregator) {
+            // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
+            // monitoring code or as histogram at fixed time intervals.
+            stats_aggregator->AddScalar(
+                strings::StrCat(prefix_end_, "::active_parallel_calls"),
+                static_cast<float>(num_calls_));
+            stats_aggregator->AddScalar(
+                strings::StrCat(prefix_end_, "::num_parallel_calls"),
+                static_cast<float>(num_parallel_calls_->value));
+          }
           cond_var_->notify_all();
         }
       }
 
+      // Determines whether the caller needs to wait for a result. Upon
+      // returning false, `result` will either be NULL if end of input has been
+      // reached or point to the result.
+      bool ShouldWait(std::shared_ptr<InvocationResult>* result)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (sloppy_) {
+          for (auto it = invocation_results_.begin();
+               it != invocation_results_.end(); ++it) {
+            if ((*it)->notification.HasBeenNotified()) {
+              std::swap(*result, *it);
+              invocation_results_.erase(it);
+              cond_var_->notify_all();
+              return false;
+            }
+          }
+          return !invocation_results_.empty() ||
+                 (!end_of_input_ || num_open_ > 0);
+        } else {
+          if (!invocation_results_.empty()) {
+            std::swap(*result, invocation_results_.front());
+            invocation_results_.pop_front();
+            cond_var_->notify_all();
+            return false;
+          }
+          return (!end_of_input_ || num_open_ > 0);
+        }
+      }
+
       Status WriteStatusLocked(IteratorStateWriter* writer, size_t index,
                                const Status& status)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
@@ -1589,7 +619,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
                   &args_list_[idx][i]));
             }
             TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
-                ctx, args_list_[idx], idx, dataset()->captured_func_.get(),
+                ctx, args_list_[idx], idx, *instantiated_captured_func_.get(),
                 prefix(), &current_elements_[idx]));
             TF_RETURN_IF_ERROR(
                 RestoreInput(ctx, reader, current_elements_[idx]));
@@ -1615,6 +645,9 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       // Identifies the maximum number of parallel calls.
       const std::shared_ptr<model::SharedState> num_parallel_calls_;
 
+      // Determines whether outputs can be produced in non-deterministic order.
+      const bool sloppy_;
+
       // Iterator for input elements.
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(*mu_);
 
@@ -1649,6 +682,8 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
       // Identifies whether background activity should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
+      string prefix_end_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
@@ -1657,17 +692,19 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
     const int64 cycle_length_;
     const int64 block_length_;
     const int64 num_parallel_calls_;
+    const bool sloppy_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
   };
 
+  bool sloppy_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList interleave_func_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDatasetV2").Device(DEVICE_CPU),
-                        ParallelInterleaveDatasetV2Op);
+                        ParallelInterleaveDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 6abe6c8338a60af38b448ba972c94153e28020e2..5ac81c187c4f3338785d49b47c232be1f8d1e185 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -15,10 +15,11 @@ limitations under the License.
 #include <deque>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
-#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/data/parallel_map_iterator.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -27,7 +28,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
@@ -39,6 +40,9 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
                                      &use_inter_op_parallelism_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("sloppy", &sloppy_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
  protected:
@@ -56,9 +60,13 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
                                                  use_inter_op_parallelism_,
                                                  &captured_func));
 
-    *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
-                          output_shapes_, use_inter_op_parallelism_,
-                          std::move(captured_func));
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+
+    *output =
+        new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
+                    output_shapes_, use_inter_op_parallelism_, sloppy_,
+                    std::move(captured_func), indices, preserve_cardinality_);
   }
 
  private:
@@ -68,8 +76,9 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
             const NameAttrList& func, int32 num_parallel_calls,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
-            bool use_inter_op_parallelism,
-            std::unique_ptr<CapturedFunction> captured_func)
+            bool use_inter_op_parallelism, bool sloppy,
+            std::unique_ptr<CapturedFunction> captured_func,
+            const std::vector<int> indices, bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -77,7 +86,12 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           output_types_(output_types),
           output_shapes_(output_shapes),
           use_inter_op_parallelism_(use_inter_op_parallelism),
-          captured_func_(std::move(captured_func)) {
+          sloppy_(sloppy),
+          preserve_cardinality_(preserve_cardinality),
+          captured_func_(std::move(captured_func)),
+          indices_(indices),
+          can_move_(indices.empty() ? std::vector<bool>()
+                                    : ComputeMoveVector(indices)) {
       input_->Ref();
     }
 
@@ -85,30 +99,16 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      auto init_func = [this](IteratorContext* ctx) {
-        return captured_func_->Instantiate(ctx);
-      };
-
-      const string& new_prefix = strings::StrCat(prefix, "::ParallelMap");
-      ParallelMapIteratorFunction map_func =
-          [this, new_prefix](IteratorContext* ctx,
-                             std::vector<Tensor> input_element,
-                             std::vector<Tensor>* result, StatusCallback done) {
-            captured_func_->RunAsync(ctx, std::move(input_element), result,
-                                     std::move(done), new_prefix);
-          };
-      if (!use_inter_op_parallelism_) {
-        map_func = [map_func](
-                       IteratorContext* ctx, std::vector<Tensor> input_element,
-                       std::vector<Tensor>* result, StatusCallback done) {
-          (*ctx->runner())(std::bind(map_func, ctx, std::move(input_element),
-                                     result, std::move(done)));
-        };
+      std::unique_ptr<ParallelMapFunctor> parallel_map_functor(nullptr);
+      if (indices_.empty()) {
+        parallel_map_functor.reset(new ParallelMapDatasetFunctor(this));
+      } else {
+        parallel_map_functor.reset(new ShortCircuitFunctor(this));
       }
-
-      return NewParallelMapIterator({this, new_prefix}, input_,
-                                    std::move(init_func), std::move(map_func),
-                                    num_parallel_calls_);
+      return NewParallelMapIterator(
+          {this, strings::StrCat(prefix, "::ParallelMap")}, input_,
+          std::move(parallel_map_functor), num_parallel_calls_, sloppy_,
+          preserve_cardinality_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -123,6 +123,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
       return "ParallelMapDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -150,37 +152,126 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
 
       // Attr: f
       TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
-      AttrValue f;
-      b->BuildAttrValue(func_, &f);
+      AttrValue f_attr;
+      b->BuildAttrValue(func_, &f_attr);
 
       // Attr: Targuments
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
 
+      // Attr: use_inter_op_parallelism
+      AttrValue use_inter_op_parallelism_attr;
+      b->BuildAttrValue(use_inter_op_parallelism_,
+                        &use_inter_op_parallelism_attr);
+
+      // Attr: sloppy
+      AttrValue sloppy_attr;
+      b->BuildAttrValue(sloppy_, &sloppy_attr);
+
+      // Attr: preserve_cardinality
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
+
       TF_RETURN_IF_ERROR(b->AddDataset(
           this,
           {std::make_pair(0, input_graph_node),
            std::make_pair(2, num_parallel_calls)},  // Single tensor inputs.
           {std::make_pair(1, other_arguments)},     // Tensor list inputs.
-          {std::make_pair("f", f),
-           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          {std::make_pair("f", f_attr),
+           std::make_pair("Targuments", other_arguments_types_attr),
+           std::make_pair("use_inter_op_parallelism",
+                          use_inter_op_parallelism_attr),
+           std::make_pair("sloppy", sloppy_attr),
+           std::make_pair("preserve_cardinality",
+                          preserve_cardinality_attr)},  // Attrs
           output));
       return Status::OK();
     }
 
    private:
+    class ShortCircuitFunctor : public ParallelMapFunctor {
+     public:
+      explicit ShortCircuitFunctor(const Dataset* dataset)
+          : dataset_(dataset) {}
+
+      void MapFunc(IteratorContext* ctx, const string& prefix,
+                   std::vector<Tensor> input_element,
+                   std::vector<Tensor>* result, StatusCallback done) override {
+        const std::vector<Tensor>& captured_inputs =
+            dataset_->captured_func_->captured_inputs();
+        size_t num_args = input_element.size();
+        for (size_t i = 0; i < dataset_->indices_.size(); ++i) {
+          if (dataset_->indices_[i] < num_args) {
+            if (dataset_->can_move_[i]) {
+              result->push_back(
+                  std::move(input_element[dataset_->indices_[i]]));
+            } else {
+              result->push_back(input_element[dataset_->indices_[i]]);
+            }
+          } else {
+            result->push_back(
+                captured_inputs[dataset_->indices_[i] - num_args]);
+          }
+        }
+        done(Status::OK());
+      }
+
+      const Dataset* const dataset_;
+    };
+
+    class ParallelMapDatasetFunctor : public ParallelMapFunctor {
+     public:
+      explicit ParallelMapDatasetFunctor(const Dataset* dataset)
+          : dataset_(dataset) {}
+
+      Status InitFunc(IteratorContext* ctx) override {
+        return dataset_->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
+      }
+
+      void MapFunc(IteratorContext* ctx, const string& prefix,
+                   std::vector<Tensor> input_element,
+                   std::vector<Tensor>* result, StatusCallback done) override {
+        auto map_func = [this](IteratorContext* ctx, const string& prefix,
+                               std::vector<Tensor> input_element,
+                               std::vector<Tensor>* result,
+                               StatusCallback done) {
+          instantiated_captured_func_->RunAsync(
+              ctx, std::move(input_element), result, std::move(done), prefix);
+        };
+        if (!dataset_->use_inter_op_parallelism_) {
+          (*ctx->runner())(std::bind(map_func, ctx, prefix,
+                                     std::move(input_element), result,
+                                     std::move(done)));
+        } else {
+          map_func(ctx, prefix, std::move(input_element), result,
+                   std::move(done));
+        }
+      }
+
+     private:
+      const Dataset* const dataset_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
+    };
+
     const DatasetBase* const input_;
     const NameAttrList func_;
     const int32 num_parallel_calls_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
     const bool use_inter_op_parallelism_;
+    const bool sloppy_;
+    const bool preserve_cardinality_;
     const std::unique_ptr<CapturedFunction> captured_func_;
+    const std::vector<int> indices_;
+    const std::vector<bool> can_move_;
   };
 
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   bool use_inter_op_parallelism_;
+  bool sloppy_;
+  bool preserve_cardinality_;
   NameAttrList func_;
 };
 
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index 13bd4b60367d2f36e284fbceb597cb0e03f0b1f9..b97f69250056fbf80c1cf866192a320861b70770 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -17,33 +17,51 @@ limitations under the License.
 #include <atomic>
 #include <deque>
 #include <functional>
+#include <memory>
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// TODO(b/116852688): Make coordination between the performance model and this
-// transformation more robust.
 class ParallelMapIterator : public DatasetBaseIterator {
  public:
-  explicit ParallelMapIterator(
-      const typename DatasetBaseIterator::BaseParams& params,
-      const DatasetBase* input_dataset,
-      std::function<Status(IteratorContext*)> init_func,
-      ParallelMapIteratorFunction map_func, int32 num_parallel_calls)
-      : DatasetBaseIterator(params),
+  struct Params {
+    Params(std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
+           int32 num_parallel_calls, bool sloppy, bool preserve_cardinality)
+        : parallel_map_functor(std::move(parallel_map_functor)),
+          num_parallel_calls(num_parallel_calls),
+          sloppy(sloppy),
+          preserve_cardinality(preserve_cardinality) {}
+
+    std::unique_ptr<ParallelMapFunctor> parallel_map_functor;
+    int32 num_parallel_calls;
+    bool sloppy;
+    bool preserve_cardinality;
+  };
+
+  ParallelMapIterator(
+      const typename DatasetBaseIterator::BaseParams& base_params,
+      const DatasetBase* input_dataset, Params params)
+      : DatasetBaseIterator(base_params),
         input_dataset_(input_dataset),
-        init_func_(std::move(init_func)),
-        map_func_(std::move(map_func)),
+        parallel_map_functor_(std::move(params.parallel_map_functor)),
         mu_(std::make_shared<mutex>()),
         cond_var_(std::make_shared<condition_variable>()),
         num_parallel_calls_(std::make_shared<model::SharedState>(
-            num_parallel_calls, mu_, cond_var_)) {}
+            params.num_parallel_calls, mu_, cond_var_)),
+        sloppy_(params.sloppy),
+        preserve_cardinality_(params.preserve_cardinality) {
+    std::vector<string> components =
+        str_util::Split(base_params.prefix, "::", str_util::SkipEmpty());
+    prefix_end_ = components.back();
+  }
 
   ~ParallelMapIterator() override {
     mutex_lock l(*mu_);
@@ -59,20 +77,12 @@ class ParallelMapIterator : public DatasetBaseIterator {
   Status Initialize(IteratorContext* ctx) override {
     mutex_lock l(*mu_);
     if (num_parallel_calls_->value == kAutoTune) {
-      num_parallel_calls_->value = 1;
-      // TODO(jsimsa): Surface the number of threads used by `ctx->runner()` and
-      // use it here for the maximum.
-      AddTunableParameter(ctx, "parallelism", num_parallel_calls_, 1,
-                          port::NumSchedulableCPUs());
-    } else {
-      AddConstantParameter(ctx, "parallelism", num_parallel_calls_->value);
+      num_parallel_calls_->value = ctx->runner_threadpool_size();
+      num_parallel_calls_->tunable = true;
     }
     TF_RETURN_IF_ERROR(
         input_dataset_->MakeIterator(ctx, prefix(), &input_impl_));
-    if (init_func_) {
-      TF_RETURN_IF_ERROR(init_func_(ctx));
-    }
-    return Status::OK();
+    return parallel_map_functor_->InitFunc(ctx);
   }
 
   Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
@@ -81,22 +91,28 @@ class ParallelMapIterator : public DatasetBaseIterator {
     {
       mutex_lock l(*mu_);
       EnsureRunnerThreadStarted(ctx);
-      while (invocation_results_.empty()) {
+      while (ShouldWait(&result)) {
         RecordStop(ctx);
         cond_var_->wait(l);
         RecordStart(ctx);
       }
-      std::swap(result, invocation_results_.front());
-      invocation_results_.pop_front();
-      cond_var_->notify_all();
     }
     RecordStop(ctx);
     result->notification.WaitForNotification();
     RecordStart(ctx);
-    return ProcessResult(result, out_tensors, end_of_sequence);
+    return ProcessResult(ctx, result, out_tensors, end_of_sequence);
   }
 
  protected:
+  std::shared_ptr<model::Node> CreateNode(
+      IteratorContext* ctx, model::Node::Args args) const override {
+    return model::MakeAsyncKnownRatioNode(
+        std::move(args),
+        /*ratio=*/1,
+        {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
+                              /*max=*/ctx->runner_threadpool_size())});
+  }
+
   Status SaveInternal(IteratorStateWriter* writer) override {
     mutex_lock l(*mu_);
     // Wait for all in-flight calls to complete.
@@ -119,10 +135,10 @@ class ParallelMapIterator : public DatasetBaseIterator {
             result.return_values[j]));
       }
       if (result.end_of_input) {
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(
-                strings::StrCat("invocation_results[", i, "].end_of_input")),
-            ""));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat("invocation_results[",
+                                                          i, "].end_of_input")),
+                                ""));
       }
     }
     return Status::OK();
@@ -133,8 +149,8 @@ class ParallelMapIterator : public DatasetBaseIterator {
     mutex_lock l(*mu_);
     TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
     int64 invocation_results_size;
-    TF_RETURN_IF_ERROR(reader->ReadScalar(
-        full_name("invocation_results.size"), &invocation_results_size));
+    TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("invocation_results.size"),
+                                          &invocation_results_size));
     for (size_t i = 0; i < invocation_results_size; i++) {
       invocation_results_.push_back(std::make_shared<InvocationResult>());
       auto& result = *invocation_results_.back();
@@ -142,15 +158,13 @@ class ParallelMapIterator : public DatasetBaseIterator {
       size_t num_return_values;
       {
         int64 size;
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name(strings::StrCat(
-                                   "invocation_results[", i, "].size")),
-                               &size));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat("invocation_results[", i, "].size")),
+            &size));
         num_return_values = static_cast<size_t>(size);
         if (num_return_values != size) {
           return errors::InvalidArgument(strings::StrCat(
-              full_name(
-                  strings::StrCat("invocation_results[", i, "].size")),
+              full_name(strings::StrCat("invocation_results[", i, "].size")),
               ": ", size, " is not a valid value of type size_t."));
         }
       }
@@ -179,21 +193,27 @@ class ParallelMapIterator : public DatasetBaseIterator {
   void EnsureRunnerThreadStarted(IteratorContext* ctx)
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
     if (!runner_thread_) {
-      std::shared_ptr<IteratorContext> ctx_copy(new IteratorContext(*ctx));
+      auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
       runner_thread_.reset(ctx->env()->StartThread(
-          {}, "runner_thread",
+          {}, "tf_data_parallel_map",
           std::bind(&ParallelMapIterator::RunnerThread, this, ctx_copy)));
     }
   }
 
-  void CallCompleted(const std::shared_ptr<InvocationResult>& result)
+  void CallCompleted(const std::shared_ptr<IteratorContext>& ctx,
+                     const std::shared_ptr<InvocationResult>& result)
       LOCKS_EXCLUDED(*mu_) {
-    {
-      mutex_lock l(*mu_);
-      num_calls_--;
-      cond_var_->notify_all();
+    mutex_lock l(*mu_);
+    num_calls_--;
+    const auto& stats_aggregator = ctx->stats_aggregator();
+    if (stats_aggregator) {
+      stats_aggregator->AddScalar(
+          strings::StrCat(prefix_end_, "::active_parallel_calls"),
+          static_cast<float>(num_calls_));
     }
+    RecordBufferEnqueue(ctx.get(), result->return_values);
     result->notification.Notify();
+    cond_var_->notify_all();
   }
 
   void CallFunction(const std::shared_ptr<IteratorContext>& ctx,
@@ -204,44 +224,60 @@ class ParallelMapIterator : public DatasetBaseIterator {
     result->status =
         input_impl_->GetNext(ctx.get(), &input_element, &result->end_of_input);
     if (result->end_of_input || !result->status.ok()) {
-      CallCompleted(result);
+      CallCompleted(ctx, result);
       return;
     }
 
-    // Call `func_(input_element)`, store the result in `result->return_values`,
-    // and notify `result->notification` to unblock a consumer.
-    auto done = [this, result](Status status) {
+    auto done = [this, ctx, result](Status status) {
       result->status.Update(status);
-      CallCompleted(result);
+      CallCompleted(ctx, result);
     };
 
-    map_func_(ctx.get(), std::move(input_element), &result->return_values,
-              std::move(done));
+    // Apply the map function on `input_element`, storing the result in
+    // `result->return_values`, and invoking `done` when finished.
+    parallel_map_functor_->MapFunc(ctx.get(), prefix(),
+                                   std::move(input_element),
+                                   &result->return_values, std::move(done));
   }
 
-  Status ProcessResult(const std::shared_ptr<InvocationResult>& result,
-                       std::vector<Tensor>* out_tensors,
-                       bool* end_of_sequence) {
+  Status ProcessResult(IteratorContext* ctx,
+                       const std::shared_ptr<InvocationResult>& result,
+                       std::vector<Tensor>* out_tensors, bool* end_of_sequence)
+      LOCKS_EXCLUDED(*mu_) {
     if (!result->end_of_input && result->status.ok()) {
       *out_tensors = std::move(result->return_values);
+      RecordBufferDequeue(ctx, *out_tensors);
       *end_of_sequence = false;
       return Status::OK();
     }
     if (errors::IsOutOfRange(result->status)) {
-      // `f` may deliberately raise `errors::OutOfRange` to indicate that we
-      // should terminate the iteration early.
-      *end_of_sequence = true;
-      return Status::OK();
+      if (preserve_cardinality_) {
+        // To guarantee that the transformation preserves the cardinality of the
+        // dataset, we convert `OutOfRange` to `InvalidArgument` as the former
+        // may be interpreted by a caller as the end of sequence.
+        return errors::InvalidArgument(
+            "Function invocation produced OutOfRangeError: ",
+            result->status.error_message());
+      } else {
+        // `f` may deliberately raise `errors::OutOfRange` to indicate
+        // that we should terminate the iteration early.
+        *end_of_sequence = true;
+        return Status::OK();
+      }
     }
     *end_of_sequence = result->end_of_input;
     return result->status;
   }
 
-  void RunnerThread(const std::shared_ptr<IteratorContext>& ctx) {
+  void RunnerThread(const std::shared_ptr<IteratorContext>& ctx)
+      LOCKS_EXCLUDED(*mu_) {
     RecordStart(ctx.get());
     auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
     std::vector<std::shared_ptr<InvocationResult>> new_calls;
-    new_calls.reserve(num_parallel_calls_->value);
+    {
+      tf_shared_lock l(*mu_);  // mu_ == num_parallel_calls_->mu
+      new_calls.reserve(num_parallel_calls_->value);
+    }
     auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
       int64 num_parallel_calls = num_parallel_calls_->value;
       return num_calls_ >= num_parallel_calls ||
@@ -263,6 +299,17 @@ class ParallelMapIterator : public DatasetBaseIterator {
           new_calls.push_back(invocation_results_.back());
           num_calls_++;
         }
+        const auto& stats_aggregator = ctx->stats_aggregator();
+        if (stats_aggregator) {
+          // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
+          // monitoring code or as histogram at fixed time intervals.
+          stats_aggregator->AddScalar(
+              strings::StrCat(prefix_end_, "::active_parallel_calls"),
+              static_cast<float>(num_calls_));
+          stats_aggregator->AddScalar(
+              strings::StrCat(prefix_end_, "::num_parallel_calls"),
+              static_cast<float>(num_parallel_calls_->value));
+        }
         cond_var_->notify_all();
       }
       for (const auto& call : new_calls) {
@@ -272,6 +319,30 @@ class ParallelMapIterator : public DatasetBaseIterator {
     }
   }
 
+  // Determines whether the caller needs to wait for a result. Upon returning
+  // false, `result` will point to the result.
+  bool ShouldWait(std::shared_ptr<InvocationResult>* result)
+      EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+    if (sloppy_) {
+      for (auto it = invocation_results_.begin();
+           it != invocation_results_.end(); ++it) {
+        if ((*it)->notification.HasBeenNotified() &&
+            (it == invocation_results_.begin() || !(*it)->end_of_input)) {
+          std::swap(*result, *it);
+          invocation_results_.erase(it);
+          cond_var_->notify_all();
+          return false;
+        }
+      }
+    } else if (!invocation_results_.empty()) {
+      std::swap(*result, invocation_results_.front());
+      invocation_results_.pop_front();
+      cond_var_->notify_all();
+      return false;
+    }
+    return true;
+  }
+
   Status WriteStatusLocked(IteratorStateWriter* writer, size_t index,
                            const Status& status)
       EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
@@ -302,8 +373,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   }
 
   string CodeKey(size_t index) {
-    return full_name(
-        strings::StrCat("invocation_results[", index, "].code"));
+    return full_name(strings::StrCat("invocation_results[", index, "].code"));
   }
 
   string ErrorMessageKey(size_t index) {
@@ -312,8 +382,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   }
 
   const DatasetBase* const input_dataset_;  // Not owned.
-  const std::function<Status(IteratorContext*)> init_func_;
-  const ParallelMapIteratorFunction map_func_;
+  std::unique_ptr<ParallelMapFunctor> parallel_map_functor_;
   // Used for coordination between the main thread and the runner thread.
   const std::shared_ptr<mutex> mu_;
   // Used for coordination between the main thread and the runner thread. In
@@ -324,6 +393,9 @@ class ParallelMapIterator : public DatasetBaseIterator {
   const std::shared_ptr<condition_variable> cond_var_;
   // Identifies the maximum number of parallel calls.
   const std::shared_ptr<model::SharedState> num_parallel_calls_;
+  // Determines whether outputs can be produced in non-deterministic order.
+  const bool sloppy_;
+  const bool preserve_cardinality_;
   // Counts the number of outstanding calls.
   int64 num_calls_ GUARDED_BY(*mu_) = 0;
   std::unique_ptr<IteratorBase> input_impl_;
@@ -332,26 +404,21 @@ class ParallelMapIterator : public DatasetBaseIterator {
       GUARDED_BY(*mu_);
   std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
   bool cancelled_ GUARDED_BY(*mu_) = false;
+  string prefix_end_;
 };
 
 }  // namespace
 
-std::unique_ptr<IteratorBase> NewParallelMapIterator(
-    const DatasetBaseIterator::BaseParams& params,
-    const DatasetBase* input_dataset, ParallelMapIteratorFunction map_func,
-    int32 num_parallel_calls) {
-  return NewParallelMapIterator(params, input_dataset, nullptr,
-                                std::move(map_func), num_parallel_calls);
-}
-
 std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBaseIterator::BaseParams& params,
     const DatasetBase* input_dataset,
-    std::function<Status(IteratorContext*)> init_func,
-    ParallelMapIteratorFunction map_func, int32 num_parallel_calls) {
-  return std::unique_ptr<IteratorBase>(
-      new ParallelMapIterator(params, input_dataset, std::move(init_func),
-                              std::move(map_func), num_parallel_calls));
+    std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
+    int32 num_parallel_calls, bool sloppy, bool preserve_cardinality) {
+  return MakeUnique<ParallelMapIterator>(
+      params, input_dataset,
+      ParallelMapIterator::Params{std::move(parallel_map_functor),
+                                  num_parallel_calls, sloppy,
+                                  preserve_cardinality});
 }
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.h b/tensorflow/core/kernels/data/parallel_map_iterator.h
index dc26c5cf25d4ccad295d8ea1a42c26d694dfa000..de30446f2631c7e40e090a03517dcc53fdd873b9 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.h
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.h
@@ -22,31 +22,33 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 
-// A function that transforms elements of one dataset into another
-// asynchronously. The arguments are:
-// 1. An `IteratorContext*` for the context in which the function should
-// execute.
-// 2. A `std::vector<Tensor>` containing the input element.
-// 3. A `std::vector<Tensor>*` to which the function will write the result.
-// 4. A `StatusCallback` that should be invoked when the function is complete.
-using ParallelMapIteratorFunction =
-    std::function<void(IteratorContext*, std::vector<Tensor>,
-                       std::vector<Tensor>*, StatusCallback)>;
-
-// Returns a new iterator that applies `map_func` to the elements of
-// `input_dataset` using the given degree of parallelism. `init_func` (if
-// specified) will be executed when the iterator is initialized (see
-// `IteratorBase::Initialize()`) and enables the user to specify error checking
-// logic that can fail early.
+class ParallelMapFunctor {
+ public:
+  virtual ~ParallelMapFunctor() {}
+
+  // A function that runs when the Iterator is initialized. It enables the user
+  // to specify error checking logic that can fail early.
+  virtual Status InitFunc(IteratorContext* ctx) { return Status::OK(); }
+
+  // A function that transforms elements of one dataset into another
+  // asynchronously. The arguments are:
+  // 1. An `IteratorContext*` for the context in which the function should
+  // execute.
+  // 2. A `std::vector<Tensor>` containing the input element.
+  // 3. A `std::vector<Tensor>*` to which the function will write the result.
+  // 4. A `StatusCallback` that should be invoked when the function is complete.
+  virtual void MapFunc(IteratorContext* ctx, const string& prefix,
+                       std::vector<Tensor> input, std::vector<Tensor>* output,
+                       StatusCallback callback) = 0;
+};
+
+// Returns a new iterator that uses `parallel_map_functor` to apply `MapFunc`
+// to the elements of `input_dataset` using the given degree of parallelism.
 std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBaseIterator::BaseParams& params,
     const DatasetBase* input_dataset,
-    std::function<Status(IteratorContext*)> init_func,
-    ParallelMapIteratorFunction map_func, int32 num_parallel_calls);
-std::unique_ptr<IteratorBase> NewParallelMapIterator(
-    const DatasetBaseIterator::BaseParams& params,
-    const DatasetBase* input_dataset, ParallelMapIteratorFunction map_func,
-    int32 num_parallel_calls);
+    std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
+    int32 num_parallel_calls, bool sloppy, bool preserve_cardinality);
 
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 754ed772db842d2146693a6f27ebdca906cc5e41..08d6de4bf9a654d433e3cb6dddd6ab0cc1435136 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -26,7 +26,7 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class PrefetchDatasetOp::Dataset : public DatasetBase {
@@ -56,6 +56,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
 
   string DebugString() const override { return "PrefetchDatasetOp::Dataset"; }
 
+  int64 Cardinality() const override { return input_->Cardinality(); }
+
  protected:
   Status AsGraphDefInternal(SerializationContext* ctx,
                             DatasetGraphDefBuilder* b,
@@ -103,7 +105,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     Status GetNextInternal(IteratorContext* ctx,
                            std::vector<Tensor>* out_tensors,
                            bool* end_of_sequence) override {
-      auto stats_aggregator = ctx->stats_aggregator();
+      const auto& stats_aggregator = ctx->stats_aggregator();
       {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(EnsurePrefetchThreadStarted(ctx));
@@ -123,7 +125,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
         }
 
         if (!buffer_.empty()) {
-          return Consume(out_tensors, end_of_sequence, stats_aggregator);
+          return Consume(ctx, out_tensors, end_of_sequence);
         }
 
         if (prefetch_thread_finished_) {
@@ -148,6 +150,13 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     }
 
    protected:
+    std::shared_ptr<model::Node> CreateNode(
+        IteratorContext* ctx, model::Node::Args args) const override {
+      return model::MakeAsyncKnownRatioNode(std::move(args),
+                                            /*ratio=*/1,
+                                            /*parameters=*/{});
+    }
+
     Status SaveInternal(IteratorStateWriter* writer) override {
       // Acquire both locks to ensure that the prefetch thread and
       // all GetNext threads are blocked.
@@ -219,9 +228,9 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       std::vector<Tensor> value;
     };
 
-    Status Consume(std::vector<Tensor>* out_tensors, bool* end_of_sequence,
-                   const std::shared_ptr<StatsAggregator>& stats_aggregator)
-        EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    Status Consume(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                   bool* end_of_sequence) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      const auto& stats_aggregator = ctx->stats_aggregator();
       if (stats_aggregator) {
         stats_aggregator->AddToHistogram(
             strings::StrCat(prefix_end_, "::buffer_utilization"),
@@ -239,6 +248,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       Status s = buffer_.front().status;
       if (s.ok()) {
         *out_tensors = std::move(buffer_.front().value);
+        RecordBufferDequeue(ctx, *out_tensors);
       }
       auto_tuner_.RecordConsumption(buffer_.size());
       buffer_.pop_front();
@@ -258,7 +268,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       if (!prefetch_thread_) {
         std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
         prefetch_thread_.reset(ctx->env()->StartThread(
-            {}, "prefetch_thread",
+            {}, "tf_data_prefetch",
             [this, new_ctx]() { PrefetchThread(new_ctx); }));
       }
       return Status::OK();
@@ -309,6 +319,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
         // 3. Signal that the element has been produced.
         {
           mutex_lock l(mu_);
+          RecordBufferEnqueue(ctx.get(), buffer_element.value);
           buffer_.push_back(std::move(buffer_element));
           cond_var_.notify_all();
         }
@@ -384,13 +395,14 @@ void PrefetchDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input,
 }
 
 namespace {
-REGISTER_KERNEL_BUILDER(Name("PrefetchDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("PrefetchDataset").Device(DEVICE_CPU).Priority(2),
                         PrefetchDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("PrefetchDataset")
                             .Device(DEVICE_GPU)
                             .HostMemory("buffer_size")
                             .HostMemory("input_dataset")
-                            .HostMemory("handle"),
+                            .HostMemory("handle")
+                            .Priority(1),
                         PrefetchDatasetOp);
 }  // namespace
 
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.h b/tensorflow/core/kernels/data/prefetch_dataset_op.h
index 588fb25a06b2deb1da775306ac1562c1f213c955..83206374946b246e14b11d728a92741e92a4c990 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.h
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_DATASET_OP_H_
 #define TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_DATASET_OP_H_
 
-#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/kernels/data/prefetch_autotuner.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index 89fbaae3693ae38cacc2c8dd6d4c087ede478978..580702f741814b6bd86cab2d537b3ad49b4f6177 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class RangeDatasetOp : public DatasetOpKernel {
@@ -73,6 +73,14 @@ class RangeDatasetOp : public DatasetOpKernel {
                              step_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      if (step_ > 0) {
+        return std::max(0LL, (stop_ - start_ - 1) / step_ + 1);
+      } else {
+        return std::max(0LL, (start_ - stop_ - 1) / -step_ + 1);
+      }
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -104,9 +112,8 @@ class RangeDatasetOp : public DatasetOpKernel {
           *end_of_sequence = true;
           return Status::OK();
         }
-        Tensor value_tensor(ctx->allocator({}), DT_INT64, {});
-        value_tensor.scalar<int64>()() = next_;
-        out_tensors->emplace_back(std::move(value_tensor));
+        out_tensors->reserve(1);
+        out_tensors->emplace_back(next_);
         *end_of_sequence = false;
         next_ += dataset()->step_;
 
@@ -114,6 +121,11 @@ class RangeDatasetOp : public DatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("next"), next_));
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index c474cb4773abb20f0de9aede1b4c316f7c0e7706..971fd2a43685197892ad0fb3cd37e3709cd144c1 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
@@ -26,7 +26,7 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following ops.
 
 class TextLineDatasetOp : public DatasetOpKernel {
@@ -142,9 +142,9 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
             if (s.ok()) {
               // Produce the line as output.
-              Tensor line_tensor(ctx->allocator({}), DT_STRING, {});
-              line_tensor.scalar<string>()() = line_contents;
-              out_tensors->emplace_back(std::move(line_tensor));
+              out_tensors->emplace_back(ctx->allocator({}), DT_STRING,
+                                        TensorShape({}));
+              out_tensors->back().scalar<string>()() = std::move(line_contents);
               *end_of_sequence = false;
               return Status::OK();
             } else if (!errors::IsOutOfRange(s)) {
@@ -168,6 +168,11 @@ class TextLineDatasetOp : public DatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_file_index"),
@@ -266,6 +271,9 @@ REGISTER_KERNEL_BUILDER(Name("TextLineDataset").Device(DEVICE_CPU),
 class FixedLengthRecordDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
+  explicit FixedLengthRecordDatasetOp(OpKernelConstruction* ctx)
+      : DatasetOpKernel(ctx),
+        op_version_(ctx->def().op() == "FixedLengthRecordDataset" ? 1 : 2) {}
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
     const Tensor* filenames_tensor;
@@ -306,9 +314,17 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
     if (buffer_size == 0) {
       buffer_size = 256 << 10;  // 256 kB as default.
     }
-
+    string compression_type;
+    if (op_version_ > 1) {
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "compression_type",
+                                                      &compression_type));
+      OP_REQUIRES(ctx,
+                  compression_type.empty() || compression_type == "ZLIB" ||
+                      compression_type == "GZIP",
+                  errors::InvalidArgument("Unsupported compression_type."));
+    }
     *output = new Dataset(ctx, std::move(filenames), header_bytes, record_bytes,
-                          footer_bytes, buffer_size);
+                          footer_bytes, buffer_size, compression_type);
   }
 
  private:
@@ -316,18 +332,24 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
    public:
     explicit Dataset(OpKernelContext* ctx, std::vector<string> filenames,
                      int64 header_bytes, int64 record_bytes, int64 footer_bytes,
-                     int64 buffer_size)
+                     int64 buffer_size, const string& compression_type)
         : DatasetBase(DatasetContext(ctx)),
           filenames_(std::move(filenames)),
           header_bytes_(header_bytes),
           record_bytes_(record_bytes),
           footer_bytes_(footer_bytes),
-          buffer_size_(buffer_size) {}
+          buffer_size_(buffer_size),
+          compression_type_(compression_type) {}
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+      if (compression_type_.empty()) {
+        return std::unique_ptr<IteratorBase>(new UncompressedIterator(
+            {this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+      } else {
+        return std::unique_ptr<IteratorBase>(new CompressedIterator(
+            {this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+      }
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -354,22 +376,25 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
       Node* record_bytes = nullptr;
       Node* footer_bytes = nullptr;
       Node* buffer_size = nullptr;
+      Node* compression_type = nullptr;
       TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames));
       TF_RETURN_IF_ERROR(b->AddScalar(header_bytes_, &header_bytes));
       TF_RETURN_IF_ERROR(b->AddScalar(record_bytes_, &record_bytes));
       TF_RETURN_IF_ERROR(b->AddScalar(footer_bytes_, &footer_bytes));
       TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this,
-          {filenames, header_bytes, record_bytes, footer_bytes, buffer_size},
-          output));
+      TF_RETURN_IF_ERROR(b->AddScalar(compression_type_, &compression_type));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this,
+                        {filenames, header_bytes, record_bytes, footer_bytes,
+                         buffer_size, compression_type},
+                        output));
       return Status::OK();
     }
 
    private:
-    class Iterator : public DatasetIterator<Dataset> {
+    class UncompressedIterator : public DatasetIterator<Dataset> {
      public:
-      explicit Iterator(const Params& params)
+      explicit UncompressedIterator(const Params& params)
           : DatasetIterator<Dataset>(params) {}
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -487,16 +512,207 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
       int64 file_pos_limit_ GUARDED_BY(mu_) = -1;
     };
 
+    class CompressedIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit CompressedIterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        do {
+          // We are currently processing a file, so try to read the next record.
+          if (buffered_input_stream_) {
+            const int64 current_pos = buffered_input_stream_->Tell();
+            if (dataset()->compression_type_.empty()) {
+              DCHECK_GE(file_pos_limit_, 0);
+              if (current_pos < file_pos_limit_) {
+                string record;
+                TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes(
+                    dataset()->record_bytes_, &record));
+                // Produce the record as output.
+                Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
+                record_tensor.scalar<string>()() = std::move(record);
+                out_tensors->emplace_back(std::move(record_tensor));
+                *end_of_sequence = false;
+                return Status::OK();
+              }
+            } else {
+              string record;
+              Status s = buffered_input_stream_->ReadNBytes(
+                  dataset()->record_bytes_, &record);
+              if (s.ok()) {
+                lookahead_cache_.append(record);
+                record = lookahead_cache_.substr(0, dataset()->record_bytes_);
+                lookahead_cache_ =
+                    lookahead_cache_.substr(dataset()->record_bytes_);
+                // Produce the record as output.
+                Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
+                record_tensor.scalar<string>()() = std::move(record);
+                out_tensors->emplace_back(std::move(record_tensor));
+                *end_of_sequence = false;
+                return Status::OK();
+              }
+              if (errors::IsOutOfRange(s) && !record.empty()) {
+                uint64 body_size =
+                    current_pos + record.size() -
+                    (dataset()->header_bytes_ + dataset()->footer_bytes_);
+                return errors::DataLoss(
+                    "Excluding the header (", dataset()->header_bytes_,
+                    " bytes) and footer (", dataset()->footer_bytes_,
+                    " bytes), input file \"",
+                    dataset()->filenames_[current_file_index_],
+                    "\" has body length ", body_size,
+                    " bytes, which is not an exact multiple of the record "
+                    "length (",
+                    dataset()->record_bytes_, " bytes).");
+              }
+            }
+
+            // We have reached the end of the current file, so maybe
+            // move on to next file.
+            buffered_input_stream_.reset();
+            file_.reset();
+            ++current_file_index_;
+          }
+
+          // Iteration ends when there are no more files to process.
+          if (current_file_index_ == dataset()->filenames_.size()) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          // Actually move on to next file.
+          if (dataset()->compression_type_.empty()) {
+            uint64 file_size;
+            TF_RETURN_IF_ERROR(ctx->env()->GetFileSize(
+                dataset()->filenames_[current_file_index_], &file_size));
+            file_pos_limit_ = file_size - dataset()->footer_bytes_;
+
+            uint64 body_size = file_size - (dataset()->header_bytes_ +
+                                            dataset()->footer_bytes_);
+
+            if (body_size % dataset()->record_bytes_ != 0) {
+              return errors::InvalidArgument(
+                  "Excluding the header (", dataset()->header_bytes_,
+                  " bytes) and footer (", dataset()->footer_bytes_,
+                  " bytes), input file \"",
+                  dataset()->filenames_[current_file_index_],
+                  "\" has body length ", body_size,
+                  " bytes, which is not an exact multiple of the record length "
+                  "(",
+                  dataset()->record_bytes_, " bytes).");
+            }
+          }
+          TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
+              dataset()->filenames_[current_file_index_], &file_));
+          if (!dataset()->compression_type_.empty()) {
+            const io::ZlibCompressionOptions zlib_options =
+                dataset()->compression_type_ == "ZLIB"
+                    ? io::ZlibCompressionOptions::DEFAULT()
+                    : io::ZlibCompressionOptions::GZIP();
+            file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
+            buffered_input_stream_.reset(new io::ZlibInputStream(
+                file_stream_.get(), dataset()->buffer_size_,
+                dataset()->buffer_size_, zlib_options));
+          } else {
+            buffered_input_stream_.reset(new io::BufferedInputStream(
+                file_.get(), dataset()->buffer_size_));
+          }
+          TF_RETURN_IF_ERROR(
+              buffered_input_stream_->SkipNBytes(dataset()->header_bytes_));
+          lookahead_cache_.clear();
+          if (!dataset()->compression_type_.empty()) {
+            TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes(
+                dataset()->footer_bytes_, &lookahead_cache_));
+          }
+        } while (true);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_file_index"),
+                                               current_file_index_));
+
+        // `buffered_input_stream_` is empty if
+        // 1. GetNext has not been called even once.
+        // 2. All files have been read and iterator has been exhausted.
+        int64 current_pos =
+            buffered_input_stream_ ? buffered_input_stream_->Tell() : -1;
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("current_pos"), current_pos));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        int64 current_file_index;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_file_index"),
+                                              &current_file_index));
+        current_file_index_ = size_t(current_file_index);
+        int64 current_pos;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("current_pos"), &current_pos));
+
+        // Seek to current_pos.
+        buffered_input_stream_.reset();
+        file_.reset();
+        if (current_pos >= 0) {  // There was an active buffered_input_stream_.
+          TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
+              dataset()->filenames_[current_file_index_], &file_));
+          const io::ZlibCompressionOptions zlib_options =
+              dataset()->compression_type_ == "ZLIB"
+                  ? io::ZlibCompressionOptions::DEFAULT()
+                  : io::ZlibCompressionOptions::GZIP();
+          file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
+          buffered_input_stream_.reset(new io::ZlibInputStream(
+              file_stream_.get(), dataset()->buffer_size_,
+              dataset()->buffer_size_, zlib_options));
+          lookahead_cache_.clear();
+          TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes(
+              current_pos - dataset()->footer_bytes_));
+          TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes(
+              dataset()->footer_bytes_, &lookahead_cache_));
+        }
+
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      size_t current_file_index_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<RandomAccessFile> file_
+          GUARDED_BY(mu_);  // must outlive buffered_input_stream_
+      std::unique_ptr<io::RandomAccessInputStream>
+          file_stream_;  // must outlive buffered_input_stream_
+      std::unique_ptr<io::InputStreamInterface> buffered_input_stream_
+          GUARDED_BY(mu_);
+      int64 file_pos_limit_ GUARDED_BY(mu_) = -1;
+      string lookahead_cache_ GUARDED_BY(mu_);
+    };
+
     const std::vector<string> filenames_;
     const int64 header_bytes_;
     const int64 record_bytes_;
     const int64 footer_bytes_;
     const int64 buffer_size_;
+    const string compression_type_;
   };
+  const int op_version_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDataset").Device(DEVICE_CPU),
                         FixedLengthRecordDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDatasetV2").Device(DEVICE_CPU),
+                        FixedLengthRecordDatasetOp);
 
 class TFRecordDatasetOp : public DatasetOpKernel {
  public:
@@ -592,13 +808,16 @@ class TFRecordDatasetOp : public DatasetOpKernel {
         do {
           // We are currently processing a file, so try to read the next record.
           if (reader_) {
-            Tensor result_tensor(ctx->allocator({}), DT_STRING, {});
-            Status s = reader_->ReadRecord(&result_tensor.scalar<string>()());
+            out_tensors->emplace_back(ctx->allocator({}), DT_STRING,
+                                      TensorShape({}));
+            Status s =
+                reader_->ReadRecord(&out_tensors->back().scalar<string>()());
             if (s.ok()) {
-              out_tensors->emplace_back(std::move(result_tensor));
               *end_of_sequence = false;
               return Status::OK();
-            } else if (!errors::IsOutOfRange(s)) {
+            }
+            out_tensors->pop_back();
+            if (!errors::IsOutOfRange(s)) {
               return s;
             }
 
@@ -619,6 +838,11 @@ class TFRecordDatasetOp : public DatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_file_index"),
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index 94e96635ab50fa5c2eb27c40c9e538bba1af46c4..8100f2695b6ee529da252b7b012a7c87ebb0a670 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class RepeatDatasetOp : public UnaryDatasetOpKernel {
@@ -71,6 +71,23 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "RepeatDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (count_ < 0) {
+        if (n == 0) {
+          return 0;
+        }
+        return kInfiniteCardinality;
+      }
+      if (count_ == 0) {
+        return 0;
+      }
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return count_ * n;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -97,6 +114,12 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         return Status::OK();
       }
@@ -139,6 +162,12 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
@@ -210,6 +239,12 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (!first_call_)
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 66466d6a36f9c399f89a7064755bd748624f0152..7134793e26da82e39f53ac21030a9e56e16e26ab 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -16,9 +16,10 @@ limitations under the License.
 #include <deque>
 #include <vector>
 
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
@@ -30,7 +31,7 @@ namespace {
 
 const int64 kLogIntervalMicros = 10 * 1000000;  // 10 seconds.
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
@@ -61,6 +62,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     template <class T>
     class Iterator : public DatasetIterator<T> {
@@ -68,9 +71,9 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
       explicit Iterator(const typename DatasetIterator<T>::Params& params,
                         int64 seed, int64 seed2)
           : DatasetIterator<T>(params),
-            input_impl_(nullptr),
             seed_(seed),
             seed2_(seed2),
+            input_impl_(nullptr),
             epoch_(0),
             num_elements_(0),
             parent_generator_(seed, seed2),
@@ -124,6 +127,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
                 ctx, this->prefix(), &input_impl_));
           }
           if (!end_of_input_sequence) {
+            this->RecordBufferEnqueue(ctx, input_element);
             buffer_[slices_.back()->end % this->dataset()->buffer_size_] =
                 std::move(input_element);
             num_elements_++;
@@ -151,6 +155,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
           int64 index =
               (slices_.front()->start + offset) % this->dataset()->buffer_size_;
           *out_tensors = std::move(buffer_[index]);
+          this->RecordBufferDequeue(ctx, *out_tensors);
           std::swap(
               buffer_[index],
               buffer_[slices_.front()->start % this->dataset()->buffer_size_]);
@@ -164,6 +169,20 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+      void ResetRngs() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // Reset the generators based on the current iterator seeds.
+        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         // Save state needed to restore the random number generators.
@@ -271,6 +290,10 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+      mutex mu_;
+      int64 seed_ GUARDED_BY(mu_);
+      int64 seed2_ GUARDED_BY(mu_);
+
      private:
       // Used to represent slices of `buffer_` that belong to different epochs.
       // The invariant maintained by the implementation is: `start` <= `end`.
@@ -291,19 +314,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
         return out;
       }
 
-      void ResetRngs() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // Reset the generators based on the current iterator seeds.
-        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
-        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
-            &parent_generator_);
-        generator_.Skip(num_random_samples_);
-      }
-
-      mutex mu_;
       std::unique_ptr<std::vector<Tensor>[]> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      int64 seed_ GUARDED_BY(mu_);
-      int64 seed2_ GUARDED_BY(mu_);
       int64 epoch_ GUARDED_BY(mu_);
       int64 num_elements_ GUARDED_BY(mu_);
       std::deque<std::unique_ptr<Slice>> slices_ GUARDED_BY(mu_);
@@ -360,7 +372,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
   }
 
  private:
-  // A dataset that uses a pseduorandom sequence of seeds for the iterators
+  // A dataset that uses a pseudorandom sequence of seeds for the iterators
   // created from it. Used when `reshuffle_each_iteration` is true.
   class ReshufflingDataset : public ShuffleDatasetBase {
    public:
@@ -368,45 +380,126 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
                        int64 buffer_size, int64 seed, int64 seed2, int64 count)
         : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
-          seed2_(seed2),
-          parent_generator_(seed, seed2),
-          generator_(&parent_generator_) {}
+          seed2_(seed2) {}
 
     string DebugString() const override {
-      mutex_lock l(mu_);
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
                              ", ", seed2_, ")::ReshufflingDataset");
     }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      int64 iterator_seed;
-      int64 iterator_seed2;
-      {
-        mutex_lock l(mu_);
-        iterator_seed = Random();
-        iterator_seed2 = Random();
-      }
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Shuffle")},
-                       iterator_seed, iterator_seed2));
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
     }
 
    protected:
+    class RandomSeedGenerator : public ResourceBase {
+     public:
+      RandomSeedGenerator(int64 seed, int64 seed2)
+          : seed_(seed),
+            seed2_(seed2),
+            parent_generator_(seed, seed2),
+            generator_(&parent_generator_) {}
+
+      string DebugString() override {
+        return "ReshufflingDataset::RandomSeedGenerator";
+      }
+
+      void GenerateRandomSeeds(int64* seed1, int64* seed2) {
+        mutex_lock l(mu_);
+        num_random_samples_++;
+        *seed1 = generator_();
+        num_random_samples_++;
+        *seed2 = generator_();
+      }
+
+      int64 num_random_samples() {
+        tf_shared_lock l(mu_);
+        return num_random_samples_;
+      }
+
+      void set_num_random_samples(int64 num_random_samples) {
+        mutex_lock l(mu_);
+        num_random_samples_ = num_random_samples;
+      }
+
+      void Reset() {
+        mutex_lock l(mu_);
+        // Reset the generators based on the current seeds.
+        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+      }
+
+     private:
+      const int64 seed_;
+      const int64 seed2_;
+      mutex mu_;
+      random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
+      random::SingleSampleAdapter<random::PhiloxRandom> generator_
+          GUARDED_BY(mu_);
+      int64 num_random_samples_ GUARDED_BY(mu_) = 0;
+    };
+
     class Iterator : public ShuffleDatasetBase::Iterator<ReshufflingDataset> {
      public:
       explicit Iterator(const Params& params, int64 seed, int64 seed2)
           : ShuffleDatasetBase::Iterator<ReshufflingDataset>(params, seed,
                                                              seed2) {}
 
+      ~Iterator() override { seed_generator_->Unref(); }
+
+      Status Initialize(IteratorContext* ctx) override {
+        // Firstly, lookup or create a seed generator from the IteratorResource
+        // resource_mgr.
+        ResourceMgr* mgr = ctx->resource_mgr();
+        RandomSeedGenerator* seed_generator;
+        const string name = strings::StrCat(prefix(), "::", dataset()->name(),
+                                            "::RandomSeedGenerator");
+
+        int64 dataset_seed, dataset_seed2;
+        {
+          tf_shared_lock l(mu_);
+          // Ideally we'd like to hold this lock in the LookupOrCreate method,
+          // but that trips up our Deadlock detection code.
+          dataset_seed = seed_;
+          dataset_seed2 = seed2_;
+        }
+        TF_RETURN_IF_ERROR(mgr->LookupOrCreate<RandomSeedGenerator>(
+            "tf_data", name, &seed_generator,
+            [dataset_seed,
+             dataset_seed2](RandomSeedGenerator** seed_generator) {
+              // On the first iterator creation, use the original seeds from the
+              // dataset to seed a `RandomSeedGenerator` that will provide seeds
+              // for subsequent repetitions of the same dataset.
+              *seed_generator =
+                  new RandomSeedGenerator(dataset_seed, dataset_seed2);
+              return Status::OK();
+            }));
+        // Now use the seed generator to update the base class Iterator seeds
+        // and random number generator with generated seeds for the current
+        // repetition.
+        mutex_lock l(mu_);
+        seed_generator->GenerateRandomSeeds(&seed_, &seed2_);
+        ResetRngs();
+        seed_generator_ = seed_generator;
+        return Status::OK();
+      }
+
      protected:
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(dataset()->mu_);
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
 
+      Status SaveInternal(IteratorStateWriter* writer) override {
         // Save RNG state of Dataset.
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("ds_num_random_samples"),
-                                dataset()->num_random_samples_));
+                                seed_generator_->num_random_samples()));
 
         // Save the Iterator.
         return ShuffleDatasetBase::Iterator<ReshufflingDataset>::SaveInternal(
@@ -415,24 +508,25 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
-        mutex_lock l(dataset()->mu_);
-
         // Restore RNG state of Dataset.
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("ds_num_random_samples"),
-                               &dataset()->num_random_samples_));
-        dataset()->ResetRngs();
+        int64 num_random_samples;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name("ds_num_random_samples"), &num_random_samples));
+        seed_generator_->set_num_random_samples(num_random_samples);
+        seed_generator_->Reset();
 
         // Restore the Iterator.
         return ShuffleDatasetBase::Iterator<
             ReshufflingDataset>::RestoreInternal(ctx, reader);
       }
+
+     private:
+      RandomSeedGenerator* seed_generator_;
     };
 
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      mutex_lock l(mu_);
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* buffer_size = nullptr;
@@ -453,28 +547,8 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
     }
 
    private:
-    random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random() const
-        EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-      num_random_samples_++;
-      auto out = generator_();
-      return out;
-    }
-
-    void ResetRngs() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-      // Reset the generators based on the current seeds.
-      parent_generator_ = random::PhiloxRandom(seed_, seed2_);
-      generator_ =
-          random::SingleSampleAdapter<random::PhiloxRandom>(&parent_generator_);
-      generator_.Skip(num_random_samples_);
-    }
-
-    mutable int64 seed_ GUARDED_BY(mu_);
-    mutable int64 seed2_ GUARDED_BY(mu_);
-    mutable mutex mu_;
-    mutable random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
-    mutable random::SingleSampleAdapter<random::PhiloxRandom> generator_
-        GUARDED_BY(mu_);
-    mutable int64 num_random_samples_ GUARDED_BY(mu_) = 0;
+    const int64 seed_;
+    const int64 seed2_;
   };
 
   // A dataset that uses the same fixed seed for all iterators created from it.
@@ -485,7 +559,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
                      int64 buffer_size, int64 seed, int64 seed2, int64 count)
         : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
-          seed2_(seed) {}
+          seed2_(seed2) {}
 
     string DebugString() const override {
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
diff --git a/tensorflow/core/kernels/data/single_threaded_executor.cc b/tensorflow/core/kernels/data/single_threaded_executor.cc
index 5b084a16f0be01ef276253a703edfdc17bdede01..89e3881037666299f093ed7423b62c9741ca5dd9 100644
--- a/tensorflow/core/kernels/data/single_threaded_executor.cc
+++ b/tensorflow/core/kernels/data/single_threaded_executor.cc
@@ -65,21 +65,28 @@ class SingleThreadedExecutorImpl : public Executor {
         if (IsRefType(dt)) {
           return errors::Unimplemented(
               "Single-threaded executor does not support reference-typed "
-              "edges.");
+              "edges.  But saw type ",
+              DataTypeString(dt), " in outputs of node ", n->name());
         }
       }
 
       if (n->IsControlFlow()) {
         return errors::Unimplemented(
-            "Single-threaded executor does not support control flow.");
+            "Single-threaded executor does not support control flow.  But saw "
+            "control flow node ",
+            n->name());
       }
       if (n->IsSend() || n->IsHostSend() || n->IsRecv() || n->IsHostRecv()) {
         return errors::Unimplemented(
-            "Single-threaded executor does not support partitioned graphs.");
+            "Single-threaded executor does not support partitioned graphs.  "
+            "But saw send/recv node ",
+            n->name());
       }
       if (n->IsCollective()) {
         return errors::Unimplemented(
-            "Single-threaded executor does not support collective ops.");
+            "Single-threaded executor does not support collective ops.  But "
+            "saw collective node ",
+            n->name());
       }
 
       KernelState& kernel_state = kernels_[i];
diff --git a/tensorflow/core/kernels/data/single_threaded_executor_test.cc b/tensorflow/core/kernels/data/single_threaded_executor_test.cc
index 6244e287bb0db911c04da51c3e1fbdc9ae049e38..7bb51fb8b53d59789f2d1efad04f4ffdf39587e4 100644
--- a/tensorflow/core/kernels/data/single_threaded_executor_test.cc
+++ b/tensorflow/core/kernels/data/single_threaded_executor_test.cc
@@ -51,17 +51,17 @@ class ExecutorTest : public ::testing::Test {
     // when the test completes.
     CHECK(rendez_->Unref());
     delete exec_;
-    delete device_;
   }
 
   // Resets executor_ with a new executor based on a graph 'gdef'.
   void Create(std::unique_ptr<const Graph> graph) {
     const int version = graph->versions().producer();
     LocalExecutorParams params;
-    params.device = device_;
+    params.device = device_.get();
     params.create_kernel = [this, version](const NodeDef& ndef,
                                            OpKernel** kernel) {
-      return CreateNonCachedKernel(device_, nullptr, ndef, version, kernel);
+      return CreateNonCachedKernel(device_.get(), nullptr, ndef, version,
+                                   kernel);
     };
     params.delete_kernel = [](OpKernel* kernel) {
       DeleteNonCachedKernel(kernel);
@@ -86,7 +86,7 @@ class ExecutorTest : public ::testing::Test {
     return exec_->Run(args);
   }
 
-  Device* device_ = nullptr;
+  std::unique_ptr<Device> device_;
   Executor* exec_ = nullptr;
   Executor::Args::Runner runner_;
   Rendezvous* rendez_ = nullptr;
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index b8c7fb15f4bc6cdaf2d9f7ef37a0e484cee4e106..e321066a715d180f0791c9afdfa947560a0fd9ce 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class SkipDatasetOp : public UnaryDatasetOpKernel {
@@ -67,6 +67,14 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "SkipDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return std::max(0LL, n - count_);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -93,6 +101,12 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         return Status::OK();
       }
@@ -149,6 +163,12 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/slide_dataset_op.cc
deleted file mode 100644
index 1e73cfc753b32967c8731ad8fc6fee7d9470f2d4..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/slide_dataset_op.cc
+++ /dev/null
@@ -1,297 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <deque>
-#include <vector>
-
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/util/batch_util.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-// See documentation in ../ops/dataset_ops.cc for a high-level
-// description of the following op.
-
-class SlideDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit SlideDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {}
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 window_size = 0;
-    OP_REQUIRES_OK(
-        ctx, ParseScalarArgument<int64>(ctx, "window_size", &window_size));
-    OP_REQUIRES(
-        ctx, window_size > 0,
-        errors::InvalidArgument("Window size must be greater than zero."));
-    int64 window_shift = 0;
-    OP_REQUIRES_OK(
-        ctx, ParseScalarArgument<int64>(ctx, "window_shift", &window_shift));
-    OP_REQUIRES(
-        ctx, window_shift > 0,
-        errors::InvalidArgument("Window shift must be greater than zero."));
-    int64 window_stride = 0;
-    OP_REQUIRES_OK(
-        ctx, ParseScalarArgument<int64>(ctx, "window_stride", &window_stride));
-    OP_REQUIRES(
-        ctx, window_stride > 0,
-        errors::InvalidArgument("window_stride must be greater than zero."));
-    if (window_size == window_shift && window_stride == 1) {
-      LOG(WARNING) << "window_shift: " << window_shift
-                   << " is equal to window_size: " << window_size
-                   << " and window_stride is 1, use `batch` instead.";
-    }
-    *output = new Dataset(ctx, window_size, window_shift, window_stride, input);
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(OpKernelContext* ctx, int64 window_size, int64 window_shift,
-            int64 window_stride, const DatasetBase* input)
-        : DatasetBase(DatasetContext(ctx)),
-          window_size_(window_size),
-          window_shift_(window_shift),
-          window_stride_(window_stride),
-          input_(input) {
-      input_->Ref();
-
-      const auto& input_shapes = input_->output_shapes();
-      output_shapes_.reserve(input_shapes.size());
-      for (const auto& input_shape : input_shapes) {
-        output_shapes_.emplace_back(
-            PartialTensorShape({-1}).Concatenate(input_shape));
-      }
-    }
-
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          Iterator::Params{this, strings::StrCat(prefix, "::Slide")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return input_->output_dtypes();
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() const override {
-      return strings::StrCat("SlideDatasetOp(", window_size_, ", ",
-                             window_shift_, ", ", window_stride_, ")::Dataset");
-    }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
-      Node* window_size = nullptr;
-      Node* window_shift = nullptr;
-      Node* window_stride = nullptr;
-      TF_RETURN_IF_ERROR(b->AddScalar(window_size_, &window_size));
-      TF_RETURN_IF_ERROR(b->AddScalar(window_shift_, &window_shift));
-      TF_RETURN_IF_ERROR(b->AddScalar(window_stride_, &window_stride));
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this, {input_graph_node, window_size, window_shift, window_stride},
-          output));
-      return Status::OK();
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-
-      Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        const int64 window_size = dataset()->window_size_;
-        const int64 window_shift = dataset()->window_shift_;
-        const int64 window_stride = dataset()->window_stride_;
-        std::vector<std::vector<Tensor>> batch_elements;
-        {
-          mutex_lock l(mu_);
-          if (!input_impl_) {
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-          batch_elements.reserve(window_size);
-
-          // Fill up buffer.
-          size_t target_size = TargetBufferSize(window_size, window_stride);
-          *end_of_sequence = false;
-          for (size_t i = buffer_.size(); i < target_size && !*end_of_sequence;
-               ++i) {
-            std::vector<Tensor> element;
-            TF_RETURN_IF_ERROR(
-                input_impl_->GetNext(ctx, &element, end_of_sequence));
-            if (!*end_of_sequence) {
-              buffer_.push_back(std::move(element));
-            } else {
-              input_impl_.reset();
-            }
-          }
-
-          // Drop the final smaller batch.
-          if (buffer_.size() < target_size) {
-            DCHECK(*end_of_sequence);
-            return Status::OK();
-          }
-
-          for (size_t i = 0; i < window_size; ++i) {
-            batch_elements.emplace_back(buffer_[window_stride * i]);
-          }
-
-          // Drop the data before the next iteration.
-          if (window_shift >= buffer_.size()) {
-            for (size_t i = buffer_.size(); i < window_shift; ++i) {
-              bool end_of_input;
-              std::vector<Tensor> element;
-              TF_RETURN_IF_ERROR(
-                  input_impl_->GetNext(ctx, &element, &end_of_input));
-              if (end_of_input) {
-                input_impl_.reset();
-                break;
-              }
-            }
-            buffer_.clear();
-          } else {
-            buffer_.erase(buffer_.begin(), buffer_.begin() + window_shift);
-          }
-        }
-
-        // Construct output tensors.
-        const size_t num_tuple_components = batch_elements[0].size();
-        const int64 num_batch_elements = batch_elements.size();
-        for (size_t component_index = 0; component_index < num_tuple_components;
-             ++component_index) {
-          const Tensor& first_element = batch_elements[0][component_index];
-          TensorShape batch_component_shape({num_batch_elements});
-          batch_component_shape.AppendShape(first_element.shape());
-          Tensor batch_component(cpu_allocator(), first_element.dtype(),
-                                 batch_component_shape);
-          // Build the output tuple component by copying one slice
-          // from each input element in the batch.
-          for (size_t i = 0; i < num_batch_elements; ++i) {
-            if (batch_elements[i][component_index].shape() !=
-                first_element.shape()) {
-              return errors::InvalidArgument(
-                  "Cannot batch tensors with different shapes in component ",
-                  component_index, ". First element had shape ",
-                  first_element.shape().DebugString(), " and element ", i,
-                  " had shape ",
-                  batch_elements[i][component_index].shape().DebugString(),
-                  ".");
-            }
-            TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
-                std::move(batch_elements[i][component_index]), &batch_component,
-                i));
-          }
-          out_tensors->emplace_back(std::move(batch_component));
-        }
-        *end_of_sequence = false;
-        return Status::OK();
-      }
-
-     protected:
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(mu_);
-        if (!input_impl_) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("input_impl_empty"), ""));
-        } else {
-          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        }
-        // Save buffer.
-        TF_RETURN_IF_ERROR(writer->WriteScalar(strings::StrCat("buffer_size"),
-                                               buffer_.size()));
-        for (int64 i = 0; i < buffer_.size(); i++) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              strings::StrCat("buffer[", i, "]_size"), buffer_[i].size()));
-          for (int64 j = 0; j < buffer_[i].size(); j++) {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                strings::StrCat("buffer[", i, "][", j, "]"), buffer_[i][j]));
-          }
-        }
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        mutex_lock l(mu_);
-        if (!reader->Contains(full_name("input_impl_empty"))) {
-          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        } else {
-          input_impl_.reset();
-        }
-        // Restore buffer.
-        int64 buffer_size;
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(strings::StrCat("buffer_size"), &buffer_size));
-        buffer_.resize(buffer_size);
-        for (int64 i = 0; i < buffer_size; i++) {
-          int64 vector_size;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(
-              strings::StrCat("buffer[", i, "]_size"), &vector_size));
-          buffer_[i].resize(vector_size);
-          for (int64 j = 0; j < vector_size; j++) {
-            TF_RETURN_IF_ERROR(reader->ReadTensor(
-                strings::StrCat("buffer[", i, "][", j, "]"), &buffer_[i][j]));
-          }
-        }
-        return Status::OK();
-      }
-
-     private:
-      size_t TargetBufferSize(int64 window_size, int64 window_stride) {
-        return (window_size - 1) * window_stride + 1;
-      }
-
-      mutex mu_;
-      std::deque<std::vector<Tensor>> buffer_ GUARDED_BY(mu_);
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-    };
-
-    const int64 window_size_;
-    const int64 window_shift_;
-    const int64 window_stride_;
-    const DatasetBase* const input_;
-    std::vector<PartialTensorShape> output_shapes_;
-  };
-};
-
-REGISTER_KERNEL_BUILDER(Name("SlideDataset").Device(DEVICE_CPU),
-                        SlideDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index 85b1e506951279278f348763e5f33eb523257158..be105f8170b8fff79c0c60a76a699a6ee6ba13f9 100644
--- a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -14,17 +14,17 @@ limitations under the License.
 ==============================================================================*/
 #include <numeric>
 
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 template <typename T>
@@ -54,6 +54,8 @@ class Dataset : public DatasetBase {
     return "SparseTensorSliceDatasetOp::Dataset";
   }
 
+  int64 Cardinality() const override { return sparse_tensor_.shape()[0]; }
+
  protected:
   Status AsGraphDefInternal(SerializationContext* ctx,
                             DatasetGraphDefBuilder* b,
@@ -152,6 +154,11 @@ class Dataset : public DatasetBase {
     }
 
    protected:
+    std::shared_ptr<model::Node> CreateNode(
+        IteratorContext* ctx, model::Node::Args args) const override {
+      return model::MakeSourceNode(std::move(args));
+    }
+
     Status SaveInternal(IteratorStateWriter* writer) override {
       mutex_lock l(mu_);
       TF_RETURN_IF_ERROR(writer->WriteScalar(Iterator::full_name("i"), i_));
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD
deleted file mode 100644
index dc591208752c52d3f53484f5a1c564666727bb16..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/sql/BUILD
+++ /dev/null
@@ -1,27 +0,0 @@
-# Description:
-#   SQL library.
-
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-cc_library(
-    name = "sql",
-    srcs = [
-        "driver_manager.cc",
-        "sqlite_query_connection.cc",
-    ],
-    hdrs = [
-        "driver_manager.h",
-        "query_connection.h",
-        "sqlite_query_connection.h",
-    ],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core/kernels/data:dataset",
-        "//tensorflow/core/lib/db:sqlite",
-    ],
-)
diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc
deleted file mode 100644
index 6bbe459332928dddd76233b749c83ca0b7009f1e..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/sql_dataset_ops.cc
+++ /dev/null
@@ -1,216 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <utility>
-
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/sql/driver_manager.h"
-#include "tensorflow/core/kernels/data/sql/query_connection.h"
-#include "tensorflow/core/lib/io/inputbuffer.h"
-#include "tensorflow/core/lib/io/record_reader.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-// See documentation in ../ops/dataset_ops.cc for a high-level
-// description of the following ops.
-
-class SqlDatasetOp : public DatasetOpKernel {
- public:
-  explicit SqlDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    for (const DataType& dt : output_types_) {
-      OP_REQUIRES(ctx,
-                  dt == DT_STRING || dt == DT_INT8 || dt == DT_INT16 ||
-                      dt == DT_INT32 || dt == DT_INT64 || dt == DT_UINT8 ||
-                      dt == DT_UINT16 || dt == DT_BOOL || dt == DT_DOUBLE,
-                  errors::InvalidArgument(
-                      "Each element of `output_types_` must be one of: "
-                      "DT_STRING, DT_INT8, DT_INT16, DT_INT32, DT_INT64, "
-                      "DT_UINT8, DT_UINT16, DT_BOOL, DT_DOUBLE "));
-    }
-    for (const PartialTensorShape& pts : output_shapes_) {
-      OP_REQUIRES(ctx, pts.dims() == 0,
-                  errors::InvalidArgument(
-                      "Each element of `output_shapes_` must be a scalar."));
-    }
-  }
-  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
-    string driver_name;
-    OP_REQUIRES_OK(
-        ctx, ParseScalarArgument<string>(ctx, "driver_name", &driver_name));
-
-    string data_source_name;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "data_source_name",
-                                                    &data_source_name));
-
-    string query;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "query", &query));
-
-    // TODO(b/64276826) Change this check when we add support for other
-    // databases.
-    OP_REQUIRES(ctx, driver_name == "sqlite",
-                errors::InvalidArgument(tensorflow::strings::Printf(
-                    "The database type, %s, is not supported by SqlDataset. "
-                    "The set of supported databases is: {'sqlite'}.",
-                    driver_name.c_str())));
-
-    *output = new Dataset(ctx, driver_name, data_source_name, query,
-                          output_types_, output_shapes_);
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(OpKernelContext* ctx, const string& driver_name,
-            const string& data_source_name, const string& query,
-            const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
-        : DatasetBase(DatasetContext(ctx)),
-          driver_name_(driver_name),
-          data_source_name_(data_source_name),
-          query_(query),
-          output_types_(output_types),
-          output_shapes_(output_shapes) {}
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Sql")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() const override { return "SqlDatasetOp::Dataset"; }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      Node* driver_name_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(driver_name_, &driver_name_node));
-      Node* data_source_name_node;
-      TF_RETURN_IF_ERROR(
-          b->AddScalar(data_source_name_, &data_source_name_node));
-      Node* query_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(query_, &query_node));
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this, {driver_name_node, data_source_name_node, query_node}, output));
-      return Status::OK();
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-      ~Iterator() override {
-        if (query_connection_initialized_) {
-          Status s = query_connection_->Close();
-          if (!s.ok()) {
-            LOG(WARNING) << "Failed to close query connection: " << s;
-          }
-        }
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        if (!query_connection_initialized_) {
-          TF_RETURN_IF_ERROR(InitializeQueryConnection());
-        }
-        next_calls_++;
-        return query_connection_->GetNext(ctx, out_tensors, end_of_sequence);
-      }
-
-     protected:
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(mu_);
-        if (query_connection_initialized_) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("next_calls"), next_calls_));
-        }
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        mutex_lock l(mu_);
-        if (reader->Contains(full_name("next_calls"))) {
-          TF_RETURN_IF_ERROR(InitializeQueryConnection());
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("next_calls"), &next_calls_));
-          int64 rem_next_calls = next_calls_;
-          std::vector<Tensor> out_tensors;
-          bool end_of_sequence = false;
-          while (rem_next_calls--) {
-            TF_RETURN_IF_ERROR(query_connection_->GetNext(ctx, &out_tensors,
-                                                          &end_of_sequence));
-            out_tensors.clear();
-          }
-        } else {
-          query_connection_initialized_ = false;
-        }
-        return Status::OK();
-      }
-
-     private:
-      Status InitializeQueryConnection() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        query_connection_initialized_ = true;
-        query_connection_ =
-            sql::DriverManager::CreateQueryConnection(dataset()->driver_name_);
-        Status s = query_connection_->Open(dataset()->data_source_name_,
-                                           dataset()->query_,
-                                           dataset()->output_types_);
-        next_calls_ = 0;
-        if (!s.ok()) {
-          LOG(WARNING) << "Failed to connect to database: " << s;
-          return s;
-        }
-        return Status::OK();
-      }
-
-      mutex mu_;
-      // TODO(shivaniagrawal): explore ways to seek into a SQLite databases.
-      int64 next_calls_ GUARDED_BY(mu_) = 0;
-      std::unique_ptr<sql::QueryConnection> query_connection_ GUARDED_BY(mu_);
-      bool query_connection_initialized_ GUARDED_BY(mu_) = false;
-    };
-    const string driver_name_;
-    const string data_source_name_;
-    const string query_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
-  };
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("SqlDataset").Device(DEVICE_CPU), SqlDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
deleted file mode 100644
index c09a73fff1157f6f0dce9438f6e61eddf2568530..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
+++ /dev/null
@@ -1,210 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/stats_aggregator.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/lib/random/random.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-class StatsAggregatorWithTagAndPrefix : public StatsAggregator {
- public:
-  StatsAggregatorWithTagAndPrefix(
-      std::shared_ptr<StatsAggregator> stats_aggregator, const string& tag,
-      const string& prefix)
-      : wrapped_(stats_aggregator), tag_(tag), prefix_(prefix) {}
-
-  void AddToHistogram(const string& name,
-                      gtl::ArraySlice<double> values) override {
-    if (!tag_.empty()) {
-      wrapped_->AddToHistogram(strings::StrCat(tag_, "_", name), values);
-    } else {
-      wrapped_->AddToHistogram(name, values);
-    }
-  }
-
-  void AddScalar(const string& name, float value) override {
-    if (!tag_.empty()) {
-      wrapped_->AddScalar(strings::StrCat(tag_, "_", name), value);
-    } else {
-      wrapped_->AddScalar(name, value);
-    }
-  }
-
-  void EncodeToProto(Summary* out_summary) override {
-    wrapped_->EncodeToProto(out_summary);
-  }
-
-  void IncrementCounter(const string& name, const string& label,
-                        int64 val) override {
-    if (!prefix_.empty()) {
-      wrapped_->IncrementCounter(strings::StrCat(prefix_, "/", name), label,
-                                 val);
-    } else {
-      wrapped_->IncrementCounter(strings::StrCat("/tensorflow/", name), label,
-                                 val);
-    }
-  }
-
- private:
-  std::shared_ptr<StatsAggregator> wrapped_;
-  string tag_;
-  string prefix_;
-  TF_DISALLOW_COPY_AND_ASSIGN(StatsAggregatorWithTagAndPrefix);
-};
-
-class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit SetStatsAggregatorDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {}
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    StatsAggregatorResource* stats_aggregator_resource;
-    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
-                                       &stats_aggregator_resource));
-    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
-    string tag;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
-    string prefix;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "counter_prefix", &prefix));
-
-    *output = new Dataset(ctx, input, ctx->input(1), stats_aggregator_resource,
-                          tag, prefix);
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input,
-                     const Tensor& resource_handle,
-                     StatsAggregatorResource* stats_aggregator_resource,
-                     const string& tag, const string& prefix)
-        : DatasetBase(DatasetContext(ctx)),
-          input_(input),
-          resource_handle_(resource_handle),
-          stats_aggregator_resource_(stats_aggregator_resource),
-          tag_(tag),
-          prefix_(prefix) {
-      input_->Ref();
-      stats_aggregator_resource_->Ref();
-    }
-
-    ~Dataset() override {
-      input_->Unref();
-      stats_aggregator_resource_->Unref();
-    }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::SetStatsAggregator")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return input_->output_dtypes();
-    }
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return input_->output_shapes();
-    }
-
-    string DebugString() const override {
-      return "SetStatsAggregatorDatasetOp::Dataset";
-    }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
-      Node* resource_handle_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddTensor(resource_handle_, &resource_handle_node));
-      Node* tag_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
-      Node* prefix_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddScalar(prefix_, &prefix_node));
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this, {input_graph_node, resource_handle_node, tag_node, prefix_node},
-          output));
-      return Status::OK();
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-
-      Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        StatsAggregatorResource* stats_aggregator_resource =
-            dataset()->stats_aggregator_resource_;
-        IteratorContext::Params params;
-        params.env = ctx->env();
-        params.runner = *(ctx->runner());
-        params.stats_aggregator = std::shared_ptr<StatsAggregator>(
-            new StatsAggregatorWithTagAndPrefix(
-                stats_aggregator_resource->stats_aggregator(), dataset()->tag_,
-                dataset()->prefix_));
-        params.lib = ctx->lib();
-        params.function_library = ctx->function_library();
-        params.allocator_getter = ctx->allocator_getter();
-        IteratorContext set_stats_aggregator_ctx(params);
-        return input_impl_->GetNext(&set_stats_aggregator_ctx, out_tensors,
-                                    end_of_sequence);
-      }
-
-     protected:
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        return errors::Unimplemented(dataset()->DebugString(),
-                                     " does not support checkpointing");
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        return errors::Unimplemented(dataset()->DebugString(),
-                                     " does not support checkpointing");
-      }
-
-     private:
-      mutex mu_;
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-    };
-
-    const DatasetBase* const input_;
-    const Tensor resource_handle_;
-    StatsAggregatorResource* stats_aggregator_resource_;
-    string tag_;
-    string prefix_;
-  };
-};
-
-REGISTER_KERNEL_BUILDER(Name("SetStatsAggregatorDataset").Device(DEVICE_CPU),
-                        SetStatsAggregatorDatasetOp);
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index e5cdfdd7321cea3733b0b066754634436df5588d..0a3d5869534ddad9f7ed295171d8deefc2154107 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class TakeDatasetOp : public UnaryDatasetOpKernel {
@@ -68,6 +68,17 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "TakeDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kUnknownCardinality) {
+        return kUnknownCardinality;
+      }
+      if (n == kInfiniteCardinality) {
+        return count_;
+      }
+      return std::min(n, count_);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -94,6 +105,12 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         return Status::OK();
       }
@@ -136,6 +153,12 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index ca4ea25b8999b51ad352bd15eef6f17669c031f9..98c23f23b202dee580fb89f5473f69c61d57c640 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class TensorDatasetOp : public DatasetOpKernel {
@@ -61,6 +61,8 @@ class TensorDatasetOp : public DatasetOpKernel {
 
     string DebugString() const override { return "TensorDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return 1LL; }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -69,10 +71,10 @@ class TensorDatasetOp : public DatasetOpKernel {
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
-        std::vector<std::pair<string, Tensor>>* input_list = ctx->input_list();
-        if (input_list) {
+        if (ctx->optimization_only()) {
           TF_RETURN_IF_ERROR(b->AddPlaceholder(t, &node));
-          input_list->emplace_back(node->name(), t);
+          DCHECK_NE(ctx->input_list(), nullptr);
+          ctx->input_list()->emplace_back(node->name(), t);
         } else {
           TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
         }
@@ -107,6 +109,11 @@ class TensorDatasetOp : public DatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (produced_)
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
deleted file mode 100644
index 2ed636a40095d54278a6373e6bf265c8f330374f..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
+++ /dev/null
@@ -1,652 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <deque>
-
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/util/batch_util.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-bool IsGreaterEqualToOrCompatibleWith(const PartialTensorShape& a,
-                                      const PartialTensorShape& b) {
-  // Returns true if dims[a] >= dims[b], or are compatible.
-  if (a.unknown_rank()) return true;
-  if (a.dims() != b.dims()) return false;
-  for (int d = 0; d < a.dims(); ++d) {
-    if (a.dim_size(d) == -1 || b.dim_size(d) == -1) continue;
-    if (a.dim_size(d) < b.dim_size(d)) return false;
-  }
-  return true;
-}
-
-DataTypeVector PrependQueueType(const DataTypeVector& dtypes) {
-  DataTypeVector out;
-  out.reserve(dtypes.size() + 1);
-  out.push_back(DT_VARIANT);  // The queue component.
-  for (const DataType& d : dtypes) out.push_back(d);
-  return out;
-}
-
-std::vector<PartialTensorShape> PrependQueueShapeWithBatch(
-    const std::vector<PartialTensorShape>& shapes) {
-  std::vector<PartialTensorShape> out;
-  out.reserve(shapes.size() + 1);
-  out.emplace_back(PartialTensorShape({-1}));  // The queue component.
-  for (PartialTensorShape s : shapes) {
-    s.InsertDim(0, -1);  // Unknown batch size.
-    out.push_back(std::move(s));
-  }
-  return out;
-}
-
-class EnqueueInQueueDatasetOp;
-
-class PrependFromQueueAndPaddedBatchDataset : public DatasetBase {
- public:
-  PrependFromQueueAndPaddedBatchDataset(
-      OpKernelContext* ctx, const int64 batch_size, const DatasetBase* input,
-      const DataTypeVector& dtypes,
-      const std::vector<PartialTensorShape>& shapes,
-      std::vector<Tensor> padding_values)
-      : DatasetBase(DatasetContext(ctx)),
-        batch_size_(batch_size),
-        input_(input),
-        dtypes_(dtypes),
-        shapes_(shapes),
-        padding_values_(std::move(padding_values)),
-        dtypes_with_queue_(PrependQueueType(dtypes)),
-        batched_shapes_with_queue_(PrependQueueShapeWithBatch(shapes)) {
-    input_->Ref();
-  }
-
-  ~PrependFromQueueAndPaddedBatchDataset() override { input_->Unref(); }
-
-  std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(new Iterator(
-        {this, strings::StrCat(prefix, "::PrependFromQueueAndPaddedBatch")}));
-  }
-
-  const DataTypeVector& output_dtypes() const override {
-    return dtypes_with_queue_;
-  }
-  const std::vector<PartialTensorShape>& output_shapes() const override {
-    return batched_shapes_with_queue_;
-  }
-
-  string DebugString() const override {
-    return "PrependFromQueueAndPaddedBatchDatasetOp::Dataset";
-  }
-
- protected:
-  Status AsGraphDefInternal(SerializationContext* ctx,
-                            DatasetGraphDefBuilder* b,
-                            Node** output) const override {
-    Node* input_graph = nullptr;
-    TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph));
-    Node* batch_size = nullptr;
-    TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
-
-    std::vector<Node*> padded_shapes;
-    padded_shapes.reserve(shapes_.size());
-    for (int i = 0; i < shapes_.size(); i++) {
-      Node* node;
-      Tensor t(DT_INT64, TensorShape({shapes_[i].dims()}));
-      for (int j = 0; j < shapes_[i].dims(); j++) {
-        t.vec<int64>()(j) = shapes_[i].dim_size(j);
-      }
-      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-      padded_shapes.emplace_back(node);
-    }
-
-    std::vector<Node*> padding_values;
-    padding_values.reserve(padding_values_.size());
-    for (const Tensor& t : padding_values_) {
-      Node* node;
-      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-      padding_values.emplace_back(node);
-    }
-
-    AttrValue output_types;
-    b->BuildAttrValue(dtypes_, &output_types);
-
-    AttrValue output_shapes;
-    b->BuildAttrValue(batched_shapes_with_queue_, &output_shapes);
-
-    AttrValue N;
-    b->BuildAttrValue<int64>(shapes_.size(), &N);
-
-    TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, input_graph}, {1, batch_size}},
-                                     {{2, padded_shapes}, {3, padding_values}},
-                                     {{"Toutput_types", output_types},
-                                      {"output_shapes", output_shapes},
-                                      {"N", N}},
-                                     output));
-
-    return Status::OK();
-  }
-
- private:
-  friend class EnqueueInQueueDatasetOp;
-
-  class Iterator
-      : public DatasetIterator<PrependFromQueueAndPaddedBatchDataset> {
-   public:
-    explicit Iterator(const Params& params)
-        : DatasetIterator<PrependFromQueueAndPaddedBatchDataset>(params) {}
-
-    ~Iterator() override { queue_->Unref(); }
-
-    Status Initialize(IteratorContext* ctx) override {
-      std::unique_ptr<IteratorBase> iterator;
-      TF_RETURN_IF_ERROR(
-          dataset()->input_->MakeIterator(ctx, prefix(), &iterator));
-      queue_ = new TensorQueue(std::move(iterator), dataset()->dtypes_,
-                               dataset()->shapes_);
-      return Status::OK();
-    }
-
-    Status GetNextInternal(IteratorContext* ctx,
-                           std::vector<Tensor>* out_tensors,
-                           bool* end_of_sequence) override {
-      std::vector<std::vector<Tensor>> batch;
-      TF_RETURN_IF_ERROR(queue_->GetNext(ctx, dataset()->batch_size_, &batch,
-                                         end_of_sequence));
-      const auto& dtypes = dataset()->dtypes_;
-      const auto& shapes = dataset()->shapes_;
-      const auto& input_shapes = dataset()->input_->output_shapes();
-      const auto& padding_values = dataset()->padding_values_;
-      const int64 batch_size = batch.size();
-      out_tensors->reserve(dtypes.size());
-
-      std::vector<TensorShape> max_shapes;  // Of non-queue components.
-      for (int i = 0; i < dtypes.size(); ++i) {
-        const PartialTensorShape& shape = shapes[i];
-        TensorShape out_shape({batch_size});
-        for (int r = 0; r < shape.dims(); ++r) {
-          if (shape.dim_size(r) >= 0) {
-            // padded_shape[r] is known.
-            out_shape.AddDim(shape.dim_size(r));
-          } else {
-            // padded_shape[r] is unknown, find the maximum across
-            // the batch.
-            int64 dim = 0;
-            for (int b = 0; b < batch.size(); ++b) {
-              dim = std::max(dim, batch[b][i].dim_size(r));
-            }
-            out_shape.AddDim(dim);
-          }
-        }
-        max_shapes.push_back(std::move(out_shape));
-      }
-
-      Tensor queues_t(cpu_allocator(), DT_VARIANT, TensorShape({batch_size}));
-      if (!batch.empty()) {
-        auto queues = queues_t.flat<Variant>();
-        Variant& queue_inserter = queues(0);
-        queue_inserter = TensorQueueInserter();
-        queue_inserter.get<TensorQueueInserter>()->set_queue(queue_);
-        for (int b = 1; b < batch.size(); ++b) {
-          // Copy the TensorQueueInserter.  Each copy increments the
-          // Ref on the queue_.
-          queues(b) = queues(0);
-        }
-      }
-      out_tensors->push_back(std::move(queues_t));
-
-      for (int i = 0; i < max_shapes.size(); ++i) {
-        Tensor component(cpu_allocator(), dtypes[i], max_shapes[i]);
-        // Try hard to take the fast path.
-        if (shapes[i].IsFullyDefined() &&
-            shapes[i].IsIdenticalTo(input_shapes[i])) {
-          // Take the fast path if we know all the shapes statically.
-          for (int64 b = 0; b < batch.size(); ++b) {
-            TF_RETURN_IF_ERROR(
-                batch_util::CopyElementToSlice(batch[b][i], &component, b));
-          }
-        } else {
-          TF_RETURN_IF_ERROR(
-              batch_util::SetElementZero(&component, padding_values[i]));
-          for (int64 b = 0; b < batch.size(); ++b) {
-            if (batch[b][i].shape() == max_shapes[i]) {
-              TF_RETURN_IF_ERROR(
-                  batch_util::CopyElementToSlice(batch[b][i], &component, b));
-            } else {
-              TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
-                  batch[b][i], &component, b));
-            }
-          }
-        }
-        out_tensors->push_back(std::move(component));
-      }
-
-      // end_of_sequence was set before we populated out_tensors, so
-      // it's ok to return now.
-      return Status::OK();
-    }
-
-   protected:
-    // Work around bug in MSVC that disallows access to protected
-    // members of Iterator from within TensorQueue.
-    class TensorQueue;
-    friend class TensorQueue;
-
-    class TensorQueue : public core::RefCounted {
-     public:
-      TensorQueue(std::unique_ptr<IteratorBase> input_impl,
-                  const DataTypeVector& dtypes,
-                  const std::vector<PartialTensorShape>& shapes)
-          : dtypes_(dtypes),
-            shapes_(shapes),
-            input_impl_(std::move(input_impl)) {}
-
-      void MaybeWaitForNotificationLocked(mutex_lock* lock)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // This essentially just releases the lock and immediately relocks.
-        cv_.wait_for(*lock, std::chrono::milliseconds(0));
-      }
-
-      void NotifyLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { cv_.notify_all(); }
-
-      Status GetNext(IteratorContext* ctx, const int64 batch_size,
-                     std::vector<std::vector<Tensor>>* batch,
-                     bool* end_of_sequence) {
-        mutex_lock lock(mu_);
-
-        *end_of_sequence = false;
-
-        for (int64 b = 0; b < batch_size;) {
-          if (!entries_.empty()) {
-            batch->push_back(std::move(entries_.front()));
-            entries_.pop_front();
-            ++b;
-            continue;
-          } else {
-            if (input_impl_) {
-              // There's still input coming in.
-              std::vector<Tensor> tensors;
-              bool input_end;
-              TF_RETURN_IF_ERROR(
-                  input_impl_->GetNext(ctx, &tensors, &input_end));
-              if (!input_end) {
-                batch->push_back(std::move(tensors));
-                ++b;
-                continue;
-              } else {
-                input_impl_.reset();
-              }
-            }
-            if (!input_impl_) {
-              // There's no more input coming in.
-              if (RefCountIsOne()) {
-                // No TensorQueueInserters in the wild.
-                if (batch->empty()) {
-                  *end_of_sequence = true;
-                }
-                break;
-              } else {
-                MaybeWaitForNotificationLocked(&lock);
-                // If there's data available, try to add entries again.
-                // Otherwise return a smaller batch and hope the next
-                // iterator request has a non-empty or unused queue_.
-                if (entries_.empty()) {
-                  break;
-                }
-              }
-            }
-          }
-        }  // for (int64 b = ... batch_size)
-        return Status::OK();
-      }
-
-      Status Insert(const std::vector<Tensor>& tensors) {
-        if (tensors.size() != dtypes_.size()) {
-          return errors::InvalidArgument(
-              "TensorQueue::Insert: mismatched number of tensors.  Queue "
-              "expects ",
-              dtypes_.size(), " tensors but tried to insert ", tensors.size());
-        }
-        for (int i = 0; i < tensors.size(); ++i) {
-          if (tensors[i].dtype() != dtypes_[i]) {
-            return errors::InvalidArgument(
-                "TensorQueue::Insert: mismatched dtypes at component ", i,
-                ".  Attempted "
-                "to insert tensor of type ",
-                DataTypeString(tensors[i].dtype()),
-                " but queue expected type: ", DataTypeString(dtypes_[i]));
-          }
-          if (!shapes_[i].IsCompatibleWith(tensors[i].shape())) {
-            return errors::InvalidArgument(
-                "TensorQueue::Insert: mismatched shapes at component ", i,
-                ".  Attempted "
-                "to insert tensor with shape ",
-                tensors[i].shape().DebugString(),
-                " but queue expected shape: ", shapes_[i].DebugString());
-          }
-        }
-        mutex_lock lock(mu_);
-        entries_.push_back(tensors);
-        NotifyLocked();
-        return Status::OK();
-      }
-
-      Status Save(Iterator* iter, IteratorStateWriter* writer) {
-        mutex_lock lock(mu_);
-        if (input_impl_) {
-          TF_RETURN_IF_ERROR(iter->SaveInput(writer, input_impl_));
-        } else {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(iter->full_name("input_exhausted"), ""));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(iter->full_name("entries_size"),
-                                               entries_.size()));
-        for (int64 b = 0; b < entries_.size(); ++b) {
-          for (int i = 0; i < dtypes_.size(); ++i) {
-            TF_RETURN_IF_ERROR(
-                writer->WriteTensor(strings::StrCat(iter->full_name("entries"),
-                                                    "[", b, "][", i, "]"),
-                                    entries_[b][i]));
-          }
-        }
-        return Status::OK();
-      }
-
-      Status Restore(Iterator* iter, IteratorContext* ctx,
-                     IteratorStateReader* reader) {
-        mutex_lock l(mu_);
-        if (reader->Contains(iter->full_name("input_exhausted"))) {
-          input_impl_.reset();
-        } else {
-          TF_RETURN_IF_ERROR(iter->dataset_input()->MakeIterator(
-              ctx, iter->prefix(), &input_impl_));
-          TF_RETURN_IF_ERROR(iter->RestoreInput(ctx, reader, input_impl_));
-        }
-        entries_.clear();
-        int64 entries_size = -1;
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(iter->full_name("entries_size"), &entries_size));
-        if (entries_size < 0) {
-          return errors::DataLoss(
-              "Expected entries_size key '", iter->full_name("entries_size"),
-              "' to have nonnegative value, but saw: ", entries_size);
-        }
-        for (int64 b = 0; b < entries_size; ++b) {
-          std::vector<Tensor> entry;
-          for (int i = 0; i < dtypes_.size(); ++i) {
-            Tensor value;
-            TF_RETURN_IF_ERROR(
-                reader->ReadTensor(strings::StrCat(iter->full_name("entries"),
-                                                   "[", b, "][", i, "]"),
-                                   &value));
-            entry.push_back(std::move(value));
-          }
-          entries_.push_back(std::move(entry));
-        }
-        return Status::OK();
-      }
-
-      mutex* mu() { return &mu_; }
-
-     private:
-      DataTypeVector dtypes_;
-      std::vector<PartialTensorShape> shapes_;
-
-      mutex mu_;
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      std::deque<std::vector<Tensor>> entries_ GUARDED_BY(mu_);
-      condition_variable cv_ GUARDED_BY(mu_);
-    };
-
-    const DatasetBase* dataset_input() const { return dataset()->input_; }
-
-    Status SaveInternal(IteratorStateWriter* writer) override {
-      return queue_->Save(this, writer);
-    }
-
-    Status RestoreInternal(IteratorContext* ctx,
-                           IteratorStateReader* reader) override {
-      return queue_->Restore(this, ctx, reader);
-    }
-
-   public:
-    class TensorQueueInserter {
-     public:
-      TensorQueueInserter() : queue_(nullptr) {}
-
-      void set_queue(TensorQueue* queue) {
-        queue_ = queue;
-        queue_->Ref();
-      }
-
-      TensorQueueInserter(const TensorQueueInserter& rhs) {
-        queue_ = rhs.queue_;
-        queue_->Ref();
-      };
-
-      TensorQueueInserter(TensorQueueInserter&& rhs) {
-        queue_ = rhs.queue_;
-        rhs.queue_ = nullptr;
-      }
-
-      TensorQueueInserter& operator=(const TensorQueueInserter& rhs) = delete;
-
-      string TypeName() const { return "tensorflow::TensorQueueInserter"; }
-      string DebugString() const { return TypeName(); }
-
-      void Encode(VariantTensorData*) const {}
-      bool Decode(const VariantTensorData&) { return false; }
-
-      ~TensorQueueInserter() {
-        if (queue_) {
-          mutex_lock lock(*queue_->mu());
-          queue_->Unref();
-          queue_->NotifyLocked();
-          queue_ = nullptr;
-        }
-      }
-
-      Status Insert(const std::vector<Tensor>& tensors) const {
-        CHECK(queue_);
-        return queue_->Insert(tensors);
-      }
-
-     private:
-      mutable TensorQueue* queue_;
-    };
-
-   private:
-    TensorQueue* queue_;
-  };
-
- private:
-  const int64 batch_size_;
-  const DatasetBase* input_;
-  const DataTypeVector dtypes_;
-  const std::vector<PartialTensorShape> shapes_;
-  const std::vector<Tensor> padding_values_;
-  const DataTypeVector dtypes_with_queue_;
-  const std::vector<PartialTensorShape> batched_shapes_with_queue_;
-};
-
-class PrependFromQueueAndPaddedBatchDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit PrependFromQueueAndPaddedBatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 batch_size = 0;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument<int64>(ctx, "batch_size", &batch_size));
-    OP_REQUIRES(
-        ctx, batch_size > 0,
-        errors::InvalidArgument("Batch size must be greater than zero."));
-
-    OpInputList padded_shape_tensors;
-    OP_REQUIRES_OK(ctx,
-                   ctx->input_list("padded_shapes", &padded_shape_tensors));
-    std::vector<PartialTensorShape> padded_shapes;
-    padded_shapes.reserve(padded_shape_tensors.size());
-    OP_REQUIRES(ctx,
-                padded_shape_tensors.size() == input->output_shapes().size(),
-                errors::InvalidArgument("Number of padded shapes (",
-                                        padded_shape_tensors.size(),
-                                        ") must match the number of components "
-                                        "in the input dataset's elements (",
-                                        input->output_shapes().size(), ")"));
-    for (const Tensor& padded_shape_t : padded_shape_tensors) {
-      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(padded_shape_t.shape()),
-                  errors::InvalidArgument("All padded shapes must be vectors"));
-      PartialTensorShape padded_shape;
-      OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape(
-                              padded_shape_t.vec<int64>().data(),
-                              padded_shape_t.NumElements(), &padded_shape));
-      padded_shapes.push_back(std::move(padded_shape));
-    }
-
-    OP_REQUIRES(
-        ctx, input->output_dtypes() == output_types_,
-        errors::InvalidArgument("Input dataset and this dataset "
-                                "have different output_types: ",
-                                DataTypeVectorString(input->output_dtypes()),
-                                " and ", DataTypeVectorString(output_types_)));
-
-    for (int i = 0; i < input->output_shapes().size(); ++i) {
-      // Exclude the queue from the tensor_shapes calculation.
-      const PartialTensorShape& tensor_shape = padded_shapes[i];
-      OP_REQUIRES(
-          ctx,
-          IsGreaterEqualToOrCompatibleWith(tensor_shape,
-                                           input->output_shapes()[i]),
-          errors::InvalidArgument("Incompatible input shapes at component ", i,
-                                  " between input dataset this dataset: ",
-                                  input->output_shapes()[i].DebugString(),
-                                  " vs. ", tensor_shape.DebugString()));
-    }
-
-    OpInputList padding_values_list;
-    OP_REQUIRES_OK(ctx,
-                   ctx->input_list("padding_values", &padding_values_list));
-    std::vector<Tensor> padding_values;
-    OP_REQUIRES(ctx,
-                padding_values_list.size() == input->output_shapes().size(),
-                errors::InvalidArgument(
-                    "Number of padding values (", padding_values_list.size(),
-                    ") must match the number of components in the input "
-                    "dataset's elements (",
-                    input->output_shapes().size(), ")"));
-    for (int i = 0; i < padding_values_list.size(); ++i) {
-      const Tensor& padding_value_t = padding_values_list[i];
-      OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(padding_value_t.shape()),
-          errors::InvalidArgument(
-              "All padding values must be scalars; but at component ", i,
-              " saw shape: ", padding_value_t.shape().DebugString()));
-      OP_REQUIRES(ctx, padding_value_t.dtype() == input->output_dtypes()[i],
-                  errors::InvalidArgument(
-                      "Mismatched type between padding value ", i,
-                      " and input dataset's component ", i, ": ",
-                      DataTypeString(padding_value_t.dtype()), " vs. ",
-                      DataTypeString(input->output_dtypes()[i])));
-      padding_values.push_back(padding_value_t);
-    }
-
-    *output = new PrependFromQueueAndPaddedBatchDataset(
-        ctx, batch_size, input, output_types_, padded_shapes,
-        std::move(padding_values));
-  }
-
- private:
-  DataTypeVector output_types_;
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("PrependFromQueueAndPaddedBatchDataset").Device(DEVICE_CPU),
-    PrependFromQueueAndPaddedBatchDatasetOp);
-
-class EnqueueInQueueDatasetOp : public OpKernel {
- public:
-  explicit EnqueueInQueueDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) override {
-    using TensorQueueInserter =
-        PrependFromQueueAndPaddedBatchDataset::Iterator::TensorQueueInserter;
-
-    // TODO(ebrevdo): accept list of sequence lengths to do proper
-    // sub-slicing of tensors for placement into the queue?
-    const Tensor& tensor_queue_t = ctx->input(0);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_queue_t.shape()),
-                errors::InvalidArgument("queue must be a vector, saw shape: ",
-                                        tensor_queue_t.shape().DebugString()));
-    std::vector<const TensorQueueInserter*> inserters;
-    const int64 batch_size = tensor_queue_t.NumElements();
-    inserters.reserve(batch_size);
-    const Variant* variants = tensor_queue_t.flat<Variant>().data();
-    for (int i = 0; i < batch_size; ++i) {
-      const auto* inserter = variants[i].get<TensorQueueInserter>();
-      OP_REQUIRES(ctx, inserter != nullptr,
-                  errors::InvalidArgument(
-                      "Could not access TensorQueueInserter from queue[", i,
-                      "].  Received variant: ", variants[i].DebugString()));
-      inserters.push_back(inserter);
-    }
-
-    OpInputList components;
-    OP_REQUIRES_OK(ctx, ctx->input_list("components", &components));
-    for (int i = 0; i < components.size(); ++i) {
-      OP_REQUIRES(
-          ctx,
-          components[i].dims() > 0 && components[i].dim_size(0) == batch_size,
-          errors::InvalidArgument(
-              "Expected component ", i, " to have batched shape [", batch_size,
-              ",...], but saw shape: ", components[i].shape().DebugString()));
-    }
-    std::vector<TensorShape> element_shapes;
-    for (int i = 0; i < components.size(); ++i) {
-      TensorShape element_shape = components[i].shape();
-      element_shape.RemoveDim(0);
-      element_shapes.push_back(std::move(element_shape));
-    }
-    for (int64 b = 0; b < batch_size; ++b) {
-      std::vector<Tensor> tensors;
-      tensors.reserve(components.size());
-      for (int i = 0; i < components.size(); ++i) {
-        Tensor t(components[i].dtype(), element_shapes[i]);
-        OP_REQUIRES_OK(ctx,
-                       batch_util::CopySliceToElement(components[i], &t, b));
-        tensors.push_back(std::move(t));
-      }
-      // TODO(ebrevdo): Acquire the lock once for all inserters with
-      // the same underlying queue?  Add InsertLocked?
-      OP_REQUIRES_OK(ctx, inserters[b]->Insert(tensors));
-    }
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("EnqueueInQueueDataset").Device(DEVICE_CPU),
-                        EnqueueInQueueDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 7dc64b0a75ad56a4d56f00a5c69561669692379a..4ba2bde718a6351ff13bc17cf14ae5c60332c6ca 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class TensorSliceDatasetOp : public DatasetOpKernel {
@@ -84,6 +84,8 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
       return "TensorSliceDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return tensors_[0].dim_size(0); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -92,10 +94,10 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
-        std::vector<std::pair<string, Tensor>>* input_list = ctx->input_list();
-        if (input_list) {
+        if (ctx->optimization_only()) {
           TF_RETURN_IF_ERROR(b->AddPlaceholder(t, &node));
-          input_list->emplace_back(node->name(), t);
+          DCHECK_NE(ctx->input_list(), nullptr);
+          ctx->input_list()->emplace_back(node->name(), t);
         } else {
           TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
         }
@@ -125,10 +127,11 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
           out_tensors->reserve(dataset()->tensors_.size());
           for (int i = 0; i < dataset()->tensors_.size(); ++i) {
             const Tensor& t = dataset()->tensors_[i];
-            Tensor t_slice(ctx->allocator({}), t.dtype(),
-                           TensorShape(dataset()->shapes_[i].dim_sizes()));
-            TF_RETURN_IF_ERROR(batch_util::CopySliceToElement(t, &t_slice, i_));
-            out_tensors->emplace_back(std::move(t_slice));
+            out_tensors->emplace_back(
+                ctx->allocator({}), t.dtype(),
+                TensorShape(dataset()->shapes_[i].dim_sizes()));
+            TF_RETURN_IF_ERROR(
+                batch_util::CopySliceToElement(t, &out_tensors->back(), i_));
           }
           ++i_;
           *end_of_sequence = false;
@@ -139,6 +142,11 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeSourceNode(std::move(args));
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
diff --git a/tensorflow/core/kernels/data/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
index 2ad4711aabe40bc6af771396c40006670eaf6b9b..c295631550aa008ccbf1abee0a91b27d64a6ba35 100644
--- a/tensorflow/core/kernels/data/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -41,6 +41,16 @@ class WindowDataset : public DatasetBase {
     return output_shapes_;
   }
 
+  int64 AllocatedBytes() const override {
+    int64 allocated_bytes = 0;
+    for (auto& element : elements_) {
+      allocated_bytes += GetAllocatedBytes(element);
+    }
+    return allocated_bytes;
+  }
+
+  int64 Cardinality() const override { return elements_.size(); }
+
   string DebugString() const override { return "WindowDataset"; }
 
  protected:
diff --git a/tensorflow/core/kernels/data/window_dataset.h b/tensorflow/core/kernels/data/window_dataset.h
index 84cb3c7860e42446fbd901eabefd346b47ca3aeb..fa069e1ddf93aec5f65a0503a57000dd2d2c05a4 100644
--- a/tensorflow/core/kernels/data/window_dataset.h
+++ b/tensorflow/core/kernels/data/window_dataset.h
@@ -17,10 +17,10 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc
index ac44623ce202588431aa2488fff57f8ba3f3ac2b..ae13ae5da8d4c093bdb4d6e168584bda234e4502 100644
--- a/tensorflow/core/kernels/data/window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/window_dataset_op.cc
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 #include "tensorflow/core/kernels/data/window_dataset.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class WindowDatasetOp : public UnaryDatasetOpKernel {
@@ -98,6 +98,15 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
                              window_stride_, drop_remainder_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / window_shift_ +
+             (n % window_shift_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -155,6 +164,7 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
               Status status =
                   input_impl_->GetNext(ctx, &element, end_of_sequence);
               if (!*end_of_sequence) {
+                RecordBufferEnqueue(ctx, element);
                 buffer_.emplace_back(std::move(element), status);
               } else {
                 input_impl_.reset();
@@ -192,8 +202,14 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
                 input_impl_.reset();
               }
             }
+            for (size_t i = 0; i < buffer_.size(); ++i) {
+              RecordBufferDequeue(ctx, buffer_.at(i).result);
+            }
             buffer_.clear();
           } else {
+            for (size_t i = 0; i < window_shift; ++i) {
+              RecordBufferDequeue(ctx, buffer_.at(i).result);
+            }
             buffer_.erase(buffer_.begin(), buffer_.begin() + window_shift);
           }
         }
@@ -232,6 +248,12 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         dataset()->window_shift_);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (!input_impl_) {
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
deleted file mode 100644
index 3f76695bb1cfa3796a7427def7ffeb062af35820..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/kernels/data/dataset.h"
-#include "tensorflow/core/kernels/data/dataset_utils.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/io/record_writer.h"
-#include "tensorflow/core/platform/file_system.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-class ToTFRecordOp : public AsyncOpKernel {
- public:
-  explicit ToTFRecordOp(OpKernelConstruction* ctx)
-      : AsyncOpKernel(ctx),
-        thread_pool_(new thread::ThreadPool(
-            ctx->env(), ThreadOptions(),
-            strings::StrCat("to_tf_record__op_", SanitizeThreadSuffix(name())),
-            1 /* num_threads */, false /* low_latency_hint */)) {}
-
-  template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
-    const Tensor* argument_t;
-    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
-    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
-      return errors::InvalidArgument(argument_name, " must be a scalar");
-    }
-    *output = argument_t->scalar<T>()();
-    return Status::OK();
-  }
-
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    // The call to `iterator->GetNext()` may block and depend on an
-    // inter-op thread pool thread, so we issue the call from the
-    // owned thread pool.
-    thread_pool_->Schedule([this, ctx, done]() {
-      string filename;
-      OP_REQUIRES_OK_ASYNC(
-          ctx, ParseScalarArgument<string>(ctx, "filename", &filename), done);
-      string compression_type;
-      OP_REQUIRES_OK_ASYNC(ctx,
-                           ParseScalarArgument<string>(ctx, "compression_type",
-                                                       &compression_type),
-                           done);
-      std::unique_ptr<WritableFile> file;
-      OP_REQUIRES_OK_ASYNC(ctx, ctx->env()->NewWritableFile(filename, &file),
-                           done);
-      std::unique_ptr<io::RecordWriter> writer;
-      writer.reset(new io::RecordWriter(
-          file.get(), io::RecordWriterOptions::CreateRecordWriterOptions(
-                          compression_type)));
-
-      DatasetBase* dataset;
-      OP_REQUIRES_OK_ASYNC(
-          ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
-      std::unique_ptr<IteratorBase> iterator;
-      OP_REQUIRES_OK_ASYNC(
-          ctx,
-          dataset->MakeIterator(IteratorContext(ctx), "ToTFRecordOpIterator",
-                                &iterator),
-          done);
-
-      std::vector<Tensor> components;
-      components.reserve(dataset->output_dtypes().size());
-      bool end_of_sequence;
-      do {
-        OP_REQUIRES_OK_ASYNC(ctx,
-                             iterator->GetNext(IteratorContext(ctx),
-                                               &components, &end_of_sequence),
-                             done);
-
-        if (!end_of_sequence) {
-          OP_REQUIRES_OK_ASYNC(
-              ctx, writer->WriteRecord(components[0].scalar<string>()()), done);
-        }
-        components.clear();
-      } while (!end_of_sequence);
-      done();
-    });
-  }
-
- private:
-  std::unique_ptr<thread::ThreadPool> thread_pool_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
-                        ToTFRecordOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index 61a2078f46d016ba5d7bd12beebb718692140f7e..1760e63a9e1c6b6262c19baa8354052d7d73fd3c 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/dataset.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-// See documentation in ../ops/dataset_ops.cc for a high-level
+// See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
 class ZipDatasetOp : public DatasetOpKernel {
@@ -76,6 +76,21 @@ class ZipDatasetOp : public DatasetOpKernel {
 
     string DebugString() const override { return "ZipDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 result = kInfiniteCardinality;
+      for (const auto& input : inputs_) {
+        int64 n = input->Cardinality();
+        if (n == kUnknownCardinality) {
+          return kUnknownCardinality;
+        }
+        if (n != kInfiniteCardinality &&
+            (result == kInfiniteCardinality || n < result)) {
+          result = n;
+        }
+      }
+      return result;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -136,6 +151,14 @@ class ZipDatasetOp : public DatasetOpKernel {
       }
 
      protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        // NOTE: Although this dataset may have multiple inputs, it always
+        // consumes one element per input to produce an output.
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         if (input_impls_.empty()) {
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
index 23319e6d0c56788e875eba0720006e2843d78a9d..27020cdabdb867e149bc65743fc60673492436f2 100644
--- a/tensorflow/core/kernels/data_format_ops.cc
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -156,6 +156,16 @@ TF_CALL_int32(REGISTER_KERNEL);
 TF_CALL_int64(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
+#define REGISTER_KERNEL(T)                             \
+  REGISTER_KERNEL_BUILDER(Name("DataFormatVecPermute") \
+                              .Device(DEVICE_CPU)      \
+                              .Label("host")           \
+                              .TypeConstraint<T>("T"), \
+                          DataFormatVecPermuteOp<CPUDevice, T>);
+TF_CALL_int32(REGISTER_KERNEL);
+TF_CALL_int64(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
diff --git a/tensorflow/core/kernels/dataset.h b/tensorflow/core/kernels/dataset.h
deleted file mode 100644
index 69ab78d6355dc2e22c7d77b62123fc0bd2359fc4..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/dataset.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATASET_H_
-#define TENSORFLOW_CORE_KERNELS_DATASET_H_
-
-#include "tensorflow/core/kernels/data/dataset.h"
-
-#endif  // TENSORFLOW_CORE_KERNELS_DATASET_H_
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 1aa8c72d667207cf7d24107da235c0006a6f03f7..750c0318a4df483e1980869a3af8de8aa1efea41 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -434,10 +434,9 @@ struct TransformFilters {
         tile_spatial_size, base_filter_spatial_size, transform_matrix);
 
     auto shard = [&ctx, &args, &transform, &base_filter_rows, &base_filter_cols,
-                  &num_filters_transform, &in_depth, &out_depth,
-                  &filter_shards_row, &filter_shards_col, &tile_spatial_size,
-                  &filter_in, &transform_matrix,
-                  &filter_out](int64 start, int64 limit) {
+                  &num_filters_transform, &in_depth, &filter_shards_row,
+                  &filter_shards_col, &tile_spatial_size, &filter_in,
+                  &transform_matrix, &filter_out](int64 start, int64 limit) {
       // Allocate buffer for pre-processed filter:
       //   [base_filter_rows, base_filter_cols, num_filters_transform, in_depth]
       //
@@ -500,8 +499,9 @@ class GemmFilterPacker {
   typedef Eigen::internal::const_blas_data_mapper<T, int64, Eigen::RowMajor>
       LhsMapper;
   typedef Eigen::internal::gebp_traits<T, T> Traits;
-  Eigen::internal::gemm_pack_lhs<T, int64, LhsMapper, Traits::mr,
-                                 Traits::LhsProgress, Eigen::RowMajor>
+  Eigen::internal::gemm_pack_lhs<
+      T, int64, LhsMapper, Traits::mr, Traits::LhsProgress,
+      typename Traits::LhsPacket4Packing, Eigen::RowMajor>
       pack_lhs;
 
   GemmFilterPacker(const int64 rows, const int64 depth, const T* lhs_input,
@@ -532,9 +532,9 @@ struct PackFilters {
     const int64 out_depth = args.out_depth;
     const int64 num_filters = filter_shards_row * filter_shards_col * out_depth;
 
-    auto shard = [&ctx, &packed_filters, &filter_transform_data,
-                  &tile_spatial_size, &in_depth, &out_depth, &filter_shards_row,
-                  &filter_shards_col, &num_filters](int64 start, int64 limit) {
+    auto shard = [&ctx, &packed_filters, &filter_transform_data, &in_depth,
+                  &out_depth, &filter_shards_row, &filter_shards_col,
+                  &num_filters](int64 start, int64 limit) {
       const int64 filter_coord_stride = num_filters * in_depth;
       for (int64 i = start; i < limit; ++i) {
         // Allocate filter buffer [out_depth, shard_rows, shard_cols, in_depth].
@@ -787,7 +787,7 @@ struct TransformOutputTile {
             const int64 shard_base = sr * filter_shards_col + sc;
             const int64 out_buf_base = tile_base + out_depth_base + shard_base;
 
-            // Calcuate output indices and outputs to drop (if needed).
+            // Calculate output indices and outputs to drop (if needed).
             const int64 out_r_start =
                 in_r + args.pad_rows - sr * tile_stride_rows;
             // NOTE: The index 't' for 'num_tiles is used in index calculation
@@ -1003,9 +1003,9 @@ struct DeepConv2D<CPUDevice, T> {
         out_tile_spatial_size, tile_spatial_size, output_transform_matrix);
 
     auto shard = [&ctx, &args, &transform, &packed_filters, &in_depth,
-                  out_depth, tile_rows, tile_cols, out_tile_rows, out_tile_cols,
-                  filter_shards_row, filter_shards_col, tile_spatial_size,
-                  &input, &tile_transform_matrix, &output_transform_matrix,
+                  out_depth, out_tile_rows, out_tile_cols, filter_shards_row,
+                  filter_shards_col, tile_spatial_size, &input,
+                  &tile_transform_matrix, &output_transform_matrix,
                   &output](int64 batch_start, int64 batch_limit) {
       const int64 row_tiles =
           (args.out_rows + out_tile_rows - 1) / out_tile_rows +
diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
index 17a85d97736c7e01f5cd19c67215567ba67ed1e4..25c57384ca9aa447b4ed92910ca481ff42290b2c 100644
--- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
@@ -59,6 +59,7 @@ struct DenseUpdate<GPUDevice, T, SUB> {
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_int32(DEFINE_GPU_KERNELS);
 TF_CALL_int64(DEFINE_GPU_KERNELS);
+TF_CALL_int8(DEFINE_GPU_KERNELS);
 #undef DEFINE_GPU_KERNELS
 
 #define DEFINE_GPU_KERNELS(T) \
@@ -66,6 +67,7 @@ TF_CALL_int64(DEFINE_GPU_KERNELS);
 TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_int32(DEFINE_GPU_KERNELS);
 TF_CALL_int64(DEFINE_GPU_KERNELS);
+TF_CALL_int8(DEFINE_GPU_KERNELS);
 #undef DEFINE_GPU_KERNELS
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 76afd6f18c23157d79375ff1340a0fb655ab6852..e811968d277ba3594341a59e8d6262cac637e602 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -175,7 +175,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
   // Holds block plus halo and filter data for blockDim.x depths.
   extern __shared__ __align__(8) unsigned char shared_memory[];
-  static_assert(sizeof(S) <= 8, "Insufficient alignement detected");
+  static_assert(sizeof(S) <= 8, "Insufficient alignment detected");
   S* const shared_data = reinterpret_cast<S*>(shared_memory);
 
   const int num_batches = args.batch;
@@ -459,7 +459,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
   // Holds block plus halo and filter data for blockDim.z depths.
   extern __shared__ __align__(8) unsigned char shared_memory[];
-  static_assert(sizeof(S) <= 8, "Insufficient alignement detected");
+  static_assert(sizeof(S) <= 8, "Insufficient alignment detected");
   S* const shared_data = reinterpret_cast<S*>(shared_memory);
 
   const int num_batches = args.batch;
@@ -764,7 +764,7 @@ Status LaunchDepthwiseConv2dGPU(OpKernelContext* ctx, const DepthwiseArgs& args,
   const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
                                       kKnownDepthMultiplier < 0
                                   ? std::numeric_limits<int>::max()
-                                  : device.getNumCudaMultiProcessors();
+                                  : device.getNumGpuMultiProcessors();
   kernel<<<std::min(max_block_count, config.block_count),
            config.thread_per_block, 0, device.stream()>>>(args, input, filter,
                                                           output, num_outputs);
@@ -1176,7 +1176,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
   assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z));
   // Holds block plus halo and filter data for blockDim.x depths.
   extern __shared__ __align__(8) unsigned char shared_memory[];
-  static_assert(sizeof(S) <= 8, "Insufficient alignement detected");
+  static_assert(sizeof(S) <= 8, "Insufficient alignment detected");
   S* const shared_data = reinterpret_cast<S*>(shared_memory);
 
   const int num_batches = args.batch;
@@ -1448,7 +1448,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
   assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x));
   // Holds block plus halo and filter data for blockDim.z depths.
   extern __shared__ __align__(8) unsigned char shared_memory[];
-  static_assert(sizeof(S) <= 8, "Insufficient alignement detected");
+  static_assert(sizeof(S) <= 8, "Insufficient alignment detected");
   S* const shared_data = reinterpret_cast<S*>(shared_memory);
 
   const int num_batches = args.batch;
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 3c988db5e618b976b5b2d45a9bfc386485249826..572d04ae2c464d493508d494ba325a33eb92d4c1 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -142,7 +142,7 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
         OP_REQUIRES(
             c, FastBoundsCheck(p, num_partitions_),
             errors::InvalidArgument("indices[", i,
-                                    "] has been asynchronously overwitten and "
+                                    "] has been asynchronously overwritten and "
                                     "is no longer in range!"));
         auto oi = output_index[p];
         OP_REQUIRES(c, FastBoundsCheck(oi, out_flat[p].dimension(0)),
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
index fb2a4cc8ef5335d677601dcae7f5ad185bd3df5e..f21f2acf2622a56cc3d6f58d259f79788a314dfb 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -327,6 +327,7 @@ struct ParallelDynamicStitchOpCPU : DynamicStitchOpImplCPU<T, true> {
 
 TF_CALL_POD_STRING_TYPES(REGISTER_DYNAMIC_STITCH);
 TF_CALL_variant(REGISTER_DYNAMIC_STITCH);
+TF_CALL_QUANTIZED_TYPES(REGISTER_DYNAMIC_STITCH);
 #undef REGISTER_DYNAMIC_STITCH
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/eigen_activations_test.cc b/tensorflow/core/kernels/eigen_activations_test.cc
index 34952f5abb8526f0317ba8a674948fada4dc0ce7..195525b02f9c3a829a8c2c3572317a1db5116d11 100644
--- a/tensorflow/core/kernels/eigen_activations_test.cc
+++ b/tensorflow/core/kernels/eigen_activations_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/eigen_activations.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace Eigen {
diff --git a/tensorflow/core/kernels/eigen_attention_test.cc b/tensorflow/core/kernels/eigen_attention_test.cc
index 08f61877182cce36316752b7dd17dee3bd2efaac..8886dba49613b85efecf6b13870a4ec93b09621a 100644
--- a/tensorflow/core/kernels/eigen_attention_test.cc
+++ b/tensorflow/core/kernels/eigen_attention_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/eigen_attention.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace Eigen {
diff --git a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
index 673ec1458b8fb7c187723be360f4b9be164fa547..e5500ba7dadc5bb908cc1a2727b7d085b7a66d30 100644
--- a/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_backward_spatial_convolutions_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/eigen_backward_spatial_convolutions.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.h b/tensorflow/core/kernels/eigen_contraction_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..66e93a83af2e5a7aa40818067638bfdde8dd42c9
--- /dev/null
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.h
@@ -0,0 +1,239 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_
+#define TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_
+
+// Depending on a build configuration this header provides custom kernel for
+// Eigen tensor contractions (small matrix multiplication kernel used to
+// multiple together blocks of the original tensors).
+//
+// 1) --define tensorflow_mkldnn_contraction_kernel=1
+//    Use Mkldnn single threaded sgemm. The mkldnn kernels are generated at
+//    runtime and use avx/avx2/fma/avx512 based on cpu status registers
+//    (https://en.wikipedia.org/wiki/CPUID).
+//
+// If you use `tensor.contract(other_tensor)` in your code, you must include
+// this header to get the benefit of custom contraction kernel:
+//
+//   #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+//   #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+//   #endif
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "mkldnn.h"
+
+namespace Eigen {
+namespace internal {
+
+// Enabled by build option: "--define tensorflow_mkldnn_contraction_kernel=1"
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+
+template <typename Scalar, typename IndexType, typename DataMapper,
+          int StorageOrder>
+struct mkldnn_gemm_pack;
+
+// mkl_gemm_pack for ColMajor storage order.
+template <typename Scalar, typename IndexType, typename DataMapper>
+struct mkldnn_gemm_pack<Scalar, IndexType, DataMapper,
+                        /*StorageOrder*/ ColMajor> {
+  typedef typename internal::packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+
+  enum { PacketSize = internal::packet_traits<Scalar>::size };
+
+  EIGEN_DONT_INLINE
+  void operator()(Scalar* block, const DataMapper& data_mapper, IndexType rows,
+                  IndexType cols) {
+    const IndexType unrolled_rows =
+        (rows / (4 * PacketSize)) * (4 * PacketSize);
+    const IndexType vectorized_rows = (rows / PacketSize) * PacketSize;
+
+    for (IndexType col = 0; col < cols; ++col) {
+      LinearMapper lm = data_mapper.getLinearMapper(0, col);
+
+      // Give compiler a strong possibility to unroll the loop.
+      for (IndexType i = 0; i < unrolled_rows; i += 4 * PacketSize) {
+        for (IndexType j = 0; j < 4; ++j) {
+          const Packet p = lm.template loadPacket<Packet>(i + j * PacketSize);
+          internal::pstoreu(block + j * PacketSize, p);
+        }
+        block += 4 * PacketSize;
+      }
+
+      // Process remaining rows with packets.
+      for (IndexType i = unrolled_rows; i < vectorized_rows; i += PacketSize) {
+        const Packet p = lm.template loadPacket<Packet>(i);
+        internal::pstoreu(block, p);
+        block += PacketSize;
+      }
+
+      // Finalize with coefficients.
+      for (IndexType i = vectorized_rows; i < rows; ++i) {
+        *block = lm(i);
+        ++block;
+      }
+    }
+  }
+};
+
+template <typename Scalar, typename IndexType, typename OutputMapper,
+          bool ConjugateLhs = false, bool ConjugateRhs = false>
+struct mkldnn_gemm_kernel;
+
+// mkldnn_gemm_kernel for floats defined as a thin layer on top of mkldnn_sgemm.
+template <typename IndexType, typename OutputMapper, bool ConjugateLhs,
+          bool ConjugateRhs>
+struct mkldnn_gemm_kernel</*Scalar*/ float, IndexType, OutputMapper,
+                          ConjugateLhs, ConjugateRhs> {
+  EIGEN_DONT_INLINE
+  void operator()(const OutputMapper& output, const float* blockA,
+                  const float* blockB, const IndexType rows,
+                  const IndexType depth, const IndexType cols, float alpha) {
+    static const int max_index = (std::numeric_limits<int>::max)();
+
+    eigen_assert(max_index >= rows);
+    eigen_assert(max_index >= cols);
+    eigen_assert(max_index >= depth);
+    eigen_assert(max_index >= output.stride());
+
+    const int m = static_cast<int>(rows);
+    const int n = static_cast<int>(cols);
+    const int k = static_cast<int>(depth);
+
+    const char transposeA = ConjugateLhs ? 'Y' : 'N';
+    const char transposeB = ConjugateRhs ? 'Y' : 'N';
+
+    const int ldA = ConjugateLhs ? k : m;
+    const int ldB = ConjugateRhs ? n : k;
+    const int ldC = static_cast<int>(output.stride());
+
+    const float beta = 1.0;
+
+    mkldnn_status_t st = mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k,
+                                      &alpha, blockA, &ldA, blockB, &ldB, &beta,
+                                      const_cast<float*>(output.data()), &ldC);
+    eigen_assert(st == 0);
+
+    // eigen_assert is a no-op in optimized mode so we add these to avoid
+    // compiler's unused-variable errors.
+    EIGEN_UNUSED_VARIABLE(max_index);
+    EIGEN_UNUSED_VARIABLE(st);
+  }
+};
+
+// For mkldnn_sgemm having the right dimensions (especially for small matrices)
+// is more important than fitting all the working set in L1/L2 caches.
+// TODO(ezhulenev): Do better heuristics.
+template <typename StorageIndex, int sharding_type>
+class TensorContractionBlocking<float, float, float, StorageIndex,
+                                sharding_type> {
+  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
+  using Scalar = float;
+
+  // Adjust the block sizes to work well with mkldnn kernels.
+
+  // Multiply default choice of block size along M and N dimensions.
+  // TODO(ezhulenev): Explore if this can work in general (kScaleM=2.0 worked
+  // well in some of models).
+  static constexpr float kScaleM = 1.5;
+  static constexpr float kScaleN = 1.0;
+
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48.
+  static const StorageIndex kUnrollM = 48;
+
+  // Mkldnn Avx/Avx2/Avx512 unroll factors are: 6/6/8.
+  static const StorageIndex kUnrollN = 24;
+
+ public:
+  TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n,
+                            StorageIndex num_threads = 1)
+      : kc_(k), mc_(m), nc_(n) {
+    // 1. Compute block sizes using default Eigen heuristics.
+    if (sharding_type == ShardByCol) {
+      computeProductBlockingSizes<Scalar, Scalar, 1>(kc_, mc_, nc_,
+                                                     num_threads);
+    } else {
+      computeProductBlockingSizes<Scalar, Scalar, 1>(kc_, nc_, mc_,
+                                                     num_threads);
+    }
+
+    // 2. And refine them to work well with mkldnn sgemm.
+    mc_ = (std::min)(
+        m, Eigen::divup(static_cast<StorageIndex>(mc_ * kScaleM), kUnrollM) *
+               kUnrollM);
+    nc_ = (std::min)(
+        n, Eigen::divup(static_cast<StorageIndex>(nc_ * kScaleN), kUnrollN) *
+               kUnrollN);
+
+    // We split Kth dimensions in roughly equal slices.
+    StorageIndex target_k_slices =
+        (std::max)(StorageIndex(1), Eigen::divup(k, kc_));
+    StorageIndex packet_size = 8;
+    StorageIndex target_bk =
+        Eigen::divup(k / target_k_slices, packet_size) * packet_size;
+    kc_ = (std::min)(k, target_bk);
+  }
+
+  EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+
+ private:
+  StorageIndex kc_;
+  StorageIndex mc_;
+  StorageIndex nc_;
+};
+
+template <typename StorageIndex, typename OutputMapper, typename LhsMapper,
+          typename RhsMapper>
+struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
+                               LhsMapper, RhsMapper> {
+  // For now mkldnn has only mkldnn_sgemm (gemm for floats).
+  using Scalar = float;
+  using Traits = typename internal::gebp_traits<Scalar, Scalar>;
+
+  using LhsPacker = mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename LhsMapper::SubMapper, ColMajor>;
+  using RhsPacker = mkldnn_gemm_pack<Scalar, StorageIndex,
+                                     typename RhsMapper::SubMapper, ColMajor>;
+  using GemmKernel = mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>;
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packLhs(
+      Scalar* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(lhsBlock, data_mapper, rows, depth);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packRhs(
+      Scalar* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
+      const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void invoke(
+      const OutputMapper& output_mapper, const Scalar* lhsBlock,
+      const Scalar* rhsBlock, const StorageIndex rows, const StorageIndex depth,
+      const StorageIndex cols, const Scalar alpha) {
+    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+  }
+};
+
+#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_CONTRACTION_KERNEL_H_
diff --git a/tensorflow/core/kernels/eigen_cuboid_convolution.h b/tensorflow/core/kernels/eigen_cuboid_convolution.h
index 6a9a2accd8d807834930411d2cd1d1e0e9d3c55f..3182307e51e5fc2912ff7e178fbeab6c73d47d03 100644
--- a/tensorflow/core/kernels/eigen_cuboid_convolution.h
+++ b/tensorflow/core/kernels/eigen_cuboid_convolution.h
@@ -19,6 +19,10 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_volume_patch.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -51,11 +55,10 @@ namespace internal {
 //   col - index of the extracted patch (in code: patchIndex)
 //         patchIndex ∈ [0..num_patches * OTHERS] (batch and other dimensions)
 //
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar_,
-          typename Index, typename nocontract_t, typename contract_t, int Side,
-          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered,
-          int Alignment>
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar_, typename Index,
+          typename nocontract_t, typename contract_t, int Side, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionInputMapper<
     Scalar_, Index, Side,
     TensorEvaluator<const TensorReshapingOp<NewDimension,
@@ -332,13 +335,6 @@ class TensorContractionInputMapper<
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth,
-                                             const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_impl.template packet<Unaligned>(inputIndex);
-  }
-
  private:
   friend class TensorContractionSubMapper<
       Scalar, Index, Side,
@@ -681,11 +677,10 @@ class TensorContractionInputMapper<
   const TensorEvaluator<ArgType, Device> m_impl;
 };
 
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar,
-          typename Index, typename nocontract_t, typename contract_t, int Side,
-          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered,
-          int Alignment>
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, int Side, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionSubMapper<
     Scalar, Index, Side,
     TensorEvaluator<const TensorReshapingOp<NewDimension,
@@ -880,6 +875,12 @@ class TensorContractionSubMapper<
     const Index inputIndex = depth + baseIndex;
     return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
   }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth,
+                                            const Index baseIndex) const {
+    const Index inputIndex = depth + baseIndex;
+    return m_base_mapper.m_impl.coeff(inputIndex);
+  }
 
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE bool padPlane(const Index plane) const {
@@ -948,7 +949,9 @@ class TensorContractionSubMapper<
   }
 
  private:
-  const ParentMapper& m_base_mapper;
+  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
+                                     // performs better in benchmarks.
+
   Index m_depth_offset;  // First row in the input matrix
   Index m_col_offset;    // First col in the input matrix
 
@@ -991,11 +994,14 @@ class TensorContractionSubMapper<
 // *) nr - number of registers along the 'n' dimension.
 //    See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
 //    Multiplication" paper.
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar,
-          typename Index, typename nocontract_t, typename contract_t,
-          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered,
-          int Alignment, int nr>
+//
+// TODO(ezhulenev): Add support for squeezing reads along two innermost
+// dimensions (see eigen_spatial_convolutions).
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          int nr>
 struct gemm_pack_rhs<
     Scalar, Index,
     TensorContractionSubMapper<
@@ -1170,11 +1176,13 @@ struct gemm_pack_rhs<
 
 // Template specialization for packet_size = 2. We must special-case packet
 // blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar,
-          typename Index, typename nocontract_t, typename contract_t,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
-          int nr>
+//
+// TODO(ezhulenev): Add support for squeezing reads along two innermost
+// dimensions (see eigen_spatial_convolutions).
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
 struct gemm_pack_rhs<
     Scalar, Index,
     TensorContractionSubMapper<
@@ -1351,11 +1359,10 @@ struct gemm_pack_rhs<
 };
 
 // Special case for non-vectorized types such as float16 (packet_size = 1).
-template <typename NewDimension, DenseIndex Planes, DenseIndex Rows,
-          DenseIndex Cols, typename ArgType, typename Device, typename Scalar,
-          typename Index, typename nocontract_t, typename contract_t,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
-          int nr>
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar, typename Index,
+          typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
+          bool inner_dim_reordered, int Alignment, int nr>
 struct gemm_pack_rhs<
     Scalar, Index,
     TensorContractionSubMapper<
@@ -1425,6 +1432,170 @@ struct gemm_pack_rhs<
   }
 };
 
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+// Arrange a block of the right input matrix (in our case it's always a "virtual
+// matrix" constructed from extracted volume patches) in contiguous memory.
+//
+// Mkldnn doesn't require Lhs/Rhs blocks to be packed in any specific format, so
+// this is basically the same as taking a slice of the matrix. Knowing
+// properties of the original patch op we can do it more efficient than default
+// mkldnn_gemm_pack.
+//
+// TODO(ezhulenev): mkldnn_gemm_pack for spatial convolutions supports squeezing
+// reads along the 2 innermost dimensions, add it here if needed.
+template <typename NewDimension, Index Planes, Index Rows, Index Cols,
+          typename ArgType, typename Device, typename Scalar,
+          typename StorageIndex, typename nocontract_t, typename contract_t,
+          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered,
+          int Alignment>
+struct mkldnn_gemm_pack<
+    Scalar, StorageIndex,
+    TensorContractionSubMapper<
+        Scalar, StorageIndex, Rhs,
+        TensorEvaluator<const TensorReshapingOp<
+                            NewDimension, const TensorVolumePatchOp<
+                                              Planes, Rows, Cols, ArgType> >,
+                        Device>,
+        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+        inner_dim_reordered, Alignment>,
+    ColMajor> {
+  typedef TensorContractionSubMapper<
+      Scalar, StorageIndex, Rhs,
+      TensorEvaluator<const TensorReshapingOp<
+                          NewDimension, const TensorVolumePatchOp<
+                                            Planes, Rows, Cols, ArgType> >,
+                      Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      SubMapper;
+
+  typedef SubMapper DataMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_DONT_INLINE
+  void operator()(Scalar* block, const DataMapper& rhs, StorageIndex rows,
+                  StorageIndex cols) {
+    const bool standard_patches = !rhs.nonStandardPatches();
+
+    if (standard_patches && rhs.patchDepth() % packet_size == 0) {
+      packStandardPatches<true>(block, rhs, rows, cols);
+
+    } else if (standard_patches) {
+      packStandardPatches<false>(block, rhs, rows, cols);
+
+    } else {
+      // With non-standard patches we don't do any vectorized loads.
+      // TODO(ezhulenev): It doesn't look like that we should completely give up
+      // on packets. Make this code path faster!
+      for (StorageIndex col = 0; col < cols; ++col) {
+        SubMapper lm = rhs.getLinearMapper(0, col);
+        for (StorageIndex i = 0; i < rows; ++i) {
+          *block = lm(i);
+          ++block;
+        }
+      }
+    }
+  }
+
+ private:
+  // Pack standard volume patches:
+  //
+  // - patch_depth_is_multiple_of_packet_size=true: We are guaranteed to have
+  //   depth dimension size to be a multiple of packet size, so we can skip all
+  //   non vectorized loads and checks.
+  //
+  template <bool patch_depth_is_multiple_of_packet_size>
+  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
+                                               const DataMapper& rhs,
+                                               StorageIndex rows,
+                                               StorageIndex cols) {
+    eigen_assert(!rhs.nonStandardPatches());
+
+    // Give vectorized_rows the name used in all other gemm_pack_rhs above.
+    const Index peeled_k = (rows / packet_size) * packet_size;
+
+    const Index start_col = rhs.colOffset();
+    const Index max_col = rhs.maxCol(peeled_k);
+
+    for (StorageIndex col = 0; col < cols; ++col) {
+      SubMapper lm = rhs.getLinearMapper(0, col);
+
+      Index k = 0;
+      for (Index c = start_col; c < max_col; ++c) {
+        eigen_assert(k <= peeled_k);
+
+        const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
+        const Index max_row = rhs.maxRow(peeled_k, c);
+        const bool pad_col = lm.padCol(c);
+
+        for (Index r = start_row; r < max_row; ++r) {
+          eigen_assert(k <= peeled_k);
+
+          const Index start_plane =
+              ((c == start_col) && (r == start_row)) ? rhs.planeOffset() : 0;
+          const Index max_plane = rhs.maxPlane(peeled_k, c, r);
+          const bool pad_row = pad_col || lm.padRow(r);
+
+          for (Index p = start_plane; p < max_plane; ++p) {
+            eigen_assert(k <= peeled_k);
+
+            const Index start_depth =
+                ((c == start_col) && (r == start_row) && (p == start_plane))
+                    ? rhs.depthOffset()
+                    : 0;
+            const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+
+            const bool pad = pad_col || pad_row || lm.padPlane(p);
+            const Index base_idx = lm.baseIndex(p, r, c);
+
+            if (patch_depth_is_multiple_of_packet_size)
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+            // If patch depth is a multiple of packet size, it's guaranteed that
+            // we can process all values in depth dimension with packets.
+            const Index max_vectorized_depth =
+                patch_depth_is_multiple_of_packet_size
+                    ? max_depth
+                    : max_depth - packet_size;
+
+            Index d = start_depth;
+
+            // 1. Process depth dimension with vectorized instructions.
+            for (; d < max_vectorized_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              const Packet packet = pad ? pset1<Packet>(Scalar(0))
+                                        : rhs.packetNoPadding(d, base_idx);
+              internal::pstoreu(block, packet);
+              block += packet_size;
+              k += packet_size;
+            }
+
+            // 2. Finish with coefficients.
+            if (!patch_depth_is_multiple_of_packet_size) {
+              for (; d < max_depth; d++) {
+                eigen_assert(k < peeled_k);
+                *block = pad ? Scalar(0) : rhs.coeffNoPadding(d, base_idx);
+                ++block;
+                ++k;
+              }
+            }
+          }
+        }
+      }
+
+      // The loop above should fill peeled_k elements.
+      eigen_assert(peeled_k == k);
+
+      // Fill remaining elements using loadCoeffStandard.
+      for (; k < rows; ++k) {
+        *block = lm.loadCoeffStandard(k);
+        ++block;
+      }
+    }
+  }
+};
+#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+
 }  // namespace internal
 
 /** CuboidConvolution
@@ -1476,9 +1647,8 @@ EIGEN_ALWAYS_INLINE static const typename internal::conditional<
                 const DSizes<typename internal::traits<Input>::Index, 2>,
                 const Kernel> > > >::type
 CuboidConvolution(const Input& input, const Kernel& kernel,
-                  const DenseIndex stridePlanes = 1,
-                  const DenseIndex strideRows = 1,
-                  const DenseIndex strideCols = 1,
+                  const Index stridePlanes = 1, const Index strideRows = 1,
+                  const Index strideCols = 1,
                   const PaddingType padding_type = PADDING_SAME) {
   typedef typename internal::traits<Input>::Index TensorIndex;
   TensorRef<Tensor<typename internal::traits<Input>::Scalar,
diff --git a/tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc b/tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da4a61d1bda1ea1171fdea5c9dffaab8aabd4429
--- /dev/null
+++ b/tensorflow/core/kernels/eigen_mkldnn_contraction_kernel_test.cc
@@ -0,0 +1,148 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace Eigen {
+namespace internal {
+
+namespace {
+template <typename Index, int NumDims>
+Eigen::array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
+  Eigen::array<Index, NumDims> dims;
+  for (int i = 0; i < NumDims; ++i) {
+    dims[i] = internal::random<int>(min_dim, max_dim);
+  }
+  return dims;
+}
+}  // namespace
+
+using Scalar = float;
+using Index = Eigen::Index;
+
+TEST(EigenMkldnnTest, MkldnnPack) {
+  // Packing with mkldnn_gemm_pack is the same as taking a slice of 2
+  // dimensional Tensor.
+
+  // Mkldnn pack and gemm are used only in Tensor contractions, and it's
+  // guaranteed that Tensors will have ColMajor layout.
+  static const int Options = ColMajor;
+
+  using DataMapper = blas_data_mapper<Scalar, Index, ColMajor>;
+  using MkldnnGemmPack = mkldnn_gemm_pack<Scalar, Index, DataMapper, ColMajor>;
+  using Tensor2d = Tensor<Scalar, 2, Options, Index>;
+
+  Eigen::array<Index, 2> dims = RandomDims<Index, 2>(1, 500);
+
+  // Create a tensor initialized with random data.
+  Tensor2d src(dims);
+  src.setRandom();
+
+  // Pick a random slice of src tensor.
+  Eigen::array<Index, 2> slice_start = RandomDims<Index, 2>(0, 250);
+  Eigen::array<Index, 2> slice_size = RandomDims<Index, 2>(100, 500);
+
+  // Make sure that slice start + size do not overflow tensor dims.
+  for (int i = 0; i < 2; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  // Prepare tensors for packing and slicing results.
+  Tensor2d pack_dst(slice_size[0], slice_size[1]);
+  Tensor2d slice_dst(slice_size[0], slice_size[1]);
+
+  // Pack memory using mkldnn_gemm_pack.
+  DataMapper data_mapper(src.data(), dims[0]);
+  MkldnnGemmPack gemm_pack;
+  gemm_pack(pack_dst.data(),
+            data_mapper.getSubMapper(slice_start[0], slice_start[1]),
+            slice_size[0], slice_size[1]);
+
+  // Slice the source tensor.
+  slice_dst = src.slice(slice_start, slice_size);
+
+  // Verify that dst tensors are equal.
+  EXPECT_EQ(pack_dst.dimensions().TotalSize(),
+            slice_dst.dimensions().TotalSize());
+  for (size_t i = 0; i < pack_dst.dimensions().TotalSize(); ++i) {
+    Scalar packed = pack_dst.coeff(i);
+    Scalar sliced = slice_dst.coeff(i);
+    EXPECT_EQ(packed, sliced);
+  }
+}
+
+TEST(EigenMkldnnTest, MkldnnGemm) {
+  // Mkldnn pack and gemm are used only in Tensor contractions, and it's
+  // guaranteed that Tensors will have ColMajor layout.
+  static const int Options = ColMajor;
+
+  using Tensor2d = Tensor<Scalar, 2, Options, Index>;
+
+  int m = internal::random<int>(1, 100);
+  int n = internal::random<int>(1, 100);
+  int k = internal::random<int>(1, 100);
+
+  Tensor2d lhs(m, k);
+  lhs.setRandom();
+
+  Tensor2d rhs(k, n);
+  rhs.setRandom();
+
+  // Compute matmul with mkldnn gemm kernel.
+  using OutputMapper = blas_data_mapper<Scalar, Index, ColMajor>;
+  using MkldnnGemmKernel =
+      mkldnn_gemm_kernel<Scalar, Index, OutputMapper, ColMajor>;
+
+  Tensor2d mkldnn_result(m, n);
+  mkldnn_result.setZero();
+  OutputMapper output_mapper(mkldnn_result.data(), m);
+
+  MkldnnGemmKernel gemm_kernel;
+  gemm_kernel(output_mapper, lhs.data(), rhs.data(), m, k, n, /*alpha=*/1.0);
+
+  // Compute matmul with Eigen::Matrix.
+  using Matrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+  using MatrixMap = Map<Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>>;
+
+  MatrixMap lhs_mat(lhs.data(), m, k);
+  MatrixMap rhs_mat(rhs.data(), k, n);
+
+  Matrix matmul_result(m, n);
+  matmul_result.setZero();
+  matmul_result = lhs_mat * rhs_mat;
+
+  // Verify that results are equal.
+  for (Index i = 0; i < m * n; ++i) {
+    Scalar gemm = mkldnn_result(i);
+    Scalar matmul = matmul_result(i % m, i / m);
+
+    Scalar delta = std::abs(gemm - matmul);
+
+    // NOTE(rmlarsen): Compute proper forward error bound.
+    Scalar sum = Scalar(0.0);
+    for (int k1 = 0; k1 < k; ++k1) {
+      sum += std::abs(lhs_mat(i % m, k1) * rhs_mat(k1, i / m));
+    }
+    Scalar epsilon = std::numeric_limits<Scalar>::epsilon();
+    Scalar upper_bound = Scalar(1.01) * epsilon * k * sum;
+
+    EXPECT_LE(delta, upper_bound);
+  }
+}
+
+}  // namespace internal
+}  // namespace Eigen
diff --git a/tensorflow/core/kernels/eigen_pooling_test.cc b/tensorflow/core/kernels/eigen_pooling_test.cc
index 47b6665e680268793df18d50395d0b6c6aca0ad0..1fe9fd09dabbc18726371019e8ab6f07ad945b1d 100644
--- a/tensorflow/core/kernels/eigen_pooling_test.cc
+++ b/tensorflow/core/kernels/eigen_pooling_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/eigen_pooling.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace Eigen {
diff --git a/tensorflow/core/kernels/eigen_softmax_test.cc b/tensorflow/core/kernels/eigen_softmax_test.cc
index 7f985d71366487e0426e25e064764c196979b114..30a1ccca05248749d9edc290ee9a369afe0c11ae 100644
--- a/tensorflow/core/kernels/eigen_softmax_test.cc
+++ b/tensorflow/core/kernels/eigen_softmax_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/eigen_softmax.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace Eigen {
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index e926d73f87c0bb936d068312e7973e4ff6513399..25c735d080e1cef54b7c8cd87d25eb31612192b3 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -18,6 +18,10 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace Eigen {
 
 namespace internal {
@@ -52,8 +56,9 @@ namespace internal {
 //
 // TODO(ezhulenev): Consolidate this part of the code with the image patch
 // extraction code since they are both very similar.
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar_, typename Index,
+
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar_, typename Index,
           typename nocontract_t, typename contract_t, int Side, int packet_size,
           bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionInputMapper<
@@ -66,6 +71,7 @@ class TensorContractionInputMapper<
     inner_dim_reordered, Alignment> {
  public:
   typedef Scalar_ Scalar;
+
   typedef TensorContractionInputMapper<
       Scalar, Index, Side,
       TensorEvaluator<
@@ -75,6 +81,7 @@ class TensorContractionInputMapper<
       nocontract_t, contract_t, packet_size, inner_dim_contiguous,
       inner_dim_reordered, Alignment>
       Self;
+
   typedef TensorContractionSubMapper<
       Scalar, Index, Side,
       TensorEvaluator<
@@ -84,6 +91,7 @@ class TensorContractionInputMapper<
       nocontract_t, contract_t, packet_size, inner_dim_contiguous,
       inner_dim_reordered, Alignment>
       SubMapper;
+
   typedef SubMapper VectorMapper;
   typedef SubMapper LinearMapper;
   typedef typename packet_traits<Scalar>::type Packet;
@@ -264,13 +272,6 @@ class TensorContractionInputMapper<
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
 
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth,
-                                             const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_impl.template packet<Unaligned>(inputIndex);
-  }
-
  private:
   friend class TensorContractionSubMapper<
       Scalar, Index, Side,
@@ -511,8 +512,8 @@ class TensorContractionInputMapper<
   const TensorEvaluator<ArgType, Device> m_impl;
 };
 
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar, typename Index,
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
           typename nocontract_t, typename contract_t, int Side, int packet_size,
           bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionSubMapper<
@@ -536,6 +537,7 @@ class TensorContractionSubMapper<
       nocontract_t, contract_t, packet_size, inner_dim_contiguous,
       inner_dim_reordered, Alignment>
       ParentMapper;
+
   typedef TensorContractionSubMapper<
       Scalar, Index, Side,
       TensorEvaluator<
@@ -545,21 +547,22 @@ class TensorContractionSubMapper<
       nocontract_t, contract_t, packet_size, inner_dim_contiguous,
       inner_dim_reordered, Alignment>
       Self;
+
   typedef Self LinearMapper;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
       const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_base_mapper(base_mapper),
-        m_depth_offset(vert_offset),
-        m_col_offset(horiz_offset) {
+      : m_depth_offset(vert_offset),
+        m_col_offset(horiz_offset),
+        m_base_mapper(base_mapper) {
     m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
                                      m_otherIndex);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
       const Self& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_base_mapper(base_mapper.m_base_mapper),
-        m_depth_offset(vert_offset + base_mapper.m_depth_offset),
-        m_col_offset(horiz_offset + base_mapper.m_col_offset) {
+      : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
+        m_col_offset(horiz_offset + base_mapper.m_col_offset),
+        m_base_mapper(base_mapper.m_base_mapper) {
     m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
                                      m_otherIndex);
   }
@@ -581,7 +584,6 @@ class TensorContractionSubMapper<
     return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset,
                                                         j + m_col_offset);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar
   loadCoeffStandard(Index i) const {
     return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex,
@@ -614,18 +616,29 @@ class TensorContractionSubMapper<
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const {
     const Index max_col =
-        fastPatchColStride().divide(m_depth_offset + peeled_k);
+        (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) /
+        fastPatchColStride();
     return std::min<Index>(1 + max_col, patchCols());
   }
 
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k,
                                    const Index col) const {
-    const Index max_row = fastPatchRowStride().divide(
-        m_depth_offset + peeled_k - col * patchColStride());
+    const Index max_row = (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) -
+                           col * patchColStride()) /
+                          fastPatchRowStride();
     return std::min<Index>(1 + max_row, patchRows());
   }
 
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxDepth(const Index peeled_k, const Index col,
+                                     Index row) const {
+    const Index max_depth = m_depth_offset + peeled_k -  //
+                            col * patchColStride() -     //
+                            row * patchRowStride();
+    return std::min<Index>(max_depth, patchDepth());
+  }
+
   // MaxDepth uses only the remaining number of elements in the peeled_k.
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements,
@@ -682,6 +695,12 @@ class TensorContractionSubMapper<
     const Index inputIndex = depth + baseIndex;
     return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
   }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Scalar coeffNoPadding(const Index depth,
+                                            const Index baseIndex) const {
+    const Index inputIndex = depth + baseIndex;
+    return m_base_mapper.m_impl.coeff(inputIndex);
+  }
 
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
@@ -689,6 +708,12 @@ class TensorContractionSubMapper<
     return r < 0 || r >= m_base_mapper.m_inputRows;
   }
   EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool padAnyRow(const Index first_row,
+                                     const Index last_row) const {
+    return m_rowIndex + first_row < 0 ||
+           m_rowIndex + last_row >= m_base_mapper.m_inputRows;
+  }
+  EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE bool padCol(const Index col) const {
     const Index c = m_colIndex + col;
     return c < 0 || c >= m_base_mapper.m_inputCols;
@@ -701,6 +726,15 @@ class TensorContractionSubMapper<
            c * m_base_mapper.m_colInputStride + m_otherIndex;
   }
 
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index rowStride() const {
+    return m_base_mapper.m_row_strides;
+  }
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index colStride() const {
+    return m_base_mapper.m_col_strides;
+  }
+
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index rowOffset() const {
     const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
@@ -726,9 +760,8 @@ class TensorContractionSubMapper<
   }
 
  private:
-  const ParentMapper& m_base_mapper;  // that was a reference before
-  Index m_depth_offset;               // First row in the input matrix
-  Index m_col_offset;                 // First col in the input matrix
+  Index m_depth_offset;  // First row in the input matrix
+  Index m_col_offset;    // First col in the input matrix
 
   // Knowing that: col_offset == patchIndex * OTHERS, we keep precomputed base
   // indices for the first element in a patch specified by col_offset
@@ -736,6 +769,9 @@ class TensorContractionSubMapper<
   Index m_rowIndex;
   Index m_colIndex;
   Index m_otherIndex;
+
+  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
+                                     // performs better in benchmarks.
 };
 
 // Arrange a block of the right input matrix (in our case it's always a "virtual
@@ -768,8 +804,8 @@ class TensorContractionSubMapper<
 // *) nr - number of registers along the 'n' dimension.
 //    See GeneralBlockPanelKernel.h and "Anatomy of High-Performance Matrix
 //    Multiplication" paper.
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar, typename Index,
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
           typename nocontract_t, typename contract_t, int packet_size,
           bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
           int nr>
@@ -835,6 +871,55 @@ struct gemm_pack_rhs<
             const bool pad_col2 = dm2.padCol(c);
             const bool pad_col3 = dm3.padCol(c);
 
+            // We can squeeze reads along the `row` and `depth` dimensions if
+            // the row stride is `1`, which means that `row` and `depth`
+            // dimensions are contiguous (two innermost dimensions).
+            if (rhs.rowStride() == 1 &&                                //
+                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
+                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
+                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
+                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
+              // Compute how many elements we can squeeze read.
+              const Index start_depth =
+                  (c == start_col) ? rhs.depthOffset() : 0;
+
+              // Upper bound for the number of elements in the depth dimension
+              // that we can squeeze read.
+              const Index squeeze_length =
+                  (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+              // Do not overshoot beyond the block size.
+              const Index max_depth =
+                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              const Index idx0 = dm0.baseIndex(start_row, c);
+              const Index idx1 = dm1.baseIndex(start_row, c);
+              const Index idx2 = dm2.baseIndex(start_row, c);
+              const Index idx3 = dm3.baseIndex(start_row, c);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                eigen_assert(k < peeled_k);
+                PacketBlock<Packet, 4> kernel;
+                kernel.packet[0] = rhs.packetNoPadding(d, idx0);
+                kernel.packet[1] = rhs.packetNoPadding(d, idx1);
+                kernel.packet[2] = rhs.packetNoPadding(d, idx2);
+                kernel.packet[3] = rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel);
+                pstoreu(block + 0 * packet_size, kernel.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel.packet[1]);
+                pstoreu(block + 2 * packet_size, kernel.packet[2]);
+                pstoreu(block + 3 * packet_size, kernel.packet[3]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+
+              // Go to the next column.
+              continue;
+            }
+
+            // If we can't squeeze reads, process rows one by one.
             for (Index r = start_row; r < max_row; ++r) {
               eigen_assert(k <= peeled_k);
 
@@ -929,8 +1014,8 @@ struct gemm_pack_rhs<
 
 // Template specialization for packet_size = 2. We must special-case packet
 // blocks with nr > packet_size, e.g. PacketBlock<Packet2d, 4>.
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar, typename Index,
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
           typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
           bool inner_dim_reordered, int Alignment, int nr>
 struct gemm_pack_rhs<
@@ -996,6 +1081,56 @@ struct gemm_pack_rhs<
             const bool pad_col2 = dm2.padCol(c);
             const bool pad_col3 = dm3.padCol(c);
 
+            // We can squeeze reads along the `row` and `depth` dimensions if
+            // the row stride is `1`, which means that `row` and `depth`
+            // dimensions are contiguous (two innermost dimensions).
+            if (rhs.rowStride() == 1 &&                                //
+                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+                !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
+                !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
+                !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
+                !dm3.padRow(start_row) && !dm3.padRow(max_row - 1)) {
+              // Compute how many elements we can squeeze read.
+              const Index start_depth =
+                  (c == start_col) ? rhs.depthOffset() : 0;
+
+              // Upper bound for the number of elements in the depth dimension
+              // that we can squeeze read.
+              const Index squeeze_length =
+                  (max_row - start_row) * rhs.patchDepth() - start_depth;
+
+              // Do not overshoot beyond the block size.
+              const Index max_depth =
+                  start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+              eigen_assert((max_depth - start_depth) % packet_size == 0);
+
+              const Index idx0 = dm0.baseIndex(start_row, c);
+              const Index idx1 = dm1.baseIndex(start_row, c);
+              const Index idx2 = dm2.baseIndex(start_row, c);
+              const Index idx3 = dm3.baseIndex(start_row, c);
+
+              for (Index d = start_depth; d < max_depth; d += packet_size) {
+                PacketBlock<Packet, 2> kernel0;
+                PacketBlock<Packet, 2> kernel1;
+                kernel0.packet[0] = rhs.packetNoPadding(d, idx0);
+                kernel0.packet[1] = rhs.packetNoPadding(d, idx1);
+                kernel1.packet[0] = rhs.packetNoPadding(d, idx2);
+                kernel1.packet[1] = rhs.packetNoPadding(d, idx3);
+                ptranspose(kernel0);
+                ptranspose(kernel1);
+                pstoreu(block + 0 * packet_size, kernel0.packet[0]);
+                pstoreu(block + 1 * packet_size, kernel1.packet[0]);
+                pstoreu(block + 2 * packet_size, kernel0.packet[1]);
+                pstoreu(block + 3 * packet_size, kernel1.packet[1]);
+                block += 4 * packet_size;
+                k += packet_size;
+              }
+
+              // Go to the next column.
+              continue;
+            }
+
+            // If we can't squeeze reads, process rows one by one.
             for (Index r = start_row; r < max_row; ++r) {
               eigen_assert(k <= peeled_k);
 
@@ -1095,8 +1230,8 @@ struct gemm_pack_rhs<
 };
 
 // Special case for non-vectorized types such as float16.
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols,
-          typename ArgType, typename Device, typename Scalar, typename Index,
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename Index,
           typename nocontract_t, typename contract_t, bool inner_dim_contiguous,
           bool inner_dim_reordered, int Alignment, int nr>
 struct gemm_pack_rhs<
@@ -1168,6 +1303,210 @@ struct gemm_pack_rhs<
   }
 };
 
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+// Arrange a block of the right input matrix (in our case it's always a
+// "virtual matrix" constructed from extracted image patches) in contiguous
+// memory.
+//
+// Mkldnn doesn't require Lhs/Rhs blocks to be packed in any specific format, so
+// this is basically the same as taking a slice of the matrix. Knowing
+// properties of the original patch op we can do it more efficient than default
+// mkldnn_gemm_pack.
+template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
+          typename Device, typename Scalar, typename StorageIndex,
+          typename nocontract_t, typename contract_t, int packet_size,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
+struct mkldnn_gemm_pack<
+    Scalar, StorageIndex,
+    TensorContractionSubMapper<
+        Scalar, StorageIndex, Rhs,
+        TensorEvaluator<
+            const TensorReshapingOp<
+                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+            Device>,
+        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+        inner_dim_reordered, Alignment>,
+    ColMajor> {
+  typedef TensorContractionSubMapper<
+      Scalar, StorageIndex, Rhs,
+      TensorEvaluator<
+          const TensorReshapingOp<
+              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
+          Device>,
+      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
+      inner_dim_reordered, Alignment>
+      SubMapper;
+
+  typedef SubMapper DataMapper;
+  typedef typename packet_traits<Scalar>::type Packet;
+
+  EIGEN_DONT_INLINE
+  void operator()(Scalar* block, const DataMapper rhs, StorageIndex rows,
+                  StorageIndex cols) {
+    const bool standard_patches = !rhs.nonStandardPatches();
+
+    if (standard_patches && (rhs.patchDepth() % packet_size == 0)) {
+      // Single packet always belong to single patch (row, col).
+      packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ true>(
+          block, rhs, rows, cols);
+
+    } else if (standard_patches) {
+      // Single packet can span across multiple patch rows or columns.
+      packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ false>(
+          block, rhs, rows, cols);
+
+    } else {
+      // With non-standard patches we don't do any vectorized loads.
+      // TODO(ezhulenev): It doesn't look like that we should completely give up
+      // on packets. Make this code path faster!
+      for (StorageIndex col = 0; col < cols; ++col) {
+        SubMapper lm = rhs.getLinearMapper(0, col);
+        for (StorageIndex i = 0; i < rows; ++i) {
+          *block = lm(i);
+          ++block;
+        }
+      }
+    }
+  }
+
+ private:
+  // Pack standard image patches:
+  //
+  // - patch_depth_is_multiple_of_packet_size=true: We are guaranteed to have
+  //   depth dimension size to be a multiple of packet size, so we can skip all
+  //   non vectorized loads and checks.
+  template <bool patch_depth_is_multiple_of_packet_size>
+  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
+                                               const DataMapper rhs,
+                                               StorageIndex rows,
+                                               StorageIndex cols) {
+    eigen_assert(!rhs.nonStandardPatches());
+
+    // Give vectorized_rows the name used in all other gemm_pack_rhs above.
+    const StorageIndex peeled_k = (rows / packet_size) * packet_size;
+
+    const StorageIndex start_col = rhs.colOffset();
+    const StorageIndex max_col = rhs.maxCol(peeled_k);
+
+    for (StorageIndex col = 0; col < cols; ++col) {
+      SubMapper lm = rhs.getLinearMapper(0, col);
+
+      StorageIndex k = 0;
+      for (Index c = start_col; c < max_col; ++c) {
+        eigen_assert(k <= peeled_k);
+
+        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
+        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
+        const bool pad_col = lm.padCol(c);
+
+        // We can squeeze reads for all rows in [start_row, max_row) range.
+        if (!pad_col && !lm.padAnyRow(start_row, max_row - 1)) {
+          const StorageIndex start_depth =
+              (c == start_col) ? rhs.depthOffset() : 0;
+
+          const StorageIndex max_depth =
+              std::min<StorageIndex>(start_depth + (peeled_k - k),
+                                     (max_row - start_row) * rhs.patchDepth());
+
+          const StorageIndex base_idx = lm.baseIndex(start_row, c);
+
+          if (patch_depth_is_multiple_of_packet_size) {
+            // If patch depth is a multiple of packet size, it's guaranteed that
+            // we can process all values in depth dimension with packets.
+            eigen_assert((max_depth - start_depth) % packet_size == 0);
+            StorageIndex d = start_depth;
+
+            for (; d < max_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
+              block += packet_size;
+              k += packet_size;
+            }
+
+          } else {
+            StorageIndex d = start_depth;
+            const StorageIndex vectorized_depth = max_depth - packet_size;
+
+            for (; d <= vectorized_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
+              block += packet_size;
+              k += packet_size;
+            }
+            for (; d < max_depth; d++) {
+              eigen_assert(k < peeled_k);
+              *block = rhs.coeffNoPadding(d, base_idx);
+              ++block;
+              ++k;
+            }
+          }
+
+          // Go to the next column.
+          continue;
+        }
+
+        // If we are not allowed to squeeze reads along the `row` and `depth`
+        // dimensions, we must process rows one by one.
+        for (StorageIndex r = start_row; r < max_row; ++r) {
+          eigen_assert(k <= peeled_k);
+
+          const StorageIndex start_depth =
+              ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
+          const StorageIndex max_depth =
+              rhs.maxDepth(peeled_k - k, start_depth);
+
+          const bool pad = pad_col || lm.padRow(r);
+          const StorageIndex base_idx = lm.baseIndex(r, c);
+
+          if (patch_depth_is_multiple_of_packet_size) {
+            // If patch depth is a multiple of packet size, it's guaranteed that
+            // we can process all values in depth dimension with packets.
+            eigen_assert((max_depth - start_depth) % packet_size == 0);
+            StorageIndex d = start_depth;
+
+            for (; d < max_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              const Packet p = pad ? pset1<Packet>(Scalar(0))
+                                   : rhs.packetNoPadding(d, base_idx);
+              internal::pstoreu(block, p);
+              block += packet_size;
+              k += packet_size;
+            }
+
+          } else {
+            const StorageIndex max_vectorized_depth = max_depth - packet_size;
+            StorageIndex d = start_depth;
+            for (; d < max_vectorized_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              const Packet p = pad ? pset1<Packet>(Scalar(0))
+                                   : rhs.packetNoPadding(d, base_idx);
+              internal::pstoreu(block, p);
+              block += packet_size;
+              k += packet_size;
+            }
+            for (; d < max_depth; d++) {
+              eigen_assert(k < peeled_k);
+              *block = pad ? Scalar(0) : rhs.coeffNoPadding(d, base_idx);
+              ++block;
+              ++k;
+            }
+          }
+        }
+      }
+
+      // The loop above should fill peeled_k elements.
+      eigen_assert(peeled_k == k);
+
+      // Fill remaining elements using loadCoeffStandard.
+      for (; k < rows; ++k) {
+        *block = lm.loadCoeffStandard(k);
+        ++block;
+      }
+    }
+  }
+};
+#endif  // defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+
 }  // end namespace internal
 
 /** SpatialConvolution
@@ -1193,8 +1532,12 @@ struct gemm_pack_rhs<
  * It is possible to swap the order of the width and height dimensions provided
  * that the same order is used in the input, the kernel, and the output.
  *
+ * It is also possible to add an output kernel to the contraction, output
+ * kernel is called by Eigen when it "finalizes" the block of an output tensor.
+ *
  */
-template <typename Input, typename Kernel>
+template <typename Input, typename Kernel,
+          typename OutputKernel = const NoOpOutputKernel>
 EIGEN_DEVICE_FUNC
     EIGEN_ALWAYS_INLINE static const typename internal::conditional<
         internal::traits<Input>::Layout == ColMajor,
@@ -1209,8 +1552,8 @@ EIGEN_DEVICE_FUNC
                     const Kernel>,
                 const TensorReshapingOp<
                     const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const TensorImagePatchOp<Dynamic, Dynamic,
-                                             const Input> > > >,
+                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
+                const OutputKernel> >,
         TensorReshapingOp<
             const DSizes<typename internal::traits<Input>::Index,
                          internal::traits<Input>::NumDimensions>,
@@ -1222,13 +1565,14 @@ EIGEN_DEVICE_FUNC
                     const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
                 const TensorReshapingOp<
                     const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const Kernel> > > >::type
+                    const Kernel>,
+                const OutputKernel> > >::type
     SpatialConvolution(const Input& input, const Kernel& kernel,
-                       const DenseIndex row_stride = 1,
-                       const DenseIndex col_stride = 1,
+                       const Index row_stride = 1, const Index col_stride = 1,
                        const PaddingType padding_type = PADDING_SAME,
-                       const DenseIndex row_in_stride = 1,
-                       const DenseIndex col_in_stride = 1) {
+                       const Index row_in_stride = 1,
+                       const Index col_in_stride = 1,
+                       const OutputKernel& output_kernel = OutputKernel()) {
   typedef typename internal::traits<Input>::Index TensorIndex;
   TensorRef<Tensor<typename internal::traits<Input>::Scalar,
                    internal::traits<Input>::NumDimensions,
@@ -1258,9 +1602,9 @@ EIGEN_DEVICE_FUNC
   const TensorIndex kernelCols =
       isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
 
-  const DenseIndex kernelRowsEff =
+  const Index kernelRowsEff =
       kernelRows + (kernelRows - 1) * (row_in_stride - 1);
-  const DenseIndex kernelColsEff =
+  const Index kernelColsEff =
       kernelCols + (kernelCols - 1) * (col_in_stride - 1);
 
   array<IndexPair<TensorIndex>, 1> contract_dims;
@@ -1351,13 +1695,13 @@ EIGEN_DEVICE_FUNC
                             kernelRows, kernelCols, row_stride, col_stride,
                             row_in_stride, col_in_stride, padding_type)
                         .reshape(pre_contract_dims),
-                    contract_dims)
+                    contract_dims, output_kernel)
           .reshape(post_contract_dims),
       input
           .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
                                  row_in_stride, col_in_stride, padding_type)
           .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims)
+          .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
           .reshape(post_contract_dims));
 }
 
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
index 450b98785baf8ae124727b13595346da6946da83..22f71d62602cc984c0337f728298f7483c35bed9 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/eigen_cuboid_convolution.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace Eigen {
 
@@ -1373,4 +1373,276 @@ TEST(EigenSpatialConvolutionsTest, SpatialConvContractionMapper) {
   EigenApprox(8.0f, direct(0, 1, 3, 0));
 }
 
+static void PackRhsHelper(int iters,
+                          /* Input dimensions: */
+                          int input_batches, int input_cols, int input_rows,
+                          int input_depth,
+                          /* Filter (kernel) dimensions: */
+                          int filter_count, int filter_cols, int filter_rows,
+                          /* Input strides: */
+                          int col_strides, int row_strides,
+                          /* Block dimensions: */
+                          Index block_rows, Index block_cols) {
+  // Set random seed for benchmark repeatability.
+  srand(12345);
+
+  tensorflow::testing::UseRealTime();
+  tensorflow::testing::StopTiming();
+
+  using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
+
+  // Default Eigen::Tensor layout is column major, so we configure dimensions
+  // starting from the inner most (channels aka depth in this case).
+  Dimensions input_dims(input_depth, input_rows, input_cols, input_batches);
+
+  using Traits = typename Eigen::internal::gebp_traits<float, float>;
+  static const int packet_size = Eigen::internal::packet_traits<float>::size;
+
+  // Reshape dimensions.
+  using NewDimension = Eigen::DSizes<Index, 2>;
+
+  // Contraction dimensions.
+  using nocontract_t = Eigen::array<Eigen::Index, 1>;
+  using contract_t = Eigen::array<Eigen::Index, 1>;
+
+  // Input to the TensorImagePatchOp. It is the tensorflow TTypes<float>::Tensor
+  // with ColMajor layout, instead of RowMajor. But that doesn't make any
+  // difference, because TensorContraction swaps LHS with RHS for row major
+  // inputs, and contraction mapper always works with column major data.
+  using ArgType = TensorMap<Tensor<float, 4>, Eigen::Aligned>;
+
+  using Evaluator = TensorEvaluator<
+      const TensorReshapingOp<
+          NewDimension, const TensorImagePatchOp<Dynamic, Dynamic, ArgType>>,
+      Eigen::DefaultDevice>;
+
+  using InputMapper = Eigen::internal::TensorContractionInputMapper<
+      float, Index, Eigen::internal::Rhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+  using SubMapper = Eigen::internal::TensorContractionSubMapper<
+      float, Index, Eigen::internal::Rhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+  using PackRhsImpl = Eigen::internal::mkldnn_gemm_pack<float, Eigen::Index,
+                                                        SubMapper, ColMajor>;
+#else
+  using PackRhsImpl =
+      Eigen::internal::gemm_pack_rhs<float, Eigen::Index, SubMapper,  //
+                                     Traits::nr,                      //
+                                     ColMajor,                        //
+                                     /*Conjugate*/ false,             //
+                                     /*PanelMode*/ false>;
+#endif
+
+  Eigen::DefaultDevice device;
+
+  // Actual contract dimensions are not important.
+  const Eigen::Index not_important = -1234;
+  nocontract_t nocontract_dim = {not_important};
+  contract_t contract_dim = {not_important};
+
+  // We use tensor of the same dimensions to store packed data.
+  Tensor<float, 4> packed(input_dims);
+
+  // We generate multiple input tensors, around 512mb in total size to measure
+  // realistic workload when input data in not in L1-L3 cache.
+  size_t input_bytes = input_dims.TotalSize() * sizeof(float);
+  size_t mem_size_bytes = 1024 * 1024 * 512;
+  size_t num_inputs =
+      std::max(static_cast<size_t>(1), mem_size_bytes / input_bytes);
+
+  std::vector<Tensor<float, 4>> inputs;
+  std::vector<Evaluator> evaluators;
+  std::vector<InputMapper> input_mappers;
+
+  for (int i = 0; i < num_inputs; ++i) {
+    inputs.emplace_back(input_dims);
+    inputs[i].setRandom();
+
+    ArgType tensor_map(inputs[i].data(), input_dims);
+
+    // 1. Extract image patches from input tensor. All strides are `1`.
+    const auto image_patch_op = TensorImagePatchOp<Dynamic, Dynamic, ArgType>(
+        tensor_map,                                            //
+        filter_rows, filter_cols,                              //
+        row_strides, col_strides,                              //
+        /*in_row_strides=*/1, /*in_col_strides=*/1,            //
+        /*row_inflate_strides=*/1, /*col_inflate_strides=*/1,  //
+        Eigen::PADDING_SAME, /*padding_value=*/0.0);
+
+    // 2. Reshape extracted patches into "virtual" 2d tensor.
+    // NOTE: This is valid for PADDING_SAME only.
+    Index output_rows = input_rows / row_strides;
+    Index output_cols = input_cols / col_strides;
+    NewDimension reshape_dims;
+    reshape_dims[0] = input_depth * filter_rows * filter_cols;    // patch size
+    reshape_dims[1] = output_rows * output_cols * input_batches;  // num_patches
+
+    const auto reshape_op =
+        TensorReshapingOp<NewDimension, decltype(image_patch_op)>(
+            image_patch_op, reshape_dims);
+
+    evaluators.emplace_back(reshape_op, device);
+
+    input_mappers.emplace_back(evaluators[i], nocontract_dim, nocontract_dim,
+                               contract_dim, contract_dim);
+  }
+
+  // We read properties of extracted image patches directly from evaluator.
+  const Index patch_depth = evaluators[0].impl().dimensions()[0];
+  const Index patch_rows = evaluators[0].impl().dimensions()[1];
+  const Index patch_cols = evaluators[0].impl().dimensions()[2];
+
+  // Number of patches is the same as the maximum column available through the
+  // InputMapper (SubMapper).
+  const Index num_patches = evaluators[0].impl().dimensions()[3];
+
+  // The size of a single patch, it's the same as the maximum depth available
+  // through the InputMapper (SubMapper).
+  const Index patch_size = patch_depth * patch_rows * patch_cols;
+
+  PackRhsImpl pack_rhs;
+
+  const Index packed_total_size = input_dims.TotalSize();
+
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    int input_idx =
+        num_inputs == 1 ? 1 : internal::random<int>(0, num_inputs - 1);
+
+    // Depth offset must be a multiple of 8 (float packet size with AVX2).
+    Index depth_offset =
+        (patch_size > block_rows)
+            ? (internal::random<Index>(0, patch_size - 10) / 8) * 8
+            : 0;
+    Index col_offset = internal::random<Index>(0, num_patches - 10);
+
+    Index depth = std::min(block_rows, patch_size - depth_offset);
+    Index cols = std::min(block_cols, num_patches - col_offset);
+
+    // Write packed data to random memory location to emulate cold caches.
+    Index packed_size = depth * cols;
+    Index packed_offset =
+        internal::random<Index>(0, packed_total_size - packed_size - 1);
+
+    SubMapper sub_mapper =
+        input_mappers[input_idx].getSubMapper(depth_offset, col_offset);
+    pack_rhs(packed.data() + packed_offset, sub_mapper, depth, cols);
+  }
+  tensorflow::testing::StopTiming();
+
+  std::ostringstream stringStream;
+  stringStream << "patch: " << patch_rows << "x" << patch_cols << " D"
+               << patch_depth << "; num_patches=" << num_patches
+               << " patch_size=" << patch_size << " num_inputs=" << num_inputs;
+  tensorflow::testing::SetLabel(stringStream.str());
+}
+
+// -------------------------------------------------------------------------- //
+// Macro argumentnames:
+//    N: batch size
+//    H: height
+//    W: width
+//    C: input channels
+//   FC: filter channles
+//   FH: filter height
+//   SH: stride in height dimensions
+//   SW: stride in width dimensions
+//   BR: block rows
+//   BC: block cols
+
+#define BM_CONCAT(a, b) a##b
+
+#define BM_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW, BR, BC)           \
+  BM_CONCAT(BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW, \
+            _s##SH##x##SW##_B##BR##x##BC)
+
+#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW, BR, BC)         \
+  static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, \
+                      BC)(int iters) {                             \
+    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW, BR, BC);  \
+  }                                                                \
+  BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, BC))
+
+// Number of input channel (input depth) it equal to the number of patch
+// channels (patch depth).
+
+// NOTE: This is the most common case in Tensorflow models.
+// Fast path: input channel dimension is the multiple of the packet size.
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 32,     //
+           /*num_filters*/ 64,  //
+           /*filter*/ 5, 5,     //
+           /*stride*/ 1, 1,     //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 32,     //
+           /*num_filters*/ 64,  //
+           /*filter*/ 5, 5,     //
+           /*stride*/ 2, 2,     //
+           /*block*/ 256, 56);
+
+// Slow path: input channel dimension is not the multiple of the packet size.
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 30,     //
+           /*num_filters*/ 64,  //
+           /*filter*/ 5, 5,     //
+           /*stride*/ 1, 1,     //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 30,     //
+           /*num_filters*/ 64,  //
+           /*filter*/ 5, 5,     //
+           /*stride*/ 2, 2,     //
+           /*block*/ 256, 56);
+
+// Slow path with input channel dimension smaller than the packet size.
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 256, 256,  //
+           /*channels*/ 4,      //
+           /*num_filters*/ 16,  //
+           /*filter*/ 8, 8,     //
+           /*stride*/ 1, 1,     //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 256, 256,  //
+           /*channels*/ 4,      //
+           /*num_filters*/ 16,  //
+           /*filter*/ 8, 8,     //
+           /*stride*/ 2, 4,     //
+           /*block*/ 256, 56);
+
+// Short and wide block with small input channel dimension.
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 4,      //
+           /*num_filters*/ 16,  //
+           /*filter*/ 3, 3,     //
+           /*stride*/ 1, 1,     //
+           /*block*/ 36, 432);
+
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 4,      //
+           /*num_filters*/ 16,  //
+           /*filter*/ 3, 3,     //
+           /*stride*/ 2, 2,     //
+           /*block*/ 36, 432);
 }  // namespace Eigen
diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc
index 135d0023458b1ef393ab0bc296dc07310347e7ff..61234479eac086c545c1457a743fb0da9db0c8d6 100644
--- a/tensorflow/core/kernels/fractional_avg_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc
@@ -223,7 +223,7 @@ class FractionalAvgPoolGradOp : public OpKernel {
     // Once we figure out the original contributors, we just need to evenly
     // divide the value of this element among these contributors.
     //
-    // Internally, we divide the out_backprop tensor and store it in a temparary
+    // Internally, we divide the out_backprop tensor and store it in a temporary
     // tensor of double type. And cast it to the corresponding type.
     typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
         ConstEigenMatrixMap;
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index bfdabc3a9f6dd990abce357b91cb27ea8f169c26..90f94ee4a06519eca064abf9b1e0d60f1f181188 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -69,7 +69,9 @@ void RetvalOp::Compute(OpKernelContext* ctx) {
 }
 
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kArgOp).Device(DEVICE_CPU), ArgOp);
+REGISTER_SYSTEM_KERNEL_BUILDER(Name(kDeviceArgOp).Device(DEVICE_CPU), ArgOp);
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kRetOp).Device(DEVICE_CPU), RetvalOp);
+REGISTER_SYSTEM_KERNEL_BUILDER(Name(kDeviceRetOp).Device(DEVICE_CPU), RetvalOp);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER(type)     \
@@ -98,11 +100,14 @@ TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
   REGISTER_KERNEL_BUILDER( \
       Name(kArgOp).Device(DEVICE_GPU).TypeConstraint<type>("T"), ArgOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
+TF_CALL_QUANTIZED_TYPES(REGISTER)
 TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kArgOp)
                                                    .Device(DEVICE_GPU)
                                                    .HostMemory("output")
                                                    .TypeConstraint<int32>("T"),
                                                ArgOp);
+REGISTER_KERNEL_BUILDER(
+    Name(kDeviceArgOp).Device(DEVICE_GPU).TypeConstraint<int32>("T"), ArgOp);
 #undef REGISTER
 
 REGISTER_KERNEL_BUILDER(Name(kArgOp)
@@ -117,15 +122,21 @@ REGISTER_KERNEL_BUILDER(Name(kArgOp)
                             .TypeConstraint<string>("T"),
                         ArgOp);
 
+REGISTER_KERNEL_BUILDER(
+    Name(kArgOp).Device(DEVICE_GPU).TypeConstraint<Variant>("T"), ArgOp);
+
 #define REGISTER(type)     \
   REGISTER_KERNEL_BUILDER( \
       Name(kRetOp).Device(DEVICE_GPU).TypeConstraint<type>("T"), RetvalOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
+TF_CALL_QUANTIZED_TYPES(REGISTER)
 TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                    .Device(DEVICE_GPU)
                                                    .HostMemory("input")
                                                    .TypeConstraint<int32>("T"),
                                                RetvalOp);
+REGISTER_KERNEL_BUILDER(
+    Name(kDeviceRetOp).Device(DEVICE_GPU).TypeConstraint<int32>("T"), RetvalOp);
 REGISTER_KERNEL_BUILDER(Name(kRetOp)
                             .Device(DEVICE_GPU)
                             .TypeConstraint<ResourceHandle>("T")
diff --git a/tensorflow/core/kernels/function_ops.h b/tensorflow/core/kernels/function_ops.h
index 9e88cc6d8c93cd7cdd3190b287938a7fd5675832..9ddd49560392dd4c313877f819c13d2a6b0079ed 100644
--- a/tensorflow/core/kernels/function_ops.h
+++ b/tensorflow/core/kernels/function_ops.h
@@ -22,7 +22,9 @@ limitations under the License.
 namespace tensorflow {
 
 static const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
+static const char* const kDeviceArgOp = FunctionLibraryDefinition::kDeviceArgOp;
 static const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
+static const char* const kDeviceRetOp = FunctionLibraryDefinition::kDeviceRetOp;
 
 class ArgOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 1529d2e3368266174d3098bad5f4b35bb83b502e..5ecb203cbc7296d75f6a0a68a2189d7bf018c7fe 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -526,21 +526,40 @@ REGISTER_KERNEL_BUILDER(Name("For")
                             .HostMemory("delta"),
                         ForOp);
 
+// FakeParamOp allocates a tensor with a shape conforming to the expected
+// output. This is necessary if the value will be stored in a while_loop's
+// TensorList. The output is otherwise not expected to be consumed by anything
+// else.
 class FakeParamOp : public OpKernel {
  public:
   explicit FakeParamOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+    DataType dtype;
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype));
+
+    // Set shape to the specified shape, setting unknown dimensions to empty.
+    // If the specified shape is unknown, leave as an empty shape.
+    TensorShape shape;
+    PartialTensorShape partial_shape;
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &partial_shape));
+    if (!partial_shape.unknown_rank()) {
+      for (int64 d : partial_shape.dim_sizes()) {
+        shape.AddDim(d == -1 ? 0 : d);
+      }
+    }
+
+    // Create a persistent tensor that we can repeatedly return to save memory.
+    // TODO(b/119612758): add optimization to prevent sending this across
+    // devices on each Compute() call.
+    OP_REQUIRES_OK(context, context->allocate_persistent(
+                                dtype, shape, &value_handle_, nullptr));
   }
 
   void Compute(OpKernelContext* context) override {
-    // We must produce something (only Switch and Recvs are allowed to output
-    // dead tensors). This output is not expected to be consumed by anything.
-    Tensor output_tensor(dtype_, TensorShape({}));
-    context->set_output(0, output_tensor);
+    context->set_output(0, *value_handle_.AccessTensor(context));
   }
 
  private:
-  DataType dtype_;
+  PersistentTensor value_handle_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_CPU), FakeParamOp);
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index d89f1592bd72d0f349b6f8a7eca64fc4d046050a..dbd3bb05dbf1a310ea9c5a5b1003474e33825133 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -248,7 +248,7 @@ struct FusedBatchNorm<GPUDevice, T, U> {
                   Tensor* saved_inv_var, TensorFormat tensor_format,
                   bool is_training) {
     auto* stream = context->op_device_context()->stream();
-    OP_REQUIRES(context, stream, errors::Internal("No GPU stream avalible"));
+    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available"));
 
     const int64 batch_size = GetTensorDim(x, tensor_format, 'N');
     const int64 channels = GetTensorDim(x, tensor_format, 'C');
@@ -389,7 +389,7 @@ struct FusedBatchNormGrad<GPUDevice, T, U> {
                   Tensor* scale_backprop, Tensor* offset_backprop,
                   TensorFormat tensor_format) {
     auto* stream = context->op_device_context()->stream();
-    OP_REQUIRES(context, stream, errors::Internal("No GPU stream avalible"));
+    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available"));
 
     const int64 batch_size = GetTensorDim(x, tensor_format, 'N');
     const int64 channels = GetTensorDim(x, tensor_format, 'C');
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index f2e0b2558f36f586ae782a54f0477a9bb9cdd7c9..2d8b734535c964bf4162838baa8ad65af4790423 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -18,13 +18,23 @@ cc_library(
 )
 
 load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_ops_fuzz_target_lib")
+load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_oss_fuzz_corpus")
+load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_oss_fuzz_dict")
 
 tf_ops_fuzz_target_lib("identity")
 
 tf_ops_fuzz_target_lib("string_to_number")
 
+tf_oss_fuzz_corpus("string_to_number")
+
 tf_ops_fuzz_target_lib("string_split")
 
+tf_oss_fuzz_corpus("string_split")
+
+tf_ops_fuzz_target_lib("string_split_v2")
+
+tf_oss_fuzz_corpus("string_split_v2")
+
 tf_ops_fuzz_target_lib("encode_base64")
 
 tf_ops_fuzz_target_lib("decode_base64")
@@ -33,12 +43,20 @@ tf_ops_fuzz_target_lib("encode_jpeg")
 
 tf_ops_fuzz_target_lib("decode_bmp")
 
+tf_oss_fuzz_corpus("decode_bmp")
+
 tf_ops_fuzz_target_lib("decode_png")
 
-tf_ops_fuzz_target_lib("decode_jpeg")
+tf_oss_fuzz_corpus("decode_png")
+
+tf_oss_fuzz_dict("decode_png")
 
 tf_ops_fuzz_target_lib("decode_wav")
 
+tf_oss_fuzz_corpus("decode_wav")
+
+tf_oss_fuzz_dict("decode_wav")
+
 tf_ops_fuzz_target_lib("example_proto_fast_parsing")
 
 tf_ops_fuzz_target_lib("parse_tensor_op")
@@ -46,3 +64,7 @@ tf_ops_fuzz_target_lib("parse_tensor_op")
 tf_ops_fuzz_target_lib("decode_compressed")
 
 tf_ops_fuzz_target_lib("decode_json_example")
+
+tf_oss_fuzz_corpus("decode_json_example")
+
+tf_oss_fuzz_dict("decode_json_example")
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/012e3ad384a4a1165f8498b5c94ba0d32a73e187 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/012e3ad384a4a1165f8498b5c94ba0d32a73e187
new file mode 100644
index 0000000000000000000000000000000000000000..7a1b8966c21c74f1e0a3f3af4240551bc10ef36c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/012e3ad384a4a1165f8498b5c94ba0d32a73e187 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/055d77f7810048caa28323f6eb552a53d156040b b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/055d77f7810048caa28323f6eb552a53d156040b
new file mode 100644
index 0000000000000000000000000000000000000000..24f658497f182188a6cbf431e1ff810ca268e709
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/055d77f7810048caa28323f6eb552a53d156040b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/131e251bfb82c681cb075d32b99f18fceaca115d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/131e251bfb82c681cb075d32b99f18fceaca115d
new file mode 100644
index 0000000000000000000000000000000000000000..a2d8f84cab77d805ae2582fb551fe2f074b2cfbb
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/131e251bfb82c681cb075d32b99f18fceaca115d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/1399ab0bd9f2c91d270cb43251bdc5729bef3526 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/1399ab0bd9f2c91d270cb43251bdc5729bef3526
new file mode 100644
index 0000000000000000000000000000000000000000..6206dab82b1e4a3d96ee4ae276006183349aa8e6
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/1399ab0bd9f2c91d270cb43251bdc5729bef3526 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/16a6ce88f66d2e9686c8354cad8ba915cf0c11de b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/16a6ce88f66d2e9686c8354cad8ba915cf0c11de
new file mode 100644
index 0000000000000000000000000000000000000000..bcc7f481ae90d0075912bf80469a18a0fd27682e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/16a6ce88f66d2e9686c8354cad8ba915cf0c11de differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/185097ed0588195164619ea930ddd8274a5f32ad b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/185097ed0588195164619ea930ddd8274a5f32ad
new file mode 100644
index 0000000000000000000000000000000000000000..92bddb6dca98bcc26ab9257bf568a462b5db8a36
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/185097ed0588195164619ea930ddd8274a5f32ad differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/27711a87e06a50c81571c27c3aa403a6ad5dc55c b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/27711a87e06a50c81571c27c3aa403a6ad5dc55c
new file mode 100644
index 0000000000000000000000000000000000000000..082b1e5752a95bed0941c60f68e9efb83dcea73e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/27711a87e06a50c81571c27c3aa403a6ad5dc55c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/298c3787ad1722b22569cbc405c464d2 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/298c3787ad1722b22569cbc405c464d2
new file mode 100644
index 0000000000000000000000000000000000000000..af1091428d59645a8158218a953fbd60e0c463d8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/298c3787ad1722b22569cbc405c464d2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/2b95ba6d8141ce0d29ff279770903922 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/2b95ba6d8141ce0d29ff279770903922
new file mode 100644
index 0000000000000000000000000000000000000000..fd711cb0e51bb10d84902b03d3a27c086ec8dfbc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/2b95ba6d8141ce0d29ff279770903922 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/321fb3d758b86e37fc340ae2b09b8ed9fa73a4cb b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/321fb3d758b86e37fc340ae2b09b8ed9fa73a4cb
new file mode 100644
index 0000000000000000000000000000000000000000..6748826bd88a8ff6616536e0badb9b5d5184d9cb
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/321fb3d758b86e37fc340ae2b09b8ed9fa73a4cb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/331a98b4e4c87840efea69223766ebd0e1736542 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/331a98b4e4c87840efea69223766ebd0e1736542
new file mode 100644
index 0000000000000000000000000000000000000000..9cf1b9d3af82244e2f1f5f4e9d696fa006a9416f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/331a98b4e4c87840efea69223766ebd0e1736542 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/352d73f841223ecb630b5836585d2ba7b0f9d883 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/352d73f841223ecb630b5836585d2ba7b0f9d883
new file mode 100644
index 0000000000000000000000000000000000000000..932e78b3547e36ee7892f97135b7ac16d03bdff8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/352d73f841223ecb630b5836585d2ba7b0f9d883 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3a84f409d4c117edfdebc508cd23e8fc b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3a84f409d4c117edfdebc508cd23e8fc
new file mode 100644
index 0000000000000000000000000000000000000000..89a090d74ee4af9f5352f3e954e41db64727bfcc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3a84f409d4c117edfdebc508cd23e8fc differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3ef5cc982c0b45f69a26fd0f7d376415fdebabd1 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3ef5cc982c0b45f69a26fd0f7d376415fdebabd1
new file mode 100644
index 0000000000000000000000000000000000000000..286949bc56a4fd135b6d645c2f6f6543fb54cefe
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3ef5cc982c0b45f69a26fd0f7d376415fdebabd1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/401c7de8e122018a0e17f57c93db7ee49ab0e906 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/401c7de8e122018a0e17f57c93db7ee49ab0e906
new file mode 100644
index 0000000000000000000000000000000000000000..8d5c7d136e51b0b71a574de271c1d076ad118ec7
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/401c7de8e122018a0e17f57c93db7ee49ab0e906 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/52fee71bb8c9c79068e1fe580677ad739a2d0415 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/52fee71bb8c9c79068e1fe580677ad739a2d0415
new file mode 100644
index 0000000000000000000000000000000000000000..f77ffec08653f23af6ec36d2bc3851d37a15a7ef
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/52fee71bb8c9c79068e1fe580677ad739a2d0415 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57b11507813d5727b7789354d888eda83d5f3d86 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57b11507813d5727b7789354d888eda83d5f3d86
new file mode 100644
index 0000000000000000000000000000000000000000..f9af0697d53b40b1617eff3dba43955053de8bff
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57b11507813d5727b7789354d888eda83d5f3d86 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57dff0fa53ee0ef24a43cca6ab0523bfdc1f720d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57dff0fa53ee0ef24a43cca6ab0523bfdc1f720d
new file mode 100644
index 0000000000000000000000000000000000000000..109ab7948ff737251655b35d801b7a147107da71
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57dff0fa53ee0ef24a43cca6ab0523bfdc1f720d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5c42d3df0dc400a7a4175b8d4eec6cc8ee2437b2 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5c42d3df0dc400a7a4175b8d4eec6cc8ee2437b2
new file mode 100644
index 0000000000000000000000000000000000000000..bf9772902653f1b4bf258c4268937e18f9abbcf8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5c42d3df0dc400a7a4175b8d4eec6cc8ee2437b2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5cca20637ae75fddad9370ee930837baef8aeb43 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5cca20637ae75fddad9370ee930837baef8aeb43
new file mode 100644
index 0000000000000000000000000000000000000000..cf7a78e9488a8fd55813862e78a360064dff19b7
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5cca20637ae75fddad9370ee930837baef8aeb43 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5d34bc9cef0c844b9c5ebe948145c4ca11b5ca09 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5d34bc9cef0c844b9c5ebe948145c4ca11b5ca09
new file mode 100644
index 0000000000000000000000000000000000000000..e5621aa3d1ba240abb65a521507762d520d1d4bf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5d34bc9cef0c844b9c5ebe948145c4ca11b5ca09 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e162fe883bd12fb1c4131d4e0c979a12bd15eac b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e162fe883bd12fb1c4131d4e0c979a12bd15eac
new file mode 100644
index 0000000000000000000000000000000000000000..eea39d6b2f856ab8f1debf18d4d99543076a1b9a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e162fe883bd12fb1c4131d4e0c979a12bd15eac differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e83f8faab9c1a51a33d5e29edbb9dcec23c6092 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e83f8faab9c1a51a33d5e29edbb9dcec23c6092
new file mode 100644
index 0000000000000000000000000000000000000000..fabcbdbe3d4b0e2b8f1de3bf0c7a388369d11547
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e83f8faab9c1a51a33d5e29edbb9dcec23c6092 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/61b29dc2fcef7b6fbe3e0cc88769a7ef b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/61b29dc2fcef7b6fbe3e0cc88769a7ef
new file mode 100644
index 0000000000000000000000000000000000000000..8dfc17e8e05f948210d4ed34113b3517445c418e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/61b29dc2fcef7b6fbe3e0cc88769a7ef differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6361eca190157ece389665ee523ccc3aefcd957f b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6361eca190157ece389665ee523ccc3aefcd957f
new file mode 100644
index 0000000000000000000000000000000000000000..141e331ad0148af9b17e2824d8b64a442ac18b38
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6361eca190157ece389665ee523ccc3aefcd957f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/65150515ab3b11d657519b22bb887d74e94b2d7f b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/65150515ab3b11d657519b22bb887d74e94b2d7f
new file mode 100644
index 0000000000000000000000000000000000000000..567c645c00ee0e47ca2a840c7a115f68ae889e3d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/65150515ab3b11d657519b22bb887d74e94b2d7f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/656f38ef6dcd58c6a909d61db11f777def69c394 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/656f38ef6dcd58c6a909d61db11f777def69c394
new file mode 100644
index 0000000000000000000000000000000000000000..e1cdb4e5bf9d55e87b4495ab8db931f269e22d61
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/656f38ef6dcd58c6a909d61db11f777def69c394 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/66e0d2cafd592bf9d61ad900fade8ee530d5f3d7 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/66e0d2cafd592bf9d61ad900fade8ee530d5f3d7
new file mode 100644
index 0000000000000000000000000000000000000000..73e53b460ec81a8accdf2b6601dfcd8679465b6a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/66e0d2cafd592bf9d61ad900fade8ee530d5f3d7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6b5b42cb105a2c4c5fd6034e9885cbe457f1b50c b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6b5b42cb105a2c4c5fd6034e9885cbe457f1b50c
new file mode 100644
index 0000000000000000000000000000000000000000..f29b9b217184c4cc8496f1f711caec0c7c632ca4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6b5b42cb105a2c4c5fd6034e9885cbe457f1b50c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/722ed0197cb92ecbf9745edb38275e7a9aaf322f b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/722ed0197cb92ecbf9745edb38275e7a9aaf322f
new file mode 100644
index 0000000000000000000000000000000000000000..3b0c338ce203a0fd72936ae40120f140aae712b4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/722ed0197cb92ecbf9745edb38275e7a9aaf322f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/77bdd2efdf328366cbbf3c5688768dc0a88d02b1 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/77bdd2efdf328366cbbf3c5688768dc0a88d02b1
new file mode 100644
index 0000000000000000000000000000000000000000..61dd2583cd6e9c7679c2c30ff6d35df09264fd91
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/77bdd2efdf328366cbbf3c5688768dc0a88d02b1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7841bfa002c05c61d5a5d9241f214cc17a336166 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7841bfa002c05c61d5a5d9241f214cc17a336166
new file mode 100644
index 0000000000000000000000000000000000000000..907ec3b5a3ba35bf86b8988ad6d25592e9015c17
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7841bfa002c05c61d5a5d9241f214cc17a336166 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7899e22fc83f6be28e9130c4a1c91a48 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7899e22fc83f6be28e9130c4a1c91a48
new file mode 100644
index 0000000000000000000000000000000000000000..7e3b1990ad7c09a19fcef334aee9545a31d05380
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7899e22fc83f6be28e9130c4a1c91a48 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7dddccaebd16ae0c26daeffc42df50f529891119 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7dddccaebd16ae0c26daeffc42df50f529891119
new file mode 100644
index 0000000000000000000000000000000000000000..0329a2826a89961de414d0e3c51a7c81ea02d8ef
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7dddccaebd16ae0c26daeffc42df50f529891119 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8157442eee4bbfdd9716e264b11085d61a9955b7 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8157442eee4bbfdd9716e264b11085d61a9955b7
new file mode 100644
index 0000000000000000000000000000000000000000..7e9ef4b3dd47fbbdf0d4087a4d4fdf5323c3bf23
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8157442eee4bbfdd9716e264b11085d61a9955b7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/81ff28ed63d5435ddc4c8771dd5d40aa658cbbe0 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/81ff28ed63d5435ddc4c8771dd5d40aa658cbbe0
new file mode 100644
index 0000000000000000000000000000000000000000..6390e6b2b30b9e4512b191b72f0bed8d4005c0cd
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/81ff28ed63d5435ddc4c8771dd5d40aa658cbbe0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/820c8c0d33c18f6c4d9edd314e91289186931ad0 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/820c8c0d33c18f6c4d9edd314e91289186931ad0
new file mode 100644
index 0000000000000000000000000000000000000000..0084212a656bd4f97a2ba22f4d7ced7f8746946f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/820c8c0d33c18f6c4d9edd314e91289186931ad0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/849e9d7cee1c52105242327086997296e452b981 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/849e9d7cee1c52105242327086997296e452b981
new file mode 100644
index 0000000000000000000000000000000000000000..a36c88daf011cf7809049396476863fd66d5dc2d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/849e9d7cee1c52105242327086997296e452b981 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/84ddb92c63e0fad7018f6069daf8779ce11501e2 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/84ddb92c63e0fad7018f6069daf8779ce11501e2
new file mode 100644
index 0000000000000000000000000000000000000000..b5d34609b88399ffe7198acfb06dac91fd50d4cd
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/84ddb92c63e0fad7018f6069daf8779ce11501e2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/86bc3d5dbb9313137502080e58551edd2e649c70 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/86bc3d5dbb9313137502080e58551edd2e649c70
new file mode 100644
index 0000000000000000000000000000000000000000..a9ef2b5a50b36814ecaada8813a5f4b056f17ce8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/86bc3d5dbb9313137502080e58551edd2e649c70 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/87d94d88fe29d277c76e1a52042b02c092d5ae14 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/87d94d88fe29d277c76e1a52042b02c092d5ae14
new file mode 100644
index 0000000000000000000000000000000000000000..83de83f4eb59e2cfea819e1fdee97b3f6aec525c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/87d94d88fe29d277c76e1a52042b02c092d5ae14 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8c4646f3357945c4e19a59ff79fffe3c874dbf16 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8c4646f3357945c4e19a59ff79fffe3c874dbf16
new file mode 100644
index 0000000000000000000000000000000000000000..fa47e75a6323a989daa16e2a482a44f9ab2f2705
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8c4646f3357945c4e19a59ff79fffe3c874dbf16 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/90632bc6dee4eb836f3d7db1d16446a9c8510080 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/90632bc6dee4eb836f3d7db1d16446a9c8510080
new file mode 100644
index 0000000000000000000000000000000000000000..e739e858b860e774e3cdf9142de13206f508d2ea
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/90632bc6dee4eb836f3d7db1d16446a9c8510080 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/94d06016aa949e8e7203217e4cc6625ded7f4244 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/94d06016aa949e8e7203217e4cc6625ded7f4244
new file mode 100644
index 0000000000000000000000000000000000000000..c989a76df155ff09d69d2a765d6e097928b8344f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/94d06016aa949e8e7203217e4cc6625ded7f4244 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9875819b9e5783e7489c29a81cc9d4279209956a b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9875819b9e5783e7489c29a81cc9d4279209956a
new file mode 100644
index 0000000000000000000000000000000000000000..6ff64a7d2bab262cee2e8b6bebf65af9aafda630
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9875819b9e5783e7489c29a81cc9d4279209956a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9c1cc734114b29aac6c51782d5c17e9dbe1faca2 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9c1cc734114b29aac6c51782d5c17e9dbe1faca2
new file mode 100644
index 0000000000000000000000000000000000000000..2d1d8576a29b050160e63a142b48927db9540b8d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9c1cc734114b29aac6c51782d5c17e9dbe1faca2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9d2961871eeb201ef8a6f5503d8a8b62 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9d2961871eeb201ef8a6f5503d8a8b62
new file mode 100644
index 0000000000000000000000000000000000000000..a5865c60a614829812b37c8439f6bfc00f7198c0
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9d2961871eeb201ef8a6f5503d8a8b62 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9f39e11cdd88344a4894b678e5a04a810880064d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9f39e11cdd88344a4894b678e5a04a810880064d
new file mode 100644
index 0000000000000000000000000000000000000000..8f712aea9ff2d6e930f0269002f9e9d1a0caa016
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9f39e11cdd88344a4894b678e5a04a810880064d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a350588a6dabe4376a066aed44ef8786d8e752e7 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a350588a6dabe4376a066aed44ef8786d8e752e7
new file mode 100644
index 0000000000000000000000000000000000000000..b7ad89078b5fc1b96e811aa117cc11c7f4f0467b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a350588a6dabe4376a066aed44ef8786d8e752e7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a6101a79919d444e1fc50aefab5837c39e3f4a19 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a6101a79919d444e1fc50aefab5837c39e3f4a19
new file mode 100644
index 0000000000000000000000000000000000000000..173c941952bf798f796a9ef4c751d1e6e6e3a09c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a6101a79919d444e1fc50aefab5837c39e3f4a19 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a9c8793f8fb063bec839ee1280406fe5396545e5 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a9c8793f8fb063bec839ee1280406fe5396545e5
new file mode 100644
index 0000000000000000000000000000000000000000..644560ced96b56c960032d7df08b3730053bc3ea
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a9c8793f8fb063bec839ee1280406fe5396545e5 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ad4e9d2234e8599bdf12607c6b8cab4edae82c4e b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ad4e9d2234e8599bdf12607c6b8cab4edae82c4e
new file mode 100644
index 0000000000000000000000000000000000000000..f1826b06e88c9ca21599abaee7481ae2f07ee840
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ad4e9d2234e8599bdf12607c6b8cab4edae82c4e differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b90b6830917919e94186d312f06481bd b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b90b6830917919e94186d312f06481bd
new file mode 100644
index 0000000000000000000000000000000000000000..2da6be376f2af7b9cefc00d5b0d096c7e68fea69
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b90b6830917919e94186d312f06481bd differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b98fd4cb1d7031240414301c19b03097c0035c6b b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b98fd4cb1d7031240414301c19b03097c0035c6b
new file mode 100644
index 0000000000000000000000000000000000000000..b84b57bb53177764b84b10a75624dfe949ddbe81
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b98fd4cb1d7031240414301c19b03097c0035c6b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ba976fcdb4daf092ef17ce43bf2b78d9d8bc2aeb b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ba976fcdb4daf092ef17ce43bf2b78d9d8bc2aeb
new file mode 100644
index 0000000000000000000000000000000000000000..4ee9cbdfc3defea2195af8ccd749fc0a326a7d00
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ba976fcdb4daf092ef17ce43bf2b78d9d8bc2aeb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/bc112b571eafee0f5a031f3c9cce6244216d128d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/bc112b571eafee0f5a031f3c9cce6244216d128d
new file mode 100644
index 0000000000000000000000000000000000000000..af1091428d59645a8158218a953fbd60e0c463d8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/bc112b571eafee0f5a031f3c9cce6244216d128d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c42b981c28a1715c375050f6fcf53f1d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c42b981c28a1715c375050f6fcf53f1d
new file mode 100644
index 0000000000000000000000000000000000000000..996e8c826cbd548e4567bfc600496ae31084c288
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c42b981c28a1715c375050f6fcf53f1d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c6049874b33eadb016fccf0c5fa66e556ae069b9 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c6049874b33eadb016fccf0c5fa66e556ae069b9
new file mode 100644
index 0000000000000000000000000000000000000000..4863878ca02ca42e550f521cb1ba5b8bd7046ecc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c6049874b33eadb016fccf0c5fa66e556ae069b9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8697bf2369f6ab85f501376c4d93bb8a56974a3 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8697bf2369f6ab85f501376c4d93bb8a56974a3
new file mode 100644
index 0000000000000000000000000000000000000000..30aacc2f98820e139b1495c9c1527f9119dec97b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8697bf2369f6ab85f501376c4d93bb8a56974a3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8daf283e0aef2fd7b630c0430e05dc28f24ecf6 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8daf283e0aef2fd7b630c0430e05dc28f24ecf6
new file mode 100644
index 0000000000000000000000000000000000000000..b831633f02b50870db441110db29e03331decdaf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8daf283e0aef2fd7b630c0430e05dc28f24ecf6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/cacff56e1af4b8fde912822da06b10fb8c545a19 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/cacff56e1af4b8fde912822da06b10fb8c545a19
new file mode 100644
index 0000000000000000000000000000000000000000..ff492d29d76ce94f0fb8db9a0b6c481c407b205b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/cacff56e1af4b8fde912822da06b10fb8c545a19 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ce4dcc22b1d595c49a25121c0b580104 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ce4dcc22b1d595c49a25121c0b580104
new file mode 100644
index 0000000000000000000000000000000000000000..ea776fb0a94b79362822f76bfa1f0d9364a09c4a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ce4dcc22b1d595c49a25121c0b580104 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d0cd71dbf039fd64cf42eff30da92a71a919226a b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d0cd71dbf039fd64cf42eff30da92a71a919226a
new file mode 100644
index 0000000000000000000000000000000000000000..b9dc5d0f4ee4edb8d45ae3e5ed182f57e5d4e5db
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d0cd71dbf039fd64cf42eff30da92a71a919226a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d5ce626ac3264bed6af5580e341a89406857cbb9 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d5ce626ac3264bed6af5580e341a89406857cbb9
new file mode 100644
index 0000000000000000000000000000000000000000..b17294ec90a31bc77f439f7baae0d897bf860ec4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d5ce626ac3264bed6af5580e341a89406857cbb9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d77ada02e9bc8c24b2711eca6a8f52ae356bfc21 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d77ada02e9bc8c24b2711eca6a8f52ae356bfc21
new file mode 100644
index 0000000000000000000000000000000000000000..9cc65607fecd34abe27d5e0956e9fa462f0c48f1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d77ada02e9bc8c24b2711eca6a8f52ae356bfc21 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d7eb9c5a0f9803df4c00390793b8ab57bd7c9484 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d7eb9c5a0f9803df4c00390793b8ab57bd7c9484
new file mode 100644
index 0000000000000000000000000000000000000000..531427a99c0f1bbef21f35fa90027a7d662b39ec
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d7eb9c5a0f9803df4c00390793b8ab57bd7c9484 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dc1efccdeec17e151a1ec8228c09ab61c3040b33 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dc1efccdeec17e151a1ec8228c09ab61c3040b33
new file mode 100644
index 0000000000000000000000000000000000000000..5a5c1c30eb0d70bea1562137328ac2d74871e43d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dc1efccdeec17e151a1ec8228c09ab61c3040b33 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dcea22c66c60088165a2f1772036473f b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dcea22c66c60088165a2f1772036473f
new file mode 100644
index 0000000000000000000000000000000000000000..44d2ebfb3d0f37c6355fe4e44c27f7457ebee63c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dcea22c66c60088165a2f1772036473f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/de539ae7442fa05dafcfe1a021f0186ef74a2b0e b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/de539ae7442fa05dafcfe1a021f0186ef74a2b0e
new file mode 100644
index 0000000000000000000000000000000000000000..f9a8f33443d84d5494b23e5b3317015f10b0496a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/de539ae7442fa05dafcfe1a021f0186ef74a2b0e differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2306b1d6b88d0ccc4e2c3a9edb07462a5a32215 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2306b1d6b88d0ccc4e2c3a9edb07462a5a32215
new file mode 100644
index 0000000000000000000000000000000000000000..71bf61cebe4e69dc71714916d804d979824a7d0b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2306b1d6b88d0ccc4e2c3a9edb07462a5a32215 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2778da0240fdd15ef5844905d81c4e05f34a8bd b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2778da0240fdd15ef5844905d81c4e05f34a8bd
new file mode 100644
index 0000000000000000000000000000000000000000..1bad15905ffd4776122e34df11873b390abee72a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2778da0240fdd15ef5844905d81c4e05f34a8bd differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e6642e9266875f9d908942e534bf898103a2c794 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e6642e9266875f9d908942e534bf898103a2c794
new file mode 100644
index 0000000000000000000000000000000000000000..f9d9de9c9c21f18a6eb88afb89f4450d4fd6771f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e6642e9266875f9d908942e534bf898103a2c794 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ec6cdb929c08d8daf2bd7fc185fbf4d787b45120 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ec6cdb929c08d8daf2bd7fc185fbf4d787b45120
new file mode 100644
index 0000000000000000000000000000000000000000..782a0925210623c0d6e33ba7c885435bb19654d3
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ec6cdb929c08d8daf2bd7fc185fbf4d787b45120 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ed8636357f79439b6a03eb14469b686cc401a1c9 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ed8636357f79439b6a03eb14469b686cc401a1c9
new file mode 100644
index 0000000000000000000000000000000000000000..efd9312d94dde30707099125b0280be21f6b2101
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ed8636357f79439b6a03eb14469b686cc401a1c9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ee313e9acecb5c688ce8c9bb10e70e136fbb9c6d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ee313e9acecb5c688ce8c9bb10e70e136fbb9c6d
new file mode 100644
index 0000000000000000000000000000000000000000..03e09e28193a513347433ee41f005f0926329d20
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ee313e9acecb5c688ce8c9bb10e70e136fbb9c6d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ef689af320e7d9e22231109faae2e8149cb86e1c b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ef689af320e7d9e22231109faae2e8149cb86e1c
new file mode 100644
index 0000000000000000000000000000000000000000..f8688710452dafe40fb247674471ea9f642b3778
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ef689af320e7d9e22231109faae2e8149cb86e1c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/fda6b9a9f6ffdf4765c00465619c7ceb3f7db2e4 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/fda6b9a9f6ffdf4765c00465619c7ceb3f7db2e4
new file mode 100644
index 0000000000000000000000000000000000000000..20efec0d1e19ea1f2b317f82c08f0ec5d41a66ca
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/fda6b9a9f6ffdf4765c00465619c7ceb3f7db2e4 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ffe829bb0adac20d9c0756f68a22d1255e4fdb54 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ffe829bb0adac20d9c0756f68a22d1255e4fdb54
new file mode 100644
index 0000000000000000000000000000000000000000..e24c09dacce1b52dc3737b1d921920846b627b2f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ffe829bb0adac20d9c0756f68a22d1255e4fdb54 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/013a29ea098a178f8a36741c9fd91144 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/013a29ea098a178f8a36741c9fd91144
new file mode 100644
index 0000000000000000000000000000000000000000..06fd8044808ff9cae8663cec970645bd22bf8ab8
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/013a29ea098a178f8a36741c9fd91144
@@ -0,0 +1,48 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie: {
+        bytes_list: {
+          value: "VGhlIFNoYXdzaGFuayBSZWRlbXB0aW9u",
+          value: "RmlnaHQgQ2x1Yg=="
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: 9.0,
+          value: 9.7
+        }
+      }
+    },
+    feature: {
+      suggestion: {
+        bytes_list: {
+          value: "SW5jZXB0aW9u"
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: 1.0
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/0875575fb76d630ccb19c5da8aab66b2 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/0875575fb76d630ccb19c5da8aab66b2
new file mode 100644
index 0000000000000000000000000000000000000000..4ae686974e2be25e49e3a25064dcfdfb91a41b5b
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/0875575fb76d630ccb19c5da8aab66b2
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:[29.0,2,3,4]}}},feature:{movie_ratings:{float_list:{value:[9.0,9.7]}}},feature:{suggestion_purchased:{float_list:{value:[1.0,2,3,4,5]}}},feature:{purchase_price:{float_list:{value:[9.99,8.88,7.77,6.66,5.55],value:[4.44,3.33,2.22,1.11],value:[1.11,2.22,3.33],value:[4.44,5.55],value:0}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/7e7f58fc443a11a0a2c5d9b643b7e99b b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/7e7f58fc443a11a0a2c5d9b643b7e99b
new file mode 100644
index 0000000000000000000000000000000000000000..150f8710f7dc094ad1189f1d3c659910d2e1b3e2
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/7e7f58fc443a11a0a2c5d9b643b7e99b
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie_ratings:{float_list:{value:[9.0,9.7]}}},feature:{suggestion_purchased:{float_list:{value:1.0}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/849a23936269a261c0370b5e9abe2416 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/849a23936269a261c0370b5e9abe2416
new file mode 100644
index 0000000000000000000000000000000000000000..fcfdfedd1b090871954e1d9b99d90480f6082dae
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/849a23936269a261c0370b5e9abe2416
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie_ratings:{float_list:{value:[[[[[[9.0,9.7]]]]]],value:[[[9.0,-9.2]]]}}},feature:{suggestion_purchased:{float_list:{value:[1.0,[2,3,[4,5,6,[7,8,9,0]]]]}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/85282c1696d98b9843ce3e8bd1cd899f b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/85282c1696d98b9843ce3e8bd1cd899f
new file mode 100644
index 0000000000000000000000000000000000000000..7c9981d482fcf5a2a138cc2583ea0dca9589e756
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/85282c1696d98b9843ce3e8bd1cd899f
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie_ratings:{float_list:{value:9.0,value:9.7}}},feature:{suggestion_purchased:{float_list:{value:1.0}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/90388b9c8093d8adedad0644b618da87 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/90388b9c8093d8adedad0644b618da87
new file mode 100644
index 0000000000000000000000000000000000000000..a1315bb8f9363858c6d79066cac3e93dc40f1602
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/90388b9c8093d8adedad0644b618da87
@@ -0,0 +1,33 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: [[[[[[9.0,9.7]]]]]],
+          value: [[[9.0, -9.2]]]
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: [1.0, [2, 3, [4, 5, 6, [7, 8, 9, 0]]]]
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/9fa2f86ea6d3ade36e961247c3026f8d b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/9fa2f86ea6d3ade36e961247c3026f8d
new file mode 100644
index 0000000000000000000000000000000000000000..d4f9494bbd3f945ed6926f8669c9fab62ae3ede6
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/9fa2f86ea6d3ade36e961247c3026f8d
@@ -0,0 +1,33 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: 9.0,
+          value: 9.7
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: 1.0
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/c4f18ca60a84e9869a28faf6f65dc758 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/c4f18ca60a84e9869a28faf6f65dc758
new file mode 100644
index 0000000000000000000000000000000000000000..e8ba267eb27b84fb427f33dea60623b8dace79cf
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/c4f18ca60a84e9869a28faf6f65dc758
@@ -0,0 +1,32 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: [9.0,9.7]
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: 1.0
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/d456ee029700adef5d28438593010223 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/d456ee029700adef5d28438593010223
new file mode 100644
index 0000000000000000000000000000000000000000..3428a1e0fcd730a5e0bce03f0dfd1d5fec90ea74
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/d456ee029700adef5d28438593010223
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie:{bytes_list:{value:"VGhlIFNoYXdzaGFuayBSZWRlbXB0aW9u",value:"RmlnaHQgQ2x1Yg=="}}},feature:{movie_ratings:{float_list:{value:9.0,value:9.7}}},feature:{suggestion:{bytes_list:{value:"SW5jZXB0aW9u"}}},feature:{suggestion_purchased:{float_list:{value:1.0}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/e9f0ff6ee8d691ae69d2ecb4710030a2 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/e9f0ff6ee8d691ae69d2ecb4710030a2
new file mode 100644
index 0000000000000000000000000000000000000000..ef0923c4500ecc3c6e8f01a87d1109066a752f48
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/e9f0ff6ee8d691ae69d2ecb4710030a2
@@ -0,0 +1,36 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: [29.0, 2, 3, 4]
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: [9.0,9.7]
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: [1.0, 2, 3, 4, 5]
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: [9.99, 8.88, 7.77, 6.66, 5.55],
+          value: [4.44, 3.33, 2.22, 1.11],
+          value: [1.11, 2.22, 3.33],
+          value: [4.44, 5.55],
+          value: 0
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/010dc3d4b05288fcc40de2721052b3dc699f1cb3 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/010dc3d4b05288fcc40de2721052b3dc699f1cb3
new file mode 100644
index 0000000000000000000000000000000000000000..9dbc560e1e4b50f98060fdad36ae0f65f0c0c92b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/010dc3d4b05288fcc40de2721052b3dc699f1cb3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/0555cd5e9d99629819cc985285f80da0f00be1e9 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0555cd5e9d99629819cc985285f80da0f00be1e9
new file mode 100644
index 0000000000000000000000000000000000000000..fab6d15ebe37176ddc7f3868f576b15d27518d00
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0555cd5e9d99629819cc985285f80da0f00be1e9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/0a0352aa168803ff65455792d9f6ee555c3e7c3f b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0a0352aa168803ff65455792d9f6ee555c3e7c3f
new file mode 100644
index 0000000000000000000000000000000000000000..7918406ac4bc04196bf07a3e8804b0dbc946eaf3
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0a0352aa168803ff65455792d9f6ee555c3e7c3f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/0ed54162df93ef8d00f993ce6b59ba422903d381 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0ed54162df93ef8d00f993ce6b59ba422903d381
new file mode 100644
index 0000000000000000000000000000000000000000..c294b3180f7d1c315466efacee0167a1685ba68f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0ed54162df93ef8d00f993ce6b59ba422903d381 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/1547b448171c700613c3946d730de496c9b9863f b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1547b448171c700613c3946d730de496c9b9863f
new file mode 100644
index 0000000000000000000000000000000000000000..0eb3eff90d7fa0c03fc827b65da2e21050f27ebf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1547b448171c700613c3946d730de496c9b9863f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/17859046cbe4ac598a645173d679ce2a52c6afba b/tensorflow/core/kernels/fuzzing/corpus/decode_png/17859046cbe4ac598a645173d679ce2a52c6afba
new file mode 100644
index 0000000000000000000000000000000000000000..deb7b0a784aa876e880755bca4c2f96c1167fa27
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/17859046cbe4ac598a645173d679ce2a52c6afba differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/1df76c07817fbc3653a26f34d97658e9973627c2 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1df76c07817fbc3653a26f34d97658e9973627c2
new file mode 100644
index 0000000000000000000000000000000000000000..2b9721d742ad143de0881894c52407af1865b79b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1df76c07817fbc3653a26f34d97658e9973627c2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/1f0717f8856d7782e3ab7992d3a72d783a018443 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1f0717f8856d7782e3ab7992d3a72d783a018443
new file mode 100644
index 0000000000000000000000000000000000000000..e0c330f7f4ee396723fedbb530197cd890045730
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1f0717f8856d7782e3ab7992d3a72d783a018443 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/23b911e4ce936def88bc9a46b8b433c0e83fba2a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/23b911e4ce936def88bc9a46b8b433c0e83fba2a
new file mode 100644
index 0000000000000000000000000000000000000000..41fc2fe9516ecaf369c9093c5d815a709963c2b1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/23b911e4ce936def88bc9a46b8b433c0e83fba2a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/25592201c3edff0578dbdac6b0e4f2be109ce151 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/25592201c3edff0578dbdac6b0e4f2be109ce151
new file mode 100644
index 0000000000000000000000000000000000000000..8b5755c4bc513368a2605e05c8b3117feadab086
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/25592201c3edff0578dbdac6b0e4f2be109ce151 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/266fd8495e0b8eb64387c1a62264185e061fee73 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/266fd8495e0b8eb64387c1a62264185e061fee73
new file mode 100644
index 0000000000000000000000000000000000000000..c4d2d8d7f1adc091d4340f3cb45a304df4617a17
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/266fd8495e0b8eb64387c1a62264185e061fee73 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/27f178cf415b4ff8671131ddf1d042dafac2fb3e b/tensorflow/core/kernels/fuzzing/corpus/decode_png/27f178cf415b4ff8671131ddf1d042dafac2fb3e
new file mode 100644
index 0000000000000000000000000000000000000000..1608e5b08373fdbfb62f83c5a708e76e3dcd3db0
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/27f178cf415b4ff8671131ddf1d042dafac2fb3e differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2a0bdc4d9cc5ea5bb21dd256d6ac96075376a94f b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2a0bdc4d9cc5ea5bb21dd256d6ac96075376a94f
new file mode 100644
index 0000000000000000000000000000000000000000..12e4140981d115312f32b710bb2882d5b1ede161
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2a0bdc4d9cc5ea5bb21dd256d6ac96075376a94f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e5d25add6adc68e0457b358c7a34abf3d41c938 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e5d25add6adc68e0457b358c7a34abf3d41c938
new file mode 100644
index 0000000000000000000000000000000000000000..ecf597f7365bf06492af4d53bca4166f5ecf744a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e5d25add6adc68e0457b358c7a34abf3d41c938 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e6c5b6a766dd5e9bd41eacfd0a36572bd2f7544 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e6c5b6a766dd5e9bd41eacfd0a36572bd2f7544
new file mode 100644
index 0000000000000000000000000000000000000000..e5a18917e3744daf0f8d4eb5cbbb702daff21188
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e6c5b6a766dd5e9bd41eacfd0a36572bd2f7544 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e9c935cf82f6ca640e9a9abc3c30a578ad46176 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e9c935cf82f6ca640e9a9abc3c30a578ad46176
new file mode 100644
index 0000000000000000000000000000000000000000..50be7f686b791d7aa08652159592311c728ec79f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e9c935cf82f6ca640e9a9abc3c30a578ad46176 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2fcf1ed4477f7eaee028f5b3f9edeb5f1a737826 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2fcf1ed4477f7eaee028f5b3f9edeb5f1a737826
new file mode 100644
index 0000000000000000000000000000000000000000..00eba4c39a92d546e7b4f6f77b18e9d3ddaec399
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2fcf1ed4477f7eaee028f5b3f9edeb5f1a737826 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3480713774f590908ca5dba16d121cdfb8fba62b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3480713774f590908ca5dba16d121cdfb8fba62b
new file mode 100644
index 0000000000000000000000000000000000000000..af3afc499d820cdd966ec9a67d3ec6fb39b3e240
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3480713774f590908ca5dba16d121cdfb8fba62b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/39289afcec60d98802b333e0fbb1da4d7aed4ce5 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/39289afcec60d98802b333e0fbb1da4d7aed4ce5
new file mode 100644
index 0000000000000000000000000000000000000000..02c187a49225947f8f20ee87b970783f93eaed76
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/39289afcec60d98802b333e0fbb1da4d7aed4ce5 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3adc488e21d4aca7bed9422f0241a42d0f93e7d9 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3adc488e21d4aca7bed9422f0241a42d0f93e7d9
new file mode 100644
index 0000000000000000000000000000000000000000..1cf24048f8ba10e68082ea717656c2c889ebf7f5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3adc488e21d4aca7bed9422f0241a42d0f93e7d9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3cbf274da522483dc991fad9df43a22ac4fb3173 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3cbf274da522483dc991fad9df43a22ac4fb3173
new file mode 100644
index 0000000000000000000000000000000000000000..7f9c0c93ec9bc8f135e7f78010b275079e052b48
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3cbf274da522483dc991fad9df43a22ac4fb3173 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3d840cdff7f5ad16fe8bcb985ed4946c03459432 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3d840cdff7f5ad16fe8bcb985ed4946c03459432
new file mode 100644
index 0000000000000000000000000000000000000000..f48cb4cd19a65487d555b2fdce297d638736edb7
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3d840cdff7f5ad16fe8bcb985ed4946c03459432 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3f1e6753c1fca958e859189857449746592158ea b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3f1e6753c1fca958e859189857449746592158ea
new file mode 100644
index 0000000000000000000000000000000000000000..df07889441d48fd938b74aac94e01b39dfa8f63e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3f1e6753c1fca958e859189857449746592158ea differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3fa4075993cb0f9bfa8eea785174a2038a69aa1b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3fa4075993cb0f9bfa8eea785174a2038a69aa1b
new file mode 100644
index 0000000000000000000000000000000000000000..5f9cec9ab5cb27973b8d8e71b47cf61601c0ee3d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3fa4075993cb0f9bfa8eea785174a2038a69aa1b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/4023a373e977be58413e55350380310c5dd1fd6a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/4023a373e977be58413e55350380310c5dd1fd6a
new file mode 100644
index 0000000000000000000000000000000000000000..385b8b0c35936d3b9c99facefccb33835569470b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/4023a373e977be58413e55350380310c5dd1fd6a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/40caba69dce1cfc48e0e43184d2bfbc6daa4399a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/40caba69dce1cfc48e0e43184d2bfbc6daa4399a
new file mode 100644
index 0000000000000000000000000000000000000000..22f1649adc45eae591f46c4b1a1e0db6bf0cc82e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/40caba69dce1cfc48e0e43184d2bfbc6daa4399a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/41841e9561d8135945c1c1e55ab9e9a1e933653b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41841e9561d8135945c1c1e55ab9e9a1e933653b
new file mode 100644
index 0000000000000000000000000000000000000000..16c0c33b93dadec4fd326f2eff73a81d6c82f508
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41841e9561d8135945c1c1e55ab9e9a1e933653b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/41d40f2d66fa43e34537385594ee9911e65deadf b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41d40f2d66fa43e34537385594ee9911e65deadf
new file mode 100644
index 0000000000000000000000000000000000000000..6e44f2adc7fd1343044613a087c1c3efdaef5081
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41d40f2d66fa43e34537385594ee9911e65deadf differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/421bd39810b50309a71adb2dadc3b19f01a52312 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/421bd39810b50309a71adb2dadc3b19f01a52312
new file mode 100644
index 0000000000000000000000000000000000000000..131004b8943e997ec7ba3212f3ba0d555c4ba4d2
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/421bd39810b50309a71adb2dadc3b19f01a52312 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/446c305b2c0665736f94fb2b62dbdef445eff0cf b/tensorflow/core/kernels/fuzzing/corpus/decode_png/446c305b2c0665736f94fb2b62dbdef445eff0cf
new file mode 100644
index 0000000000000000000000000000000000000000..1f2f90b3bc41f8daa93d2df651bf55b2a3ddc78b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/446c305b2c0665736f94fb2b62dbdef445eff0cf differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/449cee952bb645f6f4241a6665d3c6028c073c7a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/449cee952bb645f6f4241a6665d3c6028c073c7a
new file mode 100644
index 0000000000000000000000000000000000000000..c671f7e4c0d7c8816a5fa4915b1fe0cbe9be98e1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/449cee952bb645f6f4241a6665d3c6028c073c7a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/45520b07609978c5aa3516d803527438b93fbadb b/tensorflow/core/kernels/fuzzing/corpus/decode_png/45520b07609978c5aa3516d803527438b93fbadb
new file mode 100644
index 0000000000000000000000000000000000000000..c6f2f7052d2c169c84429800d57e79d8876e2a03
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/45520b07609978c5aa3516d803527438b93fbadb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/4da74a34bcede234b0415f77fbd87d70bf9a777e b/tensorflow/core/kernels/fuzzing/corpus/decode_png/4da74a34bcede234b0415f77fbd87d70bf9a777e
new file mode 100644
index 0000000000000000000000000000000000000000..605ad2d2014ba7bcc2a0b94a9a0b9bacf055d7c5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/4da74a34bcede234b0415f77fbd87d70bf9a777e differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/51db5d31d2c5300d34831d9f23bcdd0aff9a998b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/51db5d31d2c5300d34831d9f23bcdd0aff9a998b
new file mode 100644
index 0000000000000000000000000000000000000000..bcacbe623f8ca1ac88de8d3dfdce160de04f2f29
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/51db5d31d2c5300d34831d9f23bcdd0aff9a998b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/5cde2a9167798cb77f10abbfb2640a5c357f99fc b/tensorflow/core/kernels/fuzzing/corpus/decode_png/5cde2a9167798cb77f10abbfb2640a5c357f99fc
new file mode 100644
index 0000000000000000000000000000000000000000..2619e1d87638b8f37900a750dba325d075739ad1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/5cde2a9167798cb77f10abbfb2640a5c357f99fc differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/5e352fc10ac476cfbe1d755f092e069820223249 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/5e352fc10ac476cfbe1d755f092e069820223249
new file mode 100644
index 0000000000000000000000000000000000000000..cb55f03ee184e66592d578050dc44a7bf70fe4ac
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/5e352fc10ac476cfbe1d755f092e069820223249 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/63661677dd1306cec4b5a565190e65adf2446e52 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/63661677dd1306cec4b5a565190e65adf2446e52
new file mode 100644
index 0000000000000000000000000000000000000000..be952039a4ec42a60e188115aa9c27888c692b06
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/63661677dd1306cec4b5a565190e65adf2446e52 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/65887ed3db382aab1d9485c500f4401318d303b9 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/65887ed3db382aab1d9485c500f4401318d303b9
new file mode 100644
index 0000000000000000000000000000000000000000..776adbe8d4bfd9e9b2532525a8f3187d90453d93
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/65887ed3db382aab1d9485c500f4401318d303b9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/67b5181f8f0644597e9bde539e8f083b5cacd0e7 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/67b5181f8f0644597e9bde539e8f083b5cacd0e7
new file mode 100644
index 0000000000000000000000000000000000000000..5bee1d494a574bf8a675933d3045150b32395478
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/67b5181f8f0644597e9bde539e8f083b5cacd0e7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/74c9dcf7afee2a6cb1ab3a2c0de744d1b03c1466 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/74c9dcf7afee2a6cb1ab3a2c0de744d1b03c1466
new file mode 100644
index 0000000000000000000000000000000000000000..ea3e0d2bd444e8fd484bd57facdf53f318762b86
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/74c9dcf7afee2a6cb1ab3a2c0de744d1b03c1466 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/792181ca19e6ded261434e588bb7fc2a4816d4ce b/tensorflow/core/kernels/fuzzing/corpus/decode_png/792181ca19e6ded261434e588bb7fc2a4816d4ce
new file mode 100644
index 0000000000000000000000000000000000000000..521deb8d44ea8afc4d5ef285d41f1a42909e8ad5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/792181ca19e6ded261434e588bb7fc2a4816d4ce differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/79f0e2a475487f8fa69e68c1cc947c5851bda741 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/79f0e2a475487f8fa69e68c1cc947c5851bda741
new file mode 100644
index 0000000000000000000000000000000000000000..885332337762ff92dd6a1ad00fe0a1694995a5c0
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/79f0e2a475487f8fa69e68c1cc947c5851bda741 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/7e5fcdfeb557ce379ed96925c68505eaac0112db b/tensorflow/core/kernels/fuzzing/corpus/decode_png/7e5fcdfeb557ce379ed96925c68505eaac0112db
new file mode 100644
index 0000000000000000000000000000000000000000..cc011aedc9b5234cb26140a73dc93a85c0e1c6c3
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/7e5fcdfeb557ce379ed96925c68505eaac0112db differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/7eec7530acf34b3a96fa9189783453999f7b6838 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/7eec7530acf34b3a96fa9189783453999f7b6838
new file mode 100644
index 0000000000000000000000000000000000000000..7e3b6f564f0dc029247355c9a920245354f7cd1f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/7eec7530acf34b3a96fa9189783453999f7b6838 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/80114bf9781bffc9db411413d83541d8deaaf7c1 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/80114bf9781bffc9db411413d83541d8deaaf7c1
new file mode 100644
index 0000000000000000000000000000000000000000..4828092a8aa538f619fe0238f01041ec24187f0c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/80114bf9781bffc9db411413d83541d8deaaf7c1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/80425fb92bb86627e854892f23823fa804e5fdc3 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/80425fb92bb86627e854892f23823fa804e5fdc3
new file mode 100644
index 0000000000000000000000000000000000000000..2ed0139989f994cef05524f606058404ebc7614a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/80425fb92bb86627e854892f23823fa804e5fdc3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/821cdd6eeb919a8dd7f35289abbd583828dd4945 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/821cdd6eeb919a8dd7f35289abbd583828dd4945
new file mode 100644
index 0000000000000000000000000000000000000000..28925e3c80c2210e90461b82d950f63427bd5439
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/821cdd6eeb919a8dd7f35289abbd583828dd4945 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/83e1a31785285338b0ddb3334b0ed098e63dedde b/tensorflow/core/kernels/fuzzing/corpus/decode_png/83e1a31785285338b0ddb3334b0ed098e63dedde
new file mode 100644
index 0000000000000000000000000000000000000000..9a5487fbfbe79979ddb06faf9d1b29a445db75eb
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/83e1a31785285338b0ddb3334b0ed098e63dedde differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/8a4c8100dedd0fb5f2a8b468c678f7ad8269deeb b/tensorflow/core/kernels/fuzzing/corpus/decode_png/8a4c8100dedd0fb5f2a8b468c678f7ad8269deeb
new file mode 100644
index 0000000000000000000000000000000000000000..c6c8b7c717310177d1034ec4e1643afbdc6994bf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/8a4c8100dedd0fb5f2a8b468c678f7ad8269deeb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/8ae8268c24dc866c1edb3826b93a1c75dbf74ff4 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/8ae8268c24dc866c1edb3826b93a1c75dbf74ff4
new file mode 100644
index 0000000000000000000000000000000000000000..ce211f7cfd7e6c6fc33c30d8fdcaf63e401a9365
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/8ae8268c24dc866c1edb3826b93a1c75dbf74ff4 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/90f72038cc627f34f074ea72eadbba87a5e3e288 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/90f72038cc627f34f074ea72eadbba87a5e3e288
new file mode 100644
index 0000000000000000000000000000000000000000..3f12cb5f6595439978754328620f1f8f0fb43291
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/90f72038cc627f34f074ea72eadbba87a5e3e288 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/92b67faee4a49df2cdbed785e27b4a1cddcfffa3 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/92b67faee4a49df2cdbed785e27b4a1cddcfffa3
new file mode 100644
index 0000000000000000000000000000000000000000..18ff654a110c137350e93dec975a12e17cc0761c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/92b67faee4a49df2cdbed785e27b4a1cddcfffa3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/9463810467aacdc9923b2b20a2236116b760d75b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9463810467aacdc9923b2b20a2236116b760d75b
new file mode 100644
index 0000000000000000000000000000000000000000..eec341bf2bd623f50bbb5e393f4aaa245bf15fec
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9463810467aacdc9923b2b20a2236116b760d75b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/94d7c96aea32ad41ce643d35b951a6d8990b81d6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/94d7c96aea32ad41ce643d35b951a6d8990b81d6
new file mode 100644
index 0000000000000000000000000000000000000000..776f17c6b218bd55751e740b35b3ad6f4097a288
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/94d7c96aea32ad41ce643d35b951a6d8990b81d6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/98cc7e9fe87df914d89a0aef008930f27b3c26f5 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/98cc7e9fe87df914d89a0aef008930f27b3c26f5
new file mode 100644
index 0000000000000000000000000000000000000000..d7296ca03c4fa909de8fb18d8dda7f9cc6bafce7
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/98cc7e9fe87df914d89a0aef008930f27b3c26f5 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/99172dfdb4f59aaced29c7681ac6e6ce8356e814 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/99172dfdb4f59aaced29c7681ac6e6ce8356e814
new file mode 100644
index 0000000000000000000000000000000000000000..82559facc80344c43d682b2ee9de939b517264bf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/99172dfdb4f59aaced29c7681ac6e6ce8356e814 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/9ae3b647d895af97fe872c0b1442df7b5b767160 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9ae3b647d895af97fe872c0b1442df7b5b767160
new file mode 100644
index 0000000000000000000000000000000000000000..9e2eff2c4ec0c6ac3ba6e5558cd0a881b4dbb042
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9ae3b647d895af97fe872c0b1442df7b5b767160 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/9d2b1d2121b0508a4fa8d1508adb9d05633fdac3 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9d2b1d2121b0508a4fa8d1508adb9d05633fdac3
new file mode 100644
index 0000000000000000000000000000000000000000..6d17e06d470d501c0984f339d75de0f3ba59b4fd
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9d2b1d2121b0508a4fa8d1508adb9d05633fdac3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/a335af37917ccf0c8b11bb884a3a74f3f1d2a7c6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a335af37917ccf0c8b11bb884a3a74f3f1d2a7c6
new file mode 100644
index 0000000000000000000000000000000000000000..ce8245f2da2025b6c37911a22179c45d06b90f5e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a335af37917ccf0c8b11bb884a3a74f3f1d2a7c6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/a738609112d3a6772c50a71e2c3504ebc515b709 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a738609112d3a6772c50a71e2c3504ebc515b709
new file mode 100644
index 0000000000000000000000000000000000000000..a980c777a8531fd113f9f54dee0fa00946f7a420
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a738609112d3a6772c50a71e2c3504ebc515b709 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/a8cecab5d917da5a4729632a7a18c564d7e1607d b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a8cecab5d917da5a4729632a7a18c564d7e1607d
new file mode 100644
index 0000000000000000000000000000000000000000..31a0fe82b99e564781cf22c7de167c25fc4d5e37
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a8cecab5d917da5a4729632a7a18c564d7e1607d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ade919ab2b4a458e806575c941dfe50ae3fd3621 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ade919ab2b4a458e806575c941dfe50ae3fd3621
new file mode 100644
index 0000000000000000000000000000000000000000..776f0b88dcca473db2eb6cb7ccfab99183260ce8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ade919ab2b4a458e806575c941dfe50ae3fd3621 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1251621a5eb5e7fda9cac9baead1c993a285c36 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1251621a5eb5e7fda9cac9baead1c993a285c36
new file mode 100644
index 0000000000000000000000000000000000000000..ba6aa256542e6dc9a6d802784ab451ed6b93b141
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1251621a5eb5e7fda9cac9baead1c993a285c36 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1516b78c3dfe77eeb554985fd7344c0478fbbcb b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1516b78c3dfe77eeb554985fd7344c0478fbbcb
new file mode 100644
index 0000000000000000000000000000000000000000..c4ec4ad4b9c4a38459c974e8718532bc8ff4d345
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1516b78c3dfe77eeb554985fd7344c0478fbbcb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/b41241740f5f8ad2c1d408f7bb6a313bd863c158 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b41241740f5f8ad2c1d408f7bb6a313bd863c158
new file mode 100644
index 0000000000000000000000000000000000000000..5413efd933685ec8073cdccd087a5009439ffe17
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b41241740f5f8ad2c1d408f7bb6a313bd863c158 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/b799c8596523a7ebeb8e11ada08818c10f7eabfc b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b799c8596523a7ebeb8e11ada08818c10f7eabfc
new file mode 100644
index 0000000000000000000000000000000000000000..9cd72246b91b803783c6224511d825edc14c299f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b799c8596523a7ebeb8e11ada08818c10f7eabfc differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ba48d0521a111222dc95a3a997c7c92dea5f4443 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ba48d0521a111222dc95a3a997c7c92dea5f4443
new file mode 100644
index 0000000000000000000000000000000000000000..66aac674cd26032159ef1a195cee7e70cd8511ec
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ba48d0521a111222dc95a3a997c7c92dea5f4443 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/c01457c6889fb1b597d308363a36412c0b7f90e7 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c01457c6889fb1b597d308363a36412c0b7f90e7
new file mode 100644
index 0000000000000000000000000000000000000000..eff793b204a409edde4d0b601cb5e43e3fe1d088
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c01457c6889fb1b597d308363a36412c0b7f90e7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/c82ebc0d6688d104af04fd20d6d3da591dc391f7 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c82ebc0d6688d104af04fd20d6d3da591dc391f7
new file mode 100644
index 0000000000000000000000000000000000000000..ba604969f6e0275a1f9db37b2755e9d693d60f5c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c82ebc0d6688d104af04fd20d6d3da591dc391f7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/c9a03eb758dd84e954e3d70916e2311e8fd21f3c b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c9a03eb758dd84e954e3d70916e2311e8fd21f3c
new file mode 100644
index 0000000000000000000000000000000000000000..c23fb3da9ce10cfab5eed3382e34f887bb117942
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c9a03eb758dd84e954e3d70916e2311e8fd21f3c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/cf892756b33578a54ab20044514e573328d2f1d7 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/cf892756b33578a54ab20044514e573328d2f1d7
new file mode 100644
index 0000000000000000000000000000000000000000..42f1f9a29827e46a86ee16dd90a8779e7bc927cb
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/cf892756b33578a54ab20044514e573328d2f1d7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/d3bc3f158a63f1d50b474addd3f7b3d17f23e8e9 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/d3bc3f158a63f1d50b474addd3f7b3d17f23e8e9
new file mode 100644
index 0000000000000000000000000000000000000000..6b1183f4ffaab6f8b429eeb7ce3640bd3df887d6
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/d3bc3f158a63f1d50b474addd3f7b3d17f23e8e9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/d4906950aa9d60ad09dc0f5413c3d88080c3bc37 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/d4906950aa9d60ad09dc0f5413c3d88080c3bc37
new file mode 100644
index 0000000000000000000000000000000000000000..2d8fd3f3f0d34782904ba50077e62b6c82ef4f13
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/d4906950aa9d60ad09dc0f5413c3d88080c3bc37 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/da31578a8068bad65e1c7a3d06e8f543a2a0bc65 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/da31578a8068bad65e1c7a3d06e8f543a2a0bc65
new file mode 100644
index 0000000000000000000000000000000000000000..dc37f788a1acf12c1e252c880f26eb0a4f53809e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/da31578a8068bad65e1c7a3d06e8f543a2a0bc65 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/dd4a9b5d0740679c249fc884efc499433b29436b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/dd4a9b5d0740679c249fc884efc499433b29436b
new file mode 100644
index 0000000000000000000000000000000000000000..82c5120f464521aa502f234fbccf4ea3e5763b7b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/dd4a9b5d0740679c249fc884efc499433b29436b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/deea4ecc6f0b2a6d89fd25ff76762299f21602fb b/tensorflow/core/kernels/fuzzing/corpus/decode_png/deea4ecc6f0b2a6d89fd25ff76762299f21602fb
new file mode 100644
index 0000000000000000000000000000000000000000..6daa5452a159945afb1d5e072f3f5daa3a1cc2d6
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/deea4ecc6f0b2a6d89fd25ff76762299f21602fb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/e1040c7ffcb39915e0f539018c81f9798924cba6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/e1040c7ffcb39915e0f539018c81f9798924cba6
new file mode 100644
index 0000000000000000000000000000000000000000..306bbf464bf804c169e405550ecc2c68d33c7020
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/e1040c7ffcb39915e0f539018c81f9798924cba6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/e381dc85682cc33ad99f622b89d145b47f7d6392 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/e381dc85682cc33ad99f622b89d145b47f7d6392
new file mode 100644
index 0000000000000000000000000000000000000000..36487c0002294149201115e745543cc2363515d4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/e381dc85682cc33ad99f622b89d145b47f7d6392 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ea24498fc7a144fccc6f1665ebf7020df803dd1a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ea24498fc7a144fccc6f1665ebf7020df803dd1a
new file mode 100644
index 0000000000000000000000000000000000000000..ab99a8374aa80d829891b5670078d95b776a6957
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ea24498fc7a144fccc6f1665ebf7020df803dd1a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/eaa5d677e797c07bac98c3c7051abad91852e7c6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/eaa5d677e797c07bac98c3c7051abad91852e7c6
new file mode 100644
index 0000000000000000000000000000000000000000..63ff2676ae32de1b7293ec4a8c34b550b2c22ad5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/eaa5d677e797c07bac98c3c7051abad91852e7c6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ed7871269315725535d8bffec7836c45a3fc5c26 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ed7871269315725535d8bffec7836c45a3fc5c26
new file mode 100644
index 0000000000000000000000000000000000000000..d2a4b9aafd7b745a50401a3ac94660ccc787f99a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ed7871269315725535d8bffec7836c45a3fc5c26 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ee8460f4077064c5a2137075b48eba7d3db5c570 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ee8460f4077064c5a2137075b48eba7d3db5c570
new file mode 100644
index 0000000000000000000000000000000000000000..c3b2bd442c1f47cc24db060eb85a842c26f35a30
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ee8460f4077064c5a2137075b48eba7d3db5c570 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ef09f26e0ee61329f84a9f589629a865ae9ee0a6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ef09f26e0ee61329f84a9f589629a865ae9ee0a6
new file mode 100644
index 0000000000000000000000000000000000000000..2422f7cb3fefac0242fc41a5b1a5aec6c0ffc42e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ef09f26e0ee61329f84a9f589629a865ae9ee0a6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/f477da4d7d8ff2066041e1dd5ee4e833b7111a1a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/f477da4d7d8ff2066041e1dd5ee4e833b7111a1a
new file mode 100644
index 0000000000000000000000000000000000000000..2ec0b7ae29cea7c8aa406470f5ad812102ba207b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/f477da4d7d8ff2066041e1dd5ee4e833b7111a1a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/f8a379b2498a4eb452a85791a49adf065dab59ae b/tensorflow/core/kernels/fuzzing/corpus/decode_png/f8a379b2498a4eb452a85791a49adf065dab59ae
new file mode 100644
index 0000000000000000000000000000000000000000..5431f584cd14d3c2bb0f1cbea21b1ea8a8c505c5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/f8a379b2498a4eb452a85791a49adf065dab59ae differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/fe67bccb06f2174523943cc684518fcf1f7f8046 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/fe67bccb06f2174523943cc684518fcf1f7f8046
new file mode 100644
index 0000000000000000000000000000000000000000..cf043445f4b62caa450c89b2c5ee44f84d837242
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/fe67bccb06f2174523943cc684518fcf1f7f8046 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ff1e67d17c1c27ef0d97900d0ea276b563a64628 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ff1e67d17c1c27ef0d97900d0ea276b563a64628
new file mode 100644
index 0000000000000000000000000000000000000000..b1f8d120a3955c09b390dbaa8cab59ecf063b506
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ff1e67d17c1c27ef0d97900d0ea276b563a64628 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/02cc44cdfec1d9d0d0c66c5a5f40d3d20e4c4c3a b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/02cc44cdfec1d9d0d0c66c5a5f40d3d20e4c4c3a
new file mode 100644
index 0000000000000000000000000000000000000000..2a65e0e2559eea919bdbb661370c679fdbdd7e26
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/02cc44cdfec1d9d0d0c66c5a5f40d3d20e4c4c3a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/087e1d7fae1c1ddcbaa3b5f822a171ad15498186 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/087e1d7fae1c1ddcbaa3b5f822a171ad15498186
new file mode 100644
index 0000000000000000000000000000000000000000..329af8a3b9d8928bc197d1361236a82fda888026
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/087e1d7fae1c1ddcbaa3b5f822a171ad15498186 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/0f61c33027394a0f14d29dcd22f405cad943b7cf b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/0f61c33027394a0f14d29dcd22f405cad943b7cf
new file mode 100644
index 0000000000000000000000000000000000000000..8cda165c8c791847ddaffdecd62f503c42a2895f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/0f61c33027394a0f14d29dcd22f405cad943b7cf differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/10cdebea1659c21a0248f88654ae41f62786abf1 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/10cdebea1659c21a0248f88654ae41f62786abf1
new file mode 100644
index 0000000000000000000000000000000000000000..4afe44dd91c3038dfa26d06730b2213c1d98dc50
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/10cdebea1659c21a0248f88654ae41f62786abf1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/126e68def9fd973a100e0f66cadf09448a716b57 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/126e68def9fd973a100e0f66cadf09448a716b57
new file mode 100644
index 0000000000000000000000000000000000000000..9b2d29856d1483ced1b8ea3cf55f53182947451f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/126e68def9fd973a100e0f66cadf09448a716b57 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1275d41ebf8788ce3a949352e4bc654b04012da3 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1275d41ebf8788ce3a949352e4bc654b04012da3
new file mode 100644
index 0000000000000000000000000000000000000000..b4370a16d58c0b2d52af8fc09ac8606fd7e88d7f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1275d41ebf8788ce3a949352e4bc654b04012da3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1a7f1c407fb3864ddb559f88f373a21d1be51584 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1a7f1c407fb3864ddb559f88f373a21d1be51584
new file mode 100644
index 0000000000000000000000000000000000000000..d28721708d44c0b6227a14286b19f6cdf3be7010
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1a7f1c407fb3864ddb559f88f373a21d1be51584 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1c3e1c91f187f6bcea86f172ff5bbbd955a9654d b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1c3e1c91f187f6bcea86f172ff5bbbd955a9654d
new file mode 100644
index 0000000000000000000000000000000000000000..611b38b71d541c68be9e6397dc4366c75a951532
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1c3e1c91f187f6bcea86f172ff5bbbd955a9654d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/300fe1e0a47543037cbf0243b6756c9aa48799c4 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/300fe1e0a47543037cbf0243b6756c9aa48799c4
new file mode 100644
index 0000000000000000000000000000000000000000..fcf8360b277d0f051e111ce38d6fd4e33a2c17e1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/300fe1e0a47543037cbf0243b6756c9aa48799c4 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/31ec5b0134bedcfe283f4978e6e65b7d35d5d4ad b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/31ec5b0134bedcfe283f4978e6e65b7d35d5d4ad
new file mode 100644
index 0000000000000000000000000000000000000000..868e2672727ac1b7d505faaa22d79edee3252cf5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/31ec5b0134bedcfe283f4978e6e65b7d35d5d4ad differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/4e7cbb27667bcfca92838aa8020749990013a9b1 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/4e7cbb27667bcfca92838aa8020749990013a9b1
new file mode 100644
index 0000000000000000000000000000000000000000..898584d96f6bfcd76ebed9dcde3b7df88442af78
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/4e7cbb27667bcfca92838aa8020749990013a9b1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/585e469231d202812bfba8285fb30c8e31c857b9 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/585e469231d202812bfba8285fb30c8e31c857b9
new file mode 100644
index 0000000000000000000000000000000000000000..a4994c208300b6cba20708367337667fbbd8dabc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/585e469231d202812bfba8285fb30c8e31c857b9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/58eab6bc2386e2ef43fe4f55cb6ad3611399d5de b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/58eab6bc2386e2ef43fe4f55cb6ad3611399d5de
new file mode 100644
index 0000000000000000000000000000000000000000..eb38d110153b75463f4af9c56bf11d5c6d425685
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/58eab6bc2386e2ef43fe4f55cb6ad3611399d5de differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/63448c6a9feb8c72b3e82af4d735ec2e62ddd328 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/63448c6a9feb8c72b3e82af4d735ec2e62ddd328
new file mode 100644
index 0000000000000000000000000000000000000000..6c534ab19cf330994c377b4e158ed98492880589
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/63448c6a9feb8c72b3e82af4d735ec2e62ddd328 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/6874d5b1c7a64b596c61f24877d422e89bebe58b b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/6874d5b1c7a64b596c61f24877d422e89bebe58b
new file mode 100644
index 0000000000000000000000000000000000000000..e054ad5f14723fa1bd5829725e38de4a681cb3e8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/6874d5b1c7a64b596c61f24877d422e89bebe58b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7501f79cb067da108020579ed654349c7933d22f b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7501f79cb067da108020579ed654349c7933d22f
new file mode 100644
index 0000000000000000000000000000000000000000..3be6a61cbab76ee7142caae58d99695ac573a46d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7501f79cb067da108020579ed654349c7933d22f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/782051f8120182b860c7fe1b265179cfa2fe03fd b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/782051f8120182b860c7fe1b265179cfa2fe03fd
new file mode 100644
index 0000000000000000000000000000000000000000..a0d8a6ec48c983fc3870d173c5b4c73eb474eecd
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/782051f8120182b860c7fe1b265179cfa2fe03fd differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/793feab2deb35e284a975f6527d76a8be5540fe6 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/793feab2deb35e284a975f6527d76a8be5540fe6
new file mode 100644
index 0000000000000000000000000000000000000000..8a9216e10b777da5e8888e5549ab39346de8acf2
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/793feab2deb35e284a975f6527d76a8be5540fe6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7f41ec3a9805c6b8f3656c4f9f6d0ff7dbf8a329 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7f41ec3a9805c6b8f3656c4f9f6d0ff7dbf8a329
new file mode 100644
index 0000000000000000000000000000000000000000..aaa91f2f45bd5730550211f4a080e6119d15b48e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7f41ec3a9805c6b8f3656c4f9f6d0ff7dbf8a329 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8210dc595a2652f2f812093b01e239e7918ea065 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8210dc595a2652f2f812093b01e239e7918ea065
new file mode 100644
index 0000000000000000000000000000000000000000..46316baf29ff126da46159bf6c16fb11eb9cb23e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8210dc595a2652f2f812093b01e239e7918ea065 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8dffe4c5c26d891b578fd2ea4b9adfc0c96ad5f7 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8dffe4c5c26d891b578fd2ea4b9adfc0c96ad5f7
new file mode 100644
index 0000000000000000000000000000000000000000..54a777d22c159918c5e85078560fe363c2a78b10
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8dffe4c5c26d891b578fd2ea4b9adfc0c96ad5f7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/91d787a9298ddc015efa783a92c4bdba8af0d7de b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/91d787a9298ddc015efa783a92c4bdba8af0d7de
new file mode 100644
index 0000000000000000000000000000000000000000..826747d852b00397d859fcd65e7404d42cd329f5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/91d787a9298ddc015efa783a92c4bdba8af0d7de differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/92c065286f956f086e977556358f6b54b12bcacc b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/92c065286f956f086e977556358f6b54b12bcacc
new file mode 100644
index 0000000000000000000000000000000000000000..77b8f518b4c46b2cef89bd7a12f23ea49c8505ab
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/92c065286f956f086e977556358f6b54b12bcacc differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a35c9bb71792b60a13dea23a41b41847ad4b93d6 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a35c9bb71792b60a13dea23a41b41847ad4b93d6
new file mode 100644
index 0000000000000000000000000000000000000000..45d6b6fa606fe4a25d00476bd4ff785bb80fbba4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a35c9bb71792b60a13dea23a41b41847ad4b93d6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a6ea960c7b4d42772888280277b26e645ceee904 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a6ea960c7b4d42772888280277b26e645ceee904
new file mode 100644
index 0000000000000000000000000000000000000000..14954c595882e6d35c5f77f36dfcb9ad315c606c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a6ea960c7b4d42772888280277b26e645ceee904 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/aa526aa853333f0bb11804b5243df411452cecd2 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/aa526aa853333f0bb11804b5243df411452cecd2
new file mode 100644
index 0000000000000000000000000000000000000000..c0cc5c469f9ab41222d76349f960610e01d4cda8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/aa526aa853333f0bb11804b5243df411452cecd2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/ca533cd26c7ca6bf69e62351b265ded496fdf1d9 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/ca533cd26c7ca6bf69e62351b265ded496fdf1d9
new file mode 100644
index 0000000000000000000000000000000000000000..bb026584ca28d5de4e7438c9d21e68726cbf521c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/ca533cd26c7ca6bf69e62351b265ded496fdf1d9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f38c61da15f2cb7a39ff02e69f0b00e99f37ec86 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f38c61da15f2cb7a39ff02e69f0b00e99f37ec86
new file mode 100644
index 0000000000000000000000000000000000000000..0900eb1352b10576bebda11b0e2b943d77877d57
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f38c61da15f2cb7a39ff02e69f0b00e99f37ec86 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f88f1012473e6cfcc9b39b2552f682b2f73eff8c b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f88f1012473e6cfcc9b39b2552f682b2f73eff8c
new file mode 100644
index 0000000000000000000000000000000000000000..d74f0a3326d2a644f869ba969426268dc559df25
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f88f1012473e6cfcc9b39b2552f682b2f73eff8c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fa79819c5de04bc06c69bec3fa7f2e982826ea2f b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fa79819c5de04bc06c69bec3fa7f2e982826ea2f
new file mode 100644
index 0000000000000000000000000000000000000000..2fd41a34fe5b503756e93b68aaa4f90301a78950
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fa79819c5de04bc06c69bec3fa7f2e982826ea2f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fce08de222896ac3a20657a3b4f42d5b6c54a96a b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fce08de222896ac3a20657a3b4f42d5b6c54a96a
new file mode 100644
index 0000000000000000000000000000000000000000..35a99bc97d93c9beeea6a917453685fabf853ab2
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fce08de222896ac3a20657a3b4f42d5b6c54a96a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/4c01a1504da9de2216894743ecc44424 b/tensorflow/core/kernels/fuzzing/corpus/string_split/4c01a1504da9de2216894743ecc44424
new file mode 100644
index 0000000000000000000000000000000000000000..eb84b9e610c1e988273a020f8a42c16a0c484951
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/4c01a1504da9de2216894743ecc44424
@@ -0,0 +1 @@
+./,abcd.efgh/abcd,efgh.abcd/efgh,abcd.efgh/a
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/5bf16424630b5afbcffe711fb9834440 b/tensorflow/core/kernels/fuzzing/corpus/string_split/5bf16424630b5afbcffe711fb9834440
new file mode 100644
index 0000000000000000000000000000000000000000..4cd522da7bf4b638331fbd5ef1514a6f81baaf5f
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/5bf16424630b5afbcffe711fb9834440
@@ -0,0 +1 @@
+.ab.cd.ef.gh.ab.cd.ef.gh.ab.cd.ef.gh.ab.cd
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/a7185605aef0a8fd682fcb4656e4a736 b/tensorflow/core/kernels/fuzzing/corpus/string_split/a7185605aef0a8fd682fcb4656e4a736
new file mode 100644
index 0000000000000000000000000000000000000000..03cfb6256f33c8605c5122d7459ea6955d63ee68
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/a7185605aef0a8fd682fcb4656e4a736
@@ -0,0 +1 @@
+./, abcde.fghab/cdefg,habcd efgha.bcdef/ghabc
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/d5606def44fdbb9385dd764612069db0 b/tensorflow/core/kernels/fuzzing/corpus/string_split/d5606def44fdbb9385dd764612069db0
new file mode 100644
index 0000000000000000000000000000000000000000..304b0d66fe08fd1a29827488727702dd9b9bce3e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/string_split/d5606def44fdbb9385dd764612069db0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/dbac766f3160de65894bf5153f478146 b/tensorflow/core/kernels/fuzzing/corpus/string_split/dbac766f3160de65894bf5153f478146
new file mode 100644
index 0000000000000000000000000000000000000000..a8740444aa40ccf3249589ccce97568b15a822ae
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/dbac766f3160de65894bf5153f478146
@@ -0,0 +1 @@
+./, ?abcdef.ghabcd/efghab,cdefgh abcdef?ghabcd
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/e85ff62f6d457666f54a37a19a115a24 b/tensorflow/core/kernels/fuzzing/corpus/string_split/e85ff62f6d457666f54a37a19a115a24
new file mode 100644
index 0000000000000000000000000000000000000000..47d551466a4e76bb71bc7496cf1a8a30aa087809
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/e85ff62f6d457666f54a37a19a115a24
@@ -0,0 +1 @@
+./abc.def/gha.bcd/efg.hab/cde.fgh/abc.def/g
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/00fd47bf73afcb72e7ed51bffd5f5fec b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/00fd47bf73afcb72e7ed51bffd5f5fec
new file mode 100644
index 0000000000000000000000000000000000000000..f1410e184b23aacb36f8c741d3ada783b28aa75e
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/00fd47bf73afcb72e7ed51bffd5f5fec
@@ -0,0 +1 @@
+./abc./de./fg./ha./bc./de./fg./ha./bc./de./
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/14908973e6720513a5f37676cb9fcc29 b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/14908973e6720513a5f37676cb9fcc29
new file mode 100644
index 0000000000000000000000000000000000000000..e118d2d351b58910d496319e53fbfa78bf3d3ee4
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/14908973e6720513a5f37676cb9fcc29
@@ -0,0 +1 @@
+./, abcde./, fg./, ha./, bc./, de./, fg./, ha
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/2779ba7c4d23eee9f79efa3660084c5d b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/2779ba7c4d23eee9f79efa3660084c5d
new file mode 100644
index 0000000000000000000000000000000000000000..9a6c80919746dbb49b844bb1175de31c3026f9f5
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/2779ba7c4d23eee9f79efa3660084c5d
@@ -0,0 +1 @@
+./,abcd./,ef./,gh./,ab./,cd./,ef./,gh./,ab./
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/5bf16424630b5afbcffe711fb9834440 b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/5bf16424630b5afbcffe711fb9834440
new file mode 100644
index 0000000000000000000000000000000000000000..4cd522da7bf4b638331fbd5ef1514a6f81baaf5f
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/5bf16424630b5afbcffe711fb9834440
@@ -0,0 +1 @@
+.ab.cd.ef.gh.ab.cd.ef.gh.ab.cd.ef.gh.ab.cd
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/89734a96b93275e495a9498b806fafe1 b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/89734a96b93275e495a9498b806fafe1
new file mode 100644
index 0000000000000000000000000000000000000000..5301a91d8e4e48818f103eeee2cee58c2c2e0808
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/89734a96b93275e495a9498b806fafe1
@@ -0,0 +1 @@
+./, ?abcdef./, ?gh./, ?ab./, ?cd./, ?ef./, ?gh
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/d5606def44fdbb9385dd764612069db0 b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/d5606def44fdbb9385dd764612069db0
new file mode 100644
index 0000000000000000000000000000000000000000..304b0d66fe08fd1a29827488727702dd9b9bce3e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/d5606def44fdbb9385dd764612069db0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/2db83ea58639b6d7d585fa12e3947a82 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/2db83ea58639b6d7d585fa12e3947a82
new file mode 100644
index 0000000000000000000000000000000000000000..3de80927d5770479cb068d9cd5af9d8a8470dbdf
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/2db83ea58639b6d7d585fa12e3947a82
@@ -0,0 +1 @@
+6.023e+23
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/36b4a931886b941dc41180050d12ca94 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/36b4a931886b941dc41180050d12ca94
new file mode 100644
index 0000000000000000000000000000000000000000..d531129b2833e4ec7542a939268ae4cb0eeadeba
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/36b4a931886b941dc41180050d12ca94
@@ -0,0 +1 @@
+6.023e-23
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/50a2fabfdd276f573ff97ace8b11c5f4 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/50a2fabfdd276f573ff97ace8b11c5f4
new file mode 100644
index 0000000000000000000000000000000000000000..d81cc0710eb6cf9efd5b920a8453e1e07157b6cd
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/50a2fabfdd276f573ff97ace8b11c5f4
@@ -0,0 +1 @@
+42
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/62edb2a1eee34b001652cd86584becf2 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/62edb2a1eee34b001652cd86584becf2
new file mode 100644
index 0000000000000000000000000000000000000000..72f88139d0f639d92fe7869ac222267652d570d8
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/62edb2a1eee34b001652cd86584becf2
@@ -0,0 +1 @@
+0xabcdef
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/90013d1ec28c46a5c00574e60c70b6fc b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/90013d1ec28c46a5c00574e60c70b6fc
new file mode 100644
index 0000000000000000000000000000000000000000..c1113b83e8f16ef607af4427b77d90ed0bfec0b8
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/90013d1ec28c46a5c00574e60c70b6fc
@@ -0,0 +1 @@
+3.14159265359
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/94f3e3cee6957ce5815326d6788c85f4 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/94f3e3cee6957ce5815326d6788c85f4
new file mode 100644
index 0000000000000000000000000000000000000000..320aa3f00ee0abc68be7a71e18775e07a009f73f
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/94f3e3cee6957ce5815326d6788c85f4
@@ -0,0 +1 @@
+0.69314718056
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/96f547bc04bb913da0bc08915238ebd8 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/96f547bc04bb913da0bc08915238ebd8
new file mode 100644
index 0000000000000000000000000000000000000000..51b7b732f69d9bc2c3328a649aba0e7523f4dc92
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/96f547bc04bb913da0bc08915238ebd8
@@ -0,0 +1 @@
+6.023e23
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/d3a903d18fc11e1f35c572ad4da690ed b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/d3a903d18fc11e1f35c572ad4da690ed
new file mode 100644
index 0000000000000000000000000000000000000000..9a0be0764b639269e5bc669b91c00e0a14a9dc46
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/d3a903d18fc11e1f35c572ad4da690ed
@@ -0,0 +1 @@
+1.61803
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/e3b629c92af44260c189deb32d6f06f3 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/e3b629c92af44260c189deb32d6f06f3
new file mode 100644
index 0000000000000000000000000000000000000000..6a0e60d48b173a5049079e37e284315a97918b76
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/e3b629c92af44260c189deb32d6f06f3
@@ -0,0 +1 @@
+-42
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/f03eecf3bcfe4967a1888156a3115c8d b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/f03eecf3bcfe4967a1888156a3115c8d
new file mode 100644
index 0000000000000000000000000000000000000000..ea9cd255bc796c310f3cca79272597bc75c39cb5
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/f03eecf3bcfe4967a1888156a3115c8d
@@ -0,0 +1 @@
+6.023E+23
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/fa54ca9186f77122ae2a82684a062e16 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/fa54ca9186f77122ae2a82684a062e16
new file mode 100644
index 0000000000000000000000000000000000000000..00f1e2ed8ff84bc765220f9e7a939fb101cead10
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/fa54ca9186f77122ae2a82684a062e16
@@ -0,0 +1 @@
+2.71828182846
diff --git a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
deleted file mode 100644
index f3b24b2341e590adfbeac1a18b6a65fbfd34f598..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/fuzzing/decode_jpeg_fuzz.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2016 Google Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
-
-namespace tensorflow {
-namespace fuzzing {
-
-class FuzzDecodeJpeg : public FuzzStringInputOp {
-  SINGLE_INPUT_OP_BUILDER(DT_STRING, DecodeJpeg);
-};
-
-STANDARD_TF_FUZZ_FUNCTION(FuzzDecodeJpeg);
-
-}  // end namespace fuzzing
-}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/dictionaries/decode_json_example.dict b/tensorflow/core/kernels/fuzzing/dictionaries/decode_json_example.dict
new file mode 100644
index 0000000000000000000000000000000000000000..5fe4ca23d1f9403b6ac7fc3084c9165b55391caf
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/dictionaries/decode_json_example.dict
@@ -0,0 +1,6 @@
+"features"
+"feature"
+"bytes_list"
+"float_list"
+"int64_list"
+"value"
diff --git a/tensorflow/core/kernels/fuzzing/dictionaries/decode_png.dict b/tensorflow/core/kernels/fuzzing/dictionaries/decode_png.dict
new file mode 100644
index 0000000000000000000000000000000000000000..d795ae7f71ff5b0c54b96dd967b2a692753523ac
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/dictionaries/decode_png.dict
@@ -0,0 +1,50 @@
+header_87a="87a"
+header_89a="89a"
+header_gif="GIF"
+header_jfif="JFIF\x00"
+header_jfxx="JFXX\x00"
+header_png="\x89PNG\x0d\x0a\x1a\x0a"
+marker_2c=","
+marker_3b=";"
+section_2101="!\x01\x12"
+section_21f9="!\xf9\x04"
+section_21fe="!\xfe"
+section_21ff="!\xff\x11"
+section_IDAT="IDAT"
+section_IEND="IEND"
+section_IHDR="IHDR"
+section_PLTE="PLTE"
+section_bKGD="bKGD"
+section_cHRM="cHRM"
+section_fRAc="fRAc"
+section_ffc0="\xff\xc0"
+section_ffc2="\xff\xc2"
+section_ffc4="\xff\xc4"
+section_ffd0="\xff\xd0"
+section_ffd8="\xff\xd8"
+section_ffd9="\xff\xd9"
+section_ffda="\xff\xda"
+section_ffdb="\xff\xdb"
+section_ffdd="\xff\xdd"
+section_ffe0="\xff\xe0"
+section_ffe1="\xff\xe1"
+section_fffe="\xff\xfe"
+section_gAMA="gAMA"
+section_gIFg="gIFg"
+section_gIFt="gIFt"
+section_gIFx="gIFx"
+section_hIST="hIST"
+section_iCCP="iCCP"
+section_iTXt="iTXt"
+section_oFFs="oFFs"
+section_pCAL="pCAL"
+section_pHYs="pHYs"
+section_sBIT="sBIT"
+section_sCAL="sCAL"
+section_sPLT="sPLT"
+section_sRGB="sRGB"
+section_sTER="sTER"
+section_tEXt="tEXt"
+section_tIME="tIME"
+section_tRNS="tRNS"
+section_zTXt="zTXt"
diff --git a/tensorflow/core/kernels/fuzzing/dictionaries/decode_wav.dict b/tensorflow/core/kernels/fuzzing/dictionaries/decode_wav.dict
new file mode 100644
index 0000000000000000000000000000000000000000..eab65386ce33e0d9ffcf2ef213cdcaea2f5aa7ef
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/dictionaries/decode_wav.dict
@@ -0,0 +1,4 @@
+header_RIFF="RIFF"
+header_WAVE="WAVE"
+section_fmt="fmt "
+section_data="data"
diff --git a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
index a8f07f4bad3a7e7ccff4ebefd4c56c695d0b2573..b8d779fb1384b22b88a79e115fe413464fe6a7e3 100644
--- a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
@@ -19,7 +19,7 @@ limitations under the License.
 namespace tensorflow {
 namespace fuzzing {
 
-class FuzzEncodeBase64 : public FuzzSession {
+class FuzzEncodeBase64 : public FuzzStringInputOp {
   SINGLE_INPUT_OP_BUILDER(DT_STRING, EncodeBase64);
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
index 4d736a21602b34b560ea1c8d9ede4645d806ca29..5b029bf5ec0f20bb160ff7d0091d6a7fd3a627ed 100644
--- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
@@ -43,9 +43,9 @@ class FuzzExampleProtoFastParsing : public FuzzSession {
     std::vector<PartialTensorShape> dense_shapes;
     dense_shapes.push_back(PartialTensorShape());
 
-    std::ignore = ParseExample(scope.WithOpName("output"), in_expanded, names,
-                               sparse_keys, dense_keys, dense_defaults,
-                               sparse_types, dense_shapes);
+    (void)ParseExample(scope.WithOpName("output"), in_expanded, names,
+                       sparse_keys, dense_keys, dense_defaults, sparse_types,
+                       dense_shapes);
   }
 
   void FuzzImpl(const uint8_t* data, size_t size) final {
diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h
index f1f3f199df137b83193c4d1e974dfb401d9ec9ff..57d562ddf43142e47e5d52e4c0dfbbcbbb4bdfe0 100644
--- a/tensorflow/core/kernels/fuzzing/fuzz_session.h
+++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h
@@ -39,8 +39,7 @@ limitations under the License.
   void BuildGraph(const Scope& scope) override {                         \
     auto op_node =                                                       \
         tensorflow::ops::Placeholder(scope.WithOpName("input1"), dtype); \
-    std::ignore =                                                        \
-        tensorflow::ops::opName(scope.WithOpName("output"), op_node);    \
+    (void)tensorflow::ops::opName(scope.WithOpName("output"), op_node);  \
   }
 
 namespace tensorflow {
@@ -73,11 +72,11 @@ class FuzzSession {
   // By convention, the graph should have inputs named "input1", ...
   // "inputN", and one output node, named "output".
   // Users of FuzzSession should override this method to create their graph.
-  virtual void BuildGraph(const Scope& scope) {}
+  virtual void BuildGraph(const Scope& scope) = 0;
 
   // Implements the logic that converts an opaque byte buffer
   // from the fuzzer to Tensor inputs to the graph.  Users must override.
-  virtual void FuzzImpl(const uint8_t* data, size_t size) {}
+  virtual void FuzzImpl(const uint8_t* data, size_t size) = 0;
 
   // Initializes the FuzzSession.  Not safe for multithreading.
   // Separate init function because the call to virtual BuildGraphDef
diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
index ada1235449de595b6415b880c1883026c8578b0c..ab6812c5f1534426da15fbe73a282ddf21d02931 100644
--- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
@@ -27,7 +27,7 @@ class FuzzParseTensor : public FuzzSession {
     // The serialized proto.
     auto input = Placeholder(scope.WithOpName("input1"), DT_STRING);
 
-    std::ignore = ParseTensor(scope.WithOpName("output"), input, DT_FLOAT);
+    (void)ParseTensor(scope.WithOpName("output"), input, DT_FLOAT);
   }
 
   void FuzzImpl(const uint8_t* data, size_t size) final {
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
index 738d78e99a0081a2b9f0f59c94433372acec19e2..2564f8ed0303d1c80bad32181507eb678b18345b 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -25,8 +25,8 @@ class FuzzStringSplit : public FuzzSession {
         tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING);
     auto delimeter =
         tensorflow::ops::Placeholder(scope.WithOpName("input2"), DT_STRING);
-    std::ignore = tensorflow::ops::StringSplit(scope.WithOpName("output"),
-                                               input, delimeter);
+    (void)tensorflow::ops::StringSplit(scope.WithOpName("output"), input,
+                                       delimeter);
   }
 
   void FuzzImpl(const uint8_t* data, size_t size) final {
@@ -37,8 +37,7 @@ class FuzzStringSplit : public FuzzSession {
       // The spec for split is that the delimeter should be 0 or 1 characters.
       // Naturally, fuzz it with something larger.  (This omits the possibility
       // of handing it a > int32_max size string, which should be tested for in
-      // an
-      // explicit test).
+      // an explicit test).
       size_t delim_len = static_cast<size_t>(data[0]);
       if (delim_len > size) {
         delim_len = size - 1;
diff --git a/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..787bccc15ba3987edc64056bdad091d382b07500
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
@@ -0,0 +1,65 @@
+/* Copyright 2017 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzStringSplitV2 : public FuzzSession {
+  void BuildGraph(const Scope& scope) override {
+    auto input =
+        tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING);
+    auto separator =
+        tensorflow::ops::Placeholder(scope.WithOpName("input2"), DT_STRING);
+    (void)tensorflow::ops::StringSplitV2(scope.WithOpName("output"),
+                                               input, separator);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) final {
+    Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
+    Tensor separator_tensor(tensorflow::DT_STRING, TensorShape({}));
+
+    if (size > 0) {
+      // The spec for split is that the separator should be 0 or 1 characters.
+      // Naturally, fuzz it with something that might be larger. But don't split
+      // on a separator that is too large. Let's say we're picking a separator
+      // of size 0, 1, 2 up to MaxSepSize (a static limit that has been picked
+      // arbitrarily).
+      size_t sep_len = static_cast<size_t>(data[0]) % kMaxSepSize;
+
+      // We still have to handle the case when fuzzing input is shorter than the
+      // minimum length required to get the separator
+      if (sep_len > size) {
+        sep_len = size - 1;
+      }
+      separator_tensor.scalar<string>()() =
+          string(reinterpret_cast<const char*>(data), sep_len);
+      input_tensor.scalar<string>()() = string(
+          reinterpret_cast<const char*>(data + sep_len), size - sep_len);
+    }
+
+    RunTwoInputs(input_tensor, separator_tensor).IgnoreError();
+  }
+
+ private:
+  static const size_t kMaxSepSize = 4;
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzStringSplitV2);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl b/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl
index f752b59568a74f56c9b581651e54d1cab2af227f..e9322133590487356cf49700e5396e692cda3f04 100644
--- a/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl
+++ b/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl
@@ -1,13 +1,25 @@
 """Fuzzing template for TensorFlow ops."""
 
 def tf_ops_fuzz_target_lib(name):
-  native.cc_library(
-      name = name + "_fuzz_lib",
-      srcs = [name + "_fuzz.cc"],
-      deps = [
-          "//tensorflow/core/kernels/fuzzing:fuzz_session",
-          "//tensorflow/cc:cc_ops",
-      ],
-      tags = ["no_windows"],
-      alwayslink = 1,
-  )
+    native.cc_library(
+        name = name + "_fuzz_lib",
+        srcs = [name + "_fuzz.cc"],
+        deps = [
+            "//tensorflow/core/kernels/fuzzing:fuzz_session",
+            "//tensorflow/cc:cc_ops",
+        ],
+        tags = ["no_windows"],
+        alwayslink = 1,
+    )
+
+def tf_oss_fuzz_corpus(name):
+    native.filegroup(
+        name = name + "_corpus",
+        srcs = native.glob(["corpus/" + name + "/*"]),
+    )
+
+def tf_oss_fuzz_dict(name):
+    native.filegroup(
+        name = name + "_dict",
+        srcs = native.glob(["dictionaries/" + name + ".dict"]),
+    )
diff --git a/tensorflow/core/kernels/gather_functor_gpu.cu.cc b/tensorflow/core/kernels/gather_functor_gpu.cu.cc
index 4563fc635353e93f523ac035ff7f74bcd0410c09..5db12920ae5d325cc63fa487fcf40147b093f54c 100644
--- a/tensorflow/core/kernels/gather_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_functor_gpu.cu.cc
@@ -31,6 +31,8 @@ typedef Eigen::GpuDevice GPUDevice;
   DEFINE_GPU_SPECS_INDEX(T, int32); \
   DEFINE_GPU_SPECS_INDEX(T, int64);
 
+TF_CALL_bool(DEFINE_GPU_SPECS);
+TF_CALL_int32(DEFINE_GPU_SPECS);
 TF_CALL_int64(DEFINE_GPU_SPECS);
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 TF_CALL_complex64(DEFINE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 094504d6b95d4c0f9890e8eb455018477d4261a5..5795f68889e2393451c5cfae2fd29f14e8f9adce 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -153,6 +153,8 @@ TF_CALL_uint64(REGISTER_GATHER_CPU);
 // Registration of the GPU implementations.
 #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type)
 
+TF_CALL_bool(REGISTER_GATHER_GPU);
+TF_CALL_int32(REGISTER_GATHER_GPU);
 TF_CALL_int64(REGISTER_GATHER_GPU);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
 TF_CALL_complex64(REGISTER_GATHER_GPU);
diff --git a/tensorflow/core/kernels/gemm_functors.h b/tensorflow/core/kernels/gemm_functors.h
index 1c808440851d4c01ea61967bbb15d12fd9b857e2..97e077c096031e260d54dcfcccb03af097b0c71e 100644
--- a/tensorflow/core/kernels/gemm_functors.h
+++ b/tensorflow/core/kernels/gemm_functors.h
@@ -36,6 +36,10 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 // Apple provides an optimized BLAS library that is better than Eigen for their
 // devices, so use that if possible.
 #if defined(__APPLE__) && defined(USE_GEMM_FOR_CONV)
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 4870d9ae200cd55adc4833c044e5588aa1d6aa89..87d36f22d719ade68d17c6f4a2e6dc2deeef9e45 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -40,6 +40,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:remote_fused_graph_ops",
         "//tensorflow/core/kernels:reshape_op",
         "//tensorflow/core/kernels:softmax_op",
+        "@com_google_absl//absl/base",
     ],
 )
 
diff --git a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
index d53977703e4716561dc57ff1ad4ef8bd861eeea7..690d13c4e65924c9fdbe7fabdef88b0f7c0c26d5 100644
--- a/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
+++ b/tensorflow/core/kernels/hexagon/hexagon_graph_execution_test.cc
@@ -29,6 +29,7 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 
 #include <memory>
 
+#include "absl/base/casts.h"
 #include "tensorflow/core/framework/graph_transfer_info.pb.h"
 #include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
@@ -40,7 +41,6 @@ adb push /tmp/imagenet_comp_graph_label_strings.txt /data/local/tmp
 #include "tensorflow/core/kernels/i_remote_fused_graph_executor.h"
 #include "tensorflow/core/kernels/i_remote_fused_graph_ops_definitions.h"
 #include "tensorflow/core/kernels/quantization_utils.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -132,7 +132,7 @@ static void LoadImage(std::vector<float>* img_floats_ptr) {
   const int64 pixel_count = WIDTH * HEIGHT * DEPTH;
   CHECK(fsize >= 22 /* pos of height */ + sizeof(int));
   CHECK(bmp.data() != nullptr);
-  uint8* const img_bytes = bit_cast<uint8*>(bmp.data());
+  uint8* const img_bytes = absl::bit_cast<uint8*>(bmp.data());
   const int header_size = *(reinterpret_cast<int*>(img_bytes + 10));
   LOG(INFO) << "header size = " << header_size;
   const int size = *(reinterpret_cast<int*>(img_bytes + 14));
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
index 424fe5df3cafe43c012b496bf06743ec12e8f5fe..a14d4967a59f53668a4a4c7135e79ed046666edb 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.h
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -51,6 +51,12 @@ class InitializableLookupTable : public LookupInterface {
         "Insert not supported by InitializableLookupTable implementations");
   }
 
+  // Returns errors::Unimplemented.
+  Status Remove(OpKernelContext* ctx, const Tensor& keys) final {
+    return errors::Unimplemented(
+        "Remove not supported by InitializableLookupTable implementations");
+  }
+
   Status ExportValues(OpKernelContext* context) override {
     return errors::Unimplemented(
         "ExportValues not supported by InitializableLookupTable "
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 2363fbc246fc58b91969c1080e27e2dc366bd64f..7f06764d526e7bf49fea318a9d20eaaea6f45133 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -500,6 +500,9 @@ typedef Eigen::GpuDevice GPUDevice;
       Name("DeepCopy").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),      \
       CopyOp<GPUDevice>);
 
+REGISTER_KERNEL_BUILDER(
+    Name("InplaceUpdate").Device(DEVICE_GPU).TypeConstraint<bool>("T"),
+    InplaceOp<GPUDevice, functor::I_UPDATE>);
 REGISTER(float);
 REGISTER(double);
 REGISTER(Eigen::half);
diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
index 9d20239d2ddbf4e58f4ac1f1bf2ac0baad36f1a5..ba9879691b408d9455de1767d943240a0bab0190 100644
--- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
@@ -134,6 +134,26 @@ void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
   }
 }
 
+template <bool>
+void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  const int64 nelem = v.NumElements();
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d);
+  auto Ty = y->flat_outer_dims<bool>();
+  const int64 nrows = Ty.dimension(0);
+  const int64 ncols = Ty.dimension(1);
+  const int64 n = i.NumElements();
+  const bool* src = v.flat<bool>().data();
+  // TODO(sjhwang): Check that first dimension fits in int32 range.
+  const int32* rowids = i.flat<int32>().data();
+  bool* dst = y->flat<bool>().data();
+  if (op == I_UPDATE) {
+    DoInplaceOpKernel<bool, I_UPDATE>
+        <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+            cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+  }
+}
+
 template <>
 Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i,
                  const Tensor& v, Tensor* y) {
@@ -144,6 +164,7 @@ Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i,
     DoInplaceOp<type>(d, op, i, v, y); \
     break;
 
+    CASE(bool)
     CASE(float)
     CASE(double)
     CASE(Eigen::half)
@@ -165,6 +186,7 @@ Status DoCopy(const Device& d, const Tensor& x, Tensor* y) {
     y->flat<type>().device(d) = x.flat<type>(); \
     break;
 
+    CASE(bool)
     CASE(float)
     CASE(double)
     CASE(Eigen::half)
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 2088c13586630d5c98333d465afe6fc1174103b3..42fad1d4b053f84a7f5eaae4382f0a090ba628da 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -42,24 +42,32 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 TensorList::TensorList(const TensorList& other)
     : tensors(other.tensors),
       element_shape(other.element_shape),
-      element_dtype(other.element_dtype) {}
+      element_dtype(other.element_dtype),
+      max_num_elements(other.max_num_elements) {}
 
 void TensorList::Encode(VariantTensorData* data) const {
   data->set_type_name(TypeName());
-  for (const Tensor& t : tensors) {
-    *data->add_tensors() = t;
+  std::vector<size_t> invalid_indices;
+  for (size_t i = 0; i < tensors.size(); i++) {
+    if (tensors.at(i).dtype() != DT_INVALID) {
+      *data->add_tensors() = tensors.at(i);
+    } else {
+      invalid_indices.push_back(i);
+    }
   }
   string metadata;
-  core::PutVarint64(&metadata, static_cast<uint64>(element_dtype));
-  if (!element_shape.unknown_rank()) {
-    for (TensorShapeDim dim : element_shape) {
-      if (dim.size > 0) {
-        core::PutVarint64(&metadata, dim.size);
-      } else {
-        core::PutVarint64(&metadata, std::numeric_limits<uint64>::max());
-      }
-    }
+  // TODO(b/118838800): Add a proto for storing the metadata.
+  // Metadata format:
+  // <num_invalid_tensors><invalid_indices><element_dtype><element_shape_proto>
+  core::PutVarint64(&metadata, static_cast<uint64>(invalid_indices.size()));
+  for (size_t i : invalid_indices) {
+    core::PutVarint64(&metadata, static_cast<uint64>(i));
   }
+  core::PutVarint64(&metadata, static_cast<uint64>(element_dtype));
+  core::PutVarint64(&metadata, static_cast<uint64>(max_num_elements));
+  TensorShapeProto element_shape_proto;
+  element_shape.AsProto(&element_shape_proto);
+  element_shape_proto.AppendToString(&metadata);
   data->set_metadata(metadata);
 }
 
@@ -68,10 +76,14 @@ static Status TensorListDeviceCopy(
     const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) {
   to->element_shape = from.element_shape;
   to->element_dtype = from.element_dtype;
+  to->max_num_elements = from.max_num_elements;
   to->tensors.reserve(from.tensors.size());
   for (const Tensor& t : from.tensors) {
     Tensor tmp(t.dtype());
-    TF_RETURN_IF_ERROR(copy(t, &tmp));
+    // Do not copy uninitialized tensors.
+    if (t.dtype() != DT_INVALID) {
+      TF_RETURN_IF_ERROR(copy(t, &tmp));
+    }
     to->tensors.push_back(tmp);
   }
   return Status::OK();
@@ -95,23 +107,47 @@ Status TensorListShape(const TensorList& t, TensorShape* s) {
 REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(TensorList, TensorListShape);
 
 bool TensorList::Decode(const VariantTensorData& data) {
-  tensors = data.tensors();
+  // TODO(srbs): Change the signature to Decode(VariantTensorData data) so
+  // that we do not have to copy each tensor individually below. This would
+  // require changing VariantTensorData::tensors() as well.
   string metadata;
   data.get_metadata(&metadata);
   uint64 scratch;
   StringPiece iter(metadata);
+  std::vector<size_t> invalid_indices;
   core::GetVarint64(&iter, &scratch);
-  element_dtype = static_cast<DataType>(scratch);
-  std::vector<int64> dims;
-  while (!iter.empty()) {
+  size_t num_invalid_tensors = static_cast<size_t>(scratch);
+  invalid_indices.resize(num_invalid_tensors);
+  for (size_t i = 0; i < num_invalid_tensors; i++) {
     core::GetVarint64(&iter, &scratch);
-    if (scratch == std::numeric_limits<uint64>::max()) {
-      dims.push_back(-1);
+    invalid_indices[i] = static_cast<size_t>(scratch);
+  }
+
+  size_t total_num_tensors = data.tensors().size() + num_invalid_tensors;
+  tensors.reserve(total_num_tensors);
+  std::vector<size_t>::iterator invalid_indices_it = invalid_indices.begin();
+  std::vector<Tensor>::const_iterator tensors_it = data.tensors().begin();
+  for (size_t i = 0; i < total_num_tensors; i++) {
+    if (invalid_indices_it != invalid_indices.end() &&
+        *invalid_indices_it == i) {
+      tensors.emplace_back(Tensor(DT_INVALID));
+      invalid_indices_it++;
+    } else if (tensors_it != data.tensors().end()) {
+      tensors.emplace_back(*tensors_it);
+      tensors_it++;
     } else {
-      dims.push_back(scratch);
+      // VariantTensorData is corrupted.
+      return false;
     }
   }
-  element_shape = PartialTensorShape(dims);
+
+  core::GetVarint64(&iter, &scratch);
+  element_dtype = static_cast<DataType>(scratch);
+  core::GetVarint64(&iter, &scratch);
+  max_num_elements = static_cast<int>(scratch);
+  TensorShapeProto element_shape_proto;
+  element_shape_proto.ParseFromString(string(iter.data(), iter.size()));
+  element_shape = PartialTensorShape(element_shape_proto);
   return true;
 }
 
@@ -144,12 +180,19 @@ class EmptyTensorList : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
+    const Tensor& max_num_elements_t = ctx->input(1);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(max_num_elements_t.shape()),
+        errors::InvalidArgument(
+            "max_num_elements expected to be a scalar ",
+            "but got shape: ", max_num_elements_t.shape().DebugString()));
     Tensor* result;
     AllocatorAttributes attr;
     attr.set_on_host(true);
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result, attr));
     TensorList empty;
     empty.element_dtype = element_dtype_;
+    empty.max_num_elements = max_num_elements_t.scalar<int32>()();
     PartialTensorShape element_shape;
     OP_REQUIRES_OK(ctx, TensorShapeFromTensor(ctx->input(0), &element_shape));
     empty.element_shape = element_shape;
@@ -167,9 +210,11 @@ REGISTER_KERNEL_BUILDER(Name("EmptyTensorList").Device(DEVICE_CPU),
 
 #if GOOGLE_CUDA
 
-REGISTER_KERNEL_BUILDER(
-    Name("EmptyTensorList").Device(DEVICE_GPU).HostMemory("element_shape"),
-    EmptyTensorList);
+REGISTER_KERNEL_BUILDER(Name("EmptyTensorList")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("element_shape")
+                            .HostMemory("max_num_elements"),
+                        EmptyTensorList);
 
 #endif  // GOOGLE_CUDA
 
@@ -206,6 +251,14 @@ class TensorListPushBack : public OpKernel {
                                         " but list elements ",
                                         DataTypeString(l->element_dtype)));
 
+    if (l->max_num_elements != -1) {
+      OP_REQUIRES(
+          c, l->tensors.size() < l->max_num_elements,
+          errors::InvalidArgument("Tried to push item into a full list",
+                                  " list size: ", l->tensors.size(),
+                                  " max_num_elements: ", l->max_num_elements));
+    }
+
     TensorList output;
     output = *l;
     output.tensors.push_back(input);
@@ -274,13 +327,22 @@ class TensorListElementShape : public OpKernel {
                     "list. Saw: '",
                     c->input(0).scalar<Variant>()().DebugString(), "'"));
     Tensor* result;
-    OP_REQUIRES_OK(c, c->allocate_output(
-                          0, TensorShape{l->element_shape.dims()}, &result));
-    for (int i = 0; i < l->element_shape.dims(); ++i) {
+    if (l->element_shape.unknown_rank()) {
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &result));
       if (result->dtype() == DT_INT32) {
-        result->flat<int32>()(i) = l->element_shape.dim_size(i);
+        result->scalar<int32>()() = -1;
       } else {
-        result->flat<int64>()(i) = l->element_shape.dim_size(i);
+        result->scalar<int64>()() = -1;
+      }
+    } else {
+      OP_REQUIRES_OK(c, c->allocate_output(
+                            0, TensorShape{l->element_shape.dims()}, &result));
+      for (int i = 0; i < l->element_shape.dims(); ++i) {
+        if (result->dtype() == DT_INT32) {
+          result->flat<int32>()(i) = l->element_shape.dim_size(i);
+        } else {
+          result->flat<int64>()(i) = l->element_shape.dim_size(i);
+        }
       }
     }
   }
@@ -421,9 +483,19 @@ REGISTER_KERNEL_BUILDER(Name("TensorListGetItem").Device(DEVICE_CPU),
 
 #if GOOGLE_CUDA
 
-REGISTER_KERNEL_BUILDER(
-    Name("TensorListGetItem").Device(DEVICE_GPU).HostMemory("index"),
-    TensorListGetItem);
+#define REGISTER_TENSOR_LIST_GET_ITEM_GPU(T)                      \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("index"),               \
+                          TensorListGetItem);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+REGISTER_TENSOR_LIST_GET_ITEM_GPU(bfloat16)
+#undef REGISTER_TENSOR_LIST_GET_ITEM_GPU
 
 #endif  // GOOGLE_CUDA
 
@@ -449,9 +521,16 @@ class TensorListSetItem : public OpKernel {
                 errors::InvalidArgument("Trying to modify element ", index,
                                         " in a list with ", l->tensors.size(),
                                         " elements."));
+    const Tensor& value = c->input(2);
+    OP_REQUIRES(c, l->element_shape.IsCompatibleWith(value.shape()),
+                errors::InvalidArgument(
+                    "Tried to set a tensor with incompatible shape at a "
+                    "list index. Item element shape: ",
+                    value.shape().DebugString(),
+                    " list shape: ", l->element_shape.DebugString()));
     TensorList output;
     output = *l;
-    output.tensors[index] = c->input(2);
+    output.tensors[index] = value;
     Tensor* result;
     AllocatorAttributes attr;
     attr.set_on_host(true);
@@ -468,9 +547,19 @@ REGISTER_KERNEL_BUILDER(Name("TensorListSetItem").Device(DEVICE_CPU),
 
 #if GOOGLE_CUDA
 
-REGISTER_KERNEL_BUILDER(
-    Name("TensorListSetItem").Device(DEVICE_GPU).HostMemory("index"),
-    TensorListSetItem);
+#define REGISTER_TENSOR_LIST_SET_ITEM_GPU(T)                      \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSetItem")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("index"),               \
+                          TensorListSetItem);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+REGISTER_TENSOR_LIST_SET_ITEM_GPU(bfloat16)
+#undef REGISTER_TENSOR_LIST_SET_ITEM_GPU
 
 #endif  // GOOGLE_CUDA
 
@@ -591,7 +680,11 @@ REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
   REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
-                          TensorListGather<CPUDevice, T>)
+                          TensorListGather<CPUDevice, T>)         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListConcat<CPUDevice, T>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_STACK_CPU);
 REGISTER_TENSOR_LIST_STACK_CPU(quint8);
@@ -611,7 +704,11 @@ REGISTER_TENSOR_LIST_STACK_CPU(bfloat16);
   REGISTER_KERNEL_BUILDER(Name("TensorListScatter")               \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
-                          TensorListScatter<CPUDevice, T>)
+                          TensorListScatter<CPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListSplit<CPUDevice, T>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_CPU);
 REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint8);
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index a00bf700ca21ea2a69fdcc84815ca473375b333c..23f552642cac273cf53b25a6d43e1e6ca23ea0cc 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -45,7 +45,12 @@ typedef Eigen::GpuDevice GPUDevice;
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_GPU)                 \
                               .HostMemory("indices"),             \
-                          TensorListGather<GPUDevice, T>)
+                          TensorListGather<GPUDevice, T>)         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("lengths"),             \
+                          TensorListConcat<GPUDevice, T>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_STACK_GPU);
 REGISTER_TENSOR_LIST_STACK_GPU(bfloat16);
@@ -82,7 +87,13 @@ REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
                               .Device(DEVICE_GPU)                 \
                               .HostMemory("element_shape")        \
                               .HostMemory("indices"),             \
-                          TensorListScatter<GPUDevice, T>)
+                          TensorListScatter<GPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("element_shape")        \
+                              .HostMemory("lengths"),             \
+                          TensorListSplit<GPUDevice, T>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
 REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bfloat16);
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 72581c9293d619dcda32354b6b46a2b42d1eb156..686679474c40dc922683786cdfe65ffb3fbc03e2 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -30,6 +30,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/tensor_ops_util.h"
 #include "tensorflow/core/util/util.h"
 
 namespace tensorflow {
@@ -56,6 +58,9 @@ struct TensorList {
   std::vector<Tensor> tensors;
   PartialTensorShape element_shape;
   DataType element_dtype;
+  // The maximum allowed size of `tensors`. Defaults to -1 meaning that the size
+  // of `tensors` is unbounded.
+  int max_num_elements = -1;
 };
 
 Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out);
@@ -73,32 +78,51 @@ class TensorListStack : public OpKernel {
   ~TensorListStack() {}
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
+    const TensorList* tensor_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, tensor_list != nullptr,
                 errors::InvalidArgument(
                     "Input handle is not a list. Saw: '",
                     c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-    OP_REQUIRES(c, l->element_shape.IsFullyDefined(),
-                errors::InvalidArgument("Tried to stack elements from a list "
-                                        "with non-fully-defined shape: ",
-                                        l->element_shape.DebugString()));
+    OP_REQUIRES(
+        c, element_dtype_ == tensor_list->element_dtype,
+        errors::InvalidArgument(
+            "Invalid data types; op elements ", DataTypeString(element_dtype_),
+            " but list elements ", DataTypeString(tensor_list->element_dtype)));
+    OP_REQUIRES(
+        c,
+        !tensor_list->tensors.empty() ||
+            tensor_list->element_shape.IsFullyDefined(),
+        errors::InvalidArgument("Tried to stack elements of a empty ",
+                                "list with non-fully-defined shape: ",
+                                tensor_list->element_shape.DebugString()));
     if (num_elements_ != -1) {
-      OP_REQUIRES(c, l->tensors.size() == num_elements_,
-                  errors::InvalidArgument("Operation expected a list with ",
-                                          num_elements_,
-                                          " elements but got a list with ",
-                                          l->tensors.size(), " elements."));
+      OP_REQUIRES(c, tensor_list->tensors.size() == num_elements_,
+                  errors::InvalidArgument(
+                      "Operation expected a list with ", num_elements_,
+                      " elements but got a list with ",
+                      tensor_list->tensors.size(), " elements."));
     }
+    // Compute the shape of the output tensor.
+    // If `element_shape` is fully-defined it gets used. It is assumed that all
+    // element tensors have the same shape.
+    // If `element_shape` is not fully-defined the shape of the first element
+    // tensor is used and it is checked that all other tensors have the same
+    // shape.
     TensorShape resulting_shape;
-    resulting_shape.AddDim(l->tensors.size());
-    for (TensorShapeDim s : l->element_shape) {
-      resulting_shape.AddDim(s.size);
+    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
+      const Tensor& t = tensor_list->tensors[0];
+      resulting_shape = t.shape();
+      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
+        const Tensor& t = tensor_list->tensors[i];
+        OP_REQUIRES(c, t.shape() == resulting_shape,
+                    errors::InvalidArgument(
+                        "Tried to stack tensors with unequal shapes: ",
+                        resulting_shape.DebugString(), " vs ",
+                        t.shape().DebugString()));
+      }
     }
+    resulting_shape.InsertDim(0, tensor_list->tensors.size());
     Tensor* output;
     OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
     if (output->NumElements() == 0) {
@@ -106,14 +130,8 @@ class TensorListStack : public OpKernel {
     }
 
     ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(l->tensors.size());
-    for (const auto& t : l->tensors) {
-      OP_REQUIRES(
-          c, l->element_shape.IsCompatibleWith(t.shape()),
-          errors::InvalidArgument(
-              "Tensor with invalid shape in list. List element shape shape: ",
-              l->element_shape.DebugString(),
-              " and tensor shape: ", t.shape().DebugString()));
+    inputs_flat.reserve(tensor_list->tensors.size());
+    for (const auto& t : tensor_list->tensors) {
       inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
           t.shaped<T, 2>({1, t.NumElements()})));
     }
@@ -133,6 +151,200 @@ class TensorListStack : public OpKernel {
   DataType element_dtype_;
 };
 
+template <typename Device, typename T>
+class TensorListConcat : public OpKernel {
+ public:
+  using ConstMatrixVector =
+      std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>;
+  explicit TensorListConcat(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  ~TensorListConcat() {}
+
+  void Compute(OpKernelContext* c) override {
+    // Check that the input Variant tensor is indeed a TensorList and has the
+    // correct element type.
+    const TensorList* tensor_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, tensor_list != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(
+        c, element_dtype_ == tensor_list->element_dtype,
+        errors::InvalidArgument(
+            "Invalid data types; op elements ", DataTypeString(element_dtype_),
+            " but list elements ", DataTypeString(tensor_list->element_dtype)));
+    // If the TensorList is empty, its element_shape must be fully defined
+    // except for the first dimension.
+    PartialTensorShape shape_except_first_dim;
+    if (!tensor_list->element_shape.unknown_rank()) {
+      OP_REQUIRES(c, tensor_list->element_shape.dims() >= 1,
+                  errors::InvalidArgument(
+                      "Concat requires elements to be at least vectors, ",
+                      "found scalars instead."));
+      shape_except_first_dim = PartialTensorShape(
+          gtl::ArraySlice<int64>(tensor_list->element_shape.dim_sizes())
+              .subspan(1));
+    }
+    OP_REQUIRES(c,
+                !tensor_list->tensors.empty() ||
+                    shape_except_first_dim.IsFullyDefined(),
+                errors::InvalidArgument(
+                    "All except the first dimension must be fully defined ",
+                    "when concating an empty tensor list. element_shape: ",
+                    tensor_list->element_shape.DebugString()));
+    // 1. Compute the shape of the output tensor.
+    // If `shape_except_first_dim` is fully-defined we just prepend the leading
+    // dim to it. Otherwise we use the shape of the first element tensor and
+    // check to make sure shapes of all tensors are compatible.
+    TensorShape output_shape;
+    if (!shape_except_first_dim.AsTensorShape(&output_shape)) {
+      const Tensor& element_tensor = tensor_list->tensors[0];
+      OP_REQUIRES(
+          c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
+          errors::InvalidArgument("Concat saw a scalar shape at index ", 0,
+                                  " but requires at least vectors."));
+      output_shape =
+          TensorShape(gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
+                          .subspan(1));
+      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
+        const Tensor& element_tensor = tensor_list->tensors[i];
+        OP_REQUIRES(
+            c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
+            errors::InvalidArgument("Concat saw a scalar shape at index ", i,
+                                    " but requires at least vectors."));
+        TensorShape actual_shape(
+            gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
+                .subspan(1));
+        OP_REQUIRES(c, actual_shape.dim_sizes() == output_shape.dim_sizes(),
+                    errors::InvalidArgument(
+                        "Tried to concat tensors with unequal shapes: ",
+                        output_shape.DebugString(), " vs ",
+                        actual_shape.DebugString()));
+      }
+    }
+    // 2. Build the lengths_tensor and leading dim of the output tensor by
+    // iterating over all element tensors.
+    Tensor* lengths_tensor = nullptr;
+    OP_REQUIRES_OK(
+        c,
+        c->allocate_output(
+            1, TensorShape({static_cast<int64>(tensor_list->tensors.size())}),
+            &lengths_tensor));
+    auto lengths_tensor_vec = lengths_tensor->vec<int64>();
+    int64 leading_dim = 0;
+    for (size_t i = 0; i < tensor_list->tensors.size(); i++) {
+      int64 dim = tensor_list->tensors[i].shape().dim_size(0);
+      leading_dim += dim;
+      lengths_tensor_vec(i) = dim;
+    }
+    output_shape.InsertDim(0, leading_dim);
+    Tensor* output;
+    // 3. Allocate the output tensor and fill it up with the concated element
+    // tensors.
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+    if (output->NumElements() == 0) {
+      return;
+    }
+
+    ConstMatrixVector inputs_flat;
+    inputs_flat.reserve(tensor_list->tensors.size());
+    for (const auto& element_tensor : tensor_list->tensors) {
+      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+          element_tensor.shaped<T, 2>({1, element_tensor.NumElements()})));
+    }
+    auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
+
+#if GOOGLE_CUDA
+    if (std::is_same<Device, Eigen::GpuDevice>::value) {
+      ConcatGPU<T>(c, inputs_flat, output, &output_flat);
+      return;
+    }
+#endif  // GOOGLE_CUDA
+    ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+template <typename Device, typename T>
+class TensorListSplit : public OpKernel {
+ public:
+  TensorListSplit(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* c) override {
+    Tensor* output_tensor;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(c, c->allocate_output(0, {}, &output_tensor, attr));
+    PartialTensorShape element_shape;
+    OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(1), &element_shape));
+    OP_REQUIRES(c, element_shape.unknown_rank() || element_shape.dims() >= 1,
+                errors::InvalidArgument(
+                    "TensorListSplit requires element_shape to be at least of ",
+                    "rank 1, but saw: ", element_shape.DebugString()));
+    TensorList output_list;
+    const Tensor& input_tensor = c->input(0);
+    output_list.element_dtype = input_tensor.dtype();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()),
+                errors::InvalidArgument(
+                    "Tensor must be at least a vector, but saw shape: ",
+                    input_tensor.shape().DebugString()));
+    TensorShape tensor_shape_without_first_dim(input_tensor.shape());
+    tensor_shape_without_first_dim.RemoveDim(0);
+    PartialTensorShape element_shape_without_first_dim;
+    if (!element_shape.unknown_rank()) {
+      element_shape_without_first_dim =
+          PartialTensorShape(element_shape.dim_sizes());
+      element_shape_without_first_dim.RemoveDim(0);
+    }
+    OP_REQUIRES(c,
+                element_shape_without_first_dim.IsCompatibleWith(
+                    tensor_shape_without_first_dim),
+                errors::InvalidArgument(
+                    "tensor shape ", input_tensor.shape().DebugString(),
+                    " is not compatible with element_shape ",
+                    element_shape.DebugString()));
+    output_list.element_shape = element_shape;
+    const Tensor& lengths = c->input(2);
+    OP_REQUIRES(c, TensorShapeUtils::IsVector(lengths.shape()),
+                errors::InvalidArgument(
+                    "Expected lengths to be a vector, received shape: ",
+                    lengths.shape().DebugString()));
+    output_list.tensors.reserve(lengths.shape().dim_size(0));
+    int64 start = 0;
+    int64 end = 0;
+    for (int i = 0; i < lengths.shape().dim_size(0); ++i) {
+      int64 length = lengths.vec<int64>()(i);
+      OP_REQUIRES(
+          c, length >= 0,
+          errors::InvalidArgument("Invalid value in lengths: ", length));
+      end = start + length;
+      OP_REQUIRES(c, end <= input_tensor.shape().dim_size(0),
+                  errors::InvalidArgument("Attempting to slice [", start, ", ",
+                                          end, "] from tensor with length ",
+                                          input_tensor.shape().dim_size(0)));
+      Tensor tmp = input_tensor.Slice(start, end);
+      start = end;
+      // TODO(apassos) maybe not always align; but weird compiler bugs seem to
+      // prevent this.
+      Tensor aligned;
+      OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+      aligned.flat<T>().device(c->eigen_device<Device>()) =
+          tmp.unaligned_flat<T>();
+      output_list.tensors.emplace_back(aligned);
+    }
+    OP_REQUIRES(c, end == input_tensor.shape().dim_size(0),
+                errors::InvalidArgument(
+                    "Unused values in tensor. Length of tensor: ",
+                    input_tensor.shape().dim_size(0), " Values used: ", end));
+    output_tensor->scalar<Variant>()() = std::move(output_list);
+  }
+};
+
 template <typename Device, typename T>
 class TensorListGather : public OpKernel {
  public:
@@ -143,26 +355,51 @@ class TensorListGather : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
+    const TensorList* tensor_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, tensor_list != nullptr,
                 errors::InvalidArgument(
                     "Input handle is not a list. Saw: '",
                     c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-    OP_REQUIRES(c, l->element_shape.IsFullyDefined(),
-                errors::InvalidArgument("Tried to stack elements from a list "
-                                        "with non-fully-defined shape: ",
-                                        l->element_shape.DebugString()));
+    OP_REQUIRES(
+        c, element_dtype_ == tensor_list->element_dtype,
+        errors::InvalidArgument(
+            "Invalid data types; op elements ", DataTypeString(element_dtype_),
+            " but list elements ", DataTypeString(tensor_list->element_dtype)));
     Tensor indices = c->input(1);
+    OP_REQUIRES(
+        c,
+        indices.NumElements() > 0 ||
+            tensor_list->element_shape.IsFullyDefined(),
+        errors::InvalidArgument("Tried to gather 0-elements from "
+                                "a list with non-fully-defined shape: ",
+                                tensor_list->element_shape.DebugString()));
+    // Compute the shape of the output tensor.
+    // If `element_shape` is fully-defined it gets used. It is assumed that all
+    // requested tensors have the same shape.
+    // If `element_shape` is not fully-defined the shape of the first requested
+    // tensor is used and it is checked that all other tensors have the same
+    // shape.
     TensorShape resulting_shape;
-    resulting_shape.AddDim(indices.NumElements());
-    for (TensorShapeDim s : l->element_shape) {
-      resulting_shape.AddDim(s.size);
+    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
+      const int i = indices.flat<int32>()(0);
+      OP_REQUIRES(
+          c, i < tensor_list->tensors.size(),
+          errors::InvalidArgument("Index ", i, " out o range; list only has ",
+                                  tensor_list->tensors.size(), " elements."));
+      const Tensor& t = tensor_list->tensors[i];
+      resulting_shape = t.shape();
+      for (int index = 1; index < indices.NumElements(); ++index) {
+        const int i = indices.flat<int32>()(index);
+        const Tensor& t = tensor_list->tensors[i];
+        OP_REQUIRES(c, t.shape() == resulting_shape,
+                    errors::InvalidArgument(
+                        "Tried to gather elements with unequal shapes: ",
+                        resulting_shape.DebugString(), " vs ",
+                        t.shape().DebugString()));
+      }
     }
+    resulting_shape.InsertDim(0, indices.NumElements());
     Tensor* output;
     OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
     if (output->NumElements() == 0) {
@@ -170,19 +407,14 @@ class TensorListGather : public OpKernel {
     }
 
     ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(l->tensors.size());
+    inputs_flat.reserve(tensor_list->tensors.size());
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
       OP_REQUIRES(
-          c, i < l->tensors.size(),
+          c, i < tensor_list->tensors.size(),
           errors::InvalidArgument("Index ", i, " out o range; list only has ",
-                                  l->tensors.size(), " elements."));
-      const Tensor& t = l->tensors[i];
-      OP_REQUIRES(c, l->element_shape.IsCompatibleWith(t.shape()),
-                  errors::InvalidArgument(
-                      "Tensor with invalid shape in list. List element shape: ",
-                      l->element_shape.DebugString(),
-                      " and tensor shape: ", t.shape().DebugString()));
+                                  tensor_list->tensors.size(), " elements."));
+      const Tensor& t = tensor_list->tensors[i];
       inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
           t.shaped<T, 2>({1, t.NumElements()})));
     }
@@ -260,13 +492,13 @@ class TensorListScatter : public OpKernel {
     PartialTensorShape element_shape;
     OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(2), &element_shape));
     TensorList output_list;
-    const Tensor& t = c->input(0);
-    output_list.element_dtype = t.dtype();
-    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(t.shape()),
+    const Tensor& input_tensor = c->input(0);
+    output_list.element_dtype = input_tensor.dtype();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()),
                 errors::InvalidArgument(
                     "Tensor must be at least a vector, but saw shape: ",
-                    t.shape().DebugString()));
-    TensorShape output_shape(t.shape());
+                    input_tensor.shape().DebugString()));
+    TensorShape output_shape(input_tensor.shape());
     output_shape.RemoveDim(0);
     OP_REQUIRES(c, element_shape.IsCompatibleWith(output_shape),
                 errors::InvalidArgument(
@@ -276,11 +508,11 @@ class TensorListScatter : public OpKernel {
     output_list.tensors.reserve(indices.NumElements());
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
-      OP_REQUIRES(c, i < t.shape().dim_size(0),
-                  errors::InvalidArgument("Trying to scatter index ", i,
-                                          " from tensor with ",
-                                          t.shape().dim_size(0), " rows."));
-      Tensor tmp = t.Slice(i, i + 1);
+      OP_REQUIRES(c, i < input_tensor.shape().dim_size(0),
+                  errors::InvalidArgument(
+                      "Trying to scatter index ", i, " from tensor with ",
+                      input_tensor.shape().dim_size(0), " rows."));
+      Tensor tmp = input_tensor.Slice(i, i + 1);
       TensorShape tmp_shape = tmp.shape();
       tmp_shape.RemoveDim(0);
       OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
@@ -328,40 +560,10 @@ Status TensorListBinaryAdd(OpKernelContext* c, const TensorList& a,
   for (int i = 0; i < a.tensors.size(); ++i) {
     const Tensor& a_tensor = a.tensors[i];
     const Tensor& b_tensor = b.tensors[i];
-    if (a_tensor.dtype() == DT_INVALID) {
-      out->tensors.push_back(b_tensor);
-      continue;
-    }
-    if (b_tensor.dtype() == DT_INVALID) {
-      out->tensors.push_back(a_tensor);
-      continue;
-    }
-    if (a_tensor.shape() != b_tensor.shape()) {
-      // TODO(apassos) support broadcasting additions here?
-      return errors::InvalidArgument(
-          "Trying to add two tensors with incompatible element shapes. "
-          "One is ",
-          a_tensor.shape().DebugString(), " and the other is ",
-          b_tensor.shape().DebugString(), " in position ", i);
-    }
     Tensor out_tensor;
     TF_RETURN_IF_ERROR(
-        c->allocate_temp(a_tensor.dtype(), a_tensor.shape(), &out_tensor));
+        BinaryAddTensors<Device>(c, a_tensor, b_tensor, &out_tensor));
     out->tensors.push_back(out_tensor);
-    switch (out_tensor.dtype()) {
-#define DTYPE_CASE(dtype)                                        \
-  case DataTypeToEnum<dtype>::value:                             \
-    out_tensor.flat<dtype>().device(c->eigen_device<Device>()) = \
-        a_tensor.flat<dtype>() + b_tensor.flat<dtype>();         \
-    break;
-
-      TF_CALL_NUMBER_TYPES(DTYPE_CASE)
-
-#undef DTYPE_CASE
-      default:
-        return errors::InvalidArgument("Trying to add unsupported dtype ",
-                                       out_tensor.dtype());
-    }
   }
   return Status::OK();
 }
@@ -374,41 +576,7 @@ Status TensorListZerosLike(OpKernelContext* c, const TensorList& x,
   y->tensors.reserve(x.tensors.size());
   for (const Tensor& t : x.tensors) {
     Tensor out_tensor;
-    AllocatorAttributes attr;
-    if (t.dtype() == DT_VARIANT) {
-      attr.set_on_host(true);
-    }
-    TF_RETURN_IF_ERROR(
-        c->allocate_temp(t.dtype(), t.shape(), &out_tensor, attr));
-    switch (out_tensor.dtype()) {
-#define DTYPE_CASE(dtype)                                        \
-  case DataTypeToEnum<dtype>::value:                             \
-    out_tensor.flat<dtype>().device(c->eigen_device<Device>()) = \
-        out_tensor.flat<dtype>().constant(dtype(0));             \
-    break;
-
-      TF_CALL_POD_TYPES(DTYPE_CASE)
-
-#undef DTYPE_CASE
-
-      case DataTypeToEnum<Variant>::value: {
-        const TensorList* inner_x = t.scalar<Variant>()().get<TensorList>();
-        if (inner_x == nullptr) {
-          return errors::InvalidArgument("Input handle is not a list. Saw: '",
-                                         t.scalar<Variant>()().DebugString(),
-                                         "'");
-        }
-        TensorList inner_y;
-        TF_RETURN_IF_ERROR(TensorListZerosLike<Device>(c, *inner_x, &inner_y));
-        out_tensor.scalar<Variant>()() = std::move(inner_y);
-        break;
-      }
-
-      default:
-        return errors::InvalidArgument(
-            "Trying to compute zeros_like for unsupported dtype ",
-            DataTypeString(out_tensor.dtype()));
-    }
+    TF_RETURN_IF_ERROR(ZerosLikeTensor<Device>(c, t, &out_tensor));
     y->tensors.emplace_back(out_tensor);
   }
   return Status::OK();
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index 1ded012f3c7ca0dbabe59c3a50319b9fe00fe5fb..2599340d78a5308cbd63338db84e569f12541a4b 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <iostream>
+#include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -22,6 +23,31 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
+// If the following string is found at the beginning of an output stream, it
+// will be interpreted as a file path.
+const char kOutputStreamEscapeStr[] = "file://";
+
+// A mutex that guards appending strings to files.
+static mutex* file_mutex = new mutex();
+
+// Appends the given data to the specified file. It will create the file if it
+// doesn't already exist.
+Status AppendStringToFile(const std::string& fname, StringPiece data,
+                          Env* env) {
+  // TODO(ckluk): If opening and closing on every log causes performance issues,
+  // we can reimplement using reference counters.
+  mutex_lock l(*file_mutex);
+  std::unique_ptr<WritableFile> file;
+  TF_RETURN_IF_ERROR(env->NewAppendableFile(fname, &file));
+  Status a = file->Append(absl::StrCat(data, "\n"));
+  Status c = file->Close();
+  return a.ok() ? c : a;
+}
+
+}  // namespace
+
 class AssertOp : public OpKernel {
  public:
   explicit AssertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -52,6 +78,14 @@ class AssertOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("Assert").Device(DEVICE_CPU), AssertOp);
 
+#if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(Name("Assert")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("condition")
+                            .HostMemory("data"),
+                        AssertOp);
+#endif  // GOOGLE_CUDA
+
 class PrintOp : public OpKernel {
  public:
   explicit PrintOp(OpKernelConstruction* ctx)
@@ -96,6 +130,9 @@ class PrintV2Op : public OpKernel {
   explicit PrintV2Op(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_stream", &output_stream_));
 
+    SetFilePathIfAny();
+    if (!file_path_.empty()) return;
+
     auto output_stream_index =
         std::find(std::begin(valid_output_streams_),
                   std::end(valid_output_streams_), output_stream_);
@@ -115,6 +152,11 @@ class PrintV2Op : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("input", &input_));
     const string& msg = input_->scalar<string>()();
 
+    if (!file_path_.empty()) {
+      // Outputs to a file at the specified path.
+      OP_REQUIRES_OK(ctx, AppendStringToFile(file_path_, msg, ctx->env()));
+      return;
+    }
     if (output_stream_ == "stdout") {
       std::cout << msg << std::endl;
     } else if (output_stream_ == "stderr") {
@@ -131,15 +173,29 @@ class PrintV2Op : public OpKernel {
       for (auto valid_stream : valid_output_streams_) {
         strings::StrAppend(&error_msg, " ", valid_stream);
       }
+      strings::StrAppend(&error_msg, ", or file://<filename>");
       OP_REQUIRES(ctx, false, errors::InvalidArgument(error_msg));
     }
   }
 
-  const char* valid_output_streams_[6] = {"stdout", "stderr", "log(info)",
+  const char* valid_output_streams_[5] = {"stdout", "stderr", "log(info)",
                                           "log(warning)", "log(error)"};
 
  private:
+  // Either output_stream_ or file_path_ (but not both) will be non-empty.
   string output_stream_;
+  string file_path_;
+
+  // If output_stream_ is a file path, extracts it to file_path_ and clears
+  // output_stream_; otherwise sets file_paths_ to "".
+  void SetFilePathIfAny() {
+    if (absl::StartsWith(output_stream_, kOutputStreamEscapeStr)) {
+      file_path_ = output_stream_.substr(strlen(kOutputStreamEscapeStr));
+      output_stream_ = "";
+    } else {
+      file_path_ = "";
+    }
+  }
 };
 
 REGISTER_KERNEL_BUILDER(Name("PrintV2").Device(DEVICE_CPU), PrintV2Op);
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index a4957588617c4a63aab71d9e8a054c028b0eb061..3d1ee50c953b8a1979ff499a4c6da24c526d66f9 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -89,6 +89,16 @@ class MutableHashTableOfScalars final : public LookupInterface {
     return DoInsert(false, keys, values);
   }
 
+  Status Remove(OpKernelContext* ctx, const Tensor& keys) override {
+    const auto key_values = keys.flat<K>();
+
+    mutex_lock l(mu_);
+    for (int64 i = 0; i < key_values.size(); ++i) {
+      table_.erase(SubtleMustCopyIfIntegral(key_values(i)));
+    }
+    return Status::OK();
+  }
+
   Status ImportValues(OpKernelContext* ctx, const Tensor& keys,
                       const Tensor& values) override {
     return DoInsert(true, keys, values);
@@ -212,6 +222,16 @@ class MutableHashTableOfTensors final : public LookupInterface {
     return DoInsert(false, keys, values);
   }
 
+  Status Remove(OpKernelContext* ctx, const Tensor& keys) override {
+    const auto key_values = keys.flat<K>();
+
+    mutex_lock l(mu_);
+    for (int64 i = 0; i < key_values.size(); ++i) {
+      table_.erase(SubtleMustCopyIfIntegral(key_values(i)));
+    }
+    return Status::OK();
+  }
+
   Status ImportValues(OpKernelContext* ctx, const Tensor& keys,
                       const Tensor& values) override {
     return DoInsert(true, keys, values);
@@ -326,6 +346,29 @@ class MutableDenseHashTable final : public LookupInterface {
         empty_key_input->template shaped<K, 2>({1, key_shape_.num_elements()}),
         0);
 
+    const Tensor* deleted_key_input;
+    OP_REQUIRES_OK(ctx, ctx->input("deleted_key", &deleted_key_input));
+    OP_REQUIRES(ctx, key_shape_.IsSameSize(deleted_key_input->shape()),
+                errors::InvalidArgument(
+                    "Empty and deleted keys must have same shape, got shapes: ",
+                    key_shape_.DebugString(), " and ",
+                    deleted_key_input->shape().DebugString()));
+    deleted_key_ = PersistentTensor(*deleted_key_input);
+    deleted_key_hash_ = HashKey(deleted_key_input->template shaped<K, 2>(
+                                    {1, key_shape_.num_elements()}),
+                                0);
+
+    if (empty_key_hash_ == deleted_key_hash_) {
+      const int64 key_size = key_shape_.num_elements();
+      const auto empty_key_matrix =
+          empty_key_.AccessTensor(ctx)->template shaped<K, 2>({1, key_size});
+      const auto deleted_key_matrix =
+          deleted_key_.AccessTensor(ctx)->template shaped<K, 2>({1, key_size});
+      OP_REQUIRES(
+          ctx, !IsEqualKey(empty_key_matrix, 0, deleted_key_matrix, 0),
+          errors::InvalidArgument("Empty and deleted keys cannot be equal"));
+    }
+
     int64 initial_num_buckets;
     OP_REQUIRES_OK(ctx, GetNodeAttr(kernel->def(), "initial_num_buckets",
                                     &initial_num_buckets));
@@ -360,6 +403,8 @@ class MutableDenseHashTable final : public LookupInterface {
         value_buckets_.AccessTensor(ctx)->template matrix<V>();
     const auto empty_key_matrix =
         empty_key_.AccessTensor(ctx)->template shaped<K, 2>({1, key_size});
+    const auto deleted_key_matrix =
+        deleted_key_.AccessTensor(ctx)->template shaped<K, 2>({1, key_size});
     const int64 bit_mask = num_buckets_ - 1;
     // TODO(andreasst): parallelize using work_sharder
     for (int64 i = 0; i < num_elements; ++i) {
@@ -369,6 +414,11 @@ class MutableDenseHashTable final : public LookupInterface {
         return errors::InvalidArgument(
             "Using the empty_key as a table key is not allowed");
       }
+      if (deleted_key_hash_ == key_hash &&
+          IsEqualKey(deleted_key_matrix, 0, key_matrix, i)) {
+        return errors::InvalidArgument(
+            "Using the deleted_key as a table key is not allowed");
+      }
       int64 bucket_index = key_hash & bit_mask;
       int64 num_probes = 0;
       while (true) {
@@ -425,23 +475,40 @@ class MutableDenseHashTable final : public LookupInterface {
     return DoInsert(ctx, key, value, false);
   }
 
+  Status Remove(OpKernelContext* ctx, const Tensor& key) override
+      LOCKS_EXCLUDED(mu_) {
+    if (key.NumElements() != key.dim_size(0) * key_shape_.num_elements()) {
+      TensorShape expected_shape({key.dim_size(0)});
+      expected_shape.AppendShape(key_shape_);
+      return errors::InvalidArgument("Expected key shape ",
+                                     expected_shape.DebugString(), " got ",
+                                     key.shape().DebugString());
+    }
+    mutex_lock l(mu_);
+    return DoRemove(ctx, key);
+  }
+
   Status ImportValues(OpKernelContext* ctx, const Tensor& keys,
                       const Tensor& values) override LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
     num_buckets_ = keys.dim_size(0);
     key_buckets_ = PersistentTensor(keys);
     value_buckets_ = PersistentTensor(values);
-    // Count the number of keys that are not the empty_key. This requires
-    // iterating through the whole table but that is OK as we only execute it
-    // during checkpoint restore.
+    // Count the number of keys that are not the empty_key or deleted_key.
+    // This requires iterating through the whole table but that is OK as we
+    // only execute it during checkpoint restore.
     num_entries_ = 0;
     const auto empty_key_tensor =
         empty_key_.AccessTensor(ctx)->template shaped<K, 2>(
             {1, key_shape_.num_elements()});
+    const auto deleted_key_tensor =
+        deleted_key_.AccessTensor(ctx)->template shaped<K, 2>(
+            {1, key_shape_.num_elements()});
     const auto key_buckets_tensor =
         key_buckets_.AccessTensor(ctx)->template matrix<K>();
     for (int64 i = 0; i < num_buckets_; ++i) {
-      if (!IsEqualKey(key_buckets_tensor, i, empty_key_tensor, 0)) {
+      if (!IsEqualKey(key_buckets_tensor, i, empty_key_tensor, 0) &&
+          !IsEqualKey(key_buckets_tensor, i, deleted_key_tensor, 0)) {
         ++num_entries_;
       }
     }
@@ -498,7 +565,8 @@ class MutableDenseHashTable final : public LookupInterface {
 
  private:
   Status DoInsert(OpKernelContext* ctx, const Tensor& key, const Tensor& value,
-                  bool ignore_empty_key) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                  bool ignore_empty_and_deleted_key)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     const int64 num_elements = (key.dims() == 0) ? 1 : key.dim_size(0);
     const int64 value_size = value_shape_.num_elements();
     const int64 key_size = key_shape_.num_elements();
@@ -511,17 +579,27 @@ class MutableDenseHashTable final : public LookupInterface {
         value_buckets_.AccessTensor(ctx)->template matrix<V>();
     const auto empty_key_tensor =
         empty_key_.AccessTensor(ctx)->template shaped<K, 2>({1, key_size});
+    const auto deleted_key_tensor =
+        deleted_key_.AccessTensor(ctx)->template shaped<K, 2>({1, key_size});
     const int64 bit_mask = num_buckets_ - 1;
     for (int64 i = 0; i < num_elements; ++i) {
       const uint64 key_hash = HashKey(key_matrix, i);
       if (empty_key_hash_ == key_hash &&
           IsEqualKey(empty_key_tensor, 0, key_matrix, i)) {
-        if (ignore_empty_key) {
+        if (ignore_empty_and_deleted_key) {
           continue;
         }
         return errors::InvalidArgument(
             "Using the empty_key as a table key is not allowed");
       }
+      if (deleted_key_hash_ == key_hash &&
+          IsEqualKey(deleted_key_tensor, 0, key_matrix, i)) {
+        if (ignore_empty_and_deleted_key) {
+          continue;
+        }
+        return errors::InvalidArgument(
+            "Using the deleted_key as a table key is not allowed");
+      }
       int64 bucket_index = key_hash & bit_mask;
       int64 num_probes = 0;
       while (true) {
@@ -532,7 +610,9 @@ class MutableDenseHashTable final : public LookupInterface {
           }
           break;
         }
-        if (IsEqualKey(key_buckets_matrix, bucket_index, empty_key_tensor, 0)) {
+        if (IsEqualKey(key_buckets_matrix, bucket_index, empty_key_tensor, 0) ||
+            IsEqualKey(key_buckets_matrix, bucket_index, deleted_key_tensor,
+                       0)) {
           ++num_entries_;
           for (int64 j = 0; j < key_size; ++j) {
             key_buckets_matrix(bucket_index, j) =
@@ -556,6 +636,59 @@ class MutableDenseHashTable final : public LookupInterface {
     return Status::OK();
   }
 
+  Status DoRemove(OpKernelContext* ctx, const Tensor& key)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    const int64 num_elements = key.dim_size(0);
+    const int64 key_size = key_shape_.num_elements();
+    const auto key_matrix = key.shaped<K, 2>({num_elements, key_size});
+
+    auto key_buckets_matrix =
+        key_buckets_.AccessTensor(ctx)->template matrix<K>();
+    const auto empty_key_tensor =
+        empty_key_.AccessTensor(ctx)->template shaped<K, 2>({1, key_size});
+    const auto deleted_key_tensor =
+        deleted_key_.AccessTensor(ctx)->template shaped<K, 2>({1, key_size});
+    const auto deleted_key_flat =
+        deleted_key_.AccessTensor(ctx)->template flat<K>();
+    const int64 bit_mask = num_buckets_ - 1;
+    for (int64 i = 0; i < num_elements; ++i) {
+      const uint64 key_hash = HashKey(key_matrix, i);
+      if (empty_key_hash_ == key_hash &&
+          IsEqualKey(empty_key_tensor, 0, key_matrix, i)) {
+        return errors::InvalidArgument(
+            "Using the empty_key as a table key is not allowed");
+      }
+      if (deleted_key_hash_ == key_hash &&
+          IsEqualKey(deleted_key_tensor, 0, key_matrix, i)) {
+        return errors::InvalidArgument(
+            "Using the deleted_key as a table key is not allowed");
+      }
+      int64 bucket_index = key_hash & bit_mask;
+      int64 num_probes = 0;
+      while (true) {
+        if (IsEqualKey(key_buckets_matrix, bucket_index, key_matrix, i)) {
+          --num_entries_;
+          for (int64 j = 0; j < key_size; ++j) {
+            key_buckets_matrix(bucket_index, j) =
+                SubtleMustCopyIfIntegral(deleted_key_flat(j));
+          }
+          break;
+        }
+        if (IsEqualKey(key_buckets_matrix, bucket_index, empty_key_tensor, 0)) {
+          break;
+        }
+        ++num_probes;
+        bucket_index =
+            (bucket_index + num_probes) & bit_mask;  // quadratic probing
+        if (num_probes >= num_buckets_) {
+          return errors::Internal(
+              "Internal error in MutableDenseHashTable remove");
+        }
+      }
+    }
+    return Status::OK();
+  }
+
   Status AllocateBuckets(OpKernelContext* ctx, int64 new_num_buckets)
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     if (new_num_buckets < 4 ||
@@ -639,6 +772,8 @@ class MutableDenseHashTable final : public LookupInterface {
   PersistentTensor value_buckets_ GUARDED_BY(mu_);
   PersistentTensor empty_key_;
   uint64 empty_key_hash_;
+  PersistentTensor deleted_key_;
+  uint64 deleted_key_hash_;
 };
 
 }  // namespace lookup
@@ -717,6 +852,39 @@ REGISTER_KERNEL_BUILDER(Name("LookupTableInsert").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("LookupTableInsertV2").Device(DEVICE_CPU),
                         LookupTableInsertOp);
 
+// Table remove op.
+class LookupTableRemoveOp : public OpKernel {
+ public:
+  explicit LookupTableRemoveOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    lookup::LookupInterface* table;
+    OP_REQUIRES_OK(ctx, GetLookupTable("table_handle", ctx, &table));
+    core::ScopedUnref unref_me(table);
+
+    DataType expected_input_0 =
+        (ctx->input_dtype(0) == DT_RESOURCE) ? DT_RESOURCE : DT_STRING_REF;
+    DataTypeVector expected_inputs = {expected_input_0, table->key_dtype()};
+    OP_REQUIRES_OK(ctx, ctx->MatchSignature(expected_inputs, {}));
+
+    const Tensor& key = ctx->input(1);
+    OP_REQUIRES_OK(ctx, table->CheckKeyTensorForRemove(key));
+
+    int64 memory_used_before = 0;
+    if (ctx->track_allocations()) {
+      memory_used_before = table->MemoryUsed();
+    }
+    OP_REQUIRES_OK(ctx, table->Remove(ctx, key));
+    if (ctx->track_allocations()) {
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("LookupTableRemoveV2").Device(DEVICE_CPU),
+                        LookupTableRemoveOp);
+
 // Op that returns the size of the given table.
 class LookupTableSizeOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index b4252eb04446895a7c293b62473dba28a06845a1..f405ca3c58cfffc8422dcdd65e66c7fd12784519 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -26,6 +26,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
 #endif
diff --git a/tensorflow/core/kernels/lu_op.cc b/tensorflow/core/kernels/lu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9591d1bdf2fddea7b9d6265d4a8dd6c3f5f5df6
--- /dev/null
+++ b/tensorflow/core/kernels/lu_op.cc
@@ -0,0 +1,193 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/Eigen/LU"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Scalar, typename Tidx>
+class LuOp : public OpKernel {
+ public:
+  explicit LuOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ protected:
+  using TensorShapes = gtl::InlinedVector<TensorShape, 4>;
+  using TensorOutputs = gtl::InlinedVector<Tensor*, 4>;
+
+  using Matrix =
+      Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using ConstMatrixMap = Eigen::Map<const Matrix>;
+  using MatrixMap = Eigen::Map<Matrix>;
+
+  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+  using Indices =
+      Eigen::Matrix<Tidx, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using IndicesMap = Eigen::Map<Indices>;
+  using ConstIndicesMap = Eigen::Map<const Indices>;
+
+ public:
+  // Returns the cost per matrix operation. This is used to determine the
+  // number of threads to use for parallelizing factorization in batch mode.
+  // Cost per unit is assumed to be roughly 1ns, based on comments
+  // in core/util/work_sharder.cc.
+  // LU decomposition for a square matrix takes roughly (2/3) * (num_rows)^3.
+  // TODO(anudhyan): Refine this estimate after taking constant factors into
+  // account.
+  int64 GetCostPerUnit(const TensorShape& input_matrix_shape) const {
+    double num_rows = static_cast<double>(input_matrix_shape.dim_size(0));
+    double cost = (2 / 3.0) * MathUtil::IPow(num_rows, 3);
+    return cost >= static_cast<double>(kint64max) ? kint64max
+                                                  : static_cast<int64>(cost);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    OP_REQUIRES(context, context->num_inputs() == 1,
+                errors::InvalidArgument("Expecting exactly one input, got ",
+                                        context->num_inputs()));
+
+    const Tensor& input = context->input(0);
+    int input_rank = input.dims();
+    OP_REQUIRES(context, input_rank >= 2,
+                errors::InvalidArgument(
+                    "Input tensor must have rank >= 2, got ", input_rank));
+
+    // If the tensor rank is greater than 2, we consider the inner-most
+    // dimensions as matrices, and loop over all the other outer ("batch")
+    // dimensions to compute the results.
+    TensorShape input_matrix_shape;
+    TensorShape batch_shape;
+    for (int dim = 0; dim < input_rank - 2; ++dim) {
+      batch_shape.AddDim(input.dim_size(dim));
+    }
+    const int64 num_rows = input.dim_size(input_rank - 2);
+    const int64 num_cols = input.dim_size(input_rank - 1);
+
+    input_matrix_shape.AppendShape({num_rows, num_cols});
+    OP_REQUIRES(context, TensorShapeUtils::IsSquareMatrix(input_matrix_shape),
+                errors::InvalidArgument("Input matrix must be square."));
+
+    // packed_triangular_factors is a matrix with the same shape as the input;
+    // permutation is a vector.
+    TensorShape permutation_shape = batch_shape;
+    permutation_shape.AddDim(num_rows);
+
+    TensorShapes output_matrix_shapes({input.shape(), permutation_shape});
+
+    TensorOutputs outputs;
+    Tensor* output_packed_triangular_factors = nullptr;
+    OP_REQUIRES_OK(
+        context, context->forward_input_or_allocate_output(
+                     {0}, 0, input.shape(), &output_packed_triangular_factors));
+    outputs.emplace_back(output_packed_triangular_factors);
+
+    Tensor* output_permutation = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(1, permutation_shape,
+                                                     &output_permutation));
+    outputs.emplace_back(output_permutation);
+
+    if (num_rows == 0) {
+      return;
+    }
+
+    // Process the individual matrix problems in parallel using a threadpool.
+    auto shard = [this, &input, &num_rows, &num_cols, &outputs,
+                  &output_matrix_shapes, context](int64 begin, int64 end) {
+      for (int64 i = begin; i < end; ++i) {
+        ComputeTensorSlice(context, i, input, num_rows, num_cols, outputs,
+                           output_matrix_shapes);
+      }
+    };
+    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers,
+          batch_shape.num_elements(), GetCostPerUnit(input_matrix_shape),
+          shard);
+  }
+
+  void ComputeTensorSlice(OpKernelContext* context, int64 matrix_index,
+                          const Tensor& input, int64 num_rows, int64 num_cols,
+                          const TensorOutputs& outputs,
+                          const TensorShapes& output_matrix_shapes) {
+    // TODO(kalakris): Handle alignment if possible. Eigen::Map is
+    // unaligned by default.
+    ConstMatrixMap input_matrix(
+        input.flat<Scalar>().data() + matrix_index * num_rows * num_cols,
+        num_rows, num_cols);
+
+    // packed_triangular_factors has shape [num_rows, num_cols]
+    MatrixMap packed_triangular_factors(
+        outputs[0]->flat<Scalar>().data() + matrix_index * num_rows * num_cols,
+        num_rows, num_rows);
+
+    // permutation has shape [num_rows, 1]
+    IndicesMap permutation_indices(
+        outputs[1]->flat<Tidx>().data() + matrix_index * num_rows, num_rows, 1);
+
+    Eigen::PartialPivLU<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>
+        lu_decomposition(input_matrix);
+
+    // Output the packed triangular factors in a dense form.
+    // The lower triangular factor L corresponds to the strictly lower
+    // triangular part of packed_triangular_factors with an implicit unit
+    // diagonal. The upper triangular factor U is the upper triangular part of
+    // packed_triangular_factors. The triangular factors satisfy the equation
+    //     P * input_matrix = L * U
+    // where P is the permutation matrix corresponding to the indices in
+    // permutation_indices.
+    packed_triangular_factors = lu_decomposition.matrixLU();
+    // Output the permutation matrix used for pivoting.
+    Eigen::PermutationMatrix<-1, -1, Tidx> permutation =
+        lu_decomposition.permutationP().transpose();
+    permutation_indices = permutation.indices();
+
+    // PartialPivLU cannot give strong guarantees on invertibility,
+    // but we can at least guard against exact zero pivots. This can occur as
+    // a result of basic user mistakes such providing integer valued
+    // matrices that are exactly singular, or due to underflow if this
+    // code is run with denormals being flushed to zero.
+    const RealScalar min_abs_pivot =
+        packed_triangular_factors.diagonal().cwiseAbs().minCoeff();
+    OP_REQUIRES(context, min_abs_pivot > RealScalar(0),
+                errors::InvalidArgument("Input is not invertible."));
+  }
+};
+
+#define REGISTER_LU(type, idx_type)                                         \
+  REGISTER_KERNEL_BUILDER(Name("Lu")                                        \
+                              .Device(DEVICE_CPU)                           \
+                              .TypeConstraint<type>("T")                    \
+                              .TypeConstraint<idx_type>("output_idx_type"), \
+                          LuOp<type, idx_type>);
+
+REGISTER_LU(float, int32);
+REGISTER_LU(double, int32);
+REGISTER_LU(complex64, int32);
+REGISTER_LU(complex128, int32);
+
+REGISTER_LU(float, int64);
+REGISTER_LU(double, int64);
+REGISTER_LU(complex64, int64);
+REGISTER_LU(complex128, int64);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lu_op_gpu.cu.cc b/tensorflow/core/kernels/lu_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f83744b50de5ca7fd247b17e3fcac52889f5f288
--- /dev/null
+++ b/tensorflow/core/kernels/lu_op_gpu.cu.cc
@@ -0,0 +1,275 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/cuda_solvers.h"
+#include "tensorflow/core/kernels/transpose_functor.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace {
+template <typename Scalar>
+__device__ void ComputePermutationFromTranspositions(
+    int64 num_rows, const int* pivots, Scalar* permutation_indices) {
+  // Fill in the output array with the identity permutation.
+  for (int i = 0; i < num_rows; ++i) {
+    permutation_indices[i] = Scalar(i);
+  }
+
+  // Compute the permutation from a sequence of transpositions encoded
+  // in the pivot array by applying the transpositions in order on the
+  // identity permutation.
+  for (int i = 0; i < num_rows; ++i) {
+    // Note: Internally, the cuBlas code uses Fortran convention (1-based)
+    // indexing so ith row was swapped with (pivots[i]-1)'th row in 0-based
+    // indexing.
+    Scalar t = permutation_indices[i];
+    permutation_indices[i] = permutation_indices[pivots[i] - 1];
+    permutation_indices[pivots[i] - 1] = t;
+  }
+}
+}  // namespace
+
+// Kernel to compute the inverse of a permutation from a sequence of
+// transpositions.
+template <typename Scalar>
+__global__ void ComputePermutationFromTranspositionsKernel(
+    CudaLaunchConfig config, const int64 num_rows, const int* all_pivots,
+    Scalar* all_permutation_indices) {
+  // We only parallelize over batches here. Performance is not critical,
+  // since this cheap O(num_rows) kernel always follows an O(num_rows^3)
+  // LU factorization.
+  CUDA_1D_KERNEL_LOOP(index, config.virtual_thread_count) {
+    ComputePermutationFromTranspositions(
+        num_rows, all_pivots + index * num_rows,
+        all_permutation_indices + index * num_rows);
+  }
+}
+
+template <class Scalar, class Tidx>
+class LuOpGpu : public AsyncOpKernel {
+ public:
+  explicit LuOpGpu(OpKernelConstruction* context) : AsyncOpKernel(context) {}
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+    const Tensor& input = context->input(0);
+
+    // Analyze shape and validate inputs.
+    const int input_rank = input.dims();
+
+    OP_REQUIRES_ASYNC(
+        context, input_rank >= 2,
+        errors::InvalidArgument("Input must have rank >= 2, got ", input_rank),
+        done);
+
+    const int64 num_rows = input.dim_size(input_rank - 2);
+    const int64 num_cols = input.dim_size(input_rank - 1);
+
+    OP_REQUIRES_ASYNC(
+        context, num_rows == num_cols,
+        errors::InvalidArgument("Input matrices must be squares, got", num_rows,
+                                " != ", num_cols),
+        done);
+
+    TensorShape batch_shape;
+    for (int dim = 0; dim < input_rank - 2; ++dim) {
+      batch_shape.AddDim(input.dim_size(dim));
+    }
+    TensorShape permutation_indices_shape = batch_shape;
+    permutation_indices_shape.AddDim(num_rows);
+
+    const GPUDevice& device = context->eigen_device<GPUDevice>();
+    auto solver = absl::make_unique<CudaSolver>(context);
+
+    // We output the packed triangular factors in a dense form.
+    // The lower triangular factor L corresponds to the strictly lower
+    // triangular part of packed_triangular_factors with an implicit unit
+    // diagonal. The upper triangular factor U is the upper triangular part of
+    // packed_triangular_factors. The triangular factors satisfy the equation
+    //     P * input_matrix = L * U
+    // where P is the permutation matrix corresponding to the indices in
+    // permutation_indices.
+    //
+    // Reuse the input buffer or make a copy for the factorization step,
+    // depending on whether this ops owns it exclusively.
+    Tensor* packed_triangular_factors;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->forward_input_or_allocate_output(
+                             {0}, 0, input.shape(), &packed_triangular_factors),
+                         done);
+    if (!packed_triangular_factors->SharesBufferWith(input)) {
+      device.memcpy(packed_triangular_factors->flat<Scalar>().data(),
+                    input.flat<Scalar>().data(),
+                    input.NumElements() * sizeof(Scalar));
+    }
+
+    // Allocate output permutation.
+    Tensor* permutation_indices = nullptr;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->allocate_output(1, permutation_indices_shape,
+                                                  &permutation_indices),
+                         done);
+
+    if (input.NumElements() == 0) {
+      done();
+      return;
+    }
+
+    // Allocate a temporary Tensor to store the transposed packed triangular
+    // factors.
+    Tensor packed_triangular_factors_transpose;
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->allocate_temp(DataTypeToEnum<Scalar>::value, input.shape(),
+                               &packed_triangular_factors_transpose),
+        done);
+    auto packed_triangular_factors_transpose_reshaped =
+        packed_triangular_factors_transpose
+            .template flat_inner_dims<Scalar, 3>();
+    const int64 batch_size =
+        packed_triangular_factors_transpose_reshaped.dimension(0);
+
+    // Allocate pivots on the device.
+    Tensor pivots;
+    OP_REQUIRES_OK_ASYNC(context,
+                         solver->allocate_scoped_tensor(
+                             DataTypeToEnum<int32>::value,
+                             TensorShape{batch_size, num_rows}, &pivots),
+                         done);
+    auto pivots_mat = pivots.template matrix<int32>();
+
+    // Transpose the input. This is necessary because cuBLAS assumes
+    // column-major storage while TensorFlow uses row-major.
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        DoMatrixTranspose(device, *packed_triangular_factors,
+                          &packed_triangular_factors_transpose),
+        done);
+
+    std::vector<DeviceLapackInfo> dev_info;
+    if (num_rows == num_cols && num_rows / batch_size <= 128) {
+      // For small matrices or large batch sizes, we use the batched
+      // interface from cuBlas.
+      auto packed_triangular_factors_ptrs = solver->GetScratchSpace<uint8>(
+          sizeof(Scalar*) * batch_size, "packed_triangular_factors_ptrs",
+          /* on_host */ true);
+      const Scalar** packed_triangular_factors_ptrs_base =
+          reinterpret_cast<const Scalar**>(
+              packed_triangular_factors_ptrs.mutable_data());
+      for (int batch = 0; batch < batch_size; ++batch) {
+        packed_triangular_factors_ptrs_base[batch] =
+            &packed_triangular_factors_transpose_reshaped(batch, 0, 0);
+      }
+      dev_info.push_back(
+          solver->GetDeviceLapackInfo(batch_size, "getrfBatched"));
+      OP_REQUIRES_OK_ASYNC(
+          context,
+          solver->GetrfBatched(num_rows, packed_triangular_factors_ptrs_base,
+                               num_rows, pivots_mat.data(), &dev_info.back(),
+                               batch_size),
+          done);
+    } else {
+      // For small batch sizes we use the non-batched interface from cuSolver,
+      // which is much faster for large matrices.
+      dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrf"));
+      for (int batch = 0; batch < batch_size; ++batch) {
+        OP_REQUIRES_OK_ASYNC(
+            context,
+            solver->Getrf(
+                num_rows, num_cols,
+                &packed_triangular_factors_transpose_reshaped(batch, 0, 0),
+                num_rows, &pivots_mat(batch, 0), &dev_info.back()(batch)),
+            done);
+      }
+    }
+
+    // Transpose the result since we had transposed the input.
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        DoMatrixTranspose(device, packed_triangular_factors_transpose,
+                          packed_triangular_factors),
+        done);
+
+    // Pivots encode the permutation of the rows as a sequences of row swaps.
+    // For each index i, row i is swapped with row pivots[i].
+    int* pivots_ptr = pivots.flat<int>().data();
+    Tidx* permutation_indices_ptr =
+        permutation_indices->template flat<Tidx>().data();
+    CudaLaunchConfig cfgPivots = GetCudaLaunchConfig(batch_size, device);
+    ComputePermutationFromTranspositionsKernel<<<cfgPivots.block_count,
+                                                 cfgPivots.thread_per_block, 0,
+                                                 device.stream()>>>(
+        cfgPivots, num_rows, pivots_ptr, permutation_indices_ptr);
+
+    // Callback for checking info after kernels finish. Also capture the
+    // temporary Tensors/ScratchSpace so they don't get deallocated before the
+    // kernels run.
+    // TODO(rmlarsen): Use move capture once C++14 becomes available.
+    auto info_checker = [context, done, dev_info](
+                            const Status& status,
+                            const std::vector<HostLapackInfo>& host_infos) {
+      if (!status.ok() && errors::IsInvalidArgument(status) &&
+          !host_infos.empty()) {
+        for (int i = 0; i < host_infos[0].size(); ++i) {
+          // Match the CPU error message for singular matrices. Otherwise
+          // just print the original error message from the status below.
+          OP_REQUIRES_ASYNC(context, host_infos[0].data()[i] <= 0,
+                            errors::InvalidArgument("Input is not invertible."),
+                            done);
+        }
+      }
+      OP_REQUIRES_OK_ASYNC(context, status, done);
+      done();
+    };
+
+    CudaSolver::CheckLapackInfoAndDeleteSolverAsync(std::move(solver), dev_info,
+                                                    std::move(info_checker));
+  }
+};
+
+#define REGISTER_LU_GPU(type, idx_type)                                     \
+  REGISTER_KERNEL_BUILDER(Name("Lu")                                        \
+                              .Device(DEVICE_GPU)                           \
+                              .TypeConstraint<type>("T")                    \
+                              .TypeConstraint<idx_type>("output_idx_type"), \
+                          LuOpGpu<type, idx_type>);
+
+REGISTER_LU_GPU(float, int32);
+REGISTER_LU_GPU(double, int32);
+REGISTER_LU_GPU(complex64, int32);
+REGISTER_LU_GPU(complex128, int32);
+
+REGISTER_LU_GPU(float, int64);
+REGISTER_LU_GPU(double, int64);
+REGISTER_LU_GPU(complex64, int64);
+REGISTER_LU_GPU(complex128, int64);
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index 4ad390a4116d10a38943a3f628ede3581cb2e0d9..4ebe165937055ad3773f5f3ae6ab66b71c6a63af 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -580,13 +580,16 @@ struct MatMulFunctor<SYCLDevice, T> {
 
 #if defined(INTEL_MKL) && defined(ENABLE_MKL)
 
-// MKL does not support half, bfloat16 and int32 types for
+// MKL supports float, double, complex64 and complex128 types for
+// matrix-multiplication, and these kernels are registered in mkl_matmul_op.cc.
+// MKL does not support half, bfloat16, int32 and int64 types for
 // matrix-multiplication, so register the kernel to use default Eigen based
 // implementations for these types. REGISTER_CPU defines two versions - Eigen
 // label and NO-LABEL
 TF_CALL_half(REGISTER_CPU);
 TF_CALL_bfloat16(REGISTER_CPU);
 TF_CALL_int32(REGISTER_CPU);
+TF_CALL_int64(REGISTER_CPU);
 
 // Float is supported in both MKL DNN as well as in MKL ML
 // Registration for NO-LABEL version is in mkl_matmul_op.cc for types supported
@@ -614,6 +617,7 @@ TF_CALL_double(REGISTER_CPU);
 TF_CALL_half(REGISTER_CPU);
 TF_CALL_bfloat16(REGISTER_CPU);
 TF_CALL_int32(REGISTER_CPU);
+TF_CALL_int64(REGISTER_CPU);
 TF_CALL_complex64(REGISTER_CPU);
 TF_CALL_complex128(REGISTER_CPU);
 #endif  // INTEL_MKL && ENABLE_MKL
diff --git a/tensorflow/core/kernels/matmul_op.h b/tensorflow/core/kernels/matmul_op.h
index 4b74a64025a19bbac1053efb6081347358fdc0c6..48769f3fe5d6eb4d5bb2856f9dd027253ebd8582 100644
--- a/tensorflow/core/kernels/matmul_op.h
+++ b/tensorflow/core/kernels/matmul_op.h
@@ -21,6 +21,10 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 namespace functor {
 
diff --git a/tensorflow/core/kernels/matrix_square_root_op.cc b/tensorflow/core/kernels/matrix_square_root_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fe3d3043c26069a07160ac630bc91f5686ac1d5e
--- /dev/null
+++ b/tensorflow/core/kernels/matrix_square_root_op.cc
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/MatrixFunctions"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+template <class Scalar>
+class MatrixSquareRootOp : public LinearAlgebraOp<Scalar> {
+ public:
+  INHERIT_LINALG_TYPEDEFS(Scalar);
+
+  explicit MatrixSquareRootOp(OpKernelConstruction* context) : Base(context) {}
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    const ConstMatrixMap& input = inputs[0];
+    if (input.rows() == 0) return;
+    using Matrix =
+        Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    Matrix tmp = input;
+    outputs->at(0) = tmp.sqrt();
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixSquareRootOp);
+};
+
+REGISTER_LINALG_OP("MatrixSquareRoot", (MatrixSquareRootOp<float>), float);
+REGISTER_LINALG_OP("MatrixSquareRoot", (MatrixSquareRootOp<double>), double);
+REGISTER_LINALG_OP("MatrixSquareRoot", (MatrixSquareRootOp<complex64>),
+                   complex64);
+REGISTER_LINALG_OP("MatrixSquareRoot", (MatrixSquareRootOp<complex128>),
+                   complex128);
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 0c7a236b2ff0f0b5c6287d1dffb1e8ef9bac7cc0..56d0340547a891fe4929bd6a36a72c5e03d1d1e0 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -384,6 +384,7 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
     int32* top_data, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
+  if (output_size == 0) return true;
   MaxPoolForwardNoMaskKernel_NCHW_VECT_C<<<
       (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
       0, d.stream()>>>(output_size, bottom_data, height, width, channels,
@@ -402,6 +403,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
     int64* mask, const Eigen::GpuDevice& d, bool propagate_nans) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
+  if (output_size == 0) return true;
   if (propagate_nans) {
     MaxPoolForwardNHWC<true>
         <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
@@ -430,6 +432,7 @@ bool MaxPoolBackwardNoMask<T>::operator()(
   const int kThreadsPerBlock = 1024;
 
   const int bottom_size = batch * channels * height * width;
+  if (bottom_size == 0) return true;
   SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
             kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
 
@@ -449,6 +452,7 @@ bool MaxPoolBackwardWithArgmax<T>::operator()(
     const int64* mask, const int top_offset, const int bottom_offset,
     T* bottom_diff, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
+  if (input_size == 0) return true;
   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
             kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
   MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
@@ -466,6 +470,7 @@ bool MaxPoolGradBackwardNoMask<T>::operator()(
     const int pad_l, const T* top_diff, T* bottom_diff,
     const Eigen::GpuDevice& d) {
   const int num_kernels = batch * channels * pooled_height * pooled_width;
+  if (num_kernels == 0) return true;
   CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
 
   if (data_format == FORMAT_NHWC) {
@@ -489,6 +494,7 @@ bool MaxPoolGradBackwardWithArgmax<T>::operator()(
     const int output_size, const int input_size, const T* top_diff,
     const int64* mask, const int top_offset, const int bottom_offset,
     T* bottom_diff, const Eigen::GpuDevice& d) {
+  if (input_size == 0) return true;
   CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
   MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
                         d.stream()>>>(output_size, top_diff, mask, top_offset,
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 20aa1f7ea1f81f94155147a5623aaee0c188e49a..8eb334f2b497ea2c7d2d10d3007d30ed5a8adb5e 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -24,268 +24,14 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
 
-#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
+#include "tensorflow/core/util/mkl_util.h"
 using mkldnn::stream;
 using mkldnn::sum;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, typename T>
-class MklAddNOp : public OpKernel {
- public:
-  explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    const int num = ctx->num_inputs();
-    OP_REQUIRES(ctx, num / 2 == 2,
-                errors::InvalidArgument("Only additions of two tensors "
-                                        "supported by MKL. Num inputs: ",
-                                        num));
-
-    MklAddNOpContext mkl_context;
-    size_t src1_idx = 0, src2_idx = 1;
-    const Tensor& input0 = MklGetInput(ctx, src1_idx);
-    GetMklShape(ctx, src1_idx, &(mkl_context.input1_shape));
-    bool input1_in_mkl_format = mkl_context.input1_shape.IsMklTensor();
-
-    const Tensor& input1 = MklGetInput(ctx, src2_idx);
-    GetMklShape(ctx, src2_idx, &(mkl_context.input2_shape));
-    bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();
-
-    // if the shapes of two tensors are not same raise op error
-    TensorShape src1_shape, src2_shape;
-    src1_shape = input0.shape();
-    src2_shape = input1.shape();
-    if (!src1_shape.IsSameSize(src2_shape)) {
-      ctx->SetStatus(errors::InvalidArgument(
-          "Inputs to operation ", this->name(), " of type ",
-          this->type_string(), " must have the same size and shape.  Input 0: ",
-          src1_shape.DebugString(), " != input 1: ", src2_shape.DebugString()));
-    }
-    // handle the case of a scalar
-    if (!input1_in_mkl_format && input0.dims() == 0) {
-      const TensorShape& o_shape = input0.shape();
-      Tensor* out_tensor = nullptr;
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
-                                mkl_context.output_shape);
-      float user_i1 = (input0.scalar<T>()());
-      float user_i2 = (input1.scalar<T>()());
-      out_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
-      return;
-    }
-
-    mkl_context.in_dims = input1_in_mkl_format
-                              ? mkl_context.input1_shape.GetDimension()
-                              : input0.dims();
-    mkl_context.in_dims = input2_in_mkl_format
-                              ? mkl_context.input2_shape.GetDimension()
-                              : input1.dims();
-
-    // If there is nothing to compute, return.
-    if (!input1_in_mkl_format && !input2_in_mkl_format) {
-      const TensorShape& o_shape = input0.shape();
-      if (o_shape.num_elements() == 0) {
-        Tensor* out_tensor = nullptr;
-        mkl_context.output_shape.SetMklTensor(false);
-        AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
-                                  mkl_context.output_shape);
-        return;
-      }
-    }
-
-    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-    mkl_context.in_strides = new size_t[mkl_context.in_dims];
-    // Generate size, stride for input if input is in MKL format.
-    if (input1_in_mkl_format || input2_in_mkl_format) {
-      const MklShape* tmp_mkl_shape = (input1_in_mkl_format)
-                                          ? &mkl_context.input1_shape
-                                          : &mkl_context.input2_shape;
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
-        mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
-      }
-    } else {
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] =
-            input0.dim_size((mkl_context.in_dims - 1) - i);
-      }
-      mkl_context.in_strides[0] = 1;
-      for (int i = 1; i < mkl_context.in_dims; i++) {
-        mkl_context.in_strides[i] =
-            mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
-      }
-    }
-    std::vector<float> coeff(2, 1.0);
-    mkl_context.MklCreateInputLayouts(ctx);
-    CHECK_EQ(dnnSumCreate_F32(&mkl_context.Eltwise, mkl_context.attributes, 2,
-                              mkl_context.lt_input1, &coeff[0]),
-             E_SUCCESS);
-
-    Tensor mkl_tmp_input1_buf_tensor, mkl_tmp_input2_buf_tensor;
-    mkl_context.MklPrepareAddNInputs(ctx, &mkl_tmp_input1_buf_tensor,
-                                     &mkl_tmp_input2_buf_tensor);
-    Tensor* output = nullptr;
-    if (input1_in_mkl_format || input2_in_mkl_format) {
-      TensorShape tf_shape;
-      mkl_context.output_shape.SetMklTensor(true);
-      mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise,
-                                            dnnResourceDst);
-
-      mkl_context.output_shape.SetTfLayout(
-          mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-      if (input1_in_mkl_format == true) {
-        mkl_context.output_shape.SetTfDimOrder(
-            mkl_context.in_dims, mkl_context.input1_shape.GetTfToMklDimMap());
-      } else {
-        mkl_context.output_shape.SetTfDimOrder(
-            mkl_context.in_dims, mkl_context.input2_shape.GetTfToMklDimMap());
-      }
-      tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                          mkl_context.output_shape.GetMklLayout())) /
-                      sizeof(T));
-
-      AllocateOutputSetMklShape(ctx, src1_idx, &output, tf_shape,
-                                mkl_context.output_shape);
-    } else {
-      const TensorShape& o_shape = input1.shape();
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(ctx, src1_idx, &output, o_shape,
-                                mkl_context.output_shape);
-    }
-
-    mkl_context.Eltwise_res[dnnResourceDst] =
-        static_cast<void*>(output->flat<T>().data());
-
-    // Execute convolution
-    CHECK_EQ(dnnExecute_F32(mkl_context.Eltwise, mkl_context.Eltwise_res),
-             E_SUCCESS);
-
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t* in_sizes = nullptr;
-    size_t* in_strides = nullptr;
-    dnnPrimitive_t Eltwise = nullptr;
-    dnnPrimitiveAttributes_t attributes = nullptr;
-    void* Eltwise_res[dnnResourceNumber];
-    dnnLayout_t lt_input1 = nullptr, lt_input2 = nullptr;
-    MklShape input1_shape, input2_shape, output_shape;
-
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool input1_in_mkl_format = input1_shape.IsMklTensor();
-      if (!input1_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input1, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout());
-      }
-
-      bool input2_in_mkl_format = input2_shape.IsMklTensor();
-      if (!input2_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input2, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout());
-      }
-    }
-
-    void MklPrepareAddNInputs(OpKernelContext* context,
-                              Tensor* mkl_tmp_input1_buf_tensor,
-                              Tensor* mkl_tmp_input2_buf_tensor) {
-      bool mkl_convert_input1, mkl_convert_input2;
-      dnnPrimitive_t mkl_prim_convert_input1 = nullptr,
-                     mkl_prim_convert_input2 = nullptr;
-      dnnLayout_t mkl_lt_internal_input1 = nullptr,
-                  mkl_lt_internal_input2 = nullptr;
-      void *mkl_buf_convert_input1 = nullptr, *mkl_buf_convert_input2 = nullptr;
-      dnnResourceType_t dnnResourceMultipleSrc2 =
-          (dnnResourceType_t)(dnnResourceMultipleSrc + 1);
-      // Compare with internal layouts and convert if needed
-      const Tensor& input1 = MklGetInput(context, 0);
-
-      void* mkl_buf_input1 =
-          const_cast<void*>(static_cast<const void*>(input1.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                   &mkl_lt_internal_input1, Eltwise, dnnResourceMultipleSrc),
-               E_SUCCESS);
-      mkl_convert_input1 =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input1, lt_input1);
-      if (mkl_convert_input1) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input1, lt_input1,
-                                         mkl_lt_internal_input1),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input1_buf_tensor,
-                       mkl_lt_internal_input1, &mkl_buf_convert_input1);
-        CHECK_EQ(
-            dnnConversionExecute_F32(mkl_prim_convert_input1, mkl_buf_input1,
-                                     mkl_buf_convert_input1),
-            E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_input1);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_input1);
-
-      Eltwise_res[dnnResourceMultipleSrc] =
-          (mkl_convert_input1) ? mkl_buf_convert_input1 : mkl_buf_input1;
-
-      const Tensor& input2 = MklGetInput(context, 1);
-      void* mkl_buf_input2 =
-          const_cast<void*>(static_cast<const void*>(input2.flat<T>().data()));
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                   &mkl_lt_internal_input2, Eltwise, dnnResourceMultipleSrc2),
-               E_SUCCESS);
-      mkl_convert_input2 =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input2, lt_input2);
-      if (mkl_convert_input2) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input2, lt_input2,
-                                         mkl_lt_internal_input2),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input2_buf_tensor,
-                       mkl_lt_internal_input2, &mkl_buf_convert_input2);
-        CHECK_EQ(
-            dnnConversionExecute_F32(mkl_prim_convert_input2, mkl_buf_input2,
-                                     mkl_buf_convert_input2),
-            E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_input2);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_input2);
-
-      Eltwise_res[dnnResourceMultipleSrc2] =
-          (mkl_convert_input2) ? mkl_buf_convert_input2 : mkl_buf_input2;
-    }
-
-    void MklCleanup() {
-      bool input1_in_mkl_format = input1_shape.IsMklTensor();
-      bool input2_in_mkl_format = input2_shape.IsMklTensor();
-      dnnDelete_F32(Eltwise);
-      if (!input1_in_mkl_format || !input2_in_mkl_format) {
-        delete[] in_sizes;
-        delete[] in_strides;
-      }
-      if (!input1_in_mkl_format) {
-        dnnLayoutDelete_F32(lt_input1);
-      }
-      if (!input2_in_mkl_format) {
-        dnnLayoutDelete_F32(lt_input2);
-      }
-    }
-  } MklAddNOpContext;
-};
-
-#else  // INTEL_MKL_ML_ONLY
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
  public:
@@ -505,7 +251,6 @@ class MklAddNOp : public OpKernel {
   }
 };
 
-#endif
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklAddN")                          \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index 2409f7e9dc298a2f51145d211e984784429f7c8f..28825e1a9c6711d4daf74036896b9fea324163ea 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -364,15 +364,15 @@ class MklAvgPoolingGradOp : public OpKernel {
                                     "1-dimensional and 4 elements"));
 
         // For avgpooling, out_backprop should have 4 dimensions.
-        OP_REQUIRES(context, out_backprop.dims() == 4,
-                    errors::InvalidArgument("out_backprop must be "
-                                            "4-dimensional"));
+        OP_REQUIRES(
+            context, out_backprop.dims() == 4,
+            errors::InvalidArgument("out_backprop must be 4-dimensional"));
       } else {
         // Input in MKL format.
         // For avgpooling, out_backprop should have 4 dimensions.
-        OP_REQUIRES(context, out_backprop_shape.GetDimension() == 4,
-                    errors::InvalidArgument("out_backprop must be "
-                                            "4-dimensional"));
+        OP_REQUIRES(
+            context, out_backprop_shape.GetDimension() == 4,
+            errors::InvalidArgument("out_backprop must be 4-dimensional"));
       }
 
       // TODO(inteltf): Get outbackprop layout.
@@ -484,9 +484,9 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
           dnn_shape_input.IsMklTensor()
               ? dnn_shape_input.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(input_tensor.shape(),
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(),
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(),
+                                                       this->data_format_tf_);
       memory::desc input_md = dnn_shape_input.IsMklTensor()
                                   ? dnn_shape_input.GetMklLayout()
                                   : memory::desc(src_dims, MklDnnType<T>(),
@@ -494,9 +494,17 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
 
       // Get an average pooling primitive from the op pool
       MklPoolingFwdPrimitive<T>* pooling_fwd = nullptr;
+      prop_kind pooling_prop_kind;
+      bool int8_forward_inference =
+          std::is_same<T, qint8>::value || std::is_same<T, quint8>::value;
+      if (int8_forward_inference)
+        pooling_prop_kind = prop_kind::forward_inference;
+      else
+        pooling_prop_kind = prop_kind::forward_training;
       MklPoolingParams fwdParams(src_dims, output_dims_mkl_order, filter_dims,
                                  strides, padding_left, padding_right,
-                                 algorithm::pooling_avg_exclude_padding);
+                                 algorithm::pooling_avg_exclude_padding,
+                                 pooling_prop_kind);
       pooling_fwd = MklPoolingFwdPrimitiveFactory<T>::Get(fwdParams);
 
       // allocate output tensor
@@ -523,6 +531,26 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
 
       // execute pooling
       pooling_fwd->Execute(src_data, dst_data);
+
+      // Pass min, max from input to output
+      if (int8_forward_inference) {
+        const Tensor& min_input_t = MklGetInput(context, 1);
+        const Tensor& max_input_t = MklGetInput(context, 2);
+        const float min_input = min_input_t.flat<float>()(0);
+        const float max_input = max_input_t.flat<float>()(0);
+
+        Tensor* output_min = nullptr;
+        Tensor* output_max = nullptr;
+        MklDnnShape output_min_mkl_shape, output_max_mkl_shape;
+        output_min_mkl_shape.SetMklTensor(false);
+        output_max_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 1, &output_min, {},
+                                  output_min_mkl_shape);
+        AllocateOutputSetMklShape(context, 2, &output_max, {},
+                                  output_max_mkl_shape);
+        output_min->flat<float>()(0) = min_input;
+        output_max->flat<float>()(0) = max_input;
+      }
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                          ", message: " + string(e.message) + ", in file " +
@@ -576,24 +604,26 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
           orig_input_mkl_shape.IsMklTensor()
               ? orig_input_mkl_shape.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(orig_input_shape,
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(orig_input_shape,
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(orig_input_shape,
+                                                       this->data_format_tf_);
 
       memory::dims diff_dst_dims =
           grad_mkl_shape.IsMklTensor()
               ? grad_mkl_shape.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(),
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(),
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(),
+                                                       this->data_format_tf_);
       memory::dims output_dims_mkl_order;
       this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
-      MklPoolingParams bwdParams(orig_input_dims_mkl_order,
-                                 output_dims_mkl_order, filter_dims, strides,
-                                 padding_left, padding_right,
-                                 algorithm::pooling_avg_exclude_padding);
+      // Pass prop_kind::forward_training to create a forward primitive
+      // that is used in the backward pass
+      MklPoolingParams bwdParams(
+          orig_input_dims_mkl_order, output_dims_mkl_order, filter_dims,
+          strides, padding_left, padding_right,
+          algorithm::pooling_avg_exclude_padding, prop_kind::forward_training);
       MklPoolingBwdPrimitive<T>* pooling_bwd =
           MklPoolingBwdPrimitiveFactory<T>::Get(bwdParams);
 
@@ -660,13 +690,13 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
 
     if (!input_gradient_mkl_shape.IsMklTensor()) {
       // For avgpooling, input_gradient_diff_dst should have 4 dimensions.
-      OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
-                  errors::InvalidArgument("Gradient shape must be "
-                                          "4-dimensional"));
+      OP_REQUIRES(
+          context, input_gradient_tensor.dims() == 4,
+          errors::InvalidArgument("Gradient shape must be 4-dimensional"));
     } else {
-      OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4,
-                  errors::InvalidArgument("Gradient shape must be "
-                                          "4-dimensional"));
+      OP_REQUIRES(
+          context, input_gradient_mkl_shape.GetDimension() == 4,
+          errors::InvalidArgument("Gradient shape must be 4-dimensional"));
     }
   }
 };  // MklAvgPoolingGradOp
@@ -691,6 +721,18 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Label(mkl_op_registry::kMklOpLabel),
                         MklAvgPoolingOp<CPUDevice, float>);
 
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedAvgPool")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("T")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklAvgPoolingOp<CPUDevice, quint8>);
+
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedAvgPool")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint8>("T")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklAvgPoolingOp<CPUDevice, qint8>);
+
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 8ad7ebb51f3c113928a39f867bfa0950257d6388..f0278caee6b95269b77185d409de67a7441c5ff3 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -13,9 +13,10 @@ limitations under the License.
 #ifdef INTEL_MKL
 
 #include <limits>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -27,15 +28,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
-
 using mkldnn::concat;
 using mkldnn::stream;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
 #include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
@@ -63,95 +57,6 @@ class EigenConcatBaseOp : public OpKernel {
   // we need to have empty Compute because Compute is pure virtual function.
   void Compute(OpKernelContext* c) {}
 
-#ifdef INTEL_MKL_ML_ONLY
-
-  void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
-    const Tensor* concat_dim_tensor;
-    const char* axis_attribute_name =
-        AxisArgName == NAME_IS_AXIS
-            ? "axis"
-            : AxisArgName == NAME_IS_CONCAT_DIM ? "concat_dim" : "<invalid>";
-    OP_REQUIRES_OK(c, c->input(axis_attribute_name, &concat_dim_tensor));
-    OP_REQUIRES(c, IsLegacyScalar(concat_dim_tensor->shape()),
-                errors::InvalidArgument(
-                    axis_attribute_name,
-                    " tensor should be a scalar integer, but got shape ",
-                    concat_dim_tensor->shape().DebugString()));
-    const int32 concat_dim =
-        internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
-    // Instead of accessing values from context, we use input to Compute.
-    const int N = values.size();
-    const int input_dims = values[0].dims();
-    const TensorShape& input_shape = values[0].shape();
-
-    int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
-    OP_REQUIRES(c,
-                (0 <= axis && axis < input_dims) ||
-                    (allow_legacy_scalars() && concat_dim == 0),
-                errors::InvalidArgument(
-                    "ConcatOp : Expected concatenating dimensions in the range "
-                    "[",
-                    -input_dims, ", ", input_dims, "), but got ", concat_dim));
-    // Note that we reduce the concat of n-dimensional tensors into a two
-    // dimensional concat. Assuming the dimensions of any input/output
-    // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along
-    // the dimension indicated with size y0, we flatten it to {x, y}, where y =
-    // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1).
-    ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(N);
-    int64 inputs_flat_dim0 = 1;
-    for (int d = 0; d < axis; ++d) {
-      inputs_flat_dim0 *= input_shape.dim_size(d);
-    }
-    int64 output_concat_dim = 0;
-    const bool input_is_scalar = IsLegacyScalar(input_shape);
-    for (int i = 0; i < N; ++i) {
-      const auto in = values[i];
-      const bool in_is_scalar = IsLegacyScalar(in.shape());
-      OP_REQUIRES(
-          c, in.dims() == input_dims || (input_is_scalar && in_is_scalar),
-          errors::InvalidArgument(
-              "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
-              input_shape.DebugString(), " vs. shape[", i,
-              "] = ", in.shape().DebugString()));
-      for (int j = 0; j < input_dims; ++j) {
-        if (j == axis) {
-          continue;
-        }
-        OP_REQUIRES(
-            c, in.dim_size(j) == input_shape.dim_size(j),
-            errors::InvalidArgument(
-                "ConcatOp : Dimensions of inputs should match: shape[0] = ",
-                input_shape.DebugString(), " vs. shape[", i,
-                "] = ", in.shape().DebugString()));
-      }
-      if (in.NumElements() > 0) {
-        int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
-        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
-            in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
-      }
-      // TODO(irving): Remove check once !allow_legacy_scalars().
-      output_concat_dim += in.dims() > 0 ? in.dim_size(axis) : 1;
-    }
-
-    TensorShape output_shape(input_shape);
-    // TODO(irving): Remove rank 0 case once !allow_legacy_scalars().
-    if (output_shape.dims() == 0) {
-      output_shape.AddDim(output_concat_dim);
-    } else {
-      output_shape.set_dim(axis, output_concat_dim);
-    }
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
-    if (output->NumElements() > 0) {
-      int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
-      auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
-      ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
-    }
-  }
-
-#else  // MKL_DNN
-
   void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
                const TensorShapeList& input_shapes) {
     const Tensor* concat_dim_tensor;
@@ -227,342 +132,7 @@ class EigenConcatBaseOp : public OpKernel {
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
-
-#endif
-};
-
-#ifdef INTEL_MKL_ML_ONLY
-
-// --------------------------------------------------------------------------
-//                      Mkl Concat Op
-// --------------------------------------------------------------------------
-
-template <typename Device, typename T, AxisArgumentName AxisArgName>
-class MklConcatOp : public OpKernel {
- private:
-  TensorFormat data_format_;
-  EigenConcatBaseOp<Device, T, AxisArgName> eigen_concat_op_;
-
- public:
-  typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
-      ConstMatrixVector;
-
-  explicit MklConcatOp(OpKernelConstruction* c)
-      : OpKernel(c), eigen_concat_op_(c) {}
-
-  void Compute(OpKernelContext* context) override {
-    MklConcatOpContext mkl_context;
-
-    // Get input tensors.
-    OpInputList input_tensors;
-    GetMklInputList(context, "values", &input_tensors);
-    const int N = input_tensors.size();
-    // Get MKL shapes.
-    MklShapeList input_shapes(N);
-    GetMklShapeList(context, "values", &input_shapes);
-
-    // If this is Concat, then concat_dim is 0th input.
-    // If this is ConcatV2, then axis is Nth input.
-    const Tensor& concat_dim_tensor = AxisArgName == NAME_IS_CONCAT_DIM
-                                          ? MklGetInput(context, 0)
-                                          : MklGetInput(context, N);
-
-    // Sanity checks
-    OP_REQUIRES(
-        context, IsLegacyScalar(concat_dim_tensor.shape()),
-        errors::InvalidArgument(
-            "Concat dim tensor should be a scalar integer, but got shape ",
-            concat_dim_tensor.shape().DebugString()));
-    int32 concat_dim =
-        internal::SubtleMustCopy(concat_dim_tensor.scalar<int32>()());
-
-    MklShape& inpshape0 = input_shapes[0];
-
-    // Check that all tensors are Mkl, if not we call Eigen version.
-    bool invoke_eigen = false;
-    bool is_concat_dim_channel = true;
-    if (!AreAllMklTensors(input_shapes)) {
-      invoke_eigen = true;
-    }
-
-    // Check that total number of dimensions is 4, if not call Eigen.
-    if (!invoke_eigen) {
-      for (auto& s : input_shapes) {
-        if (s.GetDimension() != 4) {
-          invoke_eigen = true;
-          break;
-        }
-      }
-    }
-
-    // check that concat_dim is channel, if not call Eigen version.
-    if (!invoke_eigen) {
-      for (auto& s : input_shapes) {
-        if (!s.IsMklChannelDim(concat_dim)) {
-          invoke_eigen = true;
-          is_concat_dim_channel = false;
-          break;
-        }
-      }
-    }
-
-    if (invoke_eigen) {
-      VLOG(1) << "_MklConcatOp: Invoking Eigen version of Concat. Reason:"
-              << (!is_concat_dim_channel ? "Concat dimension is not channel"
-                                         : "Not all tensors are in Mkl layout");
-      CallEigenVersion(context, input_tensors, input_shapes);
-      return;
-    }
-
-    // For MKL format, the channel is dimension number 2.
-    // So if we are concating over channel and _all_ inputs are in MKL
-    // format, then we set concat_dim to 2.
-    // Since we have reached till here, it means we are concating
-    // over channel.
-    concat_dim = MklDims::C;
-
-    // One more sanity check: check that ranks of all tensors match
-    // and that their shapes match except for concat_dim.
-    int i = 0;
-    for (auto& s : input_shapes) {
-      size_t exp_dims = inpshape0.GetDimension();
-      OP_REQUIRES(context, s.GetDimension() == exp_dims,
-                  errors::InvalidArgument(
-                      "_MklConcatOp : Ranks of all input tensors should match:"
-                      " input dimensions = ",
-                      s.GetDimension(), " vs. expected rank = ", exp_dims));
-
-      for (int d = 0; d < exp_dims; ++d) {
-        if (d == concat_dim) {
-          continue;
-        }
-
-        size_t exp_size = inpshape0.GetSizes()[d];
-        OP_REQUIRES(
-            context, exp_size == s.GetSizes()[d],
-            errors::InvalidArgument("_MklConcatOp : Dimensions of inputs"
-                                    "should match: shape[0][",
-                                    d, "]= ", exp_size, " vs. shape[", i, "][",
-                                    d, "] = ", s.GetSizes()[d]));
-      }
-      ++i;
-    }
-
-    // Use input MKL layout instead of creating new layouts.
-    int64 output_concat_dim_size = 0;
-    for (auto& s : input_shapes) {
-      output_concat_dim_size +=
-          s.GetDimension() > 0 ? s.GetSizes()[concat_dim] : 1;
-    }
-    mkl_context.MklCreateInputLayouts(context, input_shapes);
-    OP_REQUIRES_OK(context, context->status());
-
-    CHECK_EQ(dnnConcatCreate_F32(&mkl_context.prim_concat, NULL, N,
-                                 &mkl_context.lt_inputs[0]),
-             E_SUCCESS);
-
-    // Calculate output sizes and strides
-    TensorFormat data_format;
-    if (inpshape0.IsTensorInNHWCFormat()) {
-      data_format = FORMAT_NHWC;
-    } else {
-      OP_REQUIRES(
-          context, inpshape0.IsTensorInNCHWFormat(),
-          errors::InvalidArgument(
-              "_MklConcat only supports all inputs in NCHW or NHWC format "));
-      data_format = FORMAT_NCHW;
-    }
-
-    // Since all tensors are in Mkl layout, we copy sizes from input tensor.
-    mkl_context.out_sizes[MklDims::W] = inpshape0.GetSizes()[MklDims::W];
-    mkl_context.out_sizes[MklDims::H] = inpshape0.GetSizes()[MklDims::H];
-    mkl_context.out_sizes[MklDims::C] = output_concat_dim_size;
-    mkl_context.out_sizes[MklDims::N] = inpshape0.GetSizes()[MklDims::N];
-    GetStridesFromSizes(data_format, mkl_context.out_strides,
-                        mkl_context.out_sizes);
-
-    // Set output Mkl shape.
-    int64 dim = 4;
-    MklShape mkl_output_mkl_shape;
-    mkl_output_mkl_shape.SetMklTensor(true);
-    mkl_output_mkl_shape.SetMklLayout(mkl_context.prim_concat, dnnResourceDst);
-    mkl_output_mkl_shape.SetTfLayout(dim, mkl_context.out_sizes,
-                                     mkl_context.out_strides);
-    mkl_output_mkl_shape.SetTfDimOrder(dim, inpshape0.GetTfToMklDimMap());
-
-    TensorShape mkl_output_tf_shape;
-    mkl_output_tf_shape.AddDim(1);
-    mkl_output_tf_shape.AddDim(
-        dnnLayoutGetMemorySize_F32(
-            static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
-        sizeof(T));
-
-    Tensor* output = nullptr;
-    AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape,
-                              mkl_output_mkl_shape);
-
-    // Set destination resource.
-    mkl_context.concat_res[dnnResourceDst] =
-        const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
-
-    mkl_context.mkl_tmp_tensors.resize(N);
-    mkl_context.MklPrepareConcatInputs(context, input_tensors);
-    OP_REQUIRES_OK(context, context->status());
-
-    // Execute primitive.
-    CHECK_EQ(dnnExecute_F32(mkl_context.prim_concat, mkl_context.concat_res),
-             E_SUCCESS);
-
-    mkl_context.MklCleanup();
-    OP_REQUIRES_OK(context, context->status());
-  }
-
- private:
-  typedef struct {
-    TensorFormat data_format;
-    size_t out_sizes[4];
-    size_t out_strides[4];
-    dnnPrimitive_t prim_concat;
-    void* concat_res[dnnResourceNumber];
-    std::vector<dnnLayout_t> lt_inputs;
-    std::vector<Tensor> mkl_tmp_tensors;
-
-    // Create MKL dnnLayout_t objects for tensors coming into the layer
-    // We only support case where input tensors are all in Mkl layout.
-    void MklCreateInputLayouts(OpKernelContext* context,
-                               MklShapeList& input_shapes) {
-      for (auto& is : input_shapes) {
-        CHECK_EQ(is.IsMklTensor(), true);
-        lt_inputs.push_back((dnnLayout_t)is.GetCurLayout());
-      }
-    }
-
-    void MklPrepareConcatInputs(OpKernelContext* context,
-                                OpInputList& input_tensors) {
-      CHECK_EQ(lt_inputs.size(), mkl_tmp_tensors.size());
-
-      for (int i = 0; i < lt_inputs.size(); ++i) {
-        dnnPrimitive_t mkl_prim_convert_input;
-        dnnLayout_t mkl_lt_internal_input;
-        void* mkl_buf_convert_input = nullptr;
-
-        CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                     &mkl_lt_internal_input, prim_concat,
-                     (dnnResourceType_t)(dnnResourceMultipleSrc + i)),
-                 E_SUCCESS);
-
-        if (!dnnLayoutCompare_F32(lt_inputs[i], mkl_lt_internal_input)) {
-          CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input,
-                                           lt_inputs[i], mkl_lt_internal_input),
-                   E_SUCCESS);
-
-          AllocTmpBuffer(context, &mkl_tmp_tensors[i], mkl_lt_internal_input,
-                         &mkl_buf_convert_input);
-
-          CHECK_EQ(dnnConversionExecute_F32(
-                       mkl_prim_convert_input,
-                       const_cast<void*>(static_cast<const void*>(
-                           input_tensors[i].flat<T>().data())),
-                       mkl_buf_convert_input),
-                   E_SUCCESS);
-
-          concat_res[dnnResourceMultipleSrc + i] = mkl_buf_convert_input;
-          CHECK_EQ(dnnDelete_F32(mkl_prim_convert_input), E_SUCCESS);
-        } else {
-          concat_res[dnnResourceMultipleSrc + i] = const_cast<void*>(
-              static_cast<const void*>(input_tensors[i].flat<T>().data()));
-        }
-
-        CHECK_EQ(dnnLayoutDelete_F32(mkl_lt_internal_input), E_SUCCESS);
-      }
-    }
-
-    void MklCleanup() {
-      for (auto& lt : lt_inputs) {
-        lt = nullptr;
-      }
-      CHECK_EQ(dnnDelete_F32(prim_concat), E_SUCCESS);
-    }
-  } MklConcatOpContext;
-
-  void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
-                        const MklShapeList& input_shapes) {
-    // Before calling Eigen version, we need to convert Mkl tensors to TF.
-    // First check that the number of input tensors and the number of Mkl
-    // shapes match.
-    CHECK_EQ(values.size(), input_shapes.size());
-
-    std::vector<Tensor> converted_values;
-    for (int i = 0; i < input_shapes.size(); i++) {
-      if (input_shapes[i].IsMklTensor()) {
-        // If input tensor is Mkl, then do the conversion.
-        Tensor tmp_tensor =
-            ConvertMklToTF<T>(context, values[i], input_shapes[i]);
-        converted_values.push_back(tmp_tensor);
-      } else {
-        // If input tensor is TF already, then we do not need any conversion.
-        converted_values.push_back(values[i]);
-      }
-    }
-
-    // Call Eigen concat.
-    eigen_concat_op_.Compute(context, converted_values);
-
-    // Set dummy Mkl tensor as output Mkl tensor for this op.
-    MklShape mkl_tensor_mkl_shape;
-    mkl_tensor_mkl_shape.SetMklTensor(false);
-    mkl_tensor_mkl_shape.SetDimensions(4);
-    mkl_tensor_mkl_shape.SetTfDimOrder(4);  // Dimensions
-    Tensor* mkl_tensor = nullptr;
-    TensorShape mkl_tensor_tf_shape;
-    mkl_tensor_tf_shape.AddDim(
-        SIZE_OF_MKL_SERIAL_DATA(mkl_tensor_mkl_shape.GetDimension()));
-    int tf_output_index = 0;
-    // TODO(jktomer): replace this with OP_REQUIRES_OK and clean up this file
-    // to propagate the status up the call stack.
-    TF_CHECK_OK(context->allocate_output(
-        GetTensorMetaDataIndex(tf_output_index, context->num_outputs()),
-        mkl_tensor_tf_shape, &mkl_tensor));
-    mkl_tensor_mkl_shape.SerializeMklShape(
-        mkl_tensor->flat<uint8>().data(),
-        mkl_tensor->flat<uint8>().size() * sizeof(uint8));
-  }
-
-  // overloading methods with input shapes as a list of TensorShape's
-  void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
-                        const TensorShapeList& input_shapes) {
-    CHECK_EQ(values.size(), input_shapes.size());
-
-    std::vector<Tensor> converted_values;
-    for (int i = 0; i < input_shapes.size(); i++) {
-      converted_values.push_back(values[i]);
-    }
-
-    // Call Eigen concat.
-    eigen_concat_op_.Compute(context, converted_values);
-
-    // Set dummy Mkl tensor as output Mkl tensor for this op.
-    MklShape mkl_tensor_mkl_shape;
-    mkl_tensor_mkl_shape.SetMklTensor(false);
-    mkl_tensor_mkl_shape.SetDimensions(4);
-    Tensor* mkl_tensor = nullptr;
-    TensorShape mkl_tensor_tf_shape;
-    mkl_tensor_tf_shape.AddDim(
-        SIZE_OF_MKL_SERIAL_DATA(mkl_tensor_mkl_shape.GetDimension()));
-    int tf_output_index = 0;
-    // TODO(jktomer): replace this with OP_REQUIRES_OK and clean up this file
-    // to propagate the status up the call stack.
-    TF_CHECK_OK(context->allocate_output(
-        GetTensorMetaDataIndex(tf_output_index, context->num_outputs()),
-        mkl_tensor_tf_shape, &mkl_tensor));
-    mkl_tensor_mkl_shape.SerializeMklShape(
-        mkl_tensor->flat<uint8>().data(),
-        mkl_tensor->flat<uint8>().size() * sizeof(uint8));
-  }
 };
-
-#else
-
 // --------------------------------------------------------------------------
 //                      Mkl Concat Op
 // --------------------------------------------------------------------------
@@ -609,8 +179,8 @@ class MklConcatOp : public OpKernel {
       bool invoke_eigen = false;
       bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
       const TensorShape expected_shape = mkl_input_shapes[0].IsMklTensor()
-                                       ? mkl_input_shapes[0].GetTfShape()
-                                       : input_tensors[0].shape();
+                                             ? mkl_input_shapes[0].GetTfShape()
+                                             : input_tensors[0].shape();
       size_t expected_dims = expected_shape.dims();
 
       if (concat_dim < 0) concat_dim = expected_dims + concat_dim;
@@ -681,13 +251,12 @@ class MklConcatOp : public OpKernel {
       if (are_all_mkl_inputs) {
         mkl_common_format =
             FindMklCommonFormat(mkl_input_shapes, concat_dim,
-               &isMklReorderNeeded, &dst_concat_dim_size);
+                                &isMklReorderNeeded, &dst_concat_dim_size);
 
         if (!isMklReorderNeeded) {
           // All MKL tensors have a same format. Reorder is not needed.
           for (int k = 0; k < N; k++) {
-            if (input_tensors[k].NumElements() == 0)
-              continue;
+            if (input_tensors[k].NumElements() == 0) continue;
 
             auto src_md = mkl_input_shapes[k].GetMklLayout();
             srcs[k].SetUsrMem(src_md, &input_tensors[k]);
@@ -698,16 +267,16 @@ class MklConcatOp : public OpKernel {
           // MKL tensors have different formats.
           // Reorder them to most common format.
           for (int k = 0; k < N; k++) {
-            if (input_tensors[k].NumElements() == 0)
-              continue;
+            if (input_tensors[k].NumElements() == 0) continue;
 
             auto src_md = mkl_input_shapes[k].GetMklLayout();
             srcs[k].SetUsrMem(src_md, &input_tensors[k]);
 
             if (src_md.data.format != mkl_common_format) {
-              memory::dims src_dims(src_md.data.dims, &src_md.data.dims[src_md.data.ndims]);
-              src_md = memory::desc(src_dims, MklDnnType<T>(),
-                           mkl_common_format);
+              memory::dims src_dims(src_md.data.dims,
+                                    &src_md.data.dims[src_md.data.ndims]);
+              src_md =
+                  memory::desc(src_dims, MklDnnType<T>(), mkl_common_format);
             }
 
             srcs_pd.push_back(memory::primitive_desc(src_md, cpu_engine));
@@ -715,8 +284,7 @@ class MklConcatOp : public OpKernel {
         }
       } else {  // All TF inputs
         for (int k = 0; k < N; k++) {
-          if (input_tensors[k].NumElements() == 0)
-            continue;
+          if (input_tensors[k].NumElements() == 0) continue;
 
           memory::dims src_dims = TFShapeToMklDnnDims(input_tensors[k].shape());
           dst_concat_dim_size += src_dims[concat_dim];
@@ -744,8 +312,8 @@ class MklConcatOp : public OpKernel {
             dst_dims, MklDnnDataFormatToTFDataFormat(orig_tf_format));
         // Set the output format same as the most common format of inputs
         // to avoid layout conversions.
-        dst_md = memory::desc(
-            dst_dims_in_nchw, MklDnnType<T>(), mkl_common_format);
+        dst_md =
+            memory::desc(dst_dims_in_nchw, MklDnnType<T>(), mkl_common_format);
       } else {
         // All inputs are TF tensors.
         // Set the output format same as input format (nchw).
@@ -774,9 +342,10 @@ class MklConcatOp : public OpKernel {
       // E.g., if we are concatinating over Channel (dimension 3 for NHWC),
       // then since MklDnn order is NCHW, concat_dim needs to be 1.
       if (are_all_mkl_inputs)
-         concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim);
+        concat_dim = mkl_input_shapes[0].TfDimIdx(concat_dim);
 
-      auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
+      auto concat_pd = concat::primitive_desc(concat_dim, srcs_pd);
+      auto dst_pd = concat_pd.dst_primitive_desc();
 
       MklDnnShape dnn_shape_dst;
       TensorShape tf_shape_dst;
@@ -865,27 +434,26 @@ class MklConcatOp : public OpKernel {
   // Return:
   //   return the common MKL format.
   memory::format FindMklCommonFormat(const MklDnnShapeList& input_shapes,
-      int concat_dim, bool* is_reorder_needed, int64* concat_dim_size) {
+                                     int concat_dim, bool* is_reorder_needed,
+                                     int64* concat_dim_size) {
     *is_reorder_needed = false;
     *concat_dim_size = 0;
     std::unordered_map<int, int> occurrence_map;
-    if (input_shapes.size() == 0)
-      return memory::format::any;
+    if (input_shapes.size() == 0) return memory::format::any;
 
     // Compute ocurrences of each format of all inputs.
-    for (int k=0; k <input_shapes.size(); k++) {
+    for (int k = 0; k < input_shapes.size(); k++) {
       auto src_dims = TFShapeToMklDnnDims(input_shapes[k].GetTfShape());
       *concat_dim_size += src_dims[concat_dim];
-      int fmt = static_cast<int>(
-          input_shapes[k].GetMklLayout().data.format);
+      int fmt = static_cast<int>(input_shapes[k].GetMklLayout().data.format);
       occurrence_map[fmt] += 1;
     }
 
     if (occurrence_map.size() == 1) {
-       // this means that all inputs have a same format
-       // return it with is_reorder_needed set false.
-       return static_cast<memory::format>(
-           input_shapes[0].GetMklLayout().data.format);
+      // this means that all inputs have a same format
+      // return it with is_reorder_needed set false.
+      return static_cast<memory::format>(
+          input_shapes[0].GetMklLayout().data.format);
     }
 
     // Input tensors have different formats. Thus, reorder is needed.
@@ -904,8 +472,6 @@ class MklConcatOp : public OpKernel {
   }
 };
 
-#endif
-
 /* Use optimized concat for float type only */
 #define REGISTER_MKL_CPU(type)                                              \
   REGISTER_KERNEL_BUILDER(Name("_MklConcat")                                \
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index f406ad2ab565062f178c32692ecfa46917b22356..c1b182be4a4f755bc975563cb3767d7c0079fd7f 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "mkldnn.hpp"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -33,30 +34,20 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
-
 using mkldnn::convolution_backward_weights;
 using mkldnn::memory;
 using mkldnn::prop_kind;
 using mkldnn::stream;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_ML_ONLY
-
 struct MklConvBwdFilterParams {
   memory::dims src_dims;
   memory::dims diff_filter_dims;
@@ -358,388 +349,6 @@ class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-#endif
-
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, class T>
-class MklConv2DCustomBackpropFilterOp : public OpKernel {
- public:
-  explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
-
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    int stride_n = GetTensorDim(strides_, data_format_, 'N');
-    int stride_c = GetTensorDim(strides_, data_format_, 'C');
-    OP_REQUIRES(
-        context, (stride_n == 1 && stride_c == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklConv2DGradFilterOpContext mkl_context;
-    const Tensor& input = MklGetInput(context, 0);
-    GetMklShape(context, 0, &(mkl_context.input_shape));
-    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
-
-    const Tensor& filter_sizes = MklGetInput(context, 1);
-
-    const Tensor& out_backprop = MklGetInput(context, 2);
-    GetMklShape(context, 2, &(mkl_context.out_backprop_shape));
-    bool out_backprop_in_mkl_format =
-        mkl_context.out_backprop_shape.IsMklTensor();
-
-    TensorShape input_shape, filter_shape, out_backprop_shape;
-
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
-        errors::InvalidArgument(
-            "Conv2DCustomBackpropFilter: filter_sizes input must be 1-dim, "
-            "not ",
-            filter_sizes.dims()));
-    OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                filter_sizes.vec<int32>(), &filter_shape));
-
-    ConvBackpropDimensions backprop_dims;
-
-    // Generate shape for input if input is in MKL format.
-    if (input_in_mkl_format) {
-      OP_REQUIRES(context, mkl_context.input_shape.GetDimension() == 4,
-                  errors::InvalidArgument(
-                      "Conv2DCustomBackpropFilter: input size must be 4-dim"));
-
-      MklSizesToTFSizes(context, data_format_, mkl_context.input_shape,
-                        &input_shape);
-    } else {
-      input_shape = input.shape();
-    }
-
-    // Generate shape for outback prop if input is in MKL format.
-    if (out_backprop_in_mkl_format) {
-      OP_REQUIRES(
-          context, mkl_context.out_backprop_shape.GetDimension() == 4,
-          errors::InvalidArgument(
-              "Conv2DCustomBackpropFilter: outbackprop size must be 4-dim"));
-
-      MklSizesToTFSizes(context, data_format_, mkl_context.out_backprop_shape,
-                        &out_backprop_shape);
-    } else {
-      out_backprop_shape = out_backprop.shape();
-    }
-
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensions(
-                       "Conv2DCustomBackpropFilter", /*num_spatial_dims=*/2,
-                       input_shape, filter_shape, out_backprop_shape, strides_,
-                       padding_, data_format_, &backprop_dims));
-
-    int64 pad_top, pad_bottom;
-    int64 pad_left, pad_right;
-    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
-                                backprop_dims.spatial_dims[0].input_size,
-                                backprop_dims.spatial_dims[0].filter_size,
-                                backprop_dims.spatial_dims[0].stride, padding_,
-                                &backprop_dims.spatial_dims[0].output_size,
-                                &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
-                                backprop_dims.spatial_dims[1].input_size,
-                                backprop_dims.spatial_dims[1].filter_size,
-                                backprop_dims.spatial_dims[1].stride, padding_,
-                                &backprop_dims.spatial_dims[1].output_size,
-                                &pad_left, &pad_right));
-
-    // Create MKL primitives for convolution filter grad
-    mkl_context.in_dims = input_in_mkl_format
-                              ? mkl_context.input_shape.GetDimension()
-                              : input.dims();
-    mkl_context.out_dims = out_backprop_in_mkl_format
-                               ? mkl_context.out_backprop_shape.GetDimension()
-                               : out_backprop.dims();
-    mkl_context.in_sizes[0] =
-        static_cast<size_t>(backprop_dims.spatial_dims[1].input_size);
-    mkl_context.in_sizes[1] =
-        static_cast<size_t>(backprop_dims.spatial_dims[0].input_size);
-    mkl_context.in_sizes[2] = static_cast<size_t>(backprop_dims.in_depth);
-    mkl_context.in_sizes[3] = static_cast<size_t>(backprop_dims.batch_size);
-    mkl_context.out_sizes[0] =
-        static_cast<size_t>(backprop_dims.spatial_dims[1].output_size);
-    mkl_context.out_sizes[1] =
-        static_cast<size_t>(backprop_dims.spatial_dims[0].output_size);
-    mkl_context.out_sizes[2] = static_cast<size_t>(backprop_dims.out_depth);
-    mkl_context.out_sizes[3] = static_cast<size_t>(backprop_dims.batch_size);
-    mkl_context.input_offsets[0] = static_cast<int>(-pad_left);
-    mkl_context.input_offsets[1] = static_cast<int>(-pad_top);
-    mkl_context.conv_strides[0] =
-        static_cast<size_t>(backprop_dims.spatial_dims[1].stride);
-    mkl_context.conv_strides[1] =
-        static_cast<size_t>(backprop_dims.spatial_dims[0].stride);
-
-    GetStridesFromSizes(data_format_, mkl_context.in_strides,
-                        mkl_context.in_sizes);
-    GetStridesFromSizes(data_format_, mkl_context.out_strides,
-                        mkl_context.out_sizes);
-
-    // MKL understands dimensions in 0, 1, 2, and 3 indices denotes
-    // filter cols, rows, input channels, and output depth/channels.
-    mkl_context.filter_dims = 4;
-    mkl_context.filter_sizes[0] = backprop_dims.spatial_dims[1].filter_size;
-    mkl_context.filter_sizes[1] = backprop_dims.spatial_dims[0].filter_size;
-    mkl_context.filter_sizes[2] = backprop_dims.in_depth;
-    mkl_context.filter_sizes[3] = backprop_dims.out_depth;
-
-    // We want filter grad to be in TF format, so
-    // make the strides accordingly to reflect this fact.
-    // Note TF filter layout : (rows, cols, in_depth, out_depth),
-    // while row is the innermost dimension.
-    mkl_context.filter_strides[0] =
-        backprop_dims.out_depth * backprop_dims.in_depth;
-    mkl_context.filter_strides[1] = backprop_dims.out_depth *
-                                    backprop_dims.in_depth *
-                                    backprop_dims.spatial_dims[1].filter_size;
-    mkl_context.filter_strides[2] = backprop_dims.out_depth;
-    mkl_context.filter_strides[3] = 1;
-
-    mkl_context.conv_strides[0] = backprop_dims.spatial_dims[1].stride;
-    mkl_context.conv_strides[1] = backprop_dims.spatial_dims[0].stride;
-
-    // Create convolution-grad-filter primitive
-    CHECK_EQ(dnnConvolutionCreateBackwardFilter_F32(
-                 &mkl_context.prim_conv_bwdfilter, nullptr,
-                 dnnAlgorithmConvolutionDirect, mkl_context.in_dims,
-                 mkl_context.in_sizes, mkl_context.out_sizes,
-                 mkl_context.filter_sizes, mkl_context.conv_strides,
-                 mkl_context.input_offsets, dnnBorderZeros),
-             E_SUCCESS);
-
-    // Create the layouts for entities in received context.
-    mkl_context.MklCreateInputLayouts(context);
-
-    // Mkl needs the entities in its native format.
-    // So create temporary tensors along with buffers to
-    // convert the received entities.
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_out_backprop_buf_tensor;
-    // This preparation sets (1) dnnResourceSrc (2) dnnResourceDiffDst
-    mkl_context.MklPrepareInputs(context, &mkl_tmp_input_buf_tensor,
-                                 &mkl_tmp_out_backprop_buf_tensor);
-
-    // Final conv-grad-filter should be in TF layout.
-    Tensor* grad_filter;
-    mkl_context.grad_filter_shape.SetMklTensor(false);
-    mkl_context.grad_filter_shape.SetTfLayout(mkl_context.filter_dims,
-                                              mkl_context.filter_sizes,
-                                              mkl_context.filter_strides);
-    AllocateOutputSetMklShape(context, 0, &grad_filter, filter_shape,
-                              mkl_context.grad_filter_shape);
-
-    // Need to set member variable for TF layout
-    mkl_context.lt_grad_filter = mkl_context.grad_filter_shape.GetTfLayout();
-
-    // MKL conv-grad-filter might produce grad in its internal layout
-    Tensor mkl_tmp_grad_filter_buf_tensor;
-    // This preparation sets conversion primitive if required
-    // and allocates temporary tensor and its buffer without doing conversions.
-    // Also sets (3) dnnResourceDiffFilter accordingly
-    mkl_context.MklPrepareGradFilter(context, grad_filter,
-                                     &mkl_tmp_grad_filter_buf_tensor);
-
-    // After setting all the required dnnResources, ready for execution!
-    CHECK_EQ(
-        dnnExecute_F32(mkl_context.prim_conv_bwdfilter, mkl_context.conv_res),
-        E_SUCCESS);
-
-    // Convert grad-filter to TF layout
-    if (mkl_context.convert_bwdfilter != nullptr) {
-      void* mkl_buf_convert_grad_filter =
-          const_cast<void*>(static_cast<const void*>(
-              mkl_tmp_grad_filter_buf_tensor.flat<T>().data()));
-      void* mkl_buf_grad_filter = const_cast<void*>(
-          static_cast<const void*>(grad_filter->flat<T>().data()));
-      CHECK_EQ(dnnConversionExecute_F32(mkl_context.convert_bwdfilter,
-                                        mkl_buf_convert_grad_filter,
-                                        mkl_buf_grad_filter),
-               E_SUCCESS);
-    }
-
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t in_sizes[4];
-    size_t in_strides[4];
-    int out_dims;
-    size_t out_sizes[4];
-    size_t out_strides[4];
-    int filter_dims;
-    size_t filter_sizes[4];
-    size_t filter_strides[4];
-    int input_offsets[2];
-    size_t conv_strides[2];
-    MklShape input_shape, grad_filter_shape, out_backprop_shape;
-    dnnPrimitive_t prim_conv_bwdfilter = nullptr;
-    dnnPrimitive_t convert_bwdfilter = nullptr;
-    dnnLayout_t lt_input = nullptr;
-    dnnLayout_t lt_grad_filter = nullptr;
-    dnnLayout_t lt_out_backprop = nullptr;
-    void* conv_res[dnnResourceNumber];
-
-    void MklCleanup() {
-      // Cleanup member layouts and primitives except "lt_grad_filter_"
-      // which points to MklShape's TFLayout
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      bool out_backprop_in_mkl_format = out_backprop_shape.IsMklTensor();
-      if (!input_in_mkl_format) dnnLayoutDelete_F32(lt_input);
-      if (!out_backprop_in_mkl_format) dnnLayoutDelete_F32(lt_out_backprop);
-      if (convert_bwdfilter != nullptr) dnnDelete_F32(convert_bwdfilter);
-      dnnDelete_F32(prim_conv_bwdfilter);
-    }
-
-    // Create MKL dnnLayout_t objects for tensors coming into the layer
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      if (input_in_mkl_format) {
-        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
-      } else {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      }
-
-      bool out_backprop_in_mkl_format = out_backprop_shape.IsMklTensor();
-      if (out_backprop_in_mkl_format) {
-        lt_out_backprop =
-            static_cast<dnnLayout_t>(out_backprop_shape.GetCurLayout());
-      } else {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_out_backprop, out_dims, out_sizes,
-                                     out_strides),
-                 E_SUCCESS);
-      }
-    }
-
-    // Compare incoming tensor layouts with MKL preferred layouts and convert
-    // data to the preferred layout if necessary
-    void MklPrepareInputs(OpKernelContext* context,
-                          Tensor* mkl_tmp_input_buf_tensor,
-                          Tensor* mkl_tmp_out_backprop_buf_tensor) {
-      bool mkl_convert_input, mkl_convert_out_backprop;
-      dnnPrimitive_t mkl_prim_convert_input, mkl_prim_convert_out_backprop;
-      dnnLayout_t mkl_lt_internal_input, mkl_lt_internal_out_backprop;
-      void *mkl_buf_convert_input, *mkl_buf_convert_out_backprop;
-
-      mkl_prim_convert_input = nullptr;
-      mkl_prim_convert_out_backprop = nullptr;
-      mkl_lt_internal_input = nullptr;
-      mkl_lt_internal_out_backprop = nullptr;
-      mkl_buf_convert_input = nullptr;
-      mkl_buf_convert_out_backprop = nullptr;
-
-      // Compare with internal layouts and convert if needed
-      const Tensor& input = MklGetInput(context, 0);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                   &mkl_lt_internal_input, prim_conv_bwdfilter, dnnResourceSrc),
-               E_SUCCESS);
-      mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, lt_input);
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
-                       &mkl_buf_convert_input);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
-                                          mkl_buf_convert_input),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_input);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_input);
-
-      conv_res[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
-
-      const Tensor& out_backprop = MklGetInput(context, 2);
-      void* mkl_buf_out_backprop = const_cast<void*>(
-          static_cast<const void*>(out_backprop.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
-                                                prim_conv_bwdfilter,
-                                                dnnResourceDiffDst),
-               E_SUCCESS);
-      mkl_convert_out_backprop =
-          !dnnLayoutCompare_F32(mkl_lt_internal_out_backprop, lt_out_backprop);
-      if (mkl_convert_out_backprop) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                                         lt_out_backprop,
-                                         mkl_lt_internal_out_backprop),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_out_backprop_buf_tensor,
-                       lt_out_backprop, &mkl_buf_convert_out_backprop);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
-                                          mkl_buf_out_backprop,
-                                          mkl_buf_convert_out_backprop),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_out_backprop);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_out_backprop);
-
-      conv_res[dnnResourceDiffDst] = (mkl_convert_out_backprop)
-                                         ? mkl_buf_convert_out_backprop
-                                         : mkl_buf_out_backprop;
-    }
-
-    void MklPrepareGradFilter(OpKernelContext* context, Tensor* grad_filter,
-                              Tensor* mkl_tmp_grad_filter_buf_tensor) {
-      bool mkl_convert_grad_filter;
-      dnnLayout_t mkl_lt_internal_grad_filter = nullptr;
-      void* mkl_buf_convert_grad_filter = nullptr;
-      void* mkl_buf_grad_filter = const_cast<void*>(
-          static_cast<const void*>(grad_filter->flat<T>().data()));
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_grad_filter,
-                                                prim_conv_bwdfilter,
-                                                dnnResourceDiffFilter),
-               E_SUCCESS);
-      mkl_convert_grad_filter =
-          !dnnLayoutCompare_F32(mkl_lt_internal_grad_filter, lt_grad_filter);
-      if (mkl_convert_grad_filter) {
-        CHECK_EQ(dnnConversionCreate_F32(&convert_bwdfilter,
-                                         mkl_lt_internal_grad_filter,
-                                         lt_grad_filter),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_grad_filter_buf_tensor,
-                       mkl_lt_internal_grad_filter,
-                       &mkl_buf_convert_grad_filter);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_grad_filter);
-
-      conv_res[dnnResourceDiffFilter] = (mkl_convert_grad_filter)
-                                            ? mkl_buf_convert_grad_filter
-                                            : mkl_buf_grad_filter;
-    }
-  } MklConv2DGradFilterOpContext;
-
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
-};
-
-#define REGISTER_MKL_FILTER_KERNELS(T)                              \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
-TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
-#undef REGISTER_MKL_FILTER_KERNELS
-
-#else
-
 template <typename Device, class T, bool biasEnabled>
 class MklConvCustomBackpropFilterOp
     : public MklConvBackpropCommonOp<Device, T> {
@@ -1080,8 +689,6 @@ class MklConvCustomBackpropFilterOp
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
 
-#endif  // INTEL_MKL_ML_ONLY
-
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index a501ce2c93f5d761be5e1372c10c0e5b433fa8b3..786a30bb10dcf464b5768160714238c0d5730e96 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,10 +23,7 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include <algorithm>
 #include <vector>
-#ifdef INTEL_MKL_ML_ONLY
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
+#include "mkldnn.hpp"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -46,19 +43,13 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
-
 using mkldnn::convolution_backward_data;
 using mkldnn::prop_kind;
 using mkldnn::stream;
-#endif
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifndef INTEL_MKL_ML_ONLY
-
 /// utility classes enabling primitive reuse for backward conv ops.
 struct MklConvBwdInputParams {
   memory::dims diff_src_dims;
@@ -293,320 +284,6 @@ class MklConvBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-#endif
-
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp : public OpKernel {
- public:
-  ~MklConv2DCustomBackpropInputOp() {}
-  explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string dataformat;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &dataformat));
-    OP_REQUIRES(context, FormatFromString(dataformat, &data_format),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides));
-    int stride_n = GetTensorDim(strides, data_format, 'N');
-    int stride_c = GetTensorDim(strides, data_format, 'C');
-    OP_REQUIRES(
-        context, (stride_n == 1 && stride_c == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklConvBackInputOpContext mkl_context;
-    const Tensor& input = MklGetInput(context, 0);
-    const Tensor& filter = MklGetInput(context, 1);
-
-    GetMklShape(context, 1, &(mkl_context.filter_shape));
-    bool filter_in_mkl_format = mkl_context.filter_shape.IsMklTensor();
-
-    const Tensor& out_backprop = MklGetInput(context, 2);
-    GetMklShape(context, 2, &(mkl_context.outback_shape));
-    bool outback_in_mkl_format = mkl_context.outback_shape.IsMklTensor();
-
-    TensorShape input_shape, filter_shape, outback_shape;
-
-    // Generate input shape.
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsVector(input.shape()),
-        errors::InvalidArgument(
-            "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
-            input.dims()));
-    OP_REQUIRES_OK(
-        context, TensorShapeUtils::MakeShape(input.vec<int32>(), &input_shape));
-
-    // Generate shape for filter prop if input is in MKL format.
-    if (filter_in_mkl_format) {
-      OP_REQUIRES(context, mkl_context.filter_shape.GetDimension() == 4,
-                  errors::InvalidArgument(
-                      "Conv2DCustomBackpropInput: size must be 4-dim"));
-
-      const int64* filter_sizes =
-          (const int64*)mkl_context.filter_shape.GetSizes();
-      const int64 filter_dims = mkl_context.filter_shape.GetDimension();
-
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                  filter_sizes, filter_dims, &filter_shape));
-    } else {
-      filter_shape = filter.shape();
-    }
-
-    // Generate shape for outback prop if input is in MKL format.
-    if (outback_in_mkl_format) {
-      OP_REQUIRES(context, mkl_context.outback_shape.GetDimension() == 4,
-                  errors::InvalidArgument(
-                      "Conv2DCustomBackpropInput: size must be 4-dim"));
-
-      MklSizesToTFSizes(context, data_format, mkl_context.outback_shape,
-                        &outback_shape);
-    } else {
-      outback_shape = out_backprop.shape();
-    }
-
-    ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(
-        context,
-        ConvBackpropComputeDimensions(
-            "Conv2DCustomBackpropInput", /*num_spatial_dims=*/2, input_shape,
-            filter_shape, outback_shape, strides, padding, data_format, &dims));
-
-    int64 pad_top, pad_bottom;
-    int64 pad_left, pad_right;
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
-            dims.spatial_dims[0].stride, padding,
-            &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
-            dims.spatial_dims[1].stride, padding,
-            &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
-
-    mkl_context.in_dims = 4;
-
-    mkl_context.in_sizes[0] =
-        static_cast<size_t>(dims.spatial_dims[1].input_size);
-    mkl_context.in_sizes[1] =
-        static_cast<size_t>(dims.spatial_dims[0].input_size);
-    mkl_context.in_sizes[2] = static_cast<size_t>(dims.in_depth);
-    mkl_context.in_sizes[3] = static_cast<size_t>(dims.batch_size);
-
-    mkl_context.out_sizes[0] =
-        static_cast<size_t>(dims.spatial_dims[1].output_size);
-    mkl_context.out_sizes[1] =
-        static_cast<size_t>(dims.spatial_dims[0].output_size);
-    mkl_context.out_sizes[2] = static_cast<size_t>(dims.out_depth);
-    mkl_context.out_sizes[3] = static_cast<size_t>(dims.batch_size);
-
-    mkl_context.input_offset[0] = static_cast<int>(-pad_left);
-    mkl_context.input_offset[1] = static_cast<int>(-pad_top);
-
-    mkl_context.conv_strides[0] =
-        static_cast<size_t>(dims.spatial_dims[1].stride);
-    mkl_context.conv_strides[1] =
-        static_cast<size_t>(dims.spatial_dims[0].stride);
-
-    GetStridesFromSizes(data_format, mkl_context.out_strides,
-                        mkl_context.out_sizes);
-    GetStridesFromSizes(data_format, mkl_context.in_strides,
-                        mkl_context.in_sizes);
-
-    mkl_context.filter_size[0] = dims.spatial_dims[1].filter_size;
-    mkl_context.filter_size[1] = dims.spatial_dims[0].filter_size;
-    mkl_context.filter_size[2] = dims.in_depth;
-    mkl_context.filter_size[3] = dims.out_depth;
-
-    mkl_context.filter_stride[0] =
-        mkl_context.filter_size[2] * mkl_context.filter_size[3];
-    mkl_context.filter_stride[1] = mkl_context.filter_size[2] *
-                                   mkl_context.filter_size[0] *
-                                   mkl_context.filter_size[3];
-    mkl_context.filter_stride[2] = mkl_context.filter_size[3];
-    mkl_context.filter_stride[3] = 1;
-
-    CHECK_EQ(
-        dnnConvolutionCreateBackwardData_F32(
-            &mkl_context.prim_bwddata, NULL, dnnAlgorithmConvolutionDirect,
-            mkl_context.in_dims, mkl_context.in_sizes, mkl_context.out_sizes,
-            mkl_context.filter_size, mkl_context.conv_strides,
-            mkl_context.input_offset, dnnBorderZeros),
-        E_SUCCESS);
-
-    // Allocate output tensor and shape
-    TensorShape mkl_out_shape;
-    MklShape mklOutputShape;
-    mklOutputShape.SetMklTensor(true);
-    mklOutputShape.SetMklLayout(mkl_context.prim_bwddata, dnnResourceDiffSrc);
-    mklOutputShape.SetTfLayout(mkl_context.in_dims, mkl_context.in_sizes,
-                               mkl_context.in_strides);
-    // MKL might change the dimension ordering.
-    // Create mapping to recover the original TF dimension order
-    mklOutputShape.SetTfDimOrder(mkl_context.in_dims, data_format);
-
-    Tensor* in_backprop = nullptr;
-    mkl_out_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                             mklOutputShape.GetMklLayout())) /
-                         sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &in_backprop, mkl_out_shape,
-                              mklOutputShape);
-
-    mkl_context.conv_res[dnnResourceDiffSrc] =
-        static_cast<void*>(const_cast<T*>(in_backprop->flat<T>().data()));
-
-    mkl_context.MklCreateInputLayouts(context);
-    Tensor mkl_tmp_outbackprop_buf_tensor, mkl_tmp_filter_buf_tensor;
-    mkl_context.MklPrepareConvolutionInputs(
-        context, &mkl_tmp_outbackprop_buf_tensor, &mkl_tmp_filter_buf_tensor);
-
-    CHECK_EQ(dnnExecute_F32(mkl_context.prim_bwddata, mkl_context.conv_res),
-             E_SUCCESS);
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t in_sizes[4];
-    size_t in_strides[4];
-    size_t out_sizes[4];
-    size_t out_strides[4];
-    int input_offset[2];
-    size_t filter_size[4];
-    size_t filter_stride[4];
-    size_t conv_strides[2];
-    MklShape filter_shape, outback_shape;
-    dnnPrimitive_t prim_bwddata;
-    void* conv_res[dnnResourceNumber];
-    dnnLayout_t lt_filter, lt_outbackprop;
-
-    // Create MKL dnnLayout_t objects for tensors coming into the layer
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool filter_in_mkl_format = filter_shape.IsMklTensor();
-      bool outback_in_mkl_format = outback_shape.IsMklTensor();
-      if (filter_in_mkl_format) {
-        lt_filter = (dnnLayout_t)filter_shape.GetCurLayout();
-      } else {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_filter, in_dims, filter_size,
-                                     filter_stride),
-                 E_SUCCESS);
-      }
-
-      if (outback_in_mkl_format) {
-        lt_outbackprop = (dnnLayout_t)outback_shape.GetCurLayout();
-      } else {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_outbackprop, in_dims, out_sizes,
-                                     out_strides),
-                 E_SUCCESS);
-      }
-    }
-
-    // Compare incoming input tensor layouts with MKL preferred layouts and
-    // convert data to the preferred layout if necessary
-    void MklPrepareConvolutionInputs(OpKernelContext* context,
-                                     Tensor* mkl_tmp_outbackprop_buf_tensor,
-                                     Tensor* mkl_tmp_filter_buf_tensor) {
-      dnnPrimitive_t mkl_convert_filter = nullptr,
-                     mkl_convert_outbackprop = nullptr;
-      void *mkl_filter_buf = nullptr, *mkl_outbackprop_buf = nullptr;
-      dnnLayout_t mkl_lt_filter_internal = nullptr,
-                  mkl_lt_outbackprop_internal = nullptr;
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                   &mkl_lt_filter_internal, prim_bwddata, dnnResourceFilter),
-               E_SUCCESS);
-
-      const Tensor& filter = MklGetInput(context, 1);
-
-      CHECK_EQ(
-          dnnLayoutCreateFromPrimitive_F32(&mkl_lt_outbackprop_internal,
-                                           prim_bwddata, dnnResourceDiffDst),
-          E_SUCCESS);
-      if (!dnnLayoutCompare_F32(mkl_lt_filter_internal, lt_filter)) {
-        // Create conversion primitive
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_convert_filter, lt_filter,
-                                         mkl_lt_filter_internal),
-                 E_SUCCESS);
-
-        AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor,
-                       mkl_lt_filter_internal, &mkl_filter_buf);
-        CHECK_EQ(
-            dnnConversionExecute_F32(
-                mkl_convert_filter,
-                static_cast<void*>(const_cast<T*>(filter.flat<T>().data())),
-                mkl_filter_buf),
-            E_SUCCESS);
-
-        // Assign filter buf to resources[] for convolution.
-        conv_res[dnnResourceFilter] = mkl_filter_buf;
-        dnnDelete_F32(mkl_convert_filter);
-      } else {
-        // If we do not need any layout conversion for filter, then
-        // we directly assign input filter to resources[].
-        conv_res[dnnResourceFilter] =
-            static_cast<void*>(const_cast<T*>(filter.flat<T>().data()));
-      }
-      dnnLayoutDelete_F32(mkl_lt_filter_internal);
-      const Tensor& out_backprop = MklGetInput(context, 2);
-      // --
-      // We do similar steps as above for outputbackprop.
-      if (!dnnLayoutCompare_F32(mkl_lt_outbackprop_internal, lt_outbackprop)) {
-        CHECK_EQ(
-            dnnConversionCreate_F32(&mkl_convert_outbackprop, lt_outbackprop,
-                                    mkl_lt_outbackprop_internal),
-            E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_outbackprop_buf_tensor,
-                       mkl_lt_outbackprop_internal, &mkl_outbackprop_buf);
-
-        CHECK_EQ(dnnConversionExecute_F32(mkl_convert_outbackprop,
-                                          static_cast<void*>(const_cast<T*>(
-                                              out_backprop.flat<T>().data())),
-                                          mkl_outbackprop_buf),
-                 E_SUCCESS);
-
-        conv_res[dnnResourceDiffDst] = mkl_outbackprop_buf;
-        dnnDelete_F32(mkl_convert_outbackprop);
-      } else {
-        conv_res[dnnResourceDiffDst] =
-            static_cast<void*>(const_cast<T*>(out_backprop.flat<T>().data()));
-      }
-      dnnLayoutDelete_F32(mkl_lt_outbackprop_internal);
-    }
-
-    // Cleanup member layouts and primitives
-    void MklCleanup() {
-      bool filter_in_mkl_format = filter_shape.IsMklTensor();
-      bool outback_in_mkl_format = outback_shape.IsMklTensor();
-      if (!filter_in_mkl_format) dnnLayoutDelete_F32(lt_filter);
-      if (!outback_in_mkl_format) dnnLayoutDelete_F32(lt_outbackprop);
-      dnnDelete_F32(prim_bwddata);
-    }
-  } MklConvBackInputOpContext;
-
-  std::vector<int32> strides;
-  Padding padding;
-  TensorFormat data_format;
-};
-
-#define REGISTER_MKL_CPU_KERNELS(T)                                 \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")           \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DCustomBackpropInputOp<CPUDevice, T>);
-
-TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
-#undef REGISTER_MKL_CPU_KERNELS
-
-#else
-
 template <typename Device, class T>
 class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
  public:
@@ -881,7 +558,5 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
 TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
 #undef REGISTER_MKL_CPU_KERNELS
 
-#endif  // INTEL_MKL_ML_ONLY
-
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index b332edad0ae136d5486bd903540dd448a77bd620..6e4fbf55c5f78158ffa811f4823d0086fb382d88 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #ifdef INTEL_MKL
 
 #include <string.h>
+#include <algorithm>
 #include <map>
 #include <vector>
-#include <memory>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -29,6 +29,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/mkl_conv_ops.h"
+#include "tensorflow/core/kernels/mkl_quantized_conv_ops.h"
+#include "tensorflow/core/kernels/no_op.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -69,6 +71,12 @@ struct MklConvFwdParams {
   memory::dims dilations;
   memory::dims padding_left;
   memory::dims padding_right;
+  string dtypes = string("");
+  struct PostOpParam {
+    string name;
+    std::vector<float> param;
+  };
+  std::vector<PostOpParam> post_op_params;
 
   MklConvFwdParams(memory::dims src_dims, memory::dims filter_dims,
                    memory::dims bias_dims, memory::dims dst_dims,
@@ -83,8 +91,10 @@ struct MklConvFwdParams {
         padding_left(padding_left),
         padding_right(padding_right) {}
 };
-
-template <typename T>
+// With quantization, input, filter, and output can have different types
+// so we use differnt template parameter for each type
+template <typename T, typename Tinput, typename Tfilter, typename Tbias,
+          typename Toutput>
 class MklConvFwdPrimitive : public MklPrimitive {
  public:
   explicit MklConvFwdPrimitive(const MklConvFwdParams& convFwdDims)
@@ -103,16 +113,16 @@ class MklConvFwdPrimitive : public MklPrimitive {
   //   filter_data: input data buffer of filter (weights)
   //   bias_data:   input data buffer of bias
   //   dst_data:    output data buffer of dst
-  void Execute(const T* src_data, const T* filter_data, const T* bias_data,
-               const T* dst_data) {
+  void Execute(const Tinput* src_data, const Tfilter* filter_data,
+               const Tbias* bias_data, const Toutput* dst_data) {
     context_.src_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(src_data)));
+        static_cast<void*>(const_cast<Tinput*>(src_data)));
     context_.filter_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(filter_data)));
+        static_cast<void*>(const_cast<Tfilter*>(filter_data)));
     context_.bias_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(bias_data)));
+        static_cast<void*>(const_cast<Tbias*>(bias_data)));
     context_.dst_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(dst_data)));
+        static_cast<void*>(const_cast<Toutput*>(dst_data)));
     context_.fwd_stream->submit(context_.fwd_primitives);
 
     // after exec, set data handle back
@@ -128,13 +138,14 @@ class MklConvFwdPrimitive : public MklPrimitive {
   //   src_data:    input data buffer of src
   //   filter_data: input data buffer of filter (weights)
   //   dst_data:    output data buffer of dst
-  void Execute(const T* src_data, const T* filter_data, const T* dst_data) {
+  void Execute(const Tinput* src_data, const Tfilter* filter_data,
+               const Toutput* dst_data) {
     context_.src_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(src_data)));
+        static_cast<void*>(const_cast<Tinput*>(src_data)));
     context_.filter_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(filter_data)));
+        static_cast<void*>(const_cast<Tfilter*>(filter_data)));
     context_.dst_mem->set_data_handle(
-        static_cast<void*>(const_cast<T*>(dst_data)));
+        static_cast<void*>(const_cast<Toutput*>(dst_data)));
     context_.fwd_stream->submit(context_.fwd_primitives);
 
     // after execution, set data handle back
@@ -200,17 +211,17 @@ class MklConvFwdPrimitive : public MklPrimitive {
   void Setup(const MklConvFwdParams& convFwdDims) {
     // create memory descriptors for convolution data w/ no specified format
     context_.src_md.reset(new memory::desc(
-        {convFwdDims.src_dims}, MklDnnType<T>(), memory::format::any));
+        {convFwdDims.src_dims}, MklDnnType<Tinput>(), memory::format::any));
 
     context_.filter_md.reset(new memory::desc(
-        {convFwdDims.filter_dims}, MklDnnType<T>(), memory::format::any));
+        {convFwdDims.filter_dims}, MklDnnType<Tfilter>(), memory::format::any));
 
     context_.dst_md.reset(new memory::desc(
-        {convFwdDims.dst_dims}, MklDnnType<T>(), memory::format::any));
+        {convFwdDims.dst_dims}, MklDnnType<Toutput>(), memory::format::any));
 
     if (!convFwdDims.bias_dims.empty())
       context_.bias_md.reset(new memory::desc(
-          {convFwdDims.bias_dims}, MklDnnType<T>(), memory::format::any));
+          {convFwdDims.bias_dims}, MklDnnType<Tbias>(), memory::format::any));
 
     // create a convolution
     if (!convFwdDims.bias_dims.empty()) {
@@ -230,6 +241,42 @@ class MklConvFwdPrimitive : public MklPrimitive {
     context_.fwd_pd.reset(new convolution_forward::primitive_desc(
         *context_.fwd_desc, cpu_engine_));
 
+    // Check if there is any fusions as post-ops
+    auto const& post_op_params = convFwdDims.post_op_params;
+    mkldnn::primitive_attr post_ops_attr;
+    mkldnn::post_ops post_ops;
+    if (!post_op_params.empty()) {
+      for (auto const& post_op_param : post_op_params) {
+        if (post_op_param.name == "relu") {
+          DCHECK_EQ(post_op_param.param.size(), 3);
+          float op_scale = post_op_param.param[0];
+          float op_alpha = post_op_param.param[1];
+          float op_beta = post_op_param.param[2];
+          post_ops.append_eltwise(op_scale, mkldnn::eltwise_relu, op_alpha,
+                                  op_beta);
+        } else if (post_op_param.name == "sum") {
+          DCHECK_EQ(post_op_param.param.size(), 1);
+          float op_scale = post_op_param.param[0];
+          post_ops.append_sum(op_scale);
+        } else if (post_op_param.name == "output_scale") {
+          DCHECK_EQ(post_op_param.param.size(), 1);
+          std::vector<float> scales;
+          scales.push_back(post_op_param.param[0]);
+          post_ops_attr.set_output_scales(0, scales);
+        } else {
+          DCHECK((post_op_param.name == "relu") ||
+                 (post_op_param.name == "sum") ||
+                 (post_op_param.name == "output_scale"));
+        }
+      }
+      post_ops_attr.set_post_ops(post_ops);
+      context_.fwd_pd.reset(new convolution_forward::primitive_desc(
+          *context_.fwd_desc, post_ops_attr, cpu_engine_));
+    } else {
+      context_.fwd_pd.reset(new convolution_forward::primitive_desc(
+          *context_.fwd_desc, cpu_engine_));
+    }
+
     // store the expected memory format
     context_.src_fmt = static_cast<mkldnn::memory::format>(
         context_.fwd_pd.get()->src_primitive_desc().desc().data.format);
@@ -268,23 +315,30 @@ class MklConvFwdPrimitive : public MklPrimitive {
   engine cpu_engine_;
 };
 
-template <typename T>
+template <typename T, typename Tinput, typename Tfilter, typename Tbias,
+          typename Toutput>
 class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
  public:
-  static MklConvFwdPrimitive<T>* Get(const MklConvFwdParams& convFwdDims,
-                                     bool do_not_cache) {
-    MklConvFwdPrimitive<T>* conv_fwd = nullptr;
+  static MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>* Get(
+      const MklConvFwdParams& convFwdDims, bool do_not_cache) {
+    MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>* conv_fwd = nullptr;
 
     if (do_not_cache) { /* Always create new primitive */
-      conv_fwd = new MklConvFwdPrimitive<T>(convFwdDims);
+      conv_fwd = new MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>(
+          convFwdDims);
     } else {
       // try to find a suitable one in pool
-      conv_fwd = dynamic_cast<MklConvFwdPrimitive<T>*>(
-          MklConvFwdPrimitiveFactory<T>::GetInstance().GetConvFwd(convFwdDims));
+      conv_fwd = dynamic_cast<
+          MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>*>(
+          MklConvFwdPrimitiveFactory<T, Tinput, Tfilter, Tbias,
+                                     Toutput>::GetInstance()
+              .GetConvFwd(convFwdDims));
       if (conv_fwd == nullptr) {
-        conv_fwd = new MklConvFwdPrimitive<T>(convFwdDims);
-        MklConvFwdPrimitiveFactory<T>::GetInstance().SetConvFwd(convFwdDims,
-                                                                conv_fwd);
+        conv_fwd = new MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>(
+            convFwdDims);
+        MklConvFwdPrimitiveFactory<T, Tinput, Tfilter, Tbias,
+                                   Toutput>::GetInstance()
+            .SetConvFwd(convFwdDims, conv_fwd);
       }
     }
 
@@ -314,6 +368,29 @@ class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
     key_creator.AddAsKey(convFwdDims.dilations);
     key_creator.AddAsKey(convFwdDims.padding_left);
     key_creator.AddAsKey(convFwdDims.padding_right);
+    key_creator.AddAsKey(convFwdDims.dtypes);
+
+    // Generate keys for post-ops
+    for (auto const& post_op_param : convFwdDims.post_op_params) {
+      if (post_op_param.name == "relu") {
+        DCHECK_EQ(post_op_param.param.size(), 3);
+        key_creator.AddAsKey(post_op_param.name);
+        key_creator.AddAsKey(post_op_param.param[0]);
+        key_creator.AddAsKey(post_op_param.param[1]);
+        key_creator.AddAsKey(post_op_param.param[2]);
+      } else if (post_op_param.name == "sum") {
+        DCHECK_EQ(post_op_param.param.size(), 1);
+        key_creator.AddAsKey(post_op_param.name);
+        key_creator.AddAsKey(post_op_param.param[0]);
+      } else if (post_op_param.name == "output_scale") {
+        DCHECK_EQ(post_op_param.param.size(), 1);
+        key_creator.AddAsKey(post_op_param.name);
+        key_creator.AddAsKey(post_op_param.param[0]);
+      } else {
+        return string("not_a_key");
+      }
+    }
+
     return key_creator.GetKey();
   }
 
@@ -757,10 +834,24 @@ class MklConvOp : public OpKernel {
   TensorFormat data_format_;
 };
 
+// FP32 kernel registration for INTEL_MKL_ML
+REGISTER_KERNEL_BUILDER(Name("_MklConv2D")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<float>("T")
+                            .Label(mkl_op_registry::kMklOpLabel),
+                        MklConv2DOp<CPUDevice, float, false>);
+REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<float>("T")
+                            .Label(mkl_op_registry::kMklOpLabel),
+                        MklConv2DOp<CPUDevice, float, true>);
+
 #else
 
 // Base class for convolution forward operations
-template <typename Device, typename T, bool biasEnabled>
+template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
+          typename Toutput, typename Ttemp_output, typename Tpadding,
+          bool biasEnabled, bool padEnabled>
 class MklConvOp : public OpKernel {
  public:
   ~MklConvOp() {}
@@ -828,25 +919,30 @@ class MklConvOp : public OpKernel {
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
       OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false,
-            errors::InvalidArgument("Filter should not be in "
-            "Mkl Layout"));
+                  errors::InvalidArgument("Filter should not be in "
+                                          "Mkl Layout"));
 
-      MklDnnData<T> src(&cpu_engine);
-      MklDnnData<T> filter(&cpu_engine);
+      MklDnnData<Tinput> src(&cpu_engine_);
+      MklDnnData<Tfilter> filter(&cpu_engine_);
 
       memory::dims src_dims, filter_dims, padding_left, padding_right,
-                   dilations, strides;
+          dilations, strides;
       memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
+      // If pad with conv2d fusion is enabled
+      if (padEnabled) {
+        PadWithConvFusion(context, padding_left, padding_right);
+      }
+
       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
-                             dilations_);
+                              dilations_);
       auto src_tf_shape = GetTfShape(context, kInputIndex_Src);
       auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter);
       conv_utl.GetConvFwdSizesInMklOrder(
-          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims,
-          &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order,
-          &padding_left, &padding_right);
+          src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
+          &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left,
+          &padding_right, padEnabled);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -854,25 +950,35 @@ class MklConvOp : public OpKernel {
 
       // Corner cases: output with 0 elements and 0 batch size.
       Tensor* dst_tensor = nullptr;
-      if (dst_tf_shape.num_elements() == 0 ||
-          dst_dims_tf_order[0] == 0) {
+      if (dst_tf_shape.num_elements() == 0 || dst_dims_tf_order[0] == 0) {
         MklDnnShape dst_mkl_shape;
         dst_mkl_shape.SetMklTensor(false);
-        AllocateOutputSetMklShape(context, kOutputIndex_Dst,
-                    &dst_tensor, src_tf_shape, dst_mkl_shape);
+        AllocateOutputSetMklShape(context, kOutputIndex_Dst, &dst_tensor,
+                                  src_tf_shape, dst_mkl_shape);
 
         // MklConv2D/3D also outputs converted filter
         // as 2nd output of Conv2D/3D.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
-        AllocateOutputSetMklShape(context, kOutputIndex_Filter,
-                                  &output_filter_tensor,
-                                  filter_tf_shape, filter_mkl_shape);
+        // MklConv2D also outputs converted filter as 2nd output.
+        if (typeid(Tinput) == typeid(float) &&
+            typeid(Tfilter) == typeid(float) &&
+            typeid(Toutput) == typeid(float)) {
+          filter_mkl_shape.SetMklTensor(false);
+          AllocateOutputSetMklShape(context, kOutputIndex_Filter,
+                                    &output_filter_tensor, filter_tf_shape,
+                                    filter_mkl_shape);
+        }
         return;
       }
 
       bool isConv2D = (strides_.size() == 4);
-
+      // TODO(Intel-tf) Add check to make sure padEnabled is true only for 2D
+      if (!isConv2D) {
+        OP_REQUIRES(
+            context, !padEnabled,
+            errors::InvalidArgument("Pad+Conv fusion only works for 2D"));
+      }
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
@@ -887,15 +993,17 @@ class MklConvOp : public OpKernel {
       //     Conv3D: NDHWC or NCDHW
       auto src_md = src_mkl_shape.IsMklTensor()
                         ? src_mkl_shape.GetMklLayout()
-                        : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
+                        : memory::desc(src_dims, MklDnnType<Tinput>(), tf_fmt);
+      src.SetUsrMem(src_md, &src_tensor);
 
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
       auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
                            ? filter_mkl_shape.GetMklLayout()
-                           : memory::desc(filter_dims, MklDnnType<T>(),
+                           : memory::desc(filter_dims, MklDnnType<Tfilter>(),
                                           isConv2D ? memory::format::hwio
                                                    : memory::format::dhwio);
+      filter.SetUsrMem(filter_md, &filter_tensor);
       // MKLDNN dilation starts from 0.
       for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
@@ -905,67 +1013,91 @@ class MklConvOp : public OpKernel {
       // in the following cases
       //   1. Legacy CPU without AVX512/AVX2, or
       //   2. 1x1 convolution with stride != 1
-      bool do_not_cache = MklPrimitiveFactory<T>::IsPrimitiveMemOptEnabled() &&
-                    (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) &&
-                    (MklPrimitiveFactory<T>::IsLegacyPlatform() ||
-                     IsConv1x1StrideNot1(filter_dims, strides));
+      bool do_not_cache =
+          MklPrimitiveFactory<Tinput>::IsPrimitiveMemOptEnabled() &&
+          (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) &&
+          (MklPrimitiveFactory<Tinput>::IsLegacyPlatform() ||
+           IsConv1x1StrideNot1(filter_dims, strides));
 
       // get a conv2d fwd from primitive pool
-      MklConvFwdPrimitive<T>* conv_fwd = nullptr;
+      MklConvFwdPrimitive<float, Tinput, Tfilter, Tbias, Ttemp_output>*
+          conv_fwd = nullptr;
       if (biasEnabled) {
         memory::dims bias_dims = {};
         conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
         MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims,
                                      dst_dims_mkl_order, strides, dilations,
                                      padding_left, padding_right);
-        conv_fwd = MklConvFwdPrimitiveFactory<T>::Get(
-            convFwdDims, do_not_cache);
+
+        // TODO(mdfaijul):  Extend the basic parameters for data types and
+        // fusions
+        this->ExtendConvFwdParams(context, convFwdDims);
+
+        conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
+                                              Ttemp_output>::Get(convFwdDims,
+                                                                 do_not_cache);
       } else {
         MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS,
                                      dst_dims_mkl_order, strides, dilations,
                                      padding_left, padding_right);
-        conv_fwd = MklConvFwdPrimitiveFactory<T>::Get(
-            convFwdDims, do_not_cache);
+
+        // Extend the basic parameters for data types and fusions
+        this->ExtendConvFwdParams(context, convFwdDims);
+
+        conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
+                                              Ttemp_output>::Get(convFwdDims,
+                                                                 do_not_cache);
       }
 
       // allocate output tensors output_tensor and filter_out_tensor
       std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd =
           conv_fwd->GetPrimitiveDesc();
-      AllocateOutputTensor(context, *conv_fwd_pd,
-                       dst_dims_mkl_order, tf_fmt, &dst_tensor);
+      AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt,
+                           &dst_tensor);
       Tensor* filter_out_tensor = nullptr;
-      AllocateFilterOutputTensor(context, *conv_fwd_pd,
-                                 TFShapeToMklDnnDims(filter_tf_shape),
-                                 &filter_out_tensor);
+      if (typeid(Tinput) == typeid(float) && typeid(Tfilter) == typeid(float) &&
+          typeid(Toutput) == typeid(float)) {
+        AllocateFilterOutputTensor(context, *conv_fwd_pd,
+                                   TFShapeToMklDnnDims(filter_tf_shape),
+                                   &filter_out_tensor);
+      }
 
-      T* dst_data = static_cast<T*>(dst_tensor->flat<T>().data());
+      Ttemp_output* dst_data =
+          reinterpret_cast<Ttemp_output*>(dst_tensor->flat<Toutput>().data());
 
       // check whether src/filter need reorder
-      T *src_data = nullptr;
+      Tinput* src_data = nullptr;
       if (src_md.data.format != conv_fwd->GetSrcMemoryFormat()) {
         src.SetUsrMem(src_md, &src_tensor);
         src.CheckReorderToOpMem(conv_fwd_pd.get()->src_primitive_desc());
-        src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
+        src_data = static_cast<Tinput*>(src.GetOpMem().get_data_handle());
       } else {
-        src_data = static_cast<T*>(const_cast<T*>(src_tensor.flat<T>().data()));
+        src_data = static_cast<Tinput*>(
+            const_cast<Tinput*>(src_tensor.flat<Tinput>().data()));
       }
-      T* filter_data = nullptr;
+      Tfilter* filter_data = nullptr;
       if (filter_md.data.format != conv_fwd->GetFilterMemoryFormat()) {
         filter.SetUsrMem(filter_md, &filter_tensor);
-        filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc(),
-                                   filter.GetTensorBuffer(filter_out_tensor));
-        filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle());
-      } else {
+        if (filter_out_tensor == nullptr) {
+          filter.CheckReorderToOpMem(
+              conv_fwd_pd.get()->weights_primitive_desc());
+        } else {
+          filter.CheckReorderToOpMem(
+              conv_fwd_pd.get()->weights_primitive_desc(),
+              filter.GetTensorBuffer(filter_out_tensor));
+        }
         filter_data =
-            static_cast<T*>(const_cast<T*>(filter_tensor.flat<T>().data()));
+            static_cast<Tfilter*>(filter.GetOpMem().get_data_handle());
+      } else {
+        filter_data = static_cast<Tfilter*>(
+            const_cast<Tfilter*>(filter_tensor.flat<Tfilter>().data()));
       }
 
       // execute convolution
       if (biasEnabled) {
         const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
-        T* bias_data = static_cast<T*>(const_cast<T*>(
-            bias_tensor.flat<T>().data()));
-
+        Tbias* bias_data =
+            this->GetBiasHandle(context, conv_fwd_pd, bias_tensor);
         conv_fwd->Execute(src_data, filter_data, bias_data, dst_data);
       } else {
         conv_fwd->Execute(src_data, filter_data, dst_data);
@@ -973,27 +1105,79 @@ class MklConvOp : public OpKernel {
 
       // delete primitive since it is not cached.
       if (do_not_cache) delete conv_fwd;
-    } catch (mkldnn::error &e) {
+    } catch (mkldnn::error& e) {
       string error_msg = tensorflow::strings::StrCat(
           "Status: ", e.status, ", message: ", string(e.message), ", in file ",
           __FILE__, ":", __LINE__);
-      OP_REQUIRES_OK(context,
-        errors::Aborted("Operation received an exception:", error_msg));
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
- private:
-  std::vector<int32> strides_;
-  std::vector<int32> dilations_;
-  Padding padding_;
-  TensorFormat data_format_;
-  const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
-  const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
-  const int kDilationH = 0, kDilationW = 1;
-  engine cpu_engine = engine(engine::cpu, 0);
+  void PadWithConvFusion(OpKernelContext* context, memory::dims& padding_left,
+                         memory::dims& padding_right) {
+    const Tensor& paddings_tf = MklGetInput(context, 2);
+    OP_REQUIRES(context, paddings_tf.dims() == 2,
+                errors::InvalidArgument("paddings must be 2-dimensional: ",
+                                        paddings_tf.shape().DebugString()));
+    Tpadding* paddings = nullptr;
+    // To get individual pad, need to flatten the tensor
+    paddings = static_cast<Tpadding*>(
+        const_cast<Tpadding*>(paddings_tf.flat<Tpadding>().data()));
+    // For NHWC format:
+    // paddings[0], paddings[1], paddings[6], paddings[7] should be zero
+    // if the paddings_tf is [ [0, 0] [1,2] [3,4] [0,0] ]
+    // paddings = {0, 0, 1, 2, 3, 4, 0, 0} ; flat method is row major
+    // then, values are: top = 1, bottom =2, left=3, right=4
+    // For NCHW format:
+    // paddings[0], paddings[1], paddings[2], paddings[3] should be zero
+    // similar explanation as NHWC format will apply.
+    int64 pad_top, pad_left;
+    int64 pad_bottom, pad_right;
+    string data_format = ToString(data_format_);
+    if (data_format == "NHWC") {
+      pad_top = paddings[2];
+      pad_bottom = paddings[3];
+      pad_left = paddings[4];
+      pad_right = paddings[5];
+    } else if (data_format == "NCHW") {
+      pad_top = paddings[4];
+      pad_bottom = paddings[5];
+      pad_left = paddings[6];
+      pad_right = paddings[7];
+    }
+    // Create padding arrays for MKL DNN convolutions.
+    // MKL-DNN uses asymetric padding.
+    padding_left = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
+    padding_right = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
+  }
+
+ protected:
+  virtual void ExtendConvFwdParams(OpKernelContext* context,
+                                   MklConvFwdParams& params) {
+    // Create a string from data types of input, filter, bias, and output.
+    params.dtypes.append(typeid(Tinput).name());
+    params.dtypes.append(typeid(Tfilter).name());
+    params.dtypes.append(typeid(Tbias).name());
+    params.dtypes.append(typeid(Toutput).name());
+  }
+
+  virtual Tbias* GetBiasHandle(
+      OpKernelContext* context,
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>&
+          conv2d_fwd_pd,
+      const Tensor& bias_tensor) {
+    if (biasEnabled) {
+      return static_cast<Tbias*>(
+          const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
+    } else {
+      return nullptr;
+    }
+  }
 
   // Allocate output tensor.
-  void AllocateOutputTensor(
+  virtual void AllocateOutputTensor(
       OpKernelContext* context,
       const convolution_forward::primitive_desc& conv_prim_desc,
       const memory::dims& output_dims_mkl_order,
@@ -1001,23 +1185,41 @@ class MklConvOp : public OpKernel {
     CHECK_NOTNULL(output_tensor);
     auto dst_pd = conv_prim_desc.dst_primitive_desc();
 
+    auto dst_md = dst_pd.desc();
+    if (!std::is_same<Ttemp_output, Toutput>::value) {
+      dst_md.data.data_type =
+          static_cast<mkldnn_data_type_t>(MklDnnType<Toutput>());
+      dst_pd = memory::primitive_desc(dst_md, cpu_engine_);
+    }
     // Allocate shape of Mkl tensor.
     MklDnnShape output_mkl_shape;
     output_mkl_shape.SetMklTensor(true);
     output_mkl_shape.SetMklLayout(&dst_pd);
-    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetElemType(MklDnnType<Toutput>());
     output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
                                  output_dims_mkl_order, output_tf_format);
 
     // Allocate shape of TF tensor.
     TensorShape output_tf_shape;
-    output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+    output_tf_shape.AddDim((dst_pd.get_size() / sizeof(Toutput)));
 
     AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor,
                               output_tf_shape, output_mkl_shape);
   }
 
-  // Allocate output tensor.
+  engine cpu_engine_ = engine(engine::cpu, 0);
+
+ private:
+  std::vector<int32> strides_;
+  std::vector<int32> dilations_;
+  Padding padding_;
+  TensorFormat data_format_;
+  const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
+  const int kInputIndex_Pad = 2;
+  const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
+  const int kDilationH = 0, kDilationW = 1;
+
+  // Allocate filter output tensor.
   void AllocateFilterOutputTensor(
       OpKernelContext* context,
       const convolution_forward::primitive_desc& conv_prim_desc,
@@ -1029,7 +1231,7 @@ class MklConvOp : public OpKernel {
     MklDnnShape filter_mkl_shape;
     filter_mkl_shape.SetMklTensor(true);
     filter_mkl_shape.SetMklLayout(&filter_pd);
-    filter_mkl_shape.SetElemType(MklDnnType<T>());
+    filter_mkl_shape.SetElemType(MklDnnType<Tfilter>());
 
     // The format of the filter is actually OIhw8i8o, but TF doesn't support
     // this format. Just use format::blocked for now because the layout
@@ -1039,17 +1241,17 @@ class MklConvOp : public OpKernel {
 
     // Allocate the data space for the filter to propagate as TF tensor.
     TensorShape filter_tf_shape;
-    filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(T)));
+    filter_tf_shape.AddDim((filter_pd.get_size() / sizeof(Tfilter)));
 
     AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
                               filter_tf_shape, filter_mkl_shape);
   }
-
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecuteNet(
       const convolution_forward::primitive_desc& conv_prim_desc,
-      MklDnnData<T>* src, MklDnnData<T>* filter, MklDnnData<T>* bias,
-      MklDnnData<T>* output, Tensor* filter_out_tensor) {
+      MklDnnData<Tinput>* src, MklDnnData<Tfilter>* filter,
+      MklDnnData<Tbias>* bias, MklDnnData<Toutput>* output,
+      Tensor* filter_out_tensor) {
     CHECK_NOTNULL(filter_out_tensor);
 
     // Create reorders between user layout and MKL layout if it is needed and
@@ -1065,12 +1267,12 @@ class MklConvOp : public OpKernel {
     // Create convolution primitive and add it to net.
     std::vector<primitive> net;
     if (bias) {
-      CHECK_EQ(biasEnabled, true);
+      DCHECK(biasEnabled);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
                                         filter->GetOpMem(), bias->GetOpMem(),
                                         output->GetOpMem()));
     } else {
-      CHECK_EQ(biasEnabled, false);
+      DCHECK(!biasEnabled);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
                                         filter->GetOpMem(),
                                         output->GetOpMem()));
@@ -1080,35 +1282,613 @@ class MklConvOp : public OpKernel {
   }
 };
 
-#endif
+// We create new class for each verison of Quantized Convolution and inherit
+// from the FP32 version of the base class
+template <typename Device, typename Tbias, typename Toutput,
+          typename Ttemp_output, bool biasEnabled>
+class MklQuantizedConv2DOp
+    : public MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
+                       int32, biasEnabled, false> {
+ public:
+  virtual ~MklQuantizedConv2DOp() {
+    if (this->input_bias_ != nullptr) {
+      delete this->input_bias_;
+      input_bias_ = nullptr;
+    }
+
+    if (this->scaled_bias_ != nullptr) {
+      delete this->scaled_bias_;
+      scaled_bias_ = nullptr;
+    }
+  }
+
+  explicit MklQuantizedConv2DOp(OpKernelConstruction* context)
+      : MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+                  biasEnabled, false>(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Compute int32 output tensor
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+              biasEnabled, false>::Compute(context);
+
+    // Compute additional outputs: min/max scalars.
+    int bias_index_offset;
+    bias_index_offset = biasEnabled ? 1 : 0;
+
+    const float min_input =
+        context->input(2 + bias_index_offset).flat<float>()(0);
+    const float max_input =
+        context->input(3 + bias_index_offset).flat<float>()(0);
+    const float min_filter =
+        context->input(4 + bias_index_offset).flat<float>()(0);
+    const float max_filter =
+        context->input(5 + bias_index_offset).flat<float>()(0);
+
+    float min_output_value;
+    float max_output_value;
+    if (std::is_same<Toutput, quint8>::value ||
+        std::is_same<Toutput, qint8>::value) {
+      // This is the case the convolution and requantization are fused.
+      // min_freezed_output and max_freezed_output are the actual range
+      // for the output
+      min_output_value = context->input(6 + bias_index_offset).flat<float>()(0);
+      max_output_value = context->input(7 + bias_index_offset).flat<float>()(0);
+    } else {
+      MklQuantizationRangeForMultiplication<quint8, qint8, qint32>(
+          min_input, max_input, min_filter, max_filter, &min_output_value,
+          &max_output_value);
+    }
+
+    Tensor* output_min = nullptr;
+    Tensor* output_max = nullptr;
+    MklDnnShape output_min_mkl_shape, output_max_mkl_shape;
+    output_min_mkl_shape.SetMklTensor(false);
+    output_max_mkl_shape.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 1, &output_min, {},
+                              output_min_mkl_shape);
+    AllocateOutputSetMklShape(context, 2, &output_max, {},
+                              output_max_mkl_shape);
+    output_min->flat<float>()(0) = min_output_value;
+    output_max->flat<float>()(0) = max_output_value;
+  }
+
+ protected:
+  void ExtendConvFwdParams(OpKernelContext* context,
+                           MklConvFwdParams& params) override {
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+              biasEnabled, false>::ExtendConvFwdParams(context, params);
+
+    // When the output type is quint8, the output data id requantized
+    // into quint8. A post_op "output_scale" is added to do the conversion.
+    if (std::is_same<Toutput, quint8>::value ||
+        std::is_same<Toutput, qint8>::value) {
+      int bias_index_offset;
+      bias_index_offset = biasEnabled ? 1 : 0;
+
+      const float min_input =
+          context->input(2 + bias_index_offset).flat<float>()(0);
+      const float max_input =
+          context->input(3 + bias_index_offset).flat<float>()(0);
+      const float min_filter =
+          context->input(4 + bias_index_offset).flat<float>()(0);
+      const float max_filter =
+          context->input(5 + bias_index_offset).flat<float>()(0);
+      const float min_freezed_output =
+          context->input(6 + bias_index_offset).flat<float>()(0);
+      const float max_freezed_output =
+          context->input(7 + bias_index_offset).flat<float>()(0);
+
+      float min_output_value;
+      float max_output_value;
+      MklQuantizationRangeForMultiplication<quint8, qint8, qint32>(
+          min_input, max_input, min_filter, max_filter, &min_output_value,
+          &max_output_value);
+      float scale_int32 =
+          std::max(std::abs(min_output_value), std::abs(max_output_value));
+      float scale_eightbit =
+          std::max(std::abs(min_freezed_output), std::abs(max_freezed_output));
+      float scale = 1.0;
+      if (std::is_same<Toutput, quint8>::value)
+        scale = scale_int32 / scale_eightbit / static_cast<float>(1 << 23);
+      else
+        scale = scale_int32 / scale_eightbit / static_cast<float>(1 << 24);
+
+      std::vector<float> output_scale;
+      output_scale.push_back(scale);
+      params.post_op_params.push_back({"output_scale", output_scale});
+    }
+  }
+
+  Tbias* GetBiasHandle(
+      OpKernelContext* context,
+      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>& conv_fwd_pd,
+      const Tensor& bias_tensor) override {
+    int bias_index_offset;
+    bias_index_offset = biasEnabled ? 1 : 0;
+
+    const float min_input =
+        context->input(2 + bias_index_offset).flat<float>()(0);
+    const float max_input =
+        context->input(3 + bias_index_offset).flat<float>()(0);
+    const float min_filter =
+        context->input(4 + bias_index_offset).flat<float>()(0);
+    const float max_filter =
+        context->input(5 + bias_index_offset).flat<float>()(0);
+
+    std::vector<mkldnn::primitive> net;
+    if (biasEnabled) {
+      if (std::is_same<Tbias, qint32>::value) {
+        return static_cast<Tbias*>(
+            const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
+      }
+      // If bias is enabled and requantization is not fused, scale the
+      // bias to be consistent with quantized-input and quantized-filter.
+      float bias_scale = 255.0 * 127.0 /
+                         (std::max(std::abs(max_input), std::abs(min_input)) *
+                          std::max(std::abs(max_filter), std::abs(min_filter)));
+      std::vector<float> scales;
+      scales.push_back(bias_scale);
+      mkldnn::primitive_attr bias_attr;
+      bias_attr.set_output_scales(0, scales);
+
+      void* bias_buf = static_cast<void*>(
+          const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
+      input_bias_ = new memory(conv_fwd_pd->bias_primitive_desc(), bias_buf);
+      scaled_bias_ = new memory(conv_fwd_pd->bias_primitive_desc());
+      auto reorder_desc = mkldnn::reorder::primitive_desc(
+          input_bias_->get_primitive_desc(), scaled_bias_->get_primitive_desc(),
+          bias_attr);
+      net.push_back(mkldnn::reorder(reorder_desc, *input_bias_, *scaled_bias_));
+      stream(stream::kind::eager).submit(net).wait();
+      return reinterpret_cast<Tbias*>(scaled_bias_->get_data_handle());
+    } else {
+      return nullptr;
+    }
+  }
+
+  memory* input_bias_ = nullptr;
+  memory* scaled_bias_ = nullptr;
+};
+
+template <typename Device, typename Tbias, typename Toutput,
+          typename Ttemp_output, bool biasEnabled>
+class MklQuantizedConv2DReluOp
+    : public MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                                  biasEnabled> {
+ public:
+  virtual ~MklQuantizedConv2DReluOp() {}
+
+  explicit MklQuantizedConv2DReluOp(OpKernelConstruction* context)
+      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output, biasEnabled>(
+            context) {}
+
+ protected:
+  void ExtendConvFwdParams(OpKernelContext* context,
+                           MklConvFwdParams& params) override {
+    MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                         biasEnabled>::ExtendConvFwdParams(context, params);
+    params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
+  }
+};
+
+template <typename Device, typename Tbias, typename Toutput,
+          typename Ttemp_output, bool biasEnabled>
+class MklQuantizedConv2DSumReluOp
+    : public MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                                  biasEnabled> {
+ public:
+  virtual ~MklQuantizedConv2DSumReluOp() {
+    if (this->summand_ != nullptr) {
+      delete this->summand_;
+      summand_ = nullptr;
+    }
+
+    if (this->dst_ != nullptr) {
+      delete this->dst_;
+      dst_ = nullptr;
+    }
+  }
+
+  explicit MklQuantizedConv2DSumReluOp(OpKernelConstruction* context)
+      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output, biasEnabled>(
+            context) {}
+
+ protected:
+  void ExtendConvFwdParams(OpKernelContext* context,
+                           MklConvFwdParams& params) override {
+    MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                         biasEnabled>::ExtendConvFwdParams(context, params);
+    // Calculate the scale (beta in mkldnn api term) for sum
+    if (std::is_same<Toutput, quint8>::value) {
+      int summand_idx = context->num_inputs() / 2 - 1 - 2;
+      DataType summand_type = this->input_type(summand_idx);
+      bool summand_condition =
+          (summand_type == DT_QINT8) || (summand_type == DT_QUINT8);
+      CHECK((summand_condition));
+      int bias_index_offset = biasEnabled ? 1 : 0;
+      const float min_freezed_output =
+          context->input(6 + bias_index_offset).flat<float>()(0);
+      const float max_freezed_output =
+          context->input(7 + bias_index_offset).flat<float>()(0);
+      const float min_freezed_summand =
+          context->input(9 + bias_index_offset).flat<float>()(0);
+      const float max_freezed_summand =
+          context->input(10 + bias_index_offset).flat<float>()(0);
+
+      float scale_output =
+          std::max(std::abs(min_freezed_output), std::abs(max_freezed_output));
+      float scale_summand = std::max(std::abs(min_freezed_summand),
+                                     std::abs(max_freezed_summand));
+      if (summand_type == DT_QUINT8)
+        params.post_op_params.push_back(
+            {"sum", {scale_summand / scale_output}});
+      else
+        params.post_op_params.push_back(
+            {"sum", {2.0f * scale_summand / scale_output}});
+    } else {
+      params.post_op_params.push_back({"sum", {1.0}});
+    }
+    params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
+  }
+
+  // Allocate output tensor.
+  void AllocateOutputTensor(
+      OpKernelContext* context,
+      const convolution_forward::primitive_desc& conv_prim_desc,
+      const memory::dims& output_dims_mkl_order,
+      memory::format output_tf_format, Tensor** output_tensor) override {
+    int summand_idx = context->num_inputs() / 2 - 1;
+    float reorder_sum_scale = 1.0;
+    if (std::is_same<Toutput, quint8>::value) {
+      summand_idx -= 2;
+      DataType summand_type = this->input_type(summand_idx);
+      bool summand_condition =
+          (summand_type == DT_QINT8) || (summand_type == DT_QUINT8);
+      CHECK((summand_condition));
+      Tensor& summand = const_cast<Tensor&>(MklGetInput(context, summand_idx));
+      MklDnnShape summand_mkl_shape;
+      GetMklShape(context, summand_idx, &summand_mkl_shape);
+      auto dst_md = summand_mkl_shape.GetMklLayout();
+      if (summand_mkl_shape.IsMklTensor()) {
+        if (summand_type == DT_QINT8) {
+          summand.UnsafeCopyFromInternal(summand, DT_QUINT8, summand.shape());
+          dst_md.data.data_type =
+              static_cast<mkldnn_data_type_t>(MklDnnType<Toutput>());
+          summand_mkl_shape.SetMklLayout(&dst_md);
+          summand_mkl_shape.SetElemType(MklDnnType<Toutput>());
+        }
+        ForwardMklTensorInToOutWithMklShape(context, summand_idx, 0,
+                                            summand_mkl_shape);
+        *output_tensor = const_cast<Tensor*>(&summand);
+        return;
+      } else {
+        TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION,
+                           "Current fusion is not successful."));
+      }
+    }
+    // TODO(mdfaijul): Add cleaner code for non-mkl tensor
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+              biasEnabled, false>::AllocateOutputTensor(context, conv_prim_desc,
+                                                        output_dims_mkl_order,
+                                                        output_tf_format,
+                                                        output_tensor);
+    const Tensor& summand = MklGetInput(context, summand_idx);
+    if (summand.dtype() != DT_FLOAT)
+      TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION,
+                         "Current fusion requires summand to be float"));
+    MklDnnShape summand_mkl_shape;
+    GetMklShape(context, summand_idx, &summand_mkl_shape);
+    // We need to compute scale for the summand
+    int bias_index_offset = biasEnabled ? 1 : 0;
+    const float min_input =
+        context->input(2 + bias_index_offset).flat<float>()(0);
+    const float max_input =
+        context->input(3 + bias_index_offset).flat<float>()(0);
+    const float min_filter =
+        context->input(4 + bias_index_offset).flat<float>()(0);
+    const float max_filter =
+        context->input(5 + bias_index_offset).flat<float>()(0);
+
+    reorder_sum_scale = 255.0 * 127.0 /
+                        (std::max(std::abs(max_input), std::abs(min_input)) *
+                         std::max(std::abs(max_filter), std::abs(min_filter)));
+    std::vector<float> scales;
+    scales.push_back(reorder_sum_scale);
+    mkldnn::primitive_attr reorder_attr;
+    reorder_attr.set_output_scales(0, scales);
+
+    auto summand_md =
+        summand_mkl_shape.IsMklTensor()
+            ? summand_mkl_shape.GetMklLayout()
+            : memory::desc(output_dims_mkl_order, MklDnnType<Tbias>(),
+                           memory::format::nhwc);
+    auto summand_pd = memory::primitive_desc(summand_md, this->cpu_engine_);
+    void* summand_buf =
+        static_cast<void*>(const_cast<Tbias*>(summand.flat<Tbias>().data()));
+    void* dst_buf =
+        static_cast<void*>((*output_tensor)->flat<Ttemp_output>().data());
+    summand_ = new memory(summand_pd, summand_buf);
+    dst_ = new memory(conv_prim_desc.dst_primitive_desc(), dst_buf);
+    auto reorder_desc = mkldnn::reorder::primitive_desc(
+        summand_pd, conv_prim_desc.dst_primitive_desc(), reorder_attr);
+
+    std::vector<mkldnn::primitive> net;
+    net.push_back(mkldnn::reorder(reorder_desc, *summand_, *dst_));
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+  memory* summand_ = nullptr;
+  memory* dst_ = nullptr;
+};
+
+// INT8 kernel registration
+// Register NoOp kernel for QunatizedConv2D for qint8 filter
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2D")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint8>("out_type"),
+                        NoOp);
+
+// Register a templatized implementation of MklQuntizedConv2D.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2D")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, float, qint32, qint32, false>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, qint32, qint8, qint8, false>);
+
+// Register NoOp kernel for QuantizedConv2DWithBias to get a python interface.
+// This kernel will be replaced by an MKL kernel during graph
+// optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBias")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint8>("out_type"),
+                        NoOp);
+
+// Register a templatized implementation MklQuantizedConv2DWithBias.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBias")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, float, qint32, qint32, true>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("Tbias")
+        .TypeConstraint<qint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, qint32, qint8, qint8, true>);
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<float>("Tbias")
+        .TypeConstraint<qint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DOp<CPUDevice, float, qint8, qint8, true>);
+
+// Register NoOp kernel for QuantizedConv2DAndRelu to get a python interface.
+// This kernel will be replaced by an MKL kernel during graph-optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndRelu")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DAndReluAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<quint8>("out_type"),
+                        NoOp);
+
+// Register a templatized implementation of MklQuantizedConv2DAndRelu.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DAndRelu")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, float, qint32, qint32, false>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, qint32, quint8, quint8, false>);
+
+// Register NoOp kernel for QuantizedConv2DWithBiasAndRelu to get a python
+// interface.
+// This kernel will be replaced by an MKL kernel during graph-optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasAndRelu")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+// Register NoOp kernel for QuantizedConv2DWithBiasAndReluAndRequantize
+// to get a python interface.
+// This kernel will be replaced by an MKL kernel during graph-optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasAndReluAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<quint8>("out_type"),
+                        NoOp);
+
+// Register a templatized implementation of MklQuantizedConv2DWithBiasAndRelu.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndRelu")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, float, qint32, qint32, true>);
+
+// Register a templatized implementation of
+// MklQuantizedConv2DWithBiasAndReluAndRequantize.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<float>("Tbias")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, float, quint8, quint8, true>);
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("Tbias")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DReluOp<CPUDevice, qint32, quint8, quint8, true>);
+
+// Register NoOp kernel for QuantizedConv2DWithBiasSumAndRelu to get a python
+// interface.
+// This kernel will be replaced by an MKL kernel during graph-optimization pass.
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasSumAndRelu")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<qint32>("out_type"),
+                        NoOp);
+
+REGISTER_KERNEL_BUILDER(Name("QuantizedConv2DWithBiasSumAndReluAndRequantize")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("Tinput")
+                            .TypeConstraint<qint8>("Tfilter")
+                            .TypeConstraint<quint8>("out_type"),
+                        NoOp);
+REGISTER_KERNEL_BUILDER(
+    Name("QuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<quint8>("out_type"),
+    NoOp);
+// Register a templatized implementation of MklQuantizedConv2DWithBiasAndRelu.
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasSumAndRelu")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<qint32>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DSumReluOp<CPUDevice, float, qint32, qint32, true>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasSumAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DSumReluOp<CPUDevice, qint32, quint8, quint8, true>);
+REGISTER_KERNEL_BUILDER(
+    Name("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
+        .Device(DEVICE_CPU)
+        .TypeConstraint<quint8>("Tinput")
+        .TypeConstraint<qint8>("Tfilter")
+        .TypeConstraint<quint8>("out_type")
+        .Label(mkl_op_registry::kMklQuantizedOpLabel),
+    MklQuantizedConv2DSumReluOp<CPUDevice, qint32, quint8, qint8, true>);
+#endif  // INTEL_MKL_ML
 
 // Register 2D operations
-#define REGISTER_MKL_CPU_2D(T)                                      \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                        \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConvOp<CPUDevice, T, false>);          \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConvOp<CPUDevice, T, true>);           \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")          \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
+#define REGISTER_MKL_CPU_2D(T)                                             \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                               \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, false, false>);          \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                       \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, true, false>);           \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")                 \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklDummyOp<CPUDevice, T>);                       \
+  REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D")                        \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .TypeConstraint<int32>("Tpaddings")          \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, false, true>);           \
+  REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D")                        \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .TypeConstraint<int64>("Tpaddings")          \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int64, false, true>);           \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithConv2D")                  \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .TypeConstraint<int32>("Tpaddings")          \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
                           MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU_2D);
 
 // Register 3D operations
-#define REGISTER_MKL_CPU_3D(T)                                      \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv3D")                        \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConvOp<CPUDevice, T, false>);
+#define REGISTER_MKL_CPU_3D(T)                  \
+  REGISTER_KERNEL_BUILDER(                      \
+      Name("_MklConv3D")                        \
+          .Device(DEVICE_CPU)                   \
+          .TypeConstraint<T>("T")               \
+          .Label(mkl_op_registry::kMklOpLabel), \
+      MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false>);
 TF_CALL_float(REGISTER_MKL_CPU_3D);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 01cc606f41629452cf2dd4ec784bf2cc1569c43c..e61c20dea9f8c3f8749c302f88a46233dab270b7 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -17,9 +17,10 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 
 #include <limits>
-#include <vector>
 #include <memory>
+#include <vector>
 
+#include "mkldnn.hpp"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -35,25 +36,17 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "tensorflow/core/util/mkl_util.h"
-
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
-
-using mkldnn::prop_kind;
-using mkldnn::stream;
-
 using mkldnn::convolution_direct;
 using mkldnn::convolution_forward;
-#endif
+using mkldnn::prop_kind;
+using mkldnn::stream;
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_ML_ONLY
-
 class MklDnnConvUtil {
  protected:
   OpKernelContext* context_;  // We don't own this.
@@ -92,7 +85,7 @@ class MklDnnConvUtil {
   }
 
   // Calculate Convolution dilations
-  virtual inline void GetDilationsInMklOrder(memory::dims *dilations) {
+  virtual inline void GetDilationsInMklOrder(memory::dims* dilations) {
     // For now we take the dilation from the second and third dimensions only
     // (we do not support dilation on the batch or depth dimension).
     CHECK_NOTNULL(dilations);
@@ -295,7 +288,7 @@ class MklDnnConvUtil {
       const TensorShape& input_shape, const TensorShape& filter_shape,
       const memory::dims& strides, const memory::dims& dilations,
       memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
-      memory::dims* pad_l, memory::dims* pad_r) {
+      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -380,6 +373,36 @@ class MklDnnConvUtil {
                                    padding_, &out_cols, &pad_left, &pad_right));
     }
 
+    if (isConv2D) {
+      // Conv + pad fusion is enabled only for 2D
+      // If padEnabled, i.e., pad and conv op are fused, then
+      // all pads are already passed from pad op through
+      // *pad_l and *pad_r
+      if (padEnabled) {
+        pad_top = static_cast<int64>((*pad_l)[0]);
+        pad_left = static_cast<int64>((*pad_l)[1]);
+        pad_bottom = static_cast<int64>((*pad_r)[0]);
+        pad_right = static_cast<int64>((*pad_r)[1]);
+        // update the out_rows and out_cols based on all
+        // sides of the pads coming from pad op.
+        out_rows = out_rows + (pad_top + pad_bottom) / stride_rows;
+        out_cols = out_cols + (pad_left + pad_right) / stride_cols;
+      }
+      // Handle padding. MKL-DNN uses asymetric padding.
+      // But, if padEnabled, i.e., pad and conv op are fused,
+      // then, *pad_l and *pad_r are already set from pad op.
+      // In that case they need not set here.
+      else {
+        *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
+        *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
+      }
+    } else {
+      // Set padding for Conv3D here
+      *pad_l = {static_cast<int>(pad_D1), static_cast<int>(pad_top),
+                static_cast<int>(pad_left)};
+      *pad_r = {static_cast<int>(pad_D2), static_cast<int>(pad_bottom),
+                static_cast<int>(pad_right)};
+    }
     // Tensorflow output is in data_format order.
     //     Conv2D: NHWC or NCHW
     //     Conv3D: NDHWC or NCDHW
@@ -400,9 +423,6 @@ class MklDnnConvUtil {
       mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
       mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
       *output_dims_mkl_order = mkldnn_sizes;
-
-      *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
-      *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
     } else {
       std::vector<int> mkldnn_sizes(5, -1);
       mkldnn_sizes[MklDnnDims3D::Dim3d_N] = out_batch;
@@ -411,11 +431,6 @@ class MklDnnConvUtil {
       mkldnn_sizes[MklDnnDims3D::Dim3d_H] = static_cast<int>(out_rows);
       mkldnn_sizes[MklDnnDims3D::Dim3d_W] = static_cast<int>(out_cols);
       *output_dims_mkl_order = mkldnn_sizes;
-
-      *pad_l = {static_cast<int>(pad_D1), static_cast<int>(pad_top),
-                static_cast<int>(pad_left)};
-      *pad_r = {static_cast<int>(pad_D2), static_cast<int>(pad_bottom),
-                static_cast<int>(pad_right)};
     }
   }
 
@@ -448,8 +463,8 @@ class MklDnnConvUtil {
                                           input_tf_shape.DebugString()));
     }
 
-    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
-                                  strides, dilations, output_dims_tf_order,
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
+                                  dilations, output_dims_tf_order,
                                   output_dims_mkl_order, pad_l, pad_r);
   }
 
@@ -464,10 +479,9 @@ class MklDnnConvUtil {
   inline void GetConvFwdSizesInMklOrder(
       const TensorShape& input_shape, const TensorShape& filter_shape,
       memory::dims* input_dims, memory::dims* filter_dims,
-      memory::dims* strides, memory::dims *dilations,
-      memory::dims* output_dims_tf_order,
-      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
-      memory::dims* pad_r) {
+      memory::dims* strides, memory::dims* dilations,
+      memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
+      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -483,10 +497,9 @@ class MklDnnConvUtil {
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetDilationsInMklOrder(dilations);
-    GetOutputAndPadSizeInMklOrder(input_shape, filter_shape,
-                                  *strides, *dilations,
-                                  output_dims_tf_order, output_dims_mkl_order,
-                                  pad_l, pad_r);
+    GetOutputAndPadSizeInMklOrder(
+        input_shape, filter_shape, *strides, *dilations, output_dims_tf_order,
+        output_dims_mkl_order, pad_l, pad_r, padEnabled);
     if (!context_->status().ok()) return;
   }
 };
@@ -543,8 +556,6 @@ class MklConvBackpropCommonOp : public OpKernel {
   TensorFormat data_format_;  // NCHW or NHWC
 };
 
-#endif  // INTEL_MKL_ML_ONLY
-
 /////////////////////////////////////////////////////////////////////
 ///  Dummy Mkl op that is just used for operators that are intermediate
 ///  output of node fusion in the graph
diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..991fb080934883e05e38e91207a111256b885b82
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc
@@ -0,0 +1,162 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifdef INTEL_MKL
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+
+// Helper class for converting MKL tesnors to TF tensors and comparing to
+// expected values
+
+static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static const TensorShape dummy_shape({8});
+
+class ConvMklToTF : public OpsTestBase {
+ public:
+  template <typename T>
+  void ConvertAndCompare(DataType dtype, const Tensor& first,
+                         const Tensor& second, const Tensor& expected) {
+    // Create an MKL to TF conversion node and execute it
+    TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(DT_UINT8))  // Mkl second tensor
+                     .Attr("T", dtype)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    AddInputFromArray<T>(first.shape(), first.flat<T>());
+    AddInputFromArray<uint8>(second.shape(), second.flat<uint8>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    const Tensor& output = *GetOutput(0);
+    test::ExpectTensorNear<T>(expected, output, 1e-5);
+  }
+  void TestBody(){};
+};
+
+// Testing fusion of pad and convolution
+
+class FusedPadConvOpTest : public OpsTestBase {
+ public:
+  template <typename T>
+  void Run(DataType dtype, Tensor& image, Tensor& filter, Tensor& padding,
+           Tensor& expected, const string data_format) {
+    const int stride = 1;
+
+    // Create a fused pad+conv2d node
+    TF_EXPECT_OK(NodeDefBuilder("fused_pad_conv_op", "_MklPadWithConv2D")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(dtype))     // Filter
+                     .Input(FakeInput(DT_INT32))  // Padding
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Attr("padding", "VALID")
+                     .Attr("data_format", data_format)
+                     .Attr("T", dtype)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+
+    // Setting up inputs and execute
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    AddInputFromArray<int32>(padding.shape(), padding.flat<int32>());
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& first = *GetOutput(0);
+    const Tensor& second = *GetOutput(2);
+    ConvMklToTF conv_comp;
+    conv_comp.ConvertAndCompare<T>(dtype, first, second, expected);
+  }
+};
+
+TEST_F(FusedPadConvOpTest, PaddingConvTest) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  Tensor image(DT_FLOAT, {image_batch_count, image_height, image_width, depth});
+  test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+  test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+
+  const int padding_height = 4;
+  const int padding_width = 2;
+  Tensor padding(DT_INT32, {padding_height, padding_width});
+  test::FillValues<int32>(&padding, {0, 0, 3, 4, 1, 2, 0, 0});
+
+  Tensor expected(DT_FLOAT, TensorShape({1, 8, 5, 1}));
+  test::FillValues<float>(
+      &expected,
+      {0,  0,   0,   0,   0,   24, 42,  60,  33,  12,  105, 150, 183, 95,
+       32, 235, 312, 357, 178, 56, 187, 234, 261, 121, 32,  106, 126, 138,
+       59, 12,  0,   0,   0,   0,  0,   0,   0,   0,   0,   0});
+
+  Run<float>(DT_FLOAT, image, filter, padding, expected, "NHWC");
+}
+
+TEST_F(FusedPadConvOpTest, PaddingConvTestNchw) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  Tensor image(DT_FLOAT, {image_batch_count, depth, image_height, image_width});
+  test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+  test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+
+  const int padding_height = 4;
+  const int padding_width = 2;
+  Tensor padding(DT_INT32, {padding_height, padding_width});
+  test::FillValues<int32>(&padding, {0, 0, 0, 0, 3, 4, 1, 2});
+
+  Tensor expected(DT_FLOAT, TensorShape({1, 1, 8, 5}));
+  test::FillValues<float>(
+      &expected,
+      {0,  0,   0,   0,   0,   24, 42,  60,  33,  12,  105, 150, 183, 95,
+       32, 235, 312, 357, 178, 56, 187, 234, 261, 121, 32,  106, 126, 138,
+       59, 12,  0,   0,   0,   0,  0,   0,   0,   0,   0,   0});
+
+  Run<float>(DT_FLOAT, image, filter, padding, expected, "NCHW");
+}
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index b57e8160283bec0e8dbe84d447aeccf472732d79..c1f6fa3fd0a7d2e42e647061dd45f3ee00bc90e6 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -24,43 +24,12 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#ifdef INTEL_MKL_ML_ONLY
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-#include "tensorflow/core/util/mkl_util.h"
-
-#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
-#endif
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, typename T>
-class MklIdentityOp : public OpKernel {
- public:
-  explicit MklIdentityOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    MklShape mkl_shape_input;
-    GetMklShape(context, 0, &mkl_shape_input);
-    bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
-
-    if (input_in_mkl_format) {
-      ForwardMklTensorInToOut(context, 0, 0);
-    } else {
-      ForwardTfTensorInToOut(context, 0, 0);
-    }
-  }
-
-  bool IsExpensive() override { return false; }
-};
-
-#else
-
 template <typename Device, typename T>
 class MklIdentityOp : public OpKernel {
  public:
@@ -83,8 +52,6 @@ class MklIdentityOp : public OpKernel {
   bool IsExpensive() override { return false; }
 };
 
-#endif
-
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklIdentity")                      \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 84ee241b8ecc546eabfaf6aa7e6901cf8eedba5b..ab89fe7d841d3a1c9c6be99161c97dade20bd6ef 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -29,14 +29,10 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
 
+#include "mkldnn.hpp"
 #include "tensorflow/core/kernels/mkl_tfconv_op.h"
 #include "tensorflow/core/util/mkl_util.h"
-
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
-
 using mkldnn::stream;
-#endif
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -60,198 +56,6 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 //     convert the TF format input to MKL format
 ///////////////////////////////////////////////////////////
 
-#ifdef INTEL_MKL_ML_ONLY
-template <typename Device, typename T>
-class MklInputConversionOp : public OpKernel {
- public:
-  explicit MklInputConversionOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
-    has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
-  }
-
- private:
-  void Compute(OpKernelContext* context) override {
-    // Check if input tensors are in MKL format.
-    const Tensor& input_tensor_0 = MklGetInput(context, 0);
-    MklShape input_shape_0;
-    GetMklShape(context, 0, &input_shape_0);
-
-    const Tensor& input_tensor_1 = MklGetInput(context, 1);
-    MklShape input_shape_1;
-    GetMklShape(context, 1, &input_shape_1);
-
-    bool tf_shapes_are_same = MklCompareShapes(&context->input(0).shape(),
-                                               &context->input(1).shape());
-
-    VLOG(1) << "MklInputConversionOp: Input shapes are "
-            << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
-            << context->input(0).shape().DebugString() << " and "
-            << context->input(1).shape().DebugString();
-
-    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-    // if both inputs are in TF format, just copy input tensors to output.
-    if (!input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
-      VLOG(1) << "MklInputConversionOp: No conversion needed, "
-              << "copying TF inputs to output";
-
-      ForwardTfTensorInToOut(context, 0, 0);
-      ForwardTfTensorInToOut(context, 1, 1);
-      return;
-    }
-
-    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-    // If both inputs are in MKL format
-    if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
-      // If both have the same shape, pass them through
-      if (tf_shapes_are_same) {
-        VLOG(1) << "MklInputConversionOp: No conversion needed, "
-                << "copying MKL inputs with identical shapes to output";
-
-        ForwardMklTensorInToOut(context, 0, 0);
-        ForwardMklTensorInToOut(context, 1, 1);
-        return;
-      }
-
-      // Sanity check
-      bool mkl_shapes_are_same =
-          MklCompareShapes(&input_shape_0, &input_shape_1);
-      if (mkl_shapes_are_same) {
-        CHECK(false) << "MklInputConversionOp: Unexpected: TF shapes are "
-                        "different but MKL shapes are same";
-      }
-
-      // Both have different shapes, so broadcast will be necessary.
-      // Convert to TF and pass both tensors through (we can't do broadcast
-      // with MKL tensors)
-      VLOG(1) << "MklInputConversionOp: Broadcast needed, "
-              << "converted MKL inputs to TF format";
-
-      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 0);
-      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 1);
-      SetDummyMklShapeOutput(context, 0);
-      SetDummyMklShapeOutput(context, 1);
-      return;
-    }
-
-    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-    // One input is MKL and one is TF. If no broadcast is needed, convert
-    // the TF tensor to MKL, otherwise convert the MKL tensor to TF format
-    VLOG(1) << "MklInputConversionOp: Inputs in different formats (MKL/TF)";
-
-    const Tensor* mkl_tensor;
-    const MklShape* mkl_shape;
-    const Tensor* tf_tensor;
-    MklShape* tf_mkl_shape;
-    uint32 mkl_tensor_index;
-    uint32 tf_tensor_index;
-    if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
-      mkl_tensor = &input_tensor_0;
-      mkl_shape = &input_shape_0;
-      mkl_tensor_index = 0;
-      tf_tensor = &input_tensor_1;
-      tf_mkl_shape = &input_shape_1;
-      tf_tensor_index = 1;
-    } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
-      mkl_tensor = &input_tensor_1;
-      mkl_shape = &input_shape_1;
-      mkl_tensor_index = 1;
-      tf_tensor = &input_tensor_0;
-      tf_mkl_shape = &input_shape_0;
-      tf_tensor_index = 0;
-    } else {
-      CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
-                      "shapes for MKL "
-                   << "element-wise op";
-    }
-
-    // Broadcast is needed if the shapes are not the same
-    bool broadcast_needed;
-
-    size_t in0_size = 1;
-    for (size_t i = 0; i < mkl_shape->GetDimension(); ++i)
-      in0_size *= mkl_shape->tf_dim_size(i);
-
-    size_t in1_size = 1;
-    for (size_t i = 0; i < tf_tensor->shape().dims(); ++i)
-      in1_size *= tf_tensor->shape().dim_size(i);
-
-    broadcast_needed = (in0_size != in1_size);
-
-    if (!broadcast_needed) {
-      // Both shapes are same, convert the TF input to MKL
-      VLOG(1) << "MklInputConversionOp: No broadcast needed.";
-      VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index
-              << " to MKL format";
-
-      // Create MklShape
-      Tensor* tensor_out;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(true);
-      mkl_output_mkl_shape.SetTfLayout(mkl_shape->GetDimension(),
-                                       mkl_shape->GetSizes(),
-                                       mkl_shape->GetStrides());
-      mkl_output_mkl_shape.SetTfDimOrder(mkl_shape->GetDimension());
-
-      // ** Temporarily borrow the layout from the MKL input **
-      mkl_output_mkl_shape.SetMklLayout(mkl_shape->GetCurLayout());
-
-      // Create output tensor
-      AllocateOutputSetMklShape(context, tf_tensor_index, &tensor_out,
-                                mkl_tensor->shape(), mkl_output_mkl_shape);
-
-      // Since the shapes are the same, use information from the other tensor
-      tf_mkl_shape->SetTfLayout(mkl_shape->GetDimension(),
-                                mkl_shape->GetSizes(), mkl_shape->GetStrides());
-      // Convert the data format
-      tf_mkl_shape->GetConvertedFlatData(
-          mkl_shape->GetCurLayout(),
-          const_cast<T*>(tf_tensor->flat<T>().data()),
-          const_cast<T*>(tensor_out->flat<T>().data()));
-
-      // ** Release the borrowed layout to avoid double deletion
-      //    in the destructor call **
-      mkl_output_mkl_shape.SetMklLayout(nullptr);
-
-      // -- The tensor in MKL format passes through --
-      ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index);
-    } else {
-      // Broadcast is needed, so convert the MKL input to TF
-      VLOG(1) << "MklInputConversionOp: Broadcast needed.";
-      VLOG(1) << "MklInputConversionOp: Converting input " << mkl_tensor_index
-              << " to TF format";
-      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_,
-                                           mkl_tensor_index);
-      SetDummyMklShapeOutput(context, mkl_tensor_index);
-
-      // The tensor in TF format passes through
-      ForwardTfTensorInToOut(context, tf_tensor_index, tf_tensor_index);
-    }
-
-    VLOG(1) << "MklInputConversionOp: Shapes (output): "
-            << context->mutable_output(0)->shape().DebugString() << " and "
-            << context->mutable_output(1)->shape().DebugString();
-
-    VLOG(1) << "MklInputConversion completed successfully.";
-  }
-
- private:
-  /// Data format of the operation
-  string data_format_str;
-
-  /// Data type of the operation
-  DataType op_data_type;
-
-  /// CPUIDInfo
-  bool has_avx512f_ = false;
-};
-
-#else
-
 template <typename Device, typename T>
 class MklInputConversionOp : public OpKernel {
  public:
@@ -439,8 +243,7 @@ class MklInputConversionOp : public OpKernel {
       // Create reorder between tensorflow layout and Mkl layout if necessary
       std::vector<primitive> net;
       bool reordered = tf_input.CheckReorderToOpMem(
-                   memory::primitive_desc(output_mkl_md, cpu_engine),
-                   tensor_out, &net);
+          memory::primitive_desc(output_mkl_md, cpu_engine), tensor_out, &net);
 
       if (!reordered) {
         // This is the case that the TF tensor has the same shape and format of
@@ -488,8 +291,6 @@ class MklInputConversionOp : public OpKernel {
   bool has_avx512f_ = false;
 };
 
-#endif
-
 ///////////////////////////////////////////////////////////
 //               Register kernel
 ///////////////////////////////////////////////////////////
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 22ff4cd80fe6d4d0b8a85c88dd65a58b7288a351..4d46abb0a4dd232ef13c8b6b0547b0779af1f98f 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -29,25 +30,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
 #endif
 
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
 using mkldnn::lrn_across_channels;
 using mkldnn::lrn_backward;
 using mkldnn::lrn_forward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
@@ -69,672 +63,6 @@ void GetBandMatrix(int depth, int depth_radius,
 
 }  // namespace
 
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename T>
-class MklLRNOp : public OpKernel {
- public:
-  ~MklLRNOp() {}
-
-  explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
-    int64 depth_radius64;
-    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                " larger than int max"));
-    depth_radius_ = static_cast<size_t>(depth_radius64);
-
-    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
-    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
-    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
-    workspace_enabled_ = false;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("workspace_enabled", &workspace_enabled_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklLRNOpContext mkl_context;
-
-    const Tensor& input = MklGetInput(context, 0);
-    GetMklShape(context, 0, &mkl_context.input_shape);
-    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
-
-    // Sanity checks
-    mkl_context.in_dims = input_in_mkl_format
-                              ? mkl_context.input_shape.GetDimension()
-                              : input.dims();
-    OP_REQUIRES(context, mkl_context.in_dims == 4,
-                errors::InvalidArgument("input must be 4-dimensional"));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
-        errors::InvalidArgument("argument to LRN too large"));
-
-    if (!input_in_mkl_format) {
-      mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
-                                    beta_, input);
-      return;
-    }
-
-    if (input_in_mkl_format) {
-      // MKL supports normalization over channel dimension only
-      if (mkl_context.input_shape.tf_dim_idx(mkl_context.in_dims - 1) ==
-          MklDims::C) {
-        mkl_context.lt_input =
-            static_cast<dnnLayout_t>(mkl_context.input_shape.GetCurLayout());
-        workspace_enabled_ = true;
-      } else {
-        Tensor converted_tensor =
-            ConvertMklToTF<T>(context, input, mkl_context.input_shape);
-        mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
-                                      beta_, converted_tensor);
-        return;
-      }
-    }
-
-    int kernel_size = 2 * depth_radius_ + 1;
-
-    CHECK_EQ(dnnLRNCreateForward_F32(
-                 &mkl_context.lrn_fwd, NULL, mkl_context.lt_input, kernel_size,
-                 static_cast<float>(alpha_ * kernel_size), beta_, bias_),
-             E_SUCCESS);
-
-    // Allocate output tensor and shape
-    Tensor* output = nullptr;
-    Tensor* workspace = nullptr;
-
-    // Convert Inputs if needed
-    Tensor mkl_tmp_input_buf_tensor;
-    mkl_context.MklPrepareLRNInputs(context, &mkl_tmp_input_buf_tensor);
-
-    // Allocate Layer Outputs
-    mkl_context.MklAllocateOutputs(context, &output, &workspace,
-                                   workspace_enabled_);
-
-    Tensor mkl_tmp_workspace_buf_tensor;
-    mkl_context.MklPrepareLRNOutputs(context, output, workspace,
-                                     &mkl_tmp_workspace_buf_tensor,
-                                     workspace_enabled_);
-
-    // Execute LRN.
-    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_fwd, mkl_context.lrn_res),
-             E_SUCCESS);
-
-    // Release MKL resources.
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    size_t in_dims;
-    size_t in_sizes[4];
-    size_t in_strides[4];
-    size_t out_sizes[4];
-    size_t out_strides[4];
-    MklShape input_shape;
-    dnnPrimitive_t lrn_fwd = nullptr;
-    dnnPrimitive_t convert_input = nullptr;
-    dnnLayout_t lt_input = nullptr;
-    dnnLayout_t lt_internal_input = nullptr;
-    dnnLayout_t lt_internal_workspace = nullptr;
-    dnnLayout_t lt_internal_output = nullptr;
-    void* lrn_res[dnnResourceNumber];
-
-    // Convert Inputs if needed
-    void MklPrepareLRNInputs(OpKernelContext* context,
-                             Tensor* mkl_tmp_input_buf_tensor) {
-      const Tensor& input = MklGetInput(context, 0);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_fwd,
-                                                dnnResourceSrc),
-               E_SUCCESS);
-
-      void* mkl_buf_convert_input = nullptr;
-      bool mkl_convert_input = false;
-      mkl_convert_input = !dnnLayoutCompare_F32(lt_internal_input, lt_input);
-
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input,
-                                         lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_internal_input,
-                       &mkl_buf_convert_input);
-        CHECK_EQ(dnnConversionExecute_F32(convert_input, mkl_buf_input,
-                                          mkl_buf_convert_input),
-                 E_SUCCESS);
-        dnnDelete_F32(convert_input);
-      }
-
-      lrn_res[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
-    }
-
-    // Allocate Layer Outputs
-    void MklAllocateOutputs(OpKernelContext* context, Tensor** output,
-                            Tensor** workspace, bool workspace_enabled_) {
-      TensorShape mkl_output_tf_shape; /* First tensor */
-      MklShape mkl_output_mkl_shape;   /* Second tensor */
-
-      mkl_output_mkl_shape.SetMklTensor(true);
-      mkl_output_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceDst);
-      mkl_output_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
-                                       input_shape.GetStrides());
-      mkl_output_mkl_shape.SetTfDimOrder(in_dims,
-                                         input_shape.GetTfToMklDimMap());
-      mkl_output_tf_shape.AddDim(
-          dnnLayoutGetMemorySize_F32(
-              static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
-          sizeof(T));
-      AllocateOutputSetMklShape(context, 0, output,
-                                mkl_output_tf_shape /* First tensor */,
-                                mkl_output_mkl_shape /* Second Tensor */);
-
-      if (workspace_enabled_) {
-        TensorShape mkl_workspace_tf_shape; /* First tensor */
-        MklShape mkl_workspace_mkl_shape;   /* Second tensor */
-        mkl_workspace_mkl_shape.SetMklTensor(false);
-        mkl_workspace_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceWorkspace);
-        // Assumes workspace has same TF layout and TF dim order as input
-        mkl_workspace_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
-                                            input_shape.GetStrides());
-        mkl_workspace_mkl_shape.SetTfDimOrder(in_dims,
-                                              input_shape.GetTfToMklDimMap());
-        mkl_workspace_tf_shape.AddDim(
-            dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                mkl_workspace_mkl_shape.GetMklLayout())) /
-            sizeof(T));
-        AllocateOutputSetMklShape(context, 1, workspace,
-                                  mkl_workspace_tf_shape /* First tensor */,
-                                  mkl_workspace_mkl_shape /* Second Tensor */);
-      }
-    }
-
-    void MklPrepareLRNOutputs(OpKernelContext* context, Tensor* output,
-                              Tensor* workspace,
-                              Tensor* mkl_tmp_workspace_buf_tensor,
-                              bool workspace_enabled_) {
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_workspace, lrn_fwd,
-                                                dnnResourceWorkspace),
-               E_SUCCESS);
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_output, lrn_fwd,
-                                                dnnResourceDst),
-               E_SUCCESS);
-
-      void* mkl_buf_output =
-          const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
-      lrn_res[dnnResourceDst] = mkl_buf_output;
-
-      void* mkl_buf_workspace = nullptr;
-      if (workspace_enabled_) {
-        mkl_buf_workspace = const_cast<void*>(
-            static_cast<const void*>(workspace->flat<T>().data()));
-      } else {
-        AllocTmpBuffer(context, mkl_tmp_workspace_buf_tensor,
-                       lt_internal_workspace, &mkl_buf_workspace);
-      }
-      lrn_res[dnnResourceWorkspace] = mkl_buf_workspace;
-    }
-
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context, int depth_radius_,
-                           float bias_, float alpha_, float beta_,
-                           const Tensor& input) {
-      const int batch = static_cast<int>(input.dim_size(0));
-      const int rows = static_cast<int>(input.dim_size(1));
-      const int cols = static_cast<int>(input.dim_size(2));
-      const int depth = static_cast<int>(input.dim_size(3));
-      const int nodes = cols * rows;
-
-      auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
-      // Multiplying the input with the band matrix has the effect of reducing
-      // the
-      // correct patch along the depth.
-      Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
-      GetBandMatrix<T>(depth, depth_radius_, &multiplier);
-
-      Tensor *output, *workspace;
-      MklShape mkl_output_mkl_shape, mkl_workspace_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, 0, &output, input.shape(),
-                                mkl_output_mkl_shape);
-
-      mkl_workspace_mkl_shape.SetMklTensor(false);
-      mkl_workspace_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, 1, &workspace, input.shape(),
-                                mkl_workspace_mkl_shape);
-
-      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
-      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
-      auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
-      if (beta_ == T(1)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.inverse();
-      } else if (beta_ == T(0.5)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.rsqrt();
-      } else {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * (tmp.log() * -beta_).exp();
-      }
-    }
-
-    // Release MKL resources.
-    void MklCleanup() {
-      dnnDelete_F32(lrn_fwd);
-      dnnLayoutDelete_F32(lt_internal_input);
-      dnnLayoutDelete_F32(lt_internal_workspace);
-      dnnLayoutDelete_F32(lt_internal_output);
-    }
-  } MklLRNOpContext;
-
-  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
-
-  bool workspace_enabled_;
-  int depth_radius_;
-  float bias_;
-  float alpha_;
-  float beta_;
-};
-
-template <typename T>
-class MklLRNGradOp : public OpKernel {
- public:
-  explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
-    int64 depth_radius64;
-    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                " larger than int max"));
-    depth_radius_ = static_cast<int>(depth_radius64);
-    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
-    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
-    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
-    workspace_enabled_ = false;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("workspace_enabled", &workspace_enabled_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklLRNGradOpContext mkl_context;
-    mkl_context.depth_radius_ = depth_radius_;
-    mkl_context.bias_ = bias_;
-    mkl_context.alpha_ = alpha_;
-    mkl_context.beta_ = beta_;
-
-    const Tensor& in_grads = MklGetInput(context, 0);
-    const Tensor& in_image = MklGetInput(context, 1);
-    const Tensor& out_image = MklGetInput(context, 2);
-
-    GetMklShape(context, 0, &mkl_context.ingrad_shape);
-    GetMklShape(context, 1, &mkl_context.inimage_shape);
-    GetMklShape(context, 2, &mkl_context.outimage_shape);
-
-    bool ingrad_in_mkl_format = mkl_context.ingrad_shape.IsMklTensor();
-    bool inimage_in_mkl_format = mkl_context.inimage_shape.IsMklTensor();
-    bool outimage_in_mkl_format = mkl_context.outimage_shape.IsMklTensor();
-
-    mkl_context.in_dims = inimage_in_mkl_format
-                              ? mkl_context.inimage_shape.GetDimension()
-                              : in_image.dims();
-    OP_REQUIRES(context, mkl_context.in_dims == 4,
-                errors::InvalidArgument("input images must be 4-dimensional"));
-
-    if (!workspace_enabled_) {
-      mkl_context.MklDefaultToEigen(context);
-      return;
-    }
-
-    if (ingrad_in_mkl_format || inimage_in_mkl_format) {
-      const MklShape* tmp_mkl_shape = (ingrad_in_mkl_format)
-                                          ? &mkl_context.ingrad_shape
-                                          : &mkl_context.inimage_shape;
-      if (tmp_mkl_shape->tf_dim_idx(mkl_context.in_dims - 1) != MklDims::C) {
-        // Fallback to eigen
-        mkl_context.MklDefaultToEigen(context);
-        return;
-      } else {  // MKL supports normalization over channel dimension only
-        for (int i = 0; i < mkl_context.in_dims; i++) {
-          mkl_context.in_sizes[i] = mkl_context.out_sizes[i] =
-              tmp_mkl_shape->GetSizes()[i];
-          mkl_context.in_strides[i] = mkl_context.out_strides[i] =
-              tmp_mkl_shape->GetStrides()[i];
-        }
-      }
-    } else {
-      // Fallback to eigen
-      mkl_context.MklDefaultToEigen(context);
-      return;
-    }
-
-    // Dimensions check for sanity purpose
-    if (ingrad_in_mkl_format) {
-      OP_REQUIRES(
-          context, mkl_context.ingrad_shape.GetDimension() == 4,
-          errors::InvalidArgument("input gradient must be 4-dimensional"));
-    } else {
-      OP_REQUIRES(
-          context, in_grads.dims() == 4,
-          errors::InvalidArgument("input gradient must be 4-dimensional"));
-    }
-
-    if (outimage_in_mkl_format) {
-      OP_REQUIRES(
-          context, mkl_context.outimage_shape.GetDimension() == 4,
-          errors::InvalidArgument("Output image must be 4-dimensional"));
-    } else {
-      OP_REQUIRES(
-          context, out_image.dims() == 4,
-          errors::InvalidArgument("Output image must be 4-dimensional"));
-    }
-
-    // Prepare mkl input layout
-    mkl_context.MklPrepareLRNInputsLayouts(context);
-    int ksize = 2 * depth_radius_ + 1;
-
-    CHECK_EQ(dnnLRNCreateBackward_F32(
-                 &mkl_context.lrn_bwd, NULL, mkl_context.lt_input,
-                 mkl_context.lt_output, ksize,
-                 static_cast<float>(alpha_ * ksize), beta_, bias_),
-             E_SUCCESS);
-
-    // Allocate output tensor and shape.
-    TensorShape mkl_output_tf_shape; /* First tensor */
-    MklShape mkl_output_mkl_shape;   /* Second tensor */
-    mkl_output_mkl_shape.SetMklTensor(true);
-    CHECK_NE(mkl_context.lrn_bwd, nullptr);
-    mkl_output_mkl_shape.SetMklLayout(mkl_context.lrn_bwd, dnnResourceDiffSrc);
-    mkl_output_mkl_shape.SetTfLayout(mkl_context.in_dims, mkl_context.out_sizes,
-                                     mkl_context.out_strides);
-    if (ingrad_in_mkl_format) {
-      mkl_output_mkl_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.ingrad_shape.GetTfToMklDimMap());
-    } else {
-      mkl_output_mkl_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.inimage_shape.GetTfToMklDimMap());
-    }
-    mkl_output_tf_shape.AddDim(
-        dnnLayoutGetMemorySize_F32(
-            static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
-        sizeof(T));
-    Tensor* output = nullptr;
-    AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape,
-                              mkl_output_mkl_shape);
-
-    // Get pointers to output data.
-    void* user_output =
-        const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
-
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_image_buf_tensor,
-        mkl_tmp_outimage_buf_tensor;
-    // Convert Inputs if needed
-    mkl_context.MklPrepareLRNGradInput(context, &mkl_tmp_input_buf_tensor,
-                                       &mkl_tmp_image_buf_tensor,
-                                       &mkl_tmp_outimage_buf_tensor);
-
-    // We do not do any conversion for output. But we simply emit it
-    // in MKL format.
-    mkl_context.res_lrn_bwd[dnnResourceDiffSrc] = user_output;
-    // Execute LRN backward using dnnExecute
-    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_bwd, mkl_context.res_lrn_bwd),
-             E_SUCCESS);
-    // Release MKL resources.
-    mkl_context.Mklcleanup();
-  }
-
- private:
-  typedef struct {
-    int depth_radius_;
-    float bias_;
-    float alpha_;
-    float beta_;
-    size_t in_dims;
-    size_t in_sizes[4];
-    size_t in_strides[4];
-    size_t out_sizes[4];
-    size_t out_strides[4];
-    MklShape ingrad_shape, inimage_shape, outimage_shape;
-    dnnPrimitive_t lrn_bwd = nullptr;
-    dnnPrimitive_t convert_input = nullptr;
-    dnnLayout_t lt_input = nullptr;
-    dnnLayout_t lt_output = nullptr;
-    dnnLayout_t lt_bdw_input = nullptr;
-    dnnLayout_t lt_workspace = nullptr;
-    dnnLayout_t lt_internal_input = nullptr;
-    void* res_lrn_bwd[dnnResourceNumber];
-
-    // prepare mkl input
-    void MklPrepareLRNInputsLayouts(OpKernelContext* context) {
-      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (!ingrad_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(ingrad_shape.GetCurLayout());
-      }
-
-      if (!inimage_in_mkl_format) {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&lt_output, in_dims, out_sizes, out_strides),
-            E_SUCCESS);
-      } else {
-        lt_output = static_cast<dnnLayout_t>(inimage_shape.GetCurLayout());
-      }
-    }
-
-    // convert input if needed
-    void MklPrepareLRNGradInput(OpKernelContext* context,
-                                Tensor* mkl_tmp_input_buf_tensor,
-                                Tensor* mkl_tmp_image_buf_tensor,
-                                Tensor* mkl_tmp_outimage_buf_tensor) {
-      const Tensor& in_grads = MklGetInput(context, 0);
-      const Tensor& in_image = MklGetInput(context, 1);
-      const Tensor& workspace = MklGetInput(
-          context,
-          3); /*Worskpsace is enabled, get the buffer to the workspace */
-
-      void* user_input = const_cast<void*>(
-          static_cast<const void*>(in_grads.flat<T>().data()));
-      void* user_fwd_input = const_cast<void*>(
-          static_cast<const void*>(in_image.flat<T>().data()));
-      void* workspace_buffer = const_cast<void*>(
-          static_cast<const void*>(workspace.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, lrn_bwd,
-                                                dnnResourceWorkspace),
-               E_SUCCESS);
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_bdw_input, lrn_bwd,
-                                                dnnResourceDiffDst),
-               E_SUCCESS);
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_bwd,
-                                                dnnResourceSrc),
-               E_SUCCESS);
-
-      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
-      if (ingrad_in_mkl_format) {
-        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
-          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
-                         &res_lrn_bwd[dnnResourceDiffDst]);
-          ingrad_shape.GetConvertedFlatData(lt_bdw_input, user_input,
-                                            res_lrn_bwd[dnnResourceDiffDst]);
-        } else {
-          res_lrn_bwd[dnnResourceDiffDst] = user_input;
-        }
-      } else {
-        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
-          CHECK_EQ(
-              dnnConversionCreate_F32(&convert_input, lt_input, lt_bdw_input),
-              E_SUCCESS);
-
-          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
-                         &res_lrn_bwd[dnnResourceDiffDst]);
-          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_input,
-                                            res_lrn_bwd[dnnResourceDiffDst]),
-                   E_SUCCESS);
-          dnnDelete_F32(convert_input);
-        } else {
-          res_lrn_bwd[dnnResourceDiffDst] = user_input;
-        }
-      }
-
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (inimage_in_mkl_format) {
-        if (!dnnLayoutCompare_F32(
-                lt_internal_input,
-                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
-          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
-                         &res_lrn_bwd[dnnResourceSrc]);
-          ingrad_shape.GetConvertedFlatData(lt_internal_input, user_fwd_input,
-                                            res_lrn_bwd[dnnResourceSrc]);
-        } else {
-          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-        }
-      } else {
-        if (!dnnLayoutCompare_F32(
-                lt_internal_input,
-                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
-          CHECK_EQ(dnnConversionCreate_F32(
-                       &convert_input,
-                       static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()),
-                       lt_internal_input),
-                   E_SUCCESS);
-
-          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
-                         &res_lrn_bwd[dnnResourceSrc]);
-          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_fwd_input,
-                                            res_lrn_bwd[dnnResourceSrc]),
-                   E_SUCCESS);
-          dnnDelete_F32(convert_input);
-        } else {
-          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-        }
-      }
-
-      res_lrn_bwd[dnnResourceWorkspace] = workspace_buffer;
-    }
-
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context) {
-      Tensor in_grads;
-      Tensor in_image;
-      Tensor out_image;
-
-      GetMklShape(context, 0, &ingrad_shape);
-      GetMklShape(context, 1, &inimage_shape);
-      GetMklShape(context, 2, &outimage_shape);
-
-      if (ingrad_shape.IsMklTensor()) {
-        in_grads =
-            ConvertMklToTF<T>(context, MklGetInput(context, 0), ingrad_shape);
-      } else {
-        in_grads = MklGetInput(context, 0);
-      }
-
-      if (inimage_shape.IsMklTensor()) {
-        in_image =
-            ConvertMklToTF<T>(context, MklGetInput(context, 1), inimage_shape);
-      } else {
-        in_image = MklGetInput(context, 1);
-      }
-
-      if (outimage_shape.IsMklTensor()) {
-        out_image =
-            ConvertMklToTF<T>(context, MklGetInput(context, 2), outimage_shape);
-      } else {
-        out_image = MklGetInput(context, 2);
-      }
-
-      const int64 batch = static_cast<int64>(in_grads.dim_size(0));
-      const int64 rows = static_cast<int64>(in_grads.dim_size(1));
-      const int64 cols = static_cast<int64>(in_grads.dim_size(2));
-      const int64 depth = static_cast<int64>(in_grads.dim_size(3));
-      const auto nodes = cols * rows;
-
-      auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth});
-
-      auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth});
-      auto activations = out_image.shaped<T, 2>({nodes * batch, depth});
-
-      Tensor* output;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, 0, &output, in_grads.shape(),
-                                mkl_output_mkl_shape);
-
-      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
-      out_shaped.setZero();
-      auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
-                    depth](int64 begin, int64 end) {
-        for (int64 i = begin; i < end; ++i) {
-          for (int64 j = 0; j < depth; ++j) {
-            int64 depth_begin = std::max<int64>(0, j - depth_radius_);
-            int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
-
-            T norm(0);
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              norm += in_shaped(i, k) * in_shaped(i, k);
-            }
-            norm = alpha_ * norm + bias_;
-            DCHECK_GT(norm, T(1e-6));
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
-                      activations(i, j) / norm;
-              if (k == j) {
-                dyi += Eigen::numext::pow(norm, -beta_);
-              }
-              dyi *= grads_shaped(i, j);
-              const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) +=
-                  dyi;
-            }
-          }
-        }
-      };
-      auto worker_threads =
-          *(context->device()->tensorflow_cpu_worker_threads());
-      Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
-            depth * depth, shard);
-    }
-
-    // release mkl resources
-    void Mklcleanup() {
-      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (!ingrad_in_mkl_format) {
-        CHECK_EQ(dnnLayoutDelete_F32(lt_input), E_SUCCESS);
-      }
-
-      if (!inimage_in_mkl_format) {
-        CHECK_EQ(dnnLayoutDelete_F32(lt_output), E_SUCCESS);
-      }
-      dnnDelete_F32(lrn_bwd);
-      dnnLayoutDelete_F32(lt_bdw_input);
-      dnnLayoutDelete_F32(lt_workspace);
-    }
-  } MklLRNGradOpContext;
-
-  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
-  bool workspace_enabled_;
-  int depth_radius_;
-  float bias_;
-  float alpha_;
-  float beta_;
-};
-
-#else
-
 template <typename T>
 class MklLRNOp : public OpKernel {
  public:
@@ -847,7 +175,6 @@ class MklLRNOp : public OpKernel {
                             MklDnnData<T>* src_dnn_data,
                             MklDnnData<T>* dst_dnn_data,
                             MklDnnData<uint8>* wksp_dnn_data = nullptr) {
-
     // Check for input reorder
     src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc());
 
@@ -1160,7 +487,6 @@ class MklLRNGradOp : public OpKernel {
       MklDnnData<T>* output_diff_src,
       const memory::primitive_desc& target_diff_dst_pd,
       const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
-
     // Check for input reordering on the diff dst input
     input_gradient_diff_dst->CheckReorderToOpMem(
         lrn_bkwd_desc.diff_dst_primitive_desc());
@@ -1345,8 +671,6 @@ class MklLRNGradOp : public OpKernel {
   float beta_;
 };
 
-#endif  // INTEL_MKL_ML_ONLY
-
 #define REGISTER_MKL_LRN_CPU(T)                                     \
   REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 256d48f4d5d56995fbca31c18cf29c902831679b..cb494f6c3ec75d36bad42669fd0addcfa31b8bf7 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -520,7 +520,6 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
 
       MklDnnData<T> dnn_data_input(&cpu_engine);
       MklDnnData<T> dnn_data_output(&cpu_engine);
-      MklDnnData<uint8> dnn_data_wksp(&cpu_engine);
 
       // initialize variables for the pooling op
       MklPoolParameters pool_params;
@@ -550,13 +549,13 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
           dnn_shape_input.IsMklTensor()
               ? dnn_shape_input.GetMklLayout()
               : is_pool2d ? memory::desc(
-                               TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
-                                                         this->data_format_tf_),
-                               MklDnnType<T>(), this->data_format_mkldnn_)
-                         : memory::desc(
-                               TFShapeToMklDnnDimsInNCDHW(
-                                   input_tensor_shape, this->data_format_tf_),
-                               MklDnnType<T>(), this->data_format_mkldnn_);
+                                TFShapeToMklDnnDimsInNCHW(
+                                    input_tensor_shape, this->data_format_tf_),
+                                MklDnnType<T>(), this->data_format_mkldnn_)
+                          : memory::desc(
+                                TFShapeToMklDnnDimsInNCDHW(
+                                    input_tensor_shape, this->data_format_tf_),
+                                MklDnnType<T>(), this->data_format_mkldnn_);
 
       // Get src/filter/stride/padding information
       memory::dims src_dims =
@@ -564,17 +563,24 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
               ? dnn_shape_input.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(input_tensor.shape(),
                                                       this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(),
-                                                      this->data_format_tf_);
+                          : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(),
+                                                       this->data_format_tf_);
       memory::dims filter_dims, strides, padding_left, padding_right;
       this->PoolParamsToDims(&pool_params, &filter_dims, &strides,
                              &padding_left, &padding_right, is_pool2d);
 
       // Get a pooling op from the cached pool
       MklPoolingFwdPrimitive<T>* pooling_fwd = nullptr;
+      prop_kind pooling_prop_kind;
+      bool int8_forward_inference =
+          std::is_same<T, qint8>::value || std::is_same<T, quint8>::value;
+      if (int8_forward_inference)
+        pooling_prop_kind = prop_kind::forward_inference;
+      else
+        pooling_prop_kind = prop_kind::forward_training;
       MklPoolingParams fwdParams(src_dims, output_dims_mkl_order, filter_dims,
                                  strides, padding_left, padding_right,
-                                 algorithm::pooling_max);
+                                 algorithm::pooling_max, pooling_prop_kind);
       pooling_fwd = MklPoolingFwdPrimitiveFactory<T>::Get(fwdParams);
 
       // allocate output tensor
@@ -586,10 +592,6 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
                                 pooling_fwd->GetDstMemoryFormat(),
                                 output_tensor);
 
-      AllocateWorkspaceTensor(context, *(pooling_fwd->GetPoolingFwdPd()),
-                              &dnn_data_wksp);
-      OP_REQUIRES_OK(context, context->status());
-
       // check wehther we need to reorder src
       const T* src_data = input_tensor.flat<T>().data();
       if (input_md.data.format != pooling_fwd->GetSrcMemoryFormat()) {
@@ -603,10 +605,39 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       }
 
       T* dst_data = output_tensor->flat<T>().data();
-      void* ws_data = dnn_data_wksp.GetOpMem().get_data_handle();
 
-      // execute pooling op
-      pooling_fwd->Execute(src_data, dst_data, ws_data);
+      if (int8_forward_inference) {
+        // Execute pooling op
+        pooling_fwd->Execute(src_data, dst_data);
+
+        // pass min, max from input to output
+        const Tensor& min_input_t = MklGetInput(context, 1);
+        const Tensor& max_input_t = MklGetInput(context, 2);
+        const float min_input = min_input_t.flat<float>()(0);
+        const float max_input = max_input_t.flat<float>()(0);
+
+        Tensor* output_min = nullptr;
+        Tensor* output_max = nullptr;
+        MklDnnShape output_min_mkl_shape, output_max_mkl_shape;
+        output_min_mkl_shape.SetMklTensor(false);
+        output_max_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 1, &output_min, {},
+                                  output_min_mkl_shape);
+        AllocateOutputSetMklShape(context, 2, &output_max, {},
+                                  output_max_mkl_shape);
+        output_min->flat<float>()(0) = min_input;
+        output_max->flat<float>()(0) = max_input;
+      } else {
+        MklDnnData<uint8> dnn_data_wksp(&cpu_engine);
+        AllocateWorkspaceTensor(context, *(pooling_fwd->GetPoolingFwdPd()),
+                                &dnn_data_wksp);
+        OP_REQUIRES_OK(context, context->status());
+        T* ws_data =
+            static_cast<T*>(dnn_data_wksp.GetOpMem().get_data_handle());
+
+        // execute pooling op
+        pooling_fwd->Execute(src_data, dst_data, ws_data);
+      }
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                          ", message: " + string(e.message) + ", in file " +
@@ -684,24 +715,25 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
           orig_input_mkl_shape.IsMklTensor()
               ? orig_input_mkl_shape.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(orig_input_shape,
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(orig_input_shape,
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(orig_input_shape,
+                                                       this->data_format_tf_);
 
       memory::dims diff_dst_dims =
           grad_mkl_shape.IsMklTensor()
               ? grad_mkl_shape.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(),
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(),
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(),
+                                                       this->data_format_tf_);
 
       memory::dims output_dims_mkl_order;
       this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
       MklPoolingParams bwdParams(
           orig_input_dims_mkl_order, output_dims_mkl_order, filter_dims,
-          strides, padding_left, padding_right, algorithm::pooling_max);
+          strides, padding_left, padding_right, algorithm::pooling_max,
+          prop_kind::forward_training);
       MklPoolingBwdPrimitive<T>* pooling_bwd =
           MklPoolingBwdPrimitiveFactory<T>::Get(bwdParams);
 
@@ -788,39 +820,38 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
                          const MklDnnShape& workspace_mkl_shape) {
     if (!orig_input_mkl_shape.IsMklTensor()) {
       OP_REQUIRES(context, orig_input_tensor.dims() == 4,
-                  errors::InvalidArgument("Original input shape must be "
-                                          "4-dimensional"));
+                  errors::InvalidArgument(
+                      "Original input shape must be 4-dimensional"));
     } else {
       OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
-                  errors::InvalidArgument("Original input shape must be "
-                                          "4-dimensional"));
+                  errors::InvalidArgument(
+                      "Original input shape must be 4-dimensional"));
     }
     if (!orig_output_mkl_shape.IsMklTensor()) {
-      OP_REQUIRES(context, orig_output_tensor.dims() == 4,
-                  errors::InvalidArgument("Original output must be "
-                                          "4-dimensional"));
+      OP_REQUIRES(
+          context, orig_output_tensor.dims() == 4,
+          errors::InvalidArgument("Original output must be 4-dimensional"));
     } else {
-      OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
-                  errors::InvalidArgument("Original output must be "
-                                          "4-dimensional"));
+      OP_REQUIRES(
+          context, orig_output_mkl_shape.GetDimension() == 4,
+          errors::InvalidArgument("Original output must be 4-dimensional"));
     }
     if (!grad_mkl_shape.IsMklTensor()) {
       OP_REQUIRES(context, grad_tensor.dims() == 4,
                   errors::InvalidArgument("Gradient must be 4-dimensional"));
     } else {
       OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
-                  errors::InvalidArgument("Gradient must be "
-                                          "4-dimensional"));
+                  errors::InvalidArgument("Gradient must be 4-dimensional"));
     }
     if (this->workspace_enabled_) {
       // The workspace should not be an MKL tensor
       OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
-                  errors::InvalidArgument("Workspace tensor should not"
-                                          " be an MKL Tensor."));
+                  errors::InvalidArgument(
+                      "Workspace tensor should not be an MKL Tensor."));
       // It should only have one dimension
-      OP_REQUIRES(context, workspace_tensor.dims() == 1,
-                  errors::InvalidArgument("Workspace tensor must be "
-                                          "1-dimensional"));
+      OP_REQUIRES(
+          context, workspace_tensor.dims() == 1,
+          errors::InvalidArgument("Workspace tensor must be 1-dimensional"));
     } else {
       OP_REQUIRES(
           context, this->workspace_enabled_,
@@ -852,6 +883,18 @@ REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Label(mkl_op_registry::kMklOpLabel),
                         MklMaxPoolingOp<CPUDevice, float>);
 
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedMaxPool")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("T")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklMaxPoolingOp<CPUDevice, quint8>);
+
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedMaxPool")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint8>("T")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklMaxPoolingOp<CPUDevice, qint8>);
+
 REGISTER_KERNEL_BUILDER(Name("_MklMaxPoolGrad")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index 5398e6113f53ecc0516dd87d0148eae63b1aae10..dc84d3941e78a2232041b2dbcf83bf3545982dee 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -41,28 +41,33 @@ void MklPoolingFwdPrimitive<T>::Setup(const MklPoolingParams& fwdParams) {
       << "Pooling algorithm kind is not supported";
 
   context_.alg_kind = fwdParams.alg_kind;
+  context_.prop_kind = fwdParams.prop_kind;
+
   // create memory desc
   // FIXME: Pooling doesn't expose to get the src_primitive_desc,
   //        so src format is currently hard-coded.
   //        A utility function is used to do this,
   //        which may be broken with future CPU architectures
   bool is_2d = (fwdParams.src_dims.size() == 4);
-  context_.src_md.reset(
-      new memory::desc({fwdParams.src_dims}, MklDnnType<T>(),
-                       get_desired_format(fwdParams.src_dims[1], is_2d)));
+  if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value)
+    context_.src_fmt = is_2d ? memory::format::nhwc : memory::format::ndhwc;
+  else
+    context_.src_fmt = get_desired_format(fwdParams.src_dims[1], is_2d);
+
+  context_.src_md.reset(new memory::desc({fwdParams.src_dims}, MklDnnType<T>(),
+                                         context_.src_fmt));
   context_.dst_md.reset(new memory::desc({fwdParams.dst_dims}, MklDnnType<T>(),
                                          memory::format::any));
 
   // create a pooling descriptor
   context_.fwd_desc.reset(new pooling_forward::desc(
-      prop_kind::forward_training, fwdParams.alg_kind, *context_.src_md,
+      fwdParams.prop_kind, fwdParams.alg_kind, *context_.src_md,
       *context_.dst_md, fwdParams.strides, fwdParams.filter_dims,
       fwdParams.padding_left, fwdParams.padding_right, padding_kind::zero));
   context_.fwd_pd.reset(
       new pooling_forward::primitive_desc(*context_.fwd_desc, cpu_engine_));
 
   // store expected primitive format
-  context_.src_fmt = get_desired_format(fwdParams.src_dims[1], is_2d);
   context_.dst_fmt = static_cast<mkldnn::memory::format>(
       context_.fwd_pd.get()->dst_primitive_desc().desc().data.format);
 
@@ -74,7 +79,8 @@ void MklPoolingFwdPrimitive<T>::Setup(const MklPoolingParams& fwdParams) {
       new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData));
 
   // for max pooling, need to return workspace(ws) for backward computing
-  if (fwdParams.alg_kind == pooling_max) {
+  if (fwdParams.alg_kind == pooling_max &&
+      fwdParams.prop_kind == prop_kind::forward_training) {
     auto ws_pd = context_.fwd_pd.get()->workspace_primitive_desc().desc().data;
     // store workspace's dims and format to create workspace tensor
     context_.ws_fmt = static_cast<mkldnn::memory::format>(ws_pd.format);
@@ -101,7 +107,9 @@ void MklPoolingFwdPrimitive<T>::Execute(const T* src_data, T* dst_data,
   context_.src_mem->set_data_handle(
       static_cast<void*>(const_cast<T*>(src_data)));
   context_.dst_mem->set_data_handle(static_cast<void*>(dst_data));
-  if (context_.alg_kind == pooling_max) {  // max pooling must have ws
+  if (context_.alg_kind == pooling_max &&
+      context_.prop_kind ==
+          prop_kind::forward_training) {  // max pooling must have ws
     DCHECK(ws_data != nullptr);
     context_.ws_mem->set_data_handle(ws_data);
   }
@@ -110,13 +118,17 @@ void MklPoolingFwdPrimitive<T>::Execute(const T* src_data, T* dst_data,
   // set back data handle
   context_.src_mem->set_data_handle(DummyData);
   context_.dst_mem->set_data_handle(DummyData);
-  if (context_.alg_kind == pooling_max) {  // max pooling must have ws
+  if (context_.alg_kind == pooling_max &&
+      context_.prop_kind ==
+          prop_kind::forward_training) {  // max pooling must have ws
     DCHECK(ws_data != nullptr);
     context_.ws_mem->set_data_handle(DummyData);
   }
 }
 
 template class MklPoolingFwdPrimitive<float>;
+template class MklPoolingFwdPrimitive<quint8>;
+template class MklPoolingFwdPrimitive<qint8>;
 
 template <typename T>
 void MklPoolingBwdPrimitive<T>::Setup(const MklPoolingParams& bwdParams) {
@@ -143,7 +155,7 @@ void MklPoolingBwdPrimitive<T>::Setup(const MklPoolingParams& bwdParams) {
   // create a forward primitive,
   // which will be used as a hint for creating backward primitive
   context_.fwd_desc.reset(new pooling_forward::desc(
-      prop_kind::forward_training, bwdParams.alg_kind, *context_.diff_src_md,
+      bwdParams.prop_kind, bwdParams.alg_kind, *context_.diff_src_md,
       *context_.diff_dst_md, bwdParams.strides, bwdParams.filter_dims,
       bwdParams.padding_left, bwdParams.padding_right, padding_kind::zero));
   context_.fwd_pd.reset(
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 49f799d7ba2d28bf90bbb4ebd5ada33f0e5d620e..6e42b70d14919f7a15ace0dd9035b4fd57a82a76 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #ifdef INTEL_MKL
 #include <memory>
-#include <vector>
 #include <string>
+#include <vector>
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
@@ -50,18 +50,20 @@ struct MklPoolingParams {
   memory::dims padding_left;
   memory::dims padding_right;
   mkldnn::algorithm alg_kind;
+  mkldnn::prop_kind prop_kind;
 
   MklPoolingParams(memory::dims src_dims, memory::dims dst_dims,
                    memory::dims filter_dims, memory::dims strides,
                    memory::dims padding_left, memory::dims padding_right,
-                   mkldnn::algorithm alg_kind)
+                   mkldnn::algorithm alg_kind, mkldnn::prop_kind prop_kind)
       : src_dims(src_dims),
         dst_dims(dst_dims),
         filter_dims(filter_dims),
         strides(strides),
         padding_left(padding_left),
         padding_right(padding_right),
-        alg_kind(alg_kind) {}
+        alg_kind(alg_kind),
+        prop_kind(prop_kind) {}
 };
 
 template <typename T>
@@ -97,6 +99,9 @@ class MklPoolingFwdPrimitive : public MklPrimitive {
     // algorithm
     mkldnn::algorithm alg_kind;
 
+    // Kind of propagation, forward or backward
+    mkldnn::prop_kind prop_kind;
+
     // expected memory format
     memory::format src_fmt;
     memory::format dst_fmt;
@@ -187,6 +192,7 @@ class MklPoolingFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
     key_creator.AddAsKey(fwdParams.padding_left);
     key_creator.AddAsKey(fwdParams.padding_right);
     key_creator.AddAsKey<int>(static_cast<int>(fwdParams.alg_kind));
+    key_creator.AddAsKey<int>(static_cast<int>(fwdParams.prop_kind));
     return key_creator.GetKey();
   }
 
@@ -443,7 +449,12 @@ class MklPoolingOpBase : public OpKernel {
   explicit MklPoolingOpBase(OpKernelConstruction* context)
       : OpKernel(context), workspace_enabled_(false) {
     string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value) {
+      // current quantized convolution doesn't have data_format attribute.
+      data_format = "NHWC";
+    } else {
+      OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    }
     OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_),
                 errors::InvalidArgument("Invalid data format"));
     OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
@@ -461,7 +472,7 @@ class MklPoolingOpBase : public OpKernel {
     bool is_pool2d = (this->ksize_.size() == 4);
     this->data_format_mkldnn_ =
         is_pool2d ? TFDataFormatToMklDnnDataFormat(this->data_format_tf_)
-                 : TFDataFormatToMklDnn3DDataFormat(this->data_format_tf_);
+                  : TFDataFormatToMklDnn3DDataFormat(this->data_format_tf_);
 
     // We may not get this attribute for this node if it does not go through
     // graph rewrite pass. So we do not check for error while retrieving this
@@ -655,10 +666,11 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
       OP_REQUIRES(context, input_tensor.dims() == 4 || input_tensor.dims() == 5,
                   errors::InvalidArgument("Input must be 4 or 5-dimensional"));
     } else {
-      OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4 ||
-                               input_mkl_shape.GetDimension() == 5,
-                  errors::InvalidArgument("Input shape must be "
-                                          "4 or 5-dimensional"));
+      OP_REQUIRES(
+          context,
+          input_mkl_shape.GetDimension() == 4 ||
+              input_mkl_shape.GetDimension() == 5,
+          errors::InvalidArgument("Input shape must be 4 or 5-dimensional"));
     }
   }
   // .Input("value: T")
diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..10825f696253cc6d38bbdee1e6b660d494c34088
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
@@ -0,0 +1,55 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_MKL_QUANTIZED_CONV_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_MKL_QUANTIZED_CONV_OPS_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor.h"
+
+#ifdef INTEL_MKL
+
+namespace tensorflow {
+template <class T>
+float MklFloatForOneQuantizedLevel(float range_min, float range_max) {
+  const int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
+  const int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
+  const float float_for_one_quantized_level =
+      (range_max - range_min) / (highest - lowest);
+  return float_for_one_quantized_level;
+}
+
+template <class T1, class T2, class T3>
+void MklQuantizationRangeForMultiplication(float min_a, float max_a,
+                                           float min_b, float max_b,
+                                           float* min_c, float* max_c) {
+  const float a_float_for_one_quant_level =
+      MklFloatForOneQuantizedLevel<T1>(min_a, max_a);
+  const float b_float_for_one_quant_level =
+      MklFloatForOneQuantizedLevel<T2>(min_b, max_b);
+
+  const int64 c_highest = static_cast<int64>(Eigen::NumTraits<T3>::highest());
+  const int64 c_lowest = static_cast<int64>(Eigen::NumTraits<T3>::lowest());
+  const float c_float_for_one_quant_level =
+      a_float_for_one_quant_level * b_float_for_one_quant_level;
+
+  *min_c = c_float_for_one_quant_level * c_lowest;
+  *max_c = c_float_for_one_quant_level * c_highest;
+}
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
+
+#endif  // TENSORFLOW_CORE_KERNELS_MKL_QUANTIZED_CONV_OPS_H_
diff --git a/tensorflow/core/kernels/mkl_quantized_pooling_ops_test.cc b/tensorflow/core/kernels/mkl_quantized_pooling_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c1e32d6e35326273cfdd070ca8197e30b8ea7f9
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_quantized_pooling_ops_test.cc
@@ -0,0 +1,201 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+// Helper class for converting MKL tensors to TF tensors and comparing to
+// expected values
+
+static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static const TensorShape dummy_shape({8});
+
+class ConvMklToTF : public OpsTestBase {
+ public:
+  template <typename T>
+  void ConvertMKL2TF(DataType dtype, const Tensor& first, const Tensor& second,
+                     Tensor& output) {
+    // Create an MKL to TF conversion node and execute it
+    TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(DT_UINT8))  // Mkl second tensor
+                     .Attr("T", dtype)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    AddInputFromArray<T>(first.shape(), first.flat<T>());
+    AddInputFromArray<uint8>(second.shape(), second.flat<uint8>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    output = *GetOutput(0);
+  }
+  void TestBody(){};
+};
+
+class QuantizedPoolingTest : public OpsTestBase {};
+
+TEST_F(QuantizedPoolingTest, SmallAveragePooling) {
+  const int ksize = 2;
+  const int stride = 2;
+  TF_ASSERT_OK(NodeDefBuilder("quantized_avg_pool_op", "_MklQuantizedAvgPool")
+                   .Input(FakeInput(DT_QUINT8))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("ksize", {1, ksize, ksize, 1})
+                   .Attr("strides", {1, stride, stride, 1})
+                   .Attr("padding", "SAME")
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const float input_min = 0.0f;
+  const float input_max = 255.0f;
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+  Tensor input_float(DT_FLOAT, {1, input_height, input_width, input_channels});
+  test::FillValues<float>(
+      &input_float,
+      {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  Tensor input_quantized =
+      FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+
+  const int expected_width = input_width / stride;
+  const int expected_height = input_height / stride;
+
+  // The input pools we are averaging. (NHWC input, quantized.)
+  //    0th channel       1st channel
+  //    1  3 |  5  7      2  4 |  6  8
+  //    9 11 | 13 15     10 12 | 14 16
+  //   -------------     -------------
+  //   17 19 | 21 23     18 20 | 22 24
+  //   25 27 | 29 31     26 28 | 30 32
+  Tensor expected_float(DT_FLOAT,
+                        {1, expected_height, expected_width, input_channels});
+  test::FillValues<float>(&expected_float, {6, 7, 10, 11, 22, 23, 26, 27});
+
+  AddInputFromArray<quint8>(input_quantized.shape(),
+                            input_quantized.flat<quint8>());
+  AddInputFromArray<float>(TensorShape({1}), {input_min});
+  AddInputFromArray<float>(TensorShape({1}), {input_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  const Tensor& output = *GetOutput(0);
+  const Tensor& mkl_shape_tensor = *GetOutput(3);
+  ConvMklToTF conv_comp;
+  Tensor output_quantized;
+  conv_comp.ConvertMKL2TF<quint8>(DT_QUINT8, output, mkl_shape_tensor,
+                                  output_quantized);
+
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+
+  test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedPoolingTest, SmallMaxPooling) {
+  const int ksize = 2;
+  const int stride = 2;
+  TF_ASSERT_OK(NodeDefBuilder("quantized_max_pool_op", "_MklQuantizedMaxPool")
+                   .Input(FakeInput(DT_QUINT8))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("ksize", {1, ksize, ksize, 1})
+                   .Attr("strides", {1, stride, stride, 1})
+                   .Attr("padding", "SAME")
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const float input_min = 0.0f;
+  const float input_max = 255.0f;
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+  Tensor input_float(DT_FLOAT, {1, input_height, input_width, input_channels});
+  test::FillValues<float>(
+      &input_float,
+      {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  Tensor input_quantized =
+      FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+  const int expected_width = input_width / stride;
+  const int expected_height = input_height / stride;
+
+  // The max is computed from these input pools. (NHWC input, quantized.)
+  //    0th channel       1st channel
+  //    1  3 |  5  7      2  4 |  6  8
+  //    9 11 | 13 15     10 12 | 14 16
+  //   -------------     -------------
+  //   17 19 | 21 23     18 20 | 22 24
+  //   25 27 | 29 31     26 28 | 30 32
+
+  Tensor expected_float(DT_FLOAT,
+                        {1, expected_height, expected_width, input_channels});
+  test::FillValues<float>(&expected_float, {11, 12, 15, 16, 27, 28, 31, 32});
+  AddInputFromArray<quint8>(input_quantized.shape(),
+                            input_quantized.flat<quint8>());
+  AddInputFromArray<float>(TensorShape({1}), {input_min});
+  AddInputFromArray<float>(TensorShape({1}), {input_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  const Tensor& output = *GetOutput(0);
+  const Tensor& mkl_shape_tensor = *GetOutput(3);
+  ConvMklToTF conv_comp;
+  Tensor output_quantized;
+  conv_comp.ConvertMKL2TF<quint8>(DT_QUINT8, output, mkl_shape_tensor,
+                                  output_quantized);
+
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+
+  test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+}  // namespace tensorflow
+#endif
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 84385356e11d4a74bc601bda94ae7a95c6c0e3cf..708213648b48e2dfbbfe9a63851428aa97c72b64 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "mkldnn.hpp"
 
 using mkldnn::algorithm;
+using mkldnn::eltwise_bounded_relu;
 using mkldnn::eltwise_elu;
 using mkldnn::eltwise_relu;
 using mkldnn::eltwise_tanh;
@@ -35,7 +36,6 @@ using mkldnn::prop_kind;
 using mkldnn::relu_backward;
 using mkldnn::relu_forward;
 using mkldnn::stream;
-using mkldnn::memory;
 #else
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
@@ -785,7 +785,8 @@ class MklReluOpBase : public OpKernel {
  public:
   ~MklReluOpBase() {}
 
-  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit MklReluOpBase(OpKernelConstruction* context, float alpha, float beta)
+      : OpKernel(context), alpha_(alpha), beta_(beta) {}
   virtual void Compute_Scalar(OpKernelContext* context) = 0;
 
   void Compute(OpKernelContext* context) override {
@@ -815,10 +816,9 @@ class MklReluOpBase : public OpKernel {
         src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
       }
 
-      T alpha = 0, beta = 0;
-
       // get a eltwise fwd from primitive pool
-      MklEltwiseFwdParams<T> fwdParams(src_dims, src_md, alg_kind, alpha, beta);
+      MklEltwiseFwdParams<T> fwdParams(src_dims, src_md, alg_kind, alpha_,
+                                       beta_);
       MklEltwiseFwdPrimitive<T>* eltwise_fwd =
           MklEltwiseFwdPrimitiveFactory<T>::Get(fwdParams);
 
@@ -879,6 +879,8 @@ class MklReluOpBase : public OpKernel {
  private:
   engine cpu_engine = engine(engine::cpu, 0);
   std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+  float alpha_;
+  float beta_;
 };
 
 template <typename Device, typename T, algorithm alg_kind>
@@ -886,9 +888,9 @@ class MklReluGradOpBase : public OpKernel {
  public:
   ~MklReluGradOpBase() {}
 
-  explicit MklReluGradOpBase(OpKernelConstruction* context)
-      : OpKernel(context) {
-  }
+  explicit MklReluGradOpBase(OpKernelConstruction* context, float alpha,
+                             float beta)
+      : OpKernel(context), alpha_(alpha), beta_(beta) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) = 0;
 
@@ -958,8 +960,6 @@ class MklReluGradOpBase : public OpKernel {
         src_dims = dnn_shape_src.GetSizesAsMklDnnDims();
       }
 
-      T alpha = 0, beta = 0;
-
       // As per comment above, we tell MKLDNN that both the inputs are in same
       // format. So we set common memory descriptor in MKL format, if any of the
       // inputs are in MKL format. Let's get memory descriptor that we will use
@@ -973,8 +973,8 @@ class MklReluGradOpBase : public OpKernel {
         common_md = src_md;
       }
 
-      MklEltwiseBwdParams<T> bwdParams(src_dims, common_md, alg_kind, alpha,
-                                       beta);
+      MklEltwiseBwdParams<T> bwdParams(src_dims, common_md, alg_kind, alpha_,
+                                       beta_);
       MklEltwiseBwdPrimitive<T>* eltwise_bwd =
           MklEltwiseBwdPrimitiveFactory<T>::Get(bwdParams);
       auto eltwise_bwd_pd = eltwise_bwd->GetEltwiseBwdPd();
@@ -1023,7 +1023,8 @@ class MklReluGradOpBase : public OpKernel {
       }
 
       OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                  {diff_dst_index}, diff_src_index,
+                                  {static_cast<const int>(diff_dst_index)},
+                                  static_cast<const int>(diff_src_index),
                                   tf_shape_diff_src, &diff_src_tensor));
       AllocateOutputSetMklShape(context, diff_src_index, dnn_shape_diff_src);
 
@@ -1044,6 +1045,8 @@ class MklReluGradOpBase : public OpKernel {
  private:
   engine cpu_engine = engine(engine::cpu, 0);
   std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+  float alpha_;
+  float beta_;
 };
 
 template <typename Device, typename T>
@@ -1052,7 +1055,7 @@ class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
   ~MklReluOp() {}
 
   explicit MklReluOp(OpKernelConstruction* context)
-      : MklReluOpBase<Device, T, eltwise_relu>(context) {}
+      : MklReluOpBase<Device, T, eltwise_relu>(context, 0.0f, 0.0f) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -1081,7 +1084,7 @@ class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
   ~MklReluGradOp() {}
 
   explicit MklReluGradOp(OpKernelConstruction* context)
-      : MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
+      : MklReluGradOpBase<Device, T, eltwise_relu>(context, 0.0f, 0.0f) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
@@ -1115,7 +1118,7 @@ class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
   ~MklEluOp() {}
 
   explicit MklEluOp(OpKernelConstruction* context)
-      : MklReluOpBase<Device, T, eltwise_elu>(context) {}
+      : MklReluOpBase<Device, T, eltwise_elu>(context, 0.0f, 0.0f) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -1148,7 +1151,7 @@ class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
   ~MklEluGradOp() {}
 
   explicit MklEluGradOp(OpKernelConstruction* context)
-      : MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
+      : MklReluGradOpBase<Device, T, eltwise_elu>(context, 0.0f, 0.0f) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
@@ -1187,7 +1190,7 @@ class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
   ~MklTanhOp() {}
 
   explicit MklTanhOp(OpKernelConstruction* context)
-      : MklReluOpBase<Device, T, eltwise_tanh>(context) {}
+      : MklReluOpBase<Device, T, eltwise_tanh>(context, 0.0f, 0.0f) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t src_index = 0;  // index of src input tensor
@@ -1219,7 +1222,7 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
   ~MklTanhGradOp() {}
 
   explicit MklTanhGradOp(OpKernelConstruction* context)
-      : MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
+      : MklReluGradOpBase<Device, T, eltwise_tanh>(context, 0.0f, 0.0f) {}
 
   virtual void Compute_Scalar(OpKernelContext* context) {
     const size_t diff_dst_index = 0;  // index of diff_dst input tensor
@@ -1251,6 +1254,70 @@ class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
   }
 };
 
+#define RELU6_UPPER_BOUND 6.0f
+template <typename Device, typename T>
+class MklRelu6Op : public MklReluOpBase<Device, T, eltwise_bounded_relu> {
+ public:
+  ~MklRelu6Op() {}
+
+  explicit MklRelu6Op(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_bounded_relu>(
+            context, RELU6_UPPER_BOUND, 0.0f) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    T* user_i = const_cast<T*>(src_tensor.flat<T>().data());
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    T* out_o = dst_tensor->flat<T>().data();
+    out_o[0] = std::min(std::max(user_i[0], static_cast<T>(0)),
+                        static_cast<T>(RELU6_UPPER_BOUND));
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklRelu6GradOp
+    : public MklReluGradOpBase<Device, T, eltwise_bounded_relu> {
+ public:
+  ~MklRelu6GradOp() {}
+
+  explicit MklRelu6GradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_bounded_relu>(
+            context, RELU6_UPPER_BOUND, 0.0f) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    T* out_o = diff_src_tensor->flat<T>().data();
+    T* user_i = const_cast<T*>(src_tensor.flat<T>().data());
+    T* user_g = const_cast<T*>(diff_dst_tensor.flat<T>().data());
+    out_o[0] = user_g[0] * user_i[0] > 0 &&
+               (user_i[0] < static_cast<T>(RELU6_UPPER_BOUND));
+    return;
+  }
+};
+
 #endif
 
 // register dnn kernels for supported operations and supported types
@@ -1296,6 +1363,19 @@ TF_CALL_float(REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES);
                           MklTanhGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
 
+#define REGISTER_RELU6_MKL_SUPPORTED_KERNELS_TYPES(type)            \
+  REGISTER_KERNEL_BUILDER(Name("_MklRelu6")                         \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklRelu6Op<CPUDevice, type>);             \
+  REGISTER_KERNEL_BUILDER(Name("_MklRelu6Grad")                     \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklRelu6GradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_RELU6_MKL_SUPPORTED_KERNELS_TYPES);
+
 #endif
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index d9a7893a530a2c1b47d051d9f7ba5c096367c13a..342e2265ee8e51b5479969c634c0dd09d67a10ba 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -24,15 +24,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
-using mkldnn::stream;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-
 #include "tensorflow/core/util/mkl_util.h"
+using mkldnn::stream;
 
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
@@ -41,103 +35,6 @@ class MklReshapeOp : public OpKernel {
  public:
   explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-#ifdef INTEL_MKL_ML_ONLY
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input = MklGetInput(context, 0);
-    const Tensor& sizes = MklGetInput(context, 1);
-
-    // Preliminary validation of sizes.
-    OP_REQUIRES(context, IsLegacyVector(sizes.shape()),
-                errors::InvalidArgument("sizes input must be 1-D, not shape ",
-                                        sizes.shape().DebugString()));
-
-    // Compute the output shape.  Determine product of specified
-    // dimensions, and find the index of the unspecified one.
-    TensorShape shape;
-    int64 product = 1;
-    int unknown_index = -1;
-    switch (sizes.dtype()) {
-      case DT_INT32:
-        OP_REQUIRES_OK(context, ValidateSizes<int32>(sizes, &product,
-                                                     &unknown_index, &shape));
-        break;
-      case DT_INT64:
-        OP_REQUIRES_OK(context, ValidateSizes<int64>(sizes, &product,
-                                                     &unknown_index, &shape));
-        break;
-      default:
-        context->CtxFailure(errors::InvalidArgument(
-            "desired shape must be a DT_INT32 or DT_INT64 vector, not a ",
-            DataTypeString(sizes.dtype())));
-        return;
-    }
-    if (unknown_index != -1) {
-      OP_REQUIRES(
-          context, product > 0,
-          errors::InvalidArgument("Reshape cannot infer the missing input size "
-                                  "for an empty tensor unless all specified "
-                                  "input sizes are non-zero"));
-      const int64 missing = input.NumElements() / product;
-      OP_REQUIRES(
-          context, product * missing == input.NumElements(),
-          errors::InvalidArgument(
-              "Input to reshape is a tensor with ", input.NumElements(),
-              " values, but the requested shape requires a multiple of ",
-              product));
-      shape.set_dim(unknown_index, missing);
-    }
-    OP_REQUIRES(context, shape.num_elements() == input.NumElements(),
-                errors::InvalidArgument("Input to reshape is a tensor with ",
-                                        input.NumElements(),
-                                        " values, but the requested shape has ",
-                                        shape.num_elements()));
-
-    MklShape mkl_shape_input;
-    GetMklShape(context, 0, &mkl_shape_input);
-    bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
-    if (input_in_mkl_format) {
-      TensorShape& shape_to = shape;
-      TensorShape shape_from;
-      for (size_t i = 0; i < mkl_shape_input.GetDimension(); i++) {
-        // Outermost to innermost dimension
-        shape_from.AddDim(
-            mkl_shape_input.GetSizes()[mkl_shape_input.tf_dim_idx(i)]);
-      }
-
-      if (shape_from == shape_to) {
-        CopyMklTensorInToOut(context, 0, 0);
-        return;
-      } else {
-        // Allocate output tensor.
-        Tensor* output_tensor = NULL;
-        MklShape mkl_shape_output;
-        mkl_shape_output.SetMklTensor(false);
-        AllocateOutputSetMklShape(context, 0, &output_tensor, shape_to,
-                                  mkl_shape_output);
-
-        // Get output layout pointer.
-        dnnLayout_t output_layout =
-            static_cast<dnnLayout_t>(mkl_shape_input.GetTfLayout());
-
-        // Execute DNNConversion.
-        // Note: we  assume an MKL tensor always have float as its data type.
-        void* input_buffer =
-            static_cast<void*>(const_cast<float*>(input.flat<float>().data()));
-        void* output_buffer = static_cast<void*>(
-            const_cast<float*>(output_tensor->flat<float>().data()));
-        mkl_shape_input.GetConvertedFlatData(output_layout, input_buffer,
-                                             output_buffer);
-
-        VLOG(1) << "MKLToTFConversion complete successfully.";
-        return;
-      }
-    } else {
-      CopyTfTensorInToOutWithShape(context, 0, 0, shape);
-    }
-  }
-
-#else
-
  private:
   // When the input tensor is in MKL layout and we are reshaping the tensor to a
   // different shape than its actual shape, then we use MKLDNN reorder primitive
@@ -316,7 +213,6 @@ class MklReshapeOp : public OpKernel {
     }
   }
 
-#endif  // INTEL_MKL_ML_ONLY
 
  private:
   const int kInputSlotIdx = 0;
diff --git a/tensorflow/core/kernels/mkl_slice_op.cc b/tensorflow/core/kernels/mkl_slice_op.cc
index d63e14adf661e39924ec153cf7c76ab309b96f8b..85cabeb92b69653787ebeebd2eae4f17017063bc 100644
--- a/tensorflow/core/kernels/mkl_slice_op.cc
+++ b/tensorflow/core/kernels/mkl_slice_op.cc
@@ -327,7 +327,7 @@ class MklDnnSliceOp : public OpKernel {
       output_mkl_shape->SetTfLayout(input_mkl_shape.GetDimension(), output_dims,
                                     input_mkl_shape.GetTfDataFormat());
 
-      output_tf_shape.AddDim((output_pd->get_size() / sizeof(T)) + 1);
+      output_tf_shape.AddDim(output_pd->get_size() / sizeof(T));
     } else {
       // If input is not in Mkl layout, then output won't be in Mkl layout.
       output_mkl_shape->SetMklTensor(false);
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index cfab529662fd334b29181c3c895556eaa7867854..094129ae3efe87e070f8a27c8584f67c927bbec3 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -56,7 +56,7 @@ class MklSoftmaxOp : public OpKernel {
       MklDnnShape src_mkl_shape;
       GetMklShape(context, src_idx, &src_mkl_shape);
 
-      // src_dims is the dimenstion of src_tensor
+      // src_dims is the dimension of src_tensor
       // dim of the dst will also be same as src_dims
       auto src_tf_shape = src_mkl_shape.IsMklTensor()
                               ? src_mkl_shape.GetTfShape()
@@ -64,12 +64,12 @@ class MklSoftmaxOp : public OpKernel {
       auto src_dims = TFShapeToMklDnnDims(src_tf_shape);
       auto output_dims = src_dims;
       memory::format layout_type;
-      // In MKL, data format passed to mkl softmax op depends on dimension of the input tensor.
-      // Here "x" data format in MKL is used for 1 dim tensor, "nc" for 2 dim tensor, 
-      // "tnc" for 3 dim tensor, "nchw" for 4 dim tensor, and "ncdhw" for 5 dim tensor.
-      // Each of the simbols has the following meaning:
-      // n = batch, c = channels, t = sequence lenght, h = height,
-      // w = width, d = depth 
+      // In MKL, data format passed to mkl softmax op depends on dimension of
+      // the input tensor. Here "x" data format in MKL is used for 1 dim tensor,
+      // "nc" for 2 dim tensor, "tnc" for 3 dim tensor, "nchw" for 4 dim tensor,
+      // and "ncdhw" for 5 dim tensor. Each of the simbols has the following
+      // meaning: n = batch, c = channels, t = sequence length, h = height, w =
+      // width, d = depth
       switch (input_dims) {
         case 1:
           layout_type = memory::format::x;
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index 894c2e34e890ce4508a994d3eef4d4e2bc601fcf..665ec4c8079303219023ba4b7623a1db516e48f5 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -32,15 +32,9 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#ifdef INTEL_MKL_ML_ONLY
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
 #include "tensorflow/core/util/mkl_util.h"
 
-#ifndef INTEL_MKL_ML_ONLY
 using mkldnn::stream;
-#endif
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -64,7 +58,6 @@ class MklToTfOp : public OpKernel {
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
 
-#ifndef INTEL_MKL_ML_ONLY
   static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
                              string data_format_str, DataType op_data_type,
                              bool has_avx512f, uint input_number) {
@@ -111,8 +104,7 @@ class MklToTfOp : public OpKernel {
       // Do we need to reorder Mkl layout into TensorFlow layout?
       if (input.IsReorderNeeded(output_tf_pd)) {
         // Insert reorder between Mkl layout and TensorFlow layout.
-        CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, output_tensor),
-                 true);
+        CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, output_tensor), true);
       } else {
         // If not, just forward input tensor to output tensor.
         CHECK(output_tensor->CopyFrom(input_tensor, output_shape));
@@ -125,57 +117,6 @@ class MklToTfOp : public OpKernel {
                           __FILE__, ":", __LINE__));
     }
   }
-#else
-  static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
-                             string data_format_str, DataType op_data_type,
-                             bool has_avx512f, uint32 input_number) {
-    // Check that input tensor is in MKL format.
-    const Tensor& input_tensor = MklGetInput(context, input_number);
-    MklShape input_shape;
-    GetMklShape(context, input_number, &input_shape);
-
-    // if input is already in Tf format, then just copy input tensor to output.
-    if (!input_shape.IsMklTensor()) {
-      context->set_output(input_number, input_tensor);
-      VLOG(1) << "MKLToTFConversion: No conversion needed, "
-              << "copying input to output";
-      return;
-    }
-
-    // Check that input data type is same as operator data type and that it is
-    // same as output data type.
-    DataType input_data_type = op_kernel->input_type(input_number);
-    DataType output_data_type = op_kernel->output_type(input_number);
-    CHECK_EQ(op_data_type, input_data_type);
-    CHECK_EQ(op_data_type, output_data_type);
-
-    TensorShape output_shape;
-    size_t ndims = input_shape.GetDimension();
-    size_t* in_sizes = new size_t[ndims];
-    for (size_t i = 0; i < ndims; i++) {
-      // Outermost to innermost dimension
-      output_shape.AddDim(input_shape.GetSizes()[input_shape.tf_dim_idx(i)]);
-      in_sizes[i] = input_shape.GetSizes()[i];
-    }
-
-    // Allocate output tensor.
-    Tensor* output_tensor = NULL;
-    OP_REQUIRES_OK(context, context->allocate_output(input_number, output_shape,
-                                                     &output_tensor));
-
-    dnnLayout_t output_layout =
-        static_cast<dnnLayout_t>(input_shape.GetTfLayout());
-    // Execute DNNConversion.
-    void* input_buffer =
-        static_cast<void*>(const_cast<T*>(input_tensor.flat<T>().data()));
-    delete[] in_sizes;
-    void* output_buffer =
-        static_cast<void*>(const_cast<T*>(output_tensor->flat<T>().data()));
-    input_shape.GetConvertedFlatData(output_layout, input_buffer,
-                                     output_buffer);
-    VLOG(1) << "MKLToTFConversion complete successfully.";
-  }
-#endif
 
  private:
   /// Data format of the operation
@@ -200,6 +141,7 @@ class MklToTfOp : public OpKernel {
                           MklToTfOp<CPUDevice, T>);
 
 TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+TF_CALL_QUANTIZED_TYPES(REGISTER_CPU);
 #undef REGISTER_CPU
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc
index 6bbe271c542f7b1e54a7f14286863f37df0e9674..edc71569a60eb89682b5eda203942fbe723401a3 100644
--- a/tensorflow/core/kernels/mkl_transpose_op.cc
+++ b/tensorflow/core/kernels/mkl_transpose_op.cc
@@ -25,12 +25,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/transpose_functor.h"
 #include "tensorflow/core/kernels/transpose_op.h"
 
-#ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
 #include "tensorflow/core/util/mkl_util.h"
 
 using mkldnn::stream;
-#endif
 
 namespace tensorflow {
 
@@ -106,7 +104,6 @@ static const char kMKLConjugateTranspose = 'C';
 
 #endif  // if !defined(INTEL_MKL_DNN_ONLY)
 
-#ifndef INTEL_MKL_ML_ONLY
 // MKL-DNN based Transpose implementation
 template <typename T>
 Status MKLTransposeND(OpKernelContext* ctx, const Tensor& in, Tensor* out,
@@ -144,7 +141,7 @@ Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor,
     out.SetUsrMem(in_dims, out_strides, out_tensor);
 
     std::vector<primitive> net;
-    net.push_back(in.CreateReorder(in.GetUsrMem(), out.GetUsrMem()));
+    net.push_back(FindOrCreateReorder<T>(in.GetUsrMem(), out.GetUsrMem()));
     stream(stream::kind::eager).submit(net).wait();
     return Status::OK();
   } catch (mkldnn::error& e) {
@@ -154,7 +151,6 @@ Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor,
     return errors::Aborted("Operation received an exception:", error_msg);
   }
 }
-#endif  // #ifndef INTEL_MKL_ML_ONLY
 
 }  // namespace
 
@@ -181,7 +177,6 @@ Status MklTransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
   }
 #endif
 
-#ifndef INTEL_MKL_ML_ONLY
   // MKL-DNN has limit on the maximum number of dimensions in a tensor.
   // Fallback to Eigen for not supported cases.
   if (in.dims() <= TENSOR_MAX_DIMS) {
@@ -194,7 +189,6 @@ Status MklTransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in,
         break;
     }
   }
-#endif
 
   // Fallback to eigen if transpose parameters not supported by MKL or MKL-DNN
   typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -227,7 +221,6 @@ Status MklConjugateTransposeCpuOp::DoTranspose(OpKernelContext* ctx,
   }
 #endif
 
-#ifndef INTEL_MKL_ML_ONLY
   // MKL-DNN has limit on the maximum number of dimensions in a tensor.
   // Fallback to Eigen for not supported cases.
   if (in.dims() <= TENSOR_MAX_DIMS) {
@@ -240,7 +233,6 @@ Status MklConjugateTransposeCpuOp::DoTranspose(OpKernelContext* ctx,
         break;
     }
   }
-#endif
 
   // Fallback to eigen if transpose parameters not supported by MKL or MKL-DNN
   typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/core/kernels/nccl_ops.cc b/tensorflow/core/kernels/nccl_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6fdeb224781ed6dbf2cdf63c82037eb04a722cc6
--- /dev/null
+++ b/tensorflow/core/kernels/nccl_ops.cc
@@ -0,0 +1,246 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include <vector>
+
+#include "third_party/nccl/nccl.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/nccl/nccl_manager.h"
+
+namespace tensorflow {
+namespace {
+
+// Base class for all communicator ops that use nccl.
+//
+// About memory management and stream syncing:
+// 1. The nccl communicator has a stream for each rank.
+// 2. For input tensors to the communicator, the compute stream is passed to the
+//    NcclManager which will do a needed
+//    communicator_stream.ThenWaitFor(input_tensor_stream).
+// 3. The done_callback of the async kernel is not called by the
+//    NcclManager until after the communicator kernel is complete. This
+//    is enough to a) keep the input tensor data valid for the lifetime of the
+//    collective; and b) ensure the data in the output tensor is available
+//    when the async op kernel's done callback is called.
+class NcclAsyncOpBase : public AsyncOpKernel {
+ public:
+  explicit NcclAsyncOpBase(OpKernelConstruction* c) : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("num_devices", &num_devices_));
+    OP_REQUIRES_OK(c, c->GetAttr("shared_name", &collective_prefix_));
+  }
+
+  string GetCollectiveKey(OpKernelContext* c) {
+    return strings::StrCat(collective_prefix_, ";", c->step_id(), ";",
+                           c->frame_iter().frame_id, ":",
+                           c->frame_iter().iter_id);
+  }
+
+  int num_devices() const { return num_devices_; }
+
+ private:
+  int num_devices_;
+  string collective_prefix_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(NcclAsyncOpBase);
+};
+
+class NcclReduceOpBase : public NcclAsyncOpBase {
+ public:
+  explicit NcclReduceOpBase(OpKernelConstruction* c) : NcclAsyncOpBase(c) {
+    string reduction;
+    OP_REQUIRES_OK(c, c->GetAttr("reduction", &reduction));
+    if (reduction == "min") {
+      reduction_op_ = ncclMin;
+    } else if (reduction == "max") {
+      reduction_op_ = ncclMax;
+    } else if (reduction == "sum") {
+      reduction_op_ = ncclSum;
+    } else if (reduction == "prod") {
+      reduction_op_ = ncclProd;
+    } else {
+      OP_REQUIRES_OK(c,
+                     errors::InvalidArgument("Invalid reduction: ", reduction));
+    }
+  }
+
+  ncclRedOp_t reduction_op() const { return reduction_op_; }
+
+ private:
+  ncclRedOp_t reduction_op_;
+};
+
+// To execute a single all-reduce, this kernel is called once for each of the
+// <k> devices in the communicator.
+class NcclAllReduceOpKernel : public NcclReduceOpBase {
+ public:
+  explicit NcclAllReduceOpKernel(OpKernelConstruction* c)
+      : NcclReduceOpBase(c) {}
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    const Tensor* in_t = &c->input(0);
+    Tensor* out_t;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, in_t->shape(), &out_t), done);
+
+    auto actual_done = [c, done](Status s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+
+    auto* compute_stream = c->op_device_context()->stream();
+    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
+    NcclManager::instance()->AddToAllReduce(
+        num_devices(), GetCollectiveKey(c), reduction_op(),
+        compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
+        compute_stream, in_t, out_t, std::move(actual_done));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("NcclAllReduce").Device(DEVICE_GPU),
+                        NcclAllReduceOpKernel);
+
+// To execute a single reduce, this kernel is called once for all but one of the
+// <k> devices in the communicator, and NcclReduceRecvKernel is called once for
+// the remaining device.
+class NcclReduceSendKernel : public NcclReduceOpBase {
+ public:
+  explicit NcclReduceSendKernel(OpKernelConstruction* c)
+      : NcclReduceOpBase(c) {}
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    auto actual_done = [c, done](Status s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+
+    auto* compute_stream = c->op_device_context()->stream();
+    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
+    NcclManager::instance()->AddReduceSend(
+        num_devices(), GetCollectiveKey(c), reduction_op(),
+        compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
+        compute_stream, &c->input(0), std::move(actual_done));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("_NcclReduceSend").Device(DEVICE_GPU),
+                        NcclReduceSendKernel);
+
+// To execute a single reduce, this kernel is called once for one devices, and
+// NcclReduceSendKernel is called for all other <k-1> devices in the
+// communicator.
+class NcclReduceRecvKernel : public NcclReduceOpBase {
+ public:
+  explicit NcclReduceRecvKernel(OpKernelConstruction* c)
+      : NcclReduceOpBase(c) {}
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    const Tensor& in_t = c->input(0);
+    Tensor* out_t;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, in_t.shape(), &out_t), done);
+
+    auto actual_done = [c, done](Status s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+
+    auto* compute_stream = c->op_device_context()->stream();
+    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
+    NcclManager::instance()->AddReduceRecv(
+        num_devices(), GetCollectiveKey(c), reduction_op(),
+        compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
+        compute_stream, &in_t, out_t, std::move(actual_done));
+  }
+
+ private:
+  ncclRedOp_t reduction_op_;
+};
+REGISTER_KERNEL_BUILDER(Name("_NcclReduceRecv").Device(DEVICE_GPU),
+                        NcclReduceRecvKernel);
+
+// To execute a single broadcast, this kernel is called once for one device, and
+// NcclBroadcastRecvKernel is called for all other <k-1> devices in the
+// communicator.
+class NcclBroadcastSendKernel : public NcclAsyncOpBase {
+ public:
+  explicit NcclBroadcastSendKernel(OpKernelConstruction* c)
+      : NcclAsyncOpBase(c) {}
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    auto actual_done = [c, done](Status s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+
+    auto* compute_stream = c->op_device_context()->stream();
+    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
+    NcclManager::instance()->AddBroadcastSend(
+        num_devices(), GetCollectiveKey(c), compute_stream->parent(),
+        gpu_info->gpu_id, gpu_info->event_mgr, compute_stream, &c->input(0),
+        std::move(actual_done));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("_NcclBroadcastSend").Device(DEVICE_GPU),
+                        NcclBroadcastSendKernel);
+
+// To execute a single broadcast, this kernel is called once for all but one of
+// the <k> devices in the communicator, and NcclBroadcastSendKernel is called
+// once for the remaining device.
+class NcclBroadcastRecvKernel : public NcclAsyncOpBase {
+ public:
+  explicit NcclBroadcastRecvKernel(OpKernelConstruction* c)
+      : NcclAsyncOpBase(c) {}
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    const Tensor& shape_t = c->input(0);
+    TensorShape shape;
+    OP_REQUIRES_OK_ASYNC(
+        c, TensorShapeUtils::MakeShape(shape_t.vec<int32>(), &shape), done);
+    Tensor* out_t;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape, &out_t), done);
+
+    auto actual_done = [c, done](Status s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+
+    auto* compute_stream = c->op_device_context()->stream();
+    auto* gpu_info = c->device()->tensorflow_gpu_device_info();
+    NcclManager::instance()->AddBroadcastRecv(
+        num_devices(), GetCollectiveKey(c), compute_stream->parent(),
+        gpu_info->gpu_id, gpu_info->event_mgr, compute_stream, out_t,
+        std::move(actual_done));
+  }
+};
+REGISTER_KERNEL_BUILDER(
+    Name("_NcclBroadcastRecv").Device(DEVICE_GPU).HostMemory("shape"),
+    NcclBroadcastRecvKernel);
+
+// Define stub kernels for the ops that get replaced post placement.
+class NcclStubKernel : public AsyncOpKernel {
+ public:
+  explicit NcclStubKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {}
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    c->SetStatus(errors::Unimplemented(
+        "This op should be replaced during graph optimization."));
+    done();
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("NcclBroadcast").Device(DEVICE_GPU),
+                        NcclStubKernel);
+REGISTER_KERNEL_BUILDER(Name("NcclReduce").Device(DEVICE_GPU), NcclStubKernel);
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 81ce6d6e9533bd4b119becef215445bc6254854c..37f615abd97044caa7703837714840b8d451d420 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -163,7 +163,8 @@ void DoNonMaxSuppressionOp(
     // therefore we iterate through the previously selected boxes backwards
     // in order to see if `next_candidate` should be suppressed.
     bool should_select = true;
-    for (int j = selected.size() - 1; j >= 0; --j) {
+
+    for (int j = static_cast<int>(selected.size()) - 1; j >= 0; --j) {
       if (suppress_check_fn(next_candidate.box_index, selected[j])) {
         should_select = false;
         break;
@@ -277,29 +278,30 @@ class NonMaxSuppressionV2Op : public OpKernel {
   }
 };
 
-class NonMaxSuppressionV3V4Base : public OpKernel {
+template <typename Device, typename T>
+class NonMaxSuppressionV3Op : public OpKernel {
  public:
-  explicit NonMaxSuppressionV3V4Base(OpKernelConstruction* context)
+  explicit NonMaxSuppressionV3Op(OpKernelConstruction* context)
       : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
     // boxes: [num_boxes, 4]
-    boxes_ = context->input(0);
+    const Tensor& boxes = context->input(0);
     // scores: [num_boxes]
-    scores_ = context->input(1);
+    const Tensor& scores = context->input(1);
     // max_output_size: scalar
-    max_output_size_ = context->input(2);
+    const Tensor& max_output_size = context->input(2);
     OP_REQUIRES(
-        context, TensorShapeUtils::IsScalar(max_output_size_.shape()),
+        context, TensorShapeUtils::IsScalar(max_output_size.shape()),
         errors::InvalidArgument("max_output_size must be 0-D, got shape ",
-                                max_output_size_.shape().DebugString()));
+                                max_output_size.shape().DebugString()));
     // iou_threshold: scalar
     const Tensor& iou_threshold = context->input(3);
     OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
                 errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
                                         iou_threshold.shape().DebugString()));
-    iou_threshold_val_ = iou_threshold.scalar<float>()();
-    OP_REQUIRES(context, iou_threshold_val_ >= 0 && iou_threshold_val_ <= 1,
+    const float iou_threshold_val = iou_threshold.scalar<float>()();
+    OP_REQUIRES(context, iou_threshold_val >= 0 && iou_threshold_val <= 1,
                 errors::InvalidArgument("iou_threshold must be in [0, 1]"));
     // score_threshold: scalar
     const Tensor& score_threshold = context->input(4);
@@ -307,62 +309,72 @@ class NonMaxSuppressionV3V4Base : public OpKernel {
         context, TensorShapeUtils::IsScalar(score_threshold.shape()),
         errors::InvalidArgument("score_threshold must be 0-D, got shape ",
                                 score_threshold.shape().DebugString()));
-    score_threshold_val_ = score_threshold.scalar<float>()();
+    const float score_threshold_val = score_threshold.scalar<float>()();
 
-    num_boxes_ = 0;
-    ParseAndCheckBoxSizes(context, boxes_, &num_boxes_);
-    CheckScoreSizes(context, num_boxes_, scores_);
+    int num_boxes = 0;
+    ParseAndCheckBoxSizes(context, boxes, &num_boxes);
+    CheckScoreSizes(context, num_boxes, scores);
     if (!context->status().ok()) {
       return;
     }
 
-    DoComputeAndPostProcess(context);
-  }
-
- protected:
-  virtual void DoComputeAndPostProcess(OpKernelContext* context) = 0;
-
-  Tensor boxes_;
-  Tensor scores_;
-  Tensor max_output_size_;
-  int num_boxes_;
-  float iou_threshold_val_;
-  float score_threshold_val_;
-};
-
-template <typename Device, typename T>
-class NonMaxSuppressionV3Op : public NonMaxSuppressionV3V4Base {
- public:
-  explicit NonMaxSuppressionV3Op(OpKernelConstruction* context)
-      : NonMaxSuppressionV3V4Base(context) {}
-
- protected:
-  void DoComputeAndPostProcess(OpKernelContext* context) override {
     auto suppress_check_fn =
-        CreateIOUSuppressCheckFn<T>(boxes_, iou_threshold_val_);
+        CreateIOUSuppressCheckFn<T>(boxes, iou_threshold_val);
 
-    DoNonMaxSuppressionOp<T>(context, scores_, num_boxes_, max_output_size_,
-                             score_threshold_val_, suppress_check_fn);
+    DoNonMaxSuppressionOp<T>(context, scores, num_boxes, max_output_size,
+                             score_threshold_val, suppress_check_fn);
   }
 };
 
 template <typename Device, typename T>
-class NonMaxSuppressionV4Op : public NonMaxSuppressionV3V4Base {
+class NonMaxSuppressionV4Op : public OpKernel {
  public:
   explicit NonMaxSuppressionV4Op(OpKernelConstruction* context)
-      : NonMaxSuppressionV3V4Base(context) {
+      : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("pad_to_max_output_size",
                                              &pad_to_max_output_size_));
   }
 
- protected:
-  void DoComputeAndPostProcess(OpKernelContext* context) override {
+  void Compute(OpKernelContext* context) override {
+    // boxes: [num_boxes, 4]
+    const Tensor& boxes = context->input(0);
+    // scores: [num_boxes]
+    const Tensor& scores = context->input(1);
+    // max_output_size: scalar
+    const Tensor& max_output_size = context->input(2);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(max_output_size.shape()),
+        errors::InvalidArgument("max_output_size must be 0-D, got shape ",
+                                max_output_size.shape().DebugString()));
+    // iou_threshold: scalar
+    const Tensor& iou_threshold = context->input(3);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
+    const float iou_threshold_val = iou_threshold.scalar<float>()();
+    OP_REQUIRES(context, iou_threshold_val >= 0 && iou_threshold_val <= 1,
+                errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+    // score_threshold: scalar
+    const Tensor& score_threshold = context->input(4);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(score_threshold.shape()),
+        errors::InvalidArgument("score_threshold must be 0-D, got shape ",
+                                score_threshold.shape().DebugString()));
+    const float score_threshold_val = score_threshold.scalar<float>()();
+
+    int num_boxes = 0;
+    ParseAndCheckBoxSizes(context, boxes, &num_boxes);
+    CheckScoreSizes(context, num_boxes, scores);
+    if (!context->status().ok()) {
+      return;
+    }
+
     auto suppress_check_fn =
-        CreateIOUSuppressCheckFn<T>(boxes_, iou_threshold_val_);
+        CreateIOUSuppressCheckFn<T>(boxes, iou_threshold_val);
     int num_valid_outputs;
 
-    DoNonMaxSuppressionOp<T>(context, scores_, num_boxes_, max_output_size_,
-                             score_threshold_val_, suppress_check_fn,
+    DoNonMaxSuppressionOp<T>(context, scores, num_boxes, max_output_size,
+                             score_threshold_val, suppress_check_fn,
                              pad_to_max_output_size_, &num_valid_outputs);
 
     // Allocate scalar output tensor for number of indices computed.
diff --git a/tensorflow/core/kernels/one_hot_op.cc b/tensorflow/core/kernels/one_hot_op.cc
index c66a812cd9531a2950eacf900a4bd6a9f99b8dc7..c3385091a0dad1dcd154f3d28a1f0eebaaa755c7 100644
--- a/tensorflow/core/kernels/one_hot_op.cc
+++ b/tensorflow/core/kernels/one_hot_op.cc
@@ -159,6 +159,7 @@ namespace functor {
   DECLARE_GPU_SPEC_INDEX(T, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+TF_CALL_bool(DECLARE_GPU_SPEC);
 TF_CALL_int32(DECLARE_GPU_SPEC);
 TF_CALL_int64(DECLARE_GPU_SPEC);
 
@@ -182,6 +183,7 @@ TF_CALL_int64(DECLARE_GPU_SPEC);
   REGISTER_ONE_HOT_GPU_INDEX(type, int64);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_ONE_HOT_GPU);
+TF_CALL_bool(REGISTER_ONE_HOT_GPU);
 TF_CALL_int32(REGISTER_ONE_HOT_GPU);
 TF_CALL_int64(REGISTER_ONE_HOT_GPU);
 
diff --git a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
index 647515ae38ab5530b69fa135257584eea531d46c..b7a6da61de1ebe7769930126ea9a89b8f7ce81d5 100644
--- a/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/one_hot_op_gpu.cu.cc
@@ -37,6 +37,7 @@ typedef Eigen::GpuDevice GPUDevice;
   DEFINE_GPU_SPEC_INDEX(T, int64)
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
+TF_CALL_bool(DEFINE_GPU_SPEC);
 TF_CALL_int32(DEFINE_GPU_SPEC);
 TF_CALL_int64(DEFINE_GPU_SPEC);
 
diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h
index a496487d1b81892a1a8c563769cfc78531c70c06..842dd7984b34effa1d015af00f4a5caa7e90b3ef 100644
--- a/tensorflow/core/kernels/ops_util.h
+++ b/tensorflow/core/kernels/ops_util.h
@@ -16,101 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_OPS_UTIL_H_
 #define TENSORFLOW_CORE_KERNELS_OPS_UTIL_H_
 
-// This file contains utilities for various operations.
-
-#include <array>
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/util/padding.h"
-
-namespace tensorflow {
-
-// Calculates broadcast starting index and size.  For SAME padding, addition
-// padding could be applied to right, left, top and bottom.  Depending on the
-// current index, input size, kernel size, stride, padding size, the starting
-// index and size for broadcast for that dimension are different from the
-// current index and kernel size.
-// This is mainly used by gradient algorithms for pooling operations.
-Status GetBroadcastSize(const int index, const int in_size, const int ksize,
-                        const int stride, const int pad_size, int* bindex,
-                        int* bsize);
-
-// Converts Brain's Padding to Eigen's PaddingType.
-Eigen::PaddingType BrainPadding2EigenPadding(Padding padding);
-
-// Given a shape 's' of a tensor of type T. Returns true iff the
-// number of bytes occupied by each dim 0 (i.e., &tensor(i + 1, ...) -
-// &tensor(i, ...)) is multiple of EIGEN_MAX_ALIGN_BYTES.
-template <typename T>
-bool IsInnerDimsSizeAligned(const TensorShape& s) {
-  if (s.dims() == 0) return false;
-  const int64 dim0_size = s.dim_size(0);
-  if (dim0_size == 0) return false;
-#if EIGEN_MAX_ALIGN_BYTES == 0
-  return true;
-#else
-  const int64 bytes_per_dim0 = (s.num_elements() / dim0_size) * sizeof(T);
-  return bytes_per_dim0 % EIGEN_MAX_ALIGN_BYTES == 0;
-#endif
-}
-
-// Given a shape 's' of a tensor of type T and the `start` and `end` index of a
-// dim 0 slice, returns true iff slice is aligned with respect to original
-// tensor. Here aligned implies the address is a multiple of
-// EIGEN_MAX_ALIGN_BYTES.
-template <typename T>
-bool IsDim0SliceAligned(const TensorShape& s, int64 start, int64 end_or_size) {
-  if (s.dims() == 1) {
-#if EIGEN_MAX_ALIGN_BYTES == 0
-    return true;
-#else
-    bool start_aligned = (start * sizeof(T)) % EIGEN_MAX_ALIGN_BYTES == 0;
-    // End is aligned if either the explicit end index is passed and is a
-    // a multiple of EIGEN_MAX_ALIGN_BYTES, or the start index is aligned and
-    // the size is aligned. So for convenience we can either pass start and
-    // index, or start and size.
-    bool end_aligned = (end_or_size * sizeof(T)) % EIGEN_MAX_ALIGN_BYTES == 0;
-    return start_aligned && end_aligned;
-#endif
-  } else {
-    return IsInnerDimsSizeAligned<T>(s);
-  }
-}
-
-// Returns <suffix> sanitized to have only [a-zA-Z0-9-_].
-string SanitizeThreadSuffix(string suffix);
-
-// Helper to compute 'strides' given a tensor 'shape'. I.e.,
-// strides[i] = prod(shape.dim_size[(i+1):])
-template <typename T>
-gtl::InlinedVector<T, 8> ComputeStride(const TensorShape& shape) {
-  const int ndims = shape.dims();
-  gtl::InlinedVector<T, 8> strides(ndims);
-  T stride = 1;
-  for (int i = ndims - 1; i >= 0; --i) {
-    strides[i] = stride;
-    stride *= static_cast<T>(shape.dim_size(i));
-  }
-  return strides;
-}
-
-// Helper to compute 'strides' given an Eigen TensorDimensions
-template <typename T, typename EigenDimensions>
-gtl::InlinedVector<T, 8> ComputeEigenStrides(const EigenDimensions& shape) {
-  const int ndims = shape.rank();
-  gtl::InlinedVector<T, 8> strides(ndims);
-  T stride = 1;
-  for (int i = ndims - 1; i >= 0; --i) {
-    strides[i] = stride;
-    stride *= static_cast<T>(shape[i]);
-  }
-  return strides;
-}
-
-}  // namespace tensorflow
+// Placeholder for the ops_util library that is moved under core/framework.
+#include "tensorflow/core/framework/ops_util.h"
 
 #endif  // TENSORFLOW_CORE_KERNELS_OPS_UTIL_H_
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index aa70ee06f5305dd92210693471390e1ba4ed8a9e..4fe0668364a6cd28de478fa9bc9db1fed2d06f55 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -45,10 +45,10 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace functor {
 using random::PhiloxRandom;
 
+static constexpr int kMaxIterations = 1000;
+
 template <typename T>
 struct TruncatedNormalFunctor<CPUDevice, T> {
-  static const int kMaxIterations = 1000;
-
   void operator()(OpKernelContext* ctx, const CPUDevice& d, int64 num_batches,
                   int64 samples_per_batch, int64 num_elements,
                   typename TTypes<T>::ConstFlat means,
@@ -57,11 +57,20 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
                   typename TTypes<T>::ConstFlat maxvals,
                   const random::PhiloxRandom& gen,
                   typename TTypes<T>::Flat output) {
+    // The randn rejection sampling is used when the mean and at least this many
+    // standard deviations are inside the bounds.
+    // The uniform proposal samplers become less efficient as the bounds are
+    // further from the mean, the reverse is true for the randn sampler.
+    // This number was chosen by empirical benchmarking. If modified, the
+    // benchmarks in parameterized_truncated_normal_op_test should also be
+    // changed.
+    const T kStdDevsInsideBoundsToUseRandnSampler = T(1.3);
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
 
     auto DoWork = [samples_per_batch, num_elements, &ctx, &means, &stddevs,
-                   &minvals, &maxvals, &gen,
-                   &output](int start_batch, int limit_batch) {
+                   &minvals, &maxvals, &gen, &output,
+                   kStdDevsInsideBoundsToUseRandnSampler](int start_batch,
+                                                          int limit_batch) {
       // Capturing "gen" by-value would only make a copy for the _shared_
       // lambda.  Since we want to let each worker have its own copy, we pass
       // "gen" by reference and explicitly do a copy assignment here.
@@ -73,6 +82,8 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
                     4);
       typedef random::UniformDistribution<random::PhiloxRandom, T> Uniform;
       Uniform dist;
+      typedef random::NormalDistribution<random::PhiloxRandom, T> Normal;
+      Normal normal_dist;
 
       // Vectorized intermediate calculations for uniform rejection sampling.
       // We always generate at most 4 samples.
@@ -125,7 +136,52 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
             (normMin + sqrtFactor);
         const T diff = normMax - normMin;
 
-        if (diff < cutoff) {
+        if (((normMin < -kStdDevsInsideBoundsToUseRandnSampler) &&
+             (normMax >= T(0.))) ||
+            ((normMax > kStdDevsInsideBoundsToUseRandnSampler) &&
+             (normMin <= T(0.)))) {
+          // If the bounds are a least 3 standard deviations from the mean
+          // on at least one side then we rejection sample by sampling
+          // from the normal distribution and rejecting samples outside
+          // the bounds.
+          // Under this condition the acceptance rate per iteration should
+          // always be ~ 50%. This sampler is more efficient (and more
+          // numerically stable when one or both bounds is far from the mean).
+
+          while (sample < limit_sample) {
+            const auto randn_sample = normal_dist(&gen_copy);
+            const int size = randn_sample.size();
+
+            for (int i = 0; i < size; i++) {
+              if ((randn_sample[i] >= normMin) &&
+                  (randn_sample[i] <= normMax)) {
+                output(sample) = randn_sample[i] * stddev + mean;
+                sample++;
+                if (sample >= limit_sample) {
+                  break;
+                }
+                numIterations = 0;
+              } else {
+                numIterations++;
+                if (numIterations > kMaxIterations) {
+                  // This should never occur because this sampler should
+                  // (by the selection criteria above) be used if at least 3
+                  // standard deviations of one side of the distribution
+                  // is within the limits (so acceptance probability per
+                  // iterations >~ 1/2 per iteration).
+                  LOG(ERROR) << "TruncatedNormal randn rejection sampler "
+                             << "exceeded maximum iterations for "
+                             << "normMin=" << normMin << " normMax=" << normMax
+                             << " kMaxIterations=" << kMaxIterations;
+                  ctx->SetStatus(errors::Internal(
+                      "TruncatedNormal randn rejection sampler failed to accept"
+                      " a sample."));
+                  return;
+                }
+              }
+            }
+          }
+        } else if (diff < cutoff) {
           // Sample from a uniform distribution on [normMin, normMax].
 
           const T plusFactor = (normMin < T(0)) ? T(0) : normMin * normMin;
@@ -154,9 +210,13 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
                 // the proposal distribution is the uniform distribution
                 // U(lower_bound, upper_bound).
                 if (!accept) {
-                  LOG(WARNING) << "TruncatedNormal uniform rejection sampler "
-                               << "exceeded max iterations. Sample may contain "
-                               << "outliers.";
+                  LOG(ERROR) << "TruncatedNormal uniform rejection sampler "
+                             << "exceeded max iterations. Sample may contain "
+                             << "outliers.";
+                  ctx->SetStatus(errors::Internal(
+                      "TruncatedNormal uniform rejection sampler failed to "
+                      " accept a sample."));
+                  return;
                 }
                 output(sample) = z[i] * stddev + mean;
                 sample++;
@@ -190,9 +250,13 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
               auto accept = (u <= g && z < normMax);
               if (accept || numIterations + 1 >= kMaxIterations) {
                 if (!accept) {
-                  LOG(WARNING) << "TruncatedNormal exponential distribution "
-                               << "rejection sampler exceeds max iterations. "
-                               << "Sample may contain outliers.";
+                  LOG(ERROR) << "TruncatedNormal exponential distribution "
+                             << "rejection sampler exceeds max iterations. "
+                             << "Sample may contain outliers.";
+                  ctx->SetStatus(errors::Internal(
+                      "TruncatedNormal exponential distribution rejection"
+                      " sampler failed to accept a sample."));
+                  return;
                 }
                 output(sample) = z * stddev + mean;
                 sample++;
@@ -357,9 +421,9 @@ class ParameterizedTruncatedNormalOp : public OpKernel {
 
     auto truncFunctor = functor::TruncatedNormalFunctor<Device, T>();
     // Each worker has the fudge factor for samples_per_batch, so use it here.
-    random::PhiloxRandom rng = generator_.ReserveSamples128(
-        num_batches * 2 * truncFunctor.kMaxIterations *
-        (samples_per_batch + 3) / 4);
+    random::PhiloxRandom rng =
+        generator_.ReserveSamples128(num_batches * 2 * functor::kMaxIterations *
+                                     (samples_per_batch + 3) / 4);
     truncFunctor(ctx, ctx->eigen_device<Device>(), num_batches,
                  samples_per_batch, num_elements, means_tensor.flat<T>(),
                  stddevs_tensor.flat<T>(), minvals_tensor.flat<T>(),
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.h b/tensorflow/core/kernels/parameterized_truncated_normal_op.h
index 2e54db31fe40625dbc884757ac368d94db5d8c7a..c919a22c7b0ce93dc76ba2a8be09aa02ca4e0179 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.h
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.h
@@ -27,11 +27,11 @@ namespace functor {
 
 // Sample a truncated normal random variable, with mean, stddev, minval, and
 // maxval parameters for each batch. Uses two rejection sampling algorithms
-// described in http://rd.springer.com/article/10.1007/BF00143942.
+// described in http://rd.springer.com/article/10.1007/BF00143942 and a randn
+// rejection sampler when most of the normal is inside the bounds.
 //
 // Either minval may be -infinity, or maxval may be +infinity. If the interval
-// (minval, maxval) is empty, the result is NaN. Large intervals which include
-// both tails may have reduced accuracy.
+// (minval, maxval) is empty, the result is NaN.
 template <typename Device, typename T>
 struct TruncatedNormalFunctor {
   void operator()(OpKernelContext* ctx, const Device& d, int64 num_batches,
@@ -42,8 +42,6 @@ struct TruncatedNormalFunctor {
                   typename TTypes<T>::ConstFlat maxvals,
                   const random::PhiloxRandom& gen,
                   typename TTypes<T>::Flat output);
-
-  static const int kMaxIterations = 100;
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
index 5b80a962bc492b21847703f6e970d6c0bd1d3e74..45ae45e4f4fc33da5d4b19d79b76e489209a667c 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
@@ -44,9 +44,12 @@ class OpKernelContext;
 
 namespace functor {
 
+static constexpr int kMaxIterations = 1000;
+
 typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T>
+
 __global__ void __launch_bounds__(1024)
     TruncatedNormalKernel(random::PhiloxRandom gen, T* data, int64 num_batches,
                           int64 samples_per_batch, int64 num_elements,
@@ -59,9 +62,19 @@ __global__ void __launch_bounds__(1024)
   const int32 initial_offset = blockIdx.x * blockDim.x + threadIdx.x;
   gen.Skip(max_samples_per_item * initial_offset);
   typedef random::UniformDistribution<random::PhiloxRandom, T> Uniform;
+  typedef random::NormalDistribution<random::PhiloxRandom, T> Normal;
   Uniform dist;
+  Normal normal_dist;
   const int kDistSize = Uniform::kResultElementCount;
   const T quietNaN = Eigen::NumTraits<T>::quiet_NaN();
+  // The randn rejection sampling is used when the mean and at least this many
+  // standard deviations are inside the bounds.
+  // The uniform proposal samplers become less efficient as the bounds are
+  // further from the mean, the reverse is true for the randn sampler.
+  // This number was chosen by empirical benchmarking. If modified, the
+  // benchmarks in parameterized_truncated_normal_op_test should also be
+  // changed.
+  const T kStdDevsInsideBoundsToUseRandnSampler = T(1.7);
 
   // We skip the total number of threads to get to the next element. To produce
   // deterministic results between devices, each element in the output array
@@ -114,6 +127,32 @@ __global__ void __launch_bounds__(1024)
           (Eigen::numext::isfinite(normMin) ||
            Eigen::numext::isfinite(normMax)))) {
       data[offset] = quietNaN;
+    } else if (((normMin < -kStdDevsInsideBoundsToUseRandnSampler) &&
+                (normMax >= T(0.))) ||
+               ((normMax > kStdDevsInsideBoundsToUseRandnSampler) &&
+                (normMin <= T(0.)))) {
+      Eigen::array<T, 4> n;
+
+      int numIterations = 0;
+      while (numIterations < kMaxIterations) {
+        const auto randn = normal_dist(&gen);
+        remaining_samples -= gen.kResultElementCount;
+        UNROLL for (int i = 0; i < kDistSize; i++) {
+          if ((randn[i] >= normMin) && randn[i] <= normMax) {
+            data[offset] = randn[i] * stddev + mean;
+            numIterations = kMaxIterations;
+            break;
+          } else if (numIterations + 1 == kMaxIterations) {
+            // If we did not successfully sample after all these iterations
+            // something is wrong. Output a nan.
+            data[offset] = quietNaN;
+            numIterations = kMaxIterations;
+            break;
+          } else {
+            numIterations++;
+          }
+        }
+      }
     } else if (diff < cutoff) {
       // Sample from a uniform distribution on [normMin, normMax].
 
@@ -138,19 +177,17 @@ __global__ void __launch_bounds__(1024)
         const auto u = dist(&gen);
         remaining_samples -= gen.kResultElementCount;
         UNROLL for (int i = 0; i < kDistSize; i++) {
-          if (u[i] <= Eigen::numext::exp(g[i]) ||
-              numIterations + 1 >= kMaxIterations) {
+          bool accept = u[i] <= Eigen::numext::exp(g[i]);
+          if (accept) {
             // Accept the sample z.
-            // If we run out of iterations, just use the current uniform
-            // sample. Emperically, the probability of accepting each sample
-            // is at least 50% for typical inputs, so we will always accept
-            // by 100 iterations.
-            // This introduces a slight inaccuracy when at least one bound
-            // is large, minval is negative and maxval is positive.
             data[offset] = z[i] * stddev + mean;
             // Break out of the nested loop by updating numIterations.
             numIterations = kMaxIterations;
             break;
+          } else if (numIterations + 1 >= kMaxIterations) {
+            data[offset] = quietNaN;
+            numIterations = kMaxIterations;
+            break;
           } else {
             numIterations++;
           }
@@ -171,11 +208,16 @@ __global__ void __launch_bounds__(1024)
           const T x = normMin < alpha ? alpha - z : normMin - alpha;
           const T g = Eigen::numext::exp(-x * x / two);
           const T u = rand[i + 1];
-          if ((u <= g && z < normMax) || numIterations + 1 >= kMaxIterations) {
+          bool accept = (u <= g && z < normMax);
+          if (accept) {
             data[offset] = z * stddev + mean;
             // Break out of the nested loop by updating numIterations.
             numIterations = kMaxIterations;
             break;
+          } else if (numIterations + 1 >= kMaxIterations) {
+            data[offset] = quietNaN;
+            numIterations = kMaxIterations;
+            break;
           } else {
             numIterations++;
           }
@@ -190,8 +232,6 @@ __global__ void __launch_bounds__(1024)
 // Partial specialization for GPU
 template <typename T>
 struct TruncatedNormalFunctor<GPUDevice, T> {
-  static const int kMaxIterations = 1000;
-
   void operator()(OpKernelContext* ctx, const GPUDevice& d, int64 num_batches,
                   int64 samples_per_batch, int64 num_elements,
                   typename TTypes<T>::ConstFlat means,
@@ -209,7 +249,7 @@ struct TruncatedNormalFunctor<GPUDevice, T> {
             stddevs.dimension(0) == 1, minvals.data(),
             minvals.dimension(0) == 1, maxvals.data(),
             maxvals.dimension(0) == 1, kMaxIterations);
-  };
+  }
 };
 
 // Explicit instantiation of the GPU distributions functors
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index 3979e4b53a6ddd23cf5440d0b230a5c69e365139..ba51db219ec5528d1dd98f744e70c5cd2cf6c6f8 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -24,6 +24,12 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_partition.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/reffed_status_callback.h"
 
@@ -35,7 +41,6 @@ namespace tensorflow {
 typedef FunctionLibraryRuntime::Handle FHandle;
 
 namespace {
-
 // A `PartitionedCallOp` asynchronously executes a function, potentially across
 // multiple devices but within a single process. The kernel places and
 // partitions a given function's underlying graph, and executes each of the
@@ -46,6 +51,30 @@ class PartitionedCallOp : public AsyncOpKernel {
  public:
   explicit PartitionedCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
+    string deprecated_config_serialized;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &deprecated_config_serialized));
+    string config_proto_serialized;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config_proto", &config_proto_serialized));
+    OP_REQUIRES(
+        ctx,
+        deprecated_config_serialized.empty() || config_proto_serialized.empty(),
+        errors::InvalidArgument("Provided both 'config' and 'config_proto' but "
+                                "only one should be provided.  Note the "
+                                "'config' option is deprecated."));
+    if (!deprecated_config_serialized.empty()) {
+      OP_REQUIRES(ctx,
+                  config_proto_.mutable_graph_options()
+                      ->mutable_rewrite_options()
+                      ->ParseFromString(deprecated_config_serialized),
+                  errors::InvalidArgument("Unable to parse config string as "
+                                          "tensorflow::RewriteOptions proto."));
+    } else {
+      OP_REQUIRES(
+          ctx, config_proto_.ParseFromString(config_proto_serialized),
+          errors::InvalidArgument("Unable to parse config_proto string as "
+                                  "tensorflow::ConfigProto proto."));
+    }
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("executor_type", &executor_type_));
   }
 
   ~PartitionedCallOp() override {}
@@ -76,12 +105,6 @@ class PartitionedCallOp : public AsyncOpKernel {
     {
       mutex_lock l(mu_);
       if (function_handles_.find(lib) == function_handles_.end()) {
-        if (local_device_name_.empty()) {
-          // The full local device name isn't known at kernel construction
-          // time, hence the need to set it here.
-          local_device_name_ = lib->device()->name();
-        }
-
         // TODO(b/37549631): Because this kernel may correspond to a stateful
         // op, it may be shared by multiple subgraphs, which in turn may have
         // different `FunctionLibraryRuntime` objects and therefore different
@@ -109,8 +132,7 @@ class PartitionedCallOp : public AsyncOpKernel {
         // by name.
         auto graph = tensorflow::MakeUnique<Graph>(fbody->graph->flib_def());
         FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-        TF_CHECK_OK(
-                    graph.get()->AddFunctionLibrary(global_flib.ToProto()));
+        TF_CHECK_OK(graph->AddFunctionLibrary(global_flib.ToProto()));
         CopyGraph(*fbody->graph, graph.get());
         OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
 
@@ -127,8 +149,11 @@ class PartitionedCallOp : public AsyncOpKernel {
                              "find cached function partitions; "
                              "this indicates a bug."),
             done);
-        FunctionLibraryDefinition* overlay_lib =
-            new FunctionLibraryDefinition(*lib->GetFunctionLibraryDefinition());
+        // We do not need a full function library in the overlay, we just keep a
+        // subset that is reachable from the instantiated function.
+        FunctionLibraryDefinition* overlay_lib = new FunctionLibraryDefinition(
+            grappler::ReachableFunctionLibraryDefinition(
+                *lib->GetFunctionLibraryDefinition(), fbody->fdef));
         overlay_libs_.emplace(lib, overlay_lib);
 
         GraphOptimizationPassOptions optimization_options;
@@ -145,13 +170,32 @@ class PartitionedCallOp : public AsyncOpKernel {
             OptimizationPassRegistry::Global()->RunGrouping(
                 OptimizationPassRegistry::PRE_PLACEMENT, optimization_options),
             done);
-        Placer placer(graph.get(), &device_set);
+
+        // Make the FunctionLibraryRuntime's device the default device if
+        // nothing else is hard coded. This allows the same function definition
+        // to be specialized to different devices depending on the
+        // PartitionedCallOp's device.
+        Placer placer(graph.get(), &device_set,
+                      nullptr, /* No session options */
+                      lib->device() /* Default device */);
         OP_REQUIRES_OK_ASYNC(ctx, placer.Run(), done);
         OP_REQUIRES_OK_ASYNC(
             ctx,
             OptimizationPassRegistry::Global()->RunGrouping(
                 OptimizationPassRegistry::POST_PLACEMENT, optimization_options),
             done);
+
+        Device* cpu_device;
+        OP_REQUIRES_OK_ASYNC(
+            ctx, lib->device_mgr()->LookupDevice("CPU:0", &cpu_device), done);
+
+        // Run grappler passes on the graph. It is possible that these are
+        // optimized by the graph executor already.
+        OP_REQUIRES_OK_ASYNC(ctx,
+                             OptimizeGraph(ctx, fbody->ret_nodes, overlay_lib,
+                                           device_set, cpu_device, &graph),
+                             done);
+
         OP_REQUIRES_OK_ASYNC(
             ctx,
             OptimizationPassRegistry::Global()->RunGrouping(
@@ -163,6 +207,13 @@ class PartitionedCallOp : public AsyncOpKernel {
         OP_REQUIRES_OK_ASYNC(
             ctx, PartitionHelper(device_set, std::move(graph), &subgraphs),
             done);
+        if (ctx->graph_collector() != nullptr) {
+          for (const auto& pair : subgraphs) {
+            GraphDef def;
+            pair.second->ToGraphDef(&def);
+            ctx->graph_collector()->CollectGraph(def);
+          }
+        }
         optimization_options.graph = nullptr;
         optimization_options.device_set = nullptr;
         optimization_options.partition_graphs = &subgraphs;
@@ -186,6 +237,7 @@ class PartitionedCallOp : public AsyncOpKernel {
               ctx, GraphToFunctionDef(*subgraph, unique_name, &shard), done);
           OP_REQUIRES_OK_ASYNC(ctx, overlay_lib->AddFunctionDef(shard), done);
           FunctionLibraryRuntime::InstantiateOptions opts;
+          opts.executor_type = executor_type_;
           opts.target = target;
           opts.overlay_lib = overlay_lib;
           FHandle handle;
@@ -222,6 +274,12 @@ class PartitionedCallOp : public AsyncOpKernel {
         int index = attr_value->i();
         TF_RETURN_IF_ERROR(node->attrs().Find("T", &attr_value));
         DataType dtype = attr_value->type();
+        if (dtype != args[index].dtype()) {
+          return errors::InvalidArgument("For argument ", index, " expected ",
+                                         DataTypeString(dtype), " tensor, got ",
+                                         DataTypeString(args[index].dtype()),
+                                         " instead.");
+        }
         if (dtype == DT_RESOURCE) {
           const ResourceHandle& handle = args[index].flat<ResourceHandle>()(0);
           node->set_assigned_device_name(handle.device());
@@ -266,8 +324,7 @@ class PartitionedCallOp : public AsyncOpKernel {
     for (const auto& partition : partitions) {
       std::unique_ptr<Graph> subgraph(new Graph(graph->flib_def()));
       FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-      TF_CHECK_OK(
-                subgraph.get()->AddFunctionLibrary(global_flib.ToProto()));
+      TF_CHECK_OK(subgraph->AddFunctionLibrary(global_flib.ToProto()));
       GraphConstructorOptions opts;
       opts.allow_internal_ops = true;
       opts.expect_device_spec = true;
@@ -317,14 +374,6 @@ class PartitionedCallOp : public AsyncOpKernel {
       }
     }
 
-    // Rewrite the indices of the Arg and Retval nodes for this function
-    // to range from 0 to the number of Arg nodes, Retval nodes, respectively.
-    auto sort_by_index = [](std::pair<Node*, int> one,
-                            std::pair<Node*, int> two) -> bool {
-      return one.second < two.second;
-    };
-    std::sort(arg_nodes.begin(), arg_nodes.end(), sort_by_index);
-    std::sort(ret_nodes.begin(), ret_nodes.end(), sort_by_index);
     for (int i = 0; i < arg_nodes.size(); ++i) {
       Node* arg = arg_nodes[i].first;
       arg->AddAttr("index", i);
@@ -382,6 +431,7 @@ class PartitionedCallOp : public AsyncOpKernel {
       return;
     }
 
+    const string& local_device_name = lib->device()->name();
     FunctionLibraryRuntime::Options opts;
     opts.step_id = ctx->step_id();
     opts.step_container = ctx->step_container();
@@ -390,7 +440,7 @@ class PartitionedCallOp : public AsyncOpKernel {
     // TODO(akshayka): Consider selecting a runner on a per-device basis, i.e.,
     // using device-specific threadpools when available.
     opts.runner = ctx->runner();
-    opts.source_device = local_device_name_;
+    opts.source_device = local_device_name;
     opts.allow_dead_tensors = true;
     // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
     // constructed rendezvous to a rendezvous manager.
@@ -404,7 +454,7 @@ class PartitionedCallOp : public AsyncOpKernel {
         },
         rendez, std::move(done), std::placeholders::_1);
     auto* refcounted_done = new ReffedStatusCallback(std::move(callback));
-    for (int i = 1; i < handles->size(); ++i) {
+    for (int i = 0; i < handles->size(); ++i) {
       refcounted_done->Ref();
     }
 
@@ -418,7 +468,7 @@ class PartitionedCallOp : public AsyncOpKernel {
       const std::vector<int>& ret_indices = indices.second;
       opts.args_alloc_attrs = alloc_attrs.first;
       opts.rets_alloc_attrs = alloc_attrs.second;
-      if (target == local_device_name_) {
+      if (target == local_device_name) {
         opts.remote_execution = false;
         std::vector<Tensor> args = GetArgsForIndices(arg_indices, op_args);
         std::vector<Tensor>* rets = new std::vector<Tensor>;
@@ -458,6 +508,7 @@ class PartitionedCallOp : public AsyncOpKernel {
             });
       }
     }
+    refcounted_done->Unref();
   }
 
   string UniquifyFunctionName(const FunctionLibraryDefinition* function_library,
@@ -470,11 +521,81 @@ class PartitionedCallOp : public AsyncOpKernel {
     }
   }
 
+  Status OptimizeGraph(OpKernelContext* ctx,
+                       const gtl::InlinedVector<Node*, 4>& ret_nodes,
+                       FunctionLibraryDefinition* flib,
+                       const DeviceSet& device_set, Device* cpu_device,
+                       std::unique_ptr<Graph>* graph) {
+    if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto_)) {
+      return Status::OK();
+    }
+
+    tensorflow::grappler::GrapplerItem item;
+
+    // Add all available devices so that inlined function can be placed.
+    for (const Device* d : device_set.devices()) {
+      Status added_device = item.AddDevice(d->name());
+      if (!added_device.ok()) VLOG(3) << added_device.error_message();
+    }
+
+    // Add fetches so that the graph can be pruned.
+    for (Node* node : ret_nodes) {
+      item.fetch.push_back(node->name());
+    }
+
+    (*graph)->ToGraphDef(&item.graph);
+
+    if (flib) {
+      *item.graph.mutable_library() = flib->ToProto();
+    }
+
+    tensorflow::GraphDef out_graph;
+
+    tensorflow::grappler::VirtualCluster cluster(&device_set);
+
+    // TODO(nareshmodi): Consider adding and using the more generic GraphOptions
+    // proto (which also contain the OptimizerOptions).
+    TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
+        item, config_proto_, cpu_device, &cluster, &out_graph));
+
+    std::unique_ptr<Graph> optimized_graph(new Graph(OpRegistry::Global()));
+    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(
+        GraphConstructorOptions(), out_graph, optimized_graph.get()));
+
+    // Copy optimized functions back to the overlay lib.
+    if (flib) {
+      for (const FunctionDef& fdef : out_graph.library().function()) {
+        const string& func_name = fdef.signature().name();
+        if (flib->Contains(func_name)) {
+          TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, fdef));
+        } else {
+          TF_RETURN_IF_ERROR(flib->AddFunctionDef(fdef));
+        }
+      }
+    }
+
+    *graph = std::move(optimized_graph);
+
+    // The graph conversion sets the requested device names but not the
+    // assigned device names. However, since at this point the graph is
+    // placed TF expects an assigned device name for every node. Therefore
+    // we copy the requested device into the assigned device field.
+    for (Node* node : graph->get()->nodes()) {
+      node->set_assigned_device_name(node->requested_device());
+    }
+
+    return Status::OK();
+  }
+
   NameAttrList func_;
-  string local_device_name_;
+  ConfigProto config_proto_;
+  string executor_type_;
   // Contains maps from device names to handles of function partitions, keyed by
   // FunctionLibraryRuntime pointers. (Because this kernel may be instantiated
-  // for a stateful op, different invocations of it may use different FLRs.)
+  // for a stateful op, different invocations of it may use different
+  // FLRs. Different device placements of PartitionedCallOp also use different
+  // FLRs, and we use this to set the "default" device for the function to
+  // PartitionedCallOp's device.)
   gtl::FlatMap<FunctionLibraryRuntime*,
                std::unique_ptr<gtl::FlatMap<string, FHandle>>>
       function_handles_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.cc b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
index dadc15b69ee67b51be1647a1e8a6794e684bcff2..f13341e0afe2a605122c77f0d0833d8119ac28d9 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
@@ -49,6 +49,21 @@ class QuantizeAndDequantizeV2Op : public OpKernel {
                 errors::InvalidArgument("num_bits is out of range: ", num_bits_,
                                         " with signed_input_ ", signed_input_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_));
+
+    string round_mode_string;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string));
+    OP_REQUIRES(
+        ctx,
+        (round_mode_string == "HALF_UP" || round_mode_string == "HALF_TO_EVEN"),
+        errors::InvalidArgument("Round mode string must be "
+                                "'HALF_UP' or "
+                                "'HALF_TO_EVEN', is '" +
+                                round_mode_string + "'"));
+    if (round_mode_string == "HALF_UP") {
+      round_mode_ = ROUND_HALF_UP;
+    } else if (round_mode_string == "HALF_TO_EVEN") {
+      round_mode_ = ROUND_HALF_TO_EVEN;
+    }
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -76,13 +91,15 @@ class QuantizeAndDequantizeV2Op : public OpKernel {
 
     functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> f;
     f(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_, num_bits_,
-      range_given_, &input_min_tensor, &input_max_tensor, output->flat<T>());
+      range_given_, &input_min_tensor, &input_max_tensor, round_mode_,
+      output->flat<T>());
   }
 
  private:
   bool signed_input_;
   int num_bits_;
   bool range_given_;
+  QuantizerRoundMode round_mode_;
 };
 
 // Simulate quantization precision loss in a float tensor by:
@@ -135,7 +152,8 @@ class QuantizeAndDequantizeV3Op : public OpKernel {
 
     functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> f;
     f(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_, num_bits_val,
-      range_given_, &input_min_tensor, &input_max_tensor, output->flat<T>());
+      range_given_, &input_min_tensor, &input_max_tensor, ROUND_HALF_TO_EVEN,
+      output->flat<T>());
   }
 
  private:
@@ -180,7 +198,7 @@ class QuantizeAndDequantizeOp : public OpKernel {
     functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> functor;
     functor(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_,
             num_bits_, range_given_, &input_min_tensor, &input_max_tensor,
-            output->flat<T>());
+            ROUND_HALF_TO_EVEN, output->flat<T>());
   }
 
  private:
@@ -198,10 +216,11 @@ struct QuantizeAndDequantizeOneScaleFunctor<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::ConstVec input,
                   const bool signed_input, const int num_bits,
                   const bool range_given, Tensor* input_min_tensor,
-                  Tensor* input_max_tensor, typename TTypes<T>::Vec out) {
+                  Tensor* input_max_tensor, QuantizerRoundMode round_mode,
+                  typename TTypes<T>::Vec out) {
     QuantizeAndDequantizeOneScaleImpl<CPUDevice, T>::Compute(
         d, input, signed_input, num_bits, range_given, input_min_tensor,
-        input_max_tensor, out);
+        input_max_tensor, round_mode, out);
   }
 };
 }  // namespace functor
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h
index 6b0c5e5a466baf60a771d7aa7754975a0c121138..a495e8b71fec285f8649979553bcae7d400cd3d3 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.h
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h
@@ -22,6 +22,20 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops.h"
 
 namespace tensorflow {
+
+enum QuantizerRoundMode {
+  // Round half up: if the fraction of y is exactly 0.5, then
+  // round(y) = y + 0.5
+  // E.g., -5.5 gets rounded to -5, -5.4 goes to -5,
+  // 5.4 goes to 5, and 5.5 goes to 6.
+  ROUND_HALF_UP,
+  // Round half to even: if the fraction of y is exactly 0.5, then round(y) is
+  // the nearest even integer to y.
+  // E.g., 23.5 gets rounded to 24, 24.5 gets rounded to 24, while -23.5 becomes
+  // -24, and -24.5 gets rounded to 24.
+  ROUND_HALF_TO_EVEN,
+};
+
 namespace functor {
 
 // TODO(pauldonnelly): 'signed_input' should really be called 'signed_output'.
@@ -31,15 +45,69 @@ struct QuantizeAndDequantizeOneScaleFunctor {
   void operator()(const Device& d, typename TTypes<T>::ConstVec input,
                   bool signed_input, int num_bits, bool range_given,
                   Tensor* input_min_tensor, Tensor* input_max_tensor,
-                  typename TTypes<T>::Vec out);
+                  QuantizerRoundMode round_mode, typename TTypes<T>::Vec out);
 };
 
+// The implementation below runs on both CPU and GPU.
+template <typename Device, typename T, typename Func>
+void ClampScaleAndRound(const Device& d, typename TTypes<T>::ConstVec input,
+                        T min_range, T max_range, T scale, T inverse_scale,
+                        Func round_func, typename TTypes<T>::Vec out) {
+  out.device(d) = (input.cwiseMin(max_range).cwiseMax(min_range) * scale)
+                      .unaryExpr(round_func) *
+                  inverse_scale;
+}
+
+// The implementation below runs on both CPU and GPU.
+template <typename Device, typename T>
+void ClampScaleAndRound(const Device& d, typename TTypes<T>::ConstVec input,
+                        T min_range, T max_range, T scale, T inverse_scale,
+                        QuantizerRoundMode round_mode,
+                        typename TTypes<T>::Vec out) {
+  switch (round_mode) {
+    case ROUND_HALF_TO_EVEN:
+      ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale,
+                         Eigen::internal::scalar_round_op_google<T>(), out);
+      break;
+    case ROUND_HALF_UP:
+      ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale,
+                         Eigen::internal::scalar_round_up_op<T>(), out);
+      break;
+  }
+}
+
+// The implementation below runs on both CPU and GPU.
+template <typename Device, typename T, typename Func>
+void ScaleAndRound(const Device& d, typename TTypes<T>::ConstVec input, T scale,
+                   T inverse_scale, Func round_func,
+                   typename TTypes<T>::Vec out) {
+  out.device(d) = (input * scale).unaryExpr(round_func) * inverse_scale;
+}
+
+// The implementation below runs on both CPU and GPU.
+template <typename Device, typename T>
+void ScaleAndRound(const Device& d, typename TTypes<T>::ConstVec input, T scale,
+                   T inverse_scale, QuantizerRoundMode round_mode,
+                   typename TTypes<T>::Vec out) {
+  switch (round_mode) {
+    case ROUND_HALF_TO_EVEN:
+      ScaleAndRound(d, input, scale, inverse_scale,
+                    Eigen::internal::scalar_round_op_google<T>(), out);
+      break;
+    case ROUND_HALF_UP:
+      ScaleAndRound(d, input, scale, inverse_scale,
+                    Eigen::internal::scalar_round_up_op<T>(), out);
+      break;
+  }
+}
+
 // The implementation below runs on both CPU and GPU.
 template <typename Device, typename T>
 struct QuantizeAndDequantizeOneScaleImpl {
   static void Compute(const Device& d, typename TTypes<T>::ConstVec input,
                       bool signed_input, int num_bits, bool range_given,
                       Tensor* input_min_tensor, Tensor* input_max_tensor,
+                      QuantizerRoundMode round_mode,
                       typename TTypes<T>::Vec out) {
     T min_range;
     T max_range;
@@ -89,15 +157,10 @@ struct QuantizeAndDequantizeOneScaleImpl {
       // The semantics of the op does not guarantee to clamp to the specified
       // min_range and max_range - because we may have changed either min_range
       // or max_range.
-      out.device(d) =
-          (input.cwiseMin(max_range).cwiseMax(min_range) * scale)
-              .unaryExpr(Eigen::internal::scalar_round_op_google<T>()) *
-          inverse_scale;
+      ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale,
+                         round_mode, out);
     } else {
-      out.device(d) =
-          (input * scale)
-              .unaryExpr(Eigen::internal::scalar_round_op_google<T>()) *
-          inverse_scale;
+      ScaleAndRound(d, input, scale, inverse_scale, round_mode, out);
     }
   }
 };
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc
index 61c79cf6959ce5a20ce1a4ddd1bd4fae103a15d6..5745e418f3614b7ff3a786c4e6ac4c0f40308a25 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc
@@ -32,10 +32,10 @@ struct QuantizeAndDequantizeOneScaleFunctor<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::ConstVec input,
                   bool signed_input, int num_bits, bool range_given,
                   Tensor* input_min_tensor, Tensor* input_max_tensor,
-                  typename TTypes<T>::Vec out) {
+                  QuantizerRoundMode round_mode, typename TTypes<T>::Vec out) {
     QuantizeAndDequantizeOneScaleImpl<GPUDevice, T>::Compute(
         d, input, signed_input, num_bits, range_given, input_min_tensor,
-        input_max_tensor, out);
+        input_max_tensor, round_mode, out);
   }
 };
 }  // end namespace functor
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index cddabf8a99aca4a17de78c0ed8e7888e6959be6e..b9e015c96b5cd1edc2c349f1a38fdd074124230e 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -101,17 +101,51 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8) {
           .Attr("range_given", false)
           .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
-  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555});
+  AddInputFromArray<float>(TensorShape({7}),
+                           {-1, -0.5, 0, 0.3, 0.8, 0.555, 0.50390625});
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
-  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
+  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71, 64}.
   // Scale is: 1/127
-  // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128}
+  // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128, 0.5}
   TF_ASSERT_OK(RunOpKernel());
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(&expected,
-                          {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({7}));
+  test::FillValues<float>(
+      &expected, {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128, 0.5});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+
+  // Ensure that the inputs haven't been changed.
+  EXPECT_EQ(inputs_[1]->scalar<float>()(), 0.0);
+  EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
+}
+
+// Convert a 1D tensor with signed 8 bits and round_mode half_up.
+TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8_round_half_up) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV2")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Attr("signed_input", true)
+          .Attr("num_bits", 8)
+          .Attr("range_given", false)
+          .Attr("round_mode", "HALF_UP")
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({7}),
+                           {-1, -0.5, 0, 0.3, 0.8, 0.555, 0.50390625});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
+
+  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71, 65}.
+  // Scale is: 1/127
+  // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128,
+  // 65.0 /128}
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({7}));
+  test::FillValues<float>(&expected, {-1, -0.5, 0, 38.0 / 128, 102.0 / 128,
+                                      71.0 / 128, 65.0 / 128});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
@@ -162,7 +196,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4) {
           .Attr("range_given", false)
           .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
-  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555});
+  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3125, 0.8, 0.555});
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
@@ -178,6 +212,35 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4) {
   EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
 }
 
+// Convert a 1D tensor with signed 4 bits and round_mode hafl_up.
+TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_round_half_up) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV2")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Attr("signed_input", true)
+          .Attr("num_bits", 4)
+          .Attr("range_given", false)
+          .Attr("round_mode", "HALF_UP")
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3125, 0.8, 0.555});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
+
+  // With int4, the tensor is quantized to {-8, -4, 0, 3, 6, 4}.
+  // Scale is: 1/8
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
+  test::FillValues<float>(&expected, {-1, -0.5, 0, 0.375, 0.75, 0.5});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+
+  // Ensure that the inputs haven't been changed.
+  EXPECT_EQ(inputs_[1]->scalar<float>()(), 0.0);
+  EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
+}
+
 // Convert a 1D tensor with signed 4 bits.
 TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_V3) {
   TF_ASSERT_OK(
@@ -237,6 +300,38 @@ TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given) {
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 }
 
+// Convert a 2D tensor with signed 8 bits, given range and round_mode half_up.
+TEST_F(QuantizeAndDequantizeTest,
+       Convert_2D_tensor_with_int8_range_given_round_half_up) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV2")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Attr("signed_input", true)
+          .Attr("num_bits", 8)
+          .Attr("range_given", true)
+          .Attr("round_mode", "HALF_UP")
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  // Note that the last two values are saturated.
+  AddInputFromArray<float>(TensorShape({2, 4}),
+                           {-0.8, -0.5, 0, 0.3, 0.8, 0.555, -2, 33});
+  AddInputFromArray<float>(TensorShape({}), {-1.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {1.0});   // Max
+
+  // Note that the range is given as [-1, 1].
+  // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -128,
+  // 127}.
+  // Scale is: 1/127
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 4}));
+  test::FillValues<float>(
+      &expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127,
+                  70.0 / 127, -128.0 / 127, 1});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+}
+
 // Convert a 2D tensor with signed 8 bits with given range.
 TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given_V3) {
   TF_ASSERT_OK(
@@ -293,6 +388,33 @@ TEST_F(QuantizeAndDequantizeTest, Convert_4D_tensor_with_uint8_range_given) {
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 }
 
+// Convert a 4D tensor with unsigned 8 bits, given range and round_mode half_up.
+TEST_F(QuantizeAndDequantizeTest,
+       Convert_4D_tensor_with_uint8_range_given_round_half_up) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV2")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Attr("signed_input", false)
+          .Attr("num_bits", 8)
+          .Attr("range_given", true)
+          .Attr("round_mode", "HALF_UP")
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {1.0});  // Max
+
+  // Note that the range is given as [0, 1].
+  // With int8, the tensor is quantized to {0, 0, 77, 204}
+  // Scale is: 1/255
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 1, 1}));
+  test::FillValues<float>(&expected, {0, 0, 77.0 / 255, 204.0 / 255});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+}
+
 // Convert a 4D tensor with unsigned 8 bits with given range.
 TEST_F(QuantizeAndDequantizeTest, Convert_4D_tensor_with_uint8_range_given_V3) {
   TF_ASSERT_OK(
diff --git a/tensorflow/core/kernels/quantized_add_op.cc b/tensorflow/core/kernels/quantized_add_op.cc
index 337c8e5c17863cc06fba5828605ba5db85b22c31..55c69de7d3ea6c728564abbdb374b3f1a1c5696c 100644
--- a/tensorflow/core/kernels/quantized_add_op.cc
+++ b/tensorflow/core/kernels/quantized_add_op.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/meta_support.h"
 #include "tensorflow/core/kernels/quantization_utils.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/bcast.h"
 
diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc
index 3c7536e037396c338663ce0136832acb87bef401..4e191f162662bb3246f6bc5572e255e0b7590228 100644
--- a/tensorflow/core/kernels/quantized_mul_op.cc
+++ b/tensorflow/core/kernels/quantized_mul_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/meta_support.h"
 #include "tensorflow/core/kernels/quantization_utils.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/bcast.h"
 
diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
index e6133415d0f5c143acad25ee6e681820e956cca8..6fc489459231695a685346e3f728dd0a1e2202f2 100644
--- a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
@@ -273,7 +273,7 @@ void TestResizeBilinearOneDim() {
         << expected_val << ", " << resized_image_val;
   }
 
-  // Value testing with reference implemenatation
+  // Value testing with reference implementation
   CheckTensorValue<qint32>(image_quantized_tensor.flat<qint32>().data(),
                            outputs.at(0).flat<qint32>().data(),
                            /*batch_size=*/1,
diff --git a/tensorflow/core/kernels/ragged_gather_op.cc b/tensorflow/core/kernels/ragged_gather_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..903a97a9601a9e8613c3189ef61ed9965c82d3d5
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_gather_op.cc
@@ -0,0 +1,294 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/util/util.h"
+
+namespace tensorflow {
+
+namespace {
+
+// For each slice in `(start, limit)` in `value_slices`, append
+// `params_dense_values_in[start:limit] to `values_out`.  `value_size` indicates
+// the number of scalars contained in each value params_dense_values_in[i].
+template <typename VALUE_TYPE>
+void WriteValueSlices(const Tensor& params_dense_values_in,
+                      const std::vector<std::pair<int64, int64>>& value_slices,
+                      int64 value_size, Tensor* values_out) {
+  const auto& params_dense_values =
+      params_dense_values_in.flat_outer_dims<VALUE_TYPE, 2>();
+  auto values = values_out->flat_outer_dims<VALUE_TYPE, 2>();
+  int out_pos = 0;
+  for (const auto& slice : value_slices) {
+    for (int i = slice.first; i < slice.second; ++i) {
+      for (int j = 0; j < value_size; ++j) {
+        values(out_pos, j) = params_dense_values(i, j);
+      }
+      ++out_pos;
+    }
+  }
+}
+
+}  // namespace
+
+template <typename INDEX_TYPE>
+class RaggedGatherOpBase : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* context) override {
+    // Get the input Tensors.
+    OpInputList params_nested_splits_in;
+    OP_REQUIRES_OK(context, context->input_list("params_nested_splits",
+                                                &params_nested_splits_in));
+    const Tensor& params_dense_values_in =
+        context->input(params_nested_splits_in.size());
+    const Tensor& indices_in =
+        context->input(params_nested_splits_in.size() + 1);
+
+    DCHECK_GT(params_nested_splits_in.size(), 0);  // Enforced by REGISTER_OP.
+    int64 num_params = params_nested_splits_in[0].dim_size(0) - 1;
+    OP_REQUIRES_OK(context, ValidateIndices(indices_in, num_params));
+
+    OP_REQUIRES(context, params_dense_values_in.dims() > 0,
+                errors::InvalidArgument("params.rank must be nonzero"));
+    int64 num_params_dense_values = params_dense_values_in.dim_size(0);
+
+    // Calculate the `splits`, and store the value slices that we need to
+    // copy in `value_slices`.
+    std::vector<std::pair<int64, int64>> value_slices;
+    int64 num_values = 0;
+    std::vector<std::vector<int64>> out_splits;
+    OP_REQUIRES_OK(context, MakeSplits(indices_in, params_nested_splits_in,
+                                       num_params_dense_values, &out_splits,
+                                       &value_slices, &num_values));
+
+    // Write the output tensors.
+    OP_REQUIRES_OK(context, WriteSplits(out_splits, context));
+    OP_REQUIRES_OK(context,
+                   WriteValues(params_dense_values_in, value_slices,
+                               out_splits.size(), num_values, context));
+  }
+
+ private:
+  // Check if any indices are out-of-bounds.
+  ::tensorflow::Status ValidateIndices(const Tensor& indices_in,
+                                       int64 num_params) {
+    const auto& indices = indices_in.flat<INDEX_TYPE>();
+    for (int64 i = 0; i < indices.size(); ++i) {
+      int64 index = indices(i);
+      if (index < 0 || index >= num_params) {
+        return errors::InvalidArgument(
+            "indices", SliceDebugString(indices_in.shape(), i), " = ", index,
+            " is not in [0, ", num_params, ")");
+      }
+    }
+    return ::tensorflow::Status::OK();
+  }
+
+  // Construct the `splits` output tensors, encoded using a nested vector.
+  // Also find the slices of values that need to be copied, and store them
+  // in `value_slices`.  The total number of values that will be copied (which
+  // we need for allocating the output values tensor) is stored in `num_values`.
+  ::tensorflow::Status MakeSplits(
+      const Tensor& indices_in, const OpInputList& params_nested_splits_in,
+      int64 num_params_dense_values,
+      std::vector<std::vector<int64>>* out_splits,
+      std::vector<std::pair<int64, int64>>* value_slices, int64* num_values) {
+    *num_values = 0;
+    value_slices->clear();
+
+    int num_splits = indices_in.dims() - 1 + params_nested_splits_in.size();
+    out_splits->assign(num_splits, {0});
+
+    // Get Eigen tensors.
+    const auto& indices = indices_in.flat<INDEX_TYPE>();
+    std::vector<TTypes<int64>::ConstFlat> params_nested_splits;
+    params_nested_splits.reserve(params_nested_splits_in.size());
+    for (const auto& splits_in : params_nested_splits_in) {
+      params_nested_splits.push_back(splits_in.flat<int64>());
+    }
+
+    TF_RETURN_IF_ERROR(
+        ValidateSplits(params_nested_splits, num_params_dense_values));
+
+    // Add `splits` that come from all but the last dimension of the dense
+    // Tensor `indices`.  In particular, for each dimension D, we add a
+    // splits tensor whose values are:
+    //   range(splits.shape[D]*splits.shape[D+1] + 1, step=splits.shape[D+1])
+    // E.g., if indices.shape=[5, 3] then we will add a splits tensor
+    // [0, 3, 6, 9, 12, 15], since the outermost dimension has 5 elements,
+    // each of which contains 3 values.
+    for (int dim = 0; dim < indices_in.dims() - 1; ++dim) {
+      int stride = indices_in.dim_size(dim + 1);
+      int index = stride;
+      for (int i = 0; i < indices_in.dim_size(dim); ++i) {
+        out_splits->at(dim).push_back(index);
+        index += stride;
+      }
+    }
+
+    // Add `splits` that come from `params_nested_splits`.  Starting with the
+    // outermost ragged dimension (i.e., the first `splits` tensor), we work
+    // our way in, finding the range of values that should be copied.  As we
+    // go, we update the output `splits` for each dimension with the appropriate
+    // values.  In particular, the *lengths* of the slices from `param_splits`
+    // should be copied to generate corresponding slice lengths in the output
+    // splits.  E.g., if we are copying a ragged row with length 4, then we
+    // should add a new split point to out_splits that is 4 greater than the
+    // previous split point in out_splits.
+    for (int i = 0; i < indices.size(); ++i) {
+      int start = indices(i);
+      int limit = indices(i) + 1;
+
+      // Copy splits.
+      for (int dim = 0; dim < params_nested_splits.size(); ++dim) {
+        const auto& splits = params_nested_splits[dim];
+        int out_dim = dim + indices_in.dims() - 1;
+        if (out_dim >= 0) {
+          int64 delta = out_splits->at(out_dim).back() - splits(start);
+          for (int j = start; j < limit; ++j) {
+            out_splits->at(out_dim).push_back(splits(j + 1) + delta);
+          }
+        }
+        start = splits(start);
+        limit = splits(limit);
+      }
+      if (limit != start) {
+        value_slices->emplace_back(start, limit);
+        *num_values += limit - start;
+      }
+    }
+    return ::tensorflow::Status::OK();
+  }
+
+  ::tensorflow::Status ValidateSplits(
+      const std::vector<TTypes<int64>::ConstFlat>& params_nested_splits,
+      int64 num_params_dense_values) {
+    // Validate
+    for (int dim = 0; dim < params_nested_splits.size(); ++dim) {
+      const auto& splits = params_nested_splits[dim];
+      int64 last_split = (dim == params_nested_splits.size() - 1)
+                             ? num_params_dense_values
+                             : params_nested_splits[dim + 1].size();
+      if (splits.size() == 0) {
+        return errors::InvalidArgument("Ragged splits may not be empty");
+      }
+      if (splits(0) < 0) {
+        return errors::InvalidArgument("Ragged splits must be non-negative");
+      }
+      if (splits(splits.size() - 1) > last_split) {
+        return errors::InvalidArgument(
+            "Ragged splits must not point past values");
+      }
+      for (int i = 1; i < splits.size(); ++i) {
+        if (splits(i - 1) > splits(i)) {
+          return errors::InvalidArgument("Ragged splits must be sorted");
+        }
+      }
+    }
+    return ::tensorflow::Status::OK();
+  }
+
+  ::tensorflow::Status WriteSplits(
+      const std::vector<std::vector<int64>>& out_splits,
+      OpKernelContext* context) {
+    OpOutputList splits_out;
+    TF_RETURN_IF_ERROR(
+        context->output_list("output_nested_splits", &splits_out));
+    for (int i = 0; i < out_splits.size(); ++i) {
+      Tensor* splits;
+      int64 num_splits = out_splits[i].size();
+      TF_RETURN_IF_ERROR(
+          splits_out.allocate(i, TensorShape({num_splits}), &splits));
+      auto splits_flat = splits->flat<int64>();
+      std::copy_n(out_splits[i].data(), out_splits[i].size(),
+                  splits_flat.data());
+    }
+    return ::tensorflow::Status::OK();
+  }
+
+  ::tensorflow::Status WriteValues(
+      const Tensor& params_dense_values_in,
+      const std::vector<std::pair<int64, int64>>& value_slices,
+      int values_index, int64 num_values, OpKernelContext* context) const {
+    Tensor* values_out = nullptr;
+    TensorShape values_shape = params_dense_values_in.shape();
+    values_shape.set_dim(0, num_values);
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(values_index, values_shape, &values_out));
+    const int64 num_elements = params_dense_values_in.NumElements();
+    const int64 value_size =
+        num_elements == 0 ? 0
+                          : (num_elements / params_dense_values_in.dim_size(0));
+    CallWriteValueSlices(params_dense_values_in, value_slices, value_size,
+                         values_out);
+    return ::tensorflow::Status::OK();
+  }
+
+ protected:
+  // Call WriteValueSlices() using the appropriate VALUE_TYPE template
+  // parameter.  This pattern is used to reduce binary size.  In particular,
+  // this allows us to have two instantiations of this class (one for each
+  // index type), rather than 14 (one for each index type and value type),
+  // which cuts the binary size of this op from ~300k to <90k.
+  virtual void CallWriteValueSlices(
+      const Tensor& params_dense_values_in,
+      const std::vector<std::pair<int64, int64>>& value_slices,
+      int64 value_size, Tensor* values_out) const = 0;
+};
+
+template <typename INDEX_TYPE, typename VALUE_TYPE>
+class RaggedGatherOp : public RaggedGatherOpBase<INDEX_TYPE> {
+ public:
+  using RaggedGatherOpBase<INDEX_TYPE>::RaggedGatherOpBase;
+
+ private:
+  void CallWriteValueSlices(
+      const Tensor& params_dense_values_in,
+      const std::vector<std::pair<int64, int64>>& value_slices,
+      int64 value_size, Tensor* values_out) const override {
+    WriteValueSlices<VALUE_TYPE>(params_dense_values_in, value_slices,
+                                 value_size, values_out);
+  }
+};
+
+#define REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(index_type, value_type)   \
+  REGISTER_KERNEL_BUILDER(Name("RaggedGather")                        \
+                              .Device(DEVICE_CPU)                     \
+                              .TypeConstraint<index_type>("Tindices") \
+                              .TypeConstraint<value_type>("Tvalues"), \
+                          RaggedGatherOp<index_type, value_type>);
+#define REGISTER_CPU_KERNEL(value_type)                  \
+  REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(int32, value_type) \
+  REGISTER_CPU_KERNEL_WITH_INDEX_TYPE(int64, value_type)
+TF_CALL_POD_TYPES(REGISTER_CPU_KERNEL);
+TF_CALL_string(REGISTER_CPU_KERNEL);
+TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_KERNEL);
+TF_CALL_quint16(REGISTER_CPU_KERNEL);
+TF_CALL_qint16(REGISTER_CPU_KERNEL);
+TF_CALL_uint32(REGISTER_CPU_KERNEL);
+TF_CALL_uint64(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
+#undef REGISTER_CPU_KERNEL_WITH_INDEX_TYPE
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_gather_op_test.cc b/tensorflow/core/kernels/ragged_gather_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..47be788151e485d4888cf9dbd3e0816a63bafb44
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_gather_op_test.cc
@@ -0,0 +1,281 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class RaggedGatherOpTest : public ::tensorflow::OpsTestBase {
+ protected:
+  // Builds the tensorflow test graph for RaggedGather.
+  template <typename VALUE_TYPE, typename INDEX_TYPE>
+  void BuildRaggedGatherGraph(
+      const TensorShape& indices_shape, const std::vector<INDEX_TYPE>& indices,
+      const std::vector<std::vector<int64>>& params_nested_splits,
+      const TensorShape& params_dense_values_shape,
+      const gtl::ArraySlice<VALUE_TYPE> params_dense_values) {
+    const auto& value_dtype = DataTypeToEnum<VALUE_TYPE>::v();
+    const auto& index_dtype = DataTypeToEnum<INDEX_TYPE>::v();
+    int64 PARAMS_RAGGED_RANK = params_nested_splits.size();
+    int64 num_splits = PARAMS_RAGGED_RANK + indices_shape.dims() - 1;
+    TF_ASSERT_OK(
+        NodeDefBuilder("tested_op", "RaggedGather")
+            .Input(FakeInput(PARAMS_RAGGED_RANK))  // params_nested_splits
+            .Input(FakeInput(value_dtype))         // params_dense_values
+            .Input(FakeInput(index_dtype))         // indices
+            .Attr("PARAMS_RAGGED_RANK", PARAMS_RAGGED_RANK)
+            .Attr("OUTPUT_RAGGED_RANK", num_splits)
+            .Attr("Tvalues", value_dtype)
+            .Attr("Tindices", index_dtype)
+            .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+    for (const auto& splits : params_nested_splits) {
+      int64 splits_size = splits.size();
+      AddInputFromArray<int64>(TensorShape({splits_size}), splits);
+    }
+    AddInputFromArray<VALUE_TYPE>(params_dense_values_shape,
+                                  params_dense_values);
+    AddInputFromArray<INDEX_TYPE>(indices_shape, indices);
+  }
+};
+
+TEST_F(RaggedGatherOpTest, RaggedGather) {
+  // indices = [2, 1, 0, 3]
+  // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
+  // params.shape = [4, None]
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({4}),                     // indices.shape
+      {2, 1, 0, 3},                         // indices
+      {{0, 3, 3, 7, 9}},                    // params_nested_splits
+      TensorShape({9}),                     // params_dense_values.shape
+      {.1, .2, .3, .4, .5, .6, .7, .8, .9}  // params_dense_values
+  );
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[.4, .5, .6, .7], [.1, .2, .3], [], [.8, .9]]
+  test::ExpectTensorEqual<int64>(*GetOutput(0),
+                                 test::AsTensor<int64>({0, 4, 4, 7, 9}));
+  test::ExpectTensorNear<float>(
+      *GetOutput(1),
+      test::AsTensor<float>({.4, .5, .6, .7, .1, .2, .3, .8, .9}), 0.1);
+}
+
+TEST_F(RaggedGatherOpTest, RaggedGather_3DParams) {
+  // indices = [2, 1, 0, 2, 3]
+  // params = [[[]], [[.1, 2], [.3]], [], [[.4, .5], [.6, .7, .8]], [[.9]]]
+  // params.shape = [5, None, None]
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({5}),                             // indices.shape
+      {2, 1, 0, 2, 3},                              // indices
+      {{0, 1, 3, 3, 5, 6}, {0, 0, 2, 3, 5, 8, 9}},  // params_nested_splits
+      TensorShape({9}),                             // params_dense_values.shape
+      {.1, .2, .3, .4, .5, .6, .7, .8, .9}          // params_dense_values
+  );
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[], [[.1, 2], [.3]], [[]], [], [[.4, .5], [.6, .7, .8]]]
+  test::ExpectTensorEqual<int64>(*GetOutput(0),
+                                 test::AsTensor<int64>({0, 0, 2, 3, 3, 5}));
+  test::ExpectTensorEqual<int64>(*GetOutput(1),
+                                 test::AsTensor<int64>({0, 2, 3, 3, 5, 8}));
+  test::ExpectTensorNear<float>(
+      *GetOutput(2), test::AsTensor<float>({.1, .2, .3, .4, .5, .6, .7, .8}),
+      0.1);
+}
+
+TEST_F(RaggedGatherOpTest, RaggedGather_4DParams) {
+  // indices = [2, 1, 0, 2]
+  // params = [[[]], [[[1, 2], [3, 4], [5, 6]], [[7, 8]]], []]
+  // params.shape = [4, None, None, 2]
+  BuildRaggedGatherGraph<int32, int32>(
+      TensorShape({4}),              // indices.shape
+      {2, 1, 0, 2},                  // indices
+      {{0, 1, 3, 3}, {0, 0, 3, 4}},  // params_nested_splits
+      TensorShape({4, 2}),           // params_dense_values.shape
+      {1, 2, 3, 4, 5, 6, 7, 8}       // params_dense_values
+  );
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[],
+  //            [[[1, 2], [3, 4], [5, 6]], [[7, 8]]],
+  //            [[]],
+  //            []]
+  test::ExpectTensorEqual<int64>(*GetOutput(0),
+                                 test::AsTensor<int64>({0, 0, 2, 3, 3}));
+  test::ExpectTensorEqual<int64>(*GetOutput(1),
+                                 test::AsTensor<int64>({0, 3, 4, 4}));
+  test::ExpectTensorEqual<int32>(
+      *GetOutput(2),
+      test::AsTensor<int32>({1, 2, 3, 4, 5, 6, 7, 8}, TensorShape({4, 2})));
+}
+
+TEST_F(RaggedGatherOpTest, RaggedGather_2DIndices) {
+  // indices = [[2, 1], [0, 3]]
+  // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({2, 2}),                  // indices.shape
+      {2, 1, 0, 3},                         // indices
+      {{0, 3, 3, 7, 9}},                    // params_nested_splits
+      TensorShape({9}),                     // params_dense_values.shape
+      {.1, .2, .3, .4, .5, .6, .7, .8, .9}  // params_dense_values
+  );
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [ [ [.4, .5, .6, .7], [.1, .2, .3] ],
+  //             [ [],               [.8, .9]     ] ]
+  test::ExpectTensorEqual<int64>(*GetOutput(0),
+                                 test::AsTensor<int64>({0, 2, 4}));
+  test::ExpectTensorEqual<int64>(*GetOutput(1),
+                                 test::AsTensor<int64>({0, 4, 4, 7, 9}));
+  test::ExpectTensorNear<float>(
+      *GetOutput(2),
+      test::AsTensor<float>({.4, .5, .6, .7, .1, .2, .3, .8, .9}), 0.1);
+}
+
+TEST_F(RaggedGatherOpTest, RaggedGather_ScalarIndices) {
+  // indices = 2
+  // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({}),                      // indices.shape
+      {2},                                  // indices
+      {{0, 3, 3, 7, 9}},                    // params_nested_splits
+      TensorShape({9}),                     // params_dense_values.shape
+      {.1, .2, .3, .4, .5, .6, .7, .8, .9}  // params_dense_values
+  );
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [.4, .5, .6, .7]
+  test::ExpectTensorNear<float>(*GetOutput(0),
+                                test::AsTensor<float>({.4, .5, .6, .7}), 0.1);
+}
+
+TEST_F(RaggedGatherOpTest, RaggedGather_OutOfBounds) {
+  // indices = [2, 10]
+  // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({2}),                     // indices.shape
+      {2, 10},                              // indices
+      {{0, 3, 3, 7, 9}},                    // params_nested_splits
+      TensorShape({9}),                     // params_dense_values.shape
+      {.1, .2, .3, .4, .5, .6, .7, .8, .9}  // params_dense_values
+  );
+  EXPECT_EQ("indices[1] = 10 is not in [0, 4)", RunOpKernel().error_message());
+}
+
+TEST_F(RaggedGatherOpTest, InvalidSplitsNotSorted) {
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({2}),                     // indices.shape
+      {0, 2},                               // indices
+      {{0, 3, 5, 2, 9}},                    // params_nested_splits
+      TensorShape({9}),                     // params_dense_values.shape
+      {.1, .2, .3, .4, .5, .6, .7, .8, .9}  // params_dense_values
+  );
+  EXPECT_EQ("Ragged splits must be sorted", RunOpKernel().error_message());
+}
+
+TEST_F(RaggedGatherOpTest, InvalidSplitsNegative) {
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({2}),                     // indices.shape
+      {0, 2},                               // indices
+      {{-1, 3, 2, 7, 9}},                   // params_nested_splits
+      TensorShape({9}),                     // params_dense_values.shape
+      {.1, .2, .3, .4, .5, .6, .7, .8, .9}  // params_dense_values
+  );
+  EXPECT_EQ("Ragged splits must be non-negative",
+            RunOpKernel().error_message());
+}
+
+TEST_F(RaggedGatherOpTest, InvalidSplitsEmpty) {
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({0}),  // indices.shape
+      {},                // indices
+      {{}},              // params_nested_splits
+      TensorShape({0}),  // params_dense_values.shape
+      {}                 // params_dense_values
+  );
+  EXPECT_EQ("Ragged splits may not be empty", RunOpKernel().error_message());
+}
+
+TEST_F(RaggedGatherOpTest, InvalidSplitsTooBig) {
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({2}),                     // indices.shape
+      {0, 2},                               // indices
+      {{0, 20, 40, 80, 100}},               // params_nested_splits
+      TensorShape({9}),                     // params_dense_values.shape
+      {.1, .2, .3, .4, .5, .6, .7, .8, .9}  // params_dense_values
+  );
+  EXPECT_EQ("Ragged splits must not point past values",
+            RunOpKernel().error_message());
+}
+
+TEST_F(RaggedGatherOpTest, BadValuesShape) {
+  BuildRaggedGatherGraph<float, int32>(
+      TensorShape({0}),  // indices.shape
+      {},                // indices
+      {{0}},             // params_nested_splits
+      TensorShape({}),   // params_dense_values.shape
+      {.1}               // params_dense_values
+  );
+  EXPECT_EQ("params.rank must be nonzero", RunOpKernel().error_message());
+}
+
+TEST_F(RaggedGatherOpTest, ShapeFn) {
+  // RaggedGather(param_splits+, param_values, indices) -> [splits+, values]
+  ShapeInferenceTestOp op("RaggedGather");
+
+  (*op.node_def.mutable_attr())["PARAMS_RAGGED_RANK"].set_i(1);
+  (*op.node_def.mutable_attr())["OUTPUT_RAGGED_RANK"].set_i(1);
+  INFER_OK(op, "?;?;?", "[?];?");
+  INFER_OK(op, "[?];[?];[?]", "[?];[?]");
+  INFER_OK(op, "[?];[?,?,?];[?]", "[?];[?,d1_1,d1_2]");
+  INFER_OK(op, "[5];[10];[15]", "[?];[?]");
+  INFER_OK(op, "[5];[10,2];[15]", "[?];[?,d1_1]");
+  INFER_ERROR("Shape must be rank 1 but is rank 0", op, "[5];[];[]");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[1,2];[];[5]");
+
+  (*op.node_def.mutable_attr())["PARAMS_RAGGED_RANK"].set_i(2);
+  (*op.node_def.mutable_attr())["OUTPUT_RAGGED_RANK"].set_i(2);
+  INFER_OK(op, "?;?;?;?", "[?];[?];?");
+  INFER_OK(op, "[?];[?];[?];[?]", "[?];[?];[?]");
+  INFER_OK(op, "[?];[?];[?,?,?];[?]", "[?];[?];[?,d2_1,d2_2]");
+  INFER_OK(op, "[5];[10];[15];[20]", "[?];[?];[?]");
+
+  (*op.node_def.mutable_attr())["PARAMS_RAGGED_RANK"].set_i(1);
+  (*op.node_def.mutable_attr())["OUTPUT_RAGGED_RANK"].set_i(2);
+  INFER_OK(op, "?;?;?", "[?];[?];?");
+  INFER_OK(op, "[?];[?];[?,?]", "[?];[?];[?]");
+  INFER_OK(op, "[?];[?,?,?];[?,?]", "[?];[?];[?,d1_1,d1_2]");
+  INFER_OK(op, "[15];[20];[5,10]", "[?];[?];[?]");
+  INFER_OK(op, "[15];[20,2];[5,10]", "[?];[?];[?,d1_1]");
+
+  (*op.node_def.mutable_attr())["PARAMS_RAGGED_RANK"].set_i(1);
+  (*op.node_def.mutable_attr())["OUTPUT_RAGGED_RANK"].set_i(0);
+  INFER_OK(op, "[?];[?];[]", "[?]");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_range_op.cc b/tensorflow/core/kernels/ragged_range_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb7546c3974b0f5dcdf9a69f8d214674d856e7fa
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_range_op.cc
@@ -0,0 +1,127 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+
+using errors::InvalidArgument;
+
+template <typename T>
+class RaggedRangeOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& starts_in = context->input(0);
+    const Tensor& limits_in = context->input(1);
+    const Tensor& deltas_in = context->input(2);
+
+    // Check input tensor shapes.
+    OP_REQUIRES(context, starts_in.shape().dims() <= 1,
+                InvalidArgument("starts must be a scalar or vector"));
+    OP_REQUIRES(context, limits_in.shape().dims() <= 1,
+                InvalidArgument("limits must be a scalar or vector"));
+    OP_REQUIRES(context, deltas_in.shape().dims() <= 1,
+                InvalidArgument("deltas must be a scalar or vector"));
+
+    // Determine which tensors we need to broadcast.
+    bool broadcast_starts = starts_in.shape().dims() == 0;
+    bool broadcast_limits = limits_in.shape().dims() == 0;
+    bool broadcast_deltas = deltas_in.shape().dims() == 0;
+
+    // nrows (number of output rows) is the size of the non-broadcast inputs,
+    // or 1 if all inputs are scalars.
+    std::vector<int> in_sizes;
+    if (!broadcast_starts) in_sizes.push_back(starts_in.shape().dim_size(0));
+    if (!broadcast_limits) in_sizes.push_back(limits_in.shape().dim_size(0));
+    if (!broadcast_deltas) in_sizes.push_back(deltas_in.shape().dim_size(0));
+    for (int i = 1; i < in_sizes.size(); ++i) {
+      OP_REQUIRES(context, in_sizes[i] == in_sizes[i - 1],
+                  InvalidArgument("starts, limits, and deltas must have the "
+                                  "same shape"));
+    }
+    int64 nrows = in_sizes.empty() ? 1 : in_sizes[0];
+
+    const auto& starts = starts_in.flat<T>();
+    const auto& limits = limits_in.flat<T>();
+    const auto& deltas = deltas_in.flat<T>();
+
+    // Construct the rt_nested_splits tensor.
+    Tensor* rt_nested_splits_out = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({nrows + 1}),
+                                            &rt_nested_splits_out));
+    auto rt_nested_splits = rt_nested_splits_out->flat<int64>();
+    rt_nested_splits(0) = 0;
+    for (int row = 0; row < nrows; ++row) {
+      T start = broadcast_starts ? starts(0) : starts(row);
+      T limit = broadcast_limits ? limits(0) : limits(row);
+      T delta = broadcast_deltas ? deltas(0) : deltas(row);
+      OP_REQUIRES(context, delta != 0, InvalidArgument("Requires delta != 0"));
+      rt_nested_splits(row + 1) =
+          rt_nested_splits(row) + RangeSize(start, limit, delta);
+    }
+    int64 nvals = rt_nested_splits(nrows);
+
+    // Construct the rt_dense_values tensor.
+    Tensor* rt_dense_values_out = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({nvals}),
+                                                     &rt_dense_values_out));
+    auto rt_dense_values = rt_dense_values_out->flat<T>();
+    int value_index = 0;
+    for (int row = 0; row < nrows; ++row) {
+      int64 row_size = rt_nested_splits(row + 1) - rt_nested_splits(row);
+      T value = broadcast_starts ? starts(0) : starts(row);
+      T delta = broadcast_deltas ? deltas(0) : deltas(row);
+      for (int64 i = 0; i < row_size; ++i) {
+        rt_dense_values(value_index++) = T(value);
+        value += delta;
+      }
+    }
+  }
+
+ private:
+  // Returns the number of elements in the specified range.
+  int64 RangeSize(T start, T limit, T delta) {
+    if (((delta > 0) && (limit < start)) || ((delta < 0) && (limit > start))) {
+      return 0;
+    }
+    // The following is copied from tensorflow::RangeOp::Compute().
+    return (std::is_integral<T>::value
+                ? ((std::abs(limit - start) + std::abs(delta) - 1) /
+                   std::abs(delta))
+                : std::ceil(std::abs((limit - start) / delta)));
+  }
+};
+
+#define REGISTER_CPU_KERNEL(TYPE)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("RaggedRange").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
+      RaggedRangeOp<TYPE>);
+TF_CALL_float(REGISTER_CPU_KERNEL);
+TF_CALL_double(REGISTER_CPU_KERNEL);
+TF_CALL_int32(REGISTER_CPU_KERNEL);
+TF_CALL_int64(REGISTER_CPU_KERNEL);
+#undef REGISTER_CPU_KERNEL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_range_op_test.cc b/tensorflow/core/kernels/ragged_range_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..66f097091e833d4e006ae32bce6d370c0e390cec
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_range_op_test.cc
@@ -0,0 +1,224 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class RaggedRangeOpTest : public ::tensorflow::OpsTestBase {
+ protected:
+  // Indices of output tensors.
+  static constexpr int kSplitsOutput = 0;
+  static constexpr int kValuesOutput = 1;
+
+  // Builds the tensorflow test graph for the RaggedRange op.
+  template <typename T>
+  void BuildRaggedRangeGraph() {
+    const auto& dtype = DataTypeToEnum<T>::v();
+    TF_ASSERT_OK(NodeDefBuilder("tested_op", "RaggedRange")
+                     .Input(FakeInput(dtype))  // starts
+                     .Input(FakeInput(dtype))  // limits
+                     .Input(FakeInput(dtype))  // deltas
+                     .Attr("T", dtype)
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+  }
+};
+
+TEST_F(RaggedRangeOpTest, IntValues) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({4}), {0, 5, 8, 5});   // starts
+  AddInputFromArray<int>(TensorShape({4}), {8, 7, 8, 1});   // limits
+  AddInputFromArray<int>(TensorShape({4}), {2, 1, 1, -1});  // deltas
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[0, 2, 4, 6], [5, 6], [], [5, 4, 3, 2]]
+  test::ExpectTensorEqual<int64>(*GetOutput(kSplitsOutput),
+                                 test::AsTensor<int64>({0, 4, 6, 6, 10}));
+  test::ExpectTensorEqual<int>(
+      *GetOutput(kValuesOutput),
+      test::AsTensor<int>({0, 2, 4, 6, 5, 6, 5, 4, 3, 2}));
+}
+
+TEST_F(RaggedRangeOpTest, FloatValues) {
+  BuildRaggedRangeGraph<float>();
+  AddInputFromArray<float>(TensorShape({4}), {0, 5, 8, 5});   // starts
+  AddInputFromArray<float>(TensorShape({4}), {8, 7, 8, 1});   // limits
+  AddInputFromArray<float>(TensorShape({4}), {2, 1, 1, -1});  // deltas
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[0, 2, 4, 6], [5, 6], [], [5, 4, 3, 2]]
+  test::ExpectTensorEqual<int64>(*GetOutput(kSplitsOutput),
+                                 test::AsTensor<int64>({0, 4, 6, 6, 10}));
+  test::ExpectTensorNear<float>(
+      *GetOutput(kValuesOutput),
+      test::AsTensor<float>({0, 2, 4, 6, 5, 6, 5, 4, 3, 2}), 0.1);
+}
+
+TEST_F(RaggedRangeOpTest, BroadcastDeltas) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({3}), {0, 5, 8});  // starts
+  AddInputFromArray<int>(TensorShape({3}), {8, 7, 8});  // limits
+  AddInputFromArray<int>(TensorShape({}), {1});         // deltas
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[0, 1, 2, 3, 4, 5, 6, 7], [5, 6], []]
+  test::ExpectTensorEqual<int64>(*GetOutput(kSplitsOutput),
+                                 test::AsTensor<int64>({0, 8, 10, 10}));
+  test::ExpectTensorEqual<int>(
+      *GetOutput(kValuesOutput),
+      test::AsTensor<int>({0, 1, 2, 3, 4, 5, 6, 7, 5, 6}));
+}
+
+TEST_F(RaggedRangeOpTest, BroadcastLimitsAndDeltas) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({}), {0});         // starts
+  AddInputFromArray<int>(TensorShape({3}), {3, 0, 2});  // limits
+  AddInputFromArray<int>(TensorShape({}), {1});         // deltas
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[0, 1, 2], [], [0, 1]]
+  test::ExpectTensorEqual<int64>(*GetOutput(kSplitsOutput),
+                                 test::AsTensor<int64>({0, 3, 3, 5}));
+  test::ExpectTensorEqual<int>(*GetOutput(kValuesOutput),
+                               test::AsTensor<int>({0, 1, 2, 0, 1}));
+}
+
+TEST_F(RaggedRangeOpTest, BroadcastStartsAndLimits) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({}), {0});         // starts
+  AddInputFromArray<int>(TensorShape({}), {12});        // limits
+  AddInputFromArray<int>(TensorShape({3}), {3, 4, 5});  // deltas
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[0, 3, 6, 9], [0, 4, 8], [0, 5, 10]]]
+  test::ExpectTensorEqual<int64>(*GetOutput(kSplitsOutput),
+                                 test::AsTensor<int64>({0, 4, 7, 10}));
+  test::ExpectTensorEqual<int>(
+      *GetOutput(kValuesOutput),
+      test::AsTensor<int>({0, 3, 6, 9, 0, 4, 8, 0, 5, 10}));
+}
+
+TEST_F(RaggedRangeOpTest, AllScalarInputs) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({}), {0});  // starts
+  AddInputFromArray<int>(TensorShape({}), {5});  // limits
+  AddInputFromArray<int>(TensorShape({}), {1});  // deltas
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[0, 1, 2, 3, 4]
+  test::ExpectTensorEqual<int64>(*GetOutput(kSplitsOutput),
+                                 test::AsTensor<int64>({0, 5}));
+  test::ExpectTensorEqual<int>(*GetOutput(kValuesOutput),
+                               test::AsTensor<int>({0, 1, 2, 3, 4}));
+}
+
+TEST_F(RaggedRangeOpTest, InvalidArgsStarts) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({4, 1}), {0, 5, 8, 5});  // starts
+  AddInputFromArray<int>(TensorShape({4}), {8, 7, 8, 1});     // limits
+  AddInputFromArray<int>(TensorShape({4}), {2, 1, 1, -1});    // deltas
+  EXPECT_EQ("starts must be a scalar or vector", RunOpKernel().error_message());
+}
+
+TEST_F(RaggedRangeOpTest, InvalidArgsLimits) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({4}), {0, 5, 8, 5});     // starts
+  AddInputFromArray<int>(TensorShape({4, 1}), {8, 7, 8, 1});  // limits
+  AddInputFromArray<int>(TensorShape({4}), {2, 1, 1, -1});    // deltas
+  EXPECT_EQ("limits must be a scalar or vector", RunOpKernel().error_message());
+}
+
+TEST_F(RaggedRangeOpTest, InvalidArgsDeltas) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({4}), {0, 5, 8, 5});      // starts
+  AddInputFromArray<int>(TensorShape({4}), {8, 7, 8, 1});      // limits
+  AddInputFromArray<int>(TensorShape({4, 1}), {2, 1, 1, -1});  // deltas
+  EXPECT_EQ("deltas must be a scalar or vector", RunOpKernel().error_message());
+}
+
+TEST_F(RaggedRangeOpTest, InvalidArgsShapeMismatch) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({4}), {0, 5, 8, 5});   // starts
+  AddInputFromArray<int>(TensorShape({3}), {7, 8, 1});      // limits
+  AddInputFromArray<int>(TensorShape({4}), {2, 1, 1, -1});  // deltas
+  EXPECT_EQ("starts, limits, and deltas must have the same shape",
+            RunOpKernel().error_message());
+}
+
+TEST_F(RaggedRangeOpTest, InvalidArgsZeroDelta) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({4}), {0, 5, 8, 5});   // starts
+  AddInputFromArray<int>(TensorShape({4}), {7, 8, 8, 1});   // limits
+  AddInputFromArray<int>(TensorShape({4}), {2, 1, 0, -1});  // deltas
+  EXPECT_EQ("Requires delta != 0", RunOpKernel().error_message());
+}
+
+TEST_F(RaggedRangeOpTest, EmptyRangePositiveDelta) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({2}), {0, 5});  // starts
+  AddInputFromArray<int>(TensorShape({2}), {5, 0});  // limits
+  AddInputFromArray<int>(TensorShape({}), {2});      // deltas
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[0, 2, 4], []]
+  test::ExpectTensorEqual<int64>(*GetOutput(kSplitsOutput),
+                                 test::AsTensor<int64>({0, 3, 3}));
+  test::ExpectTensorEqual<int>(*GetOutput(kValuesOutput),
+                               test::AsTensor<int>({0, 2, 4}));
+}
+
+TEST_F(RaggedRangeOpTest, EmptyRangeNegativeDelta) {
+  BuildRaggedRangeGraph<int>();
+  AddInputFromArray<int>(TensorShape({2}), {0, 5});  // starts
+  AddInputFromArray<int>(TensorShape({2}), {5, 0});  // limits
+  AddInputFromArray<int>(TensorShape({}), {-2});     // deltas
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Expected: [[], [5, 3, 1]]
+  test::ExpectTensorEqual<int64>(*GetOutput(kSplitsOutput),
+                                 test::AsTensor<int64>({0, 0, 3}));
+  test::ExpectTensorEqual<int>(*GetOutput(kValuesOutput),
+                               test::AsTensor<int>({5, 3, 1}));
+}
+
+TEST_F(RaggedRangeOpTest, ShapeFn) {
+  // RaggedRange(starts, limits, deltas) -> [splits, values]
+  ShapeInferenceTestOp op("RaggedRange");
+  INFER_OK(op, "?;?;?", "[?];[?]");
+  INFER_OK(op, "[3];[3];[3]", "[4];[?]");
+  INFER_OK(op, "[3];[3];[]", "[4];[?]");  // broadcast deltas
+  INFER_OK(op, "[3];[];[3]", "[4];[?]");  // broadcast limits
+  INFER_OK(op, "[];[3];[3]", "[4];[?]");  // broadcast starts
+  INFER_OK(op, "[];[];[]", "[2];[?]");    // degenerate case: all scalar inputs
+  INFER_ERROR("Shape must be at most rank 1 but is rank 2", op,
+              "[5,5];[5];[5]");
+  INFER_ERROR("Shape must be at most rank 1 but is rank 2", op,
+              "[5];[5,5];[5]");
+  INFER_ERROR("Shape must be at most rank 1 but is rank 2", op,
+              "[5];[5];[5,5]");
+  INFER_ERROR("Dimensions must be equal, but are 4 and 3", op, "[3];[4];[3]");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cd4b8da858f630efa9086d4bd74d58bf65d532a
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc
@@ -0,0 +1,219 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+
+using errors::InvalidArgument;
+
+class RaggedTensorToSparseOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* context) override {
+    // Read the `rt_nested_splits` input & convert to Eigen tensors.
+    OpInputList rt_nested_splits_in;
+    OP_REQUIRES_OK(
+        context, context->input_list("rt_nested_splits", &rt_nested_splits_in));
+    const int64 rt_nested_splits_len = rt_nested_splits_in.size();
+    DCHECK_GT(rt_nested_splits_len, 0);  // Enforced by REGISTER_OP.
+    std::vector<TTypes<int64>::ConstFlat> rt_nested_splits;
+    rt_nested_splits.reserve(rt_nested_splits_len);
+    for (int i = 0; i < rt_nested_splits_len; ++i) {
+      rt_nested_splits.push_back(rt_nested_splits_in[i].flat<int64>());
+    }
+
+    // Read the `rt_dense_values` input.
+    const Tensor& rt_dense_values_in = context->input(rt_nested_splits_len);
+    OP_REQUIRES_OK(context,
+                   ValidateInputs(rt_nested_splits, rt_dense_values_in));
+
+    // Assemble each value in `sparse_indices` using three parts:
+    // - `index_prefix` is the index in dimensions up through the last ragged
+    //   dimension.
+    // - `index_middle` is the index in the last ragged dimension.
+    // - `index_suffix` is the index in the dense value dimensions.
+    std::vector<int64> index_prefix(rt_nested_splits_len);
+    std::vector<std::vector<int64>> index_suffixes =
+        MakeIndexSuffixes(rt_dense_values_in.shape());
+
+    // Allocate the `sparse_indices` output tensor.
+    const int64 nvals =
+        (rt_nested_splits.back()(rt_nested_splits.back().size() - 1) *
+         index_suffixes.size());
+    const int64 indices_len = rt_nested_splits_len + rt_dense_values_in.dims();
+    Tensor* sparse_indices_out = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, TensorShape({nvals, indices_len}),
+                                          &sparse_indices_out));
+    auto sparse_indices = sparse_indices_out->tensor<int64, 2>();
+
+    // pos[i] is the current position in rt_nested_splits[i].  final_pos is a
+    // reference to make it easier to refer to pos[-1].
+    std::vector<int64> pos(rt_nested_splits_len);
+    int64& final_pos = pos[rt_nested_splits_len - 1];
+
+    // Each iteration through the loop, we increment pos[-1], and add indices
+    // for all the values corresponding to
+    // rt_nested_splits[-1][pos[-1]:pos[-1]+1].
+    int next_index = 0;
+    int max_final_pos = rt_nested_splits.back().size() - 1;
+    for (; final_pos < max_final_pos; ++final_pos) {
+      // Update `pos` to skip over completed elements (i.e., elements where
+      // we have already generated indices for all contained values).
+      for (int dim = rt_nested_splits_len - 2; dim >= 0; --dim) {
+        while (IsCompleted(pos, dim, rt_nested_splits)) {
+          pos[dim] += 1;
+        }
+      }
+
+      // Update index_prefix.
+      for (int dim = 0; dim < index_prefix.size(); ++dim) {
+        int start = dim > 0 ? rt_nested_splits[dim - 1](pos[dim - 1]) : 0;
+        index_prefix[dim] = pos[dim] - start;
+      }
+
+      // Get length of the final-ragged-dimension slice.
+      const auto& final_splits = rt_nested_splits[rt_nested_splits_len - 1];
+      int64 slice_len = final_splits(final_pos + 1) - final_splits(final_pos);
+
+      // Add sparse_indices for this slice.
+      for (int64 i = 0; i < slice_len; ++i) {
+        for (const auto& index_suffix : index_suffixes) {
+          int dim = 0;
+          for (int64 index : index_prefix) {  // index_prefix
+            sparse_indices(next_index, dim++) = index;
+          }
+          sparse_indices(next_index, dim++) = i;  // index_middle
+          for (int64 index : index_suffix) {      // index_suffix
+            sparse_indices(next_index, dim++) = index;
+          }
+          DCHECK_EQ(dim, indices_len);
+          ++next_index;
+        }
+      }
+    }
+    DCHECK_EQ(next_index, nvals);
+
+    // Output the `sparse_values` Tensor.
+    if (rt_dense_values_in.dims() == 1) {
+      context->set_output(1, rt_dense_values_in);
+    } else {
+      Tensor sparse_values_out(rt_dense_values_in.dtype());
+      bool shapes_match = sparse_values_out.CopyFrom(
+          rt_dense_values_in, {rt_dense_values_in.NumElements()});
+      DCHECK(shapes_match);
+      context->set_output(1, sparse_values_out);
+    }
+
+    // Output the `sparse_dense_shape` Tensor.
+    int64 ndims = rt_nested_splits_len + rt_dense_values_in.dims();
+    Tensor* sparse_dense_shape_out = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(2, TensorShape({ndims}),
+                                                     &sparse_dense_shape_out));
+    auto sparse_dense_shape = sparse_dense_shape_out->vec<int64>();
+    sparse_dense_shape(0) = rt_nested_splits_in[0].dim_size(0) - 1;
+    for (int dim = 0; dim < rt_nested_splits_len; ++dim) {
+      const auto& splits = rt_nested_splits[dim];
+      int64 max_width = 0;
+      for (int i = 1; i < splits.size(); ++i) {
+        max_width = std::max(max_width, splits(i) - splits(i - 1));
+      }
+      sparse_dense_shape(dim + 1) = max_width;
+    }
+    for (int dim = 1; dim < rt_dense_values_in.dims(); ++dim) {
+      sparse_dense_shape(dim + rt_nested_splits_len) =
+          rt_dense_values_in.dim_size(dim);
+    }
+  }
+
+ private:
+  // Validate `rt_nested_splits` to ensure we don't get any segfaults.
+  static ::tensorflow::Status ValidateInputs(
+      std::vector<TTypes<int64>::ConstFlat> rt_nested_splits,
+      const Tensor& rt_dense_values_in) {
+    for (int i = 0; i < rt_nested_splits.size(); ++i) {
+      if (rt_nested_splits[i].size() == 0) {
+        return InvalidArgument("ragged splits may not be empty.");
+      }
+      if (rt_nested_splits[i](0) != 0) {
+        return InvalidArgument("First value of ragged splits must be 0.");
+      }
+      if (i > 0) {
+        int64 last_split =
+            rt_nested_splits[i - 1](rt_nested_splits[i - 1].size() - 1);
+        if (rt_nested_splits[i].size() != last_split + 1) {
+          return InvalidArgument(
+              "Final value of ragged splits must match the length "
+              "the corresponding ragged values.");
+        }
+      }
+    }
+    if (rt_dense_values_in.dim_size(0) !=
+        rt_nested_splits.back()(rt_nested_splits.back().size() - 1)) {
+      return InvalidArgument(
+          "Final value of ragged splits must match the length "
+          "the corresponding ragged values.");
+    }
+    return ::tensorflow::Status::OK();
+  }
+
+  // Build a list of index suffixes that should be added for each ragged item,
+  // to encode the indices of dense values in that ragged item.  This basically
+  // just gives a row-major enumeration of all indices in the given tensor
+  // shape, ignoring dim[0] (since that's the dimension that iterates over
+  // values, and we want index suffixes for a single value).  Example:
+  // MakeIndexSuffixes(TensorShape({100, 3, 2})
+  //   --> {{0, 0}, {0, 1}, {1, 0}, {1, 1}, {2, 0}, {2, 1}}
+  static std::vector<std::vector<int64>> MakeIndexSuffixes(
+      const TensorShape& values_shape) {
+    std::vector<std::vector<int64>> suffixes{{}};
+    for (int dim = 1; dim < values_shape.dims(); ++dim) {
+      std::vector<std::vector<int64>> new_suffixes;
+      for (const auto& suffix : suffixes) {
+        for (int i = 0; i < values_shape.dim_size(dim); ++i) {
+          new_suffixes.push_back(suffix);
+          new_suffixes.back().push_back(i);
+        }
+      }
+      suffixes.swap(new_suffixes);
+    }
+    return suffixes;
+  }
+
+  // Returns true if the ragged element at pos[dim] is "completed".  A ragged
+  // element is completed if we have already generated indices for all of its
+  // values.
+  static bool IsCompleted(
+      const std::vector<int64>& pos, int dim,
+      const std::vector<TTypes<int64>::ConstFlat>& rt_nested_splits) {
+    int64 current_child = pos[dim + 1];
+    int64 limit_child = rt_nested_splits[dim](pos[dim] + 1);
+    return current_child >= limit_child;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("RaggedTensorToSparse").Device(DEVICE_CPU),
+                        RaggedTensorToSparseOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel_test.cc b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e53e26701666cc47234cb5325ece209ea4ced187
--- /dev/null
+++ b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel_test.cc
@@ -0,0 +1,224 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+
+namespace tensorflow {
+namespace {
+
+class RaggedTensorToSparseTest : public ::tensorflow::OpsTestBase {
+ protected:
+  static constexpr int kSparseIndicesOutput = 0;
+  static constexpr int kSparseValuesOutput = 1;
+  static constexpr int kSparseDenseShapeOutput = 2;
+  // Builds the tensorflow test graph for the RaggedTensorToSparse op, and
+  // populates the `splits` input with the given values.
+  template <typename T>
+  void BuildRaggedTensorToSparseGraph(
+      const std::vector<std::vector<int64>>& rt_nested_splits,
+      const TensorShape& rt_dense_values_shape,
+      const std::vector<T>& rt_dense_values) {
+    const auto& dtype = DataTypeToEnum<T>::v();
+    int64 num_splits = rt_nested_splits.size();
+    TF_ASSERT_OK(NodeDefBuilder("tested_op", "RaggedTensorToSparse")
+                     .Input(FakeInput(num_splits))  // rt_nested_splits
+                     .Input(FakeInput(dtype))       // rt_dense_values
+                     .Attr("RAGGED_RANK", num_splits)
+                     .Attr("T", dtype)
+                     .Finalize(node_def()));
+    TF_ASSERT_OK(InitOp());
+    for (const auto& splits : rt_nested_splits) {
+      int64 splits_size = splits.size();
+      AddInputFromArray<int64>(TensorShape({splits_size}), splits);
+    }
+    AddInputFromArray<T>(rt_dense_values_shape, rt_dense_values);
+  }
+};
+
+TEST_F(RaggedTensorToSparseTest, OneSplits_Values1D) {
+  // ragged_tensor=[[1, 2, 3], [], [4, 5], [6]]
+  BuildRaggedTensorToSparseGraph<int>({{0, 3, 3, 5, 6}},    // splits
+                                      TensorShape({6}),     // values.shape
+                                      {1, 2, 3, 4, 5, 6});  // values
+  TF_ASSERT_OK(RunOpKernel());
+  test::ExpectTensorEqual<int64>(
+      *GetOutput(kSparseIndicesOutput),
+      test::AsTensor<int64>({0, 0, 0, 1, 0, 2, 2, 0, 2, 1, 3, 0}, {6, 2}));
+  test::ExpectTensorEqual<int>(*GetOutput(kSparseValuesOutput),
+                               test::AsTensor<int>({1, 2, 3, 4, 5, 6}));
+  test::ExpectTensorEqual<int64>(*GetOutput(kSparseDenseShapeOutput),
+                                 test::AsTensor<int64>({4, 3}));
+}
+
+TEST_F(RaggedTensorToSparseTest, EmptyRows) {
+  // Empty rows at the beginning, middle, and end of the RaggedTensor.
+  // ragged_tensor=[[], [1, 2, 3, 4], [], [5, 6], []]
+  BuildRaggedTensorToSparseGraph<int>({{0, 0, 4, 4, 6, 6}},  // splits
+                                      TensorShape({6}),      // values.shape
+                                      {1, 2, 3, 4, 5, 6});   // values
+  TF_ASSERT_OK(RunOpKernel());
+  test::ExpectTensorEqual<int64>(
+      *GetOutput(kSparseIndicesOutput),
+      test::AsTensor<int64>({1, 0, 1, 1, 1, 2, 1, 3, 3, 0, 3, 1}, {6, 2}));
+  test::ExpectTensorEqual<int>(*GetOutput(kSparseValuesOutput),
+                               test::AsTensor<int>({1, 2, 3, 4, 5, 6}));
+  test::ExpectTensorEqual<int64>(*GetOutput(kSparseDenseShapeOutput),
+                                 test::AsTensor<int64>({5, 4}));
+}
+
+TEST_F(RaggedTensorToSparseTest, OneSplits_Values2D) {
+  // ragged_tensor=[[[1, 2], [3, 4], [5, 6]], [], [[7, 8], [9, 10]], [[11, 12]]]
+  BuildRaggedTensorToSparseGraph<int>(
+      {{0, 3, 3, 5, 6}},                         // splits
+      TensorShape({6, 2}),                       // values.shape
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});  // values
+  TF_ASSERT_OK(RunOpKernel());
+  std::vector<int64> expected_splits_12_3 = {
+      0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 1,
+      2, 0, 0, 2, 0, 1, 2, 1, 0, 2, 1, 1, 3, 0, 0, 3, 0, 1};
+  std::vector<int> expected_values = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  test::ExpectTensorEqual<int64>(
+      *GetOutput(kSparseIndicesOutput),
+      test::AsTensor<int64>(expected_splits_12_3, {12, 3}));
+  test::ExpectTensorEqual<int>(*GetOutput(kSparseValuesOutput),
+                               test::AsTensor<int>(expected_values));
+  test::ExpectTensorEqual<int64>(*GetOutput(kSparseDenseShapeOutput),
+                                 test::AsTensor<int64>({4, 3, 2}));
+}
+
+TEST_F(RaggedTensorToSparseTest, TwoSplits_Values1D) {
+  // ragged_tensor =
+  //        0             1           2
+  // -+--------------------------------------
+  // 0| [[ [x],         [x x],       [] ],
+  // 1|  [                              ],
+  // 2|  [ [x x x x x], [x x x]         ],
+  // 3|  [ [],          [x x x x]       ]]
+  BuildRaggedTensorToSparseGraph<int>(
+      {{0, 3, 3, 5, 7}, {0, 1, 3, 3, 8, 11, 11, 15}},        // splits
+      TensorShape({15}),                                     // values.shape
+      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});  // values
+  TF_ASSERT_OK(RunOpKernel());
+  std::vector<int64> expected_splits_15_3 = {
+      0, 0, 0, 0, 1, 0, 0, 1, 1, 2, 0, 0, 2, 0, 1, 2, 0, 2, 2, 0, 3, 2, 0,
+      4, 2, 1, 0, 2, 1, 1, 2, 1, 2, 3, 1, 0, 3, 1, 1, 3, 1, 2, 3, 1, 3};
+  std::vector<int> expected_values = {1, 2,  3,  4,  5,  6,  7, 8,
+                                      9, 10, 11, 12, 13, 14, 15};
+  test::ExpectTensorEqual<int>(*GetOutput(kSparseValuesOutput),
+                               test::AsTensor<int>(expected_values));
+  test::ExpectTensorEqual<int64>(
+      *GetOutput(kSparseIndicesOutput),
+      test::AsTensor<int64>(expected_splits_15_3, {15, 3}));
+  test::ExpectTensorEqual<int64>(*GetOutput(kSparseDenseShapeOutput),
+                                 test::AsTensor<int64>({4, 3, 5}));
+}
+
+TEST_F(RaggedTensorToSparseTest, ShapeFn) {
+  // RaggedSplitsToIndices(rt_nested_splits+, rt_dense_values)
+  //     -> [sparse_indices, sparse_values, sparse_dense_shape]
+  // The output shape will always have the following form:
+  //     [nvals, dense_dims];[nvals];[dense_dims]
+  ShapeInferenceTestOp op("RaggedTensorToSparse");
+
+  // Tests with len(rt_nested_splits)==0.
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(0);
+  INFER_ERROR("Requires RAGGED_RANK>0", op, "?");
+
+  // Tests with len(rt_nested_splits)==1.
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(1);
+  INFER_OK(op, "?;?", "[?,?];[?];[?]");          // nvals=?, dense_dims=?
+  INFER_OK(op, "?;[?]", "[?,2];[?];[2]");        // nvals=?, dense_dims=2
+  INFER_OK(op, "?;[?,?]", "[?,3];[?];[3]");      // nvals=?, dense_dims=3
+  INFER_OK(op, "[?];[5]", "[5,2];[5];[2]");      // nvals=5, dense_dims=2
+  INFER_OK(op, "[?];[5,2]", "[10,3];[10];[3]");  // nvals=10, dense_dims=3
+  INFER_ERROR("Shape must be rank 1 but is rank 0", op, "[];?");
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "[5,5];?");
+  INFER_ERROR("Shape must be at least rank 1 but is rank 0", op, "?;[]");
+
+  // Tests with len(rt_nested_splits)==2
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(2);
+  INFER_OK(op, "?;?;?", "[?,?];[?];[?]");            // nvals=?, dense_dims=?
+  INFER_OK(op, "?;?;[?]", "[?,3];[?];[3]");          // nvals=?, dense_dims=3
+  INFER_OK(op, "?;?;[?,?]", "[?,4];[?];[4]");        // nvals=?, dense_dims=4
+  INFER_OK(op, "[?];[?];[5]", "[5,3];[5];[3]");      // nvals=5, dense_dims=3
+  INFER_OK(op, "[?];[?];[5,2]", "[10,4];[10];[4]");  // nvals=10, dense_dims=4
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[5,5];?");
+
+  // Tests with len(rt_nested_splits)==3
+  (*op.node_def.mutable_attr())["RAGGED_RANK"].set_i(3);
+  INFER_OK(op, "?;?;?;?", "[?,?];[?];[?]");    // nvals=?, dense_dims=?
+  INFER_OK(op, "?;?;?;[?]", "[?,4];[?];[4]");  // nvals=?, dense_dims=4
+  INFER_OK(op, "?;?;?;[5]", "[5,4];[5];[4]");  // nvals=5, dense_dims=4
+}
+
+TEST_F(RaggedTensorToSparseTest, NoSplits) {
+  const auto& dtype = DataTypeToEnum<int>::v();
+  TF_ASSERT_OK(NodeDefBuilder("tested_op", "RaggedTensorToSparse")
+                   .Input(FakeInput(0))
+                   .Input(FakeInput(dtype))
+                   .Attr("RAGGED_RANK", 0)
+                   .Attr("T", dtype)
+                   .Finalize(node_def()));
+  EXPECT_TRUE(str_util::StartsWith(
+      InitOp().error_message(),
+      "Value for attr 'RAGGED_RANK' of 0 must be at least minimum 1"));
+}
+
+TEST_F(RaggedTensorToSparseTest, InvalidArg_BadSplitStart) {
+  BuildRaggedTensorToSparseGraph<int>({{5, 7, 10}},      // splits
+                                      TensorShape({0}),  // values.shape
+                                      {});               // values
+  EXPECT_EQ("First value of ragged splits must be 0.",
+            RunOpKernel().error_message());
+}
+
+TEST_F(RaggedTensorToSparseTest, InvalidArg_BadSplitLengths1) {
+  BuildRaggedTensorToSparseGraph<int>({{0, 5}, {0, 2, 4, 6}},  // splits
+                                      TensorShape({0}),        // values.shape
+                                      {});                     // values
+  EXPECT_EQ(
+      "Final value of ragged splits must match the length "
+      "the corresponding ragged values.",
+      RunOpKernel().error_message());
+}
+
+TEST_F(RaggedTensorToSparseTest, InvalidArg_BadSplitLengths2) {
+  BuildRaggedTensorToSparseGraph<int>({{0, 5}},          // splits
+                                      TensorShape({0}),  // values.shape
+                                      {});               // values
+  EXPECT_EQ(
+      "Final value of ragged splits must match the length "
+      "the corresponding ragged values.",
+      RunOpKernel().error_message());
+}
+
+TEST_F(RaggedTensorToSparseTest, InvalidArg_EmptySplits) {
+  BuildRaggedTensorToSparseGraph<int>({{}},              // splits
+                                      TensorShape({0}),  // values.shape
+                                      {});               // values
+  EXPECT_EQ("ragged splits may not be empty.", RunOpKernel().error_message());
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 04a53697c090afa0aa3489707d287a2723d0498c..3810d817ca9b4bebe5e98203b1d31024a2deb10b 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -489,13 +489,15 @@ class RandomGammaOp : public OpKernel {
       Name("RandomGamma").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"),        \
       RandomGammaOp<TYPE>)
 
-#define REGISTER_INT(IntType)                                   \
-  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
-                              .Device(DEVICE_CPU)               \
-                              .HostMemory("shape")              \
-                              .HostMemory("minval")             \
-                              .HostMemory("maxval")             \
-                              .TypeConstraint<IntType>("Tout"), \
+#define REGISTER_INT(IntType)                                                 \
+  template struct functor::FillPhiloxRandom<                                  \
+      CPUDevice, random::UniformDistribution<random::PhiloxRandom, IntType>>; \
+  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")                            \
+                              .Device(DEVICE_CPU)                             \
+                              .HostMemory("shape")                            \
+                              .HostMemory("minval")                           \
+                              .HostMemory("maxval")                           \
+                              .TypeConstraint<IntType>("Tout"),               \
                           RandomUniformIntOp<CPUDevice, IntType>);
 
 TF_CALL_half(REGISTER);
@@ -538,14 +540,16 @@ TF_CALL_int64(REGISTER_INT);
           random::TruncatedNormalDistribution<                                 \
               random::SingleSampleAdapter<random::PhiloxRandom>, TYPE>>);
 
-#define REGISTER_INT(IntType)                                   \
-  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")              \
-                              .Device(DEVICE_GPU)               \
-                              .HostMemory("shape")              \
-                              .HostMemory("minval")             \
-                              .HostMemory("maxval")             \
-                              .TypeConstraint<int32>("T")       \
-                              .TypeConstraint<IntType>("Tout"), \
+#define REGISTER_INT(IntType)                                                 \
+  template struct functor::FillPhiloxRandom<                                  \
+      GPUDevice, random::UniformDistribution<random::PhiloxRandom, IntType>>; \
+  REGISTER_KERNEL_BUILDER(Name("RandomUniformInt")                            \
+                              .Device(DEVICE_GPU)                             \
+                              .HostMemory("shape")                            \
+                              .HostMemory("minval")                           \
+                              .HostMemory("maxval")                           \
+                              .TypeConstraint<int32>("T")                     \
+                              .TypeConstraint<IntType>("Tout"),               \
                           RandomUniformIntOp<GPUDevice, IntType>);
 
 TF_CALL_half(REGISTER);
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index 3393b39faf4a25791b48af99a5e474f3e9bfbfce..edb2b10e3d69b6ac93c13b875d00fa9de7ed5362 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -217,9 +217,9 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
     OpKernelContext*, const GPUDevice& d, random::PhiloxRandom gen,
     typename Distribution::ResultElementType* data, int64 size,
     Distribution dist) {
-  const int32 block_size = d.maxCudaThreadsPerBlock();
+  const int32 block_size = d.maxGpuThreadsPerBlock();
   const int32 num_blocks =
-      (d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor()) /
+      (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
       block_size;
 
   FillPhiloxRandomKernelLaunch<Distribution>
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index bb8254eaacf97f514918c3ae462be3ebfcc53799..e9cf36c62b966f5f91cf7764421f0c1ff6c131fc 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -218,7 +218,11 @@ __global__ void RowReduceKernel(
     T in, outT out, int num_rows, int num_cols, Op op,
     typename std::iterator_traits<T>::value_type initVal) {
   typedef typename std::iterator_traits<T>::value_type value_type;
-  const int row = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
+  // Defensive index computation to avoid integer overflow.
+  assert(blockDim.x % 32 == 0);
+  int warps_per_block = blockDim.x / 32;
+  int warp_index = threadIdx.x / 32;
+  const int row = blockIdx.x * warps_per_block + warp_index;
   const int lane = threadIdx.x % 32;
 
   if (num_cols == 1) {
@@ -526,27 +530,27 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
         init);
     return;
   }
-  std::size_t temp_storage_bytes = 0;
 
-  Tensor temp_storage;
-  // written as a loop because it reduces clutter
-  // first pass allocates memory, second launches kernel(s)
-  for (int i = 0; i < 2; ++i) {
-    auto success = cub::DeviceReduce::Reduce(
-        i == 0 ? nullptr : temp_storage.flat<int8_t>().data(),
-        temp_storage_bytes, in, out, in_size, op, init, cu_stream);
+  size_t temp_storage_bytes = 0;
+  auto reduce = [&](void* temp_storage_ptr) {
+    auto success =
+        cub::DeviceReduce::Reduce(temp_storage_ptr, temp_storage_bytes, in, out,
+                                  in_size, op, init, cu_stream);
 
     OP_REQUIRES(
         ctx, success == 0,
         errors::Internal("CUB reduce error", cudaGetErrorString(success)));
+  };
 
-    if (i == 0)
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(
-              DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-              &temp_storage));
-  }
+  reduce(nullptr);  // Get required amount of temp storage.
+
+  Tensor temp_storage;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &temp_storage));
+
+  reduce(temp_storage.flat<int8_t>().data());  // Do reduction.
 }
 
 template <typename T, typename Op, typename OUT_T, typename IN_T>
@@ -569,25 +573,26 @@ void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows,
   cub::TransformInputIterator<int, RowOffset, cub::CountingInputIterator<int>>
       transform_iter(counting_iter, row_offset_op);
 
-  std::size_t temp_storage_bytes = 0;
-  Tensor temp_storage;
-  for (int i = 0; i < 2; ++i) {
+  size_t temp_storage_bytes = 0;
+  auto reduce = [&](void* temp_storage_ptr) {
     auto success = cub::DeviceSegmentedReduce::Reduce(
-        i == 0 ? nullptr : temp_storage.flat<int8_t>().data(),
-        temp_storage_bytes, in, out, num_rows, transform_iter,
+        temp_storage_ptr, temp_storage_bytes, in, out, num_rows, transform_iter,
         transform_iter + 1, op, init, cu_stream);
 
     OP_REQUIRES(ctx, success == 0,
                 errors::Internal("CUB segmented reduce error",
                                  cudaGetErrorString(success)));
+  };
 
-    if (i == 0)
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(
-              DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-              &temp_storage));
-  }
+  reduce(nullptr);  // Get required amount of temp storage.
+
+  Tensor temp_storage;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &temp_storage));
+
+  reduce(temp_storage.flat<int8_t>().data());  // Do reduction.
 }
 
 template <typename T, typename Op, typename OUT_T, typename IN_T>
@@ -720,25 +725,25 @@ void Launch3DXZReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x,
                                                                  gather_iter);
 
   std::size_t temp_storage_bytes = 0;
-  Tensor temp_storage;
-
-  for (int i = 0; i < 2; ++i) {
+  auto reduce = [&](void* temp_storage_ptr) {
     auto success = cub::DeviceSegmentedReduce::Reduce(
-        i == 0 ? nullptr : temp_storage.flat<int8_t>().data(),
-        temp_storage_bytes, permute_iter, out, extent_y, transform_iter,
-        transform_iter + 1, op, init, cu_stream);
+        temp_storage_ptr, temp_storage_bytes, permute_iter, out, extent_y,
+        transform_iter, transform_iter + 1, op, init, cu_stream);
 
     OP_REQUIRES(ctx, success == 0,
                 errors::Internal("CUB segmented reduce error",
                                  cudaGetErrorString(success)));
+  };
 
-    if (i == 0)
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(
-              DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
-              &temp_storage));
-  }
+  reduce(nullptr);  // Get required amount of temp storage.
+
+  Tensor temp_storage;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &temp_storage));
+
+  reduce(temp_storage.flat<int8_t>().data());  // Do reduction.
 }
 
 namespace reduction_op_helper {
@@ -880,11 +885,11 @@ struct ReduceFunctor<GPUDevice, Eigen::internal::SumReducer<T>> {
 };
 
 template <typename T>
-struct ReduceFunctor<GPUDevice, Eigen::internal::MeanReducer<T>> {
+struct ReduceFunctor<GPUDevice, functor::MeanReducer<T>> {
   template <typename OUT_T, typename IN_T, typename ReductionAxes>
   static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in,
                      const ReductionAxes& reduction_axes,
-                     const Eigen::internal::MeanReducer<T>& reducer) {
+                     const functor::MeanReducer<T>& reducer) {
     int divisor = 1;
     if (out.rank() == 0)
       divisor = in.size();
@@ -910,17 +915,17 @@ struct ReduceFunctor<GPUDevice, Eigen::internal::MeanReducer<T>> {
 
   template <typename OUT_T>
   static void FillIdentity(const GPUDevice& d, OUT_T out,
-                           const Eigen::internal::MeanReducer<T>& reducer) {
+                           const functor::MeanReducer<T>& reducer) {
     FillIdentityEigenImpl(d, To32Bit(out), reducer);
   }
 };
 
 template <>
-struct ReduceFunctor<GPUDevice, Eigen::internal::MeanReducer<Eigen::half>> {
+struct ReduceFunctor<GPUDevice, functor::MeanReducer<Eigen::half>> {
   template <typename OUT_T, typename IN_T, typename ReductionAxes>
   static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in,
                      const ReductionAxes& reduction_axes,
-                     const Eigen::internal::MeanReducer<Eigen::half>& reducer) {
+                     const functor::MeanReducer<Eigen::half>& reducer) {
     float divisor = 1.f;
     if (out.rank() == 0)
       divisor = in.size();
@@ -952,9 +957,8 @@ struct ReduceFunctor<GPUDevice, Eigen::internal::MeanReducer<Eigen::half>> {
   }
 
   template <typename OUT_T>
-  static void FillIdentity(
-      const GPUDevice& d, OUT_T out,
-      const Eigen::internal::MeanReducer<Eigen::half>& reducer) {
+  static void FillIdentity(const GPUDevice& d, OUT_T out,
+                           const functor::MeanReducer<Eigen::half>& reducer) {
     FillIdentityEigenImpl(d, To32Bit(out), reducer);
   }
 };
diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h
index eb264e0e5a73635bf2ec05413aba06862a74d2ed..2331599b72f46df7a34e9553d5bd41a7613409da 100644
--- a/tensorflow/core/kernels/reduction_ops.h
+++ b/tensorflow/core/kernels/reduction_ops.h
@@ -26,13 +26,35 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
+// Dummy class used for template specialization for mean reduction, which is
+// accomplished by SumReducer and on-the-fly division by the reduction factor.
+template <typename Scalar>
+struct MeanReducer {
+  Scalar initialize() const { return Scalar(0); }
+};
+
 template <typename Device, typename OUT_T, typename IN_T,
           typename ReductionAxes, typename Reducer>
-void ReduceEigenImpl(const Device& d, OUT_T out, IN_T in,
-                     const ReductionAxes& reduction_axes,
-                     const Reducer& reducer) {
-  out.device(d) = in.reduce(reduction_axes, reducer);
-}
+struct ReduceEigenImpl {
+  void operator()(const Device& d, OUT_T out, IN_T in,
+                  const ReductionAxes& reduction_axes, const Reducer& reducer) {
+    out.device(d) = in.reduce(reduction_axes, reducer);
+  }
+};
+
+template <typename Device, typename OUT_T, typename IN_T,
+          typename ReductionAxes, typename Scalar>
+struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
+                       functor::MeanReducer<Scalar>> {
+  void operator()(const Device& d, OUT_T out, IN_T in,
+                  const ReductionAxes& reduction_axes,
+                  const functor::MeanReducer<Scalar>& reducer) {
+    static_assert(std::is_same<Scalar, typename OUT_T::Scalar>::value, "");
+    Eigen::internal::SumReducer<Scalar> sum_reducer;
+    out.device(d) = in.reduce(reduction_axes, sum_reducer) /
+                    static_cast<Scalar>(in.size() / out.size());
+  }
+};
 
 // For most reducers, the identity is Reducer::initialize()
 template <typename Reducer>
@@ -46,12 +68,12 @@ struct Identity {
 // MeanReducer is a special case, since it doesn't technically have an identity.
 // Thus, ideally we'd return nan.  However, mean is instantiated for integer
 // types as well, so we do the nan override only for floating point types.
-#define FIX_MEAN_IDENTITY(T)                                    \
-  template <>                                                   \
-  struct Identity<Eigen::internal::MeanReducer<T>> {            \
-    static T identity(const Eigen::internal::MeanReducer<T>&) { \
-      return Eigen::NumTraits<T>::quiet_NaN();                  \
-    }                                                           \
+#define FIX_MEAN_IDENTITY(T)                            \
+  template <>                                           \
+  struct Identity<functor::MeanReducer<T>> {            \
+    static T identity(const functor::MeanReducer<T>&) { \
+      return Eigen::NumTraits<T>::quiet_NaN();          \
+    }                                                   \
   };
 FIX_MEAN_IDENTITY(Eigen::half)
 FIX_MEAN_IDENTITY(float)
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
index d83e1c7d15d22f069318fcff603b133ac305813e..c6c36ec29a782ea25cead95e638b41bced018dec 100644
--- a/tensorflow/core/kernels/reduction_ops_common.h
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -256,7 +256,8 @@ struct ReduceFunctorBase {
                      const ReductionAxes& reduction_axes,
                      const Reducer& reducer) {
     const Device& d = ctx->eigen_device<Device>();
-    ReduceEigenImpl(d, out, in, reduction_axes, reducer);
+    ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes, Reducer> reducer_impl;
+    reducer_impl(d, out, in, reduction_axes, reducer);
   }
 
   template <typename OUT_T>
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
index cb19c084e3984ff9f47e2e7bfd97d13004284300..c44a40b3b38f5a37574d0d81b7b67adcf27451e1 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
@@ -52,7 +52,7 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE_IDENTITY(T, R)
 
 DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::SumReducer<complex128>);
-DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::MeanReducer<complex128>);
+DEFINE_FOR_TYPE_AND_R(complex128, functor::MeanReducer<complex128>);
 DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::ProdReducer<complex128>);
 #undef DEFINE_FOR_TYPE_AND_R
 #undef DEFINE
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
index fa550e594a5c59832f0a3603b2aab425f23b156e..1921130ac043d9d1bfdea415c59aafcedcc31ef3 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
@@ -52,7 +52,7 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE_IDENTITY(T, R)
 
 DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::SumReducer<complex64>);
-DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::MeanReducer<complex64>);
+DEFINE_FOR_TYPE_AND_R(complex64, functor::MeanReducer<complex64>);
 DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::ProdReducer<complex64>);
 #undef DEFINE_FOR_TYPE_AND_R
 #undef DEFINE
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
index de46933f615869b7bc15c6a896e6ad92503c5d67..119f726b929bd9c599e26684fede9890efceb2f2 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
@@ -51,11 +51,11 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                           \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);  \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MeanReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);  \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);  \
+#define DEFINE_FOR_ALL_REDUCERS(T)                          \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(double);
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
index b9d737183977c267416104ab0d59f6c1852b9207..70ba4abac48bcfe10d577a120cf08fdd8650f367 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
@@ -51,11 +51,11 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                           \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);  \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MeanReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);  \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);  \
+#define DEFINE_FOR_ALL_REDUCERS(T)                          \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(float);
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
index 69296c7b65c253050b99eddb262857676925d33f..82f6d7df952fcd8b0aaa3561efd4a4bca93e4dce 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
@@ -51,11 +51,11 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                           \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);  \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MeanReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);  \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);  \
+#define DEFINE_FOR_ALL_REDUCERS(T)                          \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(int32);
diff --git a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
index 2120e22f99cbcff32362408b0bd14c429bbec396..db050fdea38bd6db58424da72ff75e79e9151a09 100644
--- a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
@@ -53,7 +53,7 @@ typedef TTypes<float>::Tensor::Index Index;
 
 #define DEFINE_FOR_ALL_REDUCERS(T)                          \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MeanReducer<T>);
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);
 
 DEFINE_FOR_ALL_REDUCERS(Eigen::half);
 #undef DEFINE_FOR_ALL_REDUCERS
diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc
index f61589f913b14bd99bba8b8a43b01b0213b1ff17..67c974edda284d64a11d6087fc93518b4a86afef 100644
--- a/tensorflow/core/kernels/reduction_ops_mean.cc
+++ b/tensorflow/core/kernels/reduction_ops_mean.cc
@@ -17,39 +17,39 @@ limitations under the License.
 
 namespace tensorflow {
 
-#define REGISTER_CPU_KERNELS(type)                                          \
-  REGISTER_KERNEL_BUILDER(Name("Mean")                                      \
-                              .Device(DEVICE_CPU)                           \
-                              .TypeConstraint<type>("T")                    \
-                              .TypeConstraint<int32>("Tidx"),               \
-                          ReductionOp<CPUDevice, type, int32,               \
-                                      Eigen::internal::MeanReducer<type>>); \
-  REGISTER_KERNEL_BUILDER(Name("Mean")                                      \
-                              .Device(DEVICE_CPU)                           \
-                              .TypeConstraint<type>("T")                    \
-                              .TypeConstraint<int64>("Tidx"),               \
-                          ReductionOp<CPUDevice, type, int64,               \
-                                      Eigen::internal::MeanReducer<type>>);
+#define REGISTER_CPU_KERNELS(type)                                      \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("Mean")                                                      \
+          .Device(DEVICE_CPU)                                           \
+          .TypeConstraint<type>("T")                                    \
+          .TypeConstraint<int32>("Tidx"),                               \
+      ReductionOp<CPUDevice, type, int32, functor::MeanReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("Mean")                                                      \
+          .Device(DEVICE_CPU)                                           \
+          .TypeConstraint<type>("T")                                    \
+          .TypeConstraint<int64>("Tidx"),                               \
+      ReductionOp<CPUDevice, type, int64, functor::MeanReducer<type>>);
 TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 
 #if GOOGLE_CUDA
 
-#define REGISTER_GPU_KERNELS(type)                                          \
-  REGISTER_KERNEL_BUILDER(Name("Mean")                                      \
-                              .Device(DEVICE_GPU)                           \
-                              .TypeConstraint<type>("T")                    \
-                              .TypeConstraint<int32>("Tidx")                \
-                              .HostMemory("reduction_indices"),             \
-                          ReductionOp<GPUDevice, type, int32,               \
-                                      Eigen::internal::MeanReducer<type>>); \
-  REGISTER_KERNEL_BUILDER(Name("Mean")                                      \
-                              .Device(DEVICE_GPU)                           \
-                              .TypeConstraint<type>("T")                    \
-                              .TypeConstraint<int64>("Tidx")                \
-                              .HostMemory("reduction_indices"),             \
-                          ReductionOp<GPUDevice, type, int64,               \
-                                      Eigen::internal::MeanReducer<type>>);
+#define REGISTER_GPU_KERNELS(type)                                      \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("Mean")                                                      \
+          .Device(DEVICE_GPU)                                           \
+          .TypeConstraint<type>("T")                                    \
+          .TypeConstraint<int32>("Tidx")                                \
+          .HostMemory("reduction_indices"),                             \
+      ReductionOp<GPUDevice, type, int32, functor::MeanReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("Mean")                                                      \
+          .Device(DEVICE_GPU)                                           \
+          .TypeConstraint<type>("T")                                    \
+          .TypeConstraint<int64>("Tidx")                                \
+          .HostMemory("reduction_indices"),                             \
+      ReductionOp<GPUDevice, type, int64, functor::MeanReducer<type>>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
 TF_CALL_complex64(REGISTER_GPU_KERNELS);
 TF_CALL_complex128(REGISTER_GPU_KERNELS);
@@ -58,21 +58,21 @@ TF_CALL_complex128(REGISTER_GPU_KERNELS);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNELS(type)                                         \
-  REGISTER_KERNEL_BUILDER(Name("Mean")                                      \
-                              .Device(DEVICE_SYCL)                          \
-                              .TypeConstraint<type>("T")                    \
-                              .TypeConstraint<int32>("Tidx")                \
-                              .HostMemory("reduction_indices"),             \
-                          ReductionOp<SYCLDevice, type, int32,              \
-                                      Eigen::internal::MeanReducer<type>>); \
-  REGISTER_KERNEL_BUILDER(Name("Mean")                                      \
-                              .Device(DEVICE_SYCL)                          \
-                              .TypeConstraint<type>("T")                    \
-                              .TypeConstraint<int64>("Tidx")                \
-                              .HostMemory("reduction_indices"),             \
-                          ReductionOp<SYCLDevice, type, int64,              \
-                                      Eigen::internal::MeanReducer<type>>);
+#define REGISTER_SYCL_KERNELS(type)                                      \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("Mean")                                                       \
+          .Device(DEVICE_SYCL)                                           \
+          .TypeConstraint<type>("T")                                     \
+          .TypeConstraint<int32>("Tidx")                                 \
+          .HostMemory("reduction_indices"),                              \
+      ReductionOp<SYCLDevice, type, int32, functor::MeanReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("Mean")                                                       \
+          .Device(DEVICE_SYCL)                                           \
+          .TypeConstraint<type>("T")                                     \
+          .TypeConstraint<int64>("Tidx")                                 \
+          .HostMemory("reduction_indices"),                              \
+      ReductionOp<SYCLDevice, type, int64, functor::MeanReducer<type>>);
 REGISTER_SYCL_KERNELS(float);
 REGISTER_SYCL_KERNELS(double);
 #undef REGISTER_SYCL_KERNELS
diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
index 5318d8c1339eb5cd9429105082cb50e478d21c41..cf0d0f5c7146430b1135b3b930706b052b16619f 100644
--- a/tensorflow/core/kernels/reduction_ops_sum.cc
+++ b/tensorflow/core/kernels/reduction_ops_sum.cc
@@ -51,6 +51,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .HostMemory("reduction_indices"),                                    \
       ReductionOp<GPUDevice, type, int64, Eigen::internal::SumReducer<type>>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_int64(REGISTER_GPU_KERNELS);
 TF_CALL_complex64(REGISTER_GPU_KERNELS);
 TF_CALL_complex128(REGISTER_GPU_KERNELS);
 #undef REGISTER_GPU_KERNELS
diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc
index 173fea37ed5e449022befda6c4e640d1dd2a95cd..e67695d54afc9d610ae60f6e6da11c33736e18ba 100644
--- a/tensorflow/core/kernels/relu_op.cc
+++ b/tensorflow/core/kernels/relu_op.cc
@@ -33,19 +33,25 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
-#define REGISTER_RELU_KERNELS(type)                                   \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"),      \
-      ReluOp<CPUDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("ReluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"),  \
-      ReluGradOp<CPUDevice, type>);                                   \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu6").Device(DEVICE_CPU).TypeConstraint<type>("T"),     \
-      Relu6Op<CPUDevice, type>);                                      \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu6Grad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
-      Relu6GradOp<CPUDevice, type>)
+#define REGISTER_RELU_KERNELS(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu").Device(DEVICE_CPU).TypeConstraint<type>("T"),          \
+      ReluOp<CPUDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ReluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"),      \
+      ReluGradOp<CPUDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu6").Device(DEVICE_CPU).TypeConstraint<type>("T"),         \
+      Relu6Op<CPUDevice, type>);                                          \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu6Grad").Device(DEVICE_CPU).TypeConstraint<type>("T"),     \
+      Relu6GradOp<CPUDevice, type>)                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("LeakyRelu").Device(DEVICE_CPU).TypeConstraint<type>("T"),     \
+      LeakyReluOp<CPUDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("LeakyReluGrad").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      LeakyReluGradOp<CPUDevice, type>);
 
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_RELU_KERNELS);
 #undef REGISTER_RELU_KERNELS
@@ -99,6 +105,19 @@ namespace functor {
   extern template struct Relu6Grad<GPUDevice, T>;                              \
                                                                                \
   template <>                                                                  \
+  void LeakyRelu<GPUDevice, T>::operator()(                                    \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor features, T alpha,   \
+      typename TTypes<T>::Tensor activations);                                 \
+  extern template struct LeakyRelu<GPUDevice, T>;                              \
+                                                                               \
+  template <>                                                                  \
+  void LeakyReluGrad<GPUDevice, T>::operator()(                                \
+      const GPUDevice& d, typename TTypes<T>::ConstTensor gradients,           \
+      typename TTypes<T>::ConstTensor features, T alpha,                       \
+      typename TTypes<T>::Tensor backprops);                                   \
+  extern template struct LeakyReluGrad<GPUDevice, T>;                          \
+                                                                               \
+  template <>                                                                  \
   void Elu<GPUDevice, T>::operator()(const GPUDevice& d,                       \
                                      typename TTypes<T>::ConstTensor features, \
                                      typename TTypes<T>::Tensor activations);  \
@@ -134,30 +153,36 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
 }  // namespace functor
 
 // Registration of the GPU implementations.
-#define REGISTER_GPU_KERNELS(type)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu").Device(DEVICE_GPU).TypeConstraint<type>("T"),      \
-      ReluOp<GPUDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("ReluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
-      ReluGradOp<GPUDevice, type>);                                   \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu6").Device(DEVICE_GPU).TypeConstraint<type>("T"),     \
-      Relu6Op<GPUDevice, type>);                                      \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Relu6Grad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
-      Relu6GradOp<GPUDevice, type>);                                  \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Elu").Device(DEVICE_GPU).TypeConstraint<type>("T"),       \
-      EluOp<GPUDevice, type>);                                        \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("EluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),   \
-      EluGradOp<GPUDevice, type>);                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("Selu").Device(DEVICE_GPU).TypeConstraint<type>("T"),      \
-      SeluOp<GPUDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                            \
-      Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),  \
+#define REGISTER_GPU_KERNELS(type)                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu").Device(DEVICE_GPU).TypeConstraint<type>("T"),          \
+      ReluOp<GPUDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("ReluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),      \
+      ReluGradOp<GPUDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu6").Device(DEVICE_GPU).TypeConstraint<type>("T"),         \
+      Relu6Op<GPUDevice, type>);                                          \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Relu6Grad").Device(DEVICE_GPU).TypeConstraint<type>("T"),     \
+      Relu6GradOp<GPUDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("LeakyRelu").Device(DEVICE_GPU).TypeConstraint<type>("T"),     \
+      LeakyReluOp<GPUDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("LeakyReluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      LeakyReluGradOp<GPUDevice, type>);                                  \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Elu").Device(DEVICE_GPU).TypeConstraint<type>("T"),           \
+      EluOp<GPUDevice, type>);                                            \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("EluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),       \
+      EluGradOp<GPUDevice, type>);                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("Selu").Device(DEVICE_GPU).TypeConstraint<type>("T"),          \
+      SeluOp<GPUDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("SeluGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"),      \
       SeluGradOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
@@ -188,30 +213,36 @@ REGISTER_KERNEL_BUILDER(
 
 #ifdef TENSORFLOW_USE_SYCL
 // Registration of the GPU implementations.
-#define REGISTER_SYCL_KERNELS(type)                                    \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Relu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
-      ReluOp<SYCLDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("ReluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
-      ReluGradOp<SYCLDevice, type>);                                   \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Relu6").Device(DEVICE_SYCL).TypeConstraint<type>("T"),     \
-      Relu6Op<SYCLDevice, type>);                                      \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Relu6Grad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
-      Relu6GradOp<SYCLDevice, type>);                                  \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Elu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),       \
-      EluOp<SYCLDevice, type>);                                        \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),   \
-      EluGradOp<SYCLDevice, type>);                                    \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("Selu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
-      SeluOp<SYCLDevice, type>);                                       \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),  \
+#define REGISTER_SYCL_KERNELS(type)                                        \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Relu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),          \
+      ReluOp<SYCLDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("ReluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
+      ReluGradOp<SYCLDevice, type>);                                       \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Relu6").Device(DEVICE_SYCL).TypeConstraint<type>("T"),         \
+      Relu6Op<SYCLDevice, type>);                                          \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Relu6Grad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),     \
+      Relu6GradOp<SYCLDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("LeakyRelu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),     \
+      LeakyReluOp<SYCLDevice, type>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("LeakyReluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"), \
+      LeakyReluGradOp<SYCLDevice, type>);                                  \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Elu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),           \
+      EluOp<SYCLDevice, type>);                                            \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("EluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),       \
+      EluGradOp<SYCLDevice, type>);                                        \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("Selu").Device(DEVICE_SYCL).TypeConstraint<type>("T"),          \
+      SeluOp<SYCLDevice, type>);                                           \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("SeluGrad").Device(DEVICE_SYCL).TypeConstraint<type>("T"),      \
       SeluGradOp<SYCLDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNELS);
diff --git a/tensorflow/core/kernels/relu_op.h b/tensorflow/core/kernels/relu_op.h
index 4775deeb61ead23369ead19b08f74675db3a5146..a4638c70c2c213909dc02b7860e7a9160c51b476 100644
--- a/tensorflow/core/kernels/relu_op.h
+++ b/tensorflow/core/kernels/relu_op.h
@@ -131,6 +131,67 @@ void Relu6GradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
           output->flat<T>());
 }
 
+template <typename Device, typename T>
+class LeakyReluOp : public UnaryElementWiseOp<T, LeakyReluOp<Device, T>> {
+ public:
+  explicit LeakyReluOp(OpKernelConstruction* context)
+      : UnaryElementWiseOp<T, LeakyReluOp<Device, T>>(context) {
+    float alpha_tmp;
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_tmp));
+    alpha_ = T(alpha_tmp);
+  }
+
+  void Operate(OpKernelContext* context, const Tensor& input, Tensor* output) {
+    functor::LeakyRelu<Device, T> functor;
+    functor(context->eigen_device<Device>(), input.flat<T>(), alpha_,
+            output->flat<T>());
+  }
+
+ private:
+  T alpha_;
+};
+
+template <typename Device, typename T>
+class LeakyReluGradOp
+    : public BinaryElementWiseOp<T, LeakyReluGradOp<Device, T>> {
+ public:
+  explicit LeakyReluGradOp(OpKernelConstruction* context)
+      : BinaryElementWiseOp<T, LeakyReluGradOp<Device, T>>(context) {
+    float alpha_tmp;
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_tmp));
+    alpha_ = T(alpha_tmp);
+  }
+
+  void OperateNoTemplate(OpKernelContext* context, const Tensor& g,
+                         const Tensor& a, T alpha, Tensor* output);
+
+  // INPUTS:
+  //   g (gradients): backpropagated gradients
+  //   a (inputs): either the inputs that were passed to LeakyReluOp(), or its
+  //               outputs (using either one yields the same result here).
+  // OUTPUT:
+  //   gradients to backprop
+  template <int NDIMS>
+  void Operate(OpKernelContext* context, const Tensor& g, const Tensor& a,
+               Tensor* output) {
+    OperateNoTemplate(context, g, a, alpha_, output);
+  }
+
+ private:
+  T alpha_;
+};
+
+template <typename Device, typename T>
+void LeakyReluGradOp<Device, T>::OperateNoTemplate(OpKernelContext* context,
+                                                   const Tensor& g,
+                                                   const Tensor& a, T alpha,
+                                                   Tensor* output) {
+  if (!ReluHelpers::ValidateSameSize(context, g, a)) return;
+  functor::LeakyReluGrad<Device, T> functor;
+  functor(context->eigen_device<Device>(), g.flat<T>(), a.flat<T>(), alpha,
+          output->flat<T>());
+};
+
 template <typename Device, typename T>
 class EluOp : public UnaryElementWiseOp<T, EluOp<Device, T>> {
  public:
diff --git a/tensorflow/core/kernels/relu_op_functor.h b/tensorflow/core/kernels/relu_op_functor.h
index e564da335ac2ba5616db37bed8bc818c7b1515ad..f917142a12d80c5f51803087382f37768593a400 100644
--- a/tensorflow/core/kernels/relu_op_functor.h
+++ b/tensorflow/core/kernels/relu_op_functor.h
@@ -91,6 +91,36 @@ struct Relu6Grad {
   }
 };
 
+// Functor used by LeakyReluOp to do the computations.
+template <typename Device, typename T>
+struct LeakyRelu {
+  // Computes LeakyRelu activation.
+  //
+  // features: any shape.
+  // activations: same shape as "features".
+  void operator()(const Device& d, typename TTypes<T>::ConstTensor features,
+                  T alpha, typename TTypes<T>::Tensor activations) {
+    activations.device(d) = features.cwiseMax(features * alpha);
+  }
+};
+
+// Functor used by LeakyReluGradOp to do the computations.
+template <typename Device, typename T>
+struct LeakyReluGrad {
+  // Computes LeakyReluGrad backprops.
+  //
+  // gradients: gradients backpropagated to the LeakyRelu op.
+  // features: either the inputs that were passed to the LeakyRelu or, or its
+  //           outputs (using either one yields the same result here).
+  // backprops: gradients to backpropagate to the LeakyRelu inputs.
+  void operator()(const Device& d, typename TTypes<T>::ConstTensor gradients,
+                  typename TTypes<T>::ConstTensor features, T alpha,
+                  typename TTypes<T>::Tensor backprops) {
+    backprops.device(d) =
+        (features > static_cast<T>(0)).select(gradients, gradients * alpha);
+  }
+};
+
 // Functor used by EluOp to do the computations.
 template <typename Device, typename T>
 struct Elu {
diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc
index b9391517c17b680d130d8a7100c5e5907e643d70..dd5f9495e2c778bf5cc3f44f384f40ff1025888b 100644
--- a/tensorflow/core/kernels/relu_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc
@@ -145,14 +145,16 @@ struct Relu<Device, qint8> {
 }  // namespace functor
 
 // Definition of the GPU implementations declared in relu_op.cc.
-#define DEFINE_GPU_KERNELS(T)                       \
-  template struct functor::Relu<GPUDevice, T>;      \
-  template struct functor::ReluGrad<GPUDevice, T>;  \
-  template struct functor::Relu6<GPUDevice, T>;     \
-  template struct functor::Relu6Grad<GPUDevice, T>; \
-  template struct functor::Elu<GPUDevice, T>;       \
-  template struct functor::EluGrad<GPUDevice, T>;   \
-  template struct functor::Selu<GPUDevice, T>;      \
+#define DEFINE_GPU_KERNELS(T)                           \
+  template struct functor::Relu<GPUDevice, T>;          \
+  template struct functor::ReluGrad<GPUDevice, T>;      \
+  template struct functor::Relu6<GPUDevice, T>;         \
+  template struct functor::Relu6Grad<GPUDevice, T>;     \
+  template struct functor::LeakyRelu<GPUDevice, T>;     \
+  template struct functor::LeakyReluGrad<GPUDevice, T>; \
+  template struct functor::Elu<GPUDevice, T>;           \
+  template struct functor::EluGrad<GPUDevice, T>;       \
+  template struct functor::Selu<GPUDevice, T>;          \
   template struct functor::SeluGrad<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 678d675c4a77d13380e9c355e63d2e05dd0a2b7e..170b08b4b7f6c8a6842dd12ad7389900b2d83b86 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -54,6 +54,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/strings/str_join.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -124,12 +125,21 @@ void ReadVariablesOp::Compute(OpKernelContext* ctx) {
   for (size_t i = 0; i < dtypes_.size(); ++i) {
     handles[i] = &HandleFromInput(ctx, i);
   }
-  const auto status = LookupResources(ctx, handles, &variables);
-  OP_REQUIRES(ctx, status.ok(),
-              errors::FailedPrecondition(
-                  "Error while reading resource variable. This could mean that "
-                  "the variable was uninitialized. ",
-                  status.ToString()));
+
+  OP_REQUIRES_OK(ctx, LookupResources(ctx, handles, &variables));
+
+  std::vector<string> uninitialized_vars;
+  for (int64 i = 0; i < variables.size(); i++) {
+    if (variables[i] == nullptr) {
+      uninitialized_vars.push_back(handles[i]->name());
+    }
+  }
+
+  OP_REQUIRES(
+      ctx, uninitialized_vars.empty(),
+      errors::InvalidArgument("In ReadVariableOp the following variables were "
+                              "found uninitialized: ",
+                              absl::StrJoin(uninitialized_vars, ", ")));
 
   for (size_t i = 0; i < dtypes_.size(); ++i) {
     // We're acquiring a reference to the underlying buffer while
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index bb96c42f10c498d0ec3d6a726728cb1e7bc8f111..1c4d0bc1ae9934dbfb8718dfa05202b1d7b38edc 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -373,8 +373,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC);
                           ReverseV2Op<GPUDevice, T, int64>)
 TF_CALL_uint8(REGISTER_GPU_KERNELS);
 TF_CALL_int8(REGISTER_GPU_KERNELS);
-// TODO decide whether we want to enable the bool kernel.
-// TF_CALL_bool(REGISTER_GPU_KERNELS);
+TF_CALL_bool(REGISTER_GPU_KERNELS);
 TF_CALL_half(REGISTER_GPU_KERNELS);
 TF_CALL_float(REGISTER_GPU_KERNELS);
 TF_CALL_double(REGISTER_GPU_KERNELS);
diff --git a/tensorflow/core/kernels/scan_ops_gpu.cu.cc b/tensorflow/core/kernels/scan_ops_gpu.cu.cc
index ed6c6affce54a7e847ede07b329d31411b713bec..ed66c02dc584541ce4d5eb644630b678c1b05916 100644
--- a/tensorflow/core/kernels/scan_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/scan_ops_gpu.cu.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,8 +17,20 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
+#if CUDA_VERSION >= 9000
+#define CUB_USE_COOPERATIVE_GROUPS
+#endif  // CUDA_VERSION >= 9000
+
+#include "third_party/cub/block/block_load.cuh"
+#include "third_party/cub/block/block_scan.cuh"
+#include "third_party/cub/block/block_store.cuh"
+#include "third_party/cub/iterator/counting_input_iterator.cuh"
+#include "third_party/cub/iterator/transform_input_iterator.cuh"
+#include "cuda/include/cuComplex.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/permutation_input_iterator.h"
+#include "tensorflow/core/util/permutation_output_iterator.h"
 
 #include "tensorflow/core/kernels/scan_ops.h"
 
@@ -27,6 +39,258 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::Index Index;
 
+namespace functor {
+
+// Map a contiguous range to the actual memory locations depending on which
+// axis the scan is taking place over and whether or not reversed.
+struct MapIndexToLocation {
+  __host__ __device__ MapIndexToLocation(int dimx, int dimy, int dimz,
+                                         bool reverse = false)
+      : dimx_(dimx), dimy_(dimy), dimz_(dimz), reverse_(reverse) {}
+
+  __host__ __device__ int operator()(int id) const {
+    if (dimx_ == 1) {
+      int row = id % dimy_;
+      int col = id / dimy_;
+
+      if (reverse_) return (dimy_ - row - 1) * dimz_ + col;
+
+      return row * dimz_ + col;
+    } else if (dimz_ == 1) {
+      if (reverse_) {
+        int row = id / dimy_;
+        int col = id % dimy_;
+        return row * dimy_ + (dimy_ - col - 1);
+      }
+      return id;
+    } else {
+      int col = id % dimy_;
+      int tmp = id / dimy_;
+
+      int row1 = id / (dimy_ * dimz_);
+      int col1 = tmp % dimz_;
+
+      if (reverse_)
+        return row1 * dimy_ * dimz_ + (dimy_ - col - 1) * dimz_ + col1;
+
+      return row1 * dimy_ * dimz_ + col * dimz_ + col1;
+    }
+  }
+
+  int dimx_;
+  int dimy_;
+  int dimz_;
+  bool reverse_;
+};
+
+template <typename T, typename Op>
+struct BlockPrefixCallbackOp {
+  // Running prefix
+  T running_total_;
+  Op op_;
+
+  __device__ BlockPrefixCallbackOp(T running_total, Op op)
+      : running_total_(running_total), op_(op) {}
+
+  // Callback operator to be entered by the first warp of threads in the block.
+  // tid 0 is responsible for returning a value for seeding the block-wide scan.
+  __device__ T operator()(T block_aggregate) {
+    T old_prefix = running_total_;
+    running_total_ = op_(old_prefix, block_aggregate);
+    return old_prefix;
+  }
+};
+
+template <typename T>
+struct Sum {
+  __host__ __device__ T operator()(const T& a, const T& b) const {
+    return a + b;
+  }
+};
+
+template <typename T>
+struct Prod {
+  __host__ __device__ T operator()(const T& a, const T& b) const {
+    return a * b;
+  }
+};
+
+template <typename T, typename Op>
+struct IsSum {
+  constexpr static bool value =
+      (std::is_same<Op, Sum<T>>::value ||
+       std::is_same<Op, Eigen::internal::SumReducer<T>>::value);
+};
+
+template <typename T, typename Op>
+struct IsProd {
+  constexpr static bool value =
+      (std::is_same<Op, Prod<T>>::value ||
+       std::is_same<Op, Eigen::internal::ProdReducer<T>>::value);
+};
+
+template <typename T, typename Op>
+struct IdentityValue {
+  static_assert(IsSum<T, Op>::value || IsProd<T, Op>::value,
+                "IdentityValue not yet defined for this type.");
+
+  template <typename U = T, typename OpCopy = Op>
+  __host__ __device__ U operator()(
+      typename std::enable_if<IsSum<U, OpCopy>::value, U>::type t = U(0)) {
+    return t;
+  }
+
+  template <typename U = T, typename OpCopy = Op>
+  __host__ __device__ U operator()(
+      typename std::enable_if<IsProd<U, OpCopy>::value, U>::type t = U(1)) {
+    return t;
+  }
+};
+
+// Each block is mapped to one sequence.  A contiguous range is mapped to the
+// appropriate locations in memory by the permutation iterators.  This is
+// ideal for 1-D and row based scans.  Column scans would be better if they
+// did a block load and then locally transposed.  CUB's device wide scan is not
+// used in the large 1D case, even though it would be more efficient, because
+// it is not deterministic.
+template <typename T, typename Op, int BlockDim = 128, int ItemsPerThread = 4>
+__global__ void scan_kernel(const T* in, T* out, int dimx, int dimy, int dimz,
+                            bool exclusive, bool reverse, Op op) {
+  typedef cub::BlockLoad<T, BlockDim, ItemsPerThread, cub::BLOCK_LOAD_TRANSPOSE>
+      BlockLoad;
+  typedef cub::BlockStore<T, BlockDim, ItemsPerThread,
+                          cub::BLOCK_STORE_TRANSPOSE>
+      BlockStore;
+  typedef cub::BlockScan<T, BlockDim> BlockScan;
+
+  // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+  __shared__ union {
+    typename BlockLoad::TempStorage load;
+    typename BlockScan::TempStorage scan;
+    typename BlockStore::TempStorage store;
+  } temp_storage;
+
+  int problem_length = dimy;
+
+  // Initialize running total
+  BlockPrefixCallbackOp<T, Op> prefix_op(IdentityValue<T, Op>()(), op);
+
+  MapIndexToLocation map_op(dimx, dimy, dimz, reverse);
+  int block_start = problem_length * blockIdx.x;
+  // Have the block iterate over segments of items
+  for (int block_offset = block_start;
+       block_offset < block_start + problem_length;
+       block_offset += BlockDim * ItemsPerThread) {
+    int valid_items = min(BlockDim * ItemsPerThread,
+                          problem_length - (block_offset % problem_length));
+
+    // first construct a counting iterator that has the desired start point
+    typedef cub::TransformInputIterator<int, MapIndexToLocation,
+                                        cub::CountingInputIterator<int>>
+        MapIterType;
+
+    cub::CountingInputIterator<int> counting_iter(block_offset);
+
+    // Next map the iterator to the actual locations in memory
+    MapIterType map_iter(counting_iter, map_op);
+
+    PermutationInputIterator<T, const T*, MapIterType> permutein_iter(in,
+                                                                      map_iter);
+    PermutationOutputIterator<T, T*, MapIterType> permuteout_iter(out,
+                                                                  map_iter);
+
+    // Load a segment of consecutive items that are blocked across threads
+    T thread_data[ItemsPerThread];
+    BlockLoad(temp_storage.load).Load(permutein_iter, thread_data, valid_items);
+    __syncthreads();
+
+    // Collectively compute the block-wide scan
+    if (exclusive) {
+      BlockScan(temp_storage.scan)
+          .ExclusiveScan(thread_data, thread_data, op, prefix_op);
+    } else {
+      BlockScan(temp_storage.scan)
+          .InclusiveScan(thread_data, thread_data, op, prefix_op);
+    }
+    __syncthreads();
+
+    // Store scanned items to output segment
+    BlockStore(temp_storage.store)
+        .Store(permuteout_iter, thread_data, valid_items);
+    __syncthreads();
+  }
+}
+
+template <typename T, typename Op>
+void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
+                typename TTypes<T, 3>::Tensor out, Op op, const bool reverse,
+                const bool exclusive) {
+  const int items_per_thread = 4;
+
+  int dimx = in.dimension(0);
+  int dimy = in.dimension(1);
+  int dimz = in.dimension(2);
+  int num_blocks = dimx * dimz;
+
+  int ideal_block_size = dimy / items_per_thread;
+
+  // There seems to be a bug when the type is not float and block_size 1024.
+  // Launch on the smallest power of 2 block size that we can.
+  if (ideal_block_size >= 1024 && std::is_same<T, float>::value) {
+    const int block_size = 1024;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 512) {
+    const int block_size = 512;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 256) {
+    const int block_size = 256;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 128) {
+    const int block_size = 128;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 64) {
+    const int block_size = 64;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else {
+    const int block_size = 32;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  }
+}
+
+template <typename T>
+struct Scan<GPUDevice, Eigen::internal::SumReducer<T>, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
+                  typename TTypes<T, 3>::Tensor out,
+                  const Eigen::internal::SumReducer<T>& reducer,
+                  const bool reverse, const bool exclusive) {
+    LaunchScan<T, Sum<T>>(d, in, out, Sum<T>(), reverse, exclusive);
+  }
+};
+
+template <typename T>
+struct Scan<GPUDevice, Eigen::internal::ProdReducer<T>, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
+                  typename TTypes<T, 3>::Tensor out,
+                  const Eigen::internal::ProdReducer<T>& reducer,
+                  const bool reverse, const bool exclusive) {
+    LaunchScan<T, Prod<T>>(d, in, out, Prod<T>(), reverse, exclusive);
+  }
+};
+
+}  // namespace functor
+
 #define DEFINE(REDUCER, T) template struct functor::Scan<GPUDevice, REDUCER, T>;
 
 #define DEFINE_FOR_ALL_REDUCERS(T)           \
diff --git a/tensorflow/core/kernels/scan_ops_test.cc b/tensorflow/core/kernels/scan_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..588b606a99b73588112aec1ca66cabf8d82dc38e
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_test.cc
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+template <typename T>
+static Graph* LargeOneDCumsum(int num_x, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DataTypeToEnum<T>::value, TensorShape({num_x}));
+  data.flat<T>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 0;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+static Graph* ColCumsum(int num_x, int num_y, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
+  data.flat<float>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 0;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+static Graph* RowCumsum(int num_x, int num_y, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
+  data.flat<float>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 1;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+static Graph* ThreeDYCumsum(int num_y, int num_z, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DT_FLOAT, TensorShape({32, num_y, num_z}));
+  data.flat<float>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 1;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+template <typename T>
+static void LargeOneDimensional(int iters, const string& device, int num_x,
+                                bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * sizeof(T));
+  test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse)).Run(iters);
+}
+
+static void DoRowCumsum(int iters, const string& device, int num_x, int num_y,
+                        bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
+                          sizeof(float));
+  test::Benchmark(device, RowCumsum(num_x, num_y, reverse)).Run(iters);
+}
+
+static void DoColCumsum(int iters, const string& device, int num_x, int num_y,
+                        bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
+                          sizeof(float));
+  test::Benchmark(device, ColCumsum(num_x, num_y, reverse)).Run(iters);
+}
+
+static void Do3DYCumsum(int iters, const string& device, int num_x, int num_y,
+                        bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
+                          sizeof(float));
+  test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse)).Run(iters);
+}
+
+static void BM_OneDCumsumGPU(int iters, int num_x) {
+  LargeOneDimensional<float>(iters, "gpu", num_x);
+}
+BENCHMARK(BM_OneDCumsumGPU)->Range(1, 1 << 21);
+
+static void BM_OneDCumsumGPUHalf(int iters, int num_x) {
+  LargeOneDimensional<Eigen::half>(iters, "gpu", num_x);
+}
+BENCHMARK(BM_OneDCumsumGPUHalf)->Range(1, 1 << 21);
+
+static void BM_Sum2DRowCumsumGPU(int iters, int num_x, int num_y) {
+  DoRowCumsum(iters, "gpu", num_x, num_y);
+}
+BENCHMARK(BM_Sum2DRowCumsumGPU)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum2DColumnCumsumGPU(int iters, int num_x, int num_y) {
+  DoColCumsum(iters, "gpu", num_x, num_y);
+}
+BENCHMARK(BM_Sum2DColumnCumsumGPU)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum3DYCumsumGPU(int iters, int num_x, int num_y) {
+  Do3DYCumsum(iters, "gpu", num_x, num_y);
+}
+BENCHMARK(BM_Sum3DYCumsumGPU)->RangePair(64, 4096, 64, 4096);
+
+static void BM_OneDCumsumGPU_reverse(int iters, int num_x) {
+  LargeOneDimensional<float>(iters, "gpu", num_x, true);
+}
+BENCHMARK(BM_OneDCumsumGPU_reverse)->Range(1, 1 << 21);
+
+static void BM_Sum2DRowCumsumGPU_reverse(int iters, int num_x, int num_y) {
+  DoRowCumsum(iters, "gpu", num_x, num_y, true);
+}
+BENCHMARK(BM_Sum2DRowCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum2DColumnCumsumGPU_reverse(int iters, int num_x, int num_y) {
+  DoColCumsum(iters, "gpu", num_x, num_y, true);
+}
+BENCHMARK(BM_Sum2DColumnCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum3DYCumsumGPU_reverse(int iters, int num_x, int num_y) {
+  Do3DYCumsum(iters, "gpu", num_x, num_y, true);
+}
+BENCHMARK(BM_Sum3DYCumsumGPU_reverse)->RangePair(32, 2048, 32, 2048);
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 2f8aede427acfdbdc09b97828c80ffeee322ffcd..63bb793fdcb7eb20daeee1708cb4ba78274cb9f7 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/kernels/inplace_ops_functor.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -121,6 +122,90 @@ class ScatterNdOp : public OpKernel {
   }
 };
 
+template <typename Device, typename T, typename Index,
+          scatter_nd_op::UpdateOp op>
+class TensorScatterOp : public OpKernel {
+ public:
+  explicit TensorScatterOp(OpKernelConstruction* c) : OpKernel(c) {
+    const DataType dt = DataTypeToEnum<T>::v();
+    const DataType index_t = DataTypeToEnum<Index>::v();
+    OP_REQUIRES_OK(c, c->MatchSignature({dt, index_t, dt}, {dt}));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& input = c->input(0);
+    const Tensor& indices = c->input(1);
+    const Tensor& updates = c->input(2);
+
+    OP_REQUIRES(c, indices.shape().dims() >= 1,
+                errors::InvalidArgument(
+                    "Indices shape must have rank at least one. Found:",
+                    indices.shape().DebugString()));
+    OP_REQUIRES(c, updates.shape().dims() >= 1,
+                errors::InvalidArgument(
+                    "Updates shape must have rank at least one. Found:",
+                    updates.shape().DebugString()));
+
+    TensorShape shape = input.shape();
+
+    OP_REQUIRES(
+        c,
+        (shape.num_elements() > 0 || (indices.shape().num_elements() == 0 &&
+                                      updates.shape().num_elements() == 0)),
+        errors::InvalidArgument(
+            "Indices and updates specified for empty output shape"));
+
+    const int64 outer_dims = indices.shape().dims() - 1;
+
+    for (int i = 0; i < outer_dims; ++i) {
+      OP_REQUIRES(c, indices.shape().dim_size(i) == updates.shape().dim_size(i),
+                  errors::InvalidArgument(
+                      "Outer dimensions of indices and update must match. "
+                      "Indices shape: ",
+                      indices.shape().DebugString(),
+                      ", updates shape:", updates.shape().DebugString()));
+    }
+
+    const int64 ix = indices.shape().dim_size(outer_dims);
+    OP_REQUIRES(
+        c, updates.shape().dims() - outer_dims == shape.dims() - ix,
+        errors::InvalidArgument("Inner dimensions of output shape must match "
+                                "inner dimensions of updates shape. Output: ",
+                                shape.DebugString(),
+                                " updates: ", updates.shape().DebugString()));
+    for (int i = 0; i + outer_dims < updates.shape().dims(); ++i) {
+      OP_REQUIRES(
+          c, updates.shape().dim_size(i + outer_dims) == shape.dim_size(ix + i),
+          errors::InvalidArgument(
+              "The inner ", shape.dims() - ix,
+              " dimensions of output.shape=", shape.DebugString(),
+              " must match the inner ", updates.shape().dims() - outer_dims,
+              " dimensions of updates.shape=", updates.shape().DebugString()));
+    }
+
+    std::unique_ptr<Tensor> forwarded_input = c->forward_input(
+        2, 0, input.dtype(), shape, DEVICE_MEMORY, AllocatorAttributes());
+
+    if (forwarded_input == nullptr) {
+      // We were not able to forward the input, so we deep copy the tensor and
+      // set the output.
+      Tensor* out;
+      OP_REQUIRES_OK(c, c->allocate_output(0, input.shape(), &out));
+
+      OP_REQUIRES_OK(c, tensorflow::functor::DoCopy(c->eigen_device<Device>(),
+                                                    input, out));
+      OP_REQUIRES_OK(c,
+                     functor::DoScatterNd<Device, T, Index, op>(
+                         c, indices, updates, shape, out, false /*allocate*/));
+    } else {
+      // Output forwarded, so simply perform the scatter.
+      OP_REQUIRES_OK(c, functor::DoScatterNd<Device, T, Index, op>(
+                            c, indices, updates, shape, forwarded_input.get(),
+                            false /*allocate*/));
+    }
+  }
+};
+
 template <typename Device, typename T, typename Index,
           scatter_nd_op::UpdateOp op>
 class ScatterNdUpdateOp : public OpKernel {
@@ -282,6 +367,56 @@ TF_CALL_bool(REGISTER_SCATTER_ND_ADD_SUB_CPU);
 TF_CALL_bool(REGISTER_SCATTER_ND_UPDATE_CPU);
 TF_CALL_bool(REGISTER_SCATTER_ND_CPU);
 
+#define REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, index_type, \
+                                                          dev)              \
+  REGISTER_KERNEL_BUILDER(Name("TensorScatterUpdate")                       \
+                              .Device(DEVICE_##dev)                         \
+                              .TypeConstraint<type>("T")                    \
+                              .TypeConstraint<index_type>("Tindices"),      \
+                          TensorScatterOp<dev##Device, type, index_type,    \
+                                          scatter_nd_op::UpdateOp::ASSIGN>)
+
+#define REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, index_type, dev) \
+  REGISTER_KERNEL_BUILDER(Name("TensorScatterAdd")                            \
+                              .Device(DEVICE_##dev)                           \
+                              .TypeConstraint<type>("T")                      \
+                              .TypeConstraint<index_type>("Tindices"),        \
+                          TensorScatterOp<dev##Device, type, index_type,      \
+                                          scatter_nd_op::UpdateOp::ADD>)
+
+#define REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, index_type, dev) \
+  REGISTER_KERNEL_BUILDER(Name("TensorScatterSub")                            \
+                              .Device(DEVICE_##dev)                           \
+                              .TypeConstraint<type>("T")                      \
+                              .TypeConstraint<index_type>("Tindices"),        \
+                          TensorScatterOp<dev##Device, type, index_type,      \
+                                          scatter_nd_op::UpdateOp::SUB>)
+
+#define REGISTER_SCATTER_ND_TENSOR_UPDATE_CPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, int32, CPU); \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, int64, CPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_ADD_CPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, int32, CPU); \
+  REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, int64, CPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_SUB_CPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, int32, CPU); \
+  REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, int64, CPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_CPU(type)   \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_CPU(type); \
+  REGISTER_SCATTER_ND_TENSOR_ADD_CPU(type);    \
+  REGISTER_SCATTER_ND_TENSOR_SUB_CPU(type);
+
+// Register TensorScatterUpdate/Add/Sub for all number types.
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_TENSOR_CPU);
+// Register only TensorScatterUpdate for string/bool types as well.
+TF_CALL_string(REGISTER_SCATTER_ND_TENSOR_UPDATE_CPU);
+TF_CALL_bool(REGISTER_SCATTER_ND_TENSOR_UPDATE_CPU);
+
+#undef REGISTER_SCATTER_ND_TENSOR_CPU
+
 // Registers GPU kernels.
 #if GOOGLE_CUDA
 
@@ -297,8 +432,7 @@ TF_CALL_bool(REGISTER_SCATTER_ND_CPU);
   REGISTER_SCATTER_ND_GPU(type);
 
 TF_CALL_int32(REGISTER_SCATTER_ND_ALL_GPU);
-// TODO(b/66916790): Support half types in ScatterNd.
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ALL_GPU);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SCATTER_ND_ALL_GPU);
 TF_CALL_complex64(REGISTER_SCATTER_ND_ALL_GPU);
 TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU);
 
@@ -320,6 +454,25 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
 #undef REGISTER_SCATTER_ND_UPDATE_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
+#define REGISTER_SCATTER_ND_TENSOR_UPDATE_GPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, int32, GPU); \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, int64, GPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_ADD_GPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, int32, GPU); \
+  REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, int64, GPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_SUB_GPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, int32, GPU); \
+  REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, int64, GPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_GPU(type)   \
+  REGISTER_SCATTER_ND_TENSOR_ADD_GPU(type);    \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_GPU(type); \
+  REGISTER_SCATTER_ND_TENSOR_SUB_GPU(type);
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_TENSOR_GPU);
+
 #undef REGISTER_SCATTER_ND_ADD
 #undef REGISTER_SCATTER_ND_ADD_SUB
 #undef REGISTER_SCATTER_ND_ADD_SUB_CPU
@@ -329,6 +482,16 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
 #undef REGISTER_SCATTER_ND_UPDATE_GPU
 #undef REGISTER_SCATTER_ND_KERNEL
 #undef REGISTER_SCATTER_ND_KERNEL_INDEX
+#undef REGISTER_SCATTER_ND_TENSOR_TYPE_INDEX_TYPE
+#undef REGISTER_SCATTER_ND_TENSOR_CPU
+#undef REGISTER_SCATTER_ND_TENSOR_GPU
+#undef REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE
+#undef REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE
+#undef REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE
+#undef REGISTER_SCATTER_ND_TENSOR_UPDATE_GPU
+#undef REGISTER_SCATTER_ND_TENSOR_ADD_GPU
+#undef REGISTER_SCATTER_ND_TENSOR_SUB_GPU
+#undef REGISTER_SCATTER_ND_TENSOR_GPU
 
 #endif  // GOOGLE_CUDA
 
@@ -587,7 +750,6 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX(T, int64)
 
 TF_CALL_int32(DECLARE_GPU_SPECS);
-// TODO(b/66916790): Support half types in ScatterNd.
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_complex64(DECLARE_GPU_SPECS);
 TF_CALL_complex128(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 3bd4168dc78314ce583b876502777ea0f50a3632..d0e0b15da78fb7326b261c1f131643400e9c3e1f 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -83,7 +83,11 @@ struct ComputeOptions {
           context, false,
           errors::InvalidArgument("Unsupported loss type: ", loss_type));
     }
-    OP_REQUIRES_OK(context, context->GetAttr("adaptative", &adaptive));
+    auto s = context->GetAttr("adaptative", &adaptive);
+    if (!s.ok()) {
+      s = context->GetAttr("adaptive", &adaptive);
+    }
+    OP_REQUIRES_OK(context, s);
     OP_REQUIRES_OK(
         context, context->GetAttr("num_sparse_features", &num_sparse_features));
     OP_REQUIRES_OK(context, context->GetAttr("num_sparse_features_with_values",
@@ -245,6 +249,8 @@ class SdcaOptimizer : public OpKernel {
 };
 REGISTER_KERNEL_BUILDER(Name("SdcaOptimizer").Device(DEVICE_CPU),
                         SdcaOptimizer);
+REGISTER_KERNEL_BUILDER(Name("SdcaOptimizerV2").Device(DEVICE_CPU),
+                        SdcaOptimizer);
 
 class SdcaShrinkL1 : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/slice_op.cc b/tensorflow/core/kernels/slice_op.cc
index a006c69297c992439f099196a886257532feebe5..0b0ff95093e44d0c5ae4cd95da68af16ed01f715 100644
--- a/tensorflow/core/kernels/slice_op.cc
+++ b/tensorflow/core/kernels/slice_op.cc
@@ -292,25 +292,30 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N);
 TF_CALL_complex64(DECLARE_FOR_N);
 TF_CALL_complex128(DECLARE_FOR_N);
 TF_CALL_bfloat16(DECLARE_FOR_N);
+TF_CALL_bool(DECLARE_FOR_N);
+TF_CALL_int8(DECLARE_FOR_N);
+TF_CALL_int64(DECLARE_FOR_N);
 DECLARE_FOR_N(int32);
 
 #undef DECLARE_FOR_N
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
-#define REGISTER_GPU(type)                                     \
-  REGISTER_KERNEL_BUILDER(Name("Slice")                        \
-                              .Device(DEVICE_GPU)              \
-                              .TypeConstraint<type>("T")       \
-                              .HostMemory("begin")             \
-                              .HostMemory("size")              \
-                              .TypeConstraint<int32>("Index"), \
+#define REGISTER_GPU(type)                               \
+  REGISTER_KERNEL_BUILDER(Name("Slice")                  \
+                              .Device(DEVICE_GPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("begin")       \
+                              .HostMemory("size"),       \
                           SliceOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
 TF_CALL_bfloat16(REGISTER_GPU);
+TF_CALL_bool(REGISTER_GPU);
+TF_CALL_int8(REGISTER_GPU);
+TF_CALL_int64(REGISTER_GPU);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
@@ -318,7 +323,6 @@ TF_CALL_bfloat16(REGISTER_GPU);
 REGISTER_KERNEL_BUILDER(Name("Slice")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int32>("T")
-                            .TypeConstraint<int32>("Index")
                             .HostMemory("input")
                             .HostMemory("begin")
                             .HostMemory("size")
diff --git a/tensorflow/core/kernels/slice_op_gpu.cu.cc b/tensorflow/core/kernels/slice_op_gpu.cu.cc
index 9d51f8978c0a24afb2f98845a4de4e8b51a29aeb..044948f4065f97c12b4639792bf5dde8d55b6e86 100644
--- a/tensorflow/core/kernels/slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/slice_op_gpu.cu.cc
@@ -40,7 +40,10 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
 TF_CALL_bfloat16(DEFINE_GPU_KERNELS);
+TF_CALL_bool(DEFINE_GPU_KERNELS);
+TF_CALL_int8(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(int32);
+DEFINE_GPU_KERNELS(int64);
 
 #undef DEFINE_GPU_KERNELS
 
diff --git a/tensorflow/core/kernels/spacetodepth_op.cc b/tensorflow/core/kernels/spacetodepth_op.cc
index e59adfc6acbeef3e2d309629121d308c6e228703..044b818e3ce80da257ab33077770c48b8be520c2 100644
--- a/tensorflow/core/kernels/spacetodepth_op.cc
+++ b/tensorflow/core/kernels/spacetodepth_op.cc
@@ -193,6 +193,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<qint8>("T"),
     SpaceToDepthOp<GPUDevice, qint8>);
+REGISTER_KERNEL_BUILDER(
+    Name("SpaceToDepth").Device(DEVICE_GPU).TypeConstraint<uint8>("T"),
+    SpaceToDepthOp<GPUDevice, uint8>);
 #endif  // GOOGLE_CUDA
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
index f38459724abcb544252885b89dede635960b24b9..b565927ccb8d588ea52856c0c23e62e5fa3d18ff 100644
--- a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
@@ -240,6 +240,10 @@ template struct functor::SpaceToDepthOpFunctor<GPUDevice, Eigen::half,
 template struct functor::SpaceToDepthOpFunctor<GPUDevice, Eigen::half,
                                                FORMAT_NHWC>;
 
+// Instantiate the GPU implementations for uint8.
+template struct functor::SpaceToDepthOpFunctor<GPUDevice, uint8, FORMAT_NCHW>;
+template struct functor::SpaceToDepthOpFunctor<GPUDevice, uint8, FORMAT_NHWC>;
+
 // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
 template struct functor::SpaceToDepthOpFunctor<GPUDevice, int32, FORMAT_NCHW>;
 
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
index ac48202ada2204ea36478257630f20f7892be50b..a4e89f439ed9f5711253924ad120f7a6751e1728 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
@@ -88,12 +88,12 @@ class SparseDenseBinaryOpShared : public OpKernel {
     const auto rhs_dims = BCast::FromShape(dense_t->shape());
     BCast b(lhs_dims, rhs_dims, false);  // false for keeping the same num dims.
 
-    // True iff (size(lhs) > size(rhs)), or (sizes equal, lhs cwise rhs).
+    // True iff (size(lhs) >= size(rhs)) and all dims in lhs is greater or equal
+    // to dims in rhs (from right to left).
     auto VecGreaterEq = [](ArraySlice<int64> lhs, ArraySlice<int64> rhs) {
-      if (lhs.size() > rhs.size()) return true;
       if (lhs.size() < rhs.size()) return false;
-      for (size_t i = 0; i < lhs.size(); ++i) {
-        if (lhs[i] < rhs[i]) return false;
+      for (size_t i = 0; i < rhs.size(); ++i) {
+        if (lhs[lhs.size() - 1 - i] < rhs[rhs.size() - 1 - i]) return false;
       }
       return true;
     };
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 866c5dcd521b2a33f44e2466262ec72b577ffa23..2ea7a1ed3b9c5c37e0c93edef9431ce0438d380d 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -44,6 +44,10 @@ limitations under the License.
 #include "include/libxsmm_spmdm.h"
 #endif
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 namespace {
 
diff --git a/tensorflow/core/kernels/stack.cc b/tensorflow/core/kernels/stack.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c70a2d62d36b94362c6f10473644f2623b77d2a
--- /dev/null
+++ b/tensorflow/core/kernels/stack.cc
@@ -0,0 +1,339 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/stack.h"
+
+#include <limits.h>
+#include <atomic>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class Stack : public ResourceBase {
+ public:
+  static std::atomic<int64> stack_counter;
+
+  struct TensorAndAllocation {
+    Tensor tensor;
+    AllocatorAttributes alloc_attrs;
+    bool swapped_to_cpu;
+  };
+
+  Stack(const DataType& elem_type, const string& stack_name, int max_size)
+      : elem_type_(elem_type),
+        stack_name_(stack_name),
+        max_size_(max_size),
+        closed_(false) {}
+
+  Status Push(const TensorAndAllocation& value) {
+    mutex_lock l(mu_);
+    TF_RETURN_IF_ERROR(CheckNotClosed());
+    if (max_size_ >= 0 && stack_.size() >= max_size_) {
+      return errors::InvalidArgument("Stack[", stack_name_, "] overflowed ",
+                                     "its max_size (", max_size_, ")");
+    }
+    stack_.push_back(value);
+    return Status::OK();
+  }
+
+  Status Pop(TensorAndAllocation* value) {
+    mutex_lock l(mu_);
+    TF_RETURN_IF_ERROR(CheckNotClosed());
+    if (stack_.empty()) {
+      return errors::InvalidArgument("Stack[", stack_name_,
+                                     "] is empty when calling Pop().");
+    }
+    *value = stack_.back();
+    stack_.pop_back();
+    return Status::OK();
+  }
+
+  // We don't swap the first tensor on the stack and any subsequent tensors
+  // that share the buffer with the first tensor.
+  bool IsUsefulToSwap(const Tensor& tensor) const {
+    mutex_lock l(mu_);
+    if (stack_.empty()) {
+      return false;
+    }
+    const Tensor& first = stack_.front().tensor;
+    return !tensor.SharesBufferWith(first);
+  }
+
+  void Close() {
+    mutex_lock l(mu_);
+    stack_.clear();
+    closed_ = true;
+  }
+
+  DataType ElemType() { return elem_type_; }
+
+  string DebugString() override {
+    mutex_lock l(mu_);
+    return strings::StrCat("Stack[", stack_name_, "]");
+  }
+
+  const string& stack_name() { return stack_name_; }
+
+ private:
+  friend class StackOp;
+  mutex* mu() { return &mu_; }
+
+  mutable mutex mu_;
+  DataType elem_type_;
+  const string stack_name_;
+  Tensor handle_;
+  int max_size_;
+  bool closed_ GUARDED_BY(mu_);
+  std::vector<TensorAndAllocation> stack_ GUARDED_BY(mu_);
+
+  Status CheckNotClosed() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    if (closed_) {
+      return errors::InvalidArgument("Stack[", stack_name_,
+                                     "] has already been closed.");
+    }
+    return Status::OK();
+  }
+};
+
+Status GetStack(OpKernelContext* ctx, Stack** stack) {
+  if (ctx->input_dtype(0) == DT_RESOURCE) {
+    return LookupResource(ctx, HandleFromInput(ctx, 0), stack);
+  } else {
+    Tensor Tstack_handle = ctx->mutable_input(0, false);
+    if (Tstack_handle.NumElements() != 2) {
+      return errors::InvalidArgument(
+          "Stack handle must have two elements, but had shape: ",
+          Tstack_handle.shape().DebugString());
+    }
+    const string& container = Tstack_handle.flat<string>()(0);
+    const string& stack_name = Tstack_handle.flat<string>()(1);
+    string key = strings::StrCat(container, stack_name);
+    ResourceMgr* rm = ctx->resource_manager();
+    if (rm == nullptr) {
+      return errors::Internal("No resource manager.");
+    }
+    auto* step_container = ctx->step_container();
+    if (step_container == nullptr) {
+      return errors::Internal("No step container.");
+    }
+    TF_RETURN_IF_ERROR(rm->Lookup(step_container->name(), key, stack));
+    return Status::OK();
+  }
+}
+
+std::atomic<int64> Stack::stack_counter{0};
+
+// StackOp
+
+StackOp::StackOp(OpKernelConstruction* context) : OpKernel(context) {
+  OP_REQUIRES_OK(context, context->GetAttr("elem_type", &elem_type_));
+  OP_REQUIRES_OK(context, context->GetAttr("stack_name", &stack_name_));
+  if (stack_name_.empty()) stack_name_ = name();
+}
+
+void StackOp::Compute(OpKernelContext* ctx) {
+  int32 size = std::numeric_limits<int32>::max();
+  if (ctx->num_inputs() > 0) {
+    const Tensor* tensor_size;
+    OP_REQUIRES_OK(ctx, ctx->input("max_size", &tensor_size));
+
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(tensor_size->shape()),
+        errors::InvalidArgument("Stack size must be a scalar, but had shape: ",
+                                tensor_size->shape().DebugString()));
+
+    int32 size_value = tensor_size->scalar<int32>()();
+    if (size_value >= 0) {
+      size = size_value;
+    }
+  }
+
+  static const char kContainer[] = "_stacks";
+  auto stack_id = Stack::stack_counter.fetch_add(1);
+  string stack_name = strings::StrCat(stack_name_, "_", stack_id);
+  // Store the handle in a per-step container.
+  ResourceMgr* rm = ctx->resource_manager();
+  OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager."));
+  string key = strings::StrCat(kContainer, stack_name);
+  Stack* stack = new Stack(elem_type_, stack_name, size);
+  auto* step_container = ctx->step_container();
+  OP_REQUIRES(ctx, step_container != nullptr,
+              errors::Internal("No step container."));
+  OP_REQUIRES_OK(ctx, rm->Create(step_container->name(), key, stack));
+  if (IsRefType(ctx->expected_output_dtype(0))) {
+    // Create the stack handle.
+    AllocatorAttributes alloc_attr;
+    alloc_attr.set_on_host(true);
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_STRING,
+                                           tensorflow::TensorShape({2}),
+                                           &stack->handle_, alloc_attr));
+    auto handle = stack->handle_.flat<string>();
+    handle(0) = kContainer;
+    handle(1) = std::move(stack_name);
+    ctx->set_output_ref(0, stack->mu(), &stack->handle_);
+  } else {
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->flat<ResourceHandle>()(0) =
+        MakePerStepResourceHandle<Stack>(ctx, key);
+  }
+}
+
+// StackPushOp
+
+StackPushOp::StackPushOp(OpKernelConstruction* context, bool allow_swapping)
+    : AsyncOpKernel(context) {
+  if (allow_swapping) {
+    OP_REQUIRES_OK(context, context->GetAttr("swap_memory", &swap_memory_));
+  }
+}
+
+void StackPushOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+  // Get the stack from the handle.
+  Stack* stack = nullptr;
+  OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done);
+  core::ScopedUnref unref(stack);
+
+  if (ctx->input_dtype(1) != stack->ElemType()) {
+    ctx->CtxFailure(errors::InvalidArgument("Must have type ",
+                                            stack->ElemType(), " but got ",
+                                            ctx->input_dtype(1)));
+    done();
+    return;
+  }
+
+  // Push the tensor onto the stack. Swap the tensor to CPU if instructed.
+  const Tensor& tensor = ctx->input(1);
+  AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
+  // For now, we use a simple heuristic for swapping: A GPU tensor is moved
+  // to CPU if the tensor has more than kCopyThreshold bytes and the GPU
+  // allocator says more than kOccupancy of the memory is in use.
+  static constexpr int kCopyThreshold = 2048;
+  static constexpr double kOccupancy = 0.7;
+  if (swap_memory_ && !alloc_attrs.on_host() &&
+      tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
+    DeviceContext* device_ctxt = ctx->op_device_context();
+    auto device = static_cast<tensorflow::Device*>(ctx->device());
+    Allocator* allocator = device->GetAllocator(alloc_attrs);
+    AllocatorStats stats;
+    allocator->GetStats(&stats);
+    if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) {
+      // Asynchronously copy the tensor from GPU to CPU memory.
+      // TODO(yuanbyu): Swap the oldest tensor first.
+      AllocatorAttributes host_alloc_attrs;
+      host_alloc_attrs.set_gpu_compatible(true);
+      host_alloc_attrs.set_on_host(true);
+      Allocator* cpu_allocator = device->GetAllocator(host_alloc_attrs);
+      Tensor* cpu_tensor =
+          new Tensor(cpu_allocator, tensor.dtype(), tensor.shape());
+      device_ctxt->CopyDeviceTensorToCPU(
+          &tensor, "StackPush", device, cpu_tensor,
+          [cpu_tensor, stack, ctx, done](const Status& s) {
+            ctx->SetStatus(s);
+            if (s.ok()) {
+              AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
+              ctx->SetStatus(stack->Push({*cpu_tensor, alloc_attrs, true}));
+            }
+            if (ctx->status().ok()) {
+              ctx->set_output(0, *cpu_tensor);
+            }
+            done();
+            delete cpu_tensor;
+          });
+      return;
+    }
+  }
+
+  // Execute synchronously if not swapped.
+  OP_REQUIRES_OK_ASYNC(ctx, stack->Push({tensor, alloc_attrs, false}), done);
+  ctx->set_output(0, tensor);
+  done();
+}
+
+bool StackPushOp::IsExpensive() { return false; }
+
+// StackPopOp
+
+StackPopOp::StackPopOp(OpKernelConstruction* context)
+    : AsyncOpKernel(context) {}
+
+void StackPopOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+  // Get the stack from the handle.
+  Stack* stack = nullptr;
+  OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done);
+  core::ScopedUnref unref(stack);
+
+  // Pop the tensor. Transfer the tensor back to device if it was
+  // swapped out to CPU.
+  Stack::TensorAndAllocation value;
+  OP_REQUIRES_OK_ASYNC(ctx, stack->Pop(&value), done);
+  if (value.swapped_to_cpu) {
+    // Asynchronously copy the tensor back from CPU to GPU memory.
+    DeviceContext* device_ctxt = ctx->op_device_context();
+    Device* device = static_cast<Device*>(ctx->device());
+    Tensor* cpu_tensor = &value.tensor;
+    Allocator* gpu_allocator = device->GetAllocator(value.alloc_attrs);
+    Tensor* device_tensor =
+        new Tensor(gpu_allocator, cpu_tensor->dtype(), cpu_tensor->shape());
+    device_ctxt->CopyCPUTensorToDevice(
+        cpu_tensor, device, device_tensor,
+        [device_tensor, ctx, done](const Status& s) {
+          ctx->SetStatus(s);
+          if (s.ok()) {
+            ctx->set_output(0, *device_tensor);
+          }
+          done();
+          delete device_tensor;
+        });
+  } else {
+    // Execute synchronously if not swapped.
+    ctx->set_output(0, value.tensor);
+    done();
+  }
+}
+
+bool StackPopOp::IsExpensive() { return false; }
+
+// StackCloseOp
+
+StackCloseOp::StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+void StackCloseOp::Compute(OpKernelContext* ctx) {
+  Stack* stack = nullptr;
+  OP_REQUIRES_OK(ctx, GetStack(ctx, &stack));
+  core::ScopedUnref unref(stack);
+  stack->Close();
+}
+
+bool StackCloseOp::IsExpensive() { return false; }
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stack.h b/tensorflow/core/kernels/stack.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1927e1f28fa217822aabedb0211546dd7c72758
--- /dev/null
+++ b/tensorflow/core/kernels/stack.h
@@ -0,0 +1,76 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STACK_H_
+#define TENSORFLOW_CORE_KERNELS_STACK_H_
+
+// See docs in ../ops/data_flow_ops.cc.
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// A per-run local stack. The stack uses a "per-step" resource manager which
+// ensures that correct garbage collection on error or successful completion.
+class StackOp : public OpKernel {
+ public:
+  explicit StackOp(OpKernelConstruction* context);
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  DataType elem_type_;
+  string stack_name_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
+};
+
+class StackPushOp : public AsyncOpKernel {
+ public:
+  StackPushOp(OpKernelConstruction* context, bool allow_swapping);
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+  bool IsExpensive() override;
+
+ private:
+  bool swap_memory_ = false;
+};
+
+// Templated helper to make it easier to register kernels with or without
+// swapping.
+template <bool allow_swapping>
+class TemplatedStackPushOp : public StackPushOp {
+ public:
+  TemplatedStackPushOp(OpKernelConstruction* context)
+      : StackPushOp(context, allow_swapping) {}
+};
+
+class StackPopOp : public AsyncOpKernel {
+ public:
+  explicit StackPopOp(OpKernelConstruction* context);
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+  bool IsExpensive() override;
+};
+
+class StackCloseOp : public OpKernel {
+ public:
+  explicit StackCloseOp(OpKernelConstruction* context);
+  void Compute(OpKernelContext* ctx) override;
+  bool IsExpensive() override;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STACK_H_
diff --git a/tensorflow/core/kernels/stack_ops.cc b/tensorflow/core/kernels/stack_ops.cc
index add4afafc92d4eee56447550390e19b89a95141d..df94a8818e7edd1b7313da4c483725e2119997af 100644
--- a/tensorflow/core/kernels/stack_ops.cc
+++ b/tensorflow/core/kernels/stack_ops.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 // See docs in ../ops/data_flow_ops.cc.
 
+#include "tensorflow/core/kernels/stack.h"
+
 #include <limits.h>
 #include <atomic>
 #include <vector>
@@ -38,191 +40,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-typedef Eigen::ThreadPoolDevice CPUDevice;
-typedef Eigen::GpuDevice GPUDevice;
-#ifdef TENSORFLOW_USE_SYCL
-typedef Eigen::SyclDevice SYCLDevice;
-#endif  // TENSORFLOW_USE_SYCL
-
-class Stack : public ResourceBase {
- public:
-  static std::atomic<int64> stack_counter;
-
-  struct TensorAndAllocation {
-    Tensor tensor;
-    AllocatorAttributes alloc_attrs;
-    bool swapped_to_cpu;
-  };
-
-  Stack(const DataType& elem_type, const string& stack_name, int max_size)
-      : elem_type_(elem_type),
-        stack_name_(stack_name),
-        max_size_(max_size),
-        closed_(false) {}
-
-  Status Push(const TensorAndAllocation& value) {
-    mutex_lock l(mu_);
-    TF_RETURN_IF_ERROR(CheckNotClosed());
-    if (max_size_ >= 0 && stack_.size() >= max_size_) {
-      return errors::InvalidArgument("Stack[", stack_name_, "] overflowed ",
-                                     "its max_size (", max_size_, ")");
-    }
-    stack_.push_back(value);
-    return Status::OK();
-  }
-
-  Status Pop(TensorAndAllocation* value) {
-    mutex_lock l(mu_);
-    TF_RETURN_IF_ERROR(CheckNotClosed());
-    if (stack_.empty()) {
-      return errors::InvalidArgument("Stack[", stack_name_,
-                                     "] is empty when calling Pop().");
-    }
-    *value = stack_.back();
-    stack_.pop_back();
-    return Status::OK();
-  }
-
-  // We don't swap the first tensor on the stack and any subsequent tensors
-  // that share the buffer with the first tensor.
-  bool IsUsefulToSwap(const Tensor& tensor) const {
-    mutex_lock l(mu_);
-    if (stack_.empty()) {
-      return false;
-    }
-    const Tensor& first = stack_.front().tensor;
-    return !tensor.SharesBufferWith(first);
-  }
-
-  void Close() {
-    mutex_lock l(mu_);
-    stack_.clear();
-    closed_ = true;
-  }
-
-  DataType ElemType() { return elem_type_; }
-
-  string DebugString() override {
-    mutex_lock l(mu_);
-    return strings::StrCat("Stack[", stack_name_, "]");
-  }
-
-  const string& stack_name() { return stack_name_; }
-
- private:
-  friend class StackOp;
-  mutex* mu() { return &mu_; }
-
-  mutable mutex mu_;
-  DataType elem_type_;
-  const string stack_name_;
-  Tensor handle_;
-  int max_size_;
-  bool closed_ GUARDED_BY(mu_);
-  std::vector<TensorAndAllocation> stack_ GUARDED_BY(mu_);
-
-  Status CheckNotClosed() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (closed_) {
-      return errors::InvalidArgument("Stack[", stack_name_,
-                                     "] has already been closed.");
-    }
-    return Status::OK();
-  }
-};
-
-Status GetStack(OpKernelContext* ctx, Stack** stack) {
-  if (ctx->input_dtype(0) == DT_RESOURCE) {
-    return LookupResource(ctx, HandleFromInput(ctx, 0), stack);
-  } else {
-    Tensor Tstack_handle = ctx->mutable_input(0, false);
-    if (Tstack_handle.NumElements() != 2) {
-      return errors::InvalidArgument(
-          "Stack handle must have two elements, but had shape: ",
-          Tstack_handle.shape().DebugString());
-    }
-    const string& container = Tstack_handle.flat<string>()(0);
-    const string& stack_name = Tstack_handle.flat<string>()(1);
-    string key = strings::StrCat(container, stack_name);
-    ResourceMgr* rm = ctx->resource_manager();
-    if (rm == nullptr) {
-      return errors::Internal("No resource manager.");
-    }
-    auto* step_container = ctx->step_container();
-    if (step_container == nullptr) {
-      return errors::Internal("No step container.");
-    }
-    TF_RETURN_IF_ERROR(rm->Lookup(step_container->name(), key, stack));
-    return Status::OK();
-  }
-}
-
-std::atomic<int64> Stack::stack_counter{0};
-
-// A per-run local stack. The stack uses a "per-step" resource manager which
-// ensures that correct garbage collection on error or successful completion.
-class StackOp : public OpKernel {
- public:
-  explicit StackOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("elem_type", &elem_type_));
-    OP_REQUIRES_OK(context, context->GetAttr("stack_name", &stack_name_));
-    if (stack_name_.empty()) stack_name_ = name();
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    int32 size = std::numeric_limits<int32>::max();
-    if (ctx->num_inputs() > 0) {
-      const Tensor* tensor_size;
-      OP_REQUIRES_OK(ctx, ctx->input("max_size", &tensor_size));
-
-      OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_size->shape()),
-                  errors::InvalidArgument(
-                      "Stack size must be a scalar, but had shape: ",
-                      tensor_size->shape().DebugString()));
-
-      int32 size_value = tensor_size->scalar<int32>()();
-      if (size_value >= 0) {
-        size = size_value;
-      }
-    }
-
-    static const char kContainer[] = "_stacks";
-    auto stack_id = Stack::stack_counter.fetch_add(1);
-    string stack_name = strings::StrCat(stack_name_, "_", stack_id);
-    // Store the handle in a per-step container.
-    ResourceMgr* rm = ctx->resource_manager();
-    OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager."));
-    string key = strings::StrCat(kContainer, stack_name);
-    Stack* stack = new Stack(elem_type_, stack_name, size);
-    auto* step_container = ctx->step_container();
-    OP_REQUIRES(ctx, step_container != nullptr,
-                errors::Internal("No step container."));
-    OP_REQUIRES_OK(ctx, rm->Create(step_container->name(), key, stack));
-    if (IsRefType(ctx->expected_output_dtype(0))) {
-      // Create the stack handle.
-      AllocatorAttributes alloc_attr;
-      alloc_attr.set_on_host(true);
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensorflow::DT_STRING,
-                                             tensorflow::TensorShape({2}),
-                                             &stack->handle_, alloc_attr));
-      auto handle = stack->handle_.flat<string>();
-      handle(0) = kContainer;
-      handle(1) = std::move(stack_name);
-      ctx->set_output_ref(0, stack->mu(), &stack->handle_);
-    } else {
-      Tensor* handle;
-      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
-      handle->flat<ResourceHandle>()(0) =
-          MakePerStepResourceHandle<Stack>(ctx, key);
-    }
-  }
-
- private:
-  DataType elem_type_;
-  string stack_name_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(StackOp);
-};
-
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_CPU), StackOp);
 REGISTER_KERNEL_BUILDER(Name("Stack").Device(DEVICE_GPU).HostMemory("handle"),
                         StackOp);
@@ -242,102 +59,22 @@ REGISTER_KERNEL_BUILDER(Name("StackV2")
                         StackOp);
 #endif  // TENSORFLOW_USE_SYCL
 
-template <typename Device>
-class StackPushOp : public AsyncOpKernel {
- public:
-  explicit StackPushOp(OpKernelConstruction* context) : AsyncOpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("swap_memory", &swap_memory_));
-  }
-
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    // Get the stack from the handle.
-    Stack* stack = nullptr;
-    OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done);
-    core::ScopedUnref unref(stack);
-
-    if (ctx->input_dtype(1) != stack->ElemType()) {
-      ctx->CtxFailure(errors::InvalidArgument("Must have type ",
-                                              stack->ElemType(), " but got ",
-                                              ctx->input_dtype(1)));
-      done();
-      return;
-    }
-
-    // Push the tensor onto the stack. Swap the tensor to CPU if instructed.
-    const Tensor& tensor = ctx->input(1);
-    AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
-    // For now, we use a simple heuristic for swapping: A GPU tensor is moved
-    // to CPU if the tensor has more than kCopyThreshold bytes and the GPU
-    // allocator says more than kOccupancy of the memory is in use.
-    static constexpr int kCopyThreshold = 2048;
-    static constexpr double kOccupancy = 0.7;
-    if (swap_memory_ && !alloc_attrs.on_host() &&
-        (std::is_same<Device, GPUDevice>::value
-#ifdef TENSORFLOW_USE_SYCL
-         || std::is_same<Device, SYCLDevice>::value
-#endif  // TENSORFLOW_USE_SYCL
-         ) &&
-        tensor.TotalBytes() > kCopyThreshold && stack->IsUsefulToSwap(tensor)) {
-      DeviceContext* device_ctxt = ctx->op_device_context();
-      auto device = static_cast<tensorflow::Device*>(ctx->device());
-      Allocator* allocator = device->GetAllocator(alloc_attrs);
-      AllocatorStats stats;
-      allocator->GetStats(&stats);
-      if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) {
-        // Asynchronously copy the tensor from GPU to CPU memory.
-        // TODO(yuanbyu): Swap the oldest tensor first.
-        AllocatorAttributes host_alloc_attrs;
-        host_alloc_attrs.set_gpu_compatible(true);
-        host_alloc_attrs.set_on_host(true);
-        Allocator* cpu_allocator = device->GetAllocator(host_alloc_attrs);
-        Tensor* cpu_tensor =
-            new Tensor(cpu_allocator, tensor.dtype(), tensor.shape());
-        device_ctxt->CopyDeviceTensorToCPU(
-            &tensor, "StackPush", device, cpu_tensor,
-            [cpu_tensor, stack, ctx, done](const Status& s) {
-              ctx->SetStatus(s);
-              if (s.ok()) {
-                AllocatorAttributes alloc_attrs = ctx->input_alloc_attr(1);
-                ctx->SetStatus(stack->Push({*cpu_tensor, alloc_attrs, true}));
-              }
-              if (ctx->status().ok()) {
-                ctx->set_output(0, *cpu_tensor);
-              }
-              done();
-              delete cpu_tensor;
-            });
-        return;
-      }
-    }
-
-    // Execute synchronously if not swapped.
-    OP_REQUIRES_OK_ASYNC(ctx, stack->Push({tensor, alloc_attrs, false}), done);
-    ctx->set_output(0, tensor);
-    done();
-  }
-
-  bool IsExpensive() override { return false; }
-
- private:
-  bool swap_memory_;
-};
-
 REGISTER_KERNEL_BUILDER(Name("StackPush").Device(DEVICE_CPU),
-                        StackPushOp<CPUDevice>);
+                        TemplatedStackPushOp</*allow_swapping=*/false>);
 REGISTER_KERNEL_BUILDER(Name("StackPushV2").Device(DEVICE_CPU),
-                        StackPushOp<CPUDevice>);
-
-#define REGISTER_GPU_KERNEL(type)                         \
-  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("handle")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<GPUDevice>);        \
-  REGISTER_KERNEL_BUILDER(Name("StackPushV2")             \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("handle")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<GPUDevice>);
+                        TemplatedStackPushOp</*allow_swapping=*/false>);
+
+#define REGISTER_GPU_KERNEL(type)                                         \
+  REGISTER_KERNEL_BUILDER(Name("StackPush")                               \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("handle")                       \
+                              .TypeConstraint<type>("T"),                 \
+                          TemplatedStackPushOp</*allow_swapping=*/true>); \
+  REGISTER_KERNEL_BUILDER(Name("StackPushV2")                             \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("handle")                       \
+                              .TypeConstraint<type>("T"),                 \
+                          TemplatedStackPushOp</*allow_swapping=*/true>);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
@@ -345,21 +82,21 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 // Special GPU kernels for int32 and bool.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
-#define REGISTER_GPU_HOST_KERNEL(type)                    \
-  REGISTER_KERNEL_BUILDER(Name("StackPush")               \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("handle")       \
-                              .HostMemory("elem")         \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<GPUDevice>);        \
-  REGISTER_KERNEL_BUILDER(Name("StackPushV2")             \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("handle")       \
-                              .HostMemory("elem")         \
-                              .HostMemory("output")       \
-                              .TypeConstraint<type>("T"), \
-                          StackPushOp<GPUDevice>);
+#define REGISTER_GPU_HOST_KERNEL(type)                                    \
+  REGISTER_KERNEL_BUILDER(Name("StackPush")                               \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("handle")                       \
+                              .HostMemory("elem")                         \
+                              .HostMemory("output")                       \
+                              .TypeConstraint<type>("T"),                 \
+                          TemplatedStackPushOp</*allow_swapping=*/true>); \
+  REGISTER_KERNEL_BUILDER(Name("StackPushV2")                             \
+                              .Device(DEVICE_GPU)                         \
+                              .HostMemory("handle")                       \
+                              .HostMemory("elem")                         \
+                              .HostMemory("output")                       \
+                              .TypeConstraint<type>("T"),                 \
+                          TemplatedStackPushOp</*allow_swapping=*/true>);
 
 REGISTER_GPU_HOST_KERNEL(int32);
 REGISTER_GPU_HOST_KERNEL(bool);
@@ -372,7 +109,7 @@ REGISTER_GPU_HOST_KERNEL(bool);
                               .Device(DEVICE_SYCL)        \
                               .HostMemory("handle")       \
                               .TypeConstraint<type>("T"), \
-                          StackPushOp<SYCLDevice>);
+                          TemplatedStackPushOp</*allow_swapping=*/true>);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
 
@@ -383,7 +120,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_SYCL_KERNEL);
                               .HostMemory("elem")         \
                               .HostMemory("output")       \
                               .TypeConstraint<type>("T"), \
-                          StackPushOp<SYCLDevice>)
+                          TemplatedStackPushOp</*allow_swapping=*/true>)
 
 REGISTER_SYCL_HOST_KERNEL(int32);
 REGISTER_SYCL_HOST_KERNEL(bool);
@@ -391,48 +128,6 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 #undef REGISTER_SYCL_HOST_KERNEL
 #endif  // TENSORFLOW_USE_SYCL
 
-class StackPopOp : public AsyncOpKernel {
- public:
-  explicit StackPopOp(OpKernelConstruction* context) : AsyncOpKernel(context) {}
-
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    // Get the stack from the handle.
-    Stack* stack = nullptr;
-    OP_REQUIRES_OK_ASYNC(ctx, GetStack(ctx, &stack), done);
-    core::ScopedUnref unref(stack);
-
-    // Pop the tensor. Transfer the tensor back to device if it was
-    // swapped out to CPU.
-    Stack::TensorAndAllocation value;
-    OP_REQUIRES_OK_ASYNC(ctx, stack->Pop(&value), done);
-    if (value.swapped_to_cpu) {
-      // Asynchronously copy the tensor back from CPU to GPU memory.
-      DeviceContext* device_ctxt = ctx->op_device_context();
-      Device* device = static_cast<Device*>(ctx->device());
-      Tensor* cpu_tensor = &value.tensor;
-      Allocator* gpu_allocator = device->GetAllocator(value.alloc_attrs);
-      Tensor* device_tensor =
-          new Tensor(gpu_allocator, cpu_tensor->dtype(), cpu_tensor->shape());
-      device_ctxt->CopyCPUTensorToDevice(
-          cpu_tensor, device, device_tensor,
-          [device_tensor, ctx, done](const Status& s) {
-            ctx->SetStatus(s);
-            if (s.ok()) {
-              ctx->set_output(0, *device_tensor);
-            }
-            done();
-            delete device_tensor;
-          });
-    } else {
-      // Execute synchronously if not swapped.
-      ctx->set_output(0, value.tensor);
-      done();
-    }
-  }
-
-  bool IsExpensive() override { return false; }
-};
-
 REGISTER_KERNEL_BUILDER(Name("StackPop").Device(DEVICE_CPU), StackPopOp);
 REGISTER_KERNEL_BUILDER(Name("StackPopV2").Device(DEVICE_CPU), StackPopOp);
 
@@ -498,20 +193,6 @@ REGISTER_SYCL_HOST_KERNEL(bool);
 #undef REGISTER_SYCL_HOST_KERNEL
 #endif  // TENSORFLOW_USE_SYCL
 
-class StackCloseOp : public OpKernel {
- public:
-  explicit StackCloseOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    Stack* stack = nullptr;
-    OP_REQUIRES_OK(ctx, GetStack(ctx, &stack));
-    core::ScopedUnref unref(stack);
-    stack->Close();
-  }
-
-  bool IsExpensive() override { return false; }
-};
-
 REGISTER_KERNEL_BUILDER(Name("StackClose").Device(DEVICE_CPU), StackCloseOp);
 REGISTER_KERNEL_BUILDER(
     Name("StackClose").Device(DEVICE_GPU).HostMemory("handle"), StackCloseOp);
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 73a02a34cf231799e6a813f042757d70b4e9414a..c91bdc43cf4636481f141df70f30b1f2d74dc1a2 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -151,7 +151,7 @@ class Buffer : public ResourceBase {
   }
 
   // Are there a limit number of elements or a memory limit
-  // configued on this buffer?
+  // configured on this buffer?
   bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
 
   bool IsCapacityFull() const { return buf_.size() >= capacity_; }
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index eab176c7fb78c1a2f0a48b907c3b01bc640758d0..925f5291a68327c9fd939fd06fc025b58ab436ee 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -113,74 +113,109 @@ class StatelessRandomOp : public StatelessRandomOpBase {
   }
 };
 
-#define REGISTER(TYPE)                                                 \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessRandomUniform")                                   \
-          .Device(DEVICE_CPU)                                          \
-          .HostMemory("shape")                                         \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<CPUDevice, random::UniformDistribution<        \
-                                       random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessRandomNormal")                                    \
-          .Device(DEVICE_CPU)                                          \
-          .HostMemory("shape")                                         \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<CPUDevice, random::NormalDistribution<         \
-                                       random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessTruncatedNormal")                                 \
-          .Device(DEVICE_CPU)                                          \
-          .HostMemory("shape")                                         \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<                                               \
-          CPUDevice,                                                   \
-          random::TruncatedNormalDistribution<                         \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+template <typename Device, typename IntType>
+class StatelessRandomUniformIntOp : public StatelessRandomOpBase {
+ public:
+  using StatelessRandomOpBase::StatelessRandomOpBase;
 
-TF_CALL_half(REGISTER);
-TF_CALL_float(REGISTER);
-TF_CALL_double(REGISTER);
+  void Fill(OpKernelContext* context, random::PhiloxRandom random,
+            Tensor* output) override {
+    const Tensor& minval = context->input(2);
+    const Tensor& maxval = context->input(3);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(minval.shape()),
+                errors::InvalidArgument("minval must be 0-D, got shape ",
+                                        minval.shape().DebugString()));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(maxval.shape()),
+                errors::InvalidArgument("maxval must be 0-D, got shape ",
+                                        maxval.shape().DebugString()));
+
+    // Verify that minval < maxval.  Note that we'll never reach this point for
+    // empty output.  Zero impossible things are fine.
+    const auto lo = minval.scalar<IntType>()();
+    const auto hi = maxval.scalar<IntType>()();
+    OP_REQUIRES(
+        context, lo < hi,
+        errors::InvalidArgument("Need minval < maxval, got ", lo, " >= ", hi));
+
+    // Build distribution
+    typedef random::UniformDistribution<random::PhiloxRandom, IntType>
+        Distribution;
+    Distribution dist(lo, hi);
+
+    auto flat = output->flat<IntType>();
+    // Reuse the compute kernels from the stateful random ops
+    functor::FillPhiloxRandom<Device, Distribution>()(
+        context, context->eigen_device<Device>(), random, flat.data(),
+        flat.size(), dist);
+  }
+};
 
-#undef REGISTER
+#define REGISTER(DEVICE, TYPE)                                              \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("StatelessRandomUniform")                                        \
+          .Device(DEVICE_##DEVICE)                                          \
+          .HostMemory("shape")                                              \
+          .HostMemory("seed")                                               \
+          .TypeConstraint<TYPE>("dtype"),                                   \
+      StatelessRandomOp<DEVICE##Device, random::UniformDistribution<        \
+                                            random::PhiloxRandom, TYPE> >); \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("StatelessRandomNormal")                                         \
+          .Device(DEVICE_##DEVICE)                                          \
+          .HostMemory("shape")                                              \
+          .HostMemory("seed")                                               \
+          .TypeConstraint<TYPE>("dtype"),                                   \
+      StatelessRandomOp<DEVICE##Device, random::NormalDistribution<         \
+                                            random::PhiloxRandom, TYPE> >); \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("StatelessTruncatedNormal")                                      \
+          .Device(DEVICE_##DEVICE)                                          \
+          .HostMemory("shape")                                              \
+          .HostMemory("seed")                                               \
+          .TypeConstraint<TYPE>("dtype"),                                   \
+      StatelessRandomOp<                                                    \
+          DEVICE##Device,                                                   \
+          random::TruncatedNormalDistribution<                              \
+              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+
+#define REGISTER_INT(DEVICE, TYPE)                            \
+  REGISTER_KERNEL_BUILDER(Name("StatelessRandomUniformInt")   \
+                              .Device(DEVICE_##DEVICE)        \
+                              .HostMemory("shape")            \
+                              .HostMemory("seed")             \
+                              .HostMemory("minval")           \
+                              .HostMemory("maxval")           \
+                              .TypeConstraint<TYPE>("dtype"), \
+                          StatelessRandomUniformIntOp<DEVICE##Device, TYPE>);
+
+#define REGISTER_CPU(TYPE) REGISTER(CPU, TYPE)
+#define REGISTER_GPU(TYPE) REGISTER(GPU, TYPE)
+#define REGISTER_INT_CPU(TYPE) REGISTER_INT(CPU, TYPE)
+#define REGISTER_INT_GPU(TYPE) REGISTER_INT(GPU, TYPE)
+
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_bfloat16(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
+TF_CALL_int32(REGISTER_INT_CPU);
+TF_CALL_int64(REGISTER_INT_CPU);
 
 #if GOOGLE_CUDA
 
-#define REGISTER(TYPE)                                                 \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessRandomUniform")                                   \
-          .Device(DEVICE_GPU)                                          \
-          .HostMemory("shape")                                         \
-          .HostMemory("seed")                                          \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<GPUDevice, random::UniformDistribution<        \
-                                       random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessRandomNormal")                                    \
-          .Device(DEVICE_GPU)                                          \
-          .HostMemory("shape")                                         \
-          .HostMemory("seed")                                          \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<GPUDevice, random::NormalDistribution<         \
-                                       random::PhiloxRandom, TYPE> >); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("StatelessTruncatedNormal")                                 \
-          .Device(DEVICE_GPU)                                          \
-          .HostMemory("shape")                                         \
-          .HostMemory("seed")                                          \
-          .TypeConstraint<TYPE>("dtype"),                              \
-      StatelessRandomOp<                                               \
-          GPUDevice,                                                   \
-          random::TruncatedNormalDistribution<                         \
-              random::SingleSampleAdapter<random::PhiloxRandom>, TYPE> >);
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+TF_CALL_int32(REGISTER_INT_GPU);
+TF_CALL_int64(REGISTER_INT_GPU);
 
-TF_CALL_half(REGISTER);
-TF_CALL_float(REGISTER);
-TF_CALL_double(REGISTER);
+#endif  // GOOGLE_CUDA
 
 #undef REGISTER
-
-#endif  // GOOGLE_CUDA
+#undef REGISTER_INT
+#undef REGISTER_CPU
+#undef REGISTER_GPU
+#undef REGISTER_INT_CPU
+#undef REGISTER_INT_GPU
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 3e8a4c5b72528e5425a9e26a199c5a1b027f7ebd..70a7ddbd0643e88655e1c0e1ad197316078267de 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -447,6 +447,8 @@ TF_CALL_ALL_TYPES(REGISTER_STRIDED_SLICE);
                           StridedSliceAssignOp<GPUDevice, type>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
+TF_CALL_bool(REGISTER_GPU);
+TF_CALL_int8(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
index 8ca27e3b920e7c0cd36343d0c9db5a6098b6bede..cce1d2fddde7edc0283c524269de9464c2602e25 100644
--- a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
@@ -54,6 +54,8 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
 TF_CALL_int64(DEFINE_GPU_KERNELS);
+TF_CALL_bool(DEFINE_GPU_KERNELS);
+TF_CALL_int8(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(int32);
 
 #undef DEFINE_GPU_KERNELS
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index 099083b2ffa7447d8249839cde7329a4073f1b7a..c4205159c380cb0a78085f87deb760bd4a8c9791 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -179,10 +179,9 @@ class HandleStridedSliceAssignCase<Device, T, 0> {
   }
 };
 
-// NODE(aselle): according to bsteiner, we need this because otherwise
+// NOTE(aselle): according to bsteiner, we need this because otherwise
 // nvcc instantiates templates that are invalid. strided_slice_op_gpu.cu
-// handles instantiates externally. It is important that this is done#
-
+// handles instantiates externally. It is important that this is done
 // before the HandleXXCase's are instantiated to avoid duplicate
 // specialization errors.
 
@@ -285,6 +284,8 @@ TF_CALL_complex128(PREVENT_FOR_N_GPU);
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_GPU);
 TF_CALL_complex64(DECLARE_FOR_N_GPU);
 TF_CALL_complex128(DECLARE_FOR_N_GPU);
+TF_CALL_bool(DECLARE_FOR_N_GPU);
+TF_CALL_int8(DECLARE_FOR_N_GPU);
 DECLARE_FOR_N_GPU(int32);
 DECLARE_FOR_N_GPU(int64);
 #endif  // END GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/string_util.cc b/tensorflow/core/kernels/string_util.cc
index 92c73220d8894234d1c955d32b0aebb9f0444dbd..4d16659e6ef0edccc5761d47dc80cbdc1f3e537f 100644
--- a/tensorflow/core/kernels/string_util.cc
+++ b/tensorflow/core/kernels/string_util.cc
@@ -20,11 +20,16 @@ namespace tensorflow {
 
 // Sets unit value based on str.
 Status ParseUnicodeEncoding(const string& str, UnicodeEncoding* encoding) {
-  if (str == "UTF8") {
+  if (str == "UTF-8") {
     *encoding = UnicodeEncoding::UTF8;
+  } else if (str == "UTF-16-BE") {
+    *encoding = UnicodeEncoding::UTF16BE;
+  } else if (str == "UTF-32-BE") {
+    *encoding = UnicodeEncoding::UTF32BE;
   } else {
-    return errors::InvalidArgument(strings::StrCat(
-        "Invalid encoding \"", str, "\": Should be one of: BYTE"));
+    return errors::InvalidArgument(
+        strings::StrCat("Invalid encoding \"", str,
+                        "\": Should be one of: UTF-8, UTF-16-BE, UTF-32-BE"));
   }
   return Status::OK();
 }
@@ -44,10 +49,10 @@ Status ParseCharUnit(const string& str, CharUnit* unit) {
 
 // Return the number of Unicode characters in a UTF-8 string.
 // Result may be incorrect if the input string is not valid UTF-8.
-int32 UTF8StrLen(const string& string) {
-  const int32 byte_size = string.size();
-  const char* const end = string.data() + byte_size;
-  const char* ptr = string.data();
+int32 UTF8StrLen(const string& str) {
+  const int32 byte_size = str.size();
+  const char* const end = str.data() + byte_size;
+  const char* ptr = str.data();
   int32 skipped_count = 0;
   while (ptr < end) {
     skipped_count += IsTrailByte(*ptr++) ? 1 : 0;
diff --git a/tensorflow/core/kernels/string_util.h b/tensorflow/core/kernels/string_util.h
index d40e93ea334af95ee6e42c73eb0d75accc244179..709403c6262e6311408adc8b95dc6765f70dbb0d 100644
--- a/tensorflow/core/kernels/string_util.h
+++ b/tensorflow/core/kernels/string_util.h
@@ -21,9 +21,7 @@ namespace tensorflow {
 
 // Enumeration for unicode encodings.  Used by ops such as
 // tf.strings.unicode_encode and tf.strings.unicode_decode.
-// TODO(edloper): Add support for:
-// UTF16, UTF32, UTF16BE, UTF32BE, UTF16LE, UTF32LE
-enum class UnicodeEncoding { UTF8 };
+enum class UnicodeEncoding { UTF8, UTF16BE, UTF32BE };
 
 // Enumeration for character units.  Used by string such as
 // tf.strings.length and tf.substr.
@@ -41,7 +39,7 @@ Status ParseCharUnit(const string& str, CharUnit* unit);
 
 // Returns the number of Unicode characters in a UTF-8 string.
 // Result may be incorrect if the input string is not valid UTF-8.
-int32 UTF8StrLen(const string& string);
+int32 UTF8StrLen(const string& str);
 
 // Get the next UTF8 character position starting at the given position and
 // skipping the given number of characters. Position is a byte offset, and
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index a97a71b344d64be09daf919c387d55a5c06db5aa..aa85f546a81d0e6b8cf41fc23532fd4a11fe42ec 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -352,9 +352,9 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
     }
 
     const auto key = strings::StrCat(output_handle(0), output_handle(1));
-    auto creator = [this, key, tensor_array, array_size, marked_size,
-                    element_shape, shape_to_prepend, tensor_array_output_handle,
-                    output_handle](TensorArray** ret) -> Status {
+    auto creator = [key, tensor_array, array_size, marked_size, element_shape,
+                    shape_to_prepend,
+                    tensor_array_output_handle](TensorArray** ret) -> Status {
       *ret = new TensorArray(
           key, tensor_array->ElemType(), *tensor_array_output_handle,
           array_size, element_shape, tensor_array->HasIdenticalElementShapes(),
diff --git a/tensorflow/core/kernels/tensor_forest/BUILD b/tensorflow/core/kernels/tensor_forest/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..df035506f7698d1d213efad6088e9bfb53d97282
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/BUILD
@@ -0,0 +1,53 @@
+# Description:
+#   OpKernels for tensor forest ops.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+
+cc_library(
+    name = "resources",
+    srcs = ["resources.cc"],
+    hdrs = ["resources.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "resource_ops",
+    srcs = ["resource_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensor_forest_ops_op_lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "prediction_ops",
+    srcs = ["prediction_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensor_forest_ops_op_lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_forest_ops",
+    deps = [
+        ":prediction_ops",
+        ":resource_ops",
+    ],
+)
diff --git a/tensorflow/core/kernels/tensor_forest/prediction_ops.cc b/tensorflow/core/kernels/tensor_forest/prediction_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e75421fb95791c9dc8aa3b3baf13cffed50d3da
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/prediction_ops.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/tensor_forest/resources.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+class TensorForestTreePredictOp : public OpKernel {
+ public:
+  explicit TensorForestTreePredictOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("logits_dimension", &logits_dimension_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const Tensor* dense_features_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->input("dense_features", &dense_features_t));
+
+    auto dense_features = dense_features_t->matrix<float>();
+    const int32 batch_size = dense_features_t->dim_size(0);
+
+    Tensor* output_predictions = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, {batch_size, logits_dimension_},
+                                            &output_predictions));
+    auto out = output_predictions->matrix<float>();
+
+    if (decision_tree_resource->get_size() <= 0) {
+      out.setZero();
+      return;
+    }
+    auto* worker_threads = context->device()->tensorflow_cpu_worker_threads();
+    const int32 num_threads = worker_threads->num_threads;
+
+    // TODO(yupbank): This was from contrib version.
+    //  This cost would probably depend on the depth of the tree we have.
+    //  We will need to run it on a number of trees of diff depth
+    //  and see the num of cpu cycles
+    const int64 cost_per_traverse = 500;
+    auto traverse = [this, &out, &dense_features, decision_tree_resource,
+                     batch_size](int64 start, int64 end) {
+      DCHECK_LE(start, end) << "Start exceeding End";
+      DCHECK_LE(end, batch_size) << "End exceeding batch size";
+      for (int example_id = start; example_id < end; ++example_id) {
+        const int32 leaf_id =
+            decision_tree_resource->TraverseTree(example_id, &dense_features);
+        set_output_value(example_id, leaf_id, decision_tree_resource, &out);
+      }
+    };
+    Shard(num_threads, worker_threads->workers, batch_size, cost_per_traverse,
+          traverse);
+  };
+
+  void set_output_value(const int32 example_id, const int32 leaf_id,
+                        const TensorForestTreeResource* decision_tree_resource,
+                        TTypes<float>::Matrix* out) const {
+    for (int j = 0; j < logits_dimension_; ++j) {
+      const float logit = decision_tree_resource->get_prediction(leaf_id, j);
+      (*out)(example_id, j) = logit;
+    }
+  }
+
+ private:
+  int32 logits_dimension_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreePredict").Device(DEVICE_CPU),
+                        TensorForestTreePredictOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_forest/resource_ops.cc b/tensorflow/core/kernels/tensor_forest/resource_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0474d56098f50412345fe017c8bdfb09e908be0b
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/resource_ops.cc
@@ -0,0 +1,136 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/kernels/tensor_forest/resources.h"
+
+namespace tensorflow {
+
+class TensorForestCreateTreeVariableOp : public OpKernel {
+ public:
+  explicit TensorForestCreateTreeVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* tree_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_config", &tree_config_t));
+
+    auto* const result = new TensorForestTreeResource();
+
+    if (!result->InitFromSerialized(tree_config_t->scalar<string>()())) {
+      result->Unref();
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unable to parse tree config."));
+    }
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions.
+    auto status = CreateResource(context, HandleFromInput(context, 0), result);
+    if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES(context, false, status);
+    }
+  }
+};
+
+// Op for serializing a model.
+class TensorForestTreeSerializeOp : public OpKernel {
+ public:
+  explicit TensorForestTreeSerializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+    Tensor* output_config_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, TensorShape(), &output_config_t));
+    output_config_t->scalar<string>()() =
+        decision_tree_resource->decision_tree().SerializeAsString();
+  }
+};
+
+// Op for deserializing a tree variable from a checkpoint.
+class TensorForestTreeDeserializeOp : public OpKernel {
+ public:
+  explicit TensorForestTreeDeserializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const Tensor* tree_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_config", &tree_config_t));
+
+    // Deallocate all the previous objects on the resource.
+    decision_tree_resource->Reset();
+
+    if (!decision_tree_resource->InitFromSerialized(
+            tree_config_t->scalar<string>()())) {
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unable to parse tree config."));
+    }
+  }
+};
+
+// Op for getting tree size.
+class TensorForestTreeSizeOp : public OpKernel {
+ public:
+  explicit TensorForestTreeSizeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+    Tensor* output_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape(), &output_t));
+    output_t->scalar<int32>()() = decision_tree_resource->get_size();
+  }
+};
+
+REGISTER_RESOURCE_HANDLE_KERNEL(TensorForestTreeResource);
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorForestTreeIsInitializedOp").Device(DEVICE_CPU),
+    IsResourceInitialized<TensorForestTreeResource>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorForestCreateTreeVariable").Device(DEVICE_CPU),
+    TensorForestCreateTreeVariableOp);
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreeSerialize").Device(DEVICE_CPU),
+                        TensorForestTreeSerializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreeDeserialize").Device(DEVICE_CPU),
+                        TensorForestTreeDeserializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreeSize").Device(DEVICE_CPU),
+                        TensorForestTreeSizeOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_forest/resources.cc b/tensorflow/core/kernels/tensor_forest/resources.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bcd1a1e904171c6c97a6c1cb5ce0809e393be015
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/resources.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/tensor_forest/resources.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+const boosted_trees::Tree& TensorForestTreeResource::decision_tree() const {
+  return *decision_tree_;
+}
+
+const int32 TensorForestTreeResource::get_size() const {
+  return decision_tree_->nodes_size();
+}
+
+TensorForestTreeResource::TensorForestTreeResource()
+    : decision_tree_(
+          protobuf::Arena::CreateMessage<boosted_trees::Tree>(&arena_)) {}
+
+const float TensorForestTreeResource::get_prediction(
+    const int32 id, const int32 dimension_id) const {
+  return decision_tree_->nodes(id).leaf().vector().value(dimension_id);
+}
+
+const int32 TensorForestTreeResource::TraverseTree(
+    const int32 example_id,
+    const TTypes<float>::ConstMatrix* dense_data) const {
+  using boosted_trees::Node;
+  using boosted_trees::Tree;
+  int32 current_id = 0;
+  while (true) {
+    const Node& current = decision_tree_->nodes(current_id);
+    if (current.has_leaf()) {
+      return current_id;
+    }
+    DCHECK_EQ(current.node_case(), Node::kDenseSplit);
+    const auto& split = current.dense_split();
+
+    if ((*dense_data)(example_id, split.feature_id()) <= split.threshold()) {
+      current_id = split.left_id();
+    } else {
+      current_id = split.right_id();
+    }
+  }
+}
+
+bool TensorForestTreeResource::InitFromSerialized(const string& serialized) {
+  return ParseProtoUnlimited(decision_tree_, serialized);
+}
+
+void TensorForestTreeResource::Reset() {
+  arena_.Reset();
+  DCHECK_EQ(0, arena_.SpaceAllocated());
+  decision_tree_ = protobuf::Arena::CreateMessage<boosted_trees::Tree>(&arena_);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_forest/resources.h b/tensorflow/core/kernels/tensor_forest/resources.h
new file mode 100644
index 0000000000000000000000000000000000000000..da258e5017ca8cc9b996d83bcd767e89d61322d7
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/resources.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_FOREST_RESOURCES_H_
+#define TENSORFLOW_CORE_KERNELS_TENSOR_FOREST_RESOURCES_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// Forward declaration for proto class Tree.
+namespace boosted_trees {
+class Tree;
+}  // namespace boosted_trees
+
+// Keep a tree ensemble in memory for efficient evaluation and mutation.
+class TensorForestTreeResource : public ResourceBase {
+ public:
+  TensorForestTreeResource();
+
+  string DebugString() override {
+    return strings::StrCat("TensorForestTree[size=", get_size(), "]");
+  }
+
+  mutex* get_mutex() { return &mu_; }
+
+  bool InitFromSerialized(const string& serialized);
+
+  // Resets the resource and frees the proto.
+  // Caller needs to hold the mutex lock while calling this.
+  void Reset();
+
+  const int32 get_size() const;
+
+  const boosted_trees::Tree& decision_tree() const;
+
+  const float get_prediction(const int32 id, const int32 dimension_id) const;
+
+  const int32 TraverseTree(const int32 example_id,
+                           const TTypes<float>::ConstMatrix* dense_data) const;
+
+ protected:
+  mutex mu_;
+  protobuf::Arena arena_;
+  boosted_trees::Tree* decision_tree_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_KERNELS_TENSOR_FOREST_RESOURCES_H_
diff --git a/tensorflow/core/kernels/tile_functor.h b/tensorflow/core/kernels/tile_functor.h
index 95986af8b77a05f96804725688890ef619423aa0..9a460d191fc91778b1bcc7a93ee787b1d736abb8 100644
--- a/tensorflow/core/kernels/tile_functor.h
+++ b/tensorflow/core/kernels/tile_functor.h
@@ -36,9 +36,11 @@ void TileUsingEigen(const Device& d, Tensor* out, const Tensor& in,
   auto x = in.tensor<T, NDIM>();
   auto y = out->tensor<T, NDIM>();
 
+  bool use_32bit = y.size() < Eigen::NumTraits<int>::highest();
+
   Eigen::array<Tmultiples, NDIM> b;
   for (int i = 0; i < NDIM; ++i) b[i] = broadcast_array[i];
-  if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
+  if (use_32bit && Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
     // Use 32bit indexing to speed up the computations
     To32Bit(y).device(d) = To32Bit(x).broadcast(b);
   } else {
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index d5d4fa82c793cee5f49b33020d9c10c2090bb984..d714876bdaa964a35c9f011e34b6ec1d7b962ce7 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -577,6 +577,7 @@ TF_CALL_double(REGISTER_GPU);
 TF_CALL_half(REGISTER_GPU);
 TF_CALL_int16(REGISTER_GPU);
 TF_CALL_int32(REGISTER_GPU);
+TF_CALL_int64(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU)
 
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index acf162deec9bdb05183103ce6b47f364106a2036..6504ad1b09c089cafec8c2b0ce0f2971aa506b52 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -283,6 +283,22 @@ struct ApplyMomentum<CPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyKerasMomentum<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
+    accum.device(d) = accum * momentum() - grad * lr();
+    if (use_nesterov) {
+      var.device(d) += (accum * momentum() - grad * lr());
+    } else {
+      var.device(d) += accum;
+    }
+  }
+};
+
 template <typename Device, typename T>
 struct ApplyAdamNonCuda {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -331,6 +347,28 @@ struct ApplyAdamSYCL {
 template <typename T>
 struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};
 
+template <typename T>
+struct ApplyAdamWithAmsgrad<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::Flat vhat,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    const T alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) /
+                    (T(1) - beta1_power());
+
+    m.device(d) += (grad - m) * (T(1) - beta1());
+    v.device(d) += (grad.square() - v) * (T(1) - beta2());
+    vhat.device(d) = vhat.cwiseMax(v);
+    var.device(d) -= (m * alpha) / (vhat.sqrt() + epsilon());
+  }
+};
+
 template <typename Device, typename T>
 struct ApplyAdaMaxNonCuda {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -2525,6 +2563,217 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyKerasMomentumOp : public OpKernel {
+ public:
+  explicit ApplyKerasMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &accum));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& grad = ctx->input(3);
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                accum.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Tensor& momentum = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+                errors::InvalidArgument("momentum is not a scalar: ",
+                                        momentum.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyKerasMomentum<Device, T>()(
+        device, var.flat<T>(), accum.flat<T>(), lr.scalar<T>(), grad.flat<T>(),
+        momentum.scalar<T>(), use_nesterov_);
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+  bool use_nesterov_;
+};
+
+#define REGISTER_KERNELS(D, T)                               \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyKerasMomentum") \
+                              .Device(DEVICE_##D)            \
+                              .HostMemory("var")             \
+                              .HostMemory("accum")           \
+                              .TypeConstraint<T>("T"),       \
+                          ApplyKerasMomentumOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                               \
+  template <>                                                             \
+  void ApplyKerasMomentum<GPUDevice, T>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T>::Flat var,                   \
+      typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstFlat grad,                                 \
+      typename TTypes<T>::ConstScalar momentum, bool use_nesterov);       \
+  extern template struct ApplyKerasMomentum<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+// Note, this op works on cpu only.
+template <typename T, typename Tindex>
+class SparseApplyKerasMomentumOp : public OpKernel {
+ public:
+  explicit SparseApplyKerasMomentumOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+  }
+
+  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
+                            ctx, 0, use_exclusive_lock_, true, &var));
+    Tensor accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
+                            ctx, 1, use_exclusive_lock_, true, &accum));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                accum.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+                errors::InvalidArgument("var must be at least 1 dimensional"));
+
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr.shape().DebugString()));
+    const Tensor& grad = ctx->input(3);
+    const Tensor& indices = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+                errors::InvalidArgument("indices must be one-dimensional"));
+
+    for (int d = 1; d < var.dims(); d++) {
+      OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+                  errors::InvalidArgument(strings::StrCat(
+                      "var and grad must match in dimension ", d)));
+    }
+    const Tindex N = indices.dim_size(0);
+    OP_REQUIRES(
+        ctx, grad.dim_size(0) == N,
+        errors::InvalidArgument(
+            "grad must be the same size as indices in the first dimension."));
+
+    const Tensor& momentum = ctx->input(5);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+                errors::InvalidArgument("momentum is not a scalar: ",
+                                        momentum.shape().DebugString()));
+
+    if (N > 0) {
+      const Tindex first_dim_size = var.dim_size(0);
+      auto indices_vec = indices.vec<Tindex>();
+      auto var_flat = var.flat_outer_dims<T>();
+      auto accum_flat = accum.flat_outer_dims<T>();
+      auto grad_flat = grad.flat_outer_dims<T>();
+      T lr_scalar = lr.scalar<T>()();
+      T momentum_scalar = momentum.scalar<T>()();
+
+      for (Tindex i = 0; i < N; i++) {
+        const Tindex index = internal::SubtleMustCopy(indices_vec(i));
+        OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
+                    errors::InvalidArgument(
+                        strings::StrCat("Index ", index, " at offset ", i,
+                                        " in indices is out of range")));
+        auto a = accum_flat.template chip<0>(index);
+        auto g = grad_flat.template chip<0>(i);
+        auto v = var_flat.template chip<0>(index);
+        a = a * a.constant(momentum_scalar) - g * g.constant(lr_scalar);
+        if (use_nesterov_) {
+          v += a * a.constant(momentum_scalar) - g * g.constant(lr_scalar);
+        } else {
+          v += a;
+        }
+      }
+    }
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+  bool use_nesterov_;
+};
+
+#define REGISTER_KERNELS(T, Tindices)                                \
+  REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyKerasMomentum")   \
+                              .Device(DEVICE_CPU)                    \
+                              .TypeConstraint<T>("T")                \
+                              .TypeConstraint<Tindices>("Tindices"), \
+                          SparseApplyKerasMomentumOp<T, Tindices>);
+#define REGISTER_CPU_KERNELS(T) \
+  REGISTER_KERNELS(T, int32);   \
+  REGISTER_KERNELS(T, int64);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyAdamOp : public OpKernel {
  public:
@@ -2786,6 +3035,147 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyAdamWithAmsgradOp : public OpKernel {
+ public:
+  explicit ApplyAdamWithAmsgradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &m));
+    Tensor v;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 2, use_exclusive_lock_, false, &v));
+    Tensor vhat;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 3, use_exclusive_lock_, false, &vhat));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    OP_REQUIRES(
+        ctx, v.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(2)));
+    OP_REQUIRES(
+        ctx, vhat.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(2)));
+
+    const Tensor& beta1_power = ctx->input(4);
+    const Tensor& beta2_power = ctx->input(5);
+    const Tensor& lr = ctx->input(6);
+    const Tensor& beta1 = ctx->input(7);
+    const Tensor& beta2 = ctx->input(8);
+    const Tensor& epsilon = ctx->input(9);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()),
+                errors::InvalidArgument("beta1_power is not a scalar: ",
+                                        beta1_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()),
+                errors::InvalidArgument("beta2_power is not a scalar: ",
+                                        beta2_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
+                errors::InvalidArgument("beta1 is not a scalar: ",
+                                        beta1.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()),
+                errors::InvalidArgument("beta2 is not a scalar: ",
+                                        beta2.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(10);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()),
+                errors::InvalidArgument("var and v do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        v.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyAdamWithAmsgrad<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), v.flat<T>(), vhat.flat<T>(),
+        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
+        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+        grad.flat<T>());
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                 \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdamWithAmsgrad") \
+                              .HostMemory("var")               \
+                              .HostMemory("m")                 \
+                              .HostMemory("v")                 \
+                              .HostMemory("vhat")              \
+                              .Device(DEVICE_##D)              \
+                              .TypeConstraint<T>("T"),         \
+                          ApplyAdamWithAmsgradOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                   \
+  template <>                                                 \
+  void ApplyAdamWithAmsgrad<GPUDevice, T>::operator()(        \
+      const GPUDevice& d, typename TTypes<T>::Flat var,       \
+      typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
+      typename TTypes<T>::Flat vhat,                          \
+      typename TTypes<T>::ConstScalar beta1_power,            \
+      typename TTypes<T>::ConstScalar beta2_power,            \
+      typename TTypes<T>::ConstScalar lr,                     \
+      typename TTypes<T>::ConstScalar beta1,                  \
+      typename TTypes<T>::ConstScalar beta2,                  \
+      typename TTypes<T>::ConstScalar epsilon,                \
+      typename TTypes<T>::ConstFlat grad);                    \
+  extern template struct ApplyAdamWithAmsgrad<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyAdaMaxOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index e10a4cb125410dee383932f134e0339ba1c19b93..054f07350e60cd8a0c3713efc31d5a606fa6d2bc 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -126,6 +126,15 @@ struct ApplyMomentum {
                   typename TTypes<T>::ConstScalar momentum, bool use_nesterov);
 };
 
+template <typename Device, typename T>
+struct ApplyKerasMomentum {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov);
+};
+
 template <typename Device, typename T>
 struct ApplyAdam {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -139,6 +148,20 @@ struct ApplyAdam {
                   typename TTypes<T>::ConstFlat grad, bool use_nesterov);
 };
 
+template <typename Device, typename T>
+struct ApplyAdamWithAmsgrad {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::Flat vhat,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 template <typename Device, typename T>
 struct ApplyAdaMax {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 4bd32592db16b70b2731a6cf775dbf774263d283..f45b9ffca7c9970ca2aee1416d2c5bf4d90f413a 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -101,6 +101,27 @@ struct ApplyMomentum<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyKerasMomentum<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    accum.device(d) = (accum * momentum.reshape(single).broadcast(bcast) -
+                       grad * lr.reshape(single).broadcast(bcast));
+    if (use_nesterov) {
+      var.device(d) += (accum * momentum.reshape(single).broadcast(bcast) -
+                        grad * lr.reshape(single).broadcast(bcast));
+    } else {
+      var.device(d) += accum;
+    }
+  }
+};
+
 template <typename T>
 struct ApplyAdam<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -144,6 +165,39 @@ struct ApplyAdam<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAdamWithAmsgrad<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::Flat vhat,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    const auto one = static_cast<T>(1.0);
+    m.device(d) =
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
+    v.device(d) =
+        v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
+                (grad.square() - v);
+    vhat.device(d) = vhat.cwiseMax(v);
+
+    var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
+                      (beta1_power.constant(one) - beta1_power))
+                         .reshape(single)
+                         .broadcast(bcast) *
+                     m /
+                     (epsilon.reshape(single).broadcast(bcast) + vhat.sqrt());
+  }
+};
+
 template <typename T>
 struct ApplyAdaMax<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -302,10 +356,18 @@ template struct functor::ApplyMomentum<GPUDevice, Eigen::half>;
 template struct functor::ApplyMomentum<GPUDevice, float>;
 template struct functor::ApplyMomentum<GPUDevice, double>;
 
+template struct functor::ApplyKerasMomentum<GPUDevice, Eigen::half>;
+template struct functor::ApplyKerasMomentum<GPUDevice, float>;
+template struct functor::ApplyKerasMomentum<GPUDevice, double>;
+
 template struct functor::ApplyAdam<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdam<GPUDevice, float>;
 template struct functor::ApplyAdam<GPUDevice, double>;
 
+template struct functor::ApplyAdamWithAmsgrad<GPUDevice, Eigen::half>;
+template struct functor::ApplyAdamWithAmsgrad<GPUDevice, float>;
+template struct functor::ApplyAdamWithAmsgrad<GPUDevice, double>;
+
 template struct functor::ApplyAdaMax<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdaMax<GPUDevice, float>;
 template struct functor::ApplyAdaMax<GPUDevice, double>;
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
index 2dcc4a500e6c64753c6fde4f88582f914a50089e..09804f95dcdd457ee22d2158dffafc2d311c0db5 100644
--- a/tensorflow/core/kernels/training_ops_test.cc
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -194,6 +194,50 @@ static void BM_Adam(int iters, int params) {
 }
 BENCHMARK(BM_Adam)->Arg(128 << 10)->Arg(256 << 10);
 
+static void AdamWithAmsgrad(int32 n, Graph** init_g, Graph** train_g) {
+  TensorShape shape({n});
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto v = Var(g, n);
+    auto zero = Zeros(g, n);
+    test::graph::Assign(g, var, zero);
+    test::graph::Assign(g, m, zero);
+    test::graph::Assign(g, v, zero);
+    *init_g = g;
+  }
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto v = Var(g, n);
+    auto vhat = Var(g, n);
+    auto beta1_power = Scalar(g, 0.9);
+    auto beta2_power = Scalar(g, 0.99);
+    auto lr = Scalar(g, 0.01);
+    auto beta1 = Scalar(g, 0.9);
+    auto beta2 = Scalar(g, 0.99);
+    auto epsilon = Scalar(g, 1e-8);
+    auto grad = Random(g, n);
+    test::graph::Multi(g, "ApplyAdamWithAmsgrad",
+                       {var, m, v, vhat, beta1_power, beta2_power, lr, beta1,
+                        beta2, epsilon, grad});
+    *train_g = g;
+  }
+}
+
+static void BM_AdamWithAmsgrad(int iters, int params) {
+  const int64 tot = static_cast<int64>(iters) * params;
+  testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(float));
+  Graph* init;
+  Graph* train;
+  AdamWithAmsgrad(params, &init, &train);
+  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_AdamWithAmsgrad)->Arg(128 << 10)->Arg(256 << 10);
+
 static void RMSProp(int32 n, Graph** init_g, Graph** train_g) {
   TensorShape shape({n});
   {
diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ee0edb35a72d2e3de747fad32bb69bb2872ac80
--- /dev/null
+++ b/tensorflow/core/kernels/unicode_ops.cc
@@ -0,0 +1,540 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+#include <cstddef>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "unicode/appendable.h"  // TF:icu
+#include "unicode/schriter.h"  // TF:icu
+#include "unicode/uchar.h"  // TF:icu
+#include "unicode/ucnv.h"  // TF:icu
+#include "unicode/ucnv_err.h"  // TF:icu
+#include "unicode/umachine.h"  // TF:icu
+#include "unicode/uniset.h"  // TF:icu
+#include "unicode/unistr.h"  // TF:icu
+#include "unicode/uset.h"  // TF:icu
+#include "unicode/utypes.h"  // TF:icu
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/string_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/bcast.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+
+void Encode(const UnicodeEncoding encoding, const icu::UnicodeString& in,
+            string* out) {
+  if (encoding == UnicodeEncoding::UTF8) {
+    out->clear();
+    in.toUTF8String(*out);
+  } else if (encoding == UnicodeEncoding::UTF16BE) {
+    // TODO(gbillock): consider using the
+    // extract(char *dest, int32_t destCapacity, UConverter *cnv)
+    // for UTF16/32
+    out->clear();  // subtle: must come before reserve()
+    out->reserve(2 * in.length() + 1);
+    const char16_t* buf = in.getBuffer();
+    for (int i = 0; i < in.length(); ++i) {
+      // Emit big-endian encoding for UTF-16 always.
+      out->push_back((buf[i] & 0xFF00) >> 8);
+      out->push_back(buf[i] & 0x00FF);
+    }
+  } else if (encoding == UnicodeEncoding::UTF32BE) {
+    out->clear();  // subtle: must come before reserve()
+    out->reserve(4 * in.countChar32() + 1);
+    icu::StringCharacterIterator it(in);
+    UChar32 ch;
+    while (it.hasNext()) {
+      ch = it.next32PostInc();
+      out->push_back((ch & 0xFF000000) >> 24);
+      out->push_back((ch & 0x00FF0000) >> 16);
+      out->push_back((ch & 0x0000FF00) >> 8);
+      out->push_back((ch & 0x000000FF));
+    }
+  }
+}
+
+// This error callback is only useful for finding illegal encoding errors when
+// we want to be strict -- otherwise illegal encodings are replaced on read
+// with 0xFFFD and signaled to the callback.
+void unicode_error_callback(const void* context, UConverterToUnicodeArgs* args,
+                            const char* codeUnits, int32_t length,
+                            UConverterCallbackReason reason,
+                            UErrorCode* pErrorCode) {
+  // Careful: this depends on setting up the context settings when the
+  // callback is registered.
+  bool* format_error = const_cast<bool*>(static_cast<const bool*>(context));
+
+  if (reason == UCNV_UNASSIGNED || reason == UCNV_ILLEGAL ||
+      reason == UCNV_IRREGULAR) {
+    *format_error = true;
+  }
+
+  // Side note: the default behavior in this case is that without a substitution
+  // made by the callback, the UConverter will signal an error to the iterator
+  // making the string iteration bail out. Instead, forward to the built-in
+  // substitution handler.
+  UCNV_TO_U_CALLBACK_SUBSTITUTE(nullptr, args, codeUnits, length, reason,
+                                pErrorCode);
+}
+
+// Iterates through a source string given the provided input UConverter specific
+// to the encoding for that string. Calls a provided callback for each codepoint
+// consumed. Provides the callback with the codepoint and the number of bytes
+// consumed from the input string to produce it. If there are invalid encoding
+// loci in the source string, they will be provided as a 0xFFFD codepoint to
+// the callback, unless the "fail_on_formatting_error" arg is set, in which
+// case the callback will be passed the signal that there is such an invalid
+// encoding position.
+// callback: function(UChar32 codepoint, int num_bytes_consumed_from_source_str,
+//                    bool fatal_format_error)
+void IterateUnicodeString(const string& str, UConverter* converter,
+                          std::function<void(UChar32, int, bool)> callback) {
+  const char* source = str.data();
+  const char* limit = str.data() + str.length();
+  UErrorCode status = U_ZERO_ERROR;
+
+  UConverterToUCallback oldAction = nullptr;
+  const void* oldContext = nullptr;
+  bool format_error = false;
+
+  // Subtle. You can't make a function pointer from a std::function. :-(
+  // Instead, we pass the boolean pointer as the "context" object.
+  ucnv_setToUCallBack(converter, unicode_error_callback, &format_error,
+                      &oldAction, &oldContext, &status);
+  if (U_FAILURE(status)) {
+    LOG(ERROR) << "Could not set unicode error callback on converter";
+    return;
+  }
+
+  while (source < limit) {
+    const char* source_pre_fetch = source;
+    // Note: ucnv_getNextUChar returns 0xFFFD on an encoding error.
+    UChar32 next_char = ucnv_getNextUChar(converter, &source, limit, &status);
+    if (U_FAILURE(status)) {
+      source = limit;
+    }
+    int bytes_consumed = source - source_pre_fetch;
+    callback(next_char, bytes_consumed, format_error);
+    format_error = false;
+  }
+
+  ucnv_setToUCallBack(converter, oldAction, oldContext, nullptr, nullptr,
+                      &status);
+}
+
+// Lifecycle wrapper for UConverter making it easier to use with thread_local.
+// TODO(gbillock): Consider whether to use the higher-level convert API and
+// create a specialized fast code path for UTF8.
+class WrappedConverter {
+ public:
+  WrappedConverter() {}
+
+  ~WrappedConverter() {
+    if (converter_) {
+      ucnv_close(converter_);
+    }
+  }
+
+  void init(const string& name) {
+    if (converter_ && name == name_) {
+      // Note: this reset is not typically needed, but if not done, then in some
+      // cases the cached converter will maintain state of input endianness
+      // which isn't valid from input to input in every batched case.
+      ucnv_reset(converter_);
+      return;
+    }
+
+    if (converter_) {
+      ucnv_close(converter_);
+      converter_ = nullptr;
+      name_ = "";
+    }
+
+    UErrorCode status = U_ZERO_ERROR;
+    converter_ = ucnv_open(name.c_str(), &status);
+    if (U_FAILURE(status)) {
+      if (converter_) {
+        ucnv_close(converter_);
+        converter_ = nullptr;
+      }
+    } else {
+      name_ = name;
+    }
+  }
+
+  UConverter* converter_ = nullptr;
+  string name_;
+};
+
+struct ErrorOptions {
+  UChar32 subst = 0xFFFD;
+  bool elide_replacement = false;
+  bool replace_control_chars = false;
+  bool error_on_malformatting = false;
+};
+
+Status GetErrorOptions(OpKernelConstruction* ctx, ErrorOptions* out) {
+  *out = ErrorOptions();
+
+  string error_policy;
+  TF_RETURN_IF_ERROR(ctx->GetAttr("errors", &error_policy));
+
+  if (error_policy == "replace") {
+    out->elide_replacement = false;
+  } else if (error_policy == "ignore") {
+    out->elide_replacement = true;
+  } else if (error_policy == "strict") {
+    out->error_on_malformatting = true;
+  } else {
+    return errors::InvalidArgument(
+        "errors policy must be one of 'strict', 'replace', or 'ignore'");
+  }
+
+  int32 replacement_char;
+  TF_RETURN_IF_ERROR(ctx->GetAttr("replacement_char", &replacement_char));
+
+  if (replacement_char >= UCHAR_MIN_VALUE &&
+      replacement_char <= UCHAR_MAX_VALUE) {
+    out->subst = replacement_char;
+  } else {
+    return errors::InvalidArgument(
+        "replacement_char out of unicode codepoint range");
+  }
+
+  if (ctx->HasAttr("replace_control_characters")) {
+    TF_RETURN_IF_ERROR(ctx->GetAttr("replace_control_characters",
+                                    &(out->replace_control_chars)));
+  }
+
+  return Status::OK();
+}
+
+inline bool ShouldHandleFormatError(const ErrorOptions& error_options,
+                                    UChar32 ch, bool format_error) {
+  return ((error_options.replace_control_chars && ch <= 0x1F) || format_error);
+}
+
+}  // namespace
+
+class UnicodeTranscodeOp : public OpKernel {
+ public:
+  explicit UnicodeTranscodeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
+
+    string output_encoding;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_encoding", &output_encoding));
+    OP_REQUIRES_OK(ctx,
+                   ParseUnicodeEncoding(output_encoding, &output_encoding_));
+
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("input_encoding", &input_encoding_));
+    // Make a temporary UConverter to ensure it will create without error
+    // at execution time (and to warm any data caches the converter needs).
+    // This instance is not used.
+    std::unique_ptr<WrappedConverter> input_encoder =
+        absl::make_unique<WrappedConverter>();
+    input_encoder->init(input_encoding_);
+    OP_REQUIRES(ctx, input_encoder->converter_,
+                errors::InvalidArgument(
+                    "Could not create converter for input encoding: " +
+                    input_encoding_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+
+    static thread_local std::unique_ptr<WrappedConverter> input_encoder;
+    if (!input_encoder) {
+      input_encoder.reset(new WrappedConverter());
+    }
+    input_encoder->init(input_encoding_);
+    OP_REQUIRES(ctx, input_encoder->converter_,
+                errors::InvalidArgument(
+                    "Could not create converter for input encoding: " +
+                    input_encoding_));
+
+    // Output may be forwardable from input, in which case work in-place.
+    Tensor* output_tensor;
+    std::unique_ptr<Tensor> maybe_forwarded =
+        ctx->forward_input(0 /*input_index*/, 0 /*output_index*/,
+                           tensorflow::DT_STRING, input_tensor->shape(),
+                           ctx->input_memory_type(0), ctx->input_alloc_attr(0));
+    if (maybe_forwarded) {
+      output_tensor = maybe_forwarded.get();
+      OP_REQUIRES_OK(ctx, ctx->set_output("output", *output_tensor));
+    } else {
+      OP_REQUIRES_OK(ctx, ctx->allocate_output("output", input_tensor->shape(),
+                                               &output_tensor));
+      output_tensor->flat<string>() = input_tensor->flat<string>();
+    }
+
+    auto output_flat = output_tensor->flat<string>();
+    bool found_any_format_error = false;
+    for (size_t i = 0; i < output_flat.size(); ++i) {
+      Transcode(&(output_flat(i)), input_encoder->converter_,
+                &found_any_format_error);
+    }
+    if (error_options_.error_on_malformatting && found_any_format_error) {
+      ctx->CtxFailure(
+          errors::InvalidArgument("Invalid formatting on input string"));
+    }
+  }
+
+ private:
+  // Consume a codepoint from the input string and add it to the buffer.
+  // This function takes care of any replacement configuration on invalid or
+  // out-of-range inputs.
+  void TranslateCodepoints(icu::UnicodeString* s, bool* found_any_format_error,
+                           UChar32 ch, int src_bytes, bool format_error) {
+    if (ShouldHandleFormatError(error_options_, ch, format_error)) {
+      *found_any_format_error = true;
+      if (error_options_.elide_replacement) {
+        return;
+      } else {
+        ch = error_options_.subst;
+      }
+    }
+    s->append(ch);
+  }
+
+  // Transcode the string from input encoding to the output_encoding_. If
+  // non-valid characters are encountered, use the subst_/elide_replacement_
+  // config to handle them.
+  void Transcode(string* s, UConverter* input_encoder,
+                 bool* found_any_format_error) {
+    icu::UnicodeString source;
+    IterateUnicodeString(
+        *s, input_encoder,
+        std::bind(&UnicodeTranscodeOp::TranslateCodepoints, this, &source,
+                  found_any_format_error, std::placeholders::_1,
+                  std::placeholders::_2, std::placeholders::_3));
+
+    Encode(output_encoding_, source, s);
+  }
+
+  string input_encoding_;
+  ErrorOptions error_options_;
+  UnicodeEncoding output_encoding_ = UnicodeEncoding::UTF8;
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnicodeTranscode").Device(DEVICE_CPU),
+                        UnicodeTranscodeOp);
+
+class UnicodeDecodeWithOffsetsOp : public OpKernel {
+ public:
+  explicit UnicodeDecodeWithOffsetsOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("input_encoding", &input_encoding_));
+    // Make a temporary UConverter to ensure it will create without error
+    // at execution time (and to warm any data caches the converter needs).
+    // This instance is not used.
+    std::unique_ptr<WrappedConverter> input_encoder =
+        absl::make_unique<WrappedConverter>();
+    input_encoder->init(input_encoding_);
+    OP_REQUIRES(ctx, input_encoder->converter_,
+                errors::InvalidArgument(
+                    "Could not create converter for input encoding: " +
+                    input_encoding_));
+  }
+
+  void Decode(OpKernelContext* ctx, std::vector<UChar32>* char_values,
+              std::vector<int64>* offset_values, int* string_length,
+              int64* next_row_split, UChar32 char_value, int char_length,
+              bool found_any_format_error) {
+    if (error_options_.error_on_malformatting && found_any_format_error) {
+      ctx->CtxFailure(
+          errors::InvalidArgument("Invalid formatting on input string"));
+    }
+    UChar32 decoded_value = char_value;
+    if (ShouldHandleFormatError(error_options_, char_value,
+                                found_any_format_error)) {
+      if (error_options_.elide_replacement) {
+        return;
+      } else {
+        decoded_value = error_options_.subst;
+      }
+    }
+
+    // Emit the char value.
+    char_values->push_back(decoded_value);
+
+    // Emit the byte offset
+    offset_values->push_back(*string_length);
+    *string_length += char_length;
+    *next_row_split += 1;
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+
+    // Go through all the strings in `input`.
+    const auto& input_vec = input_tensor->flat<string>();
+
+    std::unique_ptr<WrappedConverter> input_encoder =
+        absl::make_unique<WrappedConverter>();
+    input_encoder->init(input_encoding_);
+    OP_REQUIRES(ctx, input_encoder->converter_,
+                errors::InvalidArgument(
+                    "Could not create converter for input encoding: " +
+                    input_encoding_));
+
+    std::vector<UChar32> char_values;
+    std::vector<int64> offset_values;
+
+    Tensor* output_row_splits;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output("row_splits",
+                                             {input_tensor->NumElements() + 1},
+                                             &output_row_splits));
+    auto out_row_splits = output_row_splits->vec<int64>();
+
+    int row_split_index = 0;
+    int64 next_row_split = 0;
+    for (int i = 0; i < input_vec.size(); ++i) {
+      const string& input = input_vec(i);
+      // Convert input strings into unicode values. Output to a list of
+      // char_values, record row splits and char_to_byte_starts, which are all
+      // the fields needed to construct a RaggedTensor.
+      out_row_splits(row_split_index) = next_row_split;
+      row_split_index++;
+      int string_length = 0;
+      IterateUnicodeString(
+          input, input_encoder->converter_,
+          std::bind(&UnicodeDecodeWithOffsetsOp::Decode, this, ctx,
+                    &char_values, &offset_values, &string_length,
+                    &next_row_split, std::placeholders::_1,
+                    std::placeholders::_2, std::placeholders::_3));
+    }
+    out_row_splits(row_split_index) = next_row_split;
+
+    DCHECK(offset_values.size() == char_values.size());
+    Tensor* output_char_values;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output("char_values",
+                                  {static_cast<int64>(char_values.size())},
+                                  &output_char_values));
+    Tensor* output_offset_values;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output("char_to_byte_starts",
+                                  {static_cast<int64>(offset_values.size())},
+                                  &output_offset_values));
+    auto out_char_values = output_char_values->vec<int32>();
+    auto out_offset_values = output_offset_values->vec<int64>();
+
+    // Load output tensors from intermediate value arrays.
+    for (int i = 0; i < char_values.size(); ++i) {
+      out_char_values(i) = static_cast<int32>(char_values[i]);
+      out_offset_values(i) = offset_values[i];
+    }
+  }
+
+ private:
+  string input_encoding_;
+  ErrorOptions error_options_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnicodeDecodeWithOffsets").Device(DEVICE_CPU),
+                        UnicodeDecodeWithOffsetsOp);
+
+class UnicodeEncodeOp : public OpKernel {
+ public:
+  explicit UnicodeEncodeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    string encoding_tmp;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_encoding", &encoding_tmp));
+    OP_REQUIRES_OK(ctx, ParseUnicodeEncoding(encoding_tmp, &encoding_));
+    OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
+  }
+
+  /**
+   * Encodes Unicode codepoints into the desired string representation.
+   *
+   * We lose a dimension while encoding, since a series of integer codepoints is
+   * encoded into a single string.
+   *
+   * This accepts two input tensors: a rank 1 tensor of code point values and
+   * a single rank 1 tensor of splits which determine where each string begins
+   * and ends from the provided code points.
+   */
+  void Compute(OpKernelContext* context) override {
+    // Get inputs
+    const Tensor& input_tensor = context->input(0);
+    const auto input_tensor_flat = input_tensor.flat<int32>();
+    const Tensor& input_splits = context->input(1);
+    const auto input_splits_flat = input_splits.flat<int64>();
+
+    // Since we limit to a 2-D input (flat_values of rank 1 and a single splits
+    // tensor), our output dimension will be 1 with it's size equal to the
+    // number of splits (outer dimension or ragged tensor).
+    TensorShape output_shape({input_splits.dim_size(0) - 1});
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(context, context->allocate_output("output", output_shape,
+                                                     &output_tensor));
+    auto output_tensor_flat = output_tensor->flat<string>();
+
+    // Use a single index over the flattened input values tensor.
+    int idx = 0;
+    // Loop through our split dimension to create a new string at each split.
+    for (int i = 1; i < input_splits_flat.size(); ++i) {
+      icu::UnicodeString unicode_string;
+      icu::UnicodeStringAppendable appendable_unicode_string(unicode_string);
+      for (; idx < input_splits_flat(i); ++idx) {
+        int32 code_point = input_tensor_flat(idx);
+        // Check for invalid code point
+        if (code_point > UCHAR_MAX_VALUE || code_point < UCHAR_MIN_VALUE) {
+          if (error_options_.error_on_malformatting) {
+            context->CtxFailure(errors::InvalidArgument(
+                "Code point value out of valid Unicode range."));
+            return;
+          } else if (!error_options_.elide_replacement) {
+            code_point = error_options_.subst;
+          }
+        }
+        appendable_unicode_string.appendCodePoint(code_point);
+      }
+      // Encode our string and save in the output.
+      string result;
+      Encode(encoding_, unicode_string, &result);
+      output_tensor_flat(i - 1) = result;
+    }
+  }
+
+ private:
+  UnicodeEncoding encoding_;
+  ErrorOptions error_options_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnicodeEncode").Device(DEVICE_CPU),
+                        UnicodeEncodeOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 3559baa18eae1eceeebcf07f06340d9f31515d02..3bdcfc90b878479572ad144bc82e9dc6763a4abf 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -108,7 +108,7 @@ class UniqueOp : public OpKernel {
 
       std::unordered_map<T, TIndex> uniq;
       uniq.reserve(2 * N);
-      for (int64 i = 0, j = 0; i < N; ++i) {
+      for (Eigen::Index i = 0, j = 0; i < N; ++i) {
         auto it = uniq.insert(std::make_pair(Tin(i), j));
         idx_vec(i) = it.first->second;
         if (it.second) {
@@ -131,19 +131,20 @@ class UniqueOp : public OpKernel {
       // General implementation when unique is run over multiple elements.
       auto Tin = input.shaped<T, 3>(new_sizes);
 
-      auto hash_fn = [&Tin](const int64& key) {
+      auto hash_fn = [&Tin](const Eigen::Index& key) {
         size_t h = 0;
-        for (int64 i = 0; i < Tin.dimension(0); i++) {
-          for (int64 j = 0; j < Tin.dimension(2); j++) {
+        for (Eigen::Index i = 0; i < Tin.dimension(0); i++) {
+          for (Eigen::Index j = 0; j < Tin.dimension(2); j++) {
             h = Hash64Combine(h, hash<T>{}(Tin(i, key, j)));
           }
         }
         return h;
       };
 
-      auto equal_to_fn = [&Tin](const int64& lhs, const int64& rhs) {
-        for (int64 i = 0; i < Tin.dimension(0); i++) {
-          for (int64 j = 0; j < Tin.dimension(2); j++) {
+      auto equal_to_fn = [&Tin](const Eigen::Index& lhs,
+                                const Eigen::Index& rhs) {
+        for (Eigen::Index i = 0; i < Tin.dimension(0); i++) {
+          for (Eigen::Index j = 0; j < Tin.dimension(2); j++) {
             if (Tin(i, lhs, j) != Tin(i, rhs, j)) {
               return false;
             }
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 5c917e80c146568942f68b90969d8dba27f0dce8..440854658094c3be0ad113ef01d4814f9f45ca06 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -43,7 +43,9 @@ typedef std::complex<double> complex128;
 
 // see framework/bfloat16.h for description.
 struct bfloat16 {
-  B16_DEVICE_FUNC bfloat16() {}
+  // The default constructor must yield a zero value, not an uninitialized
+  // value; some TF kernels use T() as a zero value.
+  B16_DEVICE_FUNC bfloat16() : value(ZERO_VALUE) {}
 
   B16_DEVICE_FUNC static bfloat16 truncate_to_bfloat16(const float v) {
     bfloat16 output;
@@ -376,6 +378,9 @@ struct bfloat16 {
   static const uint16_t NAN_VALUE = 0x7FC0;
 
  private:
+  // A value that represents "zero".
+  static const uint16_t ZERO_VALUE = 0;
+
   B16_DEVICE_FUNC static bool float_isnan(const float& x) {
 #ifdef __CUDA_ARCH__
     return ::isnan(x);
diff --git a/tensorflow/core/lib/core/bit_cast_test.cc b/tensorflow/core/lib/core/bit_cast_test.cc
deleted file mode 100644
index f68b2c405313f56cd6c5fbedb83aebd136c727fa..0000000000000000000000000000000000000000
--- a/tensorflow/core/lib/core/bit_cast_test.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Unit test for bit_cast template.
-
-#include "tensorflow/core/lib/core/casts.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-
-// Marshall and unmarshall.
-// ISO spec C++ section 3.9 promises this will work.
-
-template <int N>
-struct marshall {
-  char buf[N];
-};
-
-template <class T>
-void TestMarshall(const T values[], int num_values) {
-  for (int i = 0; i < num_values; ++i) {
-    T t0 = values[i];
-    marshall<sizeof(T)> m0 = bit_cast<marshall<sizeof(T)> >(t0);
-    T t1 = bit_cast<T>(m0);
-    marshall<sizeof(T)> m1 = bit_cast<marshall<sizeof(T)> >(t1);
-    ASSERT_EQ(0, memcmp(&t0, &t1, sizeof(T)));
-    ASSERT_EQ(0, memcmp(&m0, &m1, sizeof(T)));
-  }
-}
-
-// Convert back and forth to an integral type.  The C++ standard does
-// not guarantee this will work.
-//
-// There are implicit assumptions about sizeof(float) and
-// sizeof(double). These assumptions are quite extant everywhere.
-
-template <class T, class I>
-void TestIntegral(const T values[], int num_values) {
-  for (int i = 0; i < num_values; ++i) {
-    T t0 = values[i];
-    I i0 = bit_cast<I>(t0);
-    T t1 = bit_cast<T>(i0);
-    I i1 = bit_cast<I>(t1);
-    ASSERT_EQ(0, memcmp(&t0, &t1, sizeof(T)));
-    ASSERT_EQ(i0, i1);
-  }
-}
-
-TEST(BitCast, Bool) {
-  LOG(INFO) << "Test bool";
-  static const bool bool_list[] = {false, true};
-  TestMarshall<bool>(bool_list, TF_ARRAYSIZE(bool_list));
-}
-
-TEST(BitCast, Int32) {
-  static const int32 int_list[] = {0,  1,    100,         2147483647,
-                                   -1, -100, -2147483647, -2147483647 - 1};
-  TestMarshall<int32>(int_list, TF_ARRAYSIZE(int_list));
-}
-
-TEST(BitCast, Int64) {
-  static const int64 int64_list[] = {0, 1, 1LL << 40, -1, -(1LL << 40)};
-  TestMarshall<int64>(int64_list, TF_ARRAYSIZE(int64_list));
-}
-
-TEST(BitCast, Uint64) {
-  static const uint64 uint64_list[] = {0, 1, 1LLU << 40, 1LLU << 63};
-  TestMarshall<uint64>(uint64_list, TF_ARRAYSIZE(uint64_list));
-}
-
-TEST(BitCast, Float) {
-  static const float float_list[] = {0.0,  1.0,   -1.0,  10.0,    -10.0,  1e10,
-                                     1e20, 1e-10, 1e-20, 2.71828, 3.14159};
-  TestMarshall<float>(float_list, TF_ARRAYSIZE(float_list));
-  TestIntegral<float, int32>(float_list, TF_ARRAYSIZE(float_list));
-  TestIntegral<float, uint32>(float_list, TF_ARRAYSIZE(float_list));
-}
-
-TEST(BitCast, Double) {
-  static const double double_list[] = {
-      0.0,
-      1.0,
-      -1.0,
-      10.0,
-      -10.0,
-      1e10,
-      1e100,
-      1e-10,
-      1e-100,
-      2.718281828459045,
-      3.141592653589793238462643383279502884197169399375105820974944};
-  TestMarshall<double>(double_list, TF_ARRAYSIZE(double_list));
-  TestIntegral<double, int64>(double_list, TF_ARRAYSIZE(double_list));
-  TestIntegral<double, uint64>(double_list, TF_ARRAYSIZE(double_list));
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/casts.h b/tensorflow/core/lib/core/casts.h
deleted file mode 100644
index 7546d4edc5a5159b593041b4b95837cdf890acef..0000000000000000000000000000000000000000
--- a/tensorflow/core/lib/core/casts.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Various Google-specific casting templates.
-//
-// This code is compiled directly on many platforms, including client
-// platforms like Windows, Mac, and embedded systems.  Before making
-// any changes here, make sure that you're not breaking any platforms.
-//
-
-#ifndef TENSORFLOW_CORE_LIB_CORE_CASTS_H_
-#define TENSORFLOW_CORE_LIB_CORE_CASTS_H_
-
-#include <string.h>  // for memcpy
-
-namespace tensorflow {
-
-// bit_cast<Dest,Source> is a template function that implements the
-// equivalent of "*reinterpret_cast<Dest*>(&source)".  We need this in
-// very low-level functions like the protobuf library and fast math
-// support.
-//
-//   float f = 3.14159265358979;
-//   int i = bit_cast<int32>(f);
-//   // i = 0x40490fdb
-//
-// The classical address-casting method is:
-//
-//   // WRONG
-//   float f = 3.14159265358979;            // WRONG
-//   int i = * reinterpret_cast<int*>(&f);  // WRONG
-//
-// The address-casting method actually produces undefined behavior
-// according to ISO C++ specification section 3.10 -15 -.  Roughly, this
-// section says: if an object in memory has one type, and a program
-// accesses it with a different type, then the result is undefined
-// behavior for most values of "different type".
-//
-// This is true for any cast syntax, either *(int*)&f or
-// *reinterpret_cast<int*>(&f).  And it is particularly true for
-// conversions between integral lvalues and floating-point lvalues.
-//
-// The purpose of 3.10 -15- is to allow optimizing compilers to assume
-// that expressions with different types refer to different memory.  gcc
-// 4.0.1 has an optimizer that takes advantage of this.  So a
-// non-conforming program quietly produces wildly incorrect output.
-//
-// The problem is not the use of reinterpret_cast.  The problem is type
-// punning: holding an object in memory of one type and reading its bits
-// back using a different type.
-//
-// The C++ standard is more subtle and complex than this, but that
-// is the basic idea.
-//
-// Anyways ...
-//
-// bit_cast<> calls memcpy() which is blessed by the standard,
-// especially by the example in section 3.9 .  Also, of course,
-// bit_cast<> wraps up the nasty logic in one place.
-//
-// Fortunately memcpy() is very fast.  In optimized mode, with a
-// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
-// code with the minimal amount of data movement.  On a 32-bit system,
-// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
-// compiles to two loads and two stores.
-//
-// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
-//
-// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
-// is likely to surprise you.
-//
-// Props to Bill Gibbons for the compile time assertion technique and
-// Art Komninos and Igor Tandetnik for the msvc experiments.
-//
-// -- mec 2005-10-17
-
-template <class Dest, class Source>
-inline Dest bit_cast(const Source& source) {
-  static_assert(sizeof(Dest) == sizeof(Source), "Sizes do not match");
-
-  Dest dest;
-  memcpy(&dest, &source, sizeof(dest));
-  return dest;
-}
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_LIB_CORE_CASTS_H_
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index 9ccd911b0efbcd047fcfb278cd4e91e2dd768488..e929ff45a1fb8656d5762a8793cb17175f04c1f9 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/setround.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
@@ -54,6 +55,9 @@ struct EigenEnvironment {
       port::ScopedFlushDenormal flush;
       // Set the processor rounding mode to ROUND TO NEAREST.
       port::ScopedSetRound round(FE_TONEAREST);
+      if (thread_options_.numa_node != port::kNUMANoAffinity) {
+        port::NUMASetThreadNodeAffinity(thread_options_.numa_node);
+      }
       f();
     });
   }
@@ -83,35 +87,38 @@ struct EigenEnvironment {
 
 struct ThreadPool::Impl : Eigen::ThreadPoolTempl<EigenEnvironment> {
   Impl(Env* env, const ThreadOptions& thread_options, const string& name,
-       int num_threads, bool low_latency_hint)
+       int num_threads, bool low_latency_hint, Eigen::Allocator* allocator)
       : Eigen::ThreadPoolTempl<EigenEnvironment>(
             num_threads, low_latency_hint,
-            EigenEnvironment(env, thread_options, name)) {}
+            EigenEnvironment(env, thread_options, name)),
+        allocator_(allocator) {}
 
   void ParallelFor(int64 total, int64 cost_per_unit,
                    std::function<void(int64, int64)> fn) {
     CHECK_GE(total, 0);
     CHECK_EQ(total, (int64)(Eigen::Index)total);
-    Eigen::ThreadPoolDevice device(this, this->NumThreads());
+    Eigen::ThreadPoolDevice device(this, this->NumThreads(), allocator_);
     device.parallelFor(
         total, Eigen::TensorOpCost(0, 0, cost_per_unit),
         [&fn](Eigen::Index first, Eigen::Index last) { fn(first, last); });
   }
+
+  Eigen::Allocator* allocator_;
 };
 
 ThreadPool::ThreadPool(Env* env, const string& name, int num_threads)
-    : ThreadPool(env, ThreadOptions(), name, num_threads, true) {}
+    : ThreadPool(env, ThreadOptions(), name, num_threads, true, nullptr) {}
 
 ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options,
                        const string& name, int num_threads)
-    : ThreadPool(env, thread_options, name, num_threads, true) {}
+    : ThreadPool(env, thread_options, name, num_threads, true, nullptr) {}
 
 ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options,
                        const string& name, int num_threads,
-                       bool low_latency_hint) {
+                       bool low_latency_hint, Eigen::Allocator* allocator) {
   CHECK_GE(num_threads, 1);
   impl_.reset(new ThreadPool::Impl(env, thread_options, "tf_" + name,
-                                   num_threads, low_latency_hint));
+                                   num_threads, low_latency_hint, allocator));
 }
 
 ThreadPool::~ThreadPool() {}
@@ -192,5 +199,14 @@ int ThreadPool::NumThreads() const { return impl_->NumThreads(); }
 
 int ThreadPool::CurrentThreadId() const { return impl_->CurrentThreadId(); }
 
+void ThreadPool::ScheduleWithHint(std::function<void()> fn, int start,
+                                  int limit) {
+  impl_->ScheduleWithHint(std::move(fn), start, limit);
+}
+
+void ThreadPool::SetStealPartitions(
+    const std::vector<std::pair<unsigned, unsigned>>& partitions) {
+  impl_->SetStealPartitions(partitions);
+}
 }  // namespace thread
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h
index e14ad7ac641ac764ae3326cbedb9998e48e1b070..90c9f294472f1475c99494bc276ce475d5cded81 100644
--- a/tensorflow/core/lib/core/threadpool.h
+++ b/tensorflow/core/lib/core/threadpool.h
@@ -22,6 +22,9 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
+namespace Eigen {
+class Allocator;
+}  // namespace Eigen
 namespace tensorflow {
 namespace thread {
 
@@ -37,7 +40,8 @@ class ThreadPool {
   //
   // REQUIRES: num_threads > 0
   ThreadPool(Env* env, const ThreadOptions& thread_options, const string& name,
-             int num_threads, bool low_latency_hint);
+             int num_threads, bool low_latency_hint,
+             Eigen::Allocator* allocator = nullptr);
 
   // Constructs a pool for low-latency ops that contains "num_threads" threads
   // with specified "name". env->StartThread() is used to create individual
@@ -59,6 +63,10 @@ class ThreadPool {
   // Schedules fn() for execution in the pool of threads.
   void Schedule(std::function<void()> fn);
 
+  void SetStealPartitions(
+      const std::vector<std::pair<unsigned, unsigned>>& partitions);
+
+  void ScheduleWithHint(std::function<void()> fn, int start, int limit);
   // Requires 0 < block_size <= total.
   // Spawns k threads and calls fn(i*block_size, (i+1)*block_size) from the
   // ith thread (i>=0). When (i+1)*block_size > total, fn(i*block_size, total)
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc
index 15266af1dbd877ff2023ec32e19c172dc3d00fa9..62dd31a65f68bee0e217b28ac7fdcdee6cd4844e 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc
+++ b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc
@@ -22,18 +22,19 @@ limitations under the License.
 
 #include <memory>
 
+#include "absl/base/casts.h"
 #include "tensorflow/core/lib/jpeg/jpeg_handle.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
-#include "tensorflow/core/lib/core/casts.h"
 
 namespace tensorflow {
 namespace jpeg {
 namespace {
 
+using absl::bit_cast;
 const char kTestData[] = "tensorflow/core/lib/jpeg/testdata/";
 
 int ComputeSumAbsoluteDifference(const uint8* a, const uint8* b, int width,
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index e226a15ccca5ba2223e9f96b746b38679322e478..e8dbcb97b94475f91345676bade0a9d220560741 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -24,7 +24,7 @@ limitations under the License.
 // NOTE(skal): we don't '#include <setjmp.h>' before png.h as it otherwise
 // provokes a compile error. We instead let png.h include what is needed.
 
-#include "tensorflow/core/lib/core/casts.h"
+#include "absl/base/casts.h"
 #include "tensorflow/core/lib/png/png_io.h"
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
@@ -76,7 +76,8 @@ static void Convert8to16(const uint8* p8, int num_comps, int p8_row_bytes,
 #undef CPTR_INC
 
 void ErrorHandler(png_structp png_ptr, png_const_charp msg) {
-  DecodeContext* const ctx = bit_cast<DecodeContext*>(png_get_io_ptr(png_ptr));
+  DecodeContext* const ctx =
+      absl::bit_cast<DecodeContext*>(png_get_io_ptr(png_ptr));
   ctx->error_condition = true;
   // To prevent log spam, errors are logged as VLOG(1) instead of ERROR.
   VLOG(1) << "PNG error: " << msg;
@@ -88,9 +89,14 @@ void WarningHandler(png_structp png_ptr, png_const_charp msg) {
 }
 
 void StringReader(png_structp png_ptr, png_bytep data, png_size_t length) {
-  DecodeContext* const ctx = bit_cast<DecodeContext*>(png_get_io_ptr(png_ptr));
+  DecodeContext* const ctx =
+      absl::bit_cast<DecodeContext*>(png_get_io_ptr(png_ptr));
   if (static_cast<png_size_t>(ctx->data_left) < length) {
-    memset(data, 0, length);
+    // Don't zero out the data buffer as it has been lazily allocated (copy on
+    // write) and zeroing it out here can produce an OOM. Since the buffer is
+    // only used for reading data from the image, this doesn't result in any
+    // data leak, so it is safe to just leave the buffer be as it is and just
+    // exit with error.
     png_error(png_ptr, "More bytes requested to read than available");
   } else {
     memcpy(data, ctx->data, length);
@@ -100,8 +106,8 @@ void StringReader(png_structp png_ptr, png_bytep data, png_size_t length) {
 }
 
 void StringWriter(png_structp png_ptr, png_bytep data, png_size_t length) {
-  string* const s = bit_cast<string*>(png_get_io_ptr(png_ptr));
-  s->append(bit_cast<const char*>(data), length);
+  string* const s = absl::bit_cast<string*>(png_get_io_ptr(png_ptr));
+  s->append(absl::bit_cast<const char*>(data), length);
 }
 
 void StringWriterFlush(png_structp png_ptr) {}
@@ -215,7 +221,7 @@ bool CommonInitDecode(StringPiece png_string, int desired_channels,
     CommonFreeDecode(context);
     return false;
   }
-  context->data = bit_cast<const uint8*>(png_string.data());
+  context->data = absl::bit_cast<const uint8*>(png_string.data());
   context->data_left = png_string.size();
   png_set_read_fn(context->png_ptr, context, StringReader);
   png_read_info(context->png_ptr, context->info_ptr);
@@ -328,8 +334,8 @@ bool CommonFinishDecode(png_bytep data, int row_bytes, DecodeContext* context) {
 
   // Synthesize 16 bits from 8 if requested.
   if (context->need_to_synthesize_16)
-    Convert8to16(bit_cast<uint8*>(data), context->channels, row_bytes,
-                 context->width, context->height, bit_cast<uint16*>(data),
+    Convert8to16(absl::bit_cast<uint8*>(data), context->channels, row_bytes,
+                 context->width, context->height, absl::bit_cast<uint16*>(data),
                  row_bytes);
   return ok;
 }
diff --git a/tensorflow/core/lib/png/png_io.h b/tensorflow/core/lib/png/png_io.h
index c876c5156aba5d3f1c30af1e8b90bad3b792adb5..d3a44b19eedaef75cc3306f4547a641b4687c207 100644
--- a/tensorflow/core/lib/png/png_io.h
+++ b/tensorflow/core/lib/png/png_io.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/png.h"
 #include "tensorflow/core/platform/types.h"
@@ -68,7 +69,7 @@ bool DecodeHeader(StringPiece png_string, int* width, int* height,
 // DecodeContext context;
 // CHECK(CommonInitDecode(png_string, 3 /*RGB*/, 8 /*uint8*/, &context));
 // char* image_buffer = new char[3*context.width*context.height];
-// CHECK(CommonFinishDecode(bit_cast<png_byte*>(image_buffer),
+// CHECK(CommonFinishDecode(absl::bit_cast<png_byte*>(image_buffer),
 //       3*context.width /*stride*/, &context));
 //
 // desired_channels may be 0 to detected it from the input.
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index 87aa5915ff80704bbf6d1b38e3ec4651f7da0d0a..fff6f1fedc3ee9de587f9881d27a28727240c2f6 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <stdio.h>
 #include <stdlib.h>
 #include <algorithm>
+#include <cinttypes>
 #include <cmath>
 #include <locale>
 #include <unordered_map>
@@ -391,7 +392,7 @@ string FpToString(Fprint fp) {
 bool StringToFp(const string& s, Fprint* fp) {
   char junk;
   uint64_t result;
-  if (sscanf(s.c_str(), "%lx%c", &result, &junk) == 1) {
+  if (sscanf(s.c_str(), "%" SCNx64 "%c", &result, &junk) == 1) {
     *fp = result;
     return true;
   } else {
diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index c536b5688ef06e25c8867cbc58a43be1278e1550..b4f0bfbfb9626e537f34d1f3bbc5edc2f260a3f7 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include <string.h>
 #include <algorithm>
 
-#include "tensorflow/core/lib/core/casts.h"
+#include "absl/base/casts.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/wav/wav_io.h"
@@ -174,7 +174,7 @@ Status EncodeAudioAsS16LEWav(const float* audio, size_t sample_rate,
 
   wav_string->resize(file_size);
   char* data = &wav_string->at(0);
-  WavHeader* header = bit_cast<WavHeader*>(data);
+  WavHeader* header = absl::bit_cast<WavHeader*>(data);
 
   // Fill RIFF chunk.
   auto* riff_chunk = &header->riff_chunk;
diff --git a/tensorflow/core/lib/wav/wav_io.h b/tensorflow/core/lib/wav/wav_io.h
index f004524177eef56a876f7aa7cfe6bf80559f4335..9145e7c9f2275683184152b7e8597eeea0ca72a7 100644
--- a/tensorflow/core/lib/wav/wav_io.h
+++ b/tensorflow/core/lib/wav/wav_io.h
@@ -21,7 +21,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/nccl/BUILD b/tensorflow/core/nccl/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4be33b2a0cf10a2525f9a93b5d4942b381d92629
--- /dev/null
+++ b/tensorflow/core/nccl/BUILD
@@ -0,0 +1,54 @@
+# Description:
+#   Wrap NVIDIA (https://github.com/NVIDIA/nccl) NCCL with tensorflow ops.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_copts")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+
+cc_library(
+    name = "nccl_lib",
+    srcs = if_cuda([
+        "nccl_manager.cc",
+        "nccl_manager.h",
+        "nccl_rewrite.cc",
+    ]),
+    copts = tf_copts(),
+    deps = if_cuda([
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_headers_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor",
+    ]),
+    alwayslink = 1,
+)
+
+tf_cuda_cc_test(
+    name = "nccl_manager_test",
+    size = "medium",
+    srcs = ["nccl_manager_test.cc"],
+    tags = tf_cuda_tests_tags() + [
+        "no_cuda_on_cpu_tap",  # TODO(b/120284216): re-enable multi_gpu
+    ],
+    deps = [
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_cuda([
+        ":nccl_lib",
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core:cuda",
+    ]),
+)
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/core/nccl/nccl_manager.cc
similarity index 90%
rename from tensorflow/contrib/nccl/kernels/nccl_manager.cc
rename to tensorflow/core/nccl/nccl_manager.cc
index 99fecf96517935bf3bde3636df83b4a9a4e1c779..df49bf1b976726b3c1cbc3917c881dbc380f2f9a 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/core/nccl/nccl_manager.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
+#include "tensorflow/core/nccl/nccl_manager.h"
 
 #include <utility>
 
@@ -24,6 +24,22 @@ limitations under the License.
 
 namespace tensorflow {
 
+#define NCCL_RETURN_IF_ERROR(...)                               \
+  do {                                                          \
+    ncclResult_t nccl_status = (__VA_ARGS__);                   \
+    if (nccl_status != ncclSuccess) {                           \
+      return errors::Internal(ncclGetErrorString(nccl_status)); \
+    }                                                           \
+  } while (0)
+
+#define CUDA_RETURN_IF_ERROR(...)                               \
+  do {                                                          \
+    cudaError_t cuda_status = (__VA_ARGS__);                    \
+    if (cuda_status != cudaSuccess) {                           \
+      return errors::Internal(cudaGetErrorString(cuda_status)); \
+    }                                                           \
+  } while (0)
+
 using se::cuda::ScopedActivateExecutorContext;
 
 // Contains data for a single stream used for nccl communication; this includes
@@ -177,8 +193,8 @@ NcclManager* NcclManager::instance() {
   return instance;
 }
 
-NcclManager::Communicator* NcclManager::GetCommunicator(
-    NcclManager::Collective* collective) {
+Status NcclManager::GetCommunicator(NcclManager::Collective* collective,
+                                    NcclManager::Communicator** communicator) {
   // Sort by executor to make ordering of executors deterministic.
   std::sort(collective->participants.begin(), collective->participants.end(),
             [](const std::unique_ptr<Participant>& a,
@@ -217,7 +233,10 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
           break;
         }
       }
-      if (i == num_devices) return comm.get();
+      if (i == num_devices) {
+        *communicator = comm.get();
+        return Status::OK();
+      }
     }
   }
 
@@ -264,37 +283,36 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
   // NCCL2 prevents InitAll for more communicators than devices (but doesn't
   // check that device ids are unique). Work around it by initializing each
   // rank individually.
-  cudaGetDeviceCount(&device_count);
+  CUDA_RETURN_IF_ERROR(cudaGetDeviceCount(&device_count));
 #endif
   std::vector<ncclComm_t> nccl_comms(num_devices);
   if (num_devices <= device_count) {
-    auto result =
-        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data());
-    CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+    NCCL_RETURN_IF_ERROR(
+        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data()));
   } else {
     int savedDevice = 0;
-    CHECK_EQ(cudaGetDevice(&savedDevice), cudaSuccess);
+    CUDA_RETURN_IF_ERROR(cudaGetDevice(&savedDevice));
     ncclUniqueId commId;
-    ncclGetUniqueId(&commId);
+    NCCL_RETURN_IF_ERROR(ncclGetUniqueId(&commId));
 #if NCCL_MAJOR >= 2
-    CHECK_EQ(ncclGroupStart(), ncclSuccess);
+    NCCL_RETURN_IF_ERROR(ncclGroupStart());
 #endif
     for (int rank = 0; rank < num_devices; ++rank) {
-      cudaSetDevice(devices[rank]);
-      auto result =
-          ncclCommInitRank(nccl_comms.data() + rank, num_devices, commId, rank);
-      CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+      CUDA_RETURN_IF_ERROR(cudaSetDevice(devices[rank]));
+      NCCL_RETURN_IF_ERROR(ncclCommInitRank(nccl_comms.data() + rank,
+                                            num_devices, commId, rank));
     }
 #if NCCL_MAJOR >= 2
-    CHECK_EQ(ncclGroupEnd(), ncclSuccess);
+    NCCL_RETURN_IF_ERROR(ncclGroupEnd());
 #endif
-    cudaSetDevice(savedDevice);
+    CUDA_RETURN_IF_ERROR(cudaSetDevice(savedDevice));
   }
   for (int rank = 0; rank < num_devices; ++rank) {
     members[rank].nccl_comm = nccl_comms[rank];
   }
   communicators_.emplace_back(new Communicator(std::move(members)));
-  return communicators_.back().get();
+  *communicator = communicators_.back().get();
+  return Status::OK();
 }
 
 void NcclManager::AddToAllReduce(int num_devices, const string& key,
@@ -400,10 +418,18 @@ void NcclManager::AddParticipant(int num_devices, const string& key,
 void NcclManager::RunCollective(const string& key, Collective* collective) {
   static mutex collective_mu(LINKER_INITIALIZED);
 
-  auto* communicator = GetCommunicator(collective);
-  collective->communicator = communicator;
-  const int size = communicator->num_devices;
+  Communicator* communicator = nullptr;
+  const int size = static_cast<int>(collective->participants.size());
+  Status s = GetCommunicator(collective, &communicator);
+  if (!s.ok()) {
+    for (int i = 0; i < size; ++i) {
+      collective->participants[i]->done_callback(s);
+    }
+    delete collective;
+    return;
+  }
 
+  collective->communicator = communicator;
   for (int rank = 0; rank < size; ++rank) {
     Participant* p = collective->participants[rank].get();
     NcclStream* nccl_stream = communicator->members[rank].nccl_stream;
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/core/nccl/nccl_manager.h
similarity index 90%
rename from tensorflow/contrib/nccl/kernels/nccl_manager.h
rename to tensorflow/core/nccl/nccl_manager.h
index 7d158cc98026678edafa0845df92038b449a9225..5da4fe5554d134f79c279542666c841a4e205485 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/core/nccl/nccl_manager.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_NCCL_KERNELS_NCCL_MANAGER_H_
-#define TENSORFLOW_CONTRIB_NCCL_KERNELS_NCCL_MANAGER_H_
+#ifndef TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
+#define TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
 
 #ifdef GOOGLE_CUDA
 
@@ -103,7 +103,13 @@ class NcclManager {
   struct NcclStream;
   struct Participant;
 
-  Communicator* GetCommunicator(Collective* collective);
+  // Gets the `Communicator` object that will be used to enqueue NCCL kernels
+  // for `collective`, and returns it via `communicator`.
+  //
+  // This may involve creating CUDA streams and NCCL initialization.  If a NCCL
+  // or CUDA error occurs in the process, this returns an INTERNAL error with
+  // the corresponding NCCL/CUDA error string.
+  Status GetCommunicator(Collective* collective, Communicator** communicator);
 
   void AddParticipant(int num_devices, const string& key,
                       std::unique_ptr<Participant> participant,
@@ -135,4 +141,4 @@ class NcclManager {
 
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_NCCL_KERNELS_NCCL_MANAGER_H_
+#endif  // TENSORFLOW_CORE_NCCL_NCCL_MANAGER_H_
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc
similarity index 85%
rename from tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
rename to tensorflow/core/nccl/nccl_manager_test.cc
index 5144f7c38c8650ebfced1dfcc9378263ebaad8c0..f9ed4d0b9a26c390bc5974f206faea16c8b5b974 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/core/nccl/nccl_manager_test.cc
@@ -19,17 +19,17 @@ limitations under the License.
 #include <random>
 #include <vector>
 
-#include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/nccl/nccl_manager.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 
-static std::vector<BaseGPUDevice*> GetGPUDevices() {
-  std::vector<Device*> devices;
+static std::vector<std::unique_ptr<BaseGPUDevice>> GetGPUDevices() {
+  std::vector<std::unique_ptr<Device>> devices;
   SessionOptions session_options;
   session_options.config.mutable_gpu_options()
       ->set_per_process_gpu_memory_fraction(0.1);
@@ -37,12 +37,12 @@ static std::vector<BaseGPUDevice*> GetGPUDevices() {
   Status s = DeviceFactory::GetFactory(DEVICE_GPU)
                  ->AddDevices(session_options, "", &devices);
   TF_CHECK_OK(s);
-  std::vector<BaseGPUDevice*> gpus;
-  for (Device* d : devices) {
-    if (d->device_type() == "GPU") {
-      gpus.push_back(static_cast<BaseGPUDevice*>(d));
-    } else {
-      delete d;
+  std::vector<std::unique_ptr<BaseGPUDevice>> gpus;
+  for (std::unique_ptr<Device>& device : devices) {
+    if (device->device_type() == "GPU") {
+      // If `device_type()` is GPU, this `Device` is guaranteed to be a
+      // `BaseGPUDevice`, which is a subclass of `Device`.
+      gpus.emplace_back(static_cast<BaseGPUDevice*>(device.release()));
     }
   }
   return gpus;
@@ -64,16 +64,15 @@ class NcclManagerTest : public ::testing::Test {
   };
 
   static void SetUpTestCase() {
-    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
-    devices_ = new std::vector<BaseGPUDevice*>(GetGPUDevices());
-    CHECK(!devices_->empty());
+    setenv("NCCL_DEBUG", "WARN", 1 /* replace */);
+    setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */);
+    devices_ = new std::vector<std::unique_ptr<BaseGPUDevice>>(GetGPUDevices());
     LOG(ERROR) << "Running test with " << devices_->size() << " gpus";
   }
 
-  static void TearDownTestCase() {
-    for (auto device : *devices_) delete device;
-    delete devices_;
-  }
+  static int32 NumGPUs() { return static_cast<int32>(devices_->size()); }
+
+  static void TearDownTestCase() { delete devices_; }
 
   TestCase* MakeTestCase(int num_ranks, ncclRedOp_t reduction_op,
                          TensorShape shape, float value_offset) {
@@ -153,7 +152,7 @@ class NcclManagerTest : public ::testing::Test {
       stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
-      test::ExpectTensorNear<Scalar>(test_case->expected, out_cpu, 0.01);
+      test::ExpectClose(test_case->expected, out_cpu);
     }
   }
 
@@ -166,7 +165,7 @@ class NcclManagerTest : public ::testing::Test {
   }
 
   static BaseGPUDevice* GetDevice(size_t rank) {
-    return devices_->at(rank % devices_->size());
+    return devices_->at(rank % devices_->size()).get();
   }
 
  private:
@@ -181,13 +180,14 @@ class NcclManagerTest : public ::testing::Test {
   }
 
  private:
-  static std::vector<BaseGPUDevice*>* devices_;
+  static std::vector<std::unique_ptr<BaseGPUDevice>>* devices_;
   static const DataType data_type_;
   static const Scalar max_;
 };
 
 template <typename Scalar>
-std::vector<BaseGPUDevice*>* NcclManagerTest<Scalar>::devices_ = nullptr;
+std::vector<std::unique_ptr<BaseGPUDevice>>* NcclManagerTest<Scalar>::devices_ =
+    nullptr;
 template <typename Scalar>
 const DataType NcclManagerTest<Scalar>::data_type_ =
     DataTypeToEnum<Scalar>::value;
@@ -195,13 +195,13 @@ template <typename Scalar>
 const Scalar NcclManagerTest<Scalar>::max_ =
     Eigen::NumTraits<Scalar>::highest();
 
-// Instantiate tests for float and half.
-using TypeList = ::testing::Types<float, Eigen::half>;
+// Instantiate tests for float and double.
+using TypeList = ::testing::Types<float, double>;
 TYPED_TEST_CASE(NcclManagerTest, TypeList);
 
 // Test basic sum reduction.
 TYPED_TEST(NcclManagerTest, BasicSumReduction) {
-  const int num_ranks = 3;
+  const int num_ranks = 4;
 
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
@@ -209,6 +209,7 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
         this->MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0.0f));
     for (int rank = 0; rank < num_ranks; ++rank) {
       auto* device = this->GetDevice(rank);
+      VLOG(2) << "rank " << rank << " device " << device->name();
       auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       NcclManager::instance()->AddToAllReduce(
@@ -225,15 +226,13 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
 // Same as the Basic test, but with multiple threads launching parts of many
 // reductions.
 //
-// Testing the multi-rank execution is currently reduced as it can hang when run
-// with num_ranks > devices->size(), for some GPUs (e.g. K20m).
-// To test the higher settings, increase num_ranks,
-// num_collectives_per_iteration and time_limit_micros.
+// To run test longer, increase num_ranks, num_collectives_per_iteration and
+// time_limit_micros.
 TYPED_TEST(NcclManagerTest, MultipleCallers) {
-  const int num_ranks = 1;                      // 2;
-  const int num_collectives_per_iteration = 1;  // 1000;
-  const int num_threads = 3;
-  const int time_limit_micros = 1;  // 60 * 30 * 1000 * 1000;
+  const int num_ranks = 4;
+  const int num_collectives_per_iteration = 10;  // 1000;
+  const int num_threads = num_ranks * 2;
+  const int time_limit_micros = 100;  // 60 * 30 * 1000 * 1000;
 
   int64 start = Env::Default()->NowMicros();
   srand(Env::Default()->NowMicros());
diff --git a/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc b/tensorflow/core/nccl/nccl_rewrite.cc
similarity index 100%
rename from tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
rename to tensorflow/core/nccl/nccl_rewrite.cc
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index f55562ec99d91ef17c4a74d4ecaa7467e6a12e1f..281e2996ed7c2b07881d5ab564fc31463f8f8607 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -2743,6 +2743,9 @@ REGISTER_OP("QuantizeAndDequantizeV2")
     .Attr("range_given: bool = false")
     .Output("output: T")
     .Attr("T: {bfloat16, half, float, double}")
+    .Attr(
+        "round_mode: {'HALF_TO_EVEN', 'HALF_UP'} = "
+        "'HALF_TO_EVEN'")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -2878,14 +2881,9 @@ REGISTER_OP("QuantizedInstanceNorm")
 
 namespace {
 
-Status ScatterNdShape(InferenceContext* c) {
-  ShapeHandle indices_shape;
-  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &indices_shape));
-  ShapeHandle updates_shape;
-  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &updates_shape));
-  ShapeHandle output_shape;
-  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &output_shape));
-
+Status ScatterNdShapeHelper(InferenceContext* c, ShapeHandle indices_shape,
+                            ShapeHandle updates_shape,
+                            ShapeHandle output_shape) {
   if (c->Value(c->NumElements(output_shape)) == 0 &&
       (c->Value(c->NumElements(indices_shape)) > 0 ||
        c->Value(c->NumElements(updates_shape)) > 0)) {
@@ -2940,6 +2938,26 @@ Status ScatterNdShape(InferenceContext* c) {
   return Status::OK();
 }
 
+Status ScatterNdShape(InferenceContext* c) {
+  ShapeHandle indices_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &indices_shape));
+  ShapeHandle updates_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &updates_shape));
+  ShapeHandle output_shape;
+  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &output_shape));
+  return ScatterNdShapeHelper(c, indices_shape, updates_shape, output_shape);
+}
+
+Status ScatterNdTensorShape(InferenceContext* c) {
+  ShapeHandle output_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &output_shape));
+  ShapeHandle indices_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &indices_shape));
+  ShapeHandle updates_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(2), 1, &updates_shape));
+  return ScatterNdShapeHelper(c, indices_shape, updates_shape, output_shape);
+}
+
 }  // namespace
 
 REGISTER_OP("UpperBound")
@@ -2979,6 +2997,33 @@ REGISTER_OP("ScatterNd")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn(ScatterNdShape);
 
+REGISTER_OP("TensorScatterUpdate")
+    .Input("tensor: T")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ScatterNdTensorShape);
+
+REGISTER_OP("TensorScatterAdd")
+    .Input("tensor: T")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ScatterNdTensorShape);
+
+REGISTER_OP("TensorScatterSub")
+    .Input("tensor: T")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ScatterNdTensorShape);
+
 REGISTER_OP("ScatterNdNonAliasingAdd")
     .Input("input: T")
     .Input("indices: Tindices")
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
index b8cf5385548918434fe8fac31c92608e86c89519..1c854f661931a6ef26d69752708d7764107b49c6 100644
--- a/tensorflow/core/ops/boosted_trees_ops.cc
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
 
@@ -400,10 +401,7 @@ REGISTER_OP("BoostedTreesMakeQuantileSummaries")
       for (int i = 0; i < num_features; ++i) {
         ShapeHandle feature_shape;
         DimensionHandle unused_dim;
-        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 2, &feature_shape));
-        TF_RETURN_IF_ERROR(c->Merge(c->Dim(feature_shape, 0),
-                                    c->Dim(example_weights_shape, 0),
-                                    &unused_dim));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &feature_shape));
         // the columns are value, weight, min_rank, max_rank.
         c->set_output(i, c->MakeShape({c->UnknownDim(), 4}));
       }
@@ -431,6 +429,17 @@ REGISTER_OP("BoostedTreesQuantileStreamResourceAddSummaries")
       return Status::OK();
     });
 
+REGISTER_OP("BoostedTreesQuantileStreamResourceDeserialize")
+    .Attr("num_streams: int")
+    .Input("quantile_stream_resource_handle: resource")
+    .Input("bucket_boundaries: num_streams * float")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      return Status::OK();
+    });
+
 REGISTER_OP("BoostedTreesQuantileStreamResourceFlush")
     .Attr("generate_quantiles: bool = False")
     .Input("quantile_stream_resource_handle: resource")
@@ -470,13 +479,13 @@ REGISTER_OP("BoostedTreesBucketize")
       ShapeHandle feature_shape;
       DimensionHandle unused_dim;
       for (int i = 0; i < num_features; i++) {
-        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 2, &feature_shape));
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &feature_shape));
         TF_RETURN_IF_ERROR(c->Merge(c->Dim(feature_shape, 0),
                                     c->Dim(c->input(0), 0), &unused_dim));
       }
       // Bucketized result should have same dimension as input.
       for (int i = 0; i < num_features; i++) {
-        c->set_output(i, c->MakeShape({c->Dim(c->input(i), 0), 1}));
+        c->set_output(i, c->MakeShape({c->Dim(c->input(i), 0)}));
       }
       return Status::OK();
     });
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 780c6f6448ddf24f0ba6a31de9cdc68c8db31789..1492741e8b3ef4aac19effb9656cf07ecffe7ff3 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -9175,6 +9175,51 @@ op {
     }
   }
 }
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "BatchMatrixBandPart"
   input_arg {
@@ -11720,6 +11765,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "BoostedTreesQuantileStreamResourceDeserialize"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucket_boundaries"
+    type: DT_FLOAT
+    number_attr: "num_streams"
+  }
+  attr {
+    name: "num_streams"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "BoostedTreesQuantileStreamResourceFlush"
   input_arg {
@@ -12012,33 +12076,6 @@ op {
     type: "list(float)"
   }
 }
-op {
-  name: "BytesProducedStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -17262,21 +17299,6 @@ op {
     minimum: 1
   }
 }
-op {
-  name: "DatasetToTFRecord"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "compression_type"
-    type: DT_STRING
-  }
-}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -18423,69 +18445,6 @@ op {
     }
   }
 }
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "DenseToSparseSetOperation"
   input_arg {
@@ -20879,6 +20838,10 @@ op {
     name: "element_shape"
     type_attr: "shape_type"
   }
+  input_arg {
+    name: "max_num_elements"
+    type: DT_INT32
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -21085,24 +21048,6 @@ op {
     type: DT_STRING
   }
 }
-op {
-  name: "EnqueueInQueueDataset"
-  input_arg {
-    name: "queue"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
-  }
-  attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "EnsureShape"
   input_arg {
@@ -21558,6 +21503,33 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalBytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalCSVDataset"
   input_arg {
@@ -21623,6 +21595,95 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalDatasetCardinality"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "cardinality"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalDirectedInterleaveDataset"
   input_arg {
@@ -21658,50 +21719,845 @@ op {
   }
 }
 op {
-  name: "ExperimentalFunctionBufferingResource"
+  name: "ExperimentalGroupByReducerDataset"
   input_arg {
-    name: "string_arg"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "init_func_other_arguments"
+    type_list_attr: "Tinit_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "finalize_func_other_arguments"
+    type_list_attr: "Tfinalize_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tinit_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalGroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalGroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalIdentityIndexedDataset"
+  input_arg {
+    name: "size"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIgnoreErrorsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalIndexedDatasetGet"
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "index"
+    type: DT_UINT64
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIndexedDatasetMaterialize"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "materialized"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalIteratorGetDevice"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "device"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalLMDBDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalLatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMatchingFilesDataset"
+  input_arg {
+    name: "patterns"
     type: DT_STRING
   }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalMaterializedIndexDatasetHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalMaxIntraOpParallelismDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "max_intra_op_parallelism"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNonSerializableDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNumaMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNumaMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
   input_arg {
-    name: "target_device"
-    type: DT_STRING
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
   }
   output_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "f"
+    type: "func"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
     name: "f"
     type: "func"
   }
   attr {
-    name: "buffer_size"
-    type: "int"
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
     name: "output_types"
     type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  is_stateful: true
 }
 op {
-  name: "ExperimentalFunctionBufferingResourceGetNext"
+  name: "ExperimentalParseExampleDataset"
   input_arg {
-    name: "function_buffer_resource"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
   }
   output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
   }
   attr {
     name: "output_types"
@@ -21709,34 +22565,100 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
 }
 op {
-  name: "ExperimentalFunctionBufferingResourceReset"
+  name: "ExperimentalParseExampleDataset"
   input_arg {
-    name: "function_buffer_resource"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
   }
-  is_stateful: true
-}
-op {
-  name: "ExperimentalIdentityIndexedDataset"
   input_arg {
-    name: "size"
-    type: DT_UINT64
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
-  is_stateful: true
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ExperimentalIgnoreErrorsDataset"
+  name: "ExperimentalPrivateThreadPoolDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "num_threads"
+    type: DT_INT64
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -21755,18 +22677,18 @@ op {
   }
 }
 op {
-  name: "ExperimentalIndexedDatasetGet"
+  name: "ExperimentalRandomDataset"
   input_arg {
-    name: "materialized"
-    type: DT_RESOURCE
+    name: "seed"
+    type: DT_INT64
   }
   input_arg {
-    name: "index"
-    type: DT_UINT64
+    name: "seed2"
+    type: DT_INT64
   }
   output_arg {
-    name: "components"
-    type_list_attr: "output_types"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
     name: "output_types"
@@ -21783,33 +22705,120 @@ op {
   is_stateful: true
 }
 op {
-  name: "ExperimentalIndexedDatasetMaterialize"
+  name: "ExperimentalScanDataset"
   input_arg {
-    name: "dataset"
+    name: "input_dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "materialized"
-    type: DT_RESOURCE
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  is_stateful: true
 }
 op {
-  name: "ExperimentalIteratorGetDevice"
+  name: "ExperimentalScanDataset"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "device"
-    type: DT_STRING
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "ExperimentalLMDBDataset"
+  name: "ExperimentalSetStatsAggregatorDataset"
   input_arg {
-    name: "filenames"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
     type: DT_STRING
   }
   output_arg {
@@ -21831,18 +22840,84 @@ op {
   is_stateful: true
 }
 op {
-  name: "ExperimentalMaterializedIndexDatasetHandle"
+  name: "ExperimentalSleepDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "sleep_microseconds"
+    type: DT_INT64
+  }
   output_arg {
     name: "handle"
-    type: DT_RESOURCE
+    type: DT_VARIANT
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSlidingWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSqlDataset"
+  input_arg {
+    name: "driver_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data_source_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "query"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
     name: "output_types"
@@ -21858,6 +22933,40 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalStatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -21923,6 +23032,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalUnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalUniqueDataset"
   input_arg {
@@ -23361,6 +24493,38 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "FixedLengthRecordDatasetV2"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "FixedLengthRecordReader"
   output_arg {
@@ -26115,207 +27279,6 @@ op {
     }
   }
 }
-op {
-  name: "GroupByReducerDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "init_func_other_arguments"
-    type_list_attr: "Tinit_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "finalize_func_other_arguments"
-    type_list_attr: "Tfinalize_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "init_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "finalize_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tinit_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tfinalize_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "GuaranteeConst"
   input_arg {
@@ -28954,30 +29917,141 @@ op {
   }
 }
 op {
-  name: "LatencyStatsDataset"
+  name: "LeakyRelu"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
+}
+op {
+  name: "LeakyRelu"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "features"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "activations"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LeakyReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "LeakyReluGrad"
+  input_arg {
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "backprops"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
@@ -30157,31 +31231,55 @@ op {
   is_stateful: true
 }
 op {
-  name: "LookupTableInsert"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "type"
-  }
-  attr {
-    name: "Tout"
-    type: "type"
-  }
-}
-op {
-  name: "LookupTableInsertV2"
+  name: "LookupTableInsert"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableRemoveV2"
   input_arg {
     name: "table_handle"
     type: DT_RESOURCE
@@ -30190,18 +31288,10 @@ op {
     name: "keys"
     type_attr: "Tin"
   }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
   attr {
     name: "Tin"
     type: "type"
   }
-  attr {
-    name: "Tout"
-    type: "type"
-  }
   is_stateful: true
 }
 op {
@@ -30271,6 +31361,46 @@ op {
     }
   }
 }
+op {
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "MakeIterator"
   input_arg {
@@ -30284,55 +31414,45 @@ op {
   is_stateful: true
 }
 op {
-  name: "MapAndBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_batches"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
+  name: "MapClear"
   attr {
-    name: "f"
-    type: "func"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "output_types"
+    name: "dtypes"
     type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "MapAndBatchDatasetV2"
+  name: "MapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -30341,18 +31461,6 @@ op {
     name: "other_arguments"
     type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -30378,43 +31486,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-}
-op {
-  name: "MapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
   is_stateful: true
 }
 op {
@@ -30452,7 +31523,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
 }
 op {
   name: "MapDataset"
@@ -30489,6 +31559,13 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "MapDataset"
@@ -30532,6 +31609,13 @@ op {
       b: true
     }
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "MapDefun"
@@ -31039,6 +32123,51 @@ op {
     }
   }
 }
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "MatchingFiles"
   input_arg {
@@ -31401,19 +32530,49 @@ op {
   }
 }
 op {
-  name: "MatrixSolveLs"
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MatrixSquareRoot"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "rhs"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
-  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -31430,13 +32589,6 @@ op {
       }
     }
   }
-  attr {
-    name: "fast"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
 }
 op {
   name: "MatrixTriangularSolve"
@@ -36590,6 +37742,10 @@ op {
     name: "empty_key"
     type_attr: "key_dtype"
   }
+  input_arg {
+    name: "deleted_key"
+    type_attr: "key_dtype"
+  }
   output_arg {
     name: "table_handle"
     type: DT_RESOURCE
@@ -36847,6 +38003,124 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "NcclAllReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "num_devices"
+    type: "int"
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "NcclBroadcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "NcclReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    number_attr: "num_devices"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "num_devices"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Neg"
   input_arg {
@@ -38458,7 +39732,7 @@ op {
   }
 }
 op {
-  name: "ParallelInterleaveDataset"
+  name: "ParallelInterleaveDatasetV2"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -38476,15 +39750,7 @@ op {
     type: DT_INT64
   }
   input_arg {
-    name: "sloppy"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "buffer_output_elements"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "prefetch_input_elements"
+    name: "num_parallel_calls"
     type: DT_INT64
   }
   output_arg {
@@ -38560,6 +39826,13 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ParallelMapDataset"
@@ -38689,6 +39962,121 @@ op {
     }
   }
 }
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ParameterizedTruncatedNormal"
   input_arg {
@@ -38898,76 +40286,6 @@ op {
     has_minimum: true
   }
 }
-op {
-  name: "ParseExampleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "dense_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ParseSequenceExample"
   input_arg {
@@ -39465,6 +40783,123 @@ op {
     type: "func"
   }
 }
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
 op {
   name: "Placeholder"
   output_arg {
@@ -39780,48 +41215,6 @@ op {
     minimum: 1
   }
 }
-op {
-  name: "PrependFromQueueAndPaddedBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
-  }
-  input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "PreventGradient"
   input_arg {
@@ -39961,6 +41354,21 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "PrintV2"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  attr {
+    name: "output_stream"
+    type: "string"
+    default_value {
+      s: "stderr"
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "PriorityQueue"
   output_arg {
@@ -40817,6 +42225,71 @@ op {
     }
   }
 }
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_TO_EVEN"
+    }
+    allowed_values {
+      list {
+        s: "HALF_TO_EVEN"
+        s: "HALF_UP"
+      }
+    }
+  }
+}
 op {
   name: "QuantizeAndDequantizeV3"
   input_arg {
@@ -43847,6 +45320,129 @@ op {
     }
   }
 }
+op {
+  name: "RaggedGather"
+  input_arg {
+    name: "params_nested_splits"
+    type: DT_INT64
+    number_attr: "PARAMS_RAGGED_RANK"
+  }
+  input_arg {
+    name: "params_dense_values"
+    type_attr: "Tvalues"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output_nested_splits"
+    type: DT_INT64
+    number_attr: "OUTPUT_RAGGED_RANK"
+  }
+  output_arg {
+    name: "output_dense_values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "PARAMS_RAGGED_RANK"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "OUTPUT_RAGGED_RANK"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "RaggedRange"
+  input_arg {
+    name: "starts"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "limits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "deltas"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "rt_nested_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "rt_dense_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "RaggedTensorToSparse"
+  input_arg {
+    name: "rt_nested_splits"
+    type: DT_INT64
+    number_attr: "RAGGED_RANK"
+  }
+  input_arg {
+    name: "rt_dense_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sparse_dense_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "RAGGED_RANK"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "RandomCrop"
   input_arg {
@@ -43895,34 +45491,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "RandomDataset"
-  input_arg {
-    name: "seed"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "seed2"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "RandomGamma"
   input_arg {
@@ -47679,82 +49247,218 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
+        type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -47771,107 +49475,47 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagradDA"
+  name: "ResourceApplyAdam"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "m"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "v"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
+    name: "beta1_power"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta2_power"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   attr {
     name: "T"
     type: "type"
@@ -47879,21 +49523,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -47977,6 +49618,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
@@ -48040,6 +49688,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -48122,6 +49772,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -48190,21 +49841,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -48225,7 +49876,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdam"
+  name: "ResourceApplyAdamWithAmsgrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -48238,6 +49889,10 @@ op {
     name: "v"
     type: DT_RESOURCE
   }
+  input_arg {
+    name: "vhat"
+    type: DT_RESOURCE
+  }
   input_arg {
     name: "beta1_power"
     type_attr: "T"
@@ -48298,13 +49953,6 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   is_stateful: true
 }
 op {
@@ -49235,21 +50883,93 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrlV2"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -49263,41 +50983,17 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrlV2"
+  name: "ResourceApplyGradientDescent"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "delta"
     type_attr: "T"
   }
   attr {
@@ -49307,21 +51003,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -49367,6 +51060,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -49414,6 +51109,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -49447,21 +51143,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -49475,17 +51171,25 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ResourceApplyKerasMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   attr {
@@ -49520,6 +51224,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
@@ -53428,6 +55139,83 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceSparseApplyKerasMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceSparseApplyMomentum"
   input_arg {
@@ -56162,52 +57950,6 @@ op {
     }
   }
 }
-op {
-  name: "ScanDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "initial_state"
-    type_list_attr: "Tstate"
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Tstate"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ScatterAdd"
   input_arg {
@@ -58102,27 +59844,69 @@ op {
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterUpdate"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
   }
   attr {
     name: "Tindices"
@@ -58138,61 +59922,137 @@ op {
     name: "use_locking"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
 }
 op {
-  name: "ScatterUpdate"
+  name: "SdcaFprint"
   input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
   }
+}
+op {
+  name: "SdcaOptimizer"
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "sparse_example_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
   }
   input_arg {
-    name: "updates"
-    type_attr: "T"
+    name: "sparse_feature_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_feature_values"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features_with_values"
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "example_labels"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_state_data"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output_ref"
-    type_attr: "T"
-    is_ref: true
+    name: "out_example_state_data"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "out_delta_sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  output_arg {
+    name: "out_delta_dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
   }
   attr {
-    name: "Tindices"
-    type: "type"
+    name: "loss_type"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "logistic_loss"
+        s: "squared_loss"
+        s: "hinge_loss"
+        s: "smooth_hinge_loss"
       }
     }
   }
   attr {
-    name: "use_locking"
+    name: "adaptative"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
-}
-op {
-  name: "SdcaFprint"
-  input_arg {
-    name: "input"
-    type: DT_STRING
+  attr {
+    name: "num_sparse_features"
+    type: "int"
+    has_minimum: true
   }
-  output_arg {
-    name: "output"
-    type: DT_INT64
+  attr {
+    name: "num_sparse_features_with_values"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_dense_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "num_loss_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_inner_iterations"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
@@ -58267,6 +60127,7 @@ op {
         s: "squared_loss"
         s: "hinge_loss"
         s: "smooth_hinge_loss"
+        s: "poisson_loss"
       }
     }
   }
@@ -58314,7 +60175,7 @@ op {
   }
 }
 op {
-  name: "SdcaOptimizer"
+  name: "SdcaOptimizerV2"
   input_arg {
     name: "sparse_example_indices"
     type: DT_INT64
@@ -58390,7 +60251,7 @@ op {
     }
   }
   attr {
-    name: "adaptative"
+    name: "adaptive"
     type: "bool"
     default_value {
       b: false
@@ -59821,42 +61682,6 @@ op {
     }
   }
 }
-op {
-  name: "SetStatsAggregatorDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "stats_aggregator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "counter_prefix"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Shape"
   input_arg {
@@ -60523,17 +62348,6 @@ op {
     }
   }
 }
-op {
-  name: "SinkDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-}
 op {
   name: "Size"
   input_arg {
@@ -60714,41 +62528,6 @@ op {
     }
   }
 }
-op {
-  name: "SlideDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "window_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_shift"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_stride"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "Snapshot"
   input_arg {
@@ -69907,38 +71686,6 @@ op {
     }
   }
 }
-op {
-  name: "SqlDataset"
-  input_arg {
-    name: "driver_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "data_source_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "query"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Sqrt"
   input_arg {
@@ -70697,6 +72444,126 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "StatelessIf"
   input_arg {
@@ -70842,7 +72709,215 @@ op {
   }
 }
 op {
-  name: "StatelessRandomNormal"
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomUniform"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "StatelessRandomUniform"
   input_arg {
     name: "shape"
     type_attr: "T"
@@ -70864,6 +72939,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -70897,14 +72973,22 @@ op {
   }
 }
 op {
-  name: "StatelessRandomUniform"
+  name: "StatelessRandomUniformInt"
   input_arg {
     name: "shape"
     type_attr: "T"
   }
   input_arg {
     name: "seed"
-    type: DT_INT64
+    type_attr: "Tseed"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "dtype"
   }
   output_arg {
     name: "output"
@@ -70913,22 +72997,28 @@ op {
   attr {
     name: "dtype"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_INT64
     }
     allowed_values {
       list {
@@ -70939,14 +73029,14 @@ op {
   }
 }
 op {
-  name: "StatelessRandomUniform"
+  name: "StatelessTruncatedNormal"
   input_arg {
     name: "shape"
     type_attr: "T"
   }
   input_arg {
     name: "seed"
-    type_attr: "Tseed"
+    type: DT_INT64
   }
   output_arg {
     name: "output"
@@ -70979,19 +73069,6 @@ op {
       }
     }
   }
-  attr {
-    name: "Tseed"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
   name: "StatelessTruncatedNormal"
@@ -71001,7 +73078,7 @@ op {
   }
   input_arg {
     name: "seed"
-    type: DT_INT64
+    type_attr: "Tseed"
   }
   output_arg {
     name: "output"
@@ -71034,6 +73111,19 @@ op {
       }
     }
   }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
   name: "StatelessTruncatedNormal"
@@ -71058,6 +73148,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -71155,40 +73246,6 @@ op {
     }
   }
 }
-op {
-  name: "StatsAggregatorHandle"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "StatsAggregatorSummary"
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  is_stateful: true
-}
 op {
   name: "StopGradient"
   input_arg {
@@ -74024,6 +76081,127 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorForestCreateTreeVariable"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeDeserialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeIsInitializedOp"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreePredict"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSerialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -74262,47 +76440,179 @@ op {
   }
 }
 op {
-  name: "TensorListSetItem"
+  name: "TensorListSetItem"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "item"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListSplit"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListStack"
   input_arg {
     name: "input_handle"
     type: DT_VARIANT
   }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "num_elements"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "TensorScatterAdd"
   input_arg {
-    name: "index"
-    type: DT_INT32
+    name: "tensor"
+    type_attr: "T"
   }
   input_arg {
-    name: "item"
-    type_attr: "element_dtype"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
-    name: "output_handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "element_dtype"
+    name: "T"
     type: "type"
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "TensorListStack"
+  name: "TensorScatterSub"
   input_arg {
-    name: "input_handle"
-    type: DT_VARIANT
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorScatterUpdate"
+  input_arg {
     name: "tensor"
-    type_attr: "element_dtype"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "element_dtype"
+    name: "T"
     type: "type"
   }
   attr {
-    name: "num_elements"
-    type: "int"
-    default_value {
-      i: -1
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
 }
@@ -75456,29 +77766,6 @@ op {
     type: "type"
   }
 }
-op {
-  name: "UnbatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "UnbatchGrad"
   input_arg {
@@ -75520,6 +77807,104 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecodeWithOffsets"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "char_to_byte_starts"
+    type: DT_INT64
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "UnicodeEncode"
+  input_arg {
+    name: "input_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "ignore"
+        s: "replace"
+        s: "strict"
+      }
+    }
+  }
+  attr {
+    name: "output_encoding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "UTF-8"
+        s: "UTF-16-BE"
+        s: "UTF-32-BE"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+}
 op {
   name: "UnicodeScript"
   input_arg {
@@ -75531,6 +77916,60 @@ op {
     type: DT_INT32
   }
 }
+op {
+  name: "UnicodeTranscode"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "output_encoding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "UTF-8"
+        s: "UTF-16-BE"
+        s: "UTF-32-BE"
+      }
+    }
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "UniformCandidateSampler"
   input_arg {
@@ -76623,6 +79062,17 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "UnwrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "UpperBound"
   input_arg {
@@ -77056,6 +79506,17 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "WriteAudioSummary"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index ec22eee874dae0ba2d8cf73922723ae80f54c890..1c117166de029d40b84bbd2335b9315cdc53bcba 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -83,13 +83,6 @@ REGISTER_OP("GeneratorDataset")
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("UnbatchDataset")
-    .Input("input_dataset: variant")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("ZipDataset")
     .Input("input_datasets: N * variant")
     .Output("handle: variant")
@@ -142,56 +135,6 @@ REGISTER_OP("SkipDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("BytesProducedStatsDataset")
-    .Input("input_dataset: variant")
-    .Input("tag: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle tag_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("LatencyStatsDataset")
-    .Input("input_dataset: variant")
-    .Input("tag: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle tag_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("ParseExampleDataset")
-    .Input("input_dataset: variant")
-    .Input("num_parallel_calls: int64")
-    .Input("dense_defaults: Tdense")
-    .Output("handle: variant")
-    .Attr("sparse_keys: list(string) >= 0")
-    .Attr("dense_keys: list(string) >= 0")
-    .Attr("sparse_types: list({float,int64,string}) >= 0")
-    .Attr("Tdense: list({float,int64,string}) >= 0")
-    .Attr("dense_shapes: list(shape) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")  // Output components will be
-                                              // sorted by key (dense_keys and
-                                              // sparse_keys combined) here.
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("SetStatsAggregatorDataset")
-    .Input("input_dataset: variant")
-    .Input("stats_aggregator: resource")
-    .Input("tag: string")
-    .Input("counter_prefix: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -201,6 +144,7 @@ REGISTER_OP("MapDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
+    .Attr("preserve_cardinality: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelMapDataset")
@@ -213,60 +157,10 @@ REGISTER_OP("ParallelMapDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
+    .Attr("sloppy: bool = false")
+    .Attr("preserve_cardinality: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("MapAndBatchDataset")
-    .Input("input_dataset: variant")
-    .Input("other_arguments: Targuments")
-    .Input("batch_size: int64")
-    .Input("num_parallel_batches: int64")
-    .Input("drop_remainder: bool")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      // Use index from the end to retrieve the Input shapes,
-      // so that to avoid guessing the length of "other_arguments".
-      // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars.
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
-
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("MapAndBatchDatasetV2")
-    .Input("input_dataset: variant")
-    .Input("other_arguments: Targuments")
-    .Input("batch_size: int64")
-    .Input("num_parallel_calls: int64")
-    .Input("drop_remainder: bool")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      // Use index from the end to retrieve the Input shapes,
-      // so that to avoid guessing the length of "other_arguments".
-      // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars.
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
-
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("PrefetchDataset")
     .Input("input_dataset: variant")
     .Input("buffer_size: int64")
@@ -280,18 +174,6 @@ REGISTER_OP("PrefetchDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("ScanDataset")
-    .Input("input_dataset: variant")
-    .Input("initial_state: Tstate")
-    .Input("other_arguments: Targuments")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Tstate: list(type) >= 1")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("FlatMapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -314,21 +196,6 @@ REGISTER_OP("InterleaveDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("ParallelInterleaveDataset")
-    .Input("input_dataset: variant")
-    .Input("other_arguments: Targuments")
-    .Input("cycle_length: int64")
-    .Input("block_length: int64")
-    .Input("sloppy: bool")
-    .Input("buffer_output_elements: int64")
-    .Input("prefetch_input_elements: int64")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("ParallelInterleaveDatasetV2")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -340,43 +207,7 @@ REGISTER_OP("ParallelInterleaveDatasetV2")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("GroupByReducerDataset")
-    .Input("input_dataset: variant")
-    .Input("key_func_other_arguments: Tkey_func_other_arguments")
-    .Input("init_func_other_arguments: Tinit_func_other_arguments")
-    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
-    .Input("finalize_func_other_arguments: Tfinalize_func_other_arguments")
-    .Output("handle: variant")
-    .Attr("key_func: func")
-    .Attr("init_func: func")
-    .Attr("reduce_func: func")
-    .Attr("finalize_func: func")
-    .Attr("Tkey_func_other_arguments: list(type) >= 0")
-    .Attr("Tinit_func_other_arguments: list(type) >= 0")
-    .Attr("Treduce_func_other_arguments: list(type) >= 0")
-    .Attr("Tfinalize_func_other_arguments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("GroupByWindowDataset")
-    .Input("input_dataset: variant")
-    .Input("key_func_other_arguments: Tkey_func_other_arguments")
-    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
-    .Input(
-        "window_size_func_other_arguments: Twindow_size_func_other_arguments")
-    .Output("handle: variant")
-    .Attr("key_func: func")
-    .Attr("reduce_func: func")
-    .Attr("window_size_func: func")
-    .Attr("Tkey_func_other_arguments: list(type) >= 0")
-    .Attr("Treduce_func_other_arguments: list(type) >= 0")
-    .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("sloppy: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("FilterDataset")
@@ -444,23 +275,6 @@ REGISTER_OP("BatchDatasetV2")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("SlideDataset")
-    .Input("input_dataset: variant")
-    .Input("window_size: int64")
-    .Input("window_shift: int64")
-    .Input("window_stride: int64")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // window_size, window_shift, and window_stride should be scalars.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 // TODO(mrry): Validate that `padded_shapes` are all vectors, the lengths of
 // `output_types` and `output_shapes` are `N` the `output_shapes` are (as far as
 // possible to tell statically) compatible with `padded_shapes`, and that
@@ -501,22 +315,6 @@ REGISTER_OP("PaddedBatchDatasetV2")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("DenseToSparseBatchDataset")
-    .Input("input_dataset: variant")
-    .Input("batch_size: int64")
-    .Input("row_shape: int64")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // batch_size should be a scalar.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      // row_shape should be a 1-D vector.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("RangeDataset")
     .Input("start: int64")
     .Input("stop: int64")
@@ -535,22 +333,6 @@ REGISTER_OP("RangeDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("RandomDataset")
-    .Input("seed: int64")
-    .Input("seed2: int64")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
-                      // stateful to inhibit constant folding.
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // buffer_size, seed, and seed2 should be scalars.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
     .Input("buffer_size: int64")
@@ -619,30 +401,35 @@ REGISTER_OP("TextLineDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("SqlDataset")
-    .Input("driver_name: string")
-    .Input("data_source_name: string")
-    .Input("query: string")
+REGISTER_OP("FixedLengthRecordDataset")
+    .Input("filenames: string")
+    .Input("header_bytes: int64")
+    .Input("record_bytes: int64")
+    .Input("footer_bytes: int64")
+    .Input("buffer_size: int64")
     .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
-      // driver_name, data_source_name, and query should be scalars.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      // `filenames` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      // header_bytes, record_bytes, footer_bytes, buffer_size should be
+      // scalars.
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("FixedLengthRecordDataset")
+REGISTER_OP("FixedLengthRecordDatasetV2")
     .Input("filenames: string")
     .Input("header_bytes: int64")
     .Input("record_bytes: int64")
     .Input("footer_bytes: int64")
     .Input("buffer_size: int64")
+    .Input("compression_type: string")
     .Output("handle: variant")
     .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
@@ -800,63 +587,11 @@ REGISTER_OP("DeserializeIterator")
     .Input("serialized: variant")
     .SetShapeFn(shape_inference::NoOutputs);
 
-REGISTER_OP("StatsAggregatorHandle")
-    .Output("handle: resource")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''");
-
-REGISTER_OP("StatsAggregatorSummary")
-    .Input("iterator: resource")
-    .Output("summary: string")
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("PrependFromQueueAndPaddedBatchDataset")
-    .Input("input_dataset: variant")
-    .Input("batch_size: int64")
-    .Input("padded_shapes: N * int64")
-    .Input("padding_values: Toutput_types")
-    .Output("handle: variant")
-    .Attr("Toutput_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .Attr("N: int >= 1")
-    // TODO(ebrevdo): Validate that `padded_shapes` are all vectors, the lengths
-    // of `Toutput_types` and `output_shapes` are `N`, that the
-    // length of `output_types` is `N`, the `output_shapes` are
-    // (as far as possible to tell statically) compatible with `padded_shapes`,
-    // and that `padding_values` are all scalars.
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // batch_size should be a scalar.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("EnqueueInQueueDataset")
-    .Input("queue: variant")
-    .Input("components: Tcomponents")
-    .Attr("Tcomponents: list(type) >= 1")
-    .SetIsStateful()  // To avoid CSE on multiple calls to Enqueue.
-    // TODO(ebrevdo): SetShapeFn to test input dtypes and shapes by
-    // reading from queue handle (is that even possible?).
-    .SetShapeFn(shape_inference::NoOutputs);
-
-REGISTER_OP("DatasetToTFRecord")
-    .Input("input_dataset: variant")
-    .Input("filename: string")
-    .Input("compression_type: string")
-    .SetShapeFn(shape_inference::NoOutputs);
-
 REGISTER_OP("DatasetToGraph")
     .Input("input_dataset: variant")
     .Output("graph: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("SinkDataset")
-    .Input("input_dataset: variant")
-    .Output("handle: variant")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("OptimizeDataset")
     .Input("input_dataset: variant")
     .Input("optimizations: string")
@@ -952,6 +687,16 @@ REGISTER_OP("MapDefun")
       return Status::OK();
     });
 
+REGISTER_OP("WrapDatasetVariant")
+    .Input("input_handle: variant")
+    .Output("output_handle: variant")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("UnwrapDatasetVariant")
+    .Input("input_handle: variant")
+    .Output("output_handle: variant")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("MultiDeviceIterator")
     .Output("handle: resource")
     .Attr("devices: list(string) >= 1")
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index f6bd5dce26765ed21362c7f41e52279841d6c345..f904e2536dfe67facc25335dc3f86b3d45fd116f 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -17,14 +17,17 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("ExperimentalDirectedInterleaveDataset")
-    .Input("selector_input_dataset: variant")
-    .Input("data_input_datasets: N * variant")
+REGISTER_OP("ExperimentalBytesProducedStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .Attr("N: int >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ExperimentalCSVDataset")
     .Input("filenames: string")
@@ -68,6 +71,79 @@ REGISTER_OP("ExperimentalCSVDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ExperimentalDatasetCardinality")
+    .Input("input_dataset: variant")
+    .Output("cardinality: int64")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalDatasetToTFRecord")
+    .Input("input_dataset: variant")
+    .Input("filename: string")
+    .Input("compression_type: string")
+    .SetShapeFn(shape_inference::NoOutputs);
+
+REGISTER_OP("ExperimentalDenseToSparseBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("batch_size: int64")
+    .Input("row_shape: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // row_shape should be a 1-D vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalDirectedInterleaveDataset")
+    .Input("selector_input_dataset: variant")
+    .Input("data_input_datasets: N * variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalGroupByReducerDataset")
+    .Input("input_dataset: variant")
+    .Input("key_func_other_arguments: Tkey_func_other_arguments")
+    .Input("init_func_other_arguments: Tinit_func_other_arguments")
+    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
+    .Input("finalize_func_other_arguments: Tfinalize_func_other_arguments")
+    .Output("handle: variant")
+    .Attr("key_func: func")
+    .Attr("init_func: func")
+    .Attr("reduce_func: func")
+    .Attr("finalize_func: func")
+    .Attr("Tkey_func_other_arguments: list(type) >= 0")
+    .Attr("Tinit_func_other_arguments: list(type) >= 0")
+    .Attr("Treduce_func_other_arguments: list(type) >= 0")
+    .Attr("Tfinalize_func_other_arguments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalGroupByWindowDataset")
+    .Input("input_dataset: variant")
+    .Input("key_func_other_arguments: Tkey_func_other_arguments")
+    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
+    .Input(
+        "window_size_func_other_arguments: Twindow_size_func_other_arguments")
+    .Output("handle: variant")
+    .Attr("key_func: func")
+    .Attr("reduce_func: func")
+    .Attr("window_size_func: func")
+    .Attr("Tkey_func_other_arguments: list(type) >= 0")
+    .Attr("Treduce_func_other_arguments: list(type) >= 0")
+    .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalIgnoreErrorsDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -75,6 +151,214 @@ REGISTER_OP("ExperimentalIgnoreErrorsDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalLatencyStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalMapAndBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Input("batch_size: int64")
+    .Input("num_parallel_calls: int64")
+    .Input("drop_remainder: bool")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("preserve_cardinality: bool = false")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Use index from the end to retrieve the Input shapes,
+      // so that to avoid guessing the length of "other_arguments".
+      // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars.
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
+
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalMapDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("use_inter_op_parallelism: bool = true")
+    .Attr("preserve_cardinality: bool = false")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalMatchingFilesDataset")
+    .Input("patterns: string")
+    .Output("handle: variant")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `patterns` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalNonSerializableDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalParallelInterleaveDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Input("cycle_length: int64")
+    .Input("block_length: int64")
+    .Input("sloppy: bool")
+    .Input("buffer_output_elements: int64")
+    .Input("prefetch_input_elements: int64")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalParseExampleDataset")
+    .Input("input_dataset: variant")
+    .Input("num_parallel_calls: int64")
+    .Input("dense_defaults: Tdense")
+    .Output("handle: variant")
+    .Attr("sparse_keys: list(string) >= 0")
+    .Attr("dense_keys: list(string) >= 0")
+    .Attr("sparse_types: list({float,int64,string}) >= 0")
+    .Attr("Tdense: list({float,int64,string}) >= 0")
+    .Attr("dense_shapes: list(shape) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")  // Output components will be
+                                              // sorted by key (dense_keys and
+                                              // sparse_keys combined) here.
+    .Attr("sloppy: bool = false")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalRandomDataset")
+    .Input("seed: int64")
+    .Input("seed2: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalScanDataset")
+    .Input("input_dataset: variant")
+    .Input("initial_state: Tstate")
+    .Input("other_arguments: Targuments")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Tstate: list(type) >= 1")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("preserve_cardinality: bool = false")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalSetStatsAggregatorDataset")
+    .Input("input_dataset: variant")
+    .Input("stats_aggregator: resource")
+    .Input("tag: string")
+    .Input("counter_prefix: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalSleepDataset")
+    .Input("input_dataset: variant")
+    .Input("sleep_microseconds: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // Both inputs are scalar.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalSlidingWindowDataset")
+    .Input("input_dataset: variant")
+    .Input("window_size: int64")
+    .Input("window_shift: int64")
+    .Input("window_stride: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // window_size, window_shift, and window_stride should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalSqlDataset")
+    .Input("driver_name: string")
+    .Input("data_source_name: string")
+    .Input("query: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // driver_name, data_source_name, and query should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalStatsAggregatorHandle")
+    .Output("handle: resource")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''");
+
+REGISTER_OP("ExperimentalStatsAggregatorSummary")
+    .Input("iterator: resource")
+    .Output("summary: string")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalUnbatchDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalUniqueDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -87,26 +371,21 @@ REGISTER_OP("ExperimentalIteratorGetDevice")
     .Output("device: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("ExperimentalFunctionBufferingResource")
-    .Input("string_arg: string")
-    .Input("target_device: string")
-    .Output("resource: resource")
-    .Attr("shared_name: string")
-    .Attr("container: string")
-    .Attr("f: func")
-    .Attr("buffer_size: int")
-    .Attr("output_types: list(type)")
-    .SetShapeFn(shape_inference::UnknownShape);
-
-REGISTER_OP("ExperimentalFunctionBufferingResourceGetNext")
-    .Input("function_buffer_resource: resource")
-    .Attr("output_types: list(type)")
-    .Output("output: output_types")
-    .SetShapeFn(shape_inference::UnknownShape);
+REGISTER_OP("ExperimentalMaxIntraOpParallelismDataset")
+    .Input("input_dataset: variant")
+    .Input("max_intra_op_parallelism: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("ExperimentalFunctionBufferingResourceReset")
-    .Input("function_buffer_resource: resource")
-    .SetShapeFn(shape_inference::UnknownShape);
+REGISTER_OP("ExperimentalPrivateThreadPoolDataset")
+    .Input("input_dataset: variant")
+    .Input("num_threads: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalThreadPoolDataset")
     .Input("input_dataset: variant")
@@ -138,6 +417,33 @@ REGISTER_OP("ExperimentalAssertNextDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ExperimentalNumaMapAndBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Input("batch_size: int64")
+    .Input("num_parallel_calls: int64")
+    .Input("drop_remainder: bool")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("preserve_cardinality: bool = false")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Use index from the end to retrieve the Input shapes,
+      // so that to avoid guessing the length of "other_arguments".
+      // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars.
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
+
+      return shape_inference::ScalarShape(c);
+    });
+
 REGISTER_OP("ExperimentalLMDBDataset")
     .Input("filenames: string")
     .Output("handle: variant")
diff --git a/tensorflow/core/ops/function_ops.cc b/tensorflow/core/ops/function_ops.cc
index a6914d9383d2f5c623b17fb0b918c4907ed84175..8e86dd9f780c8eac3dd813c996288a9707247bc4 100644
--- a/tensorflow/core/ops/function_ops.cc
+++ b/tensorflow/core/ops/function_ops.cc
@@ -35,6 +35,22 @@ output: The argument.
 index: This argument is the index-th argument of the function.
 )doc");
 
+REGISTER_SYSTEM_OP("_DeviceArg")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("index: int >= 0")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* context) {
+      context->set_output(0, context->UnknownShape());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+A graph node which represents an argument to a function.
+
+output: The argument.
+index: This argument is the index-th argument of the function.
+)doc");
+
 REGISTER_SYSTEM_OP("_Retval")
     .Input("input: T")
     .Attr("T: type")
@@ -50,6 +66,21 @@ input: The return value.
 index: This return value is the index-th return value of the function.
 )doc");
 
+REGISTER_SYSTEM_OP("_DeviceRetval")
+    .Input("input: T")
+    .Attr("T: type")
+    .Attr("index: int >= 0")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* context) {
+      return Status::OK();
+    })
+    .Doc(R"doc(
+A graph node which represents a return value of a function.
+
+input: The return value.
+index: This return value is the index-th return value of the function.
+)doc");
+
 REGISTER_OP("_ListToArray")
     .Input("input: Tin")
     .Output("output: N * T")
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 22b4b07effe9020c607454cbef5eaf8628980b29..5e0bdd888cea1c508a38afe2f40c7c9f17d28269 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -225,6 +225,9 @@ REGISTER_OP("PartitionedCall")
     .Attr("Tin: list(type) >= 0")
     .Attr("Tout: list(type) >= 0")
     .Attr("f: func")
+    .Attr("config: string = ''")
+    .Attr("config_proto: string = ''")
+    .Attr("executor_type: string = ''")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("StatefulPartitionedCall")
@@ -233,6 +236,9 @@ REGISTER_OP("StatefulPartitionedCall")
     .Attr("Tin: list(type) >= 0")
     .Attr("Tout: list(type) >= 0")
     .Attr("f: func")
+    .Attr("config: string = ''")  // Deprecated in favor of config_proto
+    .Attr("config_proto: string = ''")
+    .Attr("executor_type: string = ''")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape);
 
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 1d4d51a25d74843be5ba47c3994d774de6c439c2..952ee4bee2e5a49edeea168f4184767dbebc2527 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -109,6 +109,30 @@ Status SelfAdjointEigV2ShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+// Input is [...,N,N].
+// First and second outputs are:
+//   [...,N,N]; [...,N].
+Status LuShapeFn(InferenceContext* c) {
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input));
+
+  DimensionHandle n;
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(input, -2), c->Dim(input, -1), &n));
+
+  ShapeHandle batch_shape;
+  TF_RETURN_IF_ERROR(c->Subshape(input, 0, -2, &batch_shape));
+
+  ShapeHandle lu_shape;
+  ShapeHandle p_shape;
+
+  TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Matrix(n, n), &lu_shape));
+  TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Vector(n), &p_shape));
+
+  c->set_output(0, lu_shape);
+  c->set_output(1, p_shape);
+  return Status::OK();
+}
+
 // Input is [...,M,N].
 // First and second outputs are:
 //   [...,M,M]; [...,M,N], if full_matrices is true,
@@ -289,6 +313,14 @@ REGISTER_OP("SelfAdjointEigV2")
     .Attr("T: {double, float, complex64, complex128}")
     .SetShapeFn(SelfAdjointEigV2ShapeFn);
 
+REGISTER_OP("Lu")
+    .Input("input: T")
+    .Output("lu: T")
+    .Output("p: output_idx_type")
+    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("output_idx_type: {int32, int64} = DT_INT32")
+    .SetShapeFn(LuShapeFn);
+
 REGISTER_OP("MatrixSolve")
     .Input("matrix: T")
     .Input("rhs: T")
@@ -323,6 +355,12 @@ REGISTER_OP("MatrixSolveLs")
       return MatrixSolveShapeFn(c, false /* square */);
     });
 
+REGISTER_OP("MatrixSquareRoot")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: {double, float, complex64, complex128}")
+    .SetShapeFn(BatchUnchangedSquareShapeFn);
+
 REGISTER_OP("Qr")
     .Input("input: T")
     .Output("q: T")
diff --git a/tensorflow/core/ops/linalg_ops_test.cc b/tensorflow/core/ops/linalg_ops_test.cc
index f4be820defa3d4b4e2a45ba2038d9250570f59a5..bfacee14efa41408865fecb103bc63b5f6de73ff 100644
--- a/tensorflow/core/ops/linalg_ops_test.cc
+++ b/tensorflow/core/ops/linalg_ops_test.cc
@@ -274,4 +274,23 @@ TEST(LinalgOpsTest, Svd_ShapeFn) {
   INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
 }
 
+TEST(LinalgOpsTest, Lu_ShapeFn) {
+  ShapeInferenceTestOp op("Lu");
+  INFER_OK(op, "?", "?;?");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
+  INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,?,3,4,1,2]");
+
+  INFER_OK(op, "[?,?]", "[d0_0,d0_0];[d0_0]");
+  INFER_OK(op, "[1,?]", "[d0_0,d0_0];[d0_0]");
+  INFER_OK(op, "[?,1]", "[d0_1,d0_1];[d0_1]");
+
+  // Repeat previous block of tests with input rank > 2.
+  INFER_OK(op, "[1,?,3,4,?,?]",
+           "[d0_0,d0_1,d0_2,d0_3,d0_4,d0_4];[d0_0,d0_1,d0_2,d0_3,d0_4]");
+  INFER_OK(op, "[1,?,3,4,1,?]",
+           "[d0_0,d0_1,d0_2,d0_3,d0_4,d0_4];[d0_0,d0_1,d0_2,d0_3,d0_4]");
+  INFER_OK(op, "[1,?,3,4,?,1]",
+           "[d0_0,d0_1,d0_2,d0_3,d0_5,d0_5];[d0_0,d0_1,d0_2,d0_3,d0_5]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 7d79df9c1cc37f0cb7ea5be6c5067c2ccae2233e..01ebcd15439d670274d7e2a784ce78c5c1ee44ef 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -22,18 +22,20 @@ namespace {
 
 REGISTER_OP("EmptyTensorList")
     .Input("element_shape: shape_type")
+    .Input("max_num_elements: int32")
     .Output("handle: variant")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          0, &element_shape));
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -44,9 +46,9 @@ REGISTER_OP("TensorListPushBack")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
@@ -56,18 +58,21 @@ REGISTER_OP("TensorListPushBack")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to push to list with wrong element dtype. List has type ",
               DataTypeString(list_shape_type.dtype),
-              " but trying to push element with type ", DataTypeString(t));
+              " but trying to push element with type ",
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        s = list_shape_type.shape;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
       }
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -88,9 +93,9 @@ REGISTER_OP("TensorListPushBackBatch")
 
       c->set_output(0, input_handles);
 
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
@@ -100,18 +105,21 @@ REGISTER_OP("TensorListPushBackBatch")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to push to list with wrong element dtype. List has type ",
               DataTypeString(list_shape_type.dtype),
-              " but trying to push element with type ", DataTypeString(t));
+              " but trying to push element with type ",
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        s = list_shape_type.shape;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
       }
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -126,9 +134,9 @@ REGISTER_OP("TensorListPopBack")
     .Output("tensor: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle tensor_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
         return errors::InvalidArgument(
@@ -137,19 +145,21 @@ REGISTER_OP("TensorListPopBack")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to read from list with wrong element dtype. List has "
               "type ",
               DataTypeString(list_shape_type.dtype),
-              " but trying to push element with type ", DataTypeString(t));
+              " but trying to push element with type ",
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
+        TF_RETURN_IF_ERROR(
+            c->Merge(tensor_shape, list_shape_type.shape, &ignored));
         c->set_output_handle_shapes_and_types(0, *handle_data);
-        s = list_shape_type.shape;
+        tensor_shape = list_shape_type.shape;
       }
-      c->set_output(1, s);
+      c->set_output(1, tensor_shape);
       c->set_output(0, c->Scalar());
       return Status::OK();
     });
@@ -160,9 +170,9 @@ REGISTER_OP("TensorListStack")
     .Attr("element_dtype: type")
     .Attr("num_elements: int = -1")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
         return errors::InvalidArgument(
@@ -171,16 +181,17 @@ REGISTER_OP("TensorListStack")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to read from list with wrong element dtype. List has "
               "type ",
               DataTypeString(list_shape_type.dtype), " but expectec type ",
-              DataTypeString(t));
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        s = list_shape_type.shape;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
       }
       int expected_num_elements = -1;
       TF_RETURN_IF_ERROR(c->GetAttr("num_elements", &expected_num_elements));
@@ -191,11 +202,88 @@ REGISTER_OP("TensorListStack")
         num_elements = c->MakeShape({expected_num_elements});
       }
       shape_inference::ShapeHandle result;
-      TF_RETURN_IF_ERROR(c->Concatenate(num_elements, s, &result));
+      TF_RETURN_IF_ERROR(c->Concatenate(num_elements, element_shape, &result));
       c->set_output(0, result);
       return Status::OK();
     });
 
+REGISTER_OP("TensorListConcat")
+    .Input("input_handle: variant")
+    .Output("tensor: element_dtype")
+    .Output("lengths: int64")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && handle_data->size() != 1) {
+        return errors::InvalidArgument(
+            "Trying to read from list with wrong variant data.");
+      }
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        if (list_shape_type.dtype != element_dtype) {
+          return errors::InvalidArgument(
+              "Trying to read from list with wrong element dtype. List has "
+              "type ",
+              DataTypeString(list_shape_type.dtype), " but expected type ",
+              DataTypeString(element_dtype));
+        }
+        shape_inference::ShapeHandle ignored;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
+      }
+      if (c->RankKnown(element_shape)) {
+        shape_inference::ShapeHandle result;
+        TF_RETURN_IF_ERROR(c->Subshape(element_shape, 1, &result));
+        TF_RETURN_IF_ERROR(
+            c->Concatenate(c->MakeShape({c->UnknownDim()}), result, &result));
+        c->set_output(0, result);
+      } else {
+        c->set_output(0, c->UnknownShape());
+      }
+      c->set_output(1, c->MakeShape({c->UnknownDim()}));
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListSplit")
+    .Input("tensor: element_dtype")
+    .Input("element_shape: shape_type")
+    .Input("lengths: int64")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle tensor_shape = c->input(0);
+      shape_inference::ShapeHandle ignored;
+      // Check that tensor is at least a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(tensor_shape, 1, &ignored));
+      // Check that lengths is a vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &ignored));
+      shape_inference::ShapeHandle element_shape_from_tensor_shape;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(tensor_shape, 1, &element_shape_from_tensor_shape));
+      TF_RETURN_IF_ERROR(c->Concatenate(c->MakeShape({c->UnknownDim()}),
+                                        element_shape_from_tensor_shape,
+                                        &element_shape_from_tensor_shape));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          1, &element_shape));
+      TF_RETURN_IF_ERROR(c->Merge(element_shape_from_tensor_shape,
+                                  element_shape,
+                                  &element_shape_from_tensor_shape));
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListFromTensor")
     .Input("tensor: element_dtype")
     .Input("element_shape: shape_type")
@@ -204,17 +292,20 @@ REGISTER_OP("TensorListFromTensor")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->input(0);
-      shape_inference::ShapeHandle o;
-      TF_RETURN_IF_ERROR(c->Subshape(s, 1, &o));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle tensor_shape = c->input(0);
+      shape_inference::ShapeHandle tensor_shape_except_first_dim;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(tensor_shape, 1, &tensor_shape_except_first_dim));
       shape_inference::ShapeHandle element_shape;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
           1, &element_shape));
-      TF_RETURN_IF_ERROR(c->Merge(o, element_shape, &o));
+      TF_RETURN_IF_ERROR(c->Merge(tensor_shape_except_first_dim, element_shape,
+                                  &tensor_shape_except_first_dim));
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{element_shape, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -240,13 +331,14 @@ REGISTER_OP("TensorListReserve")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          0, &element_shape));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -256,17 +348,17 @@ REGISTER_OP("TensorListGetItem")
     .Output("item: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       auto* handle_data = c->input_handle_shapes_and_types(0);
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         element_shape = list_shape_type.shape;
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument("Expected list with element dtype ",
-                                         DataTypeString(t),
+                                         DataTypeString(element_dtype),
                                          " but got list with element dtype ",
                                          DataTypeString(list_shape_type.dtype));
         }
@@ -282,17 +374,19 @@ REGISTER_OP("TensorListSetItem")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       auto* handle_data = c->input_handle_shapes_and_types(0);
       c->set_output(0, c->Scalar());
       if (handle_data == nullptr) {
-        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        c->set_output_handle_shapes_and_types(
+            0, {{c->UnknownShape(), element_dtype}});
         return Status::OK();
       }
       const shape_inference::ShapeAndType& list_shape_type = (*handle_data)[0];
-      shape_inference::ShapeHandle s = c->input(2);
-      TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &s));
+      shape_inference::ShapeHandle item_shape = c->input(2);
+      TF_RETURN_IF_ERROR(
+          c->Merge(item_shape, list_shape_type.shape, &item_shape));
       c->set_output_handle_shapes_and_types(0, *handle_data);
       return Status::OK();
     });
@@ -303,17 +397,17 @@ REGISTER_OP("TensorListGather")
     .Output("values: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       auto* handle_data = c->input_handle_shapes_and_types(0);
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         element_shape = list_shape_type.shape;
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument("Expected list with element dtype ",
-                                         DataTypeString(t),
+                                         DataTypeString(element_dtype),
                                          " but got list with element dtype ",
                                          DataTypeString(list_shape_type.dtype));
         }
@@ -332,12 +426,13 @@ REGISTER_OP("TensorListScatter")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(2, &s));
-      c->set_output_handle_shapes_and_types(0, {{s, t}});
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          2, &element_shape));
+      c->set_output_handle_shapes_and_types(0,
+                                            {{element_shape, element_dtype}});
       c->set_output(0, c->Scalar());
       return Status::OK();
     });
@@ -353,28 +448,29 @@ REGISTER_OP("TensorListConcatLists")
       TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input_a));
       c->set_output(0, input_a);
 
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
 
       auto* handle_data_a = c->input_handle_shapes_and_types(0);
       auto* handle_data_b = c->input_handle_shapes_and_types(1);
       if (handle_data_a == nullptr && handle_data_b == nullptr) {
-        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        c->set_output_handle_shapes_and_types(
+            0, {{c->UnknownShape(), element_dtype}});
         return Status::OK();
       }
       shape_inference::ShapeAndType list_shape_type_a =
           (handle_data_a) ? handle_data_a->at(0) : handle_data_b->at(0);
       const shape_inference::ShapeAndType& list_shape_type_b =
           (handle_data_b) ? handle_data_b->at(0) : handle_data_a->at(0);
-      if (list_shape_type_a.dtype != t) {
+      if (list_shape_type_a.dtype != element_dtype) {
         return errors::InvalidArgument("input_a.type != element_dtype: ",
                                        DataTypeString(list_shape_type_a.dtype),
-                                       " vs. ", DataTypeString(t));
+                                       " vs. ", DataTypeString(element_dtype));
       }
-      if (list_shape_type_b.dtype != t) {
+      if (list_shape_type_b.dtype != element_dtype) {
         return errors::InvalidArgument("input_b.type != element_dtype: ",
                                        DataTypeString(list_shape_type_b.dtype),
-                                       " vs. ", DataTypeString(t));
+                                       " vs. ", DataTypeString(element_dtype));
       }
       TF_RETURN_IF_ERROR(c->Merge(list_shape_type_a.shape,
                                   list_shape_type_b.shape,
diff --git a/tensorflow/core/ops/logging_ops.cc b/tensorflow/core/ops/logging_ops.cc
index 2034d3601bc63451a72cd602e7f9d4ee51871c38..42a1b1d7e3f91321459ee8e9dfbc945126e619f4 100644
--- a/tensorflow/core/ops/logging_ops.cc
+++ b/tensorflow/core/ops/logging_ops.cc
@@ -49,9 +49,7 @@ WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("Print");
 REGISTER_OP("PrintV2")
     .Input("input: string")
     .SetIsStateful()
-    .Attr(
-        "output_stream: {'stdout', 'stderr', 'log(info)', "
-        "'log(warning)', 'log(error)'} = 'stderr'")
+    .Attr("output_stream: string = 'stderr'")
     .SetShapeFn([](InferenceContext* c) {
       // Make sure that the input is a scalar.
       if (c->Rank(c->input(0)) != 0) {
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
index 72a77be70d04f87225b0ad7a1290d50368781ebe..a0987cd982ae596965845e50011f449571c14d72 100644
--- a/tensorflow/core/ops/lookup_ops.cc
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -214,6 +214,19 @@ REGISTER_OP("LookupTableInsertV2")
       return Status::OK();
     });
 
+REGISTER_OP("LookupTableRemoveV2")
+    .Input("table_handle: resource")
+    .Input("keys: Tin")
+    .Attr("Tin: type")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle));
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &handle));
+
+      // TODO(turboale): Validate keys shape.
+      return Status::OK();
+    });
+
 REGISTER_OP("LookupTableSize")
     .Input("table_handle: Ref(string)")
     .Output("size: int64")
@@ -407,6 +420,7 @@ REGISTER_OP("MutableDenseHashTable")
 
 REGISTER_OP("MutableDenseHashTableV2")
     .Input("empty_key: key_dtype")
+    .Input("deleted_key: key_dtype")
     .Output("table_handle: resource")
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 3eff728f03276817de1200cb1a4520a0bdf296e1..6f261dc1b1813ea1e78736725bdf8af66eab2c18 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -65,7 +65,9 @@ REGISTER_OP("BatchMatMul")
     .Input("x: T")
     .Input("y: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .Attr("adj_x: bool = false")
     .Attr("adj_y: bool = false")
     .SetShapeFn([](InferenceContext* c) {
@@ -764,7 +766,9 @@ REGISTER_OP("MatMul")
     .Output("product: T")
     .Attr("transpose_a: bool = false")
     .Attr("transpose_b: bool = false")
-    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn(shape_inference::MatMulShape);
 
 REGISTER_OP("SparseMatMul")
@@ -1437,7 +1441,24 @@ REGISTER_OP("Bincount")
     .Attr("T: {int32, int64, float32, float64}")
     .Output("bins: T")
     .SetShapeFn([](InferenceContext* c) {
-      c->set_output(0, c->UnknownShapeOfRank(1));
+      ShapeHandle unused;
+      // The input `size` must be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+
+      const Tensor* size_tensor = c->input_tensor(1);
+      if (size_tensor == nullptr) {
+        // Return unknown shape if size is not known.
+        c->set_output(0, c->UnknownShapeOfRank(1));
+        return Status::OK();
+      }
+
+      // Return `[size]` shape if size is known.
+      int32 size_val = size_tensor->scalar<int32>()();
+      if (size_val < 0) {
+        return errors::InvalidArgument("size (", size_val,
+                                       ") must be non-negative");
+      }
+      c->set_output(0, c->MakeShape({size_val}));
       return Status::OK();
     });
 
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index be4c3ed2b6eabe931ceeb6c603b587a8d0fcb2f1..05379a7d699629d733cacd71343fc9d912eb0893 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -559,4 +559,16 @@ TEST(MathOpsTest, QuantizedAdd_ShapeFn) {
   INFER_ERROR("must be rank 0", op, "?;?;?;?;[3];?");
   INFER_ERROR("must be rank 0", op, "?;?;?;?;?;[4]");
 }
+
+TEST(MathOpsTest, Bincount_ShapeFn) {
+  ShapeInferenceTestOp op("Bincount");
+
+  // size should be scalar.
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;[1];?");
+
+  INFER_OK(op, "?;?;?", "[?]");
+  INFER_OK(op, "?;[];?", "[?]");
+  INFER_OK(op, "[?];[];?", "[?]");
+  INFER_OK(op, "[?];[];[?]", "[?]");
+}
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9be3470820eb523e8d41f8bf63434cbb534034d8
--- /dev/null
+++ b/tensorflow/core/ops/mkl_nn_ops.cc
@@ -0,0 +1,612 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/util/mirror_pad_mode.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+// For now, this file only includes MKL quantized ops. In the
+// future, we will move all other MKL ops from nn_ops.cc to this file.
+
+#ifdef INTEL_MKL
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("_MklQuantizedMaxPool")
+    .Input("input:         T")
+    .Input("min_input:     float")
+    .Input("max_input:     float")
+    .Input("mkl_input:     uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Output("output:       T")
+    .Output("min_output:   float")
+    .Output("max_output:   float")
+    .Output("mkl_output:     uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("T: quantizedtype")
+    .Attr("ksize: list(int) >= 4")
+    .Attr("strides: list(int) >= 4")
+    .Attr(GetPaddingAttrString())
+    .SetShapeFn(shape_inference::MaxPoolShape)
+    .Doc(R"doc(
+MKL version of QuantizedMaxPool operator. Uses MKL DNN APIs to perform max pooling
+on the quantized input.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklQuantizedAvgPool")
+    .Input("input:           T")
+    .Input("min_input:       float")
+    .Input("max_input:       float")
+    .Input("mkl_input:       uint8")
+    .Input("mkl_min_input:   uint8")
+    .Input("mkl_max_input:   uint8")
+    .Output("output:         T")
+    .Output("min_output:     float")
+    .Output("max_output:     float")
+    .Output("mkl_output:     uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("T: quantizedtype")
+    .Attr("ksize: list(int) >= 4")
+    .Attr("strides: list(int) >= 4")
+    .Attr(GetPaddingAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::AvgPoolShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of QuantizedAvgPool operator. Uses MKL DNN APIs to perform average pooling
+on the quantized input.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklQuantizedConv2D")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for enabling MklToTf
+                               // conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for enabling MklToTf
+                               // conversion
+    .Attr("out_type: quantizedtype = DT_QINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBias")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: float")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DAndRelu")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for enabling MklToTf
+                               // conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for enabling MklToTf
+                               // conversion
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasAndRelu")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: float")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndRelu")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: float")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("summand: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_summand: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("summand: Tsummand")
+    .Input("min_summand: float")
+    .Input("max_summand: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Input("mkl_summand: uint8")
+    .Input("mkl_min_summand: uint8")
+    .Input("mkl_max_summand: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("Tsummand: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("summand: Tsummand")
+    .Input("min_summand: float")
+    .Input("max_summand: float")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_bias: uint8")
+    .Input("mkl_min_input: uint8")
+    .Input("mkl_max_input: uint8")
+    .Input("mkl_min_filter: uint8")
+    .Input("mkl_max_filter: uint8")
+    .Input("mkl_min_freezed_output: uint8")
+    .Input("mkl_max_freezed_output: uint8")
+    .Input("mkl_summand: uint8")
+    .Input("mkl_min_summand: uint8")
+    .Input("mkl_max_summand: uint8")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_min_output: uint8")
+    .Output("mkl_max_output: uint8")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("Tsummand: quantizedtype")
+    .Attr("T: quantizedtype")  // Additional attribute "T" for
+                               // enabling MklToTf conversion
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("data_format: string = 'NHWC'")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/ops/nccl_ops.cc b/tensorflow/core/ops/nccl_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..671e47871beb94b6584e09f2219de4634a690a3c
--- /dev/null
+++ b/tensorflow/core/ops/nccl_ops.cc
@@ -0,0 +1,151 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+REGISTER_OP("NcclAllReduce")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
+    .Attr("T: {half, float, float64, int32, int64}")
+    .Attr("num_devices: int")
+    .Attr("shared_name: string")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+// Note: This op has no kernel implementation, but is replaced by
+// _NcclReduceSend and _NcclReduceRecv during graph optimization stage.
+REGISTER_OP("NcclReduce")
+    .Input("input: num_devices * T")
+    .Output("data: T")
+    .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
+    .Attr("T: {half, float, float64, int32, int64}")
+    .Attr("num_devices: int")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("_NcclReduceSend")
+    .Input("input: T")
+    .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
+    .Attr("T: {half, float, float64, int32, int64}")
+    .Attr("num_devices: int")
+    .Attr("shared_name: string")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Replacement node for NcclReduce.
+
+Reduces `input` to the NcclReduceRecv op registered in the same `shared_name`.
+The graph should be constructed so that 'num_devices-1' devices run
+`_NcclReduceSend` and one device runs _NcclReduceRecv op with shared_name value
+`c`. Failure to do so will cause the graph execution to fail to complete.
+
+input: The input to the reduction.
+reduction: the reduction operation to perform.
+num_devices: The number of devices participating in this reduction.
+shared_name: Identifier that is shared between ops of the same reduce.
+    )doc");
+
+REGISTER_OP("_NcclReduceRecv")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
+    .Attr("T: {half, float, float64, int32, int64}")
+    .Attr("num_devices: int")
+    .Attr("shared_name: string")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Replacement node for NcclReduce.
+
+Reduces 'input' from this op and the NcclReduceSend ops registered in the same
+`shared_name`.
+The graph should be constructed so that 'num_devices-1' devices run
+`_NcclReduceSend` and one device runs _NcclReduceRecv op with shared_name value
+`c`. Failure to do so will cause the graph execution to fail to complete.
+
+input: The input to the reduction.
+data: The reduced data received from this op and the NcclReduceSend op.
+reduction: the reduction operation to perform.
+num_devices: The number of devices participating in this reduction.
+shared_name: Identifier that is shared between ops of the same reduce.
+    )doc");
+
+// Note: This op has no kernel implementation, but is replaced by
+// _NcclBroadcastSend and _NcclBroadcastRecv during graph optimization stage.
+REGISTER_OP("NcclBroadcast")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: {half, float, float64, int32, int64}")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("_NcclBroadcastSend")
+    .Input("input: T")
+    .Attr("T: {half, float, float64, int32, int64}")
+    .Attr("num_devices: int")
+    .Attr("shared_name: string")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Replacement node for NcclBroadcast.
+
+Sends `input` to the _NcclBroadcastRecv ops registered in the same
+`shared_name`.
+The graph should be constructed so that one device runs `_NcclBroadcastSend` and
+`num_devices-1` devices run _NcclBroadcastRecv ops with shared_name value `c`.
+Failure to do so will cause the graph execution to fail to complete.
+
+input: The input to the broadcast.
+num_devices: The number of devices participating in this reduction.
+shared_name: Identifier that is shared between ops of the same broadcast.
+    )doc");
+
+REGISTER_OP("_NcclBroadcastRecv")
+    .Input("shape: int32")
+    .Output("output: T")
+    .Attr("T: {half, float, float64, int32, int64}")
+    .Attr("num_devices: int")
+    .Attr("shared_name: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Replacement node for NcclBroadcast.
+
+Sends data of shape `shape` from the _NcclBroadcastSend op registered in the
+same `shared_name`.
+The graph should be constructed so that one device runs `_NcclBroadcastSend` and
+`num_devices-1` devices run _NcclBroadcastRecv ops with shared_name value `c`.
+Failure to do so will cause the graph execution to fail to complete.
+
+shape: The shape of the output.
+output: The broadcast data received from the NcclBroadcastSend op.
+num_devices: The number of devices participating in this reduction.
+shared_name: Identifier that is shared between ops of the same broadcast.
+    )doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index d1d81b27cc8b148f213ce5561332bc4d8b631029..bc59abc54cc1b87af3c06ce5cfda6fe5dca86e36 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -315,6 +315,27 @@ REGISTER_OP("Conv2DBackpropFilter")
       return Status::OK();
     });
 
+REGISTER_OP("_FusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Output("output: T")
+    .Attr("T: {float, double}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. Grappler is
+expected to create these operators.
+)doc");
+
 namespace {
 
 Status CommonFusedConvCalculations(InferenceContext* c, bool has_resize) {
@@ -983,6 +1004,21 @@ REGISTER_OP("Relu6Grad")
     .Attr("T: realnumbertype")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
 
+REGISTER_OP("LeakyRelu")
+    .Input("features: T")
+    .Output("activations: T")
+    .Attr("alpha: float = 0.2")
+    .Attr("T: {half, bfloat16, float, double} = DT_FLOAT")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("LeakyReluGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Output("backprops: T")
+    .Attr("alpha: float = 0.2")
+    .Attr("T: {half, bfloat16, float, double} = DT_FLOAT")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn);
+
 REGISTER_OP("Elu")
     .Input("features: T")
     .Output("activations: T")
@@ -1173,9 +1209,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument("input must have last dimension >= k = ",
-                                   c->Value(k_dim), " but is ",
-                                   c->Value(last_dim));
+    return errors::InvalidArgument(
+        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
+        c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -1229,9 +1265,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument("Input must have last dimension > n = ",
-                                       c->Value(n_dim), " but is ",
-                                       c->Value(last_dim));
+        return errors::InvalidArgument(
+            "Input must have last dimension > n = ", c->Value(n_dim),
+            " but is ", c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
@@ -1573,6 +1609,55 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("__MklDummyPadWithConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("paddings: Tpaddings")
+    .Output("output: T")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+Dummy node that enables fusing Pad and Conv2D operator for MKL. This node
+does not perform anything. It is just created as an intermediate output of
+merging Pad and Conv2D.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklPadWithConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("paddings: Tpaddings")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_paddings: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+MKL version of Pad and Conv2D operator. Uses MKL DNN APIs to perform
+Pad and 2D convolution to the output of convolution.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklConv2DBackpropFilter")
     .Input("input: T")
     .Input("filter_sizes: int32")
@@ -1848,6 +1933,37 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("_MklRelu6")
+    .Input("features: T")
+    .Input("mkl_features: uint8")
+    .Output("activations: T")
+    .Output("mkl_activations: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+MKL version of Relu6 operator. Uses MKL DNN APIs to implement Relu6 operator.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklRelu6Grad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Input("mkl_gradients: uint8")
+    .Input("mkl_features: uint8")
+    .Output("backprops: T")
+    .Output("mkl_backprops: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+MKL version of Relu6Grad operator. Uses MKL DNN APIs to compute rectified
+linear gradients for Relu6 operation.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklElu")
     .Input("features: T")
     .Input("mkl_features: uint8")
@@ -2043,7 +2159,6 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-
 REGISTER_OP("_MklAvgPool3DGrad")
     .Input("orig_input_shape: int32")
     .Input("grad: T")
@@ -2126,11 +2241,7 @@ REGISTER_OP("_MklLRN")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifdef INTEL_MKL_ML_ONLY
-    .Output("workspace: T")
-#else
     .Output("workspace: uint8")
-#endif
     .Output("mkl_output: uint8")
     .Output("mkl_workspace: uint8")
     .Attr("depth_radius: int = 5")
@@ -2154,11 +2265,7 @@ REGISTER_OP("_MklLRNGrad")
     .Input("input_grads: T")
     .Input("input_image: T")
     .Input("output_image: T")
-#ifdef INTEL_MKL_ML_ONLY
-    .Input("workspace: T")
-#else
     .Input("workspace: uint8")
-#endif
     .Input("mkl_input_grads: uint8")
     .Input("mkl_input_image: uint8")
     .Input("mkl_output_image: uint8")
@@ -2339,7 +2446,7 @@ REGISTER_OP("_MklToTf")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, float, double, qint8, quint8, qint32}")
     .Attr(GetConvnetDataFormat2D3DAttrString())
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
@@ -2372,6 +2479,343 @@ element-wise MKL op.
 NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
+
+REGISTER_OP("QuantizedConv2DAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("out_type: quantizedtype = DT_QINT8")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+// Fusion of Quantized Conv2D and BiasAdd.
+REGISTER_OP("QuantizedConv2DWithBias")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: float")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("QuantizedConv2DWithBiasAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("out_type: quantizedtype = DT_QINT8")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+// Fusion of Quantized Conv2D and Relu.
+REGISTER_OP("QuantizedConv2DAndRelu")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("QuantizedConv2DAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+// Fusion of Quantized Conv2D, BiasAdd and Relu.
+REGISTER_OP("QuantizedConv2DWithBiasAndRelu")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: float")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+// Fusion of Quantized Conv2D, BiasAdd, Relu, and Requantize.
+REGISTER_OP("QuantizedConv2DWithBiasAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+// Fusion of Quantized Conv2D, BiasAdd, Sum, and Relu.
+REGISTER_OP("QuantizedConv2DWithBiasSumAndRelu")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: float")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("summand: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("out_type: quantizedtype = DT_QINT32")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("QuantizedConv2DWithBiasSumAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("summand: Tsummand")
+    .Input("min_summand: float")
+    .Input("max_summand: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("Tsummand: quantizedtype")
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("QuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
+    .Input("input: Tinput")
+    .Input("filter: Tfilter")
+    .Input("bias: Tbias")
+    .Input("min_input: float")
+    .Input("max_input: float")
+    .Input("min_filter: float")
+    .Input("max_filter: float")
+    .Input("min_freezed_output: float")
+    .Input("max_freezed_output: float")
+    .Input("summand: Tsummand")
+    .Input("min_summand: float")
+    .Input("max_summand: float")
+    .Output("output: out_type")
+    .Output("min_output: float")
+    .Output("max_output: float")
+    .Attr("Tinput: quantizedtype")
+    .Attr("Tfilter: quantizedtype")
+    .Attr("Tbias: {float, qint32}")
+    .Attr("Tsummand: quantizedtype")
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
 #endif  // INTEL_MKL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 0d8997c1bd1567e861b47f5f0b67ef1902fa736f..89bdcc571efee6c0d193341936758670c1218aab 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -3235,6 +3235,7 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -4632,6 +4633,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "BoostedTreesQuantileStreamResourceDeserialize"
+  input_arg {
+    name: "quantile_stream_resource_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucket_boundaries"
+    type: DT_FLOAT
+    number_attr: "num_streams"
+  }
+  attr {
+    name: "num_streams"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "BoostedTreesQuantileStreamResourceFlush"
   input_arg {
@@ -4924,33 +4944,6 @@ op {
     type: "list(float)"
   }
 }
-op {
-  name: "BytesProducedStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -7895,21 +7888,6 @@ op {
     minimum: 1
   }
 }
-op {
-  name: "DatasetToTFRecord"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "compression_type"
-    type: DT_STRING
-  }
-}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -8579,37 +8557,6 @@ op {
     }
   }
 }
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "DenseToSparseSetOperation"
   input_arg {
@@ -9604,6 +9551,10 @@ op {
     name: "element_shape"
     type_attr: "shape_type"
   }
+  input_arg {
+    name: "max_num_elements"
+    type: DT_INT32
+  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -9810,24 +9761,6 @@ op {
     type: DT_STRING
   }
 }
-op {
-  name: "EnqueueInQueueDataset"
-  input_arg {
-    name: "queue"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
-  }
-  attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "EnsureShape"
   input_arg {
@@ -10065,6 +9998,33 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalBytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalCSVDataset"
   input_arg {
@@ -10130,6 +10090,63 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalDatasetCardinality"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "cardinality"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalDirectedInterleaveDataset"
   input_arg {
@@ -10165,50 +10182,129 @@ op {
   }
 }
 op {
-  name: "ExperimentalFunctionBufferingResource"
+  name: "ExperimentalGroupByReducerDataset"
   input_arg {
-    name: "string_arg"
-    type: DT_STRING
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "target_device"
-    type: DT_STRING
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "init_func_other_arguments"
+    type_list_attr: "Tinit_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "finalize_func_other_arguments"
+    type_list_attr: "Tfinalize_func_other_arguments"
   }
   output_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "key_func"
+    type: "func"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "init_func"
+    type: "func"
   }
   attr {
-    name: "f"
+    name: "reduce_func"
     type: "func"
   }
   attr {
-    name: "buffer_size"
-    type: "int"
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tinit_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
     name: "output_types"
     type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   is_stateful: true
 }
 op {
-  name: "ExperimentalFunctionBufferingResourceGetNext"
+  name: "ExperimentalGroupByWindowDataset"
   input_arg {
-    name: "function_buffer_resource"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
   }
   output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
     name: "output_types"
@@ -10216,15 +10312,12 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
-}
-op {
-  name: "ExperimentalFunctionBufferingResourceReset"
-  input_arg {
-    name: "function_buffer_resource"
-    type: DT_RESOURCE
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  is_stateful: true
 }
 op {
   name: "ExperimentalIdentityIndexedDataset"
@@ -10337,6 +10430,150 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalLatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMatchingFilesDataset"
+  input_arg {
+    name: "patterns"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalMaterializedIndexDatasetHandle"
   output_arg {
@@ -10365,6 +10602,516 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalMaxIntraOpParallelismDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "max_intra_op_parallelism"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNonSerializableDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalNumaMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalPrivateThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_threads"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalRandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalSetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalSleepDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "sleep_microseconds"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSlidingWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSqlDataset"
+  input_arg {
+    name: "driver_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data_source_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "query"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -10430,6 +11177,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalUnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalUniqueDataset"
   input_arg {
@@ -11207,6 +11977,38 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "FixedLengthRecordDatasetV2"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "header_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "footer_bytes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "FixedLengthRecordReader"
   output_arg {
@@ -12590,144 +13392,6 @@ op {
     }
   }
 }
-op {
-  name: "GroupByReducerDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "init_func_other_arguments"
-    type_list_attr: "Tinit_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "finalize_func_other_arguments"
-    type_list_attr: "Tfinalize_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "init_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "finalize_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tinit_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tfinalize_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "GuaranteeConst"
   input_arg {
@@ -14269,30 +14933,73 @@ op {
   }
 }
 op {
-  name: "LatencyStatsDataset"
+  name: "LeakyRelu"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "features"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "activations"
+    type_attr: "T"
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
+}
+op {
+  name: "LeakyReluGrad"
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "gradients"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "features"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "backprops"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 0.2
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
@@ -15000,6 +15707,22 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "LookupTableRemoveV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "LookupTableSize"
   input_arg {
@@ -15068,112 +15791,56 @@ op {
   }
 }
 op {
-  name: "MakeIterator"
-  input_arg {
-    name: "dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
-op {
-  name: "MapAndBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
+  name: "Lu"
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_batches"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+    name: "lu"
+    type_attr: "T"
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "MapAndBatchDatasetV2"
+  name: "MakeIterator"
   input_arg {
-    name: "input_dataset"
+    name: "dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "iterator"
+    type: DT_RESOURCE
   }
+  is_stateful: true
 }
 op {
   name: "MapClear"
@@ -15255,6 +15922,13 @@ op {
       b: true
     }
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "MapDefun"
@@ -15636,6 +16310,7 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -15911,6 +16586,29 @@ op {
     }
   }
 }
+op {
+  name: "MatrixSquareRoot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "MatrixTriangularSolve"
   input_arg {
@@ -17585,6 +18283,10 @@ op {
     name: "empty_key"
     type_attr: "key_dtype"
   }
+  input_arg {
+    name: "deleted_key"
+    type_attr: "key_dtype"
+  }
   output_arg {
     name: "table_handle"
     type: DT_RESOURCE
@@ -17842,6 +18544,124 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "NcclAllReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "num_devices"
+    type: "int"
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "NcclBroadcast"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "NcclReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+    number_attr: "num_devices"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "num_devices"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Neg"
   input_arg {
@@ -19051,62 +19871,6 @@ op {
     type: "type"
   }
 }
-op {
-  name: "ParallelInterleaveDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "cycle_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "block_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "sloppy"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "buffer_output_elements"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "prefetch_input_elements"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ParallelInterleaveDatasetV2"
   input_arg {
@@ -19154,6 +19918,13 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ParallelMapDataset"
@@ -19201,6 +19972,20 @@ op {
       b: true
     }
   }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ParameterizedTruncatedNormal"
@@ -19348,76 +20133,6 @@ op {
     has_minimum: true
   }
 }
-op {
-  name: "ParseExampleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "dense_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ParseSequenceExample"
   input_arg {
@@ -19914,6 +20629,27 @@ op {
     name: "f"
     type: "func"
   }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
 }
 op {
   name: "Placeholder"
@@ -20083,48 +20819,6 @@ op {
     minimum: 1
   }
 }
-op {
-  name: "PrependFromQueueAndPaddedBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
-  }
-  input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "PreventGradient"
   input_arg {
@@ -20205,15 +20899,6 @@ op {
     default_value {
       s: "stderr"
     }
-    allowed_values {
-      list {
-        s: "stdout"
-        s: "stderr"
-        s: "log(info)"
-        s: "log(warning)"
-        s: "log(error)"
-      }
-    }
   }
   is_stateful: true
 }
@@ -20562,6 +21247,19 @@ op {
       }
     }
   }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_TO_EVEN"
+    }
+    allowed_values {
+      list {
+        s: "HALF_TO_EVEN"
+        s: "HALF_UP"
+      }
+    }
+  }
 }
 op {
   name: "QuantizeAndDequantizeV3"
@@ -22220,6 +22918,129 @@ op {
     }
   }
 }
+op {
+  name: "RaggedGather"
+  input_arg {
+    name: "params_nested_splits"
+    type: DT_INT64
+    number_attr: "PARAMS_RAGGED_RANK"
+  }
+  input_arg {
+    name: "params_dense_values"
+    type_attr: "Tvalues"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output_nested_splits"
+    type: DT_INT64
+    number_attr: "OUTPUT_RAGGED_RANK"
+  }
+  output_arg {
+    name: "output_dense_values"
+    type_attr: "Tvalues"
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "PARAMS_RAGGED_RANK"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "OUTPUT_RAGGED_RANK"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "RaggedRange"
+  input_arg {
+    name: "starts"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "limits"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "deltas"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "rt_nested_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "rt_dense_values"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "RaggedTensorToSparse"
+  input_arg {
+    name: "rt_nested_splits"
+    type: DT_INT64
+    number_attr: "RAGGED_RANK"
+  }
+  input_arg {
+    name: "rt_dense_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "sparse_dense_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "RAGGED_RANK"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "RandomCrop"
   input_arg {
@@ -22269,34 +23090,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "RandomDataset"
-  input_arg {
-    name: "seed"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "seed2"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "RandomGamma"
   input_arg {
@@ -24548,6 +25341,86 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyAdamWithAmsgrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "vhat"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyAddSign"
   input_arg {
@@ -24613,41 +25486,109 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyCenteredRMSProp"
+  name: "ResourceApplyCenteredRMSProp"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mg"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "momentum"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr_power"
     type_attr: "T"
   }
   attr {
@@ -24685,7 +25626,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrl"
+  name: "ResourceApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -24714,6 +25655,10 @@ op {
     name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr_power"
     type_attr: "T"
@@ -24753,41 +25698,17 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrlV2"
+  name: "ResourceApplyGradientDescent"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "delta"
     type_attr: "T"
   }
   attr {
@@ -24825,17 +25746,25 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ResourceApplyKerasMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   attr {
@@ -24870,6 +25799,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
@@ -26143,6 +27079,83 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceSparseApplyKerasMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceSparseApplyMomentum"
   input_arg {
@@ -27242,52 +28255,6 @@ op {
     }
   }
 }
-op {
-  name: "ScanDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "initial_state"
-    type_list_attr: "Tstate"
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Tstate"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ScatterAdd"
   input_arg {
@@ -27860,27 +28827,69 @@ op {
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ScatterUpdate"
+  input_arg {
+    name: "ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_ref"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
   }
   attr {
     name: "Tindices"
@@ -27896,65 +28905,142 @@ op {
     name: "use_locking"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
 }
 op {
-  name: "ScatterUpdate"
+  name: "SdcaFprint"
   input_arg {
-    name: "ref"
-    type_attr: "T"
-    is_ref: true
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
   }
+}
+op {
+  name: "SdcaOptimizer"
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "sparse_example_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
   }
   input_arg {
-    name: "updates"
-    type_attr: "T"
+    name: "sparse_feature_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_feature_values"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features_with_values"
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "example_labels"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  input_arg {
+    name: "dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
+  }
+  input_arg {
+    name: "example_state_data"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output_ref"
-    type_attr: "T"
-    is_ref: true
+    name: "out_example_state_data"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "out_delta_sparse_weights"
+    type: DT_FLOAT
+    number_attr: "num_sparse_features"
+  }
+  output_arg {
+    name: "out_delta_dense_weights"
+    type: DT_FLOAT
+    number_attr: "num_dense_features"
   }
   attr {
-    name: "Tindices"
-    type: "type"
+    name: "loss_type"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "logistic_loss"
+        s: "squared_loss"
+        s: "hinge_loss"
+        s: "smooth_hinge_loss"
+        s: "poisson_loss"
       }
     }
   }
   attr {
-    name: "use_locking"
+    name: "adaptative"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
-}
-op {
-  name: "SdcaFprint"
-  input_arg {
-    name: "input"
-    type: DT_STRING
+  attr {
+    name: "num_sparse_features"
+    type: "int"
+    has_minimum: true
   }
-  output_arg {
-    name: "output"
-    type: DT_INT64
+  attr {
+    name: "num_sparse_features_with_values"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_dense_features"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "num_loss_partitions"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_inner_iterations"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "SdcaOptimizer"
+  name: "SdcaOptimizerV2"
   input_arg {
     name: "sparse_example_indices"
     type: DT_INT64
@@ -28030,7 +29116,7 @@ op {
     }
   }
   attr {
-    name: "adaptative"
+    name: "adaptive"
     type: "bool"
     default_value {
       b: false
@@ -28606,42 +29692,6 @@ op {
     }
   }
 }
-op {
-  name: "SetStatsAggregatorDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "stats_aggregator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "counter_prefix"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Shape"
   input_arg {
@@ -28952,17 +30002,6 @@ op {
     }
   }
 }
-op {
-  name: "SinkDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-}
 op {
   name: "Size"
   input_arg {
@@ -29116,41 +30155,6 @@ op {
     }
   }
 }
-op {
-  name: "SlideDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "window_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_shift"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_stride"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "Snapshot"
   input_arg {
@@ -32341,38 +33345,6 @@ op {
     }
   }
 }
-op {
-  name: "SqlDataset"
-  input_arg {
-    name: "driver_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "data_source_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "query"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Sqrt"
   input_arg {
@@ -32851,6 +33823,27 @@ op {
     name: "f"
     type: "func"
   }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
   is_stateful: true
 }
 op {
@@ -32978,6 +33971,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -33033,6 +34027,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -33065,6 +34060,62 @@ op {
     }
   }
 }
+op {
+  name: "StatelessRandomUniformInt"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seed"
+    type_attr: "Tseed"
+  }
+  input_arg {
+    name: "minval"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxval"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "Tseed"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "StatelessTruncatedNormal"
   input_arg {
@@ -33088,6 +34139,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -33185,40 +34237,6 @@ op {
     }
   }
 }
-op {
-  name: "StatsAggregatorHandle"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "StatsAggregatorSummary"
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  is_stateful: true
-}
 op {
   name: "StopGradient"
   input_arg {
@@ -35195,6 +36213,127 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorForestCreateTreeVariable"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeDeserialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeIsInitializedOp"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreePredict"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSerialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -35433,47 +36572,179 @@ op {
   }
 }
 op {
-  name: "TensorListSetItem"
+  name: "TensorListSetItem"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "item"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListSplit"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorListStack"
   input_arg {
     name: "input_handle"
     type: DT_VARIANT
   }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "num_elements"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "TensorScatterAdd"
   input_arg {
-    name: "index"
-    type: DT_INT32
+    name: "tensor"
+    type_attr: "T"
   }
   input_arg {
-    name: "item"
-    type_attr: "element_dtype"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
-    name: "output_handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "element_dtype"
+    name: "T"
     type: "type"
   }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "TensorListStack"
+  name: "TensorScatterSub"
   input_arg {
-    name: "input_handle"
-    type: DT_VARIANT
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
   }
   output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorScatterUpdate"
+  input_arg {
     name: "tensor"
-    type_attr: "element_dtype"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "element_dtype"
+    name: "T"
     type: "type"
   }
   attr {
-    name: "num_elements"
-    type: "int"
-    default_value {
-      i: -1
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
 }
@@ -36094,29 +37365,6 @@ op {
     type: "type"
   }
 }
-op {
-  name: "UnbatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "UnbatchGrad"
   input_arg {
@@ -36158,6 +37406,104 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecodeWithOffsets"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "char_to_byte_starts"
+    type: DT_INT64
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "UnicodeEncode"
+  input_arg {
+    name: "input_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "ignore"
+        s: "replace"
+        s: "strict"
+      }
+    }
+  }
+  attr {
+    name: "output_encoding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "UTF-8"
+        s: "UTF-16-BE"
+        s: "UTF-32-BE"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+}
 op {
   name: "UnicodeScript"
   input_arg {
@@ -36169,6 +37515,60 @@ op {
     type: DT_INT32
   }
 }
+op {
+  name: "UnicodeTranscode"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "output_encoding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "UTF-8"
+        s: "UTF-16-BE"
+        s: "UTF-32-BE"
+      }
+    }
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "UniformCandidateSampler"
   input_arg {
@@ -36753,6 +38153,17 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "UnwrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "UpperBound"
   input_arg {
@@ -37071,6 +38482,17 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "WriteAudioSummary"
   input_arg {
diff --git a/tensorflow/core/ops/ragged_array_ops.cc b/tensorflow/core/ops/ragged_array_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..464257993995907730a39409b2121c7b8bd60d37
--- /dev/null
+++ b/tensorflow/core/ops/ragged_array_ops.cc
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+Status RaggedGatherShapeFn(InferenceContext* c);
+
+//==============================================================================
+// Registered Ops
+//==============================================================================
+
+REGISTER_OP("RaggedGather")
+    .Input("params_nested_splits: PARAMS_RAGGED_RANK * int64")
+    .Input("params_dense_values: Tvalues")
+    .Input("indices: Tindices")
+    .Output("output_nested_splits: OUTPUT_RAGGED_RANK * int64")
+    .Output("output_dense_values: Tvalues")
+    .Attr("Tvalues: type")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("PARAMS_RAGGED_RANK: int >= 1")
+    .Attr("OUTPUT_RAGGED_RANK: int >= 0")
+    .SetShapeFn(RaggedGatherShapeFn);
+
+//==============================================================================
+// Shape Functions
+//==============================================================================
+
+Status RaggedGatherShapeFn(InferenceContext* c) {
+  int num_splits;
+  int64 PARAMS_RAGGED_RANK;
+  TF_RETURN_IF_ERROR(
+      c->GetAttr<int64>("PARAMS_RAGGED_RANK", &PARAMS_RAGGED_RANK));
+  TF_RETURN_IF_ERROR(c->GetAttr<int>("OUTPUT_RAGGED_RANK", &num_splits));
+
+  // Check rank of `indices`.
+  ShapeHandle indices = c->input(PARAMS_RAGGED_RANK + 1);
+  TF_RETURN_IF_ERROR(
+      c->WithRank(indices, num_splits - PARAMS_RAGGED_RANK + 1, &indices));
+
+  // Check that all params_nested_splits have rank 1.
+  for (int64 i = 0; i < PARAMS_RAGGED_RANK; ++i) {
+    ShapeHandle splits = c->input(i);
+    TF_RETURN_IF_ERROR(c->WithRank(splits, 1, &splits));
+  }
+
+  // Check that `params_dense_values` has rank>=1.
+  ShapeHandle params_dense_values = c->input(PARAMS_RAGGED_RANK);
+  TF_RETURN_IF_ERROR(
+      c->WithRankAtLeast(params_dense_values, 1, &params_dense_values));
+
+  // Set the rank for the `splits` outputs.
+  for (int i = 0; i < num_splits; ++i) {
+    c->set_output(i, c->UnknownShapeOfRank(1));
+  }
+
+  // Calculate the `values` shape.
+  ShapeHandle value = c->UnknownShape();
+  ShapeHandle values = c->UnknownShape();
+  TF_RETURN_IF_ERROR(c->Subshape(params_dense_values, 1, &value));
+  TF_RETURN_IF_ERROR(c->Concatenate(c->UnknownShapeOfRank(1), value, &values));
+  c->set_output(num_splits, values);
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/ragged_conversion_ops.cc b/tensorflow/core/ops/ragged_conversion_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90fd51717fa262f6291f232f1f15976b139eaf69
--- /dev/null
+++ b/tensorflow/core/ops/ragged_conversion_ops.cc
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+Status RaggedTensorToSparseShapeFn(InferenceContext* c);
+
+//==============================================================================
+// Registered Ops
+//==============================================================================
+
+REGISTER_OP("RaggedTensorToSparse")
+    .Input("rt_nested_splits: RAGGED_RANK * int64")
+    .Input("rt_dense_values: T")
+    .Output("sparse_indices: int64")
+    .Output("sparse_values: T")
+    .Output("sparse_dense_shape: int64")
+    .Attr("RAGGED_RANK: int >= 1")
+    .Attr("T: type")
+    .SetShapeFn(RaggedTensorToSparseShapeFn);
+
+//==============================================================================
+// Shape Functions
+//==============================================================================
+
+Status RaggedTensorToSparseShapeFn(InferenceContext* c) {
+  int64 num_splits;
+  TF_RETURN_IF_ERROR(c->GetAttr<int64>("RAGGED_RANK", &num_splits));
+  // TODO(b/112274756): Allow ragged_rank to be 0.
+  if (num_splits < 1) {
+    return errors::InvalidArgument("Requires RAGGED_RANK>0");
+  }
+  ShapeHandle rt_dense_values = c->input(num_splits);
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(rt_dense_values, 1, &rt_dense_values));
+
+  // Check that all rt_nested_splits have rank 1.
+  for (int64 i = 0; i < num_splits; ++i) {
+    ShapeHandle splits = c->input(i);
+    TF_RETURN_IF_ERROR(c->WithRank(splits, 1, &splits));
+  }
+
+  DimensionHandle dense_dims =
+      c->RankKnown(rt_dense_values)
+          ? c->MakeDim(c->Rank(rt_dense_values) + num_splits)
+          : c->UnknownDim();
+  DimensionHandle num_values = c->NumElements(rt_dense_values);
+
+  c->set_output(0, c->Matrix(num_values, dense_dims));  // indices
+  c->set_output(1, c->Vector(num_values));              // values
+  c->set_output(2, c->Vector(dense_dims));              // dense_shape
+
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/ragged_math_ops.cc b/tensorflow/core/ops/ragged_math_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d739c697981a066283dce502cf95c54a96034557
--- /dev/null
+++ b/tensorflow/core/ops/ragged_math_ops.cc
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+
+Status RaggedRangeShapeFn(InferenceContext* c);
+
+//==============================================================================
+// Registered Ops
+//==============================================================================
+
+REGISTER_OP("RaggedRange")
+    .Input("starts: T")
+    .Input("limits: T")
+    .Input("deltas: T")
+    .Output("rt_nested_splits: int64")
+    .Output("rt_dense_values: T")
+    .Attr("T: {bfloat16, float, double, int32, int64} = DT_INT32")
+    .SetShapeFn(RaggedRangeShapeFn);
+
+//==============================================================================
+// Shape Functions
+//==============================================================================
+
+Status RaggedRangeShapeFn(InferenceContext* c) {
+  // Check that all inputs (starts, limits, and deltas) have rank 0 or 1.
+  ShapeHandle starts = c->input(0);
+  ShapeHandle limits = c->input(1);
+  ShapeHandle deltas = c->input(2);
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(starts, 1, &starts));
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(limits, 1, &limits));
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(deltas, 1, &deltas));
+
+  // For the inputs with rank 1, make sure shapes match.
+  DimensionHandle dim = c->UnknownDim();
+  if (c->Rank(starts) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(c->Dim(starts, 0), dim, &dim));
+  }
+  if (c->Rank(limits) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(c->Dim(limits, 0), dim, &dim));
+  }
+  if (c->Rank(deltas) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(c->Dim(deltas, 0), dim, &dim));
+  }
+
+  // If any input shape is known, then calculate `rt_nested_splits` shape.
+  int64 rt_nested_splits_dim = InferenceContext::kUnknownDim;
+  if (c->ValueKnown(dim)) {
+    rt_nested_splits_dim = c->Value(dim) + 1;
+  } else if (c->Rank(starts) == 0 && c->Rank(limits) == 0 &&
+             c->Rank(deltas) == 0) {
+    rt_nested_splits_dim = 2;
+  }
+  c->set_output(0, c->Vector(rt_nested_splits_dim));
+
+  // `rt_dense_values` is rank 1, but size can't be calculated statically.
+  c->set_output(1, c->UnknownShapeOfRank(1));
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index adc9cd14861ba021bc0035931b844a1f760fe12c..65bdde375bf07f8a43d682dd6ff58bc89ef80f68 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -216,7 +216,8 @@ REGISTER_OP("VarIsInitializedOp")
 Status VariableShapeShapeFn(InferenceContext* c) {
   auto* handle_data = c->input_handle_shapes_and_types(0);
   if (handle_data == nullptr || handle_data->empty()) {
-    return errors::InvalidArgument("Handle doesn't have shape information.");
+    c->set_output(0, c->Vector(c->UnknownDim()));
+    return Status::OK();
   }
   ShapeHandle var_shape = (*handle_data)[0].shape;
   int64 rank = c->RankKnown(var_shape) ? c->Rank(var_shape)
diff --git a/tensorflow/core/ops/sdca_ops.cc b/tensorflow/core/ops/sdca_ops.cc
index fdf53a55dd8b4262efd4528066066bdd25cf7b68..51d248f2d6b6272a6836eb9090f49cb8b41e1f0b 100644
--- a/tensorflow/core/ops/sdca_ops.cc
+++ b/tensorflow/core/ops/sdca_ops.cc
@@ -65,6 +65,34 @@ REGISTER_OP("SdcaOptimizer")
     .Output("out_delta_dense_weights: num_dense_features * float")
     .SetShapeFn(ApplySdcaOptimizerShapeFn);
 
+// The SdcaOptimizerV2 op fixes the "adaptative" typo in v1.
+REGISTER_OP("SdcaOptimizerV2")
+    .Attr(
+        "loss_type: {'logistic_loss', 'squared_loss', 'hinge_loss',"
+        "'smooth_hinge_loss', 'poisson_loss'}")
+    .Attr("adaptive : bool=false")
+    .Attr("num_sparse_features: int >= 0")
+    .Attr("num_sparse_features_with_values: int >= 0")
+    .Attr("num_dense_features: int >= 0")
+    .Attr("l1: float")
+    .Attr("l2: float")
+    .Attr("num_loss_partitions: int >= 1")
+    .Attr("num_inner_iterations: int >= 1")
+    .Input("sparse_example_indices: num_sparse_features * int64")
+    .Input("sparse_feature_indices: num_sparse_features * int64")
+    .Input("sparse_feature_values: num_sparse_features_with_values * float")
+    .Input("dense_features: num_dense_features * float")
+    .Input("example_weights: float")
+    .Input("example_labels: float")
+    .Input("sparse_indices: num_sparse_features * int64")
+    .Input("sparse_weights: num_sparse_features * float")
+    .Input("dense_weights: num_dense_features * float")
+    .Input("example_state_data: float")
+    .Output("out_example_state_data: float")
+    .Output("out_delta_sparse_weights: num_sparse_features * float")
+    .Output("out_delta_dense_weights: num_dense_features * float")
+    .SetShapeFn(ApplySdcaOptimizerShapeFn);
+
 REGISTER_OP("SdcaShrinkL1")
     .Attr("num_features: int >= 0")
     .Attr("l1: float")
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index bc0cb2095dabf366e85106770c56a2f169f040c8..de08a1078458c236520924f52450fa8b4dc6f18a 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -401,7 +401,7 @@ REGISTER_OP("SparseReduceMax")
     .Attr("keep_dims: bool = False")
     .Output("output: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnknownShape);
+    .SetShapeFn(shape_inference::SparseReduceShapeFn);
 
 REGISTER_OP("SparseReduceMaxSparse")
     .Input("input_indices: int64")
@@ -423,7 +423,7 @@ REGISTER_OP("SparseReduceSum")
     .Attr("keep_dims: bool = False")
     .Output("output: T")
     .Attr("T: numbertype")
-    .SetShapeFn(shape_inference::UnknownShape);
+    .SetShapeFn(shape_inference::SparseReduceShapeFn);
 
 REGISTER_OP("SparseReduceSumSparse")
     .Input("input_indices: int64")
diff --git a/tensorflow/core/ops/sparse_ops_test.cc b/tensorflow/core/ops/sparse_ops_test.cc
index 6a9b5ce4d31fcd03a69a53893689d67ba5b2b9e7..00283c59932c579046a166e90531c1b8a740f4ab 100644
--- a/tensorflow/core/ops/sparse_ops_test.cc
+++ b/tensorflow/core/ops/sparse_ops_test.cc
@@ -133,6 +133,13 @@ TEST(SparseOpsTest, SparseToDense_ShapeFn) {
 
 TEST(SparseOpsTest, SparseReduceSum_ShapeFn) {
   ShapeInferenceTestOp op("SparseReduceSum");
+  TF_ASSERT_OK(NodeDefBuilder("test", "SparseReduceSum")
+                   .Input({"input_indices", 0, DT_INT64})
+                   .Input({"input_values", 1, DT_INT64})
+                   .Input({"input_shape", 2, DT_INT64})
+                   .Input({"reduction_axes", 3, DT_INT32})
+                   .Attr("keep_dims", false)
+                   .Finalize(&op.node_def));
 
   // Shape fn always yields unknown.
   INFER_OK(op, "?;?;?;?", "?");
diff --git a/tensorflow/core/ops/stateless_random_grad.cc b/tensorflow/core/ops/stateless_random_grad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..331e1d0152178222743f3ead9c724327e1329d83
--- /dev/null
+++ b/tensorflow/core/ops/stateless_random_grad.cc
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/function.h"
+
+namespace tensorflow {
+REGISTER_OP_NO_GRADIENT("StatelessRandomUniform");
+REGISTER_OP_NO_GRADIENT("StatelessRandomNormal");
+REGISTER_OP_NO_GRADIENT("StatelessTruncatedNormal");
+REGISTER_OP_NO_GRADIENT("StatelessMultinomial");
+}  // end namespace tensorflow
diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc
index 742709fb1836a0f7e3f0bd94f3dbc3e15423a271..f919a21d607f86c84ff00688dea9cc67f029ceea 100644
--- a/tensorflow/core/ops/stateless_random_ops.cc
+++ b/tensorflow/core/ops/stateless_random_ops.cc
@@ -19,42 +19,55 @@ limitations under the License.
 namespace tensorflow {
 
 using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-static Status StatelessShape(shape_inference::InferenceContext* context) {
+static Status StatelessShape(InferenceContext* c) {
   // Check seed shape
   ShapeHandle seed;
-  TF_RETURN_IF_ERROR(context->WithRank(context->input(1), 1, &seed));
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &seed));
   DimensionHandle unused;
-  TF_RETURN_IF_ERROR(context->WithValue(context->Dim(seed, 0), 2, &unused));
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(seed, 0), 2, &unused));
 
   // Set output shape
   ShapeHandle out;
-  TF_RETURN_IF_ERROR(context->MakeShapeFromShapeTensor(0, &out));
-  context->set_output(0, out);
+  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+  c->set_output(0, out);
   return Status::OK();
 }
 
-#define REGISTER_STATELESS_OP(name)                  \
-  REGISTER_OP(name)                                  \
-      .Input("shape: T")                             \
-      .Input("seed: Tseed")                          \
-      .Output("output: dtype")                       \
-      .Attr("dtype: {half,float,double} = DT_FLOAT") \
-      .Attr("T: {int32, int64} = DT_INT32")          \
-      .Attr("Tseed: {int32, int64} = DT_INT64")      \
+#define REGISTER_STATELESS_OP(name)                           \
+  REGISTER_OP(name)                                           \
+      .Input("shape: T")                                      \
+      .Input("seed: Tseed")                                   \
+      .Output("output: dtype")                                \
+      .Attr("dtype: {half,bfloat16,float,double} = DT_FLOAT") \
+      .Attr("T: {int32, int64} = DT_INT32")                   \
+      .Attr("Tseed: {int32, int64} = DT_INT64")               \
       .SetShapeFn(StatelessShape)
 
-// This op is exposed through contrib/stateless only.  The interface may change.
 REGISTER_STATELESS_OP("StatelessRandomUniform");
-
-// This op is exposed through contrib/stateless only.  The interface may change.
 REGISTER_STATELESS_OP("StatelessRandomNormal");
-
-// This op is exposed through contrib/stateless only.  The interface may change.
 REGISTER_STATELESS_OP("StatelessTruncatedNormal");
 
-// This op is exposed through contrib/stateless only.  The interface may change.
+#undef REGISTER_STATELESS_OP
+
+REGISTER_OP("StatelessRandomUniformInt")
+    .Input("shape: T")
+    .Input("seed: Tseed")
+    .Input("minval: dtype")
+    .Input("maxval: dtype")
+    .Output("output: dtype")
+    .Attr("dtype: {int32, int64}")
+    .Attr("T: {int32, int64}")
+    .Attr("Tseed: {int32, int64} = DT_INT64")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return StatelessShape(c);
+    });
+
 REGISTER_OP("StatelessMultinomial")
     .Input("logits: T")
     .Input("num_samples: int32")
@@ -80,6 +93,4 @@ REGISTER_OP("StatelessMultinomial")
       return Status::OK();
     });
 
-#undef REGISTER_STATELESS_OP
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 94d71a4113e94d0e6e7874dcadf7846c8f2e678c..8ea74f1d43e5baa3f14398e6ea17c19466ea2973 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -13,13 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+#include <vector>
+
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
+namespace shape_inference {
+class InferenceContext;
+}  // namespace shape_inference
+
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
@@ -250,4 +261,62 @@ REGISTER_OP("UnicodeScript")
     .Output("output: int32")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("UnicodeEncode")
+    .Input("input_values: int32")
+    .Input("input_splits: int64")
+    .Attr("errors: {'ignore', 'replace', 'strict'} = 'replace'")
+    .Attr("output_encoding: {'UTF-8', 'UTF-16-BE', 'UTF-32-BE'}")
+    .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
+    .Output("output: string")
+    .SetShapeFn([](InferenceContext* c) {
+      // Check rank of inner values
+      ShapeHandle input_inner_values_shape = c->input(0);
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(input_inner_values_shape, 1, &unused));
+
+      // Check rank of input_splits
+      ShapeHandle splits_shape = c->input(1);
+      TF_RETURN_IF_ERROR(c->WithRank(splits_shape, 1, &unused));
+
+      // Output shape is a 1-D tensor with size equal to number of splits.
+      std::vector<DimensionHandle> dims(1);
+      TF_RETURN_IF_ERROR(c->Subtract(c->Dim(splits_shape, 0), 1, &dims[0]));
+      c->set_output(0, c->MakeShape(dims));
+
+      return Status::OK();
+    });
+
+REGISTER_OP("UnicodeTranscode")
+    .Input("input: string")
+    .Output("output: string")
+    .Attr("input_encoding: string")
+    .Attr("output_encoding: {'UTF-8', 'UTF-16-BE', 'UTF-32-BE'}")
+    .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'")
+    .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
+    .Attr("replace_control_characters: bool = false")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("UnicodeDecodeWithOffsets")
+    .Input("input: string")
+    .Output("row_splits: int64")
+    .Output("char_values: int32")
+    .Output("char_to_byte_starts: int64")
+    .Attr("input_encoding: string")
+    .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'")
+    .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
+    .Attr("replace_control_characters: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      // row_splits.shape == [input.size() + 1]
+      DimensionHandle num_row_splits;
+      DimensionHandle input_size = c->NumElements(c->input(0));
+      TF_RETURN_IF_ERROR(c->Add(input_size, 1, &num_row_splits));
+      c->set_output(0, c->Vector(num_row_splits));
+
+      // char_values.shape == offset_values.shape == [num_chars]
+      DimensionHandle num_chars = c->UnknownDim();
+      c->set_output(1, c->Vector(num_chars));
+      c->set_output(2, c->Vector(num_chars));
+      return Status::OK();
+    });
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/tensor_forest_ops.cc b/tensorflow/core/ops/tensor_forest_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4b6ba318e9d981af2797a54eca7f9caf049f6b0
--- /dev/null
+++ b/tensorflow/core/ops/tensor_forest_ops.cc
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_RESOURCE_HANDLE_OP(TensorForestTreeResource);
+
+REGISTER_OP("TensorForestTreeIsInitializedOp")
+    .Input("tree_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorForestCreateTreeVariable")
+    .Input("tree_handle: resource")
+    .Input("tree_config: string")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs);
+
+REGISTER_OP("TensorForestTreeSerialize")
+    .Input("tree_handle: resource")
+    .Output("tree_config: string")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape);
+
+REGISTER_OP("TensorForestTreeDeserialize")
+    .Input("tree_handle: resource")
+    .Input("tree_config: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorForestTreeSize")
+    .Input("tree_handle: resource")
+    .Output("tree_size: int32")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape);
+
+REGISTER_OP("TensorForestTreePredict")
+    .Attr("logits_dimension: int")
+    .Input("tree_handle: resource")
+    .Input("dense_features: float")
+    .Output("logits: float")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle shape_handle;
+      shape_inference::DimensionHandle batch_size = c->UnknownDim();
+
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &shape_handle));
+
+      batch_size = c->Dim(shape_handle, 0);
+
+      int logits_dimension;
+      TF_RETURN_IF_ERROR(c->GetAttr("logits_dimension", &logits_dimension));
+      c->set_output(0, c->Matrix(batch_size, logits_dimension));
+      return Status::OK();
+    });
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 94ff092a85d512e602da5e97fc3007d4c68c5937..995ed42d53dd286e5068f0067b35849c4e36e64b 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -685,6 +685,34 @@ REGISTER_OP("ResourceSparseApplyMomentum")
       return ApplyMomentumShapeFn(c, true /* sparse */);
     });
 
+REGISTER_OP("ResourceApplyKerasMomentum")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("lr: T")
+    .Input("grad: T")
+    .Input("momentum: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyMomentumShapeFn(c, false /* sparse */);
+    });
+
+REGISTER_OP("ResourceSparseApplyKerasMomentum")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("lr: T")
+    .Input("grad: T")
+    .Input("indices: Tindices")
+    .Input("momentum: T")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyMomentumShapeFn(c, true /* sparse */);
+    });
+
 static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
@@ -741,6 +769,44 @@ REGISTER_OP("ResourceApplyAdam")
       return ApplyAdamShapeFn(c, false /* sparse */);
     });
 
+static Status ApplyAdamWithAmsgradShapeFn(InferenceContext* c, bool sparse) {
+  ShapeHandle unused;
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s));  // v
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 3), &s));  // vhat
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // beta1_power
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta2_power
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));       // lr
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));       // beta1
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));       // beta2
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(9), 0, &unused));       // epsilon
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 10 /* grad_idx */, &s));
+  if (c->num_outputs() > 0) {
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("ResourceApplyAdamWithAmsgrad")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("v: resource")
+    .Input("vhat: resource")
+    .Input("beta1_power: T")
+    .Input("beta2_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdamWithAmsgradShapeFn(c, false /* sparse */);
+    });
+
 static Status ApplyAdaMaxShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index 6ffe51e89774a09ed7ad5ecca22cfbb3b3e1ffdc..e15400780af0880caadd2f79b7322f39e406ca2b 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -135,8 +135,7 @@ Status GoogleAuthProvider::GetToken(string* t) {
   mutex_lock lock(mu_);
   const uint64 now_sec = env_->NowSeconds();
 
-  if (!current_token_.empty() &&
-      now_sec + kExpirationTimeMarginSec < expiration_timestamp_sec_) {
+  if (now_sec + kExpirationTimeMarginSec < expiration_timestamp_sec_) {
     *t = current_token_;
     return Status::OK();
   }
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index 9d00aa7b7feadec3a2a2861247c5d536a4237c3e..2efe0c0876e871f6752bb3e7724de4c505102130 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -41,7 +41,7 @@ void CheckFeatureOrDie(CPUFeature feature, const string& feature_name) {
   }
 }
 
-// Check if CPU feature is inclued in the TensorFlow binary.
+// Check if CPU feature is included in the TensorFlow binary.
 void CheckIfFeatureUnused(CPUFeature feature, const string& feature_name,
                           string& missing_instructions) {
   if (TestCPUFeature(feature)) {
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index d884c1aa7c53d5f05ed54393acc9b4070b68d99b..04287151301dd0c6eb25ec7bc8b12a207f44ab90 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -166,10 +166,10 @@ def cc_proto_library(
         proto_gen(
             name = name + "_genproto",
             srcs = srcs,
-            deps = [s + "_genproto" for s in deps],
             includes = includes,
             protoc = protoc,
             visibility = ["//visibility:public"],
+            deps = [s + "_genproto" for s in deps],
         )
 
         # An empty cc_library to make rule dependency consistent.
@@ -193,15 +193,15 @@ def cc_proto_library(
     proto_gen(
         name = name + "_genproto",
         srcs = srcs,
-        deps = [s + "_genproto" for s in deps],
+        outs = outs,
+        gen_cc = 1,
         includes = includes,
-        protoc = protoc,
         plugin = grpc_cpp_plugin,
         plugin_language = "grpc",
         plugin_options = plugin_options,
-        gen_cc = 1,
-        outs = outs,
+        protoc = protoc,
         visibility = ["//visibility:public"],
+        deps = [s + "_genproto" for s in deps],
     )
 
     if use_grpc_plugin:
@@ -286,14 +286,14 @@ def py_proto_library(
     proto_gen(
         name = name + "_genproto",
         srcs = srcs,
-        deps = [s + "_genproto" for s in deps],
-        includes = includes,
-        protoc = protoc,
-        gen_py = 1,
         outs = outs,
-        visibility = ["//visibility:public"],
+        gen_py = 1,
+        includes = includes,
         plugin = grpc_python_plugin,
         plugin_language = "grpc",
+        protoc = protoc,
+        visibility = ["//visibility:public"],
+        deps = [s + "_genproto" for s in deps],
     )
 
     if default_runtime and not default_runtime in py_libs + deps:
@@ -319,14 +319,9 @@ def tf_proto_library_cc(
         cc_grpc_version = None,
         j2objc_api_version = 1,
         cc_api_version = 2,
-        dart_api_version = 2,
-        java_api_version = 2,
-        py_api_version = 2,
-        js_api_version = 2,
         js_codegen = "jspb",
         default_header = False):
     js_codegen = js_codegen  # unused argument
-    js_api_version = js_api_version  # unused argument
     native.filegroup(
         name = name + "_proto_srcs",
         srcs = srcs + tf_deps(protodeps, "_proto_srcs"),
@@ -345,14 +340,13 @@ def tf_proto_library_cc(
         # libraries containing all the sources.
         proto_gen(
             name = cc_name + "_genproto",
-            deps = [s + "_genproto" for s in cc_deps],
             protoc = "@protobuf_archive//:protoc",
             visibility = ["//visibility:public"],
+            deps = [s + "_genproto" for s in cc_deps],
         )
         native.cc_library(
             name = cc_name,
-            deps = cc_deps + ["@protobuf_archive//:protobuf_headers"] +
-                   if_static([name + "_cc_impl"]),
+            deps = cc_deps + ["@protobuf_archive//:protobuf_headers"] + if_static([name + "_cc_impl"]),
             testonly = testonly,
             visibility = visibility,
         )
@@ -365,8 +359,8 @@ def tf_proto_library_cc(
 
     cc_proto_library(
         name = cc_name,
+        testonly = testonly,
         srcs = srcs,
-        deps = cc_deps + ["@protobuf_archive//:cc_wkt_protos"],
         cc_libs = cc_libs + if_static(
             ["@protobuf_archive//:protobuf"],
             ["@protobuf_archive//:protobuf_headers"],
@@ -376,11 +370,11 @@ def tf_proto_library_cc(
             "-Wno-unused-but-set-variable",
             "-Wno-sign-compare",
         ]),
+        default_header = default_header,
         protoc = "@protobuf_archive//:protoc",
         use_grpc_plugin = use_grpc_plugin,
-        testonly = testonly,
         visibility = visibility,
-        default_header = default_header,
+        deps = cc_deps + ["@protobuf_archive//:cc_wkt_protos"],
     )
 
 def tf_proto_library_py(
@@ -399,9 +393,9 @@ def tf_proto_library_py(
         # libraries containing all the sources.
         proto_gen(
             name = py_name + "_genproto",
-            deps = [s + "_genproto" for s in py_deps],
             protoc = "@protobuf_archive//:protoc",
             visibility = ["//visibility:public"],
+            deps = [s + "_genproto" for s in py_deps],
         )
         native.py_library(
             name = py_name,
@@ -413,14 +407,14 @@ def tf_proto_library_py(
 
     py_proto_library(
         name = py_name,
+        testonly = testonly,
         srcs = srcs,
-        srcs_version = srcs_version,
-        deps = deps + py_deps + ["@protobuf_archive//:protobuf_python"],
-        protoc = "@protobuf_archive//:protoc",
         default_runtime = "@protobuf_archive//:protobuf_python",
-        visibility = visibility,
-        testonly = testonly,
+        protoc = "@protobuf_archive//:protoc",
+        srcs_version = srcs_version,
         use_grpc_plugin = use_grpc_plugin,
+        visibility = visibility,
+        deps = deps + py_deps + ["@protobuf_archive//:protobuf_python"],
     )
 
 def tf_jspb_proto_library(**kwargs):
@@ -439,36 +433,32 @@ def tf_proto_library(
         cc_libs = [],
         cc_api_version = 2,
         cc_grpc_version = None,
-        dart_api_version = 2,
         j2objc_api_version = 1,
-        java_api_version = 2,
-        py_api_version = 2,
-        js_api_version = 2,
         js_codegen = "jspb",
         provide_cc_alias = False,
         default_header = False):
     """Make a proto library, possibly depending on other proto libraries."""
-    _ignore = (js_api_version, js_codegen, provide_cc_alias)
+    _ignore = (js_codegen, provide_cc_alias)
 
     tf_proto_library_cc(
         name = name,
+        testonly = testonly,
         srcs = srcs,
-        protodeps = protodeps,
         cc_grpc_version = cc_grpc_version,
         cc_libs = cc_libs,
-        testonly = testonly,
-        visibility = visibility,
         default_header = default_header,
+        protodeps = protodeps,
+        visibility = visibility,
     )
 
     tf_proto_library_py(
         name = name,
+        testonly = testonly,
         srcs = srcs,
         protodeps = protodeps,
         srcs_version = "PY2AND3",
-        testonly = testonly,
-        visibility = visibility,
         use_grpc_plugin = has_services,
+        visibility = visibility,
     )
 
 # A list of all files under platform matching the pattern in 'files'. In
@@ -553,6 +543,9 @@ def tf_additional_proto_srcs():
 def tf_additional_human_readable_json_deps():
     return []
 
+def tf_additional_logger_deps():
+    return []
+
 def tf_additional_all_protos():
     return ["//tensorflow/core:protos_all"]
 
@@ -586,6 +579,9 @@ def tf_additional_device_tracer_cuda_deps():
 def tf_additional_device_tracer_deps():
     return []
 
+def tf_additional_device_tracer_test_flags():
+    return []
+
 def tf_additional_libdevice_data():
     return []
 
@@ -632,23 +628,41 @@ def tf_additional_lib_deps():
 def tf_additional_core_deps():
     return select({
         "//tensorflow:android": [],
-        "//tensorflow:windows": [],
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_gcp_support": [],
         "//conditions:default": [
             "//tensorflow/core/platform/cloud:gcs_file_system",
-            "//tensorflow/core/platform/s3:s3_file_system",
+        ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_hdfs_support": [],
+        "//conditions:default": [
             "//tensorflow/core/platform/hadoop:hadoop_file_system",
         ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_aws_support": [],
+        "//conditions:default": [
+            "//tensorflow/core/platform/s3:s3_file_system",
+        ],
     })
 
 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_op_deps():
     return select({
         "//tensorflow:android": [],
-        "//tensorflow:windows": [],
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_gcp_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
             "//tensorflow/contrib/cloud:gcs_config_ops_op_lib",
@@ -659,9 +673,10 @@ def tf_additional_cloud_op_deps():
 def tf_additional_cloud_kernel_deps():
     return select({
         "//tensorflow:android": [],
-        "//tensorflow:windows": [],
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_gcp_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
             "//tensorflow/contrib/cloud/kernels:gcs_config_ops",
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index 83c65dbfa97a77400a7acaf2030a4ca51afd8728..8351362e05699c591b5563f2270928f4408077e8 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -132,7 +132,17 @@ class CUPTIManager {
  public:
   CUPTIManager() {
     cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper());
-    CUPTI_CALL(ActivityRegisterCallbacks(BufferRequested, BufferCompleted));
+  }
+
+  static CUPTIManager *Create() {
+    auto manager = absl::make_unique<CUPTIManager>();
+    CUptiResult status = manager->cupti_wrapper_->ActivityRegisterCallbacks(
+        BufferRequested, BufferCompleted);
+    if (status != CUPTI_SUCCESS) {
+      LOG(ERROR) << "Failed to initialize CUPTI: " << status;
+      return nullptr;
+    }
+    return manager.release();
   }
 
   // Enables tracing and delivers event callbacks to 'client'.
@@ -254,7 +264,7 @@ void CUPTIManager::InternalBufferCompleted(CUcontext ctx, uint32_t streamId,
 }
 
 CUPTIManager *GetCUPTIManager() {
-  static CUPTIManager *manager = new CUPTIManager();
+  static CUPTIManager *manager = CUPTIManager::Create();
   return manager;
 }
 
@@ -287,19 +297,16 @@ CUPTIManager *GetCUPTIManager() {
 // for the duration of the CUPTI API callback.
 TF_STATIC_THREAD_LOCAL_POD(const char *, tls_current_annotation);
 
-class DeviceTracerImpl : public DeviceTracer,
-                         public CUPTIClient,
-                         public tracing::TraceCollector {
+class TraceCollectorImpl : public tracing::TraceCollector {
  public:
-  DeviceTracerImpl();
-  ~DeviceTracerImpl() override;
+  TraceCollectorImpl() { tracing::SetTraceCollector(this); }
 
-  // DeviceTracer interface:
-  Status Start() override;
-  Status Stop() override;
-  Status Collect(StepStatsCollector *collector) override;
+  ~TraceCollectorImpl() override {
+    DCHECK(!active_trace_session_)
+        << "Unexpected active trace session detected. ";
+  }
 
-  // tracing::TraceCollector interface:
+  // Note the method can be called after a call to Stop().
   virtual std::unique_ptr<Handle> CreateAnnotationHandle(
       StringPiece name_part1, StringPiece name_part2) const {
     struct Impl : public tracing::TraceCollector::Handle {
@@ -322,8 +329,7 @@ class DeviceTracerImpl : public DeviceTracer,
   }
 
   bool IsEnabledForAnnotations() const override {
-    // We are always enabled for 'Annotations'.
-    return true;
+    return active_trace_session_.load(std::memory_order_relaxed);
   }
 
   bool IsEnabledForActivities(bool is_expensive) const override {
@@ -331,6 +337,36 @@ class DeviceTracerImpl : public DeviceTracer,
     return false;
   }
 
+  void Start() {
+    DCHECK(!active_trace_session_)
+        << "Unexpected active trace session detected. ";
+    active_trace_session_ = true;
+  }
+
+  void Stop() {
+    DCHECK(active_trace_session_) << "No active trace session detected. ";
+    active_trace_session_ = false;
+  }
+
+ private:
+  std::atomic<bool> active_trace_session_;
+};
+
+TraceCollectorImpl *GlobalDefaultTraceCollector() {
+  static auto *instance = new TraceCollectorImpl();
+  return instance;
+}
+
+class DeviceTracerImpl : public DeviceTracer, public CUPTIClient {
+ public:
+  DeviceTracerImpl(CUPTIManager *cupti_manager);
+  ~DeviceTracerImpl() override;
+
+  // DeviceTracer interface:
+  Status Start() override;
+  Status Stop() override;
+  Status Collect(StepStatsCollector *collector) override;
+
  protected:
   // This callback is used exclusively by CUPTIManager.
   friend class CUPTIManager;
@@ -389,10 +425,9 @@ class DeviceTracerImpl : public DeviceTracer,
   TF_DISALLOW_COPY_AND_ASSIGN(DeviceTracerImpl);
 };
 
-DeviceTracerImpl::DeviceTracerImpl() {
+DeviceTracerImpl::DeviceTracerImpl(CUPTIManager *cupti_manager)
+    : cupti_manager_(cupti_manager) {
   VLOG(1) << "DeviceTracer created.";
-  cupti_manager_ = GetCUPTIManager();
-  CHECK(cupti_manager_);
   cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper());
   enabled_ = false;
 }
@@ -421,7 +456,7 @@ Status DeviceTracerImpl::Start() {
   }
 
   // Register as a TraceEngine to receive ScopedAnnotations.
-  tracing::SetTraceCollector(this);
+  GlobalDefaultTraceCollector()->Start();
 
   // Intercept launch and memcpy calls to capture the Op name annotation.
   // TODO(pbar) Add callbacks for memcpy variants.
@@ -469,7 +504,8 @@ Status DeviceTracerImpl::Stop() {
     return Status::OK();
   }
   CUPTI_CALL(Unsubscribe(subscriber_));
-  tracing::SetTraceCollector(nullptr);
+  GlobalDefaultTraceCollector()->Stop();
+
   TF_RETURN_IF_ERROR(cupti_manager_->DisableTrace());
   end_walltime_us_ = NowInUsec();
   CUPTI_CALL(GetTimestamp(&end_timestamp_));
@@ -646,7 +682,12 @@ Status DeviceTracerImpl::Collect(StepStatsCollector *collector) {
 }  // namespace devicetracer
 
 std::unique_ptr<DeviceTracer> CreateDeviceTracer() {
-  std::unique_ptr<DeviceTracer> tracer(new devicetracer::DeviceTracerImpl());
+  devicetracer::CUPTIManager *cupti_manager = devicetracer::GetCUPTIManager();
+  if (cupti_manager == nullptr) {
+    return nullptr;
+  }
+  std::unique_ptr<DeviceTracer> tracer(
+      new devicetracer::DeviceTracerImpl(cupti_manager));
   return tracer;
 }
 
diff --git a/tensorflow/core/platform/default/human_readable_json.cc b/tensorflow/core/platform/default/human_readable_json.cc
index 6bf2106f6e5d38f61e0291817f5106437c541c19..bf9c7b76206b79ad43969a1e3e2de6e6cbdacc46 100644
--- a/tensorflow/core/platform/default/human_readable_json.cc
+++ b/tensorflow/core/platform/default/human_readable_json.cc
@@ -20,11 +20,15 @@ limitations under the License.
 
 namespace tensorflow {
 
-Status ProtoToHumanReadableJson(const ::google::protobuf::Message& proto,
+Status ProtoToHumanReadableJson(const protobuf::Message& proto,
                                 string* result) {
+#ifdef TENSORFLOW_LITE_PROTOS
+  *result = "[human readable output not available on Android]";
+  return Status::OK();
+#else
   result->clear();
 
-  auto status = google::protobuf::util::MessageToJsonString(proto, result);
+  auto status = protobuf::util::MessageToJsonString(proto, result);
   if (!status.ok()) {
     // Convert error_msg google::protobuf::StringPiece to
     // tensorflow::StringPiece.
@@ -34,10 +38,13 @@ Status ProtoToHumanReadableJson(const ::google::protobuf::Message& proto,
                         StringPiece(error_msg.data(), error_msg.length())));
   }
   return Status::OK();
+#endif
 }
 
-Status HumanReadableJsonToProto(const string& str,
-                                ::google::protobuf::Message* proto) {
+Status HumanReadableJsonToProto(const string& str, protobuf::Message* proto) {
+#ifdef TENSORFLOW_LITE_PROTOS
+  return errors::Internal("Cannot parse JSON protos on Android");
+#else
   proto->Clear();
   auto status = google::protobuf::util::JsonStringToMessage(str, proto);
   if (!status.ok()) {
@@ -49,6 +56,7 @@ Status HumanReadableJsonToProto(const string& str,
                         StringPiece(error_msg.data(), error_msg.length())));
   }
   return Status::OK();
+#endif
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/logger.cc b/tensorflow/core/platform/default/logger.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54b1a1a67ca7da65aa6897e6461ebe9b54fb4767
--- /dev/null
+++ b/tensorflow/core/platform/default/logger.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/logger.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+Logger* Logger::Singleton() {
+  class DefaultLogger : public Logger {
+   private:
+    void DoLogProto(google::protobuf::Any* proto) override {
+      VLOG(2) << proto->ShortDebugString();
+    }
+    void DoFlush() override {}
+  };
+  static Logger* instance = new DefaultLogger();
+  return instance;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index c6e5777c265137ca1b215e14a7be0c6422804b4b..26bd8542fd70f7a2565192e3626d3ac84f0edf5f 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -21,18 +21,18 @@ limitations under the License.
 #include <android/log.h>
 #include <iostream>
 #include <sstream>
-#include <cstring>
 #endif
 
 #include <stdlib.h>
+#include <string.h>
 #include <time.h>
 
+#include <string>
+#include <unordered_map>
+
 namespace tensorflow {
 namespace internal {
 
-LogMessage::LogMessage(const char* fname, int line, int severity)
-    : fname_(fname), line_(line), severity_(severity) {}
-
 #if defined(PLATFORM_POSIX_ANDROID)
 void LogMessage::GenerateLogMessage() {
   int android_log_level;
@@ -94,42 +94,133 @@ void LogMessage::GenerateLogMessage() {
 
 namespace {
 
+int ParseInteger(const char* str, size_t size) {
+  // Ideally we would use env_var / safe_strto64, but it is
+  // hard to use here without pulling in a lot of dependencies,
+  // so we use std:istringstream instead
+  string integer_str(str, size);
+  std::istringstream ss(integer_str);
+  int level = 0;
+  ss >> level;
+  return level;
+}
+
 // Parse log level (int64) from environment variable (char*)
 int64 LogLevelStrToInt(const char* tf_env_var_val) {
   if (tf_env_var_val == nullptr) {
     return 0;
   }
+  return ParseInteger(tf_env_var_val, strlen(tf_env_var_val));
+}
 
-  // Ideally we would use env_var / safe_strto64, but it is
-  // hard to use here without pulling in a lot of dependencies,
-  // so we use std:istringstream instead
-  string min_log_level(tf_env_var_val);
-  std::istringstream ss(min_log_level);
-  int64 level;
-  if (!(ss >> level)) {
-    // Invalid vlog level setting, set level to default (0)
-    level = 0;
+// Using StringPiece breaks Windows build.
+struct StringData {
+  struct Hasher {
+    size_t operator()(const StringData& sdata) const {
+      // For dependency reasons, we cannot use hash.h here. Use DBJHash instead.
+      size_t hash = 5381;
+      const char* data = sdata.data;
+      for (const char* top = data + sdata.size; data < top; ++data) {
+        hash = ((hash << 5) + hash) + (*data);
+      }
+      return hash;
+    }
+  };
+
+  StringData() = default;
+  StringData(const char* data, size_t size) : data(data), size(size) {}
+
+  bool operator==(const StringData& rhs) const {
+    return size == rhs.size && memcmp(data, rhs.data, size) == 0;
   }
 
-  return level;
+  const char* data = nullptr;
+  size_t size = 0;
+};
+
+using VmoduleMap = std::unordered_map<StringData, int, StringData::Hasher>;
+
+// Returns a mapping from module name to VLOG level, derived from the
+// TF_CPP_VMOUDLE environment variable; ownership is transferred to the caller.
+VmoduleMap* VmodulesMapFromEnv() {
+  // The value of the env var is supposed to be of the form:
+  //    "foo=1,bar=2,baz=3"
+  const char* env = getenv("TF_CPP_VMODULE");
+  if (env == nullptr) {
+    // If there is no TF_CPP_VMODULE configuration (most common case), return
+    // nullptr so that the ShouldVlogModule() API can fast bail out of it.
+    return nullptr;
+  }
+  // The memory returned by getenv() can be invalidated by following getenv() or
+  // setenv() calls. And since we keep references to it in the VmoduleMap in
+  // form of StringData objects, make a copy of it.
+  const char* env_data = strdup(env);
+  VmoduleMap* result = new VmoduleMap();
+  while (true) {
+    const char* eq = strchr(env_data, '=');
+    if (eq == nullptr) {
+      break;
+    }
+    const char* after_eq = eq + 1;
+
+    // Comma either points at the next comma delimiter, or at a null terminator.
+    // We check that the integer we parse ends at this delimiter.
+    const char* comma = strchr(after_eq, ',');
+    const char* new_env_data;
+    if (comma == nullptr) {
+      comma = strchr(after_eq, '\0');
+      new_env_data = comma;
+    } else {
+      new_env_data = comma + 1;
+    }
+    (*result)[StringData(env_data, eq - env_data)] =
+        ParseInteger(after_eq, comma - after_eq);
+    env_data = new_env_data;
+  }
+  return result;
 }
 
 }  // namespace
 
 int64 MinLogLevelFromEnv() {
+  // We don't want to print logs during fuzzing as that would slow fuzzing down
+  // by almost 2x. So, if we are in fuzzing mode (not just running a test), we
+  // return a value so that nothing is actually printed. Since LOG uses >=
+  // (see ~LogMessage in this file) to see if log messages need to be printed,
+  // the value we're interested on to disable printing is the maximum severity.
+  // See also http://llvm.org/docs/LibFuzzer.html#fuzzer-friendly-build-mode
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  return tensorflow::NUM_SEVERITIES;
+#else
   const char* tf_env_var_val = getenv("TF_CPP_MIN_LOG_LEVEL");
   return LogLevelStrToInt(tf_env_var_val);
+#endif
 }
 
 int64 MinVLogLevelFromEnv() {
+  // We don't want to print logs during fuzzing as that would slow fuzzing down
+  // by almost 2x. So, if we are in fuzzing mode (not just running a test), we
+  // return a value so that nothing is actually printed. Since VLOG uses <=
+  // (see VLOG_IS_ON in logging.h) to see if log messages need to be printed,
+  // the value we're interested on to disable printing is 0.
+  // See also http://llvm.org/docs/LibFuzzer.html#fuzzer-friendly-build-mode
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  return 0;
+#else
   const char* tf_env_var_val = getenv("TF_CPP_MIN_VLOG_LEVEL");
   return LogLevelStrToInt(tf_env_var_val);
+#endif
 }
 
+LogMessage::LogMessage(const char* fname, int line, int severity)
+    : fname_(fname), line_(line), severity_(severity) {}
+
 LogMessage::~LogMessage() {
   // Read the min log level once during the first call to logging.
   static int64 min_log_level = MinLogLevelFromEnv();
-  if (TF_PREDICT_TRUE(severity_ >= min_log_level)) GenerateLogMessage();
+  if (severity_ >= min_log_level) {
+    GenerateLogMessage();
+  }
 }
 
 int64 LogMessage::MinVLogLevel() {
@@ -137,6 +228,24 @@ int64 LogMessage::MinVLogLevel() {
   return min_vlog_level;
 }
 
+bool LogMessage::VmoduleActivated(const char* fname, int level) {
+  if (level <= MinVLogLevel()) {
+    return true;
+  }
+  static VmoduleMap* vmodules = VmodulesMapFromEnv();
+  if (TF_PREDICT_TRUE(vmodules == nullptr)) {
+    return false;
+  }
+  const char* last_slash = strrchr(fname, '/');
+  const char* module_start = last_slash == nullptr ? fname : last_slash + 1;
+  const char* dot_after = strchr(module_start, '.');
+  const char* module_limit =
+      dot_after == nullptr ? strchr(fname, '\0') : dot_after;
+  StringData module(module_start, module_limit - module_start);
+  auto it = vmodules->find(module);
+  return it != vmodules->end() && it->second >= level;
+}
+
 LogMessageFatal::LogMessageFatal(const char* file, int line)
     : LogMessage(file, line, FATAL) {}
 LogMessageFatal::~LogMessageFatal() {
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index 08a692fff75c79a5602d252908284925325deb76..bb8735ed32505294eff75620006694a4eda80bcc 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -46,6 +46,17 @@ class LogMessage : public std::basic_ostringstream<char> {
   // but VLOG(3) will not. Defaults to 0.
   static int64 MinVLogLevel();
 
+  // Returns whether VLOG level lvl is activated for the file fname.
+  //
+  // E.g. if the environment variable TF_CPP_VMODULE contains foo=3 and fname is
+  // foo.cc and lvl is <= 3, this will return true. It will also return true if
+  // the level is lower or equal to TF_CPP_MIN_VLOG_LEVEL (default zero).
+  //
+  // It is expected that the result of this query will be cached in the VLOG-ing
+  // call site to avoid repeated lookups. This routine performs a hash-map
+  // access against the VLOG-ing specification provided by the env var.
+  static bool VmoduleActivated(const char* fname, int level);
+
  protected:
   void GenerateLogMessage();
 
@@ -55,6 +66,13 @@ class LogMessage : public std::basic_ostringstream<char> {
   int severity_;
 };
 
+// Uses the lower operator & precedence to voidify a LogMessage reference, so
+// that the ternary VLOG() implementation is balanced, type wise.
+struct Voidifier {
+  template <typename T>
+  void operator&(const T&)const {}
+};
+
 // LogMessageFatal ensures the process will exit in failure after
 // logging this message.
 class LogMessageFatal : public LogMessage {
@@ -77,18 +95,30 @@ class LogMessageFatal : public LogMessage {
 #define LOG(severity) _TF_LOG_##severity
 
 #ifdef IS_MOBILE_PLATFORM
+
 // Turn VLOG off when under mobile devices for considerations of binary size.
 #define VLOG_IS_ON(lvl) ((lvl) <= 0)
+
 #else
-// Otherwise, Set TF_CPP_MIN_VLOG_LEVEL environment to update minimum log level
-// of VLOG
-#define VLOG_IS_ON(lvl) \
-  ((lvl) <= ::tensorflow::internal::LogMessage::MinVLogLevel())
+
+// Otherwise, set TF_CPP_MIN_VLOG_LEVEL environment to update minimum log level
+// of VLOG, or TF_CPP_VMODULE to set the minimum log level for individual
+// translation units.
+#define VLOG_IS_ON(lvl)                                                     \
+  (([](int level, const char* fname) {                                      \
+    static const bool vmodule_activated =                                   \
+        ::tensorflow::internal::LogMessage::VmoduleActivated(fname, level); \
+    return vmodule_activated;                                               \
+  })(lvl, __FILE__))
+
 #endif
 
-#define VLOG(lvl)                        \
-  if (TF_PREDICT_FALSE(VLOG_IS_ON(lvl))) \
-  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::INFO)
+#define VLOG(level)                                              \
+  TF_PREDICT_TRUE(!VLOG_IS_ON(level))                            \
+  ? (void)0                                                      \
+  : ::tensorflow::internal::Voidifier() &                        \
+          ::tensorflow::internal::LogMessage(__FILE__, __LINE__, \
+                                             tensorflow::INFO)
 
 // CHECK dies with a fatal error if condition is not true.  It is *not*
 // controlled by NDEBUG, so the check will be executed regardless of
diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h
index bd9d41c62becf2696467dcc5e1603d77f3dfc0e5..2708d6ebda41c01edd881e733b985e237aa3242a 100644
--- a/tensorflow/core/platform/default/protobuf.h
+++ b/tensorflow/core/platform/default/protobuf.h
@@ -19,18 +19,21 @@ limitations under the License.
 // IWYU pragma: private, include "third_party/tensorflow/core/platform/protobuf.h"
 // IWYU pragma: friend third_party/tensorflow/core/platform/protobuf.h
 
-#include "google/protobuf/arena.h"
+#ifndef TENSORFLOW_LITE_PROTOS
 #include "google/protobuf/descriptor.h"
 #include "google/protobuf/descriptor.pb.h"
 #include "google/protobuf/dynamic_message.h"
+#include "google/protobuf/text_format.h"
+#include "google/protobuf/util/json_util.h"
+#include "google/protobuf/util/type_resolver_util.h"
+#endif
+
+#include "google/protobuf/arena.h"
 #include "google/protobuf/io/coded_stream.h"
 #include "google/protobuf/io/zero_copy_stream.h"
 #include "google/protobuf/io/zero_copy_stream_impl_lite.h"
 #include "google/protobuf/map.h"
 #include "google/protobuf/repeated_field.h"
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/util/json_util.h"
-#include "google/protobuf/util/type_resolver_util.h"
 
 namespace tensorflow {
 namespace protobuf = ::google::protobuf;
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 5732271f150a64e22f7eea2eea243e3c6c75631f..1b5382841574e6b8843079ae9cb359c5c9b475d0 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -166,11 +167,24 @@ class Env {
   Status DeleteFile(const string& fname);
 
   /// \brief Deletes the specified directory and all subdirectories and files
-  /// underneath it. undeleted_files and undeleted_dirs stores the number of
-  /// files and directories that weren't deleted (unspecified if the return
-  /// status is not OK).
+  /// underneath it. This is accomplished by traversing the directory tree
+  /// rooted at dirname and deleting entries as they are encountered.
+  ///
+  /// If dirname itself is not readable or does not exist, *undeleted_dir_count
+  /// is set to 1, *undeleted_file_count is set to 0 and an appropriate status
+  /// (e.g. NOT_FOUND) is returned.
+  ///
+  /// If dirname and all its descendants were successfully deleted, TF_OK is
+  /// returned and both error counters are set to zero.
+  ///
+  /// Otherwise, while traversing the tree, undeleted_file_count and
+  /// undeleted_dir_count are updated if an entry of the corresponding type
+  /// could not be deleted. The returned error status represents the reason that
+  /// any one of these entries could not be deleted.
+  ///
   /// REQUIRES: undeleted_files, undeleted_dirs to be not null.
-  /// Typical return codes
+  ///
+  /// Typical return codes:
   ///  * OK - dirname exists and we were able to delete everything underneath.
   ///  * NOT_FOUND - dirname doesn't exist
   ///  * PERMISSION_DENIED - dirname or some descendant is not writable
@@ -395,6 +409,7 @@ struct ThreadOptions {
   size_t stack_size = 0;  // 0: use system default value
   /// Guard area size to use near thread stacks to use (in bytes)
   size_t guard_size = 0;  // 0: use system default value
+  int numa_node = port::kNUMANoAffinity;
 };
 
 /// A utility routine: copy contents of `src` in file system `src_fs`
diff --git a/tensorflow/core/platform/env_time.cc b/tensorflow/core/platform/env_time.cc
index 76a227b69a10224681ce430b88a56fa2caabd264..10ba2abe7cb6485b1974eca85cc634b35cba23e8 100644
--- a/tensorflow/core/platform/env_time.cc
+++ b/tensorflow/core/platform/env_time.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 156af6cdeaa015429d60e4599f59c5a4b806f5e6..c84a93b1bf59be7cb19352825cc4bb82b48e2246 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -167,10 +167,23 @@ class FileSystem {
   virtual Status DeleteDir(const string& dirname) = 0;
 
   /// \brief Deletes the specified directory and all subdirectories and files
-  /// underneath it. undeleted_files and undeleted_dirs stores the number of
-  /// files and directories that weren't deleted (unspecified if the return
-  /// status is not OK).
+  /// underneath it. This is accomplished by traversing the directory tree
+  /// rooted at dirname and deleting entries as they are encountered.
+  ///
+  /// If dirname itself is not readable or does not exist, *undeleted_dir_count
+  /// is set to 1, *undeleted_file_count is set to 0 and an appropriate status
+  /// (e.g. NOT_FOUND) is returned.
+  ///
+  /// If dirname and all its descendants were successfully deleted, TF_OK is
+  /// returned and both error counters are set to zero.
+  ///
+  /// Otherwise, while traversing the tree, undeleted_file_count and
+  /// undeleted_dir_count are updated if an entry of the corresponding type
+  /// could not be deleted. The returned error status represents the reason that
+  /// any one of these entries could not be deleted.
+  ///
   /// REQUIRES: undeleted_files, undeleted_dirs to be not null.
+  ///
   /// Typical return codes:
   ///  * OK - dirname exists and we were able to delete everything underneath.
   ///  * NOT_FOUND - dirname doesn't exist
diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc
index 342cf28e38d27acda7004adfd13fba333d83fd9c..79186e4cbbf3821f78c8299cc20596f2f214645a 100644
--- a/tensorflow/core/platform/file_system_helper.cc
+++ b/tensorflow/core/platform/file_system_helper.cc
@@ -82,6 +82,10 @@ Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
     dir_q.pop_front();
     std::vector<string> children;
     Status s = fs->GetChildren(current_dir, &children);
+    // In case PERMISSION_DENIED is encountered, we bail here.
+    if (s.code() == tensorflow::error::PERMISSION_DENIED) {
+      continue;
+    }
     ret.Update(s);
     if (children.empty()) continue;
     // This IsDirectory call can be expensive for some FS. Parallelizing it.
diff --git a/tensorflow/core/platform/init_main.h b/tensorflow/core/platform/init_main.h
index 834c5298169a7e0d0c31a1a8e6fd432e1d374145..9f511983a888fbc759c5b6eddc825113f4011b6d 100644
--- a/tensorflow/core/platform/init_main.h
+++ b/tensorflow/core/platform/init_main.h
@@ -19,10 +19,10 @@ limitations under the License.
 namespace tensorflow {
 namespace port {
 
-// Platform-specific initialization routine that may be invoked by a
+// Platform-specific initialization routine that should be invoked by a
 // main() program that uses TensorFlow.
-//
-// Default implementation does nothing.
+// This performs necessary initialization on some platforms; TensorFlow
+// may not work unless it has been called.
 void InitMain(const char* usage, int* argc, char*** argv);
 
 }  // namespace port
diff --git a/tensorflow/core/platform/logger.h b/tensorflow/core/platform/logger.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d304bea63a7c78e4a90d78ea2be4ce01caa802d
--- /dev/null
+++ b/tensorflow/core/platform/logger.h
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_LOGGER_H_
+#define TENSORFLOW_CORE_PLATFORM_LOGGER_H_
+
+#include "google/protobuf/any.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// Abstract logging interface. Contrary to logging.h, this class describes an
+// interface, not a concrete logging mechanism. This is useful when we want to
+// log anything to a non-local place, e.g. a database.
+class Logger {
+ public:
+  static Logger* Singleton();
+
+  virtual ~Logger() = default;
+
+  // Logs a typed proto.
+  template <typename ProtoType>
+  void LogProto(const ProtoType& proto) {
+    google::protobuf::Any any;
+    any.PackFrom(proto);
+    DoLogProto(&any);
+  }
+
+  // Flushes any pending log. Blocks until everything is flushed.
+  void Flush() { DoFlush(); }
+
+ private:
+  virtual void DoLogProto(google::protobuf::Any* proto) = 0;
+  virtual void DoFlush() = 0;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_LOGGER_H_
diff --git a/tensorflow/core/platform/numa_test.cc b/tensorflow/core/platform/numa_test.cc
index 8b39ecd59cb1d95b30f33475981ca0a5fce117af..91789efd1eee2f30b0277562e5a2b1f0d14aae26 100644
--- a/tensorflow/core/platform/numa_test.cc
+++ b/tensorflow/core/platform/numa_test.cc
@@ -44,7 +44,7 @@ TEST(Numa, Malloc) {
 
 TEST(Numa, SetNodeAffinity) {
   // NOTE(tucker): This test is not reliable when executed under tap because
-  // the virtual machine may not have access to all of the availble NUMA
+  // the virtual machine may not have access to all of the available NUMA
   // nodes.  Not sure what to do about that.
   EXPECT_EQ(-1, port::NUMAGetThreadNodeAffinity());
   if (port::NUMAEnabled()) {
diff --git a/tensorflow/core/platform/platform_strings.cc b/tensorflow/core/platform/platform_strings.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1852633d595e0b65415284a3233ba11385a3c44
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings.cc
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/platform_strings.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+int GetPlatformStrings(const std::string& path,
+                       std::vector<std::string>* found) {
+  int result;
+  FILE* ifp = fopen(path.c_str(), "rb");
+  if (ifp != nullptr) {
+    static const char prefix[] = TF_PLAT_STR_MAGIC_PREFIX_;
+    int first_char = prefix[1];
+    int last_char = -1;
+    int c;
+    while ((c = getc(ifp)) != EOF) {
+      if (c == first_char && last_char == 0) {
+        int i = 2;
+        while (prefix[i] != 0 && (c = getc(ifp)) == prefix[i]) {
+          i++;
+        }
+        if (prefix[i] == 0) {
+          std::string str;
+          while ((c = getc(ifp)) != EOF && c != 0) {
+            str.push_back(c);
+          }
+          if (!str.empty()) {
+            found->push_back(str);
+          }
+        }
+      }
+      last_char = c;
+    }
+
+    result = (ferror(ifp) == 0) ? 0 : errno;
+    fclose(ifp);
+  } else {
+    result = errno;
+  }
+  return result;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/platform_strings.h b/tensorflow/core/platform/platform_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b1dbd130e0df0e991ac3e2dcce2840e66b1f9b9
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings.h
@@ -0,0 +1,364 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_
+#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_
+
+// This header defines the macro TF_PLATFORM_STRINGS() which should be used
+// once in each dynamically loadable TensorFlow module.  It embeds static
+// strings into the compilation unit that allow TensorFlow to determine what
+// compilation options were in effect when the compilation unit was built.  All
+// compilation units within the same dynamically loadable library should be
+// built with the same options (or at least, the strings should be embedded in
+// the compilation unit built with the most restrictive options).
+
+// The platform strings embedded into a binary may be retrieved with the
+// GetPlatformStrings function.
+
+// Rationale:
+// We wish to load only those libraries that this CPU can execute.  For
+// example, we should not load a library compiled with avx256 instructions on a
+// CPU that cannot execute them.
+//
+// One might think that one could dlopen() the library, and call a routine that
+// would return which cpu type it was compiled for.  Alas, this does not work,
+// because at dlopen() time, a library containing C++ will execute constructors
+// of class variables with static storage class.  Even code that looks
+// innocuous may use optional platform-specific instructions.  For example,
+// the fastest way to zero a region of memory might use optional instructions.
+//
+// One might think one could run a tool such as "objdump" to read flags from
+// the libraries' headers, or perhaps disassemble each library to look for
+// particular instructions.  Unfortunately, the desired flags are not present
+// in the headers, and disassembly can be prohibitively slow ("objdump -d" is
+// very slow, for example).  Moreover, a tool to examine the library may not
+// be present on the system unless the user has installed special packages (for
+// example, on Windows).
+//
+// Instead, we adopt a crude but straightforward solution:  We require
+// developers to use the macro TF_PLATFORM_STRINGS() in their library, to
+// embed the compilation options as constant strings.  The compiler's
+// predefined macros pick which strings are included.  We then search for the
+// strings in the files, and then dlopen() only those libraries that have or
+// lack strings as needed.
+//
+// We adopt the approach of placing in the binary a fairly raw copy of the
+// predefined macros, rather than trying to interpret them in complex ways at
+// compile time.  This allows the loading binary to alter its interpretation of
+// the strings without library developers having to recompile.
+
+#include <stdio.h>
+
+#include <string>
+#include <vector>
+
+// Aside from the header guard, the internal macros defined here have the form:
+//   TF_PLAT_STR_*
+
+// If a macro is removed from the list of tested macros, the major version in
+// the following version number should be incremented, and the minor version
+// set to zero.  Otherwise, if a macro is added to the list of tested macros,
+// the minor number should be incremented.
+#define TF_PLAT_STR_VERSION_ "1.0"
+
+// Prefix of each option string indicator in the binary.
+// After the prefix, such strings have the form:
+//    [A-Za-z_0-9]=<value>
+// followed by a terminating nul.  To simplify searching, this prefix is all
+// ASCII, starts with a nul, and contains no character twice.
+#define TF_PLAT_STR_MAGIC_PREFIX_ "\0S\\s\":^p*L}"
+
+// A helper macro for TF_PLAT_STR_AS_STR_().
+#define TF_PLAT_STR_STR_1_(x) #x
+
+// Yield a constant string corresponding to x, after macro expansion.
+#define TF_PLAT_STR_AS_STR_(x) TF_PLAT_STR_STR_1_(x)
+
+// An empty definition to make lists more uniform.
+#define TF_PLAT_STR_TERMINATOR_
+
+// TF_PLAT_STR_(x) introduces a constant string indicating whether a
+// particular compilation option has been turned on.
+//
+// In gcc and clang, we might imagine using something like
+// #define TF_PLAT_STR_(x) \
+//     (sizeof (#x) != sizeof (TF_PLAT_STR_AS_STR_ (x))? \
+//      TF_PLAT_STR_MAGIC_PREFIX_ #x "=" TF_PLAT_STR_AS_STR_ (x) : \
+//      TF_PLAT_STR_MAGIC_PREFIX_ #x "=0"),
+// but some compilers (notably MSVC) place both "foo" and "bar" in the binary
+// when presented with
+//    (true?  "foo" : "bar")
+// so we must use #if to select the strings we need, which is rather verbose.
+#define TF_PLAT_STR_(x) TF_PLAT_STR_MAGIC_PREFIX_ #x "=" TF_PLAT_STR_AS_STR_(x)
+
+// Include the #if machinery that sets the macros used below.
+// platform_strings_computed.h can be generated by filtering this header file
+// through:
+// awk '
+// header == "" { print; }
+// /\*\// && header == "" {
+//     print "// Generated from platform_strings.h.";
+//     print "";
+//     print "#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_";
+//     print "#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_";
+//     print "";
+//     header = 1;
+// }
+// /^#define TF_PLAT_STR_LIST_[a-zA-Z0-9_]*\(\) *\\$/ { active = 1; }
+// /TF_PLAT_STR_TERMINATOR_/ { active = 0; }
+// /^ *TF_PLAT_STR_[A-Za-z0-9_]* *\\$/ && active {
+//     x = $0;
+//     sub(/^ *TF_PLAT_STR_/, "", x);
+//     sub(/ *\\$/, "", x);
+//     printf ("#if defined(%s)\n", x);
+//     printf ("#define TF_PLAT_STR_%s TF_PLAT_STR_(%s)\n", x, x);
+//     printf ("#else\n");
+//     printf ("#define TF_PLAT_STR_%s\n", x);
+//     printf ("#endif\n");
+// }
+// END {
+//     print "";
+//     print "#endif  // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_";
+// }'
+#include "tensorflow/core/platform/platform_strings_computed.h"
+
+// clang-format butchers the following lines.
+// clang-format off
+
+// x86_64 and x86_32 optional features.
+#define TF_PLAT_STR_LIST___x86_64__()                                      \
+        TF_PLAT_STR__M_IX86_FP                                             \
+        TF_PLAT_STR__NO_PREFETCHW                                          \
+        TF_PLAT_STR___3dNOW_A__                                            \
+        TF_PLAT_STR___3dNOW__                                              \
+        TF_PLAT_STR___ABM__                                                \
+        TF_PLAT_STR___ADX__                                                \
+        TF_PLAT_STR___AES__                                                \
+        TF_PLAT_STR___AVX2__                                               \
+        TF_PLAT_STR___AVX512BW__                                           \
+        TF_PLAT_STR___AVX512CD__                                           \
+        TF_PLAT_STR___AVX512DQ__                                           \
+        TF_PLAT_STR___AVX512ER__                                           \
+        TF_PLAT_STR___AVX512F__                                            \
+        TF_PLAT_STR___AVX512IFMA__                                         \
+        TF_PLAT_STR___AVX512PF__                                           \
+        TF_PLAT_STR___AVX512VBMI__                                         \
+        TF_PLAT_STR___AVX512VL__                                           \
+        TF_PLAT_STR___AVX__                                                \
+        TF_PLAT_STR___BMI2__                                               \
+        TF_PLAT_STR___BMI__                                                \
+        TF_PLAT_STR___CLFLUSHOPT__                                         \
+        TF_PLAT_STR___CLZERO__                                             \
+        TF_PLAT_STR___F16C__                                               \
+        TF_PLAT_STR___FMA4__                                               \
+        TF_PLAT_STR___FMA__                                                \
+        TF_PLAT_STR___FP_FAST_FMA                                          \
+        TF_PLAT_STR___FP_FAST_FMAF                                         \
+        TF_PLAT_STR___FSGSBASE__                                           \
+        TF_PLAT_STR___FXSR__                                               \
+        TF_PLAT_STR___LWP__                                                \
+        TF_PLAT_STR___LZCNT__                                              \
+        TF_PLAT_STR___MMX__                                                \
+        TF_PLAT_STR___MWAITX__                                             \
+        TF_PLAT_STR___PCLMUL__                                             \
+        TF_PLAT_STR___PKU__                                                \
+        TF_PLAT_STR___POPCNT__                                             \
+        TF_PLAT_STR___PRFCHW__                                             \
+        TF_PLAT_STR___RDRND__                                              \
+        TF_PLAT_STR___RDSEED__                                             \
+        TF_PLAT_STR___RTM__                                                \
+        TF_PLAT_STR___SHA__                                                \
+        TF_PLAT_STR___SSE2_MATH__                                          \
+        TF_PLAT_STR___SSE2__                                               \
+        TF_PLAT_STR___SSE_MATH__                                           \
+        TF_PLAT_STR___SSE__                                                \
+        TF_PLAT_STR___SSE3__                                               \
+        TF_PLAT_STR___SSE4A__                                              \
+        TF_PLAT_STR___SSE4_1__                                             \
+        TF_PLAT_STR___SSE4_2__                                             \
+        TF_PLAT_STR___SSSE3__                                              \
+        TF_PLAT_STR___TBM__                                                \
+        TF_PLAT_STR___XOP__                                                \
+        TF_PLAT_STR___XSAVEC__                                             \
+        TF_PLAT_STR___XSAVEOPT__                                           \
+        TF_PLAT_STR___XSAVES__                                             \
+        TF_PLAT_STR___XSAVE__                                              \
+        TF_PLAT_STR_TERMINATOR_
+
+// PowerPC (64- and 32-bit) optional features.
+#define TF_PLAT_STR_LIST___powerpc64__()                                   \
+        TF_PLAT_STR__SOFT_DOUBLE                                           \
+        TF_PLAT_STR__SOFT_FLOAT                                            \
+        TF_PLAT_STR___ALTIVEC__                                            \
+        TF_PLAT_STR___APPLE_ALTIVEC__                                      \
+        TF_PLAT_STR___CRYPTO__                                             \
+        TF_PLAT_STR___FLOAT128_HARDWARE__                                  \
+        TF_PLAT_STR___FLOAT128_TYPE__                                      \
+        TF_PLAT_STR___FP_FAST_FMA                                          \
+        TF_PLAT_STR___FP_FAST_FMAF                                         \
+        TF_PLAT_STR___HTM__                                                \
+        TF_PLAT_STR___NO_FPRS__                                            \
+        TF_PLAT_STR___NO_LWSYNC__                                          \
+        TF_PLAT_STR___POWER8_VECTOR__                                      \
+        TF_PLAT_STR___POWER9_VECTOR__                                      \
+        TF_PLAT_STR___PPC405__                                             \
+        TF_PLAT_STR___QUAD_MEMORY_ATOMIC__                                 \
+        TF_PLAT_STR___RECIPF__                                             \
+        TF_PLAT_STR___RECIP_PRECISION__                                    \
+        TF_PLAT_STR___RECIP__                                              \
+        TF_PLAT_STR___RSQRTEF__                                            \
+        TF_PLAT_STR___RSQRTE__                                             \
+        TF_PLAT_STR___TM_FENCE__                                           \
+        TF_PLAT_STR___UPPER_REGS_DF__                                      \
+        TF_PLAT_STR___UPPER_REGS_SF__                                      \
+        TF_PLAT_STR___VEC__                                                \
+        TF_PLAT_STR___VSX__                                                \
+        TF_PLAT_STR_TERMINATOR_
+
+// aarch64 and 32-bit arm optional features
+#define TF_PLAT_STR_LIST___aarch64__()                                     \
+        TF_PLAT_STR___ARM_ARCH                                             \
+        TF_PLAT_STR___ARM_FEATURE_CLZ                                      \
+        TF_PLAT_STR___ARM_FEATURE_CRC32                                    \
+        TF_PLAT_STR___ARM_FEATURE_CRC32                                    \
+        TF_PLAT_STR___ARM_FEATURE_CRYPTO                                   \
+        TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING                        \
+        TF_PLAT_STR___ARM_FEATURE_DSP                                      \
+        TF_PLAT_STR___ARM_FEATURE_FMA                                      \
+        TF_PLAT_STR___ARM_FEATURE_IDIV                                     \
+        TF_PLAT_STR___ARM_FEATURE_LDREX                                    \
+        TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN                           \
+        TF_PLAT_STR___ARM_FEATURE_QBIT                                     \
+        TF_PLAT_STR___ARM_FEATURE_QRDMX                                    \
+        TF_PLAT_STR___ARM_FEATURE_SAT                                      \
+        TF_PLAT_STR___ARM_FEATURE_SIMD32                                   \
+        TF_PLAT_STR___ARM_FEATURE_UNALIGNED                                \
+        TF_PLAT_STR___ARM_FP                                               \
+        TF_PLAT_STR___ARM_NEON_FP                                          \
+        TF_PLAT_STR___ARM_NEON__                                           \
+        TF_PLAT_STR___ARM_WMMX                                             \
+        TF_PLAT_STR___IWMMXT2__                                            \
+        TF_PLAT_STR___IWMMXT__                                             \
+        TF_PLAT_STR___VFP_FP__                                             \
+        TF_PLAT_STR_TERMINATOR_
+
+// Generic features, including indication of architecture and OS.
+// The _M_* macros are defined by Visual Studio.
+// It doesn't define __LITTLE_ENDIAN__ or __BYTE_ORDER__;
+// Windows is assumed to be little endian.
+#define TF_PLAT_STR_LIST___generic__()                                     \
+        TF_PLAT_STR_TARGET_IPHONE_SIMULATOR                                \
+        TF_PLAT_STR_TARGET_OS_IOS                                          \
+        TF_PLAT_STR_TARGET_OS_IPHONE                                       \
+        TF_PLAT_STR__MSC_VER                                               \
+        TF_PLAT_STR__M_ARM                                                 \
+        TF_PLAT_STR__M_ARM64                                               \
+        TF_PLAT_STR__M_ARM_ARMV7VE                                         \
+        TF_PLAT_STR__M_ARM_FP                                              \
+        TF_PLAT_STR__M_IX86                                                \
+        TF_PLAT_STR__M_X64                                                 \
+        TF_PLAT_STR__WIN32                                                 \
+        TF_PLAT_STR__WIN64                                                 \
+        TF_PLAT_STR___ANDROID__                                            \
+        TF_PLAT_STR___APPLE__                                              \
+        TF_PLAT_STR___BYTE_ORDER__                                         \
+        TF_PLAT_STR___CYGWIN__                                             \
+        TF_PLAT_STR___FreeBSD__                                            \
+        TF_PLAT_STR___LITTLE_ENDIAN__                                      \
+        TF_PLAT_STR___NetBSD__                                             \
+        TF_PLAT_STR___OpenBSD__                                            \
+        TF_PLAT_STR_____MSYS__                                             \
+        TF_PLAT_STR___aarch64__                                            \
+        TF_PLAT_STR___alpha__                                              \
+        TF_PLAT_STR___arm__                                                \
+        TF_PLAT_STR___i386__                                               \
+        TF_PLAT_STR___i686__                                               \
+        TF_PLAT_STR___ia64__                                               \
+        TF_PLAT_STR___linux__                                              \
+        TF_PLAT_STR___mips32__                                             \
+        TF_PLAT_STR___mips64__                                             \
+        TF_PLAT_STR___powerpc64__                                          \
+        TF_PLAT_STR___powerpc__                                            \
+        TF_PLAT_STR___riscv___                                             \
+        TF_PLAT_STR___s390x__                                              \
+        TF_PLAT_STR___sparc64__                                            \
+        TF_PLAT_STR___sparc__                                              \
+        TF_PLAT_STR___x86_64__                                             \
+        TF_PLAT_STR_TERMINATOR_
+
+#if !defined(__x86_64__) && !defined(_M_X64) && \
+    !defined(__i386__) && !defined(_M_IX86)
+#undef TF_PLAT_STR_LIST___x86_64__
+#define TF_PLAT_STR_LIST___x86_64__()
+#endif
+#if !defined(__powerpc64__) && !defined(__powerpc__)
+#undef TF_PLAT_STR_LIST___powerpc64__
+#define TF_PLAT_STR_LIST___powerpc64__()
+#endif
+#if !defined(__aarch64__) && !defined(_M_ARM64) && \
+    !defined(__arm__) && !defined(_M_ARM)
+#undef TF_PLAT_STR_LIST___aarch64__
+#define TF_PLAT_STR_LIST___aarch64__()
+#endif
+
+// Macro to be used in each dynamically loadable library.
+//
+// The BSS global variable tf_cpu_option_global and the class
+// instance tf_cpu_option_avoid_omit_class are needed to prevent
+// compilers/linkers such as clang from omitting the static variable
+// tf_cpu_option[], which would otherwise appear to be unused.  We cannot make
+// tf_cpu_option[] global, because we then might get multiply-defined symbols
+// if TF_PLAT_STR() is used twice in the same library.
+// (tf_cpu_option_global doesn't see such errors because it is
+// defined in BSS, so multiple definitions are combined by the linker.)  gcc's
+// __attribute__((used)) is insufficient because it seems to be ignored by
+// linkers.
+#define TF_PLATFORM_STRINGS()                                                  \
+    static const char tf_cpu_option[] =                                        \
+        TF_PLAT_STR_MAGIC_PREFIX_ "TF_PLAT_STR_VERSION=" TF_PLAT_STR_VERSION_  \
+        TF_PLAT_STR_LIST___x86_64__()                                          \
+        TF_PLAT_STR_LIST___powerpc64__()                                       \
+        TF_PLAT_STR_LIST___aarch64__()                                         \
+        TF_PLAT_STR_LIST___generic__()                                         \
+    ;                                                                          \
+    const char *tf_cpu_option_global;                                          \
+    namespace {                                                                \
+    class TFCPUOptionHelper {                                                  \
+     public:                                                                   \
+      TFCPUOptionHelper() {                                                    \
+        /* Compilers/linkers remove unused variables aggressively.  The */     \
+        /* following gyrations subvert most such optimizations. */             \
+        tf_cpu_option_global = tf_cpu_option;                                  \
+        /* Nothing is printed because the string starts with a nul. */         \
+        printf("%s", tf_cpu_option);                                           \
+      }                                                                        \
+    } tf_cpu_option_avoid_omit_class;                                          \
+    }  /* anonymous namespace */
+// clang-format on
+
+namespace tensorflow {
+
+class Status;
+
+// Retrieves the platform strings from the file at the given path and appends
+// them to the given vector. If the returned int is non-zero, an error occurred
+// reading the file and vector may or may not be modified. The returned error
+// code is suitable for use with strerror().
+int GetPlatformStrings(const std::string& path,
+                       std::vector<std::string>* found);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_
diff --git a/tensorflow/core/platform/platform_strings_computed.h b/tensorflow/core/platform/platform_strings_computed.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a17f3bfc3a866ee1fd4945e9ade5a3e379eefa3
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings_computed.h
@@ -0,0 +1,735 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Generated from platform_strings.h.
+
+#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_
+#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_
+
+#if defined(_M_IX86_FP)
+#define TF_PLAT_STR__M_IX86_FP TF_PLAT_STR_(_M_IX86_FP)
+#else
+#define TF_PLAT_STR__M_IX86_FP
+#endif
+#if defined(_NO_PREFETCHW)
+#define TF_PLAT_STR__NO_PREFETCHW TF_PLAT_STR_(_NO_PREFETCHW)
+#else
+#define TF_PLAT_STR__NO_PREFETCHW
+#endif
+#if defined(__3dNOW_A__)
+#define TF_PLAT_STR___3dNOW_A__ TF_PLAT_STR_(__3dNOW_A__)
+#else
+#define TF_PLAT_STR___3dNOW_A__
+#endif
+#if defined(__3dNOW__)
+#define TF_PLAT_STR___3dNOW__ TF_PLAT_STR_(__3dNOW__)
+#else
+#define TF_PLAT_STR___3dNOW__
+#endif
+#if defined(__ABM__)
+#define TF_PLAT_STR___ABM__ TF_PLAT_STR_(__ABM__)
+#else
+#define TF_PLAT_STR___ABM__
+#endif
+#if defined(__ADX__)
+#define TF_PLAT_STR___ADX__ TF_PLAT_STR_(__ADX__)
+#else
+#define TF_PLAT_STR___ADX__
+#endif
+#if defined(__AES__)
+#define TF_PLAT_STR___AES__ TF_PLAT_STR_(__AES__)
+#else
+#define TF_PLAT_STR___AES__
+#endif
+#if defined(__AVX2__)
+#define TF_PLAT_STR___AVX2__ TF_PLAT_STR_(__AVX2__)
+#else
+#define TF_PLAT_STR___AVX2__
+#endif
+#if defined(__AVX512BW__)
+#define TF_PLAT_STR___AVX512BW__ TF_PLAT_STR_(__AVX512BW__)
+#else
+#define TF_PLAT_STR___AVX512BW__
+#endif
+#if defined(__AVX512CD__)
+#define TF_PLAT_STR___AVX512CD__ TF_PLAT_STR_(__AVX512CD__)
+#else
+#define TF_PLAT_STR___AVX512CD__
+#endif
+#if defined(__AVX512DQ__)
+#define TF_PLAT_STR___AVX512DQ__ TF_PLAT_STR_(__AVX512DQ__)
+#else
+#define TF_PLAT_STR___AVX512DQ__
+#endif
+#if defined(__AVX512ER__)
+#define TF_PLAT_STR___AVX512ER__ TF_PLAT_STR_(__AVX512ER__)
+#else
+#define TF_PLAT_STR___AVX512ER__
+#endif
+#if defined(__AVX512F__)
+#define TF_PLAT_STR___AVX512F__ TF_PLAT_STR_(__AVX512F__)
+#else
+#define TF_PLAT_STR___AVX512F__
+#endif
+#if defined(__AVX512IFMA__)
+#define TF_PLAT_STR___AVX512IFMA__ TF_PLAT_STR_(__AVX512IFMA__)
+#else
+#define TF_PLAT_STR___AVX512IFMA__
+#endif
+#if defined(__AVX512PF__)
+#define TF_PLAT_STR___AVX512PF__ TF_PLAT_STR_(__AVX512PF__)
+#else
+#define TF_PLAT_STR___AVX512PF__
+#endif
+#if defined(__AVX512VBMI__)
+#define TF_PLAT_STR___AVX512VBMI__ TF_PLAT_STR_(__AVX512VBMI__)
+#else
+#define TF_PLAT_STR___AVX512VBMI__
+#endif
+#if defined(__AVX512VL__)
+#define TF_PLAT_STR___AVX512VL__ TF_PLAT_STR_(__AVX512VL__)
+#else
+#define TF_PLAT_STR___AVX512VL__
+#endif
+#if defined(__AVX__)
+#define TF_PLAT_STR___AVX__ TF_PLAT_STR_(__AVX__)
+#else
+#define TF_PLAT_STR___AVX__
+#endif
+#if defined(__BMI2__)
+#define TF_PLAT_STR___BMI2__ TF_PLAT_STR_(__BMI2__)
+#else
+#define TF_PLAT_STR___BMI2__
+#endif
+#if defined(__BMI__)
+#define TF_PLAT_STR___BMI__ TF_PLAT_STR_(__BMI__)
+#else
+#define TF_PLAT_STR___BMI__
+#endif
+#if defined(__CLFLUSHOPT__)
+#define TF_PLAT_STR___CLFLUSHOPT__ TF_PLAT_STR_(__CLFLUSHOPT__)
+#else
+#define TF_PLAT_STR___CLFLUSHOPT__
+#endif
+#if defined(__CLZERO__)
+#define TF_PLAT_STR___CLZERO__ TF_PLAT_STR_(__CLZERO__)
+#else
+#define TF_PLAT_STR___CLZERO__
+#endif
+#if defined(__F16C__)
+#define TF_PLAT_STR___F16C__ TF_PLAT_STR_(__F16C__)
+#else
+#define TF_PLAT_STR___F16C__
+#endif
+#if defined(__FMA4__)
+#define TF_PLAT_STR___FMA4__ TF_PLAT_STR_(__FMA4__)
+#else
+#define TF_PLAT_STR___FMA4__
+#endif
+#if defined(__FMA__)
+#define TF_PLAT_STR___FMA__ TF_PLAT_STR_(__FMA__)
+#else
+#define TF_PLAT_STR___FMA__
+#endif
+#if defined(__FP_FAST_FMA)
+#define TF_PLAT_STR___FP_FAST_FMA TF_PLAT_STR_(__FP_FAST_FMA)
+#else
+#define TF_PLAT_STR___FP_FAST_FMA
+#endif
+#if defined(__FP_FAST_FMAF)
+#define TF_PLAT_STR___FP_FAST_FMAF TF_PLAT_STR_(__FP_FAST_FMAF)
+#else
+#define TF_PLAT_STR___FP_FAST_FMAF
+#endif
+#if defined(__FSGSBASE__)
+#define TF_PLAT_STR___FSGSBASE__ TF_PLAT_STR_(__FSGSBASE__)
+#else
+#define TF_PLAT_STR___FSGSBASE__
+#endif
+#if defined(__FXSR__)
+#define TF_PLAT_STR___FXSR__ TF_PLAT_STR_(__FXSR__)
+#else
+#define TF_PLAT_STR___FXSR__
+#endif
+#if defined(__LWP__)
+#define TF_PLAT_STR___LWP__ TF_PLAT_STR_(__LWP__)
+#else
+#define TF_PLAT_STR___LWP__
+#endif
+#if defined(__LZCNT__)
+#define TF_PLAT_STR___LZCNT__ TF_PLAT_STR_(__LZCNT__)
+#else
+#define TF_PLAT_STR___LZCNT__
+#endif
+#if defined(__MMX__)
+#define TF_PLAT_STR___MMX__ TF_PLAT_STR_(__MMX__)
+#else
+#define TF_PLAT_STR___MMX__
+#endif
+#if defined(__MWAITX__)
+#define TF_PLAT_STR___MWAITX__ TF_PLAT_STR_(__MWAITX__)
+#else
+#define TF_PLAT_STR___MWAITX__
+#endif
+#if defined(__PCLMUL__)
+#define TF_PLAT_STR___PCLMUL__ TF_PLAT_STR_(__PCLMUL__)
+#else
+#define TF_PLAT_STR___PCLMUL__
+#endif
+#if defined(__PKU__)
+#define TF_PLAT_STR___PKU__ TF_PLAT_STR_(__PKU__)
+#else
+#define TF_PLAT_STR___PKU__
+#endif
+#if defined(__POPCNT__)
+#define TF_PLAT_STR___POPCNT__ TF_PLAT_STR_(__POPCNT__)
+#else
+#define TF_PLAT_STR___POPCNT__
+#endif
+#if defined(__PRFCHW__)
+#define TF_PLAT_STR___PRFCHW__ TF_PLAT_STR_(__PRFCHW__)
+#else
+#define TF_PLAT_STR___PRFCHW__
+#endif
+#if defined(__RDRND__)
+#define TF_PLAT_STR___RDRND__ TF_PLAT_STR_(__RDRND__)
+#else
+#define TF_PLAT_STR___RDRND__
+#endif
+#if defined(__RDSEED__)
+#define TF_PLAT_STR___RDSEED__ TF_PLAT_STR_(__RDSEED__)
+#else
+#define TF_PLAT_STR___RDSEED__
+#endif
+#if defined(__RTM__)
+#define TF_PLAT_STR___RTM__ TF_PLAT_STR_(__RTM__)
+#else
+#define TF_PLAT_STR___RTM__
+#endif
+#if defined(__SHA__)
+#define TF_PLAT_STR___SHA__ TF_PLAT_STR_(__SHA__)
+#else
+#define TF_PLAT_STR___SHA__
+#endif
+#if defined(__SSE2_MATH__)
+#define TF_PLAT_STR___SSE2_MATH__ TF_PLAT_STR_(__SSE2_MATH__)
+#else
+#define TF_PLAT_STR___SSE2_MATH__
+#endif
+#if defined(__SSE2__)
+#define TF_PLAT_STR___SSE2__ TF_PLAT_STR_(__SSE2__)
+#else
+#define TF_PLAT_STR___SSE2__
+#endif
+#if defined(__SSE_MATH__)
+#define TF_PLAT_STR___SSE_MATH__ TF_PLAT_STR_(__SSE_MATH__)
+#else
+#define TF_PLAT_STR___SSE_MATH__
+#endif
+#if defined(__SSE__)
+#define TF_PLAT_STR___SSE__ TF_PLAT_STR_(__SSE__)
+#else
+#define TF_PLAT_STR___SSE__
+#endif
+#if defined(__SSE3__)
+#define TF_PLAT_STR___SSE3__ TF_PLAT_STR_(__SSE3__)
+#else
+#define TF_PLAT_STR___SSE3__
+#endif
+#if defined(__SSE4A__)
+#define TF_PLAT_STR___SSE4A__ TF_PLAT_STR_(__SSE4A__)
+#else
+#define TF_PLAT_STR___SSE4A__
+#endif
+#if defined(__SSE4_1__)
+#define TF_PLAT_STR___SSE4_1__ TF_PLAT_STR_(__SSE4_1__)
+#else
+#define TF_PLAT_STR___SSE4_1__
+#endif
+#if defined(__SSE4_2__)
+#define TF_PLAT_STR___SSE4_2__ TF_PLAT_STR_(__SSE4_2__)
+#else
+#define TF_PLAT_STR___SSE4_2__
+#endif
+#if defined(__SSSE3__)
+#define TF_PLAT_STR___SSSE3__ TF_PLAT_STR_(__SSSE3__)
+#else
+#define TF_PLAT_STR___SSSE3__
+#endif
+#if defined(__TBM__)
+#define TF_PLAT_STR___TBM__ TF_PLAT_STR_(__TBM__)
+#else
+#define TF_PLAT_STR___TBM__
+#endif
+#if defined(__XOP__)
+#define TF_PLAT_STR___XOP__ TF_PLAT_STR_(__XOP__)
+#else
+#define TF_PLAT_STR___XOP__
+#endif
+#if defined(__XSAVEC__)
+#define TF_PLAT_STR___XSAVEC__ TF_PLAT_STR_(__XSAVEC__)
+#else
+#define TF_PLAT_STR___XSAVEC__
+#endif
+#if defined(__XSAVEOPT__)
+#define TF_PLAT_STR___XSAVEOPT__ TF_PLAT_STR_(__XSAVEOPT__)
+#else
+#define TF_PLAT_STR___XSAVEOPT__
+#endif
+#if defined(__XSAVES__)
+#define TF_PLAT_STR___XSAVES__ TF_PLAT_STR_(__XSAVES__)
+#else
+#define TF_PLAT_STR___XSAVES__
+#endif
+#if defined(__XSAVE__)
+#define TF_PLAT_STR___XSAVE__ TF_PLAT_STR_(__XSAVE__)
+#else
+#define TF_PLAT_STR___XSAVE__
+#endif
+#if defined(_SOFT_DOUBLE)
+#define TF_PLAT_STR__SOFT_DOUBLE TF_PLAT_STR_(_SOFT_DOUBLE)
+#else
+#define TF_PLAT_STR__SOFT_DOUBLE
+#endif
+#if defined(_SOFT_FLOAT)
+#define TF_PLAT_STR__SOFT_FLOAT TF_PLAT_STR_(_SOFT_FLOAT)
+#else
+#define TF_PLAT_STR__SOFT_FLOAT
+#endif
+#if defined(__ALTIVEC__)
+#define TF_PLAT_STR___ALTIVEC__ TF_PLAT_STR_(__ALTIVEC__)
+#else
+#define TF_PLAT_STR___ALTIVEC__
+#endif
+#if defined(__APPLE_ALTIVEC__)
+#define TF_PLAT_STR___APPLE_ALTIVEC__ TF_PLAT_STR_(__APPLE_ALTIVEC__)
+#else
+#define TF_PLAT_STR___APPLE_ALTIVEC__
+#endif
+#if defined(__CRYPTO__)
+#define TF_PLAT_STR___CRYPTO__ TF_PLAT_STR_(__CRYPTO__)
+#else
+#define TF_PLAT_STR___CRYPTO__
+#endif
+#if defined(__FLOAT128_HARDWARE__)
+#define TF_PLAT_STR___FLOAT128_HARDWARE__ TF_PLAT_STR_(__FLOAT128_HARDWARE__)
+#else
+#define TF_PLAT_STR___FLOAT128_HARDWARE__
+#endif
+#if defined(__FLOAT128_TYPE__)
+#define TF_PLAT_STR___FLOAT128_TYPE__ TF_PLAT_STR_(__FLOAT128_TYPE__)
+#else
+#define TF_PLAT_STR___FLOAT128_TYPE__
+#endif
+#if defined(__FP_FAST_FMA)
+#define TF_PLAT_STR___FP_FAST_FMA TF_PLAT_STR_(__FP_FAST_FMA)
+#else
+#define TF_PLAT_STR___FP_FAST_FMA
+#endif
+#if defined(__FP_FAST_FMAF)
+#define TF_PLAT_STR___FP_FAST_FMAF TF_PLAT_STR_(__FP_FAST_FMAF)
+#else
+#define TF_PLAT_STR___FP_FAST_FMAF
+#endif
+#if defined(__HTM__)
+#define TF_PLAT_STR___HTM__ TF_PLAT_STR_(__HTM__)
+#else
+#define TF_PLAT_STR___HTM__
+#endif
+#if defined(__NO_FPRS__)
+#define TF_PLAT_STR___NO_FPRS__ TF_PLAT_STR_(__NO_FPRS__)
+#else
+#define TF_PLAT_STR___NO_FPRS__
+#endif
+#if defined(__NO_LWSYNC__)
+#define TF_PLAT_STR___NO_LWSYNC__ TF_PLAT_STR_(__NO_LWSYNC__)
+#else
+#define TF_PLAT_STR___NO_LWSYNC__
+#endif
+#if defined(__POWER8_VECTOR__)
+#define TF_PLAT_STR___POWER8_VECTOR__ TF_PLAT_STR_(__POWER8_VECTOR__)
+#else
+#define TF_PLAT_STR___POWER8_VECTOR__
+#endif
+#if defined(__POWER9_VECTOR__)
+#define TF_PLAT_STR___POWER9_VECTOR__ TF_PLAT_STR_(__POWER9_VECTOR__)
+#else
+#define TF_PLAT_STR___POWER9_VECTOR__
+#endif
+#if defined(__PPC405__)
+#define TF_PLAT_STR___PPC405__ TF_PLAT_STR_(__PPC405__)
+#else
+#define TF_PLAT_STR___PPC405__
+#endif
+#if defined(__QUAD_MEMORY_ATOMIC__)
+#define TF_PLAT_STR___QUAD_MEMORY_ATOMIC__ TF_PLAT_STR_(__QUAD_MEMORY_ATOMIC__)
+#else
+#define TF_PLAT_STR___QUAD_MEMORY_ATOMIC__
+#endif
+#if defined(__RECIPF__)
+#define TF_PLAT_STR___RECIPF__ TF_PLAT_STR_(__RECIPF__)
+#else
+#define TF_PLAT_STR___RECIPF__
+#endif
+#if defined(__RECIP_PRECISION__)
+#define TF_PLAT_STR___RECIP_PRECISION__ TF_PLAT_STR_(__RECIP_PRECISION__)
+#else
+#define TF_PLAT_STR___RECIP_PRECISION__
+#endif
+#if defined(__RECIP__)
+#define TF_PLAT_STR___RECIP__ TF_PLAT_STR_(__RECIP__)
+#else
+#define TF_PLAT_STR___RECIP__
+#endif
+#if defined(__RSQRTEF__)
+#define TF_PLAT_STR___RSQRTEF__ TF_PLAT_STR_(__RSQRTEF__)
+#else
+#define TF_PLAT_STR___RSQRTEF__
+#endif
+#if defined(__RSQRTE__)
+#define TF_PLAT_STR___RSQRTE__ TF_PLAT_STR_(__RSQRTE__)
+#else
+#define TF_PLAT_STR___RSQRTE__
+#endif
+#if defined(__TM_FENCE__)
+#define TF_PLAT_STR___TM_FENCE__ TF_PLAT_STR_(__TM_FENCE__)
+#else
+#define TF_PLAT_STR___TM_FENCE__
+#endif
+#if defined(__UPPER_REGS_DF__)
+#define TF_PLAT_STR___UPPER_REGS_DF__ TF_PLAT_STR_(__UPPER_REGS_DF__)
+#else
+#define TF_PLAT_STR___UPPER_REGS_DF__
+#endif
+#if defined(__UPPER_REGS_SF__)
+#define TF_PLAT_STR___UPPER_REGS_SF__ TF_PLAT_STR_(__UPPER_REGS_SF__)
+#else
+#define TF_PLAT_STR___UPPER_REGS_SF__
+#endif
+#if defined(__VEC__)
+#define TF_PLAT_STR___VEC__ TF_PLAT_STR_(__VEC__)
+#else
+#define TF_PLAT_STR___VEC__
+#endif
+#if defined(__VSX__)
+#define TF_PLAT_STR___VSX__ TF_PLAT_STR_(__VSX__)
+#else
+#define TF_PLAT_STR___VSX__
+#endif
+#if defined(__ARM_ARCH)
+#define TF_PLAT_STR___ARM_ARCH TF_PLAT_STR_(__ARM_ARCH)
+#else
+#define TF_PLAT_STR___ARM_ARCH
+#endif
+#if defined(__ARM_FEATURE_CLZ)
+#define TF_PLAT_STR___ARM_FEATURE_CLZ TF_PLAT_STR_(__ARM_FEATURE_CLZ)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CLZ
+#endif
+#if defined(__ARM_FEATURE_CRC32)
+#define TF_PLAT_STR___ARM_FEATURE_CRC32 TF_PLAT_STR_(__ARM_FEATURE_CRC32)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CRC32
+#endif
+#if defined(__ARM_FEATURE_CRC32)
+#define TF_PLAT_STR___ARM_FEATURE_CRC32 TF_PLAT_STR_(__ARM_FEATURE_CRC32)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CRC32
+#endif
+#if defined(__ARM_FEATURE_CRYPTO)
+#define TF_PLAT_STR___ARM_FEATURE_CRYPTO TF_PLAT_STR_(__ARM_FEATURE_CRYPTO)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CRYPTO
+#endif
+#if defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+#define TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING \
+  TF_PLAT_STR_(__ARM_FEATURE_DIRECTED_ROUNDING)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING
+#endif
+#if defined(__ARM_FEATURE_DSP)
+#define TF_PLAT_STR___ARM_FEATURE_DSP TF_PLAT_STR_(__ARM_FEATURE_DSP)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_DSP
+#endif
+#if defined(__ARM_FEATURE_FMA)
+#define TF_PLAT_STR___ARM_FEATURE_FMA TF_PLAT_STR_(__ARM_FEATURE_FMA)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_FMA
+#endif
+#if defined(__ARM_FEATURE_IDIV)
+#define TF_PLAT_STR___ARM_FEATURE_IDIV TF_PLAT_STR_(__ARM_FEATURE_IDIV)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_IDIV
+#endif
+#if defined(__ARM_FEATURE_LDREX)
+#define TF_PLAT_STR___ARM_FEATURE_LDREX TF_PLAT_STR_(__ARM_FEATURE_LDREX)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_LDREX
+#endif
+#if defined(__ARM_FEATURE_NUMERIC_MAXMIN)
+#define TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN \
+  TF_PLAT_STR_(__ARM_FEATURE_NUMERIC_MAXMIN)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN
+#endif
+#if defined(__ARM_FEATURE_QBIT)
+#define TF_PLAT_STR___ARM_FEATURE_QBIT TF_PLAT_STR_(__ARM_FEATURE_QBIT)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_QBIT
+#endif
+#if defined(__ARM_FEATURE_QRDMX)
+#define TF_PLAT_STR___ARM_FEATURE_QRDMX TF_PLAT_STR_(__ARM_FEATURE_QRDMX)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_QRDMX
+#endif
+#if defined(__ARM_FEATURE_SAT)
+#define TF_PLAT_STR___ARM_FEATURE_SAT TF_PLAT_STR_(__ARM_FEATURE_SAT)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_SAT
+#endif
+#if defined(__ARM_FEATURE_SIMD32)
+#define TF_PLAT_STR___ARM_FEATURE_SIMD32 TF_PLAT_STR_(__ARM_FEATURE_SIMD32)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_SIMD32
+#endif
+#if defined(__ARM_FEATURE_UNALIGNED)
+#define TF_PLAT_STR___ARM_FEATURE_UNALIGNED \
+  TF_PLAT_STR_(__ARM_FEATURE_UNALIGNED)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_UNALIGNED
+#endif
+#if defined(__ARM_FP)
+#define TF_PLAT_STR___ARM_FP TF_PLAT_STR_(__ARM_FP)
+#else
+#define TF_PLAT_STR___ARM_FP
+#endif
+#if defined(__ARM_NEON_FP)
+#define TF_PLAT_STR___ARM_NEON_FP TF_PLAT_STR_(__ARM_NEON_FP)
+#else
+#define TF_PLAT_STR___ARM_NEON_FP
+#endif
+#if defined(__ARM_NEON__)
+#define TF_PLAT_STR___ARM_NEON__ TF_PLAT_STR_(__ARM_NEON__)
+#else
+#define TF_PLAT_STR___ARM_NEON__
+#endif
+#if defined(__ARM_WMMX)
+#define TF_PLAT_STR___ARM_WMMX TF_PLAT_STR_(__ARM_WMMX)
+#else
+#define TF_PLAT_STR___ARM_WMMX
+#endif
+#if defined(__IWMMXT2__)
+#define TF_PLAT_STR___IWMMXT2__ TF_PLAT_STR_(__IWMMXT2__)
+#else
+#define TF_PLAT_STR___IWMMXT2__
+#endif
+#if defined(__IWMMXT__)
+#define TF_PLAT_STR___IWMMXT__ TF_PLAT_STR_(__IWMMXT__)
+#else
+#define TF_PLAT_STR___IWMMXT__
+#endif
+#if defined(__VFP_FP__)
+#define TF_PLAT_STR___VFP_FP__ TF_PLAT_STR_(__VFP_FP__)
+#else
+#define TF_PLAT_STR___VFP_FP__
+#endif
+#if defined(TARGET_IPHONE_SIMULATOR)
+#define TF_PLAT_STR_TARGET_IPHONE_SIMULATOR \
+  TF_PLAT_STR_(TARGET_IPHONE_SIMULATOR)
+#else
+#define TF_PLAT_STR_TARGET_IPHONE_SIMULATOR
+#endif
+#if defined(TARGET_OS_IOS)
+#define TF_PLAT_STR_TARGET_OS_IOS TF_PLAT_STR_(TARGET_OS_IOS)
+#else
+#define TF_PLAT_STR_TARGET_OS_IOS
+#endif
+#if defined(TARGET_OS_IPHONE)
+#define TF_PLAT_STR_TARGET_OS_IPHONE TF_PLAT_STR_(TARGET_OS_IPHONE)
+#else
+#define TF_PLAT_STR_TARGET_OS_IPHONE
+#endif
+#if defined(_MSC_VER)
+#define TF_PLAT_STR__MSC_VER TF_PLAT_STR_(_MSC_VER)
+#else
+#define TF_PLAT_STR__MSC_VER
+#endif
+#if defined(_M_ARM)
+#define TF_PLAT_STR__M_ARM TF_PLAT_STR_(_M_ARM)
+#else
+#define TF_PLAT_STR__M_ARM
+#endif
+#if defined(_M_ARM64)
+#define TF_PLAT_STR__M_ARM64 TF_PLAT_STR_(_M_ARM64)
+#else
+#define TF_PLAT_STR__M_ARM64
+#endif
+#if defined(_M_ARM_ARMV7VE)
+#define TF_PLAT_STR__M_ARM_ARMV7VE TF_PLAT_STR_(_M_ARM_ARMV7VE)
+#else
+#define TF_PLAT_STR__M_ARM_ARMV7VE
+#endif
+#if defined(_M_ARM_FP)
+#define TF_PLAT_STR__M_ARM_FP TF_PLAT_STR_(_M_ARM_FP)
+#else
+#define TF_PLAT_STR__M_ARM_FP
+#endif
+#if defined(_M_IX86)
+#define TF_PLAT_STR__M_IX86 TF_PLAT_STR_(_M_IX86)
+#else
+#define TF_PLAT_STR__M_IX86
+#endif
+#if defined(_M_X64)
+#define TF_PLAT_STR__M_X64 TF_PLAT_STR_(_M_X64)
+#else
+#define TF_PLAT_STR__M_X64
+#endif
+#if defined(_WIN32)
+#define TF_PLAT_STR__WIN32 TF_PLAT_STR_(_WIN32)
+#else
+#define TF_PLAT_STR__WIN32
+#endif
+#if defined(_WIN64)
+#define TF_PLAT_STR__WIN64 TF_PLAT_STR_(_WIN64)
+#else
+#define TF_PLAT_STR__WIN64
+#endif
+#if defined(__ANDROID__)
+#define TF_PLAT_STR___ANDROID__ TF_PLAT_STR_(__ANDROID__)
+#else
+#define TF_PLAT_STR___ANDROID__
+#endif
+#if defined(__APPLE__)
+#define TF_PLAT_STR___APPLE__ TF_PLAT_STR_(__APPLE__)
+#else
+#define TF_PLAT_STR___APPLE__
+#endif
+#if defined(__BYTE_ORDER__)
+#define TF_PLAT_STR___BYTE_ORDER__ TF_PLAT_STR_(__BYTE_ORDER__)
+#else
+#define TF_PLAT_STR___BYTE_ORDER__
+#endif
+#if defined(__CYGWIN__)
+#define TF_PLAT_STR___CYGWIN__ TF_PLAT_STR_(__CYGWIN__)
+#else
+#define TF_PLAT_STR___CYGWIN__
+#endif
+#if defined(__FreeBSD__)
+#define TF_PLAT_STR___FreeBSD__ TF_PLAT_STR_(__FreeBSD__)
+#else
+#define TF_PLAT_STR___FreeBSD__
+#endif
+#if defined(__LITTLE_ENDIAN__)
+#define TF_PLAT_STR___LITTLE_ENDIAN__ TF_PLAT_STR_(__LITTLE_ENDIAN__)
+#else
+#define TF_PLAT_STR___LITTLE_ENDIAN__
+#endif
+#if defined(__NetBSD__)
+#define TF_PLAT_STR___NetBSD__ TF_PLAT_STR_(__NetBSD__)
+#else
+#define TF_PLAT_STR___NetBSD__
+#endif
+#if defined(__OpenBSD__)
+#define TF_PLAT_STR___OpenBSD__ TF_PLAT_STR_(__OpenBSD__)
+#else
+#define TF_PLAT_STR___OpenBSD__
+#endif
+#if defined(____MSYS__)
+#define TF_PLAT_STR_____MSYS__ TF_PLAT_STR_(____MSYS__)
+#else
+#define TF_PLAT_STR_____MSYS__
+#endif
+#if defined(__aarch64__)
+#define TF_PLAT_STR___aarch64__ TF_PLAT_STR_(__aarch64__)
+#else
+#define TF_PLAT_STR___aarch64__
+#endif
+#if defined(__alpha__)
+#define TF_PLAT_STR___alpha__ TF_PLAT_STR_(__alpha__)
+#else
+#define TF_PLAT_STR___alpha__
+#endif
+#if defined(__arm__)
+#define TF_PLAT_STR___arm__ TF_PLAT_STR_(__arm__)
+#else
+#define TF_PLAT_STR___arm__
+#endif
+#if defined(__i386__)
+#define TF_PLAT_STR___i386__ TF_PLAT_STR_(__i386__)
+#else
+#define TF_PLAT_STR___i386__
+#endif
+#if defined(__i686__)
+#define TF_PLAT_STR___i686__ TF_PLAT_STR_(__i686__)
+#else
+#define TF_PLAT_STR___i686__
+#endif
+#if defined(__ia64__)
+#define TF_PLAT_STR___ia64__ TF_PLAT_STR_(__ia64__)
+#else
+#define TF_PLAT_STR___ia64__
+#endif
+#if defined(__linux__)
+#define TF_PLAT_STR___linux__ TF_PLAT_STR_(__linux__)
+#else
+#define TF_PLAT_STR___linux__
+#endif
+#if defined(__mips32__)
+#define TF_PLAT_STR___mips32__ TF_PLAT_STR_(__mips32__)
+#else
+#define TF_PLAT_STR___mips32__
+#endif
+#if defined(__mips64__)
+#define TF_PLAT_STR___mips64__ TF_PLAT_STR_(__mips64__)
+#else
+#define TF_PLAT_STR___mips64__
+#endif
+#if defined(__powerpc64__)
+#define TF_PLAT_STR___powerpc64__ TF_PLAT_STR_(__powerpc64__)
+#else
+#define TF_PLAT_STR___powerpc64__
+#endif
+#if defined(__powerpc__)
+#define TF_PLAT_STR___powerpc__ TF_PLAT_STR_(__powerpc__)
+#else
+#define TF_PLAT_STR___powerpc__
+#endif
+#if defined(__riscv___)
+#define TF_PLAT_STR___riscv___ TF_PLAT_STR_(__riscv___)
+#else
+#define TF_PLAT_STR___riscv___
+#endif
+#if defined(__s390x__)
+#define TF_PLAT_STR___s390x__ TF_PLAT_STR_(__s390x__)
+#else
+#define TF_PLAT_STR___s390x__
+#endif
+#if defined(__sparc64__)
+#define TF_PLAT_STR___sparc64__ TF_PLAT_STR_(__sparc64__)
+#else
+#define TF_PLAT_STR___sparc64__
+#endif
+#if defined(__sparc__)
+#define TF_PLAT_STR___sparc__ TF_PLAT_STR_(__sparc__)
+#else
+#define TF_PLAT_STR___sparc__
+#endif
+#if defined(__x86_64__)
+#define TF_PLAT_STR___x86_64__ TF_PLAT_STR_(__x86_64__)
+#else
+#define TF_PLAT_STR___x86_64__
+#endif
+
+#endif  // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_
diff --git a/tensorflow/core/platform/platform_strings_test.cc b/tensorflow/core/platform/platform_strings_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5251f10d4124650dd7b2d260b1665b988bb663c9
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings_test.cc
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Test for the platform_strings.h header file.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/platform_strings.h"
+
+// Embed the platform strings in this binary.
+TF_PLATFORM_STRINGS()
+
+// A vector of strings.
+typedef std::vector<std::string> string_vec;
+
+// Append to *found the strings within the named file with the platform_strings
+// magic prefix, and return true; or return false on error.
+
+// Print the platform strings embedded in the binary file_name and return 0,
+// on on error return 2.
+static int PrintStrings(const std::string file_name) {
+  int rc = 0;
+  string_vec str;
+  if (!tensorflow::GetPlatformStrings(file_name, &str)) {
+    for (int i = 0; i != str.size(); i++) {
+      printf("%s\n", str[i].c_str());
+    }
+  } else {
+    perror(file_name.c_str());
+    rc = 2;
+  }
+  return rc;
+}
+
+// Return whether str[] conatins a string with prefix "macro_name="; if so,
+// set *pvalue to the suffix.
+static bool GetValue(const string_vec &str, const std::string &macro_name,
+                     std::string *pvalue) {
+  std::string nam_eq = macro_name + "=";
+  int i = 0;
+  while (i != str.size() && !tensorflow::str_util::StartsWith(str[i], nam_eq)) {
+    i++;
+  }
+  bool found = (i != str.size());
+  if (found) {
+    *pvalue = str[i].substr(nam_eq.size());
+  }
+  return found;
+}
+
+// If macro_name[] is not equal to value[], check that str[] contains the
+// string "macro_name=value".  Otherwise, check that str[] does not contain any
+// string starting with macro_name=".
+static void CheckStr(const string_vec &str, const std::string &macro_name,
+                     const std::string &value) {
+  std::string value_from_str;
+  if (GetValue(str, macro_name, &value_from_str)) {
+    if (value != value_from_str) {
+      // Output everything found, to aid debugging.
+      LOG(ERROR) << "===== value=" << value
+                 << "  value_from_str=" << value_from_str;
+      for (int i = 0; i != str.size(); i++) {
+        LOG(ERROR) << "% " << str[i];
+      }
+      LOG(ERROR) << "=====";
+    }
+    CHECK_EQ(value, value_from_str) << " " << macro_name << ": bad value";
+  } else {
+    // If the string is not found, we expect value to be macro_name.
+    if (value != macro_name) {
+      // Output everything found, to aid debugging.
+      LOG(ERROR) << "===== value=" << value << "  macro_name=" << macro_name;
+      for (int i = 0; i != str.size(); i++) {
+        LOG(ERROR) << "% " << str[i];
+      }
+      LOG(ERROR) << "=====";
+    }
+    CHECK_EQ(value, macro_name) << " " << macro_name << ": not found in binary";
+  }
+}
+
+// Helper for AS_STR(), below, to perform macro expansion.
+#define AS_STR_1_(x) #x
+
+// Yield x after macro expansion as a nul-terminated constant string.
+#define AS_STR(x) AS_STR_1_(x)
+
+// Run the test, and return 0 on success, 2 otherwise.
+static int RunTest(const std::string &binary_name) {
+  int rc = 0;
+  string_vec str;
+
+  if (!tensorflow::GetPlatformStrings(binary_name, &str)) {
+    CheckStr(str, "__linux__", AS_STR(__linux__));
+    CheckStr(str, "_WIN32", AS_STR(_WIN32));
+    CheckStr(str, "__APPLE__", AS_STR(__APPLE__));
+    CheckStr(str, "__x86_64__", AS_STR(__x86_64__));
+    CheckStr(str, "__aarch64__", AS_STR(__aarch64__));
+    CheckStr(str, "__powerpc64__", AS_STR(__powerpc64__));
+    CheckStr(str, "TF_PLAT_STR_VERSION", TF_PLAT_STR_VERSION_);
+  } else {
+    perror(binary_name.c_str());
+    rc = 2;
+  }
+
+  return rc;
+}
+
+int main(int argc, char *argv[]) {
+  tensorflow::Env *env = tensorflow::Env::Default();
+  static const char usage[] = "usage: platform_strings_test [file...]";
+  int rc = 0;
+  tensorflow::port::InitMain(usage, &argc, &argv);
+  if (argc == 1) {
+    printf("rc=%d\n", PrintStrings(env->GetExecutablePath()));
+    rc = RunTest(env->GetExecutablePath());
+  } else {
+    for (int argn = 1; argn != argc; argn++) {
+      rc |= PrintStrings(argv[argn]);
+    }
+  }
+  return rc;
+}
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index af95d8201ed186b2803942446cc8f31fe3601899..0a939aef25236dc33e2be8ec1d76f9ea0075e350 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -123,7 +123,7 @@ class PosixEnv : public Env {
     string bin_path = this->GetExecutablePath();
     string runfiles_path = bin_path + ".runfiles/org_tensorflow";
     Status s = this->IsDirectory(runfiles_path);
-    if (!s.ok()) {
+    if (s.ok()) {
       return runfiles_path;
     } else {
       return bin_path.substr(0, bin_path.find_last_of("/\\"));
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index c7afab9583cee1612a8c12b6f9fff7b89af1d86a..fc48cab56460d85d9997f57cb761481c77413d00 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -240,11 +240,14 @@ Status PosixFileSystem::DeleteFile(const string& fname) {
 }
 
 Status PosixFileSystem::CreateDir(const string& name) {
-  Status result;
-  if (mkdir(TranslateName(name).c_str(), 0755) != 0) {
-    result = IOError(name, errno);
+  string translated = TranslateName(name);
+  if (translated.empty()) {
+    return errors::AlreadyExists(name);
   }
-  return result;
+  if (mkdir(translated.c_str(), 0755) != 0) {
+    return IOError(name, errno);
+  }
+  return Status::OK();
 }
 
 Status PosixFileSystem::DeleteDir(const string& name) {
diff --git a/tensorflow/core/platform/regexp.h b/tensorflow/core/platform/regexp.h
index a4eedf30454567074191b36e0b87bf53987ffc42..ca9ca1e2442d272fc12e29dce81b6c633de97b35 100644
--- a/tensorflow/core/platform/regexp.h
+++ b/tensorflow/core/platform/regexp.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_REGEXP_H_
 #define TENSORFLOW_PLATFORM_REGEXP_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -23,7 +24,7 @@ limitations under the License.
     defined(GOOGLE_RE2)
 #include "tensorflow/core/platform/google/build_config/re2.h"
 namespace tensorflow {
-typedef ::StringPiece RegexpStringPiece;
+typedef absl::string_view RegexpStringPiece;
 }  // namespace tensorflow
 
 #else
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index f26ccd1662e1b7cf020f76f842302486a5dc793e..77ce2026d9d2cdda7ef1ea0ad6bb71050a6467af 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -164,7 +164,7 @@ class WindowsEnv : public Env {
     string bin_path = this->GetExecutablePath();
     string runfiles_path = bin_path + ".runfiles\\org_tensorflow";
     Status s = this->IsDirectory(runfiles_path);
-    if (!s.ok()) {
+    if (s.ok()) {
       return runfiles_path;
     } else {
       return bin_path.substr(0, bin_path.find_last_of("/\\"));
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 6cf79634d7a4f1591185c594ce66fb67ddb85309..993b9906b1c072cb48c816855fb2fc1498ae3f40 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -439,6 +439,9 @@ Status WindowsFileSystem::DeleteFile(const string& fname) {
 Status WindowsFileSystem::CreateDir(const string& name) {
   Status result;
   std::wstring ws_name = Utf8ToWideChar(name);
+  if (ws_name.empty()) {
+    return errors::AlreadyExists(name);
+  }
   if (_wmkdir(ws_name.c_str()) != 0) {
     result = IOError("Failed to create a directory: " + name, errno);
   }
diff --git a/tensorflow/core/profiler/internal/tfprof_code.cc b/tensorflow/core/profiler/internal/tfprof_code.cc
index 744e1e95deb458e4399cceba4c91a12eed30be7c..0c26855a43ec40992687cc9c3dd0a0d93e8594df 100644
--- a/tensorflow/core/profiler/internal/tfprof_code.cc
+++ b/tensorflow/core/profiler/internal/tfprof_code.cc
@@ -183,7 +183,7 @@ class Samples {
   // This method adds the statistics of graph nodes created by the python
   // call.
   void Add(const CodeNode* node, const std::vector<uint64>& location_ids) {
-    // displayed leaf might not be true leaf. Retrive the true leaves for
+    // displayed leaf might not be true leaf. Retrieve the true leaves for
     // stats.
     std::vector<const CodeNode*> all_leaf = FetchAllLeaf(node);
     CHECK(!all_leaf.empty()) << node->name();
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 86cb20de7bbb4f36bfaa431bc2b81a00dace84df..8796234be0cced4c977a0529aefa10cd16961c1b 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -151,7 +151,7 @@ void ExecStep::AddMemoryStats(const string& dev,
   }
 
   // TODO(xpan): Make this more accurate:
-  // High level: Memory tracking is suspicous and requires large scale
+  // High level: Memory tracking is suspicious and requires large scale
   // clean up.
   // Investigte the memory usage difference between CPU/GPU with OpViewTest.
   //
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 104ab039cb71e5ac2ed2b36744c3c481d24fd63f..b3dc5dccc02737202f9f5ced78471f332efd2eba 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -5,7 +5,7 @@ option cc_enable_arenas = true;
 option java_outer_classname = "ConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
+// add go_package externally with copybara
 import "tensorflow/core/framework/cost_graph.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/step_stats.proto";
@@ -148,6 +148,14 @@ message GPUOptions {
     // for each GPUDevice.  Default value is 0, which is automatically
     // converted to 1.
     int32 num_dev_to_dev_copy_streams = 3;
+
+    // If non-empty, defines a good GPU ring order on a single worker based on
+    // device interconnect.  This assumes that all workers have the same GPU
+    // topology.  Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4".
+    // This ring order is used by the RingReducer implementation of
+    // CollectiveReduce, and serves as an override to automatic ring order
+    // generation in OrderTaskDeviceMap() during CollectiveParam resolution.
+    string collective_ring_order = 4;
   }
 
   // Everything inside experimental is subject to change and is not subject
@@ -283,6 +291,13 @@ message RPCOptions {
   // transport for client-master communication that avoids the RPC
   // stack. This option is primarily for used testing the RPC stack.
   bool use_rpc_for_inprocess_master = 1;
+
+  // The compression algorithm to be used. One of "deflate", "gzip".
+  string compression_algorithm = 2;
+
+  // If compression_algorithm is set, the compression level to be used.
+  // From 0 (no compression), up to 3.
+  int32 compression_level = 3;
 };
 
 // Session configuration parameters.
@@ -400,6 +415,16 @@ message ConfigProto {
     // Which executor to use, the default executor will be used
     // if it is an empty string or "DEFAULT"
     string executor_type = 3;
+
+    // Guidance to formatting of large RecvBuf fields for transfer.
+    // Any positive value sets the max chunk size.  0 defaults to 4096.
+    // Any negative value indicates no max, i.e. one chunk only.
+    int32 recv_buf_max_chunk = 4;
+
+    // If true, and supported by the platform, the runtime will attempt to
+    // use NUMA affinity where applicable.  One consequence will be the
+    // existence of as many CPU devices as there are available NUMA nodes.
+    bool use_numa_affinity = 5;
   };
 
   Experimental experimental = 16;
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index 03022875e64ace5d680e969138727efb1f522097..c104463c51c7e7be02430c7750ebacee60ed50e4 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -224,7 +224,7 @@ message CloseSessionResponse {
 message ResetRequest {
   // A list of container names, which may be empty.
   //
-  // If 'container' is not empty, releases resoures in the given
+  // If 'container' is not empty, releases resources in the given
   // containers in all devices.
   //
   // If 'container' is empty, releases resources in the default
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 8c31468ff5ab17fbc1a0448655cc1a39d1141972..515d673828e3792ac6f4268fd55b58e43aab509b 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -38,7 +38,7 @@ message RewriterConfig {
   }
 
   // Enum controlling the number of times to run optimizers. The default is to
-  // run them once.
+  // run them twice.
   enum NumIterationsType {
     DEFAULT_NUM_ITERS = 0;
     ONE = 1;
@@ -128,11 +128,20 @@ message RewriterConfig {
   // "gradients/", the default, it will match node name "gradients/foo",
   // "foo/gradients/bar", but not "foo_gradients/"
   string memory_optimizer_target_node_name_scope = 6;
+  // Maximum number of milliseconds to spend optimizing a single graph before
+  // timing out. If equal to 0 the system picks a default (currently 5 minutes).
+  // If less than 0 the optimizer will never time out.
+  int64 meta_optimizer_timeout_ms = 20;
 
   // Configures AutoParallel optimization passes either through the
   // meta-optimizer or when manually specified through the optimizers field.
   AutoParallelOptions auto_parallel = 5;
 
+  // If true, any optimization pass failing will cause the MetaOptimizer to
+  // stop with an error. By default - or when set to false, failing passes are
+  // skipped silently.
+  bool fail_on_optimizer_errors = 21;
+
   ScopedAllocatorOptions scoped_allocator_opts = 16;
 
   // If non-empty, will use this as an alternative way to specify a list of
diff --git a/tensorflow/core/protobuf/transport_options.proto b/tensorflow/core/protobuf/transport_options.proto
index d7b1bddbbe3d7d9bc78f499c68ab8b64766dbb88..1d32475e9b9d6cfc19d9764d6d6ff0ab8f3cec23 100644
--- a/tensorflow/core/protobuf/transport_options.proto
+++ b/tensorflow/core/protobuf/transport_options.proto
@@ -4,5 +4,5 @@ package tensorflow;
 
 // Extra data needed on a non-RDMA RecvBufResponse.
 message RecvBufRespExtra {
-  bytes tensor_content = 1;
+  repeated bytes tensor_content = 1;
 };
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index b043a694318d52ed92adfcb8dc7bb90702fc2f31..07eeeb4f032f199fe50b315c39b5e9835770d5c7 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 11
+#define TF_MINOR_VERSION 12
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index 55f1e30880bce8dbad8deedf012ea60fb43e3de1..f1196fdfec213c286a489b948aa7e17580048f95 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <cinttypes>
 #include <string>
 #include <vector>
 
@@ -70,7 +71,7 @@ bool ParseInt64Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
       str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     int64_t parsed_int64;
-    if (sscanf(arg.data(), "%ld%c", &parsed_int64, &extra) != 1) {
+    if (sscanf(arg.data(), "%" SCNd64 "%c", &parsed_int64, &extra) != 1) {
       LOG(ERROR) << "Couldn't interpret value " << arg << " for flag " << flag
                  << ".";
       *value_parsing_ok = false;
diff --git a/tensorflow/core/util/ctc/ctc_beam_entry.h b/tensorflow/core/util/ctc/ctc_beam_entry.h
index 24002e72a0920fc7a12203e8b843ae96626b4660..7382b8e6849b8884f31a89fd4924704fd7dfe7f0 100644
--- a/tensorflow/core/util/ctc/ctc_beam_entry.h
+++ b/tensorflow/core/util/ctc/ctc_beam_entry.h
@@ -146,4 +146,4 @@ class BeamComparer {
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_ENTRY_H_
-// LINT.ThenChange(//tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h)
+// LINT.ThenChange(//tensorflow/lite/experimental/kernels/ctc_beam_entry.h)
diff --git a/tensorflow/core/util/ctc/ctc_beam_scorer.h b/tensorflow/core/util/ctc/ctc_beam_scorer.h
index 1e45a8abd39a757f4a68d546a8326464f5afcace..fc63dfb0fd29010741d96ca5c010499c06b5da74 100644
--- a/tensorflow/core/util/ctc/ctc_beam_scorer.h
+++ b/tensorflow/core/util/ctc/ctc_beam_scorer.h
@@ -74,4 +74,4 @@ class BaseBeamScorer {
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SCORER_H_
-// LINT.ThenChange(//tensorflow/contrib/lite/experimental/kernels/ctc_beam_scorer.h)
+// LINT.ThenChange(//tensorflow/lite/experimental/kernels/ctc_beam_scorer.h)
diff --git a/tensorflow/core/util/ctc/ctc_beam_search.h b/tensorflow/core/util/ctc/ctc_beam_search.h
index 6fbb1ed0dae179f6abb4d5146169b464c7959a76..f2022d486c76e28fdd656483416c585546c2a211 100644
--- a/tensorflow/core/util/ctc/ctc_beam_search.h
+++ b/tensorflow/core/util/ctc/ctc_beam_search.h
@@ -431,4 +431,4 @@ Status CTCBeamSearchDecoder<CTCBeamState, CTCBeamComparer>::TopPaths(
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_UTIL_CTC_CTC_BEAM_SEARCH_H_
-// LINT.ThenChange(//tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h)
+// LINT.ThenChange(//tensorflow/lite/experimental/kernels/ctc_beam_search.h)
diff --git a/tensorflow/core/util/ctc/ctc_decoder.h b/tensorflow/core/util/ctc/ctc_decoder.h
index b55d7d77ac0f070c4ff616558793c8be34c12556..f5c9e4bb596dac8c64750bcfc482ee47322de983 100644
--- a/tensorflow/core/util/ctc/ctc_decoder.h
+++ b/tensorflow/core/util/ctc/ctc_decoder.h
@@ -113,4 +113,4 @@ class CTCGreedyDecoder : public CTCDecoder {
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_UTIL_CTC_CTC_DECODER_H_
-// LINT.ThenChange(//tensorflow/contrib/lite/experimental/kernels/ctc_decoder.h)
+// LINT.ThenChange(//tensorflow/lite/experimental/kernels/ctc_decoder.h)
diff --git a/tensorflow/core/util/ctc/ctc_loss_util.h b/tensorflow/core/util/ctc/ctc_loss_util.h
index 054412d388dd53874f3b1b282dc8d2854fe97b27..df0de926d9a8b630dcd08cd6428dd6f3454548a5 100644
--- a/tensorflow/core/util/ctc/ctc_loss_util.h
+++ b/tensorflow/core/util/ctc/ctc_loss_util.h
@@ -47,4 +47,4 @@ inline float LogSumExp(float log_prob_1, float log_prob_2) {
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_UTIL_CTC_CTC_LOSS_UTIL_H_
-// LINT.ThenChange(//tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h)
+// LINT.ThenChange(//tensorflow/lite/experimental/kernels/ctc_loss_util.h)
diff --git a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
index 732ed33ede17bc90d3301d3f1eee6302a96028d7..2b035ab0e9c8500931665890a637ea6f3242ba22 100644
--- a/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
+++ b/tensorflow/core/util/cuda_kernel_helper_test.cu.cc
@@ -131,7 +131,7 @@ class CudaLaunchConfigTest : public ::testing::Test {
  protected:
   const int bufsize = 1024;
   int* outbuf = nullptr;
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice d = Eigen::GpuDevice(&stream);
 
   virtual void SetUp() {
diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h
index d0d95736d3f1c37055b5383aa4e3141145838aab..080d4067cec69084b54ba1c096d01198a8e48d20 100644
--- a/tensorflow/core/util/cuda_launch_config.h
+++ b/tensorflow/core/util/cuda_launch_config.h
@@ -128,12 +128,12 @@ inline CudaLaunchConfig GetCudaLaunchConfig(int work_element_count,
   CudaLaunchConfig config;
   const int virtual_thread_count = work_element_count;
   const int physical_thread_count = std::min(
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor(),
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
       virtual_thread_count);
-  const int thread_per_block = std::min(1024, d.maxCudaThreadsPerBlock());
+  const int thread_per_block = std::min(1024, d.maxGpuThreadsPerBlock());
   const int block_count =
       std::min(DivUp(physical_thread_count, thread_per_block),
-               d.getNumCudaMultiProcessors());
+               d.getNumGpuMultiProcessors());
 
   config.virtual_thread_count = virtual_thread_count;
   config.thread_per_block = thread_per_block;
@@ -184,7 +184,7 @@ inline CudaLaunchConfig GetCudaLaunchConfigFixedBlockSize(
   cudaError_t err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
       &block_count, func, fixed_block_size, dynamic_shared_memory_size);
   CHECK_EQ(err, cudaSuccess);
-  block_count = std::min(block_count * d.getNumCudaMultiProcessors(),
+  block_count = std::min(block_count * d.getNumGpuMultiProcessors(),
                          DivUp(work_element_count, fixed_block_size));
 
   config.virtual_thread_count = work_element_count;
@@ -213,7 +213,7 @@ inline Cuda2DLaunchConfig GetCuda2DLaunchConfig(int xdim, int ydim,
   int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
 
   const int physical_thread_count =
-      d.getNumCudaMultiProcessors() * d.maxCudaThreadsPerMultiProcessor();
+      d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor();
 
   const int max_blocks = std::max(physical_thread_count / kThreadsPerBlock, 1);
 
diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc
index 8c24076aa9c708769f28c048a4ab5dde993eecd1..cb088faec1ece7cffde4499df900be9d8dd16bc5 100644
--- a/tensorflow/core/util/device_name_utils.cc
+++ b/tensorflow/core/util/device_name_utils.cc
@@ -480,4 +480,16 @@ std::vector<string> DeviceNameUtils::GetLocalNamesForDeviceMappings(
   }
 }
 
+/*static*/ Status DeviceNameUtils::DeviceNameToCpuDeviceName(
+    const string& device_name, string* host_device_name) {
+  DeviceNameUtils::ParsedName device;
+  if (!DeviceNameUtils::ParseFullName(device_name, &device)) {
+    return errors::Internal("Could not parse device name ", device_name);
+  }
+  device.type = "CPU";
+  device.id = 0;
+  *host_device_name = DeviceNameUtils::ParsedNameToString(device);
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h
index 3f0bc60562329b989682268e6239ca965a6fdc8b..bb5e2b3f0c42b321bc7ab45cdad2ec951671be96 100644
--- a/tensorflow/core/util/device_name_utils.h
+++ b/tensorflow/core/util/device_name_utils.h
@@ -169,6 +169,11 @@ class DeviceNameUtils {
   // mapping.
   static std::vector<string> GetLocalNamesForDeviceMappings(
       const ParsedName& pn);
+
+  // Returns name of the CPU:0 device on the same host as the device
+  // `device_name`.
+  static Status DeviceNameToCpuDeviceName(const string& device_name,
+                                          string* host_device_name);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/dump_graph.cc b/tensorflow/core/util/dump_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..523d37ecc244b3634545ea82385b377c871569c8
--- /dev/null
+++ b/tensorflow/core/util/dump_graph.cc
@@ -0,0 +1,131 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helper functions for dumping Graphs, GraphDefs, and FunctionDefs to files for
+// debugging.
+
+#include "tensorflow/core/util/dump_graph.h"
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+namespace {
+
+struct NameCounts {
+  mutex counts_mutex;
+  std::unordered_map<string, int> counts;
+};
+
+string MakeUniqueFilename(string name) {
+  static NameCounts& instance = *new NameCounts;
+
+  // Remove illegal characters from `name`.
+  for (int i = 0; i < name.size(); ++i) {
+    char ch = name[i];
+    if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') {
+      name[i] = '_';
+    }
+  }
+
+  int count;
+  {
+    mutex_lock lock(instance.counts_mutex);
+    count = instance.counts[name]++;
+  }
+
+  string filename = name;
+  if (count > 0) {
+    absl::StrAppend(&filename, "_", count);
+  }
+  absl::StrAppend(&filename, ".pbtxt");
+  return filename;
+}
+
+#if defined(TENSORFLOW_LITE_PROTOS)
+Status WriteToFile(const string& filepath,
+                   const ::tensorflow::protobuf::MessageLite& proto) {
+  string s;
+  if (!SerializeToStringDeterministic(proto, &s)) {
+    return errors::Internal("Failed to serialize proto to string.");
+  }
+  return WriteStringToFile(Env::Default(), filepath, s);
+}
+#else
+Status WriteToFile(const string& filepath,
+                   const ::tensorflow::protobuf::Message& proto) {
+  return WriteTextProto(Env::Default(), filepath, proto);
+}
+#endif
+
+template <class T>
+string WriteTextProtoToUniqueFile(Env* env, const string& name,
+                                  const char* proto_type, T& proto,
+                                  const string& dirname) {
+  const char* dir = nullptr;
+  if (!dirname.empty()) {
+    dir = dirname.c_str();
+  } else {
+    dir = getenv("TF_DUMP_GRAPH_PREFIX");
+  }
+  if (!dir) {
+    return "(TF_DUMP_GRAPH_PREFIX not specified)";
+  }
+  Status status = env->RecursivelyCreateDir(dir);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to create " << dir << " for dumping " << proto_type
+                 << ": " << status;
+    return "(unavailable)";
+  }
+  string filepath = absl::StrCat(dir, "/", MakeUniqueFilename(name));
+  status = WriteToFile(filepath, proto);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath
+                 << " : " << status;
+    return "(unavailable)";
+  }
+  LOG(INFO) << "Dumped " << proto_type << " to " << filepath;
+  return filepath;
+}
+
+}  // anonymous namespace
+
+string DumpGraphDefToFile(const string& name, GraphDef const& graph_def,
+                          const string& dirname) {
+  return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef", graph_def,
+                                    dirname);
+}
+
+string DumpGraphToFile(const string& name, Graph const& graph,
+                       const FunctionLibraryDefinition* flib_def,
+                       const string& dirname) {
+  GraphDef graph_def;
+  graph.ToGraphDef(&graph_def);
+  if (flib_def) {
+    *graph_def.mutable_library() = flib_def->ToProto();
+  }
+  return DumpGraphDefToFile(name, graph_def, dirname);
+}
+
+string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef,
+                             const string& dirname) {
+  return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef,
+                                    dirname);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/dump_graph.h b/tensorflow/core/util/dump_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..03dc807a2b342edaea57ad8558495462a6af0109
--- /dev/null
+++ b/tensorflow/core/util/dump_graph.h
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helper functions for dumping Graphs, GraphDefs, and FunctionDefs to files for
+// debugging.
+
+#ifndef TENSORFLOW_CORE_UTIL_DUMP_GRAPH_H_
+#define TENSORFLOW_CORE_UTIL_DUMP_GRAPH_H_
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Dumps 'graph_def' to a file, as a GraphDef text proto. Returns the file name
+// chosen.
+//
+// Automatically picks a file name. Prefixes 'name' with the value of the
+// TF_DUMP_GRAPH_PREFIX environment variable if 'dirname' is empty, and suffixes
+// 'name' with ".pbtxt" to form a name. If a graph has already been dumped by
+// this process with the same name, suffixes with "_n.pbtxt", where 'n' is a
+// sequence number.
+string DumpGraphDefToFile(const string& name, GraphDef const& graph_def,
+                          const string& dirname = "");
+
+// Similar to DumpGraphDefToFile, but builds the GraphDef to dump from a 'graph'
+// and an optional function library 'flib_def'. Returns the file name chosen.
+string DumpGraphToFile(const string& name, Graph const& graph,
+                       const FunctionLibraryDefinition* flib_def = nullptr,
+                       const string& dirname = "");
+
+// Similar to DumpGraphDefToFile, but dumps a function as a FunctionDef text
+// proto. Returns the file name chosen.
+string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef,
+                             const string& dirname = "");
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_DUMP_GRAPH_H_
diff --git a/tensorflow/core/util/dump_graph_test.cc b/tensorflow/core/util/dump_graph_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d01c1c5a0290197d8b52899ab703c1f183c0545b
--- /dev/null
+++ b/tensorflow/core/util/dump_graph_test.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/dump_graph.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(DumpGraph, DumpGraphToFileSuccess) {
+  Graph graph(OpRegistry::Global());
+  Node* node;
+  TF_CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node));
+
+  setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1);
+  string ret = DumpGraphToFile("graph", graph);
+  EXPECT_EQ(ret, io::JoinPath(testing::TmpDir(), "graph.pbtxt"));
+  ret = DumpGraphToFile("graph", graph);
+  EXPECT_EQ(ret, io::JoinPath(testing::TmpDir(), "graph_1.pbtxt"));
+
+  GraphDef gdef;
+  TF_CHECK_OK(ReadTextProto(
+      Env::Default(), io::JoinPath(testing::TmpDir(), "graph.pbtxt"), &gdef));
+  string read, written;
+  gdef.AppendToString(&read);
+  graph.ToGraphDefDebug().AppendToString(&written);
+  EXPECT_EQ(read, written);
+}
+
+TEST(DumpGraph, DumpGraphToFileNoEnvPrefix) {
+  Graph graph(OpRegistry::Global());
+  unsetenv("TF_DUMP_GRAPH_PREFIX");
+  string ret = DumpGraphToFile("graph", graph);
+  EXPECT_EQ(ret, "(TF_DUMP_GRAPH_PREFIX not specified)");
+}
+
+TEST(DumpGraph, DumpFunctionDefToFileSuccess) {
+  FunctionDef fdef;
+  setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1);
+  string ret = DumpFunctionDefToFile("function", fdef);
+  EXPECT_EQ(ret, io::JoinPath(testing::TmpDir(), "function.pbtxt"));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc
index e52d55e2ffef00feaabc25454da2979284034dff..3cc75bbd1f353183184462ec9495c0492cf1442b 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing.cc
@@ -16,13 +16,13 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "tensorflow/core/example/example.pb.h"
 #include "tensorflow/core/example/feature.pb_text.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
-#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -164,7 +164,7 @@ class Feature {
         while (!stream.ExpectAtEnd()) {
           uint32 buffer32;
           if (!stream.ReadLittleEndian32(&buffer32)) return false;
-          float_list->push_back(bit_cast<float>(buffer32));
+          float_list->push_back(absl::bit_cast<float>(buffer32));
         }
 
         stream.PopLimit(packed_limit);
@@ -173,7 +173,7 @@ class Feature {
           if (!stream.ExpectTag(kFixed32Tag(1))) return false;
           uint32 buffer32;
           if (!stream.ReadLittleEndian32(&buffer32)) return false;
-          float_list->push_back(bit_cast<float>(buffer32));
+          float_list->push_back(absl::bit_cast<float>(buffer32));
         }
       }
     }
@@ -1600,7 +1600,7 @@ inline int ParseFloatFeature(protobuf::io::CodedInputStream* stream,
           return -1;
         }
         if (out != nullptr) {
-          *out++ = bit_cast<float>(buffer32);
+          *out++ = absl::bit_cast<float>(buffer32);
         }
         num_elements++;
       }
@@ -1613,7 +1613,7 @@ inline int ParseFloatFeature(protobuf::io::CodedInputStream* stream,
           return -1;
         }
         if (out != nullptr) {
-          *out++ = bit_cast<float>(buffer32);
+          *out++ = absl::bit_cast<float>(buffer32);
         }
         num_elements++;
       }
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 04aaea4f894c5c46014167139eb86c63b808c9d7..928807458aca3c79d52e14509eb4238e134b5cdf 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
 #ifdef INTEL_MKL
 
-#include <string>
 #include <memory>
+#include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -54,9 +54,9 @@ limitations under the License.
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
-#include "tensorflow/core/util/env_var.h"
 
 #ifndef INTEL_MKL_ML_ONLY
 #include "mkldnn.hpp"
@@ -83,7 +83,12 @@ namespace tensorflow {
 // MKL operation, and did not go through a conversion to a standard
 // Tensorflow tensor.
 
+// For use with MKL ML, has been deprecated
 typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
+
+// The dimensions order that MKL DNN internally uses for 2D activations
+// [Batch, Channel, Height, Width] and
+// for 2D filters [Out_Channel, In_Channel, Height, Width].
 typedef enum {
   Dim_N = 0,
   Dim_C = 1,
@@ -93,6 +98,9 @@ typedef enum {
   Dim_I = 1
 } MklDnnDims;
 
+// The dimensions order that MKL DNN internally uses for 3D activations
+// [Batch, Channel, Depth, Height, Width] and
+// for 3D filters [Out_Channel, In_Channel, Depth, Height, Width].
 typedef enum {
   Dim3d_N = 0,
   Dim3d_C = 1,
@@ -103,6 +111,13 @@ typedef enum {
   Dim3d_I = 1
 } MklDnnDims3D;
 
+// Enum used to templatize MklOp kernel implementations
+// that support both fp32 and int8 versions.
+enum class MklQuantization {
+  QUANTIZED_VERSION,
+  FP_VERSION,
+};
+
 static const int kSmallBatchSize = 32;
 
 #ifdef INTEL_MKL_ML_ONLY
@@ -653,7 +668,6 @@ class MklDnnShape {
     }
   }
 
-
   inline void SetTfDimOrder(const size_t dimension, memory::format format) {
     TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format);
     SetTfDimOrder(dimension, data_format);
@@ -782,7 +796,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
 }
 #else
 using mkldnn::stream;
-template <typename T> class MklDnnData;
+template <typename T>
+class MklDnnData;
 
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
@@ -792,11 +807,12 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
     if (!mkl_shape.IsMklTensor())
       return mkl_tensor;  // return input since it is already TF tensor
 
-    TensorShape output_shape = mkl_shape.GetTfShape();;
+    TensorShape output_shape = mkl_shape.GetTfShape();
+    ;
 
     // Allocate output tensor.
-    context->allocate_temp(DataTypeToEnum<T>::v(),
-        output_shape, &output_tensor);
+    context->allocate_temp(DataTypeToEnum<T>::v(), output_shape,
+                           &output_tensor);
 
     auto cpu_engine = engine(engine::cpu, 0);
     MklDnnData<T> input(&cpu_engine);
@@ -811,7 +827,7 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
     if (input.IsReorderNeeded(output_tf_pd)) {
       std::vector<primitive> net;
       CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net),
-             true);
+               true);
       stream(stream::kind::eager).submit(net).wait();
     } else {
       // If not, just forward input tensor to output tensor.
@@ -1386,6 +1402,18 @@ template <>
 memory::data_type MklDnnType<float>() {
   return memory::data_type::f32;
 }
+template <>
+memory::data_type MklDnnType<quint8>() {
+  return memory::data_type::u8;
+}
+template <>
+memory::data_type MklDnnType<qint8>() {
+  return memory::data_type::s8;
+}
+template <>
+memory::data_type MklDnnType<qint32>() {
+  return memory::data_type::s32;
+}
 
 /// Map TensorFlow's data format into MKL-DNN 3D data format
 /// @input: TensorFlow data format
@@ -1616,6 +1644,9 @@ class MklDnnData {
         cpu_engine_(e) {}
 
   ~MklDnnData() {
+    if (allocated_buffer_ != nullptr) {
+      cpu_allocator()->DeallocateRaw(allocated_buffer_);
+    }
     cpu_engine_ = nullptr;  // We don't own this.
     delete (user_memory_);
     delete (reorder_memory_);
@@ -2003,8 +2034,7 @@ const mkldnn::memory::dims NONE_DIMS = {};
 template <typename T>
 class MklPrimitiveFactory {
  public:
-  MklPrimitiveFactory() {
-  }
+  MklPrimitiveFactory() {}
 
   ~MklPrimitiveFactory() {}
 
@@ -2032,8 +2062,8 @@ class MklPrimitiveFactory {
   /// For those legacy device(w/o AVX512 and AVX2),
   /// MKL-DNN GEMM will be used.
   static inline bool IsLegacyPlatform() {
-    return (!port::TestCPUFeature(port::CPUFeature::AVX512F)
-                   && !port::TestCPUFeature(port::CPUFeature::AVX2));
+    return (!port::TestCPUFeature(port::CPUFeature::AVX512F) &&
+            !port::TestCPUFeature(port::CPUFeature::AVX2));
   }
 
   /// Fuction to check whether primitive memory optimization is enabled
@@ -2054,15 +2084,13 @@ class MklPrimitiveFactory {
 // utility class for creating keys of MKL primitive pool.
 class FactoryKeyCreator {
  public:
-  FactoryKeyCreator() {
-    key_.reserve(kMaxKeyLength);
-  }
+  FactoryKeyCreator() { key_.reserve(kMaxKeyLength); }
 
   ~FactoryKeyCreator() {}
 
   void AddAsKey(const string& str) { Append(str); }
 
-  void AddAsKey(const mkldnn::memory::dims &dims) {
+  void AddAsKey(const mkldnn::memory::dims& dims) {
     for (unsigned int i = 0; i < dims.size(); i++) {
       AddAsKey<int>(dims[i]);
     }
@@ -2070,7 +2098,7 @@ class FactoryKeyCreator {
 
   template <typename T>
   void AddAsKey(const T data) {
-    auto buffer = reinterpret_cast<const char *>(&data);
+    auto buffer = reinterpret_cast<const char*>(&data);
     Append(StringPiece(buffer, sizeof(T)));
   }
 
@@ -2086,7 +2114,6 @@ class FactoryKeyCreator {
   }
 };
 
-
 static inline memory::format get_desired_format(int channel,
                                                 bool is_2d = true) {
   memory::format fmt_desired = memory::format::any;
@@ -2108,37 +2135,34 @@ class MklReorderPrimitive : public MklPrimitive {
   explicit MklReorderPrimitive(const memory* from, const memory* to) {
     Setup(from, to);
   }
-    ~MklReorderPrimitive() {}
+  ~MklReorderPrimitive() {}
 
-    std::shared_ptr<primitive> GetPrimitive() {
-      return context_.reorder_prim;
-    }
+  std::shared_ptr<primitive> GetPrimitive() { return context_.reorder_prim; }
 
-    void SetMemory(const memory* from, const memory* to) {
-      context_.src_mem->set_data_handle(from->get_data_handle());
-      context_.dst_mem->set_data_handle(to->get_data_handle());
-    }
+  void SetMemory(const memory* from, const memory* to) {
+    context_.src_mem->set_data_handle(from->get_data_handle());
+    context_.dst_mem->set_data_handle(to->get_data_handle());
+  }
 
  private:
-    struct ReorderContext {
-      std::shared_ptr<mkldnn::memory> src_mem;
-      std::shared_ptr<mkldnn::memory> dst_mem;
-      std::shared_ptr<primitive> reorder_prim;
-      ReorderContext():
-        src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {
-      }
-    } context_;
-
-    engine cpu_engine_ = engine(engine::cpu, 0);
-
-    void Setup(const memory* from, const memory* to) {
-      context_.src_mem.reset(new memory(
-            {from->get_primitive_desc().desc(), cpu_engine_}, DummyData));
-      context_.dst_mem.reset(new memory(
-            {to->get_primitive_desc().desc(), cpu_engine_}, DummyData));
-      context_.reorder_prim = std::make_shared<mkldnn::reorder>(
-          reorder(*context_.src_mem, *context_.dst_mem));
-    }
+  struct ReorderContext {
+    std::shared_ptr<mkldnn::memory> src_mem;
+    std::shared_ptr<mkldnn::memory> dst_mem;
+    std::shared_ptr<primitive> reorder_prim;
+    ReorderContext()
+        : src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {}
+  } context_;
+
+  engine cpu_engine_ = engine(engine::cpu, 0);
+
+  void Setup(const memory* from, const memory* to) {
+    context_.src_mem.reset(new memory(
+        {from->get_primitive_desc().desc(), cpu_engine_}, DummyData));
+    context_.dst_mem.reset(
+        new memory({to->get_primitive_desc().desc(), cpu_engine_}, DummyData));
+    context_.reorder_prim = std::make_shared<mkldnn::reorder>(
+        reorder(*context_.src_mem, *context_.dst_mem));
+  }
 };
 
 template <typename T>
@@ -2156,41 +2180,51 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory<T> {
     return reorderPrim;
   }
 
-    static MklReorderPrimitiveFactory & GetInstance() {
-      static MklReorderPrimitiveFactory instance_;
-      return instance_;
-    }
+  static MklReorderPrimitiveFactory& GetInstance() {
+    static MklReorderPrimitiveFactory instance_;
+    return instance_;
+  }
 
  private:
-    MklReorderPrimitiveFactory() {}
-    ~MklReorderPrimitiveFactory() {}
-
-    static string CreateKey(const memory* from, const memory* to) {
-      string prefix = "reorder";
-      FactoryKeyCreator key_creator;
-      auto const &from_desc =  from->get_primitive_desc().desc().data;
-      auto const &to_desc =  to->get_primitive_desc().desc().data;
-      memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]);
-      memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]);
-      key_creator.AddAsKey(prefix);
-      key_creator.AddAsKey(static_cast<int>(from_desc.format));
-      key_creator.AddAsKey(static_cast<int>(from_desc.data_type));
-      key_creator.AddAsKey(from_dims);
-      key_creator.AddAsKey(static_cast<int>(to_desc.format));
-      key_creator.AddAsKey(static_cast<int>(to_desc.data_type));
-      key_creator.AddAsKey(to_dims);
-      return key_creator.GetKey();
-    }
-
-    MklPrimitive* GetReorder(const memory* from, const memory* to) {
-      string key = CreateKey(from, to);
-      return this->GetOp(key);
-    }
-
-    void SetReorder(const memory* from, const memory* to, MklPrimitive* op) {
-      string key = CreateKey(from, to);
-      this->SetOp(key, op);
-    }
+  MklReorderPrimitiveFactory() {}
+  ~MklReorderPrimitiveFactory() {}
+
+  static string CreateKey(const memory* from, const memory* to) {
+    string prefix = "reorder";
+    FactoryKeyCreator key_creator;
+    auto const& from_desc = from->get_primitive_desc().desc().data;
+    auto const& to_desc = to->get_primitive_desc().desc().data;
+    const int KIdxFirstStride = 0;
+    memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]);
+    memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]);
+    memory::dims from_strides(
+        from_desc.layout_desc.blocking.strides[KIdxFirstStride],
+        &from_desc.layout_desc.blocking
+             .strides[KIdxFirstStride][from_desc.ndims]);
+    memory::dims to_strides(
+        to_desc.layout_desc.blocking.strides[KIdxFirstStride],
+        &to_desc.layout_desc.blocking.strides[KIdxFirstStride][to_desc.ndims]);
+    key_creator.AddAsKey(prefix);
+    key_creator.AddAsKey(static_cast<int>(from_desc.format));
+    key_creator.AddAsKey(static_cast<int>(from_desc.data_type));
+    key_creator.AddAsKey(from_dims);
+    key_creator.AddAsKey(from_strides);
+    key_creator.AddAsKey(static_cast<int>(to_desc.format));
+    key_creator.AddAsKey(static_cast<int>(to_desc.data_type));
+    key_creator.AddAsKey(to_dims);
+    key_creator.AddAsKey(to_strides);
+    return key_creator.GetKey();
+  }
+
+  MklPrimitive* GetReorder(const memory* from, const memory* to) {
+    string key = CreateKey(from, to);
+    return this->GetOp(key);
+  }
+
+  void SetReorder(const memory* from, const memory* to, MklPrimitive* op) {
+    string key = CreateKey(from, to);
+    this->SetOp(key, op);
+  }
 };
 
 /// Fuction to find(or create) a reorder from memory pointed by
diff --git a/tensorflow/core/util/permutation_input_iterator.h b/tensorflow/core/util/permutation_input_iterator.h
index f6375b25157644cda97aa195958b60ac27b8a4d6..649318ebf3b4542a244f98342702cef087d28fce 100644
--- a/tensorflow/core/util/permutation_input_iterator.h
+++ b/tensorflow/core/util/permutation_input_iterator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_
-#define TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_
+#ifndef TENSORFLOW_CORE_UTIL_PERMUTATION_INPUT_ITERATOR_H_
+#define TENSORFLOW_CORE_UTIL_PERMUTATION_INPUT_ITERATOR_H_
 
 #include <iostream>
 #include <iterator>
@@ -131,4 +131,4 @@ class PermutationInputIterator {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_
+#endif  // TENSORFLOW_CORE_UTIL_PERMUTATION_INPUT_ITERATOR_H_
diff --git a/tensorflow/core/util/permutation_output_iterator.h b/tensorflow/core/util/permutation_output_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..638c0f4545853b28dd5822817c1ec8759bb3a80b
--- /dev/null
+++ b/tensorflow/core/util/permutation_output_iterator.h
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PERMUTATION_OUTPUT_ITERATOR_H_
+#define TENSORFLOW_CORE_UTIL_PERMUTATION_OUTPUT_ITERATOR_H_
+
+#include <iostream>
+#include <iterator>
+
+namespace tensorflow {
+
+template <typename ValueType, typename OutputIteratorT, typename IndexIteratorT,
+          typename OffsetT = ptrdiff_t>
+class PermutationOutputIterator {
+ public:
+  // Required iterator traits
+  typedef PermutationOutputIterator self_type;  ///< My own type
+  typedef OffsetT difference_type;  ///< Type to express the result of
+                                    ///< subtracting one iterator from another
+  typedef ValueType
+      value_type;  ///< The type of the element the iterator can point to
+  typedef ValueType* pointer;    ///< The type of a pointer to an element the
+                                 ///< iterator can point to
+  typedef ValueType& reference;  ///< The type of a reference to an element the
+                                 ///< iterator can point to
+
+  typedef std::random_access_iterator_tag
+      iterator_category;  ///< The iterator category
+
+ private:
+  OutputIteratorT output_itr;
+  IndexIteratorT index_itr;
+
+ public:
+  /// Constructor
+  __host__ __device__ __forceinline__ PermutationOutputIterator(
+      OutputIteratorT output_itr,  ///< Input iterator to wrap
+      IndexIteratorT index_itr)    ///< Conversion functor to wrap
+      : output_itr(output_itr), index_itr(index_itr) {}
+
+  /// Postfix increment
+  __host__ __device__ __forceinline__ self_type operator++(int) {
+    self_type retval = *this;
+    index_itr++;
+    return retval;
+  }
+
+  /// Prefix increment
+  __host__ __device__ __forceinline__ self_type operator++() {
+    index_itr++;
+    return *this;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*() const {
+    return output_itr[*index_itr];
+  }
+
+  /// Addition
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type operator+(Distance n) const {
+    self_type retval(output_itr, index_itr + n);
+    return retval;
+  }
+
+  /// Addition assignment
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type& operator+=(Distance n) {
+    index_itr += n;
+    return *this;
+  }
+
+  /// Subtraction
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type operator-(Distance n) const {
+    self_type retval(output_itr, index_itr - n);
+    return retval;
+  }
+
+  /// Subtraction assignment
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type& operator-=(Distance n) {
+    index_itr -= n;
+    return *this;
+  }
+
+  /// Distance
+  __host__ __device__ __forceinline__ difference_type
+  operator-(self_type other) const {
+    return index_itr - other.index_itr;
+  }
+
+  /// Array subscript
+  template <typename Distance>
+  __host__ __device__ __forceinline__ reference operator[](Distance n) const {
+    return output_itr[index_itr[n]];
+  }
+
+  /// Equal to
+  __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) {
+    return (index_itr == rhs.index_itr && output_itr == rhs.output_itr);
+  }
+
+  /// Not equal to
+  __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) {
+    return !(*this == rhs);
+  }
+
+  /// ostream operator
+  friend std::ostream& operator<<(std::ostream& os, const self_type& itr) {
+    return os;
+  }
+};
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PERMUTATION_OUTPUT_ITERATOR_H_
diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
index cbcb203ee76471674429f133d54d4d0875dd9d5d..8dde14dffcdc5ffe4d64360f3af40521efe29bf8 100644
--- a/tensorflow/core/util/proto/decode.h
+++ b/tensorflow/core/util/proto/decode.h
@@ -318,7 +318,7 @@ inline int ReadPackedPrimitives(const void* bufp, const size_t len,
   return count;
 }
 
-// Reads a primitive value field from a serialized proto.
+// Reads a value of a primitive type field from a serialized proto.
 // The value is parsed from the serialized format, then static_cast
 // to the desired type for TensorFlow and stored.
 template <class ValueType, class TensorType,
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index b9ca8ab395bb85048e9dfca1db48303ce92e8316..89c163aa5133fafc23b01c7153ac40d32efcaaf6 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -238,15 +238,6 @@ class SparseTensor {
   static Status Split(const SparseTensor& tensor, const int split_dim,
                       const int num_split, std::vector<SparseTensor>* result);
 
-  template <typename T>
-  ABSL_DEPRECATED(
-      "Use the form of Split() that takes an output pointer and returns a "
-      "status instead.")
-  static std::vector<SparseTensor> Split(const SparseTensor& tensor,
-                                         const int split_dim,
-                                         const int num_split,
-                                         Status* status = nullptr);
-
   // Slice() will slice the input SparseTensor into a SparseTensor based on
   // specified start and size. Both start and size are 1-D array with each
   // element of the array representing one dimension. The start is the start
@@ -578,10 +569,9 @@ SparseTensor SparseTensor::Concat(
 }
 
 template <typename T>
-std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
-                                              const int split_dim,
-                                              const int num_split,
-                                              Status* status /* = nullptr */) {
+Status SparseTensor::Split(const SparseTensor& input_tensor,
+                           const int split_dim, const int num_split,
+                           std::vector<SparseTensor>* result) {
   std::vector<Tensor> output_indices;
   std::vector<Tensor> output_values;
   std::vector<TensorShape> output_shapes;
@@ -601,17 +591,15 @@ std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
   const int split_dim_size = input_tensor.shape()[split_dim];
   const int split_size = split_dim_size / num_split;
 
-  if (!(num_split > 0 && num_split <= split_dim_size) && status != nullptr) {
-    *status = Status(error::INVALID_ARGUMENT,
-                     strings::StrCat("num_split must be in the interval (0, ",
-                                     split_dim_size, "]"));
-    return {};
+  if (!(num_split > 0 && num_split <= split_dim_size)) {
+    return Status(error::INVALID_ARGUMENT,
+                  strings::StrCat("num_split must be in the interval (0, ",
+                                  split_dim_size, "]"));
   }
   if (!(split_dim >= 0 && split_dim < num_dim)) {
-    *status = Status(
+    return Status(
         error::INVALID_ARGUMENT,
         strings::StrCat("num_dim must be in the interval [0, ", num_dim, ")"));
-    return {};
   }
 
   const int residual = split_dim_size % num_split;
@@ -649,28 +637,18 @@ std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
     }
   }
 
-  std::vector<SparseTensor> output_tensors;
-  output_tensors.reserve(num_split);
+  result->clear();
+  result->reserve(num_split);
   for (int i = 0; i < num_split; ++i) {
     SparseTensor tensor;
     Status create_status =
         Create(output_indices[i], output_values[i], output_shapes[i], &tensor);
-    if (!create_status.ok() && status != nullptr) {
-      *status = create_status;
-      return {};
+    if (!create_status.ok()) {
+      return create_status;
     }
-    output_tensors.push_back(std::move(tensor));
+    result->push_back(std::move(tensor));
   }
-  return output_tensors;
-}
-
-template <typename T>
-Status SparseTensor::Split(const SparseTensor& input_tensor,
-                           const int split_dim, const int num_split,
-                           std::vector<SparseTensor>* result) {
-  Status status;
-  *result = Split<T>(input_tensor, split_dim, num_split, &status);
-  return status;
+  return Status::OK();
 }
 
 template <typename T>
diff --git a/tensorflow/core/util/stats_calculator.cc b/tensorflow/core/util/stats_calculator.cc
index eb077546501327c62aff5c9d68eb5d0ba1c9aa1c..bce650f2456029b578356e572393c0ec08df2441 100644
--- a/tensorflow/core/util/stats_calculator.cc
+++ b/tensorflow/core/util/stats_calculator.cc
@@ -53,7 +53,7 @@ std::string StatsCalculator::HeaderString(const std::string& title) const {
          << " ==============================" << std::endl;
 
   InitField(stream, 24) << "[node type]";
-  InitField(stream, 9) << "[start]";
+  InitField(stream, 17) << "[start]";
   InitField(stream, 9) << "[first]";
   InitField(stream, 9) << "[avg ms]";
   InitField(stream, 8) << "[%]";
@@ -77,7 +77,7 @@ std::string StatsCalculator::ColumnString(const Detail& detail,
 
   std::stringstream stream;
   InitField(stream, 24) << detail.type;
-  InitField(stream, 9) << start_ms;
+  InitField(stream, 17) << start_ms;
   InitField(stream, 9) << first_time_ms;
   InitField(stream, 9) << avg_time_ms;
   InitField(stream, 7) << percentage << "%";
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index ad8a44a518489b3b60738df9902d395666afc96b..55688e580848e42bdd453a270a530a5423fb3aec 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -83,10 +83,17 @@ static Status TF_MUST_USE_RESULT BuildDenseSpec(
   {
     int full_index = 0;
 
-    const auto& strides_flat = sparse.strides_tensor.flat<T>();
+    const T* const strides_flat = sparse.strides_tensor.vec<T>().data();
     dense->begin_valid = sparse.begin_tensor != nullptr;
     dense->end_valid = sparse.end_tensor != nullptr;
 
+    const T* const begin_flat = sparse.begin_tensor != nullptr
+                                    ? sparse.begin_tensor->vec<T>().data()
+                                    : nullptr;
+    const T* const end_flat = sparse.end_tensor != nullptr
+                                  ? sparse.end_tensor->vec<T>().data()
+                                  : nullptr;
+
     for (int i = 0; i < sparse.dims; i++) {
       if ((1 << i) & sparse.ellipsis_mask) {
         // Expand the ellipsis into the appropriate indices
@@ -112,16 +119,14 @@ static Status TF_MUST_USE_RESULT BuildDenseSpec(
         }
 
         // Gather slicing spec into appropriate index
-        if (sparse.begin_tensor != nullptr) {
-          const auto& begin_flat = sparse.begin_tensor->flat<T>();
-          dense->begin[full_index] = internal::SubtleMustCopy<T>(begin_flat(i));
+        if (begin_flat != nullptr) {
+          dense->begin[full_index] = internal::SubtleMustCopy<T>(begin_flat[i]);
         }
-        if (sparse.end_tensor != nullptr) {
-          const auto& end_flat = sparse.end_tensor->flat<T>();
-          dense->end[full_index] = internal::SubtleMustCopy<T>(end_flat(i));
+        if (end_flat != nullptr) {
+          dense->end[full_index] = internal::SubtleMustCopy<T>(end_flat[i]);
         }
         dense->strides[full_index] =
-            internal::SubtleMustCopy<T>(strides_flat(i));
+            internal::SubtleMustCopy<T>(strides_flat[i]);
         if (sparse.begin_mask & (1 << i)) {
           dense->begin_mask |= (1 << full_index);
         }
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index 2dcb57a1f9bd22eeee746debb32e08551ef2d6ec..3709ee5ae30f9a01652c98b5188ca3229109c1a1 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -785,7 +785,7 @@ Status BundleReader::GetBundleEntryProto(StringPiece key,
   TF_RETURN_IF_ERROR(
       ParseEntryProto(iter_->key(), iter_->value(), &entry_copy));
   if (!TensorShape::IsValid(entry_copy.shape())) {
-    return errors::DataLoss("Invaid tensor shape: ", key, " ",
+    return errors::DataLoss("Invalid tensor shape: ", key, " ",
                             ProtoShortDebugString(entry_copy.shape()));
   }
 
@@ -895,7 +895,7 @@ Status BundleReader::ReadCurrent(Tensor* val) {
   BundleEntryProto entry;
   TF_RETURN_IF_ERROR(ParseEntryProto(iter_->key(), iter_->value(), &entry));
   if (!TensorShape::IsValid(entry.shape())) {
-    return errors::DataLoss("Invaid tensor shape: ", iter_->key(), " ",
+    return errors::DataLoss("Invalid tensor shape: ", iter_->key(), " ",
                             ProtoShortDebugString(entry.shape()));
   }
 
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index b0c349dd907b71f1a33854930802e1692b3cfb69..a296fb447e252e62809aeb17d9d00cf35ad15fc9 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -498,7 +498,8 @@ inline TensorShape ShapeFromFormat(TensorFormat format, int64 N,
   dim_sizes[GetTensorBatchDimIndex(dims, format)] = N;
   for (int dim = 0; static_cast<size_t>(dim) < spatial.size(); dim++) {
     auto dim_size = spatial[dim];
-    if (format == FORMAT_NHWC_VECT_W && dim == spatial.size() - 1) {
+    if (format == FORMAT_NHWC_VECT_W &&
+        static_cast<size_t>(dim) == spatial.size() - 1) {
       CHECK_EQ(0, dim_size % 4)
           << "FORMAT_NHWC_VECT_W requires W to be a multiple of 4, but W="
           << dim_size;
diff --git a/tensorflow/core/util/tensor_ops_util.h b/tensorflow/core/util/tensor_ops_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..615f088a9b9e4dfce918473cb5e0ef8c9e551230
--- /dev/null
+++ b/tensorflow/core/util/tensor_ops_util.h
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_UTIL_TENSOR_OPS_UTIL_H_
+#define TENSORFLOW_CORE_UTIL_TENSOR_OPS_UTIL_H_
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device>
+Status ZerosLikeTensor(OpKernelContext* ctx, const Tensor& x, Tensor* out) {
+  AllocatorAttributes attr;
+  if (x.dtype() == DT_VARIANT) {
+    attr.set_on_host(true);
+  }
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(x.dtype(), x.shape(), out, attr));
+
+  switch (out->dtype()) {
+#define DTYPE_CASE(dtype)                                       \
+  case DataTypeToEnum<dtype>::value:                            \
+    /* TODO(skyewm): use SetZeroFunctor like in ZerosLikeOp? */ \
+    out->flat<dtype>().device(ctx->eigen_device<Device>()) =    \
+        out->flat<dtype>().constant(dtype(0));                  \
+    break;
+
+    TF_CALL_POD_TYPES(DTYPE_CASE)
+#undef DTYPE_CASE
+
+    case DT_INVALID: {
+      *out = Tensor(DT_INVALID);
+      break;
+    }
+    case DataTypeToEnum<Variant>::value: {
+      Variant* out_variant = out->scalar<Variant>().data();
+      TF_RETURN_IF_ERROR(
+          UnaryOpVariant<Device>(ctx, ZEROS_LIKE_VARIANT_UNARY_OP,
+                                 x.scalar<Variant>()(), out_variant));
+      break;
+    }
+    default:
+      return errors::InvalidArgument(
+          "Trying to compute zeros_like for unsupported dtype ",
+          DataTypeString(out->dtype()));
+  }
+  return Status::OK();
+}
+
+template <typename Device>
+Status BinaryAddTensors(OpKernelContext* ctx, const Tensor& a, const Tensor& b,
+                        Tensor* out) {
+  if (a.dtype() == DT_INVALID) {
+    *out = b;
+    return Status::OK();
+  }
+  if (b.dtype() == DT_INVALID) {
+    *out = a;
+    return Status::OK();
+  }
+  if (a.dtype() != b.dtype()) {
+    return errors::InvalidArgument(
+        "Trying to add two tensors with incompatible element types. ",
+        "One is ", DataTypeString(a.dtype()), " and the other is ",
+        DataTypeString(b.dtype()));
+  }
+  if (a.shape() != b.shape()) {
+    // TODO(apassos) support broadcasting additions here?
+    return errors::InvalidArgument(
+        "Trying to add two tensors with incompatible element shapes. ",
+        "One is ", a.shape().DebugString(), " and the other is ",
+        b.shape().DebugString());
+  }
+
+  AllocatorAttributes attr;
+  if (a.dtype() == DT_VARIANT) {
+    attr.set_on_host(true);
+  }
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(a.dtype(), a.shape(), out, attr));
+
+  switch (out->dtype()) {
+#define DTYPE_CASE(dtype)                                    \
+  case DataTypeToEnum<dtype>::value:                         \
+    out->flat<dtype>().device(ctx->eigen_device<Device>()) = \
+        a.flat<dtype>() + b.flat<dtype>();                   \
+    break;
+
+    TF_CALL_NUMBER_TYPES(DTYPE_CASE)
+#undef DTYPE_CASE
+
+    case DataTypeToEnum<Variant>::value: {
+      Variant* out_variant = out->scalar<Variant>().data();
+      TF_RETURN_IF_ERROR(BinaryOpVariants<Device>(
+          ctx, ADD_VARIANT_BINARY_OP, a.scalar<Variant>()(),
+          b.scalar<Variant>()(), out_variant));
+      break;
+    }
+    default:
+      return errors::InvalidArgument("Trying to add unsupported dtype ",
+                                     out->dtype());
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_TENSOR_OPS_UTIL_H_
diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD
index cf8054be6a3e89a307a10fdb711a62ac3a46d410..2b39b3683f260b840b36e7f991b0d0c8e19aa18b 100644
--- a/tensorflow/examples/adding_an_op/BUILD
+++ b/tensorflow/examples/adding_an_op/BUILD
@@ -68,6 +68,7 @@ py_test(
     size = "small",
     srcs = ["zero_out_1_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notap"],
     deps = [
         ":zero_out_op_1",
         "//tensorflow:tensorflow_py",
@@ -79,6 +80,7 @@ py_test(
     size = "small",
     srcs = ["zero_out_2_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notap"],
     deps = [
         ":zero_out_grad_2",
         ":zero_out_op_2",
@@ -91,6 +93,7 @@ py_test(
     size = "small",
     srcs = ["zero_out_3_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notap"],
     deps = [
         ":zero_out_op_3",
         "//tensorflow:tensorflow_py",
@@ -116,7 +119,7 @@ py_test(
     size = "small",
     srcs = ["cuda_op_test.py"],
     srcs_version = "PY2AND3",
-    tags = tf_cuda_tests_tags(),
+    tags = tf_cuda_tests_tags() + ["notap"],
     deps = [
         ":cuda_op",
         "//tensorflow:tensorflow_py",
diff --git a/tensorflow/examples/adding_an_op/fact_test.py b/tensorflow/examples/adding_an_op/fact_test.py
index 11163e7ba5c6421554afa0486f4c102d0743e5e2..46beaebe0cc01d2fea29defa6a58573a45ec091b 100644
--- a/tensorflow/examples/adding_an_op/fact_test.py
+++ b/tensorflow/examples/adding_an_op/fact_test.py
@@ -19,10 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 import tensorflow as tf
+from tensorflow.python.framework import test_util
 
 
 class FactTest(tf.test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     with self.cached_session():
       print(tf.user_ops.my_fact().eval())
diff --git a/tensorflow/examples/adding_an_op/zero_out_1_test.py b/tensorflow/examples/adding_an_op/zero_out_1_test.py
index 342d3a020cc325de4991b1f620f4cd2110ed0906..459ac2dc279ef6adfd7bbc1773fa3745c15ca35d 100644
--- a/tensorflow/examples/adding_an_op/zero_out_1_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_1_test.py
@@ -23,10 +23,12 @@ import os.path
 
 import tensorflow as tf
 from tensorflow.examples.adding_an_op import zero_out_op_1
+from tensorflow.python.framework import test_util
 
 
 class ZeroOut1Test(tf.test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     with self.cached_session():
       result = zero_out_op_1.zero_out([5, 4, 3, 2, 1])
diff --git a/tensorflow/examples/adding_an_op/zero_out_2_test.py b/tensorflow/examples/adding_an_op/zero_out_2_test.py
index 45045978176a65fb7aaacd4c8d6f1b209f6e82ac..650fd9546b5501f603306d935ade8d08b86b133a 100644
--- a/tensorflow/examples/adding_an_op/zero_out_2_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_2_test.py
@@ -24,20 +24,24 @@ import tensorflow as tf
 
 from tensorflow.examples.adding_an_op import zero_out_grad_2  # pylint: disable=unused-import
 from tensorflow.examples.adding_an_op import zero_out_op_2
+from tensorflow.python.framework import test_util
 
 
 class ZeroOut2Test(tf.test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     with self.cached_session():
       result = zero_out_op_2.zero_out([5, 4, 3, 2, 1])
       self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])
 
+  @test_util.run_deprecated_v1
   def test_2d(self):
     with self.cached_session():
       result = zero_out_op_2.zero_out([[6, 5, 4], [3, 2, 1]])
       self.assertAllEqual(result.eval(), [[6, 0, 0], [0, 0, 0]])
 
+  @test_util.run_deprecated_v1
   def test_grad(self):
     with self.cached_session():
       shape = (5,)
@@ -46,6 +50,7 @@ class ZeroOut2Test(tf.test.TestCase):
       err = tf.test.compute_gradient_error(x, shape, y, shape)
       self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def test_grad_2d(self):
     with self.cached_session():
       shape = (2, 3)
diff --git a/tensorflow/examples/adding_an_op/zero_out_3_test.py b/tensorflow/examples/adding_an_op/zero_out_3_test.py
index 15d62495aaee769f8aad79b844e3bb9b0a1e0df2..8cbe2b6793a436be7f3b954e64dd85d8ae5c891f 100644
--- a/tensorflow/examples/adding_an_op/zero_out_3_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_3_test.py
@@ -21,31 +21,36 @@ from __future__ import print_function
 
 import tensorflow as tf
 from tensorflow.examples.adding_an_op import zero_out_op_3
+from tensorflow.python.framework import test_util
 
 
 class ZeroOut3Test(tf.test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1])
       self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])
 
+  @test_util.run_deprecated_v1
   def testAttr(self):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=3)
       self.assertAllEqual(result.eval(), [0, 0, 0, 2, 0])
 
+  @test_util.run_deprecated_v1
   def testNegative(self):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=-1)
       with self.assertRaisesOpError("Need preserve_index >= 0, got -1"):
-        result.eval()
+        self.evaluate(result)
 
+  @test_util.run_deprecated_v1
   def testLarge(self):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=17)
       with self.assertRaisesOpError("preserve_index out of range"):
-        result.eval()
+        self.evaluate(result)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/examples/android/jni/object_tracking/keypoint_detector.cc b/tensorflow/examples/android/jni/object_tracking/keypoint_detector.cc
index eb431328a7af52b5d14ce72df7034d8662c7f2bc..fc60d2a5ca97e8ee0b287a6d64ae1b2469c470a0 100644
--- a/tensorflow/examples/android/jni/object_tracking/keypoint_detector.cc
+++ b/tensorflow/examples/android/jni/object_tracking/keypoint_detector.cc
@@ -311,7 +311,7 @@ int KeypointDetector::AddExtraCandidatesForBoxes(
           return num_keypoints_added;
         }
 
-        Keypoint curr_keypoint = keypoints[num_keypoints_added++];
+        Keypoint& curr_keypoint = keypoints[num_keypoints_added++];
         curr_keypoint.pos_ = Point2f(
             box.left_ + box.GetWidth() * (i + 0.5f) / kNumToAddAsCandidates,
             box.top_ + box.GetHeight() * (j + 0.5f) / kNumToAddAsCandidates);
diff --git a/tensorflow/examples/autograph/integration_tests/BUILD b/tensorflow/examples/autograph/integration_tests/BUILD
index 3630b41fc8d071887931f659a97e7dbe0e38d696..2a4a0f75e7a120d554c882025ad2a0e280913a6d 100644
--- a/tensorflow/examples/autograph/integration_tests/BUILD
+++ b/tensorflow/examples/autograph/integration_tests/BUILD
@@ -16,25 +16,12 @@ filegroup(
     visibility = ["//tensorflow:__subpackages__"],
 )
 
-py_test(
-    name = "errors_test",
-    srcs = [
-        "errors_test.py",
-    ],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
 py_test(
     name = "keras_test",
     srcs = [
         "keras_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         "//tensorflow:tensorflow_py",
     ],
@@ -46,7 +33,6 @@ py_test(
         "list_literals_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/examples/autograph/integration_tests/errors_test.py b/tensorflow/examples/autograph/integration_tests/errors_test.py
deleted file mode 100644
index 9c10dad9aa358a4f085dfd30751f0706e107bedf..0000000000000000000000000000000000000000
--- a/tensorflow/examples/autograph/integration_tests/errors_test.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Error traceback rewriting integration tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from tensorflow.python import autograph as ag
-
-
-class ErrorsTest(tf.test.TestCase):
-
-  def test_graph_construction_error_rewriting_call_tree(self):
-
-    def test_fn():
-      return tf.random_normal((2, 3), mean=0.0, dtype=tf.int32)
-
-    def inner_caller():
-      return test_fn()
-
-    def caller():
-      return inner_caller()
-
-    with self.assertRaises(ag.GraphConstructionError) as error:
-      graph = ag.to_graph(caller)
-      graph()
-    expected = error.exception
-    custom_traceback = expected.custom_traceback
-    found_correct_filename = False
-    num_test_fn_names = 0
-    num_inner_caller_names = 0
-    num_caller_names = 0
-    for frame in custom_traceback:
-      filename, _, fn_name, _ = frame
-      self.assertFalse('/tmp/' in filename)
-      found_correct_filename |= __file__ in filename
-      self.assertNotEqual('tf__test_fn', fn_name)
-      num_test_fn_names += int('test_fn' == fn_name)
-      self.assertNotEqual('tf__inner_caller', fn_name)
-      num_inner_caller_names += int('inner_caller' == fn_name)
-      self.assertNotEqual('tf__caller', fn_name)
-      num_caller_names += int('caller' == fn_name)
-    self.assertTrue(found_correct_filename)
-    self.assertEqual(num_test_fn_names, 1)
-    self.assertEqual(num_inner_caller_names, 1)
-    self.assertEqual(num_caller_names, 1)
-
-  def test_graph_construction_error_rewriting_class(self):
-
-    class TestClass(object):
-
-      def test_fn(self):
-        return tf.random_normal((2, 3), mean=0.0, dtype=tf.int32)
-
-      def inner_caller(self):
-        return self.test_fn()
-
-      def caller(self):
-        return self.inner_caller()
-
-    # Note we expect a TypeError here because the traceback will not be
-    # rewritten for classes.
-    with self.assertRaises(TypeError):
-      graph = ag.to_graph(TestClass)
-      graph().caller()
-
-  def test_runtime_error_rewriting(self):
-
-    def g(x, s):
-      while tf.reduce_sum(x) > s:
-        x //= 0
-      return x
-
-    def test_fn(x):
-      return g(x, 10)
-
-    compiled_fn = ag.to_graph(test_fn)
-
-    with self.assertRaises(ag.TfRuntimeError) as error:
-      with self.cached_session() as sess:
-        x = compiled_fn(tf.constant([4, 8]))
-        with ag.improved_errors(compiled_fn):
-          sess.run(x)
-    expected = error.exception
-    custom_traceback = expected.custom_traceback
-    found_correct_filename = False
-    num_test_fn_frames = 0
-    num_g_frames = 0
-    for frame in custom_traceback:
-      filename, _, fn_name, source_code = frame
-      self.assertFalse('/tmp/' in filename)
-      self.assertFalse('control_flow.py' in filename)
-      self.assertFalse('ag__.' in fn_name)
-      found_correct_filename |= __file__ in filename
-      num_test_fn_frames += int('test_fn' == fn_name and
-                                'return g(x, 10)' in source_code)
-      num_g_frames += int('g' == fn_name and 'x //= 0' in source_code)
-    self.assertTrue(found_correct_filename)
-    self.assertEqual(num_test_fn_frames, 1)
-    self.assertEqual(num_g_frames, 1)
-
-  def test_runtime_error_rewriting_nested(self):
-
-    def test_fn(x):
-
-      def g(y):
-        return y**2 // 0
-
-      s = 0
-      for xi in x:
-        s += g(xi)
-      return s
-
-    compiled_fn = ag.to_graph(test_fn)
-
-    # TODO(b/111408261): Nested functions currently do not rewrite correctly,
-    # when they do we should change this test to check for the same traceback
-    # properties as the other tests.  This should throw a runtime error with a
-    # frame with "g" as the function name but because we don't yet add
-    # try/except blocks to inner functions the name is "tf__g".
-    with self.assertRaises(ag.TfRuntimeError) as error:
-      with self.cached_session() as sess:
-        x = compiled_fn(tf.constant([4, 8]))
-        with ag.improved_errors(compiled_fn):
-          sess.run(x)
-    expected = error.exception
-    custom_traceback = expected.custom_traceback
-    num_tf_g_frames = 0
-    for frame in custom_traceback:
-      _, _, fn_name, _ = frame
-      self.assertNotEqual('g', fn_name)
-      num_tf_g_frames += int('tf__g' == fn_name)
-    self.assertEqual(num_tf_g_frames, 1)
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tensorflow/examples/autograph/integration_tests/keras_test.py b/tensorflow/examples/autograph/integration_tests/keras_test.py
index dca7c07b470498394593756a93a69af48c4ece43..3fe33df920d008845bfd1002075fd6b5dc25b31f 100644
--- a/tensorflow/examples/autograph/integration_tests/keras_test.py
+++ b/tensorflow/examples/autograph/integration_tests/keras_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import tensorflow as tf
 
 from tensorflow.python import autograph
+from tensorflow.python.framework import test_util
 
 
 class MinimalKeras(tf.keras.Model):
@@ -84,6 +85,7 @@ class KerasTest(tf.test.TestCase):
     model = ModelWithStaticConditional(True)
     self.assertEqual(model.call(), 25)
 
+  @test_util.run_deprecated_v1
   def test_recursive_true(self):
     with self.assertRaisesRegexp(NotImplementedError,
                                  'Object conversion is not yet supported.'):
@@ -93,10 +95,10 @@ class KerasTest(tf.test.TestCase):
         init = tf.global_variables_initializer()
 
         with tf.Session() as sess:
-          sess.run(init)
+          self.evaluate(init)
           sample_input = tf.random_uniform((1, 10, 10, 1))
           output = model(sample_input)  # pylint: disable=not-callable
-          self.assertEqual(sess.run(output).shape, (1, 3))
+          self.assertEqual(self.evaluate(output).shape, (1, 3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/autograph/integration_tests/list_literals_test.py b/tensorflow/examples/autograph/integration_tests/list_literals_test.py
index 917f5ff9d849d131d18e7e6c748d9c679b1b119e..e85d4abcfc9adfbb4bc6390589b846f7e59f3739 100644
--- a/tensorflow/examples/autograph/integration_tests/list_literals_test.py
+++ b/tensorflow/examples/autograph/integration_tests/list_literals_test.py
@@ -34,7 +34,7 @@ class ListLiteralsTest(tf.test.TestCase):
     result = converted()
 
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(result), [1, 2, 3])
+      self.assertAllEqual(self.evaluate(result), [1, 2, 3])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/benchmark/BUILD b/tensorflow/examples/benchmark/BUILD
deleted file mode 100644
index 98611a9aadf6f456dd4f9fe4f423e3e2ce9722ec..0000000000000000000000000000000000000000
--- a/tensorflow/examples/benchmark/BUILD
+++ /dev/null
@@ -1,25 +0,0 @@
-# Description:
-# Examples of adding a benchmark to TensorFlow.
-
-load(
-    "//tensorflow/tools/test:performance.bzl",
-    "tf_py_logged_benchmark",
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_test(
-    name = "sample_benchmark",
-    srcs = ["sample_benchmark.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
-tf_py_logged_benchmark(
-    name = "sample_logged_benchmark",
-    target = "//tensorflow/examples/benchmark:sample_benchmark",
-)
diff --git a/tensorflow/examples/benchmark/sample_benchmark.py b/tensorflow/examples/benchmark/sample_benchmark.py
deleted file mode 100644
index e98d7a2b5f09c08f8796d982e218081ca248de58..0000000000000000000000000000000000000000
--- a/tensorflow/examples/benchmark/sample_benchmark.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Sample TensorFlow benchmark."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import tensorflow as tf
-
-
-# Define a class that extends from tf.test.Benchmark.
-class SampleBenchmark(tf.test.Benchmark):
-
-  # Note: benchmark method name must start with `benchmark`.
-  def benchmarkSum(self):
-    with tf.Session() as sess:
-      x = tf.constant(10)
-      y = tf.constant(5)
-      result = tf.add(x, y)
-
-      iters = 100
-      start_time = time.time()
-      for _ in range(iters):
-        sess.run(result)
-      total_wall_time = time.time() - start_time
-
-      # Call report_benchmark to report a metric value.
-      self.report_benchmark(
-          name="sum_wall_time",
-          # This value should always be per iteration.
-          wall_time=total_wall_time/iters,
-          iters=iters)
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tensorflow/examples/get_started/regression/custom_regression.py b/tensorflow/examples/get_started/regression/custom_regression.py
index 2e34362c5ced96ac6aec5a9258519bb49ef9157d..7b7cbb78666f0de5e77858b79eda721adc493ecb 100644
--- a/tensorflow/examples/get_started/regression/custom_regression.py
+++ b/tensorflow/examples/get_started/regression/custom_regression.py
@@ -100,12 +100,11 @@ def main(argv):
         # that the examples are well mixed.
         train.shuffle(1000).batch(128)
         # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
+        .repeat())
 
   # Build the validation input_fn.
   def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
+    return test.shuffle(1000).batch(128)
 
   # The first way assigns a unique weight to each category. To do this you must
   # specify the category's vocabulary (values outside this specification will
diff --git a/tensorflow/examples/get_started/regression/dnn_regression.py b/tensorflow/examples/get_started/regression/dnn_regression.py
index 951c93b52e73a8e7f4497e9c4b0e91038de85620..94669a5082b26cac79e2879da43cc8aa6e5e83d0 100644
--- a/tensorflow/examples/get_started/regression/dnn_regression.py
+++ b/tensorflow/examples/get_started/regression/dnn_regression.py
@@ -45,12 +45,11 @@ def main(argv):
         # that the examples are well mixed.
         train.shuffle(1000).batch(128)
         # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
+        .repeat())
 
   # Build the validation input_fn.
   def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
+    return test.shuffle(1000).batch(128)
 
   # The first way assigns a unique weight to each category. To do this you must
   # specify the category's vocabulary (values outside this specification will
diff --git a/tensorflow/examples/get_started/regression/linear_regression.py b/tensorflow/examples/get_started/regression/linear_regression.py
deleted file mode 100644
index 74651e7446cdf0d5f14eee18bb97d082200f7977..0000000000000000000000000000000000000000
--- a/tensorflow/examples/get_started/regression/linear_regression.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Linear regression using the LinearRegressor Estimator."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-import imports85  # pylint: disable=g-bad-import-order
-
-STEPS = 1000
-PRICE_NORM_FACTOR = 1000
-
-
-def main(argv):
-  """Builds, trains, and evaluates the model."""
-  assert len(argv) == 1
-  (train, test) = imports85.dataset()
-
-  # Switch the labels to units of thousands for better convergence.
-  def to_thousands(features, labels):
-    return features, labels / PRICE_NORM_FACTOR
-
-  train = train.map(to_thousands)
-  test = test.map(to_thousands)
-
-  # Build the training input_fn.
-  def input_train():
-    return (
-        # Shuffling with a buffer larger than the data set ensures
-        # that the examples are well mixed.
-        train.shuffle(1000).batch(128)
-        # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
-
-  # Build the validation input_fn.
-  def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
-
-  feature_columns = [
-      # "curb-weight" and "highway-mpg" are numeric columns.
-      tf.feature_column.numeric_column(key="curb-weight"),
-      tf.feature_column.numeric_column(key="highway-mpg"),
-  ]
-
-  # Build the Estimator.
-  model = tf.estimator.LinearRegressor(feature_columns=feature_columns)
-
-  # Train the model.
-  # By default, the Estimators log output every 100 steps.
-  model.train(input_fn=input_train, steps=STEPS)
-
-  # Evaluate how the model performs on data it has not yet seen.
-  eval_result = model.evaluate(input_fn=input_test)
-
-  # The evaluation returns a Python dictionary. The "average_loss" key holds the
-  # Mean Squared Error (MSE).
-  average_loss = eval_result["average_loss"]
-
-  # Convert MSE to Root Mean Square Error (RMSE).
-  print("\n" + 80 * "*")
-  print("\nRMS error for the test set: ${:.0f}"
-        .format(PRICE_NORM_FACTOR * average_loss**0.5))
-
-  # Run the model in prediction mode.
-  input_dict = {
-      "curb-weight": np.array([2000, 3000]),
-      "highway-mpg": np.array([30, 40])
-  }
-  predict_input_fn = tf.estimator.inputs.numpy_input_fn(
-      input_dict, shuffle=False)
-  predict_results = model.predict(input_fn=predict_input_fn)
-
-  # Print the prediction results.
-  print("\nPrediction results:")
-  for i, prediction in enumerate(predict_results):
-    msg = ("Curb weight: {: 4d}lbs, "
-           "Highway: {: 0d}mpg, "
-           "Prediction: ${: 9.2f}")
-    msg = msg.format(input_dict["curb-weight"][i], input_dict["highway-mpg"][i],
-                     PRICE_NORM_FACTOR * prediction["predictions"][0])
-
-    print("    " + msg)
-  print()
-
-
-if __name__ == "__main__":
-  # The Estimator periodically generates "INFO" logs; make these logs visible.
-  tf.logging.set_verbosity(tf.logging.INFO)
-  tf.app.run(main=main)
diff --git a/tensorflow/examples/get_started/regression/linear_regression_categorical.py b/tensorflow/examples/get_started/regression/linear_regression_categorical.py
index e2ad415fbcb161a599cff7d123597e5156d11770..5312272a9592973e757e6cdd5a2305c0c04372a9 100644
--- a/tensorflow/examples/get_started/regression/linear_regression_categorical.py
+++ b/tensorflow/examples/get_started/regression/linear_regression_categorical.py
@@ -45,12 +45,11 @@ def main(argv):
         # that the examples are well mixed.
         train.shuffle(1000).batch(128)
         # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
+        .repeat())
 
   # Build the validation input_fn.
   def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
+    return test.shuffle(1000).batch(128)
 
   # The following code demonstrates two of the ways that `feature_columns` can
   # be used to build a model with categorical inputs.
diff --git a/tensorflow/examples/how_tos/reading_data/BUILD b/tensorflow/examples/how_tos/reading_data/BUILD
index 64a054d3712035252666ca84e676add3d079e52a..e846b291467539e599aca9d68561eb6d016a8075 100644
--- a/tensorflow/examples/how_tos/reading_data/BUILD
+++ b/tensorflow/examples/how_tos/reading_data/BUILD
@@ -28,29 +28,3 @@ py_binary(
         "//tensorflow/examples/tutorials/mnist",
     ],
 )
-
-py_binary(
-    name = "fully_connected_preloaded",
-    srcs = [
-        "fully_connected_preloaded.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/examples/tutorials/mnist",
-        "//tensorflow/examples/tutorials/mnist:input_data",
-    ],
-)
-
-py_binary(
-    name = "fully_connected_preloaded_var",
-    srcs = [
-        "fully_connected_preloaded_var.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/examples/tutorials/mnist",
-        "//tensorflow/examples/tutorials/mnist:input_data",
-    ],
-)
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py
deleted file mode 100644
index 34ff59f0d168ec0b8ea78e79a2ef7c6f0b0467db..0000000000000000000000000000000000000000
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Trains the MNIST network using preloaded data in a constant.
-
-Run using bazel:
-
-bazel run --config opt \
-    <...>/tensorflow/examples/how_tos/reading_data:fully_connected_preloaded
-
-or, if installed via pip:
-
-cd tensorflow/examples/how_tos/reading_data
-python fully_connected_preloaded.py
-
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-import time
-
-import tensorflow as tf
-
-from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.examples.tutorials.mnist import mnist
-
-# Basic model parameters as external flags.
-FLAGS = None
-
-
-def run_training():
-  """Train MNIST for a number of epochs."""
-  # Get the sets of images and labels for training, validation, and
-  # test on MNIST.
-  data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)
-
-  # Tell TensorFlow that the model will be built into the default Graph.
-  with tf.Graph().as_default():
-    with tf.name_scope('input'):
-      # Input data, pin to CPU because rest of pipeline is CPU-only
-      with tf.device('/cpu:0'):
-        input_images = tf.constant(data_sets.train.images)
-        input_labels = tf.constant(data_sets.train.labels)
-
-      image, label = tf.train.slice_input_producer(
-          [input_images, input_labels], num_epochs=FLAGS.num_epochs)
-      label = tf.cast(label, tf.int32)
-      images, labels = tf.train.batch(
-          [image, label], batch_size=FLAGS.batch_size)
-
-    # Build a Graph that computes predictions from the inference model.
-    logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)
-
-    # Add to the Graph the Ops for loss calculation.
-    loss = mnist.loss(logits, labels)
-
-    # Add to the Graph the Ops that calculate and apply gradients.
-    train_op = mnist.training(loss, FLAGS.learning_rate)
-
-    # Add the Op to compare the logits to the labels during evaluation.
-    eval_correct = mnist.evaluation(logits, labels)
-
-    # Build the summary operation based on the TF collection of Summaries.
-    summary_op = tf.summary.merge_all()
-
-    # Create a saver for writing training checkpoints.
-    saver = tf.train.Saver()
-
-    # Create the op for initializing variables.
-    init_op = tf.group(tf.global_variables_initializer(),
-                       tf.local_variables_initializer())
-    # Create a session for running Ops on the Graph.
-    sess = tf.Session()
-
-    # Run the Op to initialize the variables.
-    sess.run(init_op)
-
-    # Instantiate a SummaryWriter to output summaries and the Graph.
-    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
-
-    # Start input enqueue threads.
-    coord = tf.train.Coordinator()
-    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
-
-    # And then after everything is built, start the training loop.
-    try:
-      step = 0
-      while not coord.should_stop():
-        start_time = time.time()
-
-        # Run one step of the model.
-        _, loss_value = sess.run([train_op, loss])
-
-        duration = time.time() - start_time
-
-        # Write the summaries and print an overview fairly often.
-        if step % 100 == 0:
-          # Print status to stdout.
-          print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
-                                                     duration))
-          # Update the events file.
-          summary_str = sess.run(summary_op)
-          summary_writer.add_summary(summary_str, step)
-          step += 1
-
-        # Save a checkpoint periodically.
-        if (step + 1) % 1000 == 0:
-          print('Saving')
-          saver.save(sess, FLAGS.train_dir, global_step=step)
-
-        step += 1
-    except tf.errors.OutOfRangeError:
-      print('Saving')
-      saver.save(sess, FLAGS.train_dir, global_step=step)
-      print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
-    finally:
-      # When done, ask the threads to stop.
-      coord.request_stop()
-
-    # Wait for threads to finish.
-    coord.join(threads)
-    sess.close()
-
-
-def main(_):
-  run_training()
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--learning_rate',
-      type=float,
-      default=0.01,
-      help='Initial learning rate.'
-  )
-  parser.add_argument(
-      '--num_epochs',
-      type=int,
-      default=2,
-      help='Number of epochs to run trainer.'
-  )
-  parser.add_argument(
-      '--hidden1',
-      type=int,
-      default=128,
-      help='Number of units in hidden layer 1.'
-  )
-  parser.add_argument(
-      '--hidden2',
-      type=int,
-      default=32,
-      help='Number of units in hidden layer 2.'
-  )
-  parser.add_argument(
-      '--batch_size',
-      type=int,
-      default=100,
-      help='Batch size.  Must divide evenly into the dataset sizes.'
-  )
-  parser.add_argument(
-      '--train_dir',
-      type=str,
-      default='/tmp/data',
-      help='Directory to put the training data.'
-  )
-  parser.add_argument(
-      '--fake_data',
-      default=False,
-      help='If true, uses fake data for unit testing.',
-      action='store_true'
-  )
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py b/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py
deleted file mode 100644
index e29387ab9d01bcd6615cc8204cdb333ac85e10e9..0000000000000000000000000000000000000000
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_preloaded_var.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Trains the MNIST network using preloaded data stored in a variable.
-
-Run using bazel:
-
-bazel run --config opt \
-    <...>/tensorflow/examples/how_tos/reading_data:fully_connected_preloaded_var
-
-or, if installed via pip:
-
-cd tensorflow/examples/how_tos/reading_data
-python fully_connected_preloaded_var.py
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-import time
-
-import tensorflow as tf
-
-from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.examples.tutorials.mnist import mnist
-
-# Basic model parameters as external flags.
-FLAGS = None
-
-
-def run_training():
-  """Train MNIST for a number of epochs."""
-  # Get the sets of images and labels for training, validation, and
-  # test on MNIST.
-  data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)
-
-  # Tell TensorFlow that the model will be built into the default Graph.
-  with tf.Graph().as_default():
-    with tf.name_scope('input'):
-      # Input data
-      images_initializer = tf.placeholder(
-          dtype=data_sets.train.images.dtype,
-          shape=data_sets.train.images.shape)
-      labels_initializer = tf.placeholder(
-          dtype=data_sets.train.labels.dtype,
-          shape=data_sets.train.labels.shape)
-      input_images = tf.Variable(
-          images_initializer, trainable=False, collections=[])
-      input_labels = tf.Variable(
-          labels_initializer, trainable=False, collections=[])
-
-      image, label = tf.train.slice_input_producer(
-          [input_images, input_labels], num_epochs=FLAGS.num_epochs)
-      label = tf.cast(label, tf.int32)
-      images, labels = tf.train.batch(
-          [image, label], batch_size=FLAGS.batch_size)
-
-    # Build a Graph that computes predictions from the inference model.
-    logits = mnist.inference(images, FLAGS.hidden1, FLAGS.hidden2)
-
-    # Add to the Graph the Ops for loss calculation.
-    loss = mnist.loss(logits, labels)
-
-    # Add to the Graph the Ops that calculate and apply gradients.
-    train_op = mnist.training(loss, FLAGS.learning_rate)
-
-    # Add the Op to compare the logits to the labels during evaluation.
-    eval_correct = mnist.evaluation(logits, labels)
-
-    # Build the summary operation based on the TF collection of Summaries.
-    summary_op = tf.summary.merge_all()
-
-    # Create a saver for writing training checkpoints.
-    saver = tf.train.Saver()
-
-    # Create the op for initializing variables.
-    init_op = tf.group(tf.global_variables_initializer(),
-                       tf.local_variables_initializer())
-
-    # Create a session for running Ops on the Graph.
-    sess = tf.Session()
-
-    # Run the Op to initialize the variables.
-    sess.run(init_op)
-    sess.run(input_images.initializer,
-             feed_dict={images_initializer: data_sets.train.images})
-    sess.run(input_labels.initializer,
-             feed_dict={labels_initializer: data_sets.train.labels})
-
-    # Instantiate a SummaryWriter to output summaries and the Graph.
-    summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
-
-    # Start input enqueue threads.
-    coord = tf.train.Coordinator()
-    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
-
-    # And then after everything is built, start the training loop.
-    try:
-      step = 0
-      while not coord.should_stop():
-        start_time = time.time()
-
-        # Run one step of the model.
-        _, loss_value = sess.run([train_op, loss])
-
-        duration = time.time() - start_time
-
-        # Write the summaries and print an overview fairly often.
-        if step % 100 == 0:
-          # Print status to stdout.
-          print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
-                                                     duration))
-          # Update the events file.
-          summary_str = sess.run(summary_op)
-          summary_writer.add_summary(summary_str, step)
-          step += 1
-
-        # Save a checkpoint periodically.
-        if (step + 1) % 1000 == 0:
-          print('Saving')
-          saver.save(sess, FLAGS.train_dir, global_step=step)
-
-        step += 1
-    except tf.errors.OutOfRangeError:
-      print('Saving')
-      saver.save(sess, FLAGS.train_dir, global_step=step)
-      print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
-    finally:
-      # When done, ask the threads to stop.
-      coord.request_stop()
-
-    # Wait for threads to finish.
-    coord.join(threads)
-    sess.close()
-
-
-def main(_):
-  run_training()
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--learning_rate',
-      type=float,
-      default=0.01,
-      help='Initial learning rate.'
-  )
-  parser.add_argument(
-      '--num_epochs',
-      type=int,
-      default=2,
-      help='Number of epochs to run trainer.'
-  )
-  parser.add_argument(
-      '--hidden1',
-      type=int,
-      default=128,
-      help='Number of units in hidden layer 1.'
-  )
-  parser.add_argument(
-      '--hidden2',
-      type=int,
-      default=32,
-      help='Number of units in hidden layer 2.'
-  )
-  parser.add_argument(
-      '--batch_size',
-      type=int,
-      default=100,
-      help='Batch size.  Must divide evenly into the dataset sizes.'
-  )
-  parser.add_argument(
-      '--train_dir',
-      type=str,
-      default='/tmp/data',
-      help='Directory to put the training data.'
-  )
-  parser.add_argument(
-      '--fake_data',
-      default=False,
-      help='If true, uses fake data for unit testing.',
-      action='store_true'
-  )
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index 740224744860fdd76bea9c4531242a4976b20784..5c52a2c8461660e19ef6e98c01a6a58a3f3c0920 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -126,7 +126,7 @@ def inputs(train, batch_size, num_epochs):
     dataset = dataset.repeat(num_epochs)
     dataset = dataset.batch(batch_size)
 
-    iterator = dataset.make_one_shot_iterator()
+    iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
   return iterator.get_next()
 
 
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index bdbcb0b1638a400f12f66bb3c4ee9d852fe145d2..d6ec1f393bab82a45f0c1032670b5abed42bf6d3 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -9,30 +9,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-py_binary(
-    name = "boston",
-    srcs = ["boston.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
-py_binary(
-    name = "hdf5_classification",
-    srcs = ["hdf5_classification.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_binary(
-    name = "iris",
-    srcs = ["iris.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
 py_binary(
     name = "iris_custom_decay_dnn",
     srcs = ["iris_custom_decay_dnn.py"],
@@ -47,89 +23,6 @@ py_binary(
     deps = ["//tensorflow:tensorflow_py"],
 )
 
-py_binary(
-    name = "iris_run_config",
-    srcs = ["iris_run_config.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
-py_binary(
-    name = "random_forest_mnist",
-    srcs = ["random_forest_mnist.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/examples/tutorials/mnist:input_data",
-        "//tensorflow/python:platform",
-    ],
-)
-
-py_binary(
-    name = "resnet",
-    srcs = ["resnet.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
-py_binary(
-    name = "text_classification",
-    srcs = ["text_classification.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_binary(
-    name = "text_classification_character_cnn",
-    srcs = ["text_classification_character_cnn.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_binary(
-    name = "text_classification_character_rnn",
-    srcs = ["text_classification_character_rnn.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_binary(
-    name = "text_classification_cnn",
-    srcs = ["text_classification_cnn.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_binary(
-    name = "mnist",
-    srcs = ["mnist.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_binary(
-    name = "multiple_gpu",
-    srcs = ["multiple_gpu.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
 sh_test(
     name = "examples_test",
     size = "large",
diff --git a/tensorflow/examples/learn/README.md b/tensorflow/examples/learn/README.md
index b74a8f39d98123d3e7ca6d5bbeb0a4b806097670..07f9e051374ef930f8975195aa3731f183ebb6d7 100644
--- a/tensorflow/examples/learn/README.md
+++ b/tensorflow/examples/learn/README.md
@@ -6,29 +6,11 @@ create, train, and use deep learning models easily.
 See the [Quickstart tutorial](https://www.tensorflow.org/get_started/estimator)
 for an introduction to the API.
 
-To run most of these examples, you need to install the `scikit learn` library
-(`pip install -U scikit-learn`). Some examples use the `pandas` library for data
-processing (`pip install -U pandas`).
-
 ## Basics
 
-* [Deep Neural Network Regression with Boston Data](https://www.tensorflow.org/code/tensorflow/examples/learn/boston.py)
-* [Deep Neural Network Classification with Iris Data](https://www.tensorflow.org/code/tensorflow/examples/learn/iris.py)
 * [Building a Custom Model](https://www.tensorflow.org/code/tensorflow/examples/learn/iris_custom_model.py)
-* [Building a Model Using Different GPU Configurations](https://www.tensorflow.org/code/tensorflow/examples/learn/iris_run_config.py)
 
 ## Techniques
 
 * [Deep Neural Network with Customized Decay Function](https://www.tensorflow.org/code/tensorflow/examples/learn/iris_custom_decay_dnn.py)
 
-## Specialized Models
-* [Building a Random Forest Model](https://www.tensorflow.org/code/tensorflow/examples/learn/random_forest_mnist.py)
-* [Building a Wide & Deep Model](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py)
-* [Building a Residual Network Model](https://www.tensorflow.org/code/tensorflow/examples/learn/resnet.py)
-
-## Text classification
-
-* [Text Classification Using Recurrent Neural Networks on Words](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification.py)
-* [Text Classification Using Convolutional Neural Networks on Words](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_cnn.py)
-* [Text Classification Using Recurrent Neural Networks on Characters](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_character_rnn.py)
-* [Text Classification Using Convolutional Neural Networks on Characters](https://www.tensorflow.org/code/tensorflow/examples/learn/text_classification_character_cnn.py)
diff --git a/tensorflow/examples/learn/boston.py b/tensorflow/examples/learn/boston.py
deleted file mode 100644
index c9ce508dfdb05569f4f212137032a7dd16e86a55..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/boston.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of DNNRegressor for Housing dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from sklearn import datasets
-from sklearn import metrics
-from sklearn import model_selection
-from sklearn import preprocessing
-
-import tensorflow as tf
-
-
-def main(unused_argv):
-  # Load dataset
-  boston = datasets.load_boston()
-  x, y = boston.data, boston.target
-
-  # Split dataset into train / test
-  x_train, x_test, y_train, y_test = model_selection.train_test_split(
-      x, y, test_size=0.2, random_state=42)
-
-  # Scale data (training set) to 0 mean and unit standard deviation.
-  scaler = preprocessing.StandardScaler()
-  x_train = scaler.fit_transform(x_train)
-
-  # Build 2 layer fully connected DNN with 10, 10 units respectively.
-  feature_columns = [
-      tf.feature_column.numeric_column('x', shape=np.array(x_train).shape[1:])]
-  regressor = tf.estimator.DNNRegressor(
-      feature_columns=feature_columns, hidden_units=[10, 10])
-
-  # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={'x': x_train}, y=y_train, batch_size=1, num_epochs=None, shuffle=True)
-  regressor.train(input_fn=train_input_fn, steps=2000)
-
-  # Predict.
-  x_transformed = scaler.transform(x_test)
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={'x': x_transformed}, y=y_test, num_epochs=1, shuffle=False)
-  predictions = regressor.predict(input_fn=test_input_fn)
-  y_predicted = np.array(list(p['predictions'] for p in predictions))
-  y_predicted = y_predicted.reshape(np.array(y_test).shape)
-
-  # Score with sklearn.
-  score_sklearn = metrics.mean_squared_error(y_predicted, y_test)
-  print('MSE (sklearn): {0:f}'.format(score_sklearn))
-
-  # Score with tensorflow.
-  scores = regressor.evaluate(input_fn=test_input_fn)
-  print('MSE (tensorflow): {0:f}'.format(scores['average_loss']))
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/examples/learn/examples_test.sh b/tensorflow/examples/learn/examples_test.sh
index ef5e8a5de25068a74b1f3ea9c3b2ce87aa470f89..e26848b0074d4ed90b5007c4c3e9361fc21dd11a 100755
--- a/tensorflow/examples/learn/examples_test.sh
+++ b/tensorflow/examples/learn/examples_test.sh
@@ -44,15 +44,5 @@ function test() {
   fi
 }
 
-test boston
-test iris
 test iris_custom_decay_dnn
 test iris_custom_model
-test iris_run_config
-test random_forest_mnist
-test resnet
-test text_classification --test_with_fake_data
-test text_classification_builtin_rnn_model --test_with_fake_data
-test text_classification_character_cnn --test_with_fake_data
-test text_classification_character_rnn --test_with_fake_data
-test text_classification_cnn --test_with_fake_data
diff --git a/tensorflow/examples/learn/hdf5_classification.py b/tensorflow/examples/learn/hdf5_classification.py
deleted file mode 100644
index 3a46bbcf41c68187f493ac18bb7d4725ad91dbfc..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/hdf5_classification.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of DNNClassifier for Iris plant dataset, hdf5 format."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from sklearn import datasets
-from sklearn import metrics
-from sklearn import model_selection
-import tensorflow as tf
-import h5py  # pylint: disable=g-bad-import-order
-
-
-X_FEATURE = 'x'  # Name of the input feature.
-
-
-def main(unused_argv):
-  # Load dataset.
-  iris = datasets.load_iris()
-  x_train, x_test, y_train, y_test = model_selection.train_test_split(
-      iris.data, iris.target, test_size=0.2, random_state=42)
-
-  # Note that we are saving and load iris data as h5 format as a simple
-  # demonstration here.
-  h5f = h5py.File('/tmp/test_hdf5.h5', 'w')
-  h5f.create_dataset('X_train', data=x_train)
-  h5f.create_dataset('X_test', data=x_test)
-  h5f.create_dataset('y_train', data=y_train)
-  h5f.create_dataset('y_test', data=y_test)
-  h5f.close()
-
-  h5f = h5py.File('/tmp/test_hdf5.h5', 'r')
-  x_train = np.array(h5f['X_train'])
-  x_test = np.array(h5f['X_test'])
-  y_train = np.array(h5f['y_train'])
-  y_test = np.array(h5f['y_test'])
-
-  # Build 3 layer DNN with 10, 20, 10 units respectively.
-  feature_columns = [
-      tf.feature_column.numeric_column(
-          X_FEATURE, shape=np.array(x_train).shape[1:])]
-  classifier = tf.estimator.DNNClassifier(
-      feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)
-
-  # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
-  classifier.train(input_fn=train_input_fn, steps=200)
-
-  # Predict.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
-  predictions = classifier.predict(input_fn=test_input_fn)
-  y_predicted = np.array(list(p['class_ids'] for p in predictions))
-  y_predicted = y_predicted.reshape(np.array(y_test).shape)
-
-  # Score with sklearn.
-  score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy (sklearn): {0:f}'.format(score))
-
-  # Score with tensorflow.
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py
deleted file mode 100644
index 86f5204ec3e8713d5d22156419b6414acb2fa677..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/iris.py
+++ /dev/null
@@ -1,115 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of DNNClassifier for Iris plant dataset.
-
-This example uses APIs in Tensorflow 1.4 or above.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from six.moves.urllib.request import urlretrieve
-
-import tensorflow as tf
-
-# Data sets
-IRIS_TRAINING = 'iris_training.csv'
-IRIS_TRAINING_URL = 'http://download.tensorflow.org/data/iris_training.csv'
-
-IRIS_TEST = 'iris_test.csv'
-IRIS_TEST_URL = 'http://download.tensorflow.org/data/iris_test.csv'
-
-FEATURE_KEYS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
-
-
-def maybe_download_iris_data(file_name, download_url):
-  """Downloads the file and returns the number of data."""
-  if not os.path.exists(file_name):
-    urlretrieve(download_url, file_name)
-
-  # The first line is a comma-separated string. The first one is the number of
-  # total data in the file.
-  with open(file_name, 'r') as f:
-    first_line = f.readline()
-  num_elements = first_line.split(',')[0]
-  return int(num_elements)
-
-
-def input_fn(file_name, num_data, batch_size, is_training):
-  """Creates an input_fn required by Estimator train/evaluate."""
-  # If the data sets aren't stored locally, download them.
-
-  def _parse_csv(rows_string_tensor):
-    """Takes the string input tensor and returns tuple of (features, labels)."""
-    # Last dim is the label.
-    num_features = len(FEATURE_KEYS)
-    num_columns = num_features + 1
-    columns = tf.decode_csv(rows_string_tensor,
-                            record_defaults=[[]] * num_columns)
-    features = dict(zip(FEATURE_KEYS, columns[:num_features]))
-    labels = tf.cast(columns[num_features], tf.int32)
-    return features, labels
-
-  def _input_fn():
-    """The input_fn."""
-    dataset = tf.data.TextLineDataset([file_name])
-    # Skip the first line (which does not have data).
-    dataset = dataset.skip(1)
-    dataset = dataset.map(_parse_csv)
-
-    if is_training:
-      # For this small dataset, which can fit into memory, to achieve true
-      # randomness, the shuffle buffer size is set as the total number of
-      # elements in the dataset.
-      dataset = dataset.shuffle(num_data)
-      dataset = dataset.repeat()
-
-    dataset = dataset.batch(batch_size)
-    iterator = dataset.make_one_shot_iterator()
-    features, labels = iterator.get_next()
-    return features, labels
-
-  return _input_fn
-
-
-def main(unused_argv):
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  num_training_data = maybe_download_iris_data(
-      IRIS_TRAINING, IRIS_TRAINING_URL)
-  num_test_data = maybe_download_iris_data(IRIS_TEST, IRIS_TEST_URL)
-
-  # Build 3 layer DNN with 10, 20, 10 units respectively.
-  feature_columns = [
-      tf.feature_column.numeric_column(key, shape=1) for key in FEATURE_KEYS]
-  classifier = tf.estimator.DNNClassifier(
-      feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3)
-
-  # Train.
-  train_input_fn = input_fn(IRIS_TRAINING, num_training_data, batch_size=32,
-                            is_training=True)
-  classifier.train(input_fn=train_input_fn, steps=400)
-
-  # Eval.
-  test_input_fn = input_fn(IRIS_TEST, num_test_data, batch_size=32,
-                           is_training=False)
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/examples/learn/iris_custom_decay_dnn.py b/tensorflow/examples/learn/iris_custom_decay_dnn.py
index 4a219694d10ef075e0e0403cdd7ed100c39ddadd..73bf20fada488a818471f47c6f0b5f0d6073ce25 100644
--- a/tensorflow/examples/learn/iris_custom_decay_dnn.py
+++ b/tensorflow/examples/learn/iris_custom_decay_dnn.py
@@ -76,12 +76,12 @@ def main(unused_argv):
   classifier = tf.estimator.Estimator(model_fn=my_model)
 
   # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
   classifier.train(input_fn=train_input_fn, steps=1000)
 
   # Predict.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+  test_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
   predictions = classifier.predict(input_fn=test_input_fn)
   y_predicted = np.array(list(p['class'] for p in predictions))
diff --git a/tensorflow/examples/learn/iris_custom_model.py b/tensorflow/examples/learn/iris_custom_model.py
index c6bdb86ba52b9715b977909d9b7d0fbc59161a53..bf34d72ba07860569183f3eec49fa29f1d577cbf 100644
--- a/tensorflow/examples/learn/iris_custom_model.py
+++ b/tensorflow/examples/learn/iris_custom_model.py
@@ -73,12 +73,12 @@ def main(unused_argv):
   classifier = tf.estimator.Estimator(model_fn=my_model)
 
   # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
   classifier.train(input_fn=train_input_fn, steps=1000)
 
   # Predict.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+  test_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
   predictions = classifier.predict(input_fn=test_input_fn)
   y_predicted = np.array(list(p['class'] for p in predictions))
diff --git a/tensorflow/examples/learn/iris_run_config.py b/tensorflow/examples/learn/iris_run_config.py
deleted file mode 100644
index 286c824e30f7f85be9751a852d79c60150100d9a..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/iris_run_config.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-"""Example of DNNClassifier for Iris plant dataset, with run config."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from sklearn import datasets
-from sklearn import metrics
-from sklearn import model_selection
-import tensorflow as tf
-
-
-X_FEATURE = 'x'  # Name of the input feature.
-
-
-def main(unused_argv):
-  # Load dataset.
-  iris = datasets.load_iris()
-  x_train, x_test, y_train, y_test = model_selection.train_test_split(
-      iris.data, iris.target, test_size=0.2, random_state=42)
-
-  # You can define you configurations by providing a RunConfig object to
-  # estimator to control session configurations, e.g. tf_random_seed.
-  run_config = tf.estimator.RunConfig().replace(tf_random_seed=1)
-
-  # Build 3 layer DNN with 10, 20, 10 units respectively.
-  feature_columns = [
-      tf.feature_column.numeric_column(
-          X_FEATURE, shape=np.array(x_train).shape[1:])]
-  classifier = tf.estimator.DNNClassifier(
-      feature_columns=feature_columns, hidden_units=[10, 20, 10], n_classes=3,
-      config=run_config)
-
-  # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
-  classifier.train(input_fn=train_input_fn, steps=200)
-
-  # Predict.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
-  predictions = classifier.predict(input_fn=test_input_fn)
-  y_predicted = np.array(list(p['class_ids'] for p in predictions))
-  y_predicted = y_predicted.reshape(np.array(y_test).shape)
-
-  # Score with sklearn.
-  score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy (sklearn): {0:f}'.format(score))
-
-  # Score with tensorflow.
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py
deleted file mode 100644
index 3ead8614b68959b95ccad43623d4df4a5c4665bd..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/mnist.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""This showcases how simple it is to build image classification networks.
-
-It follows description from this TensorFlow tutorial:
-    https://www.tensorflow.org/versions/master/tutorials/mnist/pros/index.html#deep-mnist-for-experts
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-
-N_DIGITS = 10  # Number of digits.
-X_FEATURE = 'x'  # Name of the input feature.
-
-
-def conv_model(features, labels, mode):
-  """2-layer convolution model."""
-  # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
-  # image width and height final dimension being the number of color channels.
-  feature = tf.reshape(features[X_FEATURE], [-1, 28, 28, 1])
-
-  # First conv layer will compute 32 features for each 5x5 patch
-  with tf.variable_scope('conv_layer1'):
-    h_conv1 = tf.layers.conv2d(
-        feature,
-        filters=32,
-        kernel_size=[5, 5],
-        padding='same',
-        activation=tf.nn.relu)
-    h_pool1 = tf.layers.max_pooling2d(
-        h_conv1, pool_size=2, strides=2, padding='same')
-
-  # Second conv layer will compute 64 features for each 5x5 patch.
-  with tf.variable_scope('conv_layer2'):
-    h_conv2 = tf.layers.conv2d(
-        h_pool1,
-        filters=64,
-        kernel_size=[5, 5],
-        padding='same',
-        activation=tf.nn.relu)
-    h_pool2 = tf.layers.max_pooling2d(
-        h_conv2, pool_size=2, strides=2, padding='same')
-    # reshape tensor into a batch of vectors
-    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
-
-  # Densely connected layer with 1024 neurons.
-  h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)
-  h_fc1 = tf.layers.dropout(
-      h_fc1, 
-      rate=0.5, 
-      training=(mode == tf.estimator.ModeKeys.TRAIN))
-
-  # Compute logits (1 per class) and compute loss.
-  logits = tf.layers.dense(h_fc1, N_DIGITS, activation=None)
-
-  # Compute predictions.
-  predicted_classes = tf.argmax(logits, 1)
-  if mode == tf.estimator.ModeKeys.PREDICT:
-    predictions = {
-        'class': predicted_classes,
-        'prob': tf.nn.softmax(logits)
-    }
-    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
-
-  # Compute loss.
-  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-
-  # Create training op.
-  if mode == tf.estimator.ModeKeys.TRAIN:
-    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
-    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
-
-  # Compute evaluation metrics.
-  eval_metric_ops = {
-      'accuracy': tf.metrics.accuracy(
-          labels=labels, predictions=predicted_classes)
-  }
-  return tf.estimator.EstimatorSpec(
-      mode, loss=loss, eval_metric_ops=eval_metric_ops)
-
-
-def main(unused_args):
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  ### Download and load MNIST dataset.
-  mnist = tf.contrib.learn.datasets.DATASETS['mnist']('/tmp/mnist')
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: mnist.train.images},
-      y=mnist.train.labels.astype(np.int32),
-      batch_size=100,
-      num_epochs=None,
-      shuffle=True)
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: mnist.train.images},
-      y=mnist.train.labels.astype(np.int32),
-      num_epochs=1,
-      shuffle=False)
-
-  ### Linear classifier.
-  feature_columns = [
-      tf.feature_column.numeric_column(
-          X_FEATURE, shape=mnist.train.images.shape[1:])]
-
-  classifier = tf.estimator.LinearClassifier(
-      feature_columns=feature_columns, n_classes=N_DIGITS)
-  classifier.train(input_fn=train_input_fn, steps=200)
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy (LinearClassifier): {0:f}'.format(scores['accuracy']))
-
-  ### Convolutional network
-  classifier = tf.estimator.Estimator(model_fn=conv_model)
-  classifier.train(input_fn=train_input_fn, steps=200)
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy (conv_model): {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/examples/learn/multiple_gpu.py b/tensorflow/examples/learn/multiple_gpu.py
deleted file mode 100644
index 3bad22ddf66b7981930637d64cc8653e3fb29cdf..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/multiple_gpu.py
+++ /dev/null
@@ -1,116 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of using Estimator with multiple GPUs to distribute one model.
-
-This example only runs if you have multiple GPUs to assign to.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from sklearn import datasets
-from sklearn import metrics
-from sklearn import model_selection
-import tensorflow as tf
-
-
-X_FEATURE = 'x'  # Name of the input feature.
-
-
-def my_model(features, labels, mode):
-  """DNN with three hidden layers, and dropout of 0.1 probability.
-
-  Note: If you want to run this example with multiple GPUs, Cuda Toolkit 7.0 and
-  CUDNN 6.5 V2 from NVIDIA need to be installed beforehand.
-
-  Args:
-    features: Dict of input `Tensor`.
-    labels: Label `Tensor`.
-    mode: One of `ModeKeys`.
-
-  Returns:
-    `EstimatorSpec`.
-  """
-  # Create three fully connected layers respectively of size 10, 20, and 10 with
-  # each layer having a dropout probability of 0.1.
-  net = features[X_FEATURE]
-  with tf.device('/device:GPU:1'):
-    for units in [10, 20, 10]:
-      net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
-      net = tf.layers.dropout(net, rate=0.1)
-
-  with tf.device('/device:GPU:2'):
-    # Compute logits (1 per class).
-    logits = tf.layers.dense(net, 3, activation=None)
-
-    # Compute predictions.
-    predicted_classes = tf.argmax(logits, 1)
-    if mode == tf.estimator.ModeKeys.PREDICT:
-      predictions = {
-          'class': predicted_classes,
-          'prob': tf.nn.softmax(logits)
-      }
-      return tf.estimator.EstimatorSpec(mode, predictions=predictions)
-
-    # Compute loss.
-    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-
-    # Create training op.
-    if mode == tf.estimator.ModeKeys.TRAIN:
-      optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
-      train_op = optimizer.minimize(
-          loss, global_step=tf.train.get_global_step())
-      return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
-
-    # Compute evaluation metrics.
-    eval_metric_ops = {
-        'accuracy': tf.metrics.accuracy(
-            labels=labels, predictions=predicted_classes)
-    }
-    return tf.estimator.EstimatorSpec(
-        mode, loss=loss, eval_metric_ops=eval_metric_ops)
-
-
-def main(unused_argv):
-  iris = datasets.load_iris()
-  x_train, x_test, y_train, y_test = model_selection.train_test_split(
-      iris.data, iris.target, test_size=0.2, random_state=42)
-
-  classifier = tf.estimator.Estimator(model_fn=my_model)
-
-  # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
-  classifier.train(input_fn=train_input_fn, steps=100)
-
-  # Predict.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
-  predictions = classifier.predict(input_fn=test_input_fn)
-  y_predicted = np.array(list(p['class'] for p in predictions))
-  y_predicted = y_predicted.reshape(np.array(y_test).shape)
-
-  # Score with sklearn.
-  score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy (sklearn): {0:f}'.format(score))
-
-  # Score with tensorflow.
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/examples/learn/random_forest_mnist.py b/tensorflow/examples/learn/random_forest_mnist.py
deleted file mode 100644
index 72c935cdae2196a1309097e4e6f15bd6f22f96a5..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/random_forest_mnist.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""A stand-alone example for tf.learn's random forest model on mnist."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-import tempfile
-
-import numpy
-
-from tensorflow.contrib.learn.python.learn import metric_spec
-from tensorflow.contrib.tensor_forest.client import eval_metrics
-from tensorflow.contrib.tensor_forest.client import random_forest
-from tensorflow.contrib.tensor_forest.python import tensor_forest
-from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.platform import app
-
-FLAGS = None
-
-
-def build_estimator(model_dir):
-  """Build an estimator."""
-  params = tensor_forest.ForestHParams(
-      num_classes=10,
-      num_features=784,
-      num_trees=FLAGS.num_trees,
-      max_nodes=FLAGS.max_nodes)
-  graph_builder_class = tensor_forest.RandomForestGraphs
-  if FLAGS.use_training_loss:
-    graph_builder_class = tensor_forest.TrainingLossForest
-  return random_forest.TensorForestEstimator(
-      params, graph_builder_class=graph_builder_class, model_dir=model_dir)
-
-
-def train_and_eval():
-  """Train and evaluate the model."""
-  model_dir = tempfile.mkdtemp() if not FLAGS.model_dir else FLAGS.model_dir
-  print('model directory = %s' % model_dir)
-
-  est = build_estimator(model_dir)
-
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=False)
-
-  train_input_fn = numpy_io.numpy_input_fn(
-      x={'images': mnist.train.images},
-      y=mnist.train.labels.astype(numpy.int32),
-      batch_size=FLAGS.batch_size,
-      num_epochs=None,
-      shuffle=True)
-  est.fit(input_fn=train_input_fn, steps=None)
-
-  metric_name = 'accuracy'
-  metric = {
-      metric_name:
-          metric_spec.MetricSpec(
-              eval_metrics.get_metric(metric_name),
-              prediction_key=eval_metrics.get_prediction_key(metric_name))
-  }
-
-  test_input_fn = numpy_io.numpy_input_fn(
-      x={'images': mnist.test.images},
-      y=mnist.test.labels.astype(numpy.int32),
-      num_epochs=1,
-      batch_size=FLAGS.batch_size,
-      shuffle=False)
-
-  results = est.evaluate(input_fn=test_input_fn, metrics=metric)
-  for key in sorted(results):
-    print('%s: %s' % (key, results[key]))
-
-
-def main(_):
-  train_and_eval()
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--model_dir',
-      type=str,
-      default='',
-      help='Base directory for output models.'
-  )
-  parser.add_argument(
-      '--data_dir',
-      type=str,
-      default='/tmp/data/',
-      help='Directory for storing data'
-  )
-  parser.add_argument(
-      '--train_steps',
-      type=int,
-      default=1000,
-      help='Number of training steps.'
-  )
-  parser.add_argument(
-      '--batch_size',
-      type=str,
-      default=1000,
-      help='Number of examples in a training batch.'
-  )
-  parser.add_argument(
-      '--num_trees',
-      type=int,
-      default=100,
-      help='Number of trees in the forest.'
-  )
-  parser.add_argument(
-      '--max_nodes',
-      type=int,
-      default=1000,
-      help='Max total nodes in a single tree.'
-  )
-  parser.add_argument(
-      '--use_training_loss',
-      type=bool,
-      default=False,
-      help='If true, use training loss as termination criteria.'
-  )
-  FLAGS, unparsed = parser.parse_known_args()
-  app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/learn/resnet.py b/tensorflow/examples/learn/resnet.py
deleted file mode 100755
index c00de932a8707ad5717aaf1251cf5c88464a28b0..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/resnet.py
+++ /dev/null
@@ -1,202 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""This example builds deep residual network for mnist data.
-
-Reference Paper: http://arxiv.org/pdf/1512.03385.pdf
-
-Note that this is still a work-in-progress. Feel free to submit a PR
-to make this better.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import namedtuple
-from math import sqrt
-
-import numpy as np
-import tensorflow as tf
-
-
-N_DIGITS = 10  # Number of digits.
-X_FEATURE = 'x'  # Name of the input feature.
-
-
-def res_net_model(features, labels, mode):
-  """Builds a residual network."""
-
-  # Configurations for each bottleneck group.
-  BottleneckGroup = namedtuple('BottleneckGroup',
-                               ['num_blocks', 'num_filters', 'bottleneck_size'])
-  groups = [
-      BottleneckGroup(3, 128, 32), BottleneckGroup(3, 256, 64),
-      BottleneckGroup(3, 512, 128), BottleneckGroup(3, 1024, 256)
-  ]
-
-  x = features[X_FEATURE]
-  input_shape = x.get_shape().as_list()
-
-  # Reshape the input into the right shape if it's 2D tensor
-  if len(input_shape) == 2:
-    ndim = int(sqrt(input_shape[1]))
-    x = tf.reshape(x, [-1, ndim, ndim, 1])
-
-  training = (mode == tf.estimator.ModeKeys.TRAIN)
-  
-  # First convolution expands to 64 channels
-  with tf.variable_scope('conv_layer1'):
-    net = tf.layers.conv2d(
-        x,
-        filters=64,
-        kernel_size=7,
-        activation=tf.nn.relu)
-    net = tf.layers.batch_normalization(net, training=training)
-
-  # Max pool
-  net = tf.layers.max_pooling2d(
-      net, pool_size=3, strides=2, padding='same')
-
-  # First chain of resnets
-  with tf.variable_scope('conv_layer2'):
-    net = tf.layers.conv2d(
-        net,
-        filters=groups[0].num_filters,
-        kernel_size=1,
-        padding='valid')
-
-  # Create the bottleneck groups, each of which contains `num_blocks`
-  # bottleneck groups.
-  for group_i, group in enumerate(groups):
-    for block_i in range(group.num_blocks):
-      name = 'group_%d/block_%d' % (group_i, block_i)
-
-      # 1x1 convolution responsible for reducing dimension
-      with tf.variable_scope(name + '/conv_in'):
-        conv = tf.layers.conv2d(
-            net,
-            filters=group.num_filters,
-            kernel_size=1,
-            padding='valid',
-            activation=tf.nn.relu)
-        conv = tf.layers.batch_normalization(conv, training=training)
-
-      with tf.variable_scope(name + '/conv_bottleneck'):
-        conv = tf.layers.conv2d(
-            conv,
-            filters=group.bottleneck_size,
-            kernel_size=3,
-            padding='same',
-            activation=tf.nn.relu)
-        conv = tf.layers.batch_normalization(conv, training=training)
-
-      # 1x1 convolution responsible for restoring dimension
-      with tf.variable_scope(name + '/conv_out'):
-        input_dim = net.get_shape()[-1].value
-        conv = tf.layers.conv2d(
-            conv,
-            filters=input_dim,
-            kernel_size=1,
-            padding='valid',
-            activation=tf.nn.relu)
-        conv = tf.layers.batch_normalization(conv, training=training)
-
-      # shortcut connections that turn the network into its counterpart
-      # residual function (identity shortcut)
-      net = conv + net
-
-    try:
-      # upscale to the next group size
-      next_group = groups[group_i + 1]
-      with tf.variable_scope('block_%d/conv_upscale' % group_i):
-        net = tf.layers.conv2d(
-            net,
-            filters=next_group.num_filters,
-            kernel_size=1,
-            padding='same',
-            activation=None,
-            bias_initializer=None)
-    except IndexError:
-      pass
-
-  net_shape = net.get_shape().as_list()
-  net = tf.nn.avg_pool(
-      net,
-      ksize=[1, net_shape[1], net_shape[2], 1],
-      strides=[1, 1, 1, 1],
-      padding='VALID')
-
-  net_shape = net.get_shape().as_list()
-  net = tf.reshape(net, [-1, net_shape[1] * net_shape[2] * net_shape[3]])
-
-  # Compute logits (1 per class) and compute loss.
-  logits = tf.layers.dense(net, N_DIGITS, activation=None)
-
-  # Compute predictions.
-  predicted_classes = tf.argmax(logits, 1)
-  if mode == tf.estimator.ModeKeys.PREDICT:
-    predictions = {
-        'class': predicted_classes,
-        'prob': tf.nn.softmax(logits)
-    }
-    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
-
-  # Compute loss.
-  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-
-  # Create training op.
-  if training:
-    optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
-    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
-    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
-
-  # Compute evaluation metrics.
-  eval_metric_ops = {
-      'accuracy': tf.metrics.accuracy(
-          labels=labels, predictions=predicted_classes)
-  }
-  return tf.estimator.EstimatorSpec(
-      mode, loss=loss, eval_metric_ops=eval_metric_ops)
-
-
-def main(unused_args):
-  # Download and load MNIST data.
-  mnist = tf.contrib.learn.datasets.DATASETS['mnist']('/tmp/mnist')
-
-  # Create a new resnet classifier.
-  classifier = tf.estimator.Estimator(model_fn=res_net_model)
-
-  tf.logging.set_verbosity(tf.logging.INFO)  # Show training logs.
-
-  # Train model and save summaries into logdir.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: mnist.train.images},
-      y=mnist.train.labels.astype(np.int32),
-      batch_size=100,
-      num_epochs=None,
-      shuffle=True)
-  classifier.train(input_fn=train_input_fn, steps=100)
-
-  # Calculate accuracy.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={X_FEATURE: mnist.test.images},
-      y=mnist.test.labels.astype(np.int32),
-      num_epochs=1,
-      shuffle=False)
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy: {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  tf.app.run()
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
deleted file mode 100644
index e4e61862b02f9827f42c8d0052a7be8a57502dd8..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/text_classification.py
+++ /dev/null
@@ -1,180 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of Estimator for DNN-based text classification with DBpedia data."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-import numpy as np
-import pandas
-from sklearn import metrics
-import tensorflow as tf
-
-FLAGS = None
-
-MAX_DOCUMENT_LENGTH = 10
-EMBEDDING_SIZE = 50
-n_words = 0
-MAX_LABEL = 15
-WORDS_FEATURE = 'words'  # Name of the input words feature.
-
-
-def estimator_spec_for_softmax_classification(logits, labels, mode):
-  """Returns EstimatorSpec instance for softmax classification."""
-  predicted_classes = tf.argmax(logits, 1)
-  if mode == tf.estimator.ModeKeys.PREDICT:
-    return tf.estimator.EstimatorSpec(
-        mode=mode,
-        predictions={
-            'class': predicted_classes,
-            'prob': tf.nn.softmax(logits)
-        })
-
-  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-  if mode == tf.estimator.ModeKeys.TRAIN:
-    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
-    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
-    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
-
-  eval_metric_ops = {
-      'accuracy':
-          tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
-  }
-  return tf.estimator.EstimatorSpec(
-      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
-
-
-def bag_of_words_model(features, labels, mode):
-  """A bag-of-words model. Note it disregards the word order in the text."""
-  bow_column = tf.feature_column.categorical_column_with_identity(
-      WORDS_FEATURE, num_buckets=n_words)
-  bow_embedding_column = tf.feature_column.embedding_column(
-      bow_column, dimension=EMBEDDING_SIZE)
-  bow = tf.feature_column.input_layer(
-      features, feature_columns=[bow_embedding_column])
-  logits = tf.layers.dense(bow, MAX_LABEL, activation=None)
-
-  return estimator_spec_for_softmax_classification(
-      logits=logits, labels=labels, mode=mode)
-
-
-def rnn_model(features, labels, mode):
-  """RNN model to predict from sequence of words to a class."""
-  # Convert indexes of words into embeddings.
-  # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
-  # maps word indexes of the sequence into [batch_size, sequence_length,
-  # EMBEDDING_SIZE].
-  word_vectors = tf.contrib.layers.embed_sequence(
-      features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
-
-  # Split into list of embedding per word, while removing doc length dim.
-  # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
-  word_list = tf.unstack(word_vectors, axis=1)
-
-  # Create a Gated Recurrent Unit cell with hidden size of EMBEDDING_SIZE.
-  cell = tf.nn.rnn_cell.GRUCell(EMBEDDING_SIZE)
-
-  # Create an unrolled Recurrent Neural Networks to length of
-  # MAX_DOCUMENT_LENGTH and passes word_list as inputs for each unit.
-  _, encoding = tf.nn.static_rnn(cell, word_list, dtype=tf.float32)
-
-  # Given encoding of RNN, take encoding of last step (e.g hidden size of the
-  # neural network of last step) and pass it as features for softmax
-  # classification over output classes.
-  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
-  return estimator_spec_for_softmax_classification(
-      logits=logits, labels=labels, mode=mode)
-
-
-def main(unused_argv):
-  global n_words
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  # Prepare training and testing data
-  dbpedia = tf.contrib.learn.datasets.load_dataset(
-      'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
-  x_train = pandas.Series(dbpedia.train.data[:, 1])
-  y_train = pandas.Series(dbpedia.train.target)
-  x_test = pandas.Series(dbpedia.test.data[:, 1])
-  y_test = pandas.Series(dbpedia.test.target)
-
-  # Process vocabulary
-  vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
-      MAX_DOCUMENT_LENGTH)
-
-  x_transform_train = vocab_processor.fit_transform(x_train)
-  x_transform_test = vocab_processor.transform(x_test)
-
-  x_train = np.array(list(x_transform_train))
-  x_test = np.array(list(x_transform_test))
-
-  n_words = len(vocab_processor.vocabulary_)
-  print('Total words: %d' % n_words)
-
-  # Build model
-  # Switch between rnn_model and bag_of_words_model to test different models.
-  model_fn = rnn_model
-  if FLAGS.bow_model:
-    # Subtract 1 because VocabularyProcessor outputs a word-id matrix where word
-    # ids start from 1 and 0 means 'no word'. But
-    # categorical_column_with_identity assumes 0-based count and uses -1 for
-    # missing word.
-    x_train -= 1
-    x_test -= 1
-    model_fn = bag_of_words_model
-  classifier = tf.estimator.Estimator(model_fn=model_fn)
-
-  # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={WORDS_FEATURE: x_train},
-      y=y_train,
-      batch_size=len(x_train),
-      num_epochs=None,
-      shuffle=True)
-  classifier.train(input_fn=train_input_fn, steps=100)
-
-  # Predict.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={WORDS_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
-  predictions = classifier.predict(input_fn=test_input_fn)
-  y_predicted = np.array(list(p['class'] for p in predictions))
-  y_predicted = y_predicted.reshape(np.array(y_test).shape)
-
-  # Score with sklearn.
-  score = metrics.accuracy_score(y_test, y_predicted)
-  print('Accuracy (sklearn): {0:f}'.format(score))
-
-  # Score with tensorflow.
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy (tensorflow): {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--test_with_fake_data',
-      default=False,
-      help='Test the example code with fake data.',
-      action='store_true')
-  parser.add_argument(
-      '--bow_model',
-      default=False,
-      help='Run with BOW model instead of RNN.',
-      action='store_true')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/learn/text_classification_character_cnn.py b/tensorflow/examples/learn/text_classification_character_cnn.py
deleted file mode 100644
index b8506fa8a42b7723097e152c4cf04797391b3706..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/text_classification_character_cnn.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of using convolutional networks over characters for DBpedia dataset.
-
-This model is similar to one described in this paper:
-   "Character-level Convolutional Networks for Text Classification"
-   http://arxiv.org/abs/1509.01626
-
-and is somewhat alternative to the Lua code from here:
-   https://github.com/zhangxiangxiao/Crepe
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-import numpy as np
-import pandas
-import tensorflow as tf
-
-FLAGS = None
-
-MAX_DOCUMENT_LENGTH = 100
-N_FILTERS = 10
-FILTER_SHAPE1 = [20, 256]
-FILTER_SHAPE2 = [20, N_FILTERS]
-POOLING_WINDOW = 4
-POOLING_STRIDE = 2
-MAX_LABEL = 15
-CHARS_FEATURE = 'chars'  # Name of the input character feature.
-
-
-def char_cnn_model(features, labels, mode):
-  """Character level convolutional neural network model to predict classes."""
-  features_onehot = tf.one_hot(features[CHARS_FEATURE], 256)
-  input_layer = tf.reshape(
-      features_onehot, [-1, MAX_DOCUMENT_LENGTH, 256, 1])
-  with tf.variable_scope('CNN_Layer1'):
-    # Apply Convolution filtering on input sequence.
-    conv1 = tf.layers.conv2d(
-        input_layer,
-        filters=N_FILTERS,
-        kernel_size=FILTER_SHAPE1,
-        padding='VALID',
-        # Add a ReLU for non linearity.
-        activation=tf.nn.relu)
-    # Max pooling across output of Convolution+Relu.
-    pool1 = tf.layers.max_pooling2d(
-        conv1,
-        pool_size=POOLING_WINDOW,
-        strides=POOLING_STRIDE,
-        padding='SAME')
-    # Transpose matrix so that n_filters from convolution becomes width.
-    pool1 = tf.transpose(pool1, [0, 1, 3, 2])
-  with tf.variable_scope('CNN_Layer2'):
-    # Second level of convolution filtering.
-    conv2 = tf.layers.conv2d(
-        pool1,
-        filters=N_FILTERS,
-        kernel_size=FILTER_SHAPE2,
-        padding='VALID')
-    # Max across each filter to get useful features for classification.
-    pool2 = tf.squeeze(tf.reduce_max(conv2, 1), axis=[1])
-
-  # Apply regular WX + B and classification.
-  logits = tf.layers.dense(pool2, MAX_LABEL, activation=None)
-
-  predicted_classes = tf.argmax(logits, 1)
-  if mode == tf.estimator.ModeKeys.PREDICT:
-    return tf.estimator.EstimatorSpec(
-        mode=mode,
-        predictions={
-            'class': predicted_classes,
-            'prob': tf.nn.softmax(logits)
-        })
-
-  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-  if mode == tf.estimator.ModeKeys.TRAIN:
-    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
-    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
-    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
-
-  eval_metric_ops = {
-      'accuracy': tf.metrics.accuracy(
-          labels=labels, predictions=predicted_classes)
-  }
-  return tf.estimator.EstimatorSpec(
-      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
-
-
-def main(unused_argv):
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  # Prepare training and testing data
-  dbpedia = tf.contrib.learn.datasets.load_dataset(
-      'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data, size='large')
-  x_train = pandas.DataFrame(dbpedia.train.data)[1]
-  y_train = pandas.Series(dbpedia.train.target)
-  x_test = pandas.DataFrame(dbpedia.test.data)[1]
-  y_test = pandas.Series(dbpedia.test.target)
-
-  # Process vocabulary
-  char_processor = tf.contrib.learn.preprocessing.ByteProcessor(
-      MAX_DOCUMENT_LENGTH)
-  x_train = np.array(list(char_processor.fit_transform(x_train)))
-  x_test = np.array(list(char_processor.transform(x_test)))
-
-  x_train = x_train.reshape([-1, MAX_DOCUMENT_LENGTH, 1, 1])
-  x_test = x_test.reshape([-1, MAX_DOCUMENT_LENGTH, 1, 1])
-
-  # Build model
-  classifier = tf.estimator.Estimator(model_fn=char_cnn_model)
-
-  # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={CHARS_FEATURE: x_train},
-      y=y_train,
-      batch_size=128,
-      num_epochs=None,
-      shuffle=True)
-  classifier.train(input_fn=train_input_fn, steps=100)
-
-  # Predict.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={CHARS_FEATURE: x_test},
-      y=y_test,
-      num_epochs=1,
-      shuffle=False)
-  predictions = classifier.predict(input_fn=test_input_fn)
-  y_predicted = np.array(list(p['class'] for p in predictions))
-  y_predicted = y_predicted.reshape(np.array(y_test).shape)
-
-  # Score with tensorflow.
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy: {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--test_with_fake_data',
-      default=False,
-      help='Test the example code with fake data.',
-      action='store_true')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/learn/text_classification_character_rnn.py b/tensorflow/examples/learn/text_classification_character_rnn.py
deleted file mode 100644
index 15733821fb17eb17269fea295020f6690bb62854..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/text_classification_character_rnn.py
+++ /dev/null
@@ -1,122 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of recurrent neural networks over characters for DBpedia dataset.
-
-This model is similar to one described in this paper:
-   "Character-level Convolutional Networks for Text Classification"
-   http://arxiv.org/abs/1509.01626
-
-and is somewhat alternative to the Lua code from here:
-   https://github.com/zhangxiangxiao/Crepe
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-import numpy as np
-import pandas
-import tensorflow as tf
-
-FLAGS = None
-
-MAX_DOCUMENT_LENGTH = 100
-HIDDEN_SIZE = 20
-MAX_LABEL = 15
-CHARS_FEATURE = 'chars'  # Name of the input character feature.
-
-
-def char_rnn_model(features, labels, mode):
-  """Character level recurrent neural network model to predict classes."""
-  byte_vectors = tf.one_hot(features[CHARS_FEATURE], 256, 1., 0.)
-  byte_list = tf.unstack(byte_vectors, axis=1)
-
-  cell = tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE)
-  _, encoding = tf.nn.static_rnn(cell, byte_list, dtype=tf.float32)
-
-  logits = tf.layers.dense(encoding, MAX_LABEL, activation=None)
-
-  predicted_classes = tf.argmax(logits, 1)
-  if mode == tf.estimator.ModeKeys.PREDICT:
-    return tf.estimator.EstimatorSpec(
-        mode=mode,
-        predictions={
-            'class': predicted_classes,
-            'prob': tf.nn.softmax(logits)
-        })
-
-  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-  if mode == tf.estimator.ModeKeys.TRAIN:
-    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
-    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
-    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
-
-  eval_metric_ops = {
-      'accuracy': tf.metrics.accuracy(
-          labels=labels, predictions=predicted_classes)
-  }
-  return tf.estimator.EstimatorSpec(
-      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
-
-
-def main(unused_argv):
-  # Prepare training and testing data
-  dbpedia = tf.contrib.learn.datasets.load_dataset(
-      'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
-  x_train = pandas.DataFrame(dbpedia.train.data)[1]
-  y_train = pandas.Series(dbpedia.train.target)
-  x_test = pandas.DataFrame(dbpedia.test.data)[1]
-  y_test = pandas.Series(dbpedia.test.target)
-
-  # Process vocabulary
-  char_processor = tf.contrib.learn.preprocessing.ByteProcessor(
-      MAX_DOCUMENT_LENGTH)
-  x_train = np.array(list(char_processor.fit_transform(x_train)))
-  x_test = np.array(list(char_processor.transform(x_test)))
-
-  # Build model
-  classifier = tf.estimator.Estimator(model_fn=char_rnn_model)
-
-  # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={CHARS_FEATURE: x_train},
-      y=y_train,
-      batch_size=128,
-      num_epochs=None,
-      shuffle=True)
-  classifier.train(input_fn=train_input_fn, steps=100)
-
-  # Eval.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={CHARS_FEATURE: x_test},
-      y=y_test,
-      num_epochs=1,
-      shuffle=False)
-
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy: {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--test_with_fake_data',
-      default=False,
-      help='Test the example code with fake data.',
-      action='store_true')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/learn/text_classification_cnn.py b/tensorflow/examples/learn/text_classification_cnn.py
deleted file mode 100644
index a40a9eaecbd9bbaea38f49cd10463c5ba9782e40..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/text_classification_cnn.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Example of Estimator for CNN-based text classification with DBpedia data."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-import numpy as np
-import pandas
-import tensorflow as tf
-
-FLAGS = None
-
-MAX_DOCUMENT_LENGTH = 100
-EMBEDDING_SIZE = 20
-N_FILTERS = 10
-WINDOW_SIZE = 20
-FILTER_SHAPE1 = [WINDOW_SIZE, EMBEDDING_SIZE]
-FILTER_SHAPE2 = [WINDOW_SIZE, N_FILTERS]
-POOLING_WINDOW = 4
-POOLING_STRIDE = 2
-n_words = 0
-MAX_LABEL = 15
-WORDS_FEATURE = 'words'  # Name of the input words feature.
-
-
-def cnn_model(features, labels, mode):
-  """2 layer ConvNet to predict from sequence of words to a class."""
-  # Convert indexes of words into embeddings.
-  # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
-  # maps word indexes of the sequence into [batch_size, sequence_length,
-  # EMBEDDING_SIZE].
-  word_vectors = tf.contrib.layers.embed_sequence(
-      features[WORDS_FEATURE], vocab_size=n_words, embed_dim=EMBEDDING_SIZE)
-  word_vectors = tf.expand_dims(word_vectors, 3)
-  with tf.variable_scope('CNN_Layer1'):
-    # Apply Convolution filtering on input sequence.
-    conv1 = tf.layers.conv2d(
-        word_vectors,
-        filters=N_FILTERS,
-        kernel_size=FILTER_SHAPE1,
-        padding='VALID',
-        # Add a ReLU for non linearity.
-        activation=tf.nn.relu)
-    # Max pooling across output of Convolution+Relu.
-    pool1 = tf.layers.max_pooling2d(
-        conv1,
-        pool_size=POOLING_WINDOW,
-        strides=POOLING_STRIDE,
-        padding='SAME')
-    # Transpose matrix so that n_filters from convolution becomes width.
-    pool1 = tf.transpose(pool1, [0, 1, 3, 2])
-  with tf.variable_scope('CNN_Layer2'):
-    # Second level of convolution filtering.
-    conv2 = tf.layers.conv2d(
-        pool1,
-        filters=N_FILTERS,
-        kernel_size=FILTER_SHAPE2,
-        padding='VALID')
-    # Max across each filter to get useful features for classification.
-    pool2 = tf.squeeze(tf.reduce_max(conv2, 1), axis=[1])
-
-  # Apply regular WX + B and classification.
-  logits = tf.layers.dense(pool2, MAX_LABEL, activation=None)
-
-  predicted_classes = tf.argmax(logits, 1)
-  if mode == tf.estimator.ModeKeys.PREDICT:
-    return tf.estimator.EstimatorSpec(
-        mode=mode,
-        predictions={
-            'class': predicted_classes,
-            'prob': tf.nn.softmax(logits)
-        })
-
-  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
-  if mode == tf.estimator.ModeKeys.TRAIN:
-    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
-    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
-    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
-
-  eval_metric_ops = {
-      'accuracy': tf.metrics.accuracy(
-          labels=labels, predictions=predicted_classes)
-  }
-  return tf.estimator.EstimatorSpec(
-      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
-
-
-def main(unused_argv):
-  global n_words
-  # Prepare training and testing data
-  dbpedia = tf.contrib.learn.datasets.load_dataset(
-      'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
-  x_train = pandas.DataFrame(dbpedia.train.data)[1]
-  y_train = pandas.Series(dbpedia.train.target)
-  x_test = pandas.DataFrame(dbpedia.test.data)[1]
-  y_test = pandas.Series(dbpedia.test.target)
-
-  # Process vocabulary
-  vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
-      MAX_DOCUMENT_LENGTH)
-  x_train = np.array(list(vocab_processor.fit_transform(x_train)))
-  x_test = np.array(list(vocab_processor.transform(x_test)))
-  n_words = len(vocab_processor.vocabulary_)
-  print('Total words: %d' % n_words)
-
-  # Build model
-  classifier = tf.estimator.Estimator(model_fn=cnn_model)
-
-  # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={WORDS_FEATURE: x_train},
-      y=y_train,
-      batch_size=len(x_train),
-      num_epochs=None,
-      shuffle=True)
-  classifier.train(input_fn=train_input_fn, steps=100)
-
-  # Evaluate.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={WORDS_FEATURE: x_test},
-      y=y_test,
-      num_epochs=1,
-      shuffle=False)
-
-  scores = classifier.evaluate(input_fn=test_input_fn)
-  print('Accuracy: {0:f}'.format(scores['accuracy']))
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--test_with_fake_data',
-      default=False,
-      help='Test the example code with fake data.',
-      action='store_true')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/saved_model/saved_model_half_plus_two.py b/tensorflow/examples/saved_model/saved_model_half_plus_two.py
index 2d1e0c6f6de88ae116fe1951ca24505d41743fa9..dfdde445404a5ec99f3d821dff6d9f217bfadefc 100644
--- a/tensorflow/examples/saved_model/saved_model_half_plus_two.py
+++ b/tensorflow/examples/saved_model/saved_model_half_plus_two.py
@@ -153,11 +153,14 @@ def _generate_saved_model_for_half_plus_two(export_dir,
         tf_example = tf.parse_example(serialized_tf_example, feature_configs)
       # Use tf.identity() to assign name
       x = tf.identity(tf_example["x"], name="x")
-      y = tf.add(tf.multiply(a, x), b, name="y")
-      y2 = tf.add(tf.multiply(a, x), c, name="y2")
+      y = tf.add(tf.multiply(a, x), b)
+      y = tf.identity(y, name="y")
+      y2 = tf.add(tf.multiply(a, x), c)
+      y2 = tf.identity(y2, name="y2")
 
       x2 = tf.identity(tf_example["x2"], name="x2")
-      y3 = tf.add(tf.multiply(a, x2), c, name="y3")
+      y3 = tf.add(tf.multiply(a, x2), c)
+      y3 = tf.identity(y3, name="y3")
 
     # Create an assets file that can be saved and restored as part of the
     # SavedModel.
@@ -215,7 +218,7 @@ def _generate_saved_model_for_half_plus_two(export_dir,
           sess, [tf.saved_model.tag_constants.SERVING],
           signature_def_map=signature_def_map,
           assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS),
-          legacy_init_op=tf.group(assign_filename_op))
+          main_op=tf.group(assign_filename_op))
   builder.save(as_text)
 
 
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD
index 7a44e2ee4fdf690ce576f720bb371785f88779b4..7f3c764fac62ee11c6351e11229198fc726d3804 100644
--- a/tensorflow/examples/speech_commands/BUILD
+++ b/tensorflow/examples/speech_commands/BUILD
@@ -101,6 +101,31 @@ tf_py_test(
     ],
 )
 
+py_binary(
+    name = "wav_to_features",
+    srcs = [
+        "wav_to_features.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":input_data",
+        ":models",
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+tf_py_test(
+    name = "wav_to_features_test",
+    size = "small",
+    srcs = ["wav_to_features_test.py"],
+    additional_deps = [
+        ":wav_to_features",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_binary(
     name = "generate_streaming_test_wav",
     srcs = [
diff --git a/tensorflow/examples/speech_commands/freeze_test.py b/tensorflow/examples/speech_commands/freeze_test.py
index 0c7ca9bc011886f4b8155b7f1d876ce183221ad4..9ed9050035baee7081ff7413c1c2fc41b86c607d 100644
--- a/tensorflow/examples/speech_commands/freeze_test.py
+++ b/tensorflow/examples/speech_commands/freeze_test.py
@@ -19,11 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.examples.speech_commands import freeze
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class FreezeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCreateInferenceGraphWithMfcc(self):
     with self.cached_session() as sess:
       freeze.create_inference_graph(
@@ -43,6 +45,7 @@ class FreezeTest(test.TestCase):
       ops = [node.op for node in sess.graph_def.node]
       self.assertEqual(1, ops.count('Mfcc'))
 
+  @test_util.run_deprecated_v1
   def testCreateInferenceGraphWithoutMfcc(self):
     with self.cached_session() as sess:
       freeze.create_inference_graph(
@@ -62,6 +65,7 @@ class FreezeTest(test.TestCase):
       ops = [node.op for node in sess.graph_def.node]
       self.assertEqual(0, ops.count('Mfcc'))
 
+  @test_util.run_deprecated_v1
   def testFeatureBinCount(self):
     with self.cached_session() as sess:
       freeze.create_inference_graph(
diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py
index 30f2cfa9fef7d0b5800c7e557bde4702dbafaf26..1079a302fa47bea7f5dadd35165bae2b090bb2bc 100644
--- a/tensorflow/examples/speech_commands/input_data.py
+++ b/tensorflow/examples/speech_commands/input_data.py
@@ -148,18 +148,46 @@ def save_wav_file(filename, wav_data, sample_rate):
         })
 
 
+def get_features_range(model_settings):
+  """Returns the expected min/max for generated features.
+
+  Args:
+    model_settings: Information about the current model being trained.
+
+  Returns:
+    Min/max float pair holding the range of features.
+
+  Raises:
+    Exception: If preprocessing mode isn't recognized.
+  """
+  # TODO(petewarden): These values have been derived from the observed ranges
+  # of spectrogram and MFCC inputs. If the preprocessing pipeline changes,
+  # they may need to be updated.
+  if model_settings['preprocess'] == 'average':
+    features_min = 0.0
+    features_max = 127.5
+  elif model_settings['preprocess'] == 'mfcc':
+    features_min = -247.0
+    features_max = 30.0
+  else:
+    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
+                    ' "average")' % (model_settings['preprocess']))
+  return features_min, features_max
+
+
 class AudioProcessor(object):
   """Handles loading, partitioning, and preparing audio training data."""
 
   def __init__(self, data_url, data_dir, silence_percentage, unknown_percentage,
                wanted_words, validation_percentage, testing_percentage,
                model_settings, summaries_dir):
-    self.data_dir = data_dir
-    self.maybe_download_and_extract_dataset(data_url, data_dir)
-    self.prepare_data_index(silence_percentage, unknown_percentage,
-                            wanted_words, validation_percentage,
-                            testing_percentage)
-    self.prepare_background_data()
+    if data_dir:
+      self.data_dir = data_dir
+      self.maybe_download_and_extract_dataset(data_url, data_dir)
+      self.prepare_data_index(silence_percentage, unknown_percentage,
+                              wanted_words, validation_percentage,
+                              testing_percentage)
+      self.prepare_background_data()
     self.prepare_processing_graph(model_settings, summaries_dir)
 
   def maybe_download_and_extract_dataset(self, data_url, dest_directory):
@@ -421,8 +449,9 @@ class AudioProcessor(object):
       # Merge all the summaries and write them out to /tmp/retrain_logs (by
       # default)
       self.merged_summaries_ = tf.summary.merge_all(scope='data')
-      self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
-                                                   tf.get_default_graph())
+      if summaries_dir:
+        self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
+                                                     tf.get_default_graph())
 
   def set_size(self, mode):
     """Calculates the number of samples in the dataset partition.
@@ -538,6 +567,34 @@ class AudioProcessor(object):
       labels[i - offset] = label_index
     return data, labels
 
+  def get_features_for_wav(self, wav_filename, model_settings, sess):
+    """Applies the feature transformation process to the input_wav.
+
+    Runs the feature generation process (generally producing a spectrogram from
+    the input samples) on the WAV file. This can be useful for testing and
+    verifying implementations being run on other platforms.
+
+    Args:
+      wav_filename: The path to the input audio file.
+      model_settings: Information about the current model being trained.
+      sess: TensorFlow session that was active when processor was created.
+
+    Returns:
+      Numpy data array containing the generated features.
+    """
+    desired_samples = model_settings['desired_samples']
+    input_dict = {
+        self.wav_filename_placeholder_: wav_filename,
+        self.time_shift_padding_placeholder_: [[0, 0], [0, 0]],
+        self.time_shift_offset_placeholder_: [0, 0],
+        self.background_data_placeholder_: np.zeros([desired_samples, 1]),
+        self.background_volume_placeholder_: 0,
+        self.foreground_volume_placeholder_: 1,
+    }
+    # Run the graph to produce the output audio.
+    data_tensor = sess.run([self.output_], feed_dict=input_dict)
+    return data_tensor
+
   def get_unprocessed_data(self, how_many, model_settings, mode):
     """Retrieve sample data for the given partition, with no transformations.
 
diff --git a/tensorflow/examples/speech_commands/input_data_test.py b/tensorflow/examples/speech_commands/input_data_test.py
index aa4e80777941accd7608bc9b61f4f3c8c5baaa85..9269bb6c0bc780e06ee0c42617478e3a1486100e 100644
--- a/tensorflow/examples/speech_commands/input_data_test.py
+++ b/tensorflow/examples/speech_commands/input_data_test.py
@@ -26,6 +26,7 @@ import tensorflow as tf
 from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
 from tensorflow.examples.speech_commands import input_data
 from tensorflow.examples.speech_commands import models
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -35,7 +36,7 @@ class InputDataTest(test.TestCase):
     with self.cached_session() as sess:
       sample_data = tf.zeros([32000, 2])
       wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
-      wav_data = sess.run(wav_encoder)
+      wav_data = self.evaluate(wav_encoder)
     return wav_data
 
   def _saveTestWavFile(self, filename, wav_data):
@@ -96,6 +97,7 @@ class InputDataTest(test.TestCase):
         input_data.which_set("foo_nohash_0.wav", 10, 10),
         input_data.which_set("foo_nohash_1.wav", 10, 10))
 
+  @test_util.run_deprecated_v1
   def testPrepareDataIndex(self):
     tmp_dir = self.get_temp_dir()
     self._saveWavFolders(tmp_dir, ["a", "b", "c"], 100)
@@ -125,6 +127,7 @@ class InputDataTest(test.TestCase):
                                     10, self._model_settings(), tmp_dir)
     self.assertTrue("Expected to find" in str(e.exception))
 
+  @test_util.run_deprecated_v1
   def testPrepareBackgroundData(self):
     tmp_dir = self.get_temp_dir()
     background_dir = os.path.join(tmp_dir, "_background_noise_")
@@ -156,6 +159,7 @@ class InputDataTest(test.TestCase):
     self.assertIsNotNone(loaded_data)
     self.assertEqual(16000, len(loaded_data))
 
+  @test_util.run_deprecated_v1
   def testPrepareProcessingGraph(self):
     tmp_dir = self.get_temp_dir()
     wav_dir = os.path.join(tmp_dir, "wavs")
@@ -186,15 +190,19 @@ class InputDataTest(test.TestCase):
     self.assertIsNotNone(audio_processor.background_volume_placeholder_)
     self.assertIsNotNone(audio_processor.output_)
 
+  @test_util.run_deprecated_v1
   def testGetDataAverage(self):
     self._runGetDataTest("average", 10)
 
+  @test_util.run_deprecated_v1
   def testGetDataAverageLongWindow(self):
     self._runGetDataTest("average", 30)
 
+  @test_util.run_deprecated_v1
   def testGetDataMfcc(self):
     self._runGetDataTest("mfcc", 30)
 
+  @test_util.run_deprecated_v1
   def testGetUnprocessedData(self):
     tmp_dir = self.get_temp_dir()
     wav_dir = os.path.join(tmp_dir, "wavs")
@@ -216,6 +224,63 @@ class InputDataTest(test.TestCase):
     self.assertEqual(10, len(result_data))
     self.assertEqual(10, len(result_labels))
 
+  @test_util.run_deprecated_v1
+  def testGetFeaturesForWav(self):
+    tmp_dir = self.get_temp_dir()
+    wav_dir = os.path.join(tmp_dir, "wavs")
+    os.mkdir(wav_dir)
+    self._saveWavFolders(wav_dir, ["a", "b", "c"], 1)
+    desired_samples = 1600
+    model_settings = {
+        "desired_samples": desired_samples,
+        "fingerprint_size": 40,
+        "label_count": 4,
+        "window_size_samples": 100,
+        "window_stride_samples": 100,
+        "fingerprint_width": 40,
+        "average_window_width": 6,
+        "preprocess": "average",
+    }
+    with self.cached_session() as sess:
+      audio_processor = input_data.AudioProcessor(
+          "", wav_dir, 10, 10, ["a", "b"], 10, 10, model_settings, tmp_dir)
+      sample_data = np.zeros([desired_samples, 1])
+      for i in range(desired_samples):
+        phase = i % 4
+        if phase == 0:
+          sample_data[i, 0] = 0
+        elif phase == 1:
+          sample_data[i, 0] = -1
+        elif phase == 2:
+          sample_data[i, 0] = 0
+        elif phase == 3:
+          sample_data[i, 0] = 1
+      test_wav_path = os.path.join(tmp_dir, "test_wav.wav")
+      input_data.save_wav_file(test_wav_path, sample_data, 16000)
+
+      results = audio_processor.get_features_for_wav(test_wav_path,
+                                                     model_settings, sess)
+      spectrogram = results[0]
+      self.assertEqual(1, spectrogram.shape[0])
+      self.assertEqual(16, spectrogram.shape[1])
+      self.assertEqual(11, spectrogram.shape[2])
+      self.assertNear(0, spectrogram[0, 0, 0], 0.1)
+      self.assertNear(200, spectrogram[0, 0, 5], 0.1)
+
+  def testGetFeaturesRange(self):
+    model_settings = {
+        "preprocess": "average",
+    }
+    features_min, _ = input_data.get_features_range(model_settings)
+    self.assertNear(0.0, features_min, 1e-5)
+
+  def testGetMfccFeaturesRange(self):
+    model_settings = {
+        "preprocess": "mfcc",
+    }
+    features_min, features_max = input_data.get_features_range(model_settings)
+    self.assertLess(features_min, features_max)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/examples/speech_commands/label_wav_test.py b/tensorflow/examples/speech_commands/label_wav_test.py
index f0af2a4798785d53fe937fde45dbc9c9d67acfbc..77a88f98e165758994ddbbd21acab8823dcf5686 100644
--- a/tensorflow/examples/speech_commands/label_wav_test.py
+++ b/tensorflow/examples/speech_commands/label_wav_test.py
@@ -33,7 +33,7 @@ class LabelWavTest(test.TestCase):
     with self.cached_session() as sess:
       sample_data = tf.zeros([1000, 2])
       wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
-      wav_data = sess.run(wav_encoder)
+      wav_data = self.evaluate(wav_encoder)
     return wav_data
 
   def _saveTestWavFile(self, filename, wav_data):
diff --git a/tensorflow/examples/speech_commands/models_test.py b/tensorflow/examples/speech_commands/models_test.py
index 04478c09626f565e7d439afb45999f587da050ab..cb9304eab8df47800145e14e7e28c739af44292b 100644
--- a/tensorflow/examples/speech_commands/models_test.py
+++ b/tensorflow/examples/speech_commands/models_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import tensorflow as tf
 
 from tensorflow.examples.speech_commands import models
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -47,6 +48,7 @@ class ModelsTest(test.TestCase):
             feature_bin_count=40,
             preprocess="mfcc"))
 
+  @test_util.run_deprecated_v1
   def testCreateModelConvTraining(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
@@ -58,6 +60,7 @@ class ModelsTest(test.TestCase):
       self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name))
       self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_prob.name))
 
+  @test_util.run_deprecated_v1
   def testCreateModelConvInference(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
@@ -67,6 +70,7 @@ class ModelsTest(test.TestCase):
       self.assertIsNotNone(logits)
       self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name))
 
+  @test_util.run_deprecated_v1
   def testCreateModelLowLatencyConvTraining(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
@@ -78,6 +82,7 @@ class ModelsTest(test.TestCase):
       self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name))
       self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_prob.name))
 
+  @test_util.run_deprecated_v1
   def testCreateModelFullyConnectedTraining(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
@@ -98,6 +103,7 @@ class ModelsTest(test.TestCase):
                             "bad_architecture", True)
       self.assertTrue("not recognized" in str(e.exception))
 
+  @test_util.run_deprecated_v1
   def testCreateModelTinyConvTraining(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index eca34f8812b76a60168c97a745f5890bf3ee0269..f6e39b0b5519cad8f9c90500c960c613f6c8cf4c 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -125,18 +125,8 @@ def main(_):
   input_placeholder = tf.placeholder(
       tf.float32, [None, fingerprint_size], name='fingerprint_input')
   if FLAGS.quantize:
-    # TODO(petewarden): These values have been derived from the observed ranges
-    # of spectrogram and MFCC inputs. If the preprocessing pipeline changes,
-    # they may need to be updated.
-    if FLAGS.preprocess == 'average':
-      fingerprint_min = 0.0
-      fingerprint_max = 2048.0
-    elif FLAGS.preprocess == 'mfcc':
-      fingerprint_min = -247.0
-      fingerprint_max = 30.0
-    else:
-      raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
-                      ' "average")' % (FLAGS.preprocess))
+    fingerprint_min, fingerprint_max = input_data.get_features_range(
+        model_settings)
     fingerprint_input = tf.fake_quant_with_min_max_args(
         input_placeholder, fingerprint_min, fingerprint_max)
   else:
diff --git a/tensorflow/examples/speech_commands/wav_to_features.py b/tensorflow/examples/speech_commands/wav_to_features.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6c8f45c5bfaf8cdb669c3f024a27624be8d76ba
--- /dev/null
+++ b/tensorflow/examples/speech_commands/wav_to_features.py
@@ -0,0 +1,184 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Converts WAV audio files into input features for neural networks.
+
+The models used in this example take in two-dimensional spectrograms as the
+input to their neural network portions. For testing and porting purposes it's
+useful to be able to generate these spectrograms outside of the full model, so
+that on-device implementations using their own FFT and streaming code can be
+tested against the version used in training for example. The output is as a
+C source file, so it can be easily linked into an embedded test application.
+
+To use this, run:
+
+bazel run tensorflow/examples/speech_commands:wav_to_features -- \
+--input_wav=my.wav --output_c_file=my_wav_data.c
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os.path
+import sys
+
+import tensorflow as tf
+
+import input_data
+import models
+from tensorflow.python.platform import gfile
+
+FLAGS = None
+
+
+def wav_to_features(sample_rate, clip_duration_ms, window_size_ms,
+                    window_stride_ms, feature_bin_count, quantize, preprocess,
+                    input_wav, output_c_file):
+  """Converts an audio file into its corresponding feature map.
+
+  Args:
+    sample_rate: Expected sample rate of the wavs.
+    clip_duration_ms: Expected duration in milliseconds of the wavs.
+    window_size_ms: How long each spectrogram timeslice is.
+    window_stride_ms: How far to move in time between spectogram timeslices.
+    feature_bin_count: How many bins to use for the feature fingerprint.
+    quantize: Whether to train the model for eight-bit deployment.
+    preprocess: Spectrogram processing mode. Can be "mfcc" or "average".
+    input_wav: Path to the audio WAV file to read.
+    output_c_file: Where to save the generated C source file.
+  """
+
+  # Start a new TensorFlow session.
+  sess = tf.InteractiveSession()
+
+  model_settings = models.prepare_model_settings(
+      0, sample_rate, clip_duration_ms, window_size_ms, window_stride_ms,
+      feature_bin_count, preprocess)
+  audio_processor = input_data.AudioProcessor(None, None, 0, 0, '', 0, 0,
+                                              model_settings, None)
+
+  results = audio_processor.get_features_for_wav(input_wav, model_settings,
+                                                 sess)
+  features = results[0]
+
+  variable_base = os.path.splitext(os.path.basename(input_wav).lower())[0]
+
+  # Save a C source file containing the feature data as an array.
+  with gfile.GFile(output_c_file, 'w') as f:
+    f.write('/* File automatically created by\n')
+    f.write(' * tensorflow/examples/speech_commands/wav_to_features.py \\\n')
+    f.write(' * --sample_rate=%d \\\n' % sample_rate)
+    f.write(' * --clip_duration_ms=%d \\\n' % clip_duration_ms)
+    f.write(' * --window_size_ms=%d \\\n' % window_size_ms)
+    f.write(' * --window_stride_ms=%d \\\n' % window_stride_ms)
+    f.write(' * --feature_bin_count=%d \\\n' % feature_bin_count)
+    if quantize:
+      f.write(' * --quantize \\\n')
+    f.write(' * --preprocess="%s" \\\n' % preprocess)
+    f.write(' * --input_wav="%s" \\\n' % input_wav)
+    f.write(' * --output_c_file="%s" \\\n' % output_c_file)
+    f.write(' */\n\n')
+    f.write('const int g_%s_width = %d;\n' % (variable_base, features.shape[2]))
+    f.write(
+        'const int g_%s_height = %d;\n' % (variable_base, features.shape[1]))
+    if quantize:
+      features_min, features_max = input_data.get_features_range(model_settings)
+      f.write('const unsigned char g_%s_data[] = {' % variable_base)
+      i = 0
+      for value in features.flatten():
+        quantized_value = int(
+            round(
+                (255 * (value - features_min)) / (features_max - features_min)))
+        if quantized_value < 0:
+          quantized_value = 0
+        if quantized_value > 255:
+          quantized_value = 255
+        if i == 0:
+          f.write('\n  ')
+        f.write('%d, ' % quantized_value)
+        i = (i + 1) % 10
+    else:
+      f.write('const float g_%s_data[] = {\n' % variable_base)
+      i = 0
+      for value in features.flatten():
+        if i == 0:
+          f.write('\n  ')
+        f.write(' ,%f' % value)
+        i = (i + 1) % 10
+    f.write('\n};\n')
+
+
+def main(_):
+  # We want to see all the logging messages.
+  tf.logging.set_verbosity(tf.logging.INFO)
+  wav_to_features(FLAGS.sample_rate, FLAGS.clip_duration_ms,
+                  FLAGS.window_size_ms, FLAGS.window_stride_ms,
+                  FLAGS.feature_bin_count, FLAGS.quantize, FLAGS.preprocess,
+                  FLAGS.input_wav, FLAGS.output_c_file)
+  tf.logging.info('Wrote to "%s"' % (FLAGS.output_c_file))
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--sample_rate',
+      type=int,
+      default=16000,
+      help='Expected sample rate of the wavs',)
+  parser.add_argument(
+      '--clip_duration_ms',
+      type=int,
+      default=1000,
+      help='Expected duration in milliseconds of the wavs',)
+  parser.add_argument(
+      '--window_size_ms',
+      type=float,
+      default=30.0,
+      help='How long each spectrogram timeslice is.',)
+  parser.add_argument(
+      '--window_stride_ms',
+      type=float,
+      default=10.0,
+      help='How far to move in time between spectogram timeslices.',)
+  parser.add_argument(
+      '--feature_bin_count',
+      type=int,
+      default=40,
+      help='How many bins to use for the MFCC fingerprint',
+  )
+  parser.add_argument(
+      '--quantize',
+      type=bool,
+      default=False,
+      help='Whether to train the model for eight-bit deployment')
+  parser.add_argument(
+      '--preprocess',
+      type=str,
+      default='mfcc',
+      help='Spectrogram processing mode. Can be "mfcc" or "average"')
+  parser.add_argument(
+      '--input_wav',
+      type=str,
+      default=None,
+      help='Path to the audio WAV file to read')
+  parser.add_argument(
+      '--output_c_file',
+      type=str,
+      default=None,
+      help='Where to save the generated C source file containing the features')
+
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/speech_commands/wav_to_features_test.py b/tensorflow/examples/speech_commands/wav_to_features_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6234490b26760c99e3184cfc9a51b56169ec63bb
--- /dev/null
+++ b/tensorflow/examples/speech_commands/wav_to_features_test.py
@@ -0,0 +1,71 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for data input for speech commands."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+
+from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
+from tensorflow.examples.speech_commands import wav_to_features
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class WavToFeaturesTest(test.TestCase):
+
+  def _getWavData(self):
+    with self.cached_session() as sess:
+      sample_data = tf.zeros([32000, 2])
+      wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
+      wav_data = self.evaluate(wav_encoder)
+    return wav_data
+
+  def _saveTestWavFile(self, filename, wav_data):
+    with open(filename, "wb") as f:
+      f.write(wav_data)
+
+  def _saveWavFolders(self, root_dir, labels, how_many):
+    wav_data = self._getWavData()
+    for label in labels:
+      dir_name = os.path.join(root_dir, label)
+      os.mkdir(dir_name)
+      for i in range(how_many):
+        file_path = os.path.join(dir_name, "some_audio_%d.wav" % i)
+        self._saveTestWavFile(file_path, wav_data)
+
+  @test_util.run_deprecated_v1
+  def testWavToFeatures(self):
+    tmp_dir = self.get_temp_dir()
+    wav_dir = os.path.join(tmp_dir, "wavs")
+    os.mkdir(wav_dir)
+    self._saveWavFolders(wav_dir, ["a", "b", "c"], 100)
+    input_file_path = os.path.join(tmp_dir, "input.wav")
+    output_file_path = os.path.join(tmp_dir, "output.c")
+    wav_data = self._getWavData()
+    self._saveTestWavFile(input_file_path, wav_data)
+    wav_to_features.wav_to_features(16000, 1000, 10, 10, 40, True, "average",
+                                    input_file_path, output_file_path)
+    with open(output_file_path, "rb") as f:
+      content = f.read()
+      self.assertTrue(b"const unsigned char g_input_data" in content)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/examples/tf2_showcase/BUILD b/tensorflow/examples/tf2_showcase/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..922bc96b25bcc940685c7e01f033856e7d39f5f8
--- /dev/null
+++ b/tensorflow/examples/tf2_showcase/BUILD
@@ -0,0 +1,32 @@
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = ["//visibility:private"],
+)
+
+test_suite(
+    name = "all_tests",
+    tags = [
+        "manual",
+        "no_oss",
+        "notap",
+    ],
+    tests = [
+        ":mnist",
+    ],
+)
+
+py_test(
+    name = "mnist",
+    srcs = ["mnist.py"],
+    tags = [
+        "manual",
+        "no_oss",
+        "notap",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/absl:app",
+        "//third_party/py/absl/flags",
+    ],
+)
diff --git a/tensorflow/examples/tf2_showcase/README.md b/tensorflow/examples/tf2_showcase/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8211fb1d30d8f9da8966d4babed05c33367bcecc
--- /dev/null
+++ b/tensorflow/examples/tf2_showcase/README.md
@@ -0,0 +1,25 @@
+# TF 2.0 Showcase
+
+The code here shows idiomatic ways to write TensorFlow 2.0 code. It doubles as
+an integration test.
+
+## General guidelines for showcase code:
+
+- Code should minimize dependencies and be self-contained in one file. A user
+  should be able to copy-paste the example code into their project and have it
+  just work.
+- Code should emphasize simplicity over performance, as long as it performs
+  within a factor of 2-3x of the optimized implementation.
+- Code should work on CPU and single GPU.
+- Code should run in Python 3.
+- Code should conform to the [Google Python Style Guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md)
+
+
+- Code should follow these guidelines:
+  - Prefer Keras.
+  - Split code into separate input pipeline and model code segments.
+  - Don't use tf.cond or tf.while_loop; instead, make use of AutoGraph's
+    functionality to compile Python `for`, `while`, and `if` statements.
+  - Prefer a simple training loop over Estimator
+  - Save and restore a SavedModel.
+  - Write basic TensorBoard metrics - loss, accuracy,
diff --git a/tensorflow/examples/tf2_showcase/mnist.py b/tensorflow/examples/tf2_showcase/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4bfe4e53a8e16f7bb615ce481018cb3dce02150
--- /dev/null
+++ b/tensorflow/examples/tf2_showcase/mnist.py
@@ -0,0 +1,262 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""MNIST model training with TensorFlow eager execution.
+
+See:
+https://research.googleblog.com/2017/10/eager-execution-imperative-define-by.html
+
+This program demonstrates training, export, and inference of a convolutional
+neural network model with eager execution enabled.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+
+from absl import app
+from absl import flags
+import numpy as np
+import tensorflow as tf
+
+tfe = tf.contrib.eager
+
+flags.DEFINE_integer(
+    name='log_interval',
+    default=10,
+    help='batches between logging training status')
+
+flags.DEFINE_float(name='learning_rate', default=0.01, help='Learning rate.')
+
+flags.DEFINE_float(
+    name='momentum', short_name='m', default=0.5, help='SGD momentum.')
+
+flags.DEFINE_integer(
+    name='batch_size',
+    default=100,
+    help='Batch size to use during training / eval')
+
+flags.DEFINE_integer(
+    name='train_epochs', default=10, help='Number of epochs to train')
+
+flags.DEFINE_string(
+    name='model_dir',
+    default='/tmp/tensorflow/mnist',
+    help='Where to save checkpoints, tensorboard summaries, etc.')
+
+flags.DEFINE_bool(
+    name='clean',
+    default=False,
+    help='Whether to clear model directory before training')
+
+FLAGS = flags.FLAGS
+
+
+def create_model():
+  """Model to recognize digits in the MNIST dataset.
+
+  Network structure is equivalent to:
+  https://github.com/tensorflow/tensorflow/blob/r1.5/tensorflow/examples/tutorials/mnist/mnist_deep.py
+  and
+  https://github.com/tensorflow/models/blob/master/tutorials/image/mnist/convolutional.py
+  But uses the tf.keras API.
+  Returns:
+    A tf.keras.Model.
+  """
+  # Assumes data_format == 'channel_last'.
+  # See https://www.tensorflow.org/performance/performance_guide#data_formats
+
+  input_shape = [28, 28, 1]
+
+  l = tf.keras.layers
+  max_pool = l.MaxPooling2D((2, 2), (2, 2), padding='same')
+  # The model consists of a sequential chain of layers, so tf.keras.Sequential
+  # (a subclass of tf.keras.Model) makes for a compact description.
+  model = tf.keras.Sequential(
+      [
+          l.Reshape(
+              target_shape=input_shape,
+              input_shape=(28 * 28,)),
+          l.Conv2D(2, 5, padding='same', activation=tf.nn.relu),
+          max_pool,
+          l.Conv2D(4, 5, padding='same', activation=tf.nn.relu),
+          max_pool,
+          l.Flatten(),
+          l.Dense(32, activation=tf.nn.relu),
+          l.Dropout(0.4),
+          l.Dense(10)
+      ])
+  # TODO(brianklee): Remove when @kaftan makes this happen by default.
+  # TODO(brianklee): remove `autograph=True` when kwarg default is flipped.
+  model.call = tfe.function(model.call, autograph=True)
+  # Needs to have input_signature specified in order to be exported
+  # since model.predict() is never called before saved_model.export()
+  # TODO(brianklee): Update with input signature, depending on how the impl of
+  # saved_model.restore() pans out.
+  model.predict = tfe.function(model.predict, autograph=True)
+  # ,input_signature=(tensor_spec.TensorSpec(shape=[28, 28, None], dtype=tf.float32),) # pylint: disable=line-too-long
+  return model
+
+
+def mnist_datasets():
+  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+  # Numpy defaults to dtype=float64; TF defaults to float32. Stick with float32.
+  x_train, x_test = x_train / np.float32(255), x_test / np.float32(255)
+  y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)
+  train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+  test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+  return train_dataset, test_dataset
+
+
+def loss(logits, labels):
+  return tf.reduce_mean(
+      tf.nn.sparse_softmax_cross_entropy_with_logits(
+          logits=logits, labels=labels))
+
+
+def compute_accuracy(logits, labels):
+  predictions = tf.argmax(logits, axis=1, output_type=tf.int64)
+  labels = tf.cast(labels, tf.int64)
+  return tf.reduce_mean(
+      tf.cast(tf.equal(predictions, labels), dtype=tf.float32))
+
+
+# TODO(brianklee): Enable @tf.function on the training loop when zip, enumerate
+# are supported by autograph.
+def train(model, optimizer, dataset, step_counter, log_interval=None,
+          num_steps=None):
+  """Trains model on `dataset` using `optimizer`."""
+  start = time.time()
+  for (batch, (images, labels)) in enumerate(dataset):
+    if num_steps is not None and batch > num_steps:
+      break
+    with tf.contrib.summary.record_summaries_every_n_global_steps(
+        10, global_step=step_counter):
+      # Record the operations used to compute the loss given the input,
+      # so that the gradient of the loss with respect to the variables
+      # can be computed.
+      with tf.GradientTape() as tape:
+        logits = model(images, training=True)
+        loss_value = loss(logits, labels)
+        tf.contrib.summary.scalar('loss', loss_value)
+        tf.contrib.summary.scalar('accuracy', compute_accuracy(logits, labels))
+      grads = tape.gradient(loss_value, model.variables)
+      optimizer.apply_gradients(
+          zip(grads, model.variables), global_step=step_counter)
+      if log_interval and batch % log_interval == 0:
+        rate = log_interval / (time.time() - start)
+        print('Step #%d\tLoss: %.6f (%d steps/sec)' % (batch, loss_value, rate))
+        start = time.time()
+
+
+def test(model, dataset):
+  """Perform an evaluation of `model` on the examples from `dataset`."""
+  avg_loss = tfe.metrics.Mean('loss', dtype=tf.float32)
+  accuracy = tfe.metrics.Accuracy('accuracy', dtype=tf.float32)
+
+  for (images, labels) in dataset:
+    logits = model(images, training=False)
+    avg_loss(loss(logits, labels))
+    accuracy(
+        tf.argmax(logits, axis=1, output_type=tf.int64),
+        tf.cast(labels, tf.int64))
+  print('Test set: Average loss: %.4f, Accuracy: %4f%%\n' %
+        (avg_loss.result(), 100 * accuracy.result()))
+  with tf.contrib.summary.always_record_summaries():
+    tf.contrib.summary.scalar('loss', avg_loss.result())
+    tf.contrib.summary.scalar('accuracy', accuracy.result())
+
+
+def train_and_export(flags_obj):
+  """Run MNIST training and eval loop in eager mode.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+  """
+  # Load the datasets
+  train_ds, test_ds = mnist_datasets()
+  train_ds = train_ds.shuffle(60000).batch(flags_obj.batch_size)
+  test_ds = test_ds.batch(flags_obj.batch_size)
+
+  # Create the model and optimizer
+  model = create_model()
+  optimizer = tf.train.MomentumOptimizer(
+      flags_obj.learning_rate, flags_obj.momentum)
+
+  # See summaries with `tensorboard --logdir=<model_dir>`
+  train_dir = os.path.join(flags_obj.model_dir, 'summaries', 'train')
+  test_dir = os.path.join(flags_obj.model_dir, 'summaries', 'eval')
+  summary_writer = tf.contrib.summary.create_file_writer(
+      train_dir, flush_millis=10000)
+  test_summary_writer = tf.contrib.summary.create_file_writer(
+      test_dir, flush_millis=10000, name='test')
+
+  # Create and restore checkpoint (if one exists on the path)
+  checkpoint_dir = os.path.join(flags_obj.model_dir, 'checkpoints')
+  checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
+  step_counter = tf.train.get_or_create_global_step()
+  checkpoint = tf.train.Checkpoint(
+      model=model, optimizer=optimizer, step_counter=step_counter)
+  # Restore variables on creation if a checkpoint exists.
+  checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
+
+  # Train and evaluate for a set number of epochs.
+  for _ in range(flags_obj.train_epochs):
+    start = time.time()
+    with summary_writer.as_default():
+      train(model, optimizer, train_ds, step_counter,
+            flags_obj.log_interval, num_steps=1)
+    end = time.time()
+    print('\nTrain time for epoch #%d (%d total steps): %f' %
+          (checkpoint.save_counter.numpy() + 1,
+           step_counter.numpy(),
+           end - start))
+    with test_summary_writer.as_default():
+      test(model, test_ds)
+    checkpoint.save(checkpoint_prefix)
+
+  # TODO(brianklee): Enable this functionality after @allenl implements this.
+  # export_path = os.path.join(flags_obj.model_dir, 'export')
+  # tf.saved_model.save(export_path, model)
+
+
+def import_and_eval(flags_obj):
+  export_path = os.path.join(flags_obj.model_dir, 'export')
+  model = tf.saved_model.restore(export_path)
+  _, (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+  x_test = x_test / np.float32(255)
+  y_predict = model(x_test)
+  accuracy = compute_accuracy(y_predict, y_test)
+  print('Model accuracy: {:0.2f}%'.format(accuracy.numpy() * 100))
+
+
+def apply_clean(flags_obj):
+  if flags_obj.clean and tf.gfile.Exists(flags_obj.model_dir):
+    tf.logging.info('--clean flag set. Removing existing model dir: {}'.format(
+        flags_obj.model_dir))
+    tf.gfile.DeleteRecursively(flags_obj.model_dir)
+
+
+def main(_):
+  apply_clean(flags.FLAGS)
+  train_and_export(flags.FLAGS)
+  # TODO(brianklee): Enable this functionality after @allenl implements this.
+  # import_and_eval(flags.FLAGS)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/tensorflow/examples/tutorials/estimators/BUILD b/tensorflow/examples/tutorials/estimators/BUILD
deleted file mode 100644
index bab609f208b6ca3dd6daa8ecfd0c0c762ef87a22..0000000000000000000000000000000000000000
--- a/tensorflow/examples/tutorials/estimators/BUILD
+++ /dev/null
@@ -1,22 +0,0 @@
-# Example Estimator model
-
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "abalone",
-    srcs = [
-        "abalone.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/contrib/learn",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/examples/tutorials/estimators/abalone.py b/tensorflow/examples/tutorials/estimators/abalone.py
deleted file mode 100644
index 737b3ee5d6a5a71b3093fcd219c699eda228f903..0000000000000000000000000000000000000000
--- a/tensorflow/examples/tutorials/estimators/abalone.py
+++ /dev/null
@@ -1,185 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""DNNRegressor with custom estimator for abalone dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-import tempfile
-
-from six.moves import urllib
-
-import numpy as np
-import tensorflow as tf
-
-FLAGS = None
-
-tf.logging.set_verbosity(tf.logging.INFO)
-
-# Learning rate for the model
-LEARNING_RATE = 0.001
-
-
-def maybe_download(train_data, test_data, predict_data):
-  """Maybe downloads training data and returns train and test file names."""
-  if train_data:
-    train_file_name = train_data
-  else:
-    train_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "http://download.tensorflow.org/data/abalone_train.csv",
-        train_file.name)
-    train_file_name = train_file.name
-    train_file.close()
-    print("Training data is downloaded to %s" % train_file_name)
-
-  if test_data:
-    test_file_name = test_data
-  else:
-    test_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "http://download.tensorflow.org/data/abalone_test.csv", test_file.name)
-    test_file_name = test_file.name
-    test_file.close()
-    print("Test data is downloaded to %s" % test_file_name)
-
-  if predict_data:
-    predict_file_name = predict_data
-  else:
-    predict_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "http://download.tensorflow.org/data/abalone_predict.csv",
-        predict_file.name)
-    predict_file_name = predict_file.name
-    predict_file.close()
-    print("Prediction data is downloaded to %s" % predict_file_name)
-
-  return train_file_name, test_file_name, predict_file_name
-
-
-def model_fn(features, labels, mode, params):
-  """Model function for Estimator."""
-
-  # Connect the first hidden layer to input layer
-  # (features["x"]) with relu activation
-  first_hidden_layer = tf.layers.dense(features["x"], 10, activation=tf.nn.relu)
-
-  # Connect the second hidden layer to first hidden layer with relu
-  second_hidden_layer = tf.layers.dense(
-      first_hidden_layer, 10, activation=tf.nn.relu)
-
-  # Connect the output layer to second hidden layer (no activation fn)
-  output_layer = tf.layers.dense(second_hidden_layer, 1)
-
-  # Reshape output layer to 1-dim Tensor to return predictions
-  predictions = tf.reshape(output_layer, [-1])
-
-  # Provide an estimator spec for `ModeKeys.PREDICT`.
-  if mode == tf.estimator.ModeKeys.PREDICT:
-    return tf.estimator.EstimatorSpec(
-        mode=mode,
-        predictions={"ages": predictions})
-
-  # Calculate loss using mean squared error
-  loss = tf.losses.mean_squared_error(labels, predictions)
-
-  optimizer = tf.train.GradientDescentOptimizer(
-      learning_rate=params["learning_rate"])
-  train_op = optimizer.minimize(
-      loss=loss, global_step=tf.train.get_global_step())
-
-  # Calculate root mean squared error as additional eval metric
-  eval_metric_ops = {
-      "rmse": tf.metrics.root_mean_squared_error(
-          tf.cast(labels, tf.float64), predictions)
-  }
-
-  # Provide an estimator spec for `ModeKeys.EVAL` and `ModeKeys.TRAIN` modes.
-  return tf.estimator.EstimatorSpec(
-      mode=mode,
-      loss=loss,
-      train_op=train_op,
-      eval_metric_ops=eval_metric_ops)
-
-
-def main(unused_argv):
-  # Load datasets
-  abalone_train, abalone_test, abalone_predict = maybe_download(
-      FLAGS.train_data, FLAGS.test_data, FLAGS.predict_data)
-
-  # Training examples
-  training_set = tf.contrib.learn.datasets.base.load_csv_without_header(
-      filename=abalone_train, target_dtype=np.int, features_dtype=np.float64)
-
-  # Test examples
-  test_set = tf.contrib.learn.datasets.base.load_csv_without_header(
-      filename=abalone_test, target_dtype=np.int, features_dtype=np.float64)
-
-  # Set of 7 examples for which to predict abalone ages
-  prediction_set = tf.contrib.learn.datasets.base.load_csv_without_header(
-      filename=abalone_predict, target_dtype=np.int, features_dtype=np.float64)
-
-  # Set model params
-  model_params = {"learning_rate": LEARNING_RATE}
-
-  # Instantiate Estimator
-  nn = tf.estimator.Estimator(model_fn=model_fn, params=model_params)
-
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={"x": np.array(training_set.data)},
-      y=np.array(training_set.target),
-      num_epochs=None,
-      shuffle=True)
-
-  # Train
-  nn.train(input_fn=train_input_fn, steps=5000)
-
-  # Score accuracy
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={"x": np.array(test_set.data)},
-      y=np.array(test_set.target),
-      num_epochs=1,
-      shuffle=False)
-
-  ev = nn.evaluate(input_fn=test_input_fn)
-  print("Loss: %s" % ev["loss"])
-  print("Root Mean Squared Error: %s" % ev["rmse"])
-
-  # Print out predictions
-  predict_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={"x": prediction_set.data},
-      num_epochs=1,
-      shuffle=False)
-  predictions = nn.predict(input_fn=predict_input_fn)
-  for i, p in enumerate(predictions):
-    print("Prediction %s: %s" % (i + 1, p["ages"]))
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.register("type", "bool", lambda v: v.lower() == "true")
-  parser.add_argument(
-      "--train_data", type=str, default="", help="Path to the training data.")
-  parser.add_argument(
-      "--test_data", type=str, default="", help="Path to the test data.")
-  parser.add_argument(
-      "--predict_data",
-      type=str,
-      default="",
-      help="Path to the prediction data.")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/tutorials/input_fn/boston.py b/tensorflow/examples/tutorials/input_fn/boston.py
deleted file mode 100644
index 34f350e9acd3d9541fe24c235c6f2cb5c8170c35..0000000000000000000000000000000000000000
--- a/tensorflow/examples/tutorials/input_fn/boston.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""DNNRegressor with custom input_fn for Housing dataset."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import itertools
-
-import pandas as pd
-import tensorflow as tf
-
-tf.logging.set_verbosity(tf.logging.INFO)
-
-COLUMNS = ["crim", "zn", "indus", "nox", "rm", "age",
-           "dis", "tax", "ptratio", "medv"]
-FEATURES = ["crim", "zn", "indus", "nox", "rm",
-            "age", "dis", "tax", "ptratio"]
-LABEL = "medv"
-
-
-def get_input_fn(data_set, num_epochs=None, shuffle=True):
-  return tf.estimator.inputs.pandas_input_fn(
-      x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
-      y=pd.Series(data_set[LABEL].values),
-      num_epochs=num_epochs,
-      shuffle=shuffle)
-
-
-def main(unused_argv):
-  # Load datasets
-  training_set = pd.read_csv("boston_train.csv", skipinitialspace=True,
-                             skiprows=1, names=COLUMNS)
-  test_set = pd.read_csv("boston_test.csv", skipinitialspace=True,
-                         skiprows=1, names=COLUMNS)
-
-  # Set of 6 examples for which to predict median house values
-  prediction_set = pd.read_csv("boston_predict.csv", skipinitialspace=True,
-                               skiprows=1, names=COLUMNS)
-
-  # Feature cols
-  feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
-
-  # Build 2 layer fully connected DNN with 10, 10 units respectively.
-  regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols,
-                                        hidden_units=[10, 10],
-                                        model_dir="/tmp/boston_model")
-
-  # Train
-  regressor.train(input_fn=get_input_fn(training_set), steps=5000)
-
-  # Evaluate loss over one epoch of test_set.
-  ev = regressor.evaluate(
-      input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
-  loss_score = ev["loss"]
-  print("Loss: {0:f}".format(loss_score))
-
-  # Print out predictions over a slice of prediction_set.
-  y = regressor.predict(
-      input_fn=get_input_fn(prediction_set, num_epochs=1, shuffle=False))
-  # .predict() returns an iterator of dicts; convert to a list and print
-  # predictions
-  predictions = list(p["predictions"] for p in itertools.islice(y, 6))
-  print("Predictions: {}".format(str(predictions)))
-
-if __name__ == "__main__":
-  tf.app.run()
diff --git a/tensorflow/examples/tutorials/layers/cnn_mnist.py b/tensorflow/examples/tutorials/layers/cnn_mnist.py
index 1e8d7d05e1c6af08d788857e74c04134333d019c..670e929236f26363fb1682e8e9576543cc27fb38 100644
--- a/tensorflow/examples/tutorials/layers/cnn_mnist.py
+++ b/tensorflow/examples/tutorials/layers/cnn_mnist.py
@@ -134,7 +134,7 @@ def main(unused_argv):
       tensors=tensors_to_log, every_n_iter=50)
 
   # Train the model
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": train_data},
       y=train_labels,
       batch_size=100,
@@ -146,11 +146,8 @@ def main(unused_argv):
       hooks=[logging_hook])
 
   # Evaluate the model and print results
-  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={"x": eval_data},
-      y=eval_labels,
-      num_epochs=1,
-      shuffle=False)
+  eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
+      x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False)
   eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
   print(eval_results)
 
diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD
index 99da44d6d52c64ab79c3e9563dc981e127bba4cf..5f12374bdbd7f48d9c6bd858abf6c5ef15afddf1 100644
--- a/tensorflow/examples/tutorials/mnist/BUILD
+++ b/tensorflow/examples/tutorials/mnist/BUILD
@@ -71,18 +71,12 @@ py_binary(
     ],
 )
 
-py_binary(
-    name = "mnist_softmax",
-    srcs = [
-        "mnist_softmax.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":input_data",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
+# Note: We need to set the evironment variable to use CPU JIT.
+# The way to achieve this is via setting the following:
+# TF_XLA_FLAGS='--tf_xla_cpu_global_jit=true'
+# before the run command. To use XLA, we also must build
+# with --define=with_xla_support=true flag.
+# Note (GPU): Add --config=cuda to the build command.
 py_binary(
     name = "mnist_softmax_xla",
     srcs = [
@@ -95,18 +89,6 @@ py_binary(
     ],
 )
 
-py_binary(
-    name = "mnist_deep",
-    srcs = [
-        "mnist_deep.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":input_data",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
 py_test(
     name = "fully_connected_feed_test",
     size = "medium",
diff --git a/tensorflow/examples/tutorials/mnist/mnist_deep.py b/tensorflow/examples/tutorials/mnist/mnist_deep.py
deleted file mode 100644
index 5d8d8d84fe26c0a3ec69791885f3c7ce5e0fba15..0000000000000000000000000000000000000000
--- a/tensorflow/examples/tutorials/mnist/mnist_deep.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""A deep MNIST classifier using convolutional layers.
-
-See extensive documentation at
-https://www.tensorflow.org/get_started/mnist/pros
-"""
-# Disable linter warnings to maintain consistency with tutorial.
-# pylint: disable=invalid-name
-# pylint: disable=g-bad-import-order
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-import tempfile
-
-from tensorflow.examples.tutorials.mnist import input_data
-
-import tensorflow as tf
-
-import numpy
-
-FLAGS = None
-
-
-def deepnn(x):
-  """deepnn builds the graph for a deep net for classifying digits.
-
-  Args:
-    x: an input tensor with the dimensions (N_examples, 784), where 784 is the
-    number of pixels in a standard MNIST image.
-
-  Returns:
-    A tuple (y, keep_prob). y is a tensor of shape (N_examples, 10), with values
-    equal to the logits of classifying the digit into one of 10 classes (the
-    digits 0-9). keep_prob is a scalar placeholder for the probability of
-    dropout.
-  """
-  # Reshape to use within a convolutional neural net.
-  # Last dimension is for "features" - there is only one here, since images are
-  # grayscale -- it would be 3 for an RGB image, 4 for RGBA, etc.
-  with tf.name_scope('reshape'):
-    x_image = tf.reshape(x, [-1, 28, 28, 1])
-
-  # First convolutional layer - maps one grayscale image to 32 feature maps.
-  with tf.name_scope('conv1'):
-    W_conv1 = weight_variable([5, 5, 1, 32])
-    b_conv1 = bias_variable([32])
-    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
-
-  # Pooling layer - downsamples by 2X.
-  with tf.name_scope('pool1'):
-    h_pool1 = max_pool_2x2(h_conv1)
-
-  # Second convolutional layer -- maps 32 feature maps to 64.
-  with tf.name_scope('conv2'):
-    W_conv2 = weight_variable([5, 5, 32, 64])
-    b_conv2 = bias_variable([64])
-    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
-
-  # Second pooling layer.
-  with tf.name_scope('pool2'):
-    h_pool2 = max_pool_2x2(h_conv2)
-
-  # Fully connected layer 1 -- after 2 round of downsampling, our 28x28 image
-  # is down to 7x7x64 feature maps -- maps this to 1024 features.
-  with tf.name_scope('fc1'):
-    W_fc1 = weight_variable([7 * 7 * 64, 1024])
-    b_fc1 = bias_variable([1024])
-
-    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
-    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
-
-  # Dropout - controls the complexity of the model, prevents co-adaptation of
-  # features.
-  with tf.name_scope('dropout'):
-    keep_prob = tf.placeholder(tf.float32)
-    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
-
-  # Map the 1024 features to 10 classes, one for each digit
-  with tf.name_scope('fc2'):
-    W_fc2 = weight_variable([1024, 10])
-    b_fc2 = bias_variable([10])
-
-    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
-  return y_conv, keep_prob
-
-
-def conv2d(x, W):
-  """conv2d returns a 2d convolution layer with full stride."""
-  return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
-
-
-def max_pool_2x2(x):
-  """max_pool_2x2 downsamples a feature map by 2X."""
-  return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
-                        strides=[1, 2, 2, 1], padding='SAME')
-
-
-def weight_variable(shape):
-  """weight_variable generates a weight variable of a given shape."""
-  initial = tf.truncated_normal(shape, stddev=0.1)
-  return tf.Variable(initial)
-
-
-def bias_variable(shape):
-  """bias_variable generates a bias variable of a given shape."""
-  initial = tf.constant(0.1, shape=shape)
-  return tf.Variable(initial)
-
-
-def main(_):
-  # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir)
-
-  # Create the model
-  x = tf.placeholder(tf.float32, [None, 784])
-
-  # Define loss and optimizer
-  y_ = tf.placeholder(tf.int64, [None])
-
-  # Build the graph for the deep net
-  y_conv, keep_prob = deepnn(x)
-
-  with tf.name_scope('loss'):
-    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
-        labels=y_, logits=y_conv)
-  cross_entropy = tf.reduce_mean(cross_entropy)
-
-  with tf.name_scope('adam_optimizer'):
-    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
-
-  with tf.name_scope('accuracy'):
-    correct_prediction = tf.equal(tf.argmax(y_conv, 1), y_)
-    correct_prediction = tf.cast(correct_prediction, tf.float32)
-  accuracy = tf.reduce_mean(correct_prediction)
-
-  graph_location = tempfile.mkdtemp()
-  print('Saving graph to: %s' % graph_location)
-  train_writer = tf.summary.FileWriter(graph_location)
-  train_writer.add_graph(tf.get_default_graph())
-
-  with tf.Session() as sess:
-    sess.run(tf.global_variables_initializer())
-    for i in range(20000):
-      batch = mnist.train.next_batch(50)
-      if i % 100 == 0:
-        train_accuracy = accuracy.eval(feed_dict={
-            x: batch[0], y_: batch[1], keep_prob: 1.0})
-        print('step %d, training accuracy %g' % (i, train_accuracy))
-      train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
-
-    # compute in batches to avoid OOM on GPUs 
-    accuracy_l = []
-    for _ in range(20):
-      batch = mnist.test.next_batch(500, shuffle=False)
-      accuracy_l.append(accuracy.eval(feed_dict={x: batch[0], 
-                                                 y_: batch[1], 
-                                                 keep_prob: 1.0}))
-    print('test accuracy %g' % numpy.mean(accuracy_l))
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument('--data_dir', type=str,
-                      default='/tmp/tensorflow/mnist/input_data',
-                      help='Directory for storing input data')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
deleted file mode 100644
index 47dd6a1947811765101529826c2b24d9798fef1f..0000000000000000000000000000000000000000
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""A very simple MNIST classifier.
-
-See extensive documentation at
-https://www.tensorflow.org/get_started/mnist/beginners
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-from tensorflow.examples.tutorials.mnist import input_data
-
-import tensorflow as tf
-
-FLAGS = None
-
-
-def main(_):
-  # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir)
-
-  # Create the model
-  x = tf.placeholder(tf.float32, [None, 784])
-  W = tf.Variable(tf.zeros([784, 10]))
-  b = tf.Variable(tf.zeros([10]))
-  y = tf.matmul(x, W) + b
-
-  # Define loss and optimizer
-  y_ = tf.placeholder(tf.int64, [None])
-
-  # The raw formulation of cross-entropy,
-  #
-  #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
-  #                                 reduction_indices=[1]))
-  #
-  # can be numerically unstable.
-  #
-  # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
-  # outputs of 'y', and then average across the batch.
-  cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
-  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
-
-  sess = tf.InteractiveSession()
-  tf.global_variables_initializer().run()
-  # Train
-  for _ in range(1000):
-    batch_xs, batch_ys = mnist.train.next_batch(100)
-    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
-
-  # Test trained model
-  correct_prediction = tf.equal(tf.argmax(y, 1), y_)
-  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-  print(sess.run(
-      accuracy, feed_dict={
-          x: mnist.test.images,
-          y_: mnist.test.labels
-      }))
-
-
-if __name__ == '__main__':
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      '--data_dir',
-      type=str,
-      default='/tmp/tensorflow/mnist/input_data',
-      help='Directory for storing input data')
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py b/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
index e89317494f9b7171a93b2706d9d612d456ddf937..a9cb20fdfd3a376d021607c05f1fbb6aebce869d 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Simple MNIST classifier example with JIT XLA and timelines.
 
+  Note: Please see further comments in the BUILD file to invoke XLA.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -79,7 +80,7 @@ def main(_):
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)
       trace = timeline.Timeline(step_stats=run_metadata.step_stats)
-      with open('timeline.ctf.json', 'w') as trace_file:
+      with open('/tmp/timeline.ctf.json', 'w') as trace_file:
         trace_file.write(trace.generate_chrome_trace_format())
     else:
       sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index 7967e22d6a0319a530cb2f00e54872f022ac0095..1854e84d490d6c2ff462ee3bc3cc57b48c4d9328 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -183,7 +183,8 @@ def main(_):
   if tf.gfile.Exists(FLAGS.log_dir):
     tf.gfile.DeleteRecursively(FLAGS.log_dir)
   tf.gfile.MakeDirs(FLAGS.log_dir)
-  train()
+  with tf.Graph().as_default():
+    train()
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/tutorials/monitors/BUILD b/tensorflow/examples/tutorials/monitors/BUILD
deleted file mode 100644
index 1c49e3fe5390ad48a3dea7cd5688996270b1dc9d..0000000000000000000000000000000000000000
--- a/tensorflow/examples/tutorials/monitors/BUILD
+++ /dev/null
@@ -1,25 +0,0 @@
-# Example Estimator model
-
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "iris_monitors",
-    srcs = [
-        "iris_monitors.py",
-    ],
-    data = [
-        "iris_test.csv",
-        "iris_training.csv",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/examples/tutorials/monitors/iris_monitors.py b/tensorflow/examples/tutorials/monitors/iris_monitors.py
deleted file mode 100644
index a2b7fe60237da0604f74f31c0a09951f708e908b..0000000000000000000000000000000000000000
--- a/tensorflow/examples/tutorials/monitors/iris_monitors.py
+++ /dev/null
@@ -1,92 +0,0 @@
-#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-"""Model training for Iris data set using Validation Monitor."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import numpy as np
-import tensorflow as tf
-
-tf.logging.set_verbosity(tf.logging.INFO)
-
-# Data sets
-IRIS_TRAINING = os.path.join(os.path.dirname(__file__), "iris_training.csv")
-IRIS_TEST = os.path.join(os.path.dirname(__file__), "iris_test.csv")
-
-
-def main(unused_argv):
-  # Load datasets.
-  training_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TRAINING, target_dtype=np.int, features_dtype=np.float32)
-  test_set = tf.contrib.learn.datasets.base.load_csv_with_header(
-      filename=IRIS_TEST, target_dtype=np.int, features_dtype=np.float32)
-
-  validation_metrics = {
-      "accuracy":
-          tf.contrib.learn.MetricSpec(
-              metric_fn=tf.contrib.metrics.streaming_accuracy,
-              prediction_key="classes"),
-      "precision":
-          tf.contrib.learn.MetricSpec(
-              metric_fn=tf.contrib.metrics.streaming_precision,
-              prediction_key="classes"),
-      "recall":
-          tf.contrib.learn.MetricSpec(
-              metric_fn=tf.contrib.metrics.streaming_recall,
-              prediction_key="classes")
-  }
-  validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
-      test_set.data,
-      test_set.target,
-      every_n_steps=50,
-      metrics=validation_metrics,
-      early_stopping_metric="loss",
-      early_stopping_metric_minimize=True,
-      early_stopping_rounds=200)
-
-  # Specify that all features have real-value data
-  feature_columns = [tf.contrib.layers.real_valued_column("", dimension=4)]
-
-  # Build 3 layer DNN with 10, 20, 10 units respectively.
-  classifier = tf.contrib.learn.DNNClassifier(
-      feature_columns=feature_columns,
-      hidden_units=[10, 20, 10],
-      n_classes=3,
-      model_dir="/tmp/iris_model",
-      config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1))
-
-  # Fit model.
-  classifier.fit(x=training_set.data,
-                 y=training_set.target,
-                 steps=2000,
-                 monitors=[validation_monitor])
-
-  # Evaluate accuracy.
-  accuracy_score = classifier.evaluate(
-      x=test_set.data, y=test_set.target)["accuracy"]
-  print("Accuracy: {0:f}".format(accuracy_score))
-
-  # Classify two new flower samples.
-  new_samples = np.array(
-      [[6.4, 3.2, 4.5, 1.5], [5.8, 3.1, 5.0, 1.7]], dtype=np.float32)
-  y = list(classifier.predict(new_samples))
-  print("Predictions: {}".format(str(y)))
-
-
-if __name__ == "__main__":
-  tf.app.run()
diff --git a/tensorflow/examples/tutorials/monitors/iris_test.csv b/tensorflow/examples/tutorials/monitors/iris_test.csv
deleted file mode 100644
index 5929d91f52e6b7f1ad0adc0f50c2b8f189b94493..0000000000000000000000000000000000000000
--- a/tensorflow/examples/tutorials/monitors/iris_test.csv
+++ /dev/null
@@ -1,31 +0,0 @@
-30,4,setosa,versicolor,virginica
-5.9,3.0,4.2,1.5,1
-6.9,3.1,5.4,2.1,2
-5.1,3.3,1.7,0.5,0
-6.0,3.4,4.5,1.6,1
-5.5,2.5,4.0,1.3,1
-6.2,2.9,4.3,1.3,1
-5.5,4.2,1.4,0.2,0
-6.3,2.8,5.1,1.5,2
-5.6,3.0,4.1,1.3,1
-6.7,2.5,5.8,1.8,2
-7.1,3.0,5.9,2.1,2
-4.3,3.0,1.1,0.1,0
-5.6,2.8,4.9,2.0,2
-5.5,2.3,4.0,1.3,1
-6.0,2.2,4.0,1.0,1
-5.1,3.5,1.4,0.2,0
-5.7,2.6,3.5,1.0,1
-4.8,3.4,1.9,0.2,0
-5.1,3.4,1.5,0.2,0
-5.7,2.5,5.0,2.0,2
-5.4,3.4,1.7,0.2,0
-5.6,3.0,4.5,1.5,1
-6.3,2.9,5.6,1.8,2
-6.3,2.5,4.9,1.5,1
-5.8,2.7,3.9,1.2,1
-6.1,3.0,4.6,1.4,1
-5.2,4.1,1.5,0.1,0
-6.7,3.1,4.7,1.5,1
-6.7,3.3,5.7,2.5,2
-6.4,2.9,4.3,1.3,1
diff --git a/tensorflow/examples/tutorials/monitors/iris_training.csv b/tensorflow/examples/tutorials/monitors/iris_training.csv
deleted file mode 100644
index f5ae1054a16eebeb9cbaea1787f81bd4059d9457..0000000000000000000000000000000000000000
--- a/tensorflow/examples/tutorials/monitors/iris_training.csv
+++ /dev/null
@@ -1,121 +0,0 @@
-120,4,setosa,versicolor,virginica
-6.4,2.8,5.6,2.2,2
-5.0,2.3,3.3,1.0,1
-4.9,2.5,4.5,1.7,2
-4.9,3.1,1.5,0.1,0
-5.7,3.8,1.7,0.3,0
-4.4,3.2,1.3,0.2,0
-5.4,3.4,1.5,0.4,0
-6.9,3.1,5.1,2.3,2
-6.7,3.1,4.4,1.4,1
-5.1,3.7,1.5,0.4,0
-5.2,2.7,3.9,1.4,1
-6.9,3.1,4.9,1.5,1
-5.8,4.0,1.2,0.2,0
-5.4,3.9,1.7,0.4,0
-7.7,3.8,6.7,2.2,2
-6.3,3.3,4.7,1.6,1
-6.8,3.2,5.9,2.3,2
-7.6,3.0,6.6,2.1,2
-6.4,3.2,5.3,2.3,2
-5.7,4.4,1.5,0.4,0
-6.7,3.3,5.7,2.1,2
-6.4,2.8,5.6,2.1,2
-5.4,3.9,1.3,0.4,0
-6.1,2.6,5.6,1.4,2
-7.2,3.0,5.8,1.6,2
-5.2,3.5,1.5,0.2,0
-5.8,2.6,4.0,1.2,1
-5.9,3.0,5.1,1.8,2
-5.4,3.0,4.5,1.5,1
-6.7,3.0,5.0,1.7,1
-6.3,2.3,4.4,1.3,1
-5.1,2.5,3.0,1.1,1
-6.4,3.2,4.5,1.5,1
-6.8,3.0,5.5,2.1,2
-6.2,2.8,4.8,1.8,2
-6.9,3.2,5.7,2.3,2
-6.5,3.2,5.1,2.0,2
-5.8,2.8,5.1,2.4,2
-5.1,3.8,1.5,0.3,0
-4.8,3.0,1.4,0.3,0
-7.9,3.8,6.4,2.0,2
-5.8,2.7,5.1,1.9,2
-6.7,3.0,5.2,2.3,2
-5.1,3.8,1.9,0.4,0
-4.7,3.2,1.6,0.2,0
-6.0,2.2,5.0,1.5,2
-4.8,3.4,1.6,0.2,0
-7.7,2.6,6.9,2.3,2
-4.6,3.6,1.0,0.2,0
-7.2,3.2,6.0,1.8,2
-5.0,3.3,1.4,0.2,0
-6.6,3.0,4.4,1.4,1
-6.1,2.8,4.0,1.3,1
-5.0,3.2,1.2,0.2,0
-7.0,3.2,4.7,1.4,1
-6.0,3.0,4.8,1.8,2
-7.4,2.8,6.1,1.9,2
-5.8,2.7,5.1,1.9,2
-6.2,3.4,5.4,2.3,2
-5.0,2.0,3.5,1.0,1
-5.6,2.5,3.9,1.1,1
-6.7,3.1,5.6,2.4,2
-6.3,2.5,5.0,1.9,2
-6.4,3.1,5.5,1.8,2
-6.2,2.2,4.5,1.5,1
-7.3,2.9,6.3,1.8,2
-4.4,3.0,1.3,0.2,0
-7.2,3.6,6.1,2.5,2
-6.5,3.0,5.5,1.8,2
-5.0,3.4,1.5,0.2,0
-4.7,3.2,1.3,0.2,0
-6.6,2.9,4.6,1.3,1
-5.5,3.5,1.3,0.2,0
-7.7,3.0,6.1,2.3,2
-6.1,3.0,4.9,1.8,2
-4.9,3.1,1.5,0.1,0
-5.5,2.4,3.8,1.1,1
-5.7,2.9,4.2,1.3,1
-6.0,2.9,4.5,1.5,1
-6.4,2.7,5.3,1.9,2
-5.4,3.7,1.5,0.2,0
-6.1,2.9,4.7,1.4,1
-6.5,2.8,4.6,1.5,1
-5.6,2.7,4.2,1.3,1
-6.3,3.4,5.6,2.4,2
-4.9,3.1,1.5,0.1,0
-6.8,2.8,4.8,1.4,1
-5.7,2.8,4.5,1.3,1
-6.0,2.7,5.1,1.6,1
-5.0,3.5,1.3,0.3,0
-6.5,3.0,5.2,2.0,2
-6.1,2.8,4.7,1.2,1
-5.1,3.5,1.4,0.3,0
-4.6,3.1,1.5,0.2,0
-6.5,3.0,5.8,2.2,2
-4.6,3.4,1.4,0.3,0
-4.6,3.2,1.4,0.2,0
-7.7,2.8,6.7,2.0,2
-5.9,3.2,4.8,1.8,1
-5.1,3.8,1.6,0.2,0
-4.9,3.0,1.4,0.2,0
-4.9,2.4,3.3,1.0,1
-4.5,2.3,1.3,0.3,0
-5.8,2.7,4.1,1.0,1
-5.0,3.4,1.6,0.4,0
-5.2,3.4,1.4,0.2,0
-5.3,3.7,1.5,0.2,0
-5.0,3.6,1.4,0.2,0
-5.6,2.9,3.6,1.3,1
-4.8,3.1,1.6,0.2,0
-6.3,2.7,4.9,1.8,2
-5.7,2.8,4.1,1.3,1
-5.0,3.0,1.6,0.2,0
-6.3,3.3,6.0,2.5,2
-5.0,3.5,1.6,0.6,0
-5.5,2.6,4.4,1.2,1
-5.7,3.0,4.2,1.2,1
-4.4,2.9,1.4,0.2,0
-4.8,3.0,1.4,0.1,0
-5.5,2.4,3.7,1.0,1
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index f80c56d1c181edcb26c93c01bf9ba4e486c6d146..c8ab24871c4168eb69363a2cc99492e542ca5bec 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -3,6 +3,12 @@ Assignments for Udacity Deep Learning class with TensorFlow
 
 Course information can be found at https://www.udacity.com/course/deep-learning--ud730
 
+## Getting Started with Docker
+
+If you are new to Docker, follow
+[Docker document](https://docs.docker.com/machine/get-started/) to start a
+docker instance. Kindly read the requirements of Windows and Mac carefully.
+
 Running the Docker container from the Google Cloud repository
 -------------------------------------------------------------
 
@@ -17,11 +23,20 @@ Accessing the Notebooks
 
 On linux, go to: http://127.0.0.1:8888
 
-On mac, find the virtual machine's IP using:
+On mac, go to terminal and find the virtual machine's IP using:
+
+    docker-machine ip default
+
+Then go to: http://(ip address received from the above command):8888 (likely
+http://192.168.99.100:8888)
+
+On Windows, use powershell to find the virtual machine's IP using:
 
     docker-machine ip default
+    
 
-Then go to: http://IP:8888 (likely http://192.168.99.100:8888)
+Then go to: http://(ip address received from the above command):8888 (likely
+http://192.168.99.100:8888)
 
 FAQ
 ---
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
index 4a429837b7b997f0f6571060280a9a15543b9f54..464484dab830e73fbc11cc9a2bfd9310bac88653 100644
--- a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 using tensorflow::DT_FLOAT;
 using tensorflow::DT_UINT8;
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index 3989f9b25a4f5f47dd235ba55da9a20dae5e7ff4..31bba1ffbfae1d6ae2ae2b106b262486ff3b56a7 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -23,8 +23,8 @@ from source.
 
 -   [bazel](https://www.bazel.build/versions/master/docs/install.html)
 -   Environment to build TensorFlow from source code
-    ([Linux of macOS](https://www.tensorflow.org/install/source)).
-    If you don't need GPU support, then try the following:
+    ([Linux or macOS](https://www.tensorflow.org/install/source)). If you don't
+    need GPU support, then try the following:
 
     ```sh
     sudo apt-get install python swig python-numpy # Linux
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index 32a77550ee2fa5606b402600aa6429950d8e72a5..6ff41ca916930f52f660a08e0089dc9f7f1a8e24 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -53,6 +53,17 @@ type Graph struct {
 	c *C.TF_Graph
 }
 
+// Graph execution options
+type GraphImportOptions struct {
+	// Node prefix
+	Prefix string
+
+	// Execution device
+	Device string
+
+	// TODO: extend this structure to support more options from TF_ImportGraphDefOptions
+}
+
 // NewGraph returns a new Graph.
 func NewGraph() *Graph {
 	g := &Graph{C.TF_NewGraph()}
@@ -88,18 +99,32 @@ func (g *Graph) WriteTo(w io.Writer) (int64, error) {
 	return int64(n), err
 }
 
-// Import imports the nodes and edges from a serialized representation of
+// ImportWithOptions imports the nodes and edges from a serialized representation of
 // another Graph into g.
 //
-// Names of imported nodes will be prefixed with prefix.
-func (g *Graph) Import(def []byte, prefix string) error {
-	cprefix := C.CString(prefix)
+// Multiple options can be specified for the newly imported nodes.
+func (g *Graph) ImportWithOptions(def []byte, options GraphImportOptions) error {
+	cprefix := C.CString(options.Prefix)
 	defer C.free(unsafe.Pointer(cprefix))
 
 	opts := C.TF_NewImportGraphDefOptions()
 	defer C.TF_DeleteImportGraphDefOptions(opts)
 	C.TF_ImportGraphDefOptionsSetPrefix(opts, cprefix)
 
+	if len(options.Device) != 0 {
+		// TODO(ashankar): Remove this error and uncomment below
+		// when a release of the C library which includes
+		// https://github.com/tensorflow/tensorflow/commit/e0af5ac53e5a8ad9b07cdd5738c0a8e12f938c4e
+		// has been made.
+		// See https://github.com/tensorflow/tensorflow/issues/23257
+		return fmt.Errorf("GraphImportOptions.Device is only supported with the TensorFlow C library versions after 1.12 (or built from master). See https://github.com/tensorflow/tensorflow/issues/23257")
+		/*
+			cdev := C.CString(options.Device)
+			defer C.free(unsafe.Pointer(cdev))
+			C.TF_ImportGraphDefOptionsSetDefaultDevice(opts, cdev)
+		*/
+	}
+
 	buf := C.TF_NewBuffer()
 	defer C.TF_DeleteBuffer(buf)
 	// Would have preferred to use C.CBytes, but that does not play well
@@ -114,13 +139,23 @@ func (g *Graph) Import(def []byte, prefix string) error {
 	C.memcpy(buf.data, unsafe.Pointer(&def[0]), buf.length)
 
 	status := newStatus()
+
 	C.TF_GraphImportGraphDef(g.c, buf, opts, status.c)
 	if err := status.Err(); err != nil {
 		return err
 	}
+
 	return nil
 }
 
+// Import imports the nodes and edges from a serialized representation of
+// another Graph into g.
+//
+// Names of imported nodes will be prefixed with prefix.
+func (g *Graph) Import(def []byte, prefix string) error {
+	return g.ImportWithOptions(def, GraphImportOptions{Prefix: prefix})
+}
+
 // Operation returns the Operation named name in the Graph, or nil if no such
 // operation is present.
 func (g *Graph) Operation(name string) *Operation {
@@ -147,6 +182,68 @@ func (g *Graph) Operations() []Operation {
 	return ops
 }
 
+// AddGradients adds operations to compute the partial derivatives of the sum of tensors in y
+// with respect to tensors in x, i.e., d(y[0] + y[1] + ...) / d x[0], d(y[0] + y[1] + ... ) / d x[1] etc.
+//
+// prefix, if non-empty, is the name prefix used for all operations added to the graph to compute
+// these gradients.
+func (g *Graph) AddGradients(prefix string, y []Output, x []Output, dx []Output) ([]Output, error) {
+	var (
+		cprefix *C.char
+
+		cy  = make([]C.TF_Output, len(y))
+		cx  = make([]C.TF_Output, len(x))
+		cdx = make([]C.TF_Output, len(dx))
+		cdy = make([]C.TF_Output, len(x))
+
+		pcy  *C.TF_Output
+		pcx  *C.TF_Output
+		pcdx *C.TF_Output
+		pcdy *C.TF_Output
+
+		status = newStatus()
+	)
+
+	if len(y) > 0 {
+		pcy = &cy[0]
+		for i, o := range y {
+			cy[i] = o.c()
+		}
+	}
+	if len(x) > 0 {
+		pcx = &cx[0]
+		for i, o := range x {
+			cx[i] = o.c()
+		}
+		pcdy = &cdy[0]
+	}
+	if len(dx) > 0 {
+		pcdx = &cdx[0]
+		for i, o := range dx {
+			cdx[i] = o.c()
+		}
+	}
+
+	// If prefix is "", the C.TF_AddGradientsWithPrefix need cprefix to be nil but not ""
+	if len(prefix) != 0 {
+		cprefix = C.CString(prefix)
+		defer C.free(unsafe.Pointer(cprefix))
+	}
+
+	C.TF_AddGradientsWithPrefix(g.c, cprefix, pcy, C.int(len(y)), pcx, C.int(len(x)), pcdx, status.c, pcdy)
+
+	if err := status.Err(); err != nil {
+		return nil, err
+	}
+	dy := make([]Output, len(x))
+	for i, co := range cdy {
+		op := &Operation{co.oper, g}
+		dy[i] = Output{op, int(co.index)}
+	}
+
+	return dy, nil
+}
+
 // OpSpec is the specification of an Operation to be added to a Graph
 // (using Graph.AddOperation).
 type OpSpec struct {
diff --git a/tensorflow/go/graph_test.go b/tensorflow/go/graph_test.go
index b8d65c54f697153ad236f5e27d9f27d048c3a22e..067c7db5c3cd9c880e6f257b199c0742178d29fd 100644
--- a/tensorflow/go/graph_test.go
+++ b/tensorflow/go/graph_test.go
@@ -19,6 +19,7 @@ package tensorflow
 import (
 	"bytes"
 	"fmt"
+	"strings"
 	"testing"
 )
 
@@ -80,3 +81,260 @@ func TestGraphWriteToAndImport(t *testing.T) {
 		t.Error(err)
 	}
 }
+
+func TestGraphAddGradients(t *testing.T) {
+	g := NewGraph()
+	x1, err := Placeholder(g, "x1", Float)
+	if err != nil {
+		t.Fatal(err)
+	}
+	x2, err := Placeholder(g, "x2", Float)
+	if err != nil {
+		t.Fatal(err)
+	}
+	op0, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y0",
+		Input: []Input{x1},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y0 := op0.Output(0)
+	op1, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y1",
+		Input: []Input{y0},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y1 := op1.Output(0)
+	op2, err := g.AddOperation(OpSpec{
+		Type:  "AddN",
+		Input: []Input{OutputList([]Output{y0, x2})},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y2 := op2.Output(0)
+
+	grads0, err := g.AddGradients("", []Output{y1}, []Output{x1}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grads0) != 1 {
+		t.Fatal(len(grads0))
+	}
+	if grads0[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads0[0].DataType(), Float)
+	}
+
+	grads1, err := g.AddGradients("", []Output{y2}, []Output{x1, x2}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grads1) != 2 {
+		t.Fatal(len(grads1))
+	}
+	if grads1[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[0].DataType(), Float)
+	}
+	if grads1[1].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[1].DataType(), Float)
+	}
+
+	sess, err := NewSession(g, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c1, _ := NewTensor(float32(3.0))
+	c2, _ := NewTensor(float32(2.0))
+	outputs, err := sess.Run(
+		map[Output]*Tensor{x1: c1, x2: c2},
+		[]Output{grads0[0], grads1[0], grads1[1]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(outputs) != 3 {
+		t.Fatal(len(outputs))
+	}
+	if outputs[0].Value().(float32) != 108.0 {
+		t.Fatalf("Got %v, wanted float 108.0", outputs[0].Value())
+	}
+	if outputs[1].Value().(float32) != 6.0 {
+		t.Fatalf("Got %v, wanted float 6.0", outputs[1].Value())
+	}
+	if outputs[2].Value().(float32) != 1.0 {
+		t.Fatalf("Got %v, wanted float 1.0", outputs[2].Value())
+	}
+}
+
+func TestGraphAddGradientsSums(t *testing.T) {
+	g := NewGraph()
+	x, err := Placeholder(g, "x", Float)
+	if err != nil {
+		t.Fatal(err)
+	}
+	op0, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y0",
+		Input: []Input{x},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y0 := op0.Output(0)
+	op1, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y1",
+		Input: []Input{y0},
+	})
+	y1 := op1.Output(0)
+
+	grad, err := g.AddGradients("", []Output{y0, y1}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grad) != 1 {
+		t.Fatal(len(grad))
+	}
+	if grad[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grad[0].DataType(), Float)
+	}
+
+	sess, err := NewSession(g, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, _ := NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[Output]*Tensor{x: c},
+		[]Output{grad[0]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if outputs[0].Value().(float32) != 114.0 {
+		t.Fatalf("Got %v, wanted float 114.0", outputs[0].Value())
+	}
+}
+
+func TestGraphAddGradientsWithInitialValues(t *testing.T) {
+	g := NewGraph()
+	x, err := Placeholder(g, "x", Float)
+	op0, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y0",
+		Input: []Input{x},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y0 := op0.Output(0)
+	op1, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y1",
+		Input: []Input{y0},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y1 := op1.Output(0)
+
+	grads0, err := g.AddGradients("", []Output{y1}, []Output{y0}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grads0) != 1 {
+		t.Fatal(len(grads0))
+	}
+	if grads0[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads0[0].DataType(), Float)
+	}
+
+	grads1, err := g.AddGradients("", []Output{y0}, []Output{x}, []Output{grads0[0]})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grads1) != 1 {
+		t.Fatal(len(grads1))
+	}
+	if grads1[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[0].DataType(), Float)
+	}
+
+	sess, err := NewSession(g, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, _ := NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[Output]*Tensor{x: c},
+		[]Output{grads1[0]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if outputs[0].Value().(float32) != 108.0 {
+		t.Fatalf("Got %v, wanted float 108.0", outputs[0].Value())
+	}
+}
+
+func TestGraphValidateGradientsNames(t *testing.T) {
+	g := NewGraph()
+	x, err := Placeholder(g, "x", Float)
+	if err != nil {
+		t.Fatal(err)
+	}
+	op0, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y0",
+		Input: []Input{x},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y0 := op0.Output(0)
+
+	grads0, err := g.AddGradients("", []Output{y0}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads0[0].Op.Name(), "gradients/") {
+		t.Fatalf("Got name %v, wanted started with gradients/", grads0[0].Op.Name())
+	}
+
+	grads1, err := g.AddGradients("", []Output{y0}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads1[0].Op.Name(), "gradients_1/") {
+		t.Fatalf("Got name %v, wanted started with gradients_1/", grads1[0].Op.Name())
+	}
+
+	grads2, err := g.AddGradients("more_gradients", []Output{y0}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads2[0].Op.Name(), "more_gradients/") {
+		t.Fatalf("Got name %v, wanted started with more_gradients/", grads2[0].Op.Name())
+	}
+
+	grads3, err := g.AddGradients("even_more_gradients", []Output{y0}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads3[0].Op.Name(), "even_more_gradients/") {
+		t.Fatalf("Got name %v, wanted started with even_more_gradients/", grads3[0].Op.Name())
+	}
+
+	_, err = g.AddGradients("even_more_gradients", []Output{y0}, []Output{x}, nil)
+	if err == nil {
+		t.Error("AddGradients should have failed if gradients name is already existing")
+	}
+}
diff --git a/tensorflow/go/op/gradients.go b/tensorflow/go/op/gradients.go
new file mode 100644
index 0000000000000000000000000000000000000000..c5956789f426f4cabad11c54d6c30ca2c1fa39d7
--- /dev/null
+++ b/tensorflow/go/op/gradients.go
@@ -0,0 +1,49 @@
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package op
+
+import (
+	"fmt"
+
+	tf "github.com/tensorflow/tensorflow/tensorflow/go"
+)
+
+// Gradients adds gradients computation ops to the graph according to scope.
+//
+// Arguments:
+//  y: output of the function to derive
+//  x: inputs of the function for which partial derivatives are computed
+//  dx: if not null, the partial derivatives of some loss function L w.r.t. y
+//
+//  return the partial derivatives
+func Gradients(scope *Scope, y []tf.Output, x []tf.Output, dx ...tf.Output) (output []tf.Output) {
+	if len(scope.controlDependencies) > 0 {
+		scope.UpdateErr("Gradients", fmt.Errorf("Gradients does not currently support control dependencies (via Scope.WithControlDependencies)."))
+		return
+	}
+	if scope.device != "" {
+		scope.UpdateErr("Gradients", fmt.Errorf("Gradients does not currently support device annotations (via Scope.WithDevice)."))
+		return
+	}
+
+	var err error
+	if output, err = scope.graph.AddGradients(scope.opName("Gradients"), y, x, dx); err != nil {
+		scope.UpdateErr("Gradients", err)
+		return
+	}
+	return output
+}
diff --git a/tensorflow/go/op/gradients_test.go b/tensorflow/go/op/gradients_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..3d1d57b77eac44b5048d0b41bf63d271005c52ee
--- /dev/null
+++ b/tensorflow/go/op/gradients_test.go
@@ -0,0 +1,246 @@
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package op
+
+import (
+	"strings"
+	"testing"
+
+	tf "github.com/tensorflow/tensorflow/tensorflow/go"
+)
+
+func TestAddGradients(t *testing.T) {
+	var (
+		s  = NewScope()
+		x1 = Placeholder(s.SubScope("x1"), tf.Float)
+		x2 = Placeholder(s.SubScope("x2"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x1)
+		y1 = Square(s.SubScope("y1"), y0)
+		y2 = AddN(s.SubScope("y2"), []tf.Output{y0, x2})
+	)
+
+	grads0 := Gradients(s, []tf.Output{y1}, []tf.Output{x1})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grads0) != 1 {
+		t.Fatal(len(grads0))
+	}
+	if grads0[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads0[0].DataType(), tf.Float)
+	}
+
+	sub := s.SubScope("sub")
+	grads1 := Gradients(sub, []tf.Output{y2}, []tf.Output{x1, x2})
+	if err := sub.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grads1) != 2 {
+		t.Fatal(len(grads1))
+	}
+	if grads1[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[0].DataType(), tf.Float)
+	}
+	if grads1[1].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[1].DataType(), tf.Float)
+	}
+
+	graph, err := sub.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c1, _ := tf.NewTensor(float32(3.0))
+	c2, _ := tf.NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[tf.Output]*tf.Tensor{x1: c1, x2: c2},
+		[]tf.Output{grads0[0], grads1[0], grads1[1]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(outputs) != 3 {
+		t.Fatal(len(outputs))
+	}
+	if outputs[0].Value().(float32) != 108.0 {
+		t.Fatalf("Got %v, wanted float 108.0", outputs[0].Value())
+	}
+	if outputs[1].Value().(float32) != 6.0 {
+		t.Fatalf("Got %v, wanted float 6.0", outputs[1].Value())
+	}
+	if outputs[2].Value().(float32) != 1.0 {
+		t.Fatalf("Got %v, wanted float 1.0", outputs[2].Value())
+	}
+}
+
+func TestAddGradientsSums(t *testing.T) {
+	var (
+		s  = NewScope()
+		x  = Placeholder(s.SubScope("x"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x)
+		y1 = Square(s.SubScope("y1"), y0)
+	)
+
+	grad := Gradients(s, []tf.Output{y0, y1}, []tf.Output{x})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grad) != 1 {
+		t.Fatal(len(grad))
+	}
+	if grad[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grad[0].DataType(), tf.Float)
+	}
+
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, _ := tf.NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[tf.Output]*tf.Tensor{x: c},
+		[]tf.Output{grad[0]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if outputs[0].Value().(float32) != 114.0 {
+		t.Fatalf("Got %v, wanted float 114.0", outputs[0].Value())
+	}
+}
+
+func TestAddGradientsWithInitialValues(t *testing.T) {
+	var (
+		s  = NewScope()
+		x  = Placeholder(s.SubScope("x1"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x)
+		y1 = Square(s.SubScope("y1"), y0)
+	)
+
+	grads0 := Gradients(s, []tf.Output{y1}, []tf.Output{y0})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grads0) != 1 {
+		t.Fatal(len(grads0))
+	}
+	if grads0[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads0[0].DataType(), tf.Float)
+	}
+
+	sub := s.SubScope("sub")
+	grads1 := Gradients(sub, []tf.Output{y0}, []tf.Output{x}, grads0[0])
+	if err := sub.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grads1) != 1 {
+		t.Fatal(len(grads1))
+	}
+	if grads1[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[0].DataType(), tf.Float)
+	}
+
+	graph, err := sub.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, _ := tf.NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[tf.Output]*tf.Tensor{x: c},
+		[]tf.Output{grads1[0]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if outputs[0].Value().(float32) != 108.0 {
+		t.Fatalf("Got %v, wanted float 108.0", outputs[0].Value())
+	}
+}
+
+func TestValidateGradientsNames(t *testing.T) {
+	var (
+		s  = NewScope()
+		x  = Placeholder(s.SubScope("x"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x)
+	)
+
+	grads0 := Gradients(s, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads0[0].Op.Name(), "Gradients/") {
+		t.Fatalf("Got name %v, wanted started with Gradients/", grads0[0].Op.Name())
+	}
+
+	sub := s.SubScope("sub")
+	grads1 := Gradients(sub, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads1[0].Op.Name(), "sub/Gradients/") {
+		t.Fatalf("Got name %v, wanted started with sub/Gradients/", grads1[0].Op.Name())
+	}
+
+	Gradients(sub, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err == nil {
+		t.Error("Gradients should have failed if executed more than once for scope of the same namespace")
+	}
+}
+
+func TestAddGradientsWithControlDependencies(t *testing.T) {
+	var (
+		s        = NewScope()
+		zero     = Const(s.SubScope("zero"), int32(0))
+		x        = Placeholder(s.SubScope("x"), tf.Float)
+		y0       = Square(s.SubScope("y0"), x)
+		variable = VarHandleOp(s, tf.Int32, tf.ScalarShape())
+		init     = AssignVariableOp(s, variable, zero)
+		readDeps = []*tf.Operation{init}
+	)
+	s = s.WithControlDependencies(readDeps...)
+	Gradients(s, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err == nil {
+		t.Error("Gradients should have failed when control dependencies are set")
+	}
+}
+
+func TestAddGradientsWithDevice(t *testing.T) {
+	var (
+		s  = NewScope()
+		x  = Placeholder(s.SubScope("x"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x)
+	)
+	s = s.WithDevice("/device:GPU:0")
+	Gradients(s, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err == nil {
+		t.Error("Gradients should have failed when device is set")
+	}
+}
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index a7bbb80c82a0c2ee121175c3e1dffd2d095df119..6e49fbb9eae047b4b45758165ad47a5c1923aaf6 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -463,6 +463,14 @@ func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
 	}
 }
 
+// QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_TO_EVEN"
+func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
+	}
+}
+
 // Quantizes then dequantizes a tensor.
 //
 // This op simulates the precision loss from the quantized forward pass by:
@@ -515,6 +523,8 @@ func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
 //
 // output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
 //
+// The above round function uses half to even rounding.
+//
 //
 // Arguments:
 //	input: Tensor to quantize and then dequantize.
@@ -3485,30 +3495,6 @@ func BoostedTreesQuantileStreamResourceFlush(scope *Scope, quantile_stream_resou
 	return scope.AddOperation(opspec)
 }
 
-// Add the quantile summaries to each quantile stream resource.
-//
-// An op that adds a list of quantile summaries to a quantile stream resource. Each
-// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
-// for a single feature.
-//
-// Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
-//
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
-		Input: []tf.Input{
-			quantile_stream_resource_handle, tf.OutputList(summaries),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Makes the summary of quantiles for the batch.
 //
 // An op that takes a list of tensors and outputs the quantile summaries for each tensor.
@@ -4396,168 +4382,207 @@ func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
+// Forwards `data` to the output port determined by `pred`.
+//
+// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
+// the data goes to `output_false`.
+//
+// See also `RefSwitch` and `Merge`.
+//
+// Arguments:
+//	data: The tensor to be forwarded to the appropriate output.
+//	pred: A scalar that specifies which output port will receive data.
+//
+// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
+func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
+	opspec := tf.OpSpec{
+		Type: "Switch",
+		Input: []tf.Input{
+			data, pred,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
+// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
+type AudioSpectrogramAttr func(optionalAttr)
 
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
+//
+// value: Whether to return the squared magnitude or just the
+// magnitude. Using squared magnitude can avoid extra calculations.
+// If not specified, defaults to false
+func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["magnitude_squared"] = value
 	}
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
+// Produces a visualization of audio data over time.
 //
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// Spectrograms are a standard way of representing audio information as a series of
+// slices of frequency information, one slice for each window of time. By joining
+// these together into a sequence, they form a distinctive fingerprint of the sound
+// over time.
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// This op expects to receive audio data as an input, stored as floats in the range
+// -1 to 1, together with a window width in samples, and a stride specifying how
+// far to move the window between slices. From this it generates a three
+// dimensional output. The lowest dimension has an amplitude value for each
+// frequency during that time slice. The next dimension is time, with successive
+// frequency slices. The final dimension is for the channels in the input, so a
+// stereo audio input would have two here for example.
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// This means the layout when converted and saved as an image is rotated 90 degrees
+// clockwise from a typical spectrogram. Time is descending down the Y axis, and
+// the frequency decreases from left to right.
+//
+// Each value in the result represents the square root of the sum of the real and
+// imaginary parts of an FFT on the current window of samples. In this way, the
+// lowest dimension represents the power of each frequency in the current window,
+// and adjacent windows are concatenated in the next dimension.
+//
+// To get a more intuitive and visual look at what this operation does, you can run
+// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
+// resulting spectrogram as a PNG image.
+//
+// Arguments:
+//	input: Float representation of audio data.
+//	window_size: How wide the input window is in samples. For the highest efficiency
+// this should be a power of two, but other values are accepted.
+//	stride: How widely apart the center of adjacent sample windows should be.
+//
+// Returns 3D representation of the audio frequencies as an image.
+func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "AudioSpectrogram",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
+// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
+type CTCBeamSearchDecoderAttr func(optionalAttr)
 
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+//
+// value: If true, merge repeated classes in output.
+// If not specified, defaults to true
+func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["merge_repeated"] = value
 	}
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
+// Performs beam search decoding on the logits given in input.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// A note about the attribute merge_repeated: For the beam search decoder,
+// this means that if consecutive entries in a beam are the same, only
+// the first of these is emitted.  That is, when the top path is "A B B B B",
+// "A B" is returned if merge_repeated = True but "A B B B B" is
+// returned if merge_repeated = False.
 //
 // Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch)`.
+//	beam_width: A scalar >= 0 (beam search beam width).
+//	top_paths: A scalar >= 0, <= beam_width (controls output size).
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+// Returns A list (length: top_paths) of indices matrices.  Matrix j,
+// size `(total_decoded_outputs[j] x 2)`, has indices of a
+// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
+// size `(length total_decoded_outputs[j])`, has the values of a
+// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
+// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
+// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
+// sequence log-probabilities.
+func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "CTCBeamSearchDecoder",
 		Input: []tf.Input{
-			input, dimension,
+			inputs, sequence_length,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns which elements of x are finite.
-//
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "IsFinite",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	log_probability = op.Output(idx)
+	return decoded_indices, decoded_values, decoded_shape, log_probability
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
+// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
+type CTCGreedyDecoderAttr func(optionalAttr)
 
-// MatMulTransposeA sets the optional transpose_a attribute to value.
+// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
 //
-// value: If true, "a" is transposed before multiplication.
+// value: If True, merge repeated classes in output.
 // If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
+func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["merge_repeated"] = value
 	}
 }
 
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// Performs greedy decoding on the logits given in inputs.
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// Multiply the matrix "a" by the matrix "b".
+// A note about the attribute merge_repeated: if enabled, when
+// consecutive logits' maximum indices are the same, only the first of
+// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
+// becomes "A B B" if merge_repeated = True and "A B B B B" if
+// merge_repeated = False.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// Regardless of the value of merge_repeated, if the maximum index of a given
+// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
+// element is emitted.
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
+//
+// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
+// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
+// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
+// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
+// log-probabilities.
+func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4566,258 +4591,453 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "CTCGreedyDecoder",
 		Input: []tf.Input{
-			a, b,
+			inputs, sequence_length,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
-//
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
-//
-// For example:
+// CTCLossAttr is an optional argument to CTCLoss.
+type CTCLossAttr func(optionalAttr)
+
+// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
 //
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
+// value: Scalar, if true then repeated labels are
+// collapsed prior to the CTC calculation.
+// If not specified, defaults to false
+func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["preprocess_collapse_repeated"] = value
+	}
+}
+
+// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
 //
+// value: Scalar.  If set to false, *during* CTC calculation
+// repeated non-blank labels will not be merged and are interpreted as
+// individual labels.  This is a simplified version of CTC.
+// If not specified, defaults to true
+func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ctc_merge_repeated"] = value
+	}
+}
+
+// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
 //
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// value: Scalar. If set to true, during CTC
+// calculation, items that have longer output sequences than input sequences
+// are skipped: they don't contribute to the loss term and have zero-gradient.
+// If not specified, defaults to false
+func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ignore_longer_outputs_than_inputs"] = value
+	}
+}
+
+// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
 //
-// ```
+// the gradient.  This class performs the softmax operation for you, so inputs
+// should be e.g. linear projections of outputs by an LSTM.
 //
 // Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
+// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
+// `(batch b, time t)`.
+//	labels_values: The values (labels) associated with the given batch and time.
+//	sequence_length: A vector containing sequence lengths (batch).
 //
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
-//
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
+// `(max_time x batch_size x num_classes)`.
+func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "CTCLoss",
 		Input: []tf.Input{
-			condition, x, y,
+			inputs, labels_indices, labels_values, sequence_length,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns the truth value of x OR y element-wise.
-//
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogicalOr",
-		Input: []tf.Input{
-			x, y,
-		},
+// ShapeNAttr is an optional argument to ShapeN.
+type ShapeNAttr func(optionalAttr)
+
+// ShapeNOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeNOutType(value tf.DataType) ShapeNAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-//
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-//
-// where
-//
-//
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
-//
+// Returns shape of tensors.
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
+func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "ShapeN",
 		Input: []tf.Input{
-			a, b, x,
+			tf.OutputList(input),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("ShapeN", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return output
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
-//
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atan2",
-		Input: []tf.Input{
-			y, x,
-		},
+// CudnnRNNParamsToCanonicalAttr is an optional argument to CudnnRNNParamsToCanonical.
+type CudnnRNNParamsToCanonicalAttr func(optionalAttr)
+
+// CudnnRNNParamsToCanonicalRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsToCanonicalRnnMode(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that passes a sliding window over `input_dataset`.
-//
-// Arguments:
-//
-//	window_size: A scalar representing the number of elements in the
-// sliding window.
-//	window_shift: A scalar representing the steps moving the sliding window
-// forward in one iteration. It must be positive.
-//	window_stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//
-//
-func SlideDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "SlideDataset",
-		Input: []tf.Input{
-			input_dataset, window_size, window_shift, window_stride,
-		},
-		Attrs: attrs,
+// CudnnRNNParamsToCanonicalInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsToCanonicalInputMode(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
+// CudnnRNNParamsToCanonicalDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsToCanonicalDirection(value string) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
 
-// EditDistanceNormalize sets the optional normalize attribute to value.
-//
-// value: boolean (if true, edit distances are normalized by length of truth).
-//
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
+// CudnnRNNParamsToCanonicalDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalDropout(value float32) CudnnRNNParamsToCanonicalAttr {
 	return func(m optionalAttr) {
-		m["normalize"] = value
+		m["dropout"] = value
 	}
 }
 
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
-//
-// The inputs are:
+// CudnnRNNParamsToCanonicalSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalSeed(value int64) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsToCanonicalSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsToCanonicalSeed2(value int64) CudnnRNNParamsToCanonicalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Retrieves CudnnRNN params in canonical form.
 //
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
+// Retrieves a set of weights from the opaque params buffer that can be saved and
+// restored in a way compatible with future runs.
 //
-// Returns A dense float tensor with rank R - 1.
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
 //
-// For the example input:
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, params tf.Output, num_params int64, optional ...CudnnRNNParamsToCanonicalAttr) (weights []tf.Output, biases []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_params": num_params}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNParamsToCanonical",
+		Input: []tf.Input{
+			num_layers, num_units, input_size, params,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if weights, idx, err = makeOutputList(op, idx, "weights"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	if biases, idx, err = makeOutputList(op, idx, "biases"); err != nil {
+		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
+		return
+	}
+	return weights, biases
+}
+
+// CudnnRNNBackpropV2Attr is an optional argument to CudnnRNNBackpropV2.
+type CudnnRNNBackpropV2Attr func(optionalAttr)
+
+// CudnnRNNBackpropV2RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropV2RnnMode(value string) CudnnRNNBackpropV2Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV2InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropV2InputMode(value string) CudnnRNNBackpropV2Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV2Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropV2Direction(value string) CudnnRNNBackpropV2Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNBackpropV2Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV2Dropout(value float32) CudnnRNNBackpropV2Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNBackpropV2Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV2Seed(value int64) CudnnRNNBackpropV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNBackpropV2Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV2Seed2(value int64) CudnnRNNBackpropV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Backprop step of CudnnRNN.
 //
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
+// Compute the backprop of both data and weights in a RNN. Takes an extra
+//     "host_reserved" inupt than CudnnRNNBackprop, which is used to determine RNN
+//     cudnnRNNAlgo_t and cudnnMathType_t.
 //
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in the forward operation.
+// host_reserved: The same host_reserved produced in the forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackpropV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV2Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNBackpropV2",
+		Input: []tf.Input{
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// CudnnRNNV2Attr is an optional argument to CudnnRNNV2.
+type CudnnRNNV2Attr func(optionalAttr)
+
+// CudnnRNNV2RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNV2RnnMode(value string) CudnnRNNV2Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNV2InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNV2InputMode(value string) CudnnRNNV2Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNV2Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNV2Direction(value string) CudnnRNNV2Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNV2Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV2Dropout(value float32) CudnnRNNV2Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNV2Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV2Seed(value int64) CudnnRNNV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNV2Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV2Seed2(value int64) CudnnRNNV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNV2IsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNV2IsTraining(value bool) CudnnRNNV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
 //
-// The output will be:
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer. Produces one extra output "host_reserved" than CudnnRNN.
 //
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is true.
+// host_reserved: An opaque tensor that can be used in backprop calculation. It is
+//   only produced if is_training is true. It is output on host memory rather than
+//   device memory.
+func CudnnRNNV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNV2Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -4826,236 +5046,240 @@ func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EditDistance",
+		Type: "CudnnRNNV2",
 		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+			input, input_h, input_c, params,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// RecordInputAttr is an optional argument to RecordInput.
+type RecordInputAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// value: Random seeds used to produce randomized records.
+// If not specified, defaults to 301
+func RecordInputFileRandomSeed(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["file_random_seed"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+// value: Shifts the list of files after the list is randomly
+// shuffled.
+// If not specified, defaults to 0
+func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["file_shuffle_shift_ratio"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the input.
+// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
+//
+// value: The randomization shuffling buffer.
+// If not specified, defaults to 10000
+func RecordInputFileBufferSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_buffer_size"] = value
+	}
+}
+
+// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
+//
+// value: How many sstables are opened and concurrently iterated over.
+// If not specified, defaults to 16
+func RecordInputFileParallelism(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_parallelism"] = value
+	}
+}
+
+// RecordInputBatchSize sets the optional batch_size attribute to value.
+//
+// value: The batch size.
+// If not specified, defaults to 32
+func RecordInputBatchSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["batch_size"] = value
+	}
+}
+
+// RecordInputCompressionType sets the optional compression_type attribute to value.
+//
+// value: The type of compression for the file. Currently ZLIB and
+// GZIP are supported. Defaults to none.
+// If not specified, defaults to ""
+func RecordInputCompressionType(value string) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Emits randomized records.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	file_pattern: Glob pattern for the data files.
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+// Returns A tensor of shape [batch_size].
+func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"file_pattern": file_pattern}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
-		},
+		Type: "RecordInput",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
+// OrderedMapClearAttr is an optional argument to OrderedMapClear.
+type OrderedMapClearAttr func(optionalAttr)
 
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+// OrderedMapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
 	return func(m optionalAttr) {
-		m["tolerance"] = value
+		m["capacity"] = value
 	}
 }
 
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapClearContainer(value string) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
-		Input: []tf.Input{
-			x, y,
-		},
+		Type: "OrderedMapClear",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x / y element-wise.
+// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
+type OrderedMapIncompleteSizeAttr func(optionalAttr)
+
+// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Div",
-		Input: []tf.Input{
-			x, y,
-		},
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x * y element-wise.
+// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mul",
-		Input: []tf.Input{
-			x, y,
-		},
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
-
-// BiasAddDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
+// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Adds `bias` to `value`.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
-//
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+// Op returns the number of incomplete elements in the underlying container.
+func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
-		Input: []tf.Input{
-			value, bias,
-		},
+		Type: "OrderedMapIncompleteSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
+// BoostedTreesQuantileStreamResourceHandleOpAttr is an optional argument to BoostedTreesQuantileStreamResourceHandleOp.
+type BoostedTreesQuantileStreamResourceHandleOpAttr func(optionalAttr)
 
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+// BoostedTreesQuantileStreamResourceHandleOpContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BoostedTreesQuantileStreamResourceHandleOpContainer(value string) BoostedTreesQuantileStreamResourceHandleOpAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["container"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// BoostedTreesQuantileStreamResourceHandleOpSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BoostedTreesQuantileStreamResourceHandleOpSharedName(value string) BoostedTreesQuantileStreamResourceHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a BoostedTreesQuantileStreamResource.
+func BoostedTreesQuantileStreamResourceHandleOp(scope *Scope, optional ...BoostedTreesQuantileStreamResourceHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5064,804 +5288,688 @@ func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
+		Type: "BoostedTreesQuantileStreamResourceHandleOp",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
+// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
+type OrderedMapSizeAttr func(optionalAttr)
 
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// OrderedMapSizeCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+//
+// REQUIRES: value >= 0
+func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["capacity"] = value
 	}
 }
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
+// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+//
+// REQUIRES: value >= 0
+func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// OrderedMapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
-		Input: []tf.Input{
-			true_classes,
-		},
+		Type: "OrderedMapSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns x + y element-wise.
+// Generate the bucket boundaries for each feature based on accumulated summaries.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// An op that returns a list of float tensors for a quantile stream resource. Each
+// tensor is Rank 1 containing bucket boundaries for a single feature.
+//
+// Arguments:
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	num_features: inferred int; number of features to get bucket boundaries for.
+//
+// Returns float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
+func BoostedTreesQuantileStreamResourceGetBucketBoundaries(scope *Scope, quantile_stream_resource_handle tf.Output, num_features int64) (bucket_boundaries []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_features": num_features}
 	opspec := tf.OpSpec{
-		Type: "AddV2",
+		Type: "BoostedTreesQuantileStreamResourceGetBucketBoundaries",
 		Input: []tf.Input{
-			x, y,
+			quantile_stream_resource_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns an element-wise indication of the sign of a number.
-//
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
-//
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Sign",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if bucket_boundaries, idx, err = makeOutputList(op, idx, "bucket_boundaries"); err != nil {
+		scope.UpdateErr("BoostedTreesQuantileStreamResourceGetBucketBoundaries", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return bucket_boundaries
 }
 
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
+// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage.
+type OrderedMapUnstageAttr func(optionalAttr)
 
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+// OrderedMapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["capacity"] = value
 	}
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
-//
-// Arguments:
+// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+// REQUIRES: value >= 0
+func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the values associated with the key
+//
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "OrderedMapUnstage",
 		Input: []tf.Input{
-			input, dimension,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// output range specified with 'requested_output_min' and 'requested_output_max'.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	opspec := tf.OpSpec{
-		Type: "Requantize",
-		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
-		},
-		Attrs: attrs,
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstage", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return values
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
+// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
+type OrderedMapPeekAttr func(optionalAttr)
 
-// PreventGradientMessage sets the optional message attribute to value.
+// OrderedMapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// REQUIRES: value >= 0
+func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["capacity"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
-//
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	input: any tensor.
+// REQUIRES: value >= 0
+func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
 //
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.   This Op is optimized for
+// performance.
+func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "OrderedMapPeek",
 		Input: []tf.Input{
-			input,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Asin",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapPeek", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return values
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
-//
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
-// ```
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
+type MapIncompleteSizeAttr func(optionalAttr)
+
+// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
-		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
-		},
+// REQUIRES: value >= 0
+func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the determinant of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
+// REQUIRES: value >= 0
+func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sin",
-		Input: []tf.Input{
-			x,
-		},
+// MapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Erfc",
-		Input: []tf.Input{
-			x,
-		},
+// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
-//
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+// Op returns the number of incomplete elements in the underlying container.
+func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Digamma",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "MapIncompleteSize",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
+// MapSizeAttr is an optional argument to MapSize.
+type MapSizeAttr func(optionalAttr)
 
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+// MapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeCapacity(value int64) MapSizeAttr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["capacity"] = value
 	}
 }
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+// REQUIRES: value >= 0
+func MapSizeMemoryLimit(value int64) MapSizeAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+// MapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapSizeContainer(value string) MapSizeAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["container"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the filter.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+// MapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapSizeSharedName(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
+		Type: "MapSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the number of work units this Reader has finished processing.
+// MapUnstageAttr is an optional argument to MapUnstage.
+type MapUnstageAttr func(optionalAttr)
+
+// MapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Lgamma",
-		Input: []tf.Input{
-			x,
-		},
+// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
-//
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
-//
-// Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// MapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
-		Input: []tf.Input{
-			l, grad,
-		},
+}
+
+// MapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageSharedName(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// Op removes and returns the values associated with the key
+//
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
+		Type: "MapUnstage",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Sinh",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstage", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return values
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
-//
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
-//
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
-//
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
-// ```
-//
-// Arguments:
+// MapPeekAttr is an optional argument to MapPeek.
+type MapPeekAttr func(optionalAttr)
+
+// MapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// REQUIRES: value >= 0
+func MapPeekCapacity(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func MapPeekMemoryLimit(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
-		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
+}
+
+// MapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapPeekContainer(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
+// MapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapPeekSharedName(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "MapPeek",
 		Input: []tf.Input{
-			x,
+			key, indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapPeek", err)
+		return
+	}
+	return values
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
+// MapStageAttr is an optional argument to MapStage.
+type MapStageAttr func(optionalAttr)
+
+// MapStageCapacity sets the optional capacity attribute to value.
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
 //
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
-		Input: []tf.Input{
-			gradients, features,
-		},
+// REQUIRES: value >= 0
+func MapStageCapacity(value int64) MapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
+// MapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapStageMemoryLimit(value int64) MapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
 
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+// MapStageContainer sets the optional container attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func MapStageContainer(value string) MapStageAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["container"] = value
 	}
 }
 
-// Resize `images` to `size` using bicubic interpolation.
+// MapStageSharedName sets the optional shared_name attribute to value.
 //
-// Input images can be of different types but output images are always float.
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func MapStageSharedName(value string) MapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a hashtable.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	key: int64
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "MapStage",
 		Input: []tf.Input{
-			images, size,
+			key, indices, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes natural logarithm of x element-wise.
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
+
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Log",
-		Input: []tf.Input{
-			x,
-		},
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Rounds the values of a tensor to the nearest integer, element-wise.
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Round",
-		Input: []tf.Input{
-			x,
-		},
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// RecordInputAttr is an optional argument to RecordInput.
-type RecordInputAttr func(optionalAttr)
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
-//
-// value: Random seeds used to produce randomized records.
-// If not specified, defaults to 301
-func RecordInputFileRandomSeed(value int64) RecordInputAttr {
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
 	return func(m optionalAttr) {
-		m["file_random_seed"] = value
+		m["shared_name"] = value
 	}
 }
 
-// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
+// Op removes all elements in the underlying container.
 //
-// value: Shifts the list of files after the list is randomly
-// shuffled.
-// If not specified, defaults to 0
-func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_shuffle_shift_ratio"] = value
+// Returns the created operation.
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StageClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
 }
 
-// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
+// StageSizeAttr is an optional argument to StageSize.
+type StageSizeAttr func(optionalAttr)
+
+// StageSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: The randomization shuffling buffer.
-// If not specified, defaults to 10000
-func RecordInputFileBufferSize(value int64) RecordInputAttr {
+// REQUIRES: value >= 0
+func StageSizeCapacity(value int64) StageSizeAttr {
 	return func(m optionalAttr) {
-		m["file_buffer_size"] = value
+		m["capacity"] = value
 	}
 }
 
-// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
+// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: How many sstables are opened and concurrently iterated over.
-// If not specified, defaults to 16
-func RecordInputFileParallelism(value int64) RecordInputAttr {
+// REQUIRES: value >= 0
+func StageSizeMemoryLimit(value int64) StageSizeAttr {
 	return func(m optionalAttr) {
-		m["file_parallelism"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// RecordInputBatchSize sets the optional batch_size attribute to value.
-//
-// value: The batch size.
-// If not specified, defaults to 32
-func RecordInputBatchSize(value int64) RecordInputAttr {
+// StageSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageSizeContainer(value string) StageSizeAttr {
 	return func(m optionalAttr) {
-		m["batch_size"] = value
+		m["container"] = value
 	}
 }
 
-// RecordInputCompressionType sets the optional compression_type attribute to value.
-//
-// value: The type of compression for the file. Currently ZLIB and
-// GZIP are supported. Defaults to none.
+// StageSizeSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func RecordInputCompressionType(value string) RecordInputAttr {
+func StageSizeSharedName(value string) StageSizeAttr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Emits randomized records.
-//
-// Arguments:
-//	file_pattern: Glob pattern for the data files.
-//
-// Returns A tensor of shape [batch_size].
-func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
+// Op returns the number of elements in the underlying container.
+func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"file_pattern": file_pattern}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RecordInput",
+		Type: "StageSize",
 
 		Attrs: attrs,
 	}
@@ -5869,208 +5977,235 @@ func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr)
 	return op.Output(0)
 }
 
-// Computes reciprocal of square root of x element-wise.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// The regularized incomplete beta integral is defined as:
+//
+//
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rsqrt",
+		Type: "Betainc",
 		Input: []tf.Input{
-			x,
+			a, b, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
-type AudioSpectrogramAttr func(optionalAttr)
-
-// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
-//
-// value: Whether to return the squared magnitude or just the
-// magnitude. Using squared magnitude can avoid extra calculations.
-// If not specified, defaults to false
-func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
-	return func(m optionalAttr) {
-		m["magnitude_squared"] = value
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Identity",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Produces a visualization of audio data over time.
-//
-// Spectrograms are a standard way of representing audio information as a series of
-// slices of frequency information, one slice for each window of time. By joining
-// these together into a sequence, they form a distinctive fingerprint of the sound
-// over time.
-//
-// This op expects to receive audio data as an input, stored as floats in the range
-// -1 to 1, together with a window width in samples, and a stride specifying how
-// far to move the window between slices. From this it generates a three
-// dimensional output. The lowest dimension has an amplitude value for each
-// frequency during that time slice. The next dimension is time, with successive
-// frequency slices. The final dimension is for the channels in the input, so a
-// stereo audio input would have two here for example.
-//
-// This means the layout when converted and saved as an image is rotated 90 degrees
-// clockwise from a typical spectrogram. Time is descending down the Y axis, and
-// the frequency decreases from left to right.
-//
-// Each value in the result represents the square root of the sum of the real and
-// imaginary parts of an FFT on the current window of samples. In this way, the
-// lowest dimension represents the power of each frequency in the current window,
-// and adjacent windows are concatenated in the next dimension.
-//
-// To get a more intuitive and visual look at what this operation does, you can run
-// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
-// resulting spectrogram as a PNG image.
-//
-// Arguments:
-//	input: Float representation of audio data.
-//	window_size: How wide the input window is in samples. For the highest efficiency
-// this should be a power of two, but other values are accepted.
-//	stride: How widely apart the center of adjacent sample windows should be.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// Returns 3D representation of the audio frequencies as an image.
-func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AudioSpectrogram",
+		Type: "Atan2",
 		Input: []tf.Input{
-			input,
+			y, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
-type CTCBeamSearchDecoderAttr func(optionalAttr)
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
 
-// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+// EditDistanceNormalize sets the optional normalize attribute to value.
 //
-// value: If true, merge repeated classes in output.
+// value: boolean (if true, edit distances are normalized by length of truth).
+//
+// The output is:
 // If not specified, defaults to true
-func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
+func EditDistanceNormalize(value bool) EditDistanceAttr {
 	return func(m optionalAttr) {
-		m["merge_repeated"] = value
+		m["normalize"] = value
 	}
 }
 
-// Performs beam search decoding on the logits given in input.
+// Computes the (possibly normalized) Levenshtein Edit Distance.
 //
-// A note about the attribute merge_repeated: For the beam search decoder,
-// this means that if consecutive entries in a beam are the same, only
-// the first of these is emitted.  That is, when the top path is "A B B B B",
-// "A B" is returned if merge_repeated = True but "A B B B B" is
-// returned if merge_repeated = False.
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
+//
+// The inputs are:
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch)`.
-//	beam_width: A scalar >= 0 (beam search beam width).
-//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
 //
-// Returns A list (length: top_paths) of indices matrices.  Matrix j,
-// size `(total_decoded_outputs[j] x 2)`, has indices of a
-// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
-// size `(length total_decoded_outputs[j])`, has the values of a
-// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
-// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
-// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
-// sequence log-probabilities.
-func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CTCBeamSearchDecoder",
+		Type: "EditDistance",
 		Input: []tf.Input{
-			inputs, sequence_length,
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
+func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Xlogy",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
-	log_probability = op.Output(idx)
-	return decoded_indices, decoded_values, decoded_shape, log_probability
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
 
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
-// If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["data_format"] = value
+	}
+}
+
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
 	}
 }
 
-// Computes the inverse of one or more square invertible matrices or their
-//
-// adjoints (conjugate transposes).
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
-//
-// The op uses LU decomposition with partial pivoting to compute the inverses.
-//
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
+// Computes the gradients of depthwise convolution with respect to the input.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "DepthwiseConv2dNativeBackpropInput",
 		Input: []tf.Input{
-			input,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6078,16 +6213,16 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
+// Returns x / y element-wise.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Add",
+		Type: "Div",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -6096,269 +6231,345 @@ func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
-func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
+// Returns x * y element-wise.
+//
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGammaGrad",
+		Type: "Mul",
 		Input: []tf.Input{
-			alpha, sample,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes square of x element-wise.
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
+
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Adds `bias` to `value`.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
+//
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Square",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			x,
+			value, bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Elu",
+		Type: "SparseReduceSumSparse",
 		Input: []tf.Input{
-			features,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the reciprocal of x element-wise.
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
+
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Reciprocal",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			x,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
-//
-// Arguments:
-//
+// Returns x + y element-wise.
 //
-//	dense_defaults: A dict mapping string keys to `Tensor`s.
-// The keys of the dict must match the dense_keys of the feature.
-//	sparse_keys: A list of string keys in the examples features.
-// The results for these keys will be returned as `SparseTensor` objects.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples features associated with dense values.
-//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
-// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
-// and `tf.string` (`BytesList`) are supported.
-//	dense_shapes: List of tuples with the same length as `dense_keys`.
-// The shape of the data for each dense feature referenced by `dense_keys`.
-// Required for any input tensors identified by `dense_keys`.  Must be
-// either fully defined, or may contain an unknown first dimension.
-// An unknown first dimension means the feature is treated as having
-// a variable number of blocks, and the output shape along this dimension
-// is considered unknown at graph build time.  Padding is applied for
-// minibatch elements smaller than the maximum number of blocks for the
-// given feature along this dimension.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-func ParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ParseExampleDataset",
+		Type: "AddV2",
 		Input: []tf.Input{
-			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a batched matrix tensor with new batched diagonal values.
-//
-// Given `input` and `diagonal`, this operation returns a tensor with the
-// same shape and values as `input`, except for the main diagonal of the
-// innermost matrices.  These will be overwritten by the values in `diagonal`.
-//
-// The output is computed as follows:
-//
-// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
-//
-//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
-//
-// Arguments:
-//	input: Rank `k+1`, where `k >= 1`.
-//	diagonal: Rank `k`, where `k >= 1`.
+// Returns an element-wise indication of the sign of a number.
 //
-// Returns Rank `k+1`, with `output.shape = input.shape`.
-func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+//
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSetDiag",
+		Type: "Sign",
 		Input: []tf.Input{
-			input, diagonal,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the element-wise max of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+// Returns which elements of x are finite.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
+		Type: "IsFinite",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// OrderedMapClearAttr is an optional argument to OrderedMapClear.
-type OrderedMapClearAttr func(optionalAttr)
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
 
-// OrderedMapClearCapacity sets the optional capacity attribute to value.
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
 // If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["begin_mask"] = value
 	}
 }
 
-// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
 // If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["end_mask"] = value
 	}
 }
 
-// OrderedMapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearContainer(value string) OrderedMapClearAttr {
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["ellipsis_mask"] = value
 	}
 }
 
-// OrderedMapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["new_axis_mask"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
+//
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+//
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
 //
 // Returns the created operation.
-func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapClear",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the reciprocal of x element-wise.
-//
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			x,
+			ref, begin, end, strides, value,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
 
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["output_type"] = value
 	}
 }
 
-// Computes the complex absolute value of a tensor.
+// Returns the index with the largest value across dimensions of a tensor.
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6367,9 +6578,9 @@ func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			x,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
@@ -6377,82 +6588,115 @@ func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Out
 	return op.Output(0)
 }
 
-// Returns the truth value of x AND y element-wise.
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
+
+// PreventGradientMessage sets the optional message attribute to value.
 //
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
+	}
+}
+
+// An identity op that triggers an error if a gradient is requested.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
+//
+// Arguments:
+//	input: any tensor.
+//
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "PreventGradient",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CastAttr is an optional argument to Cast.
-type CastAttr func(optionalAttr)
-
-// CastTruncate sets the optional Truncate attribute to value.
-// If not specified, defaults to false
-func CastTruncate(value bool) CastAttr {
-	return func(m optionalAttr) {
-		m["Truncate"] = value
-	}
-}
-
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "Asin",
 		Input: []tf.Input{
 			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// MaxKeepDims sets the optional keep_dims attribute to value.
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes the maximum of elements across dimensions of a tensor.
+// Converts a sparse representation into a dense tensor.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Builds an array `dense` with shape `output_shape` such that
+//
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+//
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
 //
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6461,9 +6705,9 @@ func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Max",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			input, axis,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
 		Attrs: attrs,
 	}
@@ -6471,308 +6715,182 @@ func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (ou
 	return op.Output(0)
 }
 
-// Quantized Batch normalization.
+// Computes the sum along sparse segments of a tensor.
 //
-// This op is deprecated and will be removed in the future. Prefer
-// `tf.nn.batch_normalization`.
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// Arguments:
-//	t: A 4D input Tensor.
-//	t_min: The value represented by the lowest quantized input.
-//	t_max: The value represented by the highest quantized input.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	m_min: The value represented by the lowest quantized mean.
-//	m_max: The value represented by the highest quantized mean.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v_min: The value represented by the lowest quantized variance.
-//	v_max: The value represented by the highest quantized variance.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	beta_min: The value represented by the lowest quantized offset.
-//	beta_max: The value represented by the highest quantized offset.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	gamma_min: The value represented by the lowest quantized gamma.
-//	gamma_max: The value represented by the highest quantized gamma.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
-	opspec := tf.OpSpec{
-		Type: "QuantizedBatchNormWithGlobalNormalization",
-		Input: []tf.Input{
-			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+// For example:
 //
-// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// Arguments:
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
 //
-//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_bias: The float value that the lowest quantized bias value represents.
-//	max_bias: The float value that the highest quantized bias value represents.
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
+// ```
 //
+// Arguments:
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBiasAdd",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			input, bias, min_input, max_input, min_bias, max_bias,
+			data, indices, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Produces the average pool of the input tensor for quantized types.
+// Computes the determinant of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.  The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAvgPool",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Extract `patches` from `input` and put them in the "depth" output
-// dimension. 3D extension of `extract_image_patches`.
-//
-// Arguments:
-//	input: 5-D Tensor with shape `[batch, in_planes, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `input`.
-//	strides: 1-D of length 5. How far the centers of two consecutive patches are in
-// `input`. Must be: `[1, stride_planes, stride_rows, stride_cols, 1]`.
-//	padding: The type of padding algorithm to use.
-//
-// We specify the size-related attributes as:
-//
-// ```python
-//       ksizes = [1, ksize_planes, ksize_rows, ksize_cols, 1]
-//       strides = [1, stride_planes, strides_rows, strides_cols, 1]
-// ```
-//
-// Returns 5-D Tensor with shape `[batch, out_planes, out_rows, out_cols,
-// ksize_planes * ksize_rows * ksize_cols * depth]` containing patches
-// with size `ksize_planes x ksize_rows x ksize_cols x depth` vectorized
-// in the "depth" dimension. Note `out_planes`, `out_rows` and `out_cols`
-// are the dimensions of the output patches.
-func ExtractVolumePatches(scope *Scope, input tf.Output, ksizes []int64, strides []int64, padding string) (patches tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ExtractVolumePatches",
+		Type: "Sin",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
-type FractionalAvgPoolAttr func(optionalAttr)
-
-// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
-	}
-}
-
-// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
-//
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalAvgPool node in the computation graph. Mainly used
-// in unit test to make FractionalAvgPool deterministic.
-// If not specified, defaults to false
-func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
-	}
-}
-
-// FractionalAvgPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Performs fractional average pooling on the input.
-//
-// Fractional average pooling is similar to Fractional max pooling in the pooling
-// region generation step. The only difference is that after pooling regions are
-// generated, a mean operation is performed instead of a max operation in each
-// pooling region.
-//
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
 //
-// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPool",
+		Type: "Digamma",
 		Input: []tf.Input{
-			value,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// RandomCropAttr is an optional argument to RandomCrop.
-type RandomCropAttr func(optionalAttr)
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
 
-// RandomCropSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomCropSeed(value int64) RandomCropAttr {
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// RandomCropSeed2 sets the optional seed2 attribute to value.
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomCropSeed2(value int64) RandomCropAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Randomly crop `image`.
-//
-// DEPRECATED at GraphDef version 8: Random crop is now pure Python
-//
-// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
-// width.  The values must be non negative.
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// This Op picks a random location in `image` and crops a `height` by `width`
-// rectangle from that location.  The random location is picked so the cropped
-// area will fit inside the original image.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
 //
 // Arguments:
-//	image: 3-D of shape `[height, width, channels]`.
-//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 3-D of shape `[crop_height, crop_width, channels].`
-func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomCrop",
+		Type: "Conv2DBackpropFilter",
 		Input: []tf.Input{
-			image, size,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6780,104 +6898,48 @@ func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...Rando
 	return op.Output(0)
 }
 
-// TopKV2Attr is an optional argument to TopKV2.
-type TopKV2Attr func(optionalAttr)
-
-// TopKV2Sorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKV2Sorted(value bool) TopKV2Attr {
-	return func(m optionalAttr) {
-		m["sorted"] = value
-	}
-}
-
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
-//
-// If two elements are equal, the lower-index element appears first.
+// Returns the number of work units this Reader has finished processing.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: 0-D.  Number of top elements to look for along the last dimension (along each
-// row for matrices).
-//
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TopKV2",
+		Type: "ReaderNumWorkUnitsCompletedV2",
 		Input: []tf.Input{
-			input, k,
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns x // y element-wise.
-//
-// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "FloorDiv",
+		Type: "ExperimentalIgnoreErrorsDataset",
 		Input: []tf.Input{
-			x, y,
+			input_dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
-//
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-//
-// The values must include 0. There can be no duplicate values or negative values.
-//
-// For example:
-//
-// ```
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
-//
-// Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
+		Type: "Lgamma",
 		Input: []tf.Input{
 			x,
 		},
@@ -6886,205 +6948,205 @@ func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
-//
-// For each batch `i` and class `j` we have
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
 //
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			logits,
+			l, grad,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
-//
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LessEqual",
+		Type: "TensorSliceDataset",
 		Input: []tf.Input{
-			x, y,
+			tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$
-//
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
-//
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softmax",
+		Type: "Sinh",
 		Input: []tf.Input{
-			logits,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeBmpAttr is an optional argument to DecodeBmp.
-type DecodeBmpAttr func(optionalAttr)
-
-// DecodeBmpChannels sets the optional channels attribute to value.
-// If not specified, defaults to 0
-func DecodeBmpChannels(value int64) DecodeBmpAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// Decode the first frame of a BMP-encoded image to a uint8 tensor.
+// Computes the sum along sparse segments of a tensor.
 //
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-// Accepted values are:
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
-// *   0: Use the number of channels in the BMP-encoded image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
+//
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
 //
 // Arguments:
-//	contents: 0-D.  The BMP-encoded image.
 //
-// Returns 3-D with shape `[height, width, channels]`. RGB order
-func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBmp",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			contents,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softsign gradients for a softsign operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+// Computes natural logarithm of x element-wise.
 //
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
+		Type: "Log",
 		Input: []tf.Input{
-			gradients, features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Provides the time since epoch in seconds.
-//
-// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+// Rounds the values of a tensor to the nearest integer, element-wise.
 //
-// Note: the timestamp is computed when the op is executed, not when it is added
-// to the graph.
-func Timestamp(scope *Scope) (ts tf.Output) {
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Timestamp",
+		Type: "Round",
+		Input: []tf.Input{
+			x,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
+// Computes reciprocal of square root of x element-wise.
 //
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Rsqrt",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
+
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
 // If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
 	return func(m optionalAttr) {
-		m["adj_y"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
+// Computes the inverse of one or more square invertible matrices or their
 //
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+// adjoints (conjugate transposes).
 //
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
 //
-// It is computed as:
+// The op uses LU decomposition with partial pivoting to compute the inverses.
 //
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
 // Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7093,9 +7155,9 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -7103,324 +7165,248 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 	return op.Output(0)
 }
 
-// Returns which elements of x are NaN.
+// Returns x + y element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "Add",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
+func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
+		Type: "RandomGammaGrad",
 		Input: []tf.Input{
-			input,
+			alpha, sample,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// var: Should be from a Variable().
-//
-// Arguments:
-//
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// Computes square of x element-wise.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
+		Type: "Square",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
+			x,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Gets next element for the provided shard number.
-//
-// Arguments:
-//	multi_device_iterator: A MultiDeviceIterator resource.
-//	shard_num: Integer representing which shard to fetch data for.
-//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
 //
-// Returns Result of the get_next on the dataset.
-func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorGetNextFromShard",
+		Type: "Elu",
 		Input: []tf.Input{
-			multi_device_iterator, shard_num, incarnation_id,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
+// Computes the reciprocal of x element-wise.
+//
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6",
+		Type: "Reciprocal",
 		Input: []tf.Input{
-			features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
-// for an explanation of segments.
+// Returns a batched matrix tensor with new batched diagonal values.
 //
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the minimum such that:
+// Given `input` and `diagonal`, this operation returns a tensor with the
+// same shape and values as `input`, except for the main diagonal of the
+// innermost matrices.  These will be overwritten by the values in `diagonal`.
 //
-// \\(output_i = \min_{j...} data_[j...]\\) where min is over tuples `j...` such
-// that `segment_ids[j...] == i`.
+// The output is computed as follows:
 //
-// If the minimum is empty for a given segment ID `i`, it outputs the largest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::max()`.
+// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
 //
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
+//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
 //
 // Arguments:
+//	input: Rank `k+1`, where `k >= 1`.
+//	diagonal: Rank `k`, where `k >= 1`.
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns Rank `k+1`, with `output.shape = input.shape`.
+func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMin",
+		Type: "MatrixSetDiag",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			input, diagonal,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear gradients for a Relu operation.
+// Returns the element-wise max of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu operation.
-//	features: The features passed as input to the corresponding Relu operation, OR
-// the outputs of that operation (both work equivalently).
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns `gradients * (features > 0)`.
-func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReluGrad",
+		Type: "SparseSparseMaximum",
 		Input: []tf.Input{
-			gradients, features,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the input.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// Computes the reciprocal of x element-wise.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "Inv",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
-//
-// The polygamma function is defined as:
-//
-//
-// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
+
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Computes the complex absolute value of a tensor.
 //
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
-			a, x,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// Returns the truth value of x AND y element-wise.
 //
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
+		Type: "LogicalAnd",
 		Input: []tf.Input{
-			input, grad, argmax,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
-type MaxPoolGradGradV2Attr func(optionalAttr)
+// CastAttr is an optional argument to Cast.
+type CastAttr func(optionalAttr)
 
-// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
+// CastTruncate sets the optional Truncate attribute to value.
+// If not specified, defaults to false
+func CastTruncate(value bool) CastAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["Truncate"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
+	attrs := map[string]interface{}{"DstT": DstT}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradV2",
+		Type: "Cast",
 		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -7428,28 +7414,29 @@ func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output
 	return op.Output(0)
 }
 
-// Computes gradients of the maxpooling function.
+// Outputs a tensor containing the reduction across all input tensors.
 //
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// Outputs a tensor containing the reduction across all input tensors passed to ops
+// within the same `shared_name.
 //
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// The graph should be constructed so if one op runs with shared_name value `c`,
+// then `num_devices` ops will run with shared_name value `c`.  Failure to do so
+// will cause the graph execution to fail to complete.
+//
+// input: the input to the reduction
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+// num_devices: The number of devices participating in this reduction.
+// shared_name: Identifier that shared between ops of the same reduction.
+func NcclAllReduce(scope *Scope, input tf.Output, reduction string, num_devices int64, shared_name string) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"reduction": reduction, "num_devices": num_devices, "shared_name": shared_name}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
+		Type: "NcclAllReduce",
 		Input: []tf.Input{
-			input, grad, argmax,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -7457,35 +7444,31 @@ func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax
 	return op.Output(0)
 }
 
-// MutexV2Attr is an optional argument to MutexV2.
-type MutexV2Attr func(optionalAttr)
+// RegexReplaceAttr is an optional argument to RegexReplace.
+type RegexReplaceAttr func(optionalAttr)
 
-// MutexV2Container sets the optional container attribute to value.
+// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
 //
-// value: If non-empty, this variable is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutexV2Container(value string) MutexV2Attr {
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["replace_global"] = value
 	}
 }
 
-// MutexV2SharedName sets the optional shared_name attribute to value.
+// Replaces the match of pattern in input with rewrite.
 //
-// value: If non-empty, this variable is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func MutexV2SharedName(value string) MutexV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a Mutex resource that can be locked by `MutexLock`.
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
-// Returns The mutex resource.
-func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
+// Arguments:
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expresion.
+//
+// Returns The text after applying pattern and rewrite.
+func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7494,287 +7477,317 @@ func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutexV2",
-
+		Type: "RegexReplace",
+		Input: []tf.Input{
+			input, pattern, rewrite,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// Quantized Batch normalization.
 //
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
 //
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	t: A 4D input Tensor.
+//	t_min: The value represented by the lowest quantized input.
+//	t_max: The value represented by the highest quantized input.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	m_min: The value represented by the lowest quantized mean.
+//	m_max: The value represented by the highest quantized mean.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v_min: The value represented by the lowest quantized variance.
+//	v_max: The value represented by the highest quantized variance.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	beta_min: The value represented by the lowest quantized offset.
+//	beta_max: The value represented by the highest quantized offset.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	gamma_min: The value represented by the lowest quantized gamma.
+//	gamma_max: The value represented by the highest quantized gamma.
+//
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "Mod",
+		Type: "QuantizedBatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			x, y,
+			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes offsets of concat inputs within its output.
+// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
 //
-// For example:
+// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
 //
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
+// Arguments:
 //
-// This is typically used by gradient computations for a concat operation.
+//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_bias: The float value that the lowest quantized bias value represents.
+//	max_bias: The float value that the highest quantized bias value represents.
 //
-// Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
+		Type: "QuantizedBiasAdd",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
+			input, bias, min_input, max_input, min_bias, max_bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Produces the average pool of the input tensor for quantized types.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.  The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
-		return
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "QuantizedAvgPool",
+		Input: []tf.Input{
+			input, min_input, max_input,
+		},
+		Attrs: attrs,
 	}
-	return offset
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Compute the lower regularized incomplete Gamma function `P(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
-//
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+// Extract `patches` from `input` and put them in the "depth" output dimension. 3D extension of `extract_image_patches`.
 //
-// where
+// Arguments:
+//	input: 5-D Tensor with shape `[batch, in_planes, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `input`.
+//	strides: 1-D of length 5. How far the centers of two consecutive patches are in
+// `input`. Must be: `[1, stride_planes, stride_rows, stride_cols, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+// We specify the size-related attributes as:
 //
-// is the lower incomplete Gamma function.
+// ```python
+//       ksizes = [1, ksize_planes, ksize_rows, ksize_cols, 1]
+//       strides = [1, stride_planes, strides_rows, strides_cols, 1]
+// ```
 //
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Returns 5-D Tensor with shape `[batch, out_planes, out_rows, out_cols,
+// ksize_planes * ksize_rows * ksize_cols * depth]` containing patches
+// with size `ksize_planes x ksize_rows x ksize_cols x depth` vectorized
+// in the "depth" dimension. Note `out_planes`, `out_rows` and `out_cols`
+// are the dimensions of the output patches.
+func ExtractVolumePatches(scope *Scope, input tf.Output, ksizes []int64, strides []int64, padding string) (patches tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Igamma",
+		Type: "ExtractVolumePatches",
 		Input: []tf.Input{
-			a, x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthToSpaceAttr is an optional argument to DepthToSpace.
-type DepthToSpaceAttr func(optionalAttr)
+// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
+type FractionalAvgPoolAttr func(optionalAttr)
 
-// DepthToSpaceDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
+// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
+//
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// DepthToSpace for tensors of type T.
-//
-// Rearranges data from depth into blocks of spatial data.
-// This is the reverse transformation of SpaceToDepth. More specifically,
-// this op outputs a copy of the input tensor where values from the `depth`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions.
-// The attr `block_size` indicates the input block size and how the data is moved.
-//
-//   * Chunks of data of size `block_size * block_size` from depth are rearranged
-//     into non-overlapping blocks of size `block_size x block_size`
-//   * The width the output tensor is `input_depth * block_size`, whereas the
-//     height is `input_height * block_size`.
-//   * The Y, X coordinates within each block of the output image are determined
-//     by the high order component of the input channel index.
-//   * The depth of the input tensor must be divisible by
-//     `block_size * block_size`.
-//
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, 4 ]`
-//
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
-//                         within the input image, bX, bY means coordinates
-//                         within the output block, oC means output channels).
-//      The output would be the input transposed to the following layout:
-//      n,iY,bY,iX,bX,oC
-//
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
-//
-// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
-// block_size = 2:
-//
-// ```
-// x = [[[[1, 2, 3, 4]]]]
-//
-// ```
-//
-// This operation will output a tensor of shape `[1, 2, 2, 1]`:
-//
-// ```
-//    [[[[1], [2]],
-//      [[3], [4]]]]
-// ```
-//
-// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
-// the corresponding output will have 2x2 elements and will have a depth of
-// 1 channel (1 = `4 / (block_size * block_size)`).
-// The output element shape is `[2, 2, 1]`.
-//
-// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
-//
-// ```
-// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
+// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
 //
-// This operation, for block size of 2, will return the following tensor of shape
-// `[1, 2, 2, 3]`
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// ```
-//    [[[[1, 2, 3], [4, 5, 6]],
-//      [[7, 8, 9], [10, 11, 12]]]]
+// `index  0  1  2  3  4`
 //
-// ```
+// `value  20 5  16 3  7`
 //
-// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
 //
-// ```
-// x =  [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
-// ```
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalAvgPool node in the computation graph. Mainly used
+// in unit test to make FractionalAvgPool deterministic.
+// If not specified, defaults to false
+func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalAvgPoolSeed sets the optional seed attribute to value.
 //
-// the operator will return the following tensor of shape `[1 4 4 1]`:
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
 //
-// ```
-// x = [[[ [1],   [2],  [5],  [6]],
-//       [ [3],   [4],  [7],  [8]],
-//       [ [9],  [10], [13],  [14]],
-//       [ [11], [12], [15],  [16]]]]
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional average pooling on the input.
 //
-// ```
+// Fractional average pooling is similar to Fractional max pooling in the pooling
+// region generation step. The only difference is that after pooling regions are
+// generated, a mean operation is performed instead of a max operation in each
+// pooling region.
 //
 // Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
 //
-//	block_size: The size of the spatial block, same as in Space2Depth.
-func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
+// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthToSpace",
+		Type: "FractionalAvgPool",
 		Input: []tf.Input{
-			input,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
+// RandomCropAttr is an optional argument to RandomCrop.
+type RandomCropAttr func(optionalAttr)
 
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+// RandomCropSeed sets the optional seed attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomCropSeed(value int64) RandomCropAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed"] = value
 	}
 }
 
-// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
+// RandomCropSeed2 sets the optional seed2 attribute to value.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomCropSeed2(value int64) RandomCropAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["seed2"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// Randomly crop `image`.
+//
+// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+//
+// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
+// width.  The values must be non negative.
+//
+// This Op picks a random location in `image` and crops a `height` by `width`
+// rectangle from that location.  The random location is picked so the cropped
+// area will fit inside the original image.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+//	image: 3-D of shape `[height, width, channels]`.
+//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//
+// Returns 3-D of shape `[crop_height, crop_width, channels].`
+func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
+		Type: "RandomCrop",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			image, size,
 		},
 		Attrs: attrs,
 	}
@@ -7782,274 +7795,216 @@ func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output
 	return op.Output(0)
 }
 
-// Computes square root of x element-wise.
-//
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sqrt",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
-type Conv3DBackpropFilterAttr func(optionalAttr)
+// TopKV2Attr is an optional argument to TopKV2.
+type TopKV2Attr func(optionalAttr)
 
-// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+// TopKV2Sorted sets the optional sorted attribute to value.
+//
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKV2Sorted(value bool) TopKV2Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["sorted"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: 0-D.  Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
+		Type: "TopKV2",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input, k,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
+// Returns x // y element-wise.
 //
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
+		Type: "FloorDiv",
 		Input: []tf.Input{
-			y, dy,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
+// Computes the inverse permutation of a tensor.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
 //
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
+// The values must include 0. There can be no duplicate values or negative values.
+//
+// For example:
 //
 // ```
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
 // ```
 //
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-//
 // Arguments:
+//	x: 1-D.
 //
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
-//	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
+		Type: "InvertPermutation",
 		Input: []tf.Input{
-			input, filter,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
-
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
+// Computes log softmax activations.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
+// For each batch `i` and class `j` we have
+//
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "LogSoftmax",
+		Input: []tf.Input{
+			logits,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of (x <= y) element-wise.
+//
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a reader to a previously saved state.
+// Computes softmax activations.
 //
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
+// For each batch `i` and class `j` we have
+//
+//     $$softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))$$
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
+//	logits: 2-D with shape `[batch_size, num_classes]`.
 //
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
+		Type: "Softmax",
 		Input: []tf.Input{
-			reader_handle, state,
+			logits,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
+// DecodeBmpAttr is an optional argument to DecodeBmp.
+type DecodeBmpAttr func(optionalAttr)
 
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
+// DecodeBmpChannels sets the optional channels attribute to value.
+// If not specified, defaults to 0
+func DecodeBmpChannels(value int64) DecodeBmpAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["channels"] = value
 	}
 }
 
-// Computes gradients of the maxpooling function.
+// Decode the first frame of a BMP-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the BMP-encoded image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	contents: 0-D.  The BMP-encoded image.
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`. RGB order
+func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
+		Type: "DecodeBmp",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -8057,71 +8012,56 @@ func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad
 	return op.Output(0)
 }
 
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
 
-// CropAndResizeMethod sets the optional method attribute to value.
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
 //
-// value: A string specifying the sampling method for resizing. It can be either
-// `"bilinear"` or `"nearest"` and default to `"bilinear"`. Currently two sampling
-// methods are supported: Bilinear and Nearest Neighbor.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["adj_x"] = value
 	}
 }
 
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
 //
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
 	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
+		m["adj_y"] = value
 	}
 }
 
-// Extracts crops from the input image tensor and resizes them.
+// Multiplies slices of two tensors in batches.
 //
-// Extracts crops from the input image tensor and resizes them using bilinear
-// sampling or nearest neighbor sampling (possibly with aspect ratio change) to a
-// common output size specified by `crop_size`. This is more general than the
-// `crop_to_bounding_box` op which extracts a fixed size slice from the input image
-// and does not allow resizing or aspect ratio change.
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
 //
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear or nearest neighbor interpolation) to a fixed
-// `size = [crop_height, crop_width]`. The result is a 4-D tensor
-// `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned.
-// In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical
-// results to using `tf.image.resize_bilinear()` or
-// `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with
-// `align_corners=True`.
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
 //
 // Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
 //
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8130,9 +8070,9 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
+		Type: "BatchMatMul",
 		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -8140,99 +8080,72 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 	return op.Output(0)
 }
 
-// Fills empty rows in the input 2-D `SparseTensor` with a default value.
-//
-// The input `SparseTensor` is represented via the tuple of inputs
-// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-// same `dense_shape` but with indices `output_indices` and values
-// `output_values`.
-//
-// This op inserts a single entry for every row that doesn't have any values.
-// The index is created as `[row, 0, ..., 0]` and the inserted value
-// is `default_value`.
-//
-// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [2, 0]: c
-//     [3, 1]: d
-//
-// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [1, 0]: default_value
-//     [2, 0]: c
-//     [3, 1]: d
-//     [4, 0]: default_value
-//
-// The output `SparseTensor` will be in row-major order and will have the
-// same shape as the input.
-//
-// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
-//
-//     empty_row_indicator[i] = True iff row i was an empty row.
-//
-// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-// backpropagation,
-//
-//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+// Returns which elements of x are NaN.
 //
-// Arguments:
-//	indices: 2-D. the indices of the sparse tensor.
-//	values: 1-D. the values of the sparse tensor.
-//	dense_shape: 1-D. the shape of the sparse tensor.
-//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
-//   for rows missing from the input sparse tensor.
-// output indices: 2-D. the indices of the filled sparse tensor.
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsNan",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Identity op for gradient debugging.
 //
-// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
-// input sparse tensor.1-D. a map from the input indices to the output indices.
-func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRows",
+		Type: "DebugGradientIdentity",
 		Input: []tf.Input{
-			indices, values, dense_shape, default_value,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
 
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// The backward operation for "BiasAdd" on the "bias" tensor.
-//
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// var: Should be from a Variable().
 //
 // Arguments:
-//	out_backprop: Any number of dimensions.
 //
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8241,108 +8154,70 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			out_backprop,
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
-//
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
+// Gets next element for the provided shard number.
 //
 // Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
+//	multi_device_iterator: A MultiDeviceIterator resource.
+//	shard_num: Integer representing which shard to fetch data for.
+//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
 //
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+// Returns Result of the get_next on the dataset.
+func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Bucketize",
+		Type: "MultiDeviceIteratorGetNextFromShard",
 		Input: []tf.Input{
-			input,
+			multi_device_iterator, shard_num, incarnation_id,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
-type FusedBatchNormV2Attr func(optionalAttr)
-
-// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
+		return
 	}
+	return components
 }
 
-// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
+// LeakyReluGradAttr is an optional argument to LeakyReluGrad.
+type LeakyReluGradAttr func(optionalAttr)
+
+// LeakyReluGradAlpha sets the optional alpha attribute to value.
+// If not specified, defaults to 0.2
+func LeakyReluGradAlpha(value float32) LeakyReluGradAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["alpha"] = value
 	}
 }
 
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Computes rectified linear gradients for a LeakyRelu operation.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	gradients: The backpropagated gradients to the corresponding LeakyRelu operation.
+//	features: The features passed as input to the corresponding LeakyRelu operation,
+// OR the outputs of that operation (both work equivalently).
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns `gradients * (features > 0) + alpha * gradients * (featurs <= 0)`.
+func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, optional ...LeakyReluGradAttr) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8351,276 +8226,311 @@ func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormV2",
+		Type: "LeakyReluGrad",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			gradients, features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
-// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
-//
-// Given a `tensor`, and a `int32` tensor `axis` representing the set of
-// dimensions of `tensor` to reverse. This operation reverses each dimension
-// `i` for which there exists `j` s.t. `axis[j] == i`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions specified
-// in `axis` may be 0 or more entries. If an index is specified more than
-// once, a InvalidArgument error is raised.
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [3] or 'dims' is [-1]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
-//
-// # 'dims' is '[1]' (or 'dims' is '[-3]')
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
-//
-// # 'dims' is '[2]' (or 'dims' is '[-2]')
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
-//
-// Arguments:
-//	tensor: Up to 8-D.
-//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
-// `[-rank(tensor), rank(tensor))`.
+// Deprecated. Use TensorArrayGradV3
 //
-// Returns The same shape as `tensor`.
-func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseV2",
+		Type: "TensorArrayWriteV2",
 		Input: []tf.Input{
-			tensor, axis,
+			handle, index, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
-//
-// This is a deprecated version of BiasAdd and will be soon removed.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
-//
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// LeakyReluAttr is an optional argument to LeakyRelu.
+type LeakyReluAttr func(optionalAttr)
+
+// LeakyReluAlpha sets the optional alpha attribute to value.
+// If not specified, defaults to 0.2
+func LeakyReluAlpha(value float32) LeakyReluAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// Computes rectified linear: `max(features, features * alpha)`.
+func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
+		Type: "LeakyRelu",
 		Input: []tf.Input{
-			value, bias,
+			features,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Shuffle dimensions of x according to a permutation.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Transpose",
+		Type: "Relu6",
 		Input: []tf.Input{
-			x, perm,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
+// SdcaOptimizerV2Attr is an optional argument to SdcaOptimizerV2.
+type SdcaOptimizerV2Attr func(optionalAttr)
 
-// MinKeepDims sets the optional keep_dims attribute to value.
+// SdcaOptimizerV2Adaptive sets the optional adaptive attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
+func SdcaOptimizerV2Adaptive(value bool) SdcaOptimizerV2Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["adaptive"] = value
 	}
 }
 
-// Computes the minimum of elements across dimensions of a tensor.
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
+//
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
 //
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerV2Attr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Min",
+		Type: "SdcaOptimizerV2",
 		Input: []tf.Input{
-			input, axis,
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizerV2", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizerV2", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// Computes the Bessel i1e function of `x` element-wise.
+// Computes the minimum along segments of a tensor.
 //
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+// for an explanation of segments.
 //
-// This function is faster and numerically stabler than `bessel_i1(x)`.
-func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the minimum such that:
+//
+// \\(output_i = \min_{j...} data_[j...]\\) where min is over tuples `j...` such
+// that `segment_ids[j...] == i`.
+//
+// If the minimum is empty for a given segment ID `i`, it outputs the largest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::max()`.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BesselI1e",
+		Type: "UnsortedSegmentMin",
 		Input: []tf.Input{
-			x,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a Tensor into a serialized TensorProto proto.
+// Computes rectified linear gradients for a Relu operation.
 //
 // Arguments:
-//	tensor: A Tensor of type `T`.
+//	gradients: The backpropagated gradients to the corresponding Relu operation.
+//	features: The features passed as input to the corresponding Relu operation, OR
+// the outputs of that operation (both work equivalently).
 //
-// Returns A serialized TensorProto proto of the input tensor.
-func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
+// Returns `gradients * (features > 0)`.
+func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeTensor",
+		Type: "ReluGrad",
 		Input: []tf.Input{
-			tensor,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the gradient of morphological 2-D dilation with respect to the input.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Acos",
+		Type: "Dilation2DBackpropInput",
 		Input: []tf.Input{
-			x,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnbatchGradAttr is an optional argument to UnbatchGrad.
-type UnbatchGradAttr func(optionalAttr)
-
-// UnbatchGradContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnbatchGradContainer(value string) UnbatchGradAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
+//
+// The polygamma function is defined as:
+//
+//
+// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+//
+// where \\(\psi(x)\\) is the digamma function.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// UnbatchGradSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnbatchGradSharedName(value string) UnbatchGradAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "Polygamma",
+		Input: []tf.Input{
+			a, x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Gradient of Unbatch.
+// Computes second-order gradients of the maxpooling function.
 //
-// Acts like Batch but using the given batch_index index of batching things as they
-// become available. This ensures that the gradients are propagated back in the
-// same session which did the forward pass.
+// Arguments:
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// original_input: The input to the Unbatch operation this is the gradient of.
-// batch_index: The batch_index given to the Unbatch operation this is the gradient
-// of.
-// grad: The downstream gradient.
-// id: The id scalar emitted by Batch.
-// batched_grad: The return value, either an empty tensor or the batched gradient.
-// container: Container to control resource sharing.
-// shared_name: Instances of UnbatchGrad with the same container and shared_name
-//  are assumed to possibly belong to the same batch. If left empty, the op name
-//  will be used as the shared name.
-func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "UnbatchGrad",
+		Type: "MaxPoolGradGradWithArgmax",
 		Input: []tf.Input{
-			original_input, batch_index, grad, id,
+			input, grad, argmax,
 		},
 		Attrs: attrs,
 	}
@@ -8628,47 +8538,47 @@ func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output,
 	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
+// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
+type MaxPoolGradGradV2Attr func(optionalAttr)
 
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes gradients of average pooling function.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
+		Type: "MaxPoolGradGradV2",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -8676,115 +8586,64 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi
 	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
-type ParseSingleSequenceExampleAttr func(optionalAttr)
-
-// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+// Computes gradients of the maxpooling function.
 //
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// Arguments:
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradWithArgmax",
+		Input: []tf.Input{
+			input, grad, argmax,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
-//
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
-	}
-}
+// MutexV2Attr is an optional argument to MutexV2.
+type MutexV2Attr func(optionalAttr)
 
-// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
-//
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// MutexV2Container sets the optional container attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+// value: If non-empty, this variable is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutexV2Container(value string) MutexV2Attr {
 	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
+		m["container"] = value
 	}
 }
 
-// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
-//
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// MutexV2SharedName sets the optional shared_name attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+// value: If non-empty, this variable is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func MutexV2SharedName(value string) MutexV2Attr {
 	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+// Creates a Mutex resource that can be locked by `MutexLock`.
 //
-// Arguments:
-//	serialized: A scalar containing a binary serialized SequenceExample proto.
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExample.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExample.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	debug_name: A scalar containing the name of the serialized proto.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty scalar if no name is available.
-func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+// Returns The mutex resource.
+func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8793,109 +8652,52 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleSequenceExample",
-		Input: []tf.Input{
-			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
-		},
+		Type: "MutexV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
-}
-
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
-
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
+	return op.Output(0)
 }
 
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
-	}
-}
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
 
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
 	return func(m optionalAttr) {
-		m["input_max"] = value
+		m["data_format"] = value
 	}
 }
 
-// Use QuantizeAndDequantizeV2 instead.
+// Performs 3D average pooling on the input.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
+		Type: "AvgPool3D",
 		Input: []tf.Input{
 			input,
 		},
@@ -8905,499 +8707,554 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 	return op.Output(0)
 }
 
-// Returns locations of nonzero / true values in a tensor.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Mod",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes offsets of concat inputs within its output.
 //
 // For example:
 //
 // ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+// ```
 //
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// This is typically used by gradient computations for a concat operation.
 //
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// Arguments:
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "ConcatOffset",
 		Input: []tf.Input{
-			condition,
+			concat_dim, tf.OutputList(shape),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
+	}
+	return offset
+}
+
+// Compute the lower regularized incomplete Gamma function `P(a, x)`.
+//
+// The lower regularized incomplete Gamma function is defined as:
+//
+//
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+//
+// where
+//
+// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
+//
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Igamma",
+		Input: []tf.Input{
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
+// DepthToSpaceAttr is an optional argument to DepthToSpace.
+type DepthToSpaceAttr func(optionalAttr)
 
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+// DepthToSpaceDataFormat sets the optional data_format attribute to value.
+// If not specified, defaults to "NHWC"
+func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["data_format"] = value
 	}
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
+// DepthToSpace for tensors of type T.
 //
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
+// Rearranges data from depth into blocks of spatial data.
+// This is the reverse transformation of SpaceToDepth. More specifically,
+// this op outputs a copy of the input tensor where values from the `depth`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions.
+// The attr `block_size` indicates the input block size and how the data is moved.
 //
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+//   * Chunks of data of size `block_size * block_size` from depth are rearranged
+//     into non-overlapping blocks of size `block_size x block_size`
+//   * The width the output tensor is `input_depth * block_size`, whereas the
+//     height is `input_height * block_size`.
+//   * The Y, X coordinates within each block of the output image are determined
+//     by the high order component of the input channel index.
+//   * The depth of the input tensor must be divisible by
+//     `block_size * block_size`.
+//
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+//
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
+//                         within the input image, bX, bY means coordinates
+//                         within the output block, oC means output channels).
+//      The output would be the input transposed to the following layout:
+//      n,iY,bY,iX,bX,oC
+//
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
+//
+// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
+// block_size = 2:
+//
+// ```
+// x = [[[[1, 2, 3, 4]]]]
+//
+// ```
+//
+// This operation will output a tensor of shape `[1, 2, 2, 1]`:
+//
+// ```
+//    [[[[1], [2]],
+//      [[3], [4]]]]
+// ```
+//
+// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
+// the corresponding output will have 2x2 elements and will have a depth of
+// 1 channel (1 = `4 / (block_size * block_size)`).
+// The output element shape is `[2, 2, 1]`.
+//
+// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
+//
+// ```
+// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+// ```
+//
+// This operation, for block size of 2, will return the following tensor of shape
+// `[1, 2, 2, 3]`
+//
+// ```
+//    [[[[1, 2, 3], [4, 5, 6]],
+//      [[7, 8, 9], [10, 11, 12]]]]
+//
+// ```
+//
+// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+//
+// ```
+// x =  [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
+// ```
+//
+// the operator will return the following tensor of shape `[1 4 4 1]`:
+//
+// ```
+// x = [[[ [1],   [2],  [5],  [6]],
+//       [ [3],   [4],  [7],  [8]],
+//       [ [9],  [10], [13],  [14]],
+//       [ [11], [12], [15],  [16]]]]
+//
+// ```
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+//	block_size: The size of the spatial block, same as in Space2Depth.
+func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"block_size": block_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "DepthToSpace",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
-	}
-	return components
-}
-
-// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
-type ParseSequenceExampleAttr func(optionalAttr)
-
-// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Ncontext_sparse"] = value
-	}
-}
-
-// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Ncontext_dense"] = value
-	}
-}
-
-// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Nfeature_list_sparse"] = value
-	}
-}
-
-// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Nfeature_list_dense"] = value
-	}
-}
-
-// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
-//
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
-	}
-}
-
-// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
-	}
+	return op.Output(0)
 }
 
-// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
-//
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
-	}
-}
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
 
-// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
-//
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
 	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
+		m["data_format"] = value
 	}
 }
 
-// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
-//
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
 	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
+		m["dilations"] = value
 	}
 }
 
-// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
 // Arguments:
-//	serialized: A vector containing binary serialized SequenceExample protos.
-//	debug_name: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no name is available.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExamples.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExamples.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseSequenceExample",
+		Type: "Conv3DBackpropInputV2",
 		Input: []tf.Input{
-			serialized, debug_name, tf.OutputList(context_dense_defaults),
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes square root of x element-wise.
+//
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Sqrt",
+		Input: []tf.Input{
+			x,
+		},
 	}
-	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
+
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
 	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
 }
 
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the gradients of 3-D convolution with respect to the filter.
+//
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Erf",
+		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
-			x,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the gradient for the rsqrt of `x` wrt its input.
+//
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Floor",
+		Type: "RsqrtGrad",
 		Input: []tf.Input{
-			x,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
+// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
 
-// OneHotAxis sets the optional axis attribute to value.
+// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
 //
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["data_format"] = value
 	}
 }
 
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
+// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
 //
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
 //
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. Thus, the output has `in_channels * channel_multiplier` channels.
 //
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
 // ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
 // ```
 //
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
-// Examples
-// =========
-//
-// Suppose that
-//
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
+// Arguments:
 //
-// Then output is `[4 x 3]`:
 //
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
+//	padding: The type of padding algorithm to use.
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNative",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
+
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
 //
-// Suppose that
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the maxpooling function.
 //
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Then output is `[3 x 4]`:
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradV2",
+		Input: []tf.Input{
+			orig_input, orig_output, grad, ksize, strides,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Restore a reader to a previously saved state.
 //
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
-// Suppose that
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
 //
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
 //
-// Then output is `[2 x 2 x 3]`:
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderRestoreStateV2",
+		Input: []tf.Input{
+			reader_handle, state,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
+
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
 //
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OneHot",
+		Type: "MaxPoolGrad",
 		Input: []tf.Input{
-			indices, depth, on_value, off_value,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -9405,52 +9262,71 @@ func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output
 	return op.Output(0)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Exp",
-		Input: []tf.Input{
-			x,
-		},
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
+
+// CropAndResizeMethod sets the optional method attribute to value.
+//
+// value: A string specifying the sampling method for resizing. It can be either
+// `"bilinear"` or `"nearest"` and default to `"bilinear"`. Currently two sampling
+// methods are supported: Bilinear and Nearest Neighbor.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
+	return func(m optionalAttr) {
+		m["method"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
 //
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
+// value: Value used for extrapolation, when applicable.
+// If not specified, defaults to 0
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["extrapolation_value"] = value
 	}
 }
 
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
+// Extracts crops from the input image tensor and resizes them.
 //
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+// Extracts crops from the input image tensor and resizes them using bilinear
+// sampling or nearest neighbor sampling (possibly with aspect ratio change) to a
+// common output size specified by `crop_size`. This is more general than the
+// `crop_to_bounding_box` op which extracts a fixed size slice from the input image
+// and does not allow resizing or aspect ratio change.
 //
-//     values.shape = input.shape[:-1]
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear or nearest neighbor interpolation) to a fixed
+// `size = [crop_height, crop_width]`. The result is a 4-D tensor
+// `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned.
+// In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical
+// results to using `tf.image.resize_bilinear()` or
+// `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with
+// `align_corners=True`.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
 //
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9459,9 +9335,9 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NthElement",
+		Type: "CropAndResize",
 		Input: []tf.Input{
-			input, n,
+			image, boxes, box_ind, crop_size,
 		},
 		Attrs: attrs,
 	}
@@ -9469,170 +9345,136 @@ func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthEleme
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
+// Fills empty rows in the input 2-D `SparseTensor` with a default value.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// The input `SparseTensor` is represented via the tuple of inputs
+// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
+// same `dense_shape` but with indices `output_indices` and values
+// `output_values`.
 //
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum such that:
+// This op inserts a single entry for every row that doesn't have any values.
+// The index is created as `[row, 0, ..., 0]` and the inserted value
+// is `default_value`.
 //
-// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
-// that `segment_ids[j...] == i`.
+// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
 //
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::lowest()`.
+//     [0, 1]: a
+//     [0, 3]: b
+//     [2, 0]: c
+//     [3, 1]: d
 //
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
+// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
+//     [0, 1]: a
+//     [0, 3]: b
+//     [1, 0]: default_value
+//     [2, 0]: c
+//     [3, 1]: d
+//     [4, 0]: default_value
 //
-// Arguments:
+// The output `SparseTensor` will be in row-major order and will have the
+// same shape as the input.
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.END
-//   }
-//   out_arg {
-//     name: "output"
-//     description: <<END
-// Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
+// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
 //
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+//     empty_row_indicator[i] = True iff row i was an empty row.
+//
+// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
+// backpropagation,
+//
+//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+//
+// Arguments:
+//	indices: 2-D. the indices of the sparse tensor.
+//	values: 1-D. the values of the sparse tensor.
+//	dense_shape: 1-D. the shape of the sparse tensor.
+//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
+//   for rows missing from the input sparse tensor.
+// output indices: 2-D. the indices of the filled sparse tensor.
+//
+// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
+// input sparse tensor.1-D. a map from the input indices to the output indices.
+func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
+		Type: "SparseFillEmptyRows",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			indices, values, dense_shape, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+// Reduces `input` from `num_devices` using `reduction` to a single device.
 //
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+// Reduces `input` from `num_devices` using `reduction` to a single device.
+//
+// The graph should be constructed so that all inputs have a valid device
+// assignment, and the op itself is assigned one of these devices.
+//
+// input: The input to the reduction.
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	attrs := map[string]interface{}{"reduction": reduction}
 	opspec := tf.OpSpec{
-		Type: "ParseExample",
+		Type: "NcclReduce",
 		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+			tf.OutputList(input),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
 
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
 //
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
-		m["little_endian"] = value
+		m["data_format"] = value
 	}
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// Arguments:
-//	bytes: All the elements must have the same length.
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
+// Arguments:
+//	out_backprop: Any number of dimensions.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			bytes,
+			out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -9640,79 +9482,75 @@ func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// Returns the element-wise sum of a list of tensors.
-//
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
-//
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
-//
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
-//
-// Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+// Returns 0 if x == 0, and x / y otherwise, elementwise.
+func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
+		Type: "Xdivy",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
+// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
+type FusedBatchNormV2Attr func(optionalAttr)
 
-// RandomShuffleSeed sets the optional seed attribute to value.
+// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["epsilon"] = value
 	}
 }
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Randomly shuffles a tensor along its first dimension.
+// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
 //
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
 //
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	value: The tensor to be shuffled.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9721,132 +9559,156 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "FusedBatchNormV2",
 		Input: []tf.Input{
-			value,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
-type OrderedMapIncompleteSizeAttr func(optionalAttr)
-
-// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Reverses specific dimensions of a tensor.
 //
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
+// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
 //
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Given a `tensor`, and a `int32` tensor `axis` representing the set of
+// dimensions of `tensor` to reverse. This operation reverses each dimension
+// `i` for which there exists `j` s.t. `axis[j] == i`.
+//
+// `tensor` can have up to 8 dimensions. The number of dimensions specified
+// in `axis` may be 0 or more entries. If an index is specified more than
+// once, a InvalidArgument error is raised.
+//
+// For example:
+//
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [3] or 'dims' is [-1]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is '[1]' (or 'dims' is '[-3]')
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is '[2]' (or 'dims' is '[-2]')
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
+//
+// Arguments:
+//	tensor: Up to 8-D.
+//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
+// `[-rank(tensor), rank(tensor))`.
+//
+// Returns The same shape as `tensor`.
+func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "ReverseV2",
+		Input: []tf.Input{
+			tensor, axis,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op returns the number of incomplete elements in the underlying container.
-func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
+// Adds `bias` to `value`.
+//
+// This is a deprecated version of BiasAdd and will be soon removed.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
+//
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapIncompleteSize",
-
-		Attrs: attrs,
+		Type: "BiasAddV1",
+		Input: []tf.Input{
+			value, bias,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x > y) element-wise.
+// Shuffle dimensions of x according to a permutation.
 //
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Greater",
+		Type: "Transpose",
 		Input: []tf.Input{
-			x, y,
+			x, perm,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// MinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// Computes the minimum of elements across dimensions of a tensor.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9855,239 +9717,152 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "Min",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+// Computes the Bessel i1e function of `x` element-wise.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// This function is faster and numerically stabler than `bessel_i1(x)`.
+func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
-//
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["min_object_covered"] = value
+	opspec := tf.OpSpec{
+		Type: "BesselI1e",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
+
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+		m["capacity"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["area_range"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
-//
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["container"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
-//
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
-//
-// Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
+// Op removes all elements in the underlying container.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
-		Input: []tf.Input{
-			image_size, bounding_boxes,
-		},
+		Type: "MapClear",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Computes sigmoid of `x` element-wise.
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
 //
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sigmoid",
-		Input: []tf.Input{
-			x,
-		},
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+//
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["use_quote_delim"] = value
 	}
 }
 
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// DecodeCSVNaValue sets the optional na_value attribute to value.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["na_value"] = value
 	}
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["select_cols"] = value
 	}
 }
 
-// Batch normalization.
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or an empty vector if the column is
+// required.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10096,111 +9871,125 @@ func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			records, tf.OutputList(record_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
 	}
+	return output
 }
 
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+// Convert JSON-encoded Example records to binary protocol buffer strings.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
+//
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJSONExample",
+		Input: []tf.Input{
+			json_examples,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
+// Transforms a Tensor into a serialized TensorProto proto.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	tensor: A Tensor of type `T`.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// Returns A serialized TensorProto proto of the input tensor.
+func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "SerializeTensor",
+		Input: []tf.Input{
+			tensor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "Acos",
 		Input: []tf.Input{
-			shape,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
+// UnbatchGradAttr is an optional argument to UnbatchGrad.
+type UnbatchGradAttr func(optionalAttr)
 
-// RandomUniformIntSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+// UnbatchGradContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradContainer(value string) UnbatchGradAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["container"] = value
 	}
 }
 
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+// UnbatchGradSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnbatchGradSharedName(value string) UnbatchGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// Gradient of Unbatch.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+// Acts like Batch but using the given batch_index index of batching things as they
+// become available. This ensures that the gradients are propagated back in the
+// same session which did the forward pass.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// original_input: The input to the Unbatch operation this is the gradient of.
+// batch_index: The batch_index given to the Unbatch operation this is the gradient
+// of.
+// grad: The downstream gradient.
+// id: The id scalar emitted by Batch.
+// batched_grad: The return value, either an empty tensor or the batched gradient.
+// container: Container to control resource sharing.
+// shared_name: Instances of UnbatchGrad with the same container and shared_name
+//  are assumed to possibly belong to the same batch. If left empty, the op name
+//  will be used as the shared name.
+func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10209,9 +9998,9 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "UnbatchGrad",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			original_input, batch_index, grad, id,
 		},
 		Attrs: attrs,
 	}
@@ -10219,57 +10008,47 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 	return op.Output(0)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["data_format"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
-//
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// Computes gradients of average pooling function.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+//
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
+		Type: "AvgPool3DGrad",
 		Input: []tf.Input{
-			input, size, paddings, filter,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -10277,95 +10056,115 @@ func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, padd
 	return op.Output(0)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
+// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
+type ParseSingleSequenceExampleAttr func(optionalAttr)
 
-// RandomUniformSeed sets the optional seed attribute to value.
+// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["context_sparse_types"] = value
 	}
 }
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
+// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["feature_list_dense_types"] = value
 	}
 }
 
-// Outputs random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomUniform",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
-
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["feature_list_sparse_types"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
 //
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+// Arguments:
+//	serialized: A scalar containing a binary serialized SequenceExample proto.
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExample.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExample.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	debug_name: A scalar containing the name of the serialized proto.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty scalar if no name is available.
+func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10374,134 +10173,100 @@ func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "ParseSingleSequenceExample",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Encode audio data using the WAV file format.
-//
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
-//
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
-//
-// Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
-//
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+	op := scope.AddOperation(opspec)
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "EncodeWav",
-		Input: []tf.Input{
-			audio, sample_rate,
-		},
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Atan",
-		Input: []tf.Input{
-			x,
-		},
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
 }
 
-// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
-type ResourceApplyAdaMaxAttr func(optionalAttr)
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
 
-// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["num_bits"] = value
 	}
 }
 
-// Update '*var' according to the AdaMax algorithm.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// v_t <- max(beta2 * v_{t-1}, abs(g))
-// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdaMax",
-		Input: []tf.Input{
-			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
-		},
-		Attrs: attrs,
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_min"] = value
+	}
+}
 
-// AssertSummarize sets the optional summarize attribute to value.
-//
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["input_max"] = value
 	}
 }
 
-// Asserts that the given condition is true.
-//
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
-//
-// Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+// Use QuantizeAndDequantizeV2 instead.
 //
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10510,168 +10275,132 @@ func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...Ass
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "QuantizeAndDequantize",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
-//
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
+// Returns locations of nonzero / true values in a tensor.
 //
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
 //
-// Graphically the output tensors are:
+// For example:
 //
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
 //
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "Where",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			condition,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
+	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
 
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
-//
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
-		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
-		},
-		Attrs: attrs,
+		m["timeout_ms"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Calculates gains for each feature and returns the best possible split information for the feature.
-//
-// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
-//
-// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+// Dequeues a tuple of one or more tensors from the given queue.
 //
-// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
 //
-// The length of output lists are all of the same length, `num_features`.
-// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
-//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
-//	l1: l1 regularization factor on leaf weights, per instance based.
-//	l2: l2 regularization factor on leaf weights, per instance based.
-//	tree_complexity: adjustment to the gain, per leaf based.
-//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
-//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
-func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"max_splits": max_splits}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesCalculateBestGainsPerFeature",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
+			handle,
 		},
 		Attrs: attrs,
 	}
@@ -10681,233 +10410,438 @@ func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Out
 	}
 	var idx int
 	var err error
-	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
 		return
 	}
-	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
+	return components
+}
+
+// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
+type ParseSequenceExampleAttr func(optionalAttr)
+
+// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Ncontext_sparse"] = value
 	}
-	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
+}
+
+// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Ncontext_dense"] = value
 	}
-	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
+}
+
+// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Nfeature_list_sparse"] = value
 	}
-	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
+}
+
+// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["Nfeature_list_dense"] = value
 	}
-	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
+// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+//
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_sparse_types"] = value
+	}
+}
 
-// EncodePngCompression sets the optional compression attribute to value.
+// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
 //
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["compression"] = value
+		m["feature_list_dense_types"] = value
 	}
 }
 
-// PNG-encode an image.
+// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
 //
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
 //
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
+	}
+}
+
+// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
 //
-// Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing binary serialized SequenceExample protos.
+//	debug_name: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no name is available.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExamples.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExamples.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
+		Type: "ParseSequenceExample",
 		Input: []tf.Input{
-			image,
+			serialized, debug_name, tf.OutputList(context_dense_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
-
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
-//
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
 	}
-}
-
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
+	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
 	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
 }
 
-// Returns the permuted vector/tensor in the destination data format given the
-//
-// one in the source data format.
-//
-// Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
-//
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
+		Type: "Erf",
 		Input: []tf.Input{
 			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Initializes the multi device iterator with the given dataset.
-//
-// Arguments:
-//	dataset: Dataset to be iterated upon.
-//	multi_device_iterator: A MultiDeviceIteratorResource.
-//	max_buffer_size: The maximum size of the host side per device buffer to keep.
-//
-// Returns An int64 indicating which incarnation of the MultiDeviceIterator
-// is running.
-func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorInit",
+		Type: "Floor",
 		Input: []tf.Input{
-			dataset, multi_device_iterator, max_buffer_size,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of `igamma(a, x)` wrt `a`.
-func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
+
+// OneHotAxis sets the optional axis attribute to value.
+//
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
+	return func(m optionalAttr) {
+		m["axis"] = value
+	}
+}
+
+// Returns a one-hot tensor.
+//
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
+//
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
+//
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
+//
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
+//
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
+//
+//
+// Examples
+// =========
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+//
+//     ```output =
+//       [5.0 0.0 0.0]  // one_hot(0)
+//       [0.0 0.0 5.0]  // one_hot(2)
+//       [0.0 0.0 0.0]  // one_hot(-1)
+//       [0.0 5.0 0.0]  // one_hot(1)
+//     ```
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+//
+//     ```output =
+//       [0.0 3.0 3.0 3.0]
+//       [3.0 3.0 3.0 0.0]
+//       [3.0 3.0 3.0 3.0]
+//       [3.0 0.0 3.0 3.0]
+//     //  ^                one_hot(0)
+//     //      ^            one_hot(2)
+//     //          ^        one_hot(-1)
+//     //              ^    one_hot(1)
+//     ```
+// Suppose that
+//
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+//
+//     ```output =
+//       [
+//         [1.0, 0.0, 0.0]  // one_hot(0)
+//         [0.0, 0.0, 1.0]  // one_hot(2)
+//       ][
+//         [0.0, 1.0, 0.0]  // one_hot(1)
+//         [0.0, 0.0, 0.0]  // one_hot(-1)
+//       ]```
+//
+// Arguments:
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IgammaGradA",
+		Type: "OneHot",
 		Input: []tf.Input{
-			a, x,
+			indices, depth, on_value, off_value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process.
-//
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
-//
-// Arguments:
-//
-//	num_buckets: The number of buckets.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "Exp",
 		Input: []tf.Input{
-			string_tensor,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
-type StaticRegexReplaceAttr func(optionalAttr)
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
 
-// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+// NthElementReverse sets the optional reverse attribute to value.
 //
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
-// If not specified, defaults to true
-func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
+// If not specified, defaults to false
+func NthElementReverse(value bool) NthElementAttr {
 	return func(m optionalAttr) {
-		m["replace_global"] = value
+		m["reverse"] = value
 	}
 }
 
-// Replaces the match of pattern in input with rewrite.
+// Finds values of the `n`-th order statistic for the last dimension.
 //
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
+//
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+//
+//     values.shape = input.shape[:-1]
 //
 // Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expresion.
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
 //
-// Returns The text after applying pattern and rewrite.
-func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StaticRegexReplace",
+		Type: "NthElement",
 		Input: []tf.Input{
-			input,
+			input, n,
 		},
 		Attrs: attrs,
 	}
@@ -10915,170 +10849,298 @@ func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite s
 	return op.Output(0)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
+// Computes the maximum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum such that:
+//
+// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
+// that `segment_ids[j...] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.END
+//   }
+//   out_arg {
+//     name: "output"
+//     description: <<END
+// Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+//
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "UnsortedSegmentMax",
 		Input: []tf.Input{
-			gradients, outputs,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
 // Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
-//
-//
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			input_dataset, count,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Reads the value of a variable.
-//
-// The tensor returned by this operation is immutable.
+// Compute the pairwise cross product.
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
+//
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
+		Type: "Cross",
 		Input: []tf.Input{
-			resource,
+			a, b,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// This op consumes a lock created by `MutexLock`.
-//
-// This op exists to consume a tensor created by `MutexLock` (other than
-// direct control dependencies).  It should be the only that consumes the tensor,
-// and will raise an error if it is not.  Its only purpose is to keep the
-// mutex lock tensor alive until it is consumed by this op.
-//
-// **NOTE**: This operation must run on the same device as its input.  This may
-// be enforced via the `colocate_with` mechanism.
+// CudnnRNNAttr is an optional argument to CudnnRNN.
+type CudnnRNNAttr func(optionalAttr)
+
+// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNRnnMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNInputMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNDirection(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNDropout(value float32) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed2(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
 //
-// Arguments:
-//	mutex_lock: A tensor returned by `MutexLock`.
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer.
 //
-// Returns the created operation.
-func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is false.
+func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConsumeMutexLock",
+		Type: "CudnnRNN",
 		Input: []tf.Input{
-			mutex_lock,
+			input, input_h, input_c, params,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
-type ResourceScatterNdAddAttr func(optionalAttr)
+// DecodeCompressedAttr is an optional argument to DecodeCompressed.
+type DecodeCompressedAttr func(optionalAttr)
 
-// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
+// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+// value: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+// If not specified, defaults to ""
+func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["compression_type"] = value
 	}
 }
 
-// Adds sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
-//
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
-//
-// ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_add(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
-//
-// The resulting update to ref would look like this:
+// Decompress strings.
 //
-//     [1, 12, 3, 14, 14, 6, 7, 20]
+// This op decompresses each element of the `bytes` input `Tensor`, which
+// is assumed to be compressed using the given `compression_type`.
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
+// The `output` is a string `Tensor` of the same shape as `bytes`,
+// each element containing the decompressed data from the corresponding
+// element in `bytes`.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
-// values to add to ref.
+//	bytes: A Tensor of string which is compressed.
 //
-// Returns the created operation.
-func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+// Returns A Tensor with the same shape as input `bytes`, uncompressed
+// from bytes.
+func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11087,62 +11149,68 @@ func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, update
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdAdd",
+		Type: "DecodeCompressed",
 		Input: []tf.Input{
-			ref, indices, updates,
+			bytes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Updates the tree ensemble by either adding a layer to the last tree being grown
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
+
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// or by starting a new tree.
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+	return func(m optionalAttr) {
+		m["little_endian"] = value
+	}
+}
+
+// Reinterpret the bytes of a string as a vector of numbers.
 //
 // Arguments:
-//	tree_ensemble_handle: Handle to the ensemble variable.
-//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
-// the feature that will be used in the split.
-//	node_ids: List of rank 1 tensors representing the nodes for which this feature
-// has a split.
-//	gains: List of rank 1 tensors representing the gains for each of the feature's
-// split.
-//	thresholds: List of rank 1 tensors representing the thesholds for each of the
-// feature's split.
-//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
-// the feature's splits. Will be added to the previous node values to constitute
-// the values of the left nodes.
-//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
-// of the feature's splits. Will be added to the previous node values to constitute
-// the values of the right nodes.
-//	max_depth: Max depth of the tree to build.
-//	learning_rate: shrinkage const for each new tree.
-//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+//	bytes: All the elements must have the same length.
 //
-// Returns the created operation.
-func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesUpdateEnsemble",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
+			bytes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes natural logarithm of (1 + x) element-wise.
+//
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "Log1p",
 		Input: []tf.Input{
 			x,
 		},
@@ -11151,124 +11219,55 @@ func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
-
-// EncodeJpegFormat sets the optional format attribute to value.
-//
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["format"] = value
-	}
-}
-
-// EncodeJpegQuality sets the optional quality attribute to value.
-//
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
-	}
-}
-
-// EncodeJpegProgressive sets the optional progressive attribute to value.
-//
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
-	}
-}
-
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
-//
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
-	}
-}
-
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// Computes rectified linear 6 gradients for a Relu6 operation.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
-	}
-}
-
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["density_unit"] = value
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EncodeJpegXDensity sets the optional x_density attribute to value.
-//
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
+	opspec := tf.OpSpec{
+		Type: "Relu6Grad",
+		Input: []tf.Input{
+			gradients, features,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegYDensity sets the optional y_density attribute to value.
-//
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
-	}
-}
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["align_corners"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// Resize `images` to `size` using bicubic interpolation.
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11277,9 +11276,9 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "ResizeBicubic",
 		Input: []tf.Input{
-			image,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -11287,234 +11286,262 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 	return op.Output(0)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
+// Gather ragged slices from `params` axis `0` according to `indices`.
 //
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// MultinomialSeed2 sets the optional seed2 attribute to value.
+// Outputs a `RaggedTensor` output composed from `output_dense_values` and
+// `output_nested_splits`, such that:
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// ```python
+// output.shape = indices.shape + params.shape[1:]
+// output.ragged_rank = indices.shape.ndims + params.ragged_rank
+// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+// ```
+//
+// where
+//
+// * `params =
+//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
+//    provides the values that should be gathered.
+// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
+//    values should be gathered.
+// * `output =
+//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
+//    is the output tensor.
+//
+// (Note: This c++ op is used to implement the higher-level python
+// `tf.ragged.gather` op, which also supports ragged indices.)
+//
+//
+// Arguments:
+//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
+// `params` RaggedTensor input.
+//	params_dense_values: The `inner_values` for the `params` RaggedTensor. There was a terminology change
+// at the python level from dense_values to inner_values, so dense_values is the
+// deprecated name.
+//	indices: Indices in the outermost dimension of `params` of the values that should be
+// gathered.
+//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
+// this number of `row_splits` tensors. This value should equal
+// `indices.shape.ndims + params.ragged_rank - 1`.
+//
+// Returns The `nested_row_splits` tensors that define the row-partitioning for the
+// returned RaggedTensor.The `inner_values` for the returned RaggedTensor.
+func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
+	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
+	opspec := tf.OpSpec{
+		Type: "RaggedGather",
+		Input: []tf.Input{
+			tf.OutputList(params_nested_splits), params_dense_values, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
+		scope.UpdateErr("RaggedGather", err)
+		return
 	}
+	output_dense_values = op.Output(idx)
+	return output_nested_splits, output_dense_values
 }
 
-// Draws samples from a multinomial distribution.
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+//
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "NonMaxSuppressionV2",
 		Input: []tf.Input{
-			logits, num_samples,
+			boxes, scores, max_output_size, iou_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// output=SparseTensor(indices=sparse_indices, values=sparse_values,
+//                     dense_shape=sparse_dense_shape)
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
+//	rt_dense_values: The `inner_values` for the `RaggedTensor`.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
+func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "RaggedTensorToSparse",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			tf.OutputList(rt_nested_splits), rt_dense_values,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
-
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// Check if the input matches the regex pattern.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// The input is a string tensor of any shape. The pattern is a scalar
+// string tensor which is applied to every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	input: A string tensor of the text to be processed.
+//	pattern: A scalar string tensor containing the regular expression to match the input.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+// Returns A bool tensor with the same shape as `input`.
+func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "RegexFullMatch",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			input, pattern,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns which elements of x are Inf.
+// Says whether the targets are in the top `K` predictions.
 //
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsInf",
+		Type: "InTopKV2",
 		Input: []tf.Input{
-			x,
+			predictions, targets, k,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
 
-// TruncatedNormalSeed sets the optional seed attribute to value.
+// RandomShuffleSeed sets the optional seed attribute to value.
 //
 // value: If either `seed` or `seed2` are set to be non-zero, the random number
 // generator is seeded by the given seed.  Otherwise, it is seeded by a
 // random seed.
 // If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
 //
 // value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Outputs random values from a truncated normal distribution.
+// Randomly shuffles a tensor along its first dimension.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
+//
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	value: The tensor to be shuffled.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "RandomShuffle",
 		Input: []tf.Input{
-			shape,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -11522,143 +11549,199 @@ func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional
 	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// SkipgramWindowSize sets the optional window_size attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["window_size"] = value
+		m["use_locking"] = value
 	}
 }
 
-// SkipgramMinCount sets the optional min_count attribute to value.
+// Update '*var' according to the RMSProp algorithm.
 //
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
-	}
-}
-
-// SkipgramSubsample sets the optional subsample attribute to value.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["subsample"] = value
-	}
-}
-
-// Parses a text file and creates a batch of examples.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
 //
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
+		Type: "ResourceSparseApplyRMSProp",
+		Input: []tf.Input{
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	return scope.AddOperation(opspec)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
 
-// StringToNumberOutType sets the optional out_type attribute to value.
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["seed"] = value
 	}
 }
 
-// Converts each string in the input Tensor to the specified numeric type.
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "StringToNumber",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
+}
+
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
 
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
 // If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["use_image_if_no_bounding_boxes"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// Generate a single randomly distorted bounding box for an image.
 //
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-//	lr_power: Scaling factor. Must be a scalar.
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11667,65 +11750,87 @@ func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "SampleDistortedBoundingBox",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			image_size, bounding_boxes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-//
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// Computes sigmoid of `x` element-wise.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "Sigmoid",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
 
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["epsilon"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11734,61 +11839,62 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			shape, seed,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["seed"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
 //
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution.
 //
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -11796,83 +11902,57 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 	return op.Output(0)
 }
 
-// Divides sparse updates into the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] /= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] /= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterDiv",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["resize_align_corners"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
-//
-// The generated values will have mean 0 and standard deviation 1.
+// Performs a resize and padding as a preprocess during a convolution.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "FusedResizeAndPadConv2D",
 		Input: []tf.Input{
-			shape, seed,
+			input, size, paddings, filter,
 		},
 		Attrs: attrs,
 	}
@@ -11880,83 +11960,95 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
+
+// RandomUniformSeed sets the optional seed attribute to value.
 //
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
 //
-// For example:
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a uniform distribution.
 //
-// ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
-// ```
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
 // Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "RandomUniform",
 		Input: []tf.Input{
-			start, limit, delta,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
 
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-//
-// want to use Nesterov momentum.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
+//	linear: Should be from a Variable().
 //	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11965,189 +12057,199 @@ func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
+		Type: "ResourceApplyFtrl",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Exits the current frame to its parent frame.
+// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
 //
-// Exit makes its input `data` available to the parent frame.
+// is alive, any other request to use `MutexLock` with this mutex will wait.
+//
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
+//
+// ```python
+//
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
+//
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
+//
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
+//
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
+//
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
+//
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
+// ```
+//
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
+//
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
+//
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
 //
 // Arguments:
-//	data: The tensor to be made available to the parent frame.
+//	mutex: The mutex resource to lock.
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exit",
+		Type: "MutexLock",
 		Input: []tf.Input{
-			data,
+			mutex,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
-//
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
+//
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "ParseTensor",
 		Input: []tf.Input{
-			reader_handle,
+			serialized,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["Targmax"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+// Performs max pooling on the input and outputs both max values and indices.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the filter.
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
+//
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
 //
 // Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "MaxPoolWithArgmax",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns immutable tensor from memory region.
+// Creates a TensorList which, when stacked, has the value of `tensor`.
 //
-// The current implementation memmaps the tensor from a file.
+// Each tensor in the result list corresponds to one row of the input tensor.
 //
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+// tensor: The input tensor.
+// output_handle: The list.
+func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
+		Type: "TensorListFromTensor",
+		Input: []tf.Input{
+			tensor, element_shape,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
-
-// StringJoinSeparator sets the optional separator attribute to value.
+// Assigns sparse updates to the variable referenced by `resource`.
 //
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins the strings in the given list of string tensors into one tensor;
+// This operation computes
 //
-// with the given separator (default is an empty separator).
+//     # Scalar indices
+//     ref[indices, ...] = updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
+		Type: "ResourceScatterUpdate",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			resource, indices, updates,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // MaxPoolAttr is an optional argument to MaxPool.
@@ -12196,175 +12298,62 @@ func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
-	}
-}
-
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
-	}
-}
-
-// Multiply matrix "a" by matrix "b".
-//
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
-// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
-// "b" is sparse, in the sense that they have a large proportion of zero values.
-// The breakeven for using this versus a dense matrix multiply on one platform was
-// 30% zero values in the sparse matrix.
-//
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
-		Input: []tf.Input{
-			a, b,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise AND of `x` and `y`.
-//
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Concatenates quantized tensors along one dimension.
+// Multiplies sparse updates into the variable referenced by `resource`.
 //
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
+// This operation computes
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Slice a `SparseTensor` based on the `start` and `size`.
+//     # Scalar indices
+//     ref[indices, ...] *= updates[...]
 //
-// For example, if the input is
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] *= updates[i, ...]
 //
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
 //
-// Graphically the output tensors are:
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
 //
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns the created operation.
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
+		Type: "ResourceScatterMul",
 		Input: []tf.Input{
-			indices, values, shape, start, size,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+// Subtracts sparse updates from the variable referenced by `resource`.
 //
 // This operation computes
 //
 //     # Scalar indices
-//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+//     ref[indices, ...] -= updates[...]
 //
 //     # Vector indices (for each i)
-//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+//     ref[indices[i], ...] -= updates[i, ...]
 //
 //     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
 //
 // Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
+// the same location, their contributions add.
 //
 // Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
@@ -12378,12 +12367,12 @@ func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Out
 //	updates: A tensor of updated values to add to `ref`.
 //
 // Returns the created operation.
-func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMin",
+		Type: "ResourceScatterSub",
 		Input: []tf.Input{
 			resource, indices, updates,
 		},
@@ -12391,135 +12380,157 @@ func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
-// ```
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
-		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns the truth value of (x != y) element-wise.
-//
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NotEqual",
+		Type: "ResourceSparseApplyFtrlV2",
 		Input: []tf.Input{
-			x, y,
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
+// Calculates gains for each feature and returns the best possible split information for the feature.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
 //
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
 //
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+// The length of output lists are all of the same length, `num_features`.
+// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+// Arguments:
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
+//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
 //
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "BoostedTreesCalculateBestGainsPerFeature",
 		Input: []tf.Input{
-			input, fft_length,
+			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
 }
 
-// StringSplitAttr is an optional argument to StringSplit.
-type StringSplitAttr func(optionalAttr)
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
 
-// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
+// EncodePngCompression sets the optional compression attribute to value.
 //
-// value: A `bool`. If `True`, skip the empty strings from the result.
-// If not specified, defaults to true
-func StringSplitSkipEmpty(value bool) StringSplitAttr {
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
 	return func(m optionalAttr) {
-		m["skip_empty"] = value
+		m["compression"] = value
 	}
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
+// PNG-encode an image.
 //
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
 //
-// For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12528,62 +12539,48 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			input, delimiter,
+			image,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
 
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["src_format"] = value
 	}
 }
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["dst_format"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
+// Returns the permuted vector/tensor in the destination data format given the
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// one in the source data format.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
 //
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12592,82 +12589,77 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "DataFormatVecPermute",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			x,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the complex conjugate of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
-//
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+// Initializes the multi device iterator with the given dataset.
 //
-// For example:
+// Arguments:
+//	dataset: Dataset to be iterated upon.
+//	multi_device_iterator: A MultiDeviceIteratorResource.
+//	max_buffer_size: The maximum size of the host side per device buffer to keep.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-// ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns An int64 indicating which incarnation of the MultiDeviceIterator
+// is running.
+func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Conj",
+		Type: "MultiDeviceIteratorInit",
 		Input: []tf.Input{
-			input,
+			dataset, multi_device_iterator, max_buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
-
-// ProdKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// Computes the gradient of `igamma(a, x)` wrt `a`.
+func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IgammaGradA",
+		Input: []tf.Input{
+			a, x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the product of elements across dimensions of a tensor.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// The hash function is deterministic on the content of the string within the
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "StringToHashBucket",
 		Input: []tf.Input{
-			input, axis,
+			string_tensor,
 		},
 		Attrs: attrs,
 	}
@@ -12675,43 +12667,42 @@ func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (
 	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
+// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
+type StaticRegexReplaceAttr func(optionalAttr)
 
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["replace_global"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
+// Replaces the match of pattern in input with rewrite.
 //
-// Input images can be of different types but output images are always float.
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expresion.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+// Returns The text after applying pattern and rewrite.
+func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "StaticRegexReplace",
 		Input: []tf.Input{
-			images, size,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -12719,75 +12710,46 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 	return op.Output(0)
 }
 
-// Creates a TensorList which, when stacked, has the value of `tensor`.
+// Computes gradients for the exponential linear (Elu) operation.
 //
-// Each tensor in the result list corresponds to one row of the input tensor.
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
 //
-// tensor: The input tensor.
-// output_handle: The list.
-func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListFromTensor",
+		Type: "EluGrad",
 		Input: []tf.Input{
-			tensor, element_shape,
+			gradients, outputs,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates and returns an empty tensor list.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
-// All list elements must be tensors of dtype element_dtype and shape compatible
-// with element_shape.
 //
-// handle: an empty tensor list.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func EmptyTensorList(scope *Scope, element_shape tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "EmptyTensorList",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			element_shape,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -12795,46 +12757,27 @@ func EmptyTensorList(scope *Scope, element_shape tf.Output, element_dtype tf.Dat
 	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
-
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+// Reads the value of a variable.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the average pooling function.
+// The tensor returned by this operation is immutable.
 //
-// Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "ReadVariableOp",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			resource,
 		},
 		Attrs: attrs,
 	}
@@ -12842,337 +12785,283 @@ func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high overlaps
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
-// which allows for defining a custom overlap criterium (eg. intersection over union,
-// intersection over area, etc.).
+// This op consumes a lock created by `MutexLock`.
 //
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
 //
-//   selected_indices = tf.image.non_max_suppression_with_overlaps(
-//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
 //
 // Arguments:
-//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
-// the n-by-n box overlap values.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	mutex_lock: A tensor returned by `MutexLock`.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// Returns the created operation.
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionWithOverlaps",
+		Type: "ConsumeMutexLock",
 		Input: []tf.Input{
-			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
+			mutex_lock,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
+// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
+type ResourceScatterNdAddAttr func(optionalAttr)
 
-// StageClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
 //
-// REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["use_locking"] = value
 	}
 }
 
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Adds sparse `updates` to individual values or slices within a given
 //
-// REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
+// variable according to `indices`.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_add(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 12, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
 //
 // Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StageClear",
-
+		Type: "ResourceScatterNdAdd",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+// Updates the tree ensemble by either adding a layer to the last tree being grown
 //
-// Arguments:
+// or by starting a new tree.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Arguments:
+//	tree_ensemble_handle: Handle to the ensemble variable.
+//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
+// the feature that will be used in the split.
+//	node_ids: List of rank 1 tensors representing the nodes for which this feature
+// has a split.
+//	gains: List of rank 1 tensors representing the gains for each of the feature's
+// split.
+//	thresholds: List of rank 1 tensors representing the thesholds for each of the
+// feature's split.
+//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
+// the feature's splits. Will be added to the previous node values to constitute
+// the values of the left nodes.
+//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
+// of the feature's splits. Will be added to the previous node values to constitute
+// the values of the right nodes.
+//	max_depth: Max depth of the tree to build.
+//	learning_rate: shrinkage const for each new tree.
+//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns the created operation.
+func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "BoostedTreesUpdateEnsemble",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
-
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
-//
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["header_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
-//
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["footer_bytes"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
-//
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	opspec := tf.OpSpec{
+		Type: "Tan",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// Bucketizes 'input' based on 'boundaries'.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
 //
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
 //
 // Arguments:
-//	record_bytes: Number of bytes in the record.
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
 //
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
+		Type: "Bucketize",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringLengthAttr is an optional argument to StringLength.
-type StringLengthAttr func(optionalAttr)
-
-// StringLengthUnit sets the optional unit attribute to value.
-// If not specified, defaults to "BYTE"
-func StringLengthUnit(value string) StringLengthAttr {
-	return func(m optionalAttr) {
-		m["unit"] = value
-	}
-}
-
-// String lengths of `input`.
+// Encode audio data using the WAV file format.
 //
-// Computes the length of each string given in the input tensor.
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
+//
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	input: The string for which to compute the length.
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
 //
-// Returns Integer tensor that has the same shape as `input`. The output contains the
-// element-wise string lengths of `input`.
-func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StringLength",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			input,
+			audio, sample_rate,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
-//
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
+		Type: "Atan",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RegexReplaceAttr is an optional argument to RegexReplace.
-type RegexReplaceAttr func(optionalAttr)
+// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
+type ResourceApplyAdaMaxAttr func(optionalAttr)
 
-// RegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
+// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
-// If not specified, defaults to true
-func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
 	return func(m optionalAttr) {
-		m["replace_global"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Replaces the match of pattern in input with rewrite.
+// Update '*var' according to the AdaMax algorithm.
 //
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// v_t <- max(beta2 * v_{t-1}, abs(g))
+// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
 //
 // Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expresion.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns The text after applying pattern and rewrite.
-func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13181,180 +13070,256 @@ func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RegexReplace",
+		Type: "ResourceApplyAdaMax",
 		Input: []tf.Input{
-			input, pattern, rewrite,
+			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes numerical negative value element-wise.
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
+
+// AssertSummarize sets the optional summarize attribute to value.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Neg",
-		Input: []tf.Input{
-			x,
-		},
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor.
+// Asserts that the given condition is true.
 //
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
 // Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
-//
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
+		Type: "Assert",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+	return scope.AddOperation(opspec)
+}
+
+// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
+type CudnnRNNBackpropAttr func(optionalAttr)
+
+// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
+}
+
+// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
 	}
-	return outputs
 }
 
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
+// CudnnRNNBackpropDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
 
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["dropout"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// CudnnRNNBackpropSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Backprop step of CudnnRNN.
 //
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+// Compute the backprop of both data and weights in a RNN.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in for forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNBackprop",
+		Input: []tf.Input{
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
+//
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			split_dim, indices, values, shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
 	}
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Returns the element-wise sum of a list of tensors.
+//
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
+//
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+//
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "AccumulateNV2",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -13362,118 +13327,60 @@ func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Outpu
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Outputs deterministic pseudorandom random integers from a uniform distribution.
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// The generated values follow a uniform distribution in the range `[minval, maxval)`.
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
+// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//	minval: Minimum value (inclusive, scalar).
+//	maxval: Maximum value (exclusive, scalar).
+//
+// Returns Random values with specified shape.
+func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "StatelessRandomUniformInt",
 		Input: []tf.Input{
-			inputs, min, max,
+			shape, seed, minval, maxval,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
 
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["dtype"] = value
 	}
 }
 
-// Applies sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
-//
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
-//
-// ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
-//
-// The resulting update to ref would look like this:
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-//     [1, 11, 3, 10, 9, 6, 7, 12]
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
-// values to add to ref.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13482,179 +13389,134 @@ func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, upd
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
-			ref, indices, updates,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Produces a string handle for the given MultiDeviceIterator.
-//
-// Arguments:
-//	multi_device_iterator: A MultiDeviceIterator resource.
-//
-// Returns A string representing the resource.
-func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorToStringHandle",
-		Input: []tf.Input{
-			multi_device_iterator,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
+
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
+	}
+}
+
+// Restores a tensor from checkpoint files.
 //
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
 //
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			file_pattern, tensor_name, shape_and_slice,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
+// Divides sparse updates into the variable referenced by `resource`.
 //
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
+// This operation computes
 //
-// `data.shape` must start with `partitions.shape`.
+//     # Scalar indices
+//     ref[indices, ...] /= updates[...]
 //
-// For example:
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] /= updates[i, ...]
 //
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
 //
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
-// ```
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
 //
-// See `dynamic_stitch` for an example on how to merge partitions back.
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
 // <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
 // </div>
 //
 // Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+// Returns the created operation.
+func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
+		Type: "ResourceScatterDiv",
 		Input: []tf.Input{
-			data, partitions,
+			resource, indices, updates,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
 	}
-	return outputs
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
-		m["update_slots"] = value
+		m["dtype"] = value
 	}
 }
 
-// Update '*var' according to the adagrad scheme.
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// The generated values will have mean 0 and standard deviation 1.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13663,371 +13525,249 @@ func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Return the shape of s0 op s1 with broadcast.
-//
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
+		Type: "Erfc",
 		Input: []tf.Input{
-			s0, s1,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
-
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
-//
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
-	}
-}
-
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
-	}
-}
-
-// Returns the dimension index in the destination data format given the one in
-//
-// the source data format.
-//
-// Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
+// Returns the number of tensors in the input tensor list.
 //
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+// input_handle: the input list
+// length: the number of tensors in the list
+func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "TensorListLength",
 		Input: []tf.Input{
-			x,
+			input_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+// Determine the script codes of a given tensor of Unicode integer code points.
+//
+// This operation converts Unicode code points to script codes corresponding to
+// each code point. Script codes correspond to International Components for
+// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
+// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
+// match input shape.
 //
 // Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
+//	input: A Tensor of int32 Unicode code points.
 //
-// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
-// layer.
-func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+// Returns A Tensor of int32 script codes corresponding to each input code point.
+func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesGetEnsembleStates",
+		Type: "UnicodeScript",
 		Input: []tf.Input{
-			tree_ensemble_handle,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
-
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+// Creates a sequence of numbers.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the AddSign update.
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
+// For example:
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
 //
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+// Arguments:
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
+		Type: "Range",
 		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
+			start, limit, delta,
 		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// CudnnRNNBackpropV2Attr is an optional argument to CudnnRNNBackpropV2.
-type CudnnRNNBackpropV2Attr func(optionalAttr)
-
-// CudnnRNNBackpropV2RnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNBackpropV2RnnMode(value string) CudnnRNNBackpropV2Attr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNBackpropV2InputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNBackpropV2InputMode(value string) CudnnRNNBackpropV2Attr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
 
-// CudnnRNNBackpropV2Direction sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNBackpropV2Direction(value string) CudnnRNNBackpropV2Attr {
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["direction"] = value
+		m["capacity"] = value
 	}
 }
 
-// CudnnRNNBackpropV2Dropout sets the optional dropout attribute to value.
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
-func CudnnRNNBackpropV2Dropout(value float32) CudnnRNNBackpropV2Attr {
+//
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["dropout"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// CudnnRNNBackpropV2Seed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropV2Seed(value int64) CudnnRNNBackpropV2Attr {
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["container"] = value
 	}
 }
 
-// CudnnRNNBackpropV2Seed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropV2Seed2(value int64) CudnnRNNBackpropV2Attr {
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Backprop step of CudnnRNN.
-//
-// Compute the backprop of both data and weights in a RNN. Takes an extra
-//     "host_reserved" inupt than CudnnRNNBackprop, which is used to determine RNN
-//     cudnnRNNAlgo_t and cudnnMathType_t.
+// Op removes and returns the (key, value) element with the smallest
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicates whether there is a linear projection between the input and
-//     the actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-//     pass.
-// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-//     pass.
-// reserve_space: The same reserve_space produced in the forward operation.
-// host_reserved: The same host_reserved produced in the forward operation.
-// input_backprop: The backprop to input in the forward pass. Has the same shape
-//     as input.
-// input_h_backprop: The backprop to input_h in the forward pass. Has the same
-//     shape as input_h.
-// input_c_backprop: The backprop to input_c in the forward pass. Has the same
-//     shape as input_c.
-// params_backprop: The backprop to the params buffer in the forward pass. Has the
-//     same shape as params.
-func CudnnRNNBackpropV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV2Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNBackpropV2",
+		Type: "OrderedMapUnstageNoKey",
 		Input: []tf.Input{
-			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved,
+			indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
-//
-// is alive, any other request to use `MutexLock` with this mutex will wait.
-//
-// This is particularly useful for creating a critical section when used in
-// conjunction with `MutexLockIdentity`:
-//
-// ```python
-//
-// mutex = mutex_v2(
-//   shared_name=handle_name, container=container, name=name)
-//
-// def execute_in_critical_section(fn, *args, **kwargs):
-//   lock = gen_resource_variable_ops.mutex_lock(mutex)
-//
-//   with ops.control_dependencies([lock]):
-//     r = fn(*args, **kwargs)
-//
-//   with ops.control_dependencies(nest.flatten(r)):
-//     with ops.colocate_with(mutex):
-//       ensure_lock_exists = mutex_lock_identity(lock)
+// Returns element-wise integer closest to x.
 //
-//     # Make sure that if any element of r is accessed, all of
-//     # them are executed together.
-//     r = nest.map_structure(tf.identity, r)
+// If the result is midway between two representable values,
+// the even representable is chosen.
+// For example:
 //
-//   with ops.control_dependencies([ensure_lock_exists]):
-//     return nest.map_structure(tf.identity, r)
 // ```
-//
-// While `fn` is running in the critical section, no other functions which wish to
-// use this critical section may run.
-//
-// Often the use case is that two executions of the same graph, in parallel,
-// wish to run `fn`; and we wish to ensure that only one of them executes
-// at a time.  This is especially important if `fn` modifies one or more
-// variables at a time.
-//
-// It is also useful if two separate functions must share a resource, but we
-// wish to ensure the usage is exclusive.
-//
-// Arguments:
-//	mutex: The mutex resource to lock.
-//
-// Returns A tensor that keeps a shared pointer to a lock on the mutex;
-// when the Tensor is destroyed, the use count on the shared pointer is decreased
-// by 1.  When it reaches 0, the lock is released.
-func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+// ```
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MutexLock",
+		Type: "Rint",
 		Input: []tf.Input{
-			mutex,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringFormatAttr is an optional argument to StringFormat.
-type StringFormatAttr func(optionalAttr)
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
 
-// StringFormatTemplate sets the optional template attribute to value.
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: A string, the template to format tensor summaries into.
-// If not specified, defaults to "%s"
-func StringFormatTemplate(value string) StringFormatAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["template"] = value
+		m["use_locking"] = value
 	}
 }
 
-// StringFormatPlaceholder sets the optional placeholder attribute to value.
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
-// If not specified, defaults to "%s"
-func StringFormatPlaceholder(value string) StringFormatAttr {
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["placeholder"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// StringFormatSummarize sets the optional summarize attribute to value.
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
-// If not specified, defaults to 3
-func StringFormatSummarize(value int64) StringFormatAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Formats a string template using a list of tensors.
+// want to use Nesterov momentum.
 //
-// Formats a string template using a list of tensors, pretty-printing tensor summaries.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	inputs: The list of tensors to format into the placeholder string.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns = The resulting string scalar.
-func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14036,38 +13776,117 @@ func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringFormat",
+		Type: "ResourceApplyMomentum",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			var_, accum, lr, grad, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
+// SubstrAttr is an optional argument to Substr.
+type SubstrAttr func(optionalAttr)
 
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
+// SubstrUnit sets the optional unit attribute to value.
+//
+// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
+// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
+// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
+// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
+// UTF-8.
+// If not specified, defaults to "BYTE"
+func SubstrUnit(value string) SubstrAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["unit"] = value
 	}
 }
 
-// Returns the shape of a tensor.
+// Return substrings from `Tensor` of strings.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
 //
-// For example:
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
+//
+// A negative `pos` indicates distance within the string backwards from the end.
+//
+// If `pos` specifies an index which is out of range for any of the input strings,
+// then an `InvalidArgumentError` is thrown.
+//
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
+//
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
 //
+// output = [b'ell', b'orl']
 // ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
 // ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+//
+// Broadcasting `pos` and `len` onto `input`:
+//
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
+// ```
+//
+// Broadcasting `input` onto `pos` and `len`:
+//
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n']
+// ```
+//
+// Arguments:
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
+//
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14076,9 +13895,9 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "Substr",
 		Input: []tf.Input{
-			input,
+			input, pos, len,
 		},
 		Attrs: attrs,
 	}
@@ -14086,330 +13905,294 @@ func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Outp
 	return op.Output(0)
 }
 
-// Computes the power of one value to another.
-//
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// Exits the current frame to its parent frame.
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pow",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes fingerprints of the input strings.
+// Exit makes its input `data` available to the parent frame.
 //
 // Arguments:
-//	input: vector of strings to compute fingerprints on.
+//	data: The tensor to be made available to the parent frame.
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
+		Type: "Exit",
 		Input: []tf.Input{
-			input,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient operator for the SparseAdd op.
+// Produce a string tensor that encodes the state of a Reader.
 //
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
-//
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
+		Type: "ReaderSerializeStateV2",
 		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
+			reader_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
-//
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
+// Concatenates quantized tensors along one dimension.
 //
 // Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "QuantizedConcat",
 		Input: []tf.Input{
-			data, segment_ids,
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
+// Slice a `SparseTensor` based on the `start` and `size`.
 //
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
+// For example, if the input is
 //
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// Graphically the output tensors are:
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+// Arguments:
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
 //
-// Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "SparseSlice",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			indices, values, shape, start, size,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
 //
-// Arguments:
+// This operation computes
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
+//     # Scalar indices
+//     ref[indices, ...] = min(ref[indices, ...], updates[...])
 //
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchDataset",
+		Type: "ResourceScatterMin",
 		Input: []tf.Input{
-			input_dataset, batch_size,
+			resource, indices, updates,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is a scalar
-// string tensor which is applied to every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// ```
 //
 // Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: A scalar string tensor containing the regular expression to match the input.
 //
-// Returns A bool tensor with the same shape as `input`.
-func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
+//
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RegexFullMatch",
+		Type: "QuantizedReshape",
 		Input: []tf.Input{
-			input, pattern,
+			tensor, shape, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Says whether the targets are in the top `K` predictions.
+// StringSplitAttr is an optional argument to StringSplit.
+type StringSplitAttr func(optionalAttr)
+
+// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
 //
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
+// value: A `bool`. If `True`, skip the empty strings from the result.
+// If not specified, defaults to true
+func StringSplitSkipEmpty(value bool) StringSplitAttr {
+	return func(m optionalAttr) {
+		m["skip_empty"] = value
+	}
+}
+
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
 //
-// More formally, let
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
+//
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
 //
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InTopKV2",
+		Type: "StringSplit",
 		Input: []tf.Input{
-			predictions, targets, k,
+			input, delimiter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
 
-// RandomPoissonV2Seed sets the optional seed attribute to value.
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["use_locking"] = value
 	}
 }
 
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// That is for rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14418,125 +14201,145 @@ func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
+		Type: "ResourceSparseApplyMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, indices, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns the complex conjugate of a complex number.
+//
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
+//
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Conj",
 		Input: []tf.Input{
-			shape, rate,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
-
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+// Computes numerical negative value element-wise.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
+	opspec := tf.OpSpec{
+		Type: "Neg",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// Execute a sub graph on a remote processor.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
-
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Arguments:
+//	inputs: Arbitrary number of tensors with arbitrary data types
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	opspec := tf.OpSpec{
+		Type: "RemoteFusedGraphExecute",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
 	}
+	return outputs
 }
 
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["data_format"] = value
 	}
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
-//
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "MaxPool3DGradGrad",
 		Input: []tf.Input{
-			contents, crop_window,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -14544,250 +14347,362 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 	return op.Output(0)
 }
 
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
-//
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
-//
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
+
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
-		},
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
-type OrderedMapPeekAttr func(optionalAttr)
-
-// OrderedMapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["dilations"] = value
 	}
 }
 
-// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// REQUIRES: value >= 0
-func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropFilterV2",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["num_bits"] = value
 	}
 }
 
-// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["narrow_range"] = value
 	}
 }
 
-// Op peeks at the values at the specified key.  If the
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// underlying container does not contain this key
-// this op will block until it does.   This Op is optimized for
-// performance.
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
+// and `max` to 'outputs' tensor of same shape as `inputs`.
+//
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
+//
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapPeek",
+		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
-			key, indices,
+			inputs, min, max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapPeek", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
 
-// LRNDepthRadius sets the optional depth_radius attribute to value.
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
 //
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["use_locking"] = value
 	}
 }
 
-// LRNBias sets the optional bias attribute to value.
+// Applies sparse `updates` to individual values or slices within a given
 //
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNAlpha sets the optional alpha attribute to value.
+// variable according to `indices`.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 11, 3, 10, 9, 6, 7, 12]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterNdUpdate",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// LRNBeta sets the optional beta attribute to value.
+// Produces a string handle for the given MultiDeviceIterator.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
+// Arguments:
+//	multi_device_iterator: A MultiDeviceIterator resource.
+//
+// Returns A string representing the resource.
+func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MultiDeviceIteratorToStringHandle",
+		Input: []tf.Input{
+			multi_device_iterator,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Local Response Normalization.
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
 //
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
+//
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			input,
+			sp_indices, sp_values, sp_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
+//
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
+//
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
+//
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
+//
+// `data.shape` must start with `partitions.shape`.
+//
+// For example:
+//
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
+//
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
+//
+// See `dynamic_stitch` for an example on how to merge partitions back.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "ZipDataset",
+		Type: "DynamicPartition",
 		Input: []tf.Input{
-			tf.OutputList(input_datasets),
+			data, partitions,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
+	}
+	return outputs
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
 // value: If `True`, updating of the var and accum tensors will be protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
 // If not specified, defaults to true
-func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
 		m["update_slots"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
+// Update '*var' according to the adagrad scheme.
 //
-// That is for rows we have grad for, we update var and accum as follows:
 // accum += grad * grad
 // var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14796,100 +14711,127 @@ func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, l
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
+			var_, accum, lr, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Elementwise computes the bitwise right-shift of `x` and `y`.
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
 //
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
 //
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RightShift",
+		Type: "BoostedTreesGetEnsembleStates",
 		Input: []tf.Input{
-			x, y,
+			tree_ensemble_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// TensorListStackAttr is an optional argument to TensorListStack.
-type TensorListStackAttr func(optionalAttr)
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
 
-// TensorListStackNumElements sets the optional num_elements attribute to value.
-// If not specified, defaults to -1
-func TensorListStackNumElements(value int64) TensorListStackAttr {
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
 	return func(m optionalAttr) {
-		m["num_elements"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Stacks all tensors in the list.
+// Update '*var' according to the AddSign update.
 //
-// Requires that all tensors have the same shape.
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
 //
-// input_handle: the input list
-// tensor: the gathered result
-// num_elements: optional. If not -1, the number of elements in the list.
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
-func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+// Returns the created operation.
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListStack",
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			input_handle,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// StringFormatAttr is an optional argument to StringFormat.
+type StringFormatAttr func(optionalAttr)
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// StringFormatTemplate sets the optional template attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// value: A string, the template to format tensor summaries into.
+// If not specified, defaults to "%s"
+func StringFormatTemplate(value string) StringFormatAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["template"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
+// StringFormatPlaceholder sets the optional placeholder attribute to value.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
+// If not specified, defaults to "%s"
+func StringFormatPlaceholder(value string) StringFormatAttr {
+	return func(m optionalAttr) {
+		m["placeholder"] = value
+	}
+}
+
+// StringFormatSummarize sets the optional summarize attribute to value.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
+// If not specified, defaults to 3
+func StringFormatSummarize(value int64) StringFormatAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Formats a string template using a list of tensors.
+//
+// Formats a string template using a list of tensors, pretty-printing tensor summaries.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	inputs: The list of tensors to format into the placeholder string.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// Returns = The resulting string scalar.
+func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14898,9 +14840,9 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "StringFormat",
 		Input: []tf.Input{
-			shape, seed,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -14908,146 +14850,151 @@ func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optio
 	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
+
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of a tensor.
 //
-// Arguments:
-//	data: The tensor to be made available to the next iteration.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "Shape",
 		Input: []tf.Input{
-			data,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
+// Computes the power of one value to another.
+//
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
+//
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Fact",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
-//
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes fingerprints of the input strings.
 //
 // Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	input: vector of strings to compute fingerprints on.
+//
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "SdcaFprint",
 		Input: []tf.Input{
-			serialized_sparse,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
 
-// SqueezeAxis sets the optional axis attribute to value.
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNBias sets the optional bias attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+		m["bias"] = value
 	}
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
+// LRNAlpha sets the optional alpha attribute to value.
 //
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
 //
-// For example:
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
 //
-// Or, to remove specific size 1 dimensions:
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-// ```
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
 //
 // Arguments:
-//	input: The `input` to squeeze.
-//
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15056,7 +15003,7 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
+		Type: "LRN",
 		Input: []tf.Input{
 			input,
 		},
@@ -15066,38 +15013,61 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ZipDataset",
+		Input: []tf.Input{
+			tf.OutputList(input_datasets),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
+// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
+//	lr: Learning rate. Must be a scalar.
 //	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15106,365 +15076,379 @@ func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
-
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
-//
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
-	return func(m optionalAttr) {
-		m["iou_threshold"] = value
-	}
-}
-
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Elementwise computes the bitwise right-shift of `x` and `y`.
 //
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+// If `y` is negative, or greater than or equal to than the width of `x` in bits
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "RightShift",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+// TensorListStackAttr is an optional argument to TensorListStack.
+type TensorListStackAttr func(optionalAttr)
+
+// TensorListStackNumElements sets the optional num_elements attribute to value.
+// If not specified, defaults to -1
+func TensorListStackNumElements(value int64) TensorListStackAttr {
+	return func(m optionalAttr) {
+		m["num_elements"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+// Stacks all tensors in the list.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Requires that all tensors have the same shape.
+//
+// input_handle: the input list
+// tensor: the gathered result
+// num_elements: optional. If not -1, the number of elements in the list.
+//
+func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
+		Type: "TensorListStack",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			input_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// 2D real-valued fast Fourier transform.
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
+
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			input, fft_length,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
+// Makes its input available to the next iteration.
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
 //
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			input, paddings,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
-//
-// Arguments:
-//	resource: the input resource handle.
-//
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
-		Input: []tf.Input{
-			resource,
-		},
+		Type: "Fact",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// Deserialize `SparseTensor` objects.
 //
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
 //
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "DeserializeSparse",
 		Input: []tf.Input{
-			input,
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the last element of the input list as well as a list with all but that element.
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
+
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// Fails if the list is empty.
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
 //
-// input_handle: the input list
-// tensor: the withdrawn last element of the list
-// element_dtype: the type of elements in the list
-// element_shape: the shape of the output tensor
-func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
+	return func(m optionalAttr) {
+		m["squeeze_dims"] = value
+	}
+}
+
+// Removes dimensions of size 1 from the shape of a tensor.
+//
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
+//
+// For example:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
+//
+// Arguments:
+//	input: The `input` to squeeze.
+//
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListPopBack",
+		Type: "Squeeze",
 		Input: []tf.Input{
-			input_handle,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Update '*var' according to the adadelta scheme.
+//
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["iou_threshold"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// All elements selected by `indices` must have the same shape.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			boxes, scores, max_output_size,
 		},
 		Attrs: attrs,
 	}
@@ -15472,172 +15456,191 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for integer types.
-//
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
-//
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
+		Type: "TensorDataset",
 		Input: []tf.Input{
-			x, y,
+			tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
-//
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
 //
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
+		Type: "SparseDenseCwiseMul",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
+			sp_indices, sp_values, sp_shape, dense,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
-	}
-	return tensors
+	return op.Output(0)
 }
 
-// Receives a tensor value broadcast from another device.
-func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+// 2D real-valued fast Fourier transform.
+//
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "CollectiveBcastRecv",
-
-		Attrs: attrs,
+		Type: "RFFT2D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
+// Pads a tensor with zeros.
 //
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-// Arguments:
-//	input: Base64 strings to decode.
+// The padded size of each dimension D of the output is:
 //
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+//
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "Pad",
 		Input: []tf.Input{
-			input,
+			input, paddings,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
+// Checks whether a resource handle-based variable has been initialized.
 //
 // Arguments:
-//	value: The tensor to be stored.
+//	resource: the input resource handle.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
+		Type: "VarIsInitializedOp",
 		Input: []tf.Input{
-			value,
+			resource,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
+//	linear: Should be from a Variable().
 //	grad: The gradient.
 //	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15646,110 +15649,83 @@ func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
-
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of max pooling function.
+// Returns which elements of x are Inf.
 //
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "IsInf",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
 
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// TruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["seed"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -15757,525 +15733,486 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
 
-// VariableShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+// SkipgramWindowSize sets the optional window_size attribute to value.
+//
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["window_size"] = value
 	}
 }
 
-// Returns the shape of the variable pointed to by `resource`.
+// SkipgramMinCount sets the optional min_count attribute to value.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
 //
-// For example:
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VariableShape",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "Skipgram",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
-
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
-//
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
-//
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Converts each string in the input Tensor to the specified numeric type.
 //
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			string_tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
-//
-// Inputs are the logits, not probabilities.
-//
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
-		Input: []tf.Input{
-			features, labels,
-		},
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "ResourceApplyFtrlV2",
 		Input: []tf.Input{
-			serialized,
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
 
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+// EncodeJpegFormat sets the optional format attribute to value.
+//
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["format"] = value
 	}
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
+// EncodeJpegQuality sets the optional quality attribute to value.
 //
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
+	}
+}
+
+// EncodeJpegProgressive sets the optional progressive attribute to value.
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
+	}
+}
+
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
 //
-// Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
+	}
+}
+
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+//
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
+//
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// EncodeJpegYDensity sets the optional y_density attribute to value.
+//
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "LogicalNot",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// 3D real-valued fast Fourier transform.
+// JPEG-encode an image.
 //
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
 //
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
+//
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			input, fft_length,
+			image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
-
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
-//
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
-//
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
-// If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
-	}
-}
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
 
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["seed"] = value
 	}
 }
 
-// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+// MultinomialSeed2 sets the optional seed2 attribute to value.
 //
-// value: If true (default is false), then all
-// elements in the TensorArray will be expected to have have identical shapes.
-// This allows certain behaviors, like dynamically checking for
-// consistent shapes on write, and being able to fill in properly
-// shaped zero tensors on stack -- even if the element_shape attribute
-// is not fully defined.
-// If not specified, defaults to false
-func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
 	return func(m optionalAttr) {
-		m["identical_element_shapes"] = value
+		m["seed2"] = value
 	}
 }
 
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
-//
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// An array of Tensors of given size.
-//
-// Write data via Write and read via Read or Pack.
+// Draws samples from a multinomial distribution.
 //
 // Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
+		Type: "Multinomial",
 		Input: []tf.Input{
-			size,
+			logits, num_samples,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Runs multiple additive regression ensemble predictors on input instances and
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// computes the logits. It is designed to be used during prediction.
-// It traverses all the trees and calculates the final score for each instance.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
 //
 // Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
-// feature.
-//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
-// shape.
-//
-// Returns Output rank 2 Tensor containing logits for each example.
-func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesPredict",
+		Type: "ResourceSparseApplyAdagradDA",
 		Input: []tf.Input{
-			tree_ensemble_handle, tf.OutputList(bucketized_features),
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Elementwise computes the bitwise OR of `x` and `y`.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
-
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
-	return func(m optionalAttr) {
-		m["fast"] = value
-	}
-}
-
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
-//
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
-//
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
-//
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
-// If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
-// sufficiently large.
-//
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
-//
-// Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
+// Returns the last element of the input list as well as a list with all but that element.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+// Fails if the list is empty.
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+// input_handle: the input list
+// tensor: the withdrawn last element of the list
+// element_dtype: the type of elements in the list
+// element_shape: the shape of the output tensor
+func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "TensorListPopBack",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			input_handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
 
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16284,9 +16221,9 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
+		Type: "MaxPoolGradGrad",
 		Input: []tf.Input{
-			input,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -16294,42 +16231,45 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
-// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
-type Conv3DBackpropInputAttr func(optionalAttr)
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
 
-// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+//
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// Gather specific elements from the TensorArray into output `value`.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -16337,204 +16277,229 @@ func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_ba
 	return op.Output(0)
 }
 
-// Subtracts sparse updates from the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] -= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] -= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// Returns x / y element-wise for integer types.
 //
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
 //
-// Returns the created operation.
-func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterSub",
+		Type: "TruncateDiv",
 		Input: []tf.Input{
-			resource, indices, updates,
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// Restores tensors from a V2 checkpoint.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' as FOBOS algorithm with fixed learning rate.
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
 //
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
+//
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
 //
-// Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
+		Type: "RestoreV2",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
+			prefix, tensor_names, shape_and_slices,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
+	}
+	return tensors
 }
 
-// Returns 0 if the denominator is zero.
-//
-//
-// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Receives a tensor value broadcast from another device.
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "DivNoNan",
-		Input: []tf.Input{
-			x, y,
-		},
+		Type: "CollectiveBcastRecv",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// Decode web-safe base64-encoded strings.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
+//
+// Arguments:
+//	input: Base64 strings to decode.
+//
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
+		Type: "DecodeBase64",
 		Input: []tf.Input{
-			y, dy,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the value of the tensor specified by its handle.
+// Store the input tensor in the state of the current session.
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+//	value: The tensor to be stored.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "GetSessionHandle",
 		Input: []tf.Input{
-			handle,
+			value,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x - y element-wise.
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Sub",
+		Type: "ResourceSparseApplyProximalAdagrad",
 		Input: []tf.Input{
-			x, y,
+			var_, accum, lr, l1, l2, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
 
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Use RandomPoissonV2 instead.
+// Computes gradients of max pooling function.
 //
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "MaxPool3DGrad",
 		Input: []tf.Input{
-			shape, rate,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -16542,107 +16507,119 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
-//
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns the name of the device on which `resource` has been placed.
+func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Maximum",
+		Type: "ExperimentalIteratorGetDevice",
 		Input: []tf.Input{
-			x, y,
+			resource,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
+
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
 //
-// Inputs are the logits, not probabilities.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "SparseReduceSum",
 		Input: []tf.Input{
-			features, labels,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
 
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// value: If `True`, retain reduced dimensions with length `1`.
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
 // If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["adjoint_a"] = value
 	}
 }
 
-// ReduceJoinSeparator sets the optional separator attribute to value.
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
 //
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["adjoint_b"] = value
 	}
 }
 
-// Joins a string Tensor across the given dimensions.
-//
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
-// indices are not specified, joins across all dimensions beginning from `n - 1`
-// through `0`.
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
 //
-// For example:
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> "acbd"
-// tf.reduce_join(a, [1, 0]) ==> "abcd"
-// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
-// ```
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
 //
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
-//
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16651,9 +16628,9 @@ func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "SparseTensorDenseMatMul",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			a_indices, a_values, a_shape, b,
 		},
 		Attrs: attrs,
 	}
@@ -16661,78 +16638,46 @@ func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, opt
 	return op.Output(0)
 }
 
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
 
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["use_locking"] = value
 	}
 }
 
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+// Update '*var' according to the RMSProp algorithm.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Gradient for batch normalization.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16741,293 +16686,312 @@ func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return scope.AddOperation(opspec)
 }
 
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
 
-// TopKSorted sets the optional sorted attribute to value.
+// SerializeManySparseOutType sets the optional out_type attribute to value.
 //
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["out_type"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 //
-// If two elements are equal, the lower-index element appears first.
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
 //
-// If `k` varies dynamically, use `TopKV2` below.
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
-//
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "SerializeManySparse",
 		Input: []tf.Input{
-			input,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
-//
-//
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "Acosh",
 		Input: []tf.Input{
-			x, q,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a list of tensors with the same shapes and contents as the input
-//
-// tensors.
-//
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
-//
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
+
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
+
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
 //
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IdentityN",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
 
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
+
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+// Performs max pooling on the input.
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "MaxPoolV2",
 		Input: []tf.Input{
-			x, y,
+			input, ksize, strides,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
+		m["container"] = value
 	}
 }
 
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
+		m["shared_name"] = value
 	}
 }
 
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 //
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
-	}
-}
-
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
 //
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
-	}
-}
-
-// Transforms a spectrogram into a form that's useful for speech recognition.
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
+//
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
+//
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17036,9 +17000,9 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mfcc",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			spectrogram, sample_rate,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -17046,98 +17010,91 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 	return op.Output(0)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+// Concatenates tensors along one dimension.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
 //
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatV2",
+		Input: []tf.Input{
+			tf.OutputList(values), axis,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReadFile",
+		Input: []tf.Input{
+			filename,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns immutable tensor from memory region.
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// The current implementation memmaps the tensor from a file.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
-		Input: []tf.Input{
-			tag, tensor,
-		},
+		Type: "ImmutableConst",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
 
-// QrFullMatrices sets the optional full_matrices attribute to value.
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["separator"] = value
 	}
-}
-
-// Computes the QR decompositions of one or more matrices.
-//
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+}
+
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// with the given separator (default is an empty separator).
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-//
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17146,26 +17103,33 @@ func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Creates and returns an empty tensor list.
+//
+// All list elements must be tensors of dtype element_dtype and shape compatible
+// with element_shape.
+//
+// handle: an empty tensor list.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "BytesProducedStatsDataset",
+		Type: "EmptyTensorList",
 		Input: []tf.Input{
-			input_dataset, tag,
+			element_shape, max_num_elements,
 		},
 		Attrs: attrs,
 	}
@@ -17173,66 +17137,66 @@ func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Out
 	return op.Output(0)
 }
 
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is the
-// regular expression to be matched with every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
-//
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// Computes softsign gradients for a softsign operation.
 //
 // Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: The regular expression to match the input.
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
 //
-// Returns A bool tensor with the same shape as `input`.
-func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pattern": pattern}
 	opspec := tf.OpSpec{
-		Type: "StaticRegexFullMatch",
+		Type: "SoftsignGrad",
 		Input: []tf.Input{
-			input,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// Provides the time since epoch in seconds.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+//
+// Note: the timestamp is computed when the op is executed, not when it is added
+// to the graph.
+func Timestamp(scope *Scope) (ts tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Timestamp",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
+
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["out_type"] = value
 	}
 }
 
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+// Returns the shape of the variable pointed to by `resource`.
 //
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// For example:
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17241,53 +17205,56 @@ func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, al
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
+		Type: "VariableShape",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
 
-// MeanKeepDims sets the optional keep_dims attribute to value.
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the mean of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mean",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			input, axis,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -17295,100 +17262,127 @@ func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (
 	return op.Output(0)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
-
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
+// pruning away boxes that have high overlaps
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
+// which allows for defining a custom overlap criterium (eg. intersection over union,
+// intersection over area, etc.).
 //
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
-	return func(m optionalAttr) {
-		m["vocab_size"] = value
-	}
-}
-
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
 //
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
-	return func(m optionalAttr) {
-		m["delimiter"] = value
+//   selected_indices = tf.image.non_max_suppression_with_overlaps(
+//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
+// the n-by-n box overlap values.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NonMaxSuppressionWithOverlaps",
+		Input: []tf.Input{
+			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Initializes a table from a text file.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
 //
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
 //
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Input: []tf.Input{
+			features, labels,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
+		Type: "LogicalNot",
 		Input: []tf.Input{
-			table_handle, filename,
+			x,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Real-valued fast Fourier transform.
+// 3D real-valued fast Fourier transform.
 //
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
 //
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
 // corresponding dimension of `input`, the dimension is cropped. If it is larger,
 // the dimension is padded with zeros.
 //
 // Arguments:
 //	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
 //
 // @compatibility(numpy)
-// Equivalent to np.fft.rfft
+// Equivalent to np.fft.rfftn with 3 dimensions.
 // @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
+		Type: "RFFT3D",
 		Input: []tf.Input{
 			input, fft_length,
 		},
@@ -17397,362 +17391,376 @@ func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output
 	return op.Output(0)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
 
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
+//
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
 //
-// Arguments:
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
+
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["identical_element_shapes"] = value
+	}
+}
+
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+//
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
+// If not specified, defaults to ""
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// An array of Tensors of given size.
+//
+// Write data via Write and read via Read or Pack.
+//
+// Arguments:
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
+//
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
+		Type: "TensorArrayV3",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-//
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
-//
-// Reshaping does not affect the order of values in the SparseTensor.
+// Runs multiple additive regression ensemble predictors on input instances and
 //
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+// computes the logits. It is designed to be used during prediction.
+// It traverses all the trees and calculates the final score for each instance.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Output rank 2 Tensor containing logits for each example.
+func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "BoostedTreesPredict",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySplitV3
+// Elementwise computes the bitwise OR of `x` and `y`.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
+		Type: "BitwiseOr",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
+
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+	return func(m optionalAttr) {
+		m["fast"] = value
+	}
+}
+
+// Solves one or more linear least-squares problems.
 //
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
 //
-// Reordering does not affect the shape of the SparseTensor.
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
 //
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
+// If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseReorder",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			features,
+			matrix, rhs, l2_regularizer,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
 
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["data_format"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
-type CudnnRNNBackpropAttr func(optionalAttr)
-
-// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNBackpropDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNBackpropSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
 
-// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dilations"] = value
 	}
 }
 
-// Backprop step of CudnnRNN.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-// Compute the backprop of both data and weights in a RNN.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     the actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-//     pass.
-// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-//     pass.
-// reserve_space: The same reserve_space produced in for forward operation.
-// input_backprop: The backprop to input in the forward pass. Has the same shape
-//     as input.
-// input_h_backprop: The backprop to input_h in the forward pass. Has the same
-//     shape as input_h.
-// input_c_backprop: The backprop to input_c in the forward pass. Has the same
-//     shape as input_c.
-// params_backprop: The backprop to the params buffer in the forward pass. Has the
-//     same shape as params.
-func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNBackprop",
+		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
-			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// UpperBoundAttr is an optional argument to UpperBound.
-type UpperBoundAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// UpperBoundOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["data_format"] = value
 	}
 }
 
-// Applies upper_bound(sorted_search_values, values) along each row.
-//
-// Each set of rows with the same index in (sorted_inputs, values) is treated
-// independently.  The resulting row is the equivalent of calling
-// `np.searchsorted(sorted_inputs, values, side='right')`.
-//
-// The result is not a global index to the entire
-// `Tensor`, but rather just the index in the last dimension.
-//
-// A 2-D example:
-//   sorted_sequence = [[0, 3, 9, 9, 10],
-//                      [1, 2, 3, 4, 5]]
-//   values = [[2, 4, 9],
-//             [0, 2, 6]]
-//
-//   result = UpperBound(sorted_sequence, values)
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
 //
-//   result == [[1, 2, 4],
-//              [0, 2, 5]]
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	sorted_inputs: 2-D Tensor where each row is ordered.
-//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
-// the values that will be searched for in `sorted_search_values`.
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
-// into the last dimension where values can be inserted without changing the
-// ordered property.
-func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UpperBound",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			sorted_inputs, values,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -17760,41 +17768,69 @@ func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optiona
 	return op.Output(0)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
-
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
 //
-// `index  0  1  2  3  4`
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
 //
-// `value  20 5  16 3  7`
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucketStrong",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StringLengthAttr is an optional argument to StringLength.
+type StringLengthAttr func(optionalAttr)
+
+// StringLengthUnit sets the optional unit attribute to value.
+//
+// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
+// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
+// encoded Unicode code points in each string).  Results are undefined
+// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
+// valid UTF-8.
+// If not specified, defaults to "BYTE"
+func StringLengthUnit(value string) StringLengthAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["unit"] = value
 	}
 }
 
-// Computes gradient of the FractionalMaxPool function.
+// String lengths of `input`.
+//
+// Computes the length of each string given in the input tensor.
 //
 // Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	input: The string for which to compute the length.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// Returns Integer tensor that has the same shape as `input`. The output contains the
+// element-wise string lengths of `input`.
+func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17803,9 +17839,9 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "StringLength",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -17813,34 +17849,34 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 	return op.Output(0)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
 
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the proximal adagrad scheme.
+// Update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
+//	alpha: Scaling factor. Must be a scalar.
 //	l1: L1 regularization. Must be a scalar.
 //	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	delta: The change.
 //
 // Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17849,115 +17885,116 @@ func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			var_, alpha, l1, l2, delta,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset containing elements of first component of `input_dataset` having true in the last component.
-func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
+// Returns 0 if the denominator is zero.
+//
+//
+// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "FilterByLastComponentDataset",
+		Type: "DivNoNan",
 		Input: []tf.Input{
-			input_dataset,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
-type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
-
-// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
+// Computes the gradient for the sqrt of `x` wrt its input.
+//
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
+	opspec := tf.OpSpec{
+		Type: "SqrtGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
+// Get the value of the tensor specified by its handle.
+//
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
+//
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "GetSessionTensor",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
+// Returns x - y element-wise.
+//
+// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
+
+// RandomPoissonSeed sets the optional seed attribute to value.
 // If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
 // If not specified, defaults to 0
-func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Converts CudnnRNN params from canonical form to usable form.
-//
-// Writes a set of weights into the opaque params buffer so they can be used in
-// upcoming training or inferences.
-//
-// Note that the params buffer may not be compatible across different GPUs. So any
-// save and restoration should be converted to and from the canonical weights and
-// biases.
-//
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// weights: the canonical form of weights that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// biases: the canonical form of biases that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// num_params: number of parameter sets for all layers.
-//     Each layer may contain multiple parameter sets, with each set consisting of
-//     a weight matrix and a bias vector.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     The actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//     dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
+		m["seed2"] = value
+	}
+}
+
+// Use RandomPoissonV2 instead.
+//
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17966,9 +18003,9 @@ func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNCanonicalToParams",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -17976,41 +18013,107 @@ func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.
 	return op.Output(0)
 }
 
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+//
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Maximum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// value: If true, retain reduced dimensions with length 1.
+// Inputs are the logits, not probabilities.
+//
+// Arguments:
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
+//
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SoftmaxCrossEntropyWithLogits",
+		Input: []tf.Input{
+			features, labels,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
+
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If `True`, retain reduced dimensions with length `1`.
 // If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins a string Tensor across the given dimensions.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
+// indices are not specified, joins across all dimensions beginning from `n - 1`
+// through `0`.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> "acbd"
+// tf.reduce_join(a, [1, 0]) ==> "abcd"
+// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
+// ```
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18019,265 +18122,348 @@ func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
-//
-//
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "Cos",
 		Input: []tf.Input{
-			input_dataset, count,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
 
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["epsilon"] = value
 	}
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["data_format"] = value
 	}
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
 //
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
 //
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// Arguments:
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedBatchNormGrad",
+		Input: []tf.Input{
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
+
+// TopKSorted sets the optional sorted attribute to value.
+//
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
+	}
+}
+
+// Finds values and indices of the `k` largest elements for the last dimension.
+//
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"k": k}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "TopK",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Concatenates tensors along one dimension.
+// The gradient operator for the SparseAdd op.
+//
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
 //
 // Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConcatV2",
+		Type: "SparseAddGrad",
 		Input: []tf.Input{
-			tf.OutputList(values), axis,
+			backprop_val_grad, a_indices, b_indices, sum_indices,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+// Returns a list of tensors with the same shapes and contents as the input
+//
+// tensors.
+//
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
+//
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
+//
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReadFile",
+		Type: "IdentityN",
 		Input: []tf.Input{
-			filename,
+			tf.OutputList(input),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
+	}
+	return output
 }
 
-// Multiplies sparse updates into the variable referenced by `resource`.
-//
-// This operation computes
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-//     # Scalar indices
-//     ref[indices, ...] *= updates[...]
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the centered RMSProp algorithm.
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] *= updates[i, ...]
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterMul",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Component-wise divides a SparseTensor by a dense Tensor.
-//
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
-//
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "ResourceApplyCenteredRMSProp",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes gradient of the FractionalAvgPool function.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18286,225 +18472,221 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns element-wise integer closest to x.
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
+// Arguments:
 //
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Rint",
+		Type: "BatchDataset",
 		Input: []tf.Input{
-			x,
+			input_dataset, batch_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
 
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// RandomPoissonV2Seed sets the optional seed attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["seed"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["seed2"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["dtype"] = value
 	}
 }
 
-// Op removes and returns the (key, value) element with the smallest
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+//
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
+//
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
+//
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
+		Type: "RandomPoissonV2",
 		Input: []tf.Input{
-			indices,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
-	}
-	return key, values
+	return op.Output(0)
 }
 
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
 
-// SerializeManySparseOutType sets the optional out_type attribute to value.
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["channels"] = value
 	}
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
-//
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
-//
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acosh",
-		Input: []tf.Input{
-			x,
-		},
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
-
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["fancy_upscaling"] = value
 	}
 }
 
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
 // If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["try_recover_truncated"] = value
 	}
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["acceptable_fraction"] = value
 	}
 }
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
 // If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["dct_method"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "DecodeAndCropJpeg",
 		Input: []tf.Input{
-			size,
+			contents, crop_window,
 		},
 		Attrs: attrs,
 	}
@@ -18512,64 +18694,65 @@ func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
 //
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
 //
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// DecodeCSVNaValue sets the optional na_value attribute to value.
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
 //
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["na_value"] = value
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodeCSVSelectCols sets the optional select_cols attribute to value.
-// If not specified, defaults to <>
-func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
+
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
 	return func(m optionalAttr) {
-		m["select_cols"] = value
+		m["out_type"] = value
 	}
 }
 
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or an empty vector if the column is
-// required.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18578,195 +18761,239 @@ func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
 
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
 //
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["header_bytes"] = value
 	}
 }
 
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+//
+// value: Number of bytes in the footer, defaults to 0.
 // If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["footer_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
 //
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["hop_bytes"] = value
 	}
 }
 
-// MapClearContainer sets the optional container attribute to value.
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
 // If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapClearSharedName sets the optional shared_name attribute to value.
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
 // If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
 //
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["encoding"] = value
+	}
+}
+
+// A Reader that outputs fixed-length records from a file.
+//
+// Arguments:
+//	record_bytes: Number of bytes in the record.
+//
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapClear",
+		Type: "FixedLengthRecordReaderV2",
 
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AudioSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// Computes the QR decompositions of one or more matrices.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "Qr",
 		Input: []tf.Input{
-			true_classes,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
-
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// Check if the input matches the regex pattern.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// The input is a string tensor of any shape. The pattern is the
+// regular expression to be matched with every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
+//
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: A string tensor of the text to be processed.
+//	pattern: The regular expression to match the input.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+// Returns A bool tensor with the same shape as `input`.
+func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"pattern": pattern}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "StaticRegexFullMatch",
 		Input: []tf.Input{
-			input, ksize, strides,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -18774,229 +19001,291 @@ func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output
 	return op.Output(0)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
 
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["use_locking"] = value
 	}
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyProximalGradientDescent",
+		Input: []tf.Input{
+			var_, alpha, l1, l2, grad, indices,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+// Real-valued fast Fourier transform.
 //
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RFFT",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+// Adds a value to the current value of a variable.
 //
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AssignAddVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
-//
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
+
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
 	return func(m optionalAttr) {
-		m["max_load_factor"] = value
+		m["out_type"] = value
 	}
 }
 
-// Creates an empty hash table that uses tensors as the backing store.
-//
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// Computes Quantized Rectified Linear: `max(features, 0)`
 //
 // Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
-//	value_dtype: Type of the table values.
 //
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "QuantizedRelu",
 		Input: []tf.Input{
-			empty_key,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Inverse fast Fourier transform.
+// Reshapes a SparseTensor to represent values in a new dense shape.
 //
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+// Reshaping does not affect the order of values in the SparseTensor.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT",
+		Type: "SparseReshape",
 		Input: []tf.Input{
-			input,
+			input_indices, input_shape, new_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// 2D fast Fourier transform.
-//
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
+// Deprecated. Use TensorArraySplitV3
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT2D",
+		Type: "TensorArraySplitV2",
 		Input: []tf.Input{
-			input,
+			handle, value, lengths, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D fast Fourier transform.
+// Reorders a SparseTensor into the canonical, row-major ordering.
 //
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// Reordering does not affect the shape of the SparseTensor.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
+		Type: "SparseReorder",
 		Input: []tf.Input{
-			input,
+			input_indices, input_values, input_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu",
+		Input: []tf.Input{
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
 
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// Update '*var' according to the AddSign update.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
+//	m: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
 //	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19005,453 +19294,490 @@ func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncateMod",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			x, y,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
+
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// `index  0  1  2  3  4`
 //
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalAvgPool function.
+//
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalAvgPoolGrad",
+		Input: []tf.Input{
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			input, fft_length,
+			a_indices, a_values, a_shape, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
-
-// DecodeJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
 
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
 // If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
-
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
-//
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
-	}
-}
-
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
 //
-// Accepted values are:
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
 //
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			contents,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StageSizeAttr is an optional argument to StageSize.
-type StageSizeAttr func(optionalAttr)
-
-// StageSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageSizeCapacity(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
 
-// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
 //
-// REQUIRES: value >= 0
-func StageSizeMemoryLimit(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageSizeContainer sets the optional container attribute to value.
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
 // If not specified, defaults to ""
-func StageSizeContainer(value string) StageSizeAttr {
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// StageSizeSharedName sets the optional shared_name attribute to value.
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
 // If not specified, defaults to ""
-func StageSizeSharedName(value string) StageSizeAttr {
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "StageSize",
+}
 
-		Attrs: attrs,
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+//
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Produces the max pool of the input tensor for quantized types.
-//
-// Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
-		Input: []tf.Input{
-			input, min_input, max_input,
-		},
-		Attrs: attrs,
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softplus",
-		Input: []tf.Input{
-			features,
-		},
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+//
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["max_load_factor"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
+// Creates an empty hash table that uses tensors as the backing store.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
+//
+// Arguments:
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Expm1",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			x,
+			empty_key, deleted_key,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
+// UpperBoundAttr is an optional argument to UpperBound.
+type UpperBoundAttr func(optionalAttr)
+
+// UpperBoundOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Applies upper_bound(sorted_search_values, values) along each row.
 //
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// Each set of rows with the same index in (sorted_inputs, values) is treated
+// independently.  The resulting row is the equivalent of calling
+// `np.searchsorted(sorted_inputs, values, side='right')`.
+//
+// The result is not a global index to the entire
+// `Tensor`, but rather just the index in the last dimension.
+//
+// A 2-D example:
+//   sorted_sequence = [[0, 3, 9, 9, 10],
+//                      [1, 2, 3, 4, 5]]
+//   values = [[2, 4, 9],
+//             [0, 2, 6]]
+//
+//   result = UpperBound(sorted_sequence, values)
+//
+//   result == [[1, 2, 4],
+//              [0, 2, 5]]
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//	sorted_inputs: 2-D Tensor where each row is ordered.
+//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
+// the values that will be searched for in `sorted_search_values`.
+//
+// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
+// into the last dimension where values can be inserted without changing the
+// ordered property.
+func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "UpperBound",
 		Input: []tf.Input{
-			reader_handle,
+			sorted_inputs, values,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
+// `index  0  1  2  3  4`
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// `value  20 5  16 3  7`
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalMaxPool function.
 //
 // Arguments:
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "FractionalMaxPoolGrad",
 		Input: []tf.Input{
-			data, segment_ids,
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
-//
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-// Note also that the order of filenames returned can be non-deterministic.
+// Update '*var' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			pattern,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
 
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Return histogram of values.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
 //
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
-//
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19460,501 +19786,428 @@ func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			values, value_range, nbins,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
+//
+// Arguments:
+//
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			x, y,
+			input_dataset, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
-
-// Conv3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
-//
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
-//
-// Our Conv3D implements a form of cross-correlation.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "ReciprocalGrad",
 		Input: []tf.Input{
-			input, filter,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
-//
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
-//
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
-//
-// Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "Minimum",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
 
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+//
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["upper_frequency_limit"] = value
 	}
 }
 
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+//
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["round_mode"] = value
+		m["lower_frequency_limit"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-//
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-//
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-//
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-//
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-//
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
 //
-// Now we can quantize the elements of our tensor:
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["filterbank_channel_count"] = value
+	}
+}
+
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
 //
-// ```c++
-// result = round(input * s)
-// ```
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["dct_coefficient_count"] = value
+	}
+}
+
+// Transforms a spectrogram into a form that's useful for speech recognition.
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
 //
 // Arguments:
-//
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "Mfcc",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			spectrogram, sample_rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "Zeta",
 		Input: []tf.Input{
-			x, y,
+			x, q,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+// Inverse fast Fourier transform.
 //
-// Arguments:
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
 //
+// Arguments:
+//	input: A complex64 tensor.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "IFFT",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+// 2D fast Fourier transform.
 //
-// Arguments:
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a batch.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
+// Arguments:
+//	input: A complex64 tensor.
 //
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
 //
-func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchDatasetV2",
+		Type: "FFT2D",
 		Input: []tf.Input{
-			input_dataset, batch_size, drop_remainder,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// Inverse 2D fast Fourier transform.
+//
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT2D",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncateMod",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
+// Inverse 2D real-valued fast Fourier transform.
 //
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
+//
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
-type StatelessMultinomialAttr func(optionalAttr)
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
 
-// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["output_dtype"] = value
+		m["channels"] = value
 	}
 }
 
-// Draws samples from a multinomial distribution.
+// DecodeJpegRatio sets the optional ratio attribute to value.
 //
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//	seed: 2 seeds (shape [2]).
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "StatelessMultinomial",
-		Input: []tf.Input{
-			logits, num_samples, seed,
-		},
-		Attrs: attrs,
+}
+
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
-
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["dct_method"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
+// Decode a JPEG-encoded image to a uint8 tensor.
 //
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// Accepted values are:
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			resource, indices,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -19962,68 +20215,58 @@ func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype t
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
-//
-// This enables the user to close and release the resource in the middle
-// of a step/run.
-//
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+// Inverse 3D real-valued fast Fourier transform.
 //
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Saves the input tensors to disk.
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
 //
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// See also `SaveSlices`.
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "IRFFT3D",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			input, fft_length,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
-//
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// Returns the truth value of (x != y) element-wise.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorMod",
+		Type: "NotEqual",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -20032,517 +20275,505 @@ func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
-
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
-//
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_a"] = value
-	}
-}
-
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
-//
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
-//
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			input, min_input, max_input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
-//
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Softplus",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes exponential of x - 1 element-wise.
 //
-// Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "Expm1",
 		Input: []tf.Input{
-			serialized_sparse,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
-//
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// Returns the number of records this Reader has produced.
 //
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "ReaderNumRecordsProducedV2",
 		Input: []tf.Input{
-			input, fft_length,
+			reader_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
+// Returns the set of files matching one or more glob patterns.
 //
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
 //
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
 //
-// then the output will be
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatchingFiles",
+		Input: []tf.Input{
+			pattern,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
+
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Return histogram of values.
 //
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
 //
-// Graphically this is equivalent to doing
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "HistogramFixedWidth",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			values, value_range, nbins,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
-//
-// then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
+// Returns the truth value of (x >= y) element-wise.
 //
-// if hashed_output=true then the output will be
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GreaterEqual",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
 //
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DDilations sets the optional dilations attribute to value.
 //
-// Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
+// Our Conv3D implements a form of cross-correlation.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Arguments:
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			input, filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
-
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
 //
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "SparseDenseCwiseAdd",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			sp_indices, sp_values, sp_shape, dense,
 		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
-
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["mode"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["round_mode"] = value
 	}
 }
 
-// Creates an empty hash table.
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// The gradient operator for the SparseSlice op.
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8: out[i] -= (range(T) + 1) / 2.0
+// ```
 //
-// This op takes in the upstream gradient w.r.t. non-empty values of
-// the sliced `SparseTensor`, and outputs the gradients w.r.t.
-// the non-empty values of input `SparseTensor`.
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+//
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+//
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+//
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+//
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
+//
+// Now we can quantize the elements of our tensor:
+//
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
 //
 // Arguments:
-//	backprop_val_grad: 1-D. The gradient with respect to
-// the non-empty values of the sliced `SparseTensor`.
-//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
-//	input_start: 1-D. tensor represents the start of the slice.
-//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
 //
-// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
-func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"T": T}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSliceGrad",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			backprop_val_grad, input_indices, input_start, output_indices,
+			input, min_range, max_range,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the gradient of the sigmoid of `x` wrt its input.
+// Returns the truth value of (x < y) element-wise.
 //
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
+		Type: "Less",
 		Input: []tf.Input{
-			y, dy,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert one or more images from HSV to RGB.
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
+
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Arguments:
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
 //
-// Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			images,
+			features, max_value, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset by applying optimizations to `input_dataset`.
-//
-// Creates a dataset by applying optimizations to `input_dataset`.
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
 //
+//	batch_size: A scalar representing the number of elements to accumulate in a batch.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
 //
-func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//
+func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "OptimizeDataset",
+		Type: "BatchDatasetV2",
 		Input: []tf.Input{
-			input_dataset, optimizations,
+			input_dataset, batch_size, drop_remainder,
 		},
 		Attrs: attrs,
 	}
@@ -20550,118 +20781,136 @@ func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Out
 	return op.Output(0)
 }
 
-// Returns the element-wise min of two SparseTensors.
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
+
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2D convolution given quantized 4D input and filter tensors.
+//
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
 
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// Draws samples from a multinomial distribution.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
-//
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
-//
-// ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-// ```
-//
-// and
-//
-// ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-// ```
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// then the final `SparseTensor` will be:
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-// ```
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20670,161 +20919,185 @@ func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "ResourceGather",
 		Input: []tf.Input{
-			sparse_handles,
+			resource, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Assigns a new value to a variable.
+// Delete the TensorArray from its resource container.
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
 // Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
+		Type: "TensorArrayCloseV3",
 		Input: []tf.Input{
-			resource, value,
+			handle,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Strip leading and trailing whitespaces from the Tensor.
+// Saves the input tensors to disk.
+//
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
+//
+// See also `SaveSlices`.
 //
 // Arguments:
-//	input: A string `Tensor` of any shape.
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
 //
-// Returns A string `Tensor` of the same shape as the input.
-func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StringStrip",
+		Type: "Save",
 		Input: []tf.Input{
-			input,
+			filename, tensor_names, tf.OutputList(data),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
 //
-// Arguments:
-//	x: a tensor of type T.
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "FloorMod",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient of SparseFillEmptyRows.
-//
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
-//
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
 //
 // Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
+		Type: "Dilation2DBackpropFilter",
 		Input: []tf.Input{
-			reverse_index_map, grad_values,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
-//
-// if < 0, `scale * features` otherwise.
-//
-// To be used together with
-// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
-// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
 //
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+// tensor: The tensor to put on the list.
+// input_handle: The old list.
+// output_handle: A list with the elements of the old list followed by tensor.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Selu",
+		Type: "TensorListPushBack",
 		Input: []tf.Input{
-			features,
+			input_handle, tensor,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
 
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["container"] = value
 	}
 }
 
-// Number of unique elements along last dimension of input `set`.
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20833,9 +21106,9 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "AddSparseToTensorsMap",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -20843,323 +21116,298 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 	return op.Output(0)
 }
 
-// Computes the sign and the log of the absolute value of the determinant of
-//
-// one or more square matrices.
-//
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
-//
-// Arguments:
-//	input: Shape is `[N, M, M]`.
-//
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
-// The indicator function
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
 //
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// For example:
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
 //
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
 //
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
+// and
 //
-// Useful special cases:
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
 //
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
+// then the final deserialized `SparseTensor` will be:
 //
-// Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			serialized_sparse,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
-
-// SumKeepDims sets the optional keep_dims attribute to value.
+// Elementwise computes the bitwise AND of `x` and `y`.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the sum of elements across dimensions of a tensor.
+// Inverse real-valued fast Fourier transform.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			input, axis,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EnterAttr is an optional argument to Enter.
-type EnterAttr func(optionalAttr)
-
-// EnterIsConstant sets the optional is_constant attribute to value.
+// Concatenates a list of `SparseTensor` along the specified dimension.
 //
-// value: If true, the output is constant within the child frame.
-// If not specified, defaults to false
-func EnterIsConstant(value bool) EnterAttr {
-	return func(m optionalAttr) {
-		m["is_constant"] = value
-	}
-}
-
-// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
 //
-// value: The number of iterations allowed to run in parallel.
-// If not specified, defaults to 10
-func EnterParallelIterations(value int64) EnterAttr {
-	return func(m optionalAttr) {
-		m["parallel_iterations"] = value
-	}
-}
-
-// Creates or finds a child frame, and makes `data` available to the child frame.
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
 //
-// This op is used together with `Exit` to create loops in the graph.
-// The unique `frame_name` is used by the `Executor` to identify frames. If
-// `is_constant` is true, `output` is a constant in the child frame; otherwise
-// it may be changed in the child frame. At most `parallel_iterations` iterations
-// are run in parallel in the child frame.
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
 //
 // Arguments:
-//	data: The tensor to be made available to the child frame.
-//	frame_name: The name of the child frame.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
 //
-// Returns The same tensor as `data`.
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"frame_name": frame_name}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "Enter",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			data,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Add all input tensors element wise.
+// Generates sparse cross from a list of sparse and dense tensors.
+//
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
+//
+// For example, if the inputs are
+//
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
 //
 // Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
+//
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "AddN",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TryRpcAttr is an optional argument to TryRpc.
-type TryRpcAttr func(optionalAttr)
-
-// TryRpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func TryRpcProtocol(value string) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// TryRpcFailFast sets the optional fail_fast attribute to value.
-//
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func TryRpcFailFast(value bool) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["fail_fast"] = value
-	}
-}
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
 
-// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
-//
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
 //
-// Unlike the standard `Rpc` op, if the connection fails or the remote worker
-// returns an error status, this op does **not** reraise the exception.
-// Instead, the `status_code` and `status_message` entry for the corresponding RPC
-// call is set with the error returned from the RPC call.  The `response` tensor
-// will contain valid response values for those minibatch entries whose RPCs did
-// not fail; the rest of the entries will have empty strings.
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
-// returned from the RPC calls.
-func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21168,362 +21416,264 @@ func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TryRpc",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			address, method, request,
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Delete the tensor specified by its handle in the session.
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
-		Input: []tf.Input{
-			handle,
-		},
+}
+
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// L2 Loss.
-//
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// Creates an empty hash table.
 //
-//     output = sum(t ** 2) / 2
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "L2Loss",
-		Input: []tf.Input{
-			t,
-		},
+		Type: "MutableHashTableOfTensorsV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
-
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
+// The gradient operator for the SparseSlice op.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// This op takes in the upstream gradient w.r.t. non-empty values of
+// the sliced `SparseTensor`, and outputs the gradients w.r.t.
+// the non-empty values of input `SparseTensor`.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
-//
+//	backprop_val_grad: 1-D. The gradient with respect to
+// the non-empty values of the sliced `SparseTensor`.
+//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
+//	input_start: 1-D. tensor represents the start of the slice.
+//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
+func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "SparseSliceGrad",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			backprop_val_grad, input_indices, input_start, output_indices,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Subtracts a value from the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the decremented value or a subsequent newer one.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// Computes the gradient of the sigmoid of `x` wrt its input.
 //
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
+		Type: "SigmoidGrad",
 		Input: []tf.Input{
-			resource, value,
+			y, dy,
 		},
 	}
-	return scope.AddOperation(opspec)
-}
-
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
-
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
-//
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
-	return func(m optionalAttr) {
-		m["preferred_shard"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Restores a tensor from checkpoint files.
-//
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
+// Convert one or more images from HSV to RGB.
 //
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// See also `RestoreSlice`.
+// See `rgb_to_hsv` for a description of the HSV encoding.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
 //
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "HSVToRGB",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			images,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
-
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
+// Creates a dataset by applying optimizations to `input_dataset`.
 //
-// Input images and output images must be quantized types.
+// Creates a dataset by applying optimizations to `input_dataset`.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
+//	input_dataset: A variant tensor representing the input dataset.
+//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
 //
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
+		Type: "OptimizeDataset",
 		Input: []tf.Input{
-			images, size, min, max,
+			input_dataset, optimizations,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+// Returns the element-wise min of two SparseTensors.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMin",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			data, segment_ids,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: Whether to use Adaptive SDCA for the inner loop.
-// If not specified, defaults to true
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["capacity"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
-//
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
-//
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
-//
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns a random (key, value)
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "MapUnstageNoKey",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			indices,
 		},
 		Attrs: attrs,
 	}
@@ -21533,331 +21683,325 @@ func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feat
 	}
 	var idx int
 	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
 		return
 	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+	return key, values
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
 
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
+// HashTableV2Container sets the optional container attribute to value.
 //
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["lower"] = value
+		m["container"] = value
 	}
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to scipy.linalg.solve_triangular
-// @end_compatibility
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
 // If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
-//
-// backsubstitution.
-//
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+// Creates a non-initialized hash table.
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
-		Input: []tf.Input{
-			matrix, rhs,
-		},
+		Type: "HashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Asinh",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 //
-// Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
+//
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
+//
+// and
+//
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
+//
+// Arguments:
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
+//
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			start, stop, step,
+			sparse_handles,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Stops gradient computation.
-//
-// When executed in a graph, this op outputs its input tensor as-is.
+// Assigns a new value to a variable.
 //
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
 //
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value to set the new tensor to use.
 //
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StopGradient",
+		Type: "AssignVariableOp",
 		Input: []tf.Input{
-			input,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Eagerly executes a python function to compute func(input)->output. The
+// Strip leading and trailing whitespaces from the Tensor.
 //
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
-	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
-		Input: []tf.Input{
-			tf.OutputList(input),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
+// Arguments:
+//	input: A string `Tensor` of any shape.
+//
+// Returns A string `Tensor` of the same shape as the input.
+func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "StringStrip",
+		Input: []tf.Input{
+			input,
+		},
 	}
-	return output
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adds sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] += updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// Returns a tensor of ones with the same shape and type as x.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	x: a tensor of type T.
 //
-// Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
+		Type: "OnesLike",
 		Input: []tf.Input{
-			resource, indices, updates,
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
+// The gradient of SparseFillEmptyRows.
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "SparseFillEmptyRowsGrad",
 		Input: []tf.Input{
-			predictions, targets,
+			reverse_index_map, grad_values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// if < 0, `scale * features` otherwise.
+//
+// To be used together with
+// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
+// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+//
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "Selu",
 		Input: []tf.Input{
-			x, y,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
 
-// RandomGammaSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
+// Number of unique elements along last dimension of input `set`.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21866,9 +22010,9 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "SetSize",
 		Input: []tf.Input{
-			shape, alpha,
+			set_indices, set_values, set_shape,
 		},
 		Attrs: attrs,
 	}
@@ -21876,294 +22020,318 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+// Computes the sign and the log of the absolute value of the determinant of
 //
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
+// one or more square matrices.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
 //
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
+// Arguments:
+//	input: Shape is `[N, M, M]`.
 //
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogMatrixDeterminant",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// Arguments:
+// to zero.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
 //
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
+//
+// For example:
+//
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
+//
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "MatrixBandPart",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			input, num_lower, num_upper,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
-//
-// `num_segments` should equal the number of distinct segment IDs.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
+// Delete the tensor specified by its handle in the session.
 //
 // Arguments:
+//	handle: The handle for a tensor stored in the session state.
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "DeleteSessionTensor",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the product of all
-// entries belonging to a segment such that:
-//
-// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
-// `j...` such that `segment_ids[j...] == i`.
+// L2 Loss.
 //
-// If there is no entry for a given segment ID `i`, it outputs 1.
+// Computes half the L2 norm of a tensor without the `sqrt`:
 //
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
+//     output = sum(t ** 2) / 2
 //
 // Arguments:
+//	t: Typically 2-D, but may have any dimensions.
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentProd",
+		Type: "L2Loss",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			t,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
+
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "DenseToSparseSetOperation",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			set1, set2_indices, set2_values, set2_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Deserializes a serialized tree ensemble config and replaces current tree
+// Subtracts a value from the current value of a variable.
 //
-// ensemble.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the decremented value or a subsequent newer one.
 //
 // Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//	stamp_token: Token to use as the new value of the resource stamp.
-//	tree_ensemble_serialized: Serialized proto of the ensemble.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
 // Returns the created operation.
-func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesDeserializeEnsemble",
+		Type: "AssignSubVariableOp",
 		Input: []tf.Input{
-			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+			resource, value,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Transforms a tf.Example proto (as a string) into typed tensors.
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
+
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
 //
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	dense_defaults: A list of Tensors (some may be empty), whose length matches
-// the length of `dense_keys`. dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	num_sparse: The number of sparse features to be parsed from the example. This
-// must match the lengths of `sparse_keys` and `sparse_types`.
-//	sparse_keys: A list of `num_sparse` strings.
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: The keys expected in the Examples' features associated with dense
-// values.
-//	sparse_types: A list of `num_sparse` types; the data types of data in each
-// Feature given in sparse_keys.
-// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: The shapes of data in each Feature given in dense_keys.
-// The length of this list must match the length of `dense_keys`.  The
-// number of elements in the Feature corresponding to dense_key[j] must
-// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
-// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
-// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
-// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
-// D1, .., DN), where M is the number of blocks of elements of length
-// D1 * .... * DN, in the input.
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
-	opspec := tf.OpSpec{
-		Type: "ParseSingleExample",
-		Input: []tf.Input{
-			serialized, tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
 	}
-	op := scope.AddOperation(opspec)
+}
+
+// Restores a tensor from checkpoint files.
+//
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
+//
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
+//
+// See also `RestoreSlice`.
+//
+// Arguments:
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
+//
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
 	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Restore",
+		Input: []tf.Input{
+			file_pattern, tensor_name,
+		},
+		Attrs: attrs,
 	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
 
-// WholeFileReaderV2Container sets the optional container attribute to value.
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["align_corners"] = value
 	}
 }
 
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the entire contents of a file as a value.
+// Input images and output images must be quantized types.
 //
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+//
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22172,169 +22340,305 @@ func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
-
+		Type: "QuantizedResizeBilinear",
+		Input: []tf.Input{
+			images, size, min, max,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Pop the element at the top of the stack.
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
+
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+//
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+	return func(m optionalAttr) {
+		m["adaptative"] = value
+	}
+}
+
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
+//
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
+//
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "SdcaOptimizer",
 		Input: []tf.Input{
-			handle,
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Cosh",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// Computes the mean along sparse segments of a tensor.
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
+
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
 //
-// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["lower"] = value
+	}
+}
+
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
 //
-// Arguments:
+// @compatibility(numpy)
+// Equivalent to scipy.linalg.solve_triangular
+// @end_compatibility
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Solves systems of linear equations with upper or lower triangular matrices by
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// backsubstitution.
 //
-// Returns Has same shape as data, except for dimension 0 which has size
-// `num_segments`.
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
+//
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanWithNumSegments",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			matrix, rhs,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
-type CudnnRNNParamsSizeAttr func(optionalAttr)
-
-// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
+// Saves tensors in V2 checkpoint format.
+//
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
+//
+// Arguments:
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
+	opspec := tf.OpSpec{
+		Type: "SaveV2",
+		Input: []tf.Input{
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
+// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
+type UnicodeTranscodeAttr func(optionalAttr)
 
-// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+// UnicodeTranscodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
 	return func(m optionalAttr) {
-		m["dropout"] = value
+		m["errors"] = value
 	}
 }
 
-// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
+//
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+//
+// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
+// as ' ', will preserve string alignment to the source since invalid bytes will be
+// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
+// replacement character will preserve byte alignment to the source.
+// If not specified, defaults to 65533
+func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["replacement_char"] = value
 	}
 }
 
-// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["replace_control_characters"] = value
 	}
 }
 
-// Computes size of weights that can be used by a Cudnn RNN model.
+// Transcode the input text from a source encoding to a destination encoding.
 //
-// Return the params size that can be used by the Cudnn RNN model. Subsequent
-// weight allocation and initialization should use this size.
+// The input is a string tensor of any shape. The output is a string tensor of
+// the same shape containing the transcoded strings. Output strings are always
+// valid unicode. If the input contains invalid encoding positions, the
+// `errors` attribute sets the policy for how to deal with them. If the default
+// error-handling policy is used, invalid formatting will be substituted in the
+// output by the `replacement_char`. If the errors policy is to `ignore`, any
+// invalid encoding positions in the input are skipped and not included in the
+// output. If it set to `strict` then any invalid formatting will result in an
+// InvalidArgument error.
 //
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   The actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//   dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-// params_size: The size of the params buffer that should be allocated and
-//   initialized for this RNN model. Note that this params buffer may not be
-//   compatible across GPUs. Please use CudnnRNNParamsWeights and
-//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
-//   across different runs.
-func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+// This operation can be used with `output_encoding = input_encoding` to enforce
+// correct formatting for inputs even if they are already in the desired encoding.
+//
+// If the input is prefixed by a Byte Order Mark needed to determine encoding
+// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
+// BOM will be consumed and not emitted into the output. If the input encoding
+// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
+// interpreted as a non-breaking-space and is preserved in the output (including
+// always for UTF-8).
+//
+// The end result is that if the input is marked as an explicit endianness the
+// transcoding is faithful to all codepoints in the source. If it is not marked
+// with an explicit endianness, the BOM is not considered part of the string itself
+// but as metadata, and so is not preserved in the output.
+//
+// Arguments:
+//	input: The text to be processed. Can have any shape.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//	output_encoding: The unicode encoding to use in the output. Must be one of
+// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
+//
+// Returns A string tensor containing unicode text encoded using `output_encoding`.
+func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "S": S}
+	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsSize",
+		Type: "UnicodeTranscode",
 		Input: []tf.Input{
-			num_layers, num_units, input_size,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -22342,196 +22646,248 @@ func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output,
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asinh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
+//
+//
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			start, stop, step,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
+// Stops gradient computation.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// When executed in a graph, this op outputs its input tensor as-is.
 //
-// Arguments:
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "StopGradient",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+// Eagerly executes a python function to compute func(input)->output. The
 //
-// The upper regularized incomplete Gamma function is defined as:
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
+	opspec := tf.OpSpec{
+		Type: "EagerPyFunc",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
+	}
+	return output
+}
+
+// Adds sparse updates to the variable referenced by `resource`.
 //
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+// This operation computes
 //
-// where
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
 //
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
 //
-// is the upper incomplete Gama function.
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
 //
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "ResourceScatterAdd",
 		Input: []tf.Input{
-			a, x,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// Says whether the targets are in the top `K` predictions.
 //
-// N is the size of the segment being reduced.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// More formally, let
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
 //
-// Arguments:
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
+		Type: "InTopK",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			predictions, targets,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// Returns (x - y)(x - y) element-wise.
 //
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "SquaredDifference",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
-
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
 
-// LRNGradBias sets the optional bias attribute to value.
+// RandomGammaSeed sets the optional seed attribute to value.
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["seed"] = value
 	}
 }
 
-// LRNGradAlpha sets the optional alpha attribute to value.
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["seed2"] = value
 	}
 }
 
-// LRNGradBeta sets the optional beta attribute to value.
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22540,9 +22896,9 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
@@ -22550,75 +22906,67 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 	return op.Output(0)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
-
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the "logical or" of elements across dimensions of a tensor.
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
+//
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			input, axis,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
 
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
-//
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
 	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
+		m["tolerance"] = value
 	}
 }
 
-// Deletes the resource specified by the handle.
-//
-// All subsequent operations using the resource will result in a NotFound
-// error status.
-//
-// Arguments:
-//	resource: handle to the resource to delete.
-//
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22627,75 +22975,130 @@ func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyReso
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			resource,
+			x, y,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generates values in an interval.
+// Returns the truth value of x OR y element-wise.
 //
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalOr",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Selects elements from `x` or `y`, depending on `condition`.
+//
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
+//
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
 //
 // For example:
 //
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
 // ```
 //
 // Arguments:
-//	start: 0-D tensor. First entry in the range.
-//	stop: 0-D tensor. Last entry in the range.
-//	num: 0-D tensor. Number of values to generate.
 //
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "Select",
 		Input: []tf.Input{
-			start, stop, num,
+			condition, x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
 
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
+// MatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
+// MatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// The input tensors `real` and `imag` must have the same shape.
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// Multiply the matrix "a" by the matrix "b".
 //
-// For example:
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22704,9 +23107,9 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "MatMul",
 		Input: []tf.Input{
-			real, imag,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -22714,31 +23117,53 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
 
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["a_is_sparse"] = value
+	}
+}
+
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
 //
-// For example:
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
+// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
+// "b" is sparse, in the sense that they have a large proportion of zero values.
+// The breakeven for using this versus a dense matrix multiply on one platform was
+// 30% zero values in the sparse matrix.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22747,9 +23172,9 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "SparseMatMul",
 		Input: []tf.Input{
-			input,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -22757,107 +23182,146 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
+type ExperimentalThreadPoolHandleAttr func(optionalAttr)
+
+// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMax",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
+// value: The maximum degree of parallelism to use within operations that execute on this
+// threadpool.
+// If not specified, defaults to 1
+func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["max_intra_op_parallelism"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Tanh",
-		Input: []tf.Input{
-			x,
-		},
+}
+
+// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
 //
 // Arguments:
+//	num_threads: The number of threads in the thread pool.
+//	display_name: A human-readable name for the threads that may be visible in some
+// visualizations.
+// threadpool.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
-//
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
+// ops.
+func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
-		Input: []tf.Input{
-			input_dataset, count,
-		},
+		Type: "ExperimentalThreadPoolHandle",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
+type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// CudnnRNNCanonicalToParamsRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNCanonicalToParamsRnnMode(value string) CudnnRNNCanonicalToParamsAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// Returns the real part of a complex number.
+// CudnnRNNCanonicalToParamsInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNCanonicalToParamsInputMode(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNCanonicalToParamsDirection(value string) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsDropout(value float32) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNCanonicalToParamsSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNCanonicalToParamsSeed2(value int64) CudnnRNNCanonicalToParamsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Converts CudnnRNN params from canonical form to usable form.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// Writes a set of weights into the opaque params buffer so they can be used in
+// upcoming training or inferences.
 //
-// For example:
+// Note that the params buffer may not be compatible across different GPUs. So any
+// save and restoration should be converted to and from the canonical weights and
+// biases.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// weights: the canonical form of weights that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// biases: the canonical form of biases that can be used for saving
+//     and restoration. They are more likely to be compatible across different
+//     generations.
+// num_params: number of parameter sets for all layers.
+//     Each layer may contain multiple parameter sets, with each set consisting of
+//     a weight matrix and a bias vector.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     The actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//     dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+func CudnnRNNCanonicalToParams(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, weights []tf.Output, biases []tf.Output, optional ...CudnnRNNCanonicalToParamsAttr) (params tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22866,9 +23330,9 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "CudnnRNNCanonicalToParams",
 		Input: []tf.Input{
-			input,
+			num_layers, num_units, input_size, tf.OutputList(weights), tf.OutputList(biases),
 		},
 		Attrs: attrs,
 	}
@@ -22876,53 +23340,16 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// The range of pixel values for the output image might be slightly different
-// from the range for the input image because of limited numerical precision.
-// To guarantee an output range, for example `[0.0, 1.0]`, apply
-// `tf.clip_by_value` to the output.
-//
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+// Creates a dataset containing elements of first component of `input_dataset` having true in the last component.
+func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "FilterByLastComponentDataset",
 		Input: []tf.Input{
-			images, size,
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -22930,94 +23357,99 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 	return op.Output(0)
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
 
-// VarHandleOpContainer sets the optional container attribute to value.
+// SumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: the container this variable is placed in.
-// If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
+// Computes the sum of elements across dimensions of a tensor.
 //
-// value: the name by which this variable is referred to.
-// If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a handle to a Variable resource.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
+		Type: "Sum",
+		Input: []tf.Input{
+			input, axis,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
+// EnterAttr is an optional argument to Enter.
+type EnterAttr func(optionalAttr)
 
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
+// EnterIsConstant sets the optional is_constant attribute to value.
+//
+// value: If true, the output is constant within the child frame.
+// If not specified, defaults to false
+func EnterIsConstant(value bool) EnterAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["is_constant"] = value
 	}
 }
 
-// Returns the argument of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
+// EnterParallelIterations sets the optional parallel_iterations attribute to value.
 //
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
+// value: The number of iterations allowed to run in parallel.
+// If not specified, defaults to 10
+func EnterParallelIterations(value int64) EnterAttr {
+	return func(m optionalAttr) {
+		m["parallel_iterations"] = value
+	}
+}
+
+// Creates or finds a child frame, and makes `data` available to the child frame.
 //
-// For example:
+// This op is used together with `Exit` to create loops in the graph.
+// The unique `frame_name` is used by the `Executor` to identify frames. If
+// `is_constant` is true, `output` is a constant in the child frame; otherwise
+// it may be changed in the child frame. At most `parallel_iterations` iterations
+// are run in parallel in the child frame.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
-// ```
+// Arguments:
+//	data: The tensor to be made available to the child frame.
+//	frame_name: The name of the child frame.
 //
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+// Returns The same tensor as `data`.
+func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"frame_name": frame_name}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Angle",
+		Type: "Enter",
 		Input: []tf.Input{
-			input,
+			data,
 		},
 		Attrs: attrs,
 	}
@@ -23025,208 +23457,237 @@ func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Outp
 	return op.Output(0)
 }
 
-// Clips tensor values to a specified min and max.
-//
-// Given a tensor `t`, this operation returns a tensor of the same type and
-// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
-// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
-// greater than `clip_value_max` are set to `clip_value_max`.
+// Add all input tensors element wise.
 //
 // Arguments:
-//	t: A `Tensor`.
-//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The minimum value to clip by.
-//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The maximum value to clip by.
-//
-// Returns A clipped `Tensor` with the same shape as input 't'.
-func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ClipByValue",
+		Type: "AddN",
 		Input: []tf.Input{
-			t, clip_value_min, clip_value_max,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
+// TryRpcAttr is an optional argument to TryRpc.
+type TryRpcAttr func(optionalAttr)
+
+// TryRpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func TryRpcProtocol(value string) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// TryRpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func TryRpcFailFast(value bool) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
 //
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
 //
-// Values in `arr` outside of the range [0, size) are ignored.
+// Unlike the standard `Rpc` op, if the connection fails or the remote worker
+// returns an error status, this op does **not** reraise the exception.
+// Instead, the `status_code` and `status_message` entry for the corresponding RPC
+// call is set with the error returned from the RPC call.  The `response` tensor
+// will contain valid response values for those minibatch entries whose RPCs did
+// not fail; the rest of the entries will have empty strings.
 //
 // Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
+// returned from the RPC calls.
+func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "TryRpc",
 		Input: []tf.Input{
-			arr, size, weights,
+			address, method, request,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
 
-// CumsumExclusive sets the optional exclusive attribute to value.
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
 //
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["vocab_size"] = value
 	}
 }
 
-// CumsumReverse sets the optional reverse attribute to value.
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["delimiter"] = value
 	}
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
+// Initializes a table from a text file.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
+//
+// Returns the created operation.
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			x, axis,
+			table_handle, filename,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
-
-// CumprodExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
 
-// CumprodReverse sets the optional reverse attribute to value.
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
-// value: A `bool` (default: False).
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
+func MeanKeepDims(value bool) MeanAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
+// Computes the mean of elements across dimensions of a tensor.
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23235,9 +23696,9 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "Mean",
 		Input: []tf.Input{
-			x, axis,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -23245,65 +23706,77 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 	return op.Output(0)
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
-
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
 
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// ProdKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If true, `a` is transposed before multiplication.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+func ProdKeepDims(value bool) ProdAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// Computes the product of elements across dimensions of a tensor.
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Prod",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["Tactivation"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+// Resize `images` to `size` using bilinear interpolation.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23312,62 +23785,54 @@ func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, ma
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
+			images, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Does nothing. Serves as a control trigger for scheduling.
-//
-// Only useful as a placeholder for control edges.
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
+
+// MaxKeepDims sets the optional keep_dims attribute to value.
 //
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// Computes the maximum of elements across dimensions of a tensor.
 //
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "Max",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -23375,18 +23840,16 @@ func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayReadV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Creates a dataset that contains the unique elements of `input_dataset`.
+func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
+		Type: "ExperimentalUniqueDataset",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -23394,32 +23857,27 @@ func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in
 	return op.Output(0)
 }
 
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
 
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["output_type"] = value
 	}
 }
 
-// Returns x * y element-wise, working on quantized buffers.
-//
-// Arguments:
-//
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// Arguments:
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23428,352 +23886,447 @@ func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "ArgMin",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
-
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+//
+// output range specified with 'requested_output_min' and 'requested_output_max'.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "Requantize",
+		Input: []tf.Input{
+			input, input_min, input_max, requested_output_min, requested_output_max,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns x + y element-wise, working on quantized buffers.
+// Creates a dataset that emits the lines of one or more text files.
 //
 // Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TextLineDataset",
+		Input: []tf.Input{
+			filenames, compression_type, buffer_size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along segments of a tensor.
 //
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
+// Computes the mean along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
+//
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
 //
 // Arguments:
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Rolls the elements of a tensor along an axis.
-//
-// The elements are shifted positively (towards larger indices) by the offset of
-// `shift` along the dimension of `axis`. Negative `shift` values will shift
-// elements in the opposite direction. Elements that roll passed the last position
-// will wrap around to the first and vice versa. Multiple shifts along multiple
-// axes may be specified.
+// Computes the minimum along segments of a tensor.
 //
-// For example:
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-// ```
-// # 't' is [0, 1, 2, 3, 4]
-// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// # shifting along multiple dimensions
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// # shifting along the same axis multiple times
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
-// ```
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
 //
 // Arguments:
 //
-//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
-// elements are shifted positively (towards larger indices) along the dimension
-// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
-// direction.
-//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
-// `shift[i]` should occur. If the same axis is referenced more than once, the
-// total shift for that axis will be the sum of all the shifts that belong to that
-// axis.
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// Returns Has the same shape and size as the input. The elements are shifted
-// positively (towards larger indices) by the offsets of `shift` along the
-// dimensions of `axis`.
-func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Roll",
+		Type: "SegmentMin",
 		Input: []tf.Input{
-			input, shift, axis,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MapPeekAttr is an optional argument to MapPeek.
-type MapPeekAttr func(optionalAttr)
-
-// MapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Removes keys and its associated values from a table.
 //
-// REQUIRES: value >= 0
-func MapPeekCapacity(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// The tensor `keys` must of the same type as the keys of the table. Keys not
+// already in the table are silently ignored.
 //
-// REQUIRES: value >= 0
-func MapPeekMemoryLimit(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapPeekContainer(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapPeekSharedName(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op peeks at the values at the specified key.  If the
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys of the elements to remove.
 //
-// underlying container does not contain this key
-// this op will block until it does.
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
+// Returns the created operation.
+func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MapPeek",
+		Type: "LookupTableRemoveV2",
 		Input: []tf.Input{
-			key, indices,
+			table_handle, keys,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapPeek", err)
-		return
-	}
-	return values
+	return scope.AddOperation(opspec)
 }
 
-// Looks up keys in a table, outputs the corresponding values.
+// Computes the sum along segments of a tensor.
 //
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
+// Computes a tensor such that
+// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
+//
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
 //
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
 //
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			table_handle, keys, default_value,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Updates the table to associates keys with values.
+// Computes the product along segments of a tensor.
 //
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the product of all
+// entries belonging to a segment such that:
+//
+// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
+// `j...` such that `segment_ids[j...] == i`.
+//
+// If there is no entry for a given segment ID `i`, it outputs 1.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
 //
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
+		Type: "UnsortedSegmentProd",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			data, segment_ids, num_segments,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+// Computes the mean along sparse segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // Arguments:
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
-func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "PaddedBatchDatasetV2",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise smallest integer not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+// Deserializes a serialized tree ensemble config and replaces current tree
+//
+// ensemble.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	stamp_token: Token to use as the new value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the ensemble.
+//
+// Returns the created operation.
+func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Ceil",
+		Type: "BoostedTreesDeserializeEnsemble",
 		Input: []tf.Input{
-			x,
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the number of elements in the given table.
+// Transforms a tf.Example proto (as a string) into typed tensors.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, .., DN), where M is the number of blocks of elements of length
+// D1 * .... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
+		Type: "ParseSingleExample",
 		Input: []tf.Input{
-			table_handle,
+			serialized, tf.OutputList(dense_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// WholeFileReaderV2Container sets the optional container attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["container"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the entire contents of a file as a value.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
+//
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23782,408 +24335,411 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
-		Input: []tf.Input{
-			grads, original_image,
-		},
+		Type: "WholeFileReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs all keys and values in the table.
+// Pop the element at the top of the stack.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//
-//
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
 //
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "StackPopV2",
 		Input: []tf.Input{
-			table_handle,
+			handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Replaces the contents of the table with the specified keys and values.
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cosh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the mean along sparse segments of a tensor.
 //
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
 //
-// Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "SparseSegmentMeanWithNumSegments",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			data, indices, segment_ids, num_segments,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
 
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
 // If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["dropout"] = value
 	}
 }
 
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["seed"] = value
 	}
 }
 
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed2"] = value
 	}
 }
 
-// Op removes and returns a random (key, value)
+// Computes size of weights that can be used by a Cudnn RNN model.
 //
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
+//
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"T": T, "S": S}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
+		Type: "CudnnRNNParamsSize",
 		Input: []tf.Input{
-			indices,
+			num_layers, num_units, input_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
-		return
-	}
-	return key, values
+	return op.Output(0)
 }
 
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
-
-// HashTableV2Container sets the optional container attribute to value.
+// Computes gradients for SparseSegmentMean.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// HashTableV2SharedName sets the optional shared_name attribute to value.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMeanGrad",
+		Input: []tf.Input{
+			grad, indices, segment_ids, output_dim0,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a non-initialized hash table.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
+// N is the size of the segment being reduced.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
 //
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
-		Attrs: attrs,
+		Type: "SparseSegmentSqrtN",
+		Input: []tf.Input{
+			data, indices, segment_ids,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
-type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
-
-// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
-// value: The type list for the return values.
-// If not specified, defaults to <>
+// The upper regularized incomplete Gamma function is defined as:
 //
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_types"] = value
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+//
+// where
+//
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Igammac",
+		Input: []tf.Input{
+			a, x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// value: The list of shapes being produced.
-// If not specified, defaults to <>
+// N is the size of the segment being reduced.
 //
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_shapes"] = value
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSqrtNWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generates a MultiDeviceIterator resource from its provided string handle.
+// Computes gradients for SparseSegmentSqrtN.
 //
-// Arguments:
-//	string_handle: String representing the resource.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-// Returns A MultiDeviceIterator resource.
-func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorFromStringHandle",
+		Type: "SparseSegmentSqrtNGrad",
 		Input: []tf.Input{
-			string_handle,
+			grad, indices, segment_ids, output_dim0,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
-type MutableHashTableV2Attr func(optionalAttr)
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
 
-// MutableHashTableV2Container sets the optional container attribute to value.
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["bias"] = value
 	}
 }
 
-// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// LRNGradAlpha sets the optional alpha attribute to value.
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["alpha"] = value
 	}
 }
 
-// Creates an empty hash table.
+// LRNGradBeta sets the optional beta attribute to value.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns Handle to a table.
-func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableV2",
-
+		Type: "LRNGrad",
+		Input: []tf.Input{
+			input_grads, input_image, output_image,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
 
-// DequantizeMode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
+// AnyKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// if T == qint8, in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```c++
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-// ```
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
-// ```
+// Computes the "logical or" of elements across dimensions of a tensor.
 //
-// Now we can dequantize the elements of our tensor:
-// ```c++
-// result = input * s
-// ```
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24192,9 +24748,9 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "Any",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -24202,133 +24758,150 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 	return op.Output(0)
 }
 
-// Flips all bits elementwise.
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
+
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Invert",
-		Input: []tf.Input{
-			x,
-		},
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+	return func(m optionalAttr) {
+		m["ignore_lookup_error"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse 3D fast Fourier transform.
+// Deletes the resource specified by the handle.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//	resource: handle to the resource to delete.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
-// @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT3D",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			input,
+			resource,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// Generates values in an interval.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
+//
+// Arguments:
+//	start: 0-D tensor. First entry in the range.
+//	stop: 0-D tensor. Last entry in the range.
+//	num: 0-D tensor. Number of values to generate.
+//
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			start, stop, num,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
 //
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// Returns the created operation.
-func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// The input tensors `real` and `imag` must have the same shape.
+//
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableV2",
+		Type: "Complex",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			real, imag,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
-
-// PrintMessage sets the optional message attribute to value.
-//
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PrintFirstN sets the optional first_n attribute to value.
-//
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["first_n"] = value
-	}
-}
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
 
-// PrintSummarize sets the optional summarize attribute to value.
-//
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["Tout"] = value
 	}
 }
 
-// Prints a list of tensors.
+// Returns the imaginary part of a complex number.
 //
-// Passes `input` through to `output` and prints `data` when evaluating.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
+// For example:
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24337,9 +24910,9 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Print",
+		Type: "Imag",
 		Input: []tf.Input{
-			input, tf.OutputList(data),
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -24347,44 +24920,75 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the maximum along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
 //
 // Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
 // Arguments:
 //
-//	buffer_size: The maximum number of elements to buffer in an iterator over
-// this dataset.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
 //
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "PrefetchDataset",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			input_dataset, buffer_size,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -24392,48 +24996,31 @@ func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Outpu
 	return op.Output(0)
 }
 
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
-
-// TensorSummaryDescription sets the optional description attribute to value.
-//
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["description"] = value
-	}
-}
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
 
-// TensorSummaryLabels sets the optional labels attribute to value.
-//
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["labels"] = value
+		m["Tout"] = value
 	}
 }
 
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
+// Returns the real part of a complex number.
 //
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["display_name"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with a tensor.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
+// For example:
 //
-// Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24442,9 +25029,9 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
+		Type: "Real",
 		Input: []tf.Input{
-			tensor,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -24452,24 +25039,26 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 	return op.Output(0)
 }
 
-// Read an element from the TensorArray into output `value`.
+// Sends `input` to all devices that are connected to the output.
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
+// Sends `input` to all devices that are connected to the output.
 //
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+// The graph should be constructed so that all ops connected to the output have a
+// valid device assignment, and the op itself is assigned one of these devices.
 //
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// input: The input to the broadcast.
+// output: The same as input.
+// shape: The shape of the input tensor.
+//
+func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
+		Type: "NcclBroadcast",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -24477,258 +25066,278 @@ func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in
 	return op.Output(0)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
+
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = max(ref[indices, ...], updates[...])
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+// Resize `images` to `size` using area interpolation.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
+// Input images can be of different types but output images are always float.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns the created operation.
-func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMax",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			resource, indices, updates,
+			images, size,
 		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
+
+// VarHandleOpContainer sets the optional container attribute to value.
+//
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a Variable resource.
 //
 // Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
-//
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
-		Input: []tf.Input{
-			tags, values,
-		},
+		Type: "VarHandleOp",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
+
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the argument of a complex number.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
 //
-// Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+// For example:
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
+//
+// @compatibility(numpy)
+// Equivalent to np.angle.
+// @end_compatibility
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "Angle",
 		Input: []tf.Input{
-			tag, values,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given queue.
+// Clips tensor values to a specified min and max.
+//
+// Given a tensor `t`, this operation returns a tensor of the same type and
+// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+// greater than `clip_value_max` are set to `clip_value_max`.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	t: A `Tensor`.
+//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The minimum value to clip by.
+//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The maximum value to clip by.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+// Returns A clipped `Tensor` with the same shape as input 't'.
+func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "ClipByValue",
 		Input: []tf.Input{
-			handle,
+			t, clip_value_min, clip_value_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
-
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
-
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
-//
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["bad_color"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// Counts the number of occurrences of each value in an integer array.
 //
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// Values in `arr` outside of the range [0, size) are ignored.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "Bincount",
 		Input: []tf.Input{
-			tag, tensor,
+			arr, size, weights,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
 
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["reverse"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
 //
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24737,9 +25346,9 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			tag, tensor, sample_rate,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
@@ -24747,105 +25356,68 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 	return op.Output(0)
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
-
-// AvgPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs average pooling on the input.
-//
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
-//
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
+// Return the shape of s0 op s1 with broadcast.
 //
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
+		Type: "BroadcastArgs",
 		Input: []tf.Input{
-			value,
+			s0, s1,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Merges summaries.
-//
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
-//
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
-//
-// Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "MergeSummary",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
+}
+
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+//
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
+// Returns the dimension index in the destination data format given the one in
+//
+// the source data format.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
 //
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -24853,55 +25425,67 @@ func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, o
 	return op.Output(0)
 }
 
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
 
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["exclusive"] = value
 	}
 }
 
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// CumprodReverse sets the optional reverse attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["reverse"] = value
 	}
 }
 
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
 //
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+// ```
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24910,9 +25494,9 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
+		Type: "Cumprod",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
@@ -24920,81 +25504,129 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
-// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
+
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// tensor: The tensor to put on the list.
-// input_handle: The old list.
-// output_handle: A list with the elements of the old list followed by tensor.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TensorListPushBack",
-		Input: []tf.Input{
-			input_handle, tensor,
-		},
+}
+
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+//
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+//
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the number of tensors in the input tensor list.
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
 //
-// input_handle: the input list
-// length: the number of tensors in the list
-func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
+//
+// Arguments:
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListLength",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			input_handle,
+			a, b, min_a, max_a, min_b, max_b,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// The shape of the elements of the given list, as a tensor.
+// Does nothing. Serves as a control trigger for scheduling.
 //
-//   input_handle: the list
-//   element_shape: the shape of elements of the list
-func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
+// Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "TensorListElementShape",
-		Input: []tf.Input{
-			input_handle,
-		},
-		Attrs: attrs,
+		Type: "ControlTrigger",
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the item in the list with the given index.
+// Batch normalization.
 //
-// input_handle: the list
-// index: the position in the list from which an element will be retrieved
-// item: the element at that position
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
 //
-func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "TensorListGetItem",
+		Type: "BatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			input_handle, index,
+			t, m, v, beta, gamma,
 		},
 		Attrs: attrs,
 	}
@@ -25002,83 +25634,51 @@ func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, el
 	return op.Output(0)
 }
 
-// Returns a diagonal tensor with a given diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
-//
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
-//
-// For example:
-//
-// ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
-// ```
+// Deprecated. Use TensorArrayReadV3
 //
-// Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Diag",
+		Type: "TensorArrayReadV2",
 		Input: []tf.Input{
-			diagonal,
+			handle, index, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
 
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["Toutput"] = value
 	}
 }
 
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// Returns x * y element-wise, working on quantized buffers.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a normal distribution. The parameters may each be a
+// Arguments:
 //
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
 //
-// Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
 //
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25087,265 +25687,214 @@ func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Sets the index-th position of the list to contain the given tensor.
-//
-// input_handle: the list
-// index: the position in the list to which the tensor will be assigned
-// item: the element to be assigned to that position
-// output_handle: the new list, with the element in the proper position
-//
-func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListSetItem",
-		Input: []tf.Input{
-			input_handle, index, item,
-		},
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
+
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a Tensor by indexing into the TensorList.
+// Returns x + y element-wise, working on quantized buffers.
 //
-// Each row in the produced Tensor corresponds to the element in the TensorList
-// specified by the given index (see `tf.gather`).
+// Arguments:
 //
-// input_handle: The input tensor list.
-// indices: The indices used to index into the list.
-// values: The tensor.
-func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_dtype tf.DataType) (values tf.Output) {
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListGather",
+		Type: "QuantizedAdd",
 		Input: []tf.Input{
-			input_handle, indices,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a TensorList by indexing into a Tensor.
+// Given a quantized tensor described by (input, input_min, input_max), outputs a
 //
-// Each member of the TensorList corresponds to one row of the input tensor,
-// specified by the given index (see `tf.gather`).
+// range that covers the actual values present in that tensor.  This op is
+// typically used to produce the requested_output_min and requested_output_max for
+// Requantize.
 //
-// tensor: The input tensor.
-// indices: The indices used to index into the list.
-// element_shape: The shape of the elements in the list (can be less specified than
-//   the shape of the tensor).
-// output_handle: The TensorList.
-func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListScatter",
-		Input: []tf.Input{
-			tensor, indices, element_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated, use python implementation tf.linalg.matrix_exponential.
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
 //
-// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
+		Type: "RequantizationRange",
 		Input: []tf.Input{
-			input,
+			input, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
-type QueueDequeueUpToV2Attr func(optionalAttr)
-
-// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
+// Rolls the elements of a tensor along an axis.
 //
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues `n` tuples of one or more tensors from the given queue.
+// The elements are shifted positively (towards larger indices) by the offset of
+// `shift` along the dimension of `axis`. Negative `shift` values will shift
+// elements in the opposite direction. Elements that roll passed the last position
+// will wrap around to the first and vice versa. Multiple shifts along multiple
+// axes may be specified.
 //
-// This operation is not supported by all queues.  If a queue does not support
-// DequeueUpTo, then an Unimplemented error is returned.
+// For example:
 //
-// If the queue is closed and there are more than 0 but less than `n`
-// elements remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If
-// the queue is closed and there are 0 elements left in the queue, then
-// an OutOfRange error is returned just like in QueueDequeueMany.
-// Otherwise the behavior is identical to QueueDequeueMany:
+// ```
+// # 't' is [0, 1, 2, 3, 4]
+// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
 //
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
+// # shifting along multiple dimensions
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// # shifting along the same axis multiple times
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+// ```
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
+//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+// elements are shifted positively (towards larger indices) along the dimension
+// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+// direction.
+//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+// `shift[i]` should occur. If the same axis is referenced more than once, the
+// total shift for that axis will be the sum of all the shifts that belong to that
+// axis.
+//
+// Returns Has the same shape and size as the input. The elements are shifted
+// positively (towards larger indices) by the offsets of `shift` along the
+// dimensions of `axis`.
+func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueUpToV2",
+		Type: "Roll",
 		Input: []tf.Input{
-			handle, n,
+			input, shift, axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueUpToV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
 //
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
 //
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cholesky",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			input,
+			table_handle, keys, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes contents to the file at input filename. Creates file and recursively
+// Updates the table to associates keys with values.
 //
-// creates directory if not existing.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
 // Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
 //
 // Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "WriteFile",
+		Type: "LookupTableInsertV2",
 		Input: []tf.Input{
-			filename, contents,
+			table_handle, keys, values,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
-
-// AllKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the "logical and" of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
 //
-// Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
+//
+func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "PaddedBatchDatasetV2",
 		Input: []tf.Input{
-			input, axis,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
 		},
 		Attrs: attrs,
 	}
@@ -25353,101 +25902,101 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 	return op.Output(0)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
-//
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
-//
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
-// are sorted in non-decreasing order.
+// Returns element-wise smallest integer not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Ceil",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the number of elements in the given table.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	table_handle: Handle to the table.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "LookupTableSizeV2",
 		Input: []tf.Input{
-			input,
+			table_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softplus gradients for a softplus operation.
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
+
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of bilinear interpolation.
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
+		Type: "ResizeBilinearGrad",
 		Input: []tf.Input{
-			gradients, features,
+			grads, original_image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
-
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
-//
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
-	return func(m optionalAttr) {
-		m["compute_v"] = value
-	}
-}
-
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
+// Outputs all keys and values in the table.
 //
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
-// are sorted in non-decreasing order.
+// Arguments:
+//	table_handle: Handle to the table.
 //
-// ```python
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
 //
-// Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
 //
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
+		Type: "LookupTableExportV2",
 		Input: []tf.Input{
-			input,
+			table_handle,
 		},
 		Attrs: attrs,
 	}
@@ -25455,63 +26004,64 @@ func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV
 	return op.Output(0), op.Output(1)
 }
 
-// Adjust the saturation of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// Replaces the contents of the table with the specified keys and values.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
 //
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
+		Type: "LookupTableImportV2",
 		Input: []tf.Input{
-			images, scale,
+			table_handle, keys, values,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
+// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
+type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
 
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+// value: The type list for the return values.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["output_types"] = value
 	}
 }
 
-// Solves systems of linear equations.
+// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// value: The list of shapes being produced.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
+	return func(m optionalAttr) {
+		m["output_shapes"] = value
+	}
+}
+
+// Generates a MultiDeviceIterator resource from its provided string handle.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	string_handle: String representing the resource.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// Returns A MultiDeviceIterator resource.
+func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25520,9 +26070,9 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "MultiDeviceIteratorFromStringHandle",
 		Input: []tf.Input{
-			matrix, rhs,
+			string_handle,
 		},
 		Attrs: attrs,
 	}
@@ -25530,170 +26080,162 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 	return op.Output(0)
 }
 
-// Returns a serialized GraphDef representing `input_dataset`.
-//
-// Returns a graph representation for `input_dataset`.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
+
+// MutableHashTableV2Container sets the optional container attribute to value.
 //
-// Returns The graph representation of the dataset (as serialized GraphDef).
-func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DatasetToGraph",
-		Input: []tf.Input{
-			input_dataset,
-		},
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["compute_uv"] = value
+		m["shared_name"] = value
 	}
 }
 
-// SvdFullMatrices sets the optional full_matrices attribute to value.
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
 // If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Computes the singular value decompositions of one or more matrices.
-//
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+// Creates an empty hash table.
 //
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Svd",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "MutableHashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// PrintV2Attr is an optional argument to PrintV2.
-type PrintV2Attr func(optionalAttr)
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
 
-// PrintV2OutputStream sets the optional output_stream attribute to value.
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
+	return func(m optionalAttr) {
+		m["mode"] = value
+	}
+}
+
+// Dequantize the 'input' tensor into a float Tensor.
+//
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
+//
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// if T == qint8: in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// ```
 //
-// value: A string specifying the output stream or logging level to print to.
-// If not specified, defaults to "stderr"
-func PrintV2OutputStream(value string) PrintV2Attr {
-	return func(m optionalAttr) {
-		m["output_stream"] = value
-	}
-}
-
-// Prints a string scalar.
+// *SCALED mode Example*
 //
-// Prints a string scalar to the desired output_stream.
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
 //
-// Arguments:
-//	input: The string scalar to print.
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
 //
-// Returns the created operation.
-func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "PrintV2",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
-
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
 //
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Enqueues zero or more tuples of one or more tensors in the given queue.
+// Our input tensor range is then `[-m, m]`.
 //
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
 //
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
 //
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
+// ```
+//
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
+// ```
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
 //
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25702,551 +26244,543 @@ func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "Dequantize",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Flips all bits elementwise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "Invert",
 		Input: []tf.Input{
-			data, segment_ids,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts one or more images from RGB to HSV.
-//
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Inverse 3D fast Fourier transform.
 //
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
 //
 // Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//	input: A complex64 tensor.
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "IFFT3D",
 		Input: []tf.Input{
-			images,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
-
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
-//
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
-	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
-	}
-}
-
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
-//
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
-//
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
-//
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
-//
-// Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+// Deprecated. Disallowed in GraphDef version >= 2.
 //
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "AdjustContrast",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			images, contrast_factor, min_value, max_value,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
-//
-// Elements of the `shapes_and_slices` input must either be:
-//
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
-//
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
-//
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
-//
-// See also `Save`.
+// Table initializer that takes two tensors for keys and values respectively.
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
 //
 // Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SaveSlices",
+		Type: "InitializeTableV2",
 		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+			table_handle, keys, values,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
 
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+// PrintMessage sets the optional message attribute to value.
+//
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["message"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// PrintFirstN sets the optional first_n attribute to value.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["first_n"] = value
+	}
+}
+
+// PrintSummarize sets the optional summarize attribute to value.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Prints a list of tensors.
 //
-// Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+// Passes `input` through to `output` and prints `data` when evaluating.
 //
+// Arguments:
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "Print",
 		Input: []tf.Input{
-			set1, set2,
+			input, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
+// Arguments:
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
+		Type: "TensorSummaryV2",
 		Input: []tf.Input{
-			basename, shard, num_shards,
+			tag, tensor, serialized_summary_metadata,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchToSpace for N-D tensors of type T.
-//
-// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
-// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-// the input.  The spatial dimensions of this intermediate result are then
-// optionally cropped according to `crops` to produce the output.  This is the
-// reverse of SpaceToBatch.  See below for a precise description.
+// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
 // Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has M dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-//   required that
-//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Reshape `input` to `reshaped` of shape:
-//      [block_shape[0], ..., block_shape[M-1],
-//       batch / prod(block_shape),
-//       input_shape[1], ..., input_shape[N-1]]
-//
-// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1], block_shape[0],
-//       ...,
-//       input_shape[M], block_shape[M-1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0],
-//       ...,
-//       input_shape[M] * block_shape[M-1],
-//
-//       input_shape[M+1],
-//       ...,
-//       input_shape[N-1]]
-//
-// 4. Crop the start and end of dimensions `[1, ..., M]` of
-//    `reshaped_permuted` according to `crops` to produce the output of shape:
-//      [batch / prod(block_shape),
 //
-//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-//       ...,
-//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
+//	buffer_size: The maximum number of elements to buffer in an iterator over
+// this dataset.
 //
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
 //
-// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "PrefetchDataset",
+		Input: []tf.Input{
+			input_dataset, buffer_size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
+
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["description"] = value
+	}
+}
+
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// The output tensor has shape `[1, 4, 4, 1]` and value:
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["labels"] = value
+	}
+}
+
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["display_name"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with a tensor.
 //
-// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [2, 0]]`:
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
 //
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
+// Arguments:
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorSummary",
+		Input: []tf.Input{
+			tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Read an element from the TensorArray into output `value`.
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Arguments:
+//	handle: The handle to a TensorArray.
 //
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpaceND",
+		Type: "TensorArrayReadV3",
 		Input: []tf.Input{
-			input, block_shape, crops,
+			handle, index, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
-
-// UnpackAxis sets the optional axis attribute to value.
+// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
 //
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
-	}
-}
-
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
+// This operation computes
 //
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
+//     # Scalar indices
+//     ref[indices, ...] = max(ref[indices, ...], updates[...])
 //
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
 //
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
 //
-// This is the opposite of `pack`.
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
 //
-// Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "ResourceScatterMax",
 		Input: []tf.Input{
-			value,
+			resource, indices, updates,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
+	return scope.AddOperation(opspec)
+}
+
+// Computes the gradient for the tanh of `x` wrt its input.
+//
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "TanhGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
-	return output
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
+// Arguments:
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
 //
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			resource,
+			tags, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the stack from its resource container.
+// Outputs a `Summary` protocol buffer with a histogram.
+//
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
+//
+// This op reports an `InvalidArgument` error if any value is not finite.
 //
 // Arguments:
-//	handle: The handle to a stack.
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
 //
-// Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			handle,
+			tag, values,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+// Computes the number of elements in the given queue.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			basename, num_shards,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
 
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
 //
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
+		m["max_images"] = value
 	}
 }
 
-// TextLineReaderV2Container sets the optional container attribute to value.
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["bad_color"] = value
 	}
 }
 
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+// Outputs a `Summary` protocol buffer with images.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
+//
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
+//
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ImageSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
+
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// A Reader that outputs the lines of a file delimited by '\n'.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26255,97 +26789,57 @@ func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
+		Type: "AudioSummaryV2",
+		Input: []tf.Input{
+			tag, tensor, sample_rate,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
 
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
-// If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolDataFormat(value string) AvgPoolAttr {
 	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
+		m["data_format"] = value
 	}
 }
 
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
-//
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
-//
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
-//
-// The remappings are 1-D tensors with the following properties:
-//
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
-//
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
-//
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
-//
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
+// Performs average pooling on the input.
 //
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
+		Type: "AvgPool",
 		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -26353,172 +26847,155 @@ func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Ou
 	return op.Output(0)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
-
-// TFRecordReaderV2Container sets the optional container attribute to value.
+// Merges summaries.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
+//
+// Arguments:
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
+	opspec := tf.OpSpec{
+		Type: "MergeSummary",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// A Reader that outputs the records from a TensorFlow Records file.
+// The shape of the elements of the given list, as a tensor.
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
+//   input_handle: the list
+//   element_shape: the shape of elements of the list
+func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
+		Type: "TensorListElementShape",
+		Input: []tf.Input{
+			input_handle,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
+// Returns the item in the list with the given index.
+//
+// input_handle: the list
+// index: the position in the list from which an element will be retrieved
+// item: the element at that position
+//
+//
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListGetItem",
+		Input: []tf.Input{
+			input_handle, index,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Quantizes then dequantizes a tensor.
+// Returns a diagonal tensor with a given diagonal values.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
+//
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
+//
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
+// ```
+//
+// Arguments:
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
+		Type: "Diag",
 		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
+			diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
 
-// IdentityReaderV2Container sets the optional container attribute to value.
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["seed"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the queued work as both the key and value.
-//
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
-//
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
-		Attrs: attrs,
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
-
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+// Outputs random values from a normal distribution. The parameters may each be a
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
 //
-// Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26527,357 +27004,295 @@ func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "ParameterizedTruncatedNormal",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			shape, means, stdevs, minvals, maxvals,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
+// Sets the index-th position of the list to contain the given tensor.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "TensorListSetItem",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
+			input_handle, index, item,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
+// Creates a Tensor by indexing into the TensorList.
 //
-// Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+// Each row in the produced Tensor corresponds to the element in the TensorList
+// specified by the given index (see `tf.gather`).
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+// input_handle: The input tensor list.
+// indices: The indices used to index into the list.
+// values: The tensor.
+func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_dtype tf.DataType) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
+		Type: "TensorListGather",
 		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
+			input_handle, indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// BatchAttr is an optional argument to Batch.
-type BatchAttr func(optionalAttr)
-
-// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
-// If not specified, defaults to 10
-func BatchMaxEnqueuedBatches(value int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["max_enqueued_batches"] = value
-	}
-}
-
-// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
-// If not specified, defaults to <>
-func BatchAllowedBatchSizes(value []int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["allowed_batch_sizes"] = value
-	}
-}
-
-// BatchContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func BatchContainer(value string) BatchAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// BatchSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func BatchSharedName(value string) BatchAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// BatchBatchingQueue sets the optional batching_queue attribute to value.
-// If not specified, defaults to ""
-func BatchBatchingQueue(value string) BatchAttr {
-	return func(m optionalAttr) {
-		m["batching_queue"] = value
-	}
+	return op.Output(0)
 }
 
-// Batches all input tensors nondeterministically.
-//
-// When many instances of this Op are being run concurrently with the same
-// container/shared_name in the same device, some will output zero-shaped Tensors
-// and others will output Tensors of size up to max_batch_size.
-//
-// All Tensors in in_tensors are batched together (so, for example, labels and
-// features should be batched with a single instance of this operation.
-//
-// Each invocation of batch emits an `id` scalar which will be used to identify
-// this particular invocation when doing unbatch or its gradient.
-//
-// Each op which emits a non-empty batch will also emit a non-empty batch_index
-// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
-// start, and length of elements of each set of Tensors present in batched_tensors.
+// Creates a TensorList by indexing into a Tensor.
 //
-// Batched tensors are concatenated along the first dimension, and all tensors in
-// in_tensors must have the first dimension of the same size.
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
 //
-// in_tensors: The tensors to be batched.
-// num_batch_threads: Number of scheduling threads for processing batches of work.
-//  Determines the number of batches processed in parallel.
-// max_batch_size: Batch sizes will never be bigger than this.
-// batch_timeout_micros: Maximum number of microseconds to wait before outputting
-//  an incomplete batch.
-// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
-//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
-//  batches up to one of those sizes. The entries must increase monotonically, and
-//  the final entry must equal max_batch_size.
-// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
-// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
-// batch_index: If out_tensors is non-empty, has information to invert it.
-// container: Controls the scope of sharing of this batch.
-// id: always contains a scalar with a unique ID for this invocation of Batch.
-// shared_name: Concurrently running instances of batch in the same device with the
-//  same container and shared_name will batch their elements together. If left
-//  empty, the op name will be used as the shared name.
-// T: the types of tensors to be batched.
-func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// element_shape: The shape of the elements in the list (can be less specified than
+//   the shape of the tensor).
+// output_handle: The TensorList.
+func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Batch",
+		Type: "TensorListScatter",
 		Input: []tf.Input{
-			tf.OutputList(in_tensors),
+			tensor, indices, element_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
-		scope.UpdateErr("Batch", err)
-		return
-	}
-	batch_index = op.Output(idx)
-	id = op.Output(idx)
-	return batched_tensors, batch_index, id
+	return op.Output(0)
 }
 
-// Adjust the hue of one or more images.
+// Returns a `RaggedTensor` containing the specified sequences of numbers.
+//
 //
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
+// `rt_nested_splits`, such that
+// `result[i] = range(starts[i], limits[i], deltas[i])`.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
+// ```python
+// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
+// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
+// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// >>> print result.eval().tolist()
+// [[2],               # result[0] = range(2, 3)
+//  [],                # result[1] = range(5, 5)
+//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
+// ```
+//
+// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
+// The vector inputs must all have the same size.  Scalar inputs are broadcast
+// to match the size of the vector inputs.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
+//	starts: The starts of each range.
+//	limits: The limits of each range.
+//	deltas: The deltas of each range.
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+// Returns The `row_splits` for the returned `RaggedTensor`.The `inner_values` for the returned `RaggedTensor`.
+func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "RaggedRange",
 		Input: []tf.Input{
-			images, delta,
+			starts, limits, deltas,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
-
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// Deprecated, use python implementation tf.linalg.matrix_exponential.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "MatrixExponential",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
+type QueueDequeueUpToV2Attr func(optionalAttr)
+
+// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Update '*var' according to the Adam algorithm.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+// This operation is not supported by all queues.  If a queue does not support
+// DequeueUpTo, then an Unimplemented error is returned.
+//
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
+//
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size n in the 0th dimension.
+//
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
+		Type: "QueueDequeueUpToV2",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueUpToV2", err)
+		return
+	}
+	return components
 }
 
-// Store the input tensor in the state of the current session.
+// Computes the Cholesky decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
+//
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
+//
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+//
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
 //
 // Arguments:
-//	value: The tensor to be stored.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
+		Type: "Cholesky",
 		Input: []tf.Input{
-			value,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
-type ResizeBicubicGradAttr func(optionalAttr)
-
-// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
+// Writes contents to the file at input filename. Creates file and recursively
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Computes the gradient of bicubic interpolation.
+// creates directory if not existing.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubicGrad",
+		Type: "WriteFile",
 		Input: []tf.Input{
-			grads, original_image,
+			filename, contents,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
 
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Resize `images` to `size` using nearest neighbor interpolation.
+// Computes the "logical and" of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26886,9 +27301,9 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "All",
 		Input: []tf.Input{
-			images, size,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -26896,71 +27311,90 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
-
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
+//
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+// are sorted in non-decreasing order.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SelfAdjointEig",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient of nearest neighbor interpolation.
+// Computes softplus gradients for a softplus operation.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
+		Type: "SoftplusGrad",
 		Input: []tf.Input{
-			grads, size,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
 
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
 //
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
+// If not specified, defaults to true
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["compute_v"] = value
 	}
 }
 
-// Extract the shape information of a JPEG-encoded image.
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
-// This op only parses the image header, so it is much faster than DecodeJpeg.
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+// are sorted in non-decreasing order.
+//
+// ```python
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
 //
 // Arguments:
-//	contents: 0-D. The JPEG-encoded image.
+//	input: `Tensor` input of shape `[N, N]`.
 //
-// Returns 1-D. The image shape with format [height, width, channels].
-func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26969,254 +27403,215 @@ func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegS
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractJpegShape",
+		Type: "SelfAdjointEigV2",
 		Input: []tf.Input{
-			contents,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
-type PaddingFIFOQueueV2Attr func(optionalAttr)
-
-// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
+// Adjust the saturation of one or more images.
 //
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types.
-// Shapes of fixed rank but variable size are allowed by setting
-// any shape dimension to -1.  In this case, the inputs' shape may vary along
-// the given dimension, and DequeueMany will pad the given dimension with
-// zeros up to the maximum shape of all elements in the given batch.
-// If the length of this attr is 0, different queue elements may have
-// different ranks and shapes, but only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
 //
-// REQUIRES: len(value) >= 0
-func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// PaddingFIFOQueueV2Container sets the optional container attribute to value.
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustSaturation",
+		Input: []tf.Input{
+			images, scale,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
+
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
+// If not specified, defaults to false
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["adjoint"] = value
 	}
 }
 
-// A queue that produces elements in first-in first-out order.
+// Solves systems of linear equations.
 //
-// Variable-size shapes are allowed by setting the corresponding shape dimensions
-// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-// size of any given element in the minibatch.  See below for details.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns The handle to the queue.
-func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PaddingFIFOQueueV2",
-
+		Type: "MatrixSolve",
+		Input: []tf.Input{
+			matrix, rhs,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
-
-// DecodePngChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
-//
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+// Returns a serialized GraphDef representing `input_dataset`.
 //
-// This op also supports decoding JPEGs and non-animated GIFs since the interface
-// is the same, though it is cleaner to use `tf.image.decode_image`.
+// Returns a graph representation for `input_dataset`.
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns The graph representation of the dataset (as serialized GraphDef).
+func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "DatasetToGraph",
 		Input: []tf.Input{
-			contents,
+			input_dataset,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
+// Computes the matrix square root of one or more square matrices:
 //
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
+// matmul(sqrtm(A), sqrtm(A)) = A
 //
-//     convert $src.gif -coalesce $dst.gif
+// The input matrix should be invertible. If the input matrix is real, it should
+// have no eigenvalues which are real and negative (pairs of complex conjugate
+// eigenvalues are allowed).
 //
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
+// The matrix square root is computed by first reducing the matrix to
+// quasi-triangular form with the real Schur decomposition. The square root
+// of the quasi-triangular matrix is then computed directly. Details of
+// the algorithm can be found in: Nicholas J. Higham, "Computing real
+// square roots of a real matrix", Linear Algebra Appl., 1987.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the matrix square root for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.sqrtm
+// @end_compatibility
+func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "MatrixSquareRoot",
 		Input: []tf.Input{
-			contents,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
 
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// SvdComputeUv sets the optional compute_uv attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["compute_uv"] = value
 	}
 }
 
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// SvdFullMatrices sets the optional full_matrices attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// Computes the singular value decompositions of one or more matrices.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
+		Type: "Svd",
 		Input: []tf.Input{
-			true_classes,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -27224,27 +27619,28 @@ func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_tr
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SerializeSparseAttr is an optional argument to SerializeSparse.
-type SerializeSparseAttr func(optionalAttr)
+// PrintV2Attr is an optional argument to PrintV2.
+type PrintV2Attr func(optionalAttr)
 
-// SerializeSparseOutType sets the optional out_type attribute to value.
+// PrintV2OutputStream sets the optional output_stream attribute to value.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+// value: A string specifying the output stream or logging level to print to.
+// If not specified, defaults to "stderr"
+func PrintV2OutputStream(value string) PrintV2Attr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["output_stream"] = value
 	}
 }
 
-// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+// Prints a string scalar.
+//
+// Prints a string scalar to the desired output_stream.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+//	input: The string scalar to print.
+//
+// Returns the created operation.
+func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27253,538 +27649,522 @@ func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "PrintV2",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			input,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
 
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
 // If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+// Enqueues zero or more tuples of one or more tensors in the given queue.
 //
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
 //
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// Arguments:
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueEnqueueManyV2",
+		Input: []tf.Input{
+			handle, tf.OutputList(components),
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// A queue that randomizes the order of elements.
+// Computes the product along segments of a tensor.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
 //
 // Arguments:
-//	component_types: The type of each component in a value.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
-		Attrs: attrs,
+		Type: "SegmentProd",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
+// Converts one or more images from RGB to HSV.
 //
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// Parts of the bounding box may fall outside the image.
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			images, boxes,
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator.
+// Does nothing. Only useful as a placeholder for control edges.
 //
-// This operation is a synchronous version IteratorGetNext. It should only be used
-// in situations where the iterator does not block the calling thread, or where
-// the calling thread is not a member of the thread pool used to execute parallel
-// operations (e.g. in eager mode).
-func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNextSync",
-		Input: []tf.Input{
-			iterator,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNextSync", err)
-		return
+		Type: "NoOp",
 	}
-	return components
+	return scope.AddOperation(opspec)
 }
 
-// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
-type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["delete_old_dirs"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// Arguments:
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MergeV2Checkpoints",
+		Input: []tf.Input{
+			checkpoint_prefixes, destination_prefix,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
+// Saves input tensors slices to disk.
 //
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
 //
-// For example,
+// Elements of the `shapes_and_slices` input must either be:
 //
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
 //
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// See also `Save`.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//	min_object_covered: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBoxV2",
+		Type: "SaveSlices",
 		Input: []tf.Input{
-			image_size, bounding_boxes, min_object_covered,
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
-
-// ExtractGlimpseCentered sets the optional centered attribute to value.
-//
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["centered"] = value
-	}
-}
-
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
-//
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["normalized"] = value
 	}
+	return scope.AddOperation(opspec)
 }
 
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
-//
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["uniform_noise"] = value
-	}
-}
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
 
-// Extracts a glimpse from the input tensor.
-//
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
-//
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `Tensor` inputs.
 //
-// The argument `normalized` and `centered` controls how the windows are built:
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
 //
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			input, size, offsets,
+			set1, set2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// A container for an iterator resource.
+// Generate a sharded filename. The filename is printf formatted as
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator"
-// or "IteratorGetNext" op.
-func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Iterator",
+		Type: "ShardedFilename",
+		Input: []tf.Input{
+			basename, shard, num_shards,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-		Attrs: attrs,
+// BatchToSpace for N-D tensors of type T.
+//
+// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
+// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+// the input.  The spatial dimensions of this intermediate result are then
+// optionally cropped according to `crops` to produce the output.  This is the
+// reverse of SpaceToBatch.  See below for a precise description.
+//
+// Arguments:
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has M dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+//   required that
+//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+//
+// This operation is equivalent to the following steps:
+//
+// 1. Reshape `input` to `reshaped` of shape:
+//      [block_shape[0], ..., block_shape[M-1],
+//       batch / prod(block_shape),
+//       input_shape[1], ..., input_shape[N-1]]
+//
+// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1], block_shape[0],
+//       ...,
+//       input_shape[M], block_shape[M-1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0],
+//       ...,
+//       input_shape[M] * block_shape[M-1],
+//
+//       input_shape[M+1],
+//       ...,
+//       input_shape[N-1]]
+//
+// 4. Crop the start and end of dimensions `[1, ..., M]` of
+//    `reshaped_permuted` according to `crops` to produce the output of shape:
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+//       ...,
+//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BatchToSpaceND",
+		Input: []tf.Input{
+			input, block_shape, crops,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
 
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
+// UnpackAxis sets the optional axis attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["axis"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
+//
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
+//
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
+//
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
 //
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{"num": num}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "Unpack",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
-type ShuffleDatasetAttr func(optionalAttr)
-
-// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
-//
-// value: If true, each iterator over this dataset will be given
-// a different pseudorandomly generated seed, based on a sequence seeded by the
-// `seed` and `seed2` inputs. If false, each iterator will be given the same
-// seed, and repeated iteration over this dataset will yield the exact same
-// sequence of results.
-// If not specified, defaults to true
-func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
-	return func(m optionalAttr) {
-		m["reshuffle_each_iteration"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
 	}
+	return output
 }
 
-// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
 //
 // Arguments:
-//
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
 //
 //
-func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "ShuffleDataset",
+		Type: "ResourceCountUpTo",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2,
+			resource,
 		},
 		Attrs: attrs,
 	}
@@ -27792,69 +28172,79 @@ func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output
 	return op.Output(0)
 }
 
-// 3D fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
+// Delete the stack from its resource container.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
+//	handle: The handle to a stack.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT3D",
+		Type: "StackCloseV2",
 		Input: []tf.Input{
-			input,
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ShardedFilespec",
+		Input: []tf.Input{
+			basename, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
 
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["skip_header_lines"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+// TextLineReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the lines of a file delimited by '\n'.
+//
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27863,65 +28253,141 @@ func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
-		Input: []tf.Input{
-			grads, image, boxes, box_ind,
-		},
+		Type: "TextLineReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Saves tensors in V2 checkpoint format.
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
+
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
 //
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+	return func(m optionalAttr) {
+		m["max_rows_in_memory"] = value
+	}
+}
+
+// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
+//
+// at `ckpt_path` and potentially reorders its rows and columns using the
+// specified remappings.
+//
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
+//
+// The remappings are 1-D tensors with the following properties:
+//
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
+//
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+//
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
+//
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
+//
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
 //
 // Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "LoadAndRemapMatrix",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
 		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
-type StatsAggregatorHandleAttr func(optionalAttr)
-
-// StatsAggregatorHandleContainer sets the optional container attribute to value.
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
 // If not specified, defaults to ""
-func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["shared_name"] = value
 	}
 }
 
-// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
 // If not specified, defaults to ""
-func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["compression_type"] = value
 	}
 }
 
-// Creates a statistics manager resource.
-func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
+// A Reader that outputs the records from a TensorFlow Records file.
+//
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27930,7 +28396,7 @@ func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatsAggregatorHandle",
+		Type: "TFRecordReaderV2",
 
 		Attrs: attrs,
 	}
@@ -27938,148 +28404,119 @@ func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr)
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
-//
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
+
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV2",
+		Type: "QuantizeAndDequantizeV3",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold,
+			input, input_min, input_max, num_bits,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
+
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the queued work as both the key and value.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
+//
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV3",
-		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
-		},
+		Type: "IdentityReaderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
-type NonMaxSuppressionV4Attr func(optionalAttr)
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
 
-// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, the output `selected_indices` is padded to be of length
-// `max_output_size`. Defaults to false.
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["pad_to_max_output_size"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
-// `selected_indices`, with the valid elements appearing first.
-func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28088,79 +28525,65 @@ func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV4",
+		Type: "ResourceApplyGradientDescent",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
+			var_, alpha, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the matrix logarithm of one or more square matrices:
-//
-//
-// \\(log(exp(A)) = A\\)
-//
-// This op is only defined for complex matrices. If A is positive-definite and
-// real, then casting to a complex matrix, taking the logarithm and casting back
-// to a real matrix will give the correct result.
-//
-// This function computes the matrix logarithm using the Schur-Parlett algorithm.
-// Details of the algorithm can be found in Section 11.6.2 of:
-// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
-// ISBN 978-0-898716-46-7.
+// Returns the next record (key, value pair) produced by a Reader.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.logm
-// @end_compatibility
-func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixLogarithm",
+		Type: "ReaderReadV2",
 		Input: []tf.Input{
-			input,
+			reader_handle, queue_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-//   This op is used as a placeholder in If branch functions. It doesn't provide a
-//   valid output when run, so must either be removed (e.g. replaced with a
-//   function input) or guaranteed not to be used (e.g. if mirroring an
-//   intermediate output needed for the gradient computation of the other branch).
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
 //
 // Arguments:
-//	dtype: The type of the output.
-//	shape:     The purported shape of the output. This is only used for shape inference;
-//     the output will not necessarily have this shape. Can be a partial shape.
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
 //
-// Returns     \"Fake\" output value. This should not be consumed by another op.
-func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "FakeParam",
-
-		Attrs: attrs,
+		Type: "ReaderReadUpToV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle, num_records,
+		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
 //     Adds v into specified rows of x.
@@ -28206,124 +28629,254 @@ func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	return scope.AddOperation(opspec)
 }
 
-// A dataset that splits the elements of its input into multiple elements.
-func UnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// BatchAttr is an optional argument to Batch.
+type BatchAttr func(optionalAttr)
+
+// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
+// If not specified, defaults to 10
+func BatchMaxEnqueuedBatches(value int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["max_enqueued_batches"] = value
+	}
+}
+
+// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
+// If not specified, defaults to <>
+func BatchAllowedBatchSizes(value []int64) BatchAttr {
+	return func(m optionalAttr) {
+		m["allowed_batch_sizes"] = value
+	}
+}
+
+// BatchContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func BatchContainer(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// BatchSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func BatchSharedName(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// BatchBatchingQueue sets the optional batching_queue attribute to value.
+// If not specified, defaults to ""
+func BatchBatchingQueue(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["batching_queue"] = value
+	}
+}
+
+// Batches all input tensors nondeterministically.
+//
+// When many instances of this Op are being run concurrently with the same
+// container/shared_name in the same device, some will output zero-shaped Tensors
+// and others will output Tensors of size up to max_batch_size.
+//
+// All Tensors in in_tensors are batched together (so, for example, labels and
+// features should be batched with a single instance of this operation.
+//
+// Each invocation of batch emits an `id` scalar which will be used to identify
+// this particular invocation when doing unbatch or its gradient.
+//
+// Each op which emits a non-empty batch will also emit a non-empty batch_index
+// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+// start, and length of elements of each set of Tensors present in batched_tensors.
+//
+// Batched tensors are concatenated along the first dimension, and all tensors in
+// in_tensors must have the first dimension of the same size.
+//
+// in_tensors: The tensors to be batched.
+// num_batch_threads: Number of scheduling threads for processing batches of work.
+//  Determines the number of batches processed in parallel.
+// max_batch_size: Batch sizes will never be bigger than this.
+// batch_timeout_micros: Maximum number of microseconds to wait before outputting
+//  an incomplete batch.
+// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+//  batches up to one of those sizes. The entries must increase monotonically, and
+//  the final entry must equal max_batch_size.
+// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+// batch_index: If out_tensors is non-empty, has information to invert it.
+// container: Controls the scope of sharing of this batch.
+// id: always contains a scalar with a unique ID for this invocation of Batch.
+// shared_name: Concurrently running instances of batch in the same device with the
+//  same container and shared_name will batch their elements together. If left
+//  empty, the op name will be used as the shared name.
+// T: the types of tensors to be batched.
+func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnbatchDataset",
+		Type: "Batch",
 		Input: []tf.Input{
-			input_dataset,
+			tf.OutputList(in_tensors),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
+		scope.UpdateErr("Batch", err)
+		return
+	}
+	batch_index = op.Output(idx)
+	id = op.Output(idx)
+	return batched_tensors, batch_index, id
 }
 
-// RpcAttr is an optional argument to Rpc.
-type RpcAttr func(optionalAttr)
-
-// RpcProtocol sets the optional protocol attribute to value.
+// Adjust the hue of one or more images.
 //
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func RpcProtocol(value string) RpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
+//
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "AdjustHue",
+		Input: []tf.Input{
+			images, delta,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RpcFailFast sets the optional fail_fast attribute to value.
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func RpcFailFast(value bool) RpcAttr {
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["fail_fast"] = value
+		m["use_locking"] = value
 	}
 }
 
-// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func RpcTimeoutInMs(value int64) RpcAttr {
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the RPC method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
+// Update '*var' according to the Adam algorithm.
 //
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 //
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
+// Returns the created operation.
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdam",
+		Input: []tf.Input{
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Store the input tensor in the state of the current session.
 //
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
+// Arguments:
+//	value: The tensor to be stored.
 //
-// If the connection fails or the remote worker returns an error
-// status, the op reraises this exception locally.
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandleV2",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
+type ResizeBicubicGradAttr func(optionalAttr)
+
+// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
 //
-// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of bicubic interpolation.
 //
 // Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
-func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28332,9 +28885,9 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Rpc",
+		Type: "ResizeBicubicGrad",
 		Input: []tf.Input{
-			address, method, request,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
@@ -28342,104 +28895,113 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 	return op.Output(0)
 }
 
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
 
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["align_corners"] = value
 	}
 }
 
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Resize `images` to `size` using nearest neighbor interpolation.
 //
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapStageContainer sets the optional container attribute to value.
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeNearestNeighbor",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
+
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
+// Computes the gradient of nearest neighbor interpolation.
 //
 // Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
 //
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
+			grads, size,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
 
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
 //
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
 	return func(m optionalAttr) {
-		m["swap_memory"] = value
+		m["output_type"] = value
 	}
 }
 
-// Push an element onto the stack.
+// Extract the shape information of a JPEG-encoded image.
+//
+// This op only parses the image header, so it is much faster than DecodeJpeg.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
+//	contents: 0-D. The JPEG-encoded image.
 //
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+// Returns 1-D. The image shape with format [height, width, channels].
+func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28448,9 +29010,9 @@ func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...Sta
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StackPushV2",
+		Type: "ExtractJpegShape",
 		Input: []tf.Input{
-			handle, elem,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -28458,153 +29020,143 @@ func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...Sta
 	return op.Output(0)
 }
 
-// StringSplitV2Attr is an optional argument to StringSplitV2.
-type StringSplitV2Attr func(optionalAttr)
+// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
+type PaddingFIFOQueueV2Attr func(optionalAttr)
 
-// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
+// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
-// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
-// If not specified, defaults to -1
-func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types.
+// Shapes of fixed rank but variable size are allowed by setting
+// any shape dimension to -1.  In this case, the inputs' shape may vary along
+// the given dimension, and DequeueMany will pad the given dimension with
+// zeros up to the maximum shape of all elements in the given batch.
+// If the length of this attr is 0, different queue elements may have
+// different ranks and shapes, but only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["maxsplit"] = value
+		m["shapes"] = value
 	}
 }
 
-// Split elements of `source` based on `sep` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `source` based on `sep` and return a `SparseTensor`
-// containing the split tokens. Empty tokens are ignored.
+// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
-// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
-// then the output will be
-// ```
-// st.indices = [0, 0;
-//               0, 1;
-//               1, 0;
-//               1, 1;
-//               1, 2]
-// st.shape = [2, 3]
-// st.values = ['hello', 'world', 'a', 'b', 'c']
-// ```
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// PaddingFIFOQueueV2Container sets the optional container attribute to value.
 //
-// If `sep` is given, consecutive delimiters are not grouped together and are
-// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
-// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
-// string, consecutive whitespace are regarded as a single separator, and the
-// result will contain no empty strings at the startor end if the string has
-// leading or trailing whitespace.
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// Note that the above mentioned behavior matches python's str.split.
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements in first-in first-out order.
+//
+// Variable-size shapes are allowed by setting the corresponding shape dimensions
+// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+// size of any given element in the minibatch.  See below for details.
 //
 // Arguments:
-//	input: `1-D` string `Tensor`, the strings to split.
-//	sep: `0-D` string `Tensor`, the delimiter character.
-func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
+//	component_types: The type of each component in a value.
+//
+// Returns The handle to the queue.
+func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplitV2",
-		Input: []tf.Input{
-			input, sep,
-		},
+		Type: "PaddingFIFOQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softsign",
-		Input: []tf.Input{
-			features,
-		},
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
+
+// DecodePngChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodePngChannels(value int64) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// EncodeProtoAttr is an optional argument to EncodeProto.
-type EncodeProtoAttr func(optionalAttr)
-
-// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
-// If not specified, defaults to "local://"
-func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
 	return func(m optionalAttr) {
-		m["descriptor_source"] = value
+		m["dtype"] = value
 	}
 }
 
-// The op serializes protobuf messages provided in the input tensors.
-//
-// The types of the tensors in `values` must match the schema for the
-// fields specified in `field_names`. All the tensors in `values` must
-// have a common shape prefix, *batch_shape*.
-//
-// The `sizes` tensor specifies repeat counts for each field.  The repeat
-// count (last dimension) of a each tensor in `values` must be greater
-// than or equal to corresponding repeat count in `sizes`.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
 //
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
 //
-// There are a few special cases in the value mapping:
+// Accepted values are:
 //
-// Submessage and group fields must be pre-serialized as TensorFlow strings.
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
-// TensorFlow lacks support for unsigned int64s, so they must be
-// represented as `tf.int64` with the same twos-complement bit pattern
-// (the obvious way).
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
 //
-// Unsigned int32 values can be represented exactly with `tf.int64`, or
-// with sign wrapping if the input is of type `tf.int32`.
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-//	values: List of tensors containing values for the corresponding field.
-//	field_names: List of strings containing proto field names.
-//	message_type: Name of the proto message type to decode.
+//	contents: 0-D.  The PNG-encoded image.
 //
-// Returns Tensor of serialized protos with shape `batch_shape`.
-func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeProto",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			sizes, tf.OutputList(values),
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -28612,151 +29164,139 @@ func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names
 	return op.Output(0)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
-//
-// If the given TensorArray gradient already exists, returns a reference to it.
-//
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
-//
-// **A note about the input flow_in:**
-//
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
-//
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
-//
-// **A note about the source attribute:**
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
 //
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
 //
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
+//     convert $src.gif -coalesce $dst.gif
 //
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	contents: 0-D.  The GIF-encoded image.
+//
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
+		Type: "DecodeGif",
 		Input: []tf.Input{
-			handle, flow_in,
+			contents,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
-		Input: []tf.Input{
-			indices, values, dense_shape,
-		},
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
-//
-// If `x` and `y` are reals, this will return the floating-point division.
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RealDiv",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatenateDataset",
+		Type: "LearnedUnigramCandidateSampler",
 		Input: []tf.Input{
-			input_dataset, another_dataset,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds a value to the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the incremented value or a subsequent newer one.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
+
+// SerializeSparseOutType sets the optional out_type attribute to value.
 //
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LatencyStatsDataset",
+		Type: "SerializeSparse",
 		Input: []tf.Input{
-			input_dataset, tag,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -28764,310 +29304,406 @@ func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, o
 	return op.Output(0)
 }
 
-// MapSizeAttr is an optional argument to MapSize.
-type MapSizeAttr func(optionalAttr)
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
 
-// MapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// REQUIRES: value >= 0
-func MapSizeCapacity(value int64) MapSizeAttr {
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["shapes"] = value
 	}
 }
 
-// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
-// REQUIRES: value >= 0
-func MapSizeMemoryLimit(value int64) MapSizeAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["capacity"] = value
 	}
 }
 
-// MapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapSizeContainer(value string) MapSizeAttr {
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+//
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["min_after_dequeue"] = value
 	}
 }
 
-// MapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapSizeSharedName(value string) MapSizeAttr {
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["seed"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapSize",
-
-		Attrs: attrs,
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
 //
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
-		Input: []tf.Input{
-			json_examples,
-		},
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
-
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
-//
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
+// A queue that randomizes the order of elements.
 //
 // Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+//	component_types: The type of each component in a value.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
-		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
-		},
+		Type: "RandomShuffleQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
-//
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
-//
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
+// Draw bounding boxes on a batch of images.
 //
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
 //
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
+// Parts of the bounding box may fall outside the image.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
 //
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2D",
+		Type: "DrawBoundingBoxes",
 		Input: []tf.Input{
-			input, filter,
+			images, boxes,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts the given variant tensor to an iterator and stores it in the given resource.
-//
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
-//	serialized: A variant tensor storing the state of the iterator contained in the
-// resource.
+// Gets the next output from the given iterator.
 //
-// Returns the created operation.
-func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
+// This operation is a synchronous version IteratorGetNext. It should only be used
+// in situations where the iterator does not block the calling thread, or where
+// the calling thread is not a member of the thread pool used to execute parallel
+// operations (e.g. in eager mode).
+func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DeserializeIterator",
+		Type: "IteratorGetNextSync",
 		Input: []tf.Input{
-			resource_handle, serialized,
+			iterator,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNextSync", err)
+		return
+	}
+	return components
 }
 
-// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
-type TensorArrayConcatV2Attr func(optionalAttr)
+// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
+type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
 
-// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
+// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["seed"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayConcatV3
-func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
+// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+//
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV2",
+		Type: "SampleDistortedBoundingBoxV2",
 		Input: []tf.Input{
-			handle, flow_in,
+			image_size, bounding_boxes, min_object_covered,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
-//
-// Arguments:
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
+
+// ExtractGlimpseCentered sets the optional centered attribute to value.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["centered"] = value
+	}
+}
+
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
 //
-func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["normalized"] = value
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "PaddedBatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
-		},
-		Attrs: attrs,
+}
+
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+//
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+	return func(m optionalAttr) {
+		m["uniform_noise"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that batches input elements into a SparseTensor.
+// Extracts a glimpse from the input tensor.
+//
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlaps the inputs, the non overlapping areas will be filled with
+// random noise.
+//
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
 //
-// Arguments:
-//	input_dataset: A handle to an input dataset. Must have a single component.
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
+// The argument `normalized` and `centered` controls how the windows are built:
+//
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
 //
+// Arguments:
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, following
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
 //
-func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseBatchDataset",
+		Type: "ExtractGlimpse",
 		Input: []tf.Input{
-			input_dataset, batch_size, row_shape,
+			input, size, offsets,
 		},
 		Attrs: attrs,
 	}
@@ -29075,132 +29711,72 @@ func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
+// A container for an iterator resource.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
+		Type: "Iterator",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SubstrAttr is an optional argument to Substr.
-type SubstrAttr func(optionalAttr)
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
 
-// SubstrUnit sets the optional unit attribute to value.
-// If not specified, defaults to "BYTE"
-func SubstrUnit(value string) SubstrAttr {
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
 	return func(m optionalAttr) {
-		m["unit"] = value
+		m["method"] = value
 	}
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// A negative `pos` indicates distance within the string backwards from the end.
-//
-// If `pos` specifies an index which is out of range for any of the input strings,
-// then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
-//
-// Examples
-//
-// Using scalar `pos` and `len`:
-//
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
-//
-// output = [b'ell', b'orl']
-// ```
-//
-// Using `pos` and `len` with same shape as `input`:
-//
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
-//
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
-// ```
-//
-// Broadcasting `pos` and `len` onto `input`:
-//
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
-//
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
-//
-// Broadcasting `input` onto `pos` and `len`:
-//
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
-//
-// output = [b'hir', b'ee', b'n']
-// ```
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
 //
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "CropAndResizeGradImage",
 		Input: []tf.Input{
-			input, pos, len,
+			grads, boxes, box_ind, image_size,
 		},
 		Attrs: attrs,
 	}
@@ -29208,34 +29784,24 @@ func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optiona
 	return op.Output(0)
 }
 
-// Creates a Dataset that returns pseudorandom numbers.
-//
-// Arguments:
-//	seed: A scalar seed for the random number generator. If either seed or
-// seed2 is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//
+// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
+type ShuffleDatasetAttr func(optionalAttr)
+
+// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
 //
-func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "RandomDataset",
-		Input: []tf.Input{
-			seed, seed2,
-		},
-		Attrs: attrs,
+// value: If true, each iterator over this dataset will be given
+// a different pseudorandomly generated seed, based on a sequence seeded by the
+// `seed` and `seed2` inputs. If false, each iterator will be given the same
+// seed, and repeated iteration over this dataset will yield the exact same
+// sequence of results.
+// If not specified, defaults to true
+func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
+	return func(m optionalAttr) {
+		m["reshuffle_each_iteration"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that shuffles and repeats elements from `input_dataset`
-//
-// pseudorandomly.
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
 // Arguments:
 //
@@ -29246,19 +29812,20 @@ func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types [
 // `seed2` is set to be non-zero, the random number generator is seeded
 // by the given seed.  Otherwise, a random seed is used.
 //	seed2: A second scalar seed to avoid seed collision.
-//	count: A scalar representing the number of times the underlying dataset
-// should be repeated. The default is `-1`, which results in infinite repetition.
 //
 //
-func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ShuffleAndRepeatDataset",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2, count,
+			input_dataset, buffer_size, seed, seed2,
 		},
 		Attrs: attrs,
 	}
@@ -29266,412 +29833,373 @@ func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size
 	return op.Output(0)
 }
 
-// Creates a dataset that caches elements from `input_dataset`.
+// 3D fast Fourier transform.
 //
-// A CacheDataset will iterate over the input_dataset, and store tensors. If the
-// cache already exists, the cache will be used. If the cache is inappropriate
-// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-// will the returned when used.
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
 //
 // Arguments:
+//	input: A complex64 tensor.
 //
-//	filename: A path on the filesystem where we should cache the dataset. Note: this
-// will be a directory.
-//
-//
-func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "CacheDataset",
-		Input: []tf.Input{
-			input_dataset, filename,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that emits the records from one or more binary files.
-//
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	header_bytes: A scalar representing the number of bytes to skip at the
-// beginning of a file.
-//	record_bytes: A scalar representing the number of bytes in each record.
-//	footer_bytes: A scalar representing the number of bytes to skip at the end
-// of a file.
-//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
-func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordDataset",
+		Type: "FFT3D",
 		Input: []tf.Input{
-			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gradients for batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
+
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
 //
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
+	return func(m optionalAttr) {
+		m["method"] = value
+	}
+}
+
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "CropAndResizeGradBoxes",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			grads, image, boxes, box_ind,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the records from one or more TFRecord files.
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	filenames: A scalar or vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar representing the number of bytes to buffer. A value of
-// 0 means no buffering will be performed.
-func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordDataset",
+		Type: "NonMaxSuppressionV3",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// A container for an iterator resource.
-//
-// Returns A handle to the iterator that can be passed to a "MakeIterator" or
-// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents
-// resource sharing by name, and does not keep a reference to the resource
-// container.
-func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "AnonymousIterator",
+// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
+type NonMaxSuppressionV4Attr func(optionalAttr)
 
-		Attrs: attrs,
+// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
+//
+// value: If true, the output `selected_indices` is padded to be of length
+// `max_output_size`. Defaults to false.
+// If not specified, defaults to false
+func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+	return func(m optionalAttr) {
+		m["pad_to_max_output_size"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
-//
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-//
-//
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[2, 2, 4, 1]` and value:
-//
-// ```
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
+//
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
+// `selected_indices`, with the valid elements appearing first.
+func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
+		Type: "NonMaxSuppressionV4",
 		Input: []tf.Input{
-			input, crops,
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Makes a new iterator from the given `dataset` and stores it in `iterator`.
+// Computes the matrix logarithm of one or more square matrices:
 //
-// This operation may be executed multiple times. Each execution will reset the
-// iterator in `iterator` to the first element of `dataset`.
 //
-// Returns the created operation.
-func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
+// \\(log(exp(A)) = A\\)
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
+// @end_compatibility
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MakeIterator",
+		Type: "MatrixLogarithm",
 		Input: []tf.Input{
-			dataset, iterator,
+			input,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Adjust the contrast of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-// interpreted as `[height, width, channels]`.  The other dimensions only
-// represent a collection of images, such as `[batch, height, width, channels].`
-//
-// Contrast is adjusted independently for each channel of each image.
-//
-// For each channel, the Op first computes the mean of the image pixels in the
-// channel and then adjusts each component of each pixel to
-// `(x - mean) * contrast_factor + mean`.
+//   This op is used as a placeholder in If branch functions. It doesn't provide a
+//   valid output when run, so must either be removed (e.g. replaced with a
+//   function input) or guaranteed not to be used (e.g. if mirroring an
+//   intermediate output needed for the gradient computation of the other branch).
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	contrast_factor: A float multiplier for adjusting contrast.
+//	dtype: The type of the output.
+//	shape:     The purported shape of the output. This is only used for shape inference;
+//     the output will not necessarily have this shape. Can be a partial shape.
 //
-// Returns The contrast-adjusted image or images.
-func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+// Returns     \"Fake\" output value. This should not be consumed by another op.
+func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrastv2",
-		Input: []tf.Input{
-			images, contrast_factor,
-		},
+		Type: "FakeParam",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator .
-func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNext",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			iterator,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNext", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Outputs the single element from the given dataset.
-//
-// Arguments:
-//	dataset: A handle to a dataset that contains a single element.
-//
-//
+// List of the given size with empty elements.
 //
-// Returns The components of the single element of `input`.
-func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "DatasetToSingleElement",
+		Type: "TensorListReserve",
 		Input: []tf.Input{
-			dataset,
+			element_shape, num_elements,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("DatasetToSingleElement", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Converts the given `resource_handle` representing an iterator to a string.
+// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
+// `N` data inputs should produce the next output element.
+//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
+// the values of `selector_input_dataset`.
 //
-// Returns A string representation of the given handle.
-func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+//
+func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IteratorToStringHandle",
+		Type: "ExperimentalDirectedInterleaveDataset",
 		Input: []tf.Input{
-			resource_handle,
+			selector_input_dataset, tf.OutputList(data_input_datasets),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
-type IteratorFromStringHandleAttr func(optionalAttr)
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
 
-// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
-//
-// value: If specified, defines the type of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["output_types"] = value
+		m["seed"] = value
 	}
 }
 
-// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
-//
-// value: If specified, defines the shape of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["output_shapes"] = value
+		m["seed2"] = value
 	}
 }
 
-// Converts the given string representing a handle to an iterator to a resource.
+// Outputs random integers from a uniform distribution.
+//
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
 //
 // Arguments:
-//	string_handle: A string representation of the given handle.
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-// Returns A handle to an iterator resource.
-func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29680,478 +30208,455 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorFromStringHandle",
-		Input: []tf.Input{
-			string_handle,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Gather slices from `params` axis `axis` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-// params.shape[axis + 1:]` where:
-//
-// ```python
-//     # Scalar indices (output is rank(params) - 1).
-//     output[a_0, ..., a_n, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
-//
-//     # Vector indices (output is rank(params)).
-//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
-//
-//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
-//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-// ```
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-//
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, a 0 is stored in the
-// corresponding output value.
-//
-// See also `tf.batch_gather` and `tf.gather_nd`.
-//
-// Arguments:
-//	params: The tensor from which to gather values. Must be at least rank
-// `axis + 1`.
-//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
-//	axis: The axis in `params` to gather `indices` from. Defaults to the first
-// dimension. Supports negative indexes.
-//
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
-func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GatherV2",
+		Type: "RandomUniformInt",
 		Input: []tf.Input{
-			params, indices, axis,
+			shape, minval, maxval,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
+// Add the quantile summaries to each quantile stream resource.
+//
+// An op that adds a list of quantile summaries to a quantile stream resource. Each
+// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
+// for a single feature.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
 //
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeIterator",
+		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
 		Input: []tf.Input{
-			resource_handle,
+			quantile_stream_resource_handle, tf.OutputList(summaries),
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
-type FIFOQueueV2Attr func(optionalAttr)
-
-// FIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
+// StringSplitV2Attr is an optional argument to StringSplitV2.
+type StringSplitV2Attr func(optionalAttr)
 
-// FIFOQueueV2Capacity sets the optional capacity attribute to value.
+// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
+// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
 // If not specified, defaults to -1
-func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
+func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["maxsplit"] = value
 	}
 }
 
-// FIFOQueueV2Container sets the optional container attribute to value.
+// Split elements of `source` based on `sep` into a `SparseTensor`.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `source` based on `sep` and return a `SparseTensor`
+// containing the split tokens. Empty tokens are ignored.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that produces elements in first-in first-out order.
+// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
+// then the output will be
+// ```
+// st.indices = [0, 0;
+//               0, 1;
+//               1, 0;
+//               1, 1;
+//               1, 2]
+// st.shape = [2, 3]
+// st.values = ['hello', 'world', 'a', 'b', 'c']
+// ```
 //
-// Arguments:
-//	component_types: The type of each component in a value.
+// If `sep` is given, consecutive delimiters are not grouped together and are
+// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+// string, consecutive whitespace are regarded as a single separator, and the
+// result will contain no empty strings at the startor end if the string has
+// leading or trailing whitespace.
 //
-// Returns The handle to the queue.
-func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
+// Note that the above mentioned behavior matches python's str.split.
+//
+// Arguments:
+//	input: `1-D` string `Tensor`, the strings to split.
+//	sep: `0-D` string `Tensor`, the delimiter character.
+func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FIFOQueueV2",
-
+		Type: "StringSplitV2",
+		Input: []tf.Input{
+			input, sep,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Produces a summary of any statistics recorded by the given statistics manager.
-func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+//
+// Arguments:
+//
+//	thread_pool: A resource produced by the ThreadPoolHandle op.
+//
+//
+func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "StatsAggregatorSummary",
+		Type: "ExperimentalThreadPoolDataset",
 		Input: []tf.Input{
-			iterator,
+			input_dataset, thread_pool,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the pairwise cross product.
-//
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
-//
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
-//
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cross",
+		Type: "Softsign",
 		Input: []tf.Input{
-			a, b,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes the given dataset to the given file using the TFRecord format.
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
+
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+	return func(m optionalAttr) {
+		m["descriptor_source"] = value
+	}
+}
+
+// The op serializes protobuf messages provided in the input tensors.
+//
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
+//
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of a each tensor in `values` must be greater
+// than or equal to corresponding repeat count in `sizes`.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// There are a few special cases in the value mapping:
+//
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
+//
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
+//
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
 //
 // Arguments:
-//	input_dataset: A variant tensor representing the dataset to write.
-//	filename: A scalar string tensor representing the filename to use.
-//	compression_type: A scalar string tensor containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to decode.
 //
-// Returns the created operation.
-func DatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DatasetToTFRecord",
+		Type: "EncodeProto",
 		Input: []tf.Input{
-			input_dataset, filename, compression_type,
+			sizes, tf.OutputList(values),
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
-
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
+// Creates a TensorArray for storing the gradients of values in the given handle.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs 3D average pooling on the input.
+// If the given TensorArray gradient already exists, returns a reference to it.
+//
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
+//
+// **A note about the input flow_in:**
+//
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
+//
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
+//
+// **A note about the source attribute:**
+//
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
+//
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
 //
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
 //
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
+		Type: "TensorArrayGradV3",
 		Input: []tf.Input{
-			input,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// A placeholder for input pipeline graph optimizations.
-//
-// A placeholder for input pipeline graph optimizations.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-func SinkDataset(scope *Scope, input_dataset tf.Output) (handle tf.Output) {
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SinkDataset",
+		Type: "SparseTensorSliceDataset",
 		Input: []tf.Input{
-			input_dataset,
+			indices, values, dense_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Constructs an Optional variant from a tuple of tensors.
-func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
+// Returns x / y element-wise for real types.
+//
+// If `x` and `y` are reals, this will return the floating-point division.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OptionalFromValue",
+		Type: "RealDiv",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
-type DecodeProtoV2Attr func(optionalAttr)
-
-// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
-//
-// value: Either the special value `local://` or a path to a file containing
-// a serialized `FileDescriptorSet`.
-// If not specified, defaults to "local://"
-func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["descriptor_source"] = value
-	}
-}
-
-// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
-//
-// value: Either `binary` or `text`.
-// If not specified, defaults to "binary"
-func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["message_format"] = value
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
-//
-// value: Whether to sanitize the result or not.
-// If not specified, defaults to false
-func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["sanitize"] = value
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ConcatenateDataset",
+		Input: []tf.Input{
+			input_dataset, another_dataset,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// The op extracts fields from a serialized protocol buffers message into tensors.
-//
-// The `decode_proto` op extracts fields from a serialized protocol buffers
-// message into tensors.  The fields in `field_names` are decoded and converted
-// to the corresponding `output_types` if possible.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// Each output tensor is a dense tensor. This means that it is padded to
-// hold the largest number of repeated elements seen in the input
-// minibatch. (The shape is also padded by one to prevent zero-sized
-// dimensions). The actual repeat counts for each example in the
-// minibatch can be found in the `sizes` output. In many cases the output
-// of `decode_proto` is fed immediately into tf.squeeze if missing values
-// are not a concern. When using tf.squeeze, always pass the squeeze
-// dimension explicitly to avoid surprises.
-//
-// For the most part, the mapping between Proto field types and
-// TensorFlow dtypes is straightforward. However, there are a few
-// special cases:
-//
-// - A proto field that contains a submessage or group can only be converted
-// to `DT_STRING` (the serialized submessage). This is to reduce the
-// complexity of the API. The resulting string can be used as input
-// to another instance of the decode_proto op.
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
-// - TensorFlow lacks support for unsigned integers. The ops represent uint64
-// types as a `DT_INT64` with the same twos-complement bit pattern
-// (the obvious way). Unsigned int32 values can be represented exactly by
-// specifying type `DT_INT64`, or using twos-complement if the caller
-// specifies `DT_INT32` in the `output_types` attribute.
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
 //
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
 //
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
 //
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
 //
-// Both binary and text proto serializations are supported, and can be
-// chosen using the `format` attribute.
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
 //
 // Arguments:
-//	bytes: Tensor of serialized protos with shape `batch_shape`.
-//	message_type: Name of the proto message type to decode.
-//	field_names: List of strings containing proto field names.
-//	output_types: List of TF types to use for the respective field in field_names.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-// Each entry is the number of values found for the corresponding field.
-// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
-// `values[i]` has datatype `output_types[i]`
-// and shape `[batch_shape, max(sizes[...,i])]`.
-func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "DecodeProtoV2",
+		Type: "Dilation2D",
 		Input: []tf.Input{
-			bytes,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	sizes = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("DecodeProtoV2", err)
-		return
-	}
-	return sizes, values
+	return op.Output(0)
 }
 
-// Creates an Optional variant with no value.
-func OptionalNone(scope *Scope) (optional tf.Output) {
+// Converts the given variant tensor to an iterator and stores it in the given resource.
+//
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//	serialized: A variant tensor storing the state of the iterator contained in the
+// resource.
+//
+// Returns the created operation.
+func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OptionalNone",
+		Type: "DeserializeIterator",
+		Input: []tf.Input{
+			resource_handle, serialized,
+		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns true if and only if the given Optional variant has a value.
-func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
+// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
+type TensorArrayConcatV2Attr func(optionalAttr)
+
+// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape_except0"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayConcatV3
+func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "OptionalHasValue",
+		Type: "TensorArrayConcatV2",
 		Input: []tf.Input{
-			optional,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that executes a SQL query and emits rows of the result set.
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
 // Arguments:
-//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
-//	data_source_name: A connection string to connect to the database.
-//	query: A SQL query to execute.
 //
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
 //
-func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SqlDataset",
+		Type: "PaddedBatchDataset",
 		Input: []tf.Input{
-			driver_name, data_source_name, query,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
 		},
 		Attrs: attrs,
 	}
@@ -30159,42 +30664,61 @@ func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output,
 	return op.Output(0)
 }
 
-// Returns the value stored in an Optional variant or raises an error if none exists.
-func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// Creates a dataset that shuffles and repeats elements from `input_dataset`
+//
+// pseudorandomly.
+//
+// Arguments:
+//
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//	count: A scalar representing the number of times the underlying dataset
+// should be repeated. The default is `-1`, which results in infinite repetition.
+//
+//
+func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "OptionalGetValue",
+		Type: "ShuffleAndRepeatDataset",
 		Input: []tf.Input{
-			optional,
+			input_dataset, buffer_size, seed, seed2, count,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("OptionalGetValue", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Gets the next output from the given iterator as an Optional variant.
-func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
+// Creates a dataset that caches elements from `input_dataset`.
+//
+// A CacheDataset will iterate over the input_dataset, and store tensors. If the
+// cache already exists, the cache will be used. If the cache is inappropriate
+// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
+// will the returned when used.
+//
+// Arguments:
+//
+//	filename: A path on the filesystem where we should cache the dataset. Note: this
+// will be a directory.
+//
+//
+func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNextAsOptional",
+		Type: "CacheDataset",
 		Input: []tf.Input{
-			iterator,
+			input_dataset, filename,
 		},
 		Attrs: attrs,
 	}
@@ -30202,682 +30726,673 @@ func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []
 	return op.Output(0)
 }
 
-// Fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
+// Creates a dataset that emits the records from one or more binary files.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	header_bytes: A scalar representing the number of bytes to skip at the
+// beginning of a file.
+//	record_bytes: A scalar representing the number of bytes in each record.
+//	footer_bytes: A scalar representing the number of bytes to skip at the end
+// of a file.
+//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
+func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT",
+		Type: "FixedLengthRecordDataset",
 		Input: []tf.Input{
-			input,
+			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Identity transformation that models performance.
+// Gradients for batch normalization.
 //
-// Identity transformation that models performance.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
+// This op is deprecated. See `tf.nn.batch_normalization`.
 //
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
 //
-func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "ModelDataset",
+		Type: "BatchNormWithGlobalNormalizationGrad",
 		Input: []tf.Input{
-			input_dataset,
+			t, m, v, gamma, backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Performs a padding as a preprocess during a convolution.
-//
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// Creates a dataset that emits the records from one or more TFRecord files.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+//	filenames: A scalar or vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar representing the number of bytes to buffer. A value of
+// 0 means no buffering will be performed.
+func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
+		Type: "TFRecordDataset",
 		Input: []tf.Input{
-			input, paddings, filter,
+			filenames, compression_type, buffer_size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
-
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+// A container for an iterator resource.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+// Returns A handle to the iterator that can be passed to a "MakeIterator" or
+// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents
+// resource sharing by name, and does not keep a reference to the resource
+// container.
+func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
-		},
+		Type: "AnonymousIterator",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
+// BatchToSpace for 4-D tensors of type T.
+//
+// This is a legacy version of the more general BatchToSpaceND.
+//
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
+//
+// Arguments:
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
+//
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
+//
+//
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
+//
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
+//
+// The attr `block_size` must be greater than one. It indicates the block size.
 //
-// Builds a merged tensor such that
+// Some examples:
 //
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
 // ```
 //
-// For example, if each `indices[m]` is scalar or vector, we have
+// The output tensor has shape `[1, 2, 2, 1]` and value:
 //
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
 //
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
 // ```
 //
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
+// The output tensor has shape `[1, 2, 2, 3]` and value:
 //
-//     merged.shape = [max(indices)] + constant
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
 //
-// Values are merged in order, so if an index appears in both `indices[m][i]` and
-// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
-// merged result. If you do not need this guarantee, ParallelDynamicStitch might
-// perform better on some devices.
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
 //
-// For example:
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
 // ```
 //
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
 // ```
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "DynamicStitch",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			input, crops,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x == y) element-wise.
+// Makes a new iterator from the given `dataset` and stores it in `iterator`.
 //
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// This operation may be executed multiple times. Each execution will reset the
+// iterator in `iterator` to the first element of `dataset`.
+//
+// Returns the created operation.
+func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Equal",
+		Type: "MakeIterator",
 		Input: []tf.Input{
-			x, y,
+			dataset, iterator,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
-type TensorArrayGatherV2Attr func(optionalAttr)
-
-// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Deprecated. Use TensorArrayGatherV3
+// Adjust the contrast of one or more images.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
-func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
+// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+// interpreted as `[height, width, channels]`.  The other dimensions only
+// represent a collection of images, such as `[batch, height, width, channels].`
+//
+// Contrast is adjusted independently for each channel of each image.
+//
+// For each channel, the Op first computes the mean of the image pixels in the
+// channel and then adjusts each component of each pixel to
+// `(x - mean) * contrast_factor + mean`.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	contrast_factor: A float multiplier for adjusting contrast.
+//
+// Returns The contrast-adjusted image or images.
+func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV2",
+		Type: "AdjustContrastv2",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			images, contrast_factor,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
-//
-// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-// and `indices[n][j]`, the result may be invalid. This differs from the normal
-// DynamicStitch operator that defines the behavior in that case.
-//
-// For example:
-//
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
-//
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
-//
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// Gets the next output from the given iterator .
+func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ParallelDynamicStitch",
+		Type: "IteratorGetNext",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			iterator,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNext", err)
+		return
+	}
+	return components
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// Outputs the single element from the given dataset.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Arguments:
+//	dataset: A handle to a dataset that contains a single element.
+//
+//
+//
+// Returns The components of the single element of `input`.
+func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "InvGrad",
+		Type: "DatasetToSingleElement",
 		Input: []tf.Input{
-			y, dy,
+			dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("DatasetToSingleElement", err)
+		return
+	}
+	return components
 }
 
-// List of the given size with empty elements.
+// Converts the given `resource_handle` representing an iterator to a string.
 //
-// element_shape: the shape of the future elements of the list
-// num_elements: the number of elements to reserve
-// handle: the output list
-// element_dtype: the desired type of elements in the list.
-func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//
+// Returns A string representation of the given handle.
+func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListReserve",
+		Type: "IteratorToStringHandle",
 		Input: []tf.Input{
-			element_shape, num_elements,
+			resource_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
-type PriorityQueueV2Attr func(optionalAttr)
+// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
+type IteratorFromStringHandleAttr func(optionalAttr)
 
-// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
+// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// value: The type of each component in a value.
+// value: If specified, defines the type of each tuple component in an
+// element produced by the resulting iterator.
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["component_types"] = value
-	}
-}
-
-// PriorityQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
+func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["output_types"] = value
 	}
 }
 
-// PriorityQueueV2Container sets the optional container attribute to value.
+// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
+// value: If specified, defines the shape of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["output_shapes"] = value
 	}
 }
 
-// A queue that produces elements sorted by the first component value.
-//
-// Note that the PriorityQueue requires the first component of any element
-// to be a scalar int64, in addition to the other elements declared by
-// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-// entry in their input (resp. output) lists.
+// Converts the given string representing a handle to an iterator to a resource.
 //
 // Arguments:
-//	shapes: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
+//	string_handle: A string representation of the given handle.
 //
-// Returns The handle to the queue.
-func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
+// Returns A handle to an iterator resource.
+func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shapes": shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PriorityQueueV2",
-
+		Type: "IteratorFromStringHandle",
+		Input: []tf.Input{
+			string_handle,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnstageAttr is an optional argument to Unstage.
-type UnstageAttr func(optionalAttr)
-
-// UnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Gather slices from `params` axis `axis` according to `indices`.
 //
-// REQUIRES: value >= 0
-func UnstageCapacity(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+// params.shape[axis + 1:]` where:
+//
+// ```python
+//     # Scalar indices (output is rank(params) - 1).
+//     output[a_0, ..., a_n, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
+//
+//     # Vector indices (output is rank(params)).
+//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
+//
+//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
+//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
+// See also `tf.batch_gather` and `tf.gather_nd`.
+//
+// Arguments:
+//	params: The tensor from which to gather values. Must be at least rank
+// `axis + 1`.
+//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+//	axis: The axis in `params` to gather `indices` from. Defaults to the first
+// dimension. Supports negative indexes.
+//
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "GatherV2",
+		Input: []tf.Input{
+			params, indices, axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Converts the given `resource_handle` representing an iterator to a variant tensor.
 //
-// REQUIRES: value >= 0
-func UnstageMemoryLimit(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//
+// Returns A variant tensor storing the state of the iterator contained in the
+// resource.
+func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeIterator",
+		Input: []tf.Input{
+			resource_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnstageContainer(value string) UnstageAttr {
+// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
+type FIFOQueueV2Attr func(optionalAttr)
+
+// FIFOQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["shapes"] = value
 	}
 }
 
-// UnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnstageSharedName(value string) UnstageAttr {
+// FIFOQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["capacity"] = value
 	}
 }
 
-// Op is similar to a lightweight Dequeue.
+// FIFOQueueV2Container sets the optional container attribute to value.
 //
-// The basic functionality is similar to dequeue with many fewer
-// capabilities and options.  This Op is optimized for performance.
-func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Unstage",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("Unstage", err)
-		return
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	return values
 }
 
-// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
-type QueueEnqueueV2Attr func(optionalAttr)
-
-// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If the queue is full, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Enqueues a tuple of one or more tensors in the given queue.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
-//
-// N.B. If the queue is full, this operation will block until the given
-// element has been enqueued (or 'timeout_ms' elapses, if specified).
+// A queue that produces elements in first-in first-out order.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should be taken.
+//	component_types: The type of each component in a value.
 //
-// Returns the created operation.
-func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
+// Returns The handle to the queue.
+func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueV2",
-		Input: []tf.Input{
-			handle, tf.OutputList(components),
-		},
+		Type: "FIFOQueueV2",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the Bessel i0e function of `x` element-wise.
-//
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
-//
-// This function is faster and numerically stabler than `bessel_i0(x)`.
-func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
+// Constructs an Optional variant from a tuple of tensors.
+func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BesselI0e",
+		Type: "OptionalFromValue",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(components),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
-type QueueDequeueManyV2Attr func(optionalAttr)
+// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
+type DecodeProtoV2Attr func(optionalAttr)
 
-// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
 //
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
+// value: Either the special value `local://` or a path to a file containing
+// a serialized `FileDescriptorSet`.
+// If not specified, defaults to "local://"
+func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
+// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
 //
-// If the queue is closed and there are fewer than `n` elements, then an
-// OutOfRange error is returned.
+// value: Either `binary` or `text`.
+// If not specified, defaults to "binary"
+func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["message_format"] = value
+	}
+}
+
+// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
 //
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size `n` in the 0th dimension.
+// value: Whether to sanitize the result or not.
+// If not specified, defaults to false
+func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+	return func(m optionalAttr) {
+		m["sanitize"] = value
+	}
+}
+
+// The op extracts fields from a serialized protocol buffers message into tensors.
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// The `decode_proto` op extracts fields from a serialized protocol buffers
+// message into tensors.  The fields in `field_names` are decoded and converted
+// to the corresponding `output_types` if possible.
 //
-// N.B. If the queue is empty, this operation will block until `n` elements
-// have been dequeued (or 'timeout_ms' elapses, if specified).
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// Each output tensor is a dense tensor. This means that it is padded to
+// hold the largest number of repeated elements seen in the input
+// minibatch. (The shape is also padded by one to prevent zero-sized
+// dimensions). The actual repeat counts for each example in the
+// minibatch can be found in the `sizes` output. In many cases the output
+// of `decode_proto` is fed immediately into tf.squeeze if missing values
+// are not a concern. When using tf.squeeze, always pass the squeeze
+// dimension explicitly to avoid surprises.
+//
+// For the most part, the mapping between Proto field types and
+// TensorFlow dtypes is straightforward. However, there are a few
+// special cases:
+//
+// - A proto field that contains a submessage or group can only be converted
+// to `DT_STRING` (the serialized submessage). This is to reduce the
+// complexity of the API. The resulting string can be used as input
+// to another instance of the decode_proto op.
+//
+// - TensorFlow lacks support for unsigned integers. The ops represent uint64
+// types as a `DT_INT64` with the same twos-complement bit pattern
+// (the obvious way). Unsigned int32 values can be represented exactly by
+// specifying type `DT_INT64`, or using twos-complement if the caller
+// specifies `DT_INT32` in the `output_types` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// Both binary and text proto serializations are supported, and can be
+// chosen using the `format` attribute.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
+//	bytes: Tensor of serialized protos with shape `batch_shape`.
+//	message_type: Name of the proto message type to decode.
+//	field_names: List of strings containing proto field names.
+//	output_types: List of TF types to use for the respective field in field_names.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
+// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+// Each entry is the number of values found for the corresponding field.
+// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
+// `values[i]` has datatype `output_types[i]`
+// and shape `[batch_shape, max(sizes[...,i])]`.
+func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueManyV2",
+		Type: "DecodeProtoV2",
 		Input: []tf.Input{
-			handle, n,
+			bytes,
 		},
 		Attrs: attrs,
 	}
@@ -30887,264 +31402,261 @@ func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_t
 	}
 	var idx int
 	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueManyV2", err)
+	sizes = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("DecodeProtoV2", err)
 		return
 	}
-	return components
+	return sizes, values
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
-
-// EncodeBase64Pad sets the optional pad attribute to value.
-//
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
-	return func(m optionalAttr) {
-		m["pad"] = value
+// Creates an Optional variant with no value.
+func OptionalNone(scope *Scope) (optional tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OptionalNone",
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Encode strings into web-safe base64 format.
-//
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
-//
-// Web-safe means that the encoder uses - and _ instead of + and /.
-//
-// Arguments:
-//	input: Strings to be encoded.
-//
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns true if and only if the given Optional variant has a value.
+func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "OptionalHasValue",
 		Input: []tf.Input{
-			input,
+			optional,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// A dataset that creates window datasets from the input dataset.
-//
-// Arguments:
-//
-//	size: A scalar representing the number of elements to accumulate in a window.
-//	shift: A scalar representing the steps moving the sliding window forward in one
-// iteration. It must be positive.
-//	stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
-// smaller than desired.
-//
-//
-func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the value stored in an Optional variant or raises an error if none exists.
+func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "WindowDataset",
+		Type: "OptionalGetValue",
 		Input: []tf.Input{
-			input_dataset, size, shift, stride, drop_remainder,
+			optional,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("OptionalGetValue", err)
+		return
+	}
+	return components
 }
 
-// Deprecated. Use TensorArrayCloseV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
-//
-// Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Gets the next output from the given iterator as an Optional variant.
+func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
+		Type: "IteratorGetNextAsOptional",
 		Input: []tf.Input{
-			handle,
+			iterator,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Forwards the value of an available tensor from `inputs` to `output`.
-//
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
+// Fast Fourier transform.
 //
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
 //
 // Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
+//	input: A complex64 tensor.
 //
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Merge",
+		Type: "FFT",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
-
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
-//
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
-	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
-	}
+	return op.Output(0)
 }
 
-// Closes the given queue.
+// Identity transformation that models performance.
 //
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
+// Identity transformation that models performance.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	input_dataset: A variant tensor representing the input dataset.
 //
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+//
+func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
+		Type: "ModelDataset",
 		Input: []tf.Input{
-			handle,
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes inverse hyperbolic tangent of x element-wise.
-func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the truth value of (x > y) element-wise.
+//
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atanh",
+		Type: "Greater",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns true if queue is closed.
+// Performs a padding as a preprocess during a convolution.
 //
-// This operation returns true if the queue is closed and false if the queue
-// is open.
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	handle: The handle to a queue.
-func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QueueIsClosedV2",
+		Type: "FusedPadConv2D",
 		Input: []tf.Input{
-			handle,
+			input, paddings, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the absolute value of a tensor.
-//
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Abs",
-		Input: []tf.Input{
-			x,
-		},
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
+
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StackV2Attr is an optional argument to StackV2.
-type StackV2Attr func(optionalAttr)
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
 
-// StackV2StackName sets the optional stack_name attribute to value.
+// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
 //
-// value: Overrides the name used for the temporary stack resource. Default
-// value is the name of the 'Stack' op (which is guaranteed unique).
-// If not specified, defaults to ""
-func StackV2StackName(value string) StackV2Attr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["stack_name"] = value
+		m["dilations"] = value
 	}
 }
 
-// A stack that produces elements in first-in last-out order.
+// Computes the gradients of convolution with respect to the input.
 //
 // Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StackV2",
+		Type: "Conv2DBackpropInput",
 		Input: []tf.Input{
-			max_size,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -31152,600 +31664,519 @@ func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional .
 	return op.Output(0)
 }
 
-// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
-type FusedBatchNormGradV2Attr func(optionalAttr)
-
-// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+// Builds a merged tensor such that
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Gradient for batch normalization.
+// For example, if each `indices[m]` is scalar or vector, we have
+//
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
+//
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
+//
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
+//
+//     merged.shape = [max(indices)] + constant
+//
+// Values are merged in order, so if an index appears in both `indices[m][i]` and
+// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
+// merged result. If you do not need this guarantee, ParallelDynamicStitch might
+// perform better on some devices.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
 //
-// Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGradV2",
+		Type: "DynamicStitch",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// DecodeCompressedAttr is an optional argument to DecodeCompressed.
-type DecodeCompressedAttr func(optionalAttr)
-
-// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
-//
-// value: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-// If not specified, defaults to ""
-func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
-	}
+	return op.Output(0)
 }
 
-// Decompress strings.
-//
-// This op decompresses each element of the `bytes` input `Tensor`, which
-// is assumed to be compressed using the given `compression_type`.
-//
-// The `output` is a string `Tensor` of the same shape as `bytes`,
-// each element containing the decompressed data from the corresponding
-// element in `bytes`.
-//
-// Arguments:
-//	bytes: A Tensor of string which is compressed.
+// Returns the truth value of (x == y) element-wise.
 //
-// Returns A Tensor with the same shape as input `bytes`, uncompressed
-// from bytes.
-func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCompressed",
+		Type: "Equal",
 		Input: []tf.Input{
-			bytes,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNAttr is an optional argument to CudnnRNN.
-type CudnnRNNAttr func(optionalAttr)
-
-// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNRnnMode(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNInputMode(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNDirection(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNDropout(value float32) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// CudnnRNNSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed2(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
+// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
+type TensorArrayGatherV2Attr func(optionalAttr)
 
-// CudnnRNNIsTraining sets the optional is_training attribute to value.
-// If not specified, defaults to true
-func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
+// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["element_shape"] = value
 	}
 }
 
-// A RNN backed by cuDNN.
-//
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer.
+// Deprecated. Use TensorArrayGatherV3
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is false.
-func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
+func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNN",
+		Type: "TensorArrayGatherV2",
 		Input: []tf.Input{
-			input, input_h, input_c, params,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Creates a TensorArray for storing multiple gradients of values in the given handle.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// Similar to TensorArrayGradV3. However it creates an accumulator with an
-// expanded shape compared to the input TensorArray whose gradient is being
-// computed. This enables multiple gradients for the same TensorArray to be
-// calculated using the same accumulator.
+// Builds a merged tensor such that
 //
-// Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
-// have shape which is this shape_to_prepend value concatenated with shape of the
-// elements in the TensorArray corresponding to the input handle.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"source": source}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGradWithShape",
-		Input: []tf.Input{
-			handle, flow_in, shape_to_prepend,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
 //
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
+// For example, if each `indices[m]` is scalar or vector, we have
+//
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
+//
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
+//
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
+//
+//     merged.shape = [max(indices)] + constant
+//
+// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
+// and `indices[n][j]`, the result may be invalid. This differs from the normal
+// DynamicStitch operator that defines the behavior in that case.
 //
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
+// For example:
 //
 // ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
 // ```
 //
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
-//
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
 //
-// Arguments:
-//	input: Values to compare against `threshold` and bitpack.
-//	threshold: Threshold to compare against.
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
 //
-// Returns The bitpacked comparisons.
-func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "CompareAndBitpack",
+		Type: "ParallelDynamicStitch",
 		Input: []tf.Input{
-			input, threshold,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Push an element onto the tensor_array.
+// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
+type PriorityQueueV2Attr func(optionalAttr)
+
+// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	index: The position to write to inside the TensorArray.
-//	value: The tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+// value: The type of each component in a value.
+// If not specified, defaults to <>
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: len(value) >= 0
+func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["component_types"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV3",
-		Input: []tf.Input{
-			handle, index, value, flow_in,
-		},
+}
+
+// PriorityQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Scatter the data from the input value into specific TensorArray elements.
+// PriorityQueueV2Container sets the optional container attribute to value.
 //
-// `indices` must be a vector, its length must match the first dim of `value`.
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements sorted by the first component value.
+//
+// Note that the PriorityQueue requires the first component of any element
+// to be a scalar int64, in addition to the other elements declared by
+// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+// entry in their input (resp. output) lists.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations at which to write the tensor elements.
-//	value: The concatenated tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	shapes: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Returns The handle to the queue.
+func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shapes": shapes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV3",
-		Input: []tf.Input{
-			handle, indices, value, flow_in,
-		},
+		Type: "PriorityQueueV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EmptyAttr is an optional argument to Empty.
-type EmptyAttr func(optionalAttr)
+// UnstageAttr is an optional argument to Unstage.
+type UnstageAttr func(optionalAttr)
 
-// EmptyInit sets the optional init attribute to value.
+// UnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
-// If not specified, defaults to false
-func EmptyInit(value bool) EmptyAttr {
+// REQUIRES: value >= 0
+func UnstageCapacity(value int64) UnstageAttr {
 	return func(m optionalAttr) {
-		m["init"] = value
+		m["capacity"] = value
 	}
 }
 
-// Creates a tensor with the given shape.
-//
-// This operation creates a tensor of `shape` and `dtype`.
-//
-// Arguments:
-//	shape: 1-D. Represents the shape of the output tensor.
+// UnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
+// REQUIRES: value >= 0
+func UnstageMemoryLimit(value int64) UnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// UnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnstageContainer(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnstageSharedName(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op is similar to a lightweight Dequeue.
 //
-// Returns A `Tensor` of type `T`.
-func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
+// The basic functionality is similar to dequeue with many fewer
+// capabilities and options.  This Op is optimized for performance.
+func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Empty",
-		Input: []tf.Input{
-			shape,
-		},
+		Type: "Unstage",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("Unstage", err)
+		return
+	}
+	return values
 }
 
-// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
-type TensorArrayConcatV3Attr func(optionalAttr)
+// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
+type QueueEnqueueV2Attr func(optionalAttr)
 
-// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: The expected shape of an element, if known,
-// excluding the first dimension. Used to validate the shapes of
-// TensorArray elements. If this shape is not fully specified, concatenating
-// zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
+// value: If the queue is full, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Concat the elements from the TensorArray into value `value`.
-//
-// Takes `T` elements of shapes
-//
-//   ```
-//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
-//   ```
-//
-// and concatenates them into a Tensor of shape:
+// Enqueues a tuple of one or more tensors in the given queue.
 //
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
 //
-// All elements must have the same shape (excepting the first dimension).
+// N.B. If the queue is full, this operation will block until the given
+// element has been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should be taken.
 //
-// Returns All of the elements in the TensorArray, concatenated along the first
-// axis.A vector of the row sizes of the original T elements in the
-// value output.  In the example above, this would be the values:
-// `(n1, n2, ..., n(T-1))`.
-func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
+// Returns the created operation.
+func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV3",
+		Type: "QueueEnqueueV2",
 		Input: []tf.Input{
-			handle, flow_in,
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Split the data from the input value into TensorArray elements.
-//
-// Assuming that `lengths` takes on values
-//
-//   ```(n0, n1, ..., n(T-1))```
-//
-// and that `value` has shape
-//
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
-//
-// this splits values into a TensorArray with T tensors.
-//
-// TensorArray index t will be the subtensor of values with starting position
-//
-//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
-//
-// and having size
-//
-//   ```nt x d0 x d1 x ...```
+// Computes the Bessel i0e function of `x` element-wise.
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	value: The concatenated tensor to write to the TensorArray.
-//	lengths: The vector of lengths, how to split the rows of value into the
-// TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// This function is faster and numerically stabler than `bessel_i0(x)`.
+func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV3",
+		Type: "BesselI0e",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for the scaled exponential linear (Selu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Selu operation.
-//	outputs: The outputs of the corresponding Selu operation.
+// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
+type QueueDequeueManyV2Attr func(optionalAttr)
+
+// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// Returns The gradients: `gradients * (outputs + scale * alpha)`
-// if outputs < 0, `scale * gradients` otherwise.
-func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SeluGrad",
-		Input: []tf.Input{
-			gradients, outputs,
-		},
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Get the current size of the TensorArray.
+// Dequeues `n` tuples of one or more tensors from the given queue.
+//
+// If the queue is closed and there are fewer than `n` elements, then an
+// OutOfRange error is returned.
+//
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size `n` in the 0th dimension.
+//
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until `n` elements
+// have been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns The current size of the TensorArray.
-func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV3",
+		Type: "QueueDequeueManyV2",
 		Input: []tf.Input{
-			handle, flow_in,
+			handle, n,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
-		Input: []tf.Input{
-			handle, index, value, flow_in,
-		},
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueManyV2", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
-type SparseReduceMaxAttr func(optionalAttr)
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
 
-// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: Bool whether padding is applied at the ends.
 // If not specified, defaults to false
-func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["pad"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// Encode strings into web-safe base64 format.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// Web-safe means that the encoder uses - and _ instead of + and /.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	input: Strings to be encoded.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -31754,9 +32185,9 @@ func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMax",
+		Type: "EncodeBase64",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -31764,79 +32195,28 @@ func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
-
-// AsStringPrecision sets the optional precision attribute to value.
-//
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
-// If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["precision"] = value
-	}
-}
-
-// AsStringScientific sets the optional scientific attribute to value.
-//
-// value: Use scientific notation for floating point numbers.
-// If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["scientific"] = value
-	}
-}
-
-// AsStringShortest sets the optional shortest attribute to value.
+// A dataset that creates window datasets from the input dataset.
 //
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
-// If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["shortest"] = value
-	}
-}
-
-// AsStringWidth sets the optional width attribute to value.
+// Arguments:
 //
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["width"] = value
-	}
-}
-
-// AsStringFill sets the optional fill attribute to value.
+//	size: A scalar representing the number of elements to accumulate in a window.
+//	shift: A scalar representing the steps moving the sliding window forward in one
+// iteration. It must be positive.
+//	stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
+// smaller than desired.
 //
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
-// If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
-	return func(m optionalAttr) {
-		m["fill"] = value
-	}
-}
-
-// Converts each entry in the given tensor to strings.  Supports many numeric
 //
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "AsString",
+		Type: "WindowDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, size, shift, stride, drop_remainder,
 		},
 		Attrs: attrs,
 	}
@@ -31844,408 +32224,407 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayScatterV3
+// Deprecated. Use TensorArrayCloseV3
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
-func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+//
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV2",
+		Type: "TensorArrayCloseV2",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Applies sparse addition to `input` using individual values or slices
-//
-// from `updates` according to indices `indices`.  The updates are non-aliasing:
-// `input` is only modified in-place if no other operations will use it.
-// Otherwise, a copy of `input` is made.  This operation has a gradient with
-// respect to both `input` and `updates`.
-//
-// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `input`.
-// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-// (if `K < P`) along the `K`th dimension of `input`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
-//
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-// elements. In Python, that addition would look like this:
-//
-//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(output))
-//
-// The resulting value `output` would look like this:
+// Forwards the value of an available tensor from `inputs` to `output`.
 //
-//     [1, 13, 3, 14, 14, 6, 7, 20]
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
 //
-// See `tf.scatter_nd` for more details about how to make updates to slices.
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
 //
 // Arguments:
-//	input: A Tensor.
-//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
-// A tensor of indices into `input`.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
-// to add to `input`.
+//	inputs: The input tensors, exactly one of which will become available.
 //
-// Returns A `Tensor` with the same shape as `input`, containing values of `input`
-// updated with `updates`.
-func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ScatterNdNonAliasingAdd",
+		Type: "Merge",
 		Input: []tf.Input{
-			input, indices, updates,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
 
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
 //
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
 // If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
 	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+		m["cancel_pending_enqueues"] = value
 	}
 }
 
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
+// Closes the given queue.
 //
-// `index  0  1  2  3  4`
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
 //
-// `value  20 5  16 3  7`
+// Arguments:
+//	handle: The handle to a queue.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+// Returns the created operation.
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
-//
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+	opspec := tf.OpSpec{
+		Type: "QueueCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atanh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs fractional max pooling on the input.
-//
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
-//
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
-//
-// First we define the following:
-//
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
-//
-// Then, row_pooling_sequence should satisfy:
-//
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
+// Returns true if queue is closed.
 //
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+// This operation returns true if the queue is closed and false if the queue
+// is open.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
-//
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+//	handle: The handle to a queue.
+func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
+		Type: "QueueIsClosedV2",
 		Input: []tf.Input{
-			value,
+			handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a MultiDeviceIterator resource.
-//
-// Arguments:
-//	devices: A list of devices the iterator works across.
-//	shared_name: If non-empty, this resource will be shared under the given name
-// across multiple sessions.
-//	container: If non-empty, this resource is placed in the given container.
-// Otherwise, a default container is used.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
+// Computes the absolute value of a tensor.
 //
-// Returns Handle to the resource created.
-func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIterator",
-
-		Attrs: attrs,
+		Type: "Abs",
+		Input: []tf.Input{
+			x,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySizeV3
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
+
+// StackV2StackName sets the optional stack_name attribute to value.
+//
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
+// If not specified, defaults to ""
+func StackV2StackName(value string) StackV2Attr {
+	return func(m optionalAttr) {
+		m["stack_name"] = value
+	}
+}
+
+// A stack that produces elements in first-in last-out order.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
-func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Arguments:
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
+//
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV2",
+		Type: "StackV2",
 		Input: []tf.Input{
-			handle, flow_in,
+			max_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
 
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["capacity"] = value
 	}
 }
 
-// Conv2DDataFormat sets the optional data_format attribute to value.
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Conv2DDilations sets the optional dilations attribute to value.
+// OrderedMapStageContainer sets the optional container attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DDilations(value []int64) Conv2DAttr {
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["container"] = value
 	}
 }
 
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
 //
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a ordered
 //
-// In detail, with the default NHWC format,
+// associative container.   Elements are ordered by key.
 //
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
+// Arguments:
+//	key: int64
 //
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
 //
-// Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-// `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
 //
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2D",
+		Type: "OrderedMapStage",
 		Input: []tf.Input{
-			input, filter,
+			key, indices, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// StageAttr is an optional argument to Stage.
-type StageAttr func(optionalAttr)
+// RpcAttr is an optional argument to Rpc.
+type RpcAttr func(optionalAttr)
 
-// StageCapacity sets the optional capacity attribute to value.
+// RpcProtocol sets the optional protocol attribute to value.
 //
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// RpcFailFast sets the optional fail_fast attribute to value.
 //
-// REQUIRES: value >= 0
-func StageCapacity(value int64) StageAttr {
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["fail_fast"] = value
 	}
 }
 
-// StageMemoryLimit sets the optional memory_limit attribute to value.
+// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
 //
-// value: The maximum number of bytes allowed for Tensors in the Staging Area.
-// If > 0, inserts will block until sufficient space is available.
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
 // If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageMemoryLimit(value int64) StageAttr {
+func RpcTimeoutInMs(value int64) RpcAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["timeout_in_ms"] = value
 	}
 }
 
-// StageContainer sets the optional container attribute to value.
+// Perform batches of RPC requests.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func StageContainer(value string) StageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Rpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StageSharedName sets the optional shared_name attribute to value.
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
 //
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func StageSharedName(value string) StageAttr {
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["swap_memory"] = value
 	}
 }
 
-// Stage values similar to a lightweight Enqueue.
-//
-// The basic functionality of this Op is similar to a queue with many
-// fewer capabilities and options.  This Op is optimized for performance.
+// Push an element onto the stack.
 //
 // Arguments:
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
 //
-// Returns the created operation.
-func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32254,417 +32633,455 @@ func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Opera
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Stage",
+		Type: "StackPushV2",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			handle, elem,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StagePeekAttr is an optional argument to StagePeek.
-type StagePeekAttr func(optionalAttr)
+// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
+type FusedBatchNormGradV2Attr func(optionalAttr)
 
-// StagePeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
 //
-// REQUIRES: value >= 0
-func StagePeekCapacity(value int64) StagePeekAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["epsilon"] = value
 	}
 }
 
-// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
 //
-// REQUIRES: value >= 0
-func StagePeekMemoryLimit(value int64) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StagePeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StagePeekContainer(value string) StagePeekAttr {
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["data_format"] = value
 	}
 }
 
-// StagePeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StagePeekSharedName(value string) StagePeekAttr {
+// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["is_training"] = value
 	}
 }
 
-// Op peeks at the values at the specified index.  If the
+// Gradient for batch normalization.
 //
-// underlying container does not contain sufficient elements
-// this op will block until it does.   This Op is optimized for
-// performance.
-func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
+//
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StagePeek",
+		Type: "FusedBatchNormGradV2",
 		Input: []tf.Input{
-			index,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Creates a TensorArray for storing multiple gradients of values in the given handle.
+//
+// Similar to TensorArrayGradV3. However it creates an accumulator with an
+// expanded shape compared to the input TensorArray whose gradient is being
+// computed. This enables multiple gradients for the same TensorArray to be
+// calculated using the same accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
+// have shape which is this shape_to_prepend value concatenated with shape of the
+// elements in the TensorArray corresponding to the input handle.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("StagePeek", err)
-		return
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradWithShape",
+		Input: []tf.Input{
+			handle, flow_in, shape_to_prepend,
+		},
+		Attrs: attrs,
 	}
-	return values
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// MapStageAttr is an optional argument to MapStage.
-type MapStageAttr func(optionalAttr)
-
-// MapStageCapacity sets the optional capacity attribute to value.
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
 //
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or and `false` otherwise.
 //
-// REQUIRES: value >= 0
-func MapStageCapacity(value int64) MapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+//
+// Arguments:
+//	input: Values to compare against `threshold` and bitpack.
+//	threshold: Threshold to compare against.
+//
+// Returns The bitpacked comparisons.
+func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "CompareAndBitpack",
+		Input: []tf.Input{
+			input, threshold,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Push an element onto the tensor_array.
 //
-// REQUIRES: value >= 0
-func MapStageMemoryLimit(value int64) MapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	index: The position to write to inside the TensorArray.
+//	value: The tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayWriteV3",
+		Input: []tf.Input{
+			handle, index, value, flow_in,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapStageContainer sets the optional container attribute to value.
+// Scatter the data from the input value into specific TensorArray elements.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func MapStageContainer(value string) MapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// `indices` must be a vector, its length must match the first dim of `value`.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	indices: The locations at which to write the tensor elements.
+//	value: The concatenated tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayScatterV3",
+		Input: []tf.Input{
+			handle, indices, value, flow_in,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapStageSharedName sets the optional shared_name attribute to value.
+// EmptyAttr is an optional argument to Empty.
+type EmptyAttr func(optionalAttr)
+
+// EmptyInit sets the optional init attribute to value.
 //
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func MapStageSharedName(value string) MapStageAttr {
+// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
+// If not specified, defaults to false
+func EmptyInit(value bool) EmptyAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["init"] = value
 	}
 }
 
-// Stage (key, values) in the underlying container which behaves like a hashtable.
+// Creates a tensor with the given shape.
+//
+// This operation creates a tensor of `shape` and `dtype`.
 //
 // Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
+//	shape: 1-D. Represents the shape of the output tensor.
 //
 //
-// Returns the created operation.
-func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
+// Returns A `Tensor` of type `T`.
+func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapStage",
+		Type: "Empty",
 		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
+			shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
+// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
+type TensorArrayConcatV3Attr func(optionalAttr)
 
-// MapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
 //
-// REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
+// value: The expected shape of an element, if known,
+// excluding the first dimension. Used to validate the shapes of
+// TensorArray elements. If this shape is not fully specified, concatenating
+// zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["element_shape_except0"] = value
 	}
 }
 
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Concat the elements from the TensorArray into value `value`.
 //
-// REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the values associated with the key
+// Takes `T` elements of shapes
 //
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
+//   ```
+//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
+//   ```
+//
+// and concatenates them into a Tensor of shape:
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+//
+// All elements must have the same shape (excepting the first dimension).
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns All of the elements in the TensorArray, concatenated along the first
+// axis.A vector of the row sizes of the original T elements in the
+// value output.  In the example above, this would be the values:
+// `(n1, n2, ..., n(T-1))`.
+func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstage",
+		Type: "TensorArrayConcatV3",
 		Input: []tf.Input{
-			key, indices,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
-		return
-	}
-	return values
+	return op.Output(0), op.Output(1)
 }
 
-// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
-type MapIncompleteSizeAttr func(optionalAttr)
-
-// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Split the data from the input value into TensorArray elements.
 //
-// REQUIRES: value >= 0
-func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Assuming that `lengths` takes on values
 //
-// REQUIRES: value >= 0
-func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of incomplete elements in the underlying container.
-func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
+//   ```(n0, n1, ..., n(T-1))```
+//
+// and that `value` has shape
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
+//
+// this splits values into a TensorArray with T tensors.
+//
+// TensorArray index t will be the subtensor of values with starting position
+//
+//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+//
+// and having size
+//
+//   ```nt x d0 x d1 x ...```
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	value: The concatenated tensor to write to the TensorArray.
+//	lengths: The vector of lengths, how to split the rows of value into the
+// TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MapIncompleteSize",
-
-		Attrs: attrs,
+		Type: "TensorArraySplitV3",
+		Input: []tf.Input{
+			handle, value, lengths, flow_in,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Generate the bucket boundaries for each feature based on accumulated summaries.
-//
-// An op that returns a list of float tensors for a quantile stream resource. Each
-// tensor is Rank 1 containing bucket boundaries for a single feature.
+// Computes gradients for the scaled exponential linear (Selu) operation.
 //
 // Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	num_features: inferred int; number of features to get bucket boundaries for.
+//	gradients: The backpropagated gradients to the corresponding Selu operation.
+//	outputs: The outputs of the corresponding Selu operation.
 //
-// Returns float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
-func BoostedTreesQuantileStreamResourceGetBucketBoundaries(scope *Scope, quantile_stream_resource_handle tf.Output, num_features int64) (bucket_boundaries []tf.Output) {
+// Returns The gradients: `gradients * (outputs + scale * alpha)`
+// if outputs < 0, `scale * gradients` otherwise.
+func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_features": num_features}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceGetBucketBoundaries",
+		Type: "SeluGrad",
 		Input: []tf.Input{
-			quantile_stream_resource_handle,
+			gradients, outputs,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if bucket_boundaries, idx, err = makeOutputList(op, idx, "bucket_boundaries"); err != nil {
-		scope.UpdateErr("BoostedTreesQuantileStreamResourceGetBucketBoundaries", err)
-		return
-	}
-	return bucket_boundaries
+	return op.Output(0)
 }
 
-// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage.
-type OrderedMapUnstageAttr func(optionalAttr)
-
-// OrderedMapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Get the current size of the TensorArray.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Returns The current size of the TensorArray.
+func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "TensorArraySizeV3",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op removes and returns the values associated with the key
+// Deprecated. Use TensorArrayGradV3
 //
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstage",
+		Type: "TensorArrayGradV2",
 		Input: []tf.Input{
-			key, indices,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstage", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// BoostedTreesQuantileStreamResourceHandleOpAttr is an optional argument to BoostedTreesQuantileStreamResourceHandleOp.
-type BoostedTreesQuantileStreamResourceHandleOpAttr func(optionalAttr)
-
-// BoostedTreesQuantileStreamResourceHandleOpContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func BoostedTreesQuantileStreamResourceHandleOpContainer(value string) BoostedTreesQuantileStreamResourceHandleOpAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
 
-// BoostedTreesQuantileStreamResourceHandleOpSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func BoostedTreesQuantileStreamResourceHandleOpSharedName(value string) BoostedTreesQuantileStreamResourceHandleOpAttr {
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Creates a handle to a BoostedTreesQuantileStreamResource.
-func BoostedTreesQuantileStreamResourceHandleOp(scope *Scope, optional ...BoostedTreesQuantileStreamResourceHandleOpAttr) (resource tf.Output) {
+// Computes the max of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32673,394 +33090,498 @@ func BoostedTreesQuantileStreamResourceHandleOp(scope *Scope, optional ...Booste
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceHandleOp",
-
+		Type: "SparseReduceMax",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
-type OrderedMapSizeAttr func(optionalAttr)
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
 
-// OrderedMapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// AsStringPrecision sets the optional precision attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["precision"] = value
 	}
 }
 
-// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// AsStringScientific sets the optional scientific attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
+// value: Use scientific notation for floating point numbers.
+// If not specified, defaults to false
+func AsStringScientific(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["scientific"] = value
 	}
 }
 
-// OrderedMapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
+// AsStringShortest sets the optional shortest attribute to value.
+//
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["shortest"] = value
 	}
 }
 
-// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
+// AsStringWidth sets the optional width attribute to value.
+//
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["width"] = value
+	}
+}
+
+// AsStringFill sets the optional fill attribute to value.
+//
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
 // If not specified, defaults to ""
-func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
+func AsStringFill(value string) AsStringAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["fill"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
+// Converts each entry in the given tensor to strings.  Supports many numeric
+//
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapSize",
-
+		Type: "AsString",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNV2Attr is an optional argument to CudnnRNNV2.
-type CudnnRNNV2Attr func(optionalAttr)
+// Deprecated. Use TensorArrayScatterV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
+func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayScatterV2",
+		Input: []tf.Input{
+			handle, indices, value, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// CudnnRNNV2RnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNV2RnnMode(value string) CudnnRNNV2Attr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
+// Applies sparse addition to `input` using individual values or slices
+//
+// from `updates` according to indices `indices`.  The updates are non-aliasing:
+// `input` is only modified in-place if no other operations will use it.
+// Otherwise, a copy of `input` is made.  This operation has a gradient with
+// respect to both `input` and `updates`.
+//
+// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `input`.
+// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+// (if `K < P`) along the `K`th dimension of `input`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
+//
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+// elements. In Python, that addition would look like this:
+//
+//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(output))
+//
+// The resulting value `output` would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to slices.
+//
+// Arguments:
+//	input: A Tensor.
+//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+// A tensor of indices into `input`.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
+// to add to `input`.
+//
+// Returns A `Tensor` with the same shape as `input`, containing values of `input`
+// updated with `updates`.
+func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ScatterNdNonAliasingAdd",
+		Input: []tf.Input{
+			input, indices, updates,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNV2InputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNV2InputMode(value string) CudnnRNNV2Attr {
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
+
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+//
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["input_mode"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// CudnnRNNV2Direction sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNV2Direction(value string) CudnnRNNV2Attr {
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["direction"] = value
+		m["overlapping"] = value
 	}
 }
 
-// CudnnRNNV2Dropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNV2Dropout(value float32) CudnnRNNV2Attr {
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["dropout"] = value
+		m["deterministic"] = value
 	}
 }
 
-// CudnnRNNV2Seed sets the optional seed attribute to value.
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-func CudnnRNNV2Seed(value int64) CudnnRNNV2Attr {
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// CudnnRNNV2Seed2 sets the optional seed2 attribute to value.
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
 // If not specified, defaults to 0
-func CudnnRNNV2Seed2(value int64) CudnnRNNV2Attr {
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// CudnnRNNV2IsTraining sets the optional is_training attribute to value.
-// If not specified, defaults to true
-func CudnnRNNV2IsTraining(value bool) CudnnRNNV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// A RNN backed by cuDNN.
+// Performs fractional max pooling on the input.
 //
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer. Produces one extra output "host_reserved" than CudnnRNN.
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicates whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is true.
-// host_reserved: An opaque tensor that can be used in backprop calculation. It is
-//   only produced if is_training is true. It is output on host memory rather than
-//   device memory.
-func CudnnRNNV2(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNV2Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
+//
+// First we define the following:
+//
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNV2",
+		Type: "FractionalMaxPool",
 		Input: []tf.Input{
-			input, input_h, input_c, params,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ShapeNAttr is an optional argument to ShapeN.
-type ShapeNAttr func(optionalAttr)
+// Creates a MultiDeviceIterator resource.
+//
+// Arguments:
+//	devices: A list of devices the iterator works across.
+//	shared_name: If non-empty, this resource will be shared under the given name
+// across multiple sessions.
+//	container: If non-empty, this resource is placed in the given container.
+// Otherwise, a default container is used.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+//
+// Returns Handle to the resource created.
+func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "MultiDeviceIterator",
 
-// ShapeNOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeNOutType(value tf.DataType) ShapeNAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns shape of tensors.
+// Deprecated. Use TensorArraySizeV3
 //
-// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
+func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ShapeN",
+		Type: "TensorArraySizeV2",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			handle, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("ShapeN", err)
-		return
-	}
-	return output
-}
-
-// CudnnRNNParamsToCanonicalAttr is an optional argument to CudnnRNNParamsToCanonical.
-type CudnnRNNParamsToCanonicalAttr func(optionalAttr)
-
-// CudnnRNNParamsToCanonicalRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNParamsToCanonicalRnnMode(value string) CudnnRNNParamsToCanonicalAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNParamsToCanonicalInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNParamsToCanonicalInputMode(value string) CudnnRNNParamsToCanonicalAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
+	return op.Output(0)
 }
 
-// CudnnRNNParamsToCanonicalDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNParamsToCanonicalDirection(value string) CudnnRNNParamsToCanonicalAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
 
-// CudnnRNNParamsToCanonicalDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsToCanonicalDropout(value float32) CudnnRNNParamsToCanonicalAttr {
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
 	return func(m optionalAttr) {
-		m["dropout"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// CudnnRNNParamsToCanonicalSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsToCanonicalSeed(value int64) CudnnRNNParamsToCanonicalAttr {
+// Conv2DDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["data_format"] = value
 	}
 }
 
-// CudnnRNNParamsToCanonicalSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsToCanonicalSeed2(value int64) CudnnRNNParamsToCanonicalAttr {
+// Conv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DDilations(value []int64) Conv2DAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["dilations"] = value
 	}
 }
 
-// Retrieves CudnnRNN params in canonical form.
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
+//
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
 //
-// Retrieves a set of weights from the opaque params buffer that can be saved and
-// restored in a way compatible with future runs.
+// In detail, with the default NHWC format,
 //
-// Note that the params buffer may not be compatible across different GPUs. So any
-// save and restoration should be converted to and from the canonical weights and
-// biases.
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
 //
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// num_params: number of parameter sets for all layers.
-//     Each layer may contain multiple parameter sets, with each set consisting of
-//     a weight matrix and a bias vector.
-// weights: the canonical form of weights that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// biases: the canonical form of biases that can be used for saving
-//     and restoration. They are more likely to be compatible across different
-//     generations.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     The actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//     dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, params tf.Output, num_params int64, optional ...CudnnRNNParamsToCanonicalAttr) (weights []tf.Output, biases []tf.Output) {
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+// `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
+//
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_params": num_params}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsToCanonical",
+		Type: "Conv2D",
 		Input: []tf.Input{
-			num_layers, num_units, input_size, params,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if weights, idx, err = makeOutputList(op, idx, "weights"); err != nil {
-		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
-		return
-	}
-	if biases, idx, err = makeOutputList(op, idx, "biases"); err != nil {
-		scope.UpdateErr("CudnnRNNParamsToCanonical", err)
-		return
-	}
-	return weights, biases
+	return op.Output(0)
 }
 
-// CTCLossAttr is an optional argument to CTCLoss.
-type CTCLossAttr func(optionalAttr)
+// StageAttr is an optional argument to Stage.
+type StageAttr func(optionalAttr)
 
-// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
+// StageCapacity sets the optional capacity attribute to value.
 //
-// value: Scalar, if true then repeated labels are
-// collapsed prior to the CTC calculation.
-// If not specified, defaults to false
-func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageCapacity(value int64) StageAttr {
 	return func(m optionalAttr) {
-		m["preprocess_collapse_repeated"] = value
+		m["capacity"] = value
 	}
 }
 
-// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
+// StageMemoryLimit sets the optional memory_limit attribute to value.
 //
-// value: Scalar.  If set to false, *during* CTC calculation
-// repeated non-blank labels will not be merged and are interpreted as
-// individual labels.  This is a simplified version of CTC.
-// If not specified, defaults to true
-func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
+// value: The maximum number of bytes allowed for Tensors in the Staging Area.
+// If > 0, inserts will block until sufficient space is available.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageMemoryLimit(value int64) StageAttr {
 	return func(m optionalAttr) {
-		m["ctc_merge_repeated"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+// StageContainer sets the optional container attribute to value.
 //
-// value: Scalar. If set to true, during CTC
-// calculation, items that have longer output sequences than input sequences
-// are skipped: they don't contribute to the loss term and have zero-gradient.
-// If not specified, defaults to false
-func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func StageContainer(value string) StageAttr {
 	return func(m optionalAttr) {
-		m["ignore_longer_outputs_than_inputs"] = value
+		m["container"] = value
 	}
 }
 
-// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
+// StageSharedName sets the optional shared_name attribute to value.
 //
-// the gradient.  This class performs the softmax operation for you, so inputs
-// should be e.g. linear projections of outputs by an LSTM.
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func StageSharedName(value string) StageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage values similar to a lightweight Enqueue.
+//
+// The basic functionality of this Op is similar to a queue with many
+// fewer capabilities and options.  This Op is optimized for performance.
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
-// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
-// `(batch b, time t)`.
-//	labels_values: The values (labels) associated with the given batch and time.
-//	sequence_length: A vector containing sequence lengths (batch).
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
 //
-// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
-// `(max_time x batch_size x num_classes)`.
-func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
+// Returns the created operation.
+func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -33069,91 +33590,83 @@ func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CTCLoss",
+		Type: "Stage",
 		Input: []tf.Input{
-			inputs, labels_indices, labels_values, sequence_length,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
-type CTCGreedyDecoderAttr func(optionalAttr)
+// StagePeekAttr is an optional argument to StagePeek.
+type StagePeekAttr func(optionalAttr)
 
-// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+// StagePeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If True, merge repeated classes in output.
-// If not specified, defaults to false
-func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
+// REQUIRES: value >= 0
+func StagePeekCapacity(value int64) StagePeekAttr {
 	return func(m optionalAttr) {
-		m["merge_repeated"] = value
+		m["capacity"] = value
 	}
 }
 
-// Performs greedy decoding on the logits given in inputs.
-//
-// A note about the attribute merge_repeated: if enabled, when
-// consecutive logits' maximum indices are the same, only the first of
-// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
-// becomes "A B B" if merge_repeated = True and "A B B B B" if
-// merge_repeated = False.
-//
-// Regardless of the value of merge_repeated, if the maximum index of a given
-// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
-// element is emitted.
+// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
+// REQUIRES: value >= 0
+func StagePeekMemoryLimit(value int64) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StagePeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StagePeekContainer(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StagePeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StagePeekSharedName(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified index.  If the
 //
-// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
-// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
-// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
-// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
-// log-probabilities.
-func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
+// underlying container does not contain sufficient elements
+// this op will block until it does.   This Op is optimized for
+// performance.
+func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CTCGreedyDecoder",
+		Type: "StagePeek",
 		Input: []tf.Input{
-			inputs, sequence_length,
+			index,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
-}
-
-// Forwards `data` to the output port determined by `pred`.
-//
-// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-// the data goes to `output_false`.
-//
-// See also `RefSwitch` and `Merge`.
-//
-// Arguments:
-//	data: The tensor to be forwarded to the appropriate output.
-//	pred: A scalar that specifies which output port will receive data.
-//
-// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
-func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Switch",
-		Input: []tf.Input{
-			data, pred,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("StagePeek", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return values
 }
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 9dce78b9a367cdf5243dfab621cc6fc77d732ee5..10808e162ee4cc679430c0573e5bff8322ad6fff 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -381,6 +381,7 @@ tf_cc_binary(
     linkshared = 1,
     linkstatic = 1,
     deps = [
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/java/src/main/native",
         LINKER_VERSION_SCRIPT,
         LINKER_EXPORTED_SYMBOLS,
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 7ef862ae79245efb8249c0960114973ce5516a6f..951e8bdd0dd8aae46a361a8ffcff276579433641 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -1,12 +1,13 @@
 # TensorFlow for Java
 
 > *WARNING*: The TensorFlow Java API is not currently covered by the TensorFlow
-> [API stability guarantees](https://www.tensorflow.org/guide/version_semantics).
+> [API stability guarantees](https://www.tensorflow.org/guide/version_compat).
 >
 > For using TensorFlow on Android refer instead to
 > [contrib/android](https://www.tensorflow.org/code/tensorflow/contrib/android),
 > [makefile](https://www.tensorflow.org/code/tensorflow/contrib/makefile#android)
-> and/or the [Android demo](https://www.tensorflow.org/code/tensorflow/examples/android).
+> and/or the
+> [Android demo](https://www.tensorflow.org/code/tensorflow/examples/android).
 
 ## Quickstart
 
@@ -14,6 +15,19 @@
 -   [Javadoc](https://www.tensorflow.org/api_docs/java/reference/org/tensorflow/package-summary)
 -   [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.tensorflow/tensorflow)
 
+## Nightly builds
+
+Releases built from release branches are available on Maven Central.
+Additionally, every day binaries are built from the `master` branch on GitHub:
+
+- [JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow.jar)
+- [Sourc JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-src.jar)
+- JNI:
+  - [Linux CPU-only](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-linux-x86_64.tar.gz)
+  - [Linux GPU](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-gpu-linux-x86_64.tar.gz)
+  - [MacOS](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-darwin-x86_64.tar.gz)
+  - Windows: (No nightly builds available yet)
+
 ## Building from source
 
 If the quickstart instructions above do not work out, the TensorFlow Java and
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 6b3e305e5d015f0820f837cd54027e6e3946781d..db3a3609f1ac4fda18ff5a1248e61c675a8bf9f9 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index f1305159344f12248461cafbe000b75197c5c06d..53f7a2d63ef5bc8cfe4fbe372cf2fd3f58a0fe33 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 67ecc2d597d18995fa37fc4a9e55794bf062f111..a17724c805e38239c61dd27a5cc9ec918bbb2e0f 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 8ba859da011708069d67f7d9a64533038dfd558e..30831f90b9f7b4beb5ae3f2ceebadcb6e1f8771e 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.11.0</version>
+  <version>1.12.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index dcd654d713aa41867dda993ad0110f172a3b213c..dd6b52be62487ba6cb989b4917a15df7f473a848 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
index 45214f834cb597ba19911e44ce589d63fd9f5b2e..f47c11809d58464953028c388d491b91f67c3510 100644
--- a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
+++ b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
@@ -6,7 +6,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>spark-tensorflow-connector_2.11</artifactId>
     <packaging>jar</packaging>
-    <version>1.11.0</version>
+    <version>1.12.0</version>
     <name>spark-tensorflow-connector</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord connector for Apache Spark DataFrames</description>
@@ -33,7 +33,7 @@
         <scala.test.version>2.2.6</scala.test.version>
         <maven.compiler.version>3.0</maven.compiler.version>
         <java.version>1.8</java.version>
-        <spark.version>2.3.0</spark.version>
+        <spark.version>2.3.1</spark.version>
         <yarn.api.version>2.7.3</yarn.api.version>
         <junit.version>4.11</junit.version>
     </properties>
@@ -213,6 +213,20 @@
         </plugins>
     </build>
 
+    <repositories>
+        <repository>
+            <id>apache.snapshots</id>
+            <name>Apache Development Snapshot Repository</name>
+            <url>https://repository.apache.org/content/repositories/snapshots/</url>
+            <releases>
+                <enabled>false</enabled>
+            </releases>
+            <snapshots>
+                <enabled>true</enabled>
+            </snapshots>
+        </repository>
+    </repositories>
+
     <profiles>
         <profile>
             <id>test</id>
diff --git a/tensorflow/java/maven/tensorflow-hadoop/pom.xml b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
index a8669ee72b1b73ad45d552cb968fcee3bc3e3bfb..11aaba983f6ded9a6e757703fd9a2411db82ceb6 100644
--- a/tensorflow/java/maven/tensorflow-hadoop/pom.xml
+++ b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
@@ -5,7 +5,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>tensorflow-hadoop</artifactId>
     <packaging>jar</packaging>
-    <version>1.11.0</version>
+    <version>1.12.0</version>
     <name>tensorflow-hadoop</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 67d628ba111408e2d5673068b310235ee89f2b47..07fcfa5144600f7d9cbf6edbfbecbecc7c115631 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.11.0</version>
+    <version>1.12.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Input.java b/tensorflow/java/src/main/java/org/tensorflow/Input.java
deleted file mode 100644
index 13bc463e7d6a991858332a353681b24fff417547..0000000000000000000000000000000000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/Input.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow;
-
-/**
- * Interface implemented by operands of a TensorFlow operation.
- *
- * <p>Example usage:
- *
- * <pre>{@code
- * // The "decodeJpeg" operation can be used as input to the "cast" operation
- * Input decodeJpeg = ops.image().decodeJpeg(...);
- * ops.math().cast(decodeJpeg, DataType.FLOAT);
- *
- * // The output "y" of the "unique" operation can be used as input to the "cast" operation
- * Output y = ops.array().unique(...).y();
- * ops.math().cast(y, DataType.FLOAT);
- *
- * // The "split" operation can be used as input list to the "concat" operation
- * Iterable<? extends Input> split = ops.array().split(...);
- * ops.array().concat(0, split);
- * }</pre>
- */
-public interface Input<T> {
-
-  /**
-   * Returns the symbolic handle of a tensor.
-   *
-   * <p>Inputs to TensorFlow operations are outputs of another TensorFlow operation. This method is
-   * used to obtain a symbolic handle that represents the computation of the input.
-   *
-   * @see OperationBuilder#addInput(Output)
-   */
-  Output<T> asOutput();
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Server.java b/tensorflow/java/src/main/java/org/tensorflow/Server.java
new file mode 100644
index 0000000000000000000000000000000000000000..6adcdba17b3b56ef5b65314e1d225c41c7d63fd3
--- /dev/null
+++ b/tensorflow/java/src/main/java/org/tensorflow/Server.java
@@ -0,0 +1,133 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow;
+
+/**
+ * An in-process TensorFlow server, for use in distributed training.
+ *
+ * <p>A {@code Server} instance encapsulates a set of devices and a {@link org.tensorflow.Session}
+ * target that can participate in distributed training. A server belongs to a cluster (specified by
+ * a {@code ClusterSpec}), and corresponds to a particular task in a named job. The server can
+ * communicate with any other server in the same cluster. The server will not serve any requests
+ * until {@link #start()} is invoked. The server will stop serving requests once {@link #stop()} or
+ * {@link #close()} is invoked. Be aware that {@link #close()} method stops the server if it is
+ * running.
+ *
+ * <p><b>WARNING:</b> A {@code Server} owns resources that <b>must</b> be explicitly freed by
+ * invoking {@link #close()}.
+ *
+ * <p>Instances of a {@code Server} are thread-safe.
+ *
+ * <p>Using example:
+ *
+ * <pre>{@code
+ * import org.tensorflow.Server;
+ * import org.tensorflow.distruntime.ClusterDef;
+ * import org.tensorflow.distruntime.JobDef;
+ * import org.tensorflow.distruntime.ServerDef;
+ *
+ * ClusterDef clusterDef = ClusterDef.newBuilder()
+ *   .addJob(JobDef.newBuilder()
+ *   .setName("worker")
+ *   .putTasks(0, "localhost:4321")
+ *   .build()
+ * ).build();
+ *
+ * ServerDef serverDef = ServerDef.newBuilder()
+ *   .setCluster(clusterDef)
+ *   .setJobName("worker")
+ *   .setTaskIndex(0)
+ *   .setProtocol("grpc")
+ * .build();
+ *
+ * try (Server srv = new Server(serverDef.toByteArray())) {
+ *   srv.start();
+ *   srv.join();
+ * }
+ * }</pre>
+ */
+public final class Server implements AutoCloseable {
+  /**
+   * Constructs a new instance of server.
+   *
+   * @param serverDef Server definition specified as a serialized <a
+   *     href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/tensorflow_server.proto">ServerDef</a>
+   *     protocol buffer.
+   */
+  public Server(byte[] serverDef) {
+    nativeHandle = allocate(serverDef);
+  }
+
+  /** Starts an in-process TensorFlow server. */
+  public synchronized void start() {
+    start(nativeHandle);
+  }
+
+  /** Stops an in-process TensorFlow server. */
+  public synchronized void stop() {
+    stop(nativeHandle);
+  }
+
+  /** Blocks until the server has been successfully stopped. */
+  public void join() {
+    long handle = 0;
+    synchronized (this) {
+      handle = nativeHandle;
+      if (handle != 0) {
+        numJoining++;
+      }
+    }
+    try {
+      join(handle);
+    } finally {
+      synchronized (this) {
+        if (handle != 0) {
+          numJoining--;
+        }
+        notifyAll();
+      }
+    }
+  }
+
+  /** Destroy an in-process TensorFlow server, frees memory. */
+  @Override
+  public synchronized void close() throws InterruptedException {
+    stop();
+    while (numJoining > 0) {
+      wait();
+    }
+    delete(nativeHandle);
+    nativeHandle = 0;
+  }
+
+  private static native long allocate(byte[] serverDef);
+
+  private static native void start(long nativeHandle);
+
+  private static native void stop(long nativeHandle);
+
+  private static native void join(long nativeHandle);
+
+  private static native void delete(long nativeHandle);
+
+  private long nativeHandle;
+
+  private int numJoining;
+
+  static {
+    TensorFlow.init();
+  }
+}
diff --git a/tensorflow/java/src/main/native/server_jni.cc b/tensorflow/java/src/main/native/server_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d92685740458831011c3f6cc1ee8df8a995e9363
--- /dev/null
+++ b/tensorflow/java/src/main/native/server_jni.cc
@@ -0,0 +1,104 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/java/src/main/native/server_jni.h"
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/java/src/main/native/exception_jni.h"
+#include "tensorflow/java/src/main/native/utils_jni.h"
+
+namespace {
+
+TF_Server* requireHandle(JNIEnv* env, jlong handle) {
+  static_assert(sizeof(jlong) >= sizeof(TF_Server*),
+                "Cannot package C object pointers as a Java long");
+  if (handle == 0) {
+    throwException(env, kIllegalStateException,
+                   "close() has been called on the Server");
+    return nullptr;
+  }
+
+  return reinterpret_cast<TF_Server*>(handle);
+}
+
+}  // namespace
+
+JNIEXPORT jlong JNICALL Java_org_tensorflow_Server_allocate(
+    JNIEnv* env, jclass clazz, jbyteArray server_def) {
+  TF_Status* status = TF_NewStatus();
+
+  jbyte* server_def_ptr = env->GetByteArrayElements(server_def, nullptr);
+
+  TF_Server* server = TF_NewServer(
+      server_def_ptr, static_cast<size_t>(env->GetArrayLength(server_def)),
+      status);
+
+  env->ReleaseByteArrayElements(server_def, server_def_ptr, JNI_ABORT);
+  bool ok = throwExceptionIfNotOK(env, status);
+
+  TF_DeleteStatus(status);
+
+  return ok ? reinterpret_cast<jlong>(server) : 0;
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv* env,
+                                                        jclass clazz,
+                                                        jlong handle) {
+  TF_Server* server = requireHandle(env, handle);
+  if (server == nullptr) return;
+
+  TF_Status* status = TF_NewStatus();
+
+  TF_ServerStart(server, status);
+  throwExceptionIfNotOK(env, status);
+
+  TF_DeleteStatus(status);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv* env,
+                                                       jclass clazz,
+                                                       jlong handle) {
+  TF_Server* server = requireHandle(env, handle);
+  if (server == nullptr) return;
+
+  TF_Status* status = TF_NewStatus();
+
+  TF_ServerStop(server, status);
+  throwExceptionIfNotOK(env, status);
+
+  TF_DeleteStatus(status);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv* env,
+                                                       jclass clazz,
+                                                       jlong handle) {
+  TF_Server* server = requireHandle(env, handle);
+  if (server == nullptr) return;
+
+  TF_Status* status = TF_NewStatus();
+
+  TF_ServerJoin(server, status);
+  throwExceptionIfNotOK(env, status);
+
+  TF_DeleteStatus(status);
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv* env,
+                                                         jclass clazz,
+                                                         jlong handle) {
+  TF_Server* server = requireHandle(env, handle);
+  if (server == nullptr) return;
+
+  TF_DeleteServer(server);
+}
diff --git a/tensorflow/java/src/main/native/server_jni.h b/tensorflow/java/src/main/native/server_jni.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bfe90b7a85b1a21f91ffe136c8bbf717da59d78
--- /dev/null
+++ b/tensorflow/java/src/main/native/server_jni.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SERVER_JNI_H_
+#define TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SERVER_JNI_H_
+
+#include <jni.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Class:     org_tensorflow_Server
+ * Method:    allocate
+ * Signature: ([B)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_Server_allocate(JNIEnv *, jclass, jbyteArray server_def);
+
+/*
+ * Class:     org_tensorflow_Server
+ * Method:    start
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_start(JNIEnv *, jclass,
+                                                        jlong);
+
+/*
+ * Class:     org_tensorflow_Server
+ * Method:    stop
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_stop(JNIEnv *, jclass, jlong);
+
+/*
+ * Class:     org_tensorflow_Session
+ * Method:    join
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_join(JNIEnv *, jclass, jlong);
+
+/*
+ * Class:     org_tensorflow_Session
+ * Method:    delete
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_Server_delete(JNIEnv *, jclass,
+                                                         jlong);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_JAVA_SRC_MAIN_NATIVE_SERVER_JNI_H_
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
index 1bd00a763ddff2f067183f57cfa80fdcbed84fd2..3229cce2776dd305a67d5936c37db5b1d9626402 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
@@ -21,6 +21,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.nio.Buffer;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.DoubleBuffer;
@@ -100,7 +101,7 @@ public class TensorTest {
                     : ByteOrder.LITTLE_ENDIAN)
             .asDoubleBuffer()
             .put(doubles);
-    buf.flip();
+    flipBuffer(buf);
     try (Tensor<Double> t = Tensor.create(new long[] {doubles.length}, buf)) {
       double[] actual = new double[doubles.length];
       assertArrayEquals(doubles, t.copyTo(actual), EPSILON);
@@ -179,30 +180,30 @@ public class TensorTest {
       {
         ByteBuffer bbuf = ByteBuffer.allocate(1024).order(ByteOrder.nativeOrder());
 
-        bbuf.clear(); // FLOAT
+        clearBuffer(bbuf); // FLOAT
         tfloats.writeTo(bbuf);
         assertEquals(tfloats.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(floats[0], bbuf.asFloatBuffer().get(0), EPSILON);
-        bbuf.clear(); // DOUBLE
+        clearBuffer(bbuf); // DOUBLE
         tdoubles.writeTo(bbuf);
         assertEquals(tdoubles.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(doubles[0], bbuf.asDoubleBuffer().get(0), EPSILON);
-        bbuf.clear(); // INT32
+        clearBuffer(bbuf); // INT32
         tints.writeTo(bbuf);
         assertEquals(tints.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(ints[0], bbuf.asIntBuffer().get(0));
-        bbuf.clear(); // INT64
+        clearBuffer(bbuf); // INT64
         tlongs.writeTo(bbuf);
         assertEquals(tlongs.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(longs[0], bbuf.asLongBuffer().get(0));
-        bbuf.clear(); // BOOL
+        clearBuffer(bbuf); // BOOL
         tbools.writeTo(bbuf);
         assertEquals(tbools.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(bools[0], bbuf.get(0) != 0);
       }
 
@@ -254,7 +255,7 @@ public class TensorTest {
                         : ByteOrder.LITTLE_ENDIAN)
                 .asDoubleBuffer();
         tdoubles.writeTo(foreignBuf);
-        foreignBuf.flip();
+        flipBuffer(foreignBuf);
         double[] actual = new double[foreignBuf.remaining()];
         foreignBuf.get(actual);
         assertArrayEquals(doubles, actual, EPSILON);
@@ -547,4 +548,25 @@ public class TensorTest {
       // expected.
     }
   }
+
+  // Workaround for cross compiliation
+  // (e.g., javac -source 1.9 -target 1.8).
+  //
+  // In Java 8 and prior, subclasses of java.nio.Buffer (e.g., java.nio.DoubleBuffer) inherited the
+  // "flip()" and "clear()" methods from java.nio.Buffer resulting in the signature:
+  //   Buffer flip();
+  // In Java 9 these subclasses had their own methods like:
+  //   DoubleBuffer flip();
+  // As a result, compiling for 1.9 source for a target of JDK 1.8 would result in errors at runtime
+  // like:
+  //
+  // java.lang.NoSuchMethodError: java.nio.DoubleBuffer.flip()Ljava/nio/DoubleBuffer
+  private static void flipBuffer(Buffer buf) {
+    buf.flip();
+  }
+
+  // See comment for flipBuffer()
+  private static void clearBuffer(Buffer buf) {
+    buf.clear();
+  }
 }
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8fca01624cfa2c21cd428e63ed1eadf7b853f107
--- /dev/null
+++ b/tensorflow/lite/BUILD
@@ -0,0 +1,355 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
+
+exports_files(glob([
+    "testdata/*.bin",
+    "testdata/*.pb",
+    "models/testdata/*",
+]))
+
+config_setting(
+    name = "mips",
+    values = {
+        "cpu": "mips",
+    },
+)
+
+config_setting(
+    name = "mips64",
+    values = {
+        "cpu": "mips64",
+    },
+)
+
+# Enables inclusion of select TensorFlow kernels via the TF Lite Flex delegate.
+# WARNING: This build flag is experimental and subject to change.
+config_setting(
+    name = "with_select_tf_ops",
+    define_values = {"with_select_tf_ops": "true"},
+    visibility = ["//visibility:public"],
+)
+
+TFLITE_DEFAULT_COPTS = if_not_windows([
+    "-Wall",
+    "-Wno-comment",
+])
+
+cc_library(
+    name = "schema_fbs_version",
+    hdrs = ["version.h"],
+    copts = TFLITE_DEFAULT_COPTS,
+)
+
+cc_library(
+    name = "arena_planner",
+    srcs = ["arena_planner.cc"],
+    hdrs = ["arena_planner.h"],
+    copts = TFLITE_DEFAULT_COPTS,
+    deps = [
+        ":graph_info",
+        ":memory_planner",
+        ":simple_memory_arena",
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_test(
+    name = "arena_planner_test",
+    size = "small",
+    srcs = ["arena_planner_test.cc"],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":arena_planner",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Main library. No ops are included here.
+# TODO(aselle): Resolve problems preventing C99 usage.
+cc_library(
+    name = "context",
+    hdrs = ["context.h"],
+    copts = TFLITE_DEFAULT_COPTS,
+    deps = ["//tensorflow/lite/c:c_api_internal"],
+)
+
+cc_library(
+    name = "graph_info",
+    hdrs = ["graph_info.h"],
+    copts = TFLITE_DEFAULT_COPTS,
+    deps = ["//tensorflow/lite/c:c_api_internal"],
+)
+
+cc_library(
+    name = "memory_planner",
+    hdrs = ["memory_planner.h"],
+    copts = TFLITE_DEFAULT_COPTS,
+    deps = ["//tensorflow/lite/c:c_api_internal"],
+)
+
+cc_library(
+    name = "simple_memory_arena",
+    srcs = ["simple_memory_arena.cc"],
+    hdrs = ["simple_memory_arena.h"],
+    copts = TFLITE_DEFAULT_COPTS,
+    deps = ["//tensorflow/lite/c:c_api_internal"],
+)
+
+cc_library(
+    name = "builtin_op_data",
+    hdrs = [
+        "builtin_op_data.h",
+    ],
+    deps = ["//tensorflow/lite/c:c_api_internal"],
+)
+
+cc_library(
+    name = "kernel_api",
+    hdrs = [
+        "builtin_op_data.h",
+        "builtin_ops.h",
+        "context_util.h",
+    ],
+    deps = ["//tensorflow/lite/c:c_api_internal"],
+)
+
+exports_files(["builtin_ops.h"])
+
+cc_library(
+    name = "string",
+    hdrs = [
+        "string.h",
+    ],
+    copts = TFLITE_DEFAULT_COPTS,
+)
+
+# TODO(ahentz): investigate dependency on gemm_support requiring usage of tf_copts.
+cc_library(
+    name = "framework",
+    srcs = [
+        "allocation.cc",
+        "core/subgraph.cc",
+        "graph_info.cc",
+        "interpreter.cc",
+        "model.cc",
+        "mutable_op_resolver.cc",
+        "optional_debug_tools.cc",
+        "stderr_reporter.cc",
+    ] + select({
+        "//tensorflow:android": [
+            "nnapi_delegate.cc",
+            "mmap_allocation.cc",
+        ],
+        "//tensorflow:windows": [
+            "nnapi_delegate_disabled.cc",
+            "mmap_allocation_disabled.cc",
+        ],
+        "//conditions:default": [
+            "nnapi_delegate_disabled.cc",
+            "mmap_allocation.cc",
+        ],
+    }),
+    hdrs = [
+        "allocation.h",
+        "context.h",
+        "context_util.h",
+        "core/subgraph.h",
+        "error_reporter.h",
+        "graph_info.h",
+        "interpreter.h",
+        "model.h",
+        "mutable_op_resolver.h",
+        "nnapi_delegate.h",
+        "op_resolver.h",
+        "optional_debug_tools.h",
+        "stderr_reporter.h",
+    ],
+    copts = tflite_copts() + TFLITE_DEFAULT_COPTS,
+    linkopts = [
+    ] + select({
+        "//tensorflow:android": [
+            "-llog",
+        ],
+        "//conditions:default": [
+        ],
+    }),
+    deps = [
+        ":arena_planner",
+        ":graph_info",
+        ":memory_planner",
+        ":schema_fbs_version",
+        ":simple_memory_arena",
+        ":string",
+        ":util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/core/api:api",
+        "//tensorflow/lite/kernels:eigen_support",
+        "//tensorflow/lite/kernels:gemm_support",
+        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/profiling:profiler",
+        "//tensorflow/lite/schema:schema_fbs",
+    ] + select({
+        ":with_select_tf_ops": [
+            "//tensorflow/lite/delegates/flex:delegate",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "string_util",
+    srcs = ["string_util.cc"],
+    hdrs = ["string_util.h"],
+    copts = TFLITE_DEFAULT_COPTS,
+    deps = [
+        ":string",
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_test(
+    name = "string_util_test",
+    size = "small",
+    srcs = ["string_util_test.cc"],
+    deps = [
+        ":framework",
+        ":string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test main interpreter
+cc_test(
+    name = "interpreter_test",
+    size = "small",
+    srcs = ["interpreter_test.cc"],
+    deps = [
+        ":framework",
+        ":string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test graph utils
+cc_test(
+    name = "graph_info_test",
+    size = "small",
+    srcs = ["graph_info_test.cc"],
+    deps = [
+        ":framework",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test arena allocator
+cc_test(
+    name = "simple_memory_arena_test",
+    size = "small",
+    srcs = ["simple_memory_arena_test.cc"],
+    deps = [
+        ":simple_memory_arena",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test model framework.
+cc_test(
+    name = "model_test",
+    size = "small",
+    srcs = ["model_test.cc"],
+    data = [
+        "testdata/0_subgraphs.bin",
+        "testdata/2_subgraphs.bin",
+        "testdata/empty_model.bin",
+        "testdata/multi_add_flex.bin",
+        "testdata/test_model.bin",
+        "testdata/test_model_broken.bin",
+    ],
+    deps = [
+        ":framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test model framework with the flex library linked into the target.
+tf_cc_test(
+    name = "model_flex_test",
+    size = "small",
+    srcs = ["model_flex_test.cc"],
+    data = [
+        "testdata/multi_add_flex.bin",
+    ],
+    tags = [
+        "no_gpu",  # GPU + flex is not officially supported.
+        "no_windows",  # TODO(b/116667551): No weak symbols with MSVC.
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":framework",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/delegates/flex:delegate",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test OpResolver.
+cc_test(
+    name = "mutable_op_resolver_test",
+    size = "small",
+    srcs = ["mutable_op_resolver_test.cc"],
+    deps = [
+        ":framework",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "util",
+    srcs = ["util.cc"],
+    hdrs = ["util.h"],
+    copts = TFLITE_DEFAULT_COPTS + tflite_copts(),
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_test(
+    name = "util_test",
+    size = "small",
+    srcs = ["util_test.cc"],
+    deps = [
+        ":util",
+        "//tensorflow/lite/c:c_api_internal",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/README.md b/tensorflow/lite/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..589d4f93481e501485de3bf3f0c21129299d8471
--- /dev/null
+++ b/tensorflow/lite/README.md
@@ -0,0 +1,8 @@
+# TensorFlow Lite
+
+TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded
+devices. It enables low-latency inference of on-device machine learning models
+with a small binary size and fast performance supporting hardware acceleration.
+
+See the documentation: https://www.tensorflow.org/lite/
+Documentation edits can be made here: [tensorflow/lite/g3doc](./g3doc/)
diff --git a/tensorflow/contrib/lite/allocation.cc b/tensorflow/lite/allocation.cc
similarity index 94%
rename from tensorflow/contrib/lite/allocation.cc
rename to tensorflow/lite/allocation.cc
index 21cb1832a7af49fd8441d3b6104b46489bef9237..f9a34322f0cbf51780b6cebbc4a94219ef47087e 100644
--- a/tensorflow/contrib/lite/allocation.cc
+++ b/tensorflow/lite/allocation.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/lite/allocation.h"
 
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -23,8 +23,8 @@ limitations under the License.
 #include <cstring>
 #include <utility>
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/allocation.h b/tensorflow/lite/allocation.h
similarity index 88%
rename from tensorflow/contrib/lite/allocation.h
rename to tensorflow/lite/allocation.h
index 182bc0977f62f17baa6b24106a19447ebb4b4805..f25d7fa232a7407b8f4d95b084472e136080c815 100644
--- a/tensorflow/contrib/lite/allocation.h
+++ b/tensorflow/lite/allocation.h
@@ -14,16 +14,16 @@ limitations under the License.
 ==============================================================================*/
 // Main abstraction controlling the tflite interpreter.
 // See context.h for the API for defining operations (TfLiteRegistration).
-#ifndef TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
-#define TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+#ifndef TENSORFLOW_LITE_ALLOCATION_H_
+#define TENSORFLOW_LITE_ALLOCATION_H_
 
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/simple_memory_arena.h"
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/simple_memory_arena.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 
@@ -94,4 +94,4 @@ class MemoryAllocation : public Allocation {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+#endif  // TENSORFLOW_LITE_ALLOCATION_H_
diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/lite/arena_planner.cc
similarity index 99%
rename from tensorflow/contrib/lite/arena_planner.cc
rename to tensorflow/lite/arena_planner.cc
index 02442575b3aeed04ac6569440dd52a4d5ddd4d98..8200b6adaa1c6eed64ca8963c7d0d422e573ffb8 100644
--- a/tensorflow/contrib/lite/arena_planner.cc
+++ b/tensorflow/lite/arena_planner.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/arena_planner.h"
+#include "tensorflow/lite/arena_planner.h"
 #include <utility>
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/lite/arena_planner.h
similarity index 92%
rename from tensorflow/contrib/lite/arena_planner.h
rename to tensorflow/lite/arena_planner.h
index 382577045b6d54c65e08028ac8cdc27c2bb1265a..beaadaf4eff7582be1c78ca4fa3ea620a93f1a2d 100644
--- a/tensorflow/contrib/lite/arena_planner.h
+++ b/tensorflow/lite/arena_planner.h
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
-#define TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
+#ifndef TENSORFLOW_LITE_ARENA_PLANNER_H_
+#define TENSORFLOW_LITE_ARENA_PLANNER_H_
 
 #include <memory>
 #include <vector>
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/graph_info.h"
-#include "tensorflow/contrib/lite/memory_planner.h"
-#include "tensorflow/contrib/lite/simple_memory_arena.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/graph_info.h"
+#include "tensorflow/lite/memory_planner.h"
+#include "tensorflow/lite/simple_memory_arena.h"
 
 namespace tflite {
 
@@ -124,4 +124,4 @@ class ArenaPlanner : public MemoryPlanner {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_ARENA_PLANNER_H_
+#endif  // TENSORFLOW_LITE_ARENA_PLANNER_H_
diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/lite/arena_planner_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/arena_planner_test.cc
rename to tensorflow/lite/arena_planner_test.cc
index 7d7c41289cad95b73423a7218bf1e0516b2e87a2..479f25cafef5c47eed3226717eae2af7918549c6 100644
--- a/tensorflow/contrib/lite/arena_planner_test.cc
+++ b/tensorflow/lite/arena_planner_test.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/arena_planner.h"
+#include "tensorflow/lite/arena_planner.h"
 
 #include <cstdarg>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/testing/util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
similarity index 76%
rename from tensorflow/contrib/lite/build_def.bzl
rename to tensorflow/lite/build_def.bzl
index 7ef26de69f2699e3d9f55a15737b96a3505cf6eb..c17eddf47bc86c9537364117c302df38e390c8da 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -29,8 +29,11 @@ def tflite_copts():
         ],
         str(Label("//tensorflow:windows")): [
             "/DTF_COMPILE_LIBRARY",
+            "/wd4018",  # -Wno-sign-compare
+        ],
+        "//conditions:default": [
+            "-Wno-sign-compare",
         ],
-        "//conditions:default": [],
     }) + select({
         str(Label("//tensorflow:with_default_optimizations")): [],
         "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
@@ -38,7 +41,7 @@ def tflite_copts():
 
     return copts
 
-LINKER_SCRIPT = "//tensorflow/contrib/lite/java/src/main/native:version_script.lds"
+LINKER_SCRIPT = "//tensorflow/lite/java/src/main/native:version_script.lds"
 
 def tflite_linkopts_unstripped():
     """Defines linker flags to reduce size of TFLite binary.
@@ -108,7 +111,9 @@ def tflite_jni_binary(
         linkscript = LINKER_SCRIPT,
         linkshared = 1,
         linkstatic = 1,
-        deps = []):
+        testonly = 0,
+        deps = [],
+        srcs = []):
     """Builds a jni binary for TFLite."""
     linkopts = linkopts + [
         "-Wl,--version-script",  # Export only jni functions & classes.
@@ -120,7 +125,9 @@ def tflite_jni_binary(
         linkshared = linkshared,
         linkstatic = linkstatic,
         deps = deps + [linkscript],
+        srcs = srcs,
         linkopts = linkopts,
+        testonly = testonly,
     )
 
 def tflite_cc_shared_object(
@@ -150,7 +157,7 @@ def tf_to_tflite(name, src, options, out):
     """
 
     toco_cmdline = " ".join([
-        "//tensorflow/contrib/lite/toco:toco",
+        "//tensorflow/lite/toco:toco",
         "--input_format=TENSORFLOW_GRAPHDEF",
         "--output_format=TFLITE",
         ("--input_file=$(location %s)" % src),
@@ -161,7 +168,7 @@ def tf_to_tflite(name, src, options, out):
         srcs = [src],
         outs = [out],
         cmd = toco_cmdline,
-        tools = ["//tensorflow/contrib/lite/toco:toco"],
+        tools = ["//tensorflow/lite/toco:toco"],
     )
 
 def tflite_to_json(name, src, out):
@@ -174,7 +181,7 @@ def tflite_to_json(name, src, out):
     """
 
     flatc = "@flatbuffers//:flatc"
-    schema = "//tensorflow/contrib/lite/schema:schema.fbs"
+    schema = "//tensorflow/lite/schema:schema.fbs"
     native.genrule(
         name = name,
         srcs = [schema, src],
@@ -197,7 +204,7 @@ def json_to_tflite(name, src, out):
     """
 
     flatc = "@flatbuffers//:flatc"
-    schema = "//tensorflow/contrib/lite/schema:schema_fbs"
+    schema = "//tensorflow/lite/schema:schema_fbs"
     native.genrule(
         name = name,
         srcs = [schema, src],
@@ -212,9 +219,11 @@ def json_to_tflite(name, src, out):
 
 # This is the master list of generated examples that will be made into tests. A
 # function called make_XXX_tests() must also appear in generate_examples.py.
-# Disable a test by commenting it out. If you do, add a link to a bug or issue.
+# Disable a test by adding it to the blacklists specified in
+# generated_test_models_failing().
 def generated_test_models():
     return [
+        "abs",
         "add",
         "arg_min_max",
         "avg_pool",
@@ -230,17 +239,21 @@ def generated_test_models():
         "equal",
         "exp",
         "expand_dims",
+        "fill",
         "floor",
         "floor_div",
+        "floor_mod",
         "fully_connected",
         "fused_batch_norm",
         "gather",
+        "gather_with_constant",
         "global_batch_norm",
         "greater",
         "greater_equal",
         "sum",
         "l2norm",
         "l2_pool",
+        "leaky_relu",
         "less",
         "less_equal",
         "local_response_norm",
@@ -254,6 +267,7 @@ def generated_test_models():
         "maximum",
         "mean",
         "minimum",
+        "mirror_pad",
         "mul",
         "neg",
         "not_equal",
@@ -261,8 +275,10 @@ def generated_test_models():
         "pack",
         "pad",
         "padv2",
+        "placeholder_with_default",
         "prelu",
         "pow",
+        "range",
         "reduce_any",
         "reduce_max",
         "reduce_min",
@@ -282,21 +298,36 @@ def generated_test_models():
         "space_to_depth",
         "sparse_to_dense",
         "split",
+        "splitv",
         "sqrt",
         "square",
+        "squared_difference",
         "squeeze",
         "strided_slice",
         "strided_slice_1d_exhaustive",
+        "strided_slice_buggy",
         "sub",
         "tile",
         "topk",
         "transpose",
-        #"transpose_conv",   # disabled due to b/111213074
+        "transpose_conv",
         "unpack",
+        "unroll_batch_matmul",
         "where",
         "zeros_like",
     ]
 
+# List of models that fail generated tests for the conversion mode.
+# If you have to disable a test, please add here with a link to the appropriate
+# bug or issue.
+def generated_test_models_failing(conversion_mode):
+    if conversion_mode == "toco-flex":
+        return [
+            "lstm",  # TODO(b/117510976): Restore when lstm flex conversion works.
+        ]
+
+    return []
+
 def generated_test_conversion_modes():
     """Returns a list of conversion modes."""
 
@@ -307,16 +338,28 @@ def generated_test_models_all():
     """Generates a list of all tests with the different converters.
 
     Returns:
-      List of tuples representing (conversion mode, name of test).
+      List of tuples representing:
+            (conversion mode, name of test, test tags, test args).
     """
     conversion_modes = generated_test_conversion_modes()
     tests = generated_test_models()
     options = []
     for conversion_mode in conversion_modes:
+        failing_tests = generated_test_models_failing(conversion_mode)
         for test in tests:
+            tags = []
+            args = []
+            if test in failing_tests:
+                tags.append("notap")
+                tags.append("manual")
             if conversion_mode:
                 test += "_%s" % conversion_mode
-            options.append((conversion_mode, test))
+
+            # Flex conversion shouldn't suffer from the same conversion bugs
+            # listed for the default TFLite kernel backend.
+            if conversion_mode == "toco-flex":
+                args.append("--ignore_known_bugs=false")
+            options.append((conversion_mode, test, tags, args))
     return options
 
 def gen_zip_test(name, test_name, conversion_mode, **kwargs):
@@ -329,16 +372,13 @@ def gen_zip_test(name, test_name, conversion_mode, **kwargs):
         list above.
       **kwargs: tf_cc_test kwargs
     """
-    toco = "//tensorflow/contrib/lite/toco:toco"
+    toco = "//tensorflow/lite/toco:toco"
     flags = ""
     if conversion_mode:
         # TODO(nupurgarg): Comment in when pb2lite is in open source. b/113614050.
         # if conversion_mode == "pb2lite":
-        #     toco = "//tensorflow/contrib/lite/experimental/pb2lite:pb2lite"
+        #     toco = "//tensorflow/lite/experimental/pb2lite:pb2lite"
         flags = "--ignore_toco_errors --run_with_flex"
-        kwargs["tags"].append("skip_already_failing")
-        kwargs["tags"].append("no_oss")
-        kwargs["tags"].append("notap")
 
     gen_zipped_test_file(
         name = "zip_%s" % test_name,
@@ -381,8 +421,8 @@ def gen_selected_ops(name, model):
       model: TFLite model to interpret.
     """
     out = name + "_registration.cc"
-    tool = "//tensorflow/contrib/lite/tools:generate_op_registrations"
-    tflite_path = "//tensorflow/contrib/lite"
+    tool = "//tensorflow/lite/tools:generate_op_registrations"
+    tflite_path = "//tensorflow/lite"
     native.genrule(
         name = name,
         srcs = [model],
@@ -392,40 +432,48 @@ def gen_selected_ops(name, model):
         tools = [tool],
     )
 
-def gen_full_model_test(conversion_modes, models, data, test_suite_tag):
+def flex_dep(target_op_sets):
+    if "SELECT_TF_OPS" in target_op_sets:
+        return ["//tensorflow/lite/delegates/flex:delegate"]
+    else:
+        return []
+
+def gen_model_coverage_test(model_name, data, failure_type, tags):
     """Generates Python test targets for testing TFLite models.
 
     Args:
-      conversion_modes: List of conversion modes to test the models on.
-      models: List of models to test.
+      model_name: Name of the model to test (must be also listed in the 'data'
+        dependencies)
       data: List of BUILD targets linking the data.
-      test_suite_tag: Tag identifying the model test suite.
+      failure_type: List of failure types (none, toco, crash, inference)
+        expected for the corresponding combinations of op sets
+        ("TFLITE_BUILTINS", "TFLITE_BUILTINS,SELECT_TF_OPS", "SELECT_TF_OPS").
+      tags: List of strings of additional tags.
     """
-    options = [
-        (conversion_mode, model)
-        for model in models
-        for conversion_mode in conversion_modes
-    ]
-
-    for conversion_mode, model_name in options:
+    i = 0
+    for target_op_sets in ["TFLITE_BUILTINS", "TFLITE_BUILTINS,SELECT_TF_OPS", "SELECT_TF_OPS"]:
+        args = []
+        if failure_type[i] != "none":
+            args.append("--failure_type=%s" % failure_type[i])
+        i = i + 1
         native.py_test(
-            name = "model_coverage_test_%s_%s" % (model_name, conversion_mode.lower()),
+            name = "model_coverage_test_%s_%s" % (model_name, target_op_sets.lower().replace(",", "_")),
             srcs = ["model_coverage_test.py"],
+            size = "large",
             main = "model_coverage_test.py",
             args = [
                 "--model_name=%s" % model_name,
-                "--converter_mode=%s" % conversion_mode,
-            ],
+                "--target_ops=%s" % target_op_sets,
+            ] + args,
             data = data,
             srcs_version = "PY2AND3",
             tags = [
                 "no_oss",
                 "no_windows",
-                "notap",
-            ] + [test_suite_tag],
+            ] + tags,
             deps = [
-                "//tensorflow/contrib/lite/testing:model_coverage_lib",
-                "//tensorflow/contrib/lite/python:lite",
+                "//tensorflow/lite/testing/model_coverage:model_coverage_lib",
+                "//tensorflow/lite/python:lite",
                 "//tensorflow/python:client_testlib",
-            ],
+            ] + flex_dep(target_op_sets),
         )
diff --git a/tensorflow/lite/builtin_op_data.h b/tensorflow/lite/builtin_op_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9d4284513de944ccbae16cb9ad57bca05103438
--- /dev/null
+++ b/tensorflow/lite/builtin_op_data.h
@@ -0,0 +1,22 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Compatibility shim for new location of interface definitions.
+
+#ifndef TENSORFLOW_LITE_BUILTIN_OP_DATA_H_
+#define TENSORFLOW_LITE_BUILTIN_OP_DATA_H_
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+
+#endif  // TENSORFLOW_LITE_BUILTIN_OP_DATA_H_
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
similarity index 91%
rename from tensorflow/contrib/lite/builtin_ops.h
rename to tensorflow/lite/builtin_ops.h
index 7809d114e2f72991be98bfa760f1f240864b5aa6..f97d3ac4bf0b27cdd9b1f5ab7258a12036c29179 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
-#define TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
+#ifndef TENSORFLOW_LITE_BUILTIN_OPS_H_
+#define TENSORFLOW_LITE_BUILTIN_OPS_H_
 
 // DO NOT EDIT MANUALLY: This file is automatically generated by
 // `schema/builtin_ops_header/generator.cc`.
@@ -120,9 +120,17 @@ typedef enum {
   kTfLiteBuiltinSquare = 92,
   kTfLiteBuiltinZerosLike = 93,
   kTfLiteBuiltinFill = 94,
+  kTfLiteBuiltinFloorMod = 95,
+  kTfLiteBuiltinRange = 96,
+  kTfLiteBuiltinResizeNearestNeighbor = 97,
+  kTfLiteBuiltinLeakyRelu = 98,
+  kTfLiteBuiltinSquaredDifference = 99,
+  kTfLiteBuiltinMirrorPad = 100,
+  kTfLiteBuiltinAbs = 101,
+  kTfLiteBuiltinSplitV = 102,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
+#endif  // TENSORFLOW_LITE_BUILTIN_OPS_H_
diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..91c04a5f1fb5bb1a15bd1da074a1276a3d8e7793
--- /dev/null
+++ b/tensorflow/lite/c/BUILD
@@ -0,0 +1,40 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "c_api_internal",
+    srcs = ["c_api_internal.c"],
+    hdrs = [
+        "builtin_op_data.h",
+        "c_api_internal.h",
+    ],
+    visibility = [
+        "//tensorflow/contrib/lite:__subpackages__",
+        "//tensorflow/lite:__subpackages__",
+    ],
+)
+
+# Test the C extension API code.
+cc_test(
+    name = "c_api_internal_test",
+    size = "small",
+    srcs = ["c_api_internal_test.cc"],
+    deps = [
+        ":c_api_internal",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "builtin_op_data_test",
+    size = "small",
+    srcs = ["builtin_op_data_test.cc"],
+    copts = ["-Wno-unused-variable"],
+    deps = [
+        ":c_api_internal",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a5a027a9dc94bb2a11081276d269a7007c86cad
--- /dev/null
+++ b/tensorflow/lite/c/builtin_op_data.h
@@ -0,0 +1,353 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_C_BUILTIN_OP_DATA_H_
+#define TENSORFLOW_LITE_C_BUILTIN_OP_DATA_H_
+
+#include <stdint.h>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// TODO(aselle): Consider using "if this then that" for testing.
+
+// IMPORTANT: All new members of structs must be added at the end to ensure
+// backwards compatibility.
+
+// Possible padding types (for convolutions)
+typedef enum {
+  kTfLitePaddingUnknown = 0,
+  kTfLitePaddingSame,
+  kTfLitePaddingValid,
+} TfLitePadding;
+
+typedef enum {
+  kTfLiteMirrorPaddingUnknown = 0,
+  kTfLiteMirrorPaddingReflect,
+  kTfLiteMirrorPaddingSymmetric,
+} TfLiteMirrorPaddingMode;
+
+typedef struct {
+  int width;
+  int height;
+} TfLitePaddingValues;
+
+typedef struct {
+  TfLiteMirrorPaddingMode mode;
+} TfLiteMirrorPaddingParams;
+
+// Possible fused activation functions.
+// TODO(aselle): rename to TfLiteActivation
+typedef enum {
+  kTfLiteActNone = 0,
+  kTfLiteActRelu,
+  kTfLiteActRelu1,
+  kTfLiteActRelu6,
+  kTfLiteActTanh,
+  kTfLiteActSignBit,
+  kTfLiteActSigmoid,
+} TfLiteFusedActivation;
+
+typedef struct {
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+  int dilation_width_factor;
+  int dilation_height_factor;
+  TfLiteFusedActivation activation;
+} TfLiteConvParams;
+
+typedef struct {
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+  int filter_width;
+  int filter_height;
+  TfLiteFusedActivation activation;
+  struct {
+    TfLitePaddingValues padding;
+  } computed;
+} TfLitePoolParams;
+
+typedef struct {
+  // Parameters for DepthwiseConv version 1 or above.
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+  int depth_multiplier;
+  TfLiteFusedActivation activation;
+  // Parameters for DepthwiseConv version 2 or above.
+  int dilation_width_factor;
+  int dilation_height_factor;
+} TfLiteDepthwiseConvParams;
+
+typedef struct {
+  int rank;
+  TfLiteFusedActivation activation;
+} TfLiteSVDFParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteRNNParams;
+
+typedef struct {
+  bool time_major;
+  TfLiteFusedActivation activation;
+} TfLiteSequenceRNNParams;
+
+typedef struct {
+  bool time_major;
+  TfLiteFusedActivation activation;
+  bool merge_outputs;
+} TfLiteBidirectionalSequenceRNNParams;
+
+typedef enum {
+  kTfLiteFullyConnectedWeightsFormatDefault = 0,
+  kTfLiteFullyConnectedWeightsFormatShuffled4x16Int8 = 1,
+} TfLiteFullyConnectedWeightsFormat;
+
+typedef struct {
+  // Parameters for FullyConnected version 1 or above.
+  TfLiteFusedActivation activation;
+
+  // Parameters for FullyConnected version 2 or above.
+  TfLiteFullyConnectedWeightsFormat weights_format;
+} TfLiteFullyConnectedParams;
+
+typedef enum {
+  kTfLiteLshProjectionUnknown = 0,
+  kTfLiteLshProjectionSparse = 1,
+  kTfLiteLshProjectionDense = 2,
+} TfLiteLSHProjectionType;
+
+typedef struct {
+  TfLiteLSHProjectionType type;
+} TfLiteLSHProjectionParams;
+
+typedef struct {
+  float beta;
+} TfLiteSoftmaxParams;
+
+typedef struct {
+  int axis;
+  TfLiteFusedActivation activation;
+} TfLiteConcatenationParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteAddParams;
+
+typedef struct {
+} TfLiteSpaceToBatchNDParams;
+
+typedef struct {
+} TfLiteBatchToSpaceNDParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteMulParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteSubParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteDivParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteL2NormParams;
+
+typedef struct {
+  int radius;
+  float bias;
+  float alpha;
+  float beta;
+} TfLiteLocalResponseNormParams;
+
+typedef enum {
+  kTfLiteLSTMFullKernel = 0,
+  kTfLiteLSTMBasicKernel
+} TfLiteLSTMKernelType;
+
+typedef struct {
+  // Parameters for LSTM version 1.
+  TfLiteFusedActivation activation;
+  float cell_clip;
+  float proj_clip;
+
+  // Parameters for LSTM version 2.
+  // kTfLiteLSTMBasicKernel is only supported in version 2 or above.
+  TfLiteLSTMKernelType kernel_type;
+} TfLiteLSTMParams;
+
+typedef struct {
+  // Parameters needed for the underlying LSTM.
+  TfLiteFusedActivation activation;
+  float cell_clip;
+  float proj_clip;
+
+  // If set to true then the first dimension is time, otherwise batch.
+  bool time_major;
+} TfLiteUnidirectionalSequenceLSTMParams;
+
+typedef struct {
+  // Parameters for the LSTM kernel.
+  TfLiteFusedActivation activation;
+  float cell_clip;
+  float proj_clip;
+
+  // If true, store the outputs of both directions in the first output.
+  bool merge_outputs;
+} TfLiteBidirectionalSequenceLSTMParams;
+
+typedef struct {
+  bool align_corners;
+} TfLiteResizeBilinearParams;
+
+typedef struct {
+  bool align_corners;
+} TfLiteResizeNearestNeighborParams;
+
+typedef struct {
+} TfLitePadParams;
+
+typedef struct {
+} TfLitePadV2Params;
+
+typedef struct {
+  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
+  // For now we will fix the maximum possible number of dimensions.
+  int shape[8];
+  int num_dimensions;
+} TfLiteReshapeParams;
+
+typedef struct {
+  int ngram_size;
+  int max_skip_size;
+  bool include_all_ngrams;
+} TfLiteSkipGramParams;
+
+typedef struct {
+  int block_size;
+} TfLiteSpaceToDepthParams;
+
+typedef struct {
+  TfLiteType in_data_type;
+  TfLiteType out_data_type;
+} TfLiteCastParams;
+
+typedef enum {
+  kTfLiteCombinerTypeSum = 0,
+  kTfLiteCombinerTypeMean = 1,
+  kTfLiteCombinerTypeSqrtn = 2,
+} TfLiteCombinerType;
+
+typedef struct {
+  TfLiteCombinerType combiner;
+} TfLiteEmbeddingLookupSparseParams;
+
+typedef struct {
+  int axis;
+} TfLiteGatherParams;
+
+typedef struct {
+} TfLiteTransposeParams;
+
+typedef struct {
+  bool keep_dims;
+} TfLiteReducerParams;
+
+typedef struct {
+  int num_splits;
+} TfLiteSplitParams;
+
+typedef struct {
+  int num_splits;
+} TfLiteSplitVParams;
+
+typedef struct {
+  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
+  // For now we will fix the maximum possible number of dimensions.
+  int squeeze_dims[8];
+  int num_squeeze_dims;
+} TfLiteSqueezeParams;
+
+typedef struct {
+  int begin_mask;
+  int end_mask;
+  int ellipsis_mask;
+  int new_axis_mask;
+  int shrink_axis_mask;
+} TfLiteStridedSliceParams;
+
+typedef struct {
+  TfLiteType output_type;
+} TfLiteArgMaxParams;
+
+typedef struct {
+  TfLiteType output_type;
+} TfLiteArgMinParams;
+
+typedef struct {
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+} TfLiteTransposeConvParams;
+
+typedef struct {
+  bool validate_indices;
+} TfLiteSparseToDenseParams;
+
+typedef struct {
+  TfLiteType out_type;
+} TfLiteShapeParams;
+
+typedef struct {
+  // Parameters supported by version 1:
+  float min;
+  float max;
+  int num_bits;
+
+  // Parameters supported by version 2:
+  bool narrow_range;
+} TfLiteFakeQuantParams;
+
+typedef struct {
+  int values_count;
+  int axis;
+} TfLitePackParams;
+
+typedef struct {
+  int axis;
+} TfLiteOneHotParams;
+
+typedef struct {
+  int num;
+  int axis;
+} TfLiteUnpackParams;
+
+typedef struct {
+  float alpha;
+} TfLiteLeakyReluParams;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // TENSORFLOW_LITE_C_BUILTIN_OP_DATA_H_
diff --git a/tensorflow/contrib/lite/c/builtin_op_data_test.cc b/tensorflow/lite/c/builtin_op_data_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/c/builtin_op_data_test.cc
rename to tensorflow/lite/c/builtin_op_data_test.cc
index ba458b4252c53ebc91adcd0afbd16f783037dd42..4ce7c481e1c26e6fcfdaa680e9ca666b82968d53 100644
--- a/tensorflow/contrib/lite/c/builtin_op_data_test.cc
+++ b/tensorflow/lite/c/builtin_op_data_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
 #include <gtest/gtest.h>
 
 namespace tflite {
@@ -63,6 +63,7 @@ TEST(IntArray, CanCompileStructs) {
   TfLiteTransposeParams transpose_params;
   TfLiteReducerParams reducer_params;
   TfLiteSplitParams split_params;
+  TfLiteSplitVParams split_v_params;
   TfLiteSqueezeParams squeeze_params;
   TfLiteStridedSliceParams strided_slice_params;
   TfLiteArgMaxParams arg_max_params;
diff --git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c
new file mode 100644
index 0000000000000000000000000000000000000000..2923dbad4ef285c497ca2c84d86168954fe8ec99
--- /dev/null
+++ b/tensorflow/lite/c/c_api_internal.c
@@ -0,0 +1,152 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#ifndef TF_LITE_STATIC_MEMORY
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif  // TF_LITE_STATIC_MEMORY
+
+int TfLiteIntArrayGetSizeInBytes(int size) {
+  static TfLiteIntArray dummy;
+  return sizeof(dummy) + sizeof(dummy.data[0]) * size;
+}
+
+int TfLiteIntArrayEqual(TfLiteIntArray* a, TfLiteIntArray* b) {
+  if (a == b) return 1;
+  if (a == NULL || b == NULL) return 0;
+  return TfLiteIntArrayEqualsArray(a, b->size, b->data);
+}
+
+int TfLiteIntArrayEqualsArray(TfLiteIntArray* a, int b_size, int b_data[]) {
+  if (a == NULL) return (b_size == 0);
+  if (a->size != b_size) return 0;
+  int i = 0;
+  for (; i < a->size; i++)
+    if (a->data[i] != b_data[i]) return 0;
+  return 1;
+}
+
+#ifndef TF_LITE_STATIC_MEMORY
+
+TfLiteIntArray* TfLiteIntArrayCreate(int size) {
+  TfLiteIntArray* ret =
+      (TfLiteIntArray*)malloc(TfLiteIntArrayGetSizeInBytes(size));
+  ret->size = size;
+  return ret;
+}
+
+void TfLiteIntArrayPrint(const char* s, TfLiteIntArray* a) {
+  printf("%s: length=%d [", s, a->size);
+  if (a->size) printf("%d", a->data[0]);
+  int i = 1;
+  for (; i < a->size; i++) {
+    printf(" %d", a->data[i]);
+  }
+  printf("]\n");
+}
+
+TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src) {
+  if (!src) return NULL;
+  TfLiteIntArray* ret = TfLiteIntArrayCreate(src->size);
+  if (ret) {
+    memcpy(ret->data, src->data, src->size * sizeof(int));
+  }
+  return ret;
+}
+
+void TfLiteIntArrayFree(TfLiteIntArray* a) { free(a); }
+
+void TfLiteTensorDataFree(TfLiteTensor* t) {
+  if (t->allocation_type == kTfLiteDynamic && t->data.raw) {
+    free(t->data.raw);
+  }
+  t->data.raw = NULL;
+}
+
+void TfLiteTensorFree(TfLiteTensor* t) {
+  TfLiteTensorDataFree(t);
+  if (t->dims) TfLiteIntArrayFree(t->dims);
+  t->dims = NULL;
+}
+
+void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
+                       TfLiteQuantizationParams quantization, char* buffer,
+                       size_t size, TfLiteAllocationType allocation_type,
+                       const void* allocation, bool is_variable,
+                       TfLiteTensor* tensor) {
+  TfLiteTensorFree(tensor);
+  tensor->type = type;
+  tensor->name = name;
+  tensor->dims = dims;
+  tensor->params = quantization;
+  tensor->data.raw = buffer;
+  tensor->bytes = size;
+  tensor->allocation_type = allocation_type;
+  tensor->allocation = allocation;
+  tensor->is_variable = is_variable;
+}
+
+void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) {
+  if (tensor->allocation_type != kTfLiteDynamic) {
+    return;
+  }
+  if (!tensor->data.raw) {
+    tensor->data.raw = malloc(num_bytes);
+  } else if (num_bytes > tensor->bytes) {
+    tensor->data.raw = realloc(tensor->data.raw, num_bytes);
+  }
+  tensor->bytes = num_bytes;
+}
+#endif  // TF_LITE_STATIC_MEMORY
+
+const char* TfLiteTypeGetName(TfLiteType type) {
+  switch (type) {
+    case kTfLiteNoType:
+      return "NOTYPE";
+    case kTfLiteFloat32:
+      return "FLOAT32";
+    case kTfLiteInt16:
+      return "INT16";
+    case kTfLiteInt32:
+      return "INT32";
+    case kTfLiteUInt8:
+      return "UINT8";
+    case kTfLiteInt8:
+      return "INT8";
+    case kTfLiteInt64:
+      return "INT64";
+    case kTfLiteBool:
+      return "BOOL";
+    case kTfLiteComplex64:
+      return "COMPLEX64";
+    case kTfLiteString:
+      return "STRING";
+  }
+  return "Unknown type";
+}
+
+TfLiteDelegate TfLiteDelegateCreate() {
+  TfLiteDelegate d = {
+      .data_ = NULL,
+      .Prepare = NULL,
+      .CopyFromBufferHandle = NULL,
+      .CopyToBufferHandle = NULL,
+      .FreeBufferHandle = NULL,
+      .flags = kTfLiteDelegateFlagsNone,
+  };
+  return d;
+}
diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..1cd84eff5c436abb781c74d1ac287b709558133f
--- /dev/null
+++ b/tensorflow/lite/c/c_api_internal.h
@@ -0,0 +1,538 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This file defines a C API for implementing operations in tflite.
+// These operations can be defined using c++ but the interface between
+// the interpreter and the operations are C.
+//
+// Summary of abstractions
+// TF_LITE_ENSURE - Self-sufficient error checking
+// TfLiteStatus - Status reporting
+// TfLiteIntArray - stores tensor shapes (dims),
+// TfLiteContext - allows an op to access the tensors
+// TfLiteTensor - tensor (a multidimensional array)
+// TfLiteNode - a single node or operation
+// TfLiteRegistration - the implementation of a conceptual operation.
+//
+// Some abstractions in this file are created and managed by Interpreter.
+#ifndef TENSORFLOW_LITE_C_C_API_INTERNAL_H_
+#define TENSORFLOW_LITE_C_C_API_INTERNAL_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef enum { kTfLiteOk = 0, kTfLiteError = 1 } TfLiteStatus;
+
+// The list of external context types known to TF Lite. This list exists solely
+// to avoid conflicts and to ensure ops can share the external contexts they
+// need. Access to the external contexts is controled by one of the
+// corresponding support files.
+typedef enum {
+  kTfLiteEigenContext = 0,     // include eigen_support.h to use.
+  kTfLiteGemmLowpContext = 1,  // include gemm_support.h to use.
+  kTfLiteEdgeTpuContext = 2,   // Placeholder for Edge TPU support.
+  kTfLiteMaxExternalContexts = 3
+} TfLiteExternalContextType;
+
+// An external context is a collection of information unrelated to the TF Lite
+// framework, but useful to a subset of the ops. TF Lite knows very little
+// about about the actual contexts, but it keeps a list of them, and is able to
+// refresh them if configurations like the number of recommended threads
+// change.
+typedef struct {
+  TfLiteExternalContextType type;
+  TfLiteStatus (*Refresh)(struct TfLiteContext* context);
+} TfLiteExternalContext;
+
+// Forward declare so GetNode can use this is in Context.
+typedef struct _TfLiteRegistration TfLiteRegistration;
+typedef struct _TfLiteDelegate TfLiteDelegate;
+
+#define kOptionalTensor (-1)
+
+// Fixed size list of integers. Used for dimensions and inputs/outputs tensor
+// indices
+typedef struct {
+  int size;
+// gcc 6.1+ have a bug where flexible members aren't properly handled
+// https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
+    __GNUC_MINOR__ >= 1
+  int data[0];
+#else
+  int data[];
+#endif
+} TfLiteIntArray;
+
+// Given the size (number of elements) in a TfLiteIntArray, calculate its size
+// in bytes.
+int TfLiteIntArrayGetSizeInBytes(int size);
+
+// Create a array of a given `size` (uninitialized entries).
+// This returns a pointer, that you must free using TfLiteIntArrayFree().
+TfLiteIntArray* TfLiteIntArrayCreate(int size);
+
+// Check if two intarrays are equal. Returns 1 if they are equal, 0 otherwise.
+int TfLiteIntArrayEqual(TfLiteIntArray* a, TfLiteIntArray* b);
+
+// Check if an intarray equals an array. Returns 1 if equals, 0 otherwise.
+int TfLiteIntArrayEqualsArray(TfLiteIntArray* a, int b_size, int b_data[]);
+
+// Create a copy of an array passed as `src`.
+// You are expected to free memory with TfLiteIntArrayFree
+TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src);
+
+// Free memory of array `v`.
+void TfLiteIntArrayFree(TfLiteIntArray* v);
+
+// Since we must not depend on any libraries, define a minimal subset of
+// error macros while avoiding names that have pre-conceived meanings like
+// assert and check.
+
+// Check whether value is true, and if not return kTfLiteError from
+// the current function (and report the error string msg).
+#define TF_LITE_ENSURE_MSG(context, value, msg)            \
+  do {                                                     \
+    if (!(value)) {                                        \
+      (context)->ReportError((context), __FILE__ " " msg); \
+      return kTfLiteError;                                 \
+    }                                                      \
+  } while (0)
+
+// Check whether the value `a` is true, and if not return kTfLiteError from
+// the current function, while also reporting the location of the error.
+#define TF_LITE_ENSURE(context, a)                                          \
+  do {                                                                      \
+    if (!(a)) {                                                             \
+      (context)->ReportError((context), "%s:%d %s was not true.", __FILE__, \
+                             __LINE__, #a);                                 \
+      return kTfLiteError;                                                  \
+    }                                                                       \
+  } while (0)
+
+#define TF_LITE_ENSURE_STATUS(a) \
+  do {                           \
+    if ((a) != kTfLiteOk) {      \
+      return kTfLiteError;       \
+    }                            \
+  } while (0)
+
+// Check whether the value `a == b` is true, and if not return kTfLiteError from
+// the current function, while also reporting the location of the error.
+// `a` and `b` may be evaluated more than once, so no side effects or
+// extremely expensive computations should be done.
+#define TF_LITE_ENSURE_EQ(context, a, b)                                       \
+  do {                                                                         \
+    if ((a) != (b)) {                                                          \
+      (context)->ReportError((context), "%s:%d %s != %s (%d != %d)", __FILE__, \
+                             __LINE__, #a, #b, (a), (b));                      \
+      return kTfLiteError;                                                     \
+    }                                                                          \
+  } while (0)
+
+#define TF_LITE_ENSURE_TYPES_EQ(context, a, b)                                 \
+  do {                                                                         \
+    if ((a) != (b)) {                                                          \
+      (context)->ReportError((context), "%s:%d %s != %s (%s != %s)", __FILE__, \
+                             __LINE__, #a, #b, TfLiteTypeGetName(a),           \
+                             TfLiteTypeGetName(b));                            \
+      return kTfLiteError;                                                     \
+    }                                                                          \
+  } while (0)
+
+#define TF_LITE_ENSURE_OK(context, status) \
+  do {                                     \
+    if ((status) != kTfLiteOk) {           \
+      return kTfLiteError;                 \
+    }                                      \
+  } while (0)
+
+// Single-precision complex data type compatible with the C99 definition.
+typedef struct {
+  float re, im;  // real and imaginary parts, respectively.
+} TfLiteComplex64;
+
+// Types supported by tensor
+typedef enum {
+  kTfLiteNoType = 0,
+  kTfLiteFloat32 = 1,
+  kTfLiteInt32 = 2,
+  kTfLiteUInt8 = 3,
+  kTfLiteInt64 = 4,
+  kTfLiteString = 5,
+  kTfLiteBool = 6,
+  kTfLiteInt16 = 7,
+  kTfLiteComplex64 = 8,
+  kTfLiteInt8 = 9,
+} TfLiteType;
+
+// Return the name of a given type, for error reporting purposes.
+const char* TfLiteTypeGetName(TfLiteType type);
+
+// Parameters for asymmetric quantization. Quantized values can be converted
+// back to float using:
+//    real_value = scale * (quantized_value - zero_point);
+typedef struct {
+  float scale;
+  int32_t zero_point;
+} TfLiteQuantizationParams;
+
+// A union of pointers that points to memory for a given tensor.
+typedef union {
+  int* i32;
+  int64_t* i64;
+  float* f;
+  char* raw;
+  const char* raw_const;
+  uint8_t* uint8;
+  bool* b;
+  int16_t* i16;
+  TfLiteComplex64* c64;
+  int8_t* int8;
+} TfLitePtrUnion;
+
+// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
+// data (or data externally allocated). kTfLiteArenaRw is arena allocated
+// data. kTfLiteDynamic is for tensors that are allocated during evaluation.
+typedef enum {
+  kTfLiteMemNone = 0,
+  kTfLiteMmapRo,
+  kTfLiteArenaRw,
+  kTfLiteArenaRwPersistent,
+  kTfLiteDynamic,
+} TfLiteAllocationType;
+
+// The delegates should use zero or positive integers to represent handles.
+// -1 is reserved from unallocated status.
+typedef int TfLiteBufferHandle;
+const TfLiteBufferHandle kTfLiteNullBufferHandle = -1;
+
+// An tensor in the interpreter system which is a wrapper around a buffer of
+// data including a dimensionality (or NULL if not currently defined).
+typedef struct {
+  // The data type specification for data stored in `data`. This affects
+  // what member of `data` union should be used.
+  TfLiteType type;
+  // A union of data pointers. The appropriate type should be used for a typed
+  // tensor based on `type`.
+  TfLitePtrUnion data;
+  // A pointer to a structure representing the dimensionality interpretation
+  // that the buffer should have. NOTE: the product of elements of `dims`
+  // and the element datatype size should be equal to `bytes` below.
+  TfLiteIntArray* dims;
+  // Quantization information.
+  TfLiteQuantizationParams params;
+  // How memory is mapped
+  //  kTfLiteMmapRo: Memory mapped read only.
+  //  i.e. weights
+  //  kTfLiteArenaRw: Arena allocated read write memory
+  //  (i.e. temporaries, outputs).
+  TfLiteAllocationType allocation_type;
+  // The number of bytes required to store the data of this Tensor. I.e.
+  // (bytes of each element) * dims[0] * ... * dims[n-1].  For example, if
+  // type is kTfLiteFloat32 and dims = {3, 2} then
+  // bytes = sizeof(float) * 3 * 2 = 4 * 3 * 2 = 24.
+  size_t bytes;
+
+  // An opaque pointer to a tflite::MMapAllocation
+  const void* allocation;
+
+  // Null-terminated name of this tensor.
+  const char* name;
+
+  // The delegate which knows how to handle `buffer_handle`.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteDelegate* delegate;
+
+  // An integer buffer handle that can be handled by `delegate`.
+  // The value is valid only when delegate is not null.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteBufferHandle buffer_handle;
+
+  // If the delegate uses its own buffer (e.g. GPU memory), the delegate is
+  // responsible to set data_is_stale to true.
+  // `delegate->CopyFromBufferHandle` can be called to copy the data from
+  // delegate buffer.
+  // WARNING: This is an // experimental interface that is subject to change.
+  bool data_is_stale;
+
+  // True if the tensor is a variable.
+  bool is_variable;
+} TfLiteTensor;
+
+// Free data memory of tensor `t`;
+void TfLiteTensorDataFree(TfLiteTensor* t);
+
+// Free memory of tensor `t`;
+void TfLiteTensorFree(TfLiteTensor* t);
+
+// Set all of a tensor's fields (and free any previously allocated data).
+void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
+                       TfLiteQuantizationParams quantization, char* buffer,
+                       size_t size, TfLiteAllocationType allocation_type,
+                       const void* allocation, bool is_variable,
+                       TfLiteTensor* tensor);
+
+// Resize the allocated data of a (dynamic) tensor. Tensors with allocation
+// types other than kTfLiteDynamic will be ignored.
+void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor);
+
+// A structure representing an instance of a node.
+// This structure only exhibits the inputs, outputs and user defined data, not
+// other features like the type.
+typedef struct {
+  // Inputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* inputs;
+
+  // Outputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* outputs;
+
+  // Temporary tensors uses during the computations. This usually contains no
+  // tensors, but ops are allowed to change that if they need scratch space of
+  // any sort.
+  TfLiteIntArray* temporaries;
+
+  // Opaque data provided by the node implementer through `Registration.init`.
+  void* user_data;
+
+  // Opaque data provided to the node if the node is a builtin. This is usually
+  // a structure defined in builtin_op_data.h
+  void* builtin_data;
+
+  // Custom initial data. This is the opaque data provided in the flatbuffer.
+  // WARNING: This is an experimental interface that is subject to change.
+  const void* custom_initial_data;
+  int custom_initial_data_size;
+
+  // The pointer to the delegate. This is non-null only when the node is
+  // created by calling `interpreter.ModifyGraphWithDelegate`.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteDelegate* delegate;
+} TfLiteNode;
+
+typedef struct TfLiteContext {
+  // Number of tensors in the context.
+  size_t tensors_size;
+
+  // The execution plan contains a list of the node indices in execution
+  // order. execution_plan->size is the current number of nodes. And,
+  // execution_plan->data[0] is the first node that needs to be run.
+  // TfLiteDelegates can traverse the current execution plan by iterating
+  // through each member of this array and using GetNodeAndRegistration() to
+  // access details about a node. i.e.
+  // TfLiteIntArray* execution_plan;
+  // TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &execution_plan));
+  // for (int exec_index = 0; exec_index < execution_plan->size; exec_index++) {
+  //    int node_index = execution_plan->data[exec_index];
+  //    TfLiteNode* node;
+  //    TfLiteRegistration* reg;
+  //    context->GetNodeAndRegistration(context, node_index, &node, &reg);
+  // }
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*GetExecutionPlan)(struct TfLiteContext* context,
+                                   TfLiteIntArray** execution_plan);
+
+  // An array of tensors in the interpreter context (of length `tensors_size`)
+  TfLiteTensor* tensors;
+
+  // opaque full context ptr (an opaque c++ data structure)
+  void* impl_;
+
+  // Request memory pointer be resized. Updates dimensions on the tensor.
+  // NOTE: ResizeTensor takes ownership of newSize.
+  TfLiteStatus (*ResizeTensor)(struct TfLiteContext*, TfLiteTensor* tensor,
+                               TfLiteIntArray* new_size);
+  // Request that a error be reported with format string msg.
+  void (*ReportError)(struct TfLiteContext*, const char* msg, ...);
+
+  // Add `tensors_to_add` tensors, preserving pre-existing Tensor entries.  If
+  // non-null, the value pointed to by `first_new_tensor_index` will be set to
+  // the index of the first new tensor.
+  TfLiteStatus (*AddTensors)(struct TfLiteContext*, int tensors_to_add,
+                             int* first_new_tensor_index);
+
+  // Get a Tensor node by node_index.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus (*GetNodeAndRegistration)(struct TfLiteContext*, int node_index,
+                                         TfLiteNode** node,
+                                         TfLiteRegistration** registration);
+
+  // Replace ops with one or more stub delegate operations. This function
+  // does not take ownership of `nodes_to_replace`.
+  TfLiteStatus (*ReplaceNodeSubsetsWithDelegateKernels)(
+      struct TfLiteContext*, TfLiteRegistration registration,
+      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
+
+  // Number of threads that are recommended to subsystems like gemmlowp and
+  // eigen.
+  int recommended_num_threads;
+
+  // Access external contexts by type.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteExternalContext* (*GetExternalContext)(struct TfLiteContext*,
+                                               TfLiteExternalContextType);
+  // Set the value of a external context. Does not take ownership of the
+  // pointer.
+  // WARNING: This is an experimental interface that is subject to change.
+  void (*SetExternalContext)(struct TfLiteContext*, TfLiteExternalContextType,
+                             TfLiteExternalContext*);
+
+  // Flag for allowing float16 precision for FP32 calculation.
+  // default: false.
+  // WARNING: This is an experimental API and subject to change.
+  bool allow_fp32_relax_to_fp16;
+} TfLiteContext;
+
+typedef struct _TfLiteRegistration {
+  // Initializes the op from serialized data.
+  // If a built-in op:
+  //   `buffer` is the op's params data (TfLiteLSTMParams*).
+  //   `length` is zero.
+  // If custom op:
+  //   `buffer` is the op's `custom_options`.
+  //   `length` is the size of the buffer.
+  //
+  // Returns a type-punned (i.e. void*) opaque data (e.g. a primitive pointer
+  // or an instance of a struct).
+  //
+  // The returned pointer will be stored with the node in the `user_data` field,
+  // accessible within prepare and invoke functions below.
+  // NOTE: if the data is already in the desired format, simply implement this
+  // function to return `nullptr` and implement the free function to be a no-op.
+  void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
+
+  // The pointer `buffer` is the data previously returned by an init invocation.
+  void (*free)(TfLiteContext* context, void* buffer);
+
+  // prepare is called when the inputs this node depends on have been resized.
+  // context->ResizeTensor() can be called to request output tensors to be
+  // resized.
+  //
+  // Returns kTfLiteOk on success.
+  TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node);
+
+  // Execute the node (should read node->inputs and output to node->outputs).
+  // Returns kTfLiteOk on success.
+  TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node);
+
+  // profiling_string is called during summarization of profiling information
+  // in order to group executions together. Providing a value here will cause a
+  // given op to appear multiple times is the profiling report. This is
+  // particularly useful for custom ops that can perform significantly
+  // different calculations depending on their `user-data`.
+  const char* (*profiling_string)(const TfLiteContext* context,
+                                  const TfLiteNode* node);
+
+  // Builtin codes. If this kernel refers to a builtin this is the code
+  // of the builtin. This is so we can do marshaling to other frameworks like
+  // NN API.
+  // Note: It is the responsibility of the registration binder to set this
+  // properly.
+  int32_t builtin_code;
+
+  // Custom op name. If the op is a builtin, this will be null.
+  // Note: It is the responsibility of the registration binder to set this
+  // properly.
+  // WARNING: This is an experimental interface that is subject to change.
+  const char* custom_name;
+
+  // The version of the op.
+  // Note: It is the responsibility of the registration binder to set this
+  // properly.
+  int version;
+} TfLiteRegistration;
+
+// The flags used in `TfLiteDelegate`. Note that this is a bitmask, so the
+// values should be 1, 2, 4, 8, ...etc.
+typedef enum {
+  kTfLiteDelegateFlagsNone = 0,
+  // The flag is set if the delegate can handle dynamic sized tensors.
+  // For example, the output shape of a `Resize` op with non-constant shape
+  // can only be inferred when the op is invoked.
+  // In this case, the Delegate is responsible for calling
+  // `SetTensorToDynamic` to mark the tensor as a dynamic tensor, and calling
+  // `ResizeTensor` when invoking the op.
+  //
+  // If the delegate isn't capable to handle dynamic tensors, this flag need
+  // to be set to false.
+  kTfLiteDelegateFlagsAllowDynamicTensors = 1
+} TfLiteDelegateFlags;
+
+// WARNING: This is an experimental interface that is subject to change.
+typedef struct _TfLiteDelegate {
+  // Data that delegate needs to identify itself. This data is owned by the
+  // delegate. The delegate is owned in the user code, so the delegate is
+  // responsible for doing this when it is destroyed.
+  void* data_;
+
+  // Invoked by ModifyGraphWithDelegate. This prepare is called, giving the
+  // delegate a view of the current graph through TfLiteContext*. It typically
+  // will look at the nodes and call ReplaceNodeSubsetsWithDelegateKernels()
+  // to ask the TensorFlow lite runtime to create macro-nodes to represent
+  // delegated subgraphs of the original graph.
+  TfLiteStatus (*Prepare)(TfLiteContext* context, TfLiteDelegate* delegate);
+
+  // Copy the data from delegate buffer handle into raw memory of the given
+  // 'tensor'. This cannot be null. The delegate is allowed to allocate the raw
+  // bytes as long as it follows the rules for kTfLiteDynamic tensors.
+  TfLiteStatus (*CopyFromBufferHandle)(TfLiteContext* context,
+                                       TfLiteDelegate* delegate,
+                                       TfLiteBufferHandle buffer_handle,
+                                       TfLiteTensor* tensor);
+
+  // Copy the data from raw memory of the given 'tensor' to delegate buffer
+  // handle. This can be null if the delegate doesn't use its own buffer.
+  TfLiteStatus (*CopyToBufferHandle)(TfLiteContext* context,
+                                     TfLiteDelegate* delegate,
+                                     TfLiteBufferHandle buffer_handle,
+                                     TfLiteTensor* tensor);
+
+  // Free the Delegate Buffer Handle. Note: This only frees the handle, but
+  // this doesn't release the underlying resource (e.g. textures). The
+  // resources are either owned by application layer or the delegate.
+  // This can be null if the delegate doesn't use its own buffer.
+  void (*FreeBufferHandle)(TfLiteContext* context, TfLiteDelegate* delegate,
+                           TfLiteBufferHandle* handle);
+
+  // Bitmask flags. See the comments in `TfLiteDelegateFlags`.
+  int64_t flags;
+} TfLiteDelegate;
+
+// Build a 'null' delegate, with all the fields properly set to their default
+// values.
+TfLiteDelegate TfLiteDelegateCreate();
+
+// WARNING: This is an experimental interface that is subject to change.
+//
+// Currently, TfLiteDelegateParams has to be allocated in a way that it's
+// trivially destructable. It will be stored as `builtin_data` field in
+// `TfLiteNode` of the delegate node.
+//
+// See also the `CreateDelegateParams` function in `interpreter.cc` details.
+typedef struct {
+  TfLiteDelegate* delegate;
+  TfLiteIntArray* nodes_to_replace;
+  TfLiteIntArray* input_tensors;
+  TfLiteIntArray* output_tensors;
+} TfLiteDelegateParams;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_LITE_C_C_API_INTERNAL_H_
diff --git a/tensorflow/contrib/lite/c/c_api_internal_test.cc b/tensorflow/lite/c/c_api_internal_test.cc
similarity index 77%
rename from tensorflow/contrib/lite/c/c_api_internal_test.cc
rename to tensorflow/lite/c/c_api_internal_test.cc
index af398f32075b46e2ea487d49448f13435c4b5768..acf0dfc5be8e233b642ccea42f72cbf6af2d4c5d 100644
--- a/tensorflow/contrib/lite/c/c_api_internal_test.cc
+++ b/tensorflow/lite/c/c_api_internal_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 #include <gtest/gtest.h>
 
 namespace tflite {
@@ -65,6 +65,22 @@ TEST(IntArray, TestIntArrayEqual) {
   TfLiteIntArrayFree(d);
 }
 
+TEST(Types, TestTypeNames) {
+  auto type_name = [](TfLiteType t) {
+    return std::string(TfLiteTypeGetName(t));
+  };
+  EXPECT_EQ(type_name(kTfLiteNoType), "NOTYPE");
+  EXPECT_EQ(type_name(kTfLiteFloat32), "FLOAT32");
+  EXPECT_EQ(type_name(kTfLiteInt16), "INT16");
+  EXPECT_EQ(type_name(kTfLiteInt32), "INT32");
+  EXPECT_EQ(type_name(kTfLiteUInt8), "UINT8");
+  EXPECT_EQ(type_name(kTfLiteInt8), "INT8");
+  EXPECT_EQ(type_name(kTfLiteInt64), "INT64");
+  EXPECT_EQ(type_name(kTfLiteBool), "BOOL");
+  EXPECT_EQ(type_name(kTfLiteComplex64), "COMPLEX64");
+  EXPECT_EQ(type_name(kTfLiteString), "STRING");
+}
+
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/lite/context.h
similarity index 79%
rename from tensorflow/contrib/lite/context.h
rename to tensorflow/lite/context.h
index b86c2819b821d7ce3d9da2073998301b9d29adda..3d3c8c08b24e697d39d642af4e0f7451b02d70ae 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/lite/context.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // Compatibility shim for moved header location.
-#ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
-#define TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#ifndef TENSORFLOW_LITE_CONTEXT_H_
+#define TENSORFLOW_LITE_CONTEXT_H_
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
-#endif  // TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#endif  // TENSORFLOW_LITE_CONTEXT_H_
diff --git a/tensorflow/contrib/lite/context_util.h b/tensorflow/lite/context_util.h
similarity index 89%
rename from tensorflow/contrib/lite/context_util.h
rename to tensorflow/lite/context_util.h
index ccda4c7393dd169a67d9e9400cf8ab57dda049e7..68b91ea0b93e602c20d1db3284a523e9f55dfd5b 100644
--- a/tensorflow/contrib/lite/context_util.h
+++ b/tensorflow/lite/context_util.h
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 // This provides a few C++ helpers that are useful for manipulating C structures
 // in C++.
-#ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_CONTEXT_UTIL_H_
+#ifndef TENSORFLOW_LITE_CONTEXT_UTIL_H_
+#define TENSORFLOW_LITE_CONTEXT_UTIL_H_
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 
@@ -45,4 +45,4 @@ class TfLiteIntArrayView {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_CONTEXT_UTIL_H_
+#endif  // TENSORFLOW_LITE_CONTEXT_UTIL_H_
diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6a43b0322d17041a5ae4a0527376d1465a539b1d
--- /dev/null
+++ b/tensorflow/lite/core/api/BUILD
@@ -0,0 +1,57 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+
+cc_library(
+    name = "api",
+    srcs = [
+        "error_reporter.cc",
+        "flatbuffer_conversions.cc",
+        "op_resolver.cc",
+    ],
+    hdrs = [
+        "error_reporter.h",
+        "flatbuffer_conversions.h",
+        "op_resolver.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+cc_test(
+    name = "error_reporter_test",
+    size = "small",
+    srcs = ["error_reporter_test.cc"],
+    deps = [
+        ":api",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "op_resolver_test",
+    size = "small",
+    srcs = ["op_resolver_test.cc"],
+    deps = [
+        ":api",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "flatbuffer_conversions_test",
+    size = "small",
+    srcs = ["flatbuffer_conversions_test.cc"],
+    deps = [
+        ":api",
+        "//tensorflow/lite/c:c_api_internal",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/core/api/error_reporter.cc b/tensorflow/lite/core/api/error_reporter.cc
similarity index 95%
rename from tensorflow/contrib/lite/core/api/error_reporter.cc
rename to tensorflow/lite/core/api/error_reporter.cc
index 423f83b1a9f4c90b3c3e286061ec03262890af03..7070eaa57c589a6f1481517936ca6b795ce9e9e9 100644
--- a/tensorflow/contrib/lite/core/api/error_reporter.cc
+++ b/tensorflow/lite/core/api/error_reporter.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
 #include <cstdarg>
 
 namespace tflite {
diff --git a/tensorflow/lite/core/api/error_reporter.h b/tensorflow/lite/core/api/error_reporter.h
new file mode 100644
index 0000000000000000000000000000000000000000..357722cc45911f435e8678e6eb24effc44e56644
--- /dev/null
+++ b/tensorflow/lite/core/api/error_reporter.h
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_CORE_API_ERROR_REPORTER_H_
+#define TENSORFLOW_LITE_CORE_API_ERROR_REPORTER_H_
+
+#include <cstdarg>
+
+namespace tflite {
+
+// A functor that reports error to supporting system. Invoked similar to
+// printf.
+//
+// Usage:
+//  ErrorReporter foo;
+//  foo.Report("test %d", 5);
+// or
+//  va_list args;
+//  foo.Report("test %d", args); // where args is va_list
+//
+// Subclass ErrorReporter to provide another reporting destination.
+// For example, if you have a GUI program, you might redirect to a buffer
+// that drives a GUI error log box.
+class ErrorReporter {
+ public:
+  virtual ~ErrorReporter() {}
+  virtual int Report(const char* format, va_list args) = 0;
+  int Report(const char* format, ...);
+  int ReportError(void*, const char* format, ...);
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_CORE_API_ERROR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/core/api/error_reporter_test.cc b/tensorflow/lite/core/api/error_reporter_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/core/api/error_reporter_test.cc
rename to tensorflow/lite/core/api/error_reporter_test.cc
index 0463eee6be554ea25469a221b614f87f0331bf55..4e44a6465d1ed9cdf866c96e0d6af52a4ef95910 100644
--- a/tensorflow/contrib/lite/core/api/error_reporter_test.cc
+++ b/tensorflow/lite/core/api/error_reporter_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
 
 #include <cstdio>
 
diff --git a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
similarity index 89%
rename from tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
rename to tensorflow/lite/core/api/flatbuffer_conversions.cc
index eac7db9a88d2adbee9a751957beb5272fbe0e652..c00a0a3a546b1b2b0167663b5f00c5e25e261f15 100644
--- a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/core/api/flatbuffer_conversions.h"
+#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
 
 #include <cstdlib>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
 
 namespace tflite {
 
@@ -61,6 +61,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
     case TensorType_UINT8:
       *type = kTfLiteUInt8;
       break;
+    case TensorType_INT8:
+      *type = kTfLiteInt8;
+      break;
     case TensorType_INT64:
       *type = kTfLiteInt64;
       break;
@@ -371,7 +374,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
     case BuiltinOperator_LSTM: {
       auto params = allocator->AllocatePOD<TfLiteLSTMParams>();
       if (auto* lstm_params = op->builtin_options_as_LSTMOptions()) {
@@ -391,6 +393,20 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: {
+      auto* params =
+          allocator->AllocatePOD<TfLiteUnidirectionalSequenceLSTMParams>();
+      if (auto* seq_lstm_params =
+              op->builtin_options_as_UnidirectionalSequenceLSTMOptions()) {
+        params->activation =
+            parse_activation(seq_lstm_params->fused_activation_function());
+        params->cell_clip = seq_lstm_params->cell_clip();
+        params->proj_clip = seq_lstm_params->proj_clip();
+        params->time_major = seq_lstm_params->time_major();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM: {
       auto params =
           allocator->AllocatePOD<TfLiteBidirectionalSequenceLSTMParams>();
@@ -414,6 +430,21 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_RESIZE_NEAREST_NEIGHBOR: {
+      // Large functions confuse MacOS builds with XCode 8 so a lambda is
+      // required to minimize function size. TODO(b/118447267): Simplify
+      // ParseOpData function and reduce its length.
+      [&]() {
+        auto* params =
+            allocator->AllocatePOD<TfLiteResizeNearestNeighborParams>();
+        if (auto* schema_params =
+                op->builtin_options_as_ResizeNearestNeighborOptions()) {
+          params->align_corners = schema_params->align_corners();
+        }
+        *builtin_data = reinterpret_cast<void*>(params);
+      }();
+      break;
+    }
     case BuiltinOperator_RESHAPE: {
       auto* params = allocator->AllocatePOD<TfLiteReshapeParams>();
       if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
@@ -475,6 +506,14 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_SPLIT_V: {
+      auto* params = allocator->AllocatePOD<TfLiteSplitParams>();
+      if (auto* schema_params = op->builtin_options_as_SplitVOptions()) {
+        params->num_splits = schema_params->num_splits();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_SQUEEZE: {
       auto* params = allocator->AllocatePOD<TfLiteSqueezeParams>();
       if (auto* schema_params = op->builtin_options_as_SqueezeOptions()) {
@@ -589,8 +628,31 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_LEAKY_RELU: {
+      TfLiteLeakyReluParams* params =
+          allocator->AllocatePOD<TfLiteLeakyReluParams>();
+      if (auto* leaky_relu_params = op->builtin_options_as_LeakyReluOptions()) {
+        params->alpha = leaky_relu_params->alpha();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_MIRROR_PAD: {
+      TfLiteMirrorPaddingParams* params =
+          allocator->AllocatePOD<TfLiteMirrorPaddingParams>();
+      auto* mirror_pad_params = op->builtin_options_as_MirrorPadOptions();
+      if (mirror_pad_params != nullptr) {
+        params->mode =
+            mirror_pad_params->mode() == tflite::MirrorPadMode_REFLECT
+                ? TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect
+                : TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingSymmetric;
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
 
     // Below are the ops with no builtin_data strcture.
+    case BuiltinOperator_ABS:
     case BuiltinOperator_BATCH_TO_SPACE_ND:
     // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
     // ok for now, since there is no call implementation either.
@@ -638,6 +700,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_SQUARE:
     case BuiltinOperator_ZEROS_LIKE:
     case BuiltinOperator_FILL:
+    case BuiltinOperator_FLOOR_MOD:
+    case BuiltinOperator_RANGE:
+    case BuiltinOperator_SQUARED_DIFFERENCE:
       break;
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.h b/tensorflow/lite/core/api/flatbuffer_conversions.h
similarity index 85%
rename from tensorflow/contrib/lite/core/api/flatbuffer_conversions.h
rename to tensorflow/lite/core/api/flatbuffer_conversions.h
index c770e627fd572dc252c6261bd3713d3105d225f1..0132a431c5daad3dc21516de5455c4693a9f10cf 100644
--- a/tensorflow/contrib/lite/core/api/flatbuffer_conversions.h
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.h
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
-#define TENSORFLOW_CONTRIB_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
+#ifndef TENSORFLOW_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
+#define TENSORFLOW_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
 
 // These functions transform codes and data structures that are defined in the
 // flatbuffer serialization format into in-memory values that are used by the
 // runtime API and interpreter.
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/core/api/op_resolver.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 
@@ -65,4 +65,4 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
+#endif  // TENSORFLOW_LITE_CORE_API_FLATBUFFER_CONVERSIONS_H_
diff --git a/tensorflow/contrib/lite/core/api/flatbuffer_conversions_test.cc b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/core/api/flatbuffer_conversions_test.cc
rename to tensorflow/lite/core/api/flatbuffer_conversions_test.cc
index 8ae94e1d330c1958b857cff0b44c38108f153550..4d1d1b21fda106b3196ff43421996f45ab83af4f 100644
--- a/tensorflow/contrib/lite/core/api/flatbuffer_conversions_test.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/core/api/flatbuffer_conversions.h"
+#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
 
 #include <cstring>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/core/api/op_resolver.cc b/tensorflow/lite/core/api/op_resolver.cc
similarity index 97%
rename from tensorflow/contrib/lite/core/api/op_resolver.cc
rename to tensorflow/lite/core/api/op_resolver.cc
index 55ee92484305c353a1cfffc5011fbb1d01bf3109..94d76889d07903da2d548568d9702af65f984aa0 100644
--- a/tensorflow/contrib/lite/core/api/op_resolver.cc
+++ b/tensorflow/lite/core/api/op_resolver.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
 
 namespace tflite {
 
diff --git a/tensorflow/lite/core/api/op_resolver.h b/tensorflow/lite/core/api/op_resolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8c7479f334c7c68c3effdb548ccc48c760a666e
--- /dev/null
+++ b/tensorflow/lite/core/api/op_resolver.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_CORE_API_OP_RESOLVER_H_
+#define TENSORFLOW_LITE_CORE_API_OP_RESOLVER_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+// Abstract interface that returns TfLiteRegistrations given op codes or custom
+// op names. This is the mechanism that ops being referenced in the flatbuffer
+// model are mapped to executable function pointers (TfLiteRegistrations).
+class OpResolver {
+ public:
+  // Finds the op registration for a builtin operator by enum code.
+  virtual const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
+                                           int version) const = 0;
+  // Finds the op registration of a custom operator by op name.
+  virtual const TfLiteRegistration* FindOp(const char* op,
+                                           int version) const = 0;
+  virtual ~OpResolver() {}
+};
+
+// Handles the logic for converting between an OperatorCode structure extracted
+// from a flatbuffer and information about a registered operator implementation.
+TfLiteStatus GetRegistrationFromOpCode(const OperatorCode* opcode,
+                                       const OpResolver& op_resolver,
+                                       ErrorReporter* error_reporter,
+                                       const TfLiteRegistration** registration);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_CORE_API_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/core/api/op_resolver_test.cc b/tensorflow/lite/core/api/op_resolver_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/core/api/op_resolver_test.cc
rename to tensorflow/lite/core/api/op_resolver_test.cc
index 167463110ed8ec6d59479cc9e7cd6f9845eaa897..cd8d0929b6449597c94442d2889c0e5e8afb9a8e 100644
--- a/tensorflow/contrib/lite/core/api/op_resolver_test.cc
+++ b/tensorflow/lite/core/api/op_resolver_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
 
 #include <cstring>
 
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90361faeae3c085fd4bd73a22b64635ce4b2969e
--- /dev/null
+++ b/tensorflow/lite/core/subgraph.cc
@@ -0,0 +1,970 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/arena_planner.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/graph_info.h"
+#include "tensorflow/lite/nnapi_delegate.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+namespace {
+TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node,
+                           const TfLiteRegistration& registration,
+                           int node_index, const char* message) {
+  context->ReportError(
+      context, "Node number %d (%s) %s.\n", node_index,
+      registration.custom_name
+          ? registration.custom_name
+          : EnumNameBuiltinOperator(
+                static_cast<BuiltinOperator>(registration.builtin_code)),
+      message);
+  return kTfLiteError;
+}
+
+// Stub method which returns kTfLiteError when the function is forbidden.
+// We're registrating this function to several different function to save
+// compiled binary size. Please note the restrictions:
+// * The type of first parameter have to be `TfLiteContext*`.
+// * All paramteters must be trivailly destructible. (E.g. No C++ class)
+TfLiteStatus ForbiddenContextFunction(TfLiteContext* context, ...) {
+  context->ReportError(context,
+                       "The function is forbidden if not calling in delegate.");
+  return kTfLiteError;
+}
+
+// Set the ForbiddenContextFunction to a compatible function pointer.
+template <typename FunctionType>
+void SetForbiddenContextFunction(FunctionType* func) {
+  *func = reinterpret_cast<FunctionType>(ForbiddenContextFunction);
+}
+
+// Returns true if at least one tensor in the given list is kTfLiteDynamic.
+template <typename TensorIntArray>
+bool HasDynamicTensorImpl(const TfLiteContext& context,
+                          const TensorIntArray& int_array) {
+  for (int i : int_array) {
+    const TfLiteTensor& tensor = context.tensors[i];
+    if (tensor.allocation_type == kTfLiteDynamic) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool HasDynamicTensor(const TfLiteContext& context,
+                      const TfLiteIntArray* int_array) {
+  return HasDynamicTensorImpl(context, TfLiteIntArrayView{int_array});
+}
+
+}  // namespace
+
+// A trivial implementation of GraphInfo around the Interpreter.
+// NOTE: this interpreter info represents the subset of the
+// graph that is executed according to execution plan. Thus,
+// the indices are execution plan indices rather than raw node
+// indices.
+class InterpreterInfo : public GraphInfo {
+ public:
+  explicit InterpreterInfo(Subgraph* subgraph) : subgraph_(subgraph) {}
+
+  size_t num_tensors() const override { return subgraph_->tensors().size(); }
+  TfLiteTensor* tensor(size_t index) override {
+    return &subgraph_->tensors()[index];
+  }
+  size_t num_nodes() const override {
+    return subgraph_->execution_plan().size();
+  }
+  const TfLiteNode& node(size_t index) const override {
+    int node_index = subgraph_->execution_plan()[index];
+    return subgraph_->nodes_and_registration()[node_index].first;
+  }
+  const std::vector<int>& inputs() const override {
+    return subgraph_->inputs();
+  }
+  const std::vector<int>& outputs() const override {
+    return subgraph_->outputs();
+  }
+  const std::vector<int>& variables() const override {
+    return subgraph_->variables();
+  }
+
+ public:
+  Subgraph* subgraph_;
+};
+
+Subgraph::Subgraph(ErrorReporter* error_reporter,
+                   TfLiteExternalContext** external_contexts,
+                   std::vector<std::unique_ptr<Subgraph>>* subgraphs)
+    : context_(&owned_context_),
+      error_reporter_(error_reporter),
+      next_execution_plan_index_to_prepare_(0),
+      external_contexts_(external_contexts),
+      subgraphs_(subgraphs) {
+  context_->impl_ = static_cast<void*>(this);
+  context_->ResizeTensor = ResizeTensor;
+  context_->ReportError = ReportErrorC;
+  context_->AddTensors = AddTensors;
+  context_->tensors = nullptr;
+  context_->tensors_size = 0;
+  context_->allow_fp32_relax_to_fp16 = false;
+  context_->recommended_num_threads = -1;
+  context_->GetExternalContext = GetExternalContext;
+  context_->SetExternalContext = SetExternalContext;
+
+  // Reserve some space for the tensors to avoid excessive resizing.
+  tensors_.reserve(kTensorsReservedCapacity);
+  nodes_and_registration().reserve(kTensorsReservedCapacity);
+  // Invalid to call these these except from TfLiteDelegate
+  SwitchToKernelContext();
+}
+
+Subgraph::~Subgraph() {
+  for (auto& node_and_reg : nodes_and_registration_) {
+    TfLiteNode& node = node_and_reg.first;
+    TfLiteIntArrayFree(node.inputs);
+    TfLiteIntArrayFree(node.outputs);
+    TfLiteIntArrayFree(node.temporaries);
+    if (node.builtin_data) free(node.builtin_data);
+    OpFree(node_and_reg.second, node.user_data);
+    node.builtin_data = nullptr;
+  }
+
+  for (size_t i = 0; i < context_->tensors_size; i++) {
+    TfLiteTensor* tensor = &context_->tensors[i];
+    if (tensor->buffer_handle != kTfLiteNullBufferHandle &&
+        tensor->delegate->FreeBufferHandle != nullptr) {
+      tensor->delegate->FreeBufferHandle(context_, tensor->delegate,
+                                         &tensor->buffer_handle);
+    }
+    TfLiteTensorFree(tensor);
+  }
+}
+
+TfLiteStatus Subgraph::ReplaceNodeSubsetsWithDelegateKernels(
+    TfLiteContext* context, TfLiteRegistration registration,
+    const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
+  return static_cast<Subgraph*>(context->impl_)
+      ->ReplaceNodeSubsetsWithDelegateKernels(registration, nodes_to_replace,
+                                              delegate);
+}
+
+namespace {
+
+// Copy a std::vector<int> to an existing TfLiteIntArray.
+// This is a low-level data manipulation function, and it's caller's
+// responsibility to ensure TfLiteIntArray has enough size.
+void CopyVectorToTfLiteIntArray(const std::vector<int>& vec,
+                                TfLiteIntArray* arr) {
+  arr->size = vec.size();
+  memcpy(arr->data, vec.data(), sizeof(int) * arr->size);
+}
+
+// This function allocates a continuous memory space that contains a
+// TfLiteDelegateParams followed by a several TfLiteIntArray.
+// When calling `free` at TfLiteDelegateParams*, all the allocated space
+// will be freed together.
+//
+// +-----------------------------------+
+// | TfLiteDelegateParams              |
+// | TfLiteDelegate* delegate;         |
+// | TfLiteIntArray* nodes_to_replace; |--\
+// | TfLiteIntArray* input_tensors;    |--+--\
+// | TfLiteIntArray* output_tensors;   |--+--+--\
+// +-----------------------------------+  |  |  |
+// | TfLiteIntArray (variable size)    |<-/  |  |
+// +-----------------------------------+     |  |
+// | TfLiteIntArray (variable size)    |<----/  |
+// +-----------------------------------+        |
+// | TfLiteIntArray (variable size)    |<-------/
+// +-----------------------------------+
+TfLiteDelegateParams* CreateDelegateParams(TfLiteDelegate* delegate,
+                                           const NodeSubset& node_subset) {
+  // Step 1: Calculate the allocation size.
+  int allocation_size = sizeof(TfLiteDelegateParams);
+
+  int nodes_to_replace_size =
+      TfLiteIntArrayGetSizeInBytes(node_subset.nodes.size());
+  allocation_size += nodes_to_replace_size;
+
+  int input_tensors_size =
+      TfLiteIntArrayGetSizeInBytes(node_subset.input_tensors.size());
+  allocation_size += input_tensors_size;
+
+  int output_tensors_size =
+      TfLiteIntArrayGetSizeInBytes(node_subset.output_tensors.size());
+  allocation_size += output_tensors_size;
+
+  // Step 2: Allocate the memory.
+  // Use `char*` for conveniently step through the allocated space by bytes.
+  char* allocation = reinterpret_cast<char*>(malloc(allocation_size));
+
+  // Step 3: Fill all data structures structures.
+  TfLiteDelegateParams* params =
+      reinterpret_cast<TfLiteDelegateParams*>(allocation);
+  params->delegate = delegate;
+  allocation += sizeof(TfLiteDelegateParams);
+
+  params->nodes_to_replace = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(node_subset.nodes, params->nodes_to_replace);
+  allocation += nodes_to_replace_size;
+
+  params->input_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(node_subset.input_tensors, params->input_tensors);
+  allocation += input_tensors_size;
+
+  params->output_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(node_subset.output_tensors,
+                             params->output_tensors);
+  allocation += output_tensors_size;
+
+  return params;
+}
+
+}  // namespace
+
+TfLiteStatus Subgraph::ReplaceNodeSubsetsWithDelegateKernels(
+    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
+    TfLiteDelegate* delegate) {
+  // Annotate the registration as DELEGATE op.
+  registration.builtin_code = BuiltinOperator_DELEGATE;
+
+  // Analyze the graph to find all independent node_subsets that are either
+  // fully not-this-delegate or this-delegate computation.
+  InterpreterInfo info(this);
+  std::vector<NodeSubset> node_subsets;
+  PartitionGraphIntoIndependentNodeSubsets(&info, nodes_to_replace,
+                                           &node_subsets);
+
+  execution_plan_.clear();
+
+  for (auto& node_subset : node_subsets) {
+    // Subsets calimed by the delegate should have a "macro" op created, the
+    // other node_subsets (kTfNonPartition) just have their nodes added back to
+    // the execution plan.
+    switch (node_subset.type) {
+      case NodeSubset::kTfNonPartition:
+        for (auto it = node_subset.nodes.begin(); it != node_subset.nodes.end();
+             ++it) {
+          execution_plan_.push_back(*it);
+        }
+        break;
+      case NodeSubset::kTfPartition: {
+        int node_index;
+
+        TfLiteDelegateParams* params =
+            CreateDelegateParams(delegate, node_subset);
+        TF_LITE_ENSURE_STATUS(AddNodeWithParameters(
+            node_subset.input_tensors, node_subset.output_tensors, nullptr, 0,
+            params, &registration, &node_index));
+
+        // Initialize the output tensors's delegate-related fields.
+        for (int tensor_index : node_subset.output_tensors) {
+          TfLiteTensor* tensor = &tensors_[tensor_index];
+          TF_LITE_ENSURE(context_, tensor->delegate == nullptr ||
+                                       tensor->delegate == delegate);
+          tensor->delegate = delegate;
+        }
+
+        // Associate the node with the delegate.
+        TfLiteNode* node = &nodes_and_registration_[node_index].first;
+        node->delegate = delegate;
+      } break;
+      case NodeSubset::kTfUnexplored:
+        return kTfLiteError;
+        break;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteExternalContext* Subgraph::GetExternalContext(
+    TfLiteExternalContextType type) {
+  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
+    return external_contexts_[type];
+  }
+  return nullptr;
+}
+
+TfLiteExternalContext* Subgraph::GetExternalContext(
+    struct TfLiteContext* context, TfLiteExternalContextType type) {
+  return static_cast<Subgraph*>(context->impl_)->GetExternalContext(type);
+}
+
+void Subgraph::SetExternalContext(TfLiteExternalContextType type,
+                                  TfLiteExternalContext* ctx) {
+  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
+    external_contexts_[type] = ctx;
+  }
+}
+
+void Subgraph::SetExternalContext(struct TfLiteContext* context,
+                                  TfLiteExternalContextType type,
+                                  TfLiteExternalContext* ctx) {
+  return static_cast<Subgraph*>(context->impl_)->SetExternalContext(type, ctx);
+}
+
+// Gets an TfLiteIntArray* representing the execution plan. The interpreter owns
+// this memory and it is only guaranteed to exist during the invocation of the
+// delegate prepare.
+TfLiteStatus Subgraph::GetExecutionPlan(TfLiteIntArray** execution_plan) {
+  // TODO(aselle): Do not make a copy here
+  plan_cache_.reset(TfLiteIntArrayCreate(execution_plan_.size()));
+  *execution_plan = plan_cache_.get();
+  static_assert(sizeof(plan_cache_->data[0]) == sizeof(execution_plan_[0]),
+                "TfLiteIntArray and execution_plan do not contain same type.");
+  std::memcpy(plan_cache_->data, execution_plan_.data(),
+              sizeof(plan_cache_->data[0]) * execution_plan_.size());
+  return kTfLiteOk;
+}
+
+// WARNING: This is an experimental interface that is subject to change.
+// Entry point for C node plugin API to get the execution plan
+TfLiteStatus Subgraph::GetExecutionPlan(struct TfLiteContext* context,
+                                        TfLiteIntArray** execution_plan) {
+  return static_cast<Subgraph*>(context->impl_)
+      ->GetExecutionPlan(execution_plan);
+}
+
+TfLiteStatus Subgraph::SetInputs(std::vector<int> inputs) {
+  TF_LITE_ENSURE_OK(&context_,
+                    CheckTensorIndices("inputs", inputs.data(), inputs.size()));
+  inputs_ = std::move(inputs);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::SetOutputs(std::vector<int> outputs) {
+  TF_LITE_ENSURE_OK(
+      &context_, CheckTensorIndices("outputs", outputs.data(), outputs.size()));
+  outputs_ = std::move(outputs);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::SetVariables(std::vector<int> variables) {
+  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("variables", variables.data(),
+                                                  variables.size()));
+  variables_ = std::move(variables);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::CheckTensorIndices(const char* label, const int* indices,
+                                          int length) {
+  // Making sure kOptionalTensor is not re-defined to something other than -1.
+  static_assert(kOptionalTensor == -1, "kOptionalTensor should be defined -1");
+
+  for (int i = 0; i < length; i++) {
+    int index = indices[i];
+    // Continue if index == kOptionalTensor before additional comparisons below,
+    // size_t(-1) is always >= context_tensors_size.
+    if (index == kOptionalTensor) {
+      continue;
+    }
+    if (index < 0 || static_cast<size_t>(index) >= context_->tensors_size) {
+      ReportError("Invalid tensor index %d in %s\n", index, label);
+      consistent_ = false;
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::BytesRequired(TfLiteType type, const int* dims,
+                                     size_t dims_size, size_t* bytes) {
+  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
+  // MultiplyWithoutOverflow.
+  TF_LITE_ENSURE(context_, bytes != nullptr);
+  size_t count = 1;
+  for (int k = 0; k < dims_size; k++) count *= dims[k];
+  switch (type) {
+    case kTfLiteFloat32:
+      *bytes = sizeof(float) * count;
+      break;
+    case kTfLiteInt16:
+      *bytes = sizeof(int16_t) * count;
+      break;
+    case kTfLiteInt32:
+      *bytes = sizeof(int32_t) * count;
+      break;
+    case kTfLiteUInt8:
+      *bytes = sizeof(uint8_t) * count;
+      break;
+    case kTfLiteInt64:
+      *bytes = sizeof(int64_t) * count;
+      break;
+    case kTfLiteBool:
+      *bytes = sizeof(bool) * count;
+      break;
+    case kTfLiteComplex64:
+      *bytes = sizeof(std::complex<float>) * count;
+      break;
+    case kTfLiteInt8:
+      *bytes = sizeof(int8_t) * count;
+      break;
+    default:
+      ReportError(
+          "Only float32, int8, int16, int32, int64, uint8, bool, complex64 "
+          "supported currently.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::AllocateTensors() {
+  if (!consistent_) {
+    ReportError("AllocateTensors() called on inconsistent model.");
+    return kTfLiteError;
+  }
+
+  // Explicit (re)allocation is necessary if nodes have been changed or tensors
+  // have been resized. For inputs marked as dynamic, we can't short-circuit the
+  // allocation as the client may have done the resize manually.
+  if (state_ != kStateUninvokable &&
+      !HasDynamicTensorImpl(*context_, inputs())) {
+    return kTfLiteOk;
+  }
+
+  next_execution_plan_index_to_prepare_ = 0;
+  if (memory_planner_) {
+    TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations());
+  }
+
+  TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
+
+  state_ = kStateInvokable;
+
+  // Reset the variable tensors to zero after (re)allocating the tensors.
+  // Developers shouldn't rely on the side effect of this function to reset
+  // variable tesnsors. They should call `ResetVariableTensors` directly
+  // instead.
+  ResetVariableTensors();
+
+  return kTfLiteOk;
+}
+
+// TODO(ycling): Support non-zero default values.
+TfLiteStatus Subgraph::ResetVariableTensors() {
+  for (auto& tensor : tensors_) {
+    if (!tensor.is_variable) {
+      continue;
+    }
+
+    // Variable tensors have to be `kTfLiteArenaRwPersistent`, and must be
+    // allocated after the initial `PrepareOpsAndTensors()` is called.
+    TF_LITE_ENSURE_EQ(context_, tensor.allocation_type,
+                      kTfLiteArenaRwPersistent);
+    TF_LITE_ENSURE(context_, tensor.data.raw != nullptr);
+
+    memset(tensor.data.raw, 0, tensor.bytes);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::AddNodeWithParameters(
+    const std::vector<int>& inputs, const std::vector<int>& outputs,
+    const char* init_data, size_t init_data_size, void* builtin_data,
+    const TfLiteRegistration* registration, int* node_index) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError("AddNodeWithParameters is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  state_ = kStateUninvokable;
+
+  std::unique_ptr<void, decltype(free)*> builtin_data_deleter(builtin_data,
+                                                              free);
+
+  TF_LITE_ENSURE_OK(context_, CheckTensorIndices("node inputs", inputs.data(),
+                                                 inputs.size()));
+  TF_LITE_ENSURE_OK(
+      &context_,
+      CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
+
+  int new_node_index = nodes_and_registration_.size();
+  if (node_index) *node_index = new_node_index;
+  nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
+  auto& node_and_reg = nodes_and_registration_.back();
+  TfLiteNode& node = node_and_reg.first;
+  if (node.inputs) TfLiteIntArrayFree(node.inputs);
+  if (node.outputs) TfLiteIntArrayFree(node.outputs);
+  if (node.temporaries) TfLiteIntArrayFree(node.temporaries);
+
+  // NOTE, here we are not using move semantics yet, since our internal
+  // representation isn't std::vector, but in the future we would like to avoid
+  // copies, so we want the interface to take r-value references now.
+  node.inputs = ConvertVectorToTfLiteIntArray(inputs);
+  node.outputs = ConvertVectorToTfLiteIntArray(outputs);
+  node.temporaries = TfLiteIntArrayCreate(0);
+  if (init_data) {
+    node.user_data = OpInit(*registration, init_data, init_data_size);
+  } else {
+    node.user_data =
+        OpInit(*registration,
+               reinterpret_cast<const char*>(builtin_data_deleter.get()), 0);
+  }
+
+  node.builtin_data = builtin_data_deleter.release();
+  // TODO(ycling): Filling `custom_initial_data` and `custom_initial_data_size`
+  // properly for nodes generated by ReplaceNodeSubsetsWithDelegateKernels.
+
+  if (registration->builtin_code == BuiltinOperator_CUSTOM) {
+    // When it's a CUSTOM op, the `custom_options` field in the Flatbuffer
+    // `Operator` table is passed in.
+    node.custom_initial_data = init_data;
+    node.custom_initial_data_size = init_data_size;
+  } else {
+    node.custom_initial_data = nullptr;
+    node.custom_initial_data_size = 0;
+  }
+
+  node.delegate = nullptr;
+  node_and_reg.second = *registration;
+  execution_plan_.push_back(new_node_index);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::ResizeInputTensor(int tensor_index,
+                                         const std::vector<int>& dims) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError("ResizeInputTensor is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+
+  // TODO(aselle): All bounds checks can be implemented as one-sided bounds
+  // checks by casting to unsigned for efficiency. Profile before doing this.
+  TF_LITE_ENSURE(context_,
+                 tensor_index < context_->tensors_size && tensor_index >= 0);
+  TfLiteTensor* tensor = &context_->tensors[tensor_index];
+
+  // Short-circuit the state change if the dimensions don't change, avoiding
+  // unnecessary (re)allocations.
+  if (EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
+    return kTfLiteOk;
+  }
+
+  state_ = kStateUninvokable;
+  return ResizeTensorImpl(tensor, ConvertVectorToTfLiteIntArray(dims));
+}
+
+TfLiteStatus Subgraph::PrepareOpsStartingAt(
+    int first_execution_plan_index, int* last_execution_plan_index_prepared) {
+  if (first_execution_plan_index == 0) {
+    has_dynamic_tensors_ = false;
+  }
+  for (int execution_plan_index = first_execution_plan_index;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
+    EnsureTensorsVectorCapacity();
+    if (OpPrepare(registration, &node) == kTfLiteError) {
+      return ReportOpError(context_, node, registration, node_index,
+                           "failed to prepare");
+    }
+
+    *last_execution_plan_index_prepared = execution_plan_index;
+
+    // Discontinue if the node has dynamic outputs. Note that we don't
+    // stop for dynamic temporary tensors since they won't affect the
+    // sizes of other tensors in the graph.
+    if (HasDynamicTensor(*context_, node.outputs)) {
+      has_dynamic_tensors_ = true;
+      return kTfLiteOk;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::PrepareOpsAndTensors() {
+  if (!memory_planner_) {
+    memory_planner_.reset(new ArenaPlanner(
+        context_, std::unique_ptr<GraphInfo>(new InterpreterInfo(this)),
+        /*preserve_inputs=*/true, /*preserve_intermediates*/ false));
+    memory_planner_->PlanAllocations();
+  }
+
+  int last_exec_plan_index_prepared = 0;
+
+  TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
+      next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared));
+  TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations(
+      next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared));
+
+  next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::Invoke() {
+  if (!consistent_) {
+    ReportError("Invoke called on model that is not consistent.");
+    return kTfLiteError;
+  }
+
+  TfLiteStatus status = kTfLiteOk;
+  if (state_ == kStateUninvokable) {
+    ReportError("Invoke called on model that is not ready.");
+    return kTfLiteError;
+  }
+
+  if (nnapi_delegate_) {
+    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
+      TF_LITE_ENSURE_OK(context_, nnapi_delegate_->Invoke(this));
+      return kTfLiteOk;
+    } else {
+      // TODO(aselle): In the future, we would like this to be an
+      // automatic tflite CPU fallback.
+      ReportError(
+          "NNAPI was requested, but dependent sized tensors "
+          "being used.\n");
+      return kTfLiteError;
+    }
+  }
+
+  // Invocations are always done in node order.
+  // Note that calling Invoke repeatedly will cause the original memory plan to
+  // be reused, unless either ResizeInputTensor() or AllocateTensors() has been
+  // called.
+  for (int execution_plan_index = 0;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    if (execution_plan_index == next_execution_plan_index_to_prepare_) {
+      TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
+      TF_LITE_ENSURE(context_, next_execution_plan_index_to_prepare_ >=
+                                   execution_plan_index);
+    }
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
+    SCOPED_OPERATOR_PROFILE(profiler_, node_index);
+
+    // TODO(ycling): This is an extra loop through inputs to check if the data
+    // need to be copied from Delegate buffer to raw memory, which is often not
+    // needed. We may want to cache this in prepare to know if this needs to be
+    // done for a node or not.
+    for (int i = 0; i < node.inputs->size; ++i) {
+      int tensor_index = node.inputs->data[i];
+      if (tensor_index == kOptionalTensor) {
+        continue;
+      }
+      TfLiteTensor* tensor = &tensors_[tensor_index];
+      if (tensor->delegate && tensor->delegate != node.delegate &&
+          tensor->data_is_stale) {
+        EnsureTensorDataIsReadable(tensor_index);
+      }
+    }
+
+    EnsureTensorsVectorCapacity();
+    tensor_resized_since_op_invoke_ = false;
+    if (OpInvoke(registration, &node) == kTfLiteError) {
+      status = ReportOpError(context_, node, registration, node_index,
+                             "failed to invoke");
+    }
+
+    // Force execution prep for downstream ops if the latest op triggered the
+    // resize of a dynamic tensor.
+    if (tensor_resized_since_op_invoke_ &&
+        HasDynamicTensor(*context_, node.outputs)) {
+      next_execution_plan_index_to_prepare_ = execution_plan_index + 1;
+    }
+  }
+
+  return status;
+}
+
+TfLiteStatus Subgraph::ResizeTensor(TfLiteContext* context,
+                                    TfLiteTensor* tensor,
+                                    TfLiteIntArray* new_size) {
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Interpreter to call into the member function ResizeTensorImpl
+  // (this function is static).
+  return static_cast<Subgraph*>(context->impl_)
+      ->ResizeTensorImpl(tensor, new_size);
+}
+
+void Subgraph::ReportErrorImpl(const char* format, va_list args) {
+  error_reporter_->Report(format, args);
+}
+
+void Subgraph::ReportErrorC(TfLiteContext* context, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  auto* f = static_cast<Subgraph*>(context->impl_);
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Subgraph to call into the member function ReportErrorImpl
+  // (this function is static).
+  f->ReportErrorImpl(format, args);
+  va_end(args);
+}
+
+// Entry point for C node plugin API to report an error.
+void Subgraph::ReportError(const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  auto* f = static_cast<Subgraph*>(context_->impl_);
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Subgraph to call into the member function ReportErrorImpl
+  // (this function is static).
+  f->ReportErrorImpl(format, args);
+  va_end(args);
+}
+
+TfLiteStatus Subgraph::AddTensors(int tensors_to_add,
+                                  int* first_new_tensor_index) {
+  const size_t base_index = tensors_.size();
+  if (first_new_tensor_index) *first_new_tensor_index = base_index;
+  tensors_.resize(tensors_.size() + tensors_to_add);
+  for (size_t i = base_index; i < tensors_.size(); i++) {
+    memset(&tensors_[i], 0, sizeof(tensors_[i]));
+    tensors_[i].buffer_handle = kTfLiteNullBufferHandle;
+  }
+  context_->tensors = tensors_.data();
+  context_->tensors_size = tensors_.size();
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::AddTensors(TfLiteContext* context, int tensors_to_add,
+                                  int* first_new_tensor_index) {
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Interpreter to call into the member function AddTensors
+  // (this function is static).
+  return static_cast<Subgraph*>(context->impl_)
+      ->AddTensors(tensors_to_add, first_new_tensor_index);
+}
+
+TfLiteStatus Subgraph::GetNodeAndRegistration(
+    int node_index, TfLiteNode** node, TfLiteRegistration** registration) {
+  TF_LITE_ENSURE(context_, node_index >= 0);
+  auto nodes_size = nodes_and_registration_.size();
+  TF_LITE_ENSURE(context_, static_cast<size_t>(node_index) < nodes_size);
+  TF_LITE_ENSURE(context_, node != nullptr && registration != nullptr);
+  auto& node_and_reg = nodes_and_registration_[node_index];
+  *node = &node_and_reg.first;
+  *registration = &node_and_reg.second;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::GetNodeAndRegistration(
+    struct TfLiteContext* context, int node_index, TfLiteNode** node,
+    TfLiteRegistration** registration) {
+  return static_cast<Subgraph*>(context->impl_)
+      ->GetNodeAndRegistration(node_index, node, registration);
+}
+
+TfLiteStatus Subgraph::SetTensorParametersReadOnly(
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
+    size_t bytes, const Allocation* allocation) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        "SetTensorParametersReadOnly is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+
+  TF_LITE_ENSURE(context_,
+                 tensor_index < context_->tensors_size && tensor_index >= 0);
+  // For most tensors we know exactly how much memory is necessary so we can
+  // ensure the buffer is large enough. However, we need to skip string tensors
+  // because their sizes change with the contents of the individual strings.
+  if (type != kTfLiteString) {
+    size_t required_bytes;
+    TF_LITE_ENSURE_OK(context_,
+                      BytesRequired(type, dims, rank, &required_bytes));
+    TF_LITE_ENSURE_EQ(context_, required_bytes, bytes);
+  }
+
+  TfLiteTensor& tensor = context_->tensors[tensor_index];
+  if (type == tensor.type &&
+      EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
+    // Fast path which does not invalidate the invokable property.
+    TfLiteTensorDataFree(&tensor);
+    tensor.data.raw = const_cast<char*>(buffer);
+    if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
+    tensor.params = quantization;
+    tensor.allocation_type = kTfLiteMmapRo;
+    tensor.allocation = allocation;
+  } else {
+    state_ = kStateUninvokable;
+    TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
+                      quantization, const_cast<char*>(buffer), bytes,
+                      kTfLiteMmapRo, allocation, false, &tensor);
+  }
+  return kTfLiteOk;
+}
+
+// Set description of inputs/outputs/data/fptrs for node `node_index`.
+// This variant assumes an external buffer has been allocated of size
+// bytes. The lifetime of buffer must be ensured to be greater or equal
+// to Interpreter.
+TfLiteStatus Subgraph::SetTensorParametersReadWrite(
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        "SetTensorParametersReadWrite is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  TF_LITE_ENSURE(context_,
+                 tensor_index < context_->tensors_size && tensor_index >= 0);
+  size_t required_bytes = 0;
+  if (type != kTfLiteString) {
+    // These types will be allocated in our arena so we need to record how
+    // many bytes we will need based on the dimensions. String tensors are
+    // allocated dynamically and we can't know ahead of time how much space
+    // they will require.
+    TF_LITE_ENSURE_OK(context_,
+                      BytesRequired(type, dims, rank, &required_bytes));
+  }
+
+  TfLiteAllocationType allocation_type = kTfLiteArenaRw;
+  if (type == kTfLiteString) {
+    if (is_variable) {
+      // We don't have a real use case for string variable tensor.
+      ReportError("String variable tensor isn't supported.");
+      return kTfLiteError;
+    }
+    allocation_type = kTfLiteDynamic;
+  } else if (is_variable) {
+    allocation_type = kTfLiteArenaRwPersistent;
+  }
+
+  TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
+                    quantization,
+                    /*buffer=*/nullptr, required_bytes, allocation_type,
+                    nullptr, is_variable, &context_->tensors[tensor_index]);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::SetExecutionPlan(const std::vector<int>& new_plan) {
+  for (int node_index : new_plan) {
+    TF_LITE_ENSURE(context_, node_index >= 0 &&
+                                 node_index < nodes_and_registration_.size());
+  }
+  execution_plan_ = new_plan;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::ResizeTensorImpl(TfLiteTensor* tensor,
+                                        TfLiteIntArray* new_size) {
+  // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
+  if (tensor->allocation_type == kTfLiteArenaRw ||
+      tensor->allocation_type == kTfLiteDynamic ||
+      tensor->allocation_type == kTfLiteArenaRwPersistent) {
+    tensor_resized_since_op_invoke_ |=
+        TfLiteIntArrayEqual(tensor->dims, new_size) == 0;
+    if (tensor->type != kTfLiteString) {
+      size_t bytesRequired;
+      TfLiteStatus status = BytesRequired(tensor->type, new_size->data,
+                                          new_size->size, &bytesRequired);
+      if (status != kTfLiteOk) {
+        TfLiteIntArrayFree(new_size);
+        return kTfLiteError;
+      }
+
+      // Realloc space for kTfLiteDynamic tensors.
+      TfLiteTensorRealloc(bytesRequired, tensor);
+      tensor->bytes = bytesRequired;
+    }
+    if (tensor->dims) TfLiteIntArrayFree(tensor->dims);
+    tensor->dims = new_size;
+
+    if (tensor->allocation_type != kTfLiteDynamic) {
+      tensor->data.raw = nullptr;
+    }
+  } else {
+    // kTfLiteMmapRo tensors are stored in the flatbuffer and are therefore
+    // of fixed size.
+    TfLiteIntArrayFree(new_size);
+    ReportError("Attempting to resize a fixed-size tensor.");
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+void Subgraph::UseNNAPI(bool enable) {
+  // TODO(aselle): This is a workaround for finding if NNAPI exists.
+  // We also need to make sure getLibraryHandle() is renamed to be NNAPI
+  // prefixed.
+  if (!NNAPIDelegate::IsSupported()) enable = false;
+  if (!enable) {
+    nnapi_delegate_.reset();
+  } else if (!nnapi_delegate_) {
+    nnapi_delegate_.reset(new NNAPIDelegate);
+  }
+}
+
+void Subgraph::SwitchToDelegateContext() {
+  context_->GetNodeAndRegistration = GetNodeAndRegistration;
+  context_->ReplaceNodeSubsetsWithDelegateKernels =
+      ReplaceNodeSubsetsWithDelegateKernels;
+  context_->GetExecutionPlan = GetExecutionPlan;
+}
+
+void Subgraph::SwitchToKernelContext() {
+  context_->GetNodeAndRegistration = [](struct TfLiteContext* context,
+                                        int node_index, TfLiteNode** node,
+                                        TfLiteRegistration** registration) {
+    return ForbiddenContextFunction(context);
+  };
+  context_->ReplaceNodeSubsetsWithDelegateKernels =
+      [](TfLiteContext* context, TfLiteRegistration registration,
+         const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
+        return ForbiddenContextFunction(context);
+      };
+  context_->GetExecutionPlan = [](struct TfLiteContext* context,
+                                  TfLiteIntArray**) {
+    return ForbiddenContextFunction(context);
+  };
+}
+
+TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
+  if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
+    int last_execution_plan_index_prepared;
+    TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt(
+                                     0, &last_execution_plan_index_prepared));
+    if (has_dynamic_tensors_) {
+      ReportError(
+          "Attempting to use a delegate that only supports static-sized "
+          "tensors with a graph that has dynamic-sized tensors.");
+      return kTfLiteError;
+    }
+  }
+
+  // TODO(aselle): Consider if it is worth storing pointers to delegates.
+  // Setup additional context interface.
+  SwitchToDelegateContext();
+
+  TfLiteStatus status = delegate->Prepare(context_, delegate);
+
+  // Remove additional context info.
+  SwitchToKernelContext();
+
+  TF_LITE_ENSURE_OK(context_, status);
+
+  if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
+    // Reset the state to force tensor/op reallocation.
+    state_ = kStateUninvokable;
+    TF_LITE_ENSURE_OK(context_, AllocateTensors());
+    TF_LITE_ENSURE_EQ(context_, state_, kStateInvokable);
+    // After using a delegate which doesn't support dynamic tensors, make the
+    // entire graph immutable.
+    state_ = kStateInvokableAndImmutable;
+  }
+
+  return status;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a7c3a7c322e55500d9edb7d7c1b9763e9a76e88
--- /dev/null
+++ b/tensorflow/lite/core/subgraph.h
@@ -0,0 +1,501 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_CORE_SUBGRAPH_H_
+#define TENSORFLOW_LITE_CORE_SUBGRAPH_H_
+
+#include <cstdlib>
+#include <vector>
+
+#include "tensorflow/lite/allocation.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/memory_planner.h"
+#include "tensorflow/lite/profiling/profiler.h"
+#include "tensorflow/lite/util.h"
+
+namespace tflite {
+
+// Forward declare since NNAPIDelegate uses Interpreter.
+class NNAPIDelegate;
+
+class Subgraph {
+ public:
+  friend class Interpreter;
+
+  Subgraph(ErrorReporter* error_reporter,
+           TfLiteExternalContext** external_contexts,
+           std::vector<std::unique_ptr<Subgraph>>* subgraphs);
+
+  Subgraph(const Subgraph&) = delete;
+
+  // Subgraphs should be movable but not copyable.
+  Subgraph(Subgraph&&) = default;
+  Subgraph& operator=(const Subgraph&) = delete;
+  virtual ~Subgraph();
+
+  // Provide a list of tensor indexes that are inputs to the model.
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetInputs(std::vector<int> inputs);
+
+  // Provide a list of tensor indexes that are outputs to the model
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetOutputs(std::vector<int> outputs);
+
+  // Provide a list of tensor indexes that are variable tensors.
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetVariables(std::vector<int> variables);
+
+  // Adds a node with the given parameters and returns the index of the new
+  // node in `node_index` (optionally). Interpreter will take ownership of
+  // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
+  // remains with the caller.
+  TfLiteStatus AddNodeWithParameters(const std::vector<int>& inputs,
+                                     const std::vector<int>& outputs,
+                                     const char* init_data,
+                                     size_t init_data_size, void* builtin_data,
+                                     const TfLiteRegistration* registration,
+                                     int* node_index);
+
+  // Adds `tensors_to_add` tensors, preserving pre-existing Tensor entries.
+  // The value pointed to by `first_new_tensor_index` will be set to the
+  // index of the first new tensor if `first_new_tensor_index` is non-null.
+  TfLiteStatus AddTensors(int tensors_to_add, int* first_new_tensor_index);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization,
+      const char* buffer, size_t bytes, const Allocation* allocation);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  TfLiteStatus SetTensorParametersReadWrite(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization, bool is_variable);
+
+  // WARNING: Experimental interface, subject to change
+  // Overrides execution plan. This bounds checks indices sent in.
+  TfLiteStatus SetExecutionPlan(const std::vector<int>& new_plan);
+
+  // Get a mutable tensor data structure.
+  // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
+  // read/write access to structure
+  TfLiteTensor* tensor(int tensor_index) {
+    if (tensor_index < 0 ||
+        static_cast<size_t>(tensor_index) >= context_->tensors_size) {
+      return nullptr;
+    }
+    return &context_->tensors[tensor_index];
+  }
+
+  // Get an immutable tensor data structure.
+  const TfLiteTensor* tensor(int tensor_index) const {
+    if (tensor_index < 0 ||
+        static_cast<size_t>(tensor_index) >= context_->tensors_size) {
+      return nullptr;
+    }
+    return &context_->tensors[tensor_index];
+  }
+
+  // Read only access to list of inputs.
+  std::vector<int>& inputs() { return inputs_; }
+
+  // Read only access to list of inputs.
+  const std::vector<int>& inputs() const { return inputs_; }
+
+  // Read only access to list of outputs.
+  std::vector<int>& outputs() { return outputs_; }
+
+  // Read only access to list of outputs.
+  const std::vector<int>& outputs() const { return outputs_; }
+
+  // Read only access to list of variable tensors.
+  std::vector<int>& variables() { return variables_; }
+
+  // Read only access to list of variable tensors.
+  const std::vector<int>& variables() const { return variables_; }
+
+  size_t tensors_size() const { return tensors_.size(); }
+
+  // Return the number of ops in the model.
+  size_t nodes_size() const { return nodes_and_registration_.size(); }
+
+  // Read only access to list of variable tensors.
+  std::vector<int>& execution_plan() { return execution_plan_; }
+
+  // Read only access to list of variable tensors.
+  const std::vector<int>& execution_plan() const { return execution_plan_; }
+
+  // Mutable form of tensors (TEMPORARY for refactor).
+  // TODO(b/119495520): remove when refactoring complete.
+  std::vector<TfLiteTensor>& tensors() { return tensors_; }
+  // Mutable form of tensors (TEMPORARY for refactor).
+  // TODO(b/119495520): remove when refactoring complete.
+  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>&
+  nodes_and_registration() {
+    return nodes_and_registration_;
+  }
+
+  const std::vector<std::pair<TfLiteNode, TfLiteRegistration>>&
+  nodes_and_registration() const {
+    return nodes_and_registration_;
+  }
+
+  // Get a pointer to an operation and registration data structure if in bounds.
+  const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
+      int node_index) const {
+    if (node_index < 0 || static_cast<size_t>(node_index) >= nodes_size())
+      return nullptr;
+    return &nodes_and_registration_[node_index];
+  }
+
+  // Change the dimensionality of a given tensor. Note, this is only acceptable
+  // for tensor indices that are inputs.
+  // Returns status of failure or success.
+  // TODO(aselle): Consider implementing ArraySlice equivalent to make this
+  //   more adept at accepting data without an extra copy. Use absl::ArraySlice
+  //   if our partners determine that dependency is acceptable.
+  TfLiteStatus ResizeInputTensor(int tensor_index,
+                                 const std::vector<int>& dims);
+
+  // Update allocations for all tensors. This will redim dependent tensors using
+  // the input tensor dimensionality as given. This is relatively expensive.
+  // If you know that your sizes are not changing, you need not call this.
+  // Returns status of success or failure.
+  TfLiteStatus AllocateTensors();
+
+  // Invoke the subgraph (run the whole graph in dependency order).
+  //
+  // NOTE: It is possible that the interpreter is not in a ready state
+  // to evaluate (i.e. if a ResizeTensor() has been performed without an
+  // AllocateTensors().
+  // Returns status of success or failure.
+  TfLiteStatus Invoke();
+
+  // Entry point for C node plugin API to report an error.
+  void ReportError(const char* format, ...);
+
+  void UseNNAPI(bool enable);
+
+  // Return the subgraph specific context.
+  TfLiteContext* context() { return context_; }
+
+  // Set the value of an external context.
+  void SetExternalContext(TfLiteExternalContextType type,
+                          TfLiteExternalContext* ctx);
+  // Get the half precision flag.
+  // WARNING: This is an experimental API and subject to change.
+  bool GetAllowFp16PrecisionForFp32() const {
+    return context_->allow_fp32_relax_to_fp16;
+  }
+
+  // Ensure the data in `tensor.data` is readable. In case delegate is used,
+  // it might require to copy the data from delegate buffer to raw memory.
+  // WARNING: This is an experimental API and subject to change.
+  // TODO(b/119495520): make this private when refactoring complete.
+  TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
+    TfLiteTensor* t = &tensors_[tensor_index];
+    TF_LITE_ENSURE(context_, t != nullptr);
+    if (t->data_is_stale) {
+      TF_LITE_ENSURE(context_, t->delegate != nullptr);
+      TF_LITE_ENSURE(context_, t->buffer_handle != kTfLiteNullBufferHandle);
+      TF_LITE_ENSURE(context_, t->delegate->CopyFromBufferHandle != nullptr);
+      // TODO(b/120420546): we must add a test that exercise this code.
+      TF_LITE_ENSURE_STATUS(t->delegate->CopyFromBufferHandle(
+          context_, t->delegate, t->buffer_handle, t));
+      t->data_is_stale = false;
+    }
+    return kTfLiteOk;
+  }
+
+  // The default capacity of `tensors_` vector.
+  static constexpr int kTensorsReservedCapacity = 128;
+  // The capacity headroom of `tensors_` vector before calling ops'
+  // `prepare` and `invoke` function. In these functions, it's guaranteed
+  // allocating up to `kTensorsCapacityHeadroom` more tensors won't invalidate
+  // pointers to existing tensors.
+  static constexpr int kTensorsCapacityHeadroom = 16;
+
+  // Reset all variable tensors to the default value.
+  // If a variable tensor doesn't have a buffer, reset it to zero.
+  // TODO(b/115961645): Implement - If a variable tensor has a buffer, reset it
+  // to the value of the buffer.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ResetVariableTensors();
+
+  void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; }
+
+  profiling::Profiler* GetProfiler() { return profiler_; }
+
+  // Returns a pointer to vector of subgraphs.
+  // WARNING: This is an experimental API and subject to change.
+  std::vector<std::unique_ptr<Subgraph>>* GetSubgraphs() { return subgraphs_; }
+
+  // True if all tensors in the graph has static size after calling
+  // `AllocateTensors` function.
+  // Before `AllocateTensors` is called, this will always return true;
+  bool HasDynamicTensors() { return has_dynamic_tensors_; }
+
+ private:
+  // Prevent 'context_' from accessing functions that are only available to
+  // delegated kernels.
+  void SwitchToKernelContext();
+
+  // Add delegate-only functions to 'context_'.
+  void SwitchToDelegateContext();
+
+  // Give 'op_reg' a chance to initialize itself using the contents of
+  // 'buffer'.
+  void* OpInit(const TfLiteRegistration& op_reg, const char* buffer,
+               size_t length) {
+    if (op_reg.init == nullptr) return nullptr;
+    return op_reg.init(context_, buffer, length);
+  }
+
+  // Let 'op_reg' release any memory it might have allocated via 'OpInit'.
+  void OpFree(const TfLiteRegistration& op_reg, void* buffer) {
+    if (op_reg.free == nullptr) return;
+    if (buffer) {
+      op_reg.free(context_, buffer);
+    }
+  }
+
+  // Prepare the given 'node' for execution.
+  TfLiteStatus OpPrepare(const TfLiteRegistration& op_reg, TfLiteNode* node) {
+    if (op_reg.prepare == nullptr) return kTfLiteOk;
+    return op_reg.prepare(context_, node);
+  }
+
+  // Invoke the operator represented by 'node'.
+  TfLiteStatus OpInvoke(const TfLiteRegistration& op_reg, TfLiteNode* node) {
+    if (op_reg.invoke == nullptr) return kTfLiteError;
+    return op_reg.invoke(context_, node);
+  }
+
+  // Call OpPrepare() for as many ops as possible, allocating memory for their
+  // tensors. If an op containing dynamic tensors is found, preparation will be
+  // postponed until this function is called again. This allows the interpreter
+  // to wait until Invoke() to resolve the sizes of dynamic tensors.
+  TfLiteStatus PrepareOpsAndTensors();
+
+  // Call OpPrepare() for all ops starting at 'first_node'. Stop when a
+  // dynamic tensors is found or all ops have been prepared. Fill
+  // 'last_node_prepared' with the id of the op containing dynamic tensors, or
+  // the last in the graph.
+  TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index,
+                                    int* last_execution_plan_index_prepared);
+
+  // Tensors needed by the interpreter. Use `AddTensors` to add more blank
+  // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
+  // `context_` whenever this std::vector is reallocated. Currently this
+  // only happens in `AddTensors()`.
+  std::vector<TfLiteTensor> tensors_;
+
+  // Check if an array of tensor indices are valid with respect to the Tensor
+  // array.
+  // NOTE: this changes consistent_ to be false if indices are out of bounds.
+  TfLiteStatus CheckTensorIndices(const char* label, const int* indices,
+                                  int length);
+
+  // Compute the number of bytes required to represent a tensor with dimensions
+  // specified by the array dims (of length dims_size). Returns the status code
+  // and bytes.
+  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size,
+                             size_t* bytes);
+
+  // Request an tensor be resized implementation. If the given tensor is of
+  // type kTfLiteDynamic it will also be allocated new memory.
+  TfLiteStatus ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size);
+
+  // Report a detailed error string (will be printed to stderr).
+  // TODO(aselle): allow user of class to provide alternative destinations.
+  void ReportErrorImpl(const char* format, va_list args);
+
+  // Entry point for C node plugin API to request an tensor be resized.
+  static TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor,
+                                   TfLiteIntArray* new_size);
+  // Entry point for C node plugin API to report an error.
+  static void ReportErrorC(TfLiteContext* context, const char* format, ...);
+
+  // Entry point for C node plugin API to add new tensors.
+  static TfLiteStatus AddTensors(TfLiteContext* context, int tensors_to_add,
+                                 int* first_new_tensor_index);
+
+  // WARNING: This is an experimental API and subject to change.
+  // Entry point for C API ReplaceNodeSubsetsWithDelegateKernels
+  static TfLiteStatus ReplaceNodeSubsetsWithDelegateKernels(
+      TfLiteContext* context, TfLiteRegistration registration,
+      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
+
+  // Update the execution graph to replace some of the nodes with stub
+  // nodes. Specifically any node index that has `nodes[index]==1` will be
+  // slated for replacement with a delegate kernel specified by registration.
+  // Ownership of 'nodes_to_replace' and 'delegate' remains with the caller.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus ReplaceNodeSubsetsWithDelegateKernels(
+      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
+      TfLiteDelegate* delegate);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets the internal pointer to a TensorFlow lite node by node_index.
+  TfLiteStatus GetNodeAndRegistration(int node_index, TfLiteNode** node,
+                                      TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get a node by index.
+  static TfLiteStatus GetNodeAndRegistration(struct TfLiteContext*,
+                                             int node_index, TfLiteNode** node,
+                                             TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets an TfLiteIntArray* representing the execution plan. The interpreter
+  // owns this memory and it is only guaranteed to exist during the invocation
+  // of the delegate prepare.
+  TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get the execution plan.
+  static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
+                                       TfLiteIntArray** execution_plan);
+
+  // Retrieve an existing external context by type.
+  TfLiteExternalContext* GetExternalContext(TfLiteExternalContextType type);
+  static TfLiteExternalContext* GetExternalContext(
+      struct TfLiteContext* context, TfLiteExternalContextType type);
+
+  // Set the value of an external context.
+  static void SetExternalContext(struct TfLiteContext* context,
+                                 TfLiteExternalContextType type,
+                                 TfLiteExternalContext* ctx);
+
+  // Allow a delegate to look at the graph and modify the graph to handle
+  // parts of the graph themselves. After this is called, the graph may
+  // contain new nodes that replace 1 more nodes.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
+
+  // Ensures that `tensors_` has at least `kTensorsCapacityHeadroom` extra
+  // capacity. Calling this function may invalidate existing pointers to
+  // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
+  // more tensors won't invalidate the pointer to existing tensors.
+  void EnsureTensorsVectorCapacity() {
+    const size_t required_capacity = tensors_.size() + kTensorsCapacityHeadroom;
+    if (required_capacity > tensors_.capacity()) {
+      tensors_.reserve(required_capacity);
+      context_->tensors = tensors_.data();
+    }
+  }
+
+  // The state of the Interpreter.
+  enum State {
+    // The interpreter isn't ready to be invoked.
+    // `AllocateTensor` need to be called to enter an invokable state.
+    kStateUninvokable = 0,
+    // The interpreter is ready to be invoked.
+    kStateInvokable,
+    // The interpreter is ready to be invoked, and graph can't be further
+    // modified. The interpreter will enter this state when calling
+    // `ModifyGraphWithDelegate` with `allow_dynamic_tensors=false`.
+    kStateInvokableAndImmutable,
+  };
+  State state_ = kStateUninvokable;
+
+  // A pure C data structure used to communicate with the pure C plugin
+  // interface. To avoid copying tensor metadata, this is also the definitive
+  // structure to store tensors.
+  // TODO(b/119495520): Get rid of owned and just make context_ a instance.
+  TfLiteContext owned_context_;
+  TfLiteContext* context_;
+
+  // Node inputs/outputs are stored in TfLiteNode and TfLiteRegistration stores
+  // function pointers to actual implementation.
+  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>
+      nodes_and_registration_;
+
+  // Whether the model is consistent. That is to say if the inputs and outputs
+  // of every node and the global inputs and outputs are valid indexes into
+  // the tensor array.
+  bool consistent_ = true;
+
+  // Array of indices representing the tensors that are inputs to the
+  // interpreter.
+  std::vector<int> inputs_;
+
+  // Array of indices representing the tensors that are outputs to the
+  // interpreter.
+  std::vector<int> outputs_;
+
+  // Array of indices representing the tensors that are variable tensors.
+  std::vector<int> variables_;
+
+  // The error reporter delegate that tflite will forward queries errors to.
+  ErrorReporter* error_reporter_;
+
+  // Index of the next node to prepare.
+  // During Invoke(), Interpreter will allocate input tensors first, which are
+  // known to be fixed size. Then it will allocate outputs from nodes as many
+  // as possible. When there is a node that produces dynamic sized tensor.
+  // Interpreter will stop allocating tensors, set the value of next allocate
+  // node id, and execute the node to generate the output tensor before continue
+  // to allocate successors. This process repeats until all nodes are executed.
+  // NOTE: this relies on the order of nodes that is in topological order.
+  int next_execution_plan_index_to_prepare_;
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // This is a list of node indices (to index into nodes_and_registration).
+  // This represents a valid topological sort (dependency ordered) execution
+  // plan. In particular, it is valid for this ordering to contain only a
+  // subset of the node indices.
+  std::vector<int> execution_plan_;
+
+  // In the future, we'd like a TfLiteIntArray compatible representation.
+  // TODO(aselle): replace execution_plan_ with this.
+  std::unique_ptr<TfLiteIntArray, TfLiteIntArrayDeleter> plan_cache_;
+
+  // Whether to delegate to NN API
+  std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
+
+  std::unique_ptr<MemoryPlanner> memory_planner_;
+
+  // Tracking bit for whether a tensor was resized in the course of an op
+  // invocation. This is a useful hint to ensure that dynamic tensor outputs
+  // trigger downstream reallocation after op invocation.
+  bool tensor_resized_since_op_invoke_ = false;
+
+  // External contexts (kTfLiteMaxExternalContexts).
+  TfLiteExternalContext** external_contexts_;
+
+  // Profiler for this interpreter instance.
+  profiling::Profiler* profiler_ = nullptr;
+
+  // A pointer to vector of subgraphs. The vector is owned by the interpreter.
+  std::vector<std::unique_ptr<Subgraph>>* subgraphs_ = nullptr;
+
+  // True if all tensors in the graph has static size after calling
+  // `PrepareOpsStartingAt` function (which is called by the `AllocateTensors`
+  // public function).
+  // The value is invalid before `PrepareOpStartingAt` is called.
+  bool has_dynamic_tensors_ = true;
+};
+
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_CORE_SUBGRAPH_H_
diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..75083bf95a126fe7a8d1ca92af2cfa0c5a85f371
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/BUILD
@@ -0,0 +1,241 @@
+#
+# This is a TF Lite delegate that is powered by TensorFlow's Eager.
+#
+package(default_visibility = [
+    "//visibility:private",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "buffer_map",
+    srcs = ["buffer_map.cc"],
+    hdrs = ["buffer_map.h"],
+    deps = [
+        ":util",
+        "//tensorflow/c:c_api_internal",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite:string",
+        "//tensorflow/lite:string_util",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:framework",
+            "//tensorflow/core:protos_all_cc",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "buffer_map_test",
+    size = "small",
+    srcs = ["buffer_map_test.cc"],
+    deps = [
+        ":buffer_map",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite:util",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Delegate implementation that pulls in the standard set of TensorFlow ops and
+# kernels.
+cc_library(
+    name = "delegate",
+    hdrs = [
+        "delegate.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":delegate_data",
+        ":delegate_only_runtime",
+        "//tensorflow/lite/c:c_api_internal",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:tensorflow",
+        ],
+    }),
+    alwayslink = 1,
+)
+
+# Delegate implementation that does *not* pull in the standard set of TensorFlow
+# ops and kernels.
+cc_library(
+    name = "delegate_only_runtime",
+    srcs = [
+        "delegate.cc",
+    ],
+    hdrs = [
+        "delegate.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":buffer_map",
+        ":delegate_data",
+        ":kernel",
+        ":util",
+        "@com_google_absl//absl/strings:strings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite:kernel_api",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite:util",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:lib",
+        ],
+    }),
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "delegate_test",
+    size = "small",
+    srcs = ["delegate_test.cc"],
+    tags = ["no_gpu"],  # GPU + flex is not officially supported.
+    deps = [
+        ":delegate",
+        ":test_util",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "delegate_data",
+    srcs = ["delegate_data.cc"],
+    hdrs = ["delegate_data.h"],
+    deps = [
+        ":buffer_map",
+        "@com_google_absl//absl/memory",
+        "//tensorflow/core/common_runtime/eager:context",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:lib",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "delegate_data_test",
+    size = "small",
+    srcs = ["delegate_data_test.cc"],
+    deps = [
+        ":delegate_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "kernel",
+    srcs = ["kernel.cc"],
+    hdrs = ["kernel.h"],
+    deps = [
+        ":delegate_data",
+        ":util",
+        "@flatbuffers",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite:kernel_api",
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/common_runtime/eager:execute",
+        "//tensorflow/core/common_runtime/eager:tensor_handle",
+    ] + select({
+        # TODO(b/111881878): The android_tensorflow_lib target pulls in the full
+        # set of core TensorFlow kernels. We may want to revisit this dependency
+        # to allow selective registration via build targets.
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:lib",
+            "//tensorflow/core:protos_all_cc",
+            "//tensorflow/core:framework",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "kernel_test",
+    size = "small",
+    srcs = ["kernel_test.cc"],
+    tags = ["no_gpu"],  # GPU + flex is not officially supported.
+    deps = [
+        ":delegate_data",
+        ":kernel",
+        ":test_util",
+        "@com_google_googletest//:gtest",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:tensorflow",
+        ],
+    }),
+)
+
+cc_library(
+    name = "test_util",
+    testonly = True,
+    srcs = ["test_util.cc"],
+    hdrs = ["test_util.h"],
+    deps = [
+        "//tensorflow/c:c_api_internal",
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "util",
+    srcs = ["util.cc"],
+    hdrs = ["util.h"],
+    deps = [
+        "//tensorflow/c:c_api_internal",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite:kernel_api",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:lib",
+            "//tensorflow/core:framework",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "util_test",
+    size = "small",
+    srcs = ["util_test.cc"],
+    deps = [
+        ":util",
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/delegates/flex/buffer_map.cc b/tensorflow/lite/delegates/flex/buffer_map.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d0c953636672e33130a991b1a302f410e42f381
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/buffer_map.cc
@@ -0,0 +1,177 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/delegates/flex/buffer_map.h"
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/lite/delegates/flex/util.h"
+#include "tensorflow/lite/string.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/log_memory.h"
+
+namespace tflite {
+namespace flex {
+namespace {
+// A tensor buffer that is allocated, deallocated and populated by TF Lite.
+class BaseTfLiteTensorBuffer : public tensorflow::TensorBuffer {
+  using tensorflow::TensorBuffer::TensorBuffer;
+
+  TensorBuffer* root_buffer() override { return this; }
+  void FillAllocationDescription(
+      tensorflow::AllocationDescription* proto) const override {
+    tensorflow::int64 rb = size();
+    proto->set_requested_bytes(rb);
+    proto->set_allocator_name(tensorflow::cpu_allocator()->Name());
+  }
+
+  // Prevents input forwarding from mutating this buffer.
+  bool OwnsMemory() const override { return false; }
+
+ protected:
+  void LogAllocation() {
+    if (tensorflow::LogMemory::IsEnabled() && data() != nullptr) {
+      tensorflow::LogMemory::RecordRawAllocation(
+          "TfLiteTensorBuffer_New",
+          tensorflow::LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID, size(),
+          data(), tensorflow::cpu_allocator());
+    }
+  }
+  void LogDeallocation() {
+    if (tensorflow::LogMemory::IsEnabled() && data() != nullptr) {
+      tensorflow::LogMemory::RecordRawDeallocation(
+          "TfLiteTensorBuffer_Delete",
+          tensorflow::LogMemory::EXTERNAL_TENSOR_ALLOCATION_STEP_ID, data(),
+          tensorflow::cpu_allocator(), false);
+    }
+  }
+};
+
+// A tensor buffer for most data types. Numeric types have exactly the same
+// representation in TFLITE and TF, so we just need use memcpy().
+class TfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
+ public:
+  explicit TfLiteTensorBuffer(const TfLiteTensor* tensor)
+      : BaseTfLiteTensorBuffer(tensorflow::cpu_allocator()->AllocateRaw(
+            EIGEN_MAX_ALIGN_BYTES, tensor->bytes)) {
+    // TODO(ahentz): if we can guarantee that TF Lite allocated tensors with
+    // the same alignment as TensorFlow (EIGEN_MAX_ALIGN_BYTES), then we can
+    // potentially eliminate the copy below.
+    len_ = tensor->bytes;
+
+    LogAllocation();
+
+    if (data()) {
+      std::memcpy(data(), tensor->data.raw, tensor->bytes);
+    }
+  }
+
+  ~TfLiteTensorBuffer() override {
+    LogDeallocation();
+    tensorflow::cpu_allocator()->DeallocateRaw(data());
+  }
+
+  size_t size() const override { return len_; }
+
+ private:
+  size_t len_;
+};
+
+// A string buffer. TFLITE string tensor format is different than
+// TF's so we need perform the conversion here.
+class StringTfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
+ public:
+  explicit StringTfLiteTensorBuffer(const TfLiteTensor* tensor)
+      : StringTfLiteTensorBuffer(tensor, tensor->data.raw != nullptr
+                                             ? GetStringCount(tensor->data.raw)
+                                             : 0) {}
+
+  ~StringTfLiteTensorBuffer() override {
+    LogDeallocation();
+    tensorflow::cpu_allocator()->Deallocate<string>(
+        static_cast<string*>(data()), num_strings_);
+  }
+
+  size_t size() const override { return num_strings_ * sizeof(string); }
+
+ private:
+  StringTfLiteTensorBuffer(const TfLiteTensor* tensor, int num_strings)
+      : BaseTfLiteTensorBuffer(
+            num_strings != 0
+                ? tensorflow::cpu_allocator()->Allocate<string>(num_strings)
+                : nullptr),
+        num_strings_(num_strings) {
+    LogAllocation();
+
+    if (data()) {
+      string* p = static_cast<string*>(data());
+      for (size_t i = 0; i < num_strings_; ++p, ++i) {
+        auto ref = GetString(tensor->data.raw, i);
+        p->assign(ref.str, ref.len);
+      }
+    }
+  }
+
+  int num_strings_;
+};
+
+}  // namespace
+
+BufferMap::BufferMap() {}
+
+BufferMap::~BufferMap() {}
+
+bool BufferMap::HasTensor(int tensor_index) const {
+  return id_to_tensor_.count(tensor_index) != 0;
+}
+
+bool BufferMap::IsTensorFlowTensor(int tensor_index) const {
+  return HasTensor(tensor_index) && owned_by_tf_.count(tensor_index) > 0;
+}
+
+tensorflow::Tensor BufferMap::GetTensor(int tensor_index) const {
+  return id_to_tensor_.at(tensor_index);
+}
+
+void BufferMap::SetFromTfLite(int tensor_index, const TfLiteTensor* tensor) {
+  tensorflow::TensorShape shape;
+  int num_dims = tensor->dims->size;
+  for (int i = 0; i < num_dims; ++i) {
+    shape.AddDim(tensor->dims->data[i]);
+  }
+  // TODO(ahentz): we assume this is a new tensor and allocate a new buffer
+  // for it. This is not always the best approach. For example, this might
+  // be a reallocation after resizing tensors. In that case it would be
+  // preferable to somehow reuse the buffer.
+  BaseTfLiteTensorBuffer* buf;
+  if (tensor->type == kTfLiteString) {
+    buf = new StringTfLiteTensorBuffer(tensor);
+  } else {
+    buf = new TfLiteTensorBuffer(tensor);
+  }
+  tensorflow::Tensor t = tensorflow::TensorCApi::MakeTensor(
+      GetTensorFlowDataType(tensor->type), shape, buf);
+  buf->Unref();
+
+  id_to_tensor_[tensor_index] = std::move(t);
+  owned_by_tf_.erase(tensor_index);
+}
+
+void BufferMap::SetFromTensorFlow(int tensor_index, tensorflow::Tensor tensor) {
+  id_to_tensor_[tensor_index] = std::move(tensor);
+  owned_by_tf_.insert(tensor_index);
+}
+
+}  // namespace flex
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/buffer_map.h b/tensorflow/lite/delegates/flex/buffer_map.h
new file mode 100644
index 0000000000000000000000000000000000000000..b73ed88d3789d5df8dadaee19d468596ccd4c782
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/buffer_map.h
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_DELEGATES_FLEX_BUFFER_MAP_H_
+#define TENSORFLOW_LITE_DELEGATES_FLEX_BUFFER_MAP_H_
+
+#include <map>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tflite {
+namespace flex {
+
+// Maps a TF Lite tensor index into a TensorFlow tensor.
+//
+// The TF Lite interpreter assigns integer indices to each of its tensors, but
+// the Flex delegate deals in terms of TensorFlow tensors. This class maps
+// from indices to tensors and allows the creation of new tensors to be
+// associated with a given index.
+class BufferMap {
+ public:
+  BufferMap();
+  ~BufferMap();
+
+  // Returns true if the given 'tensor_index' has a corresponding
+  // tensorflow::Tensor.
+  bool HasTensor(int tensor_index) const;
+
+  // Returns true if the given 'tensor_index' has a corresponding
+  // tensorflow::Tensor *and* the content is owned by TensorFlow (that is, the
+  // mapping was added by SetFromTensorFlow()).
+  bool IsTensorFlowTensor(int tensor_index) const;
+
+  // Returns the tensorflow::Tensor associated with the given 'tensor_index'.
+  // Precondition: HasTensor() is true.
+  tensorflow::Tensor GetTensor(int tensor_index) const;
+
+  // Associates the given tensorflow::Tensor with the given 'tensor_index'.
+  // Note that TensorFlow Tensors share data buffers, so this method is only a
+  // shallow copy.
+  void SetFromTensorFlow(int tensor_index, tensorflow::Tensor tensor);
+
+  // Same as above but creates a new tensorflow::Tensor with a copy of the
+  // given TfLiteTensor's data.
+  void SetFromTfLite(int tensor_index, const TfLiteTensor* tensor);
+
+ private:
+  // Mapping from TL Lite tensor ID to TensorFlow's Tensor. All tensors that
+  // are inputs or outputs of a subgraph will be added here, irrespective of
+  // whether their data are managed by TF Lite or TensorFlow.
+  std::map<int, tensorflow::Tensor> id_to_tensor_;
+  // A list of tensors that are completely managed by TensorFlow. Most of the
+  // time, TF Lite will populate tensors that are inputs to subgraphs, while
+  // TensorFlow will populate output tensors. Occasionally, however, an input
+  // tensor is coming from a previous subgraph and could have been populated by
+  // TensorFlow. This set keeps track of all input or output tensors that have
+  // been populated by tensorflow.
+  std::set<int> owned_by_tf_;
+};
+
+}  // namespace flex
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_FLEX_BUFFER_MAP_H_
diff --git a/tensorflow/lite/delegates/flex/buffer_map_test.cc b/tensorflow/lite/delegates/flex/buffer_map_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9e8472f1e7d2c3e0f5e73f3e5ce98bae7f15063f
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/buffer_map_test.cc
@@ -0,0 +1,233 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/delegates/flex/buffer_map.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/util.h"
+
+namespace tflite {
+namespace flex {
+namespace {
+
+using ::testing::ElementsAre;
+
+// A bit of RAII to simplify handling of TfLiteTensors in the tests.
+using UniqueTfLiteTensor =
+    std::unique_ptr<TfLiteTensor, std::function<void(TfLiteTensor*)>>;
+
+template <typename T>
+UniqueTfLiteTensor MakeLiteTensor(const std::vector<int>& shape,
+                                  const std::vector<T>& data) {
+  auto tensor = UniqueTfLiteTensor(new TfLiteTensor, [](TfLiteTensor* t) {
+    TfLiteTensorDataFree(t);
+    TfLiteIntArrayFree(t->dims);
+    delete t;
+  });
+  tensor->allocation_type = kTfLiteDynamic;
+  tensor->type = typeToTfLiteType<T>();
+  tensor->dims = ConvertVectorToTfLiteIntArray(shape);
+  tensor->data.raw = nullptr;
+  tensor->is_variable = false;
+  TfLiteTensorRealloc(data.size() * sizeof(T), tensor.get());
+  memcpy(tensor->data.raw, data.data(), data.size() * sizeof(T));
+  return tensor;
+}
+
+template <>
+UniqueTfLiteTensor MakeLiteTensor<string>(const std::vector<int>& shape,
+                                          const std::vector<string>& data) {
+  auto tensor = UniqueTfLiteTensor(new TfLiteTensor, [](TfLiteTensor* t) {
+    TfLiteTensorDataFree(t);
+    TfLiteIntArrayFree(t->dims);
+    delete t;
+  });
+  tensor->allocation_type = kTfLiteDynamic;
+  tensor->type = typeToTfLiteType<string>();
+  tensor->dims = ConvertVectorToTfLiteIntArray(shape);
+  tensor->data.raw = nullptr;
+  tensor->is_variable = false;
+  TfLiteTensorRealloc(data.size() * sizeof(string), tensor.get());
+
+  DynamicBuffer b;
+  for (const string& s : data) {
+    b.AddString(s.data(), s.size());
+  }
+  b.WriteToTensor(tensor.get(), ConvertVectorToTfLiteIntArray(shape));
+  return tensor;
+}
+
+template <typename T>
+tensorflow::Tensor MakeTensor(const std::vector<int>& shape,
+                              const std::vector<T>& data) {
+  BufferMap buffer_map;  // BufferMap is the easiest way to build the tensor.
+  UniqueTfLiteTensor t1 = MakeLiteTensor<T>(shape, data);
+  buffer_map.SetFromTfLite(0, t1.get());
+  return buffer_map.GetTensor(0);
+}
+
+std::vector<tensorflow::int64> GetTensorShape(const tensorflow::Tensor& t) {
+  std::vector<tensorflow::int64> shape(t.dims());
+  for (int i = 0; i < t.dims(); ++i) {
+    shape[i] = t.dim_size(i);
+  }
+  return shape;
+}
+
+template <typename T>
+std::vector<T> GetTensorData(const tensorflow::Tensor& t) {
+  const T* data = t.flat<T>().data();
+  return std::vector<T>(data, data + t.NumElements());
+}
+
+TEST(BufferMapTest, EmptyBuffer) {
+  BufferMap buffer_map;
+  EXPECT_FALSE(buffer_map.HasTensor(0));
+}
+
+TEST(BufferMapTest, SetFromTfLite) {
+  BufferMap buffer_map;
+
+  UniqueTfLiteTensor t =
+      MakeLiteTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
+  buffer_map.SetFromTfLite(0, t.get());
+  ASSERT_TRUE(buffer_map.HasTensor(0));
+
+  EXPECT_THAT(GetTensorData<float>(buffer_map.GetTensor(0)),
+              ElementsAre(0, 0, 0, 0.123f, 0, 0));
+
+  // Also check details of the tensor.
+  tensorflow::Tensor out_tensor = buffer_map.GetTensor(0);
+  ASSERT_EQ(out_tensor.dtype(), tensorflow::DT_FLOAT);
+  ASSERT_EQ(out_tensor.NumElements(), 6);
+  ASSERT_THAT(GetTensorShape(out_tensor), ElementsAre(1, 2, 1, 3));
+}
+
+TEST(BufferMapTest, SetFromTfLiteString) {
+  BufferMap buffer_map;
+
+  UniqueTfLiteTensor t =
+      MakeLiteTensor<string>({1, 2, 1, 3}, {"", "", "", "str1", "", ""});
+  buffer_map.SetFromTfLite(0, t.get());
+  ASSERT_TRUE(buffer_map.HasTensor(0));
+
+  EXPECT_THAT(GetTensorData<string>(buffer_map.GetTensor(0)),
+              ElementsAre("", "", "", "str1", "", ""));
+
+  // Also check details of the tensor.
+  tensorflow::Tensor out_tensor = buffer_map.GetTensor(0);
+  ASSERT_EQ(out_tensor.dtype(), tensorflow::DT_STRING);
+  ASSERT_EQ(out_tensor.NumElements(), 6);
+  ASSERT_THAT(GetTensorShape(out_tensor), ElementsAre(1, 2, 1, 3));
+}
+
+TEST(BufferMapTest, SetFromTfLiteTwice) {
+  UniqueTfLiteTensor t1 =
+      MakeLiteTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
+  UniqueTfLiteTensor t2 =
+      MakeLiteTensor<int>({1, 2, 4}, {0, 0, 0, 3, 0, 0, 1, 2});
+
+  BufferMap buffer_map;
+  buffer_map.SetFromTfLite(0, t1.get());
+  buffer_map.SetFromTfLite(0, t2.get());
+
+  EXPECT_THAT(GetTensorData<int>(buffer_map.GetTensor(0)),
+              ElementsAre(0, 0, 0, 3, 0, 0, 1, 2));
+}
+
+TEST(BufferMapTest, SetFromTfLiteStringTwice) {
+  UniqueTfLiteTensor t1 =
+      MakeLiteTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
+  UniqueTfLiteTensor t2 =
+      MakeLiteTensor<string>({1, 2, 4}, {"", "", "", "s3", "", "", "s1", "s2"});
+
+  BufferMap buffer_map;
+  buffer_map.SetFromTfLite(0, t1.get());
+  buffer_map.SetFromTfLite(0, t2.get());
+
+  EXPECT_THAT(GetTensorData<string>(buffer_map.GetTensor(0)),
+              ElementsAre("", "", "", "s3", "", "", "s1", "s2"));
+}
+
+TEST(BufferMapTest, SetFromTensorFlow) {
+  tensorflow::Tensor t1 =
+      MakeTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
+
+  BufferMap buffer_map;
+  buffer_map.SetFromTensorFlow(0, t1);
+
+  EXPECT_THAT(GetTensorData<float>(buffer_map.GetTensor(0)),
+              ElementsAre(0, 0, 0, 0.123f, 0, 0));
+
+  // Also check details of the tensor.
+  tensorflow::Tensor out_tensor = buffer_map.GetTensor(0);
+  ASSERT_EQ(out_tensor.dtype(), tensorflow::DT_FLOAT);
+  ASSERT_EQ(out_tensor.NumElements(), 6);
+  ASSERT_THAT(GetTensorShape(out_tensor), ElementsAre(1, 2, 1, 3));
+}
+
+TEST(BufferMapTest, SetFromTensorFlowTwice) {
+  tensorflow::Tensor t1 =
+      MakeTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
+  tensorflow::Tensor t2 = MakeTensor<int>({1, 2, 4}, {0, 0, 0, 3, 0, 0, 1, 2});
+  BufferMap buffer_map;
+  buffer_map.SetFromTensorFlow(0, t1);
+  buffer_map.SetFromTensorFlow(0, t2);
+
+  EXPECT_THAT(GetTensorData<int>(buffer_map.GetTensor(0)),
+              ElementsAre(0, 0, 0, 3, 0, 0, 1, 2));
+}
+
+TEST(BufferMapTest, TfLiteOverwritesTensorFlow) {
+  tensorflow::Tensor t1 =
+      MakeTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
+  UniqueTfLiteTensor t2 =
+      MakeLiteTensor<int>({1, 2, 4}, {0, 0, 0, 3, 0, 0, 1, 2});
+
+  BufferMap buffer_map;
+  buffer_map.SetFromTensorFlow(0, t1);
+  buffer_map.SetFromTfLite(0, t2.get());
+
+  EXPECT_FALSE(buffer_map.IsTensorFlowTensor(0));
+  EXPECT_THAT(GetTensorData<int>(buffer_map.GetTensor(0)),
+              ElementsAre(0, 0, 0, 3, 0, 0, 1, 2));
+}
+
+TEST(BufferMapTest, TensorFlowOverwritesTfLite) {
+  tensorflow::Tensor t1 =
+      MakeTensor<float>({1, 2, 1, 3}, {0, 0, 0, 0.123f, 0, 0});
+  UniqueTfLiteTensor t2 =
+      MakeLiteTensor<int>({1, 2, 4}, {0, 0, 0, 3, 0, 0, 1, 2});
+  BufferMap buffer_map;
+  buffer_map.SetFromTfLite(0, t2.get());
+  buffer_map.SetFromTensorFlow(0, t1);
+
+  EXPECT_TRUE(buffer_map.IsTensorFlowTensor(0));
+  EXPECT_THAT(GetTensorData<float>(buffer_map.GetTensor(0)),
+              ElementsAre(0, 0, 0, 0.123f, 0, 0));
+}
+
+}  // namespace
+}  // namespace flex
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca7314fbaee6644cf9385a1d7b0b2964d6a2762f
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/delegate.cc
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/delegates/flex/delegate.h"
+
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/delegates/flex/buffer_map.h"
+#include "tensorflow/lite/delegates/flex/kernel.h"
+#include "tensorflow/lite/delegates/flex/util.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/util.h"
+
+namespace tflite {
+namespace flex {
+namespace delegate {
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
+  // Get the nodes in the current execution plan. Interpreter owns this array.
+  TfLiteIntArray* plan;
+  TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
+
+  // Add all custom ops starting with "Flex" to list of supported nodes.
+  std::vector<int> supported_nodes;
+  for (int node_index : TfLiteIntArrayView(plan)) {
+    TfLiteNode* node;
+    TfLiteRegistration* registration;
+    TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
+        context, node_index, &node, &registration));
+
+    if (IsFlexOp(registration->custom_name)) {
+      supported_nodes.push_back(node_index);
+    }
+  }
+
+  // Request TFLite to partition the graph and make kernels for each independent
+  // node sub set.
+  TfLiteIntArray* size_and_nodes =
+      ConvertVectorToTfLiteIntArray(supported_nodes);
+  context->ReplaceNodeSubsetsWithDelegateKernels(context, GetKernel(),
+                                                 size_and_nodes, delegate);
+  TfLiteIntArrayFree(size_and_nodes);
+  return kTfLiteOk;
+}
+
+TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
+                                  TfLiteDelegate* delegate,
+                                  TfLiteBufferHandle buffer_handle,
+                                  TfLiteTensor* output) {
+  BufferMap* buffer_map =
+      reinterpret_cast<DelegateData*>(delegate->data_)->GetBufferMap(context);
+
+  if (!buffer_map->HasTensor(buffer_handle)) {
+    context->ReportError(context, "Invalid tensor index %d.", buffer_handle);
+    return kTfLiteError;
+  }
+
+  tensorflow::Tensor t = buffer_map->GetTensor(buffer_handle);
+
+  if (output->type == kTfLiteString) {
+    if (t.dtype() != tensorflow::DT_STRING) {
+      context->ReportError(context,
+                           "Inconsistent type for TF string tensor index %d.",
+                           buffer_handle);
+      return kTfLiteError;
+    }
+    DynamicBuffer dynamic_buffer;
+
+    auto tf_data = t.flat<string>();
+    for (int i = 0; i < t.NumElements(); ++i) {
+      dynamic_buffer.AddString(tf_data(i).data(), tf_data(i).size());
+    }
+
+    dynamic_buffer.WriteToTensor(output, /*new_shape=*/nullptr);
+    return kTfLiteOk;
+  }
+
+  tensorflow::StringPiece t_data = t.tensor_data();
+
+  if (output->bytes != t_data.size()) {
+    context->ReportError(context,
+                         absl::StrCat("The given ", output->bytes,
+                                      " bytes are not enough to store "
+                                      "TensorFlow's aligned buffer of size ",
+                                      t_data.size(), " bytes.")
+                             .c_str());
+    return kTfLiteError;
+  }
+
+  memcpy(output->data.raw, t_data.data(), t_data.size());
+  return kTfLiteOk;
+}
+
+}  // namespace delegate
+}  // namespace flex
+
+// Corresponding weak declaration found in lite/model.cc.
+std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>
+AcquireFlexDelegate() {
+  return std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>(
+      tflite::FlexDelegate::Create().release(), [](TfLiteDelegate* delegate) {
+        delete reinterpret_cast<tflite::FlexDelegate*>(delegate);
+      });
+}
+
+std::unique_ptr<FlexDelegate> FlexDelegate::Create() {
+  std::unique_ptr<flex::DelegateData> delegate_data;
+  if (!flex::DelegateData::Create(&delegate_data).ok()) {
+    fprintf(stderr, "Unable to initialize TensorFlow context.\n");
+    return nullptr;
+  }
+
+  return std::unique_ptr<FlexDelegate>(
+      new FlexDelegate(std::move(delegate_data)));
+}
+
+FlexDelegate::FlexDelegate(std::unique_ptr<flex::DelegateData> delegate_data)
+    : TfLiteDelegate(TfLiteDelegateCreate()),
+      delegate_data_(std::move(delegate_data)) {
+  data_ = delegate_data_.get();
+  Prepare = &flex::delegate::Prepare;
+  CopyFromBufferHandle = &flex::delegate::CopyFromBufferHandle;
+  flags = kTfLiteDelegateFlagsAllowDynamicTensors;
+}
+
+FlexDelegate::~FlexDelegate() {}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/flex/delegate.h b/tensorflow/lite/delegates/flex/delegate.h
similarity index 85%
rename from tensorflow/contrib/lite/delegates/flex/delegate.h
rename to tensorflow/lite/delegates/flex/delegate.h
index 1017780dc75de1cd334e0cca901bbe20ddf0bf41..018ff3e0b0e1fe7a842154581e2201b82412f885 100644
--- a/tensorflow/contrib/lite/delegates/flex/delegate.h
+++ b/tensorflow/lite/delegates/flex/delegate.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_H_
+#ifndef TENSORFLOW_LITE_DELEGATES_FLEX_DELEGATE_H_
+#define TENSORFLOW_LITE_DELEGATES_FLEX_DELEGATE_H_
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/delegates/flex/delegate_data.h"
 
 namespace tflite {
 
@@ -56,4 +56,4 @@ class FlexDelegate : public TfLiteDelegate {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_H_
+#endif  // TENSORFLOW_LITE_DELEGATES_FLEX_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/delegates/flex/delegate_data.cc b/tensorflow/lite/delegates/flex/delegate_data.cc
similarity index 86%
rename from tensorflow/contrib/lite/delegates/flex/delegate_data.cc
rename to tensorflow/lite/delegates/flex/delegate_data.cc
index 8f985f770cfba9fc6a7184cfdb0a35e9e6c754af..1483a530388d1dd48ff6179de4ddc2084ddb3d87 100644
--- a/tensorflow/contrib/lite/delegates/flex/delegate_data.cc
+++ b/tensorflow/lite/delegates/flex/delegate_data.cc
@@ -12,22 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
+#include "tensorflow/lite/delegates/flex/delegate_data.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tflite {
 namespace flex {
 tensorflow::Status DelegateData::Create(std::unique_ptr<DelegateData>* data) {
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
 
   TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
       tensorflow::SessionOptions(), "/job:localhost/replica:0/task:0",
       &devices));
 
-  std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
-      new tensorflow::DeviceMgr(devices));
+  std::unique_ptr<tensorflow::DeviceMgr> device_mgr =
+      absl::make_unique<tensorflow::DeviceMgr>(std::move(devices));
   // Note that Rendezvous is ref-counted so it will be automatically deleted.
   tensorflow::Rendezvous* rendezvous =
       new tensorflow::IntraProcessRendezvous(device_mgr.get());
diff --git a/tensorflow/contrib/lite/delegates/flex/delegate_data.h b/tensorflow/lite/delegates/flex/delegate_data.h
similarity index 87%
rename from tensorflow/contrib/lite/delegates/flex/delegate_data.h
rename to tensorflow/lite/delegates/flex/delegate_data.h
index 8d75f0b0efe758074d035f0ebcf0f5f12602323b..a88cc98d03cd40d33ab9f5eaf312086dc2b2a7cc 100644
--- a/tensorflow/contrib/lite/delegates/flex/delegate_data.h
+++ b/tensorflow/lite/delegates/flex/delegate_data.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
+#ifndef TENSORFLOW_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
+#define TENSORFLOW_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
 
-#include "tensorflow/contrib/lite/delegates/flex/buffer_map.h"
+#include "tensorflow/lite/delegates/flex/buffer_map.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 
 namespace tflite {
@@ -49,4 +49,4 @@ class DelegateData {
 }  // namespace flex
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
+#endif  // TENSORFLOW_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
diff --git a/tensorflow/contrib/lite/delegates/flex/delegate_data_test.cc b/tensorflow/lite/delegates/flex/delegate_data_test.cc
similarity index 90%
rename from tensorflow/contrib/lite/delegates/flex/delegate_data_test.cc
rename to tensorflow/lite/delegates/flex/delegate_data_test.cc
index 30b10f435a23785f88e2645714a414501bc2fab9..cd274e7cb1ccb51d9b5e7ece845f2120e7c5a79e 100644
--- a/tensorflow/contrib/lite/delegates/flex/delegate_data_test.cc
+++ b/tensorflow/lite/delegates/flex/delegate_data_test.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
+#include "tensorflow/lite/delegates/flex/delegate_data.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace flex {
diff --git a/tensorflow/contrib/lite/delegates/flex/delegate_test.cc b/tensorflow/lite/delegates/flex/delegate_test.cc
similarity index 91%
rename from tensorflow/contrib/lite/delegates/flex/delegate_test.cc
rename to tensorflow/lite/delegates/flex/delegate_test.cc
index 1813952cef99ef10b638ade7bcfcca486b2b3b76..ee37090d94eaadca2a767a0ea9a2ad105618da97 100644
--- a/tensorflow/contrib/lite/delegates/flex/delegate_test.cc
+++ b/tensorflow/lite/delegates/flex/delegate_test.cc
@@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/delegate.h"
+#include "tensorflow/lite/delegates/flex/delegate.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/delegates/flex/test_util.h"
+#include "tensorflow/lite/delegates/flex/test_util.h"
 
 namespace tflite {
 namespace flex {
 namespace {
 
-using ::testing::ContainsRegex;
 using ::testing::ElementsAre;
 
 class DelegateTest : public testing::FlexModelTest {
@@ -40,8 +39,7 @@ class DelegateTest : public testing::FlexModelTest {
   }
 
   void ConfigureDelegate() {
-    ASSERT_EQ(interpreter_->ModifyGraphWithDelegate(
-                  delegate_.get(), /*allow_dynamic_tensors=*/true),
+    ASSERT_EQ(interpreter_->ModifyGraphWithDelegate(delegate_.get()),
               kTfLiteOk);
   }
 
@@ -94,6 +92,25 @@ TEST_F(DelegateTest, NonFloatTypeInference) {
   ASSERT_EQ(GetType(2), kTfLiteInt32);
 }
 
+TEST_F(DelegateTest, StringInference) {
+  AddTensors(3, {0, 1}, {2}, kTfLiteString, {2});
+
+  AddTfOp(testing::kAdd, {0, 1}, {2});
+
+  ConfigureDelegate();
+
+  SetShape(0, {2, 2});
+  SetStringValues(0, {"1", "2", "3", "4"});
+  SetShape(1, {2, 2});
+  SetStringValues(1, {"4", "3", "2", "1"});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(2), ElementsAre(2, 2));
+  ASSERT_THAT(GetStringValues(2), ElementsAre("14", "23", "32", "41"));
+  ASSERT_EQ(GetType(2), kTfLiteString);
+}
+
 TEST_F(DelegateTest, MixedGraph) {
   AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
 
diff --git a/tensorflow/contrib/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc
similarity index 89%
rename from tensorflow/contrib/lite/delegates/flex/kernel.cc
rename to tensorflow/lite/delegates/flex/kernel.cc
index e4f1aea990da97da08a3e5adf2dd70307b20fe88..02da1d1a224ee87c34c2a019bff6430fd0e7d88a 100644
--- a/tensorflow/contrib/lite/delegates/flex/kernel.cc
+++ b/tensorflow/lite/delegates/flex/kernel.cc
@@ -12,21 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/flex/kernel.h"
+#include "tensorflow/lite/delegates/flex/kernel.h"
 
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/builtin_ops.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/context_util.h"
-#include "tensorflow/contrib/lite/delegates/flex/delegate_data.h"
-#include "tensorflow/contrib/lite/delegates/flex/util.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/string.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/execute.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/lite/builtin_ops.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/delegates/flex/delegate_data.h"
+#include "tensorflow/lite/delegates/flex/util.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/string.h"
 
 // Note: this is part of TF Lite's Flex delegation code which is to be
 // completed soon.
@@ -78,11 +79,18 @@ tensorflow::Status ExecuteFlexOp(tensorflow::EagerContext* eager_context,
                                  const std::vector<int>& inputs,
                                  const std::vector<int>& outputs) {
   const tensorflow::AttrTypeMap* attr_types;
+  bool is_function = false;
   TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types),
+      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types, &is_function),
       " (while processing attributes of '", op_name, "')");
-
-  tensorflow::EagerOperation op(eager_context, op_name.c_str(), attr_types);
+  if (is_function) {
+    return tensorflow::errors::NotFound(
+        "Operation '", op_name,
+        "' is not registered.  (while processing attributes of '", op_name,
+        "')");
+  }
+  tensorflow::EagerOperation op(eager_context, op_name.c_str(),
+                                /*is_function=*/false, attr_types);
   for (const auto& attr : nodedef.attr()) {
     op.MutableAttrs()->Set(attr.first, attr.second);
   }
@@ -251,7 +259,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   for (auto tensor_index : op_data->subgraph_inputs) {
     TfLiteTensor* tensor = &context->tensors[tensor_index];
     if (!IsConstantTensor(tensor)) {
-      buffer_map->SetFromTfLite(tensor_index, tensor);
+      // If this tensor is part of an earlier TF subgraph we should not add it
+      // to the BufferMap again, because TF already knows about it and its
+      // contents are kept automatically up-to-date.
+      if (!buffer_map->IsTensorFlowTensor(tensor_index)) {
+        buffer_map->SetFromTfLite(tensor_index, tensor);
+      }
     }
   }
 
diff --git a/tensorflow/contrib/lite/delegates/flex/kernel.h b/tensorflow/lite/delegates/flex/kernel.h
similarity index 83%
rename from tensorflow/contrib/lite/delegates/flex/kernel.h
rename to tensorflow/lite/delegates/flex/kernel.h
index ac9313a37bd5a3f5e23057512f07674c44801989..ca74c28570f6aa11e42162fa8dde1fb2ab411437 100644
--- a/tensorflow/contrib/lite/delegates/flex/kernel.h
+++ b/tensorflow/lite/delegates/flex/kernel.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_KERNEL_H_
-#define TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_KERNEL_H_
+#ifndef TENSORFLOW_LITE_DELEGATES_FLEX_KERNEL_H_
+#define TENSORFLOW_LITE_DELEGATES_FLEX_KERNEL_H_
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 namespace flex {
@@ -31,4 +31,4 @@ TfLiteRegistration GetKernel();
 }  // namespace flex
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_FLEX_KERNEL_H_
+#endif  // TENSORFLOW_LITE_DELEGATES_FLEX_KERNEL_H_
diff --git a/tensorflow/lite/delegates/flex/kernel_test.cc b/tensorflow/lite/delegates/flex/kernel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..efb7300b0bd9693f93fc4b7fb3078c384130cf65
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/kernel_test.cc
@@ -0,0 +1,281 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/delegates/flex/kernel.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/flex/delegate_data.h"
+#include "tensorflow/lite/delegates/flex/test_util.h"
+
+namespace tflite {
+namespace flex {
+namespace {
+
+using ::testing::ContainsRegex;
+using ::testing::ElementsAre;
+
+TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
+                            const std::vector<int>& supported_nodes) {
+  TfLiteIntArray* size_and_nodes =
+      ConvertVectorToTfLiteIntArray(supported_nodes);
+  TF_LITE_ENSURE_STATUS(context->ReplaceNodeSubsetsWithDelegateKernels(
+      context, flex::GetKernel(), size_and_nodes, delegate));
+  TfLiteIntArrayFree(size_and_nodes);
+  return kTfLiteOk;
+}
+
+class KernelTest : public testing::FlexModelTest {
+ public:
+  KernelTest() {
+    CHECK(DelegateData::Create(&delegate_data_).ok());
+    interpreter_.reset(new Interpreter(&error_reporter_));
+  }
+
+  ~KernelTest() override {
+    // The data needs to be released before the interpreter because the
+    // interpreter references the data.
+    delegate_data_.reset();
+    interpreter_.reset();
+  }
+
+  template <typename T>
+  void ConfigureDelegate(T prepare_function) {
+    delegate_.data_ = delegate_data_.get();
+    delegate_.flags = kTfLiteDelegateFlagsAllowDynamicTensors;
+    delegate_.FreeBufferHandle = nullptr;
+    delegate_.Prepare = prepare_function;
+    delegate_.CopyFromBufferHandle = [](TfLiteContext* context,
+                                        TfLiteDelegate* delegate,
+                                        TfLiteBufferHandle buffer_handle,
+                                        TfLiteTensor* output) {
+      auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
+      tensorflow::StringPiece values = delegate_data->GetBufferMap(context)
+                                           ->GetTensor(buffer_handle)
+                                           .tensor_data();
+      memcpy(output->data.raw, values.data(), values.size());
+      return kTfLiteOk;
+    };
+    CHECK(interpreter_->ModifyGraphWithDelegate(&delegate_) == kTfLiteOk);
+  }
+
+ private:
+  std::unique_ptr<DelegateData> delegate_data_;
+  TfLiteDelegate delegate_;
+};
+
+TEST_F(KernelTest, FullGraph) {
+  // Define the graph.
+  AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
+
+  AddTfOp(testing::kUnpack, {0}, {1, 2});
+  AddTfOp(testing::kUnpack, {3}, {4, 5});
+  AddTfOp(testing::kAdd, {1, 4}, {6});
+  AddTfOp(testing::kAdd, {2, 5}, {7});
+  AddTfOp(testing::kMul, {6, 7}, {8});
+
+  // Apply Delegate.
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0, 1, 2, 3, 4});
+  });
+
+  // Define inputs.
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+  SetShape(3, {2, 2, 1});
+  SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
+  ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+
+  // Try again with different inputs
+  SetShape(0, {2, 3, 1});
+  SetValues(0, {2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f});
+  SetShape(3, {2, 3, 1});
+  SetValues(3, {2.0f, 2.0f, 3.0f, 3.0f, 4.0f, 4.0f});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(8), ElementsAre(3, 1));
+  ASSERT_THAT(GetValues(8), ElementsAre(24.0f, 32.0f, 48.0f));
+}
+
+TEST_F(KernelTest, BadTensorFlowOp) {
+  AddTensors(2, {0}, {1}, kTfLiteFloat32, {3});
+  AddTfOp(testing::kNonExistent, {0}, {1});
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_FALSE(Invoke());
+  ASSERT_THAT(error_reporter().error_messages(),
+              ContainsRegex("while processing attributes of 'NonExistentOp'"));
+}
+
+TEST_F(KernelTest, BadNumberOfOutputs) {
+  AddTensors(3, {0}, {1, 2}, kTfLiteFloat32, {3});
+  AddTfOp(testing::kIdentity, {0}, {1, 2});
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_FALSE(Invoke());
+  ASSERT_THAT(error_reporter().error_messages(),
+              ContainsRegex("Unexpected number of outputs"));
+}
+
+TEST_F(KernelTest, IncompatibleNodeDef) {
+  AddTensors(2, {0}, {1}, kTfLiteFloat32, {3});
+
+  // Cast is a TF op, but we don't add the proper nodedef to it in AddTfOp.
+  AddTfOp(testing::kIncompatibleNodeDef, {0}, {1});
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_FALSE(Invoke());
+  ASSERT_THAT(error_reporter().error_messages(),
+              ContainsRegex("while executing 'Cast' via Eager"));
+}
+
+TEST_F(KernelTest, WrongSetOfNodes) {
+  AddTensors(4, {0}, {3}, kTfLiteFloat32, {3});
+  AddTfOp(testing::kUnpack, {0}, {1, 2});
+  AddTfLiteMulOp({1, 2}, {3});
+
+  // Specify that testing::kMul (#1) is supported when it actually isn't.
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0, 1});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_FALSE(Invoke());
+  ASSERT_THAT(error_reporter().error_messages(),
+              ContainsRegex("Invalid NodeDef in Flex op"));
+}
+
+TEST_F(KernelTest, MixedGraph) {
+  AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
+
+  AddTfOp(testing::kUnpack, {0}, {1, 2});
+  AddTfOp(testing::kUnpack, {3}, {4, 5});
+  AddTfOp(testing::kAdd, {1, 4}, {6});
+  AddTfOp(testing::kAdd, {2, 5}, {7});
+  AddTfLiteMulOp({6, 7}, {8});
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0, 1, 2, 3});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+  SetShape(3, {2, 2, 1});
+  SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
+  ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+}
+
+// We will build a complex graph where most of the ops are TF ops, but one
+// of them, right in the middle is handle natively by TF Lite. This results
+// in two flex subgraphs to handle the TF ops, and some of the tensors
+// connect those two subgraphs directly.
+TEST_F(KernelTest, SplitGraph) {
+  std::vector<float> a = {3.0f, 1.0f, 0.5f, -1.0f, 4.0f, -1.0f, -2.0f, 5.0f};
+  std::vector<float> b = {0.0f, 1.0f, 1.5f, 3.0f};
+
+  AddTensors(18, {0, 1}, {17}, kTfLiteFloat32, {3});
+
+  // Split the first input. Each branch below uses one half of it.
+  AddTfOp(testing::kUnpack, {0}, {2, 10});
+
+  // The left branch: l = (a0 + b0) * (a2 + b2) + (a1 + b1) * (a3 + b3) = 10
+  AddTfOp(testing::kAdd, {1, 2}, {3});     // => 3, 2, 2, 2
+  AddTfOp(testing::kUnpack, {3}, {4, 5});  // => 3, 2 --- 2, 2
+  AddTfLiteMulOp({4, 5}, {6});             // => 6, 4
+  AddTfOp(testing::kUnpack, {6}, {7, 8});  // => 6 -- 4
+  AddTfOp(testing::kAdd, {7, 8}, {9});     // => 10
+
+  // The right branch: r = (a4 + a6) + (a5 + a7) = 6
+  AddTfOp(testing::kUnpack, {10}, {11, 12});  // => 4, -1 --- -2, 5
+  AddTfOp(testing::kAdd, {11, 12}, {13});     // => 2, 4
+  AddTfOp(testing::kUnpack, {13}, {14, 15});  // => 2 --- 4
+  AddTfOp(testing::kAdd, {14, 15}, {16});     // => 6
+
+  // The two branches added together:
+  AddTfOp(testing::kAdd, {9, 16}, {17});  // => 16
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    // All ops by #3 are TF ops, handled by the delegate. However, because #4
+    // depends on the non-TF op, two subgraphs are necessary:
+    //    TF subgraph 1: 0, 1, 2, 6, 7, 8, 9
+    //    TF Lite Op: 3
+    //    TF subgraph 2: 4, 5, 10
+    return GenericPrepare(context, delegate, {0, 1, 2, 4, 5, 6, 7, 8, 9, 10});
+  });
+
+  SetShape(0, {2, 2, 2, 1});
+  SetValues(0, a);
+  SetShape(1, {2, 2, 1});
+  SetValues(1, b);
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(17), ElementsAre(1));
+  ASSERT_THAT(GetValues(17), ElementsAre(16.0f));
+
+  // Same as above but with slightly different output.
+  // We still expect the result to be l + r where
+  //     l = (a0 + b0) * (a2 + b2) + (a1 + b1) * (a3 + b3)
+  //     r = (a4 + a6) + (a5 + a7)
+  SetShape(0, {2, 2, 2, 1});
+  SetValues(0, {4.0f, 1.0f, 1.5f, -2.0f, 2.0f, 0.0f, -2.0f, 3.0f});
+  SetShape(1, {2, 2, 1});
+  SetValues(1, {0.0f, 2.0f, 1.5f, 3.0f});
+  // So l = (4 + 0) * (1.5 + 1.5) + (1 + 2) * (-2 + 3) =  12 + 3 = 15
+  //    r = (2 - 2) + (0 + 3) = 3
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(17), ElementsAre(1));
+  ASSERT_THAT(GetValues(17), ElementsAre(18.0f));
+}
+
+}  // namespace
+}  // namespace flex
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/delegates/flex/test_util.cc b/tensorflow/lite/delegates/flex/test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aa24675a7b1beab8632435debc8dd1fc04f347e7
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/test_util.cc
@@ -0,0 +1,190 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/flex/test_util.h"
+
+#include "absl/memory/memory.h"
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/string.h"
+
+namespace tflite {
+namespace flex {
+namespace testing {
+
+bool FlexModelTest::Invoke() { return interpreter_->Invoke() == kTfLiteOk; }
+
+void FlexModelTest::SetStringValues(int tensor_index,
+                                    const std::vector<string>& values) {
+  DynamicBuffer dynamic_buffer;
+  for (const string& s : values) {
+    dynamic_buffer.AddString(s.data(), s.size());
+  }
+  dynamic_buffer.WriteToTensor(interpreter_->tensor(tensor_index),
+                               /*new_shape=*/nullptr);
+}
+
+std::vector<string> FlexModelTest::GetStringValues(int tensor_index) const {
+  std::vector<string> result;
+
+  TfLiteTensor* tensor = interpreter_->tensor(tensor_index);
+  auto num_strings = GetStringCount(tensor->data.raw);
+  for (size_t i = 0; i < num_strings; ++i) {
+    auto ref = GetString(tensor->data.raw, i);
+    result.push_back(string(ref.str, ref.len));
+  }
+
+  return result;
+}
+
+void FlexModelTest::SetShape(int tensor_index, const std::vector<int>& values) {
+  ASSERT_EQ(interpreter_->ResizeInputTensor(tensor_index, values), kTfLiteOk);
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+}
+
+std::vector<int> FlexModelTest::GetShape(int tensor_index) {
+  std::vector<int> result;
+  auto* dims = interpreter_->tensor(tensor_index)->dims;
+  result.reserve(dims->size);
+  for (int i = 0; i < dims->size; ++i) {
+    result.push_back(dims->data[i]);
+  }
+  return result;
+}
+
+TfLiteType FlexModelTest::GetType(int tensor_index) {
+  return interpreter_->tensor(tensor_index)->type;
+}
+
+void FlexModelTest::AddTensors(int num_tensors, const std::vector<int>& inputs,
+                               const std::vector<int>& outputs, TfLiteType type,
+                               const std::vector<int>& dims) {
+  interpreter_->AddTensors(num_tensors);
+  for (int i = 0; i < num_tensors; ++i) {
+    TfLiteQuantizationParams quant;
+    // Suppress explicit output type specification to ensure type inference
+    // works properly.
+    if (std::find(outputs.begin(), outputs.end(), i) != outputs.end()) {
+      type = kTfLiteFloat32;
+    }
+    CHECK_EQ(interpreter_->SetTensorParametersReadWrite(i, type,
+                                                        /*name=*/"",
+                                                        /*dims=*/dims, quant),
+             kTfLiteOk);
+  }
+
+  CHECK_EQ(interpreter_->SetInputs(inputs), kTfLiteOk);
+  CHECK_EQ(interpreter_->SetOutputs(outputs), kTfLiteOk);
+}
+
+void FlexModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
+                                   const std::vector<int>& outputs) {
+  static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+  reg.builtin_code = BuiltinOperator_MUL;
+  reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+    auto* i0 = &context->tensors[node->inputs->data[0]];
+    auto* o = &context->tensors[node->outputs->data[0]];
+    return context->ResizeTensor(context, o, TfLiteIntArrayCopy(i0->dims));
+  };
+  reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+    auto* i0 = &context->tensors[node->inputs->data[0]];
+    auto* i1 = &context->tensors[node->inputs->data[1]];
+    auto* o = &context->tensors[node->outputs->data[0]];
+    for (int i = 0; i < o->bytes / sizeof(float); ++i) {
+      o->data.f[i] = i0->data.f[i] * i1->data.f[i];
+    }
+    return kTfLiteOk;
+  };
+
+  CHECK_EQ(interpreter_->AddNodeWithParameters(inputs, outputs, nullptr, 0,
+                                               nullptr, &reg),
+           kTfLiteOk);
+}
+
+void FlexModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
+                            const std::vector<int>& outputs) {
+  auto attr = [](const string& key, const string& value) {
+    return " attr{ key: '" + key + "' value {" + value + "}}";
+  };
+
+  string type_attribute;
+  switch (interpreter_->tensor(inputs[0])->type) {
+    case kTfLiteInt32:
+      type_attribute = attr("T", "type: DT_INT32");
+      break;
+    case kTfLiteFloat32:
+      type_attribute = attr("T", "type: DT_FLOAT");
+      break;
+    case kTfLiteString:
+      type_attribute = attr("T", "type: DT_STRING");
+      break;
+    default:
+      // TODO(b/113613439): Use nodedef string utilities to properly handle all
+      // types.
+      LOG(FATAL) << "Type not supported";
+      break;
+  }
+
+  if (op == kUnpack) {
+    string attributes =
+        type_attribute + attr("num", "i: 2") + attr("axis", "i: 0");
+    AddTfOp("FlexUnpack", "Unpack", attributes, inputs, outputs);
+  } else if (op == kIdentity) {
+    string attributes = type_attribute;
+    AddTfOp("FlexIdentity", "Identity", attributes, inputs, outputs);
+  } else if (op == kAdd) {
+    string attributes = type_attribute;
+    AddTfOp("FlexAdd", "Add", attributes, inputs, outputs);
+  } else if (op == kMul) {
+    string attributes = type_attribute;
+    AddTfOp("FlexMul", "Mul", attributes, inputs, outputs);
+  } else if (op == kNonExistent) {
+    AddTfOp("NonExistentOp", "NonExistentOp", "", inputs, outputs);
+  } else if (op == kIncompatibleNodeDef) {
+    // "Cast" op is created without attributes - making it incompatible.
+    AddTfOp("FlexCast", "Cast", "", inputs, outputs);
+  }
+}
+
+void FlexModelTest::AddTfOp(const char* tflite_name, const string& tf_name,
+                            const string& nodedef_str,
+                            const std::vector<int>& inputs,
+                            const std::vector<int>& outputs) {
+  static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+  reg.builtin_code = BuiltinOperator_CUSTOM;
+  reg.custom_name = tflite_name;
+
+  tensorflow::NodeDef nodedef;
+  CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
+      nodedef_str + " op: '" + tf_name + "'", &nodedef));
+  string serialized_nodedef;
+  CHECK(nodedef.SerializeToString(&serialized_nodedef));
+  flexbuffers::Builder fbb;
+  fbb.Vector([&]() {
+    fbb.String(nodedef.op());
+    fbb.String(serialized_nodedef);
+  });
+  fbb.Finish();
+
+  flexbuffers_.push_back(fbb.GetBuffer());
+  auto& buffer = flexbuffers_.back();
+  CHECK_EQ(interpreter_->AddNodeWithParameters(
+               inputs, outputs, reinterpret_cast<const char*>(buffer.data()),
+               buffer.size(), nullptr, &reg),
+           kTfLiteOk);
+}
+
+}  // namespace testing
+}  // namespace flex
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/test_util.h b/tensorflow/lite/delegates/flex/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cc2dc30e92586535687187105057d41ab5c0350
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/test_util.h
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_FLEX_TEST_UTIL_H_
+#define TENSORFLOW_LITE_DELEGATES_FLEX_TEST_UTIL_H_
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/test_util.h"
+
+namespace tflite {
+namespace flex {
+namespace testing {
+
+enum TfOpType {
+  kUnpack,
+  kIdentity,
+  kAdd,
+  kMul,
+  // Represents an op that does not exist in TensorFlow.
+  kNonExistent,
+  // Represents an valid TensorFlow op where the NodeDef is incompatible.
+  kIncompatibleNodeDef,
+};
+
+// This class creates models with TF and TFLite ops. In order to use this class
+// to test the Flex delegate, implement a function that calls
+// interpreter->ModifyGraphWithDelegate.
+class FlexModelTest : public ::testing::Test {
+ public:
+  FlexModelTest() {}
+  ~FlexModelTest() {}
+
+  bool Invoke();
+
+  // Sets the (typed) tensor's values at the given index.
+  template <typename T>
+  void SetTypedValues(int tensor_index, const std::vector<T>& values) {
+    memcpy(interpreter_->typed_tensor<T>(tensor_index), values.data(),
+           values.size() * sizeof(T));
+  }
+
+  // Returns the (typed) tensor's values at the given index.
+  template <typename T>
+  std::vector<T> GetTypedValues(int tensor_index) {
+    const TfLiteTensor* t = interpreter_->tensor(tensor_index);
+    const T* tdata = interpreter_->typed_tensor<T>(tensor_index);
+    return std::vector<T>(tdata, tdata + t->bytes / sizeof(T));
+  }
+
+  // Sets the tensor's values at the given index.
+  void SetValues(int tensor_index, const std::vector<float>& values) {
+    SetTypedValues<float>(tensor_index, values);
+  }
+  void SetStringValues(int tensor_index, const std::vector<string>& values);
+
+  // Returns the tensor's values at the given index.
+  std::vector<float> GetValues(int tensor_index) {
+    return GetTypedValues<float>(tensor_index);
+  }
+  std::vector<string> GetStringValues(int tensor_index) const;
+
+  // Sets the tensor's shape at the given index.
+  void SetShape(int tensor_index, const std::vector<int>& values);
+
+  // Returns the tensor's shape at the given index.
+  std::vector<int> GetShape(int tensor_index);
+
+  // Returns the tensor's type at the given index.
+  TfLiteType GetType(int tensor_index);
+
+  const TestErrorReporter& error_reporter() const { return error_reporter_; }
+
+  // Adds `num_tensor` tensors to the model. `inputs` contains the indices of
+  // the input tensors and `outputs` contains the indices of the output
+  // tensors. All tensors are set to have `type` and `dims`.
+  void AddTensors(int num_tensors, const std::vector<int>& inputs,
+                  const std::vector<int>& outputs, TfLiteType type,
+                  const std::vector<int>& dims);
+
+  // Adds a TFLite Mul op. `inputs` contains the indices of the input tensors
+  // and `outputs` contains the indices of the output tensors.
+  void AddTfLiteMulOp(const std::vector<int>& inputs,
+                      const std::vector<int>& outputs);
+
+  // Adds a TensorFlow op. `inputs` contains the indices of the
+  // input tensors and `outputs` contains the indices of the output tensors.
+  // This function is limited to the set of ops defined in TfOpType.
+  void AddTfOp(TfOpType op, const std::vector<int>& inputs,
+               const std::vector<int>& outputs);
+
+ protected:
+  std::unique_ptr<Interpreter> interpreter_;
+  TestErrorReporter error_reporter_;
+
+ private:
+  // Helper method to add a TensorFlow op. tflite_names needs to start with
+  // "Flex" in order to work with the Flex delegate.
+  void AddTfOp(const char* tflite_name, const string& tf_name,
+               const string& nodedef_str, const std::vector<int>& inputs,
+               const std::vector<int>& outputs);
+
+  std::vector<std::vector<uint8_t>> flexbuffers_;
+};
+
+}  // namespace testing
+}  // namespace flex
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_FLEX_TEST_UTIL_H_
diff --git a/tensorflow/lite/delegates/flex/util.cc b/tensorflow/lite/delegates/flex/util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c995b360f9d5ecfaced217a372af38690aee74f6
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/util.cc
@@ -0,0 +1,108 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/delegates/flex/util.h"
+
+namespace tflite {
+namespace flex {
+
+TfLiteStatus ConvertStatus(TfLiteContext* context,
+                           const tensorflow::Status& status) {
+  if (!status.ok()) {
+    context->ReportError(context, "%s", status.error_message().c_str());
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus CopyShapeAndType(TfLiteContext* context,
+                              const tensorflow::Tensor& src,
+                              TfLiteTensor* tensor) {
+  tensor->type = GetTensorFlowLiteType(static_cast<TF_DataType>(src.dtype()));
+  if (tensor->type == kTfLiteNoType) {
+    context->ReportError(context,
+                         "TF Lite does not support TensorFlow data type: %s",
+                         DataTypeString(src.dtype()).c_str());
+    return kTfLiteError;
+  }
+
+  int num_dims = src.dims();
+  TfLiteIntArray* shape = TfLiteIntArrayCreate(num_dims);
+  for (int j = 0; j < num_dims; ++j) {
+    // We need to cast from TensorFlow's int64 to TF Lite's int32. Let's
+    // make sure there's no overflow.
+    if (src.dim_size(j) >= std::numeric_limits<int>::max()) {
+      context->ReportError(context,
+                           "Dimension value in TensorFlow shape is larger than "
+                           "supported by TF Lite");
+      TfLiteIntArrayFree(shape);
+      return kTfLiteError;
+    }
+    shape->data[j] = static_cast<int>(src.dim_size(j));
+  }
+  return context->ResizeTensor(context, tensor, shape);
+}
+
+TF_DataType GetTensorFlowDataType(TfLiteType type) {
+  switch (type) {
+    case kTfLiteNoType:
+      return TF_FLOAT;
+    case kTfLiteFloat32:
+      return TF_FLOAT;
+    case kTfLiteInt16:
+      return TF_INT16;
+    case kTfLiteInt32:
+      return TF_INT32;
+    case kTfLiteUInt8:
+      return TF_UINT8;
+    case kTfLiteInt8:
+      return TF_INT8;
+    case kTfLiteInt64:
+      return TF_INT64;
+    case kTfLiteComplex64:
+      return TF_COMPLEX64;
+    case kTfLiteString:
+      return TF_STRING;
+    case kTfLiteBool:
+      return TF_BOOL;
+  }
+}
+
+TfLiteType GetTensorFlowLiteType(TF_DataType type) {
+  switch (type) {
+    case TF_FLOAT:
+      return kTfLiteFloat32;
+    case TF_INT16:
+      return kTfLiteInt16;
+    case TF_INT32:
+      return kTfLiteInt32;
+    case TF_UINT8:
+      return kTfLiteUInt8;
+    case TF_INT8:
+      return kTfLiteInt8;
+    case TF_INT64:
+      return kTfLiteInt64;
+    case TF_COMPLEX64:
+      return kTfLiteComplex64;
+    case TF_STRING:
+      return kTfLiteString;
+    case TF_BOOL:
+      return kTfLiteBool;
+    default:
+      return kTfLiteNoType;
+  }
+}
+
+}  // namespace flex
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/util.h b/tensorflow/lite/delegates/flex/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..8aaa73d1b3e3708ec819bb6e9b97c0a5951097da
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/util.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_DELEGATES_FLEX_UTIL_H_
+#define TENSORFLOW_LITE_DELEGATES_FLEX_UTIL_H_
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tflite {
+namespace flex {
+
+// Converts a tensorflow:Status into a TfLiteStatus. If the original status
+// represented an error, reports it using the given 'context'.
+TfLiteStatus ConvertStatus(TfLiteContext* context,
+                           const tensorflow::Status& status);
+
+// Copies the given shape and type of the TensorFlow 'src' tensor into a TF Lite
+// 'tensor'. Logs an error and returns kTfLiteError if the shape or type can't
+// be converted.
+TfLiteStatus CopyShapeAndType(TfLiteContext* context,
+                              const tensorflow::Tensor& src,
+                              TfLiteTensor* tensor);
+
+// Returns the TF C API Data type that corresponds to the given TfLiteType.
+TF_DataType GetTensorFlowDataType(TfLiteType type);
+
+// Returns the TfLiteType that corresponds to the given TF C API Data type.
+TfLiteType GetTensorFlowLiteType(TF_DataType);
+
+}  // namespace flex
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_FLEX_UTIL_H_
diff --git a/tensorflow/lite/delegates/flex/util_test.cc b/tensorflow/lite/delegates/flex/util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..87104751b81b6af71ac6d7b2a0a22615c1005255
--- /dev/null
+++ b/tensorflow/lite/delegates/flex/util_test.cc
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/delegates/flex/util.h"
+
+#include <cstdarg>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/string.h"
+#include "tensorflow/lite/testing/util.h"
+
+namespace tflite {
+namespace flex {
+namespace {
+
+using tensorflow::DT_FLOAT;
+using tensorflow::DT_INT32;
+using tensorflow::Tensor;
+using ::testing::ElementsAre;
+
+struct TestContext : public TfLiteContext {
+  string error;
+  std::vector<int> new_size;
+};
+
+void ReportError(TfLiteContext* context, const char* format, ...) {
+  TestContext* c = static_cast<TestContext*>(context);
+  const size_t kBufferSize = 1024;
+  char temp_buffer[kBufferSize];
+
+  va_list args;
+  va_start(args, format);
+  vsnprintf(temp_buffer, kBufferSize, format, args);
+  va_end(args);
+
+  c->error = temp_buffer;
+}
+
+TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor,
+                          TfLiteIntArray* new_size) {
+  TestContext* c = static_cast<TestContext*>(context);
+  c->new_size.clear();
+  for (int i = 0; i < new_size->size; ++i) {
+    c->new_size.push_back(new_size->data[i]);
+  }
+  TfLiteIntArrayFree(new_size);
+  return kTfLiteOk;
+}
+
+TEST(UtilTest, ConvertStatus) {
+  TestContext context;
+  context.ReportError = ReportError;
+
+  EXPECT_EQ(ConvertStatus(&context, tensorflow::errors::Internal("Some Error")),
+            kTfLiteError);
+  EXPECT_EQ(context.error, "Some Error");
+
+  context.error.clear();
+  EXPECT_EQ(ConvertStatus(&context, tensorflow::Status()), kTfLiteOk);
+  EXPECT_TRUE(context.error.empty());
+}
+
+TEST(UtilTest, CopyShapeAndType) {
+  TestContext context;
+  context.ReportError = ReportError;
+  context.ResizeTensor = ResizeTensor;
+
+  TfLiteTensor dst;
+
+  EXPECT_EQ(CopyShapeAndType(&context, Tensor(), &dst), kTfLiteOk);
+  EXPECT_THAT(context.new_size, ElementsAre(0));
+  EXPECT_EQ(dst.type, kTfLiteFloat32);
+
+  EXPECT_EQ(CopyShapeAndType(&context, Tensor(DT_FLOAT, {1, 2}), &dst),
+            kTfLiteOk);
+  EXPECT_THAT(context.new_size, ElementsAre(1, 2));
+  EXPECT_EQ(dst.type, kTfLiteFloat32);
+
+  EXPECT_EQ(CopyShapeAndType(&context, Tensor(DT_INT32, {1, 2}), &dst),
+            kTfLiteOk);
+  EXPECT_THAT(context.new_size, ElementsAre(1, 2));
+  EXPECT_EQ(dst.type, kTfLiteInt32);
+
+  EXPECT_EQ(CopyShapeAndType(&context, Tensor(DT_FLOAT, {1LL << 44, 2}), &dst),
+            kTfLiteError);
+  EXPECT_EQ(context.error,
+            "Dimension value in TensorFlow shape is larger than supported by "
+            "TF Lite");
+
+  EXPECT_EQ(
+      CopyShapeAndType(&context, Tensor(tensorflow::DT_HALF, {1, 2}), &dst),
+      kTfLiteError);
+  EXPECT_EQ(context.error,
+            "TF Lite does not support TensorFlow data type: half");
+}
+
+TEST(UtilTest, TypeConversionsFromTFLite) {
+  EXPECT_EQ(TF_FLOAT, GetTensorFlowDataType(kTfLiteNoType));
+  EXPECT_EQ(TF_FLOAT, GetTensorFlowDataType(kTfLiteFloat32));
+  EXPECT_EQ(TF_INT16, GetTensorFlowDataType(kTfLiteInt16));
+  EXPECT_EQ(TF_INT32, GetTensorFlowDataType(kTfLiteInt32));
+  EXPECT_EQ(TF_UINT8, GetTensorFlowDataType(kTfLiteUInt8));
+  EXPECT_EQ(TF_INT64, GetTensorFlowDataType(kTfLiteInt64));
+  EXPECT_EQ(TF_COMPLEX64, GetTensorFlowDataType(kTfLiteComplex64));
+  EXPECT_EQ(TF_STRING, GetTensorFlowDataType(kTfLiteString));
+  EXPECT_EQ(TF_BOOL, GetTensorFlowDataType(kTfLiteBool));
+}
+
+TEST(UtilTest, TypeConversionsFromTensorFlow) {
+  EXPECT_EQ(kTfLiteFloat32, GetTensorFlowLiteType(TF_FLOAT));
+  EXPECT_EQ(kTfLiteInt16, GetTensorFlowLiteType(TF_INT16));
+  EXPECT_EQ(kTfLiteInt32, GetTensorFlowLiteType(TF_INT32));
+  EXPECT_EQ(kTfLiteUInt8, GetTensorFlowLiteType(TF_UINT8));
+  EXPECT_EQ(kTfLiteInt64, GetTensorFlowLiteType(TF_INT64));
+  EXPECT_EQ(kTfLiteComplex64, GetTensorFlowLiteType(TF_COMPLEX64));
+  EXPECT_EQ(kTfLiteString, GetTensorFlowLiteType(TF_STRING));
+  EXPECT_EQ(kTfLiteBool, GetTensorFlowLiteType(TF_BOOL));
+  EXPECT_EQ(kTfLiteNoType, GetTensorFlowLiteType(TF_RESOURCE));
+  EXPECT_EQ(kTfLiteNoType, GetTensorFlowLiteType(TF_VARIANT));
+}
+
+}  // namespace
+}  // namespace flex
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..fd954ba222627ab0457711b87baf9c3f7573e129
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -0,0 +1,34 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "nnapi_delegate",
+    srcs = ["nnapi_delegate.cc"],
+    hdrs = ["nnapi_delegate.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:kernel_api",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/nnapi:nnapi_lib",
+    ],
+)
+
+tf_cc_test(
+    name = "nnapi_delegate_test",
+    size = "small",
+    srcs = ["nnapi_delegate_test.cc"],
+    tags = ["no_oss"],
+    deps = [
+        ":nnapi_delegate",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4fe07004a82ff30228d866bcc7a90067e5940aca
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -0,0 +1,1221 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/lite/allocation.h"
+#include "tensorflow/lite/builtin_op_data.h"
+#include "tensorflow/lite/builtin_ops.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+
+#ifdef __ANDROID__
+#include <sys/mman.h>
+#include <sys/system_properties.h>
+#include <unistd.h>
+#endif
+
+namespace tflite {
+namespace {
+
+// TODO(b/80621585): Consider printing error string, but don't for now to
+// minimize binary size.
+#define CHECK_NN(context, code)                                           \
+  if (code != ANEURALNETWORKS_NO_ERROR) {                                 \
+    context->ReportError(context, "NN API returned error (%d).\n", code); \
+    return kTfLiteError;                                                  \
+  }
+
+namespace {
+int32_t GetAndroidSdkVersion() {
+#ifdef __ANDROID__
+  const char* sdkProp = "ro.build.version.sdk";
+  char sdkVersion[PROP_VALUE_MAX];
+  int length = __system_property_get(sdkProp, sdkVersion);
+  if (length != 0) {
+    for (int i = 0; i < length; ++i) {
+      int digit = sdkVersion[i] - '0';
+      if (digit < 0 || digit > 9) {
+        // Non-numeric SDK version, assume it's higher then expected;
+        return std::numeric_limits<int32_t>::max();
+      }
+    }
+    return atoi(sdkVersion);
+  }
+#endif  // __ANDROID__
+  return 0;
+}
+
+constexpr int32_t kMinSdkVersionForNNAPI = 27;
+constexpr int32_t kMinSdkVersionForNNAPI11 = 28;
+static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion();
+
+}  // namespace
+
+// RAII NN API Model Destructor for use with std::unique_ptr
+struct NNFreeModel {
+  void operator()(ANeuralNetworksModel* model) {
+    ANeuralNetworksModel_free(model);
+  }
+};
+// RAII NN API Compilation Destructor for use with std::unique_ptr
+struct NNFreeCompilation {
+  void operator()(ANeuralNetworksCompilation* model) {
+    ANeuralNetworksCompilation_free(model);
+  }
+};
+
+// Manage NNAPI shared memory handle
+class NNMemory {
+ public:
+  NNMemory(const char* name, size_t size) {
+#ifdef __ANDROID__
+    byte_size_ = size;
+    fd_ = ASharedMemory_create(name, size);
+    data_ptr_ = reinterpret_cast<uint8_t*>(
+        mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0));
+    ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE, fd_, 0,
+                                       &nn_memory_handle_);
+#endif
+  }
+
+  ~NNMemory() {
+#ifdef __ANDROID__
+    if (data_ptr_) {
+      munmap(data_ptr_, byte_size_);
+    }
+    if (nn_memory_handle_) {
+      ANeuralNetworksMemory_free(nn_memory_handle_);
+    }
+    if (fd_ > 0) close(fd_);
+#endif
+  }
+
+  ANeuralNetworksMemory* get_handle() { return nn_memory_handle_; }
+  uint8_t* get_data_ptr() { return data_ptr_; }
+
+ private:
+#ifdef __ANDROID__
+  int fd_ = 0;
+  size_t byte_size_ = 0;
+#endif
+  uint8_t* data_ptr_ = nullptr;
+  ANeuralNetworksMemory* nn_memory_handle_ = nullptr;
+};  // namespace
+
+// Track tensor indices to NN API tensor indices mapping.
+class OperandMapping {
+ public:
+  // Given a TFLite index return the ANN index. If it doesn't exist
+  // return -1.
+  int lite_index_to_ann(int index) const {
+    if (index < lite_tensor_to_ann_tensor_.size())
+      return lite_tensor_to_ann_tensor_[index];
+    else
+      return -1;
+  }
+
+  // NN API uses non tensor operands instead of structs. This creates one
+  // and returns the index. It uses a std::vector and resizes it as needed
+  // keeping -1 to unmapped values. Intermediate tensors likely will not
+  // be mapped.
+  int add_new_non_tensor_operand() { return next_ann_tensor_index_++; }
+
+  // Add a new mapping from `tflite_index` and return the NN API tensor index.
+  int add_new_ann_tensor_index(int tflite_index) {
+    if (tflite_index >= lite_tensor_to_ann_tensor_.size()) {
+      lite_tensor_to_ann_tensor_.resize(tflite_index + 1, -1);
+    }
+    int new_tensor_index = next_ann_tensor_index_++;
+    lite_tensor_to_ann_tensor_[tflite_index] = new_tensor_index;
+    return new_tensor_index;
+  }
+
+ private:
+  // Next index of ann tensor
+  int next_ann_tensor_index_ = 0;
+
+  // Mapping from lite index. Use a std::vector for speed and code size
+  // rather than a map.
+  std::vector<int> lite_tensor_to_ann_tensor_;
+};
+
+// Abstract builder for building an op in the NN API graph. This handles
+// the disparity between TFLite and NN API operand types. NN API has singular
+// operands for both tensors and parameters, and TFLite separates the two.
+class NNAPIOpBuilder {
+ public:
+  NNAPIOpBuilder(TfLiteContext* context, OperandMapping* tensor_mapping,
+                 ANeuralNetworksModel* nn_model)
+      : context_(context),
+        operand_mapping_(tensor_mapping),
+        nn_model_(nn_model) {}
+
+  TfLiteStatus AddScalarInt32Operand(int32_t value) {
+    return AddScalarOperand<int32_t>(value, ANEURALNETWORKS_INT32);
+  }
+
+  TfLiteStatus AddScalarFloat32Operand(float value) {
+    return AddScalarOperand<float>(value, ANEURALNETWORKS_FLOAT32);
+  }
+
+  TfLiteStatus AddVectorInt32Operand(const int32_t* values,
+                                     uint32_t num_values) {
+    return AddVectorOperand<int32_t>(values, num_values,
+                                     ANEURALNETWORKS_TENSOR_INT32);
+  }
+
+  TfLiteStatus AddVectorFloat32Operand(const float* values,
+                                       uint32_t num_values) {
+    return AddVectorOperand<float>(values, num_values,
+                                   ANEURALNETWORKS_TENSOR_FLOAT32);
+  }
+
+  TfLiteStatus AddPoolingParams(void* data) {
+    auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
+    AddScalarInt32Operand(builtin->padding);
+    AddScalarInt32Operand(builtin->stride_width);
+    AddScalarInt32Operand(builtin->stride_height);
+    AddScalarInt32Operand(builtin->filter_width);
+    AddScalarInt32Operand(builtin->filter_height);
+    AddScalarInt32Operand(builtin->activation);
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus AddTensorInput(int tensor_index) {
+    int ann_index;
+    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
+    augmented_inputs_.push_back(ann_index);
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus AddTensorOutput(int tensor_index) {
+    int ann_index;
+    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
+    augmented_outputs_.push_back(ann_index);
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus AddAdditionalFloat32OutputTensor(uint32_t dimension_count) {
+    std::vector<uint32_t> dims(dimension_count, 0);
+    ANeuralNetworksOperandType operand_type{
+        .type = ANEURALNETWORKS_TENSOR_FLOAT32,
+        .dimensionCount = dimension_count,
+        .dimensions = dims.data()};
+    CHECK_NN(context_,
+             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
+    augmented_outputs_.push_back(ann_operand);
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus AddStateFloat32Tensor(int tensor_index,
+                                     int* ann_tensor_index_out) {
+    TfLiteTensor* tensor = &context_->tensors[tensor_index];
+    int ann_index = operand_mapping_->add_new_non_tensor_operand();
+
+    ANeuralNetworksOperandType operand_type{
+        ANEURALNETWORKS_TENSOR_FLOAT32,
+        static_cast<uint32_t>(tensor->dims->size),
+        reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
+        tensor->params.zero_point};
+    CHECK_NN(context_,
+             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    augmented_outputs_.push_back(ann_index);
+
+    *ann_tensor_index_out = ann_index;
+    return kTfLiteOk;
+  }
+
+  // Adds a new NN API tensor that shadows the TF Lite tensor `tensor_index`.
+  // This returns the NN API tensor index corresponding to the created tensor.
+  // If another caller previously created a NN API tensor for `tensor_index`
+  // then the existing one is returned.
+  TfLiteStatus AddTensor(int tensor_index, int* ann_tensor_index_out) {
+    int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
+    if (ann_tensor_index != -1) {
+      *ann_tensor_index_out = ann_tensor_index;
+      return kTfLiteOk;
+    }
+    // Allocate a new tensor index
+    ann_tensor_index = operand_mapping_->add_new_ann_tensor_index(tensor_index);
+
+    // Parameters needed for new type.
+    int32_t nn_type = 0;
+    float scale = 0.0f;
+    int32_t zeroPoint = 0;
+    TfLiteTensor* tensor = &context_->tensors[tensor_index];
+    switch (tensor->type) {
+      case kTfLiteNoType:
+        // Tensors added during initialization of Ops don't have a type yet and
+        // should not be registered with the NNAPI.
+        *ann_tensor_index_out = -1;
+        return kTfLiteOk;
+      case kTfLiteFloat32:
+        nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
+        break;
+      case kTfLiteUInt8:
+        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
+        scale = tensor->params.scale;
+        zeroPoint = tensor->params.zero_point;
+        if (scale == 0) {
+          // TENSOR_QUANT8_ASYMM with zero scale is not valid in NNAPI.
+          scale = 1;
+        }
+        break;
+      case kTfLiteInt32:
+        nn_type = ANEURALNETWORKS_TENSOR_INT32;
+        scale = tensor->params.scale;
+        zeroPoint = tensor->params.zero_point;
+        break;
+      default:
+        context_->ReportError(context_, "Logic error in NN API Delegate.\n");
+        return kTfLiteError;
+    }
+
+    ANeuralNetworksOperandType operand_type{
+        nn_type, static_cast<uint32_t>(tensor->dims->size),
+        reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
+    CHECK_NN(context_,
+             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+
+    if (tensor->allocation_type == kTfLiteMmapRo) {
+      // TODO(b/80630405): Use NNAPIAllocation.
+      CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
+                             nn_model_, ann_tensor_index, tensor->data.raw,
+                             tensor->bytes));
+    }
+
+    *ann_tensor_index_out = ann_tensor_index;
+    return kTfLiteOk;
+  }
+
+  // Finish emitting the op (of type `type`) into the NN API.
+  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
+    // Actually add a NN API operation
+    CHECK_NN(context_, ANeuralNetworksModel_addOperation(
+                           nn_model_, type,
+                           static_cast<uint32_t>(augmented_inputs_.size()),
+                           augmented_inputs_.data(),
+                           static_cast<uint32_t>(augmented_outputs_.size()),
+                           augmented_outputs_.data()));
+    augmented_inputs_.clear();
+    augmented_outputs_.clear();
+    return kTfLiteOk;
+  }
+
+ private:
+  template <typename T>
+  TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
+    ANeuralNetworksOperandType operand_type{.type = nn_type};
+    CHECK_NN(context_,
+             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
+    CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
+                           nn_model_, ann_operand, &value, sizeof(T)));
+    augmented_inputs_.push_back(ann_operand);
+    return kTfLiteOk;
+  }
+
+  template <typename T>
+  TfLiteStatus AddVectorOperand(const T* values, uint32_t num_values,
+                                int32_t nn_type) {
+    ANeuralNetworksOperandType operand_type{
+        .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
+    CHECK_NN(context_,
+             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
+    CHECK_NN(context_,
+             ANeuralNetworksModel_setOperandValue(
+                 nn_model_, ann_operand, values, sizeof(T) * num_values));
+    augmented_inputs_.push_back(ann_operand);
+    return kTfLiteOk;
+  }
+
+  // TfLiteContext for error handling. Must be named context for macros to
+  // work.
+  TfLiteContext* context_;
+
+  // Tracks relationship between indices
+  OperandMapping* operand_mapping_;
+
+  // The model
+  ANeuralNetworksModel* nn_model_;
+
+  // Inputs and outputs for the current op. These are augmented in the sense
+  // that NN API uses operands for all arguments, not just tensors, unlike
+  // TensorFlow lite.
+  std::vector<uint32_t> augmented_inputs_;
+  std::vector<uint32_t> augmented_outputs_;
+};
+
+struct NNAPIOpMappingArgs {
+  TfLiteContext* context;
+  NNAPIOpBuilder* builder;
+  TfLiteNode* node;
+  std::vector<int>* model_state_outputs;
+  std::vector<int>* model_state_tfl_inputs;
+};
+
+// The kernel that represents the node sub set of TF Lite being run on NN API.
+class NNAPIDelegateKernel {
+ public:
+  NNAPIDelegateKernel() = default;
+
+  typedef ANeuralNetworksOperationType (*MappingFn)(
+      const NNAPIOpMappingArgs& mapping_args);
+
+  // Return a function that knows how to translate a node into its operands
+  // when called. You can use this function to see if a node is supported
+  // (i.e. that MappingFn is not nullptr).
+  MappingFn Map(TfLiteContext* context, int builtin_code, int version,
+                TfLiteNode* node) {
+    switch (builtin_code) {
+      case kTfLiteBuiltinAdd:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteAddParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_ADD;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinMul:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteMulParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_MUL;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinAveragePool2d:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            mapping_args.builder->AddPoolingParams(
+                mapping_args.node->builtin_data);
+            return ANEURALNETWORKS_AVERAGE_POOL_2D;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinMaxPool2d:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            mapping_args.builder->AddPoolingParams(
+                mapping_args.node->builtin_data);
+            return ANEURALNETWORKS_MAX_POOL_2D;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinL2Pool2d:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            mapping_args.builder->AddPoolingParams(
+                mapping_args.node->builtin_data);
+            return ANEURALNETWORKS_L2_POOL_2D;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinConv2d:
+        if (version == 1) {
+          auto builtin =
+              reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+          if (builtin->dilation_width_factor != 1 ||
+              builtin->dilation_height_factor != 1 || node->inputs->size != 3) {
+            // NNAPI does not support dilated Conv2D.
+            return nullptr;
+          }
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteConvParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->padding);
+            mapping_args.builder->AddScalarInt32Operand(builtin->stride_width);
+            mapping_args.builder->AddScalarInt32Operand(builtin->stride_height);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_CONV_2D;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinDepthwiseConv2d:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->padding);
+            mapping_args.builder->AddScalarInt32Operand(builtin->stride_width);
+            mapping_args.builder->AddScalarInt32Operand(builtin->stride_height);
+            mapping_args.builder->AddScalarInt32Operand(
+                builtin->depth_multiplier);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_DEPTHWISE_CONV_2D;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinFullyConnected:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_FULLY_CONNECTED;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinSoftmax:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
+            return ANEURALNETWORKS_SOFTMAX;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinReshape:
+        if (version == 1 && node->inputs->size == 2) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_RESHAPE;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinSqueeze:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteSqueezeParams*>(
+                mapping_args.node->builtin_data);
+            // Note that we add the squeeze dimensions even if the dimensions
+            // were unspecified (empty), as NNAPI requires the operand.
+            mapping_args.builder->AddVectorInt32Operand(
+                builtin->squeeze_dims,
+                static_cast<uint32_t>(builtin->num_squeeze_dims));
+            return ANEURALNETWORKS_SQUEEZE;
+          };
+        } else {
+          return nullptr;
+        }
+      case kTfLiteBuiltinL2Normalization: {
+        auto builtin =
+            reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
+        if (builtin->activation != kTfLiteActNone) {
+          // NNAPI does not support activations
+          return nullptr;
+        }
+        return [](const NNAPIOpMappingArgs& mapping_args)
+                   -> ANeuralNetworksOperationType {
+          return ANEURALNETWORKS_L2_NORMALIZATION;
+        };
+      }
+      case kTfLiteBuiltinLocalResponseNormalization:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteLocalResponseNormParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->radius);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->bias);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->alpha);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
+            return ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION;
+          };
+        } else {
+          // TODO(miaowang): clean-up code and return early in the unsupported
+          // case.
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinLshProjection:
+        if (version == 1) {
+          // NNAPI does not support sparse projection correctly (b/111751836).
+          if (reinterpret_cast<TfLiteLSHProjectionParams*>(node->builtin_data)
+                  ->type == kTfLiteLshProjectionSparse) {
+            return nullptr;
+          }
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteLSHProjectionParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->type);
+            return ANEURALNETWORKS_LSH_PROJECTION;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinConcatenation:
+        if (version == 1 &&
+            reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data)
+                    ->activation == kTfLiteActNone) {
+          if (context->tensors[node->inputs->data[0]].type == kTfLiteUInt8) {
+            // NNAPI only support concatenating quantized tensor of the same
+            // scale and offset.
+            auto first_param = context->tensors[node->inputs->data[0]].params;
+            for (int i = 0; i < node->inputs->size; i++) {
+              auto curr_param = context->tensors[node->inputs->data[i]].params;
+              if (curr_param.scale != first_param.scale ||
+                  curr_param.zero_point != first_param.zero_point) {
+                return nullptr;
+              }
+            }
+          }
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->axis);
+            return ANEURALNETWORKS_CONCATENATION;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinDequantize:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_DEQUANTIZE;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinFloor:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_FLOOR;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinRelu:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_RELU;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinReluN1To1:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_RELU1;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinRelu6:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_RELU6;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinLogistic:
+        if (version == 1) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_LOGISTIC;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinTanh:
+        // TODO(miaowang): add additional checks for the parameters.
+        if (version == 1 &&
+            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+          // NNAPI only support float tanh.
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_TANH;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinSub:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+          // NNAPI only support float sub.
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteSubParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_SUB;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinDiv:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+          // NNAPI only support float div.
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteDivParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_DIV;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinPad:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+            node->inputs->size == 2 &&
+            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+          // NNAPI does not support specifying the padding value.
+          // NNAPI pads physical zero for quantized tensors, so only delegate
+          // float pad to NNAPI.
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_PAD;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinSpaceToBatchNd:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_SPACE_TO_BATCH_ND;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinStridedSlice:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->begin_mask);
+            mapping_args.builder->AddScalarInt32Operand(builtin->end_mask);
+            mapping_args.builder->AddScalarInt32Operand(
+                builtin->shrink_axis_mask);
+            return ANEURALNETWORKS_STRIDED_SLICE;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinTranspose:
+        // Note that the permutation input tensor value dictates the output
+        // dimensions.
+        // TODO(b/110888333): Support dynamically-sized tensors in delegates.
+        if ((version == 1) &&
+            (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) &&
+            (node->inputs->size > 1) &&
+            (context->tensors[node->inputs->data[1]].allocation_type ==
+             kTfLiteMmapRo)) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_TRANSPOSE;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinRnn:
+        // NNAPI only support float32 weights.
+        if (version == 1 && node->inputs->size == 5 &&
+            context->tensors[node->inputs->data[/*kWeightsTensor*/ 1]].type ==
+                kTfLiteFloat32) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            // NNAPI need both state_in and state_out.
+            int ann_index;
+            mapping_args.builder->AddStateFloat32Tensor(
+                mapping_args.node->inputs->data[/*kHiddenStateTensor*/ 4],
+                &ann_index);
+            mapping_args.model_state_outputs->push_back(ann_index);
+            mapping_args.model_state_tfl_inputs->push_back(
+                mapping_args.node->inputs->data[/*kHiddenStateTensor*/ 4]);
+            auto builtin = reinterpret_cast<TfLiteRNNParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_RNN;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinSvdf:
+        // NNAPI only support float32 weights.
+        if (version == 1 && node->inputs->size == 5 &&
+            context->tensors[node->inputs->data[/*kWeightsFeatureTensor*/ 1]]
+                    .type == kTfLiteFloat32) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            // NNAPI need both state_in and state_out.
+            int ann_index;
+            mapping_args.builder->AddStateFloat32Tensor(
+                mapping_args.node->inputs
+                    ->data[/*kInputActivationStateTensor*/ 4],
+                &ann_index);
+            mapping_args.model_state_outputs->push_back(ann_index);
+            mapping_args.model_state_tfl_inputs->push_back(
+                mapping_args.node->inputs
+                    ->data[/*kInputActivationStateTensor*/ 4]);
+
+            auto builtin = reinterpret_cast<TfLiteSVDFParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->rank);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_SVDF;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinLstm:
+        // NNAPI only support float32 weights.
+        // TODO(miaowang): add loggings to indicate why the op is rejected.
+        if (version == 1 && node->inputs->size == 20 &&
+            context->tensors[node->inputs
+                                 ->data[/*kInputToOutputWeightsTensor*/ 4]]
+                    .type == kTfLiteFloat32) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteLSTMParams*>(
+                mapping_args.node->builtin_data);
+            mapping_args.builder->AddScalarInt32Operand(builtin->activation);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->cell_clip);
+            mapping_args.builder->AddScalarFloat32Operand(builtin->proj_clip);
+
+            // Current NNAPI implementation requires the sratch_buffer as
+            // output.
+            mapping_args.builder->AddAdditionalFloat32OutputTensor(2);
+
+            // NNAPI need both state_in and state_out for cell_state and
+            // output_state.
+            int ann_index;
+            mapping_args.builder->AddStateFloat32Tensor(
+                mapping_args.node->inputs
+                    ->data[/*kInputActivationStateTensor*/ 18],
+                &ann_index);
+            mapping_args.model_state_outputs->push_back(ann_index);
+            mapping_args.model_state_tfl_inputs->push_back(
+                mapping_args.node->inputs
+                    ->data[/*kInputActivationStateTensor*/ 18]);
+            mapping_args.builder->AddStateFloat32Tensor(
+                mapping_args.node->inputs->data[/*kInputCellStateTensor*/ 19],
+                &ann_index);
+            mapping_args.model_state_outputs->push_back(ann_index);
+            mapping_args.model_state_tfl_inputs->push_back(
+                mapping_args.node->inputs->data[/*kInputCellStateTensor*/ 19]);
+
+            return ANEURALNETWORKS_LSTM;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinMean:
+        // NNAPI does not support generating a scalar as output for MEAN.
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32 &&
+            context->tensors[node->outputs->data[0]].dims->size > 0) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteReducerParams*>(
+                mapping_args.node->builtin_data);
+            int32_t keep_dims = 0;
+            if (builtin->keep_dims) keep_dims = 1;
+            mapping_args.builder->AddScalarInt32Operand(keep_dims);
+            return ANEURALNETWORKS_MEAN;
+          };
+        } else {
+          return nullptr;
+        }
+      case kTfLiteBuiltinEmbeddingLookup:
+        // NNAPI only support float32 values.
+        if (version == 1 &&
+            context->tensors[node->inputs->data[1]].type == kTfLiteFloat32) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_EMBEDDING_LOOKUP;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinHashtableLookup:
+        // NNAPI only support float32 output.
+        if (version == 1 &&
+            context->tensors[node->outputs->data[0]].type == kTfLiteFloat32) {
+          return [](const NNAPIOpMappingArgs& mapping_args)
+                     -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_HASHTABLE_LOOKUP;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      default:
+        return nullptr;
+    }
+  }
+
+  // Initialize the kernel (a NN model).
+  TfLiteStatus Init(TfLiteContext* context,
+                    const TfLiteDelegateParams* params) {
+    for (auto node_index : TfLiteIntArrayView(params->nodes_to_replace)) {
+      nodes_.push_back(node_index);
+    }
+
+    if (!nn_model_) {
+      ANeuralNetworksModel* model;
+      CHECK_NN(context, ANeuralNetworksModel_create(&model));
+      nn_model_.reset(model);
+
+      TF_LITE_ENSURE_STATUS(
+          BuildGraph(context, params->input_tensors, params->output_tensors));
+    }
+
+    if (!nn_compilation_) {
+      ANeuralNetworksCompilation* compilation;
+      CHECK_NN(context, ANeuralNetworksCompilation_create(nn_model_.get(),
+                                                          &compilation));
+      CHECK_NN(context, ANeuralNetworksCompilation_finish(compilation));
+      nn_compilation_.reset(compilation);
+    }
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
+    ANeuralNetworksExecution* execution = nullptr;
+    CHECK_NN(context, ANeuralNetworksExecution_create(nn_compilation_.get(),
+                                                      &execution));
+
+    // Set the input tensor buffers. Note: we access tflite tensors using
+    // absolute indices but NN api indices inputs by relative indices.
+    int relative_input_index = 0;
+
+    size_t input_offset = 0;
+    for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) {
+      if (absolute_input_index == kOptionalTensor) {
+        continue;
+      }
+      TfLiteTensor* tensor = &context->tensors[absolute_input_index];
+      // TODO(miaowang): make sure the delegation works with dequantized weights
+      // as intermediate tensors.
+      if (tensor->allocation_type != kTfLiteMmapRo) {
+        // copy data to pre-allocated shared memory.
+        memcpy(nn_input_memory_->get_data_ptr() + input_offset,
+               tensor->data.raw, tensor->bytes);
+        CHECK_NN(context, ANeuralNetworksExecution_setInputFromMemory(
+                              execution, relative_input_index, nullptr,
+                              nn_input_memory_->get_handle(), input_offset,
+                              tensor->bytes));
+        input_offset += tensor->bytes;
+        relative_input_index++;
+      }
+    }
+
+    // Set the output tensor buffers.
+    int relative_output_index = 0;
+    size_t output_offset = 0;
+    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
+      TfLiteTensor* tensor = &context->tensors[output_index];
+      CHECK_NN(context, ANeuralNetworksExecution_setOutputFromMemory(
+                            execution, relative_output_index, nullptr,
+                            nn_output_memory_->get_handle(), output_offset,
+                            tensor->bytes));
+      output_offset += tensor->bytes;
+      relative_output_index++;
+    }
+
+    // The state_out of previous invocation need to be mapped to state_in of
+    // current invocation.
+    for (size_t i = 0; i < model_state_tfl_inputs_.size(); i++) {
+      int state_tensor_idx = model_state_tfl_inputs_[i];
+      TfLiteTensor* tensor = &context->tensors[state_tensor_idx];
+      // Here we are using a deep copy for state_in tensors so that we are not
+      // reading and writing into the same buffer during a invocation.
+      // TODO(110369471): using double shared buffer to minimize the copies.
+      CHECK_NN(context, ANeuralNetworksExecution_setOutput(
+                            execution, relative_output_index, nullptr,
+                            tensor->data.raw, tensor->bytes));
+      relative_output_index++;
+    }
+    // Invoke ANN in blocking fashion.
+    ANeuralNetworksEvent* event = nullptr;
+    CHECK_NN(context, ANeuralNetworksExecution_startCompute(execution, &event));
+    CHECK_NN(context, ANeuralNetworksEvent_wait(event));
+    ANeuralNetworksEvent_free(event);
+    ANeuralNetworksExecution_free(execution);
+
+    // copy results from shared memory to the destination.
+    output_offset = 0;
+    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
+      TfLiteTensor* tensor = &context->tensors[output_index];
+      memcpy(tensor->data.raw,
+             nn_output_memory_->get_data_ptr() + output_offset, tensor->bytes);
+      output_offset += tensor->bytes;
+    }
+
+    return kTfLiteOk;
+  }
+
+ private:
+  // ANN API state.
+  std::unique_ptr<ANeuralNetworksModel, NNFreeModel> nn_model_;
+  std::unique_ptr<ANeuralNetworksCompilation, NNFreeCompilation>
+      nn_compilation_;
+  // Node indices that this delegate is responsible for. Indices here
+  // indexes into the nodes array in the TfLiteContext.
+  std::vector<int> nodes_;
+  // Track indices we use
+  OperandMapping operand_mapping_;
+
+  std::vector<int> model_state_outputs_;
+  std::vector<int> model_state_tfl_inputs_;
+
+  std::unique_ptr<NNMemory> nn_input_memory_;
+  std::unique_ptr<NNMemory> nn_output_memory_;
+
+  TfLiteStatus AddOpsAndTensors(TfLiteContext* context) {
+    // The operand builder allows creating a single op. We create it at this
+    // reduced power position rather than in the for loop to avoid reallocating
+    // the vectors.
+    NNAPIOpBuilder builder(context, &operand_mapping_, nn_model_.get());
+    // Add Tensors
+    // allocate outside to avoid realloc
+    for (auto node_index : nodes_) {
+      // Obtain the op and registration.
+      TfLiteNode* node;
+      TfLiteRegistration* reg;
+      context->GetNodeAndRegistration(context, node_index, &node, &reg);
+      // Map inputs to NN API tensor indices.
+      for (auto input_index : TfLiteIntArrayView(node->inputs)) {
+        if (input_index == kOptionalTensor &&
+            (reg->builtin_code == kTfLiteBuiltinLstm ||
+             reg->builtin_code == kTfLiteBuiltinSvdf)) {
+          // properly handle the optional tensor for LSTM and SVDF.
+          // currently only support float32.
+          // TODO(miaowang): make sure this is also able to handle quantized
+          // tensor when supported by NNAPI.
+          TF_LITE_ENSURE_STATUS(builder.AddVectorFloat32Operand(nullptr, 0));
+        } else {
+          TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index));
+        }
+      }
+      // Get op type and operands
+      int nn_op_type = Map(context, reg->builtin_code, reg->version, node)(
+          {context, &builder, node, &model_state_outputs_,
+           &model_state_tfl_inputs_});
+      // Map outputs to NN API tensor indices.
+      for (auto output_index : TfLiteIntArrayView(node->outputs)) {
+        TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index));
+      }
+
+      builder.FinalizeAddOperation(nn_op_type);
+    }
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus BuildGraph(TfLiteContext* context,
+                          const TfLiteIntArray* input_tensors,
+                          const TfLiteIntArray* output_tensors) {
+    // Build the ops and tensors.
+    TF_LITE_ENSURE_STATUS(AddOpsAndTensors(context));
+    // Map input and output tensor indices to ANN
+    std::vector<uint32_t> inputs;
+    inputs.reserve(input_tensors->size);
+    std::vector<uint32_t> outputs;
+    outputs.reserve(output_tensors->size);
+
+    size_t total_input_byte_size = 0;
+    // Make the TensorFlow lite inputs and outputs to ann_indices.
+    for (int i : TfLiteIntArrayView(input_tensors)) {
+      // Constant tensors are not NNAPI inputs.
+      if (i != kOptionalTensor &&
+          context->tensors[i].allocation_type != kTfLiteMmapRo) {
+        inputs.push_back(operand_mapping_.lite_index_to_ann(i));
+        total_input_byte_size += context->tensors[i].bytes;
+      }
+    }
+
+    size_t total_output_byte_size = 0;
+    for (int i : TfLiteIntArrayView(output_tensors)) {
+      outputs.push_back(operand_mapping_.lite_index_to_ann(i));
+      total_output_byte_size += context->tensors[i].bytes;
+    }
+
+    // Add state output tensors as model inputs
+    for (int i : model_state_outputs_) {
+      outputs.push_back(i);
+    }
+
+    // Tell ANN to declare inputs/outputs
+    CHECK_NN(context, ANeuralNetworksModel_identifyInputsAndOutputs(
+                          nn_model_.get(), inputs.size(), inputs.data(),
+                          outputs.size(), outputs.data()));
+
+    // Set relaxed computation mode for fp32 if possible.
+    if (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+      CHECK_NN(context,
+               ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+                   nn_model_.get(), context->allow_fp32_relax_to_fp16));
+    }
+
+    // Finalize the model
+    CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get()));
+
+    // Create shared memory pool for inputs and outputs.
+    nn_input_memory_.reset(new NNMemory("input_pool", total_input_byte_size));
+    nn_output_memory_.reset(
+        new NNMemory("output_pool", total_output_byte_size));
+
+    return kTfLiteOk;
+  }
+};
+
+}  // namespace
+
+// Return a NN API Delegate struct that can check for support of ops.
+TfLiteDelegate* NnApiDelegate() {
+  static TfLiteDelegate delegate = {
+      .data_ = nullptr,
+      .flags = kTfLiteDelegateFlagsNone,
+      .Prepare = [](TfLiteContext* context,
+                    TfLiteDelegate* delegate) -> TfLiteStatus {
+        // Do not check nodes_ if NN API is unavailable.
+        if (kAndroidSdkVersion < kMinSdkVersionForNNAPI || !NNAPIExists()) {
+          return kTfLiteOk;
+        }
+
+        std::vector<int> supported_nodes(1);
+        // We don't care about all nodes_, we only care about ones in the
+        // current plan.
+        TfLiteIntArray* plan;
+        TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
+        int total_supported_nodes = 0;
+
+        // Check for every node if it is supported
+        // TODO(b/80625235): Fix this to do more careful checking of versioning.
+        for (int node_index : TfLiteIntArrayView(plan)) {
+          TfLiteNode* node;
+          TfLiteRegistration* registration;
+          TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
+              context, node_index, &node, &registration));
+          NNAPIDelegateKernel dummy_kernel;
+          if (dummy_kernel.Map(context, registration->builtin_code,
+                               registration->version, node)) {
+            supported_nodes.push_back(node_index);
+          }
+          total_supported_nodes += 1;
+        }
+        // Put the size at the beginning of the array.
+        supported_nodes[0] = supported_nodes.size() - 1;
+
+        // NN API Delegate Registration (the pseudo kernel that will invoke NN
+        // API node sub sets)
+        static const TfLiteRegistration nnapi_delegate_kernel = {
+            .init = [](TfLiteContext* context, const char* buffer,
+                       size_t length) -> void* {
+              const TfLiteDelegateParams* params =
+                  reinterpret_cast<const TfLiteDelegateParams*>(buffer);
+              NNAPIDelegateKernel* kernel_state = new NNAPIDelegateKernel;
+              kernel_state->Init(context, params);
+              return kernel_state;
+            },
+
+            .free = [](TfLiteContext* context, void* buffer) -> void {
+              delete reinterpret_cast<NNAPIDelegateKernel*>(buffer);
+            },
+
+            .prepare = [](TfLiteContext* context,
+                          TfLiteNode* node) -> TfLiteStatus {
+              // Since the underlying resize happened ahead of delegation
+              // worked. This does nothing.
+              return kTfLiteOk;
+            },
+
+            .invoke = [](TfLiteContext* context,
+                         TfLiteNode* node) -> TfLiteStatus {
+              NNAPIDelegateKernel* state =
+                  reinterpret_cast<NNAPIDelegateKernel*>(node->user_data);
+              return state->Invoke(context, node);
+            },
+
+            .builtin_code = kTfLiteBuiltinDelegate,
+        };
+
+        // Request TFLite to partition the graph and make kernels
+        // for each independent node sub set a new nnapi_delegate_kernel.
+        context->ReplaceNodeSubsetsWithDelegateKernels(
+            context, nnapi_delegate_kernel,
+            reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()),
+            delegate);
+        return kTfLiteOk;
+      }};
+
+  return &delegate;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h
new file mode 100644
index 0000000000000000000000000000000000000000..099fb724292d795a5a3297df801fbbf52c96193b
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
+#define TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+
+namespace tflite {
+
+// Return a delegate that can be used to use the NN API.
+// e.g.
+//   NnApiDelegate* delegate = NnApiDelegate();
+//   interpreter->ModifyGraphWithDelegate(&delegate);
+// NnApiDelegate() returns a singleton, so you should not free this
+// pointer or worry about its lifetime.
+TfLiteDelegate* NnApiDelegate();
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
rename to tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index 9626c54c7473bfcb1547e04d1629ebbb17879597..ca48af0c95211e644fc7e2a1a1472a2f1b46ad35 100644
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h"
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -31,9 +31,14 @@ class SingleOpModelWithNNAPI : public SingleOpModel {
  public:
   SingleOpModelWithNNAPI() {
     this->SetApplyDelegate([](Interpreter* interpreter) {
-      interpreter->ModifyGraphWithDelegate(NnApiDelegate(), false);
+      interpreter->ModifyGraphWithDelegate(NnApiDelegate());
     });
   }
+
+  TfLiteStatus ResizeInputTensor(int tensor_index,
+                                 const std::vector<int>& dims) {
+    return interpreter_->ResizeInputTensor(tensor_index, dims);
+  }
 };
 
 class FloatAddOpModel : public SingleOpModelWithNNAPI {
@@ -97,6 +102,17 @@ TEST(NNAPIDelegate, AddWithRelu) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0.0, 0.4, 1.0, 1.3}));
 }
 
+// Verify that resize attempts fail.
+// TODO(b/113110851): Verify success after the delegate supports resizing.
+TEST(NNAPIDelegate, ResizeFails) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  EXPECT_EQ(m.ResizeInputTensor(m.input1(), {1, 3, 3, 1}), kTfLiteError);
+}
+
 class FloatMulOpModel : public SingleOpModelWithNNAPI {
  public:
   FloatMulOpModel(const TensorData& input1, const TensorData& input2,
diff --git a/tensorflow/lite/error_reporter.h b/tensorflow/lite/error_reporter.h
new file mode 100644
index 0000000000000000000000000000000000000000..38518d63321edacf06c8f096d0b2198065f18aff
--- /dev/null
+++ b/tensorflow/lite/error_reporter.h
@@ -0,0 +1,22 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Compatibility shim for moved header location.
+#ifndef TENSORFLOW_LITE_ERROR_REPORTER_H_
+#define TENSORFLOW_LITE_ERROR_REPORTER_H_
+
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/stderr_reporter.h"
+
+#endif  // TENSORFLOW_LITE_ERROR_REPORTER_H_
diff --git a/tensorflow/lite/examples/android/BUILD b/tensorflow/lite/examples/android/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..80cefd415a579ad053c9f4cfcd59f63a64566931
--- /dev/null
+++ b/tensorflow/lite/examples/android/BUILD
@@ -0,0 +1,61 @@
+# Description:
+#   TensorFlow camera demo app for Android.
+
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Build the demo native demo lib from the original directory to reduce code
+# reuse. Note that the Java counterparts (ObjectTracker.java and
+# ImageUtils.java) are still duplicated.
+cc_library(
+    name = "tensorflow_native_libs",
+    srcs = [
+        "//tensorflow/examples/android:libtensorflow_demo.so",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+)
+
+android_binary(
+    name = "tflite_demo",
+    srcs = glob([
+        "app/src/main/java/**/*.java",
+    ]),
+    aapt_version = "aapt",
+    # Package assets from assets dir as well as all model targets.
+    # Remove undesired models (and corresponding Activities in source)
+    # to reduce APK size.
+    assets = [
+        "//tensorflow/lite/examples/android/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
+        "@tflite_mobilenet_quant//:mobilenet_v1_1.0_224_quant.tflite",
+        "@tflite_conv_actions_frozen//:conv_actions_frozen.tflite",
+        "//tensorflow/lite/examples/android/app/src/main/assets:conv_actions_labels.txt",
+        "@tflite_mobilenet_ssd//:mobilenet_ssd.tflite",
+        "@tflite_mobilenet_ssd_quant//:detect.tflite",
+        "//tensorflow/lite/examples/android/app/src/main/assets:box_priors.txt",
+        "//tensorflow/lite/examples/android/app/src/main/assets:coco_labels_list.txt",
+    ],
+    assets_dir = "",
+    custom_package = "org.tensorflow.lite.demo",
+    inline_constants = 1,
+    manifest = "app/src/main/AndroidManifest.xml",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["app/src/main/res/**"]),
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        ":tensorflow_native_libs",
+        "//tensorflow/lite/java:tensorflowlite",
+    ],
+)
diff --git a/tensorflow/contrib/lite/examples/android/android.iml b/tensorflow/lite/examples/android/android.iml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/android.iml
rename to tensorflow/lite/examples/android/android.iml
diff --git a/tensorflow/lite/examples/android/app/README.md b/tensorflow/lite/examples/android/app/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e2b1b2691bb926e9dbdcb73a246d31cc51ac78c6
--- /dev/null
+++ b/tensorflow/lite/examples/android/app/README.md
@@ -0,0 +1,54 @@
+# TF Lite Android App Example
+
+A simple Android example that demonstrates image classification and object
+detection using the camera, as well as speech recognition using the microphone.
+
+## Building in Android Studio with TensorFlow Lite AAR from JCenter.
+The build.gradle is configured to use TensorFlow Lite's nightly build.
+
+If you see a build error related to compatibility with Tensorflow Lite's Java
+API (example: method X is undefined for type Interpreter), there has likely been
+a backwards compatible change to the API. You will need to pull new app code
+that's compatible with the nightly build and may need to first wait a few days
+for our external and internal code to merge.
+
+## Building from Source with Bazel
+
+1. Follow the [Bazel steps for the TF Demo App](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel):
+
+  1. [Install Bazel and Android Prerequisites](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-bazel-and-android-prerequisites).
+     It's easiest with Android Studio.
+
+      - You'll need at least SDK version 23.
+      - Make sure to install the latest version of Bazel. Some distributions
+        ship with Bazel 0.5.4, which is too old.
+      - Bazel requires Android Build Tools `26.0.1` or higher.
+      - You also need to install the Android Support Repository, available
+        through Android Studio under `Android SDK Manager -> SDK Tools ->
+        Android Support Repository`.
+
+  2. [Edit your `WORKSPACE`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#edit-workspace)
+     to add SDK and NDK targets.
+
+     NOTE: As long as you have the SDK and NDK installed, the `./configure`
+     script will create these rules for you. Answer "Yes" when the script asks
+     to automatically configure the `./WORKSPACE`.
+
+      - Make sure the `api_level` in `WORKSPACE` is set to an SDK version that
+        you have installed.
+      - By default, Android Studio will install the SDK to `~/Android/Sdk` and
+        the NDK to `~/Android/Sdk/ndk-bundle`.
+
+2. Build this demo app with Bazel. The demo needs C++11. We configure the fat_apk_cpu flag to package support for 4 hardware variants. You may replace it with --config=android_arm64 on a 64-bit device and --config=android_arm for 32-bit device:
+
+  ```shell
+  bazel build -c opt --cxxopt='--std=c++11' --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \
+    //tensorflow/lite/examples/android:tflite_demo
+  ```
+
+3. Install the demo on a
+   [debug-enabled device](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install):
+
+  ```shell
+  adb install bazel-bin/tensorflow/lite/examples/android/tflite_demo.apk
+  ```
diff --git a/tensorflow/contrib/lite/examples/android/app/build.gradle b/tensorflow/lite/examples/android/app/build.gradle
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/build.gradle
rename to tensorflow/lite/examples/android/app/build.gradle
diff --git a/tensorflow/lite/examples/android/app/download-models.gradle b/tensorflow/lite/examples/android/app/download-models.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..36bd177a1fd6bb21a27edd6d2b6e82fa7aa5d57b
--- /dev/null
+++ b/tensorflow/lite/examples/android/app/download-models.gradle
@@ -0,0 +1,81 @@
+/*
+ * download-models.gradle
+ *     Downloads model files from ${MODEL_URL} into application's asset folder
+ * Input:
+ *     project.ext.TMP_DIR: absolute path to hold downloaded zip files
+ *     project.ext.ASSET_DIR: absolute path to save unzipped model files
+ * Output:
+ *     3 model files will be downloaded into given folder of ext.ASSET_DIR
+ */
+// hard coded model files
+
+def models = ['https://storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip',
+              'https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip',
+              'https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip',
+              'http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz',
+              'http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz']
+
+// Root URL for model archives
+def MODEL_URL = 'https://storage.googleapis.com/download.tensorflow.org/models/tflite'
+
+buildscript {
+    repositories {
+        jcenter()
+    }
+    dependencies {
+        classpath 'de.undercouch:gradle-download-task:3.2.0'
+    }
+}
+
+import de.undercouch.gradle.tasks.download.Download
+task downloadFile(type: Download){
+    for (modelUrl in models) {
+        def localFile = modelUrl.split("/")[-1]
+        println "Downloading ${localFile} from ${modelUrl}"
+        src modelUrl
+    }
+
+    dest new File(project.ext.TMP_DIR)
+    overwrite true
+}
+
+task extractModels(type: Copy) {
+    for (f in models) {
+        def localFile = f.split("/")[-1]
+        def localExt = localFile.split("[.]")[-1]
+        if (localExt == "tgz") {
+            from tarTree(project.ext.TMP_DIR + '/' + localFile)
+        } else {
+            from zipTree(project.ext.TMP_DIR + '/' + localFile)
+        }
+    }
+
+    into file(project.ext.ASSET_DIR)
+    fileMode  0644
+    exclude '**/LICENSE'
+
+    def needDownload = false
+    for (f in models) {
+        def localFile = f.split("/")[-1]
+        if (!(new File(project.ext.TMP_DIR + '/' + localFile)).exists()) {
+            needDownload = true
+        }
+    }
+
+    if (needDownload) {
+        dependsOn downloadFile
+    }
+}
+
+
+
+
+tasks.whenTaskAdded { task ->
+    if (task.name == 'assembleDebug') {
+        task.dependsOn 'extractModels'
+    }
+    if (task.name == 'assembleRelease') {
+        task.dependsOn 'extractModels'
+    }
+}
+
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/AndroidManifest.xml b/tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/AndroidManifest.xml
rename to tensorflow/lite/examples/android/app/src/main/AndroidManifest.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/assets/BUILD b/tensorflow/lite/examples/android/app/src/main/assets/BUILD
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/assets/BUILD
rename to tensorflow/lite/examples/android/app/src/main/assets/BUILD
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/assets/box_priors.txt b/tensorflow/lite/examples/android/app/src/main/assets/box_priors.txt
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/assets/box_priors.txt
rename to tensorflow/lite/examples/android/app/src/main/assets/box_priors.txt
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/assets/coco_labels_list.txt b/tensorflow/lite/examples/android/app/src/main/assets/coco_labels_list.txt
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/assets/coco_labels_list.txt
rename to tensorflow/lite/examples/android/app/src/main/assets/coco_labels_list.txt
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/assets/conv_actions_labels.txt b/tensorflow/lite/examples/android/app/src/main/assets/conv_actions_labels.txt
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/assets/conv_actions_labels.txt
rename to tensorflow/lite/examples/android/app/src/main/assets/conv_actions_labels.txt
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/assets/labels_mobilenet_quant_v1_224.txt b/tensorflow/lite/examples/android/app/src/main/assets/labels_mobilenet_quant_v1_224.txt
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/assets/labels_mobilenet_quant_v1_224.txt
rename to tensorflow/lite/examples/android/app/src/main/assets/labels_mobilenet_quant_v1_224.txt
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/assets/pets_labels_list.txt b/tensorflow/lite/examples/android/app/src/main/assets/pets_labels_list.txt
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/assets/pets_labels_list.txt
rename to tensorflow/lite/examples/android/app/src/main/assets/pets_labels_list.txt
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/AutoFitTextureView.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/AutoFitTextureView.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/AutoFitTextureView.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/AutoFitTextureView.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraActivity.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraActivity.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraActivity.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraConnectionFragment.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraConnectionFragment.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraConnectionFragment.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraConnectionFragment.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/Classifier.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/Classifier.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/Classifier.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/Classifier.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
similarity index 98%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
index dcbbefbeab6627b37579902cd25841c0ae257dda..698251d8b4aff3423808126ff490fe277a7ed283 100644
--- a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
+++ b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
@@ -65,7 +65,7 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
   // --input_binary=true
   private static final int INPUT_SIZE = 224;
 
-  private static final String MODEL_FILE = "mobilenet_quant_v1_224.tflite";
+  private static final String MODEL_FILE = "mobilenet_v1_1.0_224_quant.tflite";
   private static final String LABEL_FILE = "labels_mobilenet_quant_v1_224.txt";
 
   private static final boolean MAINTAIN_ASPECT = true;
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/LegacyCameraConnectionFragment.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/LegacyCameraConnectionFragment.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/LegacyCameraConnectionFragment.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/OverlayView.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/OverlayView.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/OverlayView.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/OverlayView.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognitionScoreView.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognitionScoreView.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognitionScoreView.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognitionScoreView.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognizeCommands.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognizeCommands.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognizeCommands.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/RecognizeCommands.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/ResultsView.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ResultsView.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/ResultsView.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ResultsView.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/SpeechActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/SpeechActivity.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/SpeechActivity.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/SpeechActivity.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteImageClassifier.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteImageClassifier.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteImageClassifier.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteImageClassifier.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/AssetUtils.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/AssetUtils.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/AssetUtils.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/AssetUtils.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/BorderedText.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/BorderedText.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/BorderedText.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/BorderedText.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/ImageUtils.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/ImageUtils.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/ImageUtils.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Logger.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Logger.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Logger.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Logger.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Size.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Size.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Size.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/Size.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/SplitTimer.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/SplitTimer.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/SplitTimer.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/env/SplitTimer.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/MultiBoxTracker.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/MultiBoxTracker.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/MultiBoxTracker.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/ObjectTracker.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/ObjectTracker.java
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/ObjectTracker.java
rename to tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/tracking/ObjectTracker.java
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/animator/color_animation.xml b/tensorflow/lite/examples/android/app/src/main/res/animator/color_animation.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/animator/color_animation.xml
rename to tensorflow/lite/examples/android/app/src/main/res/animator/color_animation.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-hdpi/ic_action_info.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_action_info.png
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-hdpi/ic_action_info.png
rename to tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_action_info.png
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-hdpi/ic_launcher.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-hdpi/ic_launcher.png
rename to tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-hdpi/tile.9.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/tile.9.png
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-hdpi/tile.9.png
rename to tensorflow/lite/examples/android/app/src/main/res/drawable-hdpi/tile.9.png
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-mdpi/ic_action_info.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_action_info.png
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-mdpi/ic_action_info.png
rename to tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_action_info.png
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-mdpi/ic_launcher.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-mdpi/ic_launcher.png
rename to tensorflow/lite/examples/android/app/src/main/res/drawable-mdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_action_info.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_action_info.png
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_action_info.png
rename to tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_action_info.png
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_launcher.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_launcher.png
rename to tensorflow/lite/examples/android/app/src/main/res/drawable-xhdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_action_info.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_action_info.png
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_action_info.png
rename to tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_action_info.png
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_launcher.png
rename to tensorflow/lite/examples/android/app/src/main/res/drawable-xxhdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/drawable/border.xml b/tensorflow/lite/examples/android/app/src/main/res/drawable/border.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/drawable/border.xml
rename to tensorflow/lite/examples/android/app/src/main/res/drawable/border.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/layout/activity_camera.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/activity_camera.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/layout/activity_camera.xml
rename to tensorflow/lite/examples/android/app/src/main/res/layout/activity_camera.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/layout/activity_speech.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/activity_speech.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/layout/activity_speech.xml
rename to tensorflow/lite/examples/android/app/src/main/res/layout/activity_speech.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/layout/camera_connection_fragment.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/layout/camera_connection_fragment.xml
rename to tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_stylize.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_stylize.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_stylize.xml
rename to tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_stylize.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_tracking.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_tracking.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_tracking.xml
rename to tensorflow/lite/examples/android/app/src/main/res/layout/camera_connection_fragment_tracking.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/layout/list_text_item.xml b/tensorflow/lite/examples/android/app/src/main/res/layout/list_text_item.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/layout/list_text_item.xml
rename to tensorflow/lite/examples/android/app/src/main/res/layout/list_text_item.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values-sw600dp/template-dimens.xml b/tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-dimens.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values-sw600dp/template-dimens.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-dimens.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values-sw600dp/template-styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values-sw600dp/template-styles.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values-sw600dp/template-styles.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values-v11/styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v11/styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values-v11/styles.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values-v11/styles.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values-v11/template-styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v11/template-styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values-v11/template-styles.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values-v11/template-styles.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values-v14/styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v14/styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values-v14/styles.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values-v14/styles.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values-v21/base-colors.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v21/base-colors.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values-v21/base-colors.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values-v21/base-colors.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values-v21/base-template-styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values-v21/base-template-styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values-v21/base-template-styles.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values-v21/base-template-styles.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values/attrs.xml b/tensorflow/lite/examples/android/app/src/main/res/values/attrs.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values/attrs.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values/attrs.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values/base-strings.xml b/tensorflow/lite/examples/android/app/src/main/res/values/base-strings.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values/base-strings.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values/base-strings.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values/colors.xml b/tensorflow/lite/examples/android/app/src/main/res/values/colors.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values/colors.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values/colors.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values/strings.xml b/tensorflow/lite/examples/android/app/src/main/res/values/strings.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values/strings.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values/strings.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values/styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values/styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values/styles.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values/styles.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values/template-dimens.xml b/tensorflow/lite/examples/android/app/src/main/res/values/template-dimens.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values/template-dimens.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values/template-dimens.xml
diff --git a/tensorflow/contrib/lite/examples/android/app/src/main/res/values/template-styles.xml b/tensorflow/lite/examples/android/app/src/main/res/values/template-styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/app/src/main/res/values/template-styles.xml
rename to tensorflow/lite/examples/android/app/src/main/res/values/template-styles.xml
diff --git a/tensorflow/lite/examples/android/build.gradle b/tensorflow/lite/examples/android/build.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..74dacbcddbdafa65d061e83a0199bfc2d60a361b
--- /dev/null
+++ b/tensorflow/lite/examples/android/build.gradle
@@ -0,0 +1,28 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+
+buildscript {
+    repositories {
+        jcenter()
+    }
+    dependencies {
+        classpath 'com.android.tools.build:gradle:3.0.1'
+
+        // NOTE: Do not place your application dependencies here; they belong
+        // in the individual module build.gradle files
+    }
+}
+
+allprojects {
+    repositories {
+        google()
+        jcenter()
+    }
+}
+
+task clean(type: Delete) {
+    delete rootProject.buildDir
+}
+
+// Changed since default name 'build' conflicts with
+// bazel BUILD file name.
+buildDir = "gradle-build"
diff --git a/tensorflow/contrib/lite/examples/android/settings.gradle b/tensorflow/lite/examples/android/settings.gradle
similarity index 100%
rename from tensorflow/contrib/lite/examples/android/settings.gradle
rename to tensorflow/lite/examples/android/settings.gradle
diff --git a/tensorflow/contrib/lite/examples/ios/camera/.gitignore b/tensorflow/lite/examples/ios/camera/.gitignore
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/camera/.gitignore
rename to tensorflow/lite/examples/ios/camera/.gitignore
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.h b/tensorflow/lite/examples/ios/camera/CameraExampleAppDelegate.h
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.h
rename to tensorflow/lite/examples/ios/camera/CameraExampleAppDelegate.h
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.m b/tensorflow/lite/examples/ios/camera/CameraExampleAppDelegate.m
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.m
rename to tensorflow/lite/examples/ios/camera/CameraExampleAppDelegate.m
diff --git a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h
new file mode 100644
index 0000000000000000000000000000000000000000..438e6adc79a2eb6ca0ed9a61d278eef79546ce8d
--- /dev/null
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h
@@ -0,0 +1,62 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <AVFoundation/AVFoundation.h>
+#import <UIKit/UIKit.h>
+
+#include <vector>
+
+// TensorFlow Lite was migrated out of `contrib/` directory. The change
+// wasn't reflected in newest CocoaPod release yet (1.12.0).
+// Change this to 0 when using a TFLite version which is newer than 1.12.0.
+// TODO(ycling): Remove the macro when we release the next version.
+#ifndef TFLITE_USE_CONTRIB_LITE
+#define TFLITE_USE_CONTRIB_LITE 1
+#endif
+
+// Set TFLITE_USE_GPU_DELEGATE to 1 to use TFLite GPU Delegate.
+// Note: TFLite GPU Delegate binary isn't releast yet, and we're working
+// on it.
+#ifndef TFLITE_USE_GPU_DELEGATE
+#define TFLITE_USE_GPU_DELEGATE 0
+#endif
+
+#if TFLITE_USE_GPU_DELEGATE && TFLITE_USE_CONTRIB_LITE
+// Sanity check.
+#error "GPU Delegate only works with newer TFLite " \
+    "after migrating out of contrib"
+#endif
+
+@interface CameraExampleViewController
+    : UIViewController<UIGestureRecognizerDelegate, AVCaptureVideoDataOutputSampleBufferDelegate> {
+  IBOutlet UIView* previewView;
+  AVCaptureVideoPreviewLayer* previewLayer;
+  AVCaptureVideoDataOutput* videoDataOutput;
+  dispatch_queue_t videoDataOutputQueue;
+  UIView* flashView;
+  BOOL isUsingFrontFacingCamera;
+  NSMutableDictionary* oldPredictionValues;
+  NSMutableArray* labelLayers;
+  AVCaptureSession* session;
+
+  std::vector<std::string> labels;
+  double total_latency;
+  int total_count;
+}
+@property(strong, nonatomic) CATextLayer* predictionTextLayer;
+
+- (IBAction)takePicture:(id)sender;
+- (IBAction)switchCameras:(id)sender;
+
+@end
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
similarity index 77%
rename from tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
rename to tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
index 734b15e0a10bfbd485b0a0a89296b27546ea5f40..48cd313c9d7a94328d990e45243e2b84c9dc7a62 100644
--- a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -23,28 +23,48 @@
 #include <iostream>
 #include <queue>
 
+#if TFLITE_USE_CONTRIB_LITE
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/string_util.h"
 #include "tensorflow/contrib/lite/op_resolver.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#else
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/string_util.h"
+#if TFLITE_USE_GPU_DELEGATE
+#include "tensorflow/lite/delegates/gpu/metal_delegate.h"
+#endif
+#endif
 
 #define LOG(x) std::cerr
 
+namespace {
+
 // If you have your own model, modify this to the file name, and make sure
 // you've added the file to your app resources too.
-static NSString* model_file_name = @"mobilenet_quant_v1_224";
-static NSString* model_file_type = @"tflite";
-
+#if TFLITE_USE_GPU_DELEGATE
+// GPU Delegate only supports float model now.
+NSString* model_file_name = @"mobilenet_v1_1.0_224";
+#else
+NSString* model_file_name = @"mobilenet_quant_v1_224.tflite";
+#endif
+NSString* model_file_type = @"tflite";
 // If you have your own model, point this to the labels file.
-static NSString* labels_file_name = @"labels";
-static NSString* labels_file_type = @"txt";
+NSString* labels_file_name = @"labels";
+NSString* labels_file_type = @"txt";
 
 // These dimensions need to match those the model was trained with.
-static const int wanted_input_width = 224;
-static const int wanted_input_height = 224;
-static const int wanted_input_channels = 3;
-
-static NSString* FilePathForResourceName(NSString* name, NSString* extension) {
+const int wanted_input_width = 224;
+const int wanted_input_height = 224;
+const int wanted_input_channels = 3;
+const float input_mean = 127.5f;
+const float input_std = 127.5f;
+const std::string input_layer_name = "input";
+const std::string output_layer_name = "softmax1";
+
+NSString* FilePathForResourceName(NSString* name, NSString* extension) {
   NSString* file_path = [[NSBundle mainBundle] pathForResource:name ofType:extension];
   if (file_path == NULL) {
     LOG(FATAL) << "Couldn't find '" << [name UTF8String] << "." << [extension UTF8String]
@@ -53,8 +73,7 @@ static NSString* FilePathForResourceName(NSString* name, NSString* extension) {
   return file_path;
 }
 
-static void LoadLabels(NSString* file_name, NSString* file_type,
-                       std::vector<std::string>* label_strings) {
+void LoadLabels(NSString* file_name, NSString* file_type, std::vector<std::string>* label_strings) {
   NSString* labels_path = FilePathForResourceName(file_name, file_type);
   if (!labels_path) {
     LOG(ERROR) << "Failed to find model proto at" << [file_name UTF8String]
@@ -72,16 +91,17 @@ static void LoadLabels(NSString* file_name, NSString* file_type,
 
 // Returns the top N confidence values over threshold in the provided vector,
 // sorted by confidence in descending order.
-static void GetTopN(const uint8_t* prediction, const int prediction_size, const int num_results,
-                    const float threshold, std::vector<std::pair<float, int>>* top_results) {
+void GetTopN(
+    const float* prediction, const int prediction_size, const int num_results,
+    const float threshold, std::vector<std::pair<float, int> >* top_results) {
   // Will contain top N results in ascending order.
-  std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int>>,
-                      std::greater<std::pair<float, int>>>
+  std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int> >,
+                      std::greater<std::pair<float, int> > >
       top_result_pq;
 
   const long count = prediction_size;
   for (int i = 0; i < count; ++i) {
-    const float value = prediction[i] / 255.0;
+    const float value = prediction[i];
     // Only add it if it beats the threshold and has a chance at being in
     // the top N.
     if (value < threshold) {
@@ -104,12 +124,54 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const
   std::reverse(top_results->begin(), top_results->end());
 }
 
+// Preprocess the input image and feed the TFLite interpreter buffer for a float model.
+void ProcessInputWithFloatModel(
+    uint8_t* input, float* buffer, int image_width, int image_height, int image_channels) {
+  for (int y = 0; y < wanted_input_height; ++y) {
+    float* out_row = buffer + (y * wanted_input_width * wanted_input_channels);
+    for (int x = 0; x < wanted_input_width; ++x) {
+      const int in_x = (y * image_width) / wanted_input_width;
+      const int in_y = (x * image_height) / wanted_input_height;
+      uint8_t* input_pixel =
+          input + (in_y * image_width * image_channels) + (in_x * image_channels);
+      float* out_pixel = out_row + (x * wanted_input_channels);
+      for (int c = 0; c < wanted_input_channels; ++c) {
+        out_pixel[c] = (input_pixel[c] - input_mean) / input_std;
+      }
+    }
+  }
+}
+
+// Preprocess the input image and feed the TFLite interpreter buffer for a quantized model.
+void ProcessInputWithQuantizedModel(
+    uint8_t* input, uint8_t* output, int image_width, int image_height, int image_channels) {
+  for (int y = 0; y < wanted_input_height; ++y) {
+    uint8_t* out_row = output + (y * wanted_input_width * wanted_input_channels);
+    for (int x = 0; x < wanted_input_width; ++x) {
+      const int in_x = (y * image_width) / wanted_input_width;
+      const int in_y = (x * image_height) / wanted_input_height;
+      uint8_t* in_pixel = input + (in_y * image_width * image_channels) + (in_x * image_channels);
+      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
+      for (int c = 0; c < wanted_input_channels; ++c) {
+        out_pixel[c] = in_pixel[c];
+      }
+    }
+  }
+}
+
+}  // namespace
+
 @interface CameraExampleViewController (InternalMethods)
 - (void)setupAVCapture;
 - (void)teardownAVCapture;
 @end
 
-@implementation CameraExampleViewController
+@implementation CameraExampleViewController {
+  std::unique_ptr<tflite::FlatBufferModel> model;
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  TfLiteDelegate* delegate;
+}
 
 - (void)setupAVCapture {
   NSError* error = nil;
@@ -251,39 +313,58 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const
   uint8_t* in = sourceStartAddr;
 
   int input = interpreter->inputs()[0];
+  TfLiteTensor *input_tensor = interpreter->tensor(input);
+
+  bool is_quantized;
+  switch (input_tensor->type) {
+  case kTfLiteFloat32:
+    is_quantized = false;
+    break;
+  case kTfLiteUInt8:
+    is_quantized = true;
+    break;
+  default:
+    NSLog(@"Input data type is not supported by this demo app.");
+    return;
+  }
 
-  uint8_t* out = interpreter->typed_tensor<uint8_t>(input);
-  for (int y = 0; y < wanted_input_height; ++y) {
-    uint8_t* out_row = out + (y * wanted_input_width * wanted_input_channels);
-    for (int x = 0; x < wanted_input_width; ++x) {
-      const int in_x = (y * image_width) / wanted_input_width;
-      const int in_y = (x * image_height) / wanted_input_height;
-      uint8_t* in_pixel = in + (in_y * image_width * image_channels) + (in_x * image_channels);
-      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
-      for (int c = 0; c < wanted_input_channels; ++c) {
-        out_pixel[c] = in_pixel[c];
-      }
-    }
+  if (is_quantized) {
+    uint8_t* out = interpreter->typed_tensor<uint8_t>(input);
+    ProcessInputWithQuantizedModel(in, out, image_width, image_height, image_channels);
+  } else {
+    float* out = interpreter->typed_tensor<float>(input);
+    ProcessInputWithFloatModel(in, out, image_width, image_height, image_channels);
   }
 
-  double startTimestamp = [[NSDate new] timeIntervalSince1970];
+  double start = [[NSDate new] timeIntervalSince1970];
   if (interpreter->Invoke() != kTfLiteOk) {
     LOG(FATAL) << "Failed to invoke!";
   }
-  double endTimestamp = [[NSDate new] timeIntervalSince1970];
-  total_latency += (endTimestamp - startTimestamp);
+  double end = [[NSDate new] timeIntervalSince1970];
+  total_latency += (end - start);
   total_count += 1;
-  NSLog(@"Time: %.4lf, avg: %.4lf, count: %d", endTimestamp - startTimestamp,
-        total_latency / total_count, total_count);
+  NSLog(@"Time: %.4lf, avg: %.4lf, count: %d", end - start, total_latency / total_count,
+        total_count);
 
   const int output_size = 1000;
   const int kNumResults = 5;
   const float kThreshold = 0.1f;
 
-  std::vector<std::pair<float, int>> top_results;
+  std::vector<std::pair<float, int> > top_results;
 
-  uint8_t* output = interpreter->typed_output_tensor<uint8_t>(0);
-  GetTopN(output, output_size, kNumResults, kThreshold, &top_results);
+  if (is_quantized) {
+    uint8_t* quantized_output = interpreter->typed_output_tensor<uint8_t>(0);
+    int32_t zero_point = input_tensor->params.zero_point;
+    float scale = input_tensor->params.scale;
+    float output[output_size];
+    for (int i = 0; i < output_size; ++i) {
+      output[i] = (quantized_output[i] - zero_point) * scale;
+    }
+    GetTopN(output, output_size, kNumResults, kThreshold, &top_results);
+  } else {
+    float* output = interpreter->typed_output_tensor<float>(0);
+    GetTopN(output, output_size, kNumResults, kThreshold, &top_results);
+  }
 
   NSMutableDictionary* newValues = [NSMutableDictionary dictionary];
   for (const auto& result : top_results) {
@@ -298,11 +379,15 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const
   });
 
   CVPixelBufferUnlockBaseAddress(pixelBuffer, unlockFlags);
-
   CVPixelBufferUnlockBaseAddress(pixelBuffer, 0);
 }
 
 - (void)dealloc {
+#if TFLITE_USE_GPU_DELEGATE
+  if (delegate) {
+    DeleteGpuDelegate(delegate);
+  }
+#endif
   [self teardownAVCapture];
 }
 
@@ -328,6 +413,21 @@ static void GetTopN(const uint8_t* prediction, const int prediction_size, const
   LoadLabels(labels_file_name, labels_file_type, &labels);
 
   tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+
+#if TFLITE_USE_GPU_DELEGATE
+  GpuDelegateOptions options;
+  options.allow_precision_loss = true;
+  options.wait_type = GpuDelegateOptions::WaitType::kActive;
+  delegate = NewGpuDelegate(&options);
+  interpreter->ModifyGraphWithDelegate(delegate);
+#endif
+
+  // Explicitly resize the input tensor.
+  {
+    int input = interpreter->inputs()[0];
+    std::vector<int> sizes = {1, 224, 224, 3};
+    interpreter->ResizeInputTensor(input, sizes);
+  }
   if (!interpreter) {
     LOG(FATAL) << "Failed to construct interpreter";
   }
diff --git a/tensorflow/contrib/lite/examples/ios/camera/Info.plist b/tensorflow/lite/examples/ios/camera/Info.plist
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/camera/Info.plist
rename to tensorflow/lite/examples/ios/camera/Info.plist
diff --git a/tensorflow/contrib/lite/examples/ios/camera/MainStoryboard_iPhone.storyboard b/tensorflow/lite/examples/ios/camera/MainStoryboard_iPhone.storyboard
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/camera/MainStoryboard_iPhone.storyboard
rename to tensorflow/lite/examples/ios/camera/MainStoryboard_iPhone.storyboard
diff --git a/tensorflow/lite/examples/ios/camera/Podfile b/tensorflow/lite/examples/ios/camera/Podfile
new file mode 100644
index 0000000000000000000000000000000000000000..2e15cc63decb30eb2b8c9bffab3b5d1bff10e9b3
--- /dev/null
+++ b/tensorflow/lite/examples/ios/camera/Podfile
@@ -0,0 +1,13 @@
+platform :ios, '8.0'
+inhibit_all_warnings!
+
+project 'tflite_camera_example.xcodeproj'
+
+target 'tflite_camera_example'
+  # Comment 'TensorFlowLite' pod and un-comment 'TensorFlowLiteGpuExperimental'
+  # to use TFLite GPU Delegate.
+  # Note: TFLite GPU Delegate binary isn't releast yet, and we're working
+  # on it.
+
+  pod 'TensorFlowLite', '1.12.0'
+  # pod 'TensorFlowLiteGpuExperimental', '0.0.1'
diff --git a/tensorflow/lite/examples/ios/camera/README.md b/tensorflow/lite/examples/ios/camera/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..98d4e686d1e0a8f5b3ab7ff44fe0da0f5a760698
--- /dev/null
+++ b/tensorflow/lite/examples/ios/camera/README.md
@@ -0,0 +1,12 @@
+## Using the iOS Demo App
+
+Please read the [TensorFlow Lite iOS Demo App](https://www.tensorflow.org/lite/demo_ios) page.
+
+## Using the iOS Demo App with support for select TensorFlow ops
+
+TODO(ycling): Link to the select TensorFlow ops documentation when it's
+done.
+
+Follow the guide to TensorFlow Lite iOS Library with support for select
+TensorFlow ops, then open `tflite_camera_example_with_flex.xcodeproj`.
+Note that this project setting is not using CocoaPod.
diff --git a/tensorflow/contrib/lite/examples/ios/camera/data/.gitignore b/tensorflow/lite/examples/ios/camera/data/.gitignore
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/camera/data/.gitignore
rename to tensorflow/lite/examples/ios/camera/data/.gitignore
diff --git a/tensorflow/contrib/lite/examples/ios/camera/main.mm b/tensorflow/lite/examples/ios/camera/main.mm
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/camera/main.mm
rename to tensorflow/lite/examples/ios/camera/main.mm
diff --git a/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..9b5c2b32a8f176e58a2d28d11ee3e41ef875e722
--- /dev/null
+++ b/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -0,0 +1,371 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 46;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		1C3C9DCC1ED3AB4200B8B5FA /* main.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1C3C9DCA1ED3AB4200B8B5FA /* main.mm */; };
+		1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */; };
+		1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */; };
+		1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */; };
+		1CDB2D491ED3A9CD007929E9 /* CameraExampleAppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */; };
+		1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */; };
+		54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */; };
+		AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = AC1F82641FBA3CBD0052BA77 /* labels.txt */; };
+		AC31178921BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = AC31178821BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite */; };
+		AC3BB41720114C400084552C /* mobilenet_v1_1.0_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		1C0D73481ECCC41B008C1DAB /* CoreImage.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreImage.framework; path = System/Library/Frameworks/CoreImage.framework; sourceTree = SDKROOT; };
+		1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreGraphics.framework; path = System/Library/Frameworks/CoreGraphics.framework; sourceTree = SDKROOT; };
+		1C3C9DCA1ED3AB4200B8B5FA /* main.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = main.mm; sourceTree = "<group>"; };
+		1C564C0D1ED3A92E00087306 /* tflite_camera_example.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tflite_camera_example.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.storyboard; path = MainStoryboard_iPhone.storyboard; sourceTree = "<group>"; };
+		1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = System/Library/Frameworks/UIKit.framework; sourceTree = SDKROOT; };
+		1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreMedia.framework; path = System/Library/Frameworks/CoreMedia.framework; sourceTree = SDKROOT; };
+		1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AVFoundation.framework; path = System/Library/Frameworks/AVFoundation.framework; sourceTree = SDKROOT; };
+		1CDB2D421ED3A9CD007929E9 /* CameraExampleAppDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CameraExampleAppDelegate.h; sourceTree = "<group>"; };
+		1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = CameraExampleAppDelegate.m; sourceTree = "<group>"; };
+		1CDB2D441ED3A9CD007929E9 /* CameraExampleViewController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CameraExampleViewController.h; sourceTree = "<group>"; };
+		1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = CameraExampleViewController.mm; sourceTree = "<group>"; };
+		1CDB2D4D1ED3AA35007929E9 /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-tflite_camera_example.a"; sourceTree = BUILT_PRODUCTS_DIR; };
+		3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.debug.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.debug.xcconfig"; sourceTree = "<group>"; };
+		55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.release.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.release.xcconfig"; sourceTree = "<group>"; };
+		AC1F82641FBA3CBD0052BA77 /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
+		AC31178821BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_quant_v1_224.tflite; sourceTree = "<group>"; };
+		AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		1C564C0A1ED3A92E00087306 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */,
+				1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */,
+				54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		24D7686C331131624F4454A0 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */,
+				1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */,
+				1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */,
+				1C0D73481ECCC41B008C1DAB /* CoreImage.framework */,
+				1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */,
+				3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+		3E9FC355632FB928EA23BEED /* Pods */ = {
+			isa = PBXGroup;
+			children = (
+				3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */,
+				55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */,
+			);
+			name = Pods;
+			sourceTree = "<group>";
+		};
+		591157921CF4011C00C31E3A = {
+			isa = PBXGroup;
+			children = (
+				1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */,
+				1C3C9DCA1ED3AB4200B8B5FA /* main.mm */,
+				1CDB2D4D1ED3AA35007929E9 /* Info.plist */,
+				1CDB2D421ED3A9CD007929E9 /* CameraExampleAppDelegate.h */,
+				1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */,
+				1CDB2D441ED3A9CD007929E9 /* CameraExampleViewController.h */,
+				1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */,
+				59A3CFF31CF4E68100C4259F /* data */,
+				5911579C1CF4011C00C31E3A /* Products */,
+				3E9FC355632FB928EA23BEED /* Pods */,
+				24D7686C331131624F4454A0 /* Frameworks */,
+			);
+			sourceTree = "<group>";
+		};
+		5911579C1CF4011C00C31E3A /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				1C564C0D1ED3A92E00087306 /* tflite_camera_example.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		59A3CFF31CF4E68100C4259F /* data */ = {
+			isa = PBXGroup;
+			children = (
+				AC31178821BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite */,
+				AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */,
+				AC1F82641FBA3CBD0052BA77 /* labels.txt */,
+			);
+			path = data;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		1C564C0C1ED3A92E00087306 /* tflite_camera_example */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 1C564C351ED3A92E00087306 /* Build configuration list for PBXNativeTarget "tflite_camera_example" */;
+			buildPhases = (
+				66DAEAAEE9EF6550C3A061E0 /* [CP] Check Pods Manifest.lock */,
+				1C564C091ED3A92E00087306 /* Sources */,
+				1C564C0A1ED3A92E00087306 /* Frameworks */,
+				1C564C0B1ED3A92E00087306 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = tflite_camera_example;
+			productName = tflite_camera_example;
+			productReference = 1C564C0D1ED3A92E00087306 /* tflite_camera_example.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		591157931CF4011C00C31E3A /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastSwiftUpdateCheck = 0830;
+				LastUpgradeCheck = 0830;
+				ORGANIZATIONNAME = Google;
+				TargetAttributes = {
+					1C564C0C1ED3A92E00087306 = {
+						CreatedOnToolsVersion = 8.3.2;
+						DevelopmentTeam = EQHXZ8M8AV;
+						ProvisioningStyle = Automatic;
+					};
+				};
+			};
+			buildConfigurationList = 591157961CF4011C00C31E3A /* Build configuration list for PBXProject "tflite_camera_example" */;
+			compatibilityVersion = "Xcode 3.2";
+			developmentRegion = English;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 591157921CF4011C00C31E3A;
+			productRefGroup = 5911579C1CF4011C00C31E3A /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				1C564C0C1ED3A92E00087306 /* tflite_camera_example */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		1C564C0B1ED3A92E00087306 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				AC3BB41720114C400084552C /* mobilenet_v1_1.0_224.tflite in Resources */,
+				1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */,
+				AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */,
+				AC31178921BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXShellScriptBuildPhase section */
+		66DAEAAEE9EF6550C3A061E0 /* [CP] Check Pods Manifest.lock */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+				"${PODS_PODFILE_DIR_PATH}/Podfile.lock",
+				"${PODS_ROOT}/Manifest.lock",
+			);
+			name = "[CP] Check Pods Manifest.lock";
+			outputPaths = (
+				"$(DERIVED_FILE_DIR)/Pods-tflite_camera_example-checkManifestLockResult.txt",
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n    # print error to STDERR\n    echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n    exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n";
+			showEnvVarsInLog = 0;
+		};
+/* End PBXShellScriptBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		1C564C091ED3A92E00087306 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */,
+				1CDB2D491ED3A9CD007929E9 /* CameraExampleAppDelegate.m in Sources */,
+				1C3C9DCC1ED3AB4200B8B5FA /* main.mm in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		1C564C361ED3A92E00087306 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			baseConfigurationReference = 3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				DEVELOPMENT_TEAM = EQHXZ8M8AV;
+				INFOPLIST_FILE = Info.plist;
+				IPHONEOS_DEPLOYMENT_TARGET = 10.3;
+				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.pf.tf-camera-example";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_VERSION = 3.0;
+			};
+			name = Debug;
+		};
+		1C564C371ED3A92E00087306 /* Release */ = {
+			isa = XCBuildConfiguration;
+			baseConfigurationReference = 55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				DEVELOPMENT_TEAM = EQHXZ8M8AV;
+				INFOPLIST_FILE = Info.plist;
+				IPHONEOS_DEPLOYMENT_TARGET = 10.3;
+				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.pf.tf-camera-example";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_OPTIMIZATION_LEVEL = "-Owholemodule";
+				SWIFT_VERSION = 3.0;
+			};
+			name = Release;
+		};
+		591157B01CF4011D00C31E3A /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				HEADER_SEARCH_PATHS = "$(inherited)";
+				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		591157B11CF4011D00C31E3A /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				HEADER_SEARCH_PATHS = "$(inherited)";
+				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = iphoneos;
+				TARGETED_DEVICE_FAMILY = "1,2";
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		1C564C351ED3A92E00087306 /* Build configuration list for PBXNativeTarget "tflite_camera_example" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1C564C361ED3A92E00087306 /* Debug */,
+				1C564C371ED3A92E00087306 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		591157961CF4011C00C31E3A /* Build configuration list for PBXProject "tflite_camera_example" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				591157B01CF4011D00C31E3A /* Debug */,
+				591157B11CF4011D00C31E3A /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 591157931CF4011C00C31E3A /* Project object */;
+}
diff --git a/tensorflow/lite/examples/ios/camera/tflite_camera_example_with_select_tf_ops.xcodeproj/project.pbxproj b/tensorflow/lite/examples/ios/camera/tflite_camera_example_with_select_tf_ops.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..8112d5d3114aaf0f80d70747200eec41250a54bf
--- /dev/null
+++ b/tensorflow/lite/examples/ios/camera/tflite_camera_example_with_select_tf_ops.xcodeproj/project.pbxproj
@@ -0,0 +1,365 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 46;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		1C3C9DCC1ED3AB4200B8B5FA /* main.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1C3C9DCA1ED3AB4200B8B5FA /* main.mm */; };
+		1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */; };
+		1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */; };
+		1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */; };
+		1CDB2D491ED3A9CD007929E9 /* CameraExampleAppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */; };
+		1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */; };
+		AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = AC1F82641FBA3CBD0052BA77 /* labels.txt */; };
+		AC3BB41720114C400084552C /* mobilenet_v1_1.0_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */; };
+		AC421C73217D079300A7DFC2 /* libtensorflow-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = AC421C72217D079300A7DFC2 /* libtensorflow-lite.a */; };
+		ACD35BF0217A99D600BBC881 /* libprotobuf.a in Frameworks */ = {isa = PBXBuildFile; fileRef = ACD35BEF217A99D600BBC881 /* libprotobuf.a */; };
+		ACD35BF2217A9A4E00BBC881 /* nsync.a in Frameworks */ = {isa = PBXBuildFile; fileRef = ACD35BF1217A9A4E00BBC881 /* nsync.a */; };
+		ACD35BF4217A9E7E00BBC881 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = ACD35BF3217A9E7E00BBC881 /* Accelerate.framework */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		1C0D73481ECCC41B008C1DAB /* CoreImage.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreImage.framework; path = System/Library/Frameworks/CoreImage.framework; sourceTree = SDKROOT; };
+		1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreGraphics.framework; path = System/Library/Frameworks/CoreGraphics.framework; sourceTree = SDKROOT; };
+		1C3C9DCA1ED3AB4200B8B5FA /* main.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = main.mm; sourceTree = "<group>"; };
+		1C564C0D1ED3A92E00087306 /* tflite_camera_example_with_select_tf_ops.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tflite_camera_example_with_select_tf_ops.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.storyboard; path = MainStoryboard_iPhone.storyboard; sourceTree = "<group>"; };
+		1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = System/Library/Frameworks/UIKit.framework; sourceTree = SDKROOT; };
+		1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreMedia.framework; path = System/Library/Frameworks/CoreMedia.framework; sourceTree = SDKROOT; };
+		1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AVFoundation.framework; path = System/Library/Frameworks/AVFoundation.framework; sourceTree = SDKROOT; };
+		1CDB2D421ED3A9CD007929E9 /* CameraExampleAppDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CameraExampleAppDelegate.h; sourceTree = "<group>"; };
+		1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = CameraExampleAppDelegate.m; sourceTree = "<group>"; };
+		1CDB2D441ED3A9CD007929E9 /* CameraExampleViewController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CameraExampleViewController.h; sourceTree = "<group>"; };
+		1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = CameraExampleViewController.mm; sourceTree = "<group>"; };
+		1CDB2D4D1ED3AA35007929E9 /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		AC1F82641FBA3CBD0052BA77 /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
+		AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
+		AC421C72217D079300A7DFC2 /* libtensorflow-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libtensorflow-lite.a"; path = "../../../../makefile/gen/lib/libtensorflow-lite.a"; sourceTree = "<group>"; };
+		ACD35BEF217A99D600BBC881 /* libprotobuf.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libprotobuf.a; path = ../../../../makefile/gen/protobuf_ios/lib/libprotobuf.a; sourceTree = "<group>"; };
+		ACD35BF1217A9A4E00BBC881 /* nsync.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = nsync.a; path = ../../../../makefile/gen/lib/nsync.a; sourceTree = "<group>"; };
+		ACD35BF3217A9E7E00BBC881 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		1C564C0A1ED3A92E00087306 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				AC421C73217D079300A7DFC2 /* libtensorflow-lite.a in Frameworks */,
+				ACD35BF4217A9E7E00BBC881 /* Accelerate.framework in Frameworks */,
+				ACD35BF2217A9A4E00BBC881 /* nsync.a in Frameworks */,
+				ACD35BF0217A99D600BBC881 /* libprotobuf.a in Frameworks */,
+				1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */,
+				1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		24D7686C331131624F4454A0 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				AC421C72217D079300A7DFC2 /* libtensorflow-lite.a */,
+				ACD35BF3217A9E7E00BBC881 /* Accelerate.framework */,
+				ACD35BF1217A9A4E00BBC881 /* nsync.a */,
+				ACD35BEF217A99D600BBC881 /* libprotobuf.a */,
+				1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */,
+				1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */,
+				1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */,
+				1C0D73481ECCC41B008C1DAB /* CoreImage.framework */,
+				1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+		591157921CF4011C00C31E3A = {
+			isa = PBXGroup;
+			children = (
+				1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */,
+				1C3C9DCA1ED3AB4200B8B5FA /* main.mm */,
+				1CDB2D4D1ED3AA35007929E9 /* Info.plist */,
+				1CDB2D421ED3A9CD007929E9 /* CameraExampleAppDelegate.h */,
+				1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */,
+				1CDB2D441ED3A9CD007929E9 /* CameraExampleViewController.h */,
+				1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */,
+				59A3CFF31CF4E68100C4259F /* data */,
+				5911579C1CF4011C00C31E3A /* Products */,
+				24D7686C331131624F4454A0 /* Frameworks */,
+			);
+			sourceTree = "<group>";
+		};
+		5911579C1CF4011C00C31E3A /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				1C564C0D1ED3A92E00087306 /* tflite_camera_example_with_select_tf_ops.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		59A3CFF31CF4E68100C4259F /* data */ = {
+			isa = PBXGroup;
+			children = (
+				AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */,
+				AC1F82641FBA3CBD0052BA77 /* labels.txt */,
+			);
+			path = data;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		1C564C0C1ED3A92E00087306 /* tflite_camera_example_with_select_tf_ops */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 1C564C351ED3A92E00087306 /* Build configuration list for PBXNativeTarget "tflite_camera_example_with_select_tf_ops" */;
+			buildPhases = (
+				1C564C091ED3A92E00087306 /* Sources */,
+				1C564C0A1ED3A92E00087306 /* Frameworks */,
+				1C564C0B1ED3A92E00087306 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = tflite_camera_example_with_select_tf_ops;
+			productName = tflite_camera_example_with_select_tf_ops;
+			productReference = 1C564C0D1ED3A92E00087306 /* tflite_camera_example_with_select_tf_ops.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		591157931CF4011C00C31E3A /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastSwiftUpdateCheck = 0830;
+				LastUpgradeCheck = 0830;
+				ORGANIZATIONNAME = Google;
+				TargetAttributes = {
+					1C564C0C1ED3A92E00087306 = {
+						CreatedOnToolsVersion = 8.3.2;
+						DevelopmentTeam = EQHXZ8M8AV;
+						ProvisioningStyle = Automatic;
+					};
+				};
+			};
+			buildConfigurationList = 591157961CF4011C00C31E3A /* Build configuration list for PBXProject "tflite_camera_example_with_select_tf_ops" */;
+			compatibilityVersion = "Xcode 3.2";
+			developmentRegion = English;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 591157921CF4011C00C31E3A;
+			productRefGroup = 5911579C1CF4011C00C31E3A /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				1C564C0C1ED3A92E00087306 /* tflite_camera_example_with_select_tf_ops */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		1C564C0B1ED3A92E00087306 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				AC3BB41720114C400084552C /* mobilenet_v1_1.0_224.tflite in Resources */,
+				1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */,
+				AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		1C564C091ED3A92E00087306 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */,
+				1CDB2D491ED3A9CD007929E9 /* CameraExampleAppDelegate.m in Sources */,
+				1C3C9DCC1ED3AB4200B8B5FA /* main.mm in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		1C564C361ED3A92E00087306 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				DEVELOPMENT_TEAM = EQHXZ8M8AV;
+				INFOPLIST_FILE = Info.plist;
+				IPHONEOS_DEPLOYMENT_TARGET = 10.3;
+				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
+				OTHER_LDFLAGS = (
+					"-force_load",
+					"${SRCROOT}/../../../../makefile/gen/lib/libtensorflow-lite.a",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = "com.pf.tf-camera-example";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_VERSION = 3.0;
+			};
+			name = Debug;
+		};
+		1C564C371ED3A92E00087306 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				DEVELOPMENT_TEAM = EQHXZ8M8AV;
+				INFOPLIST_FILE = Info.plist;
+				IPHONEOS_DEPLOYMENT_TARGET = 10.3;
+				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
+				OTHER_LDFLAGS = (
+					"-force_load",
+					"${SRCROOT}/../../../../makefile/gen/lib/libtensorflow-lite.a",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = "com.pf.tf-camera-example";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_OPTIMIZATION_LEVEL = "-Owholemodule";
+				SWIFT_VERSION = 3.0;
+			};
+			name = Release;
+		};
+		591157B01CF4011D00C31E3A /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				HEADER_SEARCH_PATHS = (
+					"$(inherited)",
+					../../../../../../,
+					../../../../../contrib/makefile/downloads/flatbuffers/include/,
+					../../../../../contrib/makefile/downloads/eigen/,
+					../../../../../contrib/makefile/downloads/,
+				);
+				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
+				LIBRARY_SEARCH_PATHS = ../../../../../contrib/makefile/gen/lib/;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		591157B11CF4011D00C31E3A /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				HEADER_SEARCH_PATHS = (
+					"$(inherited)",
+					../../../../../../,
+					../../../../../contrib/makefile/downloads/flatbuffers/include/,
+					../../../../../contrib/makefile/downloads/eigen/,
+					../../../../../contrib/makefile/downloads/,
+				);
+				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
+				LIBRARY_SEARCH_PATHS = ../../../../../contrib/makefile/gen/lib/;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = iphoneos;
+				TARGETED_DEVICE_FAMILY = "1,2";
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		1C564C351ED3A92E00087306 /* Build configuration list for PBXNativeTarget "tflite_camera_example_with_select_tf_ops" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1C564C361ED3A92E00087306 /* Debug */,
+				1C564C371ED3A92E00087306 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		591157961CF4011C00C31E3A /* Build configuration list for PBXProject "tflite_camera_example_with_select_tf_ops" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				591157B01CF4011D00C31E3A /* Debug */,
+				591157B11CF4011D00C31E3A /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 591157931CF4011C00C31E3A /* Project object */;
+}
diff --git a/tensorflow/contrib/lite/examples/ios/download_models.sh b/tensorflow/lite/examples/ios/download_models.sh
similarity index 92%
rename from tensorflow/contrib/lite/examples/ios/download_models.sh
rename to tensorflow/lite/examples/ios/download_models.sh
index ccd163758c5830dc9367e023dcb3a604e07ca5db..4828617d95e94c1b6ad811e04d3b94b659bd8f74 100755
--- a/tensorflow/contrib/lite/examples/ios/download_models.sh
+++ b/tensorflow/lite/examples/ios/download_models.sh
@@ -53,5 +53,6 @@ download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_model
 file ${DOWNLOADS_DIR}/models
 
 cp ${DOWNLOADS_DIR}/models/models/* simple/data/
-cp ${DOWNLOADS_DIR}/quantized_models/* camera/data/
-
+cp ${DOWNLOADS_DIR}/models/models/* camera/data/
+cp "${DOWNLOADS_DIR}/quantized_models/mobilenet_quant_v1_224.tflite" \
+   'camera/data/mobilenet_quant_v1_224.tflite'
diff --git a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h b/tensorflow/lite/examples/ios/simple/AppDelegate.h
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h
rename to tensorflow/lite/examples/ios/simple/AppDelegate.h
diff --git a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm b/tensorflow/lite/examples/ios/simple/AppDelegate.mm
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm
rename to tensorflow/lite/examples/ios/simple/AppDelegate.mm
diff --git a/tensorflow/lite/examples/ios/simple/Podfile b/tensorflow/lite/examples/ios/simple/Podfile
new file mode 100644
index 0000000000000000000000000000000000000000..931b72c1f5e946e8be61ac6dec3c6106a75b9685
--- /dev/null
+++ b/tensorflow/lite/examples/ios/simple/Podfile
@@ -0,0 +1,5 @@
+platform :ios, '8.0'
+inhibit_all_warnings!
+
+target 'tflite_simple_example'
+       pod 'TensorFlowLite', '1.12.0'
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist b/tensorflow/lite/examples/ios/simple/RunModel-Info.plist
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist
rename to tensorflow/lite/examples/ios/simple/RunModel-Info.plist
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h b/tensorflow/lite/examples/ios/simple/RunModelViewController.h
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h
rename to tensorflow/lite/examples/ios/simple/RunModelViewController.h
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm b/tensorflow/lite/examples/ios/simple/RunModelViewController.mm
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
rename to tensorflow/lite/examples/ios/simple/RunModelViewController.mm
index 650c73f7322c3169e60231ce52e86d2cdc86d0a4..32da7f7e4fce5cafc3c4746e5847315172542fc9 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
+++ b/tensorflow/lite/examples/ios/simple/RunModelViewController.mm
@@ -24,8 +24,8 @@
 
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/string_util.h"
 #include "tensorflow/contrib/lite/op_resolver.h"
+#include "tensorflow/contrib/lite/string_util.h"
 
 #include "ios_image_load.h"
 
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.xib b/tensorflow/lite/examples/ios/simple/RunModelViewController.xib
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.xib
rename to tensorflow/lite/examples/ios/simple/RunModelViewController.xib
diff --git a/tensorflow/contrib/lite/examples/ios/simple/data/grace_hopper.jpg b/tensorflow/lite/examples/ios/simple/data/grace_hopper.jpg
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/data/grace_hopper.jpg
rename to tensorflow/lite/examples/ios/simple/data/grace_hopper.jpg
diff --git a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h b/tensorflow/lite/examples/ios/simple/ios_image_load.h
similarity index 78%
rename from tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
rename to tensorflow/lite/examples/ios/simple/ios_image_load.h
index 96d28109375a71de87dcc0b7957ed557ee30be99..74c6cf3c7b1ac60f7743ce042fdbdf629d2c75da 100644
--- a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
+++ b/tensorflow/lite/examples/ios/simple/ios_image_load.h
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_IOS_SIMPLE_IOS_IMAGE_LOAD_H_
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_IOS_SIMPLE_IOS_IMAGE_LOAD_H_
+#ifndef TENSORFLOW_LITE_EXAMPLES_IOS_SIMPLE_IOS_IMAGE_LOAD_H_
+#define TENSORFLOW_LITE_EXAMPLES_IOS_SIMPLE_IOS_IMAGE_LOAD_H_
 
 #include <vector>
 
 std::vector<uint8_t> LoadImageFromFile(const char* file_name, int* out_width,
                                        int* out_height, int* out_channels);
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_IOS_SIMPLE_IOS_IMAGE_LOAD_H_
+#endif  // TENSORFLOW_LITE_EXAMPLES_IOS_SIMPLE_IOS_IMAGE_LOAD_H_
diff --git a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm b/tensorflow/lite/examples/ios/simple/ios_image_load.mm
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm
rename to tensorflow/lite/examples/ios/simple/ios_image_load.mm
diff --git a/tensorflow/contrib/lite/examples/ios/simple/main.mm b/tensorflow/lite/examples/ios/simple/main.mm
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/main.mm
rename to tensorflow/lite/examples/ios/simple/main.mm
diff --git a/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj b/tensorflow/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
similarity index 100%
rename from tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
rename to tensorflow/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
diff --git a/tensorflow/lite/examples/label_image/BUILD b/tensorflow/lite/examples/label_image/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4fc8648d46c4bdefe3865381a23f4d73c87c284b
--- /dev/null
+++ b/tensorflow/lite/examples/label_image/BUILD
@@ -0,0 +1,70 @@
+# Description:
+# TensorFlow Lite Example Label Image.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow/lite:build_def.bzl", "tflite_linkopts")
+
+exports_files(glob([
+    "testdata/*.bmp",
+]))
+
+tf_cc_binary(
+    name = "label_image",
+    srcs = [
+        "get_top_n.h",
+        "get_top_n_impl.h",
+        "label_image.cc",
+    ],
+    linkopts = tflite_linkopts() + select({
+        "//tensorflow:android": [
+            "-pie",  # Android 5.0 and later supports only PIE
+            "-lm",  # some builtin ops, e.g., tanh, need -lm
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":bitmap_helpers",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ],
+)
+
+cc_library(
+    name = "bitmap_helpers",
+    srcs = ["bitmap_helpers.cc"],
+    hdrs = [
+        "bitmap_helpers.h",
+        "bitmap_helpers_impl.h",
+        "label_image.h",
+    ],
+    deps = [
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite:string",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+cc_test(
+    name = "label_image_test",
+    srcs = [
+        "get_top_n.h",
+        "get_top_n_impl.h",
+        "label_image_test.cc",
+    ],
+    data = [
+        "testdata/grace_hopper.bmp",
+    ],
+    deps = [
+        ":bitmap_helpers",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc b/tensorflow/lite/examples/label_image/bitmap_helpers.cc
similarity index 98%
rename from tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc
rename to tensorflow/lite/examples/label_image/bitmap_helpers.cc
index 2735d1f5ea4e2a104f71a3a6f874d9acb2f48142..0adad68ddca8927835fbe4101f58ae093db6d469 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc
+++ b/tensorflow/lite/examples/label_image/bitmap_helpers.cc
@@ -21,7 +21,7 @@ limitations under the License.
 
 #include <unistd.h>  // NOLINT(build/include_order)
 
-#include "tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h"
+#include "tensorflow/lite/examples/label_image/bitmap_helpers.h"
 
 #define LOG(x) std::cerr
 
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h b/tensorflow/lite/examples/label_image/bitmap_helpers.h
similarity index 79%
rename from tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
rename to tensorflow/lite/examples/label_image/bitmap_helpers.h
index 7881ee80cad4327e5f498ecb089358ea0dd6f121..05209963a16c1280a7d77718b8badbd699e07a6c 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
+++ b/tensorflow/lite/examples/label_image/bitmap_helpers.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
+#ifndef TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
+#define TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
 
-#include "tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h"
-#include "tensorflow/contrib/lite/examples/label_image/label_image.h"
+#include "tensorflow/lite/examples/label_image/bitmap_helpers_impl.h"
+#include "tensorflow/lite/examples/label_image/label_image.h"
 
 namespace tflite {
 namespace label_image {
@@ -39,4 +39,4 @@ template void resize<float>(float*, unsigned char*, int, int, int, int, int,
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
+#endif  // TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/lite/examples/label_image/bitmap_helpers_impl.h
similarity index 84%
rename from tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
rename to tensorflow/lite/examples/label_image/bitmap_helpers_impl.h
index 21ad39a6bf75e536ed099cb6120407be880404f0..b581d8077342138a0591f7c49073d468b2c6fbbf 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h
+++ b/tensorflow/lite/examples/label_image/bitmap_helpers_impl.h
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
+#ifndef TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
+#define TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
 
-#include "tensorflow/contrib/lite/examples/label_image/label_image.h"
+#include "tensorflow/lite/examples/label_image/label_image.h"
 
-#include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/lite/builtin_op_data.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/version.h"
 
 namespace tflite {
 namespace label_image {
@@ -93,4 +93,4 @@ void resize(T* out, uint8_t* in, int image_height, int image_width,
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
+#endif  // TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_BITMAP_HELPERS_IMPL_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/get_top_n.h b/tensorflow/lite/examples/label_image/get_top_n.h
similarity index 82%
rename from tensorflow/contrib/lite/examples/label_image/get_top_n.h
rename to tensorflow/lite/examples/label_image/get_top_n.h
index adef434c00a6808786557e30f8f9b09364968707..47fea2f775826d7545dbb5f66a7b897c6a250a9c 100644
--- a/tensorflow/contrib/lite/examples/label_image/get_top_n.h
+++ b/tensorflow/lite/examples/label_image/get_top_n.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H_
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H_
+#ifndef TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H_
+#define TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H_
 
-#include "tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h"
+#include "tensorflow/lite/examples/label_image/get_top_n_impl.h"
 
 namespace tflite {
 namespace label_image {
@@ -35,4 +35,4 @@ template void get_top_n<float>(float*, int, size_t, float,
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H_
+#endif  // TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h b/tensorflow/lite/examples/label_image/get_top_n_impl.h
similarity index 90%
rename from tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h
rename to tensorflow/lite/examples/label_image/get_top_n_impl.h
index 708cf2f2b1cab96f76520321b49382dd2276ec8a..563ac09114c2343233878befc15d8772f5de9fe1 100644
--- a/tensorflow/contrib/lite/examples/label_image/get_top_n_impl.h
+++ b/tensorflow/lite/examples/label_image/get_top_n_impl.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H_
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H_
+#ifndef TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H_
+#define TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H_
 
 #include <algorithm>
 #include <queue>
@@ -67,4 +67,4 @@ void get_top_n(T* prediction, int prediction_size, size_t num_results,
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H_
+#endif  // TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_GET_TOP_N_IMPL_H_
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/lite/examples/label_image/label_image.cc
similarity index 97%
rename from tensorflow/contrib/lite/examples/label_image/label_image.cc
rename to tensorflow/lite/examples/label_image/label_image.cc
index 7c6f523041ad5a516f348c1b4f66683128838228..b8dc2840dfb49f8c067fbd2bf09432f7b06d6265 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.cc
+++ b/tensorflow/lite/examples/label_image/label_image.cc
@@ -32,13 +32,13 @@ limitations under the License.
 #include <sys/uio.h>    // NOLINT(build/include_order)
 #include <unistd.h>     // NOLINT(build/include_order)
 
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/optional_debug_tools.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/optional_debug_tools.h"
+#include "tensorflow/lite/string_util.h"
 
-#include "tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h"
-#include "tensorflow/contrib/lite/examples/label_image/get_top_n.h"
+#include "tensorflow/lite/examples/label_image/bitmap_helpers.h"
+#include "tensorflow/lite/examples/label_image/get_top_n.h"
 
 #define LOG(x) std::cerr
 
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/lite/examples/label_image/label_image.h
similarity index 82%
rename from tensorflow/contrib/lite/examples/label_image/label_image.h
rename to tensorflow/lite/examples/label_image/label_image.h
index f0be881b58573a84c34c362c827845a723c23c4d..88b047fecc4b3efd10ef025193a79516516c03f1 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.h
+++ b/tensorflow/lite/examples/label_image/label_image.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H_
-#define TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H_
+#ifndef TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H_
+#define TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H_
 
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace label_image {
@@ -40,4 +40,4 @@ struct Settings {
 }  // namespace label_image
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H_
+#endif  // TENSORFLOW_LITE_EXAMPLES_LABEL_IMAGE_LABEL_IMAGE_H_
diff --git a/tensorflow/lite/examples/label_image/label_image.md b/tensorflow/lite/examples/label_image/label_image.md
new file mode 100644
index 0000000000000000000000000000000000000000..fd9f49918b4494eab845da7716a350ad6246f532
--- /dev/null
+++ b/tensorflow/lite/examples/label_image/label_image.md
@@ -0,0 +1,78 @@
+label_image for TensorFlow Lite inspired by TensorFlow's label_image.
+
+To build label_image for Android, run $TENSORFLOW_ROOT/configure 
+and set Android NDK or configure NDK setting in 
+$TENSORFLOW_ROOT/WORKSPACE first.
+ 
+To build it for android ARMv8:
+```
+> bazel build --config monolithic --cxxopt=-std=c++11 \
+  --crosstool_top=//external:android/crosstool \
+  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+  --cpu=arm64-v8a \
+  //tensorflow/lite/examples/label_image:label_image
+```
+or
+```
+> bazel build --config android_arm64 --config monolithic --cxxopt=-std=c++11 \
+  //tensorflow/lite/examples/label_image:label_image
+```
+
+To build it for android arm-v7a:
+```
+> bazel build --config monolithic --cxxopt=-std=c++11 \
+  --crosstool_top=//external:android/crosstool \
+  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+  --cpu=armeabi-v7a \
+  //tensorflow/lite/examples/label_image:label_image
+```
+or
+```
+> bazel build --config android_arm --config monolithic --cxxopt=-std=c++11 \
+  //tensorflow/lite/examples/label_image:label_image
+```
+
+Build it for desktop machines (tested on Ubuntu and OS X)
+```
+> bazel build --config opt --cxxopt=-std=c++11 //tensorflow/lite/examples/label_image:label_image
+```
+To run it. Prepare `./mobilenet_quant_v1_224.tflite`, `./grace_hopper.bmp`, and `./labels.txt`.
+
+Run it:
+```
+> ./label_image                                        
+Loaded model ./mobilenet_quant_v1_224.tflite
+resolved reporter
+invoked
+average time: 100.986 ms 
+0.439216: 653 military uniform
+0.372549: 458 bow tie
+0.0705882: 466 bulletproof vest
+0.0235294: 514 cornet
+0.0196078: 835 suit
+```
+Run `interpreter->Invoker()` 100 times:
+```
+> ./label_image   -c 100                               
+Loaded model ./mobilenet_quant_v1_224.tflite
+resolved reporter
+invoked
+average time: 33.4694 ms
+...
+```
+
+Run a floating point (`mobilenet_v1_1.0_224.tflite`) model,
+```
+> ./label_image -f 1 -m mobilenet_v1_1.0_224.tflite
+Loaded model mobilenet_v1_1.0_224.tflite
+resolved reporter
+invoked
+average time: 263.493 ms 
+0.88615: 653 military uniform
+0.0422316: 440 bearskin
+0.0109948: 466 bulletproof vest
+0.0105327: 401 academic gown
+0.00947104: 723 ping-pong bal
+```
+
+See the source code for other command line options.
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image_test.cc b/tensorflow/lite/examples/label_image/label_image_test.cc
similarity index 84%
rename from tensorflow/contrib/lite/examples/label_image/label_image_test.cc
rename to tensorflow/lite/examples/label_image/label_image_test.cc
index de7de21f7741d3d46cb96e793e8bc4bfb21384fe..4db139f048d44a263fa1bbe38099b55ee45fd593 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image_test.cc
+++ b/tensorflow/lite/examples/label_image/label_image_test.cc
@@ -16,18 +16,16 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
-#include "tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h"
-#include "tensorflow/contrib/lite/examples/label_image/get_top_n.h"
-#include "tensorflow/contrib/lite/examples/label_image/label_image.h"
-
-using ::testing::ElementsAreArray;
+#include "tensorflow/lite/examples/label_image/bitmap_helpers.h"
+#include "tensorflow/lite/examples/label_image/get_top_n.h"
+#include "tensorflow/lite/examples/label_image/label_image.h"
 
 namespace tflite {
 namespace label_image {
 
 TEST(LabelImageTest, GraceHopper) {
   std::string lena_file =
-      "tensorflow/contrib/lite/examples/label_image/testdata/"
+      "tensorflow/lite/examples/label_image/testdata/"
       "grace_hopper.bmp";
   int height, width, channels;
   Settings s;
diff --git a/tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp b/tensorflow/lite/examples/label_image/testdata/grace_hopper.bmp
similarity index 100%
rename from tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp
rename to tensorflow/lite/examples/label_image/testdata/grace_hopper.bmp
diff --git a/tensorflow/lite/examples/minimal/BUILD b/tensorflow/lite/examples/minimal/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..cdd67af1e93661c1f65cc46d9b687acc1fa56fe8
--- /dev/null
+++ b/tensorflow/lite/examples/minimal/BUILD
@@ -0,0 +1,27 @@
+# Description:
+#   TensorFlow Lite minimal example.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow/lite:build_def.bzl", "tflite_linkopts")
+
+tf_cc_binary(
+    name = "minimal",
+    srcs = [
+        "minimal.cc",
+    ],
+    linkopts = tflite_linkopts() + select({
+        "//tensorflow:android": [
+            "-pie",  # Android 5.0 and later supports only PIE
+            "-lm",  # some builtin ops, e.g., tanh, need -lm
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ],
+)
diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/lite/examples/minimal/minimal.cc
similarity index 90%
rename from tensorflow/contrib/lite/examples/minimal/minimal.cc
rename to tensorflow/lite/examples/minimal/minimal.cc
index 8b65cde7b79fde19280ad778ea874c64b01d169a..9bbfee60851e0d9a1cd1e7549338341b634f0aa6 100644
--- a/tensorflow/contrib/lite/examples/minimal/minimal.cc
+++ b/tensorflow/lite/examples/minimal/minimal.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <cstdio>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/optional_debug_tools.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/optional_debug_tools.h"
 
 // This is an example that is minimal to read a model
 // from disk and perform inference. There is no data being loaded
@@ -50,7 +50,7 @@ int main(int argc, char* argv[]) {
 
   // Build the interpreter
   tflite::ops::builtin::BuiltinOpResolver resolver;
-  InterpreterBuilder builder(*model.get(), resolver);
+  InterpreterBuilder builder(*model, resolver);
   std::unique_ptr<Interpreter> interpreter;
   builder(&interpreter);
   TFLITE_MINIMAL_CHECK(interpreter != nullptr);
diff --git a/tensorflow/lite/examples/python/BUILD b/tensorflow/lite/examples/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a606d1aa563261142ce155082ab92c14a5c34610
--- /dev/null
+++ b/tensorflow/lite/examples/python/BUILD
@@ -0,0 +1,13 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+py_binary(
+    name = "label_image",
+    srcs = ["label_image.py"],
+    main = "label_image.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/lite/python:lite",
+    ],
+)
diff --git a/tensorflow/lite/examples/python/label_image.md b/tensorflow/lite/examples/python/label_image.md
new file mode 100644
index 0000000000000000000000000000000000000000..b4ec42f52594cfc320bfc5b7d5b65bb83ef0665d
--- /dev/null
+++ b/tensorflow/lite/examples/python/label_image.md
@@ -0,0 +1,50 @@
+
+With model, input image (grace_hopper.bmp), and labels file (labels.txt)
+in /tmp.
+
+The example input image and labels file are from TensorFlow repo and
+MobileNet V1 model files.
+
+```
+curl https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/lite/examples/label_image/testdata/grace_hopper.bmp > /tmp/grace_hopper.bmp
+
+curl  https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz  | tar xzv -C /tmp  mobilenet_v1_1.0_224/labels.txt
+mv /tmp/mobilenet_v1_1.0_224/labels.txt /tmp/
+
+```
+
+Run
+
+```
+curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz | tar xzv -C /tmp
+bazel run --config opt //tensorflow/lite/examples/python:label_image
+```
+
+We can get results like
+
+```
+0.470588: military uniform
+0.337255: Windsor tie
+0.047059: bow tie
+0.031373: mortarboard
+0.019608: suit
+```
+
+Run
+
+```
+curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz | tar xzv -C /tmp
+bazel run --config opt //tensorflow/lite/examples/python:label_image \
+-- --model_file /tmp/mobilenet_v1_1.0_224.tflite
+```
+
+We can get results like
+```
+0.728693: military uniform
+0.116163: Windsor tie
+0.035517: bow tie
+0.014874: mortarboard
+0.011758: bolo tie
+```
+
+Check [models](../../g3doc/models.md) for models hosted by Google.
diff --git a/tensorflow/contrib/lite/examples/python/label_image.py b/tensorflow/lite/examples/python/label_image.py
similarity index 97%
rename from tensorflow/contrib/lite/examples/python/label_image.py
rename to tensorflow/lite/examples/python/label_image.py
index 282118a1d2b43a08930b24366110a021fc634b5e..0bc15d36a8ac2e25483271a78fbb90b31b709b97 100644
--- a/tensorflow/contrib/lite/examples/python/label_image.py
+++ b/tensorflow/lite/examples/python/label_image.py
@@ -23,7 +23,7 @@ import numpy as np
 
 from PIL import Image
 
-from tensorflow.contrib.lite.python import interpreter as interpreter_wrapper
+from tensorflow.lite.python import interpreter as interpreter_wrapper
 
 def load_labels(filename):
   my_labels = []
diff --git a/tensorflow/lite/experimental/c/BUILD b/tensorflow/lite/experimental/c/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..cde53e283830aca9c7990e3d8c4901f997621bc2
--- /dev/null
+++ b/tensorflow/lite/experimental/c/BUILD
@@ -0,0 +1,115 @@
+package(default_visibility = ["//visibility:private"])
+
+package_group(
+    name = "experimental",
+    packages = [
+        "//tensorflow/lite/experimental/...",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/lite:build_def.bzl",
+    "tflite_cc_shared_object",
+    "tflite_copts",
+    "tflite_jni_binary",
+)
+
+tflite_cc_shared_object(
+    name = "libtensorflowlite_c.so",
+    linkopts = select({
+        "//tensorflow:darwin": [
+            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
+            "$(location //tensorflow/lite/experimental/c:exported_symbols.lds)",
+            "-Wl,-install_name,@rpath/libtensorflowlite_c.so",
+        ],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "-z defs",
+            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
+            "$(location //tensorflow/lite/experimental/c:version_script.lds)",
+        ],
+    }),
+    deps = [
+        ":c_api",
+        ":c_api_experimental",
+        ":exported_symbols.lds",
+        ":version_script.lds",
+    ],
+)
+
+cc_library(
+    name = "c_api_internal",
+    srcs = ["c_api.h"],
+    hdrs = ["c_api_internal.h"],
+    copts = tflite_copts(),
+    visibility = [
+        "//tensorflow/lite/experimental/c:__subpackages__",
+    ],
+    deps = [
+        "//tensorflow/lite:context",
+        "//tensorflow/lite:framework",
+    ],
+)
+
+cc_library(
+    name = "c_api",
+    srcs = ["c_api.cc"],
+    hdrs = ["c_api.h"],
+    copts = tflite_copts(),
+    tags = ["swift_module=TensorFlowLiteCAPI"],
+    visibility = [
+        ":experimental",
+    ],
+    deps = [
+        ":c_api_internal",
+        "//tensorflow/lite:context",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ],
+)
+
+cc_library(
+    name = "c_api_experimental",
+    srcs = ["c_api_experimental.cc"],
+    hdrs = ["c_api_experimental.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":c_api",
+        ":c_api_internal",
+        "//tensorflow/lite:kernel_api",
+    ],
+)
+
+cc_test(
+    name = "c_api_test",
+    size = "small",
+    srcs = ["c_api_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/add_quantized.bin",
+    ],
+    deps = [
+        ":c_api",
+        "//tensorflow/lite:context",
+        "//tensorflow/lite:kernel_api",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "c_api_experimental_test",
+    size = "small",
+    srcs = ["c_api_experimental_test.cc"],
+    data = ["//tensorflow/lite:testdata/add.bin"],
+    deps = [
+        ":c_api",
+        ":c_api_experimental",
+        "//tensorflow/lite:kernel_api",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/c/c_api.cc b/tensorflow/lite/experimental/c/c_api.cc
similarity index 92%
rename from tensorflow/contrib/lite/experimental/c/c_api.cc
rename to tensorflow/lite/experimental/c/c_api.cc
index 9c29f9d8b9ddfd311ee1f4cd20722880e87d3b46..9caacfeb3614a9d2c0a2a17d799216e2e7e97fe0 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api.cc
+++ b/tensorflow/lite/experimental/c/c_api.cc
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/experimental/c/c_api.h"
+#include "tensorflow/lite/experimental/c/c_api.h"
 
 #include <memory>
 
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/error_reporter.h"
-#include "tensorflow/contrib/lite/experimental/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/error_reporter.h"
+#include "tensorflow/lite/experimental/c/c_api_internal.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -181,6 +181,10 @@ void* TFL_TensorData(const TFL_Tensor* tensor) {
 
 const char* TFL_TensorName(const TFL_Tensor* tensor) { return tensor->name; }
 
+TFL_QuantizationParams TFL_TensorQuantizationParams(const TFL_Tensor* tensor) {
+  return tensor->params;
+}
+
 TFL_Status TFL_TensorCopyFromBuffer(TFL_Tensor* tensor, const void* input_data,
                                     size_t input_data_size) {
   if (tensor->bytes != input_data_size) {
@@ -199,7 +203,7 @@ TFL_Status TFL_TensorCopyToBuffer(const TFL_Tensor* tensor, void* output_data,
   return kTfLiteOk;
 }
 
-// LINT.ThenChange(//tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs)
+// LINT.ThenChange(//tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/tensorflow/contrib/lite/experimental/c/c_api.h b/tensorflow/lite/experimental/c/c_api.h
similarity index 93%
rename from tensorflow/contrib/lite/experimental/c/c_api.h
rename to tensorflow/lite/experimental/c/c_api.h
index f52ab8f9ed65aa288a74e4e486ac060fa9dbebe0..49089011d1376b35d0a8948b45c77229b12d9802 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api.h
+++ b/tensorflow/lite/experimental/c/c_api.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_C_C_API_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_C_C_API_H_
 
 #include <stdarg.h>
 #include <stdint.h>
@@ -21,7 +21,7 @@ limitations under the License.
 // Eventually the various C APIs defined in context.h will be migrated into
 // the appropriate /c/c_api*.h header. For now, we pull in existing definitions
 // for convenience.
-#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/lite/context.h"
 
 // --------------------------------------------------------------------------
 // Experimental C API for TensorFlowLite.
@@ -53,6 +53,7 @@ limitations under the License.
 extern "C" {
 #endif  // __cplusplus
 
+typedef TfLiteQuantizationParams TFL_QuantizationParams;
 typedef TfLiteRegistration TFL_Registration;
 typedef TfLiteStatus TFL_Status;
 typedef TfLiteTensor TFL_Tensor;
@@ -200,6 +201,13 @@ TFL_CAPI_EXPORT extern void* TFL_TensorData(const TFL_Tensor* tensor);
 // Returns the (null-terminated) name of the tensor.
 TFL_CAPI_EXPORT extern const char* TFL_TensorName(const TFL_Tensor* tensor);
 
+// Returns the parameters for asymmetric quantization. The quantization
+// parameters are only valid when the tensor type is `kTfLiteUInt8` and the
+// `scale != 0`. Quantized values can be converted back to float using:
+//    real_value = scale * (quantized_value - zero_point);
+TFL_CAPI_EXPORT extern TFL_QuantizationParams TFL_TensorQuantizationParams(
+    const TFL_Tensor* tensor);
+
 // Copies from the provided input buffer into the tensor's buffer.
 // REQUIRES: input_data_size == TFL_TensorByteSize(tensor)
 TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyFromBuffer(
@@ -215,4 +223,4 @@ TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyToBuffer(
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_C_C_API_H_
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc b/tensorflow/lite/experimental/c/c_api_experimental.cc
similarity index 92%
rename from tensorflow/contrib/lite/experimental/c/c_api_experimental.cc
rename to tensorflow/lite/experimental/c/c_api_experimental.cc
index 29f8701f53407dc47adfaca8c85c86210e4cb09a..a246ed99cd37360b80b012ebe1f8a20b978efa80 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api_experimental.cc
+++ b/tensorflow/lite/experimental/c/c_api_experimental.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/c/c_api_experimental.h"
+#include "tensorflow/lite/experimental/c/c_api_experimental.h"
 
-#include "tensorflow/contrib/lite/experimental/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/c/c_api_internal.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental.h b/tensorflow/lite/experimental/c/c_api_experimental.h
similarity index 87%
rename from tensorflow/contrib/lite/experimental/c/c_api_experimental.h
rename to tensorflow/lite/experimental/c/c_api_experimental.h
index fca5d92f77caff987f6a70c3a8fd03849bce1165..e4cd084520e52cd92c4987153b2ead3805cc4de9 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api_experimental.h
+++ b/tensorflow/lite/experimental/c/c_api_experimental.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_
 
-#include "tensorflow/contrib/lite/builtin_ops.h"
-#include "tensorflow/contrib/lite/experimental/c/c_api.h"
+#include "tensorflow/lite/builtin_ops.h"
+#include "tensorflow/lite/experimental/c/c_api.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -54,4 +54,4 @@ void TFL_InterpreterOptionsAddCustomOp(TFL_InterpreterOptions* options,
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_C_C_API_EXPERIMENTAL_H_
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc b/tensorflow/lite/experimental/c/c_api_experimental_test.cc
similarity index 86%
rename from tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc
rename to tensorflow/lite/experimental/c/c_api_experimental_test.cc
index 1b1bedb75470638d4b3cfac92819e18b8fe6e65a..e79c7204c6e7b9f1dfa864d146e3c81ae01dc9bc 100644
--- a/tensorflow/contrib/lite/experimental/c/c_api_experimental_test.cc
+++ b/tensorflow/lite/experimental/c/c_api_experimental_test.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/c/c_api_experimental.h"
+#include "tensorflow/lite/experimental/c/c_api_experimental.h"
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/builtin_ops.h"
-#include "tensorflow/contrib/lite/experimental/c/c_api.h"
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/builtin_ops.h"
+#include "tensorflow/lite/experimental/c/c_api.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace {
 
@@ -34,7 +34,7 @@ TfLiteRegistration* GetDummyRegistration() {
 
 TEST(CApiExperimentalSimple, Smoke) {
   TFL_Model* model = TFL_NewModelFromFile(
-      "tensorflow/contrib/lite/testdata/add.bin");
+      "tensorflow/lite/testdata/add.bin");
   ASSERT_NE(model, nullptr);
 
   TFL_InterpreterOptions* options = TFL_NewInterpreterOptions();
diff --git a/tensorflow/lite/experimental/c/c_api_internal.h b/tensorflow/lite/experimental/c/c_api_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a2987c8f1c88fc5fa78d1f7fe9a7ee716d6b086
--- /dev/null
+++ b/tensorflow/lite/experimental/c/c_api_internal.h
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_
+
+#include "tensorflow/lite/experimental/c/c_api.h"
+
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+
+// Internal structures used by the C API. These are likely to change and should
+// not be depended on.
+//
+// NOTE: This header does not follow C conventions and does not define a C API.
+// It is effectively an (internal) implementation detail of the C API.
+
+struct TFL_Model {
+  // Sharing is safe as FlatBufferModel is const.
+  std::shared_ptr<const tflite::FlatBufferModel> impl;
+};
+
+struct TFL_InterpreterOptions {
+  enum {
+    kDefaultNumThreads = -1,
+  };
+  int num_threads = kDefaultNumThreads;
+
+  tflite::MutableOpResolver op_resolver;
+
+  void (*error_reporter)(void* user_data, const char* format,
+                         va_list args) = nullptr;
+  void* error_reporter_user_data = nullptr;
+};
+
+struct TFL_Interpreter {
+  // Taking a reference to the (const) model data avoids lifetime-related issues
+  // and complexity with the TFL_Model's existence.
+  std::shared_ptr<const tflite::FlatBufferModel> model;
+
+  // The interpreter does not take ownership of the provided ErrorReporter
+  // instance, so we ensure its validity here. Note that the interpreter may use
+  // the reporter in its destructor, so it should be declared first.
+  std::unique_ptr<tflite::ErrorReporter> optional_error_reporter;
+
+  std::unique_ptr<tflite::Interpreter> impl;
+};
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_C_C_API_INTERNAL_H_
diff --git a/tensorflow/lite/experimental/c/c_api_test.cc b/tensorflow/lite/experimental/c/c_api_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5fb14f342cba8908e1f6e7907123b696b4aea01d
--- /dev/null
+++ b/tensorflow/lite/experimental/c/c_api_test.cc
@@ -0,0 +1,195 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/lite/experimental/c/c_api.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/testing/util.h"
+
+namespace {
+
+TEST(CApiSimple, Smoke) {
+  TFL_Model* model = TFL_NewModelFromFile(
+      "tensorflow/lite/testdata/add.bin");
+  ASSERT_NE(model, nullptr);
+
+  TFL_InterpreterOptions* options = TFL_NewInterpreterOptions();
+  ASSERT_NE(options, nullptr);
+  TFL_InterpreterOptionsSetNumThreads(options, 2);
+
+  TFL_Interpreter* interpreter = TFL_NewInterpreter(model, options);
+  ASSERT_NE(interpreter, nullptr);
+
+  // The options/model can be deleted immediately after interpreter creation.
+  TFL_DeleteInterpreterOptions(options);
+  TFL_DeleteModel(model);
+
+  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
+  ASSERT_EQ(TFL_InterpreterGetInputTensorCount(interpreter), 1);
+  ASSERT_EQ(TFL_InterpreterGetOutputTensorCount(interpreter), 1);
+
+  std::array<int, 1> input_dims = {2};
+  ASSERT_EQ(TFL_InterpreterResizeInputTensor(interpreter, 0, input_dims.data(),
+                                             input_dims.size()),
+            kTfLiteOk);
+  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
+
+  TFL_Tensor* input_tensor = TFL_InterpreterGetInputTensor(interpreter, 0);
+  ASSERT_NE(input_tensor, nullptr);
+  EXPECT_EQ(TFL_TensorType(input_tensor), kTfLiteFloat32);
+  EXPECT_EQ(TFL_TensorNumDims(input_tensor), 1);
+  EXPECT_EQ(TFL_TensorDim(input_tensor, 0), 2);
+  EXPECT_EQ(TFL_TensorByteSize(input_tensor), sizeof(float) * 2);
+  EXPECT_NE(TFL_TensorData(input_tensor), nullptr);
+  EXPECT_STREQ(TFL_TensorName(input_tensor), "input");
+
+  TFL_QuantizationParams input_params =
+      TFL_TensorQuantizationParams(input_tensor);
+  EXPECT_EQ(input_params.scale, 0.f);
+  EXPECT_EQ(input_params.zero_point, 0);
+
+  std::array<float, 2> input = {1.f, 3.f};
+  ASSERT_EQ(TFL_TensorCopyFromBuffer(input_tensor, input.data(),
+                                     input.size() * sizeof(float)),
+            kTfLiteOk);
+
+  ASSERT_EQ(TFL_InterpreterInvoke(interpreter), kTfLiteOk);
+
+  const TFL_Tensor* output_tensor =
+      TFL_InterpreterGetOutputTensor(interpreter, 0);
+  ASSERT_NE(output_tensor, nullptr);
+  EXPECT_EQ(TFL_TensorType(output_tensor), kTfLiteFloat32);
+  EXPECT_EQ(TFL_TensorNumDims(output_tensor), 1);
+  EXPECT_EQ(TFL_TensorDim(output_tensor, 0), 2);
+  EXPECT_EQ(TFL_TensorByteSize(output_tensor), sizeof(float) * 2);
+  EXPECT_NE(TFL_TensorData(output_tensor), nullptr);
+  EXPECT_STREQ(TFL_TensorName(output_tensor), "output");
+
+  TFL_QuantizationParams output_params =
+      TFL_TensorQuantizationParams(output_tensor);
+  EXPECT_EQ(output_params.scale, 0.f);
+  EXPECT_EQ(output_params.zero_point, 0);
+
+  std::array<float, 2> output;
+  ASSERT_EQ(TFL_TensorCopyToBuffer(output_tensor, output.data(),
+                                   output.size() * sizeof(float)),
+            kTfLiteOk);
+  EXPECT_EQ(output[0], 3.f);
+  EXPECT_EQ(output[1], 9.f);
+
+  TFL_DeleteInterpreter(interpreter);
+}
+
+TEST(CApiSimple, QuantizationParams) {
+  TFL_Model* model = TFL_NewModelFromFile(
+      "tensorflow/lite/testdata/add_quantized.bin");
+  ASSERT_NE(model, nullptr);
+
+  TFL_Interpreter* interpreter = TFL_NewInterpreter(model, nullptr);
+  ASSERT_NE(interpreter, nullptr);
+
+  TFL_DeleteModel(model);
+
+  const std::array<int, 1> input_dims = {2};
+  ASSERT_EQ(TFL_InterpreterResizeInputTensor(interpreter, 0, input_dims.data(),
+                                             input_dims.size()),
+            kTfLiteOk);
+  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
+
+  TFL_Tensor* input_tensor = TFL_InterpreterGetInputTensor(interpreter, 0);
+  ASSERT_NE(input_tensor, nullptr);
+  EXPECT_EQ(TFL_TensorType(input_tensor), kTfLiteUInt8);
+  EXPECT_EQ(TFL_TensorNumDims(input_tensor), 1);
+  EXPECT_EQ(TFL_TensorDim(input_tensor, 0), 2);
+
+  TFL_QuantizationParams input_params =
+      TFL_TensorQuantizationParams(input_tensor);
+  EXPECT_EQ(input_params.scale, 0.003922f);
+  EXPECT_EQ(input_params.zero_point, 0);
+
+  const std::array<uint8_t, 2> input = {1, 3};
+  ASSERT_EQ(TFL_TensorCopyFromBuffer(input_tensor, input.data(),
+                                     input.size() * sizeof(uint8_t)),
+            kTfLiteOk);
+
+  ASSERT_EQ(TFL_InterpreterInvoke(interpreter), kTfLiteOk);
+
+  const TFL_Tensor* output_tensor =
+      TFL_InterpreterGetOutputTensor(interpreter, 0);
+  ASSERT_NE(output_tensor, nullptr);
+
+  TFL_QuantizationParams output_params =
+      TFL_TensorQuantizationParams(output_tensor);
+  EXPECT_EQ(output_params.scale, 0.003922f);
+  EXPECT_EQ(output_params.zero_point, 0);
+
+  std::array<uint8_t, 2> output;
+  ASSERT_EQ(TFL_TensorCopyToBuffer(output_tensor, output.data(),
+                                   output.size() * sizeof(uint8_t)),
+            kTfLiteOk);
+  EXPECT_EQ(output[0], 3);
+  EXPECT_EQ(output[1], 9);
+
+  const float dequantizedOutput0 =
+      output_params.scale * (output[0] - output_params.zero_point);
+  const float dequantizedOutput1 =
+      output_params.scale * (output[1] - output_params.zero_point);
+  EXPECT_EQ(dequantizedOutput0, 0.011766f);
+  EXPECT_EQ(dequantizedOutput1, 0.035298f);
+
+  TFL_DeleteInterpreter(interpreter);
+}
+
+TEST(CApiSimple, ErrorReporter) {
+  TFL_Model* model = TFL_NewModelFromFile(
+      "tensorflow/lite/testdata/add.bin");
+  TFL_InterpreterOptions* options = TFL_NewInterpreterOptions();
+
+  // Install a custom error reporter into the interpreter by way of options.
+  tflite::TestErrorReporter reporter;
+  TFL_InterpreterOptionsSetErrorReporter(
+      options,
+      [](void* user_data, const char* format, va_list args) {
+        reinterpret_cast<tflite::TestErrorReporter*>(user_data)->Report(format,
+                                                                        args);
+      },
+      &reporter);
+  TFL_Interpreter* interpreter = TFL_NewInterpreter(model, options);
+
+  // The options/model can be deleted immediately after interpreter creation.
+  TFL_DeleteInterpreterOptions(options);
+  TFL_DeleteModel(model);
+
+  // Invoke the interpreter before tensor allocation.
+  EXPECT_EQ(TFL_InterpreterInvoke(interpreter), kTfLiteError);
+
+  // The error should propagate to the custom error reporter.
+  EXPECT_EQ(reporter.error_messages(),
+            "Invoke called on model that is not ready.");
+  EXPECT_EQ(reporter.num_calls(), 1);
+
+  TFL_DeleteInterpreter(interpreter);
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/experimental/c/exported_symbols.lds b/tensorflow/lite/experimental/c/exported_symbols.lds
similarity index 100%
rename from tensorflow/contrib/lite/experimental/c/exported_symbols.lds
rename to tensorflow/lite/experimental/c/exported_symbols.lds
diff --git a/tensorflow/contrib/lite/experimental/c/version_script.lds b/tensorflow/lite/experimental/c/version_script.lds
similarity index 100%
rename from tensorflow/contrib/lite/experimental/c/version_script.lds
rename to tensorflow/lite/experimental/c/version_script.lds
diff --git a/tensorflow/lite/experimental/examples/lstm/BUILD b/tensorflow/lite/experimental/examples/lstm/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0c351ee4eccee515ed34ec5e8607914f7064ffbf
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/BUILD
@@ -0,0 +1,40 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "tflite_lstm",
+    srcs = ["tflite_lstm.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "unidirectional_sequence_lstm_test",
+    size = "large",
+    srcs = ["unidirectional_sequence_lstm_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+    deps = [
+        ":tflite_lstm",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/tools:optimize_for_inference",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py b/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fe8ebf9e99f8b0e592e83c2e473dd2f8395c6c0
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
@@ -0,0 +1,396 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TfLite LSTMCell wrapper.
+
+TODO(renjieliu): Find a better home for this one.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+
+from tensorflow.lite.python import lite
+from tensorflow.python.keras import activations
+from tensorflow.python.keras import initializers
+from tensorflow.python.layers import base as base_layer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.platform import tf_logging as logging
+
+
+class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
+  """Long short-term memory unit (LSTM) recurrent network cell.
+
+  This is used only for TfLite, it provides hints and it also makes the
+  variables in the desired for the tflite ops  (transposed and seaparated).
+
+  The default non-peephole implementation is based on:
+
+    https://pdfs.semanticscholar.org/1154/0131eae85b2e11d53df7f1360eeb6476e7f4.pdf
+
+  Felix Gers, Jurgen Schmidhuber, and Fred Cummins.
+  "Learning to forget: Continual prediction with LSTM." IET, 850-855, 1999.
+
+  The peephole implementation is based on:
+
+    https://research.google.com/pubs/archive/43905.pdf
+
+  Hasim Sak, Andrew Senior, and Francoise Beaufays.
+  "Long short-term memory recurrent neural network architectures for
+   large scale acoustic modeling." INTERSPEECH, 2014.
+
+  The class uses optional peep-hole connections, optional cell clipping, and
+  an optional projection layer.
+
+  Note that this cell is not optimized for performance. Please use
+  `tf.contrib.cudnn_rnn.CudnnLSTM` for better performance on GPU, or
+  `tf.contrib.rnn.LSTMBlockCell` and `tf.contrib.rnn.LSTMBlockFusedCell` for
+  better performance on CPU.
+  """
+
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               cell_clip=None,
+               initializer=None,
+               num_proj=None,
+               proj_clip=None,
+               num_unit_shards=None,
+               num_proj_shards=None,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None):
+    """Initialize the parameters for an LSTM cell.
+
+    Args:
+      num_units: int, The number of units in the LSTM cell.
+      use_peepholes: bool, set True to enable diagonal/peephole connections.
+      cell_clip: (optional) A float value, if provided the cell state is clipped
+        by this value prior to the cell output activation.
+      initializer: (optional) The initializer to use for the weight and
+        projection matrices.
+      num_proj: (optional) int, The output dimensionality for the projection
+        matrices.  If None, no projection is performed.
+      proj_clip: (optional) A float value.  If `num_proj > 0` and `proj_clip` is
+        provided, then the projected values are clipped elementwise to within
+        `[-proj_clip, proj_clip]`.
+      num_unit_shards: Deprecated, will be removed by Jan. 2017. Use a
+        variable_scope partitioner instead.
+      num_proj_shards: Deprecated, will be removed by Jan. 2017. Use a
+        variable_scope partitioner instead.
+      forget_bias: Biases of the forget gate are initialized by default to 1 in
+        order to reduce the scale of forgetting at the beginning of the
+        training. Must set it manually to `0.0` when restoring from CudnnLSTM
+        trained checkpoints.
+      state_is_tuple: If True, accepted and returned states are 2-tuples of the
+        `c_state` and `m_state`.  If False, they are concatenated along the
+        column axis.  This latter behavior will soon be deprecated.
+      activation: Activation function of the inner states.  Default: `tanh`.
+      reuse: (optional) Python boolean describing whether to reuse variables in
+        an existing scope.  If not `True`, and the existing scope already has
+        the given variables, an error is raised.
+      name: String, the name of the layer. Layers with the same name will share
+        weights, but to avoid mistakes we require reuse=True in such cases.
+      dtype: Default dtype of the layer (default of `None` means use the type of
+        the first input). Required when `build` is called before `call`.  When
+        restoring from CudnnLSTM-trained checkpoints, use
+        `CudnnCompatibleLSTMCell` instead.
+    """
+    super(TFLiteLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
+    # TODO(raziel): decide if we want to just support tuples (yes please!).
+    if not state_is_tuple:
+      logging.warn(
+          "%s: Using a concatenated state is slower and will soon be "
+          "deprecated.  Use state_is_tuple=True.", self)
+    if num_unit_shards is not None or num_proj_shards is not None:
+      logging.warn(
+          "%s: The num_unit_shards and proj_unit_shards parameters are "
+          "deprecated and will be removed in Jan 2017.  "
+          "Use a variable scope with a partitioner instead.", self)
+
+    # Inputs must be 2-dimensional.
+    # TODO(raziel): layers stuff -- chop if un-layerizing Op.
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
+    self._tflite_wrapper = lite.OpHint("UnidirectionalSequenceLstm")
+
+    self._num_units = num_units
+    self._use_peepholes = use_peepholes
+    self._cell_clip = cell_clip
+    self._initializer = initializer
+    self._num_proj = num_proj
+    self._proj_clip = proj_clip
+    self._num_unit_shards = num_unit_shards
+    self._num_proj_shards = num_proj_shards
+    self._forget_bias = forget_bias
+    self._state_is_tuple = state_is_tuple
+    self._activation = activation or math_ops.tanh
+
+    self._output_size = num_proj if num_proj else num_units
+    self._state_size = (
+        tf.nn.rnn_cell.LSTMStateTuple(num_units, self._output_size)
+        if state_is_tuple else num_units + self._output_size)
+
+  @property
+  def state_size(self):
+    return self._state_size
+
+  @property
+  def output_size(self):
+    return self._output_size
+
+  def build(self, inputs_shape):
+    """Build TfLite LSTM cell graph.
+
+    Args:
+      inputs_shape: The inputs_shape must be known, and is [batch_size,
+        input_size] shape.
+
+    Raises:
+      ValueError: if the inputs_shape is invalid.
+    """
+    if len(inputs_shape) != 2 or inputs_shape[1].value is None:
+      raise ValueError("Invalid inputs_shape, saw shape: %s" % inputs_shape)
+
+    input_depth = inputs_shape[1].value
+    maybe_partitioner = (
+        partitioned_variables.fixed_size_partitioner(self._num_unit_shards)
+        if self._num_unit_shards is not None else None)
+    input_weight_shape = [self._num_units, input_depth]
+    cell_weight_shape = [self._num_units, self._output_size]
+    bias_shape = [self._num_units]
+
+    def add_variable_wrapped(name, shape, initializer, index, partitioner):
+      var = self.add_variable(
+          name, shape=shape, initializer=initializer, partitioner=partitioner)
+      return self._tflite_wrapper.add_input(
+          var, name="name", index_override=index)
+
+    weight_initializer = self._initializer
+    if self.dtype is None:
+      bias_initializer = init_ops.zeros_initializer
+    else:
+      bias_initializer = init_ops.zeros_initializer(dtype=self.dtype)
+
+    self.input_to_input_w = add_variable_wrapped(
+        "input_to_input_w", input_weight_shape, weight_initializer, 1,
+        maybe_partitioner)
+    self.input_to_forget_w = add_variable_wrapped(
+        "input_to_forget_w", input_weight_shape, weight_initializer, 2,
+        maybe_partitioner)
+    self.input_to_cell_w = add_variable_wrapped(
+        "input_to_cell_w", input_weight_shape, weight_initializer, 3,
+        maybe_partitioner)
+    self.input_to_output_w = add_variable_wrapped(
+        "input_to_output_w", input_weight_shape, weight_initializer, 4,
+        maybe_partitioner)
+    self.cell_to_input_w = add_variable_wrapped(
+        "cell_to_input_w", cell_weight_shape, weight_initializer, 5,
+        maybe_partitioner)
+    self.cell_to_forget_w = add_variable_wrapped(
+        "cell_to_forget_w", cell_weight_shape, weight_initializer, 6,
+        maybe_partitioner)
+    self.cell_to_cell_w = add_variable_wrapped(
+        "cell_to_cell_w", cell_weight_shape, weight_initializer, 7,
+        maybe_partitioner)
+    self.cell_to_output_w = add_variable_wrapped(
+        "cell_to_output_w", cell_weight_shape, weight_initializer, 8,
+        maybe_partitioner)
+
+    self.input_bias = add_variable_wrapped(
+        "input_bias", bias_shape, bias_initializer, 12, maybe_partitioner)
+    self.forget_bias = add_variable_wrapped(
+        "forget_bias", bias_shape, bias_initializer, 13, maybe_partitioner)
+    self.cell_bias = add_variable_wrapped(
+        "cell_bias", bias_shape, bias_initializer, 14, maybe_partitioner)
+    self.output_bias = add_variable_wrapped(
+        "output_bias", bias_shape, bias_initializer, 15, maybe_partitioner)
+
+    # index 9, 10, 11.
+    # f stands for forget, i stands for input and o stands for output.
+    if self._use_peepholes:
+      self._w_f_diag = add_variable_wrapped("w_f_diag", [self._num_units],
+                                            self._initializer, 9,
+                                            maybe_partitioner)
+      self._w_i_diag = add_variable_wrapped("w_i_diag", [self._num_units],
+                                            self._initializer, 10,
+                                            maybe_partitioner)
+      self._w_o_diag = add_variable_wrapped("w_o_diag", [self._num_units],
+                                            self._initializer, 11,
+                                            maybe_partitioner)
+
+    # index 16 for proj kernel.
+    if self._num_proj is not None:
+      maybe_proj_partitioner = (
+          partitioned_variables.fixed_size_partitioner(self._num_proj_shards)
+          if self._num_proj_shards is not None else None)
+      self._proj_kernel = add_variable_wrapped(
+          "projection/kernel", [self._num_proj, self._num_units],
+          self._initializer,
+          16,
+          partitioner=maybe_proj_partitioner)
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Run one step of LSTM.
+
+    Args:
+      inputs: input Tensor, 2D, `[batch, num_units]`.
+      state: if `state_is_tuple` is False, this must be a state Tensor, `2-D,
+        [batch, state_size]`.  If `state_is_tuple` is True, this must be a tuple
+        of state Tensors, both `2-D`, with column sizes `c_state` and `m_state`.
+
+    Returns:
+      A tuple containing:
+
+      - A `2-D, [batch, output_dim]`, Tensor representing the output of the
+        LSTM after reading `inputs` when previous state was `state`.
+        Here output_dim is:
+           num_proj if num_proj was set,
+           num_units otherwise.
+      - Tensor(s) representing the new state of LSTM after reading `inputs` when
+        the previous state was `state`.  Same type and shape(s) as `state`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    inputs = self._tflite_wrapper.add_input(
+        inputs, tag="input", name="input", aggregate="stack", index_override=0)
+
+    # Make sure inputs and bias_initializer has the same type.
+    assert inputs.dtype == self.input_to_input_w.dtype
+
+    num_proj = self._num_units if self._num_proj is None else self._num_proj
+    sigmoid = math_ops.sigmoid
+
+    if self._state_is_tuple:
+      (c_prev, m_prev) = state
+    else:
+      c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
+      m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
+
+    # Note: For TfLite, cell_state is at index 19 while activation state at
+    # index 18.
+    c_prev = self._tflite_wrapper.add_input(
+        c_prev,
+        tag="c_prev",
+        name="c_prev",
+        aggregate="first",
+        index_override=19)
+    m_prev = self._tflite_wrapper.add_input(
+        m_prev,
+        tag="m_prev",
+        name="m_prev",
+        aggregate="first",
+        index_override=18)
+
+    input_size = inputs.get_shape().with_rank(2)[1]
+    if input_size.value is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+    inputs_and_m_prev = array_ops.concat([inputs, m_prev], axis=1)
+
+    # i stands for input gate.
+    # f stands for forget gate activation.
+    # o outputs.
+    # j output of LSTM unit.
+    # c is the final state.
+    # m is the output.
+    i = nn_ops.bias_add(
+        tf.matmul(
+            inputs_and_m_prev,
+            tf.concat([self.input_to_input_w, self.cell_to_input_w], axis=1),
+            transpose_b=True), self.input_bias)
+    f = nn_ops.bias_add(
+        tf.matmul(
+            inputs_and_m_prev,
+            tf.concat([self.input_to_forget_w, self.cell_to_forget_w], axis=1),
+            transpose_b=True), self.forget_bias)
+    o = nn_ops.bias_add(
+        tf.matmul(
+            inputs_and_m_prev,
+            tf.concat([self.input_to_output_w, self.cell_to_output_w], axis=1),
+            transpose_b=True), self.output_bias)
+    j = nn_ops.bias_add(
+        tf.matmul(
+            inputs_and_m_prev,
+            tf.concat([self.input_to_cell_w, self.cell_to_cell_w], axis=1),
+            transpose_b=True), self.cell_bias)
+
+    # Diagonal connections
+    if self._use_peepholes:
+      c = (
+          sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * c_prev +
+          sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
+    else:
+      c = (
+          sigmoid(f + self._forget_bias) * c_prev +
+          sigmoid(i) * self._activation(j))
+
+    if self._cell_clip is not None:
+      # pylint: disable=invalid-unary-operand-type
+      c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
+      # pylint: enable=invalid-unary-operand-type
+    if self._use_peepholes:
+      m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
+    else:
+      m = sigmoid(o) * self._activation(c)
+
+    if self._num_proj is not None:
+      transposed_proj_kernel = tf.transpose(self._proj_kernel)
+      m = math_ops.matmul(m, transposed_proj_kernel)
+
+      if self._proj_clip is not None:
+        # pylint: disable=invalid-unary-operand-type
+        m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
+        # pylint: enable=invalid-unary-operand-type
+
+    c = self._tflite_wrapper.add_output(
+        c, tag="c", name="c", aggregate="last", index_override=1)
+    m = self._tflite_wrapper.add_output(
+        m, tag="m", name="m", index_override=2, aggregate="stack")
+
+    new_state = (
+        tf.nn.rnn_cell.LSTMStateTuple(c, m)
+        if self._state_is_tuple else array_ops.concat([c, m], 1))
+    return m, new_state
+
+  def get_config(self):
+    config = {
+        "num_units": self._num_units,
+        "use_peepholes": self._use_peepholes,
+        "cell_clip": self._cell_clip,
+        "initializer": initializers.serialize(self._initializer),
+        "num_proj": self._num_proj,
+        "proj_clip": self._proj_clip,
+        "num_unit_shards": self._num_unit_shards,
+        "num_proj_shards": self._num_proj_shards,
+        "forget_bias": self._forget_bias,
+        "state_is_tuple": self._state_is_tuple,
+        "activation": activations.serialize(self._activation),
+        "reuse": self._reuse,
+    }
+    base_config = super(TFLiteLSTMCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeb48d123113c5924a74286ad1e0851eb484cdb8
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
@@ -0,0 +1,227 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.tools import optimize_for_inference_lib
+
+# Number of steps to train model.
+TRAIN_STEPS = 1
+
+CONFIG = tf.ConfigProto(device_count={"GPU": 0})
+
+
+class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    tf.reset_default_graph()
+    # Import MNIST dataset
+    self.mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
+
+    # Define constants
+    # Unrolled through 28 time steps
+    self.time_steps = 28
+    # Rows of 28 pixels
+    self.n_input = 28
+    # Learning rate for Adam optimizer
+    self.learning_rate = 0.001
+    # MNIST is meant to be classified in 10 classes(0-9).
+    self.n_classes = 10
+    # Batch size
+    self.batch_size = 16
+    # Lstm Units.
+    self.num_units = 16
+
+  def buildLstmLayer(self):
+    return tf.nn.rnn_cell.MultiRNNCell([
+        TFLiteLSTMCell(
+            self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"),
+        TFLiteLSTMCell(self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
+        TFLiteLSTMCell(
+            self.num_units // 2,
+            use_peepholes=True,
+            num_proj=8,
+            forget_bias=0,
+            name="rnn3"),
+        TFLiteLSTMCell(self.num_units, forget_bias=0, name="rnn4")
+    ])
+
+  def buildModel(self, lstm_layer, is_dynamic_rnn, is_train):
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(
+        tf.random_normal([self.num_units, self.n_classes]))
+    out_bias = tf.Variable(tf.random_normal([self.n_classes]))
+
+    # input image placeholder
+    x = tf.placeholder(
+        "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
+
+    # For dynamic_rnn, train with dynamic_rnn and inference with static_rnn.
+    # x is shaped [batch_size,time_steps,num_inputs]
+    if is_dynamic_rnn:
+      if is_train:
+        lstm_input = x
+        outputs, _ = tf.nn.dynamic_rnn(lstm_layer, lstm_input, dtype="float32")
+        outputs = tf.unstack(outputs, axis=1)
+      else:
+        lstm_input = tf.unstack(x, self.time_steps, 1)
+        outputs, _ = tf.nn.static_rnn(lstm_layer, lstm_input, dtype="float32")
+    else:
+      lstm_input = tf.unstack(x, self.time_steps, 1)
+      outputs, _ = tf.nn.static_rnn(lstm_layer, lstm_input, dtype="float32")
+
+    # Compute logits by multiplying outputs[-1] of shape [batch_size,num_units]
+    # by the softmax layer's out_weight of shape [num_units,n_classes]
+    # plus out_bias
+    prediction = tf.matmul(outputs[-1], out_weights) + out_bias
+    output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
+
+    return x, prediction, output_class
+
+  def trainModel(self, x, prediction, output_class, sess):
+    # input label placeholder
+    y = tf.placeholder("float", [None, self.n_classes])
+    # Loss function
+    loss = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
+    # Optimization
+    opt = tf.train.AdamOptimizer(
+        learning_rate=self.learning_rate).minimize(loss)
+
+    # Initialize variables
+    init = tf.global_variables_initializer()
+    sess.run(init)
+    for _ in range(TRAIN_STEPS):
+      batch_x, batch_y = self.mnist.train.next_batch(
+          batch_size=self.batch_size, shuffle=False)
+
+      batch_x = batch_x.reshape((self.batch_size, self.time_steps,
+                                 self.n_input))
+      sess.run(opt, feed_dict={x: batch_x, y: batch_y})
+
+  def saveAndRestoreModel(self, lstm_layer, sess, saver, is_dynamic_rnn):
+    model_dir = tempfile.mkdtemp()
+    saver.save(sess, model_dir)
+
+    # Reset the graph.
+    tf.reset_default_graph()
+    x, prediction, output_class = self.buildModel(
+        lstm_layer, is_dynamic_rnn, is_train=False)
+
+    new_sess = tf.Session(config=CONFIG)
+    saver = tf.train.Saver()
+    saver.restore(new_sess, model_dir)
+    return x, prediction, output_class, new_sess
+
+  def getInferenceResult(self, x, output_class, sess):
+    b1, _ = self.mnist.train.next_batch(batch_size=1)
+    sample_input = np.reshape(b1, (1, self.time_steps, self.n_input))
+
+    expected_output = sess.run(output_class, feed_dict={x: sample_input})
+    frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, [output_class.op.name])
+    return sample_input, expected_output, frozen_graph
+
+  def tfliteInvoke(self, graph, test_inputs, outputs):
+    tf.reset_default_graph()
+    # Turn the input into placeholder of shape 1
+    tflite_input = tf.placeholder(
+        "float", [1, self.time_steps, self.n_input], name="INPUT_IMAGE_LITE")
+    tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
+    with tf.Session() as sess:
+      curr = sess.graph_def
+      curr = convert_op_hints_to_stubs(graph_def=curr)
+
+    curr = optimize_for_inference_lib.optimize_for_inference(
+        curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
+        [tf.float32.as_datatype_enum])
+
+    tflite = tf.lite.toco_convert(
+        curr, [tflite_input], [outputs], allow_custom_ops=False)
+    interpreter = tf.lite.Interpreter(model_content=tflite)
+
+    try:
+      interpreter.allocate_tensors()
+    except ValueError:
+      assert False
+
+    input_index = (interpreter.get_input_details()[0]["index"])
+    interpreter.set_tensor(input_index, test_inputs)
+    interpreter.invoke()
+    output_index = (interpreter.get_output_details()[0]["index"])
+    result = interpreter.get_tensor(output_index)
+    # Reset all variables so it will not pollute other inferences.
+    interpreter.reset_all_variables()
+    return result
+
+  def testStaticRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildLstmLayer(), is_dynamic_rnn=False, is_train=True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildLstmLayer(), sess, saver, is_dynamic_rnn=False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildLstmLayer(), is_dynamic_rnn=True, is_train=True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    # Since we don't yet support OpHints for dynamic, we will load the model
+    # back in as a static model. This requires the variables to have the same
+    # names as if they were trained as a static. Thus, we get rid of while/rnn
+    # names.
+    variables_to_save = {}
+    for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
+      op_name = i.name
+      if op_name.startswith("while/rnn/"):
+        op_name = op_name.split("while/rnn/")[1]
+      if op_name.endswith(":0"):
+        op_name = op_name.split(":0")[0]
+      variables_to_save[op_name] = i
+    saver = tf.train.Saver(variables_to_save)
+
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildLstmLayer(), sess, saver, is_dynamic_rnn=True)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset
diff --git a/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8c85ebfb63885fae628b766ab919cdb9832903b0
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md
@@ -0,0 +1,29 @@
+# TF Lite Experimental Unity Plugin
+
+This directory contains an experimental sample Unity (2017) Plugin, based on
+the experimental TF Lite C API. The sample demonstrates running inference within
+Unity by way of a C# `Interpreter` wrapper.
+
+Note that the native TF Lite plugin(s) *must* be built before using the Unity
+Plugin, and placed in Assets/TensorFlowLite/SDK/Plugins/. For the editor (note
+that this has only been tested on Linux; the syntax may differ on Mac/Windows):
+
+```sh
+bazel build -c opt --cxxopt=--std=c++11 \
+  //tensorflow/lite/experimental/c:libtensorflowlite_c.so
+```
+
+and for Android:
+
+```sh
+bazel build -c opt --cxxopt=--std=c++11 \
+  --crosstool_top=//external:android/crosstool \
+  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+  --cpu=armeabi-v7a \
+  //tensorflow/lite/experimental/c:libtensorflowlite_c.so
+```
+
+If you encounter issues with native plugin discovery on Mac ("Darwin")
+platforms, try renaming `libtensorflowlite_c.so` to `tensorflowlite_c.bundle`.
+Similarly, on Windows you'll likely need to rename `libtensorflowlite_c.so` to
+`tensorflowlite_c.dll`.
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json b/tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json
similarity index 100%
rename from tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json
rename to tensorflow/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json
diff --git a/tensorflow/lite/experimental/kernels/BUILD b/tensorflow/lite/experimental/kernels/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..dd314545cb6488ea2a76494df39b4b69e92eca33
--- /dev/null
+++ b/tensorflow/lite/experimental/kernels/BUILD
@@ -0,0 +1,85 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+# ctc support classes imported directly from TensorFlow.
+cc_library(
+    name = "ctc_utils",
+    hdrs = [
+        "ctc_beam_entry.h",
+        "ctc_beam_scorer.h",
+        "ctc_beam_search.h",
+        "ctc_decoder.h",
+        "ctc_loss_util.h",
+    ],
+    deps = [
+        ":top_n",
+        "//tensorflow/lite/kernels/internal:types",
+        "//third_party/eigen3",
+    ],
+)
+
+# top_n support classes imported directly from TensorFlow.
+cc_library(
+    name = "top_n",
+    hdrs = [
+        "top_n.h",
+    ],
+    deps = [
+        "//tensorflow/lite/kernels/internal:types",
+    ],
+)
+
+cc_library(
+    name = "experimental_ops",
+    srcs = [
+        "ctc_beam_search_decoder.cc",
+    ],
+    # Suppress warnings that are introduced by Eigen Tensor.
+    copts = tflite_copts() + [
+        "-Wno-error=reorder",
+    ] + select({
+        "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"],
+        "//conditions:default": [
+        ],
+    }),
+    deps = [
+        ":ctc_utils",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:gemm_support",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/kernels:op_macros",
+        "//tensorflow/lite/kernels/internal:kernel_utils",
+        "//tensorflow/lite/kernels/internal:optimized",
+        "//tensorflow/lite/kernels/internal:optimized_base",
+        "//tensorflow/lite/kernels/internal:quantization_util",
+        "//tensorflow/lite/kernels/internal:reference_base",
+        "//tensorflow/lite/kernels/internal:tensor",
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "ctc_beam_search_decoder_test",
+    size = "small",
+    srcs = ["ctc_beam_search_decoder_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":experimental_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h b/tensorflow/lite/experimental/kernels/ctc_beam_entry.h
similarity index 94%
rename from tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h
rename to tensorflow/lite/experimental/kernels/ctc_beam_entry.h
index a60ff2a1c53f1b3f9f490ab5cf2bc429ba09dff0..70fbefa2ba52c6dfc23987479b9e30b5a2ae7a57 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h
+++ b/tensorflow/lite/experimental/kernels/ctc_beam_entry.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Copied from tensorflow/core/util/ctc/ctc_beam_entry.h
 // TODO(b/111524997): Remove this file.
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_ENTRY_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_ENTRY_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_ENTRY_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_ENTRY_H_
 
 #include <algorithm>
 #include <memory>
@@ -24,7 +24,7 @@ limitations under the License.
 #include <vector>
 
 #include "third_party/eigen3/Eigen/Core"
-#include "tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h"
+#include "tensorflow/lite/experimental/kernels/ctc_loss_util.h"
 
 namespace tflite {
 namespace experimental {
@@ -147,4 +147,4 @@ class BeamComparer {
 }  // namespace experimental
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_ENTRY_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_ENTRY_H_
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_scorer.h b/tensorflow/lite/experimental/kernels/ctc_beam_scorer.h
similarity index 91%
rename from tensorflow/contrib/lite/experimental/kernels/ctc_beam_scorer.h
rename to tensorflow/lite/experimental/kernels/ctc_beam_scorer.h
index ec60e26257b0f4126e7a7abed6a663abe277ef12..202b2af28ee14f00ca02a51387b78990157c88af 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_scorer.h
+++ b/tensorflow/lite/experimental/kernels/ctc_beam_scorer.h
@@ -23,10 +23,10 @@ limitations under the License.
 
 // Copied from tensorflow/core/util/ctc/ctc_beam_scorer.h
 // TODO(b/111524997): Remove this file.
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SCORER_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SCORER_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SCORER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SCORER_H_
 
-#include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h"
+#include "tensorflow/lite/experimental/kernels/ctc_beam_entry.h"
 
 namespace tflite {
 namespace experimental {
@@ -76,4 +76,4 @@ class BaseBeamScorer {
 }  // namespace experimental
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SCORER_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SCORER_H_
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h b/tensorflow/lite/experimental/kernels/ctc_beam_search.h
similarity index 96%
rename from tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h
rename to tensorflow/lite/experimental/kernels/ctc_beam_search.h
index 7c5099235a4e3263d206c091ff3a5335f5f1eb36..1cc3ab7605ec3b86227778afaf30e7d2a87c5844 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h
+++ b/tensorflow/lite/experimental/kernels/ctc_beam_search.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Copied from tensorflow/core/util/ctc/ctc_beam_search.h
 // TODO(b/111524997): Remove this file.
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SEARCH_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SEARCH_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SEARCH_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SEARCH_H_
 
 #include <algorithm>
 #include <cmath>
@@ -25,12 +25,12 @@ limitations under the License.
 #include <vector>
 
 #include "third_party/eigen3/Eigen/Core"
-#include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_entry.h"
-#include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_scorer.h"
-#include "tensorflow/contrib/lite/experimental/kernels/ctc_decoder.h"
-#include "tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h"
-#include "tensorflow/contrib/lite/experimental/kernels/top_n.h"
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/experimental/kernels/ctc_beam_entry.h"
+#include "tensorflow/lite/experimental/kernels/ctc_beam_scorer.h"
+#include "tensorflow/lite/experimental/kernels/ctc_decoder.h"
+#include "tensorflow/lite/experimental/kernels/ctc_loss_util.h"
+#include "tensorflow/lite/experimental/kernels/top_n.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 
 namespace tflite {
 namespace experimental {
@@ -429,4 +429,4 @@ bool CTCBeamSearchDecoder<CTCBeamState, CTCBeamComparer>::TopPaths(
 }  // namespace experimental
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SEARCH_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_BEAM_SEARCH_H_
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc b/tensorflow/lite/experimental/kernels/ctc_beam_search_decoder.cc
similarity index 96%
rename from tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc
rename to tensorflow/lite/experimental/kernels/ctc_beam_search_decoder.cc
index b1ebe4a804a971043d19b588f07ffc54b1d1aa38..9b1a05ee6e77c4886d6d7ac665294e6249b269a4 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder.cc
+++ b/tensorflow/lite/experimental/kernels/ctc_beam_search_decoder.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/experimental/kernels/ctc_beam_search.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/kernels/ctc_beam_search.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc b/tensorflow/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
rename to tensorflow/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
index 942dbbbeae553ba55ea75b3257aca28b9b12eb77..572b56f1225ccc8c7da86fe51c549012a1c34770 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
+++ b/tensorflow/lite/experimental/kernels/ctc_beam_search_decoder_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_decoder.h b/tensorflow/lite/experimental/kernels/ctc_decoder.h
similarity index 94%
rename from tensorflow/contrib/lite/experimental/kernels/ctc_decoder.h
rename to tensorflow/lite/experimental/kernels/ctc_decoder.h
index 596ad4a5f7264ae24caa5592d10c09c256629b06..1ceb3f7de47667a42975f24d805bcd8028c871de 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_decoder.h
+++ b/tensorflow/lite/experimental/kernels/ctc_decoder.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Copied from tensorflow/core/util/ctc/ctc_decoder.h
 // TODO(b/111524997): Remove this file.
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_DECODER_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_DECODER_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_DECODER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_DECODER_H_
 
 #include <memory>
 #include <vector>
@@ -111,4 +111,4 @@ class CTCGreedyDecoder : public CTCDecoder {
 }  // namespace experimental
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_DECODER_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_DECODER_H_
diff --git a/tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h b/tensorflow/lite/experimental/kernels/ctc_loss_util.h
similarity index 88%
rename from tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h
rename to tensorflow/lite/experimental/kernels/ctc_loss_util.h
index 0bae732533716ac047a55ea31633c8ed51253fe0..f2206dbcc07e75c985298f7d0139feafa06cfd01 100644
--- a/tensorflow/contrib/lite/experimental/kernels/ctc_loss_util.h
+++ b/tensorflow/lite/experimental/kernels/ctc_loss_util.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Copied from tensorflow/core/util/ctc/ctc_loss_util.h
 // TODO(b/111524997): Remove this file.
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_LOSS_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_LOSS_UTIL_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_LOSS_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_LOSS_UTIL_H_
 
 #include <cmath>
 #include <limits>
@@ -47,4 +47,4 @@ inline float LogSumExp(float log_prob_1, float log_prob_2) {
 }  // namespace experimental
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_CTC_LOSS_UTIL_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_CTC_LOSS_UTIL_H_
diff --git a/tensorflow/contrib/lite/experimental/kernels/top_n.h b/tensorflow/lite/experimental/kernels/top_n.h
similarity index 98%
rename from tensorflow/contrib/lite/experimental/kernels/top_n.h
rename to tensorflow/lite/experimental/kernels/top_n.h
index cd2a2f1c80276d4659ccd2f8f05af3af030acb90..4e2581cc71785c5acb4d43099cbcda66eeb8e66e 100644
--- a/tensorflow/contrib/lite/experimental/kernels/top_n.h
+++ b/tensorflow/lite/experimental/kernels/top_n.h
@@ -38,8 +38,8 @@ limitations under the License.
 
 // Copied from tensorflow/core/lib/gtl/top_n.h
 // TODO(b/111524997): Remove this file.
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_TOP_N_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_TOP_N_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_TOP_N_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_TOP_N_H_
 
 #include <stddef.h>
 #include <algorithm>
@@ -47,7 +47,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 
 namespace tflite {
 namespace gtl {
@@ -338,4 +338,4 @@ void TopN<T, Cmp>::Reset() {
 }  // namespace gtl
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_KERNELS_TOP_N_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_TOP_N_H_
diff --git a/tensorflow/lite/experimental/micro/BUILD b/tensorflow/lite/experimental/micro/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e11159868e11a09e1b10d59da274cd08ee472593
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/BUILD
@@ -0,0 +1,76 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
+cc_library(
+    name = "micro_framework",
+    srcs = [
+        "micro_error_reporter.cc",
+        "micro_interpreter.cc",
+        "micro_mutable_op_resolver.cc",
+        "simple_tensor_allocator.cc",
+    ],
+    hdrs = [
+        "compatibility.h",
+        "micro_error_reporter.h",
+        "micro_interpreter.h",
+        "micro_mutable_op_resolver.h",
+        "simple_tensor_allocator.h",
+    ],
+    deps = [
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_error_reporter_test",
+    srcs = [
+        "micro_error_reporter_test.cc",
+    ],
+    deps = [
+        ":micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_mutable_op_resolver_test",
+    srcs = [
+        "micro_mutable_op_resolver_test.cc",
+    ],
+    deps = [
+        ":micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_interpreter_test",
+    srcs = [
+        "micro_interpreter_test.cc",
+    ],
+    deps = [
+        ":micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "simple_tensor_allocator_test",
+    srcs = [
+        "simple_tensor_allocator_test.cc",
+    ],
+    deps = [
+        ":micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
diff --git a/tensorflow/lite/experimental/micro/README.md b/tensorflow/lite/experimental/micro/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..673daed74c41a1880e6f8803258033cce8d333ca
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/README.md
@@ -0,0 +1,128 @@
+# TensorFlow Lite for Microcontrollers
+
+This an experimental port of TensorFlow Lite aimed at micro controllers and other devices with only kilobytes of memory. It doesn't require any operating system support, any standard C or C++ libraries, or dynamic memory allocation, so it's designed to be portable even to 'bare metal' systems. The core runtime fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword detection model, takes up a total of 22KB.
+
+The design goals are for the framework to be:
+
+- **Readable**: We want embedded software engineers to be able to understand what's required to run ML inference without having to study research papers. We've tried to keep the code base small, modular, and have reference implementations of all operations to help with this.
+
+- **Easy to modify**: We know that there are a lot of different platforms and requirements in the embedded world, and we don't expect to cover all of them in one framework. Instead, we're hoping that it can be a good starting point for developers to build on top of to meet their own needs. For example, we tried to make it easy to replace the implementations of key computational operators that are often crucial for performance, without having to touch the data flow and other runtime code. We want it to make more sense to use our workflow to handle things like model import and less-important operations, and customize the parts that matter, rather than having to reimplement everything in your own engine.
+
+- **Well-tested**: If you're modifying code, you need to know if your changes are correct. Having an easy way to test lets you develop much faster. To help there, we've written tests for all the components, and we've made sure that the tests can be run on almost any platform, with no dependencies apart from the ability to log text to a debug console somewhere. We also provide an easy way to run all the tests on-device as part of an automated test framework, and we use qemu/Renode emulation so that tests can be run even without physical devices present.
+
+- **Easy to integrate**: We want to be as open a system as possible, and use the best code available for each platform. To do that, we're going to rely on projects like [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html), [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to handle as much performance-critical code as possible. We know that there are an increasing number of options to accelerate neural networks on microcontrollers, so we're aiming to be a good host for deploying those hardware technologies too.
+
+- **Compatible**: We're using the same file schema, interpreter API, and kernel interface as regular TensorFlow Lite, so we leverage the large existing set of tools, documentation, and examples for the project. The biggest barrier to deploying ML models is getting them from a training environment into a form that's easy to run inference on, so we see reusing this rich ecosystem as being crucial to being easily usable. We also hope to integrate this experimental work back into the main codebase in the future.
+
+To meet those goals, we've made some tradeoffs:
+
+- **Simple C++**: To help with readability, our code is written in a modern version of C++, but we generally treat it as a "better C", rather relying on more complex features such as template meta-programming. As mentioned earlier, we avoid any use of dynamic memory allocation (new/delete) or the standard C/C++ libraries, so we believe this should still be fairly portable. It does mean that some older devices with C-only toolchains won't be supported, but we're hoping that the reference operator implementations (which are simple C-like functions) can still be useful in those cases. The interfaces are also designed to be C-only, so it should be possible to integrate the resulting library with pure C projects.
+
+- **Interpreted**: Code generation is a popular pattern for embedded code, because it gives standalone code that's easy to modify and step through, but we've chosen to go with an interpreted approach. In our internal microcontroller work we've found that using an extremely stripped-down interpreter with almost no dependencies gives us a lot of the same advantages, but is easier to maintain. For example, when new updates come out for the underlying library, you can just merge your local modifications in a single step, rather than having to regenerate new code and then patch in any changes you subsequently made. The coarse granularity of the interpreted primitives means that each operation call typically takes hundreds of thousands of instruction cycles at least, so we don't see noticeable performance gains from avoiding what's essentially a single switch statement at the interpreter level to call each operation. We're still working on improving the packaging though, for example we're considering having the ability to snapshot all the source files and headers used for a particular model, being able to compile the code and data together as a library, and then access it through a minimal set of C interface calls which hide the underlying complexity.
+
+- **Flatbuffers**: We represent our models using [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs), with the difference that we always keep it in read-only program memory (typically flash) rather than relying on having a file system to read it from. This is a good fit because flatbuffer's serialized format is designed to be mapped into memory without requiring any extra memory allocations or modifications to access it. All of the functions to read model values work directly on the serialized bytes, and large sections of data like weights are directly accessible as sequential C-style arrays of their data type, with no strides or unpacking needed. We do get a lot of value from using flatbuffers, but there is a cost in complexity. The flat buffer library code is all inline [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h), but it isn't straightforward to inspect their implementations, and the model data structures aren't easy to comprehend from the debugger. The header for the schema itself also has to be periodically updated when new information is added to the file format, though we try to handle that transparently for most developers by checking in a pre-generated version.
+
+- **Code Duplication**: Some of the code in this prototype largely duplicates the logic in other parts of the TensorFlow Lite code base, for example the operator wrappers. We've tried to keep share as much as we can between the two interpreters, but there are some assumptions built into the original runtime that make this difficult. We'll be working on modularizing the main interpreter so that we can move to an entirely shared system.
+
+This initial preview release is designed to get early feedback, and is not intended to be a final product. It only includes enough operations to run a simple keyword recognition model, and the implementations are not optimized. We're hoping this will be a good way to get feedback and collaborate to improve the framework.
+
+## Getting Started
+
+Building requires a Linux or OS X machine.
+
+ - Open a terminal
+ - Download the TensorFlow source with `git clone https://github.com/tensorflow`
+ - Enter the source root directory by running `cd tensorflow`
+ - Download the dependencies by running `tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`. This may take a few minutes
+ - Build and test the library with `make -f tensorflow/lite/experimental/micro/tools/make/Makefile test`
+
+You should see a series of compilation steps, followed by `~~~ALL TESTS
+PASSED~~~` for the various tests of the code that it will run. If there's an
+error, you should get an informative message from make about what went wrong.
+
+These tests are all built as simple binaries with few dependencies, so you can run them manually. For example, here's how to run the depthwise convolution test, and its output:
+
+```
+tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/bin/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test
+
+Testing SimpleTest
+Testing SimpleTestQuantized
+Testing SimpleTestRelu
+Testing SimpleTestReluQuantized
+4/4 tests passed
+~ALL TESTS PASSED~~~
+```
+
+Looking at the [depthwise_conv_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc) code, you'll see a sequence that looks like this:
+
+```
+...
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(SimpleTest) {
+...
+}
+...
+TF_LITE_MICRO_TESTS_END
+```
+
+These macros work a lot like
+[the Google test framework](https://github.com/google/googletest), but they
+don't require any dependencies and just write results to stderr, rather than
+aborting the program. If all the tests pass, then `~~~ALL TESTS PASSED~~~` is
+output, and the test harness that runs the binary during the make process knows
+that everything ran correctly. If there's an error, the lack of the expected
+string lets the harness know that the test failed.
+
+So, why are we running tests in this complicated way? So far, we've been building binaries that run locally on the Mac OS or Linux machine you're building on, but this approach becomes important when we're targeting simple micro controller devices.
+
+## Building for the "Blue Pill" STM32F103
+
+The goal of this library is to enable machine learning on resource-constrained micro controllers and DSPs, and as part of that we've targeted the ["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) as a cheap and popular platform. It only has 20KB of RAM and 64KB of flash, so it's a good device to ensure we can run efficiently on small chips.
+
+It's fairly easy to [buy and wire up a physical board](https://github.com/google/stm32_bare_lib#wiring-up-your-blue-pill), but even if you don't have an actual device, the [Renode project](https://renode.io/) makes it easy to run a faithful emulation on your desktop machine. You'll need [Docker](https://www.docker.com/) installed, but once you have that set up, try running the following command:
+
+`make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=bluepill test`
+
+You should see a similar set of outputs as you did in the previous section, with the addition of some extra Docker logging messages. These are because we're using Docker to run the Renode micro controller emulation tool, and the tests themselves are being run on a simulated STM32F103 device. The communication channels between an embedded device and the host are quite limited, so the test harness looks at the output of the debug log to see if tests have passed, just as it did in the previous section. This makes it a very flexible way to run cross-platform tests, even when a platform has no operating system facilities, as long as it can output debugging text logs.
+
+To understand what's happening here, try running the same depthwise convolution test, but through the emulated device test harness, with the following command:
+
+```
+tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh \
+tensorflow/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test \
+'~~~ALL TESTS PASSED~~~'
+
+```
+
+You should see output that looks something like this:
+
+```
+Sending build context to Docker daemon   21.5kB
+Step 1/2 : FROM antmicro/renode:latest
+ ---> 1b670a243e8f
+Step 2/2 : LABEL maintainer="Pete Warden <petewarden@google.com>"
+ ---> Using cache
+ ---> 3afcd410846d
+Successfully built 3afcd410846d
+Successfully tagged renode_bluepill:latest
+LOGS:
+...
+03:27:32.4340 [INFO] machine-0: Machine started.
+03:27:32.4790 [DEBUG] cpu.uartSemihosting: [+0.22s host +0s virt 0s virt from start] Testing SimpleTest
+03:27:32.4812 [DEBUG] cpu.uartSemihosting: [+2.21ms host +0s virt 0s virt from start]   Testing SimpleTestQuantized
+03:27:32.4833 [DEBUG] cpu.uartSemihosting: [+2.14ms host +0s virt 0s virt from start]   Testing SimpleTestRelu
+03:27:32.4834 [DEBUG] cpu.uartSemihosting: [+0.18ms host +0s virt 0s virt from start]   Testing SimpleTestReluQuantized
+03:27:32.4838 [DEBUG] cpu.uartSemihosting: [+0.4ms host +0s virt 0s virt from start]   4/4 tests passed
+03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+41µs host +0s virt 0s virt from start]   ~~~ALL TESTS PASSED~~~
+03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+5µs host +0s virt 0s virt from start]   
+...
+tensorflow/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test: PASS
+```
+
+There's a lot of output here, but you should be able to see that the same tests
+that were covered when we ran locally on the development machine show up in the
+debug logs here, along with the magic string `~~~ALL TESTS PASSED~~~`. This is
+the exact same code as before, just compiled and run on the STM32F103 rather
+than your desktop. We hope that the simplicity of this testing approach will
+help make adding support for new platforms as easy as possible.
diff --git a/tensorflow/lite/experimental/micro/compatibility.h b/tensorflow/lite/experimental/micro/compatibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fa91644bdd64e3b96c7935147593a12eca98487
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/compatibility.h
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_COMPATIBILITY_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_COMPATIBILITY_H_
+
+// C++ will automatically create class-specific delete operators for virtual
+// objects, which by default call the global delete function. For embedded
+// applications we want to avoid this, and won't be calling new/delete on these
+// objects, so we need to override the default implementation with one that does
+// nothing to avoid linking in ::delete().
+// This macro needs to be included in all subclasses of a virtual base class in
+// the private section.
+#ifdef TF_LITE_STATIC_MEMORY
+#define TF_LITE_REMOVE_VIRTUAL_DELETE \
+  void operator delete(void* p) {}
+#else
+#define TF_LITE_REMOVE_VIRTUAL_DELETE
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_COMPATIBILITY_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..799b2e5a5dd097c6e017f574449d339992f7c41b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
@@ -0,0 +1,243 @@
+# Description:
+#   TensorFlow Lite microcontroller example.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
+cc_library(
+    name = "model_settings",
+    srcs = [
+        "model_settings.cc",
+    ],
+    hdrs = [
+        "model_settings.h",
+    ],
+)
+
+cc_library(
+    name = "tiny_conv_model_data",
+    srcs = [
+        "tiny_conv_model_data.cc",
+    ],
+    hdrs = [
+        "tiny_conv_model_data.h",
+    ],
+)
+
+cc_library(
+    name = "features_test_data",
+    srcs = [
+        "no_features_data.cc",
+        "yes_features_data.cc",
+    ],
+    hdrs = [
+        "no_features_data.h",
+        "yes_features_data.h",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_speech_test",
+    srcs = [
+        "micro_speech_test.cc",
+    ],
+    deps = [
+        ":features_test_data",
+        ":tiny_conv_model_data",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
+        "//tensorflow/lite/experimental/micro/kernels:micro_ops",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+cc_library(
+    name = "preprocessor_test_data",
+    srcs = [
+        "no_30ms_sample_data.cc",
+        "no_power_spectrum_data.cc",
+        "yes_30ms_sample_data.cc",
+        "yes_power_spectrum_data.cc",
+    ],
+    hdrs = [
+        "no_30ms_sample_data.h",
+        "no_power_spectrum_data.h",
+        "yes_30ms_sample_data.h",
+        "yes_power_spectrum_data.h",
+    ],
+)
+
+cc_library(
+    name = "preprocessor_reference",
+    srcs = [
+        "preprocessor.cc",
+    ],
+    hdrs = [
+        "preprocessor.h",
+    ],
+    deps = [
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "preprocessor_reference_test",
+    srcs = [
+        "preprocessor_test.cc",
+    ],
+    deps = [
+        ":model_settings",
+        ":preprocessor_reference",
+        ":preprocessor_test_data",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_library(
+    name = "preprocessor_fixed",
+    srcs = [
+        "fixed_point/preprocessor.cc",
+    ],
+    hdrs = [
+        "preprocessor.h",
+    ],
+    deps = [
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "preprocessor_fixed_test",
+    srcs = [
+        "preprocessor_test.cc",
+    ],
+    deps = [
+        ":model_settings",
+        ":preprocessor_fixed",
+        ":preprocessor_test_data",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_library(
+    name = "audio_provider",
+    srcs = [
+        "audio_provider.cc",
+    ],
+    hdrs = [
+        "audio_provider.h",
+    ],
+    deps = [
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "audio_provider_test",
+    srcs = [
+        "audio_provider_test.cc",
+    ],
+    deps = [
+        ":audio_provider",
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_library(
+    name = "feature_provider",
+    srcs = [
+        "feature_provider.cc",
+    ],
+    hdrs = [
+        "feature_provider.h",
+    ],
+    deps = [
+        ":audio_provider",
+        ":model_settings",
+        ":preprocessor_reference",
+        ":timer",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "feature_provider_test",
+    srcs = [
+        "feature_provider_test.cc",
+    ],
+    deps = [
+        ":audio_provider",
+        ":feature_provider",
+        ":model_settings",
+        ":timer",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_library(
+    name = "timer",
+    srcs = [
+        "timer.cc",
+    ],
+    hdrs = [
+        "timer.h",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "timer_test",
+    srcs = [
+        "timer_test.cc",
+    ],
+    deps = [
+        ":timer",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_binary(
+    name = "micro_speech",
+    srcs = [
+        "main.cc",
+    ],
+    deps = [
+        ":audio_provider",
+        ":feature_provider",
+        ":features_test_data",
+        ":model_settings",
+        ":preprocessor_reference",
+        ":timer",
+        ":tiny_conv_model_data",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
+        "//tensorflow/lite/experimental/micro/kernels:micro_ops",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..500eed33bab0187f9b2cf9647c046f4a541b9e2c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
@@ -0,0 +1,103 @@
+# Micro Speech Example
+
+This examples shows how you can use TensorFlow Lite to run a 20 kilobyte neural network model to recognize keywords in speech. It's designed to run on systems with very small amounts of memory such as microcontrollers and DSPs. The code itself also has a small footprint (for example around 22 kilobytes on a Cortex M3) and only uses about 10 kilobytes of RAM for working memory, so it's able to run on systems like an STM32F103 with only 20 kilobytes of total SRAM and 64 kilobytes of Flash.
+
+## Table of Contents
+
+  - [Getting Started](#getting-started)
+  - [Getting Started on a Microcontroller](#getting-started-on-a-microcontroller)
+  - [Calculating the Input to the Neural Network](#calculating-the-input-to-the-neural-network)
+  - [Creating Your Own Model](#creating-your-own-model)
+
+## Getting Started
+
+To compile and test this example on a desktop Linux or MacOS machine, download [the TensorFlow source code](https://github.com/tensorflow/tensorflow), `cd` into the source directory from a terminal, and then retrieve the support libraries you need by running:
+
+```
+tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+```
+
+This will take a few minutes, and downloads frameworks the code uses like [CMSIS](https://developer.arm.com/embedded/cmsis) and [flatbuffers](https://google.github.io/flatbuffers/). Once that process has finished, run:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile test_micro_speech
+```
+
+You should see a series of files get compiled, followed by some logging output from a test, which should conclude with "~~~ALL TESTS PASSED~~~". If you see this, it means that a small program has been built and run that loads a trained TensorFlow model, runs some example inputs through it, and got the expected outputs. This particular test runs spectrograms generated from recordings of people saying "Yes" and "No", and checks that the network correctly identifies them.
+
+To understand how TensorFlow Lite does this, you can look at the `TestInvoke()` function in [micro_speech_test.cc](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc). It's a fairly small amount of code, creating an interpreter, getting a handle to a model that's been compiled into the program, and then invoking the interpreter with the model and sample inputs.
+
+## Getting Started on a Microcontroller
+
+Once you have downloaded the dependencies and got the x86/Linux build working, you can try building a version for the STM32F103 'bluepill' device. The following command will build the test and then run it on an emulator, assuming you have Docker installed:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=bluepill test_micro_speech
+```
+
+If you have a real device [(see here for how to set one up)](https://github.com/google/stm32_bare_lib/tree/master/README.md) you can then convert the ELF file into a  a `.bin` format executable to load onto it by running:
+
+```
+arm-none-eabi-objcopy \
+tensorflow/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/micro_speech_test \
+tensorflow/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/micro_speech_test.bin \
+--output binary
+```
+
+## Calculating the Input to the Neural Network
+
+The TensorFlow Lite model doesn't take in raw audio sample data. Instead it works with spectrograms, which are two dimensional arrays that are made up of slices of frequency information, each taken from a different time window. This test uses spectrograms that have been pre-calculated from one-second WAV files in the test data set. In a complete application these spectrograms would be calculated at runtime from microphone inputs, but the code for doing that is not yet included in this sample code.
+
+The recipe for creating the spectrogram data is that each frequency slice is created by running an FFT across a 30ms section of the audio sample data. The input samples are treated as being between -1 and +1 as real values (encoded as -32,768 and 32,767 in 16-bit signed integer samples). This results in an FFT with 256 entries. Every sequence of six entries is averaged together, giving a total of 43 frequency buckets in the final slice. The results are stored as unsigned eight-bit values, where 0 represents a real number of zero, and 255 represents 127.5 as a real number. Each adjacent frequency entry is stored in ascending memory order (frequency bucket 0 at data[0], bucket 1 at data [1], etc). The window for the frequency analysis is then moved forward by 20ms, and the process repeated, storing the results in the next memory row (for example bucket 0 in this moved window would be in data[43 + 0], etc). This process happens 49 times in total, producing a single channel image that is 43 pixels wide, and 49 rows high. Here's an illustration of the process:
+
+![spectrogram diagram](https://storage.googleapis.com/download.tensorflow.org/example_images/spectrogram_diagram.png)
+
+
+The test data files have been generated by running the following commands:
+
+```
+bazel run tensorflow/examples/speech_commands:wav_to_features -- \
+--input_wav=${HOME}/speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav \
+--output_c_file=yes_features_data.cc \
+--window_stride=20 --preprocess=average --quantize=1
+
+bazel run tensorflow/examples/speech_commands:wav_to_features -- \
+--input_wav=${HOME}/speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav \
+--output_c_file=no_features_data.cc \
+--window_stride=20 --preprocess=average --quantize=1
+```
+
+## Creating Your Own Model
+
+The neural network model used in this example was built using the [TensorFlow speech commands tutorial](https://www.tensorflow.org/tutorials/sequences/audio_recognition). If you would like to create your own, you can start by training a model with this command:
+
+```
+bazel run -c opt --copt=-mavx2 --copt=-mfma \
+tensorflow/examples/speech_commands:train -- \
+--model_architecture=tiny_conv --window_stride=20 --preprocess=average \
+--wanted_words="yes,no" --silence_percentage=25 --unknown_percentage=25 --quantize=1
+```
+
+If you see a compiling error on older machines, try leaving out the `--copt` arguments, they are just there to accelerate training on chips that support the extensions. The training process is likely to take a couple of hours. Once it has completed, the next step is to freeze the variables:
+
+```
+bazel run tensorflow/examples/speech_commands:freeze -- \
+--model_architecture=tiny_conv --window_stride=20 --preprocess=average \
+--wanted_words="yes,no" --quantize=1 --output_file=/tmp/tiny_conv.pb
+```
+
+The next step is to create a TensorFlow Lite file from the frozen graph:
+
+```
+bazel run tensorflow/lite/toco:toco -- \
+--input_file=/tmp/tiny_conv.pb --output_file=/tmp/tiny_conv.tflite \
+--input_shapes=1,49,43,1 --input_arrays=Reshape_1 --output_arrays='labels_softmax' \
+--inference_type=QUANTIZED_UINT8 --mean_values=0 --std_values=2 \
+--change_concat_input_ranges=false
+```
+
+Finally, convert the file into a C source file that can be compiled into an embedded system:
+
+```
+xxd -i /tmp/tiny_conv.tflite > /tmp/tiny_conv_model_data.cc
+```
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0365d56901b503628b323a2fe09a4fa0de9165e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+namespace {
+int16_t g_dummy_audio_data[kMaxAudioSampleSize];
+}  // namespace
+
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+    g_dummy_audio_data[i] = 0;
+  }
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_dummy_audio_data;
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e2442a5e83ee1f809f82587c816adb01dc09e5e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// This is an abstraction around an audio source like a microphone, and is
+// expected to return 16-bit PCM sample data for a given point in time. The
+// sample data itself should be used as quickly as possible by the caller, since
+// to allow memory optimizations there are no guarantees that the samples won't
+// be overwritten by new data in the future. In practice, implementations should
+// ensure that there's a reasonable time allowed for clients to access the data
+// before any reuse.
+// The reference implementation can have no platform-specific dependencies, so
+// it just returns an array filled with zeros. For real applications, you should
+// ensure there's a specialized implementation that accesses hardware APIs.
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f7c7605f0feb3fd3179a0edd5e51574b867ce68
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestAudioProvider) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  int audio_samples_size = 0;
+  int16_t* audio_samples = nullptr;
+  TfLiteStatus get_status =
+      GetAudioSamples(error_reporter, 0, kFeatureSliceDurationMs,
+                      &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+
+  // Make sure we can read all of the returned memory locations.
+  int total = 0;
+  for (int i = 0; i < audio_samples_size; ++i) {
+    total += audio_samples[i];
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4c52ac0ff3696a05192465f8ac911b5d6a83925
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
+
+namespace {
+// Stores the timestamp for the previous fetch of audio data, so that we can
+// avoid recalculating all the features from scratch if some earlier timeslices
+// are still present.
+int32_t g_last_time_in_ms = 0;
+// Make sure we don't try to use cached information if this is the first call
+// into the provider.
+bool g_is_first_run = true;
+}  // namespace
+
+FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
+    : feature_size_(feature_size), feature_data_(feature_data) {
+  // Initialize the feature data to default values.
+  for (int n = 0; n < feature_size_; ++n) {
+    feature_data_[n] = 0;
+  }
+}
+
+FeatureProvider::~FeatureProvider() {}
+
+TfLiteStatus FeatureProvider::PopulateFeatureData(
+    tflite::ErrorReporter* error_reporter, int* how_many_new_slices) {
+  if (feature_size_ != kFeatureElementCount) {
+    error_reporter->Report("Requested feature_data_ size %d doesn't match %d",
+                           feature_size_, kFeatureElementCount);
+    return kTfLiteError;
+  }
+
+  const int32_t time_in_ms = TimeInMilliseconds();
+  // Quantize the time into steps as long as each window stride, so we can
+  // figure out which audio data we need to fetch.
+  const int last_step = (g_last_time_in_ms / kFeatureSliceStrideMs);
+  const int current_step = (time_in_ms / kFeatureSliceStrideMs);
+  g_last_time_in_ms = time_in_ms;
+
+  int slices_needed = current_step - last_step;
+  // If this is the first call, make sure we don't use any cached information.
+  if (g_is_first_run) {
+    g_is_first_run = false;
+    slices_needed = kFeatureSliceCount;
+  }
+  if (slices_needed > kFeatureSliceCount) {
+    slices_needed = kFeatureSliceCount;
+  }
+  *how_many_new_slices = slices_needed;
+
+  const int slices_to_keep = kFeatureSliceCount - slices_needed;
+  const int slices_to_drop = kFeatureSliceCount - slices_to_keep;
+  // If we can avoid recalculating some slices, just move the existing data
+  // up in the spectrogram, to perform something like this:
+  // last time = 80ms          current time = 120ms
+  // +-----------+             +-----------+
+  // | data@20ms |         --> | data@60ms |
+  // +-----------+       --    +-----------+
+  // | data@40ms |     --  --> | data@80ms |
+  // +-----------+   --  --    +-----------+
+  // | data@60ms | --  --      |  <empty>  |
+  // +-----------+   --        +-----------+
+  // | data@80ms | --          |  <empty>  |
+  // +-----------+             +-----------+
+  if (slices_to_keep > 0) {
+    for (int dest_slice = 0; dest_slice < slices_to_keep; ++dest_slice) {
+      uint8_t* dest_slice_data =
+          feature_data_ + (dest_slice * kFeatureSliceSize);
+      const int src_slice = dest_slice + slices_to_drop;
+      const uint8_t* src_slice_data =
+          feature_data_ + (src_slice * kFeatureSliceSize);
+      for (int i = 0; i < kFeatureSliceSize; ++i) {
+        dest_slice_data[i] = src_slice_data[i];
+      }
+    }
+  }
+  // Any slices that need to be filled in with feature data have their
+  // appropriate audio data pulled, and features calculated for that slice.
+  if (slices_needed > 0) {
+    for (int new_slice = slices_to_keep; new_slice < kFeatureSliceCount;
+         ++new_slice) {
+      const int new_step = (current_step - kFeatureSliceCount + 1) + new_slice;
+      const int32_t slice_start_ms = (new_step * kFeatureSliceStrideMs);
+      int16_t* audio_samples = nullptr;
+      int audio_samples_size = 0;
+      GetAudioSamples(error_reporter, slice_start_ms, kFeatureSliceDurationMs,
+                      &audio_samples_size, &audio_samples);
+      if (audio_samples_size < kMaxAudioSampleSize) {
+        error_reporter->Report("Audio data size %d  too small, want %d",
+                               audio_samples_size, kMaxAudioSampleSize);
+        return kTfLiteError;
+      }
+      uint8_t* new_slice_data = feature_data_ + (new_slice * kFeatureSliceSize);
+      TfLiteStatus preprocess_status =
+          Preprocess(error_reporter, audio_samples, audio_samples_size,
+                     kFeatureSliceSize, new_slice_data);
+      if (preprocess_status != kTfLiteOk) {
+        return preprocess_status;
+      }
+    }
+  }
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
new file mode 100644
index 0000000000000000000000000000000000000000..a86c56ebf053a8807e38c42c6a7088c146a31b9e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_FEATURE_PROVIDER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_FEATURE_PROVIDER_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Binds itself to an area of memory intended to hold the input features for an
+// audio-recognition neural network model, and fills that data area with the
+// features representing the current audio input, for example from a microphone.
+// The audio features themselves are a two-dimensional array, made up of
+// horizontal slices representing the frequencies at one point in time, stacked
+// on top of each other to form a spectrogram showing how those frequencies
+// changed over time.
+class FeatureProvider {
+ public:
+  // Create the provider, and bind it to an area of memory. This memory should
+  // remain accessible for the lifetime of the provider object, since subsequent
+  // calls will fill it with feature data. The provider does no memory
+  // management of this data.
+  FeatureProvider(int feature_size, uint8_t* feature_data);
+  ~FeatureProvider();
+
+  // Fills the feature data with information from audio inputs, and returns how
+  // many feature slices were updated.
+  TfLiteStatus PopulateFeatureData(tflite::ErrorReporter* error_reporter,
+                                   int* how_many_new_slices);
+
+ private:
+  int feature_size_;
+  uint8_t* feature_data_;
+};
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_FEATURE_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e52aec8d2741678a0f79f643bb7dcf42c848a58
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestFeatureProvider) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t feature_data[kFeatureElementCount];
+  FeatureProvider feature_provider(kFeatureElementCount, feature_data);
+
+  int how_many_new_slices = 0;
+  TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
+      error_reporter, &how_many_new_slices);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
+  TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b623d8d11b75d59600cc6a029527d3957084a328
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
@@ -0,0 +1,212 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Reference implementation of the preprocessing pipeline, with the same
+// results as the audio tutorial at
+// https://www.tensorflow.org/tutorials/sequences/audio_recognition
+// This module takes 30ms of PCM-encoded signed 16-bit audio samples (at 16KHz,
+// so 480 values), and extracts a power spectrum of frequencies. There are 43
+// frequency bands in the result, derived from the original 256 output from the
+// discrete Fourier transform, and averaged together in groups of 6.
+// It's expected that most platforms will have optimized versions of the
+// functions used here, for example replacing the DFT with an FFT, so this
+// version shouldn't be used where performance is critical.
+// This implementation uses fixed point for any non-constant calculations,
+// instead of floating point, to help show how this can work on platforms that
+// don't have good float support.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+
+#include <cmath>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+namespace {
+
+// q format notation: qx.y => 1 sign bit, x-1 integer bits, y fraction bits.
+// Use standard (non-saturating) arithmetic with signed ints of size x+y bits.
+// Sacrifice some precision to avoid use of 64-bit ints.
+
+// q1.15 * q1.15 => q2.30
+inline int32_t Q1_15_FixedMultiply_Q2_30(int16_t a, int16_t b) {
+  int32_t big_a = a;
+  int32_t big_b = b;
+  return big_a * big_b;
+}
+
+// q2.30 * q2.30 => q10.22
+inline int32_t Q2_30_FixedMultiply_Q10_22(int32_t a, int32_t b) {
+  // q2.30 result
+  int32_t tmp = (a >> 15) * (b >> 15);
+  // q10.22 result
+  return tmp >> 8;
+}
+
+// q10.22 * q10.22 => q10.22
+// Will overflow if product is >= 512.
+// Largest product in small test set is 465.25
+inline int32_t Q10_22_FixedMultiply_Q10_22(int32_t a, int32_t b) {
+  // q10.22 result
+  return (a >> 11) * (b >> 11);
+}
+
+// float => q2.30
+// No checking for saturation.  Only used for inputs in range [-1, 1].
+inline int32_t FloatToFixed_Q2_30(float input) {
+  return static_cast<int32_t>(roundf(input * (1 << 30)));
+}
+
+// Performs a discrete Fourier transform on the real inputs. This corresponds to
+// rdft() in the FFT package at http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html,
+// and to kiss_fftr() in KISSFFT at https://github.com/mborgerding/kissfft.
+// It takes in an array of float real values, and returns a result of the same
+// length with q10.22 fixed point real and imaginary components interleaved, so
+// fourier_output[0] is the first real value, fourier_output[1] is the first
+// imaginary, fourier_output[2] is the second real, and so on.
+// The calling function should ensure that the array passed in as fourier_output
+// is at least time_series_size in length. Most optimized FFT implementations
+// require the length to be a power of two as well, but this version doesn't
+// enforce that.
+
+// input: q2.30 fixed point.  output: q10.22 fixed point.
+// Outputs interpreted as q10.22 fixed point are un-scaled.
+void CalculateDiscreteFourierTransform(int32_t* time_series,
+                                       int time_series_size,
+                                       int32_t* fourier_output) {
+  for (int i = 0; i < time_series_size / 2; ++i) {
+    int32_t real = 0;
+    for (int j = 0; j < time_series_size; ++j) {
+      const int32_t real_scale =
+          FloatToFixed_Q2_30(cos(j * i * M_PI * 2 / time_series_size));
+      real += Q2_30_FixedMultiply_Q10_22(time_series[j], real_scale);
+    }
+    int32_t imaginary = 0;
+    for (int j = 0; j < time_series_size; ++j) {
+      const int32_t imaginary_scale =
+          FloatToFixed_Q2_30(sin(j * i * M_PI * 2 / time_series_size));
+      imaginary -= Q2_30_FixedMultiply_Q10_22(time_series[j], imaginary_scale);
+    }
+    fourier_output[(i * 2) + 0] = real;
+    fourier_output[(i * 2) + 1] = imaginary;
+  }
+}
+
+// Produces a simple sine curve that is used to ensure frequencies at the center
+// of the current sample window are weighted more heavily than those at the end.
+// q1.15 output format.
+void CalculatePeriodicHann(int window_length, int16_t* window_function) {
+  for (int i = 0; i < window_length; ++i) {
+    const float real_value = (0.5 - 0.5 * cos((2 * M_PI * i) / window_length));
+    int tmp = static_cast<int32_t>(roundf(real_value * (1 << 15)));
+    // Saturate the 0x8000 value to 0x7fff
+    if (tmp > 0x7fff) tmp = 0x7fff;
+    window_function[i] = tmp;
+  }
+}
+
+}  // namespace
+
+TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
+                        const int16_t* input, int input_size, int output_size,
+                        uint8_t* output) {
+  // Ensure our input and output data arrays are valid.
+  if (input_size > kMaxAudioSampleSize) {
+    error_reporter->Report("Input size %d larger than %d", input_size,
+                           kMaxAudioSampleSize);
+    return kTfLiteError;
+  }
+  if (output_size != kFeatureSliceSize) {
+    error_reporter->Report("Requested output size %d doesn't match %d",
+                           output_size, kFeatureSliceSize);
+    return kTfLiteError;
+  }
+
+  // Pre-calculate the window function we'll be applying to the input data.
+  // In a real application, we'd calculate this table once in an initialization
+  // function and store it for repeated reuse.
+  // q1.15 format.
+  int16_t window_function[kMaxAudioSampleSize];
+  CalculatePeriodicHann(input_size, window_function);
+
+  // Apply the window function to our time series input, and pad it with zeroes
+  // to the next power of two.
+  int32_t fixed_input[kMaxAudioSampleSize];
+  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+    if (i < input_size) {
+      // input is int16_t.  Treat as q1.15 fixed point value in range [-1,1)
+      // window_function is also q1.15 fixed point number
+      fixed_input[i] = Q1_15_FixedMultiply_Q2_30(input[i], window_function[i]);
+    } else {
+      fixed_input[i] = 0;
+    }
+  }
+
+  // Pull the frequency data from the time series sample.
+  // Calculated in q10.22 format from q2.30 inputs.
+  int32_t fourier_values[kMaxAudioSampleSize];
+  CalculateDiscreteFourierTransform(fixed_input, kMaxAudioSampleSize,
+                                    fourier_values);
+
+  // We have the complex numbers giving us information about each frequency
+  // band, but all we want to know is how strong each frequency is, so calculate
+  // the squared magnitude by adding together the squares of each component.
+  int32_t power_spectrum[kMaxAudioSampleSize / 2];
+  for (int i = 0; i < (kMaxAudioSampleSize / 2); ++i) {
+    const int32_t real = fourier_values[(i * 2) + 0];
+    const int32_t imaginary = fourier_values[(i * 2) + 1];
+    // q10.22 results
+    power_spectrum[i] = Q10_22_FixedMultiply_Q10_22(real, real) +
+                        Q10_22_FixedMultiply_Q10_22(imaginary, imaginary);
+  }
+
+  // Finally, reduce the size of the output by averaging together six adjacent
+  // frequencies into each slot, producing an array of 43 values.
+  // Power_spectrum numbers are q10.22.  Divide by kAverageWindowSize inside
+  // loop to prevent overflow.
+  for (int i = 0; i < kFeatureSliceSize; ++i) {
+    int32_t average = 0;
+    for (int j = 0; j < kAverageWindowSize; ++j) {
+      const int index = (i * kAverageWindowSize) + j;
+      if (index < (kMaxAudioSampleSize / 2)) {
+        average += power_spectrum[index] / kAverageWindowSize;
+      }
+    }
+    // Quantize the result into eight bits, effectively multiplying by two.
+    // The 127.5 constant here has to match the features_max value defined in
+    // tensorflow/examples/speech_commands/input_data.py, and this also assumes
+    // that features_min is zero.
+    //
+    // q10.22 input
+    // integer output
+    //
+    // output = (input - features_min) *
+    //     (output_max - output_min) / (features_max - features_min)
+    // == (input) * (255) / (127.5)
+    // == input * 2
+    // == input << 1
+    // Also want to round to nearest integer and only keep integer bits
+    // => ((input << 1) + 0x200000) >> 22
+    // == (input + 0x100000) >> 21
+    int32_t quantized_average = (average + 0x100000) >> 21;
+    if (quantized_average < 0) {
+      quantized_average = 0;
+    }
+    if (quantized_average > 255) {
+      quantized_average = 255;
+    }
+    output[i] = quantized_average;
+  }
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1890c25cf2b44c96c549757b31f88255d4a9ee09
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+int main(int argc, char* argv[]) {
+  // Set up logging.
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  // Map the model into a usable data structure. This doesn't involve any
+  // copying or parsing, it's a very lightweight operation.
+  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    error_reporter->Report(
+        "Model provided is schema version %d not equal "
+        "to supported version %d.\n",
+        model->version(), TFLITE_SCHEMA_VERSION);
+    return 1;
+  }
+
+  // This pulls in all the operation implementations we need.
+  tflite::ops::micro::AllOpsResolver resolver;
+
+  // Create an area of memory to use for input, output, and intermediate arrays.
+  // The size of this will depend on the model you're using, and may need to be
+  // determined by experimentation.
+  const int tensor_arena_size = 10 * 1024;
+  uint8_t tensor_arena[tensor_arena_size];
+  tflite::SimpleTensorAllocator tensor_allocator(tensor_arena,
+                                                 tensor_arena_size);
+
+  // Build an interpreter to run the model with.
+  tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator,
+                                       error_reporter);
+
+  // Get information about the memory area to use for the model's input.
+  TfLiteTensor* model_input = interpreter.input(0);
+  if ((model_input->dims->size != 4) || (model_input->dims->data[0] != 1) ||
+      (model_input->dims->data[1] != kFeatureSliceCount) ||
+      (model_input->dims->data[2] != kFeatureSliceSize) ||
+      (model_input->type != kTfLiteUInt8)) {
+    error_reporter->Report("Bad input tensor parameters in model");
+    return 1;
+  }
+
+  // Prepare to access the audio spectrograms from a microphone or other source
+  // that will provide the inputs to the neural network.
+  FeatureProvider feature_provider(kFeatureElementCount,
+                                   model_input->data.uint8);
+
+  // Keep reading and analysing audio data in an infinite loop.
+  while (true) {
+    // Fetch the spectrogram for the current time.
+    int how_many_new_slices = 0;
+    TfLiteStatus feature_status = feature_provider.PopulateFeatureData(
+        error_reporter, &how_many_new_slices);
+    if (feature_status != kTfLiteOk) {
+      error_reporter->Report("Feature generation failed");
+      return 1;
+    }
+    // If no new audio samples have been received since last time, don't bother
+    // running the network model.
+    if (how_many_new_slices == 0) {
+      continue;
+    }
+
+    // Run the model on the spectrogram input and make sure it succeeds.
+    TfLiteStatus invoke_status = interpreter.Invoke();
+    if (invoke_status != kTfLiteOk) {
+      error_reporter->Report("Invoke failed");
+      return 1;
+    }
+
+    // The output from the model is a vector containing the scores for each
+    // kind of prediction, so figure out what the highest scoring category was.
+    TfLiteTensor* output = interpreter.output(0);
+    uint8_t top_category_score = 0;
+    int top_category_index = 0;
+    for (int category_index = 0; category_index < kCategoryCount;
+         ++category_index) {
+      const uint8_t category_score = output->data.uint8[category_index];
+      if (category_score > top_category_score) {
+        top_category_score = category_score;
+        top_category_index = category_index;
+      }
+    }
+
+    error_reporter->Report("Heard %s", kCategoryLabels[top_category_index]);
+  }
+
+  return 0;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e54ff670eb9badd648aee99cf154c0d3b988bff
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
@@ -0,0 +1,137 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestInvoke) {
+  // Set up logging.
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  // Map the model into a usable data structure. This doesn't involve any
+  // copying or parsing, it's a very lightweight operation.
+  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    error_reporter->Report(
+        "Model provided is schema version %d not equal "
+        "to supported version %d.\n",
+        model->version(), TFLITE_SCHEMA_VERSION);
+  }
+
+  // This pulls in all the operation implementations we need.
+  tflite::ops::micro::AllOpsResolver resolver;
+
+  // Create an area of memory to use for input, output, and intermediate arrays.
+  const int tensor_arena_size = 10 * 1024;
+  uint8_t tensor_arena[tensor_arena_size];
+  tflite::SimpleTensorAllocator tensor_allocator(tensor_arena,
+                                                 tensor_arena_size);
+
+  // Build an interpreter to run the model with.
+  tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator,
+                                       error_reporter);
+
+  // Get information about the memory area to use for the model's input.
+  TfLiteTensor* input = interpreter.input(0);
+
+  // Make sure the input has the properties we expect.
+  TF_LITE_MICRO_EXPECT_NE(nullptr, input);
+  TF_LITE_MICRO_EXPECT_EQ(4, input->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(49, input->dims->data[1]);
+  TF_LITE_MICRO_EXPECT_EQ(43, input->dims->data[2]);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, input->type);
+
+  // Copy a spectrogram created from a .wav audio file of someone saying "Yes",
+  // into the memory area used for the input.
+  const uint8_t* yes_features_data = g_yes_f2e59fea_nohash_1_data;
+  for (int i = 0; i < input->bytes; ++i) {
+    input->data.uint8[i] = yes_features_data[i];
+  }
+
+  // Run the model on this input and make sure it succeeds.
+  TfLiteStatus invoke_status = interpreter.Invoke();
+  if (invoke_status != kTfLiteOk) {
+    error_reporter->Report("Invoke failed\n");
+  }
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status);
+
+  // Get the output from the model, and make sure it's the expected size and
+  // type.
+  TfLiteTensor* output = interpreter.output(0);
+  TF_LITE_MICRO_EXPECT_EQ(2, output->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(4, output->dims->data[1]);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, output->type);
+
+  // There are four possible classes in the output, each with a score.
+  const int kSilenceIndex = 0;
+  const int kUnknownIndex = 1;
+  const int kYesIndex = 2;
+  const int kNoIndex = 3;
+
+  // Make sure that the expected "Yes" score is higher than the other classes.
+  uint8_t silence_score = output->data.uint8[kSilenceIndex];
+  uint8_t unknown_score = output->data.uint8[kUnknownIndex];
+  uint8_t yes_score = output->data.uint8[kYesIndex];
+  uint8_t no_score = output->data.uint8[kNoIndex];
+  TF_LITE_MICRO_EXPECT_GT(yes_score, silence_score);
+  TF_LITE_MICRO_EXPECT_GT(yes_score, unknown_score);
+  TF_LITE_MICRO_EXPECT_GT(yes_score, no_score);
+
+  // Now test with a different input, from a recording of "No".
+  const uint8_t* no_features_data = g_no_f9643d42_nohash_4_data;
+  for (int i = 0; i < input->bytes; ++i) {
+    input->data.uint8[i] = no_features_data[i];
+  }
+
+  // Run the model on this "No" input.
+  invoke_status = interpreter.Invoke();
+  if (invoke_status != kTfLiteOk) {
+    error_reporter->Report("Invoke failed\n");
+  }
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status);
+
+  // Get the output from the model, and make sure it's the expected size and
+  // type.
+  output = interpreter.output(0);
+  TF_LITE_MICRO_EXPECT_EQ(2, output->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(4, output->dims->data[1]);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, output->type);
+
+  // Make sure that the expected "No" score is higher than the other classes.
+  silence_score = output->data.uint8[kSilenceIndex];
+  unknown_score = output->data.uint8[kUnknownIndex];
+  yes_score = output->data.uint8[kYesIndex];
+  no_score = output->data.uint8[kNoIndex];
+  TF_LITE_MICRO_EXPECT_GT(no_score, silence_score);
+  TF_LITE_MICRO_EXPECT_GT(no_score, unknown_score);
+  TF_LITE_MICRO_EXPECT_GT(no_score, yes_score);
+
+  error_reporter->Report("Ran successfully\n");
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b9b8fb37b19d384fe92edf8ce2292aee19b99b7f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+const char* kCategoryLabels[kCategoryCount] = {
+    "silence",
+    "unknown",
+    "yes",
+    "no",
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d8f3123a57bc5b807d39151adaf64f29d2f5f95
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+
+// Keeping these as constant expressions allow us to allocate fixed-sized arrays
+// on the stack for our working memory.
+
+// The size of the input time series data we pass to the FFT to produce the
+// frequency information. This has to be a power of two, and since we're dealing
+// with 30ms of 16KHz inputs, which means 480 samples, this is the next value.
+constexpr int kMaxAudioSampleSize = 512;
+
+// All of these values are derived from the values used during model training,
+// if you change your model you'll need to update these constants.
+constexpr int kAverageWindowSize = 6;
+constexpr int kFeatureSliceSize =
+    ((kMaxAudioSampleSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
+constexpr int kFeatureSliceCount = 49;
+constexpr int kFeatureElementCount = (kFeatureSliceSize * kFeatureSliceCount);
+constexpr int kFeatureSliceStrideMs = 20;
+constexpr int kFeatureSliceDurationMs = 30;
+
+constexpr int kCategoryCount = 4;
+constexpr int kSilenceIndex = 0;
+constexpr int kUnknownIndex = 1;
+extern const char* kCategoryLabels[kCategoryCount];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6eaa5c2fed61fa90f66bf04f7f0c8b36520f11e4
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h"
+
+const int g_no_30ms_sample_data_size = 480;
+const int16_t g_no_30ms_sample_data[480] = {
+    5713,  5735,  5735,  5737,  5701,  5691,  5656,  5633,  5611,  5552,  5475,
+    5394,  5293,  5177,  5064,  4924,  4737,  4599,  4420,  4237,  4048,  3828,
+    3623,  3413,  3183,  2915,  2622,  2308,  1980,  1657,  1261,  901,   549,
+    205,   -85,   -383,  -688,  -969,  -1246, -1530, -1850, -2206, -2561, -2915,
+    -3224, -3482, -3713, -3921, -4107, -4287, -4470, -4660, -4850, -5057, -5239,
+    -5395, -5540, -5619, -5697, -5724, -5697, -5675, -5633, -5590, -5579, -5530,
+    -5486, -5442, -5426, -5391, -5348, -5276, -5197, -5124, -5039, -4925, -4808,
+    -4677, -4581, -4479, -4343, -4218, -4087, -3970, -3858, -3729, -3570, -3384,
+    -3206, -3020, -2839, -2636, -2453, -2287, -2185, -2154, -1926, -1562, -1223,
+    -758,  -473,  -64,   395,   599,   880,   814,   938,   1172,  1498,  1928,
+    2127,  2422,  2608,  2841,  2937,  2886,  2815,  2985,  3324,  3757,  4152,
+    4481,  4652,  4917,  4965,  4766,  4583,  4328,  4503,  4815,  5118,  5408,
+    5682,  5956,  6082,  6055,  5744,  5426,  5341,  5427,  5606,  5882,  6065,
+    6226,  6428,  6477,  6385,  6009,  5728,  5552,  5439,  5339,  5200,  5008,
+    4947,  4835,  4614,  4330,  3887,  3521,  3111,  2460,  1983,  1297,  650,
+    279,   -353,  -720,  -1044, -1518, -1668, -2117, -2496, -2743, -3266, -3607,
+    -3790, -4149, -4075, -4042, -4096, -3981, -4138, -4226, -4214, -4503, -4455,
+    -4577, -4642, -4346, -4351, -4270, -4263, -4522, -4521, -4673, -4814, -4731,
+    -4950, -5011, -5004, -5288, -5341, -5566, -5833, -5783, -5929, -5847, -5765,
+    -5828, -5644, -5613, -5615, -5428, -5291, -5014, -4554, -4277, -3964, -3854,
+    -3829, -3612, -3603, -3438, -3137, -2831, -2164, -1438, -939,  -330,  -156,
+    46,    242,   73,    242,   220,   239,   542,   565,   739,   872,   801,
+    857,   676,   543,   586,   567,   828,   1142,  1490,  1985,  2508,  2982,
+    3438,  3699,  3939,  4069,  4178,  4420,  4622,  4917,  5338,  5801,  6285,
+    6658,  6963,  7213,  7233,  7328,  7176,  7038,  7031,  6860,  6957,  6767,
+    6599,  6523,  6212,  6147,  6063,  5860,  6020,  6015,  6033,  6184,  5722,
+    5607,  5016,  4337,  4063,  3229,  3080,  3006,  2804,  3035,  2541,  2136,
+    1879,  1012,  401,   -575,  -1584, -1930, -2278, -2485, -2477, -2712, -2747,
+    -2766, -3320, -3592, -4188, -4669, -4672, -4939, -4789, -4426, -4203, -3674,
+    -3563, -3656, -3759, -4067, -4257, -4522, -4970, -5204, -5237, -5139, -4907,
+    -4911, -4917, -4921, -5007, -5230, -5654, -6122, -6464, -6733, -6948, -7067,
+    -6972, -6800, -6520, -6132, -5830, -5382, -5091, -4797, -4546, -4472, -4362,
+    -4350, -4235, -3851, -3454, -3144, -2735, -2341, -1845, -1262, -958,  -549,
+    -166,  66,    382,   366,   352,   341,   85,    -13,   -176,  -303,  -235,
+    -341,  -309,  -227,  -249,  -50,   143,   384,   874,   1149,  1552,  2155,
+    2767,  3499,  3994,  4460,  4920,  5288,  5569,  5704,  5881,  6094,  6461,
+    6653,  6803,  7115,  7311,  7521,  7612,  7443,  7380,  7124,  6742,  6495,
+    5964,  5656,  5415,  5167,  5656,  5813,  6027,  6401,  6351,  6787,  7019,
+    6581,  6512,  5965,  5308,  5140,  4336,  4147,  3899,  3398,  3360,  2830,
+    2624,  1968,  1026,  395,   -699,  -1424, -2327, -3006, -3192, -3435, -3337,
+    -3686, -3513, -3350, -3502, -3261, -3878, -4005, -4063, -4187, -3767, -3598,
+    -3384, -3300, -3094, -2857, -3023, -3274, -3851, -4352, -4523, -4943, -5477,
+    -5612, -5682, -5733, -5714, -5965, -6110, -5950, -6158, -6548, -6897, -7165,
+    -7281, -7352, -7258, -7185, -6659, -5946, -5470,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff6b874089903d2709480444a7b9ea189b51720f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was created from the PCM data in a WAV file held in v2 of the
+// Speech Commands test dataset, at the path:
+// speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav
+// The data was extracted starting at an offset of 8,960, which corresponds to
+// the 29th spectrogram slice. It's designed to be used to test the
+// preprocessing pipeline, to ensure that the expected spectrogram slice is
+// produced given this input.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_30MS_SAMPLE_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_30MS_SAMPLE_DATA_H_
+
+#include <cstdint>
+
+extern const int g_no_30ms_sample_data_size;
+extern const int16_t g_no_30ms_sample_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_30MS_SAMPLE_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e98c84f7ed2e678eb91580a2b6fb69514cee4740
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc
@@ -0,0 +1,152 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h"
+
+/* File automatically created by
+ * tensorflow/examples/speech_commands/wav_to_features.py \
+ * --sample_rate=16000 \
+ * --clip_duration_ms=1000 \
+ * --window_size_ms=30 \
+ * --window_stride_ms=20 \
+ * --feature_bin_count=40 \
+ * --quantize \
+ * --preprocess="average" \
+ * --input_wav="speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav" \
+ * --output_c_file="no_features_data.cc" \
+ */
+
+const int g_no_f9643d42_nohash_4_width = 43;
+const int g_no_f9643d42_nohash_4_height = 49;
+const unsigned char g_no_f9643d42_nohash_4_data[] = {
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   5,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   67, 2,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   139, 2,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   195, 2,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   230, 2,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  255, 7,
+    6, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 255, 7,  16, 1,   1,   0,  2, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 255, 7,   22, 0,  1,   0,
+    1, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 238, 5,   20, 3, 4,   1,  1,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  144, 4,   19, 3, 5,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  42, 6,   3,
+    1, 3,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  3, 1,   5,  0,  1,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  5, 1,   3,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    1, 0,   1,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
+    0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2ee0c46cf13b00b310bd22b7ca1cb5a9751c6e6
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
+
+extern const int g_no_f9643d42_nohash_4_width;
+extern const int g_no_f9643d42_nohash_4_height;
+extern const unsigned char g_no_f9643d42_nohash_4_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4fc5c33bb329cba4e1abcf6d36b01f14e9e2b27
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h"
+
+const uint8_t g_no_power_spectrum_data[g_no_power_spectrum_data_size] = {
+    255, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa39d3c70d78ce261db81cf8ad7c416efd2c468c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h
@@ -0,0 +1,29 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was extracted from the larger feature data held in
+// no_features_data.cc and consists of the 29th spectrogram slice of 43 values.
+// This is the expected result of running the sample data in
+// no_30ms_sample_data.cc through through the preprocessing pipeline.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
+
+#include <cstdint>
+
+constexpr int g_no_power_spectrum_data_size = 43;
+extern const uint8_t g_no_power_spectrum_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4a7f801cc6251b82339509f691fd64012fbe390
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
@@ -0,0 +1,145 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Reference implementation of the preprocessing pipeline, with the same
+// results as the audio tutorial at
+// https://www.tensorflow.org/tutorials/sequences/audio_recognition
+// This module takes 30ms of PCM-encoded signed 16-bit audio samples (at 16KHz,
+// so 480 values), and extracts a power spectrum of frequencies. There are 43
+// frequency bands in the result, derived from the original 256 output from the
+// discrete Fourier transform, and averaged together in groups of 6.
+// It's expected that most platforms will have optimized versions of the
+// functions used here, for example replacing the DFT with an FFT, so this
+// version shouldn't be used where performance is critical.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+
+#include <cmath>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+namespace {
+
+// Performs a discrete Fourier transform on the real inputs. This corresponds to
+// rdft() in the FFT package at http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html,
+// and to kiss_fftr() in KISSFFT at https://github.com/mborgerding/kissfft.
+// It takes in an array of float real values, and returns a result of the same
+// length with float real and imaginary components interleaved, so
+// fourier_output[0] is the first real value, fourier_output[1] is the first
+// imaginary, fourier_output[2] is the second real, and so on.
+// The calling function should ensure that the array passed in as fourier_output
+// is at least time_series_size in length. Most optimized FFT implementations
+// require the length to be a power of two as well, but this version doesn't
+// enforce that.
+void CalculateDiscreteFourierTransform(float* time_series, int time_series_size,
+                                       float* fourier_output) {
+  for (int i = 0; i < time_series_size / 2; ++i) {
+    float real = 0;
+    for (int j = 0; j < time_series_size; ++j) {
+      real += time_series[j] * cos(j * i * M_PI * 2 / time_series_size);
+    }
+    float imaginary = 0;
+    for (int j = 0; j < time_series_size; ++j) {
+      imaginary -= time_series[j] * sin(j * i * M_PI * 2 / time_series_size);
+    }
+    fourier_output[(i * 2) + 0] = real;
+    fourier_output[(i * 2) + 1] = imaginary;
+  }
+}
+
+// Produces a simple sine curve that is used to ensure frequencies at the center
+// of the current sample window are weighted more heavily than those at the end.
+void CalculatePeriodicHann(int window_length, float* window_function) {
+  for (int i = 0; i < window_length; ++i) {
+    window_function[i] = 0.5 - 0.5 * cos((2 * M_PI * i) / window_length);
+  }
+}
+
+}  // namespace
+
+TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
+                        const int16_t* input, int input_size, int output_size,
+                        uint8_t* output) {
+  // Ensure our input and output data arrays are valid.
+  if (input_size > kMaxAudioSampleSize) {
+    error_reporter->Report("Input size %d larger than %d", input_size,
+                           kMaxAudioSampleSize);
+    return kTfLiteError;
+  }
+  if (output_size != kFeatureSliceSize) {
+    error_reporter->Report("Requested output size %d doesn't match %d",
+                           output_size, kFeatureSliceSize);
+    return kTfLiteError;
+  }
+
+  // Pre-calculate the window function we'll be applying to the input data.
+  // In a real application, we'd calculate this table once in an initialization
+  // function and store it for repeated reuse.
+  float window_function[kMaxAudioSampleSize];
+  CalculatePeriodicHann(input_size, window_function);
+
+  // Apply the window function to our time series input, and pad it with zeroes
+  // to the next power of two.
+  float float_input[kMaxAudioSampleSize];
+  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+    if (i < input_size) {
+      float_input[i] =
+          (input[i] * window_function[i]) / static_cast<float>(1 << 15);
+    } else {
+      float_input[i] = 0.0f;
+    }
+  }
+
+  // Pull the frequency data from the time series sample.
+  float fourier_values[kMaxAudioSampleSize];
+  CalculateDiscreteFourierTransform(float_input, kMaxAudioSampleSize,
+                                    fourier_values);
+
+  // We have the complex numbers giving us information about each frequency
+  // band, but all we want to know is how strong each frequency is, so calculate
+  // the squared magnitude by adding together the squares of each component.
+  float power_spectrum[kMaxAudioSampleSize / 2];
+  for (int i = 0; i < (kMaxAudioSampleSize / 2); ++i) {
+    const float real = fourier_values[(i * 2) + 0];
+    const float imaginary = fourier_values[(i * 2) + 1];
+    power_spectrum[i] = (real * real) + (imaginary * imaginary);
+  }
+
+  // Finally, reduce the size of the output by averaging together six adjacent
+  // frequencies into each slot, producing an array of 43 values.
+  for (int i = 0; i < kFeatureSliceSize; ++i) {
+    float total = 0.0f;
+    for (int j = 0; j < kAverageWindowSize; ++j) {
+      const int index = (i * kAverageWindowSize) + j;
+      if (index < (kMaxAudioSampleSize / 2)) {
+        total += power_spectrum[index];
+      }
+    }
+    const float average = total / kAverageWindowSize;
+    // Quantize the result into eight bits, effectively multiplying by two.
+    // The 127.5 constant here has to match the features_max value defined in
+    // tensorflow/examples/speech_commands/input_data.py, and this also assumes
+    // that features_min is zero. It it wasn't, we'd have to subtract it first.
+    int quantized_average = roundf(average * (255.0f / 127.5f));
+    if (quantized_average < 0) {
+      quantized_average = 0;
+    }
+    if (quantized_average > 255) {
+      quantized_average = 255;
+    }
+    output[i] = quantized_average;
+  }
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..adff790d6cc527578dbfb9dc481c99c1021b92db
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Converts audio sample data into a more compact form that's appropriate for
+// feeding into a neural network. There are reference implementations that use
+// both floating point and fixed point available, but because the calculations
+// involved can be time-consuming, it's recommended that you use or write
+// specialized versions for your platform.
+TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
+                        const int16_t* input, int input_size, int output_size,
+                        uint8_t* output);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8b49f67e3d72faa4700c4bdec7f94a5b79cd72e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestPreprocessor) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t yes_calculated_data[g_yes_power_spectrum_data_size];
+  TfLiteStatus yes_status = Preprocess(
+      error_reporter, g_yes_30ms_sample_data, g_yes_30ms_sample_data_size,
+      g_yes_power_spectrum_data_size, yes_calculated_data);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
+
+  for (int i = 0; i < g_yes_power_spectrum_data_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_power_spectrum_data[i],
+                            yes_calculated_data[i]);
+    if (g_yes_power_spectrum_data[i] != yes_calculated_data[i]) {
+      error_reporter->Report("Expected value %d but found %d",
+                             g_yes_power_spectrum_data[i],
+                             yes_calculated_data[i]);
+    }
+  }
+
+  uint8_t no_calculated_data[g_yes_power_spectrum_data_size];
+  TfLiteStatus no_status = Preprocess(
+      error_reporter, g_no_30ms_sample_data, g_no_30ms_sample_data_size,
+      g_no_power_spectrum_data_size, no_calculated_data);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, no_status);
+
+  for (int i = 0; i < g_no_power_spectrum_data_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_power_spectrum_data[i], no_calculated_data[i]);
+    if (g_no_power_spectrum_data[i] != no_calculated_data[i]) {
+      error_reporter->Report("Expected value %d but found %d",
+                             g_no_power_spectrum_data[i],
+                             no_calculated_data[i]);
+    }
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c96a61ab517487413e875dc7369bddb1c9a0d9a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
+
+int32_t TimeInMilliseconds() {
+  static int current_time = 0;
+  current_time += 100;
+  return current_time;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h b/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..162952844a832ebd0b0273d13a929fec6fa22892
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+
+#include <cstdint>
+
+// Returns the time in milliseconds. There's no contract about what time zero
+// represents, the accuracy, or the granularity of the result. Subsequent calls
+// will generally not return a lower value, but even that's not guaranteed if
+// there's an overflow  wraparound.
+// The reference implementation of this function just returns a constantly
+// incrementing value for each call, since it would need a non-portable platform
+// call to access time information. For real applications, you'll need to write
+// your own platform-specific implementation.
+int32_t TimeInMilliseconds();
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0487a12b25fc17208f1d9ab2b51538102f7ec914
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
+
+#include <limits>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestTimer) {
+  // Make sure that the technically-undefined overflow behavior we rely on below
+  // works on this platform. It's still not guaranteed, but at least this is a
+  // sanity check.  Turn off when running with ASan, as it will complain about
+  // the following undefined behavior.
+#ifndef ADDRESS_SANITIZER
+  int32_t overflow_value = std::numeric_limits<int32_t>::max();
+  overflow_value += 1;
+  TF_LITE_MICRO_EXPECT_EQ(std::numeric_limits<int32_t>::min(), overflow_value);
+#endif
+
+  const int32_t first_time = TimeInMilliseconds();
+  const int32_t second_time = TimeInMilliseconds();
+
+  // It's possible that the timer may have wrapped around from +BIG_NUM to
+  // -BIG_NUM between the first and second calls, since we're storing
+  // milliseconds in a 32-bit integer. It's not reasonable that the call itself
+  // would have taken more than 2^31 milliseconds though, so look at the
+  // difference and rely on integer overflow to ensure it's accurate.
+  const int32_t time_delta = (second_time - first_time);
+  TF_LITE_MICRO_EXPECT_LE(0, time_delta);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62e4359859a422c96ec368b6f91cba99e3c4a4eb
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
@@ -0,0 +1,1673 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Automatically created from a TensorFlow Lite flatbuffer using the command:
+// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
+// See the README for a full description of the creation process.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+
+const unsigned char g_tiny_conv_model_data[] = {
+    0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
+    0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
+    0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x4d, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0xf4, 0x47, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
+    0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74,
+    0x65, 0x64, 0x2e, 0x00, 0x09, 0x00, 0x00, 0x00, 0xd4, 0x47, 0x00, 0x00,
+    0xb4, 0x47, 0x00, 0x00, 0xe4, 0x02, 0x00, 0x00, 0xb4, 0x02, 0x00, 0x00,
+    0xac, 0x02, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb8, 0xb3, 0xff, 0xff,
+    0xbc, 0xb3, 0xff, 0xff, 0xc0, 0xb3, 0xff, 0xff, 0x1e, 0xb4, 0xff, 0xff,
+    0x04, 0x00, 0x00, 0x00, 0x80, 0x02, 0x00, 0x00, 0x89, 0xa5, 0xe8, 0xc1,
+    0xb1, 0x89, 0x5b, 0xc6, 0x4f, 0x9b, 0xd3, 0x74, 0x93, 0x88, 0xff, 0xaf,
+    0x89, 0xff, 0xf4, 0x70, 0xcc, 0x75, 0x78, 0xbf, 0x92, 0xcd, 0xa9, 0xa8,
+    0xd6, 0x6a, 0x6f, 0x7b, 0x7f, 0xd8, 0xa8, 0xb1, 0xe6, 0x32, 0x21, 0x70,
+    0xa0, 0x9c, 0x6f, 0xc8, 0xc6, 0x59, 0x67, 0x93, 0x97, 0xca, 0x3f, 0xde,
+    0xcb, 0x74, 0x7c, 0xb5, 0xa4, 0xd9, 0x66, 0xc6, 0x87, 0x98, 0xa5, 0xd0,
+    0xbb, 0xb9, 0xc2, 0xb2, 0xaa, 0x79, 0x25, 0xb9, 0x6d, 0x5a, 0xc8, 0x7f,
+    0x70, 0x85, 0x79, 0xbc, 0x6a, 0x9b, 0xd1, 0x9a, 0x9c, 0x51, 0x53, 0x71,
+    0x89, 0xc0, 0xb4, 0xac, 0xae, 0x47, 0x67, 0x70, 0x79, 0xd2, 0x81, 0xa5,
+    0xd2, 0x09, 0x38, 0x82, 0x74, 0xc9, 0x5d, 0xaf, 0xc1, 0x4f, 0x53, 0x99,
+    0xcb, 0xb7, 0x3a, 0xba, 0xe8, 0x7f, 0x76, 0xb9, 0xb3, 0xd3, 0x60, 0xc0,
+    0x93, 0x9f, 0x87, 0xbd, 0xd0, 0xb8, 0xca, 0xc1, 0xb6, 0x6c, 0x01, 0xc1,
+    0x5c, 0x5d, 0xb2, 0x82, 0x76, 0x77, 0x39, 0xbc, 0x72, 0x6a, 0xc3, 0xb4,
+    0x79, 0x21, 0x48, 0x42, 0x86, 0xa6, 0xbd, 0xaf, 0xae, 0x23, 0x9c, 0x69,
+    0x78, 0xc3, 0x6b, 0xb3, 0xab, 0x43, 0xb2, 0x88, 0x71, 0xc6, 0x6b, 0xbe,
+    0xc3, 0x75, 0xc2, 0xc3, 0xa5, 0xcf, 0x32, 0xbe, 0xcb, 0xb0, 0xb8, 0xc1,
+    0x9c, 0xcf, 0x64, 0xc4, 0xb4, 0x96, 0xa8, 0xb9, 0xcb, 0xc0, 0xc0, 0xb8,
+    0xb8, 0x77, 0x65, 0xc0, 0xc4, 0xb3, 0xc5, 0x77, 0x9b, 0x61, 0xd4, 0xac,
+    0x7e, 0x36, 0xb1, 0xae, 0x36, 0x36, 0xb8, 0x39, 0x6b, 0x70, 0x9c, 0xb5,
+    0x88, 0x5c, 0xb3, 0x6a, 0xad, 0xc5, 0x7b, 0xb4, 0xad, 0xaa, 0xc4, 0x84,
+    0x5e, 0xc4, 0x67, 0xc1, 0xde, 0xba, 0xcf, 0xbd, 0xa0, 0xd3, 0x35, 0xb3,
+    0xe7, 0xc8, 0xb8, 0xb8, 0xaf, 0xb4, 0x59, 0xb8, 0xb4, 0xac, 0xac, 0xaa,
+    0xc7, 0xad, 0xc8, 0xb6, 0xac, 0x99, 0xa0, 0xcb, 0xc1, 0xc8, 0xcb, 0x89,
+    0xc3, 0xac, 0xca, 0x8b, 0x97, 0x1f, 0xbd, 0xbf, 0x13, 0xad, 0xc8, 0x41,
+    0x56, 0x3c, 0x86, 0xb2, 0x61, 0xc4, 0xbb, 0x71, 0xba, 0x92, 0x8d, 0xc3,
+    0x86, 0xcb, 0xc5, 0x8d, 0x88, 0xc8, 0x6a, 0xbf, 0x9c, 0xcd, 0xcd, 0xc0,
+    0x81, 0xb1, 0x47, 0xb5, 0xf0, 0xce, 0xb1, 0xc1, 0xaa, 0xa8, 0x54, 0xcb,
+    0xbc, 0xc7, 0xc5, 0x8e, 0xc3, 0xce, 0xc7, 0xb9, 0xb9, 0xa1, 0xc5, 0xbd,
+    0xb8, 0xb8, 0xb7, 0x81, 0xb6, 0xba, 0xd2, 0x90, 0xbc, 0x96, 0xbe, 0xba,
+    0x53, 0xb5, 0xc7, 0x3c, 0x3c, 0x1f, 0x90, 0xaa, 0x5a, 0xb8, 0xba, 0x7e,
+    0xbc, 0x9e, 0xc2, 0xb1, 0x6e, 0xc0, 0xc4, 0x91, 0xf0, 0xb5, 0x60, 0xad,
+    0x73, 0xba, 0xcd, 0xba, 0x6e, 0x94, 0x39, 0xb5, 0xe4, 0xbe, 0xb4, 0xb5,
+    0xa0, 0xa9, 0x51, 0xac, 0xbc, 0xc2, 0xb3, 0x8a, 0xbd, 0x9a, 0xca, 0xb3,
+    0xbf, 0xaf, 0xb5, 0x9a, 0xb9, 0xc3, 0xb6, 0x92, 0xb5, 0xc1, 0xb0, 0x95,
+    0xd6, 0xcc, 0xbb, 0xbb, 0xa9, 0xb9, 0xac, 0x4a, 0x62, 0x27, 0xa7, 0xa7,
+    0x30, 0xbd, 0xb1, 0x73, 0xa1, 0x74, 0xc2, 0xb7, 0x58, 0xc0, 0xae, 0x8f,
+    0xe1, 0xac, 0x4e, 0xb0, 0x55, 0xc9, 0xc8, 0x9f, 0x83, 0x8e, 0x3e, 0xd5,
+    0xb5, 0xbe, 0xcd, 0xb2, 0xa6, 0xc8, 0x64, 0xac, 0xc0, 0xc8, 0xaf, 0x99,
+    0xc5, 0x9e, 0xb8, 0xbd, 0xa9, 0xc2, 0xb3, 0x81, 0xb4, 0xc2, 0xb4, 0x8f,
+    0xbc, 0xb8, 0x9c, 0x88, 0xbe, 0xc6, 0xbf, 0xba, 0xc8, 0xb4, 0xab, 0x5b,
+    0x92, 0x51, 0xb1, 0x9a, 0x44, 0xb9, 0xab, 0x80, 0xa5, 0x3e, 0xc0, 0xa5,
+    0x5c, 0xb6, 0xa8, 0xa2, 0xb3, 0x9a, 0x6b, 0xb3, 0x34, 0xc6, 0x7e, 0x96,
+    0xcb, 0x88, 0x48, 0xc6, 0xa3, 0xbb, 0xd2, 0xa2, 0xaf, 0xd0, 0x6e, 0xae,
+    0xb4, 0xce, 0xc8, 0x8f, 0xd7, 0xad, 0xc8, 0xb0, 0xae, 0xb7, 0xb2, 0x70,
+    0xb9, 0xad, 0xc1, 0xa0, 0xcb, 0xa2, 0xb0, 0x9b, 0xbe, 0xd3, 0xca, 0xb6,
+    0xbd, 0xaf, 0xa9, 0x82, 0xa1, 0xd7, 0xbc, 0x9b, 0x8b, 0xac, 0xaa, 0xac,
+    0xad, 0x37, 0xb7, 0xb6, 0x46, 0xae, 0xa9, 0xbd, 0x6b, 0x90, 0x5e, 0xcd,
+    0x23, 0xa4, 0x76, 0xa1, 0xc4, 0x96, 0x50, 0xcc, 0x95, 0x99, 0x93, 0xa7,
+    0xb2, 0xe1, 0x7c, 0xbd, 0xbd, 0xb5, 0xbf, 0x9a, 0xca, 0x80, 0xd7, 0xae,
+    0x79, 0xa8, 0xaa, 0xb2, 0xbc, 0x51, 0xda, 0xa3, 0x80, 0x8b, 0xa2, 0xc8,
+    0xd1, 0x94, 0xe1, 0xc4, 0xbd, 0xae, 0xae, 0xcc, 0xb3, 0xca, 0xd5, 0xa1,
+    0xd5, 0xa7, 0xaf, 0xd2, 0xb4, 0x8d, 0xcc, 0xc8, 0x63, 0xa3, 0xa4, 0xdf,
+    0x6f, 0x7e, 0x98, 0xdf, 0x1b, 0x7b, 0x43, 0x99, 0xb0, 0x99, 0x71, 0xdb,
+    0x63, 0x7b, 0x69, 0x9c, 0xba, 0xcd, 0x90, 0xd0, 0xb6, 0xa6, 0x9e, 0x95,
+    0x50, 0xb6, 0xff, 0xff, 0xae, 0xb6, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0xc7, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00,
+    0xda, 0xb6, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0xc0, 0x44, 0x00, 0x00,
+    0x2c, 0x30, 0x38, 0x5a, 0x3d, 0x4c, 0x44, 0x3b, 0x48, 0x48, 0x44, 0x57,
+    0x3f, 0x43, 0x45, 0x3a, 0x24, 0x32, 0x21, 0x5c, 0x3f, 0x3a, 0x38, 0x3a,
+    0x35, 0x35, 0x2f, 0x51, 0x3c, 0x3a, 0x45, 0x3a, 0x3b, 0x41, 0x39, 0x55,
+    0x3c, 0x41, 0x39, 0x44, 0x3a, 0x40, 0x37, 0x48, 0x33, 0x47, 0x36, 0x3e,
+    0x3c, 0x41, 0x3f, 0x3e, 0x3e, 0x47, 0x36, 0x3e, 0x41, 0x33, 0x3e, 0x3b,
+    0x3a, 0x46, 0x45, 0x40, 0x48, 0x3a, 0x35, 0x4b, 0x45, 0x4d, 0x3c, 0x49,
+    0x42, 0x44, 0x3c, 0x4c, 0x3e, 0x3c, 0x44, 0x32, 0x33, 0x41, 0x36, 0x4b,
+    0x38, 0x3b, 0x3c, 0x38, 0x3b, 0x45, 0x34, 0x46, 0x40, 0x4e, 0x44, 0x35,
+    0x43, 0x36, 0x3d, 0x40, 0x3e, 0x48, 0x40, 0x34, 0x3a, 0x46, 0x45, 0x43,
+    0x45, 0x3f, 0x47, 0x37, 0x36, 0x35, 0x44, 0x3a, 0x3e, 0x37, 0x39, 0x40,
+    0x3a, 0x3f, 0x3f, 0x4c, 0x3e, 0x41, 0x43, 0x35, 0x3f, 0x3d, 0x3d, 0x4c,
+    0x3c, 0x4a, 0x46, 0x3c, 0x3a, 0x41, 0x40, 0x4e, 0x36, 0x47, 0x40, 0x3b,
+    0x47, 0x42, 0x38, 0x4d, 0x48, 0x47, 0x3c, 0x3c, 0x33, 0x3b, 0x3e, 0x42,
+    0x3f, 0x3e, 0x3a, 0x3d, 0x32, 0x39, 0x41, 0x46, 0x3a, 0x3a, 0x3e, 0x3e,
+    0x47, 0x48, 0x4e, 0x36, 0x44, 0x40, 0x41, 0x45, 0x3a, 0x3c, 0x38, 0x55,
+    0x2e, 0x26, 0x2f, 0x32, 0x3f, 0x41, 0x3e, 0x4c, 0x45, 0x36, 0x40, 0x31,
+    0x17, 0x2e, 0x14, 0x53, 0x34, 0x30, 0x34, 0x3f, 0x2e, 0x44, 0x2b, 0x4e,
+    0x34, 0x3e, 0x34, 0x43, 0x3d, 0x35, 0x3f, 0x46, 0x39, 0x40, 0x38, 0x3e,
+    0x35, 0x3b, 0x35, 0x45, 0x3d, 0x40, 0x38, 0x37, 0x40, 0x3e, 0x32, 0x3e,
+    0x41, 0x39, 0x30, 0x41, 0x3a, 0x32, 0x3e, 0x3d, 0x39, 0x31, 0x33, 0x3e,
+    0x41, 0x47, 0x40, 0x47, 0x35, 0x33, 0x3c, 0x32, 0x40, 0x3c, 0x42, 0x49,
+    0x34, 0x38, 0x39, 0x37, 0x39, 0x35, 0x40, 0x4d, 0x37, 0x43, 0x42, 0x3e,
+    0x3f, 0x3c, 0x3e, 0x51, 0x36, 0x37, 0x42, 0x41, 0x36, 0x31, 0x43, 0x3d,
+    0x46, 0x43, 0x37, 0x46, 0x32, 0x45, 0x42, 0x36, 0x3f, 0x42, 0x42, 0x41,
+    0x3d, 0x46, 0x39, 0x41, 0x3c, 0x3f, 0x38, 0x3c, 0x43, 0x43, 0x3d, 0x3c,
+    0x3d, 0x41, 0x38, 0x42, 0x3a, 0x3d, 0x43, 0x42, 0x41, 0x40, 0x39, 0x36,
+    0x3a, 0x3c, 0x3c, 0x4f, 0x44, 0x36, 0x39, 0x35, 0x46, 0x46, 0x36, 0x4a,
+    0x3a, 0x42, 0x43, 0x39, 0x3f, 0x3d, 0x3c, 0x47, 0x38, 0x3f, 0x43, 0x40,
+    0x36, 0x3c, 0x45, 0x3b, 0x33, 0x36, 0x3b, 0x39, 0x3c, 0x35, 0x40, 0x38,
+    0x40, 0x3e, 0x3f, 0x48, 0x3f, 0x34, 0x40, 0x53, 0x26, 0x2c, 0x29, 0x39,
+    0x2a, 0x38, 0x3f, 0x45, 0x32, 0x31, 0x4a, 0x37, 0x1c, 0x28, 0x09, 0x43,
+    0x35, 0x3b, 0x33, 0x3c, 0x32, 0x3f, 0x28, 0x41, 0x36, 0x35, 0x3a, 0x37,
+    0x41, 0x39, 0x32, 0x3c, 0x40, 0x3c, 0x3c, 0x32, 0x38, 0x39, 0x37, 0x44,
+    0x3a, 0x33, 0x41, 0x36, 0x37, 0x3c, 0x35, 0x3a, 0x3d, 0x30, 0x3d, 0x41,
+    0x37, 0x3c, 0x45, 0x3a, 0x37, 0x2f, 0x36, 0x3c, 0x3a, 0x3d, 0x39, 0x48,
+    0x46, 0x33, 0x3a, 0x3e, 0x40, 0x3d, 0x3b, 0x52, 0x38, 0x45, 0x34, 0x47,
+    0x39, 0x36, 0x37, 0x56, 0x42, 0x3f, 0x33, 0x36, 0x38, 0x3f, 0x40, 0x53,
+    0x3e, 0x37, 0x3d, 0x3c, 0x48, 0x3a, 0x3d, 0x33, 0x39, 0x40, 0x3e, 0x35,
+    0x3d, 0x46, 0x38, 0x36, 0x37, 0x43, 0x3a, 0x3c, 0x40, 0x38, 0x39, 0x3b,
+    0x39, 0x3a, 0x42, 0x3d, 0x34, 0x3f, 0x35, 0x43, 0x3a, 0x35, 0x46, 0x3a,
+    0x48, 0x38, 0x3b, 0x48, 0x3c, 0x35, 0x42, 0x3d, 0x3a, 0x3d, 0x38, 0x42,
+    0x3e, 0x3c, 0x33, 0x39, 0x34, 0x30, 0x42, 0x44, 0x41, 0x3d, 0x3c, 0x39,
+    0x3c, 0x3a, 0x39, 0x41, 0x3d, 0x44, 0x3c, 0x40, 0x3f, 0x3e, 0x42, 0x3f,
+    0x37, 0x40, 0x39, 0x3b, 0x42, 0x43, 0x49, 0x37, 0x39, 0x46, 0x35, 0x3c,
+    0x3e, 0x39, 0x45, 0x52, 0x24, 0x2d, 0x38, 0x35, 0x3a, 0x3a, 0x3c, 0x44,
+    0x39, 0x32, 0x51, 0x3f, 0x16, 0x34, 0x0a, 0x49, 0x39, 0x38, 0x39, 0x3e,
+    0x2f, 0x36, 0x24, 0x3f, 0x37, 0x34, 0x38, 0x3b, 0x34, 0x34, 0x30, 0x3b,
+    0x3d, 0x36, 0x35, 0x42, 0x33, 0x40, 0x37, 0x35, 0x43, 0x3f, 0x3f, 0x39,
+    0x3a, 0x43, 0x36, 0x3e, 0x39, 0x3d, 0x3f, 0x3d, 0x47, 0x3b, 0x39, 0x37,
+    0x35, 0x42, 0x3f, 0x3b, 0x41, 0x3a, 0x42, 0x4b, 0x3d, 0x3f, 0x3d, 0x3e,
+    0x38, 0x3b, 0x34, 0x4e, 0x3f, 0x39, 0x36, 0x43, 0x39, 0x35, 0x41, 0x4d,
+    0x3c, 0x39, 0x43, 0x33, 0x37, 0x3b, 0x41, 0x48, 0x3c, 0x3f, 0x39, 0x32,
+    0x35, 0x3d, 0x42, 0x35, 0x3d, 0x3e, 0x37, 0x3b, 0x38, 0x3a, 0x44, 0x36,
+    0x42, 0x35, 0x48, 0x40, 0x3a, 0x44, 0x44, 0x39, 0x43, 0x41, 0x3c, 0x37,
+    0x47, 0x3b, 0x42, 0x42, 0x45, 0x3a, 0x40, 0x46, 0x35, 0x3f, 0x3a, 0x48,
+    0x35, 0x44, 0x3f, 0x37, 0x33, 0x3e, 0x45, 0x49, 0x39, 0x43, 0x47, 0x37,
+    0x3f, 0x3f, 0x3b, 0x44, 0x38, 0x3d, 0x39, 0x42, 0x37, 0x3e, 0x40, 0x45,
+    0x3b, 0x3f, 0x40, 0x34, 0x42, 0x3f, 0x43, 0x3c, 0x43, 0x41, 0x38, 0x38,
+    0x38, 0x41, 0x55, 0x33, 0x33, 0x39, 0x39, 0x3c, 0x35, 0x39, 0x38, 0x42,
+    0x27, 0x26, 0x32, 0x41, 0x41, 0x32, 0x3f, 0x47, 0x3a, 0x38, 0x48, 0x37,
+    0x11, 0x27, 0x08, 0x49, 0x35, 0x42, 0x3c, 0x2e, 0x34, 0x43, 0x25, 0x3b,
+    0x3a, 0x33, 0x37, 0x30, 0x3c, 0x36, 0x2d, 0x3c, 0x3b, 0x39, 0x3b, 0x40,
+    0x46, 0x3a, 0x30, 0x42, 0x35, 0x32, 0x36, 0x3a, 0x3a, 0x34, 0x34, 0x33,
+    0x3d, 0x30, 0x3b, 0x42, 0x41, 0x3f, 0x3d, 0x3b, 0x44, 0x3d, 0x41, 0x41,
+    0x3d, 0x3f, 0x40, 0x51, 0x42, 0x42, 0x36, 0x45, 0x30, 0x40, 0x32, 0x4f,
+    0x3a, 0x3c, 0x40, 0x39, 0x3d, 0x3b, 0x3e, 0x4b, 0x3d, 0x37, 0x42, 0x46,
+    0x40, 0x40, 0x47, 0x3d, 0x35, 0x3c, 0x3f, 0x46, 0x37, 0x37, 0x3a, 0x2e,
+    0x3d, 0x3c, 0x3a, 0x46, 0x3a, 0x44, 0x3c, 0x3a, 0x32, 0x44, 0x31, 0x41,
+    0x43, 0x36, 0x49, 0x39, 0x3d, 0x37, 0x3f, 0x41, 0x3b, 0x3b, 0x3c, 0x42,
+    0x3c, 0x34, 0x3f, 0x3b, 0x40, 0x3e, 0x48, 0x47, 0x3e, 0x3c, 0x38, 0x39,
+    0x3f, 0x35, 0x39, 0x3f, 0x3e, 0x3e, 0x3b, 0x43, 0x41, 0x40, 0x43, 0x41,
+    0x3f, 0x37, 0x39, 0x41, 0x46, 0x32, 0x3d, 0x41, 0x36, 0x3f, 0x3e, 0x3f,
+    0x36, 0x48, 0x43, 0x3d, 0x43, 0x3f, 0x34, 0x3d, 0x34, 0x35, 0x4f, 0x32,
+    0x3c, 0x3f, 0x3d, 0x3f, 0x39, 0x3c, 0x3d, 0x47, 0x23, 0x36, 0x33, 0x45,
+    0x37, 0x2e, 0x42, 0x42, 0x39, 0x34, 0x4f, 0x3f, 0x19, 0x2b, 0x01, 0x50,
+    0x35, 0x3f, 0x37, 0x3c, 0x33, 0x35, 0x25, 0x32, 0x38, 0x3e, 0x40, 0x40,
+    0x2f, 0x38, 0x35, 0x3d, 0x31, 0x42, 0x44, 0x3c, 0x3a, 0x3d, 0x2d, 0x3e,
+    0x3b, 0x3e, 0x3d, 0x31, 0x3b, 0x37, 0x35, 0x31, 0x36, 0x35, 0x34, 0x31,
+    0x41, 0x3a, 0x33, 0x32, 0x3c, 0x31, 0x3e, 0x3d, 0x40, 0x3b, 0x34, 0x45,
+    0x36, 0x39, 0x3e, 0x3f, 0x3c, 0x45, 0x37, 0x4b, 0x42, 0x3d, 0x33, 0x43,
+    0x3e, 0x40, 0x35, 0x4e, 0x38, 0x36, 0x3a, 0x33, 0x38, 0x44, 0x3f, 0x3c,
+    0x3f, 0x40, 0x3a, 0x3c, 0x3c, 0x3c, 0x44, 0x29, 0x3a, 0x40, 0x35, 0x3a,
+    0x3d, 0x48, 0x3b, 0x30, 0x45, 0x41, 0x45, 0x40, 0x37, 0x32, 0x3a, 0x35,
+    0x3f, 0x38, 0x3b, 0x43, 0x3b, 0x3f, 0x33, 0x40, 0x3b, 0x40, 0x38, 0x33,
+    0x39, 0x3c, 0x3c, 0x3f, 0x43, 0x33, 0x43, 0x40, 0x43, 0x3d, 0x33, 0x42,
+    0x40, 0x32, 0x3e, 0x36, 0x40, 0x38, 0x43, 0x40, 0x44, 0x38, 0x34, 0x3c,
+    0x3e, 0x39, 0x47, 0x43, 0x40, 0x3b, 0x3f, 0x3f, 0x3c, 0x3b, 0x4b, 0x33,
+    0x36, 0x49, 0x32, 0x41, 0x48, 0x45, 0x57, 0x3a, 0x40, 0x42, 0x40, 0x46,
+    0x36, 0x35, 0x3c, 0x46, 0x22, 0x2e, 0x33, 0x3e, 0x3c, 0x39, 0x44, 0x4d,
+    0x3f, 0x41, 0x51, 0x44, 0x15, 0x2e, 0x02, 0x4e, 0x39, 0x3a, 0x3c, 0x35,
+    0x30, 0x38, 0x1e, 0x31, 0x40, 0x3b, 0x39, 0x3d, 0x3a, 0x37, 0x35, 0x36,
+    0x46, 0x36, 0x3c, 0x3e, 0x39, 0x3e, 0x32, 0x40, 0x3b, 0x35, 0x42, 0x41,
+    0x41, 0x38, 0x41, 0x35, 0x42, 0x36, 0x3c, 0x42, 0x3d, 0x41, 0x35, 0x31,
+    0x3f, 0x44, 0x3e, 0x41, 0x3f, 0x35, 0x42, 0x4b, 0x3e, 0x36, 0x37, 0x34,
+    0x36, 0x3d, 0x40, 0x49, 0x41, 0x3e, 0x3d, 0x3b, 0x38, 0x37, 0x40, 0x47,
+    0x35, 0x32, 0x43, 0x38, 0x36, 0x3b, 0x33, 0x47, 0x33, 0x34, 0x3d, 0x47,
+    0x3c, 0x37, 0x3d, 0x2b, 0x3a, 0x36, 0x3b, 0x3d, 0x43, 0x38, 0x35, 0x32,
+    0x32, 0x37, 0x43, 0x36, 0x3f, 0x48, 0x38, 0x30, 0x3a, 0x3c, 0x42, 0x34,
+    0x37, 0x3c, 0x37, 0x40, 0x48, 0x3e, 0x35, 0x3b, 0x3f, 0x38, 0x39, 0x3e,
+    0x37, 0x35, 0x36, 0x3d, 0x3b, 0x3c, 0x40, 0x3d, 0x34, 0x40, 0x46, 0x42,
+    0x3f, 0x3c, 0x3c, 0x3e, 0x40, 0x40, 0x3d, 0x3f, 0x3f, 0x44, 0x46, 0x41,
+    0x32, 0x43, 0x40, 0x41, 0x3c, 0x42, 0x39, 0x38, 0x48, 0x44, 0x3d, 0x38,
+    0x34, 0x40, 0x4e, 0x31, 0x3c, 0x42, 0x39, 0x48, 0x3c, 0x33, 0x3e, 0x40,
+    0x20, 0x27, 0x39, 0x45, 0x45, 0x36, 0x47, 0x4c, 0x35, 0x3e, 0x4a, 0x36,
+    0x16, 0x2f, 0x04, 0x4f, 0x3a, 0x35, 0x36, 0x3a, 0x2d, 0x36, 0x21, 0x34,
+    0x3b, 0x32, 0x3d, 0x3c, 0x3c, 0x3f, 0x3b, 0x3b, 0x41, 0x46, 0x40, 0x3d,
+    0x3b, 0x44, 0x33, 0x42, 0x34, 0x33, 0x3e, 0x45, 0x3f, 0x46, 0x39, 0x33,
+    0x3b, 0x37, 0x37, 0x37, 0x42, 0x47, 0x3c, 0x35, 0x31, 0x41, 0x44, 0x3a,
+    0x3b, 0x33, 0x39, 0x44, 0x42, 0x33, 0x3d, 0x3f, 0x43, 0x33, 0x41, 0x4a,
+    0x35, 0x46, 0x36, 0x3e, 0x39, 0x41, 0x41, 0x4c, 0x34, 0x3d, 0x38, 0x33,
+    0x3c, 0x3f, 0x43, 0x44, 0x37, 0x35, 0x35, 0x3c, 0x43, 0x34, 0x3e, 0x2d,
+    0x3f, 0x35, 0x38, 0x3c, 0x33, 0x35, 0x43, 0x2a, 0x40, 0x33, 0x34, 0x40,
+    0x3d, 0x38, 0x36, 0x2d, 0x36, 0x3c, 0x43, 0x3d, 0x37, 0x3d, 0x39, 0x38,
+    0x3b, 0x3e, 0x3c, 0x46, 0x35, 0x35, 0x43, 0x44, 0x39, 0x40, 0x34, 0x39,
+    0x3d, 0x34, 0x40, 0x45, 0x38, 0x35, 0x3e, 0x39, 0x3c, 0x44, 0x48, 0x44,
+    0x41, 0x3e, 0x3c, 0x45, 0x3a, 0x3c, 0x3c, 0x46, 0x3a, 0x40, 0x39, 0x43,
+    0x35, 0x35, 0x3e, 0x45, 0x3a, 0x34, 0x3c, 0x39, 0x46, 0x3a, 0x4f, 0x35,
+    0x32, 0x3d, 0x36, 0x41, 0x32, 0x38, 0x3f, 0x45, 0x2d, 0x34, 0x2a, 0x35,
+    0x43, 0x3f, 0x41, 0x49, 0x41, 0x3c, 0x4b, 0x3f, 0x17, 0x31, 0x02, 0x4f,
+    0x30, 0x38, 0x39, 0x40, 0x33, 0x3a, 0x25, 0x38, 0x35, 0x3c, 0x39, 0x35,
+    0x34, 0x41, 0x34, 0x43, 0x40, 0x40, 0x46, 0x3d, 0x40, 0x38, 0x3f, 0x3b,
+    0x35, 0x39, 0x3c, 0x39, 0x34, 0x38, 0x3f, 0x36, 0x3a, 0x38, 0x44, 0x3f,
+    0x3f, 0x38, 0x3c, 0x33, 0x41, 0x42, 0x38, 0x33, 0x3c, 0x3b, 0x3c, 0x46,
+    0x38, 0x3b, 0x3f, 0x33, 0x3f, 0x48, 0x3b, 0x49, 0x3f, 0x3a, 0x3d, 0x3f,
+    0x47, 0x3d, 0x30, 0x45, 0x36, 0x42, 0x3d, 0x36, 0x43, 0x38, 0x3b, 0x3d,
+    0x3c, 0x30, 0x3b, 0x43, 0x3d, 0x41, 0x34, 0x2e, 0x43, 0x3d, 0x43, 0x46,
+    0x43, 0x3c, 0x3c, 0x2e, 0x3c, 0x43, 0x34, 0x43, 0x3e, 0x43, 0x3f, 0x2b,
+    0x45, 0x40, 0x3a, 0x43, 0x36, 0x39, 0x3f, 0x3d, 0x3a, 0x3c, 0x35, 0x3b,
+    0x36, 0x3f, 0x45, 0x3e, 0x45, 0x40, 0x3f, 0x36, 0x45, 0x42, 0x35, 0x3e,
+    0x3a, 0x3a, 0x3f, 0x40, 0x3e, 0x3c, 0x39, 0x46, 0x43, 0x3e, 0x3f, 0x3f,
+    0x40, 0x3c, 0x40, 0x4b, 0x41, 0x35, 0x3b, 0x3e, 0x49, 0x32, 0x3e, 0x41,
+    0x31, 0x37, 0x3d, 0x3b, 0x3f, 0x45, 0x50, 0x3a, 0x3f, 0x3c, 0x44, 0x36,
+    0x43, 0x37, 0x3d, 0x4b, 0x29, 0x39, 0x2f, 0x38, 0x45, 0x36, 0x40, 0x4e,
+    0x39, 0x3f, 0x48, 0x43, 0x23, 0x3c, 0x06, 0x51, 0x37, 0x3b, 0x3e, 0x3b,
+    0x28, 0x45, 0x2b, 0x37, 0x3f, 0x33, 0x3f, 0x41, 0x31, 0x36, 0x33, 0x3a,
+    0x3a, 0x35, 0x3b, 0x33, 0x3e, 0x36, 0x35, 0x40, 0x3a, 0x34, 0x3a, 0x38,
+    0x34, 0x3a, 0x3a, 0x34, 0x42, 0x45, 0x40, 0x3e, 0x40, 0x38, 0x39, 0x34,
+    0x38, 0x37, 0x3f, 0x3e, 0x3c, 0x32, 0x3f, 0x46, 0x3f, 0x44, 0x3b, 0x3e,
+    0x44, 0x45, 0x36, 0x3e, 0x36, 0x3f, 0x3b, 0x40, 0x39, 0x34, 0x38, 0x41,
+    0x42, 0x3e, 0x3d, 0x47, 0x3e, 0x45, 0x33, 0x40, 0x3e, 0x3a, 0x44, 0x3d,
+    0x3c, 0x3a, 0x3a, 0x2c, 0x3a, 0x3d, 0x35, 0x45, 0x3c, 0x41, 0x36, 0x30,
+    0x32, 0x32, 0x3a, 0x3b, 0x35, 0x3c, 0x43, 0x2d, 0x35, 0x3f, 0x41, 0x37,
+    0x3f, 0x46, 0x34, 0x39, 0x3c, 0x43, 0x40, 0x3e, 0x3e, 0x36, 0x3e, 0x3c,
+    0x37, 0x3a, 0x3d, 0x3a, 0x3c, 0x38, 0x44, 0x41, 0x3f, 0x3b, 0x3c, 0x47,
+    0x40, 0x3b, 0x41, 0x47, 0x3e, 0x45, 0x39, 0x3e, 0x37, 0x45, 0x4b, 0x4c,
+    0x37, 0x37, 0x37, 0x3c, 0x3c, 0x3d, 0x40, 0x38, 0x39, 0x3e, 0x43, 0x3f,
+    0x38, 0x45, 0x51, 0x3c, 0x31, 0x34, 0x3b, 0x48, 0x46, 0x41, 0x40, 0x40,
+    0x2c, 0x39, 0x32, 0x42, 0x3c, 0x2e, 0x49, 0x4d, 0x3c, 0x3f, 0x45, 0x38,
+    0x20, 0x38, 0x03, 0x55, 0x33, 0x3e, 0x32, 0x39, 0x32, 0x3b, 0x24, 0x2b,
+    0x42, 0x35, 0x45, 0x32, 0x2e, 0x3b, 0x2f, 0x3f, 0x3c, 0x37, 0x39, 0x3b,
+    0x34, 0x34, 0x3d, 0x36, 0x3d, 0x39, 0x3b, 0x30, 0x3c, 0x3e, 0x40, 0x32,
+    0x3d, 0x3c, 0x3c, 0x3e, 0x33, 0x33, 0x3f, 0x3a, 0x33, 0x3e, 0x46, 0x36,
+    0x3a, 0x3d, 0x40, 0x40, 0x3f, 0x41, 0x3a, 0x42, 0x34, 0x32, 0x34, 0x46,
+    0x3b, 0x31, 0x40, 0x37, 0x37, 0x32, 0x3e, 0x47, 0x3f, 0x3b, 0x3e, 0x43,
+    0x49, 0x45, 0x3a, 0x3d, 0x3e, 0x44, 0x40, 0x31, 0x39, 0x3e, 0x3b, 0x2d,
+    0x3b, 0x3a, 0x33, 0x3d, 0x39, 0x37, 0x3e, 0x32, 0x41, 0x3c, 0x3a, 0x37,
+    0x3b, 0x40, 0x39, 0x2f, 0x3e, 0x3f, 0x47, 0x32, 0x3e, 0x3b, 0x3e, 0x3e,
+    0x40, 0x3e, 0x40, 0x3c, 0x41, 0x39, 0x38, 0x46, 0x45, 0x32, 0x47, 0x31,
+    0x36, 0x47, 0x37, 0x49, 0x3a, 0x3f, 0x47, 0x3a, 0x41, 0x3b, 0x3c, 0x4f,
+    0x3e, 0x36, 0x3b, 0x47, 0x35, 0x39, 0x41, 0x4e, 0x3d, 0x3e, 0x3b, 0x46,
+    0x38, 0x39, 0x3b, 0x45, 0x3e, 0x3f, 0x44, 0x42, 0x44, 0x3f, 0x55, 0x3b,
+    0x41, 0x3d, 0x43, 0x43, 0x37, 0x3f, 0x3d, 0x4c, 0x28, 0x3d, 0x36, 0x3c,
+    0x3e, 0x3e, 0x48, 0x50, 0x3e, 0x39, 0x45, 0x41, 0x22, 0x37, 0x07, 0x4f,
+    0x2e, 0x33, 0x38, 0x3f, 0x31, 0x3a, 0x1b, 0x36, 0x34, 0x38, 0x3c, 0x37,
+    0x37, 0x3e, 0x36, 0x35, 0x36, 0x3b, 0x3d, 0x38, 0x42, 0x48, 0x3d, 0x40,
+    0x40, 0x44, 0x3d, 0x39, 0x37, 0x3b, 0x3d, 0x33, 0x3d, 0x35, 0x42, 0x3c,
+    0x39, 0x3e, 0x43, 0x2d, 0x3c, 0x40, 0x43, 0x43, 0x45, 0x35, 0x3c, 0x44,
+    0x34, 0x3c, 0x3d, 0x31, 0x39, 0x40, 0x39, 0x3d, 0x3e, 0x34, 0x3e, 0x3b,
+    0x40, 0x38, 0x42, 0x4a, 0x40, 0x3b, 0x35, 0x3d, 0x36, 0x38, 0x35, 0x42,
+    0x3c, 0x3c, 0x3d, 0x3b, 0x38, 0x39, 0x45, 0x28, 0x3a, 0x37, 0x37, 0x35,
+    0x3a, 0x3d, 0x35, 0x2a, 0x3c, 0x3f, 0x37, 0x34, 0x37, 0x3f, 0x3e, 0x2b,
+    0x39, 0x43, 0x3b, 0x45, 0x35, 0x36, 0x36, 0x42, 0x33, 0x38, 0x3b, 0x35,
+    0x31, 0x3f, 0x41, 0x41, 0x3c, 0x41, 0x45, 0x42, 0x3b, 0x3c, 0x39, 0x46,
+    0x3c, 0x3e, 0x3a, 0x41, 0x39, 0x3d, 0x41, 0x4b, 0x40, 0x3f, 0x43, 0x3d,
+    0x39, 0x39, 0x44, 0x44, 0x37, 0x42, 0x3f, 0x44, 0x3e, 0x37, 0x42, 0x35,
+    0x44, 0x3f, 0x40, 0x42, 0x3f, 0x3a, 0x47, 0x3d, 0x38, 0x3a, 0x3b, 0x3a,
+    0x42, 0x36, 0x3a, 0x97, 0x32, 0x31, 0x30, 0x36, 0x47, 0x3e, 0x46, 0x51,
+    0x42, 0x34, 0x50, 0x34, 0x26, 0x3b, 0x06, 0x55, 0x3c, 0x3b, 0x2d, 0x3a,
+    0x37, 0x37, 0x1b, 0x32, 0x39, 0x3d, 0x36, 0x40, 0x3b, 0x3f, 0x33, 0x33,
+    0x3d, 0x37, 0x35, 0x37, 0x44, 0x3f, 0x35, 0x39, 0x33, 0x3c, 0x43, 0x39,
+    0x3f, 0x42, 0x3e, 0x34, 0x38, 0x38, 0x39, 0x3c, 0x48, 0x3c, 0x2f, 0x30,
+    0x40, 0x3c, 0x41, 0x3e, 0x3f, 0x3e, 0x36, 0x43, 0x40, 0x3c, 0x36, 0x43,
+    0x43, 0x38, 0x3a, 0x47, 0x3e, 0x37, 0x39, 0x3a, 0x43, 0x45, 0x38, 0x43,
+    0x3b, 0x45, 0x37, 0x44, 0x36, 0x45, 0x3a, 0x3e, 0x3e, 0x3e, 0x3d, 0x33,
+    0x39, 0x36, 0x48, 0x33, 0x30, 0x42, 0x33, 0x39, 0x37, 0x3a, 0x3f, 0x34,
+    0x34, 0x40, 0x40, 0x40, 0x3f, 0x3d, 0x3f, 0x33, 0x41, 0x40, 0x3b, 0x43,
+    0x3b, 0x3a, 0x40, 0x3a, 0x38, 0x3e, 0x38, 0x3b, 0x38, 0x42, 0x40, 0x40,
+    0x41, 0x35, 0x37, 0x38, 0x3b, 0x3c, 0x39, 0x4b, 0x32, 0x39, 0x42, 0x3c,
+    0x36, 0x3d, 0x32, 0x52, 0x3a, 0x31, 0x40, 0x40, 0x3a, 0x43, 0x3d, 0x46,
+    0x3c, 0x3e, 0x3e, 0x33, 0x3f, 0x41, 0x4d, 0x37, 0x39, 0x39, 0x3e, 0x3b,
+    0x40, 0x39, 0x53, 0x2d, 0x46, 0x3c, 0x32, 0x42, 0x3d, 0x40, 0x40, 0x4d,
+    0x2e, 0x34, 0x39, 0x3b, 0x46, 0x3b, 0x42, 0x4f, 0x3d, 0x39, 0x4e, 0x36,
+    0x1a, 0x31, 0x0e, 0x56, 0x36, 0x42, 0x38, 0x44, 0x36, 0x3a, 0x20, 0x30,
+    0x36, 0x34, 0x37, 0x38, 0x40, 0x41, 0x2a, 0x35, 0x3b, 0x3b, 0x3a, 0x38,
+    0x33, 0x39, 0x36, 0x41, 0x43, 0x39, 0x35, 0x3d, 0x37, 0x3d, 0x33, 0x31,
+    0x45, 0x33, 0x3f, 0x3b, 0x44, 0x38, 0x39, 0x34, 0x38, 0x39, 0x38, 0x3d,
+    0x3a, 0x3a, 0x41, 0x40, 0x44, 0x3e, 0x3f, 0x45, 0x34, 0x31, 0x34, 0x43,
+    0x3b, 0x34, 0x42, 0x3c, 0x3c, 0x43, 0x35, 0x45, 0x36, 0x38, 0x3d, 0x3c,
+    0x3f, 0x3d, 0x3e, 0x45, 0x41, 0x43, 0x35, 0x3f, 0x40, 0x3f, 0x3a, 0x34,
+    0x3d, 0x32, 0x41, 0x3d, 0x48, 0x42, 0x37, 0x2a, 0x3c, 0x3a, 0x3e, 0x49,
+    0x38, 0x36, 0x38, 0x2e, 0x36, 0x37, 0x34, 0x3e, 0x3c, 0x43, 0x43, 0x39,
+    0x39, 0x3b, 0x44, 0x46, 0x44, 0x43, 0x37, 0x46, 0x43, 0x34, 0x3b, 0x35,
+    0x42, 0x41, 0x3f, 0x3d, 0x3d, 0x3a, 0x42, 0x3e, 0x38, 0x47, 0x3d, 0x49,
+    0x45, 0x49, 0x3a, 0x3c, 0x3e, 0x37, 0x40, 0x46, 0x41, 0x33, 0x45, 0x36,
+    0x37, 0x44, 0x49, 0x3b, 0x44, 0x40, 0x33, 0x46, 0x37, 0x39, 0x4e, 0x3a,
+    0x43, 0x38, 0x3a, 0x42, 0x3a, 0x3d, 0x45, 0x50, 0x26, 0x34, 0x3b, 0x3c,
+    0x46, 0x46, 0x4c, 0x54, 0x3f, 0x35, 0x4e, 0x47, 0x21, 0x39, 0x0e, 0x54,
+    0x3a, 0x3a, 0x2f, 0x40, 0x2d, 0x3a, 0x1f, 0x31, 0x31, 0x42, 0x34, 0x45,
+    0x37, 0x36, 0x30, 0x3b, 0x3a, 0x3a, 0x36, 0x40, 0x32, 0x36, 0x3c, 0x3c,
+    0x37, 0x42, 0x35, 0x3e, 0x39, 0x47, 0x36, 0x32, 0x41, 0x30, 0x42, 0x39,
+    0x39, 0x44, 0x37, 0x30, 0x41, 0x3b, 0x3d, 0x3d, 0x43, 0x3b, 0x38, 0x45,
+    0x3b, 0x3a, 0x39, 0x3a, 0x31, 0x33, 0x43, 0x46, 0x3f, 0x41, 0x44, 0x3f,
+    0x3b, 0x44, 0x3a, 0x4c, 0x33, 0x33, 0x33, 0x3e, 0x37, 0x3e, 0x45, 0x45,
+    0x36, 0x42, 0x3e, 0x43, 0x40, 0x34, 0x36, 0x31, 0x38, 0x34, 0x41, 0x3b,
+    0x32, 0x38, 0x3e, 0x29, 0x47, 0x33, 0x37, 0x45, 0x3c, 0x3d, 0x43, 0x2c,
+    0x36, 0x3a, 0x3c, 0x40, 0x3d, 0x46, 0x3c, 0x37, 0x40, 0x44, 0x37, 0x38,
+    0x3e, 0x41, 0x3c, 0x40, 0x33, 0x3f, 0x44, 0x32, 0x44, 0x3a, 0x43, 0x42,
+    0x3e, 0x38, 0x44, 0x3b, 0x41, 0x48, 0x3f, 0x4e, 0x3f, 0x44, 0x35, 0x45,
+    0x34, 0x3f, 0x42, 0x4b, 0x37, 0x37, 0x3e, 0x45, 0x46, 0x45, 0x46, 0x3d,
+    0x3e, 0x39, 0x3b, 0x3a, 0x46, 0x3a, 0x56, 0x35, 0x46, 0x3d, 0x40, 0x3b,
+    0x36, 0x39, 0x3f, 0x54, 0x27, 0x2b, 0x34, 0x3c, 0x48, 0x3d, 0x49, 0x4c,
+    0x3e, 0x3d, 0x4e, 0x42, 0x25, 0x3b, 0x10, 0x4d, 0x30, 0x36, 0x3e, 0x36,
+    0x2e, 0x31, 0x1d, 0x37, 0x3a, 0x39, 0x33, 0x3f, 0x39, 0x38, 0x2e, 0x36,
+    0x44, 0x3e, 0x41, 0x37, 0x3b, 0x30, 0x3b, 0x48, 0x31, 0x39, 0x41, 0x3e,
+    0x37, 0x37, 0x34, 0x2f, 0x35, 0x3b, 0x3a, 0x3e, 0x45, 0x3e, 0x3f, 0x35,
+    0x39, 0x39, 0x3b, 0x44, 0x43, 0x3c, 0x3e, 0x46, 0x40, 0x3a, 0x36, 0x45,
+    0x41, 0x40, 0x36, 0x44, 0x3a, 0x37, 0x47, 0x47, 0x3d, 0x36, 0x43, 0x4e,
+    0x3b, 0x38, 0x40, 0x48, 0x44, 0x43, 0x45, 0x3f, 0x43, 0x3c, 0x3b, 0x37,
+    0x43, 0x41, 0x39, 0x2f, 0x3d, 0x45, 0x3e, 0x3e, 0x42, 0x40, 0x41, 0x2f,
+    0x47, 0x38, 0x3a, 0x48, 0x3e, 0x35, 0x37, 0x2a, 0x34, 0x38, 0x41, 0x3b,
+    0x3d, 0x37, 0x3b, 0x35, 0x38, 0x3e, 0x41, 0x3c, 0x41, 0x43, 0x3d, 0x46,
+    0x47, 0x47, 0x3d, 0x35, 0x48, 0x41, 0x3d, 0x3e, 0x34, 0x47, 0x38, 0x38,
+    0x39, 0x3e, 0x38, 0x4d, 0x43, 0x36, 0x42, 0x40, 0x3e, 0x41, 0x3f, 0x4c,
+    0x3e, 0x3e, 0x37, 0x44, 0x3e, 0x3b, 0x47, 0x3e, 0x3f, 0x3b, 0x39, 0x3c,
+    0x3c, 0x3c, 0x53, 0x3b, 0x3b, 0x32, 0x3e, 0x3f, 0x32, 0x3c, 0x37, 0x4b,
+    0x33, 0x30, 0x2f, 0x41, 0x47, 0x42, 0x49, 0x4f, 0x3b, 0x42, 0x4c, 0x44,
+    0x1f, 0x37, 0x16, 0x4e, 0x3b, 0x3f, 0x30, 0x36, 0x35, 0x38, 0x26, 0x36,
+    0x32, 0x3b, 0x38, 0x3c, 0x30, 0x3e, 0x34, 0x3e, 0x3d, 0x34, 0x39, 0x3c,
+    0x36, 0x47, 0x34, 0x41, 0x31, 0x39, 0x44, 0x3e, 0x39, 0x41, 0x32, 0x36,
+    0x3b, 0x3f, 0x32, 0x3d, 0x36, 0x3e, 0x40, 0x3d, 0x45, 0x32, 0x45, 0x42,
+    0x38, 0x43, 0x40, 0x42, 0x34, 0x3a, 0x43, 0x38, 0x47, 0x3f, 0x41, 0x47,
+    0x34, 0x44, 0x41, 0x39, 0x3c, 0x46, 0x36, 0x4f, 0x41, 0x3e, 0x38, 0x38,
+    0x3a, 0x3b, 0x43, 0x44, 0x37, 0x3f, 0x35, 0x43, 0x34, 0x3d, 0x40, 0x32,
+    0x3a, 0x3b, 0x3d, 0x34, 0x35, 0x43, 0x31, 0x2c, 0x3b, 0x36, 0x38, 0x41,
+    0x3c, 0x38, 0x3d, 0x31, 0x45, 0x46, 0x42, 0x41, 0x33, 0x3f, 0x3f, 0x3a,
+    0x36, 0x3f, 0x3c, 0x3c, 0x3c, 0x3e, 0x39, 0x3e, 0x40, 0x37, 0x47, 0x3e,
+    0x35, 0x39, 0x3d, 0x3d, 0x37, 0x36, 0x3e, 0x45, 0x38, 0x3d, 0x45, 0x43,
+    0x3a, 0x32, 0x3b, 0x3a, 0x32, 0x3c, 0x3d, 0x43, 0x3d, 0x33, 0x3b, 0x3d,
+    0x46, 0x3a, 0x44, 0x45, 0x3b, 0x3e, 0x3c, 0x42, 0x37, 0x37, 0x52, 0x2a,
+    0x3a, 0x35, 0x35, 0x3f, 0x40, 0x38, 0x40, 0x5b, 0x35, 0x32, 0x2b, 0x3d,
+    0x4a, 0x3c, 0x46, 0x56, 0x44, 0x30, 0x4d, 0x39, 0x20, 0x32, 0x0f, 0x4f,
+    0x33, 0x3c, 0x35, 0x35, 0x3a, 0x45, 0x29, 0x3b, 0x31, 0x38, 0x34, 0x38,
+    0x42, 0x45, 0x37, 0x3e, 0x37, 0x2e, 0x36, 0x43, 0x3f, 0x38, 0x2f, 0x41,
+    0x3f, 0x41, 0x3c, 0x31, 0x37, 0x36, 0x37, 0x39, 0x41, 0x3a, 0x3a, 0x40,
+    0x3e, 0x47, 0x3d, 0x37, 0x3c, 0x38, 0x35, 0x39, 0x3a, 0x43, 0x3f, 0x42,
+    0x42, 0x38, 0x3e, 0x40, 0x3c, 0x3a, 0x45, 0x48, 0x37, 0x3a, 0x3e, 0x35,
+    0x3a, 0x3d, 0x45, 0x4a, 0x3d, 0x37, 0x38, 0x3a, 0x3d, 0x46, 0x46, 0x41,
+    0x37, 0x41, 0x40, 0x48, 0x37, 0x34, 0x3b, 0x2c, 0x39, 0x34, 0x37, 0x35,
+    0x3a, 0x43, 0x39, 0x2e, 0x39, 0x3f, 0x40, 0x3e, 0x40, 0x40, 0x3c, 0x2d,
+    0x3e, 0x3c, 0x37, 0x39, 0x3c, 0x3b, 0x3d, 0x3f, 0x41, 0x48, 0x3b, 0x3d,
+    0x3b, 0x41, 0x45, 0x3e, 0x3a, 0x38, 0x3f, 0x3c, 0x3d, 0x3e, 0x40, 0x42,
+    0x46, 0x38, 0x43, 0x34, 0x35, 0x47, 0x3d, 0x46, 0x3f, 0x3e, 0x32, 0x3f,
+    0x3e, 0x3d, 0x47, 0x46, 0x38, 0x41, 0x45, 0x3f, 0x34, 0x3f, 0x41, 0x43,
+    0x3e, 0x3e, 0x44, 0x3b, 0x3b, 0x36, 0x51, 0x32, 0x37, 0x3c, 0x42, 0x43,
+    0x33, 0x39, 0x42, 0x61, 0x2c, 0x3b, 0x2e, 0x39, 0x42, 0x39, 0x42, 0x54,
+    0x3c, 0x3a, 0x48, 0x35, 0x26, 0x34, 0x15, 0x51, 0x35, 0x40, 0x36, 0x3c,
+    0x2d, 0x37, 0x25, 0x38, 0x33, 0x3d, 0x3d, 0x39, 0x3e, 0x3b, 0x2e, 0x4b,
+    0x3d, 0x3b, 0x42, 0x37, 0x37, 0x40, 0x37, 0x40, 0x35, 0x45, 0x37, 0x37,
+    0x3f, 0x41, 0x36, 0x39, 0x3c, 0x32, 0x3e, 0x38, 0x41, 0x40, 0x3e, 0x3f,
+    0x3b, 0x3c, 0x43, 0x35, 0x3e, 0x3d, 0x44, 0x44, 0x3a, 0x36, 0x39, 0x3f,
+    0x3a, 0x31, 0x42, 0x4d, 0x40, 0x33, 0x40, 0x45, 0x44, 0x3d, 0x40, 0x49,
+    0x41, 0x3f, 0x42, 0x3a, 0x34, 0x46, 0x38, 0x46, 0x42, 0x34, 0x3a, 0x40,
+    0x40, 0x41, 0x3d, 0x32, 0x35, 0x48, 0x35, 0x3e, 0x44, 0x41, 0x40, 0x2c,
+    0x46, 0x38, 0x38, 0x3f, 0x36, 0x40, 0x38, 0x2a, 0x43, 0x41, 0x3e, 0x35,
+    0x46, 0x3a, 0x45, 0x46, 0x46, 0x42, 0x3a, 0x3b, 0x40, 0x38, 0x35, 0x43,
+    0x38, 0x3d, 0x3b, 0x41, 0x36, 0x44, 0x3f, 0x3f, 0x34, 0x3e, 0x3c, 0x3d,
+    0x49, 0x36, 0x37, 0x4b, 0x38, 0x3c, 0x43, 0x37, 0x3a, 0x3f, 0x31, 0x45,
+    0x3b, 0x39, 0x3f, 0x40, 0x37, 0x3c, 0x42, 0x3f, 0x3c, 0x33, 0x40, 0x3b,
+    0x32, 0x3c, 0x52, 0x31, 0x3d, 0x44, 0x3b, 0x31, 0x46, 0x38, 0x40, 0x60,
+    0x2b, 0x3c, 0x37, 0x34, 0x43, 0x38, 0x45, 0x57, 0x37, 0x39, 0x49, 0x33,
+    0x2d, 0x3f, 0x18, 0x4e, 0x39, 0x39, 0x32, 0x3b, 0x34, 0x3b, 0x2c, 0x45,
+    0x33, 0x37, 0x45, 0x42, 0x3d, 0x37, 0x2a, 0x4c, 0x3d, 0x3f, 0x3c, 0x36,
+    0x37, 0x3c, 0x39, 0x47, 0x3d, 0x44, 0x3d, 0x40, 0x3d, 0x41, 0x34, 0x3e,
+    0x40, 0x34, 0x3b, 0x3a, 0x41, 0x36, 0x37, 0x40, 0x3e, 0x3f, 0x3a, 0x36,
+    0x3e, 0x35, 0x3b, 0x48, 0x41, 0x40, 0x3c, 0x42, 0x34, 0x41, 0x3f, 0x44,
+    0x34, 0x39, 0x33, 0x39, 0x39, 0x47, 0x40, 0x48, 0x38, 0x3a, 0x43, 0x43,
+    0x48, 0x3a, 0x3f, 0x46, 0x35, 0x3a, 0x33, 0x36, 0x32, 0x3c, 0x40, 0x34,
+    0x40, 0x3a, 0x42, 0x3a, 0x39, 0x38, 0x41, 0x35, 0x3a, 0x3f, 0x35, 0x40,
+    0x3f, 0x39, 0x39, 0x36, 0x38, 0x40, 0x3e, 0x3e, 0x3a, 0x31, 0x32, 0x44,
+    0x40, 0x47, 0x3a, 0x3c, 0x43, 0x43, 0x46, 0x48, 0x40, 0x35, 0x3d, 0x37,
+    0x44, 0x37, 0x33, 0x44, 0x3b, 0x3e, 0x3f, 0x37, 0x36, 0x3a, 0x38, 0x47,
+    0x3a, 0x44, 0x36, 0x42, 0x3e, 0x44, 0x34, 0x46, 0x33, 0x43, 0x44, 0x3e,
+    0x30, 0x48, 0x37, 0x38, 0x33, 0x3c, 0x46, 0x42, 0x38, 0x3d, 0x50, 0x39,
+    0x33, 0x38, 0x3e, 0x40, 0x3b, 0x2b, 0x3b, 0x5f, 0x2b, 0x32, 0x2f, 0x37,
+    0x3f, 0x3a, 0x40, 0x4e, 0x34, 0x38, 0x47, 0x37, 0x27, 0x2b, 0x1b, 0x4f,
+    0x36, 0x38, 0x3a, 0x3a, 0x3b, 0x38, 0x2e, 0x3f, 0x3f, 0x42, 0x42, 0x42,
+    0x36, 0x3e, 0x3c, 0x55, 0x39, 0x40, 0x44, 0x43, 0x3e, 0x33, 0x3c, 0x43,
+    0x38, 0x44, 0x3b, 0x46, 0x3f, 0x45, 0x34, 0x38, 0x3c, 0x41, 0x42, 0x3d,
+    0x42, 0x36, 0x43, 0x3f, 0x3c, 0x39, 0x3e, 0x39, 0x39, 0x42, 0x33, 0x47,
+    0x36, 0x3d, 0x3f, 0x3b, 0x40, 0x39, 0x3b, 0x49, 0x36, 0x40, 0x3d, 0x41,
+    0x40, 0x34, 0x3b, 0x4e, 0x3b, 0x36, 0x3b, 0x45, 0x40, 0x32, 0x3b, 0x49,
+    0x37, 0x38, 0x3a, 0x47, 0x37, 0x40, 0x3e, 0x38, 0x40, 0x3f, 0x3c, 0x3a,
+    0x47, 0x41, 0x42, 0x30, 0x40, 0x3c, 0x42, 0x3f, 0x31, 0x44, 0x39, 0x38,
+    0x3b, 0x38, 0x42, 0x43, 0x41, 0x35, 0x3a, 0x39, 0x3e, 0x38, 0x39, 0x3e,
+    0x3c, 0x42, 0x3d, 0x49, 0x47, 0x3c, 0x3f, 0x35, 0x41, 0x3a, 0x36, 0x43,
+    0x43, 0x3b, 0x39, 0x3b, 0x36, 0x43, 0x43, 0x4e, 0x3e, 0x35, 0x37, 0x3b,
+    0x3f, 0x37, 0x41, 0x48, 0x32, 0x44, 0x43, 0x32, 0x38, 0x39, 0x45, 0x39,
+    0x3e, 0x3d, 0x35, 0x39, 0x35, 0x39, 0x50, 0x37, 0x39, 0x40, 0x43, 0x47,
+    0x32, 0x2a, 0x40, 0x62, 0x24, 0x30, 0x36, 0x3e, 0x41, 0x32, 0x47, 0x58,
+    0x39, 0x36, 0x44, 0x34, 0x26, 0x34, 0x1e, 0x50, 0x3c, 0x3b, 0x3f, 0x42,
+    0x35, 0x3d, 0x2a, 0x4e, 0x40, 0x38, 0x36, 0x31, 0x3a, 0x30, 0x37, 0x4b,
+    0x3c, 0x3b, 0x3b, 0x41, 0x3b, 0x3c, 0x2e, 0x45, 0x44, 0x3f, 0x3b, 0x35,
+    0x3e, 0x33, 0x37, 0x3d, 0x40, 0x39, 0x39, 0x37, 0x40, 0x3e, 0x3a, 0x3e,
+    0x3c, 0x3c, 0x45, 0x40, 0x3c, 0x3f, 0x3a, 0x51, 0x47, 0x3a, 0x34, 0x39,
+    0x3b, 0x34, 0x44, 0x4c, 0x36, 0x3d, 0x3a, 0x35, 0x34, 0x36, 0x38, 0x4b,
+    0x3f, 0x40, 0x3f, 0x3e, 0x40, 0x41, 0x47, 0x43, 0x32, 0x38, 0x46, 0x44,
+    0x46, 0x43, 0x43, 0x37, 0x39, 0x49, 0x37, 0x36, 0x3e, 0x3d, 0x37, 0x3c,
+    0x39, 0x37, 0x34, 0x43, 0x45, 0x32, 0x3a, 0x3a, 0x38, 0x43, 0x3b, 0x40,
+    0x3b, 0x3f, 0x3d, 0x41, 0x40, 0x3d, 0x3a, 0x3b, 0x48, 0x37, 0x3d, 0x41,
+    0x40, 0x3e, 0x38, 0x41, 0x3d, 0x3a, 0x38, 0x49, 0x40, 0x3c, 0x42, 0x41,
+    0x3a, 0x38, 0x38, 0x4c, 0x3e, 0x41, 0x40, 0x3b, 0x3d, 0x3e, 0x3c, 0x46,
+    0x3e, 0x42, 0x41, 0x38, 0x42, 0x42, 0x41, 0x3e, 0x3e, 0x37, 0x3c, 0x43,
+    0x43, 0x3b, 0x54, 0x2b, 0x45, 0x3b, 0x43, 0x41, 0x41, 0x26, 0x3f, 0x60,
+    0x25, 0x2b, 0x2e, 0x3a, 0x40, 0x31, 0x40, 0x49, 0x40, 0x31, 0x46, 0x3c,
+    0x1e, 0x2a, 0x1a, 0x47, 0x33, 0x37, 0x37, 0x34, 0x31, 0x36, 0x25, 0x41,
+    0x2e, 0x36, 0x35, 0x33, 0x33, 0x34, 0x31, 0x45, 0x3a, 0x3f, 0x3d, 0x40,
+    0x3c, 0x41, 0x30, 0x3c, 0x3f, 0x46, 0x37, 0x3c, 0x3a, 0x3c, 0x36, 0x3a,
+    0x47, 0x3d, 0x31, 0x3f, 0x40, 0x3e, 0x36, 0x44, 0x41, 0x3d, 0x36, 0x3f,
+    0x37, 0x3f, 0x34, 0x4b, 0x31, 0x47, 0x43, 0x3e, 0x3e, 0x3a, 0x3b, 0x4b,
+    0x37, 0x32, 0x38, 0x3d, 0x37, 0x47, 0x46, 0x4d, 0x36, 0x3c, 0x3f, 0x3a,
+    0x41, 0x31, 0x47, 0x43, 0x3d, 0x3d, 0x3e, 0x35, 0x3d, 0x46, 0x49, 0x2a,
+    0x37, 0x3c, 0x39, 0x3d, 0x47, 0x3c, 0x34, 0x2c, 0x3e, 0x38, 0x47, 0x32,
+    0x36, 0x36, 0x41, 0x38, 0x35, 0x44, 0x48, 0x3b, 0x39, 0x3e, 0x38, 0x3e,
+    0x40, 0x36, 0x37, 0x46, 0x39, 0x3b, 0x34, 0x45, 0x40, 0x3b, 0x48, 0x36,
+    0x34, 0x44, 0x37, 0x46, 0x3f, 0x42, 0x33, 0x36, 0x43, 0x3c, 0x41, 0x46,
+    0x31, 0x42, 0x43, 0x44, 0x44, 0x3e, 0x42, 0x3b, 0x3b, 0x3a, 0x3c, 0x37,
+    0x42, 0x41, 0x46, 0x38, 0x41, 0x3b, 0x40, 0x44, 0x37, 0x3c, 0x4c, 0x2e,
+    0x3a, 0x3e, 0x3b, 0x36, 0x33, 0x27, 0x37, 0x5d, 0x27, 0x34, 0x32, 0x41,
+    0x41, 0x3f, 0x40, 0x5d, 0x40, 0x3d, 0x48, 0x39, 0x2e, 0x30, 0x1f, 0x3f,
+    0x38, 0x3f, 0x40, 0x33, 0x40, 0x38, 0x31, 0x3f, 0x42, 0x3e, 0x3b, 0x3a,
+    0x42, 0x36, 0x3a, 0x42, 0x3c, 0x3b, 0x3d, 0x41, 0x3d, 0x40, 0x40, 0x3e,
+    0x36, 0x41, 0x47, 0x3d, 0x33, 0x32, 0x33, 0x44, 0x3e, 0x3a, 0x3e, 0x3d,
+    0x45, 0x3f, 0x38, 0x3f, 0x40, 0x3a, 0x3c, 0x46, 0x32, 0x42, 0x3c, 0x51,
+    0x33, 0x38, 0x3a, 0x38, 0x41, 0x34, 0x45, 0x4e, 0x35, 0x3c, 0x42, 0x3e,
+    0x3f, 0x45, 0x44, 0x4e, 0x39, 0x47, 0x3a, 0x33, 0x3e, 0x3b, 0x45, 0x42,
+    0x37, 0x3a, 0x3e, 0x33, 0x41, 0x48, 0x32, 0x2a, 0x3b, 0x37, 0x3f, 0x3d,
+    0x3a, 0x42, 0x41, 0x2f, 0x34, 0x3e, 0x49, 0x3b, 0x38, 0x3e, 0x3d, 0x3a,
+    0x37, 0x3c, 0x44, 0x41, 0x39, 0x42, 0x3f, 0x39, 0x40, 0x35, 0x3d, 0x41,
+    0x3b, 0x45, 0x44, 0x48, 0x3d, 0x42, 0x36, 0x33, 0x3e, 0x44, 0x3f, 0x41,
+    0x42, 0x40, 0x49, 0x34, 0x48, 0x41, 0x3f, 0x40, 0x3c, 0x45, 0x47, 0x34,
+    0x41, 0x37, 0x47, 0x3e, 0x41, 0x41, 0x39, 0x42, 0x3f, 0x3a, 0x46, 0x33,
+    0x39, 0x41, 0x38, 0x38, 0x3e, 0x42, 0x41, 0x38, 0x35, 0x32, 0x33, 0x38,
+    0x3a, 0x3f, 0x45, 0x66, 0x33, 0x47, 0x38, 0x3c, 0x41, 0x2f, 0x48, 0x55,
+    0x33, 0x3e, 0x49, 0x3b, 0x3c, 0x30, 0x24, 0x45, 0x3c, 0x44, 0x43, 0x32,
+    0x3d, 0x3f, 0x35, 0x3b, 0x3e, 0x36, 0x38, 0x3a, 0x36, 0x37, 0x3b, 0x41,
+    0x38, 0x42, 0x3e, 0x43, 0x39, 0x3f, 0x3c, 0x40, 0x37, 0x43, 0x3e, 0x3b,
+    0x3d, 0x35, 0x35, 0x3d, 0x43, 0x3f, 0x3a, 0x35, 0x37, 0x3c, 0x31, 0x47,
+    0x44, 0x45, 0x40, 0x32, 0x44, 0x36, 0x38, 0x51, 0x3c, 0x41, 0x45, 0x37,
+    0x39, 0x44, 0x3e, 0x4f, 0x3c, 0x3a, 0x38, 0x40, 0x3f, 0x34, 0x39, 0x4e,
+    0x3d, 0x39, 0x45, 0x3f, 0x3e, 0x3c, 0x3b, 0x42, 0x3b, 0x3b, 0x34, 0x3d,
+    0x41, 0x44, 0x39, 0x2e, 0x37, 0x44, 0x45, 0x37, 0x3d, 0x41, 0x3f, 0x33,
+    0x3f, 0x3e, 0x3e, 0x40, 0x44, 0x3f, 0x37, 0x32, 0x35, 0x3e, 0x43, 0x41,
+    0x39, 0x37, 0x35, 0x3f, 0x48, 0x3d, 0x43, 0x49, 0x38, 0x35, 0x3f, 0x48,
+    0x3b, 0x3a, 0x34, 0x3f, 0x3c, 0x44, 0x3a, 0x40, 0x36, 0x35, 0x44, 0x36,
+    0x44, 0x3b, 0x3d, 0x38, 0x3c, 0x44, 0x47, 0x3a, 0x3b, 0x45, 0x41, 0x3a,
+    0x39, 0x35, 0x44, 0x3a, 0x49, 0x36, 0x48, 0x31, 0x42, 0x43, 0x42, 0x34,
+    0x41, 0x40, 0x4d, 0x36, 0x3e, 0x35, 0x39, 0x3b, 0x3f, 0x41, 0x38, 0x39,
+    0x3c, 0x44, 0x3f, 0x39, 0x3a, 0x36, 0x3d, 0x36, 0x3a, 0x3a, 0x34, 0x3b,
+    0x38, 0x2f, 0x40, 0x34, 0x32, 0x4d, 0x43, 0x45, 0x4e, 0x3f, 0x48, 0x35,
+    0x3b, 0x4d, 0x4f, 0x39, 0x42, 0x36, 0x46, 0x36, 0x4a, 0x3c, 0x37, 0x41,
+    0x40, 0x43, 0x50, 0x36, 0x3e, 0x39, 0x44, 0x40, 0x36, 0x47, 0x3f, 0x36,
+    0x45, 0x40, 0x45, 0x41, 0x3b, 0x37, 0x41, 0x39, 0x3b, 0x48, 0x37, 0x34,
+    0x41, 0x45, 0x49, 0x3f, 0x39, 0x49, 0x3f, 0x3a, 0x42, 0x34, 0x38, 0x37,
+    0x44, 0x34, 0x3c, 0x3d, 0x40, 0x47, 0x3a, 0x36, 0x3f, 0x3c, 0x41, 0x3e,
+    0x47, 0x46, 0x46, 0x43, 0x3f, 0x38, 0x3b, 0x40, 0x3f, 0x48, 0x3b, 0x4c,
+    0x3d, 0x4b, 0x34, 0x3b, 0x44, 0x43, 0x3c, 0x49, 0x38, 0x42, 0x41, 0x36,
+    0x33, 0x36, 0x40, 0x46, 0x40, 0x3a, 0x42, 0x3c, 0x3d, 0x35, 0x3c, 0x52,
+    0x3e, 0x40, 0x43, 0x43, 0x41, 0x3b, 0x3e, 0x44, 0x3f, 0x40, 0x40, 0x43,
+    0x3d, 0x3f, 0x36, 0x42, 0x3f, 0x3c, 0x34, 0x3d, 0x33, 0x41, 0x3c, 0x39,
+    0x34, 0x43, 0x3f, 0x34, 0x3c, 0x3a, 0x3a, 0x37, 0x42, 0x41, 0x40, 0x3e,
+    0x3d, 0x3c, 0x41, 0x3c, 0x38, 0x33, 0x49, 0x46, 0x40, 0x40, 0x3a, 0x46,
+    0x38, 0x3c, 0x37, 0x34, 0x3e, 0x3d, 0x32, 0x38, 0x3c, 0x4c, 0x3a, 0x34,
+    0x35, 0x32, 0x39, 0x40, 0x3a, 0x58, 0x40, 0x46, 0x42, 0x33, 0x45, 0x39,
+    0x34, 0x4f, 0x53, 0x45, 0x43, 0x3e, 0x41, 0x36, 0x3e, 0x3f, 0x40, 0x47,
+    0x4e, 0x3d, 0x53, 0x2b, 0x41, 0x36, 0x3e, 0x38, 0x47, 0x41, 0x3f, 0x34,
+    0x47, 0x40, 0x38, 0x39, 0x3d, 0x42, 0x3f, 0x3c, 0x48, 0x3a, 0x35, 0x3c,
+    0x45, 0x49, 0x3c, 0x33, 0x33, 0x3f, 0x3c, 0x46, 0x43, 0x3f, 0x45, 0x31,
+    0x35, 0x43, 0x46, 0x3a, 0x45, 0x3c, 0x37, 0x3a, 0x37, 0x36, 0x35, 0x3f,
+    0x38, 0x49, 0x34, 0x3f, 0x3c, 0x42, 0x49, 0x3e, 0x3e, 0x3c, 0x39, 0x49,
+    0x3e, 0x3c, 0x3b, 0x43, 0x44, 0x45, 0x39, 0x4b, 0x47, 0x47, 0x3e, 0x33,
+    0x3c, 0x31, 0x34, 0x4f, 0x45, 0x43, 0x40, 0x3d, 0x42, 0x3b, 0x43, 0x50,
+    0x3c, 0x3b, 0x37, 0x42, 0x47, 0x42, 0x3e, 0x4a, 0x3f, 0x3a, 0x48, 0x3d,
+    0x48, 0x45, 0x3e, 0x40, 0x3a, 0x3c, 0x3d, 0x39, 0x41, 0x42, 0x3c, 0x42,
+    0x43, 0x3c, 0x3b, 0x3d, 0x47, 0x49, 0x38, 0x3c, 0x46, 0x3a, 0x3c, 0x3f,
+    0x3a, 0x46, 0x3a, 0x3b, 0x3d, 0x3a, 0x49, 0x46, 0x38, 0x40, 0x3e, 0x38,
+    0x37, 0x32, 0x40, 0x3c, 0x42, 0x3d, 0x3b, 0x40, 0x3a, 0x38, 0x49, 0x33,
+    0x40, 0x38, 0x2b, 0x3a, 0x3c, 0x4f, 0x4d, 0x3e, 0x35, 0x3d, 0x3b, 0x40,
+    0x3a, 0x54, 0x3e, 0x3e, 0x43, 0x30, 0x47, 0x3d, 0x3b, 0x53, 0x52, 0x4a,
+    0x43, 0x41, 0x49, 0x37, 0x3b, 0x35, 0x44, 0x3c, 0x45, 0x40, 0x4f, 0x36,
+    0x4b, 0x42, 0x41, 0x3a, 0x41, 0x44, 0x47, 0x32, 0x43, 0x35, 0x3f, 0x37,
+    0x43, 0x41, 0x43, 0x36, 0x3f, 0x3b, 0x3d, 0x38, 0x3d, 0x40, 0x42, 0x36,
+    0x44, 0x3a, 0x39, 0x47, 0x37, 0x34, 0x42, 0x3a, 0x37, 0x38, 0x37, 0x3f,
+    0x36, 0x3b, 0x45, 0x3f, 0x3f, 0x3d, 0x39, 0x3d, 0x39, 0x41, 0x37, 0x3f,
+    0x3f, 0x3d, 0x3f, 0x41, 0x43, 0x41, 0x45, 0x43, 0x41, 0x3c, 0x3e, 0x40,
+    0x40, 0x39, 0x41, 0x4f, 0x47, 0x42, 0x46, 0x48, 0x3b, 0x3b, 0x3c, 0x46,
+    0x47, 0x3e, 0x46, 0x37, 0x38, 0x3d, 0x38, 0x52, 0x36, 0x46, 0x3c, 0x3a,
+    0x3b, 0x37, 0x48, 0x4b, 0x3f, 0x42, 0x3c, 0x36, 0x40, 0x37, 0x33, 0x4c,
+    0x39, 0x34, 0x41, 0x34, 0x3f, 0x3b, 0x35, 0x4b, 0x3b, 0x45, 0x43, 0x31,
+    0x3e, 0x39, 0x30, 0x3d, 0x32, 0x43, 0x44, 0x3c, 0x3e, 0x38, 0x43, 0x41,
+    0x3e, 0x37, 0x41, 0x39, 0x39, 0x44, 0x43, 0x38, 0x3f, 0x37, 0x48, 0x3f,
+    0x3b, 0x44, 0x37, 0x3f, 0x3a, 0x3f, 0x3b, 0x33, 0x42, 0x3e, 0x2f, 0x42,
+    0x44, 0x4f, 0x52, 0x3c, 0x34, 0x33, 0x39, 0x46, 0x31, 0x55, 0x43, 0x4e,
+    0x49, 0x38, 0x4d, 0x48, 0x34, 0x4d, 0x5c, 0x4d, 0x49, 0x37, 0x4f, 0x40,
+    0x3c, 0x3d, 0x41, 0x42, 0x3f, 0x51, 0x4b, 0x2f, 0x46, 0x35, 0x39, 0x3c,
+    0x49, 0x3d, 0x4e, 0x32, 0x43, 0x47, 0x31, 0x3e, 0x42, 0x4a, 0x4c, 0x39,
+    0x43, 0x46, 0x3e, 0x3f, 0x44, 0x3c, 0x42, 0x30, 0x3e, 0x34, 0x3b, 0x3b,
+    0x3a, 0x3c, 0x42, 0x3d, 0x3d, 0x48, 0x48, 0x36, 0x3a, 0x45, 0x38, 0x40,
+    0x3c, 0x41, 0x3f, 0x49, 0x42, 0x41, 0x38, 0x3d, 0x3d, 0x44, 0x3b, 0x3d,
+    0x35, 0x48, 0x43, 0x3b, 0x32, 0x41, 0x3e, 0x3a, 0x46, 0x41, 0x40, 0x54,
+    0x38, 0x3f, 0x3c, 0x36, 0x3b, 0x36, 0x43, 0x50, 0x38, 0x3c, 0x44, 0x3b,
+    0x43, 0x47, 0x32, 0x50, 0x3d, 0x46, 0x3d, 0x3b, 0x39, 0x37, 0x3b, 0x4a,
+    0x47, 0x43, 0x46, 0x3d, 0x3d, 0x41, 0x43, 0x45, 0x3b, 0x3c, 0x39, 0x47,
+    0x43, 0x42, 0x39, 0x4c, 0x34, 0x41, 0x45, 0x3b, 0x38, 0x3e, 0x37, 0x3f,
+    0x45, 0x43, 0x39, 0x42, 0x3c, 0x3d, 0x3d, 0x3c, 0x48, 0x39, 0x3b, 0x3a,
+    0x46, 0x45, 0x3d, 0x3a, 0x3f, 0x3a, 0x45, 0x36, 0x3d, 0x43, 0x36, 0x43,
+    0x42, 0x3d, 0x41, 0x3f, 0x3a, 0x3f, 0x31, 0x37, 0x48, 0x4f, 0x4e, 0x36,
+    0x30, 0x3a, 0x3e, 0x3e, 0x38, 0x57, 0x40, 0x47, 0x47, 0x38, 0x4f, 0x46,
+    0x3d, 0x4a, 0x50, 0x4c, 0x42, 0x3b, 0x4d, 0x3d, 0x3d, 0x33, 0x40, 0x41,
+    0x48, 0x4b, 0x46, 0x39, 0x4d, 0x30, 0x45, 0x38, 0x48, 0x3c, 0x48, 0x3b,
+    0x4d, 0x40, 0x3b, 0x40, 0x46, 0x41, 0x51, 0x34, 0x40, 0x43, 0x3f, 0x42,
+    0x45, 0x42, 0x3e, 0x35, 0x3d, 0x38, 0x37, 0x3a, 0x42, 0x40, 0x43, 0x3c,
+    0x3c, 0x3d, 0x43, 0x40, 0x45, 0x3a, 0x3e, 0x3a, 0x3e, 0x40, 0x43, 0x35,
+    0x37, 0x3f, 0x3f, 0x3e, 0x39, 0x3f, 0x47, 0x38, 0x3e, 0x44, 0x3b, 0x3c,
+    0x3b, 0x32, 0x40, 0x3e, 0x42, 0x45, 0x3a, 0x52, 0x3a, 0x3e, 0x45, 0x40,
+    0x41, 0x48, 0x3f, 0x4e, 0x3e, 0x42, 0x3d, 0x39, 0x3a, 0x33, 0x3f, 0x4b,
+    0x3e, 0x38, 0x36, 0x3e, 0x31, 0x41, 0x3a, 0x40, 0x3b, 0x37, 0x3f, 0x3e,
+    0x3e, 0x3f, 0x35, 0x44, 0x3d, 0x42, 0x3d, 0x44, 0x42, 0x3f, 0x3e, 0x44,
+    0x3e, 0x45, 0x37, 0x3a, 0x3b, 0x42, 0x3f, 0x41, 0x3b, 0x3f, 0x41, 0x41,
+    0x3e, 0x34, 0x47, 0x39, 0x46, 0x46, 0x37, 0x39, 0x3f, 0x45, 0x39, 0x39,
+    0x3a, 0x40, 0x38, 0x3a, 0x31, 0x34, 0x3a, 0x41, 0x38, 0x41, 0x3a, 0x41,
+    0x44, 0x37, 0x2d, 0x41, 0x43, 0x4d, 0x4b, 0x3b, 0x2c, 0x30, 0x42, 0x3b,
+    0x31, 0x56, 0x43, 0x47, 0x47, 0x38, 0x50, 0x44, 0x40, 0x52, 0x5a, 0x50,
+    0x44, 0x3f, 0x4b, 0x35, 0x3a, 0x36, 0x41, 0x44, 0x47, 0x4e, 0x52, 0x36,
+    0x45, 0x39, 0x38, 0x3c, 0x42, 0x44, 0x40, 0x3b, 0x4b, 0x38, 0x35, 0x35,
+    0x3f, 0x40, 0x4f, 0x39, 0x3d, 0x37, 0x34, 0x3e, 0x41, 0x4c, 0x40, 0x37,
+    0x3d, 0x3b, 0x37, 0x37, 0x40, 0x42, 0x35, 0x39, 0x41, 0x42, 0x3d, 0x34,
+    0x3c, 0x37, 0x3a, 0x3d, 0x46, 0x46, 0x46, 0x3f, 0x44, 0x3d, 0x3c, 0x40,
+    0x3c, 0x3a, 0x3d, 0x3b, 0x3b, 0x41, 0x47, 0x3a, 0x43, 0x43, 0x43, 0x3b,
+    0x3e, 0x3e, 0x42, 0x46, 0x36, 0x37, 0x45, 0x35, 0x3c, 0x3b, 0x31, 0x4b,
+    0x3c, 0x3e, 0x3a, 0x3a, 0x42, 0x42, 0x34, 0x47, 0x37, 0x34, 0x41, 0x3d,
+    0x3e, 0x39, 0x43, 0x47, 0x31, 0x3b, 0x40, 0x3b, 0x42, 0x3d, 0x44, 0x44,
+    0x37, 0x39, 0x44, 0x3b, 0x40, 0x3a, 0x3d, 0x44, 0x3c, 0x40, 0x42, 0x3b,
+    0x40, 0x3e, 0x32, 0x3d, 0x3c, 0x3e, 0x44, 0x3e, 0x47, 0x3d, 0x3f, 0x2e,
+    0x3e, 0x3d, 0x3f, 0x3b, 0x3b, 0x43, 0x43, 0x3c, 0x3a, 0x3c, 0x3a, 0x36,
+    0x38, 0x46, 0x30, 0x3e, 0x3f, 0x35, 0x3e, 0x34, 0x3c, 0x34, 0x32, 0x4a,
+    0x41, 0x48, 0x48, 0x3f, 0x34, 0x37, 0x42, 0x43, 0x36, 0x59, 0x42, 0x3f,
+    0x4b, 0x3d, 0x5d, 0x45, 0x3b, 0x51, 0x51, 0x4c, 0x41, 0x40, 0x4d, 0x36,
+    0x3f, 0x34, 0x39, 0x3d, 0x4a, 0x4b, 0x4f, 0x33, 0x48, 0x32, 0x3c, 0x32,
+    0x48, 0x4c, 0x4d, 0x3a, 0x49, 0x3a, 0x3a, 0x2e, 0x4b, 0x44, 0x4f, 0x33,
+    0x3a, 0x48, 0x34, 0x43, 0x38, 0x45, 0x44, 0x35, 0x3b, 0x3f, 0x40, 0x37,
+    0x35, 0x34, 0x38, 0x3e, 0x41, 0x3e, 0x3b, 0x47, 0x41, 0x47, 0x3c, 0x3c,
+    0x39, 0x40, 0x3e, 0x45, 0x36, 0x41, 0x3f, 0x3f, 0x3c, 0x44, 0x3f, 0x43,
+    0x3d, 0x3c, 0x49, 0x42, 0x3e, 0x3f, 0x48, 0x37, 0x43, 0x37, 0x43, 0x3d,
+    0x32, 0x42, 0x44, 0x39, 0x36, 0x37, 0x40, 0x46, 0x47, 0x3d, 0x3a, 0x42,
+    0x3f, 0x38, 0x37, 0x48, 0x39, 0x40, 0x3c, 0x37, 0x33, 0x38, 0x38, 0x40,
+    0x41, 0x3c, 0x3f, 0x3b, 0x40, 0x3a, 0x47, 0x46, 0x3a, 0x37, 0x42, 0x47,
+    0x3b, 0x3f, 0x3b, 0x40, 0x33, 0x3f, 0x3a, 0x3c, 0x38, 0x3a, 0x36, 0x38,
+    0x36, 0x40, 0x48, 0x42, 0x48, 0x3c, 0x43, 0x36, 0x32, 0x3b, 0x34, 0x39,
+    0x38, 0x46, 0x37, 0x3b, 0x44, 0x34, 0x36, 0x38, 0x3c, 0x43, 0x33, 0x3c,
+    0x3b, 0x45, 0x38, 0x38, 0x44, 0x33, 0x36, 0x4a, 0x46, 0x4c, 0x4a, 0x34,
+    0x36, 0x37, 0x43, 0x42, 0x33, 0x58, 0x43, 0x48, 0x44, 0x38, 0x5f, 0x3f,
+    0x3c, 0x4d, 0x53, 0x52, 0x43, 0x47, 0x52, 0x3e, 0x3b, 0x2d, 0x3b, 0x3a,
+    0x4b, 0x49, 0x53, 0x38, 0x4c, 0x2f, 0x38, 0x31, 0x42, 0x40, 0x48, 0x3f,
+    0x44, 0x3c, 0x3c, 0x34, 0x46, 0x3f, 0x49, 0x3a, 0x43, 0x3d, 0x34, 0x42,
+    0x36, 0x47, 0x51, 0x3c, 0x3d, 0x39, 0x39, 0x3a, 0x3b, 0x35, 0x35, 0x41,
+    0x47, 0x3c, 0x3b, 0x43, 0x3f, 0x45, 0x3e, 0x40, 0x3c, 0x3f, 0x3c, 0x42,
+    0x3b, 0x3e, 0x38, 0x3f, 0x3f, 0x41, 0x39, 0x39, 0x3d, 0x43, 0x4f, 0x3d,
+    0x48, 0x3b, 0x44, 0x45, 0x3d, 0x3b, 0x49, 0x43, 0x44, 0x3d, 0x37, 0x3b,
+    0x3c, 0x45, 0x46, 0x44, 0x35, 0x3e, 0x32, 0x35, 0x34, 0x3b, 0x40, 0x43,
+    0x3e, 0x45, 0x37, 0x3d, 0x3f, 0x43, 0x36, 0x3f, 0x3f, 0x43, 0x39, 0x44,
+    0x3e, 0x3e, 0x45, 0x40, 0x3e, 0x44, 0x3b, 0x3e, 0x42, 0x42, 0x3b, 0x3d,
+    0x3a, 0x40, 0x39, 0x3a, 0x32, 0x36, 0x41, 0x30, 0x39, 0x46, 0x33, 0x3f,
+    0x46, 0x40, 0x3c, 0x31, 0x41, 0x3a, 0x3f, 0x3f, 0x3b, 0x36, 0x3f, 0x38,
+    0x36, 0x3e, 0x35, 0x35, 0x3b, 0x3d, 0x3f, 0x39, 0x46, 0x37, 0x3a, 0x47,
+    0x37, 0x39, 0x2c, 0x55, 0x40, 0x4b, 0x4a, 0x39, 0x35, 0x42, 0x3d, 0x40,
+    0x3a, 0x54, 0x41, 0x48, 0x51, 0x3b, 0x61, 0x3e, 0x3e, 0x4d, 0x51, 0x52,
+    0x3e, 0x43, 0x52, 0x41, 0x48, 0x2d, 0x35, 0x35, 0x4b, 0x44, 0x4d, 0x3c,
+    0x54, 0x33, 0x39, 0x27, 0x4a, 0x44, 0x4a, 0x41, 0x3c, 0x3a, 0x31, 0x2f,
+    0x3d, 0x42, 0x48, 0x3f, 0x42, 0x40, 0x44, 0x3b, 0x40, 0x3e, 0x49, 0x3a,
+    0x3c, 0x35, 0x30, 0x3e, 0x3e, 0x3d, 0x36, 0x3a, 0x3e, 0x3a, 0x4a, 0x3e,
+    0x3d, 0x49, 0x40, 0x43, 0x3e, 0x45, 0x3f, 0x3c, 0x3b, 0x42, 0x3a, 0x39,
+    0x3b, 0x47, 0x3f, 0x39, 0x49, 0x46, 0x3d, 0x34, 0x32, 0x44, 0x46, 0x42,
+    0x47, 0x39, 0x49, 0x48, 0x3b, 0x38, 0x45, 0x45, 0x37, 0x38, 0x46, 0x46,
+    0x37, 0x42, 0x35, 0x34, 0x45, 0x42, 0x35, 0x43, 0x3b, 0x3a, 0x43, 0x43,
+    0x40, 0x42, 0x35, 0x3f, 0x38, 0x3f, 0x3a, 0x3a, 0x3b, 0x3f, 0x3e, 0x36,
+    0x3f, 0x3c, 0x48, 0x3b, 0x3a, 0x41, 0x41, 0x35, 0x33, 0x3f, 0x3b, 0x45,
+    0x48, 0x36, 0x40, 0x38, 0x47, 0x3d, 0x35, 0x40, 0x41, 0x42, 0x41, 0x37,
+    0x41, 0x3e, 0x36, 0x48, 0x3e, 0x3c, 0x32, 0x39, 0x41, 0x40, 0x38, 0x3f,
+    0x46, 0x43, 0x33, 0x40, 0x43, 0x43, 0x3a, 0x49, 0x3f, 0x35, 0x2c, 0x5d,
+    0x43, 0x49, 0x52, 0x3b, 0x3c, 0x41, 0x40, 0x4a, 0x33, 0x50, 0x41, 0x46,
+    0x52, 0x41, 0x68, 0x48, 0x44, 0x53, 0x54, 0x55, 0x42, 0x42, 0x57, 0x44,
+    0x47, 0x35, 0x35, 0x3e, 0x4b, 0x44, 0x4e, 0x38, 0x55, 0x2f, 0x36, 0x2d,
+    0x40, 0x48, 0x4b, 0x41, 0x48, 0x36, 0x32, 0x32, 0x44, 0x42, 0x47, 0x42,
+    0x48, 0x3d, 0x3d, 0x39, 0x3e, 0x35, 0x4b, 0x39, 0x38, 0x3a, 0x39, 0x46,
+    0x38, 0x3f, 0x3a, 0x42, 0x4b, 0x45, 0x3e, 0x32, 0x46, 0x43, 0x3b, 0x40,
+    0x45, 0x41, 0x3e, 0x43, 0x37, 0x3d, 0x43, 0x3b, 0x46, 0x48, 0x42, 0x3b,
+    0x3d, 0x48, 0x4a, 0x3c, 0x3b, 0x42, 0x40, 0x3c, 0x3a, 0x42, 0x38, 0x47,
+    0x3b, 0x3b, 0x3d, 0x41, 0x3f, 0x38, 0x3f, 0x4a, 0x44, 0x3f, 0x47, 0x3a,
+    0x47, 0x44, 0x43, 0x43, 0x34, 0x3d, 0x3a, 0x3c, 0x47, 0x3f, 0x3e, 0x39,
+    0x42, 0x4a, 0x40, 0x36, 0x40, 0x41, 0x42, 0x3f, 0x3f, 0x43, 0x39, 0x38,
+    0x3c, 0x3b, 0x4c, 0x2f, 0x41, 0x39, 0x40, 0x42, 0x3f, 0x42, 0x40, 0x36,
+    0x3b, 0x45, 0x41, 0x41, 0x44, 0x45, 0x42, 0x37, 0x3d, 0x3a, 0x33, 0x3e,
+    0x3b, 0x3b, 0x3c, 0x3d, 0x38, 0x49, 0x44, 0x39, 0x3f, 0x48, 0x3d, 0x41,
+    0x42, 0x43, 0x44, 0x3e, 0x41, 0x3d, 0x32, 0x59, 0x45, 0x4b, 0x4b, 0x38,
+    0x37, 0x3d, 0x48, 0x42, 0x3d, 0x52, 0x43, 0x46, 0x54, 0x48, 0x67, 0x4d,
+    0x45, 0x4e, 0x49, 0x52, 0x45, 0x45, 0x58, 0x3b, 0x41, 0x38, 0x3f, 0x3f,
+    0x49, 0x44, 0x4f, 0x48, 0x57, 0x31, 0x3c, 0x2a, 0x3e, 0x4c, 0x41, 0x40,
+    0x47, 0x3f, 0x33, 0x34, 0x3f, 0x42, 0x48, 0x43, 0x4b, 0x38, 0x39, 0x3d,
+    0x3f, 0x3e, 0x4b, 0x3f, 0x35, 0x36, 0x3c, 0x46, 0x3c, 0x45, 0x37, 0x3b,
+    0x3c, 0x39, 0x41, 0x40, 0x41, 0x43, 0x44, 0x41, 0x45, 0x4f, 0x44, 0x43,
+    0x44, 0x3c, 0x45, 0x34, 0x42, 0x45, 0x3f, 0x46, 0x3f, 0x43, 0x3d, 0x3a,
+    0x39, 0x47, 0x45, 0x3d, 0x3f, 0x3b, 0x3d, 0x42, 0x38, 0x48, 0x48, 0x3b,
+    0x3c, 0x3a, 0x3f, 0x41, 0x44, 0x4b, 0x44, 0x48, 0x41, 0x3c, 0x3d, 0x3c,
+    0x3e, 0x3a, 0x4a, 0x3b, 0x49, 0x35, 0x3a, 0x3d, 0x41, 0x3f, 0x49, 0x39,
+    0x44, 0x37, 0x3f, 0x3c, 0x42, 0x40, 0x4a, 0x46, 0x39, 0x38, 0x46, 0x37,
+    0x41, 0x46, 0x41, 0x45, 0x40, 0x3b, 0x3b, 0x33, 0x3b, 0x39, 0x3c, 0x43,
+    0x37, 0x3c, 0x44, 0x3d, 0x46, 0x39, 0x3c, 0x3c, 0x44, 0x48, 0x41, 0x44,
+    0x41, 0x43, 0x46, 0x3b, 0x47, 0x41, 0x31, 0x41, 0x44, 0x40, 0x43, 0x42,
+    0x3e, 0x43, 0x34, 0x65, 0x4f, 0x50, 0x4d, 0x3a, 0x37, 0x43, 0x4d, 0x4a,
+    0x3d, 0x54, 0x40, 0x42, 0x5b, 0x3b, 0x71, 0x49, 0x44, 0x4f, 0x54, 0x56,
+    0x48, 0x40, 0x52, 0x41, 0x42, 0x38, 0x3c, 0x49, 0x4a, 0x45, 0x51, 0x35,
+    0x54, 0x2f, 0x35, 0x25, 0x4d, 0x3f, 0x4d, 0x43, 0x49, 0x33, 0x32, 0x3a,
+    0x46, 0x48, 0x48, 0x3d, 0x43, 0x3a, 0x3c, 0x3a, 0x48, 0x40, 0x4b, 0x3b,
+    0x45, 0x3b, 0x3f, 0x38, 0x37, 0x41, 0x31, 0x3b, 0x41, 0x43, 0x43, 0x37,
+    0x48, 0x3f, 0x48, 0x37, 0x40, 0x4a, 0x43, 0x45, 0x3d, 0x39, 0x37, 0x37,
+    0x3c, 0x3f, 0x47, 0x48, 0x43, 0x3e, 0x41, 0x3f, 0x3e, 0x38, 0x3e, 0x37,
+    0x45, 0x45, 0x35, 0x44, 0x38, 0x3a, 0x49, 0x43, 0x40, 0x41, 0x40, 0x44,
+    0x3c, 0x3e, 0x40, 0x38, 0x42, 0x41, 0x3c, 0x41, 0x3a, 0x3b, 0x3c, 0x3a,
+    0x49, 0x3c, 0x42, 0x44, 0x3f, 0x39, 0x45, 0x32, 0x45, 0x43, 0x45, 0x39,
+    0x43, 0x41, 0x4b, 0x39, 0x32, 0x3c, 0x3c, 0x36, 0x39, 0x3f, 0x46, 0x32,
+    0x39, 0x35, 0x4f, 0x32, 0x3e, 0x40, 0x3d, 0x3e, 0x3a, 0x39, 0x4c, 0x38,
+    0x43, 0x38, 0x49, 0x3b, 0x33, 0x39, 0x3b, 0x36, 0x36, 0x43, 0x3b, 0x3c,
+    0x32, 0x3c, 0x3a, 0x45, 0x31, 0x3d, 0x37, 0x40, 0x3f, 0x3f, 0x35, 0xff,
+    0x49, 0x4e, 0x4c, 0x3c, 0x36, 0x43, 0x46, 0x45, 0x41, 0x59, 0x44, 0x4a,
+    0x53, 0x44, 0x71, 0x4a, 0x39, 0x4f, 0x50, 0x4b, 0x47, 0x42, 0x5a, 0x3c,
+    0x45, 0x38, 0x3e, 0x42, 0x53, 0x43, 0x52, 0x3a, 0x52, 0x34, 0x31, 0x20,
+    0x49, 0x4e, 0x46, 0x43, 0x4b, 0x3d, 0x2b, 0x27, 0x46, 0x46, 0x47, 0x41,
+    0x42, 0x37, 0x39, 0x38, 0x45, 0x3f, 0x51, 0x3d, 0x48, 0x3f, 0x33, 0x3f,
+    0x38, 0x45, 0x31, 0x38, 0x41, 0x3d, 0x47, 0x39, 0x42, 0x40, 0x4c, 0x3f,
+    0x40, 0x42, 0x41, 0x41, 0x41, 0x42, 0x39, 0x35, 0x3f, 0x46, 0x45, 0x36,
+    0x3f, 0x43, 0x3b, 0x39, 0x41, 0x38, 0x43, 0x37, 0x3d, 0x44, 0x3b, 0x40,
+    0x36, 0x3d, 0x42, 0x41, 0x41, 0x3d, 0x38, 0x4a, 0x40, 0x4a, 0x4c, 0x38,
+    0x3f, 0x40, 0x45, 0x3c, 0x3f, 0x4b, 0x43, 0x41, 0x43, 0x3e, 0x43, 0x3f,
+    0x36, 0x40, 0x40, 0x39, 0x3f, 0x3a, 0x3a, 0x30, 0x41, 0x3c, 0x3c, 0x34,
+    0x46, 0x38, 0x43, 0x34, 0x3a, 0x42, 0x43, 0x42, 0x40, 0x41, 0x49, 0x34,
+    0x35, 0x40, 0x47, 0x3d, 0x3d, 0x3e, 0x4c, 0x33, 0x3c, 0x3b, 0x39, 0x43,
+    0x3a, 0x3e, 0x3b, 0x37, 0x3f, 0x42, 0x31, 0x3d, 0x41, 0x3e, 0x32, 0x47,
+    0x34, 0x41, 0x3d, 0x35, 0x39, 0x40, 0x38, 0x69, 0x4f, 0x4a, 0x49, 0x37,
+    0x37, 0x44, 0x43, 0x46, 0x40, 0x58, 0x43, 0x48, 0x54, 0x46, 0x6c, 0x50,
+    0x3a, 0x50, 0x50, 0x57, 0x47, 0x46, 0x5c, 0x40, 0x40, 0x39, 0x3e, 0x46,
+    0x53, 0x46, 0x5c, 0x36, 0x4f, 0x32, 0x30, 0x2d, 0x4a, 0x48, 0x41, 0x45,
+    0x47, 0x2f, 0x32, 0x2b, 0x43, 0x40, 0x43, 0x3c, 0x40, 0x44, 0x3e, 0x37,
+    0x39, 0x3e, 0x48, 0x42, 0x45, 0x36, 0x47, 0x3f, 0x3b, 0x41, 0x35, 0x35,
+    0x3b, 0x3e, 0x35, 0x43, 0x3e, 0x41, 0x3d, 0x36, 0x41, 0x3c, 0x40, 0x44,
+    0x3d, 0x40, 0x35, 0x32, 0x48, 0x3e, 0x39, 0x42, 0x44, 0x3d, 0x39, 0x3b,
+    0x3b, 0x45, 0x40, 0x4a, 0x3f, 0x41, 0x43, 0x39, 0x42, 0x44, 0x4c, 0x3c,
+    0x3f, 0x3e, 0x3f, 0x43, 0x40, 0x42, 0x4c, 0x3b, 0x3e, 0x3d, 0x49, 0x42,
+    0x40, 0x44, 0x40, 0x34, 0x36, 0x40, 0x45, 0x39, 0x42, 0x40, 0x3e, 0x44,
+    0x45, 0x37, 0x3c, 0x38, 0x3e, 0x49, 0x3e, 0x3c, 0x41, 0x3d, 0x42, 0x32,
+    0x40, 0x45, 0x3e, 0x36, 0x44, 0x3a, 0x4e, 0x38, 0x43, 0x38, 0x40, 0x38,
+    0x49, 0x42, 0x40, 0x3d, 0x42, 0x48, 0x48, 0x3d, 0x41, 0x3a, 0x3f, 0x41,
+    0x38, 0x3c, 0x44, 0x39, 0x3a, 0x32, 0x3a, 0x3e, 0x3d, 0x3b, 0x39, 0x38,
+    0x3a, 0x43, 0x3a, 0x6b, 0x45, 0x50, 0x47, 0x33, 0x38, 0x48, 0x4d, 0x4f,
+    0x39, 0x4b, 0x46, 0x4a, 0x4f, 0x42, 0x6f, 0x4b, 0x40, 0x55, 0x54, 0x50,
+    0x42, 0x47, 0x5e, 0x46, 0x40, 0x34, 0x40, 0x47, 0x52, 0x46, 0x55, 0x3b,
+    0x4f, 0x2b, 0x35, 0x33, 0x4c, 0x44, 0x44, 0x48, 0x47, 0x37, 0x35, 0x27,
+    0x4a, 0x3b, 0x41, 0x40, 0x40, 0x3e, 0x36, 0x39, 0x3e, 0x3c, 0x45, 0x3f,
+    0x4d, 0x41, 0x3d, 0x48, 0x47, 0x46, 0x33, 0x3d, 0x3d, 0x3e, 0x34, 0x3f,
+    0x3e, 0x3a, 0x41, 0x35, 0x3b, 0x3e, 0x42, 0x3c, 0x42, 0x42, 0x40, 0x31,
+    0x37, 0x40, 0x36, 0x42, 0x48, 0x39, 0x3d, 0x3c, 0x3a, 0x43, 0x39, 0x3d,
+    0x47, 0x49, 0x43, 0x3d, 0x45, 0x39, 0x44, 0x37, 0x3e, 0x4d, 0x3d, 0x40,
+    0x3d, 0x4c, 0x4d, 0x44, 0x3c, 0x3d, 0x46, 0x41, 0x41, 0x42, 0x40, 0x40,
+    0x41, 0x3a, 0x3c, 0x3b, 0x3c, 0x44, 0x40, 0x34, 0x44, 0x38, 0x3b, 0x33,
+    0x45, 0x45, 0x44, 0x3f, 0x3e, 0x3a, 0x3b, 0x3b, 0x43, 0x39, 0x3a, 0x45,
+    0x3b, 0x3a, 0x4b, 0x39, 0x3d, 0x38, 0x41, 0x39, 0x42, 0x45, 0x43, 0x40,
+    0x3e, 0x35, 0x44, 0x3f, 0x45, 0x41, 0x40, 0x3e, 0x43, 0x42, 0x37, 0x3a,
+    0x38, 0x35, 0x3a, 0x48, 0x3e, 0x3b, 0x40, 0x38, 0x3c, 0x3c, 0x3b, 0x6a,
+    0x48, 0x4d, 0x4d, 0x34, 0x38, 0x40, 0x4a, 0x45, 0x3c, 0x4f, 0x41, 0x4b,
+    0x58, 0x46, 0x71, 0x49, 0x3d, 0x53, 0x44, 0x52, 0x42, 0x3e, 0x57, 0x4c,
+    0x4c, 0x38, 0x40, 0x3b, 0x5c, 0x4c, 0x52, 0x3e, 0x4c, 0x2d, 0x32, 0x37,
+    0x49, 0x3f, 0x41, 0x47, 0x4a, 0x3b, 0x2f, 0x26, 0x45, 0x40, 0x47, 0x42,
+    0x3d, 0x39, 0x2d, 0x2c, 0x3f, 0x45, 0x46, 0x44, 0x48, 0x43, 0x42, 0x48,
+    0x40, 0x41, 0x3b, 0x3b, 0x41, 0x3b, 0x39, 0x40, 0x3b, 0x47, 0x3f, 0x38,
+    0x3f, 0x49, 0x3b, 0x35, 0x40, 0x45, 0x38, 0x35, 0x36, 0x34, 0x3e, 0x3d,
+    0x46, 0x3e, 0x33, 0x38, 0x43, 0x48, 0x3f, 0x45, 0x31, 0x44, 0x38, 0x35,
+    0x3c, 0x41, 0x4b, 0x44, 0x3d, 0x43, 0x38, 0x48, 0x3c, 0x39, 0x4a, 0x42,
+    0x3d, 0x43, 0x3f, 0x49, 0x3e, 0x47, 0x49, 0x41, 0x3b, 0x3c, 0x47, 0x3a,
+    0x3d, 0x40, 0x4a, 0x38, 0x3d, 0x3b, 0x47, 0x3a, 0x36, 0x47, 0x42, 0x46,
+    0x3c, 0x3d, 0x45, 0x3b, 0x48, 0x3f, 0x38, 0x36, 0x39, 0x46, 0x43, 0x3a,
+    0x41, 0x3d, 0x39, 0x39, 0x46, 0x37, 0x3f, 0x3f, 0x3a, 0x46, 0x3f, 0x39,
+    0x49, 0x44, 0x42, 0x3a, 0x3a, 0x43, 0x3e, 0x42, 0x3d, 0x3d, 0x43, 0x40,
+    0x43, 0x3c, 0x3f, 0x43, 0x40, 0x42, 0x3b, 0x57, 0x4a, 0x4f, 0x4a, 0x2d,
+    0x3b, 0x48, 0x45, 0x42, 0x34, 0x4c, 0x3e, 0x4f, 0x4d, 0x40, 0x6c, 0x4b,
+    0x3b, 0x4d, 0x4c, 0x57, 0x49, 0x3d, 0x5d, 0x44, 0x43, 0x29, 0x42, 0x3f,
+    0x5b, 0x47, 0x4f, 0x3e, 0x54, 0x2e, 0x34, 0x34, 0x4b, 0x47, 0x46, 0x46,
+    0x4b, 0x34, 0x36, 0x28, 0x3e, 0x3f, 0x42, 0x40, 0x3b, 0x38, 0x39, 0x42,
+    0x49, 0x3d, 0x49, 0x47, 0x47, 0x3b, 0x43, 0x34, 0x39, 0x36, 0x42, 0x3d,
+    0x37, 0x40, 0x37, 0x38, 0x46, 0x42, 0x49, 0x37, 0x44, 0x3f, 0x38, 0x3e,
+    0x36, 0x32, 0x33, 0x38, 0x40, 0x46, 0x42, 0x34, 0x41, 0x42, 0x3e, 0x38,
+    0x44, 0x3e, 0x3f, 0x43, 0x3f, 0x43, 0x35, 0x3f, 0x4d, 0x3b, 0x43, 0x39,
+    0x40, 0x47, 0x3f, 0x4a, 0x3a, 0x3f, 0x45, 0x45, 0x48, 0x42, 0x3b, 0x47,
+    0x42, 0x4b, 0x47, 0x3e, 0x3c, 0x42, 0x46, 0x39, 0x41, 0x3f, 0x48, 0x33,
+    0x45, 0x34, 0x3d, 0x30, 0x40, 0x4c, 0x40, 0x40, 0x39, 0x37, 0x40, 0x33,
+    0x49, 0x42, 0x45, 0x38, 0x3c, 0x43, 0x45, 0x35, 0x37, 0x33, 0x34, 0x3b,
+    0x3b, 0x38, 0x39, 0x41, 0x42, 0x40, 0x3e, 0x3e, 0x41, 0x33, 0x3a, 0x36,
+    0x40, 0x3a, 0x3c, 0x45, 0x43, 0x3c, 0x40, 0x41, 0x49, 0x47, 0x35, 0x34,
+    0x3a, 0x3d, 0x3a, 0x68, 0x4f, 0x48, 0x43, 0x36, 0x37, 0x3e, 0x45, 0x49,
+    0x3a, 0x4d, 0x41, 0x3d, 0x46, 0x45, 0x65, 0x46, 0x38, 0x4d, 0x4a, 0x53,
+    0x43, 0x41, 0x5d, 0x47, 0x41, 0x34, 0x39, 0x43, 0x4e, 0x48, 0x50, 0x38,
+    0x53, 0x32, 0x30, 0x2e, 0x49, 0x4c, 0x4d, 0x3f, 0x46, 0x38, 0x34, 0x2b,
+    0x44, 0x44, 0x41, 0x41, 0x36, 0x40, 0x3f, 0x32, 0x46, 0x38, 0x50, 0x45,
+    0x3f, 0x3d, 0x3b, 0x36, 0x3b, 0x43, 0x3a, 0x34, 0x36, 0x3f, 0x39, 0x35,
+    0x3c, 0x40, 0x40, 0x37, 0x3c, 0x39, 0x3d, 0x36, 0x48, 0x3d, 0x43, 0x34,
+    0x3b, 0x46, 0x43, 0x41, 0x33, 0x3e, 0x44, 0x3d, 0x44, 0x44, 0x4c, 0x3c,
+    0x37, 0x49, 0x42, 0x35, 0x45, 0x3a, 0x3c, 0x41, 0x3a, 0x45, 0x46, 0x41,
+    0x3c, 0x48, 0x46, 0x36, 0x36, 0x42, 0x3b, 0x46, 0x42, 0x45, 0x44, 0x47,
+    0x3f, 0x44, 0x3a, 0x35, 0x37, 0x46, 0x40, 0x38, 0x40, 0x3d, 0x36, 0x2c,
+    0x34, 0x47, 0x40, 0x38, 0x3f, 0x3f, 0x44, 0x2d, 0x3b, 0x3d, 0x3e, 0x44,
+    0x3c, 0x40, 0x3e, 0x33, 0x3c, 0x3a, 0x49, 0x40, 0x42, 0x42, 0x3a, 0x3b,
+    0x33, 0x3d, 0x3c, 0x43, 0x3e, 0x3d, 0x3a, 0x3a, 0x48, 0x3e, 0x3c, 0x39,
+    0x3f, 0x44, 0x37, 0x40, 0x3f, 0x3c, 0x3e, 0x3d, 0x38, 0x42, 0x34, 0x62,
+    0x51, 0x47, 0x44, 0x3f, 0x32, 0x3c, 0x3f, 0x46, 0x3d, 0x46, 0x3e, 0x45,
+    0x4a, 0x3e, 0x5d, 0x43, 0x45, 0x49, 0x4a, 0x55, 0x41, 0x3c, 0x5a, 0x44,
+    0x43, 0x3b, 0x3c, 0x3a, 0x4b, 0x4e, 0x4d, 0x42, 0x49, 0x30, 0x3b, 0x38,
+    0x42, 0x44, 0x51, 0x40, 0x48, 0x33, 0x3f, 0x2b, 0x3c, 0x41, 0x3c, 0x45,
+    0x35, 0x39, 0x42, 0x37, 0x40, 0x46, 0x46, 0x3f, 0x41, 0x45, 0x42, 0x3d,
+    0x43, 0x38, 0x3e, 0x38, 0x3c, 0x39, 0x40, 0x38, 0x37, 0x36, 0x3d, 0x3d,
+    0x38, 0x47, 0x45, 0x3b, 0x45, 0x44, 0x42, 0x2e, 0x37, 0x40, 0x42, 0x42,
+    0x3c, 0x36, 0x3b, 0x39, 0x44, 0x4d, 0x42, 0x3f, 0x3a, 0x3e, 0x45, 0x34,
+    0x3c, 0x43, 0x47, 0x43, 0x3f, 0x48, 0x3b, 0x44, 0x3d, 0x44, 0x43, 0x3e,
+    0x40, 0x4a, 0x31, 0x42, 0x42, 0x43, 0x48, 0x45, 0x3a, 0x42, 0x36, 0x2f,
+    0x3c, 0x3e, 0x3b, 0x3b, 0x44, 0x3f, 0x3a, 0x2c, 0x47, 0x3f, 0x4a, 0x40,
+    0x40, 0x40, 0x3c, 0x2a, 0x3e, 0x44, 0x40, 0x43, 0x3a, 0x42, 0x39, 0x34,
+    0x49, 0x3e, 0x36, 0x42, 0x3f, 0x42, 0x33, 0x3b, 0x3c, 0x45, 0x39, 0x3f,
+    0x3e, 0x3f, 0x41, 0x3d, 0x32, 0x3b, 0x31, 0x40, 0x3f, 0x44, 0x3c, 0x3f,
+    0x40, 0x46, 0x45, 0x36, 0x36, 0x42, 0x30, 0x57, 0x47, 0x44, 0x48, 0x3f,
+    0x35, 0x37, 0x3f, 0x3f, 0x38, 0x4a, 0x41, 0x46, 0x50, 0x3d, 0x5b, 0x41,
+    0x3e, 0x3c, 0x4a, 0x54, 0x45, 0x41, 0x5b, 0x46, 0x3d, 0x3b, 0x43, 0x33,
+    0x45, 0x4e, 0x43, 0x3b, 0x44, 0x37, 0x37, 0x32, 0x4c, 0x3d, 0x4c, 0x3f,
+    0x49, 0x3b, 0x37, 0x3a, 0x33, 0x43, 0x3f, 0x40, 0x44, 0x36, 0x3b, 0x44,
+    0x45, 0x40, 0x3c, 0x3c, 0x41, 0x44, 0x3b, 0x3d, 0x33, 0x37, 0x3c, 0x35,
+    0x3d, 0x3f, 0x39, 0x38, 0x33, 0x43, 0x3e, 0x39, 0x3b, 0x3e, 0x41, 0x35,
+    0x40, 0x46, 0x43, 0x35, 0x41, 0x3d, 0x32, 0x39, 0x3c, 0x40, 0x3e, 0x3f,
+    0x42, 0x38, 0x3b, 0x45, 0x3a, 0x3d, 0x40, 0x36, 0x3a, 0x40, 0x46, 0x44,
+    0x48, 0x45, 0x3f, 0x3a, 0x45, 0x45, 0x3c, 0x3b, 0x40, 0x4c, 0x39, 0x3a,
+    0x38, 0x39, 0x46, 0x3a, 0x3e, 0x4b, 0x34, 0x39, 0x3d, 0x3f, 0x40, 0x39,
+    0x45, 0x31, 0x45, 0x29, 0x3f, 0x38, 0x3a, 0x3f, 0x38, 0x3b, 0x36, 0x2d,
+    0x43, 0x3d, 0x45, 0x3c, 0x46, 0x3f, 0x40, 0x3c, 0x3a, 0x3e, 0x3d, 0x38,
+    0x3f, 0x3c, 0x3f, 0x42, 0x35, 0x3f, 0x3a, 0x43, 0x3d, 0x43, 0x3d, 0x33,
+    0x3d, 0x48, 0x42, 0x3d, 0x45, 0x46, 0x3d, 0x35, 0x32, 0x44, 0x42, 0x37,
+    0x3d, 0x40, 0x3c, 0x47, 0x4a, 0x45, 0x47, 0x2f, 0x33, 0x36, 0x3f, 0x42,
+    0x38, 0x43, 0x3e, 0x3a, 0x41, 0x3f, 0x5f, 0x3f, 0x48, 0x3a, 0x44, 0x47,
+    0x41, 0x3e, 0x57, 0x42, 0x41, 0x33, 0x34, 0x39, 0x42, 0x44, 0x42, 0x3c,
+    0x49, 0x34, 0x37, 0x33, 0x47, 0x38, 0x43, 0x3d, 0x43, 0x3e, 0x3e, 0x36,
+    0x41, 0x41, 0x37, 0x40, 0x39, 0x3e, 0x3b, 0x3b, 0x3e, 0x41, 0x3d, 0x3b,
+    0x43, 0x3e, 0x39, 0x43, 0x2f, 0x3e, 0x33, 0x40, 0x45, 0x47, 0x30, 0x46,
+    0x3f, 0x3f, 0x37, 0x42, 0x3d, 0x42, 0x43, 0x37, 0x38, 0x3c, 0x35, 0x34,
+    0x41, 0x43, 0x3e, 0x3e, 0x3f, 0x49, 0x35, 0x35, 0x38, 0x36, 0x3a, 0x43,
+    0x38, 0x46, 0x48, 0x36, 0x3f, 0x39, 0x3b, 0x3e, 0x48, 0x47, 0x41, 0x34,
+    0x3b, 0x3c, 0x37, 0x3e, 0x40, 0x41, 0x3b, 0x3d, 0x43, 0x42, 0x3a, 0x39,
+    0x3b, 0x43, 0x38, 0x2b, 0x43, 0x41, 0x48, 0x35, 0x44, 0x44, 0x3e, 0x2c,
+    0x46, 0x40, 0x3e, 0x41, 0x38, 0x34, 0x35, 0x37, 0x34, 0x3f, 0x3d, 0x46,
+    0x33, 0x3c, 0x3c, 0x2e, 0x3b, 0x45, 0x3d, 0x3e, 0x3a, 0x42, 0x3c, 0x36,
+    0x3a, 0x42, 0x39, 0x43, 0x35, 0x39, 0x40, 0x44, 0x47, 0x41, 0x44, 0x3d,
+    0x41, 0x3e, 0x38, 0x39, 0x45, 0x3a, 0x35, 0x43, 0x3f, 0x44, 0x41, 0x49,
+    0x47, 0x3f, 0x44, 0x40, 0x38, 0x43, 0x40, 0x3e, 0x39, 0x42, 0x32, 0x3b,
+    0x42, 0x47, 0x57, 0x37, 0x36, 0x38, 0x43, 0x49, 0x3b, 0x34, 0x54, 0x42,
+    0x3d, 0x3f, 0x3e, 0x3b, 0x38, 0x41, 0x43, 0x3a, 0x44, 0x39, 0x34, 0x2c,
+    0x38, 0x43, 0x4b, 0x3f, 0x40, 0x3e, 0x32, 0x33, 0x3d, 0x44, 0x45, 0x44,
+    0x3e, 0x35, 0x37, 0x39, 0x40, 0x3e, 0x40, 0x3c, 0x34, 0x43, 0x37, 0x40,
+    0x39, 0x3e, 0x3d, 0x43, 0x3a, 0x44, 0x43, 0x44, 0x3d, 0x3b, 0x45, 0x3b,
+    0x3a, 0x3a, 0x3f, 0x37, 0x43, 0x3b, 0x33, 0x35, 0x40, 0x47, 0x3e, 0x3c,
+    0x39, 0x3c, 0x34, 0x29, 0x3c, 0x3e, 0x46, 0x3e, 0x3c, 0x38, 0x3f, 0x2d,
+    0x3d, 0x3d, 0x3f, 0x3f, 0x3d, 0x45, 0x3b, 0x32, 0x39, 0x3f, 0x41, 0x38,
+    0x36, 0x3e, 0x3a, 0x35, 0x40, 0x3f, 0x3b, 0x32, 0x3c, 0x39, 0x3e, 0x35,
+    0x3e, 0x45, 0x34, 0x38, 0x44, 0x39, 0x3f, 0x31, 0x34, 0x39, 0x3f, 0x38,
+    0x44, 0x42, 0x3f, 0x3b, 0x39, 0x3d, 0x39, 0x3b, 0x44, 0x46, 0x38, 0x3d,
+    0x45, 0x37, 0x40, 0x3a, 0x3a, 0x39, 0x35, 0x3c, 0x39, 0x40, 0x47, 0x3e,
+    0x38, 0x42, 0x41, 0x3b, 0x48, 0x3f, 0x3a, 0x3e, 0x3d, 0x3f, 0x32, 0x3b,
+    0x3f, 0x3d, 0x3e, 0x44, 0x43, 0x41, 0x44, 0x47, 0x48, 0x41, 0x41, 0x36,
+    0x3a, 0x33, 0x3c, 0x3c, 0x37, 0x3e, 0x40, 0x34, 0x3f, 0x42, 0x53, 0x40,
+    0x3f, 0x35, 0x3e, 0x46, 0x3a, 0x3e, 0x4b, 0x41, 0x46, 0x32, 0x39, 0x36,
+    0x3b, 0x4f, 0x36, 0x3c, 0x40, 0x3a, 0x40, 0x40, 0x47, 0x3e, 0x49, 0x37,
+    0x3f, 0x31, 0x3e, 0x40, 0x3b, 0x3f, 0x43, 0x44, 0x3a, 0x3d, 0x31, 0x41,
+    0x41, 0x33, 0x43, 0x40, 0x3c, 0x3a, 0x41, 0x40, 0x37, 0x3f, 0x34, 0x3e,
+    0x44, 0x42, 0x3d, 0x3f, 0x3f, 0x34, 0x36, 0x34, 0x31, 0x41, 0x32, 0x39,
+    0x3e, 0x3d, 0x42, 0x35, 0x3e, 0x3a, 0x41, 0x47, 0x3d, 0x42, 0x33, 0x32,
+    0x43, 0x42, 0x36, 0x41, 0x3e, 0x39, 0x46, 0x39, 0x35, 0x3d, 0x3d, 0x40,
+    0x38, 0x44, 0x3d, 0x31, 0x44, 0x39, 0x3a, 0x45, 0x42, 0x41, 0x3d, 0x36,
+    0x3f, 0x3c, 0x39, 0x3d, 0x32, 0x39, 0x42, 0x34, 0x3f, 0x38, 0x44, 0x3c,
+    0x43, 0x45, 0x41, 0x2d, 0x44, 0x42, 0x3d, 0x3f, 0x44, 0x38, 0x3d, 0x35,
+    0x3a, 0x48, 0x40, 0x3b, 0x3d, 0x36, 0x3b, 0x40, 0x3f, 0x3a, 0x3a, 0x3f,
+    0x3c, 0x33, 0x39, 0x3c, 0x3c, 0x38, 0x47, 0x36, 0x3d, 0x41, 0x46, 0x41,
+    0x34, 0x46, 0x48, 0x46, 0x3d, 0x3c, 0x40, 0x43, 0x3d, 0x41, 0x37, 0x3e,
+    0x39, 0x47, 0x3f, 0x39, 0x46, 0x43, 0x3f, 0x41, 0x45, 0x37, 0x40, 0x3a,
+    0x3d, 0x44, 0x3f, 0x3b, 0x3b, 0x40, 0x4f, 0x3d, 0x3d, 0x41, 0x3c, 0x43,
+    0x3e, 0x46, 0x4e, 0x40, 0x3f, 0x34, 0x48, 0x29, 0x45, 0x44, 0x46, 0x41,
+    0x45, 0x32, 0x3e, 0x38, 0x39, 0x3a, 0x3e, 0x3e, 0x4c, 0x34, 0x3c, 0x40,
+    0x4a, 0x44, 0x3d, 0x46, 0x3b, 0x3e, 0x42, 0x42, 0x3a, 0x41, 0x43, 0x41,
+    0x39, 0x3f, 0x3e, 0x3c, 0x36, 0x48, 0x3f, 0x3e, 0x3e, 0x37, 0x3f, 0x3f,
+    0x3b, 0x40, 0x3e, 0x35, 0x32, 0x35, 0x3f, 0x33, 0x3f, 0x38, 0x43, 0x37,
+    0x49, 0x38, 0x37, 0x3c, 0x3c, 0x40, 0x40, 0x3a, 0x3a, 0x46, 0x37, 0x34,
+    0x34, 0x3b, 0x3d, 0x2f, 0x3a, 0x38, 0x3d, 0x46, 0x3d, 0x3b, 0x3d, 0x38,
+    0x35, 0x37, 0x44, 0x3c, 0x3d, 0x3e, 0x40, 0x3a, 0x40, 0x33, 0x3e, 0x38,
+    0x40, 0x3e, 0x45, 0x37, 0x3f, 0x3b, 0x3c, 0x40, 0x3b, 0x3c, 0x3b, 0x33,
+    0x41, 0x3f, 0x3b, 0x42, 0x31, 0x3b, 0x3a, 0x39, 0x3d, 0x41, 0x39, 0x40,
+    0x43, 0x45, 0x39, 0x3b, 0x3a, 0x42, 0x43, 0x3d, 0x3f, 0x40, 0x47, 0x39,
+    0x37, 0x3f, 0x47, 0x3f, 0x45, 0x41, 0x39, 0x3a, 0x41, 0x38, 0x3c, 0x3c,
+    0x39, 0x40, 0x39, 0x3b, 0x3b, 0x3e, 0x38, 0x3b, 0x37, 0x48, 0x41, 0x3f,
+    0x3e, 0x37, 0x3d, 0x44, 0x3c, 0x3e, 0x40, 0x39, 0x41, 0x42, 0x3d, 0x45,
+    0x3b, 0x3e, 0x4c, 0x3b, 0x3a, 0x3a, 0x3e, 0x47, 0x3c, 0x3f, 0x48, 0x3f,
+    0x46, 0x3f, 0x39, 0x25, 0x44, 0x3a, 0x3b, 0x40, 0x41, 0x39, 0x39, 0x47,
+    0x3b, 0x32, 0x49, 0x42, 0x41, 0x3a, 0x43, 0x41, 0x3e, 0x35, 0x37, 0x3d,
+    0x49, 0x40, 0x45, 0x3b, 0x3c, 0x38, 0x48, 0x3c, 0x3c, 0x35, 0x3f, 0x41,
+    0x41, 0x4c, 0x36, 0x39, 0x37, 0x3d, 0x3b, 0x3e, 0x44, 0x32, 0x3d, 0x3f,
+    0x3a, 0x3b, 0x3a, 0x47, 0x38, 0x42, 0x36, 0x34, 0x43, 0x3f, 0x3e, 0x40,
+    0x34, 0x31, 0x36, 0x33, 0x42, 0x37, 0x41, 0x41, 0x40, 0x3d, 0x3d, 0x37,
+    0x43, 0x3a, 0x3e, 0x44, 0x43, 0x3c, 0x35, 0x38, 0x38, 0x3c, 0x43, 0x36,
+    0x3a, 0x38, 0x40, 0x3f, 0x3d, 0x3e, 0x37, 0x3b, 0x41, 0x3a, 0x3b, 0x3d,
+    0x3c, 0x41, 0x3c, 0x41, 0x47, 0x3f, 0x3f, 0x3b, 0x3d, 0x3f, 0x3b, 0x45,
+    0x38, 0x38, 0x40, 0x38, 0x46, 0x42, 0x39, 0x3d, 0x3d, 0x3b, 0x42, 0x36,
+    0x42, 0x41, 0x3e, 0x3e, 0x36, 0x3f, 0x37, 0x3f, 0x36, 0x48, 0x3b, 0x39,
+    0x3d, 0x3f, 0x43, 0x3e, 0x3c, 0x40, 0x48, 0x46, 0x43, 0x36, 0x42, 0x39,
+    0x46, 0x3c, 0x37, 0x38, 0x49, 0x37, 0x36, 0x39, 0x3e, 0x42, 0x48, 0x3a,
+    0x3c, 0x3e, 0x42, 0x30, 0x3e, 0x34, 0x39, 0x3b, 0x46, 0x61, 0x46, 0x1e,
+    0x4c, 0x3b, 0x40, 0x2d, 0x3c, 0x42, 0x32, 0x30, 0x49, 0x3e, 0x39, 0x34,
+    0x30, 0x40, 0x31, 0x38, 0x40, 0x3d, 0x3c, 0x35, 0x3a, 0x36, 0x40, 0x3b,
+    0x41, 0x40, 0x3b, 0x39, 0x37, 0x37, 0x3f, 0x3b, 0x3c, 0x3a, 0x40, 0x3a,
+    0x36, 0x3c, 0x42, 0x39, 0x3e, 0x36, 0x40, 0x42, 0x39, 0x40, 0x3b, 0x34,
+    0x37, 0x33, 0x36, 0x3f, 0x43, 0x33, 0x33, 0x27, 0x3d, 0x46, 0x40, 0x31,
+    0x38, 0x3e, 0x41, 0x20, 0x3f, 0x39, 0x42, 0x35, 0x35, 0x45, 0x40, 0x1e,
+    0x32, 0x35, 0x32, 0x3c, 0x35, 0x44, 0x46, 0x29, 0x3a, 0x3d, 0x37, 0x42,
+    0x3b, 0x45, 0x3a, 0x26, 0x38, 0x40, 0x30, 0x37, 0x41, 0x40, 0x39, 0x2b,
+    0x49, 0x3f, 0x43, 0x43, 0x40, 0x3a, 0x38, 0x29, 0x43, 0x3a, 0x37, 0x40,
+    0x3f, 0x35, 0x3a, 0x28, 0x36, 0x3e, 0x3f, 0x43, 0x3c, 0x39, 0x42, 0x2c,
+    0x38, 0x42, 0x38, 0x3d, 0x42, 0x38, 0x35, 0x2d, 0x34, 0x38, 0x3d, 0x43,
+    0x46, 0x3e, 0x3c, 0x27, 0x3e, 0x40, 0x46, 0x39, 0x35, 0x3d, 0x42, 0x35,
+    0x42, 0x36, 0x40, 0x3e, 0x3a, 0x3e, 0x3c, 0x37, 0x3a, 0x3c, 0x48, 0x48,
+    0x48, 0x37, 0x3d, 0x38, 0x4b, 0x40, 0x43, 0x3b, 0x41, 0x46, 0x3c, 0x34,
+    0x46, 0x3c, 0x3c, 0x3c, 0x4b, 0x64, 0x4a, 0x22, 0x52, 0x41, 0x42, 0x3b,
+    0x42, 0x4a, 0x34, 0x37, 0x4b, 0x44, 0x3b, 0x4a, 0x38, 0x3f, 0x38, 0x3a,
+    0x40, 0x41, 0x42, 0x3c, 0x33, 0x3e, 0x3c, 0x42, 0x2c, 0x4e, 0x47, 0x3f,
+    0x38, 0x33, 0x39, 0x3f, 0x3b, 0x45, 0x37, 0x3a, 0x42, 0x42, 0x44, 0x3f,
+    0x3c, 0x3c, 0x3e, 0x3d, 0x3c, 0x3c, 0x40, 0x2c, 0x3c, 0x3d, 0x42, 0x39,
+    0x3a, 0x37, 0x43, 0x2a, 0x3d, 0x40, 0x41, 0x41, 0x46, 0x46, 0x42, 0x28,
+    0x39, 0x3c, 0x37, 0x44, 0x46, 0x41, 0x47, 0x2b, 0x44, 0x33, 0x39, 0x3f,
+    0x3f, 0x43, 0x3d, 0x23, 0x3a, 0x43, 0x41, 0x3b, 0x41, 0x42, 0x33, 0x1f,
+    0x43, 0x3e, 0x3d, 0x40, 0x37, 0x33, 0x42, 0x28, 0x3b, 0x38, 0x37, 0x3c,
+    0x34, 0x40, 0x44, 0x2a, 0x3c, 0x3a, 0x41, 0x37, 0x45, 0x3f, 0x3e, 0x26,
+    0x41, 0x40, 0x35, 0x3d, 0x45, 0x3e, 0x3d, 0x29, 0x3c, 0x39, 0x3f, 0x3c,
+    0x3d, 0x39, 0x38, 0x2d, 0x39, 0x38, 0x38, 0x44, 0x3c, 0x3e, 0x38, 0x26,
+    0x40, 0x36, 0x39, 0x38, 0x3f, 0x32, 0x39, 0x35, 0x3d, 0x3e, 0x35, 0x3a,
+    0x3f, 0x3f, 0x31, 0x35, 0x34, 0x45, 0x3e, 0x43, 0x48, 0x3b, 0x37, 0x39,
+    0x4d, 0x46, 0x54, 0x40, 0x41, 0x4e, 0x3d, 0x38, 0x4d, 0x38, 0x3a, 0x3b,
+    0x49, 0x5a, 0x4a, 0x1e, 0x5e, 0x39, 0x38, 0x37, 0x3a, 0x51, 0x3a, 0x3c,
+    0x50, 0x3f, 0x40, 0x42, 0x33, 0x3b, 0x2e, 0x4a, 0x3f, 0x4a, 0x3b, 0x43,
+    0x36, 0x3e, 0x3d, 0x42, 0x39, 0x46, 0x4b, 0x3c, 0x3b, 0x3b, 0x35, 0x3e,
+    0x3d, 0x4b, 0x3f, 0x41, 0x3f, 0x3b, 0x42, 0x42, 0x38, 0x3a, 0x41, 0x3d,
+    0x36, 0x41, 0x37, 0x2f, 0x38, 0x37, 0x3f, 0x34, 0x35, 0x35, 0x45, 0x30,
+    0x31, 0x42, 0x31, 0x3a, 0x3a, 0x3e, 0x3d, 0x23, 0x3f, 0x43, 0x3b, 0x41,
+    0x35, 0x3b, 0x40, 0x25, 0x45, 0x3e, 0x42, 0x3b, 0x31, 0x40, 0x36, 0x28,
+    0x43, 0x42, 0x30, 0x42, 0x32, 0x32, 0x36, 0x2c, 0x35, 0x3a, 0x3d, 0x3a,
+    0x3c, 0x36, 0x3e, 0x30, 0x41, 0x42, 0x38, 0x41, 0x41, 0x3e, 0x3c, 0x23,
+    0x37, 0x40, 0x3c, 0x3e, 0x3e, 0x3a, 0x37, 0x2b, 0x36, 0x40, 0x41, 0x42,
+    0x3e, 0x38, 0x44, 0x22, 0x46, 0x38, 0x33, 0x3b, 0x3a, 0x3a, 0x3a, 0x24,
+    0x36, 0x3b, 0x38, 0x44, 0x34, 0x38, 0x40, 0x28, 0x38, 0x3d, 0x36, 0x44,
+    0x31, 0x3e, 0x37, 0x37, 0x36, 0x3f, 0x47, 0x38, 0x3b, 0x3e, 0x2c, 0x4c,
+    0x36, 0x3c, 0x3b, 0x41, 0x4c, 0x3d, 0x3d, 0x40, 0x49, 0x44, 0x52, 0x3f,
+    0x3b, 0x4d, 0x3c, 0x3a, 0x4f, 0x3b, 0x36, 0x3b, 0x4a, 0x5f, 0x4e, 0x1f,
+    0x57, 0x3c, 0x3d, 0x3d, 0x46, 0x59, 0x42, 0x45, 0x52, 0x3d, 0x3a, 0x41,
+    0x31, 0x39, 0x39, 0x4f, 0x43, 0x4e, 0x3e, 0x37, 0x3a, 0x37, 0x33, 0x47,
+    0x32, 0x45, 0x47, 0x43, 0x31, 0x33, 0x38, 0x43, 0x3e, 0x47, 0x3d, 0x32,
+    0x3b, 0x39, 0x3c, 0x42, 0x3d, 0x47, 0x42, 0x40, 0x3d, 0x3f, 0x3c, 0x34,
+    0x3b, 0x3e, 0x42, 0x3d, 0x43, 0x35, 0x42, 0x2c, 0x35, 0x3d, 0x3c, 0x3d,
+    0x3a, 0x3c, 0x46, 0x25, 0x43, 0x35, 0x3d, 0x39, 0x3a, 0x3c, 0x40, 0x2b,
+    0x33, 0x40, 0x3d, 0x46, 0x45, 0x37, 0x3c, 0x36, 0x43, 0x37, 0x3e, 0x3a,
+    0x3c, 0x47, 0x3f, 0x38, 0x36, 0x3e, 0x3a, 0x42, 0x3c, 0x42, 0x33, 0x39,
+    0x3c, 0x3a, 0x3c, 0x40, 0x48, 0x3b, 0x40, 0x32, 0x37, 0x47, 0x34, 0x38,
+    0x33, 0x3d, 0x49, 0x2d, 0x36, 0x42, 0x3d, 0x3e, 0x47, 0x3c, 0x42, 0x2c,
+    0x3b, 0x31, 0x3f, 0x3c, 0x3d, 0x3c, 0x3f, 0x2b, 0x41, 0x35, 0x33, 0x43,
+    0x47, 0x39, 0x34, 0x2a, 0x3a, 0x3a, 0x40, 0x3d, 0x44, 0x3c, 0x39, 0x34,
+    0x43, 0x40, 0x33, 0x3a, 0x3b, 0x42, 0x38, 0x3b, 0x34, 0x35, 0x40, 0x43,
+    0x4b, 0x41, 0x3d, 0x38, 0x49, 0x44, 0x4d, 0x37, 0x3a, 0x4b, 0x40, 0x39,
+    0x4e, 0x3b, 0x30, 0x38, 0x47, 0x5d, 0x50, 0x1f, 0x54, 0x35, 0x3a, 0x39,
+    0x40, 0x4c, 0x46, 0x42, 0x52, 0x39, 0x39, 0x45, 0x41, 0x3c, 0x30, 0x5b,
+    0x43, 0x4d, 0x4a, 0x3e, 0x31, 0x39, 0x41, 0x4c, 0x36, 0x44, 0x4c, 0x39,
+    0x32, 0x41, 0x47, 0x3e, 0x34, 0x49, 0x45, 0x3b, 0x34, 0x3a, 0x3b, 0x47,
+    0x43, 0x3e, 0x43, 0x32, 0x40, 0x3e, 0x3e, 0x38, 0x37, 0x3e, 0x37, 0x3a,
+    0x3a, 0x40, 0x48, 0x2f, 0x3e, 0x3e, 0x46, 0x3a, 0x3e, 0x35, 0x49, 0x30,
+    0x3a, 0x41, 0x3e, 0x39, 0x34, 0x45, 0x3d, 0x34, 0x48, 0x43, 0x43, 0x42,
+    0x33, 0x39, 0x3b, 0x3f, 0x30, 0x46, 0x41, 0x39, 0x48, 0x3a, 0x3c, 0x3e,
+    0x3f, 0x36, 0x40, 0x3d, 0x43, 0x40, 0x3e, 0x39, 0x44, 0x40, 0x44, 0x3b,
+    0x43, 0x42, 0x39, 0x38, 0x3a, 0x3f, 0x3b, 0x3f, 0x38, 0x3d, 0x34, 0x30,
+    0x34, 0x3d, 0x3f, 0x42, 0x44, 0x3e, 0x34, 0x32, 0x37, 0x46, 0x44, 0x38,
+    0x3c, 0x45, 0x39, 0x2b, 0x41, 0x3c, 0x40, 0x40, 0x3a, 0x3a, 0x3c, 0x32,
+    0x45, 0x42, 0x3d, 0x46, 0x38, 0x3b, 0x34, 0x35, 0x38, 0x43, 0x3d, 0x34,
+    0x42, 0x3b, 0x38, 0x3d, 0x37, 0x43, 0x3f, 0x39, 0x4e, 0x39, 0x40, 0x3f,
+    0x4d, 0x43, 0x49, 0x3f, 0x36, 0x41, 0x44, 0x39, 0x48, 0x3a, 0x35, 0x39,
+    0x48, 0x59, 0x4e, 0x25, 0x58, 0x39, 0x42, 0x35, 0x43, 0x4e, 0x42, 0x3f,
+    0x4a, 0x43, 0x3b, 0x3f, 0x3b, 0x37, 0x2b, 0x5a, 0x3d, 0x44, 0x3b, 0x40,
+    0x31, 0x38, 0x37, 0x44, 0x32, 0x3e, 0x41, 0x3d, 0x2c, 0x42, 0x42, 0x3c,
+    0x37, 0x45, 0x41, 0x41, 0x3d, 0x39, 0x41, 0x40, 0x3a, 0x46, 0x41, 0x40,
+    0x40, 0x3d, 0x38, 0x31, 0x37, 0x3f, 0x42, 0x38, 0x3f, 0x3c, 0x48, 0x30,
+    0x3e, 0x39, 0x3f, 0x3d, 0x3d, 0x44, 0x52, 0x35, 0x3b, 0x32, 0x42, 0x32,
+    0x3a, 0x43, 0x39, 0x3b, 0x31, 0x43, 0x36, 0x3c, 0x3c, 0x3c, 0x41, 0x45,
+    0x42, 0x49, 0x41, 0x3b, 0x42, 0x3e, 0x41, 0x44, 0x36, 0x41, 0x3f, 0x3c,
+    0x3e, 0x47, 0x45, 0x41, 0x38, 0x41, 0x3f, 0x43, 0x35, 0x32, 0x41, 0x39,
+    0x36, 0x47, 0x35, 0x42, 0x44, 0x3b, 0x3f, 0x34, 0x48, 0x41, 0x43, 0x42,
+    0x36, 0x3e, 0x3c, 0x3d, 0x3d, 0x3b, 0x42, 0x44, 0x3a, 0x44, 0x36, 0x2a,
+    0x41, 0x39, 0x3a, 0x41, 0x46, 0x3c, 0x44, 0x2f, 0x36, 0x39, 0x3b, 0x3f,
+    0x38, 0x45, 0x3c, 0x3c, 0x3e, 0x41, 0x3c, 0x39, 0x3e, 0x40, 0x2f, 0x45,
+    0x3b, 0x41, 0x40, 0x3c, 0x4e, 0x38, 0x3e, 0x48, 0x46, 0x40, 0x48, 0x44,
+    0x40, 0x4a, 0x45, 0x3c, 0x4f, 0x39, 0x37, 0x3a, 0x4e, 0x59, 0x5c, 0x22,
+    0x58, 0x32, 0x38, 0x34, 0x40, 0x4b, 0x43, 0x43, 0x4f, 0x3e, 0x39, 0x40,
+    0x37, 0x3e, 0x2f, 0x55, 0x3f, 0x40, 0x38, 0x3f, 0x3a, 0x33, 0x37, 0x3d,
+    0x34, 0x4c, 0x37, 0x3f, 0x32, 0x39, 0x45, 0x34, 0x44, 0x4c, 0x3f, 0x3b,
+    0x3c, 0x36, 0x36, 0x43, 0x36, 0x47, 0x41, 0x46, 0x41, 0x3e, 0x41, 0x3a,
+    0x43, 0x3a, 0x48, 0x42, 0x42, 0x3e, 0x4c, 0x36, 0x3d, 0x39, 0x43, 0x46,
+    0x3d, 0x42, 0x42, 0x3b, 0x45, 0x43, 0x3c, 0x40, 0x39, 0x37, 0x34, 0x45,
+    0x3f, 0x40, 0x34, 0x38, 0x43, 0x3f, 0x36, 0x47, 0x3f, 0x3b, 0x49, 0x3c,
+    0x3a, 0x3a, 0x42, 0x4c, 0x37, 0x3e, 0x3b, 0x32, 0x47, 0x40, 0x45, 0x4d,
+    0x39, 0x3b, 0x39, 0x40, 0x3e, 0x3c, 0x3d, 0x3a, 0x3d, 0x3b, 0x3e, 0x43,
+    0x3e, 0x3f, 0x3a, 0x3c, 0x41, 0x40, 0x39, 0x3c, 0x3a, 0x38, 0x39, 0x37,
+    0x36, 0x33, 0x43, 0x45, 0x3f, 0x45, 0x41, 0x30, 0x3b, 0x34, 0x3c, 0x39,
+    0x3b, 0x45, 0x37, 0x2e, 0x36, 0x34, 0x36, 0x44, 0x3d, 0x40, 0x3a, 0x3c,
+    0x3d, 0x3b, 0x38, 0x41, 0x42, 0x3a, 0x32, 0x4b, 0x38, 0x3e, 0x41, 0x46,
+    0x57, 0x3a, 0x44, 0x48, 0x47, 0x45, 0x47, 0x3e, 0x43, 0x42, 0x45, 0x3b,
+    0x50, 0x39, 0x37, 0x3f, 0x47, 0x51, 0x5e, 0x22, 0x59, 0x33, 0x3c, 0x37,
+    0x43, 0x50, 0x49, 0x47, 0x46, 0x42, 0x39, 0x44, 0x44, 0x3d, 0x2f, 0x53,
+    0x35, 0x41, 0x40, 0x3d, 0x2d, 0x35, 0x2f, 0x3e, 0x3f, 0x37, 0x38, 0x3e,
+    0x30, 0x45, 0x46, 0x38, 0x33, 0x3c, 0x3e, 0x3b, 0x44, 0x42, 0x47, 0x49,
+    0x43, 0x40, 0x3d, 0x3c, 0x38, 0x43, 0x3e, 0x38, 0x3d, 0x40, 0x36, 0x43,
+    0x43, 0x3e, 0x40, 0x3c, 0x44, 0x47, 0x43, 0x3d, 0x41, 0x39, 0x3e, 0x45,
+    0x39, 0x3d, 0x39, 0x40, 0x42, 0x40, 0x3b, 0x4a, 0x40, 0x41, 0x3f, 0x37,
+    0x43, 0x41, 0x37, 0x4c, 0x3f, 0x3d, 0x38, 0x3a, 0x42, 0x46, 0x43, 0x4d,
+    0x3c, 0x3a, 0x43, 0x3e, 0x3b, 0x3d, 0x46, 0x4a, 0x38, 0x3d, 0x3d, 0x39,
+    0x3e, 0x3c, 0x3b, 0x3e, 0x3a, 0x40, 0x40, 0x34, 0x41, 0x3f, 0x3e, 0x3f,
+    0x47, 0x3c, 0x32, 0x3a, 0x3c, 0x44, 0x3f, 0x42, 0x41, 0x43, 0x3e, 0x3a,
+    0x3b, 0x42, 0x41, 0x39, 0x39, 0x37, 0x39, 0x3e, 0x3d, 0x33, 0x3e, 0x35,
+    0x44, 0x37, 0x40, 0x35, 0x3f, 0x47, 0x37, 0x41, 0x35, 0x38, 0x47, 0x40,
+    0x43, 0x44, 0x2e, 0x48, 0x35, 0x44, 0x41, 0x3c, 0x47, 0x3d, 0x3d, 0x52,
+    0x48, 0x41, 0x44, 0x41, 0x42, 0x4b, 0x3e, 0x3d, 0x4e, 0x32, 0x34, 0x47,
+    0x55, 0x57, 0x5f, 0x22, 0x57, 0x33, 0x40, 0x37, 0x40, 0x4a, 0x4d, 0x47,
+    0x48, 0x38, 0x3e, 0x46, 0x37, 0x42, 0x28, 0x57, 0x38, 0x42, 0x36, 0x43,
+    0x35, 0x37, 0x39, 0x39, 0x42, 0x39, 0x38, 0x3c, 0x35, 0x3c, 0x3c, 0x3a,
+    0x3c, 0x4c, 0x45, 0x3f, 0x43, 0x3d, 0x45, 0x45, 0x40, 0x47, 0x3e, 0x3e,
+    0x3d, 0x4b, 0x49, 0x35, 0x43, 0x3c, 0x36, 0x46, 0x3c, 0x46, 0x42, 0x44,
+    0x3c, 0x42, 0x3d, 0x42, 0x44, 0x3c, 0x4a, 0x40, 0x40, 0x3c, 0x3b, 0x3c,
+    0x35, 0x34, 0x2e, 0x46, 0x38, 0x3d, 0x38, 0x44, 0x41, 0x40, 0x3c, 0x52,
+    0x3b, 0x3d, 0x3b, 0x3f, 0x42, 0x47, 0x44, 0x52, 0x44, 0x44, 0x39, 0x3f,
+    0x43, 0x35, 0x3c, 0x4d, 0x39, 0x3d, 0x3b, 0x37, 0x3e, 0x38, 0x3e, 0x49,
+    0x3a, 0x37, 0x3c, 0x49, 0x40, 0x41, 0x3c, 0x40, 0x3d, 0x38, 0x39, 0x3f,
+    0x44, 0x3e, 0x42, 0x3e, 0x47, 0x40, 0x34, 0x46, 0x48, 0x37, 0x45, 0x3e,
+    0x46, 0x3f, 0x35, 0x39, 0x38, 0x3f, 0x36, 0x2c, 0x40, 0x38, 0x3e, 0x3c,
+    0x32, 0x3c, 0x46, 0x3a, 0x3f, 0x41, 0x36, 0x49, 0x42, 0x38, 0x36, 0x43,
+    0x3d, 0x41, 0x46, 0x35, 0x4f, 0x3a, 0x41, 0x5c, 0x4a, 0x42, 0x4e, 0x42,
+    0x46, 0x54, 0x3f, 0x45, 0x4c, 0x30, 0x33, 0x44, 0x56, 0x5d, 0x68, 0x26,
+    0x60, 0x33, 0x3e, 0x3a, 0x42, 0x49, 0x52, 0x47, 0x51, 0x46, 0x40, 0x47,
+    0x41, 0x3b, 0x1b, 0x4f, 0x3c, 0x45, 0x3d, 0x3d, 0x32, 0x2f, 0x3e, 0x3c,
+    0x3c, 0x3f, 0x3b, 0x3c, 0x2c, 0x3a, 0x41, 0x3c, 0x35, 0x3e, 0x3e, 0x3c,
+    0x3d, 0x3f, 0x3e, 0x40, 0x40, 0x44, 0x42, 0x3c, 0x3c, 0x3c, 0x41, 0x3c,
+    0x3c, 0x3d, 0x3e, 0x3d, 0x3c, 0x3d, 0x4a, 0x46, 0x3f, 0x35, 0x33, 0x43,
+    0x42, 0x41, 0x4d, 0x48, 0x48, 0x44, 0x3e, 0x41, 0x41, 0x36, 0x3c, 0x4c,
+    0x34, 0x47, 0x42, 0x39, 0x3e, 0x43, 0x3a, 0x53, 0x3b, 0x3b, 0x42, 0x3d,
+    0x41, 0x3c, 0x3e, 0x52, 0x3a, 0x44, 0x34, 0x43, 0x3d, 0x3d, 0x3a, 0x50,
+    0x3e, 0x33, 0x41, 0x40, 0x3f, 0x38, 0x43, 0x42, 0x3b, 0x37, 0x3e, 0x43,
+    0x3f, 0x3c, 0x41, 0x49, 0x40, 0x32, 0x40, 0x3e, 0x3b, 0x3e, 0x44, 0x3c,
+    0x35, 0x37, 0x3d, 0x41, 0x34, 0x3f, 0x3a, 0x3c, 0x47, 0x32, 0x41, 0x3d,
+    0x3c, 0x3a, 0x4a, 0x31, 0x43, 0x38, 0x45, 0x37, 0x49, 0x3c, 0x34, 0x3f,
+    0x3d, 0x3d, 0x3d, 0x45, 0x47, 0x3e, 0x37, 0x48, 0x40, 0x3b, 0x45, 0x3d,
+    0x4e, 0x42, 0x3f, 0x57, 0x4b, 0x43, 0x4b, 0x3d, 0x3f, 0x47, 0x4a, 0x43,
+    0x4e, 0x30, 0x38, 0x45, 0x59, 0x60, 0x64, 0x2d, 0x5a, 0x2d, 0x34, 0x35,
+    0x47, 0x54, 0x4e, 0x3f, 0x44, 0x45, 0x3c, 0x43, 0x3d, 0x40, 0x1c, 0x5a,
+    0x36, 0x3f, 0x3a, 0x39, 0x37, 0x3c, 0x32, 0x3b, 0x2d, 0x4a, 0x42, 0x35,
+    0x30, 0x41, 0x43, 0x3d, 0x3d, 0x45, 0x38, 0x36, 0x3e, 0x40, 0x3a, 0x4a,
+    0x34, 0x3d, 0x44, 0x3c, 0x39, 0x3b, 0x52, 0x38, 0x40, 0x3b, 0x3f, 0x3f,
+    0x35, 0x37, 0x46, 0x48, 0x38, 0x3b, 0x40, 0x36, 0x3d, 0x3a, 0x4f, 0x45,
+    0x35, 0x3a, 0x35, 0x33, 0x37, 0x43, 0x42, 0x52, 0x37, 0x3b, 0x3d, 0x42,
+    0x44, 0x3d, 0x48, 0x58, 0x33, 0x3f, 0x41, 0x44, 0x44, 0x3f, 0x3b, 0x52,
+    0x47, 0x39, 0x32, 0x3b, 0x38, 0x35, 0x48, 0x50, 0x34, 0x30, 0x39, 0x43,
+    0x42, 0x40, 0x3b, 0x4b, 0x43, 0x3d, 0x34, 0x44, 0x33, 0x39, 0x44, 0x4b,
+    0x45, 0x3e, 0x3c, 0x3f, 0x3a, 0x3e, 0x3c, 0x45, 0x36, 0x3e, 0x3d, 0x40,
+    0x43, 0x46, 0x37, 0x3d, 0x3b, 0x42, 0x43, 0x3f, 0x3a, 0x41, 0x48, 0x2f,
+    0x3e, 0x39, 0x3a, 0x39, 0x3f, 0x3a, 0x41, 0x40, 0x40, 0x3c, 0x3b, 0x3b,
+    0x3f, 0x40, 0x3e, 0x42, 0x38, 0x3f, 0x38, 0x3c, 0x49, 0x45, 0x3f, 0x62,
+    0x55, 0x47, 0x4c, 0x3c, 0x3c, 0x4a, 0x4c, 0x46, 0x4f, 0x39, 0x3a, 0x3b,
+    0x5e, 0x58, 0x6f, 0x2b, 0x5a, 0x2f, 0x3a, 0x35, 0x4b, 0x47, 0x4a, 0x46,
+    0x45, 0x3e, 0x38, 0x4f, 0x3b, 0x3d, 0x21, 0x4b, 0x3d, 0x40, 0x37, 0x40,
+    0x2d, 0x2c, 0x43, 0x3f, 0x2b, 0x3e, 0x3d, 0x39, 0x2f, 0x39, 0x44, 0x3c,
+    0x39, 0x39, 0x43, 0x3b, 0x3d, 0x3b, 0x44, 0x39, 0x42, 0x42, 0x3e, 0x40,
+    0x3b, 0x42, 0x53, 0x40, 0x32, 0x3d, 0x35, 0x3f, 0x3d, 0x45, 0x48, 0x46,
+    0x3d, 0x43, 0x3c, 0x36, 0x35, 0x39, 0x3d, 0x4a, 0x39, 0x39, 0x3e, 0x41,
+    0x38, 0x36, 0x3b, 0x53, 0x3c, 0x36, 0x32, 0x3b, 0x43, 0x3d, 0x42, 0x57,
+    0x35, 0x2f, 0x38, 0x40, 0x2f, 0x3d, 0x3c, 0x4c, 0x40, 0x2f, 0x3a, 0x36,
+    0x39, 0x3c, 0x3a, 0x51, 0x3d, 0x37, 0x39, 0x3c, 0x42, 0x40, 0x43, 0x52,
+    0x3e, 0x42, 0x3e, 0x45, 0x36, 0x34, 0x42, 0x4b, 0x3a, 0x38, 0x37, 0x3f,
+    0x36, 0x41, 0x3a, 0x45, 0x3e, 0x38, 0x35, 0x41, 0x35, 0x34, 0x37, 0x3c,
+    0x3f, 0x31, 0x3c, 0x35, 0x33, 0x43, 0x36, 0x28, 0x44, 0x42, 0x3e, 0x42,
+    0x3a, 0x41, 0x43, 0x35, 0x3d, 0x3f, 0x40, 0x3e, 0x3d, 0x33, 0x31, 0x41,
+    0x3d, 0x40, 0x3b, 0x40, 0x51, 0x40, 0x3f, 0xfb, 0x51, 0x49, 0x4c, 0x3d,
+    0x44, 0x4e, 0x47, 0x42, 0x50, 0x39, 0x39, 0x40, 0x59, 0x5d, 0x70, 0x2c,
+    0x59, 0x39, 0x38, 0x2f, 0x46, 0x50, 0x51, 0x47, 0x4c, 0x3c, 0x39, 0x48,
+    0x44, 0x3a, 0x1a, 0x51, 0x35, 0x3e, 0x34, 0x3a, 0x3d, 0x2b, 0x41, 0x39,
+    0x37, 0x4d, 0x3e, 0x43, 0x38, 0x3b, 0x3a, 0x35, 0x36, 0x3a, 0x43, 0x39,
+    0x39, 0x3a, 0x46, 0x3b, 0x39, 0x3c, 0x46, 0x36, 0x3e, 0x3d, 0x4b, 0x3d,
+    0x3b, 0x46, 0x3a, 0x41, 0x31, 0x3c, 0x44, 0x4a, 0x37, 0x42, 0x39, 0x43,
+    0x43, 0x3e, 0x40, 0x47, 0x3c, 0x3e, 0x3b, 0x43, 0x34, 0x3a, 0x43, 0x53,
+    0x3f, 0x37, 0x39, 0x37, 0x3e, 0x3b, 0x46, 0x59, 0x37, 0x37, 0x33, 0x3d,
+    0x38, 0x42, 0x36, 0x58, 0x2e, 0x32, 0x2b, 0x45, 0x32, 0x33, 0x36, 0x50,
+    0x41, 0x3f, 0x37, 0x3d, 0x3f, 0x3d, 0x46, 0x49, 0x41, 0x38, 0x33, 0x3d,
+    0x33, 0x32, 0x3a, 0x49, 0x41, 0x41, 0x3d, 0x33, 0x3b, 0x3b, 0x3a, 0x46,
+    0x34, 0x44, 0x3f, 0x3b, 0x2f, 0x3f, 0x32, 0x3c, 0x3f, 0x43, 0x3e, 0x45,
+    0x3a, 0x3c, 0x43, 0x26, 0x46, 0x37, 0x38, 0x3e, 0x36, 0x31, 0x3e, 0x34,
+    0x39, 0x3a, 0x38, 0x42, 0x38, 0x3e, 0x32, 0x42, 0x37, 0x37, 0x3c, 0x3a,
+    0x48, 0x44, 0x3a, 0x68, 0x56, 0x46, 0x4d, 0x47, 0x40, 0x4e, 0x42, 0x46,
+    0x51, 0x40, 0x38, 0x43, 0x58, 0x5d, 0x6a, 0x31, 0x57, 0x32, 0x3c, 0x36,
+    0x49, 0x56, 0x52, 0x48, 0x4b, 0x41, 0x2f, 0x4d, 0x31, 0x43, 0x1b, 0x4c,
+    0x30, 0x44, 0x33, 0x36, 0x2c, 0x3d, 0x45, 0x3a, 0x35, 0x46, 0x3d, 0x39,
+    0x2e, 0x38, 0x3f, 0x37, 0x41, 0x44, 0x46, 0x31, 0x33, 0x46, 0x37, 0x37,
+    0x3f, 0x41, 0x45, 0x30, 0x46, 0x3b, 0x50, 0x3b, 0x40, 0x39, 0x42, 0x43,
+    0x35, 0x37, 0x40, 0x44, 0x3b, 0x41, 0x3d, 0x37, 0x3a, 0x41, 0x3d, 0x46,
+    0x36, 0x41, 0x38, 0x41, 0x38, 0x3d, 0x45, 0x58, 0x3d, 0x3a, 0x3d, 0x44,
+    0x45, 0x38, 0x48, 0x5c, 0x3d, 0x39, 0x43, 0x45, 0x41, 0x3e, 0x4a, 0x56,
+    0x40, 0x33, 0x30, 0x31, 0x42, 0x39, 0x38, 0x56, 0x30, 0x3a, 0x35, 0x3e,
+    0x3f, 0x38, 0x36, 0x47, 0x3c, 0x3a, 0x3d, 0x3f, 0x37, 0x35, 0x3b, 0x4d,
+    0x43, 0x36, 0x39, 0x37, 0x3e, 0x42, 0x3d, 0x3f, 0x40, 0x3f, 0x34, 0x3b,
+    0x3f, 0x3e, 0x3b, 0x39, 0x3b, 0x3a, 0x3a, 0x3c, 0x34, 0x3f, 0x3c, 0x2a,
+    0x49, 0x3b, 0x36, 0x3c, 0x35, 0x46, 0x38, 0x3b, 0x3c, 0x39, 0x38, 0x42,
+    0x39, 0x36, 0x2e, 0x4a, 0x3d, 0x39, 0x3f, 0x3f, 0x4b, 0x45, 0x3e, 0x67,
+    0x4b, 0x4b, 0x49, 0x3e, 0x3f, 0x53, 0x4c, 0x55, 0x47, 0x32, 0x3b, 0x39,
+    0x54, 0x5b, 0x6f, 0x29, 0x5a, 0x34, 0x3e, 0x26, 0x45, 0x52, 0x59, 0x44,
+    0x59, 0x39, 0x3c, 0x47, 0x36, 0x46, 0x16, 0x50, 0x32, 0x46, 0x34, 0x35,
+    0x35, 0x2d, 0x39, 0x38, 0x2c, 0x42, 0x43, 0x3b, 0x32, 0x3f, 0x37, 0x2f,
+    0x34, 0x43, 0x46, 0x3b, 0x3b, 0x41, 0x3c, 0x37, 0x3e, 0x43, 0x4b, 0x36,
+    0x3e, 0x3c, 0x4c, 0x42, 0x40, 0x3f, 0x49, 0x40, 0x3c, 0x40, 0x3c, 0x48,
+    0x35, 0x42, 0x3f, 0x42, 0x44, 0x40, 0x45, 0x4f, 0x3f, 0x3f, 0x40, 0x42,
+    0x3b, 0x3d, 0x49, 0x55, 0x42, 0x39, 0x41, 0x3b, 0x3f, 0x38, 0x44, 0x60,
+    0x34, 0x40, 0x3b, 0x3b, 0x35, 0x3d, 0x41, 0x4e, 0x35, 0x33, 0x30, 0x3a,
+    0x3a, 0x32, 0x42, 0x4f, 0x33, 0x34, 0x2f, 0x38, 0x49, 0x38, 0x40, 0x4c,
+    0x35, 0x38, 0x3e, 0x46, 0x3f, 0x3a, 0x3a, 0x45, 0x3b, 0x34, 0x2e, 0x39,
+    0x32, 0x3e, 0x40, 0x48, 0x35, 0x44, 0x3a, 0x34, 0x3f, 0x35, 0x3b, 0x32,
+    0x40, 0x43, 0x3e, 0x38, 0x3b, 0x43, 0x3c, 0x2b, 0x46, 0x43, 0x40, 0x32,
+    0x42, 0x3b, 0x49, 0x2e, 0x3b, 0x3a, 0x3e, 0x41, 0x3c, 0x3f, 0x31, 0x3b,
+    0x41, 0x33, 0x41, 0x3c, 0x4d, 0x40, 0x38, 0x68, 0x4c, 0x4c, 0x4e, 0x3f,
+    0x3f, 0x54, 0x4a, 0x3d, 0x4c, 0x33, 0x3b, 0x3a, 0x5d, 0x60, 0x71, 0x2b,
+    0x59, 0x33, 0x3c, 0x2c, 0x47, 0x52, 0x4f, 0x51, 0x56, 0x3d, 0x39, 0x44,
+    0x35, 0x41, 0x1b, 0x4a, 0x35, 0x41, 0x37, 0x35, 0x2c, 0x35, 0x37, 0x35,
+    0x38, 0x41, 0x38, 0x3e, 0x3c, 0x40, 0x3c, 0x2f, 0x38, 0x3e, 0x3f, 0x45,
+    0x40, 0x3d, 0x3c, 0x35, 0x3c, 0x46, 0x43, 0x39, 0x37, 0x42, 0x4e, 0x3c,
+    0x42, 0x46, 0x37, 0x33, 0x43, 0x3f, 0x47, 0x4a, 0x3d, 0x3e, 0x40, 0x40,
+    0x40, 0x3f, 0x4b, 0x54, 0x36, 0x3f, 0x37, 0x40, 0x39, 0x39, 0x47, 0x51,
+    0x3d, 0x39, 0x36, 0x36, 0x40, 0x40, 0x41, 0x5a, 0x38, 0x39, 0x42, 0x38,
+    0x40, 0x39, 0x43, 0x50, 0x3a, 0x3a, 0x32, 0x3c, 0x3c, 0x35, 0x44, 0x4a,
+    0x37, 0x35, 0x36, 0x3c, 0x35, 0x30, 0x48, 0x4b, 0x3c, 0x33, 0x37, 0x3e,
+    0x42, 0x3c, 0x42, 0x4e, 0x41, 0x32, 0x3e, 0x33, 0x49, 0x39, 0x3e, 0x42,
+    0x3d, 0x39, 0x37, 0x36, 0x35, 0x41, 0x3e, 0x37, 0x37, 0x3e, 0x3d, 0x38,
+    0x3a, 0x3c, 0x41, 0x29, 0x3c, 0x3b, 0x39, 0x40, 0x43, 0x3d, 0x3e, 0x33,
+    0x3f, 0x3f, 0x3e, 0x43, 0x43, 0x38, 0x38, 0x41, 0x3b, 0x38, 0x35, 0x3a,
+    0x4b, 0x44, 0x44, 0x55, 0x4e, 0x44, 0x4d, 0x49, 0x3e, 0x53, 0x45, 0x3f,
+    0x45, 0x3d, 0x36, 0x36, 0x4f, 0x5b, 0x6b, 0x28, 0x59, 0x34, 0x39, 0x34,
+    0x4f, 0x4d, 0x52, 0x3e, 0x51, 0x34, 0x35, 0x4a, 0x3b, 0x3f, 0x21, 0x45,
+    0x36, 0x3f, 0x38, 0x33, 0x2c, 0x37, 0x32, 0x2f, 0x2b, 0x44, 0x47, 0x3f,
+    0x38, 0x3a, 0x3f, 0x2e, 0x41, 0x3f, 0x3d, 0x41, 0x35, 0x48, 0x43, 0x40,
+    0x33, 0x44, 0x40, 0x38, 0x47, 0x44, 0x4c, 0x3d, 0x41, 0x3b, 0x39, 0x36,
+    0x3e, 0x44, 0x49, 0x48, 0x3c, 0x3b, 0x34, 0x34, 0x3f, 0x3c, 0x42, 0x52,
+    0x43, 0x41, 0x3c, 0x3c, 0x3d, 0x43, 0x48, 0x54, 0x39, 0x35, 0x39, 0x3c,
+    0x43, 0x3c, 0x44, 0x5f, 0x39, 0x3d, 0x38, 0x3f, 0x36, 0x3d, 0x43, 0x58,
+    0x33, 0x3d, 0x43, 0x33, 0x3f, 0x36, 0x39, 0x54, 0x3a, 0x37, 0x2d, 0x46,
+    0x43, 0x41, 0x47, 0x46, 0x3e, 0x42, 0x34, 0x49, 0x3a, 0x3f, 0x38, 0x50,
+    0x3a, 0x3b, 0x42, 0x3a, 0x3e, 0x3c, 0x3b, 0x40, 0x42, 0x45, 0x37, 0x3b,
+    0x2f, 0x3b, 0x46, 0x30, 0x42, 0x3b, 0x3b, 0x44, 0x3b, 0x3e, 0x40, 0x1e,
+    0x33, 0x40, 0x40, 0x3d, 0x39, 0x3a, 0x41, 0x33, 0x45, 0x3e, 0x3c, 0x3f,
+    0x3f, 0x38, 0x31, 0x46, 0x3b, 0x35, 0x42, 0x39, 0x49, 0x3e, 0x3d, 0x66,
+    0x53, 0x3f, 0x44, 0x40, 0x43, 0x45, 0x48, 0x45, 0x49, 0x2d, 0x3e, 0x3a,
+    0x4f, 0x5a, 0x62, 0x27, 0x54, 0x37, 0x35, 0x34, 0x42, 0x50, 0x54, 0x43,
+    0x4d, 0x38, 0x39, 0x48, 0x38, 0x4c, 0x21, 0x3f, 0x40, 0x3a, 0x3a, 0x2f,
+    0x37, 0x2f, 0x29, 0x2c, 0x36, 0x47, 0x3f, 0x41, 0x31, 0x33, 0x3e, 0x32,
+    0x3e, 0x40, 0x42, 0x40, 0x42, 0x3a, 0x46, 0x33, 0x44, 0x40, 0x3c, 0x43,
+    0x3d, 0x41, 0x4d, 0x3d, 0x3c, 0x47, 0x46, 0x43, 0x42, 0x3e, 0x44, 0x4e,
+    0x41, 0x3a, 0x44, 0x38, 0x45, 0x3b, 0x49, 0x4c, 0x40, 0x3f, 0x37, 0x3e,
+    0x3e, 0x46, 0x41, 0x51, 0x3f, 0x39, 0x30, 0x40, 0x3e, 0x38, 0x43, 0x5b,
+    0x33, 0x3e, 0x31, 0x42, 0x3d, 0x2f, 0x49, 0x57, 0x37, 0x31, 0x46, 0x44,
+    0x3e, 0x35, 0x40, 0x55, 0x36, 0x35, 0x3d, 0x3c, 0x38, 0x33, 0x42, 0x52,
+    0x3b, 0x39, 0x34, 0x31, 0x45, 0x34, 0x3c, 0x51, 0x33, 0x39, 0x3c, 0x40,
+    0x36, 0x36, 0x42, 0x3e, 0x37, 0x3e, 0x3b, 0x40, 0x3d, 0x36, 0x41, 0x30,
+    0x42, 0x45, 0x40, 0x49, 0x3d, 0x32, 0x46, 0x26, 0x40, 0x44, 0x3a, 0x3f,
+    0x3d, 0x46, 0x45, 0x31, 0x33, 0x34, 0x3e, 0x37, 0x46, 0x3b, 0x32, 0x3a,
+    0x3d, 0x31, 0x3c, 0x36, 0x50, 0x41, 0x3b, 0x5d, 0x53, 0x42, 0x44, 0x37,
+    0x3e, 0x4d, 0x41, 0x4b, 0x49, 0x2f, 0x35, 0x3a, 0x4e, 0x59, 0x5d, 0x27,
+    0x5c, 0x30, 0x3d, 0x3a, 0x46, 0x50, 0x57, 0x4a, 0x4c, 0x36, 0x37, 0x46,
+    0x48, 0x41, 0x24, 0x49, 0x36, 0x3e, 0x41, 0x45, 0x37, 0x38, 0x2e, 0x2e,
+    0x34, 0x3c, 0x38, 0x41, 0x36, 0x3d, 0x43, 0x36, 0x3e, 0x3e, 0x41, 0x3b,
+    0x42, 0x3c, 0x43, 0x38, 0x3e, 0x3d, 0x41, 0x48, 0x47, 0x4c, 0x45, 0x3b,
+    0x37, 0x41, 0x38, 0x41, 0x3d, 0x41, 0x46, 0x4e, 0x36, 0x45, 0x38, 0x39,
+    0x42, 0x42, 0x37, 0x4c, 0x34, 0x46, 0x3c, 0x44, 0x4a, 0x39, 0x45, 0x53,
+    0x3c, 0x3f, 0x41, 0x35, 0x3c, 0x45, 0x4c, 0x5a, 0x44, 0x41, 0x30, 0x35,
+    0x40, 0x39, 0x42, 0x5a, 0x36, 0x36, 0x3a, 0x3b, 0x43, 0x35, 0x3c, 0x56,
+    0x35, 0x38, 0x2b, 0x4a, 0x3c, 0x40, 0x45, 0x54, 0x37, 0x37, 0x3a, 0x44,
+    0x42, 0x3b, 0x3d, 0x4a, 0x3f, 0x37, 0x3b, 0x35, 0x34, 0x3f, 0x40, 0x48,
+    0x45, 0x3e, 0x37, 0x38, 0x41, 0x41, 0x3d, 0x37, 0x43, 0x3d, 0x3d, 0x45,
+    0x3a, 0x38, 0x3f, 0x23, 0x4a, 0x37, 0x42, 0x3c, 0x3f, 0x43, 0x42, 0x33,
+    0x37, 0x39, 0x35, 0x3b, 0x41, 0x36, 0x2f, 0x3b, 0x41, 0x3a, 0x44, 0x3d,
+    0x3e, 0x45, 0x44, 0x50, 0x47, 0x47, 0x48, 0x3c, 0x3f, 0x45, 0x43, 0x3f,
+    0x4a, 0x33, 0x3c, 0x3a, 0x52, 0x52, 0x5a, 0x23, 0x58, 0x31, 0x3b, 0x3b,
+    0x47, 0x44, 0x54, 0x45, 0x42, 0x38, 0x38, 0x40, 0x43, 0x3f, 0x2a, 0x46,
+    0x3b, 0x46, 0x3b, 0x46, 0x35, 0x37, 0x29, 0x35, 0x38, 0x41, 0x3a, 0x31,
+    0x44, 0x41, 0x39, 0x36, 0x45, 0x41, 0x40, 0x3e, 0x40, 0x44, 0x47, 0x37,
+    0x3f, 0x42, 0x49, 0x34, 0x46, 0x3d, 0x4b, 0x3d, 0x42, 0x3b, 0x42, 0x3e,
+    0x41, 0x3b, 0x3f, 0x43, 0x47, 0x45, 0x47, 0x41, 0x40, 0x3a, 0x3d, 0x45,
+    0x40, 0x36, 0x3b, 0x3b, 0x44, 0x37, 0x46, 0x55, 0x35, 0x42, 0x3f, 0x3a,
+    0x41, 0x41, 0x44, 0x5c, 0x31, 0x44, 0x3d, 0x46, 0x39, 0x38, 0x46, 0x59,
+    0x41, 0x3b, 0x3d, 0x39, 0x33, 0x3e, 0x41, 0x58, 0x33, 0x44, 0x34, 0x31,
+    0x48, 0x3e, 0x4d, 0x56, 0x36, 0x3c, 0x37, 0x46, 0x46, 0x38, 0x45, 0x53,
+    0x35, 0x3d, 0x3a, 0x31, 0x42, 0x48, 0x45, 0x44, 0x3b, 0x3b, 0x3c, 0x41,
+    0x3d, 0x42, 0x3f, 0x2f, 0x38, 0x3c, 0x3e, 0x41, 0x44, 0x3a, 0x4a, 0x24,
+    0x37, 0x3e, 0x37, 0x48, 0x40, 0x3f, 0x46, 0x3c, 0x47, 0x4a, 0x38, 0x47,
+    0x34, 0x45, 0x31, 0x42, 0x43, 0x44, 0x3f, 0x3f, 0x49, 0x40, 0x3c, 0x41,
+    0x4d, 0x43, 0x42, 0x39, 0x39, 0x48, 0x41, 0x38, 0x47, 0x3c, 0x3c, 0x42,
+    0x44, 0x55, 0x62, 0x2a, 0x5c, 0x32, 0x3a, 0x37, 0x4c, 0x44, 0x4f, 0x3e,
+    0x4e, 0x42, 0x3a, 0x42, 0x41, 0x4a, 0x35, 0x44, 0x45, 0x3b, 0x43, 0x41,
+    0x33, 0x38, 0x28, 0x36, 0x40, 0x47, 0x3e, 0x3e, 0x3e, 0x39, 0x3a, 0x37,
+    0x44, 0x44, 0x3f, 0x3b, 0x41, 0x3c, 0x45, 0x36, 0x38, 0x3a, 0x3c, 0x42,
+    0x42, 0x3f, 0x59, 0x3c, 0x47, 0x3d, 0x38, 0x3a, 0x42, 0x44, 0x41, 0x46,
+    0x3f, 0x43, 0x48, 0x42, 0x44, 0x35, 0x3f, 0x45, 0x36, 0x3f, 0x38, 0x3a,
+    0x44, 0x3d, 0x3d, 0x4e, 0x3e, 0x45, 0x40, 0x42, 0x3c, 0x33, 0x43, 0x5a,
+    0x38, 0x3e, 0x45, 0x3a, 0x3e, 0x42, 0x45, 0x52, 0x3c, 0x42, 0x3a, 0x38,
+    0x3d, 0x3b, 0x4a, 0x57, 0x38, 0x37, 0x47, 0x44, 0x3e, 0x3c, 0x38, 0x48,
+    0x36, 0x41, 0x3f, 0x41, 0x3a, 0x3a, 0x46, 0x47, 0x42, 0x40, 0x32, 0x33,
+    0x43, 0x37, 0x41, 0x43, 0x3e, 0x40, 0x3d, 0x3a, 0x3e, 0x38, 0x42, 0x30,
+    0x3e, 0x40, 0x46, 0x42, 0x40, 0x44, 0x42, 0x23, 0x31, 0x40, 0x3f, 0x3d,
+    0x3b, 0x33, 0x40, 0x33, 0x41, 0x33, 0x43, 0x41, 0x3a, 0x3e, 0x36, 0x40,
+    0x40, 0x45, 0x37, 0x42, 0x46, 0x42, 0x39, 0x48, 0x44, 0x40, 0x40, 0x45,
+    0x3c, 0x49, 0x41, 0x3f, 0x4c, 0x3d, 0x2f, 0x3f, 0x47, 0x52, 0x54, 0x2c,
+    0x55, 0x42, 0x44, 0x3b, 0x46, 0x4f, 0x48, 0x3c, 0x45, 0x39, 0x3f, 0x4b,
+    0x3f, 0x3f, 0x36, 0x42, 0x41, 0x48, 0x44, 0x44, 0x36, 0x3b, 0x37, 0x40,
+    0x39, 0x49, 0x3a, 0x35, 0x3e, 0x48, 0x31, 0x30, 0x44, 0x38, 0x4c, 0x3c,
+    0x41, 0x3e, 0x46, 0x32, 0x44, 0x3b, 0x42, 0x3c, 0x38, 0x3a, 0x47, 0x3f,
+    0x3a, 0x42, 0x3a, 0x43, 0x40, 0x4b, 0x47, 0x3c, 0x42, 0x46, 0x45, 0x42,
+    0x3c, 0x46, 0x3d, 0x3f, 0x3e, 0x36, 0x38, 0x3e, 0x46, 0x3c, 0x4d, 0x43,
+    0x49, 0x41, 0x48, 0x3c, 0x3d, 0x39, 0x43, 0x58, 0x3a, 0x41, 0x3f, 0x38,
+    0x37, 0x3f, 0x46, 0x5d, 0x3c, 0x3c, 0x39, 0x36, 0x3d, 0x46, 0x43, 0x50,
+    0x3a, 0x47, 0x39, 0x36, 0x41, 0x3f, 0x3e, 0x51, 0x31, 0x36, 0x3e, 0x3c,
+    0x3c, 0x3a, 0x48, 0x41, 0x3a, 0x43, 0x49, 0x3e, 0x42, 0x46, 0x3f, 0x41,
+    0x49, 0x33, 0x42, 0x41, 0x45, 0x40, 0x3d, 0x2b, 0x3d, 0x38, 0x40, 0x37,
+    0x3a, 0x31, 0x45, 0x26, 0x33, 0x3d, 0x3f, 0x39, 0x36, 0x3c, 0x38, 0x33,
+    0x34, 0x3f, 0x35, 0x44, 0x3a, 0x39, 0x32, 0x41, 0x35, 0x40, 0x3c, 0x3b,
+    0x4a, 0x3f, 0x3e, 0x3e, 0x4a, 0x3e, 0x42, 0x35, 0x38, 0x43, 0x3c, 0x37,
+    0x3d, 0x3c, 0x39, 0x43, 0x3f, 0x4e, 0x54, 0x33, 0x4b, 0x37, 0x43, 0x3b,
+    0x43, 0x48, 0x43, 0x42, 0x3d, 0x46, 0x45, 0x49, 0x3a, 0x39, 0x36, 0x4a,
+    0x48, 0x48, 0x37, 0x4b, 0x42, 0x47, 0x34, 0x34, 0x43, 0x42, 0x3a, 0x3d,
+    0x3c, 0x46, 0x34, 0x39, 0x40, 0x3b, 0x3e, 0x3e, 0x37, 0x3d, 0x53, 0x3b,
+    0x48, 0x3c, 0x43, 0x44, 0x3b, 0x3f, 0x57, 0x3d, 0x39, 0x3c, 0x39, 0x3a,
+    0x3e, 0x3f, 0x43, 0x3e, 0x41, 0x47, 0x3c, 0x41, 0x40, 0x41, 0x37, 0x3f,
+    0x3b, 0x43, 0x35, 0x3e, 0x45, 0x40, 0x47, 0x59, 0x41, 0x49, 0x3b, 0x3f,
+    0x47, 0x49, 0x4b, 0x61, 0x39, 0x48, 0x39, 0x3e, 0x44, 0x34, 0x3b, 0x59,
+    0x3c, 0x42, 0x45, 0x35, 0x42, 0x41, 0x39, 0x52, 0x42, 0x3c, 0x3d, 0x3e,
+    0x3d, 0x4a, 0x4a, 0x4d, 0x3c, 0x34, 0x44, 0x3c, 0x41, 0x34, 0x38, 0x46,
+    0x38, 0x45, 0x40, 0x45, 0x40, 0x3a, 0x3d, 0x44, 0x3a, 0x37, 0x3a, 0x3a,
+    0x3b, 0x42, 0x40, 0x34, 0x3b, 0x3c, 0x42, 0x40, 0x3d, 0x32, 0x40, 0x27,
+    0x37, 0x39, 0x37, 0x46, 0x48, 0x31, 0x40, 0x30, 0x42, 0x42, 0x3a, 0x40,
+    0x3d, 0x37, 0x2a, 0x40, 0x41, 0x37, 0x3c, 0x4a, 0x46, 0x45, 0x3d, 0x34,
+    0x48, 0x41, 0x42, 0x3e, 0x3f, 0x39, 0x3c, 0x3a, 0x4f, 0x3b, 0x32, 0x3e,
+    0x43, 0x51, 0x4f, 0x2a, 0x46, 0x3a, 0x3d, 0x3b, 0x40, 0x3d, 0x4c, 0x3c,
+    0x48, 0x40, 0x36, 0x4a, 0x3a, 0x38, 0x42, 0x43, 0x4c, 0x3d, 0x47, 0x47,
+    0x33, 0x3f, 0x2d, 0x37, 0x4a, 0x43, 0x38, 0x3e, 0x49, 0x42, 0x42, 0x3d,
+    0x43, 0x47, 0x41, 0x38, 0x46, 0x37, 0x46, 0x38, 0x47, 0x42, 0x49, 0x3d,
+    0x3b, 0x37, 0x4c, 0x3c, 0x3a, 0x45, 0x3f, 0x37, 0x36, 0x3d, 0x3c, 0x40,
+    0x3e, 0x45, 0x46, 0x41, 0x41, 0x40, 0x3c, 0x44, 0x47, 0x43, 0x37, 0x3f,
+    0x3e, 0x3a, 0x3a, 0x4b, 0x3a, 0x36, 0x3d, 0x3f, 0x38, 0x3f, 0x3c, 0x58,
+    0x40, 0x49, 0x3d, 0x42, 0x38, 0x3a, 0x47, 0x50, 0x3b, 0x49, 0x40, 0x44,
+    0x3e, 0x3c, 0x38, 0x52, 0x3a, 0x3e, 0x44, 0x3c, 0x35, 0x44, 0x3a, 0x47,
+    0x3e, 0x49, 0x3f, 0x47, 0x45, 0x39, 0x3b, 0x46, 0x44, 0x3e, 0x41, 0x46,
+    0x40, 0x41, 0x40, 0x40, 0x3a, 0x35, 0x3e, 0x36, 0x3e, 0x3e, 0x3d, 0x35,
+    0x3b, 0x3c, 0x38, 0x46, 0x3b, 0x3c, 0x41, 0x2c, 0x3f, 0x42, 0x38, 0x3b,
+    0x36, 0x3b, 0x39, 0x40, 0x40, 0x38, 0x36, 0x33, 0x34, 0x42, 0x2f, 0x44,
+    0x41, 0x40, 0x39, 0x35, 0x3b, 0x44, 0x42, 0x2c, 0x41, 0x3b, 0x44, 0x41,
+    0x35, 0x44, 0x3b, 0x34, 0x44, 0x49, 0x36, 0x39, 0x3a, 0x52, 0x4d, 0x2b,
+    0x40, 0x40, 0x3e, 0x39, 0x48, 0x42, 0x3c, 0x44, 0x46, 0x49, 0x3f, 0x54,
+    0x43, 0x40, 0x2e, 0x40, 0x4f, 0x36, 0x3e, 0x3f, 0x38, 0x48, 0x44, 0x3c,
+    0x44, 0x43, 0x41, 0x47, 0x40, 0x46, 0x40, 0x37, 0x41, 0x34, 0x3a, 0x41,
+    0x41, 0x3b, 0x49, 0x39, 0x42, 0x38, 0x3d, 0x39, 0x34, 0x35, 0x43, 0x36,
+    0x3e, 0x44, 0x3f, 0x40, 0x43, 0x40, 0x40, 0x3a, 0x47, 0x42, 0x3e, 0x42,
+    0x46, 0x35, 0x3a, 0x46, 0x3c, 0x3c, 0x3c, 0x3d, 0x3f, 0x40, 0x43, 0x4c,
+    0x3a, 0x37, 0x3f, 0x43, 0x47, 0x38, 0x42, 0x58, 0x42, 0x3b, 0x34, 0x37,
+    0x3e, 0x48, 0x3c, 0x57, 0x44, 0x3c, 0x3d, 0x3a, 0x36, 0x48, 0x3c, 0x51,
+    0x3d, 0x48, 0x45, 0x45, 0x38, 0x45, 0x40, 0x3f, 0x3b, 0x35, 0x3d, 0x3f,
+    0x38, 0x47, 0x39, 0x3b, 0x36, 0x49, 0x43, 0x40, 0x3f, 0x46, 0x38, 0x40,
+    0x3f, 0x3e, 0x39, 0x32, 0x47, 0x42, 0x35, 0x33, 0x39, 0x47, 0x3c, 0x36,
+    0x3b, 0x37, 0x43, 0x35, 0x3b, 0x3b, 0x34, 0x3b, 0x38, 0x3d, 0x3e, 0x3a,
+    0x35, 0x49, 0x38, 0x40, 0x3f, 0x3f, 0x3e, 0x37, 0x43, 0x3b, 0x3e, 0x3e,
+    0x3b, 0x40, 0x44, 0x39, 0x3d, 0x3f, 0x31, 0x42, 0x42, 0x3b, 0x41, 0x3d,
+    0x3e, 0x3c, 0x37, 0x34, 0x48, 0x3d, 0x49, 0x4a, 0x47, 0x36, 0x3a, 0x34,
+    0x37, 0x36, 0x3e, 0x38, 0x33, 0x45, 0x39, 0x44, 0x34, 0x49, 0x3a, 0x3d,
+    0x34, 0x31, 0x31, 0x3d, 0x34, 0x3d, 0x41, 0x3e, 0x49, 0x41, 0x34, 0x3f,
+    0x3a, 0x42, 0x3e, 0x40, 0x3f, 0x33, 0x46, 0x3f, 0x34, 0x39, 0x37, 0x46,
+    0x3e, 0x32, 0x3f, 0x45, 0x45, 0x41, 0x3b, 0x4b, 0x35, 0x35, 0x3b, 0x4a,
+    0x3d, 0x43, 0x3b, 0x44, 0x3c, 0x38, 0x31, 0x43, 0x39, 0x35, 0x41, 0x45,
+    0x37, 0x3e, 0x43, 0x47, 0x39, 0x40, 0x41, 0x41, 0x40, 0x32, 0x37, 0x3e,
+    0x3d, 0x39, 0x3b, 0x49, 0x33, 0x35, 0x38, 0x41, 0x45, 0x37, 0x3c, 0x49,
+    0x3b, 0x34, 0x34, 0x41, 0x3a, 0x3f, 0x3e, 0x47, 0x39, 0x3c, 0x34, 0x3a,
+    0x38, 0x44, 0x40, 0x51, 0x3a, 0x37, 0x3b, 0x3f, 0x3d, 0x3a, 0x45, 0x48,
+    0x3f, 0x46, 0x35, 0x43, 0x38, 0x43, 0x35, 0x4c, 0x42, 0x47, 0x44, 0x3d,
+    0x40, 0x3a, 0x39, 0x4e, 0x3d, 0x37, 0x3c, 0x42, 0x40, 0x48, 0x44, 0x4c,
+    0x31, 0x40, 0x42, 0x3b, 0x45, 0x45, 0x3f, 0x3e, 0x3d, 0x44, 0x3f, 0x31,
+    0x3f, 0x44, 0x45, 0x37, 0x3e, 0x3d, 0x35, 0x3b, 0x2d, 0x44, 0x4a, 0x3a,
+    0x2b, 0x37, 0x38, 0x46, 0x41, 0x39, 0x3c, 0x3c, 0x46, 0x33, 0x36, 0x3c,
+    0x4b, 0x34, 0x49, 0x50, 0x30, 0x3c, 0x33, 0x41, 0x44, 0x33, 0x43, 0x39,
+    0x36, 0x45, 0x33, 0x3b, 0x3d, 0x36, 0x47, 0x30, 0x42, 0x37, 0x49, 0x3e,
+    0x3b, 0x49, 0x3d, 0x3b, 0x3a, 0x41, 0x38, 0x44, 0x42, 0x3b, 0x3f, 0x40,
+    0x46, 0x35, 0x38, 0x3c, 0x48, 0x3a, 0x46, 0x41, 0x36, 0x36, 0x41, 0x3e,
+    0x43, 0x3e, 0x32, 0x39, 0x3a, 0x41, 0x30, 0x3e, 0x40, 0x3e, 0x36, 0x3a,
+    0x45, 0x45, 0x3a, 0x3c, 0x31, 0x3b, 0x47, 0x3f, 0x36, 0x3a, 0x3c, 0x41,
+    0x3b, 0x41, 0x39, 0x46, 0x3f, 0x3c, 0x34, 0x3e, 0x41, 0x45, 0x41, 0x42,
+    0x39, 0x40, 0x40, 0x44, 0x45, 0x42, 0x34, 0x3f, 0x3e, 0x31, 0x3b, 0x41,
+    0x33, 0x43, 0x37, 0x44, 0x44, 0x3a, 0x36, 0x36, 0x48, 0x3c, 0x37, 0x47,
+    0x39, 0x3e, 0x3e, 0x3c, 0x3c, 0x41, 0x3c, 0x44, 0x3b, 0x42, 0x3f, 0x3a,
+    0x43, 0x3b, 0x3e, 0x48, 0x36, 0x3f, 0x3d, 0x34, 0x40, 0x43, 0x35, 0x4f,
+    0x34, 0x39, 0x3b, 0x41, 0x40, 0x39, 0x37, 0x4c, 0x39, 0x36, 0x39, 0x39,
+    0x47, 0x41, 0x43, 0x3f, 0x3f, 0x33, 0x42, 0x3f, 0x42, 0x40, 0x37, 0x40,
+    0x3f, 0x34, 0x45, 0x3d, 0x2d, 0x3c, 0x44, 0x3b, 0x43, 0x37, 0x26, 0x50,
+    0x43, 0x44, 0x3d, 0x43, 0x42, 0x2d, 0x3c, 0x33, 0x4a, 0x32, 0x4a, 0x53,
+    0x33, 0x38, 0x27, 0x36, 0x42, 0x30, 0x47, 0x3d, 0x36, 0x45, 0x46, 0x36,
+    0x3b, 0x3b, 0x40, 0x33, 0x37, 0x36, 0x44, 0x46, 0x3d, 0x35, 0x40, 0x38,
+    0x3b, 0x40, 0x36, 0x3c, 0x3d, 0x37, 0x31, 0x41, 0x33, 0x3c, 0x38, 0x3f,
+    0x43, 0x3a, 0x40, 0x49, 0x38, 0x39, 0x38, 0x3d, 0x43, 0x3d, 0x39, 0x3b,
+    0x3d, 0x3f, 0x38, 0x42, 0x34, 0x43, 0x33, 0x3e, 0x43, 0x3e, 0x40, 0x42,
+    0x3b, 0x45, 0x37, 0x44, 0x43, 0x39, 0x3c, 0x3d, 0x37, 0x44, 0x3a, 0x3b,
+    0x47, 0x3f, 0x3a, 0x3c, 0x3a, 0x3b, 0x3f, 0x43, 0x3e, 0x3d, 0x46, 0x3e,
+    0x37, 0x36, 0x3f, 0x40, 0x42, 0x42, 0x37, 0x36, 0x48, 0x35, 0x44, 0x44,
+    0x39, 0x3c, 0x3b, 0x41, 0x44, 0x49, 0x3a, 0x40, 0x41, 0x36, 0x33, 0x3a,
+    0x3c, 0x3d, 0x40, 0x3f, 0x43, 0x36, 0x3c, 0x3a, 0x3f, 0x4b, 0x32, 0x49,
+    0x49, 0x3e, 0x3a, 0x3e, 0x3f, 0x41, 0x3c, 0x47, 0x40, 0x41, 0x45, 0x3e,
+    0x47, 0x47, 0x3f, 0x47, 0x45, 0x3e, 0x31, 0x43, 0x4a, 0x44, 0x36, 0x40,
+    0x41, 0x47, 0x3e, 0x42, 0x37, 0x40, 0x3b, 0x46, 0x37, 0x41, 0x3e, 0x3c,
+    0x27, 0x40, 0x49, 0x42, 0x42, 0x39, 0x30, 0x49, 0x43, 0x38, 0x3d, 0x42,
+    0x43, 0x2f, 0x3b, 0x37, 0x4b, 0x2d, 0x4f, 0x52, 0x30, 0x31, 0x2f, 0x3a,
+    0x49, 0x38, 0x4f, 0x45, 0x2e, 0x47, 0x3a, 0x32, 0x33, 0x3f, 0x4a, 0x2e,
+    0x33, 0x3b, 0x3e, 0x3e, 0x49, 0x45, 0x44, 0x38, 0x3c, 0x35, 0x45, 0x47,
+    0x41, 0x3b, 0x3c, 0x48, 0x46, 0x39, 0x39, 0x3b, 0x3f, 0x41, 0x38, 0x42,
+    0x3d, 0x46, 0x33, 0x41, 0x36, 0x3f, 0x3f, 0x3c, 0x33, 0x3e, 0x3e, 0x40,
+    0x44, 0x40, 0x3c, 0x38, 0x46, 0x3a, 0x40, 0x36, 0x42, 0x35, 0x3f, 0x3b,
+    0x3b, 0x43, 0x3c, 0x40, 0x40, 0x49, 0x2e, 0x39, 0x40, 0x3f, 0x45, 0x41,
+    0x3f, 0x30, 0x42, 0x3d, 0x40, 0x3c, 0x3a, 0x3b, 0x3b, 0x40, 0x39, 0x42,
+    0x3a, 0x3f, 0x3f, 0x3e, 0x35, 0x3b, 0x38, 0x45, 0x47, 0x35, 0x44, 0x3e,
+    0x3b, 0x3f, 0x3f, 0x40, 0x3a, 0x35, 0x30, 0x49, 0x45, 0x35, 0x3b, 0x39,
+    0x3b, 0x48, 0x3f, 0x37, 0x39, 0x40, 0x43, 0x45, 0x3d, 0x40, 0x41, 0x3a,
+    0x33, 0x3d, 0x3a, 0x4b, 0x40, 0x42, 0x40, 0x42, 0x43, 0x39, 0x3c, 0x49,
+    0x3e, 0x47, 0x3e, 0x44, 0x3f, 0x3a, 0x40, 0x41, 0x3f, 0x42, 0x42, 0x37,
+    0x3e, 0x3b, 0x36, 0x3e, 0x3b, 0x3c, 0x48, 0x43, 0x2d, 0x46, 0x4a, 0x38,
+    0x45, 0x3a, 0x29, 0x46, 0x40, 0x3c, 0x40, 0x44, 0x40, 0x33, 0x2f, 0x33,
+    0x48, 0x2e, 0x51, 0x4f, 0x2b, 0x32, 0x2e, 0x2d, 0x45, 0x33, 0x4d, 0x41,
+    0x29, 0x4b, 0x41, 0x39, 0x2f, 0x3a, 0x49, 0x31, 0x37, 0x40, 0x47, 0x4c,
+    0x3e, 0x31, 0x41, 0x3f, 0x43, 0x37, 0x45, 0x4f, 0x41, 0x3c, 0x30, 0x4a,
+    0x37, 0x37, 0x36, 0x39, 0x31, 0x3d, 0x36, 0x4b, 0x37, 0x44, 0x3c, 0x43,
+    0x44, 0x36, 0x3f, 0x3b, 0x34, 0x3e, 0x3a, 0x35, 0x38, 0x3f, 0x33, 0x37,
+    0x3b, 0x3d, 0x46, 0x38, 0x3b, 0x37, 0x38, 0x3b, 0x31, 0x3e, 0x3d, 0x3b,
+    0x3d, 0x39, 0x35, 0x33, 0x33, 0x3c, 0x39, 0x39, 0x48, 0x39, 0x39, 0x3f,
+    0x3e, 0x36, 0x47, 0x3a, 0x44, 0x40, 0x32, 0x3c, 0x37, 0x35, 0x40, 0x3f,
+    0x3a, 0x38, 0x3b, 0x3d, 0x46, 0x45, 0x36, 0x43, 0x40, 0x3d, 0x41, 0x41,
+    0x47, 0x3a, 0x3d, 0x3e, 0x43, 0x42, 0x32, 0x36, 0x41, 0x37, 0x3b, 0x35,
+    0x36, 0x44, 0x36, 0x3c, 0x43, 0x32, 0x3e, 0x3e, 0x42, 0x45, 0x32, 0x3c,
+    0x3a, 0x3b, 0x35, 0x43, 0x41, 0x3d, 0x44, 0x50, 0x43, 0x31, 0x3e, 0x44,
+    0x44, 0x41, 0x3a, 0x44, 0x36, 0x39, 0x3b, 0x3c, 0x32, 0x38, 0x3b, 0x45,
+    0x38, 0x43, 0x40, 0x42, 0x33, 0x3e, 0x4a, 0x42, 0x45, 0x39, 0x2f, 0x42,
+    0x39, 0x35, 0x44, 0x3e, 0x39, 0x2f, 0x34, 0x33, 0x49, 0x29, 0x50, 0x4f,
+    0x2b, 0x36, 0x34, 0x2d, 0x47, 0x33, 0x49, 0x3c, 0x33, 0x51, 0x49, 0x3f,
+    0x34, 0x39, 0x4a, 0x2c, 0x34, 0x45, 0x4f, 0x47, 0x34, 0x42, 0x3a, 0x3d,
+    0x36, 0x4a, 0x3b, 0x43, 0x36, 0x3f, 0x39, 0x4b, 0x38, 0x3a, 0x31, 0x3d,
+    0x32, 0x42, 0x3a, 0x47, 0x48, 0x3e, 0x44, 0x3f, 0x39, 0x3e, 0x44, 0x35,
+    0x41, 0x3c, 0x45, 0x3a, 0x3e, 0x3b, 0x3d, 0x2f, 0x37, 0x40, 0x3e, 0x43,
+    0x39, 0x39, 0x33, 0x3b, 0x37, 0x3b, 0x37, 0x37, 0x37, 0x39, 0x36, 0x31,
+    0x39, 0x3b, 0x41, 0x39, 0x3b, 0x40, 0x36, 0x37, 0x42, 0x39, 0x3a, 0x46,
+    0x3f, 0x30, 0x38, 0x39, 0x35, 0x32, 0x3e, 0x3a, 0x43, 0x43, 0x3e, 0x33,
+    0x42, 0x3f, 0x41, 0x3c, 0x46, 0x34, 0x34, 0x40, 0x43, 0x37, 0x32, 0x43,
+    0x3c, 0x37, 0x36, 0x33, 0x3d, 0x36, 0x3a, 0x40, 0x39, 0x38, 0x32, 0x3e,
+    0x32, 0x3d, 0x37, 0x49, 0x42, 0x47, 0x41, 0x3b, 0x3d, 0x3c, 0x3a, 0x37,
+    0x3c, 0x45, 0x3a, 0x45, 0x36, 0x44, 0x3a, 0x3a, 0x3a, 0x3c, 0x43, 0x3b,
+    0x3b, 0x35, 0x38, 0x47, 0x36, 0x40, 0x32, 0x43, 0x3e, 0x39, 0x42, 0x40,
+    0x2c, 0x3c, 0x4c, 0x4c, 0x43, 0x3b, 0x37, 0x4a, 0x3f, 0x3c, 0x45, 0x44,
+    0x3f, 0x30, 0x36, 0x31, 0x4f, 0x2f, 0x5d, 0x4b, 0x34, 0x34, 0x2d, 0x2b,
+    0x44, 0x31, 0x4e, 0x40, 0x2e, 0x4d, 0x48, 0x3e, 0x37, 0x2b, 0x49, 0x25,
+    0x31, 0x49, 0x44, 0x49, 0x39, 0x39, 0x4b, 0x3a, 0x3a, 0x41, 0x3e, 0x42,
+    0x3c, 0x36, 0x36, 0x4a, 0x32, 0x44, 0x3e, 0x48, 0x3e, 0x3c, 0x37, 0x49,
+    0x3d, 0x34, 0x3f, 0x37, 0x33, 0x36, 0x46, 0x3a, 0x3a, 0x31, 0x45, 0x3f,
+    0x3a, 0x31, 0x3b, 0x33, 0x41, 0x42, 0x35, 0x39, 0x38, 0x44, 0x36, 0x3a,
+    0x3f, 0x3b, 0x37, 0x3e, 0x3b, 0x38, 0x2f, 0x32, 0x44, 0x3d, 0x44, 0x41,
+    0x39, 0x36, 0x3a, 0x34, 0x39, 0x38, 0x34, 0x3f, 0x3b, 0x37, 0x34, 0x34,
+    0x40, 0x3d, 0x34, 0x3a, 0x46, 0x42, 0x3f, 0x34, 0x38, 0x33, 0x39, 0x44,
+    0x3f, 0x41, 0x3c, 0x31, 0x40, 0x32, 0x3f, 0x37, 0x37, 0x41, 0x3e, 0x35,
+    0x37, 0x48, 0x3b, 0x41, 0x3d, 0x3a, 0x3f, 0x35, 0x33, 0x3c, 0x36, 0x3b,
+    0x3a, 0x48, 0x33, 0x42, 0x37, 0x33, 0x39, 0x41, 0x3c, 0x3d, 0x3b, 0x4d,
+    0x39, 0x3a, 0x3e, 0x44, 0x3d, 0x41, 0x3b, 0x38, 0x49, 0x41, 0x3a, 0x38,
+    0x34, 0x38, 0x38, 0x3c, 0x45, 0x3c, 0x37, 0x3b, 0x36, 0x3e, 0x4a, 0x4b,
+    0x42, 0x3f, 0x32, 0x45, 0x46, 0x35, 0x46, 0x41, 0x38, 0x33, 0x39, 0x37,
+    0x44, 0x2b, 0x60, 0x4a, 0x2a, 0x2e, 0x35, 0x2d, 0x43, 0x37, 0x51, 0x47,
+    0x2f, 0x4d, 0x50, 0x3e, 0x3a, 0x33, 0x4f, 0x2a, 0x35, 0x45, 0x4a, 0x4c,
+    0x3b, 0x3d, 0x43, 0x44, 0x3d, 0x3f, 0x4a, 0x3e, 0x49, 0x37, 0x2e, 0x4f,
+    0x39, 0x3f, 0x32, 0x3c, 0x37, 0x3b, 0x39, 0x4d, 0x34, 0x3f, 0x46, 0x44,
+    0x3d, 0x40, 0x3f, 0x40, 0x39, 0x33, 0x39, 0x3e, 0x3d, 0x40, 0x31, 0x30,
+    0x35, 0x3d, 0x3e, 0x3a, 0x3e, 0x32, 0x31, 0x3e, 0x48, 0x3c, 0x40, 0x43,
+    0x3f, 0x3f, 0x34, 0x2e, 0x3a, 0x3e, 0x3b, 0x43, 0x45, 0x32, 0x3a, 0x31,
+    0x37, 0x38, 0x31, 0x35, 0x34, 0x3d, 0x42, 0x36, 0x46, 0x37, 0x32, 0x47,
+    0x41, 0x3c, 0x35, 0x35, 0x36, 0x41, 0x3a, 0x3b, 0x42, 0x44, 0x36, 0x31,
+    0x3c, 0x3d, 0x34, 0x34, 0x3b, 0x40, 0x40, 0x2e, 0x40, 0x46, 0x3b, 0x43,
+    0x3f, 0x40, 0x3b, 0x3a, 0x32, 0x40, 0x46, 0x39, 0x3c, 0x49, 0x2f, 0x3d,
+    0x49, 0x3e, 0x44, 0x3c, 0x3e, 0x35, 0x3f, 0x44, 0x41, 0x40, 0x3e, 0x47,
+    0x3d, 0x40, 0x3f, 0x41, 0x3b, 0x41, 0x41, 0x3f, 0x40, 0x3f, 0x3e, 0x3e,
+    0x3f, 0x43, 0x35, 0x40, 0x2b, 0x42, 0x45, 0x56, 0x40, 0x3c, 0x2f, 0x44,
+    0x44, 0x3d, 0x3e, 0x3d, 0x40, 0x2d, 0x39, 0x31, 0x54, 0x2f, 0x61, 0x48,
+    0x2e, 0x37, 0x37, 0x32, 0x3e, 0x2d, 0x52, 0x4d, 0x2d, 0x4d, 0x4c, 0x3a,
+    0x3a, 0x31, 0x4e, 0x2d, 0x31, 0x48, 0x47, 0x54, 0x45, 0x38, 0x3b, 0x3d,
+    0x42, 0x41, 0x44, 0x4a, 0x48, 0x42, 0x2f, 0x4d, 0x31, 0x34, 0x3a, 0x46,
+    0x37, 0x44, 0x2c, 0x45, 0x46, 0x43, 0x40, 0x3f, 0x34, 0x33, 0x40, 0x39,
+    0x32, 0x35, 0x3a, 0x40, 0x3f, 0x3f, 0x36, 0x32, 0x3f, 0x3d, 0x35, 0x48,
+    0x3c, 0x48, 0x37, 0x39, 0x35, 0x3f, 0x37, 0x3d, 0x44, 0x46, 0x2d, 0x2a,
+    0x47, 0x38, 0x3a, 0x39, 0x45, 0x3b, 0x40, 0x2d, 0x37, 0x33, 0x41, 0x3c,
+    0x40, 0x35, 0x3f, 0x32, 0x3a, 0x36, 0x40, 0x41, 0x3a, 0x3c, 0x33, 0x31,
+    0x42, 0x3f, 0x41, 0x3a, 0x41, 0x46, 0x38, 0x2f, 0x3c, 0x3d, 0x3d, 0x39,
+    0x3b, 0x46, 0x41, 0x31, 0x46, 0x36, 0x40, 0x48, 0x3c, 0x33, 0x42, 0x32,
+    0x3b, 0x40, 0x3f, 0x36, 0x37, 0x44, 0x34, 0x35, 0x32, 0x32, 0x37, 0x38,
+    0x33, 0x3b, 0x37, 0x4a, 0x3f, 0x46, 0x3a, 0x41, 0x32, 0x37, 0x30, 0x3e,
+    0x40, 0x35, 0x41, 0x40, 0x37, 0x41, 0x2b, 0x40, 0x3d, 0x3d, 0x32, 0x38,
+    0x34, 0x3e, 0x47, 0x61, 0x43, 0x3b, 0x3c, 0x42, 0x46, 0x3d, 0x40, 0x4a,
+    0x3c, 0x2d, 0x33, 0x35, 0x55, 0x38, 0x69, 0x4f, 0x33, 0x37, 0x30, 0x39,
+    0x44, 0x2e, 0x58, 0x4b, 0x2a, 0x51, 0x4b, 0x3c, 0x39, 0x2e, 0x51, 0x2d,
+    0x30, 0x4a, 0x42, 0x53, 0x3f, 0x39, 0x3e, 0x44, 0x3b, 0x40, 0x47, 0x44,
+    0x47, 0x3e, 0x39, 0x4b, 0x40, 0x3d, 0x42, 0x39, 0x3b, 0x39, 0x32, 0x42,
+    0x36, 0x36, 0x36, 0x42, 0x44, 0x34, 0x33, 0x40, 0x40, 0x40, 0x3a, 0x3a,
+    0x41, 0x3f, 0x31, 0x30, 0x3f, 0x31, 0x30, 0x39, 0x46, 0x36, 0x35, 0x34,
+    0x40, 0x43, 0x3c, 0x41, 0x31, 0x46, 0x35, 0x26, 0x44, 0x32, 0x3d, 0x35,
+    0x3d, 0x3c, 0x36, 0x32, 0x39, 0x3a, 0x30, 0x40, 0x48, 0x3e, 0x38, 0x37,
+    0x44, 0x3b, 0x3d, 0x42, 0x3d, 0x3c, 0x32, 0x2b, 0x3f, 0x41, 0x39, 0x3d,
+    0x3e, 0x3f, 0x35, 0x2f, 0x46, 0x3d, 0x3d, 0x3b, 0x45, 0x37, 0x31, 0x35,
+    0x44, 0x40, 0x3a, 0x45, 0x3a, 0x3c, 0x39, 0x31, 0x3b, 0x3d, 0x3b, 0x3a,
+    0x43, 0x44, 0x39, 0x47, 0x44, 0x36, 0x3e, 0x39, 0x48, 0x3f, 0x39, 0x4b,
+    0x3c, 0x36, 0x3d, 0x44, 0x44, 0x3f, 0x39, 0x43, 0x3f, 0x37, 0x3f, 0x37,
+    0x3b, 0x3b, 0x38, 0x3b, 0x3f, 0x40, 0x31, 0x44, 0x30, 0x44, 0x46, 0x5b,
+    0x46, 0x3f, 0x39, 0x40, 0x40, 0x37, 0x4a, 0x46, 0x3f, 0x36, 0x40, 0x39,
+    0x59, 0x3e, 0x66, 0x57, 0x32, 0x34, 0x2e, 0x33, 0x46, 0x31, 0x58, 0x44,
+    0x26, 0x4c, 0x4b, 0x3c, 0x39, 0x2e, 0x4d, 0x35, 0x32, 0x46, 0x52, 0x52,
+    0x3e, 0x40, 0x39, 0x3c, 0x39, 0x3d, 0x53, 0x48, 0x41, 0x3c, 0x3b, 0x4d,
+    0x3c, 0x3e, 0x38, 0x44, 0x3a, 0x3a, 0x29, 0x4a, 0x3c, 0x37, 0x36, 0x38,
+    0x3a, 0x31, 0x37, 0x39, 0x3a, 0x40, 0x46, 0x32, 0x42, 0x38, 0x32, 0x2e,
+    0x3a, 0x45, 0x44, 0x34, 0x34, 0x38, 0x32, 0x2e, 0x35, 0x40, 0x3a, 0x41,
+    0x42, 0x3d, 0x37, 0x2c, 0x3f, 0x37, 0x3c, 0x3d, 0x3a, 0x36, 0x33, 0x35,
+    0x3c, 0x34, 0x3c, 0x39, 0x3c, 0x3a, 0x37, 0x30, 0x30, 0x3e, 0x3d, 0x3a,
+    0x44, 0x37, 0x36, 0x32, 0x36, 0x37, 0x36, 0x3a, 0x3c, 0x41, 0x3a, 0x35,
+    0x36, 0x3a, 0x34, 0x40, 0x39, 0x40, 0x3e, 0x32, 0x34, 0x46, 0x33, 0x3f,
+    0x36, 0x45, 0x3e, 0x35, 0x3f, 0x38, 0x3f, 0x3e, 0x3b, 0x3a, 0x36, 0x3b,
+    0x36, 0x38, 0x32, 0x3f, 0x44, 0x3c, 0x35, 0x48, 0x38, 0x39, 0x31, 0x49,
+    0x3d, 0x43, 0x36, 0x3f, 0x31, 0x43, 0x36, 0x3e, 0x3e, 0x41, 0x39, 0x3b,
+    0x40, 0x42, 0x3c, 0x43, 0x36, 0x4a, 0x48, 0x67, 0x4e, 0x43, 0x36, 0x46,
+    0x44, 0x3f, 0x4b, 0x4b, 0x3f, 0x38, 0x3c, 0x3c, 0x5e, 0x38, 0x70, 0x52,
+    0x38, 0x32, 0x3b, 0x36, 0x4a, 0x2c, 0x52, 0x46, 0x29, 0x4f, 0x48, 0x42,
+    0x2d, 0x2e, 0x4f, 0x28, 0x28, 0x45, 0x4d, 0x52, 0x42, 0x3e, 0x3f, 0x41,
+    0x3c, 0x3a, 0x47, 0x50, 0x44, 0x45, 0x33, 0x4b, 0x3e, 0x3f, 0x42, 0x3d,
+    0x43, 0x34, 0x27, 0x3f, 0x42, 0x3e, 0x43, 0x3e, 0x3a, 0x3c, 0x37, 0x3b,
+    0x3f, 0x30, 0x3a, 0x3e, 0x3c, 0x34, 0x37, 0x24, 0x3d, 0x43, 0x40, 0x44,
+    0x40, 0x46, 0x31, 0x2f, 0x43, 0x38, 0x38, 0x39, 0x3c, 0x34, 0x2d, 0x2a,
+    0x38, 0x31, 0x43, 0x3b, 0x39, 0x3b, 0x32, 0x34, 0x3e, 0x39, 0x41, 0x3b,
+    0x3e, 0x33, 0x3a, 0x2a, 0x41, 0x3f, 0x3c, 0x43, 0x3b, 0x3e, 0x35, 0x2c,
+    0x38, 0x41, 0x33, 0x31, 0x3e, 0x3f, 0x3a, 0x3c, 0x3b, 0x35, 0x3f, 0x3d,
+    0x42, 0x3a, 0x3c, 0x35, 0x3f, 0x40, 0x3c, 0x3e, 0x37, 0x41, 0x3d, 0x38,
+    0x34, 0x31, 0x36, 0x3d, 0x3d, 0x47, 0x36, 0x44, 0x3f, 0x45, 0x3c, 0x3c,
+    0x35, 0x36, 0x31, 0x4f, 0x46, 0x3a, 0x41, 0x42, 0x40, 0x32, 0x33, 0x41,
+    0x34, 0x40, 0x3d, 0x43, 0x3b, 0x3a, 0x32, 0x3c, 0x42, 0x42, 0x3d, 0x43,
+    0x37, 0x45, 0x45, 0xff, 0x4b, 0x45, 0x3b, 0x40, 0x43, 0x3e, 0x47, 0x49,
+    0x3d, 0x3b, 0x3e, 0x33, 0x58, 0x35, 0x71, 0x54, 0x2f, 0x38, 0x38, 0x33,
+    0x47, 0x35, 0x5b, 0x46, 0x2c, 0x4c, 0x43, 0x37, 0x36, 0x39, 0x4f, 0x30,
+    0x26, 0x48, 0x51, 0x48, 0x46, 0x45, 0x3b, 0x39, 0x42, 0x50, 0x47, 0x4c,
+    0x4b, 0x3b, 0x3d, 0x4d, 0x41, 0x34, 0x40, 0x44, 0x38, 0x32, 0x2d, 0x43,
+    0x39, 0x36, 0x3b, 0x3b, 0x40, 0x3d, 0x37, 0x3c, 0x44, 0x39, 0x42, 0x37,
+    0x38, 0x38, 0x32, 0x2f, 0x41, 0x40, 0x3f, 0x3a, 0x37, 0x35, 0x3b, 0x2a,
+    0x37, 0x30, 0x3c, 0x37, 0x40, 0x38, 0x3a, 0x27, 0x44, 0x3d, 0x43, 0x40,
+    0x35, 0x3f, 0x3e, 0x32, 0x3e, 0x3c, 0x40, 0x39, 0x39, 0x3a, 0x41, 0x31,
+    0x3b, 0x3f, 0x34, 0x43, 0x3a, 0x38, 0x42, 0x2a, 0x47, 0x46, 0x3b, 0x38,
+    0x47, 0x45, 0x39, 0x31, 0x43, 0x40, 0x37, 0x3a, 0x3d, 0x3e, 0x39, 0x30,
+    0x36, 0x37, 0x3a, 0x43, 0x3f, 0x32, 0x31, 0x41, 0x45, 0x3e, 0x43, 0x38,
+    0x3f, 0x37, 0x3c, 0x49, 0x3b, 0x33, 0x3d, 0x3a, 0x37, 0x44, 0x32, 0x50,
+    0x39, 0x44, 0x3e, 0x3f, 0x3d, 0x41, 0x3e, 0x3e, 0x42, 0x44, 0x45, 0x3f,
+    0x36, 0x3f, 0x37, 0x39, 0x3b, 0x3d, 0x3b, 0x3b, 0x2f, 0x46, 0x40, 0x6d,
+    0x50, 0x45, 0x3b, 0x45, 0x46, 0x3b, 0x42, 0x48, 0x42, 0x3c, 0x39, 0x37,
+    0x57, 0x3b, 0x6c, 0x5b, 0x32, 0x35, 0x3d, 0x39, 0x48, 0x31, 0x5c, 0x46,
+    0x29, 0x4c, 0x3f, 0x3e, 0x37, 0x33, 0x58, 0x32, 0x2a, 0x43, 0x4c, 0x50,
+    0x3b, 0x44, 0x3c, 0x41, 0x39, 0x48, 0x55, 0x4c, 0x42, 0x38, 0x3b, 0x51,
+    0x3f, 0x38, 0x44, 0x46, 0x36, 0x3b, 0x38, 0x4a, 0x3f, 0x37, 0x36, 0x3c,
+    0x31, 0x3d, 0x32, 0x39, 0x3b, 0x3f, 0x3e, 0x35, 0x38, 0x3f, 0x34, 0x2b,
+    0x37, 0x36, 0x39, 0x40, 0x37, 0x41, 0x32, 0x27, 0x36, 0x33, 0x40, 0x3a,
+    0x3f, 0x44, 0x3f, 0x25, 0x38, 0x34, 0x42, 0x3c, 0x3a, 0x40, 0x38, 0x31,
+    0x49, 0x3e, 0x33, 0x3d, 0x31, 0x36, 0x39, 0x2b, 0x44, 0x2f, 0x43, 0x34,
+    0x34, 0x37, 0x39, 0x33, 0x3b, 0x34, 0x42, 0x3c, 0x40, 0x45, 0x36, 0x31,
+    0x43, 0x47, 0x3e, 0x3f, 0x40, 0x3a, 0x33, 0x34, 0x41, 0x44, 0x3a, 0x43,
+    0x3e, 0x38, 0x36, 0x31, 0x42, 0x44, 0x40, 0x41, 0x44, 0x43, 0x33, 0x42,
+    0x3d, 0x41, 0x3d, 0x3e, 0x3c, 0x39, 0x3e, 0x4f, 0x3f, 0x37, 0x31, 0x40,
+    0x3b, 0x38, 0x35, 0x3b, 0x44, 0x41, 0x41, 0x37, 0x40, 0x42, 0x2d, 0x3d,
+    0x39, 0x48, 0x44, 0x3e, 0x34, 0x48, 0x49, 0x6d, 0x45, 0x4b, 0x3a, 0x44,
+    0x49, 0x40, 0x4d, 0x51, 0x3f, 0x34, 0x3b, 0x40, 0x52, 0x34, 0x6f, 0x56,
+    0x33, 0x3e, 0x40, 0x39, 0x41, 0x32, 0x5d, 0x45, 0x2e, 0x51, 0x48, 0x3c,
+    0x2e, 0x2e, 0x51, 0x39, 0x32, 0x45, 0x4a, 0x4c, 0x3b, 0x40, 0x40, 0x3b,
+    0x36, 0x41, 0x54, 0x4e, 0x4a, 0x49, 0x3b, 0x4d, 0x3c, 0x41, 0x38, 0x47,
+    0x3d, 0x3c, 0x37, 0x48, 0x3f, 0x42, 0x3e, 0x36, 0x39, 0x46, 0x37, 0x3e,
+    0x3b, 0x38, 0x40, 0x3b, 0x39, 0x32, 0x3e, 0x29, 0x37, 0x35, 0x3c, 0x3d,
+    0x37, 0x3b, 0x35, 0x2f, 0x32, 0x3b, 0x37, 0x3c, 0x40, 0x3e, 0x39, 0x27,
+    0x3b, 0x38, 0x37, 0x36, 0x39, 0x37, 0x37, 0x35, 0x42, 0x3e, 0x3b, 0x43,
+    0x41, 0x3c, 0x37, 0x2a, 0x3a, 0x3e, 0x38, 0x40, 0x36, 0x3e, 0x44, 0x2e,
+    0x3e, 0x3a, 0x37, 0x3b, 0x3e, 0x41, 0x3d, 0x30, 0x3b, 0x3f, 0x41, 0x45,
+    0x3a, 0x48, 0x37, 0x2f, 0x3a, 0x37, 0x34, 0x43, 0x42, 0x3d, 0x38, 0x41,
+    0x3b, 0x3c, 0x39, 0x3c, 0x39, 0x47, 0x2e, 0x41, 0x42, 0x40, 0x32, 0x36,
+    0x43, 0x40, 0x3d, 0x4c, 0x38, 0x3e, 0x3b, 0x41, 0x3d, 0x3b, 0x34, 0x43,
+    0x43, 0x3f, 0x44, 0x3c, 0x3a, 0x33, 0x39, 0x42, 0x43, 0x3f, 0x33, 0x3d,
+    0x33, 0x3e, 0x48, 0x6b, 0x48, 0x43, 0x36, 0x47, 0x49, 0x44, 0x4a, 0x49,
+    0x3c, 0x31, 0x35, 0x3e, 0x5c, 0x34, 0x73, 0x53, 0x33, 0x3c, 0x32, 0x3b,
+    0x43, 0x27, 0x59, 0x4e, 0x2b, 0x51, 0x4f, 0x37, 0x36, 0x34, 0x56, 0x34,
+    0x32, 0x4f, 0x46, 0x50, 0x40, 0x40, 0x3c, 0x3e, 0x34, 0x37, 0x50, 0x49,
+    0x43, 0x47, 0x3e, 0x52, 0x44, 0x38, 0x3b, 0x4f, 0x3a, 0x3d, 0x2b, 0x4c,
+    0x40, 0x38, 0x3a, 0x35, 0x3a, 0x3a, 0x3d, 0x38, 0x3d, 0x3b, 0x37, 0x48,
+    0x3d, 0x3d, 0x32, 0x30, 0x3a, 0x34, 0x3f, 0x3a, 0x3b, 0x3e, 0x35, 0x2f,
+    0x3b, 0x3a, 0x45, 0x3d, 0x42, 0x33, 0x33, 0x24, 0x44, 0x39, 0x3c, 0x3d,
+    0x41, 0x3c, 0x37, 0x2c, 0x3b, 0x36, 0x34, 0x41, 0x3d, 0x3f, 0x39, 0x32,
+    0x3c, 0x40, 0x44, 0x3d, 0x41, 0x3d, 0x3a, 0x29, 0x3e, 0x3e, 0x43, 0x33,
+    0x3f, 0x3e, 0x3e, 0x31, 0x38, 0x3a, 0x34, 0x3d, 0x3f, 0x3e, 0x3a, 0x3d,
+    0x3e, 0x48, 0x45, 0x3d, 0x44, 0x37, 0x33, 0x3d, 0x45, 0x39, 0x40, 0x40,
+    0x42, 0x3f, 0x3f, 0x3d, 0x3a, 0x3b, 0x41, 0x33, 0x41, 0x3c, 0x32, 0x55,
+    0x43, 0x3a, 0x32, 0x40, 0x3c, 0x3e, 0x40, 0x43, 0x37, 0x3f, 0x40, 0x38,
+    0x43, 0x41, 0x36, 0x42, 0x44, 0x3c, 0x32, 0x3f, 0x38, 0x42, 0x46, 0x59,
+    0x4c, 0x41, 0x39, 0x47, 0x46, 0x46, 0x44, 0x44, 0x35, 0x42, 0x32, 0x39,
+    0x4f, 0x34, 0x6d, 0x55, 0x31, 0x3b, 0x3a, 0x3f, 0x44, 0x2c, 0x5d, 0x43,
+    0x26, 0x4a, 0x4f, 0x40, 0x36, 0x32, 0x4d, 0x33, 0x2f, 0x50, 0x4d, 0x57,
+    0x3b, 0x40, 0x42, 0x44, 0x41, 0x3f, 0x52, 0x4e, 0x35, 0x41, 0x44, 0x52,
+    0x40, 0x35, 0x39, 0x4b, 0x45, 0x34, 0x2c, 0x4a, 0x3b, 0x41, 0x31, 0x33,
+    0x3f, 0x3a, 0x36, 0x3c, 0x3c, 0x33, 0x30, 0x38, 0x43, 0x3f, 0x32, 0x2d,
+    0x3f, 0x3a, 0x38, 0x41, 0x39, 0x45, 0x36, 0x2e, 0x3c, 0x38, 0x45, 0x3f,
+    0x40, 0x3f, 0x3e, 0x26, 0x41, 0x37, 0x3c, 0x44, 0x3f, 0x3f, 0x35, 0x37,
+    0x46, 0x34, 0x37, 0x3e, 0x48, 0x38, 0x36, 0x34, 0x33, 0x39, 0x40, 0x3c,
+    0x42, 0x3d, 0x3b, 0x31, 0x38, 0x3b, 0x44, 0x42, 0x45, 0x38, 0x41, 0x30,
+    0x3d, 0x42, 0x36, 0x3f, 0x3b, 0x45, 0x37, 0x32, 0x3c, 0x37, 0x3d, 0x42,
+    0x38, 0x3d, 0x2f, 0x31, 0x39, 0x40, 0x3f, 0x44, 0x3a, 0x41, 0x44, 0x46,
+    0x3d, 0x3a, 0x32, 0x3b, 0x34, 0x47, 0x36, 0x4c, 0x47, 0x35, 0x3c, 0x33,
+    0x3b, 0x3c, 0x30, 0x43, 0x43, 0x3f, 0x31, 0x40, 0x3a, 0x37, 0x30, 0x46,
+    0x39, 0x3b, 0x42, 0x40, 0x2d, 0x3f, 0x3e, 0x6a, 0x50, 0x3b, 0x31, 0x54,
+    0x47, 0x3d, 0x48, 0x4e, 0x3b, 0x41, 0x3a, 0x39, 0x49, 0x36, 0x64, 0x4e,
+    0x32, 0x39, 0x3d, 0x37, 0x42, 0x2c, 0x5c, 0x43, 0x2a, 0x4b, 0x4b, 0x46,
+    0x30, 0x29, 0x52, 0x31, 0x35, 0x44, 0x4a, 0x4b, 0x3d, 0x3b, 0x4e, 0x42,
+    0x3d, 0x39, 0x42, 0x52, 0x3f, 0x36, 0x3e, 0x50, 0x3f, 0x32, 0x35, 0x3a,
+    0x40, 0x39, 0x35, 0x48, 0x3b, 0x3e, 0x41, 0x43, 0x43, 0x45, 0x2f, 0x36,
+    0x38, 0x34, 0x3f, 0x44, 0x32, 0x3f, 0x37, 0x33, 0x33, 0x35, 0x2e, 0x41,
+    0x37, 0x3e, 0x38, 0x28, 0x49, 0x30, 0x46, 0x39, 0x3b, 0x30, 0x38, 0x28,
+    0x3b, 0x3d, 0x3a, 0x43, 0x3f, 0x34, 0x43, 0x36, 0x39, 0x3c, 0x3e, 0x3e,
+    0x39, 0x3b, 0x39, 0x32, 0x3c, 0x36, 0x3e, 0x38, 0x34, 0x3c, 0x3a, 0x2a,
+    0x46, 0x3d, 0x40, 0x37, 0x3b, 0x39, 0x3b, 0x34, 0x38, 0x31, 0x43, 0x46,
+    0x3b, 0x43, 0x39, 0x2b, 0x38, 0x40, 0x3e, 0x39, 0x35, 0x3d, 0x2c, 0x36,
+    0x37, 0x40, 0x36, 0x40, 0x41, 0x38, 0x32, 0x3f, 0x36, 0x46, 0x34, 0x31,
+    0x40, 0x3e, 0x3c, 0x4e, 0x42, 0x3d, 0x36, 0x3f, 0x42, 0x3f, 0x33, 0x40,
+    0x34, 0x37, 0x3c, 0x3b, 0x31, 0x47, 0x32, 0x3c, 0x34, 0x3d, 0x42, 0x3b,
+    0x37, 0x41, 0x3b, 0x64, 0x52, 0x40, 0x36, 0x4e, 0x46, 0x3f, 0x3f, 0x47,
+    0x3c, 0x3a, 0x3a, 0x41, 0x4a, 0x32, 0x5e, 0x50, 0x2d, 0x39, 0x3a, 0x38,
+    0x3d, 0x2c, 0x5a, 0x3e, 0x2e, 0x47, 0x3e, 0x3e, 0x33, 0x29, 0x4c, 0x35,
+    0x30, 0x4d, 0x4d, 0x4d, 0x38, 0x42, 0x51, 0x47, 0x39, 0x3c, 0x43, 0x4b,
+    0x42, 0x3f, 0x3a, 0x4b, 0x44, 0x3f, 0x3a, 0x44, 0x3e, 0x37, 0x30, 0x45,
+    0x3d, 0x36, 0x34, 0x3f, 0x36, 0x35, 0x37, 0x36, 0x43, 0x3b, 0x37, 0x3e,
+    0x35, 0x3e, 0x32, 0x34, 0x32, 0x38, 0x3c, 0x3a, 0x3a, 0x3c, 0x30, 0x2b,
+    0x31, 0x37, 0x30, 0x42, 0x36, 0x37, 0x36, 0x2c, 0x3c, 0x31, 0x41, 0x37,
+    0x44, 0x41, 0x3b, 0x37, 0x41, 0x3f, 0x38, 0x3b, 0x3a, 0x3a, 0x3c, 0x2f,
+    0x47, 0x41, 0x3e, 0x33, 0x42, 0x3a, 0x32, 0x34, 0x44, 0x40, 0x43, 0x3d,
+    0x34, 0x41, 0x38, 0x35, 0x35, 0x3b, 0x45, 0x38, 0x32, 0x37, 0x3c, 0x2e,
+    0x39, 0x40, 0x30, 0x3e, 0x42, 0x35, 0x3d, 0x36, 0x3e, 0x3d, 0x39, 0x46,
+    0x3f, 0x36, 0x37, 0x49, 0x41, 0x39, 0x3d, 0x3d, 0x33, 0x44, 0x42, 0x50,
+    0x3d, 0x3c, 0x3e, 0x3f, 0x42, 0x42, 0x3b, 0x3d, 0x41, 0x31, 0x39, 0x3a,
+    0x44, 0x34, 0x38, 0x47, 0x44, 0x38, 0x3b, 0x42, 0x30, 0x42, 0x44, 0x57,
+    0x49, 0x3a, 0x39, 0x4f, 0x41, 0x3e, 0x40, 0x43, 0x37, 0x42, 0x3b, 0x48,
+    0x50, 0x29, 0x5b, 0x44, 0x2c, 0x40, 0x3f, 0x3c, 0x46, 0x34, 0x5c, 0x41,
+    0x2c, 0x48, 0x46, 0x46, 0x35, 0x32, 0x4c, 0x35, 0x2f, 0x3b, 0x48, 0x44,
+    0x41, 0x41, 0x49, 0x45, 0x34, 0x37, 0x44, 0x45, 0x43, 0x3b, 0x42, 0x44,
+    0x3a, 0x37, 0x48, 0x49, 0x34, 0x39, 0x33, 0x4a, 0x40, 0x3d, 0x33, 0x39,
+    0x39, 0x3b, 0x30, 0x31, 0x3d, 0x47, 0x3c, 0x3a, 0x34, 0x3c, 0x3a, 0x2b,
+    0x3a, 0x34, 0x41, 0x40, 0x42, 0x36, 0x44, 0x2c, 0x40, 0x47, 0x3b, 0x37,
+    0x38, 0x42, 0x44, 0x29, 0x36, 0x3d, 0x3d, 0x36, 0x42, 0x3b, 0x35, 0x36,
+    0x43, 0x39, 0x41, 0x3d, 0x45, 0x41, 0x31, 0x32, 0x40, 0x3d, 0x3c, 0x41,
+    0x3e, 0x3d, 0x35, 0x34, 0x32, 0x38, 0x36, 0x3f, 0x3b, 0x3d, 0x39, 0x36,
+    0x40, 0x3e, 0x3d, 0x3a, 0x3a, 0x3b, 0x3c, 0x32, 0x40, 0x34, 0x3a, 0x36,
+    0x42, 0x47, 0x3e, 0x33, 0x3a, 0x44, 0x30, 0x39, 0x40, 0x3a, 0x36, 0x44,
+    0x3c, 0x3b, 0x3f, 0x33, 0x3e, 0x3c, 0x35, 0x53, 0x43, 0x3c, 0x3f, 0x43,
+    0x3d, 0x44, 0x33, 0x47, 0x42, 0x40, 0x37, 0x3b, 0x43, 0x3f, 0x33, 0x41,
+    0x38, 0x42, 0x44, 0x3d, 0x2d, 0x3f, 0x46, 0x49, 0x4e, 0x3f, 0x36, 0x45,
+    0x45, 0x39, 0x40, 0x42, 0x39, 0x39, 0x3a, 0x42, 0x45, 0x2c, 0x61, 0x44,
+    0x30, 0x45, 0x38, 0x3a, 0x40, 0x37, 0x58, 0x39, 0x31, 0x3e, 0x3a, 0x3e,
+    0x37, 0x32, 0x4a, 0x39, 0x2e, 0x47, 0x3e, 0x4e, 0x3f, 0x3e, 0x48, 0x45,
+    0x3f, 0x48, 0x3a, 0x3f, 0x40, 0x36, 0x3a, 0x44, 0x36, 0x3e, 0x3d, 0x41,
+    0x45, 0x36, 0x36, 0x4b, 0x3a, 0x3d, 0x45, 0x48, 0x38, 0x45, 0x39, 0x38,
+    0x38, 0x3a, 0x42, 0x34, 0x3f, 0x34, 0x39, 0x34, 0x32, 0x3f, 0x3c, 0x3d,
+    0x3d, 0x47, 0x3a, 0x2f, 0x3c, 0x3e, 0x3f, 0x39, 0x35, 0x42, 0x3c, 0x2a,
+    0x3b, 0x35, 0x42, 0x44, 0x46, 0x39, 0x38, 0x39, 0x43, 0x3a, 0x38, 0x42,
+    0x3d, 0x3a, 0x40, 0x35, 0x34, 0x39, 0x3a, 0x38, 0x43, 0x42, 0x42, 0x2d,
+    0x31, 0x3b, 0x33, 0x40, 0x3b, 0x47, 0x35, 0x30, 0x3a, 0x3c, 0x3b, 0x47,
+    0x3a, 0x3c, 0x38, 0x35, 0x3c, 0x35, 0x3e, 0x3e, 0x39, 0x3d, 0x39, 0x40,
+    0x37, 0x33, 0x49, 0x38, 0x3c, 0x43, 0x34, 0x40, 0x39, 0x42, 0x3c, 0x3b,
+    0x3e, 0x45, 0x3e, 0x51, 0x3d, 0x3f, 0x3b, 0x34, 0x37, 0x3c, 0x40, 0x47,
+    0x3c, 0x41, 0x3f, 0x41, 0x37, 0x3e, 0x36, 0x3c, 0x42, 0x40, 0x3f, 0x3a,
+    0x3b, 0x42, 0x44, 0x4b, 0x4b, 0x37, 0x41, 0x4d, 0x41, 0x45, 0x40, 0x41,
+    0x40, 0x38, 0x37, 0x40, 0x42, 0x2c, 0x57, 0x43, 0x2d, 0x49, 0x3a, 0x3e,
+    0x37, 0x2f, 0x52, 0x37, 0x31, 0x42, 0x3b, 0x3f, 0x39, 0x38, 0x48, 0x3c,
+    0x37, 0x3d, 0x3a, 0x39, 0x3a, 0x45, 0x4b, 0x49, 0x3e, 0x44, 0x48, 0x49,
+    0x3d, 0x39, 0x3c, 0x41, 0x41, 0x38, 0x45, 0x38, 0x33, 0x3d, 0x37, 0x47,
+    0x34, 0x3f, 0x3b, 0x3d, 0x39, 0x34, 0x30, 0x39, 0x44, 0x36, 0x34, 0x3c,
+    0x37, 0x38, 0x45, 0x34, 0x40, 0x33, 0x41, 0x3a, 0x3e, 0x3c, 0x3b, 0x3a,
+    0x40, 0x3f, 0x3b, 0x3d, 0x3b, 0x46, 0x41, 0x2a, 0x3a, 0x3c, 0x42, 0x46,
+    0x33, 0x3f, 0x2d, 0x3a, 0x45, 0x45, 0x38, 0x3b, 0x44, 0x34, 0x35, 0x3f,
+    0x34, 0x43, 0x38, 0x3e, 0x41, 0x3b, 0x42, 0x38, 0x3d, 0x3f, 0x38, 0x45,
+    0x3b, 0x35, 0x39, 0x3c, 0x43, 0x43, 0x38, 0x34, 0x44, 0x43, 0x2e, 0x39,
+    0x39, 0x40, 0x39, 0x41, 0x41, 0x34, 0x3e, 0x44, 0x3d, 0x43, 0x3a, 0x3a,
+    0x3b, 0x3b, 0x36, 0x45, 0x3c, 0x43, 0x3d, 0x48, 0x36, 0x36, 0x39, 0x55,
+    0x35, 0x40, 0x3e, 0x49, 0x40, 0x3a, 0x3d, 0x3d, 0x34, 0x47, 0x40, 0x41,
+    0x40, 0x47, 0x39, 0x3e, 0x3b, 0x38, 0x3c, 0x3a, 0x35, 0x3e, 0x41, 0x4a,
+    0x4b, 0x3f, 0x36, 0x3d, 0x40, 0x3c, 0x39, 0x32, 0x33, 0x36, 0x30, 0x42,
+    0x42, 0x36, 0x54, 0x48, 0x2e, 0x4c, 0x34, 0x3c, 0x39, 0x36, 0x4e, 0x37,
+    0x2f, 0x3e, 0x30, 0x3d, 0x36, 0x3b, 0x45, 0x36, 0x37, 0x3e, 0x41, 0x4b,
+    0x3b, 0x36, 0x45, 0x3b, 0x38, 0x45, 0x3e, 0x43, 0x48, 0x46, 0x44, 0x44,
+    0x3e, 0x3b, 0x37, 0x3b, 0x3a, 0x3f, 0x3d, 0x44, 0x39, 0x38, 0x45, 0x43,
+    0x3d, 0x35, 0x39, 0x2c, 0x44, 0x41, 0x36, 0x40, 0x3d, 0x39, 0x3d, 0x2f,
+    0x3d, 0x39, 0x42, 0x3d, 0x36, 0x46, 0x43, 0x2c, 0x41, 0x3a, 0x30, 0x45,
+    0x3f, 0x41, 0x35, 0x2b, 0x3b, 0x38, 0x3a, 0x44, 0x32, 0x32, 0x39, 0x3c,
+    0x3a, 0x3a, 0x3c, 0x3a, 0x35, 0x40, 0x3b, 0x31, 0x36, 0x33, 0x35, 0x34,
+    0x3c, 0x3b, 0x3d, 0x36, 0x48, 0x3b, 0x3f, 0x42, 0x3e, 0x33, 0x2f, 0x3a,
+    0x49, 0x41, 0x39, 0x3e, 0x3c, 0x44, 0x3c, 0x39, 0x33, 0x39, 0x36, 0x35,
+    0x3d, 0x42, 0x34, 0x3e, 0x38, 0x45, 0x40, 0x45, 0x3d, 0x48, 0x42, 0x4a,
+    0x3f, 0x45, 0x38, 0x42, 0x44, 0x40, 0x34, 0x49, 0x44, 0x3d, 0x3a, 0x39,
+    0x3e, 0x3a, 0x42, 0x3e, 0x48, 0x42, 0x3e, 0x3a, 0x3f, 0x3f, 0x32, 0x3b,
+    0x38, 0x41, 0x3c, 0x39, 0x33, 0x45, 0x44, 0x3c, 0x48, 0x41, 0x41, 0x3d,
+    0x3a, 0x3c, 0x37, 0x33, 0x41, 0x3f, 0x38, 0x3a, 0x3f, 0x37, 0x51, 0x3c,
+    0x37, 0x3a, 0x43, 0x37, 0x40, 0x31, 0x4f, 0x34, 0x3b, 0x44, 0x45, 0x39,
+    0x40, 0x33, 0x49, 0x33, 0x3e, 0x35, 0x44, 0x3d, 0x3b, 0x3f, 0x43, 0x41,
+    0x43, 0x43, 0x48, 0x44, 0x46, 0x3b, 0x43, 0x3f, 0x3c, 0x3f, 0x3e, 0x3d,
+    0x3b, 0x41, 0x3c, 0x43, 0x30, 0x34, 0x39, 0x33, 0x3f, 0x38, 0x36, 0x2e,
+    0x33, 0x3f, 0x3c, 0x40, 0x3d, 0x3b, 0x3b, 0x31, 0x36, 0x41, 0x3b, 0x38,
+    0x46, 0x36, 0x34, 0x31, 0x42, 0x44, 0x33, 0x35, 0x3f, 0x36, 0x3c, 0x30,
+    0x3f, 0x31, 0x39, 0x3e, 0x3f, 0x47, 0x3e, 0x34, 0x36, 0x36, 0x34, 0x39,
+    0x37, 0x46, 0x40, 0x33, 0x3b, 0x3a, 0x3f, 0x41, 0x37, 0x44, 0x3a, 0x3f,
+    0x34, 0x45, 0x37, 0x33, 0x3f, 0x47, 0x41, 0x36, 0x39, 0x3e, 0x40, 0x38,
+    0x41, 0x3d, 0x3d, 0x36, 0x40, 0x3a, 0x3b, 0x3b, 0x41, 0x3b, 0x3a, 0x3f,
+    0x3f, 0x3b, 0x35, 0x42, 0x46, 0x3a, 0x30, 0x45, 0x40, 0x37, 0x39, 0x39,
+    0x3d, 0x38, 0x3f, 0x45, 0x3f, 0x31, 0x32, 0x3b, 0x35, 0x3e, 0x3b, 0x38,
+    0x3b, 0x44, 0x37, 0x39, 0x37, 0x42, 0x3f, 0x44, 0x38, 0x36, 0x37, 0x44,
+    0x45, 0x46, 0x41, 0x3b, 0x46, 0x42, 0x43, 0x43, 0x3a, 0x4b, 0x37, 0x35,
+    0x3b, 0x40, 0x32, 0x38, 0x41, 0x38, 0x4f, 0x3e, 0x36, 0x3f, 0x47, 0x3b,
+    0x47, 0x3b, 0x4a, 0x2e, 0x3d, 0x45, 0x3b, 0x46, 0x3e, 0x38, 0x43, 0x38,
+    0x41, 0x48, 0x3a, 0x39, 0x40, 0x45, 0x3b, 0x43, 0x40, 0x3e, 0x43, 0x41,
+    0x41, 0x3e, 0x39, 0x3f, 0x35, 0x42, 0x33, 0x3f, 0x3d, 0x32, 0x45, 0x3c,
+    0x41, 0x31, 0x45, 0x38, 0x43, 0x45, 0x41, 0x35, 0x35, 0x40, 0x44, 0x36,
+    0x3a, 0x3b, 0x3c, 0x2c, 0x3e, 0x41, 0x33, 0x3d, 0x46, 0x34, 0x3b, 0x30,
+    0x30, 0x42, 0x43, 0x3d, 0x3d, 0x3d, 0x43, 0x31, 0x3f, 0x40, 0x3a, 0x3f,
+    0x48, 0x3e, 0x3b, 0x39, 0x44, 0x43, 0x3b, 0x3a, 0x42, 0x38, 0x38, 0x3b,
+    0x3f, 0x44, 0x37, 0x3e, 0x45, 0x40, 0x41, 0x3b, 0x3c, 0x3a, 0x38, 0x37,
+    0x3b, 0x33, 0x3f, 0x35, 0x43, 0x3d, 0x33, 0x41, 0x3b, 0x46, 0x39, 0x32,
+    0x39, 0x3f, 0x3b, 0x39, 0x47, 0x3c, 0x3f, 0x39, 0x34, 0x3d, 0x3c, 0x46,
+    0x3f, 0x3e, 0x3e, 0x44, 0x34, 0x40, 0x3f, 0x39, 0x3c, 0x38, 0x36, 0x45,
+    0x42, 0x46, 0x3b, 0x44, 0x3a, 0x3d, 0x3b, 0x42, 0x3b, 0x3b, 0x3c, 0x45,
+    0x42, 0x3d, 0x36, 0x37, 0x3d, 0x43, 0x3f, 0x48, 0xa6, 0xfb, 0xff, 0xff,
+    0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00,
+    0x39, 0xff, 0xff, 0xff, 0xe5, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x68, 0xfb, 0xff, 0xff, 0xbc, 0xfc, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xe8, 0x03, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x70, 0x02, 0x00, 0x00,
+    0x70, 0x03, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0xf0, 0x01, 0x00, 0x00,
+    0x80, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x50, 0x01, 0x00, 0x00,
+    0xa4, 0x02, 0x00, 0x00, 0xba, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x24, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65,
+    0x6c, 0x73, 0x5f, 0x73, 0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x3c, 0xfd, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x80, 0x3b, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x3f,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2a, 0xfd, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x03, 0x1c, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x61, 0x64, 0x64, 0x5f, 0x31, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xa4, 0xfd, 0xff, 0xff,
+    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x09, 0x97, 0xf5, 0x3f,
+    0x01, 0x00, 0x00, 0x00, 0x87, 0x35, 0xa0, 0x43, 0x01, 0x00, 0x00, 0x00,
+    0xd6, 0xd7, 0x28, 0xc3, 0x92, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x1c, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x52, 0x65, 0x6c, 0x75,
+    0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x19, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x14, 0xfe, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x05, 0x80, 0xbf, 0x3f, 0x01, 0x00, 0x00, 0x00, 0x85, 0xc0, 0xbe, 0x43,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xfe, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x03, 0x3c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00,
+    0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e,
+    0x74, 0x2f, 0x46, 0x61, 0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57,
+    0x69, 0x74, 0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72,
+    0x73, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x0a, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0xa4, 0xfe, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+    0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0xae, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x17, 0xac, 0x6e, 0x3a, 0x01, 0x00, 0x00, 0x00,
+    0x20, 0x4e, 0x97, 0x3d, 0x01, 0x00, 0x00, 0x00, 0xaf, 0x27, 0x21, 0xbe,
+    0x96, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x20, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
+    0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f,
+    0x31, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x31, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x1c, 0xff, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x3f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x42,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0xff, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x02, 0x20, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+    0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xfc, 0xfe, 0xff, 0xff,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x17, 0xac, 0xee, 0x39, 0x5a, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x48, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x54, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67,
+    0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x5f, 0x31, 0x2f,
+    0x46, 0x61, 0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57, 0x69, 0x74,
+    0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72, 0x73, 0x2f,
+    0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x30, 0x11, 0x00, 0x00,
+    0x0c, 0x00, 0x14, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+    0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x3d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x9d, 0xaf, 0xd0, 0x3a, 0x01, 0x00, 0x00, 0x00,
+    0xe7, 0x29, 0x9e, 0x3e, 0x01, 0x00, 0x00, 0x00, 0x5b, 0x91, 0xc3, 0xbd,
+    0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00, 0x07, 0x00, 0x0c, 0x00,
+    0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
+    0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x28, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x4d, 0x61, 0x74, 0x4d,
+    0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x62, 0x1b, 0x1c, 0x3b,
+    0x03, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,
+    0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00,
+    0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x14, 0x00, 0x1c, 0x00,
+    0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08,
+    0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+    0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00,
+    0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x10, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x14, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
+    0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x10, 0x00,
+    0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0c, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff, 0x00, 0x19, 0x06, 0x00,
+    0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x09, 0x06, 0x00,
+    0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04};
+const int g_tiny_conv_model_data_len = 19800;
diff --git a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
similarity index 78%
rename from tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
index 2953cc852d98fa9b5551ae5036048de9c2ebf674..a465dbfabf7cbba44473ae7e2ff94b1de2092b20 100644
--- a/tensorflow/contrib/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
@@ -18,10 +18,10 @@ limitations under the License.
 // don't have a file system. It was created using the command:
 // xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
 
 extern const unsigned char g_tiny_conv_model_data[];
 extern const int g_tiny_conv_model_data_len;
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f089ef82f3a7cc76f9d6a58b848ac281f1582a5a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h"
+
+const int g_yes_30ms_sample_data_size = 480;
+const int16_t g_yes_30ms_sample_data[480] = {
+    -876,   -470,   510,   803,   170,   -787,   -1568,  -1893,  -1598, -1027,
+    -992,   -1803,  -2610, -2484, -1905, -2113,  -3113,  -3399,  -2267, -1261,
+    -2007,  -3637,  -3909, -2340, -893,  -1158,  -2272,  -2486,  -1639, -915,
+    -777,   -596,   -91,   196,   85,    210,    875,    1373,   1247,  1219,
+    1958,   2718,   2328,  1196,  1008,  2350,   3677,   3269,   1503,  366,
+    922,    2264,   2810,  1996,  608,   -168,   75,     680,    811,   395,
+    -56,    -318,   -607,  -966,  -1108, -925,   -613,   -368,   -369,  -919,
+    -1926,  -2460,  -1685, -300,  155,   -611,   -1524,  -2204,  -3227, -3859,
+    -2037,  1622,   2382,  -2583, -8448, -7544,  -84,    4814,   915,   -6423,
+    -7558,  -1746,  2515,  -59,   -4587, -3858,  1260,   3625,   187,   -4148,
+    -3500,  1542,   5467,  4780,  1256,  -1127,  -403,   2481,   5332,  6346,
+    5014,   2536,   1216,  2467,  5039,  6238,   5070,   3381,   3269,  4173,
+    3905,   2248,   1586,  3299,  5240,  4362,   1004,   -1382,  -489,  2113,
+    3168,   1620,   -742,  -1824, -1435, -897,   -1058,  -1500,  -1545, -1398,
+    -1965,  -3266,  -4136, -3756, -2609, -1804,  -1986,  -3087,  -4599, -5296,
+    -4051,  -1731,  -781,  -2228, -4092, -3977,  -2325,  -1353,  -1568, -1490,
+    -428,   178,    -672,  -1650, -1058, 749,    2039,   2079,   1540,  897,
+    310,    572,    2266,  4265,  4265,  1869,   -231,   559,    3332,  4752,
+    3229,   768,    101,   1364,  2463,  1984,   819,    411,    723,   675,
+    -162,   -923,   -743,  -32,   185,   -516,   -1653,  -2359,  -2103, -986,
+    42,     -205,   -1702, -2870, -2337, -809,   -221,   -982,   -1544, -946,
+    -598,   -2117,  -4291, -4100, -857,  1948,   338,    -4799,  -7972, -5403,
+    173,    2371,   -1063, -5533, -5578, -1777,  605,    -985,   -3249, -2213,
+    1184,   2691,   560,   -2356, -2288, 1233,   5244,   6441,   4004,  370,
+    -663,   2555,   7404,  9282,  6573,  2612,   1836,   4662,   7467,  7393,
+    5421,   4262,   4741,  5362,  4705,  3163,   2397,   3337,   4887,  4810,
+    2254,   -749,   -1316, 772,   2706,  2016,   -573,   -2552,  -2746, -2012,
+    -1647,  -1978,  -2579, -3105, -3473, -3911,  -4484,  -4891,  -4795, -4163,
+    -3543,  -3538,  -4275, -5356, -5743, -4637,  -2614,  -1301,  -1825, -3341,
+    -4011,  -2937,  -751,  1007,  1245,  235,    -639,   -61,    1626,  2864,
+    2967,   2734,   3013,  3329,  2914,  2312,   2666,   3839,   4308,  3162,
+    1453,   768,    1255,  1887,  2006,  1715,   1031,   -297,   -1660, -1690,
+    -277,   813,    -30,   -2137, -3370, -2854,  -1553,  -593,   -413,  -1146,
+    -2567,  -3440,  -2369, -205,  379,   -1258,  -2315,  -812,   262,   -3205,
+    -8576,  -7894,  738,   7492,  1951,  -11595, -17098, -6934,  7139,  8065,
+    -4575,  -14199, -8946, 3606,  7504,  -547,   -8242,  -5113,  4406,  8113,
+    2134,   -5040,  -4089, 4157,  10934, 10158,  4167,   -565,   -192,  4428,
+    9765,   12201,  9861,  4512,  1225,  3451,   8483,   10133,  6497,  2574,
+    3333,   6806,   6986,  2487,  -1214, 623,    5416,   6647,   2204,  -3289,
+    -4556,  -1565,  1544,  1525,  -1236, -4293,  -5695,  -5174,  -3995, -3403,
+    -3449,  -3750,  -4505, -6014, -7296, -6523,  -3849,  -2096,  -3288, -5722,
+    -6004,  -3581,  -1497, -1960, -3330, -2800,  -434,   964,    -111,  -1739,
+    -1136,  1736,   4151,  3736,  1274,  -451,   469,    3386,   5833,  5898,
+    3646,   1085,   272,   1743,  4061,  5108,   3837,   1490,   246,   967,
+    1866,   859,    -1069, -974,  1542,  2835,   47,     -4285,  -5068, -1567,
+    1781,   1223,   -1997, -4227, -3747, -1720,  41,     245,    -1228, -2972,
+    -2673,  22,     1980,  -930,  -7721, -11271, -5725,  4974,   8484,  -2007,
+    -16979, -19255, -4670, 11057, 9690,  -6417,  -17537, -10841, 4262,  9292,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..daaeb514a806d02b3f9e7e6d8ca0e6409a63f29b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was created from the PCM data in a WAV file held in v2 of the
+// Speech Commands test dataset, at the path:
+// speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav
+// The data was extracted starting at an offset of 8,000, which corresponds to
+// the 26th spectrogram slice. It's designed to be used to test the
+// preprocessing pipeline, to ensure that the expected spectrogram slice is
+// produced given this input.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_30MS_SAMPLE_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_30MS_SAMPLE_DATA_H_
+
+#include <cstdint>
+
+extern const int g_yes_30ms_sample_data_size;
+extern const int16_t g_yes_30ms_sample_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_30MS_SAMPLE_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2eb737fb8e1204a02f7ea4852016e85d03980bfd
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
@@ -0,0 +1,158 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h"
+
+/* File automatically created by
+ * tensorflow/examples/speech_commands/wav_to_features.py \
+ * --sample_rate=16000 \
+ * --clip_duration_ms=1000 \
+ * --window_size_ms=30 \
+ * --window_stride_ms=20 \
+ * --feature_bin_count=40 \
+ * --quantize \
+ * --preprocess="average" \
+ * --input_wav="speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav" \
+ * --output_c_file="yes_features_data.cc" \
+ */
+
+const int g_yes_f2e59fea_nohash_1_width = 43;
+const int g_yes_f2e59fea_nohash_1_height = 49;
+const unsigned char g_yes_f2e59fea_nohash_1_data[] = {
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  1,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  1,   1,  1,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  4,   5,   1,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  1,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   2,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    1,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   1,  19, 1,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   1,   0,  1,  3,   3,   1,  1,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   8,   89, 8,   0,   0,  0,  0,   0,   0,  0,  0,   4,  13,
+    1,  6,  23,  20,  6,   4,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  19, 177, 42, 1,
+    1,  0,  0,   0,   0,   2,  3,   119, 51, 5,  139, 92,  58, 58, 15,  2,  1,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   13, 165, 176, 3,  1,  1,   0,   0,  1,  1,   32, 214,
+    26, 19, 113, 103, 28,  22, 27,  3,   1,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  12,  55, 128,
+    27, 1,  1,   0,   1,   4,  2,   52,  93, 10, 28,  156, 10, 21, 21,  3,  3,
+    1,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  14,  99,  32, 65, 7,   1,   2,  2,  6,   13, 121,
+    36, 15, 11,  112, 125, 14, 5,   13,  4,  4,  2,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   24, 25,
+    32, 5,  1,   0,   0,   0,  1,   0,   7,  5,  1,   1,   3,  3,  0,   3,  3,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   13,  13, 5,  1,   0,   0,  0,  0,   0,  3,
+    4,  1,  0,   1,   2,   3,  1,   1,   1,  4,  8,   1,   2,  1,  3,   1,  1,
+    0,  1,  1,   3,   1,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  1,
+    8,  2,  1,   0,   0,   0,  0,   0,   1,  1,  0,   0,   1,  1,  2,   0,  2,
+    1,  0,  2,   0,   2,   2,  3,   1,   1,  0,  1,   1,   4,  5,  1,   0,  1,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  1,   1,   1,  0,  1,   2,   1,  0,  1,   3,  1,
+    1,  3,  1,   1,   6,   2,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  2,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  1,   1,   0,  1,  2,   6,   2,  4,  2,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  3,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  1,
+    0,  0,  1,   2,   1,   1,  2,   1,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  4,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   2,  1,  0,   0,   2,  3,  5,   2,  0,
+    1,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   1,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   1,   2,  2,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  1,  0,   0,   0,  0,  1,   2,  3,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   1,  1,   1,   1,  0,  0,   0,   1,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
+    0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..39a3bb914cc1986aa851ace0e39ce63ed1a93282
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
+
+extern const int g_yes_f2e59fea_nohash_1_width;
+extern const int g_yes_f2e59fea_nohash_1_height;
+extern const unsigned char g_yes_f2e59fea_nohash_1_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a34a2045a221e2eee8c51f23000e819b1638499
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h"
+
+const uint8_t g_yes_power_spectrum_data[g_yes_power_spectrum_data_size] = {
+    8, 89, 8, 0, 0, 0, 0, 0, 0, 0, 0, 4, 13, 1, 6, 23, 20, 6, 4, 0, 0, 0,
+    0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0,  0,  0, 0, 0, 0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c8c00ac1116dcbd7ad4aeda1828603e962c2001
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
@@ -0,0 +1,29 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was extracted from the larger feature data held in
+// no_features_data.cc and consists of the 26th spectrogram slice of 43 values.
+// This is the expected result of running the sample data in
+// yes_30ms_sample_data.cc through through the preprocessing pipeline.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
+
+#include <cstdint>
+
+constexpr int g_yes_power_spectrum_data_size = 43;
+extern const uint8_t g_yes_power_spectrum_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a54fd41760d58f2023e6b7b2aac72ac5f5e95ae3
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/BUILD
@@ -0,0 +1,107 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load(
+    "//tensorflow/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
+cc_library(
+    name = "micro_ops",
+    srcs = [
+        "depthwise_conv.cc",
+        "fully_connected.cc",
+        "softmax.cc",
+    ],
+    hdrs = [
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/kernels:op_macros",
+        "//tensorflow/lite/kernels:padding",
+        "//tensorflow/lite/kernels/internal:quantization_util",
+        "//tensorflow/lite/kernels/internal:reference_base",
+        "//tensorflow/lite/kernels/internal:tensor",
+    ],
+)
+
+cc_library(
+    name = "all_ops_resolver",
+    srcs = [
+        "all_ops_resolver.cc",
+    ],
+    hdrs = [
+        "all_ops_resolver.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":micro_ops",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "test_utils",
+    srcs = [
+    ],
+    hdrs = [
+        "test_utils.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "depthwise_conv_test",
+    srcs = [
+        "depthwise_conv_test.cc",
+    ],
+    deps = [
+        ":all_ops_resolver",
+        ":test_utils",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "fully_connected_test",
+    srcs = [
+        "fully_connected_test.cc",
+    ],
+    deps = [
+        ":all_ops_resolver",
+        ":test_utils",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "softmax_test",
+    srcs = [
+        "softmax_test.cc",
+    ],
+    deps = [
+        ":all_ops_resolver",
+        ":test_utils",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.cc b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc
similarity index 95%
rename from tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.cc
rename to tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc
index bd0a37badb8ab1e739fdee9c8be9c3f800e80e2e..b733949e45df9cb88eb394900c513d0f0763f9a1 100644
--- a/tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.cc
+++ b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc
@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9ba8c882624bfbbdd289c00794fc2e691fd6b3c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
+
+#include "tensorflow/lite/experimental/micro/compatibility.h"
+#include "tensorflow/lite/experimental/micro/micro_mutable_op_resolver.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+
+class AllOpsResolver : public MicroMutableOpResolver {
+ public:
+  AllOpsResolver();
+
+ private:
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_ALL_OPS_RESOLVER_H_
diff --git a/tensorflow/lite/experimental/micro/kernels/depthwise_conv.cc b/tensorflow/lite/experimental/micro/kernels/depthwise_conv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce821a94787796251ba4becc327e257352e0ad05
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/depthwise_conv.cc
@@ -0,0 +1,208 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/padding.h"
+
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace depthwise_conv {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kFilterTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+struct OpData {
+  TfLitePaddingValues padding;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+};
+
+TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteDepthwiseConvParams* params, int width,
+                             int height, int filter_width, int filter_height,
+                             int out_width, int out_height,
+                             const TfLiteType data_type, OpData* data) {
+  data->padding.height = ComputePadding(params->stride_height, 1, height,
+                                        filter_height, out_height);
+  data->padding.width =
+      ComputePadding(params->stride_width, 1, width, filter_width, out_width);
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  if (data_type != kTfLiteFloat32) {
+    const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+    const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+    const TfLiteTensor* bias =
+        GetOptionalInputTensor(context, node, kBiasTensor);
+    TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+    data->output_shift = -exponent;
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &data->output_activation_min,
+                                  &data->output_activation_max);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDepthwiseConvParams* params, OpData* data,
+               const TfLiteTensor* input, const TfLiteTensor* filter,
+               const TfLiteTensor* bias, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = 1;
+  op_params.dilation_height_factor = 1;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
+  tflite::reference_ops::DepthwiseConv(
+      op_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(filter), GetTensorData<float>(filter),
+      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+      GetTensorData<float>(output));
+}
+
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteDepthwiseConvParams* params, OpData* data,
+                   const TfLiteTensor* input, const TfLiteTensor* filter,
+                   const TfLiteTensor* bias, TfLiteTensor* output) {
+  const int32_t input_offset = -input->params.zero_point;
+  const int32_t filter_offset = -filter->params.zero_point;
+  const int32_t output_offset = output->params.zero_point;
+
+  tflite::DepthwiseParams op_params;
+  // Padding type is ignored, but still set.
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = 1;
+  op_params.dilation_height_factor = 1;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = -data->output_shift;
+
+  tflite::reference_ops::DepthwiseConv(
+      op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+      GetTensorShape(filter), GetTensorData<uint8_t>(filter),
+      GetTensorShape(bias), GetTensorData<int32_t>(bias),
+      GetTensorShape(output), GetTensorData<uint8_t>(output));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  const TfLiteTensor* bias =
+      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
+
+  const TfLiteType data_type = input->type;
+  int width = SizeOfDimension(input, 2);
+  int height = SizeOfDimension(input, 1);
+  int filter_width = SizeOfDimension(filter, 2);
+  int filter_height = SizeOfDimension(filter, 1);
+  int out_width = ComputeOutSize(params->padding, width, filter_width,
+                                 params->stride_width);
+  int out_height = ComputeOutSize(params->padding, height, filter_height,
+                                  params->stride_height);
+  OpData local_data_object;
+  OpData* data = &local_data_object;
+  TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height,
+                                        filter_width, filter_height, out_width,
+                                        out_height, data_type, data));
+
+  // TODO(aselle): Consider whether float conv and quantized conv should be
+  // separate ops to avoid dispatch overhead here.
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      EvalFloat(context, node, params, data, input, filter, bias, output);
+      break;
+    case kTfLiteUInt8:
+      EvalQuantized(context, node, params, data, input, filter, bias, output);
+      break;
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace depthwise_conv
+
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
+  static TfLiteRegistration r = {depthwise_conv::Init, depthwise_conv::Free,
+                                 depthwise_conv::Prepare, depthwise_conv::Eval};
+  return &r;
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f70437a4b943e6e71547e010a0fea9ab551194db
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
@@ -0,0 +1,406 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+void TestDepthwiseConvFloat(std::initializer_list<int> input_dims_data,
+                            std::initializer_list<float> input_data,
+                            std::initializer_list<int> filter_dims_data,
+                            std::initializer_list<float> filter_data,
+                            std::initializer_list<int> bias_dims_data,
+                            std::initializer_list<float> bias_data,
+                            std::initializer_list<float> expected_output_data,
+                            std::initializer_list<int> output_dims_data,
+                            TfLiteFusedActivation activation,
+                            float* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* filter_dims = IntArrayFromInitializer(filter_dims_data);
+  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 3;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateFloatTensor(input_data, input_dims, "input_tensor"),
+      CreateFloatTensor(filter_data, filter_dims, "filter_tensor"),
+      CreateFloatTensor(bias_data, bias_dims, "bias_tensor"),
+      CreateFloatTensor(output_data, output_dims, "output_tensor"),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  int input_depth = input_dims->data[3];
+  int output_depth = filter_dims->data[3];
+  int depth_mul = output_depth / input_depth;
+  TfLiteDepthwiseConvParams builtin_data = {
+      kTfLitePaddingValid, 1, 1, depth_mul, activation,
+  };
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+  int inputs_array_data[] = {3, 0, 1, 2};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 3};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
+                              1e-5f);
+  }
+}
+
+void TestDepthwiseConvQuantized(
+    std::initializer_list<int> input_dims_data,
+    std::initializer_list<uint8_t> input_data, float input_min, float input_max,
+    std::initializer_list<int> filter_dims_data,
+    std::initializer_list<uint8_t> filter_data, float filter_min,
+    float filter_max, std::initializer_list<int> bias_dims_data,
+    std::initializer_list<int32_t> bias_data, float bias_min, float bias_max,
+    std::initializer_list<uint8_t> expected_output_data,
+    std::initializer_list<int> output_dims_data, float output_min,
+    float output_max, TfLiteFusedActivation activation, uint8_t* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* filter_dims = IntArrayFromInitializer(filter_dims_data);
+  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 3;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
+                            input_max),
+      CreateQuantizedTensor(filter_data, filter_dims, "filter_tensor",
+                            filter_min, filter_max),
+      CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_min,
+                              bias_max),
+      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
+                            output_min, output_max),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  int input_depth = input_dims->data[3];
+  int output_depth = filter_dims->data[3];
+  int depth_mul = output_depth / input_depth;
+  TfLiteDepthwiseConvParams builtin_data = {
+      kTfLitePaddingValid, 1, 1, depth_mul, activation,
+  };
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+
+  int inputs_array_data[] = {3, 0, 1, 2};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 3};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]);
+  }
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(SimpleTest) {
+  const int output_dims_count = 8;
+  float output_data[output_dims_count];
+  tflite::testing::TestDepthwiseConvFloat(  //
+      {4, 1, 3, 2, 2},                      // Input shape.
+      {
+          1, 2, 7, 8,    // Input values.
+          3, 4, 9, 10,   //
+          5, 6, 11, 12,  //
+      },
+      {4, 1, 2, 2, 4},  // Filters shape.
+      {
+          1, 2, 3, 4,        // Filters values.
+          -9, 10, -11, 12,   //
+          5, 6, 7, 8,        //
+          13, -14, 15, -16,  //
+      },
+      {1, 4},  // Bias shape.
+      {
+          1, 2, 3, 4,  // Bias values.
+      },
+      {
+          71, -34, 99, -20,  // Expected results.
+          91, -26, 127, -4,  //
+      },
+      {4, 1, 2, 1, 4},  // Output shape.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantized) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float filter_min = -63.5f;
+  const float filter_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 8;
+  uint8_t output_data[output_dims_count];
+
+  tflite::testing::TestDepthwiseConvQuantized(  //
+      {4, 1, 3, 2, 2},                          // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),
+          F2Q(2, input_min, input_max),
+          F2Q(7, input_min, input_max),
+          F2Q(8, input_min, input_max),
+          F2Q(3, input_min, input_max),
+          F2Q(4, input_min, input_max),
+          F2Q(9, input_min, input_max),
+          F2Q(10, input_min, input_max),
+          F2Q(5, input_min, input_max),
+          F2Q(6, input_min, input_max),
+          F2Q(11, input_min, input_max),
+          F2Q(12, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {4, 1, 2, 2, 4},       // Filter shape.
+      {
+          // Filter values.
+          F2Q(1, filter_min, filter_max),
+          F2Q(2, filter_min, filter_max),
+          F2Q(3, filter_min, filter_max),
+          F2Q(4, filter_min, filter_max),
+          F2Q(-9, filter_min, filter_max),
+          F2Q(10, filter_min, filter_max),
+          F2Q(-11, filter_min, filter_max),
+          F2Q(12, filter_min, filter_max),
+          F2Q(5, filter_min, filter_max),
+          F2Q(6, filter_min, filter_max),
+          F2Q(7, filter_min, filter_max),
+          F2Q(8, filter_min, filter_max),
+          F2Q(13, filter_min, filter_max),
+          F2Q(-14, filter_min, filter_max),
+          F2Q(15, filter_min, filter_max),
+          F2Q(-16, filter_min, filter_max),
+      },
+      filter_min, filter_max,  // Filter quantization range.
+      {1, 4},                  // Bias shape.
+      {
+          // Bias values.
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+          F2Q32(4, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(71, output_min, output_max),
+          F2Q(-34, output_min, output_max),
+          F2Q(99, output_min, output_max),
+          F2Q(-20, output_min, output_max),
+          F2Q(91, output_min, output_max),
+          F2Q(-26, output_min, output_max),
+          F2Q(127, output_min, output_max),
+          F2Q(-4, output_min, output_max),
+      },
+      {4, 1, 2, 1, 4},         // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestRelu) {
+  const int output_dims_count = 8;
+  float output_data[output_dims_count];
+  tflite::testing::TestDepthwiseConvFloat(  //
+      {4, 1, 3, 2, 2},                      // Input shape.
+      {
+          1, 2, 7, 8,    // Input values.
+          3, 4, 9, 10,   //
+          5, 6, 11, 12,  //
+      },
+      {4, 1, 2, 2, 4},  // Filters shape.
+      {
+          1, 2, 3, 4,        // Filters values.
+          -9, 10, -11, 12,   //
+          5, 6, 7, 8,        //
+          13, -14, 15, -16,  //
+      },
+      {1, 4},  // Bias shape.
+      {
+          1, 2, 3, 4,  // Bias values.
+      },
+      {
+          71, 0, 99, 0,   // Expected results.
+          91, 0, 127, 0,  //
+      },
+      {4, 1, 2, 1, 4},  // Output shape.
+      kTfLiteActRelu, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestReluQuantized) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float filter_min = -63.5f;
+  const float filter_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 8;
+  uint8_t output_data[output_dims_count];
+
+  tflite::testing::TestDepthwiseConvQuantized(  //
+      {4, 1, 3, 2, 2},                          // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),
+          F2Q(2, input_min, input_max),
+          F2Q(7, input_min, input_max),
+          F2Q(8, input_min, input_max),
+          F2Q(3, input_min, input_max),
+          F2Q(4, input_min, input_max),
+          F2Q(9, input_min, input_max),
+          F2Q(10, input_min, input_max),
+          F2Q(5, input_min, input_max),
+          F2Q(6, input_min, input_max),
+          F2Q(11, input_min, input_max),
+          F2Q(12, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {4, 1, 2, 2, 4},       // Filter shape.
+      {
+          // Filter values.
+          F2Q(1, filter_min, filter_max),
+          F2Q(2, filter_min, filter_max),
+          F2Q(3, filter_min, filter_max),
+          F2Q(4, filter_min, filter_max),
+          F2Q(-9, filter_min, filter_max),
+          F2Q(10, filter_min, filter_max),
+          F2Q(-11, filter_min, filter_max),
+          F2Q(12, filter_min, filter_max),
+          F2Q(5, filter_min, filter_max),
+          F2Q(6, filter_min, filter_max),
+          F2Q(7, filter_min, filter_max),
+          F2Q(8, filter_min, filter_max),
+          F2Q(13, filter_min, filter_max),
+          F2Q(-14, filter_min, filter_max),
+          F2Q(15, filter_min, filter_max),
+          F2Q(-16, filter_min, filter_max),
+      },
+      filter_min, filter_max,  // Filter quantization range.
+      {1, 4},                  // Bias shape.
+      {
+          // Bias values.
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+          F2Q32(4, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(71, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(99, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(91, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(127, output_min, output_max),
+          F2Q(0, output_min, output_max),
+      },
+      {4, 1, 2, 1, 4},         // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActRelu, output_data);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/kernels/fully_connected.cc b/tensorflow/lite/experimental/micro/kernels/fully_connected.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a344c4ffbeded910c26bf38ec2f6f58334592cda
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/fully_connected.cc
@@ -0,0 +1,184 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace micro {
+namespace fully_connected {
+namespace {
+
+struct OpData {
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  // The index of the temporary tensor where the quantized inputs are cached.
+  int input_quantized_index;
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus CalculateOpData(TfLiteContext* context,
+                             TfLiteFullyConnectedParams* params,
+                             TfLiteType data_type, const TfLiteTensor* input,
+                             const TfLiteTensor* filter,
+                             const TfLiteTensor* bias, TfLiteTensor* output,
+                             OpData* data) {
+  TfLiteStatus status = kTfLiteOk;
+  if (data_type != kTfLiteFloat32) {
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+    data->output_shift = -exponent;
+    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
+        context, params->activation, output, &data->output_activation_min,
+        &data->output_activation_max));
+  }
+  return status;
+}
+
+}  // namespace
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                           TfLiteFullyConnectedParams* params, OpData* data,
+                           const TfLiteTensor* input,
+                           const TfLiteTensor* filter, const TfLiteTensor* bias,
+                           TfLiteTensor* output) {
+  const int32_t input_offset = -input->params.zero_point;
+  const int32_t filter_offset = -filter->params.zero_point;
+  const int32_t output_offset = output->params.zero_point;
+
+  tflite::FullyConnectedParams op_params;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  // Legacy ops used mixed left and right shifts. Now all are +ve-means-left.
+  op_params.output_shift = -data->output_shift;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+
+#define TF_LITE_FULLY_CONNECTED(output_data_type)                      \
+  reference_ops::FullyConnected(                                       \
+      op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
+      GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
+      GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
+      GetTensorShape(output), GetTensorData<output_data_type>(output), \
+      nullptr)
+  switch (output->type) {
+    case kTfLiteUInt8:
+      TF_LITE_FULLY_CONNECTED(uint8_t);
+      break;
+    case kTfLiteInt16:
+      TF_LITE_FULLY_CONNECTED(int16_t);
+      break;
+    default:
+      context->ReportError(
+          context,
+          "Quantized FullyConnected expects output data type uint8 or int16");
+      return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
+                       TfLiteFullyConnectedParams* params, OpData* data,
+                       const TfLiteTensor* input, const TfLiteTensor* filter,
+                       const TfLiteTensor* bias, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+  tflite::FullyConnectedParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+  tflite::reference_ops::FullyConnected(
+      op_params, GetTensorShape(input), GetTensorData<float>(input),
+      GetTensorShape(filter), GetTensorData<float>(filter),
+      GetTensorShape(bias), GetTensorData<float>(bias), GetTensorShape(output),
+      GetTensorData<float>(output));
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TfLiteType data_type = input->type;
+  OpData local_data_object;
+  OpData* data = &local_data_object;
+  TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input,
+                                        filter, bias, output, data));
+
+  switch (filter->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      return EvalFloat(context, node, params, data, input, filter, bias,
+                       output);
+    case kTfLiteUInt8:
+      return EvalQuantized(context, node, params, data, input, filter, bias,
+                           output);
+
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           filter->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace fully_connected
+
+TfLiteRegistration* Register_FULLY_CONNECTED() {
+  static TfLiteRegistration r = {fully_connected::Init, fully_connected::Free,
+                                 fully_connected::Prepare,
+                                 fully_connected::Eval};
+  return &r;
+}
+
+}  // namespace micro
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc b/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..300f8aaf78ad38a2cd4a7c715cf63315a0b2e751
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
@@ -0,0 +1,643 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+void TestFullyConnectedFloat(std::initializer_list<int> input_dims_data,
+                             std::initializer_list<float> input_data,
+                             std::initializer_list<int> weights_dims_data,
+                             std::initializer_list<float> weights_data,
+                             std::initializer_list<int> bias_dims_data,
+                             std::initializer_list<float> bias_data,
+                             std::initializer_list<float> expected_output_data,
+                             std::initializer_list<int> output_dims_data,
+                             TfLiteFusedActivation activation,
+                             float* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* weights_dims = IntArrayFromInitializer(weights_dims_data);
+  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 3;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateFloatTensor(input_data, input_dims, "input_tensor"),
+      CreateFloatTensor(weights_data, weights_dims, "weights_tensor"),
+      CreateFloatTensor(bias_data, bias_dims, "bias_tensor"),
+      CreateFloatTensor(output_data, output_dims, "output_tensor"),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  TfLiteFullyConnectedParams builtin_data = {
+      activation,
+      kTfLiteFullyConnectedWeightsFormatDefault,
+  };
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+  int inputs_array_data[] = {3, 0, 1, 2};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 3};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
+                              1e-5f);
+  }
+}
+
+void TestFullyConnectedQuantized(
+    std::initializer_list<int> input_dims_data,
+    std::initializer_list<uint8_t> input_data, float input_min, float input_max,
+    std::initializer_list<int> weights_dims_data,
+    std::initializer_list<uint8_t> weights_data, float weights_min,
+    float weights_max, std::initializer_list<int> bias_dims_data,
+    std::initializer_list<int32_t> bias_data, float bias_min, float bias_max,
+    std::initializer_list<uint8_t> expected_output_data,
+    std::initializer_list<int> output_dims_data, float output_min,
+    float output_max, TfLiteFusedActivation activation, uint8_t* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* weights_dims = IntArrayFromInitializer(weights_dims_data);
+  TfLiteIntArray* bias_dims = IntArrayFromInitializer(bias_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 3;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
+                            input_max),
+      CreateQuantizedTensor(weights_data, weights_dims, "weights_tensor",
+                            weights_min, weights_max),
+      CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_min,
+                              bias_max),
+      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
+                            output_min, output_max),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  TfLiteFullyConnectedParams builtin_data = {
+      activation,
+      kTfLiteFullyConnectedWeightsFormatDefault,
+  };
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+
+  int inputs_array_data[] = {3, 0, 1, 2};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 3};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]);
+  }
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(SimpleTest) {
+  const int output_dims_count = 6;
+  float output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedFloat(  //
+      {2, 2, 10},                            // Input shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+          1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+      },
+      {2, 3, 10},  // Weights shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+      },
+      {1, 3},  // Bias shape.
+      {
+          1, 2, 3,  // Bias values.
+      },
+      {
+          24, 25, 26, 58, 59, 60,  // Expected results.
+      },
+      {2, 2, 3},  // Output shape.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTest2) {
+  const int output_dims_count = 6;
+  float output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedFloat(  //
+      {2, 2, 2},                             // Input shape.
+      {
+          1, 2,  // b = 0
+          2, 1,  // b = 1
+      },
+      {2, 1, 2},  // Weights shape.
+      {
+          2, 4,  // u = 0
+      },
+      {1, 1},  // Bias shape.
+      {
+          1,  // Bias values.
+      },
+      {
+          11, 9,  // Expected results.
+      },
+      {2, 2, 1},  // Output shape.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestRelu) {
+  const int output_dims_count = 6;
+  float output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedFloat(  //
+      {2, 2, 10},                            // Input shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+          1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+      },
+      {2, 3, 10},  // Weights shape.
+      {
+          1,  2,  3,  4,  5,  6,  7,  8,  9,  10,   // u = 0
+          -1, -2, -3, -4, -5, -6, -7, -8, -9, -10,  // u = 1
+          1,  2,  3,  4,  5,  6,  7,  8,  9,  10,   // u = 2
+      },
+      {1, 3},  // Bias shape.
+      {
+          1, -2, 3,  // Bias values.
+      },
+      {
+          24, 0, 26, 58, 0, 60,  // Expected results.
+      },
+      {2, 2, 3},  // Output shape.
+      kTfLiteActRelu, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantized) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float weights_min = -63.5f;
+  const float weights_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {2, 2, 10},                                // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(25, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(59, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantizedRelu) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float weights_min = -63.5f;
+  const float weights_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {2, 2, 10},                                // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max),  F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max),  F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max),  F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max),  F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max),  F2Q(10, weights_min, weights_max),
+          F2Q(-1, weights_min, weights_max), F2Q(-2, weights_min, weights_max),
+          F2Q(-3, weights_min, weights_max), F2Q(-4, weights_min, weights_max),
+          F2Q(-5, weights_min, weights_max), F2Q(-6, weights_min, weights_max),
+          F2Q(-7, weights_min, weights_max), F2Q(-8, weights_min, weights_max),
+          F2Q(-9, weights_min, weights_max), F2Q(-10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max),  F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max),  F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max),  F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max),  F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max),  F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(0, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(0, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActRelu, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantizedOutputMultiplierGreaterThan1) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -127.0f;
+  const float input_max = 128.0f;
+  const float weights_min = -127.0f;
+  const float weights_max = 128.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 256.0f * (1 << 24);
+  const float output_min = -63.5f;
+  const float output_max = 64.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {2, 2, 10},                                // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(25, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(59, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTest4DInput) {
+  const int output_dims_count = 6;
+  float output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedFloat(  //
+      {4, 1, 1, 5, 1},                       // Input shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+          1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+      },
+      {2, 3, 10},  // Weights shape.
+      {
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+          1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+      },
+      {1, 3},  // Bias shape.
+      {
+          1, 2, 3,  // Bias values.
+      },
+      {
+          24, 25, 26, 58, 59, 60,  // Expected results.
+      },
+      {2, 2, 3},  // Output shape.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTest4DInputQuantized) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float weights_min = -63.5f;
+  const float weights_max = 64.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 64.0f * (1 << 24);
+  const float output_min = -127.0f;
+  const float output_max = 128.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {4, 1, 1, 5, 1},                           // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(25, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(59, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedOutputMultiplierGreaterThan1) {
+  using tflite::testing::F2Q;
+  using tflite::testing::F2Q32;
+
+  const float input_min = -127.0f;
+  const float input_max = 128.0f;
+  const float weights_min = -127.0f;
+  const float weights_max = 128.0f;
+  const float bias_min = 0.0f;
+  const float bias_max = 256.0f * (1 << 24);
+  const float output_min = -63.5f;
+  const float output_max = 64.0f;
+  const int output_dims_count = 6;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestFullyConnectedQuantized(  //
+      {4, 1, 1, 5, 1},                           // Input shape.
+      {
+          // Input values.
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(8, input_min, input_max),
+          F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max),
+          F2Q(1, input_min, input_max),  F2Q(2, input_min, input_max),
+          F2Q(3, input_min, input_max),  F2Q(4, input_min, input_max),
+          F2Q(5, input_min, input_max),  F2Q(6, input_min, input_max),
+          F2Q(7, input_min, input_max),  F2Q(-8, input_min, input_max),
+          F2Q(9, input_min, input_max),  F2Q(-10, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantization range.
+      {2, 3, 10},            // Weights shape.
+      {
+          // Weight values.
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+          F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max),
+          F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max),
+          F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max),
+          F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max),
+          F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max),
+      },
+      weights_min, weights_max,  // Weights quantization range.
+      {1, 3},                    // Bias shape.
+      {
+          F2Q32(1, bias_min, bias_max),
+          F2Q32(2, bias_min, bias_max),
+          F2Q32(3, bias_min, bias_max),
+      },
+      bias_min, bias_max,  // Bias quantization range.
+      {
+          // Expected results.
+          F2Q(24, output_min, output_max),
+          F2Q(25, output_min, output_max),
+          F2Q(26, output_min, output_max),
+          F2Q(58, output_min, output_max),
+          F2Q(59, output_min, output_max),
+          F2Q(60, output_min, output_max),
+      },
+      {2, 2, 3},               // Output shape.
+      output_min, output_max,  // Output quantization range.
+      kTfLiteActNone, output_data);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/softmax.cc b/tensorflow/lite/experimental/micro/kernels/softmax.cc
similarity index 93%
rename from tensorflow/contrib/lite/experimental/micro/kernels/softmax.cc
rename to tensorflow/lite/experimental/micro/kernels/softmax.cc
index a4019a067c563cac25d9918e4bdf75913bdfa3d6..6d2d8b470fcad5c48506206687bd90305c4b3d2e 100644
--- a/tensorflow/contrib/lite/experimental/micro/kernels/softmax.cc
+++ b/tensorflow/lite/experimental/micro/kernels/softmax.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/kernels/internal/reference/softmax.h"
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/experimental/micro/kernels/softmax_test.cc b/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7253b3be8ce20ff6d30ca725060da606c416c8e1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
@@ -0,0 +1,220 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+void TestSoftmaxFloat(std::initializer_list<int> input_dims_data,
+                      std::initializer_list<float> input_data,
+                      std::initializer_list<float> expected_output_data,
+                      std::initializer_list<int> output_dims_data,
+                      float* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 2;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateFloatTensor(input_data, input_dims, "input_tensor"),
+      CreateFloatTensor(output_data, output_dims, "output_tensor"),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_SOFTMAX, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  TfLiteSoftmaxParams builtin_data = {1.0f};
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+  int inputs_array_data[] = {1, 0};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 1};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i],
+                              1e-5f);
+  }
+}
+
+void TestSoftmaxQuantized(std::initializer_list<int> input_dims_data,
+                          std::initializer_list<uint8_t> input_data,
+                          float input_min, float input_max,
+                          std::initializer_list<uint8_t> expected_output_data,
+                          std::initializer_list<int> output_dims_data,
+                          float output_min, float output_max,
+                          uint8_t* output_data) {
+  TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data);
+  TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data);
+  const int output_dims_count = ElementCount(*output_dims);
+
+  constexpr int inputs_size = 1;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min,
+                            input_max),
+      CreateQuantizedTensor(output_data, output_dims, "output_tensor",
+                            output_min, output_max),
+  };
+
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_SOFTMAX, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  TfLiteSoftmaxParams builtin_data = {1.0f};
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+
+  int inputs_array_data[] = {1, 0};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 1};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node));
+  if (registration->free) {
+    registration->free(&context, user_data);
+  }
+  for (int i = 0; i < output_dims_count; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]);
+  }
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(SimpleTest) {
+  const int output_dims_count = 10;
+  float output_data[output_dims_count];
+  tflite::testing::TestSoftmaxFloat(  //
+      {2, 2, 5},                      // Input shape.
+      {
+          1.0, 2.0, 3.0, 4.0, 5.0,       // b = 0
+          -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 0
+      },
+      {
+          // Expected results.
+          0.011656231,
+          0.031684921,
+          0.086128544,
+          0.234121657,
+          0.636408647,
+          0.636408647,
+          0.234121657,
+          0.086128544,
+          0.031684921,
+          0.011656231,
+      },
+      {2, 2, 5},  // Output shape.
+      output_data);
+}
+
+TF_LITE_MICRO_TEST(SimpleTestQuantized) {
+  using tflite::testing::F2Q;
+
+  const float input_min = -63.5f;
+  const float input_max = 64.0f;
+  const float output_min = 0.0f;
+  const float output_max = (255.0f / 256.0f);
+  const int output_dims_count = 5;
+  uint8_t output_data[output_dims_count];
+  tflite::testing::TestSoftmaxQuantized(  //
+      {2, 1, 5},                          // Input shape.
+      {
+          F2Q(1.0, input_min, input_max),
+          F2Q(2.0, input_min, input_max),
+          F2Q(3.0, input_min, input_max),
+          F2Q(4.0, input_min, input_max),
+          F2Q(5.0, input_min, input_max),
+      },
+      input_min, input_max,  // Input quantized range.
+      {
+          // Expected results.
+          F2Q(0.011656231, output_min, output_max),
+          F2Q(0.031684921, output_min, output_max),
+          F2Q(0.086128544, output_min, output_max),
+          F2Q(0.234121657, output_min, output_max),
+          F2Q(0.636408647, output_min, output_max),
+      },
+      {2, 1, 5},               // Output shape.
+      output_min, output_max,  // Output quantized range.
+      output_data);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h b/tensorflow/lite/experimental/micro/kernels/test_utils.h
similarity index 90%
rename from tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h
rename to tensorflow/lite/experimental/micro/kernels/test_utils.h
index 789a48ece8bd68544649fb05548355cb796ccabb..95f2d8a9d217a1b1f23c0198ddce5156e1c6cb36 100644
--- a/tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h
+++ b/tensorflow/lite/experimental/micro/kernels/test_utils.h
@@ -12,18 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
 
 #include <cstdarg>
 #include <initializer_list>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/experimental/micro/kernels/test_utils.h"
-#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace tflite {
 namespace testing {
@@ -89,7 +89,7 @@ inline void PopulateContext(TfLiteTensor* tensors, int tensors_size,
   context->ReportError = ReportOpError;
   context->AddTensors = nullptr;
   context->GetNodeAndRegistration = nullptr;
-  context->ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context->ReplaceNodeSubsetsWithDelegateKernels = nullptr;
   context->recommended_num_threads = 1;
   context->GetExternalContext = nullptr;
   context->SetExternalContext = nullptr;
@@ -167,4 +167,4 @@ inline TfLiteTensor CreateQuantized32Tensor(std::initializer_list<int32_t> data,
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.cc b/tensorflow/lite/experimental/micro/micro_error_reporter.cc
similarity index 78%
rename from tensorflow/contrib/lite/experimental/micro/micro_error_reporter.cc
rename to tensorflow/lite/experimental/micro/micro_error_reporter.cc
index 99dd8836611c287b7f76104c29c12a73d219ccb3..6bfe541f8063068205bee4b6d662f446ea0a853f 100644
--- a/tensorflow/contrib/lite/experimental/micro/micro_error_reporter.cc
+++ b/tensorflow/lite/experimental/micro/micro_error_reporter.cc
@@ -13,19 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h"
-
-#ifdef TF_LITE_MCU_DEBUG_LOG
-#include <debug_log.h>
-#else  // TF_LITE_MCU_DEBUG_LOG
-#include <cstdint>
-#include <cstdio>
-void DebugLog(const char* s) { fprintf(stderr, "%s", s); }
-void DebugLogInt32(int32_t i) { fprintf(stderr, "%d", i); }
-void DebugLogUInt32(uint32_t i) { fprintf(stderr, "%d", i); }
-void DebugLogHex(uint32_t i) { fprintf(stderr, "0x%8x", i); }
-void DebugLogFloat(float i) { fprintf(stderr, "%f", i); }
-#endif  // TF_LITE_MCU_DEBUG_LOG
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/experimental/micro/micro_error_reporter.h b/tensorflow/lite/experimental/micro/micro_error_reporter.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ab853ec2ac915a8eb3da87eb8b86f2ecec697c7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/micro_error_reporter.h
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_MICRO_ERROR_REPORTER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_MICRO_ERROR_REPORTER_H_
+
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/experimental/micro/compatibility.h"
+
+#ifdef TF_LITE_MCU_DEBUG_LOG
+// These functions should be supplied by the micro target library
+extern "C" {
+#include <stdint.h>
+void DebugLog(const char* s);
+void DebugLogInt32(int32_t i);
+void DebugLogUInt32(uint32_t i);
+void DebugLogHex(uint32_t i);
+void DebugLogFloat(float i);
+}
+#else  // TF_LITE_MCU_DEBUG_LOG
+#include <cstdint>
+#include <cstdio>
+static void inline DebugLog(const char* s) { fprintf(stderr, "%s", s); }
+static void inline DebugLogInt32(int32_t i) { fprintf(stderr, "%d", i); }
+static void inline DebugLogUInt32(uint32_t i) { fprintf(stderr, "%d", i); }
+static void inline DebugLogHex(uint32_t i) { fprintf(stderr, "0x%8x", i); }
+static void inline DebugLogFloat(float i) { fprintf(stderr, "%f", i); }
+#endif  // TF_LITE_MCU_DEBUG_LOG
+
+namespace tflite {
+
+class MicroErrorReporter : public ErrorReporter {
+ public:
+  ~MicroErrorReporter() {}
+  int Report(const char* format, va_list args) override;
+
+ private:
+  TF_LITE_REMOVE_VIRTUAL_DELETE
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_MICRO_ERROR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_error_reporter_test.cc b/tensorflow/lite/experimental/micro/micro_error_reporter_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/experimental/micro/micro_error_reporter_test.cc
rename to tensorflow/lite/experimental/micro/micro_error_reporter_test.cc
index ef3c32050c0e826c005f185553974170da7e486a..ca89de9739fe1dd696dc3a2a9db269d9bba48a26 100644
--- a/tensorflow/contrib/lite/experimental/micro/micro_error_reporter_test.cc
+++ b/tensorflow/lite/experimental/micro/micro_error_reporter_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 
 int main(int argc, char** argv) {
   tflite::MicroErrorReporter micro_error_reporter;
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc
similarity index 94%
rename from tensorflow/contrib/lite/experimental/micro/micro_interpreter.cc
rename to tensorflow/lite/experimental/micro/micro_interpreter.cc
index 0f38991bb0ef3d0134b4d9a1eb6e148a140fe6f9..f1c236fb62f002fc17b06852d09c8675f4ccb755 100644
--- a/tensorflow/contrib/lite/experimental/micro/micro_interpreter.cc
+++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
 
-#include "tensorflow/contrib/lite/core/api/flatbuffer_conversions.h"
-#include "tensorflow/contrib/lite/experimental/micro/compatibility.h"
+#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
+#include "tensorflow/lite/experimental/micro/compatibility.h"
 
 namespace tflite {
 namespace {
@@ -82,7 +82,7 @@ MicroInterpreter::MicroInterpreter(const Model* model,
   context_.tensors_size = tensors_->Length();
   context_.tensors =
       reinterpret_cast<TfLiteTensor*>(tensor_allocator_->AllocateMemory(
-          sizeof(TfLiteTensor) * context_.tensors_size));
+          sizeof(TfLiteTensor) * context_.tensors_size, 4));
   for (int i = 0; i < subgraph_->inputs()->Length(); ++i) {
     const int tensor_index = subgraph_->inputs()->Get(i);
     const auto* tensor = tensors_->Get(tensor_index);
@@ -94,10 +94,10 @@ MicroInterpreter::MicroInterpreter(const Model* model,
     }
   }
 
-  int* first_created = reinterpret_cast<int*>(
-      tensor_allocator_->AllocateMemory(sizeof(int) * tensors_->Length()));
-  int* last_used = reinterpret_cast<int*>(
-      tensor_allocator_->AllocateMemory(sizeof(int) * tensors_->Length()));
+  int* first_created = reinterpret_cast<int*>(tensor_allocator_->AllocateMemory(
+      sizeof(int) * tensors_->Length(), sizeof(int)));
+  int* last_used = reinterpret_cast<int*>(tensor_allocator_->AllocateMemory(
+      sizeof(int) * tensors_->Length(), sizeof(int)));
   for (int i = 0; i < tensors_->Length(); ++i) {
     first_created[i] = -1;
     last_used[i] = -1;
@@ -149,7 +149,7 @@ MicroInterpreter::MicroInterpreter(const Model* model,
   context_.ReportError = ReportOpError;
   context_.AddTensors = nullptr;
   context_.GetNodeAndRegistration = nullptr;
-  context_.ReplaceSubgraphsWithDelegateKernels = nullptr;
+  context_.ReplaceNodeSubsetsWithDelegateKernels = nullptr;
   context_.recommended_num_threads = 1;
   context_.GetExternalContext = nullptr;
   context_.SetExternalContext = nullptr;
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_interpreter.h b/tensorflow/lite/experimental/micro/micro_interpreter.h
similarity index 81%
rename from tensorflow/contrib/lite/experimental/micro/micro_interpreter.h
rename to tensorflow/lite/experimental/micro/micro_interpreter.h
index a88514cde849595244d36a31900e6d1c2ae1714b..6450dcce96204b6cf8df21c5f7f77b57c3a0c5fa 100644
--- a/tensorflow/contrib/lite/experimental/micro/micro_interpreter.h
+++ b/tensorflow/lite/experimental/micro/micro_interpreter.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_INTERPRETER_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_INTERPRETER_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_MICRO_INTERPRETER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_MICRO_INTERPRETER_H_
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/core/api/op_resolver.h"
-#include "tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 
@@ -68,4 +68,4 @@ class MicroInterpreter {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_INTERPRETER_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_MICRO_INTERPRETER_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_interpreter_test.cc b/tensorflow/lite/experimental/micro/micro_interpreter_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/experimental/micro/micro_interpreter_test.cc
rename to tensorflow/lite/experimental/micro/micro_interpreter_test.cc
index 251e5f72037717f74bc3472b69144cff299f0668..0c0c71f0792dd3510556b13046a0b3b5e6acc9d8 100644
--- a/tensorflow/contrib/lite/experimental/micro/micro_interpreter_test.cc
+++ b/tensorflow/lite/experimental/micro/micro_interpreter_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
 
-#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.cc b/tensorflow/lite/experimental/micro/micro_mutable_op_resolver.cc
similarity index 97%
rename from tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.cc
rename to tensorflow/lite/experimental/micro/micro_mutable_op_resolver.cc
index 40c21c6448c39f27c12e95ae36038510cb346362..1e8b5c0e573bd92468befe33378c33d405f73141 100644
--- a/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.cc
+++ b/tensorflow/lite/experimental/micro/micro_mutable_op_resolver.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h"
+#include "tensorflow/lite/experimental/micro/micro_mutable_op_resolver.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h b/tensorflow/lite/experimental/micro/micro_mutable_op_resolver.h
similarity index 79%
rename from tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h
rename to tensorflow/lite/experimental/micro/micro_mutable_op_resolver.h
index f3750a248416cc7244e0dea82be167562fd59ee7..f613203909e2d4339295036242ab018182a5d510 100644
--- a/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h
+++ b/tensorflow/lite/experimental/micro/micro_mutable_op_resolver.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
 
-#include "tensorflow/contrib/lite/core/api/op_resolver.h"
-#include "tensorflow/contrib/lite/experimental/micro/compatibility.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/experimental/micro/compatibility.h"
 
 #ifndef TFLITE_REGISTRATIONS_MAX
 #define TFLITE_REGISTRATIONS_MAX (128)
@@ -43,4 +43,4 @@ class MicroMutableOpResolver : public OpResolver {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver_test.cc b/tensorflow/lite/experimental/micro/micro_mutable_op_resolver_test.cc
similarity index 94%
rename from tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver_test.cc
rename to tensorflow/lite/experimental/micro/micro_mutable_op_resolver_test.cc
index 5420a33e8778d93d5aad2150438fdba80df372b8..f551830865dd935ff4cb030ff3b652a0155b989e 100644
--- a/tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver_test.cc
+++ b/tensorflow/lite/experimental/micro/micro_mutable_op_resolver_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/micro/micro_mutable_op_resolver.h"
+#include "tensorflow/lite/experimental/micro/micro_mutable_op_resolver.h"
 
-#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6ce14edea53decd40d8449e9c160725c361d5ced
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc
@@ -0,0 +1,162 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
+
+#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
+
+namespace tflite {
+namespace {
+
+TfLiteStatus TfLiteTypeSizeOf(TfLiteType type, size_t* size,
+                              ErrorReporter* reporter) {
+  switch (type) {
+    case kTfLiteFloat32:
+      *size = sizeof(float);
+      break;
+    case kTfLiteInt16:
+      *size = sizeof(int16_t);
+      break;
+    case kTfLiteInt32:
+      *size = sizeof(int32_t);
+      break;
+    case kTfLiteUInt8:
+      *size = sizeof(uint8_t);
+      break;
+    case kTfLiteInt64:
+      *size = sizeof(int64_t);
+      break;
+    case kTfLiteBool:
+      *size = sizeof(bool);
+      break;
+    case kTfLiteComplex64:
+      *size = sizeof(float) * 2;
+      break;
+    default:
+      reporter->Report(
+          "Only float32, int16, int32, int64, uint8, bool, complex64 "
+          "supported currently.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus BytesRequired(const tflite::Tensor& flatbuffer_tensor,
+                           size_t dims_size, size_t* bytes, size_t* type_size,
+                           ErrorReporter* error_reporter) {
+  TfLiteType tf_lite_type;
+  TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(),
+                                          &tf_lite_type, error_reporter));
+  TF_LITE_ENSURE_STATUS(
+      TfLiteTypeSizeOf(tf_lite_type, type_size, error_reporter));
+  *bytes = dims_size * (*type_size);
+  return kTfLiteOk;
+}
+
+uint8_t* AlignPointerRoundUp(uint8_t* data, size_t alignment) {
+  size_t data_as_size_t = reinterpret_cast<size_t>(data);
+  uint8_t* aligned_result = reinterpret_cast<uint8_t*>(
+      ((data_as_size_t + (alignment - 1)) / alignment) * alignment);
+  return aligned_result;
+}
+
+}  // namespace
+
+TfLiteStatus SimpleTensorAllocator::AllocateTensor(
+    const tflite::Tensor& flatbuffer_tensor, int create_before,
+    int destroy_after,
+    const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
+    ErrorReporter* error_reporter, TfLiteTensor* result) {
+  TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(),
+                                          &result->type, error_reporter));
+  result->is_variable = flatbuffer_tensor.is_variable();
+
+  result->data.raw = nullptr;
+  result->bytes = 0;
+  if (auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]) {
+    if (auto* array = buffer->data()) {
+      if (size_t array_size = array->size()) {
+        result->data.raw =
+            const_cast<char*>(reinterpret_cast<const char*>(array->data()));
+        size_t type_size;
+        TF_LITE_ENSURE_STATUS(BytesRequired(flatbuffer_tensor, array_size,
+                                            &result->bytes, &type_size,
+                                            error_reporter));
+      }
+    }
+  }
+  if (result->data.raw) {
+    result->allocation_type = kTfLiteMmapRo;
+  } else {
+    int data_size = 1;
+    for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) {
+      data_size *= flatbuffer_tensor.shape()->Get(n);
+    }
+    size_t type_size;
+    TF_LITE_ENSURE_STATUS(BytesRequired(flatbuffer_tensor, data_size,
+                                        &result->bytes, &type_size,
+                                        error_reporter));
+    result->data.raw =
+        reinterpret_cast<char*>(AllocateMemory(result->bytes, type_size));
+    if (result->data.raw == nullptr) {
+      const char* tensor_name = flatbuffer_tensor.name()->c_str();
+      if (tensor_name == nullptr) {
+        tensor_name = "<None>";
+      }
+      error_reporter->Report(
+          "Couldn't allocate memory for tensor '%s', wanted %d bytes but only "
+          "%d were available",
+          tensor_name, result->bytes, (data_size_max_ - data_size_));
+      return kTfLiteError;
+    }
+    result->allocation_type = kTfLiteArenaRw;
+  }
+  result->dims = reinterpret_cast<TfLiteIntArray*>(AllocateMemory(
+      sizeof(int) * (flatbuffer_tensor.shape()->Length() + 1), sizeof(int)));
+  result->dims->size = flatbuffer_tensor.shape()->Length();
+  for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) {
+    result->dims->data[n] = flatbuffer_tensor.shape()->Get(n);
+  }
+  if (flatbuffer_tensor.quantization()) {
+    result->params.scale = flatbuffer_tensor.quantization()->scale()->Get(0);
+    result->params.zero_point =
+        flatbuffer_tensor.quantization()->zero_point()->Get(0);
+  }
+  result->allocation = nullptr;
+  if (flatbuffer_tensor.name()) {
+    result->name = flatbuffer_tensor.name()->c_str();
+  } else {
+    result->name = "<No name>";
+  }
+  result->delegate = nullptr;
+  result->buffer_handle = 0;
+  result->data_is_stale = false;
+  return kTfLiteOk;
+}
+
+uint8_t* SimpleTensorAllocator::AllocateMemory(size_t size, size_t alignment) {
+  uint8_t* current_data = data_ + data_size_;
+  uint8_t* aligned_result = AlignPointerRoundUp(current_data, alignment);
+  uint8_t* next_free = aligned_result + size;
+  size_t aligned_size = (next_free - current_data);
+  if ((data_size_ + aligned_size) > data_size_max_) {
+    // TODO(petewarden): Add error reporting beyond returning null!
+    return nullptr;
+  }
+  data_size_ += aligned_size;
+  return aligned_result;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h b/tensorflow/lite/experimental/micro/simple_tensor_allocator.h
similarity index 76%
rename from tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h
rename to tensorflow/lite/experimental/micro/simple_tensor_allocator.h
index 4f16a9d0e54cba6fb3b635ceeb39ab10ff59ae73..3530ecdfe265f8e065f57c3d4ae2e75fa9c27f25 100644
--- a/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator.h
+++ b/tensorflow/lite/experimental/micro/simple_tensor_allocator.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_SIMPLE_TENSOR_ALLOCATOR_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_SIMPLE_TENSOR_ALLOCATOR_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_SIMPLE_TENSOR_ALLOCATOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_SIMPLE_TENSOR_ALLOCATOR_H_
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 
@@ -36,7 +36,7 @@ class SimpleTensorAllocator {
       const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
       ErrorReporter* error_reporter, TfLiteTensor* result);
 
-  uint8_t* AllocateMemory(size_t size);
+  uint8_t* AllocateMemory(size_t size, size_t alignment);
 
   int GetDataSize() const { return data_size_; }
 
@@ -48,4 +48,4 @@ class SimpleTensorAllocator {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_MICRO_SIMPLE_TENSOR_ALLOCATOR_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_SIMPLE_TENSOR_ALLOCATOR_H_
diff --git a/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator_test.cc b/tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
similarity index 82%
rename from tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator_test.cc
rename to tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
index c83542724395328cb6a5e038b64dba4b9f4f655b..b82017c7fe60e941633b08ac01f7dd0957930c24 100644
--- a/tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator_test.cc
+++ b/tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
 
-#include "tensorflow/contrib/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace tflite {
 namespace {
@@ -121,7 +121,7 @@ TF_LITE_MICRO_TEST(TestTooLarge) {
   uint8_t arena[arena_size];
   tflite::SimpleTensorAllocator allocator(arena, arena_size);
 
-  const tflite::Tensor* tensor = tflite::Create1dTensor(10000);
+  const tflite::Tensor* tensor = tflite::Create1dTensor(2000);
   const flatbuffers::Vector<flatbuffers::Offset<tflite::Buffer>>* buffers =
       tflite::CreateBuffers();
 
@@ -137,8 +137,33 @@ TF_LITE_MICRO_TEST(TestJustFits) {
   uint8_t arena[arena_size];
   tflite::SimpleTensorAllocator allocator(arena, arena_size);
 
-  uint8_t* result = allocator.AllocateMemory(arena_size);
+  uint8_t* result = allocator.AllocateMemory(arena_size, 1);
   TF_LITE_MICRO_EXPECT_NE(nullptr, result);
 }
 
+TF_LITE_MICRO_TEST(TestAligned) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::SimpleTensorAllocator allocator(arena, arena_size);
+
+  uint8_t* result = allocator.AllocateMemory(1, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, result);
+
+  result = allocator.AllocateMemory(16, 4);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, result);
+  TF_LITE_MICRO_EXPECT_EQ(0, reinterpret_cast<size_t>(result) & 3);
+}
+
+TF_LITE_MICRO_TEST(TestMultipleTooLarge) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::SimpleTensorAllocator allocator(arena, arena_size);
+
+  uint8_t* result = allocator.AllocateMemory(768, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, result);
+
+  result = allocator.AllocateMemory(768, 1);
+  TF_LITE_MICRO_EXPECT_EQ(nullptr, result);
+}
+
 TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/testing/BUILD b/tensorflow/lite/experimental/micro/testing/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5a31a709ca3f0205b8764528d6e8f2c0fe0f93d0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/BUILD
@@ -0,0 +1,17 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["test_linux_binary.sh"])
+
+cc_library(
+    name = "micro_test",
+    hdrs = [
+        "micro_test.h",
+    ],
+    deps = [
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/Dockerfile.bluepill b/tensorflow/lite/experimental/micro/testing/Dockerfile.bluepill
similarity index 100%
rename from tensorflow/contrib/lite/experimental/micro/testing/Dockerfile.bluepill
rename to tensorflow/lite/experimental/micro/testing/Dockerfile.bluepill
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/bluepill.resc b/tensorflow/lite/experimental/micro/testing/bluepill.resc
similarity index 97%
rename from tensorflow/contrib/lite/experimental/micro/testing/bluepill.resc
rename to tensorflow/lite/experimental/micro/testing/bluepill.resc
index 9333dc42bfbfbc0c6185a88db096b2cb2102d5be..c46b33e3fb0b188c0c108e69ebc05063c0e00575 100644
--- a/tensorflow/contrib/lite/experimental/micro/testing/bluepill.resc
+++ b/tensorflow/lite/experimental/micro/testing/bluepill.resc
@@ -31,6 +31,3 @@ macro reset
 
 runMacro $reset
 
-emulation RunFor @1
-
-quit
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/testing/bluepill.robot b/tensorflow/lite/experimental/micro/testing/bluepill.robot
new file mode 100644
index 0000000000000000000000000000000000000000..37612168576280d3d83005ed5659d6863e5d516a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/bluepill.robot
@@ -0,0 +1,23 @@
+*** Settings ***
+Suite Setup                   Setup
+Suite Teardown                Teardown
+Test Setup                    Reset Emulation
+Resource                      /opt/renode/tests/renode-keywords.robot
+
+*** Variables ***
+${UART}                       sysbus.cpu.uartSemihosting
+
+*** Test Cases ***
+Should Run Bluepill Test
+    [Documentation]           Runs a Bluepill test and waits for a specific string on the semihosting UART
+    [Tags]                    bluepill  uart  tensorflow  arm
+    ${BIN} =                  Get Environment Variable    BIN
+    ${SCRIPT} =               Get Environment Variable    SCRIPT
+    ${EXPECTED} =             Get Environment Variable    EXPECTED
+    Execute Command           $bin = @${BIN}
+    Execute Script            ${SCRIPT}
+
+    Create Terminal Tester    ${UART}  timeout=30
+    Start Emulation
+
+    Wait For Line On Uart     ${EXPECTED}
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl b/tensorflow/lite/experimental/micro/testing/micro_test.bzl
similarity index 93%
rename from tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl
rename to tensorflow/lite/experimental/micro/testing/micro_test.bzl
index 91e349cb242737ae7b18a3a981171ff34f21052e..7a7ba15ca5fca8e903f9a0917a72788ea79f7485 100644
--- a/tensorflow/contrib/lite/experimental/micro/testing/micro_test.bzl
+++ b/tensorflow/lite/experimental/micro/testing/micro_test.bzl
@@ -10,6 +10,7 @@ def tflite_micro_cc_test(
         nocopts = "",
         linkopts = [],
         deps = [],
+        tags = [],
         visibility = None):
     """Tests a C/C++ binary without testing framework  dependencies`.
 
@@ -43,13 +44,14 @@ def tflite_micro_cc_test(
         nocopts = nocopts,
         linkopts = linkopts,
         deps = deps,
+        tags = tags,
         visibility = visibility,
     )
     native.sh_test(
         name = name,
         size = "medium",
         srcs = [
-            "//tensorflow/contrib/lite/experimental/micro/testing:test_linux_binary.sh",
+            "//tensorflow/lite/experimental/micro/testing:test_linux_binary.sh",
         ],
         args = [
             native.package_name() + "/" + name + "_binary",
@@ -61,4 +63,5 @@ def tflite_micro_cc_test(
         ],
         deps = [
         ],
+        tags = tags,
     )
diff --git a/tensorflow/lite/experimental/micro/testing/micro_test.h b/tensorflow/lite/experimental/micro/testing/micro_test.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f20dd5ac77dfd3f304c7cc93be0b865a0c2f0cb
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/micro_test.h
@@ -0,0 +1,174 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// An ultra-lightweight testing framework designed for use with microcontroller
+// applications. Its only dependency is on TensorFlow Lite's ErrorReporter
+// interface, where log messages are output. This is designed to be usable even
+// when no standard C or C++ libraries are available, and without any dynamic
+// memory allocation or reliance on global constructors.
+//
+// To build a test, you use syntax similar to gunit, but with some extra
+// decoration to create a hidden 'main' function containing each of the tests to
+// be run. Your code should look something like:
+// ----------------------------------------------------------------------------
+// #include "path/to/this/header"
+//
+// TF_LITE_MICRO_TESTS_BEGIN
+//
+// TF_LITE_MICRO_TEST(SomeTest) {
+//   TF_LITE_LOG_EXPECT_EQ(true, true);
+// }
+//
+// TF_LITE_MICRO_TESTS_END
+// ----------------------------------------------------------------------------
+// If you compile this for your platform, you'll get a normal binary that you
+// should be able to run. Executing it will output logging information like this
+// to stderr (or whatever equivalent is available and written to by
+// ErrorReporter):
+// ----------------------------------------------------------------------------
+// Testing SomeTest
+// 1/1 tests passed
+// ~~~ALL TESTS PASSED~~~
+// ----------------------------------------------------------------------------
+// This is designed to be human-readable, so you can just run tests manually,
+// but the string "~~~ALL TESTS PASSED~~~" should only appear if all of the
+// tests do pass. This makes it possible to integrate with automated test
+// systems by scanning the output logs and looking for that magic value.
+//
+// This framework is intended to be a rudimentary alternative to no testing at
+// all on systems that struggle to run more conventional approaches, so use with
+// caution!
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
+
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+namespace micro_test {
+extern int tests_passed;
+extern int tests_failed;
+extern bool is_test_complete;
+extern bool did_test_fail;
+extern tflite::ErrorReporter* reporter;
+}  // namespace micro_test
+
+#define TF_LITE_MICRO_TESTS_BEGIN              \
+  namespace micro_test {                       \
+  int tests_passed;                            \
+  int tests_failed;                            \
+  bool is_test_complete;                       \
+  bool did_test_fail;                          \
+  tflite::ErrorReporter* reporter;             \
+  }                                            \
+                                               \
+  int main(int argc, char** argv) {            \
+    micro_test::tests_passed = 0;              \
+    micro_test::tests_failed = 0;              \
+    tflite::MicroErrorReporter error_reporter; \
+    micro_test::reporter = &error_reporter;
+
+#define TF_LITE_MICRO_TESTS_END                                \
+  micro_test::reporter->Report(                                \
+      "%d/%d tests passed", micro_test::tests_passed,          \
+      (micro_test::tests_failed + micro_test::tests_passed));  \
+  if (micro_test::tests_failed == 0) {                         \
+    micro_test::reporter->Report("~~~ALL TESTS PASSED~~~\n");  \
+  } else {                                                     \
+    micro_test::reporter->Report("~~~SOME TESTS FAILED~~~\n"); \
+  }                                                            \
+  }
+
+// TODO(petewarden): I'm going to hell for what I'm doing to this poor for loop.
+#define TF_LITE_MICRO_TEST(name)                                           \
+  micro_test::reporter->Report("Testing %s", #name);                       \
+  for (micro_test::is_test_complete = false,                               \
+      micro_test::did_test_fail = false;                                   \
+       !micro_test::is_test_complete; micro_test::is_test_complete = true, \
+      micro_test::tests_passed += (micro_test::did_test_fail) ? 0 : 1,     \
+      micro_test::tests_failed += (micro_test::did_test_fail) ? 1 : 0)
+
+#define TF_LITE_MICRO_EXPECT(x)                                                \
+  do {                                                                         \
+    if (!(x)) {                                                                \
+      micro_test::reporter->Report(#x " failed at %s:%d", __FILE__, __LINE__); \
+      micro_test::did_test_fail = true;                                        \
+    }                                                                          \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_EQ(x, y)                                         \
+  do {                                                                        \
+    if ((x) != (y)) {                                                         \
+      micro_test::reporter->Report(#x " == " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                 \
+      micro_test::did_test_fail = true;                                       \
+    }                                                                         \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_NE(x, y)                                         \
+  do {                                                                        \
+    if ((x) == (y)) {                                                         \
+      micro_test::reporter->Report(#x " != " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                 \
+      micro_test::did_test_fail = true;                                       \
+    }                                                                         \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_NEAR(x, y, epsilon)                      \
+  do {                                                                \
+    auto delta = ((x) > (y)) ? ((x) - (y)) : ((y) - (x));             \
+    if (delta > epsilon) {                                            \
+      micro_test::reporter->Report(#x " near " #y " failed at %s:%d", \
+                                   __FILE__, __LINE__);               \
+      micro_test::did_test_fail = true;                               \
+    }                                                                 \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_GT(x, y)                                        \
+  do {                                                                       \
+    if ((x) <= (y)) {                                                        \
+      micro_test::reporter->Report(#x " > " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                \
+      micro_test::did_test_fail = true;                                      \
+    }                                                                        \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_LT(x, y)                                        \
+  do {                                                                       \
+    if ((x) >= (y)) {                                                        \
+      micro_test::reporter->Report(#x " < " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                \
+      micro_test::did_test_fail = true;                                      \
+    }                                                                        \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_GE(x, y)                                         \
+  do {                                                                        \
+    if ((x) < (y)) {                                                          \
+      micro_test::reporter->Report(#x " >= " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                 \
+      micro_test::did_test_fail = true;                                       \
+    }                                                                         \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_LE(x, y)                                         \
+  do {                                                                        \
+    if ((x) > (y)) {                                                          \
+      micro_test::reporter->Report(#x " <= " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                 \
+      micro_test::did_test_fail = true;                                       \
+    }                                                                         \
+  } while (false)
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
diff --git a/tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh b/tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e288c6cf5683146796ddb43e9d467f4eca826fa3
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh
@@ -0,0 +1,56 @@
+#!/bin/bash -e
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Tests a 'bluepill' STM32F103 ELF by parsing the log output of Renode emulation.
+#
+# First argument is the ELF location.
+# Second argument is a regular expression that's required to be in the output logs
+# for the test to pass.
+
+declare -r ROOT_DIR=`pwd`
+declare -r TEST_TMPDIR=/tmp/test_bluepill_binary/
+declare -r MICRO_LOG_PATH=${TEST_TMPDIR}
+declare -r MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt
+mkdir -p ${MICRO_LOG_PATH}
+
+docker build -t renode_bluepill \
+  -f ${ROOT_DIR}/tensorflow/lite/experimental/micro/testing/Dockerfile.bluepill \
+  ${ROOT_DIR}/tensorflow/lite/experimental/micro/testing/
+
+exit_code=0
+# running in `if` to avoid setting +e
+if ! docker run \
+  --log-driver=none -a stdout -a stderr \
+  -v ${ROOT_DIR}:/workspace \
+  -v /tmp:/tmp \
+  -e BIN=/workspace/$1 \
+  -e SCRIPT=/workspace/tensorflow/lite/experimental/micro/testing/bluepill.resc \
+  -e EXPECTED="$2" \
+  -it renode_bluepill \
+  /bin/bash -c "/opt/renode/tests/test.sh /workspace/tensorflow/lite/experimental/micro/testing/bluepill.robot 2>&1 >${MICRO_LOG_FILENAME}"
+then
+  exit_code=1
+fi
+
+echo "LOGS:"
+cat ${MICRO_LOG_FILENAME}
+if [ $exit_code -eq 0 ]
+then
+  echo "$1: PASS"
+else
+  echo "$1: FAIL - '$2' not found in logs."
+fi
+exit $exit_code
diff --git a/tensorflow/contrib/lite/experimental/micro/testing/test_linux_binary.sh b/tensorflow/lite/experimental/micro/testing/test_linux_binary.sh
similarity index 100%
rename from tensorflow/contrib/lite/experimental/micro/testing/test_linux_binary.sh
rename to tensorflow/lite/experimental/micro/testing/test_linux_binary.sh
diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..0caf0ca099e0520f90530b02f9a95efbe6e3d299
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/Makefile
@@ -0,0 +1,218 @@
+MAKEFILE_DIR := tensorflow/lite/experimental/micro/tools/make
+
+# Try to figure out the host system
+HOST_OS :=
+ifeq ($(OS),Windows_NT)
+	HOST_OS = windows
+else
+	UNAME_S := $(shell uname -s)
+	ifeq ($(UNAME_S),Linux)
+		HOST_OS := linux
+	endif
+	ifeq ($(UNAME_S),Darwin)
+		HOST_OS := osx
+	endif
+endif
+
+HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32; else echo $(shell uname -m); fi)
+
+# Override these on the make command line to target a specific architecture. For example:
+# make -f tensorflow/lite/Makefile TARGET=rpi TARGET_ARCH=armv7l
+TARGET := $(HOST_OS)
+TARGET_ARCH := $(HOST_ARCH)
+
+INCLUDES := \
+-I. \
+-I$(MAKEFILE_DIR)/../../../../../ \
+-I$(MAKEFILE_DIR)/../../../../../../ \
+-I$(MAKEFILE_DIR)/downloads/ \
+-I$(MAKEFILE_DIR)/downloads/gemmlowp \
+-I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
+-I$(OBJDIR)
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+INCLUDES += -I/usr/local/include
+
+TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_linux_binary.sh
+
+MICROLITE_LIBS := -lm
+
+# There are no rules for compiling objects for the host system (since we don't
+# generate things like the protobuf compiler that require that), so all of
+# these settings are for the target compiler.
+CXXFLAGS := -O3 -DNDEBUG
+CXXFLAGS += --std=c++11 -g -DTF_LITE_STATIC_MEMORY
+CCFLAGS := -DNDEBUG -g -DTF_LITE_STATIC_MEMORY
+LDOPTS := -L/usr/local/lib
+ARFLAGS := -r
+TARGET_TOOLCHAIN_PREFIX :=
+CC_PREFIX :=
+
+# This library is the main target for this makefile. It will contain a minimal
+# runtime that can be linked in to other programs.
+MICROLITE_LIB_NAME := libtensorflow-microlite.a
+
+# Test binary for the microcontroller speech model.
+MICRO_SPEECH_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
+
+# Test binary for the microcontroller speech model.
+PREPROCESSOR_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
+
+PREPROCESSOR_REFERENCE_TEST_SRCS = \
+$(PREPROCESSOR_TEST_SRCS) \
+tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
+
+PREPROCESSOR_FIXED_TEST_SRCS += \
+$(PREPROCESSOR_TEST_SRCS) \
+tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
+
+MICROLITE_TEST_SRCS := \
+$(wildcard tensorflow/lite/experimental/micro/*test.cc) \
+$(wildcard tensorflow/lite/experimental/micro/kernels/*test.cc)
+
+MICROLITE_CC_BASE_SRCS := \
+$(wildcard tensorflow/lite/experimental/micro/*.cc) \
+$(wildcard tensorflow/lite/experimental/micro/kernels/*.cc) \
+tensorflow/lite/c/c_api_internal.c \
+tensorflow/lite/core/api/error_reporter.cc \
+tensorflow/lite/core/api/flatbuffer_conversions.cc \
+tensorflow/lite/core/api/op_resolver.cc \
+tensorflow/lite/kernels/kernel_util.cc \
+tensorflow/lite/kernels/internal/quantization_util.cc
+MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS))
+
+# These target-specific makefiles should modify or replace options like
+# CXXFLAGS or LIBS to work for a specific targetted architecture. All logic
+# based on platforms or architectures should happen within these files, to
+# keep this main makefile focused on the sources and dependencies.
+include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
+
+ALL_SRCS := \
+	$(MICRO_SPEECH_TEST_SRCS) \
+	$(PREPROCESSOR_REFERENCE_TEST_SRCS) \
+	$(PREPROCESSOR_FIXED_TEST_SRCS) \
+	$(MICROLITE_CC_SRCS) \
+	$(MICROLITE_TEST_SRCS)
+
+# Where compiled objects are stored.
+GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
+OBJDIR := $(GENDIR)obj/
+BINDIR := $(GENDIR)bin/
+LIBDIR := $(GENDIR)lib/
+
+MICROLITE_LIB_PATH := $(LIBDIR)$(MICROLITE_LIB_NAME)
+
+MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
+PREPROCESSOR_REFERENCE_TEST_BINARY := $(BINDIR)preprocessor_reference_test
+PREPROCESSOR_FIXED_TEST_BINARY := $(BINDIR)preprocessor_fixed_test
+
+CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
+CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
+AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
+
+MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS))))
+
+PREPROCESSOR_REFERENCE_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_REFERENCE_TEST_SRCS))))
+
+PREPROCESSOR_FIXED_TEST_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FIXED_TEST_SRCS))))
+
+MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS))))
+
+MICROLITE_TEST_TARGETS := $(addprefix $(BINDIR), \
+$(patsubst %_test.cc,%.test_target,$(MICROLITE_TEST_SRCS)))
+
+# For normal manually-created TensorFlow C++ source files.
+$(OBJDIR)%.o: %.cc
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+# For normal manually-created TensorFlow C source files.
+$(OBJDIR)%.o: %.c
+	@mkdir -p $(dir $@)
+	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
+
+# The target that's compiled if there's no command-line arguments.
+all: $(MICROLITE_LIB_PATH) $(MICRO_SPEECH_TEST_BINARY) $(PREPROCESSOR_TEST_BINARY)
+
+microlite: $(MICROLITE_LIB_PATH)
+
+# Hack for generating schema file bypassing flatbuffer parsing
+tensorflow/lite/schema/schema_generated.h:
+	@cp -u tensorflow/lite/schema/schema_generated.h.OPENSOURCE tensorflow/lite/schema/schema_generated.h
+
+# Gathers together all the objects we've compiled into a single '.a' archive.
+$(MICROLITE_LIB_PATH): tensorflow/lite/schema/schema_generated.h $(MICROLITE_LIB_OBJS)
+	@mkdir -p $(dir $@)
+	$(AR) $(ARFLAGS) $(MICROLITE_LIB_PATH) $(MICROLITE_LIB_OBJS)
+
+$(MICRO_SPEECH_TEST_BINARY): $(MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(MICRO_SPEECH_TEST_BINARY) $(MICRO_SPEECH_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+
+micro_speech_test: $(MICRO_SPEECH_TEST_BINARY)
+micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
+
+test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
+	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+$(PREPROCESSOR_REFERENCE_TEST_BINARY): $(PREPROCESSOR_REFERENCE_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_REFERENCE_TEST_BINARY) $(PREPROCESSOR_REFERENCE_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+
+preprocessor_reference_test: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
+preprocessor_reference_test_bin: $(PREPROCESSOR_REFERENCE_TEST_BINARY).bin
+
+test_preprocessor_reference: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_REFERENCE_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+$(PREPROCESSOR_FIXED_TEST_BINARY): $(PREPROCESSOR_FIXED_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_FIXED_TEST_BINARY) $(PREPROCESSOR_FIXED_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+
+preprocessor_fixed_test: $(PREPROCESSOR_FIXED_TEST_BINARY)
+preprocessor_fixed_test_bin: $(PREPROCESSOR_FIXED_TEST_BINARY).bin
+
+test_preprocessor_fixed: $(PREPROCESSOR_FIXED_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_FIXED_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+$(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $@ $< \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+
+$(BINDIR)%.test_target: $(BINDIR)%_test
+	$(TEST_SCRIPT) $< '~~~ALL TESTS PASSED~~~'
+
+$(info $(MICROLITE_TEST_TARGETS))
+
+test: test_micro_speech $(MICROLITE_TEST_TARGETS)
+
+# Gets rid of all generated files.
+clean:
+	rm -rf $(MAKEFILE_DIR)/gen
+
+$(DEPDIR)/%.d: ;
+.PRECIOUS: $(DEPDIR)/%.d
+.PRECIOUS: $(BINDIR)%_test
+
+-include $(patsubst %,$(DEPDIR)/%.d,$(basename $(ALL_SRCS)))
diff --git a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6749858bdb9ffe7942efcc1dc22acb4c6aa6a533
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR/../../../../../.."
+
+DOWNLOADS_DIR=tensorflow/lite/experimental/micro/tools/make/downloads
+BZL_FILE_PATH=tensorflow/workspace.bzl
+
+# Ensure it is being run from repo root
+if [ ! -f $BZL_FILE_PATH ]; then
+  echo "Could not find ${BZL_FILE_PATH}":
+  echo "Likely you are not running this from the root directory of the repository.";
+  exit 1;
+fi
+
+GEMMLOWP_URL="https://github.com/google/gemmlowp/archive/719139ce755a0f31cbf1c37f7f98adcc7fc9f425.zip"
+FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz"
+CMSIS_URL="https://github.com/ARM-software/CMSIS_5/archive/5.4.0.zip"
+STM32_BARE_LIB_URL="https://github.com/google/stm32_bare_lib/archive/c07d611fb0af58450c5a3e0ab4d52b47f99bc82d.zip"
+
+download_and_extract() {
+  local usage="Usage: download_and_extract URL DIR"
+  local url="${1:?${usage}}"
+  local dir="${2:?${usage}}"
+  echo "downloading ${url}" >&2
+  mkdir -p "${dir}"
+  if [[ "${url}" == *gz ]]; then
+    curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
+  elif [[ "${url}" == *zip ]]; then
+    tempdir=$(mktemp -d)
+    tempdir2=$(mktemp -d)
+
+    curl -L ${url} > ${tempdir}/zipped.zip
+    unzip ${tempdir}/zipped.zip -d ${tempdir2}
+
+    # If the zip file contains nested directories, extract the files from the
+    # inner directory.
+    if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
+      # unzip has no strip components, so unzip to a temp dir, and move the
+      # files we want from the tempdir to destination.
+      cp -R ${tempdir2}/*/* ${dir}/
+    else
+      cp -R ${tempdir2}/* ${dir}/
+    fi
+    rm -rf ${tempdir2} ${tempdir}
+  fi
+
+  # Delete any potential BUILD files, which would interfere with Bazel builds.
+  find "${dir}" -type f -name '*BUILD' -delete
+}
+
+download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
+download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+download_and_extract "${CMSIS_URL}" "${DOWNLOADS_DIR}/cmsis"
+download_and_extract "${STM32_BARE_LIB_URL}" "${DOWNLOADS_DIR}/stm32_bare_lib"
+
+echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..bd238ac55f96dbe62aa16a92180a5995ce395945
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c
@@ -0,0 +1,123 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+#include "am_mcu_apollo.h"              // Defines AM_CMSIS_REGS
+#include "am_bsp.h"
+#include "am_util.h"
+
+//*****************************************************************************
+//
+// The entry point for the application.
+//
+//*****************************************************************************
+extern int main(int argc, char**argv);
+
+void DebugLog(const char* s) { am_util_stdio_printf( "%s", s); }
+void DebugLogInt32(int32_t i) { am_util_stdio_printf( "%d", i); }
+void DebugLogUInt32(uint32_t i) { am_util_stdio_printf( "%d", i); }
+void DebugLogHex(uint32_t i) { am_util_stdio_printf( "0x%8x", i); }
+void DebugLogFloat(float i) { am_util_stdio_printf( "%f", i); }
+
+int _main(void)
+{
+    am_util_id_t sIdDevice;
+    uint32_t ui32StrBuf;
+
+    //
+    // Set the clock frequency.
+    //
+    am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
+
+    //
+    // Set the default cache configuration
+    //
+    am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
+    am_hal_cachectrl_enable();
+
+    //
+    // Configure the board for low power operation.
+    //
+    am_bsp_low_power_init();
+
+    //
+    // Initialize the printf interface for UART output
+    //
+    am_bsp_uart_printf_enable();
+
+    //
+    // Print the banner.
+    //
+    am_util_stdio_terminal_clear();
+    am_util_stdio_printf("Hello World!\n\n");
+
+    //
+    // Print the device info.
+    //
+    am_util_id_device(&sIdDevice);
+    am_util_stdio_printf("Vendor Name: %s\n", sIdDevice.pui8VendorName);
+    am_util_stdio_printf("Device type: %s\n", sIdDevice.pui8DeviceName);
+
+
+    am_util_stdio_printf("Qualified: %s\n",
+                         sIdDevice.sMcuCtrlDevice.ui32Qualified ?
+                         "Yes" : "No");
+
+    am_util_stdio_printf("Device Info:\n"
+                         "\tPart number: 0x%08X\n"
+                         "\tChip ID0:    0x%08X\n"
+                         "\tChip ID1:    0x%08X\n"
+                         "\tRevision:    0x%08X (Rev%c%c)\n",
+                         sIdDevice.sMcuCtrlDevice.ui32ChipPN,
+                         sIdDevice.sMcuCtrlDevice.ui32ChipID0,
+                         sIdDevice.sMcuCtrlDevice.ui32ChipID1,
+                         sIdDevice.sMcuCtrlDevice.ui32ChipRev,
+                         sIdDevice.ui8ChipRevMaj, sIdDevice.ui8ChipRevMin );
+
+    //
+    // If not a multiple of 1024 bytes, append a plus sign to the KB.
+    //
+    ui32StrBuf = ( sIdDevice.sMcuCtrlDevice.ui32FlashSize % 1024 ) ? '+' : 0;
+    am_util_stdio_printf("\tFlash size:  %7d (%d KB%s)\n",
+                         sIdDevice.sMcuCtrlDevice.ui32FlashSize,
+                         sIdDevice.sMcuCtrlDevice.ui32FlashSize / 1024,
+                         &ui32StrBuf);
+
+    ui32StrBuf = ( sIdDevice.sMcuCtrlDevice.ui32SRAMSize % 1024 ) ? '+' : 0;
+    am_util_stdio_printf("\tSRAM size:   %7d (%d KB%s)\n\n",
+                         sIdDevice.sMcuCtrlDevice.ui32SRAMSize,
+                         sIdDevice.sMcuCtrlDevice.ui32SRAMSize / 1024,
+                         &ui32StrBuf);
+
+    //
+    // Print the compiler version.
+    //
+    am_util_stdio_printf("App Compiler:    %s\n", COMPILER_VERSION);
+#ifdef AM_PART_APOLLO3
+    am_util_stdio_printf("HAL Compiler:    %s\n", g_ui8HALcompiler);
+    am_util_stdio_printf("HAL SDK version: %d.%d.%d\n",
+                         g_ui32HALversion.s.Major,
+                         g_ui32HALversion.s.Minor,
+                         g_ui32HALversion.s.Revision);
+    am_util_stdio_printf("HAL compiled with %s-style registers\n",
+                         g_ui32HALversion.s.bAMREGS ? "AM_REG" : "CMSIS");
+
+    am_util_stdio_printf("&sIdDevice: 0x%x, &ui32StrBuf: 0x%x\n", &sIdDevice, &ui32StrBuf);
+    am_hal_security_info_t secInfo;
+    char sINFO[32];
+    uint32_t ui32Status;
+#endif // AM_PART_APOLLO3
+    main(0, NULL);
+}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/apollo3evb.ld b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/apollo3evb.ld
new file mode 100644
index 0000000000000000000000000000000000000000..cd1182f804e48a713ca75d47343f42287f333b33
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/apollo3evb.ld
@@ -0,0 +1,79 @@
+/******************************************************************************
+ *
+ * apollo3evb.ld - Linker script for applications using startup_gcc.c
+ *
+ *****************************************************************************/
+ENTRY(Reset_Handler)
+
+MEMORY
+{
+    FLASH (rx) : ORIGIN = 0x0000C000, LENGTH = 960K
+    SRAM (rwx) : ORIGIN = 0x10000000, LENGTH = 384K
+}
+
+SECTIONS
+{
+    .text :
+    {
+        . = ALIGN(4);
+        KEEP(*(.isr_vector))
+        KEEP(*(.patch))
+        *(.text)
+        *(.text*)
+
+	/* These are the C++ global constructors.  Stick them all here and
+	 * then walk through the array in main() calling them all.
+	 */
+	_init_array_start = .;
+	KEEP (*(SORT(.init_array*)))
+	_init_array_end = .;
+
+	/* XXX Currently not doing anything for global destructors. */
+
+        *(.rodata)
+        *(.rodata*)
+        . = ALIGN(4);
+        _etext = .;
+    } > FLASH
+
+  /* User stack section initialized by startup code. */
+  .stack (NOLOAD):
+    {
+        . = ALIGN(8);
+        *(.stack)
+        *(.stack*)
+        . = ALIGN(8);
+    } > SRAM
+
+    .data :
+    {
+        . = ALIGN(4);
+        _sdata = .;
+        *(.data)
+        *(.data*)
+        . = ALIGN(4);
+        _edata = .;
+    } > SRAM AT>FLASH
+
+    /* used by startup to initialize data */
+    _init_data = LOADADDR(.data);
+
+    .bss :
+    {
+        . = ALIGN(4);
+        _sbss = .;
+        *(.bss)
+        *(.bss*)
+        *(COMMON)
+        . = ALIGN(4);
+        _ebss = .;
+    } > SRAM
+    /* Add this to satisfy reference to symbol 'end' from libnosys.a(sbrk.o)
+     * to denote the HEAP start.
+     */
+   end = .;
+
+  .ARM.attributes 0 : { *(.ARM.attributes) }
+}
+
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..f722204feaded521945cd269b36576e560dac3e4
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
@@ -0,0 +1,104 @@
+# Settings for apollo3 evb platforms.
+ifeq ($(TARGET), apollo3evb)
+  TARGET_ARCH := cortex-m4
+  TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
+  # Download the Ambiq Apollo3 SDK and set this variable to find the header
+  # files:
+  APOLLO3_SDK := /ssd/ambiq/AmbiqSuite\ SDK\ for\ Apollo3/Apollo3-SDK-2018.08.13/
+  # Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions
+  # with the softfp interfaces.
+  GCC_ARM := /ssd/gnu_arm_toolchain/gcc-arm-none-eabi-7-2018-q2-update/
+
+  PLATFORM_FLAGS = \
+    -DPART_apollo3 \
+    -DAM_PACKAGE_BGA \
+    -DAM_PART_APOLLO3 \
+    -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+    -DTF_LITE_STATIC_MEMORY \
+    -DTF_LITE_MCU_DEBUG_LOG \
+    -fno-rtti \
+    -fmessage-length=0 \
+    -fno-exceptions \
+    -fno-unwind-tables \
+    -fno-builtin \
+    -ffunction-sections \
+    -fdata-sections \
+    -funsigned-char \
+    -MMD \
+    -mcpu=cortex-m4 \
+    -mthumb \
+    -mfpu=fpv4-sp-d16 \
+    -mfloat-abi=softfp \
+    -std=gnu++11 \
+    -Wvla \
+    -Wall \
+    -Wextra \
+    -Wno-unused-parameter \
+    -Wno-missing-field-initializers \
+    -Wno-write-strings \
+    -Wno-sign-compare \
+    -fno-delete-null-pointer-checks \
+    -fomit-frame-pointer \
+    -fpermissive \
+    -nostdlib \
+    -g \
+    -Os
+  CXXFLAGS += $(PLATFORM_FLAGS)
+  CCFLAGS += $(PLATFORM_FLAGS)
+  LDFLAGS += \
+    -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=softfp \
+    -nostartfiles -static \
+    -Wl,--gc-sections -Wl,--entry,Reset_Handler \
+    -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
+    -fno-exceptions \
+    -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \
+    -Wl,-T,$(MAKEFILE_DIR)/targets/apollo3evb/apollo3evb.ld \
+    -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref
+  BUILD_TYPE := micro
+  # The apollo3evb libs should be copied from the SDK after building them.
+  MICROLITE_LIBS := \
+    $(MAKEFILE_DIR)/targets/apollo3evb/libam_bsp.a \
+    $(MAKEFILE_DIR)/targets/apollo3evb/libam_hal.a \
+    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
+    -lm
+  INCLUDES += \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -I$(GCC_ARM)/arm-none-eabi/ \
+    -I$(APOLLO3_SDK)/mcu/apollo3/ \
+    -I$(APOLLO3_SDK)/CMSIS/AmbiqMicro/Include/ \
+    -I$(APOLLO3_SDK)/boards/apollo3_evb/bsp/ \
+    -I$(APOLLO3_SDK)/devices/ \
+    -I$(APOLLO3_SDK)/utils/
+
+  # The startup_gcc.c file is an altered version of the examples/hello_world/gcc/startup_gcc.c
+  # file from Ambiq:
+  #   - Increase the stack size from 1k to 20k
+  #   - Change the application entry call from main() to _main()
+  # The am_*.c files should be copied from the Ambiq Apollo3 SDK
+  # _main.c contains application and target specific initialization, like
+  # setting clock speed, default uart setups, etc. and an implementation
+  # of the DebugLog interfaces.
+  MICROLITE_CC_SRCS += \
+    $(MAKEFILE_DIR)/targets/apollo3evb/startup_gcc.c \
+    $(MAKEFILE_DIR)/targets/apollo3evb/_main.c \
+    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_delay.c \
+    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_faultisr.c \
+    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_id.c \
+    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_stdio.c
+
+  TEST_SCRIPT := tensorflow/lite/experimental/log_test/test_apollo3evb_binary.sh
+  # These are tests that don't currently work on the blue pill.
+  EXCLUDED_TESTS := \
+    tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
+    tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
+  MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
+
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+
+$(BINDIR)/%.bin: $(BINDIR)/%
+	@mkdir -p $(dir $@)
+	$(OBJCOPY) $< $@ -O binary
+
+endif
diff --git a/tensorflow/contrib/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
similarity index 87%
rename from tensorflow/contrib/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
rename to tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
index 022a8422dc89c048797d0f9ba224f67060d210d7..5e3105a109b99b061a35b9c6f6c7c5f3681e2b45 100644
--- a/tensorflow/contrib/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
@@ -47,11 +47,11 @@ ifeq ($(TARGET), bluepill)
   MICROLITE_CC_SRCS += \
     $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.c) \
     $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.cc)
-    TEST_SCRIPT := tensorflow/contrib/lite/experimental/micro/testing/test_bluepill_binary.sh
+    TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh
   # These are tests that don't currently work on the blue pill.
   EXCLUDED_TESTS := \
-    tensorflow/contrib/lite/experimental/micro/micro_interpreter_test.cc \
-    tensorflow/contrib/lite/experimental/micro/simple_tensor_allocator_test.cc
+    tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
+    tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
   MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
 
 # These are microcontroller-specific rules for converting the ELF output
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..8ea78e8f3e3db75f86ce39e6adf9b82ff4080ff1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/linux_x86_makefile.inc
@@ -0,0 +1,9 @@
+# Settings for x86 on Linux
+ifeq ($(TARGET), linux)
+  ifeq ($(TARGET_ARCH), x86_64)
+    PLATFORM_FLAGS = \
+      -DTF_LITE_DISABLE_X86_NEON
+    CXXFLAGS += $(PLATFORM_FLAGS)
+    CCFLAGS += $(PLATFORM_FLAGS)
+  endif
+endif
diff --git a/tensorflow/lite/experimental/microfrontend/BUILD b/tensorflow/lite/experimental/microfrontend/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2f881e3acabdae526ffba0a0b94f273da2d260ef
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/BUILD
@@ -0,0 +1,103 @@
+# TensorFlow ops for audio front-end processing.
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_py_test",
+    "tf_cc_test",
+)
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "audio_microfrontend",
+    srcs = ["audio_microfrontend.cc"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/experimental/microfrontend/lib:frontend",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/kernels/internal:reference",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "audio_microfrontend_test",
+    size = "small",
+    srcs = ["audio_microfrontend_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":audio_microfrontend",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_custom_op_library(
+    name = "python/ops/_audio_microfrontend_op.so",
+    srcs = [
+        "ops/audio_microfrontend_op.cc",
+    ],
+    deps = [
+        "//tensorflow/lite/experimental/microfrontend/lib:frontend",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["audio_microfrontend_op"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/lite/experimental/microfrontend/lib:frontend",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "audio_microfrontend_op",
+    deps = [":audio_microfrontend_op_op_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "audio_microfrontend_py",
+    srcs = [
+        "python/ops/audio_microfrontend_op.py",
+    ],
+    dso = [":python/ops/_audio_microfrontend_op.so"],
+    kernels = [
+        ":audio_microfrontend_op_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":audio_microfrontend_op",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:common_shapes",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+    ],
+)
+
+tf_py_test(
+    name = "audio_microfrontend_op_test",
+    size = "small",
+    srcs = ["python/kernel_tests/audio_microfrontend_op_test.py"],
+    additional_deps = [
+        ":audio_microfrontend_py",
+        "//tensorflow:tensorflow_py",
+    ],
+    tags = ["no_pip"],
+)
diff --git a/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc b/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4367fe74a484445289f15c83860ca08ca4e144db
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc
@@ -0,0 +1,204 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace audio_microfrontend {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+typedef struct {
+  int sample_rate;
+  FrontendState* state;
+  int left_context;
+  int right_context;
+  int frame_stride;
+  bool zero_padding;
+  int out_scale;
+  bool out_float;
+} TfLiteAudioMicrofrontendParams;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new TfLiteAudioMicrofrontendParams;
+
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+
+  data->sample_rate = m["sample_rate"].AsInt32();
+
+  struct FrontendConfig config;
+  config.window.size_ms = m["window_size"].AsInt32();
+  config.window.step_size_ms = m["window_step"].AsInt32();
+  config.filterbank.num_channels = m["num_channels"].AsInt32();
+  config.filterbank.upper_band_limit = m["upper_band_limit"].AsFloat();
+  config.filterbank.lower_band_limit = m["lower_band_limit"].AsFloat();
+  config.noise_reduction.smoothing_bits = m["smoothing_bits"].AsInt32();
+  config.noise_reduction.even_smoothing = m["even_smoothing"].AsFloat();
+  config.noise_reduction.odd_smoothing = m["odd_smoothing"].AsFloat();
+  config.noise_reduction.min_signal_remaining =
+      m["min_signal_remaining"].AsFloat();
+  config.pcan_gain_control.enable_pcan = m["enable_pcan"].AsBool();
+  config.pcan_gain_control.strength = m["pcan_strength"].AsFloat();
+  config.pcan_gain_control.offset = m["pcan_offset"].AsFloat();
+  config.pcan_gain_control.gain_bits = m["gain_bits"].AsInt32();
+  config.log_scale.enable_log = m["enable_log"].AsBool();
+  config.log_scale.scale_shift = m["scale_shift"].AsInt32();
+
+  data->state = new FrontendState;
+  FrontendPopulateState(&config, data->state, data->sample_rate);
+
+  data->left_context = m["left_context"].AsInt32();
+  data->right_context = m["right_context"].AsInt32();
+  data->frame_stride = m["frame_stride"].AsInt32();
+  data->zero_padding = m["zero_padding"].AsBool();
+  data->out_scale = m["out_scale"].AsInt32();
+  data->out_float = m["out_float"].AsBool();
+
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  auto* data = reinterpret_cast<TfLiteAudioMicrofrontendParams*>(buffer);
+  FrontendFreeStateContents(data->state);
+  delete data->state;
+  delete data;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* data =
+      reinterpret_cast<TfLiteAudioMicrofrontendParams*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
+
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteInt16);
+  output->type = kTfLiteInt32;
+  if (data->out_float) {
+    output->type = kTfLiteFloat32;
+  }
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(2);
+  int num_frames = 0;
+  if (input->dims->data[0] >= data->state->window.size) {
+    num_frames = (input->dims->data[0] - data->state->window.size) /
+                     data->state->window.step / data->frame_stride +
+                 1;
+  }
+  output_size->data[0] = num_frames;
+  output_size->data[1] = data->state->filterbank.num_channels *
+                         (1 + data->left_context + data->right_context);
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <typename T>
+void GenerateFeatures(TfLiteAudioMicrofrontendParams* data,
+                      const TfLiteTensor* input, TfLiteTensor* output) {
+  const int16_t* audio_data = GetTensorData<int16_t>(input);
+  int64_t audio_size = input->dims->data[0];
+
+  T* filterbanks_flat = GetTensorData<T>(output);
+
+  int num_frames = 0;
+  if (audio_size >= data->state->window.size) {
+    num_frames = (input->dims->data[0] - data->state->window.size) /
+                     data->state->window.step +
+                 1;
+  }
+  std::vector<std::vector<T>> frame_buffer(num_frames);
+
+  int frame_index = 0;
+  while (audio_size > 0) {
+    size_t num_samples_read;
+    struct FrontendOutput output = FrontendProcessSamples(
+        data->state, audio_data, audio_size, &num_samples_read);
+    audio_data += num_samples_read;
+    audio_size -= num_samples_read;
+
+    if (output.values != nullptr) {
+      frame_buffer[frame_index].reserve(output.size);
+      for (int i = 0; i < output.size; ++i) {
+        frame_buffer[frame_index].push_back(static_cast<T>(output.values[i]) /
+                                            data->out_scale);
+      }
+      ++frame_index;
+    }
+  }
+
+  int index = 0;
+  std::vector<T> pad(data->state->filterbank.num_channels, 0);
+  for (int anchor = 0; anchor < frame_buffer.size();
+       anchor += data->frame_stride) {
+    for (int frame = anchor - data->left_context;
+         frame <= anchor + data->right_context; ++frame) {
+      std::vector<T>* feature;
+      if (data->zero_padding && (frame < 0 || frame >= frame_buffer.size())) {
+        feature = &pad;
+      } else if (frame < 0) {
+        feature = &frame_buffer[0];
+      } else if (frame >= frame_buffer.size()) {
+        feature = &frame_buffer[frame_buffer.size() - 1];
+      } else {
+        feature = &frame_buffer[frame];
+      }
+      for (auto f : *feature) {
+        filterbanks_flat[index++] = f;
+      }
+    }
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* data =
+      reinterpret_cast<TfLiteAudioMicrofrontendParams*>(node->user_data);
+  FrontendReset(data->state);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (data->out_float) {
+    GenerateFeatures<float>(data, input, output);
+  } else {
+    GenerateFeatures<int32>(data, input, output);
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace audio_microfrontend
+
+TfLiteRegistration* Register_AUDIO_MICROFRONTEND() {
+  static TfLiteRegistration r = {
+      audio_microfrontend::Init, audio_microfrontend::Free,
+      audio_microfrontend::Prepare, audio_microfrontend::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc b/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a9119d01831f6892dbf887930f3626445fc8a8e3
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc
@@ -0,0 +1,203 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Micro Frontend op.
+
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+TfLiteRegistration* Register_AUDIO_MICROFRONTEND();
+
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class MicroFrontendOpModel : public SingleOpModel {
+ public:
+  MicroFrontendOpModel(int n_input, int n_frame, int n_frequency_per_frame,
+                       int n_left_context, int n_right_context,
+                       int n_frame_stride,
+                       const std::vector<std::vector<int>>& input_shapes)
+      : n_input_(n_input),
+        n_frame_(n_frame),
+        n_frequency_per_frame_(n_frequency_per_frame),
+        n_left_context_(n_left_context),
+        n_right_context_(n_right_context),
+        n_frame_stride_(n_frame_stride) {
+    input_ = AddInput(TensorType_INT16);
+    output_ = AddOutput(TensorType_INT32);
+
+    // Set up and pass in custom options using flexbuffer.
+    flexbuffers::Builder fbb;
+    fbb.Map([&]() {
+      // Parameters to initialize FFT state.
+      fbb.Int("sample_rate", 1000);
+      fbb.Int("window_size", 25);
+      fbb.Int("window_step", 10);
+      fbb.Int("num_channels", 2);
+      fbb.Float("upper_band_limit", 450.0);
+      fbb.Float("lower_band_limit", 8.0);
+      fbb.Int("smoothing_bits", 10);
+      fbb.Float("even_smoothing", 0.025);
+      fbb.Float("odd_smoothing", 0.06);
+      fbb.Float("min_signal_remaining", 0.05);
+      fbb.Bool("enable_pcan", true);
+      fbb.Float("pcan_strength", 0.95);
+      fbb.Float("pcan_offset", 80.0);
+      fbb.Int("gain_bits", 21);
+      fbb.Bool("enable_log", true);
+      fbb.Int("scale_shift", 6);
+
+      // Parameters for micro frontend.
+      fbb.Int("left_context", n_left_context);
+      fbb.Int("right_context", n_right_context);
+      fbb.Int("frame_stride", n_frame_stride);
+      fbb.Bool("zero_padding", true);
+      fbb.Int("out_scale", 1);
+      fbb.Bool("out_float", false);
+    });
+    fbb.Finish();
+    SetCustomOp("MICRO_FRONTEND", fbb.GetBuffer(),
+                Register_AUDIO_MICROFRONTEND);
+    BuildInterpreter(input_shapes);
+  }
+
+  void SetInput(const std::vector<int16_t>& data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<int> GetOutput() { return ExtractVector<int>(output_); }
+
+  int num_inputs() { return n_input_; }
+  int num_frmes() { return n_frame_; }
+  int num_frequency_per_frame() { return n_frequency_per_frame_; }
+  int num_left_context() { return n_left_context_; }
+  int num_right_context() { return n_right_context_; }
+  int num_frame_stride() { return n_frame_stride_; }
+
+ protected:
+  int input_;
+  int output_;
+  int n_input_;
+  int n_frame_;
+  int n_frequency_per_frame_;
+  int n_left_context_;
+  int n_right_context_;
+  int n_frame_stride_;
+};
+
+class BaseMicroFrontendTest : public ::testing::Test {
+ protected:
+  // Micro frontend input.
+  std::vector<int16_t> micro_frontend_input_;
+
+  // Compares output up to tolerance to the result of the micro_frontend given
+  // the input.
+  void VerifyGoldens(const std::vector<int16_t>& input,
+                     const std::vector<std::vector<int>>& output,
+                     MicroFrontendOpModel* micro_frontend,
+                     float tolerance = 1e-5) {
+    // Dimensionality check.
+    const int num_inputs = micro_frontend->num_inputs();
+    EXPECT_GT(num_inputs, 0);
+
+    const int num_frames = micro_frontend->num_frmes();
+    EXPECT_GT(num_frames, 0);
+    EXPECT_EQ(num_frames, output.size());
+
+    const int num_frequency_per_frame =
+        micro_frontend->num_frequency_per_frame();
+    EXPECT_GT(num_frequency_per_frame, 0);
+    EXPECT_EQ(num_frequency_per_frame, output[0].size());
+
+    // Set up input.
+    micro_frontend->SetInput(input);
+
+    // Call Invoke.
+    micro_frontend->Invoke();
+
+    // Mimic padding behaviour with zero_padding = true.
+    std::vector<int> output_flattened;
+    for (int anchor = 0; anchor < output.size();
+         anchor += micro_frontend->num_frame_stride()) {
+      for (int frame = anchor - micro_frontend->num_left_context();
+           frame <= anchor + micro_frontend->num_right_context(); ++frame) {
+        if (frame < 0 || frame >= output.size()) {
+          // Padding with zeros.
+          for (int j = 0; j < num_frequency_per_frame; ++j) {
+            output_flattened.push_back(0.0);
+          }
+        } else {
+          // Copy real output.
+          for (auto data_point : output[frame]) {
+            output_flattened.push_back(data_point);
+          }
+        }
+      }
+    }
+
+    // Validate result.
+    EXPECT_THAT(micro_frontend->GetOutput(),
+                ElementsAreArray(output_flattened));
+  }
+};  // namespace
+
+class TwoConsecutive36InputsMicroFrontendTest : public BaseMicroFrontendTest {
+  void SetUp() override {
+    micro_frontend_input_ = {
+        0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+        0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+        0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
+  }
+};
+
+TEST_F(TwoConsecutive36InputsMicroFrontendTest, MicroFrontendBlackBoxTest) {
+  const int n_input = 36;
+  const int n_frame = 2;
+  const int n_frequency_per_frame = 2;
+
+  MicroFrontendOpModel micro_frontend(n_input, n_frame, n_frequency_per_frame,
+                                      1, 1, 1,
+                                      {
+                                          {n_input},
+                                      });
+
+  // Verify the final output.
+  const std::vector<std::vector<int>> micro_frontend_golden_output = {
+      {479, 425}, {436, 378}};
+  VerifyGoldens(micro_frontend_input_, micro_frontend_golden_output,
+                &micro_frontend);
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/BUILD b/tensorflow/lite/experimental/microfrontend/lib/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a055e52f71001295cf95dfcbe790bc4118140fed
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/BUILD
@@ -0,0 +1,188 @@
+# Library for generating feature vectors from audio data
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "bits",
+    hdrs = ["bits.h"],
+)
+
+cc_library(
+    name = "fft",
+    srcs = [
+        "fft.c",
+        "fft_util.c",
+    ],
+    hdrs = [
+        "fft.h",
+        "fft_util.h",
+    ],
+    deps = ["@kissfft//:kiss_fftr_16"],
+)
+
+cc_library(
+    name = "filterbank",
+    srcs = [
+        "filterbank.c",
+        "filterbank_util.c",
+    ],
+    hdrs = [
+        "filterbank.h",
+        "filterbank_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":fft",
+    ],
+)
+
+cc_library(
+    name = "frontend",
+    srcs = [
+        "frontend.c",
+        "frontend_util.c",
+    ],
+    hdrs = [
+        "frontend.h",
+        "frontend_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":fft",
+        ":filterbank",
+        ":log_scale",
+        ":noise_reduction",
+        ":pcan_gain_control",
+        ":window",
+    ],
+)
+
+cc_library(
+    name = "log_scale",
+    srcs = [
+        "log_lut.c",
+        "log_scale.c",
+        "log_scale_util.c",
+    ],
+    hdrs = [
+        "log_lut.h",
+        "log_scale.h",
+        "log_scale_util.h",
+    ],
+    deps = [
+        ":bits",
+    ],
+)
+
+cc_library(
+    name = "noise_reduction",
+    srcs = [
+        "noise_reduction.c",
+        "noise_reduction_util.c",
+    ],
+    hdrs = [
+        "noise_reduction.h",
+        "noise_reduction_util.h",
+    ],
+)
+
+cc_library(
+    name = "pcan_gain_control",
+    srcs = [
+        "pcan_gain_control.c",
+        "pcan_gain_control_util.c",
+    ],
+    hdrs = [
+        "pcan_gain_control.h",
+        "pcan_gain_control_util.h",
+    ],
+    deps = [
+        ":bits",
+    ],
+)
+
+cc_library(
+    name = "window",
+    srcs = [
+        "window.c",
+        "window_util.c",
+    ],
+    hdrs = [
+        "window.h",
+        "window_util.h",
+    ],
+)
+
+cc_test(
+    name = "fft_test",
+    size = "small",
+    srcs = ["fft_test.cc"],
+    deps = [
+        ":fft",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "filterbank_test",
+    size = "small",
+    srcs = ["filterbank_test.cc"],
+    deps = [
+        ":filterbank",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "frontend_test",
+    size = "small",
+    srcs = ["frontend_test.cc"],
+    deps = [
+        ":frontend",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "log_scale_test",
+    size = "small",
+    srcs = ["log_scale_test.cc"],
+    deps = [
+        ":log_scale",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "noise_reduction_test",
+    size = "small",
+    srcs = ["noise_reduction_test.cc"],
+    deps = [
+        ":noise_reduction",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "pcan_gain_control_test",
+    size = "small",
+    srcs = ["pcan_gain_control_test.cc"],
+    deps = [
+        ":pcan_gain_control",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "window_test",
+    size = "small",
+    srcs = ["window_test.cc"],
+    deps = [
+        ":window",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/experimental/microfrontend/lib/README b/tensorflow/lite/experimental/microfrontend/lib/README
new file mode 100644
index 0000000000000000000000000000000000000000..731d88c5bdaafe225d847a619192ba16cec7c25f
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/README
@@ -0,0 +1,9 @@
+The binary frontend_main shows sample usage of the frontend, printing out
+coefficients when it has processed enough data.
+
+The binary frontend_memmap_main shows a sample usage of how to avoid all the
+init code in your runtime, by first running "frontend_generate_memmap" to
+create a header/source file that uses a baked in frontend state. This command
+could be automated as part of your build process, or you can just use the output
+directly.
+
diff --git a/tensorflow/lite/experimental/microfrontend/lib/bits.h b/tensorflow/lite/experimental/microfrontend/lib/bits.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf15466a3d6484c3059a1ded1bb51e4d4287b1bf
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/bits.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_BITS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_BITS_H_
+
+#ifdef __cplusplus
+#include <cstdint>
+
+extern "C" {
+#endif
+
+static inline int CountLeadingZeros32Slow(uint64_t n) {
+  int zeroes = 28;
+  if (n >> 16) zeroes -= 16, n >>= 16;
+  if (n >> 8) zeroes -= 8, n >>= 8;
+  if (n >> 4) zeroes -= 4, n >>= 4;
+  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
+}
+
+static inline int CountLeadingZeros32(uint32_t n) {
+#if defined(_MSC_VER)
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if (_BitScanReverse(&result, n)) {
+    return 31 - result;
+  }
+  return 32;
+#elif defined(__GNUC__)
+
+  // Handle 0 as a special case because __builtin_clz(0) is undefined.
+  if (n == 0) {
+    return 32;
+  }
+  return __builtin_clz(n);
+#else
+  return CountLeadingZeros32Slow(n);
+#endif
+}
+
+static inline int MostSignificantBit32(uint32_t n) {
+  return 32 - CountLeadingZeros32(n);
+}
+
+static inline int CountLeadingZeros64Slow(uint64_t n) {
+  int zeroes = 60;
+  if (n >> 32) zeroes -= 32, n >>= 32;
+  if (n >> 16) zeroes -= 16, n >>= 16;
+  if (n >> 8) zeroes -= 8, n >>= 8;
+  if (n >> 4) zeroes -= 4, n >>= 4;
+  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
+}
+
+static inline int CountLeadingZeros64(uint64_t n) {
+#if defined(_MSC_VER) && defined(_M_X64)
+  // MSVC does not have __buitin_clzll. Use _BitScanReverse64.
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if (_BitScanReverse64(&result, n)) {
+    return 63 - result;
+  }
+  return 64;
+#elif defined(_MSC_VER)
+  // MSVC does not have __buitin_clzll. Compose two calls to _BitScanReverse
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
+    return 31 - result;
+  }
+  if (_BitScanReverse(&result, n)) {
+    return 63 - result;
+  }
+  return 64;
+#elif defined(__GNUC__)
+
+  // Handle 0 as a special case because __builtin_clzll(0) is undefined.
+  if (n == 0) {
+    return 64;
+  }
+  return __builtin_clzll(n);
+#else
+  return CountLeadingZeros64Slow(n);
+#endif
+}
+
+static inline int MostSignificantBit64(uint64_t n) {
+  return 64 - CountLeadingZeros64(n);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_BITS_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft.c b/tensorflow/lite/experimental/microfrontend/lib/fft.c
new file mode 100644
index 0000000000000000000000000000000000000000..c1dd62fb7d4254f3a8f8941fb1dfa95dc0fdb5ba
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft.c
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
+
+#include <string.h>
+
+#define FIXED_POINT 16
+#include "kiss_fft.h"
+// Internal test dependency placeholder1
+// Internal test dependency placeholder2
+#include "tools/kiss_fftr.h"
+// Internal test dependency placeholder3
+
+void FftCompute(struct FftState* state, const int16_t* input,
+                int input_scale_shift) {
+  const size_t input_size = state->input_size;
+  const size_t fft_size = state->fft_size;
+
+  int16_t* fft_input = state->input;
+  // First, scale the input by the given shift.
+  int i;
+  for (i = 0; i < input_size; ++i) {
+    *fft_input++ = (*input++) << input_scale_shift;
+  }
+  // Zero out whatever else remains in the top part of the input.
+  for (; i < fft_size; ++i) {
+    *fft_input++ = 0;
+  }
+
+  // Apply the FFT.
+  kiss_fftr((const kiss_fftr_cfg)state->scratch, state->input,
+            (kiss_fft_cpx*)state->output);
+}
+
+void FftInit(struct FftState* state) {
+  // All the initialization is done in FftPopulateState()
+}
+
+void FftReset(struct FftState* state) {
+  memset(state->input, 0, state->fft_size * sizeof(*state->input));
+  memset(state->output, 0, (state->fft_size / 2 + 1) * sizeof(*state->output));
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft.h b/tensorflow/lite/experimental/microfrontend/lib/fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..aaffa69debb17d1614edd3378c452c29dd16f079
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft.h
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct complex_int16_t {
+  int16_t real;
+  int16_t imag;
+};
+
+struct FftState {
+  int16_t* input;
+  struct complex_int16_t* output;
+  size_t fft_size;
+  size_t input_size;
+  void* scratch;
+  size_t scratch_size;
+};
+
+void FftCompute(struct FftState* state, const int16_t* input,
+                int input_scale_shift);
+
+void FftInit(struct FftState* state);
+
+void FftReset(struct FftState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft_io.c b/tensorflow/lite/experimental/microfrontend/lib/fft_io.c
new file mode 100644
index 0000000000000000000000000000000000000000..b01a8848e9d5cb07ae7fcee7646e84480f43a279
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft_io.c
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/fft_io.h"
+
+void FftWriteMemmapPreamble(FILE* fp, const struct FftState* state) {
+  fprintf(fp, "static int16_t fft_input[%zu];\n", state->fft_size);
+  fprintf(fp, "static struct complex_int16_t fft_output[%zu];\n",
+          state->fft_size / 2 + 1);
+  fprintf(fp, "static char fft_scratch[%zu];\n", state->scratch_size);
+  fprintf(fp, "\n");
+}
+
+void FftWriteMemmap(FILE* fp, const struct FftState* state,
+                       const char* variable) {
+  fprintf(fp, "%s->input = fft_input;\n", variable);
+  fprintf(fp, "%s->output = fft_output;\n", variable);
+  fprintf(fp, "%s->fft_size = %zu;\n", variable, state->fft_size);
+  fprintf(fp, "%s->input_size = %zu;\n", variable, state->input_size);
+  fprintf(fp, "%s->scratch = fft_scratch;\n", variable);
+  fprintf(fp, "%s->scratch_size = %zu;\n", variable, state->scratch_size);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft_io.h b/tensorflow/lite/experimental/microfrontend/lib/fft_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a59af68266381f25e1cc9b9bd72e04acdcfa60a
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft_io.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_IO_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_IO_H_
+
+#include <stdio.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void FftWriteMemmapPreamble(FILE* fp, const struct FftState* state);
+void FftWriteMemmap(FILE* fp, const struct FftState* state,
+                    const char* variable);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_IO_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c1ee2d852201cc52a53ae07bf6e00ebf6f1ab47
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+const int16_t kFakeWindow[] = {
+    0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
+    0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
+const int kScaleShift = 0;
+
+TEST(FftTest, CheckOutputValues) {
+  struct FftState state;
+  ASSERT_TRUE(
+      FftPopulateState(&state, sizeof(kFakeWindow) / sizeof(kFakeWindow[0])));
+
+  FftInit(&state);
+  FftCompute(&state, kFakeWindow, kScaleShift);
+
+  const struct complex_int16_t expected[] = {
+      {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
+      {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
+      {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
+  ASSERT_EQ(state.fft_size / 2 + 1, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i <= state.fft_size / 2; ++i) {
+    EXPECT_EQ(state.output[i].real, expected[i].real);
+    EXPECT_EQ(state.output[i].imag, expected[i].imag);
+  }
+
+  FftFreeStateContents(&state);
+}
+
+}  // namespace
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft_util.c b/tensorflow/lite/experimental/microfrontend/lib/fft_util.c
new file mode 100644
index 0000000000000000000000000000000000000000..40cb9f87358087159d031613311077542aa77fa4
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft_util.c
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
+
+#include <stdio.h>
+
+#define FIXED_POINT 16
+#include "kiss_fft.h"
+#include "tools/kiss_fftr.h"
+
+int FftPopulateState(struct FftState* state, size_t input_size) {
+  state->input_size = input_size;
+  state->fft_size = 1;
+  while (state->fft_size < state->input_size) {
+    state->fft_size <<= 1;
+  }
+
+  state->input = malloc(state->fft_size * sizeof(*state->input));
+  if (state->input == NULL) {
+    fprintf(stderr, "Failed to alloc fft input buffer\n");
+    return 0;
+  }
+
+  state->output =
+      malloc((state->fft_size / 2 + 1) * sizeof(*state->output) * 2);
+  if (state->output == NULL) {
+    fprintf(stderr, "Failed to alloc fft output buffer\n");
+    return 0;
+  }
+
+  // Ask kissfft how much memory it wants.
+  size_t scratch_size = 0;
+  kiss_fftr_cfg kfft_cfg =
+      kiss_fftr_alloc(state->fft_size, 0, NULL, &scratch_size);
+  if (kfft_cfg != NULL) {
+    fprintf(stderr, "Kiss memory sizing failed.\n");
+    return 0;
+  }
+  state->scratch = malloc(scratch_size);
+  if (state->scratch == NULL) {
+    fprintf(stderr, "Failed to alloc fft scratch buffer\n");
+    return 0;
+  }
+  state->scratch_size = scratch_size;
+  // Let kissfft configure the scratch space we just allocated
+  kfft_cfg = kiss_fftr_alloc(state->fft_size, 0, state->scratch, &scratch_size);
+  if (kfft_cfg != state->scratch) {
+    fprintf(stderr, "Kiss memory preallocation strategy failed.\n");
+    return 0;
+  }
+  return 1;
+}
+
+void FftFreeStateContents(struct FftState* state) {
+  free(state->input);
+  free(state->output);
+  free(state->scratch);
+}
+
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft_util.h b/tensorflow/lite/experimental/microfrontend/lib/fft_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a471301c3f0a548e3032ced65639832d6f03b02
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_UTIL_H_
+
+#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Prepares and FFT for the given input size.
+int FftPopulateState(struct FftState* state, size_t input_size);
+
+// Frees any allocated buffers.
+void FftFreeStateContents(struct FftState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FFT_UTIL_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank.c b/tensorflow/lite/experimental/microfrontend/lib/filterbank.c
new file mode 100644
index 0000000000000000000000000000000000000000..22cfaf78ab4662c99f5548e59ce017d1f37d8208
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank.c
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
+
+#include <string.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
+
+void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
+                                         struct complex_int16_t* fft_output,
+                                         int32_t* energy) {
+  const int end_index = state->end_index;
+  int i;
+  energy += state->start_index;
+  fft_output += state->start_index;
+  for (i = state->start_index; i < end_index; ++i) {
+    const int32_t real = fft_output->real;
+    const int32_t imag = fft_output->imag;
+    fft_output++;
+    const uint32_t mag_squared = (real * real) + (imag * imag);
+    *energy++ = mag_squared;
+  }
+}
+
+void FilterbankAccumulateChannels(struct FilterbankState* state,
+                                  const int32_t* energy) {
+  uint64_t* work = state->work;
+  uint64_t weight_accumulator = 0;
+  uint64_t unweight_accumulator = 0;
+
+  const int16_t* channel_frequency_starts = state->channel_frequency_starts;
+  const int16_t* channel_weight_starts = state->channel_weight_starts;
+  const int16_t* channel_widths = state->channel_widths;
+
+  int num_channels_plus_1 = state->num_channels + 1;
+  int i;
+  for (i = 0; i < num_channels_plus_1; ++i) {
+    const int32_t* magnitudes = energy + *channel_frequency_starts++;
+    const int16_t* weights = state->weights + *channel_weight_starts;
+    const int16_t* unweights = state->unweights + *channel_weight_starts++;
+    const int width = *channel_widths++;
+    int j;
+    for (j = 0; j < width; ++j) {
+      weight_accumulator += *weights++ * ((uint64_t) *magnitudes);
+      unweight_accumulator += *unweights++ * ((uint64_t) *magnitudes);
+      ++magnitudes;
+    }
+    *work++ = weight_accumulator;
+    weight_accumulator = unweight_accumulator;
+    unweight_accumulator = 0;
+  }
+}
+
+static uint16_t Sqrt32(uint32_t num) {
+  if (num == 0) {
+    return 0;
+  }
+  uint32_t res = 0;
+  int max_bit_number = 32 - MostSignificantBit32(num);
+  max_bit_number |= 1;
+  uint32_t bit = 1U << (31 - max_bit_number);
+  int iterations = (31 - max_bit_number) / 2 + 1;
+  while (iterations--) {
+    if (num >= res + bit) {
+      num -= res + bit;
+      res = (res >> 1U) + bit;
+    } else {
+      res >>= 1U;
+    }
+    bit >>= 2U;
+  }
+  // Do rounding - if we have the bits.
+  if (num > res && res != 0xFFFF) {
+    ++res;
+  }
+  return res;
+}
+
+static uint32_t Sqrt64(uint64_t num) {
+  // Take a shortcut and just use 32 bit operations if the upper word is all
+  // clear. This will cause a slight off by one issue for numbers close to 2^32,
+  // but it probably isn't going to matter (and gives us a big performance win).
+  if ((num >> 32) == 0) {
+    return Sqrt32((uint32_t) num);
+  }
+  uint64_t res = 0;
+  int max_bit_number = 64 - MostSignificantBit64(num);
+  max_bit_number |= 1;
+  uint64_t bit = 1ULL << (63 - max_bit_number);
+  int iterations = (63 - max_bit_number) / 2 + 1;
+  while (iterations--) {
+    if (num >= res + bit) {
+      num -= res + bit;
+      res = (res >> 1U) + bit;
+    } else {
+      res >>= 1U;
+    }
+    bit >>= 2U;
+  }
+  // Do rounding - if we have the bits.
+  if (num > res && res != 0xFFFFFFFFLL) {
+    ++res;
+  }
+  return res;
+}
+
+uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) {
+  const int num_channels = state->num_channels;
+  const int64_t* work = state->work + 1;
+  // Reuse the work buffer since we're fine clobbering it at this point to hold
+  // the output.
+  uint32_t* output = (uint32_t*) state->work;
+  int i;
+  for (i = 0; i < num_channels; ++i) {
+    *output++ = Sqrt64(*work++) >> scale_down_shift;
+  }
+  return (uint32_t*) state->work;
+}
+
+void FilterbankReset(struct FilterbankState* state) {
+  memset(state->work, 0, (state->num_channels + 1) * sizeof(*state->work));
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank.h b/tensorflow/lite/experimental/microfrontend/lib/filterbank.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e6d3885f2c2273ed8bf718e0c1148b8714cbfcf
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank.h
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
+
+#define kFilterbankBits 12
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct FilterbankState {
+  int num_channels;
+  int start_index;
+  int end_index;
+  int16_t* channel_frequency_starts;
+  int16_t* channel_weight_starts;
+  int16_t* channel_widths;
+  int16_t* weights;
+  int16_t* unweights;
+  uint64_t* work;
+};
+
+// Converts the relevant complex values of an FFT output into energy (the
+// square magnitude).
+void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
+                                         struct complex_int16_t* fft_output,
+                                         int32_t* energy);
+
+// Computes the mel-scale filterbank on the given energy array. Output is cached
+// internally - to fetch it, you need to call FilterbankSqrt.
+void FilterbankAccumulateChannels(struct FilterbankState* state,
+                                  const int32_t* energy);
+
+// Applies an integer square root to the 64 bit intermediate values of the
+// filterbank, and returns a pointer to them. Memory will be invalidated the
+// next time FilterbankAccumulateChannels is called.
+uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift);
+
+void FilterbankReset(struct FilterbankState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c b/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c
new file mode 100644
index 0000000000000000000000000000000000000000..2dbb4b3bf09654df3be0165f14c6f3da742268f1
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank_io.h"
+
+static void PrintArray(FILE* fp, const char* name, const int16_t* values,
+                       size_t size) {
+  fprintf(fp, "static int16_t filterbank_%s[] = {", name);
+  for (int i = 0; i < size; ++i) {
+    fprintf(fp, "%d", values[i]);
+    if (i < size - 1) {
+      fprintf(fp, ", ");
+    }
+  }
+  fprintf(fp, "};\n");
+}
+
+void FilterbankWriteMemmapPreamble(FILE* fp,
+                                   const struct FilterbankState* state) {
+  const int num_channels_plus_1 = state->num_channels + 1;
+
+  PrintArray(fp, "channel_frequency_starts", state->channel_frequency_starts,
+             num_channels_plus_1);
+  PrintArray(fp, "channel_weight_starts", state->channel_weight_starts,
+             num_channels_plus_1);
+  PrintArray(fp, "channel_widths", state->channel_widths, num_channels_plus_1);
+  int num_weights = 0;
+  int i;
+  for (i = 0; i < num_channels_plus_1; ++i) {
+    num_weights += state->channel_widths[i];
+  }
+  PrintArray(fp, "weights", state->weights, num_weights);
+  PrintArray(fp, "unweights", state->unweights, num_weights);
+
+  fprintf(fp, "static uint64_t filterbank_work[%d];\n", num_channels_plus_1);
+  fprintf(fp, "\n");
+}
+
+void FilterbankWriteMemmap(FILE* fp, const struct FilterbankState* state,
+                           const char* variable) {
+  fprintf(fp, "%s->num_channels = %d;\n", variable, state->num_channels);
+  fprintf(fp, "%s->start_index = %d;\n", variable, state->start_index);
+  fprintf(fp, "%s->end_index = %d;\n", variable, state->end_index);
+
+  fprintf(
+      fp,
+      "%s->channel_frequency_starts = filterbank_channel_frequency_starts;\n",
+      variable);
+  fprintf(fp, "%s->channel_weight_starts = filterbank_channel_weight_starts;\n",
+          variable);
+  fprintf(fp, "%s->channel_widths = filterbank_channel_widths;\n", variable);
+  fprintf(fp, "%s->weights = filterbank_weights;\n", variable);
+  fprintf(fp, "%s->unweights = filterbank_unweights;\n", variable);
+  fprintf(fp, "%s->work = filterbank_work;\n", variable);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.h b/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..5fc96845897c566dd62271a84a936638562784e2
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.h
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_IO_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_IO_H_
+
+#include <stdio.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void FilterbankWriteMemmapPreamble(FILE* fp,
+                                   const struct FilterbankState* state);
+void FilterbankWriteMemmap(FILE* fp, const struct FilterbankState* state,
+                           const char* variable);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_IO_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..808d527186eaa920a9eb5319b328b96de6047174
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
@@ -0,0 +1,194 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h"
+
+#include <cstring>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kSpectrumSize = 17;
+const int kStartIndex = 1;
+const int kEndIndex = 15;
+const int32_t kEnergy[] = {-1,     181,      400,      181,      625,    28322,
+                           786769, 18000000, 40972801, 18000000, 784996, 28085,
+                           625,    181,      361,      -1,       -1};
+const uint64_t kWork[] = {1835887, 61162970173, 258694800000};
+const int kScaleShift = 0;
+
+// Test filterbank generation using scaled-down defaults.
+class FilterbankTest : public ::testing::Test {
+ protected:
+  FilterbankTest() {
+    config_.num_channels = 2;
+    config_.lower_band_limit = 8.0;
+    config_.upper_band_limit = 450.0;
+  }
+
+  struct FilterbankConfig config_;
+};
+
+TEST_F(FilterbankTest, CheckStartIndex) {
+  struct FilterbankState state;
+  ASSERT_TRUE(
+      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+
+  EXPECT_EQ(state.start_index, kStartIndex);
+
+  FilterbankFreeStateContents(&state);
+}
+
+TEST_F(FilterbankTest, CheckEndIndex) {
+  struct FilterbankState state;
+  ASSERT_TRUE(
+      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+
+  EXPECT_EQ(state.end_index, kEndIndex);
+
+  FilterbankFreeStateContents(&state);
+}
+
+TEST_F(FilterbankTest, CheckChannelFrequencyStarts) {
+  struct FilterbankState state;
+  ASSERT_TRUE(
+      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 4, 8};
+  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i <= state.num_channels; ++i) {
+    EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
+  }
+
+  FilterbankFreeStateContents(&state);
+}
+
+TEST_F(FilterbankTest, CheckChannelWeightStarts) {
+  struct FilterbankState state;
+  ASSERT_TRUE(
+      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 8, 16};
+  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i <= state.num_channels; ++i) {
+    EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
+  }
+
+  FilterbankFreeStateContents(&state);
+}
+
+TEST_F(FilterbankTest, CheckChannelWidths) {
+  struct FilterbankState state;
+  ASSERT_TRUE(
+      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {8, 8, 8};
+  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i <= state.num_channels; ++i) {
+    EXPECT_EQ(state.channel_widths[i], expected[i]);
+  }
+
+  FilterbankFreeStateContents(&state);
+}
+
+TEST_F(FilterbankTest, CheckWeights) {
+  struct FilterbankState state;
+  ASSERT_TRUE(
+      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 3277, 2217, 1200, 222,  0,   0,   0,
+                              0, 3376, 2468, 1591, 744,  0,   0,   0,
+                              0, 4020, 3226, 2456, 1708, 983, 277, 0};
+  ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
+                state.channel_widths[state.num_channels],
+            sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    EXPECT_EQ(state.weights[i], expected[i]);
+  }
+
+  FilterbankFreeStateContents(&state);
+}
+
+TEST_F(FilterbankTest, CheckUnweights) {
+  struct FilterbankState state;
+  ASSERT_TRUE(
+      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 819, 1879, 2896, 3874, 0,    0,    0,
+                              0, 720, 1628, 2505, 3352, 0,    0,    0,
+                              0, 76,  870,  1640, 2388, 3113, 3819, 0};
+  ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
+                state.channel_widths[state.num_channels],
+            sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    EXPECT_EQ(state.unweights[i], expected[i]);
+  }
+
+  FilterbankFreeStateContents(&state);
+}
+
+TEST_F(FilterbankTest, CheckConvertFftComplexToEnergy) {
+  struct FilterbankState state;
+  state.start_index = kStartIndex;
+  state.end_index = kEndIndex;
+
+  struct complex_int16_t fake_fft[] = {
+      {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
+      {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
+      {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
+  int32_t* energy = reinterpret_cast<int32_t*>(fake_fft);
+  FilterbankConvertFftComplexToEnergy(&state, fake_fft, energy);
+
+  for (int i = state.start_index; i < state.end_index; ++i) {
+    EXPECT_EQ(energy[i], kEnergy[i]);
+  }
+}
+
+TEST_F(FilterbankTest, CheckAccumulateChannels) {
+  struct FilterbankState state;
+  ASSERT_TRUE(
+      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+
+  FilterbankAccumulateChannels(&state, kEnergy);
+
+  ASSERT_EQ(state.num_channels + 1, sizeof(kWork) / sizeof(kWork[0]));
+  for (int i = 0; i <= state.num_channels; ++i) {
+    EXPECT_EQ(state.work[i], kWork[i]);
+  }
+
+  FilterbankFreeStateContents(&state);
+}
+
+TEST_F(FilterbankTest, CheckSqrt) {
+  struct FilterbankState state;
+  ASSERT_TRUE(
+      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  std::memcpy(state.work, kWork, sizeof(kWork));
+
+  uint32_t* scaled_filterbank = FilterbankSqrt(&state, kScaleShift);
+
+  const uint32_t expected[] = {247311, 508620};
+  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < state.num_channels; ++i) {
+    EXPECT_EQ(scaled_filterbank[i], expected[i]);
+  }
+
+  FilterbankFreeStateContents(&state);
+}
+
+}  // namespace
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c b/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c
new file mode 100644
index 0000000000000000000000000000000000000000..ce8b4acc0f696f9c4123bab9daeb1e8802c3e828
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.c
@@ -0,0 +1,225 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#define kFilterbankIndexAlignment 4
+#define kFilterbankChannelBlockSize 4
+
+void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config) {
+  config->num_channels = 32;
+  config->lower_band_limit = 125.0f;
+  config->upper_band_limit = 7500.0f;
+  config->output_scale_shift = 7;
+}
+
+static float FreqToMel(float freq) {
+  return 1127.0 * log(1.0 + (freq / 700.0));
+}
+
+static void CalculateCenterFrequencies(const int num_channels,
+                                       const float lower_frequency_limit,
+                                       const float upper_frequency_limit,
+                                       float* center_frequencies) {
+  assert(lower_frequency_limit >= 0.0f);
+  assert(upper_frequency_limit > lower_frequency_limit);
+
+  const float mel_low = FreqToMel(lower_frequency_limit);
+  const float mel_hi = FreqToMel(upper_frequency_limit);
+  const float mel_span = mel_hi - mel_low;
+  const float mel_spacing = mel_span / ((float) num_channels);
+  int i;
+  for (i = 0; i < num_channels; ++i) {
+    center_frequencies[i] = mel_low + (mel_spacing * (i + 1));
+  }
+}
+
+static void QuantizeFilterbankWeights(const float float_weight,
+                                      int16_t* weight, int16_t* unweight) {
+  *weight = floor(float_weight * (1 << kFilterbankBits) + 0.5);
+  *unweight = floor((1.0 - float_weight) * (1 << kFilterbankBits) + 0.5);
+}
+
+int FilterbankPopulateState(const struct FilterbankConfig* config,
+                            struct FilterbankState* state,
+                            int sample_rate, int spectrum_size) {
+  state->num_channels = config->num_channels;
+  const int num_channels_plus_1 = config->num_channels + 1;
+
+  // How should we align things to index counts given the byte alignment?
+  const int index_alignment =
+      (kFilterbankIndexAlignment < sizeof(int16_t)
+           ? 1
+           : kFilterbankIndexAlignment / sizeof(int16_t));
+
+  state->channel_frequency_starts =
+      malloc(num_channels_plus_1 * sizeof(*state->channel_frequency_starts));
+  state->channel_weight_starts =
+      malloc(num_channels_plus_1 * sizeof(*state->channel_weight_starts));
+  state->channel_widths =
+      malloc(num_channels_plus_1 * sizeof(*state->channel_widths));
+  state->work = malloc(num_channels_plus_1 * sizeof(*state->work));
+
+  float* center_mel_freqs =
+      malloc(num_channels_plus_1 * sizeof(*center_mel_freqs));
+  int16_t* actual_channel_starts =
+      malloc(num_channels_plus_1 * sizeof(*actual_channel_starts));
+  int16_t* actual_channel_widths =
+      malloc(num_channels_plus_1 * sizeof(*actual_channel_widths));
+
+  if (state->channel_frequency_starts == NULL ||
+      state->channel_weight_starts == NULL ||
+      state->channel_widths == NULL ||
+      center_mel_freqs == NULL ||
+      actual_channel_starts == NULL ||
+      actual_channel_widths == NULL) {
+    free(center_mel_freqs);
+    free(actual_channel_starts);
+    free(actual_channel_widths);
+    fprintf(stderr, "Failed to allocate channel buffers\n");
+    return 0;
+  }
+
+  CalculateCenterFrequencies(num_channels_plus_1, config->lower_band_limit,
+                             config->upper_band_limit, center_mel_freqs);
+
+  // Always exclude DC.
+  const float hz_per_sbin = 0.5 * sample_rate / ((float) spectrum_size - 1);
+  state->start_index = 1.5 + config->lower_band_limit / hz_per_sbin;
+  state->end_index = 0;  // Initialized to zero here, but actually set below.
+
+  // For each channel, we need to figure out what frequencies belong to it, and
+  // how much padding we need to add so that we can efficiently multiply the
+  // weights and unweights for accumulation. To simplify the multiplication
+  // logic, all channels will have some multiplication to do (even if there are
+  // no frequencies that accumulate to that channel) - they will be directed to
+  // a set of zero weights.
+  int chan_freq_index_start = state->start_index;
+  int weight_index_start = 0;
+  int needs_zeros = 0;
+
+  int chan;
+  for (chan = 0; chan < num_channels_plus_1; ++chan) {
+    // Keep jumping frequencies until we overshoot the bound on this channel.
+    int freq_index = chan_freq_index_start;
+    while (FreqToMel((freq_index) * hz_per_sbin) <= center_mel_freqs[chan]) {
+      ++freq_index;
+    }
+
+    const int width = freq_index - chan_freq_index_start;
+    actual_channel_starts[chan] = chan_freq_index_start;
+    actual_channel_widths[chan] = width;
+
+    if (width == 0) {
+      // This channel doesn't actually get anything from the frequencies, it's
+      // always zero. We need then to insert some 'zero' weights into the
+      // output, and just redirect this channel to do a single multiplication at
+      // this point. For simplicity, the zeros are placed at the beginning of
+      // the weights arrays, so we have to go and update all the other
+      // weight_starts to reflect this shift (but only once).
+      state->channel_frequency_starts[chan] = 0;
+      state->channel_weight_starts[chan] = 0;
+      state->channel_widths[chan] = kFilterbankChannelBlockSize;
+      if (!needs_zeros) {
+        needs_zeros = 1;
+        int j;
+        for (j = 0; j < chan; ++j) {
+          state->channel_weight_starts[j] += kFilterbankChannelBlockSize;
+        }
+        weight_index_start += kFilterbankChannelBlockSize;
+      }
+    } else {
+      // How far back do we need to go to ensure that we have the proper
+      // alignment?
+      const int aligned_start =
+          (chan_freq_index_start / index_alignment) * index_alignment;
+      const int aligned_width =
+          (chan_freq_index_start - aligned_start + width);
+      const int padded_width =
+          (((aligned_width - 1) / kFilterbankChannelBlockSize) + 1) *
+          kFilterbankChannelBlockSize;
+
+      state->channel_frequency_starts[chan] = aligned_start;
+      state->channel_weight_starts[chan] = weight_index_start;
+      state->channel_widths[chan] = padded_width;
+      weight_index_start += padded_width;
+    }
+    chan_freq_index_start = freq_index;
+  }
+
+  // Allocate the two arrays to store the weights - weight_index_start contains
+  // the index of what would be the next set of weights that we would need to
+  // add, so that's how many weights we need to allocate.
+  state->weights = calloc(weight_index_start, sizeof(*state->weights));
+  state->unweights = calloc(weight_index_start, sizeof(*state->unweights));
+
+  // If the alloc failed, we also need to nuke the arrays.
+  if (state->weights == NULL || state->unweights == NULL) {
+    free(center_mel_freqs);
+    free(actual_channel_starts);
+    free(actual_channel_widths);
+    fprintf(stderr, "Failed to allocate weights or unweights\n");
+    return 0;
+  }
+
+  // Next pass, compute all the weights. Since everything has been memset to
+  // zero, we only need to fill in the weights that correspond to some frequency
+  // for a channel.
+  const float mel_low = FreqToMel(config->lower_band_limit);
+  for (chan = 0; chan < num_channels_plus_1; ++chan) {
+    int frequency = actual_channel_starts[chan];
+    const int num_frequencies = actual_channel_widths[chan];
+    const int frequency_offset =
+        frequency - state->channel_frequency_starts[chan];
+    const int weight_start = state->channel_weight_starts[chan];
+    const float denom_val = (chan == 0) ? mel_low : center_mel_freqs[chan - 1];
+
+    int j;
+    for (j = 0; j < num_frequencies; ++j, ++frequency) {
+      const float weight =
+          (center_mel_freqs[chan] - FreqToMel(frequency * hz_per_sbin)) /
+          (center_mel_freqs[chan] - denom_val);
+
+      // Make the float into an integer for the weights (and unweights).
+      const int weight_index = weight_start + frequency_offset + j;
+      QuantizeFilterbankWeights(weight, state->weights + weight_index,
+                                state->unweights + weight_index);
+    }
+    if (frequency > state->end_index) {
+      state->end_index = frequency;
+    }
+  }
+
+  free(center_mel_freqs);
+  free(actual_channel_starts);
+  free(actual_channel_widths);
+  if (state->end_index >= spectrum_size) {
+    fprintf(stderr, "Filterbank end_index is above spectrum size.\n");
+    return 0;
+  }
+  return 1;
+}
+
+void FilterbankFreeStateContents(struct FilterbankState* state) {
+  free(state->channel_frequency_starts);
+  free(state->channel_weight_starts);
+  free(state->channel_widths);
+  free(state->weights);
+  free(state->unweights);
+  free(state->work);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h b/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..781d102479b4283111108196b7117f33fde07092
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_UTIL_H_
+
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct FilterbankConfig {
+  // number of frequency channel buckets for filterbank
+  int num_channels;
+  // maximum frequency to include
+  float upper_band_limit;
+  // minimum frequency to include
+  float lower_band_limit;
+  // unused
+  int output_scale_shift;
+};
+
+// Fills the frontendConfig with "sane" defaults.
+void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config);
+
+// Allocates any buffers.
+int FilterbankPopulateState(const struct FilterbankConfig* config,
+                            struct FilterbankState* state, int sample_rate,
+                            int spectrum_size);
+
+// Frees any allocated buffers.
+void FilterbankFreeStateContents(struct FilterbankState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FILTERBANK_UTIL_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend.c b/tensorflow/lite/experimental/microfrontend/lib/frontend.c
new file mode 100644
index 0000000000000000000000000000000000000000..7a618d9af5e79792970abc506e152a85b898a758
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend.c
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
+
+#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
+
+struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
+                                             const int16_t* samples,
+                                             size_t num_samples,
+                                             size_t* num_samples_read) {
+  struct FrontendOutput output;
+  output.values = NULL;
+  output.size = 0;
+
+  // Try to apply the window - if it fails, return and wait for more data.
+  if (!WindowProcessSamples(&state->window, samples, num_samples,
+                            num_samples_read)) {
+    return output;
+  }
+
+  // Apply the FFT to the window's output (and scale it so that the fixed point
+  // FFT can have as much resolution as possible).
+  int input_shift =
+      15 - MostSignificantBit32(state->window.max_abs_output_value);
+  FftCompute(&state->fft, state->window.output, input_shift);
+
+  // We can re-ruse the fft's output buffer to hold the energy.
+  int32_t* energy = (int32_t*) state->fft.output;
+
+  FilterbankConvertFftComplexToEnergy(&state->filterbank, state->fft.output,
+                                      energy);
+
+  FilterbankAccumulateChannels(&state->filterbank, energy);
+  uint32_t* scaled_filterbank = FilterbankSqrt(&state->filterbank, input_shift);
+
+  // Apply noise reduction.
+  NoiseReductionApply(&state->noise_reduction, scaled_filterbank);
+
+  if (state->pcan_gain_control.enable_pcan) {
+    PcanGainControlApply(&state->pcan_gain_control, scaled_filterbank);
+  }
+
+  // Apply the log and scale.
+  int correction_bits =
+      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
+  uint16_t* logged_filterbank =
+      LogScaleApply(&state->log_scale, scaled_filterbank,
+                    state->filterbank.num_channels, correction_bits);
+
+  output.size = state->filterbank.num_channels;
+  output.values = logged_filterbank;
+  return output;
+}
+
+void FrontendReset(struct FrontendState* state) {
+  WindowReset(&state->window);
+  FftReset(&state->fft);
+  FilterbankReset(&state->filterbank);
+  NoiseReductionReset(&state->noise_reduction);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend.h b/tensorflow/lite/experimental/microfrontend/lib/frontend.h
new file mode 100644
index 0000000000000000000000000000000000000000..883df5fd3d05c506b8cbbb8f8b24fa02d9ee7f3c
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend.h
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct FrontendState {
+  struct WindowState window;
+  struct FftState fft;
+  struct FilterbankState filterbank;
+  struct NoiseReductionState noise_reduction;
+  struct PcanGainControlState pcan_gain_control;
+  struct LogScaleState log_scale;
+};
+
+struct FrontendOutput {
+  const uint16_t* values;
+  size_t size;
+};
+
+// Main entry point to processing frontend samples. Updates num_samples_read to
+// contain the number of samples that have been consumed from the input array.
+// Returns a struct containing the generated output. If not enough samples were
+// added to generate a feature vector, the returned size will be 0 and the
+// values pointer will be NULL. Note that the output pointer will be invalidated
+// as soon as FrontendProcessSamples is called again, so copy the contents
+// elsewhere if you need to use them later.
+struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
+                                             const int16_t* samples,
+                                             size_t num_samples,
+                                             size_t* num_samples_read);
+
+void FrontendReset(struct FrontendState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_io.c b/tensorflow/lite/experimental/microfrontend/lib/frontend_io.c
new file mode 100644
index 0000000000000000000000000000000000000000..b422d078a6faaf65221805fd79d0e4e373a92d6a
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_io.c
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend_io.h"
+
+#include <stdio.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/fft_io.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank_io.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale_io.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_io.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/window_io.h"
+
+int WriteFrontendStateMemmap(const char* header, const char* source,
+                             const struct FrontendState* state) {
+  // Write a header that just has our init function.
+  FILE* fp = fopen(header, "w");
+  if (!fp) {
+    fprintf(stderr, "Failed to open header '%s' for write\n", header);
+    return 0;
+  }
+  fprintf(fp, "#ifndef FRONTEND_STATE_MEMMAP_H_\n");
+  fprintf(fp, "#define FRONTEND_STATE_MEMMAP_H_\n");
+  fprintf(fp, "\n");
+  fprintf(fp, "#include \"frontend.h\"\n");
+  fprintf(fp, "\n");
+  fprintf(fp, "struct FrontendState* GetFrontendStateMemmap();\n");
+  fprintf(fp, "\n");
+  fprintf(fp, "#endif  // FRONTEND_STATE_MEMMAP_H_\n");
+  fclose(fp);
+
+  // Write out the source file that actually has everything in it.
+  fp = fopen(source, "w");
+  if (!fp) {
+    fprintf(stderr, "Failed to open source '%s' for write\n", source);
+    return 0;
+  }
+  fprintf(fp, "#include \"%s\"\n", header);
+  fprintf(fp, "\n");
+  WindowWriteMemmapPreamble(fp, &state->window);
+  FftWriteMemmapPreamble(fp, &state->fft);
+  FilterbankWriteMemmapPreamble(fp, &state->filterbank);
+  NoiseReductionWriteMemmapPreamble(fp, &state->noise_reduction);
+  fprintf(fp, "static struct FrontendState state;\n");
+  fprintf(fp, "struct FrontendState* GetFrontendStateMemmap() {\n");
+  WindowWriteMemmap(fp, &state->window, "  (&state.window)");
+  FftWriteMemmap(fp, &state->fft, "  (&state.fft)");
+  FilterbankWriteMemmap(fp, &state->filterbank, "  (&state.filterbank)");
+  NoiseReductionWriteMemmap(fp, &state->noise_reduction,
+                            "  (&state.noise_reduction)");
+  LogScaleWriteMemmap(fp, &state->log_scale, "  (&state.log_scale)");
+  fprintf(fp, "  FftInit(&state.fft);\n");
+  fprintf(fp, "  FrontendReset(&state);\n");
+  fprintf(fp, "  return &state;\n");
+  fprintf(fp, "}\n");
+  fclose(fp);
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_io.h b/tensorflow/lite/experimental/microfrontend/lib/frontend_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d59eda7d093f07f5a98c96ef4d0dc7d3b1fa28c
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_io.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_IO_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_IO_H_
+
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int WriteFrontendStateMemmap(const char* header, const char* source,
+                             const struct FrontendState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_IO_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_main.c b/tensorflow/lite/experimental/microfrontend/lib/frontend_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..4a8411b6214df3ae930367342ee222ea2196c619
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_main.c
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <stdio.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
+
+int main(int argc, char** argv) {
+  struct FrontendConfig frontend_config;
+  FrontendFillConfigWithDefaults(&frontend_config);
+
+  char* filename = argv[1];
+  int sample_rate = 16000;
+
+  struct FrontendState frontend_state;
+  if (!FrontendPopulateState(&frontend_config, &frontend_state, sample_rate)) {
+    fprintf(stderr, "Failed to populate frontend state\n");
+    FrontendFreeStateContents(&frontend_state);
+    return 1;
+  }
+
+
+  FILE* fp = fopen(filename, "r");
+  if (fp == NULL) {
+    fprintf(stderr, "Failed to open %s for read\n", filename);
+    return 1;
+  }
+  fseek(fp, 0L, SEEK_END);
+  size_t audio_file_size = ftell(fp) / sizeof(int16_t);
+  fseek(fp, 0L, SEEK_SET);
+  int16_t* audio_data = malloc(audio_file_size * sizeof(int16_t));
+  int16_t* original_audio_data = audio_data;
+  if (audio_file_size !=
+      fread(audio_data, sizeof(int16_t), audio_file_size, fp)) {
+    fprintf(stderr, "Failed to read in all audio data\n");
+    return 1;
+  }
+
+  while (audio_file_size > 0) {
+    size_t num_samples_read;
+    struct FrontendOutput output = FrontendProcessSamples(
+        &frontend_state, audio_data, audio_file_size, &num_samples_read);
+    audio_data += num_samples_read;
+    audio_file_size -= num_samples_read;
+
+    if (output.values != NULL) {
+      int i;
+      for (i = 0; i < output.size; ++i) {
+        printf("%d ", output.values[i]);
+      }
+      printf("\n");
+    }
+  }
+
+  FrontendFreeStateContents(&frontend_state);
+  free(original_audio_data);
+  return 0;
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c b/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c
new file mode 100644
index 0000000000000000000000000000000000000000..766b7f2ad568dbec20f9b1a8525825020c411d9e
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_generator.c
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <stdio.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend_io.h"
+
+int main(int argc, char** argv) {
+  if (argc != 3) {
+    fprintf(stderr,
+            "%s requires exactly two parameters - the names of the header and "
+            "source files to save\n");
+    return 1;
+  }
+  struct FrontendConfig frontend_config;
+  FrontendFillConfigWithDefaults(&frontend_config);
+
+  int sample_rate = 16000;
+  struct FrontendState frontend_state;
+  if (!FrontendPopulateState(&frontend_config, &frontend_state, sample_rate)) {
+    fprintf(stderr, "Failed to populate frontend state\n");
+    FrontendFreeStateContents(&frontend_state);
+    return 1;
+  }
+
+  if (!WriteFrontendStateMemmap(argv[1], argv[2], &frontend_state)) {
+    fprintf(stderr, "Failed to write memmap\n");
+    FrontendFreeStateContents(&frontend_state);
+    return 1;
+  }
+
+  FrontendFreeStateContents(&frontend_state);
+  return 0;
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_main.c b/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..cf39e93a78361d1da9a1c837610c482309135c9a
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_memmap_main.c
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <stdio.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
+#include "memmap.h"
+
+int main(int argc, char** argv) {
+  struct FrontendState* frontend_state = GetFrontendStateMemmap();
+
+  char* filename = argv[1];
+  FILE* fp = fopen(filename, "r");
+  if (fp == NULL) {
+    fprintf(stderr, "Failed to open %s for read\n", filename);
+    return 1;
+  }
+  fseek(fp, 0L, SEEK_END);
+  size_t audio_file_size = ftell(fp) / sizeof(int16_t);
+  fseek(fp, 0L, SEEK_SET);
+  int16_t* audio_data = malloc(audio_file_size * sizeof(int16_t));
+  int16_t* original_audio_data = audio_data;
+  if (audio_file_size !=
+      fread(audio_data, sizeof(int16_t), audio_file_size, fp)) {
+    fprintf(stderr, "Failed to read in all audio data\n");
+    return 1;
+  }
+
+  while (audio_file_size > 0) {
+    size_t num_samples_read;
+    struct FrontendOutput output = FrontendProcessSamples(
+        frontend_state, audio_data, audio_file_size, &num_samples_read);
+    audio_data += num_samples_read;
+    audio_file_size -= num_samples_read;
+
+    if (output.values != NULL) {
+      int i;
+      for (i = 0; i < output.size; ++i) {
+        printf("%d ", output.values[i]);
+      }
+      printf("\n");
+    }
+  }
+
+  free(original_audio_data);
+  return 0;
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..993e866cc08850cdfea129278783420e827d67f2
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kWindowSamples = 25;
+const int kStepSamples = 10;
+const int16_t kFakeAudioData[] = {
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
+
+// Test end-to-end frontend behaviors.
+class FrontendTest : public ::testing::Test {
+ protected:
+  FrontendTest() {
+    config_.window.size_ms = 25;
+    config_.window.step_size_ms = 10;
+    config_.noise_reduction.smoothing_bits = 10;
+    config_.filterbank.num_channels = 2;
+    config_.filterbank.lower_band_limit = 8.0;
+    config_.filterbank.upper_band_limit = 450.0;
+    config_.noise_reduction.smoothing_bits = 10;
+    config_.noise_reduction.even_smoothing = 0.025;
+    config_.noise_reduction.odd_smoothing = 0.06;
+    config_.noise_reduction.min_signal_remaining = 0.05;
+    config_.pcan_gain_control.enable_pcan = true;
+    config_.pcan_gain_control.strength = 0.95;
+    config_.pcan_gain_control.offset = 80.0;
+    config_.pcan_gain_control.gain_bits = 21;
+    config_.log_scale.enable_log = true;
+    config_.log_scale.scale_shift = 6;
+  }
+
+  struct FrontendConfig config_;
+};
+
+TEST_F(FrontendTest, CheckOutputValues) {
+  struct FrontendState state;
+  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  size_t num_samples_read;
+
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read);
+
+  const uint16_t expected[] = {479, 425};
+  ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < output.size; ++i) {
+    EXPECT_EQ(output.values[i], expected[i]);
+  }
+
+  FrontendFreeStateContents(&state);
+}
+
+TEST_F(FrontendTest, CheckConsecutiveWindow) {
+  struct FrontendState state;
+  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  size_t num_samples_read;
+
+  FrontendProcessSamples(&state, kFakeAudioData,
+                         sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]),
+                         &num_samples_read);
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read);
+
+  const int16_t expected[] = {436, 378};
+  ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < output.size; ++i) {
+    EXPECT_EQ(output.values[i], expected[i]);
+  }
+
+  FrontendFreeStateContents(&state);
+}
+
+TEST_F(FrontendTest, CheckNotEnoughSamples) {
+  struct FrontendState state;
+  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  size_t num_samples_read;
+
+  FrontendProcessSamples(&state, kFakeAudioData,
+                         sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]),
+                         &num_samples_read);
+  FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read);
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples + kStepSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples -
+          kStepSamples,
+      &num_samples_read);
+
+  EXPECT_EQ(output.size, 0);
+  EXPECT_EQ(output.values, nullptr);
+
+  FrontendFreeStateContents(&state);
+}
+
+}  // namespace
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_util.c b/tensorflow/lite/experimental/microfrontend/lib/frontend_util.c
new file mode 100644
index 0000000000000000000000000000000000000000..94c15adcafe9261527fa239487f59eb70e63f6a3
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_util.c
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
+
+void FrontendFillConfigWithDefaults(struct FrontendConfig* config) {
+  WindowFillConfigWithDefaults(&config->window);
+  FilterbankFillConfigWithDefaults(&config->filterbank);
+  NoiseReductionFillConfigWithDefaults(&config->noise_reduction);
+  PcanGainControlFillConfigWithDefaults(&config->pcan_gain_control);
+  LogScaleFillConfigWithDefaults(&config->log_scale);
+}
+
+int FrontendPopulateState(const struct FrontendConfig* config,
+                          struct FrontendState* state, int sample_rate) {
+  memset(state, 0, sizeof(*state));
+
+  if (!WindowPopulateState(&config->window, &state->window, sample_rate)) {
+    fprintf(stderr, "Failed to populate window state\n");
+    return 0;
+  }
+
+  if (!FftPopulateState(&state->fft, state->window.size)) {
+    fprintf(stderr, "Failed to populate fft state\n");
+    return 0;
+  }
+  FftInit(&state->fft);
+
+  if (!FilterbankPopulateState(&config->filterbank, &state->filterbank,
+                               sample_rate, state->fft.fft_size / 2 + 1)) {
+    fprintf(stderr, "Failed to populate filterbank state\n");
+    return 0;
+  }
+
+  if (!NoiseReductionPopulateState(&config->noise_reduction,
+                                   &state->noise_reduction,
+                                   state->filterbank.num_channels)) {
+    fprintf(stderr, "Failed to populate noise reduction state\n");
+    return 0;
+  }
+
+  int input_correction_bits =
+      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
+  if (!PcanGainControlPopulateState(&config->pcan_gain_control,
+                                    &state->pcan_gain_control,
+                                    state->noise_reduction.estimate,
+                                    state->filterbank.num_channels,
+                                    state->noise_reduction.smoothing_bits,
+                                    input_correction_bits)) {
+    fprintf(stderr, "Failed to populate pcan gain control state\n");
+    return 0;
+  }
+
+  if (!LogScalePopulateState(&config->log_scale, &state->log_scale)) {
+    fprintf(stderr, "Failed to populate log scale state\n");
+    return 0;
+  }
+
+  FrontendReset(state);
+
+  // All good, return a true value.
+  return 1;
+}
+
+void FrontendFreeStateContents(struct FrontendState* state) {
+  WindowFreeStateContents(&state->window);
+  FftFreeStateContents(&state->fft);
+  FilterbankFreeStateContents(&state->filterbank);
+  NoiseReductionFreeStateContents(&state->noise_reduction);
+  PcanGainControlFreeStateContents(&state->pcan_gain_control);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h b/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..895ce6cd2b2b08e858196fc773a45ae9d29b41b0
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_util.h
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_UTIL_H_
+
+#include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/filterbank_util.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct FrontendConfig {
+  struct WindowConfig window;
+  struct FilterbankConfig filterbank;
+  struct NoiseReductionConfig noise_reduction;
+  struct PcanGainControlConfig pcan_gain_control;
+  struct LogScaleConfig log_scale;
+};
+
+// Fills the frontendConfig with "sane" defaults.
+void FrontendFillConfigWithDefaults(struct FrontendConfig* config);
+
+// Allocates any buffers.
+int FrontendPopulateState(const struct FrontendConfig* config,
+                          struct FrontendState* state, int sample_rate);
+
+// Frees any allocated buffers.
+void FrontendFreeStateContents(struct FrontendState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_FRONTEND_UTIL_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_lut.c b/tensorflow/lite/experimental/microfrontend/lib/log_lut.c
new file mode 100644
index 0000000000000000000000000000000000000000..f59618e028f48889029d31bb6b64784db9b98c2c
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_lut.c
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/log_lut.h"
+const uint16_t kLogLut[]
+#ifndef _MSC_VER
+    __attribute__((aligned(4)))
+#endif  // _MSV_VER
+    = {0,    224,  442,  654,  861,  1063, 1259, 1450, 1636, 1817, 1992, 2163,
+       2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848,
+       3960, 4068, 4172, 4272, 4368, 4460, 4549, 4633, 4714, 4791, 4864, 4934,
+       5001, 5063, 5123, 5178, 5231, 5280, 5326, 5368, 5408, 5444, 5477, 5507,
+       5533, 5557, 5578, 5595, 5610, 5622, 5631, 5637, 5640, 5641, 5638, 5633,
+       5626, 5615, 5602, 5586, 5568, 5547, 5524, 5498, 5470, 5439, 5406, 5370,
+       5332, 5291, 5249, 5203, 5156, 5106, 5054, 5000, 4944, 4885, 4825, 4762,
+       4697, 4630, 4561, 4490, 4416, 4341, 4264, 4184, 4103, 4020, 3935, 3848,
+       3759, 3668, 3575, 3481, 3384, 3286, 3186, 3084, 2981, 2875, 2768, 2659,
+       2549, 2437, 2323, 2207, 2090, 1971, 1851, 1729, 1605, 1480, 1353, 1224,
+       1094, 963,  830,  695,  559,  421,  282,  142,  0,    0};
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_lut.h b/tensorflow/lite/experimental/microfrontend/lib/log_lut.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2448a32289a91c653c2fbe9b1080622112f0242
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_lut.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_LUT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_LUT_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Number of segments in the log lookup table. The table will be kLogSegments+1
+// in length (with some padding).
+#define kLogSegments 128
+#define kLogSegmentsLog2 7
+
+// Scale used by lookup table.
+#define kLogScale 65536
+#define kLogScaleLog2 16
+#define kLogCoeff 45426
+
+extern const uint16_t kLogLut[];
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_LUT_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale.c b/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
new file mode 100644
index 0000000000000000000000000000000000000000..54f370e7d9f55250279cd6c9a81b9a17e0d6e071
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
+
+#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/log_lut.h"
+
+#define kuint16max 0x0000FFFF
+
+// The following functions implement integer logarithms of various sizes. The
+// approximation is calculated according to method described in
+//       www.inti.gob.ar/electronicaeinformatica/instrumentacion/utic/
+//       publicaciones/SPL2007/Log10-spl07.pdf
+// It first calculates log2 of the input and then converts it to natural
+// logarithm.
+
+static uint32_t Log2FractionPart(const uint32_t x, const uint32_t log2x) {
+  // Part 1
+  int32_t frac = x - (1LL << log2x);
+  if (log2x < kLogScaleLog2) {
+    frac <<= kLogScaleLog2 - log2x;
+  } else {
+    frac >>= log2x - kLogScaleLog2;
+  }
+  // Part 2
+  const uint32_t base_seg = frac >> (kLogScaleLog2 - kLogSegmentsLog2);
+  const uint32_t seg_unit =
+      (((uint32_t) 1) << kLogScaleLog2) >> kLogSegmentsLog2;
+
+  const int32_t c0 = kLogLut[base_seg];
+  const int32_t c1 = kLogLut[base_seg + 1];
+  const int32_t seg_base = seg_unit * base_seg;
+  const int32_t rel_pos = ((c1 - c0) * (frac - seg_base)) >> kLogScaleLog2;
+  return frac + c0 + rel_pos;
+}
+
+static uint32_t Log(const uint32_t x, const uint32_t scale_shift) {
+  const uint32_t integer = MostSignificantBit32(x) - 1;
+  const uint32_t fraction = Log2FractionPart(x, integer);
+  const uint32_t log2 = (integer << kLogScaleLog2) + fraction;
+  const uint32_t round = kLogScale / 2;
+  const uint32_t loge =
+      (((uint64_t) kLogCoeff) * log2 + round) >> kLogScaleLog2;
+  // Finally scale to our output scale
+  const uint32_t loge_scaled = ((loge << scale_shift) + round) >> kLogScaleLog2;
+  return loge_scaled;
+}
+
+uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
+                        int signal_size, int correction_bits) {
+  const int scale_shift = state->scale_shift;
+  uint16_t* output = (uint16_t*) signal;
+  uint16_t* ret = output;
+  for (int i = 0; i < signal_size; ++i) {
+    uint32_t value = *signal++;
+    if (state->enable_log) {
+      if (correction_bits < 0) {
+        value >>= -correction_bits;
+      } else {
+        value <<= correction_bits;
+      }
+      if (value > 1) {
+        value = Log(value, scale_shift);
+      } else {
+        value = 0;
+      }
+    }
+    *output++ = (value < kuint16max) ? value : kuint16max;
+  }
+  return ret;
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale.h b/tensorflow/lite/experimental/microfrontend/lib/log_scale.h
new file mode 100644
index 0000000000000000000000000000000000000000..a383f32f5bc6df63f148e537d55e63be05aadbc2
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct LogScaleState {
+  int enable_log;
+  int scale_shift;
+};
+
+// Applies a fixed point logarithm to the signal and converts it to 16 bit. Note
+// that the signal array will be modified.
+uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
+                        int signal_size, int correction_bits);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale_io.c b/tensorflow/lite/experimental/microfrontend/lib/log_scale_io.c
new file mode 100644
index 0000000000000000000000000000000000000000..a04760de58e5fb8f3918e67b52505f867e9432b8
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale_io.c
@@ -0,0 +1,21 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale_io.h"
+
+void LogScaleWriteMemmap(FILE* fp, const struct LogScaleState* state,
+                         const char* variable) {
+  fprintf(fp, "%s->enable_log = %d;\n", variable, state->enable_log);
+  fprintf(fp, "%s->scale_shift = %d;\n", variable, state->scale_shift);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale_io.h b/tensorflow/lite/experimental/microfrontend/lib/log_scale_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d447ac9018b12fcdf206b8c3d5f1c6d32c18230
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale_io.h
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_IO_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_IO_H_
+
+#include <stdio.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void LogScaleWriteMemmap(FILE* fp, const struct LogScaleState* state,
+                         const char* variable);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_IO_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..91ca657e543d2a5f89a55483df8bdfbee1365951
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+const int kScaleShift = 6;
+const int kCorrectionBits = -1;
+
+TEST(LogScaleTest, CheckOutputValues) {
+  struct LogScaleState state;
+  state.enable_log = true;
+  state.scale_shift = kScaleShift;
+
+  uint32_t fake_signal[] = {3578, 1533};
+  uint16_t* output = LogScaleApply(&state, fake_signal,
+                                   sizeof(fake_signal) / sizeof(fake_signal[0]),
+                                   kCorrectionBits);
+
+  const uint16_t expected[] = {479, 425};
+  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    EXPECT_EQ(output[i], expected[i]);
+  }
+}
+
+TEST(LogScaleTest, CheckOutputValuesNoLog) {
+  struct LogScaleState state;
+  state.enable_log = false;
+  state.scale_shift = kScaleShift;
+
+  uint32_t fake_signal[] = {85964, 45998};
+  uint16_t* output = LogScaleApply(&state, fake_signal,
+                                   sizeof(fake_signal) / sizeof(fake_signal[0]),
+                                   kCorrectionBits);
+
+  const uint16_t expected[] = {65535, 45998};
+  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    EXPECT_EQ(output[i], expected[i]);
+  }
+}
+
+}  // namespace
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.c b/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e3dd1d1e94687f341f9d14ebdcd63913123fc98
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.c
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h"
+
+void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config) {
+  config->enable_log = 1;
+  config->scale_shift = 6;
+}
+
+int LogScalePopulateState(const struct LogScaleConfig* config,
+                          struct LogScaleState* state) {
+  state->enable_log = config->enable_log;
+  state->scale_shift = config->scale_shift;
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h b/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..11f7d9eeb9b7939e15c1fb5120a1b5dff2aacdbe
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_UTIL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct LogScaleConfig {
+  // set to false (0) to disable this module
+  int enable_log;
+  // scale results by 2^(scale_shift)
+  int scale_shift;
+};
+
+// Populates the LogScaleConfig with "sane" default values.
+void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config);
+
+// Allocates any buffers.
+int LogScalePopulateState(const struct LogScaleConfig* config,
+                          struct LogScaleState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_LOG_SCALE_UTIL_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.c b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.c
new file mode 100644
index 0000000000000000000000000000000000000000..b6fcb5e9409c071805cdd30643481d0a07cd8d0e
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.c
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
+
+#include <string.h>
+
+void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal) {
+  int i;
+  for (i = 0; i < state->num_channels; ++i) {
+    const uint32_t smoothing =
+        ((i & 1) == 0) ? state->even_smoothing : state->odd_smoothing;
+    const uint32_t one_minus_smoothing = (1 << kNoiseReductionBits) - smoothing;
+
+    // Update the estimate of the noise.
+    const uint32_t signal_scaled_up = signal[i] << state->smoothing_bits;
+    uint32_t estimate =
+        (((uint64_t) signal_scaled_up * smoothing) +
+         ((uint64_t) state->estimate[i] * one_minus_smoothing)) >>
+        kNoiseReductionBits;
+    state->estimate[i] = estimate;
+
+    // Make sure that we can't get a negative value for the signal - estimate.
+    if (estimate > signal_scaled_up) {
+      estimate = signal_scaled_up;
+    }
+
+    const uint32_t floor =
+        ((uint64_t) signal[i] * state->min_signal_remaining) >>
+        kNoiseReductionBits;
+    const uint32_t subtracted = (signal_scaled_up - estimate) >>
+        state->smoothing_bits;
+    const uint32_t output = subtracted > floor ? subtracted : floor;
+    signal[i] = output;
+  }
+}
+
+void NoiseReductionReset(struct NoiseReductionState* state) {
+  memset(state->estimate, 0, sizeof(*state->estimate) * state->num_channels);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..46d3f52e60e37693496e4597992fae6678f87c7e
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_H_
+
+#define kNoiseReductionBits 14
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct NoiseReductionState {
+  int smoothing_bits;
+  uint16_t even_smoothing;
+  uint16_t odd_smoothing;
+  uint16_t min_signal_remaining;
+  int num_channels;
+  uint32_t* estimate;
+};
+
+// Removes stationary noise from each channel of the signal using a low pass
+// filter.
+void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal);
+
+void NoiseReductionReset(struct NoiseReductionState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_io.c b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_io.c
new file mode 100644
index 0000000000000000000000000000000000000000..19c32b32759ed6d6c1b24e0758ffb491a7fda766
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_io.c
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_io.h"
+
+void NoiseReductionWriteMemmapPreamble(
+    FILE* fp, const struct NoiseReductionState* state) {
+  fprintf(fp, "static uint32_t noise_reduction_estimate[%zu];\n",
+          state->num_channels);
+  fprintf(fp, "\n");
+}
+
+void NoiseReductionWriteMemmap(FILE* fp,
+                               const struct NoiseReductionState* state,
+                               const char* variable) {
+  fprintf(fp, "%s->even_smoothing = %d;\n", variable, state->even_smoothing);
+  fprintf(fp, "%s->odd_smoothing = %d;\n", variable, state->odd_smoothing);
+  fprintf(fp, "%s->min_signal_remaining = %d;\n", variable,
+          state->min_signal_remaining);
+  fprintf(fp, "%s->num_channels = %d;\n", variable, state->num_channels);
+
+  fprintf(fp, "%s->estimate = noise_reduction_estimate;\n", variable);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_io.h b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..ded52118f0cab05dcbe375860c8371f7fb7ecab7
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_io.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_IO_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_IO_H_
+
+#include <stdio.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void NoiseReductionWriteMemmapPreamble(FILE* fp,
+                                       const struct NoiseReductionState* state);
+void NoiseReductionWriteMemmap(FILE* fp,
+                               const struct NoiseReductionState* state,
+                               const char* variable);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_IO_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..16140564879305de86947044f8b8efd055a4793c
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+const int kNumChannels = 2;
+
+// Test noise reduction using default config values.
+class NoiseReductionTest : public ::testing::Test {
+ protected:
+  NoiseReductionTest() {
+    config_.smoothing_bits = 10;
+    config_.even_smoothing = 0.025;
+    config_.odd_smoothing = 0.06;
+    config_.min_signal_remaining = 0.05;
+  }
+
+  struct NoiseReductionConfig config_;
+};
+
+TEST_F(NoiseReductionTest, TestNoiseReductionEstimate) {
+  struct NoiseReductionState state;
+  ASSERT_TRUE(NoiseReductionPopulateState(&config_, &state, kNumChannels));
+
+  uint32_t signal[] = {247311, 508620};
+  NoiseReductionApply(&state, signal);
+
+  const uint32_t expected[] = {6321887, 31248341};
+  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < state.num_channels; ++i) {
+    EXPECT_EQ(state.estimate[i], expected[i]);
+  }
+
+  NoiseReductionFreeStateContents(&state);
+}
+
+TEST_F(NoiseReductionTest, TestNoiseReduction) {
+  struct NoiseReductionState state;
+  ASSERT_TRUE(NoiseReductionPopulateState(&config_, &state, kNumChannels));
+
+  uint32_t signal[] = {247311, 508620};
+  NoiseReductionApply(&state, signal);
+
+  const uint32_t expected[] = {241137, 478104};
+  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < state.num_channels; ++i) {
+    EXPECT_EQ(signal[i], expected[i]);
+  }
+
+  NoiseReductionFreeStateContents(&state);
+}
+
+}  // namespace
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.c b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.c
new file mode 100644
index 0000000000000000000000000000000000000000..a6c9234eb888daa719b792e8988e951a7b4a98f7
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.c
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
+
+#include <stdio.h>
+
+void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config) {
+  config->smoothing_bits = 10;
+  config->even_smoothing = 0.025;
+  config->odd_smoothing = 0.06;
+  config->min_signal_remaining = 0.05;
+}
+
+int NoiseReductionPopulateState(const struct NoiseReductionConfig* config,
+                                struct NoiseReductionState* state,
+                                int num_channels) {
+  state->smoothing_bits = config->smoothing_bits;
+  state->odd_smoothing = config->odd_smoothing * (1 << kNoiseReductionBits);
+  state->even_smoothing = config->even_smoothing * (1 << kNoiseReductionBits);
+  state->min_signal_remaining =
+      config->min_signal_remaining * (1 << kNoiseReductionBits);
+  state->num_channels = num_channels;
+  state->estimate = calloc(state->num_channels, sizeof(*state->estimate));
+  if (state->estimate == NULL) {
+    fprintf(stderr, "Failed to alloc estimate buffer\n");
+    return 0;
+  }
+  return 1;
+}
+
+void NoiseReductionFreeStateContents(struct NoiseReductionState* state) {
+  free(state->estimate);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa55539143fca69f3824a80d9a3cb98d3191f5bc
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_UTIL_H_
+
+#include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct NoiseReductionConfig {
+  // scale the signal up by 2^(smoothing_bits) before reduction
+  int smoothing_bits;
+  // smoothing coefficient for even-numbered channels
+  float even_smoothing;
+  // smoothing coefficient for odd-numbered channels
+  float odd_smoothing;
+  // fraction of signal to preserve (1.0 disables this module)
+  float min_signal_remaining;
+};
+
+// Populates the NoiseReductionConfig with "sane" default values.
+void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config);
+
+// Allocates any buffers.
+int NoiseReductionPopulateState(const struct NoiseReductionConfig* config,
+                                struct NoiseReductionState* state,
+                                int num_channels);
+
+// Frees any allocated buffers.
+void NoiseReductionFreeStateContents(struct NoiseReductionState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_NOISE_REDUCTION_UTIL_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c
new file mode 100644
index 0000000000000000000000000000000000000000..b49eb301370a7e95497478625a97333225a83341
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
+
+#include "tensorflow/lite/experimental/microfrontend/lib/bits.h"
+
+int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut) {
+  if (x <= 2) {
+    return lut[x];
+  }
+
+  const int16_t interval = MostSignificantBit32(x);
+  lut += 4 * interval - 6;
+
+  const int16_t frac = ((interval < 11)
+                        ? (x << (11 - interval))
+                        : (x >> (interval - 11))
+                       ) & 0x3FF;
+
+  int32_t result = ((int32_t) lut[2] * frac) >> 5;
+  result += ((int32_t) lut[1]) << 5;
+  result *= frac;
+  result = (result + (1 << 14)) >> 15;
+  result += lut[0];
+  return (int16_t) result;
+}
+
+uint32_t PcanShrink(const uint32_t x) {
+  if (x < (2 << kPcanSnrBits)) {
+    return (x * x) >> (2 + 2 * kPcanSnrBits - kPcanOutputBits);
+  } else {
+    return (x >> (kPcanSnrBits - kPcanOutputBits)) - (1 << kPcanOutputBits);
+  }
+}
+
+void PcanGainControlApply(struct PcanGainControlState* state,
+                          uint32_t* signal) {
+  for (int i = 0; i < state->num_channels; ++i) {
+    const uint32_t gain = WideDynamicFunction(state->noise_estimate[i],
+                                              state->gain_lut);
+    const uint32_t snr = ((uint64_t) signal[i] * gain) >> state->snr_shift;
+    signal[i] = PcanShrink(snr);
+  }
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h
new file mode 100644
index 0000000000000000000000000000000000000000..815579132233614c28167d3520b9aa2abe083423
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define kPcanSnrBits 12
+#define kPcanOutputBits 6
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct PcanGainControlState {
+  int enable_pcan;
+  uint32_t* noise_estimate;
+  int num_channels;
+  int16_t* gain_lut;
+  int32_t snr_shift;
+};
+
+int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut);
+
+uint32_t PcanShrink(const uint32_t x);
+
+void PcanGainControlApply(struct PcanGainControlState* state, uint32_t* signal);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..830db89edd8eb39fc68d24bfa4a61fe82ef3eace
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+const int kNumChannels = 2;
+const int kSmoothingBits = 10;
+const int kCorrectionBits = -1;
+
+// Test pcan auto gain control using default config values.
+class PcanGainControlTest : public ::testing::Test {
+ protected:
+  PcanGainControlTest() {
+    config_.enable_pcan = 1;
+    config_.strength = 0.95;
+    config_.offset = 80.0;
+    config_.gain_bits = 21;
+  }
+
+  struct PcanGainControlConfig config_;
+};
+
+TEST_F(PcanGainControlTest, TestPcanGainControl) {
+  uint32_t estimate[] = {6321887, 31248341};
+  struct PcanGainControlState state;
+  ASSERT_TRUE(PcanGainControlPopulateState(&config_, &state, estimate,
+                                           kNumChannels, kSmoothingBits,
+                                           kCorrectionBits));
+
+  uint32_t signal[] = {241137, 478104};
+  PcanGainControlApply(&state, signal);
+
+  const uint32_t expected[] = {3578, 1533};
+  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < state.num_channels; ++i) {
+    EXPECT_EQ(signal[i], expected[i]);
+  }
+
+  PcanGainControlFreeStateContents(&state);
+}
+
+}  // namespace
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c
new file mode 100644
index 0000000000000000000000000000000000000000..dbe44c494ae07fb8c356723287cb32bf63381d27
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#define kint16max 0x00007FFF
+
+void PcanGainControlFillConfigWithDefaults(
+    struct PcanGainControlConfig* config) {
+  config->enable_pcan = 0;
+  config->strength = 0.95;
+  config->offset = 80.0;
+  config->gain_bits = 21;
+}
+
+int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
+                               int32_t input_bits, uint32_t x) {
+  const float x_as_float = ((float) x) / ((uint32_t) 1 << input_bits);
+  const float gain_as_float = ((uint32_t) 1 << config->gain_bits) *
+      powf(x_as_float + config->offset, -config->strength);
+
+  if (gain_as_float > kint16max) {
+    return kint16max;
+  }
+  return (int16_t) (gain_as_float + 0.5f);
+}
+
+int PcanGainControlPopulateState(const struct PcanGainControlConfig* config,
+                                 struct PcanGainControlState* state,
+                                 uint32_t* noise_estimate,
+                                 const int num_channels,
+                                 const uint16_t smoothing_bits,
+                                 const int32_t input_correction_bits) {
+  state->enable_pcan = config->enable_pcan;
+  if (!state->enable_pcan) {
+    return 1;
+  }
+  state->noise_estimate = noise_estimate;
+  state->num_channels = num_channels;
+  state->gain_lut = malloc(kWideDynamicFunctionLUTSize * sizeof(int16_t));
+  if (state->gain_lut == NULL) {
+    fprintf(stderr, "Failed to allocate gain LUT\n");
+    return 0;
+  }
+  state->snr_shift = config->gain_bits - input_correction_bits - kPcanSnrBits;
+
+  const int32_t input_bits = smoothing_bits - input_correction_bits;
+  state->gain_lut[0] = PcanGainLookupFunction(config, input_bits, 0);
+  state->gain_lut[1] = PcanGainLookupFunction(config, input_bits, 1);
+  state->gain_lut -= 6;
+  for (int interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
+    const uint32_t x0 = (uint32_t) 1 << (interval - 1);
+    const uint32_t x1 = x0 + (x0 >> 1);
+    const uint32_t x2 = (interval == kWideDynamicFunctionBits)
+        ? x0 + (x0 - 1) : 2 * x0;
+
+    const int16_t y0 = PcanGainLookupFunction(config, input_bits, x0);
+    const int16_t y1 = PcanGainLookupFunction(config, input_bits, x1);
+    const int16_t y2 = PcanGainLookupFunction(config, input_bits, x2);
+
+    const int32_t diff1 = (int32_t) y1 - y0;
+    const int32_t diff2 = (int32_t) y2 - y0;
+    const int32_t a1 = 4 * diff1 - diff2;
+    const int32_t a2 = diff2 - a1;
+
+    state->gain_lut[4 * interval] = y0;
+    state->gain_lut[4 * interval + 1] = (int16_t) a1;
+    state->gain_lut[4 * interval + 2] = (int16_t) a2;
+  }
+  state->gain_lut += 6;
+  return 1;
+}
+
+void PcanGainControlFreeStateContents(struct PcanGainControlState* state) {
+  free(state->gain_lut);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4bfaa2ed71d2208613c1582fe730547c5db3247
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_UTIL_H_
+
+#include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
+
+#define kWideDynamicFunctionBits 32
+#define kWideDynamicFunctionLUTSize (4 * kWideDynamicFunctionBits - 3)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct PcanGainControlConfig {
+  // set to false (0) to disable this module
+  int enable_pcan;
+  // gain normalization exponent (0.0 disables, 1.0 full strength)
+  float strength;
+  // positive value added in the normalization denominator
+  float offset;
+  // number of fractional bits in the gain
+  int gain_bits;
+};
+
+void PcanGainControlFillConfigWithDefaults(
+    struct PcanGainControlConfig* config);
+
+int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
+                               int32_t input_bits, uint32_t x);
+
+int PcanGainControlPopulateState(const struct PcanGainControlConfig* config,
+                                 struct PcanGainControlState* state,
+                                 uint32_t* noise_estimate,
+                                 const int num_channels,
+                                 const uint16_t smoothing_bits,
+                                 const int32_t input_correction_bits);
+
+void PcanGainControlFreeStateContents(struct PcanGainControlState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_PCAN_GAIN_CONTROL_UTIL_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window.c b/tensorflow/lite/experimental/microfrontend/lib/window.c
new file mode 100644
index 0000000000000000000000000000000000000000..517b60487becb9fd7b3bb9ebdaa9551338db6b4f
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/window.c
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
+
+#include <string.h>
+
+int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
+                         size_t num_samples, size_t* num_samples_read) {
+  const int size = state->size;
+
+  // Copy samples from the samples buffer over to our local input.
+  size_t max_samples_to_copy = state->size - state->input_used;
+  if (max_samples_to_copy > num_samples) {
+    max_samples_to_copy = num_samples;
+  }
+  memcpy(state->input + state->input_used, samples,
+         max_samples_to_copy * sizeof(*samples));
+  *num_samples_read = max_samples_to_copy;
+  state->input_used += max_samples_to_copy;
+
+  if (state->input_used < state->size) {
+    // We don't have enough samples to compute a window.
+    return 0;
+  }
+
+  // Apply the window to the input.
+  const int16_t* coefficients = state->coefficients;
+  const int16_t* input = state->input;
+  int16_t* output = state->output;
+  int i;
+  int16_t max_abs_output_value = 0;
+  for (i = 0; i < size; ++i) {
+    int16_t new_value =
+        (((int32_t) *input++) * *coefficients++) >> kFrontendWindowBits;
+    *output++ = new_value;
+    if (new_value < 0) {
+      new_value = -new_value;
+    }
+    if (new_value > max_abs_output_value) {
+      max_abs_output_value = new_value;
+    }
+  }
+  // Shuffle the input down by the step size, and update how much we have used.
+  memmove(state->input, state->input + state->step,
+          sizeof(*state->input) * (state->size - state->step));
+  state->input_used -= state->step;
+  state->max_abs_output_value = max_abs_output_value;
+
+  // Indicate that the output buffer is valid for the next stage.
+  return 1;
+}
+
+void WindowReset(struct WindowState* state) {
+  memset(state->input, 0, state->size * sizeof(*state->input));
+  memset(state->output, 0, state->size * sizeof(*state->output));
+  state->input_used = 0;
+  state->max_abs_output_value = 0;
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window.h b/tensorflow/lite/experimental/microfrontend/lib/window.h
new file mode 100644
index 0000000000000000000000000000000000000000..bad8151412fe227064ed6bf3fc6b87dd24054783
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/window.h
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define kFrontendWindowBits 12
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct WindowState {
+  size_t size;
+  int16_t* coefficients;
+  size_t step;
+
+  int16_t* input;
+  size_t input_used;
+  int16_t* output;
+  int16_t max_abs_output_value;
+};
+
+// Applies a window to the samples coming in, stepping forward at the given
+// rate.
+int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
+                         size_t num_samples, size_t* num_samples_read);
+
+void WindowReset(struct WindowState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_io.c b/tensorflow/lite/experimental/microfrontend/lib/window_io.c
new file mode 100644
index 0000000000000000000000000000000000000000..ed4ac5eb110c0f1358656ca9e1b79d6b37052258
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_io.c
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/window_io.h"
+
+void WindowWriteMemmapPreamble(FILE* fp, const struct WindowState* state) {
+  fprintf(fp, "static int16_t window_coefficients[] = {\n");
+  for (int i = 0; i < state->size; ++i) {
+    fprintf(fp, "%d", state->coefficients[i]);
+    if (i < state->size - 1) {
+      fprintf(fp, ", ");
+    }
+  }
+  fprintf(fp, "};\n");
+  fprintf(fp, "static int16_t window_input[%zu];\n", state->size);
+  fprintf(fp, "static int16_t window_output[%zu];\n", state->size);
+  fprintf(fp, "\n");
+}
+
+void WindowWriteMemmap(FILE* fp, const struct WindowState* state,
+                       const char* variable) {
+  fprintf(fp, "%s->size = %zu;\n", variable, state->size);
+  fprintf(fp, "%s->coefficients = window_coefficients;\n", variable);
+  fprintf(fp, "%s->step = %zu;\n", variable, state->step);
+
+  fprintf(fp, "%s->input = window_input;\n", variable);
+  fprintf(fp, "%s->input_used = %zu;\n", variable, state->input_used);
+  fprintf(fp, "%s->output = window_output;\n", variable);
+  fprintf(fp, "%s->max_abs_output_value = %d;\n", variable,
+          state->max_abs_output_value);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_io.h b/tensorflow/lite/experimental/microfrontend/lib/window_io.h
new file mode 100644
index 0000000000000000000000000000000000000000..a76b2dc3e812383d43e2b8fca74d75c7b1c67215
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_io.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_IO_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_IO_H_
+
+#include <stdio.h>
+
+#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void WindowWriteMemmapPreamble(FILE* fp, const struct WindowState* state);
+void WindowWriteMemmap(FILE* fp, const struct WindowState* state,
+                       const char* variable);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_IO_H_
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_test.cc b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8c6c19188d3e128e7bb3b1d007fff10ec271da95
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
@@ -0,0 +1,157 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kWindowSamples = 25;
+const int kStepSamples = 10;
+const int16_t kFakeAudioData[] = {
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
+
+// Test window function behaviors using default config values.
+class WindowTest : public ::testing::Test {
+ protected:
+  WindowTest() {
+    config_.size_ms = 25;
+    config_.step_size_ms = 10;
+  }
+
+  struct WindowConfig config_;
+};
+
+TEST_F(WindowTest, CheckCoefficients) {
+  struct WindowState state;
+  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+
+  const int16_t expected[] = {16,   144,  391,  743,  1176, 1664, 2177,
+                              2681, 3145, 3541, 3843, 4032, 4096, 4032,
+                              3843, 3541, 3145, 2681, 2177, 1664, 1176,
+                              743,  391,  144,  16};
+  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < state.size; ++i) {
+    EXPECT_EQ(state.coefficients[i], expected[i]);
+  }
+
+  WindowFreeStateContents(&state);
+}
+
+TEST_F(WindowTest, CheckResidualInput) {
+  struct WindowState state;
+  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  size_t num_samples_read;
+
+  ASSERT_TRUE(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  for (int i = kStepSamples; i < kWindowSamples; ++i) {
+    EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
+  }
+
+  WindowFreeStateContents(&state);
+}
+
+TEST_F(WindowTest, CheckOutputValues) {
+  struct WindowState state;
+  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  size_t num_samples_read;
+
+  ASSERT_TRUE(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  const int16_t expected[] = {
+      0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
+      0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
+  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < state.size; ++i) {
+    EXPECT_EQ(state.output[i], expected[i]);
+  }
+
+  WindowFreeStateContents(&state);
+}
+
+TEST_F(WindowTest, CheckMaxAbsValue) {
+  struct WindowState state;
+  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  size_t num_samples_read;
+
+  ASSERT_TRUE(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  EXPECT_EQ(state.max_abs_output_value, 32256);
+
+  WindowFreeStateContents(&state);
+}
+
+TEST_F(WindowTest, CheckConsecutiveWindow) {
+  struct WindowState state;
+  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  size_t num_samples_read;
+
+  ASSERT_TRUE(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+  ASSERT_TRUE(WindowProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read));
+
+  const int16_t expected[] = {
+      0, -1152, 0, 5943,   0, -13312, 0, 21447, 0, -28328, 0, 32255, 0, -32256,
+      0, 28327, 0, -21448, 0, 13311,  0, -5944, 0, 1151,   0};
+  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  for (int i = 0; i < state.size; ++i) {
+    EXPECT_EQ(state.output[i], expected[i]);
+  }
+
+  WindowFreeStateContents(&state);
+}
+
+TEST_F(WindowTest, CheckNotEnoughSamples) {
+  struct WindowState state;
+  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  size_t num_samples_read;
+
+  ASSERT_TRUE(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+  ASSERT_TRUE(WindowProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read));
+  ASSERT_FALSE(WindowProcessSamples(
+      &state, kFakeAudioData + kWindowSamples + kStepSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples -
+          kStepSamples,
+      &num_samples_read));
+
+  EXPECT_EQ(
+      state.input_used,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - 2 * kStepSamples);
+
+  WindowFreeStateContents(&state);
+}
+
+}  // namespace
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_util.c b/tensorflow/lite/experimental/microfrontend/lib/window_util.c
new file mode 100644
index 0000000000000000000000000000000000000000..2445c343be11e8764c43f8ded0099d21b85b0573
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_util.c
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+void WindowFillConfigWithDefaults(struct WindowConfig* config) {
+  config->size_ms = 25;
+  config->step_size_ms = 10;
+}
+
+int WindowPopulateState(const struct WindowConfig* config,
+                        struct WindowState* state, int sample_rate) {
+  state->size = config->size_ms * sample_rate / 1000;
+  state->step = config->step_size_ms * sample_rate / 1000;
+
+  state->coefficients = malloc(
+      state->size * sizeof(*state->coefficients));
+  if (state->coefficients == NULL) {
+    fprintf(stderr, "Failed to allocate window coefficients\n");
+    return 0;
+  }
+
+  // Populate the window values.
+  const float arg = M_PI * 2.0 / ((float) state->size);
+  int i;
+  for (i = 0; i < state->size; ++i) {
+    float float_value = 0.5 - (0.5 * cos(arg * (i + 0.5)));
+    // Scale it to fixed point and round it.
+    state->coefficients[i] =
+        floor(float_value * (1 << kFrontendWindowBits) + 0.5);
+  }
+
+  state->input_used = 0;
+  state->input = malloc(
+      state->size * sizeof(*state->input));
+  if (state->input == NULL) {
+    fprintf(stderr, "Failed to allocate window input\n");
+    return 0;
+  }
+
+  state->output = malloc(
+      state->size * sizeof(*state->output));
+  if (state->output == NULL) {
+    fprintf(stderr, "Failed to allocate window output\n");
+    return 0;
+  }
+
+  return 1;
+}
+
+void WindowFreeStateContents(struct WindowState* state) {
+  free(state->coefficients);
+  free(state->input);
+  free(state->output);
+}
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_util.h b/tensorflow/lite/experimental/microfrontend/lib/window_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..68e4de9eb586ec9056fbdfe91084d5918f9e9638
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_util.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_UTIL_H_
+
+#include "tensorflow/lite/experimental/microfrontend/lib/window.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct WindowConfig {
+  // length of window frame in milliseconds
+  size_t size_ms;
+  // length of step for next frame in milliseconds
+  size_t step_size_ms;
+};
+
+// Populates the WindowConfig with "sane" default values.
+void WindowFillConfigWithDefaults(struct WindowConfig* config);
+
+// Allocates any buffers.
+int WindowPopulateState(const struct WindowConfig* config,
+                        struct WindowState* state, int sample_rate);
+
+// Frees any allocated buffers.
+void WindowFreeStateContents(struct WindowState* state);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICROFRONTEND_LIB_WINDOW_UTIL_H_
diff --git a/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51094a976d297af8e807ae4f828702ace9a9306a
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
@@ -0,0 +1,305 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
+#include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/macros.h"
+
+using tensorflow::OpKernel;
+using tensorflow::OpKernelConstruction;
+using tensorflow::OpKernelContext;
+using tensorflow::Status;
+using tensorflow::Tensor;
+using tensorflow::TensorShape;
+using tensorflow::TensorShapeUtils;
+using tensorflow::errors::Internal;
+using tensorflow::errors::InvalidArgument;
+using tensorflow::shape_inference::DimensionHandle;
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+namespace tensorflow {
+REGISTER_OP("AudioMicrofrontend")
+    .Input("audio: int16")
+    .Output("filterbanks: out_type")
+    .Attr("sample_rate: int = 16000")
+    .Attr("window_size: int = 25")
+    .Attr("window_step: int = 10")
+    .Attr("num_channels: int = 32")
+    .Attr("upper_band_limit: float = 7500.0")
+    .Attr("lower_band_limit: float = 125.0")
+    .Attr("smoothing_bits: int = 10")
+    .Attr("even_smoothing: float = 0.025")
+    .Attr("odd_smoothing: float = 0.06")
+    .Attr("min_signal_remaining: float = 0.05")
+    .Attr("enable_pcan: bool = false")
+    .Attr("pcan_strength: float = 0.95")
+    .Attr("pcan_offset: float = 80.0")
+    .Attr("gain_bits: int = 21")
+    .Attr("enable_log: bool = true")
+    .Attr("scale_shift: int = 6")
+    .Attr("left_context: int = 0")
+    .Attr("right_context: int = 0")
+    .Attr("frame_stride: int = 1")
+    .Attr("zero_padding: bool = false")
+    .Attr("out_scale: int = 1")
+    .Attr("out_type: {uint16, float} = DT_UINT16")
+    .SetShapeFn([](InferenceContext* ctx) {
+      ShapeHandle input;
+      TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 1, &input));
+
+      int sample_rate;
+      TF_RETURN_IF_ERROR(ctx->GetAttr("sample_rate", &sample_rate));
+      int window_size;
+      TF_RETURN_IF_ERROR(ctx->GetAttr("window_size", &window_size));
+      window_size *= sample_rate / 1000;
+      int window_step;
+      TF_RETURN_IF_ERROR(ctx->GetAttr("window_step", &window_step));
+      window_step *= sample_rate / 1000;
+
+      int num_channels;
+      TF_RETURN_IF_ERROR(ctx->GetAttr("num_channels", &num_channels));
+      int left_context;
+      TF_RETURN_IF_ERROR(ctx->GetAttr("left_context", &left_context));
+      int right_context;
+      TF_RETURN_IF_ERROR(ctx->GetAttr("right_context", &right_context));
+      int frame_stride;
+      TF_RETURN_IF_ERROR(ctx->GetAttr("frame_stride", &frame_stride));
+
+      DimensionHandle num_frames = ctx->Dim(input, 0);
+      if (ctx->Value(num_frames) < window_size) {
+        num_frames = ctx->MakeDim(0);
+      } else {
+        TF_RETURN_IF_ERROR(ctx->Subtract(num_frames, window_size, &num_frames));
+        TF_RETURN_IF_ERROR(
+            ctx->Divide(num_frames, window_step, false, &num_frames));
+        TF_RETURN_IF_ERROR(
+            ctx->Divide(num_frames, frame_stride, false, &num_frames));
+        TF_RETURN_IF_ERROR(ctx->Add(num_frames, 1, &num_frames));
+      }
+
+      int stack_size = 1 + left_context + right_context;
+      DimensionHandle num_features = ctx->MakeDim(num_channels);
+      TF_RETURN_IF_ERROR(
+          ctx->Multiply(num_features, stack_size, &num_features));
+
+      ShapeHandle output = ctx->MakeShape({num_frames, num_features});
+      ctx->set_output(0, output);
+      return tensorflow::Status::OK();
+    })
+    .Doc(R"doc(
+Audio Microfrontend Op.
+
+This Op converts a sequence of audio data into one or more
+feature vectors containing filterbanks of the input. The
+conversion process uses a lightweight library to perform:
+
+1. A slicing window function
+2. Short-time FFTs
+3. Filterbank calculations
+4. Noise reduction
+5. PCAN Auto Gain Control
+6. Logarithmic scaling
+
+Arguments
+  audio: 1D Tensor, int16 audio data in temporal ordering.
+  sample_rate: Integer, the sample rate of the audio in Hz.
+  window_size: Integer, length of desired time frames in ms.
+  window_step: Integer, length of step size for the next frame in ms.
+  num_channels: Integer, the number of filterbank channels to use.
+  upper_band_limit: Float, the highest frequency included in the filterbanks.
+  lower_band_limit: Float, the lowest frequency included in the filterbanks.
+  smoothing_bits: Int, scale up signal by 2^(smoothing_bits) before reduction.
+  even_smoothing: Float, smoothing coefficient for even-numbered channels.
+  odd_smoothing: Float, smoothing coefficient for odd-numbered channels.
+  min_signal_remaining: Float, fraction of signal to preserve in smoothing.
+  enable_pcan: Bool, enable PCAN auto gain control.
+  pcan_strength: Float, gain normalization exponent.
+  pcan_offset: Float, positive value added in the normalization denominator.
+  gain_bits: Int, number of fractional bits in the gain.
+  enable_log: Bool, enable logarithmic scaling of filterbanks.
+  scale_shift: Integer, scale filterbanks by 2^(scale_shift).
+  left_context: Integer, number of preceding frames to attach to each frame.
+  right_context: Integer, number of preceding frames to attach to each frame.
+  frame_stride: Integer, M frames to skip over, where output[n] = frame[n*M].
+  zero_padding: Bool, if left/right context is out-of-bounds, attach frame of
+                zeroes. Otherwise, frame[0] or frame[size-1] will be copied.
+  out_scale: Integer, divide all filterbanks by this number.
+  out_type: DType, type of the output Tensor, defaults to UINT16.
+
+Returns
+  filterbanks: 2D Tensor, each row is a time frame, each column is a channel.
+)doc");
+
+template <typename T>
+class AudioMicrofrontendOp : public OpKernel {
+ public:
+  explicit AudioMicrofrontendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("sample_rate", &sample_rate_));
+
+    int window_size;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("window_size", &window_size));
+    config_.window.size_ms = window_size;
+
+    int window_step;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("window_step", &window_step));
+    config_.window.step_size_ms = window_step;
+
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("num_channels", &config_.filterbank.num_channels));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("upper_band_limit",
+                                     &config_.filterbank.upper_band_limit));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("lower_band_limit",
+                                     &config_.filterbank.lower_band_limit));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("smoothing_bits",
+                                     &config_.noise_reduction.smoothing_bits));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("even_smoothing",
+                                     &config_.noise_reduction.even_smoothing));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("odd_smoothing",
+                                     &config_.noise_reduction.odd_smoothing));
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetAttr("min_signal_remaining",
+                                &config_.noise_reduction.min_signal_remaining));
+
+    bool enable_pcan;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_pcan", &enable_pcan));
+    config_.pcan_gain_control.enable_pcan = enable_pcan;
+
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("pcan_strength",
+                                     &config_.pcan_gain_control.strength));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("pcan_offset", &config_.pcan_gain_control.offset));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("gain_bits", &config_.pcan_gain_control.gain_bits));
+
+    bool enable_log;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_log", &enable_log));
+    config_.log_scale.enable_log = enable_log;
+
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetAttr("scale_shift", &config_.log_scale.scale_shift));
+
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("left_context", &left_context_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("right_context", &right_context_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("frame_stride", &frame_stride_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("zero_padding", &zero_padding_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_scale", &out_scale_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* audio;
+    OP_REQUIRES_OK(ctx, ctx->input("audio", &audio));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(audio->shape()),
+                InvalidArgument("audio is not a vector"));
+
+    auto audio_data =
+        reinterpret_cast<const int16_t*>(audio->tensor_data().data());
+    int audio_size = audio->NumElements();
+
+    Tensor* filterbanks = nullptr;
+    int window_size = config_.window.size_ms * sample_rate_ / 1000;
+    int window_step = config_.window.step_size_ms * sample_rate_ / 1000;
+    int num_frames = 0;
+    int sampled_frames = 0;
+    if (audio_size >= window_size) {
+      num_frames = (audio_size - window_size) / window_step + 1;
+      sampled_frames = (num_frames - 1) / frame_stride_ + 1;
+    }
+    TensorShape filterbanks_shape{
+        sampled_frames,
+        config_.filterbank.num_channels * (1 + left_context_ + right_context_)};
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, filterbanks_shape, &filterbanks));
+    auto filterbanks_flat = filterbanks->flat<T>();
+
+    struct FrontendState state;
+    if (!TF_PREDICT_TRUE(
+            FrontendPopulateState(&config_, &state, sample_rate_))) {
+      ctx->CtxFailure(__FILE__, __LINE__,
+                      Internal("failed to populate frontend state"));
+      FrontendFreeStateContents(&state);
+      return;
+    }
+
+    std::vector<std::vector<T>> frame_buffer(num_frames);
+    int frame_index = 0;
+    while (audio_size > 0) {
+      size_t num_samples_read;
+      struct FrontendOutput output = FrontendProcessSamples(
+          &state, audio_data, audio_size, &num_samples_read);
+      audio_data += num_samples_read;
+      audio_size -= num_samples_read;
+
+      if (output.values != nullptr) {
+        frame_buffer[frame_index].reserve(output.size);
+        for (int i = 0; i < output.size; ++i) {
+          frame_buffer[frame_index].push_back(static_cast<T>(output.values[i]) /
+                                              out_scale_);
+        }
+        ++frame_index;
+      }
+    }
+    FrontendFreeStateContents(&state);
+
+    int index = 0;
+    std::vector<T> pad(config_.filterbank.num_channels, 0);
+    for (int anchor = 0; anchor < frame_buffer.size();
+         anchor += frame_stride_) {
+      for (int frame = anchor - left_context_; frame <= anchor + right_context_;
+           ++frame) {
+        std::vector<T>* feature;
+        if (zero_padding_ && (frame < 0 || frame >= frame_buffer.size())) {
+          feature = &pad;
+        } else if (frame < 0) {
+          feature = &frame_buffer[0];
+        } else if (frame >= frame_buffer.size()) {
+          feature = &frame_buffer[frame_buffer.size() - 1];
+        } else {
+          feature = &frame_buffer[frame];
+        }
+        for (auto f : *feature) {
+          filterbanks_flat(index++) = f;
+        }
+      }
+    }
+  }
+
+ protected:
+  int sample_rate_;
+  struct FrontendConfig config_;
+  int left_context_;
+  int right_context_;
+  int frame_stride_;
+  bool zero_padding_;
+  int out_scale_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(AudioMicrofrontendOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("AudioMicrofrontend")
+                            .Device(tensorflow::DEVICE_CPU)
+                            .TypeConstraint<uint16>("out_type"),
+                        AudioMicrofrontendOp<uint16>);
+REGISTER_KERNEL_BUILDER(Name("AudioMicrofrontend")
+                            .Device(tensorflow::DEVICE_CPU)
+                            .TypeConstraint<float>("out_type"),
+                        AudioMicrofrontendOp<float>);
+}  // namespace tensorflow
diff --git a/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py b/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ce861707fda767a3ec1c6e2d23e6a70c6131f24
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
@@ -0,0 +1,167 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AudioMicrofrontend."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op
+from tensorflow.python.framework import test_util
+
+SAMPLE_RATE = 1000
+WINDOW_SIZE = 25
+WINDOW_STEP = 10
+NUM_CHANNELS = 2
+UPPER_BAND_LIMIT = 450.0
+LOWER_BAND_LIMIT = 8.0
+SMOOTHING_BITS = 10
+
+
+class AudioFeatureGenerationTest(tf.test.TestCase):
+
+  @test_util.run_v1_only("b/120545219")
+  def testSimple(self):
+    with self.test_session():
+      audio = tf.constant(
+          [0, 32767, 0, -32768] * ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
+          tf.int16)
+      filterbanks = frontend_op.audio_microfrontend(
+          audio,
+          sample_rate=SAMPLE_RATE,
+          window_size=WINDOW_SIZE,
+          window_step=WINDOW_STEP,
+          num_channels=NUM_CHANNELS,
+          upper_band_limit=UPPER_BAND_LIMIT,
+          lower_band_limit=LOWER_BAND_LIMIT,
+          smoothing_bits=SMOOTHING_BITS,
+          enable_pcan=True)
+      self.assertAllEqual(filterbanks.eval(),
+                          [[479, 425], [436, 378], [410, 350], [391, 325]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testSimpleFloatScaled(self):
+    with self.test_session():
+      audio = tf.constant(
+          [0, 32767, 0, -32768] * ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
+          tf.int16)
+      filterbanks = frontend_op.audio_microfrontend(
+          audio,
+          sample_rate=SAMPLE_RATE,
+          window_size=WINDOW_SIZE,
+          window_step=WINDOW_STEP,
+          num_channels=NUM_CHANNELS,
+          upper_band_limit=UPPER_BAND_LIMIT,
+          lower_band_limit=LOWER_BAND_LIMIT,
+          smoothing_bits=SMOOTHING_BITS,
+          enable_pcan=True,
+          out_scale=64,
+          out_type=tf.float32)
+      self.assertAllEqual(filterbanks.eval(),
+                          [[7.484375, 6.640625], [6.8125, 5.90625],
+                           [6.40625, 5.46875], [6.109375, 5.078125]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testStacking(self):
+    with self.test_session():
+      audio = tf.constant(
+          [0, 32767, 0, -32768] * ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
+          tf.int16)
+      filterbanks = frontend_op.audio_microfrontend(
+          audio,
+          sample_rate=SAMPLE_RATE,
+          window_size=WINDOW_SIZE,
+          window_step=WINDOW_STEP,
+          num_channels=NUM_CHANNELS,
+          upper_band_limit=UPPER_BAND_LIMIT,
+          lower_band_limit=LOWER_BAND_LIMIT,
+          smoothing_bits=SMOOTHING_BITS,
+          enable_pcan=True,
+          right_context=1,
+          frame_stride=2)
+      self.assertAllEqual(filterbanks.eval(),
+                          [[479, 425, 436, 378], [410, 350, 391, 325]])
+
+  def testStackingWithOverlap(self):
+    with self.test_session():
+      audio = tf.constant(
+          [0, 32767, 0, -32768] * ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
+          tf.int16)
+      filterbanks = frontend_op.audio_microfrontend(
+          audio,
+          sample_rate=SAMPLE_RATE,
+          window_size=WINDOW_SIZE,
+          window_step=WINDOW_STEP,
+          num_channels=NUM_CHANNELS,
+          upper_band_limit=UPPER_BAND_LIMIT,
+          lower_band_limit=LOWER_BAND_LIMIT,
+          smoothing_bits=SMOOTHING_BITS,
+          enable_pcan=True,
+          left_context=1,
+          right_context=1)
+      self.assertAllEqual(
+          self.evaluate(filterbanks),
+          [[479, 425, 479, 425, 436, 378], [479, 425, 436, 378, 410, 350],
+           [436, 378, 410, 350, 391, 325], [410, 350, 391, 325, 391, 325]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testStackingDropFrame(self):
+    with self.test_session():
+      audio = tf.constant(
+          [0, 32767, 0, -32768] * ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
+          tf.int16)
+      filterbanks = frontend_op.audio_microfrontend(
+          audio,
+          sample_rate=SAMPLE_RATE,
+          window_size=WINDOW_SIZE,
+          window_step=WINDOW_STEP,
+          num_channels=NUM_CHANNELS,
+          upper_band_limit=UPPER_BAND_LIMIT,
+          lower_band_limit=LOWER_BAND_LIMIT,
+          smoothing_bits=SMOOTHING_BITS,
+          enable_pcan=True,
+          left_context=1,
+          frame_stride=2)
+      self.assertAllEqual(filterbanks.eval(),
+                          [[479, 425, 479, 425], [436, 378, 410, 350]])
+
+  def testZeroPadding(self):
+    with self.test_session():
+      audio = tf.constant(
+          [0, 32767, 0, -32768] * ((WINDOW_SIZE + 7 * WINDOW_STEP) // 4),
+          tf.int16)
+      filterbanks = frontend_op.audio_microfrontend(
+          audio,
+          sample_rate=SAMPLE_RATE,
+          window_size=WINDOW_SIZE,
+          window_step=WINDOW_STEP,
+          num_channels=NUM_CHANNELS,
+          upper_band_limit=UPPER_BAND_LIMIT,
+          lower_band_limit=LOWER_BAND_LIMIT,
+          smoothing_bits=SMOOTHING_BITS,
+          enable_pcan=True,
+          left_context=2,
+          frame_stride=3,
+          zero_padding=True)
+      self.assertAllEqual(
+          self.evaluate(filterbanks),
+          [[0, 0, 0, 0, 479, 425], [436, 378, 410, 350, 391, 325],
+           [374, 308, 362, 292, 352, 275]])
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py b/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d49482f4ecd34ec47df1d3baa3e6dccf8ae4bef
--- /dev/null
+++ b/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py
@@ -0,0 +1,113 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""AudioMicrofrontend Op creates filterbanks from audio data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.lite.experimental.microfrontend.ops import gen_audio_microfrontend_op
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
+
+_audio_microfrontend_op = loader.load_op_library(
+    resource_loader.get_path_to_datafile("_audio_microfrontend_op.so"))
+
+
+def audio_microfrontend(audio,
+                        sample_rate=16000,
+                        window_size=25,
+                        window_step=10,
+                        num_channels=32,
+                        upper_band_limit=7500.0,
+                        lower_band_limit=125.0,
+                        smoothing_bits=10,
+                        even_smoothing=0.025,
+                        odd_smoothing=0.06,
+                        min_signal_remaining=0.05,
+                        enable_pcan=True,
+                        pcan_strength=0.95,
+                        pcan_offset=80.0,
+                        gain_bits=21,
+                        enable_log=True,
+                        scale_shift=6,
+                        left_context=0,
+                        right_context=0,
+                        frame_stride=1,
+                        zero_padding=False,
+                        out_scale=1,
+                        out_type=tf.uint16):
+  """Audio Microfrontend Op.
+
+  This Op converts a sequence of audio data into one or more
+  feature vectors containing filterbanks of the input. The
+  conversion process uses a lightweight library to perform:
+
+  1. A slicing window function
+  2. Short-time FFTs
+  3. Filterbank calculations
+  4. Noise reduction
+  5. PCAN Auto Gain Control
+  6. Logarithmic scaling
+
+  Args:
+    audio: 1D Tensor, int16 audio data in temporal ordering.
+    sample_rate: Integer, the sample rate of the audio in Hz.
+    window_size: Integer, length of desired time frames in ms.
+    window_step: Integer, length of step size for the next frame in ms.
+    num_channels: Integer, the number of filterbank channels to use.
+    upper_band_limit: Float, the highest frequency included in the filterbanks.
+    lower_band_limit: Float, the lowest frequency included in the filterbanks.
+    smoothing_bits: Int, scale up signal by 2^(smoothing_bits) before reduction.
+    even_smoothing: Float, smoothing coefficient for even-numbered channels.
+    odd_smoothing: Float, smoothing coefficient for odd-numbered channels.
+    min_signal_remaining: Float, fraction of signal to preserve in smoothing.
+    enable_pcan: Bool, enable PCAN auto gain control.
+    pcan_strength: Float, gain normalization exponent.
+    pcan_offset: Float, positive value added in the normalization denominator.
+    gain_bits: Int, number of fractional bits in the gain.
+    enable_log: Bool, enable logarithmic scaling of filterbanks.
+    scale_shift: Integer, scale filterbanks by 2^(scale_shift).
+    left_context: Integer, number of preceding frames to attach to each frame.
+    right_context: Integer, number of preceding frames to attach to each frame.
+    frame_stride: Integer, M frames to skip over, where output[n] = frame[n*M].
+    zero_padding: Bool, if left/right context is out-of-bounds, attach frame of
+      zeroes. Otherwise, frame[0] or frame[size-1] will be copied.
+    out_scale: Integer, divide all filterbanks by this number.
+    out_type: DType, type of the output Tensor, defaults to UINT16.
+
+  Returns:
+    filterbanks: 2D Tensor, each row is a time frame, each column is a channel.
+
+  Raises:
+    ValueError: If the audio tensor is not explicitly a vector.
+  """
+  audio_shape = audio.get_shape()
+  if audio_shape.ndims is None:
+    raise ValueError("Input to `AudioMicrofrontend` should have known rank.")
+  if len(audio_shape) > 1:
+    audio = tf.reshape(audio, [-1])
+
+  return gen_audio_microfrontend_op.audio_microfrontend(
+      audio, sample_rate, window_size, window_step, num_channels,
+      upper_band_limit, lower_band_limit, smoothing_bits, even_smoothing,
+      odd_smoothing, min_signal_remaining, enable_pcan, pcan_strength,
+      pcan_offset, gain_bits, enable_log, scale_shift, left_context,
+      right_context, frame_stride, zero_padding, out_scale, out_type)
+
+
+tf.NotDifferentiable("AudioMicrofrontend")
diff --git a/tensorflow/lite/experimental/writer/BUILD b/tensorflow/lite/experimental/writer/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..57ce63636714aa616cb50e04fe2c15210cc2eb1c
--- /dev/null
+++ b/tensorflow/lite/experimental/writer/BUILD
@@ -0,0 +1,69 @@
+package(
+    default_visibility = [
+        "//visibility:public",
+    ],
+    features = ["-parse_headers"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+cc_binary(
+    name = "option_writer_generator",
+    srcs = ["option_writer_generator.cc"],
+    deps = [
+        "//tensorflow/lite/schema:schema_fbs_with_reflection",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "writer_lib",
+    srcs = [
+        "enum_mapping.h",
+        "writer_lib.cc",
+    ],
+    hdrs = [
+        "writer_lib.h",
+    ],
+    data = [
+        ":option_writer_gen",
+    ],
+    textual_hdrs = ["option_writer_generated.h"],
+    deps = [
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_fbs_with_reflection",
+    ],
+)
+
+cc_binary(
+    name = "writer",
+    srcs = ["writer.cc"],
+    deps = [
+        ":writer_lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ],
+)
+
+cc_test(
+    name = "writer_lib_test",
+    size = "small",
+    srcs = ["writer_lib_test.cc"],
+    deps = [
+        ":writer_lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+genrule(
+    name = "option_writer_gen",
+    outs = ["option_writer_generated.h"],
+    cmd = "$(location :option_writer_generator) $(@)",
+    tools = [":option_writer_generator"],
+)
diff --git a/tensorflow/contrib/lite/experimental/writer/enum_mapping.h b/tensorflow/lite/experimental/writer/enum_mapping.h
similarity index 91%
rename from tensorflow/contrib/lite/experimental/writer/enum_mapping.h
rename to tensorflow/lite/experimental/writer/enum_mapping.h
index 8bc464fd7188a2f530707d9bf7c0309ac8ca0b06..cb6ec3e0d7e0f1b53cc8b84e10cb1be4b1f023c0 100644
--- a/tensorflow/contrib/lite/experimental/writer/enum_mapping.h
+++ b/tensorflow/lite/experimental/writer/enum_mapping.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_WRITER_ENUM_MAPPING_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_WRITER_ENUM_MAPPING_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_WRITER_ENUM_MAPPING_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_WRITER_ENUM_MAPPING_H_
 
-#include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/schema/reflection/schema_generated.h"
+#include "tensorflow/lite/builtin_op_data.h"
+#include "tensorflow/lite/schema/reflection/schema_generated.h"
 
 // TODO(aselle): Ideally extract this from the schema.
 
@@ -113,4 +113,4 @@ inline LSHProjectionType LSHProjectionTypeToSchema(
 }
 
 }  // namespace tflite
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_WRITER_ENUM_MAPPING_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_WRITER_ENUM_MAPPING_H_
diff --git a/tensorflow/contrib/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc
similarity index 97%
rename from tensorflow/contrib/lite/experimental/writer/option_writer_generator.cc
rename to tensorflow/lite/experimental/writer/option_writer_generator.cc
index b35c6e06553c44c10979cd3edb68fa76638e6602..fa360a2f47e3dba34e05d2e32616821294f0e678 100644
--- a/tensorflow/contrib/lite/experimental/writer/option_writer_generator.cc
+++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc
@@ -17,12 +17,12 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 #include "flatbuffers/minireflect.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/schema/reflection/schema_generated.h"
+#include "tensorflow/lite/schema/reflection/schema_generated.h"
 
 namespace tflite {
 namespace {
 // This is generated by grepping
-//  cat  third_party/tensorflow/contrib/lite/builtin_op_data.h
+//  cat  third_party/tensorflow/lite/builtin_op_data.h
 //| grep "^} TfLite" | sed 's/^} TfLite\(.*\)Params;/\1Params/g' | grep -v "^}"
 static const char* param_structs[] = {"TfLiteConvParams",
                                       "TfLitePoolParams",
@@ -44,6 +44,7 @@ static const char* param_structs[] = {"TfLiteConvParams",
                                       "TfLiteLocalResponseNormParams",
                                       "TfLiteLSTMParams",
                                       "TfLiteResizeBilinearParams",
+                                      "TfLiteResizeNearestNeighborParams",
                                       "TfLitePadParams",
                                       "TfLitePadV2Params",
                                       "TfLiteReshapeParams",
@@ -55,6 +56,7 @@ static const char* param_structs[] = {"TfLiteConvParams",
                                       "TfLiteTransposeParams",
                                       "TfLiteReducerParams",
                                       "TfLiteSplitParams",
+                                      "TfLiteSplitVParams",
                                       "TfLiteSqueezeParams",
                                       "TfLiteStridedSliceParams",
                                       "TfLiteArgMaxParams",
@@ -65,6 +67,8 @@ static const char* param_structs[] = {"TfLiteConvParams",
                                       "TfLiteFakeQuantParams",
                                       "TfLitePackParams",
                                       "TfLiteOneHotParams",
+                                      "TfLiteLeakyReluParams",
+                                      "TfLiteMirrorPaddingParams",
                                       nullptr};
 }  // namespace
 
@@ -151,6 +155,7 @@ class OpOptionData {
     op_to_option_["BIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
+    op_to_option_["MIRROR_PAD"] = "";  // TODO(karimnosseir): MirrorPadOptions.
     // Manually specified mappings between ops and options (none)
     op_to_option_["EMBEDDING_LOOKUP"] =
         "";  // TODO(aselle): maybe something else.
diff --git a/tensorflow/contrib/lite/experimental/writer/writer.cc b/tensorflow/lite/experimental/writer/writer.cc
similarity index 89%
rename from tensorflow/contrib/lite/experimental/writer/writer.cc
rename to tensorflow/lite/experimental/writer/writer.cc
index 20ede214fba79578acdd53b13dde3556207ca292..c1de0333676041202ba4262eeb2adbd30e650ce9 100644
--- a/tensorflow/contrib/lite/experimental/writer/writer.cc
+++ b/tensorflow/lite/experimental/writer/writer.cc
@@ -20,9 +20,9 @@ limitations under the License.
 
 #include <iostream>
 
-#include "tensorflow/contrib/lite/experimental/writer/writer_lib.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/experimental/writer/writer_lib.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
 
 int main(int argc, char* argv[]) {
   if (argc != 3) {
diff --git a/tensorflow/contrib/lite/experimental/writer/writer_lib.cc b/tensorflow/lite/experimental/writer/writer_lib.cc
similarity index 95%
rename from tensorflow/contrib/lite/experimental/writer/writer_lib.cc
rename to tensorflow/lite/experimental/writer/writer_lib.cc
index 555a9cc4b09f30e2344ff30c409d2d2c37e6ea41..a0ce4b716d62c5a24342f5a3863e58eb203f7441 100644
--- a/tensorflow/contrib/lite/experimental/writer/writer_lib.cc
+++ b/tensorflow/lite/experimental/writer/writer_lib.cc
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/experimental/writer/writer_lib.h"
+#include "tensorflow/lite/experimental/writer/writer_lib.h"
 #include <cstdlib>
 #include <cstring>
 #include <unordered_map>
-#include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/context_util.h"
-#include "tensorflow/contrib/lite/experimental/writer/enum_mapping.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/schema/reflection/schema_generated.h"
-#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/lite/builtin_op_data.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/experimental/writer/enum_mapping.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/schema/reflection/schema_generated.h"
+#include "tensorflow/lite/version.h"
 
 namespace tflite {
 template <class T>
@@ -33,7 +33,7 @@ using FlatBufferBuilder = flatbuffers::FlatBufferBuilder;
 std::pair<BuiltinOptions, Offset<void>> CreateBuiltinUnion(
     FlatBufferBuilder* fbb, enum BuiltinOperator op, void* builtin_op_data) {
   switch (op) {
-#include "tensorflow/contrib/lite/experimental/writer/option_writer_generated.h"
+#include "tensorflow/lite/experimental/writer/option_writer_generated.h"
   }
   return std::make_pair(BuiltinOptions_NONE, Offset<void>());
 }
diff --git a/tensorflow/contrib/lite/experimental/writer/writer_lib.h b/tensorflow/lite/experimental/writer/writer_lib.h
similarity index 88%
rename from tensorflow/contrib/lite/experimental/writer/writer_lib.h
rename to tensorflow/lite/experimental/writer/writer_lib.h
index a5f14697cfd223a637770e66bdc02278383144b2..08c0436932ffc9d8fba7b3530269b4b09b58fc09 100644
--- a/tensorflow/contrib/lite/experimental/writer/writer_lib.h
+++ b/tensorflow/lite/experimental/writer/writer_lib.h
@@ -16,7 +16,7 @@ limitations under the License.
 //
 // Usage:
 //  From command line:
-//   bazel run third_party/tensorflow/contrib/lite/experimental/writer:writer
+//   bazel run third_party/tensorflow/lite/experimental/writer:writer
 //     -- foo.tflite foo.out.tflite
 //
 // From C++
@@ -24,16 +24,16 @@ limitations under the License.
 //   // Build Interpreter however
 //   // ... <omitted>
 //   InterpreterWriter(interpreter.get()).Write("output.tflite");
-#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_WRITER_WRITER_LIB_H_
-#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_WRITER_WRITER_LIB_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_WRITER_WRITER_LIB_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_WRITER_WRITER_LIB_H_
 #include <iostream>
 #include <unordered_map>
-#include "tensorflow/contrib/lite/builtin_op_data.h"
-#include "tensorflow/contrib/lite/context_util.h"
-#include "tensorflow/contrib/lite/experimental/writer/enum_mapping.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/schema/reflection/schema_generated.h"
-#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/lite/builtin_op_data.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/experimental/writer/enum_mapping.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/schema/reflection/schema_generated.h"
+#include "tensorflow/lite/version.h"
 
 namespace tflite {
 
@@ -128,4 +128,4 @@ class InterpreterWriter {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_WRITER_WRITER_LIB_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_WRITER_WRITER_LIB_H_
diff --git a/tensorflow/contrib/lite/experimental/writer/writer_lib_test.cc b/tensorflow/lite/experimental/writer/writer_lib_test.cc
similarity index 89%
rename from tensorflow/contrib/lite/experimental/writer/writer_lib_test.cc
rename to tensorflow/lite/experimental/writer/writer_lib_test.cc
index 49194a76c8c084bad2ae69634aa8702dc1fc0a7f..e04c678a50f72a22e76fc6bdd7b4af6e9f3b5f38 100644
--- a/tensorflow/contrib/lite/experimental/writer/writer_lib_test.cc
+++ b/tensorflow/lite/experimental/writer/writer_lib_test.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/experimental/writer/writer_lib.h"
+#include "tensorflow/lite/experimental/writer/writer_lib.h"
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 // Make an interpreter that has no tensors and no nodes
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..36bf4f4618c42f4e56ce79b73c50c0454644a26d
--- /dev/null
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -0,0 +1,86 @@
+upper_tabs:
+# Tabs left of dropdown menu
+- include: /_upper_tabs_left.yaml
+- include: /api_docs/_upper_tabs_api.yaml
+# Dropdown menu
+- name: Resources
+  path: /resources
+  is_default: true
+  menu:
+  - include: /resources/_menu_toc.yaml
+  lower_tabs:
+    # Subsite tabs
+    other:
+    - name: Guide
+      contents:
+      - title: Overview
+        path: /lite/overview
+      - title: Developer guide
+        path: /lite/devguide
+      - title: Android demo app
+        path: /lite/demo_android
+      - title: iOS demo app
+        path: /lite/demo_ios
+      - break: true
+      - title: TensorFlow Lite APIs
+        path: /lite/apis
+      - title: Custom operators
+        path: /lite/custom_operators
+      - title: TensorFlow Lite ops versioning
+        path: /lite/ops_versioning
+      - title: TensorFlow Lite compatibility guide
+        path: /lite/tf_ops_compatibility
+      - title: List of hosted models
+        path: /lite/models
+      - title: TensorFlow Lite for iOS
+        path: /lite/ios
+      - title: TensorFlow Lite for Raspberry Pi
+        path: /lite/rpi
+
+      - heading: TF Lite converter
+      - title: Overview
+        path: /lite/convert/
+      - title: Python API guide
+        path: /lite/convert/python_api
+      - title: Command line examples
+        path: /lite/convert/cmdline_examples
+      - title: Command line reference
+        path: /lite/convert/cmdline_reference
+
+      - heading: Performance
+      - title: Best practices
+        path: /lite/performance/best_practices
+      - title: Benchmarks
+        path: /lite/performance/benchmarks
+      - title: Model optimization
+        path: /lite/performance/model_optimization
+      - title: Post-training quantization
+        path: /lite/performance/post_training_quantization
+      - title: Post-training quantization example
+        path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb
+        status: external
+
+      - title: TF Mobile
+        style: accordion
+        status: deprecated
+        section:
+        - title: Overview
+          path: /lite/tfmobile/
+        - title: Building TensorFlow on Android
+          path: /lite/tfmobile/android_build
+        - title: Building TensorFlow on IOS
+          path: /lite/tfmobile/ios_build
+        - title: Integrating TensorFlow libraries
+          path: /lite/tfmobile/linking_libs
+        - title: Preparing models for mobile deployment
+          path: /lite/tfmobile/prepare_models
+        - title: Optimizing for mobile
+          path: /lite/tfmobile/optimizing
+
+    - name: API
+      skip_translation: true
+      contents:
+      - title: API
+        path: /api_docs/python/tf/lite
+
+- include: /_upper_tabs_right.yaml
diff --git a/tensorflow/lite/g3doc/_index.yaml b/tensorflow/lite/g3doc/_index.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1b3f1d616ae953e3c6a659301d7a7dd6860dcbf2
--- /dev/null
+++ b/tensorflow/lite/g3doc/_index.yaml
@@ -0,0 +1,224 @@
+project_path: /lite/_project.yaml
+book_path: /lite/_book.yaml
+description: <!--no description-->
+landing_page:
+  custom_css_path: /site-assets/css/style.css
+  rows:
+  - heading: TensorFlow Lite is for mobile and embedded devices.
+    description: >
+      <p style="max-width: 75%;">
+        TensorFlow Lite is the official solution for running machine learning
+        models on mobile and embedded devices. It enables on&#8209;device machine
+        learning inference with low latency and a small binary size on Android,
+        iOS, and other operating systems.
+      </p>
+      <style>
+      .tfo-landing-row-heading {
+        padding-top: 0 !important;
+      }
+      .tfo-landing-row-heading h2 {
+        margin-top: 0 !important;
+      }
+      .tfo-landing-row-heading-list ol, .tfo-landing-row-heading-list ul {
+        margin-top: 0;
+      }
+      </style>
+
+  - classname: tfo-landing-row-heading tfo-landing-row-heading-list
+    heading: Many benefits
+    description: >
+      On-device ML inference is difficult because of the many constraints—TensorFlow Lite can solve these:
+    items:
+    - list:
+      - heading: Performance
+        description: >
+          TF Lite is fast with no noticeable accuracy loss—see the <a href="./performance">metrics</a>.
+        icon:
+          icon_name: lens
+          foreground: theme
+      - heading: Portability
+        description: >
+          <a href="https://developer.android.com/ndk/guides/neuralnetworks/" class="external">Android</a>,
+          iOS, and more specialized IoT devices.
+        icon:
+          icon_name: lens
+          foreground: theme
+    - list:
+      - heading: Low latency
+        description: >
+          Optimized float- and fixed-point CPU kernels, op&#8209;fusing, and more.
+        icon:
+          icon_name: lens
+          foreground: theme
+      - heading: Acceleration
+        description: >
+          Integration with GPU and internal/external accelerators.
+        icon:
+          icon_name: lens
+          foreground: theme
+    - list:
+      - heading: Small model size
+        description: >
+          Controlled dependencies, <a href="https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3" class="external">quantization</a>,
+          and op&nbsp;registration.
+        icon:
+          icon_name: lens
+          foreground: theme
+      - heading: Tooling
+        description: >
+          Conversion, compression, benchmarking, power-consumption, and more.
+        icon:
+          icon_name: lens
+          foreground: theme
+
+  - classname: devsite-landing-row-logos tfo-landing-row-heading
+    heading: Companies using TensorFlow Lite
+    items:
+    - custom_image:
+        path: ./images/landing-page/photos_logo.png
+      path: https://www.photos.google.com
+    - custom_image:
+        path: ./images/landing-page/gboard_logo.png
+      path: https://play.google.com/store/apps/details?id=com.google.android.inputmethod.latin&hl=en_US
+    - custom_image:
+        path: ./images/landing-page/gmail_logo.png
+      path: https://www.google.com/gmail/
+    - custom_image:
+        path: ./images/landing-page/assistant_logo.png
+      path: https://assistant.google.com/
+
+  - classname: devsite-landing-row-logos
+    items:
+    - custom_image:
+        path: ./images/landing-page/vsco_logo.png
+      path: https://vsco.co
+    - custom_image:
+        path: ./images/landing-page/shazam_logo.png
+      path: https://www.shazam.com/
+    - custom_image:
+        path: ./images/landing-page/nest_logo.png
+      path: https://nest.com/
+    - custom_image:
+        path: ./images/landing-page/loseit_logo.png
+      path: https://www.loseit.com/
+
+  - classname: devsite-landing-row-no-image-background devsite-landing-row-67
+    background: grey
+    items:
+    - description: >
+        <em>“TensorFlow Lite helped us introduce machine learning and AI into our
+        app in an easy and streamlined way. We could reduce the size of our
+        models while keeping the accuracy high. This helped us create an amazing
+        fishing experience for our users by allowing them to identify any fish
+        species with just a photo.”</em>
+      image_path: ./images/landing-page/fishbrain_logo_big.png
+
+  - heading: How it works
+    items:
+    - heading: Build
+      icon:
+        icon_name: build
+      description: >
+        Build a new model or retrain an existing one, such as using transfer learning.
+      buttons:
+      - label: Read the developer guide
+        path: /lite/devguide
+        classname: button button-primary tfo-button-primary
+    - heading: Convert
+      icon:
+        icon_name: autorenew
+      description: >
+        Convert a TensorFlow model into a compressed flat buffer with the
+        TensorFlow Lite Converter.
+      buttons:
+      - label: Read the converter guide
+        path: /lite/convert/
+        classname: button button-primary tfo-button-primary
+    - heading: Deploy
+      icon:
+        icon_name: bolt
+      description: >
+        Take the compressed <code>.tflite</code> file and load it into a mobile
+        or embedded device.<br/>
+        See the <a href="#build-your-first-tensorflow-lite-app">tutorials below</a> to build an app.
+
+  - heading: Build your first TensorFlow Lite app
+    background: grey
+    items:
+    - classname: tfo-landing-row-item-inset-white
+      heading: Get started
+      description: >
+        <ul>
+          <li>Beginner: <a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/" class="external">TensorFlow for Poets</a></li>
+          <li>Beginner: <a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-tflite/" class="external">TensorFlow for Poets 2: Android</a></li>
+          <li>Beginner: <a href="https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-ios/" class="external">TensorFlow for Poets 2: iOS </a></li>
+          <li>Intermediate: <a href="https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193" class="external">Object detection tutorial</a>
+        </ul>
+    - classname: tfo-landing-row-item-inset-white
+      heading: Share your TensorFlow Lite story
+      description: >
+        We love to hear what you're working on—it may even get highlighted on
+        our social media! <a href="https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss" class="external">Tell us</a>.
+
+  - classname: devsite-landing-row-no-image-background devsite-landing-row-67
+    items:
+    - description: >
+        <p>
+          <em>“The release of TensorFlow Lite has allowed us to deploy an engaging
+          real-time experience to our users that eliminates the requirement
+          for a data connection. TensorFlow Lite’s ability to compress and
+          optimize the TensorFlow graph for mobile deployment has been
+          transformative in expanding the capabilities of Snap It.</em>
+        </p>
+        <p>
+          <em>Through TensorFlow Lite, our users can now enjoy a state of the
+          art, computer-vision-based food logging experience without worrying
+          about signal strength. We look forward to future collaborations
+          with the TensorFlow Lite team.”</em>
+        </p>
+      image_path: ./images/landing-page/loseit_logo_big.png
+
+  - classname: devsite-landing-row-cards
+    background: grey
+    heading: Updates
+    items:
+    - heading: "AI in motion: react in the real world"
+      image_path: ./images/landing-page/ai_in_motion.png
+      path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii
+      buttons:
+      - label: Read more
+        path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii
+    - heading: "Introducing the Model Optimization Toolkit"
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3
+      buttons:
+      - label: Read on TensorFlow blog
+        path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3
+    - heading: "East Africa Cassava App"
+      image_path: ./images/landing-page/detect_crop_disease_in_africa.png
+      path: https://heartbeat.fritz.ai/community-spotlight-nuru-a-mobile-app-by-plantvillage-to-detect-crop-disease-in-africa-28d142bf63d5
+      buttons:
+      - label: Read more
+        path: https://heartbeat.fritz.ai/community-spotlight-nuru-a-mobile-app-by-plantvillage-to-detect-crop-disease-in-africa-28d142bf63d5
+
+  - classname: devsite-landing-row-cards
+    background: grey
+    items:
+    - heading: "Using TensorFlow Lite on Android"
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
+      buttons:
+      - label: Read on TensorFlow blog
+        path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
+    - heading: "TensorFlow Lite at the Dev Summit"
+      youtube_id: FAMfy7izB6A
+      buttons:
+      - label: Watch the video
+        path: https://www.youtube.com/watch?v=FAMfy7izB6A
+    - heading: "TensorFlow Lite on GitHub"
+      image_path: /resources/images/github-card-16x9.png
+      path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite
+      buttons:
+      - label: View on GitHub
+        path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite
+    - classname: devsite-landing-row-item-hidden
diff --git a/tensorflow/contrib/lite/g3doc/_project.yaml b/tensorflow/lite/g3doc/_project.yaml
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/_project.yaml
rename to tensorflow/lite/g3doc/_project.yaml
diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/lite/g3doc/apis.md
similarity index 95%
rename from tensorflow/contrib/lite/g3doc/apis.md
rename to tensorflow/lite/g3doc/apis.md
index 69616c7b8a3c1cb93663a6f37eee506cfcdbae72..b15159ce4145727863c335126557e06402f8dbd3 100644
--- a/tensorflow/contrib/lite/g3doc/apis.md
+++ b/tensorflow/lite/g3doc/apis.md
@@ -304,6 +304,13 @@ one of the following primitive types:
 *   `long`
 *   `byte`
 
+`String` types are also supported, but they are encoded differently than the
+primitive types. In particular, the shape of a string Tensor dictates the number
+and arrangement of strings in the Tensor, with each element itself being a
+variable length string. In this sense, the (byte) size of the Tensor cannot be
+computed from the shape and type alone, and consequently strings cannot be
+provided as a single, flat `ByteBuffer` argument.
+
 If other data types, including boxed types like `Integer` and `Float`, are used,
 an `IllegalArgumentException` will be thrown.
 
@@ -345,13 +352,12 @@ interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
 ```
 
 where each entry in `inputs` corresponds to an input tensor and
-`map_of_indices_to_outputs` maps indices of output tensors to the
-corresponding output data. In both cases the tensor indices should correspond to
-the values given to the [TensorFlow Lite Optimized Converter](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md)
+`map_of_indices_to_outputs` maps indices of output tensors to the corresponding
+output data. In both cases the tensor indices should correspond to the values
+given to the [TensorFlow Lite Optimized Converter](convert/cmdline_examples.md)
 when the model was created. Be aware that the order of tensors in `input` must
 match the order given to the `TensorFlow Lite Optimized Converter`.
 
-
 The Java API also provides convenient functions for app developers to get the
 index of any model input or output using a tensor name:
 
diff --git a/tensorflow/lite/g3doc/convert/cmdline_examples.md b/tensorflow/lite/g3doc/convert/cmdline_examples.md
new file mode 100644
index 0000000000000000000000000000000000000000..de81e2cfdd41d6232ee1b76985a2e7dc9167e88f
--- /dev/null
+++ b/tensorflow/lite/g3doc/convert/cmdline_examples.md
@@ -0,0 +1,360 @@
+# Converter command-line examples
+
+This page shows how to use the TensorFlow Lite Converter in the command line.
+
+[TOC]
+
+## Command-line tools <a name="tools"></a>
+
+There are two approaches to running the converter in the command line.
+
+*   `tflite_convert`: Starting from TensorFlow 1.9, the command-line tool
+    `tflite_convert` is installed as part of the Python package. All of the
+    examples below use `tflite_convert` for simplicity.
+    *   Example: `tflite_convert --output_file=...`
+*   `bazel`: In order to run the latest version of the TensorFlow Lite Converter
+    either install the nightly build using
+    [pip](https://www.tensorflow.org/install/pip) or
+    [clone the TensorFlow repository](https://www.tensorflow.org/install/source)
+    and use `bazel`.
+    *   Example: `bazel run
+        //tensorflow/lite/python:tflite_convert --
+        --output_file=...`
+
+### Converting models prior to TensorFlow 1.9 <a name="pre_tensorflow_1.9"></a>
+
+The recommended approach for using the converter prior to TensorFlow 1.9 is the
+[Python API](python_api.md#pre_tensorflow_1.9). If a command line tool is
+desired, the `toco` command line tool was available in TensorFlow 1.7. Enter
+`toco --help` in Terminal for additional details on the command-line flags
+available. There were no command line tools in TensorFlow 1.8.
+
+## Basic examples <a name="basic"></a>
+
+The following section shows examples of how to convert a basic float-point model
+from each of the supported data formats into a TensorFlow Lite FlatBuffers.
+
+### Convert a TensorFlow GraphDef <a name="graphdef"></a>
+
+The follow example converts a basic TensorFlow GraphDef (frozen by
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py))
+into a TensorFlow Lite FlatBuffer to perform floating-point inference. Frozen
+graphs contain the variables stored in Checkpoint files as Const ops.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+tflite_convert \
+  --output_file=/tmp/foo.tflite \
+  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1
+```
+
+The value for `input_shapes` is automatically determined whenever possible.
+
+### Convert a TensorFlow SavedModel <a name="savedmodel"></a>
+
+The follow example converts a basic TensorFlow SavedModel into a Tensorflow Lite
+FlatBuffer to perform floating-point inference.
+
+```
+tflite_convert \
+  --output_file=/tmp/foo.tflite \
+  --saved_model_dir=/tmp/saved_model
+```
+
+[SavedModel](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators)
+has fewer required flags than frozen graphs due to access to additional data
+contained within the SavedModel. The values for `--input_arrays` and
+`--output_arrays` are an aggregated, alphabetized list of the inputs and outputs
+in the [SignatureDefs](https://www.tensorflow.org/serving/signature_defs) within
+the
+[MetaGraphDef](https://www.tensorflow.org/guide/saved_model#apis_to_build_and_load_a_savedmodel)
+specified by `--saved_model_tag_set`. As with the GraphDef, the value for
+`input_shapes` is automatically determined whenever possible.
+
+There is currently no support for MetaGraphDefs without a SignatureDef or for
+MetaGraphDefs that use the [`assets/`
+directory](https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory).
+
+### Convert a tf.Keras model <a name="keras"></a>
+
+The following example converts a `tf.keras` model into a TensorFlow Lite
+Flatbuffer. The `tf.keras` file must contain both the model and the weights.
+
+```
+tflite_convert \
+  --output_file=/tmp/foo.tflite \
+  --keras_model_file=/tmp/keras_model.h5
+```
+
+## Quantization
+
+### Convert a TensorFlow GraphDef for quantized inference <a name="graphdef_quant"></a>
+
+The TensorFlow Lite Converter is compatible with fixed point quantization models
+described
+[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/q
+uantize/README.md). These are float models with `FakeQuant*` ops inserted at the
+boundaries of fused layers to record min-max range information. This generates a
+quantized inference workload that reproduces the quantization behavior that was
+used during training.
+
+The following command generates a quantized TensorFlow Lite FlatBuffer from a
+"quantized" TensorFlow GraphDef.
+
+```
+tflite_convert \
+  --output_file=/tmp/foo.tflite \
+  --graph_def_file=/tmp/some_quantized_graph.pb \
+  --inference_type=QUANTIZED_UINT8 \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --mean_values=128 \
+  --std_dev_values=127
+```
+
+### Use \"dummy-quantization\" to try out quantized inference on a float graph <a name="dummy_quant"></a>
+
+In order to evaluate the possible benefit of generating a quantized graph, the
+converter allows "dummy-quantization" on float graphs. The flags
+`--default_ranges_min` and `--default_ranges_max` accept plausible values for
+the min-max ranges of the values in all arrays that do not have min-max
+information. "Dummy-quantization" will produce lower accuracy but will emulate
+the performance of a correctly quantized model.
+
+The example below contains a model using Relu6 activation functions. Therefore,
+a reasonable guess is that most activation ranges should be contained in [0, 6].
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+tflite_convert \
+  --output_file=/tmp/foo.cc \
+  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --inference_type=QUANTIZED_UINT8 \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --default_ranges_min=0 \
+  --default_ranges_max=6 \
+  --mean_values=128 \
+  --std_dev_values=127
+```
+
+## Specifying input and output arrays
+
+### Multiple input arrays
+
+The flag `input_arrays` takes in a comma-separated list of input arrays as seen
+in the example below. This is useful for models or subgraphs with multiple
+inputs.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+  | tar xzv -C /tmp
+tflite_convert \
+  --graph_def_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --output_file=/tmp/foo.tflite \
+  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
+  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
+  --output_arrays=InceptionV1/Logits/Predictions/Reshape_1
+```
+
+Note that `input_shapes` is provided as a colon-separated list. Each input shape
+corresponds to the input array at the same position in the respective list.
+
+### Multiple output arrays
+
+The flag `output_arrays` takes in a comma-separated list of output arrays as
+seen in the example below. This is useful for models or subgraphs with multiple
+outputs.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+  | tar xzv -C /tmp
+tflite_convert \
+  --graph_def_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --output_file=/tmp/foo.tflite \
+  --input_arrays=input \
+  --output_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu
+```
+
+### Specifying subgraphs
+
+Any array in the input file can be specified as an input or output array in
+order to extract subgraphs out of an input graph file. The TensorFlow Lite
+Converter discards the parts of the graph outside of the specific subgraph. Use
+[graph visualizations](#graph_visualizations) to identify the input and output
+arrays that make up the desired subgraph.
+
+The follow command shows how to extract a single fused layer out of a TensorFlow
+GraphDef.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+  | tar xzv -C /tmp
+tflite_convert \
+  --graph_def_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --output_file=/tmp/foo.pb \
+  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
+  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
+  --output_arrays=InceptionV1/InceptionV1/Mixed_3b/concat_v2
+```
+
+Note that the final representation in TensorFlow Lite FlatBuffers tends to have
+coarser granularity than the very fine granularity of the TensorFlow GraphDef
+representation. For example, while a fully-connected layer is typically
+represented as at least four separate ops in TensorFlow GraphDef (Reshape,
+MatMul, BiasAdd, Relu...), it is typically represented as a single "fused" op
+(FullyConnected) in the converter's optimized representation and in the final
+on-device representation. As the level of granularity gets coarser, some
+intermediate arrays (say, the array between the MatMul and the BiasAdd in the
+TensorFlow GraphDef) are dropped.
+
+When specifying intermediate arrays as `--input_arrays` and `--output_arrays`,
+it is desirable (and often required) to specify arrays that are meant to survive
+in the final form of the graph, after fusing. These are typically the outputs of
+activation functions (since everything in each layer until the activation
+function tends to get fused).
+
+## Logging
+
+
+## Graph visualizations
+
+The converter can export a graph to the Graphviz Dot format for easy
+visualization using either the `--output_format` flag or the
+`--dump_graphviz_dir` flag. The subsections below outline the use cases for
+each.
+
+### Using `--output_format=GRAPHVIZ_DOT` <a name="using_output_format_graphviz_dot"></a>
+
+The first way to get a Graphviz rendering is to pass `GRAPHVIZ_DOT` into
+`--output_format`. This results in a plausible visualization of the graph. This
+reduces the requirements that exist during conversion from a TensorFlow GraphDef
+to a TensorFlow Lite FlatBuffer. This may be useful if the conversion to TFLite
+is failing.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+tflite_convert \
+  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --output_file=/tmp/foo.dot \
+  --output_format=GRAPHVIZ_DOT \
+  --input_shape=1,128,128,3 \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1
+```
+
+The resulting `.dot` file can be rendered into a PDF as follows:
+
+```
+dot -Tpdf -O /tmp/foo.dot
+```
+
+And the resulting `.dot.pdf` can be viewed in any PDF viewer, but we suggest one
+with a good ability to pan and zoom across a very large page. Google Chrome does
+well in that respect.
+
+```
+google-chrome /tmp/foo.dot.pdf
+```
+
+Example PDF files are viewable online in the next section.
+
+### Using `--dump_graphviz_dir`
+
+The second way to get a Graphviz rendering is to pass the `--dump_graphviz_dir`
+flag, specifying a destination directory to dump Graphviz rendering to. Unlike
+the previous approach, this one retains the original output format. This
+provides a visualization of the actual graph resulting from a specific
+conversion process.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+tflite_convert \
+  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --output_file=/tmp/foo.tflite \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --dump_graphviz_dir=/tmp
+```
+
+This generates a few files in the destination directory. The two most important
+files are `toco_AT_IMPORT.dot` and `/tmp/toco_AFTER_TRANSFORMATIONS.dot`.
+`toco_AT_IMPORT.dot` represents the original graph containing only the
+transformations done at import time. This tends to be a complex visualization
+with limited information about each node. It is useful in situations where a
+conversion command fails.
+
+`toco_AFTER_TRANSFORMATIONS.dot` represents the graph after all transformations
+were applied to it, just before it is exported. Typically, this is a much
+smaller graph with more information about each node.
+
+As before, these can be rendered to PDFs:
+
+```
+dot -Tpdf -O /tmp/toco_*.dot
+```
+
+Sample output files can be seen here below. Note that it is the same
+`AveragePool` node in the top right of each image.
+
+<table><tr>
+  <td>
+    <a target="_blank" href="https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AT_IMPORT.dot.pdf">
+      <img src="../images/convert/sample_before.png"/>
+    </a>
+  </td>
+  <td>
+    <a target="_blank" href="https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AFTER_TRANSFORMATIONS.dot.pdf">
+      <img src="../images/convert/sample_after.png"/>
+    </a>
+  </td>
+</tr>
+<tr><td>before</td><td>after</td></tr>
+</table>
+
+### Graph "video" logging
+
+When `--dump_graphviz_dir` is used, one may additionally pass
+`--dump_graphviz_video`. This causes a graph visualization to be dumped after
+each individual graph transformation, resulting in thousands of files.
+Typically, one would then bisect into these files to understand when a given
+change was introduced in the graph.
+
+### Legend for the graph visualizations <a name="graphviz_legend"></a>
+
+*   Operators are red square boxes with the following hues of red:
+    *   Most operators are
+        <span style="background-color:#db4437;color:white;border:1px;border-style:solid;border-color:black;padding:1px">bright
+        red</span>.
+    *   Some typically heavy operators (e.g. Conv) are rendered in a
+        <span style="background-color:#c53929;color:white;border:1px;border-style:solid;border-color:black;padding:1px">darker
+        red</span>.
+*   Arrays are octagons with the following colors:
+    *   Constant arrays are
+        <span style="background-color:#4285f4;color:white;border:1px;border-style:solid;border-color:black;padding:1px">blue</span>.
+    *   Activation arrays are gray:
+        *   Internal (intermediate) activation arrays are
+            <span style="background-color:#f5f5f5;border:1px;border-style:solid;border-color:black;border:1px;border-style:solid;border-color:black;padding:1px">light
+            gray</span>.
+        *   Those activation arrays that are designated as `--input_arrays` or
+            `--output_arrays` are
+            <span style="background-color:#9e9e9e;border:1px;border-style:solid;border-color:black;padding:1px">dark
+            gray</span>.
+    *   RNN state arrays are green. Because of the way that the converter
+        represents RNN back-edges explicitly, each RNN state is represented by a
+        pair of green arrays:
+        *   The activation array that is the source of the RNN back-edge (i.e.
+            whose contents are copied into the RNN state array after having been
+            computed) is
+            <span style="background-color:#b7e1cd;border:1px;border-style:solid;border-color:black;padding:1px">light
+            green</span>.
+        *   The actual RNN state array is
+            <span style="background-color:#0f9d58;color:white;border:1px;border-style:solid;border-color:black;padding:1px">dark
+            green</span>. It is the destination of the RNN back-edge updating
+            it.
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/lite/g3doc/convert/cmdline_reference.md
similarity index 91%
rename from tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
rename to tensorflow/lite/g3doc/convert/cmdline_reference.md
index 00bc8d4ccb8aedcfe701377419e6cd41d0b59855..d72a46760d48dae46d63f1e914d8afda3f527e27 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
+++ b/tensorflow/lite/g3doc/convert/cmdline_reference.md
@@ -1,19 +1,10 @@
-# TensorFlow Lite Optimizing Converter command-line glossary
+# Converter command-line reference
 
-This page is complete reference of command-line flags used by TOCO's command
-line starting from TensorFlow 1.9 up until the most recent build of TensorFlow.
-It is complemented by the following other documents:
+This page is complete reference of command-line flags used by the TensorFlow
+Lite Converter's command line starting from TensorFlow 1.9 up until the most
+recent build of TensorFlow.
 
-*   [README](../README.md)
-*   [Command-line examples](cmdline_examples.md)
-*   [Python API examples](python_api.md)
-
-Table of contents:
-
-*   [High-level flags](#high-level-flags)
-*   [Model flags](#model-flags)
-*   [Transformation flags](#transformation-flags)
-*   [Logging flags](#logging-flags)
+[TOC]
 
 ## High-level flags
 
@@ -32,7 +23,7 @@ files. The flag `--output_file` is always required. Additionally, either
 *   `--output_format`. Type: string. Default: `TFLITE`. Specifies the format of
     the output file. Allowed values:
     *   `TFLITE`: TensorFlow Lite FlatBuffer format.
-    *   `GRAPHVIZ_DOT`: GraphViz `.dot` format containg a visualization of the
+    *   `GRAPHVIZ_DOT`: GraphViz `.dot` format containing a visualization of the
         graph after graph transformations.
         *   Note that passing `GRAPHVIZ_DOT` to `--output_format` leads to loss
             of TFLite specific transformations. Therefore, the resulting
@@ -68,7 +59,7 @@ based on index.
 *   `--input_shapes`. Type: colon-separated list of comma-separated lists of
     integers. Each comma-separated list of integers gives the shape of one of
     the input arrays specified in
-    [TensorFlow convention](https://www.tensorflow.org/versions/r1.2/programmers_guide/dims_types#shape).
+    [TensorFlow convention](https://www.tensorflow.org/guide/tensors#shape).
     *   Example: `--input_shapes=1,60,80,3` for a typical vision model means a
         batch size of 1, an input image height of 60, an input image width of
         80, and an input image depth of 3 (representing RGB channels).
diff --git a/tensorflow/lite/g3doc/convert/index.md b/tensorflow/lite/g3doc/convert/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..60fa265c295174453b1a910f5279807dd0be32cb
--- /dev/null
+++ b/tensorflow/lite/g3doc/convert/index.md
@@ -0,0 +1,25 @@
+# TensorFlow Lite Converter
+
+The TensorFlow Lite Converter takes a TensorFlow graph file and creates a graph
+file used by the TensorFlow Lite interpreter.
+
+## From model training to device deployment
+
+After a TensorFlow model is trained, the TensorFlow Lite converter uses that
+model to generate a TensorFlow Lite
+[FlatBuffer](https://google.github.io/flatbuffers/) file (`.tflite`). The
+converter supports as input:
+[SavedModels](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators),
+frozen graphs (models generated by
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)),
+and `tf.keras` HDF5 models. The TensorFlow Lite `FlatBuffer` file is deployed to
+a client device (generally a mobile or embedded device), and the TensorFlow Lite
+interpreter uses the compressed model for on-device inference. This conversion
+process is shown in the diagram below:
+
+![TFLite converter workflow](../images/convert/workflow.svg)
+
+The TensorFlow Lite Converter can be used either from [Python](python_api.md) or
+from the [command line](cmdline_examples.md). This allows you to integrate the
+conversion step into the model design workflow, ensuring the model is easy to
+convert to a mobile inference graph.
diff --git a/tensorflow/lite/g3doc/convert/python_api.md b/tensorflow/lite/g3doc/convert/python_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..4d2c7361c9f399848c161ccc706c71894625725d
--- /dev/null
+++ b/tensorflow/lite/g3doc/convert/python_api.md
@@ -0,0 +1,256 @@
+# Converter Python API guide
+
+This page provides examples on how to use the TensorFlow Lite Converter and the
+TensorFlow Lite interpreter using the Python API.
+
+Note: These docs describe the converter in the TensorFlow nightly release,
+installed using `pip install tf-nightly`. For docs describing older versions
+reference ["Converting models from TensorFlow 1.12"](#pre_tensorflow_1.12).
+
+[TOC]
+
+
+## High-level overview
+
+While the TensorFlow Lite Converter can be used from the command line, it is
+often convenient to use in a Python script as part of the model development
+pipeline. This allows you to know early that you are designing a model that can
+be targeted to devices with mobile.
+
+## API
+
+The API for converting TensorFlow models to TensorFlow Lite is
+`tf.lite.TFLiteConverter`. The API for calling the Python interpreter is
+`tf.lite.Interpreter`.
+
+`TFLiteConverter` provides class methods based on the original format of the
+model. `TFLiteConverter.from_session()` is available for GraphDefs.
+`TFLiteConverter.from_saved_model()` is available for SavedModels.
+`TFLiteConverter.from_keras_model_file()` is available for `tf.Keras` files.
+Example usages for simple float-point models are shown in
+[Basic Examples](#basic). Examples usages for more complex models is shown in
+[Complex Examples](#complex).
+
+## Basic examples <a name="basic"></a>
+
+The following section shows examples of how to convert a basic float-point model
+from each of the supported data formats into a TensorFlow Lite FlatBuffers.
+
+### Exporting a GraphDef from tf.Session <a name="basic_graphdef_sess"></a>
+
+The following example shows how to convert a TensorFlow GraphDef into a
+TensorFlow Lite FlatBuffer from a `tf.Session` object.
+
+```python
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+var = tf.get_variable("weights", dtype=tf.float32, shape=(1, 64, 64, 3))
+val = img + var
+out = tf.identity(val, name="out")
+
+with tf.Session() as sess:
+  sess.run(tf.global_variables_initializer())
+  converter = tf.lite.TFLiteConverter.from_session(sess, [img], [out])
+  tflite_model = converter.convert()
+  open("converted_model.tflite", "wb").write(tflite_model)
+```
+
+### Exporting a GraphDef from file <a name="basic_graphdef_file"></a>
+
+The following example shows how to convert a TensorFlow GraphDef into a
+TensorFlow Lite FlatBuffer when the GraphDef is stored in a file. Both `.pb` and
+`.pbtxt` files are accepted.
+
+The example uses
+[Mobilenet_1.0_224](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz).
+The function only supports GraphDefs frozen using
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py).
+
+```python
+import tensorflow as tf
+
+graph_def_file = "/path/to/Downloads/mobilenet_v1_1.0_224/frozen_graph.pb"
+input_arrays = ["input"]
+output_arrays = ["MobilenetV1/Predictions/Softmax"]
+
+converter = tf.lite.TFLiteConverter.from_frozen_graph(
+  graph_def_file, input_arrays, output_arrays)
+tflite_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_model)
+```
+
+### Exporting a SavedModel <a name="basic_savedmodel"></a>
+
+The following example shows how to convert a SavedModel into a TensorFlow Lite
+FlatBuffer.
+
+```python
+import tensorflow as tf
+
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+tflite_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_model)
+```
+
+For more complex SavedModels, the optional parameters that can be passed into
+`TFLiteConverter.from_saved_model()` are `input_arrays`, `input_shapes`,
+`output_arrays`, `tag_set` and `signature_key`. Details of each parameter are
+available by running `help(tf.lite.TFLiteConverter)`.
+
+### Exporting a tf.keras File <a name="basic_keras_file"></a>
+
+The following example shows how to convert a `tf.keras` model into a TensorFlow
+Lite FlatBuffer. This example requires
+[`h5py`](http://docs.h5py.org/en/latest/build.html) to be installed.
+
+```python
+import tensorflow as tf
+
+converter = tf.lite.TFLiteConverter.from_keras_model_file("keras_model.h5")
+tflite_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_model)
+```
+
+The `tf.keras` file must contain both the model and the weights. A comprehensive
+example including model construction can be seen below.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Generate tf.keras model.
+model = tf.keras.models.Sequential()
+model.add(tf.keras.layers.Dense(2, input_shape=(3,)))
+model.add(tf.keras.layers.RepeatVector(3))
+model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(3)))
+model.compile(loss=tf.keras.losses.MSE,
+              optimizer=tf.keras.optimizers.RMSprop(lr=0.0001),
+              metrics=[tf.keras.metrics.categorical_accuracy],
+              sample_weight_mode='temporal')
+
+x = np.random.random((1, 3))
+y = np.random.random((1, 3, 3))
+model.train_on_batch(x, y)
+model.predict(x)
+
+# Save tf.keras model in HDF5 format.
+keras_file = "keras_model.h5"
+tf.keras.models.save_model(model, keras_file)
+
+# Convert to TensorFlow Lite model.
+converter = tf.lite.TFLiteConverter.from_keras_model_file(keras_file)
+tflite_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_model)
+```
+
+## Complex examples <a name="complex"></a>
+
+For models where the default value of the attributes is not sufficient, the
+attribute's values should be set before calling `convert()`. In order to call
+any constants use `tf.lite.constants.<CONSTANT_NAME>` as seen below with
+`QUANTIZED_UINT8`. Run `help(tf.lite.TFLiteConverter)` in the Python
+terminal for detailed documentation on the attributes.
+
+Although the examples are demonstrated on GraphDefs containing only constants.
+The same logic can be applied irrespective of the input data format.
+
+### Exporting a quantized GraphDef <a name="complex_quant"></a>
+
+The following example shows how to convert a quantized model into a TensorFlow
+Lite FlatBuffer.
+
+```python
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+const = tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
+val = img + const
+out = tf.fake_quant_with_min_max_args(val, min=0., max=1., name="output")
+
+with tf.Session() as sess:
+  converter = tf.lite.TFLiteConverter.from_session(sess, [img], [out])
+  converter.inference_type = tf.lite.constants.QUANTIZED_UINT8
+  input_arrays = converter.get_input_arrays()
+  converter.quantized_input_stats = {input_arrays[0] : (0., 1.)}  # mean, std_dev
+  tflite_model = converter.convert()
+  open("converted_model.tflite", "wb").write(tflite_model)
+```
+
+## TensorFlow Lite Python interpreter <a name="interpreter"></a>
+
+### Using the interpreter from a model file <a name="interpreter_file"></a>
+
+The following example shows how to use the TensorFlow Lite Python interpreter
+when provided a TensorFlow Lite FlatBuffer file. The example also demonstrates
+how to run inference on random input data. Run
+`help(tf.lite.Interpreter)` in the Python terminal to get detailed
+documentation on the interpreter.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Load TFLite model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_path="converted_model.tflite")
+interpreter.allocate_tensors()
+
+# Get input and output tensors.
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+
+# Test model on random input data.
+input_shape = input_details[0]['shape']
+input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32)
+interpreter.set_tensor(input_details[0]['index'], input_data)
+
+interpreter.invoke()
+output_data = interpreter.get_tensor(output_details[0]['index'])
+print(output_data)
+```
+
+### Using the interpreter from model data <a name="interpreter_data"></a>
+
+The following example shows how to use the TensorFlow Lite Python interpreter
+when starting with the TensorFlow Lite Flatbuffer model previously loaded. This
+example shows an end-to-end use case, starting from building the TensorFlow
+model.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+const = tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
+val = img + const
+out = tf.identity(val, name="out")
+
+with tf.Session() as sess:
+  converter = tf.lite.TFLiteConverter.from_session(sess, [img], [out])
+  tflite_model = converter.convert()
+
+# Load TFLite model and allocate tensors.
+interpreter = tf.lite.Interpreter(model_content=tflite_model)
+interpreter.allocate_tensors()
+```
+
+## Additional instructions
+
+### Build from source code <a name="latest_package"></a>
+
+In order to run the latest version of the TensorFlow Lite Converter Python API,
+either install the nightly build with
+[pip](https://www.tensorflow.org/install/pip) (recommended) or
+[Docker](https://www.tensorflow.org/install/docker), or
+[build the pip package from source](https://www.tensorflow.org/install/source).
+
+### Converting models from TensorFlow 1.12 <a name="pre_tensorflow_1.12"></a>
+
+Reference the following table to convert TensorFlow models to TensorFlow Lite in
+and before TensorFlow 1.12. Run `help()` to get details of each API.
+
+TensorFlow Version | Python API
+------------------ | ---------------------------------
+1.12               | `tf.contrib.lite.TFLiteConverter`
+1.9-1.11           | `tf.contrib.lite.TocoConverter`
+1.7-1.8            | `tf.contrib.lite.toco_convert`
diff --git a/tensorflow/contrib/lite/g3doc/custom_operators.md b/tensorflow/lite/g3doc/custom_operators.md
similarity index 98%
rename from tensorflow/contrib/lite/g3doc/custom_operators.md
rename to tensorflow/lite/g3doc/custom_operators.md
index ee6150b60e8e8511dc5552bbbf0c71c71d80d1fe..4a22d6a67577cf5c06f2c0d32e30650fd4d4bb32 100644
--- a/tensorflow/contrib/lite/g3doc/custom_operators.md
+++ b/tensorflow/lite/g3doc/custom_operators.md
@@ -103,7 +103,7 @@ operations instead of a single operator.
     pre-allocating the memory using temporary tensors. You may need to use
     OpData struct to reference the tensor indices in other functions. See
     example in the
-    [kernel for convolution](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/conv.cc).
+    [kernel for convolution](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/conv.cc).
     A sample code snippet is below
 
     ```
@@ -164,7 +164,7 @@ for node in frozen_graph_def.node:
           tf.TensorShape([10]),
       ])
       node.attr['_output_quantized'].b = False
-tflite_model = tf.contrib.lite.toco_convert(
+tflite_model = tf.lite.toco_convert(
         frozen_graph_def,...)
 ```
 
diff --git a/tensorflow/contrib/lite/g3doc/demo_android.md b/tensorflow/lite/g3doc/demo_android.md
similarity index 90%
rename from tensorflow/contrib/lite/g3doc/demo_android.md
rename to tensorflow/lite/g3doc/demo_android.md
index c38b928684848b858e3f6cc9df6f05e31f778b05..772598d5cfd36a388d253becd7fc3026f31375c9 100644
--- a/tensorflow/contrib/lite/g3doc/demo_android.md
+++ b/tensorflow/lite/g3doc/demo_android.md
@@ -2,7 +2,7 @@
 # Android Demo App
 
 An example Android application using TensorFLow Lite is available
-[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo).
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo).
 The demo is a sample camera app that classifies images continuously
 using either a quantized Mobilenet model or a floating point Inception-v3 model.
 To run the demo, a device running Android 5.0 ( API 21) or higher is required.
@@ -41,23 +41,23 @@ app:
   [Android Studio](https://developer.android.com/studio/index.html).
 * Make sure the Android SDK version is greater than 26 and NDK version is greater
   than 14 (in the Android Studio settings).
-* Import the `tensorflow/contrib/lite/java/demo` directory as a new
+* Import the `tensorflow/lite/java/demo` directory as a new
   Android Studio project.
 * Install all the Gradle extensions it requests.
 
 Now you can build and run the demo app. 
 
-The build process downloads the quantized [Mobilenet TensorFlow Lite model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip), and unzips it into the assets directory: `tensorflow/contrib/lite/java/demo/app/src/main/assets/`.
+The build process downloads the quantized [Mobilenet TensorFlow Lite model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip), and unzips it into the assets directory: `tensorflow/lite/java/demo/app/src/main/assets/`.
 
 Some additional details are available on the
-[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
+[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo/README.md).
 
 ### Using other models
 
 To use a different model:
 * Download the floating point [Inception-v3 model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip).
 * Unzip and copy `inceptionv3_non_slim_2015.tflite` to the assets directory. 
-* Change the chosen classifier in [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java)<br>
+* Change the chosen classifier in [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java)<br>
   from: `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`<br>
   to: `classifier = new ImageClassifierFloatInception(getActivity());`.
 
@@ -114,14 +114,14 @@ android_ndk_repository(
 ```
 
 Some additional details are available on the
-[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
+[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo/README.md).
 
 ### Build the source code
 
 To build the demo app, run `bazel`:
 
 ```
-bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
+bazel build --cxxopt=--std=c++11 //tensorflow/lite/java/demo/app/src/main:TfLiteCameraDemo
 ```
 
 Caution: Because of an bazel bug, we only support building the Android demo app
diff --git a/tensorflow/contrib/lite/g3doc/demo_ios.md b/tensorflow/lite/g3doc/demo_ios.md
similarity index 95%
rename from tensorflow/contrib/lite/g3doc/demo_ios.md
rename to tensorflow/lite/g3doc/demo_ios.md
index 7579ad84a049ec592aafb16ce95a4b703ac78c5a..fbf1dd6392591183d0dc484018bba501de1851d8 100644
--- a/tensorflow/contrib/lite/g3doc/demo_ios.md
+++ b/tensorflow/lite/g3doc/demo_ios.md
@@ -38,11 +38,11 @@ instructions walk you through building and running the demo on an iOS device.
 2. Download the model files used by the demo app (this is done from inside the
    cloned directory):
 
-        sh tensorflow/contrib/lite/examples/ios/download_models.sh
+        sh tensorflow/lite/examples/ios/download_models.sh
 
 3. Install the pod to generate the workspace file:
 
-        cd tensorflow/contrib/lite/examples/ios/camera
+        cd tensorflow/lite/examples/ios/camera
         pod install
 
     If you have installed this pod before and that command doesn't work, try
diff --git a/tensorflow/lite/g3doc/devguide.md b/tensorflow/lite/g3doc/devguide.md
new file mode 100644
index 0000000000000000000000000000000000000000..fdd02638f9b78e05e77cfeb22644bfb37878a580
--- /dev/null
+++ b/tensorflow/lite/g3doc/devguide.md
@@ -0,0 +1,223 @@
+# TF Lite Developer Guide
+
+Using a TensorFlow Lite model in your mobile app requires multiple
+considerations: you must choose a pre-trained or custom model, convert the model
+to a TensorFLow Lite format, and finally, integrate the model in your app.
+
+## 1. Choose a model
+
+Depending on the use case, you can choose one of the popular open-sourced models,
+such as *InceptionV3* or *MobileNets*, and re-train these models with a custom
+data set or even build your own custom model.
+
+### Use a pre-trained model
+
+[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
+is a family of mobile-first computer vision models for TensorFlow designed to
+effectively maximize accuracy, while taking into consideration the restricted
+resources for on-device or embedded applications. MobileNets are small,
+low-latency, low-power models parameterized to meet the resource constraints for
+a variety of uses. They can be used for classification, detection, embeddings, and
+segmentation—similar to other popular large scale models, such as
+[Inception](https://arxiv.org/pdf/1602.07261.pdf). Google provides 16 pre-trained
+[ImageNet](http://www.image-net.org/challenges/LSVRC/) classification checkpoints
+for MobileNets that can be used in mobile projects of all sizes.
+
+[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model
+that achieves fairly high accuracy recognizing general objects with 1000 classes,
+for example, "Zebra", "Dalmatian", and "Dishwasher". The model extracts general
+features from input images using a convolutional neural network and classifies
+them based on those features with fully-connected and softmax layers.
+
+[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+is an on-device model that provides one-touch replies for incoming text messages
+by suggesting contextually relevant messages. The model is built specifically for
+memory constrained devices, such as watches and phones, and has been successfully
+used in Smart Replies on Android Wear. Currently, this model is Android-specific.
+
+These pre-trained models are [available for download](models.md).
+
+### Re-train Inception-V3 or MobileNet for a custom data set
+
+These pre-trained models were trained on the *ImageNet* data set which contains
+1000 predefined classes. If these classes are not sufficient for your use case,
+the model will need to be re-trained. This technique is called
+*transfer learning* and starts with a model that has been already trained on a
+problem, then retrains the model on a similar problem. Deep learning from
+scratch can take days, but transfer learning is fairly quick. In order to do
+this, you need to generate a custom data set labeled with the relevant classes.
+
+The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
+codelab walks through the re-training process step-by-step. The code supports
+both floating point and quantized inference.
+
+### Train a custom model
+
+A developer may choose to train a custom model using Tensorflow (see the
+[TensorFlow tutorials](../tutorials/) for examples of building and training
+models). If you have already written a model, the first step is to export this
+to a `tf.GraphDef` file. This is required because some formats do not store the
+model structure outside the code, and we must communicate with other parts of
+the framework. See
+[Exporting the Inference Graph](https://www.tensorflow.org/tutorials/keras/save_and_restore_models#save_the_entire_model)
+to create file for the custom model.
+
+TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to
+the [TensorFlow Lite & TensorFlow Compatibility Guide](tf_ops_compatibility.md)
+for supported operators and their usage. This set of operators will continue to
+grow in future Tensorflow Lite releases.
+
+## 2. Convert the model format
+
+The [TensorFlow Lite Converter](convert/index.md) accepts the following file
+formats:
+
+*   `SavedModel` — A `GraphDef` and checkpoint with a signature that labels
+    input and output arguments to a model. See the documentation for converting
+    SavedModels using [Python](convert/python_api.md#basic_savedmodel) or using
+    the [command line](convert/cmdline_examples.md#savedmodel).
+*   `tf.keras` - A HDF5 file containing a model with weights and input and
+    output arguments generated by `tf.Keras`. See the documentation for
+    converting HDF5 models using
+    [Python](convert/python_api.md#basic_keras_file) or using the
+    [command line](convert/cmdline_examples.md#keras).
+*   `frozen tf.GraphDef` — A subclass of `tf.GraphDef` that does not contain
+    variables. A `GraphDef` can be converted to a `frozen GraphDef` by taking a
+    checkpoint and a `GraphDef`, and converting each variable into a constant
+    using the value retrieved from the checkpoint. Instructions on converting a
+    `tf.GraphDef` to a TensorFlow Lite model are described in the next
+    subsection.
+
+### Converting a tf.GraphDef
+
+TensorFlow models may be saved as a .pb or .pbtxt `tf.GraphDef` file. In order
+to convert the `tf.GraphDef` file to TensorFlow Lite, the model must first be
+frozen. This process invovles several file formats including the `frozen
+GraphDef`:
+
+*   `tf.GraphDef` (.pb or .pbtxt) — A protobuf that represents the TensorFlow
+    training or computation graph. It contains operators, tensors, and variables
+    definitions.
+*   *checkpoint* (.ckpt) — Serialized variables from a TensorFlow graph. Since
+    this does not contain a graph structure, it cannot be interpreted by itself.
+*   *TensorFlow Lite model* (.tflite) — A serialized
+    [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
+    Lite operators and tensors for the TensorFlow Lite interpreter.
+
+You must have checkpoints that contain trained weights. The `tf.GraphDef` file
+only contains the structure of the graph. The process of merging the checkpoint
+values with the graph structure is called *freezing the graph*.
+
+`tf.GraphDef` and checkpoint files for MobileNet models are available
+[here](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md).
+
+To freeze the graph, use the following command (changing the arguments):
+
+```
+freeze_graph --input_graph=/tmp/mobilenet_v1_224.pb \
+  --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
+  --input_binary=true \
+  --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
+  --output_node_names=MobileNetV1/Predictions/Reshape_1
+```
+
+Set the `input_binary` flag to `True` when reading a binary protobuf, a `.pb`
+file. Set to `False` for a `.pbtxt` file.
+
+Set `input_graph` and `input_checkpoint` to the respective filenames. The
+`output_node_names` may not be obvious outside of the code that built the model.
+The easiest way to find them is to visualize the graph, either with
+[TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard) or
+`graphviz`.
+
+The frozen `GraphDef` is now ready for conversion to the `FlatBuffer` format
+(.tflite) for use on Android or iOS devices. For Android, the TensorFlow Lite
+Converter tool supports both float and quantized models. To convert the frozen
+`GraphDef` to the .tflite format use a command similar to the following:
+
+```
+tflite_convert \
+  --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
+  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1
+```
+
+The
+[frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)
+file used here is available for download. Setting the `input_array` and
+`output_array` arguments is not straightforward. The easiest way to find these
+values is to explore the graph using
+[TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard). Reuse
+the arguments for specifying the output nodes for inference in the
+`freeze_graph` step.
+
+### Full converter reference
+
+The [TensorFlow Lite Converter](convert/index.md) can be
+[Python](convert/python_api.md) or from the
+[command line](convert/cmdline_examples.md). This allows you to integrate the
+conversion step into the model design workflow, ensuring the model is easy to
+convert to a mobile inference graph.
+
+### Ops compatibility
+
+Refer to the [ops compatibility guide](tf_ops_compatibility.md) for
+troubleshooting help, and if that doesn't help, please
+[file an issue](https://github.com/tensorflow/tensorflow/issues).
+
+### Graph vizualization tool
+
+The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
+to visualize TensorFlow Lite models after conversion. To build the
+[visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py)
+tool:
+
+```sh
+bazel run tensorflow/lite/tools:visualize -- model.tflite model_viz.html
+```
+
+This generates an interactive HTML page listing subgraphs, operations, and a
+graph visualization.
+
+
+## 3. Use the TensorFlow Lite model for inference in a mobile app
+
+After completing the prior steps, you should now have a `.tflite` model file.
+
+### Android
+
+Since Android apps are written in Java and the core TensorFlow library is in C++,
+a JNI library is provided as an interface. This is only meant for inference—it
+provides the ability to load a graph, set up inputs, and run the model to
+calculate outputs.
+
+The open source Android demo app uses the JNI interface and is available
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/java/demo/app).
+You can also download a
+[prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+See the <a href="./demo_android.md">Android demo</a> guide for details.
+
+The <a href="./android_build.md">Android mobile</a> guide has instructions for
+installing TensorFlow on Android and setting up `bazel` and Android Studio.
+
+### iOS
+
+To integrate a TensorFlow model in an iOS app, see the
+[TensorFlow Lite for iOS](ios.md) guide and <a href="./demo_ios.md">iOS demo</a>
+guide.
+
+#### Core ML support
+
+Core ML is a machine learning framework used in Apple products. In addition to
+using Tensorflow Lite models directly in your applications, you can convert
+trained Tensorflow models to the
+[CoreML](https://developer.apple.com/machine-learning/) format for use on Apple
+devices. To use the converter, refer to the
+[Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+
+### Raspberry Pi
+
+Compile Tensorflow Lite for a Raspberry Pi by following the
+[RPi build instructions](rpi.md) This compiles a static library file (`.a`) used
+to build your app. There are plans for Python bindings and a demo app.
diff --git a/tensorflow/lite/g3doc/images/convert/sample_after.png b/tensorflow/lite/g3doc/images/convert/sample_after.png
new file mode 100644
index 0000000000000000000000000000000000000000..6c451f97903f7f70a9f28dee8abf6daeb7ec5693
Binary files /dev/null and b/tensorflow/lite/g3doc/images/convert/sample_after.png differ
diff --git a/tensorflow/lite/g3doc/images/convert/sample_before.png b/tensorflow/lite/g3doc/images/convert/sample_before.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5317ef295062e79c66430512ef1c45925858ce0
Binary files /dev/null and b/tensorflow/lite/g3doc/images/convert/sample_before.png differ
diff --git a/tensorflow/lite/g3doc/images/convert/workflow.svg b/tensorflow/lite/g3doc/images/convert/workflow.svg
new file mode 100644
index 0000000000000000000000000000000000000000..3dfcbd67d8919bd1ffe2a09d7b291a7c3182fccd
--- /dev/null
+++ b/tensorflow/lite/g3doc/images/convert/workflow.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 720.0 540.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l720.0 0l0 540.0l-720.0 0l0 -540.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l720.0 0l0 540.0l-720.0 0z" fill-rule="evenodd"/><path fill="#f3f3f3" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path fill="#434343" d="m338.89157 372.61328q0 1.0 -0.75 1.546875q-0.734375 0.53125 -2.078125 0.53125q-1.421875 0 -2.21875 -0.4375l0 -1.015625q0.515625 0.265625 1.109375 0.421875q0.59375 0.140625 1.140625 0.140625q0.84375 0 1.296875 -0.265625q0.453125 -0.265625 0.453125 -0.828125q0 -0.40625 -0.359375 -0.703125q-0.359375 -0.296875 -1.40625 -0.703125q-1.0 -0.375 -1.421875 -0.640625q-0.421875 -0.28125 -0.625 -0.625q-0.203125 -0.359375 -0.203125 -0.84375q0 -0.875 0.703125 -1.375q0.71875 -0.515625 1.953125 -0.515625q1.15625 0 2.25 0.46875l-0.375 0.875q-1.078125 -0.4375 -1.953125 -0.4375q-0.765625 0 -1.15625 0.25q-0.390625 0.234375 -0.390625 0.65625q0 0.28125 0.140625 0.484375q0.15625 0.203125 0.46875 0.390625q0.328125 0.171875 1.25 0.53125q1.28125 0.453125 1.71875 0.921875q0.453125 0.46875 0.453125 1.171875zm4.7644653 2.078125q-1.578125 0 -2.5 -0.953125q-0.90625 -0.96875 -0.90625 -2.671875q0 -1.734375 0.84375 -2.75q0.859375 -1.015625 2.28125 -1.015625q1.34375 0 2.125 0.890625q0.78125 0.875 0.78125 2.328125l0 0.671875l-4.90625 0q0.03125 1.265625 0.625 1.921875q0.609375 0.640625 1.703125 0.640625q1.140625 0 2.265625 -0.484375l0 0.96875q-0.5625 0.25 -1.078125 0.34375q-0.515625 0.109375 -1.234375 0.109375zm-0.296875 -6.484375q-0.859375 0 -1.375 0.5625q-0.5 0.5625 -0.59375 1.546875l3.734375 0q0 -1.015625 -0.453125 -1.5625q-0.453125 -0.546875 -1.3125 -0.546875zm8.024414 -0.90625q0.46875 0 0.84375 0.078125l-0.140625 1.0q-0.453125 -0.09375 -0.78125 -0.09375q-0.875 0 -1.5 0.703125q-0.609375 0.703125 -0.609375 1.75l0 3.828125l-1.078125 0l0 -7.125l0.890625 0l0.125 1.3125l0.0625 0q0.390625 -0.703125 0.953125 -1.078125q0.5625 -0.375 1.234375 -0.375zm3.7374573 7.265625l-2.703125 -7.125l1.15625 0l1.53125 4.21875q0.53125 1.484375 0.625 1.9375l0.046875 0q0.0625 -0.359375 0.4375 -1.4375q0.390625 -1.078125 1.71875 -4.71875l1.15625 0l-2.703125 7.125l-1.265625 0zm8.1302185 0.125q-1.578125 0 -2.5 -0.953125q-0.90625 -0.96875 -0.90625 -2.671875q0 -1.734375 0.84375 -2.75q0.859375 -1.015625 2.28125 -1.015625q1.34375 0 2.125 0.890625q0.78125 0.875 0.78125 2.328125l0 0.671875l-4.90625 0q0.03125 1.265625 0.625 1.921875q0.609375 0.640625 1.703125 0.640625q1.140625 0 2.265625 -0.484375l0 0.96875q-0.5625 0.25 -1.078125 0.34375q-0.515625 0.109375 -1.234375 0.109375zm-0.296875 -6.484375q-0.859375 0 -1.375 0.5625q-0.5 0.5625 -0.59375 1.546875l3.734375 0q0 -1.015625 -0.453125 -1.5625q-0.453125 -0.546875 -1.3125 -0.546875zm8.024414 -0.90625q0.46875 0 0.84375 0.078125l-0.140625 1.0q-0.453125 -0.09375 -0.78125 -0.09375q-0.875 0 -1.5 0.703125q-0.609375 0.703125 -0.609375 1.75l0 3.828125l-1.078125 0l0 -7.125l0.890625 0l0.125 1.3125l0.0625 0q0.390625 -0.703125 0.953125 -1.078125q0.5625 -0.375 1.234375 -0.375z" fill-rule="nonzero"/><path fill="#d9d9d9" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path fill="#434343" d="m134.67014 56.769344l-1.0 0l0 -7.6875l-2.703125 0l0 -0.875l6.421875 0l0 0.875l-2.71875 0l0 7.6875zm6.576172 0.125q-1.421875 0 -2.25 -0.875q-0.828125 -0.875 -0.828125 -2.40625q0 -1.5625 0.765625 -2.46875q0.765625 -0.921875 2.0625 -0.921875q1.203125 0 1.90625 0.796875q0.703125 0.796875 0.703125 2.09375l0 0.625l-4.421875 0q0.03125 1.125 0.5625 1.71875q0.546875 0.578125 1.53125 0.578125q1.03125 0 2.046875 -0.4375l0 0.875q-0.515625 0.21875 -0.984375 0.3125q-0.453125 0.109375 -1.09375 0.109375zm-0.265625 -5.84375q-0.78125 0 -1.25 0.5q-0.453125 0.5 -0.53125 1.390625l3.359375 0q0 -0.921875 -0.40625 -1.40625q-0.40625 -0.484375 -1.171875 -0.484375zm8.669922 5.71875l0 -4.15625q0 -0.78125 -0.359375 -1.171875q-0.34375 -0.390625 -1.109375 -0.390625q-1.015625 0 -1.484375 0.546875q-0.46875 0.546875 -0.46875 1.796875l0 3.375l-0.96875 0l0 -6.421875l0.796875 0l0.15625 0.875l0.046875 0q0.296875 -0.46875 0.828125 -0.734375q0.546875 -0.265625 1.203125 -0.265625q1.171875 0 1.75 0.5625q0.59375 0.5625 0.59375 1.796875l0 4.1875l-0.984375 0zm7.1152344 -1.75q0 0.890625 -0.671875 1.390625q-0.65625 0.484375 -1.875 0.484375q-1.265625 0 -1.984375 -0.40625l0 -0.90625q0.46875 0.234375 0.984375 0.375q0.53125 0.125 1.03125 0.125q0.765625 0 1.171875 -0.234375q0.40625 -0.25 0.40625 -0.75q0 -0.375 -0.328125 -0.640625q-0.3125 -0.265625 -1.265625 -0.625q-0.890625 -0.34375 -1.28125 -0.59375q-0.375 -0.25 -0.5625 -0.5625q-0.171875 -0.3125 -0.171875 -0.75q0 -0.78125 0.640625 -1.234375q0.640625 -0.46875 1.75 -0.46875q1.03125 0 2.03125 0.421875l-0.359375 0.796875q-0.953125 -0.390625 -1.75 -0.390625q-0.6875 0 -1.046875 0.21875q-0.34375 0.203125 -0.34375 0.59375q0 0.25 0.125 0.4375q0.140625 0.171875 0.421875 0.34375q0.296875 0.15625 1.140625 0.46875q1.140625 0.421875 1.53125 0.84375q0.40625 0.421875 0.40625 1.0625zm7.1308594 -1.46875q0 1.578125 -0.796875 2.46875q-0.78125 0.875 -2.1875 0.875q-0.859375 0 -1.53125 -0.40625q-0.65625 -0.40625 -1.03125 -1.15625q-0.359375 -0.765625 -0.359375 -1.78125q0 -1.5625 0.78125 -2.4375q0.796875 -0.890625 2.1875 -0.890625q1.34375 0 2.140625 0.90625q0.796875 0.890625 0.796875 2.421875zm-4.890625 0q0 1.234375 0.484375 1.875q0.5 0.640625 1.453125 0.640625q0.953125 0 1.4375 -0.640625q0.5 -0.640625 0.5 -1.875q0 -1.21875 -0.5 -1.859375q-0.484375 -0.640625 -1.453125 -0.640625q-0.953125 0 -1.4375 0.640625q-0.484375 0.625 -0.484375 1.859375zm9.529297 -3.328125q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm3.1015625 6.546875l-1.0 0l0 -8.5625l4.78125 0l0 0.875l-3.78125 0l0 3.140625l3.546875 0l0 0.890625l-3.546875 0l0 3.65625zm6.0214844 0l-0.96875 0l0 -9.125l0.96875 0l0 9.125zm7.6132812 -3.21875q0 1.578125 -0.796875 2.46875q-0.78125 0.875 -2.1875 0.875q-0.859375 0 -1.53125 -0.40625q-0.65625 -0.40625 -1.03125 -1.15625q-0.359375 -0.765625 -0.359375 -1.78125q0 -1.5625 0.78125 -2.4375q0.796875 -0.890625 2.1875 -0.890625q1.34375 0 2.140625 0.90625q0.796875 0.890625 0.796875 2.421875zm-4.890625 0q0 1.234375 0.484375 1.875q0.5 0.640625 1.453125 0.640625q0.953125 0 1.4375 -0.640625q0.5 -0.640625 0.5 -1.875q0 -1.21875 -0.5 -1.859375q-0.484375 -0.640625 -1.453125 -0.640625q-0.953125 0 -1.4375 0.640625q-0.484375 0.625 -0.484375 1.859375zm11.841797 3.21875l-1.1875 -3.765625q-0.109375 -0.34375 -0.40625 -1.578125l-0.046875 0q-0.234375 1.03125 -0.421875 1.59375l-1.203125 3.75l-1.125 0l-1.75 -6.421875l1.015625 0q0.625 2.421875 0.9375 3.6875q0.328125 1.265625 0.375 1.703125l0.046875 0q0.0625 -0.328125 0.203125 -0.859375q0.15625 -0.53125 0.265625 -0.84375l1.171875 -3.6875l1.046875 0l1.15625 3.6875q0.328125 1.0 0.4375 1.6875l0.046875 0q0.03125 -0.203125 0.125 -0.640625q0.109375 -0.453125 1.234375 -4.734375l1.0 0l-1.765625 6.421875l-1.15625 0zm12.732422 0l-1.0625 -2.71875l-3.4375 0l-1.046875 2.71875l-1.015625 0l3.390625 -8.609375l0.828125 0l3.375 8.609375l-1.03125 0zm-1.375 -3.625l-1.0 -2.65625q-0.1875 -0.5 -0.390625 -1.234375q-0.140625 0.5625 -0.375 1.234375l-1.0 2.65625l2.765625 0zm9.015625 -2.453125q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm6.2246094 4.21875l0 -8.5625l1.0 0l0 8.5625l-1.0 0zm7.345703 -1.75q0 0.890625 -0.671875 1.390625q-0.65625 0.484375 -1.875 0.484375q-1.265625 0 -1.984375 -0.40625l0 -0.90625q0.46875 0.234375 0.984375 0.375q0.53125 0.125 1.03125 0.125q0.765625 0 1.171875 -0.234375q0.40625 -0.25 0.40625 -0.75q0 -0.375 -0.328125 -0.640625q-0.3125 -0.265625 -1.265625 -0.625q-0.890625 -0.34375 -1.28125 -0.59375q-0.375 -0.25 -0.5625 -0.5625q-0.171875 -0.3125 -0.171875 -0.75q0 -0.78125 0.640625 -1.234375q0.640625 -0.46875 1.75 -0.46875q1.03125 0 2.03125 0.421875l-0.359375 0.796875q-0.953125 -0.390625 -1.75 -0.390625q-0.6875 0 -1.046875 0.21875q-0.34375 0.203125 -0.34375 0.59375q0 0.25 0.125 0.4375q0.140625 0.171875 0.421875 0.34375q0.296875 0.15625 1.140625 0.46875q1.140625 0.421875 1.53125 0.84375q0.40625 0.421875 0.40625 1.0625z" fill-rule="nonzero"/><path fill="#f3f3f3" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path fill="#434343" d="m409.75067 374.69928q-1.546875 0 -2.40625 -0.953125q-0.84375 -0.953125 -0.84375 -2.6875q0 -1.796875 0.859375 -2.765625q0.859375 -0.984375 2.453125 -0.984375q0.515625 0 1.03125 0.109375q0.515625 0.109375 0.8125 0.265625l-0.328125 0.921875q-0.359375 -0.15625 -0.796875 -0.25q-0.421875 -0.09375 -0.734375 -0.09375q-2.171875 0 -2.171875 2.78125q0 1.3125 0.515625 2.015625q0.53125 0.703125 1.578125 0.703125q0.890625 0 1.828125 -0.390625l0 0.96875q-0.71875 0.359375 -1.796875 0.359375zm4.5639343 -0.125l-1.078125 0l0 -10.125l1.078125 0l0 10.125zm3.3710632 0l-1.078125 0l0 -7.125l1.078125 0l0 7.125zm-1.171875 -9.0625q0 -0.375 0.1875 -0.546875q0.1875 -0.171875 0.453125 -0.171875q0.265625 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.546875q0 0.359375 -0.1875 0.546875q-0.1875 0.171875 -0.453125 0.171875q-0.265625 0 -0.453125 -0.171875q-0.1875 -0.1875 -0.1875 -0.546875zm6.4804688 9.1875q-1.578125 0 -2.5 -0.953125q-0.90625 -0.96875 -0.90625 -2.671875q0 -1.734375 0.84375 -2.75q0.859375 -1.015625 2.28125 -1.015625q1.34375 0 2.125 0.890625q0.78125 0.875 0.78125 2.328125l0 0.671875l-4.90625 0q0.03125 1.265625 0.625 1.921875q0.609375 0.640625 1.703125 0.640625q1.140625 0 2.265625 -0.484375l0 0.96875q-0.5625 0.25 -1.078125 0.34375q-0.515625 0.109375 -1.234375 0.109375zm-0.296875 -6.484375q-0.859375 0 -1.375 0.5625q-0.5 0.5625 -0.59375 1.546875l3.734375 0q0 -1.015625 -0.453125 -1.5625q-0.453125 -0.546875 -1.3125 -0.546875zm9.649414 6.359375l0 -4.609375q0 -0.875 -0.40625 -1.296875q-0.390625 -0.4375 -1.234375 -0.4375q-1.125 0 -1.65625 0.609375q-0.515625 0.59375 -0.515625 2.0l0 3.734375l-1.078125 0l0 -7.125l0.890625 0l0.171875 0.96875l0.046875 0q0.328125 -0.53125 0.921875 -0.8125q0.609375 -0.296875 1.34375 -0.296875q1.296875 0 1.9375 0.625q0.65625 0.625 0.65625 1.984375l0 4.65625l-1.078125 0zm5.602295 -0.765625q0.28125 0 0.546875 -0.03125q0.265625 -0.046875 0.421875 -0.09375l0 0.828125q-0.171875 0.078125 -0.515625 0.125q-0.34375 0.0625 -0.609375 0.0625q-2.078125 0 -2.078125 -2.171875l0 -4.25l-1.015625 0l0 -0.515625l1.015625 -0.453125l0.453125 -1.515625l0.625 0l0 1.65625l2.078125 0l0 0.828125l-2.078125 0l0 4.203125q0 0.640625 0.3125 0.984375q0.3125 0.34375 0.84375 0.34375z" fill-rule="nonzero"/><path fill="#f4cccc" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path fill="#000000" d="m237.07405 214.53435l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm5.906296 2.296875l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm4.281296 4.421875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm4.796921 3.40625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm2.1250305 2.6875l3.40625 -4.140625l-3.140625 0l0 -0.75l4.25 0l0 0.59375l-3.328125 4.140625l3.09375 0q0.1875 0 0.28125 -0.015625q0.09375 -0.03125 0.15625 -0.09375l0.09375 0l0 0.859375l-4.8125 0l0 -0.59375zm8.671936 0.71875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm1.9531555 4.234375l0 -0.71875l5.171875 0l0 0.71875l-5.171875 0zm11.281311 -5.875q-0.109375 -0.015625 -0.3125 -0.015625q-0.578125 0 -0.96875 0.25q0.203125 0.421875 0.203125 0.90625q0 0.546875 -0.234375 0.984375q-0.234375 0.4375 -0.671875 0.6875q-0.4375 0.25 -1.0 0.25q-0.46875 0 -0.859375 -0.171875q-0.21875 0.265625 -0.21875 0.5q0 0.296875 0.359375 0.421875q0.375 0.125 1.328125 0.125q1.15625 0 1.578125 0.359375q0.4375 0.34375 0.4375 1.03125q0 0.703125 -0.578125 1.171875q-0.578125 0.484375 -1.796875 0.484375q-1.1875 0 -1.859375 -0.359375q-0.65625 -0.359375 -0.65625 -1.0625q0 -0.421875 0.234375 -0.734375q0.25 -0.3125 0.625 -0.5q-0.40625 -0.265625 -0.40625 -0.71875q0 -0.484375 0.5 -1.015625q-0.296875 -0.265625 -0.453125 -0.640625q-0.15625 -0.375 -0.15625 -0.8125q0 -0.53125 0.234375 -0.96875q0.234375 -0.4375 0.671875 -0.6875q0.4375 -0.25 1.015625 -0.25q0.875 0 1.390625 0.5625q0.296875 -0.265625 0.59375 -0.390625q0.3125 -0.125 0.6875 -0.125l0.203125 0.015625l0.109375 0.703125zm-2.984375 2.34375q0.515625 0 0.84375 -0.34375q0.34375 -0.359375 0.34375 -0.859375q0 -0.5 -0.34375 -0.84375q-0.328125 -0.359375 -0.84375 -0.359375q-0.515625 0 -0.859375 0.359375q-0.34375 0.34375 -0.34375 0.84375q0 0.5 0.34375 0.859375q0.34375 0.34375 0.859375 0.34375zm1.828125 3.125q0 -0.296875 -0.109375 -0.46875q-0.09375 -0.15625 -0.390625 -0.234375q-0.296875 -0.078125 -0.90625 -0.078125q-0.859375 0 -1.375 -0.15625q-0.296875 0.203125 -0.40625 0.40625q-0.109375 0.203125 -0.109375 0.53125q0 0.40625 0.453125 0.640625q0.453125 0.234375 1.265625 0.234375q0.796875 0 1.1875 -0.234375q0.390625 -0.234375 0.390625 -0.640625zm6.5625305 -4.890625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm3.937561 -1.3125q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm4.1094055 -4.953125l0.875 0l0 1.078125q0.1875 -0.59375 0.625 -0.890625q0.4375 -0.3125 1.046875 -0.3125q0.625 0 1.140625 0.328125q0.53125 0.3125 0.84375 0.953125q0.3125 0.625 0.3125 1.546875q0 0.921875 -0.328125 1.59375q-0.328125 0.65625 -0.859375 1.0q-0.53125 0.328125 -1.140625 0.328125q-0.484375 0 -0.921875 -0.21875q-0.421875 -0.234375 -0.703125 -0.640625l0 2.71875l-0.890625 0l0 -7.484375zm2.375 4.859375q0.65625 0 1.109375 -0.5q0.453125 -0.5 0.453125 -1.625q0 -1.015625 -0.40625 -1.5625q-0.390625 -0.546875 -1.125 -0.546875q-0.671875 0 -1.109375 0.578125q-0.421875 0.5625 -0.421875 1.71875q0.03125 0.953125 0.421875 1.453125q0.390625 0.484375 1.078125 0.484375zm3.781311 -7.359375l0.953125 0l0 0.09375q-0.078125 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.9375q0.328125 -0.5 0.8125 -0.796875q0.484375 -0.296875 0.984375 -0.296875q0.78125 0 1.21875 0.546875q0.453125 0.546875 0.453125 1.734375l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.390625 0 -0.765625 0.21875q-0.375 0.203125 -0.625 0.59375q-0.234375 0.390625 -0.234375 0.890625l0 3.15625l-0.84375 0l0 -7.984375zm7.9844055 8.109375q-0.296875 0 -0.5 -0.203125q-0.203125 -0.203125 -0.203125 -0.46875q0 -0.28125 0.203125 -0.484375q0.203125 -0.203125 0.5 -0.203125q0.265625 0 0.46875 0.203125q0.21875 0.203125 0.21875 0.484375q0 0.265625 -0.21875 0.46875q-0.203125 0.203125 -0.46875 0.203125zm3.859436 -5.609375l0.875 0l0 1.078125q0.1875 -0.59375 0.625 -0.890625q0.4375 -0.3125 1.046875 -0.3125q0.625 0 1.140625 0.328125q0.53125 0.3125 0.84375 0.953125q0.3125 0.625 0.3125 1.546875q0 0.921875 -0.328125 1.59375q-0.328125 0.65625 -0.859375 1.0q-0.53125 0.328125 -1.140625 0.328125q-0.484375 0 -0.921875 -0.21875q-0.421875 -0.234375 -0.703125 -0.640625l0 2.71875l-0.890625 0l0 -7.484375zm2.375 4.859375q0.65625 0 1.109375 -0.5q0.453125 -0.5 0.453125 -1.625q0 -1.015625 -0.40625 -1.5625q-0.390625 -0.546875 -1.125 -0.546875q-0.671875 0 -1.109375 0.578125q-0.421875 0.5625 -0.421875 1.71875q0.03125 0.953125 0.421875 1.453125q0.390625 0.484375 1.078125 0.484375zm8.2187805 -4.859375q-0.0625 0.5 -0.21875 0.984375q-0.15625 0.46875 -0.421875 1.265625l-1.375 3.859375q-0.265625 0.765625 -0.71875 1.109375q-0.4375 0.359375 -1.046875 0.359375q-0.78125 0 -1.265625 -0.46875l0.375 -0.59375l0 -0.015625l0.015625 0.015625q0.203125 0.1875 0.40625 0.265625q0.21875 0.078125 0.515625 0.078125q0.4375 0 0.703125 -0.3125q0.265625 -0.3125 0.515625 -1.03125l-2.25 -5.515625l0.90625 0l1.703125 4.390625l0.71875 -2.203125q0.265625 -0.828125 0.390625 -1.265625q0.125 -0.4375 0.1875 -0.921875l0.859375 0zm-4.53125 6.46875q0 0.03125 -0.03125 0.03125l-0.09375 -0.03125l0.046875 -0.0625l0.078125 0.046875l0 0.015625zm-0.140625 0.03125q-0.046875 -0.0625 -0.03125 -0.046875q0.03125 0.015625 0.046875 0.015625l-0.015625 0.03125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m132.49081 319.42978l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m132.49081 319.42978l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m160.73871 332.44586l0 -6.734375l-2.125 0l0 -0.75l5.1875 0l0 0.75l-2.21875 0l0 6.734375l-0.84375 0zm4.437546 0l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm5.859421 -7.46875l0.984375 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.234375l3.59375 0l0 0.71875l-4.46875 0l0 -7.46875zm6.406296 7.46875l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm8.343796 6.140625q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375458 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" d="m152.62926 345.8521q0.4375 0 0.84375 -0.25q0.40625 -0.25 0.65625 -0.671875l0.625 0.40625q-0.375 0.625 -0.875 0.9375q-0.5 0.296875 -1.21875 0.296875q-0.84375 0 -1.5 -0.40625q-0.65625 -0.421875 -1.046875 -1.265625q-0.390625 -0.859375 -0.390625 -2.15625q0 -1.375 0.421875 -2.234375q0.421875 -0.859375 1.0625 -1.21875q0.65625 -0.375 1.40625 -0.375q0.78125 0 1.359375 0.390625q0.59375 0.390625 0.890625 1.078125l-0.71875 0.34375q-0.015625 0 -0.015625 0q0 -0.015625 0 -0.015625q-0.3125 -0.625 -0.703125 -0.875q-0.375 -0.25 -0.84375 -0.25q-0.9375 0 -1.484375 0.828125q-0.546875 0.8125 -0.546875 2.28125q0 0.921875 0.265625 1.640625q0.28125 0.71875 0.75 1.125q0.484375 0.390625 1.0625 0.390625zm1.375 -5.171875q0.015625 -0.015625 0.015625 -0.015625q0.03125 0 0.109375 0.0625l-0.09375 0.046875l-0.03125 -0.09375zm0.140625 0.046875q0.046875 0.109375 -0.015625 0l0.015625 0zm4.093796 5.8125q-0.734375 0 -1.3125 -0.359375q-0.578125 -0.359375 -0.90625 -1.0q-0.3125 -0.65625 -0.3125 -1.484375q0 -0.828125 0.3125 -1.46875q0.328125 -0.65625 0.90625 -1.015625q0.578125 -0.375 1.3125 -0.375q0.734375 0 1.3125 0.375q0.578125 0.359375 0.890625 1.015625q0.328125 0.640625 0.328125 1.46875q0 0.828125 -0.328125 1.484375q-0.3125 0.640625 -0.890625 1.0q-0.578125 0.359375 -1.3125 0.359375zm0 -0.71875q0.46875 0 0.828125 -0.265625q0.375 -0.28125 0.578125 -0.765625q0.21875 -0.484375 0.21875 -1.109375q0 -0.9375 -0.46875 -1.53125q-0.453125 -0.59375 -1.15625 -0.59375q-0.703125 0 -1.171875 0.59375q-0.453125 0.59375 -0.453125 1.53125q0 0.625 0.203125 1.109375q0.21875 0.484375 0.578125 0.765625q0.375 0.265625 0.84375 0.265625zm3.8594208 -4.859375l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375zm10.593796 0q-0.1875 0.96875 -0.796875 2.40625l-1.328125 3.078125l-0.671875 0l-2.171875 -5.484375l0.859375 0l1.6875 4.296875l0.890625 -2.03125q0.546875 -1.25 0.71875 -2.265625l0.8125 0zm3.8125458 5.609375q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640671 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm6.343796 3.796875q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375458 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640671 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625z" fill-rule="nonzero"/><path fill="#d9ead3" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m312.37085 332.41437l0 -6.734375l-2.125 0l0 -0.75l5.1875 0l0 0.75l-2.21875 0l0 6.734375l-0.84375 0zm4.4375305 0l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm5.859436 -7.46875l0.984375 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.234375l3.59375 0l0 0.71875l-4.46875 0l0 -7.46875zm6.4062805 7.46875l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm8.343811 6.140625q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.9375305 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" d="m298.80826 346.41437l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm6.0156555 0l0 -0.703125l1.609375 0l0 -6.578125l-1.546875 0l0 -0.703125l2.421875 0l0 7.28125l1.609375 0l0 0.703125l-4.09375 0zm7.968811 -5.609375q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm8.7187805 0.03125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.859436 0.625q-0.484375 0 -0.90625 -0.21875q-0.421875 -0.21875 -0.703125 -0.625l-0.3125 0.71875l-0.546875 0l0 -7.984375l0.984375 0l0 0.09375q-0.078125 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.8125q0.265625 -0.453125 0.71875 -0.703125q0.453125 -0.265625 0.921875 -0.265625q1.03125 0 1.640625 0.71875q0.609375 0.71875 0.609375 2.09375q0 0.9375 -0.328125 1.609375q-0.3125 0.65625 -0.84375 0.984375q-0.53125 0.328125 -1.125 0.328125zm-0.109375 -0.765625q0.65625 0 1.078125 -0.5q0.4375 -0.515625 0.4375 -1.609375q0 -1.046875 -0.40625 -1.578125q-0.390625 -0.546875 -1.078125 -0.546875q-0.671875 0 -1.09375 0.609375q-0.421875 0.59375 -0.421875 1.546875q0 2.078125 1.484375 2.078125zm5.5781555 0.765625q-0.875 0 -1.390625 -0.640625q-0.515625 -0.640625 -0.5 -1.90625l0.015625 -3.0625l0.84375 0l0 3.0625q0 0.984375 0.328125 1.421875q0.34375 0.4375 0.921875 0.4375q0.609375 0 1.03125 -0.484375q0.4375 -0.484375 0.4375 -1.40625l0 -3.03125l0.84375 0l0 4.625q0 0.296875 0.015625 0.484375q0.015625 0.1875 0.09375 0.375l-0.828125 0q-0.078125 -0.1875 -0.09375 -0.375q-0.015625 -0.1875 -0.015625 -0.46875q-0.265625 0.453125 -0.71875 0.71875q-0.453125 0.25 -0.984375 0.25zm8.562561 -6.78125l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm6.0156555 -0.015625l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm4.062561 6.765625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.6406555 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m441.27414 332.41437l0 -6.734375l-2.125 0l0 -0.75l5.1875 0l0 0.75l-2.21875 0l0 6.734375l-0.84375 0zm4.437561 0l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm5.8594055 -7.46875l0.984375 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.234375l3.59375 0l0 0.71875l-4.46875 0l0 -7.46875zm6.406311 7.46875l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm8.3437805 6.140625q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.937561 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0z" fill-rule="nonzero"/><path fill="#000000" d="m424.97714 346.41437l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm3.875061 1.15625l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375zm10.4687805 4.984375q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.937561 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.6406555 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm1.734436 -1.1875l0.875 0l0 1.078125q0.1875 -0.59375 0.625 -0.890625q0.4375 -0.3125 1.046875 -0.3125q0.625 0 1.140625 0.328125q0.53125 0.3125 0.84375 0.953125q0.3125 0.625 0.3125 1.546875q0 0.921875 -0.328125 1.59375q-0.328125 0.65625 -0.859375 1.0q-0.53125 0.328125 -1.140625 0.328125q-0.484375 0 -0.921875 -0.21875q-0.421875 -0.234375 -0.703125 -0.640625l0 2.71875l-0.890625 0l0 -7.484375zm2.375 4.859375q0.65625 0 1.109375 -0.5q0.453125 -0.5 0.453125 -1.625q0 -1.015625 -0.40625 -1.5625q-0.390625 -0.546875 -1.125 -0.546875q-0.671875 0 -1.109375 0.578125q-0.421875 0.5625 -0.421875 1.71875q0.03125 0.953125 0.421875 1.453125q0.390625 0.484375 1.078125 0.484375zm8.0156555 -3.71875l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm4.281311 4.421875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.8594055 2.78125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.937561 0.625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.6406555 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m371.61902 334.89435l41.417297 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m371.61902 334.89435l37.990234 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m409.60925 334.89435l-1.1245728 1.1246033l3.0897522 -1.1246033l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m584.8932 293.06122l2.90625 0l0 4.15625q-0.6875 0.21875 -1.390625 0.328125q-0.703125 0.125 -1.625 0.125q-1.9375 0 -3.03125 -1.15625q-1.078125 -1.171875 -1.078125 -3.25q0 -1.34375 0.53125 -2.34375q0.546875 -1.0 1.546875 -1.53125q1.015625 -0.53125 2.359375 -0.53125q1.375 0 2.5625 0.5l-0.390625 0.875q-1.15625 -0.484375 -2.234375 -0.484375q-1.5625 0 -2.453125 0.9375q-0.875 0.921875 -0.875 2.578125q0 1.734375 0.84375 2.640625q0.859375 0.890625 2.5 0.890625q0.890625 0 1.734375 -0.21875l0 -2.625l-1.90625 0l0 -0.890625zm10.392578 -1.59375q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm12.693359 -4.34375l0 5.53125q0 1.46875 -0.890625 2.3125q-0.875 0.84375 -2.421875 0.84375q-1.546875 0 -2.390625 -0.84375q-0.84375 -0.859375 -0.84375 -2.328125l0 -5.515625l1.0 0l0 5.578125q0 1.078125 0.578125 1.65625q0.59375 0.578125 1.71875 0.578125q1.09375 0 1.671875 -0.578125q0.59375 -0.578125 0.59375 -1.65625l0 -5.578125l0.984375 0z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m585.3668 331.6175q-1.40625 0 -2.234375 0.9375q-0.8125 0.9375 -0.8125 2.578125q0 1.671875 0.78125 2.59375q0.796875 0.921875 2.25 0.921875q0.90625 0 2.046875 -0.328125l0 0.875q-0.890625 0.34375 -2.1875 0.34375q-1.890625 0 -2.921875 -1.15625q-1.03125 -1.15625 -1.03125 -3.265625q0 -1.328125 0.484375 -2.3125q0.5 -1.0 1.4375 -1.53125q0.9375 -0.546875 2.203125 -0.546875q1.34375 0 2.359375 0.484375l-0.421875 0.859375q-0.984375 -0.453125 -1.953125 -0.453125zm9.3359375 1.71875q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm12.693359 -4.34375l0 5.53125q0 1.46875 -0.890625 2.3125q-0.875 0.84375 -2.421875 0.84375q-1.546875 0 -2.390625 -0.84375q-0.84375 -0.859375 -0.84375 -2.328125l0 -5.515625l1.0 0l0 5.578125q0 1.078125 0.578125 1.65625q0.59375 0.578125 1.71875 0.578125q1.09375 0 1.671875 -0.578125q0.59375 -0.578125 0.59375 -1.65625l0 -5.578125l0.984375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m219.98688 334.92584l64.12598 -0.03149414" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.98688 334.92584l60.698914 -0.029815674" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m280.68576 334.89603l-1.1240234 1.1251526l3.0892334 -1.1260986l-3.090332 -1.1230774z" fill-rule="evenodd"/><path fill="#d9ead3" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m437.52493 135.68242l77.480316 0l0 31.748032l-77.480316 0z" fill-rule="evenodd"/><path fill="#000000" d="m454.54056 151.84303q0 2.109375 -1.15625 3.234375q-1.140625 1.125 -3.3125 1.125l-2.375 0l0 -8.5625l2.625 0q2.0 0 3.109375 1.109375q1.109375 1.09375 1.109375 3.09375zm-1.046875 0.03125q0 -1.671875 -0.84375 -2.515625q-0.84375 -0.859375 -2.5 -0.859375l-1.453125 0l0 6.84375l1.21875 0q1.78125 0 2.671875 -0.875q0.90625 -0.875 0.90625 -2.59375zm6.763672 4.328125l-0.203125 -0.921875l-0.046875 0q-0.46875 0.609375 -0.953125 0.828125q-0.46875 0.21875 -1.1875 0.21875q-0.953125 0 -1.5 -0.5q-0.546875 -0.5 -0.546875 -1.40625q0 -1.9375 3.109375 -2.03125l1.09375 -0.03125l0 -0.40625q0 -0.75 -0.328125 -1.109375q-0.3125 -0.359375 -1.03125 -0.359375q-0.8125 0 -1.8125 0.484375l-0.3125 -0.75q0.484375 -0.25 1.046875 -0.390625q0.5625 -0.15625 1.140625 -0.15625q1.140625 0 1.6875 0.515625q0.5625 0.5 0.5625 1.625l0 4.390625l-0.71875 0zm-2.203125 -0.6875q0.90625 0 1.421875 -0.5q0.53125 -0.5 0.53125 -1.390625l0 -0.578125l-0.984375 0.03125q-1.15625 0.046875 -1.671875 0.375q-0.5 0.3125 -0.5 0.984375q0 0.53125 0.3125 0.8125q0.3125 0.265625 0.890625 0.265625zm7.001953 0q0.25 0 0.484375 -0.03125q0.25 -0.046875 0.390625 -0.078125l0 0.734375q-0.15625 0.078125 -0.46875 0.125q-0.296875 0.0625 -0.546875 0.0625q-1.859375 0 -1.859375 -1.96875l0 -3.828125l-0.921875 0l0 -0.46875l0.921875 -0.40625l0.40625 -1.359375l0.5625 0l0 1.484375l1.859375 0l0 0.75l-1.859375 0l0 3.78125q0 0.578125 0.265625 0.890625q0.28125 0.3125 0.765625 0.3125zm6.111328 0.6875l-0.203125 -0.921875l-0.046875 0q-0.46875 0.609375 -0.953125 0.828125q-0.46875 0.21875 -1.1875 0.21875q-0.953125 0 -1.5 -0.5q-0.546875 -0.5 -0.546875 -1.40625q0 -1.9375 3.109375 -2.03125l1.09375 -0.03125l0 -0.40625q0 -0.75 -0.328125 -1.109375q-0.3125 -0.359375 -1.03125 -0.359375q-0.8125 0 -1.8125 0.484375l-0.3125 -0.75q0.484375 -0.25 1.046875 -0.390625q0.5625 -0.15625 1.140625 -0.15625q1.140625 0 1.6875 0.515625q0.5625 0.5 0.5625 1.625l0 4.390625l-0.71875 0zm-2.203125 -0.6875q0.90625 0 1.421875 -0.5q0.53125 -0.5 0.53125 -1.390625l0 -0.578125l-0.984375 0.03125q-1.15625 0.046875 -1.671875 0.375q-0.5 0.3125 -0.5 0.984375q0 0.53125 0.3125 0.8125q0.3125 0.265625 0.890625 0.265625zm10.822266 0.6875l-1.0 0l0 -7.6875l-2.703125 0l0 -0.875l6.421875 0l0 0.875l-2.71875 0l0 7.6875zm2.8417969 -6.421875l1.046875 0l1.40625 3.65625q0.453125 1.265625 0.5625 1.8125l0.046875 0q0.078125 -0.296875 0.3125 -1.015625q0.25 -0.734375 1.609375 -4.453125l1.03125 0l-2.75 7.3125q-0.421875 1.078125 -0.96875 1.53125q-0.546875 0.46875 -1.34375 0.46875q-0.4375 0 -0.875 -0.109375l0 -0.78125q0.328125 0.078125 0.71875 0.078125q1.0 0 1.4375 -1.125l0.359375 -0.921875l-2.59375 -6.453125zm10.046875 6.546875q-0.625 0 -1.140625 -0.234375q-0.515625 -0.234375 -0.875 -0.71875l-0.0625 0q0.0625 0.5625 0.0625 1.0625l0 2.65625l-0.96875 0l0 -9.3125l0.796875 0l0.125 0.875l0.046875 0q0.375 -0.53125 0.875 -0.765625q0.5 -0.234375 1.140625 -0.234375q1.28125 0 1.96875 0.875q0.703125 0.875 0.703125 2.453125q0 1.578125 -0.703125 2.46875q-0.703125 0.875 -1.96875 0.875zm-0.140625 -5.84375q-0.984375 0 -1.421875 0.546875q-0.4375 0.546875 -0.453125 1.734375l0 0.21875q0 1.359375 0.453125 1.9375q0.453125 0.578125 1.453125 0.578125q0.828125 0 1.296875 -0.671875q0.46875 -0.671875 0.46875 -1.859375q0 -1.203125 -0.46875 -1.84375q-0.46875 -0.640625 -1.328125 -0.640625zm7.2285156 5.84375q-1.421875 0 -2.25 -0.875q-0.828125 -0.875 -0.828125 -2.40625q0 -1.5625 0.765625 -2.46875q0.765625 -0.921875 2.0625 -0.921875q1.203125 0 1.90625 0.796875q0.703125 0.796875 0.703125 2.09375l0 0.625l-4.421875 0q0.03125 1.125 0.5625 1.71875q0.546875 0.578125 1.53125 0.578125q1.03125 0 2.046875 -0.4375l0 0.875q-0.515625 0.21875 -0.984375 0.3125q-0.453125 0.109375 -1.09375 0.109375zm-0.265625 -5.84375q-0.78125 0 -1.25 0.5q-0.453125 0.5 -0.53125 1.390625l3.359375 0q0 -0.921875 -0.40625 -1.40625q-0.40625 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#f4cccc" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m544.40155 135.68242l100.0 0l0 31.748032l-100.0 0z" fill-rule="evenodd"/><path fill="#000000" d="m554.5734 156.20241l0 -8.5625l1.0 0l0 8.5625l-1.0 0zm7.595703 0l0 -4.15625q0 -0.78125 -0.359375 -1.171875q-0.34375 -0.390625 -1.109375 -0.390625q-1.015625 0 -1.484375 0.546875q-0.46875 0.546875 -0.46875 1.796875l0 3.375l-0.96875 0l0 -6.421875l0.796875 0l0.15625 0.875l0.046875 0q0.296875 -0.46875 0.828125 -0.734375q0.546875 -0.265625 1.203125 -0.265625q1.171875 0 1.75 0.5625q0.59375 0.5625 0.59375 1.796875l0 4.1875l-0.984375 0zm5.8652344 -5.671875l-1.625 0l0 5.671875l-0.984375 0l0 -5.671875l-1.140625 0l0 -0.4375l1.140625 -0.34375l0 -0.359375q0 -2.375 2.078125 -2.375q0.5 0 1.1875 0.203125l-0.25 0.78125q-0.5625 -0.171875 -0.953125 -0.171875q-0.5625 0 -0.828125 0.375q-0.25 0.359375 -0.25 1.15625l0 0.421875l1.625 0l0 0.75zm4.1132812 -0.875q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm5.9140625 6.546875l-0.203125 -0.921875l-0.046875 0q-0.46875 0.609375 -0.953125 0.828125q-0.46875 0.21875 -1.1875 0.21875q-0.953125 0 -1.5 -0.5q-0.546875 -0.5 -0.546875 -1.40625q0 -1.9375 3.109375 -2.03125l1.09375 -0.03125l0 -0.40625q0 -0.75 -0.328125 -1.109375q-0.3125 -0.359375 -1.03125 -0.359375q-0.8125 0 -1.8125 0.484375l-0.3125 -0.75q0.484375 -0.25 1.046875 -0.390625q0.5625 -0.15625 1.140625 -0.15625q1.140625 0 1.6875 0.515625q0.5625 0.5 0.5625 1.625l0 4.390625l-0.71875 0zm-2.203125 -0.6875q0.90625 0 1.421875 -0.5q0.53125 -0.5 0.53125 -1.390625l0 -0.578125l-0.984375 0.03125q-1.15625 0.046875 -1.671875 0.375q-0.5 0.3125 -0.5 0.984375q0 0.53125 0.3125 0.8125q0.3125 0.265625 0.890625 0.265625zm9.064453 -1.0625q0 0.890625 -0.671875 1.390625q-0.65625 0.484375 -1.875 0.484375q-1.265625 0 -1.984375 -0.40625l0 -0.90625q0.46875 0.234375 0.984375 0.375q0.53125 0.125 1.03125 0.125q0.765625 0 1.171875 -0.234375q0.40625 -0.25 0.40625 -0.75q0 -0.375 -0.328125 -0.640625q-0.3125 -0.265625 -1.265625 -0.625q-0.890625 -0.34375 -1.28125 -0.59375q-0.375 -0.25 -0.5625 -0.5625q-0.171875 -0.3125 -0.171875 -0.75q0 -0.78125 0.640625 -1.234375q0.640625 -0.46875 1.75 -0.46875q1.03125 0 2.03125 0.421875l-0.359375 0.796875q-0.953125 -0.390625 -1.75 -0.390625q-0.6875 0 -1.046875 0.21875q-0.34375 0.203125 -0.34375 0.59375q0 0.25 0.125 0.4375q0.140625 0.171875 0.421875 0.34375q0.296875 0.15625 1.140625 0.46875q1.140625 0.421875 1.53125 0.84375q0.40625 0.421875 0.40625 1.0625zm3.6621094 1.0625q0.25 0 0.484375 -0.03125q0.25 -0.046875 0.390625 -0.078125l0 0.734375q-0.15625 0.078125 -0.46875 0.125q-0.296875 0.0625 -0.546875 0.0625q-1.859375 0 -1.859375 -1.96875l0 -3.828125l-0.921875 0l0 -0.46875l0.921875 -0.40625l0.40625 -1.359375l0.5625 0l0 1.484375l1.859375 0l0 0.75l-1.859375 0l0 3.78125q0 0.578125 0.265625 0.890625q0.28125 0.3125 0.765625 0.3125zm5.095703 -5.859375q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm2.8828125 0.125l0 4.171875q0 0.78125 0.34375 1.171875q0.359375 0.375 1.125 0.375q1.015625 0 1.46875 -0.546875q0.46875 -0.546875 0.46875 -1.796875l0 -3.375l0.96875 0l0 6.421875l-0.796875 0l-0.140625 -0.859375l-0.046875 0q-0.296875 0.46875 -0.828125 0.734375q-0.53125 0.25 -1.21875 0.25q-1.171875 0 -1.75 -0.5625q-0.578125 -0.5625 -0.578125 -1.78125l0 -4.203125l0.984375 0zm9.005859 6.546875q-1.390625 0 -2.15625 -0.859375q-0.765625 -0.859375 -0.765625 -2.4375q0 -1.609375 0.78125 -2.484375q0.78125 -0.890625 2.203125 -0.890625q0.46875 0 0.921875 0.109375q0.46875 0.09375 0.734375 0.234375l-0.296875 0.828125q-0.328125 -0.140625 -0.703125 -0.21875q-0.375 -0.078125 -0.671875 -0.078125q-1.953125 0 -1.953125 2.484375q0 1.1875 0.46875 1.828125q0.484375 0.625 1.421875 0.625q0.796875 0 1.640625 -0.34375l0 0.859375q-0.640625 0.34375 -1.625 0.34375zm5.2285156 -0.8125q0.25 0 0.484375 -0.03125q0.25 -0.046875 0.390625 -0.078125l0 0.734375q-0.15625 0.078125 -0.46875 0.125q-0.296875 0.0625 -0.546875 0.0625q-1.859375 0 -1.859375 -1.96875l0 -3.828125l-0.921875 0l0 -0.46875l0.921875 -0.40625l0.40625 -1.359375l0.5625 0l0 1.484375l1.859375 0l0 0.75l-1.859375 0l0 3.78125q0 0.578125 0.265625 0.890625q0.28125 0.3125 0.765625 0.3125zm3.0800781 -5.734375l0 4.171875q0 0.78125 0.34375 1.171875q0.359375 0.375 1.125 0.375q1.015625 0 1.46875 -0.546875q0.46875 -0.546875 0.46875 -1.796875l0 -3.375l0.96875 0l0 6.421875l-0.796875 0l-0.140625 -0.859375l-0.046875 0q-0.296875 0.46875 -0.828125 0.734375q-0.53125 0.25 -1.21875 0.25q-1.171875 0 -1.75 -0.5625q-0.578125 -0.5625 -0.578125 -1.78125l0 -4.203125l0.984375 0zm9.380859 -0.125q0.421875 0 0.765625 0.078125l-0.140625 0.90625q-0.390625 -0.09375 -0.703125 -0.09375q-0.78125 0 -1.34375 0.640625q-0.546875 0.625 -0.546875 1.5625l0 3.453125l-0.96875 0l0 -6.421875l0.796875 0l0.125 1.1875l0.046875 0q0.34375 -0.625 0.84375 -0.96875q0.515625 -0.34375 1.125 -0.34375zm4.6796875 6.671875q-1.421875 0 -2.25 -0.875q-0.828125 -0.875 -0.828125 -2.40625q0 -1.5625 0.765625 -2.46875q0.765625 -0.921875 2.0625 -0.921875q1.203125 0 1.90625 0.796875q0.703125 0.796875 0.703125 2.09375l0 0.625l-4.421875 0q0.03125 1.125 0.5625 1.71875q0.546875 0.578125 1.53125 0.578125q1.03125 0 2.046875 -0.4375l0 0.875q-0.515625 0.21875 -0.984375 0.3125q-0.453125 0.109375 -1.09375 0.109375zm-0.265625 -5.84375q-0.78125 0 -1.25 0.5q-0.453125 0.5 -0.53125 1.390625l3.359375 0q0 -0.921875 -0.40625 -1.40625q-0.40625 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#d9ead3" d="m31.874912 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m31.874912 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m64.2166 265.0834l0.984375 0l0 0.078125q-0.0625 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.671875l2.9375 0l0 -3.1875l0.984375 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 6.953125l-0.890625 0l0 -3.5625l-2.921875 0l0 3.5625l-0.875 0l0 -7.46875zm6.046921 0l1.75 0q0.921875 0 1.453125 0.25q0.546875 0.234375 0.9375 0.765625q0.734375 0.984375 0.734375 2.75q-0.0625 1.8125 -0.84375 2.78125q-0.765625 0.953125 -2.421875 0.9375l-1.609375 0l0 -7.484375zm1.5625 6.828125q2.484375 0 2.484375 -3.0q-0.015625 -1.53125 -0.578125 -2.328125q-0.546875 -0.796875 -1.765625 -0.796875l-0.90625 0l0 6.125l0.765625 0zm4.734421 0.640625l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm8.140671 -4.859375q0.65625 0 1.15625 0.3125q0.515625 0.296875 0.8125 0.875q0.296875 0.5625 0.296875 1.3125q0 0.765625 -0.3125 1.328125q-0.296875 0.5625 -0.84375 0.859375q-0.53125 0.296875 -1.203125 0.296875q-0.6875 0 -1.265625 -0.296875q-0.578125 -0.296875 -0.953125 -0.84375l0.671875 -0.515625l0.015625 0q0.015625 0 0.015625 0q0 0 0 0q0.3125 0.484375 0.65625 0.71875q0.34375 0.21875 0.90625 0.21875q0.390625 0 0.71875 -0.21875q0.34375 -0.234375 0.53125 -0.625q0.203125 -0.40625 0.203125 -0.953125q0 -0.8125 -0.4375 -1.28125q-0.4375 -0.484375 -1.09375 -0.484375q-0.390625 0 -0.765625 0.1875q-0.359375 0.171875 -0.640625 0.515625l-0.53125 -0.21875l0.25 -3.796875l3.796875 0l0 0.75l-3.078125 0l-0.125 2.140625q0.59375 -0.28125 1.21875 -0.28125zm-1.625 3.328125q-0.0625 -0.09375 0.015625 -0.015625l-0.015625 0.015625zm0.125 0q0 0.0625 -0.109375 -0.015625l0.0625 -0.046875l0.046875 0.0625z" fill-rule="nonzero"/><path fill="#d9ead3" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m215.40347 151.18938l0 2.9375q-1.0625 0.78125 -2.1875 0.78125q-0.890625 0 -1.546875 -0.46875q-0.65625 -0.46875 -1.0 -1.3125q-0.34375 -0.859375 -0.34375 -1.984375q-0.015625 -1.5 0.421875 -2.359375q0.453125 -0.875 1.09375 -1.21875q0.65625 -0.34375 1.3125 -0.34375q0.671875 0 1.296875 0.359375q0.625 0.34375 0.9375 0.90625l-0.59375 0.46875l-0.015625 0.015625q-0.40625 -0.546875 -0.75 -0.765625q-0.34375 -0.234375 -0.859375 -0.234375q-0.90625 0 -1.46875 0.703125q-0.546875 0.6875 -0.546875 2.28125q0 1.53125 0.546875 2.375q0.546875 0.828125 1.515625 0.828125q0.359375 0 0.75 -0.109375q0.390625 -0.125 0.65625 -0.34375l0 -1.78125l-1.34375 0l0 -0.734375l2.125 0zm-0.75 -2.21875q0 -0.078125 0.109375 0l-0.0625 0.0625l-0.046875 -0.0625zm0.125 0q0.0625 0.078125 -0.015625 0l0.015625 0zm6.218796 1.46875l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm3.9375458 -1.3125q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm4.109421 -4.953125l0.875 0l0 1.078125q0.1875 -0.59375 0.625 -0.890625q0.4375 -0.3125 1.046875 -0.3125q0.625 0 1.140625 0.328125q0.53125 0.3125 0.84375 0.953125q0.3125 0.625 0.3125 1.546875q0 0.921875 -0.328125 1.59375q-0.328125 0.65625 -0.859375 1.0q-0.53125 0.328125 -1.140625 0.328125q-0.484375 0 -0.921875 -0.21875q-0.421875 -0.234375 -0.703125 -0.640625l0 2.71875l-0.890625 0l0 -7.484375zm2.375 4.859375q0.65625 0 1.109375 -0.5q0.453125 -0.5 0.453125 -1.625q0 -1.015625 -0.40625 -1.5625q-0.390625 -0.546875 -1.125 -0.546875q-0.671875 0 -1.109375 0.578125q-0.421875 0.5625 -0.421875 1.71875q0.03125 0.953125 0.421875 1.453125q0.390625 0.484375 1.078125 0.484375zm3.7812958 -7.359375l0.953125 0l0 0.09375q-0.078125 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.9375q0.328125 -0.5 0.8125 -0.796875q0.484375 -0.296875 0.984375 -0.296875q0.78125 0 1.21875 0.546875q0.453125 0.546875 0.453125 1.734375l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.390625 0 -0.765625 0.21875q-0.375 0.203125 -0.625 0.59375q-0.234375 0.390625 -0.234375 0.890625l0 3.15625l-0.84375 0l0 -7.984375zm5.765671 0.515625l1.75 0q0.921875 0 1.453125 0.25q0.546875 0.234375 0.9375 0.765625q0.734375 0.984375 0.734375 2.75q-0.0625 1.8125 -0.84375 2.78125q-0.765625 0.953125 -2.421875 0.9375l-1.609375 0l0 -7.484375zm1.5625 6.828125q2.484375 0 2.484375 -3.0q-0.015625 -1.53125 -0.578125 -2.328125q-0.546875 -0.796875 -1.765625 -0.796875l-0.90625 0l0 6.125l0.765625 0zm7.062546 0.765625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.750061 -3.375l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65626526 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125153 0l0 0.6875l-1.8125153 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53126526 0 0.98439026 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m259.7939 265.55215l0 -7.484375l4.3125 0l0 0.734375l-3.46875 0l0 2.34375l2.796875 0l0 0.734375l-2.796875 0l0 3.671875l-0.84375 0zm10.1719055 -4.34375l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm4.015686 4.390625q-0.734375 0 -1.3125 -0.359375q-0.578125 -0.359375 -0.90625 -1.0q-0.3125 -0.65625 -0.3125 -1.484375q0 -0.828125 0.3125 -1.46875q0.328125 -0.65625 0.90625 -1.015625q0.578125 -0.375 1.3125 -0.375q0.734375 0 1.3125 0.375q0.578125 0.359375 0.890625 1.015625q0.328125 0.640625 0.328125 1.46875q0 0.828125 -0.328125 1.484375q-0.3125 0.640625 -0.890625 1.0q-0.578125 0.359375 -1.3125 0.359375zm0 -0.71875q0.46875 0 0.828125 -0.265625q0.375 -0.28125 0.578125 -0.765625q0.21875 -0.484375 0.21875 -1.109375q0 -0.9375 -0.46875 -1.53125q-0.453125 -0.59375 -1.15625 -0.59375q-0.703125 0 -1.171875 0.59375q-0.453125 0.59375 -0.453125 1.53125q0 0.625 0.203125 1.109375q0.21875 0.484375 0.578125 0.765625q0.375 0.265625 0.84375 0.265625zm3.5937805 0.03125l3.40625 -4.140625l-3.140625 0l0 -0.75l4.25 0l0 0.59375l-3.328125 4.140625l3.09375 0q0.1875 0 0.28125 -0.015625q0.09375 -0.03125 0.15625 -0.09375l0.09375 0l0 0.859375l-4.8125 0l0 -0.59375zm8.671936 0.71875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm2.3906555 -2.203125l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375z" fill-rule="nonzero"/><path fill="#000000" d="m258.37198 275.9584l0 2.9375q-1.0625 0.78125 -2.1875 0.78125q-0.890625 0 -1.546875 -0.46875q-0.65625 -0.46875 -1.0 -1.3125q-0.34375 -0.859375 -0.34375 -1.984375q-0.015625 -1.5 0.421875 -2.359375q0.453125 -0.875 1.09375 -1.21875q0.65625 -0.34375 1.3125 -0.34375q0.671875 0 1.296875 0.359375q0.625 0.34375 0.9375 0.90625l-0.59375 0.46875l-0.015625 0.015625q-0.40625 -0.546875 -0.75 -0.765625q-0.34375 -0.234375 -0.859375 -0.234375q-0.90625 0 -1.46875 0.703125q-0.546875 0.6875 -0.546875 2.28125q0 1.53125 0.546875 2.375q0.546875 0.828125 1.515625 0.828125q0.359375 0 0.75 -0.109375q0.390625 -0.125 0.65625 -0.34375l0 -1.78125l-1.34375 0l0 -0.734375l2.125 0zm-0.75 -2.21875q0 -0.078125 0.109375 0l-0.0625 0.0625l-0.046875 -0.0625zm0.125 0q0.0625 0.078125 -0.015625 0l0.015625 0zm6.218811 1.46875l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm3.9375305 -1.3125q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm4.109436 -4.953125l0.875 0l0 1.078125q0.1875 -0.59375 0.625 -0.890625q0.4375 -0.3125 1.046875 -0.3125q0.625 0 1.140625 0.328125q0.53125 0.3125 0.84375 0.953125q0.3125 0.625 0.3125 1.546875q0 0.921875 -0.328125 1.59375q-0.328125 0.65625 -0.859375 1.0q-0.53125 0.328125 -1.140625 0.328125q-0.484375 0 -0.921875 -0.21875q-0.421875 -0.234375 -0.703125 -0.640625l0 2.71875l-0.890625 0l0 -7.484375zm2.375 4.859375q0.65625 0 1.109375 -0.5q0.453125 -0.5 0.453125 -1.625q0 -1.015625 -0.40625 -1.5625q-0.390625 -0.546875 -1.125 -0.546875q-0.671875 0 -1.109375 0.578125q-0.421875 0.5625 -0.421875 1.71875q0.03125 0.953125 0.421875 1.453125q0.390625 0.484375 1.078125 0.484375zm3.7812805 -7.359375l0.953125 0l0 0.09375q-0.078125 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.9375q0.328125 -0.5 0.8125 -0.796875q0.484375 -0.296875 0.984375 -0.296875q0.78125 0 1.21875 0.546875q0.453125 0.546875 0.453125 1.734375l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.390625 0 -0.765625 0.21875q-0.375 0.203125 -0.625 0.59375q-0.234375 0.390625 -0.234375 0.890625l0 3.15625l-0.84375 0l0 -7.984375zm5.765686 0.515625l1.75 0q0.921875 0 1.453125 0.25q0.546875 0.234375 0.9375 0.765625q0.734375 0.984375 0.734375 2.75q-0.0625 1.8125 -0.84375 2.78125q-0.765625 0.953125 -2.421875 0.9375l-1.609375 0l0 -7.484375zm1.5625 6.828125q2.484375 0 2.484375 -3.0q-0.015625 -1.53125 -0.578125 -2.328125q-0.546875 -0.796875 -1.765625 -0.796875l-0.90625 0l0 6.125l0.765625 0zm7.0625305 0.765625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.750061 -3.375l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 232.16667l0 20.377945" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 232.16667l0 16.950867" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.85565 249.11754l-1.1246033 -1.124588l1.1246033 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#f4cccc" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m56.94999 87.87236q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm5.890671 -6.15625l-0.015625 0.015625q-0.328125 -0.421875 -0.5625 -0.5625q-0.234375 -0.15625 -0.65625 -0.15625q-0.5625 0 -0.890625 0.34375q-0.328125 0.328125 -0.328125 1.078125l0 0.453125l1.8125 0l0 0.6875l-1.8125 0l0 4.796875l-0.828125 0l0 -4.796875l-1.1875 0l0 -0.6875l1.1875 0l0 -0.453125q0 -1.0625 0.546875 -1.578125q0.546875 -0.53125 1.46875 -0.53125q0.53125 0 0.984375 0.1875q0.453125 0.1875 0.734375 0.5625l-0.453125 0.640625zm-0.140625 0.03125q0 -0.03125 0.046875 0q0.046875 0.015625 0.0625 0.015625l-0.046875 0.046875l-0.0625 -0.046875l0 -0.015625zm0.125 -0.015625q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm3.6562958 6.765625q-0.296875 0 -0.5 -0.203125q-0.203125 -0.203125 -0.203125 -0.46875q0 -0.28125 0.203125 -0.484375q0.203125 -0.203125 0.5 -0.203125q0.265625 0 0.46875 0.203125q0.21875 0.203125 0.21875 0.484375q0 0.265625 -0.21875 0.46875q-0.203125 0.203125 -0.46875 0.203125zm5.125046 -3.875l-0.578125 0.65625l0 3.09375l-0.90625 0l0 -7.46875l1.015625 0l0 0.078125q-0.078125 0.078125 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.921875l3.125 -3.5q0.296875 0.0625 0.609375 0.0625l0.3125 0l-2.828125 3.21875l3.03125 4.25l-1.078125 0.046875l-2.59375 -3.796875zm7.281296 3.875q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm6.640671 -1.0625l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625zm3.9375458 -1.3125q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm6.781296 -2.703125q1.03125 0.3125 1.453125 0.6875q0.4375 0.359375 0.4375 0.953125q0 0.734375 -0.59375 1.234375q-0.578125 0.484375 -1.671875 0.484375q-1.390625 0 -2.328125 -0.875l0.46875 -0.8125l0.015625 -0.015625l0.015625 0.015625q0.375 0.484375 0.765625 0.734375q0.40625 0.234375 1.078125 0.234375q0.65625 0 1.015625 -0.234375q0.375 -0.234375 0.375 -0.640625q0 -0.359375 -0.296875 -0.578125q-0.296875 -0.234375 -1.078125 -0.484375q-2.0625 -0.59375 -2.0625 -1.703125q0 -0.640625 0.515625 -1.0q0.53125 -0.375 1.5 -0.375q0.75 0 1.25 0.203125q0.515625 0.203125 0.9375 0.65625l-0.5 0.59375l0 0.015625q-0.265625 -0.390625 -0.734375 -0.609375q-0.453125 -0.21875 -0.921875 -0.21875q-0.515625 0 -0.859375 0.1875q-0.328125 0.171875 -0.328125 0.5q0 0.296875 0.328125 0.546875q0.34375 0.25 1.21875 0.5zm1.15625 -0.875q0 -0.0625 0.09375 0l-0.03125 0.046875l-0.0625 -0.046875zm0.140625 -0.03125q0.03125 0.046875 0.015625 0.0625q0 0.015625 -0.03125 0q-0.015625 -0.015625 -0.03125 -0.03125l0.046875 -0.03125zm-3.375 2.53125q0 0.046875 -0.109375 0l0.03125 -0.0625l0.078125 0.046875l0 0.015625zm-0.140625 0.03125q-0.03125 -0.046875 -0.03125 -0.046875q0.015625 0 0.0625 0.015625l-0.03125 0.03125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m132.49081 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m132.49081 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m149.94176 88.37367l0 -7.484375l4.59375 0l0 0.734375l-3.796875 0l0 2.46875l3.125 0l0 0.765625l-3.125 0l0 2.765625l3.75 0l0 0.75l-4.546875 0zm8.687546 -3.234375q1.03125 0.3125 1.453125 0.6875q0.4375 0.359375 0.4375 0.953125q0 0.734375 -0.59375 1.234375q-0.578125 0.484375 -1.671875 0.484375q-1.390625 0 -2.328125 -0.875l0.46875 -0.8125l0.015625 -0.015625l0.015625 0.015625q0.375 0.484375 0.765625 0.734375q0.40625 0.234375 1.078125 0.234375q0.65625 0 1.015625 -0.234375q0.375 -0.234375 0.375 -0.640625q0 -0.359375 -0.296875 -0.578125q-0.296875 -0.234375 -1.078125 -0.484375q-2.0625 -0.59375 -2.0625 -1.703125q0 -0.640625 0.515625 -1.0q0.53125 -0.375 1.5 -0.375q0.75 0 1.25 0.203125q0.515625 0.203125 0.9375 0.65625l-0.5 0.59375l0 0.015625q-0.265625 -0.390625 -0.734375 -0.609375q-0.453125 -0.21875 -0.921875 -0.21875q-0.515625 0 -0.859375 0.1875q-0.328125 0.171875 -0.328125 0.5q0 0.296875 0.328125 0.546875q0.34375 0.25 1.21875 0.5zm1.15625 -0.875q0 -0.0625 0.09375 0l-0.03125 0.046875l-0.0625 -0.046875zm0.140625 -0.03125q0.03125 0.046875 0.015625 0.0625q0 0.015625 -0.03125 0q-0.015625 -0.015625 -0.03125 -0.03125l0.046875 -0.03125zm-3.375 2.53125q0 0.046875 -0.109375 0l0.03125 -0.0625l0.078125 0.046875l0 0.015625zm-0.140625 0.03125q-0.03125 -0.046875 -0.03125 -0.046875q0.015625 0 0.0625 0.015625l-0.03125 0.03125zm10.156296 1.078125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm1.8750458 0.5l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm3.4687958 1.15625l0.796875 0l0 0.546875q0.1875 -0.3125 0.484375 -0.484375q0.3125 -0.1875 0.625 -0.1875q0.359375 0 0.625 0.234375q0.28125 0.21875 0.359375 0.5625q0.140625 -0.359375 0.46875 -0.578125q0.34375 -0.21875 0.765625 -0.21875q0.53125 0 0.796875 0.390625q0.28125 0.375 0.25 1.0l0 4.21875l-0.78125 0l0 -3.890625q0 -0.6875 -0.140625 -0.890625q-0.125 -0.203125 -0.390625 -0.203125q-0.203125 0 -0.40625 0.203125q-0.203125 0.203125 -0.34375 0.53125q-0.125 0.3125 -0.125 0.625l0 3.625l-0.796875 0l0 -3.8125q0 -0.671875 -0.125 -0.90625q-0.125 -0.234375 -0.46875 -0.234375q-0.1875 0 -0.375 0.171875q-0.1875 0.15625 -0.3125 0.453125q-0.109375 0.28125 -0.109375 0.671875l0 3.65625l-0.796875 0l0 -5.484375zm8.468796 -0.125q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm8.718796 0.03125q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm3.6719208 0.59375q-0.734375 0 -1.3125 -0.359375q-0.578125 -0.359375 -0.90625 -1.0q-0.3125 -0.65625 -0.3125 -1.484375q0 -0.828125 0.3125 -1.46875q0.328125 -0.65625 0.90625 -1.015625q0.578125 -0.375 1.3125 -0.375q0.734375 0 1.3125 0.375q0.578125 0.359375 0.890625 1.015625q0.328125 0.640625 0.328125 1.46875q0 0.828125 -0.328125 1.484375q-0.3125 0.640625 -0.890625 1.0q-0.578125 0.359375 -1.3125 0.359375zm0 -0.71875q0.46875 0 0.828125 -0.265625q0.375 -0.28125 0.578125 -0.765625q0.21875 -0.484375 0.21875 -1.109375q0 -0.9375 -0.46875 -1.53125q-0.453125 -0.59375 -1.15625 -0.59375q-0.703125 0 -1.171875 0.59375q-0.453125 0.59375 -0.453125 1.53125q0 0.625 0.203125 1.109375q0.21875 0.484375 0.578125 0.765625q0.375 0.265625 0.84375 0.265625zm8.109421 -3.71875l0 0.015625q-0.34375 -0.328125 -0.578125 -0.4375q-0.234375 -0.109375 -0.546875 -0.109375q-0.4375 0 -0.828125 0.21875q-0.375 0.21875 -0.625 0.65625q-0.234375 0.4375 -0.234375 1.09375l0 2.921875l-0.875 0l0 -5.5l0.90625 0l-0.03125 1.015625q0.234375 -0.546875 0.703125 -0.84375q0.484375 -0.296875 1.046875 -0.296875q0.875 0 1.453125 0.59375l-0.390625 0.671875zm0 0.015625q0.09375 0.0625 0.046875 0.0625q-0.046875 -0.015625 -0.0625 -0.03125l0.015625 -0.03125zm-0.125 0.03125q0 -0.03125 0.03125 -0.015625q0.03125 0 0.078125 0.015625l-0.03125 0.0625l-0.078125 -0.046875l0 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#666666" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#999999" d="m259.68378 88.37367l1.8125 -8.5625l1.0 0l-1.625 7.65625l3.3125 0l-0.1875 0.90625l-4.3125 0zm8.080078 0.125q-1.078125 0 -1.703125 -0.640625q-0.609375 -0.640625 -0.609375 -1.78125q0 -1.09375 0.4375 -2.109375q0.4375 -1.015625 1.15625 -1.578125q0.71875 -0.5625 1.578125 -0.5625q0.90625 0 1.359375 0.390625q0.453125 0.390625 0.453125 1.09375q0 1.046875 -0.984375 1.65625q-0.96875 0.59375 -2.78125 0.59375l-0.1875 0l-0.03125 0.46875q0 0.765625 0.359375 1.203125q0.359375 0.4375 1.125 0.4375q0.359375 0 0.75 -0.109375q0.390625 -0.109375 0.96875 -0.390625l0 0.859375q-0.546875 0.25 -0.96875 0.359375q-0.421875 0.109375 -0.921875 0.109375zm0.8125 -5.828125q-0.609375 0 -1.140625 0.5625q-0.53125 0.546875 -0.8125 1.515625l0.078125 0q1.328125 0 2.03125 -0.34375q0.71875 -0.34375 0.71875 -1.015625q0 -0.3125 -0.21875 -0.515625q-0.203125 -0.203125 -0.65625 -0.203125zm8.667969 -0.71875l-0.15625 0.625l-1.234375 0.140625q0.234375 0.359375 0.234375 0.921875q0 1.125 -0.6875 1.796875q-0.671875 0.65625 -1.8125 0.65625q-0.328125 0 -0.5 -0.046875q-0.8125 0.3125 -0.8125 0.765625q0 0.25 0.1875 0.328125q0.203125 0.078125 0.578125 0.125l0.6875 0.078125q1.046875 0.125 1.53125 0.515625q0.484375 0.390625 0.484375 1.140625q0 1.078125 -0.859375 1.671875q-0.859375 0.59375 -2.40625 0.59375q-1.140625 0 -1.78125 -0.4375q-0.65625 -0.4375 -0.65625 -1.21875q0 -0.609375 0.421875 -1.0625q0.421875 -0.453125 1.375 -0.765625q-0.453125 -0.25 -0.453125 -0.71875q0 -0.40625 0.296875 -0.6875q0.296875 -0.296875 0.828125 -0.546875q-0.375 -0.1875 -0.609375 -0.546875q-0.234375 -0.375 -0.234375 -0.859375q0 -1.140625 0.703125 -1.859375q0.703125 -0.734375 1.796875 -0.734375q0.453125 0 0.890625 0.125l2.1875 0zm-6.25 7.5625q0 0.453125 0.375 0.71875q0.390625 0.265625 1.140625 0.265625q1.0625 0 1.65625 -0.375q0.609375 -0.375 0.609375 -1.046875q0 -0.375 -0.3125 -0.59375q-0.3125 -0.203125 -1.078125 -0.28125l-0.9375 -0.09375q-0.703125 0.140625 -1.078125 0.515625q-0.375 0.375 -0.375 0.890625zm1.625 -5.125q0 0.5 0.265625 0.765625q0.265625 0.25 0.734375 0.25q0.46875 0 0.8125 -0.234375q0.34375 -0.25 0.53125 -0.671875q0.1875 -0.4375 0.1875 -0.953125q0 -0.46875 -0.265625 -0.71875q-0.25 -0.265625 -0.734375 -0.265625q-0.453125 0 -0.796875 0.25q-0.34375 0.234375 -0.546875 0.65625q-0.1875 0.421875 -0.1875 0.921875zm7.9335938 -2.5625q0.53125 0 0.953125 0.296875q0.421875 0.296875 0.65625 0.828125l0.0625 0l0.390625 -1.0l0.75 0l-1.359375 6.421875l-0.78125 0l0.15625 -1.21875l-0.046875 0q-1.0625 1.34375 -2.21875 1.34375q-0.8125 0 -1.28125 -0.578125q-0.453125 -0.59375 -0.453125 -1.59375q0 -1.21875 0.40625 -2.265625q0.421875 -1.046875 1.15625 -1.640625q0.734375 -0.59375 1.609375 -0.59375zm-1.125 5.84375q0.578125 0 1.140625 -0.53125q0.5625 -0.546875 0.90625 -1.40625q0.359375 -0.875 0.359375 -1.75q0 -0.609375 -0.328125 -0.96875q-0.328125 -0.359375 -0.859375 -0.359375q-0.609375 0 -1.140625 0.515625q-0.515625 0.5 -0.828125 1.359375q-0.296875 0.859375 -0.296875 1.8125q0 0.671875 0.28125 1.0q0.28125 0.328125 0.765625 0.328125zm7.123047 0.828125q-1.140625 0 -1.765625 -0.625q-0.625 -0.640625 -0.625 -1.78125q0 -1.171875 0.421875 -2.15625q0.4375 -1.0 1.203125 -1.546875q0.765625 -0.5625 1.71875 -0.5625q0.8125 0 1.578125 0.3125l-0.28125 0.8125q-0.703125 -0.296875 -1.28125 -0.296875q-0.65625 0 -1.203125 0.453125q-0.53125 0.453125 -0.84375 1.25q-0.3125 0.796875 -0.3125 1.734375q0 0.75 0.390625 1.171875q0.390625 0.40625 1.078125 0.40625q0.421875 0 0.796875 -0.109375q0.375 -0.125 0.734375 -0.28125l0 0.84375q-0.71875 0.375 -1.609375 0.375zm3.0117188 -6.546875l0.984375 0l0.4375 3.1875q0.046875 0.40625 0.09375 1.203125q0.0625 0.78125 0.0625 1.265625l0.046875 0q0.203125 -0.515625 0.5 -1.171875q0.3125 -0.671875 0.453125 -0.921875l1.90625 -3.5625l1.046875 0l-4.078125 7.515625q-0.546875 1.0 -1.078125 1.390625q-0.53125 0.40625 -1.28125 0.40625q-0.421875 0 -0.828125 -0.125l0 -0.796875q0.375 0.109375 0.765625 0.109375q0.484375 0 0.828125 -0.296875q0.34375 -0.296875 0.671875 -0.875l0.4375 -0.796875l-0.96875 -6.53125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m296.64194 154.18938q0.4375 0 0.84375 -0.25q0.40625 -0.25 0.65625 -0.671875l0.625 0.40625q-0.375 0.625 -0.875 0.9375q-0.5 0.296875 -1.21875 0.296875q-0.84375 0 -1.5 -0.40625q-0.65625 -0.421875 -1.046875 -1.265625q-0.390625 -0.859375 -0.390625 -2.15625q0 -1.375 0.421875 -2.234375q0.421875 -0.859375 1.0625 -1.21875q0.65625 -0.375 1.40625 -0.375q0.78125 0 1.359375 0.390625q0.59375 0.390625 0.890625 1.078125l-0.71875 0.34375q-0.015625 0 -0.015625 0q0 -0.015625 0 -0.015625q-0.3125 -0.625 -0.703125 -0.875q-0.375 -0.25 -0.84375 -0.25q-0.9375 0 -1.484375 0.828125q-0.546875 0.8125 -0.546875 2.28125q0 0.921875 0.265625 1.640625q0.28125 0.71875 0.75 1.125q0.484375 0.390625 1.0625 0.390625zm1.375 -5.171875q0.015625 -0.015625 0.015625 -0.015625q0.03125 0 0.109375 0.0625l-0.09375 0.046875l-0.03125 -0.09375zm0.140625 0.046875q0.046875 0.109375 -0.015625 0l0.015625 0zm1.9687805 -2.265625l0.953125 0l0 0.09375q-0.078125 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 2.9375q0.328125 -0.5 0.8125 -0.796875q0.484375 -0.296875 0.984375 -0.296875q0.78125 0 1.21875 0.546875q0.453125 0.546875 0.453125 1.734375l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.390625 0 -0.765625 0.21875q-0.375 0.203125 -0.625 0.59375q-0.234375 0.390625 -0.234375 0.890625l0 3.15625l-0.84375 0l0 -7.984375zm8.390686 8.109375q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm5.0000305 2.640625q0.828125 0 1.421875 -0.671875l0.515625 0.578125q-0.8125 0.859375 -1.984375 0.859375q-0.796875 0 -1.421875 -0.359375q-0.625 -0.375 -0.984375 -1.03125q-0.34375 -0.65625 -0.34375 -1.46875q0 -0.8125 0.34375 -1.453125q0.359375 -0.65625 0.984375 -1.03125q0.625 -0.375 1.40625 -0.375q0.65625 0 1.1875 0.28125q0.546875 0.265625 0.890625 0.734375l-0.546875 0.53125l0 0.015625q-0.359375 -0.453125 -0.71875 -0.640625q-0.359375 -0.1875 -0.90625 -0.1875q-0.46875 0 -0.875 0.265625q-0.390625 0.25 -0.640625 0.71875q-0.234375 0.46875 -0.234375 1.078125q0 0.609375 0.234375 1.109375q0.25 0.484375 0.6875 0.765625q0.4375 0.28125 0.984375 0.28125zm1.328125 -3.375q0 -0.078125 0.109375 0l-0.046875 0.0625l-0.0625 -0.0625zm0.140625 -0.015625q0.046875 0.078125 0.015625 0.0625q-0.015625 -0.03125 -0.046875 -0.046875l0.03125 -0.015625zm3.453186 1.28125l-0.71875 0.671875l0 2.078125l-0.875 0l0 -7.984375l0.984375 0l0 0.09375q-0.078125 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0 4.484375l2.75 -2.546875q0.296875 0.0625 0.625 0.0625l0.3125 0l-2.296875 2.171875l2.6875 3.28125l-1.125 0.046875l-2.234375 -2.796875zm4.3281555 -2.734375l0.875 0l0 1.078125q0.1875 -0.59375 0.625 -0.890625q0.4375 -0.3125 1.046875 -0.3125q0.625 0 1.140625 0.328125q0.53125 0.3125 0.84375 0.953125q0.3125 0.625 0.3125 1.546875q0 0.921875 -0.328125 1.59375q-0.328125 0.65625 -0.859375 1.0q-0.53125 0.328125 -1.140625 0.328125q-0.484375 0 -0.921875 -0.21875q-0.421875 -0.234375 -0.703125 -0.640625l0 2.71875l-0.890625 0l0 -7.484375zm2.375 4.859375q0.65625 0 1.109375 -0.5q0.453125 -0.5 0.453125 -1.625q0 -1.015625 -0.40625 -1.5625q-0.390625 -0.546875 -1.125 -0.546875q-0.671875 0 -1.109375 0.578125q-0.421875 0.5625 -0.421875 1.71875q0.03125 0.953125 0.421875 1.453125q0.390625 0.484375 1.078125 0.484375zm5.906311 0.71875q-0.734375 0 -1.3125 -0.359375q-0.578125 -0.359375 -0.90625 -1.0q-0.3125 -0.65625 -0.3125 -1.484375q0 -0.828125 0.3125 -1.46875q0.328125 -0.65625 0.90625 -1.015625q0.578125 -0.375 1.3125 -0.375q0.734375 0 1.3125 0.375q0.578125 0.359375 0.890625 1.015625q0.328125 0.640625 0.328125 1.46875q0 0.828125 -0.328125 1.484375q-0.3125 0.640625 -0.890625 1.0q-0.578125 0.359375 -1.3125 0.359375zm0 -0.71875q0.46875 0 0.828125 -0.265625q0.375 -0.28125 0.578125 -0.765625q0.21875 -0.484375 0.21875 -1.109375q0 -0.9375 -0.46875 -1.53125q-0.453125 -0.59375 -1.15625 -0.59375q-0.703125 0 -1.171875 0.59375q-0.453125 0.59375 -0.453125 1.53125q0 0.625 0.203125 1.109375q0.21875 0.484375 0.578125 0.765625q0.375 0.265625 0.84375 0.265625zm4.2031555 0.625l0 -0.703125l1.40625 0l0 -4.078125l-1.34375 0l0 -0.703125l2.203125 0l0 4.78125l1.28125 0l0 0.703125l-3.546875 0zm1.78125 -6.640625q-0.25 0 -0.4375 -0.171875q-0.171875 -0.1875 -0.171875 -0.4375q0 -0.265625 0.171875 -0.4375q0.171875 -0.1875 0.4375 -0.1875q0.25 0 0.4375 0.1875q0.1875 0.1875 0.1875 0.4375q0 0.25 -0.1875 0.4375q-0.1875 0.171875 -0.4375 0.171875zm3.875061 1.15625l0.84375 0l0 0.96875q0.328125 -0.5 0.8125 -0.796875q0.5 -0.296875 1.046875 -0.296875q0.734375 0 1.171875 0.5625q0.4375 0.546875 0.4375 1.71875l0 3.328125l-0.84375 0l0 -3.296875q0 -0.8125 -0.28125 -1.1875q-0.265625 -0.375 -0.71875 -0.375q-0.375 0 -0.75 0.21875q-0.375 0.21875 -0.625 0.609375q-0.25 0.390625 -0.25 0.875l0 3.15625l-0.84375 0l0 -5.484375zm10.4687805 4.984375q-0.796875 0.578125 -1.734375 0.578125q-0.921875 0 -1.296875 -0.546875q-0.375 -0.546875 -0.375 -1.796875q0 -0.203125 0.015625 -0.703125l0.109375 -1.8125l-1.203125 0l0 -0.703125l1.25 0l0.09375 -1.46875l0.953125 -0.15625l0.125 0l0.015625 0.0625q-0.09375 0.125 -0.140625 0.21875q-0.03125 0.078125 -0.046875 0.25l-0.125 1.09375l1.8125 0l0 0.703125l-1.859375 0l-0.109375 1.859375q-0.03125 0.484375 -0.03125 0.640625q0 0.96875 0.21875 1.3125q0.234375 0.328125 0.71875 0.328125q0.359375 0 0.65625 -0.125q0.3125 -0.140625 0.6875 -0.421875l0.265625 0.6875zm4.062561 -2.734375q1.03125 0.3125 1.453125 0.6875q0.4375 0.359375 0.4375 0.953125q0 0.734375 -0.59375 1.234375q-0.578125 0.484375 -1.671875 0.484375q-1.390625 0 -2.328125 -0.875l0.46875 -0.8125l0.015625 -0.015625l0.015625 0.015625q0.375 0.484375 0.765625 0.734375q0.40625 0.234375 1.078125 0.234375q0.65625 0 1.015625 -0.234375q0.375 -0.234375 0.375 -0.640625q0 -0.359375 -0.296875 -0.578125q-0.296875 -0.234375 -1.078125 -0.484375q-2.0625 -0.59375 -2.0625 -1.703125q0 -0.640625 0.515625 -1.0q0.53125 -0.375 1.5 -0.375q0.75 0 1.25 0.203125q0.515625 0.203125 0.9375 0.65625l-0.5 0.59375l0 0.015625q-0.265625 -0.390625 -0.734375 -0.609375q-0.453125 -0.21875 -0.921875 -0.21875q-0.515625 0 -0.859375 0.1875q-0.328125 0.171875 -0.328125 0.5q0 0.296875 0.328125 0.546875q0.34375 0.25 1.21875 0.5zm1.15625 -0.875q0 -0.0625 0.09375 0l-0.03125 0.046875l-0.0625 -0.046875zm0.140625 -0.03125q0.03125 0.046875 0.015625 0.0625q0 0.015625 -0.03125 0q-0.015625 -0.015625 -0.03125 -0.03125l0.046875 -0.03125zm-3.375 2.53125q0 0.046875 -0.109375 0l0.03125 -0.0625l0.078125 0.046875l0 0.015625zm-0.140625 0.03125q-0.03125 -0.046875 -0.03125 -0.046875q0.015625 0 0.0625 0.015625l-0.03125 0.03125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m233.89502 131.35573l-1.124588 -1.124588l1.124588 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m326.24146 131.35573l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m580.4762 255.67682l-1.140625 0l-4.6875 -7.1875l-0.046875 0q0.09375 1.265625 0.09375 2.3125l0 4.875l-0.921875 0l0 -8.5625l1.125 0l4.671875 7.15625l0.046875 0q0 -0.15625 -0.046875 -1.015625q-0.046875 -0.859375 -0.03125 -1.234375l0 -4.90625l0.9375 0l0 8.5625zm9.046875 0l-1.140625 0l-4.6875 -7.1875l-0.046875 0q0.09375 1.265625 0.09375 2.3125l0 4.875l-0.921875 0l0 -8.5625l1.125 0l4.671875 7.15625l0.046875 0q0 -0.15625 -0.046875 -1.015625q-0.046875 -0.859375 -0.03125 -1.234375l0 -4.90625l0.9375 0l0 8.5625zm10.8515625 0l-1.0625 -2.71875l-3.4375 0l-1.046875 2.71875l-1.015625 0l3.390625 -8.609375l0.828125 0l3.375 8.609375l-1.03125 0zm-1.375 -3.625l-1.0 -2.65625q-0.1875 -0.5 -0.390625 -1.234375q-0.140625 0.5625 -0.375 1.234375l-1.0 2.65625l2.765625 0zm9.015625 -2.453125q0 1.3125 -0.890625 2.015625q-0.890625 0.6875 -2.53125 0.6875l-1.015625 0l0 3.375l-1.0 0l0 -8.5625l2.234375 0q3.203125 0 3.203125 2.484375zm-4.4375 1.859375l0.90625 0q1.3125 0 1.90625 -0.421875q0.59375 -0.4375 0.59375 -1.390625q0 -0.84375 -0.5625 -1.25q-0.546875 -0.421875 -1.734375 -0.421875l-1.109375 0l0 3.484375zm6.2246094 4.21875l0 -8.5625l1.0 0l0 8.5625l-1.0 0z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m588.3893 213.18306q0 -0.390625 0.171875 -0.59375q0.1875 -0.203125 0.515625 -0.203125q0.34375 0 0.53125 0.203125q0.1875 0.203125 0.1875 0.59375q0 0.390625 -0.1875 0.59375q-0.1875 0.203125 -0.53125 0.203125q-0.296875 0 -0.5 -0.1875q-0.1875 -0.1875 -0.1875 -0.609375zm3.1933594 0q0 -0.390625 0.171875 -0.59375q0.1875 -0.203125 0.515625 -0.203125q0.34375 0 0.53125 0.203125q0.1875 0.203125 0.1875 0.59375q0 0.390625 -0.1875 0.59375q-0.1875 0.203125 -0.53125 0.203125q-0.296875 0 -0.5 -0.1875q-0.1875 -0.1875 -0.1875 -0.609375zm3.1933594 0q0 -0.390625 0.171875 -0.59375q0.1875 -0.203125 0.515625 -0.203125q0.34375 0 0.53125 0.203125q0.1875 0.203125 0.1875 0.59375q0 0.390625 -0.1875 0.59375q-0.1875 0.203125 -0.53125 0.203125q-0.296875 0 -0.5 -0.1875q-0.1875 -0.1875 -0.1875 -0.609375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m75.62294 283.52823l0 17.950958l100.62993 0l0 17.954529" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m75.62295 283.52823l0 17.950928l100.62992 0l0 14.527496" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m176.25287 316.00665l-1.124588 -1.1246033l1.124588 3.0897827l1.124588 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85654 283.52823l0 17.950958l-100.62991 0l0 17.954529" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85654 283.52823l0 17.950928l-100.62991 0l0 14.527496" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m176.22662 316.00665l-1.124588 -1.1246033l1.124588 3.0897827l1.124588 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 0.06298828l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 0.06298828l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 334.95734l-1.1245728 1.1246033l3.0897827 -1.1246033l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -41.858246l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -41.858246l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 293.0361l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.1246033z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -83.74802l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -83.74802l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 251.14633l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -125.60629l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -125.60629l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 209.28806l-1.1245728 1.124588l3.0897827 -1.124588l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m233.88803 165.75919l0 17.70752l42.960632 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.88805 165.75919l0 17.70752l42.960617 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.84866 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.86572 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#d9ead3" d="m132.49171 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m132.49171 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m149.81764 268.42715q1.0625 0.4375 1.46875 0.90625q0.40625 0.46875 0.40625 1.171875q0 0.5625 -0.265625 1.0625q-0.265625 0.484375 -0.828125 0.796875q-0.5625 0.3125 -1.390625 0.3125q-1.453125 0 -2.34375 -0.953125l0.4375 -0.75l0 -0.015625q0 0 0 0.015625q0 0 0 0q0.328125 0.421875 0.828125 0.6875q0.515625 0.25 1.1875 0.25q0.671875 0 1.109375 -0.359375q0.4375 -0.375 0.4375 -0.921875q0 -0.34375 -0.140625 -0.578125q-0.125 -0.234375 -0.484375 -0.453125q-0.359375 -0.234375 -1.078125 -0.546875q-1.109375 -0.4375 -1.59375 -0.984375q-0.46875 -0.5625 -0.46875 -1.234375q0 -0.84375 0.609375 -1.34375q0.625 -0.515625 1.671875 -0.515625q0.609375 0 1.140625 0.25q0.546875 0.25 0.9375 0.6875l-0.46875 0.625l-0.015625 0.015625q-0.359375 -0.484375 -0.75 -0.671875q-0.390625 -0.203125 -0.96875 -0.203125q-0.578125 0 -0.9375 0.328125q-0.34375 0.3125 -0.34375 0.765625q0 0.34375 0.140625 0.609375q0.15625 0.25 0.546875 0.5q0.390625 0.25 1.15625 0.546875zm1.03125 -1.84375q0 -0.046875 0.046875 -0.015625q0.046875 0.015625 0.0625 0.015625l-0.03125 0.046875l-0.078125 -0.046875l0 0zm0.125 -0.03125q0.078125 0.09375 0.03125 0.0625q-0.03125 -0.03125 -0.046875 -0.03125l0.015625 -0.03125zm-3.546875 4.375q0 0.03125 -0.046875 0.015625q-0.046875 -0.03125 -0.0625 -0.03125l0.03125 -0.046875l0.078125 0.046875l0 0.015625zm-0.125 0.03125q-0.078125 -0.09375 0.015625 -0.046875l-0.015625 0.046875zm7.859421 -4.015625q1.1875 0 1.796875 0.625q0.625 0.609375 0.625 2.0625l0 2.921875l-0.9375 0l0 -0.84375q-0.5 0.96875 -1.875 0.96875q-0.90625 0 -1.421875 -0.40625q-0.515625 -0.421875 -0.515625 -1.09375q0 -0.578125 0.359375 -1.0q0.375 -0.4375 1.0 -0.671875q0.640625 -0.234375 1.390625 -0.234375q0.6875 0 1.234375 0.0625q-0.0625 -0.921875 -0.484375 -1.296875q-0.40625 -0.375 -1.21875 -0.375q-0.421875 0 -0.796875 0.15625q-0.375 0.15625 -0.6875 0.453125l-0.421875 -0.5625q0.765625 -0.765625 1.953125 -0.765625zm-0.3125 5.078125q0.890625 0 1.40625 -0.515625q0.515625 -0.53125 0.5625 -1.515625q-0.53125 -0.078125 -1.15625 -0.078125q-0.90625 0 -1.4375 0.296875q-0.53125 0.296875 -0.53125 0.921875q0 0.890625 1.15625 0.890625zm8.843796 -4.953125q-0.1875 0.96875 -0.796875 2.40625l-1.328125 3.078125l-0.671875 0l-2.171875 -5.484375l0.859375 0l1.6875 4.296875l0.890625 -2.03125q0.546875 -1.25 0.71875 -2.265625l0.8125 0zm3.8125458 5.609375q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm4.343796 3.40625q-0.59375 0 -1.109375 -0.328125q-0.515625 -0.34375 -0.84375 -1.0q-0.3125 -0.65625 -0.3125 -1.59375q0 -0.953125 0.328125 -1.578125q0.34375 -0.640625 0.859375 -0.9375q0.53125 -0.3125 1.125 -0.3125q0.546875 0 0.953125 0.25q0.421875 0.25 0.640625 0.6875l0 -3.296875l0.90625 0l0 0.09375q-0.0625 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0.015625 6.59375q0 0.296875 0.015625 0.484375q0.015625 0.1875 0.109375 0.375l-0.859375 0q-0.078125 -0.1875 -0.09375 -0.375q-0.015625 -0.1875 -0.015625 -0.484375q-0.265625 0.46875 -0.6875 0.734375q-0.40625 0.25 -0.921875 0.25zm0.125 -0.765625q0.75 0 1.09375 -0.578125q0.34375 -0.59375 0.34375 -1.546875q0 -0.984375 -0.375 -1.5625q-0.375 -0.59375 -1.125 -0.59375q-0.734375 0 -1.125 0.53125q-0.375 0.53125 -0.375 1.46875q0 1.046875 0.40625 1.671875q0.40625 0.609375 1.15625 0.609375zm3.5469208 0.640625l0 -7.46875l0.671875 0l1.84375 3.65625l1.890625 -3.671875l0.625 0l0 7.484375l-0.78125 0l0 -5.640625l-1.625 3.015625l-0.328125 0l-1.515625 -2.984375l0 5.609375l-0.78125 0zm8.515671 0.09375q-0.734375 0 -1.3125 -0.359375q-0.578125 -0.359375 -0.90625 -1.0q-0.3125 -0.65625 -0.3125 -1.484375q0 -0.828125 0.3125 -1.46875q0.328125 -0.65625 0.90625 -1.015625q0.578125 -0.375 1.3125 -0.375q0.734375 0 1.3125 0.375q0.578125 0.359375 0.890625 1.015625q0.328125 0.640625 0.328125 1.46875q0 0.828125 -0.328125 1.484375q-0.3125 0.640625 -0.890625 1.0q-0.578125 0.359375 -1.3125 0.359375zm0 -0.71875q0.46875 0 0.828125 -0.265625q0.375 -0.28125 0.578125 -0.765625q0.21875 -0.484375 0.21875 -1.109375q0 -0.9375 -0.46875 -1.53125q-0.453125 -0.59375 -1.15625 -0.59375q-0.703125 0 -1.171875 0.59375q-0.453125 0.59375 -0.453125 1.53125q0 0.625 0.203125 1.109375q0.21875 0.484375 0.578125 0.765625q0.375 0.265625 0.84375 0.265625zm5.812546 0.75q-0.59375 0 -1.109375 -0.328125q-0.515625 -0.34375 -0.84375 -1.0q-0.3125 -0.65625 -0.3125 -1.59375q0 -0.953125 0.328125 -1.578125q0.34375 -0.640625 0.859375 -0.9375q0.53125 -0.3125 1.125 -0.3125q0.546875 0 0.953125 0.25q0.421875 0.25 0.640625 0.6875l0 -3.296875l0.90625 0l0 0.09375q-0.0625 0.0625 -0.09375 0.15625q-0.015625 0.078125 -0.015625 0.28125l0.015625 6.59375q0 0.296875 0.015625 0.484375q0.015625 0.1875 0.109375 0.375l-0.859375 0q-0.078125 -0.1875 -0.09375 -0.375q-0.015625 -0.1875 -0.015625 -0.484375q-0.265625 0.46875 -0.6875 0.734375q-0.40625 0.25 -0.921875 0.25zm0.125 -0.765625q0.75 0 1.09375 -0.578125q0.34375 -0.59375 0.34375 -1.546875q0 -0.984375 -0.375 -1.5625q-0.375 -0.59375 -1.125 -0.59375q-0.734375 0 -1.125 0.53125q-0.375 0.53125 -0.375 1.46875q0 1.046875 0.40625 1.671875q0.40625 0.609375 1.15625 0.609375zm6.328171 0.765625q-1.21875 0 -1.953125 -0.75q-0.71875 -0.75 -0.71875 -2.09375q0 -0.90625 0.328125 -1.5625q0.34375 -0.65625 0.90625 -0.984375q0.578125 -0.34375 1.28125 -0.34375q1.0 0 1.609375 0.65625q0.625 0.65625 0.625 1.9375q0 0.140625 -0.03125 0.40625l-3.890625 0q0.046875 1.0 0.5625 1.515625q0.515625 0.515625 1.296875 0.515625q0.875 0 1.421875 -0.609375l0.484375 0.46875q-0.703125 0.84375 -1.921875 0.84375zm1.203125 -3.40625q0 -0.78125 -0.390625 -1.21875q-0.390625 -0.453125 -1.03125 -0.453125q-0.59375 0 -1.046875 0.421875q-0.4375 0.421875 -0.546875 1.25l3.015625 0zm2.4844208 3.28125l0 -0.703125l1.609375 0l0 -6.578125l-1.546875 0l0 -0.703125l2.421875 0l0 7.28125l1.609375 0l0 0.703125l-4.09375 0z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m176.23885 99.34974l0 153.19684" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m176.23885 99.34974l0 149.76978" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m176.23885 249.1195l-1.124588 -1.124588l1.124588 3.0897675l1.124588 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m176.23975 283.52823l0 17.950958l0.06298828 0l0 17.954529" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m176.23975 283.52823l0 17.950928l0.06298828 0l0 14.527496" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m176.30273 316.00665l-1.1245728 -1.1246033l1.1245728 3.0897827l1.124588 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m75.62205 99.34843l0 153.19684" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m75.62205 99.34843l0 149.76978" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m75.62205 249.1182l-1.1245804 -1.124588l1.1245804 3.0897675l1.1245804 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m99.50131 100.0l0 76.0l54.992126 0l0 76.0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m99.50131 100.0l0 76.0l54.992126 0l0 72.57292" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m154.49344 248.5729l-1.124588 -1.1245728l1.124588 3.0897675l1.124588 -3.0897675z" fill-rule="evenodd"/></g></svg>
\ No newline at end of file
diff --git a/tensorflow/lite/g3doc/images/landing-page/ai_in_motion.png b/tensorflow/lite/g3doc/images/landing-page/ai_in_motion.png
new file mode 100644
index 0000000000000000000000000000000000000000..b8eedce7eaeb0f0440d7c36a243cfd729c3699d0
Binary files /dev/null and b/tensorflow/lite/g3doc/images/landing-page/ai_in_motion.png differ
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/assistant_logo.png b/tensorflow/lite/g3doc/images/landing-page/assistant_logo.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/assistant_logo.png
rename to tensorflow/lite/g3doc/images/landing-page/assistant_logo.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/detect_crop_disease_in_africa.png b/tensorflow/lite/g3doc/images/landing-page/detect_crop_disease_in_africa.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/detect_crop_disease_in_africa.png
rename to tensorflow/lite/g3doc/images/landing-page/detect_crop_disease_in_africa.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/fishbrain_logo.png b/tensorflow/lite/g3doc/images/landing-page/fishbrain_logo.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/fishbrain_logo.png
rename to tensorflow/lite/g3doc/images/landing-page/fishbrain_logo.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/fishbrain_logo_big.png b/tensorflow/lite/g3doc/images/landing-page/fishbrain_logo_big.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/fishbrain_logo_big.png
rename to tensorflow/lite/g3doc/images/landing-page/fishbrain_logo_big.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/gboard_logo.png b/tensorflow/lite/g3doc/images/landing-page/gboard_logo.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/gboard_logo.png
rename to tensorflow/lite/g3doc/images/landing-page/gboard_logo.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/gmail_logo.png b/tensorflow/lite/g3doc/images/landing-page/gmail_logo.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/gmail_logo.png
rename to tensorflow/lite/g3doc/images/landing-page/gmail_logo.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/loseit_logo.png b/tensorflow/lite/g3doc/images/landing-page/loseit_logo.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/loseit_logo.png
rename to tensorflow/lite/g3doc/images/landing-page/loseit_logo.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/loseit_logo_big.png b/tensorflow/lite/g3doc/images/landing-page/loseit_logo_big.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/loseit_logo_big.png
rename to tensorflow/lite/g3doc/images/landing-page/loseit_logo_big.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/nest_logo.png b/tensorflow/lite/g3doc/images/landing-page/nest_logo.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/nest_logo.png
rename to tensorflow/lite/g3doc/images/landing-page/nest_logo.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/photos_logo.png b/tensorflow/lite/g3doc/images/landing-page/photos_logo.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/photos_logo.png
rename to tensorflow/lite/g3doc/images/landing-page/photos_logo.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/shazam_logo.png b/tensorflow/lite/g3doc/images/landing-page/shazam_logo.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/shazam_logo.png
rename to tensorflow/lite/g3doc/images/landing-page/shazam_logo.png
diff --git a/tensorflow/contrib/lite/g3doc/images/landing-page/vsco_logo.png b/tensorflow/lite/g3doc/images/landing-page/vsco_logo.png
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/images/landing-page/vsco_logo.png
rename to tensorflow/lite/g3doc/images/landing-page/vsco_logo.png
diff --git a/tensorflow/lite/g3doc/images/performance/model_size_vs_accuracy.png b/tensorflow/lite/g3doc/images/performance/model_size_vs_accuracy.png
new file mode 100644
index 0000000000000000000000000000000000000000..44d0ccd3128dea1c947e57ccbc4e18b2d34cef88
Binary files /dev/null and b/tensorflow/lite/g3doc/images/performance/model_size_vs_accuracy.png differ
diff --git a/tensorflow/lite/g3doc/images/performance/model_size_vs_latency.png b/tensorflow/lite/g3doc/images/performance/model_size_vs_latency.png
new file mode 100644
index 0000000000000000000000000000000000000000..94a6310612828db2370d19a094795341478e90f8
Binary files /dev/null and b/tensorflow/lite/g3doc/images/performance/model_size_vs_latency.png differ
diff --git a/tensorflow/contrib/lite/g3doc/ios.md b/tensorflow/lite/g3doc/ios.md
similarity index 78%
rename from tensorflow/contrib/lite/g3doc/ios.md
rename to tensorflow/lite/g3doc/ios.md
index 3b9fcca8117dc1859d075ae5f048cfc9f0d988a3..c195b6abf4f76f88d1f60b192cd19165aefe9a11 100644
--- a/tensorflow/contrib/lite/g3doc/ios.md
+++ b/tensorflow/lite/g3doc/ios.md
@@ -41,24 +41,24 @@ brew link libtool
 Then you need to run a shell script to download the dependencies you need:
 
 ```bash
-tensorflow/contrib/lite/tools/make/download_dependencies.sh
+tensorflow/lite/tools/make/download_dependencies.sh
 ```
 
 This will fetch copies of libraries and data from the web and install them in
-`tensorflow/contrib/lite/downloads`.
+`tensorflow/lite/downloads`.
 
 With all of the dependencies set up, you can now build the library for all five
 supported architectures on iOS:
 
 ```bash
-tensorflow/contrib/lite/tools/make/build_ios_universal_lib.sh
+tensorflow/lite/tools/make/build_ios_universal_lib.sh
 ```
 
-Under the hood this uses a makefile in `tensorflow/contrib/lite` to build the
+Under the hood this uses a makefile in `tensorflow/lite` to build the
 different versions of the library, followed by a call to `lipo` to bundle them
 into a universal file containing armv7, armv7s, arm64, i386, and x86_64
 architectures. The resulting library is in
-`tensorflow/contrib/lite/tools/make/gen/lib/libtensorflow-lite.a`.
+`tensorflow/lite/tools/make/gen/lib/libtensorflow-lite.a`.
 
 If you get an error such as `no such file or directory: 'x86_64'` when running 
 `build_ios_universal_lib.sh`: open Xcode > Preferences > Locations, and ensure 
@@ -68,19 +68,19 @@ a value is selected in the "Command Line Tools" dropdown.
 
 You'll need to update various settings in your app to link against TensorFlow
 Lite. You can view them in the example project at
-`tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj` but here's a full
+`tensorflow/lite/examples/ios/simple/simple.xcodeproj` but here's a full
 rundown:
 
 -   You'll need to add the library at
-    `tensorflow/contrib/lite/gen/lib/libtensorflow-lite.a` to your linking build
-    stage, and in Search Paths add `tensorflow/contrib/lite/gen/lib` to the
+    `tensorflow/lite/gen/lib/libtensorflow-lite.a` to your linking build
+    stage, and in Search Paths add `tensorflow/lite/gen/lib` to the
     Library Search Paths setting.
 
 -   The _Header Search_ paths needs to contain:
 
     -   the root folder of tensorflow,
-    -   `tensorflow/contrib/lite/downloads`
-    -   `tensorflow/contrib/lite/downloads/flatbuffers/include`
+    -   `tensorflow/lite/downloads`
+    -   `tensorflow/lite/downloads/flatbuffers/include`
 
 -   C++11 support (or later) should be enabled by setting `C++ Language Dialect`
     to `GNU++11` (or `GNU++14`), and `C++ Standard Library` to `libc++`.
diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/lite/g3doc/models.md
similarity index 94%
rename from tensorflow/contrib/lite/g3doc/models.md
rename to tensorflow/lite/g3doc/models.md
index 279764ce964e523c769addda2b477690694dc048..62b3f17c79aa3688011a1452da18e098008f414e 100644
--- a/tensorflow/contrib/lite/g3doc/models.md
+++ b/tensorflow/lite/g3doc/models.md
@@ -7,13 +7,13 @@ Model Name          | Paper_Model_Files | Model_Size | Top-1 Accuracy | Top-5 Ac
 ------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | ---------------------:
 MnasNet_0.50_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_0.5_224_09_07_2018.tgz) | 8.5 Mb    | 68.03%          | 87.79%          | 37 ms
 MnasNet_0.75_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_0.75_224_09_07_2018.tgz) | 12 Mb     | 71.72%          | 90.17%          | 61 ms
-MnasNet_1.0_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_224_09_07_2018.tgz) | 17 Mb     | 74.08%          | 91.75%          | 93 ms
-MnasNet_1.3_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.3_224_09_07_2018.tgz) | 24 Mb     | 75.24%          | 92.55%          | 152 ms
 MnasNet_1.0_96| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_96_09_07_2018.tgz) | 17 Mb    | 62.33%          | 83.98%          | 23 ms
 MnasNet_1.0_128| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_128_09_07_2018.tgz) | 17 Mb    | 67.32%          | 87.70%          | 34 ms
 MnasNet_1.0_160| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_160_09_07_2018.tgz) | 17 Mb    | 70.63%          | 89.58%          | 51 ms
 MnasNet_1.0_192| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_192_09_07_2018.tgz) | 17 Mb    | 72.56%          | 90.76%          | 70 ms
 MnasNet_1.0_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.0_224_09_07_2018.tgz) | 17 Mb    | 74.08%          | 91.75%          | 93 ms
+MnasNet_1.3_224| [paper](https://arxiv.org/abs/1807.11626), [tflite&pb](https://storage.cloud.google.com/download.tensorflow.org/models/tflite/mnasnet_1.3_224_09_07_2018.tgz) | 24 Mb     | 75.24%          | 92.55%          | 152 ms
+
 
 ^ Performance numbers are generated on Pixel-1 using single thread large BIG core.
 
@@ -76,8 +76,11 @@ Mobilenet_V1_1.0_128_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tf
 Mobilenet_V1_1.0_160_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_160_quant.tgz)  | 4.3 Mb     | 66.9%          | 86.7%          | 37.4 ms
 Mobilenet_V1_1.0_192_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_192_quant.tgz)  | 4.3 Mb     | 69.1%          | 88.1%          | 51.9 ms
 Mobilenet_V1_1.0_224_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz)  | 4.3 Mb     | 70.0%          | 89.0%          | 70.2 ms
-Mobilenet_v2_1.0_224_quant  | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz)              | 3.4 Mb     | 70.8%          | 89.9%          | 80.3 ms
-Inception_v3_quant          | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz)                       | 23 Mb      | 77.5%          | 93.7%          | 637 ms
+Mobilenet_V2_1.0_224_quant  | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz)              | 3.4 Mb     | 70.8%          | 89.9%          | 80.3 ms
+Inception_V1_quant          | [paper](https://arxiv.org/abs/1409.4842), [tflite&pb](http://download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz)                          | 6.4 Mb     | 70.1%          | 89.8%          | 154.5 ms
+Inception_V2_quant          | [paper](https://arxiv.org/abs/1512.00567), [tflite&pb](http://download.tensorflow.org/models/inception_v2_224_quant_20181026.tgz)                         | 11 Mb      | 73.5%          | 91.4%          | 235.0 ms
+Inception_V3_quant          | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz)                       | 23 Mb      | 77.5%          | 93.7%          | 637 ms
+Inception_V4_quant          | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](http://download.tensorflow.org/models/inception_v4_299_quant_20181026.tgz)                         | 41 Mb      | 79.5%          | 93.9%          | 1250.8 ms
 
 ## Other models
 
diff --git a/tensorflow/contrib/lite/g3doc/ops_versioning.md b/tensorflow/lite/g3doc/ops_versioning.md
similarity index 100%
rename from tensorflow/contrib/lite/g3doc/ops_versioning.md
rename to tensorflow/lite/g3doc/ops_versioning.md
diff --git a/tensorflow/lite/g3doc/overview.md b/tensorflow/lite/g3doc/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..2d747a9b59f734a007ef54d13223aed22f38cb1d
--- /dev/null
+++ b/tensorflow/lite/g3doc/overview.md
@@ -0,0 +1,202 @@
+
+# Introduction to TensorFlow Lite
+
+TensorFlow Lite is TensorFlow’s lightweight solution for mobile and embedded
+devices. It enables on-device machine learning inference with low latency and a
+small binary size. TensorFlow Lite also supports hardware acceleration with the
+[Android Neural Networks
+API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
+
+TensorFlow Lite uses many techniques for achieving low latency such as
+optimizing the kernels for mobile apps, pre-fused activations, and quantized
+kernels that allow smaller and faster (fixed-point math) models.
+
+Most of our TensorFlow Lite documentation is [on
+GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite)
+for the time being.
+
+## What does TensorFlow Lite contain?
+
+TensorFlow Lite supports a set of core operators, both quantized and
+float, which have been tuned for mobile platforms. They incorporate pre-fused
+activations and biases to further enhance performance and quantized
+accuracy. Additionally, TensorFlow Lite also supports using custom operations in
+models.
+
+TensorFlow Lite defines a new model file format, based on
+[FlatBuffers](https://google.github.io/flatbuffers/). FlatBuffers is an
+efficient open-source cross-platform serialization library. It is similar to
+[protocol buffers](https://developers.google.com/protocol-buffers/?hl=en), but
+the primary difference is that FlatBuffers does not need a parsing/unpacking
+step to a secondary representation before you can access data, often coupled
+with per-object memory allocation. Also, the code footprint of FlatBuffers is an
+order of magnitude smaller than protocol buffers.
+
+TensorFlow Lite has a new mobile-optimized interpreter, which has the key goals
+of keeping apps lean and fast. The interpreter uses a static graph ordering and
+a custom (less-dynamic) memory allocator to ensure minimal load, initialization,
+and execution latency.
+
+TensorFlow Lite provides an interface to leverage hardware acceleration, if
+available on the device. It does so via the
+[Android Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/index.html),
+available on Android 8.1 (API level 27) and higher.
+
+## Why do we need a new mobile-specific library?
+
+Machine Learning is changing the computing paradigm, and we see an emerging
+trend of new use cases on mobile and embedded devices. Consumer expectations are
+also trending toward natural, human-like interactions with their devices, driven
+by the camera and voice interaction models.
+
+There are several factors which are fueling interest in this domain:
+
+- Innovation at the silicon layer is enabling new possibilities for hardware
+  acceleration, and frameworks such as the Android Neural Networks API make it
+  easy to leverage these.
+
+- Recent advances in real-time computer-vision and spoken language understanding
+  have led to mobile-optimized benchmark models being open sourced
+  (e.g. MobileNets, SqueezeNet).
+
+- Widely-available smart appliances create new possibilities for
+  on-device intelligence.
+
+- Interest in stronger user data privacy paradigms where user data does not need
+  to leave the mobile device.
+
+- Ability to serve ‘offline’ use cases, where the device does not need to be
+  connected to a network.
+
+We believe the next wave of machine learning applications will have significant
+processing on mobile and embedded devices.
+
+## TensorFlow Lite highlights
+
+TensorFlow Lite provides:
+
+- A set of core operators, both quantized and float, many of which have been
+  tuned for mobile platforms.  These can be used to create and run custom
+  models.  Developers can also write their own custom operators and use them in
+  models.
+
+- A new [FlatBuffers](https://google.github.io/flatbuffers/)-based
+  model file format.
+
+- On-device interpreter with kernels optimized for faster execution on mobile.
+
+- TensorFlow converter to convert TensorFlow-trained models to the TensorFlow
+  Lite format.
+
+- Smaller in size: TensorFlow Lite is smaller than 300KB when all supported
+  operators are linked and less than 200KB when using only the operators needed
+  for supporting InceptionV3 and Mobilenet.
+
+- **Pre-tested models:**
+
+    All of the following models are guaranteed to work out of the box:
+
+    - Inception V3, a popular model for detecting the dominant objects
+      present in an image.
+
+    - [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md),
+      a family of mobile-first computer vision models designed to effectively
+      maximize accuracy while being mindful of the restricted resources for an
+      on-device or embedded application. They are small, low-latency, low-power
+      models parameterized to meet the resource constraints of a variety of use
+      cases. They can be built upon for classification, detection, embeddings
+      and segmentation. MobileNet models are smaller but [lower in
+      accuracy](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
+      than Inception V3.
+
+    - On Device Smart Reply, an on-device model which provides one-touch
+      replies for an incoming text message by suggesting contextually relevant
+      messages. The model was built specifically for memory constrained devices
+      such as watches & phones and it has been successfully used to surface
+      [Smart Replies on Android
+      Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+      to all first-party and third-party apps.
+
+    Also see the complete list of
+    [TensorFlow Lite's supported models](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/models.md),
+    including the model sizes, performance numbers, and downloadable model files.
+
+- Quantized versions of the MobileNet model, which runs faster than the
+  non-quantized (float) version on CPU.
+
+- New Android demo app to illustrate the use of TensorFlow Lite with a quantized
+  MobileNet model for object classification.
+
+- Java and C++ API support
+
+
+## Getting Started
+
+We recommend you try out TensorFlow Lite with the pre-tested models indicated
+above. If you have an existing model, you will need to test whether your model
+is compatible with both the converter and the supported operator set.  To test
+your model, see the
+[documentation on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite).
+
+### Retrain Inception-V3 or MobileNet for a custom data set
+
+The pre-trained models mentioned above have been trained on the ImageNet data
+set, which consists of 1000 predefined classes. If those classes are not
+relevant or useful for your use case, you will need to retrain those
+models. This technique is called transfer learning, which starts with a model
+that has been already trained on a problem and will then be retrained on a
+similar problem. Deep learning from scratch can take days, but transfer learning
+can be done fairly quickly. In order to do this, you'll need to generate your
+custom data set labeled with the relevant classes.
+
+The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
+codelab walks through this process step-by-step. The retraining code supports
+retraining for both floating point and quantized inference.
+
+## TensorFlow Lite Architecture
+
+The following diagram shows the architectural design of TensorFlow Lite:
+
+<img src="https://www.tensorflow.org/images/tflite-architecture.jpg"
+     alt="TensorFlow Lite architecture diagram"
+     style="max-width:600px;">
+
+Starting with a trained TensorFlow model on disk, you'll convert that model to
+the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
+Converter. Then you can use that converted file in your mobile application.
+
+Deploying the TensorFlow Lite model file uses:
+
+- Java API: A convenience wrapper around the C++ API on Android.
+
+- C++ API: Loads the TensorFlow Lite Model File and invokes the Interpreter. The
+  same library is available on both Android and iOS.
+
+- Interpreter: Executes the model using a set of kernels. The interpreter
+  supports selective kernel loading; without kernels it is only 100KB, and 300KB
+  with all the kernels loaded. This is a significant reduction from the 1.5M
+  required by TensorFlow Mobile.
+
+- On select Android devices, the Interpreter will use the Android Neural
+  Networks API for hardware acceleration, or default to CPU execution if none
+  are available.
+
+You can also implement custom kernels using the C++ API that can be used by the
+Interpreter.
+
+## Future Work
+
+In future releases, TensorFlow Lite will support more models and built-in
+operators, contain performance improvements for both fixed point and floating
+point models, improvements to the tools to enable easier developer workflows and
+support for other smaller devices and more. As we continue development, we hope
+that TensorFlow Lite will greatly simplify the developer experience of targeting
+a model for small devices.
+
+Future plans include using specialized machine learning hardware to get the best
+possible performance for a particular model on a particular device.
+
+## Next Steps
+
+The TensorFlow Lite [GitHub repository](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite).
+contains additional docs, code samples, and demo applications.
diff --git a/tensorflow/lite/g3doc/performance/benchmarks.md b/tensorflow/lite/g3doc/performance/benchmarks.md
new file mode 100644
index 0000000000000000000000000000000000000000..5a1e5586beecad4876c9d0390a0fa31e78705195
--- /dev/null
+++ b/tensorflow/lite/g3doc/performance/benchmarks.md
@@ -0,0 +1,174 @@
+
+# Performance
+
+This document lists TensorFlow Lite performance benchmarks when running well
+known models on some Android and iOS devices.
+
+These performance benchmark numbers were generated with the
+[Android TFLite benchmark binary](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark)
+and the [iOS benchmark app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/ios).
+
+# Android performance benchmarks
+
+For Android benchmarks, the CPU affinity is set to use big cores on the device to
+reduce variance (see [details](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark#reducing-variance-between-runs-on-android)).
+
+It assumes that models were download and unzipped to the
+`/data/local/tmp/tflite_models` directory. The benchmark binary is built
+using [these instructions](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark#on-android)
+and assumed in the `/data/local/tmp` directory.
+
+To run the benchmark:
+
+```
+adb shell taskset ${CPU_MASK} /data/local/tmp/benchmark_model \
+  --num_threads=1 \
+  --graph=/data/local/tmp/tflite_models/${GRAPH} \
+  --warmup_runs=1 \
+  --num_runs=50 \
+  --use_nnapi=false
+```
+
+Here, `${GRAPH}` is the name of model and `${CPU_MASK}` is the CPU affinity
+chosen according to the following table:
+
+Device | CPU_MASK |
+-------| ----------
+Pixel 2 | f0 |
+Pixel xl | 0c |
+
+<table>
+  <thead>
+    <tr>
+      <th>Model Name</th>
+      <th>Device </th>
+      <th>Mean inference time (std dev)</th>
+    </tr>
+  </thead>
+  <tr>
+    <td rowspan = 2>
+      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
+    </td>
+    <td>Pixel 2 </td>
+    <td>166.5 ms (2.6 ms)</td>
+  </tr>
+   <tr>
+     <td>Pixel xl </td>
+     <td>122.9 ms (1.8 ms)  </td>
+  </tr>
+  <tr>
+    <td rowspan = 2>
+      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz">Mobilenet_1.0_224 (quant)</a>
+    </td>
+    <td>Pixel 2 </td>
+    <td>69.5 ms (0.9 ms)</td>
+  </tr>
+   <tr>
+     <td>Pixel xl </td>
+     <td>78.9 ms (2.2 ms)  </td>
+  </tr>
+  <tr>
+    <td rowspan = 2>
+      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
+    </td>
+    <td>Pixel 2 </td>
+    <td>273.8 ms (3.5 ms)</td>
+  </tr>
+   <tr>
+     <td>Pixel xl </td>
+     <td>210.8 ms (4.2 ms)</td>
+  </tr>
+  <tr>
+    <td rowspan = 2>
+      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
+    </td>
+    <td>Pixel 2 </td>
+    <td>234.0 ms (2.1 ms)</td>
+  </tr>
+   <tr>
+     <td>Pixel xl </td>
+     <td>158.0 ms (2.1 ms)</td>
+  </tr>
+  <tr>
+    <td rowspan = 2>
+      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
+    </td>
+    <td>Pixel 2 </td>
+    <td>2846.0 ms (15.0 ms)</td>
+  </tr>
+   <tr>
+     <td>Pixel xl </td>
+     <td>1973.0 ms (15.0 ms)  </td>
+  </tr>
+  <tr>
+    <td rowspan = 2>
+      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
+    </td>
+    <td>Pixel 2 </td>
+    <td>3180.0 ms (11.7 ms)</td>
+  </tr>
+   <tr>
+     <td>Pixel xl </td>
+     <td>2262.0 ms (21.0 ms)  </td>
+  </tr>
+
+ </table>
+
+# iOS benchmarks
+
+To run iOS benchmarks, the [benchmark
+app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/ios)
+was modified to include the appropriate model and `benchmark_params.json` was
+modified  to set `num_threads` to 1.
+
+<table>
+  <thead>
+    <tr>
+      <th>Model Name</th>
+      <th>Device </th>
+      <th>Mean inference time (std dev)</th>
+    </tr>
+  </thead>
+  <tr>
+    <td>
+      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
+    </td>
+    <td>iPhone 8 </td>
+    <td>32.2 ms (0.8 ms)</td>
+  </tr>
+  <tr>
+    <td>
+      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz)">Mobilenet_1.0_224 (quant)</a>
+    </td>
+    <td>iPhone 8 </td>
+    <td>24.4 ms (0.8 ms)</td>
+  </tr>
+  <tr>
+    <td>
+      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
+    </td>
+    <td>iPhone 8 </td>
+    <td>60.3 ms (0.6 ms)</td>
+  </tr>
+  <tr>
+    <td>
+      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
+    </td>
+    <td>iPhone 8 </td>
+    <td>44.3 (0.7 ms)</td>
+  </tr>
+  <tr>
+    <td>
+      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
+    </td>
+    <td>iPhone 8</td>
+    <td>562.4 ms (18.2 ms)</td>
+  </tr>
+  <tr>
+    <td>
+      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
+    </td>
+    <td>iPhone 8 </td>
+    <td>661.0 ms (29.2 ms)</td>
+  </tr>
+ </table>
diff --git a/tensorflow/lite/g3doc/performance/best_practices.md b/tensorflow/lite/g3doc/performance/best_practices.md
new file mode 100644
index 0000000000000000000000000000000000000000..b76414cebe0d7092086073a478eb6330cbea713e
--- /dev/null
+++ b/tensorflow/lite/g3doc/performance/best_practices.md
@@ -0,0 +1,44 @@
+# Performance best practices
+
+Mobile and embedded devices have limited computational resources and it is important to keep your application resource efficient. We have compiled a list of best practices and strategies you can use to optimize your model and application when using Tensorflow Lite.
+
+## Choose the best model for the task
+Depending on the task you will need to make a tradeoff between model complexity and size. If your task requires high accuracy then you may need a large and complex model. Some tasks may work with a less precise model, for these tasks it is better to use a smaller but less precise model. Smaller models not only use less disk space and memory but are generally faster and more energy efficient. For example, graphs below show accuracy and latency tradeoff for some common image classification models.
+
+![accuracy vs model size](../images/performance/model_size_vs_accuracy.png "Accuracy vs Model size")
+
+
+![latency vs model size](../images/performance/model_size_vs_latency.png "Latency vs Model size")
+
+One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. Tensorflow Lite [models page](../models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
+
+You can retrain the listed models on your own dataset by using transfer learning. Check out our transfer learning tutorial for
+[image classification](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0) and
+ [object detection](https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193).
+
+
+## Profile your model
+Once you have selected a candidate model that is right for your task, it is a good practice to profile and benchmark your model. Tensorflow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
+
+## Profile and optimize operators in the graph
+If a particular operator appears frequently in the model and based on profiling you find the operator consuming the most amount of time, you can look into optimizing the operator.
+ This scenario should be rare as Tensorflow Lite has optimized versions for most ops. However you may be able to write a faster version of a custom op, if you know the constraints in which the operator is executed. Check out our [custom operator documentation](../custom_operators.md).
+
+## Quantize your model
+If your model uses floating point weights or activations then it may be possible to reduce the size of model up to ~4x by using quantization and other model optimizations. Check out our [model optimization toolkit](model_optimization.md) for details about optimizing your model. 
+
+## Tweak the number of threads
+Tensorflow Lite supports multi-threaded kernels for many operators. You can increase the number of threads and speed up execution of operators. Increasing the number of threads will however make your model use more resources and power. For some applications latency may be more important than energy efficiency. You can increase the number of threads by setting the number of [interpreter](https://github.com/tensorflow/tensorflow/blob/1084594657a5d139102ac794f84d1427a710e39a/tensorflow/lite/interpreter.h#L337) threads. Multi-threaded execution however comes at the cost of increased performance variability depending on what else is been executed concurrently. This is particularly the case for mobile apps. For example, isolated tests may show 2x speed up vs single-threaded but if another app is executing at the same time may result in worst performance than single-threaded.
+
+## Eliminate redundant copies
+If your application is not careful, there can be redundant copies when feeding the input to the model and reading output from the model. Make sure to eliminate redundant copies. If you are using higher level APIs like Java API, make sure to carefully check the documentation for performance caveats. For example, the Java API is a lot faster if ByteBuffers are used as [inputs](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L151).
+
+## Profile your application with platform specific tools
+Platform specific tools like [Android profiler](https://developer.android.com/studio/profile/android-profiler) and [Instruments](https://help.apple.com/instruments/mac/current/) provide a wealth of profiling information that can be used to debug your app. Sometimes the performance bug may be not in the model but in parts of application code that interact with the model. Make sure to familiarize yourself with platform specific profiling tools and best practices for your platform.
+
+## Evaluate whether your model benefits from using hardware accelerators available on the device
+Tensorflow Lite is working on adding support for accelerators like GPU and provides acceleration through [Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/) on Android.
+You can utilize these hardware accelerator backends to improve the speed and efficiency of your model. To enable Neural Networks API call [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/lite/interpreter.h#L334) on the interpreter instance.
+
+## Need more help
+The Tensorflow team is happy to help diagnose and address specific performance issues you may be facing. Please file an issue on [GitHub](https://github.com/tensorflow/tensorflow/issues) with details of the issue.
diff --git a/tensorflow/lite/g3doc/performance/model_optimization.md b/tensorflow/lite/g3doc/performance/model_optimization.md
new file mode 100644
index 0000000000000000000000000000000000000000..2eb432c008168794c3722fcd4f9ab6df0771e48f
--- /dev/null
+++ b/tensorflow/lite/g3doc/performance/model_optimization.md
@@ -0,0 +1,88 @@
+# Model optimization
+
+Inference efficiency is a critical issue when deploying machine learning
+models to mobile devices. Where the computational demand for *training*
+grows with the number of models trained on different architectures, the
+computational demand for *inference* grows in proportion to the number of
+users. The *Tensorflow Model Optimization Toolkit* minimizes the complexity
+of inference—the model size, the latency and power consumption.
+
+
+## Use cases
+
+Model optimization is useful for:
+
+* Deploying models to edge devices with restrictions on processing, memory, or power-consumption.
+  For example, mobile and Internet of Things (IoT) devices.
+* Reduce the payload size for over-the-air model updates.
+* Execution on hardware constrained by fixed-point operations.
+* Optimize models for special purpose hardware accelerators.
+
+
+## Optimization methods
+
+Model optimization uses multiple techniques:
+
+* Reduced parameter count, for example, pruning and structured pruning.
+* Reduced representational precision, for example, quantization.
+* Update the original model topology to a more efficient one, with reduced parameters or faster execution, for example, tensor decomposition methods and distillation.
+
+## Model quantization
+
+Quantizing deep neural networks uses techniques that allow for reduced precision
+representations of weights and, optionally, activations for both storage and
+computation. Quantization provides several benefits:
+
+* Support on existing CPU platforms.
+* Quantizing activations reduces memory access costs for reading and storing intermediate activations.
+* Many CPU and hardware accelerator implementations provide SIMD instruction capabilities, which are especially beneficial for quantization.
+
+TensorFlow Lite provides several levels of support for quantization.
+
+[Post-training quantization](post_training_quantization.md) quantizes weights and activations post training and is very easy to use.
+[Quantization-aware training](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/README.md){:.external} allows for training networks that can be quantized with minimal accuracy drop and is only available
+for a subset of convolutional neural network architectures.
+
+
+### Latency and accuracy results
+
+Below are the results of the latency and accuracy of post-training quantization and
+quantization-aware training on a few models. All latency numbers are measured on
+Pixel&nbsp;2 devices using a single big core. As the toolkit improves, so will the numbers here:
+
+<figure>
+  <table>
+    <tr>
+      <th>Model</th>
+      <th>Top-1 Accuracy (Original) </th>
+      <th>Top-1 Accuracy (Post Training Quantized) </th>
+      <th>Top-1 Accuracy (Quantization Aware Training) </th>
+      <th>Latency (Original) (ms) </th>
+      <th>Latency (Post Training Quantized) (ms) </th>
+      <th>Latency (Quantization Aware Training) (ms) </th>
+      <th> Size (Original) (MB)</th>
+      <th> Size (Optimized) (MB)</th>
+    </tr>
+    <tr><td>Mobilenet-v1-1-224</td><td>0.709</td><td>0.657</td><td>0.70</td>
+      <td>180</td><td>145</td><td>80.2</td><td>16.9</td><td>4.3</td></tr>
+    <tr><td>Mobilenet-v2-1-224</td><td>0.719</td><td>0.637</td><td>0.709</td>
+      <td>117</td><td>121</td><td>80.3</td><td>14</td><td>3.6</td></tr>
+   <tr><td>Inception_v3</td><td>0.78</td><td>0.772</td><td>0.775</td>
+      <td>1585</td><td>1187</td><td>637</td><td>95.7</td><td>23.9</td></tr>
+   <tr><td>Resnet_v2_101</td><td>0.770</td><td>0.768</td><td>N/A</td>
+      <td>3973</td><td>2868</td><td>N/A</td><td>178.3</td><td>44.9</td></tr>
+ </table>
+  <figcaption>
+    <b>Table 1</b> Benefits of model quantization for select CNN models
+  </figcaption>
+</figure>
+
+## Choice of quantization tool
+
+As a starting point, check if the models in the TensorFlow Lite model repository can work for
+your application. If not, we recommend that users start with the post-training quantization tool
+since this is broadly applicable and does not require training data. For cases where the accuracy
+and latency targets are not met, or hardware accelerator support is important, quantization-aware
+training is the better option.
+
+Note: Quantization-aware training supports a subset of convolutional neural network architectures.
diff --git a/tensorflow/lite/g3doc/performance/post_training_quantization.md b/tensorflow/lite/g3doc/performance/post_training_quantization.md
new file mode 100644
index 0000000000000000000000000000000000000000..cf4d70b2deb3370d0acdde1fcaa8d7fce0cf3bf2
--- /dev/null
+++ b/tensorflow/lite/g3doc/performance/post_training_quantization.md
@@ -0,0 +1,76 @@
+# Post-training quantization
+
+Post-training quantization is a general technique to reduce the model size while also
+providing up to 3x lower latency with little degradation in model accuracy. Post-training
+quantization quantizes weights to 8-bits of precision from floating-point. This technique
+is enabled as an option in [TensorFlow Lite model converter](../convert):
+
+```
+import tensorflow as tf
+converter = tf.lite.TocoConverter.from_saved_model(saved_model_dir)
+converter.post_training_quantize = True
+tflite_quantized_model = converter.convert()
+open("quantized_model.tflite", "wb").write(tflite_quantized_model)
+
+```
+
+At inference, weights are converted from 8-bits of precision to floating-point and
+computed using floating point kernels. This conversion is done once and cached to reduce latency.
+
+To further improve latency, hybrid operators dynamically quantize activations to 8-bits and
+perform computations with 8-bit weights and activations. This optimization provides latencies
+close to fully fixed-point inference. However, the outputs are still stored using
+floating-point, so the speedup with hybrid ops is less than a full fixed-point computation.
+Hybrid ops are available for the most compute-intensive operators in a network:
+
+*  [tf.contrib.layers.fully_connected](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/fully_connected)
+*  [tf.nn.conv2d](https://www.tensorflow.org/api_docs/python/tf/nn/conv2d)
+*  [tf.nn.embedding_lookup](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup)
+*  [BasicRNN](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/BasicRNNCell)
+*  [tf.nn.bidirectional_dynamic_rnn for BasicRNNCell type](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn)
+*  [tf.nn.dynamic_rnn for LSTM and BasicRNN Cell types](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn)
+
+
+Since weights are quantized post-training, there could be an accuracy loss, particularly for
+smaller networks. Pre-trained fully quantized models are provided for specific networks in
+the [TensorFlow Lite model repository](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/models.md#image-classification-quantized-models){:.external}. It is important to check the accuracy of the quantized model to verify that any degradation
+in accuracy is within acceptable limits. There is a tool to evaluate [TensorFlow Lite model accuracy](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/accuracy/README.md){:.external}.
+
+If the accuracy drop is too high, consider using [quantization aware training](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/README.md){:.external}.
+
+### Representation for quantized tensors
+
+TensorFlow approaches the conversion of floating-point arrays of numbers into
+8-bit representations as a compression problem. Since the weights and activation
+tensors in trained neural network models tend to have values that are distributed
+across comparatively small ranges (for example, -15 to +15 for weights or -500 to
+1000 for image model activations). And since neural nets tend to be robust
+handling noise, the error introduced by quantizing to a small set of values
+maintains the precision of the overall results within an acceptable threshold. A
+chosen representation must perform fast calculations, especially the large matrix
+multiplications that comprise the bulk of the computations while running a model.
+
+This is represented with two floats that store the overall minimum and maximum
+values corresponding to the lowest and highest quantized value. Each entry in the
+quantized array represents a float value in that range, distributed linearly
+between the minimum and maximum. For example, with a minimum of -10.0 and maximum
+of 30.0f, and an 8-bit array, the quantized values represent the following:
+
+<figure>
+  <table>
+    <tr><th>Quantized</th><th>Float</th></tr>
+    <tr><td>0</td><td>-10.0</td></tr>
+    <tr><td>128</td><td>10.0</td></tr>
+    <tr><td>255</td><td>30.0</td></tr>
+  </table>
+  <figcaption>
+    <b>Table 2</b>: Example quantized value range
+  </figcaption>
+</figure>
+
+The advantages of this representation format are:
+
+* It efficiently represents an arbitrary magnitude of ranges.
+* The values don't have to be symmetrical.
+* The format represents both signed and unsigned values.
+* The linear spread makes multiplications straightforward.
diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/lite/g3doc/rpi.md
similarity index 78%
rename from tensorflow/contrib/lite/g3doc/rpi.md
rename to tensorflow/lite/g3doc/rpi.md
index 41a1892b6f179f98560ce26afcf7263f1048f8d8..708d9e328cbdfffb491d487e4592d789b4fd06af 100644
--- a/tensorflow/contrib/lite/g3doc/rpi.md
+++ b/tensorflow/lite/g3doc/rpi.md
@@ -23,18 +23,18 @@ Clone this Tensorflow repository, Run this script at the root of the repository
 > The Tensorflow repository is in `/tensorflow` if you are using `tensorflow/tensorflow:nightly-devel` docker image, just try it.
 
 ```bash
-./tensorflow/contrib/lite/tools/make/download_dependencies.sh
+./tensorflow/lite/tools/make/download_dependencies.sh
 ```
 Note that you only need to do this once.
 
 You should then be able to compile:
 
 ```bash
-./tensorflow/contrib/lite/tools/make/build_rpi_lib.sh
+./tensorflow/lite/tools/make/build_rpi_lib.sh
 ```
 
 This should compile a static library in:
-`tensorflow/contrib/lite/gen/lib/rpi_armv7/libtensorflow-lite.a`.
+`tensorflow/lite/gen/lib/rpi_armv7/libtensorflow-lite.a`.
 
 ## Native compiling
 This has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1).
@@ -48,14 +48,14 @@ sudo apt-get install build-essential
 First, clone the TensorFlow repository. Run this at the root of the repository:
 
 ```bash
-./tensorflow/contrib/lite/tools/make/download_dependencies.sh
+./tensorflow/lite/tools/make/download_dependencies.sh
 ```
 Note that you only need to do this once.
 
 You should then be able to compile:
 ```bash
-./tensorflow/contrib/lite/tools/make/build_rpi_lib.sh
+./tensorflow/lite/tools/make/build_rpi_lib.sh
 ```
 
 This should compile a static library in:
-`tensorflow/contrib/lite/tools/make/gen/lib/rpi_armv7/libtensorflow-lite.a`.
+`tensorflow/lite/tools/make/gen/lib/rpi_armv7/libtensorflow-lite.a`.
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/lite/g3doc/tf_ops_compatibility.md
similarity index 90%
rename from tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
rename to tensorflow/lite/g3doc/tf_ops_compatibility.md
index b0dfb0fed1f7a072487a06c11bddf5545911ffdf..dcfda72137cafbc676dec2fb5dbf5da8ab8cb45a 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/lite/g3doc/tf_ops_compatibility.md
@@ -1,4 +1,3 @@
-
 # TensorFlow Lite & TensorFlow Compatibility Guide
 
 TensorFlow Lite supports a number of TensorFlow operations used in common
@@ -75,6 +74,7 @@ counterparts:
     0D tensor*
 *   [tf.squeeze](https://www.tensorflow.org/api_docs/python/tf/squeeze) - *as
     long as axis is not provided*
+*   [tf.squared_difference](https://www.tensorflow.org/versions/master/api_docs/python/tf/squared_difference)
 *   [tf.strided_slice](https://www.tensorflow.org/api_docs/python/tf/strided_slice) -
     *as long as ellipsis_mask and new_axis_mask are not used*
 *   [tf.transpose](https://www.tensorflow.org/versions/master/api_docs/python/tf/transpose) -
@@ -139,6 +139,17 @@ following common ops are not supported at the moment:
 The following TensorFlow Lite operations are fully supported and used in place
 of the TensorFlow operations listed above:
 
+**ABS**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: elementwise abs of the input
+}
+```
+
 **ADD**
 
 ```
@@ -154,6 +165,30 @@ Options {
 }
 ```
 
+**ARG_MAX**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: A tensor of indices of maximum values.
+}
+```
+
+**ARG_MIN**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: A tensor of indices of minium values.
+}
+```
+
 **AVERAGE_POOL_2D**
 
 ```
@@ -280,6 +315,18 @@ Outputs {
 }
 ```
 
+**FILL**
+
+```
+Inputs {
+  0: a 1D tensor
+  1: a 0D (scalar) tensor
+}
+Outputs {
+  0: A tensor of shape `tensor 0` filled with the value in `tensor 1`.
+}
+```
+
 **FLOOR**
 
 ```
@@ -291,6 +338,30 @@ outputs: {
 }
 ```
 
+**FLOOR_DIV**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: result of computing element-wise floor of `tensor 0` divided by `tensor 1`.
+}
+```
+
+**FLOOR_MOD**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: result of computing element-wise floor of `tensor 0` modulo `tensor 1`.
+}
+```
+
 **FULLY_CONNECTED**
 
 ```
@@ -378,6 +449,34 @@ Options {
 }
 ```
 
+**LEAKY_RELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to max(input, input * alpha)
+}
+Options {
+  alpha: slope of the activation at x < 0 (provided alpha <= 1)
+}
+```
+
+**LEAKY_RELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to max(input, input * alpha)
+}
+Options {
+  alpha
+}
+```
+
 **LESS**
 
 ```
@@ -421,6 +520,18 @@ Options {
 }
 ```
 
+**LOGICAL_OR**
+
+```
+Inputs {
+  0: a list of tensors.
+  1: a list of tensors.
+}
+Outputs {
+  0: A tensor of logical_or output tensors.
+}
+```
+
 **LOGISTIC**
 
 ```
@@ -498,6 +609,18 @@ Outputs {
 }
 ```
 
+**PACK**
+
+```
+Inputs {
+  0: a list of tensors.
+  1: an integer.
+}
+Outputs {
+  0: A tensor of stacked tensors.
+}
+```
+
 **PAD**
 
 ```
@@ -539,6 +662,35 @@ Outputs {
 }
 ```
 
+**POW**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: elementwise pow of the input tensors
+}
+```
+
+**RANGE**
+
+```
+Inputs {
+  0: a 0D (scalar) tensor
+  1: a 0D (scalar) tensor
+  2: a 0D (scalar) tensor
+}
+Outputs {
+  0: A 1D tensor of type `dtype` defined by a sequence where `tensor 0` is the
+  start, `tensor 1` is the limit, and `tensor 2` is the delta.
+}
+Options {
+  dtype
+}
+```
+
 **RELU**
 
 ```
@@ -587,6 +739,22 @@ Options {
 }
 ```
 
+**RESIZE_NEAREST_NEIGHBOR**
+
+```
+Inputs {
+  0: a 4D tensor
+  1: a 1D tensor with 2 elements
+}
+Outputs {
+  0: A tensor of type `tensor 0` resized according to `tensor 1` heigh/width values
+  using nearest neighbors interpolation.
+}
+Options {
+  align_corners
+}
+```
+
 **RSQRT**
 
 ```
@@ -698,6 +866,22 @@ Options {
 }
 ```
 
+**SPLIT_V**
+
+```
+Inputs {
+  0: tensor (input)
+  1: 1-D tensor (size_splits)
+  2: 0-D tensor (axis)
+}
+Outputs {
+  0-N: subtensors built from the input tensors
+}
+Options {
+  num_splits: Specifies number of outputs
+}
+```
+
 **SQRT**
 
 ```
@@ -781,66 +965,6 @@ Outputs {
 }
 ```
 
-**POW**
-
-```
-Inputs {
-  0: a tensor
-  1: a tensor
-}
-Outputs {
-  0: elementwise pow of the input tensors
-}
-```
-
-**ARG_MAX**
-
-```
-Inputs {
-  0: a tensor
-  1: a tensor
-}
-Outputs {
-  0: A tensor of indices of maximum values.
-}
-```
-
-**ARG_MIN**
-
-```
-Inputs {
-  0: a tensor
-  1: a tensor
-}
-Outputs {
-  0: A tensor of indices of minium values.
-}
-```
-
-**PACK**
-
-```
-Inputs {
-  0: a list of tensors.
-  1: an integer.
-}
-Outputs {
-  0: A tensor of stacked tensors.
-}
-```
-
-**LOGICAL_OR**
-
-```
-Inputs {
-  0: a list of tensors.
-  1: a list of tensors.
-}
-Outputs {
-  0: A tensor of logical_or output tensors.
-}
-```
-
 **UNPACK**
 
 ```
@@ -854,26 +978,26 @@ Outputs {
 }
 ```
 
-**FLOOR_DIV**
+**ZEROS_LIKE**
 
 ```
 Inputs {
-  0: a list of tensors.
-  1: a list of tensors.
+  0: a tensor
 }
 Outputs {
-  0: A tensor of floor_div output tensors.
+  0: A tensor of the same shape and type as x but filled with zeros
 }
 ```
 
-**ZEROS_LIKE**
+**FILL**
 
 ```
 Inputs {
-  0: a tensor
+  0: A Tensor. Must be one of the following types: int32, int64. 1-D. Represents the shape of the output tensor.
+  1: A Tensor. 0-D (scalar). Value to fill the returned tensor.
 }
 Outputs {
-  0: A tensor of the same shape and type as x but filled with zeros
+  0: A tensor of the same type as value (input1).
 }
 ```
 
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md b/tensorflow/lite/g3doc/tfmobile/android_build.md
similarity index 92%
rename from tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
rename to tensorflow/lite/g3doc/tfmobile/android_build.md
index b0f32a8d6ca91229489c73c2c6f52d9c82d37b37..2eb776d10cf8ec68987d13b580eddf2f1bda8e78 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
+++ b/tensorflow/lite/g3doc/tfmobile/android_build.md
@@ -1,6 +1,22 @@
-
 # Building TensorFlow on Android
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 To get you started working with TensorFlow on Android, we'll walk through two
 ways to build our TensorFlow mobile demos and deploying them on an Android
 device. The first is Android Studio, which lets you build and deploy in an
diff --git a/tensorflow/lite/g3doc/tfmobile/index.md b/tensorflow/lite/g3doc/tfmobile/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..15f0fd396134e40e89266182cb308080d9d250cb
--- /dev/null
+++ b/tensorflow/lite/g3doc/tfmobile/index.md
@@ -0,0 +1,298 @@
+# Overview
+
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
+TensorFlow was designed to be a good deep learning solution for mobile
+platforms. Currently we have two solutions for deploying machine learning
+applications on mobile and embedded devices: TensorFlow for Mobile and
+<a href="../../lite">TensorFlow Lite</a>.
+
+## TensorFlow Lite versus TensorFlow Mobile
+
+Here are a few of the differences between the two:
+
+- TensorFlow Lite is an evolution of TensorFlow Mobile.  In most cases, apps
+  developed with TensorFlow Lite will have a smaller binary size, fewer
+  dependencies, and better performance.
+
+- TensorFlow Lite is in developer preview, so not all use cases are covered yet.
+  We expect you to use TensorFlow Mobile to cover production cases.
+
+- TensorFlow Lite supports only a limited set of operators, so not all models
+  will work on it by default. TensorFlow for Mobile has a fuller set of
+  supported functionality.
+
+TensorFlow Lite provides better performance and a small binary size on mobile
+platforms as well as the ability to leverage hardware acceleration if available
+on their platforms. In addition, it has many fewer dependencies so it can be
+built and hosted on simpler, more constrained device scenarios. TensorFlow Lite
+also allows targeting accelerators through the [Neural Networks
+API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
+
+TensorFlow Lite currently has coverage for a limited set of operators. While
+TensorFlow for Mobile supports only a constrained set of ops by default, in
+principle if you use an arbitrary operator in TensorFlow, it can be customized
+to build that kernel. Thus use cases which are not currently supported by
+TensorFlow Lite should continue to use TensorFlow for Mobile. As TensorFlow Lite
+evolves, it will gain additional operators, and the decision will be easier to
+make.
+
+
+## Introduction to TensorFlow Mobile
+
+TensorFlow was designed from the ground up to be a good deep learning solution
+for mobile platforms like Android and iOS. This mobile guide should help you
+understand how machine learning can work on mobile platforms and how to
+integrate TensorFlow into your mobile apps effectively and efficiently.
+
+## About this Guide
+
+This guide is aimed at developers who have a TensorFlow model that’s
+successfully working in a desktop environment, who want to integrate it into
+a mobile application, and cannot use TensorFlow Lite. Here are the
+main challenges you’ll face during that process:
+
+- Understanding how to use Tensorflow for mobile.
+- Building TensorFlow for your platform.
+- Integrating the TensorFlow library into your application.
+- Preparing your model file for mobile deployment.
+- Optimizing for latency, RAM usage, model file size, and binary size.
+
+## Common use cases for mobile machine learning
+
+**Why run TensorFlow on mobile?**
+
+Traditionally, deep learning has been associated with data centers and giant
+clusters of high-powered GPU machines. However, it can be very expensive and
+time-consuming to send all of the data a device has access to across a network
+connection. Running on mobile makes it possible to deliver very interactive
+applications in a way that’s not possible when you have to wait for a network
+round trip.
+
+Here are some common use cases for on-device deep learning:
+
+### Speech Recognition
+
+There are a lot of interesting applications that can be built with a
+speech-driven interface, and many of these require on-device processing. Most of
+the time a user isn’t giving commands, and so streaming audio continuously to a
+remote server would be a waste of bandwidth, since it would mostly be silence or
+background noises. To solve this problem it’s common to have a small neural
+network running on-device
+[listening out for a particular keyword](../tutorials/sequences/audio_recognition).
+Once that keyword has been spotted, the rest of the
+conversation can be transmitted over to the server for further processing if
+more computing power is needed.
+
+### Image Recognition
+
+It can be very useful for a mobile app to be able to make sense of a camera
+image. If your users are taking photos, recognizing what’s in them can help your
+camera apps apply appropriate filters, or label the photos so they’re easily
+findable. It’s important for embedded applications too, since you can use image
+sensors to detect all sorts of interesting conditions, whether it’s spotting
+endangered animals in the wild
+or
+[reporting how late your train is running](https://svds.com/tensorflow-image-recognition-raspberry-pi/).
+
+TensorFlow comes with several examples of recognizing the types of objects
+inside images along with a variety of different pre-trained models, and they can
+all be run on mobile devices. You can try out
+our
+[Tensorflow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/index.html#0) and
+[Tensorflow for Poets 2: Optimize for Mobile](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/index.html#0) codelabs to
+see how to take a pretrained model and run some very fast and lightweight
+training to teach it to recognize specific objects, and then optimize it to
+run on mobile.
+
+### Object Localization
+
+Sometimes it’s important to know where objects are in an image as well as what
+they are. There are lots of augmented reality use cases that could benefit a
+mobile app, such as guiding users to the right component when offering them
+help fixing their wireless network or providing informative overlays on top of
+landscape features. Embedded applications often need to count objects that are
+passing by them, whether it’s pests in a field of crops, or people, cars and
+bikes going past a street lamp.
+
+TensorFlow offers a pretrained model for drawing bounding boxes around people
+detected in images, together with tracking code to follow them over time. The
+tracking is especially important for applications where you’re trying to count
+how many objects are present over time, since it gives you a good idea when a
+new object enters or leaves the scene. We have some sample code for this
+available for Android [on
+GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android),
+and also a [more general object detection
+model](https://github.com/tensorflow/models/tree/master/research/object_detection/README.md)
+available as well.
+
+### Gesture Recognition
+
+It can be useful to be able to control applications with hand or other
+gestures, either recognized from images or through analyzing accelerometer
+sensor data. Creating those models is beyond the scope of this guide, but
+TensorFlow is an effective way of deploying them.
+
+### Optical Character Recognition
+
+Google Translate’s live camera view is a great example of how effective
+interactive on-device detection of text can be.
+
+<div class="video-wrapper">
+  <iframe class="devsite-embedded-youtube-video" data-video-id="06olHmcJjS0"
+            data-autohide="1" data-showinfo="0" frameborder="0" allowfullscreen>
+  </iframe>
+</div>
+
+There are multiple steps involved in recognizing text in images. You first have
+to identify the areas where the text is present, which is a variation on the
+object localization problem, and can be solved with similar techniques. Once you
+have an area of text, you then need to interpret it as letters, and then use a
+language model to help guess what words they represent. The simplest way to
+estimate what letters are present is to segment the line of text into individual
+letters, and then apply a simple neural network to the bounding box of each. You
+can get good results with the kind of models used for MNIST, which you can find
+in TensorFlow’s tutorials, though you may want a higher-resolution input.  A
+more advanced alternative is to use an LSTM model to process a whole line of
+text at once, with the model itself handling the segmentation into different
+characters.
+
+### Translation
+
+Translating from one language to another quickly and accurately, even if you
+don’t have a network connection, is an important use case. Deep networks are
+very effective at this sort of task, and you can find descriptions of a lot of
+different models in the literature. Often these are sequence-to-sequence
+recurrent models where you’re able to run a single graph to do the whole
+translation, without needing to run separate parsing stages.
+
+### Text Classification
+
+If you want to suggest relevant prompts to users based on what they’re typing or
+reading, it can be very useful to understand the meaning of the text. This is
+where text classification comes in. Text classification is an umbrella term
+that covers everything from sentiment analysis to topic discovery. You’re likely
+to have your own categories or labels that you want to apply, so the best place
+to start is with an example
+like
+[Skip-Thoughts](https://github.com/tensorflow/models/tree/master/research/skip_thoughts/),
+and then train on your own examples.
+
+### Voice Synthesis
+
+A synthesized voice can be a great way of giving users feedback or aiding
+accessibility, and recent advances such as
+[WaveNet](https://deepmind.com/blog/wavenet-generative-model-raw-audio/) show
+that deep learning can offer very natural-sounding speech.
+
+## Mobile machine learning and the cloud
+
+These examples of use cases give an idea of how on-device networks can
+complement cloud services. Cloud has a great deal of computing power in a
+controlled environment, but running on devices can offer higher interactivity.
+In situations where the cloud is unavailable, or your cloud capacity is limited,
+you can provide an offline experience, or reduce cloud workload by processing
+easy cases on device.
+
+Doing on-device computation can also signal when it's time to switch to working
+on the cloud. A good example of this is hotword detection in speech. Since
+devices are able to constantly listen out for the keywords, this then triggers a
+lot of traffic to cloud-based speech recognition once one is recognized. Without
+the on-device component, the whole application wouldn’t be feasible, and this
+pattern exists across several other applications as well. Recognizing that some
+sensor input is interesting enough for further processing makes a lot of
+interesting products possible.
+
+## What hardware and software should you have?
+
+TensorFlow runs on Ubuntu Linux, Windows 10, and OS X. For a list of all
+supported operating systems and instructions to install TensorFlow, see
+<a href="https://www.tensorflow.org/install">Installing Tensorflow</a>.
+
+Note that some of the sample code we provide for mobile TensorFlow requires you
+to compile TensorFlow from source, so you’ll need more than just `pip install`
+to work through all the sample code.
+
+To try out the mobile examples, you’ll need a device set up for development,
+using
+either [Android Studio](https://developer.android.com/studio/install.html),
+or [XCode](https://developer.apple.com/xcode/) if you're developing for iOS.
+
+## What should you do before you get started?
+
+Before thinking about how to get your solution on mobile:
+
+1. Determine whether your problem is solvable by mobile machine learning
+2. Create a labelled dataset to define your problem
+3. Pick an effective model for the problem
+
+We'll discuss these in more detail below.
+
+### Is your problem solvable by mobile machine learning?
+
+Once you have an idea of the problem you want to solve, you need to make a plan
+of how to build your solution. The most important first step is making sure that
+your problem is actually solvable, and the best way to do that is to mock it up
+using humans in the loop.
+
+For example, if you want to drive a robot toy car using voice commands, try
+recording some audio from the device and listen back to it to see if you can
+make sense of what’s being said. Often you’ll find there are problems in the
+capture process, such as the motor drowning out speech or not being able to hear
+at a distance, and you should tackle these problems before investing in the
+modeling process.
+
+Another example would be giving photos taken from your app to people see if they
+can classify what’s in them, in the way you’re looking for. If they can’t do
+that (for example, trying to estimate calories in food from photos may be
+impossible because all white soups look the same), then you’ll need to redesign
+your experience to cope with that. A good rule of thumb is that if a human can’t
+handle the task then it will be difficult to train a computer to do better.
+
+### Create a labelled dataset
+
+After you’ve solved any fundamental issues with your use case, you need to
+create a labeled dataset to define what problem you’re trying to solve. This
+step is extremely important, more than picking which model to use. You want it
+to be as representative as possible of your actual use case, since the model
+will only be effective at the task you teach it. It’s also worth investing in
+tools to make labeling the data as efficient and accurate as possible. For
+example, if you’re able to switch from having to click a button on a web
+interface to simple keyboard shortcuts, you may be able to speed up the
+generation process a lot. You should also start by doing the initial labeling
+yourself, so you can learn about the difficulties and likely errors, and
+possibly change your labeling or data capture process to avoid them. Once you
+and your team are able to consistently label examples (that is once you
+generally agree on the same labels for most examples), you can then try and
+capture your knowledge in a manual and teach external raters how to run the same
+process.
+
+### Pick an effective model
+
+The next step is to pick an effective model to use. You might be able to avoid
+training a model from scratch if someone else has already implemented a model
+similar to what you need; we have a repository of models implemented in
+TensorFlow [on GitHub](https://github.com/tensorflow/models) that you can look
+through. Lean towards the simplest model you can find, and try to get started as
+soon as you have even a small amount of labelled data, since you’ll get the best
+results when you’re able to iterate quickly. The shorter the time it takes to
+try training a model and running it in its real application, the better overall
+results you’ll see. It’s common for an algorithm to get great training accuracy
+numbers but then fail to be useful within a real application because there’s a
+mismatch between the dataset and real usage. Prototype end-to-end usage as soon
+as possible to create a consistent user experience.
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md b/tensorflow/lite/g3doc/tfmobile/ios_build.md
similarity index 87%
rename from tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
rename to tensorflow/lite/g3doc/tfmobile/ios_build.md
index be8b4100c89f4b02e651b1585faf438881c9119d..d922907cdc5fe5ccec8864b456586fce0293a0af 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
+++ b/tensorflow/lite/g3doc/tfmobile/ios_build.md
@@ -1,6 +1,22 @@
-
 # Building TensorFlow on iOS
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 ## Using CocoaPods
 
 The simplest way to get started with TensorFlow on iOS is using the CocoaPods
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md b/tensorflow/lite/g3doc/tfmobile/linking_libs.md
similarity index 94%
rename from tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
rename to tensorflow/lite/g3doc/tfmobile/linking_libs.md
index 4d4bb3bc081d613714271f8b0bf7461cb1e0f4d5..fd0e322c93493ed835ae7ec9766a708885c6ac88 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
+++ b/tensorflow/lite/g3doc/tfmobile/linking_libs.md
@@ -1,6 +1,22 @@
-
 # Integrating TensorFlow libraries
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 Once you have made some progress on a model that addresses the problem you’re
 trying to solve, it’s important to test it out inside your application
 immediately. There are often unexpected differences between your training data
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md b/tensorflow/lite/g3doc/tfmobile/optimizing.md
similarity index 97%
rename from tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
rename to tensorflow/lite/g3doc/tfmobile/optimizing.md
index 7436594fd8580151ba66562eccd408cc7e6c4201..59ff8e774c6c63a01668aee7d6caeea01171468d 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
+++ b/tensorflow/lite/g3doc/tfmobile/optimizing.md
@@ -1,6 +1,22 @@
-
 # Optimizing for mobile
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 There are some special issues that you have to deal with when you’re trying to
 ship on mobile or embedded devices, and you’ll need to think about these as
 you’re developing your model.
diff --git a/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md b/tensorflow/lite/g3doc/tfmobile/prepare_models.md
similarity index 96%
rename from tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
rename to tensorflow/lite/g3doc/tfmobile/prepare_models.md
index d1c67d4c61608bcbc9b0bcee5b60f46a73b44692..1d373251ddf3ba6a0119bd57bf14caf100ef371a 100644
--- a/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
+++ b/tensorflow/lite/g3doc/tfmobile/prepare_models.md
@@ -1,6 +1,22 @@
-
 # Preparing models for mobile deployment
 
+Warning: We expect to deprecate TensorFlow Mobile in early 2019
+
+<div class="caution">
+  <p>
+    <a href="../">TensorFlow Lite</a> is our main mobile and embedded offering. We are
+    working hard to close the feature gap between TensorFlow Mobile and
+    TensorFlow Lite. We expect to deprecate TensorFlow Mobile in early 2019. We
+    will give ample notice to our users when we get to that point and will
+    provide help and support to ensure easy migrations.
+  </p>
+  <p>
+    In the meantime, please use TensorFlow Lite. If you have a feature request,
+    such as a missing op, please post to our <a
+    href="https://github.com/tensorflow/tensorflow/issues">GitHub</a>.
+  </p>
+</div>
+
 The requirements for storing model information during training are very
 different from when you want to release it as part of a mobile app. This section
 covers the tools involved in converting from a training model to something
diff --git a/tensorflow/lite/g3doc/using_select_tf_ops.md b/tensorflow/lite/g3doc/using_select_tf_ops.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa51f58baa4ecf01fbe75d2ce9095bb1a5286ae8
--- /dev/null
+++ b/tensorflow/lite/g3doc/using_select_tf_ops.md
@@ -0,0 +1,249 @@
+# [Experimental] Using TensorFlow Lite with select TensorFlow ops
+
+The TensorFlow Lite builtin op library has grown rapidly, and will continue to
+grow, but there remains a long tail of TensorFlow ops that are not yet natively
+supported by TensorFlow Lite . These unsupported ops can be a point of friction
+in the TensorFlow Lite model conversion process. To that end, the team has
+recently been working on an experimental mechanism for reducing this friction.
+
+This document outlines how to use TensorFlow Lite with select TensorFlow ops.
+*Note that this feature is experimental and is under active development.* As you
+use this feature, keep in mind the [known limitations](#known-limitations), and
+please send feedback about models that work and issues you are facing to
+tflite@tensorflow.org.
+
+TensorFlow Lite will continue to have
+[TensorFlow Lite builtin ops](tf_ops_compatibility.md) optimized for mobile and
+embedded devices. However, TensorFlow Lite models can now use a subset of
+TensorFlow ops when TFLite builtin ops are not sufficient.
+
+Models converted with TensorFlow ops will require a TensorFlow Lite interpreter
+that has a larger binary size than the interpreter with only TFLite builtin ops.
+Additionally, performance optimizations will not be available for any TensorFlow
+ops in the TensorFlow Lite model.
+
+This document outlines how to [convert](#converting-the-model) and
+[run](#running-the-model) a TFLite model with TensorFlow ops on your platform of
+choice. It also discusses some [known limitations](#known-limitations), the
+[future plans](#future-plans) for this feature, and basic
+[performance and size metrics](#metrics).
+
+## Converting the model
+
+To convert a TensorFlow model to a TensorFlow Lite model with TensorFlow ops,
+use the `target_ops` argument in the
+[TensorFlow Lite converter](https://www.tensorflow.org/lite/convert/). The
+following values are valid options for `target_ops`:
+
+*   `TFLITE_BUILTINS` - Converts models using TensorFlow Lite builtin ops.
+*   `SELECT_TF_OPS` - Converts models using TensorFlow ops. The exact subset of
+    supported ops can be found in the whitelist at
+    `lite/toco/tflite/whitelisted_flex_ops.cc`.
+
+The recommended approach is to convert the model with `TFLITE_BUILTINS`, then
+with both `TFLITE_BUILTINS,SELECT_TF_OPS`, and finally with only
+`SELECT_TF_OPS`. Using both options (i.e. `TFLITE_BUILTINS,SELECT_TF_OPS`)
+creates models with TensorFlow Lite ops where possible. Using only
+`SELECT_TF_OPS` is useful when the model contains TensorFlow ops that are only
+partially supported by TensorFlow Lite, and one would like to avoid those
+limitations.
+
+The following example shows how to use `target_ops` in the
+[`TFLiteConverter`](https://www.tensorflow.org/lite/convert/python_api) Python
+API.
+
+```
+import tensorflow as tf
+
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.target_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
+                        tf.lite.OpsSet.SELECT_TF_OPS]
+tflite_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_model)
+```
+
+The following example shows how to use `target_ops` in the
+[`tflite_convert`](https://www.tensorflow.org/lite/convert/cmdline_examples)
+command line tool.
+
+```
+tflite_convert \
+  --output_file=/tmp/foo.tflite \
+  --graph_def_file=/tmp/foo.pb \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --target_ops=TFLITE_BUILTINS,SELECT_TF_OPS
+```
+
+When building and running `tflite_convert` directly with `bazel`, please pass
+`--define=with_select_tf_ops=true` as an additional argument.
+
+```
+bazel run --define=with_select_tf_ops=true tflite_convert -- \
+  --output_file=/tmp/foo.tflite \
+  --graph_def_file=/tmp/foo.pb \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --target_ops=TFLITE_BUILTINS,SELECT_TF_OPS
+```
+
+## Running the model
+
+When using a TensorFlow Lite model that has been converted with support for
+select TensorFlow ops, the client must also use a TensorFlow Lite runtime that
+includes the necessary library of TensorFlow ops.
+
+### Android AAR
+
+A new Android AAR target with select TensorFlow ops has been added for
+convenience. Assuming a <a href="./demo_android.md">working TensorFlow Lite
+build environment</a>, build the Android AAR with select TensorFlow ops as
+follows:
+
+```sh
+bazel build --cxxopt='--std=c++11' -c opt             \
+  --config=android_arm --config=monolithic          \
+  //tensorflow/lite/java:tensorflow-lite-with-select-tf-ops
+```
+
+This will generate an AAR file in `bazel-genfiles/tensorflow/lite/java/`. From
+there, you can either import the AAR directly into your project, or publish the
+custom AAR to your local Maven repository:
+
+```sh
+mvn install:install-file \
+  -Dfile=bazel-genfiles/tensorflow/lite/java/tensorflow-lite-with-select-tf-ops.aar \
+  -DgroupId=org.tensorflow \
+  -DartifactId=tensorflow-lite-with-select-tf-ops -Dversion=0.1.100 -Dpackaging=aar
+```
+
+Finally, in your app's `build.gradle`, ensure you have the `mavenLocal()`
+dependency and replace the standard TensorFlow Lite dependency with the one that
+has support for select TensorFlow ops:
+
+```
+allprojects {
+    repositories {
+        jcenter()
+        mavenLocal()
+    }
+}
+
+dependencies {
+    compile 'org.tensorflow:tensorflow-lite-with-select-tf-ops:0.1.100'
+}
+```
+
+### iOS
+
+With XCode Command Line Tools installed, TensorFlow Lite with select TensorFlow
+ops support can be built with the following command:
+
+```sh
+tensorflow/contrib/makefile/build_all_ios_with_tflite.sh
+```
+
+This will generate the required static linking libraries in the
+`tensorflow/contrib/makefile/gen/lib/` directory.
+
+The TensorFlow Lite camera example app can be used to test this. A new
+TensorFlow Lite XCode project with support for select TensorFlow ops has been
+added to
+`tensorflow/lite/examples/ios/camera/tflite_camera_example_with_select_tf_ops.xcodeproj`.
+
+To use this feature in a your own project, either clone the example project or
+set the project settings for a new or existing project to the following:
+
+*   In Build Phases -> Link Binary With Libraries, add the static libraries
+    under `tensorflow/contrib/makefile/gen/lib/` directory:
+    *   `libtensorflow-lite.a`
+    *   `libprotobuf.a`
+    *   `nsync.a`
+*   In Build Settings -> Header Search Paths, add the following directories:
+    *   `tensorflow/lite/`
+    *   `tensorflow/contrib/makefile/downloads/flatbuffer/include`
+    *   `tensorflow/contrib/makefile/downloads/eigen`
+*   In Build Settings -> Other Linker Flags, add `-force_load
+    tensorflow/contrib/makefile/gen/lib/libtensorflow-lite.a`.
+
+A CocoaPod with support for select TensorFlow ops will also be released in the
+future.
+
+### C++
+
+When building TensorFlow Lite libraries using the bazel pipeline, the additional
+TensorFlow ops library can be included and enabled as follows:
+
+*   Enable monolithic builds if necessary by adding the `--config=monolithic`
+    build flag.
+*   Do one of the following:
+    *   Include the `--define=with_select_tf_ops=true` build flag in the `bazel
+        build` invocation when building TensorFlow Lite.
+    *   Add the TensorFlow ops delegate library dependency to the build
+        dependencies: `tensorflow/lite/delegates/flex:delegate`.
+
+Note that the necessary `TfLiteDelegate` will be installed automatically when
+creating the interpreter at runtime as long as the delegate is linked into the
+client library. It is not necessary to explicitly install the delegate instance
+as is typically required with other delegate types.
+
+### Python pip Package
+
+Python support is actively under development.
+
+## Metrics
+
+### Performance
+
+When using a mixture of both builtin and select TensorFlow ops, all of the same
+TensorFlow Lite optimizations and optimized builtin kernels will be be available
+and usable with the converted model. For the TensorFlow ops, performance should
+generally be comparable to that of
+[TensorFlow Mobile](https://www.tensorflow.org/lite/tfmobile/).
+
+The following table describes the average time taken to run inference on
+MobileNet on a Pixel 2. The listed times are an average of 100 runs. These
+targets were built for Android using the flags: `--config=android_arm64 -c opt`.
+
+Build                                | Time (milliseconds)
+------------------------------------ | -------------------
+Only built-in ops (`TFLITE_BUILTIN`) | 260.7
+Using only TF ops (`SELECT_TF_OPS`)  | 264.5
+
+### Binary Size
+
+The following table describes the binary size of TensorFlow Lite for each build.
+These targets were built for Android using `--config=android_arm -c opt`.
+
+Build                 | C++ Binary Size | Android APK Size
+--------------------- | --------------- | ----------------
+Only built-in ops     | 796 KB          | 561 KB
+Built-in ops + TF ops | 23.0 MB         | 8.0 MB
+
+## Known Limitations
+
+The following is a list of some of the known limitations:
+
+*   Control flow ops are not yet supported.
+*   The
+    [`post_training_quantization`](https://www.tensorflow.org/performance/post_training_quantization)
+    flag is currently not supported for TensorFlow ops so it will not quantize
+    weights for any TensorFlow ops. In models with both TensorFlow Lite builtin
+    ops and TensorFlow ops, the weights for the builtin ops will be quantized.
+*   Ops that require explicit initialization from resources, like HashTableV2,
+    are not yet supported.
+*   Certain TensorFlow ops may not support the full set of input/output types
+    that are typically available on stock TensorFlow.
+
+## Future Plans
+
+The following is a list of improvements to this pipeline that are in progress:
+
+*   *Selective registration* - There is work being done to make it simple to
+    generate TFLite interpreter binaries that only contain the TensorFlow ops
+    required for a particular set of models.
+*   *Improved usability* - The conversion process will be simplified to only
+    require a single pass through the converter. Additionally, pre-built Android
+    AAR and iOS CocoaPod binaries will be provided.
+*   *Improved performance* - There is work being done to ensure TensorFlow Lite
+    with TensorFlow ops has performance parity to TensorFlow Mobile.
diff --git a/tensorflow/lite/graph_info.cc b/tensorflow/lite/graph_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1cec0d0c290679c7755cbf84858317489c0ba159
--- /dev/null
+++ b/tensorflow/lite/graph_info.cc
@@ -0,0 +1,224 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/graph_info.h"
+#include <algorithm>
+
+namespace tflite {
+
+namespace {
+
+// Provide a range iterable wrapper for TfLiteIntArray* (C lists that TfLite
+// C api uses. Can't use the google array_view, since we can't depend on even
+// absl for embedded device reasons.
+// TODO(aselle): Move this into central utilities.
+class TfLiteIntArrayView {
+ public:
+  // Construct a view of a TfLiteIntArray*. Note, `int_array` should be non-null
+  // and this view does not take ownership of it.
+  explicit TfLiteIntArrayView(const TfLiteIntArray* int_array)
+      : int_array_(int_array) {}
+
+  typedef const int* const_iterator;
+  const_iterator begin() const { return int_array_->data; }
+  const_iterator end() const { return &int_array_->data[int_array_->size]; }
+
+  TfLiteIntArrayView(const TfLiteIntArrayView&) = default;
+  TfLiteIntArrayView& operator=(const TfLiteIntArrayView& rhs) = default;
+
+ private:
+  const TfLiteIntArray* int_array_;
+};
+
+// Helper class that actually performs partitioning by node sub set.
+// Outputs to a provided `NodeSubset` structure.
+//
+// Example usage:
+// PartitionGraphIntoIndependentNodeSubsetsImpl partitioner(
+//     info, nodes_to_part, node_subsets);
+// partitioner.Partition();
+class PartitionGraphIntoIndependentNodeSubsetsImpl {
+ public:
+  PartitionGraphIntoIndependentNodeSubsetsImpl(
+      const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
+      std::vector<NodeSubset>* node_subsets)
+      : info_(info),
+        node_subsets_(node_subsets),
+        node_type_(info->num_nodes(), NodeSubset::kTfNonPartition) {
+    // Populate the node_type_ map.
+    for (auto node_index : TfLiteIntArrayView(nodes_to_partition)) {
+      node_type_[node_index] = NodeSubset::kTfPartition;
+    }
+  }
+
+  // Actually partition the graph.
+  void Partition() {
+    // Initialize here to make Partition() re-entrant.
+    node_subsets_->clear();
+    tensor_epochs_.clear();
+    tensor_epochs_.resize(info_->num_tensors(), kEpochAlwaysReady);
+    node_epochs_.clear();
+    node_epochs_.resize(info_->num_nodes(), kEpochNotReady);
+    // Set computed tensors to be kEpochNotReady (initializer set everything to
+    // AlwaysReady).
+    for (int node_index = 0; node_index < info_->num_nodes(); node_index++) {
+      const TfLiteNode& node = info_->node(node_index);
+      for (int output_tensor_index : TfLiteIntArrayView(node.outputs)) {
+        tensor_epochs_[output_tensor_index] = kEpochNotReady;
+      }
+    }
+
+    // Do a graph traversal where each iteration in the loop is an epoch
+    // that corresponds to a node sub set that only contains nodes that are of
+    // the same node_type_.
+    while (true) {
+      BuildNodeSubset();
+      if (node_subsets_->back().nodes.empty()) {
+        node_subsets_->pop_back();
+        break;
+      }
+    }
+
+    // Mark model outputs as node sub set outputs. All the rest have already
+    // been identified.
+    for (int output_index : info_->outputs()) {
+      int output_epoch = tensor_epochs_[output_index];
+      NodeSubset& output_subset = (*node_subsets_)[output_epoch];
+      output_subset.output_tensors.push_back(output_index);
+    }
+    // Make sure every node sub set's inputs and outputs are unique. Since the
+    // list of inputs and outputs is generated in a way that produces
+    // duplicates.
+    for (NodeSubset& node_subset : *node_subsets_) {
+      // Sort and uniquefy using standard library algorithms.
+      auto uniquefy = [](std::vector<int>* items) {
+        std::sort(items->begin(), items->end());
+        auto last = std::unique(items->begin(), items->end());
+        items->erase(last, items->end());
+      };
+      uniquefy(&node_subset.input_tensors);
+      uniquefy(&node_subset.output_tensors);
+    }
+  }
+
+ private:
+  // Special integer values needed for tensor_epochs_ and node_epochs_.
+  enum {
+    // The node or tensor is not ready to be assigned an epoch. e.g. a node's
+    // inputs have not all been assigned epochs.
+    kEpochNotReady = -1,
+    // Used for tensor_epochs_. This means that the tensor is always ready.
+    // e.g. an input to the whole model or a constant that has no dependencies.
+    kEpochAlwaysReady = -2
+  };
+
+  // Updates the  node `node_index` and returns true if it is assigned to an
+  // epoch. False is returned if the node is already set to an epoch, its inputs
+  // are not all assigned to epochs, or if it cannot be assigned to the current
+  // epoch since the epoch's node_type doesn't match.
+  bool UpdateNode(int node_index) {
+    const TfLiteNode& node = info_->node(node_index);
+    NodeSubset& current_subset = node_subsets_->back();
+    int current_epoch = node_subsets_->size() - 1;
+    // Check if node is already done.
+    if (node_epochs_[node_index] != kEpochNotReady) {
+      return false;
+    }
+    // See if all dependencies of this node are already assigned to a
+    // node sub set.
+    for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
+      if (tensor_epochs_[input_tensor_index] == kEpochNotReady) {
+        return false;
+      }
+    }
+    // When we are starting a new epoch, the first ready node defines
+    // the type of that epoch.
+    if (current_subset.type == NodeSubset::kTfUnexplored) {
+      current_subset.type = node_type_[node_index];
+    }
+    // The node gets assigned to this epoch if it is the same type as
+    // the epoch's assigned type. Note, if this is the current ready
+    // node encountered during this epoch, this condition will be
+    // automatically true.
+    if (current_subset.type == node_type_[node_index]) {
+      node_epochs_[node_index] = current_epoch;
+      current_subset.nodes.push_back(node_index);
+      // All outputs of this node now are assigned to this epoch as
+      // well.
+      for (int output_tensor_index : TfLiteIntArrayView(node.outputs)) {
+        tensor_epochs_[output_tensor_index] = current_epoch;
+      }
+      // Look at our inputs one more time to update that tensor's
+      // epochs' outputs
+      for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
+        int input_epoch = tensor_epochs_[input_tensor_index];
+        int node_epoch = current_epoch;
+        if (input_epoch != node_epoch) {
+          current_subset.input_tensors.push_back(input_tensor_index);
+          // Set inputs to be outputs of the node sub set where they reside.
+          // the if condition makes sure inputs to the whole computation
+          // are not included (i.e. those initialized to -2 above).
+          if (input_epoch >= 0) {
+            NodeSubset& input_subset = (*node_subsets_)[input_epoch];
+            input_subset.output_tensors.push_back(input_tensor_index);
+          }
+        }
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  // Completely populates the current node_subset by doing graph traversal
+  void BuildNodeSubset() {
+    node_subsets_->emplace_back(NodeSubset());
+    // loop until no more nodes can be updated.
+    while (true) {
+      bool did_something = false;
+      for (int node_index = 0; node_index < info_->num_nodes(); node_index++) {
+        if (UpdateNode(node_index)) {
+          did_something = true;
+        }
+      }
+      if (!did_something) return;
+    }
+  }
+
+  // Temporary data needed for partitioning.
+  const GraphInfo* info_;
+  // List of node_subsets to populate
+  std::vector<NodeSubset>* node_subsets_;
+  std::vector<NodeSubset::Type> node_type_;
+  // Maps from tensor index to the epoch in which it is assigned. Also special
+  // negative values of kEpochNotAssigned if not assigned, kEpochNotReady if it
+  // is an input or constant.
+  std::vector<int> tensor_epochs_;
+  // Maps from tensor index to the epoch in which it is assigned. Also special
+  // negative values of kEpochNotAssigned if not assigned.
+  std::vector<int> node_epochs_;
+};
+
+}  // namespace
+
+TfLiteStatus PartitionGraphIntoIndependentNodeSubsets(
+    const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
+    std::vector<NodeSubset>* node_subsets) {
+  PartitionGraphIntoIndependentNodeSubsetsImpl(info, nodes_to_partition,
+                                               node_subsets)
+      .Partition();
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/graph_info.h b/tensorflow/lite/graph_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..4da696c132e27ce4a57fccd7935c78dd015e6850
--- /dev/null
+++ b/tensorflow/lite/graph_info.h
@@ -0,0 +1,83 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_GRAPH_INFO_H_
+#define TENSORFLOW_LITE_GRAPH_INFO_H_
+
+#include <vector>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+
+namespace tflite {
+
+// Basic information about an inference graph, where execution nodes
+// are connected via tensors.
+class GraphInfo {
+ public:
+  virtual ~GraphInfo() {}
+
+  // Total number of tensors in the graph.
+  virtual size_t num_tensors() const = 0;
+
+  // Returns a tensor given its index which is expected to be between 0 and
+  // num_tensors().
+  virtual TfLiteTensor* tensor(size_t index) = 0;
+
+  // Total number of nodes in the graph.
+  virtual size_t num_nodes() const = 0;
+
+  // Returns a node given its index which is expected to be between 0 and
+  // num_nodes().
+  virtual const TfLiteNode& node(size_t index) const = 0;
+
+  // Returns the indices of the input tensors.
+  virtual const std::vector<int>& inputs() const = 0;
+
+  // Returns the indices of the output tensors.
+  virtual const std::vector<int>& outputs() const = 0;
+
+  // Returns the indices of the variable tensors.
+  virtual const std::vector<int>& variables() const = 0;
+};
+
+// Represents a subset of nodes in a TensorFlow Lite graph.
+struct NodeSubset {
+  enum Type {
+    kTfUnexplored = 0,  // temporarily used during creation
+    kTfPartition,
+    kTfNonPartition
+  };
+  Type type = kTfUnexplored;
+  // Nodes within the node sub set
+  std::vector<int> nodes;
+  // Tensors that stride output from another node sub set that this depends on,
+  // or global inputs to the TensorFlow Lite full graph.
+  std::vector<int> input_tensors;
+  // Outputs that are consumed by other node sub sets or are global output
+  // tensors. All output tensors of the nodes in the node sub set that do not
+  // appear in this list are intermediate results that can be potentially
+  // elided.
+  std::vector<int> output_tensors;
+};
+
+// Partitions a list of node indices `nodes_to_partition` into node sub sets.
+// Each node sub set is in dependency order (i.e. all members of the node sub
+// sets). `node_subsets` is assumed to be empty.
+TfLiteStatus PartitionGraphIntoIndependentNodeSubsets(
+    const GraphInfo* info, const TfLiteIntArray* nodes_to_partition,
+    std::vector<NodeSubset>* node_subsets);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_GRAPH_INFO_H_
diff --git a/tensorflow/contrib/lite/graph_info_test.cc b/tensorflow/lite/graph_info_test.cc
similarity index 85%
rename from tensorflow/contrib/lite/graph_info_test.cc
rename to tensorflow/lite/graph_info_test.cc
index 89a8f36b416b5dec54c1e374cdcdae3ab9ab0cde..4d8bbdc0eef49b3f79b3c74c1d07fd86467e1d65 100644
--- a/tensorflow/contrib/lite/graph_info_test.cc
+++ b/tensorflow/lite/graph_info_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
-#include "tensorflow/contrib/lite/graph_info.h"
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/graph_info.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace {
@@ -76,17 +76,18 @@ class SimpleTestGraph : public GraphInfo {
 // TfLiteIntArray. Populates `subgraphs` with resulting generated subgraphs.
 void PartitionGraph(const SimpleTestGraph& graph,
                     const std::vector<int>& nodes_to_partition,
-                    std::vector<Subgraph>* subgraphs) {
+                    std::vector<NodeSubset>* subgraphs) {
   TfLiteIntArray* nodes_to_partition_int_array =
       ConvertVector(nodes_to_partition);
-  PartitionGraphIntoIndependentSubgraphs(&graph, nodes_to_partition_int_array,
-                                         subgraphs);
+  PartitionGraphIntoIndependentNodeSubsets(&graph, nodes_to_partition_int_array,
+                                           subgraphs);
   TfLiteIntArrayFree(nodes_to_partition_int_array);
 }
 
 // Check a generated list of subgraphs against the expected list of subgraphs.
-void CheckPartitionSubgraphs(const std::vector<Subgraph>& generated_subgraphs,
-                             const std::vector<Subgraph>& expected_subgraphs) {
+void CheckPartitionSubgraphs(
+    const std::vector<NodeSubset>& generated_subgraphs,
+    const std::vector<NodeSubset>& expected_subgraphs) {
   ASSERT_EQ(generated_subgraphs.size(), expected_subgraphs.size());
   for (int subgraph_index = 0; subgraph_index < generated_subgraphs.size();
        subgraph_index++) {
@@ -103,7 +104,7 @@ void CheckPartitionSubgraphs(const std::vector<Subgraph>& generated_subgraphs,
 TEST(PartitionTest, Nodes0_PartitionNodes0) {
   SimpleTestGraph graph;
   std::vector<int> nodes_to_partition = {};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
   CheckPartitionSubgraphs(generated_subgraphs, {});
 }
@@ -117,11 +118,11 @@ TEST(PartitionTest, Nodes1PartitionNodes0) {
   graph.AddNode({0}, {1});
   graph.SetInputsAndOutputs({0}, {1});
   std::vector<int> nodes_to_partition = {};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph;
-  expected_subgraph.type = Subgraph::kTfNonPartition;
+  NodeSubset expected_subgraph;
+  expected_subgraph.type = NodeSubset::kTfNonPartition;
   expected_subgraph.nodes = {0};
   expected_subgraph.input_tensors = {0};
   expected_subgraph.output_tensors = {1};
@@ -136,12 +137,12 @@ TEST(PartitionTest, Nodes1PartitionNodes0Inputs0) {
   graph.AddTensors(1);
   graph.AddNode({}, {0});
   graph.SetInputsAndOutputs({}, {0});
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   std::vector<int> nodes_to_partition = {0};
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph;
-  expected_subgraph.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph;
+  expected_subgraph.type = NodeSubset::kTfPartition;
   expected_subgraph.nodes = {0};
   expected_subgraph.input_tensors = {};
   expected_subgraph.output_tensors = {0};
@@ -157,11 +158,11 @@ TEST(PartitionTest, Nodes1PartitionNodes1) {
   graph.AddNode({0}, {1});
   graph.SetInputsAndOutputs({0}, {1});
   std::vector<int> nodes_to_partition = {0};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph;
-  expected_subgraph.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph;
+  expected_subgraph.type = NodeSubset::kTfPartition;
   expected_subgraph.nodes = {0};
   expected_subgraph.input_tensors = {0};
   expected_subgraph.output_tensors = {1};
@@ -180,16 +181,16 @@ TEST(PartitionTest, Nodes2PartitionNodes1) {
   graph.AddNode({1}, {2});
   graph.SetInputsAndOutputs({0}, {2});
   std::vector<int> nodes_to_partition = {1};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph0;
-  expected_subgraph0.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph0;
+  expected_subgraph0.type = NodeSubset::kTfPartition;
   expected_subgraph0.nodes = {0};
   expected_subgraph0.input_tensors = {0};
   expected_subgraph0.output_tensors = {1};
-  Subgraph expected_subgraph1;
-  expected_subgraph1.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph1;
+  expected_subgraph1.type = NodeSubset::kTfPartition;
   expected_subgraph1.nodes = {1};
   expected_subgraph1.input_tensors = {1};
   expected_subgraph1.output_tensors = {2};
@@ -208,11 +209,11 @@ TEST(PartitionTest, Nodes2PartitionNodes2) {
   graph.AddNode({1}, {2});
   graph.SetInputsAndOutputs({0}, {2});
   std::vector<int> nodes_to_partition = {0, 1};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph0;
-  expected_subgraph0.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph0;
+  expected_subgraph0.type = NodeSubset::kTfPartition;
   expected_subgraph0.nodes = {0, 1};
   expected_subgraph0.input_tensors = {0};
   expected_subgraph0.output_tensors = {2};
@@ -239,21 +240,21 @@ TEST(PartitionTest, Nodes3PartitionNodes2) {
   graph.AddNode({1, 2}, {3});
   graph.SetInputsAndOutputs({0}, {3});
   std::vector<int> nodes_to_partition = {0, 2};
-  std::vector<Subgraph> generated_subgraphs;
+  std::vector<NodeSubset> generated_subgraphs;
   PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
 
-  Subgraph expected_subgraph0;
-  expected_subgraph0.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph0;
+  expected_subgraph0.type = NodeSubset::kTfPartition;
   expected_subgraph0.nodes = {0};
   expected_subgraph0.input_tensors = {0};
   expected_subgraph0.output_tensors = {1};
-  Subgraph expected_subgraph1;
-  expected_subgraph1.type = Subgraph::kTfNonPartition;
+  NodeSubset expected_subgraph1;
+  expected_subgraph1.type = NodeSubset::kTfNonPartition;
   expected_subgraph1.nodes = {1};
   expected_subgraph1.input_tensors = {1};
   expected_subgraph1.output_tensors = {2};
-  Subgraph expected_subgraph2;
-  expected_subgraph2.type = Subgraph::kTfPartition;
+  NodeSubset expected_subgraph2;
+  expected_subgraph2.type = NodeSubset::kTfPartition;
   expected_subgraph2.nodes = {2};
   expected_subgraph2.input_tensors = {1, 2};
   expected_subgraph2.output_tensors = {3};
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2129ed46d94061211e02445a437f7adca51363e
--- /dev/null
+++ b/tensorflow/lite/interpreter.cc
@@ -0,0 +1,215 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/interpreter.h"
+
+#include <cassert>
+#include <cstdarg>
+#include <cstdint>
+#include <cstring>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/graph_info.h"
+#include "tensorflow/lite/memory_planner.h"
+#include "tensorflow/lite/nnapi_delegate.h"
+#include "tensorflow/lite/profiling/profiler.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/util.h"
+
+namespace tflite {
+
+Interpreter::Interpreter(ErrorReporter* error_reporter)
+    : error_reporter_(error_reporter ? error_reporter
+                                     : DefaultErrorReporter()) {
+  // There's always at least 1 subgraph which is the primary subgraph.
+  AddSubgraphs(1);
+  context_ = primary_subgraph().context();
+
+  // Reserve some space for the tensors to avoid excessive resizing.
+  for (int i = 0; i < kTfLiteMaxExternalContexts; ++i) {
+    external_contexts_[i] = nullptr;
+  }
+
+  UseNNAPI(false);
+}
+
+Interpreter::~Interpreter() {}
+
+void Interpreter::SetExternalContext(TfLiteExternalContextType type,
+                                     TfLiteExternalContext* ctx) {
+  primary_subgraph().SetExternalContext(type, ctx);
+}
+
+TfLiteStatus Interpreter::SetInputs(std::vector<int> inputs) {
+  return primary_subgraph().SetInputs(inputs);
+}
+
+TfLiteStatus Interpreter::SetOutputs(std::vector<int> outputs) {
+  return primary_subgraph().SetOutputs(outputs);
+}
+
+TfLiteStatus Interpreter::SetVariables(std::vector<int> variables) {
+  return primary_subgraph().SetVariables(variables);
+}
+
+TfLiteStatus Interpreter::AllocateTensors() {
+  return primary_subgraph().AllocateTensors();
+}
+
+void Interpreter::ReserveNodes(int count) {
+  primary_subgraph().nodes_and_registration().reserve(count);
+}
+
+void Interpreter::AddSubgraphs(int subgraphs_to_add,
+                               int* first_new_subgraph_index) {
+  const size_t base_index = subgraphs_.size();
+  if (first_new_subgraph_index) *first_new_subgraph_index = base_index;
+
+  subgraphs_.reserve(base_index + subgraphs_to_add);
+  for (int i = 0; i < subgraphs_to_add; ++i) {
+    Subgraph* subgraph =
+        new Subgraph(error_reporter_, external_contexts_, &subgraphs_);
+    subgraphs_.emplace_back(subgraph);
+  }
+}
+
+TfLiteStatus Interpreter::AddNodeWithParameters(
+    const std::vector<int>& inputs, const std::vector<int>& outputs,
+    const char* init_data, size_t init_data_size, void* builtin_data,
+    const TfLiteRegistration* registration, int* node_index) {
+  return primary_subgraph().AddNodeWithParameters(inputs, outputs, init_data,
+                                                  init_data_size, builtin_data,
+                                                  registration, node_index);
+}
+
+TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
+                                            const std::vector<int>& dims) {
+  return primary_subgraph().ResizeInputTensor(tensor_index, dims);
+}
+
+TfLiteStatus Interpreter::Invoke() {
+  TfLiteStatus status = primary_subgraph().Invoke();
+
+  if (!allow_buffer_handle_output_) {
+    for (int tensor_index : outputs()) {
+      primary_subgraph().EnsureTensorDataIsReadable(tensor_index);
+    }
+  }
+
+  return status;
+}
+
+TfLiteStatus Interpreter::AddTensors(int tensors_to_add,
+                                     int* first_new_tensor_index) {
+  return primary_subgraph().AddTensors(tensors_to_add, first_new_tensor_index);
+}
+
+TfLiteStatus Interpreter::ResetVariableTensors() {
+  return primary_subgraph().ResetVariableTensors();
+}
+
+TfLiteStatus Interpreter::SetTensorParametersReadOnly(
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
+    size_t bytes, const Allocation* allocation) {
+  return primary_subgraph().SetTensorParametersReadOnly(
+      tensor_index, type, name, rank, dims, quantization, buffer, bytes,
+      allocation);
+}
+
+// Set description of inputs/outputs/data/fptrs for node `node_index`.
+// This variant assumes an external buffer has been allocated of size
+// bytes. The lifetime of buffer must be ensured to be greater or equal
+// to Interpreter.
+TfLiteStatus Interpreter::SetTensorParametersReadWrite(
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
+  return primary_subgraph().SetTensorParametersReadWrite(
+      tensor_index, type, name, rank, dims, quantization, is_variable);
+}
+
+TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
+  return primary_subgraph().SetExecutionPlan(new_plan);
+}
+
+void Interpreter::UseNNAPI(bool enable) { primary_subgraph().UseNNAPI(enable); }
+
+void Interpreter::SetNumThreads(int num_threads) {
+  for (auto& subgraph : subgraphs_) {
+    subgraph->context()->recommended_num_threads = num_threads;
+  }
+
+  for (int i = 0; i < kTfLiteMaxExternalContexts; ++i) {
+    auto* c = external_contexts_[i];
+    if (c && c->Refresh) {
+      c->Refresh(context_);
+    }
+  }
+}
+
+void Interpreter::SetAllowFp16PrecisionForFp32(bool allow) {
+  for (auto& subgraph : subgraphs_) {
+    subgraph->context()->allow_fp32_relax_to_fp16 = allow;
+  }
+}
+
+TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
+  return primary_subgraph().ModifyGraphWithDelegate(delegate);
+}
+
+TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
+                                          TfLiteBufferHandle buffer_handle,
+                                          TfLiteDelegate* delegate) {
+  TF_LITE_ENSURE(context_, tensor_index < tensors_size());
+  std::vector<TfLiteTensor>& tensors = primary_subgraph().tensors();
+  TfLiteTensor* tensor = &tensors[tensor_index];
+
+  TF_LITE_ENSURE(context_,
+                 tensor->delegate == nullptr || tensor->delegate == delegate);
+  tensor->delegate = delegate;
+  if (tensor->buffer_handle != kTfLiteNullBufferHandle) {
+    TF_LITE_ENSURE(context_, tensor->delegate->FreeBufferHandle != nullptr);
+    tensor->delegate->FreeBufferHandle(context_, tensor->delegate,
+                                       &tensor->buffer_handle);
+  }
+  tensor->buffer_handle = buffer_handle;
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::GetBufferHandle(int tensor_index,
+                                          TfLiteBufferHandle* buffer_handle,
+                                          TfLiteDelegate** delegate) {
+  TF_LITE_ENSURE(context_, tensor_index < tensors_size());
+  std::vector<TfLiteTensor>& tensors = primary_subgraph().tensors();
+  TfLiteTensor* tensor = &tensors[tensor_index];
+
+  *delegate = tensor->delegate;
+  *buffer_handle = tensor->buffer_handle;
+
+  return kTfLiteOk;
+}
+
+void Interpreter::SetProfiler(profiling::Profiler* profiler) {
+  for (auto& subgraph : subgraphs_) subgraph->SetProfiler(profiler);
+}
+
+profiling::Profiler* Interpreter::GetProfiler() {
+  return primary_subgraph().GetProfiler();
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h
new file mode 100644
index 0000000000000000000000000000000000000000..6192d56ca2b5810d7ffaddbf4cc7ae3c1b27c268
--- /dev/null
+++ b/tensorflow/lite/interpreter.h
@@ -0,0 +1,500 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Main abstraction controlling the tflite interpreter.
+// See context.h for the API for defining operations (TfLiteRegistration).
+#ifndef TENSORFLOW_LITE_INTERPRETER_H_
+#define TENSORFLOW_LITE_INTERPRETER_H_
+
+#include <complex>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#include "tensorflow/lite/allocation.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/memory_planner.h"
+#include "tensorflow/lite/profiling/profiler.h"
+#include "tensorflow/lite/stderr_reporter.h"
+
+namespace tflite {
+
+// Map statically from a c++ type to a TfLiteType (used below for safe casts).
+template <class T>
+constexpr TfLiteType typeToTfLiteType() {
+  return kTfLiteNoType;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<int>() {
+  return kTfLiteInt32;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<int16_t>() {
+  return kTfLiteInt16;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<int64_t>() {
+  return kTfLiteInt64;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<float>() {
+  return kTfLiteFloat32;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<unsigned char>() {
+  return kTfLiteUInt8;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<int8_t>() {
+  return kTfLiteInt8;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<bool>() {
+  return kTfLiteBool;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<std::complex<float>>() {
+  return kTfLiteComplex64;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<string>() {
+  return kTfLiteString;
+}
+
+// An interpreter for a graph of nodes that input and output from tensors.
+// Each node of the graph processes a set of input tensors and produces a
+// set of output Tensors. All inputs/output tensors are referenced by index.
+//
+// Usage:
+//
+// -- Create basic model
+// Interpreter foo(2, 1);
+// foo.SetTensorParametersReadWrite(0, ...);
+// foo.SetTensorParametersReadOnly(1, ...);
+// foo.SetNodeParameters(0, ...)
+//
+// -- Resize input array to 1 length.
+// foo.ResizeInputTensor(0, 1);
+// foo.AllocateTensors();
+// -- Install array data
+// foo.typed_tensor<float>(0)[0] = 3;
+// foo.Invoke();
+// foo.typed_tensor<float>(0)[0] = 4;
+// foo.Invoke();
+// -- Resize input array and set data.
+// foo.ResizeInputTensor(0, 2);
+// foo.AllocateTensors();
+// foo.typed_tensor<float>(0)[0] = 4;
+// foo.typed_tensor<float>(0)[1] = 8;
+// foo.Invoke();
+//
+
+class Interpreter {
+ public:
+  // Instantiate an interpreter. All errors associated with reading and
+  // processing this model will be forwarded to the error_reporter object.
+  //
+  // Note, if error_reporter is nullptr, then a default StderrReporter is
+  // used. Ownership of 'error_reporter' remains with the caller.
+  explicit Interpreter(ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  ~Interpreter();
+
+  // Interpreters are not copyable as they have non-trivial memory semantics.
+  Interpreter(const Interpreter&) = delete;
+  Interpreter& operator=(const Interpreter&) = delete;
+
+  // Functions to build interpreter
+
+  // Provide a list of tensor indexes that are inputs to the model.
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetInputs(std::vector<int> inputs);
+
+  // Provide a list of tensor indexes that are outputs to the model
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetOutputs(std::vector<int> outputs);
+
+  // Provide a list of tensor indexes that are variable tensors.
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetVariables(std::vector<int> variables);
+
+  // Ensure the internal node storage memory allocates at least `count`
+  // spots for node. NOTE, this doesn't actually add operators. This is an
+  // efficiency optimization that is subject to change.
+  void ReserveNodes(int count);
+
+  // Adds a node with the given parameters and returns the index of the new
+  // node in `node_index` (optionally). Interpreter will take ownership of
+  // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
+  // remains with the caller.
+  TfLiteStatus AddNodeWithParameters(const std::vector<int>& inputs,
+                                     const std::vector<int>& outputs,
+                                     const char* init_data,
+                                     size_t init_data_size, void* builtin_data,
+                                     const TfLiteRegistration* registration,
+                                     int* node_index = nullptr);
+
+  // Adds `tensors_to_add` tensors, preserving pre-existing Tensor entries.
+  // The value pointed to by `first_new_tensor_index` will be set to the
+  // index of the first new tensor if `first_new_tensor_index` is non-null.
+  TfLiteStatus AddTensors(int tensors_to_add,
+                          int* first_new_tensor_index = nullptr);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  inline TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantizationParams quantization,
+      const char* buffer, size_t bytes,
+      const Allocation* allocation = nullptr) {
+    return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(),
+                                       dims.data(), quantization, buffer, bytes,
+                                       allocation);
+  }
+
+  TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization,
+      const char* buffer, size_t bytes, const Allocation* allocation = nullptr);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  inline TfLiteStatus SetTensorParametersReadWrite(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantizationParams quantization,
+      bool is_variable = false) {
+    return SetTensorParametersReadWrite(tensor_index, type, name, dims.size(),
+                                        dims.data(), quantization, is_variable);
+  }
+  TfLiteStatus SetTensorParametersReadWrite(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization,
+      bool is_variable = false);
+
+  // Functions to access tensor data
+
+  // Read only access to list of inputs.
+  const std::vector<int>& inputs() const { return primary_subgraph().inputs(); }
+
+  // Return the name of a given input. The given index must be between 0 and
+  // inputs().size().
+  const char* GetInputName(int index) const {
+    return context_->tensors[inputs()[index]].name;
+  }
+
+  // Read only access to list of outputs.
+  const std::vector<int>& outputs() const {
+    return primary_subgraph().outputs();
+  }
+
+  // Read only access to list of variable tensors.
+  const std::vector<int>& variables() const {
+    return primary_subgraph().variables();
+  }
+
+  // Return the name of a given output. The given index must be between 0 and
+  // outputs().size().
+  const char* GetOutputName(int index) const {
+    return context_->tensors[outputs()[index]].name;
+  }
+
+  // Return the number of tensors in the model.
+  size_t tensors_size() const { return context_->tensors_size; }
+
+  // Return the number of ops in the model.
+  size_t nodes_size() const { return primary_subgraph().nodes_size(); }
+
+  // WARNING: Experimental interface, subject to change
+  const std::vector<int>& execution_plan() const {
+    return primary_subgraph().execution_plan();
+  }
+
+  // WARNING: Experimental interface, subject to change
+  // Overrides execution plan. This bounds checks indices sent in.
+  TfLiteStatus SetExecutionPlan(const std::vector<int>& new_plan);
+
+  // Get a mutable tensor data structure.
+  // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
+  // read/write access to structure
+  TfLiteTensor* tensor(int tensor_index) {
+    return primary_subgraph().tensor(tensor_index);
+  }
+
+  // Get an immutable tensor data structure.
+  const TfLiteTensor* tensor(int tensor_index) const {
+    return primary_subgraph().tensor(tensor_index);
+  }
+
+  // Get a pointer to an operation and registration data structure if in bounds.
+  const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
+      int node_index) const {
+    return primary_subgraph().node_and_registration(node_index);
+  }
+
+  // Perform a checked cast to the appropriate tensor type (mutable pointer
+  // version).
+  template <class T>
+  T* typed_tensor(int tensor_index) {
+    if (TfLiteTensor* tensor_ptr = tensor(tensor_index)) {
+      if (tensor_ptr->type == typeToTfLiteType<T>()) {
+        return reinterpret_cast<T*>(tensor_ptr->data.raw);
+      }
+    }
+    return nullptr;
+  }
+
+  // Perform a checked cast to the appropriate tensor type (immutable pointer
+  // version).
+  template <class T>
+  const T* typed_tensor(int tensor_index) const {
+    if (const TfLiteTensor* tensor_ptr = tensor(tensor_index)) {
+      if (tensor_ptr->type == typeToTfLiteType<T>()) {
+        return reinterpret_cast<const T*>(tensor_ptr->data.raw);
+      }
+    }
+    return nullptr;
+  }
+
+  // Return a mutable pointer into the data of a given input tensor. The given
+  // index must be between 0 and inputs().size().
+  template <class T>
+  T* typed_input_tensor(int index) {
+    return typed_tensor<T>(inputs()[index]);
+  }
+
+  // Return an immutable pointer into the data of a given input tensor. The
+  // given index must be between 0 and inputs().size().
+  template <class T>
+  const T* typed_input_tensor(int index) const {
+    return typed_tensor<T>(inputs()[index]);
+  }
+
+  // Return a mutable pointer into the data of a given output tensor. The given
+  // index must be between 0 and outputs().size().
+  template <class T>
+  T* typed_output_tensor(int index) {
+    return typed_tensor<T>(outputs()[index]);
+  }
+
+  // Return an immutable pointer into the data of a given output tensor. The
+  // given index must be between 0 and outputs().size().
+  template <class T>
+  const T* typed_output_tensor(int index) const {
+    return typed_tensor<T>(outputs()[index]);
+  }
+
+  // Change the dimensionality of a given tensor. Note, this is only acceptable
+  // for tensor indices that are inputs.
+  // Returns status of failure or success.
+  // TODO(aselle): Consider implementing ArraySlice equivalent to make this
+  //   more adept at accepting data without an extra copy. Use absl::ArraySlice
+  //   if our partners determine that dependency is acceptable.
+  TfLiteStatus ResizeInputTensor(int tensor_index,
+                                 const std::vector<int>& dims);
+
+  // Update allocations for all tensors. This will redim dependent tensors using
+  // the input tensor dimensionality as given. This is relatively expensive.
+  // If you know that your sizes are not changing, you need not call this.
+  // Returns status of success or failure.
+  TfLiteStatus AllocateTensors();
+
+  // Invoke the interpreter (run the whole graph in dependency order).
+  //
+  // NOTE: It is possible that the interpreter is not in a ready state
+  // to evaluate (i.e. if a ResizeTensor() has been performed without an
+  // AllocateTensors().
+  // Returns status of success or failure.
+  TfLiteStatus Invoke();
+
+  // Enable or disable the NN API (true to enable)
+  void UseNNAPI(bool enable);
+
+  // Set the number of threads available to the interpreter.
+  void SetNumThreads(int num_threads);
+
+  // Allow float16 precision for FP32 calculation when possible.
+  // default: not allow.
+  // WARNING: This is an experimental API and subject to change.
+  void SetAllowFp16PrecisionForFp32(bool allow);
+
+  // Get the half precision flag.
+  // WARNING: This is an experimental API and subject to change.
+  bool GetAllowFp16PrecisionForFp32() const {
+    return context_->allow_fp32_relax_to_fp16;
+  }
+
+  // Owning handle to a TfLiteDelegate instance.
+  using TfLiteDelegatePtr =
+      std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>;
+
+  // Allow a delegate to look at the graph and modify the graph to handle
+  // parts of the graph themselves. After this is called, the graph may
+  // contain new nodes that replace 1 more nodes.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
+
+  // Ensure the data in `tensor.data` is readable. In case delegate is used,
+  // it might require to copy the data from delegate buffer to raw memory.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
+    return primary_subgraph().EnsureTensorDataIsReadable(tensor_index);
+  }
+
+  // Set the delegate buffer handle to a tensor. It can be called in the
+  // following cases:
+  // 1. Set the buffer handle to a tensor that's not being written by a
+  //    delegate. For example, feeding an OpenGL texture as the input of the
+  //    inference graph.
+  // 2. Set the buffer handle to a tensor that uses the same delegate.
+  //    For example, set an OpenGL texture as the output of inference, while
+  //    the node which produces output is an OpenGL delegate node.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus SetBufferHandle(int tensor_index,
+                               TfLiteBufferHandle buffer_handle,
+                               TfLiteDelegate* delegate);
+
+  // Get the delegate buffer handle, and the delegate which can process the
+  // buffer handle.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus GetBufferHandle(int tensor_index,
+                               TfLiteBufferHandle* buffer_handle,
+                               TfLiteDelegate** delegate);
+
+  void SetProfiler(profiling::Profiler* profiler);
+
+  profiling::Profiler* GetProfiler();
+
+  // The default capacity of `tensors_` vector.
+  static constexpr int kTensorsReservedCapacity = 128;
+  // The capacity headroom of `tensors_` vector before calling ops'
+  // `prepare` and `invoke` function. In these functions, it's guaranteed
+  // allocating up to `kTensorsCapacityHeadroom` more tensors won't invalidate
+  // pointers to existing tensors.
+  static constexpr int kTensorsCapacityHeadroom = 16;
+
+  // Set if buffer handle output is allowed.
+  //
+  // When using hardware delegation, Interpreter will make the data of output
+  // tensors available in `tensor->data` by default. If the application can
+  // consume the buffer handle directly (e.g. reading output from OpenGL
+  // texture), it can set this flag to false, so Interpreter won't copy the data
+  // from buffer handle to CPU memory.
+  // WARNING: This is an experimental API and subject to change.
+  void SetAllowBufferHandleOutput(bool allow_buffer_handle_output) {
+    allow_buffer_handle_output_ = allow_buffer_handle_output;
+  }
+
+  // Reset all variable tensors to the default value.
+  // If a variable tensor doesn't have a buffer, reset it to zero.
+  // TODO(b/115961645): Implement - If a variable tensor has a buffer, reset it
+  // to the value of the buffer.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ResetVariableTensors();
+
+  // Retrieve an operator's description of its work, for profiling purposes.
+  const char* OpProfilingString(const TfLiteRegistration& op_reg,
+                                const TfLiteNode* node) const {
+    if (op_reg.profiling_string == nullptr) return nullptr;
+    return op_reg.profiling_string(context_, node);
+  }
+
+  // Set the value of an external context.
+  void SetExternalContext(TfLiteExternalContextType type,
+                          TfLiteExternalContext* ctx);
+
+  // Adds `subgraphs_to_add` subgraphs, preserving pre-existing Subgraph
+  // entries. The value pointed to by `first_new_subgraph_index` will be set to
+  // the index of the first new subgraph if `first_new_subgraph_index` is
+  // non-null.
+  // WARNING: This is an experimental API and subject to change.
+  void AddSubgraphs(int subgraphs_to_add,
+                    int* first_new_subgraph_index = nullptr);
+
+  // Return the number of subgraphs in the model.
+  // WARNING: This is an experimental API and subject to change.
+  size_t subgraphs_size() const { return subgraphs_.size(); }
+
+  // Get a pointer to a subgraph if in bounds.
+  // WARNING: This is an experimental API and subject to change.
+  Subgraph* subgraph(int subgraph_index) {
+    if (subgraph_index < 0 ||
+        static_cast<size_t>(subgraph_index) >= subgraphs_size())
+      return nullptr;
+    return &*subgraphs_[subgraph_index];
+  }
+
+  // WARNING: Experimental interface, subject to change
+  Subgraph& primary_subgraph() {
+    return *subgraphs_.front();  // Safe as subgraphs_ always has 1 entry.
+  }
+
+  // WARNING: Experimental interface, subject to change
+  const Subgraph& primary_subgraph() const {
+    return *subgraphs_.front();  // Safe as subgraphs_ always has 1 entry.
+  }
+
+ private:
+  friend class InterpreterBuilder;
+  friend class InterpreterTest;
+
+  // Set the value of an external context.
+  static void SetExternalContext(struct TfLiteContext* context,
+                                 TfLiteExternalContextType type,
+                                 TfLiteExternalContext* ctx);
+
+  // Variant of the public ModifyGraphWithDelegate method that additionally
+  // Assumes ownership of the provided delegate.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegatePtr delegate) {
+    // Note that we retain ownership of the delegate even if graph modification
+    // fails, as delegate use will be in an indeterminate state at that point.
+    owned_delegates_.push_back(std::move(delegate));
+    return ModifyGraphWithDelegate(owned_delegates_.back().get());
+  }
+
+  // A pure C data structure used to communicate with the pure C plugin
+  // interface. To avoid copying tensor metadata, this is also the definitive
+  // structure to store tensors.
+  // This is the primary subgraph context.
+  TfLiteContext* context_;
+
+  // The error reporter delegate that tflite will forward queries errors to.
+  ErrorReporter* error_reporter_;
+
+  // List of delegates that have been installed and are owned by this
+  // interpreter instance. Useful if client delegate ownership is burdensome.
+  // WARNING: This is an experimental API and subject to change.
+  // TODO(b/116667551): Use TfLiteExternalContext for storing state.
+  std::vector<TfLiteDelegatePtr> owned_delegates_;
+
+  bool allow_buffer_handle_output_ = false;
+
+  // List of active external contexts.
+  TfLiteExternalContext* external_contexts_[kTfLiteMaxExternalContexts];
+
+  // Subgraphs
+  std::vector<std::unique_ptr<Subgraph>> subgraphs_;
+};
+
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_INTERPRETER_H_
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/interpreter_test.cc
rename to tensorflow/lite/interpreter_test.cc
index 6c71d5a8d7bb3e275379637b151ab8f998b04f41..78b5d1b8873b8b3558b098031ffa33c7857a31e5 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/lite/interpreter_test.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/lite/interpreter.h"
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 
@@ -38,7 +38,7 @@ class InterpreterTest : public ::testing::Test {
   }
 
  protected:
-  TfLiteContext* GetInterpreterContext() { return &interpreter_.context_; }
+  TfLiteContext* GetInterpreterContext() { return interpreter_.context_; }
 
   Interpreter interpreter_;
 };
@@ -566,7 +566,7 @@ TEST(BasicInterpreter, ThreeStepAllocate) {
     DynamicBuffer buf;
     StringRef str_ref = GetString(input, 0);
     buf.AddString(str_ref);
-    buf.WriteToTensor(output);
+    buf.WriteToTensorAsVector(output);
     return kTfLiteOk;
   };
 
@@ -698,7 +698,7 @@ TEST(BasicInterpreter, TestUnsupportedDelegateFunctions) {
                                                   nullptr};
       TfLiteIntArray nodes_to_replace;
       nodes_to_replace.size = 0;
-      EXPECT_EQ(context->ReplaceSubgraphsWithDelegateKernels(
+      EXPECT_EQ(context->ReplaceNodeSubsetsWithDelegateKernels(
                     context, delegate_registration, &nodes_to_replace, nullptr),
                 kTfLiteError);
     }
@@ -1085,22 +1085,22 @@ class TestDelegate : public ::testing::Test {
           TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0);
         }
 
-        context->ReplaceSubgraphsWithDelegateKernels(
+        context->ReplaceNodeSubsetsWithDelegateKernels(
             context, FakeFusedRegistration(), nodes_to_separate, delegate);
         TfLiteIntArrayFree(nodes_to_separate);
         return kTfLiteOk;
       };
-      delegate_.CopyToBufferHandle =
-          [](TfLiteContext* context, TfLiteDelegate* delegate,
-             TfLiteBufferHandle buffer_handle, void* data,
-             size_t size) -> TfLiteStatus {
+      delegate_.CopyToBufferHandle = [](TfLiteContext* context,
+                                        TfLiteDelegate* delegate,
+                                        TfLiteBufferHandle buffer_handle,
+                                        TfLiteTensor* tensor) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
       delegate_.CopyFromBufferHandle =
           [](TfLiteContext* context, TfLiteDelegate* delegate,
-             TfLiteBufferHandle buffer_handle, void* data,
-             size_t size) -> TfLiteStatus {
+             TfLiteBufferHandle buffer_handle,
+             TfLiteTensor* output) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
@@ -1109,6 +1109,7 @@ class TestDelegate : public ::testing::Test {
              TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; };
       // Store type-punned data SimpleDelegate structure.
       delegate_.data_ = reinterpret_cast<void*>(this);
+      delegate_.flags = kTfLiteDelegateFlagsNone;
     }
 
     static TfLiteRegistration FakeFusedRegistration() {
@@ -1210,7 +1211,7 @@ TEST_F(TestDelegate, SetInvalidHandleToTensor) {
   interpreter_->Invoke();
   delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
   TfLiteDelegate* delegate = delegate_->get_tf_lite_delegate();
-  interpreter_->ModifyGraphWithDelegate(delegate, true);
+  interpreter_->ModifyGraphWithDelegate(delegate);
 
   SimpleDelegate another_simple_delegate({0, 1, 2});
 
@@ -1264,10 +1265,11 @@ class TestDelegateWithDynamicTensors : public ::testing::Test {
       TfLiteIntArray* execution_plan;
       TF_LITE_ENSURE_STATUS(
           context->GetExecutionPlan(context, &execution_plan));
-      context->ReplaceSubgraphsWithDelegateKernels(
+      context->ReplaceNodeSubsetsWithDelegateKernels(
           context, DelegateRegistration(), execution_plan, delegate);
       return kTfLiteOk;
     };
+    delegate_.flags = kTfLiteDelegateFlagsNone;
   }
 
   static TfLiteRegistration DynamicCopyOpRegistration() {
@@ -1296,7 +1298,7 @@ class TestDelegateWithDynamicTensors : public ::testing::Test {
 };
 
 TEST_F(TestDelegateWithDynamicTensors, DisallowDynamicTensors) {
-  interpreter_->ModifyGraphWithDelegate(&delegate_, false);
+  interpreter_->ModifyGraphWithDelegate(&delegate_);
 
   ASSERT_EQ(interpreter_->execution_plan().size(), 1);
   // The interpreter should not call delegate's `Prepare` when dynamic tensors
@@ -1305,7 +1307,8 @@ TEST_F(TestDelegateWithDynamicTensors, DisallowDynamicTensors) {
 }
 
 TEST_F(TestDelegateWithDynamicTensors, AllowDynamicTensors) {
-  interpreter_->ModifyGraphWithDelegate(&delegate_, true);
+  delegate_.flags = kTfLiteDelegateFlagsAllowDynamicTensors;
+  interpreter_->ModifyGraphWithDelegate(&delegate_);
 
   ASSERT_EQ(interpreter_->execution_plan().size(), 1);
   // The node should be replaced because dynamic tensors are allowed. Therefore
@@ -1317,6 +1320,7 @@ TEST(TestDelegateOwnership, ProperlyDisposed) {
   struct TfLiteInterpreterOwnedDelegate : public TfLiteDelegate {
     TfLiteInterpreterOwnedDelegate(bool* destroyed, bool* prepared)
         : destroyed(destroyed), prepared(prepared) {
+      flags = kTfLiteDelegateFlagsNone;
       Prepare = [](TfLiteContext*, TfLiteDelegate* delegate) -> TfLiteStatus {
         *static_cast<TfLiteInterpreterOwnedDelegate*>(delegate)->prepared =
             true;
diff --git a/tensorflow/contrib/lite/java/AndroidManifest.xml b/tensorflow/lite/java/AndroidManifest.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/AndroidManifest.xml
rename to tensorflow/lite/java/AndroidManifest.xml
diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..adf7bc9087878ad84824844139058c140d7084f8
--- /dev/null
+++ b/tensorflow/lite/java/BUILD
@@ -0,0 +1,225 @@
+# Description:
+# TensorFlow Lite Java API.
+
+package(default_visibility = [
+    "//tensorflow/lite/java/ovic:__pkg__",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
+load("//tensorflow/lite:build_def.bzl", "tflite_jni_binary")
+load("//tensorflow/lite/java:aar_with_jni.bzl", "aar_with_jni")
+
+JAVA_SRCS = glob([
+    "src/main/java/org/tensorflow/lite/*.java",
+])
+
+# Building tensorflow-lite.aar including 4 variants of .so
+# To build an aar for release, run below command:
+# bazel build --cxxopt='--std=c++11' -c opt --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \
+# tensorflow/lite/java:tensorflow-lite
+aar_with_jni(
+    name = "tensorflow-lite",
+    android_library = ":tensorflowlite",
+)
+
+# EXPERIMENTAL: AAR target that supports TensorFlow op execution with TFLite.
+aar_with_jni(
+    name = "tensorflow-lite-with-select-tf-ops",
+    android_library = ":tensorflowlite_flex",
+)
+
+android_library(
+    name = "tensorflowlite",
+    srcs = JAVA_SRCS,
+    manifest = "AndroidManifest.xml",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":tensorflowlite_native",
+        "@org_checkerframework_qual",
+    ],
+)
+
+# EXPERIMENTAL: Android target that supports TensorFlow op execution with TFLite.
+android_library(
+    name = "tensorflowlite_flex",
+    srcs = JAVA_SRCS,
+    manifest = "AndroidManifest.xml",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":tensorflowlite_native_flex",
+        "@org_checkerframework_qual",
+    ],
+)
+
+android_library(
+    name = "tensorflowlite_java",
+    srcs = JAVA_SRCS,
+    visibility = ["//visibility:public"],
+    deps = [
+        "@org_checkerframework_qual",
+    ],
+)
+
+java_library(
+    name = "tensorflowlitelib",
+    srcs = JAVA_SRCS,
+    javacopts = JAVACOPTS,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libtensorflowlite_jni.so",
+        "@org_checkerframework_qual",
+    ],
+)
+
+# EXPERIMENTAL: Java target that supports TensorFlow op execution with TFLite.
+java_library(
+    name = "tensorflowlitelib_flex",
+    srcs = JAVA_SRCS,
+    javacopts = JAVACOPTS,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libtensorflowlite_flex_jni.so",
+        "@org_checkerframework_qual",
+    ],
+)
+
+java_test(
+    name = "TensorFlowLiteTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.TensorFlowLiteTest",
+    deps = [
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+java_test(
+    name = "DataTypeTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/DataTypeTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.DataTypeTest",
+    deps = [
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+java_test(
+    name = "NativeInterpreterWrapperTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java"],
+    data = [
+        "src/testdata/add.bin",
+        "src/testdata/int32.bin",
+        "src/testdata/int64.bin",
+        "src/testdata/invalid_model.bin",
+        "src/testdata/quantized.bin",
+        "src/testdata/uint8.bin",
+        "src/testdata/with_custom_op.lite",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
+    deps = [
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+# TODO: generate large models at runtime, instead of storing them.
+java_test(
+    name = "InterpreterTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/InterpreterTest.java"],
+    data = [
+        "src/testdata/add.bin",
+        "//tensorflow/lite:testdata/multi_add.bin",
+        "//tensorflow/lite:testdata/multi_add_flex.bin",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.InterpreterTest",
+    visibility = ["//visibility:private"],
+    deps = [
+        ":tensorflowlitelib",
+        "//tensorflow/lite/java/src/test/native:libtensorflowlite_test_jni.so",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+java_test(
+    name = "InterpreterFlexTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/InterpreterFlexTest.java"],
+    data = [
+        "//tensorflow/lite:testdata/multi_add_flex.bin",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.InterpreterFlexTest",
+    visibility = ["//visibility:private"],
+    deps = [
+        ":tensorflowlitelib_flex",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+java_test(
+    name = "TensorTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/TensorTest.java"],
+    data = [
+        "src/testdata/add.bin",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.TensorTest",
+    deps = [
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+filegroup(
+    name = "libtensorflowlite_jni",
+    srcs = select({
+        "//conditions:default": [":libtensorflowlite_jni.so"],
+    }),
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "tensorflowlite_native",
+    srcs = ["libtensorflowlite_jni.so"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "tensorflowlite_native_flex",
+    srcs = ["libtensorflowlite_flex_jni.so"],
+    visibility = ["//visibility:public"],
+)
+
+tflite_jni_binary(
+    name = "libtensorflowlite_jni.so",
+    deps = [
+        "//tensorflow/lite/java/src/main/native",
+    ],
+)
+
+# EXPERIMENTAL: Native target that supports TensorFlow op execution with TFLite.
+tflite_jni_binary(
+    name = "libtensorflowlite_flex_jni.so",
+    deps = [
+        "//tensorflow/lite/delegates/flex:delegate",
+        "//tensorflow/lite/java/src/main/native",
+        "//tensorflow/lite/java/src/main/native:init_tensorflow",
+    ],
+)
diff --git a/tensorflow/contrib/lite/java/aar_with_jni.bzl b/tensorflow/lite/java/aar_with_jni.bzl
similarity index 95%
rename from tensorflow/contrib/lite/java/aar_with_jni.bzl
rename to tensorflow/lite/java/aar_with_jni.bzl
index 9d2aead266f897e8b08520d06ea60654927029e9..360d622b1bcf5cf379987ceefc43c74b1b6ce5fb 100644
--- a/tensorflow/contrib/lite/java/aar_with_jni.bzl
+++ b/tensorflow/lite/java/aar_with_jni.bzl
@@ -30,7 +30,10 @@ EOF
         # In some platforms we don't have an Android SDK/NDK and this target
         # can't be built. We need to prevent the build system from trying to
         # use the target in that case.
-        tags = ["manual"],
+        tags = [
+            "manual",
+            "no_cuda_on_cpu_tap",
+        ],
     )
 
     native.genrule(
diff --git a/tensorflow/contrib/lite/java/build_aar_for_release.sh b/tensorflow/lite/java/build_aar_for_release.sh
similarity index 98%
rename from tensorflow/contrib/lite/java/build_aar_for_release.sh
rename to tensorflow/lite/java/build_aar_for_release.sh
index fbcb1e7db9a3f9b885505e989b7ff7224f2d2b15..54be643fc7e0ae2fb50b2688db5054520bc30e47 100755
--- a/tensorflow/contrib/lite/java/build_aar_for_release.sh
+++ b/tensorflow/lite/java/build_aar_for_release.sh
@@ -22,7 +22,7 @@ trap "rm -rf $TMPDIR" EXIT
 VERSION=1.0
 
 BUILDER=bazel
-BASEDIR=tensorflow/contrib/lite
+BASEDIR=tensorflow/lite
 CROSSTOOL="//external:android/crosstool"
 HOST_CROSSTOOL="@bazel_tools//tools/cpp:toolchain"
 
diff --git a/tensorflow/contrib/lite/java/demo/.gitignore b/tensorflow/lite/java/demo/.gitignore
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/.gitignore
rename to tensorflow/lite/java/demo/.gitignore
diff --git a/tensorflow/lite/java/demo/README.md b/tensorflow/lite/java/demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b5bfe39ce7f6aba75cc5d1e0d9d01ba985f7d909
--- /dev/null
+++ b/tensorflow/lite/java/demo/README.md
@@ -0,0 +1,53 @@
+# TF Lite Android Image Classifier App Example
+
+A simple Android example that demonstrates image classification using the camera.
+
+## Building in Android Studio with TensorFlow Lite AAR from JCenter.
+The build.gradle is configured to use TensorFlow Lite's nightly build.
+
+If you see a build error related to compatibility with Tensorflow Lite's Java API (example: method X is
+undefined for type Interpreter), there has likely been a backwards compatible
+change to the API. You will need to pull new app code that's compatible with the
+nightly build and may need to first wait a few days for our external and internal
+code to merge.
+
+## Building from Source with Bazel
+
+1. Follow the [Bazel steps for the TF Demo App](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel):
+
+  1. [Install Bazel and Android Prerequisites](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-bazel-and-android-prerequisites).
+     It's easiest with Android Studio.
+
+      - You'll need at least SDK version 23.
+      - Make sure to install the latest version of Bazel. Some distributions
+        ship with Bazel 0.5.4, which is too old.
+      - Bazel requires Android Build Tools `26.0.1` or higher.
+      - You also need to install the Android Support Repository, available
+        through Android Studio under `Android SDK Manager -> SDK Tools ->
+        Android Support Repository`.
+
+  2. [Edit your `WORKSPACE`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#edit-workspace)
+     to add SDK and NDK targets.
+
+     NOTE: As long as you have the SDK and NDK installed, the `./configure`
+     script will create these rules for you. Answer "Yes" when the script asks
+     to automatically configure the `./WORKSPACE`.
+
+      - Make sure the `api_level` in `WORKSPACE` is set to an SDK version that
+        you have installed.
+      - By default, Android Studio will install the SDK to `~/Android/Sdk` and
+        the NDK to `~/Android/Sdk/ndk-bundle`.
+
+2. Build the app with Bazel. The demo needs C++11:
+
+  ```shell
+  bazel build -c opt --cxxopt='--std=c++11' \
+    //tensorflow/lite/java/demo/app/src/main:TfLiteCameraDemo
+  ```
+
+3. Install the demo on a
+   [debug-enabled device](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install):
+
+  ```shell
+  adb install bazel-bin/tensorflow/lite/java/demo/app/src/main/TfLiteCameraDemo.apk
+  ```
diff --git a/tensorflow/lite/java/demo/app/build.gradle b/tensorflow/lite/java/demo/app/build.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..b8fc282cb1dfe8a9c80692759e985bf369fc163d
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/build.gradle
@@ -0,0 +1,134 @@
+apply plugin: 'com.android.application'
+
+android {
+    compileSdkVersion 26
+    buildToolsVersion "26.0.1"
+    defaultConfig {
+        applicationId "android.example.com.tflitecamerademo"
+        // Required by Camera2 API.
+        minSdkVersion 21
+        targetSdkVersion 26
+        versionCode 1
+        versionName "1.0"
+
+        // Remove this block.
+        jackOptions {
+            enabled true
+        }
+    }
+    lintOptions {
+        abortOnError false
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    aaptOptions {
+        noCompress "tflite"
+    }
+
+    compileOptions {
+        sourceCompatibility JavaVersion.VERSION_1_8
+        targetCompatibility JavaVersion.VERSION_1_8
+    }
+}
+
+repositories {
+    maven {
+        url 'https://google.bintray.com/tensorflow'
+    }
+}
+allprojects {
+    repositories {
+        // Uncomment if you want to use a local repo.
+        // mavenLocal()
+        jcenter()
+    }
+}
+
+
+
+dependencies {
+    compile fileTree(dir: 'libs', include: ['*.jar'])
+    compile 'com.android.support:appcompat-v7:25.2.0'
+    compile 'com.android.support.constraint:constraint-layout:1.0.2'
+    compile 'com.android.support:design:25.2.0'
+    compile 'com.android.support:support-annotations:25.3.1'
+    compile 'com.android.support:support-v13:25.2.0'
+
+    // Build off of nightly TensorFlow Lite
+    compile 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    // Use local TensorFlow library
+    // compile 'org.tensorflow:tensorflow-lite-local:0.0.0'
+}
+
+def targetFolder = "src/main/assets"
+def modelFloatDownloadUrl = "http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz"
+def modelQuantDownloadUrl = "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz"
+def localCacheFloat = "build/intermediates/mobilenet_v1_1.0_224.tgz"
+def localCacheQuant = "build/intermediates/mmobilenet_v1_1.0_224_quant.tgz"
+
+
+task downloadModelFloat(type: DownloadUrlTask) {
+    doFirst {
+        println "Downloading ${modelFloatDownloadUrl}"
+    }
+    sourceUrl = "${modelFloatDownloadUrl}"
+    target = file("${localCacheFloat}")
+}
+
+task downloadModelQuant(type: DownloadUrlTask) {
+    doFirst {
+        println "Downloading ${modelQuantDownloadUrl}"
+    }
+    sourceUrl = "${modelQuantDownloadUrl}"
+    target = file("${localCacheQuant}")
+}
+
+task unzipModelFloat(type: Copy, dependsOn: 'downloadModelFloat') {
+    doFirst {
+        println "Unzipping ${localCacheFloat}"
+    }
+    from tarTree("${localCacheFloat}")
+    into "${targetFolder}"
+}
+
+task unzipModelQuant(type: Copy, dependsOn: 'downloadModelQuant') {
+    doFirst {
+        println "Unzipping ${localCacheQuant}"
+    }
+    from tarTree("${localCacheQuant}")
+    into "${targetFolder}"
+}
+
+task cleanUnusedFiles(type: Delete, dependsOn: ['unzipModelFloat', 'unzipModelQuant']) {
+    delete fileTree("${targetFolder}").matching {
+        include "*.pb"
+        include "*.ckpt.*"
+        include "*.pbtxt.*"
+        include "*.quant_info.*"
+        include "*.meta"
+    }
+}
+
+
+// Ensure the model file is downloaded and extracted before every build
+preBuild.dependsOn unzipModelFloat
+preBuild.dependsOn unzipModelQuant
+preBuild.dependsOn cleanUnusedFiles
+
+class DownloadUrlTask extends DefaultTask {
+    @Input
+    String sourceUrl
+
+    @OutputFile
+    File target
+
+    @TaskAction
+    void download() {
+        ant.get(src: sourceUrl, dest: target)
+    }
+}
+
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml b/tensorflow/lite/java/demo/app/src/main/AndroidManifest.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml
rename to tensorflow/lite/java/demo/app/src/main/AndroidManifest.xml
diff --git a/tensorflow/lite/java/demo/app/src/main/BUILD b/tensorflow/lite/java/demo/app/src/main/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9a7c1d0b61192c61896813f41b2db1e03ff65ecb
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/BUILD
@@ -0,0 +1,33 @@
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
+
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+android_binary(
+    name = "TfLiteCameraDemo",
+    srcs = glob(["java/**/*.java"]),
+    aapt_version = "aapt",
+    assets = [
+        "//tensorflow/lite/java/demo/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
+        "@tflite_mobilenet_quant//:mobilenet_v1_1.0_224_quant.tflite",
+        "@tflite_mobilenet_float//:mobilenet_v1_1.0_224.tflite",
+    ],
+    assets_dir = "",
+    custom_package = "com.example.android.tflitecamerademo",
+    manifest = "AndroidManifest.xml",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    # In some platforms we don't have an Android SDK/NDK and this target
+    # can't be built. We need to prevent the build system from trying to
+    # use the target in that case.
+    tags = ["manual"],
+    deps = [
+        "//tensorflow/lite/java:tensorflowlite",
+        "//tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@androidsdk//com.android.support:support-v13-25.2.0",
+        "@androidsdk//com.android.support:support-v4-25.2.0",
+    ],
+)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD b/tensorflow/lite/java/demo/app/src/main/assets/BUILD
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
rename to tensorflow/lite/java/demo/app/src/main/assets/BUILD
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/assets/labels_imagenet_slim.txt b/tensorflow/lite/java/demo/app/src/main/assets/labels_imagenet_slim.txt
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/assets/labels_imagenet_slim.txt
rename to tensorflow/lite/java/demo/app/src/main/assets/labels_imagenet_slim.txt
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/assets/labels_mobilenet_quant_v1_224.txt b/tensorflow/lite/java/demo/app/src/main/assets/labels_mobilenet_quant_v1_224.txt
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/assets/labels_mobilenet_quant_v1_224.txt
rename to tensorflow/lite/java/demo/app/src/main/assets/labels_mobilenet_quant_v1_224.txt
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/AutoFitTextureView.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/AutoFitTextureView.java
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/AutoFitTextureView.java
rename to tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/AutoFitTextureView.java
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
similarity index 86%
rename from tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
rename to tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 3596e4201150abaecc1cd8fdd736510a0afc97bb..165d33510131ac9c9fc08070f0a4d08653188fae 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -56,11 +56,12 @@ import android.view.Surface;
 import android.view.TextureView;
 import android.view.View;
 import android.view.ViewGroup;
-import android.widget.CompoundButton;
+import android.widget.AdapterView;
+import android.widget.ArrayAdapter;
+import android.widget.ListView;
 import android.widget.NumberPicker;
 import android.widget.TextView;
 import android.widget.Toast;
-import android.widget.ToggleButton;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -70,6 +71,7 @@ import java.util.List;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
 
+
 /** Basic fragments for the Camera. */
 public class Camera2BasicFragment extends Fragment
     implements FragmentCompat.OnRequestPermissionsResultCallback {
@@ -87,9 +89,11 @@ public class Camera2BasicFragment extends Fragment
   private boolean runClassifier = false;
   private boolean checkedPermissions = false;
   private TextView textView;
-  private ToggleButton toggle;
   private NumberPicker np;
   private ImageClassifier classifier;
+  private ListView deviceView;
+  private ListView modelView;
+
 
   /** Max preview width that is guaranteed by Camera2 API */
   private static final int MAX_PREVIEW_WIDTH = 1920;
@@ -123,6 +127,15 @@ public class Camera2BasicFragment extends Fragment
         public void onSurfaceTextureUpdated(SurfaceTexture texture) {}
       };
 
+  // Model parameter constants.
+  private String gpu;
+  private String cpu;
+  private String nnApi;
+  private String mobilenetV1Quant;
+  private String mobilenetV1Float;
+
+
+
   /** ID of the current {@link CameraDevice}. */
   private String cameraId;
 
@@ -169,6 +182,14 @@ public class Camera2BasicFragment extends Fragment
         }
       };
 
+  private ArrayList<String> deviceStrings = new ArrayList<String>();
+  private ArrayList<String> modelStrings = new ArrayList<String>();
+
+  /** Current indices of device and model. */
+  int currentDevice = -1;
+
+  int currentModel = -1;
+
   /** An additional thread for running tasks that shouldn't block the UI. */
   private HandlerThread backgroundThread;
 
@@ -298,17 +319,113 @@ public class Camera2BasicFragment extends Fragment
     return inflater.inflate(R.layout.fragment_camera2_basic, container, false);
   }
 
+  private void updateActiveModel() {
+    // Get UI information before delegating to background
+    final int modelIndex = modelView.getCheckedItemPosition();
+    final int deviceIndex = deviceView.getCheckedItemPosition();
+
+    backgroundHandler.post(() -> {
+      if (modelIndex == currentModel && deviceIndex == currentDevice) {
+        return;
+      }
+      currentModel = modelIndex;
+      currentDevice = deviceIndex;
+
+      // Disable classifier while updating
+      if (classifier != null) {
+        classifier.close();
+        classifier = null;
+      }
+
+      // Lookup names of parameters.
+      String model = modelStrings.get(modelIndex);
+      String device = deviceStrings.get(deviceIndex);
+
+      Log.i(TAG, "Changing model to " + model + " device " + device);
+
+      // Try to load model.
+      try {
+        if (model.equals(mobilenetV1Quant)) {
+          classifier = new ImageClassifierQuantizedMobileNet(getActivity());
+        } else if (model.equals(mobilenetV1Float)) {
+          classifier = new ImageClassifierFloatMobileNet(getActivity());
+        } else {
+          showToast("Failed to load model");
+        }
+      } catch (IOException e) {
+        Log.d(TAG, "Failed to load", e);
+        classifier = null;
+      }
+
+      // Customzie the interpreter to the type of device we want to use.
+      if (device.equals(cpu)) {
+      } else if (device.equals(gpu)) {
+        if (!GpuDelegateHelper.isGpuDelegateAvailable()) {
+          showToast("gpu not in this build.");
+          classifier = null;
+        } else if (model.equals(mobilenetV1Quant)) {
+          showToast("gpu requires float model.");
+          classifier = null;
+        } else {
+          classifier.useGpu();
+        }
+      } else if (device.equals(nnApi)) {
+        classifier.useNNAPI();
+      }
+    });
+  }
+
   /** Connect the buttons to their event handler. */
   @Override
   public void onViewCreated(final View view, Bundle savedInstanceState) {
+    gpu = getString(R.string.gpu);
+    cpu = getString(R.string.cpu);
+    nnApi = getString(R.string.nnapi);
+    mobilenetV1Quant = getString(R.string.mobilenetV1Quant);
+    mobilenetV1Float = getString(R.string.mobilenetV1Float);
+
+    // Get references to widgets.
     textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
     textView = (TextView) view.findViewById(R.id.text);
-    toggle = (ToggleButton) view.findViewById(R.id.button);
-
-    toggle.setOnCheckedChangeListener(
-        new CompoundButton.OnCheckedChangeListener() {
-          public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
-            backgroundHandler.post(() -> classifier.setUseNNAPI(isChecked));
+    deviceView = (ListView) view.findViewById(R.id.device);
+    modelView = (ListView) view.findViewById(R.id.model);
+
+    // Build list of models
+    modelStrings.add(mobilenetV1Quant);
+    modelStrings.add(mobilenetV1Float);
+
+    // Build list of devices
+    int defaultModelIndex = 0;
+    deviceStrings.add(cpu);
+    if (GpuDelegateHelper.isGpuDelegateAvailable()) {
+      deviceStrings.add(gpu);
+    }
+    deviceStrings.add(nnApi);
+
+    deviceView.setAdapter(
+        new ArrayAdapter<String>(
+            getContext(), R.layout.listview_row, R.id.listview_row_text, deviceStrings));
+    deviceView.setChoiceMode(ListView.CHOICE_MODE_SINGLE);
+    deviceView.setOnItemClickListener(
+        new AdapterView.OnItemClickListener() {
+          @Override
+          public void onItemClick(AdapterView<?> parent, View view, int position, long id) {
+            updateActiveModel();
+          }
+        });
+    deviceView.setItemChecked(0, true);
+
+    modelView.setChoiceMode(ListView.CHOICE_MODE_SINGLE);
+    ArrayAdapter<String> modelAdapter =
+        new ArrayAdapter<>(
+            getContext(), R.layout.listview_row, R.id.listview_row_text, modelStrings);
+    modelView.setAdapter(modelAdapter);
+    modelView.setItemChecked(defaultModelIndex, true);
+    modelView.setOnItemClickListener(
+        new AdapterView.OnItemClickListener() {
+          @Override
+          public void onItemClick(AdapterView<?> parent, View view, int position, long id) {
+            updateActiveModel();
           }
         });
 
@@ -323,18 +440,14 @@ public class Camera2BasicFragment extends Fragment
             backgroundHandler.post(() -> classifier.setNumThreads(newVal));
           }
         });
+
+    // Start initial model.
   }
 
   /** Load the model and labels. */
   @Override
   public void onActivityCreated(Bundle savedInstanceState) {
     super.onActivityCreated(savedInstanceState);
-    try {
-      // create either a new ImageClassifierQuantizedMobileNet or an ImageClassifierFloatInception
-      classifier = new ImageClassifierQuantizedMobileNet(getActivity());
-    } catch (IOException e) {
-      Log.e(TAG, "Failed to initialize an image classifier.", e);
-    }
     startBackgroundThread();
   }
 
@@ -562,10 +675,12 @@ public class Camera2BasicFragment extends Fragment
     backgroundThread = new HandlerThread(HANDLE_THREAD_NAME);
     backgroundThread.start();
     backgroundHandler = new Handler(backgroundThread.getLooper());
+    // Start the classification train & load an initial model.
     synchronized (lock) {
       runClassifier = true;
     }
     backgroundHandler.post(periodicClassify);
+    updateActiveModel();
   }
 
   /** Stops the background thread and its {@link Handler}. */
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/CameraActivity.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/CameraActivity.java
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/CameraActivity.java
rename to tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/CameraActivity.java
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java
new file mode 100644
index 0000000000000000000000000000000000000000..8dca17744eb7a3d1e69612abf61deafb6370e4ff
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import org.tensorflow.lite.Delegate;
+
+/**
+ * Helper class for {@code GpuDelegate}.
+ *
+ * <p>WARNING: This is an experimental API and subject to change.
+ */
+public class GpuDelegateHelper {
+  private GpuDelegateHelper() {}
+
+  /** Checks whether {@code GpuDelegate} is available. */
+  public static boolean isGpuDelegateAvailable() {
+    try {
+      Class.forName("org.tensorflow.lite.experimental.GpuDelegate");
+      return true;
+    } catch (Exception e) {
+      return false;
+    }
+  }
+
+  /** Returns an instance of {@code GpuDelegate} if available. */
+  public static Delegate createGpuDelegate() {
+    try {
+      return Class.forName("org.tensorflow.lite.experimental.GpuDelegate")
+          .asSubclass(Delegate.class)
+          .getDeclaredConstructor()
+          .newInstance();
+    } catch (Exception e) {
+      throw new IllegalStateException(e);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
similarity index 95%
rename from tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
rename to tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index 2d11a57434be98b1b3a7ff398b5ff2ca66df878d..512f8b64db1637385e7be56db6d0889c44abb2fb 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -38,6 +38,7 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.PriorityQueue;
+import org.tensorflow.lite.Delegate;
 import org.tensorflow.lite.Interpreter;
 
 /**
@@ -93,6 +94,9 @@ public abstract class ImageClassifier {
             }
           });
 
+  /** holds a gpu delegate */
+  Delegate gpuDelegate = null;
+
   /** Initializes an {@code ImageClassifier}. */
   ImageClassifier(Activity activity) throws IOException {
     tfliteModel = loadModelFile(activity);
@@ -112,8 +116,6 @@ public abstract class ImageClassifier {
 
   /** Classifies a frame from the preview stream. */
   void classifyFrame(Bitmap bitmap, SpannableStringBuilder builder) {
-    printTopKLabels(builder);
-
     if (tflite == null) {
       Log.e(TAG, "Image classifier has not been initialized; Skipped.");
       builder.append(new SpannableString("Uninitialized Classifier."));
@@ -129,6 +131,7 @@ public abstract class ImageClassifier {
     applyFilter();
 
     // Print the results.
+    printTopKLabels(builder);
     long duration = endTime - startTime;
     SpannableString span = new SpannableString(duration + " ms");
     span.setSpan(new ForegroundColorSpan(android.graphics.Color.LTGRAY), 0, span.length(), 0);
@@ -160,12 +163,27 @@ public abstract class ImageClassifier {
   private void recreateInterpreter() {
     if (tflite != null) {
       tflite.close();
+      // TODO(b/120679982)
+      // gpuDelegate.close();
       tflite = new Interpreter(tfliteModel, tfliteOptions);
     }
   }
 
-  public void setUseNNAPI(Boolean nnapi) {
-    tfliteOptions.setUseNNAPI(nnapi);
+  public void useGpu() {
+    if (gpuDelegate == null && GpuDelegateHelper.isGpuDelegateAvailable()) {
+      gpuDelegate = GpuDelegateHelper.createGpuDelegate();
+      tfliteOptions.addDelegate(gpuDelegate);
+      recreateInterpreter();
+    }
+  }
+
+  public void useCPU() {
+    tfliteOptions.setUseNNAPI(false);
+    recreateInterpreter();
+  }
+
+  public void useNNAPI() {
+    tfliteOptions.setUseNNAPI(true);
     recreateInterpreter();
   }
 
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java
rename to tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatInception.java
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java
new file mode 100644
index 0000000000000000000000000000000000000000..c87ffff8f6c39dc1d87c2cf0c09b5602edd9329c
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import android.app.Activity;
+import java.io.IOException;
+
+/** This classifier works with the float MobileNet model. */
+public class ImageClassifierFloatMobileNet extends ImageClassifier {
+
+  /**
+   * An array to hold inference results, to be feed into Tensorflow Lite as outputs. This isn't part
+   * of the super class, because we need a primitive array here.
+   */
+  private float[][] labelProbArray = null;
+
+  /**
+   * Initializes an {@code ImageClassifierFloatMobileNet}.
+   *
+   * @param activity
+   */
+  ImageClassifierFloatMobileNet(Activity activity) throws IOException {
+    super(activity);
+    labelProbArray = new float[1][getNumLabels()];
+  }
+
+  @Override
+  protected String getModelPath() {
+    // you can download this file from
+    // see build.gradle for where to obtain this file. It should be auto
+    // downloaded into assets.
+    return "mobilenet_v1_1.0_224.tflite";
+  }
+
+  @Override
+  protected String getLabelPath() {
+    return "labels_mobilenet_quant_v1_224.txt";
+  }
+
+  @Override
+  protected int getImageSizeX() {
+    return 224;
+  }
+
+  @Override
+  protected int getImageSizeY() {
+    return 224;
+  }
+
+  @Override
+  protected int getNumBytesPerChannel() {
+    return 4; // Float.SIZE / Byte.SIZE;
+  }
+
+  @Override
+  protected void addPixelValue(int pixelValue) {
+    imgData.putFloat(((pixelValue >> 16) & 0xFF) / 255.f);
+    imgData.putFloat(((pixelValue >> 8) & 0xFF) / 255.f);
+    imgData.putFloat((pixelValue & 0xFF) / 255.f);
+  }
+
+  @Override
+  protected float getProbability(int labelIndex) {
+    return labelProbArray[0][labelIndex];
+  }
+
+  @Override
+  protected void setProbability(int labelIndex, Number value) {
+    labelProbArray[0][labelIndex] = value.floatValue();
+  }
+
+  @Override
+  protected float getNormalizedProbability(int labelIndex) {
+    return labelProbArray[0][labelIndex];
+  }
+
+  @Override
+  protected void runInference() {
+    tflite.run(imgData, labelProbArray);
+  }
+}
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
similarity index 93%
rename from tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
rename to tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
index e164ac75543ebab12e6b1c057c4ed487eb9accdf..6310a5616838ac6b4258ec05028efa12e8cadab5 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
@@ -42,8 +42,9 @@ public class ImageClassifierQuantizedMobileNet extends ImageClassifier {
   @Override
   protected String getModelPath() {
     // you can download this file from
-    // https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip
-    return "mobilenet_quant_v1_224.tflite";
+    // see build.gradle for where to obtain this file. It should be auto
+    // downloaded into assets.
+    return "mobilenet_v1_1.0_224_quant.tflite";
   }
 
   @Override
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_action_info.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-hdpi/ic_action_info.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_action_info.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-hdpi/ic_action_info.png
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/tile.9.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-hdpi/tile.9.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/tile.9.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-hdpi/tile.9.png
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_action_info.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-mdpi/ic_action_info.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_action_info.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-mdpi/ic_action_info.png
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_action_info.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_action_info.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_action_info.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_action_info.png
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_action_info.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_action_info.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_action_info.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_action_info.png
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png b/tensorflow/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png
rename to tensorflow/lite/java/demo/app/src/main/res/drawable-xxhdpi/logo.png
diff --git a/tensorflow/lite/java/demo/app/src/main/res/drawable/item_selector.xml b/tensorflow/lite/java/demo/app/src/main/res/drawable/item_selector.xml
new file mode 100644
index 0000000000000000000000000000000000000000..202c900769fdd3be15d6b1252d5c2c4f7f728d8c
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/res/drawable/item_selector.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="utf-8"?>
+<selector xmlns:android="http://schemas.android.com/apk/res/android">
+
+    <!-- pressed -->
+    <item android:drawable="@color/selection_highlight" android:state_pressed="true" />
+    <!-- focused -->
+    <item android:drawable="@color/selection_focus" android:state_activated="true" />
+    <!-- default -->
+    <item android:drawable="@color/item_normal" />
+
+</selector>
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ee71ab808f4810ac092b37b0d996331072f44652
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<LinearLayout
+    xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#bb7700"
+    android:orientation="horizontal">
+
+  <com.example.android.tflitecamerademo.AutoFitTextureView
+      android:id="@+id/texture"
+      android:layout_width="0dp"
+      android:layout_height="match_parent"
+      android:layout_weight=".8"/>
+
+  <LinearLayout
+      android:layout_width="0dp"
+      android:layout_height="match_parent"
+      android:layout_weight=".2"
+      android:orientation="vertical">
+
+    <ImageView
+        android:id="@+id/logoview"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:scaleType="centerInside"
+        android:src="@drawable/logo"/>
+
+    <RadioGroup
+        android:gravity="center"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:orientation="horizontal">
+        <RadioButton
+            android:id="@+id/radio_cpu"
+            android:background="#0000000f"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:text="@string/cpu"
+            android:textColor="@android:color/white" />
+        <RadioButton
+            android:id="@+id/radio_nnapi"
+            android:background="#0000000f"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:text="@string/nnapi"
+            android:textColor="@android:color/white" />
+        </RadioGroup>
+
+    <NumberPicker
+        android:id="@+id/np"
+        android:layout_width="wrap_content"
+        android:layout_height="47dp"
+        android:layout_gravity="center_horizontal"
+        android:visibility="visible"/>
+
+    <TextView
+        android:id="@+id/text"
+        android:textStyle="bold"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:paddingTop="20dp"
+        android:textColor="#FFF"
+        android:textSize="20sp"/>
+
+  </LinearLayout>
+</LinearLayout>
+
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
new file mode 100644
index 0000000000000000000000000000000000000000..70eedfdd02ad3ac03f6d413c0d5e2357a320751f
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
@@ -0,0 +1,140 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#bb7700">
+
+    <com.example.android.tflitecamerademo.AutoFitTextureView
+        android:id="@+id/texture"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_weight="1" />
+
+    <LinearLayout
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_above="@+id/bottom_info_view"
+        android:layout_alignParentEnd="false"
+        android:layout_alignParentStart="true"
+        android:layout_alignParentTop="false"
+        android:background="#bb7700"
+        android:orientation="vertical"
+        android:weightSum="100">
+
+        <ImageView
+            android:id="@+id/logoview2"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_weight="30"
+            android:scaleType="fitStart"
+            android:src="@drawable/logo" />
+
+        <TextView
+            android:id="@+id/text"
+            android:layout_width="match_parent"
+            android:layout_height="wrap_content"
+            android:layout_alignParentBottom="true"
+            android:layout_alignParentEnd="true"
+            android:layout_alignParentRight="true"
+            android:layout_weight="30"
+            android:textColor="#FFF"
+            android:textSize="20sp"
+            android:textStyle="bold" />
+
+    </LinearLayout>
+
+    <LinearLayout
+        android:id="@+id/bottom_info_view"
+        android:layout_width="match_parent"
+        android:layout_height="200dp"
+
+        android:layout_alignParentBottom="true"
+        android:layout_marginBottom="10dp"
+        android:background="#513400"
+        android:orientation="horizontal">
+
+        <LinearLayout
+            android:layout_width="wrap_content"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:gravity="center"
+                android:text="Threads"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <NumberPicker
+                android:id="@+id/np"
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:layout_marginLeft="10dp"
+                android:theme="@style/AppTheme.Picker"
+                android:visibility="visible" />
+
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/modelLayout"
+            android:layout_width="150dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/modelLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/model"
+                android:layout_width="match_parent"
+                android:layout_height="180dp">
+
+            </ListView>
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/deviceLayout"
+            android:layout_width="140dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView2"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/deviceLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/device"
+                android:layout_width="match_parent"
+                android:layout_height="180dp" />
+
+        </LinearLayout>
+
+    </LinearLayout>
+
+
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/activity_camera.xml b/tensorflow/lite/java/demo/app/src/main/res/layout/activity_camera.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/layout/activity_camera.xml
rename to tensorflow/lite/java/demo/app/src/main/res/layout/activity_camera.xml
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
new file mode 100644
index 0000000000000000000000000000000000000000..f8312cc0f7567a5298e5b0a851f011e4d0d6c0bb
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -0,0 +1,138 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#bb7700">
+
+    <com.example.android.tflitecamerademo.AutoFitTextureView
+        android:id="@+id/texture"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_weight="1" />
+
+    <LinearLayout
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_above="@+id/bottom_info_view"
+        android:layout_alignParentEnd="false"
+        android:layout_alignParentStart="true"
+        android:layout_alignParentTop="false"
+        android:background="#bb7700"
+        android:orientation="vertical"
+        android:weightSum="100">
+
+        <ImageView
+            android:id="@+id/logoview2"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_weight="30"
+            android:scaleType="fitStart"
+            android:src="@drawable/logo" />
+
+        <TextView
+            android:id="@+id/text"
+            android:layout_width="match_parent"
+            android:layout_height="wrap_content"
+            android:layout_alignParentBottom="true"
+            android:layout_alignParentEnd="true"
+            android:layout_alignParentRight="true"
+            android:layout_weight="30"
+            android:textColor="#FFF"
+            android:textSize="20sp"
+            android:textStyle="bold" />
+
+    </LinearLayout>
+
+    <LinearLayout
+        android:id="@+id/bottom_info_view"
+        android:layout_width="match_parent"
+        android:layout_height="200dp"
+
+        android:layout_alignParentBottom="true"
+        android:layout_marginBottom="10dp"
+        android:background="#513400"
+        android:orientation="horizontal">
+
+      <LinearLayout
+            android:layout_width="wrap_content"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+        <TextView
+            android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+            android:gravity="center"
+                android:text="Threads"
+            android:textAlignment="center"
+            android:textColor="@android:color/white" />
+
+        <NumberPicker
+            android:id="@+id/np"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:layout_marginLeft="10dp"
+            android:theme="@style/AppTheme.Picker"
+            android:visibility="visible" />
+
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/modelLayout"
+            android:layout_width="150dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/modelLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/model"
+                android:layout_width="match_parent"
+                android:layout_height="180dp">
+
+            </ListView>
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/deviceLayout"
+            android:layout_width="140dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView2"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/deviceLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/device"
+                android:layout_width="match_parent"
+                android:layout_height="180dp" />
+
+        </LinearLayout>
+
+    </LinearLayout>
+</RelativeLayout>
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout/listview_row.xml b/tensorflow/lite/java/demo/app/src/main/res/layout/listview_row.xml
new file mode 100644
index 0000000000000000000000000000000000000000..349b0f63b4dbae11d21dbb0a58c3cda47299cbf0
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout/listview_row.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="utf-8"?>
+<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+
+    <TextView
+        android:id="@+id/listview_row_text"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_marginRight="2dp"
+        android:background="@drawable/item_selector"
+        android:padding="10dp"
+        android:textSize="18sp"
+        android:textStyle="bold" />
+
+</LinearLayout>
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-dimens.xml b/tensorflow/lite/java/demo/app/src/main/res/values-sw600dp/template-dimens.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-dimens.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values-sw600dp/template-dimens.xml
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-styles.xml b/tensorflow/lite/java/demo/app/src/main/res/values-sw600dp/template-styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-styles.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values-sw600dp/template-styles.xml
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v11/template-styles.xml b/tensorflow/lite/java/demo/app/src/main/res/values-v11/template-styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values-v11/template-styles.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values-v11/template-styles.xml
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-colors.xml b/tensorflow/lite/java/demo/app/src/main/res/values-v21/base-colors.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-colors.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values-v21/base-colors.xml
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-template-styles.xml b/tensorflow/lite/java/demo/app/src/main/res/values-v21/base-template-styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-template-styles.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values-v21/base-template-styles.xml
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml b/tensorflow/lite/java/demo/app/src/main/res/values/base-strings.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values/base-strings.xml
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/colors.xml b/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml
similarity index 82%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values/colors.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values/colors.xml
index 4b75d2b2bda0f95166d0442ebae19cedcad162d8..c30f1dc3ac79a7ef33908a625710f7ac96bfc858 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/colors.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml
@@ -16,4 +16,7 @@
 -->
 <resources>
     <color name="control_background">#cc4285f4</color>
+    <color name="selection_highlight">#aaaaaa</color>
+    <color name="selection_focus">#eeaa55</color>
+    <color name="item_normal">#eeeeee</color>
 </resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml b/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml
similarity index 79%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values/strings.xml
index 29a033bcd437c951ef6e8ba78f4fc3a0fcafac96..8cc88f25652256665acbab2855c60ee1a10293c4 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml
@@ -23,4 +23,11 @@
     <string name="toggle">Use NNAPI</string>
     <string name="tflite">tflite</string>
     <string name="nnapi">NNAPI</string>
+    <string name="gpu">GPU</string>
+    <string name="cpu">CPU</string>
+    <string name="modelLabel">Model</string>
+    <string name="deviceLabel">Device</string>
+    <string name="mobilenetV1Quant">mobilenet v1 quant</string>;
+    <string name="mobilenetV1Float">mobilenet v1 float</string>;;
+
 </resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml b/tensorflow/lite/java/demo/app/src/main/res/values/styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values/styles.xml
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-dimens.xml b/tensorflow/lite/java/demo/app/src/main/res/values/template-dimens.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-dimens.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values/template-dimens.xml
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-styles.xml b/tensorflow/lite/java/demo/app/src/main/res/values/template-styles.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-styles.xml
rename to tensorflow/lite/java/demo/app/src/main/res/values/template-styles.xml
diff --git a/tensorflow/contrib/lite/java/demo/build.gradle b/tensorflow/lite/java/demo/build.gradle
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/build.gradle
rename to tensorflow/lite/java/demo/build.gradle
diff --git a/tensorflow/contrib/lite/java/demo/gradle.properties b/tensorflow/lite/java/demo/gradle.properties
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/gradle.properties
rename to tensorflow/lite/java/demo/gradle.properties
diff --git a/tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.jar b/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.jar
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.jar
rename to tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.jar
diff --git a/tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
rename to tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
diff --git a/tensorflow/contrib/lite/java/demo/gradlew b/tensorflow/lite/java/demo/gradlew
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/gradlew
rename to tensorflow/lite/java/demo/gradlew
diff --git a/tensorflow/contrib/lite/java/demo/gradlew.bat b/tensorflow/lite/java/demo/gradlew.bat
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/gradlew.bat
rename to tensorflow/lite/java/demo/gradlew.bat
diff --git a/tensorflow/contrib/lite/java/demo/settings.gradle b/tensorflow/lite/java/demo/settings.gradle
similarity index 100%
rename from tensorflow/contrib/lite/java/demo/settings.gradle
rename to tensorflow/lite/java/demo/settings.gradle
diff --git a/tensorflow/lite/java/jni/BUILD b/tensorflow/lite/java/jni/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ce17ac4fa0d37cb0b790617c4258ea469d14a664
--- /dev/null
+++ b/tensorflow/lite/java/jni/BUILD
@@ -0,0 +1,47 @@
+package(default_visibility = ["//tensorflow/lite:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+# Helper target for exposing JNI headers across multiple platforms.
+cc_library(
+    name = "jni",
+    hdrs = select({
+        # The Android toolchain makes "jni.h" available in the include path.
+        # For non-Android toolchains, generate jni.h and jni_md.h.
+        "//tensorflow:android": [],
+        "//conditions:default": [
+            ":jni.h",
+            ":jni_md.h",
+        ],
+    }),
+    includes = select({
+        "//tensorflow:android": [],
+        "//conditions:default": ["."],
+    }),
+)
+
+# Silly rules to make
+# #include <jni.h>
+# in the source headers work
+# (in combination with the "includes" attribute of the tf_cuda_library rule
+# above. Not needed when using the Android toolchain).
+#
+# Inspired from:
+# https://github.com/bazelbuild/bazel/blob/f99a0543f8d97339d32075c7176b79f35be84606/src/main/native/BUILD
+# but hopefully there is a simpler alternative to this.
+genrule(
+    name = "copy_jni_h",
+    srcs = ["@bazel_tools//tools/jdk:jni_header"],
+    outs = ["jni.h"],
+    cmd = "cp -f $< $@",
+)
+
+genrule(
+    name = "copy_jni_md_h",
+    srcs = select({
+        "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
+        "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
+    }),
+    outs = ["jni_md.h"],
+    cmd = "cp -f $< $@",
+)
diff --git a/tensorflow/lite/java/ovic/BUILD b/tensorflow/lite/java/ovic/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..774320871eec9afb2fae31824dc021fb7d338e1e
--- /dev/null
+++ b/tensorflow/lite/java/ovic/BUILD
@@ -0,0 +1,132 @@
+# Description:
+# OVIC Benchmarker Java API.
+
+load("@build_bazel_rules_android//android:rules.bzl", "android_library")
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
+
+# Build targets for OVIC classification.
+java_test(
+    name = "OvicClassifierTest",
+    size = "medium",
+    srcs = ["src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+    data = [
+        "//tensorflow/lite/java/ovic/src/testdata:labels.txt",
+        "//tensorflow/lite/java/ovic/src/testdata:ovic_testdata",
+    ],
+    javacopts = JAVACOPTS,
+    tags = ["no_oss"],
+    test_class = "org.tensorflow.ovic.OvicClassifierTest",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/lite/java/ovic:ovicbenchmarkerlib_java",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+java_binary(
+    name = "ovic_validator",
+    srcs = ["src/main/java/org/tensorflow/ovic/OvicValidator.java"],
+    data = [
+        "//tensorflow/lite/java/ovic/src/testdata:labels.txt",
+    ],
+    main_class = "org.tensorflow.ovic.OvicValidator",
+    tags = ["no_oss"],
+    deps = [
+        "//tensorflow/lite/java/ovic:ovicbenchmarkerlib_java",
+        "//tensorflow/lite/java/ovic:ovicdetectionbenchmarkerlib_java",
+    ],
+)
+
+android_library(
+    name = "ovicbenchmarkerlib",
+    srcs = [
+        "src/main/java/org/tensorflow/ovic/OvicBenchmarker.java",
+        "src/main/java/org/tensorflow/ovic/OvicClassificationResult.java",
+        "src/main/java/org/tensorflow/ovic/OvicClassifier.java",
+        "src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java",
+    ],
+    manifest = "//tensorflow/lite/java:AndroidManifest.xml",
+    tags = ["no_oss"],
+    deps = [
+        "//tensorflow/lite/java:tensorflowlite",
+        "//tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
+
+java_library(
+    name = "ovicbenchmarkerlib_java",
+    srcs = [
+        "src/main/java/org/tensorflow/ovic/OvicClassificationResult.java",
+        "src/main/java/org/tensorflow/ovic/OvicClassifier.java",
+    ],
+    javacopts = JAVACOPTS,
+    tags = ["no_oss"],
+    deps = [
+        "//tensorflow/lite/java:libtensorflowlite_jni.so",
+        "//tensorflow/lite/java:tensorflowlite_java",
+        "//tensorflow/lite/java/src/main/native",
+        "//tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
+
+# Build targets for OVIC detection.
+java_test(
+    name = "OvicDetectorTest",
+    size = "medium",
+    srcs = ["src/test/java/org/tensorflow/ovic/OvicDetectorTest.java"],
+    data = [
+        "//tensorflow/lite/java/ovic/src/testdata:coco_labels.txt",
+        "//tensorflow/lite/java/ovic/src/testdata:ovic_testdata",
+    ],
+    javacopts = JAVACOPTS,
+    tags = ["no_oss"],
+    test_class = "org.tensorflow.ovic.OvicDetectorTest",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/lite/java/ovic:ovicdetectionbenchmarkerlib_java",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+android_library(
+    name = "ovicdetectionbenchmarkerlib",
+    srcs = [
+        "src/main/java/org/tensorflow/ovic/BoundingBox.java",
+        "src/main/java/org/tensorflow/ovic/OvicBenchmarker.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetectionResult.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetector.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java",
+    ],
+    manifest = "//tensorflow/lite/java:AndroidManifest.xml",
+    deps = [
+        "//tensorflow/lite/java:tensorflowlite",
+        "//tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
+
+java_library(
+    name = "ovicdetectionbenchmarkerlib_java",
+    srcs = [
+        "src/main/java/org/tensorflow/ovic/BoundingBox.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetectionResult.java",
+        "src/main/java/org/tensorflow/ovic/OvicDetector.java",
+    ],
+    javacopts = JAVACOPTS,
+    deps = [
+        "//tensorflow/lite/java:libtensorflowlite_jni.so",
+        "//tensorflow/lite/java:tensorflowlite_java",
+        "//tensorflow/lite/java/src/main/native",
+        "//tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
diff --git a/tensorflow/lite/java/ovic/README.md b/tensorflow/lite/java/ovic/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..368c486f4f1ddd021e0bcfcdf9d82034ba5db82b
--- /dev/null
+++ b/tensorflow/lite/java/ovic/README.md
@@ -0,0 +1,186 @@
+# OVIC Benchmarker for NIPS 2018
+
+This folder contains the SDK for track one of the [Low Power ImageNet Recognition Challenge workshop at NIPS 2018.](https://lpirc.ecn.purdue.edu/)
+
+## Pre-requisite
+
+Follow the steps [here](https://www.tensorflow.org/lite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK.
+
+## Test the benchmarker:
+
+The testing utilities helps the developers (you) to make sure that your submissions in TfLite format will be processed as expected in the competition's benchmarking system.
+
+Note: for now the tests only provides correctness checks, i.e. classifier predicts the correct category on the test image, but no on-device latency measurements. To test the latency measurement functionality, the tests will print the latency running on a desktop computer, which is not indicative of the on-device run-time.
+We are releasing an benchmarker Apk that would allow developers to measure latency on their own devices.
+
+### Obtain the sample models
+
+The test data (models and images) should be downloaded automatically for you by Bazel. In case they are not, you can manually install them as below.
+
+Note: all commands should be called from your tensorflow installation folder (under this folder you should find `tensorflow/lite`).
+
+
+* Download the [testdata package](https://storage.googleapis.com/download.tensorflow.org/data/ovic_2018_10_23.zip):
+
+```sh
+curl -L https://storage.googleapis.com/download.tensorflow.org/data/ovic_2018_10_23.zip -o /tmp/ovic.zip
+```
+
+* Unzip the package into the testdata folder:
+
+```sh
+unzip -j /tmp/ovic.zip -d tensorflow/lite/java/ovic/src/testdata/
+```
+
+### Run tests
+
+You can run test with Bazel as below. This helps to ensure that the installation is correct.
+
+```sh
+bazel test --cxxopt=--std=c++11 //tensorflow/lite/java/ovic:OvicClassifierTest --cxxopt=-Wno-all --test_output=all
+
+bazel test --cxxopt=--std=c++11 //tensorflow/lite/java/ovic:OvicDetectorTest --cxxopt=-Wno-all --test_output=all
+```
+
+### Test your submissions
+
+Once you have a submission that follows the instructions from the [competition site](https://gdoc.pub/doc/e/2PACX-1vSFTEMAE_N6RgtidT-4DVTje6f6HRJv7Q_zaCab5H66BFyqEiZ8PsUfD_-YmBE7_z67qDiNgk-CJqeE), you can verify it in two ways:
+
+#### Validate using randomly generated images
+
+You can call the validator binary below to verify that your model fits the format requirements. This often helps you to catch size mismatches (e.g. output for classification should be [1, 1001] instead of [1,1,1,1001]). Let say the submission file is located at `/path/to/my_model.lite`, then call:
+
+```sh
+bazel build --cxxopt=--std=c++11 //tensorflow/lite/java/ovic:ovic_validator --cxxopt=-Wno-all
+bazel-bin/tensorflow/lite/java/ovic/ovic_validator /path/to/my_model.lite classify
+```
+
+Successful validation should print the following message to terminal:
+
+```
+Successfully validated /path/to/my_model.lite.
+
+```
+
+To validate detection models, use the same command but provide "detect" as the second argument instead of "classify".
+
+
+#### Test that the model produces sensible outcomes
+
+You can go a step further to verify that the model produces results as expected. This helps you catch bugs during TOCO conversion (e.g. using the wrong mean and std values).
+
+* Move your submission to the testdata folder:
+
+```sh
+cp /path/to/my_model.lite tensorflow/lite/java/ovic/src/testdata/
+```
+
+* Resize the test image to the resolutions that are expected by your submission:
+
+The test images can be found at `tensorflow/lite/java/ovic/src/testdata/test_image_*.jpg`. You may reuse these images if your image resolutions are 128x128 or 224x224.
+
+* Add your model and test image to the BUILD rule at `tensorflow/lite/java/ovic/src/testdata/BUILD`:
+
+```JSON
+filegroup(
+    name = "ovic_testdata",
+    srcs = [
+        "@tflite_ovic_testdata//:detect.lite",
+        "@tflite_ovic_testdata//:float_model.lite",
+        "@tflite_ovic_testdata//:low_res_model.lite",
+        "@tflite_ovic_testdata//:quantized_model.lite",
+        "@tflite_ovic_testdata//:test_image_128.jpg",
+        "@tflite_ovic_testdata//:test_image_224.jpg"
+        "my_model.lite",        # <--- Your submission.
+        "my_test_image.jpg",    # <--- Your test image.
+    ],
+    ...
+```
+
+* For classification models, modify `OvicClassifierTest.java`:
+  * change `TEST_IMAGE_PATH` to `my_test_image.jpg`.
+
+  * change either `FLOAT_MODEL_PATH` or `QUANTIZED_MODEL_PATH` to `my_model.lite` depending on whether your model runs inference in float or [8-bit](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize).
+
+  * change `TEST_IMAGE_GROUNDTRUTH` (ImageNet class ID) to be consistent with your test image.
+
+* For detection models, modify `OvicDetectorTest.java`:
+  * change `TEST_IMAGE_PATH` to `my_test_image.jpg`.
+  * change `MODEL_PATH` to `my_model.lite`.
+  * change `GROUNDTRUTH` (COCO class ID) to be consistent with your test image.
+
+Now you can run the bazel tests to catch any runtime issues with the submission.
+
+Note: Please make sure that your submission passes the test. If a submission fails to pass the test it will not be processed by the submission server.
+
+## Measure on-device latency
+
+We provide two ways to measure the on-device latency of your submission. The first is through our competition server, which is reliable and repeatable, but is limited to a few trials per day. The second is through the benchmarker Apk, which requires a device and may not be as accurate as the server, but has a fast turn-around and no access limitations. We recommend that the participants use the benchmarker apk for early development, and reserve the competition server for evaluating promising submissions.
+
+### Running the benchmarker app
+
+Make sure that you have followed instructions in [Test your submissions](#test-your-submissions) to add your model to the testdata folder and to the corresponding build rules.
+
+Modify `tensorflow/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java`:
+
+* Add your model to the benchmarker apk by changing `modelPath` and `testImagePath` to your submission and test image.
+
+```
+  if (benchmarkClassification) {
+    ...
+    testImagePath = "my_test_image.jpg";
+    modelPath = "my_model.lite";
+  } else {  // Benchmarking detection.
+  ...
+```
+If you are adding a detection model, simply modify `modelPath` and `testImagePath` in the else block above.
+
+* Adjust the benchmark parameters when needed:
+
+You can chnage the length of each experiment, and the processor affinity below. `BIG_CORE_MASK` is an integer whose binary encoding represents the set of used cores. This number is phone-specific. For example, Pixel 2 has 8 cores: the 4 little cores are represented by the 4 less significant bits, and the 4 big cores by the 4 more significant bits. Therefore a mask value of 16, or in binary `00010000`, represents using only the first big core. The mask 32, or in binary `00100000` uses the second big core and should deliver identical results as the mask 16 because the big cores are interchangeable.
+
+```
+  /** Wall time for each benchmarking experiment. */
+  private static final double WALL_TIME = 3000;
+  /** Maximum number of iterations in each benchmarking experiment. */
+  private static final int MAX_ITERATIONS = 100;
+  /** Mask for binding to a single big core. Pixel 1 (4), Pixel 2 (16). */
+  private static final int BIG_CORE_MASK = 16;
+```
+
+Note: You'll need ROOT access to the phone to change processor affinity.
+
+* Build and install the app.
+
+```
+bazel build -c opt --cxxopt=--std=c++11 --cxxopt=-Wno-all //tensorflow/lite/java/ovic/demo/app:ovic_benchmarker_binary
+adb install -r bazel-bin/tensorflow/lite/java/ovic/demo/app/ovic_benchmarker_binary.apk
+```
+
+Start the app and pick a task by clicking either the `CLF` button for classification or the `DET` button for detection. The button should turn bright green, signaling that the experiment is running. The benchmarking results will be displayed after about the `WALL_TIME` you specified above. For example:
+
+```
+my_model.lite: Average latency=158.6ms after 20 runs.
+```
+
+### Sample latencies
+
+Note: the benchmarking results can be quite different depending on the background processes running on the phone. A few things that help stabilize the app's readings are placing the phone on a cooling plate, restarting the phone, and shutting down internet access.
+
+| Classification Model | Pixel 1 latency (ms)  | Pixel 2 latency (ms) |
+| -------------------- |:---------------------:| --------------------:|
+|  float_model.lite    | 120                   | 155                  |
+| quantized_model.lite | 85                    | 74                   |
+|  low_res_model.lite  | 4.2                   | 4.0                  |
+
+
+| Detection Model      | Pixel 2 latency (ms)  |
+| -------------------- |:---------------------:|
+|  detect.lite         | 331                   |
+| quantized_detect.lite| 95                    |
+
+
+Since Pixel 2 has excellent support for 8-bit quantized models, we strongly recommend you to check out the [quantization training tutorial](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize).
+
+The detection models above are both single-shot models (i.e. no object proposal generation) using TfLite's *fast* version of Non-Max-Suppression (NMS). The fast NMS is significant faster than the regular NMS (used by the ObjectDetectionAPI in training) at the expense of about 1% mAP for the listed models.
+
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/AndroidManifest.xml b/tensorflow/lite/java/ovic/demo/app/AndroidManifest.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/app/AndroidManifest.xml
rename to tensorflow/lite/java/ovic/demo/app/AndroidManifest.xml
diff --git a/tensorflow/lite/java/ovic/demo/app/BUILD b/tensorflow/lite/java/ovic/demo/app/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b3548deaf536899c062bd6f363df06949d43e348
--- /dev/null
+++ b/tensorflow/lite/java/ovic/demo/app/BUILD
@@ -0,0 +1,33 @@
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
+
+# Sample app for OVIC benchmarking.
+licenses(["notice"])  # Apache 2.0
+
+android_binary(
+    name = "ovic_benchmarker_binary",
+    srcs = [
+        "OvicBenchmarkerActivity.java",
+    ],
+    aapt_version = "aapt",
+    assets = [
+        "//tensorflow/lite/java/ovic/src/testdata:coco_labels.txt",
+        "//tensorflow/lite/java/ovic/src/testdata:labels.txt",
+        "//tensorflow/lite/java/ovic/src/testdata:ovic_testdata",
+    ],
+    assets_dir = "",
+    custom_package = "ovic.demo.app",
+    manifest = "AndroidManifest.xml",
+    nocompress_extensions = [
+        ".lite",
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    tags = ["manual"],
+    deps = [
+        "//tensorflow/lite/java:tensorflowlite",
+        "//tensorflow/lite/java/ovic:ovicbenchmarkerlib",
+        "//tensorflow/lite/java/ovic:ovicdetectionbenchmarkerlib",
+        "@androidsdk//com.android.support:support-v13-25.2.0",
+        "@androidsdk//com.android.support:support-v4-25.2.0",
+    ],
+)
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java b/tensorflow/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
similarity index 99%
rename from tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
rename to tensorflow/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
index 48c29ecebeed42ac9a2e0bc801cab1fb1f9201e8..144530390ff89558dc961cece3474c4bfd679551 100644
--- a/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
+++ b/tensorflow/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java
@@ -95,7 +95,7 @@ public class OvicBenchmarkerActivity extends Activity {
       benchmarker = new OvicDetectorBenchmarker(WALL_TIME);
       labelPath = "coco_labels.txt";
       testImagePath = "test_image_224.jpg";
-      modelPath = "detect.tflite";
+      modelPath = "detect.lite";
     }
     AssetManager am = getAssets();
     AssetFileDescriptor fileDescriptor = am.openFd(modelPath);
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/build.gradle b/tensorflow/lite/java/ovic/demo/app/build.gradle
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/app/build.gradle
rename to tensorflow/lite/java/ovic/demo/app/build.gradle
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png b/tensorflow/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png
rename to tensorflow/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png b/tensorflow/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png
rename to tensorflow/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable/start_button_color.xml b/tensorflow/lite/java/ovic/demo/app/res/drawable/start_button_color.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/app/res/drawable/start_button_color.xml
rename to tensorflow/lite/java/ovic/demo/app/res/drawable/start_button_color.xml
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml b/tensorflow/lite/java/ovic/demo/app/res/layout/activity_main.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml
rename to tensorflow/lite/java/ovic/demo/app/res/layout/activity_main.xml
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/values/dimens.xml b/tensorflow/lite/java/ovic/demo/app/res/values/dimens.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/app/res/values/dimens.xml
rename to tensorflow/lite/java/ovic/demo/app/res/values/dimens.xml
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/values/strings.xml b/tensorflow/lite/java/ovic/demo/app/res/values/strings.xml
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/app/res/values/strings.xml
rename to tensorflow/lite/java/ovic/demo/app/res/values/strings.xml
diff --git a/tensorflow/contrib/lite/java/ovic/demo/build.gradle b/tensorflow/lite/java/ovic/demo/build.gradle
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/build.gradle
rename to tensorflow/lite/java/ovic/demo/build.gradle
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradle.properties b/tensorflow/lite/java/ovic/demo/gradle.properties
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/gradle.properties
rename to tensorflow/lite/java/ovic/demo/gradle.properties
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.jar b/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.jar
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.jar
rename to tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.jar
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
rename to tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradlew b/tensorflow/lite/java/ovic/demo/gradlew
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/gradlew
rename to tensorflow/lite/java/ovic/demo/gradlew
diff --git a/tensorflow/contrib/lite/java/ovic/demo/gradlew.bat b/tensorflow/lite/java/ovic/demo/gradlew.bat
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/gradlew.bat
rename to tensorflow/lite/java/ovic/demo/gradlew.bat
diff --git a/tensorflow/contrib/lite/java/ovic/demo/settings.gradle b/tensorflow/lite/java/ovic/demo/settings.gradle
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/demo/settings.gradle
rename to tensorflow/lite/java/ovic/demo/settings.gradle
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/BoundingBox.java b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/BoundingBox.java
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/BoundingBox.java
rename to tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/BoundingBox.java
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
similarity index 84%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
rename to tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
index 15d9511f509c71f840345c5bee36d2b1015d9354..32bdd5a97a716736b34fa37b93d562f8b2d78623 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
+++ b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java
@@ -47,13 +47,6 @@ public abstract class OvicBenchmarker {
   protected int imgHeight = 224;
   protected int imgWidth = 224;
 
-  /** Preprocess parameters (only used when input is float). */
-  protected static final float IMAGE_MEAN = 127.5f;
-  protected static final float IMAGE_STD = 127.5f;
-
-  /** Whether input is float or quantized. */
-  protected Boolean quantizedInput = null;
-
   /* Preallocated buffers for storing image data in. */
   protected int[] intValues = null;
 
@@ -131,7 +124,7 @@ public abstract class OvicBenchmarker {
    * Input buffer must be loaded in intValues and output will be placed in imgData.
   */
   protected void loadsInputToByteBuffer() {
-    if (imgData == null || intValues == null || quantizedInput == null) {
+    if (imgData == null || intValues == null) {
       throw new RuntimeException("Benchmarker is not yet ready to test.");
     }
     // Convert the image to ByteBuffer.
@@ -142,17 +135,9 @@ public abstract class OvicBenchmarker {
     for (int i = 0; i < imgHeight; ++i) {
       for (int j = 0; j < imgWidth; ++j) {
         final int pixelValue = intValues[pixel++];
-        if (quantizedInput) {
-          // Quantized model
-          imgData.put((byte) ((pixelValue >> 16) & 0xFF));
-          imgData.put((byte) ((pixelValue >> 8) & 0xFF));
-          imgData.put((byte) (pixelValue & 0xFF));
-        } else {
-          // Float model
-          imgData.putFloat((((pixelValue >> 16) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
-          imgData.putFloat((((pixelValue >> 8) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
-          imgData.putFloat(((pixelValue & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
-        }
+        imgData.put((byte) ((pixelValue >> 16) & 0xFF));
+        imgData.put((byte) ((pixelValue >> 8) & 0xFF));
+        imgData.put((byte) (pixelValue & 0xFF));
       }
     }
     long endTime = SystemClock.uptimeMillis();
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassificationResult.java b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassificationResult.java
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassificationResult.java
rename to tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassificationResult.java
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
rename to tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java
similarity index 99%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java
rename to tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java
index 0cdd0f7bec83b3c47fde08449f596948c261fe79..b35b8ff2c34bc0f246394d397ffae322ce38581c 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java
+++ b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicClassifierBenchmarker.java
@@ -57,7 +57,6 @@ public final class OvicClassifierBenchmarker extends OvicBenchmarker {
       int [] inputDims = classifier.getInputDims();
       imgHeight = inputDims[1];
       imgWidth = inputDims[2];
-      quantizedInput = true;
       // Only accept QUANTIZED_UINT8 input.
       imgData = ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE);
       imgData.order(ByteOrder.nativeOrder());
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectionResult.java b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectionResult.java
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectionResult.java
rename to tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectionResult.java
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java
similarity index 88%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java
rename to tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java
index 56836a79e5b3e4af3a3727aebe38c10ca9bcfcf7..84c9816d2b1a0b48e3f3925639ad7b83ab97051c 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java
+++ b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetector.java
@@ -40,11 +40,8 @@ public class OvicDetector implements AutoCloseable {
   /** Labels corresponding to the output of the vision model. */
   private final List<String> labelList;
 
-  /** Define the output format. */
-  private final Boolean inputIsFloat;
-
   /** Number of detections per image. 10 for demo, 100 for the actual competition. */
-  private static final int NUM_RESULTS = 10;
+  private static final int NUM_RESULTS = 100;
 
   /** The output arrays for the mobilenet SSD. */
   private float[][][] outputLocations;
@@ -66,7 +63,9 @@ public class OvicDetector implements AutoCloseable {
     // Create the TfLite interpreter.
     tflite = new Interpreter(model, new Interpreter.Options().setNumThreads(1));
     inputDims = TestHelper.getInputDims(tflite, 0);
-    inputIsFloat = TestHelper.getInputDataType(tflite, 0).equals("float");
+    if (TestHelper.getInputDataType(tflite, 0).equals("float")) {
+      throw new RuntimeException("The model's input must be QUANTIZED_UINT8.");
+    }
     if (inputDims.length != 4) {
       throw new RuntimeException("The model's input dimensions must be 4 (BWHC).");
     }
@@ -102,10 +101,6 @@ public class OvicDetector implements AutoCloseable {
     result = new OvicDetectionResult(NUM_RESULTS);
   }
 
-  public Boolean quantizedInput() {
-    return !inputIsFloat;
-  }
-
   /** Reads label list from Assets. */
   private static List<String> loadLabelList(InputStream labelInputStream) throws IOException {
     List<String> labelList = new ArrayList<>();
@@ -132,9 +127,6 @@ public class OvicDetector implements AutoCloseable {
     if (tflite == null) {
       throw new RuntimeException(TAG + ": Detector has not been initialized; Failed.");
     }
-    if (inputIsFloat == null) {
-      throw new RuntimeException(TAG + ": Detector input type has not been resolved.");
-    }
 
     Object[] inputArray = {imgData};
     tflite.runForMultipleInputsOutputs(inputArray, outputMap);
@@ -144,12 +136,17 @@ public class OvicDetector implements AutoCloseable {
     // Update the results.
     result.resetTo(latency, imageId);
     for (int i = 0; i < NUM_RESULTS; i++) {
-      result.addBox(outputLocations[0][i][1] * inputDims[1],
-              outputLocations[0][i][0] * inputDims[1],
-              outputLocations[0][i][3] * inputDims[2],
-              outputLocations[0][i][2] * inputDims[2],
-              Math.round(outputClasses[0][i] + 1 /* Label offset */),
-              outputScores[0][i]);
+      // The model returns normalized coordinates [start_y, start_x, end_y, end_x].
+      // The boxes expect pixel coordinates [x1, y1, x2, y2].
+      // The height and width of the input are in inputDims[1] and inputDims[2].
+      // The following command converts between model outputs to bounding boxes.
+      result.addBox(
+          outputLocations[0][i][1] * inputDims[2],
+          outputLocations[0][i][0] * inputDims[1],
+          outputLocations[0][i][3] * inputDims[2],
+          outputLocations[0][i][2] * inputDims[1],
+          Math.round(outputClasses[0][i] + 1 /* Label offset */),
+          outputScores[0][i]);
     }
     return true;  // Marks that the result is available.
   }
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java
similarity index 93%
rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java
rename to tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java
index 1a4e193ff23f242c83bb8c2d7a894038b32c049a..15a4c988123f1c75c304685a2000c9274a645354 100644
--- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java
+++ b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicDetectorBenchmarker.java
@@ -62,16 +62,10 @@ public final class OvicDetectorBenchmarker extends OvicBenchmarker {
     try {
       Log.i(TAG, "Creating detector.");
       detector = new OvicDetector(labelInputStream, model);
-      quantizedInput = detector.quantizedInput();
       int[] inputDims = detector.getInputDims();
       imgHeight = inputDims[1];
       imgWidth = inputDims[2];
-      if (quantizedInput) {
-        imgData = ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE);
-      } else {
-        imgData =
-            ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE * 4);
-      }
+      imgData = ByteBuffer.allocateDirect(DIM_BATCH_SIZE * imgHeight * imgWidth * DIM_PIXEL_SIZE);
       imgData.order(ByteOrder.nativeOrder());
       intValues = new int[imgHeight * imgWidth];
       benchmarkStarted = false;
diff --git a/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java
new file mode 100644
index 0000000000000000000000000000000000000000..0a7aee043271b822bd3a684955f334b7edc2ad64
--- /dev/null
+++ b/tensorflow/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicValidator.java
@@ -0,0 +1,103 @@
+/*Copyright 2018 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.ovic;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.Random;
+
+/** Validate a submission model. */
+public class OvicValidator {
+  private static void printUsage(PrintStream s) {
+    s.println("Java program that validates a submission model.");
+    s.println();
+    s.println("Usage: ovic_validator <submission file> [<task>]");
+    s.println();
+    s.println("Where:");
+    s.println("<submission file> is the model in TfLite format,");
+    s.println("<task> is the type of the task: \"classify\" (default) or \"detect\";");
+  }
+
+  public static void main(String[] args) {
+    if (args.length != 2) {
+      printUsage(System.err);
+      System.exit(1);
+    }
+    final String modelFile = args[0];
+    final String taskString = args[1];
+    final boolean isDetection = taskString.equals("detect");
+    // Label file for detection is never used, so the same label file is used for both tasks.
+    final String labelPath =
+        "tensorflow/lite/java/ovic/src/testdata/labels.txt";
+
+    try {
+      MappedByteBuffer model = loadModelFile(modelFile);
+      File labelsfile = new File(labelPath);
+      InputStream labelsInputStream = new FileInputStream(labelsfile);
+
+      if (isDetection) {
+        OvicDetector detector = new OvicDetector(labelsInputStream, model);
+        int[] inputDims = detector.getInputDims();
+        ByteBuffer imgData = createByteBuffer(inputDims[1], inputDims[2]);
+        if (!detector.detectByteBuffer(imgData, /*imageId=*/ 0)) {
+          throw new RuntimeException("Failed to return detections.");
+        }
+      } else {
+        OvicClassifier classifier = new OvicClassifier(labelsInputStream, model);
+        int[] inputDims = classifier.getInputDims();
+        ByteBuffer imgData = createByteBuffer(inputDims[1], inputDims[2]);
+        OvicClassificationResult testResult = classifier.classifyByteBuffer(imgData);
+        if (testResult.topKClasses.isEmpty()) {
+          throw new RuntimeException("Failed to return top K predictions.");
+        }
+      }
+      System.out.printf("Successfully validated %s.%n", modelFile);
+    } catch (Exception e) {
+      System.out.println(e.getMessage());
+      System.out.printf("Failed to validate %s.%n", modelFile);
+    }
+  }
+
+  private static ByteBuffer createByteBuffer(int imgWidth, int imgHeight) {
+    ByteBuffer imgData = ByteBuffer.allocateDirect(imgHeight * imgWidth * 3);
+    imgData.order(ByteOrder.nativeOrder());
+    Random rand = new Random();
+    for (int y = 0; y < imgHeight; y++) {
+      for (int x = 0; x < imgWidth; x++) {
+        int val = rand.nextInt();
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    return imgData;
+  }
+
+  private static MappedByteBuffer loadModelFile(String modelFilePath) throws IOException {
+    File modelfile = new File(modelFilePath);
+    FileInputStream inputStream = new FileInputStream(modelfile);
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = 0L;
+    long declaredLength = fileChannel.size();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+}
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java b/tensorflow/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
similarity index 98%
rename from tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
rename to tensorflow/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
index 99e874ca786a22f55ca1d50b01887ab7bb487f1f..c309c5bd55114ba1d0dcdfc1d8c9129f467c46ba 100644
--- a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
+++ b/tensorflow/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
@@ -45,7 +45,7 @@ public final class OvicClassifierTest {
   private ByteBuffer lowResTestImage = null;
   private OvicClassificationResult testResult = null;
   private static final String LABELS_PATH =
-      "tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
+      "tensorflow/lite/java/ovic/src/testdata/labels.txt";
   private static final String QUANTIZED_MODEL_PATH =
       "external/tflite_ovic_testdata/quantized_model.lite";
   private static final String LOW_RES_MODEL_PATH =
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java b/tensorflow/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java
similarity index 79%
rename from tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java
rename to tensorflow/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java
index 4681e260524406dca74704eeb1394c698c386132..709f8fb5c3293366fe6f6dd26c6779fb422a8a16 100644
--- a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java
+++ b/tensorflow/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicDetectorTest.java
@@ -40,14 +40,10 @@ public final class OvicDetectorTest {
   private MappedByteBuffer model = null;
   private ByteBuffer testImage = null;
 
-  private static final float IMAGE_MEAN = 128f;
-  private static final float IMAGE_STD = 128f;
-
-  private Boolean quantizedInput = null;
   private static final String LABELS_PATH =
-      "tensorflow/contrib/lite/java/ovic/src/testdata/coco_labels.txt";
+      "tensorflow/lite/java/ovic/src/testdata/coco_labels.txt";
   private static final String MODEL_PATH =
-      "external/tflite_mobilenet_ssd_quant/detect.tflite";
+      "external/tflite_ovic_testdata/quantized_detect.lite";
   private static final String TEST_IMAGE_PATH =
       "external/tflite_ovic_testdata/test_image_224.jpg";
   private static final int GROUNDTRUTH = 1 /* Person */;
@@ -64,7 +60,6 @@ public final class OvicDetectorTest {
 
       // Create detector.
       detector = new OvicDetector(labelsInputStream, model);
-      quantizedInput = detector.quantizedInput();
 
       // Load test image and convert into byte buffer.
       File imageFile = new File(TEST_IMAGE_PATH);
@@ -91,28 +86,15 @@ public final class OvicDetectorTest {
     return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
   }
 
-  private ByteBuffer toByteBuffer(BufferedImage image) {
-    ByteBuffer imgData;
-    if (quantizedInput) {
-      imgData = ByteBuffer.allocateDirect(image.getHeight() * image.getWidth() * 3);
-    } else {
-      imgData = ByteBuffer.allocateDirect(image.getHeight() * image.getWidth() * 12);
-    }
+  private static ByteBuffer toByteBuffer(BufferedImage image) {
+    ByteBuffer imgData = ByteBuffer.allocateDirect(image.getHeight() * image.getWidth() * 3);
     imgData.order(ByteOrder.nativeOrder());
     for (int y = 0; y < image.getHeight(); y++) {
       for (int x = 0; x < image.getWidth(); x++) {
         int pixelValue = image.getRGB(x, y);
-        if (quantizedInput) {
-          // Quantized model
-          imgData.put((byte) ((pixelValue >> 16) & 0xFF));
-          imgData.put((byte) ((pixelValue >> 8) & 0xFF));
-          imgData.put((byte) (pixelValue & 0xFF));
-        } else {
-          // Float model
-          imgData.putFloat((((pixelValue >> 16) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
-          imgData.putFloat((((pixelValue >> 8) & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
-          imgData.putFloat(((pixelValue & 0xFF) - IMAGE_MEAN) / IMAGE_STD);
-        }
+        imgData.put((byte) ((pixelValue >> 16) & 0xFF));
+        imgData.put((byte) ((pixelValue >> 8) & 0xFF));
+        imgData.put((byte) (pixelValue & 0xFF));
       }
     }
     return imgData;
diff --git a/tensorflow/lite/java/ovic/src/testdata/BUILD b/tensorflow/lite/java/ovic/src/testdata/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..53f382f7c18588bf6904a7584f5902cd77ce18f6
--- /dev/null
+++ b/tensorflow/lite/java/ovic/src/testdata/BUILD
@@ -0,0 +1,24 @@
+# Testdata for OVIC benchmarker demo App and tests.
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "ovic_testdata",
+    srcs = [
+        "@tflite_ovic_testdata//:detect.lite",
+        "@tflite_ovic_testdata//:float_model.lite",
+        "@tflite_ovic_testdata//:low_res_model.lite",
+        "@tflite_ovic_testdata//:quantized_detect.lite",
+        "@tflite_ovic_testdata//:quantized_model.lite",
+        "@tflite_ovic_testdata//:test_image_128.jpg",
+        "@tflite_ovic_testdata//:test_image_224.jpg",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+exports_files(
+    [
+        "labels.txt",
+        "coco_labels.txt",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/lite/java/ovic/src/testdata/coco_labels.txt b/tensorflow/lite/java/ovic/src/testdata/coco_labels.txt
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/src/testdata/coco_labels.txt
rename to tensorflow/lite/java/ovic/src/testdata/coco_labels.txt
diff --git a/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt b/tensorflow/lite/java/ovic/src/testdata/labels.txt
similarity index 100%
rename from tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt
rename to tensorflow/lite/java/ovic/src/testdata/labels.txt
diff --git a/tensorflow/contrib/lite/java/proguard.flags b/tensorflow/lite/java/proguard.flags
similarity index 100%
rename from tensorflow/contrib/lite/java/proguard.flags
rename to tensorflow/lite/java/proguard.flags
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/DataType.java
similarity index 95%
rename from tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
rename to tensorflow/lite/java/src/main/java/org/tensorflow/lite/DataType.java
index 41093e8ffe6407d31659c51e13717ef67014dec5..bd47574f71b28989378eb50faab40e64e543bd1c 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/DataType.java
@@ -27,7 +27,10 @@ public enum DataType {
   UINT8(3),
 
   /** 64-bit signed integer. */
-  INT64(4);
+  INT64(4),
+
+  /** Strings. */
+  STRING(5);
 
   private final int value;
 
@@ -46,6 +49,8 @@ public enum DataType {
         return 1;
       case INT64:
         return 8;
+      case STRING:
+        return -1;
     }
     throw new IllegalArgumentException(
         "DataType error: DataType " + this + " is not supported yet");
@@ -82,6 +87,8 @@ public enum DataType {
         return "byte";
       case INT64:
         return "long";
+      case STRING:
+        return "string";
     }
     throw new IllegalArgumentException(
         "DataType error: DataType " + this + " is not supported yet");
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Delegate.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Delegate.java
new file mode 100644
index 0000000000000000000000000000000000000000..5a57734024e239d5619b802526e2f76e2309fa2e
--- /dev/null
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Delegate.java
@@ -0,0 +1,39 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+/**
+ * Wrapper for a native TensorFlow Lite Delegate.
+ *
+ * <p>WARNING: This is an experimental interface that is subject to change.
+ *
+ * <p>If a delegate implementation holds additional resources or memory that should be explicitly
+ * freed, then best practice is to add a {@code close()} method to the implementation and have the
+ * client call that explicitly when the delegate instance is no longer in use. While this approach
+ * technically allows sharing of a single delegate instance across multiple interpreter instances,
+ * the delegate implementation must explicitly support this.
+ */
+public interface Delegate {
+  /**
+   * Returns a native handle to the TensorFlow Lite delegate implementation.
+   *
+   * <p>Note: The Java {@link Delegate} maintains ownership of the native delegate instance, and
+   * must ensure its existence for the duration of usage with any {@link Interpreter}.
+   *
+   * @return The native delegate handle.
+   */
+  public long getNativeHandle();
+}
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
similarity index 84%
rename from tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
rename to tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index 5cc6e754f3380331a85714c2ed69f8c8e49ba4dc..2203d5fbdb260aaf2bf826343343426a5015e889 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -18,7 +18,9 @@ package org.tensorflow.lite;
 import java.io.File;
 import java.nio.ByteBuffer;
 import java.nio.MappedByteBuffer;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import org.checkerframework.checker.nullness.qual.NonNull;
 
@@ -41,15 +43,34 @@ import org.checkerframework.checker.nullness.qual.NonNull;
  * <pre>{@code
  * Object[] inputs = {input0, input1, ...};
  * Map<Integer, Object> map_of_indices_to_outputs = new HashMap<>();
- * float[][][] ith_output = new float[3][2][4];
+ * ByteBuffer ith_output = ByteBuffer.allocateDirect(3 * 2 * 4 * 4);  // Float tensor, shape 3x2x4.
+ * ith_output.order(ByteOrder.nativeOrder());
  * map_of_indices_to_outputs.put(i, ith_output);
  * try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
  *   interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
  * }
  * }</pre>
  *
+ * <p>If a model takes or produces string tensors:
+ *
+ * <pre>{@code
+ * String[] input = {"foo", "bar"};  // Input tensor shape is [2].
+ * String[] output = new String[3][2];  // Output tensor shape is [3, 2].
+ * try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
+ *   interpreter.runForMultipleInputsOutputs(input, output);
+ * }
+ * }</pre>
+ *
  * <p>Orders of inputs and outputs are determined when converting TensorFlow model to TensorFlowLite
- * model with Toco.
+ * model with Toco, as are the default shapes of the inputs.
+ *
+ * <p>When inputs are provided as (multi-dimensional) arrays, the corresponding input tensor(s) will
+ * be implicitly resized according to that array's shape. When inputs are provided as {@link
+ * ByteBuffer} types, no implicit resizing is done; the caller must ensure that the {@link
+ * ByteBuffer} byte size either matches that of the corresponding tensor, or that they first resize
+ * the tensor via {@link #resizeInput()}. Tensor shape and type information can be obtained via the
+ * {@link Tensor} class, available via {@link #getInputTensor(int)} and {@link
+ * #getOutputTensor(int)}.
  *
  * <p><b>WARNING:</b>Instances of a {@code Interpreter} is <b>not</b> thread-safe. A {@code
  * Interpreter} owns resources that <b>must</b> be explicitly freed by invoking {@link #close()}
@@ -85,9 +106,20 @@ public final class Interpreter implements AutoCloseable {
       return this;
     }
 
+    /**
+     * Adds a {@link Delegate} to be applied during interpreter creation.
+     *
+     * <p>WARNING: This is an experimental interface that is subject to change.
+     */
+    public Options addDelegate(Delegate delegate) {
+      delegates.add(delegate);
+      return this;
+    }
+
     int numThreads = -1;
     boolean useNNAPI = false;
     boolean allowFp16PrecisionForFp32 = false;
+    final List<Delegate> delegates = new ArrayList<>();
   }
 
   /**
@@ -179,12 +211,13 @@ public final class Interpreter implements AutoCloseable {
    * Runs model inference if the model takes only one input, and provides only one output.
    *
    * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
-   * consider using {@link ByteBuffer} to feed input data for better performance.
+   * consider using {@link ByteBuffer} to feed primitive input data for better performance.
    *
    * @param input an array or multidimensional array, or a {@link ByteBuffer} of primitive types
    *     including int, float, long, and byte. {@link ByteBuffer} is the preferred way to pass large
-   *     input data. When {@link ByteBuffer} is used, its content should remain unchanged until
-   *     model inference is done.
+   *     input data for primitive types, whereas string types require using the (multi-dimensional)
+   *     array input path. When {@link ByteBuffer} is used, its content should remain unchanged
+   *     until model inference is done.
    * @param output a multidimensional array of output data, or a {@link ByteBuffer} of primitive
    *     types including int, float, long, and byte.
    */
@@ -199,13 +232,14 @@ public final class Interpreter implements AutoCloseable {
    * Runs model inference if the model takes multiple inputs, or returns multiple outputs.
    *
    * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
-   * consider using {@link ByteBuffer} to feed input data for better performance.
+   * consider using {@link ByteBuffer} to feed primitive input data for better performance.
    *
    * @param inputs an array of input data. The inputs should be in the same order as inputs of the
    *     model. Each input can be an array or multidimensional array, or a {@link ByteBuffer} of
    *     primitive types including int, float, long, and byte. {@link ByteBuffer} is the preferred
-   *     way to pass large input data. When {@link ByteBuffer} is used, its content should remain
-   *     unchanged until model inference is done.
+   *     way to pass large input data, whereas string types require using the (multi-dimensional)
+   *     array input path. When {@link ByteBuffer} is used, its content should remain unchanged
+   *     until model inference is done.
    * @param outputs a map mapping output indices to multidimensional arrays of output data or {@link
    *     ByteBuffer}s of primitive types including int, float, long, and byte. It only needs to keep
    *     entries for the outputs to be used.
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
similarity index 88%
rename from tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
rename to tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index 6f03e7853a4654408b040534e4bbb6756f8b0b01..1952db0267bb7b26f24d819a69f9f312caf776ac 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -18,7 +18,9 @@ package org.tensorflow.lite;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 /**
@@ -35,18 +37,9 @@ final class NativeInterpreterWrapper implements AutoCloseable {
   }
 
   NativeInterpreterWrapper(String modelPath, Interpreter.Options options) {
-    if (options == null) {
-      options = new Interpreter.Options();
-    }
-    errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
-    modelHandle = createModel(modelPath, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle, options.numThreads);
-    isMemoryAllocated = true;
-    inputTensors = new Tensor[getInputCount(interpreterHandle)];
-    outputTensors = new Tensor[getOutputCount(interpreterHandle)];
-    if (options.allowFp16PrecisionForFp32) {
-      setAllowFp16PrecisionForFp32(options.allowFp16PrecisionForFp32);
-    }
+    long errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
+    long modelHandle = createModel(modelPath, errorHandle);
+    init(errorHandle, modelHandle, options);
   }
 
   NativeInterpreterWrapper(ByteBuffer byteBuffer) {
@@ -54,9 +47,6 @@ final class NativeInterpreterWrapper implements AutoCloseable {
   }
 
   NativeInterpreterWrapper(ByteBuffer buffer, Interpreter.Options options) {
-    if (options == null) {
-      options = new Interpreter.Options();
-    }
     if (buffer == null
         || (!(buffer instanceof MappedByteBuffer)
             && (!buffer.isDirect() || buffer.order() != ByteOrder.nativeOrder()))) {
@@ -64,19 +54,33 @@ final class NativeInterpreterWrapper implements AutoCloseable {
           "Model ByteBuffer should be either a MappedByteBuffer of the model file, or a direct "
               + "ByteBuffer using ByteOrder.nativeOrder() which contains bytes of model content.");
     }
-    modelByteBuffer = buffer;
-    errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
-    modelHandle = createModelWithBuffer(modelByteBuffer, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle, options.numThreads);
-    isMemoryAllocated = true;
-    inputTensors = new Tensor[getInputCount(interpreterHandle)];
-    outputTensors = new Tensor[getOutputCount(interpreterHandle)];
+    this.modelByteBuffer = buffer;
+    long errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
+    long modelHandle = createModelWithBuffer(modelByteBuffer, errorHandle);
+    init(errorHandle, modelHandle, options);
+  }
+
+  private void init(long errorHandle, long modelHandle, Interpreter.Options options) {
+    if (options == null) {
+      options = new Interpreter.Options();
+    }
+    this.errorHandle = errorHandle;
+    this.modelHandle = modelHandle;
+    this.interpreterHandle = createInterpreter(modelHandle, errorHandle, options.numThreads);
+    this.inputTensors = new Tensor[getInputCount(interpreterHandle)];
+    this.outputTensors = new Tensor[getOutputCount(interpreterHandle)];
     if (options.useNNAPI) {
       setUseNNAPI(options.useNNAPI);
     }
     if (options.allowFp16PrecisionForFp32) {
       setAllowFp16PrecisionForFp32(options.allowFp16PrecisionForFp32);
     }
+    for (Delegate delegate : options.delegates) {
+      applyDelegate(interpreterHandle, errorHandle, delegate.getNativeHandle());
+      delegates.add(delegate);
+    }
+    allocateTensors(interpreterHandle, errorHandle);
+    this.isMemoryAllocated = true;
   }
 
   /** Releases resources associated with this {@code NativeInterpreterWrapper}. */
@@ -103,6 +107,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     inputsIndexes = null;
     outputsIndexes = null;
     isMemoryAllocated = false;
+    delegates.clear();
   }
 
   /** Sets inputs, runs model inference and returns outputs. */
@@ -278,7 +283,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   /** Gets the number of output tensors. */
   int getOutputTensorCount() {
-    return inputTensors.length;
+    return outputTensors.length;
   }
 
   /**
@@ -322,11 +327,15 @@ final class NativeInterpreterWrapper implements AutoCloseable {
   private Map<String, Integer> outputsIndexes;
 
   // Lazily constructed and populated arrays of input and output Tensor wrappers.
-  private final Tensor[] inputTensors;
-  private final Tensor[] outputTensors;
+  private Tensor[] inputTensors;
+  private Tensor[] outputTensors;
 
   private boolean isMemoryAllocated = false;
 
+  // As the Java Delegate owns the native delegate instance, we keep a strong ref to any injected
+  // delegates for safety.
+  private final List<Delegate> delegates = new ArrayList<>();
+
   private static native long allocateTensors(long interpreterHandle, long errorHandle);
 
   private static native int getInputTensorIndex(long interpreterHandle, int inputIdx);
@@ -355,6 +364,9 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native long createInterpreter(long modelHandle, long errorHandle, int numThreads);
 
+  private static native void applyDelegate(
+      long interpreterHandle, long errorHandle, long delegateHandle);
+
   private static native void delete(long errorHandle, long modelHandle, long interpreterHandle);
 
   static {
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
similarity index 99%
rename from tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
rename to tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index 6ca47aa3edff34ba312754f4cd769e1bebaf4d27..7aa24b4198a110f68680c0f8ec2a527b23c5e1bc 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -162,6 +162,8 @@ public final class Tensor {
         return DataType.UINT8;
       } else if (long.class.equals(c)) {
         return DataType.INT64;
+      } else if (String.class.equals(c)) {
+        return DataType.STRING;
       }
     }
     throw new IllegalArgumentException(
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java
new file mode 100644
index 0000000000000000000000000000000000000000..deded1018285f66f848dfb796e3d90506ef37c3f
--- /dev/null
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+/** Static utility methods loading the TensorFlowLite runtime. */
+public final class TensorFlowLite {
+
+  private static final String PRIMARY_LIBNAME = "tensorflowlite_jni";
+  private static final String FALLBACK_LIBNAME = "tensorflowlite_flex_jni";
+
+  private TensorFlowLite() {}
+
+  /** Returns the version of the underlying TensorFlowLite runtime. */
+  public static native String version();
+
+  /**
+   * Initialize tensorflow's libraries. This will throw an exception if used when TensorFlow isn't
+   * linked in.
+   */
+  static native void initTensorFlow();
+
+  /**
+   * Load the TensorFlowLite runtime C library.
+   */
+  static boolean init() {
+    Throwable primaryLibException;
+    try {
+      System.loadLibrary(PRIMARY_LIBNAME);
+      return true;
+    } catch (UnsatisfiedLinkError e) {
+      primaryLibException = e;
+    }
+
+    try {
+      System.loadLibrary(FALLBACK_LIBNAME);
+      return true;
+    } catch (UnsatisfiedLinkError e) {
+      // If the fallback fails, log the error for the primary load instead.
+      System.err.println(
+          "TensorFlowLite: failed to load native library: " + primaryLibException.getMessage());
+    }
+
+    return false;
+  }
+
+  static {
+    init();
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/package-info.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/package-info.java
similarity index 100%
rename from tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/package-info.java
rename to tensorflow/lite/java/src/main/java/org/tensorflow/lite/package-info.java
diff --git a/tensorflow/lite/java/src/main/native/BUILD b/tensorflow/lite/java/src/main/native/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..52194e86db32a259ca1fe640ca72d42010ba1a44
--- /dev/null
+++ b/tensorflow/lite/java/src/main/native/BUILD
@@ -0,0 +1,76 @@
+# Description:
+# Java Native Interface (JNI) library intended for implementing the
+# TensorFlow Lite Java API using the TensorFlow Lite CC library.
+
+package(default_visibility = ["//visibility:public"])
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "native_framework_only",
+    srcs = [
+        "exception_jni.cc",
+        "nativeinterpreterwrapper_jni.cc",
+        "tensor_jni.cc",
+        "tensorflow_lite_jni.cc",
+    ],
+    hdrs = [
+        "exception_jni.h",
+        "nativeinterpreterwrapper_jni.h",
+        "tensor_jni.h",
+        "tensorflow_lite_jni.h",
+    ],
+    copts = tflite_copts(),
+    linkopts = [
+        "-lm",
+        "-ldl",
+    ],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/java/jni",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "init_tensorflow",
+    srcs = [
+        "init_tensorflow_jni.cc",
+    ],
+    hdrs = [
+        "init_tensorflow_jni.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite/java/jni",
+        "//tensorflow/lite/testing:init_tensorflow",
+    ],
+    alwayslink = 1,
+)
+
+# This includes all ops. If you want a smaller binary, you should copy and
+# modify builtin_ops_jni.cc.  You should then link your binary against both
+# ":native_framework_only" and your own version of ":native_builtin_ops".
+cc_library(
+    name = "native",
+    srcs = [
+        "builtin_ops_jni.cc",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":native_framework_only",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ],
+    alwayslink = 1,
+)
+
+exports_files(
+    [
+        "version_script.lds",
+    ],
+)
diff --git a/tensorflow/contrib/lite/java/src/main/native/builtin_ops_jni.cc b/tensorflow/lite/java/src/main/native/builtin_ops_jni.cc
similarity index 95%
rename from tensorflow/contrib/lite/java/src/main/native/builtin_ops_jni.cc
rename to tensorflow/lite/java/src/main/native/builtin_ops_jni.cc
index cce356370fa770de3e44438f08470077fb07c04c..95bc0a4fa8d1d4c31b03c92d220c7a49b52baa58 100644
--- a/tensorflow/contrib/lite/java/src/main/native/builtin_ops_jni.cc
+++ b/tensorflow/lite/java/src/main/native/builtin_ops_jni.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/register.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc b/tensorflow/lite/java/src/main/native/exception_jni.cc
similarity index 96%
rename from tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
rename to tensorflow/lite/java/src/main/native/exception_jni.cc
index 34d91be04cd6c855a2068510ca810c0b93637584..5406c7197f0c6ba6fd17c3472a365ef2d56d07a4 100644
--- a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
+++ b/tensorflow/lite/java/src/main/native/exception_jni.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "tensorflow/contrib/lite/java/src/main/native/exception_jni.h"
+#include "tensorflow/lite/java/src/main/native/exception_jni.h"
 
 const char kIllegalArgumentException[] = "java/lang/IllegalArgumentException";
 const char kIllegalStateException[] = "java/lang/IllegalStateException";
@@ -49,6 +49,7 @@ BufferErrorReporter::BufferErrorReporter(JNIEnv* env, int limit) {
                    limit);
     return;
   }
+  buffer_[0] = '\0';
   start_idx_ = 0;
   end_idx_ = limit - 1;
 }
diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.h b/tensorflow/lite/java/src/main/native/exception_jni.h
similarity index 84%
rename from tensorflow/contrib/lite/java/src/main/native/exception_jni.h
rename to tensorflow/lite/java/src/main/native/exception_jni.h
index 2a4bbdbeadcc64d76dc60a9e2642557bfd899bec..ebd91e875b5b58b675eb2c2c613260cb1df91e84 100644
--- a/tensorflow/contrib/lite/java/src/main/native/exception_jni.h
+++ b/tensorflow/lite/java/src/main/native/exception_jni.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
-#define TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
+#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
+#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
 
 #include <jni.h>
-#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/lite/error_reporter.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -47,4 +47,4 @@ class BufferErrorReporter : public tflite::ErrorReporter {
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
+#endif  // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_EXCEPTION_JNI_H_
diff --git a/tensorflow/lite/java/src/main/native/init_tensorflow_jni.cc b/tensorflow/lite/java/src/main/native/init_tensorflow_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1fa9d1f50e50d23945ea4003b3218e7a3f04c83a
--- /dev/null
+++ b/tensorflow/lite/java/src/main/native/init_tensorflow_jni.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/java/src/main/native/init_tensorflow_jni.h"
+#include "tensorflow/lite/testing/init_tensorflow.h"
+
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_TensorFlowLite_initTensorFlow(
+    JNIEnv* env, jclass clazz) {
+  ::tflite::InitTensorFlow();
+}
diff --git a/tensorflow/lite/java/src/main/native/init_tensorflow_jni.h b/tensorflow/lite/java/src/main/native/init_tensorflow_jni.h
new file mode 100644
index 0000000000000000000000000000000000000000..1454d6d4633d4f2d8aa695f637bcec024208b176
--- /dev/null
+++ b/tensorflow/lite/java/src/main/native/init_tensorflow_jni.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_INIT_TENSORFLOW_JNI_H_
+#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_INIT_TENSORFLOW_JNI_H_
+
+#include <jni.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+/*
+ *  Class:     org_tensorflow_lite_TensorFlowLite
+ *  Method:    initTensorFlow
+ *  Signature: ()V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_TensorFlowLite_initTensorFlow(
+    JNIEnv* env, jclass clazz);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_INIT_TENSORFLOW_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
similarity index 92%
rename from tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
rename to tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index 4dc73fbcf8392cb3a48b92e5a69672af09a801cb..1e98f942504b7e4f238d8715de1dc75eedf046cf 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h"
+#include "tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h"
 namespace {
 
 tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) {
@@ -43,6 +43,15 @@ BufferErrorReporter* convertLongToErrorReporter(JNIEnv* env, jlong handle) {
   return reinterpret_cast<BufferErrorReporter*>(handle);
 }
 
+TfLiteDelegate* convertLongToDelegate(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Invalid handle to delegate.");
+    return nullptr;
+  }
+  return reinterpret_cast<TfLiteDelegate*>(handle);
+}
+
 std::vector<int> convertJIntArrayToVector(JNIEnv* env, jintArray inputs) {
   int size = static_cast<int>(env->GetArrayLength(inputs));
   std::vector<int> outputs(size, 0);
@@ -69,6 +78,8 @@ int getDataType(TfLiteType data_type) {
       return 3;
     case kTfLiteInt64:
       return 4;
+    case kTfLiteString:
+      return 5;
     default:
       return -1;
   }
@@ -335,16 +346,8 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
                    error_reporter->CachedErrorMessage());
     return 0;
   }
-  // allocates memory
-  status = interpreter->AllocateTensors();
-  if (status != kTfLiteOk) {
-    throwException(
-        env, kIllegalStateException,
-        "Internal error: Unexpected failure when preparing tensor allocations:"
-        " %s",
-        error_reporter->CachedErrorMessage());
-    return 0;
-  }
+  // Note that tensor allocation is performed explicitly by the owning Java
+  // NativeInterpreterWrapper instance.
   return reinterpret_cast<jlong>(interpreter.release());
 }
 
@@ -449,6 +452,29 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
   return is_changed ? JNI_TRUE : JNI_FALSE;
 }
 
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_applyDelegate(
+    JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
+    jlong delegate_handle) {
+  tflite::Interpreter* interpreter =
+      convertLongToInterpreter(env, interpreter_handle);
+  if (interpreter == nullptr) return;
+
+  BufferErrorReporter* error_reporter =
+      convertLongToErrorReporter(env, error_handle);
+  if (error_reporter == nullptr) return;
+
+  TfLiteDelegate* delegate = convertLongToDelegate(env, delegate_handle);
+  if (delegate == nullptr) return;
+
+  TfLiteStatus status = interpreter->ModifyGraphWithDelegate(delegate);
+  if (status != kTfLiteOk) {
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Failed to apply delegate: %s",
+                   error_reporter->CachedErrorMessage());
+  }
+}
+
 JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_delete(
     JNIEnv* env, jclass clazz, jlong error_handle, jlong model_handle,
     jlong interpreter_handle) {
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
similarity index 90%
rename from tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
rename to tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
index f8f3e7028c7d861f5af42e17a17806c6ea527ce4..e184b8f1a783d56c803ac0619b323a62491ba93f 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
+++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -13,18 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_
-#define TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_
+#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_
+#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_
 
 #include <jni.h>
 #include <stdio.h>
 #include <time.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/java/src/main/native/exception_jni.h"
-#include "tensorflow/contrib/lite/java/src/main/native/tensor_jni.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/java/src/main/native/exception_jni.h"
+#include "tensorflow/lite/java/src/main/native/tensor_jni.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 // This is to be provided at link-time by a library.
@@ -223,6 +223,16 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
     JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
     jint input_idx, jintArray dims);
 
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:    applyDelegate
+ *  Signature: (JJJ)V
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_applyDelegate(
+    JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
+    jlong delegate_handle);
+
 /*
  *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
  *  Method:
@@ -235,4 +245,4 @@ JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_delete(
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_
+#endif  // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_NATIVEINTERPRETERWRAPPER_JNI_H_
diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.cc b/tensorflow/lite/java/src/main/native/tensor_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82d2679de9c868694668bca23ce6c8a6fb55dbe8
--- /dev/null
+++ b/tensorflow/lite/java/src/main/native/tensor_jni.cc
@@ -0,0 +1,401 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/java/src/main/native/tensor_jni.h"
+#include <cstring>
+#include <memory>
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/java/src/main/native/exception_jni.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace {
+
+// Convenience handle for obtaining a TfLiteTensor given an interpreter and
+// tensor index.
+//
+// Historically, the Java Tensor class used a TfLiteTensor pointer as its native
+// handle. However, this approach isn't generally safe, as the interpreter may
+// invalidate all TfLiteTensor* handles during inference or allocation.
+class TensorHandle {
+ public:
+  TensorHandle(tflite::Interpreter* interpreter, int tensor_index)
+      : interpreter_(interpreter), tensor_index_(tensor_index) {}
+
+  TfLiteTensor* tensor() const { return interpreter_->tensor(tensor_index_); }
+
+ private:
+  tflite::Interpreter* const interpreter_;
+  const int tensor_index_;
+};
+
+TfLiteTensor* GetTensorFromHandle(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Invalid handle to TfLiteTensor.");
+    return nullptr;
+  }
+  return reinterpret_cast<TensorHandle*>(handle)->tensor();
+}
+
+size_t ElementByteSize(TfLiteType data_type) {
+  // The code in this file makes the assumption that the
+  // TensorFlow TF_DataTypes and the Java primitive types
+  // have the same byte sizes. Validate that:
+  switch (data_type) {
+    case kTfLiteFloat32:
+      static_assert(sizeof(jfloat) == 4,
+                    "Interal error: Java float not compatible with "
+                    "kTfLiteFloat");
+      return 4;
+    case kTfLiteInt32:
+      static_assert(sizeof(jint) == 4,
+                    "Interal error: Java int not compatible with kTfLiteInt");
+      return 4;
+    case kTfLiteUInt8:
+      static_assert(sizeof(jbyte) == 1,
+                    "Interal error: Java byte not compatible with "
+                    "kTfLiteUInt8");
+      return 1;
+    case kTfLiteInt64:
+      static_assert(sizeof(jlong) == 8,
+                    "Interal error: Java long not compatible with "
+                    "kTfLiteInt64");
+      return 8;
+    default:
+      return 0;
+  }
+}
+
+size_t WriteOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
+                                void* dst, size_t dst_size) {
+  jarray array = static_cast<jarray>(object);
+  const int num_elements = env->GetArrayLength(array);
+  size_t to_copy = num_elements * ElementByteSize(type);
+  if (to_copy > dst_size) {
+    throwException(env, kIllegalStateException,
+                   "Internal error: cannot write Java array of %d bytes to "
+                   "Tensor of %d bytes",
+                   to_copy, dst_size);
+    return 0;
+  }
+  switch (type) {
+    case kTfLiteFloat32: {
+      jfloatArray float_array = static_cast<jfloatArray>(array);
+      jfloat* float_dst = static_cast<jfloat*>(dst);
+      env->GetFloatArrayRegion(float_array, 0, num_elements, float_dst);
+      return to_copy;
+    }
+    case kTfLiteInt32: {
+      jintArray int_array = static_cast<jintArray>(array);
+      jint* int_dst = static_cast<jint*>(dst);
+      env->GetIntArrayRegion(int_array, 0, num_elements, int_dst);
+      return to_copy;
+    }
+    case kTfLiteInt64: {
+      jlongArray long_array = static_cast<jlongArray>(array);
+      jlong* long_dst = static_cast<jlong*>(dst);
+      env->GetLongArrayRegion(long_array, 0, num_elements, long_dst);
+      return to_copy;
+    }
+    case kTfLiteUInt8: {
+      jbyteArray byte_array = static_cast<jbyteArray>(array);
+      jbyte* byte_dst = static_cast<jbyte*>(dst);
+      env->GetByteArrayRegion(byte_array, 0, num_elements, byte_dst);
+      return to_copy;
+    }
+    default: {
+      throwException(env, kUnsupportedOperationException,
+                     "DataType error: TensorFlowLite currently supports float "
+                     "(32 bits), int (32 bits), byte (8 bits), and long "
+                     "(64 bits), support for other types (DataType %d in this "
+                     "case) will be added in the future",
+                     kTfLiteFloat32, type);
+      return 0;
+    }
+  }
+}
+
+size_t ReadOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
+                               const void* src, size_t src_size, jarray dst) {
+  const int len = env->GetArrayLength(dst);
+  const size_t size = len * ElementByteSize(data_type);
+  if (size > src_size) {
+    throwException(
+        env, kIllegalStateException,
+        "Internal error: cannot fill a Java array of %d bytes with a Tensor of "
+        "%d bytes",
+        size, src_size);
+    return 0;
+  }
+  switch (data_type) {
+    case kTfLiteFloat32: {
+      jfloatArray float_array = static_cast<jfloatArray>(dst);
+      env->SetFloatArrayRegion(float_array, 0, len,
+                               static_cast<const jfloat*>(src));
+      return size;
+    }
+    case kTfLiteInt32: {
+      jintArray int_array = static_cast<jintArray>(dst);
+      env->SetIntArrayRegion(int_array, 0, len, static_cast<const jint*>(src));
+      return size;
+    }
+    case kTfLiteInt64: {
+      jlongArray long_array = static_cast<jlongArray>(dst);
+      env->SetLongArrayRegion(long_array, 0, len,
+                              static_cast<const jlong*>(src));
+      return size;
+    }
+    case kTfLiteUInt8: {
+      jbyteArray byte_array = static_cast<jbyteArray>(dst);
+      env->SetByteArrayRegion(byte_array, 0, len,
+                              static_cast<const jbyte*>(src));
+      return size;
+    }
+    default: {
+      throwException(env, kIllegalStateException,
+                     "DataType error: invalid DataType(%d)", data_type);
+    }
+  }
+  return 0;
+}
+
+size_t ReadMultiDimensionalArray(JNIEnv* env, TfLiteType data_type, char* src,
+                                 size_t src_size, int dims_left, jarray dst) {
+  if (dims_left == 1) {
+    return ReadOneDimensionalArray(env, data_type, src, src_size, dst);
+  } else {
+    jobjectArray ndarray = static_cast<jobjectArray>(dst);
+    int len = env->GetArrayLength(ndarray);
+    size_t size = 0;
+    for (int i = 0; i < len; ++i) {
+      jarray row = static_cast<jarray>(env->GetObjectArrayElement(ndarray, i));
+      size += ReadMultiDimensionalArray(env, data_type, src + size,
+                                        src_size - size, dims_left - 1, row);
+      env->DeleteLocalRef(row);
+      if (env->ExceptionCheck()) return size;
+    }
+    return size;
+  }
+}
+
+// Returns the total number of strings read.
+int ReadMultiDimensionalStringArray(JNIEnv* env, TfLiteTensor* tensor,
+                                    int dims_left, int start_str_index,
+                                    jarray dst) {
+  jobjectArray object_array = static_cast<jobjectArray>(dst);
+  int len = env->GetArrayLength(object_array);
+  int num_strings_read = 0;
+
+  // If dst is a 1-dimensional array, copy the strings into it. Else
+  // recursively call ReadMultiDimensionalStringArray over sub-dimensions.
+  if (dims_left == 1) {
+    for (int i = 0; i < len; ++i) {
+      const tflite::StringRef strref =
+          tflite::GetString(tensor, start_str_index + num_strings_read);
+      jstring string_dest = env->NewStringUTF(strref.str);
+      env->SetObjectArrayElement(object_array, i, string_dest);
+      env->DeleteLocalRef(string_dest);
+      ++num_strings_read;
+    }
+  } else {
+    for (int i = 0; i < len; ++i) {
+      jarray row =
+          static_cast<jarray>(env->GetObjectArrayElement(object_array, i));
+      num_strings_read += ReadMultiDimensionalStringArray(
+          env, tensor, dims_left - 1, start_str_index + num_strings_read, row);
+      env->DeleteLocalRef(row);
+      if (env->ExceptionCheck()) return num_strings_read;
+    }
+  }
+
+  return num_strings_read;
+}
+
+size_t WriteMultiDimensionalArray(JNIEnv* env, jobject src, TfLiteType type,
+                                  int dims_left, char** dst, int dst_size) {
+  if (dims_left <= 1) {
+    return WriteOneDimensionalArray(env, src, type, *dst, dst_size);
+  } else {
+    jobjectArray ndarray = static_cast<jobjectArray>(src);
+    int len = env->GetArrayLength(ndarray);
+    size_t sz = 0;
+    for (int i = 0; i < len; ++i) {
+      jobject row = env->GetObjectArrayElement(ndarray, i);
+      char* next_dst = *dst + sz;
+      sz += WriteMultiDimensionalArray(env, row, type, dims_left - 1, &next_dst,
+                                       dst_size - sz);
+      env->DeleteLocalRef(row);
+      if (env->ExceptionCheck()) return sz;
+    }
+    return sz;
+  }
+}
+
+void PopulateStringDynamicBuffer(JNIEnv* env, jobject src,
+                                 tflite::DynamicBuffer* dst_buffer,
+                                 int dims_left) {
+  jobjectArray object_array = static_cast<jobjectArray>(src);
+  const int num_elements = env->GetArrayLength(object_array);
+
+  // If src is a 1-dimensional array, add the strings into dst_buffer. Else
+  // recursively call populateStringDynamicBuffer over sub-dimensions.
+  if (dims_left <= 1) {
+    for (int i = 0; i < num_elements; ++i) {
+      jstring string_obj =
+          static_cast<jstring>(env->GetObjectArrayElement(object_array, i));
+      const char* chars = env->GetStringUTFChars(string_obj, nullptr);
+      // + 1 for terminating character.
+      const int byte_len = env->GetStringUTFLength(string_obj) + 1;
+      dst_buffer->AddString(chars, byte_len);
+      env->ReleaseStringUTFChars(string_obj, chars);
+      env->DeleteLocalRef(string_obj);
+    }
+  } else {
+    for (int i = 0; i < num_elements; ++i) {
+      jobject row = env->GetObjectArrayElement(object_array, i);
+      PopulateStringDynamicBuffer(env, row, dst_buffer, dims_left - 1);
+      env->DeleteLocalRef(row);
+      if (env->ExceptionCheck()) return;
+    }
+  }
+}
+
+void WriteMultiDimensionalStringArray(JNIEnv* env, jobject src,
+                                      TfLiteTensor* tensor) {
+  tflite::DynamicBuffer dst_buffer;
+  PopulateStringDynamicBuffer(env, src, &dst_buffer, tensor->dims->size);
+  if (!env->ExceptionCheck()) {
+    dst_buffer.WriteToTensor(tensor, /*new_shape=*/nullptr);
+  }
+}
+
+}  // namespace
+
+JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_Tensor_create(
+    JNIEnv* env, jclass clazz, jlong interpreter_handle, jint tensor_index) {
+  tflite::Interpreter* interpreter =
+      reinterpret_cast<tflite::Interpreter*>(interpreter_handle);
+  return reinterpret_cast<jlong>(new TensorHandle(interpreter, tensor_index));
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_delete(JNIEnv* env,
+                                                              jclass clazz,
+                                                              jlong handle) {
+  delete reinterpret_cast<TensorHandle*>(handle);
+}
+
+JNIEXPORT jobject JNICALL Java_org_tensorflow_lite_Tensor_buffer(JNIEnv* env,
+                                                                 jclass clazz,
+                                                                 jlong handle) {
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
+  if (tensor == nullptr) return nullptr;
+  if (tensor->data.raw == nullptr) {
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Tensor hasn't been allocated.");
+    return nullptr;
+  }
+  return env->NewDirectByteBuffer(static_cast<void*>(tensor->data.raw),
+                                  static_cast<jlong>(tensor->bytes));
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_writeDirectBuffer(
+    JNIEnv* env, jclass clazz, jlong handle, jobject src) {
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
+  if (tensor == nullptr) return;
+
+  char* src_data_raw = static_cast<char*>(env->GetDirectBufferAddress(src));
+  if (!src_data_raw) {
+    throwException(env, kIllegalArgumentException,
+                   "Input ByteBuffer is not a direct buffer");
+    return;
+  }
+
+  tensor->data.raw = src_data_raw;
+}
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env,
+                                                          jclass clazz,
+                                                          jlong handle,
+                                                          jobject value) {
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
+  if (tensor == nullptr) return;
+  int num_dims = tensor->dims->size;
+  if (num_dims == 0) {
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Cannot copy empty/scalar Tensors.");
+    return;
+  }
+  if (tensor->type == kTfLiteString) {
+    ReadMultiDimensionalStringArray(env, tensor, num_dims, 0,
+                                    static_cast<jarray>(value));
+  } else {
+    ReadMultiDimensionalArray(env, tensor->type, tensor->data.raw,
+                              tensor->bytes, num_dims,
+                              static_cast<jarray>(value));
+  }
+}
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jobject src) {
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
+  if (tensor == nullptr) return;
+  if (tensor->type != kTfLiteString && tensor->data.raw == nullptr) {
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Target Tensor hasn't been allocated.");
+    return;
+  }
+  if (tensor->dims->size == 0) {
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Cannot copy empty/scalar Tensors.");
+    return;
+  }
+  if (tensor->type == kTfLiteString) {
+    WriteMultiDimensionalStringArray(env, src, tensor);
+  } else {
+    WriteMultiDimensionalArray(env, src, tensor->type, tensor->dims->size,
+                               &tensor->data.raw, tensor->bytes);
+  }
+}
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_dtype(JNIEnv* env,
+                                                             jclass clazz,
+                                                             jlong handle) {
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
+  if (tensor == nullptr) return 0;
+  return static_cast<jint>(tensor->type);
+}
+
+JNIEXPORT jintArray JNICALL
+Java_org_tensorflow_lite_Tensor_shape(JNIEnv* env, jclass clazz, jlong handle) {
+  TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
+  if (tensor == nullptr) return nullptr;
+  int num_dims = tensor->dims->size;
+  jintArray result = env->NewIntArray(num_dims);
+  env->SetIntArrayRegion(result, 0, num_dims, tensor->dims->data);
+  return result;
+}
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_numBytes(JNIEnv* env,
+                                                                jclass clazz,
+                                                                jlong handle) {
+  const TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
+  if (tensor == nullptr) return 0;
+  return static_cast<jint>(tensor->bytes);
+}
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h b/tensorflow/lite/java/src/main/native/tensor_jni.h
similarity index 93%
rename from tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
rename to tensorflow/lite/java/src/main/native/tensor_jni.h
index c5e9690e9a04bac8be9c047fa6e8a8251879711b..ec0442e93f6f9d8b7e90eb1cf6b6556abac0097b 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
+++ b/tensorflow/lite/java/src/main/native/tensor_jni.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
-#define TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
+#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
+#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
 
 #include <jni.h>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -109,4 +109,4 @@ Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env,
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
+#endif  // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSOR_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.cc b/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.cc
similarity index 88%
rename from tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.cc
rename to tensorflow/lite/java/src/main/native/tensorflow_lite_jni.cc
index 2e7f2f56921b871a6ace2b6cb984fcd185a4d2ab..2b8cf4201cea950f0c8698e07787a185a179ac03 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.cc
+++ b/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include <stdio.h>
 
-#include "tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h"
-#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/lite/java/src/main/native/tensorflow_lite_jni.h"
+#include "tensorflow/lite/version.h"
 
 JNIEXPORT jstring JNICALL
 Java_org_tensorflow_lite_TensorFlowLite_version(JNIEnv* env, jclass /*clazz*/) {
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h b/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.h
similarity index 81%
rename from tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h
rename to tensorflow/lite/java/src/main/native/tensorflow_lite_jni.h
index 5e2a7ded1b495ed349b90d6ad440b0358a5b377f..de3e703110c455ceec9e1ed944318b9c0916ab1e 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h
+++ b/tensorflow/lite/java/src/main/native/tensorflow_lite_jni.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_
-#define TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_
+#ifndef TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_
+#define TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_
 
 #include <jni.h>
 
@@ -33,4 +33,4 @@ Java_org_tensorflow_lite_TensorFlowLite_version(JNIEnv*, jclass);
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_
+#endif  // TENSORFLOW_LITE_JAVA_SRC_MAIN_NATIVE_TENSORFLOW_LITE_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/version_script.lds b/tensorflow/lite/java/src/main/native/version_script.lds
similarity index 100%
rename from tensorflow/contrib/lite/java/src/main/native/version_script.lds
rename to tensorflow/lite/java/src/main/native/version_script.lds
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
similarity index 96%
rename from tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
rename to tensorflow/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
index 6d6417f895e88584b46f619565a593a61921189d..8412ec0e9dacd5e837286e629603e0e354d2341c 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
@@ -30,6 +30,7 @@ public final class DataTypeTest {
     assertThat(DataType.INT32.byteSize()).isEqualTo(4);
     assertThat(DataType.UINT8.byteSize()).isEqualTo(1);
     assertThat(DataType.INT64.byteSize()).isEqualTo(8);
+    assertThat(DataType.STRING.byteSize()).isEqualTo(-1);
   }
 
   @Test
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..21c431a82bf0f3ddb1684a224dffaa7bbf6d004e
--- /dev/null
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterFlexTest.java
@@ -0,0 +1,60 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import java.io.File;
+import java.util.HashMap;
+import java.util.Map;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/**
+ * Unit tests for {@link org.tensorflow.lite.Interpreter} that validate execution with models that
+ * have TensorFlow ops.
+ */
+@RunWith(JUnit4.class)
+public final class InterpreterFlexTest {
+
+  private static final File FLEX_MODEL_FILE =
+      new File("tensorflow/lite/testdata/multi_add_flex.bin");
+
+  /** Smoke test validating that flex model loading works when the flex delegate is linked. */
+  @Test
+  public void testFlexModel() throws Exception {
+    try (Interpreter interpreter = new Interpreter(FLEX_MODEL_FILE)) {
+      assertThat(interpreter.getInputTensorCount()).isEqualTo(4);
+      assertThat(interpreter.getInputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
+      assertThat(interpreter.getInputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
+      assertThat(interpreter.getInputTensor(2).dataType()).isEqualTo(DataType.FLOAT32);
+      assertThat(interpreter.getInputTensor(3).dataType()).isEqualTo(DataType.FLOAT32);
+      assertThat(interpreter.getOutputTensorCount()).isEqualTo(2);
+      assertThat(interpreter.getOutputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
+      assertThat(interpreter.getOutputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
+      Object[] inputs = new Object[] {new float[1], new float[1], new float[1], new float[1]};
+      Map<Integer, Object> outputs = new HashMap<>();
+      outputs.put(0, new float[1]);
+      outputs.put(1, new float[1]);
+      interpreter.runForMultipleInputsOutputs(inputs, outputs);
+    }
+  }
+
+  static {
+    TensorFlowLite.initTensorFlow();
+  }
+}
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..b69bfa076e226850f3de305ab4f0a4e03a302764
--- /dev/null
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import java.io.File;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link org.tensorflow.lite.Interpreter} agains a MobileNet model. */
+@RunWith(JUnit4.class)
+public final class InterpreterMobileNetTest {
+
+  private static final File MOBILENET_MODEL_FILE =
+      new File("tensorflow/lite/java/src/testdata/mobilenet.tflite.bin");
+
+  @Test
+  public void testMobilenetRun() {
+    // Create a gray image.
+    float[][][][] img = new float[1][224][224][3];
+    for (int i = 0; i < 224; ++i) {
+      for (int j = 0; j < 224; ++j) {
+        img[0][i][j][0] = 0.5f;
+        img[0][i][j][1] = 0.5f;
+        img[0][i][j][2] = 0.5f;
+      }
+    }
+
+    // Allocate memory to receive the output values.
+    float[][] labels = new float[1][1001];
+
+    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
+    interpreter.run(img, labels);
+    assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
+    assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});
+    interpreter.close();
+
+    assertThat(labels[0])
+        .usingExactEquality()
+        .containsNoneOf(new float[] {Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY});
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
similarity index 75%
rename from tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
rename to tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index a98fca01325c73994c28fa96770a04424f7997fe..e635515de8cfdc2b4ed283adc8fc64803816258e 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -38,10 +38,13 @@ import org.junit.runners.JUnit4;
 public final class InterpreterTest {
 
   private static final File MODEL_FILE =
-      new File("tensorflow/contrib/lite/java/src/testdata/add.bin");
+      new File("tensorflow/lite/java/src/testdata/add.bin");
 
-  private static final File MOBILENET_MODEL_FILE =
-      new File("tensorflow/contrib/lite/java/src/testdata/mobilenet.tflite.bin");
+  private static final File MULTIPLE_INPUTS_MODEL_FILE =
+      new File("tensorflow/lite/testdata/multi_add.bin");
+
+  private static final File FLEX_MODEL_FILE =
+      new File("tensorflow/lite/testdata/multi_add_flex.bin");
 
   @Test
   public void testInterpreter() throws Exception {
@@ -164,20 +167,29 @@ public final class InterpreterTest {
 
   @Test
   public void testRunForMultipleInputsOutputs() {
-    Interpreter interpreter = new Interpreter(MODEL_FILE);
-    float[] oneD = {1.23f, 6.54f, 7.81f};
-    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
-    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
-    float[][][][] fourD = {threeD, threeD};
-    Object[] inputs = {fourD};
-    float[][][][] parsedOutputs = new float[2][8][8][3];
+    Interpreter interpreter = new Interpreter(MULTIPLE_INPUTS_MODEL_FILE);
+    assertThat(interpreter.getInputTensorCount()).isEqualTo(4);
+    assertThat(interpreter.getInputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getInputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getInputTensor(2).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getInputTensor(3).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getOutputTensorCount()).isEqualTo(2);
+    assertThat(interpreter.getOutputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
+    assertThat(interpreter.getOutputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
+
+    float[] input0 = {1.23f};
+    float[] input1 = {2.43f};
+    Object[] inputs = {input0, input1, input0, input1};
+    float[] parsedOutput0 = new float[1];
+    float[] parsedOutput1 = new float[1];
     Map<Integer, Object> outputs = new HashMap<>();
-    outputs.put(0, parsedOutputs);
+    outputs.put(0, parsedOutput0);
+    outputs.put(1, parsedOutput1);
     interpreter.runForMultipleInputsOutputs(inputs, outputs);
-    float[] outputOneD = parsedOutputs[0][0][0];
-    float[] expected = {3.69f, 19.62f, 23.43f};
-    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
-    interpreter.close();
+    float[] expected0 = {4.89f};
+    float[] expected1 = {6.09f};
+    assertThat(parsedOutput0).usingTolerance(0.1f).containsExactly(expected0).inOrder();
+    assertThat(parsedOutput1).usingTolerance(0.1f).containsExactly(expected1).inOrder();
   }
 
   @Test
@@ -211,32 +223,6 @@ public final class InterpreterTest {
     }
   }
 
-  @Test
-  public void testMobilenetRun() {
-    // Create a gray image.
-    float[][][][] img = new float[1][224][224][3];
-    for (int i = 0; i < 224; ++i) {
-      for (int j = 0; j < 224; ++j) {
-        img[0][i][j][0] = 0.5f;
-        img[0][i][j][1] = 0.5f;
-        img[0][i][j][2] = 0.5f;
-      }
-    }
-
-    // Allocate memory to receive the output values.
-    float[][] labels = new float[1][1001];
-
-    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
-    interpreter.run(img, labels);
-    assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
-    assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});
-    interpreter.close();
-
-    assertThat(labels[0])
-        .usingExactEquality()
-        .containsNoneOf(new float[] {Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY});
-  }
-
   @Test
   public void testRunWithWrongInputType() {
     Interpreter interpreter = new Interpreter(MODEL_FILE);
@@ -283,7 +269,7 @@ public final class InterpreterTest {
 
   @Test
   public void testGetInputIndex() {
-    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
     try {
       interpreter.getInputIndex("WrongInputName");
       fail();
@@ -300,7 +286,7 @@ public final class InterpreterTest {
 
   @Test
   public void testGetOutputIndex() {
-    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
     try {
       interpreter.getOutputIndex("WrongOutputName");
       fail();
@@ -309,9 +295,9 @@ public final class InterpreterTest {
           .hasMessageThat()
           .contains(
               "'WrongOutputName' is not a valid name for any output. Names of outputs and their"
-                  + " indexes are {MobilenetV1/Predictions/Softmax=0}");
+                  + " indexes are {output=0}");
     }
-    int index = interpreter.getOutputIndex("MobilenetV1/Predictions/Softmax");
+    int index = interpreter.getOutputIndex("output");
     assertThat(index).isEqualTo(0);
   }
 
@@ -345,4 +331,83 @@ public final class InterpreterTest {
     interpreter.close();
     interpreter.close();
   }
+
+  /** Smoke test validating that flex model loading fails when the flex delegate is not linked. */
+  @Test
+  public void testFlexModel() throws Exception {
+    try {
+      new Interpreter(FLEX_MODEL_FILE);
+      fail();
+    } catch (IllegalStateException e) {
+      // Expected failure.
+    }
+  }
+
+  @Test
+  public void testDelegate() throws Exception {
+    System.loadLibrary("tensorflowlite_test_jni");
+    Delegate delegate =
+        new Delegate() {
+          @Override
+          public long getNativeHandle() {
+            return getNativeHandleForDelegate();
+          }
+        };
+    Interpreter interpreter =
+        new Interpreter(MODEL_FILE, new Interpreter.Options().addDelegate(delegate));
+
+    // The native delegate stubs out the graph with a single op that produces the scalar value 7.
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    float[] output = new float[1];
+    interpreter.run(fourD, output);
+    float[] expected = {7.0f};
+    assertThat(output).usingTolerance(0.1f).containsExactly(expected).inOrder();
+
+    interpreter.close();
+  }
+
+  @Test
+  public void testInvalidDelegate() throws Exception {
+    System.loadLibrary("tensorflowlite_test_jni");
+    Delegate delegate =
+        new Delegate() {
+          @Override
+          public long getNativeHandle() {
+            return getNativeHandleForInvalidDelegate();
+          }
+        };
+    try {
+      Interpreter interpreter =
+          new Interpreter(MODEL_FILE, new Interpreter.Options().addDelegate(delegate));
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("Internal error: Failed to apply delegate");
+    }
+  }
+
+  @Test
+  public void testNullDelegate() throws Exception {
+    System.loadLibrary("tensorflowlite_test_jni");
+    Delegate delegate =
+        new Delegate() {
+          @Override
+          public long getNativeHandle() {
+            return 0;
+          }
+        };
+    try {
+      Interpreter interpreter =
+          new Interpreter(MODEL_FILE, new Interpreter.Options().addDelegate(delegate));
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("Internal error: Invalid handle to delegate");
+    }
+  }
+
+  private static native long getNativeHandleForDelegate();
+
+  private static native long getNativeHandleForInvalidDelegate();
 }
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
similarity index 89%
rename from tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
rename to tensorflow/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
index 270bd6703a101db39202ca03e700138fcc237932..b00efa77cbf60296f0ee3db8059bac01edd6ccea 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
@@ -32,28 +32,31 @@ import org.junit.runners.JUnit4;
 public final class NativeInterpreterWrapperTest {
 
   private static final String FLOAT_MODEL_PATH =
-      "tensorflow/contrib/lite/java/src/testdata/add.bin";
+      "tensorflow/lite/java/src/testdata/add.bin";
 
   private static final String INT_MODEL_PATH =
-      "tensorflow/contrib/lite/java/src/testdata/int32.bin";
+      "tensorflow/lite/java/src/testdata/int32.bin";
 
   private static final String LONG_MODEL_PATH =
-      "tensorflow/contrib/lite/java/src/testdata/int64.bin";
+      "tensorflow/lite/java/src/testdata/int64.bin";
 
   private static final String BYTE_MODEL_PATH =
-      "tensorflow/contrib/lite/java/src/testdata/uint8.bin";
+      "tensorflow/lite/java/src/testdata/uint8.bin";
+
+  private static final String STRING_MODEL_PATH =
+      "tensorflow/lite/java/src/testdata/string.bin";
 
   private static final String QUANTIZED_MODEL_PATH =
-      "tensorflow/contrib/lite/java/src/testdata/quantized.bin";
+      "tensorflow/lite/java/src/testdata/quantized.bin";
 
   private static final String INVALID_MODEL_PATH =
-      "tensorflow/contrib/lite/java/src/testdata/invalid_model.bin";
+      "tensorflow/lite/java/src/testdata/invalid_model.bin";
 
   private static final String MODEL_WITH_CUSTOM_OP_PATH =
-      "tensorflow/contrib/lite/java/src/testdata/with_custom_op.lite";
+      "tensorflow/lite/java/src/testdata/with_custom_op.lite";
 
   private static final String NONEXISTING_MODEL_PATH =
-      "tensorflow/contrib/lite/java/src/testdata/nonexisting_model.bin";
+      "tensorflow/lite/java/src/testdata/nonexisting_model.bin";
 
   @Test
   public void testConstructor() {
@@ -224,6 +227,50 @@ public final class NativeInterpreterWrapperTest {
     wrapper.close();
   }
 
+  @Test
+  public void testRunWithString() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(STRING_MODEL_PATH);
+    String[] oneD = {"s1", "s22", "s333"};
+    String[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    String[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    String[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    String[][][][] parsedOutputs = new String[2][4][4][12];
+    Map<Integer, Object> outputs = new HashMap<>();
+    outputs.put(0, parsedOutputs);
+    wrapper.run(inputs, outputs);
+    String[] outputOneD = parsedOutputs[0][0][0];
+    String[] expected = {
+      "s1", "s22", "s333", "s1", "s22", "s333", "s1", "s22", "s333", "s1", "s22", "s333"
+    };
+    assertThat(outputOneD).isEqualTo(expected);
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithString_wrongShapeError() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(STRING_MODEL_PATH);
+    String[] oneD = {"s1", "s22", "s333"};
+    String[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    String[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    String[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    String[][][][] parsedOutputs = new String[2][4][4][10];
+    Map<Integer, Object> outputs = new HashMap<>();
+    outputs.put(0, parsedOutputs);
+    try {
+      wrapper.run(inputs, outputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Cannot copy between a TensorFlowLite tensor with shape [2, 4, 4, 12] and "
+                  + "a Java object with shape [2, 4, 4, 10]");
+    }
+    wrapper.close();
+  }
+
   @Test
   public void testRunWithByteBufferHavingBytes() {
     NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(BYTE_MODEL_PATH);
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java
similarity index 100%
rename from tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java
rename to tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
similarity index 99%
rename from tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
rename to tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
index 56a38ea3e225e9d11d36f9fcfe58a3cc8f280ac2..35ff4328b83e3b6bfc83c2bedf3f20c4ebed9b89 100644
--- a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
@@ -33,7 +33,7 @@ import org.junit.runners.JUnit4;
 public final class TensorTest {
 
   private static final String MODEL_PATH =
-      "tensorflow/contrib/lite/java/src/testdata/add.bin";
+      "tensorflow/lite/java/src/testdata/add.bin";
 
   private NativeInterpreterWrapper wrapper;
   private Tensor tensor;
diff --git a/tensorflow/lite/java/src/test/native/BUILD b/tensorflow/lite/java/src/test/native/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..481aea7ecd5dd8f9c26307e3b00992e21e6c2501
--- /dev/null
+++ b/tensorflow/lite/java/src/test/native/BUILD
@@ -0,0 +1,26 @@
+# Description:
+# Java Native Interface (JNI) library for testing the TensorFlow Lite Java API.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/lite:build_def.bzl", "tflite_jni_binary")
+
+cc_library(
+    name = "native",
+    testonly = 1,
+    srcs = [
+        "interpreter_test_jni.cc",
+    ],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/java/jni",
+    ],
+)
+
+tflite_jni_binary(
+    name = "libtensorflowlite_test_jni.so",
+    testonly = 1,
+    deps = [":native"],
+)
diff --git a/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1a0072a7c67b418975625aefff3a4dd84b4e6bf9
--- /dev/null
+++ b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <jni.h>
+#include "tensorflow/lite/c/c_api_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate(
+    JNIEnv* env, jclass clazz) {
+  // A simple op which outputs a vector of length 1 with the value [7].
+  static TfLiteRegistration registration = {
+      .init = nullptr,
+      .free = nullptr,
+      .prepare =
+          [](TfLiteContext* context, TfLiteNode* node) {
+            TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+            TfLiteIntArray* scalar_size = TfLiteIntArrayCreate(1);
+            scalar_size->data[0] = 1;
+            output->type = kTfLiteFloat32;
+            return context->ResizeTensor(context, output, scalar_size);
+          },
+      .invoke =
+          [](TfLiteContext* context, TfLiteNode* node) {
+            TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+            output->data.f[0] = 7.0f;
+            return kTfLiteOk;
+          },
+      .profiling_string = nullptr,
+      .builtin_code = 0,
+      .custom_name = "",
+      .version = 1,
+  };
+  // A simple delegate which replaces all ops with a single op that outputs a
+  // vector of length 1 with the value [7].
+  static TfLiteDelegate delegate = {
+      .data_ = nullptr,
+      .Prepare = [](TfLiteContext* context,
+                    TfLiteDelegate* delegate) -> TfLiteStatus {
+        TfLiteIntArray* execution_plan;
+        TF_LITE_ENSURE_STATUS(
+            context->GetExecutionPlan(context, &execution_plan));
+        context->ReplaceNodeSubsetsWithDelegateKernels(
+            context, registration, execution_plan, delegate);
+        return kTfLiteOk;
+      },
+      .CopyFromBufferHandle = nullptr,
+      .CopyToBufferHandle = nullptr,
+      .FreeBufferHandle = nullptr,
+      .flags = kTfLiteDelegateFlagsAllowDynamicTensors,
+  };
+  return reinterpret_cast<jlong>(&delegate);
+}
+
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForInvalidDelegate(
+    JNIEnv* env, jclass clazz) {
+  // A simple delegate that fails during preparation.
+  static TfLiteDelegate delegate = {
+      .data_ = nullptr,
+      .Prepare = [](TfLiteContext* context, TfLiteDelegate* delegate)
+          -> TfLiteStatus { return kTfLiteError; },
+      .CopyFromBufferHandle = nullptr,
+      .CopyToBufferHandle = nullptr,
+      .FreeBufferHandle = nullptr,
+      .flags = kTfLiteDelegateFlagsNone,
+  };
+  return reinterpret_cast<jlong>(&delegate);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/java/src/testdata/add.bin b/tensorflow/lite/java/src/testdata/add.bin
similarity index 100%
rename from tensorflow/contrib/lite/java/src/testdata/add.bin
rename to tensorflow/lite/java/src/testdata/add.bin
diff --git a/tensorflow/contrib/lite/java/src/testdata/float32.bin b/tensorflow/lite/java/src/testdata/float32.bin
similarity index 100%
rename from tensorflow/contrib/lite/java/src/testdata/float32.bin
rename to tensorflow/lite/java/src/testdata/float32.bin
diff --git a/tensorflow/contrib/lite/java/src/testdata/int32.bin b/tensorflow/lite/java/src/testdata/int32.bin
similarity index 100%
rename from tensorflow/contrib/lite/java/src/testdata/int32.bin
rename to tensorflow/lite/java/src/testdata/int32.bin
diff --git a/tensorflow/contrib/lite/java/src/testdata/int64.bin b/tensorflow/lite/java/src/testdata/int64.bin
similarity index 100%
rename from tensorflow/contrib/lite/java/src/testdata/int64.bin
rename to tensorflow/lite/java/src/testdata/int64.bin
diff --git a/tensorflow/contrib/lite/java/src/testdata/invalid_model.bin b/tensorflow/lite/java/src/testdata/invalid_model.bin
similarity index 100%
rename from tensorflow/contrib/lite/java/src/testdata/invalid_model.bin
rename to tensorflow/lite/java/src/testdata/invalid_model.bin
diff --git a/tensorflow/contrib/lite/java/src/testdata/quantized.bin b/tensorflow/lite/java/src/testdata/quantized.bin
similarity index 100%
rename from tensorflow/contrib/lite/java/src/testdata/quantized.bin
rename to tensorflow/lite/java/src/testdata/quantized.bin
diff --git a/tensorflow/lite/java/src/testdata/string.bin b/tensorflow/lite/java/src/testdata/string.bin
new file mode 100644
index 0000000000000000000000000000000000000000..36a2509acdfa17841d0c128674e7b4e382ad00fc
Binary files /dev/null and b/tensorflow/lite/java/src/testdata/string.bin differ
diff --git a/tensorflow/contrib/lite/java/src/testdata/uint8.bin b/tensorflow/lite/java/src/testdata/uint8.bin
similarity index 100%
rename from tensorflow/contrib/lite/java/src/testdata/uint8.bin
rename to tensorflow/lite/java/src/testdata/uint8.bin
diff --git a/tensorflow/contrib/lite/java/src/testdata/with_custom_op.lite b/tensorflow/lite/java/src/testdata/with_custom_op.lite
similarity index 100%
rename from tensorflow/contrib/lite/java/src/testdata/with_custom_op.lite
rename to tensorflow/lite/java/src/testdata/with_custom_op.lite
diff --git a/tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD b/tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..88641c86ed64e7e8fe33c0abe017fb372cce74b7
--- /dev/null
+++ b/tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
@@ -0,0 +1,20 @@
+# Description:
+# Internal helper function to test TF Lite API.
+
+load("@build_bazel_rules_android//android:rules.bzl", "android_library")
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+android_library(
+    name = "testhelper",
+    srcs = glob(
+        [
+            "*.java",
+        ],
+    ),
+    deps = [
+        "//tensorflow/lite/java:tensorflowlite_java",
+    ],
+)
diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java b/tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
similarity index 100%
rename from tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
rename to tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bad1c4aebf1e9d9c7c6d33f87a6e7ea9cab8d700
--- /dev/null
+++ b/tensorflow/lite/kernels/BUILD
@@ -0,0 +1,1407 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_opts_nortti_if_android")
+
+# Suppress warnings that are introduced by Eigen Tensor.
+EXTRA_EIGEN_COPTS = select({
+    "//tensorflow:ios": [
+        "-Wno-error=invalid-partial-specialization",
+        "-Wno-error=reorder",
+    ],
+    "//tensorflow:windows": [
+        "/DEIGEN_HAS_C99_MATH",
+        "/DEIGEN_AVOID_STL_ARRAY",
+    ],
+    "//conditions:default": ["-Wno-error=reorder"],
+})
+
+tf_cc_test(
+    name = "optional_tensor_test",
+    size = "small",
+    srcs = ["optional_tensor_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "test_util",
+    testonly = 1,
+    srcs = ["test_util.cc"],
+    hdrs = ["test_util.h"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/core:tflite_portable_logging",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "eigen_support",
+    srcs = [
+        "eigen_support.cc",
+    ],
+    hdrs = [
+        "eigen_support.h",
+    ],
+    copts = tflite_copts() + EXTRA_EIGEN_COPTS,
+    deps = [
+        ":op_macros",
+        "//tensorflow/lite:arena_planner",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels/internal:optimized",
+    ],
+)
+
+cc_library(
+    name = "gemm_support",
+    srcs = [
+        "gemm_support.cc",
+    ],
+    hdrs = [
+        "gemm_support.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":op_macros",
+        "//tensorflow/lite/c:c_api_internal",
+        "@gemmlowp",
+    ],
+)
+
+cc_library(
+    name = "activation_functor",
+    hdrs = [
+        "activation_functor.h",
+    ],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_library(
+    name = "op_macros",
+    hdrs = [
+        "op_macros.h",
+    ],
+)
+
+cc_library(
+    name = "kernel_util",
+    srcs = [
+        "kernel_util.cc",
+    ],
+    hdrs = [
+        "kernel_util.h",
+    ],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels/internal:round",
+        "//tensorflow/lite/kernels/internal:types",
+    ],
+)
+
+tf_cc_test(
+    name = "kernel_util_test",
+    size = "small",
+    srcs = ["kernel_util_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":kernel_util",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "test_util_test",
+    size = "small",
+    srcs = ["test_util_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
+    deps = [
+        ":test_util",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "padding",
+    srcs = [],
+    hdrs = ["padding.h"],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_library(
+    name = "builtin_op_kernels",
+    srcs = [
+        "activations.cc",
+        "add.cc",
+        "arg_min_max.cc",
+        "audio_spectrogram.cc",
+        "basic_rnn.cc",
+        "batch_to_space_nd.cc",
+        "bidirectional_sequence_lstm.cc",
+        "bidirectional_sequence_rnn.cc",
+        "cast.cc",
+        "comparisons.cc",
+        "concatenation.cc",
+        "conv.cc",
+        "depthwise_conv.cc",
+        "dequantize.cc",
+        "detection_postprocess.cc",
+        "div.cc",
+        "elementwise.cc",
+        "embedding_lookup.cc",
+        "embedding_lookup_sparse.cc",
+        "exp.cc",
+        "expand_dims.cc",
+        "fake_quant.cc",
+        "fill.cc",
+        "floor.cc",
+        "floor_div.cc",
+        "floor_mod.cc",
+        "fully_connected.cc",
+        "gather.cc",
+        "hashtable_lookup.cc",
+        "l2norm.cc",
+        "layer_norm_lstm.cc",
+        "local_response_norm.cc",
+        "logical.cc",
+        "lsh_projection.cc",
+        "lstm.cc",
+        "maximum_minimum.cc",
+        "mfcc.cc",
+        "mirror_pad.cc",
+        "mul.cc",
+        "neg.cc",
+        "one_hot.cc",
+        "pack.cc",
+        "pad.cc",
+        "pooling.cc",
+        "pow.cc",
+        "range.cc",
+        "reduce.cc",
+        "relu1.cc",
+        "reshape.cc",
+        "resize_bilinear.cc",
+        "resize_nearest_neighbor.cc",
+        "select.cc",
+        "shape.cc",
+        "skip_gram.cc",
+        "slice.cc",
+        "space_to_batch_nd.cc",
+        "space_to_depth.cc",
+        "sparse_output_fully_connected.cc",
+        "sparse_to_dense.cc",
+        "split.cc",
+        "split_v.cc",
+        "squared_difference.cc",
+        "squeeze.cc",
+        "strided_slice.cc",
+        "sub.cc",
+        "svdf.cc",
+        "tile.cc",
+        "topk_v2.cc",
+        "transpose.cc",
+        "transpose_conv.cc",
+        "unidirectional_sequence_lstm.cc",
+        "unidirectional_sequence_rnn.cc",
+        "unpack.cc",
+        "zeros_like.cc",
+    ],
+    hdrs = [
+    ],
+    copts = tflite_copts() + tf_opts_nortti_if_android() + EXTRA_EIGEN_COPTS,
+    visibility = ["//visibility:private"],
+    deps = [
+        ":activation_functor",
+        ":eigen_support",
+        ":kernel_util",
+        ":lstm_eval",
+        ":op_macros",
+        ":padding",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:gemm_support",
+        "//tensorflow/lite/kernels/internal:audio_utils",
+        "//tensorflow/lite/kernels/internal:kernel_utils",
+        "//tensorflow/lite/kernels/internal:optimized",
+        "//tensorflow/lite/kernels/internal:optimized_base",
+        "//tensorflow/lite/kernels/internal:quantization_util",
+        "//tensorflow/lite/kernels/internal:reference_base",
+        "//tensorflow/lite/kernels/internal:tensor",
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "@farmhash_archive//:farmhash",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "lstm_eval",
+    srcs = ["lstm_eval.cc"],
+    hdrs = ["lstm_eval.h"],
+    deps = [
+        ":op_macros",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels/internal:kernel_utils",
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+    ],
+)
+
+cc_library(
+    name = "builtin_ops",
+    srcs = ["register.cc"],
+    hdrs = ["register.h"],
+    deps = [
+        ":builtin_op_kernels",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:util",
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+tf_cc_test(
+    name = "audio_spectrogram_test",
+    size = "small",
+    srcs = ["audio_spectrogram_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "mfcc_test",
+    size = "small",
+    srcs = ["mfcc_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "detection_postprocess_test",
+    size = "small",
+    srcs = ["detection_postprocess_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "relu1_test",
+    size = "small",
+    srcs = ["relu1_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "sparse_output_fully_connected_test",
+    size = "small",
+    srcs = ["sparse_output_fully_connected_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "activations_test",
+    size = "small",
+    srcs = ["activations_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "add_test",
+    size = "small",
+    srcs = ["add_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "arg_min_max_test",
+    size = "small",
+    srcs = ["arg_min_max_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "div_test",
+    size = "small",
+    srcs = ["div_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "sub_test",
+    size = "small",
+    srcs = ["sub_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "transpose_test",
+    size = "small",
+    srcs = ["transpose_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/kernels/internal:reference",
+        "//tensorflow/lite/kernels/internal:reference_base",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "space_to_batch_nd_test",
+    size = "small",
+    srcs = ["space_to_batch_nd_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "batch_to_space_nd_test",
+    size = "small",
+    srcs = ["batch_to_space_nd_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "cast_test",
+    size = "small",
+    srcs = ["cast_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "concatenation_test",
+    size = "small",
+    srcs = ["concatenation_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "conv_test",
+    size = "small",
+    srcs = ["conv_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "depthwise_conv_test",
+    size = "small",
+    srcs = ["depthwise_conv_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "dequantize_test",
+    size = "small",
+    srcs = ["dequantize_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "basic_rnn_test",
+    size = "small",
+    srcs = ["basic_rnn_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "bidirectional_sequence_lstm_test",
+    size = "small",
+    srcs = ["bidirectional_sequence_lstm_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "floor_test",
+    size = "small",
+    srcs = ["floor_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "elementwise_test",
+    size = "small",
+    srcs = ["elementwise_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "unidirectional_sequence_lstm_test",
+    size = "small",
+    srcs = ["unidirectional_sequence_lstm_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "bidirectional_sequence_rnn_test",
+    size = "small",
+    srcs = ["bidirectional_sequence_rnn_test.cc"],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "unidirectional_sequence_rnn_test",
+    size = "small",
+    srcs = ["unidirectional_sequence_rnn_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "l2norm_test",
+    size = "small",
+    srcs = ["l2norm_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "exp_test",
+    size = "small",
+    srcs = ["exp_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "fake_quant_test",
+    size = "small",
+    srcs = ["fake_quant_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "maximum_minimum_test",
+    size = "small",
+    srcs = ["maximum_minimum_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "reduce_test",
+    size = "small",
+    srcs = ["reduce_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "mul_test",
+    size = "small",
+    srcs = ["mul_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "pad_test",
+    size = "small",
+    srcs = ["pad_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "reshape_test",
+    size = "small",
+    srcs = ["reshape_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "gather_test",
+    size = "small",
+    srcs = ["gather_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "topk_v2_test",
+    size = "small",
+    srcs = ["topk_v2_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "resize_bilinear_test",
+    size = "small",
+    srcs = ["resize_bilinear_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "resize_nearest_neighbor_test",
+    size = "small",
+    srcs = ["resize_nearest_neighbor_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "svdf_test",
+    size = "small",
+    srcs = ["svdf_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "embedding_lookup_test",
+    size = "small",
+    srcs = ["embedding_lookup_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "embedding_lookup_sparse_test",
+    size = "small",
+    srcs = ["embedding_lookup_sparse_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "fully_connected_test",
+    size = "small",
+    srcs = ["fully_connected_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "local_response_norm_test",
+    size = "small",
+    srcs = ["local_response_norm_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "pooling_test",
+    size = "small",
+    srcs = ["pooling_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "softmax_test",
+    size = "small",
+    srcs = ["softmax_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/kernels/internal:reference_base",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "log_softmax_test",
+    size = "small",
+    srcs = ["log_softmax_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/kernels/internal:reference_base",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "lsh_projection_test",
+    size = "small",
+    srcs = ["lsh_projection_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "hashtable_lookup_test",
+    size = "small",
+    srcs = ["hashtable_lookup_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "layer_norm_lstm_test",
+    size = "small",
+    srcs = ["layer_norm_lstm_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "lstm_test",
+    size = "small",
+    srcs = ["lstm_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "skip_gram_test",
+    size = "small",
+    srcs = ["skip_gram_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "space_to_depth_test",
+    size = "small",
+    srcs = ["space_to_depth_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "split_test",
+    size = "small",
+    srcs = ["split_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "split_v_test",
+    size = "small",
+    srcs = ["split_v_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "squeeze_test",
+    size = "small",
+    srcs = ["squeeze_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "strided_slice_test",
+    size = "small",
+    srcs = ["strided_slice_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "tile_test",
+    size = "small",
+    srcs = ["tile_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "comparisons_test",
+    size = "small",
+    srcs = [
+        "comparisons_test.cc",
+    ],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "neg_test",
+    size = "small",
+    srcs = ["neg_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "select_test",
+    size = "small",
+    srcs = [
+        "select_test.cc",
+    ],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "slice_test",
+    size = "small",
+    srcs = [
+        "slice_test.cc",
+    ],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "transpose_conv_test",
+    size = "small",
+    srcs = ["transpose_conv_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "expand_dims_test",
+    size = "small",
+    srcs = ["expand_dims_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "sparse_to_dense_test",
+    size = "small",
+    srcs = ["sparse_to_dense_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "shape_test",
+    size = "small",
+    srcs = ["shape_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "pow_test",
+    size = "small",
+    srcs = ["pow_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "pack_test",
+    size = "small",
+    srcs = ["pack_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "one_hot_test",
+    size = "small",
+    srcs = ["one_hot_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "logical_test",
+    size = "small",
+    srcs = ["logical_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "unpack_test",
+    size = "small",
+    srcs = ["unpack_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "floor_div_test",
+    size = "small",
+    srcs = ["floor_div_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "zeros_like_test",
+    size = "small",
+    srcs = ["zeros_like_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "floor_mod_test",
+    size = "small",
+    srcs = ["floor_mod_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "range_test",
+    size = "small",
+    srcs = ["range_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "squared_difference_test",
+    size = "small",
+    srcs = ["squared_difference_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "fill_test",
+    size = "small",
+    srcs = ["fill_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tflite_portable_test_suite()
+
+tf_cc_test(
+    name = "mirror_pad_test",
+    srcs = ["mirror_pad_test.cc"],
+    deps = [
+        ":builtin_ops",
+        ":test_util",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/contrib/lite/kernels/activation_functor.h b/tensorflow/lite/kernels/activation_functor.h
similarity index 86%
rename from tensorflow/contrib/lite/kernels/activation_functor.h
rename to tensorflow/lite/kernels/activation_functor.h
index e075dc705410bb3e0dd2a4fc9c626107529817cf..60e93c185a9c07a4d63cf233625df475a6a3f67a 100644
--- a/tensorflow/contrib/lite/kernels/activation_functor.h
+++ b/tensorflow/lite/kernels/activation_functor.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#ifndef TENSORFLOW_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#define TENSORFLOW_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
 
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
 
 namespace tflite {
 
@@ -55,4 +55,4 @@ class ActivationFunctor {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#endif  // TENSORFLOW_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
similarity index 84%
rename from tensorflow/contrib/lite/kernels/activations.cc
rename to tensorflow/lite/kernels/activations.cc
index 9aed4f09b82cc0ac70c68a4da46706a6244084aa..a76654256044702736a2855d4bb12d445c90be55 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -19,14 +19,14 @@ limitations under the License.
 #include <iostream>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -45,6 +45,11 @@ struct LogSoftmaxOpData : public OpData {
   int32_t reverse_scaling_right_shift = 0;
 };
 
+struct PreluOpData : public OpData {
+  int32_t output_multiplier = 0;
+  int output_shift = 0;
+};
+
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
@@ -57,6 +62,10 @@ void* LogSoftmaxInit(TfLiteContext* context, const char* buffer,
   return new LogSoftmaxOpData;
 }
 
+void* PreluInit(TfLiteContext* context, const char* buffer, size_t length) {
+  return new PreluOpData;
+}
+
 void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
@@ -65,6 +74,10 @@ void LogSoftmaxFree(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<LogSoftmaxOpData*>(buffer);
 }
 
+void PreluFree(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<PreluOpData*>(buffer);
+}
+
 TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -253,13 +266,18 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   const TfLiteTensor* alpha = GetInput(context, node, 1);
+  PreluOpData* data = reinterpret_cast<PreluOpData*>(node->user_data);
 
-  // Currently only Float32 is supported
-  // TODO(ycling): Support other data types.
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, alpha->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input->type, alpha->type);
   output->type = input->type;
 
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+    double real_multiplier =
+        input->params.scale * alpha->params.scale / output->params.scale;
+    QuantizeMultiplierSmallerThanOneExp(
+        real_multiplier, &data->output_multiplier, &data->output_shift);
+  }
+
   // PRelu (parameteric Relu) shares the same alpha value on "shared axis".
   // This means it's always required to "broadcast" alpha values in PRelu.
   TfLiteIntArray* output_size = nullptr;
@@ -288,8 +306,8 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -309,8 +327,8 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -327,9 +345,24 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
       return kTfLiteOk;
     } break;
+    case kTfLiteUInt8: {
+      ActivationParams params;
+      params.activation_type = FusedActivationFunctionType::kRelu6;
+      params.quantized_activation_min = std::max(
+          0, output->params.zero_point +
+                 static_cast<int32>(roundf(0.f / output->params.scale)));
+      params.quantized_activation_max = std::min(
+          255, output->params.zero_point +
+                   static_cast<int32>(roundf(6.f / output->params.scale)));
+      optimized_ops::ReluX(params, GetTensorShape(input),
+                           GetTensorData<uint8>(input), GetTensorShape(output),
+                           GetTensorData<uint8>(output));
+      return kTfLiteOk;
+    } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(
+          context, "Only float32 and uint8 supported currently, got %s.",
+          TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -367,8 +400,8 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -407,9 +440,8 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
       break;
     }
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
-      return kTfLiteError;
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
   }
   return kTfLiteOk;
 }
@@ -604,8 +636,8 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
     }
     default:
       context->ReportError(
-          context, "Only float32 and uint8_t supported currently, got %d.",
-          input->type);
+          context, "Only float32 and uint8_t supported currently, got %s.",
+          TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -636,8 +668,8 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     }
     default:
-      context->ReportError(context, "Only float32 supported currently., got %d",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently., got %s",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -651,16 +683,57 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   const TfLiteTensor* alpha = GetInput(context, node, 1);
   TfLiteTensor* output = GetOutput(context, node, 0);
-  if (input->type != kTfLiteFloat32) {
-    context->ReportError(context, "Only float32 supported currently, got %d.",
-                         input->type);
-    return kTfLiteError;
+  const PreluOpData* data = reinterpret_cast<PreluOpData*>(node->user_data);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
+          GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(alpha), GetTensorData<float>(alpha),
+          GetTensorShape(output), GetTensorData<float>(output),
+          ApplyPrelu<float>);
+      return kTfLiteOk;
+    } break;
+    case kTfLiteUInt8: {
+      PreluParams op_params;
+      op_params.input_offset = -input->params.zero_point;
+      op_params.alpha_offset = -alpha->params.zero_point;
+      op_params.output_offset = output->params.zero_point;
+      op_params.output_multiplier = data->output_multiplier;
+      op_params.output_shift = data->output_shift;
+      reference_ops::BroadcastPrelu4DSlow(
+          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(alpha), GetTensorData<uint8_t>(alpha),
+          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context,
+                           "Only float32, uint8 supported currently, got %d.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  const auto* params =
+      reinterpret_cast<TfLiteLeakyReluParams*>(node->builtin_data);
+
+  LeakyReluParams op_params;
+  op_params.alpha = params->alpha;
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      optimized_ops::LeakyRelu(
+          op_params, GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(output), GetTensorData<float>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
   }
-  reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
-      GetTensorShape(input), GetTensorData<float>(input), GetTensorShape(alpha),
-      GetTensorData<float>(alpha), GetTensorShape(output),
-      GetTensorData<float>(output), ApplyPrelu<float>);
-  return kTfLiteOk;
 }
 
 }  // namespace activations
@@ -715,12 +788,19 @@ TfLiteRegistration* Register_LOG_SOFTMAX() {
 }
 
 TfLiteRegistration* Register_PRELU() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+  static TfLiteRegistration r = {activations::PreluInit, activations::PreluFree,
                                  activations::PreluPrepare,
                                  activations::PreluEval};
   return &r;
 }
 
+TfLiteRegistration* Register_LEAKY_RELU() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::LeakyReluEval};
+  return &r;
+}
+
 }  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
similarity index 81%
rename from tensorflow/contrib/lite/kernels/activations_test.cc
rename to tensorflow/lite/kernels/activations_test.cc
index 9fa47e190a1dc797264e31979b9a6603ce8c5498..67f137baff29808d7a03571e1880901e44c34712 100644
--- a/tensorflow/contrib/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -170,6 +170,29 @@ TEST(FloatActivationsOpTest, Tanh) {
                              })));
 }
 
+TEST(QuantizedActivationsOpTest, Relu6) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_RELU6,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_UINT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax});
+  m.SetInput<uint8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0, 0, 2, 4,  //
+                      3, 0, 6, 1,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({128, 128, 160, 192, 176, 128, 224, 144}));
+}
+
 TEST(QuantizedActivationsOpTest, Tanh) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
@@ -563,15 +586,29 @@ TEST(QuantizedActivationsOpTest, LogSoftmax) {
               ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
 }
 
-class PReluOpModel : public SingleOpModel {
+// A base class of PRelu op model. It provides the constructor for
+// FloatPReluOpModel and QuantizedPReluOpModel.
+class BasePReluOpModel : public SingleOpModel {
  public:
-  PReluOpModel(const TensorData& input, const TensorData& alpha) {
+  BasePReluOpModel(const TensorData& input, const TensorData& alpha) {
     input_ = AddInput(input);
     alpha_ = AddInput(alpha);
-    output_ = AddOutput(input);
+    output_ = AddOutput({input.type, input.shape, input.min, input.max});
     SetBuiltinOp(BuiltinOperator_PRELU, BuiltinOptions_NONE, 0);
     BuildInterpreter({GetShape(input_), GetShape(alpha_)});
   }
+
+ protected:
+  int input_;
+  int alpha_;
+  int output_;
+};
+
+// The FloatPReluOpModel class handles float input and output.
+class FloatPReluOpModel : public BasePReluOpModel {
+ public:
+  using BasePReluOpModel::BasePReluOpModel;
+
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor(input_, data);
   }
@@ -579,16 +616,35 @@ class PReluOpModel : public SingleOpModel {
     PopulateTensor(alpha_, data);
   }
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
 
- protected:
-  int input_;
-  int alpha_;
-  int output_;
+// The QuantizedPReluOpModel class handles quantized input and output.
+class QuantizedPReluOpModel : public BasePReluOpModel {
+ public:
+  using BasePReluOpModel::BasePReluOpModel;
+
+  template <typename T>
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<T>(input_, data);
+  }
+  template <typename T>
+  void SetAlpha(std::initializer_list<float> data) {
+    QuantizeAndPopulate<T>(alpha_, data);
+  }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  template <typename T>
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
+  }
 };
 
 TEST(FloatActivationsOpTest, PRelu) {
-  PReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}},
-                 {TensorType_FLOAT32, {1, 1, 3}});
+  FloatPReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}},
+                      {TensorType_FLOAT32, {1, 1, 3}});
 
   m.SetInput({
       0.0f, 0.0f, 0.0f,     // Row 1, Column 1
@@ -606,6 +662,69 @@ TEST(FloatActivationsOpTest, PRelu) {
                              }));
 }
 
+TEST(QuantizedActivationsOpTest, PRelu) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedPReluOpModel m({TensorType_UINT8, {1, 2, 2, 3}, kMin, kMax},
+                          {TensorType_UINT8, {1, 1, 3}, kMin, kMax});
+  m.SetInput<uint8_t>({
+      0.0f, 0.0f, 0.0f,        // Row 1, Column 1
+      0.5f, 0.5f, 0.5f,        // Row 1, Column 2
+      -1.0f, -1.0f, -1.0f,     // Row 2, Column 1
+      -0.25f, -0.25f, -0.25f,  // Row 1, Column 2
+  });
+  m.SetAlpha<uint8_t>({0.0f, 0.5f, -0.5f});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0f, 0.0f, 0.0f,       // Row 1, Column 1
+                      0.5f, 0.5f, 0.5f,       // Row 1, Column 2
+                      0.0f, -0.5f, 0.5f,      // Row 2, Column 1
+                      0.0f, -0.125f, 0.125f,  // Row 1, Column 2
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({
+                                          128, 128, 128,  // Row 1, Column 1
+                                          192, 192, 192,  // Row 1, Column 2
+                                          128, 64, 192,   // Row 2, Column 1
+                                          128, 112, 144,  // Row 1, Column 2
+                                      }));
+}
+
+class LeakyReluOpModel : public SingleOpModel {
+ public:
+  LeakyReluOpModel(const TensorData& input, float alpha) {
+    input_ = AddInput(input);
+    output_ = AddOutput(input);
+    SetBuiltinOp(BuiltinOperator_LEAKY_RELU, BuiltinOptions_LeakyReluOptions,
+                 CreateLeakyReluOptions(builder_, alpha).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+TEST(FloatActivationsOpTest, LeakyRelu) {
+  LeakyReluOpModel m({TensorType_FLOAT32, {2, 3}}, 0.5f);
+
+  m.SetInput({
+      0.0f, 1.0f, 3.0f,    // Row 1
+      1.0f, -1.0f, -2.0f,  // Row 2
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0.0f, 1.0f, 3.0f,    // Row 1
+                                 1.0f, -0.5f, -1.0f,  // Row 2
+                             }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
similarity index 87%
rename from tensorflow/contrib/lite/kernels/add.cc
rename to tensorflow/lite/kernels/add.cc
index b4393e8097f7f5e28e654269371a54097333c75f..32a7c100ce53101063d81345bcb052e680e64a28 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -220,30 +220,38 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
   if (output->type == kTfLiteUInt8) {
+    tflite::ArithmeticParams op_params;
+    op_params.left_shift = data->left_shift;
+    op_params.input1_offset = data->input1_offset;
+    op_params.input1_multiplier = data->input1_multiplier;
+    op_params.input1_shift = data->input1_shift;
+    op_params.input2_offset = data->input2_offset;
+    op_params.input2_multiplier = data->input2_multiplier;
+    op_params.input2_shift = data->input2_shift;
+    op_params.output_offset = data->output_offset;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
+    bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
+        GetTensorShape(input1), GetTensorShape(input2), &op_params);
 #define TF_LITE_ADD(type, opname)                                      \
-  tflite::ArithmeticParams op_params;                                  \
-  op_params.left_shift = data->left_shift;                             \
-  op_params.input1_offset = data->input1_offset;                       \
-  op_params.input1_multiplier = data->input1_multiplier;               \
-  op_params.input1_shift = data->input1_shift;                         \
-  op_params.input2_offset = data->input2_offset;                       \
-  op_params.input2_multiplier = data->input2_multiplier;               \
-  op_params.input2_shift = data->input2_shift;                         \
-  op_params.output_offset = data->output_offset;                       \
-  op_params.output_multiplier = data->output_multiplier;               \
-  op_params.output_shift = data->output_shift;                         \
-  SetActivationParams(data->output_activation_min,                     \
-                      data->output_activation_max, &op_params);        \
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
                GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-    // The quantized version of Add doesn't support activations, so we
-    // always use BroadcastAdd.
+               GetTensorData<uint8_t>(output));
     if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow);
+      if (need_broadcast) {
+        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow);
+      } else {
+        TF_LITE_ADD(reference_ops, Add);
+      }
     } else {
-      TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow);
+      if (need_broadcast) {
+        TF_LITE_ADD(optimized_ops, BroadcastAddFivefold);
+      } else {
+        TF_LITE_ADD(optimized_ops, Add);
+      }
     }
 #undef TF_LITE_ADD
   } else if (output->type == kTfLiteInt16) {
diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc
similarity index 89%
rename from tensorflow/contrib/lite/kernels/add_test.cc
rename to tensorflow/lite/kernels/add_test.cc
index 0b5844321133de103919de76d367574f018a6698..1d33adf1999ecde581badf041276ec15b4370689 100644
--- a/tensorflow/contrib/lite/kernels/add_test.cc
+++ b/tensorflow/lite/kernels/add_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -108,7 +108,7 @@ TEST(FloatAddOpModel, ActivationRELU_N1_TO_1) {
 }
 
 TEST(FloatAddOpModel, VariousInputShapes) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
@@ -124,7 +124,7 @@ TEST(FloatAddOpModel, VariousInputShapes) {
 }
 
 TEST(FloatAddOpModel, WithBroadcast) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
@@ -161,7 +161,7 @@ TEST(IntegerAddOpModel, ActivationRELU_N1_TO_1) {
 }
 
 TEST(IntegerAddOpModel, VariousInputShapes) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     IntegerAddOpModel m({TensorType_INT32, test_shapes[i]},
@@ -176,7 +176,7 @@ TEST(IntegerAddOpModel, VariousInputShapes) {
 }
 
 TEST(IntegerAddOpModel, WithBroadcast) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     IntegerAddOpModel m({TensorType_INT32, test_shapes[i]},
@@ -193,11 +193,11 @@ TEST(IntegerAddOpModel, WithBroadcast) {
 
 TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
-  std::vector<std::initializer_list<float>> inputs1 = {
+  std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
-  std::vector<std::initializer_list<float>> inputs2 = {
+  std::vector<std::vector<float>> inputs2 = {
       {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}};
-  std::vector<std::initializer_list<float>> results = {
+  std::vector<std::vector<float>> results = {
       {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
@@ -217,11 +217,11 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
   float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
-  std::vector<std::initializer_list<float>> inputs1 = {
+  std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
-  std::vector<std::initializer_list<float>> inputs2 = {
+  std::vector<std::vector<float>> inputs2 = {
       {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}};
-  std::vector<std::initializer_list<float>> results = {
+  std::vector<std::vector<float>> results = {
       {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedAddOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
@@ -240,12 +240,12 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
 
 TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
-  std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
-                                                       {-0.8, 0.2, 0.7, 0.3}};
-  std::vector<std::initializer_list<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
-                                                       {0.6, 0.4, -0.8, 0.5}};
-  std::vector<std::initializer_list<float>> results = {{-0.2, 0.6, 1.0, -0.1},
-                                                       {-0.2, 0.6, -0.1, 0.8}};
+  std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+                                             {-0.8, 0.2, 0.7, 0.3}};
+  std::vector<std::vector<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+                                             {0.6, 0.4, -0.8, 0.5}};
+  std::vector<std::vector<float>> results = {{-0.2, 0.6, 1.0, -0.1},
+                                             {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
@@ -262,7 +262,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
 
 TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
@@ -281,7 +281,7 @@ TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
 
 TEST(QuantizedAddOpModel, QuantizedWithBroadcast) {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
diff --git a/tensorflow/contrib/lite/kernels/arg_min_max.cc b/tensorflow/lite/kernels/arg_min_max.cc
similarity index 83%
rename from tensorflow/contrib/lite/kernels/arg_min_max.cc
rename to tensorflow/lite/kernels/arg_min_max.cc
index b91e348c27f4a5a0d6af3462612db8cbfb97af05..eea2de27f74af8bf73df92c28ed6042e4d8fa4ff 100644
--- a/tensorflow/contrib/lite/kernels/arg_min_max.cc
+++ b/tensorflow/lite/kernels/arg_min_max.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -29,6 +29,19 @@ constexpr int kInputTensor = 0;
 constexpr int kAxis = 1;
 constexpr int kOutputTensor = 0;
 
+TfLiteStatus ResizeOutput(TfLiteContext* context, const TfLiteTensor* input,
+                          const TfLiteTensor* axis, TfLiteTensor* output) {
+  int axis_value = *GetTensorData<int>(axis);
+  if (axis_value < 0) {
+    axis_value += NumDimensions(input);
+  }
+
+  // Copy the input dimensions to output except make the axis dimension 1.
+  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input->dims);
+  output_dims->data[axis_value] = 1;
+  return context->ResizeTensor(context, output, output_dims);
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -37,10 +50,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* axis = GetInput(context, node, kAxis);
   // Make sure the axis is only 1 dimension.
   TF_LITE_ENSURE_EQ(context, NumElements(axis), 1);
-
   // Make sure the axis is only either int32 or int64.
   TF_LITE_ENSURE(context,
                  axis->type == kTfLiteInt32 || axis->type == kTfLiteInt64);
+
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   auto* params = reinterpret_cast<TfLiteArgMaxParams*>(node->builtin_data);
@@ -72,12 +85,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteError;
   }
 
-  // Copy the input dimensions to output except make the last dimension 1.
   TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
-  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
-  output_size->data[NumDimensions(input) - 1] = 1;
 
-  return context->ResizeTensor(context, output, output_size);
+  if (IsConstantTensor(axis)) {
+    TF_LITE_ENSURE_STATUS(ResizeOutput(context, input, axis, output));
+  } else {
+    SetTensorToDynamic(output);
+  }
+
+  return kTfLiteOk;
 }
 
 template <typename T>
@@ -89,12 +105,13 @@ std::function<bool(T, T)> GetComparefunction(bool is_arg_max) {
   }
 }
 
-// The current impl actually ignores the axis argument.
-// Only determine the index of the maximum value in the last dimension.
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* axis = GetInput(context, node, kAxis);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_STATUS(ResizeOutput(context, input, axis, output));
+  }
 
 #define TF_LITE_ARG_MIN_MAX(data_type, axis_type, output_type) \
   optimized_ops::ArgMinMax(                                    \
diff --git a/tensorflow/contrib/lite/kernels/arg_min_max_test.cc b/tensorflow/lite/kernels/arg_min_max_test.cc
similarity index 85%
rename from tensorflow/contrib/lite/kernels/arg_min_max_test.cc
rename to tensorflow/lite/kernels/arg_min_max_test.cc
index 90e5fdc532c821691aaeca6e6faa4c24919ca2c8..dcdff74cc6f376b3418b64c025e8eb4a36c429a0 100644
--- a/tensorflow/contrib/lite/kernels/arg_min_max_test.cc
+++ b/tensorflow/lite/kernels/arg_min_max_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -79,7 +79,6 @@ TEST(ArgMaxOpTest, GetMaxArgFloat) {
   ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_FLOAT32,
                                TensorType_INT32, TensorType_INT32);
   model.PopulateTensor<float>(model.input(), {0.1, 0.9, 0.7, 0.3});
-  // Currently only support the last dimension.
   model.PopulateTensor<int>(model.axis(), {3});
   model.Invoke();
 
@@ -91,7 +90,6 @@ TEST(ArgMaxOpTest, GetMaxArgInt) {
   ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT32, TensorType_INT32,
                                TensorType_INT32);
   model.PopulateTensor<int>(model.input(), {1, 9, 7, 3});
-  // Currently only support the last dimension.
   model.PopulateTensor<int>(model.axis(), {3});
   model.Invoke();
 
@@ -103,7 +101,6 @@ TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
   ArgMaxOpModel<int32_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT32,
                                TensorType_INT32);
   model.PopulateTensor<int>(model.input(), {1, 2, 7, 8, 1, 9, 7, 3});
-  // Currently only support the last dimension.
   model.PopulateTensor<int>(model.axis(), {3});
   model.Invoke();
 
@@ -111,11 +108,21 @@ TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
 }
 
+TEST(ArgMaxOpTest, GetMaxArgNegativeAxis) {
+  ArgMaxOpModel<int32_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 2, 7, 8, 1, 9, 7, 3});
+  model.PopulateTensor<int>(model.axis(), {-2});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1, 0, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
 TEST(ArgMaxOpTest, GetMaxArgOutput64) {
   ArgMaxOpModel<int64_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT64,
                                TensorType_INT64);
   model.PopulateTensor<int>(model.input(), {10, 2, 7, 8, 1, 9, 7, 3});
-  // Currently only support the last dimension.
   model.PopulateTensor<int>(model.axis(), {3});
   model.Invoke();
 
@@ -127,7 +134,6 @@ TEST(ArgMinOpTest, GetMinArgFloat) {
   ArgMinOpModel<int32_t> model({1, 1, 1, 4}, TensorType_FLOAT32,
                                TensorType_INT32, TensorType_INT32);
   model.PopulateTensor<float>(model.input(), {0.1, 0.9, 0.7, 0.3});
-  // Currently only support the last dimension.
   model.PopulateTensor<int>(model.axis(), {3});
   model.Invoke();
 
@@ -139,7 +145,6 @@ TEST(ArgMinOpTest, GetMinArgInt) {
   ArgMinOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT32, TensorType_INT32,
                                TensorType_INT32);
   model.PopulateTensor<int>(model.input(), {1, 9, 7, 3});
-  // Currently only support the last dimension.
   model.PopulateTensor<int>(model.axis(), {3});
   model.Invoke();
 
@@ -151,7 +156,6 @@ TEST(ArgMinOpTest, GetMinArgMulDimensions) {
   ArgMinOpModel<int32_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT32,
                                TensorType_INT32);
   model.PopulateTensor<int>(model.input(), {1, 2, 7, 8, 1, 9, 7, 3});
-  // Currently only support the last dimension.
   model.PopulateTensor<int>(model.axis(), {3});
   model.Invoke();
 
@@ -159,11 +163,21 @@ TEST(ArgMinOpTest, GetMinArgMulDimensions) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
 }
 
+TEST(ArgMinOpTest, GetMinArgNegativeAxis) {
+  ArgMinOpModel<int32_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 2, 7, 8, 1, 9, 7, 3});
+  model.PopulateTensor<int>(model.axis(), {-2});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 0, 0, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
 TEST(ArgMinOpTest, GetMinArgOutput64) {
   ArgMinOpModel<int64_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT64,
                                TensorType_INT64);
   model.PopulateTensor<int>(model.input(), {10, 2, 7, 8, 1, 9, 7, 3});
-  // Currently only support the last dimension.
   model.PopulateTensor<int>(model.axis(), {3});
   model.Invoke();
 
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc b/tensorflow/lite/kernels/audio_spectrogram.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/audio_spectrogram.cc
rename to tensorflow/lite/kernels/audio_spectrogram.cc
index 0d2d5e775f82a281cedcd9abd1541820246b8299..5a995b31ca5e6f2c6666df1bbb539a1bd538a511 100644
--- a/tensorflow/contrib/lite/kernels/audio_spectrogram.cc
+++ b/tensorflow/lite/kernels/audio_spectrogram.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/spectrogram.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/spectrogram.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
 
diff --git a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc b/tensorflow/lite/kernels/audio_spectrogram_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
rename to tensorflow/lite/kernels/audio_spectrogram_test.cc
index 7e4ff6fc16f26d74d6decab9bc534accc8607a19..527af2767b1bfbc7995252d3f2f307ac4fb46671 100644
--- a/tensorflow/contrib/lite/kernels/audio_spectrogram_test.cc
+++ b/tensorflow/lite/kernels/audio_spectrogram_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/lite/kernels/basic_rnn.cc
similarity index 85%
rename from tensorflow/contrib/lite/kernels/basic_rnn.cc
rename to tensorflow/lite/kernels/basic_rnn.cc
index 1aa27602e5e40ca7607488ad9ae5929410266329..7c66ce1992f4c341d7518742cd209a53fa1de16b 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn.cc
+++ b/tensorflow/lite/kernels/basic_rnn.cc
@@ -15,12 +15,12 @@ limitations under the License.
 #include <stddef.h>
 #include <stdint.h>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -63,10 +63,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // input configuration.
   const int batch_size = input->dims->data[0];
   const int num_units = input_weights->dims->data[0];
-  TF_LITE_ASSERT_EQ(input->dims->data[1], input_weights->dims->data[1]);
-  TF_LITE_ASSERT_EQ(input_weights->dims->data[0], bias->dims->data[0]);
-  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[0], bias->dims->data[0]);
-  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, input->dims->data[1],
+                    input_weights->dims->data[1]);
+  TF_LITE_ENSURE_EQ(context, input_weights->dims->data[0], bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, recurrent_weights->dims->data[0],
+                    bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, recurrent_weights->dims->data[1],
+                    bias->dims->data[0]);
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
   TF_LITE_ENSURE_EQ(context, input_weights->type, recurrent_weights->type);
   TF_LITE_ENSURE_EQ(context, NumDimensions(hidden_state), 2);
@@ -114,9 +117,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2);
     scaling_factors->type = kTfLiteFloat32;
     scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-    scaling_factors_size->data[0] = batch_size;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+    int scaling_dims[1] = {batch_size};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = batch_size;
       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
                                                        scaling_factors_size));
     }
@@ -133,6 +137,8 @@ TfLiteStatus EvalFloat(const TfLiteTensor* input,
   const int batch_size = input->dims->data[0];
   const int num_units = input_weights->dims->data[0];
   const int input_size = input->dims->data[1];
+  const int output_batch_leading_dim =
+      output->dims->data[output->dims->size - 1];
 
   // Initialize the pointer to hidden state.
   float* hidden_state_ptr_batch = hidden_state->data.f;
@@ -144,10 +150,10 @@ TfLiteStatus EvalFloat(const TfLiteTensor* input,
   const float* recurrent_weights_ptr = recurrent_weights->data.f;
   const float* bias_ptr = bias->data.f;
 
-  kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr,
-                             recurrent_weights_ptr, bias_ptr, input_size,
-                             num_units, batch_size, params->activation,
-                             hidden_state_ptr_batch, output_ptr_batch);
+  kernel_utils::RnnBatchStep(
+      input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, bias_ptr,
+      input_size, num_units, batch_size, output_batch_leading_dim,
+      params->activation, hidden_state_ptr_batch, output_ptr_batch);
   return kTfLiteOk;
 }
 
@@ -162,6 +168,8 @@ TfLiteStatus EvalHybrid(const TfLiteTensor* input,
   const int batch_size = input->dims->data[0];
   const int num_units = input_weights->dims->data[0];
   const int input_size = input->dims->data[1];
+  const int output_batch_leading_dim =
+      output->dims->data[output->dims->size - 1];
 
   // Initialize the pointer to hidden state.
   float* hidden_state_ptr_batch = hidden_state->data.f;
@@ -187,9 +195,9 @@ TfLiteStatus EvalHybrid(const TfLiteTensor* input,
   kernel_utils::RnnBatchStep(
       input_ptr_batch, input_weights_ptr, input_weights_scale,
       recurrent_weights_ptr, recurrent_weights_scale, bias_ptr, input_size,
-      num_units, batch_size, params->activation, quantized_input_ptr,
-      quantized_hidden_state_ptr, scaling_factors_ptr, hidden_state_ptr_batch,
-      output_ptr_batch);
+      num_units, batch_size, output_batch_leading_dim, params->activation,
+      quantized_input_ptr, quantized_hidden_state_ptr, scaling_factors_ptr,
+      hidden_state_ptr_batch, output_ptr_batch);
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/lite/kernels/basic_rnn_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/kernels/basic_rnn_test.cc
rename to tensorflow/lite/kernels/basic_rnn_test.cc
index d1797354044c2f2086f1af0cffb7f1edff65f24c..240057d18a176dbb77e4962b48493c1a8d2dddab 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
+++ b/tensorflow/lite/kernels/basic_rnn_test.cc
@@ -21,10 +21,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc b/tensorflow/lite/kernels/batch_to_space_nd.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
rename to tensorflow/lite/kernels/batch_to_space_nd.cc
index fe2865dfb9a9934962dfb1c4c2f7f8817815b3f9..34fdf34f70c9660266e23260bd5a6b645a3c5ccb 100644
--- a/tensorflow/contrib/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/lite/kernels/batch_to_space_nd.cc
@@ -14,13 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3e06d4c89327050625ac514d41bc29c4f6493f3
--- /dev/null
+++ b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
@@ -0,0 +1,142 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BatchToSpaceNDOpModel : public SingleOpModel {
+ public:
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  void SetBlockShape(std::initializer_list<int> data) {
+    PopulateTensor<int>(block_shape_, data);
+  }
+
+  void SetCrops(std::initializer_list<int> data) {
+    PopulateTensor<int>(crops_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int block_shape_;
+  int crops_;
+  int output_;
+};
+
+// Tests case where block_shape and crops are const tensors.
+//
+// Example usage is as follows:
+//    BatchToSpaceNDOpConstModel m(input_shape, block_shape, crops);
+//    m.SetInput(input_data);
+//    m.Invoke();
+class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
+ public:
+  BatchToSpaceNDOpConstModel(std::initializer_list<int> input_shape,
+                             std::initializer_list<int> block_shape,
+                             std::initializer_list<int> crops) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
+    crops_ = AddConstInput(TensorType_INT32, crops, {2, 2});
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
+                 BuiltinOptions_BatchToSpaceNDOptions,
+                 CreateBatchToSpaceNDOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+// Tests case where block_shape and crops are non-const tensors.
+//
+// Example usage is as follows:
+//    BatchToSpaceNDOpDynamicModel m(input_shape);
+//    m.SetInput(input_data);
+//    m.SetBlockShape(block_shape);
+//    m.SetPaddings(crops);
+//    m.Invoke();
+class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
+ public:
+  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddInput(TensorType_INT32);
+    crops_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
+                 BuiltinOptions_BatchToSpaceNDOptions,
+                 CreateBatchToSpaceNDOptions(builder_).Union());
+    BuildInterpreter({input_shape, {2}, {2, 2}});
+  }
+};
+
+TEST(BatchToSpaceNDOpTest, SimpleConstTest) {
+  BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
+                                               4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
+                                               4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
+  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}),
+               "Cannot allocate tensors");
+}
+
+TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) {
+  EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, -1}),
+               "crops.3. >= 0 was not true.");
+}
+
+TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, -1, 0});
+  EXPECT_DEATH(m.Invoke(), "crops.2. >= 0 was not true.");
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c345bba69e4879586c6204dab21c1d28e404870
--- /dev/null
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
@@ -0,0 +1,992 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/lstm_eval.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace bidirectional_sequence_lstm {
+
+// Input Tensors of size {max_time, n_batch, n_input}
+constexpr int kInputTensor = 0;
+
+// Forward LSTM cell tensors.
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kFwInputToInputWeightsTensor = 1;  // Optional
+constexpr int kFwInputToForgetWeightsTensor = 2;
+constexpr int kFwInputToCellWeightsTensor = 3;
+constexpr int kFwInputToOutputWeightsTensor = 4;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kFwRecurrentToInputWeightsTensor = 5;  // Optional
+constexpr int kFwRecurrentToForgetWeightsTensor = 6;
+constexpr int kFwRecurrentToCellWeightsTensor = 7;
+constexpr int kFwRecurrentToOutputWeightsTensor = 8;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kFwCellToInputWeightsTensor = 9;    // Optional
+constexpr int kFwCellToForgetWeightsTensor = 10;  // Optional
+constexpr int kFwCellToOutputWeightsTensor = 11;  // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kFwInputGateBiasTensor = 12;  // Optional
+constexpr int kFwForgetGateBiasTensor = 13;
+constexpr int kFwCellGateBiasTensor = 14;
+constexpr int kFwOutputGateBiasTensor = 15;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kFwProjectionWeightsTensor = 16;  // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kFwProjectionBiasTensor = 17;  // Optional
+
+// Backward LSTM cell tensors.
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kBwInputToInputWeightsTensor = 18;  // Optional
+constexpr int kBwInputToForgetWeightsTensor = 19;
+constexpr int kBwInputToCellWeightsTensor = 20;
+constexpr int kBwInputToOutputWeightsTensor = 21;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kBwRecurrentToInputWeightsTensor = 22;  // Optional
+constexpr int kBwRecurrentToForgetWeightsTensor = 23;
+constexpr int kBwRecurrentToCellWeightsTensor = 24;
+constexpr int kBwRecurrentToOutputWeightsTensor = 25;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kBwCellToInputWeightsTensor = 26;   // Optional
+constexpr int kBwCellToForgetWeightsTensor = 27;  // Optional
+constexpr int kBwCellToOutputWeightsTensor = 28;  // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kBwInputGateBiasTensor = 29;  // Optional
+constexpr int kBwForgetGateBiasTensor = 30;
+constexpr int kBwCellGateBiasTensor = 31;
+constexpr int kBwOutputGateBiasTensor = 32;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kBwProjectionWeightsTensor = 33;  // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kBwProjectionBiasTensor = 34;  // Optional
+
+// Stateful input tensors that are variables and will be modified by the Op.
+// Activation state tensors of size {n_batch, n_output}
+constexpr int kFwInputActivationStateTensor = 35;
+// Cell state tensors of size {n_batch, n_cell}
+constexpr int kFwInputCellStateTensor = 36;
+// Activation state tensors of size {n_batch, n_output}
+constexpr int kBwInputActivationStateTensor = 37;
+// Cell state tensors of size {n_batch, n_cell}
+constexpr int kBwInputCellStateTensor = 38;
+
+// Auxiliary input and weights when stacking.
+constexpr int kAuxInputTensor = 39;  // Optional
+// Forward weights.
+constexpr int kFwAuxInputToInputWeightsTensor = 40;   // Optional
+constexpr int kFwAuxInputToForgetWeightsTensor = 41;  // Optional
+constexpr int kFwAuxInputToCellWeightsTensor = 42;    // Optional
+constexpr int kFwAuxInputToOutputWeightsTensor = 43;  // Optional
+// Backward weights.
+constexpr int kBwAuxInputToInputWeightsTensor = 44;   // Optional
+constexpr int kBwAuxInputToForgetWeightsTensor = 45;  // Optional
+constexpr int kBwAuxInputToCellWeightsTensor = 46;    // Optional
+constexpr int kBwAuxInputToOutputWeightsTensor = 47;  // Optional
+
+// Output tensors.
+constexpr int kFwOutputTensor = 0;
+constexpr int kBwOutputTensor = 1;  // Ignored if merge_outputs is set.
+
+// Temporary tensors.
+enum TemporaryTensor {
+  // Scratch buffers for input, forget, etc. gates
+  kFwScratchBuffer = 0,
+  kBwScratchBuffer = 1,
+  // Quantized tensors needed for the hybrid kernel.
+  kInputQuantized = 2,
+  kFwActivationStateQuantized = 3,
+  kBwActivationStateQuantized = 4,
+  kFwCellStateQuantized = 5,
+  kBwCellStateQuantized = 6,
+  kScalingFactors = 7,
+  kProductScalingFactors = 8,
+  kRecoveredCellWeights = 9,
+  kAuxInputQuantized = 10,  // Optional, quantized tensor for auxiliary input.
+  kNumTemporaryTensors = 11
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* scratch_tensor_index = new int;
+  context->AddTensors(context, kNumTemporaryTensors, scratch_tensor_index);
+  return scratch_tensor_index;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<int*>(buffer);
+}
+
+// Check that input tensor dimensions matches with each other.
+TfLiteStatus CheckLstmTensorDimensionsAndTypes(
+    TfLiteContext* context, TfLiteNode* node, int n_input, int n_output,
+    int n_cell, int input_to_input_weights_tensor,
+    int input_to_forget_weights_tensor, int input_to_cell_weights_tensor,
+    int input_to_output_weights_tensor, int recurrent_to_input_weights_tensor,
+    int recurrent_to_forget_weights_tensor,
+    int recurrent_to_cell_weights_tensor,
+    int recurrent_to_output_weights_tensor, int cell_to_input_weights_tensor,
+    int cell_to_forget_weights_tensor, int cell_to_output_weights_tensor,
+    int input_gate_bias_tensor, int forget_gate_bias_tensor,
+    int cell_gate_bias_tensor, int output_gate_bias_tensor,
+    int projection_weights_tensor, int projection_bias_tensor) {
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
+      node->builtin_data);
+
+  // Making sure clipping parameters have valid values.
+  // == 0 means no clipping
+  //  > 0 means clipping
+  TF_LITE_ENSURE(context, params->cell_clip >= 0);
+  TF_LITE_ENSURE(context, params->proj_clip >= 0);
+
+  const TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, input_to_forget_weights_tensor);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
+  TF_LITE_ENSURE(context, (input_to_forget_weights->type == kTfLiteFloat32) ||
+                              (input_to_forget_weights->type == kTfLiteUInt8));
+
+  const TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, input_to_input_weights_tensor);
+  if (input_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->type,
+                      input_to_forget_weights->type);
+  }
+
+  const TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, input_to_cell_weights_tensor);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->type,
+                    input_to_forget_weights->type);
+
+  const TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, input_to_output_weights_tensor);
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->type,
+                    input_to_forget_weights->type);
+
+  const TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, recurrent_to_input_weights_tensor);
+  if (recurrent_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
+                      n_cell);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
+                      n_output);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->type,
+                      input_to_forget_weights->type);
+  }
+
+  const TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, recurrent_to_forget_weights_tensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
+                    n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
+                    n_output);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->type,
+                    input_to_forget_weights->type);
+
+  const TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, recurrent_to_cell_weights_tensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
+                    n_output);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->type,
+                    input_to_forget_weights->type);
+
+  // We make sure the input-gate's parameters are either both present (regular
+  // LSTM) or not at all (CIFG-LSTM).
+  const bool cifg_weights_all_or_none =
+      ((input_to_input_weights != nullptr) &&
+       (recurrent_to_input_weights != nullptr)) ||
+      ((input_to_input_weights == nullptr) &&
+       (recurrent_to_input_weights == nullptr));
+  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
+
+  const TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, cell_to_input_weights_tensor);
+  if (cell_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->type,
+                      input_to_forget_weights->type);
+  }
+
+  const TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, cell_to_forget_weights_tensor);
+  if (cell_to_forget_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->type,
+                      input_to_forget_weights->type);
+  }
+
+  const TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, cell_to_output_weights_tensor);
+  if (cell_to_output_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->type,
+                      input_to_forget_weights->type);
+  }
+
+  // Making sure the peephole weights are there all or none.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool peephole_weights_all_or_none =
+      ((cell_to_input_weights != nullptr || use_cifg) &&
+       (cell_to_forget_weights != nullptr) &&
+       (cell_to_output_weights != nullptr)) ||
+      ((cell_to_input_weights == nullptr) &&
+       (cell_to_forget_weights == nullptr) &&
+       (cell_to_output_weights == nullptr));
+  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
+
+  // Make sure the input gate bias is present only when not a CIFG-LSTM.
+  const TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, input_gate_bias_tensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->type, kTfLiteFloat32);
+  }
+
+  const TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, forget_gate_bias_tensor);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->type, kTfLiteFloat32);
+
+  const TfLiteTensor* cell_bias =
+      GetInput(context, node, cell_gate_bias_tensor);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, cell_bias->type, kTfLiteFloat32);
+
+  const TfLiteTensor* output_gate_bias =
+      GetInput(context, node, output_gate_bias_tensor);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->type, kTfLiteFloat32);
+
+  const TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, projection_weights_tensor);
+  if (projection_weights) {
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
+    TF_LITE_ENSURE_EQ(context, projection_weights->type,
+                      input_to_forget_weights->type);
+  }
+
+  const TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, projection_bias_tensor);
+  if (projection_bias) {
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
+    TF_LITE_ENSURE_EQ(context, projection_bias->type, kTfLiteFloat32);
+  }
+
+  // Making sure the projection tensors are consistent:
+  // 1) If projection weight is not present, then projection bias should not be
+  // present.
+  // 2) If projection weight is present, then projection bias is optional.
+  // TODO(ghodrat): make sure this is correct.
+  const bool projecton_tensors_consistent =
+      ((projection_weights != nullptr) || (projection_bias == nullptr));
+  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
+                                        TfLiteNode* node, int n_input,
+                                        int n_output, int n_cell) {
+  TF_LITE_ENSURE_OK(
+      context,
+      CheckLstmTensorDimensionsAndTypes(
+          context, node, n_input, n_output, n_cell,
+          kFwInputToInputWeightsTensor, kFwInputToForgetWeightsTensor,
+          kFwInputToCellWeightsTensor, kFwInputToOutputWeightsTensor,
+          kFwRecurrentToInputWeightsTensor, kFwRecurrentToForgetWeightsTensor,
+          kFwRecurrentToCellWeightsTensor, kFwRecurrentToOutputWeightsTensor,
+          kFwCellToInputWeightsTensor, kFwCellToForgetWeightsTensor,
+          kFwCellToOutputWeightsTensor, kFwInputGateBiasTensor,
+          kFwForgetGateBiasTensor, kFwCellGateBiasTensor,
+          kFwOutputGateBiasTensor, kFwProjectionWeightsTensor,
+          kFwProjectionBiasTensor));
+
+  TF_LITE_ENSURE_OK(
+      context,
+      CheckLstmTensorDimensionsAndTypes(
+          context, node, n_input, n_output, n_cell,
+          kBwInputToInputWeightsTensor, kBwInputToForgetWeightsTensor,
+          kBwInputToCellWeightsTensor, kBwInputToOutputWeightsTensor,
+          kBwRecurrentToInputWeightsTensor, kBwRecurrentToForgetWeightsTensor,
+          kBwRecurrentToCellWeightsTensor, kBwRecurrentToOutputWeightsTensor,
+          kBwCellToInputWeightsTensor, kBwCellToForgetWeightsTensor,
+          kBwCellToOutputWeightsTensor, kBwInputGateBiasTensor,
+          kBwForgetGateBiasTensor, kBwCellGateBiasTensor,
+          kBwOutputGateBiasTensor, kBwProjectionWeightsTensor,
+          kBwProjectionBiasTensor));
+
+  // Check if Forward and Backward tensors match along required dimensions.
+  return kTfLiteOk;
+}
+
+// Resize the output and scratch tensors based on the sizes of the input
+// tensors. Also check that the size of the input tensors match each other.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
+      node->builtin_data);
+
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 48);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size,
+                    params->merge_outputs ? 1 : 2);
+
+  // Inferring batch size, number of outputs and sequence length and
+  // number of cells from the input tensors.
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 3);
+  const int max_time = input->dims->data[0];
+  const int n_batch = input->dims->data[1];
+  const int n_input = input->dims->data[2];
+
+  const TfLiteTensor* fw_input_to_output_weights =
+      GetInput(context, node, kFwInputToOutputWeightsTensor);
+  const int n_fw_cell = fw_input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, fw_input_to_output_weights->dims->data[1],
+                    n_input);
+
+  const TfLiteTensor* bw_input_to_output_weights =
+      GetInput(context, node, kBwInputToOutputWeightsTensor);
+  const int n_bw_cell = bw_input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->dims->data[1],
+                    n_input);
+  TF_LITE_ENSURE_EQ(context, bw_input_to_output_weights->type,
+                    fw_input_to_output_weights->type);
+
+  const TfLiteTensor* fw_recurrent_to_output_weights =
+      GetInput(context, node, kFwRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->dims->data[0],
+                    n_fw_cell);
+  TF_LITE_ENSURE_EQ(context, fw_recurrent_to_output_weights->type,
+                    fw_input_to_output_weights->type);
+  const int n_fw_output = fw_recurrent_to_output_weights->dims->data[1];
+
+  const TfLiteTensor* bw_recurrent_to_output_weights =
+      GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->dims->data[0],
+                    n_bw_cell);
+  TF_LITE_ENSURE_EQ(context, bw_recurrent_to_output_weights->type,
+                    fw_input_to_output_weights->type);
+  const int n_bw_output = bw_recurrent_to_output_weights->dims->data[1];
+
+  // Check that input tensor dimensions matches with each other.
+  TF_LITE_ENSURE_OK(
+      context, CheckInputTensorDimensions(context, node, n_input, n_fw_output,
+                                          n_fw_cell));
+
+  // Get (optional) auxiliary inputs and weights.
+  const TfLiteTensor* aux_input =
+      GetOptionalInputTensor(context, node, kAuxInputTensor);
+  const TfLiteTensor* fw_aux_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwAuxInputToInputWeightsTensor);
+  const TfLiteTensor* fw_aux_input_to_forget_weights =
+      GetOptionalInputTensor(context, node, kFwAuxInputToForgetWeightsTensor);
+  const TfLiteTensor* fw_aux_input_to_cell_weights =
+      GetOptionalInputTensor(context, node, kFwAuxInputToCellWeightsTensor);
+  const TfLiteTensor* fw_aux_input_to_output_weights =
+      GetOptionalInputTensor(context, node, kFwAuxInputToOutputWeightsTensor);
+  const TfLiteTensor* bw_aux_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwAuxInputToInputWeightsTensor);
+  const TfLiteTensor* bw_aux_input_to_forget_weights =
+      GetOptionalInputTensor(context, node, kBwAuxInputToForgetWeightsTensor);
+  const TfLiteTensor* bw_aux_input_to_cell_weights =
+      GetOptionalInputTensor(context, node, kBwAuxInputToCellWeightsTensor);
+  const TfLiteTensor* bw_aux_input_to_output_weights =
+      GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
+
+  const bool aux_inputs_all_or_none =
+      ((aux_input != nullptr) && (fw_aux_input_to_cell_weights != nullptr) &&
+       (fw_aux_input_to_forget_weights != nullptr) &&
+       (fw_aux_input_to_output_weights != nullptr) &&
+       (bw_aux_input_to_cell_weights != nullptr) &&
+       (bw_aux_input_to_forget_weights != nullptr) &&
+       (bw_aux_input_to_output_weights != nullptr)) ||
+      ((fw_aux_input_to_cell_weights == nullptr) &&
+       (fw_aux_input_to_forget_weights == nullptr) &&
+       (fw_aux_input_to_output_weights == nullptr) &&
+       (bw_aux_input_to_cell_weights == nullptr) &&
+       (bw_aux_input_to_forget_weights == nullptr) &&
+       (bw_aux_input_to_output_weights == nullptr));
+  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
+  const bool has_aux_input = (aux_input != nullptr);
+
+  if (has_aux_input) {
+    // Check that aux_input has the same dimensions (except last) as the input.
+    TF_LITE_ASSERT_EQ(aux_input->dims->data[0], input->dims->data[0]);
+    TF_LITE_ASSERT_EQ(aux_input->dims->data[1], input->dims->data[1]);
+  }
+
+  // Get the pointer to output, activation_state and cell_state buffer tensors.
+  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
+  TfLiteTensor* fw_activation_state =
+      GetVariableInput(context, node, kFwInputActivationStateTensor);
+  TfLiteTensor* fw_cell_state =
+      GetVariableInput(context, node, kFwInputCellStateTensor);
+
+  // Check the shape of input state tensors.
+  // These tensor may be 1D or 2D. It's fine as long as the total size is
+  // correct.
+  TF_LITE_ENSURE_EQ(context, NumElements(fw_activation_state),
+                    n_batch * n_fw_output);
+  TF_LITE_ENSURE_EQ(context, NumElements(fw_cell_state), n_batch * n_fw_cell);
+
+  // Resize the output tensors.
+  TfLiteIntArray* fw_output_size = TfLiteIntArrayCreate(3);
+  fw_output_size->data[0] = max_time;
+  fw_output_size->data[1] = n_batch;
+  fw_output_size->data[2] =
+      params->merge_outputs ? n_bw_output + n_fw_output : n_fw_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, fw_output, fw_output_size));
+
+  // The weights are of consistent type, so it suffices to check one.
+  const bool is_hybrid_op = (fw_input_to_output_weights->type == kTfLiteUInt8);
+
+  TfLiteIntArrayFree(node->temporaries);
+  if (is_hybrid_op) {
+    node->temporaries = TfLiteIntArrayCreate(
+        has_aux_input ? kNumTemporaryTensors : kNumTemporaryTensors - 1);
+  } else {
+    node->temporaries = TfLiteIntArrayCreate(2);  // the two scratch buffers.
+  }
+  // Create a scratch buffer tensor.
+  node->temporaries->data[kFwScratchBuffer] = *scratch_tensor_index;
+  TfLiteTensor* fw_scratch_buffer =
+      GetTemporary(context, node, kFwScratchBuffer);
+  fw_scratch_buffer->type = input->type;
+  fw_scratch_buffer->allocation_type = kTfLiteArenaRw;
+
+  const TfLiteTensor* fw_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwInputToInputWeightsTensor);
+  const bool fw_use_cifg = (fw_input_to_input_weights == nullptr);
+  if (has_aux_input && !fw_use_cifg) {
+    TF_LITE_ENSURE_EQ(context, fw_aux_input_to_input_weights->dims->data[0],
+                      fw_input_to_input_weights->dims->data[0]);
+  }
+  TfLiteIntArray* fw_scratch_buffer_size = TfLiteIntArrayCreate(2);
+  fw_scratch_buffer_size->data[0] = n_batch;
+  if (fw_use_cifg) {
+    // Reserving space for Cell, Forget, Output gates
+    fw_scratch_buffer_size->data[1] = n_fw_cell * 3;
+  } else {
+    // Reserving space for Input, Cell, Forget, Output gates
+    fw_scratch_buffer_size->data[1] = n_fw_cell * 4;
+  }
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, fw_scratch_buffer,
+                                                   fw_scratch_buffer_size));
+  // Same for the backward cell.
+
+  // Check that input tensor dimensions matches with each other.
+  TF_LITE_ENSURE_OK(
+      context, CheckInputTensorDimensions(context, node, n_input, n_bw_output,
+                                          n_bw_cell));
+
+  // Get the pointer to activation_state and cell_state buffer tensors.
+  TfLiteTensor* bw_activation_state =
+      GetVariableInput(context, node, kBwInputActivationStateTensor);
+  TfLiteTensor* bw_cell_state =
+      GetVariableInput(context, node, kBwInputCellStateTensor);
+
+  // Resize the output tensors.
+  if (!params->merge_outputs) {
+    TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
+    TfLiteIntArray* bw_output_size = TfLiteIntArrayCreate(3);
+    bw_output_size->data[0] = max_time;
+    bw_output_size->data[1] = n_batch;
+    bw_output_size->data[2] = n_bw_output;
+    TF_LITE_ENSURE_OK(
+        context, context->ResizeTensor(context, bw_output, bw_output_size));
+  }
+
+  // Check the shape of input state tensors.
+  // These tensor may be 1D or 2D. It's fine as long as the total size is
+  // correct.
+  TF_LITE_ENSURE_EQ(context, NumElements(bw_activation_state),
+                    n_batch * n_bw_output);
+  TF_LITE_ENSURE_EQ(context, NumElements(bw_cell_state), n_batch * n_bw_cell);
+
+  // Create a scratch buffer tensor.
+  node->temporaries->data[kBwScratchBuffer] =
+      *(scratch_tensor_index) + kBwScratchBuffer;
+  TfLiteTensor* bw_scratch_buffer =
+      GetTemporary(context, node, kBwScratchBuffer);
+  bw_scratch_buffer->type = input->type;
+  bw_scratch_buffer->allocation_type = kTfLiteArenaRw;
+
+  const TfLiteTensor* bw_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwInputToInputWeightsTensor);
+  const bool bw_use_cifg = (bw_input_to_input_weights == nullptr);
+  if (has_aux_input && !bw_use_cifg) {
+    TF_LITE_ENSURE_EQ(context, bw_aux_input_to_input_weights->dims->data[0],
+                      bw_input_to_input_weights->dims->data[0]);
+  }
+  TfLiteIntArray* bw_scratch_buffer_size = TfLiteIntArrayCreate(2);
+  bw_scratch_buffer_size->data[0] = n_batch;
+  if (bw_use_cifg) {
+    // Reserving space for Cell, Forget, Output gates
+    bw_scratch_buffer_size->data[1] = n_bw_cell * 3;
+  } else {
+    // Reserving space for Input, Cell, Forget, Output gates
+    bw_scratch_buffer_size->data[1] = n_bw_cell * 4;
+  }
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_scratch_buffer,
+                                                   bw_scratch_buffer_size));
+  if (is_hybrid_op) {
+    // Allocate temporary tensors to store quantized values of input, aux_input
+    // (if present), activation_state and cell_state tensors.
+    node->temporaries->data[kInputQuantized] =
+        *scratch_tensor_index + kInputQuantized;
+    TfLiteTensor* input_quantized =
+        GetTemporary(context, node, kInputQuantized);
+    input_quantized->type = kTfLiteUInt8;
+    input_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
+      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
+                                                       input_quantized_size));
+    }
+
+    node->temporaries->data[kFwActivationStateQuantized] =
+        *scratch_tensor_index + kFwActivationStateQuantized;
+    TfLiteTensor* fw_activation_state_quantized =
+        GetTemporary(context, node, kFwActivationStateQuantized);
+    fw_activation_state_quantized->type = kTfLiteUInt8;
+    fw_activation_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(fw_activation_state_quantized->dims,
+                             fw_activation_state->dims)) {
+      TfLiteIntArray* fw_activation_state_quantized_size =
+          TfLiteIntArrayCopy(fw_activation_state->dims);
+      TF_LITE_ENSURE_OK(
+          context, context->ResizeTensor(context, fw_activation_state_quantized,
+                                         fw_activation_state_quantized_size));
+    }
+    node->temporaries->data[kBwActivationStateQuantized] =
+        *scratch_tensor_index + kBwActivationStateQuantized;
+    TfLiteTensor* bw_activation_state_quantized =
+        GetTemporary(context, node, kBwActivationStateQuantized);
+    bw_activation_state_quantized->type = kTfLiteUInt8;
+    bw_activation_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(bw_activation_state_quantized->dims,
+                             bw_activation_state->dims)) {
+      TfLiteIntArray* bw_activation_state_quantized_size =
+          TfLiteIntArrayCopy(bw_activation_state->dims);
+      TF_LITE_ENSURE_OK(
+          context, context->ResizeTensor(context, bw_activation_state_quantized,
+                                         bw_activation_state_quantized_size));
+    }
+    node->temporaries->data[kFwCellStateQuantized] =
+        *scratch_tensor_index + kFwCellStateQuantized;
+    TfLiteTensor* fw_cell_state_quantized =
+        GetTemporary(context, node, kFwCellStateQuantized);
+    fw_cell_state_quantized->type = kTfLiteUInt8;
+    fw_cell_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(fw_cell_state_quantized->dims,
+                             fw_cell_state->dims)) {
+      TfLiteIntArray* fw_cell_state_quantized_size =
+          TfLiteIntArrayCopy(fw_cell_state->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, fw_cell_state_quantized,
+                                              fw_cell_state_quantized_size));
+    }
+    node->temporaries->data[kBwCellStateQuantized] =
+        *scratch_tensor_index + kBwCellStateQuantized;
+    TfLiteTensor* bw_cell_state_quantized =
+        GetTemporary(context, node, kBwCellStateQuantized);
+    bw_cell_state_quantized->type = kTfLiteUInt8;
+    bw_cell_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(bw_cell_state_quantized->dims,
+                             bw_cell_state->dims)) {
+      TfLiteIntArray* bw_cell_state_quantized_size =
+          TfLiteIntArrayCopy(bw_cell_state->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, bw_cell_state_quantized,
+                                              bw_cell_state_quantized_size));
+    }
+
+    // Allocate temporary tensors to store scaling factors and product scaling
+    // factors. The latter is a convenience storage which allows to quantize
+    // a vector once (which produces the scaling factors) and multiply it with
+    // different matrices (which requires multiplying the scaling factors with
+    // the scaling factor of the matrix).
+    node->temporaries->data[kScalingFactors] =
+        *scratch_tensor_index + kScalingFactors;
+    TfLiteTensor* scaling_factors =
+        GetTemporary(context, node, kScalingFactors);
+    scaling_factors->type = kTfLiteFloat32;
+    scaling_factors->allocation_type = kTfLiteArenaRw;
+    int scaling_dims[1] = {n_batch};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = n_batch;
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
+                                                       scaling_factors_size));
+    }
+    node->temporaries->data[kProductScalingFactors] =
+        *scratch_tensor_index + kProductScalingFactors;
+    TfLiteTensor* prod_scaling_factors =
+        GetTemporary(context, node, kProductScalingFactors);
+    prod_scaling_factors->type = kTfLiteFloat32;
+    prod_scaling_factors->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqualsArray(prod_scaling_factors->dims, 1,
+                                   scaling_dims)) {
+      TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
+      prod_scaling_factors_size->data[0] = n_batch;
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, prod_scaling_factors,
+                                              prod_scaling_factors_size));
+    }
+
+    // Allocate a temporary tensor to store the recovered cell weights. Since
+    // this is used for diagonal matrices, only need to store n_cell values.
+    node->temporaries->data[kRecoveredCellWeights] =
+        *scratch_tensor_index + kRecoveredCellWeights;
+    TfLiteTensor* recovered_cell_weights =
+        GetTemporary(context, node, kRecoveredCellWeights);
+    recovered_cell_weights->type = kTfLiteFloat32;
+    recovered_cell_weights->allocation_type = kTfLiteArenaRw;
+    int recovered_cell_dims[1] = {n_fw_cell};
+    if (!TfLiteIntArrayEqualsArray(recovered_cell_weights->dims, 1,
+                                   recovered_cell_dims)) {
+      TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1);
+      recovered_cell_weights_size->data[0] = n_fw_cell;
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, recovered_cell_weights,
+                                              recovered_cell_weights_size));
+    }
+
+    // Only allocate a temporary tensor for quantized auxiliary input if we are
+    // actually going to use it.
+    if (has_aux_input) {
+      node->temporaries->data[kAuxInputQuantized] =
+          *scratch_tensor_index + kAuxInputQuantized;
+      TfLiteTensor* aux_input_quantized =
+          GetTemporary(context, node, kAuxInputQuantized);
+      aux_input_quantized->type = kTfLiteUInt8;
+      aux_input_quantized->allocation_type = kTfLiteArenaRw;
+      if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) {
+        TfLiteIntArray* aux_input_quantized_size =
+            TfLiteIntArrayCopy(aux_input->dims);
+        TF_LITE_ENSURE_OK(context,
+                          context->ResizeTensor(context, aux_input_quantized,
+                                                aux_input_quantized_size));
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+// The LSTM Op engine.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceLSTMParams*>(
+      node->builtin_data);
+
+  // Input tensor.
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+
+  // Tensors for the forward cell.
+  const TfLiteTensor* fw_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwInputToInputWeightsTensor);
+  const TfLiteTensor* fw_input_to_forget_weights =
+      GetInput(context, node, kFwInputToForgetWeightsTensor);
+  const TfLiteTensor* fw_input_to_cell_weights =
+      GetInput(context, node, kFwInputToCellWeightsTensor);
+  const TfLiteTensor* fw_input_to_output_weights =
+      GetInput(context, node, kFwInputToOutputWeightsTensor);
+
+  const TfLiteTensor* fw_recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwRecurrentToInputWeightsTensor);
+  const TfLiteTensor* fw_recurrent_to_forget_weights =
+      GetInput(context, node, kFwRecurrentToForgetWeightsTensor);
+  const TfLiteTensor* fw_recurrent_to_cell_weights =
+      GetInput(context, node, kFwRecurrentToCellWeightsTensor);
+  const TfLiteTensor* fw_recurrent_to_output_weights =
+      GetInput(context, node, kFwRecurrentToOutputWeightsTensor);
+
+  const TfLiteTensor* fw_cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwCellToInputWeightsTensor);
+  const TfLiteTensor* fw_cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kFwCellToForgetWeightsTensor);
+  const TfLiteTensor* fw_cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kFwCellToOutputWeightsTensor);
+
+  const TfLiteTensor* fw_input_gate_bias =
+      GetOptionalInputTensor(context, node, kFwInputGateBiasTensor);
+  const TfLiteTensor* fw_forget_gate_bias =
+      GetInput(context, node, kFwForgetGateBiasTensor);
+  const TfLiteTensor* fw_cell_bias =
+      GetInput(context, node, kFwCellGateBiasTensor);
+  const TfLiteTensor* fw_output_gate_bias =
+      GetInput(context, node, kFwOutputGateBiasTensor);
+
+  const TfLiteTensor* fw_projection_weights =
+      GetOptionalInputTensor(context, node, kFwProjectionWeightsTensor);
+  const TfLiteTensor* fw_projection_bias =
+      GetOptionalInputTensor(context, node, kFwProjectionBiasTensor);
+
+  TfLiteTensor* fw_activation_state =
+      GetVariableInput(context, node, kFwInputActivationStateTensor);
+  TfLiteTensor* fw_cell_state =
+      GetVariableInput(context, node, kFwInputCellStateTensor);
+  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
+
+  // Tensors for the backward cell.
+  const TfLiteTensor* bw_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwInputToInputWeightsTensor);
+  const TfLiteTensor* bw_input_to_forget_weights =
+      GetInput(context, node, kBwInputToForgetWeightsTensor);
+  const TfLiteTensor* bw_input_to_cell_weights =
+      GetInput(context, node, kBwInputToCellWeightsTensor);
+  const TfLiteTensor* bw_input_to_output_weights =
+      GetInput(context, node, kBwInputToOutputWeightsTensor);
+
+  const TfLiteTensor* bw_recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwRecurrentToInputWeightsTensor);
+  const TfLiteTensor* bw_recurrent_to_forget_weights =
+      GetInput(context, node, kBwRecurrentToForgetWeightsTensor);
+  const TfLiteTensor* bw_recurrent_to_cell_weights =
+      GetInput(context, node, kBwRecurrentToCellWeightsTensor);
+  const TfLiteTensor* bw_recurrent_to_output_weights =
+      GetInput(context, node, kBwRecurrentToOutputWeightsTensor);
+
+  const TfLiteTensor* bw_cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwCellToInputWeightsTensor);
+  const TfLiteTensor* bw_cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kBwCellToForgetWeightsTensor);
+  const TfLiteTensor* bw_cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kBwCellToOutputWeightsTensor);
+
+  const TfLiteTensor* bw_input_gate_bias =
+      GetOptionalInputTensor(context, node, kBwInputGateBiasTensor);
+  const TfLiteTensor* bw_forget_gate_bias =
+      GetInput(context, node, kBwForgetGateBiasTensor);
+  const TfLiteTensor* bw_cell_bias =
+      GetInput(context, node, kBwCellGateBiasTensor);
+  const TfLiteTensor* bw_output_gate_bias =
+      GetInput(context, node, kBwOutputGateBiasTensor);
+
+  const TfLiteTensor* bw_projection_weights =
+      GetOptionalInputTensor(context, node, kBwProjectionWeightsTensor);
+  const TfLiteTensor* bw_projection_bias =
+      GetOptionalInputTensor(context, node, kBwProjectionBiasTensor);
+
+  // State tensors.
+  TfLiteTensor* bw_activation_state =
+      GetVariableInput(context, node, kBwInputActivationStateTensor);
+  TfLiteTensor* bw_cell_state =
+      GetVariableInput(context, node, kBwInputCellStateTensor);
+  TfLiteTensor* bw_output = params->merge_outputs
+                                ? nullptr
+                                : GetOutput(context, node, kBwOutputTensor);
+
+  // Temporary tensors.
+  TfLiteTensor* fw_scratch_buffer =
+      GetTemporary(context, node, kFwScratchBuffer);
+  TfLiteTensor* bw_scratch_buffer =
+      GetTemporary(context, node, kBwScratchBuffer);
+
+  // (Optional) auxiliary inputs.
+  const TfLiteTensor* aux_input =
+      GetOptionalInputTensor(context, node, kAuxInputTensor);
+  const TfLiteTensor* fw_aux_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kFwAuxInputToInputWeightsTensor);
+  const TfLiteTensor* fw_aux_input_to_forget_weights =
+      GetOptionalInputTensor(context, node, kFwAuxInputToForgetWeightsTensor);
+  const TfLiteTensor* fw_aux_input_to_cell_weights =
+      GetOptionalInputTensor(context, node, kFwAuxInputToCellWeightsTensor);
+  const TfLiteTensor* fw_aux_input_to_output_weights =
+      GetOptionalInputTensor(context, node, kFwAuxInputToOutputWeightsTensor);
+  const TfLiteTensor* bw_aux_input_to_input_weights =
+      GetOptionalInputTensor(context, node, kBwAuxInputToInputWeightsTensor);
+  const TfLiteTensor* bw_aux_input_to_forget_weights =
+      GetOptionalInputTensor(context, node, kBwAuxInputToForgetWeightsTensor);
+  const TfLiteTensor* bw_aux_input_to_cell_weights =
+      GetOptionalInputTensor(context, node, kBwAuxInputToCellWeightsTensor);
+  const TfLiteTensor* bw_aux_input_to_output_weights =
+      GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
+
+  // Populate a TfLiteLSTMParams struct for the evaluation functions.
+  TfLiteLSTMParams lstm_params = {params->activation, params->cell_clip,
+                                  params->proj_clip, kTfLiteLSTMFullKernel};
+
+  const int bw_output_offset =
+      params->merge_outputs ? fw_recurrent_to_output_weights->dims->data[1] : 0;
+  const auto actual_bw_output = params->merge_outputs ? fw_output : bw_output;
+
+  // TODO(mirkov): add batch_major support (http://b/117326122).
+  switch (fw_input_to_output_weights->type) {
+    case kTfLiteFloat32: {
+      TfLiteStatus fw_pass_status = lstm_eval::EvalFloat(
+          input, fw_input_to_input_weights, fw_input_to_forget_weights,
+          fw_input_to_cell_weights, fw_input_to_output_weights,
+          fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights,
+          fw_recurrent_to_cell_weights, fw_recurrent_to_output_weights,
+          fw_cell_to_input_weights, fw_cell_to_forget_weights,
+          fw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights,
+          fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
+          fw_aux_input_to_output_weights, fw_input_gate_bias,
+          fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias,
+          fw_projection_weights, fw_projection_bias, &lstm_params,
+          /*forward_sequence=*/true, /*time_major=*/true, /*output_offset=*/0,
+          fw_scratch_buffer, fw_activation_state, fw_cell_state, fw_output);
+      TF_LITE_ENSURE_OK(context, fw_pass_status);
+
+      TfLiteStatus bw_pass_status = lstm_eval::EvalFloat(
+          input, bw_input_to_input_weights, bw_input_to_forget_weights,
+          bw_input_to_cell_weights, bw_input_to_output_weights,
+          bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
+          bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
+          bw_cell_to_input_weights, bw_cell_to_forget_weights,
+          bw_cell_to_output_weights, aux_input, bw_aux_input_to_input_weights,
+          bw_aux_input_to_forget_weights, bw_aux_input_to_cell_weights,
+          bw_aux_input_to_output_weights, bw_input_gate_bias,
+          bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias,
+          bw_projection_weights, bw_projection_bias, &lstm_params,
+          /*forward_sequence=*/false, /*time_major=*/true, bw_output_offset,
+          bw_scratch_buffer, bw_activation_state, bw_cell_state,
+          actual_bw_output);
+      TF_LITE_ENSURE_OK(context, bw_pass_status);
+      return kTfLiteOk;
+    }
+    case kTfLiteUInt8: {
+      TfLiteTensor* input_quantized =
+          GetTemporary(context, node, kInputQuantized);
+      TfLiteTensor* fw_activation_state_quantized =
+          GetTemporary(context, node, kFwActivationStateQuantized);
+      TfLiteTensor* bw_activation_state_quantized =
+          GetTemporary(context, node, kBwActivationStateQuantized);
+      TfLiteTensor* fw_cell_state_quantized =
+          GetTemporary(context, node, kFwCellStateQuantized);
+      TfLiteTensor* bw_cell_state_quantized =
+          GetTemporary(context, node, kBwCellStateQuantized);
+      TfLiteTensor* scaling_factors =
+          GetTemporary(context, node, kScalingFactors);
+      TfLiteTensor* prod_scaling_factors =
+          GetTemporary(context, node, kProductScalingFactors);
+      TfLiteTensor* recovered_cell_weights =
+          GetTemporary(context, node, kRecoveredCellWeights);
+      TfLiteTensor* aux_input_quantized =
+          (aux_input == nullptr)
+              ? nullptr
+              : GetTemporary(context, node, kAuxInputQuantized);
+
+      TfLiteStatus fw_pass_status = lstm_eval::EvalHybrid(
+          input, fw_input_to_input_weights, fw_input_to_forget_weights,
+          fw_input_to_cell_weights, fw_input_to_output_weights,
+          fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights,
+          fw_recurrent_to_cell_weights, fw_recurrent_to_output_weights,
+          fw_cell_to_input_weights, fw_cell_to_forget_weights,
+          fw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights,
+          fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
+          fw_aux_input_to_output_weights, fw_input_gate_bias,
+          fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias,
+          fw_projection_weights, fw_projection_bias, &lstm_params,
+          /*forward_sequence=*/true, /*time_major=*/true, /*output_offset=*/0,
+          fw_scratch_buffer, scaling_factors, prod_scaling_factors,
+          recovered_cell_weights, input_quantized, aux_input_quantized,
+          fw_activation_state_quantized, fw_cell_state_quantized,
+          fw_activation_state, fw_cell_state, fw_output);
+      TF_LITE_ENSURE_OK(context, fw_pass_status);
+
+      TfLiteStatus bw_pass_status = lstm_eval::EvalHybrid(
+          input, bw_input_to_input_weights, bw_input_to_forget_weights,
+          bw_input_to_cell_weights, bw_input_to_output_weights,
+          bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
+          bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
+          bw_cell_to_input_weights, bw_cell_to_forget_weights,
+          bw_cell_to_output_weights, aux_input, bw_aux_input_to_input_weights,
+          bw_aux_input_to_forget_weights, bw_aux_input_to_cell_weights,
+          bw_aux_input_to_output_weights, bw_input_gate_bias,
+          bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias,
+          bw_projection_weights, bw_projection_bias, &lstm_params,
+          /*forward_sequence=*/false, /*time_major=*/true, bw_output_offset,
+          bw_scratch_buffer, scaling_factors, prod_scaling_factors,
+          recovered_cell_weights, input_quantized, aux_input_quantized,
+          bw_activation_state_quantized, bw_cell_state_quantized,
+          bw_activation_state, bw_cell_state, actual_bw_output);
+      TF_LITE_ENSURE_OK(context, bw_pass_status);
+      return kTfLiteOk;
+    }
+    default:
+      context->ReportError(context, "Type %d is not currently supported.",
+                           fw_input_to_output_weights->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace bidirectional_sequence_lstm
+
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_LSTM() {
+  static TfLiteRegistration r = {
+      bidirectional_sequence_lstm::Init, bidirectional_sequence_lstm::Free,
+      bidirectional_sequence_lstm::Prepare, bidirectional_sequence_lstm::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc
rename to tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
index 9cc04907e1e7e710ba5d1e883ca2e784f89184d6..b865322682a6dbe2aa7337af0692830fe79efe23 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
@@ -14,16 +14,18 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite Bidirectional LSTM op.
 
+#include <initializer_list>
 #include <iomanip>
 #include <memory>
 #include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace {
@@ -37,6 +39,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
                            bool use_peephole, bool use_projection_weights,
                            bool use_projection_bias, bool merge_outputs,
                            float cell_clip, float proj_clip,
+                           bool quantize_weights,
                            const std::vector<std::vector<int>>& input_shapes)
       : n_batch_(n_batch),
         n_input_(n_input),
@@ -44,37 +47,40 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
         n_bw_cell_(n_cell),
         n_fw_output_(n_output),
         n_bw_output_(n_output),
-        sequence_length_(sequence_length) {
+        sequence_length_(sequence_length),
+        quantize_weights_(quantize_weights) {
     input_ = AddInput(TensorType_FLOAT32);
+    const auto weight_type =
+        quantize_weights_ ? TensorType_UINT8 : TensorType_FLOAT32;
 
     if (use_cifg) {
       fw_input_to_input_weights_ = AddNullInput();
     } else {
-      fw_input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      fw_input_to_input_weights_ = AddInput(weight_type);
     }
 
-    fw_input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
-    fw_input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
-    fw_input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    fw_input_to_forget_weights_ = AddInput(weight_type);
+    fw_input_to_cell_weights_ = AddInput(weight_type);
+    fw_input_to_output_weights_ = AddInput(weight_type);
 
     if (use_cifg) {
       fw_recurrent_to_input_weights_ = AddNullInput();
     } else {
-      fw_recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      fw_recurrent_to_input_weights_ = AddInput(weight_type);
     }
 
-    fw_recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
-    fw_recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
-    fw_recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    fw_recurrent_to_forget_weights_ = AddInput(weight_type);
+    fw_recurrent_to_cell_weights_ = AddInput(weight_type);
+    fw_recurrent_to_output_weights_ = AddInput(weight_type);
 
     if (use_peephole) {
       if (use_cifg) {
         fw_cell_to_input_weights_ = AddNullInput();
       } else {
-        fw_cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+        fw_cell_to_input_weights_ = AddInput(weight_type);
       }
-      fw_cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
-      fw_cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+      fw_cell_to_forget_weights_ = AddInput(weight_type);
+      fw_cell_to_output_weights_ = AddInput(weight_type);
     } else {
       fw_cell_to_input_weights_ = AddNullInput();
       fw_cell_to_forget_weights_ = AddNullInput();
@@ -105,31 +111,31 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
     if (use_cifg) {
       bw_input_to_input_weights_ = AddNullInput();
     } else {
-      bw_input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      bw_input_to_input_weights_ = AddInput(weight_type);
     }
 
-    bw_input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
-    bw_input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
-    bw_input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    bw_input_to_forget_weights_ = AddInput(weight_type);
+    bw_input_to_cell_weights_ = AddInput(weight_type);
+    bw_input_to_output_weights_ = AddInput(weight_type);
 
     if (use_cifg) {
       bw_recurrent_to_input_weights_ = AddNullInput();
     } else {
-      bw_recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      bw_recurrent_to_input_weights_ = AddInput(weight_type);
     }
 
-    bw_recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
-    bw_recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
-    bw_recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    bw_recurrent_to_forget_weights_ = AddInput(weight_type);
+    bw_recurrent_to_cell_weights_ = AddInput(weight_type);
+    bw_recurrent_to_output_weights_ = AddInput(weight_type);
 
     if (use_peephole) {
       if (use_cifg) {
         bw_cell_to_input_weights_ = AddNullInput();
       } else {
-        bw_cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+        bw_cell_to_input_weights_ = AddInput(weight_type);
       }
-      bw_cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
-      bw_cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+      bw_cell_to_forget_weights_ = AddInput(weight_type);
+      bw_cell_to_output_weights_ = AddInput(weight_type);
     } else {
       bw_cell_to_input_weights_ = AddNullInput();
       bw_cell_to_forget_weights_ = AddNullInput();
@@ -146,7 +152,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
     bw_output_gate_bias_ = AddInput(TensorType_FLOAT32);
 
     if (use_projection_weights) {
-      bw_projection_weights_ = AddInput(TensorType_FLOAT32);
+      bw_projection_weights_ = AddInput(weight_type);
       if (use_projection_bias) {
         bw_projection_bias_ = AddInput(TensorType_FLOAT32);
       } else {
@@ -198,88 +204,96 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
     BuildInterpreter(input_shapes);
   }
 
+  void PopulateWeightTensor(int tensor_id, const std::vector<float>& f) {
+    if (quantize_weights_) {
+      SymmetricQuantizeAndPopulate(tensor_id, f);
+    } else {
+      PopulateTensor(tensor_id, f);
+    }
+  }
+
   // Set weights in forward and backward cells to be the same.
-  void SetInputToInputWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_input_to_input_weights_, f);
-    PopulateTensor(bw_input_to_input_weights_, f);
+  void SetInputToInputWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_input_to_input_weights_, f);
+    PopulateWeightTensor(bw_input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_input_to_forget_weights_, f);
-    PopulateTensor(bw_input_to_forget_weights_, f);
+  void SetInputToForgetWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_input_to_forget_weights_, f);
+    PopulateWeightTensor(bw_input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_input_to_cell_weights_, f);
-    PopulateTensor(bw_input_to_cell_weights_, f);
+  void SetInputToCellWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_input_to_cell_weights_, f);
+    PopulateWeightTensor(bw_input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_input_to_output_weights_, f);
-    PopulateTensor(bw_input_to_output_weights_, f);
+  void SetInputToOutputWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_input_to_output_weights_, f);
+    PopulateWeightTensor(bw_input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_recurrent_to_input_weights_, f);
-    PopulateTensor(bw_recurrent_to_input_weights_, f);
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_recurrent_to_input_weights_, f);
+    PopulateWeightTensor(bw_recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_recurrent_to_forget_weights_, f);
-    PopulateTensor(bw_recurrent_to_forget_weights_, f);
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_recurrent_to_forget_weights_, f);
+    PopulateWeightTensor(bw_recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_recurrent_to_cell_weights_, f);
-    PopulateTensor(bw_recurrent_to_cell_weights_, f);
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_recurrent_to_cell_weights_, f);
+    PopulateWeightTensor(bw_recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_recurrent_to_output_weights_, f);
-    PopulateTensor(bw_recurrent_to_output_weights_, f);
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_recurrent_to_output_weights_, f);
+    PopulateWeightTensor(bw_recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_cell_to_input_weights_, f);
-    PopulateTensor(bw_cell_to_input_weights_, f);
+  void SetCellToInputWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_cell_to_input_weights_, f);
+    PopulateWeightTensor(bw_cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_cell_to_forget_weights_, f);
-    PopulateTensor(bw_cell_to_forget_weights_, f);
+  void SetCellToForgetWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_cell_to_forget_weights_, f);
+    PopulateWeightTensor(bw_cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_cell_to_output_weights_, f);
-    PopulateTensor(bw_cell_to_output_weights_, f);
+  void SetCellToOutputWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_cell_to_output_weights_, f);
+    PopulateWeightTensor(bw_cell_to_output_weights_, f);
   }
 
-  void SetInputGateBias(std::initializer_list<float> f) {
+  void SetInputGateBias(const std::vector<float>& f) {
     PopulateTensor(fw_input_gate_bias_, f);
     PopulateTensor(bw_input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::initializer_list<float> f) {
+  void SetForgetGateBias(const std::vector<float>& f) {
     PopulateTensor(fw_forget_gate_bias_, f);
     PopulateTensor(bw_forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::initializer_list<float> f) {
+  void SetCellBias(const std::vector<float>& f) {
     PopulateTensor(fw_cell_bias_, f);
     PopulateTensor(bw_cell_bias_, f);
   }
 
-  void SetOutputGateBias(std::initializer_list<float> f) {
+  void SetOutputGateBias(const std::vector<float>& f) {
     PopulateTensor(fw_output_gate_bias_, f);
     PopulateTensor(bw_output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::initializer_list<float> f) {
-    PopulateTensor(fw_projection_weights_, f);
-    PopulateTensor(bw_projection_weights_, f);
+  void SetProjectionWeights(const std::vector<float>& f) {
+    PopulateWeightTensor(fw_projection_weights_, f);
+    PopulateWeightTensor(bw_projection_weights_, f);
   }
 
-  void SetProjectionBias(std::initializer_list<float> f) {
+  void SetProjectionBias(const std::vector<float>& f) {
     PopulateTensor(fw_projection_bias_, f);
     PopulateTensor(bw_projection_bias_, f);
   }
@@ -370,21 +384,30 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
   int n_fw_output_;
   int n_bw_output_;
   int sequence_length_;
+
+  bool quantize_weights_;
 };
 
-TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
+// Declare LSTMOpTest as a parameterized test, where the parameter is a boolean
+// indicating whether to use quantization or not.
+class LSTMOpTest : public ::testing::TestWithParam<bool> {};
+
+INSTANTIATE_TEST_CASE_P(QuantizationOrNot, LSTMOpTest, ::testing::Bool());
+
+TEST_P(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
   const int n_cell = 4;
   const int n_output = 4;
   const int sequence_length = 3;
+  const bool quantize_weights = GetParam();
 
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
       /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0,
+      /*proj_clip=*/0.0, quantize_weights,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -502,9 +525,8 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
       -0.03716109, 0.12507336, 0.41193449, -0.20860538,
       -0.15053082, 0.09120187, 0.24278517, -0.12222792};
   static float lstm_bw_golden_output[] = {
-      -0.0806187, 0.139077, 0.400476, -0.197842,
-      -0.0332076, 0.123838, 0.309777, -0.17621,
-      -0.0490733, 0.0739237, 0.067706, -0.0208124};
+      -0.0806187, 0.139077, 0.400476,   -0.197842, -0.0332076, 0.123838,
+      0.309777,   -0.17621, -0.0490733, 0.0739237, 0.067706,   -0.0208124};
 
   float* batch0_start = lstm_input;
   float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
@@ -519,7 +541,8 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
   std::vector<float> fw_expected;
   fw_expected.insert(fw_expected.end(), fw_golden_start, fw_golden_end);
   EXPECT_THAT(lstm.GetFwOutput(),
-              ElementsAreArray(ArrayFloatNear(fw_expected)));
+              ElementsAreArray(
+                  ArrayFloatNear(fw_expected, quantize_weights ? 1e-2 : 1e-5)));
 
   float* bw_golden_start = lstm_bw_golden_output;
   float* bw_golden_end =
@@ -527,23 +550,26 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
   std::vector<float> bw_expected;
   bw_expected.insert(bw_expected.end(), bw_golden_start, bw_golden_end);
   EXPECT_THAT(lstm.GetBwOutput(),
-              ElementsAreArray(ArrayFloatNear(bw_expected)));
+              ElementsAreArray(
+                  ArrayFloatNear(bw_expected, quantize_weights ? 1e-2 : 1e-5)));
 }
 
-// Same as the previous test, yet with a single merged output tensor.
-TEST(LSTMOpTest, BlackBoxTestMergedOutput) {
-  const int n_batch = 1;
+// Same as the previous test, yet with a single merged output tensor and n_batch
+// of 2.
+TEST_P(LSTMOpTest, BlackBoxTestMergedOutput) {
+  const int n_batch = 2;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
   const int n_cell = 4;
   const int n_output = 4;
   const int sequence_length = 3;
+  const bool quantize_weights = GetParam();
 
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
       /*use_projection_bias=*/false, /*merge_outputs=*/true, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0,
+      /*proj_clip=*/0.0, quantize_weights,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -655,24 +681,29 @@ TEST(LSTMOpTest, BlackBoxTestMergedOutput) {
        -0.51818722, -0.15390486, 0.0468148, 0.39922136});
 
   // Input should have n_input * sequence_length many values.
-  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_input[] = {2., 3., 2., 3., 3., 4., 3., 4., 1., 1., 1., 1.};
   static float lstm_fw_golden_output[] = {
-      -0.02973187, 0.1229473,  0.20885126, -0.15358765,
-      -0.03716109, 0.12507336, 0.41193449, -0.20860538,
-      -0.15053082, 0.09120187, 0.24278517, -0.12222792};
+      -0.02973187, 0.1229473,   0.20885126,  -0.15358765, -0.02973187,
+      0.1229473,   0.20885126,  -0.15358765, -0.03716109, 0.12507336,
+      0.41193449,  -0.20860538, -0.03716109, 0.12507336,  0.41193449,
+      -0.20860538, -0.15053082, 0.09120187,  0.24278517,  -0.12222792,
+      -0.15053082, 0.09120187,  0.24278517,  -0.12222792};
   static float lstm_bw_golden_output[] = {
-      -0.0806187, 0.139077, 0.400476,   -0.197842, -0.0332076, 0.123838,
-      0.309777,   -0.17621, -0.0490733, 0.0739237, 0.067706,   -0.0208124};
+      -0.0806187, 0.139077,   0.400476,   -0.197842, -0.0806187, 0.139077,
+      0.400476,   -0.197842,  -0.0332076, 0.123838,  0.309777,   -0.17621,
+      -0.0332076, 0.123838,   0.309777,   -0.17621,  -0.0490733, 0.0739237,
+      0.067706,   -0.0208124, -0.0490733, 0.0739237, 0.067706,   -0.0208124};
 
   float* batch0_start = lstm_input;
-  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.num_batches() *
+                                         lstm.sequence_length();
 
   lstm.SetInput(0, batch0_start, batch0_end);
 
   lstm.Invoke();
 
   std::vector<float> merged_expected;
-  for (int k = 0; k < lstm.sequence_length(); k++) {
+  for (int k = 0; k < lstm.sequence_length() * lstm.num_batches(); k++) {
     merged_expected.insert(
         merged_expected.end(),
         lstm_fw_golden_output + k * lstm.num_fw_outputs(),
@@ -683,7 +714,8 @@ TEST(LSTMOpTest, BlackBoxTestMergedOutput) {
         lstm_bw_golden_output + (k + 1) * lstm.num_bw_outputs());
   }
   EXPECT_THAT(lstm.GetFwOutput(),
-              ElementsAreArray(ArrayFloatNear(merged_expected)));
+              ElementsAreArray(ArrayFloatNear(merged_expected,
+                                              quantize_weights ? 1e-2 : 1e-5)));
 }
 
 TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
@@ -698,7 +730,7 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
       /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0,
+      /*proj_clip=*/0.0, /*quantize_weights=*/false,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -858,7 +890,7 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
       /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0,
+      /*proj_clip=*/0.0, /*quantize_weights=*/false,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -1008,7 +1040,7 @@ TEST(LSTMOpTest,
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
       /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0,
+      /*proj_clip=*/0.0, /*quantize_weights=*/false,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -1158,7 +1190,7 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/true, /*use_projection_weights=*/true,
       /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0,
+      /*proj_clip=*/0.0, /*quantize_weights=*/false,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -1779,31 +1811,28 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
        0.0412031,    0.0118723,   0.0239643,   0.0394009}};
 
   static float lstm_combined_golden_output[][64] = {
-    {
-      -0.022014,  0.073544, -0.002235,  0.040068, -0.037136, -0.052788,
-      0.075325, -0.029378,  0.024298, -0.07733 , -0.030674, -0.060229,
-      0.040599,  0.011608,  0.042005,  0.045977, -0.039225,  0.076294,
-      0.000735,  0.032852, -0.069869, -0.053312,  0.073527, -0.028136,
-      0.021585, -0.102679, -0.004327, -0.043304,  0.072861,  0.027077,
-      0.034558,  0.068292, -0.036292,  0.069832, -0.003032,  0.053829,
-      -0.043821, -0.072713,  0.085029, -0.040374,  0.020014, -0.104521,
-      -0.034504, -0.059759,  0.062569,  0.025652,  0.049306,  0.061189,
-      -0.025146,  0.079643, -0.005188,  0.033080, -0.048079, -0.048082,
-      0.069369, -0.028900,  0.024572, -0.077547, -0.022517, -0.054477,
-      0.038857,  0.013336,  0.043234,  0.044788},
-    {
-      -0.039186,  0.070792, -0.005913,  0.02642,  -0.068274, -0.05022,
-      0.061444, -0.031241,  0.014996, -0.094544, -0.004146, -0.03464,
-      0.058981,  0.026097,  0.039781,  0.058408, -0.031887,  0.069252,
-      0.00576,   0.054062, -0.042801, -0.059974,  0.085272, -0.034453,
-      0.026097, -0.0959,   -0.031164, -0.058699,  0.06839,   0.020512,
-      0.044727,  0.063609, -0.039863,  0.084819, -0.003909,  0.028666,
-      -0.075677, -0.045125,  0.070379, -0.033895,  0.022111, -0.097184,
-      -0.004921, -0.040851,  0.062316,  0.017435,  0.041437,  0.064568,
-      -0.039656,  0.060726, -0.003402,  0.036854, -0.056503, -0.058554,
-      0.068588, -0.034879,  0.01352,  -0.09962,  -0.01434,  -0.039505,
-      0.065133,  0.024321,  0.038473,  0.062438
-    }};
+      {-0.022014, 0.073544,  -0.002235, 0.040068,  -0.037136, -0.052788,
+       0.075325,  -0.029378, 0.024298,  -0.07733,  -0.030674, -0.060229,
+       0.040599,  0.011608,  0.042005,  0.045977,  -0.039225, 0.076294,
+       0.000735,  0.032852,  -0.069869, -0.053312, 0.073527,  -0.028136,
+       0.021585,  -0.102679, -0.004327, -0.043304, 0.072861,  0.027077,
+       0.034558,  0.068292,  -0.036292, 0.069832,  -0.003032, 0.053829,
+       -0.043821, -0.072713, 0.085029,  -0.040374, 0.020014,  -0.104521,
+       -0.034504, -0.059759, 0.062569,  0.025652,  0.049306,  0.061189,
+       -0.025146, 0.079643,  -0.005188, 0.033080,  -0.048079, -0.048082,
+       0.069369,  -0.028900, 0.024572,  -0.077547, -0.022517, -0.054477,
+       0.038857,  0.013336,  0.043234,  0.044788},
+      {-0.039186, 0.070792,  -0.005913, 0.02642,   -0.068274, -0.05022,
+       0.061444,  -0.031241, 0.014996,  -0.094544, -0.004146, -0.03464,
+       0.058981,  0.026097,  0.039781,  0.058408,  -0.031887, 0.069252,
+       0.00576,   0.054062,  -0.042801, -0.059974, 0.085272,  -0.034453,
+       0.026097,  -0.0959,   -0.031164, -0.058699, 0.06839,   0.020512,
+       0.044727,  0.063609,  -0.039863, 0.084819,  -0.003909, 0.028666,
+       -0.075677, -0.045125, 0.070379,  -0.033895, 0.022111,  -0.097184,
+       -0.004921, -0.040851, 0.062316,  0.017435,  0.041437,  0.064568,
+       -0.039656, 0.060726,  -0.003402, 0.036854,  -0.056503, -0.058554,
+       0.068588,  -0.034879, 0.01352,   -0.09962,  -0.01434,  -0.039505,
+       0.065133,  0.024321,  0.038473,  0.062438}};
 
   for (int i = 0; i < lstm.sequence_length(); i++) {
     float* batch0_start = lstm_input[0] + i * lstm.num_inputs();
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5194c2463092eedd41f634dda8b8db201b03e699
--- /dev/null
+++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
@@ -0,0 +1,651 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace bidirectional_sequence_rnn {
+
+constexpr int kInputTensor = 0;
+// Forward and backward cell tensors.
+constexpr int kFwWeightsTensor = 1;
+constexpr int kFwRecurrentWeightsTensor = 2;
+constexpr int kFwBiasTensor = 3;
+constexpr int kFwHiddenStateTensor = 4;
+constexpr int kBwWeightsTensor = 5;
+constexpr int kBwRecurrentWeightsTensor = 6;
+constexpr int kBwBiasTensor = 7;
+constexpr int kBwHiddenStateTensor = 8;
+// Auxiliary inputs.
+constexpr int kAuxInputTensor = 9;       // Optional.
+constexpr int kFwAuxWeightsTensor = 10;  // Optional.
+constexpr int kBwAuxWeightsTensor = 11;  // Optional.
+// Output tensors.
+constexpr int kFwOutputTensor = 0;
+constexpr int kBwOutputTensor = 1;  // Only if merge_outputs is false.
+
+// Temporary tensors.
+enum TemporaryTensor {
+  kInputQuantized = 0,
+  kFwHiddenStateQuantized = 1,
+  kBwHiddenStateQuantized = 2,
+  kScalingFactors = 3,
+  kAuxInputQuantized = 4,
+  kNumTemporaryTensors = 5
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* scratch_tensor_index = new int;
+  context->AddTensors(context, kNumTemporaryTensors, scratch_tensor_index);
+  return scratch_tensor_index;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<int*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceRNNParams*>(
+      node->builtin_data);
+
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 12);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size,
+                    params->merge_outputs ? 1 : 2);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* fw_input_weights =
+      GetInput(context, node, kFwWeightsTensor);
+  const TfLiteTensor* fw_recurrent_weights =
+      GetInput(context, node, kFwRecurrentWeightsTensor);
+  const TfLiteTensor* fw_bias = GetInput(context, node, kFwBiasTensor);
+  const TfLiteTensor* fw_hidden_state =
+      GetInput(context, node, kFwHiddenStateTensor);
+  const TfLiteTensor* bw_input_weights =
+      GetInput(context, node, kBwWeightsTensor);
+  const TfLiteTensor* bw_recurrent_weights =
+      GetInput(context, node, kBwRecurrentWeightsTensor);
+  const TfLiteTensor* bw_bias = GetInput(context, node, kBwBiasTensor);
+  const TfLiteTensor* bw_hidden_state =
+      GetInput(context, node, kBwHiddenStateTensor);
+
+  const TfLiteTensor* aux_input =
+      GetOptionalInputTensor(context, node, kAuxInputTensor);
+  const TfLiteTensor* fw_aux_input_weights =
+      GetOptionalInputTensor(context, node, kFwAuxWeightsTensor);
+  const TfLiteTensor* bw_aux_input_weights =
+      GetOptionalInputTensor(context, node, kBwAuxWeightsTensor);
+
+  const bool aux_inputs_all_or_none =
+      ((aux_input != nullptr) && (fw_aux_input_weights != nullptr) &&
+       (bw_aux_input_weights != nullptr)) ||
+      ((aux_input == nullptr) && (fw_aux_input_weights == nullptr) &&
+       (bw_aux_input_weights == nullptr));
+  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
+  const bool has_aux_input = (aux_input != nullptr);
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 3);
+  const bool time_major = params->time_major;
+  const int batch_size =
+      (time_major) ? input->dims->data[1] : input->dims->data[0];
+  const int max_time =
+      (time_major) ? input->dims->data[0] : input->dims->data[1];
+  const int fw_num_units = fw_input_weights->dims->data[0];
+  const int bw_num_units = bw_input_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, input->dims->data[2],
+                    fw_input_weights->dims->data[1]);
+  TF_LITE_ENSURE_EQ(context, input->dims->data[2],
+                    bw_input_weights->dims->data[1]);
+  TF_LITE_ENSURE_EQ(context, fw_input_weights->dims->data[0],
+                    fw_bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, bw_input_weights->dims->data[0],
+                    bw_bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, fw_recurrent_weights->dims->data[0],
+                    fw_bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, bw_recurrent_weights->dims->data[1],
+                    bw_bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(fw_hidden_state), 2);
+  TF_LITE_ENSURE_EQ(context, fw_hidden_state->dims->data[0], batch_size);
+  TF_LITE_ENSURE_EQ(context, fw_hidden_state->dims->data[1], fw_num_units);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(bw_hidden_state), 2);
+  TF_LITE_ENSURE_EQ(context, bw_hidden_state->dims->data[0], batch_size);
+  TF_LITE_ENSURE_EQ(context, bw_hidden_state->dims->data[1], bw_num_units);
+
+  if (has_aux_input) {
+    // Check that aux_input has the same dimensions (except last) as the input.
+    TF_LITE_ASSERT_EQ(aux_input->dims->data[0], input->dims->data[0]);
+    TF_LITE_ASSERT_EQ(aux_input->dims->data[1], input->dims->data[1]);
+    // Check that aux_input_weights has the same dimensions (except last) as
+    // the input_weights.
+    TF_LITE_ASSERT_EQ(fw_aux_input_weights->dims->data[0], fw_num_units);
+    TF_LITE_ASSERT_EQ(bw_aux_input_weights->dims->data[0], bw_num_units);
+    TF_LITE_ASSERT_EQ(aux_input->dims->data[2],
+                      fw_aux_input_weights->dims->data[1]);
+    TF_LITE_ASSERT_EQ(aux_input->dims->data[2],
+                      bw_aux_input_weights->dims->data[1]);
+  }
+
+  const bool is_hybrid_op =
+      (fw_input_weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32);
+
+  if (is_hybrid_op) {
+    int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+
+    TfLiteIntArrayFree(node->temporaries);
+    if (has_aux_input) {
+      node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
+    } else {
+      // No need to create a temporary tensor for the non-existent aux_input.
+      node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors - 1);
+    }
+
+    node->temporaries->data[kInputQuantized] =
+        *scratch_tensor_index + kInputQuantized;
+    TfLiteTensor* input_quantized =
+        GetTemporary(context, node, kInputQuantized);
+    input_quantized->type = kTfLiteUInt8;
+    input_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
+      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
+                                                       input_quantized_size));
+    }
+
+    node->temporaries->data[kFwHiddenStateQuantized] =
+        *scratch_tensor_index + kFwHiddenStateQuantized;
+    TfLiteTensor* fw_hidden_state_quantized =
+        GetTemporary(context, node, kFwHiddenStateQuantized);
+    fw_hidden_state_quantized->type = kTfLiteUInt8;
+    fw_hidden_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(fw_hidden_state_quantized->dims,
+                             fw_hidden_state->dims)) {
+      TfLiteIntArray* fw_hidden_state_quantized_size =
+          TfLiteIntArrayCopy(fw_hidden_state->dims);
+      TF_LITE_ENSURE_OK(
+          context, context->ResizeTensor(context, fw_hidden_state_quantized,
+                                         fw_hidden_state_quantized_size));
+    }
+
+    node->temporaries->data[kBwHiddenStateQuantized] =
+        *scratch_tensor_index + kBwHiddenStateQuantized;
+    TfLiteTensor* bw_hidden_state_quantized =
+        GetTemporary(context, node, kBwHiddenStateQuantized);
+    bw_hidden_state_quantized->type = kTfLiteUInt8;
+    bw_hidden_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(bw_hidden_state_quantized->dims,
+                             bw_hidden_state->dims)) {
+      TfLiteIntArray* bw_hidden_state_quantized_size =
+          TfLiteIntArrayCopy(bw_hidden_state->dims);
+      TF_LITE_ENSURE_OK(
+          context, context->ResizeTensor(context, bw_hidden_state_quantized,
+                                         bw_hidden_state_quantized_size));
+    }
+
+    // Allocate temporary tensors to store scaling factors of quantization.
+    node->temporaries->data[kScalingFactors] =
+        *scratch_tensor_index + kScalingFactors;
+    TfLiteTensor* scaling_factors =
+        GetTemporary(context, node, kScalingFactors);
+    scaling_factors->type = kTfLiteFloat32;
+    scaling_factors->allocation_type = kTfLiteArenaRw;
+    int scaling_dims[1] = {batch_size};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = batch_size;
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
+                                                       scaling_factors_size));
+    }
+
+    if (has_aux_input) {
+      node->temporaries->data[kAuxInputQuantized] =
+          *scratch_tensor_index + kAuxInputQuantized;
+      TfLiteTensor* aux_input_quantized =
+          GetTemporary(context, node, kAuxInputQuantized);
+      aux_input_quantized->type = kTfLiteUInt8;
+      aux_input_quantized->allocation_type = kTfLiteArenaRw;
+      if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) {
+        TfLiteIntArray* aux_input_quantized_size =
+            TfLiteIntArrayCopy(aux_input->dims);
+        TF_LITE_ENSURE_OK(context,
+                          context->ResizeTensor(context, aux_input_quantized,
+                                                aux_input_quantized_size));
+      }
+    }
+  }
+
+  // Resize outputs.
+  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
+  TfLiteIntArray* fw_output_size_array = TfLiteIntArrayCreate(3);
+  fw_output_size_array->data[0] = (time_major) ? max_time : batch_size;
+  fw_output_size_array->data[1] = (time_major) ? batch_size : max_time;
+  fw_output_size_array->data[2] =
+      params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, fw_output, fw_output_size_array));
+  if (!params->merge_outputs) {
+    TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
+    TfLiteIntArray* bw_output_size_array = TfLiteIntArrayCreate(3);
+    bw_output_size_array->data[0] = batch_size;
+    bw_output_size_array->data[1] = max_time;
+    bw_output_size_array->data[2] = bw_num_units;
+    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, bw_output,
+                                                     bw_output_size_array));
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalFloat(
+    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
+    const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
+    const TfLiteTensor* bw_input_weights,
+    const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
+    const TfLiteTensor* aux_input, const TfLiteTensor* fw_aux_input_weights,
+    const TfLiteTensor* bw_aux_input_weights,
+    const TfLiteBidirectionalSequenceRNNParams* params,
+    TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
+    TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
+  const bool time_major = params->time_major;
+  const int batch_size =
+      (time_major) ? input->dims->data[1] : input->dims->data[0];
+  const int max_time =
+      (time_major) ? input->dims->data[0] : input->dims->data[1];
+  const int input_size = input->dims->data[2];
+  const int aux_input_size = (aux_input) ? aux_input->dims->data[2] : 0;
+
+  const int fw_num_units = fw_input_weights->dims->data[0];
+  const float* fw_bias_ptr = fw_bias->data.f;
+  const float* fw_input_weights_ptr = fw_input_weights->data.f;
+  const float* fw_recurrent_weights_ptr = fw_recurrent_weights->data.f;
+
+  const int bw_num_units = bw_input_weights->dims->data[0];
+  const float* bw_bias_ptr = bw_bias->data.f;
+  const float* bw_input_weights_ptr = bw_input_weights->data.f;
+  const float* bw_recurrent_weights_ptr = bw_recurrent_weights->data.f;
+
+  const float* fw_aux_input_weights_ptr = (fw_aux_input_weights != nullptr)
+                                              ? fw_aux_input_weights->data.f
+                                              : nullptr;
+  const float* bw_aux_input_weights_ptr = (bw_aux_input_weights != nullptr)
+                                              ? bw_aux_input_weights->data.f
+                                              : nullptr;
+
+  const int fw_output_step =
+      params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units;
+  const int bw_output_step =
+      params->merge_outputs ? fw_num_units + bw_num_units : bw_num_units;
+  if (time_major) {
+    // Forward cell.
+    float* fw_hidden_state_ptr_batch = fw_hidden_state->data.f;
+    for (int s = 0; s < max_time; s++) {
+      const float* input_ptr_batch =
+          input->data.f + s * input_size * batch_size;
+      const float* aux_input_ptr_batch =
+          (aux_input != nullptr)
+              ? aux_input->data.f + s * input_size * batch_size
+              : nullptr;
+      float* output_ptr_batch =
+          fw_output->data.f + s * fw_output_step * batch_size;
+
+      kernel_utils::RnnBatchStep(
+          input_ptr_batch, fw_input_weights_ptr, aux_input_ptr_batch,
+          fw_aux_input_weights_ptr, fw_recurrent_weights_ptr, fw_bias_ptr,
+          input_size, aux_input_size, fw_num_units, batch_size, fw_output_step,
+          params->activation, fw_hidden_state_ptr_batch, output_ptr_batch);
+    }
+    // Backward cell.
+    float* bw_hidden_state_ptr_batch = bw_hidden_state->data.f;
+    for (int s = max_time - 1; s >= 0; s--) {
+      const float* input_ptr_batch =
+          input->data.f + s * input_size * batch_size;
+      const float* aux_input_ptr_batch =
+          (aux_input != nullptr)
+              ? aux_input->data.f + s * input_size * batch_size
+              : nullptr;
+      float* output_ptr_batch =
+          (params->merge_outputs ? fw_output->data.f + fw_num_units
+                                 : bw_output->data.f) +
+          s * bw_output_step * batch_size;
+
+      kernel_utils::RnnBatchStep(
+          input_ptr_batch, bw_input_weights_ptr, aux_input_ptr_batch,
+          bw_aux_input_weights_ptr, bw_recurrent_weights_ptr, bw_bias_ptr,
+          input_size, aux_input_size, bw_num_units, batch_size, bw_output_step,
+          params->activation, bw_hidden_state_ptr_batch, output_ptr_batch);
+    }
+  } else {
+    for (int b = 0; b < batch_size; b++) {
+      // Forward cell.
+      float* fw_hidden_state_ptr_batch =
+          fw_hidden_state->data.f + b * fw_num_units;
+      float* fw_output_offset =
+          fw_output->data.f + b * fw_output_step * max_time;
+      for (int s = 0; s < max_time; s++) {
+        const float* input_ptr_batch =
+            input->data.f + b * input_size * max_time + s * input_size;
+        const float* aux_input_ptr_batch =
+            (aux_input != nullptr)
+                ? aux_input->data.f + b * input_size * max_time + s * input_size
+                : nullptr;
+        float* output_ptr_batch = fw_output_offset + s * fw_output_step;
+
+        kernel_utils::RnnBatchStep(
+            input_ptr_batch, fw_input_weights_ptr, aux_input_ptr_batch,
+            fw_aux_input_weights_ptr, fw_recurrent_weights_ptr, fw_bias_ptr,
+            input_size, aux_input_size, fw_num_units, /*batch_size=*/1,
+            fw_output_step, params->activation, fw_hidden_state_ptr_batch,
+            output_ptr_batch);
+      }
+      // Backward cell.
+      float* bw_hidden_state_ptr_batch =
+          bw_hidden_state->data.f + b * bw_num_units;
+      float* bw_output_offset =
+          params->merge_outputs
+              ? fw_output->data.f + b * bw_output_step * max_time + fw_num_units
+              : bw_output->data.f + b * bw_output_step * max_time;
+      for (int s = max_time - 1; s >= 0; s--) {
+        const float* input_ptr_batch =
+            input->data.f + b * input_size * max_time + s * input_size;
+        const float* aux_input_ptr_batch =
+            (aux_input != nullptr)
+                ? aux_input->data.f + b * input_size * max_time + s * input_size
+                : nullptr;
+        float* output_ptr_batch = bw_output_offset + s * bw_output_step;
+
+        kernel_utils::RnnBatchStep(
+            input_ptr_batch, bw_input_weights_ptr, aux_input_ptr_batch,
+            bw_aux_input_weights_ptr, bw_recurrent_weights_ptr, bw_bias_ptr,
+            input_size, aux_input_size, bw_num_units, /*batch_size=*/1,
+            bw_output_step, params->activation, bw_hidden_state_ptr_batch,
+            output_ptr_batch);
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalHybrid(
+    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
+    const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
+    const TfLiteTensor* bw_input_weights,
+    const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
+    const TfLiteTensor* aux_input, const TfLiteTensor* aux_fw_input_weights,
+    const TfLiteTensor* aux_bw_input_weights,
+    const TfLiteBidirectionalSequenceRNNParams* params,
+    TfLiteTensor* scaling_factors, TfLiteTensor* input_quantized,
+    TfLiteTensor* aux_input_quantized, TfLiteTensor* fw_hidden_state_quantized,
+    TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
+    TfLiteTensor* bw_hidden_state_quantized, TfLiteTensor* bw_hidden_state,
+    TfLiteTensor* bw_output) {
+  const bool time_major = params->time_major;
+  const int batch_size =
+      (time_major) ? input->dims->data[1] : input->dims->data[0];
+  const int max_time =
+      (time_major) ? input->dims->data[0] : input->dims->data[1];
+  const int input_size = input->dims->data[2];
+  const int aux_input_size = (aux_input) ? aux_input->dims->data[2] : 0;
+
+  const int fw_num_units = fw_input_weights->dims->data[0];
+  const float* fw_bias_ptr = fw_bias->data.f;
+  const int8_t* fw_input_weights_ptr =
+      reinterpret_cast<const int8_t*>(fw_input_weights->data.uint8);
+  float fw_input_weights_scale = fw_input_weights->params.scale;
+  const int8_t* fw_recurrent_weights_ptr =
+      reinterpret_cast<const int8_t*>(fw_recurrent_weights->data.uint8);
+  float fw_recurrent_weights_scale = fw_recurrent_weights->params.scale;
+
+  const int bw_num_units = bw_input_weights->dims->data[0];
+  const float* bw_bias_ptr = bw_bias->data.f;
+  const int8_t* bw_input_weights_ptr =
+      reinterpret_cast<const int8_t*>(bw_input_weights->data.uint8);
+  float bw_input_weights_scale = bw_input_weights->params.scale;
+  const int8_t* bw_recurrent_weights_ptr =
+      reinterpret_cast<const int8_t*>(bw_recurrent_weights->data.uint8);
+  float bw_recurrent_weights_scale = bw_recurrent_weights->params.scale;
+
+  // Set the auxiliary pointers and scales if needed.
+  int8_t* aux_fw_input_weights_ptr = nullptr;
+  float aux_fw_input_weights_scale = 0.0f;
+  int8_t* aux_bw_input_weights_ptr = nullptr;
+  float aux_bw_input_weights_scale = 0.0f;
+  int8_t* aux_quantized_input_ptr = nullptr;
+  if (aux_input_size > 0) {
+    aux_fw_input_weights_ptr =
+        reinterpret_cast<int8_t*>(aux_fw_input_weights->data.uint8);
+    aux_fw_input_weights_scale = aux_fw_input_weights->params.scale;
+    aux_bw_input_weights_ptr =
+        reinterpret_cast<int8_t*>(aux_bw_input_weights->data.uint8);
+    aux_bw_input_weights_scale = aux_bw_input_weights->params.scale;
+    aux_quantized_input_ptr = reinterpret_cast<int8_t*>(aux_input_quantized);
+  }
+
+  // Initialize temporary storage for quantized values.
+  int8_t* quantized_input_ptr =
+      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+  int8_t* fw_quantized_hidden_state_ptr =
+      reinterpret_cast<int8_t*>(fw_hidden_state_quantized->data.uint8);
+  int8_t* bw_quantized_hidden_state_ptr =
+      reinterpret_cast<int8_t*>(bw_hidden_state_quantized->data.uint8);
+  float* scaling_factors_ptr = scaling_factors->data.f;
+
+  const int fw_output_step =
+      params->merge_outputs ? fw_num_units + bw_num_units : fw_num_units;
+  const int bw_output_step =
+      params->merge_outputs ? fw_num_units + bw_num_units : bw_num_units;
+  if (time_major) {
+    for (int t = 0; t < max_time; t++) {
+      // Forward cell.
+      float* fw_hidden_state_ptr_batch = fw_hidden_state->data.f;
+      for (int s = 0; s < max_time; s++) {
+        const float* input_ptr_batch =
+            input->data.f + s * input_size * batch_size;
+        const float* aux_input_ptr_batch =
+            (aux_input != nullptr)
+                ? aux_input->data.f + s * input_size * batch_size
+                : nullptr;
+        float* output_ptr_batch =
+            fw_output->data.f + s * fw_output_step * batch_size;
+
+        kernel_utils::RnnBatchStep(
+            input_ptr_batch, fw_input_weights_ptr, fw_input_weights_scale,
+            aux_input_ptr_batch, aux_fw_input_weights_ptr,
+            aux_fw_input_weights_scale, fw_recurrent_weights_ptr,
+            fw_recurrent_weights_scale, fw_bias_ptr, input_size, aux_input_size,
+            fw_num_units, batch_size, fw_output_step, params->activation,
+            quantized_input_ptr, aux_quantized_input_ptr,
+            fw_quantized_hidden_state_ptr, scaling_factors_ptr,
+            fw_hidden_state_ptr_batch, output_ptr_batch);
+      }
+      // Backward cell.
+      float* bw_hidden_state_ptr_batch = bw_hidden_state->data.f;
+      for (int s = max_time - 1; s >= 0; s--) {
+        const float* input_ptr_batch =
+            input->data.f + s * input_size * batch_size;
+        const float* aux_input_ptr_batch =
+            (aux_input != nullptr)
+                ? aux_input->data.f + s * input_size * batch_size
+                : nullptr;
+        float* output_ptr_batch =
+            (params->merge_outputs ? fw_output->data.f + fw_num_units
+                                   : bw_output->data.f) +
+            s * bw_output_step * batch_size;
+
+        kernel_utils::RnnBatchStep(
+            input_ptr_batch, bw_input_weights_ptr, bw_input_weights_scale,
+            aux_input_ptr_batch, aux_bw_input_weights_ptr,
+            aux_bw_input_weights_scale, bw_recurrent_weights_ptr,
+            bw_recurrent_weights_scale, bw_bias_ptr, input_size, aux_input_size,
+            bw_num_units, batch_size, bw_output_step, params->activation,
+            quantized_input_ptr, aux_quantized_input_ptr,
+            bw_quantized_hidden_state_ptr, scaling_factors_ptr,
+            bw_hidden_state_ptr_batch, output_ptr_batch);
+      }
+    }
+  } else {
+    for (int b = 0; b < batch_size; b++) {
+      // Forward cell.
+      float* fw_hidden_state_ptr_batch =
+          fw_hidden_state->data.f + b * fw_num_units;
+      float* fw_output_offset =
+          fw_output->data.f + b * fw_output_step * max_time;
+      for (int s = 0; s < max_time; s++) {
+        const float* input_ptr_batch =
+            input->data.f + b * input_size * max_time + s * input_size;
+        const float* aux_input_ptr_batch =
+            (aux_input != nullptr)
+                ? aux_input->data.f + b * input_size * max_time + s * input_size
+                : nullptr;
+        float* output_ptr_batch = fw_output_offset + s * fw_output_step;
+
+        kernel_utils::RnnBatchStep(
+            input_ptr_batch, fw_input_weights_ptr, fw_input_weights_scale,
+            aux_input_ptr_batch, aux_fw_input_weights_ptr,
+            aux_fw_input_weights_scale, fw_recurrent_weights_ptr,
+            fw_recurrent_weights_scale, fw_bias_ptr, input_size, aux_input_size,
+            fw_num_units, /*batch_size=*/1, fw_output_step, params->activation,
+            quantized_input_ptr, aux_quantized_input_ptr,
+            fw_quantized_hidden_state_ptr, scaling_factors_ptr,
+            fw_hidden_state_ptr_batch, output_ptr_batch);
+      }
+      // Backward cell.
+      float* bw_hidden_state_ptr_batch =
+          bw_hidden_state->data.f + b * bw_num_units;
+      float* bw_output_offset =
+          params->merge_outputs
+              ? fw_output->data.f + b * bw_output_step * max_time + fw_num_units
+              : bw_output->data.f + b * bw_output_step * max_time;
+      for (int s = max_time - 1; s >= 0; s--) {
+        const float* input_ptr_batch =
+            input->data.f + b * input_size * max_time + s * input_size;
+        const float* aux_input_ptr_batch =
+            (aux_input != nullptr)
+                ? aux_input->data.f + b * input_size * max_time + s * input_size
+                : nullptr;
+        float* output_ptr_batch = bw_output_offset + s * bw_output_step;
+
+        kernel_utils::RnnBatchStep(
+            input_ptr_batch, bw_input_weights_ptr, bw_input_weights_scale,
+            aux_input_ptr_batch, aux_bw_input_weights_ptr,
+            aux_bw_input_weights_scale, bw_recurrent_weights_ptr,
+            bw_recurrent_weights_scale, bw_bias_ptr, input_size, aux_input_size,
+            bw_num_units, /*batch_size=*/1, bw_output_step, params->activation,
+            quantized_input_ptr, aux_quantized_input_ptr,
+            bw_quantized_hidden_state_ptr, scaling_factors_ptr,
+            bw_hidden_state_ptr_batch, output_ptr_batch);
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* params = reinterpret_cast<TfLiteBidirectionalSequenceRNNParams*>(
+      node->builtin_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* fw_input_weights =
+      GetInput(context, node, kFwWeightsTensor);
+  const TfLiteTensor* fw_recurrent_weights =
+      GetInput(context, node, kFwRecurrentWeightsTensor);
+  const TfLiteTensor* fw_bias = GetInput(context, node, kFwBiasTensor);
+  const TfLiteTensor* bw_input_weights =
+      GetInput(context, node, kBwWeightsTensor);
+  const TfLiteTensor* bw_recurrent_weights =
+      GetInput(context, node, kBwRecurrentWeightsTensor);
+  const TfLiteTensor* bw_bias = GetInput(context, node, kBwBiasTensor);
+
+  // Get auxiliary inputs.
+  const TfLiteTensor* aux_input =
+      GetOptionalInputTensor(context, node, kAuxInputTensor);
+  const TfLiteTensor* fw_aux_input_weights =
+      GetOptionalInputTensor(context, node, kFwAuxWeightsTensor);
+  const TfLiteTensor* bw_aux_input_weights =
+      GetOptionalInputTensor(context, node, kBwAuxWeightsTensor);
+
+  TfLiteTensor* fw_hidden_state =
+      GetVariableInput(context, node, kFwHiddenStateTensor);
+  TfLiteTensor* bw_hidden_state =
+      GetVariableInput(context, node, kBwHiddenStateTensor);
+
+  TfLiteTensor* fw_output = GetOutput(context, node, kFwOutputTensor);
+  TfLiteTensor* bw_output = params->merge_outputs
+                                ? nullptr
+                                : GetOutput(context, node, kBwOutputTensor);
+
+  switch (fw_input_weights->type) {
+    case kTfLiteFloat32:
+      return EvalFloat(input, fw_input_weights, fw_recurrent_weights, fw_bias,
+                       bw_input_weights, bw_recurrent_weights, bw_bias,
+                       aux_input, fw_aux_input_weights, bw_aux_input_weights,
+                       params, fw_hidden_state, fw_output, bw_hidden_state,
+                       bw_output);
+    case kTfLiteUInt8: {
+      TfLiteTensor* input_quantized =
+          GetTemporary(context, node, kInputQuantized);
+      TfLiteTensor* fw_hidden_state_quantized =
+          GetTemporary(context, node, kFwHiddenStateQuantized);
+      TfLiteTensor* bw_hidden_state_quantized =
+          GetTemporary(context, node, kBwHiddenStateQuantized);
+      TfLiteTensor* scaling_factors =
+          GetTemporary(context, node, kScalingFactors);
+      TfLiteTensor* aux_input_quantized =
+          (aux_input != nullptr)
+              ? GetTemporary(context, node, kAuxInputQuantized)
+              : nullptr;
+
+      return EvalHybrid(input, fw_input_weights, fw_recurrent_weights, fw_bias,
+                        bw_input_weights, bw_recurrent_weights, bw_bias,
+                        aux_input, fw_aux_input_weights, bw_aux_input_weights,
+                        params, scaling_factors, input_quantized,
+                        aux_input_quantized, fw_hidden_state_quantized,
+                        fw_hidden_state, fw_output, bw_hidden_state_quantized,
+                        bw_hidden_state, bw_output);
+    }
+    default:
+      context->ReportError(context, "Type not currently supported.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace bidirectional_sequence_rnn
+
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN() {
+  static TfLiteRegistration r = {
+      bidirectional_sequence_rnn::Init, bidirectional_sequence_rnn::Free,
+      bidirectional_sequence_rnn::Prepare, bidirectional_sequence_rnn::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
similarity index 90%
rename from tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
rename to tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
index f555c472f51b7c4d301c855c4edf83e219fea445..5bad8e02c29608fa058d0d1104acbf09626f1b66 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
@@ -14,15 +14,15 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite Bidirectional RNN op.
 
-#include <vector>
 #include <iomanip>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -654,7 +654,8 @@ const std::initializer_list<float> recurrent_weights = {
 class BidirectionalRNNOpModel : public SingleOpModel {
  public:
   BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units,
-                          int bw_units, int input_size, bool merge_outputs)
+                          int bw_units, int input_size, bool time_major,
+                          bool merge_outputs)
       : batches_(batches),
         sequence_len_(sequence_len),
         fw_units_(fw_units),
@@ -679,25 +680,29 @@ class BidirectionalRNNOpModel : public SingleOpModel {
       bw_output_ = AddOutput(TensorType_FLOAT32);
     }
 
-    SetBuiltinOp(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
-                 BuiltinOptions_BidirectionalSequenceRNNOptions,
-                 CreateBidirectionalSequenceRNNOptions(
-                     builder_, /*time_major=*/false,
-                     ActivationFunctionType_RELU, merge_outputs)
-                     .Union());
+    SetBuiltinOp(
+        BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+        BuiltinOptions_BidirectionalSequenceRNNOptions,
+        CreateBidirectionalSequenceRNNOptions(
+            builder_, time_major, ActivationFunctionType_RELU, merge_outputs)
+            .Union());
+    const auto input_shape =
+        (time_major) ? std::vector<int>({sequence_len_, batches_, input_size_})
+                     : std::vector<int>({batches_, sequence_len_, input_size_});
+
     BuildInterpreter({
-        {batches_, sequence_len_, input_size_},  // input
-        {fw_units_, input_size_},                // fw_weights
-        {fw_units_, fw_units_},                  // fw_recurrent_weights
-        {fw_units_},                             // fw_bias
-        {batches_, fw_units_},                   // fw_hidden_state
-        {bw_units_, input_size_},                // bw_weights
-        {bw_units_, bw_units_},                  // bw_recurrent_weights
-        {bw_units_},                             // bw_bias
-        {batches_, bw_units_},                   // bw_hidden_state
-        {batches_, sequence_len_, 0},            // aux_input
-        {fw_units_, 0},                          // aux_fw_weights
-        {bw_units_, 0},                          // aux_bw_weights
+        input_shape,                   // input
+        {fw_units_, input_size_},      // fw_weights
+        {fw_units_, fw_units_},        // fw_recurrent_weights
+        {fw_units_},                   // fw_bias
+        {batches_, fw_units_},         // fw_hidden_state
+        {bw_units_, input_size_},      // bw_weights
+        {bw_units_, bw_units_},        // bw_recurrent_weights
+        {bw_units_},                   // bw_bias
+        {batches_, bw_units_},         // bw_hidden_state
+        {batches_, sequence_len_, 0},  // aux_input
+        {fw_units_, 0},                // aux_fw_weights
+        {bw_units_, 0},                // aux_bw_weights
     });
   }
 
@@ -770,7 +775,8 @@ class BidirectionalRNNOpModel : public SingleOpModel {
 TEST(BidirectionalRNNOpTest, BlackBoxTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*merge_outputs=*/false);
+                              /*input_size=*/8, /*time_major=*/false,
+                              /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
   rnn.SetFwBias(biases);
@@ -803,11 +809,49 @@ TEST(BidirectionalRNNOpTest, BlackBoxTest) {
   EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
 }
 
-// Same as the previous test, yet with merged outputs.
+// Same as BlackBox test, but input is reshuffled to time_major format.
+TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> fw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_fw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  }
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+}
+
+// Same as BlackBox test, yet with merged outputs.
 TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*merge_outputs=*/true);
+                              /*input_size=*/8, /*time_major=*/false,
+                              /*merge_outputs=*/true);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
   rnn.SetFwBias(biases);
@@ -840,12 +884,56 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) {
               ElementsAreArray(ArrayFloatNear(merged_expected)));
 }
 
+// Same as BlackBox test, but input is reshuffled to time_major format.
+TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajorMergeOutputs) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*time_major=*/true,
+                              /*merge_outputs=*/true);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> merged_expected;
+  for (int step = 0; step < rnn.sequence_len(); step++) {
+    for (int bid = 0; bid < rnn.num_batches(); bid++) {
+      merged_expected.insert(
+          merged_expected.end(),
+          rnn_golden_fw_output + rnn.num_fw_units() * step,
+          rnn_golden_fw_output + rnn.num_fw_units() * (step + 1));
+      merged_expected.insert(
+          merged_expected.end(),
+          rnn_golden_bw_output + rnn.num_bw_units() * step,
+          rnn_golden_bw_output + rnn.num_bw_units() * (step + 1));
+    }
+  }
+  EXPECT_THAT(rnn.GetFwOutput(),
+              ElementsAreArray(ArrayFloatNear(merged_expected)));
+}
+
 // Check that if the input sequence is reversed the outputs are the same just
 // forward and backward are swapped (and reversed).
 TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*merge_outputs=*/false);
+                              /*input_size=*/8, /*time_major=*/false,
+                              /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
   rnn.SetFwBias(biases);
@@ -891,7 +979,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
 TEST(BidirectionalRNNOpTest, EndToEndTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*merge_outputs=*/false);
+                              /*input_size=*/8, /*time_major=*/false,
+                              /*merge_outputs=*/false);
   const int output_size = 4;
   float dnn_weights[] = {
       -0.5782342,  -0.052212059, 0.73036242,  -0.81216097, -0.80088139,
@@ -908,8 +997,8 @@ TEST(BidirectionalRNNOpTest, EndToEndTest) {
       0.3492105,   0.56452453,   0.4389236,   -0.59929526, -0.19762468,
       -0.36868393, -0.13198286,  -0.53800809, -0.22850353};
 
-  std::initializer_list<float> dnn_biases = {
-    0.29177809, -0.98799044, 0.065919638, 0.68781924};
+  std::initializer_list<float> dnn_biases = {0.29177809, -0.98799044,
+                                             0.065919638, 0.68781924};
 
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
diff --git a/tensorflow/contrib/lite/kernels/cast.cc b/tensorflow/lite/kernels/cast.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/cast.cc
rename to tensorflow/lite/kernels/cast.cc
index a7972140ac9f228ad8137804653194cc62265736..ac6c85b96921dc1f9a5b9204c7b19cd6fce1943f 100644
--- a/tensorflow/contrib/lite/kernels/cast.cc
+++ b/tensorflow/lite/kernels/cast.cc
@@ -15,13 +15,13 @@ limitations under the License.
 #include <string.h>
 #include <algorithm>
 #include <complex>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/cast_test.cc b/tensorflow/lite/kernels/cast_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/cast_test.cc
rename to tensorflow/lite/kernels/cast_test.cc
index 954f998206563a38c74a1382092851cfbee1013b..acdc331a7ea78e90f93ea3adc6105396f484e81e 100644
--- a/tensorflow/contrib/lite/kernels/cast_test.cc
+++ b/tensorflow/lite/kernels/cast_test.cc
@@ -15,10 +15,10 @@ limitations under the License.
 #include <complex>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/comparisons.cc b/tensorflow/lite/kernels/comparisons.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/comparisons.cc
rename to tensorflow/lite/kernels/comparisons.cc
index 3926af5b973947f0cfd079825e8633ed931f9cd8..a914449ae552e37249f2cecb5c88f3b49e83f133 100644
--- a/tensorflow/contrib/lite/kernels/comparisons.cc
+++ b/tensorflow/lite/kernels/comparisons.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace ops {
@@ -41,7 +41,7 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context,
                  input1->type != kTfLiteString || input1->type != kTfLiteBool);
   // Currently only support tensors have the same type.
-  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
   output->type = kTfLiteBool;
 
   bool requires_broadcast = !HaveSameShapes(input1, input2);
diff --git a/tensorflow/contrib/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/comparisons_test.cc
rename to tensorflow/lite/kernels/comparisons_test.cc
index 04c8bf2e3017bf5ec19ff8ebf6313918a60c838a..ab10c959a4d6b234cb6ae0810174e8f1c48898d1 100644
--- a/tensorflow/contrib/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -455,7 +455,7 @@ TEST(ComparisonsTest, LessEqualQuantized) {
 TEST(ComparisonsTest, QuantizedEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -473,7 +473,7 @@ TEST(ComparisonsTest, QuantizedEqualWithBroadcast) {
 TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -491,7 +491,7 @@ TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
 TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -509,7 +509,7 @@ TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
 TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -527,7 +527,7 @@ TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
 TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
@@ -545,7 +545,7 @@ TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
 TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     ComparisonOpModel model({TensorType_UINT8, test_shapes[i], kMin, kMax},
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/lite/kernels/concatenation.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/concatenation.cc
rename to tensorflow/lite/kernels/concatenation.cc
index 7ad3399ffd39339f8f46964a6e99b6f4de0eb27d..a8dd160c8dbb42ba2c2363af55b30eb0b79f86af 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/lite/kernels/concatenation.cc
@@ -19,13 +19,13 @@ limitations under the License.
 #include <iostream>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/lite/kernels/concatenation_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/concatenation_test.cc
rename to tensorflow/lite/kernels/concatenation_test.cc
index 467ff6f7e149e35ae1fd11031c10d7087c4b398c..422380a03eaf9073958d4984eb2234890d555780 100644
--- a/tensorflow/contrib/lite/kernels/concatenation_test.cc
+++ b/tensorflow/lite/kernels/concatenation_test.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/conv.cc
rename to tensorflow/lite/kernels/conv.cc
index dbcadbee14ceb040e087e221bb1cc4e0a40a47df..1fd870be93eda12d1c057e29b017d80e2a96412b 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/lite/kernels/conv.cc
@@ -20,20 +20,19 @@ limitations under the License.
 #include <iostream>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/eigen_support.h"
-#include "tensorflow/contrib/lite/kernels/gemm_support.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/cblas_conv.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/kernels/padding.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/eigen_support.h"
+#include "tensorflow/lite/kernels/gemm_support.h"
+#include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/kernels/padding.h"
 
 namespace tflite {
 namespace ops {
@@ -387,12 +386,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         GetTemporary(context, node, data->scaling_factors_index);
     scaling_factors->type = kTfLiteFloat32;
     scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
     // Only one scale factor per batch is typically necessary. See optimized
     // implementation for why we need to allocate for the height of the inputs
     // flattened to 2D.
-    scaling_factors_size->data[0] = NumElements(input) / channels_in;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+    const int height = NumElements(input) / channels_in;
+    int scaling_dims[1] = {height};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = height;
       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
                                                        scaling_factors_size));
     }
@@ -489,11 +490,10 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRange(params->activation, &output_activation_min,
                            &output_activation_max);
   KernelType effective_kernel_type;
-  if ((kernel_type == kMultithreadOptimized ||
-       kernel_type == kCblasOptimized) &&
+  if ((kernel_type == kMultithreadOptimized) &&
       (params->dilation_width_factor != 1 ||
        params->dilation_height_factor != 1)) {
-    // kMultithreadOptimized and kCblasOptimized do not support dilation.
+    // kMultithreadOptimized does not support dilation.
     // Therefore, fallback to optimized.
     effective_kernel_type = kGenericOptimized;
   } else {
@@ -519,6 +519,7 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
                           GetTensorData<float>(im2col));
       break;
     }
+    case kCblasOptimized:
     case kGenericOptimized: {
       optimized_ops::Conv(op_params, GetTensorShape(input),
                           GetTensorData<float>(input), GetTensorShape(filter),
@@ -544,15 +545,6 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
           GetTensorData<float>(im2col));
       break;
     }
-    case kCblasOptimized: {
-      cblas_ops::Conv(op_params, GetTensorShape(input),
-                      GetTensorData<float>(input), GetTensorShape(filter),
-                      GetTensorData<float>(filter), GetTensorShape(bias),
-                      GetTensorData<float>(bias), GetTensorShape(output),
-                      GetTensorData<float>(output), GetTensorShape(im2col),
-                      GetTensorData<float>(im2col));
-      break;
-    }
   }
 }
 
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/kernels/conv_test.cc
rename to tensorflow/lite/kernels/conv_test.cc
index f7e6f083ed23f8be5fd00ce1e8b202ba63305334..eebf9f9de4694352cf3bf959f0f639380a3054f7 100644
--- a/tensorflow/contrib/lite/kernels/conv_test.cc
+++ b/tensorflow/lite/kernels/conv_test.cc
@@ -16,10 +16,10 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "absl/memory/memory.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 
diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f4ae5087b267a62d4d4237a8f5f534ff346a493
--- /dev/null
+++ b/tensorflow/lite/kernels/depthwise_conv.cc
@@ -0,0 +1,317 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace depthwise_conv {
+
+constexpr int kInputTensor = 0;
+constexpr int kFilterTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+// This file has three implementation of DepthwiseConv.
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+};
+
+struct OpData {
+  TfLitePaddingValues padding;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  // This is a builtin op, so we don't use the contents in 'buffer', if any.
+  // Instead, we allocate a new object to carry information from Prepare() to
+  // Eval().
+  return new OpData;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  // TODO(ahentz): use could use GetOptionalInputTensor() here, but we need to
+  // decide whether we are OK with optional tensors being completely absent, as
+  // opposed to having -1 as their index.
+  bool hasBias = NumInputs(node) == 3;
+
+  TF_LITE_ENSURE(context, hasBias || NumInputs(node) == 2);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  const TfLiteTensor* bias = nullptr;
+
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 4);
+
+  // The parameter 'depth_multiplier' is redundant, so we check here to make
+  // sure it is consistent with the given dimensions.
+  TF_LITE_ENSURE_EQ(context,
+                    params->depth_multiplier * SizeOfDimension(input, 3),
+                    SizeOfDimension(filter, 3));
+
+  const TfLiteType data_type = input->type;
+  TF_LITE_ENSURE(context,
+                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
+  TF_LITE_ENSURE_EQ(context, output->type, data_type);
+  TF_LITE_ENSURE_EQ(context, filter->type, data_type);
+
+  if (hasBias) {
+    bias = GetInput(context, node, kBiasTensor);
+    if (data_type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
+      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+    } else {
+      TF_LITE_ENSURE_EQ(context, bias->type, data_type);
+    }
+    TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1);
+    TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 3),
+                      SizeOfDimension(bias, 0));
+  }
+
+  int channels_out = SizeOfDimension(filter, 3);
+  int width = SizeOfDimension(input, 2);
+  int height = SizeOfDimension(input, 1);
+  int filter_width = SizeOfDimension(filter, 2);
+  int filter_height = SizeOfDimension(filter, 1);
+  int batches = SizeOfDimension(input, 0);
+
+  // Matching GetWindowedOutputSize in TensorFlow.
+  auto padding = params->padding;
+  auto compute_out_size = [padding](int image_size, int filter_size, int stride,
+                                    int dilation_rate) -> int {
+    int effective_filter_size = (filter_size - 1) * dilation_rate + 1;
+    return padding == kTfLitePaddingSame
+               ? (image_size + stride - 1) / stride
+               : padding == kTfLitePaddingValid
+                     ? (image_size - effective_filter_size + stride) / stride
+                     : 0;
+  };
+
+  int out_width = compute_out_size(width, filter_width, params->stride_width,
+                                   params->dilation_width_factor);
+  int out_height =
+      compute_out_size(height, filter_height, params->stride_height,
+                       params->dilation_height_factor);
+
+  data->padding.height =
+      ComputePadding(params->stride_height, params->dilation_height_factor,
+                     height, filter_height, out_height);
+  data->padding.width =
+      ComputePadding(params->stride_width, params->dilation_width_factor, width,
+                     filter_width, out_width);
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  if (data_type != kTfLiteFloat32) {
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+    data->output_shift = -exponent;
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &data->output_activation_min,
+                                  &data->output_activation_max);
+  }
+
+  TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4);
+  outputSize->data[0] = batches;
+  outputSize->data[1] = out_height;
+  outputSize->data[2] = out_width;
+  outputSize->data[3] = channels_out;
+  return context->ResizeTensor(context, output, outputSize);
+}
+
+template <KernelType kernel_type>
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDepthwiseConvParams* params, OpData* data,
+               const TfLiteTensor* input, const TfLiteTensor* filter,
+               const TfLiteTensor* bias, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+
+  void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&,
+                         const float*, const RuntimeShape&, const float*,
+                         const RuntimeShape&, const float*, const RuntimeShape&,
+                         float*);
+  if (kernel_type == kReference) {
+    depthwise_conv = &reference_ops::DepthwiseConv;
+  } else {
+    depthwise_conv = &optimized_ops::DepthwiseConv;
+  }
+
+  DepthwiseParams op_params;
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+  depthwise_conv(op_params, GetTensorShape(input), GetTensorData<float>(input),
+                 GetTensorShape(filter), GetTensorData<float>(filter),
+                 GetTensorShape(bias), GetTensorData<float>(bias),
+                 GetTensorShape(output), GetTensorData<float>(output));
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteDepthwiseConvParams* params, OpData* data,
+                   const TfLiteTensor* input, const TfLiteTensor* filter,
+                   const TfLiteTensor* bias, TfLiteTensor* output) {
+  auto input_offset = -input->params.zero_point;
+  auto filter_offset = -filter->params.zero_point;
+  auto output_offset = output->params.zero_point;
+
+  void (*depthwise_conv)(const DepthwiseParams&, const RuntimeShape&,
+                         const uint8*, const RuntimeShape&, const uint8*,
+                         const RuntimeShape&, const int32*, const RuntimeShape&,
+                         uint8*);
+
+  if (kernel_type == kReference) {
+    depthwise_conv = &reference_ops::DepthwiseConv;
+  } else {
+    depthwise_conv = &optimized_ops::DepthwiseConv;
+  }
+
+  DepthwiseParams op_params;
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = -data->output_shift;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+  depthwise_conv(op_params, GetTensorShape(input),
+                 GetTensorData<uint8_t>(input), GetTensorShape(filter),
+                 GetTensorData<uint8_t>(filter), GetTensorShape(bias),
+                 GetTensorData<int32_t>(bias), GetTensorShape(output),
+                 GetTensorData<uint8_t>(output));
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  const TfLiteTensor* bias =
+      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
+
+  // TODO(aselle): Consider whether float conv and quantized conv should be
+  // separate ops to avoid dispatch overhead here.
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
+                             output);
+      break;
+    case kTfLiteUInt8:
+      EvalQuantized<kernel_type>(context, node, params, data, input, filter,
+                                 bias, output);
+      break;
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace depthwise_conv
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF() {
+  static TfLiteRegistration r = {
+      depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
+      depthwise_conv::Eval<depthwise_conv::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
+      depthwise_conv::Eval<depthwise_conv::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_NEON_OPT() {
+  static TfLiteRegistration r = {
+      depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
+      depthwise_conv::Eval<depthwise_conv::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
+#ifdef USE_NEON
+  return Register_DEPTHWISE_CONVOLUTION_NEON_OPT();
+#else
+  return Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/depthwise_conv_test.cc b/tensorflow/lite/kernels/depthwise_conv_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d924e6f700781e4aceef3d8554ed3d88d17ed774
--- /dev/null
+++ b/tensorflow/lite/kernels/depthwise_conv_test.cc
@@ -0,0 +1,455 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF();
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT();
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_NEON_OPT();
+
+}  // namespace builtin
+}  // namespace ops
+
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
+ public:
+  // TODO(ahentz): Also test different activation types, bias, padding types,
+  // stride values.
+  BaseDepthwiseConvolutionOpModel(TfLiteRegistration* registration,
+                                  const TensorData& input,
+                                  const TensorData& filter,
+                                  const TensorData& output,
+                                  Padding padding_type,
+                                  int dilation_factor = 1) {
+    input_ = AddInput(input);
+    filter_ = AddInput(filter);
+
+    int bias_size = GetShape(filter_)[3];
+    if (input.type == TensorType_FLOAT32) {
+      bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
+    } else {
+      // This is a quantized version. The scale of 'bias' depends on the scales
+      // of input and filter. Supposedly this is correctly set during quantized
+      // training.
+      auto bias_scale = GetScale(input_) * GetScale(filter_);
+      TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+      bias_ = AddInput(bias);
+    }
+
+    output_ = AddOutput(output);
+
+    int input_depth = GetShape(input_)[3];
+    int output_depth = GetShape(filter_)[3];
+    int depth_mul = output_depth / input_depth;
+
+    SetBuiltinOp(
+        BuiltinOperator_DEPTHWISE_CONV_2D,
+        BuiltinOptions_DepthwiseConv2DOptions,
+        CreateDepthwiseConv2DOptions(builder_, padding_type, 1, 1, depth_mul,
+                                     ActivationFunctionType_NONE,
+                                     dilation_factor, dilation_factor)
+            .Union());
+
+    resolver_ = absl::make_unique<SingleOpResolver>(
+        BuiltinOperator_DEPTHWISE_CONV_2D, registration);
+
+    BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
+  }
+
+ protected:
+  int input_;
+  int filter_;
+  int bias_;
+  int output_;
+};
+
+class DepthwiseConvolutionOpModel : public BaseDepthwiseConvolutionOpModel {
+ public:
+  using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
+
+  void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
+
+  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_DEPTHWISE_CONVOLUTION_REF()},
+    {"GenericOptimized",
+     ops::builtin::Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT()},
+    {"NeonOptimized", ops::builtin::Register_DEPTHWISE_CONVOLUTION_NEON_OPT()},
+});
+
+class DepthwiseConvolutionOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+TEST_P(DepthwiseConvolutionOpTest, SimpleTest) {
+  DepthwiseConvolutionOpModel m(GetRegistration(),
+                                {TensorType_FLOAT32, {1, 3, 2, 2}},
+                                {TensorType_FLOAT32, {1, 2, 2, 4}},
+                                {TensorType_FLOAT32, {}}, Padding_VALID);
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 71, -34, 99, -20,  //
+                                 91, -26, 127, -4,  //
+                             }));
+}
+
+TEST_P(DepthwiseConvolutionOpTest, SimpleDilatedTestPaddingValid) {
+  const int depth = 1;
+  const int image_width = 9;
+  const int image_height = 9;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int dilation_factor = 3;
+  DepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_FLOAT32,
+       {image_batch_count, image_height, image_width, depth}},
+      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+      {TensorType_FLOAT32, {}}, Padding_VALID, dilation_factor);
+
+  // The image matrix is:
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // clang-format off
+  m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0});
+  // clang-format on
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.Invoke();
+
+  // Since the dilation rate is 3 this will reduce the size of the output from
+  // 10x10 to 3x3 of all 5s. Specifically:
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5}));
+}
+
+TEST_P(DepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
+  const int depth = 1;
+  const int image_width = 3;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  const int filter_size = 2;
+  const int filter_count = 1;
+  const int dilation_factor = 2;
+  DepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_FLOAT32,
+       {image_batch_count, image_height, image_width, depth}},
+      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+      {TensorType_FLOAT32, {}}, Padding_SAME, dilation_factor);
+
+  // The image matrix is:
+  // | 1 | 1 | 1 |
+  // | 1 | 1 | 1 |
+  // | 1 | 1 | 1 |
+  m.SetInput({1, 1, 1, 1, 1, 1, 1, 1, 1});
+  // The filter matrix is:
+  // | 1 | 2 |
+  // | 3 | 4 |
+  m.SetFilter({1, 2, 3, 4});
+  // No bias for this test.
+  m.SetBias({0});
+  m.Invoke();
+
+  // Output:
+  // | 4 | 7 | 3 |
+  // | 6 |10 | 4 |
+  // | 2 | 3 | 1 |
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
+}
+
+class QuantizedDepthwiseConvolutionOpModel
+    : public BaseDepthwiseConvolutionOpModel {
+ public:
+  using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
+  void SetFilter(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(filter_, data);
+  }
+
+  void SetBias(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int32_t>(bias_, data);
+  }
+
+  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+class QuantizedDepthwiseConvolutionOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+// In this test we set the input and output scales so that the results match
+// exactly the 'non-quantized' version.
+TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) {
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64},
+      {TensorType_UINT8, {1, 2, 2, 4}, -63.5, 64},
+      {TensorType_UINT8, {}, -127, 128}, Padding_VALID);
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                            {
+                                                71, -34, 99, -20,  //
+                                                91, -26, 127, -4,  //
+                                            },
+                                            1e-5)));
+  // For good  measure, let's also verify the quantized values:
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 198, 93, 226, 107,   //
+                                 218, 101, 254, 123,  //
+                             }));
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest,
+       SimpleTestQuantizedFilterMultiplierGreaterThan1) {
+  QuantizedDepthwiseConvolutionOpModel quant_op(
+      GetRegistration(), {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64},
+      {TensorType_UINT8, {1, 2, 2, 4}, -128.5, 128},
+      {TensorType_UINT8, {}, -127, 128}, Padding_VALID);
+  DepthwiseConvolutionOpModel float_op(GetRegistration(),
+                                       {TensorType_FLOAT32, {1, 3, 2, 2}},
+                                       {TensorType_FLOAT32, {1, 2, 2, 4}},
+                                       {TensorType_FLOAT32, {}}, Padding_VALID);
+
+  std::initializer_list<float> input = {
+      1, 2, 7,  8,   // column 1
+      3, 4, 9,  10,  // column 2
+      5, 6, 11, 12,  // column 3
+  };
+  std::initializer_list<float> filter = {
+      1,  2,   3,   4,    //
+      -9, 10,  -11, 12,   //
+      5,  6,   7,   8,    //
+      13, -14, 15,  -16,  //
+  };
+  std::initializer_list<float> bias = {1, 2, 3, 4};
+
+  quant_op.SetInput(input);
+  quant_op.SetFilter(filter);
+  quant_op.SetBias(bias);
+  quant_op.Invoke();
+
+  float_op.SetInput(input);
+  float_op.SetFilter(filter);
+  float_op.SetBias(bias);
+  float_op.Invoke();
+
+  EXPECT_THAT(quant_op.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(float_op.GetOutput(), 1)));
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingValid) {
+  const int depth = 1;
+  const int image_width = 9;
+  const int image_height = 9;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int dilation_factor = 3;
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {image_batch_count, image_height, image_width, depth},
+       0,
+       255},
+      {TensorType_UINT8,
+       {depth, filter_size, filter_size, filter_count},
+       0,
+       255},
+      {TensorType_UINT8, {}, 0, 255}, Padding_VALID, dilation_factor);
+
+  // The image matrix is:
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
+  // clang-format off
+  m.SetInput({0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 1, 1, 1, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0,
+              0, 0, 0, 0, 0, 0, 0, 0, 0});
+  // clang-format on
+  // The filter matrix is:
+  // | 1 | 2 | 3 |
+  // | 4 | 5 | 6 |
+  // | 7 | 8 | 9 |
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  // No bias for this test.
+  m.SetBias({0});
+  m.Invoke();
+
+  // Since the dilation rate is 3 this will reduce the size of the output from
+  // 10x10 to 3x3 of all 5s. Specifically:
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  // | 5 | 5 | 5 |
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5}));
+}
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
+  const int depth = 1;
+  const int image_width = 3;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  const int filter_size = 2;
+  const int filter_count = 1;
+  const int dilation_factor = 2;
+  QuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(),
+      {TensorType_UINT8,
+       {image_batch_count, image_height, image_width, depth},
+       0,
+       255},
+      {TensorType_UINT8,
+       {depth, filter_size, filter_size, filter_count},
+       0,
+       255},
+      {TensorType_UINT8, {}, 0, 255}, Padding_SAME, dilation_factor);
+
+  // The image matrix is:
+  // | 1 | 1 | 1 |
+  // | 1 | 1 | 1 |
+  // | 1 | 1 | 1 |
+  m.SetInput({1, 1, 1, 1, 1, 1, 1, 1, 1});
+  // The filter matrix is:
+  // | 1 | 2 |
+  // | 3 | 4 |
+  m.SetFilter({1, 2, 3, 4});
+  // No bias for this test.
+  m.SetBias({0});
+  m.Invoke();
+
+  // Output:
+  // | 4 | 7 | 3 |
+  // | 6 |10 | 4 |
+  // | 2 | 3 | 1 |
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
+INSTANTIATE_TEST_CASE_P(
+    QuantizedDepthwiseConvolutionOpTest, QuantizedDepthwiseConvolutionOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/dequantize.cc b/tensorflow/lite/kernels/dequantize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f03c73c9c960e3c134e33bf78a572f100405b7a
--- /dev/null
+++ b/tensorflow/lite/kernels/dequantize.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace dequantize {
+
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
+    input = GetInput(context, node, 0);
+    output = GetOutput(context, node, 0);
+  }
+  const TfLiteTensor* input;
+  TfLiteTensor* output;
+};
+
+struct OpData {
+  // This boolean value is only used when the input tensor is constant.
+  bool float_dequantized_weights_initialized;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData();
+  op_data->float_dequantized_weights_initialized = false;
+  return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  OpContext op_context(context, node);
+
+  TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8 ||
+                              op_context.input->type == kTfLiteInt8);
+
+  op_context.output->type = kTfLiteFloat32;
+  // If the input tensor is constant, we can persist the dequantized value in
+  // the output tensor. Otherwise we run dequantize upon each eval.
+  if (IsConstantTensor(op_context.input)) {
+    op_context.output->allocation_type = kTfLiteArenaRwPersistent;
+  }
+  return context->ResizeTensor(context, op_context.output,
+                               TfLiteIntArrayCopy(op_context.input->dims));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  OpContext op_context(context, node);
+  if (IsConstantTensor(op_context.input) &&
+      op_data->float_dequantized_weights_initialized) {
+    return kTfLiteOk;
+  }
+
+  tflite::DequantizationParams op_params;
+  op_params.zero_point = op_context.input->params.zero_point;
+  op_params.scale = op_context.input->params.scale;
+  switch (op_context.input->type) {
+    case kTfLiteUInt8:
+      optimized_ops::Dequantize(op_params, GetTensorShape(op_context.input),
+                                GetTensorData<uint8_t>(op_context.input),
+                                GetTensorShape(op_context.output),
+                                GetTensorData<float>(op_context.output));
+      break;
+    case kTfLiteInt8:
+      reference_integer_ops::Dequantize(
+          op_params, GetTensorShape(op_context.input),
+          GetTensorData<int8_t>(op_context.input),
+          GetTensorShape(op_context.output),
+          GetTensorData<float>(op_context.output));
+      break;
+    default:
+      context->ReportError(context, "Type %d not supported.",
+                           op_context.input->type);
+      return kTfLiteError;
+  }
+
+  if (IsConstantTensor(op_context.input)) {
+    op_data->float_dequantized_weights_initialized = true;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace dequantize
+
+TfLiteRegistration* Register_DEQUANTIZE_OPT() {
+  static TfLiteRegistration r = {dequantize::Init, dequantize::Free,
+                                 dequantize::Prepare, dequantize::Eval};
+  return &r;
+}
+
+TfLiteRegistration* Register_DEQUANTIZE() { return Register_DEQUANTIZE_OPT(); }
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/dequantize_test.cc b/tensorflow/lite/kernels/dequantize_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb5f1e74a8b0174209043e14af9c35db32bf14b5
--- /dev/null
+++ b/tensorflow/lite/kernels/dequantize_test.cc
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class DequantizeOpModel : public SingleOpModel {
+ public:
+  DequantizeOpModel(TensorType type, std::initializer_list<int> shape,
+                    float scale, int32_t zero_point) {
+    TensorData input_tensor_data;
+    input_tensor_data.type = type;
+    input_tensor_data.shape = shape;
+    input_tensor_data.min = 0;
+    input_tensor_data.max = 0;
+    input_tensor_data.scale = scale;
+    input_tensor_data.zero_point = zero_point;
+    input_ = AddInput(input_tensor_data);
+    output_ = AddOutput({TensorType_FLOAT32, shape});
+    SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
+                 CreateDequantizeOptions(builder_).Union());
+
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(DequantizeOpTest, UINT8) {
+  // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8
+  DequantizeOpModel m(TensorType_UINT8, {2, 5}, 0.5, 127);
+
+  m.SetInput<uint8>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64})));
+}
+
+TEST(DequantizeOpTest, INT8) {
+  // [-63.5, 64] -> scale=0.5, zero_point=1 for INT8
+  DequantizeOpModel m(TensorType_INT8, {2, 5}, 0.5, -1);
+
+  m.SetInput<int8>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64})));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/detection_postprocess.cc b/tensorflow/lite/kernels/detection_postprocess.cc
similarity index 76%
rename from tensorflow/contrib/lite/kernels/detection_postprocess.cc
rename to tensorflow/lite/kernels/detection_postprocess.cc
index e21dc5ced9d606159b8d8decd41e3d46d73f0e62..84e2a0efb27c5e2381d76dba89ddf3445077576c 100644
--- a/tensorflow/contrib/lite/kernels/detection_postprocess.cc
+++ b/tensorflow/lite/kernels/detection_postprocess.cc
@@ -16,13 +16,13 @@ limitations under the License.
 #include <numeric>
 #include <vector>
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -43,9 +43,11 @@ constexpr int kOutputTensorNumDetections = 3;
 constexpr int kNumCoordBox = 4;
 constexpr int kBatchSize = 1;
 
+constexpr int kNumDetectionsPerClass = 100;
+
 // Object Detection model produces axis-aligned boxes in two formats:
-// BoxCorner represents the upper right (xmin, ymin) and
-// lower left corner (xmax, ymax).
+// BoxCorner represents the lower left corner (xmin, ymin) and
+// the upper right corner (xmax, ymax).
 // CenterSize represents the center (xcenter, ycenter), height and width.
 // BoxCornerEncoding and CenterSizeEncoding are related as follows:
 // ycenter = y / y_scale * anchor.h + anchor.y;
@@ -77,10 +79,12 @@ static_assert(sizeof(CenterSizeEncoding) == sizeof(float) * kNumCoordBox,
 
 struct OpData {
   int max_detections;
-  int max_classes_per_detection;
+  int max_classes_per_detection;  // Fast Non-Max-Suppression
+  int detections_per_class;       // Regular Non-Max-Suppression
   float non_max_suppression_score_threshold;
   float intersection_over_union_threshold;
   int num_classes;
+  bool use_regular_non_max_suppression;
   CenterSizeEncoding scale_values;
   // Indices of Temporary tensors
   int decoded_boxes_index;
@@ -94,6 +98,15 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
   op_data->max_detections = m["max_detections"].AsInt32();
   op_data->max_classes_per_detection = m["max_classes_per_detection"].AsInt32();
+  if (m["detections_per_class"].IsNull())
+    op_data->detections_per_class = kNumDetectionsPerClass;
+  else
+    op_data->detections_per_class = m["detections_per_class"].AsInt32();
+  if (m["use_regular_nms"].IsNull())
+    op_data->use_regular_non_max_suppression = false;
+  else
+    op_data->use_regular_non_max_suppression = m["use_regular_nms"].AsBool();
+
   op_data->non_max_suppression_score_threshold =
       m["nms_score_threshold"].AsFloat();
   op_data->intersection_over_union_threshold = m["nms_iou_threshold"].AsFloat();
@@ -350,19 +363,21 @@ float ComputeIntersectionOverUnion(const TfLiteTensor* decoded_boxes,
   return intersection_area / (area_i + area_j - intersection_area);
 }
 
-// NonMaxSuppressionSingleClass() is O(n^2) pairwise comparison between boxes
+// NonMaxSuppressionSingleClass() prunes out the box locations with high overlap
+// before selecting the highest scoring boxes (max_detections in number)
 // It assumes all boxes are good in beginning and sorts based on the scores.
 // If lower-scoring box has too much overlap with a higher-scoring box,
 // we get rid of the lower-scoring box.
+// Complexity is O(N^2) pairwise comparison between boxes
 TfLiteStatus NonMaxSuppressionSingleClassHelper(
     TfLiteContext* context, TfLiteNode* node, OpData* op_data,
-    const std::vector<float>& scores, std::vector<int>* selected) {
+    const std::vector<float>& scores, std::vector<int>* selected,
+    int max_detections) {
   const TfLiteTensor* input_box_encodings =
       GetInput(context, node, kInputTensorBoxEncodings);
   const TfLiteTensor* decoded_boxes =
       &context->tensors[op_data->decoded_boxes_index];
   const int num_boxes = input_box_encodings->dims->data[1];
-  const int max_detections = op_data->max_detections;
   const float non_max_suppression_score_threshold =
       op_data->non_max_suppression_score_threshold;
   const float intersection_over_union_threshold =
@@ -389,7 +404,6 @@ TfLiteStatus NonMaxSuppressionSingleClassHelper(
   sorted_indices.resize(num_scores_kept);
   DecreasingPartialArgSort(keep_scores.data(), num_scores_kept, num_scores_kept,
                            sorted_indices.data());
-
   const int num_boxes_kept = num_scores_kept;
   const int output_size = std::min(num_boxes_kept, max_detections);
   selected->clear();
@@ -427,6 +441,130 @@ TfLiteStatus NonMaxSuppressionSingleClassHelper(
   return kTfLiteOk;
 }
 
+// This function implements a regular version of Non Maximal Suppression (NMS)
+// for multiple classes where
+// 1) we do NMS separately for each class across all anchors and
+// 2) keep only the highest anchor scores across all classes
+// 3) The worst runtime of the regular NMS is O(K*N^2)
+// where N is the number of anchors and K the number of
+// classes.
+TfLiteStatus NonMaxSuppressionMultiClassRegularHelper(TfLiteContext* context,
+                                                      TfLiteNode* node,
+                                                      OpData* op_data,
+                                                      const float* scores) {
+  const TfLiteTensor* input_box_encodings =
+      GetInput(context, node, kInputTensorBoxEncodings);
+  const TfLiteTensor* decoded_boxes =
+      &context->tensors[op_data->decoded_boxes_index];
+
+  TfLiteTensor* detection_boxes =
+      GetOutput(context, node, kOutputTensorDetectionBoxes);
+  TfLiteTensor* detection_classes =
+      GetOutput(context, node, kOutputTensorDetectionClasses);
+  TfLiteTensor* detection_scores =
+      GetOutput(context, node, kOutputTensorDetectionScores);
+  TfLiteTensor* num_detections =
+      GetOutput(context, node, kOutputTensorNumDetections);
+
+  const int num_boxes = input_box_encodings->dims->data[1];
+  const int num_classes = op_data->num_classes;
+  const int num_detections_per_class = op_data->detections_per_class;
+  const int max_detections = op_data->max_detections;
+  // The row index offset is 1 if background class is included and 0 otherwise.
+  const int label_offset = 1;
+  TF_LITE_ENSURE(context, label_offset != -1);
+  TF_LITE_ENSURE(context, num_detections_per_class > 0);
+  const int num_classes_with_background = num_classes + label_offset;
+
+  // For each class, perform non-max suppression.
+  std::vector<float> class_scores(num_boxes);
+
+  std::vector<int> box_indices_after_regular_non_max_suppression(
+      num_boxes + max_detections);
+  std::vector<float> scores_after_regular_non_max_suppression(num_boxes +
+                                                              max_detections);
+
+  int size_of_sorted_indices = 0;
+  std::vector<int> sorted_indices;
+  sorted_indices.resize(max_detections);
+  std::vector<float> sorted_values;
+  sorted_values.resize(max_detections);
+
+  for (int col = 0; col < num_classes; col++) {
+    for (int row = 0; row < num_boxes; row++) {
+      // Get scores of boxes corresponding to all anchors for single class
+      class_scores[row] =
+          *(scores + row * num_classes_with_background + col + label_offset);
+    }
+    // Perform non-maximal suppression on single class
+    std::vector<int> selected;
+    NonMaxSuppressionSingleClassHelper(context, node, op_data, class_scores,
+                                       &selected, num_detections_per_class);
+    // Add selected indices from non-max suppression of boxes in this class
+    int output_index = size_of_sorted_indices;
+    for (int selected_index : selected) {
+      box_indices_after_regular_non_max_suppression[output_index] =
+          (selected_index * num_classes_with_background + col + label_offset);
+      scores_after_regular_non_max_suppression[output_index] =
+          class_scores[selected_index];
+      output_index++;
+    }
+    // Sort the max scores among the selected indices
+    // Get the indices for top scores
+    int num_indices_to_sort = std::min(output_index, max_detections);
+    DecreasingPartialArgSort(scores_after_regular_non_max_suppression.data(),
+                             output_index, num_indices_to_sort,
+                             sorted_indices.data());
+
+    // Copy values to temporary vectors
+    for (int row = 0; row < num_indices_to_sort; row++) {
+      int temp = sorted_indices[row];
+      sorted_indices[row] = box_indices_after_regular_non_max_suppression[temp];
+      sorted_values[row] = scores_after_regular_non_max_suppression[temp];
+    }
+    // Copy scores and indices from temporary vectors
+    for (int row = 0; row < num_indices_to_sort; row++) {
+      box_indices_after_regular_non_max_suppression[row] = sorted_indices[row];
+      scores_after_regular_non_max_suppression[row] = sorted_values[row];
+    }
+    size_of_sorted_indices = num_indices_to_sort;
+  }
+
+  // Allocate output tensors
+  for (int output_box_index = 0; output_box_index < max_detections;
+       output_box_index++) {
+    if (output_box_index < size_of_sorted_indices) {
+      const int anchor_index = floor(
+          box_indices_after_regular_non_max_suppression[output_box_index] /
+          num_classes_with_background);
+      const int class_index =
+          box_indices_after_regular_non_max_suppression[output_box_index] -
+          anchor_index * num_classes_with_background - label_offset;
+      const float selected_score =
+          scores_after_regular_non_max_suppression[output_box_index];
+      // detection_boxes
+      ReInterpretTensor<BoxCornerEncoding*>(detection_boxes)[output_box_index] =
+          ReInterpretTensor<const BoxCornerEncoding*>(
+              decoded_boxes)[anchor_index];
+      // detection_classes
+      detection_classes->data.f[output_box_index] = class_index;
+      // detection_scores
+      detection_scores->data.f[output_box_index] = selected_score;
+    } else {
+      ReInterpretTensor<BoxCornerEncoding*>(
+          detection_boxes)[output_box_index] = {0.0f, 0.0f, 0.0f, 0.0f};
+      // detection_classes
+      detection_classes->data.f[output_box_index] = 0.0f;
+      // detection_scores
+      detection_scores->data.f[output_box_index] = 0.0f;
+    }
+  }
+  num_detections->data.f[0] = size_of_sorted_indices;
+  box_indices_after_regular_non_max_suppression.clear();
+  scores_after_regular_non_max_suppression.clear();
+  return kTfLiteOk;
+}
+
 // This function implements a fast version of Non Maximal Suppression for
 // multiple classes where
 // 1) we keep the top-k scores for each anchor and
@@ -477,7 +615,7 @@ TfLiteStatus NonMaxSuppressionMultiClassFastHelper(TfLiteContext* context,
   // Perform non-maximal suppression on max scores
   std::vector<int> selected;
   NonMaxSuppressionSingleClassHelper(context, node, op_data, max_scores,
-                                     &selected);
+                                     &selected, op_data->max_detections);
   // Allocate output tensors
   int output_box_index = 0;
   for (const auto& selected_index : selected) {
@@ -549,8 +687,13 @@ TfLiteStatus NonMaxSuppressionMultiClass(TfLiteContext* context,
       // Unsupported type.
       return kTfLiteError;
   }
-  NonMaxSuppressionMultiClassFastHelper(context, node, op_data,
-                                        GetTensorData<float>(scores));
+  if (op_data->use_regular_non_max_suppression)
+    NonMaxSuppressionMultiClassRegularHelper(context, node, op_data,
+                                             GetTensorData<float>(scores));
+  else
+    NonMaxSuppressionMultiClassFastHelper(context, node, op_data,
+                                          GetTensorData<float>(scores));
+
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/lite/kernels/detection_postprocess_test.cc b/tensorflow/lite/kernels/detection_postprocess_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a1c061a3cad4407ec965b67387f006b5e0be4ec7
--- /dev/null
+++ b/tensorflow/lite/kernels/detection_postprocess_test.cc
@@ -0,0 +1,578 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+TfLiteRegistration* Register_DETECTION_POSTPROCESS();
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+// Tests for scenarios where we DO NOT set use_regular_nms flag
+class BaseDetectionPostprocessOpModel : public SingleOpModel {
+ public:
+  BaseDetectionPostprocessOpModel(const TensorData& input1,
+                            const TensorData& input2,
+                            const TensorData& input3,
+                            const TensorData& output1,
+                            const TensorData& output2,
+                            const TensorData& output3,
+                            const TensorData& output4) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    input3_ = AddInput(input3);
+    output1_ = AddOutput(output1);
+    output2_ = AddOutput(output2);
+    output3_ = AddOutput(output3);
+    output4_ = AddOutput(output4);
+
+    flexbuffers::Builder fbb;
+    fbb.Map([&]() {
+      fbb.Int("max_detections", 3);
+      fbb.Int("max_classes_per_detection", 1);
+      fbb.Float("nms_score_threshold", 0.0);
+      fbb.Float("nms_iou_threshold", 0.5);
+      fbb.Int("num_classes", 2);
+      fbb.Float("y_scale", 10.0);
+      fbb.Float("x_scale", 10.0);
+      fbb.Float("h_scale", 5.0);
+      fbb.Float("w_scale", 5.0);
+    });
+    fbb.Finish();
+    SetCustomOp("TFLite_Detection_PostProcess", fbb.GetBuffer(),
+                Register_DETECTION_POSTPROCESS);
+    BuildInterpreter({GetShape(input1_), GetShape(input2_), GetShape(input3_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+  int input3() { return input3_; }
+
+  template <class T>
+  void SetInput1(std::initializer_list<T> data) {
+    PopulateTensor<T>(input1_, data);
+  }
+
+  template <class T>
+  void SetInput2(std::initializer_list<T> data) {
+    PopulateTensor<T>(input2_, data);
+  }
+
+  template <class T>
+  void SetInput3(std::initializer_list<T> data) {
+    PopulateTensor<T>(input3_, data);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput1() {
+    return ExtractVector<T>(output1_);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput2() {
+    return ExtractVector<T>(output2_);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput3() {
+    return ExtractVector<T>(output3_);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput4() {
+    return ExtractVector<T>(output4_);
+  }
+
+  std::vector<int> GetOutputShape1() { return GetTensorShape(output1_); }
+  std::vector<int> GetOutputShape2() { return GetTensorShape(output2_); }
+  std::vector<int> GetOutputShape3() { return GetTensorShape(output3_); }
+  std::vector<int> GetOutputShape4() { return GetTensorShape(output4_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int input3_;
+  int output1_;
+  int output2_;
+  int output3_;
+  int output4_;
+};
+
+TEST(DetectionPostprocessOpTest, FloatTest) {
+  BaseDetectionPostprocessOpModel m(
+      {TensorType_FLOAT32, {1, 6, 4}}, {TensorType_FLOAT32, {1, 6, 3}},
+      {TensorType_FLOAT32, {6, 4}}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}});
+
+  // six boxes in center-size encoding
+  m.SetInput1<float>({
+      0.0, 0.0,  0.0, 0.0,  // box #1
+      0.0, 1.0,  0.0, 0.0,  // box #2
+      0.0, -1.0, 0.0, 0.0,  // box #3
+      0.0, 0.0,  0.0, 0.0,  // box #4
+      0.0, 1.0,  0.0, 0.0,  // box #5
+      0.0, 0.0,  0.0, 0.0   // box #6
+  });
+  // class scores - two classes with background
+  m.SetInput2<float>({0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0.,
+                      .5, .4, 0., .3, .2});
+  // six anchors in center-size encoding
+  m.SetInput3<float>({
+      0.5, 0.5,   1.0, 1.0,  // anchor #1
+      0.5, 0.5,   1.0, 1.0,  // anchor #2
+      0.5, 0.5,   1.0, 1.0,  // anchor #3
+      0.5, 10.5,  1.0, 1.0,  // anchor #4
+      0.5, 10.5,  1.0, 1.0,  //  anchor #5
+      0.5, 100.5, 1.0, 1.0   // anchor #6
+  });
+  // Same boxes in box-corner encoding:
+  // { 0.0, 0.0, 1.0, 1.0,
+  //   0.0, 0.1, 1.0, 1.1,
+  //   0.0, -0.1, 1.0, 0.9,
+  //   0.0, 10.0, 1.0, 11.0,
+  //   0.0, 10.1, 1.0, 11.1,
+  //   0.0, 100.0, 1.0, 101.0}
+  m.Invoke();
+  // detection_boxes
+  // in center-size
+  std::vector<int> output_shape1 = m.GetOutputShape1();
+  EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4));
+  EXPECT_THAT(
+      m.GetOutput1<float>(),
+      ElementsAreArray(ArrayFloatNear(
+          {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0},
+          1e-1)));
+  // detection_classes
+  std::vector<int> output_shape2 = m.GetOutputShape2();
+  EXPECT_THAT(output_shape2, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput2<float>(),
+              ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1)));
+  // detection_scores
+  std::vector<int> output_shape3 = m.GetOutputShape3();
+  EXPECT_THAT(output_shape3, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput3<float>(),
+              ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1)));
+  // num_detections
+  std::vector<int> output_shape4 = m.GetOutputShape4();
+  EXPECT_THAT(output_shape4, ElementsAre(1));
+  EXPECT_THAT(m.GetOutput4<float>(),
+              ElementsAreArray(ArrayFloatNear({3.0}, 1e-1)));
+}
+
+TEST(DetectionPostprocessOpTest, QuantizedTest) {
+  BaseDetectionPostprocessOpModel m(
+      {TensorType_UINT8, {1, 6, 4}, -1.0, 1.0},
+      {TensorType_UINT8, {1, 6, 3}, 0.0, 1.0},
+      {TensorType_UINT8, {6, 4}, 0.0, 100.5}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}});
+  // six boxes in center-size encoding
+  std::vector<std::vector<float>> inputs1 = {{
+      0.0, 0.0,  0.0, 0.0,  // box #1
+      0.0, 1.0,  0.0, 0.0,  // box #2
+      0.0, -1.0, 0.0, 0.0,  // box #3
+      0.0, 0.0,  0.0, 0.0,  // box #4
+      0.0, 1.0,  0.0, 0.0,  // box #5
+      0.0, 0.0,  0.0, 0.0   // box #6
+  }};
+  m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[0]);
+  // class scores - two classes with background
+  std::vector<std::vector<float>> inputs2 = {{0., .9, .8, 0., .75, .72, 0., .6,
+                                              .5, 0., .93, .95, 0., .5, .4, 0.,
+                                              .3, .2}};
+  m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[0]);
+  // six anchors in center-size encoding
+  std::vector<std::vector<float>> inputs3 = {{
+      0.5, 0.5,   1.0, 1.0,  // anchor #1
+      0.5, 0.5,   1.0, 1.0,  // anchor #2
+      0.5, 0.5,   1.0, 1.0,  // anchor #3
+      0.5, 10.5,  1.0, 1.0,  // anchor #4
+      0.5, 10.5,  1.0, 1.0,  // anchor #5
+      0.5, 100.5, 1.0, 1.0   // anchor #6
+  }};
+  m.QuantizeAndPopulate<uint8_t>(m.input3(), inputs3[0]);
+  m.Invoke();
+  // detection_boxes
+  // in center-size
+  std::vector<int> output_shape1 = m.GetOutputShape1();
+  EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4));
+  EXPECT_THAT(
+      m.GetOutput1<float>(),
+      ElementsAreArray(ArrayFloatNear(
+          {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0},
+          3e-1)));
+  // detection_classes
+  std::vector<int> output_shape2 = m.GetOutputShape2();
+  EXPECT_THAT(output_shape2, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput2<float>(),
+              ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1)));
+  // detection_scores
+  std::vector<int> output_shape3 = m.GetOutputShape3();
+  EXPECT_THAT(output_shape3, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput3<float>(),
+              ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1)));
+  // num_detections
+  std::vector<int> output_shape4 = m.GetOutputShape4();
+  EXPECT_THAT(output_shape4, ElementsAre(1));
+  EXPECT_THAT(m.GetOutput4<float>(),
+              ElementsAreArray(ArrayFloatNear({3.0}, 1e-1)));
+}
+
+// Tests for scenarios where we set use_regular_nms flag
+class DetectionPostprocessOpModelwithRegularNMS : public SingleOpModel {
+ public:
+  DetectionPostprocessOpModelwithRegularNMS(
+      const TensorData& input1, const TensorData& input2,
+      const TensorData& input3, const TensorData& output1,
+      const TensorData& output2, const TensorData& output3,
+      const TensorData& output4, bool use_regular_nms) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    input3_ = AddInput(input3);
+    output1_ = AddOutput(output1);
+    output2_ = AddOutput(output2);
+    output3_ = AddOutput(output3);
+    output4_ = AddOutput(output4);
+
+    flexbuffers::Builder fbb;
+    fbb.Map([&]() {
+      fbb.Int("max_detections", 3);
+      fbb.Int("max_classes_per_detection", 1);
+      fbb.Int("detections_per_class", 1);
+      fbb.Bool("use_regular_nms", use_regular_nms);
+      fbb.Float("nms_score_threshold", 0.0);
+      fbb.Float("nms_iou_threshold", 0.5);
+      fbb.Int("num_classes", 2);
+      fbb.Float("y_scale", 10.0);
+      fbb.Float("x_scale", 10.0);
+      fbb.Float("h_scale", 5.0);
+      fbb.Float("w_scale", 5.0);
+    });
+    fbb.Finish();
+    SetCustomOp("TFLite_Detection_PostProcess", fbb.GetBuffer(),
+                Register_DETECTION_POSTPROCESS);
+    BuildInterpreter({GetShape(input1_), GetShape(input2_), GetShape(input3_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+  int input3() { return input3_; }
+
+  template <class T>
+  void SetInput1(std::initializer_list<T> data) {
+    PopulateTensor<T>(input1_, data);
+  }
+
+  template <class T>
+  void SetInput2(std::initializer_list<T> data) {
+    PopulateTensor<T>(input2_, data);
+  }
+
+  template <class T>
+  void SetInput3(std::initializer_list<T> data) {
+    PopulateTensor<T>(input3_, data);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput1() {
+    return ExtractVector<T>(output1_);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput2() {
+    return ExtractVector<T>(output2_);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput3() {
+    return ExtractVector<T>(output3_);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput4() {
+    return ExtractVector<T>(output4_);
+  }
+
+  std::vector<int> GetOutputShape1() { return GetTensorShape(output1_); }
+  std::vector<int> GetOutputShape2() { return GetTensorShape(output2_); }
+  std::vector<int> GetOutputShape3() { return GetTensorShape(output3_); }
+  std::vector<int> GetOutputShape4() { return GetTensorShape(output4_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int input3_;
+  int output1_;
+  int output2_;
+  int output3_;
+  int output4_;
+};
+
+TEST(DetectionPostprocessOpTest, FloatTestFastNMS) {
+  DetectionPostprocessOpModelwithRegularNMS m(
+      {TensorType_FLOAT32, {1, 6, 4}}, {TensorType_FLOAT32, {1, 6, 3}},
+      {TensorType_FLOAT32, {6, 4}}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, false);
+
+  // six boxes in center-size encoding
+  m.SetInput1<float>({
+      0.0, 0.0,  0.0, 0.0,  // box #1
+      0.0, 1.0,  0.0, 0.0,  // box #2
+      0.0, -1.0, 0.0, 0.0,  // box #3
+      0.0, 0.0,  0.0, 0.0,  // box #4
+      0.0, 1.0,  0.0, 0.0,  // box #5
+      0.0, 0.0,  0.0, 0.0   // box #6
+  });
+  // class scores - two classes with background
+  m.SetInput2<float>({0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0.,
+                      .5, .4, 0., .3, .2});
+  // six anchors in center-size encoding
+  m.SetInput3<float>({
+      0.5, 0.5,   1.0, 1.0,  // anchor #1
+      0.5, 0.5,   1.0, 1.0,  // anchor #2
+      0.5, 0.5,   1.0, 1.0,  // anchor #3
+      0.5, 10.5,  1.0, 1.0,  // anchor #4
+      0.5, 10.5,  1.0, 1.0,  //  anchor #5
+      0.5, 100.5, 1.0, 1.0   // anchor #6
+  });
+  // Same boxes in box-corner encoding:
+  // { 0.0, 0.0, 1.0, 1.0,
+  //   0.0, 0.1, 1.0, 1.1,
+  //   0.0, -0.1, 1.0, 0.9,
+  //   0.0, 10.0, 1.0, 11.0,
+  //   0.0, 10.1, 1.0, 11.1,
+  //   0.0, 100.0, 1.0, 101.0}
+  m.Invoke();
+  // detection_boxes
+  // in center-size
+  std::vector<int> output_shape1 = m.GetOutputShape1();
+  EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4));
+  EXPECT_THAT(
+      m.GetOutput1<float>(),
+      ElementsAreArray(ArrayFloatNear(
+          {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0},
+          1e-1)));
+  // detection_classes
+  std::vector<int> output_shape2 = m.GetOutputShape2();
+  EXPECT_THAT(output_shape2, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput2<float>(),
+              ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1)));
+  // detection_scores
+  std::vector<int> output_shape3 = m.GetOutputShape3();
+  EXPECT_THAT(output_shape3, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput3<float>(),
+              ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1)));
+  // num_detections
+  std::vector<int> output_shape4 = m.GetOutputShape4();
+  EXPECT_THAT(output_shape4, ElementsAre(1));
+  EXPECT_THAT(m.GetOutput4<float>(),
+              ElementsAreArray(ArrayFloatNear({3.0}, 1e-1)));
+}
+
+TEST(DetectionPostprocessOpTest, QuantizedTestFastNMS) {
+  DetectionPostprocessOpModelwithRegularNMS m(
+      {TensorType_UINT8, {1, 6, 4}, -1.0, 1.0},
+      {TensorType_UINT8, {1, 6, 3}, 0.0, 1.0},
+      {TensorType_UINT8, {6, 4}, 0.0, 100.5}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, false);
+  // six boxes in center-size encoding
+  std::vector<std::vector<float>> inputs1 = {{
+      0.0, 0.0,  0.0, 0.0,  // box #1
+      0.0, 1.0,  0.0, 0.0,  // box #2
+      0.0, -1.0, 0.0, 0.0,  // box #3
+      0.0, 0.0,  0.0, 0.0,  // box #4
+      0.0, 1.0,  0.0, 0.0,  // box #5
+      0.0, 0.0,  0.0, 0.0   // box #6
+  }};
+  m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[0]);
+  // class scores - two classes with background
+  std::vector<std::vector<float>> inputs2 = {{0., .9, .8, 0., .75, .72, 0., .6,
+                                              .5, 0., .93, .95, 0., .5, .4, 0.,
+                                              .3, .2}};
+  m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[0]);
+  // six anchors in center-size encoding
+  std::vector<std::vector<float>> inputs3 = {{
+      0.5, 0.5,   1.0, 1.0,  // anchor #1
+      0.5, 0.5,   1.0, 1.0,  // anchor #2
+      0.5, 0.5,   1.0, 1.0,  // anchor #3
+      0.5, 10.5,  1.0, 1.0,  // anchor #4
+      0.5, 10.5,  1.0, 1.0,  // anchor #5
+      0.5, 100.5, 1.0, 1.0   // anchor #6
+  }};
+  m.QuantizeAndPopulate<uint8_t>(m.input3(), inputs3[0]);
+  m.Invoke();
+  // detection_boxes
+  // in center-size
+  std::vector<int> output_shape1 = m.GetOutputShape1();
+  EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4));
+  EXPECT_THAT(
+      m.GetOutput1<float>(),
+      ElementsAreArray(ArrayFloatNear(
+          {0.0, 10.0, 1.0, 11.0, 0.0, 0.0, 1.0, 1.0, 0.0, 100.0, 1.0, 101.0},
+          3e-1)));
+  // detection_classes
+  std::vector<int> output_shape2 = m.GetOutputShape2();
+  EXPECT_THAT(output_shape2, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput2<float>(),
+              ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1)));
+  // detection_scores
+  std::vector<int> output_shape3 = m.GetOutputShape3();
+  EXPECT_THAT(output_shape3, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput3<float>(),
+              ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.3}, 1e-1)));
+  // num_detections
+  std::vector<int> output_shape4 = m.GetOutputShape4();
+  EXPECT_THAT(output_shape4, ElementsAre(1));
+  EXPECT_THAT(m.GetOutput4<float>(),
+              ElementsAreArray(ArrayFloatNear({3.0}, 1e-1)));
+}
+
+TEST(DetectionPostprocessOpTest, FloatTestRegularNMS) {
+  DetectionPostprocessOpModelwithRegularNMS m(
+      {TensorType_FLOAT32, {1, 6, 4}}, {TensorType_FLOAT32, {1, 6, 3}},
+      {TensorType_FLOAT32, {6, 4}}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, true);
+  // six boxes in center-size encoding
+  m.SetInput1<float>({
+      0.0, 0.0,  0.0, 0.0,  // box #1
+      0.0, 1.0,  0.0, 0.0,  // box #2
+      0.0, -1.0, 0.0, 0.0,  // box #3
+      0.0, 0.0,  0.0, 0.0,  // box #4
+      0.0, 1.0,  0.0, 0.0,  // box #5
+      0.0, 0.0,  0.0, 0.0   // box #6
+  });
+  // class scores - two classes with background
+  m.SetInput2<float>({0., .9, .8, 0., .75, .72, 0., .6, .5, 0., .93, .95, 0.,
+                      .5, .4, 0., .3, .2});
+  // six anchors in center-size encoding
+  m.SetInput3<float>({
+      0.5, 0.5,   1.0, 1.0,  // anchor #1
+      0.5, 0.5,   1.0, 1.0,  // anchor #2
+      0.5, 0.5,   1.0, 1.0,  // anchor #3
+      0.5, 10.5,  1.0, 1.0,  // anchor #4
+      0.5, 10.5,  1.0, 1.0,  //  anchor #5
+      0.5, 100.5, 1.0, 1.0   // anchor #6
+  });
+  m.Invoke();
+  // detection_boxes
+  // in center-size
+  std::vector<int> output_shape1 = m.GetOutputShape1();
+  EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4));
+  EXPECT_THAT(m.GetOutput1<float>(),
+              ElementsAreArray(ArrayFloatNear({0.0, 10.0, 1.0, 11.0, 0.0, 10.0,
+                                               1.0, 11.0, 0.0, 0.0, 0.0, 0.0},
+                                              3e-1)));
+  // detection_classes
+  std::vector<int> output_shape2 = m.GetOutputShape2();
+  EXPECT_THAT(output_shape2, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput2<float>(),
+              ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1)));
+  // detection_scores
+  std::vector<int> output_shape3 = m.GetOutputShape3();
+  EXPECT_THAT(output_shape3, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput3<float>(),
+              ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.0}, 1e-1)));
+  // num_detections
+  std::vector<int> output_shape4 = m.GetOutputShape4();
+  EXPECT_THAT(output_shape4, ElementsAre(1));
+  EXPECT_THAT(m.GetOutput4<float>(),
+              ElementsAreArray(ArrayFloatNear({2.0}, 1e-1)));
+}
+
+TEST(DetectionPostprocessOpTest, QuantizedTestRegularNMS) {
+  DetectionPostprocessOpModelwithRegularNMS m(
+      {TensorType_UINT8, {1, 6, 4}, -1.0, 1.0},
+      {TensorType_UINT8, {1, 6, 3}, 0.0, 1.0},
+      {TensorType_UINT8, {6, 4}, 0.0, 100.5}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}},
+      {TensorType_FLOAT32, {}}, true);
+  // six boxes in center-size encoding
+  std::vector<std::vector<float>> inputs1 = {{
+      0.0, 0.0,  0.0, 0.0,  // box #1
+      0.0, 1.0,  0.0, 0.0,  // box #2
+      0.0, -1.0, 0.0, 0.0,  // box #3
+      0.0, 0.0,  0.0, 0.0,  // box #4
+      0.0, 1.0,  0.0, 0.0,  // box #5
+      0.0, 0.0,  0.0, 0.0   // box #6
+  }};
+  m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[0]);
+  // class scores - two classes with background
+  std::vector<std::vector<float>> inputs2 = {{0., .9, .8, 0., .75, .72, 0., .6,
+                                              .5, 0., .93, .95, 0., .5, .4, 0.,
+                                              .3, .2}};
+  m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[0]);
+  // six anchors in center-size encoding
+  std::vector<std::vector<float>> inputs3 = {{
+      0.5, 0.5,   1.0, 1.0,  // anchor #1
+      0.5, 0.5,   1.0, 1.0,  // anchor #2
+      0.5, 0.5,   1.0, 1.0,  // anchor #3
+      0.5, 10.5,  1.0, 1.0,  // anchor #4
+      0.5, 10.5,  1.0, 1.0,  // anchor #5
+      0.5, 100.5, 1.0, 1.0   // anchor #6
+  }};
+  m.QuantizeAndPopulate<uint8_t>(m.input3(), inputs3[0]);
+  m.Invoke();
+  // detection_boxes
+  // in center-size
+  std::vector<int> output_shape1 = m.GetOutputShape1();
+  EXPECT_THAT(output_shape1, ElementsAre(1, 3, 4));
+  EXPECT_THAT(m.GetOutput1<float>(),
+              ElementsAreArray(ArrayFloatNear({0.0, 10.0, 1.0, 11.0, 0.0, 10.0,
+                                               1.0, 11.0, 0.0, 0.0, 0.0, 0.0},
+                                              3e-1)));
+  // detection_classes
+  std::vector<int> output_shape2 = m.GetOutputShape2();
+  EXPECT_THAT(output_shape2, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput2<float>(),
+              ElementsAreArray(ArrayFloatNear({1, 0, 0}, 1e-1)));
+  // detection_scores
+  std::vector<int> output_shape3 = m.GetOutputShape3();
+  EXPECT_THAT(output_shape3, ElementsAre(1, 3));
+  EXPECT_THAT(m.GetOutput3<float>(),
+              ElementsAreArray(ArrayFloatNear({0.95, 0.9, 0.0}, 1e-1)));
+  // num_detections
+  std::vector<int> output_shape4 = m.GetOutputShape4();
+  EXPECT_THAT(output_shape4, ElementsAre(1));
+  EXPECT_THAT(m.GetOutput4<float>(),
+              ElementsAreArray(ArrayFloatNear({2.0}, 1e-1)));
+}
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/div.cc b/tensorflow/lite/kernels/div.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/div.cc
rename to tensorflow/lite/kernels/div.cc
index 8d4bb5100664a3397c5d70809884294551aedd4d..fb40953123505afa1fc311b5e16c2c98dbe3be72 100644
--- a/tensorflow/contrib/lite/kernels/div.cc
+++ b/tensorflow/lite/kernels/div.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/div_test.cc b/tensorflow/lite/kernels/div_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/div_test.cc
rename to tensorflow/lite/kernels/div_test.cc
index 97aa2fe04e27416b99f48ab61ece54b745597ae3..68a8855dd1346fb157e8bb3f2dd16ee136b539d9 100644
--- a/tensorflow/contrib/lite/kernels/div_test.cc
+++ b/tensorflow/lite/kernels/div_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/eigen_support.cc b/tensorflow/lite/kernels/eigen_support.cc
similarity index 86%
rename from tensorflow/contrib/lite/kernels/eigen_support.cc
rename to tensorflow/lite/kernels/eigen_support.cc
index e542ad076528fa30152abba074a5c7dcd6ca1f48..bad5975a7c187cc4bdcd65721d397897ff2cf09d 100644
--- a/tensorflow/contrib/lite/kernels/eigen_support.cc
+++ b/tensorflow/lite/kernels/eigen_support.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/eigen_support.h"
+#include "tensorflow/lite/kernels/eigen_support.h"
 
 #include <utility>
 
-#include "tensorflow/contrib/lite/arena_planner.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/arena_planner.h"
+#include "tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace eigen_support {
@@ -34,6 +34,15 @@ static_assert(
     "kDefaultArenaAlignment doesn't comply with Eigen alignment requirement.");
 #endif  // EIGEN_DONT_ALIGN
 
+// Helper routine for updating the global Eigen thread count used for OpenMP.
+void SetEigenNbThreads(int threads) {
+#if defined(EIGEN_HAS_OPENMP)
+  // The global Eigen thread count is only used when OpenMP is enabled. As this
+  // call causes problems with tsan, make it only when OpenMP is available.
+  Eigen::setNbThreads(context->recommended_num_threads);
+#endif  // defined(EIGEN_HAS_OPENMP)
+}
+
 // We have a single global threadpool for all convolution operations. This means
 // that inferences started from different threads may block each other, but
 // since the underlying resource of CPU cores should be consumed by the
@@ -78,7 +87,7 @@ void InitDevice(TfLiteContext* context, RefCountedEigenContext* ptr) {
 }
 
 TfLiteStatus Refresh(TfLiteContext* context) {
-  Eigen::setNbThreads(context->recommended_num_threads);
+  SetEigenNbThreads(context->recommended_num_threads);
 
   auto* ptr = GetEigenContext(context);
   if (ptr != nullptr) {
@@ -94,7 +103,7 @@ void IncrementUsageCounter(TfLiteContext* context) {
   auto* ptr = GetEigenContext(context);
   if (ptr == nullptr) {
     if (context->recommended_num_threads != -1) {
-      Eigen::setNbThreads(context->recommended_num_threads);
+      SetEigenNbThreads(context->recommended_num_threads);
     }
     ptr = new RefCountedEigenContext;
     ptr->type = kTfLiteEigenContext;
diff --git a/tensorflow/contrib/lite/kernels/eigen_support.h b/tensorflow/lite/kernels/eigen_support.h
similarity index 85%
rename from tensorflow/contrib/lite/kernels/eigen_support.h
rename to tensorflow/lite/kernels/eigen_support.h
index feb1543f7be348995ecd406428e2d0016ce1cc58..c24ae6896a7e9783ddd32bc510881ccc1a5d27bf 100644
--- a/tensorflow/contrib/lite/kernels/eigen_support.h
+++ b/tensorflow/lite/kernels/eigen_support.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_EIGEN_SUPPORT_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_EIGEN_SUPPORT_H_
+#ifndef TENSORFLOW_LITE_KERNELS_EIGEN_SUPPORT_H_
+#define TENSORFLOW_LITE_KERNELS_EIGEN_SUPPORT_H_
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace EigenForTFLite {
 struct ThreadPoolDevice;
@@ -38,4 +38,4 @@ const EigenForTFLite::ThreadPoolDevice* GetThreadPoolDevice(
 }  // namespace eigen_support
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_EIGEN_SUPPORT_H_
+#endif  // TENSORFLOW_LITE_KERNELS_EIGEN_SUPPORT_H_
diff --git a/tensorflow/contrib/lite/kernels/elementwise.cc b/tensorflow/lite/kernels/elementwise.cc
similarity index 89%
rename from tensorflow/contrib/lite/kernels/elementwise.cc
rename to tensorflow/lite/kernels/elementwise.cc
index 8c624b320808d2d2c4f7feb8e75d59a548d67160..a79388b900eb89b56a4d18f887dbe52e84fb123f 100644
--- a/tensorflow/contrib/lite/kernels/elementwise.cc
+++ b/tensorflow/lite/kernels/elementwise.cc
@@ -14,9 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include <cmath>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
@@ -74,6 +75,10 @@ inline TfLiteStatus EvalLogical(TfLiteContext* context, TfLiteNode* node,
   return EvalImpl<bool>(context, node, bool_func, kTfLiteBool);
 }
 
+TfLiteStatus AbsEval(TfLiteContext* context, TfLiteNode* node) {
+  return EvalNumeric(context, node, std::abs);
+}
+
 TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
   return EvalNumeric(context, node, std::sin);
 }
@@ -101,6 +106,14 @@ TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace
 }  // namespace elementwise
 
+TfLiteRegistration* Register_ABS() {
+  static TfLiteRegistration r = {
+      /*init=*/nullptr, /*free=*/nullptr,
+      elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
+      elementwise::AbsEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_SIN() {
   static TfLiteRegistration r = {
       /*init=*/nullptr, /*free=*/nullptr,
diff --git a/tensorflow/contrib/lite/kernels/elementwise_test.cc b/tensorflow/lite/kernels/elementwise_test.cc
similarity index 83%
rename from tensorflow/contrib/lite/kernels/elementwise_test.cc
rename to tensorflow/lite/kernels/elementwise_test.cc
index 5dd89a0eaec13b94a8acd326a8c3b0b2e5be3e37..7d24320081257925508b2aa53503c1cf71d0e913 100644
--- a/tensorflow/contrib/lite/kernels/elementwise_test.cc
+++ b/tensorflow/lite/kernels/elementwise_test.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -74,6 +74,19 @@ TEST(ElementWise, Log) {
   EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
 }
 
+TEST(FloatActivationsOpTest, Abs) {
+  ElementWiseOpFloatModel m(BuiltinOperator_ABS, {1, 2, 4, 1});
+  m.PopulateTensor<float>(m.input(), {
+                                         0.f, -6.2f, 2.f, 4.f,  //
+                                         3.f, -2.f, 10.f, 1.f,  //
+                                     });
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()), ElementsAreArray({
+                                                      0.f, 6.2f, 2.f, 4.f,  //
+                                                      3.f, 2.f, 10.f, 1.f,  //
+                                                  }));
+}
+
 TEST(ElementWise, Sqrt) {
   ElementWiseOpFloatModel m(BuiltinOperator_SQRT, {1, 1, 4, 1});
   m.PopulateTensor<float>(m.input(), {0, 1, 2, 4});
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup.cc b/tensorflow/lite/kernels/embedding_lookup.cc
similarity index 88%
rename from tensorflow/contrib/lite/kernels/embedding_lookup.cc
rename to tensorflow/lite/kernels/embedding_lookup.cc
index fe33f98eb0d78d988af4ded79065621b53b659f3..fad32607b4980ce5d0e6b6a8540adf3b19529403 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup.cc
+++ b/tensorflow/lite/kernels/embedding_lookup.cc
@@ -37,10 +37,10 @@ limitations under the License.
 #include <iostream>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -78,7 +78,10 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
   for (int i = 0; i < SizeOfDimension(lookup, 0); i++) {
     int idx = lookup->data.i32[i];
     if (idx >= row_size || idx < 0) {
-      context->ReportError(context, "Embedding Lookup: index out of bounds.");
+      context->ReportError(context,
+                           "Embedding Lookup: index out of bounds. "
+                           "Got %d, and bounds are [0, %d]",
+                           idx, row_size - 1);
       return kTfLiteError;
     } else {
       memcpy(output->data.raw + i * row_bytes,
@@ -104,7 +107,10 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
   for (int i = 0; i < SizeOfDimension(lookup, 0); i++) {
     int idx = lookup->data.i32[i];
     if (idx >= row_size || idx < 0) {
-      context->ReportError(context, "Embedding Lookup: index out of bounds.");
+      context->ReportError(context,
+                           "Embedding Lookup: index out of bounds. "
+                           "Got %d, and bounds are [0, %d]",
+                           idx, row_size - 1);
       return kTfLiteError;
     } else {
       // Dequantize embedding values.
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc b/tensorflow/lite/kernels/embedding_lookup_sparse.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc
rename to tensorflow/lite/kernels/embedding_lookup_sparse.cc
index aa75b03990208f9ee61bb2ef3bfc9cbefcfbaf2b..72bfe5b4f5d71fd9725923514788d6056132ab23 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc
+++ b/tensorflow/lite/kernels/embedding_lookup_sparse.cc
@@ -65,11 +65,11 @@ limitations under the License.
 #include <algorithm>
 #include <cmath>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -188,7 +188,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     int idx = ids->data.i32[i];
     if (idx >= num_rows || idx < 0) {
       context->ReportError(context,
-                           "Embedding Lookup Sparse: index out of bounds.");
+                           "Embedding Lookup Sparse: index out of bounds. "
+                           "Got %d, and bounds are [0, %d]",
+                           idx, num_rows - 1);
       return kTfLiteError;
     }
 
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/lite/kernels/embedding_lookup_sparse_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
rename to tensorflow/lite/kernels/embedding_lookup_sparse_test.cc
index ef2b5422253ea880a9ded4d3c0efc5cec07178a9..0c555fdd7de61fa0ea6fd4e08f2b103f35bff92f 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
+++ b/tensorflow/lite/kernels/embedding_lookup_sparse_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc b/tensorflow/lite/kernels/embedding_lookup_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
rename to tensorflow/lite/kernels/embedding_lookup_test.cc
index 4a88d168c60203f10802e634def9b1d1316c9c6d..8ea98a5f0dcbfbcec826c0b9dee0d28cd0bd2885 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
+++ b/tensorflow/lite/kernels/embedding_lookup_test.cc
@@ -20,10 +20,10 @@ License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/exp.cc b/tensorflow/lite/kernels/exp.cc
similarity index 88%
rename from tensorflow/contrib/lite/kernels/exp.cc
rename to tensorflow/lite/kernels/exp.cc
index 673e7be90a6d5747246e5af4c0656c1142a14bae..607b398ebd73f67e1707d5a4b1260d490beffa7e 100644
--- a/tensorflow/contrib/lite/kernels/exp.cc
+++ b/tensorflow/lite/kernels/exp.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/exp_test.cc b/tensorflow/lite/kernels/exp_test.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/exp_test.cc
rename to tensorflow/lite/kernels/exp_test.cc
index eed67369a1f30e57cd29a3975a899db41938def0..fa71fe351a421a35258101b1ffe26a4baa9c7c61 100644
--- a/tensorflow/contrib/lite/kernels/exp_test.cc
+++ b/tensorflow/lite/kernels/exp_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/expand_dims.cc b/tensorflow/lite/kernels/expand_dims.cc
similarity index 90%
rename from tensorflow/contrib/lite/kernels/expand_dims.cc
rename to tensorflow/lite/kernels/expand_dims.cc
index fa1140b19c09ddc25530b7776849c239d5dd4241..dd2479f34e6e8f0e28ac3ed6885ce2302e993141 100644
--- a/tensorflow/contrib/lite/kernels/expand_dims.cc
+++ b/tensorflow/lite/kernels/expand_dims.cc
@@ -15,12 +15,12 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 namespace tflite {
 namespace ops {
 namespace builtin {
diff --git a/tensorflow/contrib/lite/kernels/expand_dims_test.cc b/tensorflow/lite/kernels/expand_dims_test.cc
similarity index 90%
rename from tensorflow/contrib/lite/kernels/expand_dims_test.cc
rename to tensorflow/lite/kernels/expand_dims_test.cc
index a3bc1813dbc776718ad180a863e363272fbb5ec2..ea0c6c0fc830ec4a033215eff18a66d52c8e12c1 100644
--- a/tensorflow/contrib/lite/kernels/expand_dims_test.cc
+++ b/tensorflow/lite/kernels/expand_dims_test.cc
@@ -14,11 +14,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/fake_quant.cc b/tensorflow/lite/kernels/fake_quant.cc
similarity index 89%
rename from tensorflow/contrib/lite/kernels/fake_quant.cc
rename to tensorflow/lite/kernels/fake_quant.cc
index b51af72fe66a6945e9fee64000a0c09a5dd608bc..9c799a7ec2247d56927da10f0140cffa09b63b3c 100644
--- a/tensorflow/contrib/lite/kernels/fake_quant.cc
+++ b/tensorflow/lite/kernels/fake_quant.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/fake_quant_test.cc b/tensorflow/lite/kernels/fake_quant_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/fake_quant_test.cc
rename to tensorflow/lite/kernels/fake_quant_test.cc
index 11a02f7ed7474e05b887955c111179d2d403f0e6..ce14703421e1cd8bced21aacdd0324352724b2c5 100644
--- a/tensorflow/contrib/lite/kernels/fake_quant_test.cc
+++ b/tensorflow/lite/kernels/fake_quant_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/fill.cc b/tensorflow/lite/kernels/fill.cc
new file mode 100644
index 0000000000000000000000000000000000000000..079ee44f3719f9fa283bf617ee3917eb4c377aff
--- /dev/null
+++ b/tensorflow/lite/kernels/fill.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace fill {
+
+namespace {
+
+constexpr int kDimsTensor = 0;
+constexpr int kValueTensor = 1;
+constexpr int kOutputTensor = 0;
+
+template <typename T>
+TfLiteStatus ResizeOutputImpl(TfLiteContext* context, const TfLiteTensor* dims,
+                              TfLiteTensor* output) {
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(dims->dims->data[0]);
+  for (int i = 0; i < output_shape->size; ++i) {
+    T data = GetTensorData<T>(dims)[i];
+    if (data < 0) {
+      context->ReportError(context, "Fill dimensions must be >= 0", dims->type);
+      return kTfLiteError;
+    }
+    output_shape->data[i] = data;
+  }
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus ResizeOutput(TfLiteContext* context, const TfLiteTensor* dims,
+                          TfLiteTensor* output) {
+  switch (dims->type) {
+    case kTfLiteInt32:
+      return ResizeOutputImpl<int32_t>(context, dims, output);
+    case kTfLiteInt64:
+      return ResizeOutputImpl<int64_t>(context, dims, output);
+    default:
+      context->ReportError(
+          context,
+          "Fill only currently supports int32, int64 for input 0, "
+          "got %d.",
+          dims->type);
+      return kTfLiteError;
+  }
+}
+
+}  // namespace
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* dims = GetInput(context, node, kDimsTensor);
+  const TfLiteTensor* value = GetInput(context, node, kValueTensor);
+
+  // Make sure the 1st input tensor is 1-D.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(dims), 1);
+
+  // Make sure the 1st input tensor is int32 or int64.
+  const auto dtype = dims->type;
+  TF_LITE_ENSURE(context, dtype == kTfLiteInt32 || dtype == kTfLiteInt64);
+
+  // Make sure the 2nd input tensor is a scalar.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(value), 0);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = value->type;
+
+  if (IsConstantTensor(dims)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutput(context, dims, output));
+  } else {
+    SetTensorToDynamic(output);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* value = GetInput(context, node, kValueTensor);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (IsDynamicTensor(output)) {
+    const TfLiteTensor* dims = GetInput(context, node, kDimsTensor);
+    TF_LITE_ENSURE_OK(context, ResizeOutput(context, dims, output));
+  }
+#define TF_LITE_FILL(data_type)                                               \
+  reference_ops::Fill(GetTensorShape(value), GetTensorData<data_type>(value), \
+                      GetTensorShape(output),                                 \
+                      GetTensorData<data_type>(output))
+  switch (output->type) {
+    case kTfLiteInt32:
+      TF_LITE_FILL(int32_t);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_FILL(int64_t);
+      break;
+    case kTfLiteFloat32:
+      TF_LITE_FILL(float);
+      break;
+    default:
+      context->ReportError(
+          context,
+          "Fill only currently supports int32, int64, float32 for input 1,"
+          "got %d.",
+          value->type);
+      return kTfLiteError;
+  }
+#undef TF_LITE_FILL
+  return kTfLiteOk;
+}
+
+}  // namespace fill
+
+TfLiteRegistration* Register_FILL() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 fill::Prepare, fill::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/fill_test.cc b/tensorflow/lite/kernels/fill_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..08044d76f9d95774fa1b0e37ebb6a9716e9809cb
--- /dev/null
+++ b/tensorflow/lite/kernels/fill_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::IsEmpty;
+
+class FillOpModel : public SingleOpModel {
+ public:
+  explicit FillOpModel(const TensorData& input1, const TensorData& input2) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(input1);
+    SetBuiltinOp(BuiltinOperator_FILL, BuiltinOptions_FillOptions,
+                 CreateFillOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+  int output() { return output_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(FillOpModel, FillInt32) {
+  FillOpModel m({TensorType_INT32, {2}}, {TensorType_INT32});
+  m.PopulateTensor<int32_t>(m.input1(), {2, 3});
+  m.PopulateTensor<int32_t>(m.input2(), {-11});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int32_t>(m.output()),
+              ElementsAreArray({-11, -11, -11, -11, -11, -11}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 3}));
+}
+
+TEST(FillOpModel, FillInt64) {
+  FillOpModel m({TensorType_INT32, {2}}, {TensorType_INT64});
+  m.PopulateTensor<int32_t>(m.input1(), {2, 4});
+  m.PopulateTensor<int64_t>(m.input2(), {2 ^ 45});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int64_t>(m.output()),
+              ElementsAreArray({2 ^ 45, 2 ^ 45, 2 ^ 45, 2 ^ 45, 2 ^ 45, 2 ^ 45,
+                                2 ^ 45, 2 ^ 45}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 4}));
+}
+
+TEST(FillOpModel, FillFloat) {
+  FillOpModel m({TensorType_INT64, {3}}, {TensorType_FLOAT32});
+  m.PopulateTensor<int64_t>(m.input1(), {2, 2, 2});
+  m.PopulateTensor<float>(m.input2(), {4.0});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray({4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 2, 2}));
+}
+
+TEST(FillOpModel, FillOutputScalar) {
+  FillOpModel m({TensorType_INT64, {0}}, {TensorType_FLOAT32});
+  m.PopulateTensor<float>(m.input2(), {4.0});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()), ElementsAreArray({4.0}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), IsEmpty());
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/floor.cc b/tensorflow/lite/kernels/floor.cc
similarity index 88%
rename from tensorflow/contrib/lite/kernels/floor.cc
rename to tensorflow/lite/kernels/floor.cc
index 59ff77f35b8d3f1e4abb41687b2985cd75dd45a2..aa117e3cacfc4624d347ba812e23801c223bae7b 100644
--- a/tensorflow/contrib/lite/kernels/floor.cc
+++ b/tensorflow/lite/kernels/floor.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/floor_div.cc b/tensorflow/lite/kernels/floor_div.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/floor_div.cc
rename to tensorflow/lite/kernels/floor_div.cc
index 5d62cd27550f4f78d33e2f357cf6553a15fd2356..9d404af5b0b5e94d56c17c2be49a91ad2bb60b2e 100644
--- a/tensorflow/contrib/lite/kernels/floor_div.cc
+++ b/tensorflow/lite/kernels/floor_div.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/floor_div_test.cc b/tensorflow/lite/kernels/floor_div_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/floor_div_test.cc
rename to tensorflow/lite/kernels/floor_div_test.cc
index eea69b61ac161ea66d62e06e6d778666f289f510..8816260d9b45da705285f540914d8436019b8e3b 100644
--- a/tensorflow/contrib/lite/kernels/floor_div_test.cc
+++ b/tensorflow/lite/kernels/floor_div_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/floor_mod.cc b/tensorflow/lite/kernels/floor_mod.cc
new file mode 100644
index 0000000000000000000000000000000000000000..878716a5b4a97be62aa3f966b03bd90194e75aae
--- /dev/null
+++ b/tensorflow/lite/kernels/floor_mod.cc
@@ -0,0 +1,176 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <functional>
+#include <type_traits>
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+// TODO(b/117523611): We should factor out a binary_op and put binary ops there.
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace floor_mod {
+namespace {
+
+// Input/output tensor index.
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+// Op data for floor_mod op.
+struct OpData {
+  bool requires_broadcast;
+};
+
+struct FloatMod {
+  float operator()(const float lhs, const float rhs) const {
+    return std::fmod(lhs, rhs);
+  }
+};
+
+// TODO(b/117912007): Move the implementation to reference_ops.h
+// TODO(b/117912880): Support quantization.
+template <typename T>
+T FloorMod(T input1, T input2) {
+  using ModFunc = typename std::conditional<std::is_integral<T>::value,
+                                            std::modulus<T>, FloatMod>::type;
+
+  ModFunc mod_func;
+  T trunc_mod = mod_func(input1, input2);
+  return (input1 < T(0)) == (input2 < T(0))
+             ? trunc_mod
+             : mod_func(trunc_mod + input2, input2);
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  // Reinterprete the opaque data provided by user.
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
+
+  const TfLiteType type = input1->type;
+  if (type != kTfLiteInt32 && type != kTfLiteFloat32 && type != kTfLiteInt64) {
+    context->ReportError(context, "Type '%s' is not supported by floor_mod.",
+                         TfLiteTypeGetName(type));
+    return kTfLiteError;
+  }
+  output->type = type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <typename T>
+TfLiteStatus EvalImpl(TfLiteContext* context, bool requires_broadcast,
+                      const TfLiteTensor* input1, const TfLiteTensor* input2,
+                      TfLiteTensor* output) {
+  const T* denominator_data = GetTensorData<T>(input2);
+
+  if (input2->type == kTfLiteInt32 || input2->type == kTfLiteInt64) {
+    // Validate the denominator only for integer.
+    const int num_elements = NumElements(input2);
+    for (int i = 0; i < num_elements; ++i) {
+      if (denominator_data[i] == 0) {
+        context->ReportError(context, "Division by 0");
+        return kTfLiteError;
+      }
+    }
+  }
+  if (requires_broadcast) {
+    reference_ops::BroadcastBinaryFunction4DSlow<T, T, T>(
+        GetTensorShape(input1), GetTensorData<T>(input1),
+        GetTensorShape(input2), denominator_data, GetTensorShape(output),
+        GetTensorData<T>(output), FloorMod<T>);
+  } else {
+    reference_ops::BinaryFunction<T, T, T>(
+        GetTensorShape(input1), GetTensorData<T>(input1),
+        GetTensorShape(input2), GetTensorData<T>(input2),
+        GetTensorShape(output), GetTensorData<T>(output), FloorMod<T>);
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (input1->type) {
+    case kTfLiteInt32: {
+      return EvalImpl<int32_t>(context, data->requires_broadcast, input1,
+                               input2, output);
+    }
+    case kTfLiteInt64: {
+      return EvalImpl<int64_t>(context, data->requires_broadcast, input1,
+                               input2, output);
+    }
+    case kTfLiteFloat32: {
+      return EvalImpl<float>(context, data->requires_broadcast, input1, input2,
+                             output);
+    }
+    default: {
+      context->ReportError(context, "Type '%s' is not supported by floor_mod.",
+                           TfLiteTypeGetName(input1->type));
+      return kTfLiteError;
+    }
+  }
+}
+
+}  // namespace
+}  // namespace floor_mod
+
+TfLiteRegistration* Register_FLOOR_MOD() {
+  // Init, Free, Prepare, Eval are satisfying the Interface required by
+  // TfLiteRegistration.
+  static TfLiteRegistration r = {floor_mod::Init, floor_mod::Free,
+                                 floor_mod::Prepare, floor_mod::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/floor_mod_test.cc b/tensorflow/lite/kernels/floor_mod_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9d78673f320d8f797012dfc63220a99b091a6419
--- /dev/null
+++ b/tensorflow/lite/kernels/floor_mod_test.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+template <typename T>
+class FloorModModel : public SingleOpModel {
+ public:
+  FloorModModel(const TensorData& input1, const TensorData& input2,
+                const TensorData& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_FLOOR_MOD, BuiltinOptions_FloorModOptions,
+                 CreateFloorModOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(FloorModModel, Simple) {
+  FloorModModel<int32_t> model({TensorType_INT32, {1, 2, 2, 1}},
+                               {TensorType_INT32, {1, 2, 2, 1}},
+                               {TensorType_INT32, {}});
+  model.PopulateTensor<int32_t>(model.input1(), {10, 9, 11, 3});
+  model.PopulateTensor<int32_t>(model.input2(), {2, 2, 3, 4});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(0, 1, 2, 3));
+}
+
+TEST(FloorModModel, NegativeValue) {
+  FloorModModel<int32_t> model({TensorType_INT32, {1, 2, 2, 1}},
+                               {TensorType_INT32, {1, 2, 2, 1}},
+                               {TensorType_INT32, {}});
+  model.PopulateTensor<int32_t>(model.input1(), {10, -9, -11, 7});
+  model.PopulateTensor<int32_t>(model.input2(), {2, 2, -3, -4});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(0, 1, -2, -1));
+}
+
+TEST(FloorModModel, BroadcastFloorMod) {
+  FloorModModel<int32_t> model({TensorType_INT32, {1, 2, 2, 1}},
+                               {TensorType_INT32, {1}}, {TensorType_INT32, {}});
+  model.PopulateTensor<int32_t>(model.input1(), {10, -9, -11, 7});
+  model.PopulateTensor<int32_t>(model.input2(), {-3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(-2, 0, -2, -2));
+}
+
+TEST(FloorModModel, Int64WithBroadcast) {
+  FloorModModel<int64_t> model({TensorType_INT64, {1, 2, 2, 1}},
+                               {TensorType_INT64, {1}}, {TensorType_INT64, {}});
+  model.PopulateTensor<int64_t>(model.input1(), {10, -9, -11, (1LL << 34) + 9});
+  model.PopulateTensor<int64_t>(model.input2(), {-(1LL << 33)});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAre(-8589934582, -9, -11, -8589934583));
+}
+
+TEST(FloorModModel, FloatSimple) {
+  FloorModModel<float> model({TensorType_FLOAT32, {1, 2, 2, 1}},
+                             {TensorType_FLOAT32, {1, 2, 2, 1}},
+                             {TensorType_FLOAT32, {}});
+  model.PopulateTensor<float>(model.input1(), {10, 9, 11, 3});
+  model.PopulateTensor<float>(model.input2(), {2, 2, 3, 4});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(0, 1, 2, 3));
+}
+
+TEST(FloorModModel, FloatNegativeValue) {
+  FloorModModel<float> model({TensorType_FLOAT32, {1, 2, 2, 1}},
+                             {TensorType_FLOAT32, {1, 2, 2, 1}},
+                             {TensorType_FLOAT32, {}});
+  model.PopulateTensor<float>(model.input1(), {10, -9, -11, 7});
+  model.PopulateTensor<float>(model.input2(), {2, 2, -3, -4});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(0, 1, -2, -1));
+}
+
+TEST(FloorModModel, FloatBroadcastFloorMod) {
+  FloorModModel<float> model({TensorType_FLOAT32, {1, 2, 2, 1}},
+                             {TensorType_FLOAT32, {1}},
+                             {TensorType_FLOAT32, {}});
+  model.PopulateTensor<float>(model.input1(), {10, -9, -11, 7});
+  model.PopulateTensor<float>(model.input2(), {-3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 2, 2, 1));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(-2, 0, -2, -2));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/floor_test.cc b/tensorflow/lite/kernels/floor_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/floor_test.cc
rename to tensorflow/lite/kernels/floor_test.cc
index b71e0400b6dc92899721342fc4ebbd51a8876455..9bcbdba8a4f0b20213715c02112f47eca0ce013d 100644
--- a/tensorflow/contrib/lite/kernels/floor_test.cc
+++ b/tensorflow/lite/kernels/floor_test.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a1eecb284ab647e8b7fc7b18dfd8ad82aedeece3
--- /dev/null
+++ b/tensorflow/lite/kernels/fully_connected.cc
@@ -0,0 +1,508 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/gemm_support.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace fully_connected {
+
+// This file has four implementations of FullyConnected
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+  kPie,  // Used by the PIE team
+};
+
+struct OpData {
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multiplier plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  // The index of the temporary tensor where the quantized inputs are cached.
+  int scratch_tensor_index;
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+constexpr int kShuffledInputWorkspaceTensor = 1;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  // This is a builtin op, so we don't use the contents in 'buffer', if any.
+  // Instead, we allocate a new object to carry information from Prepare() to
+  // Eval().
+  gemm_support::IncrementUsageCounter(context);
+  auto* op_data = new OpData();
+  context->AddTensors(context, /*tensors_to_add=*/2,
+                      &op_data->scratch_tensor_index);
+  return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  gemm_support::DecrementUsageCounter(context);
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 3);
+  // Shuffled formats need a workspace to store the shuffled input activations.
+  const int expected_outputs_count =
+      params->weights_format == kTfLiteFullyConnectedWeightsFormatDefault ? 1
+                                                                          : 2;
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, expected_outputs_count);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  int input_size = 1;
+  for (int i = 0; i < input->dims->size; i++) {
+    input_size *= input->dims->data[i];
+  }
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 2);
+  const int batch_size = input_size / filter->dims->data[1];
+  const int num_units = filter->dims->data[0];
+
+  TF_LITE_ENSURE_EQ(context, input_size, batch_size * filter->dims->data[1]);
+  if (bias) {
+    TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
+  }
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  TfLiteType data_type = input->type;
+  if (data_type != kTfLiteFloat32 && data_type != kTfLiteInt32) {
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    int exponent;
+    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
+    data->output_shift = -exponent;
+    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
+        context, params->activation, output, &data->output_activation_min,
+        &data->output_activation_max));
+  }
+
+  // If we have to perform on-the-fly quantization (with quantized weights and
+  // float inputs) first we need to quantize the inputs. Allocate a temporary
+  // buffer to store the intermediate quantized values.
+  if (input->type == kTfLiteFloat32 && filter->type == kTfLiteUInt8) {
+    TfLiteIntArrayFree(node->temporaries);
+    node->temporaries = TfLiteIntArrayCreate(2);
+    node->temporaries->data[0] = data->scratch_tensor_index;
+
+    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
+    input_quantized->type = kTfLiteUInt8;
+    input_quantized->allocation_type = kTfLiteArenaRw;
+
+    // TODO(raziel): add this logic to ResizeTensor.
+    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
+      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
+                                                       input_quantized_size));
+    }
+    node->temporaries->data[1] = data->scratch_tensor_index + 1;
+    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/1);
+    scaling_factors->type = kTfLiteFloat32;
+    scaling_factors->allocation_type = kTfLiteArenaRw;
+    int scaling_dims[1] = {batch_size};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = batch_size;
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
+                                                       scaling_factors_size));
+    }
+  }
+
+  // Resize output.
+  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
+  output_size_array->data[0] = batch_size;
+  output_size_array->data[1] = num_units;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalPie(TfLiteContext* context, TfLiteNode* node,
+                     TfLiteFullyConnectedParams* params, OpData* data,
+                     const TfLiteTensor* input, const TfLiteTensor* filter,
+                     const TfLiteTensor* bias, TfLiteTensor* output) {
+  int total_input_size = 1;
+  for (int i = 0; i < input->dims->size; i++) {
+    total_input_size *= input->dims->data[i];
+  }
+
+  int input_size = filter->dims->data[1];
+  const int batch_size = total_input_size / filter->dims->data[1];
+  const int num_units = filter->dims->data[0];
+
+  // Output = bias if bias tensor exists.
+  if (bias) {
+    tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size,
+                                          output->data.f);
+  } else {
+    tensor_utils::ZeroVector(output->data.f, batch_size * num_units);
+  }
+
+  // Compute output += weight * input
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      filter->data.f, num_units, input_size, input->data.f, batch_size,
+      output->data.f, /*result_stride=*/1);
+
+  // Apply activation function
+  tensor_utils::ApplyActivationToVector(output->data.f, batch_size * num_units,
+                                        params->activation, output->data.f);
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
+                        TfLiteFullyConnectedParams* params, OpData* data,
+                        const TfLiteTensor* input, const TfLiteTensor* filter,
+                        const TfLiteTensor* bias, TfLiteTensor* input_quantized,
+                        TfLiteTensor* scaling_factors, TfLiteTensor* output) {
+  // Check the types for this hybrid Op.
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, filter->type, kTfLiteUInt8);
+  TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+
+  int total_input_size = 1;
+  for (int i = 0; i < input->dims->size; i++) {
+    total_input_size *= input->dims->data[i];
+  }
+
+  const int input_size = filter->dims->data[1];
+  const int batch_size = total_input_size / filter->dims->data[1];
+  const int num_units = filter->dims->data[0];
+
+  // Output = bias if bias tensor exists.
+  if (bias) {
+    tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size,
+                                          output->data.f);
+  } else {
+    tensor_utils::ZeroVector(output->data.f, batch_size * num_units);
+  }
+
+  // Save matrix multiplication computation for all zero input.
+  if (tensor_utils::IsZeroVector(input->data.f, total_input_size)) {
+    tensor_utils::ApplyActivationToVector(output->data.f,
+                                          batch_size * num_units,
+                                          params->activation, output->data.f);
+    return kTfLiteOk;
+  }
+
+  // Quantize input from float to uint8 + quantization params (scaling factor).
+  float unused_min, unused_max;
+  float* scaling_factors_ptr = scaling_factors->data.f;
+  int8_t* quant_data = reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+
+  // Quantize each batch independently.
+  for (int b = 0; b < batch_size; ++b) {
+    const int offset = b * input_size;
+    tensor_utils::SymmetricQuantizeFloats(input->data.f + offset, input_size,
+                                          quant_data + offset, &unused_min,
+                                          &unused_max, &scaling_factors_ptr[b]);
+    // Incorporate scaling of the filter.
+    scaling_factors_ptr[b] *= filter->params.scale;
+  }
+
+  // Compute output += weight * quantized_input
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      reinterpret_cast<int8_t*>(filter->data.uint8), num_units, input_size,
+      quant_data, scaling_factors_ptr, batch_size, output->data.f,
+      /*result_stride=*/1);
+
+  // Apply activation function to floats.
+  tensor_utils::ApplyActivationToVector(output->data.f, batch_size * num_units,
+                                        params->activation, output->data.f);
+  return kTfLiteOk;
+}
+
+#define TF_LITE_MACRO_DISPATCH(macro_name, params, target_namespace) \
+  if (params->activation == kTfLiteActNone) {                        \
+    macro_name(target_namespace, kNone);                             \
+  }                                                                  \
+  if (params->activation == kTfLiteActRelu) {                        \
+    macro_name(target_namespace, kRelu);                             \
+  }                                                                  \
+  if (params->activation == kTfLiteActRelu6) {                       \
+    macro_name(target_namespace, kRelu6);                            \
+  }
+
+template <KernelType kernel_type>
+TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                           TfLiteFullyConnectedParams* params, OpData* data,
+                           const TfLiteTensor* input,
+                           const TfLiteTensor* filter, const TfLiteTensor* bias,
+                           TfLiteTensor* output) {
+  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+
+  int32_t input_offset = -input->params.zero_point;
+  int32_t filter_offset = -filter->params.zero_point;
+  int32_t output_offset = output->params.zero_point;
+#define TF_LITE_FULLY_CONNECTED(type, output_data_type)                  \
+  {                                                                      \
+    FullyConnectedParams op_params;                                      \
+    op_params.input_offset = input_offset;                               \
+    op_params.weights_offset = filter_offset;                            \
+    op_params.output_offset = output_offset;                             \
+    op_params.output_multiplier = data->output_multiplier;               \
+    op_params.output_shift = -data->output_shift;                        \
+    op_params.quantized_activation_min = data->output_activation_min;    \
+    op_params.quantized_activation_max = data->output_activation_max;    \
+    type::FullyConnected(                                                \
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
+        GetTensorShape(output), GetTensorData<output_data_type>(output), \
+        gemm_context);                                                   \
+  }
+  if (kernel_type == kReference) {
+    switch (output->type) {
+      case kTfLiteUInt8:
+        TF_LITE_FULLY_CONNECTED(reference_ops, uint8_t);
+        break;
+      case kTfLiteInt16:
+        TF_LITE_FULLY_CONNECTED(reference_ops, int16_t);
+        break;
+      default:
+        context->ReportError(
+            context,
+            "Quantized FullyConnected expects output data type uint8 or int16");
+        return kTfLiteError;
+    }
+  } else if (kernel_type == kPie && input->type == kTfLiteFloat32) {
+    // Pie currently only supports quantized models and float inputs/outputs.
+    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
+    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/1);
+    return EvalHybrid(context, node, params, data, input, filter, bias,
+                      input_quantized, scaling_factors, output);
+  } else {
+    switch (output->type) {
+      case kTfLiteUInt8:
+        TF_LITE_FULLY_CONNECTED(optimized_ops, uint8_t);
+        break;
+      case kTfLiteInt16:
+        TF_LITE_FULLY_CONNECTED(optimized_ops, int16_t);
+        break;
+      default:
+        context->ReportError(
+            context,
+            "Quantized FullyConnected expects output data type uint8 or int16");
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_FULLY_CONNECTED
+
+  return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+TfLiteStatus EvalShuffledQuantized(TfLiteContext* context, TfLiteNode* node,
+                                   TfLiteFullyConnectedParams* params,
+                                   OpData* data, const TfLiteTensor* input,
+                                   const TfLiteTensor* filter,
+                                   const TfLiteTensor* bias,
+                                   TfLiteTensor* output,
+                                   TfLiteTensor* shuffled_input_workspace) {
+  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+
+  // TODO(b/110697972) decide more consistently if / how / where we want
+  // to perform this kind of runtime data type checks.
+  if (input->type != kTfLiteUInt8 || filter->type != kTfLiteUInt8 ||
+      bias->type != kTfLiteInt32 || output->type != kTfLiteInt16 ||
+      shuffled_input_workspace->type != kTfLiteUInt8) {
+    context->ReportError(context, "Unexpected data type");
+    return kTfLiteError;
+  }
+
+#define TF_LITE_SHUFFLED_FULLY_CONNECTED(type)                           \
+  {                                                                      \
+    FullyConnectedParams op_params;                                      \
+    op_params.output_multiplier = data->output_multiplier;               \
+    op_params.output_shift = -data->output_shift;                        \
+    op_params.quantized_activation_min = data->output_activation_min;    \
+    op_params.quantized_activation_max = data->output_activation_max;    \
+    type::ShuffledFullyConnected(                                        \
+        op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \
+        GetTensorShape(filter), GetTensorData<uint8_t>(filter),          \
+        GetTensorShape(bias), GetTensorData<int32_t>(bias),              \
+        GetTensorShape(output), GetTensorData<int16_t>(output),          \
+        GetTensorData<uint8_t>(shuffled_input_workspace), gemm_context); \
+  }
+  if (kernel_type == kReference) {
+    TF_LITE_SHUFFLED_FULLY_CONNECTED(reference_ops);
+  } else {
+    TF_LITE_SHUFFLED_FULLY_CONNECTED(optimized_ops);
+  }
+#undef TF_LITE_SHUFFLED_FULLY_CONNECTED
+
+  return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
+                       TfLiteFullyConnectedParams* params, OpData* data,
+                       const TfLiteTensor* input, const TfLiteTensor* filter,
+                       const TfLiteTensor* bias, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRange(params->activation, &output_activation_min,
+                           &output_activation_max);
+#define TF_LITE_FULLY_CONNECTED(type)                                         \
+  {                                                                           \
+    FullyConnectedParams op_params;                                           \
+    op_params.float_activation_min = output_activation_min;                   \
+    op_params.float_activation_max = output_activation_max;                   \
+    type::FullyConnected(op_params, GetTensorShape(input),                    \
+                         GetTensorData<float>(input), GetTensorShape(filter), \
+                         GetTensorData<float>(filter), GetTensorShape(bias),  \
+                         GetTensorData<float>(bias), GetTensorShape(output),  \
+                         GetTensorData<float>(output));                       \
+  }
+  if (kernel_type == kReference) {
+    TF_LITE_FULLY_CONNECTED(reference_ops);
+  } else if (kernel_type == kPie) {
+    return EvalPie(context, node, params, data, input, filter, bias, output);
+  } else {
+    TF_LITE_FULLY_CONNECTED(optimized_ops);
+  }
+#undef TF_LITE_FULLY_CONNECTED
+
+  return kTfLiteOk;
+}
+
+#undef TF_LITE_MACRO_DISPATCH
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (filter->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      return EvalFloat<kernel_type>(context, node, params, data, input, filter,
+                                    bias, output);
+    case kTfLiteUInt8:
+      if (params->weights_format ==
+          kTfLiteFullyConnectedWeightsFormatShuffled4x16Int8) {
+        TfLiteTensor* shuffled_input_workspace =
+            GetOutput(context, node, kShuffledInputWorkspaceTensor);
+        return EvalShuffledQuantized<kernel_type>(context, node, params, data,
+                                                  input, filter, bias, output,
+                                                  shuffled_input_workspace);
+      } else if (params->weights_format ==
+                 kTfLiteFullyConnectedWeightsFormatDefault) {
+        return EvalQuantized<kernel_type>(context, node, params, data, input,
+                                          filter, bias, output);
+      } else {
+        context->ReportError(context,
+                             "Unhandled fully-connected weights format");
+        return kTfLiteError;
+      }
+    default:
+      context->ReportError(context, "Type %d not currently supported.",
+                           filter->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace fully_connected
+
+TfLiteRegistration* Register_FULLY_CONNECTED_REF() {
+  static TfLiteRegistration r = {
+      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+      fully_connected::Eval<fully_connected::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT() {
+  static TfLiteRegistration r = {
+      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+      fully_connected::Eval<fully_connected::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+      fully_connected::Eval<fully_connected::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED_PIE() {
+  static TfLiteRegistration r = {fully_connected::Init, fully_connected::Free,
+                                 fully_connected::Prepare,
+                                 fully_connected::Eval<fully_connected::kPie>};
+  return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED() {
+  // TODO(ahentz): We don't have a dedicated quantized version of the PIE
+  // kernel. For now, the quantized version just defer to the corresponding
+  // optimized MINI kernel. At some point we will allow different libraries to
+  // be built with different kernels, but for now we have to pick one here.
+  return Register_FULLY_CONNECTED_PIE();
+#ifdef USE_NEON
+  return Register_FULLY_CONNECTED_NEON_OPT();
+#else
+  return Register_FULLY_CONNECTED_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/fully_connected_test.cc b/tensorflow/lite/kernels/fully_connected_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3351a30b123b12751f2411f71037f2ecfb1d4b43
--- /dev/null
+++ b/tensorflow/lite/kernels/fully_connected_test.cc
@@ -0,0 +1,766 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite FULLY_CONNECTED op.
+
+#include <iomanip>
+#include <random>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_FULLY_CONNECTED_REF();
+TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT();
+TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT();
+TfLiteRegistration* Register_FULLY_CONNECTED_PIE();
+
+}  // namespace builtin
+}  // namespace ops
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+static float fully_connected_input[] = {
+    0.503691, 0.196961, 0.521017, 0.554248, 0.288678, 0.792476, 0.561653,
+    0.462230, 0.650736, 0.163132, 0.029658, 0.411544, 0.470539, 0.572390,
+    0.538755, 0.212030, 0.264309, 0.193908, 0.777480, 0.745661, 0.423314,
+    0.470804, 0.175501, 0.492225, 0.192743, 0.540183, 0.372514, 0.446550,
+    0.498173, 0.126472, 0.132706, 0.001864, 0.323433, 0.653723, 0.556112,
+    0.612111, 0.446199, 0.117765, 0.074341, 0.096935, 0.280897, 0.103999,
+    0.508479, 0.751437, 0.676389, 0.047234, 0.963467, 0.940698, 0.241142,
+    0.740947, 0.686359, 0.664456, 0.211751, 0.861860, 0.156681, 0.404494,
+    0.402043, 0.529195, 0.851044, 0.900216, 0.655667, 0.983750, 0.902081,
+    0.979100, 0.637473, 0.458193, 0.591211, 0.083671, 0.575958, 0.665552,
+    0.180606, 0.856856, 0.769551, 0.689086, 0.608293, 0.445940, 0.736320,
+    0.571760, 0.386637, 0.977461, 0.312707, 0.072996, 0.641918, 0.524458,
+    0.934856, 0.798598, 0.928951, 0.336899, 0.327793, 0.779995, 0.237115,
+    0.983460, 0.763746, 0.139196, 0.962560, 0.401218, 0.597389, 0.553771,
+    0.484890, 0.173347, 0.219322, 0.665496, 0.030203, 0.988873, 0.354582,
+    0.638496, 0.434813, 0.090902, 0.210256, 0.821450, 0.068363, 0.522962,
+    0.894446, 0.710280, 0.047420, 0.829302, 0.508879, 0.976371, 0.166202,
+    0.836672, 0.756367, 0.403317, 0.820132, 0.520112, 0.542513, 0.782691,
+    0.921330, 0.139902};
+
+static float fully_connected_golden_output[] = {
+    0,        0.0732134,   0,        0,          0,         0.280859,
+    0,        0.128927,    0,        0.0777251,  0,         0.270268,
+    0.271435, 0.0173503,   0.335465, 0.235562,
+
+    0,        0.0745866,   0,        0.051611,   0,         0.253876,
+    0,        0.0814873,   0,        0.104104,   0,         0.248529,
+    0.264194, 0,           0.302973, 0.166252,
+
+    0,        0.0170409,   0,        0.0509851,  0,         0.212834,
+    0,        0.0208326,   0,        0.129932,   0.203978,  0.103428,
+    0.298051, 0,           0.332233, 0.00445903,
+
+    0,        0.125246,    0,        0.0735336,  0,         0.0910256,
+    0,        0,           0,        0.18933,    0.378111,  0.0712443,
+    0.277298, 0.0123414,   0.267454, 0,
+
+    0,        0.14687,     0,        0.155495,   0.0300215, 0.147256,
+    0,        0,           0,        0.156412,   0.434914,  0.0461529,
+    0.246508, 0,           0.363138, 0,
+
+    0,        0,           0,        0.0212949,  0,         0.301708,
+    0,        0.35497,     0,        0.406223,   0.0260211, 0.049195,
+    0.197161, 0,           0.37316,  0,
+
+    0,        0.221783,    0,        0,          0.0116515, 0.281945,
+    0,        0,           0,        0,          0.285626,  0.181773,
+    0.296401, 0.170452,    0.367135, 0.142597,
+
+    0,        0,           0,        0,          0,         0.418886,
+    0,        0.291063,    0,        0.227541,   0.0424759, 0.27589,
+    0.398286, 0.177146,    0.40359,  0.121452,
+
+    0,        0.0834884,   0,        0,          0,         0.287441,
+    0,        0.0046838,   0,        0.0122087,  0,         0.217376,
+    0.140183, 0.0948412,   0.436677, 0.0589876,
+
+    0,        0.0289969,   0,        0.0921397,  0,         0.396802,
+    0,        0.0126157,   0,        0.0968433,  0,         0.172271,
+    0.173295, 0.0664741,   0.53645,  0.00915603,
+
+    0,        0,           0,        0,          0,         0.147942,
+    0,        0.263795,    0,        0.39782,    0,         0.382435,
+    0.561072, 0.0579847,   0.145712, 0.13508,
+
+    0,        0,           0,        0.16382,    0,         0.322294,
+    0,        0.163798,    0,        0.405211,   0.367953,  0.076852,
+    0.342473, 0.0834118,   0.377537, 0,
+
+    0,        0.206,       0,        0,          0,         0.375769,
+    0,        0,           0,        0,          0,         0.125165,
+    0,        0.105591,    0.52055,  0.0536445,
+
+    0,        0.259261,    0,        0,          0,         0.247707,
+    0,        0,           0,        0,          0,         0.215862,
+    0.149153, 0.224678,    0.359519, 0.129419,
+
+    0,        0.17611,     0,        0.280895,   0,         0.576484,
+    0,        0.000418848, 0,        0,          0,         0.151112,
+    0.211902, 0,           0.566341, 0.106305,
+
+    0,        0.0246284,   0,        0,          0,         0.196267,
+    0,        0.0248624,   0,        0.265635,   0,         0.436199,
+    0.408079, 0.134514,    0.328489, 0.411368};
+
+class BaseFullyConnectedOpModel : public SingleOpModel {
+ public:
+  // TODO(ahentz): test different activation types too.
+  BaseFullyConnectedOpModel(
+      TfLiteRegistration* registration, int units, int batches,
+      const TensorData& input, const TensorData& output = {TensorType_FLOAT32},
+      ActivationFunctionType activation_func = ActivationFunctionType_RELU,
+      FullyConnectedOptionsWeightsFormat weights_format =
+          FullyConnectedOptionsWeightsFormat_DEFAULT)
+      : batches_(batches), units_(units) {
+    int total_input_size = 1;
+    for (int i = 0; i < input.shape.size(); ++i) {
+      total_input_size *= input.shape[i];
+    }
+    input_size_ = total_input_size / batches_;
+
+    input_ = AddInput(input);
+    weights_ =
+        AddInput({input.type, {units_, input_size_}, input.min, input.max});
+
+    if (input.type == TensorType_FLOAT32) {
+      bias_ = AddInput({TensorType_FLOAT32, {units_}});
+    } else {
+      // This is a quantized version. The scale of 'bias' depends on the scales
+      // of input and filter. Supposedly this is correctly set during quantized
+      // training.
+      auto bias_scale = GetScale(input_) * GetScale(weights_);
+      TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale};
+      bias_ = AddInput(bias);
+    }
+
+    output_ = AddOutput(output);
+    if (weights_format != FullyConnectedOptionsWeightsFormat_DEFAULT) {
+      AddOutput({TensorType_UINT8, input.shape});
+    }
+
+    SetBuiltinOp(
+        BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
+        CreateFullyConnectedOptions(builder_, activation_func, weights_format)
+            .Union());
+    resolver_ = absl::make_unique<SingleOpResolver>(
+        BuiltinOperator_FULLY_CONNECTED, registration);
+    BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
+  }
+
+  int input_size() { return input_size_; }
+  int num_units() { return units_; }
+  int num_batches() { return batches_; }
+
+ protected:
+  int input_;
+  int weights_;
+  int bias_;
+  int output_;
+
+  int batches_;
+  int units_;
+  int input_size_;
+};
+
+class FloatFullyConnectedOpModel : public BaseFullyConnectedOpModel {
+ public:
+  using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel;
+
+  void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); }
+
+  void SetWeights(const std::vector<float>& f) { PopulateTensor(weights_, f); }
+
+  void SetInput(const std::vector<float>& data) {
+    PopulateTensor(input_, data);
+  }
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
+ public:
+  using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel;
+
+  void SetBias(const std::vector<float>& data) {
+    QuantizeAndPopulate<int32_t>(bias_, data);
+  }
+  void SetWeights(const std::vector<float>& data) {
+    QuantizeAndPopulate<uint8_t>(weights_, data);
+  }
+  void ShuffleAndSetWeights(const std::vector<float>& data, int input_depth,
+                            int output_depth) {
+    std::vector<float> shuffled_data(data.size());
+    CHECK_EQ(input_depth % 16, 0);
+    CHECK_EQ(output_depth % 4, 0);
+    float* shuffled_data_ptr = shuffled_data.data();
+    for (int block_o = 0; block_o < output_depth; block_o += 4) {
+      for (int block_i = 0; block_i < input_depth; block_i += 16) {
+        for (int o = 0; o < 4; o++) {
+          for (int i = 0; i < 16; i++) {
+            *shuffled_data_ptr++ =
+                data[(block_o + o) * input_depth + block_i + i];
+          }
+        }
+      }
+    }
+    TfLiteTensor* t = interpreter_->tensor(weights_);
+    auto quantized_data =
+        Quantize<uint8_t>(shuffled_data, t->params.scale, t->params.zero_point);
+    for (uint8_t& q : quantized_data) {
+      q ^= 0x80;
+    }
+    PopulateTensor(weights_, 0, quantized_data.data(),
+                   quantized_data.data() + quantized_data.size());
+  }
+  void SetInput(const std::vector<float>& data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  template <typename T>
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
+  }
+};
+
+// In the hybrid model the weights are quantized (to uint8). But the bias,
+// input (and output) are expected to be in float precision.
+class HybridFullyConnectedOpModel : public SingleOpModel {
+ public:
+  HybridFullyConnectedOpModel(int units, int batches, const TensorData& input,
+                              const TensorData& weights,
+                              const TensorData& output = {TensorType_FLOAT32})
+      : batches_(batches), units_(units) {
+    int total_input_size = 1;
+    for (int i = 0; i < input.shape.size(); ++i) {
+      total_input_size *= input.shape[i];
+    }
+    input_size_ = total_input_size / batches_;
+
+    input_ = AddInput(input);
+    weights_ = AddInput(weights);
+
+    TensorData bias{TensorType_FLOAT32, {units_}};
+    bias_ = AddInput(bias);
+
+    output_ = AddOutput(output);
+
+    SetBuiltinOp(
+        BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
+        CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
+            .Union());
+    resolver_ = absl::make_unique<SingleOpResolver>(
+        BuiltinOperator_FULLY_CONNECTED,
+        ops::builtin::Register_FULLY_CONNECTED_PIE());
+    BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
+  }
+  void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); }
+  void SetWeights(const std::vector<float>& data) {
+    SymmetricQuantizeAndPopulate(weights_, data);
+  }
+
+  void SetInput(const std::vector<float>& f) { PopulateTensor(input_, f); }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int input_size() { return input_size_; }
+  int num_units() { return units_; }
+  int num_batches() { return batches_; }
+
+ protected:
+  int input_;
+  int weights_;
+  int bias_;
+  int output_;
+
+  int batches_;
+  int units_;
+  int input_size_;
+};
+
+const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_FULLY_CONNECTED_REF()},
+    {"NeonOptimized", ops::builtin::Register_FULLY_CONNECTED_NEON_OPT()},
+    {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
+    {"Pie", ops::builtin::Register_FULLY_CONNECTED_PIE()},
+});
+
+class FloatFullyConnectedOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+const auto kKernelMapNoPie = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_FULLY_CONNECTED_REF()},
+    {"NeonOptimized", ops::builtin::Register_FULLY_CONNECTED_NEON_OPT()},
+    {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
+});
+
+class QuantizedFullyConnectedOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMapNoPie;
+  }
+};
+
+const auto kKernelMapPie = new std::map<string, TfLiteRegistration*>({
+    {"Pie", ops::builtin::Register_FULLY_CONNECTED_PIE()},
+});
+
+// Hybrid mode is used by the Pie quantized kernel.
+class HybridFullyConnectedOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMapPie;
+  }
+};
+
+// TODO(ahentz): add more small tests like this one, focused on making sure the
+// calculations are correct.
+TEST_P(FloatFullyConnectedOpTest, SimpleTest) {
+  FloatFullyConnectedOpModel m(GetRegistration(), /*units=*/3, /*batches=*/2,
+                               /*input=*/{TensorType_FLOAT32, {2, 10}});
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
+}
+
+TEST_P(FloatFullyConnectedOpTest, SimpleTest2) {
+  FloatFullyConnectedOpModel m(GetRegistration(), /*units=*/1, /*batches=*/2,
+                               /*input=*/{TensorType_FLOAT32, {2, 2}});
+  m.SetWeights({
+      2, 4,  // u = 0
+  });
+  m.SetBias({1});
+
+  m.SetInput({
+      1, 2,  // b = 0
+      2, 1,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(11, 9));
+}
+
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) {
+  QuantizedFullyConnectedOpModel m(
+      GetRegistration(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
+      /*output=*/{TensorType_UINT8, {}, -127, 128});
+
+  // input_product_scale < output_scale was not true.
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  24, 25, 26,  //
+                  58, 59, 60,  //
+              })));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAre(151, 152, 153, 185, 186, 187));
+}
+
+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTestQuantizedOutputMultiplierGreaterThan1) {
+  // real_multiplier = 2.
+  QuantizedFullyConnectedOpModel m(
+      GetRegistration(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_UINT8, {2, 10}, -127, 128},
+      /*output=*/{TensorType_UINT8, {}, -63.5, 64});
+
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  24, 25, 26,  // first batch
+                  58, 59, 60,  // second batch
+              })));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAre(175, 177, 179, 243, 245, 247));
+}
+
+void SimpleTestQuantizedInt16OutputCase(
+    TfLiteRegistration* registration, int input_depth, int output_depth,
+    int batches, FullyConnectedOptionsWeightsFormat weights_format) {
+  const uint8_t kWeightsZeroPoint = 128;
+  const float kWeightsScale = 1.f / 128.f;
+  const uint8_t kInputZeroPoint = 128;
+  const float kInputScale = 1.f / 128.f;
+  const float kInputMin = (0 - kInputZeroPoint) * kInputScale;
+  const float kInputMax = (255 - kInputZeroPoint) * kInputScale;
+  // Output ranges in [-8..8] encoded as int16
+  const float kOutputScale = 8.f / 32768.f;
+  const float kOutputMin = -32768 * kOutputScale;
+  const float kOutputMax = 32767 * kOutputScale;
+
+  QuantizedFullyConnectedOpModel m(
+      registration, output_depth, batches,
+      /*input=*/
+      {TensorType_UINT8, {batches, input_depth}, kInputMin, kInputMax},
+      /*output=*/{TensorType_INT16, {}, kOutputMin, kOutputMax},
+      /*activation_func=*/ActivationFunctionType_NONE, weights_format);
+
+  std::mt19937 random_engine;
+  std::uniform_int_distribution<uint8_t> weights_dist;
+
+  std::vector<float> weights_data(input_depth * output_depth);
+  for (auto& w : weights_data) {
+    uint8_t q = weights_dist(random_engine);
+    w = (q - kWeightsZeroPoint) * kWeightsScale;
+  }
+
+  // Based on weights_format, enforce any shape requirement for that format/path
+  // and set the (possibly shuffled) weights.
+  switch (weights_format) {
+    case FullyConnectedOptionsWeightsFormat_DEFAULT:
+      m.SetWeights(weights_data);
+      break;
+    case FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8:
+      // The shuffled path currently supports only a restrictive subset of
+      // shapes, described by the following assertions:
+      CHECK_EQ(input_depth % 16, 0);
+      CHECK_EQ(output_depth % 4, 0);
+      CHECK(batches == 1 || batches == 4);
+      m.ShuffleAndSetWeights(weights_data, input_depth, output_depth);
+      break;
+    default:
+      LOG(FATAL) << "Unhandled weights format";
+  }
+
+  std::uniform_int_distribution<uint8_t> input_dist;
+  std::vector<float> input_data(input_depth * batches);
+  for (auto& i : input_data) {
+    uint8_t q = input_dist(random_engine);
+    i = (q - kInputZeroPoint) * kInputScale;
+  }
+
+  std::vector<float> bias_data(output_depth);
+  // As the output ranges in [-8, 8], it's reasonable to have bias values
+  // in [-1, 1], this won't result in too much saturation.
+  std::uniform_real_distribution<float> bias_dist(-1.f, 1.f);
+  for (auto& b : bias_data) {
+    b = bias_dist(random_engine);
+  }
+
+  m.SetBias(bias_data);
+  m.SetInput(input_data);
+
+  m.Invoke();
+
+  std::vector<float> expected_output_data(output_depth * batches);
+  for (int b = 0; b < batches; b++) {
+    for (int o = 0; o < output_depth; o++) {
+      float accum = bias_data[o];
+      for (int i = 0; i < input_depth; i++) {
+        accum +=
+            input_data[b * input_depth + i] * weights_data[o * input_depth + i];
+      }
+      accum = std::min(accum, kOutputMax);
+      accum = std::max(accum, kOutputMin);
+      expected_output_data[b * output_depth + o] = accum;
+    }
+  }
+
+  EXPECT_THAT(m.GetDequantizedOutput<int16_t>(),
+              ElementsAreArray(ArrayFloatNear(expected_output_data, 3e-4f)));
+}
+
+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTestQuantizedInt16OutputDefaultWeights) {
+  for (int input_depth : {1, 3, 10, 100}) {
+    for (int output_depth : {1, 3, 10, 100}) {
+      for (int batch : {1, 3, 10, 100}) {
+        SimpleTestQuantizedInt16OutputCase(
+            GetRegistration(), input_depth, output_depth, batch,
+            FullyConnectedOptionsWeightsFormat_DEFAULT);
+      }
+    }
+  }
+}
+
+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTestQuantizedInt16OutputShuffled4x16Int8Weights) {
+  // The shuffled weights block shape is 4x16. The shape of the weights matrix
+  // is: rows = output_depth, cols = input_depth. It must be a multiple of 4x16.
+  // This means that output_depth must be a multiple of 4, and input_deth must
+  // be a multiple of 16.
+  for (int input_depth_numblocks : {1, 3}) {
+    for (int output_depth_numblocks : {1, 3}) {
+      int input_depth = 16 * input_depth_numblocks;
+      int output_depth = 4 * output_depth_numblocks;
+      // The fast shuffled path is currently supporting only batch sizes of 1
+      // and 4. The idea is that the whole point of that path is to go as fast
+      // as possible for small batch size, which requires fully specializing
+      // it for each batch size, and for larger batch sizes the generic
+      // gemmlowp-based implementation is fast enough.
+      for (int batch : {1, 4}) {
+        SimpleTestQuantizedInt16OutputCase(
+            GetRegistration(), input_depth, output_depth, batch,
+            FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8);
+      }
+    }
+  }
+}
+
+TEST(HybridFullyConnectedOpTest, SimpleTestQuantized) {
+  HybridFullyConnectedOpModel m(
+      /*units=*/3, /*batches=*/2,
+      /*input=*/{TensorType_FLOAT32, {2, 10}},
+      /*weights=*/{TensorType_UINT8, {3, 10}, -63.5, 64});  // PIE
+
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     24, 25, 26,  //
+                                     58, 59, 60,  //
+                                 },
+                                 /*max_abs_error=*/1.3f)));
+}
+
+TEST_P(FloatFullyConnectedOpTest, SimpleTest4DInput) {
+  // Note that it is not required that the first dimension be the number of
+  // batches. All we care is that the input can be evenly distributed in
+  // batches. In this case, we need the input to have multiples of '2'.
+  FloatFullyConnectedOpModel m(GetRegistration(),
+                               /*units=*/3, /*batches=*/2,
+                               /*input=*/{TensorType_FLOAT32, {4, 1, 5, 1}});
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // first batch
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // second batch
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 24, 25, 26,  // first batch
+                                 58, 59, 60,  // second batch
+                             }));
+}
+
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) {
+  QuantizedFullyConnectedOpModel m(
+      GetRegistration(), /*units=*/3, /*batches=*/2,
+      /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
+      /*output=*/{TensorType_UINT8, {}, -127, 128});
+
+  // input_product_scale < output_scale was not true.
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  24, 25, 26,  //
+                  58, 59, 60,  //
+              })));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAre(151, 152, 153, 185, 186, 187));
+}
+
+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) {
+  // real_multiplier = 2.
+  QuantizedFullyConnectedOpModel m(
+      GetRegistration(), /*units=*/3, /*batches=*/2,
+      /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128},
+      /*output=*/{TensorType_UINT8, {}, -63.5, 64});
+
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  24, 25, 26,  // first batch
+                  58, 59, 60,  // second batch
+              })));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAre(175, 177, 179, 243, 245, 247));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    FloatFullyConnectedOpTest, FloatFullyConnectedOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
+INSTANTIATE_TEST_CASE_P(
+    QuantizedFullyConnectedOpTest, QuantizedFullyConnectedOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMapNoPie)));
+
+// TODO(ahentz): Reconsider this test. Having arbitrary weights makes it hard
+// to debug errors and doesn't necessarily test all the important details.
+TEST_P(FloatFullyConnectedOpTest, BlackBoxTest) {
+  FloatFullyConnectedOpModel m(GetRegistration(), /*units=*/16, /*batches=*/2,
+                               /*input=*/{TensorType_FLOAT32, {2, 8}});
+  m.SetWeights(
+      {0.091327,  0.103366,  -0.316505, -0.083120, 0.149366,  -0.196636,
+       -0.123672, 0.062800,  0.063031,  0.191670,  -0.062001, -0.061504,
+       -0.275581, 0.059388,  -0.118497, -0.079224, 0.109758,  0.008307,
+       -0.062657, -0.060962, -0.049782, -0.106719, -0.319482, -0.103650,
+       0.266455,  0.051517,  -0.123448, 0.322464,  0.043282,  -0.173782,
+       -0.190381, 0.002013,  0.096086,  0.131157,  0.031164,  0.100638,
+       -0.312191, -0.080923, -0.101318, -0.116614, 0.142238,  0.086540,
+       -0.139154, 0.174268,  -0.073161, 0.080072,  0.006874,  0.229382,
+       -0.104321, -0.176035, -0.208587, -0.001019, -0.162032, 0.080824,
+       -0.025021, 0.074460,  -0.252595, -0.161750, -0.136403, 0.008308,
+       0.005710,  0.096600,  0.289839,  0.218816,  -0.304651, -0.070958,
+       0.054598,  0.147113,  -0.139112, -0.072798, -0.163335, -0.167863,
+       -0.128762, -0.035780, 0.117262,  0.017177,  0.263335,  -0.176612,
+       0.262961,  -0.093654, -0.339283, 0.333071,  0.180827,  0.287583,
+       0.066350,  -0.197947, -0.114449, -0.236035, 0.103532,  -0.034284,
+       0.093299,  -0.145361, 0.054001,  0.250570,  0.157010,  -0.143480,
+       -0.139061, -0.048873, 0.067557,  0.139038,  0.324106,  0.227041,
+       0.037793,  -0.225747, -0.241619, 0.357835,  0.135762,  -0.306764,
+       -0.125982, 0.091916,  0.266587,  0.030135,  0.265148,  0.141627,
+       0.020120,  0.083815,  -0.124556, -0.100124, -0.048159, 0.181172,
+       0.302309,  -0.041084, 0.146334,  -0.061511, -0.232605, 0.281324,
+       0.145408,  -0.221897});
+  m.SetBias({-0.160594, 0.205770, -0.078307, -0.077984, 0.001937, 0.015860,
+             0.036810, 0.012346, 0.001028, 0.038551, 0.075415, 0.020804,
+             0.048478, -0.032270, 0.175688, -0.085662});
+
+  const int input_sequence_size = sizeof(fully_connected_input) /
+                                  sizeof(float) /
+                                  (m.input_size() * m.num_batches());
+  for (int i = 0; i < input_sequence_size; i++) {
+    // TODO(ahentz): This is what the original test was doing: two equal
+    // batches per invocation. We could instead use two different batches.
+    float* batch_start = fully_connected_input + i * m.input_size();
+    float* batch_end = batch_start + m.input_size();
+    m.SetInput(0, batch_start, batch_end);
+    m.SetInput(m.input_size(), batch_start, batch_end);
+
+    m.Invoke();
+
+    float* golden_start = fully_connected_golden_output + i * m.num_units();
+    float* golden_end = golden_start + m.num_units();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    expected.insert(expected.end(), golden_start, golden_end);
+
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/gather.cc b/tensorflow/lite/kernels/gather.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f205daae1343cb0abecc95e7d1b280c10f55d897
--- /dev/null
+++ b/tensorflow/lite/kernels/gather.cc
@@ -0,0 +1,183 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace gather {
+constexpr int kInputTensor = 0;
+constexpr int kInputPositions = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const auto* params =
+      reinterpret_cast<const TfLiteGatherParams*>(node->builtin_data);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* positions = GetInput(context, node, kInputPositions);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (positions->type) {
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
+    default:
+      context->ReportError(
+          context, "Positions of type '%s' are not supported by gather.",
+          TfLiteTypeGetName(positions->type));
+      return kTfLiteError;
+  }
+
+  // Assign to output the input type.
+  output->type = input->type;
+
+  // Check conditions for different types.
+  switch (input->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
+    case kTfLiteString: {
+      // Only 1D input is supported.
+      TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
+    } break;
+    default:
+      context->ReportError(context, "Type '%s' is not supported by gather.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+
+  int axis = params->axis;
+  if (axis < 0) {
+    axis += NumDimensions(input);
+  }
+  TF_LITE_ENSURE(context, 0 <= axis && axis < NumDimensions(input));
+
+  const int num_dimensions =
+      NumDimensions(input) + NumDimensions(positions) - 1;
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions);
+  int output_index = 0;
+  for (int i = 0; i < axis; ++i) {
+    output_shape->data[output_index++] = input->dims->data[i];
+  }
+  for (int i = 0; i < positions->dims->size; ++i) {
+    output_shape->data[output_index++] = positions->dims->data[i];
+  }
+  for (int i = axis + 1; i < input->dims->size; ++i) {
+    output_shape->data[output_index++] = input->dims->data[i];
+  }
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+template <typename InputT, typename PositionsT>
+TfLiteStatus Gather(const TfLiteGatherParams& params, const TfLiteTensor* input,
+                    const TfLiteTensor* positions, TfLiteTensor* output) {
+  tflite::GatherParams op_params;
+  op_params.axis = params.axis;
+  optimized_ops::Gather(op_params, GetTensorShape(input),
+                        GetTensorData<InputT>(input), GetTensorShape(positions),
+                        GetTensorData<PositionsT>(positions),
+                        GetTensorShape(output), GetTensorData<InputT>(output));
+  return kTfLiteOk;
+}
+
+template <typename PositionT>
+TfLiteStatus GatherStrings(TfLiteContext* context, const TfLiteTensor* input,
+                           const TfLiteTensor* positions,
+                           TfLiteTensor* output) {
+  // TODO(mgubin): Currently support only for 1D output tensors.
+  DynamicBuffer buffer;
+  const PositionT* indexes = GetTensorData<PositionT>(positions);
+  const PositionT num_strings = GetStringCount(input);
+  for (int i = 0; i < positions->dims->data[0]; ++i) {
+    const PositionT pos = indexes[i];
+    TF_LITE_ENSURE(context, pos < num_strings);
+    const auto string_ref = GetString(input, pos);
+    buffer.AddString(string_ref.str, string_ref.len);
+  }
+  buffer.WriteToTensorAsVector(output);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* params =
+      reinterpret_cast<const TfLiteGatherParams*>(node->builtin_data);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* positions = GetInput(context, node, kInputPositions);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (positions->type == kTfLiteInt32) {
+    switch (input->type) {
+      case kTfLiteFloat32:
+        return Gather<float, int32_t>(*params, input, positions, output);
+      case kTfLiteUInt8:
+        return Gather<uint8_t, int32_t>(*params, input, positions, output);
+      case kTfLiteInt32:
+        return Gather<int32_t, int32_t>(*params, input, positions, output);
+      case kTfLiteInt64:
+        return Gather<int64_t, int32_t>(*params, input, positions, output);
+      case kTfLiteString:
+        return GatherStrings<int32_t>(context, input, positions, output);
+      default:
+        context->ReportError(context, "Type '%s' is not supported by gather.",
+                             TfLiteTypeGetName(input->type));
+        return kTfLiteError;
+    }
+  }
+  if (positions->type == kTfLiteInt64) {
+    switch (input->type) {
+      case kTfLiteFloat32:
+        return Gather<float, int64_t>(*params, input, positions, output);
+      case kTfLiteUInt8:
+        return Gather<uint8_t, int64_t>(*params, input, positions, output);
+      case kTfLiteInt32:
+        return Gather<int32_t, int64_t>(*params, input, positions, output);
+      case kTfLiteInt64:
+        return Gather<int64_t, int64_t>(*params, input, positions, output);
+      case kTfLiteString:
+        return GatherStrings<int64_t>(context, input, positions, output);
+      default:
+        context->ReportError(context, "Type '%s' is not supported by gather.",
+                             TfLiteTypeGetName(input->type));
+        return kTfLiteError;
+    }
+  }
+  context->ReportError(context,
+                       "Positions of type '%s' are not supported by gather.",
+                       TfLiteTypeGetName(positions->type));
+  return kTfLiteError;
+}
+}  // namespace gather
+
+TfLiteRegistration* Register_GATHER() {
+  static TfLiteRegistration r = {nullptr, nullptr, gather::Prepare,
+                                 gather::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7b5f84348903a3cc436f1bd6cf32b3175b2f5815
--- /dev/null
+++ b/tensorflow/lite/kernels/gather_test.cc
@@ -0,0 +1,243 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class GatherOpModel : public SingleOpModel {
+ public:
+  GatherOpModel(const TensorData& input, const TensorData& positions,
+                int axis = 0) {
+    input_ = AddInput(input);
+    positions_ = AddInput(positions);
+    output_ = AddOutput(input.type);
+    SetBuiltinOp(BuiltinOperator_GATHER, BuiltinOptions_GatherOptions,
+                 CreateGatherOptions(builder_, axis).Union());
+    BuildInterpreter({GetShape(input_), GetShape(positions_)});
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
+  }
+
+  void SetStringInput(std::initializer_list<string> data) {
+    PopulateStringTensor(input_, data);
+  }
+
+  template <typename T>
+  void SetPositions(std::initializer_list<T> data) {
+    PopulateTensor<T>(positions_, data);
+  }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<string> GetStringOutput() {
+    return ExtractVector<string>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int positions_;
+  int output_;
+};
+
+TEST(GatherOpTest, Shuffle) {
+  GatherOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<float>({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({0.7, 0.8, -2, 0.2})));
+}
+
+TEST(GatherOpTest, Test0DIndex) {
+  GatherOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {}});
+  m.SetInput<float>({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions<int32_t>({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({0.7, 0.8})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+}
+
+TEST(GatherOpTest, Test0DIndexWith0DResult) {
+  // 0D tensor is special case in current TFLite. Test it once to make sure
+  // existing workarounds are fine with it.
+  GatherOpModel m({TensorType_FLOAT32, {3}}, {TensorType_INT32, {}});
+  m.SetInput<float>({1.0, 2.0, 3.0});
+  m.SetPositions<int32_t>({1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({2.0})));
+  EXPECT_TRUE(m.GetOutputShape().empty());
+}
+
+TEST(GatherOpTest, Test2DIndexWith2DResult) {
+  GatherOpModel m({TensorType_FLOAT32, {3}}, {TensorType_INT32, {1, 2}});
+  m.SetInput<float>({1.0, 2.0, 3.0});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({2.0, 1.0})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+}
+
+TEST(FloatGatherOpTest, Duplicate) {
+  GatherOpModel m({TensorType_FLOAT32, {1, 2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<float>({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions<int32_t>({0, 0});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutput<float>(),
+      ElementsAreArray(ArrayFloatNear({-2, 0.2, 0.7, 0.8, -2, 0.2, 0.7, 0.8})));
+}
+
+TEST(FloatGatherOpTest, Slice) {
+  GatherOpModel m({TensorType_FLOAT32, {4, 1}}, {TensorType_INT32, {2}});
+  m.SetInput<float>({-2.0, 0.2, 0.7, 0.8});
+  m.SetPositions<int32_t>({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({0.2, 0.8})));
+}
+
+TEST(FloatGatherOpTest, Axis1) {
+  const int axis = 1;
+  GatherOpModel m({TensorType_FLOAT32, {1, 2, 3}}, {TensorType_INT32, {2}},
+                  axis);
+  m.SetInput<float>({1, 2, 3, 4, 5, 6});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({4, 5, 6, 1, 2, 3})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 3}));
+}
+
+TEST(FloatGatherOpTest, LastAxis) {
+  const int axis = -1;
+  GatherOpModel m({TensorType_FLOAT32, {1, 2, 3}}, {TensorType_INT32, {2}},
+                  axis);
+  m.SetInput<float>({1, 2, 3, 4, 5, 6});
+  m.SetPositions<int32_t>({2, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 1, 6, 4})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 2}));
+}
+
+TEST(TypesGatherOpTest, Float32Int32) {
+  GatherOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<float>({13.3, -13.4, -1.4, 1.5});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.4, 1.5, 13.3, -13.4}));
+}
+
+TEST(TypesGatherOpTest, Float32Int64) {
+  GatherOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<float>({13.3, -13.4, -1.4, 1.5});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.4, 1.5, 13.3, -13.4}));
+}
+
+TEST(TypesGatherOpTest, Int32Int32) {
+  GatherOpModel m({TensorType_INT32, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<int32_t>({-1330, 1340, 140, -150});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(),
+              ElementsAreArray({140, -150, -1330, 1340}));
+}
+
+TEST(TypesGatherOpTest, Int32Int64) {
+  GatherOpModel m({TensorType_INT32, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<int32_t>({-1330, 1340, 140, -150});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(),
+              ElementsAreArray({140, -150, -1330, 1340}));
+}
+
+TEST(TypesGatherOpTest, Uint8Int32) {
+  GatherOpModel m({TensorType_UINT8, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<uint8_t>({133, 134, 14, 15});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({14, 15, 133, 134}));
+}
+
+TEST(TypesGatherOpTest, Uint8Int64) {
+  GatherOpModel m({TensorType_UINT8, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<uint8_t>({133, 134, 14, 15});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({14, 15, 133, 134}));
+}
+
+TEST(TypesGatherOpTest, Int64Int32) {
+  GatherOpModel m({TensorType_INT64, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<int64_t>({-(1LL << 34), 134LL, 14LL, 15LL});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({14LL, 15LL, -(1LL << 34), 134LL}));
+}
+
+TEST(TypesGatherOpTest, Int64Int64) {
+  GatherOpModel m({TensorType_INT64, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<int64_t>({-(1LL << 34), 134LL, 14LL, 15LL});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({14LL, 15LL, -(1LL << 34), 134LL}));
+}
+
+TEST(GatherOpTest, SimpleString) {
+  GatherOpModel m({TensorType_STRING, {3}}, {TensorType_INT32, {2}});
+  m.SetStringInput({"A", "B", "C"});
+  m.SetPositions<int32_t>({0, 2});
+  m.Invoke();
+  ASSERT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetStringOutput(), ElementsAreArray({"A", "C"}));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.cc b/tensorflow/lite/kernels/gemm_support.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/gemm_support.cc
rename to tensorflow/lite/kernels/gemm_support.cc
index ed334af2da877edf9f591612478e22f04cf15931..cc224cb88401255950e678815776090b8031fa3d 100644
--- a/tensorflow/contrib/lite/kernels/gemm_support.cc
+++ b/tensorflow/lite/kernels/gemm_support.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/lite/kernels/gemm_support.h"
 
 #include <memory>
 
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace gemm_support {
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.h b/tensorflow/lite/kernels/gemm_support.h
similarity index 89%
rename from tensorflow/contrib/lite/kernels/gemm_support.h
rename to tensorflow/lite/kernels/gemm_support.h
index 43cd2b3055c5c3ac8b4952a06aa346aa52d4ed01..1feb638952acb0414697117b9863d4b785cdfe20 100644
--- a/tensorflow/contrib/lite/kernels/gemm_support.h
+++ b/tensorflow/lite/kernels/gemm_support.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#ifndef TENSORFLOW_LITE_KERNELS_GEMM_SUPPORT_H_
+#define TENSORFLOW_LITE_KERNELS_GEMM_SUPPORT_H_
 
 #include "public/gemmlowp.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 namespace gemm_support {
@@ -48,4 +48,4 @@ void DecrementUsageCounter(TfLiteContext* context);
 }  // namespace gemm_support
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#endif  // TENSORFLOW_LITE_KERNELS_GEMM_SUPPORT_H_
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc b/tensorflow/lite/kernels/hashtable_lookup.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/hashtable_lookup.cc
rename to tensorflow/lite/kernels/hashtable_lookup.cc
index c0b3c3c0c5beae934b508d49e2424d958b8c9230..da1116cf858667b1fc35f3f88269b66f81afcdb7 100644
--- a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
+++ b/tensorflow/lite/kernels/hashtable_lookup.cc
@@ -39,11 +39,11 @@ limitations under the License.
 #include <iostream>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace ops {
@@ -137,7 +137,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     }
   }
   if (output->type == kTfLiteString) {
-    buf.WriteToTensor(output);
+    buf.WriteToTensorAsVector(output);
   }
 
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc b/tensorflow/lite/kernels/hashtable_lookup_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
rename to tensorflow/lite/kernels/hashtable_lookup_test.cc
index ba0ed5ce06392613238b757308dddc2b22e7eb30..d2ca76a206783f561c659faa3cd7632225b08d68 100644
--- a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
+++ b/tensorflow/lite/kernels/hashtable_lookup_test.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..69816583f5020843aeff76890f51c6c306f11a4f
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -0,0 +1,705 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+
+tflite_deps_intel = [
+    "@arm_neon_2_x86_sse",
+]
+
+HARD_FP_FLAGS_IF_APPLICABLE = select({
+    "//tensorflow:android_arm": ["-mfloat-abi=softfp"],
+    "//tensorflow:android_arm64": ["-mfloat-abi=softfp"],
+    "//tensorflow:android_armeabi": ["-mfloat-abi=softfp"],
+    "//conditions:default": [],
+})
+
+NEON_FLAGS_IF_APPLICABLE = select({
+    ":arm": [
+        "-O3",
+        "-mfpu=neon",
+    ],
+    ":armeabi-v7a": [
+        "-O3",
+        "-mfpu=neon",
+    ],
+    ":armv7a": [
+        "-O3",
+        "-mfpu=neon",
+    ],
+    "//conditions:default": [
+        "-O3",
+    ],
+})
+
+cc_library(
+    name = "types",
+    srcs = [],
+    hdrs = [
+        "compatibility.h",
+        "types.h",
+    ],
+    deps = [
+        "//tensorflow/lite/kernels:op_macros",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+cc_library(
+    name = "legacy_types",
+    srcs = [],
+    hdrs = [
+        "compatibility.h",
+        "legacy_types.h",
+        "types.h",
+    ],
+    deps = [
+        "//tensorflow/lite/kernels:op_macros",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+config_setting(
+    name = "arm",
+    values = {
+        "cpu": "arm",
+    },
+)
+
+config_setting(
+    name = "arm64-v8a",
+    values = {
+        "cpu": "arm64-v8a",
+    },
+)
+
+config_setting(
+    name = "armv7a",
+    values = {
+        "cpu": "armv7a",
+    },
+)
+
+config_setting(
+    name = "armeabi-v7a",
+    values = {
+        "cpu": "armeabi-v7a",
+    },
+)
+
+config_setting(
+    name = "haswell",
+    values = {
+        "cpu": "haswell",
+    },
+)
+
+config_setting(
+    name = "ios_x86_64",
+    values = {
+        "cpu": "ios_x86_64",
+    },
+)
+
+config_setting(
+    name = "ios_armv7",
+    values = {
+        "cpu": "ios_armv7",
+    },
+)
+
+config_setting(
+    name = "ios_arm64",
+    values = {
+        "cpu": "ios_arm64",
+    },
+)
+
+config_setting(
+    name = "k8",
+    values = {
+        "cpu": "k8",
+    },
+)
+
+config_setting(
+    name = "x86",
+    values = {
+        "cpu": "x86",
+    },
+)
+
+config_setting(
+    name = "x86_64",
+    values = {
+        "cpu": "x86_64",
+    },
+)
+
+config_setting(
+    name = "darwin",
+    values = {
+        "cpu": "darwin",
+    },
+)
+
+config_setting(
+    name = "darwin_x86_64",
+    values = {
+        "cpu": "darwin_x86_64",
+    },
+)
+
+config_setting(
+    name = "freebsd",
+    values = {
+        "cpu": "freebsd",
+    },
+)
+
+cc_library(
+    name = "optimized_base",
+    srcs = [],
+    hdrs = [
+        "common.h",
+        "optimized/depthwiseconv_float.h",
+        "optimized/depthwiseconv_uint8.h",
+        "optimized/depthwiseconv_uint8_3x3_filter.h",
+        "optimized/optimized_ops.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":quantization_util",
+        ":strided_slice_logic",
+        ":types",
+        ":reference_base",
+        ":round",
+        ":tensor_utils",
+        "//third_party/eigen3",
+        "@gemmlowp",
+        "//tensorflow/lite/c:c_api_internal",
+    ] + select({
+        ":haswell": tflite_deps_intel,
+        ":ios_x86_64": tflite_deps_intel,
+        ":k8": tflite_deps_intel,
+        ":x86": tflite_deps_intel,
+        ":x86_64": tflite_deps_intel,
+        ":darwin": tflite_deps_intel,
+        ":darwin_x86_64": tflite_deps_intel,
+        ":freebsd": tflite_deps_intel,
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "legacy_optimized_base",
+    srcs = [],
+    hdrs = [
+        "common.h",
+        "optimized/depthwiseconv_float.h",
+        "optimized/depthwiseconv_uint8.h",
+        "optimized/depthwiseconv_uint8_3x3_filter.h",
+        "optimized/legacy_optimized_ops.h",
+        "optimized/optimized_ops.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":quantization_util",
+        ":strided_slice_logic",
+        ":tensor_utils",
+        ":types",
+        ":legacy_types",
+        ":legacy_reference_base",
+        ":round",
+        "//third_party/eigen3",
+        "@gemmlowp",
+        "//tensorflow/lite/c:c_api_internal",
+    ] + select({
+        ":haswell": tflite_deps_intel,
+        ":ios_x86_64": tflite_deps_intel,
+        ":k8": tflite_deps_intel,
+        ":x86": tflite_deps_intel,
+        ":x86_64": tflite_deps_intel,
+        ":darwin": tflite_deps_intel,
+        ":darwin_x86_64": tflite_deps_intel,
+        ":freebsd": tflite_deps_intel,
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "optimized",
+    hdrs = [
+        "optimized/eigen_spatial_convolutions.h",
+        "optimized/eigen_tensor_reduced_instantiations_oss.h",
+        "optimized/multithreaded_conv.h",
+        # FIXME(petewarden) - This should be removed, since it's a header from the
+        # :tensor dependency below.
+        "tensor.h",
+    ],
+    deps = [
+        ":optimized_base",
+        ":tensor",
+        ":types",
+        "//tensorflow/lite/c:c_api_internal",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_test(
+    name = "tensor_test",
+    srcs = ["tensor_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
+    deps = [
+        ":tensor",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "round",
+    srcs = [],
+    hdrs = ["round.h"],
+)
+
+cc_library(
+    name = "quantization_util",
+    srcs = ["quantization_util.cc"],
+    hdrs = [
+        "compatibility.h",
+        "quantization_util.h",
+    ],
+    deps = [
+        ":round",
+        ":types",
+        "//tensorflow/lite/kernels:op_macros",
+    ],
+)
+
+cc_test(
+    name = "quantization_util_test",
+    srcs = ["quantization_util_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
+    deps = [
+        ":quantization_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "strided_slice_logic",
+    srcs = [],
+    hdrs = [
+        "strided_slice_logic.h",
+    ],
+    deps = [
+        ":types",
+    ],
+)
+
+cc_library(
+    name = "reference_base",
+    srcs = [],
+    hdrs = [
+        "common.h",
+        "reference/depthwiseconv_float.h",
+        "reference/depthwiseconv_uint8.h",
+        "reference/fully_connected.h",
+        "reference/integer_ops/dequantize.h",
+        "reference/reference_ops.h",
+        "reference/softmax.h",
+    ],
+    deps = [
+        ":quantization_util",
+        ":round",
+        ":strided_slice_logic",
+        ":types",
+        "@gemmlowp",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:op_macros",
+    ] + select({
+        ":haswell": tflite_deps_intel,
+        ":ios_x86_64": tflite_deps_intel,
+        ":k8": tflite_deps_intel,
+        ":x86": tflite_deps_intel,
+        ":x86_64": tflite_deps_intel,
+        ":darwin": tflite_deps_intel,
+        ":darwin_x86_64": tflite_deps_intel,
+        ":freebsd": tflite_deps_intel,
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "legacy_reference_base",
+    srcs = [],
+    hdrs = [
+        "common.h",
+        "reference/depthwiseconv_float.h",
+        "reference/depthwiseconv_uint8.h",
+        "reference/fully_connected.h",
+        "reference/legacy_reference_ops.h",
+        "reference/reference_ops.h",
+        "reference/softmax.h",
+    ],
+    deps = [
+        ":quantization_util",
+        ":round",
+        ":strided_slice_logic",
+        ":legacy_types",
+        ":types",
+        "@gemmlowp",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:op_macros",
+    ] + select({
+        ":haswell": tflite_deps_intel,
+        ":ios_x86_64": tflite_deps_intel,
+        ":k8": tflite_deps_intel,
+        ":x86": tflite_deps_intel,
+        ":x86_64": tflite_deps_intel,
+        ":darwin": tflite_deps_intel,
+        ":darwin_x86_64": tflite_deps_intel,
+        ":freebsd": tflite_deps_intel,
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "tensor",
+    hdrs = [
+        "tensor.h",
+        "tensor_ctypes.h",
+    ],
+    deps = [
+        ":types",
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+# Deprecated version of :tensor, kept for backwards compatibility.
+cc_library(
+    name = "reference",
+    hdrs = [
+        "tensor.h",
+        "tensor_ctypes.h",
+    ],
+    deps = [
+        ":types",
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_library(
+    name = "portable_tensor_utils",
+    srcs = [
+        "reference/portable_tensor_utils.cc",
+    ],
+    hdrs = [
+        "reference/portable_tensor_utils.h",
+    ],
+    deps = [
+        ":round",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:activation_functor",
+        "//tensorflow/lite/kernels:op_macros",
+    ],
+)
+
+cc_library(
+    name = "neon_tensor_utils",
+    srcs = [
+        "optimized/neon_tensor_utils.cc",
+        "reference/portable_tensor_utils.cc",
+        "reference/portable_tensor_utils.h",
+    ],
+    hdrs = [
+        "common.h",
+        "compatibility.h",
+        "optimized/cpu_check.h",
+        "optimized/neon_tensor_utils.h",
+        "optimized/tensor_utils_impl.h",
+    ],
+    copts = NEON_FLAGS_IF_APPLICABLE + HARD_FP_FLAGS_IF_APPLICABLE,
+    deps = [
+        ":cpu_check",
+        ":round",
+        ":types",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:activation_functor",
+        "//tensorflow/lite/kernels:op_macros",
+        "@arm_neon_2_x86_sse",
+        "@gemmlowp",
+    ],
+)
+
+cc_library(
+    name = "kernel_utils",
+    srcs = ["kernel_utils.cc"],
+    hdrs = ["kernel_utils.h"],
+    deps = [
+        ":tensor_utils",
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+# Audio support classes imported directly from TensorFlow.
+cc_library(
+    name = "audio_utils",
+    srcs = [
+        "mfcc.cc",
+        "mfcc_dct.cc",
+        "mfcc_mel_filterbank.cc",
+        "spectrogram.cc",
+    ],
+    hdrs = [
+        "mfcc.h",
+        "mfcc_dct.h",
+        "mfcc_mel_filterbank.h",
+        "spectrogram.h",
+    ],
+    deps = [
+        "//third_party/fft2d:fft2d_headers",
+        "@fft2d",
+    ],
+)
+
+cc_library(
+    name = "tensor_utils",
+    srcs = [
+        "tensor_utils.cc",
+    ],
+    hdrs = [
+        "common.h",
+        "compatibility.h",
+        "optimized/cpu_check.h",
+        "optimized/neon_tensor_utils.h",
+        "optimized/tensor_utils_impl.h",
+        "reference/portable_tensor_utils.h",
+        "tensor_utils.h",
+        "types.h",
+    ],
+    copts = NEON_FLAGS_IF_APPLICABLE,
+    deps = [
+        "@com_google_absl//absl/base:core_headers",
+        "//tensorflow/lite/c:c_api_internal",
+        "@arm_neon_2_x86_sse",
+        "//tensorflow/lite/kernels:op_macros",
+        "@gemmlowp",
+    ] + select({
+        ":arm": [
+            ":neon_tensor_utils",
+        ],
+        ":arm64-v8a": [
+            ":neon_tensor_utils",
+        ],
+        ":armeabi-v7a": [
+            ":neon_tensor_utils",
+        ],
+        ":armv7a": [
+            ":neon_tensor_utils",
+        ],
+        ":haswell": [
+            ":neon_tensor_utils",
+        ],
+        ":ios_armv7": [
+            ":neon_tensor_utils",
+        ],
+        ":ios_arm64": [
+            ":neon_tensor_utils",
+        ],
+        ":ios_x86_64": [
+            ":neon_tensor_utils",
+        ],
+        ":x86_64": [
+            ":neon_tensor_utils",
+        ],
+        ":x86": [
+            ":neon_tensor_utils",
+        ],
+        ":k8": [
+            ":neon_tensor_utils",
+        ],
+        ":darwin": [
+            ":neon_tensor_utils",
+        ],
+        ":darwin_x86_64": [
+            ":neon_tensor_utils",
+        ],
+        "//conditions:default": [
+            ":portable_tensor_utils",
+        ],
+    }),
+)
+
+cc_library(
+    name = "test_util",
+    srcs = ["test_util.cc"],
+    hdrs = ["test_util.h"],
+    deps = [
+        ":types",
+        "//tensorflow/lite:string",
+    ],
+)
+
+cc_test(
+    name = "tensor_utils_test",
+    srcs = ["tensor_utils_test.cc"],
+    copts = NEON_FLAGS_IF_APPLICABLE,
+    linkopts = select({
+        "//tensorflow:android": [
+            "-fPIE -pie",
+        ],
+        "//conditions:default": [],
+    }),
+    linkstatic = 1,
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":tensor_utils",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "depthwiseconv_float_test",
+    srcs = ["depthwiseconv_float_test.cc"],
+    deps = [
+        ":optimized_base",
+        ":reference_base",
+        ":test_util",
+        ":types",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "depthwiseconv_quantized_test",
+    srcs = ["depthwiseconv_quantized_test.cc"],
+    shard_count = 2,
+    tags = [
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":optimized_base",
+        ":reference_base",
+        ":test_util",
+        ":types",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "resize_bilinear_test",
+    srcs = ["resize_bilinear_test.cc"],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":optimized_base",
+        ":reference_base",
+        ":test_util",
+        ":types",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "resize_nearest_neighbor_test",
+    srcs = ["resize_nearest_neighbor_test.cc"],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":optimized_base",
+        ":reference_base",
+        ":test_util",
+        ":types",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "softmax_quantized_test",
+    timeout = "long",
+    srcs = [
+        "softmax_quantized_test.cc",
+    ],
+    deps = [
+        ":optimized_base",
+        ":quantization_util",
+        ":reference_base",
+        ":test_util",
+        "//tensorflow/lite:string",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "logsoftmax_quantized_test",
+    timeout = "long",
+    srcs = [
+        "logsoftmax_quantized_test.cc",
+    ],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":optimized_base",
+        ":quantization_util",
+        ":reference_base",
+        ":test_util",
+        "//tensorflow/lite:string",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_test(
+    name = "log_quantized_test",
+    srcs = ["log_quantized_test.cc"],
+    deps = [
+        ":optimized_base",
+        ":reference_base",
+        "//tensorflow/lite:string",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "cpu_check",
+    hdrs = [
+        "optimized/cpu_check.h",
+    ],
+    deps = [
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "@androidndk//:cpufeatures",
+            ],
+            "//conditions:default": [],
+        },
+    ),
+)
+
+cc_test(
+    name = "batch_to_space_nd_test",
+    srcs = ["batch_to_space_nd_test.cc"],
+    deps = [
+        ":optimized_base",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/kernels/internal/batch_to_space_nd_test.cc b/tensorflow/lite/kernels/internal/batch_to_space_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5fc2c93ba0e3a78a8950c17fee2051e207e7a83a
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/batch_to_space_nd_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace {
+
+// A light wrapper of GetIndexRange which returns a pair of start / end
+// indices.
+std::pair<int, int> GetIndexRange(int spatial_index_dim, int block_shape_dim,
+                                  int input_dim, int output_dim) {
+  int index_start = 0;
+  int index_end = 0;
+  optimized_ops::GetIndexRange(spatial_index_dim, block_shape_dim, input_dim,
+                               output_dim, &index_start, &index_end);
+  return {index_start, index_end};
+}
+
+TEST(BatchToSpaceNDTest, TestIndexRange) {
+  // Simple test case, no cropping.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/6,
+                          /*input_dim=*/1, /*output_dim=*/6),
+            std::make_pair(0, 1));
+
+  // No cropping and input_dim > 1.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/2, /*block_shape_dim=*/6,
+                          /*input_dim=*/5, /*output_dim=*/30),
+            std::make_pair(0, 5));
+
+  // With small cropping values (can be either at the beginning or at the end).
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/2,
+                          /*input_dim=*/3, /*output_dim=*/4),
+            std::make_pair(0, 2));
+
+  // With positive cropping values at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-2, /*block_shape_dim=*/2,
+                          /*input_dim=*/3, /*output_dim=*/4),
+            std::make_pair(1, 3));
+
+  // Large crop at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(6, 7));
+
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-26, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(6, 7));
+
+  // Large crop at the end.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/4, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  // Rounding up incorrectly will fail this test.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/3, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/5),
+            std::make_pair(0, 1));
+
+  // Extreme cropping with output of a single spatial location.
+  // Valid position 1, when large crop at the end.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/0, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(0, 1));
+
+  // Valid position 2, when large crop at the beginning.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-30, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(6, 7));
+
+  // Invalid positions.
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/1, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(0, 0));
+  EXPECT_EQ(GetIndexRange(/*spatial_index_dim=*/-29, /*block_shape_dim=*/5,
+                          /*input_dim=*/7, /*output_dim=*/1),
+            std::make_pair(6, 6));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..fdb72037f84e4cea9018516ef70eb8c8fa039082
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -0,0 +1,269 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
+
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
+#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif
+#endif
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined __GNUC__ && defined __SSE4_1__ && !defined TF_LITE_DISABLE_X86_NEON
+#define USE_NEON
+
+#define OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#pragma GCC diagnostic ignored "-Wattributes"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnarrowing"
+#pragma GCC diagnostic ignored "-Wsequence-point"
+
+#include "NEON_2_SSE.h"
+
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+inline void GetActivationMinMax(FusedActivationFunctionType ac,
+                                float* output_activation_min,
+                                float* output_activation_max) {
+  switch (ac) {
+    case FusedActivationFunctionType::kNone:
+      *output_activation_min = std::numeric_limits<float>::lowest();
+      *output_activation_max = std::numeric_limits<float>::max();
+      break;
+    case FusedActivationFunctionType::kRelu:
+      *output_activation_min = 0.f;
+      *output_activation_max = std::numeric_limits<float>::max();
+      break;
+    case FusedActivationFunctionType::kRelu1:
+      *output_activation_min = -1.f;
+      *output_activation_max = 1.f;
+      break;
+    case FusedActivationFunctionType::kRelu6:
+      *output_activation_min = 0.f;
+      *output_activation_max = 6.f;
+      break;
+  }
+}
+
+inline float ActivationFunctionWithMinMax(float x, float output_activation_min,
+                                          float output_activation_max) {
+  return std::min(std::max(x, output_activation_min), output_activation_max);
+}
+
+// Legacy function, left for compatibility only.
+template <FusedActivationFunctionType Ac>
+float ActivationFunction(float x) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  return ActivationFunctionWithMinMax(x, output_activation_min,
+                                      output_activation_max);
+}
+
+inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp(
+    int32 x, int32 quantized_multiplier, int left_shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  return RoundingDivideByPOT(
+      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
+}
+
+inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
+    int32 x, int32 quantized_multiplier, int left_shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                           quantized_multiplier);
+}
+
+inline int32 MultiplyByQuantizedMultiplier(int32 x, int32 quantized_multiplier,
+                                           int shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  int left_shift = shift > 0 ? shift : 0;
+  int right_shift = shift > 0 ? 0 : -shift;
+  return RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(
+                                 x * (1 << left_shift), quantized_multiplier),
+                             right_shift);
+}
+
+template <typename T>
+int CountLeadingZeros(T integer_input) {
+  static_assert(std::is_unsigned<T>::value,
+                "Only unsigned integer types handled.");
+#if defined(__GNUC__)
+  return integer_input ? __builtin_clz(integer_input) : 0;
+#else
+  const T one_in_leading_positive = static_cast<T>(1)
+                                    << (std::numeric_limits<T>::digits - 1);
+  int leading_zeros = 0;
+  while (integer_input < one_in_leading_positive) {
+    integer_input <<= 1;
+    ++leading_zeros;
+  }
+  return leading_zeros;
+#endif
+}
+
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N>
+struct NdArrayDesc {
+  // The "extent" of each dimension. Indices along dimension d must be in the
+  // half-open interval [0, extents[d]).
+  int extents[N];
+
+  // The number of *elements* (not bytes) between consecutive indices of each
+  // dimension.
+  int strides[N];
+};
+
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
+                            int i3) {
+  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
+  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
+         i3 * desc.strides[3];
+}
+
+// Given the dimensions of the operands for an element-wise binary broadcast,
+// adjusts them so that they can be directly iterated over with simple loops.
+// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
+// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
+//
+// This function assumes that the two input shapes are compatible up to
+// broadcasting and the shorter one has already been prepended with 1s to be the
+// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
+// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
+// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
+// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
+//
+// When two shapes are compatible up to broadcasting, for each dimension d,
+// the input extents are either equal, or one of them is 1.
+//
+// This function performs the following for each dimension d:
+// - If the extents are equal, then do nothing since the loop that walks over
+//   both of the input arrays is correct.
+// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
+//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
+//   array0 to be referenced *at any index* in dimension d and still access the
+//   same slice.
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
+                                                const Dims<N>& input1_dims,
+                                                NdArrayDesc<N>* desc0_out,
+                                                NdArrayDesc<N>* desc1_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+
+  // Copy dims to desc.
+  for (int i = 0; i < N; ++i) {
+    desc0_out->extents[i] = input0_dims.sizes[i];
+    desc0_out->strides[i] = input0_dims.strides[i];
+    desc1_out->extents[i] = input1_dims.sizes[i];
+    desc1_out->strides[i] = input1_dims.strides[i];
+  }
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = ArraySize(input0_dims, i);
+    const int extent1 = ArraySize(input1_dims, i);
+    if (extent0 != extent1) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent1;
+      } else {
+        TFLITE_DCHECK_EQ(extent1, 1);
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent0;
+      }
+    }
+  }
+}
+
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(
+    const RuntimeShape& input0_shape, const RuntimeShape& input1_shape,
+    NdArrayDesc<N>* desc0_out, NdArrayDesc<N>* desc1_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+
+  auto extended_input0_shape = RuntimeShape::ExtendedShape(N, input0_shape);
+  auto extended_input1_shape = RuntimeShape::ExtendedShape(N, input1_shape);
+
+  // Copy dims to desc, calculating strides.
+  int desc0_stride = 1;
+  int desc1_stride = 1;
+  for (int i = N - 1; i >= 0; --i) {
+    desc0_out->extents[i] = extended_input0_shape.Dims(i);
+    desc0_out->strides[i] = desc0_stride;
+    desc0_stride *= extended_input0_shape.Dims(i);
+    desc1_out->extents[i] = extended_input1_shape.Dims(i);
+    desc1_out->strides[i] = desc1_stride;
+    desc1_stride *= extended_input1_shape.Dims(i);
+  }
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = extended_input0_shape.Dims(i);
+    const int extent1 = extended_input1_shape.Dims(i);
+    if (extent0 != extent1) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent1;
+      } else {
+        TFLITE_DCHECK_EQ(extent1, 1);
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent0;
+      }
+    }
+  }
+}
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/lite/kernels/internal/compatibility.h b/tensorflow/lite/kernels/internal/compatibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..bfd021ac48df5bc4103b86c58c1ecc8d35c35a5c
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/compatibility.h
@@ -0,0 +1,110 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/kernels/op_macros.h"
+
+#ifndef TFLITE_DCHECK
+#define TFLITE_DCHECK(condition) (condition) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_EQ
+#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_NE
+#define TFLITE_DCHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_GE
+#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_GT
+#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_LE
+#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+#ifndef TFLITE_DCHECK_LT
+#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ASSERT_FALSE
+#endif
+
+// TODO(ahentz): Clean up: We should stick to the DCHECK versions.
+#ifndef TFLITE_CHECK
+#define TFLITE_CHECK(condition) (condition) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_EQ
+#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_NE
+#define TFLITE_CHECK_NE(x, y) ((x) != (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_GE
+#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_GT
+#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_LE
+#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+#ifndef TFLITE_CHECK_LT
+#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : TFLITE_ABORT
+#endif
+
+// TODO(ahentz): Clean up.
+using int8 = std::int8_t;
+using uint8 = std::uint8_t;
+using int16 = std::int16_t;
+using uint16 = std::uint16_t;
+using int32 = std::int32_t;
+using uint32 = std::uint32_t;
+
+// TFLITE_DEPRECATED()
+//
+// Duplicated from absl/base/macros.h to avoid pulling in that library.
+// Marks a deprecated class, struct, enum, function, method and variable
+// declarations. The macro argument is used as a custom diagnostic message (e.g.
+// suggestion of a better alternative).
+//
+// Example:
+//
+//   class TFLITE_DEPRECATED("Use Bar instead") Foo {...};
+//   TFLITE_DEPRECATED("Use Baz instead") void Bar() {...}
+//
+// Every usage of a deprecated entity will trigger a warning when compiled with
+// clang's `-Wdeprecated-declarations` option. This option is turned off by
+// default, but the warnings will be reported by clang-tidy.
+#if defined(__clang__) && __cplusplus >= 201103L
+#define TFLITE_DEPRECATED(message) __attribute__((deprecated(message)))
+#endif
+
+#ifndef TFLITE_DEPRECATED
+#define TFLITE_DEPRECATED(message)
+#endif
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/depthwiseconv_float_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_float_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/internal/depthwiseconv_float_test.cc
rename to tensorflow/lite/kernels/internal/depthwiseconv_float_test.cc
index 41862a21a6ed5a1985c7689bf78da7a0c9c141ea..3602b9ffd8435763d5392840c98d47bf6bf6ecf0 100644
--- a/tensorflow/contrib/lite/kernels/internal/depthwiseconv_float_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_float_test.cc
@@ -17,13 +17,13 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/test_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/test_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 #define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
-#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3682499d494cc4e63712b6c57d80482899b2185d
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
@@ -0,0 +1,544 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <sys/types.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/internal/test_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+
+namespace tflite {
+namespace {
+
+enum class ForceKernelInvocation {
+  // Run all tests against kUseStandardEntry even if also testing another
+  // kernel, since we need to be sure that the main DepthwiseConv() function in
+  // optimized_ops.h dispatches to a correctly-executing kernel.
+  kNone = 0,  // The "default" option: use the normal DepthwiseConv
+              // kernel (entry) function.
+  kUseGenericKernel,
+  kUseNeon3x3,            // 3x3 kernel that uses NEON when available.
+  kUseNeon3x3DotProduct,  // 3x3 kernel that uses dot-product enabled NEON when
+                          // available.
+};
+
+inline void DispatchDepthwiseConv(
+    ForceKernelInvocation forced_invocation, const DepthwiseParams& params,
+    const RuntimeShape& input_shape, const uint8* input_data,
+    const RuntimeShape& filter_shape, const uint8* filter_data,
+    const RuntimeShape& bias_shape, const int32* bias_data,
+    const RuntimeShape& output_shape, uint8* output_data) {
+  switch (forced_invocation) {
+    case ForceKernelInvocation::kUseNeon3x3: {
+// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
+// Jetson TX-2. This compiler does not support the offsetof() macro.
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+      const int stride_width = params.stride_width;
+      const int stride_height = params.stride_height;
+      const int pad_width = params.padding_values.width;
+      const int pad_height = params.padding_values.height;
+      const int output_shift = params.output_shift;
+      const int depth_multiplier = params.depth_multiplier;
+      const int dilation_width_factor = params.dilation_width_factor;
+      const int dilation_height_factor = params.dilation_height_factor;
+
+      // Check that parameter combination is supported.
+      const bool basic_3x3_kernel_supported =
+          optimized_ops::Fast3x3FilterKernelSupported(
+              input_shape, filter_shape, stride_width, stride_height,
+              dilation_width_factor, dilation_height_factor, pad_width,
+              pad_height, depth_multiplier, output_shape, output_shift);
+      ASSERT_TRUE(basic_3x3_kernel_supported)
+          << "pad_width = " << params.padding_values.width
+          << " pad_height = " << params.padding_values.height
+          << " input_width = " << input_shape.Dims(1)
+          << " input_height = " << input_shape.Dims(2)
+          << " output_width = " << output_shape.Dims(1)
+          << " output_height = " << output_shape.Dims(2);
+
+      // Call kernel optimized for depthwise convolutions using 3x3 filters.
+      optimized_ops::DepthwiseConv3x3Filter(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
+#else
+      break;
+#endif
+    }
+    case ForceKernelInvocation::kUseNeon3x3DotProduct: {
+// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
+// Jetson TX-2. This compiler does not support the offsetof() macro.
+#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__) && \
+    !defined(GOOGLE_L4T)
+      using optimized_ops::DotProduct3x3KernelType;
+      DotProduct3x3KernelType kernel_type =
+          optimized_ops::CategorizeDotProductKernel(params);
+      switch (kernel_type) {
+        case DotProduct3x3KernelType::kPlain:
+          // TODO(b/118430534): Implement optimized kernel.
+          optimized_ops::DepthwiseConv3x3Filter(
+              params, input_shape, input_data, filter_shape, filter_data,
+              bias_shape, bias_data, output_shape, output_data);
+          return;
+        case DotProduct3x3KernelType::kWithDepthMultiplication:
+          // TODO(b/118430338): Implement optimized kernel.
+          optimized_ops::DepthwiseConvGeneral(
+              params, input_shape, input_data, filter_shape, filter_data,
+              bias_shape, bias_data, output_shape, output_data);
+          return;
+        case DotProduct3x3KernelType::kWithPad0Stride2:
+          // TODO(b/118430338): Implement optimized kernel.
+          optimized_ops::DepthwiseConv3x3Filter(
+              params, input_shape, input_data, filter_shape, filter_data,
+              bias_shape, bias_data, output_shape, output_data);
+          return;
+        case DotProduct3x3KernelType::kWithPad1Stride1:
+          // TODO(b/118430338): Implement optimized kernel.
+          optimized_ops::DepthwiseConvGeneral(
+              params, input_shape, input_data, filter_shape, filter_data,
+              bias_shape, bias_data, output_shape, output_data);
+          return;
+        case DotProduct3x3KernelType::kNone:
+        default:
+          break;
+      }
+#endif
+      break;
+    }
+    case ForceKernelInvocation::kUseGenericKernel: {
+      optimized_ops::DepthwiseConvGeneral(params, input_shape, input_data,
+                                          filter_shape, filter_data, bias_shape,
+                                          bias_data, output_shape, output_data);
+      return;
+    }
+    case ForceKernelInvocation::kNone:
+    default:
+      break;
+  }
+  optimized_ops::DepthwiseConv(params, input_shape, input_data, filter_shape,
+                               filter_data, bias_shape, bias_data, output_shape,
+                               output_data);
+}
+
+// Runs the DepthwiseConv and compares against the reference implementation.
+int TestOneDepthwiseConvWithGivenOutputShift(
+    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const RuntimeShape& input_shape, std::int32_t input_offset,
+    const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
+    std::int32_t filter_offset, const std::int32_t* bias_data,
+    const RuntimeShape& bias_shape, int stride, PaddingType padding_type,
+    int pad_width, int pad_height, int depth_multiplier,
+    std::int32_t output_offset, std::int32_t output_multiplier,
+    int output_shift, std::int32_t output_activation_min,
+    std::int32_t output_activation_max, const RuntimeShape& output_shape) {
+  const int output_buffer_size = output_shape.FlatSize();
+  std::vector<std::uint8_t> output_data(output_buffer_size);
+  std::vector<std::uint8_t> reference_output_data(output_buffer_size);
+
+  tflite::DepthwiseParams op_params;
+  op_params.padding_type = padding_type;
+  op_params.padding_values.width = pad_width;
+  op_params.padding_values.height = pad_height;
+  op_params.stride_width = stride;
+  op_params.stride_height = stride;
+  op_params.dilation_width_factor = 1;
+  op_params.dilation_height_factor = 1;
+  op_params.depth_multiplier = depth_multiplier;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  op_params.input_offset = input_offset;
+  op_params.weights_offset = filter_offset;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = -output_shift;
+  reference_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape,
+                               filter_data, bias_shape, bias_data, output_shape,
+                               reference_output_data.data());
+  DispatchDepthwiseConv(forced_invocation, op_params, input_shape, input_data,
+                        filter_shape, filter_data, bias_shape, bias_data,
+                        output_shape, output_data.data());
+  int saturated_min = 0;
+  int saturated_max = 0;
+  std::vector<int> diff(output_buffer_size);
+  std::int64_t sum_diff = 0;
+  std::int64_t sum_abs_diff = 0;
+  for (int i = 0; i < output_buffer_size; i++) {
+    diff[i] = static_cast<int>(output_data[i]) -
+              static_cast<int>(reference_output_data[i]);
+    sum_diff += diff[i];
+    sum_abs_diff += std::abs(diff[i]);
+    saturated_min += output_data[i] == output_activation_min;
+    saturated_max += output_data[i] == output_activation_max;
+  }
+  // These stats help understand test failures.
+  std::sort(std::begin(diff), std::end(diff));
+  const int min_diff = diff.front();
+  const int max_diff = diff.back();
+  const int median_diff = diff[diff.size() / 2];
+  const float mean_diff = static_cast<float>(sum_diff) / output_buffer_size;
+  const float mean_abs_diff =
+      static_cast<float>(sum_abs_diff) / output_buffer_size;
+  // Normally we should require bit-for-bit exact results. Unfortunately a bug
+  // in the Intel arm_neon_sse.h translation header that we use for x86 tests
+  // causes 1-bit inaccuracy in
+  // the vqrdmulh_n_s32 intrinsic, which causes off-by-1 errors in quantized
+  // DepthwiseConv ops. So we have to live with a few off-by-one errors for now,
+  // yet still ensure that no more than a small minority of values are wrong.
+  EXPECT_TRUE(std::abs(mean_diff) < 1e-5f && mean_abs_diff < 1e-5f &&
+              std::abs(median_diff) == 0 && std::abs(min_diff) <= 1 &&
+              std::abs(max_diff) <= 1);
+  if (saturated_min > 2 * saturated_max) {
+    return -1;
+  }
+  if (saturated_max > 2 * saturated_min) {
+    return 1;
+  }
+  return 0;
+}
+
+// The point of this function is that we can't practically know which
+// output_shift value to pass to test DepthwiseConv. It's not easy to guess (we
+// could do some
+// statistics for large size, but they would be fragile at smaller sizes), and
+// guessing wrong would mean that all the values get saturated so the test
+// becomes
+// vacuous. So we just bisect our way to reasonable output_shift values.
+void TestOneDepthwiseConvBisectOutputShift(
+    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const RuntimeShape& input_shape, std::int32_t input_offset,
+    const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
+    std::int32_t filter_offset, const std::int32_t* bias_data,
+    const RuntimeShape& bias_shape, int stride, PaddingType padding_type,
+    int pad_width, int pad_height, int depth_multiplier,
+    std::int32_t output_offset, std::int32_t output_multiplier,
+    int output_activation_bisect_start, int output_activation_bisect_end,
+    std::int32_t output_activation_min, std::int32_t output_activation_max,
+    const RuntimeShape& output_shape) {
+  ASSERT_LT(output_activation_bisect_start, output_activation_bisect_end)
+      << "Bisection failed ?!?!";
+  int output_shift_bisect_midpoint =
+      (output_activation_bisect_start + output_activation_bisect_end) / 2;
+  int bisect_result = TestOneDepthwiseConvWithGivenOutputShift(
+      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
+      pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
+      output_shift_bisect_midpoint, output_activation_min,
+      output_activation_max, output_shape);
+  // At this point we know that the test succeeded (otherwise it would have
+  // aborted).
+  if (bisect_result == 0) {
+    // The result isn't particularly saturated on one or the other side.
+    // All good, we're done.
+    return;
+  }
+  if (output_activation_bisect_start == output_activation_bisect_end - 1) {
+    // There is still some saturation on one side, but the bisection is
+    // finished anyways. We're done; nothing more we can do about it. This
+    // happens
+    // in particular when using an activation with a narrow range.
+    return;
+  }
+  // Continue the bisection based on the present result.
+  int new_output_activation_bisect_start = bisect_result == 1
+                                               ? output_shift_bisect_midpoint
+                                               : output_activation_bisect_start;
+  int new_output_activation_bisect_end = bisect_result == 1
+                                             ? output_activation_bisect_end
+                                             : output_shift_bisect_midpoint;
+  TestOneDepthwiseConvBisectOutputShift(
+      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
+      pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
+      new_output_activation_bisect_start, new_output_activation_bisect_end,
+      output_activation_min, output_activation_max, output_shape);
+}
+
+void TestOneDepthwiseConv(
+    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const RuntimeShape& input_shape, std::int32_t input_offset,
+    const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
+    std::int32_t filter_offset, const std::int32_t* bias_data,
+    const RuntimeShape& bias_shape, int stride, PaddingType padding_type,
+    int pad_width, int pad_height, int depth_multiplier,
+    std::int32_t output_offset, std::int32_t output_multiplier,
+    std::int32_t output_activation_min, std::int32_t output_activation_max,
+    const RuntimeShape& output_shape) {
+  TestOneDepthwiseConvBisectOutputShift(
+      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
+      pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
+      0, 32, output_activation_min, output_activation_max, output_shape);
+}
+
+bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
+                          int input_depth, int input_width, int input_height,
+                          int filter_width, int filter_height,
+                          int depth_multiplier, int stride,
+                          int dilation_width_factor, int dilation_height_factor,
+                          PaddingType padding_type) {
+  const int output_depth = input_depth * depth_multiplier;
+  // The optimized DepthwiseConv implementation currently uses a fixed-size
+  // accumulator buffer on the stack, with that size. This currently means
+  // that it does not support larger output depths. It CHECK's for it,
+  // so it's safe in the sense that if a larger output depth was encountered,
+  // it would explicitly fail. We just need to adjust our testing to that
+  // constraint.
+  const int kMaxSupportedOutputDepth = 1024;
+  if (output_depth > kMaxSupportedOutputDepth) {
+    return false;
+  }
+  int output_activation_min = 0;
+  int output_activation_max = 255;
+  if (UniformRandomInt(0, 1)) {
+    output_activation_min = UniformRandomInt(0, 50);
+    output_activation_max = UniformRandomInt(200, 255);
+  }
+  const std::int32_t output_multiplier =
+      UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max());
+  const std::int32_t input_offset = UniformRandomInt(-256, 0);
+  const std::int32_t filter_offset = UniformRandomInt(-256, 0);
+  const std::int32_t output_offset = UniformRandomInt(-256, 0);
+  RuntimeShape input_shape_inference(
+      {batch, input_height, input_width, input_depth});
+  RuntimeShape output_shape_inference;
+  int pad_width, pad_height;
+  if (!ComputeConvSizes(input_shape_inference, output_depth, filter_width,
+                        filter_height, stride, dilation_width_factor,
+                        dilation_height_factor, padding_type,
+                        &output_shape_inference, &pad_width, &pad_height)) {
+    return false;
+  }
+  RuntimeShape filter_shape_inference(
+      {1, filter_height, filter_width, output_depth});
+  RuntimeShape bias_shape_inference({1, 1, 1, output_depth});
+  const int input_buffer_size = input_shape_inference.FlatSize();
+  const int filter_buffer_size = filter_shape_inference.FlatSize();
+  std::vector<std::uint8_t> input_data(input_buffer_size);
+  std::vector<std::uint8_t> filter_data(filter_buffer_size);
+  std::vector<std::int32_t> bias_data(output_depth);
+  FillRandom(&input_data);
+  FillRandom(&filter_data);
+  FillRandom(&bias_data, -10000, 10000);
+  TestOneDepthwiseConv(
+      forced_invocation, input_data.data(), input_shape_inference, input_offset,
+      filter_data.data(), filter_shape_inference, filter_offset,
+      bias_data.data(), bias_shape_inference, stride, padding_type, pad_width,
+      pad_height, depth_multiplier, output_offset, output_multiplier,
+      output_activation_min, output_activation_max, output_shape_inference);
+  return true;
+}
+
+// This function picks some random DepthwiseConv params, which may or may not
+// be legal. If they're not legal, it returns false. If they're legal,
+// it runs the DepthwiseConv test and returns true. This allows the caller
+// to loop until a test has been run.
+bool TryTestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
+  // We have to pick a lot of positive values, where we are particularly
+  // interested in small values because they are most likely to be special
+  // cases in optimized implementations, and secondarily because they allow
+  // tests to run fast, which means we can run more tests and get more
+  // coverage.
+  const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
+  const int input_depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
+  const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+  const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+  const int filter_width = ExponentialRandomPositiveInt(0.9f, 4, 10);
+  const int filter_height = ExponentialRandomPositiveInt(0.9f, 4, 10);
+  const int depth_multiplier = ExponentialRandomPositiveInt(0.8f, 6, 50);
+  const int stride = ExponentialRandomPositiveInt(0.9f, 3, 8);
+  const int dilation_width_factor = RandomElement(std::vector<int>({1, 2, 4}));
+  const int dilation_height_factor = RandomElement(std::vector<int>({1, 2, 4}));
+  const auto padding_type =
+      UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
+
+  return TryTestDepthwiseConv(
+      forced_invocation, batch, input_depth, input_width, input_height,
+      filter_width, filter_height, depth_multiplier, stride,
+      dilation_width_factor, dilation_height_factor, padding_type);
+}
+
+// Tests parameters for the 3x3 filter kernel.
+bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
+  const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
+  const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
+  int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+  int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+  const int filter_width = 3;
+  const int filter_height = 3;
+  const int depth_multiplier = 1;
+  const int stride = UniformRandomInt(1, 2);
+  // We don't support dilations in the 3x3 filter.
+  const int dilation_width_factor = 1;
+  const int dilation_height_factor = 1;
+  const auto padding_type =
+      UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
+
+  // Adjust for, or reject, special cases.
+  if (forced_invocation != ForceKernelInvocation::kNone) {
+    // With stride == 2 and SAME, padding width and height are the left and top
+    // padding amounts. When there is an even input dimension, padding + 1 is
+    // required on the right / bottom. This is not handled by these kernels, so
+    // we bump the input dimensions.
+    if (padding_type == PaddingType::kSame && stride == 2) {
+      input_width = 2 * (input_width / 2) + 1;
+      input_height = 2 * (input_height / 2) + 1;
+    }
+
+    // The padded 3x3 kernel (with kSame) does not support input_width == 1 when
+    // input_height > 1, and vice versa.
+    if (padding_type == PaddingType::kSame &&
+        (input_width > 1) != (input_height > 1)) {
+      return false;
+    }
+  }
+
+  return TryTestDepthwiseConv(
+      forced_invocation, batch, input_depth, input_width, input_height,
+      filter_width, filter_height, depth_multiplier, stride,
+      dilation_width_factor, dilation_height_factor, padding_type);
+}
+
+// Tests with parameters suited to dot-product-NEON 3x3 filter kernels.
+bool TryTestOneNeonDot3x3(ForceKernelInvocation forced_invocation,
+                          bool test_stride, bool test_pad,
+                          bool test_depth_multiplier) {
+  const int batch = 1;
+  const int input_depth = test_depth_multiplier
+                              ? 1
+                              : 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
+  const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+  const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+  const int filter_width = 3;
+  const int filter_height = 3;
+  const int depth_multiplier =
+      test_depth_multiplier ? 8 * ExponentialRandomPositiveInt(0.8f, 1, 6) : 1;
+  const int stride = test_stride ? 2 : 1;
+  // We don't support dilations in the 3x3 filter.
+  const int dilation_width_factor = 1;
+  const int dilation_height_factor = 1;
+  const auto padding_type = test_pad ? PaddingType::kSame : PaddingType::kValid;
+
+  return TryTestDepthwiseConv(
+      forced_invocation, batch, input_depth, input_width, input_height,
+      filter_width, filter_height, depth_multiplier, stride,
+      dilation_width_factor, dilation_height_factor, padding_type);
+}
+
+void TestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
+  while (!TryTestOneDepthwiseConv(forced_invocation)) {
+  }
+}
+
+void TestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
+  while (!TryTestOneDepthwiseConv3x3Filter(forced_invocation)) {
+  }
+}
+
+void TestOneNeonDot3x3(ForceKernelInvocation forced_invocation,
+                       bool test_stride, bool test_pad,
+                       bool test_depth_multiplier) {
+  while (!TryTestOneNeonDot3x3(forced_invocation, test_stride, test_pad,
+                               test_depth_multiplier)) {
+  }
+}
+
+TEST(TestDepthwiseConv, TestDepthwiseConv) {
+  const int kTestsToRun = 10 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    TestOneDepthwiseConv(ForceKernelInvocation::kNone);
+  }
+}
+
+// Run basic coverage test against the generic kernel.
+TEST(TestDepthwiseConv, TestGenericKernel) {
+  const int kTestsToRun = 10 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    TestOneDepthwiseConv(ForceKernelInvocation::kUseGenericKernel);
+  }
+}
+
+TEST(TestDepthwiseConv, TestKernel3x3Filter) {
+  const int kTestsToRun = 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kNone);
+  }
+}
+
+// While the 3x3 coverage test is primarily targeted at specialized kernels, we
+// also run it against the generic kernel, optionally with fewer invocations.
+TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
+  const int kTestsToRun = 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kUseGenericKernel);
+  }
+}
+
+TEST(TestDepthwiseConv, TestNeon3x3Filter) {
+  const int kTestsToRun = 3 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kUseNeon3x3);
+  }
+}
+
+// No stride, no depth multiplier, no pad.
+TEST(TestDepthwiseConv, TestNeonDot3x3Plain) {
+  const int kTestsToRun = 3 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
+                      /*test_stride=*/false, /*test_pad=*/false,
+                      /*test_depth_multiplier=*/false);
+  }
+}
+
+TEST(TestDepthwiseConv, TestNeonDot3x3DepthMultiplier) {
+  const int kTestsToRun = 3 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
+                      /*test_stride=*/false, /*test_pad=*/false,
+                      /*test_depth_multiplier=*/true);
+  }
+}
+
+TEST(TestDepthwiseConv, TestNeonDot3x3Stride2) {
+  const int kTestsToRun = 3 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
+                      /*test_stride=*/true, /*test_pad=*/false,
+                      /*test_depth_multiplier=*/false);
+  }
+}
+
+TEST(TestDepthwiseConv, TestNeonDot3x3Pad1) {
+  const int kTestsToRun = 3 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
+                      /*test_stride=*/false, /*test_pad=*/true,
+                      /*test_depth_multiplier=*/false);
+  }
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/kernel_utils.cc b/tensorflow/lite/kernels/internal/kernel_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0836a3b662d356a0e068ce9403fdc44033afce2e
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/kernel_utils.cc
@@ -0,0 +1,321 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/internal/kernel_utils.h"
+
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+
+namespace tflite {
+namespace kernel_utils {
+
+void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
+                  const float* recurrent_weights_ptr, const float* bias_ptr,
+                  int input_size, int num_units, int batch_size,
+                  int output_batch_leading_dim,
+                  TfLiteFusedActivation activation,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
+  RnnBatchStep(input_ptr_batch, input_weights_ptr,
+               /*aux_input_ptr_batch=*/nullptr,
+               /*aux_input_weights_ptr=*/nullptr, recurrent_weights_ptr,
+               bias_ptr, input_size, /*aux_input_size=*/0, num_units,
+               batch_size, output_batch_leading_dim, activation,
+               hidden_state_ptr_batch, output_ptr_batch);
+}
+
+void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
+                  const float* aux_input_ptr_batch,
+                  const float* aux_input_weights_ptr,
+                  const float* recurrent_weights_ptr, const float* bias_ptr,
+                  int input_size, int aux_input_size, int num_units,
+                  int batch_size, int output_batch_leading_dim,
+                  TfLiteFusedActivation activation,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
+  // Since the output batch rows may not be contiguous (output_batch_leading_dim
+  // != n_output), we unroll the batched operations where this is the case.
+  if (output_batch_leading_dim == num_units) {
+    // Output = bias
+    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
+                                          output_ptr_batch);
+
+    // Output += input * input_weights
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
+        output_ptr_batch, /*result_stride=*/1);
+
+    // Output += aux_input * aux_input_weights (if they are not empty).
+    if (aux_input_size > 0) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          aux_input_weights_ptr, num_units, aux_input_size, aux_input_ptr_batch,
+          batch_size, output_ptr_batch, /*result_stride=*/1);
+    }
+
+    // Output += recurrent_weights * hidden_state
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
+        batch_size, output_ptr_batch, /*result_stride=*/1);
+
+    // Output = activation(Output) and update hidden_state
+    tensor_utils::ApplyActivationToVector(
+        output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
+    tensor_utils::CopyVector(output_ptr_batch, num_units * batch_size,
+                             hidden_state_ptr_batch);
+  } else {
+    // Output = bias
+    for (int k = 0; k < batch_size; k++) {
+      tensor_utils::CopyVector(bias_ptr, num_units,
+                               output_ptr_batch + k * output_batch_leading_dim);
+    }
+
+    // Output += input * input_weights
+    for (int k = 0; k < batch_size; k++) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          input_weights_ptr, num_units, input_size,
+          input_ptr_batch + k * input_size, /*n_batch=*/1,
+          output_ptr_batch + k * output_batch_leading_dim, /*result_stride=*/1);
+    }
+
+    // Output += aux_input * aux_input_weights (if they are not empty).
+    if (aux_input_size > 0) {
+      for (int k = 0; k < batch_size; k++) {
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            aux_input_weights_ptr, num_units, aux_input_size,
+            aux_input_ptr_batch + k * aux_input_size,
+            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
+            /*result_stride=*/1);
+      }
+    }
+
+    // Output += recurrent_weights * hidden_state
+    for (int k = 0; k < batch_size; k++) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          recurrent_weights_ptr, num_units, num_units,
+          hidden_state_ptr_batch + k * num_units,
+          /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
+          /*result_stride=*/1);
+    }
+
+    // Output = activation(Output) and update hidden_state
+    for (int k = 0; k < batch_size; k++) {
+      tensor_utils::ApplyActivationToVector(
+          output_ptr_batch + k * output_batch_leading_dim, num_units,
+          activation, output_ptr_batch + k * output_batch_leading_dim);
+      tensor_utils::CopyVector(output_ptr_batch + k * output_batch_leading_dim,
+                               num_units,
+                               hidden_state_ptr_batch + k * num_units);
+    }
+  }
+}
+
+void RnnBatchStep(
+    const float* input_ptr_batch, const int8_t* input_weights_ptr,
+    float input_weights_scale, const int8_t* recurrent_weights_ptr,
+    float recurrent_weights_scale, const float* bias_ptr, int input_size,
+    int num_units, int batch_size, int output_batch_leading_dim,
+    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
+    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
+    float* hidden_state_ptr_batch, float* output_ptr_batch) {
+  RnnBatchStep(input_ptr_batch, input_weights_ptr, input_weights_scale,
+               /*aux_input_ptr_batch=*/nullptr,
+               /*aux_input_weights_ptr=*/nullptr,
+               /*aux_input_weights_scale=*/0.0f, recurrent_weights_ptr,
+               recurrent_weights_scale, bias_ptr, input_size,
+               /*aux_input_size=*/0, num_units, batch_size,
+               output_batch_leading_dim, activation, quantized_input_ptr_batch,
+               /*aux_quantized_input_ptr_batch=*/nullptr,
+               quantized_hidden_state_ptr_batch, scaling_factors,
+               hidden_state_ptr_batch, output_ptr_batch);
+}
+
+void RnnBatchStep(
+    const float* input_ptr_batch, const int8_t* input_weights_ptr,
+    float input_weights_scale, const float* aux_input_ptr_batch,
+    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
+    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
+    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
+    int batch_size, int output_batch_leading_dim,
+    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
+    int8_t* aux_quantized_input_ptr_batch,
+    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
+    float* hidden_state_ptr_batch, float* output_ptr_batch) {
+  // Since the output batch rows may not be contiguous (output_batch_leading_dim
+  // != n_output), we unroll the batched operations where this is the case.
+  if (output_batch_leading_dim == num_units) {
+    // Output = bias
+    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
+                                          output_ptr_batch);
+
+    // Save quantization and matmul computation for all zero input.
+    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
+      // Quantize input from float to uint8 + quantization params (scaling
+      // factor).
+      float unused_min, unused_max;
+      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function)
+      // whichever is faster.
+      for (int b = 0; b < batch_size; ++b) {
+        const int offset = b * input_size;
+        tensor_utils::SymmetricQuantizeFloats(
+            input_ptr_batch + offset, input_size,
+            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
+            &scaling_factors[b]);
+        scaling_factors[b] *= input_weights_scale;
+      }
+
+      // Output += input * input_weights
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          input_weights_ptr, num_units, input_size, quantized_input_ptr_batch,
+          scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1);
+    }
+
+    if (aux_input_ptr_batch &&
+        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
+                                    batch_size * aux_input_size)) {
+      float unused_min, unused_max;
+      for (int b = 0; b < batch_size; ++b) {
+        const int offset = b * aux_input_size;
+        tensor_utils::SymmetricQuantizeFloats(
+            aux_input_ptr_batch + offset, aux_input_size,
+            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
+            &scaling_factors[b]);
+        scaling_factors[b] *= aux_input_weights_scale;
+      }
+
+      // Output += aux_input * aux_input_weights
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          aux_input_weights_ptr, num_units, aux_input_size,
+          aux_quantized_input_ptr_batch, scaling_factors, batch_size,
+          output_ptr_batch, /*result_stride=*/1);
+    }
+
+    // Save quantization and matmul computation for all zero input.
+    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
+                                    batch_size * num_units)) {
+      // Quantize hidden_state
+      float unused_min, unused_max;
+      for (int b = 0; b < batch_size; ++b) {
+        const int offset = b * num_units;
+        tensor_utils::SymmetricQuantizeFloats(
+            hidden_state_ptr_batch + offset, num_units,
+            quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
+            &scaling_factors[b]);
+        scaling_factors[b] *= recurrent_weights_scale;
+      }
+
+      // Output += recurrent_weights * hidden_state
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          recurrent_weights_ptr, num_units, num_units,
+          quantized_hidden_state_ptr_batch, scaling_factors, batch_size,
+          output_ptr_batch, /*result_stride=*/1);
+    }
+
+    // Output = activation(Output) and update hidden_state
+    tensor_utils::ApplyActivationToVector(
+        output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
+    tensor_utils::CopyVector(output_ptr_batch, num_units * batch_size,
+                             hidden_state_ptr_batch);
+  } else {
+    // Output = bias
+    for (int k = 0; k < batch_size; k++) {
+      tensor_utils::CopyVector(bias_ptr, num_units,
+                               output_ptr_batch + k * output_batch_leading_dim);
+    }
+
+    // Save quantization and matmul computation for all zero input.
+    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
+      // Quantize input from float to uint8 + quantization params (scaling
+      // factor).
+      float unused_min, unused_max;
+      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function)
+      // whichever is faster.
+      for (int b = 0; b < batch_size; ++b) {
+        const int offset = b * input_size;
+        tensor_utils::SymmetricQuantizeFloats(
+            input_ptr_batch + offset, input_size,
+            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
+            &scaling_factors[b]);
+        scaling_factors[b] *= input_weights_scale;
+      }
+
+      // Output += input * input_weights
+      for (int k = 0; k < batch_size; k++) {
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            input_weights_ptr, num_units, input_size,
+            quantized_input_ptr_batch + k * input_size, &scaling_factors[k],
+            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
+            /*result_stride=*/1);
+      }
+    }
+
+    if (aux_input_ptr_batch &&
+        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
+                                    batch_size * aux_input_size)) {
+      float unused_min, unused_max;
+      for (int b = 0; b < batch_size; ++b) {
+        const int offset = b * aux_input_size;
+        tensor_utils::SymmetricQuantizeFloats(
+            aux_input_ptr_batch + offset, aux_input_size,
+            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
+            &scaling_factors[b]);
+        scaling_factors[b] *= aux_input_weights_scale;
+      }
+
+      // Output += aux_input * aux_input_weights
+      for (int k = 0; k < batch_size; k++) {
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            aux_input_weights_ptr, num_units, aux_input_size,
+            aux_quantized_input_ptr_batch + k * aux_input_size,
+            &scaling_factors[k],
+            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
+            /*result_stride=*/1);
+      }
+    }
+
+    // Save quantization and matmul computation for all zero input.
+    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
+                                    batch_size * num_units)) {
+      // Quantize hidden_state
+      float unused_min, unused_max;
+      for (int b = 0; b < batch_size; ++b) {
+        const int offset = b * num_units;
+        tensor_utils::SymmetricQuantizeFloats(
+            hidden_state_ptr_batch + offset, num_units,
+            quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
+            &scaling_factors[b]);
+        scaling_factors[b] *= recurrent_weights_scale;
+      }
+
+      // Output += recurrent_weights * hidden_state
+      for (int k = 0; k < batch_size; k++) {
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            recurrent_weights_ptr, num_units, num_units,
+            quantized_hidden_state_ptr_batch + k * num_units,
+            &scaling_factors[k],
+            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
+            /*result_stride=*/1);
+      }
+    }
+
+    // Output = activation(Output) and update hidden_state
+    for (int k = 0; k < batch_size; k++) {
+      tensor_utils::ApplyActivationToVector(
+          output_ptr_batch + k * output_batch_leading_dim, num_units,
+          activation, output_ptr_batch + k * output_batch_leading_dim);
+      tensor_utils::CopyVector(output_ptr_batch + k * output_batch_leading_dim,
+                               num_units,
+                               hidden_state_ptr_batch + k * num_units);
+    }
+  }
+}
+
+}  // namespace kernel_utils
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/kernel_utils.h b/tensorflow/lite/kernels/internal/kernel_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebb91678fecd94e0ada3783d52660bf14b3e638b
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/kernel_utils.h
@@ -0,0 +1,89 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+
+namespace tflite {
+namespace kernel_utils {
+
+// Performs an RNN batch inference step for inputs specified by input_ptr_batch.
+// The RNN cell is specified by the pointers to its input and recurrent weights,
+// and biases, along with the input size, number of units, activation.
+//
+// The pointers to the hidden state and the output are updated as a result.
+//
+// The pointers with the suffix "_batch" point to data aligned in batch_major
+// order, and each step processes batch_size many inputs from input_ptr_batch,
+// and updates batch_size many outputs and hidden states.
+//
+// The output_batch_dim is output.shape[-1], i.e. the outermost dimension of the
+// output tensor, and in most cases will be equal to num_units. It is usually
+// not when we want to store the RNN output into a slice of the output tensor,
+// e.g. for bidirectional RNNs with merge_outputs. In this case, the batched
+// operations cannot be used since they assume that the batched outputs are
+// contiguous, and we manually loop over the batched outputs.
+void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
+                  const float* recurrent_weights_ptr, const float* bias_ptr,
+                  int input_size, int num_units, int batch_size,
+                  int output_batch_leading_dim,
+                  TfLiteFusedActivation activation,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch);
+
+// Same as above but includes an auxiliary input with the corresponding weights.
+void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
+                  const float* aux_input_ptr_batch,
+                  const float* aux_input_weights_ptr,
+                  const float* recurrent_weights_ptr, const float* bias_ptr,
+                  int input_size, int aux_input_size, int num_units,
+                  int batch_size, int output_batch_leading_dim,
+                  TfLiteFusedActivation activation,
+                  float* hidden_state_ptr_batch, float* output_ptr_batch);
+
+// Performs a quantized RNN batch inference step. Same as above, but for
+// quantization purposes, we also pass in quantized_hidden_state_ptr_batch and
+// quantized_input_ptr_batch pointers for temporary storage of the quantized
+// values of hidden_state_ptr_batch and input_ptr_batch, respectively.
+// These temporary storages are expected to be preallocated to the same size as
+// the respective pointers.
+// An additional preallocated temporary storage 'scaling_factors' (of size
+// batch_size) is used to store the scaling factors of the quantization (used
+// for recovery).
+// {input,recurrent}_weights_scale params are used for dequantization/recovery.
+void RnnBatchStep(
+    const float* input_ptr_batch, const int8_t* input_weights_ptr,
+    float input_weights_scale, const int8_t* recurrent_weights_ptr,
+    float recurrent_weights_scale, const float* bias_ptr, int input_size,
+    int num_units, int batch_size, int output_batch_leading_dim,
+    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
+    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
+    float* hidden_state_ptr_batch, float* output_ptr_batch);
+
+void RnnBatchStep(
+    const float* input_ptr_batch, const int8_t* input_weights_ptr,
+    float input_weights_scale, const float* aux_input_ptr_batch,
+    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
+    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
+    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
+    int batch_size, int output_batch_leading_dim,
+    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
+    int8_t* aux_quantized_input_ptr_batch,
+    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
+    float* hidden_state_ptr_batch, float* output_ptr_batch);
+
+}  // namespace kernel_utils
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
diff --git a/tensorflow/lite/kernels/internal/legacy_types.h b/tensorflow/lite/kernels/internal/legacy_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..c19a1adb90f48cee4730a315345480cbe710e651
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/legacy_types.h
@@ -0,0 +1,26 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_LEGACY_TYPES_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_LEGACY_TYPES_H_
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+// TODO(b/116772710): Insert legacy Dims<> code in here.
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_LEGACY_TYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc b/tensorflow/lite/kernels/internal/log_quantized_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc
rename to tensorflow/lite/kernels/internal/log_quantized_test.cc
index 8963abb9afd9d51473fe5a22d8e88d314b385ad9..8c39350ab1dd8996799e6539755f040399974106 100644
--- a/tensorflow/contrib/lite/kernels/internal/log_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/log_quantized_test.cc
@@ -27,9 +27,9 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc
rename to tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
index 2252ca1bcc2190d03de84c8389d586f6f423582f..889a726f3a915fb592511d34c036b9726542fee9 100644
--- a/tensorflow/contrib/lite/kernels/internal/logsoftmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
@@ -23,11 +23,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/test_util.h"
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/test_util.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/internal/mfcc.cc b/tensorflow/lite/kernels/internal/mfcc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fddd4c46b6094a5fb4a51ed326f5f8dabd781281
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/mfcc.cc
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <math.h>
+
+#include "tensorflow/lite/kernels/internal/mfcc.h"
+
+namespace tflite {
+namespace internal {
+
+const double kDefaultUpperFrequencyLimit = 4000;
+const double kDefaultLowerFrequencyLimit = 20;
+const double kFilterbankFloor = 1e-12;
+const int kDefaultFilterbankChannelCount = 40;
+const int kDefaultDCTCoefficientCount = 13;
+
+Mfcc::Mfcc()
+    : initialized_(false),
+      lower_frequency_limit_(kDefaultLowerFrequencyLimit),
+      upper_frequency_limit_(kDefaultUpperFrequencyLimit),
+      filterbank_channel_count_(kDefaultFilterbankChannelCount),
+      dct_coefficient_count_(kDefaultDCTCoefficientCount) {}
+
+bool Mfcc::Initialize(int input_length, double input_sample_rate) {
+  bool initialized = mel_filterbank_.Initialize(
+      input_length, input_sample_rate, filterbank_channel_count_,
+      lower_frequency_limit_, upper_frequency_limit_);
+  initialized &=
+      dct_.Initialize(filterbank_channel_count_, dct_coefficient_count_);
+  initialized_ = initialized;
+  return initialized;
+}
+
+void Mfcc::Compute(const std::vector<double>& spectrogram_frame,
+                   std::vector<double>* output) const {
+  if (!initialized_) {
+    // LOG(ERROR) << "Mfcc not initialized.";
+    return;
+  }
+  std::vector<double> working;
+  mel_filterbank_.Compute(spectrogram_frame, &working);
+  for (int i = 0; i < working.size(); ++i) {
+    double val = working[i];
+    if (val < kFilterbankFloor) {
+      val = kFilterbankFloor;
+    }
+    working[i] = log(val);
+  }
+  dct_.Compute(working, output);
+}
+
+}  // namespace internal
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc.h b/tensorflow/lite/kernels/internal/mfcc.h
similarity index 88%
rename from tensorflow/contrib/lite/kernels/internal/mfcc.h
rename to tensorflow/lite/kernels/internal/mfcc.h
index d8500ecdcf38e5dcfe9eb89915501678455b3dd9..8dae91efdeb542109c151f7e01ef4cf63f722e9c 100644
--- a/tensorflow/contrib/lite/kernels/internal/mfcc.h
+++ b/tensorflow/lite/kernels/internal/mfcc.h
@@ -15,13 +15,13 @@ limitations under the License.
 
 // Basic class for computing MFCCs from spectrogram slices.
 
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MFCC_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_MFCC_H_
 
 #include <vector>
 
-#include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h"
-#include "tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h"
+#include "tensorflow/lite/kernels/internal/mfcc_dct.h"
+#include "tensorflow/lite/kernels/internal/mfcc_mel_filterbank.h"
 
 namespace tflite {
 namespace internal {
@@ -75,4 +75,4 @@ class Mfcc {
 }  // namespace internal
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_MFCC_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc_dct.cc b/tensorflow/lite/kernels/internal/mfcc_dct.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/internal/mfcc_dct.cc
rename to tensorflow/lite/kernels/internal/mfcc_dct.cc
index b0b7d181bdcf01688a387f33a3e64fc904324b50..c249fdb020a3ac8e40c1b90db76993cd0864a94a 100644
--- a/tensorflow/contrib/lite/kernels/internal/mfcc_dct.cc
+++ b/tensorflow/lite/kernels/internal/mfcc_dct.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/kernels/internal/mfcc_dct.h"
+#include "tensorflow/lite/kernels/internal/mfcc_dct.h"
 
 #include <math.h>
 
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc_dct.h b/tensorflow/lite/kernels/internal/mfcc_dct.h
similarity index 86%
rename from tensorflow/contrib/lite/kernels/internal/mfcc_dct.h
rename to tensorflow/lite/kernels/internal/mfcc_dct.h
index a53f5cbd9bb70c7c9dd49672681140bb9cbd2f4f..f2947b506b2aedb21ec423173480074864680edc 100644
--- a/tensorflow/contrib/lite/kernels/internal/mfcc_dct.h
+++ b/tensorflow/lite/kernels/internal/mfcc_dct.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Basic minimal DCT class for MFCC speech processing.
 
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_DCT_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_DCT_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MFCC_DCT_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_MFCC_DCT_H_
 
 #include <vector>
 
@@ -40,4 +40,4 @@ class MfccDct {
 }  // namespace internal
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_DCT_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_MFCC_DCT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.cc b/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc
similarity index 99%
rename from tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.cc
rename to tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc
index c3deb33d91a47bfe54b7c84d2a615df2422f90cc..9748da39862edd7565fdb2bcce2ce92b9d767429 100644
--- a/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.cc
+++ b/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.cc
@@ -28,7 +28,7 @@ limitations under the License.
 // channels may end up with no contributing FFT bins.  The resulting mel
 // spectrum output will have some channels that are always zero.
 
-#include "tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h"
+#include "tensorflow/lite/kernels/internal/mfcc_mel_filterbank.h"
 
 #include <math.h>
 
diff --git a/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h b/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.h
similarity index 91%
rename from tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h
rename to tensorflow/lite/kernels/internal/mfcc_mel_filterbank.h
index c1db28243eea39a694b7613ac7144dce9b294897..53d05bff5f45e4c183389f3eccd2e5bf865a456f 100644
--- a/tensorflow/contrib/lite/kernels/internal/mfcc_mel_filterbank.h
+++ b/tensorflow/lite/kernels/internal/mfcc_mel_filterbank.h
@@ -15,8 +15,8 @@ limitations under the License.
 
 // Basic class for applying a mel-scale mapping to a power spectrum.
 
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_MEL_FILTERBANK_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_MEL_FILTERBANK_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MFCC_MEL_FILTERBANK_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_MFCC_MEL_FILTERBANK_H_
 
 #include <vector>
 
@@ -60,4 +60,4 @@ class MfccMelFilterbank {
 }  // namespace internal
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MFCC_MEL_FILTERBANK_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_MFCC_MEL_FILTERBANK_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/lite/kernels/internal/optimized/cpu_check.h
similarity index 89%
rename from tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
rename to tensorflow/lite/kernels/internal/optimized/cpu_check.h
index 934308ef291956babcfa288668354e924fb6cd5a..ac4ea7d6dae04532ac92f73b34f75096724a5137 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
+++ b/tensorflow/lite/kernels/internal/optimized/cpu_check.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_
 
 namespace tflite {
 
@@ -58,4 +58,4 @@ inline bool TestCPUFeatureNeon() { return false; }
                        : Portable##funcname(__VA_ARGS__)
 #endif
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..c77715de57990666b362b08dae7c21b9707d942c
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -0,0 +1,1075 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+// Implementation of float DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct FloatDepthwiseConvKernel {};
+
+#ifdef USE_NEON
+
+template <>
+struct FloatDepthwiseConvKernel<false, 8, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    int outp = 0;
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the inputs
+      float32x4_t input[4];
+      for (int i = 0; i < 4; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 16;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
+      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
+      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
+      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      float32x4_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 8;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<false, 2, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    const float32x2_t filters = vld1_f32(filter_ptr);
+    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
+    int outp = 0;
+    // Handle 8 output pixels at a time.
+    for (; outp <= num_output_pixels - 8; outp += 8) {
+      // Load the inputs
+      float32x4_t input[4];
+      for (int i = 0; i < 4; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 16;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the inputs
+      float32x4_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 8;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the inputs
+      const float32x4_t input = vld1q_f32(input_ptr);
+      input_ptr += 4;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+      // Multiply-accumulate
+      acc = vmlaq_f32(acc, input, filters_dup2);
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+    // Handle 1 output pixel at a time
+    for (; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      const float32x2_t input = vld1_f32(input_ptr);
+      input_ptr += 2;
+      // Load the accumulators from acc_buffer
+      float32x2_t acc = vld1_f32(acc_buffer_ptr);
+      // Multiply-accumulate
+      acc = vmla_f32(acc, input, filters);
+      // Store the accumulators back to acc_buffer
+      vst1_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 16 input channels at a time.
+      for (; ic <= input_depth - 16; ic += 16) {
+        // Load the filters
+        float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
+        float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
+        float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
+        float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
+        local_filter_ptr += 16;
+        // Load the inputs
+        float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
+        float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
+        float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
+        float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
+        local_input_ptr += 16;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+        float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+        float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+        float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+        // Multiply-accumulate
+        acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
+        acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
+        acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
+        acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
+        // Store the accumulators back to acc_buffer
+        vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+        vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+        vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+        vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+        acc_buffer_ptr += 16;
+      }
+      // Handle 4 input channels at a time.
+      for (; ic <= input_depth - 4; ic += 4) {
+        // Load the filters
+        float32x4_t filter;
+        filter = vld1q_f32(local_filter_ptr);
+        local_filter_ptr += 4;
+        // Load the inputs
+        float32x4_t input;
+        input = vld1q_f32(local_input_ptr);
+        local_input_ptr += 4;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc;
+        acc = vld1q_f32(acc_buffer_ptr);
+        // Multiply-accumulate
+        acc = vmlaq_f32(acc, input, filter);
+        // Store the accumulators back to acc_buffer
+        vst1q_f32(acc_buffer_ptr, acc);
+        acc_buffer_ptr += 4;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        const float input_val = *local_input_ptr++;
+        const float filter_val = *local_filter_ptr++;
+        *acc_buffer_ptr++ += filter_val * input_val;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 8> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 2 input channels at a time.
+      for (; ic <= input_depth - 2; ic += 2) {
+        // Load the filters
+        float32x4_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 16;
+        // Load the inputs
+        const float32x2_t input = vld1_f32(local_input_ptr);
+        local_input_ptr += 2;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
+        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
+        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
+        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        // Load the filters
+        float32x4_t filter[2];
+        for (int i = 0; i < 2; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 8;
+        // Load the inputs
+        const float input_val = *local_input_ptr++;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+        }
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 8;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+// Note this implementation is very slow for input_depths < 8
+// (e.g. comparable to reference implementation) see, specializations for
+// input_depth=3 below.
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 8 input channels at a time.
+      for (; ic <= input_depth - 8; ic += 8) {
+        // Load the filters
+        float32x4_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 16;
+        // Load the inputs
+        float32x4x2_t input_dup2[2];
+        for (int i = 0; i < 2; i++) {
+          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
+          input_dup2[i] = vzipq_f32(input, input);
+        }
+        local_input_ptr += 8;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
+        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
+        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
+        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      // Handle 4 input channels at a time.
+      for (; ic <= input_depth - 4; ic += 4) {
+        // Load the filters
+        float32x2_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
+        }
+        local_filter_ptr += 8;
+        // Load the inputs
+        const float32x4_t input = vld1q_f32(local_input_ptr);
+        local_input_ptr += 4;
+        // Load the accumulators from acc_buffer
+        float32x2_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
+        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
+        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
+        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+        }
+        acc_buffer_ptr += 8;
+      }
+      // Handle 2 input channels at a time.
+      for (; ic <= input_depth - 2; ic += 2) {
+        // Load the filters
+        const float32x4_t filter = vld1q_f32(local_filter_ptr);
+        local_filter_ptr += 4;
+        // Load the inputs
+        const float32x2_t input = vld1_f32(local_input_ptr);
+        local_input_ptr += 2;
+        // Load the accumulators from acc_buffer
+        float32x2_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
+        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+        }
+        acc_buffer_ptr += 4;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        // Load the inputs
+        const float input_val = *local_input_ptr++;
+        // Multiply-accumulate
+        for (int i = 0; i < 2; i++) {
+          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
+        }
+        local_filter_ptr += 2;
+        acc_buffer_ptr += 2;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 3, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x2_t filter[3];
+    for (int i = 0; i < 3; i++) {
+      filter[i] = vld1_f32(filter_ptr + 2 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float32x2_t input01 = vld1_f32(input_ptr);
+      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+      // Load the accumulators from acc_buffer
+      float32x2_t acc[3];
+      for (int i = 0; i < 3; i++) {
+        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+      }
+      // Multiply-accumulate for each input channel there 2 outputs
+      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
+      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
+      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 3; i++) {
+        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+      }
+      acc_buffer_ptr += 6;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 3, 4> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[3];
+    for (int i = 0; i < 3; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // NOTE: we only want 3 values, so we read it as two ops where
+      // the second op just duplicates the lane
+      const float32x2_t input01 = vld1_f32(input_ptr);
+      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[3];
+      for (int i = 0; i < 3; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate all outputs.
+      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
+      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
+      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 3; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 12;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 1, 8> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      const float input_val = *input_ptr;
+      input_ptr += input_ptr_increment;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 1, 32> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
+    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
+    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
+    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
+    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
+    float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
+    float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
+    float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      const float input_val = *input_ptr;
+      input_ptr += input_ptr_increment;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
+      float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
+      float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
+      float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
+      // Multiply-accumulate
+      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
+      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
+      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
+      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
+      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
+      acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
+      acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
+      acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
+      vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
+      vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
+      vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
+      acc_buffer_ptr += 32;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 1, 20> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
+    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
+    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
+    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
+    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      const float input_val = *input_ptr;
+      input_ptr += input_ptr_increment;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
+      // Multiply-accumulate
+      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
+      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
+      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
+      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
+      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
+      acc_buffer_ptr += 20;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 16> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      for (int ic = 0; ic < input_depth; ic++) {
+        // Load the filters
+        float32x4_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 16;
+        // Load the inputs
+        const float input_val = *local_input_ptr++;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+        }
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 8, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      float32x4_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 2, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    float32x2_t filter = vld1_f32(filter_ptr);
+    float32x4_t filter_x4 = vcombine_f32(filter, filter);
+    int outp = 0;
+
+    // Handle two output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the inputs
+      float32x2_t input_1 = vld1_f32(input_ptr);
+      input_ptr += input_ptr_increment;
+      float32x2_t input_2 = vld1_f32(input_ptr);
+      input_ptr += input_ptr_increment;
+      float32x4_t input = vcombine_f32(input_1, input_2);
+
+      // Load the accumulators from acc_buffer
+      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+
+      // Multiply-accumulate
+      acc = vmlaq_f32(acc, input, filter_x4);
+
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      float32x2_t input = vld1_f32(input_ptr);
+      input_ptr += input_ptr_increment;
+
+      // Load the accumulators from acc_buffer
+      float32x2_t acc = vld1_f32(acc_buffer_ptr);
+
+      // Multiply-accumulate
+      acc = vmla_f32(acc, input, filter);
+
+      // Store the accumulators back to acc_buffer
+      vst1_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 4, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    float32x4_t filter = vld1q_f32(filter_ptr);
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      float32x4_t input = vld1q_f32(input_ptr);
+      // Load the accumulators from acc_buffer
+      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+      // Multiply-accumulate
+      acc = vmlaq_f32(acc, input, filter);
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+#endif
+
+// Accumulates the effect of one row of the filter, on a segment of one row
+// of the output, accessing the corresponding one row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
+                                int input_depth, int input_width,
+                                const float* input_data, int pad_width,
+                                int depth_multiplier, int filter_width,
+                                const float* filter_data,
+                                int out_x_buffer_start, int out_x_buffer_end,
+                                int output_depth, float* acc_buffer) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
+#endif
+  // Sanity check parameters. This is important in particular to ensure
+  // that we keep the number of template instantiations minimal, so we don't
+  // increase binary size unnecessarily.
+  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+  static_assert(kFixedInputDepth || kAllowStrided, "");
+  TFLITE_DCHECK(stride == 1 || kAllowStrided);
+  if (kFixedInputDepth) {
+    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
+  }
+  if (kFixedDepthMultiplier) {
+    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
+  }
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  const int input_ptr_increment = stride * input_depth;
+  const float* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    // For the current (filter_x, filter_y) point in the filter,
+    // compute the boundaries of the corresponding output row segment.
+    int out_x_loop_start_unclampled = 0;
+    int out_x_loop_end_unclampled = 0;
+    if (kAllowStrided) {
+      if (stride == 2) {
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 1) / 2;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
+      } else if (stride == 4) {
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 3) / 4;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
+      } else {
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+        out_x_loop_end_unclampled = (pad_width + input_width -
+                                     dilation_factor * filter_x + stride - 1) /
+                                    stride;
+      }
+    } else {
+      out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
+      out_x_loop_end_unclampled =
+          pad_width + input_width - dilation_factor * filter_x;
+    }
+    // The kernel will have to iterate on the segment of the
+    // output row that starts at out_x_loop_start and out_x_loop_end.
+    const int out_x_loop_start =
+        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
+
+    float* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin =
+        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+    const float* input_ptr = input_data + in_x_origin * input_depth;
+    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
+                             kFixedDepthMultiplier>::Run(num_output_pixels,
+                                                         input_depth,
+                                                         depth_multiplier,
+                                                         input_ptr,
+                                                         input_ptr_increment,
+                                                         filter_base_ptr,
+                                                         acc_buffer_ptr);
+    filter_base_ptr += output_depth;
+  }
+}
+
+// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
+inline void FloatDepthwiseConvAccumRowGeneric(
+    int stride, int dilation_factor, int input_depth, int input_width,
+    const float* input_data, int pad_width, int depth_multiplier,
+    int filter_width, const float* filter_data, int out_x_buffer_start,
+    int out_x_buffer_end, int output_depth, float* acc_buffer) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
+  const float* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    const int out_x_loop_start = std::max(
+        out_x_buffer_start,
+        (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+    const int out_x_loop_end = std::min(
+        out_x_buffer_end,
+        (pad_width + input_width - dilation_factor * filter_x + stride - 1) /
+            stride);
+
+    float* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin =
+        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+    const float* input_ptr = input_data + in_x_origin * input_depth;
+    const int input_ptr_increment = (stride - 1) * input_depth;
+    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
+      const float* filter_ptr = filter_base_ptr;
+      for (int ic = 0; ic < input_depth; ++ic) {
+        const float input_val = *input_ptr++;
+        for (int m = 0; m < depth_multiplier; m++) {
+          const float filter_val = *filter_ptr++;
+          *acc_buffer_ptr++ += filter_val * input_val;
+        }
+      }
+      input_ptr += input_ptr_increment;
+    }
+    filter_base_ptr += output_depth;
+  }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+                                       const float* bias_data,
+                                       float* acc_buffer) {
+  // TODO(benoitjacob): This might need optimized specializations
+  // for small output_depth values, if that ever becomes an important
+  // case (like it was for some quantized DepthwiseConv cases).
+  for (int i = 0; i < num_output_pixels; i++) {
+    memcpy(acc_buffer + i * output_depth, bias_data,
+           sizeof(acc_buffer[0]) * output_depth);
+  }
+}
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& filter_shape,
+    const float* filter_data, const RuntimeShape& bias_shape,
+    const float* bias_data, const RuntimeShape& output_shape,
+    float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  static const int kAccBufferMaxSize = 4832;
+  float acc_buffer[kAccBufferMaxSize];
+  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
+  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
+                   kAccBufferActualSize);
+  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
+  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
+
+  // row_accum_func will point to the core accumulation function to be used
+  // for this DepthwiseConv op.
+  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
+  row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
+                                        FIXED_DEPTH_MULTIPLIER)           \
+  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
+      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
+      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
+    row_accum_func =                                                      \
+        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
+                                   FIXED_DEPTH_MULTIPLIER>;               \
+  }
+
+#ifdef USE_NEON
+  // We go over our list of kernels by decreasing order of preference
+  // for the cases where multiple kernels could apply.
+
+  // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+
+  // Next come the strided kernels: AllowStrided=true, fixed input depth.
+  // They are a bit less efficient, but allow stride!=1.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+  // Finally, the kernels allowing a variable input depth,
+  // these are the least efficient but most general kernels.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)
+
+#endif  // USE_NEON
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+  // No matching fast kernel found, use slow fallback.
+  if (!row_accum_func) {
+    row_accum_func = FloatDepthwiseConvAccumRowGeneric;
+  }
+
+  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
+  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
+  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
+
+  // Now that we have determined row_accum_func, we can start work.
+  float* output_ptr = output_data;
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
+      const int filter_y_start =
+          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
+                          dilation_height_factor);
+      const int filter_y_end =
+          std::min(filter_height,
+                   (input_height - in_y_origin + dilation_height_factor - 1) /
+                       dilation_height_factor);
+      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+           out_x_buffer_start += kOutputPixelsInAccBuffer) {
+        const int out_x_buffer_end = std::min(
+            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+        // We call a 'pixel' a group of activation that share all but the
+        // 'depth'/'channel' coordinate. num_output_pixels is the number of
+        // output pixels that we will accumulate in this loop iteration.
+        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+        // Initialize our local accumulator with the bias values, so we don't
+        // have to add them later.
+        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
+                                   acc_buffer);
+        // Accumulation loop. Most of the time should be spent in here.
+        for (int filter_y = filter_y_start; filter_y < filter_y_end;
+             ++filter_y) {
+          const int in_y = in_y_origin + dilation_height_factor * filter_y;
+          row_accum_func(
+              stride_width, dilation_width_factor, input_depth, input_width,
+              input_data + in_y * input_height_stride + b * input_batch_stride,
+              pad_width, depth_multiplier, filter_width,
+              filter_data + filter_y * filter_height_stride, out_x_buffer_start,
+              out_x_buffer_end, output_depth, acc_buffer);
+        }
+        // Finished accumulating. Now store to destination.
+        const int num_output_values = output_depth * num_output_pixels;
+        int i = 0;
+// TODO(benoitjacob) optimized code goes here
+#ifdef USE_NEON
+        // Handle 16 values at a time
+        for (; i <= num_output_values - 16; i += 16) {
+          float32x4_t acc[4];
+          for (int k = 0; k < 4; k++) {
+            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
+          }
+          for (int k = 0; k < 4; k++) {
+            acc[k] = vmaxq_f32(
+                vdupq_n_f32(output_activation_min),
+                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
+          }
+          for (int k = 0; k < 4; k++) {
+            vst1q_f32(output_ptr + 4 * k, acc[k]);
+          }
+          output_ptr += 16;
+        }
+        // Handle 4 values at a time
+        for (; i <= num_output_values - 4; i += 4) {
+          float32x4_t acc = vld1q_f32(acc_buffer + i);
+
+          acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
+                          vminq_f32(vdupq_n_f32(output_activation_max), acc));
+
+          vst1q_f32(output_ptr, acc);
+          output_ptr += 4;
+        }
+#endif
+        // Handle leftover values, one by one. This is very slow.
+        for (; i < num_output_values; i++) {
+          float acc = acc_buffer[i];
+          acc = std::max(output_activation_min,
+                         std::min(output_activation_max, acc));
+
+          *output_ptr++ = acc;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3dca799a7cca4a3048cd2d19477ba2b57fbcdac
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -0,0 +1,2001 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+// Implementation of quantized DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct QuantizedDepthwiseConvKernel {};
+
+#ifdef USE_NEON
+template <>
+struct QuantizedDepthwiseConvKernel<true, 8, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8x2_t filter_u8;
+    filter_u8.val[0] = vld1_u8(filter_ptr);
+    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
+                            vdupq_n_s16(filter_offset));
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4x2_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+      }
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += input_ptr_increment;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
+                                  vget_low_s16(input_dup2.val[i]));
+        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
+                                  vget_high_s16(input_dup2.val[i]));
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 8, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8[2];
+      for (int i = 0; i < 2; i++) {
+        input_u8[i] = vld1_u8(input_ptr + 8 * i);
+      }
+      input_ptr += 16;
+      int16x8_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+      }
+      for (int i = 0; i < 2; i++) {
+        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+      }
+      // Multiply-accumulate.
+      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
+      acc[1] =
+          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
+      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
+      acc[3] =
+          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle 1 output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[2];
+      acc[0] = vld1q_s32(acc_buffer_ptr);
+      acc[1] = vld1q_s32(acc_buffer_ptr + 4);
+
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Multiply-accumulate.
+      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
+      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc[0]);
+      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 4, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
+                                   vget_low_s16(input_dup2.val[i]));
+        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
+                                   vget_high_s16(input_dup2.val[i]));
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x4x2_t input_dup2 = vzip_s16(input, input);
+      // Multiply-accumulate
+      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
+      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 2, 8> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
+      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+    }
+    int outp = 0;
+    // Handle two output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[8];
+      for (int i = 0; i < 8; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Multiply-accumulate.
+      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
+      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
+      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
+      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
+      // Store the accumulators back to acc_buffer.
+      for (int i = 0; i < 8; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 32;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_ptr += 2;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+
+      // Store the accumulators back to acc_buffer.
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 2, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+      // Multiply-accumulate
+      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_ptr += 2;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
+      // Multiply-accumulate
+      acc = vmlal_s16(acc, filter, input_dup2);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 2, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 8 output pixels at a time.
+    for (; outp <= num_output_pixels - 8; outp += 8) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8[2];
+      for (int i = 0; i < 2; i++) {
+        input_u8[i] = vld1_u8(input_ptr + 8 * i);
+      }
+      input_ptr += 16;
+      int16x8_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+      }
+      for (int i = 0; i < 2; i++) {
+        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+      }
+
+      // Multiply-accumulate.
+      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
+      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
+      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
+      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
+      // Store the accumulators back to acc_buffer.
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
+      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
+      // Store the accumulators back to acc_buffer.
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc = vmlal_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer.
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+    // Handle 1 output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer.
+      int32x2_t acc = vld1_s32(acc_buffer_ptr);
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_ptr += 2;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+      // Store the accumulators back to acc_buffer.
+      vst1_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 1, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 8 output pixels at a time.
+    for (; outp <= num_output_pixels - 8; outp += 8) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+      // Multiply-accumulate
+      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x2_t acc = vld1_s32(acc_buffer_ptr);
+
+      // Load the inputs, add input_offset.
+      const uint32 input = *input_ptr++ + input_offset;
+
+      // Multiply-accumulate
+      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
+      // Store the accumulators back to acc_buffer
+      vst1_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 1, 4> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 8 output pixels at a time.
+    for (; outp <= num_output_pixels - 8; outp += 8) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[8];
+      for (int i = 0; i < 8; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+      // Multiply-accumulate
+      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
+      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
+      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
+      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
+      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
+      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
+      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
+      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
+
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 8; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 32;
+    }
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate
+      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
+      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
+      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
+      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
+
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+      // Load the inputs, add input_offset.
+      const uint32 input = *input_ptr++ + input_offset;
+
+      // Multiply-accumulate
+      acc = vmlal_n_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 4, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      int16x8_t input[2];
+      for (int i = 0; i < 2; i++) {
+        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
+        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      }
+      input_ptr += 16;
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[2 * i + 0] =
+            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
+        acc[2 * i + 1] =
+            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc;
+      acc = vld1q_s32(acc_buffer_ptr);
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Multiply-accumulate
+      acc = vmlal_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 4, 4> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
+      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+    }
+
+    int outp = 0;
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[8];
+      for (int i = 0; i < 8; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+      // Multiply-accumulate
+      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
+                              vget_low_s16(input), 0);
+      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
+                              vget_low_s16(input), 1);
+      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
+                              vget_low_s16(input), 2);
+      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
+                              vget_low_s16(input), 3);
+      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
+                              vget_high_s16(input), 0);
+      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
+                              vget_high_s16(input), 1);
+      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
+                              vget_high_s16(input), 2);
+      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
+                              vget_high_s16(input), 3);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 8; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 32;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate
+      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
+      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
+      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 0, 3> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // We will have to duplicate bytes in a NEON register, 3-fold.
+    // We will do that by register-level table-look-up using VTBL instructions.
+    // Here we prepare the registers containing the table-lookup indices.
+    static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
+                                                   {2, 3, 3, 3, 4, 4, 4, 5},
+                                                   {5, 5, 6, 6, 6, 7, 7, 7}};
+    uint8x8_t dup3_indices[3];
+    for (int i = 0; i < 3; i++) {
+      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
+    }
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const uint8* local_filter_ptr = filter_ptr;
+      const uint8* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 8 input channels at a time.
+      for (; ic <= input_depth - 8; ic += 8) {
+        // Load the filters, add filter_offset.
+        int16x8_t filter[3];
+        uint8x8x3_t filter_u8;
+        filter_u8.val[0] = vld1_u8(local_filter_ptr);
+        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
+        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
+        local_filter_ptr += 24;
+        for (int i = 0; i < 3; i++) {
+          const int16x8_t filter_s16 =
+              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
+          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+        }
+        // Load the inputs, duplicate 3-fold, add input_offset.
+        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+        local_input_ptr += 8;
+
+        uint8x8_t input_u8_dup3[3];
+        for (int i = 0; i < 3; i++) {
+          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
+        }
+        int16x8_t input_dup3[3];
+        for (int i = 0; i < 3; i++) {
+          const int16x8_t input_s16_dup3 =
+              vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
+          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
+        }
+        // Load the accumulators from acc_buffer
+        int32x4x3_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
+        }
+        // Multiply-accumulate
+        for (int j = 0; j < 3; j++) {
+          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
+                                    vget_low_s16(filter[j]));
+          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
+                                    vget_high_s16(filter[j]));
+        }
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
+        }
+        acc_buffer_ptr += 24;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        const int16 input_val = *local_input_ptr++ + input_offset;
+        for (int i = 0; i < 3; i++) {
+          const int16 filter_val = local_filter_ptr[i] + filter_offset;
+          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+        }
+        local_filter_ptr += 3;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 0, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const uint8* local_filter_ptr = filter_ptr;
+      const uint8* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 8 input channels at a time.
+      for (; ic <= input_depth - 8; ic += 8) {
+        // Load the filters, add filter_offset.
+        int16x8_t filter[2];
+        uint8x8x2_t filter_u8;
+        filter_u8.val[0] = vld1_u8(local_filter_ptr);
+        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
+        local_filter_ptr += 16;
+        for (int i = 0; i < 2; i++) {
+          const int16x8_t filter_s16 =
+              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
+          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+        }
+        // Load the inputs, add input_offset, duplicate 2-fold.
+        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+        local_input_ptr += 8;
+        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+        // Load the accumulators from acc_buffer.
+        int32x4x2_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+        }
+        // Multiply-accumulate.
+        for (int j = 0; j < 2; j++) {
+          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
+                                    vget_low_s16(input_dup2.val[j]));
+          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
+                                    vget_high_s16(input_dup2.val[j]));
+        }
+        // Store the accumulators back to acc_buffer.
+        for (int i = 0; i < 2; i++) {
+          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        // Load the inputs.
+        const int16 input_val = *local_input_ptr++ + input_offset;
+        for (int i = 0; i < 2; i++) {
+          const int16 filter_val = local_filter_ptr[i] + filter_offset;
+          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+        }
+        local_filter_ptr += 2;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 0, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const uint8* local_filter_ptr = filter_ptr;
+      const uint8* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 16 input channels at a time.
+      for (; ic <= input_depth - 16; ic += 16) {
+        // Load the filters, add filter_offset.
+        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
+        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
+        local_filter_ptr += 16;
+        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+        // Load the inputs, add input_offset.
+        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
+        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
+        local_input_ptr += 16;
+        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
+        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
+        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+        // Load the accumulators from acc_buffer
+        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
+        acc_1 =
+            vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
+        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
+        acc_3 =
+            vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
+        // Store the accumulators back to acc_buffer
+        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+        acc_buffer_ptr += 16;
+      }
+      // Handle 8 input channels at a time.
+      for (; ic <= input_depth - 8; ic += 8) {
+        // Load the filters, add filter_offset.
+        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
+        local_filter_ptr += 8;
+        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+        const int16x8_t filter =
+            vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+        // Load the inputs, add input_offset.
+        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+        local_input_ptr += 8;
+        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+        // Load the accumulators from acc_buffer
+        int32x4_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 8;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        const int16 input_val = *local_input_ptr++ + input_offset;
+        const int16 filter_val = *local_filter_ptr++ + filter_offset;
+        *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 16, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8[2];
+    for (int i = 0; i < 2; i++) {
+      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
+    }
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
+    }
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8[2];
+      for (int i = 0; i < 2; i++) {
+        input_u8[i] = vld1_u8(input_ptr + 8 * i);
+      }
+      input_ptr += input_ptr_increment;
+      int16x8_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+      }
+      for (int i = 0; i < 2; i++) {
+        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+      }
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
+                                   vget_low_s16(filter[i]));
+        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
+                                   vget_high_s16(filter[i]));
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 8, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 16> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8[2];
+    for (int i = 0; i < 2; i++) {
+      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
+    }
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
+    }
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      uint8 input_u8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16 input = static_cast<int16>(input_u8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[2 * i + 0] =
+            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
+        acc[2 * i + 1] =
+            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 32> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
+    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
+    uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
+    uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
+    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+    int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
+    int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
+    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+    filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
+    filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      uint8 input_u8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16 input = static_cast<int16>(input_u8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
+      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
+      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
+      // Multiply-accumulate
+      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
+      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
+      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
+      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
+      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
+      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
+      acc_buffer_ptr += 32;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 20> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+    // We load the first 16 bytes into filter_u8_{0,1} as usual.
+    // Then we load the 8 last bytes into filter_u8_x  (x for 'extra').
+    // This is redundant: the first 4 bytes of filter_u8_x are the same
+    // as the last 4 bytes of filter_u8_x.
+    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
+    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
+    uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
+    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+    int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
+    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+    filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      uint8 input_u8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16 input = static_cast<int16>(input_u8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+      // Multiply-accumulate
+      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+      acc_buffer_ptr += 20;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 8> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+    const int16x8_t filter = vaddq_s16(
+        vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      uint8 input_u8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16 input = static_cast<int16>(input_u8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
+      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 2, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+      // Load the inputs, add input_offset.
+      uint16x4_t input_u16 = vdup_n_u16(0);
+      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
+                                input_u16, 0);
+      input_ptr += input_ptr_increment;
+      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
+                                input_u16, 1);
+      input_ptr += input_ptr_increment;
+      const int16x4_t input_s16 = vreinterpret_s16_u16(
+          vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc = vmlal_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer.
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+
+    // Handle 1 output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer.
+      int32x2_t acc = vld1_s32(acc_buffer_ptr);
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_ptr += input_ptr_increment;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+      // Store the accumulators back to acc_buffer.
+      vst1_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 4, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    if (num_output_pixels <= 0) {
+      return;
+    }
+
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+
+    // Handle one output pixel at a time until second to the last pixel. Second
+    // to the last because we read eight input pixels while only processing
+    // four.
+    for (; outp < num_output_pixels - 1; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc;
+      acc = vld1q_s32(acc_buffer_ptr);
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += input_ptr_increment;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Multiply-accumulate
+      acc = vmlal_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+
+    // Handle the last output pixel.
+    // Load the accumulators from acc_buffer
+    int32x4_t acc;
+    acc = vld1q_s32(acc_buffer_ptr);
+
+    // Load the inputs, add input_offset.
+    uint8x8_t input_u8 = vdup_n_u8(0);
+    input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+    input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+    input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+    input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+    const int16x4_t input_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+    const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+    // Multiply-accumulate
+    acc = vmlal_s16(acc, filter, input);
+    // Store the accumulators back to acc_buffer
+    vst1q_s32(acc_buffer_ptr, acc);
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 12, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
+    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
+    int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+    int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+    filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
+    filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
+    int16x4_t filter_0 = vget_low_s16(filter_s16_0);
+    int16x4_t filter_1 = vget_high_s16(filter_s16_0);
+    int16x4_t filter_2 = vget_high_s16(filter_s16_1);
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8_0 = vld1_u8(input_ptr);
+      uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
+      input_ptr += input_ptr_increment;
+      int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
+      int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
+      input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+      input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+
+      // Multiply-accumulate
+      acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
+      acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
+      acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
+
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+
+      acc_buffer_ptr += 12;
+    }
+  }
+};
+#endif
+
+// Accumulates the effect of one row of the filter, on a segment of one row
+// of the output, accessing the corresponding one row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor,
+                                    int input_depth, int input_width,
+                                    const uint8* input_data, int16 input_offset,
+                                    int pad_width, int depth_multiplier,
+                                    int filter_width, const uint8* filter_data,
+                                    int16 filter_offset, int out_x_buffer_start,
+                                    int out_x_buffer_end, int output_depth,
+                                    int32* acc_buffer) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
+#endif
+  // Sanity check parameters. This is important in particular to ensure
+  // that we keep the number of template instantiations minimal, so we don't
+  // increase binary size unnecessarily.
+  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+  static_assert(kFixedInputDepth || kAllowStrided, "");
+  TFLITE_DCHECK(stride == 1 || kAllowStrided);
+  if (kFixedInputDepth) {
+    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
+  }
+  if (kFixedDepthMultiplier) {
+    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
+  }
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  const int input_ptr_increment = stride * input_depth;
+  const uint8* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    // For the current (filter_x, filter_y) point in the filter,
+    // compute the boundaries of the corresponding output row segment.
+    int out_x_loop_start_unclampled = 0;
+    int out_x_loop_end_unclampled = 0;
+    if (kAllowStrided) {
+      if (stride == 2) {
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 1) / 2;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
+      } else if (stride == 4) {
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 3) / 4;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
+      } else {
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+        out_x_loop_end_unclampled = (pad_width + input_width -
+                                     dilation_factor * filter_x + stride - 1) /
+                                    stride;
+      }
+    } else {
+      out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
+      out_x_loop_end_unclampled =
+          pad_width + input_width - dilation_factor * filter_x;
+    }
+    // The kernel will have to iterate on the segment of the
+    // output row that starts at out_x_loop_start and out_x_loop_end.
+    const int out_x_loop_start =
+        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
+
+    int32* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin =
+        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+    const uint8* input_ptr = input_data + in_x_origin * input_depth;
+    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+    QuantizedDepthwiseConvKernel<
+        kAllowStrided, kFixedInputDepth,
+        kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
+                                    depth_multiplier, input_ptr, input_offset,
+                                    input_ptr_increment, filter_base_ptr,
+                                    filter_offset, acc_buffer_ptr);
+    filter_base_ptr += output_depth;
+  }
+}
+
+// generic fallback of DepthwiseConvAccumRow, portable, non-templatized.
+inline void QuantizedDepthwiseConvAccumRowGeneric(
+    int stride, int dilation_factor, int input_depth, int input_width,
+    const uint8* input_data, int16 input_offset, int pad_width,
+    int depth_multiplier, int filter_width, const uint8* filter_data,
+    int16 filter_offset, int out_x_buffer_start, int out_x_buffer_end,
+    int output_depth, int32* acc_buffer) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
+  const uint8* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    const int out_x_loop_start = std::max(
+        out_x_buffer_start,
+        (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+    const int out_x_loop_end = std::min(
+        out_x_buffer_end,
+        (pad_width + input_width - dilation_factor * filter_x + stride - 1) /
+            stride);
+
+    int32* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin =
+        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+    const uint8* input_ptr = input_data + in_x_origin * input_depth;
+    const int input_ptr_increment = (stride - 1) * input_depth;
+    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
+      const uint8* filter_ptr = filter_base_ptr;
+      for (int ic = 0; ic < input_depth; ++ic) {
+        const int16 input_val = *input_ptr++ + input_offset;
+        for (int m = 0; m < depth_multiplier; m++) {
+          const int16 filter_val = *filter_ptr++ + filter_offset;
+          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+        }
+      }
+      input_ptr += input_ptr_increment;
+    }
+    filter_base_ptr += output_depth;
+  }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+                                       const int32* bias_data,
+                                       int32* acc_buffer) {
+  int i = 0;
+#ifdef USE_NEON
+  if (output_depth == 1) {
+    const int32x4_t b = vdupq_n_s32(bias_data[0]);
+    for (; i <= num_output_pixels - 16; i += 16) {
+      vst1q_s32(acc_buffer + i + 0, b);
+      vst1q_s32(acc_buffer + i + 4, b);
+      vst1q_s32(acc_buffer + i + 8, b);
+      vst1q_s32(acc_buffer + i + 12, b);
+    }
+    for (; i <= num_output_pixels - 4; i += 4) {
+      vst1q_s32(acc_buffer + i, b);
+    }
+  } else if (output_depth == 2) {
+    int32x4_t b = vdupq_n_s32(bias_data[0]);
+    b = vsetq_lane_s32(bias_data[1], b, 1);
+    b = vsetq_lane_s32(bias_data[1], b, 3);
+    for (; i <= num_output_pixels - 8; i += 8) {
+      vst1q_s32(acc_buffer + 2 * i + 0, b);
+      vst1q_s32(acc_buffer + 2 * i + 4, b);
+      vst1q_s32(acc_buffer + 2 * i + 8, b);
+      vst1q_s32(acc_buffer + 2 * i + 12, b);
+    }
+    for (; i <= num_output_pixels - 2; i += 2) {
+      vst1q_s32(acc_buffer + 2 * i, b);
+    }
+  } else if (output_depth == 4) {
+    const int32x4_t b = vld1q_s32(bias_data);
+    for (; i <= num_output_pixels - 4; i += 4) {
+      vst1q_s32(acc_buffer + 4 * i + 0, b);
+      vst1q_s32(acc_buffer + 4 * i + 4, b);
+      vst1q_s32(acc_buffer + 4 * i + 8, b);
+      vst1q_s32(acc_buffer + 4 * i + 12, b);
+    }
+    for (; i < num_output_pixels; i++) {
+      vst1q_s32(acc_buffer + 4 * i, b);
+    }
+  } else if (output_depth == 8) {
+    const int32x4_t b0 = vld1q_s32(bias_data);
+    const int32x4_t b1 = vld1q_s32(bias_data + 4);
+    for (; i <= num_output_pixels - 2; i += 2) {
+      vst1q_s32(acc_buffer + 8 * i + 0, b0);
+      vst1q_s32(acc_buffer + 8 * i + 4, b1);
+      vst1q_s32(acc_buffer + 8 * i + 8, b0);
+      vst1q_s32(acc_buffer + 8 * i + 12, b1);
+    }
+    for (; i < num_output_pixels; i++) {
+      vst1q_s32(acc_buffer + 8 * i + 0, b0);
+      vst1q_s32(acc_buffer + 8 * i + 4, b1);
+    }
+  } else if (output_depth == 16) {
+    const int32x4_t b0 = vld1q_s32(bias_data);
+    const int32x4_t b1 = vld1q_s32(bias_data + 4);
+    const int32x4_t b2 = vld1q_s32(bias_data + 8);
+    const int32x4_t b3 = vld1q_s32(bias_data + 12);
+    for (; i < num_output_pixels; i++) {
+      vst1q_s32(acc_buffer + 16 * i + 0, b0);
+      vst1q_s32(acc_buffer + 16 * i + 4, b1);
+      vst1q_s32(acc_buffer + 16 * i + 8, b2);
+      vst1q_s32(acc_buffer + 16 * i + 12, b3);
+    }
+  }
+#endif
+  for (; i < num_output_pixels; i++) {
+    memcpy(acc_buffer + i * output_depth, bias_data,
+           sizeof(acc_buffer[0]) * output_depth);
+  }
+}
+
+inline void DepthwiseConvGeneral(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+#ifdef USE_NEON
+  const bool shift_left = (output_shift > 0);
+  const int32 multiplier_power_of_two = shift_left ? (1 << output_shift) : 1;
+#endif
+
+  static const int kAccBufferMaxSize = 2048;
+  int32 acc_buffer[kAccBufferMaxSize];
+  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
+  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
+                   kAccBufferActualSize);
+  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
+  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
+
+  // row_accum_func will point to the core accumulation function to be used
+  // for this DepthwiseConv op.
+  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
+  row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
+                                        FIXED_DEPTH_MULTIPLIER)           \
+  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
+      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
+      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
+    row_accum_func =                                                      \
+        QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
+                                       FIXED_DEPTH_MULTIPLIER>;           \
+  }
+
+#ifdef USE_NEON
+  // We go over our list of kernels by decreasing order of preference
+  // for the cases where multiple kernels could apply.
+
+  // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
+
+  // Next come the strided kernels: AllowStrided=true, fixed input depth.
+  // They are a bit less efficient, but allow stride!=1.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+  // Finally, the kernels allowing a variable input depth,
+  // these are the least efficient but most general kernels.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
+#endif  // USE_NEON
+
+  // No matching fast kernel found, use slow fallback.
+  if (!row_accum_func) {
+    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
+  }
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
+  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
+  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
+
+  // Now that we have determined row_accum_func, we can start work.
+  uint8* output_ptr = output_data;
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
+      const int filter_y_start =
+          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
+                          dilation_height_factor);
+      const int filter_y_end =
+          std::min(filter_height,
+                   (input_height - in_y_origin + dilation_height_factor - 1) /
+                       dilation_height_factor);
+      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+           out_x_buffer_start += kOutputPixelsInAccBuffer) {
+        const int out_x_buffer_end = std::min(
+            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+        // We call a 'pixel' a group of activation that share all but the
+        // 'depth'/'channel' coordinate. num_output_pixels is the number of
+        // output pixels that we will accumulate in this loop iteration.
+        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+        // Initialize our local accumulator with the bias values, so we don't
+        // have to add them later.
+        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
+                                   acc_buffer);
+        // Accumulation loop. Most of the time should be spent in here.
+        for (int filter_y = filter_y_start; filter_y < filter_y_end;
+             ++filter_y) {
+          const int in_y = in_y_origin + dilation_height_factor * filter_y;
+          row_accum_func(
+              stride_width, dilation_width_factor, input_depth, input_width,
+              input_data + in_y * input_height_stride + b * input_batch_stride,
+              input_offset, pad_width, depth_multiplier, filter_width,
+              filter_data + filter_y * filter_height_stride, filter_offset,
+              out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
+        }
+        // Finished accumulating int32 values. Now need to convert them to
+        // the final 8bit form and store them.
+        gemmlowp::ScopedProfilingLabel label("downquantize+store");
+        const int num_output_values = output_depth * num_output_pixels;
+        int i = 0;
+#ifdef USE_NEON
+        using gemmlowp::RoundingDivideByPOT;
+        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+        const int32x4_t output_activation_min_vec =
+            vdupq_n_s32(output_activation_min);
+        const int32x4_t output_activation_max_vec =
+            vdupq_n_s32(output_activation_max);
+        // Handle 16 values at once.
+        // This allows us to issue 4 mutually independent int32
+        // multiplications (vqrdmulh), which should alleviate most of their
+        // high latency.
+        for (; i <= num_output_values - 16; i += 16) {
+          int32x4_t acc[4];
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
+          }
+
+          if (!shift_left) {
+            // Fixed-point multiplication.
+            for (int j = 0; j < 4; j++) {
+              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
+            }
+            for (int j = 0; j < 4; j++) {
+              acc[j] = RoundingDivideByPOT(acc[j], -output_shift);
+            }
+          } else {
+            // Fixed-point multiplication.
+            for (int j = 0; j < 4; j++) {
+              acc[j] = vmulq_n_s32(acc[j], multiplier_power_of_two);
+              acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
+            }
+          }
+          // Add the output offset.
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vaddq_s32(acc[j], output_offset_vec);
+          }
+          // Apply the activation function.
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
+          }
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vminq_s32(acc[j], output_activation_max_vec);
+          }
+          // Saturating cast to uint8 and store to destination.
+          int16x4_t acc_s16[4];
+          for (int j = 0; j < 4; j++) {
+            acc_s16[j] = vqmovn_s32(acc[j]);
+          }
+          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
+          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
+          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
+          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
+          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
+          output_ptr += 16;
+        }
+        // Handle 8 values at once.
+        // Not as good as 16 (now we're only issuing 2 mutually independent
+        // vqrdmulh instructions, so we're probably paying for their high
+        // latency).
+        for (; i <= num_output_values - 8; i += 8) {
+          int32x4_t acc0 = vld1q_s32(acc_buffer + i);
+          int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
+          if (!shift_left) {
+            // Fixed-point multiplication.
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+            // Rounding right shift.
+            acc0 = RoundingDivideByPOT(acc0, -output_shift);
+            acc1 = RoundingDivideByPOT(acc1, -output_shift);
+          } else {
+            // Fixed-point multiplication.
+            acc0 = vmulq_n_s32(acc0, multiplier_power_of_two);
+            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+
+            acc1 = vmulq_n_s32(acc1, multiplier_power_of_two);
+            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+          }
+          // Add the output offset.
+          acc0 = vaddq_s32(acc0, output_offset_vec);
+          acc1 = vaddq_s32(acc1, output_offset_vec);
+          // Apply the activation function.
+          acc0 = vmaxq_s32(acc0, output_activation_min_vec);
+          acc1 = vmaxq_s32(acc1, output_activation_min_vec);
+          acc0 = vminq_s32(acc0, output_activation_max_vec);
+          acc1 = vminq_s32(acc1, output_activation_max_vec);
+          // Saturating cast to uint8 and store to destination.
+          const int16x4_t acc0_s16 = vqmovn_s32(acc0);
+          const int16x4_t acc1_s16 = vqmovn_s32(acc1);
+          const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
+          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
+          vst1_u8(output_ptr, res_u8);
+          output_ptr += 8;
+        }
+        // Handle 4 values at once. Now we're paying the full price of the
+        // high latency of vqrdmulh. Also, storing only 4 bytes at the end
+        // (without any alignment) can only be done 1 byte at a time.
+        // Yet, that is still worth doing to minimize the amount of leftover
+        // that will have to go through the very slow scalar code.
+        for (; i <= num_output_values - 4; i += 4) {
+          int32x4_t acc = vld1q_s32(acc_buffer + i);
+          if (!shift_left) {
+            // Fixed-point multiplication.
+            acc = vqrdmulhq_n_s32(acc, output_multiplier);
+            // Rounding right shift.
+            acc = RoundingDivideByPOT(acc, -output_shift);
+          } else {
+            // Fixed-point multiplication.
+            acc = vmulq_n_s32(acc, multiplier_power_of_two);
+            acc = vqrdmulhq_n_s32(acc, output_multiplier);
+          }
+          // Add the output offset.
+          acc = vaddq_s32(acc, output_offset_vec);
+          // Apply the activation function.
+          acc = vmaxq_s32(acc, output_activation_min_vec);
+          acc = vminq_s32(acc, output_activation_max_vec);
+          // Saturating cast to uint8 and store to destination.
+          const int16x4_t acc_s16 = vqmovn_s32(acc);
+          const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
+          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
+          vst1_lane_u8(output_ptr + 0, res_u8, 0);
+          vst1_lane_u8(output_ptr + 1, res_u8, 1);
+          vst1_lane_u8(output_ptr + 2, res_u8, 2);
+          vst1_lane_u8(output_ptr + 3, res_u8, 3);
+          output_ptr += 4;
+        }
+#endif  // USE_NEON
+
+        // Handle leftover values, one by one. This is very slow.
+        for (; i < num_output_values; i++) {
+          int32 acc = acc_buffer[i];
+          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                              output_shift);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          *output_ptr++ = static_cast<uint8>(acc);
+        }
+      }
+    }
+  }
+}
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
+  const int depth_multiplier = params.depth_multiplier;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  TFLITE_DCHECK_GE(dilation_width_factor, 1);
+  TFLITE_DCHECK_GE(dilation_height_factor, 1);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_depth = input_shape.Dims(3);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
+// Jetson TX-2. This compiler does not support the offsetof() macro.
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int output_shift = params.output_shift;
+
+  // Call kernel optimized for depthwise convolutions using 3x3 filters if
+  // parameters are supported.
+  if (Fast3x3FilterKernelSupported(
+          input_shape, filter_shape, stride_width, stride_height,
+          dilation_width_factor, dilation_height_factor, pad_width, pad_height,
+          depth_multiplier, output_shape, output_shift)) {
+    DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape,
+                           filter_data, bias_shape, bias_data, output_shape,
+                           output_data);
+    return;
+  }
+#endif
+
+  DepthwiseConvGeneral(params, input_shape, input_data, filter_shape,
+                       filter_data, bias_shape, bias_data, output_shape,
+                       output_data);
+}
+
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
similarity index 96%
rename from tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
rename to tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 4809ddd02a60e7166f32ec10f70f79ac8a20d378..5859bcaed4ac2b991ca22e7d9c17d34d3267a120 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -12,26 +12,58 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
 
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace optimized_ops {
 
+// See CategorizeDotProductKernel for definitive taxonomy.
+enum class DotProduct3x3KernelType {
+  kNone = 0,  // Parameter combination is not supported for dot product kernels.
+  kPlain,
+  kWithDepthMultiplication,
+  kWithPad0Stride2,
+  kWithPad1Stride1,
+};
+
+inline DotProduct3x3KernelType CategorizeDotProductKernel(
+    const DepthwiseParams& params) {
+  const int padding = params.padding_values.width;
+  const int stride = params.stride_width;
+  if (padding != params.padding_values.height ||
+      stride != params.stride_height) {
+    return DotProduct3x3KernelType::kNone;
+  }
+
+  if (params.depth_multiplier == 1) {
+    if (padding == 0 && stride == 1) {
+      return DotProduct3x3KernelType::kPlain;
+    } else if (padding == 0 && stride == 2) {
+      return DotProduct3x3KernelType::kWithPad0Stride2;
+    } else if (padding == 1 && stride == 1) {
+      return DotProduct3x3KernelType::kWithPad1Stride1;
+    } else {
+      return DotProduct3x3KernelType::kNone;
+    }
+  } else {
+    if (padding == 0 && stride == 1) {
+      return DotProduct3x3KernelType::kWithDepthMultiplication;
+    } else {
+      return DotProduct3x3KernelType::kNone;
+    }
+  }
+}
+
 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
 // Jetson TX-2. This compiler does not support the offsetof() macro.
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
 #include <stddef.h>
-// clang-format gets confused with this file and ends up formatting lines to
-// be larger than 80 characters. Turn off here and back on at the end of the
-// file.
-
-// clang-format off
 
 #define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64
 
@@ -83,42 +115,58 @@ struct DepthwiseConvParams {
 #define OFFSET_OUTPUT_WIDTH 84
 #define OFFSET_OUTPUT_HEIGHT 88
 
-static_assert(offsetof(DepthwiseConvParams, input_depth) ==
-                  OFFSET_INPUT_DEPTH, "");
+static_assert(offsetof(DepthwiseConvParams, input_depth) == OFFSET_INPUT_DEPTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, input_row_size) ==
-                  OFFSET_INPUT_ROW_SIZE, "");
+                  OFFSET_INPUT_ROW_SIZE,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_depth) ==
-                  OFFSET_OUTPUT_DEPTH, "");
+                  OFFSET_OUTPUT_DEPTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_row_size) ==
-                  OFFSET_OUTPUT_ROW_SIZE, "");
+                  OFFSET_OUTPUT_ROW_SIZE,
+              "");
 static_assert(offsetof(DepthwiseConvParams, filter_row_size) ==
-                  OFFSET_FILTER_ROW_SIZE, "");
+                  OFFSET_FILTER_ROW_SIZE,
+              "");
 static_assert(offsetof(DepthwiseConvParams, input_offset) ==
-                  OFFSET_INPUT_OFFSET, "");
+                  OFFSET_INPUT_OFFSET,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_offset) ==
-                  OFFSET_OUTPUT_OFFSET, "");
+                  OFFSET_OUTPUT_OFFSET,
+              "");
 static_assert(offsetof(DepthwiseConvParams, filter_offset) ==
-                  OFFSET_FILTER_OFFSET, "");
+                  OFFSET_FILTER_OFFSET,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_multiplier) ==
-                  OFFSET_OUTPUT_MULTIPLIER, "");
+                  OFFSET_OUTPUT_MULTIPLIER,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_activation_min) ==
-                  OFFSET_OUTPUT_ACTIVATION_MIN, "");
+                  OFFSET_OUTPUT_ACTIVATION_MIN,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_activation_max) ==
-                  OFFSET_OUTPUT_ACTIVATION_MAX, "");
+                  OFFSET_OUTPUT_ACTIVATION_MAX,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_right_shift) ==
-                  OFFSET_OUTPUT_RIGHT_SHIFT, "");
-static_assert(offsetof(DepthwiseConvParams, input_width) ==
-                  OFFSET_INPUT_WIDTH, "");
+                  OFFSET_OUTPUT_RIGHT_SHIFT,
+              "");
+static_assert(offsetof(DepthwiseConvParams, input_width) == OFFSET_INPUT_WIDTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, input_height) ==
-                  OFFSET_INPUT_HEIGHT, "");
+                  OFFSET_INPUT_HEIGHT,
+              "");
 static_assert(offsetof(DepthwiseConvParams, stride_width) ==
-                  OFFSET_STRIDE_WIDTH, "");
+                  OFFSET_STRIDE_WIDTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, stride_height) ==
-                  OFFSET_STRIDE_HEIGHT, "");
+                  OFFSET_STRIDE_HEIGHT,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_width) ==
-                  OFFSET_OUTPUT_WIDTH, "");
+                  OFFSET_OUTPUT_WIDTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_height) ==
-                  OFFSET_OUTPUT_HEIGHT, "");
+                  OFFSET_OUTPUT_HEIGHT,
+              "");
 
 template <int32 kDepth, int32 kStrideWidth, int32 kStrideHeight>
 struct DepthwiseConvWindow {};
@@ -127,10 +175,10 @@ template <>
 struct DepthwiseConvWindow<8, 1, 1> {
  public:
   static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
-                  const int32* bias_ptr, uint8* output_ptr, int64_t input_depth,
-                  int64_t input_row_size, int32 output_window_height,
-                  int32 output_window_width,
-                  const DepthwiseConvParams* params_ptr) {
+                         const int32* bias_ptr, uint8* output_ptr,
+                         int64_t input_depth, int64_t input_row_size,
+                         int32 output_window_height, int32 output_window_width,
+                         const DepthwiseConvParams* params_ptr) {
     const int64_t input_width_increment = 2 * input_depth;
     const int64_t input_height_increment = 2 * input_row_size;
     const int64_t output_height_increment = 2 * params_ptr->output_row_size;
@@ -1110,10 +1158,10 @@ struct DepthwiseConvWindow<8, 1, 1> {
 template <>
 struct DepthwiseConvWindow<8, 2, 2> {
   static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
-                  const int32* bias_ptr, uint8* output_ptr, int64_t input_depth,
-                  int64_t input_row_size, int32 output_window_height,
-                  int32 output_window_width,
-                  const DepthwiseConvParams* params_ptr) {
+                         const int32* bias_ptr, uint8* output_ptr,
+                         int64_t input_depth, int64_t input_row_size,
+                         int32 output_window_height, int32 output_window_width,
+                         const DepthwiseConvParams* params_ptr) {
     const int64_t input_width_increment = 4 * input_depth;
     const int64_t input_height_increment = 4 * input_row_size;
     const int64_t output_height_increment = 2 * params_ptr->output_row_size;
@@ -2953,11 +3001,10 @@ struct ShuffleParams {
   ShuffleParams() = default;
   ShuffleParams(int32 output_width, int32 output_height, int32 stride_width,
                 int32 stride_height)
-  : output_width(output_width)
-  , output_height(output_height)
-  , input_width(get_shuffle_input_size(stride_width, output_width))
-  , input_height(get_shuffle_input_size(stride_height, output_height)) {
-  }
+      : output_width(output_width),
+        output_height(output_height),
+        input_width(get_shuffle_input_size(stride_width, output_width)),
+        input_height(get_shuffle_input_size(stride_height, output_height)) {}
 };
 
 template <int32 kStrideWidth, int32 kStrideHeight>
@@ -2966,10 +3013,10 @@ struct DepthwiseConvThroughDepth {
   // |start_depth| to |end_depth|. Keep this not inlined to maintain a small
   // binary size. We use a DepthwiseConvParams struct for read only params
   // to minimize call overhead.
-  static __attribute__((noinline)) void Run(const uint8* input_ptr,
-      const uint8* filter_ptr, const int32* bias_ptr, uint8* output_ptr,
-      int64_t start_depth, int64_t end_depth, int64_t input_depth,
-      int64_t input_row_size, int32 output_window_height,
+  static __attribute__((noinline)) void Run(
+      const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
+      uint8* output_ptr, int64_t start_depth, int64_t end_depth,
+      int64_t input_depth, int64_t input_row_size, int32 output_window_height,
       int32 output_window_width, const DepthwiseConvParams& params) {
     for (; start_depth <= end_depth - 8; start_depth += 8) {
       DepthwiseConvWindow<8, kStrideWidth, kStrideHeight>::Run(
@@ -2992,12 +3039,15 @@ struct DepthwiseConvMultiRow {
                          uint8* output_data, const DepthwiseConvParams& params,
                          const ShuffleParams& shuffle_params,
                          uint8* shuffle_workspace) {
-    TFLITE_DCHECK(shuffle_params.input_height ==
+    TFLITE_DCHECK(
+        shuffle_params.input_height ==
         get_shuffle_input_size(kStrideHeight, shuffle_params.output_height));
-    TFLITE_DCHECK(shuffle_params.input_width ==
+    TFLITE_DCHECK(
+        shuffle_params.input_width ==
         get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
-    TFLITE_DCHECK(64 * shuffle_params.input_width * shuffle_params.input_height
-                  <= DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE);
+    TFLITE_DCHECK(64 * shuffle_params.input_width *
+                      shuffle_params.input_height <=
+                  DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE);
 
     int32 out_x = start_x;
 
@@ -3008,7 +3058,7 @@ struct DepthwiseConvMultiRow {
     if (params.output_depth > 64 ||
         (params.output_depth <= 64 && params.input_width > 150)) {
       for (; out_x <= (end_x - shuffle_params.output_width);
-             out_x += shuffle_params.output_width) {
+           out_x += shuffle_params.output_width) {
         const uint8* input_ptr = input_data;
         const int32* bias_ptr = bias_data;
         const uint8* filter_ptr = filter_data;
@@ -3054,8 +3104,8 @@ struct DepthwiseConvMultiRow {
         }
 
         // Handle leftover depth.
-        ConvKernel::Run(input_ptr, filter_ptr, bias_ptr, output_ptr,
-                        depth, params.output_depth, params.input_depth,
+        ConvKernel::Run(input_ptr, filter_ptr, bias_ptr, output_ptr, depth,
+                        params.output_depth, params.input_depth,
                         params.input_row_size, shuffle_params.output_height,
                         shuffle_params.output_width, params);
 
@@ -3082,13 +3132,15 @@ struct DepthwiseConvMultiRow {
 //   * Horizontal edges.
 //   * Vertical edges.
 inline void DepthwiseConvHandlePadding(const uint8* input_data,
-    const uint8* filter_data, const int32* bias_data, uint8* output_data,
-    const DepthwiseConvParams& params) {
+                                       const uint8* filter_data,
+                                       const int32* bias_data,
+                                       uint8* output_data,
+                                       const DepthwiseConvParams& params) {
   if (params.input_width == 1 && params.input_height == 1) {
-    const uint8* filter_ptr = filter_data + params.filter_row_size
-        + params.output_depth;
-    DepthwiseConvPartial<EdgeType::kCenter, 1, 1>::Run(input_data, filter_ptr,
-        bias_data, output_data, &params);
+    const uint8* filter_ptr =
+        filter_data + params.filter_row_size + params.output_depth;
+    DepthwiseConvPartial<EdgeType::kCenter, 1, 1>::Run(
+        input_data, filter_ptr, bias_data, output_data, &params);
     return;
   }
 
@@ -3099,27 +3151,27 @@ inline void DepthwiseConvHandlePadding(const uint8* input_data,
 
   // Handle top row.
   const uint8* input_ptr = input_data;
-  const uint8* filter_ptr = filter_data + params.filter_row_size
-      + params.output_depth;
+  const uint8* filter_ptr =
+      filter_data + params.filter_row_size + params.output_depth;
   uint8* output_ptr = output_data;
 
-  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
-      bias_data, output_ptr, &params);
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
 
   input_ptr += (params.stride_width - 1) * params.input_depth;
   filter_ptr = filter_data + params.filter_row_size;
   output_ptr += params.output_depth;
 
   for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
-           out_x++) {
+       out_x++) {
     DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1>::Run(
         input_ptr, filter_ptr, bias_data, output_ptr, &params);
     input_ptr += params.stride_width * params.input_depth;
     output_ptr += params.output_depth;
   }
 
-  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
-      bias_data, output_ptr, &params);
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
 
   // Handle left side.
   input_ptr = input_data + (params.stride_width - 1) * params.input_row_size;
@@ -3127,7 +3179,7 @@ inline void DepthwiseConvHandlePadding(const uint8* input_data,
   output_ptr = output_data + params.output_row_size;
 
   for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
-           out_y++) {
+       out_y++) {
     DepthwiseConvPartial<EdgeType::kVertical, 1, 1>::Run(
         input_ptr, filter_ptr, bias_data, output_ptr, &params);
     input_ptr += params.stride_width * params.input_row_size;
@@ -3135,14 +3187,14 @@ inline void DepthwiseConvHandlePadding(const uint8* input_data,
   }
 
   // Handle right side.
-  input_ptr = input_data + (params.input_width - 2) * params.input_depth
-      + (params.stride_width - 1) * params.input_row_size;
+  input_ptr = input_data + (params.input_width - 2) * params.input_depth +
+              (params.stride_width - 1) * params.input_row_size;
   filter_ptr = filter_data;
   output_ptr = output_data + params.output_row_size +
-      (params.output_width - 1) * params.output_depth;
+               (params.output_width - 1) * params.output_depth;
 
   for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
-         out_y++) {
+       out_y++) {
     DepthwiseConvPartial<EdgeType::kVertical, 1, 1>::Run(
         input_ptr, filter_ptr, bias_data, output_ptr, &params);
     input_ptr += params.stride_width * params.input_row_size;
@@ -3152,26 +3204,26 @@ inline void DepthwiseConvHandlePadding(const uint8* input_data,
   // Handle bottom row.
   input_ptr = input_data + (params.input_height - 2) * params.input_row_size;
   filter_ptr = filter_data + params.output_depth;
-  output_ptr = output_data +
-      (params.output_height - 1) * params.output_row_size;
+  output_ptr =
+      output_data + (params.output_height - 1) * params.output_row_size;
 
-  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
-      bias_data, output_ptr, &params);
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
 
   input_ptr += (params.stride_width == 1) ? 0 : params.input_depth;
   filter_ptr = filter_data;
   output_ptr += params.output_depth;
 
   for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
-           out_x++) {
+       out_x++) {
     DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1>::Run(
         input_ptr, filter_ptr, bias_data, output_ptr, &params);
     input_ptr += params.stride_width * params.input_depth;
     output_ptr += params.output_depth;
   }
 
-  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
-      bias_data, output_ptr, &params);
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
 }
 
 inline bool Fast3x3FilterKernelSupported(
@@ -3346,8 +3398,8 @@ inline void DepthwiseConv3x3Filter(
       const int in_x = (out_x * stride_width) - pad_width;
       const int in_y = (out_y * stride_height) - pad_height;
       input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
-      output_ptr += out_y * params.output_row_size
-          + out_x * params.output_depth;
+      output_ptr +=
+          out_y * params.output_row_size + out_x * params.output_depth;
     }
 
     // Shuffling shapes that maximize width over the shuffle workspace size
@@ -3402,11 +3454,10 @@ inline void DepthwiseConv3x3Filter(
     }
   }
 }
-// clang-format on
 
 #endif  // __aarch64__
 
 }  // namespace optimized_ops
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_3X3_FILTER_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
similarity index 95%
rename from tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
rename to tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
index ce3cde76999c77e1f9bf1eaccdba7e84ed508dda..29e3f534a38d4295381bb5a013612cce020704df 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
+++ b/tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
@@ -16,8 +16,8 @@ limitations under the License.
 // Copied from tensorflow/core/kernels/eigen_spatial_convolutions.h.
 // TODO(petewarden) - move this to a common location in Eigen itself.
 
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
 
 #define EIGEN_USE_CUSTOM_THREAD_POOL
 #define EIGEN_USE_THREADS
@@ -32,9 +32,9 @@ limitations under the License.
 #define TFLITE_REDUCE_INSTANTIATIONS_OPEN_SOURCE
 #define Eigen EigenForTFLite
 #if defined(TFLITE_REDUCE_INSTANTIATIONS_GOOGLE)
-#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h"
+#include "tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h"
 #elif defined(TFLITE_REDUCE_INSTANTIATIONS_OPEN_SOURCE)
-#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h"
+#include "tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h"
 #else
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #endif
@@ -226,4 +226,4 @@ EIGEN_DEVICE_FUNC
 
 // clang-format on
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
similarity index 78%
rename from tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
rename to tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
index 6443f425b7d6436d2f4c5b98d5512875785864dc..6461a5e5426f9eaffb0fadb2b7e5b2f3e2848254 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
+++ b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
@@ -12,25 +12,55 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
 
-#define EIGEN_USE_CUSTOM_THREAD_POOL
-#define EIGEN_USE_THREADS
+// This is essentially unsupported/CXX11/Eigen/Tensor.h
+// TODO(petewarden) - move this to a common location in Eigen itself.
 
 // clang-format off
 
-#include <stdint.h>
 
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+
+
+#include "Eigen/Core"
+
+#if defined(EIGEN_USE_SYCL)
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <utility>
+#endif
+#include <cmath>
 #include <cstddef>
 #include <cstring>
-#include <cmath>
+
+
+
+
+
+#ifdef _WIN32
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#include <windows.h>
+#else
+#include <stdint.h>
+#include <unistd.h>
+#endif
+
+#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
 #include <random>
-#include <atomic>
-#include <condition_variable>  // NOLINT(build/c++11)
-#include <mutex>  // NOLINT(build/c++11)
-#include <thread>  // NOLINT(build/c++11)
-#include <functional>
+#endif
 
 #ifdef _WIN32
 #include <windows.h>
@@ -40,58 +70,53 @@ limitations under the License.
 #include <time.h>
 #endif
 
+// #if defined(EIGEN_USE_LIBXSMM)
+// #include "libxsmm.h"
+// #endif
 
-// Because some programs may link Eigen in through other frameworks with
-// different flags, we can run into multiple definition issues if we don't have
-// a private namespace for our versions. This is a nasty hack, but a similar
-// approach is used elsewhere to handle the problem, so it should be stable.
-#define Eigen EigenForTFLite
+#ifdef EIGEN_USE_THREADS
+#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool"
+#endif
 
-#include "Eigen/src/Core/util/StaticAssert.h"
-#include "unsupported/Eigen/CXX11/Core"
-#include "unsupported/Eigen/SpecialFunctions"
 
 #include "Eigen/src/Core/util/DisableStupidWarnings.h"
 
-#include "Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/SpecialFunctions"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/util/CXX11Meta.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h"
+
 
-// Beware: the order of the include matters to some compilers. For example
-// TensorIndexList.h should be included before TensorDimensions.h in order to
-// use index lists to encode tensor dimensions when compiling with llvm.
-// We're defining this ourselves rather than using the Eigen Tensor header file
-// so that we can alter the macro definition of TENSOR_CONTRACTION_DISPATCH to
-// reduce binary size.
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/ThreadPoolInterface.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorNonBlockingThreadPool.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStats.h"
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
-
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
+
 #undef TENSOR_CONTRACTION_DISPATCH
 #define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS)    \
   if (this->m_lhs_inner_dim_contiguous &&                       \
@@ -102,8 +127,9 @@ limitations under the License.
     eigen_assert(false && "Unsupported contraction formats");   \
   }
 
+
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
@@ -125,19 +151,18 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h"
-
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
 
 #include "Eigen/src/Core/util/ReenableStupidWarnings.h"
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
similarity index 91%
rename from tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
rename to tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
index d34708b8fd0c0732c13ddbd8d70c87a278c40ff8..f5576fbff7005d359b3766a3708f45f487744ff4 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
+++ b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
@@ -19,8 +19,8 @@ limitations under the License.
 // clang-format off
 
 
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
 
 
 #include "Eigen/Core"
@@ -94,7 +94,7 @@ typedef unsigned __int64 uint64_t;
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
@@ -106,10 +106,11 @@ typedef unsigned __int64 uint64_t;
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h"
@@ -128,7 +129,7 @@ typedef unsigned __int64 uint64_t;
 
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
-#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
@@ -164,4 +165,4 @@ typedef unsigned __int64 uint64_t;
 #include "Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
similarity index 99%
rename from tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
rename to tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
index 4218be20a4a08f76c112675ac059fb7374539194..5485d907c29399ca8f4663f090e6b26b8a9be7ed 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h
@@ -12,18 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_
 
 #include <stdint.h>
 #include <sys/types.h>
 
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace optimized_ops {
@@ -1869,4 +1869,4 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
 
 }  // namespace optimized_ops
 }  // namespace tflite
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_LEGACY_OPTIMIZED_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h b/tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h
similarity index 92%
rename from tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
rename to tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h
index 4139cf4eba98d1191854453bc200642a035e4669..12dfd1abb619729491661f87d4d82c7ea73b5932 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
+++ b/tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_
 
 #include <assert.h>
 #include <stdint.h>
@@ -26,11 +26,11 @@ limitations under the License.
 #include <tuple>
 #include <type_traits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/optimized/eigen_spatial_convolutions.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace multithreaded_ops {
@@ -174,4 +174,4 @@ inline void Conv(const Eigen::ThreadPoolDevice& device,
 }  // namespace multithreaded_ops
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_MULTITHREADED_CONV_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
similarity index 98%
rename from tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
rename to tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
index 36c15dbc5789308b2eb2ee579e26b1fe14535b3f..cf40ebb241d013a4853854f57fd55ebbce8a1752 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -15,12 +15,12 @@ limitations under the License.
 #include <stdlib.h>
 #include <string.h>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
-#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
+#include "tensorflow/lite/kernels/internal/round.h"
 
 #ifdef USE_NEON
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
similarity index 93%
rename from tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
rename to tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
index 630a6bbf2970866a4f3f7e49fb58cf3878f4095a..903f4c80139cd326b354ef6292a393c75af11608 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
+#include "tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h"
 
 namespace tflite {
 namespace tensor_utils {
@@ -153,4 +153,4 @@ void MeanStddevNormalization(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
similarity index 94%
rename from tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
rename to tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 77f84e0c1c2b76b5d84324d945d4774876a51031..c79b69a22e4dcdac5c32d03c0edd9f3cfb09a0ae 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
 
 #include <assert.h>
 #include <stdint.h>
@@ -25,17 +25,21 @@ limitations under the License.
 #include <tuple>
 #include <type_traits>
 
+#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
+#include <Accelerate/Accelerate.h>
+#endif
+
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/round.h"
-#include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace optimized_ops {
@@ -60,16 +64,19 @@ using reference_ops::DepthConcatenation;
 using reference_ops::Dequantize;
 using reference_ops::Div;
 using reference_ops::FakeQuant;
+using reference_ops::Fill;
 using reference_ops::Gather;
 using reference_ops::Greater;
 using reference_ops::GreaterEqual;
 using reference_ops::GreaterEqualWithScaling;
 using reference_ops::GreaterWithScaling;
+using reference_ops::LeakyRelu;
 using reference_ops::Less;
 using reference_ops::LessEqual;
 using reference_ops::LessEqualWithScaling;
 using reference_ops::LessWithScaling;
 using reference_ops::Mean;
+using reference_ops::ProcessBroadcastShapes;
 using reference_ops::RankOneSelect;
 using reference_ops::Relu1;
 using reference_ops::Relu6;
@@ -1866,18 +1873,45 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
     gemm_input_shape = &input_shape;
   }
 
-  const auto im2col_matrix_map =
-      MapAsMatrixWithLastDimAsRows(gemm_input_data, *gemm_input_shape);
-  const auto filter_matrix_map =
-      MapAsMatrixWithFirstDimAsCols(filter_data, filter_shape);
-  auto output_matrix_map =
-      MapAsMatrixWithLastDimAsRows(output_data, output_shape);
-
-  Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
-
-  AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
-                                   bias_shape, bias_data, output_shape,
-                                   output_data);
+  // The following code computes matrix multiplication c = a * transponse(b)
+  // with CBLAS, where:
+  // * `a` is a matrix with dimensions (m, k).
+  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+  // * `c` is a matrix with dimensions (m, n).
+  // The naming of variables are aligned with CBLAS specification here.
+  const float* a = gemm_input_data;
+  const float* b = filter_data;
+  float* c = output_data;
+  const int gemm_input_dims = gemm_input_shape->DimensionsCount();
+  int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
+  int n = output_shape.Dims(3);
+  int k = gemm_input_shape->Dims(gemm_input_dims - 1);
+
+#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
+  // The stride of matrix a, b and c respectively.
+  int stride_a = k;
+  int stride_b = k;
+  int stride_c = n;
+
+  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
+              stride_a, b, stride_b, 0.0f, c, stride_c);
+#else
+  // When an optimized CBLAS implementation is not available, fall back
+  // to using Eigen.
+  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>
+      Matrix;
+  typedef Eigen::Map<Matrix> MatrixRef;
+  typedef Eigen::Map<const Matrix> ConstMatrixRef;
+
+  MatrixRef matrix_c(c, m, n);
+  ConstMatrixRef matrix_a(a, m, k);
+  ConstMatrixRef matrix_b(b, n, k);
+  matrix_c.noalias() = matrix_a * matrix_b.transpose();
+#endif  //  defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
+
+  optimized_ops::AddBiasAndEvalActivationFunction(
+      output_activation_min, output_activation_max, bias_shape, bias_data,
+      output_shape, output_data);
 }
 
 inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr,
@@ -3151,12 +3185,12 @@ inline void LstmCell(
   // Combined memory state and final output calculation
   gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput");
   output_state_map =
-      input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+      input_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
           new_input_sm.tanh() +
-      forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+      forget_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
           prev_state_map;
   output_activ_map =
-      output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+      output_gate_sm.unaryExpr(Eigen::internal::scalar_logistic_op<float>()) *
       output_state_map.tanh();
 }
 
@@ -3548,8 +3582,8 @@ inline void AveragePool(const PoolParams& params,
             std::min(params.filter_height, input_height - in_y_origin);
         const int filter_count =
             (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
-        // 1280 required by Inception v3
-        static constexpr int kAccBufferMaxSize = 2048;
+        // 2560 is required by MobileNetV2 with depth multiplier 2.
+        static constexpr int kAccBufferMaxSize = 4096;
         TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
         uint16 acc[kAccBufferMaxSize];
         memset(acc, 0, depth * sizeof(acc[0]));
@@ -3714,8 +3748,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
         const int filter_y_start = std::max(0, -in_y_origin);
         const int filter_y_end =
             std::min(params.filter_height, input_height - in_y_origin);
-        // 2048 required by Inception v3
-        static constexpr int kAccBufferMaxSize = 2048;
+        // 2560 is required by MobileNetV2 with depth multiplier 2.
+        static constexpr int kAccBufferMaxSize = 4096;
         TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
         uint8 acc[kAccBufferMaxSize];
         memset(acc, 0, depth * sizeof(acc[0]));
@@ -4291,7 +4325,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   using FixedPointScaledDiff =
       gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
   const int trailing_dim = input_shape.DimensionsCount() - 1;
   const int outer_size =
@@ -4367,7 +4400,7 @@ inline void Logistic(const RuntimeShape& input_shape, const float* input_data,
   auto input_map = MapAsVector(input_data, input_shape);
   auto output_map = MapAsVector(output_data, output_shape);
   output_map.array() =
-      input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>());
+      input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
 }
 
 // Convenience version that allows, for example, generated-code calls to be
@@ -5428,6 +5461,9 @@ void TypedMemset(void* ptr, T value, size_t num) {
   }
 }
 
+// This makes heavy use of Offset, along with conditional branches. There may be
+// opportunities for improvement.
+//
 // There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
 // scalar input that provides the padding value.  Therefore pad_value_ptr can be
 // equivalent to a simple input1_data.  For Pad, it should point to a zero
@@ -5440,7 +5476,7 @@ inline void PadImpl(const tflite::PadParams& op_params,
                     const RuntimeShape& input_shape, const T* input_data,
                     const P* pad_value_ptr, const RuntimeShape& output_shape,
                     T* output_data) {
-  gemmlowp::ScopedProfilingLabel label("Pad");
+  gemmlowp::ScopedProfilingLabel label("Pad4DSlowImpl");
   const RuntimeShape ext_input_shape =
       RuntimeShape::ExtendedShape(4, input_shape);
   const RuntimeShape ext_output_shape =
@@ -5448,8 +5484,8 @@ inline void PadImpl(const tflite::PadParams& op_params,
   TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
   TFLITE_DCHECK_LE(op_params.right_padding_count, 4);
 
-  // Runtime calls are currently fixed at 4 dimensions. Copy inputs so
-  // we can pad them to 4 dims (yes, we are "padding the padding").
+  // Pad kernels are limited to max 4 dimensions. Copy inputs so we can pad them
+  // to 4 dims (yes, we are "padding the padding").
   std::vector<int> left_padding_copy(4, 0);
   const int left_padding_extend = 4 - op_params.left_padding_count;
   for (int i = 0; i < op_params.left_padding_count; ++i) {
@@ -5571,6 +5607,162 @@ inline void Pad(const tflite::PadParams& op_params,
           output_data);
 }
 
+// TODO(b/117643175): Optimize. (This is an introductory copy of standard Pad.)
+//
+// This pad requires that (a) left and right paddings are in the 4D patterns
+// {0, h_pad, w_pad, 0}, and (b) memset can be used: *pad_value_ptr == 0 and/or
+// T is uint8.
+//
+// There are two versions of pad: Pad and PadV2.  In PadV2 there is a second
+// scalar input that provides the padding value.  Therefore pad_value_ptr can be
+// equivalent to a simple input1_data.  For Pad, it should point to a zero
+// value.
+//
+// Note that two typenames are required, so that T=P=int32 is considered a
+// specialization distinct from P=int32.
+template <typename T, typename P>
+inline void PadImageStyleMemset(const tflite::PadParams& op_params,
+                                const RuntimeShape& input_shape,
+                                const T* input_data, const P* pad_value_ptr,
+                                const RuntimeShape& output_shape,
+                                T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("PadImageStyle");
+  const RuntimeShape ext_input_shape =
+      RuntimeShape::ExtendedShape(4, input_shape);
+  const RuntimeShape ext_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+  TFLITE_DCHECK_LE(op_params.left_padding_count, 4);
+  TFLITE_DCHECK_LE(op_params.right_padding_count, 4);
+
+  // Pad kernels are limited to max 4 dimensions. Copy inputs so we can pad them
+  // to 4 dims (yes, we are "padding the padding").
+  std::vector<int> left_padding_copy(4, 0);
+  const int left_padding_extend = 4 - op_params.left_padding_count;
+  for (int i = 0; i < op_params.left_padding_count; ++i) {
+    left_padding_copy[left_padding_extend + i] = op_params.left_padding[i];
+  }
+  std::vector<int> right_padding_copy(4, 0);
+  const int right_padding_extend = 4 - op_params.right_padding_count;
+  for (int i = 0; i < op_params.right_padding_count; ++i) {
+    right_padding_copy[right_padding_extend + i] = op_params.right_padding[i];
+  }
+  // The following padding restrictions are contractual requirements, and
+  // embody what it means for a padding op to be "image-style".
+  TFLITE_DCHECK_EQ(left_padding_copy[0], 0);
+  TFLITE_DCHECK_EQ(left_padding_copy[3], 0);
+  TFLITE_DCHECK_EQ(right_padding_copy[0], 0);
+  TFLITE_DCHECK_EQ(right_padding_copy[3], 0);
+
+  const int batch = MatchingDim(ext_input_shape, 0, ext_output_shape, 0);
+  const int output_height = ext_output_shape.Dims(1);
+  const int output_width = ext_output_shape.Dims(2);
+  const int input_height = ext_input_shape.Dims(1);
+  const int input_width = ext_input_shape.Dims(2);
+  const int depth = MatchingDim(ext_input_shape, 3, ext_output_shape, 3);
+
+  const int left_h_padding = left_padding_copy[1];
+  const int left_w_padding = left_padding_copy[2];
+  const int right_h_padding = right_padding_copy[1];
+  const int right_w_padding = right_padding_copy[2];
+
+  TFLITE_DCHECK_EQ(output_height,
+                   input_height + left_h_padding + right_h_padding);
+  TFLITE_DCHECK_EQ(output_width,
+                   input_width + left_w_padding + right_w_padding);
+
+  const T pad_value = *pad_value_ptr;
+  const int top_block_size = left_h_padding * output_width * depth;
+  const size_t num_top_block_bytes = top_block_size * sizeof(T);
+  const int bottom_block_size = right_h_padding * output_width * depth;
+  const size_t num_bottom_block_bytes = bottom_block_size * sizeof(T);
+  const int left_blocks_size = left_w_padding * depth;
+  const size_t num_left_block_bytes = left_blocks_size * sizeof(T);
+  const int right_blocks_size = right_w_padding * depth;
+  const size_t num_right_block_bytes = right_blocks_size * sizeof(T);
+  const int inner_line_size = input_width * depth;
+  const size_t num_inner_line_bytes = inner_line_size * sizeof(T);
+
+  if (input_height == 0) {
+    memset(output_data, pad_value,
+           num_top_block_bytes + num_bottom_block_bytes);
+  } else {
+    for (int i = 0; i < batch; ++i) {
+      // For each image in the batch, apply the top padding, then iterate
+      // through rows, then apply the bottom padding.
+      //
+      // By unwinding one iteration, we can combine the first left-margin
+      // padding with the top padding, and the last right-margin padding with
+      // the bottom padding.
+      memset(output_data, pad_value,
+             num_top_block_bytes + num_left_block_bytes);
+      output_data += top_block_size + left_blocks_size;
+      memcpy(output_data, input_data, num_inner_line_bytes);
+      input_data += inner_line_size;
+      output_data += inner_line_size;
+      // One iteration unwound.
+      // Unwinding this loop affords the opportunity to reorder the loop work
+      // and hence combine memset() calls.
+      //
+      // Before unwinding:
+      // for (int j = 0; j < input_height; ++j) {
+      //   // Pad on left, copy central data, pad on right.
+      //   memset(output_data, pad_value, num_left_block_bytes);
+      //   output_data += left_blocks_size;
+      //   memcpy(output_data, input_data, num_inner_line_bytes);
+      //   input_data += inner_line_size;
+      //   output_data += inner_line_size;
+      //   memset(output_data, pad_value, num_right_block_bytes);
+      //   output_data += right_blocks_size;
+      // }
+      for (int j = 1; j < input_height; ++j) {
+        memset(output_data, pad_value,
+               num_right_block_bytes + num_left_block_bytes);
+        output_data += right_blocks_size + left_blocks_size;
+        memcpy(output_data, input_data, num_inner_line_bytes);
+        input_data += inner_line_size;
+        output_data += inner_line_size;
+      }
+      memset(output_data, pad_value,
+             num_right_block_bytes + num_bottom_block_bytes);
+      output_data += right_blocks_size + bottom_block_size;
+    }
+  }
+}
+
+template <typename T, typename P>
+inline void PadImageStyle(const tflite::PadParams& op_params,
+                          const RuntimeShape& input_shape, const T* input_data,
+                          const P* pad_value_ptr,
+                          const RuntimeShape& output_shape, T* output_data) {
+  TFLITE_ASSERT_FALSE;
+}
+
+template <typename P>
+inline void PadImageStyle(const tflite::PadParams& op_params,
+                          const RuntimeShape& input_shape,
+                          const uint8* input_data, const P* pad_value_ptr,
+                          const RuntimeShape& output_shape,
+                          uint8* output_data) {
+  PadImageStyleMemset(op_params, input_shape, input_data, pad_value_ptr,
+                      output_shape, output_data);
+}
+
+template <typename P>
+inline void PadImageStyle(const tflite::PadParams& op_params,
+                          const RuntimeShape& input_shape,
+                          const float* input_data, const P* pad_value_ptr,
+                          const RuntimeShape& output_shape,
+                          float* output_data) {
+  const float converted_pad_value = static_cast<float>(*pad_value_ptr);
+  if (converted_pad_value == 0.0f) {
+    PadImageStyleMemset(op_params, input_shape, input_data, pad_value_ptr,
+                        output_shape, output_data);
+  } else {
+    PadImpl(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+            output_data);
+  }
+}
+
 template <typename T>
 inline void Slice(const tflite::SliceParams& op_params,
                   const RuntimeShape& input_shape, const T* input_data,
@@ -5674,12 +5866,12 @@ void TransposeIm2col(const ConvParams& params, uint8 zero_byte,
   const int batches = MatchingDim(input_shape, 0, output_shape, 0);
   const int input_height = input_shape.Dims(1);
   const int input_width = input_shape.Dims(2);
-  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
   const int filter_height = filter_shape.Dims(1);
   const int filter_width = filter_shape.Dims(2);
   const int output_height = output_shape.Dims(1);
   const int output_width = output_shape.Dims(2);
-  MatchingDim(output_shape, 3, filter_shape, 3);  // output_depth
+  MatchingDim(output_shape, 3, filter_shape, 0);  // output_depth
 
   // Construct the MxN sized im2col matrix.
   // The rows M, are sub-ordered B x H x W
@@ -5753,6 +5945,76 @@ inline void TransposeConv(
   Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
 }
 
+// Integer-only version of ResizeNearestNeighbor. Since scales are represented
+// in fixed-point and thus approximated, |in_x| or |in_y| may differ from the
+// reference version. Debug checks are in place to test if this occurs.
+inline void ResizeNearestNeighbor(
+    const tflite::ResizeNearestNeighborParams& op_params,
+    const RuntimeShape& unextended_input_shape, const uint8* input_data,
+    const RuntimeShape& output_size_shape, const int32* output_size_data,
+    const RuntimeShape& unextended_output_shape, uint8* output_data) {
+  // Align corners = true is not supported.
+  TFLITE_DCHECK(!op_params.align_corners);
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
+  int32 input_height = input_shape.Dims(1);
+  int32 input_width = input_shape.Dims(2);
+  int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+  // The Tensorflow version of this op allows resize on the width and height
+  // axis only.
+  TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
+  int32 output_height = output_size_data[0];
+  int32 output_width = output_size_data[1];
+
+  // Convert scales to fixed-point with 16 fractional bits. We add 1 as an
+  // error factor and to avoid zero scales. For example, with input_height = 1,
+  // output_height = 3, the float scaling factor would be non-zero at 1/3.
+  // With fixed-point, this is zero.
+  int32 height_scale = (input_height << 16) / output_height + 1;
+  int32 width_scale = (input_width << 16) / output_width + 1;
+
+  const int col_offset = input_shape.Dims(3);
+  const int row_offset = input_shape.Dims(2) * col_offset;
+  const int batch_offset = input_shape.Dims(1) * row_offset;
+
+  const uint8* input_ptr = input_data;
+  uint8* output_ptr = output_data;
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      int32 in_y = std::min((y * height_scale) >> 16, input_height - 1);
+      // Check offset calculation is the same as the reference version. See
+      // function comment for details. We check using a non-float version of:
+      // TFLITE_DCHECK_EQ(in_y, std::floor(y * (static_cast<float>(input_height)
+      //                                            / output_height)));
+      TFLITE_DCHECK_LT(y * input_height, output_height + in_y * output_height);
+      TFLITE_DCHECK_GE(y * input_height, in_y * output_height);
+      const uint8* y_input_ptr = input_ptr + in_y * row_offset;
+      for (int x = 0; x < output_width; ++x) {
+        int32 in_x = std::min((x * width_scale) >> 16, input_width - 1);
+        // Check offset calculation is the same as the reference version. See
+        // function comment for details. We check using a non-float version of:
+        // TFLITE_DCHECK_EQ(in_y,
+        //                  std::floor(y * (static_cast<float>(input_width)
+        //                                      / output_width)));
+        TFLITE_DCHECK_LT(x * input_width, output_width + in_x * output_width);
+        TFLITE_DCHECK_GE(x * input_width, in_x * output_width);
+        const uint8* x_input_ptr = y_input_ptr + in_x * col_offset;
+        memcpy(output_ptr, x_input_ptr, depth);
+        output_ptr += depth;
+      }
+    }
+    input_ptr += batch_offset;
+  }
+}
+
 }  // namespace optimized_ops
 }  // namespace tflite
 
@@ -5761,4 +6023,4 @@ inline void TransposeConv(
 #pragma GCC diagnostic pop
 #endif
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_OPTIMIZED_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
similarity index 96%
rename from tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
rename to tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
index f87760a6c3e4a512d5e16828ce1c2dc46f8053d8..8f52ef131dedf4d0270c0346b1094add57f52dfc 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
 
 #if defined(_MSC_VER)
 #define __restrict__ __restrict
@@ -183,4 +183,4 @@ void PortableMeanStddevNormalization(const float* input_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
diff --git a/tensorflow/lite/kernels/internal/quantization_util.cc b/tensorflow/lite/kernels/internal/quantization_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0279d2a9229e02721c01d15d380db1919b7bfd23
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/quantization_util.cc
@@ -0,0 +1,369 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+
+namespace tflite {
+
+namespace {
+// These constants are used to manipulate the binary representation of doubles.
+// Double-precision binary64 floating point format is:
+// Bit |  63  |  62-52   |   51-0   |
+//     | Sign | Exponent | Fraction |
+// To avoid 64-bit integers as much as possible, I break this into high and
+// low 32-bit chunks. High is:
+// Bit |  31  |  30-20   |      19-0     |
+//     | Sign | Exponent | High Fraction |
+// Low is:
+// Bit |     31-0     |
+//     | Low Fraction |
+// We then access the components through logical bit-wise operations to
+// extract the parts needed, with the positions and masks derived from the
+// layout shown above.
+constexpr uint64_t kSignMask = 0x8000000000000000LL;
+constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
+constexpr int32_t kExponentShift = 52;
+constexpr int32_t kExponentBias = 1023;
+constexpr uint32_t kExponentIsBadNum = 0x7ff;
+constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
+constexpr uint32_t kFractionShift = 22;
+constexpr uint32_t kFractionRoundingMask = 0x003fffff;
+constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
+}  // namespace
+
+void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
+                        int* shift) {
+  if (double_multiplier == 0.) {
+    *quantized_multiplier = 0;
+    *shift = 0;
+    return;
+  }
+#ifdef TFLITE_EMULATE_FLOAT
+  // If we're trying to avoid the use of floating-point instructions (for
+  // example on microcontrollers) then use an alternative implementation
+  // that only requires integer and bitwise operations. To enable this, you
+  // need to set the define during the build process for your platform.
+  int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
+#else   // TFLITE_EMULATE_FLOAT
+  const double q = std::frexp(double_multiplier, shift);
+  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
+#endif  // TFLITE_EMULATE_FLOAT
+  TFLITE_CHECK(q_fixed <= (1ll << 31));
+  if (q_fixed == (1ll << 31)) {
+    q_fixed /= 2;
+    ++*shift;
+  }
+  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
+  *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void QuantizeMultiplierGreaterThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* left_shift) {
+  TFLITE_CHECK_GT(double_multiplier, 1.);
+  QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
+  TFLITE_CHECK_GE(*left_shift, 0);
+}
+
+void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
+                                         int32_t* quantized_multiplier,
+                                         int* left_shift) {
+  TFLITE_CHECK_LT(double_multiplier, 1.);
+  TFLITE_CHECK_GT(double_multiplier, 0.);
+  int shift;
+  QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
+  TFLITE_CHECK_LE(shift, 0);
+  *left_shift = shift;
+}
+
+int64_t IntegerFrExp(double input, int* shift) {
+  // Make sure our assumptions about the double layout hold.
+  TFLITE_CHECK_EQ(8, sizeof(double));
+
+  // We want to access the bits of the input double value directly, which is
+  // tricky to do safely, so use a union to handle the casting.
+  union {
+    double double_value;
+    uint64_t double_as_uint;
+  } cast_union;
+  cast_union.double_value = input;
+  const uint64_t u = cast_union.double_as_uint;
+
+  // If the bitfield is all zeros apart from the sign bit, this is a normalized
+  // zero value, so return standard values for this special case.
+  if ((u & ~kSignMask) == 0) {
+    *shift = 0;
+    return 0;
+  }
+
+  // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
+  // the exponent, and distinguished by whether the fractions are zero or
+  // non-zero.
+  const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
+  if (exponent_part == kExponentIsBadNum) {
+    *shift = std::numeric_limits<int>::max();
+    if (u & kFractionMask) {
+      // NaN, so just return zero (with the exponent set to INT_MAX).
+      return 0;
+    } else {
+      // Infinity, so return +/- INT_MAX.
+      if (u & kSignMask) {
+        return std::numeric_limits<int64_t>::min();
+      } else {
+        return std::numeric_limits<int64_t>::max();
+      }
+    }
+  }
+
+  // The shift is fairly easy to extract from the high bits of the double value,
+  // just by masking it out and applying a bias. The std::frexp() implementation
+  // always returns values between 0.5 and 1.0 though, whereas the exponent
+  // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
+  // interface.
+  *shift = (exponent_part - kExponentBias) + 1;
+
+  // There's an implicit high bit in the double format definition, so make sure
+  // we include that at the top, and then reconstruct the rest of the fractional
+  // value from the remaining fragments.
+  int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
+
+  // We're cutting off some bits at the bottom, so to exactly match the standard
+  // frexp implementation here we'll apply rounding by adding one to the least
+  // significant bit of the result if the discarded portion is over half of the
+  // maximum.
+  if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
+    fraction += 1;
+  }
+  // Negate the fraction if the sign bit was set.
+  if (u & kSignMask) {
+    fraction *= -1;
+  }
+
+  return fraction;
+}
+
+double DoubleFromFractionAndShift(int64_t fraction, int shift) {
+  union {
+    double double_value;
+    uint64_t double_as_uint;
+  } result;
+
+  // Detect NaNs and infinities.
+  if (shift == std::numeric_limits<int>::max()) {
+    if (fraction == 0) {
+      return NAN;
+    } else if (fraction > 0) {
+      return INFINITY;
+    } else {
+      return -INFINITY;
+    }
+  }
+
+  // Return a normalized zero for a zero fraction.
+  if (fraction == 0) {
+    result.double_as_uint = 0;
+    return result.double_value;
+  }
+
+  bool is_negative = (fraction < 0);
+  int64_t encoded_fraction = is_negative ? -fraction : fraction;
+  int64_t encoded_shift = (shift - 1);
+  while (encoded_fraction < 0x40000000) {
+    encoded_fraction *= 2;
+    encoded_shift -= 1;
+  }
+  while (encoded_fraction > 0x80000000) {
+    encoded_fraction /= 2;
+    encoded_shift += 1;
+  }
+  encoded_fraction -= 0x40000000;
+  if (encoded_shift < -1022) {
+    encoded_shift = -1023;
+  } else if (encoded_shift > 1022) {
+    encoded_shift = 1023;
+  }
+  encoded_shift += kExponentBias;
+  uint64_t encoded_sign = is_negative ? kSignMask : 0;
+  result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
+                          (encoded_fraction << kFractionShift);
+  return result.double_value;
+}
+
+double IntegerDoubleMultiply(double a, double b) {
+  int a_shift;
+  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
+  int b_shift;
+  const int64_t b_fraction = IntegerFrExp(b, &b_shift);
+  // Detect NaNs and infinities.
+  if (a_shift == std::numeric_limits<int>::max() ||
+      (b_shift == std::numeric_limits<int>::max())) {
+    return NAN;
+  }
+  const int result_shift = a_shift + b_shift + 1;
+  const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
+  return DoubleFromFractionAndShift(result_fraction, result_shift);
+}
+
+int IntegerDoubleCompare(double a, double b) {
+  int a_shift;
+  const int64_t a_fraction = IntegerFrExp(a, &a_shift);
+  int b_shift;
+  const int64_t b_fraction = IntegerFrExp(b, &b_shift);
+
+  // Detect NaNs and infinities.
+  if (a_shift == std::numeric_limits<int>::max() ||
+      (b_shift == std::numeric_limits<int>::max())) {
+    return 1;
+  }
+
+  if ((a_fraction == 0) && (b_fraction < 0)) {
+    return 1;
+  } else if ((a_fraction < 0) && (b_fraction == 0)) {
+    return -1;
+  } else if (a_shift < b_shift) {
+    return -1;
+  } else if (a_shift > b_shift) {
+    return 1;
+  } else if (a_fraction < b_fraction) {
+    return -1;
+  } else if (a_fraction > b_fraction) {
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+void PreprocessSoftmaxScaling(double beta, double input_scale,
+                              int input_integer_bits,
+                              int32_t* quantized_multiplier, int* left_shift) {
+  // If the overall multiplier (input and beta) is large, then exp() of an
+  // input difference of 1 scaled by this will be large.  In other words, we
+  // can cap the multiplier and know that, when it is used, the output will be
+  // (round to) zero wherever the input is not at the maximum value.
+
+  // If the overall scale is less than one, and input_integer_bits=0, then the
+  // result is double equivalent of Q0.31 (actually with more precision). Thus
+  // this generates a Q(input_integer_bits).(31-input_integer_bits)
+  // representation.
+#ifdef TFLITE_EMULATE_FLOAT
+  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
+  int shift;
+  int64_t fraction = IntegerFrExp(input_beta, &shift);
+  shift += (31 - input_integer_bits);
+  double input_beta_real_multiplier =
+      DoubleFromFractionAndShift(fraction, shift);
+  if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
+    input_beta_real_multiplier = (1ll << 31) - 1.0;
+  }
+#else   // TFLITE_EMULATE_FLOAT
+  const double input_beta_real_multiplier = std::min(
+      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
+#endif  // TFLITE_EMULATE_FLOAT
+
+  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
+                                   quantized_multiplier, left_shift);
+}
+
+void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
+                                    int input_integer_bits,
+                                    int32_t* quantized_multiplier,
+                                    int* left_shift,
+                                    int32_t* reverse_scaling_divisor,
+                                    int* reverse_scaling_left_shift) {
+  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
+                           quantized_multiplier, left_shift);
+
+  // Also calculate what amounts to the inverse scaling factor for the input.
+  const double real_reverse_scaling_divisor =
+      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
+  tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
+                                              reverse_scaling_divisor,
+                                              reverse_scaling_left_shift);
+}
+
+int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
+#ifdef TFLITE_EMULATE_FLOAT
+  int64_t result = (1 << input_integer_bits) - 1;
+  result <<= (31 - input_integer_bits);
+  result >>= input_left_shift;
+  return result;
+#else   // TFLITE_EMULATE_FLOAT
+  const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
+                                    (1ll << (31 - input_integer_bits)) /
+                                    (1ll << input_left_shift);
+  // Tighten bound using floor.  Suppose that we could use the exact value.
+  // After scaling the difference, the result would be at the maximum.  Thus we
+  // must ensure that our value has lower magnitude.
+  return static_cast<int>(std::floor(max_input_rescaled));
+#endif  // TFLITE_EMULATE_FLOAT
+}
+
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max,
+                            float* nudged_scale) {
+  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
+  const float quant_min_float = static_cast<float>(quant_min);
+  const float quant_max_float = static_cast<float>(quant_max);
+  *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
+  const float zero_point_from_min = quant_min_float - min / *nudged_scale;
+  uint16 nudged_zero_point;
+  if (zero_point_from_min < quant_min_float) {
+    nudged_zero_point = static_cast<uint16>(quant_min);
+  } else if (zero_point_from_min > quant_max_float) {
+    nudged_zero_point = static_cast<uint16>(quant_max);
+  } else {
+    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
+  }
+  *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
+  *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
+}
+
+void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
+                       const float nudged_max, const float* input_data,
+                       float* output_data, const float size) {
+  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
+  const float inv_nudged_scale = 1.0f / nudged_scale;
+
+  for (int i = 0; i < size; i++) {
+    const float src_val = input_data[i];
+    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
+    const float clamped_shifted = clamped - nudged_min;
+    const float dst_val =
+        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+        nudged_min;
+    output_data[i] = dst_val;
+  }
+}
+
+bool CheckedLog2(const float x, int* log2_result) {
+  // Using TfLiteRound instead of std::round and std::log instead of
+  // std::log2 to work around these fuctions being missing in a toolchain
+  // used in some TensorFlow tests as of May 2018.
+  const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
+  const float x_log2_rounded = TfLiteRound(x_log2);
+  const float x_log2_fracpart = x_log2 - x_log2_rounded;
+
+  *log2_result = static_cast<int>(x_log2_rounded);
+  return std::abs(x_log2_fracpart) < 1e-3;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/quantization_util.h b/tensorflow/lite/kernels/internal/quantization_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf313f39cd8b407f6fb57dcbdf0540e98d96b7e8
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/quantization_util.h
@@ -0,0 +1,280 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+// Given the min and max values of a float array, return
+// reasonable quantization parameters to use for this array.
+template <typename T>
+QuantizationParams ChooseQuantizationParams(double rmin, double rmax,
+                                            bool narrow_range) {
+  const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0);
+  const T qmax = std::numeric_limits<T>::max();
+  const double qmin_double = qmin;
+  const double qmax_double = qmax;
+  // 0 should always be a representable value. Let's assume that the initial
+  // min,max range contains 0.
+  TFLITE_CHECK_LE(rmin, 0.);
+  TFLITE_CHECK_GE(rmax, 0.);
+  if (rmin == rmax) {
+    // Special case where the min,max range is a point. Should be {0}.
+    TFLITE_CHECK_EQ(rmin, 0.);
+    TFLITE_CHECK_EQ(rmax, 0.);
+    QuantizationParams quantization_params;
+    quantization_params.zero_point = 0;
+    quantization_params.scale = 0.;
+    return quantization_params;
+  }
+
+  // General case.
+  //
+  // First determine the scale.
+  const double scale = (rmax - rmin) / (qmax_double - qmin_double);
+
+  // Zero-point computation.
+  // First the initial floating-point computation. The zero-point can be
+  // determined from solving an affine equation for any known pair
+  // (real value, corresponding quantized value).
+  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+  // The arithmetic error on the zero point computed from either pair
+  // will be roughly machine_epsilon * (sum of absolute values of terms)
+  // so we want to use the variant that adds the smaller terms.
+  const double zero_point_from_min = qmin_double - rmin / scale;
+  const double zero_point_from_max = qmax_double - rmax / scale;
+  const double zero_point_from_min_error =
+      std::abs(qmin_double) + std::abs(rmin / scale);
+  const double zero_point_from_max_error =
+      std::abs(qmax_double) + std::abs(rmax / scale);
+
+  const double zero_point_double =
+      zero_point_from_min_error < zero_point_from_max_error
+          ? zero_point_from_min
+          : zero_point_from_max;
+
+  // Now we need to nudge the zero point to be an integer
+  // (our zero points are integer, and this is motivated by the requirement
+  // to be able to represent the real value "0" exactly as a quantized value,
+  // which is required in multiple places, for example in Im2col with SAME
+  // padding).
+  T nudged_zero_point = 0;
+  if (zero_point_double < qmin_double) {
+    nudged_zero_point = qmin;
+  } else if (zero_point_double > qmax_double) {
+    nudged_zero_point = qmax;
+  } else {
+    nudged_zero_point = static_cast<T>(round(zero_point_double));
+  }
+  // The zero point should always be in the range of quantized value,
+  // [qmin, qmax].
+  TFLITE_CHECK_GE(nudged_zero_point, qmin);
+  TFLITE_CHECK_LE(nudged_zero_point, qmax);
+
+  // Finally, store the result nudged quantization params.
+  QuantizationParams quantization_params;
+  quantization_params.zero_point = nudged_zero_point;
+  quantization_params.scale = scale;
+  return quantization_params;
+}
+
+template <typename T>
+QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
+  return ChooseQuantizationParams<T>(rmin, rmax, false);
+}
+
+// Converts a floating-point number to an integer. For all inputs x where
+// static_cast<IntOut>(x) is legal according to the C++ standard, the result
+// is identical to that cast (i.e. the result is x with its fractional part
+// truncated whenever that is representable as IntOut).
+//
+// static_cast would cause undefined behavior for the following cases, which
+// have well-defined behavior for this function:
+//
+//  1. If x is NaN, the result is zero.
+//
+//  2. If the truncated form of x is above the representable range of IntOut,
+//     the result is std::numeric_limits<IntOut>::max().
+//
+//  3. If the truncated form of x is below the representable range of IntOut,
+//     the result is std::numeric_limits<IntOut>::min().
+//
+// Note that cases #2 and #3 cover infinities as well as finite numbers.
+//
+// The range of FloatIn must include the range of IntOut, otherwise
+// the results are undefined.
+// TODO(sfeuz): Replace by absl::SafeCast once available.
+template <class IntOut, class FloatIn>
+IntOut SafeCast(FloatIn x) {
+  static_assert(!std::numeric_limits<FloatIn>::is_integer,
+                "FloatIn is integer");
+  static_assert(std::numeric_limits<IntOut>::is_integer,
+                "IntOut is not integer");
+  static_assert(std::numeric_limits<IntOut>::radix == 2, "IntOut is base 2");
+
+  // Special case NaN, for which the logic below doesn't work.
+  if (std::isnan(x)) {
+    return 0;
+  }
+
+  // Negative values all clip to zero for unsigned results.
+  if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
+    return 0;
+  }
+
+  // Handle infinities.
+  if (std::isinf(x)) {
+    return x < 0 ? std::numeric_limits<IntOut>::min()
+                 : std::numeric_limits<IntOut>::max();
+  }
+
+  // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
+  // unless x is zero in which case exp == 0. Note that this implies that the
+  // magnitude of x is strictly less than 2^exp.
+  int exp = 0;
+  std::frexp(x, &exp);
+
+  // Let N be the number of non-sign bits in the representation of IntOut. If
+  // the magnitude of x is strictly less than 2^N, the truncated version of x
+  // is representable as IntOut. The only representable integer for which this
+  // is not the case is kMin for signed types (i.e. -2^N), but that is covered
+  // by the fall-through below.
+  if (exp <= std::numeric_limits<IntOut>::digits) {
+    return x;
+  }
+
+  // Handle numbers with magnitude >= 2^N.
+  return x < 0 ? std::numeric_limits<IntOut>::min()
+               : std::numeric_limits<IntOut>::max();
+}
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of NEGATIVE its exponent ---
+// this is intended as a RIGHT-shift.
+//
+// Restricted to the case where the multiplier < 1 (and non-negative).
+void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
+                                         int32_t* quantized_multiplier,
+                                         int* left_shift);
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Restricted to the case where the multiplier > 1.
+void QuantizeMultiplierGreaterThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* left_shift);
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Handles an arbitrary positive multiplier. The 'shift' output-value is
+// basically the 'floating-point exponent' of the multiplier:
+// Negative for a right-shift (when the multiplier is <1), positive for a
+// left-shift (when the multiplier is >1)
+void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
+                        int* shift);
+
+// Splits a double input value into a returned fraction, and a shift value from
+// the exponent, using only bitwise and integer operations to support
+// microcontrollers and other environments without floating-point support.
+//
+// This is designed to be a replacement for how std::frexp() is used within the
+// QuantizeMultiplier() function, and so has a different signature than the
+// standard version, returning a 64-bit integer rather than a double. This
+// result has a maximum value of 1<<31, with the fraction expressed as a
+// proportion of that maximum.
+//
+// std::frexp() returns NaNs and infinities unmodified, but since we're
+// returning integers that can't represent those values, instead we return
+// a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
+// result of 0 for NaNs, std:numeric_limits<int64_t>::max() for +INFINITY, and
+// std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
+// result in return values that end up truncating some bits at the end,
+// reflecting the loss of precision inherent in denormalization.
+int64_t IntegerFrExp(double input, int* shift);
+
+// Converts an integer fraction in the format produced by IntegerFrExp (where
+// 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
+// IEEE binary64 double format result. The implementation uses only integer and
+// bitwise operators, so no floating point hardware support or emulation is
+// needed. This is here so quantized operations can run non-time-critical
+// preparation calculations on microcontrollers and other platforms without
+// float support.
+double DoubleFromFractionAndShift(int64_t fraction, int shift);
+
+// Performs a multiplication of two numbers in double format, using only integer
+// and bitwise instructions. This is aimed at supporting housekeeping functions
+// for quantized operations on microcontrollers without floating-point hardware.
+double IntegerDoubleMultiply(double a, double b);
+
+// Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
+// greater than b. It is implemented using only integer and logical instructions
+// so that it can be easily run on microcontrollers for quantized operations.
+int IntegerDoubleCompare(double a, double b);
+
+// This first creates a multiplier in a double equivalent of
+// Q(input_integer_bits).(31-input_integer_bits) representation, with extra
+// precision in the double's fractional bits.  It then splits the result into
+// significand and exponent.
+void PreprocessSoftmaxScaling(double beta, double input_scale,
+                              int input_integer_bits,
+                              int32_t* quantized_multiplier, int* left_shift);
+// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
+void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
+                                    int input_integer_bits,
+                                    int32_t* quantized_multiplier,
+                                    int* left_shift,
+                                    int32_t* reverse_scaling_divisor,
+                                    int* reverse_scaling_left_shift);
+// Calculate the largest input that will result in a within-bounds intermediate
+// result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
+// it must not overflow before we reduce the value by multiplication by the
+// input multiplier.  The negative radius is used as the minimum difference in
+// Softmax.
+int CalculateInputRadius(int input_integer_bits, int input_left_shift);
+
+// Nudges a min/max quantization range to ensure zero is zero.
+// Gymnastics with nudged zero point is to ensure that real zero maps to
+// an integer, which is required for e.g. zero-padding in convolutional layers.
+// Outputs nudged_min, nudged_max, nudged_scale.
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max,
+                            float* nudged_scale);
+
+// Fake quantizes (quantizes and dequantizes) input_data using the scale,
+// nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code
+// in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor.
+void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
+                       const float nudged_max, const float* input_data,
+                       float* output_data, const float size);
+
+// If x is approximately a power of two (with any positive or negative
+// exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise
+// returns false.
+bool CheckedLog2(const float x, int* log2_result);
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/lite/kernels/internal/quantization_util_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
rename to tensorflow/lite/kernels/internal/quantization_util_test.cc
index 25ea72b886a06ea31d35d468441e82814813aee3..2f8f7713795bf0e736fe85fcb582744974654b9e 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/lite/kernels/internal/quantization_util_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cecb16b48c9199655b393acdd347ea2e54817da
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h
@@ -0,0 +1,100 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const float* input_data, const RuntimeShape& filter_shape,
+    const float* filter_data, const RuntimeShape& bias_shape,
+    const float* bias_data, const RuntimeShape& output_shape,
+    float* output_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int ic = 0; ic < input_depth; ++ic) {
+          for (int m = 0; m < depth_multiplier; m++) {
+            const int oc = m + ic * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            float total = 0.f;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  float input_value =
+                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                  float filter_value = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, oc)];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+            float bias_value = 0.0f;
+            if (bias_data) {
+              bias_value = bias_data[oc];
+            }
+            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                ActivationFunctionWithMinMax(total + bias_value,
+                                             output_activation_min,
+                                             output_activation_max);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // end namespace reference_ops
+}  // end namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
new file mode 100644
index 0000000000000000000000000000000000000000..002444b6810925910a651dd5c919a46ac8e5fb47
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+
+#include <algorithm>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int ic = 0; ic < input_depth; ++ic) {
+          for (int m = 0; m < depth_multiplier; m++) {
+            const int oc = m + ic * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32 acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  int32 input_val =
+                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                  int32 filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, oc)];
+                  acc +=
+                      (filter_val + filter_offset) * (input_val + input_offset);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[oc];
+            }
+            acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
+                                                output_shift);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                static_cast<uint8>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // end namespace reference_ops
+}  // end namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h b/tensorflow/lite/kernels/internal/reference/fully_connected.h
similarity index 96%
rename from tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h
rename to tensorflow/lite/kernels/internal/reference/fully_connected.h
index 3c7fd2925671311ce3e4bf15ce18c749f630db3c..8495452220b8e7692a5e0e3e2c841c33d8425597 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h
+++ b/tensorflow/lite/kernels/internal/reference/fully_connected.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
 
 #include "fixedpoint/fixedpoint.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/round.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace reference_ops {
@@ -323,4 +323,4 @@ inline void ShuffledFullyConnected(
 }  // namespace reference_ops
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/README.md b/tensorflow/lite/kernels/internal/reference/integer_ops/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b1d3c91d50a4c77865ec25fa9961f745a489aea
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/README.md
@@ -0,0 +1,8 @@
+This directory contains reference implementations for int8 fully integer kernels.
+
+Weight filters of convs are expected to be symmetric per-channel quantized in
+the range [-127, 127].
+Inputs/activations are expected to be asymmetric per-layer quantized in the
+range [-128, 127].
+
+THESE ARE EXPERIMENTAL AND PRONE TO CHANGE.
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h b/tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h
new file mode 100644
index 0000000000000000000000000000000000000000..03dcb6c220d3fcbbd219df3a1a1ea5f3b2b29c81
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEQUANTIZE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEQUANTIZE_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Dequantize(const tflite::DequantizationParams& op_params,
+                       const RuntimeShape& input_shape, const int8* input_data,
+                       const RuntimeShape& output_shape, float* output_data) {
+  const int32 zero_point = op_params.zero_point;
+  const double scale = op_params.scale;
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    const int32 val = input_data[i];
+    const float result = static_cast<float>(scale * (val - zero_point));
+    output_data[i] = result;
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEQUANTIZE_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
similarity index 99%
rename from tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
rename to tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
index be99240b1f193148791c36c808ddce7a883fb72e..380fc8f98ebbdd90bb68144a46903640734bff08 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_LEGACY_REFERENCE_OPS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_LEGACY_REFERENCE_OPS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEGACY_REFERENCE_OPS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEGACY_REFERENCE_OPS_H_
 
 #include <stdint.h>
 #include <sys/types.h>
 
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/legacy_types.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 
 namespace tflite {
 
@@ -30,6 +30,11 @@ namespace reference_ops {
 
 static constexpr int kDepthwiseReverseShift = -1;
 
+inline void ShapeFromDims(const tflite::Dims<4>& dims, RuntimeShape* shape) {
+  shape->BuildFrom(
+      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+}
+
 inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
                           const float* filter_data, const Dims<4>& filter_dims,
                           const float* bias_data, const Dims<4>& bias_dims,
@@ -797,7 +802,7 @@ inline void Gather(const T* input_data, const Dims<4>& input_dims,
                    const Dims<4>& coords_dims, T* output_data,
                    const Dims<4>& output_dims) {
   tflite::GatherParams op_params;
-  op_params.input_rank = input_rank;
+  op_params.axis = 4 - input_rank;
 
   Gather(op_params, DimsToShape(input_dims), input_data,
          DimsToShape(coords_dims), coords_data, DimsToShape(output_dims),
@@ -2117,4 +2122,4 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
 
 }  // namespace reference_ops
 }  // namespace tflite
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_LEGACY_REFERENCE_OPS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_LEGACY_REFERENCE_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
rename to tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
index 70d25c4bd9357ac842027429e91663e2603ecb69..d692063a968dab654eaf46b9956ddcd338b64410 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -16,10 +16,10 @@ limitations under the License.
 #include <string.h>
 #include <algorithm>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/internal/round.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 #if defined(_MSC_VER)
 #define __restrict__ __restrict
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
similarity index 97%
rename from tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
rename to tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
index 714b1164ee2d84ae5a580c2b4137bd07124f8946..a06ebc1600d4fe47cf054b4e157bc21a5f70ddfc 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
 
 // TODO(ghodrat): Remove this header file and the dependency to internal data
 // structure.
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
 
 #if defined(_MSC_VER)
 #define __restrict__ __restrict
@@ -265,4 +265,4 @@ void MeanStddevNormalization(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
similarity index 93%
rename from tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
rename to tensorflow/lite/kernels/internal/reference/reference_ops.h
index 59f17ae85495173f328421203fa99bd789991f9a..ea3ab06da1f775b5ea0771bbb3f32c91c9caacd0 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
 
 #include <stdint.h>
 #include <sys/types.h>
@@ -26,13 +26,13 @@ limitations under the License.
 
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/fully_connected.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/softmax.h"
-#include "tensorflow/contrib/lite/kernels/internal/round.h"
-#include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 
@@ -100,9 +100,96 @@ gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
 
 namespace reference_ops {
 
-inline void ShapeFromDims(const tflite::Dims<4>& dims, RuntimeShape* shape) {
-  shape->BuildFrom(
-      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+// Return true for broadcast case, false otherwise.
+inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
+                                   const RuntimeShape& shape1,
+                                   tflite::ArithmeticParams* params) {
+  const int dims_count =
+      std::max(shape0.DimensionsCount(), shape1.DimensionsCount());
+
+  params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+  RuntimeShape scalar_shape(dims_count, 1);
+
+  auto extended_shape0 = RuntimeShape::ExtendedShape(dims_count, shape0);
+  auto extended_shape1 = RuntimeShape::ExtendedShape(dims_count, shape1);
+
+  // Check for "exact" match, implicitly accepting any scalar shapes.
+  if (extended_shape0 == extended_shape1) {
+    params->broadcast_category = BroadcastableOpCategory::kNonBroadcast;
+    return false;
+  }
+
+  for (int i = dims_count - 1; i >= 0; --i) {
+    if (extended_shape0.Dims(i) == extended_shape1.Dims(i)) {
+      continue;
+    } else if (extended_shape0.Dims(i) == 1) {
+      params->broadcast_category =
+          BroadcastableOpCategory::kFirstInputBroadcastsFast;
+      break;
+    } else if (extended_shape1.Dims(i) == 1) {
+      params->broadcast_category =
+          BroadcastableOpCategory::kSecondInputBroadcastsFast;
+      break;
+    } else {
+      params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+      break;
+    }
+  }
+
+  if (params->broadcast_category !=
+          BroadcastableOpCategory::kFirstInputBroadcastsFast &&
+      params->broadcast_category !=
+          BroadcastableOpCategory::kSecondInputBroadcastsFast) {
+    return false;
+  }
+
+  // From this point it is assumed contractually that corresponding dimensions
+  // in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
+  const bool swap_inputs = params->broadcast_category ==
+                           BroadcastableOpCategory::kSecondInputBroadcastsFast;
+  const RuntimeShape* shape_a =
+      swap_inputs ? &extended_shape1 : &extended_shape0;
+  const RuntimeShape* shape_b =
+      swap_inputs ? &extended_shape0 : &extended_shape1;
+
+  int i = dims_count - 1;
+  params->broadcast_shape[0] = 1;
+  params->broadcast_shape[1] = 1;
+  params->broadcast_shape[2] = 1;
+  params->broadcast_shape[3] = 1;
+  params->broadcast_shape[4] = 1;
+  // y_0 is greedy: include dims if both or neither equal 1: in other words,
+  // test for equality rather than (shape_a->Dims(i) != 1).
+  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
+    params->broadcast_shape[4] *= shape_b->Dims(i);
+    --i;
+  }
+  // Here either input_a or input_b has dim of 1 (if i >= 0).  If it is input_b
+  // that has the unit dimension, the next two loops are not entered.
+  while (i >= 0 && shape_a->Dims(i) == 1) {
+    params->broadcast_shape[3] *= shape_b->Dims(i);
+    --i;
+  }
+  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
+    params->broadcast_shape[2] *= shape_a->Dims(i);
+    --i;
+  }
+  // Here either input_a or input_b has dim of 1 (if i >= 0).
+  while (i >= 0 && shape_b->Dims(i) == 1) {
+    params->broadcast_shape[1] *= shape_a->Dims(i);
+    --i;
+  }
+  while (i >= 0 && shape_a->Dims(i) == shape_b->Dims(i)) {
+    params->broadcast_shape[0] *= shape_b->Dims(i);
+    --i;
+  }
+
+  // Rarer case is when the broadcast dimensions cannot be handled by a fivefold
+  // loop.
+  if (i >= 0) {
+    params->broadcast_category = BroadcastableOpCategory::kGenericBroadcast;
+  }
+  return true;
 }
 
 template <typename T>
@@ -471,6 +558,19 @@ inline void ReluX(const tflite::ActivationParams& params,
   }
 }
 
+inline void LeakyRelu(const tflite::LeakyReluParams& params,
+                      const RuntimeShape& input_shape, const float* input_data,
+                      const RuntimeShape& output_shape, float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("LeakyRelu (not fused)");
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    // Note that this implementation matches that of TensorFlow, and corresponds
+    // to the traditional LeakyRelu equation only for alpha <= 1.
+    output_data[i] = std::max(val, val * params.alpha);
+  }
+}
+
 inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                             const RuntimeShape& input_shape,
                             const float* input_data,
@@ -2636,7 +2736,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   using FixedPointScaledDiff =
       gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
   const int trailing_dim = input_shape.DimensionsCount() - 1;
   const int outer_size =
@@ -2941,41 +3040,39 @@ inline void Floor(const RuntimeShape& input_shape, const float* input_data,
   }
 }
 
-template <typename T>
+template <typename T, typename CoordsT = int32>
 inline void Gather(const tflite::GatherParams& op_params,
-                   const RuntimeShape& unextended_input_shape,
-                   const T* input_data, const RuntimeShape& coords_shape,
-                   const int32* coords_data,
-                   const RuntimeShape& unextended_output_shape,
-                   T* output_data) {
-  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
-  const RuntimeShape input_shape =
-      RuntimeShape::ExtendedShape(4, unextended_input_shape);
-  const RuntimeShape output_shape =
-      RuntimeShape::ExtendedShape(4, unextended_output_shape);
-
-  const int input_rank = op_params.input_rank;
-  const int gather_dimensions = output_shape.DimensionsCount();
-  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), gather_dimensions);
-  const int axis = gather_dimensions - input_rank;
-  TFLITE_DCHECK_LT(axis, gather_dimensions);
+                   const RuntimeShape& input_shape, const T* input_data,
+                   const RuntimeShape& coords_shape, const CoordsT* coords_data,
+                   const RuntimeShape& output_shape, T* output_data) {
+  int axis = op_params.axis;
+  if (axis < 0) {
+    axis += input_shape.DimensionsCount();
+  }
   TFLITE_DCHECK_GE(axis, 0);
+  TFLITE_DCHECK_LT(axis, input_shape.DimensionsCount());
+  const int axis_size = input_shape.Dims(axis);
   const int coords_count = coords_shape.FlatSize();
-  TFLITE_DCHECK_EQ(coords_count, output_shape.Dims(axis));
 
-  int64_t stride = 1;
-  for (int i = axis + 1; i < gather_dimensions; ++i) {
-    stride *= input_shape.Dims(i);
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i) {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int inner_size = 1;
+  for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i) {
+    inner_size *= input_shape.Dims(i);
   }
-  T* out = output_data;
 
-  for (int i = 0; i < coords_count; ++i) {
-    TFLITE_DCHECK_GE(coords_data[i], 0);
-    TFLITE_DCHECK_LT(coords_data[i], input_shape.Dims(axis));
-    const T* in = input_data + coords_data[i] * stride;
-    memcpy(out, in, sizeof(T) * stride);
-    out += stride;
+  for (int outer = 0; outer < outer_size; ++outer) {
+    for (int i = 0; i < coords_count; ++i) {
+      TFLITE_DCHECK_GE(coords_data[i], 0);
+      TFLITE_DCHECK_LT(coords_data[i], axis_size);
+      std::memcpy(
+          output_data + (outer * coords_count + i) * inner_size,
+          input_data + (outer * axis_size + coords_data[i]) * inner_size,
+          sizeof(T) * inner_size);
+    }
   }
 }
 
@@ -3255,6 +3352,36 @@ inline void Pad(const tflite::PadParams& op_params,
           output_data);
 }
 
+// One could make all PadImageStyle calls simply delegate the work to the
+// ordinary Pad.  However, it is better that the reference code asserts false in
+// similar cases.
+template <typename T, typename P>
+inline void PadImageStyle(const tflite::PadParams& op_params,
+                          const RuntimeShape& input_shape, const T* input_data,
+                          const P* pad_value_ptr,
+                          const RuntimeShape& output_shape, T* output_data) {
+  TFLITE_ASSERT_FALSE;
+}
+
+template <typename P>
+inline void PadImageStyle(const tflite::PadParams& op_params,
+                          const RuntimeShape& input_shape,
+                          const uint8* input_data, const P* pad_value_ptr,
+                          const RuntimeShape& output_shape,
+                          uint8* output_data) {
+  Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+      output_data);
+}
+
+template <typename P>
+inline void PadImageStyle(const tflite::PadParams& op_params,
+                          const RuntimeShape& input_shape,
+                          const float* input_data, const P* pad_value_ptr,
+                          const RuntimeShape& output_shape,
+                          float* output_data) {
+  Pad(op_params, input_shape, input_data, pad_value_ptr, output_shape,
+      output_data);
+}
 template <typename T>
 inline void StridedSlice(const tflite::StridedSliceParams& op_params,
                          const RuntimeShape& unextended_input_shape,
@@ -3536,8 +3663,10 @@ inline void Mean(const tflite::MeanParams& op_params,
                  const RuntimeShape& unextended_output_shape, T* output_data) {
   gemmlowp::ScopedProfilingLabel label("Mean");
 
-  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  // Current implementation only supports dimension equals 4 and simultaneous
+  // reduction over width and height.
+  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
   const RuntimeShape input_shape =
       RuntimeShape::ExtendedShape(4, unextended_input_shape);
   const RuntimeShape output_shape =
@@ -3551,8 +3680,6 @@ inline void Mean(const tflite::MeanParams& op_params,
   const int input_height = input_shape.Dims(1);
   const int input_width = input_shape.Dims(2);
 
-  // The current implementation only supports simultaneous reduction over
-  // width and height.
   TFLITE_DCHECK_EQ(op_params.axis_count, 2);
   TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
                 (op_params.axis[0] == 2 && op_params.axis[1] == 1));
@@ -3732,32 +3859,48 @@ template <typename T1, typename T2, typename T3, typename Cmp>
 void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
                const T3* input2_data, const RuntimeShape& output_shape,
                T2* output_data, const Cmp& cmp) {
-  // The current ArgMax implemention can only determine the index of the maximum
-  // value in the last dimension. So the axis argument is ignored.
-
   // For ArgMax, the number of output dimensions = (number of input dimensions -
   // 1). For the sake of simplicity, the output dimensions are equal to the
-  // input dimensions here. We enforce the constraint that the last dimension
+  // input dimensions here. We enforce the constraint that the axis dimension
   // must always be 1.
-  const int trailing_dim = output_shape.DimensionsCount() - 1;
   TFLITE_DCHECK_EQ(input1_shape.DimensionsCount(),
                    output_shape.DimensionsCount());
-  TFLITE_DCHECK_EQ(output_shape.Dims(trailing_dim), 1);
-  const int outer_size =
-      MatchingFlatSizeSkipDim(input1_shape, trailing_dim, output_shape);
-  const int depth = input1_shape.Dims(trailing_dim);
 
-  for (int i = 0; i < outer_size; ++i) {
-    auto min_max_value = input1_data[i * depth];
-    int min_max_index = 0;
-    for (int d = 1; d < depth; ++d) {
-      const auto& curr_value = input1_data[i * depth + d];
-      if (cmp(curr_value, min_max_value)) {
-        min_max_value = curr_value;
-        min_max_index = d;
+  int axis = input2_data[0];
+  if (axis < 0) {
+    axis += input1_shape.DimensionsCount();
+  }
+
+  const int axis_size = input1_shape.Dims(axis);
+  TFLITE_DCHECK_EQ(output_shape.Dims(axis), 1);
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i) {
+    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
+    outer_size *= input1_shape.Dims(i);
+  }
+
+  int inner_size = 1;
+  const int dims_count = input1_shape.DimensionsCount();
+  for (int i = axis + 1; i < dims_count; ++i) {
+    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
+    inner_size *= input1_shape.Dims(i);
+  }
+
+  for (int outer = 0; outer < outer_size; ++outer) {
+    for (int inner = 0; inner < inner_size; ++inner) {
+      auto min_max_value = input1_data[outer * axis_size * inner_size + inner];
+      int min_max_index = 0;
+      for (int i = 1; i < axis_size; ++i) {
+        const auto& curr_value =
+            input1_data[(outer * axis_size + i) * inner_size + inner];
+        if (cmp(curr_value, min_max_value)) {
+          min_max_value = curr_value;
+          min_max_index = i;
+        }
       }
+      output_data[outer * inner_size + inner] = min_max_index;
     }
-    output_data[i] = min_max_index;
   }
 }
 
@@ -4369,7 +4512,118 @@ inline void BinaryFunction(const RuntimeShape& input1_shape,
   }
 }
 
+template <typename T>
+inline void ResizeNearestNeighbor(
+    const tflite::ResizeNearestNeighborParams& op_params,
+    const RuntimeShape& unextended_input_shape, const T* input_data,
+    const RuntimeShape& output_size_shape, const int32* output_size_data,
+    const RuntimeShape& unextended_output_shape, T* output_data) {
+  // Align corners = true is not supported.
+  TFLITE_DCHECK(!op_params.align_corners);
+  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+  int32 batches = MatchingDim(input_shape, 0, output_shape, 0);
+  int32 input_height = input_shape.Dims(1);
+  int32 input_width = input_shape.Dims(2);
+  int32 depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+  // The Tensorflow version of this op allows resize on the width and height
+  // axis only.
+  TFLITE_DCHECK_EQ(output_size_shape.FlatSize(), 2);
+  int32 output_height = output_size_data[0];
+  int32 output_width = output_size_data[1];
+
+  // We use float to ensure agreement with the Tensorflow implementation.
+  const float height_scale = static_cast<float>(input_height) / output_height;
+  const float width_scale = static_cast<float>(input_width) / output_width;
+
+  const int col_offset = input_shape.Dims(3);
+  const int row_offset = input_shape.Dims(2) * col_offset;
+  const int batch_offset = input_shape.Dims(1) * row_offset;
+
+  const T* input_ptr = input_data;
+  T* output_ptr = output_data;
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      int32 in_y = std::min(static_cast<int32>(std::floor(y * height_scale)),
+                            input_height - 1);
+      const T* y_input_ptr = input_ptr + in_y * row_offset;
+      for (int x = 0; x < output_width; ++x) {
+        int32 in_x = std::min(static_cast<int32>(std::floor(x * width_scale)),
+                              input_width - 1);
+        const T* x_input_ptr = y_input_ptr + in_x * col_offset;
+        memcpy(output_ptr, x_input_ptr, depth * sizeof(T));
+        output_ptr += depth;
+      }
+    }
+    input_ptr += batch_offset;
+  }
+}
+
+inline void BroadcastPrelu4DSlow(const PreluParams& params,
+                                 const RuntimeShape& input_shape,
+                                 const uint8* input_data,
+                                 const RuntimeShape& alpha_shape,
+                                 const uint8* alpha_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
+  TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          int output_index = Offset(extended_output_shape, b, y, x, c);
+          int input_index = SubscriptToIndex(desc1, b, y, x, c);
+          const int32 input_value =
+              params.input_offset + input_data[input_index];
+          if (input_value >= 0) {
+            output_data[output_index] = input_data[input_index];
+          } else {
+            auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
+            const int32 alpha_value =
+                params.alpha_offset + alpha_data[alpha_index];
+            const int32 unclamped_output =
+                params.output_offset +
+                MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                    input_value * alpha_value, params.output_multiplier,
+                    params.output_shift);
+            const int32 quantized_min = std::numeric_limits<uint8_t>::min();
+            const int32 quantized_max = std::numeric_limits<uint8_t>::max();
+            const int32 clamped_output = std::min(
+                quantized_max, std::max(quantized_min, unclamped_output));
+            output_data[output_index] = static_cast<uint8>(clamped_output);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void Fill(const RuntimeShape& value_shape, const T* value_data,
+          const RuntimeShape& output_shape, T* output_data) {
+  TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = *value_data;
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/softmax.h b/tensorflow/lite/kernels/internal/reference/softmax.h
similarity index 92%
rename from tensorflow/contrib/lite/kernels/internal/reference/softmax.h
rename to tensorflow/lite/kernels/internal/reference/softmax.h
index 7d442961349e349b3101d8a1798ee7dd388426d3..51de6b51aa5308b69dd5b9ad6bf29cd18c0550ba 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/softmax.h
+++ b/tensorflow/lite/kernels/internal/reference/softmax.h
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
 
 #include "fixedpoint/fixedpoint.h"
-#include "tensorflow/contrib/lite/kernels/internal/common.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/round.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace reference_ops {
@@ -176,4 +176,4 @@ inline void Softmax(const float* in, const int input_size, const int batch_size,
 }  // namespace reference_ops
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
diff --git a/tensorflow/lite/kernels/internal/resize_bilinear_test.cc b/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c5ac1992f0f649ca47e2a5bc81ea332abc46bf5
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/test_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace {
+template <typename T>
+void TestOneResizeBilinear(int batch, int depth, int input_width,
+                           int input_height, int output_width,
+                           int output_height, float error_threshold) {
+  RuntimeShape input_dims_inference({batch, input_height, input_width, depth});
+  RuntimeShape output_dims_inference(
+      {batch, output_height, output_width, depth});
+
+  const int input_buffer_size = input_dims_inference.FlatSize();
+  const int output_buffer_size = output_dims_inference.FlatSize();
+
+  std::vector<T> input_data(input_buffer_size, 0);
+  std::vector<T> reference_output_data(output_buffer_size, 0);
+  // Initialize the output data with something other than zero, so we can catch
+  // issue with kernels failing to initialize the output.
+  std::vector<T> output_data(output_buffer_size, 3);
+
+  const T min_amplitude = static_cast<T>(0);
+  const T max_amplitude = static_cast<T>(255);
+  FillRandom(&input_data, min_amplitude, max_amplitude);
+
+  RuntimeShape output_size_dims({1, 1, 1, 2});
+  std::vector<int32> output_size_data = {output_height, output_width};
+
+  tflite::ResizeBilinearParams op_params;
+  op_params.align_corners = false;
+
+  reference_ops::ResizeBilinear(op_params, input_dims_inference,
+                                input_data.data(), output_size_dims,
+                                output_size_data.data(), output_dims_inference,
+                                reference_output_data.data());
+  optimized_ops::ResizeBilinear(
+      op_params, input_dims_inference, input_data.data(), output_size_dims,
+      output_size_data.data(), output_dims_inference, output_data.data());
+
+  double sum_diff = 0;
+  float max_abs_val = 0;
+  for (int i = 0; i < output_buffer_size; i++) {
+    sum_diff += std::abs(static_cast<float>(output_data[i]) -
+                         static_cast<float>(reference_output_data[i]));
+    max_abs_val = std::max(
+        max_abs_val, std::abs(static_cast<float>(reference_output_data[i])));
+  }
+
+  if (sum_diff != 0.f) {
+    const float mean_diff = static_cast<float>(sum_diff / output_buffer_size);
+    const float relative_error = std::abs(mean_diff) / max_abs_val;
+    ASSERT_LT(relative_error, error_threshold);
+  }
+}
+
+TEST(ResizeBilinear, TestResizeBilinear8Bit) {
+  const int kTestsToRun = 100 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
+    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
+    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+
+    TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height,
+                                 output_width, output_height, 0.025);
+  }
+}
+
+TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
+  const int kTestsToRun = 100 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
+    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
+    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_width = input_width * 2;
+    const int output_height = input_height * 2;
+
+    TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height,
+                                 output_width, output_height, 1e-5);
+  }
+}
+
+TEST(ResizeBilinear, TestResizeBilinear) {
+  const int kTestsToRun = 100 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
+    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
+    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+
+    TestOneResizeBilinear<float>(batch, depth, input_width, input_height,
+                                 output_width, output_height, 1e-5);
+  }
+}
+
+TEST(ResizeBilinear2x2, TestResizeBilinear) {
+  const int kTestsToRun = 100 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
+    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
+    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_width = input_width * 2;
+    const int output_height = input_height * 2;
+
+    TestOneResizeBilinear<float>(batch, depth, input_width, input_height,
+                                 output_width, output_height, 1e-5);
+  }
+}
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/resize_nearest_neighbor_test.cc b/tensorflow/lite/kernels/internal/resize_nearest_neighbor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..102ee04e6a89bd5ca53452a8567464bca95e4675
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/resize_nearest_neighbor_test.cc
@@ -0,0 +1,239 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/test_util.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace {
+
+template <typename T>
+void TestReferenceResizeNearestNeighbor(
+    const RuntimeShape& input_shape, const std::vector<T>& input_data,
+    const std::vector<int32>& output_size_data,
+    const RuntimeShape& output_shape,
+    const std::vector<T>& expected_output_data) {
+  ResizeNearestNeighborParams op_params{/*align_corners=*/false};
+  RuntimeShape output_size_shape({1, 1, 1, 2});
+
+  std::vector<T> output_data(expected_output_data.size());
+  reference_ops::ResizeNearestNeighbor(
+      op_params, input_shape, input_data.data(), output_size_shape,
+      output_size_data.data(), output_shape, output_data.data());
+  ASSERT_EQ(expected_output_data, output_data);
+}
+
+// Sanity test values are from
+// third_party/tensorflow/core/kernels/resize_nearest_neighbor_op_test.cc.
+
+TEST(ResizeNearestNeighborReference, Test2x2To1x1) {
+  RuntimeShape input_shape = {1, 2, 2, 1};
+  std::vector<float> input_data = {1, 2, 3, 4};
+  std::vector<int32> output_size_data = {1, 1};
+  RuntimeShape output_shape = {1, 1, 1, 1};
+  std::vector<float> output_data = {1};
+
+  TestReferenceResizeNearestNeighbor(input_shape, input_data, output_size_data,
+                                     output_shape, output_data);
+}
+
+TEST(ResizeNearestNeighborReference, Test2x2To3x3) {
+  RuntimeShape input_shape = {1, 2, 2, 1};
+  std::vector<uint8> input_data = {1, 2, 3, 4};
+  std::vector<int32> output_size_data = {3, 3};
+  RuntimeShape output_shape = {1, 3, 3, 1};
+  std::vector<uint8> output_data = {1, 1, 2, 1, 1, 2, 3, 3, 4};
+
+  TestReferenceResizeNearestNeighbor(input_shape, input_data, output_size_data,
+                                     output_shape, output_data);
+}
+
+TEST(ResizeNearestNeighborReference, Test3x3To2x2) {
+  RuntimeShape input_shape = {1, 3, 3, 1};
+  std::vector<float> input_data = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  std::vector<int32> output_size_data = {2, 2};
+  RuntimeShape output_shape = {1, 2, 2, 1};
+  std::vector<float> output_data = {1, 2, 4, 5};
+
+  TestReferenceResizeNearestNeighbor(input_shape, input_data, output_size_data,
+                                     output_shape, output_data);
+}
+
+TEST(ResizeNearestNeighborReference, Test2x2To2x5) {
+  RuntimeShape input_shape = {1, 2, 2, 1};
+  std::vector<uint8> input_data = {1, 2, 3, 4};
+  std::vector<int32> output_size_data = {2, 5};
+  RuntimeShape output_shape = {1, 2, 5, 1};
+  std::vector<uint8> output_data = {1, 1, 1, 2, 2, 3, 3, 3, 4, 4};
+
+  TestReferenceResizeNearestNeighbor(input_shape, input_data, output_size_data,
+                                     output_shape, output_data);
+}
+
+TEST(ResizeNearestNeighborReference, Test4x4To3x3) {
+  RuntimeShape input_shape = {1, 4, 4, 1};
+  std::vector<uint8> input_data = {1, 2,  3,  4,  5,  6,  7,  8,
+                                   9, 10, 11, 12, 13, 14, 15, 16};
+  std::vector<int32> output_size_data = {3, 3};
+  RuntimeShape output_shape = {1, 3, 3, 1};
+  std::vector<uint8> output_data = {1, 2, 3, 5, 6, 7, 9, 10, 11};
+
+  TestReferenceResizeNearestNeighbor(input_shape, input_data, output_size_data,
+                                     output_shape, output_data);
+}
+
+TEST(ResizeNearestNeighborReference, Test2x2To5x2) {
+  RuntimeShape input_shape = {1, 2, 2, 1};
+  std::vector<float> input_data = {1, 2, 3, 4};
+  std::vector<int32> output_size_data = {5, 2};
+  RuntimeShape output_shape = {1, 5, 2, 1};
+  std::vector<float> output_data = {1, 2, 1, 2, 1, 2, 3, 4, 3, 4};
+
+  TestReferenceResizeNearestNeighbor(input_shape, input_data, output_size_data,
+                                     output_shape, output_data);
+}
+
+TEST(ResizeNearestNeighborReference, Test2x2To4x4) {
+  RuntimeShape input_shape = {1, 2, 2, 1};
+  std::vector<uint8> input_data = {1, 2, 3, 4};
+  std::vector<int32> output_size_data = {4, 4};
+  RuntimeShape output_shape = {1, 4, 4, 1};
+  std::vector<uint8> output_data = {1, 1, 2, 2, 1, 1, 2, 2,
+                                    3, 3, 4, 4, 3, 3, 4, 4};
+
+  TestReferenceResizeNearestNeighbor(input_shape, input_data, output_size_data,
+                                     output_shape, output_data);
+}
+
+TEST(ResizeNearestNeighborReference, Test2x2x2x2To2x3x3x2) {
+  // Input:
+  //  [ [ 1, 1 ], [ 2, 2 ],
+  //    [ 3, 3 ], [ 4, 4 ] ],
+  //  [ [ 5, 5 ], [ 6, 6 ],
+  //    [ 7, 7 ], [ 8, 8 ] ]
+  RuntimeShape input_shape = {2, 2, 2, 2};
+  std::vector<float> input_data = {1, 1, 2, 2, 3, 3, 4, 4,
+                                   5, 5, 6, 6, 7, 7, 8, 8};
+  std::vector<int32> output_size_data = {3, 3};
+  RuntimeShape output_shape = {2, 3, 3, 2};
+  // Output:
+  //  [ [ 1, 1 ], [ 1, 1 ], [ 2, 2 ],
+  //    [ 1, 1 ], [ 1, 1 ], [ 2, 2 ],
+  //    [ 3, 3 ], [ 3, 3 ], [ 4, 4 ] ],
+  //  [ [ 5, 5 ], [ 5, 5 ], [ 6, 6 ],
+  //    [ 5, 5 ], [ 5, 5 ], [ 6, 6 ],
+  //    [ 7, 7 ], [ 7, 7 ], [ 8, 8 ] ]
+  std::vector<float> output_data = {1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2,
+                                    3, 3, 3, 3, 4, 4, 5, 5, 5, 5, 6, 6,
+                                    5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 8, 8};
+
+  TestReferenceResizeNearestNeighbor(input_shape, input_data, output_size_data,
+                                     output_shape, output_data);
+}
+
+void TestOptimizedResizeNearestNeighbor(int batch, int depth, int input_width,
+                                        int input_height, int output_width,
+                                        int output_height) {
+  ResizeNearestNeighborParams op_params{/*align_corners=*/false};
+  RuntimeShape output_size_shape({1, 1, 1, 2});
+
+  RuntimeShape input_shape({batch, input_height, input_width, depth});
+  RuntimeShape output_shape({batch, output_height, output_width, depth});
+
+  std::vector<uint8> input_data(input_shape.FlatSize(), 0);
+  FillRandom(&input_data, static_cast<uint8>(0), static_cast<uint8>(255));
+
+  std::vector<uint8> reference_output_data(output_shape.FlatSize(), 0);
+  // Initialize the output data with something other than zero, so we can catch
+  // issue with kernels failing to initialize the output.
+  std::vector<uint8> output_data(output_shape.FlatSize(), 3);
+  std::vector<int32> output_size_data = {output_height, output_width};
+
+  // Test the optimized version against the reference version.
+  reference_ops::ResizeNearestNeighbor(
+      op_params, input_shape, input_data.data(), output_size_shape,
+      output_size_data.data(), output_shape, reference_output_data.data());
+  optimized_ops::ResizeNearestNeighbor(
+      op_params, input_shape, input_data.data(), output_size_shape,
+      output_size_data.data(), output_shape, output_data.data());
+
+  ASSERT_EQ(reference_output_data, output_data);
+}
+
+// Since the optimized version uses fixed-point and the reference version uses
+// float, offsets may differ. Test if the input/output image combination results
+// in the same offsets before running parity tests.
+bool is_valid_scale(int input_width, int input_height, int output_width,
+                    int output_height) {
+  const float height_scale_float =
+      static_cast<float>(input_height) / output_height;
+  const float width_scale_float =
+      static_cast<float>(input_width) / output_width;
+
+  int32 height_scale_int = (input_height << 16) / output_height + 1;
+  int32 width_scale_int = (input_width << 16) / output_width + 1;
+
+  for (int y = 0; y < output_height; ++y) {
+    int32 in_y_float =
+        std::min(static_cast<int32>(std::floor(y * height_scale_float)),
+                 input_height - 1);
+    int32 in_y_int = std::min((y * height_scale_int) >> 16, input_height - 1);
+    if (in_y_int != in_y_float) {
+      return false;
+    }
+    for (int x = 0; x < output_width; ++x) {
+      int32 in_x_float =
+          std::min(static_cast<int32>(std::floor(x * width_scale_float)),
+                   input_width - 1);
+      int32 in_x_int = std::min((x * width_scale_int) >> 16, input_width - 1);
+      if (in_x_int != in_x_float) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+TEST(ResizeNearestNeighborOptimized, TestReferenceParity) {
+  int invalid_count = 0;
+  const int kTestsToRun = 100 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
+    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
+    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+
+    if (is_valid_scale(input_width, input_height, output_width,
+                       output_height)) {
+      TestOptimizedResizeNearestNeighbor(
+          batch, depth, input_width, input_height, output_width, output_height);
+    } else {
+      invalid_count++;
+    }
+  }
+  // Test that the total number of invalid tests are a small percentage.
+  ASSERT_LT(static_cast<float>(invalid_count) / kTestsToRun, 0.001f);
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/round.h b/tensorflow/lite/kernels/internal/round.h
similarity index 86%
rename from tensorflow/contrib/lite/kernels/internal/round.h
rename to tensorflow/lite/kernels/internal/round.h
index f299d0bd8733dc603c4950091c8ac3d7890548a7..cb494bfd5374d90bac0c8f444e186f137f45a91f 100644
--- a/tensorflow/contrib/lite/kernels/internal/round.h
+++ b/tensorflow/lite/kernels/internal/round.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_ROUND_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_ROUND_H_
 
 #include <cmath>
 
@@ -36,4 +36,4 @@ inline T TfLiteRound(const T x) {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_ROUND_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc b/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc
rename to tensorflow/lite/kernels/internal/softmax_quantized_test.cc
index 831fb3c24353b2e321c15808c1cac78ca9e9a093..743ce0355c96fd2766fd2315299c2419703f11b7 100644
--- a/tensorflow/contrib/lite/kernels/internal/softmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
@@ -23,11 +23,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/test_util.h"
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/test_util.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc b/tensorflow/lite/kernels/internal/spectrogram.cc
similarity index 99%
rename from tensorflow/contrib/lite/kernels/internal/spectrogram.cc
rename to tensorflow/lite/kernels/internal/spectrogram.cc
index 20abcb725859d03f83c969369bddf1429895e0ba..58769ad8cc7a06b414cfde65b7b86738307f9b20 100644
--- a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
+++ b/tensorflow/lite/kernels/internal/spectrogram.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/kernels/internal/spectrogram.h"
+#include "tensorflow/lite/kernels/internal/spectrogram.h"
 
 #include <assert.h>
 #include <math.h>
diff --git a/tensorflow/contrib/lite/kernels/internal/spectrogram.h b/tensorflow/lite/kernels/internal/spectrogram.h
similarity index 95%
rename from tensorflow/contrib/lite/kernels/internal/spectrogram.h
rename to tensorflow/lite/kernels/internal/spectrogram.h
index b77a68f7dfe6edb07ec4e5db540c673b2d6f6d6e..b885b9d7d2d84584faa8e2af02216d2d2bc72e56 100644
--- a/tensorflow/contrib/lite/kernels/internal/spectrogram.h
+++ b/tensorflow/lite/kernels/internal/spectrogram.h
@@ -28,8 +28,8 @@ limitations under the License.
 // window = hann(window_length_samples, 'periodic');
 // S = abs(spectrogram(audio, window, overlap_samples)).^2;
 
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_SPECTROGRAM_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_SPECTROGRAM_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_SPECTROGRAM_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_SPECTROGRAM_H_
 
 #include <complex>
 #include <deque>
@@ -107,4 +107,4 @@ class Spectrogram {
 }  // namespace internal
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_SPECTROGRAM_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_SPECTROGRAM_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h b/tensorflow/lite/kernels/internal/strided_slice_logic.h
similarity index 94%
rename from tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h
rename to tensorflow/lite/kernels/internal/strided_slice_logic.h
index af5db1064c1b7b13665edadf16425b8e8e8247f6..e7fd5ca93195562cdfa0bd2533ab0efba34b9412 100644
--- a/tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h
+++ b/tensorflow/lite/kernels/internal/strided_slice_logic.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
 
 #include <limits>
 #include <vector>
-#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace strided_slice {
@@ -195,4 +195,4 @@ inline tflite::StridedSliceParams BuildStridedSliceParams(
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_STRIDED_SLICE_LOGIC_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/lite/kernels/internal/tensor.h
similarity index 91%
rename from tensorflow/contrib/lite/kernels/internal/tensor.h
rename to tensorflow/lite/kernels/internal/tensor.h
index 689cea03e75875c27e72a40701da25391bfbd32b..b806753d886132d996d57c20d80e7616da758497 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/lite/kernels/internal/tensor.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
 
 #include <complex>
 #include <vector>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 
@@ -111,4 +111,4 @@ class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h b/tensorflow/lite/kernels/internal/tensor_ctypes.h
similarity index 83%
rename from tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h
rename to tensorflow/lite/kernels/internal/tensor_ctypes.h
index 9f5b33d21753512df7b6d3bbfc5abdbec1bc71b7..4a94b703f8b299e503305aaa897a2ebc65e50d3b 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_ctypes.h
+++ b/tensorflow/lite/kernels/internal/tensor_ctypes.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 
@@ -53,6 +53,11 @@ inline bool* GetTensorData(TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.b : nullptr;
 }
 
+template <>
+inline int8_t* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.int8 : nullptr;
+}
+
 template <typename T>
 inline const T* GetTensorData(const TfLiteTensor* tensor);
 
@@ -66,6 +71,11 @@ inline const uint8_t* GetTensorData(const TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.uint8 : nullptr;
 }
 
+template <>
+inline const int8_t* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.int8 : nullptr;
+}
+
 template <>
 inline const int16_t* GetTensorData(const TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.i16 : nullptr;
@@ -99,4 +109,4 @@ inline RuntimeShape GetTensorShape(const TfLiteTensor* tensor) {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_CTYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_test.cc b/tensorflow/lite/kernels/internal/tensor_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/internal/tensor_test.cc
rename to tensorflow/lite/kernels/internal/tensor_test.cc
index 2ed73ba82d6473b6756de712b7d8232ae9d20d3f..7bfe280d6f883ca03dbd2d517509355e578e4375 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_test.cc
+++ b/tensorflow/lite/kernels/internal/tensor_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
diff --git a/tensorflow/lite/kernels/internal/tensor_utils.cc b/tensorflow/lite/kernels/internal/tensor_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..701e5a66aa1bac15dedfa677ec34c8142c0a4309
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/tensor_utils.cc
@@ -0,0 +1,28 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif  //  defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif  //  USE_NEON
+
+#ifdef USE_NEON
+#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h"
+#else
+#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h"
+#endif  // USE_NEON
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h
similarity index 96%
rename from tensorflow/contrib/lite/kernels/internal/tensor_utils.h
rename to tensorflow/lite/kernels/internal/tensor_utils.h
index b0fe5adf65de83269af55781b48f9fd9bd819a5e..71ae69522f9a45745a9ed9eae211db3d048ba43d 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/tensor_utils.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
 
 #if defined(_MSC_VER)
 #define __restrict__ __restrict
@@ -165,4 +165,4 @@ void MeanStddevNormalization(const float* input_vector, float* output_vector,
 }  // namespace tensor_utils
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
rename to tensorflow/lite/kernels/internal/tensor_utils_test.cc
index 6458af714b8c714f7132dc17adf4eca20ece3e37..29866d066406e58e06e6caa2e5b410460564c966 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
+++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include <gmock/gmock.h>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/kernels/test_util.h"
 
 namespace tflite {
 namespace tensor_utils {
diff --git a/tensorflow/lite/kernels/internal/test_util.cc b/tensorflow/lite/kernels/internal/test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4462775ddbdd250c00b65b15de2082e1219e358b
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/test_util.cc
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/internal/test_util.h"
+
+#include <cmath>
+#include <iterator>
+
+namespace tflite {
+
+// this is a copied from an internal function in propagate_fixed_sizes.cc
+bool ComputeConvSizes(const RuntimeShape& input_shape, int output_depth,
+                      int filter_width, int filter_height, int stride,
+                      int dilation_width_factor, int dilation_height_factor,
+                      PaddingType padding_type, RuntimeShape* output_shape,
+                      int* pad_width, int* pad_height) {
+  const int input_width = input_shape.Dims(2);
+  const int input_height = input_shape.Dims(1);
+  const int batch = input_shape.Dims(0);
+
+  int dilated_filter_width = dilation_width_factor * (filter_width - 1) + 1;
+  int dilated_filter_height = dilation_height_factor * (filter_height - 1) + 1;
+
+  int output_height = 0;
+  int output_width = 0;
+  if (padding_type == PaddingType::kValid) {
+    // Official TF is
+    // ceil((input_height - (dilated_filter_height - 1)) / stride),
+    // implemented as
+    // floor(
+    //   (input_height - (dilated_filter_height - 1) + (stride - 1)) / stride).
+    output_height = (input_height + stride - dilated_filter_height) / stride;
+    output_width = (input_width + stride - dilated_filter_width) / stride;
+  } else if (padding_type == PaddingType::kSame) {
+    output_height = (input_height + stride - 1) / stride;
+    output_width = (input_width + stride - 1) / stride;
+  } else {
+    return false;
+  }
+
+  if (output_width <= 0 || output_height <= 0) {
+    return false;
+  }
+
+  *pad_height = std::max(
+      0, ((output_height - 1) * stride + dilated_filter_height - input_height) /
+             2);
+  *pad_width = std::max(
+      0,
+      ((output_width - 1) * stride + dilated_filter_width - input_width) / 2);
+
+  output_shape->BuildFrom({batch, output_height, output_width, output_depth});
+  return true;
+}
+
+std::mt19937& RandomEngine() {
+  static std::mt19937 engine;
+  return engine;
+}
+
+int UniformRandomInt(int min, int max) {
+  std::uniform_int_distribution<int> dist(min, max);
+  return dist(RandomEngine());
+}
+
+float UniformRandomFloat(float min, float max) {
+  std::uniform_real_distribution<float> dist(min, max);
+  return dist(RandomEngine());
+}
+
+int ExponentialRandomPositiveInt(float percentile, int percentile_val,
+                                 int max_val) {
+  const float lambda =
+      -std::log(1.f - percentile) / static_cast<float>(percentile_val);
+  std::exponential_distribution<float> dist(lambda);
+  float val;
+  do {
+    val = dist(RandomEngine());
+  } while (!val || !std::isfinite(val) || val > max_val);
+  return static_cast<int>(std::ceil(val));
+}
+
+float ExponentialRandomPositiveFloat(float percentile, float percentile_val,
+                                     float max_val) {
+  const float lambda =
+      -std::log(1.f - percentile) / static_cast<float>(percentile_val);
+  std::exponential_distribution<float> dist(lambda);
+  float val;
+  do {
+    val = dist(RandomEngine());
+  } while (!std::isfinite(val) || val > max_val);
+  return val;
+}
+
+void FillRandom(std::vector<float>* vec, float min, float max) {
+  std::uniform_real_distribution<float> dist(min, max);
+  auto gen = std::bind(dist, RandomEngine());
+  std::generate(std::begin(*vec), std::end(*vec), gen);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/test_util.h b/tensorflow/lite/kernels/internal/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..766a627c99e03b37cbb780d0bebe231ea3a6f256
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/test_util.h
@@ -0,0 +1,103 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TEST_UTIL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_TEST_UTIL_H_
+
+#include <algorithm>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <random>
+#include <vector>
+
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+// Computes output and padding dimensions.
+bool ComputeConvSizes(const RuntimeShape& input_shape, int output_depth,
+                      int filter_width, int filter_height, int stride,
+                      int dilation_width_factor, int dilation_height_factor,
+                      PaddingType padding_type, RuntimeShape* output_shape,
+                      int* pad_width, int* pad_height);
+
+// Returns a mt19937 random engine.
+std::mt19937& RandomEngine();
+
+// Returns a random integer uniformly distributed between |min| and |max|.
+int UniformRandomInt(int min, int max);
+
+// Returns a random float uniformly distributed between |min| and |max|.
+float UniformRandomFloat(float min, float max);
+
+// Returns a random element in |v|.
+template <typename T>
+const T& RandomElement(const std::vector<T>& v) {
+  return v[UniformRandomInt(0, v.size() - 1)];
+}
+
+// Returns a random exponentially distributed integer.
+int ExponentialRandomPositiveInt(float percentile, int percentile_val,
+                                 int max_val);
+
+// Returns a random exponentially distributed float.
+float ExponentialRandomPositiveFloat(float percentile, float percentile_val,
+                                     float max_val);
+
+// Fills a vector with random floats between |min| and |max|.
+void FillRandom(std::vector<float>* vec, float min, float max);
+
+// Fills a vector with random numbers between |min| and |max|.
+template <typename T>
+void FillRandom(std::vector<T>* vec, T min, T max) {
+  std::uniform_int_distribution<T> dist(min, max);
+  auto gen = std::bind(dist, RandomEngine());
+  std::generate(std::begin(*vec), std::end(*vec), gen);
+}
+
+// Fills a vector with random numbers.
+template <typename T>
+void FillRandom(std::vector<T>* vec) {
+  FillRandom(vec, std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
+}
+
+template <typename T>
+void FillRandom(typename std::vector<T>::iterator begin_it,
+                typename std::vector<T>::iterator end_it, T min, T max) {
+  std::uniform_int_distribution<T> dist(min, max);
+  auto gen = std::bind(dist, RandomEngine());
+  std::generate(begin_it, end_it, gen);
+}
+
+// Fill with a "skyscraper" pattern, in which there is a central section (across
+// the depth) with higher values than the surround.
+template <typename T>
+void FillRandomSkyscraper(std::vector<T>* vec, int depth,
+                          double middle_proportion, uint8 middle_min,
+                          uint8 sides_max) {
+  for (auto base_it = std::begin(*vec); base_it != std::end(*vec);
+       base_it += depth) {
+    auto left_it = base_it + std::ceil(0.5 * depth * (1.0 - middle_proportion));
+    auto right_it =
+        base_it + std::ceil(0.5 * depth * (1.0 + middle_proportion));
+    FillRandom(base_it, left_it, std::numeric_limits<T>::min(), sides_max);
+    FillRandom(left_it, right_it, middle_min, std::numeric_limits<T>::max());
+    FillRandom(right_it, base_it + depth, std::numeric_limits<T>::min(),
+               sides_max);
+  }
+}
+
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TEST_UTIL_H_
diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..859ec8c68252538e3cf6d06ce7864f62d2a236dc
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/types.h
@@ -0,0 +1,1047 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TYPES_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_TYPES_H_
+
+#include <algorithm>
+#include <cstring>
+
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+
+namespace tflite {
+
+enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
+enum class PaddingType : uint8 { kNone, kSame, kValid };
+
+struct PaddingValues {
+  int16 width;
+  int16 height;
+};
+
+// This enumeration allows for non-default formats for the weights array
+// of a fully-connected operator, allowing the use of special optimized
+// runtime paths.
+enum class FullyConnectedWeightsFormat : uint8 {
+  // Default format (flat 2D layout, the inner contiguous dimension
+  // is input_depth, the outer non-contiguous dimension is output_depth)
+  kDefault,
+  // Summary: optimized layout for fast CPU runtime implementation,
+  // aimed specifically at ARM CPUs at the moment, and specialized for
+  // 8-bit quantized layers.
+  //
+  // The use case we're concerned with here is: 8-bit quantization,
+  // large weights matrix that doesn't fit in cache (e.g. 4096x2048 in
+  // a key application that drove this), very small batch size (e.g. 1 -- 4).
+  //
+  // Even with 8-bit quantization of weights, the performance of memory
+  // accesses to the weights can become the dominant issue when
+  // the batch size is small, so each weight value is used in only a few
+  // arithmetic ops, i.e. the fully-connected node has a low arithmetic
+  // intensity. The specific issues that arise are of three kinds:
+  // (1) One may, ideally, max out DRAM bandwidth, i.e. be truly memory
+  //     bound. That's the "good" issue to run into.
+  // (2) One may run into sub-optimal pre-fetching: the data hasn't been
+  //     prefetched into the cache by the time we need it.
+  // (3) One may run into cache aliasing: multiple values that are
+  //     pre-fetched, alias each other in the L1 cache (which typically
+  //     has only 4-way set associativity in ARM CPUs) and thus evict
+  //     each other before we get to using them.
+  //
+  // The point of this shuffling is to avoid issues (2) and (3) so that
+  // we get as fast as possible given only the hard constraint (1).
+  // This is achieved by turning the difficulty into a solution: the
+  // difficulty, that each value loaded from memory is used only in
+  // one kernel iteration, making this operation memory-intensive, hints at
+  // the solution, of shuffling the weights so that they are stored in the
+  // exact order as the kernel needs to load them, so that the memory
+  // accesses made by the kernel are trivial. This solves (2) because the
+  // trivial memory access pattern allows the CPU's automatic prefetching
+  // to perform very well (no need even for preload instructions), and this
+  // solves (3) because the values being loaded concurrently are now
+  // contiguous in the address space, thus don't alias each other in the cache.
+  //
+  // On ARM, we typically want our kernel to process a 4x16 block of weights
+  // at a time, because:
+  //   - 16 is the number of bytes in a NEON register.
+  //   - 4 is how many rows we need to handle concurrently in the kernel in
+  //     order to have sufficient mutual independence of instructions to
+  //     maximize arithmetic throughput.
+  //
+  // Finally, the 'Int8' part in the name refers to the fact that this
+  // weights format has each weights value encoded as a signed int8 value,
+  // even if the data type of the weights buffer is uint8.  This is intended
+  // to save runtime kernels the effort to have to XOR the top bit of these
+  // bytes before using them in signed arithmetic, see this file for more
+  // explanations on the 'signed int8 trick' in matrix multiplication kernels:
+  //
+  //   tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
+  //
+  kShuffled4x16Int8,
+};
+
+// Quantization parameters, determining the mapping of quantized values
+// to real values (i.e. determining how quantized values are mathematically
+// interpreted).
+//
+// The correspondence is as follows:
+//
+//   real_value = scale * (quantized_value - zero_point);
+//
+// In other words, zero_point designates which quantized value corresponds to
+// the real 0 value, and scale designates the difference between the real values
+// corresponding to consecutive quantized values differing by 1.
+struct QuantizationParams {
+  int32 zero_point = 0;
+  double scale = 0.0;
+};
+
+inline bool operator==(const QuantizationParams& qp1,
+                       const QuantizationParams& qp2) {
+  return qp1.zero_point == qp2.zero_point && qp1.scale == qp2.scale;
+}
+
+template <int N>
+struct Dims {
+  int sizes[N];
+  int strides[N];
+};
+
+class RuntimeShape {
+ public:
+  // Shapes with dimensions up to 4 are stored directly in the structure, while
+  // larger shapes are separately allocated.
+  static constexpr int kMaxSmallSize = 4;
+
+  RuntimeShape& operator=(RuntimeShape const&) = delete;
+
+  RuntimeShape() : size_(0) {}
+
+  explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
+    if (dimensions_count > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+      TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else   // TF_LITE_STATIC_MEMORY
+      dims_pointer_ = new int32[dimensions_count];
+#endif  // TF_LITE_STATIC_MEMORY
+    }
+  }
+
+  RuntimeShape(int shape_size, int32 value) : size_(0) {
+    Resize(shape_size);
+    for (int i = 0; i < shape_size; ++i) {
+      SetDim(i, value);
+    }
+  }
+
+  RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
+    ReplaceWith(dimensions_count, dims_data);
+  }
+
+  RuntimeShape(const std::initializer_list<int> init_list) : size_(0) {
+    BuildFrom(init_list);
+  }
+
+  // Avoid using this constructor.  We should be able to delete it when C++17
+  // rolls out.
+  RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
+    if (size_ > kMaxSmallSize) {
+      dims_pointer_ = new int32[size_];
+    }
+    std::memcpy(DimsData(), other.DimsData(), sizeof(int32) * size_);
+  }
+
+  bool operator==(const RuntimeShape& comp) const {
+    return this->size_ == comp.size_ &&
+           std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32)) == 0;
+  }
+
+  ~RuntimeShape() {
+    if (size_ > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+      TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else   // TF_LITE_STATIC_MEMORY
+      delete[] dims_pointer_;
+#endif  // TF_LITE_STATIC_MEMORY
+    }
+  }
+
+  inline int32 DimensionsCount() const { return size_; }
+  inline int32 Dims(int i) const {
+    TFLITE_DCHECK_GE(i, 0);
+    TFLITE_DCHECK_LT(i, size_);
+    return size_ > kMaxSmallSize ? dims_pointer_[i] : dims_[i];
+  }
+  inline void SetDim(int i, int32 val) {
+    TFLITE_DCHECK_GE(i, 0);
+    TFLITE_DCHECK_LT(i, size_);
+    if (size_ > kMaxSmallSize) {
+      dims_pointer_[i] = val;
+    } else {
+      dims_[i] = val;
+    }
+  }
+
+  inline int32* DimsData() {
+    return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
+  }
+  inline const int32* DimsData() const {
+    return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
+  }
+  // The caller must ensure that the shape is no bigger than 4-D.
+  inline const int32* DimsDataUpTo4D() const { return dims_; }
+
+  inline void Resize(int dimensions_count) {
+    if (size_ > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+      TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else   // TF_LITE_STATIC_MEMORY
+      delete[] dims_pointer_;
+#endif  // TF_LITE_STATIC_MEMORY
+    }
+    size_ = dimensions_count;
+    if (dimensions_count > kMaxSmallSize) {
+#ifdef TF_LITE_STATIC_MEMORY
+      TFLITE_CHECK(false && "No shape resizing supported on this platform");
+#else   // TF_LITE_STATIC_MEMORY
+      dims_pointer_ = new int32[dimensions_count];
+#endif  // TF_LITE_STATIC_MEMORY
+    }
+  }
+
+  inline void ReplaceWith(int dimensions_count, const int32* dims_data) {
+    Resize(dimensions_count);
+    int32* dst_dims = DimsData();
+    std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32));
+  }
+
+  template <typename T>
+  inline void BuildFrom(const T& src_iterable) {
+    const int dimensions_count =
+        std::distance(src_iterable.begin(), src_iterable.end());
+    Resize(dimensions_count);
+    int32* data = DimsData();
+    for (auto it : src_iterable) {
+      *data = it;
+      ++data;
+    }
+  }
+
+  // This will probably be factored out. Old code made substantial use of 4-D
+  // shapes, and so this function is used to extend smaller shapes. Note that
+  // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be
+  // reduced, and (b) some kernels are stricly 4-D, but then the shapes of their
+  // inputs should already be 4-D, so this function should not be needed.
+  inline static RuntimeShape ExtendedShape(int new_shape_size,
+                                           const RuntimeShape& shape) {
+    return RuntimeShape(new_shape_size, shape, 1);
+  }
+
+  inline void BuildFrom(const std::initializer_list<int> init_list) {
+    BuildFrom<const std::initializer_list<int>>(init_list);
+  }
+
+  // Returns the total count of elements, that is the size when flattened into a
+  // vector.
+  inline int FlatSize() const {
+    int buffer_size = 1;
+    const int* dims_data = DimsData();
+    for (int i = 0; i < size_; i++) {
+      const int dim = dims_data[i];
+      TFLITE_DCHECK_GE(dim, 1);
+      buffer_size *= dim;
+    }
+    return buffer_size;
+  }
+
+  bool operator!=(const RuntimeShape& comp) const { return !((*this) == comp); }
+
+ private:
+  // For use only by ExtendedShape(), written to guarantee (return-value) copy
+  // elision in C++17.
+  // This creates a shape padded to the desired size with the specified value.
+  RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value)
+      : size_(0) {
+    // If the following check fails, it is likely because a 4D-only kernel is
+    // being used with an array of larger dimension count.
+    TFLITE_CHECK_GE(new_shape_size, shape.DimensionsCount());
+    Resize(new_shape_size);
+    const int size_increase = new_shape_size - shape.DimensionsCount();
+    for (int i = 0; i < size_increase; ++i) {
+      SetDim(i, pad_value);
+    }
+    std::memcpy(DimsData() + size_increase, shape.DimsData(),
+                sizeof(int32) * shape.DimensionsCount());
+  }
+
+  int32 size_;
+  union {
+    int32 dims_[kMaxSmallSize];
+    int32* dims_pointer_;
+  };
+};
+
+// Converts inference-style shape to legacy tflite::Dims<4>.
+inline tflite::Dims<4> ToRuntimeDims(const tflite::RuntimeShape& array_shape) {
+  tflite::Dims<4> result;
+  const int dimensions_count = array_shape.DimensionsCount();
+  TFLITE_CHECK_LE(dimensions_count, 4);
+  int cum_prod = 1;
+  for (int i = 0; i < 4; i++) {
+    const int new_dim =
+        (i < dimensions_count) ? array_shape.Dims(dimensions_count - 1 - i) : 1;
+    result.sizes[i] = new_dim;
+    result.strides[i] = cum_prod;
+    cum_prod *= new_dim;
+  }
+  return result;
+}
+
+// TODO(b/80418076): Move to legacy ops file, update invocations.
+inline RuntimeShape DimsToShape(const tflite::Dims<4>& dims) {
+  return RuntimeShape(
+      {dims.sizes[3], dims.sizes[2], dims.sizes[1], dims.sizes[0]});
+}
+
+// Gets next index to iterate through a multidimensional array.
+inline bool NextIndex(const int num_dims, const int* dims, int* current) {
+  if (num_dims == 0) {
+    return false;
+  }
+  TFLITE_DCHECK(dims != nullptr);
+  TFLITE_DCHECK(current != nullptr);
+  int carry = 1;
+  for (int idx = num_dims - 1; idx >= 0; --idx) {
+    int current_val = current[idx] + carry;
+    TFLITE_DCHECK_GE(dims[idx], current_val);
+    if (dims[idx] == current_val) {
+      current[idx] = 0;
+    } else {
+      current[idx] = current_val;
+      carry = 0;
+      break;
+    }
+  }
+  return (carry == 0);
+}
+
+// Gets offset of index if reducing on axis. When reducing, the flattened offset
+// will not change, if the input index changes on the given axis. For example,
+// if you have a 3D tensor and you are reducing to 2D by eliminating axis 0,
+// then index (0, 1, 2) and index (1, 1, 2) will map to the same flattened
+// offset.
+// TODO(kanlig): uses Dims to represent dimensions.
+inline size_t ReducedOutputOffset(const int num_dims, const int* dims,
+                                  const int* index, const int num_axis,
+                                  const int* axis) {
+  if (num_dims == 0) {
+    return 0;
+  }
+  TFLITE_DCHECK(dims != nullptr);
+  TFLITE_DCHECK(index != nullptr);
+  size_t offset = 0;
+  for (int idx = 0; idx < num_dims; ++idx) {
+    // if we need to skip this axis
+    bool is_axis = false;
+    if (axis != nullptr) {
+      for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) {
+        if (idx == axis[axis_idx]) {
+          is_axis = true;
+          break;
+        }
+      }
+    }
+    if (!is_axis) {
+      offset = offset * static_cast<size_t>(dims[idx]) +
+               static_cast<size_t>(index[idx]);
+    }
+  }
+  return offset;
+}
+
+inline int Offset(const RuntimeShape& shape, int i0, int i1, int i2, int i3) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), 4);
+  const int* dims_data = shape.DimsDataUpTo4D();
+  TFLITE_DCHECK(i0 >= 0 && i0 < dims_data[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < dims_data[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < dims_data[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < dims_data[3]);
+  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
+}
+
+inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
+  TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < dims.sizes[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < dims.sizes[3]);
+  return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
+         i3 * dims.strides[3];
+}
+
+inline int Offset(const Dims<4>& dims, int* index) {
+  return Offset(dims, index[0], index[1], index[2], index[3]);
+}
+
+inline int Offset(const RuntimeShape& shape, int* index) {
+  return Offset(shape, index[0], index[1], index[2], index[3]);
+}
+
+// Get array size, DCHECKing that the dim index is in range.
+//
+// Note that this will be phased out with Dims<4>, since RuntimeShape::Dims()
+// already performs this check.
+template <int N>
+int ArraySize(const Dims<N>& array, int index) {
+  TFLITE_DCHECK(index >= 0 && index < N);
+  return array.sizes[index];
+}
+
+// Get common array size, DCHECKing that they all agree.
+template <typename ArrayType1, typename ArrayType2>
+int MatchingArraySize(const ArrayType1& array1, int index1,
+                      const ArrayType2& array2, int index2) {
+  TFLITE_DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
+  return ArraySize(array1, index1);
+}
+
+template <typename ArrayType1, typename ArrayType2, typename... Args>
+int MatchingArraySize(const ArrayType1& array1, int index1,
+                      const ArrayType2& array2, int index2, Args... args) {
+  TFLITE_DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
+  return MatchingArraySize(array1, index1, args...);
+}
+
+// Get common shape dim, DCHECKing that they all agree.
+inline int MatchingDim(const RuntimeShape& shape1, int index1,
+                       const RuntimeShape& shape2, int index2) {
+  TFLITE_DCHECK_EQ(shape1.Dims(index1), shape2.Dims(index2));
+  return shape1.Dims(index1);
+}
+
+template <typename... Args>
+int MatchingDim(const RuntimeShape& shape1, int index1,
+                const RuntimeShape& shape2, int index2, Args... args) {
+  TFLITE_DCHECK_EQ(shape1.Dims(index1), shape2.Dims(index2));
+  return MatchingDim(shape1, index1, args...);
+}
+
+// Will be phased out with Dims<4>, replaced by RuntimeShape::FlatSize().
+template <int N>
+inline int FlatSize(const Dims<N>& dims) {
+  int flat_size = 1;
+  for (int i = 0; i < N; ++i) {
+    flat_size *= dims.sizes[i];
+  }
+  return flat_size;
+}
+
+TFLITE_DEPRECATED("Prefer FlatSize.")
+inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+  return FlatSize(dims);
+}
+
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+inline int MatchingFlatSize(const RuntimeShape& shape,
+                            const RuntimeShape& check_shape_0) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+  }
+  return shape.FlatSize();
+}
+
+inline int MatchingFlatSize(const RuntimeShape& shape,
+                            const RuntimeShape& check_shape_0,
+                            const RuntimeShape& check_shape_1) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+  }
+  return MatchingFlatSize(shape, check_shape_1);
+}
+
+inline int MatchingFlatSize(const RuntimeShape& shape,
+                            const RuntimeShape& check_shape_0,
+                            const RuntimeShape& check_shape_1,
+                            const RuntimeShape& check_shape_2) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+  }
+  return MatchingFlatSize(shape, check_shape_1, check_shape_2);
+}
+
+inline int MatchingFlatSize(const RuntimeShape& shape,
+                            const RuntimeShape& check_shape_0,
+                            const RuntimeShape& check_shape_1,
+                            const RuntimeShape& check_shape_2,
+                            const RuntimeShape& check_shape_3) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+  }
+  return MatchingFlatSize(shape, check_shape_1, check_shape_2, check_shape_3);
+}
+
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; ++i) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; ++i) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return MatchingFlatSize(dims, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; ++i) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return MatchingFlatSize(dims, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2,
+                            const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; ++i) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return MatchingFlatSize(dims, check_dims_1, check_dims_2, check_dims_3);
+}
+
+// Data is required to be contiguous, and so many operators can use either the
+// full array flat size or the flat size with one dimension skipped (commonly
+// the depth).
+template <int N>
+inline int FlatSizeSkipDim(const Dims<N>& dims, int skip_dim) {
+  TFLITE_DCHECK(skip_dim >= 0 && skip_dim < N);
+  int flat_size = 1;
+  for (int i = 0; i < N; ++i) {
+    flat_size *= (i == skip_dim) ? 1 : dims.sizes[i];
+  }
+  return flat_size;
+}
+
+// A combination of MatchingFlatSize() and FlatSizeSkipDim().
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; ++i) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return FlatSizeSkipDim(dims, skip_dim);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; ++i) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; ++i) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2,
+                                   const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; ++i) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2,
+                                 check_dims_3);
+}
+
+// Data is required to be contiguous, and so many operators can use either the
+// full array flat size or the flat size with one dimension skipped (commonly
+// the depth).
+inline int FlatSizeSkipDim(const RuntimeShape& shape, int skip_dim) {
+  const int dims_count = shape.DimensionsCount();
+  TFLITE_DCHECK(skip_dim >= 0 && skip_dim < dims_count);
+  const auto* dims_data = shape.DimsData();
+  int flat_size = 1;
+  for (int i = 0; i < dims_count; ++i) {
+    flat_size *= (i == skip_dim) ? 1 : dims_data[i];
+  }
+  return flat_size;
+}
+
+// A combination of MatchingFlatSize() and FlatSizeSkipDim().
+inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim,
+                                   const RuntimeShape& check_shape_0) {
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+    }
+  }
+  return FlatSizeSkipDim(shape, skip_dim);
+}
+
+inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim,
+                                   const RuntimeShape& check_shape_0,
+                                   const RuntimeShape& check_shape_1) {
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1);
+}
+
+inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim,
+                                   const RuntimeShape& check_shape_0,
+                                   const RuntimeShape& check_shape_1,
+                                   const RuntimeShape& check_shape_2) {
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1, check_shape_2);
+}
+
+inline int MatchingFlatSizeSkipDim(const RuntimeShape& shape, int skip_dim,
+                                   const RuntimeShape& check_shape_0,
+                                   const RuntimeShape& check_shape_1,
+                                   const RuntimeShape& check_shape_2,
+                                   const RuntimeShape& check_shape_3) {
+  const int dims_count = shape.DimensionsCount();
+  for (int i = 0; i < dims_count; ++i) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(shape, skip_dim, check_shape_1, check_shape_2,
+                                 check_shape_3);
+}
+
+template <int N>
+bool IsPackedWithoutStrides(const Dims<N>& dims) {
+  int expected_stride = 1;
+  for (int d = 0; d < N; d++) {
+    if (dims.strides[d] != expected_stride) return false;
+    expected_stride *= dims.sizes[d];
+  }
+  return true;
+}
+
+template <int N>
+void ComputeStrides(Dims<N>* dims) {
+  dims->strides[0] = 1;
+  for (int d = 1; d < N; d++) {
+    dims->strides[d] = dims->strides[d - 1] * dims->sizes[d - 1];
+  }
+}
+
+enum class BroadcastableOpCategory : uint8 {
+  kNone,
+  kNonBroadcast,               // Matching input shapes.
+  kFirstInputBroadcastsFast,   // Fivefold nested loops.
+  kSecondInputBroadcastsFast,  // Fivefold nested loops.
+  kGenericBroadcast,           // Fall-back.
+};
+
+struct MinMax {
+  float min;
+  float max;
+};
+static_assert(sizeof(MinMax) == 8, "");
+
+struct ActivationParams {
+  FusedActivationFunctionType activation_type;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+};
+
+// Styles of resizing op usages. For example, kImageStyle can be used with a Pad
+// op for pattern-specific optimization.
+enum class ResizingCategory : uint8 {
+  kNone,
+  kImageStyle,  // 4D, operating on inner dimensions, say {0, a, b, 0}.
+  kGenericResize,
+};
+
+// For Add, Sub, Mul ops.
+struct ArithmeticParams {
+  // Shape dependent / common to data / op types.
+  BroadcastableOpCategory broadcast_category;
+  // uint8 inference params.
+  int32 input1_offset;
+  int32 input2_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  // Add / Sub, not Mul, uint8 inference params.
+  int left_shift;
+  int32 input1_multiplier;
+  int input1_shift;
+  int32 input2_multiplier;
+  int input2_shift;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+
+  // Processed output dimensions.
+  // Let input "a" be the one that broadcasts in the faster-changing dimension.
+  // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and
+  // {b0, b1, b2, b3, b4},
+  // broadcast_shape[4] = b0 = a0.
+  // broadcast_shape[3] = b1; a1 = 1.
+  // broadcast_shape[2] = b2 = a2.
+  // broadcast_shape[1] = a3; b3 = 1.
+  // broadcast_shape[0] = b4 = a4.
+  int broadcast_shape[5];
+};
+
+struct ConcatenationParams {
+  int8 axis;
+  const int32* input_zeropoint;
+  const float* input_scale;
+  uint16 inputs_count;
+  int32 output_zeropoint;
+  float output_scale;
+};
+
+struct ComparisonParams {
+  // uint8 inference params.
+  int left_shift;
+  int32 input1_offset;
+  int32 input1_multiplier;
+  int input1_shift;
+  int32 input2_offset;
+  int32 input2_multiplier;
+  int input2_shift;
+  // Shape dependent / common to inference types.
+  bool is_broadcast;
+};
+
+struct ConvParams {
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  // TODO(starka): This was just "stride", so check that width+height is OK.
+  int16 stride_width;
+  int16 stride_height;
+  int16 dilation_width_factor;
+  int16 dilation_height_factor;
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32 input_offset;
+  int32 weights_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+};
+
+struct DepthToSpaceParams {
+  int32 block_size;
+};
+
+struct DepthwiseParams {
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  int16 stride_width;
+  int16 stride_height;
+  int16 dilation_width_factor;
+  int16 dilation_height_factor;
+  int16 depth_multiplier;
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32 input_offset;
+  int32 weights_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+};
+
+struct DequantizationParams {
+  double scale;
+  int32 zero_point;
+};
+
+struct FakeQuantParams {
+  MinMax minmax;
+  int32 num_bits;
+};
+
+struct FullyConnectedParams {
+  // uint8 inference params.
+  // TODO(b/65838351): Use smaller types if appropriate.
+  int32 input_offset;
+  int32 weights_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+  FullyConnectedWeightsFormat weights_format;
+};
+
+struct GatherParams {
+  int16 axis;
+};
+
+struct L2NormalizationParams {
+  // uint8 inference params.
+  int32 input_zero_point;
+};
+
+struct LocalResponseNormalizationParams {
+  int32 range;
+  double bias;
+  double alpha;
+  double beta;
+};
+
+struct LogisticParams {
+  // uint8 inference params.
+  int32 input_zero_point;
+  int32 input_range_radius;
+  int32 input_multiplier;
+  int input_left_shift;
+};
+
+struct LstmCellParams {
+  int32 weights_zero_point;
+  int32 accum_multiplier;
+  int accum_shift;
+  int state_integer_bits;
+};
+
+struct MeanParams {
+  int8 axis_count;
+  int16 axis[4];
+};
+
+struct PackParams {
+  int8 axis;
+  const int32* input_zeropoint;
+  const float* input_scale;
+  uint16 inputs_count;
+  int32 output_zeropoint;
+  float output_scale;
+};
+
+struct PadParams {
+  int8 left_padding_count;
+  int32 left_padding[4];
+  int8 right_padding_count;
+  int32 right_padding[4];
+  ResizingCategory resizing_category;
+};
+
+struct PreluParams {
+  int32 input_offset;
+  int32 alpha_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+};
+
+struct PoolParams {
+  FusedActivationFunctionType activation;
+  PaddingType padding_type;
+  PaddingValues padding_values;
+  int stride_height;
+  int stride_width;
+  int filter_height;
+  int filter_width;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+};
+
+struct ReshapeParams {
+  int8 shape_count;
+  int32 shape[4];
+};
+
+struct ResizeBilinearParams {
+  bool align_corners;
+};
+
+struct ResizeNearestNeighborParams {
+  bool align_corners;
+};
+
+struct SliceParams {
+  int8 begin_count;
+  int32 begin[4];
+  int8 size_count;
+  int32 size[4];
+};
+
+struct SoftmaxParams {
+  // beta is not really used (not a Tensorflow parameter) and not implemented
+  // for LogSoftmax.
+  double beta;
+  // uint8 inference params.  Used even when beta defaults to 1.0.
+  int32 input_multiplier;
+  int32 input_left_shift;
+  // Reverse scaling is only used by LogSoftmax.
+  int32 reverse_scaling_divisor;
+  int32 reverse_scaling_right_shift;
+  int diff_min;
+};
+
+struct SpaceToBatchParams {
+  // "Zero" padding for uint8 means padding with the output offset.
+  int32 output_offset;
+};
+
+struct SpaceToDepthParams {
+  int32 block_size;
+};
+
+struct SplitParams {
+  // Graphs that split into, say, 2000 nodes are encountered.  The indices in
+  // OperatorEdges are of type uint16.
+  uint16 num_split;
+  int16 axis;
+};
+
+struct SqueezeParams {
+  int8 squeeze_dims_count;
+  int32 squeeze_dims[4];
+};
+
+struct StridedSliceParams {
+  int8 start_indices_count;
+  int16 start_indices[4];
+  int8 stop_indices_count;
+  int16 stop_indices[4];
+  int8 strides_count;
+  int16 strides[4];
+
+  int16 begin_mask;
+  int16 ellipsis_mask;
+  int16 end_mask;
+  int16 new_axis_mask;
+  int16 shrink_axis_mask;
+};
+
+struct TanhParams {
+  int32 input_zero_point;
+  int32 input_range_radius;
+  int32 input_multiplier;
+  int input_left_shift;
+};
+
+struct TransposeParams {
+  int8 perm_count;
+  int32 perm[4];
+};
+
+struct UnpackParams {
+  uint16 num_split;
+  int16 axis;
+};
+
+struct LeakyReluParams {
+  float alpha;
+};
+
+template <typename P>
+inline void SetActivationParams(float min, float max, P* params) {
+  params->float_activation_min = min;
+  params->float_activation_max = max;
+}
+
+template <typename P>
+inline void SetActivationParams(int32 min, int32 max, P* params) {
+  params->quantized_activation_min = min;
+  params->quantized_activation_max = max;
+}
+
+template <typename P>
+inline void GetActivationParams(const P& params, int32* min, int32* max) {
+  *min = params.quantized_activation_min;
+  *max = params.quantized_activation_max;
+}
+
+template <typename P>
+inline void GetActivationParams(const P& params, float* min, float* max) {
+  *min = params.float_activation_min;
+  *max = params.float_activation_max;
+}
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/kernel_util.cc
rename to tensorflow/lite/kernels/kernel_util.cc
index 503ef28459191208fb16d3abf8b04b9f9abb1332..e39890e3320eb4d1e2dcd0c8256bb96631e75011 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 #include <algorithm>
 #include <cmath>
 #include <memory>
 
-#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/round.h"
 
 namespace tflite {
 
@@ -117,6 +117,10 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
   int64_t dims1 = NumDimensions(input1);
   int64_t dims2 = NumDimensions(input2);
   int64_t out_dims = std::max(dims1, dims2);
+  if (NumElements(input1) == 0) {
+    *output_shape = TfLiteIntArrayCopy(input1->dims);
+    return kTfLiteOk;
+  }
   std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
       TfLiteIntArrayCreate(out_dims), TfLiteIntArrayFree);
   for (int i = 0; i < out_dims; ++i) {
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h
similarity index 95%
rename from tensorflow/contrib/lite/kernels/kernel_util.h
rename to tensorflow/lite/kernels/kernel_util.h
index e9a5fd7a4052cd63db5c3b92e5b5235d3d369668..3cc00588d63feddc90d17997cebe2c8d063c45eb 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util.h
+++ b/tensorflow/lite/kernels/kernel_util.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#ifndef TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
+#define TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
 
 #include <algorithm>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 
@@ -135,4 +135,4 @@ TfLiteStatus CalculateShapeForBroadcast(TfLiteContext* context,
                                         TfLiteIntArray** output_shape);
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#endif  // TENSORFLOW_LITE_KERNELS_KERNEL_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util_test.cc b/tensorflow/lite/kernels/kernel_util_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/kernel_util_test.cc
rename to tensorflow/lite/kernels/kernel_util_test.cc
index bf6f249acc85ee050681eb6f33067be2a1aa037e..70eb18365891097686d579bde4a5457703e84aee 100644
--- a/tensorflow/contrib/lite/kernels/kernel_util_test.cc
+++ b/tensorflow/lite/kernels/kernel_util_test.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/lite/kernels/l2norm.cc
similarity index 90%
rename from tensorflow/contrib/lite/kernels/l2norm.cc
rename to tensorflow/lite/kernels/l2norm.cc
index e02d7df9ef1a383bcb930d345f33e798724bb921..19a4824e9398decec862bb7f5d20ac05b2652226 100644
--- a/tensorflow/contrib/lite/kernels/l2norm.cc
+++ b/tensorflow/lite/kernels/l2norm.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/lite/kernels/l2norm_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/l2norm_test.cc
rename to tensorflow/lite/kernels/l2norm_test.cc
index 070ed60040997f18f7e8053acc9532adc2377400..50108a5a264c3624bbd9c230f50c65f5897480bb 100644
--- a/tensorflow/contrib/lite/kernels/l2norm_test.cc
+++ b/tensorflow/lite/kernels/l2norm_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/layer_norm_lstm.cc b/tensorflow/lite/kernels/layer_norm_lstm.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/layer_norm_lstm.cc
rename to tensorflow/lite/kernels/layer_norm_lstm.cc
index 9739fd4514bd582fb5c4d8ad6b9cb55a817db80c..49e8a53c829a0c4a8ae355f8e7a6b97e3bbb81e1 100644
--- a/tensorflow/contrib/lite/kernels/layer_norm_lstm.cc
+++ b/tensorflow/lite/kernels/layer_norm_lstm.cc
@@ -17,9 +17,9 @@ limitations under the License.
 // deviation to the activation of the LSTM layers. Please see
 // https://arxiv.org/abs/1607.06450 for details.
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
@@ -55,7 +55,7 @@ constexpr int kCellToForgetWeightsTensor = 10;  // Optional
 constexpr int kCellToOutputWeightsTensor = 11;  // Optional
 
 // Layer norm weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kInputLayerNormWeightsTensor = 12;
+constexpr int kInputLayerNormWeightsTensor = 12;  // Optional
 constexpr int kForgetLayerNormWeightsTensor = 13;
 constexpr int kCellLayerNormWeightsTensor = 14;
 constexpr int kOutputLayerNormWeightsTensor = 15;
@@ -118,7 +118,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  if (input_to_input_weights != nullptr) {
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  if (!use_cifg) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
@@ -138,7 +139,9 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  if (recurrent_to_input_weights != nullptr) {
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights, nullptr);
+  } else {
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
                       n_cell);
@@ -161,15 +164,6 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
                     n_output);
 
-  // We make sure the input-gate's parameters are either both present (regular
-  // LSTM) or not at all (CIFG-LSTM).
-  const bool cifg_weights_all_or_none =
-      ((input_to_input_weights != nullptr) &&
-       (recurrent_to_input_weights != nullptr)) ||
-      ((input_to_input_weights == nullptr) &&
-       (recurrent_to_input_weights == nullptr));
-  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
-
   const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
   if (cell_to_input_weights) {
@@ -192,7 +186,6 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   }
 
   // Making sure the peephole weights are there all or none.
-  const bool use_cifg = (input_to_input_weights == nullptr);
   const bool peephole_weights_all_or_none =
       ((cell_to_input_weights != nullptr || use_cifg) &&
        (cell_to_forget_weights != nullptr) &&
@@ -204,10 +197,14 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   // Making sure layer norm weights are not null and have the right dimension.
   const TfLiteTensor* input_layer_norm_weights =
-      GetInput(context, node, kInputLayerNormWeightsTensor);
-  TF_LITE_ENSURE(context, input_layer_norm_weights != nullptr);
-  TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->data[0], n_cell);
+      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights, nullptr);
+  } else {
+    TF_LITE_ENSURE(context, input_layer_norm_weights != nullptr);
+    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->data[0], n_cell);
+  }
 
   const TfLiteTensor* forget_layer_norm_weights =
       GetInput(context, node, kForgetLayerNormWeightsTensor);
@@ -409,9 +406,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
     scaling_factors->type = kTfLiteFloat32;
     scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-    scaling_factors_size->data[0] = n_batch;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+    int scaling_dims[1] = {n_batch};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = n_batch;
       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
                                                        scaling_factors_size));
     }
@@ -420,10 +418,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         GetTemporary(context, node, /*index=*/5);
     prod_scaling_factors->type = kTfLiteFloat32;
     prod_scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
-    prod_scaling_factors_size->data[0] = n_batch;
-    if (!TfLiteIntArrayEqual(prod_scaling_factors->dims,
-                             prod_scaling_factors_size)) {
+    if (!TfLiteIntArrayEqualsArray(prod_scaling_factors->dims, 1,
+                                   scaling_dims)) {
+      TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
+      prod_scaling_factors_size->data[0] = n_batch;
       TF_LITE_ENSURE_OK(context,
                         context->ResizeTensor(context, prod_scaling_factors,
                                               prod_scaling_factors_size));
@@ -435,9 +433,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TfLiteTensor* recovered_weights = GetTemporary(context, node, /*index=*/6);
     recovered_weights->type = kTfLiteFloat32;
     recovered_weights->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* recovered_weights_size = TfLiteIntArrayCreate(1);
-    recovered_weights_size->data[0] = n_cell;
-    if (!TfLiteIntArrayEqual(recovered_weights->dims, recovered_weights_size)) {
+    int recovered_dims[1] = {n_cell};
+    if (!TfLiteIntArrayEqualsArray(recovered_weights->dims, 1,
+                                   recovered_dims)) {
+      TfLiteIntArray* recovered_weights_size = TfLiteIntArrayCreate(1);
+      recovered_weights_size->data[0] = n_cell;
       TF_LITE_ENSURE_OK(context,
                         context->ResizeTensor(context, recovered_weights,
                                               recovered_weights_size));
@@ -975,6 +975,9 @@ TfLiteStatus EvalFloat(
       (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
   const float* projection_bias_ptr =
       (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+  const float* input_layer_norm_weight_ptr =
+      (input_layer_norm_weights == nullptr) ? nullptr
+                                            : input_layer_norm_weights->data.f;
 
   // Required tensors, pointers are non-null.
   const float* input_ptr_batch = input->data.f;
@@ -987,7 +990,6 @@ TfLiteStatus EvalFloat(
       recurrent_to_cell_weights->data.f;
   const float* recurrent_to_output_weights_ptr =
       recurrent_to_output_weights->data.f;
-  const float* input_layer_norm_weight_ptr = input_layer_norm_weights->data.f;
   const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
   const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
   const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
@@ -1112,6 +1114,9 @@ TfLiteStatus EvalHybrid(
       (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
   const float* projection_bias_ptr =
       (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+  const float* input_layer_norm_weight_ptr =
+      (input_layer_norm_weights == nullptr) ? nullptr
+                                            : input_layer_norm_weights->data.f;
 
   // Required tensors, pointers are non-null.
   const float* input_ptr_batch = input->data.f;
@@ -1138,7 +1143,6 @@ TfLiteStatus EvalHybrid(
       reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
   const float recurrent_to_output_weights_scale =
       recurrent_to_output_weights->params.scale;
-  const float* input_layer_norm_weight_ptr = input_layer_norm_weights->data.f;
   const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
   const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
   const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
@@ -1218,7 +1222,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
 
   const TfLiteTensor* input_layer_norm_weights =
-      GetInput(context, node, kInputLayerNormWeightsTensor);
+      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
   const TfLiteTensor* forget_layer_norm_weights =
       GetInput(context, node, kForgetLayerNormWeightsTensor);
   const TfLiteTensor* cell_layer_norm_weights =
diff --git a/tensorflow/lite/kernels/layer_norm_lstm_test.cc b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c13cee1c3f66ed2a3459cd2bcc32211c3b1f00e
--- /dev/null
+++ b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
@@ -0,0 +1,883 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Layer Norm LSTM op.
+
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+TfLiteRegistration* Register_LAYER_NORM_LSTM();
+
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class LayerNormLSTMOpModel : public SingleOpModel {
+ public:
+  LayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
+                       bool use_cifg, bool use_peephole,
+                       bool use_projection_weights, bool use_projection_bias,
+                       float cell_clip, float proj_clip,
+                       const std::vector<std::vector<int>>& input_shapes,
+                       const TensorType& weight_type = TensorType_FLOAT32)
+      : n_batch_(n_batch),
+        n_input_(n_input),
+        n_cell_(n_cell),
+        n_output_(n_output) {
+    input_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      input_to_input_weights_ = AddNullInput();
+    } else {
+      input_to_input_weights_ = AddInput(weight_type);
+    }
+
+    input_to_forget_weights_ = AddInput(weight_type);
+    input_to_cell_weights_ = AddInput(weight_type);
+    input_to_output_weights_ = AddInput(weight_type);
+
+    if (use_cifg) {
+      recurrent_to_input_weights_ = AddNullInput();
+    } else {
+      recurrent_to_input_weights_ = AddInput(weight_type);
+    }
+
+    recurrent_to_forget_weights_ = AddInput(weight_type);
+    recurrent_to_cell_weights_ = AddInput(weight_type);
+    recurrent_to_output_weights_ = AddInput(weight_type);
+
+    if (use_peephole) {
+      if (use_cifg) {
+        cell_to_input_weights_ = AddNullInput();
+      } else {
+        cell_to_input_weights_ = AddInput(weight_type);
+      }
+      cell_to_forget_weights_ = AddInput(weight_type);
+      cell_to_output_weights_ = AddInput(weight_type);
+    } else {
+      cell_to_input_weights_ = AddNullInput();
+      cell_to_forget_weights_ = AddNullInput();
+      cell_to_output_weights_ = AddNullInput();
+    }
+
+    if (use_cifg) {
+      input_layer_norm_weights_ = AddNullInput();
+    } else {
+      input_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
+    }
+    forget_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
+    cell_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
+    output_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      input_gate_bias_ = AddNullInput();
+    } else {
+      input_gate_bias_ = AddInput(TensorType_FLOAT32);
+    }
+    forget_gate_bias_ = AddInput(TensorType_FLOAT32);
+    cell_bias_ = AddInput(TensorType_FLOAT32);
+    output_gate_bias_ = AddInput(TensorType_FLOAT32);
+
+    if (use_projection_weights) {
+      projection_weights_ = AddInput(weight_type);
+      if (use_projection_bias) {
+        projection_bias_ = AddInput(TensorType_FLOAT32);
+      } else {
+        projection_bias_ = AddNullInput();
+      }
+    } else {
+      projection_weights_ = AddNullInput();
+      projection_bias_ = AddNullInput();
+    }
+
+    // Adding the 2 state tensors.
+    output_state_ =
+        AddInput(TensorData{TensorType_FLOAT32, {n_output_ * n_batch_}}, true);
+    cell_state_ =
+        AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true);
+
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    // Set up and pass in custom options using flexbuffer.
+    flexbuffers::Builder fbb;
+    fbb.Map([&]() {
+      fbb.Int("cell_clip", cell_clip);
+      fbb.Int("proj_clip", proj_clip);
+      fbb.String("fused_activation_function", "TANH");
+    });
+    fbb.Finish();
+    SetCustomOp("LAYER_NORM_LSTM", fbb.GetBuffer(), Register_LAYER_NORM_LSTM);
+    BuildInterpreter(input_shapes);
+  }
+
+  void SetInputToInputWeights(std::vector<float> f) {
+    PopulateTensor(input_to_input_weights_, f);
+  }
+
+  void SetInputToForgetWeights(std::vector<float> f) {
+    PopulateTensor(input_to_forget_weights_, f);
+  }
+
+  void SetInputToCellWeights(std::vector<float> f) {
+    PopulateTensor(input_to_cell_weights_, f);
+  }
+
+  void SetInputToOutputWeights(std::vector<float> f) {
+    PopulateTensor(input_to_output_weights_, f);
+  }
+
+  void SetRecurrentToInputWeights(std::vector<float> f) {
+    PopulateTensor(recurrent_to_input_weights_, f);
+  }
+
+  void SetRecurrentToForgetWeights(std::vector<float> f) {
+    PopulateTensor(recurrent_to_forget_weights_, f);
+  }
+
+  void SetRecurrentToCellWeights(std::vector<float> f) {
+    PopulateTensor(recurrent_to_cell_weights_, f);
+  }
+
+  void SetRecurrentToOutputWeights(std::vector<float> f) {
+    PopulateTensor(recurrent_to_output_weights_, f);
+  }
+
+  void SetCellToInputWeights(std::vector<float> f) {
+    PopulateTensor(cell_to_input_weights_, f);
+  }
+
+  void SetCellToForgetWeights(std::vector<float> f) {
+    PopulateTensor(cell_to_forget_weights_, f);
+  }
+
+  void SetCellToOutputWeights(std::vector<float> f) {
+    PopulateTensor(cell_to_output_weights_, f);
+  }
+
+  void SetInputLayerNormWeights(std::vector<float> f) {
+    PopulateTensor(input_layer_norm_weights_, f);
+  }
+
+  void SetForgetLayerNormWeights(std::vector<float> f) {
+    PopulateTensor(forget_layer_norm_weights_, f);
+  }
+
+  void SetCellLayerNormWeights(std::vector<float> f) {
+    PopulateTensor(cell_layer_norm_weights_, f);
+  }
+
+  void SetOutputLayerNormWeights(std::vector<float> f) {
+    PopulateTensor(output_layer_norm_weights_, f);
+  }
+
+  void SetInputGateBias(std::vector<float> f) {
+    PopulateTensor(input_gate_bias_, f);
+  }
+
+  void SetForgetGateBias(std::vector<float> f) {
+    PopulateTensor(forget_gate_bias_, f);
+  }
+
+  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
+
+  void SetOutputGateBias(std::vector<float> f) {
+    PopulateTensor(output_gate_bias_, f);
+  }
+
+  void SetProjectionWeights(std::vector<float> f) {
+    PopulateTensor(projection_weights_, f);
+  }
+
+  void SetProjectionBias(std::vector<float> f) {
+    PopulateTensor(projection_bias_, f);
+  }
+
+  void SetInput(int offset, const float* begin, const float* end) {
+    PopulateTensor(input_, offset, const_cast<float*>(begin),
+                   const_cast<float*>(end));
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int num_inputs() { return n_input_; }
+  int num_outputs() { return n_output_; }
+  int num_cells() { return n_cell_; }
+  int num_batches() { return n_batch_; }
+
+ protected:
+  int input_;
+  int input_to_input_weights_;
+  int input_to_forget_weights_;
+  int input_to_cell_weights_;
+  int input_to_output_weights_;
+
+  int recurrent_to_input_weights_;
+  int recurrent_to_forget_weights_;
+  int recurrent_to_cell_weights_;
+  int recurrent_to_output_weights_;
+
+  int cell_to_input_weights_;
+  int cell_to_forget_weights_;
+  int cell_to_output_weights_;
+
+  int input_layer_norm_weights_;
+  int forget_layer_norm_weights_;
+  int cell_layer_norm_weights_;
+  int output_layer_norm_weights_;
+
+  int input_gate_bias_;
+  int forget_gate_bias_;
+  int cell_bias_;
+  int output_gate_bias_;
+
+  int projection_weights_;
+  int projection_bias_;
+
+  int output_state_;
+  int cell_state_;
+
+  int output_;
+
+  int n_batch_;
+  int n_input_;
+  int n_cell_;
+  int n_output_;
+};
+
+class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
+ public:
+  HybridLayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
+                             bool use_cifg, bool use_peephole,
+                             bool use_projection_weights,
+                             bool use_projection_bias, float cell_clip,
+                             float proj_clip,
+                             const std::vector<std::vector<int>>& input_shapes)
+      : LayerNormLSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg,
+                             use_peephole, use_projection_weights,
+                             use_projection_bias, cell_clip, proj_clip,
+                             input_shapes, TensorType_UINT8) {}
+
+  void SetInputToInputWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
+  }
+
+  void SetInputToForgetWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
+  }
+
+  void SetInputToCellWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
+  }
+
+  void SetInputToOutputWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
+  }
+
+  void SetRecurrentToInputWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
+  }
+
+  void SetRecurrentToForgetWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
+  }
+
+  void SetRecurrentToCellWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
+  }
+
+  void SetRecurrentToOutputWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
+  }
+
+  void SetCellToInputWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
+  }
+
+  void SetCellToForgetWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
+  }
+
+  void SetCellToOutputWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
+  }
+
+  void SetInputLayerNormWeights(std::vector<float> f) {
+    PopulateTensor(input_layer_norm_weights_, f);
+  }
+
+  void SetForgetLayerNormWeights(std::vector<float> f) {
+    PopulateTensor(forget_layer_norm_weights_, f);
+  }
+
+  void SetCellLayerNormWeights(std::vector<float> f) {
+    PopulateTensor(cell_layer_norm_weights_, f);
+  }
+
+  void SetOutputLayerNormWeights(std::vector<float> f) {
+    PopulateTensor(output_layer_norm_weights_, f);
+  }
+
+  void SetProjectionWeights(std::vector<float> f) {
+    SymmetricQuantizeAndPopulate(projection_weights_, f);
+  }
+};
+
+class BaseLayerNormLstmTest : public ::testing::Test {
+ protected:
+  // Weights of the Layer Norm LSTM model. Some are optional.
+  std::vector<float> input_to_input_weights_;
+  std::vector<float> input_to_cell_weights_;
+  std::vector<float> input_to_forget_weights_;
+  std::vector<float> input_to_output_weights_;
+  std::vector<float> input_gate_bias_;
+  std::vector<float> cell_gate_bias_;
+  std::vector<float> forget_gate_bias_;
+  std::vector<float> output_gate_bias_;
+  std::vector<float> recurrent_to_input_weights_;
+  std::vector<float> recurrent_to_cell_weights_;
+  std::vector<float> recurrent_to_forget_weights_;
+  std::vector<float> recurrent_to_output_weights_;
+  std::vector<float> cell_to_input_weights_;
+  std::vector<float> cell_to_forget_weights_;
+  std::vector<float> cell_to_output_weights_;
+  std::vector<float> input_layer_norm_weights_;
+  std::vector<float> forget_layer_norm_weights_;
+  std::vector<float> cell_layer_norm_weights_;
+  std::vector<float> output_layer_norm_weights_;
+  std::vector<float> projection_weights_;
+
+  // Layer Norm LSTM input is stored as num_batch x num_inputs vector.
+  std::vector<std::vector<float>> layer_norm_lstm_input_;
+
+  // Compares output up to tolerance to the result of the layer_norm_lstm given
+  // the input.
+  void VerifyGoldens(const std::vector<std::vector<float>>& input,
+                     const std::vector<std::vector<float>>& output,
+                     LayerNormLSTMOpModel* layer_norm_lstm,
+                     float tolerance = 1e-5) {
+    const int num_batches = input.size();
+    EXPECT_GT(num_batches, 0);
+    const int num_inputs = layer_norm_lstm->num_inputs();
+    EXPECT_GT(num_inputs, 0);
+    const int input_sequence_size = input[0].size() / num_inputs;
+    EXPECT_GT(input_sequence_size, 0);
+    for (int i = 0; i < input_sequence_size; ++i) {
+      for (int b = 0; b < num_batches; ++b) {
+        const float* batch_start = input[b].data() + i * num_inputs;
+        const float* batch_end = batch_start + num_inputs;
+
+        layer_norm_lstm->SetInput(b * layer_norm_lstm->num_inputs(),
+                                  batch_start, batch_end);
+      }
+
+      layer_norm_lstm->Invoke();
+
+      const int num_outputs = layer_norm_lstm->num_outputs();
+      std::vector<float> expected;
+      for (int b = 0; b < num_batches; ++b) {
+        const float* golden_start_batch = output[b].data() + i * num_outputs;
+        const float* golden_end_batch = golden_start_batch + num_outputs;
+        expected.insert(expected.end(), golden_start_batch, golden_end_batch);
+      }
+      EXPECT_THAT(layer_norm_lstm->GetOutput(),
+                  ElementsAreArray(ArrayFloatNear(expected, tolerance)));
+    }
+  }
+};
+
+class NoCifgPeepholeProjectionNoClippingLayerNormLstmTest
+    : public BaseLayerNormLstmTest {
+  void SetUp() override {
+    input_to_input_weights_ = {0.5,  0.6,  0.7,  -0.8, -0.9, 0.1,  0.2,
+                               0.3,  -0.4, 0.5,  -0.8, 0.7,  -0.6, 0.5,
+                               -0.4, -0.5, -0.4, -0.3, -0.2, -0.1};
+
+    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
+                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
+                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
+
+    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
+                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
+                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
+
+    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
+                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
+                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
+
+    input_gate_bias_ = {0.03, 0.15, 0.22, 0.38};
+
+    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
+
+    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
+
+    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
+
+    recurrent_to_input_weights_ = {-0.2, -0.3, 0.4,  0.1,  -0.5, 0.9,
+                                   -0.2, -0.3, -0.7, 0.05, -0.2, -0.6};
+
+    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
+                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
+
+    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
+                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
+
+    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
+                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
+
+    cell_to_input_weights_ = {0.05, 0.1, 0.25, 0.15};
+
+    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
+
+    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
+
+    input_layer_norm_weights_ = {0.1, 0.2, 0.3, 0.5};
+    forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3};
+    cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8};
+    output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5};
+
+    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
+                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
+
+    layer_norm_lstm_input_ = {
+        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
+         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
+         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
+         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
+
+        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
+         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
+         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
+         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
+    };
+  }
+};
+
+TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       LayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  LayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_layer_norm_weight tensor
+          {n_cell},  // forget_layer_norm_weight tensor
+          {n_cell},  // cell_layer_norm_weight tensor
+          {n_cell},  // output_layer_norm_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_);
+  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
+  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
+  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0244077, 0.128027, -0.00170918,  // seq 0
+          0.0137642, 0.140751, 0.0395835,    // seq 1
+          -0.00459231, 0.155278, 0.0837377,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.00692428, 0.0848741, 0.063445,  // seq 0
+          -0.00403912, 0.139963, 0.072681,   // seq 1
+          0.00752706, 0.161903, 0.0561371,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_layer_norm_weight tensor
+          {n_cell},  // forget_layer_norm_weight tensor
+          {n_cell},  // cell_layer_norm_weight tensor
+          {n_cell},  // output_layer_norm_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_);
+  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
+  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
+  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0244576, 0.127847, -0.00181765,  // seq 0
+          0.0137518, 0.140892, 0.0402234,    // seq 1
+          -0.0048839, 0.155096, 0.0840309,   // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.00728636, 0.0843957, 0.0634786,  // seq 0
+          -0.00448382, 0.139278, 0.0737372,   // seq 1
+          0.00734616, 0.161793, 0.0560238,    // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+class CifgPeepholeProjectionNoClippingLayerNormLstmTest
+    : public BaseLayerNormLstmTest {
+  void SetUp() override {
+    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
+                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
+                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
+    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
+                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
+                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
+    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
+                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
+                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
+
+    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
+    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
+    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
+
+    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
+                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
+    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
+                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
+    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
+                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
+
+    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
+    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
+
+    forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3};
+    cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8};
+    output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5};
+    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
+                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
+
+    layer_norm_lstm_input_ = {
+        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
+         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
+         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
+         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
+
+        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
+         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
+         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
+         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
+    };
+  }
+};
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       LayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  LayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_layer_norm_weight tensor
+          {n_cell},  // forget_layer_norm_weight tensor
+          {n_cell},  // cell_layer_norm_weight tensor
+          {n_cell},  // output_layer_norm_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
+  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
+  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.02129706, 0.140816242, 0.0112733059,     // seq 0
+          0.0132302344, 0.152308047, 0.0346313119,   // seq 1
+          -0.0123688057, 0.165790111, 0.0893077999,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0226350538, 0.0916948169, 0.0769175813,  // seq 0
+          -0.0269966982, 0.149707705, 0.094149217,    // seq 1
+          -0.0103429332, 0.173016444, 0.0720508844,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_layer_norm_weight tensor
+          {n_cell},  // forget_layer_norm_weight tensor
+          {n_cell},  // cell_layer_norm_weight tensor
+          {n_cell},  // output_layer_norm_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
+  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
+  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0212250091, 0.140474007, 0.0115012666,   // seq 0
+          0.0130806509, 0.152660668, 0.0347516984,   // seq 1
+          -0.0124010444, 0.166042402, 0.0898982584,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0228835996, 0.0917588323, 0.0778886303,  // seq 0
+          -0.0275101066, 0.148769245, 0.0938384682,   // seq 1
+          -0.0103605557, 0.172605693, 0.0728750974,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/local_response_norm.cc b/tensorflow/lite/kernels/local_response_norm.cc
similarity index 89%
rename from tensorflow/contrib/lite/kernels/local_response_norm.cc
rename to tensorflow/lite/kernels/local_response_norm.cc
index 334d2a2788d10f072b30990ed9a0245d6204af81..5cbf5d9eae700fa52b7fc74f2066df3c68483cd8 100644
--- a/tensorflow/contrib/lite/kernels/local_response_norm.cc
+++ b/tensorflow/lite/kernels/local_response_norm.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/local_response_norm_test.cc b/tensorflow/lite/kernels/local_response_norm_test.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/local_response_norm_test.cc
rename to tensorflow/lite/kernels/local_response_norm_test.cc
index d75ce258a04c820d8f82735988c01d0154ef36f2..bd644e07f46562f8bbdc6c9cada709337feae889 100644
--- a/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
+++ b/tensorflow/lite/kernels/local_response_norm_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/log_softmax_test.cc b/tensorflow/lite/kernels/log_softmax_test.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/log_softmax_test.cc
rename to tensorflow/lite/kernels/log_softmax_test.cc
index 1acc966cdc947cc68d3f58be08fabd9ef7679f88..fb126295e6afdf485a7633c5e0373e89277ab794 100644
--- a/tensorflow/contrib/lite/kernels/log_softmax_test.cc
+++ b/tensorflow/lite/kernels/log_softmax_test.cc
@@ -20,11 +20,11 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/logical.cc b/tensorflow/lite/kernels/logical.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/logical.cc
rename to tensorflow/lite/kernels/logical.cc
index f770cb35d1b9ffa3f8164f8a76d7a69e4713162d..582bcff64a882e4431b3682b78a596b29701ad76 100644
--- a/tensorflow/contrib/lite/kernels/logical.cc
+++ b/tensorflow/lite/kernels/logical.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/logical_test.cc b/tensorflow/lite/kernels/logical_test.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/logical_test.cc
rename to tensorflow/lite/kernels/logical_test.cc
index 206cbde98fa48ec5f7c863bbced9dccc9cab5207..b31616452717b14c1688f43c1264ee2f01577136 100644
--- a/tensorflow/contrib/lite/kernels/logical_test.cc
+++ b/tensorflow/lite/kernels/logical_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection.cc b/tensorflow/lite/kernels/lsh_projection.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/lsh_projection.cc
rename to tensorflow/lite/kernels/lsh_projection.cc
index 9fa1c5f1002d893daa776528d53a6249bfe0ac98..f68ff4d634a7c959c74b3ed7f3f4137af40313c1 100644
--- a/tensorflow/contrib/lite/kernels/lsh_projection.cc
+++ b/tensorflow/lite/kernels/lsh_projection.cc
@@ -59,10 +59,10 @@ limitations under the License.
 #include <limits>
 #include <memory>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 #include <farmhash.h>
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection_test.cc b/tensorflow/lite/kernels/lsh_projection_test.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/lsh_projection_test.cc
rename to tensorflow/lite/kernels/lsh_projection_test.cc
index 414d728dfc153058ec878d3c766f58e86815cd3f..cb2724a6ccebd9d620ede38fbaae1fab0a1f31c0 100644
--- a/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
+++ b/tensorflow/lite/kernels/lsh_projection_test.cc
@@ -16,10 +16,10 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b57e2883b05232325d55ae9e6a08ed142b9a2dbb
--- /dev/null
+++ b/tensorflow/lite/kernels/lstm.cc
@@ -0,0 +1,783 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/gemm_support.h"
+#include "tensorflow/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/lstm_eval.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace lstm {
+
+struct OpData {
+  // Which kernel type to use. Full kernel (20 inputs) or basic kernel
+  // (5 inputs).
+  TfLiteLSTMKernelType kernel_type;
+
+  // These fields are only used by full kernel.
+  int activation_state_tensor_index;
+  int cell_state_tensor_index;
+  int scratch_tensor_index;
+};
+
+// For full inputs kernel (20-inputs).
+namespace full {
+
+// Input Tensors of size {n_batch, n_input}
+constexpr int kInputTensor = 0;
+
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kInputToInputWeightsTensor = 1;  // Optional
+constexpr int kInputToForgetWeightsTensor = 2;
+constexpr int kInputToCellWeightsTensor = 3;
+constexpr int kInputToOutputWeightsTensor = 4;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kRecurrentToInputWeightsTensor = 5;  // Optional
+constexpr int kRecurrentToForgetWeightsTensor = 6;
+constexpr int kRecurrentToCellWeightsTensor = 7;
+constexpr int kRecurrentToOutputWeightsTensor = 8;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kCellToInputWeightsTensor = 9;    // Optional
+constexpr int kCellToForgetWeightsTensor = 10;  // Optional
+constexpr int kCellToOutputWeightsTensor = 11;  // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kInputGateBiasTensor = 12;  // Optional
+constexpr int kForgetGateBiasTensor = 13;
+constexpr int kCellGateBiasTensor = 14;
+constexpr int kOutputGateBiasTensor = 15;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kProjectionWeightsTensor = 16;  // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kProjectionBiasTensor = 17;  // Optional
+
+// These state tensors are defined as variable tensors, and will be modified by
+// this op.
+constexpr int kInputActivationStateTensor = 18;
+constexpr int kInputCellStateTensor = 19;
+
+// Output tensors.
+constexpr int kOutputTensor = 0;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData();
+  op_data->kernel_type = kTfLiteLSTMFullKernel;
+  context->AddTensors(context, /*tensors_to_add=*/7,
+                      &op_data->scratch_tensor_index);
+  return op_data;
+}
+
+// Check that input tensor dimensions matches with each other.
+TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
+                                        TfLiteNode* node, int n_input,
+                                        int n_output, int n_cell) {
+  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+
+  // Making sure clipping parameters have valid values.
+  // == 0 means no clipping
+  //  > 0 means clipping
+  TF_LITE_ENSURE(context, params->cell_clip >= 0);
+  TF_LITE_ENSURE(context, params->proj_clip >= 0);
+
+  const TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  if (input_to_input_weights != nullptr) {
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
+  }
+
+  const TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
+
+  const TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
+
+  const TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  if (recurrent_to_input_weights != nullptr) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
+                      n_cell);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
+                      n_output);
+  }
+
+  const TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
+                    n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
+                    n_output);
+
+  const TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
+                    n_output);
+
+  // We make sure the input-gate's parameters are either both present (regular
+  // LSTM) or not at all (CIFG-LSTM).
+  const bool cifg_weights_all_or_none =
+      ((input_to_input_weights != nullptr) &&
+       (recurrent_to_input_weights != nullptr)) ||
+      ((input_to_input_weights == nullptr) &&
+       (recurrent_to_input_weights == nullptr));
+  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
+
+  const TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  if (cell_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
+  }
+
+  const TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  if (cell_to_forget_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
+  }
+
+  const TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+  if (cell_to_output_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
+  }
+
+  // Making sure the peephole weights are there all or none.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool peephole_weights_all_or_none =
+      ((cell_to_input_weights != nullptr || use_cifg) &&
+       (cell_to_forget_weights != nullptr) &&
+       (cell_to_output_weights != nullptr)) ||
+      ((cell_to_input_weights == nullptr) &&
+       (cell_to_forget_weights == nullptr) &&
+       (cell_to_output_weights == nullptr));
+  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
+
+  // Make sure the input gate bias is present only when not a CIFG-LSTM.
+  const TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
+  }
+
+  const TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
+
+  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+
+  const TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
+
+  const TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  if (projection_weights != nullptr) {
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
+  }
+
+  const TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+  if (projection_bias != nullptr) {
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
+  }
+
+  // Making sure the projection tensors are consistent:
+  // 1) If projection weight is not present, then projection bias should not be
+  // present.
+  // 2) If projection weight is present, then projection bias is optional.
+  // TODO(ghodrat): make sure this is correct.
+  const bool projection_tensors_consistent =
+      ((projection_weights != nullptr) || (projection_bias == nullptr));
+  TF_LITE_ENSURE(context, projection_tensors_consistent == true);
+
+  return kTfLiteOk;
+}
+
+// Resize the output, state tensors based on the sizes of the input tensors.
+// Allocate a temporary scratch tensor. Also check that the sizes of the input
+// tensors match each other.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 20);
+
+  op_data->activation_state_tensor_index =
+      node->inputs->data[kInputActivationStateTensor];
+  op_data->cell_state_tensor_index = node->inputs->data[kInputCellStateTensor];
+
+  // Inferring batch size, number of outputs and number of cells from the
+  // input tensors.
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE(context, input->dims->size > 1);
+  const int n_batch = input->dims->data[0];
+  const int n_input = input->dims->data[1];
+
+  const TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+  const int n_cell = input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
+
+  const TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
+                    n_cell);
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Check that input tensor dimensions matches with each other.
+  TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input,
+                                                        n_output, n_cell));
+
+  // Get the pointer to output, activation_state and cell_state tensors.
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TfLiteTensor* activation_state =
+      &context->tensors[op_data->activation_state_tensor_index];
+  TfLiteTensor* cell_state =
+      &context->tensors[op_data->cell_state_tensor_index];
+
+  // Check the shape of input state tensors.
+  // These tensor may be 1D or 2D. It's fine as long as the total size is
+  // correct.
+  TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output);
+  TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell);
+
+  // Resize the output tensors.
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(2);
+  output_size->data[0] = n_batch;
+  output_size->data[1] = n_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size));
+
+  // The weights are of consistent type, so it suffices to check one.
+  // TODO(mirkov): create a utility/macro for this check, so all Ops can use it.
+  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
+                             input->type == kTfLiteFloat32);
+
+  TfLiteIntArrayFree(node->temporaries);
+  if (is_hybrid_op) {
+    node->temporaries = TfLiteIntArrayCreate(7);
+  } else {
+    node->temporaries = TfLiteIntArrayCreate(1);
+  }
+  node->temporaries->data[0] = op_data->scratch_tensor_index;
+
+  // Create a scratch buffer tensor.
+  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
+  scratch_buffer->type = input->type;
+  scratch_buffer->allocation_type = kTfLiteArenaRw;
+
+  const TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+  scratch_buffer_size->data[0] = n_batch;
+  if (use_cifg) {
+    // Reserving space for Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 3;
+  } else {
+    // Reserving space for Input, Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 4;
+  }
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+                                                   scratch_buffer_size));
+
+  if (is_hybrid_op) {
+    // Allocate temporary tensors to store quantized values of input,
+    // activation_state and cell_state tensors.
+    node->temporaries->data[1] = op_data->scratch_tensor_index + 1;
+    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
+    input_quantized->type = kTfLiteUInt8;
+    input_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
+      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
+                                                       input_quantized_size));
+    }
+    node->temporaries->data[2] = op_data->scratch_tensor_index + 2;
+    TfLiteTensor* activation_state_quantized =
+        GetTemporary(context, node, /*index=*/2);
+    activation_state_quantized->type = kTfLiteUInt8;
+    activation_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
+                             activation_state->dims)) {
+      TfLiteIntArray* activation_state_quantized_size =
+          TfLiteIntArrayCopy(activation_state->dims);
+      TF_LITE_ENSURE_OK(
+          context, context->ResizeTensor(context, activation_state_quantized,
+                                         activation_state_quantized_size));
+    }
+    node->temporaries->data[3] = op_data->scratch_tensor_index + 3;
+    TfLiteTensor* cell_state_quantized =
+        GetTemporary(context, node, /*index=*/3);
+    cell_state_quantized->type = kTfLiteUInt8;
+    cell_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
+      TfLiteIntArray* cell_state_quantized_size =
+          TfLiteIntArrayCopy(cell_state->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, cell_state_quantized,
+                                              cell_state_quantized_size));
+    }
+
+    // Allocate temporary tensors to store scaling factors and product scaling
+    // factors. The latter is a convenience storage which allows to quantize
+    // a vector once (which produces the scaling factors) and multiply it with
+    // different matrices (which requires multiplying the scaling factors with
+    // the scaling factor of the matrix).
+    node->temporaries->data[4] = op_data->scratch_tensor_index + 4;
+    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
+    scaling_factors->type = kTfLiteFloat32;
+    scaling_factors->allocation_type = kTfLiteArenaRw;
+    int scaling_dims[1] = {n_batch};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = n_batch;
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
+                                                       scaling_factors_size));
+    }
+    node->temporaries->data[5] = op_data->scratch_tensor_index + 5;
+    TfLiteTensor* prod_scaling_factors =
+        GetTemporary(context, node, /*index=*/5);
+    prod_scaling_factors->type = kTfLiteFloat32;
+    prod_scaling_factors->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqualsArray(prod_scaling_factors->dims, 1,
+                                   scaling_dims)) {
+      TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
+      prod_scaling_factors_size->data[0] = n_batch;
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, prod_scaling_factors,
+                                              prod_scaling_factors_size));
+    }
+
+    // Allocate a temporary tensor to store the recovered cell weights. Since
+    // this is used for diagonal matrices, only need to store n_cell values.
+    node->temporaries->data[6] = op_data->scratch_tensor_index + 6;
+    TfLiteTensor* recovered_cell_weights =
+        GetTemporary(context, node, /*index=*/6);
+    recovered_cell_weights->type = kTfLiteFloat32;
+    recovered_cell_weights->allocation_type = kTfLiteArenaRw;
+    int recovered_cell_dims[1] = {n_cell};
+    if (!TfLiteIntArrayEqualsArray(recovered_cell_weights->dims, 1,
+                                   recovered_cell_dims)) {
+      TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1);
+      recovered_cell_weights_size->data[0] = n_cell;
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, recovered_cell_weights,
+                                              recovered_cell_weights_size));
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+
+  const TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  const TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  const TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  const TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+
+  const TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  const TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  const TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  const TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+
+  const TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  const TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  const TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+
+  const TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  const TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  const TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+
+  const TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  const TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+
+  // Index the scratch buffers pointers to the global scratch buffer.
+  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
+
+  TfLiteTensor* activation_state =
+      &context->tensors[op_data->activation_state_tensor_index];
+  TfLiteTensor* cell_state =
+      &context->tensors[op_data->cell_state_tensor_index];
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // TODO(mirkov): add a check that weights are all uint8s or all floats.
+  switch (input_to_output_weights->type) {
+    case kTfLiteFloat32: {
+      return lstm_eval::EvalFloat(
+          input, input_to_input_weights, input_to_forget_weights,
+          input_to_cell_weights, input_to_output_weights,
+          recurrent_to_input_weights, recurrent_to_forget_weights,
+          recurrent_to_cell_weights, recurrent_to_output_weights,
+          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          /*aux_input=*/nullptr,
+          /*aux_input_to_input_weights=*/nullptr,
+          /*aux_input_to_forget_weights=*/nullptr,
+          /*aux_input_to_cell_weights=*/nullptr,
+          /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
+          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
+          projection_bias, params, /*forward_sequence=*/true,
+          /*time_major=*/true,
+          /*output_offset=*/0, scratch_buffer, activation_state, cell_state,
+          output);
+    }
+    case kTfLiteUInt8: {
+      TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
+      TfLiteTensor* activation_state_quantized =
+          GetTemporary(context, node, /*index=*/2);
+      TfLiteTensor* cell_state_quantized =
+          GetTemporary(context, node, /*index=*/3);
+      TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
+      TfLiteTensor* prod_scaling_factors =
+          GetTemporary(context, node, /*index=*/5);
+      TfLiteTensor* recovered_cell_weights =
+          GetTemporary(context, node, /*index=*/6);
+      return lstm_eval::EvalHybrid(
+          input, input_to_input_weights, input_to_forget_weights,
+          input_to_cell_weights, input_to_output_weights,
+          recurrent_to_input_weights, recurrent_to_forget_weights,
+          recurrent_to_cell_weights, recurrent_to_output_weights,
+          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          /*aux_input=*/nullptr,
+          /*aux_input_to_input_weights=*/nullptr,
+          /*aux_input_to_forget_weights=*/nullptr,
+          /*aux_input_to_cell_weights=*/nullptr,
+          /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
+          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
+          projection_bias, params, /*forward_sequence=*/true,
+          /*time_major=*/true, /*output_offset=*/0, scratch_buffer,
+          scaling_factors, prod_scaling_factors, recovered_cell_weights,
+          input_quantized,
+          /*aux_input_quantized=*/nullptr, activation_state_quantized,
+          cell_state_quantized, activation_state, cell_state, output);
+    }
+    default:
+      context->ReportError(context, "Type %d is not currently supported.",
+                           input_to_output_weights->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace full
+
+// For basic kernel (5-inputs).
+namespace basic {
+
+enum InputTensor {
+  kInputData = 0,
+  kInputPrevActivation = 1,
+  kInputWeights = 2,
+  kInputBiases = 3,
+  kInputPrevState = 4,
+  kInputNum = 5,
+};
+
+enum OutputTensor {
+  kOutputActivation = 0,
+  kOutputState = 1,
+  kOutputConcatTemp = 2,
+  kOutputActivationTemp = 3,
+  kOutputNum = 4,
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData();
+  op_data->kernel_type = kTfLiteLSTMBasicKernel;
+  // `scratch_tensor_index` is unused in this kernel.
+  op_data->scratch_tensor_index = -1;
+  return op_data;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE(context, node->inputs->size == kInputNum);
+  TF_LITE_ENSURE(context, node->outputs->size == kOutputNum);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputData);
+  const TfLiteTensor* prev_activation =
+      GetInput(context, node, kInputPrevActivation);
+  const TfLiteTensor* weights = GetInput(context, node, kInputWeights);
+  const TfLiteTensor* bias = GetInput(context, node, kInputBiases);
+  const TfLiteTensor* prev_state = GetInput(context, node, kInputPrevState);
+
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 2);
+  const int num_batches = input->dims->data[0];
+  const int input_depth = input->dims->data[1];
+
+  TF_LITE_ENSURE_EQ(context, prev_activation->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, prev_activation->dims->data[0], num_batches);
+  const int activation_depth = prev_activation->dims->data[1];
+  const int total_depth = input_depth + activation_depth;
+
+  TF_LITE_ENSURE_EQ(context, weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, weights->dims->data[0], 4 * activation_depth);
+  TF_LITE_ENSURE_EQ(context, weights->dims->data[1], total_depth);
+
+  TF_LITE_ENSURE_EQ(context, bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, bias->dims->data[0], 4 * activation_depth);
+
+  TF_LITE_ENSURE_EQ(context, prev_state->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, prev_state->dims->data[0], num_batches);
+  TF_LITE_ENSURE_EQ(context, prev_state->dims->data[1], activation_depth);
+
+  TfLiteTensor* activation_out = GetOutput(context, node, kOutputActivation);
+  TfLiteTensor* state_out = GetOutput(context, node, kOutputState);
+  TfLiteTensor* concat_temp = GetOutput(context, node, kOutputConcatTemp);
+  TfLiteTensor* activation_temp =
+      GetOutput(context, node, kOutputActivationTemp);
+
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(
+                                 context, activation_out,
+                                 TfLiteIntArrayCopy(prev_activation->dims)));
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, state_out,
+                                     TfLiteIntArrayCopy(prev_state->dims)));
+
+  TfLiteIntArray* concat_temp_size = TfLiteIntArrayCreate(2);
+  concat_temp_size->data[0] = num_batches;
+  concat_temp_size->data[1] = total_depth;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, concat_temp, concat_temp_size));
+  TfLiteIntArray* activation_temp_size = TfLiteIntArrayCreate(2);
+  activation_temp_size->data[0] = num_batches;
+  activation_temp_size->data[1] = 4 * activation_depth;
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, activation_temp,
+                                                   activation_temp_size));
+
+  // Set the state tensors as persistent.
+  for (auto index : {kInputPrevActivation, kInputPrevState}) {
+    TfLiteTensor* tensor = &context->tensors[node->inputs->data[index]];
+    tensor->allocation_type = kTfLiteArenaRwPersistent;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputData);
+  const TfLiteTensor* prev_activation =
+      GetInput(context, node, kInputPrevActivation);
+  const TfLiteTensor* weights = GetInput(context, node, kInputWeights);
+  const TfLiteTensor* bias = GetInput(context, node, kInputBiases);
+  const TfLiteTensor* prev_state = GetInput(context, node, kInputPrevState);
+
+  TfLiteTensor* activation_out = GetOutput(context, node, kOutputActivation);
+  TfLiteTensor* state_out = GetOutput(context, node, kOutputState);
+  TfLiteTensor* concat_temp = GetOutput(context, node, kOutputConcatTemp);
+  TfLiteTensor* activation_temp =
+      GetOutput(context, node, kOutputActivationTemp);
+
+  if (input->type == kTfLiteFloat32 &&
+      prev_activation->type == kTfLiteFloat32 &&
+      weights->type == kTfLiteFloat32 && bias->type == kTfLiteFloat32 &&
+      prev_state->type == kTfLiteFloat32 && state_out->type == kTfLiteFloat32 &&
+      activation_out->type == kTfLiteFloat32 &&
+      concat_temp->type == kTfLiteFloat32 &&
+      activation_temp->type == kTfLiteFloat32) {
+    tflite::LstmCellParams op_params;
+    // Float LSTM cell does not need parameters to be set: leave untouched.
+    optimized_ops::LstmCell(
+        op_params,
+        // Inputs.
+        GetTensorShape(input), GetTensorData<float>(input),
+        GetTensorShape(prev_activation), GetTensorData<float>(prev_activation),
+        GetTensorShape(weights), GetTensorData<float>(weights),
+        GetTensorShape(bias), GetTensorData<float>(bias),
+        GetTensorShape(prev_state), GetTensorData<float>(prev_state),
+        // Outputs.
+        GetTensorShape(state_out), GetTensorData<float>(state_out),
+        GetTensorShape(activation_out), GetTensorData<float>(activation_out),
+        GetTensorShape(concat_temp), GetTensorData<float>(concat_temp),
+        GetTensorShape(activation_temp), GetTensorData<float>(activation_temp));
+  } else if (input->type == kTfLiteUInt8 &&
+             prev_activation->type == kTfLiteUInt8 &&
+             weights->type == kTfLiteUInt8 && bias->type == kTfLiteInt32 &&
+             prev_state->type == kTfLiteInt16 &&
+             state_out->type == kTfLiteInt16 &&
+             activation_out->type == kTfLiteUInt8 &&
+             concat_temp->type == kTfLiteUInt8 &&
+             activation_temp->type == kTfLiteInt16) {
+    gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+    int state_scale_log2_rounded;
+    if (!CheckedLog2(state_out->params.scale, &state_scale_log2_rounded)) {
+      context->ReportError(
+          context,
+          "The internal state of a LSTM cell must have a power-of-two scale.");
+      return kTfLiteError;
+    }
+    const int state_integer_bits = 15 + state_scale_log2_rounded;
+    if (state_integer_bits != 4) {
+      context->ReportError(context,
+                           "The only case of quantized LstmCell currently "
+                           "supported is with StateIntegerBits==4");
+      return kTfLiteError;
+    }
+
+    double real_accum_multiplier = 4096 * bias->params.scale;
+    int32 accum_multiplier;
+    int accum_shift;
+    tflite::QuantizeMultiplier(real_accum_multiplier, &accum_multiplier,
+                               &accum_shift);
+    tflite::LstmCellParams op_params;
+    op_params.weights_zero_point = weights->params.zero_point;
+    op_params.accum_multiplier = accum_multiplier;
+    op_params.accum_shift = accum_shift;
+    optimized_ops::LstmCell<4>(
+        op_params,
+        // Inputs.
+        GetTensorShape(input), GetTensorData<uint8_t>(input),
+        GetTensorShape(prev_activation),
+        GetTensorData<uint8_t>(prev_activation), GetTensorShape(weights),
+        GetTensorData<uint8_t>(weights), GetTensorShape(bias),
+        GetTensorData<int32_t>(bias), GetTensorShape(prev_state),
+        GetTensorData<int16_t>(prev_state),
+        // Outputs.
+        GetTensorShape(state_out), GetTensorData<int16_t>(state_out),
+        GetTensorShape(activation_out), GetTensorData<uint8_t>(activation_out),
+        GetTensorShape(concat_temp), GetTensorData<uint8_t>(concat_temp),
+        GetTensorShape(activation_temp),
+        GetTensorData<int16_t>(activation_temp), gemm_context);
+  } else {
+    context->ReportError(context,
+                         "Unsupported combination of data types for LstmCell");
+    return kTfLiteError;
+  }
+
+  // TODO(ycling): Investigate if this copy can be avoided with the 5-inputs
+  // LSTM kernel.
+  memcpy(prev_activation->data.raw, activation_out->data.raw,
+         activation_out->bytes);
+  memcpy(prev_state->data.raw, state_out->data.raw, state_out->bytes);
+
+  return kTfLiteOk;
+}
+
+}  // namespace basic
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  gemm_support::IncrementUsageCounter(context);
+
+  const auto* params = reinterpret_cast<const TfLiteLSTMParams*>(buffer);
+  switch (params->kernel_type) {
+    case kTfLiteLSTMFullKernel:
+      return full::Init(context, buffer, length);
+    case kTfLiteLSTMBasicKernel:
+      return basic::Init(context, buffer, length);
+  }
+}
+void Free(TfLiteContext* context, void* buffer) {
+  gemm_support::DecrementUsageCounter(context);
+
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const auto* op_data = reinterpret_cast<const OpData*>(node->user_data);
+  switch (op_data->kernel_type) {
+    case kTfLiteLSTMFullKernel:
+      return full::Prepare(context, node);
+    case kTfLiteLSTMBasicKernel:
+      return basic::Prepare(context, node);
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* op_data = reinterpret_cast<const OpData*>(node->user_data);
+  switch (op_data->kernel_type) {
+    case kTfLiteLSTMFullKernel:
+      return full::Eval(context, node);
+    case kTfLiteLSTMBasicKernel:
+      return basic::Eval(context, node);
+  }
+}
+
+}  // namespace lstm
+
+TfLiteRegistration* Register_LSTM() {
+  static TfLiteRegistration r = {lstm::Init, lstm::Free, lstm::Prepare,
+                                 lstm::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f179ecb195e4dd999cb6e3ed0582e6385a3436b0
--- /dev/null
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -0,0 +1,1139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/lstm_eval.h"
+
+#include <stdint.h>
+
+#include "tensorflow/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace lstm_eval {
+
+namespace {
+
+// Performs an LSTM batch inference step for input specified by input_ptr_batch.
+// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
+// biases (*_bias_ptr), and buffers (*_scratch), along with additional
+// parameters:
+//  - params: various LSTM params including activation, clipping, etc.,
+//  - n_batch: size of batch,
+//  - n_cell: number of cells (or units),
+//  - n_input: the input size,
+//  - n_output: the output size.
+//  - output_batch_leading_dim: the leading dimension of the output buffer.
+//
+// The pointers to the cell and output state and the output are updated.
+//
+// The pointers with the suffix "_batch" point to data aligned in batch_major
+// order, and each step processes batch_size many inputs from input_ptr_batch,
+// and updates batch_size many cell and output states.
+//
+// The output_batch_dim is output.shape[-1], i.e. the outermost dimension of the
+// output tensor, and in most cases will be equal to n_output. It is usually not
+// when we want to store the LSTM output into a slice of the output tensor, e.g.
+// for bidirectional LSTMs with merge_outputs. In this case, the batched
+// operations cannot be used since they assume that the batched outputs are
+// contiguous, and we manually loop over the batched outputs.
+inline void LstmStepWithAuxInput(
+    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
+    const float* input_to_forget_weights_ptr,
+    const float* input_to_cell_weights_ptr,
+    const float* input_to_output_weights_ptr, const float* aux_input_ptr_batch,
+    const float* aux_input_to_input_weights_ptr,
+    const float* aux_input_to_forget_weights_ptr,
+    const float* aux_input_to_cell_weights_ptr,
+    const float* aux_input_to_output_weights_ptr,
+    const float* recurrent_to_input_weights_ptr,
+    const float* recurrent_to_forget_weights_ptr,
+    const float* recurrent_to_cell_weights_ptr,
+    const float* recurrent_to_output_weights_ptr,
+    const float* cell_to_input_weights_ptr,
+    const float* cell_to_forget_weights_ptr,
+    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
+    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
+    int n_batch, int n_cell, int n_input, int n_aux_input, int n_output,
+    int output_batch_leading_dim, float* output_state_ptr,
+    float* cell_state_ptr, float* input_gate_scratch,
+    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
+    float* output_ptr_batch) {
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
+  // Initialize scratch buffers with bias.
+  if (!use_cifg) {
+    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
+                                          input_gate_scratch);
+  }
+  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
+                                        forget_gate_scratch);
+  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
+                                        cell_scratch);
+  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
+                                        output_gate_scratch);
+
+  // For each batch and cell: compute input_weight * input.
+  if (!use_cifg) {
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+        input_gate_scratch, /*result_stride=*/1);
+  }
+
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+      forget_gate_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+      cell_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
+      output_gate_scratch, /*result_stride=*/1);
+
+  // If auxiliary input is available then compute aux_input_weight * aux_input
+  if (aux_input_ptr_batch != nullptr) {
+    if (!use_cifg) {
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          aux_input_to_input_weights_ptr, n_cell, n_aux_input,
+          aux_input_ptr_batch, n_batch, input_gate_scratch,
+          /*result_stride=*/1);
+    }
+
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_forget_weights_ptr, n_cell, n_aux_input,
+        aux_input_ptr_batch, n_batch, forget_gate_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_cell_weights_ptr, n_cell, n_aux_input, aux_input_ptr_batch,
+        n_batch, cell_scratch, /*result_stride=*/1);
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_output_weights_ptr, n_cell, n_aux_input,
+        aux_input_ptr_batch, n_batch, output_gate_scratch, /*result_stride=*/1);
+  }
+
+  // For each batch and cell: compute recurrent_weight * output_state.
+  if (!use_cifg) {
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr,
+        n_batch, input_gate_scratch, /*result_stride=*/1);
+  }
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr,
+      n_batch, forget_gate_scratch,
+      /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr,
+      n_batch, cell_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr,
+      n_batch, output_gate_scratch,
+      /*result_stride=*/1);
+
+  // For each batch and cell: update input gate.
+  if (!use_cifg) {
+    if (use_peephole) {
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch,
+          input_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                       input_gate_scratch);
+  }
+
+  // For each batch and cell: update forget gate.
+  if (use_peephole) {
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch,
+        forget_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                     forget_gate_scratch);
+
+  // For each batch and cell: update the cell.
+  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
+                                         n_batch * n_cell, cell_state_ptr);
+  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  if (use_cifg) {
+    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                             forget_gate_scratch);
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  } else {
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  }
+  if (params->cell_clip > 0.0) {
+    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
+                             params->cell_clip, cell_state_ptr);
+  }
+
+  // For each batch and cell: update the output gate.
+  if (use_peephole) {
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch,
+        output_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                     output_gate_scratch);
+  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                         n_batch * n_cell, output_gate_scratch);
+
+  const bool use_projection_weight = (projection_weights_ptr != nullptr);
+  const bool use_projection_bias = (projection_bias_ptr != nullptr);
+
+  // For each batch: update the projection and output_state. Note that since
+  // the output batch rows may not be contiguous (output_batch_leading_dim !=
+  // n_output), we unroll the batched operations where this is the case.
+  if (output_batch_leading_dim == n_output) {
+    if (use_projection_weight) {
+      if (use_projection_bias) {
+        tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
+                                              n_batch, output_ptr_batch);
+      } else {
+        tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          projection_weights_ptr, n_output, n_cell, output_gate_scratch,
+          n_batch, output_ptr_batch, /*result_stride=*/1);
+      if (params->proj_clip > 0.0) {
+        tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
+                                 params->proj_clip, output_ptr_batch);
+      }
+    } else {
+      tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                               output_ptr_batch);
+    }
+    tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
+                             output_state_ptr);
+  } else {
+    if (use_projection_weight) {
+      if (use_projection_bias) {
+        for (int k = 0; k < n_batch; k++) {
+          tensor_utils::CopyVector(
+              projection_bias_ptr, n_output,
+              output_ptr_batch + k * output_batch_leading_dim);
+        }
+      } else {
+        for (int k = 0; k < n_batch; k++) {
+          tensor_utils::ZeroVector(
+              output_ptr_batch + k * output_batch_leading_dim, n_output);
+        }
+      }
+      for (int k = 0; k < n_batch; k++) {
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            projection_weights_ptr, n_output, n_cell,
+            output_gate_scratch + k * n_cell,
+            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
+            /*result_stride=*/1);
+        if (params->proj_clip > 0.0) {
+          tensor_utils::ClipVector(
+              output_ptr_batch + k * output_batch_leading_dim, n_output,
+              params->proj_clip,
+              output_ptr_batch + k * output_batch_leading_dim);
+        }
+      }
+    } else {
+      for (int k = 0; k < n_batch; k++) {
+        tensor_utils::CopyVector(
+            output_gate_scratch + k * n_output, n_output,
+            output_ptr_batch + k * output_batch_leading_dim);
+      }
+    }
+    for (int k = 0; k < n_batch; k++) {
+      tensor_utils::CopyVector(output_ptr_batch + k * output_batch_leading_dim,
+                               n_output, output_state_ptr + k * n_output);
+    }
+  }
+}
+
+// Same as above but with quantized weight matrices. In detail:
+// Input of size 'n_batch * n_input':
+//   input_ptr_batch
+//
+// LSTM weights:
+// Quantized input weights of size 'n_cell * n_input':
+//   input_to_input_weights            - optional (can be nullptr)
+//   input_to_forget_weights
+//   input_to_cell_weights
+//   input_to_input_weights
+// Quantized recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weights        - optional
+//   recurrent_to_forget_weights
+//   recurrent_to_cell_weights
+//   recurrent_to_input_weights
+// Quantized peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights             - optional
+//   cell_to_cell_weights              - optional
+//   cell_to_output_weights            - optional
+// Quantized projection weights of size 'n_output * n_cell'
+//   projection_weights_ptr            - optional
+// Weight scales (scalars) for each of the weights above.
+//   input_to_input_weights_scale      - optional
+//   input_to_forget_weights_scale
+//   input_to_cell_weights_scale
+//   input_to_output_weights_scale
+//   recurrent_to_input_weights_scale  - optional
+//   recurrent_to_forget_weights_scale
+//   recurrent_to_cell_weights_scale
+//   recurrent_to_output_weights_scale
+//   cell_to_input_weights_scale,
+//   cell_to_forget_weights_scale,
+//   cell_to_output_weights_scale,
+//   projection_weights_scale          - optional
+// Gate biases of size 'n_cell':
+//   input_gate_bias_ptr               - optional
+//   forget_gate_bias_ptr
+//   cell_gate_bias_ptr
+//   output_gate_bias_ptr
+//
+// Temporary pre-allocated storage for quantized values:
+//   quantized_input_ptr_batch (same size as input_ptr_batch)
+//   quantized_output_state_ptr (same size as output_state_ptr)
+//   quantized_cell_state_ptr (same size as cell_state_ptr)
+// Temporary pre-allocated storage for recovered values:
+//   recovered_cell_weights (same size as cell_to_*_weights)
+//
+// Outputs:
+//   output_state_ptr - size 'n_batch * n_output'
+//   cell_state_ptr   - size 'n_batch * n_cell'
+//   output_ptr_batch - size 'n_batch * output_batch_leading_dim'
+inline void LstmStepWithAuxInput(
+    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
+    float input_to_input_weights_scale,
+    const int8_t* input_to_forget_weights_ptr,
+    float input_to_forget_weights_scale,
+    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
+    const int8_t* input_to_output_weights_ptr,
+    float input_to_output_weights_scale, const float* aux_input_ptr_batch,
+    const int8_t* aux_input_to_input_weights_ptr,
+    float aux_input_to_input_weights_scale,
+    const int8_t* aux_input_to_forget_weights_ptr,
+    float aux_input_to_forget_weights_scale,
+    const int8_t* aux_input_to_cell_weights_ptr,
+    float aux_input_to_cell_weights_scale,
+    const int8_t* aux_input_to_output_weights_ptr,
+    float aux_input_to_output_weights_scale,
+    const int8_t* recurrent_to_input_weights_ptr,
+    float recurrent_to_input_weights_scale,
+    const int8_t* recurrent_to_forget_weights_ptr,
+    float recurrent_to_forget_weights_scale,
+    const int8_t* recurrent_to_cell_weights_ptr,
+    float recurrent_to_cell_weights_scale,
+    const int8_t* recurrent_to_output_weights_ptr,
+    float recurrent_to_output_weights_scale,
+    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
+    const int8_t* cell_to_forget_weights_ptr,
+    float cell_to_forget_weights_scale,
+    const int8_t* cell_to_output_weights_ptr,
+    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
+    float projection_weights_scale, const float* projection_bias_ptr,
+    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
+    int n_aux_input, int n_output, int output_batch_leading_dim,
+    float* input_gate_scratch, float* forget_gate_scratch, float* cell_scratch,
+    float* output_gate_scratch, float* scaling_factors,
+    float* product_scaling_factors, float* recovered_cell_weights,
+    int8_t* quantized_input_ptr_batch, int8_t* quantized_aux_input_ptr_batch,
+    int8_t* quantized_output_state_ptr, int8_t* quantized_cell_state_ptr,
+    float* output_state_ptr, float* cell_state_ptr, float* output_ptr_batch) {
+  // Since we have already checked that weights are all there or none, we
+  // can check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
+  // Initialize scratch buffers with bias.
+  if (!use_cifg) {
+    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
+                                          input_gate_scratch);
+  }
+  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
+                                        forget_gate_scratch);
+  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
+                                        cell_scratch);
+  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
+                                        output_gate_scratch);
+
+  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
+    // Save quantization and matmul computation for all zero input.
+    float unused_min, unused_max;
+    for (int b = 0; b < n_batch; ++b) {
+      const int offset = b * n_input;
+      tensor_utils::SymmetricQuantizeFloats(
+          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
+          &unused_min, &unused_max, &scaling_factors[b]);
+    }
+    // For each batch and cell: compute input_weight * input.
+    if (!use_cifg) {
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * input_to_input_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          input_to_input_weights_ptr, n_cell, n_input,
+          quantized_input_ptr_batch, product_scaling_factors, n_batch,
+          input_gate_scratch, /*result_stride=*/1);
+    }
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * input_to_forget_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_forget_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+        product_scaling_factors, n_batch, forget_gate_scratch,
+        /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * input_to_cell_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_cell_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+        product_scaling_factors, n_batch, cell_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * input_to_output_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_output_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+        product_scaling_factors, n_batch, output_gate_scratch,
+        /*result_stride=*/1);
+  }
+
+  if (aux_input_ptr_batch != nullptr &&
+      !tensor_utils::IsZeroVector(aux_input_ptr_batch, n_batch * n_input)) {
+    // Save quantization and matmul computation for all zero input.
+    float unused_min, unused_max;
+    for (int b = 0; b < n_batch; ++b) {
+      const int offset = b * n_input;
+      tensor_utils::SymmetricQuantizeFloats(
+          aux_input_ptr_batch + offset, n_input,
+          quantized_aux_input_ptr_batch + offset, &unused_min, &unused_max,
+          &scaling_factors[b]);
+    }
+    // For each batch and cell: compute input_weight * input.
+    if (!use_cifg) {
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * aux_input_to_input_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          aux_input_to_input_weights_ptr, n_cell, n_input,
+          quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+          input_gate_scratch, /*result_stride=*/1);
+    }
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * aux_input_to_forget_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_forget_weights_ptr, n_cell, n_input,
+        quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+        forget_gate_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * aux_input_to_cell_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_cell_weights_ptr, n_cell, n_input,
+        quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+        cell_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * aux_input_to_output_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        aux_input_to_output_weights_ptr, n_cell, n_input,
+        quantized_aux_input_ptr_batch, product_scaling_factors, n_batch,
+        output_gate_scratch, /*result_stride=*/1);
+  }
+
+  if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
+    // Save quantization and matmul computation for all zero input.
+    float unused_min, unused_max;
+    for (int b = 0; b < n_batch; ++b) {
+      const int offset = b * n_output;
+      tensor_utils::SymmetricQuantizeFloats(output_state_ptr + offset, n_output,
+                                            quantized_output_state_ptr + offset,
+                                            &unused_min, &unused_max,
+                                            &scaling_factors[b]);
+    }
+    // For each batch and cell: compute recurrent_weight * output_state.
+    if (!use_cifg) {
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * recurrent_to_input_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          recurrent_to_input_weights_ptr, n_cell, n_output,
+          quantized_output_state_ptr, product_scaling_factors, n_batch,
+          input_gate_scratch, /*result_stride=*/1);
+    }
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * recurrent_to_forget_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_forget_weights_ptr, n_cell, n_output,
+        quantized_output_state_ptr, product_scaling_factors, n_batch,
+        forget_gate_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * recurrent_to_cell_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_cell_weights_ptr, n_cell, n_output,
+        quantized_output_state_ptr, product_scaling_factors, n_batch,
+        cell_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * recurrent_to_output_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_output_weights_ptr, n_cell, n_output,
+        quantized_output_state_ptr, product_scaling_factors, n_batch,
+        output_gate_scratch, /*result_stride=*/1);
+  }
+
+  // Save quantization and matmul computation for all zero input.
+  bool is_cell_state_all_zeros =
+      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
+
+  // For each batch and cell: update input gate.
+  if (!use_cifg) {
+    if (use_peephole && !is_cell_state_all_zeros) {
+      tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell,
+                                         cell_to_input_weights_scale,
+                                         recovered_cell_weights);
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+          input_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                       input_gate_scratch);
+  }
+
+  // For each batch and cell: update forget gate.
+  if (use_peephole && !is_cell_state_all_zeros) {
+    tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell,
+                                       cell_to_forget_weights_scale,
+                                       recovered_cell_weights);
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+        forget_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                     forget_gate_scratch);
+
+  // For each batch and cell: update the cell.
+  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
+                                         n_batch * n_cell, cell_state_ptr);
+  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  if (use_cifg) {
+    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                             forget_gate_scratch);
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  } else {
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  }
+  if (params->cell_clip > 0.0) {
+    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
+                             params->cell_clip, cell_state_ptr);
+  }
+
+  is_cell_state_all_zeros =
+      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
+  // For each batch and cell: update the output gate.
+  if (use_peephole && !is_cell_state_all_zeros) {
+    tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell,
+                                       cell_to_output_weights_scale,
+                                       recovered_cell_weights);
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+        output_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                     output_gate_scratch);
+  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                         n_batch * n_cell, output_gate_scratch);
+
+  const bool use_projection_weight = (projection_weights_ptr != nullptr);
+  const bool use_projection_bias = (projection_bias_ptr != nullptr);
+
+  // For each batch: update the projection and output_state. Note that since
+  // the output batch rows may not be contiguous (output_batch_leading_dim !=
+  // n_output), we unroll the batched operations where this is the case.
+  if (output_batch_leading_dim == n_output) {
+    if (use_projection_weight) {
+      if (use_projection_bias) {
+        tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
+                                              n_batch, output_ptr_batch);
+      } else {
+        tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
+      }
+      if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) {
+        // Save quantization and matmul computation for all zero input.
+        float unused_min, unused_max;
+        for (int b = 0; b < n_batch; ++b) {
+          const int offset = b * n_cell;
+          tensor_utils::SymmetricQuantizeFloats(
+              output_gate_scratch + offset, n_cell,
+              quantized_cell_state_ptr + offset, &unused_min, &unused_max,
+              &scaling_factors[b]);
+        }
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * projection_weights_scale;
+        }
+        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+            projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr,
+            product_scaling_factors, n_batch, output_ptr_batch,
+            /*result_stride=*/1);
+      }
+      if (params->proj_clip > 0.0) {
+        tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
+                                 params->proj_clip, output_ptr_batch);
+      }
+    } else {
+      tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                               output_ptr_batch);
+    }
+    tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
+                             output_state_ptr);
+  } else {
+    if (use_projection_weight) {
+      if (use_projection_bias) {
+        for (int k = 0; k < n_batch; k++) {
+          tensor_utils::CopyVector(
+              projection_bias_ptr, n_output,
+              output_ptr_batch + k * output_batch_leading_dim);
+        }
+      } else {
+        for (int k = 0; k < n_batch; k++) {
+          tensor_utils::ZeroVector(
+              output_ptr_batch + k * output_batch_leading_dim, n_output);
+        }
+      }
+      if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) {
+        // Save quantization and matmul computation for all zero input.
+        float unused_min, unused_max;
+        for (int b = 0; b < n_batch; ++b) {
+          const int offset = b * n_cell;
+          tensor_utils::SymmetricQuantizeFloats(
+              output_gate_scratch + offset, n_cell,
+              quantized_cell_state_ptr + offset, &unused_min, &unused_max,
+              &scaling_factors[b]);
+        }
+        for (int b = 0; b < n_batch; ++b) {
+          product_scaling_factors[b] =
+              scaling_factors[b] * projection_weights_scale;
+        }
+        for (int k = 0; k < n_batch; k++) {
+          tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+              projection_weights_ptr, n_output, n_cell,
+              quantized_cell_state_ptr + k * n_cell,
+              &product_scaling_factors[k],
+              /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
+              /*result_stride=*/1);
+        }
+      }
+      if (params->proj_clip > 0.0) {
+        for (int k = 0; k < n_batch; k++) {
+          tensor_utils::ClipVector(
+              output_ptr_batch + k * output_batch_leading_dim, n_output,
+              params->proj_clip,
+              output_ptr_batch + k * output_batch_leading_dim);
+        }
+      }
+    } else {
+      for (int k = 0; k < n_batch; k++) {
+        tensor_utils::CopyVector(
+            output_gate_scratch + k * n_output, n_output,
+            output_ptr_batch + k * output_batch_leading_dim);
+      }
+    }
+    for (int k = 0; k < n_batch; k++) {
+      tensor_utils::CopyVector(output_ptr_batch + k * output_batch_leading_dim,
+                               n_output, output_state_ptr + k * n_output);
+    }
+  }
+}
+}  // namespace
+
+TfLiteStatus EvalFloat(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* aux_input_to_input_weights,
+    const TfLiteTensor* aux_input_to_forget_weights,
+    const TfLiteTensor* aux_input_to_cell_weights,
+    const TfLiteTensor* aux_input_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, bool forward_sequence, bool time_major,
+    int output_offset, TfLiteTensor* scratch_buffer,
+    TfLiteTensor* activation_state, TfLiteTensor* cell_state,
+    TfLiteTensor* output) {
+  TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3);
+  int max_time, n_batch;
+  if (input->dims->size == 3) {
+    max_time = (time_major) ? input->dims->data[0] : input->dims->data[1];
+    n_batch = (time_major) ? input->dims->data[1] : input->dims->data[0];
+  } else {
+    max_time = 1;
+    n_batch = input->dims->data[0];
+  }
+  const int n_input = input->dims->data[input->dims->size - 1];
+  const int aux_input_size =
+      (aux_input) ? aux_input->dims->data[aux_input->dims->size - 1] : 0;
+
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+
+  // Index the scratch buffers pointers to the global scratch buffer.
+  float* input_gate_scratch = nullptr;
+  float* cell_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_scratch = scratch_buffer->data.f;
+    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer->data.f;
+    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
+  }
+
+  // Check optional tensors, the respective pointers can be null.
+  const float* input_to_input_weights_ptr =
+      (use_cifg) ? nullptr : input_to_input_weights->data.f;
+  const float* recurrent_to_input_weights_ptr =
+      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
+  const float* input_gate_bias_ptr =
+      (use_cifg) ? nullptr : input_gate_bias->data.f;
+  const float* cell_to_input_weights_ptr =
+      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
+  const float* cell_to_forget_weights_ptr =
+      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
+  const float* cell_to_output_weights_ptr =
+      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
+  const float* projection_weights_ptr =
+      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
+  const float* projection_bias_ptr =
+      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+
+  float* aux_input_ptr = nullptr;
+  float* aux_input_to_input_weights_ptr = nullptr;
+  float* aux_input_to_forget_weights_ptr = nullptr;
+  float* aux_input_to_cell_weights_ptr = nullptr;
+  float* aux_input_to_output_weights_ptr = nullptr;
+  if (aux_input_size > 0) {
+    if (!use_cifg) {
+      aux_input_to_input_weights_ptr = aux_input_to_input_weights->data.f;
+    }
+    aux_input_to_forget_weights_ptr = aux_input_to_forget_weights->data.f;
+    aux_input_to_cell_weights_ptr = aux_input_to_cell_weights->data.f;
+    aux_input_to_output_weights_ptr = aux_input_to_output_weights->data.f;
+  }
+
+  const int output_batch_leading_dim =
+      output->dims->data[output->dims->size - 1];
+  if (time_major) {
+    // Loop through the sequence.
+    const int input_step = n_batch * n_input;
+    const int output_step = n_batch * output_batch_leading_dim;
+    for (int t = 0; t < max_time; t++) {
+      // If this is the forward_sequence, step forward, otherwise step
+      // backwards.
+      const int t_rel = forward_sequence ? t : max_time - t - 1;
+      const float* input_ptr = input->data.f + t_rel * input_step;
+      if (aux_input) {
+        aux_input_ptr = aux_input->data.f + t_rel * input_step;
+      }
+      float* output_ptr_time =
+          output->data.f + t_rel * output_step + output_offset;
+
+      LstmStepWithAuxInput(
+          input_ptr, input_to_input_weights_ptr,
+          input_to_forget_weights->data.f, input_to_cell_weights->data.f,
+          input_to_output_weights->data.f, aux_input_ptr,
+          aux_input_to_input_weights_ptr, aux_input_to_forget_weights_ptr,
+          aux_input_to_cell_weights_ptr, aux_input_to_output_weights_ptr,
+          recurrent_to_input_weights_ptr, recurrent_to_forget_weights->data.f,
+          recurrent_to_cell_weights->data.f,
+          recurrent_to_output_weights->data.f, cell_to_input_weights_ptr,
+          cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
+          input_gate_bias_ptr, forget_gate_bias->data.f, cell_bias->data.f,
+          output_gate_bias->data.f, projection_weights_ptr, projection_bias_ptr,
+          params, n_batch, n_cell, n_input, aux_input_size, n_output,
+          output_batch_leading_dim, activation_state->data.f,
+          cell_state->data.f, input_gate_scratch, forget_gate_scratch,
+          cell_scratch, output_gate_scratch, output_ptr_time);
+    }
+  } else {
+    for (int b = 0; b < n_batch; b++) {
+      const int input_step = n_input;
+      const int output_step = output_batch_leading_dim;
+      for (int t = 0; t < max_time; t++) {
+        // If this is the forward_sequence, step forward, otherwise step
+        // backwards.
+        const int t_rel = forward_sequence ? t : max_time - t - 1;
+        const float* input_ptr = input->data.f + t_rel * input_step;
+        if (aux_input) {
+          aux_input_ptr = aux_input->data.f + t_rel * input_step;
+        }
+        float* output_ptr_time =
+            output->data.f + t_rel * output_step + output_offset;
+
+        LstmStepWithAuxInput(
+            input_ptr, input_to_input_weights_ptr,
+            input_to_forget_weights->data.f, input_to_cell_weights->data.f,
+            input_to_output_weights->data.f, aux_input_ptr,
+            aux_input_to_input_weights_ptr, aux_input_to_forget_weights_ptr,
+            aux_input_to_cell_weights_ptr, aux_input_to_output_weights_ptr,
+            recurrent_to_input_weights_ptr, recurrent_to_forget_weights->data.f,
+            recurrent_to_cell_weights->data.f,
+            recurrent_to_output_weights->data.f, cell_to_input_weights_ptr,
+            cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
+            input_gate_bias_ptr, forget_gate_bias->data.f, cell_bias->data.f,
+            output_gate_bias->data.f, projection_weights_ptr,
+            projection_bias_ptr, params, /*n_batch=*/1, n_cell, n_input,
+            aux_input_size, n_output, output_batch_leading_dim,
+            activation_state->data.f, cell_state->data.f, input_gate_scratch,
+            forget_gate_scratch, cell_scratch, output_gate_scratch,
+            output_ptr_time);
+      }
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalHybrid(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* aux_input_to_input_weights,
+    const TfLiteTensor* aux_input_to_forget_weights,
+    const TfLiteTensor* aux_input_to_cell_weights,
+    const TfLiteTensor* aux_input_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, bool forward_sequence, bool time_major,
+    int output_offset, TfLiteTensor* scratch_buffer,
+    TfLiteTensor* scaling_factors, TfLiteTensor* prod_scaling_factors,
+    TfLiteTensor* recovered_cell_weights, TfLiteTensor* input_quantized,
+    TfLiteTensor* aux_input_quantized, TfLiteTensor* output_state_quantized,
+    TfLiteTensor* cell_state_quantized, TfLiteTensor* output_state,
+    TfLiteTensor* cell_state, TfLiteTensor* output) {
+  TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3);
+  const int n_input = input->dims->data[input->dims->size - 1];
+  int max_time, n_batch;
+  if (input->dims->size == 2) {
+    max_time = 1;
+    n_batch = input->dims->data[0];
+  } else {
+    max_time = (time_major) ? input->dims->data[0] : input->dims->data[1];
+    n_batch = (time_major) ? input->dims->data[1] : input->dims->data[0];
+  }
+  const int aux_input_size =
+      (aux_input) ? aux_input->dims->data[aux_input->dims->size - 1] : 0;
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existence of only one to get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+
+  float* input_gate_scratch = nullptr;
+  float* cell_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_scratch = scratch_buffer->data.f;
+    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer->data.f;
+    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
+  }
+
+  // Check optional tensors, the respective pointers can be null.
+  int8_t* input_to_input_weights_ptr = nullptr;
+  float input_to_input_weights_scale = 1.0f;
+  int8_t* recurrent_to_input_weights_ptr = nullptr;
+  float recurrent_to_input_weights_scale = 1.0f;
+  float* input_gate_bias_ptr = nullptr;
+  if (!use_cifg) {
+    input_to_input_weights_ptr =
+        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
+    recurrent_to_input_weights_ptr =
+        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
+    input_gate_bias_ptr = input_gate_bias->data.f;
+    input_to_input_weights_scale = input_to_input_weights->params.scale;
+    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
+  }
+
+  int8_t* cell_to_input_weights_ptr = nullptr;
+  int8_t* cell_to_forget_weights_ptr = nullptr;
+  int8_t* cell_to_output_weights_ptr = nullptr;
+  float cell_to_input_weights_scale = 1.0f;
+  float cell_to_forget_weights_scale = 1.0f;
+  float cell_to_output_weights_scale = 1.0f;
+  if (use_peephole) {
+    if (!use_cifg) {
+      cell_to_input_weights_ptr =
+          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
+      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
+    }
+    cell_to_forget_weights_ptr =
+        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
+    cell_to_output_weights_ptr =
+        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
+    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
+    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
+  }
+
+  const int8_t* projection_weights_ptr =
+      (projection_weights == nullptr)
+          ? nullptr
+          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
+  const float projection_weights_scale =
+      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
+  const float* projection_bias_ptr =
+      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+
+  // Required tensors, pointers are non-null.
+  const int8_t* input_to_forget_weights_ptr =
+      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
+  const float input_to_forget_weights_scale =
+      input_to_forget_weights->params.scale;
+  const int8_t* input_to_cell_weights_ptr =
+      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
+  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
+  const int8_t* input_to_output_weights_ptr =
+      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
+  const float input_to_output_weights_scale =
+      input_to_output_weights->params.scale;
+  const int8_t* recurrent_to_forget_weights_ptr =
+      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
+  const float recurrent_to_forget_weights_scale =
+      recurrent_to_forget_weights->params.scale;
+  const int8_t* recurrent_to_cell_weights_ptr =
+      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
+  const float recurrent_to_cell_weights_scale =
+      recurrent_to_cell_weights->params.scale;
+  const int8_t* recurrent_to_output_weights_ptr =
+      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
+  const float recurrent_to_output_weights_scale =
+      recurrent_to_output_weights->params.scale;
+  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
+  const float* cell_bias_ptr = cell_bias->data.f;
+  const float* output_gate_bias_ptr = output_gate_bias->data.f;
+
+  float* output_state_ptr = output_state->data.f;
+  float* cell_state_ptr = cell_state->data.f;
+
+  // Temporary storage for quantized values and scaling factors.
+  int8_t* quantized_input_ptr =
+      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+  int8_t* quantized_aux_input_ptr =
+      (aux_input_quantized == nullptr)
+          ? nullptr
+          : reinterpret_cast<int8_t*>(aux_input_quantized->data.uint8);
+  int8_t* quantized_output_state_ptr =
+      reinterpret_cast<int8_t*>(output_state_quantized->data.uint8);
+  int8_t* quantized_cell_state_ptr =
+      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
+  float* scaling_factors_ptr = scaling_factors->data.f;
+  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
+  float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
+
+  // Auxiliary input and weights.
+  float* aux_input_ptr = nullptr;
+  int8_t* aux_input_to_input_weights_ptr = nullptr;
+  int8_t* aux_input_to_forget_weights_ptr = nullptr;
+  int8_t* aux_input_to_cell_weights_ptr = nullptr;
+  int8_t* aux_input_to_output_weights_ptr = nullptr;
+  float aux_input_to_input_weights_scale = 0.0f;
+  float aux_input_to_forget_weights_scale = 0.0f;
+  float aux_input_to_cell_weights_scale = 0.0f;
+  float aux_input_to_output_weights_scale = 0.0f;
+  if (aux_input_size > 0) {
+    if (!use_cifg) {
+      aux_input_to_input_weights_ptr =
+          reinterpret_cast<int8_t*>(aux_input_to_input_weights->data.uint8);
+    }
+    aux_input_to_forget_weights_ptr =
+        reinterpret_cast<int8_t*>(aux_input_to_forget_weights->data.uint8);
+    aux_input_to_cell_weights_ptr =
+        reinterpret_cast<int8_t*>(aux_input_to_cell_weights->data.uint8);
+    aux_input_to_output_weights_ptr =
+        reinterpret_cast<int8_t*>(aux_input_to_output_weights->data.uint8);
+    if (!use_cifg) {
+      aux_input_to_input_weights_scale =
+          aux_input_to_input_weights->params.scale;
+    }
+    aux_input_to_forget_weights_scale =
+        aux_input_to_forget_weights->params.scale;
+    aux_input_to_cell_weights_scale = aux_input_to_cell_weights->params.scale;
+    aux_input_to_output_weights_scale =
+        aux_input_to_output_weights->params.scale;
+  }
+
+  const int output_batch_leading_dim =
+      output->dims->data[output->dims->size - 1];
+  if (time_major) {
+    // Feed the sequence into the LSTM step-by-step.
+    const int input_step = n_batch * n_input;
+    const int output_step = n_batch * output_batch_leading_dim;
+    for (int t = 0; t < max_time; t++) {
+      // If this is the forward_sequence, step forward, otherwise step
+      // backwards.
+      const int t_rel = forward_sequence ? t : max_time - t - 1;
+      const float* input_ptr = input->data.f + t_rel * input_step;
+      if (aux_input) {
+        aux_input_ptr = aux_input->data.f + t_rel * input_step;
+      }
+      float* output_ptr = output->data.f + t_rel * output_step + output_offset;
+
+      LstmStepWithAuxInput(
+          input_ptr, input_to_input_weights_ptr, input_to_input_weights_scale,
+          input_to_forget_weights_ptr, input_to_forget_weights_scale,
+          input_to_cell_weights_ptr, input_to_cell_weights_scale,
+          input_to_output_weights_ptr, input_to_output_weights_scale,
+          aux_input_ptr, aux_input_to_input_weights_ptr,
+          aux_input_to_input_weights_scale, aux_input_to_forget_weights_ptr,
+          aux_input_to_forget_weights_scale, aux_input_to_cell_weights_ptr,
+          aux_input_to_cell_weights_scale, aux_input_to_output_weights_ptr,
+          aux_input_to_output_weights_scale, recurrent_to_input_weights_ptr,
+          recurrent_to_input_weights_scale, recurrent_to_forget_weights_ptr,
+          recurrent_to_forget_weights_scale, recurrent_to_cell_weights_ptr,
+          recurrent_to_cell_weights_scale, recurrent_to_output_weights_ptr,
+          recurrent_to_output_weights_scale, cell_to_input_weights_ptr,
+          cell_to_input_weights_scale, cell_to_forget_weights_ptr,
+          cell_to_forget_weights_scale, cell_to_output_weights_ptr,
+          cell_to_output_weights_scale, input_gate_bias_ptr,
+          forget_gate_bias_ptr, cell_bias_ptr, output_gate_bias_ptr,
+          projection_weights_ptr, projection_weights_scale, projection_bias_ptr,
+          params, n_batch, n_cell, n_input, aux_input_size, n_output,
+          output_batch_leading_dim, input_gate_scratch, forget_gate_scratch,
+          cell_scratch, output_gate_scratch, scaling_factors_ptr,
+          prod_scaling_factors_ptr, recovered_cell_weights_ptr,
+          quantized_input_ptr, quantized_aux_input_ptr,
+          quantized_output_state_ptr, quantized_cell_state_ptr,
+          output_state_ptr, cell_state_ptr, output_ptr);
+    }
+  } else {
+    for (int b = 0; b < n_batch; b++) {
+      const int input_step = n_input;
+      const int output_step = output_batch_leading_dim;
+      for (int t = 0; t < max_time; t++) {
+        // If this is the forward_sequence, step forward, otherwise step
+        // backwards.
+        const int t_rel = forward_sequence ? t : max_time - t - 1;
+        const float* input_ptr = input->data.f + t_rel * input_step;
+        if (aux_input) {
+          aux_input_ptr = aux_input->data.f + t_rel * input_step;
+        }
+        float* output_ptr =
+            output->data.f + t_rel * output_step + output_offset;
+
+        LstmStepWithAuxInput(
+            input_ptr, input_to_input_weights_ptr, input_to_input_weights_scale,
+            input_to_forget_weights_ptr, input_to_forget_weights_scale,
+            input_to_cell_weights_ptr, input_to_cell_weights_scale,
+            input_to_output_weights_ptr, input_to_output_weights_scale,
+            aux_input_ptr, aux_input_to_input_weights_ptr,
+            aux_input_to_input_weights_scale, aux_input_to_forget_weights_ptr,
+            aux_input_to_forget_weights_scale, aux_input_to_cell_weights_ptr,
+            aux_input_to_cell_weights_scale, aux_input_to_output_weights_ptr,
+            aux_input_to_output_weights_scale, recurrent_to_input_weights_ptr,
+            recurrent_to_input_weights_scale, recurrent_to_forget_weights_ptr,
+            recurrent_to_forget_weights_scale, recurrent_to_cell_weights_ptr,
+            recurrent_to_cell_weights_scale, recurrent_to_output_weights_ptr,
+            recurrent_to_output_weights_scale, cell_to_input_weights_ptr,
+            cell_to_input_weights_scale, cell_to_forget_weights_ptr,
+            cell_to_forget_weights_scale, cell_to_output_weights_ptr,
+            cell_to_output_weights_scale, input_gate_bias_ptr,
+            forget_gate_bias_ptr, cell_bias_ptr, output_gate_bias_ptr,
+            projection_weights_ptr, projection_weights_scale,
+            projection_bias_ptr, params, n_batch, n_cell, n_input,
+            aux_input_size, n_output, output_batch_leading_dim,
+            input_gate_scratch, forget_gate_scratch, cell_scratch,
+            output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr,
+            recovered_cell_weights_ptr, quantized_input_ptr,
+            quantized_aux_input_ptr, quantized_output_state_ptr,
+            quantized_cell_state_ptr, output_state_ptr, cell_state_ptr,
+            output_ptr);
+      }
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace lstm_eval
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/lstm_eval.h b/tensorflow/lite/kernels/lstm_eval.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8a4d284f3c431e88fd0d52c98807161de14cba9
--- /dev/null
+++ b/tensorflow/lite/kernels/lstm_eval.h
@@ -0,0 +1,81 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_LSTM_EVAL_H_
+#define TENSORFLOW_LITE_KERNELS_LSTM_EVAL_H_
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace lstm_eval {
+
+TfLiteStatus EvalFloat(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* aux_input_to_input_weights,
+    const TfLiteTensor* aux_input_to_forget_weights,
+    const TfLiteTensor* aux_input_to_cell_weights,
+    const TfLiteTensor* aux_input_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, bool forward_sequence, bool time_major,
+    int output_offset, TfLiteTensor* scratch_buffer,
+    TfLiteTensor* activation_state, TfLiteTensor* cell_state,
+    TfLiteTensor* output);
+
+TfLiteStatus EvalHybrid(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* aux_input_to_input_weights,
+    const TfLiteTensor* aux_input_to_forget_weights,
+    const TfLiteTensor* aux_input_to_cell_weights,
+    const TfLiteTensor* aux_input_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, bool forward_sequence, bool time_major,
+    int output_offset, TfLiteTensor* scratch_buffer,
+    TfLiteTensor* scaling_factors, TfLiteTensor* prod_scaling_factors,
+    TfLiteTensor* recovered_cell_weights, TfLiteTensor* input_quantized,
+    TfLiteTensor* aux_input_quantized, TfLiteTensor* output_state_quantized,
+    TfLiteTensor* cell_state_quantized, TfLiteTensor* output_state,
+    TfLiteTensor* cell_state, TfLiteTensor* output);
+
+}  // namespace lstm_eval
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_LSTM_EVAL_H_
diff --git a/tensorflow/contrib/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/lstm_test.cc
rename to tensorflow/lite/kernels/lstm_test.cc
index e7ddfceb4527c4c32cece224e9b155db4ff0ea4f..03ad2e899d29b17d430bf51721e9b8b75cdb79d4 100644
--- a/tensorflow/contrib/lite/kernels/lstm_test.cc
+++ b/tensorflow/lite/kernels/lstm_test.cc
@@ -22,10 +22,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -116,71 +116,69 @@ class LSTMOpModel : public SingleOpModel {
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::initializer_list<float> f) {
+  void SetInputToInputWeights(std::vector<float> f) {
     PopulateTensor(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::initializer_list<float> f) {
+  void SetInputToForgetWeights(std::vector<float> f) {
     PopulateTensor(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::initializer_list<float> f) {
+  void SetInputToCellWeights(std::vector<float> f) {
     PopulateTensor(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::initializer_list<float> f) {
+  void SetInputToOutputWeights(std::vector<float> f) {
     PopulateTensor(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToInputWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+  void SetRecurrentToForgetWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+  void SetRecurrentToCellWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToOutputWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::initializer_list<float> f) {
+  void SetCellToInputWeights(std::vector<float> f) {
     PopulateTensor(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::initializer_list<float> f) {
+  void SetCellToForgetWeights(std::vector<float> f) {
     PopulateTensor(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::initializer_list<float> f) {
+  void SetCellToOutputWeights(std::vector<float> f) {
     PopulateTensor(cell_to_output_weights_, f);
   }
 
-  void SetInputGateBias(std::initializer_list<float> f) {
+  void SetInputGateBias(std::vector<float> f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::initializer_list<float> f) {
+  void SetForgetGateBias(std::vector<float> f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::initializer_list<float> f) {
-    PopulateTensor(cell_bias_, f);
-  }
+  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
 
-  void SetOutputGateBias(std::initializer_list<float> f) {
+  void SetOutputGateBias(std::vector<float> f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::initializer_list<float> f) {
+  void SetProjectionWeights(std::vector<float> f) {
     PopulateTensor(projection_weights_, f);
   }
 
-  void SetProjectionBias(std::initializer_list<float> f) {
+  void SetProjectionBias(std::vector<float> f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -243,51 +241,51 @@ class HybridLSTMOpModel : public LSTMOpModel {
                     use_projection_weights, use_projection_bias, cell_clip,
                     proj_clip, input_shapes, TensorType_UINT8) {}
 
-  void SetInputToInputWeights(std::initializer_list<float> f) {
+  void SetInputToInputWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::initializer_list<float> f) {
+  void SetInputToForgetWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::initializer_list<float> f) {
+  void SetInputToCellWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::initializer_list<float> f) {
+  void SetInputToOutputWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToInputWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+  void SetRecurrentToForgetWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+  void SetRecurrentToCellWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToOutputWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::initializer_list<float> f) {
+  void SetCellToInputWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::initializer_list<float> f) {
+  void SetCellToForgetWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::initializer_list<float> f) {
+  void SetCellToOutputWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
   }
 
-  void SetProjectionWeights(std::initializer_list<float> f) {
+  void SetProjectionWeights(std::vector<float> f) {
     SymmetricQuantizeAndPopulate(projection_weights_, f);
   }
 };
@@ -295,22 +293,22 @@ class HybridLSTMOpModel : public LSTMOpModel {
 class BaseLstmTest : public ::testing::Test {
  protected:
   // Weights of the LSTM model. Some are optional.
-  std::initializer_list<float> input_to_input_weights_;
-  std::initializer_list<float> input_to_cell_weights_;
-  std::initializer_list<float> input_to_forget_weights_;
-  std::initializer_list<float> input_to_output_weights_;
-  std::initializer_list<float> input_gate_bias_;
-  std::initializer_list<float> cell_gate_bias_;
-  std::initializer_list<float> forget_gate_bias_;
-  std::initializer_list<float> output_gate_bias_;
-  std::initializer_list<float> recurrent_to_input_weights_;
-  std::initializer_list<float> recurrent_to_cell_weights_;
-  std::initializer_list<float> recurrent_to_forget_weights_;
-  std::initializer_list<float> recurrent_to_output_weights_;
-  std::initializer_list<float> cell_to_input_weights_;
-  std::initializer_list<float> cell_to_forget_weights_;
-  std::initializer_list<float> cell_to_output_weights_;
-  std::initializer_list<float> projection_weights_;
+  std::vector<float> input_to_input_weights_;
+  std::vector<float> input_to_cell_weights_;
+  std::vector<float> input_to_forget_weights_;
+  std::vector<float> input_to_output_weights_;
+  std::vector<float> input_gate_bias_;
+  std::vector<float> cell_gate_bias_;
+  std::vector<float> forget_gate_bias_;
+  std::vector<float> output_gate_bias_;
+  std::vector<float> recurrent_to_input_weights_;
+  std::vector<float> recurrent_to_cell_weights_;
+  std::vector<float> recurrent_to_forget_weights_;
+  std::vector<float> recurrent_to_output_weights_;
+  std::vector<float> cell_to_input_weights_;
+  std::vector<float> cell_to_forget_weights_;
+  std::vector<float> cell_to_output_weights_;
+  std::vector<float> projection_weights_;
 
   // LSTM input is stored as num_batch x num_inputs vector.
   std::vector<std::vector<float>> lstm_input_;
diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/maximum_minimum.cc
rename to tensorflow/lite/kernels/maximum_minimum.cc
index 7cb01465eef45cead2bcd4cbeffc70b599937dff..3bcaabf675eba4f528fe73b01610d915e7780f85 100644
--- a/tensorflow/contrib/lite/kernels/maximum_minimum.cc
+++ b/tensorflow/lite/kernels/maximum_minimum.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
rename to tensorflow/lite/kernels/maximum_minimum_test.cc
index fd4d5367c5a6369b5ffeeea30a910262bc0796a9..acb74e09d3fb47c33c6c146af4d0b1b1030491be 100644
--- a/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/mfcc.cc b/tensorflow/lite/kernels/mfcc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5b0212728e02b83bfb69a562ca948a7f44b2002
--- /dev/null
+++ b/tensorflow/lite/kernels/mfcc.cc
@@ -0,0 +1,154 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/internal/mfcc.h"
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/mfcc_dct.h"
+#include "tensorflow/lite/kernels/internal/mfcc_mel_filterbank.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace mfcc {
+
+enum KernelType {
+  kReference,
+};
+
+typedef struct {
+  float upper_frequency_limit;
+  float lower_frequency_limit;
+  int filterbank_channel_count;
+  int dct_coefficient_count;
+} TfLiteMfccParams;
+
+constexpr int kInputTensorWav = 0;
+constexpr int kInputTensorRate = 1;
+constexpr int kOutputTensor = 0;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new TfLiteMfccParams;
+
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  data->upper_frequency_limit = m["upper_frequency_limit"].AsInt64();
+  data->lower_frequency_limit = m["lower_frequency_limit"].AsInt64();
+  data->filterbank_channel_count = m["filterbank_channel_count"].AsInt64();
+  data->dct_coefficient_count = m["dct_coefficient_count"].AsInt64();
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<TfLiteMfccParams*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteMfccParams*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
+  const TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(inputWav), 3);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(inputRate), 1);
+
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, inputWav->type, output->type);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(3);
+  output_size->data[0] = inputWav->dims->data[0];
+  output_size->data[1] = inputWav->dims->data[1];
+  output_size->data[2] = params->dct_coefficient_count;
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+// Input is a single squared-magnitude spectrogram frame. The input spectrum
+// is converted to linear magnitude and weighted into bands using a
+// triangular mel filterbank, and a discrete cosine transform (DCT) of the
+// values is taken. Output is populated with the lowest dct_coefficient_count
+// of these values.
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteMfccParams*>(node->user_data);
+
+  const TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav);
+  const TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  const int32 sample_rate = *GetTensorData<int>(inputRate);
+
+  const int spectrogram_channels = inputWav->dims->data[2];
+  const int spectrogram_samples = inputWav->dims->data[1];
+  const int audio_channels = inputWav->dims->data[0];
+
+  internal::Mfcc mfcc;
+  mfcc.set_upper_frequency_limit(params->upper_frequency_limit);
+  mfcc.set_lower_frequency_limit(params->lower_frequency_limit);
+  mfcc.set_filterbank_channel_count(params->filterbank_channel_count);
+  mfcc.set_dct_coefficient_count(params->dct_coefficient_count);
+
+  mfcc.Initialize(spectrogram_channels, sample_rate);
+
+  const float* spectrogram_flat = GetTensorData<float>(inputWav);
+  float* output_flat = GetTensorData<float>(output);
+
+  for (int audio_channel = 0; audio_channel < audio_channels; ++audio_channel) {
+    for (int spectrogram_sample = 0; spectrogram_sample < spectrogram_samples;
+         ++spectrogram_sample) {
+      const float* sample_data =
+          spectrogram_flat +
+          (audio_channel * spectrogram_samples * spectrogram_channels) +
+          (spectrogram_sample * spectrogram_channels);
+      std::vector<double> mfcc_input(sample_data,
+                                     sample_data + spectrogram_channels);
+      std::vector<double> mfcc_output;
+      mfcc.Compute(mfcc_input, &mfcc_output);
+      TF_LITE_ENSURE_EQ(context, params->dct_coefficient_count,
+                        mfcc_output.size());
+      float* output_data = output_flat +
+                           (audio_channel * spectrogram_samples *
+                            params->dct_coefficient_count) +
+                           (spectrogram_sample * params->dct_coefficient_count);
+      for (int i = 0; i < params->dct_coefficient_count; ++i) {
+        output_data[i] = mfcc_output[i];
+      }
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace mfcc
+
+TfLiteRegistration* Register_MFCC() {
+  static TfLiteRegistration r = {mfcc::Init, mfcc::Free, mfcc::Prepare,
+                                 mfcc::Eval<mfcc::kReference>};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/mfcc_test.cc b/tensorflow/lite/kernels/mfcc_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/mfcc_test.cc
rename to tensorflow/lite/kernels/mfcc_test.cc
index fe692232227966e42008856ad5c0dd5c041ad0e2..ade5bf53d11f7dd26a80fd33a85599834fe0293f 100644
--- a/tensorflow/contrib/lite/kernels/mfcc_test.cc
+++ b/tensorflow/lite/kernels/mfcc_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/mirror_pad.cc b/tensorflow/lite/kernels/mirror_pad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e74e47f7a37b0f449fb2a63237e95066bb452de6
--- /dev/null
+++ b/tensorflow/lite/kernels/mirror_pad.cc
@@ -0,0 +1,374 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace mirror_pad {
+namespace {
+
+// Simple class that represents a mirror padded tensor - which is the output
+// from the Op.
+struct PaddedTensor {
+  // If not null that means this is a scalar value.
+  // Note: This is not owned by default. It will point to the value
+  // in the input tensor.
+  const void* value = nullptr;
+  // If this tensor is not one value, then this vector will have
+  // all the tensors that belongs to this tensor.
+  // Pointers are owned.
+  std::vector<std::unique_ptr<PaddedTensor>> values;
+  // Pointers to PaddedTensors that are padded on the left of the current
+  // tensor.
+  std::vector<PaddedTensor*> left_pad_ptrs;
+  // Pointers to PaddedTensors that are padded on the right of the current
+  // tensor.
+  std::vector<PaddedTensor*> right_pad_ptrs;
+
+  // Returns mutable pointer to the tensor identified by 'indices'.
+  PaddedTensor* GetMutable(const std::vector<int>& indices) {
+    auto* result = this;
+    for (int i = 0; i < indices.size(); ++i) {
+      if (indices[i] >= result->values.size()) {
+        return nullptr;
+      }
+      result = result->values[indices[i]].get();
+      if (result == nullptr) break;
+    }
+    return result;
+  }
+};
+
+// Util method to initialize the memory of the padded tensor.
+void InitializeTensorMemory(const TfLiteIntArray* const dims, int dim_index,
+                            int dims_size, PaddedTensor* padded_tensor) {
+  if (dim_index >= dims_size) {
+    return;
+  }
+  padded_tensor->values.reserve(dims->data[dim_index]);
+  for (int i = 0; i < dims->data[dim_index]; ++i) {
+    padded_tensor->values.emplace_back(new PaddedTensor());
+    InitializeTensorMemory(dims, dim_index + 1, dims_size,
+                           padded_tensor->values.back().get());
+  }
+}
+
+// Returns pointer to the value at the specified index in 'data'.
+inline const void* GetValuePointerAtIndex(const void* data, int index,
+                                          const TfLiteType data_type) {
+  switch (data_type) {
+    case kTfLiteFloat32:
+      return static_cast<const float*>(data) + index;
+    case kTfLiteInt32:
+      return static_cast<const int32_t*>(data) + index;
+    case kTfLiteUInt8:
+      return static_cast<const uint8_t*>(data) + index;
+    case kTfLiteInt64:
+      return static_cast<const int64_t*>(data) + index;
+    case kTfLiteBool:
+      return static_cast<const bool*>(data) + index;
+    case kTfLiteInt16:
+      return static_cast<const int16_t*>(data) + index;
+    case kTfLiteInt8:
+      return static_cast<const int8_t*>(data) + index;
+    // Unsupported types ?
+    default:
+      return nullptr;
+  }
+  return nullptr;
+}
+
+// Util method that increment index in the N-d array.
+void IncrementTensorIndex(const TfLiteIntArray* dims,
+                          std::vector<int>* tensor_index_ptr) {
+  int dimension_index = dims->size - 1;
+  auto& tensor_index = *tensor_index_ptr;
+  tensor_index[dimension_index]++;
+  while (dimension_index >= 0 &&
+         tensor_index[dimension_index] == dims->data[dimension_index]) {
+    tensor_index[dimension_index] = 0;
+    dimension_index--;
+    if (dimension_index >= 0) tensor_index[dimension_index]++;
+  }
+}
+
+// Fills the 'padded_tensor' with data from 'input_tensor'.
+TfLiteStatus InitFromInputTensor(const TfLiteTensor* input_tensor,
+                                 PaddedTensor* padded_tensor) {
+  const auto* dims = input_tensor->dims;
+  const auto data_type = input_tensor->type;
+  const void* data = static_cast<const void*>(input_tensor->data.raw_const);
+  // Either invalid input or unsupported type.+
+  if (data == nullptr) {
+    return kTfLiteError;
+  }
+  // Index of current processing tensor.
+  std::vector<int> tensor_index(dims->size, 0);
+  int flat_index = 0;
+  const int num_elements = NumElements(input_tensor);
+  while (flat_index < num_elements) {
+    auto* tensor = padded_tensor->GetMutable(tensor_index);
+    if (tensor == nullptr) {
+      return kTfLiteError;
+    }
+    tensor->value = GetValuePointerAtIndex(data, flat_index, data_type);
+    IncrementTensorIndex(dims, &tensor_index);
+    ++flat_index;
+  }
+
+  return kTfLiteOk;
+}
+
+template <typename T>
+inline void GetPadding(const T* data, int offset, int64_t* left_pad,
+                       int64_t* right_pad) {
+  *left_pad = static_cast<int64_t>(*(data + offset * 2));
+  *right_pad = static_cast<int64_t>(*(data + offset * 2 + 1));
+}
+
+inline TfLiteStatus GetPadding(const TfLiteTensor* padding_matrix,
+                               int dimension, int64_t* left_pad,
+                               int64_t* right_pad) {
+  switch (padding_matrix->type) {
+    case kTfLiteInt32:
+      GetPadding(padding_matrix->data.i32, dimension, left_pad, right_pad);
+      break;
+    case kTfLiteInt64:
+      GetPadding(padding_matrix->data.i64, dimension, left_pad, right_pad);
+      break;
+    default:
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus ValidateTensor(const TfLiteTensor* padding_matrix, int offset,
+                            int dimension_index, PaddedTensor* padded_tensor,
+                            TfLiteContext* context) {
+  if (dimension_index >= padding_matrix->dims->data[0]) {
+    return kTfLiteOk;
+  }
+
+  int64_t left_pad = 0, right_pad = 0;
+  TF_LITE_ENSURE_STATUS(
+      GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad));
+  // If we are not going to include border we must have enough values
+  // to use.
+  if (left_pad + offset > padded_tensor->values.size()) {
+    context->ReportError(
+        context, "Not enough values for Mirror Pad, required %d, available %d.",
+        left_pad + offset, padded_tensor->values.size());
+    return kTfLiteError;
+  }
+  if (right_pad + offset > padded_tensor->values.size()) {
+    context->ReportError(
+        context, "Not enough values for Mirror Pad, required %d, available %d.",
+        right_pad + offset, padded_tensor->values.size());
+    return kTfLiteError;
+  }
+  if (!padded_tensor->values.empty()) {
+    ValidateTensor(padding_matrix, offset, dimension_index + 1,
+                   padded_tensor->values[0].get(), context);
+  }
+  return kTfLiteOk;
+}
+
+// Fills 'padded_tensor' with the padding information based on
+// 'padding_matrix'.
+// 'dimension_index' represents which dimension the function is operating on.
+TfLiteStatus PadTensor(const TfLiteTensor* padding_matrix, int offset,
+                       int dimension_index, PaddedTensor* padded_tensor,
+                       TfLiteContext* context) {
+  if (dimension_index >= padding_matrix->dims->data[0]) return kTfLiteOk;
+
+  int64_t left_pad = 0, right_pad = 0;
+  TF_LITE_ENSURE_STATUS(
+      GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad));
+
+  for (int i = left_pad + offset - 1; i >= offset && left_pad > 0;
+       --i, --left_pad) {
+    padded_tensor->left_pad_ptrs.push_back(padded_tensor->values[i].get());
+  }
+  for (int i = padded_tensor->values.size() - (1 + offset);
+       i >= 0 && right_pad > 0; --i, --right_pad) {
+    padded_tensor->right_pad_ptrs.push_back(padded_tensor->values[i].get());
+  }
+
+  for (auto& tensor : padded_tensor->values) {
+    TF_LITE_ENSURE_STATUS(PadTensor(padding_matrix, offset, dimension_index + 1,
+                                    tensor.get(), context));
+  }
+  return kTfLiteOk;
+}
+
+// Fills 'output_data' with data from 'padded_tensor'.
+// The function does this recursively by setting left padding first then
+// original data, followed by the right padding.
+template <typename T>
+int FillOutput(const PaddedTensor* padded_tensor, T* output_data,
+               int index_in_output) {
+  if (padded_tensor == nullptr || output_data == nullptr) {
+    return -1;
+  }
+  if (padded_tensor->value != nullptr) {
+    output_data[index_in_output] = *static_cast<const T*>(padded_tensor->value);
+    return index_in_output + 1;
+  }
+  for (const auto* tensor : padded_tensor->left_pad_ptrs) {
+    index_in_output = FillOutput(tensor, output_data, index_in_output);
+  }
+  for (const auto& tensor : padded_tensor->values) {
+    index_in_output = FillOutput(tensor.get(), output_data, index_in_output);
+  }
+  for (const auto* tensor : padded_tensor->right_pad_ptrs) {
+    index_in_output = FillOutput(tensor, output_data, index_in_output);
+  }
+  return index_in_output;
+}
+
+// Returns the shape of the final output after padding.
+std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> GetPaddedOutputShape(
+    const TfLiteTensor* input, const TfLiteTensor* padding_matrix) {
+  const int input_dims = NumDimensions(input);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(input_dims), TfLiteIntArrayFree);
+
+  int64_t left_pad = 0, right_pad = 0;
+  for (int i = 0; i < input_dims; ++i) {
+    GetPadding(padding_matrix, i, &left_pad, &right_pad);
+    shape->data[i] = SizeOfDimension(input, i) + left_pad + right_pad;
+  }
+  return shape;
+}
+
+}  // namespace
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input_tensor = GetInput(context, node, 0);
+  const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
+  auto* params =
+      reinterpret_cast<TfLiteMirrorPaddingParams*>(node->builtin_data);
+
+  if (params == nullptr) {
+    return kTfLiteError;
+  }
+  const int input_dims = NumDimensions(input_tensor);
+
+  TfLiteTensor* output_tensor = GetOutput(context, node, 0);
+  if (IsDynamicTensor(output_tensor)) {
+    auto output_size = GetPaddedOutputShape(input_tensor, padding_matrix);
+    if (output_size == nullptr) {
+      return kTfLiteError;
+    }
+    TF_LITE_ENSURE_STATUS(
+        context->ResizeTensor(context, output_tensor, output_size.release()));
+  }
+
+  PaddedTensor padded_tensor;
+  // Initialize memory.
+  InitializeTensorMemory(input_tensor->dims, 0, input_dims, &padded_tensor);
+  // Set the values from the input_tensor.
+  TF_LITE_ENSURE_STATUS(InitFromInputTensor(input_tensor, &padded_tensor));
+
+  const int offset =
+      params->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect ? 0
+                                                                           : 1;
+  // Make sure padding values are sufficient and valid to use.
+  TF_LITE_ENSURE_STATUS(
+      ValidateTensor(padding_matrix, offset, 0, &padded_tensor, context));
+  // Apply padding.
+  TF_LITE_ENSURE_STATUS(
+      PadTensor(padding_matrix, offset, 0, &padded_tensor, context));
+
+  // Fill the output tensor from the padded tensor.
+  TfLiteStatus status = kTfLiteOk;
+
+#define TF_LITE_MIRROR_PAD(type) \
+  FillOutput(&padded_tensor, GetTensorData<type>(output_tensor), 0);
+
+  switch (output_tensor->type) {
+    case kTfLiteFloat32: {
+      TF_LITE_MIRROR_PAD(float);
+      break;
+    }
+    case kTfLiteInt32: {
+      TF_LITE_MIRROR_PAD(int32_t);
+      break;
+    }
+    case kTfLiteUInt8: {
+      TF_LITE_MIRROR_PAD(uint8_t);
+      break;
+    }
+    case kTfLiteInt64: {
+      TF_LITE_MIRROR_PAD(int64_t);
+      break;
+    }
+    default:
+      status = kTfLiteError;
+      break;
+  }
+#undef TF_LITE_MIRROR_PAD
+  return status;
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input_tensor = GetInput(context, node, 0);
+  const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
+  TfLiteTensor* output_tensor = GetOutput(context, node, 0);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(padding_matrix), 2);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(padding_matrix, 0),
+                    NumDimensions(input_tensor));
+
+  if (!IsConstantTensor(padding_matrix)) {
+    SetTensorToDynamic(output_tensor);
+    return kTfLiteOk;
+  }
+  // We have constant padding, so we can infer output size.
+
+  auto output_size = GetPaddedOutputShape(input_tensor, padding_matrix);
+  if (output_size == nullptr) {
+    return kTfLiteError;
+  }
+  return context->ResizeTensor(context, output_tensor, output_size.release());
+}
+
+}  // namespace mirror_pad
+TfLiteRegistration* Register_MIRROR_PAD() {
+  static TfLiteRegistration r = {mirror_pad::Init, mirror_pad::Free,
+                                 mirror_pad::Prepare, mirror_pad::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/mirror_pad_test.cc b/tensorflow/lite/kernels/mirror_pad_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd09e6e4493d3a29bffecfcd4a4d1946840a4e5e
--- /dev/null
+++ b/tensorflow/lite/kernels/mirror_pad_test.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class BaseMirrorPadOpModel : public SingleOpModel {
+ public:
+  BaseMirrorPadOpModel(const TensorData& input,
+                       const TensorData& padding_matrix,
+                       const TensorData& output,
+                       const tflite::MirrorPadMode mode) {
+    input_id_ = AddInput(input);
+    padding_matrix_id_ = AddInput(padding_matrix);
+    output_id_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MIRROR_PAD, BuiltinOptions_MirrorPadOptions,
+                 CreateMirrorPadOptions(builder_, mode).Union());
+    BuildInterpreter({GetShape(input_id_), GetShape(padding_matrix_id_)});
+  }
+
+  int input_tensor_id() { return input_id_; }
+  int padding_matrix_tensor_id() { return padding_matrix_id_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_id_); }
+
+ protected:
+  int input_id_;
+  int padding_matrix_id_;
+  int output_id_;
+};
+
+TEST(MirrorPadTest, EmptyPad) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 0, 0, 0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(MirrorPadTest, PadOneSide_right_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 1, 0, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 2, 4, 5, 6, 5, 1, 2, 3, 2}));
+}
+
+TEST(MirrorPadTest, PadOneSide_left_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 0, 1, 0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({5, 4, 5, 6, 2, 1, 2, 3, 5, 4, 5, 6}));
+}
+
+TEST(MirrorPadTest, PadOneSide_right_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 1, 0, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 3, 4, 5, 6, 6, 4, 5, 6, 6}));
+}
+
+TEST(MirrorPadTest, PadOneSide_left_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 0, 1, 0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 1, 2, 3, 1, 1, 2, 3, 4, 4, 5, 6}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 1, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 1, 2, 3, 3, 1, 1, 2, 3, 3,
+                                4, 4, 5, 6, 6, 4, 4, 5, 6, 6}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 1, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({5, 4, 5, 6, 5, 2, 1, 2, 3, 2,
+                                5, 4, 5, 6, 5, 2, 1, 2, 3, 2}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Symmetric_Whole) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {2, 2, 3, 3});
+  model.Invoke();
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1,
+                        3, 2, 1, 1, 2, 3, 3, 2, 1, 6, 5, 4, 4, 5, 6, 6, 5, 4,
+                        6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Reflect_Whole) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 2, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1,
+                                6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1}));
+}
+
+TEST(MirrorPadTest, Pad_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 2, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({2, 1, 1, 2, 3, 3, 2, 2, 1, 1, 2, 3, 3, 2,
+                                5, 4, 4, 5, 6, 6, 5, 5, 4, 4, 5, 6, 6, 5}));
+}
+
+TEST(MirrorPadTest, Pad_1D_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {3}}, {TensorType_INT32, {1, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 2, 1}));
+}
+
+TEST(MirrorPadTest, Pad_1D_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {3}}, {TensorType_INT32, {1, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 3, 2}));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/lite/kernels/mul.cc
similarity index 87%
rename from tensorflow/contrib/lite/kernels/mul.cc
rename to tensorflow/lite/kernels/mul.cc
index e0aac8a84244ddb048e6055aa16c6a34e0f1e2c3..01039a705438af2a92a68b01c2146daf69c46250 100644
--- a/tensorflow/contrib/lite/kernels/mul.cc
+++ b/tensorflow/lite/kernels/mul.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -153,26 +153,34 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            const TfLiteTensor* input2, TfLiteTensor* output) {
   if (input1->type == kTfLiteUInt8 && input2->type == kTfLiteUInt8 &&
       output->type == kTfLiteUInt8) {
+    tflite::ArithmeticParams op_params;
+    SetActivationParams(data->output_activation_min,
+                        data->output_activation_max, &op_params);
+    op_params.input1_offset = -input1->params.zero_point;
+    op_params.input2_offset = -input2->params.zero_point;
+    op_params.output_offset = output->params.zero_point;
+    op_params.output_multiplier = data->output_multiplier;
+    op_params.output_shift = data->output_shift;
+    bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
+        GetTensorShape(input1), GetTensorShape(input2), &op_params);
 #define TF_LITE_MUL(type, opname)                                      \
-  tflite::ArithmeticParams op_params;                                  \
-  SetActivationParams(data->output_activation_min,                     \
-                      data->output_activation_max, &op_params);        \
-  op_params.input1_offset = -input1->params.zero_point;                \
-  op_params.input2_offset = -input2->params.zero_point;                \
-  op_params.output_offset = output->params.zero_point;                 \
-  op_params.output_multiplier = data->output_multiplier;               \
-  op_params.output_shift = data->output_shift;                         \
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
                GetTensorData<uint8_t>(input2), GetTensorShape(output), \
                GetTensorData<uint8_t>(output))
 
-    // The quantized version of Mul doesn't support activations, so we
-    // always use BroadcastMul.
     if (kernel_type == kReference) {
-      TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
+      if (need_broadcast) {
+        TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
+      } else {
+        TF_LITE_MUL(reference_ops, Mul);
+      }
     } else {
-      TF_LITE_MUL(optimized_ops, BroadcastMul4DSlow);
+      if (need_broadcast) {
+        TF_LITE_MUL(optimized_ops, BroadcastMulFivefold);
+      } else {
+        TF_LITE_MUL(optimized_ops, Mul);
+      }
     }
 #undef TF_LITE_MUL
   } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc b/tensorflow/lite/kernels/mul_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/mul_test.cc
rename to tensorflow/lite/kernels/mul_test.cc
index 2807550a6b07f3f9f1f1e3f72acc9882c76d166a..200cc26dadc3527813a7dabd3b9ca4811d4c8856 100644
--- a/tensorflow/contrib/lite/kernels/mul_test.cc
+++ b/tensorflow/lite/kernels/mul_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -107,7 +107,7 @@ TEST(FloatMulOpTest, ActivationRELU_N1_TO_1) {
 }
 
 TEST(FloatMulOpTest, VariousInputShapes) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
@@ -124,7 +124,7 @@ TEST(FloatMulOpTest, VariousInputShapes) {
 }
 
 TEST(FloatMulOpTest, WithBroadcast) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
@@ -161,7 +161,7 @@ TEST(IntegerMulOpTest, ActivationRELU_N1_TO_1) {
 }
 
 TEST(IntegerMulOpTest, VariousInputShapes) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     IntegerMulOpModel m({TensorType_INT32, test_shapes[i]},
@@ -176,7 +176,7 @@ TEST(IntegerMulOpTest, VariousInputShapes) {
 }
 
 TEST(IntegerMulOpTest, WithBroadcast) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     IntegerMulOpModel m({TensorType_INT32, test_shapes[i]},
@@ -245,7 +245,7 @@ float GetTolerance(int min, int max) {
 
 TEST(QuantizedMulOpTest, WithBroadcast) {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
diff --git a/tensorflow/contrib/lite/kernels/neg.cc b/tensorflow/lite/kernels/neg.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/neg.cc
rename to tensorflow/lite/kernels/neg.cc
index 0ddd0644f5a1cc9e271e9feb04f2b0053ffe0937..e9a1aa232542308e4ee091f9b7cb9e8e75330bfd 100644
--- a/tensorflow/contrib/lite/kernels/neg.cc
+++ b/tensorflow/lite/kernels/neg.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/neg_test.cc b/tensorflow/lite/kernels/neg_test.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/neg_test.cc
rename to tensorflow/lite/kernels/neg_test.cc
index 3d3594c60bbe1684dff7b1816f5f8a715b1abc60..d461ede3c480e2b9611cade2e8eeb5189387b264 100644
--- a/tensorflow/contrib/lite/kernels/neg_test.cc
+++ b/tensorflow/lite/kernels/neg_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/one_hot.cc b/tensorflow/lite/kernels/one_hot.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/one_hot.cc
rename to tensorflow/lite/kernels/one_hot.cc
index 910aed6f142dc9c8af446fe83cacb4f714882357..2ac12fe9308f382357bd708263640793b0eec371 100644
--- a/tensorflow/contrib/lite/kernels/one_hot.cc
+++ b/tensorflow/lite/kernels/one_hot.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/one_hot_test.cc b/tensorflow/lite/kernels/one_hot_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/one_hot_test.cc
rename to tensorflow/lite/kernels/one_hot_test.cc
index 6b604ec7a7f86b333805d91a95cb5054f0257ae4..85438327e7e3a31ec22a0361e7f503e78b465ffb 100644
--- a/tensorflow/contrib/lite/kernels/one_hot_test.cc
+++ b/tensorflow/lite/kernels/one_hot_test.cc
@@ -16,10 +16,10 @@ limitations under the License.
 #include <initializer_list>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/lite/kernels/op_macros.h
similarity index 86%
rename from tensorflow/contrib/lite/kernels/op_macros.h
rename to tensorflow/lite/kernels/op_macros.h
index 11e814daee12b67ddc98f8e80bb9feec4505e38d..1a54a378b03d6440a254ee12ac51d5976cc0a815 100644
--- a/tensorflow/contrib/lite/kernels/op_macros.h
+++ b/tensorflow/lite/kernels/op_macros.h
@@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
+#ifndef TENSORFLOW_LITE_KERNELS_OP_MACROS_H_
+#define TENSORFLOW_LITE_KERNELS_OP_MACROS_H_
 
 // If we're on a platform without standard IO functions, fall back to a
 // non-portable function.
 #ifdef TF_LITE_MCU_DEBUG_LOG
 
-// This header is pulled in from the support library at
-// https://github.com/google/stm32_bare_lib
-#include <debug_log.h>
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 
 #define DEBUG_LOG(x) \
   do {               \
@@ -69,4 +67,4 @@ inline void InfiniteLoop() {
     if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \
   } while (0)
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_OP_MACROS_H_
+#endif  // TENSORFLOW_LITE_KERNELS_OP_MACROS_H_
diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/lite/kernels/optional_tensor_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/kernels/optional_tensor_test.cc
rename to tensorflow/lite/kernels/optional_tensor_test.cc
index 90a915bb023b2b3db86e8334e93e2f1d41e0a9f2..a09f86015894c457525c5ce7638aa4a05b77a211 100644
--- a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
+++ b/tensorflow/lite/kernels/optional_tensor_test.cc
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc
similarity index 85%
rename from tensorflow/contrib/lite/kernels/pack.cc
rename to tensorflow/lite/kernels/pack.cc
index c368582ef76c725d575e1b2c50d08d656c876eca..479495c875dac5d4e827864548c6b4a188e284ee 100644
--- a/tensorflow/contrib/lite/kernels/pack.cc
+++ b/tensorflow/lite/kernels/pack.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
@@ -40,10 +40,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // TODO(renjieliu): Support negative axis.
   TF_LITE_ENSURE(context, data->axis >= 0);
   if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32 &&
-      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16) {
-    context->ReportError(context,
-                         "Currently pack only supports "
-                         "float32/uint8/int16/int32.");
+      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16 &&
+      input0->type != kTfLiteInt64) {
+    context->ReportError(context, "Type '%s' is not supported by pack.",
+                         TfLiteTypeGetName(input0->type));
     return kTfLiteError;
   }
   // Make sure all inputs have the same shape and type.
@@ -111,10 +111,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       PackImpl<int32_t>(context, node, output, data->values_count, data->axis);
       break;
     }
+    case kTfLiteInt64: {
+      PackImpl<int64_t>(context, node, output, data->values_count, data->axis);
+      break;
+    }
     default: {
-      context->ReportError(context,
-                           "Currently pack only supports "
-                           "float32/uint8/int32.");
+      context->ReportError(context, "Type '%s' is not supported by pack.",
+                           TfLiteTypeGetName(output->type));
       return kTfLiteError;
     }
   }
diff --git a/tensorflow/contrib/lite/kernels/pack_test.cc b/tensorflow/lite/kernels/pack_test.cc
similarity index 77%
rename from tensorflow/contrib/lite/kernels/pack_test.cc
rename to tensorflow/lite/kernels/pack_test.cc
index c70dbd2764b615530a9587b521a3616eece92cb6..4f58debc5c872ea640ed97cd51884a39b412ff2f 100644
--- a/tensorflow/contrib/lite/kernels/pack_test.cc
+++ b/tensorflow/lite/kernels/pack_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -113,6 +113,40 @@ TEST(PackOpTest, Int32MultilDimensions) {
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
+// int64 tests.
+TEST(PackOpTest, Int64ThreeInputs) {
+  PackOpModel<int64_t> model({TensorType_INT64, {2}}, 0, 3);
+  model.SetInput(0, {1LL << 33, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, -(1LL << 34)});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1LL << 33, 4LL, 2LL, 5LL, 3LL, -(1LL << 34)}));
+}
+
+TEST(PackOpTest, Int64ThreeInputsDifferentAxis) {
+  PackOpModel<int64_t> model({TensorType_INT64, {2}}, 1, 3);
+  model.SetInput(0, {1LL << 33, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, -(1LL << 34)});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1LL << 33, 2LL, 3LL, 4LL, 5LL, -(1LL << 34)}));
+}
+
+TEST(PackOpTest, Int64MultilDimensions) {
+  PackOpModel<int64_t> model({TensorType_INT64, {2, 3}}, 1, 2);
+  model.SetInput(0, {1LL << 33, 2, 3, 4, 5, 6});
+  model.SetInput(1, {7, 8, -(1LL << 34), 10, 11, 12});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1LL << 33, 2LL, 3LL, 7LL, 8LL, -(1LL << 34),
+                                4LL, 5LL, 6LL, 10LL, 11LL, 12LL}));
+}
+
 // uint8
 TEST(PackOpTest, Uint8ThreeInputs) {
   PackOpModel<uint8_t> model({TensorType_UINT8, {2}}, 0, 3);
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/lite/kernels/pad.cc
similarity index 77%
rename from tensorflow/contrib/lite/kernels/pad.cc
rename to tensorflow/lite/kernels/pad.cc
index 0d939405f62c3e9554dc8b2d02f77a47bb0a1877..8e6ed6e741f782f070714164a7af7b4f98a1558f 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/lite/kernels/pad.cc
@@ -14,13 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -44,12 +44,24 @@ struct PadContext {
     }
     output = GetOutput(context, node, 0);
     dims = NumDimensions(input);
+
+    resizing_category = ResizingCategory::kGenericResize;
+    const int paddings_total = GetTensorShape(paddings).FlatSize();
+    const int32* paddings_data = GetTensorData<int32>(paddings);
+    // Paddings will be a n,2 array, and we need to detect 4D arrays with the
+    // pattern { {0,0}, {a, b}, {c, d}, {0,0} }.
+    if (IsConstantTensor(paddings) && paddings_total == 8 &&
+        (paddings_data[0] == 0 && paddings_data[1] == 0) &&
+        (paddings_data[6] == 0 && paddings_data[7] == 0)) {
+      resizing_category = ResizingCategory::kImageStyle;
+    }
   }
   const TfLiteTensor* constant_values;
   const TfLiteTensor* input;
   const TfLiteTensor* paddings;
   TfLiteTensor* output;
   int dims;
+  ResizingCategory resizing_category;
 };
 
 // Resizes output array based on the input size and padding size. This function
@@ -134,31 +146,39 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     after_padding.push_back(paddings_data[idx * 2 + 1]);
   }
 
-#define TF_LITE_PAD(type, scalar, pad_value)                             \
-  TF_LITE_ENSURE(context, before_padding.size() <= 4);                   \
-  TF_LITE_ENSURE(context, after_padding.size() <= 4);                    \
-  tflite::PadParams op_params;                                           \
-  op_params.left_padding_count = before_padding.size();                  \
-  op_params.right_padding_count = after_padding.size();                  \
-  for (int i = 0; i < op_context.dims; ++i) {                            \
-    op_params.left_padding[i] = before_padding[op_context.dims - 1 - i]; \
-    op_params.right_padding[i] = after_padding[op_context.dims - 1 - i]; \
-  }                                                                      \
-  const scalar pad_value_copy = pad_value;                               \
-                                                                         \
-  type::Pad(op_params, GetTensorShape(op_context.input),                 \
-            GetTensorData<scalar>(op_context.input), &pad_value_copy,    \
-            GetTensorShape(op_context.output),                           \
-            GetTensorData<scalar>(op_context.output))
+#define TF_LITE_PAD(type, op_name, scalar, pad_value)                     \
+  TF_LITE_ENSURE(context, before_padding.size() <= 4);                    \
+  TF_LITE_ENSURE(context, after_padding.size() <= 4);                     \
+  tflite::PadParams op_params;                                            \
+  op_params.left_padding_count = before_padding.size();                   \
+  op_params.right_padding_count = after_padding.size();                   \
+  for (int i = 0; i < op_context.dims; ++i) {                             \
+    op_params.left_padding[i] = before_padding[op_context.dims - 1 - i];  \
+    op_params.right_padding[i] = after_padding[op_context.dims - 1 - i];  \
+  }                                                                       \
+  const scalar pad_value_copy = pad_value;                                \
+                                                                          \
+  type::op_name(op_params, GetTensorShape(op_context.input),              \
+                GetTensorData<scalar>(op_context.input), &pad_value_copy, \
+                GetTensorShape(op_context.output),                        \
+                GetTensorData<scalar>(op_context.output))
   switch (op_context.input->type) {
     case kTfLiteFloat32: {
       float pad_value = op_context.constant_values == nullptr
                             ? 0.f
                             : *GetTensorData<float>(op_context.constant_values);
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, float, pad_value);
+        if (op_context.resizing_category == ResizingCategory::kImageStyle) {
+          TF_LITE_PAD(reference_ops, PadImageStyle, float, pad_value);
+        } else {
+          TF_LITE_PAD(reference_ops, Pad, float, pad_value);
+        }
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, float, pad_value);
+        if (op_context.resizing_category == ResizingCategory::kImageStyle) {
+          TF_LITE_PAD(optimized_ops, PadImageStyle, float, pad_value);
+        } else {
+          TF_LITE_PAD(optimized_ops, Pad, float, pad_value);
+        }
       }
     } break;
     case kTfLiteUInt8: {
@@ -181,9 +201,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         pad_value = *GetTensorData<uint8_t>(op_context.constant_values);
       }
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, uint8_t, pad_value);
+        if (op_context.resizing_category == ResizingCategory::kImageStyle) {
+          TF_LITE_PAD(reference_ops, PadImageStyle, uint8_t, pad_value);
+        } else {
+          TF_LITE_PAD(reference_ops, Pad, uint8_t, pad_value);
+        }
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, uint8_t, pad_value);
+        if (op_context.resizing_category == ResizingCategory::kImageStyle) {
+          TF_LITE_PAD(optimized_ops, PadImageStyle, uint8_t, pad_value);
+        } else {
+          TF_LITE_PAD(optimized_ops, Pad, uint8_t, pad_value);
+        }
       }
     } break;
     case kTfLiteInt32: {
@@ -192,9 +220,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
               ? 0
               : *GetTensorData<int32_t>(op_context.constant_values);
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int32_t, pad_value);
+        TF_LITE_PAD(reference_ops, Pad, int32_t, pad_value);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int32_t, pad_value);
+        TF_LITE_PAD(optimized_ops, Pad, int32_t, pad_value);
       }
     } break;
     case kTfLiteInt64: {
@@ -203,9 +231,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
               ? 0L
               : *GetTensorData<int64_t>(op_context.constant_values);
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int64_t, pad_value);
+        TF_LITE_PAD(reference_ops, Pad, int64_t, pad_value);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int64_t, pad_value);
+        TF_LITE_PAD(optimized_ops, Pad, int64_t, pad_value);
       }
     } break;
     default:
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/pad_test.cc
rename to tensorflow/lite/kernels/pad_test.cc
index f663899713f6f9878c8e9390ca5db790f9d79b21..415a285c707e6aa7a5a2029822cdf54d57692839 100644
--- a/tensorflow/contrib/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -58,19 +58,6 @@ class PadOpModel : public SingleOpModel {
   int constant_values_;
 };
 
-namespace {
-
-// Returns the corresponding TensorType given the type T.
-template <typename T>
-TensorType GetTensorType() {
-  if (std::is_same<T, float>::value) return TensorType_FLOAT32;
-  if (std::is_same<T, int32_t>::value) return TensorType_INT32;
-  if (std::is_same<T, uint8_t>::value) return TensorType_UINT8;
-  return TensorType_MIN;  // default value
-}
-
-}  // namespace
-
 // Tests case where paddings is a const tensor. Type T is the dtype.
 template <typename T>
 class PadV2OpConstModel : public PadOpModel<T> {
@@ -210,6 +197,19 @@ TEST(PadOpTest, InvalidPadValue) {
 }
 
 TEST(PadOpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                    {1, 1, 0, 0, 1, 1, 0, 0}, {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0,
+                                0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2, 4, 1}));
+}
+
+TEST(PadOpTest, SimpleConstImageStyleTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
   PadOpConstModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
@@ -221,6 +221,26 @@ TEST(PadOpTest, SimpleConstTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
+// Optimized versions may choose to handle zero-sized images differently.
+TEST(PadOpTest, ZeroHeightConstImageStyleTest) {
+  PadOpConstModel m({TensorType_FLOAT32, {1, 0, 2, 1}}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_FLOAT32});
+  // Nothing to SetInput().
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 4, 1}));
+}
+
+// Optimized versions may choose to handle zero-sized images differently.
+TEST(PadOpTest, ZeroWidthConstImageStyleTest) {
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 0, 1}}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_FLOAT32});
+  // Nothing to SetInput().
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 2, 1}));
+}
+
 TEST(PadOpTest, SimpleConst1DTest) {
   PadOpConstModel m({TensorType_FLOAT32, {2}}, {1, 2}, {1, 2},
                     {TensorType_FLOAT32});
@@ -242,6 +262,19 @@ TEST(PadOpTest, SimpleDynamicTest) {
 }
 
 TEST(PadOpTest, AdvancedConstTest) {
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                    {1, 0, 0, 2, 0, 3, 0, 0}, {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                        0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 4, 5,
+                        6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 4, 6, 1}));
+}
+
+TEST(PadOpTest, AdvancedConstImageStyleTest) {
   PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
                     {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
diff --git a/tensorflow/contrib/lite/kernels/padding.h b/tensorflow/lite/kernels/padding.h
similarity index 90%
rename from tensorflow/contrib/lite/kernels/padding.h
rename to tensorflow/lite/kernels/padding.h
index 42b6b45d3bfc4ae1c4b4fdc93f838a9b165ed580..30aa4f1bd330e2866091948b89786ea2b6290a9e 100644
--- a/tensorflow/contrib/lite/kernels/padding.h
+++ b/tensorflow/lite/kernels/padding.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#ifndef TENSORFLOW_LITE_KERNELS_PADDING_H_
+#define TENSORFLOW_LITE_KERNELS_PADDING_H_
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
 
 namespace tflite {
 
@@ -55,4 +55,4 @@ inline TfLitePaddingValues ComputePaddingHeightWidth(
 }
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#endif  // TENSORFLOW_LITE_KERNELS_PADDING_H_
diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/lite/kernels/pooling.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/pooling.cc
rename to tensorflow/lite/kernels/pooling.cc
index 6451142391599e3279b7d20dd0c2a941c909dbdc..694a36ffbcf3c8c9d8fe65e1b922ca03921883b3 100644
--- a/tensorflow/contrib/lite/kernels/pooling.cc
+++ b/tensorflow/lite/kernels/pooling.cc
@@ -19,14 +19,14 @@ limitations under the License.
 #include <iostream>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/kernels/padding.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/kernels/padding.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/pooling_test.cc b/tensorflow/lite/kernels/pooling_test.cc
similarity index 75%
rename from tensorflow/contrib/lite/kernels/pooling_test.cc
rename to tensorflow/lite/kernels/pooling_test.cc
index 01c91b2ba905e249c36af19f175c68a7e7f17f6d..98777f1c13ff97551c05cddc1d319918ea6ed69a 100644
--- a/tensorflow/contrib/lite/kernels/pooling_test.cc
+++ b/tensorflow/lite/kernels/pooling_test.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -67,6 +67,10 @@ class QuantizedPoolingOpModel : public BasePoolingOpModel {
     QuantizeAndPopulate<uint8_t>(input_, data);
   }
 
+  void SetInput(const std::vector<float>& data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
   std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
   std::vector<float> GetDequantizedOutput() {
     return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
@@ -106,6 +110,45 @@ TEST(QuantizedPoolingOpTest, AveragePool) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({44, 92}));
 }
 
+// Send in a white image, expect a white pixel.
+TEST(QuantizedPoolingOpTest, AveragePoolImageSize16) {
+  int image_size = 16;
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, image_size, image_size, 1}, 0, 16},
+      /*filter_width=*/image_size,
+      /*filter_height=*/image_size,
+      /*output=*/{TensorType_UINT8, {}, 0, 16});
+
+  std::vector<float> input(image_size * image_size, 16.f);
+  m.SetInput(input);
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ::testing::ElementsAre(255));
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({16})));
+}
+
+// Send in a white image, expect something other than a white pixel, due to
+// overflow.
+TEST(QuantizedPoolingOpTest, AveragePoolImageSize17) {
+  int image_size = 17;
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, image_size, image_size, 1}, 0, 16},
+      /*filter_width=*/image_size,
+      /*filter_height=*/image_size,
+      /*output=*/{TensorType_UINT8, {}, 0, 16});
+
+  std::vector<float> input(image_size * image_size, 16.f);
+  m.SetInput(input);
+  m.Invoke();
+
+  // Ordinarily we would see '255' here. However, the optimized version of
+  // AveragePool uses a uint16 accumulator which causes it to overflow for
+  // images this large.
+  EXPECT_THAT(m.GetOutput(), ::testing::ElementsAre(28));
+}
+
 TEST(FloatPoolingOpTest, MaxPool) {
   FloatPoolingOpModel m(BuiltinOperator_MAX_POOL_2D,
                         /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
diff --git a/tensorflow/contrib/lite/kernels/pow.cc b/tensorflow/lite/kernels/pow.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/pow.cc
rename to tensorflow/lite/kernels/pow.cc
index 1e96cc80b167914a37aadf39739730a0f8793983..9f84e1cc5e6d838adc62c2f3a69fd122f6d3195c 100644
--- a/tensorflow/contrib/lite/kernels/pow.cc
+++ b/tensorflow/lite/kernels/pow.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/pow_test.cc b/tensorflow/lite/kernels/pow_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/pow_test.cc
rename to tensorflow/lite/kernels/pow_test.cc
index 74b3aef5bd39d8bdb6375f24bd00d793889deef8..60d674e9779f0a94af667affb86e4d11e85a0a11 100644
--- a/tensorflow/contrib/lite/kernels/pow_test.cc
+++ b/tensorflow/lite/kernels/pow_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/range.cc b/tensorflow/lite/kernels/range.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eefe5db1ecee7a762c88a97959adf35411726646
--- /dev/null
+++ b/tensorflow/lite/kernels/range.cc
@@ -0,0 +1,171 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace range {
+namespace {
+
+constexpr int kStartTensor = 0;
+constexpr int kLimitTensor = 1;
+constexpr int kDeltaTensor = 2;
+constexpr int kOutputTensor = 0;
+
+template <typename T>
+TfLiteStatus GetSize(TfLiteContext* context, T start, T limit, T delta,
+                     int* size) {
+  TF_LITE_ENSURE(context, !std::equal_to<T>()(delta, 0));
+  TF_LITE_ENSURE(context,
+                 (start > limit && delta < 0) || (start < limit && delta > 0));
+  *size =
+      (std::is_integral<T>::value
+           ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
+           : std::ceil(std::abs((limit - start) / delta)));
+  return kTfLiteOk;
+}
+
+TfLiteStatus ResizeOutput(TfLiteContext* context, const TfLiteTensor* start,
+                          const TfLiteTensor* limit, const TfLiteTensor* delta,
+                          TfLiteTensor* output) {
+  // The output will always be a 1-d array.
+  int size = 0;
+  switch (start->type) {
+    case kTfLiteInt32: {
+      TF_LITE_ENSURE_OK(context,
+                        GetSize(context, *GetTensorData<int32_t>(start),
+                                *GetTensorData<int32_t>(limit),
+                                *GetTensorData<int32_t>(delta), &size));
+      break;
+    }
+    case kTfLiteFloat32: {
+      TF_LITE_ENSURE_OK(context, GetSize(context, *GetTensorData<float>(start),
+                                         *GetTensorData<float>(limit),
+                                         *GetTensorData<float>(delta), &size));
+      break;
+    }
+    default: {
+      context->ReportError(context, "Unknown data type: %d", start->type);
+      return kTfLiteError;
+    }
+  }
+  TfLiteIntArray* output_shape_array = TfLiteIntArrayCreate(1);
+  output_shape_array->data[0] = size;
+  return context->ResizeTensor(context, output, output_shape_array);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* start = GetInput(context, node, kStartTensor);
+  const TfLiteTensor* limit = GetInput(context, node, kLimitTensor);
+  const TfLiteTensor* delta = GetInput(context, node, kDeltaTensor);
+  // Make sure all the inputs are scalars.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(start), 0);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(limit), 0);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(delta), 0);
+
+  // Currently only supports int32 and float.
+  // TODO(b/117912892): Support quantization as well.
+  const auto dtype = start->type;
+  TF_LITE_ENSURE(context, dtype == kTfLiteInt32 || dtype == kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, limit->type, dtype);
+  TF_LITE_ENSURE_EQ(context, delta->type, dtype);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (dtype) {
+    case kTfLiteInt32:
+      output->type = kTfLiteInt32;
+      break;
+    case kTfLiteFloat32:
+      output->type = kTfLiteFloat32;
+      break;
+    default:
+      context->ReportError(context, "Unknown index output data type: %d",
+                           dtype);
+      return kTfLiteError;
+  }
+
+  if (IsConstantTensor(start) && IsConstantTensor(limit) &&
+      IsConstantTensor(delta)) {
+    return ResizeOutput(context, start, limit, delta, output);
+  }
+
+  SetTensorToDynamic(output);
+  return kTfLiteOk;
+}
+
+template <typename T>
+void EvalImpl(const TfLiteTensor* start, const TfLiteTensor* delta,
+              TfLiteTensor* output) {
+  const T start_value = *GetTensorData<T>(start);
+  const T delta_value = *GetTensorData<T>(delta);
+  T* output_data = GetTensorData<T>(output);
+  const int num_elements = NumElements(output);
+  T value = start_value;
+  for (int i = 0; i < num_elements; ++i) {
+    output_data[i] = value;
+    value += delta_value;
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* start = GetInput(context, node, kStartTensor);
+  const TfLiteTensor* limit = GetInput(context, node, kLimitTensor);
+  const TfLiteTensor* delta = GetInput(context, node, kDeltaTensor);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutput(context, start, limit, delta, output));
+  }
+
+  switch (output->type) {
+    case kTfLiteInt32: {
+      EvalImpl<int32_t>(start, delta, output);
+      break;
+    }
+    case kTfLiteFloat32: {
+      EvalImpl<float>(start, delta, output);
+      break;
+    }
+    default: {
+      context->ReportError(context, "Unsupported data type: %d", output->type);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+}  // namespace range
+
+TfLiteRegistration* Register_RANGE() {
+  static TfLiteRegistration r = {nullptr, nullptr, range::Prepare, range::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/range_test.cc b/tensorflow/lite/kernels/range_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e1d4aaba433050c9b5c32651d1f317ef04134b10
--- /dev/null
+++ b/tensorflow/lite/kernels/range_test.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+template <typename T>
+class RangeOpModel : public SingleOpModel {
+ public:
+  explicit RangeOpModel(const TensorType& dtype) {
+    start_ = AddInput(dtype);
+    limit_ = AddInput(dtype);
+    delta_ = AddInput(dtype);
+    output_ = AddOutput(dtype);
+    SetBuiltinOp(BuiltinOperator_RANGE, BuiltinOptions_RangeOptions,
+                 CreateRangeOptions(builder_).Union());
+    BuildInterpreter({GetShape(start_), GetShape(limit_), GetShape(delta_)});
+  }
+
+  int start() { return start_; }
+  int limit() { return limit_; }
+  int delta() { return delta_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int start_;
+  int limit_;
+  int delta_;
+  int output_;
+};
+
+TEST(RangeOpModel, Simple) {
+  RangeOpModel<int32_t> model(TensorType_INT32);
+  model.PopulateTensor<int32_t>(model.start(), {0});
+  model.PopulateTensor<int32_t>(model.limit(), {4});
+  model.PopulateTensor<int32_t>(model.delta(), {1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(0, 1, 2, 3));
+}
+
+TEST(RangeOpModel, DeltaGreaterThanOne) {
+  RangeOpModel<int32_t> model(TensorType_INT32);
+  model.PopulateTensor<int32_t>(model.start(), {2});
+  model.PopulateTensor<int32_t>(model.limit(), {9});
+  model.PopulateTensor<int32_t>(model.delta(), {2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(2, 4, 6, 8));
+}
+
+TEST(RangeOpModel, NegativeDelta) {
+  RangeOpModel<int32_t> model(TensorType_INT32);
+  model.PopulateTensor<int32_t>(model.start(), {10});
+  model.PopulateTensor<int32_t>(model.limit(), {3});
+  model.PopulateTensor<int32_t>(model.delta(), {-3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(10, 7, 4));
+}
+
+TEST(RangeOpModel, FloatSimple) {
+  RangeOpModel<float> model(TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.start(), {0});
+  model.PopulateTensor<float>(model.limit(), {4});
+  model.PopulateTensor<float>(model.delta(), {1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(0, 1, 2, 3));
+}
+
+TEST(RangeOpModel, FloatDeltaGreaterThanOne) {
+  RangeOpModel<float> model(TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.start(), {2});
+  model.PopulateTensor<float>(model.limit(), {9});
+  model.PopulateTensor<float>(model.delta(), {2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(2, 4, 6, 8));
+}
+
+TEST(RangeOpModel, FloatNegativeDelta) {
+  RangeOpModel<float> model(TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.start(), {10});
+  model.PopulateTensor<float>(model.limit(), {3});
+  model.PopulateTensor<float>(model.delta(), {-3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(10, 7, 4));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/reduce.cc
rename to tensorflow/lite/kernels/reduce.cc
index 4732a37a65a37a295f0041634b65b9b5460a547c..336e827ca4c76abf3a08492249dfc0ce9cd81439 100644
--- a/tensorflow/contrib/lite/kernels/reduce.cc
+++ b/tensorflow/lite/kernels/reduce.cc
@@ -15,13 +15,15 @@ limitations under the License.
 #include <string.h>
 #include <limits>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -229,6 +231,17 @@ TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) {
   return ResizeTempSum(context, &op_context, temp_sum);
 }
 
+void ResolveAxis(const int* axis_data, int axis_count,
+                 tflite::MeanParams* op_params) {
+  int i = 0;
+  for (; i < axis_count; ++i) {
+    op_params->axis[i] = static_cast<int16>(axis_data[i]);
+  }
+  for (; i < 4; ++i) {
+    op_params->axis[i] = 1;
+  }
+}
+
 template <KernelType kernel_type>
 TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
   OpContext op_context(context, node);
@@ -257,9 +270,23 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
 
   if (kernel_type == kReference) {
     switch (op_context.input->type) {
-      case kTfLiteFloat32:
-        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, float, float));
-        break;
+      case kTfLiteFloat32: {
+        tflite::MeanParams op_params;
+        op_params.axis_count = num_axis;
+        ResolveAxis(GetTensorData<int>(op_context.axis), num_axis, &op_params);
+        const TfLiteTensor* input = op_context.input;
+        if (op_context.params->keep_dims && NumDimensions(input) == 4 &&
+            op_params.axis_count == 2 &&
+            ((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+             (op_params.axis[0] == 2 && op_params.axis[1] == 1))) {
+          reference_ops::Mean(op_params, GetTensorShape(input),
+                              GetTensorData<float>(input),
+                              GetTensorShape(op_context.output),
+                              GetTensorData<float>(op_context.output));
+        } else {
+          TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, float, float));
+        }
+      } break;
       case kTfLiteInt32:
         TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int, int64_t));
         break;
@@ -286,7 +313,8 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
                   GetTensorData<int>(op_context.axis), num_axis,
                   op_context.params->keep_dims, GetTensorData<int>(temp_index),
                   GetTensorData<int>(resolved_axis),
-                  GetTensorData<int>(temp_sum), /*compute_sum=*/false));
+                  GetTensorData<int>(temp_sum),
+                  /*compute_sum=*/false));
         }
         break;
       default:
diff --git a/tensorflow/contrib/lite/kernels/reduce_test.cc b/tensorflow/lite/kernels/reduce_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/kernels/reduce_test.cc
rename to tensorflow/lite/kernels/reduce_test.cc
index fb2ec58ab28ebc12b511590a67181f472464d6e1..c1526bddb719e74a6396dc4aeac4b5827220a65a 100644
--- a/tensorflow/contrib/lite/kernels/reduce_test.cc
+++ b/tensorflow/lite/kernels/reduce_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/register.cc
rename to tensorflow/lite/kernels/register.cc
index 9402105fa7a55c2d9da496f2af9e027c00d9e9fb..c0e6f6994fd2334917b178d4d3b16d73c27121c4 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/util.h"
 
 namespace tflite {
 namespace ops {
@@ -31,6 +31,7 @@ TfLiteRegistration* Register_RELU_1();
 
 namespace builtin {
 
+TfLiteRegistration* Register_ABS();
 TfLiteRegistration* Register_RELU();
 TfLiteRegistration* Register_RELU_N1_TO_1();
 TfLiteRegistration* Register_RELU6();
@@ -67,12 +68,14 @@ TfLiteRegistration* Register_PAD();
 TfLiteRegistration* Register_PADV2();
 TfLiteRegistration* Register_RESHAPE();
 TfLiteRegistration* Register_RESIZE_BILINEAR();
+TfLiteRegistration* Register_RESIZE_NEAREST_NEIGHBOR();
 TfLiteRegistration* Register_SKIP_GRAM();
 TfLiteRegistration* Register_SPACE_TO_DEPTH();
 TfLiteRegistration* Register_GATHER();
 TfLiteRegistration* Register_TRANSPOSE();
 TfLiteRegistration* Register_MEAN();
 TfLiteRegistration* Register_SPLIT();
+TfLiteRegistration* Register_SPLIT_V();
 TfLiteRegistration* Register_SQUEEZE();
 TfLiteRegistration* Register_STRIDED_SLICE();
 TfLiteRegistration* Register_EXP();
@@ -120,6 +123,12 @@ TfLiteRegistration* Register_UNPACK();
 TfLiteRegistration* Register_FLOOR_DIV();
 TfLiteRegistration* Register_SQUARE();
 TfLiteRegistration* Register_ZEROS_LIKE();
+TfLiteRegistration* Register_FLOOR_MOD();
+TfLiteRegistration* Register_RANGE();
+TfLiteRegistration* Register_LEAKY_RELU();
+TfLiteRegistration* Register_SQUARED_DIFFERENCE();
+TfLiteRegistration* Register_FILL();
+TfLiteRegistration* Register_MIRROR_PAD();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -149,6 +158,7 @@ const TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op,
 }
 
 BuiltinOpResolver::BuiltinOpResolver() {
+  AddBuiltin(BuiltinOperator_ABS, Register_ABS());
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
   AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
   AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
@@ -194,6 +204,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_PADV2, Register_PADV2());
   AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
   AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
+  AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
+             Register_RESIZE_NEAREST_NEIGHBOR());
   AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
   AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH());
   AddBuiltin(BuiltinOperator_GATHER, Register_GATHER());
@@ -202,6 +214,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DIV, Register_DIV());
   AddBuiltin(BuiltinOperator_SUB, Register_SUB());
   AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT());
+  AddBuiltin(BuiltinOperator_SPLIT_V, Register_SPLIT_V());
   AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
   AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
   AddBuiltin(BuiltinOperator_EXP, Register_EXP());
@@ -209,7 +222,9 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_LOG, Register_LOG());
   AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
   AddBuiltin(BuiltinOperator_CAST, Register_CAST());
-  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
+  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
   AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
   AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
@@ -249,6 +264,12 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_FLOOR_DIV, Register_FLOOR_DIV());
   AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE());
   AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE());
+  AddBuiltin(BuiltinOperator_FLOOR_MOD, Register_FLOOR_MOD());
+  AddBuiltin(BuiltinOperator_RANGE, Register_RANGE());
+  AddBuiltin(BuiltinOperator_LEAKY_RELU, Register_LEAKY_RELU());
+  AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE());
+  AddBuiltin(BuiltinOperator_FILL, Register_FILL());
+  AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/contrib/lite/kernels/register.h b/tensorflow/lite/kernels/register.h
similarity index 76%
rename from tensorflow/contrib/lite/kernels/register.h
rename to tensorflow/lite/kernels/register.h
index 61856ab9de6563b15232a88d8c198cdbc099d835..059c9d165ee8a81096cce3885fc940f5977d7342 100644
--- a/tensorflow/contrib/lite/kernels/register.h
+++ b/tensorflow/lite/kernels/register.h
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
-#define TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+#ifndef TENSORFLOW_LITE_KERNELS_REGISTER_H_
+#define TENSORFLOW_LITE_KERNELS_REGISTER_H_
 
-#include <unordered_map>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/mutable_op_resolver.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
 
 namespace tflite {
 namespace ops {
@@ -37,4 +36,4 @@ class BuiltinOpResolver : public MutableOpResolver {
 }  // namespace ops
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+#endif  // TENSORFLOW_LITE_KERNELS_REGISTER_H_
diff --git a/tensorflow/contrib/lite/kernels/relu1.cc b/tensorflow/lite/kernels/relu1.cc
similarity index 92%
rename from tensorflow/contrib/lite/kernels/relu1.cc
rename to tensorflow/lite/kernels/relu1.cc
index abafee2d576fd7f7958513218286afaa6b2f2fc9..5a55631405b6b32a602cfe21ba863d0dc92213ea 100644
--- a/tensorflow/contrib/lite/kernels/relu1.cc
+++ b/tensorflow/lite/kernels/relu1.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/relu1_test.cc b/tensorflow/lite/kernels/relu1_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/relu1_test.cc
rename to tensorflow/lite/kernels/relu1_test.cc
index b1d25a9f504fe290a308119fb1cef0d8f63b599c..f52d10b0b7f32af3444c702835f0674d7181bb7a 100644
--- a/tensorflow/contrib/lite/kernels/relu1_test.cc
+++ b/tensorflow/lite/kernels/relu1_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/reshape.cc b/tensorflow/lite/kernels/reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d040c677019a8ebfe9abb67e60365a4f2e017740
--- /dev/null
+++ b/tensorflow/lite/kernels/reshape.cc
@@ -0,0 +1,169 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace reshape {
+
+constexpr int kInputTensor = 0;
+constexpr int kShapeTensor = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node,
+                          TfLiteIntArray* output_shape) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Tensorflow's Reshape allows one of the shape components to have the
+  // special -1 value, meaning it will be calculated automatically based on the
+  // input. Here we calculate what that dimension should be so that the number
+  // of output elements in the same as the number of input elements.
+  int num_input_elements = NumElements(input);
+
+  int num_output_elements = 1;
+  int stretch_dim = -1;
+  for (int i = 0; i < output_shape->size; ++i) {
+    int value = output_shape->data[i];
+    if (value == -1) {
+      TF_LITE_ENSURE_EQ(context, stretch_dim, -1);
+      stretch_dim = i;
+    } else {
+      num_output_elements *= value;
+    }
+  }
+  if (stretch_dim != -1) {
+    output_shape->data[stretch_dim] = num_input_elements / num_output_elements;
+    num_output_elements *= output_shape->data[stretch_dim];
+  }
+
+  TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteIntArray* GetOutputShapeFromTensor(TfLiteContext* context,
+                                         TfLiteNode* node) {
+  const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
+
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(shape->dims->data[0]);
+  for (int i = 0; i < output_shape->size; ++i) {
+    output_shape->data[i] = shape->data.i32[i];
+  }
+
+  return output_shape;
+}
+
+TfLiteIntArray* GetOutputShapeFromParam(TfLiteContext* context,
+                                        TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data);
+
+  // The function is returned above this line if the shape tensor is usable.
+  // Now fallback to the shape parameter in `TfLiteReshapeParams`.
+  int num_dimensions = params->num_dimensions;
+  if (num_dimensions == 1 && params->shape[0] == 0) {
+    // Legacy tflite models use a shape parameter of [0] to indicate scalars,
+    // so adjust accordingly. TODO(b/111614235): Allow zero-sized buffers during
+    // toco conversion.
+    num_dimensions = 0;
+  }
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions);
+  for (int i = 0; i < num_dimensions; ++i) {
+    output_shape->data[i] = params->shape[i];
+  }
+
+  return output_shape;
+}
+
+// Check if the shape tensor is valid. Shapes should be int32 vectors.
+bool ShapeIsVector(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
+  return (shape->dims->size == 1 && shape->type == kTfLiteInt32);
+}
+
+TfLiteIntArray* GetOutputShape(TfLiteContext* context, TfLiteNode* node) {
+  if (NumInputs(node) == 2 && ShapeIsVector(context, node)) {
+    return GetOutputShapeFromTensor(context, node);
+  } else {
+    return GetOutputShapeFromParam(context, node);
+  }
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  // Always postpone sizing string tensors, even if we could in principle
+  // calculate their shapes now. String tensors don't benefit from having their
+  // shapes precalculated because the actual memory can only be allocated after
+  // we know all the content.
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  if (output->type != kTfLiteString) {
+    if (NumInputs(node) == 1 ||
+        IsConstantTensor(GetInput(context, node, kShapeTensor))) {
+      TF_LITE_ENSURE_OK(
+          context, ResizeOutput(context, node, GetOutputShape(context, node)));
+    } else {
+      SetTensorToDynamic(output);
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // There are two ways in which the 'output' can be made dynamic: it could be
+  // a string tensor, or its shape cannot be calculated during Prepare(). In
+  // either case, we now have all the information to calculate its shape.
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(
+        context, ResizeOutput(context, node, GetOutputShape(context, node)));
+  }
+
+  // Note that string tensors are always "dynamic" in the sense that their size
+  // is not known until we have all the content. This applies even when their
+  // shape is known ahead of time. As a result, a string tensor is never given
+  // any memory by ResizeOutput(), and we need to do it manually here. Since
+  // reshape doesn't change the data, the output tensor needs exactly as many
+  // bytes as the input tensor.
+  if (output->type == kTfLiteString) {
+    auto bytes_required = input->bytes;
+    TfLiteTensorRealloc(bytes_required, output);
+    output->bytes = bytes_required;
+  }
+
+  memcpy(output->data.raw, input->data.raw, input->bytes);
+
+  return kTfLiteOk;
+}
+
+}  // namespace reshape
+
+TfLiteRegistration* Register_RESHAPE() {
+  static TfLiteRegistration r = {nullptr, nullptr, reshape::Prepare,
+                                 reshape::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/reshape_test.cc b/tensorflow/lite/kernels/reshape_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..00bbbef57eccef67d043e85c02ebe80c3f9387ef
--- /dev/null
+++ b/tensorflow/lite/kernels/reshape_test.cc
@@ -0,0 +1,239 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::IsEmpty;
+
+// There are three ways to specify the output shape of a Reshape
+// op.
+enum ShapeSpecificationType {
+  // The output shape is hardcoded in the ReshapeOptions object.
+  kAsReshapeOption,
+  // The output shape is specified as an input tensor, which is connected to a
+  // Const node, which is guaranteed not to change once inference starts. The
+  // shape is also hardcoded as in kAsReshapeOption.
+  kAsConstantTensor,
+  // The output shape is specifed as an input tensor that can change based on
+  // external input. That is, the shape is not know before the inference
+  // starts. The shape is also hardcoded as in kAsReshapeOption.
+  kAsTensor,
+};
+
+class ReshapeOpTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<ShapeSpecificationType> {};
+
+template <typename T>
+class ReshapeOpModel : public SingleOpModel {
+ public:
+  ReshapeOpModel(std::initializer_list<int> input_shape,
+                 std::initializer_list<int> shape_shape,
+                 std::initializer_list<int> shape_data,
+                 ShapeSpecificationType shape_type) {
+    switch (shape_type) {
+      case kAsTensor:
+        BuildWithTensorShape(input_shape, shape_shape, shape_data);
+        break;
+      case kAsConstantTensor:
+        BuildWithConstantTensorShape(input_shape, shape_shape, shape_data);
+        break;
+      case kAsReshapeOption:
+        // In this case the shape of the new shape doesn't matter. It is
+        // always hardcoded as a flat vector.
+        BuildWithHardcodedShape(input_shape, shape_data);
+        break;
+    }
+  }
+
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
+  }
+
+  void SetStringInput(std::initializer_list<string> data) {
+    PopulateStringTensor(input_, data);
+  }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  void BuildWithHardcodedShape(std::initializer_list<int> input_shape,
+                               std::initializer_list<int> shape_data) {
+    input_ = AddInput({GetTensorType<T>(), input_shape});
+    output_ = AddOutput(GetTensorType<T>());
+    SetBuiltinOp(
+        BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
+        CreateReshapeOptions(builder_, builder_.CreateVector<int>(shape_data))
+            .Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  void BuildWithTensorShape(std::initializer_list<int> input_shape,
+                            std::initializer_list<int> shape_shape,
+                            std::initializer_list<int> shape_data) {
+    input_ = AddInput({GetTensorType<T>(), input_shape});
+    output_ = AddOutput(GetTensorType<T>());
+    int shape_input_tensor = AddInput({TensorType_INT32, shape_shape});
+    // Note how shape also appears in ReshapeOptions
+    SetBuiltinOp(
+        BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
+        CreateReshapeOptions(builder_, builder_.CreateVector<int>(shape_data))
+            .Union());
+    BuildInterpreter({GetShape(input_), GetShape(shape_input_tensor)});
+    if (shape_data.size() != 0) {
+      PopulateTensor<int32_t>(shape_input_tensor, shape_data);
+    }
+  }
+
+  void BuildWithConstantTensorShape(std::initializer_list<int> input_shape,
+                                    std::initializer_list<int> shape_shape,
+                                    std::initializer_list<int> shape_data) {
+    input_ = AddInput({GetTensorType<T>(), input_shape});
+    output_ = AddOutput(GetTensorType<T>());
+    AddConstInput(TensorType_INT32, shape_data, shape_shape);
+    // Note how the shape also appears in the ReshapeOptions.
+    SetBuiltinOp(
+        BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
+        CreateReshapeOptions(builder_, builder_.CreateVector<int>(shape_data))
+            .Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input_;
+  int output_;
+};
+
+TEST_P(ReshapeOpTest, MismatchedDimensions) {
+  if (GetParam() == kAsTensor) {
+    ReshapeOpModel<float> m({1, 2, 4, 1}, {2}, {2, 1}, GetParam());
+    m.SetInput({3});
+    EXPECT_DEATH(m.Invoke(), "num_input_elements != num_output_elements");
+  } else {
+    EXPECT_DEATH(ReshapeOpModel<float>({1, 2, 4, 1}, {2}, {2, 1}, GetParam()),
+                 "num_input_elements != num_output_elements");
+  }
+}
+
+TEST_P(ReshapeOpTest, TooManyDimensions) {
+  if (GetParam() == kAsReshapeOption) {
+    EXPECT_DEATH(ReshapeOpModel<float>({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
+                                       {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam()),
+                 "Found too many dimensions");
+  } else {
+    ReshapeOpModel<float> m({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
+                            {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam());
+    m.SetInput({3, 4});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 4}));
+    EXPECT_THAT(m.GetOutputShape(),
+                ElementsAreArray({1, 1, 1, 1, 1, 1, 1, 1, 2}));
+  }
+}
+
+TEST_P(ReshapeOpTest, TooManySpecialDimensions) {
+  if (GetParam() != kAsTensor) {
+    EXPECT_DEATH(
+        ReshapeOpModel<float>({1, 2, 4, 1}, {4}, {-1, -1, 2, 4}, GetParam()),
+        "stretch_dim != -1");
+  } else {
+    ReshapeOpModel<float> m({1, 2, 4, 1}, {4}, {-1, -1, 2, 4}, GetParam());
+    EXPECT_DEATH(m.Invoke(), "stretch_dim != -1");
+  }
+}
+
+// Create the model with a 2x2 shape. Processing still works because the new
+// shape ends up being hardcoded as a flat vector.
+TEST_P(ReshapeOpTest, InvalidShape) {
+  ReshapeOpModel<float> m({1, 2, 2}, {2, 2}, {1, 2, 2, 1}, GetParam());
+  m.SetInput({5, 6, 7, 8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 6, 7, 8}));
+}
+
+// This is the normal scenario, where shape is a vector.
+TEST_P(ReshapeOpTest, RegularShapes) {
+  ReshapeOpModel<float> m({1, 2, 4, 1}, {3}, {2, 2, 2}, GetParam());
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2}));
+}
+
+TEST_P(ReshapeOpTest, WithStretchDimension) {
+  ReshapeOpModel<float> m({1, 2, 4, 1}, {3}, {2, 1, -1}, GetParam());
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 4}));
+}
+
+// Shape is specified as '[]', which is the modern way to represent scalar
+// input and output.
+TEST_P(ReshapeOpTest, ScalarOutput) {
+  ReshapeOpModel<float> m({1}, {0}, {}, GetParam());
+  m.SetInput({3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+}
+
+// Some old models specify '[0]' as the new shape, indicating that both input
+// and output are scalars.
+TEST_P(ReshapeOpTest, LegacyScalarOutput) {
+  if (GetParam() == kAsConstantTensor) {
+    EXPECT_DEATH(ReshapeOpModel<float>({1}, {1}, {0}, GetParam()),
+                 "num_input_elements != num_output_elements");
+  } else if (GetParam() == kAsTensor) {
+    ReshapeOpModel<float> m({1}, {1}, {0}, GetParam());
+    m.SetInput({3});
+    EXPECT_DEATH(m.Invoke(), "num_input_elements != num_output_elements");
+  } else {
+    ReshapeOpModel<float> m({1}, {1}, {0}, GetParam());
+    m.SetInput({3});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+    EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  }
+}
+
+TEST_P(ReshapeOpTest, Strings) {
+  ReshapeOpModel<string> m({1, 2, 4, 1}, {3}, {2, 2, 2}, GetParam());
+  m.SetStringInput({"1", "2", "3", "4", "5", "6", "7", "8"});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({"1", "2", "3", "4", "5", "6", "7", "8"}));
+}
+
+INSTANTIATE_TEST_CASE_P(VariedShapeSpec, ReshapeOpTest,
+                        ::testing::Values(kAsReshapeOption, kAsConstantTensor,
+                                          kAsTensor));
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/lite/kernels/resize_bilinear.cc
similarity index 92%
rename from tensorflow/contrib/lite/kernels/resize_bilinear.cc
rename to tensorflow/lite/kernels/resize_bilinear.cc
index fb045d15f357350b38319e329d179668c7366761..d42cb188669587a957dd085f9ecb123f44b59437 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/lite/kernels/resize_bilinear.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/resize_bilinear_test.cc b/tensorflow/lite/kernels/resize_bilinear_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d3f4837a287accd93c23e17fa3a361efd4120101
--- /dev/null
+++ b/tensorflow/lite/kernels/resize_bilinear_test.cc
@@ -0,0 +1,314 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+using uint8 = std::uint8_t;
+
+class ResizeBilinearOpModel : public SingleOpModel {
+ public:
+  explicit ResizeBilinearOpModel(const TensorData& input,
+                                 std::initializer_list<int> size_data = {}) {
+    bool const_size = size_data.size() != 0;
+    input_ = AddInput(input);
+    if (const_size) {
+      size_ = AddConstInput(TensorType_INT32, size_data, {2});
+    } else {
+      size_ = AddInput({TensorType_INT32, {2}});
+    }
+    output_ = AddOutput(input.type);
+    SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR,
+                 BuiltinOptions_ResizeBilinearOptions,
+                 CreateResizeBilinearOptions(builder_).Union());
+    if (const_size) {
+      BuildInterpreter({GetShape(input_)});
+    } else {
+      BuildInterpreter({GetShape(input_), GetShape(size_)});
+    }
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor(input_, data);
+  }
+  void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+ private:
+  int input_;
+  int size_;
+  int output_;
+};
+
+TEST(ResizeBilinearOpTest, HorizontalResize) {
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
+  m.SetInput<float>({3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<float>({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
+
+TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
+  m.SetInput<uint8>({3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<uint8>({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
+
+TEST(ResizeBilinearOpTest, VerticalResize) {
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
+  m.SetInput<float>({3, 9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<float>({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
+TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
+  m.SetInput<uint8>({3, 9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<uint8>({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
+  m.SetInput<float>({
+      3, 6,  //
+      9, 12  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 5, 6,    //
+                                        7, 9, 10,   //
+                                        9, 11, 12,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<float>({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 5, 6,    //
+                                              7, 9, 10,   //
+                                              9, 11, 12,  //
+                                          })));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
+  m.SetInput<uint8>({
+      3, 6,  //
+      9, 12  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 5, 6,    //
+                                        7, 9, 10,   //
+                                        9, 11, 12,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<uint8>({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 5, 6,    //
+                                              7, 9, 10,   //
+                                              9, 11, 12,  //
+                                          })));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
+  m.SetInput<float>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      10, 16  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 5, 6,     //
+                                        7, 9, 10,    //
+                                        9, 11, 12,   //
+                                        4, 8, 10,    //
+                                        8, 12, 14,   //
+                                        10, 14, 16,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<float>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      10, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 5, 6,     //
+                                              7, 9, 10,    //
+                                              9, 11, 12,   //
+                                              4, 8, 10,    //
+                                              8, 12, 14,   //
+                                              10, 14, 16,  //
+                                          })));
+}
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}});
+  m.SetInput<float>({
+      3, 4, 6, 10,    //
+      9, 10, 12, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 4, 5, 8, 6, 10,      //
+                                        7, 8, 9, 12, 10, 14,    //
+                                        9, 10, 11, 14, 12, 16,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput<float>({
+      3, 4, 6, 10,    //
+      9, 10, 12, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 4, 5, 8, 6, 10,      //
+                                              7, 8, 9, 12, 10, 14,    //
+                                              9, 10, 11, 14, 12, 16,  //
+                                          })));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
+  m.SetInput<uint8>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 5, 6,     //
+                                        7, 9, 10,    //
+                                        9, 11, 12,   //
+                                        4, 8, 10,    //
+                                        9, 12, 14,   //
+                                        12, 14, 16,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<uint8>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 5, 6,     //
+                                              7, 9, 10,    //
+                                              9, 11, 12,   //
+                                              4, 8, 10,    //
+                                              9, 12, 14,   //
+                                              12, 14, 16,  //
+                                          })));
+}
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
+  m.SetInput<uint8>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 4, 5, 8, 6, 10,       //
+                                        7, 9, 10, 12, 11, 14,    //
+                                        10, 12, 12, 14, 14, 16,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput<uint8>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 4, 5, 8, 6, 10,       //
+                                              7, 9, 10, 12, 11, 14,    //
+                                              10, 12, 12, 14, 14, 16,  //
+                                          })));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor.cc b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a48d8004f8b6cead177286328082310237af515a
--- /dev/null
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
@@ -0,0 +1,152 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace resize_nearest_neighbor {
+
+// This file has three implementations of RESIZE_NEAREST_NEIGHBOR.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+  kNeonOptimized,
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kSizeTensor = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                const TfLiteTensor* input,
+                                const TfLiteTensor* size,
+                                TfLiteTensor* output) {
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+  output_size->data[0] = input->dims->data[0];
+  const int32* size_data = GetTensorData<int32>(size);
+  output_size->data[1] = size_data[0];
+  output_size->data[2] = size_data[1];
+  output_size->data[3] = input->dims->data[3];
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // TODO(ahentz): Our current implementations rely on the inputs being 4D.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1);
+
+  TF_LITE_ENSURE_EQ(context, size->type, kTfLiteInt32);
+  output->type = input->type;
+
+  if (!IsConstantTensor(size)) {
+    SetTensorToDynamic(output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, input, size, output);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteResizeNearestNeighborParams*>(node->builtin_data);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* size = GetInput(context, node, kSizeTensor);
+
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputTensor(context, input, size, output));
+  }
+
+  tflite::ResizeNearestNeighborParams op_params;
+  op_params.align_corners = params->align_corners;
+
+  if (output->type == kTfLiteFloat32) {
+    reference_ops::ResizeNearestNeighbor(
+        op_params, GetTensorShape(input), GetTensorData<int32>(input),
+        GetTensorShape(size), GetTensorData<int32>(size),
+        GetTensorShape(output), GetTensorData<int32>(output));
+  } else if (output->type == kTfLiteUInt8) {
+    if (kernel_type == kReference) {
+      reference_ops::ResizeNearestNeighbor(
+          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(size), GetTensorData<int32>(size),
+          GetTensorShape(output), GetTensorData<uint8_t>(output));
+    }
+    if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
+      optimized_ops::ResizeNearestNeighbor(
+          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(size), GetTensorData<int32>(size),
+          GetTensorShape(output), GetTensorData<uint8_t>(output));
+    }
+  } else {
+    context->ReportError(context, "Output type is %d, requires float or uint8.",
+                         output->type);
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace resize_nearest_neighbor
+
+TfLiteRegistration* Register_RESIZE_NEAREST_NEIGHBOR_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, resize_nearest_neighbor::Prepare,
+      resize_nearest_neighbor::Eval<resize_nearest_neighbor::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_RESIZE_NEAREST_NEIGHBOR_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, resize_nearest_neighbor::Prepare,
+      resize_nearest_neighbor::Eval<
+          resize_nearest_neighbor::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_RESIZE_NEAREST_NEIGHBOR_NEON_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, resize_nearest_neighbor::Prepare,
+      resize_nearest_neighbor::Eval<resize_nearest_neighbor::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_RESIZE_NEAREST_NEIGHBOR() {
+#ifdef USE_NEON
+  return Register_RESIZE_NEAREST_NEIGHBOR_NEON_OPT();
+#else
+  return Register_RESIZE_NEAREST_NEIGHBOR_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03e2effd84c4adb13db1bb3ada4f5cfe1c0b12c9
--- /dev/null
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
@@ -0,0 +1,325 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+using uint8 = std::uint8_t;
+
+class ResizeNearestNeighborOpModel : public SingleOpModel {
+ public:
+  explicit ResizeNearestNeighborOpModel(
+      const TensorData& input, std::initializer_list<int> size_data = {}) {
+    bool const_size = size_data.size() != 0;
+    input_ = AddInput(input);
+    if (const_size) {
+      size_ = AddConstInput(TensorType_INT32, size_data, {2});
+    } else {
+      size_ = AddInput({TensorType_INT32, {2}});
+    }
+    output_ = AddOutput(input.type);
+    SetBuiltinOp(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
+                 BuiltinOptions_ResizeNearestNeighborOptions,
+                 CreateResizeNearestNeighborOptions(builder_).Union());
+    if (const_size) {
+      BuildInterpreter({GetShape(input_)});
+    } else {
+      BuildInterpreter({GetShape(input_), GetShape(size_)});
+    }
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor(input_, data);
+  }
+  void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+ private:
+  int input_;
+  int size_;
+  int output_;
+};
+
+TEST(ResizeNearestNeighborOpTest, HorizontalResize) {
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
+  m.SetInput<float>({3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, 6})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}},
+                                       {1, 3});
+  const_m.SetInput<float>({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, 6})));
+}
+
+TEST(ResizeNearestNeighborOpTest, HorizontalResize8Bit) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
+  m.SetInput<uint8>({3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, 6})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_UINT8, {1, 1, 2, 1}},
+                                       {1, 3});
+  const_m.SetInput<uint8>({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, 6})));
+}
+
+TEST(ResizeNearestNeighborOpTest, VerticalResize) {
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
+  m.SetInput<float>({3, 9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, 9})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}},
+                                       {3, 1});
+  const_m.SetInput<float>({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, 9})));
+}
+
+TEST(ResizeNearestNeighborOpTest, VerticalResize8Bit) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
+  m.SetInput<uint8>({3, 9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, 9})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_UINT8, {1, 2, 1, 1}},
+                                       {3, 1});
+  const_m.SetInput<uint8>({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, 9})));
+}
+
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize) {
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
+  m.SetInput<float>({
+      3, 6,  //
+      9, 12  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 3, 6,   //
+                                        3, 3, 6,   //
+                                        9, 9, 12,  //
+                                    })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                       {3, 3});
+  const_m.SetInput<float>({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 3, 6,   //
+                                              3, 3, 6,   //
+                                              9, 9, 12,  //
+                                          })));
+}
+
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize8Bit) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
+  m.SetInput<uint8>({
+      3, 6,  //
+      9, 12  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 3, 6,   //
+                                        3, 3, 6,   //
+                                        9, 9, 12,  //
+                                    })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_UINT8, {1, 2, 2, 1}},
+                                       {3, 3});
+  const_m.SetInput<uint8>({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 3, 6,   //
+                                              3, 3, 6,   //
+                                              9, 9, 12,  //
+                                          })));
+}
+
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches) {
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
+  m.SetInput<float>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      10, 16  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 3, 6,     //
+                                        3, 3, 6,     //
+                                        9, 9, 12,    //
+                                        4, 4, 10,    //
+                                        4, 4, 10,    //
+                                        10, 10, 16,  //
+                                    })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}},
+                                       {3, 3});
+  const_m.SetInput<float>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      10, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 3, 6,     //
+                                              3, 3, 6,     //
+                                              9, 9, 12,    //
+                                              4, 4, 10,    //
+                                              4, 4, 10,    //
+                                              10, 10, 16,  //
+                                          })));
+}
+
+TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize) {
+  ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}});
+  m.SetInput<float>({
+      3, 4, 6, 10,    //
+      9, 10, 12, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 4, 3, 4, 6, 10,     //
+                                        3, 4, 3, 4, 6, 10,     //
+                                        9, 10, 9, 10, 12, 16,  //
+                                    })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}},
+                                       {3, 3});
+  const_m.SetInput<float>({
+      3, 4, 6, 10,    //
+      9, 10, 12, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 4, 3, 4, 6, 10,     //
+                                              3, 4, 3, 4, 6, 10,     //
+                                              9, 10, 9, 10, 12, 16,  //
+                                          })));
+}
+
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
+  m.SetInput<uint8>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 3, 6,     //
+                                        3, 3, 6,     //
+                                        9, 9, 12,    //
+                                        4, 4, 10,    //
+                                        4, 4, 10,    //
+                                        12, 12, 16,  //
+                                    })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_UINT8, {2, 2, 2, 1}},
+                                       {3, 3});
+  const_m.SetInput<uint8>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 3, 6,     //
+                                              3, 3, 6,     //
+                                              9, 9, 12,    //
+                                              4, 4, 10,    //
+                                              4, 4, 10,    //
+                                              12, 12, 16,  //
+                                          })));
+}
+
+TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize8Bit) {
+  ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
+  m.SetInput<uint8>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 4, 3, 4, 6, 10,       //
+                                        3, 4, 3, 4, 6, 10,       //
+                                        10, 12, 10, 12, 14, 16,  //
+                                    })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_UINT8, {1, 2, 2, 2}},
+                                       {3, 3});
+  const_m.SetInput<uint8>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 4, 3, 4, 6, 10,       //
+                                              3, 4, 3, 4, 6, 10,       //
+                                              10, 12, 10, 12, 14, 16,  //
+                                          })));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/select.cc b/tensorflow/lite/kernels/select.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/select.cc
rename to tensorflow/lite/kernels/select.cc
index 4780a86ee51ee354be6c6baede16f8a40bba7726..4687ab44171fab73ff1b4ef93592b25680f3a59f 100644
--- a/tensorflow/contrib/lite/kernels/select.cc
+++ b/tensorflow/lite/kernels/select.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/select_test.cc b/tensorflow/lite/kernels/select_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/select_test.cc
rename to tensorflow/lite/kernels/select_test.cc
index 5b2e61cd29a7fd7c699fd81cb81e5f9a12c4b18f..5111300e479a92ad9cbf00628750dc61effc50d3 100644
--- a/tensorflow/contrib/lite/kernels/select_test.cc
+++ b/tensorflow/lite/kernels/select_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/shape.cc b/tensorflow/lite/kernels/shape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..934f0846b9e8391ad183101811d08c6deabe679e
--- /dev/null
+++ b/tensorflow/lite/kernels/shape.cc
@@ -0,0 +1,93 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace shape {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+template <typename OutType>
+void ExtractShape(const TfLiteTensor* input, OutType* output_data) {
+  for (int i = 0; i < NumDimensions(input); ++i) {
+    output_data[i] = SizeOfDimension(input, i);
+  }
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  auto* params = reinterpret_cast<TfLiteShapeParams*>(node->builtin_data);
+  switch (params->out_type) {
+    case kTfLiteInt32:
+      output->type = kTfLiteInt32;
+      break;
+    case kTfLiteInt64:
+      output->type = kTfLiteInt64;
+      break;
+    default:
+      context->ReportError(context, "Unknown shape output data type: %d",
+                           params->out_type);
+      return kTfLiteError;
+  }
+
+  // Shape always produces a 1-dimensional output tensor, where each output
+  // element is the length of the corresponding input tensor's dimension.
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(1);
+  output_size->data[0] = NumDimensions(input);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TFLITE_DCHECK_EQ(NumDimensions(output), 1);
+  TFLITE_DCHECK_EQ(SizeOfDimension(output, 0), NumDimensions(input));
+
+  switch (output->type) {
+    case kTfLiteInt32:
+      ExtractShape(input, GetTensorData<int32_t>(output));
+      break;
+    case kTfLiteInt64:
+      ExtractShape(input, GetTensorData<int64_t>(output));
+      break;
+    default:
+      return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace shape
+
+TfLiteRegistration* Register_SHAPE() {
+  static TfLiteRegistration r = {nullptr, nullptr, shape::Prepare, shape::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/shape_test.cc b/tensorflow/lite/kernels/shape_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c13ff45b0a3c0a5f22644c68224c50fab3b018b
--- /dev/null
+++ b/tensorflow/lite/kernels/shape_test.cc
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <initializer_list>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class ShapeOpModel : public SingleOpModel {
+ public:
+  ShapeOpModel(std::initializer_list<int> input_shape, TensorType input_type,
+               TensorType output_type) {
+    input_ = AddInput(input_type);
+    output_ = AddOutput(output_type);
+    SetBuiltinOp(BuiltinOperator_SHAPE, BuiltinOptions_ShapeOptions,
+                 CreateShapeOptions(builder_, output_type).Union());
+    BuildInterpreter({input_shape});
+  }
+
+  TfLiteStatus InvokeWithResult() { return interpreter_->Invoke(); }
+
+  int input() { return input_; }
+
+  int32_t GetOutputSize() { return GetTensorSize(output_); }
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(ShapeOpTest, OutTypeInt) {
+  ShapeOpModel<int32_t> model({1, 3, 1, 3, 5}, TensorType_FLOAT32,
+                              TensorType_INT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5}));
+}
+
+TEST(ShapeOpTest, OutTypeInt64) {
+  ShapeOpModel<int64_t> model({1, 3, 1, 3, 5}, TensorType_FLOAT32,
+                              TensorType_INT64);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5}));
+}
+
+TEST(ShapeOpTest, ScalarTensor) {
+  ShapeOpModel<int32_t> model({}, TensorType_FLOAT32, TensorType_INT32);
+  model.Invoke();
+
+  EXPECT_EQ(model.GetOutputSize(), 0);
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({0}));
+}
+
+TEST(ShapeOpTest, EmptyTensor) {
+  ShapeOpModel<int32_t> model({1, 0}, TensorType_FLOAT32, TensorType_INT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/skip_gram.cc b/tensorflow/lite/kernels/skip_gram.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/skip_gram.cc
rename to tensorflow/lite/kernels/skip_gram.cc
index de80a4016ecd6f88df99a5383739c2eb42a1f706..265ba18a3e39d3316fef2d41306540e7a170e675 100644
--- a/tensorflow/contrib/lite/kernels/skip_gram.cc
+++ b/tensorflow/lite/kernels/skip_gram.cc
@@ -33,11 +33,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace ops {
@@ -107,7 +107,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Generate n-grams recursively.
   tflite::DynamicBuffer buf;
   if (words.size() < params->ngram_size) {
-    buf.WriteToTensor(GetOutput(context, node, 0));
+    buf.WriteToTensorAsVector(GetOutput(context, node, 0));
     return kTfLiteOk;
   }
 
@@ -145,7 +145,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     }
   }
 
-  buf.WriteToTensor(GetOutput(context, node, 0));
+  buf.WriteToTensorAsVector(GetOutput(context, node, 0));
   return kTfLiteOk;
 }
 }  // namespace
diff --git a/tensorflow/contrib/lite/kernels/skip_gram_test.cc b/tensorflow/lite/kernels/skip_gram_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/skip_gram_test.cc
rename to tensorflow/lite/kernels/skip_gram_test.cc
index 185b64cb44969b57588ea5d0b40f55b6ddf8e11f..d4430b8a3430409d39c9c772b5384a46fe787b8f 100644
--- a/tensorflow/contrib/lite/kernels/skip_gram_test.cc
+++ b/tensorflow/lite/kernels/skip_gram_test.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/slice.cc b/tensorflow/lite/kernels/slice.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/slice.cc
rename to tensorflow/lite/kernels/slice.cc
index ccfee41b9ca58f59b7ece5da7feb7f7bf8d78c27..116c81e4d57a9a27dfb0581fe0096f461aa6ab81 100644
--- a/tensorflow/contrib/lite/kernels/slice.cc
+++ b/tensorflow/lite/kernels/slice.cc
@@ -16,12 +16,12 @@ limitations under the License.
 #include <string.h>
 #include <cmath>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -107,7 +107,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
   // Ensure validity of input tensor and its dimension.
-  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+  TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type);
   TF_LITE_ENSURE(context,
                  begin->type == kTfLiteInt32 || begin->type == kTfLiteInt64);
   TF_LITE_ENSURE(context,
diff --git a/tensorflow/contrib/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/slice_test.cc
rename to tensorflow/lite/kernels/slice_test.cc
index 4828f88f36bc1e7daf84ab6831a2ccc98bfaed40..563329ddb164d3aa5f13c8ee0d6482d79b84ed32 100644
--- a/tensorflow/contrib/lite/kernels/slice_test.cc
+++ b/tensorflow/lite/kernels/slice_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/softmax_test.cc b/tensorflow/lite/kernels/softmax_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb9d7c1d9de69423d6917c83bb6820413604e65c
--- /dev/null
+++ b/tensorflow/lite/kernels/softmax_test.cc
@@ -0,0 +1,144 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite SOFTMAX op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+class SoftmaxOpModel : public SingleOpModel {
+ public:
+  SoftmaxOpModel(int batches, int size, float beta)
+      : batches_(batches), input_size_(size), beta_(beta) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_SOFTMAX, BuiltinOptions_SoftmaxOptions,
+                 CreateSoftmaxOptions(builder_, beta_).Union());
+    BuildInterpreter({{batches_, input_size_}});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+
+  int batches_;
+  int input_size_;
+  float beta_;
+};
+
+TEST(SoftmaxOpTest, SimpleTest) {
+  SoftmaxOpModel m(/*batches=*/2, /*size=*/5, /*beta=*/1.0);
+  m.SetInput({
+      1.0, 2.0, 3.0, 4.0, 5.0,       // b = 0
+      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 0
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear(
+          {0.011656231, 0.031684921, 0.086128544, 0.234121657, 0.636408647,
+           0.636408647, 0.234121657, 0.086128544, 0.031684921, 0.011656231},
+          1e-6)));
+}
+
+TEST(SoftmaxOpTest, CompareWithTFminiBetaEq1) {
+  const int batch_size = 2;
+  const int input_size = 5;
+  const float beta = 1.0;
+  static float input_buffer[] = {
+      1.0,  2.0,  3.0,  4.0,  5.0,   // b = 0
+      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 1
+  };
+
+  SoftmaxOpModel m(batch_size, input_size, beta);
+
+  m.SetInput(0, input_buffer, input_buffer + input_size * batch_size);
+
+  m.Invoke();
+
+  std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]);
+  auto input_shape = RuntimeShape({batch_size, 1, 1, input_size});
+  SoftmaxParams params;
+  params.beta = beta;
+  tflite::reference_ops::Softmax(params, input_shape, input_buffer, input_shape,
+                                 output_buffer.get());
+
+  std::vector<float> expected;
+  expected.insert(expected.end(), output_buffer.get(),
+                  output_buffer.get() + input_size * batch_size);
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected, 1e-6)));
+}
+
+TEST(SoftmaxOpTest, CompareWithTFminiBetaNotEq1) {
+  const int batch_size = 2;
+  const int input_size = 5;
+  const float beta = 0.5;
+  static float input_buffer[] = {
+      1.0,  2.0,  3.0,  4.0,  5.0,   // b = 0
+      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 1
+  };
+
+  SoftmaxOpModel m(batch_size, input_size, beta);
+
+  m.SetInput(0, input_buffer, input_buffer + input_size * batch_size);
+
+  m.Invoke();
+
+  std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]);
+  auto input_shape = RuntimeShape({batch_size, 1, 1, input_size});
+  SoftmaxParams params;
+  params.beta = beta;
+  tflite::reference_ops::Softmax(params, input_shape, input_buffer, input_shape,
+                                 output_buffer.get());
+
+  std::vector<float> expected;
+  expected.insert(expected.end(), output_buffer.get(),
+                  output_buffer.get() + input_size * batch_size);
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected, 1e-6)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/lite/kernels/space_to_batch_nd.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
rename to tensorflow/lite/kernels/space_to_batch_nd.cc
index 3a10d2e60cf6171ff2396984982ac7238dcedfd9..1c61b2ef30379e808085f3b0d16a5b1157bea314 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd.cc
@@ -14,13 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
rename to tensorflow/lite/kernels/space_to_batch_nd_test.cc
index 5756573629a51917e39a312117a1fcd29c150dc0..4d55ba56b71c5e0c44f0145981db56cbef6ec99a 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/space_to_depth.cc b/tensorflow/lite/kernels/space_to_depth.cc
similarity index 91%
rename from tensorflow/contrib/lite/kernels/space_to_depth.cc
rename to tensorflow/lite/kernels/space_to_depth.cc
index 64c56c017b0b4a8a1a449309238df86dbec21d4b..79e28bf47d98b64572d9e7404f8d69788cd30e08 100644
--- a/tensorflow/contrib/lite/kernels/space_to_depth.cc
+++ b/tensorflow/lite/kernels/space_to_depth.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/space_to_depth_test.cc b/tensorflow/lite/kernels/space_to_depth_test.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/space_to_depth_test.cc
rename to tensorflow/lite/kernels/space_to_depth_test.cc
index 997f354861a235fb511235e4d64544dc8c3ddb34..5744669b6d62af61a0b20e7723b78c72f6db952d 100644
--- a/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
+++ b/tensorflow/lite/kernels/space_to_depth_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc b/tensorflow/lite/kernels/sparse_output_fully_connected.cc
similarity index 92%
rename from tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc
rename to tensorflow/lite/kernels/sparse_output_fully_connected.cc
index 843ed0768c050006bdbe759520f453b58798a6a2..73d850f0e2d094e9cc620f4f4733354d603b2a77 100644
--- a/tensorflow/contrib/lite/kernels/sparse_output_fully_connected.cc
+++ b/tensorflow/lite/kernels/sparse_output_fully_connected.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 // SparseOutputFullyConnected is a fully connected layer that uses a single
 // row in the weights and bias via a lookup.
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
@@ -88,6 +88,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const bool is_hybrid_op =
       (weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32);
 
+  // Resize output.
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(1);
+  output_size_array->data[0] = 1;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
+
   if (is_hybrid_op) {
     TfLiteIntArrayFree(node->temporaries);
     node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
@@ -111,9 +118,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         GetTemporary(context, node, /*index=*/kScalingFactors);
     scaling_factors->type = kTfLiteFloat32;
     scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-    scaling_factors_size->data[0] = n_batch;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+    int scaling_dims[1] = {n_batch};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = n_batch;
       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
                                                        scaling_factors_size));
     }
diff --git a/tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc b/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc
rename to tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
index 365986a5c177ee58604138f279ca1186bacc742e..c25a32bde001e632afff2a34ad168467c092bcf5 100644
--- a/tensorflow/contrib/lite/kernels/sparse_output_fully_connected_test.cc
+++ b/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc b/tensorflow/lite/kernels/sparse_to_dense.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/sparse_to_dense.cc
rename to tensorflow/lite/kernels/sparse_to_dense.cc
index 349fa0bd281ce3d4d26ceb69565a0684151c8ad9..de4d863facb50b4e98f289a9a84ed91ebe8bc603 100644
--- a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc
+++ b/tensorflow/lite/kernels/sparse_to_dense.cc
@@ -19,13 +19,13 @@ limitations under the License.
 #include <iostream>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
-#include "tensorflow/contrib/lite/kernels/padding.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/kernels/padding.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc b/tensorflow/lite/kernels/sparse_to_dense_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc
rename to tensorflow/lite/kernels/sparse_to_dense_test.cc
index a51ec17afcefd791680d7aa42cef467f481f6dbc..ee135c220ede171dabfcc1b23df9ececf67f8f26 100644
--- a/tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc
+++ b/tensorflow/lite/kernels/sparse_to_dense_test.cc
@@ -15,10 +15,10 @@ limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/split.cc b/tensorflow/lite/kernels/split.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7902ed2a46d297cca6f076bf1bb48580f3c4bf40
--- /dev/null
+++ b/tensorflow/lite/kernels/split.cc
@@ -0,0 +1,165 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace split {
+
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
+    params = reinterpret_cast<TfLiteSplitParams*>(node->builtin_data);
+    axis = GetInput(context, node, 0);
+    input = GetInput(context, node, 1);
+  }
+  TfLiteSplitParams* params;
+  const TfLiteTensor* axis;
+  const TfLiteTensor* input;
+};
+
+TfLiteStatus UseDynamicOutputTensors(TfLiteContext* context, TfLiteNode* node) {
+  for (int i = 0; i < NumOutputs(node); ++i) {
+    SetTensorToDynamic(GetOutput(context, node, i));
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus ResizeOutputTensors(TfLiteContext* context, TfLiteNode* node,
+                                 const TfLiteTensor* axis,
+                                 const TfLiteTensor* input, int num_splits) {
+  int axis_value = GetTensorData<int>(axis)[0];
+  if (axis_value < 0) {
+    axis_value += NumDimensions(input);
+  }
+
+  const int input_size = SizeOfDimension(input, axis_value);
+  TF_LITE_ENSURE_MSG(context, input_size % num_splits == 0,
+                     "Not an even split");
+  const int slice_size = input_size / num_splits;
+
+  for (int i = 0; i < NumOutputs(node); ++i) {
+    TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input->dims);
+    output_dims->data[axis_value] = slice_size;
+    TfLiteTensor* output = GetOutput(context, node, i);
+    TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_dims));
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+
+  OpContext op_context(context, node);
+
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), op_context.params->num_splits);
+
+  auto input_type = op_context.input->type;
+  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
+                              input_type == kTfLiteUInt8 ||
+                              input_type == kTfLiteInt16);
+  for (int i = 0; i < NumOutputs(node); ++i) {
+    GetOutput(context, node, i)->type = input_type;
+  }
+
+  // If we know the contents of the 'axis' tensor, resize all outputs.
+  // Otherwise, wait until Eval().
+  if (IsConstantTensor(op_context.axis)) {
+    return ResizeOutputTensors(context, node, op_context.axis, op_context.input,
+                               op_context.params->num_splits);
+  } else {
+    return UseDynamicOutputTensors(context, node);
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+
+  // When the 'axis' tensor is non-const we can't resize output tensors in
+  // Prepare(), and we have to do it now.
+  if (!IsConstantTensor(op_context.axis)) {
+    TF_LITE_ENSURE_OK(
+        context,
+        ResizeOutputTensors(context, node, op_context.axis, op_context.input,
+                            op_context.params->num_splits));
+  }
+
+  int axis_value = GetTensorData<int>(op_context.axis)[0];
+  if (axis_value < 0) {
+    axis_value += NumDimensions(op_context.input);
+  }
+
+  // TODO(ahentz): Our usage of VectorOfTensors could be optimized by
+  // calculating it in Prepare, unless we defer shape calculation.
+  // TODO(ahentz): We can improve the optimized_ops version to handle other
+  // cases too.
+#define TF_LITE_SPLIT(scalar)                                         \
+  VectorOfTensors<scalar> all_outputs(*context, *node->outputs);      \
+  tflite::SplitParams op_params;                                      \
+  op_params.num_split = NumOutputs(node);                             \
+  op_params.axis = axis_value;                                        \
+  if (axis_value == 0) {                                              \
+    optimized_ops::Split(op_params, GetTensorShape(op_context.input), \
+                         GetTensorData<scalar>(op_context.input),     \
+                         all_outputs.shapes(), all_outputs.data());   \
+  } else {                                                            \
+    reference_ops::Split(op_params, GetTensorShape(op_context.input), \
+                         GetTensorData<scalar>(op_context.input),     \
+                         all_outputs.shapes(), all_outputs.data());   \
+  }
+  switch (op_context.input->type) {
+    case kTfLiteFloat32: {
+      TF_LITE_SPLIT(float);
+      break;
+    }
+    case kTfLiteUInt8: {
+      TF_LITE_SPLIT(uint8_t);
+      break;
+    }
+    case kTfLiteInt16: {
+      TF_LITE_SPLIT(int16_t);
+      break;
+    }
+    default:
+      context->ReportError(
+          context,
+          "Only float32, uint8 and int16 are currently supported, got %d.",
+          op_context.input->type);
+      return kTfLiteError;
+  }
+#undef TF_LITE_SPLIT
+
+  return kTfLiteOk;
+}
+
+}  // namespace split
+
+TfLiteRegistration* Register_SPLIT() {
+  static TfLiteRegistration r = {nullptr, nullptr, split::Prepare, split::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/split_test.cc b/tensorflow/lite/kernels/split_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3d9ea3bf4158dd51b5102b942125b7561024c19
--- /dev/null
+++ b/tensorflow/lite/kernels/split_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+constexpr int kAxisIsATensor = -1000;
+
+class SplitOpModel : public SingleOpModel {
+ public:
+  SplitOpModel(const TensorData& input, int num_splits,
+               int axis = kAxisIsATensor) {
+    if (axis == kAxisIsATensor) {
+      axis_ = AddInput({TensorType_INT32, {1}});
+    } else {
+      axis_ = AddConstInput(TensorType_INT32, {axis}, {1});
+    }
+    input_ = AddInput(input);
+    for (int i = 0; i < num_splits; ++i) {
+      outputs_.push_back(AddOutput(input.type));
+    }
+    SetBuiltinOp(BuiltinOperator_SPLIT, BuiltinOptions_SplitOptions,
+                 CreateSplitOptions(builder_, num_splits).Union());
+    if (axis == kAxisIsATensor) {
+      BuildInterpreter({GetShape(axis_), GetShape(input_)});
+    } else {
+      BuildInterpreter({{}, GetShape(input_)});
+    }
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  void SetAxis(int axis) { PopulateTensor(axis_, {axis}); }
+
+  std::vector<float> GetOutput(int i) {
+    return ExtractVector<float>(outputs_[i]);
+  }
+  std::vector<int> GetOutputShape(int i) { return GetTensorShape(outputs_[i]); }
+
+ private:
+  int input_;
+  int axis_;
+  std::vector<int> outputs_;
+};
+
+using TensorValues = std::initializer_list<float>;
+
+void Check(int axis, int num_splits, std::initializer_list<int> input_shape,
+           std::initializer_list<int> output_shape,
+           const TensorValues& input_data,
+           const std::vector<TensorValues>& output_data) {
+  auto debug = [&](int i) {
+    std::stringstream ss;
+    ss << "for output tensor " << i << " axis=" << axis
+       << " and num_splits=" << num_splits;
+    return ss.str();
+  };
+  SplitOpModel m({TensorType_FLOAT32, input_shape}, num_splits);
+  m.SetInput(input_data);
+  m.SetAxis(axis);
+  m.Invoke();
+  for (int i = 0; i < num_splits; ++i) {
+    EXPECT_THAT(m.GetOutput(i), ElementsAreArray(output_data[i])) << debug(i);
+    EXPECT_THAT(m.GetOutputShape(i), ElementsAreArray(output_shape))
+        << debug(i);
+  }
+
+  SplitOpModel const_m({TensorType_FLOAT32, input_shape}, num_splits, axis);
+  const_m.SetInput(input_data);
+  const_m.Invoke();
+  for (int i = 0; i < num_splits; ++i) {
+    EXPECT_THAT(const_m.GetOutput(i), ElementsAreArray(output_data[i]))
+        << debug(i);
+    EXPECT_THAT(const_m.GetOutputShape(i), ElementsAreArray(output_shape))
+        << debug(i);
+  }
+}
+
+TEST(SplitOpTest, FourDimensional) {
+  Check(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            {1, 2, 3, 4, 5, 6, 7, 8},
+            {9, 10, 11, 12, 13, 14, 15, 16},
+        });
+  Check(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            {1, 2, 3, 4, 9, 10, 11, 12},
+            {5, 6, 7, 8, 13, 14, 15, 16},
+        });
+  Check(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            {1, 2, 5, 6, 9, 10, 13, 14},
+            {3, 4, 7, 8, 11, 12, 15, 16},
+        });
+  Check(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            {1, 3, 5, 7, 9, 11, 13, 15},
+            {2, 4, 6, 8, 10, 12, 14, 16},
+        });
+}
+
+TEST(SplitOpTest, OneDimensional) {
+  Check(/*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
+        {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
+}
+
+TEST(SplitOpTest, NegativeAxis) {
+  Check(/*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+        {
+            {1, 2, 3, 4, 5, 6, 7, 8},
+            {9, 10, 11, 12, 13, 14, 15, 16},
+        });
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/split_v.cc b/tensorflow/lite/kernels/split_v.cc
new file mode 100644
index 0000000000000000000000000000000000000000..060e3c5f79c808cd3c8d4b21efd7f2595a68b8e8
--- /dev/null
+++ b/tensorflow/lite/kernels/split_v.cc
@@ -0,0 +1,207 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace split_v {
+
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
+    params = reinterpret_cast<TfLiteSplitVParams*>(node->builtin_data);
+    input = GetInput(context, node, 0);
+    size_splits = GetInput(context, node, 1);
+    axis = GetInput(context, node, 2);
+  }
+  TfLiteSplitVParams* params;
+  const TfLiteTensor* input;
+  const TfLiteTensor* size_splits;
+  const TfLiteTensor* axis;
+};
+
+TfLiteStatus UseDynamicOutputTensors(TfLiteContext* context, TfLiteNode* node) {
+  for (int i = 0; i < NumOutputs(node); ++i) {
+    SetTensorToDynamic(GetOutput(context, node, i));
+  }
+  return kTfLiteOk;
+}
+
+template <typename T>
+void GetSizeSplitsVector(const TfLiteTensor* size_splits,
+                         std::vector<int64_t>* size_splits_vector) {
+  const auto num_elements = NumElements(size_splits);
+  for (int i = 0; i < num_elements; ++i) {
+    size_splits_vector->push_back(GetTensorData<T>(size_splits)[i]);
+  }
+}
+
+TfLiteStatus ResizeOutputTensors(TfLiteContext* context, TfLiteNode* node,
+                                 const TfLiteTensor* input,
+                                 const TfLiteTensor* size_splits,
+                                 const TfLiteTensor* axis) {
+  int axis_value = GetTensorData<int>(axis)[0];
+  if (axis_value < 0) {
+    axis_value += NumDimensions(input);
+  }
+
+  std::vector<int64_t> size_splits_vector;
+  if (size_splits->type == kTfLiteInt32) {
+    GetSizeSplitsVector<int32_t>(size_splits, &size_splits_vector);
+  } else if (size_splits->type == kTfLiteInt64) {
+    GetSizeSplitsVector<int64_t>(size_splits, &size_splits_vector);
+  } else {
+    context->ReportError(context, "size_splits only support type int32|int64.");
+    return kTfLiteError;
+  }
+
+  int minus_one_index = -1;
+  int64_t size_splits_sum = 0;
+
+  for (int i = 0; i < size_splits_vector.size(); ++i) {
+    if (size_splits_vector.at(i) == -1) {
+      if (minus_one_index == -1) {
+        minus_one_index = i;
+      } else {
+        context->ReportError(context,
+                             "The size_splits contains more than one -1.");
+      }
+    } else {
+      size_splits_sum += size_splits_vector.at(i);
+    }
+  }
+
+  const int input_size = SizeOfDimension(input, axis_value);
+
+  if (minus_one_index != -1) {
+    if (size_splits_sum > input_size) {
+      context->ReportError(
+          context,
+          "The sum of size_splits must be less than the dimension of value.");
+    } else {
+      size_splits_vector[minus_one_index] = input_size - size_splits_sum;
+    }
+  } else if (size_splits_sum != input_size) {
+    context->ReportError(
+        context,
+        "The size_splits must sum to the dimension of value along axis.");
+  }
+
+  for (int i = 0; i < NumOutputs(node); ++i) {
+    TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input->dims);
+    output_dims->data[axis_value] = size_splits_vector.at(i);
+    TfLiteTensor* output = GetOutput(context, node, i);
+    TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_dims));
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+
+  OpContext op_context(context, node);
+
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), op_context.params->num_splits);
+
+  auto input_type = op_context.input->type;
+  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
+                              input_type == kTfLiteUInt8 ||
+                              input_type == kTfLiteInt16);
+  for (int i = 0; i < NumOutputs(node); ++i) {
+    GetOutput(context, node, i)->type = input_type;
+  }
+
+  auto size_splits = op_context.size_splits;
+  TF_LITE_ENSURE_EQ(context, NumDimensions(size_splits), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), NumElements(size_splits));
+
+  // If we know the contents of the 'size_splits' tensor and the 'axis' tensor,
+  // resize all outputs. Otherwise, wait until Eval().
+  if (IsConstantTensor(op_context.size_splits) &&
+      IsConstantTensor(op_context.axis)) {
+    return ResizeOutputTensors(context, node, op_context.input,
+                               op_context.size_splits, op_context.axis);
+  } else {
+    return UseDynamicOutputTensors(context, node);
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+
+  // When the 'size_splits' and the 'axis' tensor is non-const we can't resize
+  // output tensors in Prepare(), and we have to do it now.
+  if (!IsConstantTensor(op_context.axis) ||
+      !IsConstantTensor(op_context.size_splits)) {
+    TF_LITE_ENSURE_OK(
+        context, ResizeOutputTensors(context, node, op_context.input,
+                                     op_context.size_splits, op_context.axis));
+  }
+
+  int axis_value = GetTensorData<int>(op_context.axis)[0];
+
+  // Use split function to build the outputs since they share the same logic.
+#define TF_LITE_SPLIT_V(scalar)                                     \
+  VectorOfTensors<scalar> all_outputs(*context, *node->outputs);    \
+  tflite::SplitParams op_params;                                    \
+  op_params.num_split = NumOutputs(node);                           \
+  op_params.axis = axis_value;                                      \
+  reference_ops::Split(op_params, GetTensorShape(op_context.input), \
+                       GetTensorData<scalar>(op_context.input),     \
+                       all_outputs.shapes(), all_outputs.data());
+  switch (op_context.input->type) {
+    case kTfLiteFloat32: {
+      TF_LITE_SPLIT_V(float);
+      break;
+    }
+    case kTfLiteUInt8: {
+      TF_LITE_SPLIT_V(uint8_t);
+      break;
+    }
+    case kTfLiteInt16: {
+      TF_LITE_SPLIT_V(int16_t);
+      break;
+    }
+    default:
+      context->ReportError(
+          context,
+          "Only float32, uint8 and int16 are currently supported, got %d.",
+          op_context.input->type);
+      return kTfLiteError;
+  }
+#undef TF_LITE_SPLIT_V
+
+  return kTfLiteOk;
+}
+
+}  // namespace split_v
+
+TfLiteRegistration* Register_SPLIT_V() {
+  static TfLiteRegistration r = {nullptr, nullptr, split_v::Prepare,
+                                 split_v::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/split_v_test.cc b/tensorflow/lite/kernels/split_v_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d1d36d6851c12d1b05374cda5ef32255e162875
--- /dev/null
+++ b/tensorflow/lite/kernels/split_v_test.cc
@@ -0,0 +1,175 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <initializer_list>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+constexpr int kAxisIsATensor = -1000;
+
+class SplitVOpModel : public SingleOpModel {
+ public:
+  SplitVOpModel(const TensorData& input, const TensorData& size_splits,
+                int num_splits, int axis) {
+    input_ = AddInput(input);
+    size_splits_ = AddInput(size_splits);
+    if (axis == kAxisIsATensor) {
+      axis_ = AddInput({TensorType_INT32, {1}});
+    } else {
+      axis_ = AddConstInput(TensorType_INT32, {axis}, {1});
+    }
+    for (int i = 0; i < num_splits; ++i) {
+      outputs_.push_back(AddOutput(input.type));
+    }
+    SetBuiltinOp(BuiltinOperator_SPLIT_V, BuiltinOptions_SplitVOptions,
+                 CreateSplitVOptions(builder_, num_splits).Union());
+    if (axis == kAxisIsATensor) {
+      BuildInterpreter(
+          {GetShape(input_), GetShape(size_splits_), GetShape(axis_)});
+    } else {
+      BuildInterpreter({GetShape(input_), GetShape(size_splits_), {}});
+    }
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  void SetSizeSplits(std::initializer_list<int> data) {
+    PopulateTensor(size_splits_, data);
+  }
+  void SetAxis(int axis) { PopulateTensor(axis_, {axis}); }
+
+  std::vector<float> GetOutput(int i) {
+    return ExtractVector<float>(outputs_[i]);
+  }
+  std::vector<int> GetOutputShape(int i) { return GetTensorShape(outputs_[i]); }
+
+ private:
+  int input_;
+  int size_splits_;
+  int axis_;
+  std::vector<int> outputs_;
+};
+
+// TODO(ruic): Add tests to test quantized values. b/119638735
+using TensorValues = std::initializer_list<float>;
+
+void Check(int axis, std::initializer_list<int> input_shape,
+           std::initializer_list<int> size_splits_shape,
+           std::vector<std::initializer_list<int>> output_shapes,
+           const TensorValues& input_data,
+           const std::initializer_list<int>& size_splits_data,
+           const std::vector<TensorValues>& output_data) {
+  int num_splits = size_splits_data.size();
+  SplitVOpModel m({TensorType_FLOAT32, input_shape},
+                  {TensorType_INT32, size_splits_shape}, num_splits,
+                  kAxisIsATensor);
+  m.SetInput(input_data);
+  m.SetSizeSplits(size_splits_data);
+  m.SetAxis(axis);
+  m.Invoke();
+  for (int i = 0; i < num_splits; ++i) {
+    EXPECT_THAT(m.GetOutput(i), ElementsAreArray(output_data[i]));
+    EXPECT_THAT(m.GetOutputShape(i), ElementsAreArray(output_shapes[i]));
+  }
+
+  SplitVOpModel const_m({TensorType_FLOAT32, input_shape},
+                        {TensorType_INT32, size_splits_shape}, num_splits,
+                        axis);
+  const_m.SetInput(input_data);
+  const_m.SetSizeSplits(size_splits_data);
+  const_m.Invoke();
+  for (int i = 0; i < num_splits; ++i) {
+    EXPECT_THAT(const_m.GetOutput(i), ElementsAreArray(output_data[i]));
+    EXPECT_THAT(const_m.GetOutputShape(i), ElementsAreArray(output_shapes[i]));
+  }
+}
+
+TEST(SplitVOpTest, TwoDimensional) {
+  // Input shape: {4, 3}
+  // size_splits: {1, 1, 3}
+  // axis: 0
+  // We should have 3 outpus with shapes respectively:
+  //  output 0 : {1, 3}
+  //  output 1 : {1, 3}
+  //  output 1 : {2, 3}
+  Check(/*axis=*/0, {4, 3}, {3}, {{1, 3}, {1, 3}, {2, 3}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {1, 1, 2},
+        {{1, 2, 3}, {4, 5, 6}, {7, 8, 9, 10, 11, 12}});
+}
+
+TEST(SplitVOpTest, FourDimensional) {
+  Check(/*axis=*/0, {2, 2, 2, 2}, {2}, {{1, 2, 2, 2}, {1, 2, 2, 2}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+        {
+            {1, 2, 3, 4, 5, 6, 7, 8},
+            {9, 10, 11, 12, 13, 14, 15, 16},
+        });
+  Check(/*axis=*/1, {2, 2, 2, 2}, {2}, {{2, 1, 2, 2}, {2, 1, 2, 2}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, -1},
+        {
+            {1, 2, 3, 4, 9, 10, 11, 12},
+            {5, 6, 7, 8, 13, 14, 15, 16},
+        });
+  Check(/*axis=*/2, {2, 2, 2, 2}, {2}, {{2, 2, 1, 2}, {2, 2, 1, 2}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+        {
+            {1, 2, 5, 6, 9, 10, 13, 14},
+            {3, 4, 7, 8, 11, 12, 15, 16},
+        });
+  Check(/*axis=*/3, {2, 2, 2, 2}, {2}, {{2, 2, 2, 1}, {2, 2, 2, 1}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+        {
+            {1, 3, 5, 7, 9, 11, 13, 15},
+            {2, 4, 6, 8, 10, 12, 14, 16},
+        });
+}
+
+TEST(SplitVOpTest, OneDimensional) {
+  Check(/*axis=*/0, {8}, {8}, {{1}, {1}, {1}, {1}, {1}, {1}, {1}, {1}},
+        {1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 1, 1, 1, 1, 1, 1},
+        {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
+}
+
+TEST(SplitVOpTest, OneDimensional2) {
+  Check(/*axis=*/0, {8}, {8}, {{1}, {1}, {1}, {1}, {1}, {1}, {2}, {0}},
+        {1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 1, 1, 1, 1, 2, -1},
+        {{1}, {2}, {3}, {4}, {5}, {6}, {7, 8}, {}});
+}
+
+TEST(SplitVOpTest, NegativeAxis) {
+  Check(/*axis=*/-4, {2, 2, 2, 2}, {2}, {{1, 2, 2, 2}, {1, 2, 2, 2}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+        {
+            {1, 2, 3, 4, 5, 6, 7, 8},
+            {9, 10, 11, 12, 13, 14, 15, 16},
+        });
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/squared_difference.cc b/tensorflow/lite/kernels/squared_difference.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59b53a6287dbbc863a61875be82090c1b9c6d442
--- /dev/null
+++ b/tensorflow/lite/kernels/squared_difference.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace squared_difference {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+struct OpData {
+  bool requires_broadcast;
+};
+
+template <typename T>
+T SquaredDifference(T input1, T input2) {
+  const T difference = input1 - input2;
+  return difference * difference;
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <typename T>
+void EvalSquaredDifference(TfLiteContext* context, TfLiteNode* node,
+                           const OpData* data, const TfLiteTensor* input1,
+                           const TfLiteTensor* input2, TfLiteTensor* output) {
+  if (data->requires_broadcast) {
+    reference_ops::BroadcastBinaryFunction4DSlow<T, T, T>(
+        GetTensorShape(input1), GetTensorData<T>(input1),
+        GetTensorShape(input2), GetTensorData<T>(input2),
+        GetTensorShape(output), GetTensorData<T>(output), SquaredDifference<T>);
+  } else {
+    reference_ops::BinaryFunction<T, T, T>(
+        GetTensorShape(input1), GetTensorData<T>(input1),
+        GetTensorShape(input2), GetTensorData<T>(input2),
+        GetTensorShape(output), GetTensorData<T>(output), SquaredDifference<T>);
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+    EvalSquaredDifference<float>(context, node, data, input1, input2, output);
+  } else if (output->type == kTfLiteInt32) {
+    EvalSquaredDifference<int32_t>(context, node, data, input1, input2, output);
+  } else {
+    context->ReportError(context,
+                         "SquaredDifference only supports FLOAT32, INT32 and "
+                         "quantized UINT8 now, got %d.",
+                         output->type);
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace squared_difference
+
+TfLiteRegistration* Register_SQUARED_DIFFERENCE() {
+  static TfLiteRegistration r = {
+      squared_difference::Init, squared_difference::Free,
+      squared_difference::Prepare, squared_difference::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/squared_difference_test.cc b/tensorflow/lite/kernels/squared_difference_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32bcab3b87f5f0cf5ad47724cc06c98f1a561e4a
--- /dev/null
+++ b/tensorflow/lite/kernels/squared_difference_test.cc
@@ -0,0 +1,157 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseSquaredDifferenceOpModel : public SingleOpModel {
+ public:
+  BaseSquaredDifferenceOpModel(const TensorData& input1,
+                               const TensorData& input2,
+                               const TensorData& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_SQUARED_DIFFERENCE,
+                 BuiltinOptions_SquaredDifferenceOptions,
+                 CreateSquaredDifferenceOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatSquaredDifferenceOpModel : public BaseSquaredDifferenceOpModel {
+ public:
+  using BaseSquaredDifferenceOpModel::BaseSquaredDifferenceOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class IntegerSquaredDifferenceOpModel : public BaseSquaredDifferenceOpModel {
+ public:
+  using BaseSquaredDifferenceOpModel::BaseSquaredDifferenceOpModel;
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+};
+
+TEST(FloatSquaredDifferenceOpTest, FloatType_SameShape) {
+  FloatSquaredDifferenceOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                  {TensorType_FLOAT32, {1, 2, 2, 1}},
+                                  {TensorType_FLOAT32, {}});
+  m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, -1.2, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.5, 0.2, -1.5, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({0.49, 0.0, 0.09, 0.09})));
+}
+
+TEST(FloatSquaredDifferenceOpTest, FloatType_VariousInputShapes) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSquaredDifferenceOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                                    {TensorType_FLOAT32, test_shapes[i]},
+                                    {TensorType_FLOAT32, {}});
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.3, 0.8, 1.1, -2.0});
+    m.PopulateTensor<float>(m.input2(), {1.0, 0.2, 0.6, 0.4, -1.0, -0.0});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({9.0, 0.0, 0.09, 0.16, 4.41, 4.0})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(FloatSquaredDifferenceOpTest, FloatType_WithBroadcast) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSquaredDifferenceOpModel m(
+        {TensorType_FLOAT32, test_shapes[i]},
+        {TensorType_FLOAT32, {}},  // always a scalar
+        {TensorType_FLOAT32, {}});
+    m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, 0.5, 0.8, 0.11, 1.1});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({0.09, 0.01, 0.16, 0.49, 0.0001, 1.0})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(IntegerSquaredDifferenceOpTest, IntegerType_SameShape) {
+  IntegerSquaredDifferenceOpModel m({TensorType_INT32, {1, 2, 2, 1}},
+                                    {TensorType_INT32, {1, 2, 2, 1}},
+                                    {TensorType_INT32, {}});
+  m.PopulateTensor<int32_t>(m.input1(), {-2, 2, -15, 8});
+  m.PopulateTensor<int32_t>(m.input2(), {5, -2, -3, 5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({49, 16, 144, 9}));
+}
+
+TEST(IntegerSquaredDifferenceOpTest, IntegerType_VariousInputShapes) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    IntegerSquaredDifferenceOpModel m({TensorType_INT32, test_shapes[i]},
+                                      {TensorType_INT32, test_shapes[i]},
+                                      {TensorType_INT32, {}});
+    m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 3, 8, 11, -20});
+    m.PopulateTensor<int32_t>(m.input2(), {1, 2, 6, 5, -5, -20});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray({441, 0, 9, 9, 256, 0}))
+        << "With shape number " << i;
+  }
+}
+
+TEST(IntegerSquaredDifferenceOpTest, IntegerType_WithBroadcast) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    IntegerSquaredDifferenceOpModel m(
+        {TensorType_INT32, test_shapes[i]},
+        {TensorType_INT32, {}},  // always a scalar
+        {TensorType_INT32, {}});
+    m.PopulateTensor<int32_t>(m.input1(), {-20, 10, 7, 3, 1, 13});
+    m.PopulateTensor<int32_t>(m.input2(), {3});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray({529, 49, 16, 0, 4, 100}))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/squeeze.cc b/tensorflow/lite/kernels/squeeze.cc
similarity index 92%
rename from tensorflow/contrib/lite/kernels/squeeze.cc
rename to tensorflow/lite/kernels/squeeze.cc
index 080c51cd18204ae3823cdceb80011e5ddcc32a2c..8be0c6b9de0810595ffb39f093eef438f8af459a 100644
--- a/tensorflow/contrib/lite/kernels/squeeze.cc
+++ b/tensorflow/lite/kernels/squeeze.cc
@@ -14,11 +14,11 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/squeeze_test.cc b/tensorflow/lite/kernels/squeeze_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/squeeze_test.cc
rename to tensorflow/lite/kernels/squeeze_test.cc
index a8aab88357cacbb72784a4bc6e860aeb47783eb3..4a02a8ee7e17ba8907faaaeb2ef258d0ffd56ebd 100644
--- a/tensorflow/contrib/lite/kernels/squeeze_test.cc
+++ b/tensorflow/lite/kernels/squeeze_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/lite/kernels/strided_slice.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/strided_slice.cc
rename to tensorflow/lite/kernels/strided_slice.cc
index 06b36dd1967a0276e3a98f09d5006824b64029cd..c797a98e9f1bda8595e6822638949bab48cb2eab 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/lite/kernels/strided_slice.cc
@@ -15,12 +15,12 @@ limitations under the License.
 #include <string.h>
 #include <cmath>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/kernels/strided_slice_test.cc
rename to tensorflow/lite/kernels/strided_slice_test.cc
index c5d4f9affb46c82b4dec15bc0653d7315d132335..122e01b99ecbed1255ea4b2d29e82b57f04be80c 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/lite/kernels/strided_slice_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/sub.cc
rename to tensorflow/lite/kernels/sub.cc
index 1be0c83f17a34c7ec12f03c0a29f0bd7fa1a96a5..06a3b3499a005f19bfd1461dfe861835f8331b96 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/lite/kernels/sub.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/sub_test.cc b/tensorflow/lite/kernels/sub_test.cc
similarity index 88%
rename from tensorflow/contrib/lite/kernels/sub_test.cc
rename to tensorflow/lite/kernels/sub_test.cc
index 5978c574d35492eda6b903fd83d95ecbd6b62148..41503300ab599fbfcfee425c41033dd3bc10d2ea 100644
--- a/tensorflow/contrib/lite/kernels/sub_test.cc
+++ b/tensorflow/lite/kernels/sub_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -99,7 +99,7 @@ TEST(FloatSubOpModel, ActivationRELU_N1_TO_1) {
 }
 
 TEST(FloatSubOpModel, VariousInputShapes) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
@@ -116,7 +116,7 @@ TEST(FloatSubOpModel, VariousInputShapes) {
 }
 
 TEST(FloatSubOpModel, WithBroadcast) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     FloatSubOpModel m({TensorType_FLOAT32, test_shapes[i]},
@@ -153,7 +153,7 @@ TEST(IntegerSubOpModel, ActivationRELU_N1_TO_1) {
 }
 
 TEST(IntegerSubOpModel, VariousInputShapes) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     IntegerSubOpModel m({TensorType_INT32, test_shapes[i]},
@@ -168,7 +168,7 @@ TEST(IntegerSubOpModel, VariousInputShapes) {
 }
 
 TEST(IntegerSubOpModel, WithBroadcast) {
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     IntegerSubOpModel m({TensorType_INT32, test_shapes[i]},
@@ -185,14 +185,13 @@ TEST(IntegerSubOpModel, WithBroadcast) {
 
 TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
-  std::vector<std::initializer_list<float>> inputs1 = {
+  std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.2, 0.2, 0.4, 0.7}, {-0.01, 0.2, 0.7, 0.3}};
-  std::vector<std::initializer_list<float>> inputs2 = {
+  std::vector<std::vector<float>> inputs2 = {
       {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.2}, {0.6, 0.4, -0.18, 0.5}};
-  std::vector<std::initializer_list<float>> results = {
-      {-0.5, -0.2, 0.0, 0.3},
-      {-0.8, -0.2, -0.1, 0.9},
-      {-0.61, -0.2, 0.88, -0.2}};
+  std::vector<std::vector<float>> results = {{-0.5, -0.2, 0.0, 0.3},
+                                             {-0.8, -0.2, -0.1, 0.9},
+                                             {-0.61, -0.2, 0.88, -0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
@@ -209,12 +208,12 @@ TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
 
 TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
-  std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
-                                                       {-0.8, 0.2, 0.7, 0.5}};
-  std::vector<std::initializer_list<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
-                                                       {0.6, 0.4, -0.8, 0.3}};
-  std::vector<std::initializer_list<float>> results = {{-1.0, -0.2, 0.0, 1.0},
-                                                       {-1.0, -0.2, 1.0, 0.2}};
+  std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+                                             {-0.8, 0.2, 0.7, 0.5}};
+  std::vector<std::vector<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+                                             {0.6, 0.4, -0.8, 0.3}};
+  std::vector<std::vector<float>> results = {{-1.0, -0.2, 0.0, 1.0},
+                                             {-1.0, -0.2, 1.0, 0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
     QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
                           {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
@@ -231,7 +230,7 @@ TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
 
 TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
@@ -250,7 +249,7 @@ TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
 
 TEST(QuantizedSubOpModel, QuantizedWithBroadcast) {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
-  std::vector<std::initializer_list<int>> test_shapes = {
+  std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
     QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/lite/kernels/svdf.cc
similarity index 95%
rename from tensorflow/contrib/lite/kernels/svdf.cc
rename to tensorflow/lite/kernels/svdf.cc
index 9903fd5c35794238912fb8e23f7da9314d43a08b..f07937140e9ac4abfbae47a1679ddbfba4d30938 100644
--- a/tensorflow/contrib/lite/kernels/svdf.cc
+++ b/tensorflow/lite/kernels/svdf.cc
@@ -23,12 +23,12 @@ limitations under the License.
 #include <iostream>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -40,7 +40,6 @@ namespace {
 struct OpData {
   int scratch_tensor_index;
   bool float_weights_time_initialized;
-
   int activation_state_tensor_index;
 };
 
@@ -147,15 +146,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const int rank = params->rank;
   const int batch_size = input->dims->data[0];
   const int num_filters = weights_feature->dims->data[0];
-  TF_LITE_ASSERT_EQ(num_filters % rank, 0);
+  TF_LITE_ENSURE_EQ(context, num_filters % rank, 0);
   const int num_units = num_filters / rank;
   const int memory_size = weights_time->dims->data[1];
-  TF_LITE_ASSERT_EQ(input->dims->data[1], weights_feature->dims->data[1]);
-  TF_LITE_ASSERT_EQ(weights_time->dims->data[0], num_filters);
+  TF_LITE_ENSURE_EQ(context, input->dims->data[1],
+                    weights_feature->dims->data[1]);
+  TF_LITE_ENSURE_EQ(context, weights_time->dims->data[0], num_filters);
 
   const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
   if (bias) {
-    TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units);
+    TF_LITE_ENSURE_EQ(context, bias->dims->data[0], num_units);
   }
 
   TfLiteTensor* activation_state =
@@ -216,9 +216,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2);
     scaling_factors->type = kTfLiteFloat32;
     scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-    scaling_factors_size->data[0] = batch_size;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+    int scaling_dims[1] = {batch_size};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = batch_size;
       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
                                                        scaling_factors_size));
     }
diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/lite/kernels/svdf_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/kernels/svdf_test.cc
rename to tensorflow/lite/kernels/svdf_test.cc
index 6d60dc63f401144a5eda84d9f88992ce1f9ee47e..8accaa465ca8a51f2b6e00648a6195f31039d3f7 100644
--- a/tensorflow/contrib/lite/kernels/svdf_test.cc
+++ b/tensorflow/lite/kernels/svdf_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..549ea78f5b45b20139b023552a98c3dcb0d75610
--- /dev/null
+++ b/tensorflow/lite/kernels/test_util.cc
@@ -0,0 +1,167 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/kernels/test_util.h"
+
+#include "tensorflow/lite/version.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tflite {
+
+using ::testing::FloatNear;
+using ::testing::Matcher;
+
+std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values,
+                                           float max_abs_error) {
+  std::vector<Matcher<float>> matchers;
+  matchers.reserve(values.size());
+  for (const float& v : values) {
+    matchers.emplace_back(FloatNear(v, max_abs_error));
+  }
+  return matchers;
+}
+
+std::vector<Matcher<std::complex<float>>> ArrayComplex64Near(
+    const std::vector<std::complex<float>>& values, float max_abs_error) {
+  std::vector<Matcher<std::complex<float>>> matchers;
+  matchers.reserve(values.size());
+  for (const std::complex<float>& v : values) {
+    matchers.emplace_back(
+        AllOf(::testing::Property(&std::complex<float>::real,
+                                  FloatNear(v.real(), max_abs_error)),
+              ::testing::Property(&std::complex<float>::imag,
+                                  FloatNear(v.imag(), max_abs_error))));
+  }
+  return matchers;
+}
+
+int SingleOpModel::AddInput(const TensorData& t, bool is_variable) {
+  int id = AddTensor<float>(t, {}, is_variable);
+  inputs_.push_back(id);
+  return id;
+}
+
+int SingleOpModel::AddNullInput() {
+  int id = kOptionalTensor;
+  inputs_.push_back(id);
+  return id;
+}
+
+int SingleOpModel::AddOutput(const TensorData& t) {
+  int id = AddTensor<float>(t, {});
+  outputs_.push_back(id);
+  return id;
+}
+
+void SingleOpModel::SetBuiltinOp(BuiltinOperator type,
+                                 BuiltinOptions builtin_options_type,
+                                 flatbuffers::Offset<void> builtin_options) {
+  opcodes_.push_back(CreateOperatorCode(builder_, type, 0));
+  operators_.push_back(CreateOperator(
+      builder_, /*opcode_index=*/0, builder_.CreateVector<int32_t>(inputs_),
+      builder_.CreateVector<int32_t>(outputs_), builtin_options_type,
+      builtin_options,
+      /*custom_options=*/0, CustomOptionsFormat_FLEXBUFFERS));
+}
+
+void SingleOpModel::SetCustomOp(
+    const string& name, const std::vector<uint8_t>& custom_option,
+    const std::function<TfLiteRegistration*()>& registration) {
+  custom_registrations_[name] = registration;
+  opcodes_.push_back(
+      CreateOperatorCodeDirect(builder_, BuiltinOperator_CUSTOM, name.data()));
+  operators_.push_back(CreateOperator(
+      builder_, /*opcode_index=*/0, builder_.CreateVector<int32_t>(inputs_),
+      builder_.CreateVector<int32_t>(outputs_), BuiltinOptions_NONE, 0,
+      builder_.CreateVector<uint8_t>(custom_option),
+      CustomOptionsFormat_FLEXBUFFERS));
+}
+
+void SingleOpModel::BuildInterpreter(std::vector<std::vector<int>> input_shapes,
+                                     bool allow_fp32_relax_to_fp16) {
+  auto opcodes = builder_.CreateVector(opcodes_);
+  auto operators = builder_.CreateVector(operators_);
+  auto tensors = builder_.CreateVector(tensors_);
+  auto inputs = builder_.CreateVector<int32_t>(inputs_);
+  auto outputs = builder_.CreateVector<int32_t>(outputs_);
+  // Create a single subgraph
+  std::vector<flatbuffers::Offset<SubGraph>> subgraphs;
+  auto subgraph = CreateSubGraph(builder_, tensors, inputs, outputs, operators);
+  subgraphs.push_back(subgraph);
+  auto subgraphs_flatbuffer = builder_.CreateVector(subgraphs);
+
+  auto buffers = builder_.CreateVector(buffers_);
+  auto description = builder_.CreateString("programmatic model");
+  builder_.Finish(CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                              subgraphs_flatbuffer, description, buffers));
+
+  auto* model = GetModel(builder_.GetBufferPointer());
+
+  if (!resolver_) {
+    auto resolver = new ops::builtin::BuiltinOpResolver();
+    for (const auto& reg : custom_registrations_) {
+      resolver->AddCustom(reg.first.data(), reg.second());
+    }
+    resolver_ = std::unique_ptr<OpResolver>(resolver);
+  }
+  CHECK(InterpreterBuilder(model, *resolver_)(&interpreter_) == kTfLiteOk);
+
+  CHECK(interpreter_ != nullptr);
+
+  int i = 0;
+  for (const auto& shape : input_shapes) {
+    int input_idx = interpreter_->inputs()[i++];
+    if (input_idx == kOptionalTensor) continue;
+    if (shape.empty()) continue;
+    CHECK(interpreter_->ResizeInputTensor(input_idx, shape) == kTfLiteOk);
+  }
+
+  interpreter_->SetAllowFp16PrecisionForFp32(allow_fp32_relax_to_fp16);
+
+  CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
+      << "Cannot allocate tensors";
+  interpreter_->ResetVariableTensors();
+
+  // Modify delegate with function.
+  if (apply_delegate_fn_) {
+    apply_delegate_fn_(interpreter_.get());
+  }
+}
+
+void SingleOpModel::Invoke() { CHECK(interpreter_->Invoke() == kTfLiteOk); }
+
+int32_t SingleOpModel::GetTensorSize(int index) const {
+  TfLiteTensor* t = interpreter_->tensor(index);
+  CHECK(t);
+  int total_size = 1;
+  for (int i = 0; i < t->dims->size; ++i) {
+    total_size *= t->dims->data[i];
+  }
+  return total_size;
+}
+
+template <>
+std::vector<string> SingleOpModel::ExtractVector(int index) {
+  TfLiteTensor* tensor_ptr = interpreter_->tensor(index);
+  CHECK(tensor_ptr != nullptr);
+  const int num_strings = GetStringCount(tensor_ptr);
+  std::vector<string> result;
+  result.reserve(num_strings);
+  for (int i = 0; i < num_strings; ++i) {
+    const auto str = GetString(tensor_ptr, i);
+    result.emplace_back(str.str, str.len);
+  }
+  return result;
+}
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..dadabb86abbe3b13da74fda9224e693d310ada26
--- /dev/null
+++ b/tensorflow/lite/kernels/test_util.h
@@ -0,0 +1,417 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_TEST_UTIL_H_
+#define TENSORFLOW_LITE_KERNELS_TEST_UTIL_H_
+
+#include <complex>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tflite {
+
+// A gmock matcher that check that elements of a float vector match to a given
+// tolerance.
+std::vector<::testing::Matcher<float>> ArrayFloatNear(
+    const std::vector<float>& values, float max_abs_error = 1e-5);
+
+// A gmock matcher that check that elements of a complex vector match to a given
+// tolerance.
+std::vector<::testing::Matcher<std::complex<float>>> ArrayComplex64Near(
+    const std::vector<std::complex<float>>& values, float max_abs_error = 1e-5);
+
+template <typename T>
+inline std::vector<T> Quantize(const std::vector<float>& data, float scale,
+                               int32_t zero_point) {
+  std::vector<T> q;
+  for (float f : data) {
+    q.push_back(static_cast<T>(std::max<float>(
+        std::numeric_limits<T>::min(),
+        std::min<float>(std::numeric_limits<T>::max(),
+                        std::round(zero_point + (f / scale))))));
+  }
+  return q;
+}
+
+template <typename T>
+inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
+                                     int32_t zero_point) {
+  std::vector<float> f;
+  for (T q : data) {
+    f.push_back(scale * (q - zero_point));
+  }
+  return f;
+}
+
+// A test model that contains a single operator. All operator inputs and
+// output are external to the model, so the tests can directly access them.
+// Typical usage:
+//    SingleOpModel m;
+//    int a = m.AddInput({TensorType_FLOAT32, a_shape});
+//    int b = m.AddInput({TensorType_FLOAT32, b_shape});
+//    int c = m.AddOutput({TensorType_FLOAT32, {}});
+//    m.SetBuiltinOp(...);
+//    m.BuildInterpreter({GetShape(a), GetShape(b)});
+//    m.PopulateTensor(a, {...});
+//    m.PopulateTensor(b, {...});
+//    m.Invoke();
+//    EXPECT_THAT(m.ExtractVector<float>(c), ArrayFloatNear({...}));
+//
+
+// A helper struct to construct test tensors. This is particularly useful for
+// quantized tensor which must have their scale and zero_point defined before
+// the actual data is known. This mimics what happens in practice: quantization
+// parameters are calculated during training.
+struct TensorData {
+  TensorType type;
+  std::vector<int> shape;
+  float min;
+  float max;
+  float scale;
+  int32_t zero_point;
+};
+
+class SingleOpResolver : public OpResolver {
+ public:
+  SingleOpResolver(const BuiltinOperator op, TfLiteRegistration* registration)
+      : op_(op), registration_(*registration) {
+    registration_.builtin_code = static_cast<int32_t>(op);
+    registration_.version = 1;
+  }
+  const TfLiteRegistration* FindOp(BuiltinOperator op,
+                                   int version) const override {
+    if (op == op_) {
+      return &registration_;
+    }
+    return nullptr;
+  }
+  const TfLiteRegistration* FindOp(const char* op, int version) const override {
+    return nullptr;
+  }
+
+ private:
+  const BuiltinOperator op_;
+  TfLiteRegistration registration_;
+};
+
+class SingleOpModel {
+ public:
+  SingleOpModel() {}
+  ~SingleOpModel() {}
+
+  // Set a function callback that is run right after graph is prepared
+  // that allows applying external delegates. This is useful for testing
+  // other runtimes like NN API or GPU.
+  void SetApplyDelegate(std::function<void(Interpreter*)> apply_delegate_fn) {
+    apply_delegate_fn_ = apply_delegate_fn;
+  }
+
+  // Copying or assignment is disallowed to simplify ownership semantics.
+  SingleOpModel(const SingleOpModel&) = delete;
+  SingleOpModel& operator=(const SingleOpModel&) = delete;
+
+  // Add a TensorType input tensor and return its index.
+  int AddInput(TensorType type, bool is_variable = false) {
+    return AddInput(TensorData{type}, is_variable);
+  }
+  int AddInput(const TensorData& t, bool is_variable = false);
+
+  // Templated version of AddConstInput().
+  template <typename T>
+  int AddConstInput(TensorType type, std::initializer_list<T> data,
+                    std::initializer_list<int> shape) {
+    int id = AddTensor(TensorData{type, shape}, data);
+    inputs_.push_back(id);
+    return id;
+  }
+
+  // Add a null input tensor (optional input) and return kOptionalTensor.
+  int AddNullInput();
+
+  // Add a TensorType output tensor and return its index.
+  int AddOutput(TensorType type) { return AddOutput(TensorData{type}); }
+  int AddOutput(const TensorData& t);
+
+  template <typename T>
+  void QuantizeAndPopulate(int index, const std::vector<float>& data) {
+    TfLiteTensor* t = interpreter_->tensor(index);
+    auto q = Quantize<T>(data, t->params.scale, t->params.zero_point);
+    PopulateTensor(index, 0, q.data(), q.data() + q.size());
+  }
+
+  void SymmetricQuantizeAndPopulate(int index, const std::vector<float>& data) {
+    TfLiteTensor* t = interpreter_->tensor(index);
+    const int length = data.size();
+    std::vector<int8_t> q(length);
+    float min, max, scaling_factor;
+    tensor_utils::SymmetricQuantizeFloats(data.data(), length, q.data(), &min,
+                                          &max, &scaling_factor);
+    // Update quantization params.
+    t->params.scale = scaling_factor;
+    t->params.zero_point = 0;
+    PopulateTensor(index, /*offset=*/0, reinterpret_cast<uint8_t*>(q.data()),
+                   reinterpret_cast<uint8_t*>(q.data() + q.size()));
+  }
+
+  const std::vector<int>& GetShape(int id) { return tensor_data_.at(id).shape; }
+
+  float GetScale(int id) { return tensor_data_.at(id).scale; }
+  int32_t GetZeroPoint(int id) { return tensor_data_.at(id).zero_point; }
+
+  // Define the operator in this model.
+  void SetBuiltinOp(BuiltinOperator type, BuiltinOptions builtin_options_type,
+                    flatbuffers::Offset<void> builtin_options);
+  void SetCustomOp(const string& name,
+                   const std::vector<uint8_t>& custom_option,
+                   const std::function<TfLiteRegistration*()>& registeration);
+
+  // Build the interpreter for this model. Also, resize and allocate all
+  // tensors given the shapes of the inputs.
+  void BuildInterpreter(std::vector<std::vector<int>> input_shapes,
+                        bool allow_fp32_relax_to_fp16 = false);
+
+  void Invoke();
+
+  void PopulateStringTensor(int index, const std::vector<string>& content) {
+    auto tensor = interpreter_->tensor(index);
+    DynamicBuffer buf;
+    for (const string& s : content) {
+      buf.AddString(s.data(), s.length());
+    }
+    buf.WriteToTensor(tensor, /*new_shape=*/nullptr);
+  }
+
+  // Populate the tensor given its index.
+  // TODO(b/110696148) clean up and merge with vector-taking variant below.
+  template <typename T>
+  void PopulateTensor(int index, const std::initializer_list<T>& data) {
+    T* v = interpreter_->typed_tensor<T>(index);
+    if (!v) {
+      auto* t = interpreter_->tensor(index);
+      CHECK(t) << "No tensor with index " << index << ".";
+      CHECK(t->data.raw) << "Empty data for tensor with index " << index << ".";
+      CHECK(v) << "Type mismatch for tensor with index " << index
+               << ". Requested " << typeToTfLiteType<T>() << ", got "
+               << t->type;
+    }
+    for (T f : data) {
+      *v = f;
+      ++v;
+    }
+  }
+
+  // Populate the tensor given its index.
+  // TODO(b/110696148) clean up and merge with initializer_list-taking variant
+  // above.
+  template <typename T>
+  void PopulateTensor(int index, const std::vector<T>& data) {
+    T* v = interpreter_->typed_tensor<T>(index);
+    if (!v) {
+      auto* t = interpreter_->tensor(index);
+      CHECK(t) << "No tensor with index " << index << ".";
+      CHECK(t->data.raw) << "Empty data for tensor with index " << index << ".";
+      CHECK(v) << "Type mismatch for tensor with index " << index
+               << ". Requested " << typeToTfLiteType<T>() << ", got "
+               << t->type;
+    }
+    for (T f : data) {
+      *v = f;
+      ++v;
+    }
+  }
+
+  // Partially populate the tensor, starting at the given offset.
+  template <typename T>
+  void PopulateTensor(int index, int offset, T* begin, T* end) {
+    T* v = interpreter_->typed_tensor<T>(index);
+    memcpy(v + offset, begin, (end - begin) * sizeof(T));
+  }
+
+  // Return a vector with the flattened contents of a tensor.
+  template <typename T>
+  std::vector<T> ExtractVector(int index) {
+    T* v = interpreter_->typed_tensor<T>(index);
+    CHECK(v);
+    return std::vector<T>(v, v + GetTensorSize(index));
+  }
+
+  std::vector<int> GetTensorShape(int index) {
+    std::vector<int> result;
+    TfLiteTensor* t = interpreter_->tensor(index);
+    for (int i = 0; i < t->dims->size; ++i) {
+      result.push_back(t->dims->data[i]);
+    }
+    return result;
+  }
+
+  void SetResolver(std::unique_ptr<OpResolver> resolver) {
+    resolver_ = std::move(resolver);
+  }
+
+ protected:
+  int32_t GetTensorSize(int index) const;
+
+  flatbuffers::FlatBufferBuilder builder_;
+  std::unique_ptr<tflite::Interpreter> interpreter_;
+  std::unique_ptr<OpResolver> resolver_;
+
+ private:
+  // TODO(gavinbelson): sync this method with
+  // //tensorflow/lite/kernels/internal/quantization_util.h?l=31
+  template <typename T>
+  std::pair<float, int32_t> QuantizationParams(float f_min, float f_max) {
+    // These are required by many quantized operations.
+    CHECK_LE(f_min, 0);
+    CHECK_GE(f_max, 0);
+    T q_min = std::numeric_limits<T>::min();
+    T q_max = std::numeric_limits<T>::max();
+    float range = q_max - q_min;
+    float scale = (f_max - f_min) / range;
+    int32_t zero_point = std::min(
+        q_max,
+        std::max(q_min, static_cast<T>(std::round(q_min - f_min / scale))));
+    return {scale, zero_point};
+  }
+
+  template <typename T>
+  int AddTensor(TensorData t, std::initializer_list<T> data,
+                bool is_variable = false) {
+    int id = tensors_.size();
+
+    // This is slightly different depending on whether we are adding a
+    // quantized or a regular tensor.
+    bool is_quantized = (t.min != 0 || t.max != 0 || t.scale != 0);
+
+    flatbuffers::Offset<QuantizationParameters> q_params = 0;
+
+    if (is_quantized) {
+      if (t.min != 0 || t.max != 0) {
+        // TODO(b/119422369): Handle signed int8 here.
+        if (t.type == TensorType_UINT8) {
+          std::tie(t.scale, t.zero_point) =
+              QuantizationParams<uint8_t>(t.min, t.max);
+        } else if (t.type == TensorType_INT32) {
+          std::tie(t.scale, t.zero_point) =
+              QuantizationParams<int32_t>(t.min, t.max);
+        } else if (t.type == TensorType_INT16) {
+          std::tie(t.scale, t.zero_point) =
+              QuantizationParams<int16_t>(t.min, t.max);
+        } else {
+          LOG(FATAL) << "No support for the requested quantized type";
+        }
+        t.min = 0;
+        t.max = 0;
+      }
+
+      q_params = CreateQuantizationParameters(
+          builder_, /*min=*/0, /*max=*/0,
+          builder_.CreateVector<float>({t.scale}),
+          builder_.CreateVector<int64_t>({t.zero_point}));
+    }
+
+    int buffer_id = 0;
+    if (data.size()) {
+      // Initialize buffers list with empty buffer to allow for non-const
+      // tensors.
+      if (buffers_.empty()) {
+        buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector({})));
+      }
+
+      // Add data as a Buffer to buffers list.
+      buffer_id = buffers_.size();
+      auto data_buffer =
+          builder_.CreateVector(reinterpret_cast<const uint8_t*>(data.begin()),
+                                sizeof(T) * data.size());
+      buffers_.push_back(CreateBuffer(builder_, data_buffer));
+    }
+
+    tensors_.push_back(CreateTensor(builder_,
+                                    builder_.CreateVector<int>(t.shape), t.type,
+                                    /*buffer=*/buffer_id,
+                                    /*name=*/0, q_params, is_variable));
+
+    tensor_data_[id] = t;
+
+    return id;
+  }
+
+  std::map<int, TensorData> tensor_data_;
+  std::vector<int32_t> inputs_;
+  std::vector<int32_t> outputs_;
+  std::vector<flatbuffers::Offset<Tensor>> tensors_;
+  std::vector<flatbuffers::Offset<OperatorCode>> opcodes_;
+  std::vector<flatbuffers::Offset<Operator>> operators_;
+  std::vector<flatbuffers::Offset<Buffer>> buffers_;
+  std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
+  // A function pointer that gets called after the interpreter is created but
+  // before evaluation happens. This is useful for applying a delegate.
+  std::function<void(Interpreter*)> apply_delegate_fn_;
+};
+
+// Base class for single op unit tests.
+// The tests are parameterized to test multiple kernels for a single op.
+// The parameters are strings like "optimized" and "reference" to have better
+// readability in test reports.
+//
+// To use this class:
+// * Define a constant map from strings to TfLiteRegistration.
+// * Implement a test class that inherits SingleOpTest.
+// * Instantiate the test cases with SingleOpTest::GetKernelTags helper
+//   function.
+// * Call GetRegistration to get the TfLiteRegistration to be used before
+//   building the interpreter.
+class SingleOpTest : public ::testing::TestWithParam<string> {
+ public:
+  static std::vector<string> GetKernelTags(
+      const std::map<string, TfLiteRegistration*>& kernel_map) {
+    std::vector<string> tags;
+    for (auto it : kernel_map) {
+      tags.push_back(it.first);
+    }
+    return tags;
+  }
+
+ protected:
+  virtual const std::map<string, TfLiteRegistration*>& GetKernelMap() = 0;
+  TfLiteRegistration* GetRegistration() {
+    return GetKernelMap().at(GetParam());
+  }
+};
+
+// Returns the corresponding TensorType given the type T.
+template <typename T>
+TensorType GetTensorType() {
+  if (std::is_same<T, float>::value) return TensorType_FLOAT32;
+  if (std::is_same<T, int32_t>::value) return TensorType_INT32;
+  if (std::is_same<T, uint8_t>::value) return TensorType_UINT8;
+  if (std::is_same<T, string>::value) return TensorType_STRING;
+  return TensorType_MIN;  // default value
+}
+
+// Strings have a special implementation that is in test_util.cc
+template <>
+std::vector<string> SingleOpModel::ExtractVector(int index);
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_TEST_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/test_util_test.cc b/tensorflow/lite/kernels/test_util_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/test_util_test.cc
rename to tensorflow/lite/kernels/test_util_test.cc
index 236580347254d336609a3081736f54e069b5cb5a..7abb7011f9d23e52f370479b5351c0c8ad1a125d 100644
--- a/tensorflow/contrib/lite/kernels/test_util_test.cc
+++ b/tensorflow/lite/kernels/test_util_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/lite/kernels/test_util.h"
 #include <gtest/gtest.h>
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/kernels/tile.cc b/tensorflow/lite/kernels/tile.cc
similarity index 86%
rename from tensorflow/contrib/lite/kernels/tile.cc
rename to tensorflow/lite/kernels/tile.cc
index 49421eb87081626c05c78ad809d4adea1cdf1e8b..1b7479747431ad903f64f3011045266c415dc6c5 100644
--- a/tensorflow/contrib/lite/kernels/tile.cc
+++ b/tensorflow/lite/kernels/tile.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 namespace tflite {
 namespace ops {
 namespace builtin {
@@ -63,7 +63,9 @@ TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node) {
           MultiplyShapeDims<int64_t>(*input->dims, multipliers,
                                      num_dimensions));
     default:
-      context->ReportError(context, "Tile not supported multiply tensor type.");
+      context->ReportError(
+          context, "Multipliers of type '%s' are not supported by tile.",
+          TfLiteTypeGetName(multipliers->type));
       return kTfLiteError;
   }
 }
@@ -143,10 +145,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   const TfLiteTensor* multipliers = GetInput(context, node, kInputMultipliers);
   // Only int32 and int64 multipliers type is supported.
-  TF_LITE_ENSURE_MSG(context,
-                     (multipliers->type == kTfLiteInt32) ||
-                         (multipliers->type == kTfLiteInt64),
-                     "Tile only supports int32 and int64 mutlipliers.");
+  if (multipliers->type != kTfLiteInt32 && multipliers->type != kTfLiteInt64) {
+    context->ReportError(context,
+                         "Multipliers of type '%s' are not supported by tile.",
+                         TfLiteTypeGetName(multipliers->type));
+    return kTfLiteError;
+  }
 
   if (IsConstantTensor(multipliers)) {
     TF_LITE_ENSURE_OK(context, ResizeOutput(context, node));
@@ -178,8 +182,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteInt64:
       Tile<int64_t>(*(input->dims), input, multipliers, output);
       break;
+    case kTfLiteBool:
+      Tile<bool>(*(input->dims), input, multipliers, output);
+      break;
     default:
-      context->ReportError(context, "Type is currently not supported by Tile.");
+      context->ReportError(context, "Type '%s' is not supported by tile.",
+                           TfLiteTypeGetName(output->type));
       return kTfLiteError;
   }
   return kTfLiteOk;
diff --git a/tensorflow/lite/kernels/tile_test.cc b/tensorflow/lite/kernels/tile_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a88ff66f0754549c96077d6edf655039caf62e34
--- /dev/null
+++ b/tensorflow/lite/kernels/tile_test.cc
@@ -0,0 +1,253 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+class TileOpModel : public SingleOpModel {
+ public:
+  TileOpModel(std::initializer_list<int> input_shape, TensorType input_type,
+              TensorType multiply_type) {
+    input_ = AddInput(input_type);
+    multipliers_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(input_type);
+    SetBuiltinOp(BuiltinOperator_TILE, BuiltinOptions_TileOptions, 0);
+    BuildInterpreter({input_shape, {static_cast<int>(input_shape.size())}});
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
+  }
+
+  void SetMultipliers(std::initializer_list<int32_t> data) {
+    PopulateTensor<int32_t>(multipliers_, data);
+  }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int multipliers_;
+  int output_;
+};
+
+TEST(TileTest, Float32Vector) {
+  TileOpModel m({3}, TensorType_FLOAT32, TensorType_INT32);
+  m.SetInput<float>({1.f, 2.f, 3.f});
+  m.SetMultipliers({2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({1.f, 2.f, 3.f, 1.f, 2.f, 3.f}));
+}
+
+TEST(TileTest, Float32Matrix) {
+  TileOpModel m({2, 3}, TensorType_FLOAT32, TensorType_INT32);
+  m.SetInput<float>({
+      11.f,
+      12.f,
+      13.f,
+      21.f,
+      22.f,
+      23.f,
+  });
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({
+                                        11.f,
+                                        12.f,
+                                        13.f,
+                                        21.f,
+                                        22.f,
+                                        23.f,
+                                        11.f,
+                                        12.f,
+                                        13.f,
+                                        21.f,
+                                        22.f,
+                                        23.f,
+                                    }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, Float32HighDimension) {
+  TileOpModel m({1, 2, 3}, TensorType_FLOAT32, TensorType_INT32);
+  m.SetInput<float>({
+      11.f,
+      12.f,
+      13.f,
+      21.f,
+      22.f,
+      23.f,
+  });
+  m.SetMultipliers({2, 3, 1});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutput<float>(),
+      ElementsAreArray({11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f,
+                        21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f,
+                        11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f,
+                        21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 6, 3}));
+}
+
+TEST(TileTest, Uint8Matrix) {
+  TileOpModel m({2, 3}, TensorType_UINT8, TensorType_INT32);
+  m.SetInput<uint8_t>({
+      11,
+      12,
+      13,
+      21,
+      22,
+      23,
+  });
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                      }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, Int32Matrix) {
+  TileOpModel m({2, 3}, TensorType_INT32, TensorType_INT32);
+  m.SetInput<int32_t>({
+      11,
+      12,
+      13,
+      21,
+      22,
+      23,
+  });
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                      }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, BooleanMatrix) {
+  TileOpModel m({2, 3}, TensorType_BOOL, TensorType_INT32);
+  m.SetInput<bool>({true, false, false, true, true, false});
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<bool>(),
+              ElementsAreArray({
+                  true, false, false, true, true, false,  // first tiletrue,
+                  true, false, false, true, true, false   // second tile
+              }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, Int64Matrix) {
+  TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT32);
+  m.SetInput<int64_t>({
+      11,
+      12,
+      13,
+      21,
+      22,
+      23,
+  });
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int64_t>(), ElementsAreArray({
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                      }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, Int64Matrix64Multipliers) {
+  TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT64);
+  m.SetInput<int64_t>({
+      11,
+      12,
+      13,
+      21,
+      22,
+      23,
+  });
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int64_t>(), ElementsAreArray({
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                          11,
+                                          12,
+                                          13,
+                                          21,
+                                          22,
+                                          23,
+                                      }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/topk_v2.cc b/tensorflow/lite/kernels/topk_v2.cc
similarity index 96%
rename from tensorflow/contrib/lite/kernels/topk_v2.cc
rename to tensorflow/lite/kernels/topk_v2.cc
index 6c38b6739e8751c50394b2ed5d3974fb7479ae5f..444b01e7b2e055ab4e26a2ea1dce28642dc430b7 100644
--- a/tensorflow/contrib/lite/kernels/topk_v2.cc
+++ b/tensorflow/lite/kernels/topk_v2.cc
@@ -14,11 +14,11 @@ limitations under the License.
 ==============================================================================*/
 #include <algorithm>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 namespace tflite {
 namespace ops {
 namespace builtin {
diff --git a/tensorflow/contrib/lite/kernels/topk_v2_test.cc b/tensorflow/lite/kernels/topk_v2_test.cc
similarity index 94%
rename from tensorflow/contrib/lite/kernels/topk_v2_test.cc
rename to tensorflow/lite/kernels/topk_v2_test.cc
index 16106fdafeeaaaf5206c2f4771f68a5075186e97..108b8123666aaddcc8ba8438bac82c91ce98d50d 100644
--- a/tensorflow/contrib/lite/kernels/topk_v2_test.cc
+++ b/tensorflow/lite/kernels/topk_v2_test.cc
@@ -14,11 +14,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/transpose.cc b/tensorflow/lite/kernels/transpose.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/transpose.cc
rename to tensorflow/lite/kernels/transpose.cc
index e42a30420b278a5053224d8f0c87d8eebe6e5241..7a6d320674ad1c8302f8bf3a9d1d5153223deed3 100644
--- a/tensorflow/contrib/lite/kernels/transpose.cc
+++ b/tensorflow/lite/kernels/transpose.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/lite/kernels/transpose_conv.cc b/tensorflow/lite/kernels/transpose_conv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59eee51068c0efcf26d66d933e13ee2f931463bc
--- /dev/null
+++ b/tensorflow/lite/kernels/transpose_conv.cc
@@ -0,0 +1,269 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/eigen_support.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace transpose_conv {
+
+// This file has 2 implementation of TransposeConv.
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+};
+
+constexpr int kOutputShapeTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kDataInputTensor = 2;
+constexpr int kOutputTensor = 0;
+
+const int kTensorNotAllocated = -1;
+
+struct OpData {
+  // IDs are the arbitrary identifiers used by TF Lite to identify and access
+  // memory buffers.
+  int im2col_id = kTensorNotAllocated;
+
+  // im2col is the only temporary currently tracked, therefore always index 0.
+  // If more temporaries are added, they should be properly tracked.
+  int32_t im2col_index = 0;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  // This is a builtin op, so we don't use the contents in 'buffer', if any.
+  // Instead, we allocate a new object to use as scratch space for im2col, and
+  // to carry information from Prepare() to Eval().
+  auto* data = new OpData;
+  eigen_support::IncrementUsageCounter(context);
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  eigen_support::DecrementUsageCounter(context);
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                const TfLiteTensor* shape_tensor,
+                                TfLiteTensor* output) {
+  // Currently only support int32 for output shape.
+  if (shape_tensor->type != kTfLiteInt32) {
+    context->ReportError(context, "Output shape is %d, not int32.",
+                         shape_tensor->type);
+    return kTfLiteError;
+  }
+
+  TfLiteIntArray* shape = TfLiteIntArrayCreate(NumElements(shape_tensor));
+  for (int i = 0; i < shape->size; ++i) {
+    shape->data[i] = GetTensorData<int32_t>(shape_tensor)[i];
+  }
+
+  return context->ResizeTensor(context, output, shape);
+}
+
+static TfLiteStatus AllocateIm2colTensorIfRequired(TfLiteContext* context,
+                                                   TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+  if (data->im2col_id == kTensorNotAllocated) {
+    context->AddTensors(context, 1, &data->im2col_id);
+    context->tensors[data->im2col_id].type = kTfLiteFloat32;
+  }
+
+  TfLiteIntArrayFree(node->temporaries);
+  node->temporaries = TfLiteIntArrayCreate(1);
+  node->temporaries->data[data->im2col_index] = data->im2col_id;
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus ResizeIm2ColTensor(TfLiteContext* context,
+                                const TfLiteTensor* output_shape,
+                                const TfLiteTensor* weights,
+                                const TfLiteTensor* input,
+                                TfLiteTensor* im2col) {
+  if (output_shape->type != kTfLiteInt32) {
+    context->ReportError(context, "im2col shape is %d, not int32.",
+                         output_shape->type);
+    return kTfLiteError;
+  }
+  TF_LITE_ENSURE_EQ(context, NumElements(output_shape), 4);
+  TfLiteIntArray* im2col_shape_array = TfLiteIntArrayCreate(4);
+  im2col_shape_array->data[0] = output_shape->data.i32[0];
+  im2col_shape_array->data[1] = output_shape->data.i32[1];
+  im2col_shape_array->data[2] = output_shape->data.i32[2];
+  const int input_depth = SizeOfDimension(input, 3);
+  const int filter_width = SizeOfDimension(weights, 1);
+  const int filter_height = SizeOfDimension(weights, 2);
+  im2col_shape_array->data[3] = input_depth * filter_height * filter_width;
+
+  im2col->type = input->type;
+  im2col->allocation_type = kTfLiteDynamic;
+  return context->ResizeTensor(context, im2col, im2col_shape_array);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Sanity checks on op
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  // Allocate Im2col Tensor
+  TF_LITE_ENSURE_STATUS(AllocateIm2colTensorIfRequired(context, node));
+
+  // Retrieve tensors
+  const TfLiteTensor* output_shape =
+      GetInput(context, node, kOutputShapeTensor);
+  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* input = GetInput(context, node, kDataInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  OpData* user_data = reinterpret_cast<OpData*>(node->user_data);
+  TfLiteTensor* im2col =
+      &context->tensors[node->temporaries->data[user_data->im2col_index]];
+
+  // Tensor sanity checks
+  TF_LITE_ENSURE_EQ(context, NumDimensions(output_shape), 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 4);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, weights->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  // Ensure that weights and inputs have the same channel dimension.
+  // Note: TOCO will reorder weights in the following format: OHWI.
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(input, 3),
+                    SizeOfDimension(weights, 3));
+
+  if (!IsConstantTensor(output_shape)) {
+    // Defer resizing until Eval().
+    SetTensorToDynamic(output);
+    SetTensorToDynamic(im2col);
+  } else {
+    TF_LITE_ENSURE_STATUS(ResizeOutputTensor(context, output_shape, output));
+    TF_LITE_ENSURE_STATUS(
+        ResizeIm2ColTensor(context, output_shape, weights, input, im2col));
+  }
+  return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  // Retrieve tensors (All should be allocated by now)
+  const TfLiteTensor* output_shape =
+      GetInput(context, node, kOutputShapeTensor);
+  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
+  const TfLiteTensor* input = GetInput(context, node, kDataInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  OpData* user_data = reinterpret_cast<OpData*>(node->user_data);
+  TfLiteTensor* im2col =
+      &context->tensors[node->temporaries->data[user_data->im2col_index]];
+  const auto* params =
+      reinterpret_cast<TfLiteTransposeConvParams*>(node->builtin_data);
+
+  // Resize any deferred dynamic tensors
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputTensor(context, output_shape, output));
+  }
+  if (IsDynamicTensor(im2col)) {
+    TF_LITE_ENSURE_OK(context, ResizeIm2ColTensor(context, output_shape,
+                                                  weights, input, im2col));
+  }
+
+  // Get height and width of the output image.
+  const int width = SizeOfDimension(output, 2);
+  const int height = SizeOfDimension(output, 1);
+  const int filter_width = SizeOfDimension(weights, 1);
+  const int filter_height = SizeOfDimension(weights, 2);
+
+  const int stride_width = params->stride_width;
+  const int stride_height = params->stride_height;
+
+  const TfLitePaddingValues& padding_size =
+      ComputePaddingHeightWidth(stride_height, stride_width, 1, height, width,
+                                filter_height, filter_width, params->padding);
+
+  // Currently only support float32.
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      tflite::ConvParams op_params;
+      op_params.padding_type = PaddingType::kSame;
+      op_params.padding_values.width = padding_size.width;
+      op_params.padding_values.height = padding_size.height;
+      op_params.stride_width = stride_width;
+      op_params.stride_height = stride_height;
+      switch (kernel_type) {
+        case kReference: {
+          reference_ops::TransposeConv(
+              op_params, GetTensorShape(input), GetTensorData<float>(input),
+              GetTensorShape(weights), GetTensorData<float>(weights),
+              GetTensorShape(output), GetTensorData<float>(output),
+              GetTensorShape(im2col), GetTensorData<float>(im2col));
+          break;
+        }
+        case kGenericOptimized: {
+          optimized_ops::TransposeConv(
+              op_params, GetTensorShape(input), GetTensorData<float>(input),
+              GetTensorShape(weights), GetTensorData<float>(weights),
+              GetTensorShape(output), GetTensorData<float>(output),
+              GetTensorShape(im2col), GetTensorData<float>(im2col));
+          break;
+        }
+      }
+      break;
+    }
+    default:
+      context->ReportError(context, "Type %d, not currently supported.",
+                           input->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace transpose_conv
+
+TfLiteRegistration* Register_TRANSPOSECONV_REF() {
+  static TfLiteRegistration r = {
+      transpose_conv::Init, transpose_conv::Free, transpose_conv::Prepare,
+      transpose_conv::Eval<transpose_conv::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_TRANSPOSECONV_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      transpose_conv::Init, transpose_conv::Free, transpose_conv::Prepare,
+      transpose_conv::Eval<transpose_conv::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_TRANSPOSE_CONV() {
+  return Register_TRANSPOSECONV_GENERIC_OPT();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/transpose_conv_test.cc b/tensorflow/lite/kernels/transpose_conv_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0520d84a30b50212bb3d86288236b49da523f4c2
--- /dev/null
+++ b/tensorflow/lite/kernels/transpose_conv_test.cc
@@ -0,0 +1,266 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_TRANSPOSECONV_REF();
+TfLiteRegistration* Register_TRANSPOSECONV_GENERIC_OPT();
+
+}  // namespace builtin
+}  // namespace ops
+
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class TransposeConvOpModel : public SingleOpModel {
+ public:
+  TransposeConvOpModel(TfLiteRegistration* registration,
+                       const TensorData& filter, const TensorData& input,
+                       const TensorData& output, Padding padding, int stride_w,
+                       int stride_h) {
+    // Just to be confusing, transpose_conv has an _input_ named "output_shape"
+    // that sets the shape of the output tensor of the op :). It must always be
+    // an int32 1D four element tensor.
+    output_shape_ = AddInput({TensorType_INT32, {4}});
+    filter_ = AddInput(filter);
+    input_ = AddInput(input);
+
+    output_ = AddOutput(output);
+
+    SetBuiltinOp(
+        BuiltinOperator_TRANSPOSE_CONV, BuiltinOptions_TransposeConvOptions,
+        CreateTransposeConvOptions(builder_, padding, stride_w, stride_h)
+            .Union());
+    resolver_ = absl::make_unique<SingleOpResolver>(
+        BuiltinOperator_TRANSPOSE_CONV, registration);
+    BuildInterpreter(
+        {GetShape(output_shape_), GetShape(input_), GetShape(filter_)});
+  }
+
+  void SetOutputShape(std::initializer_list<int> i) {
+    PopulateTensor(output_shape_, i);
+  }
+  void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int output_shape_;
+  int filter_;
+  int input_;
+  int output_;
+};
+
+const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_TRANSPOSECONV_REF()},
+    {"GenericOptimized", ops::builtin::Register_TRANSPOSECONV_GENERIC_OPT()},
+});
+
+class TransposeConvOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
+// Test case:
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 4, 4, 1 ]),
+//     tf.constant(np.arange(1, 10), shape=[ 3, 3, 1, 1 ], dtype=tf.float32),
+//     tf.constant(np.arange(1, 17), shape=[ 1, 4, 4, 1 ], dtype=tf.float32),
+//     [1, 1, 1, 1 ],
+//     "SAME")
+TEST_P(TransposeConvOpTest, SimpleTest) {
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 4, 4, 1}},
+                         {TensorType_FLOAT32, {1, 3, 3, 1}},
+                         {TensorType_FLOAT32, {}}, Padding_SAME, 1, 1);
+  m.SetOutputShape({1, 4, 4, 1});
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({29, 62, 83, 75, 99, 192, 237, 198, 207, 372,
+                                417, 330, 263, 446, 485, 365}));
+  // GetOutputShape() should always be same as m.SetOutputShape(...);
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+// Test case:
+// filter = tf.constant(np.arange(1, 19),
+//                      shape=[ 3, 3, 1, 2 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 4, 4, 1 ]),
+//     filter,
+//     tf.constant(np.arange(1, 33), shape=[ 1, 4, 4, 2 ], dtype=tf.float32),
+//     [1, 1, 1, 1 ],
+//     "SAME")
+// And filter value is derived by:
+// filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[18, 1])
+TEST_P(TransposeConvOpTest, TwoFiltersTest) {
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 4, 4, 2}},
+                         {TensorType_FLOAT32, {1, 3, 3, 2}},
+                         {TensorType_FLOAT32, {}}, Padding_SAME, 1, 1);
+  m.SetOutputShape({1, 4, 4, 1});
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
+  m.SetInput({1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+              17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({184, 412, 568, 528, 678, 1347, 1689, 1434, 1494,
+                                2715, 3057, 2442, 1968, 3352, 3652, 2760}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+// Test case:
+// filter = tf.constant(np.arange(1, 19),
+//                      shape=[ 3, 3, 1, 2 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 6, 6, 1 ]),
+//     filter,
+//     tf.constant(np.arange(1, 33), shape=[ 1, 4, 4, 2 ], dtype=tf.float32),
+//     [1, 1, 1, 1 ],
+//     "VALID")
+// And filter value is derived by:
+// filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[1, 18])
+TEST_P(TransposeConvOpTest, PaddingValidTest) {
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 4, 4, 2}},
+                         {TensorType_FLOAT32, {1, 3, 3, 2}},
+                         {TensorType_FLOAT32, {}}, Padding_VALID, 1, 1);
+  m.SetOutputShape({1, 6, 6, 1});
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18});
+  m.SetInput({1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+              17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({5,    22,   59,   101,  114,  83,   52,   184,  412,
+                        568,  528,  344,  237,  678,  1347, 1689, 1434, 879,
+                        597,  1494, 2715, 3057, 2442, 1431, 856,  1968, 3352,
+                        3652, 2760, 1548, 689,  1534, 2543, 2729, 2010, 1103}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 6, 6, 1}));
+}
+
+// Test case:
+// filter = tf.constant(np.arange(1, 10),
+//                      shape=[ 3, 3, 1, 1 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 5, 5, 1 ]),
+//     filter,
+//     tf.constant(np.arange(1, 5), shape=[ 1, 2, 2, 1 ], dtype=tf.float32),
+//     [1, 2, 2, 1 ],
+//     "VALID")
+TEST_P(TransposeConvOpTest, StrideValidTest) {
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 2, 2, 1}},
+                         {TensorType_FLOAT32, {1, 3, 3, 1}},
+                         {TensorType_FLOAT32, {}}, Padding_VALID, 2, 2);
+  m.SetOutputShape({1, 5, 5, 1});
+  m.SetFilter({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({1,  2,  5,  4,  6,  4,  5,  14, 10, 12, 10, 14, 36,
+                        24, 30, 12, 15, 34, 20, 24, 21, 24, 55, 32, 36}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 5, 5, 1}));
+}
+
+// Test case:
+// filter = tf.constant(np.arange(1, 19),
+//                      shape=[ 3, 3, 2, 1 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 5, 5, 2 ]),
+//     filter,
+//     tf.constant(np.arange(1, 5), shape=[ 1, 2, 2, 1 ], dtype=tf.float32),
+//     [1, 2, 2, 1 ],
+//     "VALID")
+TEST_P(TransposeConvOpTest, MultiChannelTest) {
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 2, 2, 1}},
+                         {TensorType_FLOAT32, {2, 3, 3, 1}},
+                         {TensorType_FLOAT32, {}}, Padding_VALID, 2, 2);
+  m.SetOutputShape({1, 5, 5, 2});
+  m.SetFilter({1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({1,  2,  3,  4,  7,  10,  6,   8,  10, 12, 7,  8,  9,
+                        10, 25, 28, 18, 20, 22,  24,  16, 20, 24, 28, 62, 72,
+                        42, 48, 54, 60, 21, 24,  27,  30, 61, 68, 36, 40, 44,
+                        48, 39, 42, 45, 48, 103, 110, 60, 64, 68, 72}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 5, 5, 2}));
+}
+
+// Test case:
+// filter = tf.constant(np.random.randint(1, 10, size=9),
+//                      shape=[ 3, 3, 1, 1 ],
+//                      dtype=tf.float32)
+// output = tf.nn.conv2d_backprop_input(
+//     tf.constant([ 1, 3, 4, 1 ]),
+//     filter,
+//     tf.constant([323, 521], shape=[ 1, 1, 2, 1], dtype=tf.float32),
+//     [1, 3, 3, 1 ],
+//     "SAME")
+// And filter value is derived by:
+// filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[-1])
+TEST_P(TransposeConvOpTest, AccuracyTest) {
+  TransposeConvOpModel m(GetRegistration(), {TensorType_FLOAT32, {1, 1, 2, 1}},
+                         {TensorType_FLOAT32, {1, 3, 3, 1}},
+                         {TensorType_FLOAT32, {}}, Padding_SAME, 3, 3);
+  m.SetOutputShape({1, 3, 4, 1});
+  m.SetFilter({9, 5, 6, 9, 8, 5, 3, 1, 4});
+  m.SetInput({323, 521});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {1615., 1938., 4689., 2605., 2584., 1615.,
+                                  4689., 4168., 323., 1292., 1563., 521.})));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 4, 1}));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    TransposeConvOpTest, TransposeConvOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/transpose_test.cc b/tensorflow/lite/kernels/transpose_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/transpose_test.cc
rename to tensorflow/lite/kernels/transpose_test.cc
index 79ef0a7c562d070f0f7e6edc4f5923894e1f896f..3ebaf3ca27ffd285ef86a81b2e63409fde565ef1 100644
--- a/tensorflow/contrib/lite/kernels/transpose_test.cc
+++ b/tensorflow/lite/kernels/transpose_test.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..497777b9aff6c6bd5084f2d36b505c998b12273b
--- /dev/null
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
@@ -0,0 +1,563 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/lstm_eval.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace unidirectional_sequence_lstm {
+
+// Input Tensors of size {max_time, n_batch, n_input}
+constexpr int kInputTensor = 0;
+
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kInputToInputWeightsTensor = 1;  // Optional
+constexpr int kInputToForgetWeightsTensor = 2;
+constexpr int kInputToCellWeightsTensor = 3;
+constexpr int kInputToOutputWeightsTensor = 4;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kRecurrentToInputWeightsTensor = 5;  // Optional
+constexpr int kRecurrentToForgetWeightsTensor = 6;
+constexpr int kRecurrentToCellWeightsTensor = 7;
+constexpr int kRecurrentToOutputWeightsTensor = 8;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kCellToInputWeightsTensor = 9;    // Optional
+constexpr int kCellToForgetWeightsTensor = 10;  // Optional
+constexpr int kCellToOutputWeightsTensor = 11;  // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kInputGateBiasTensor = 12;  // Optional
+constexpr int kForgetGateBiasTensor = 13;
+constexpr int kCellGateBiasTensor = 14;
+constexpr int kOutputGateBiasTensor = 15;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kProjectionWeightsTensor = 16;  // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kProjectionBiasTensor = 17;  // Optional
+
+// Stateful input tensors that are variables and will be modified by the Op.
+// Activation state tensor of size {n_batch, n_output}
+constexpr int kInputActivationStateTensor = 18;
+// Cell state tensor of size {n_batch, n_cell}
+constexpr int kInputCellStateTensor = 19;
+
+// Output tensors.
+constexpr int kOutputTensor = 0;
+
+// Temporary tensors
+enum TemporaryTensor {
+  kScratchBuffer = 0,
+  kInputQuantized = 1,
+  kOutputStateQuantized = 2,
+  kCellStateQuantized = 3,
+  kScalingFactors = 4,
+  kProductScalingFactors = 5,
+  kRecoveredCellWeights = 6,
+  kNumTemporaryTensors = 7
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* scratch_tensor_index = new int();
+  context->AddTensors(context, kNumTemporaryTensors, scratch_tensor_index);
+  return scratch_tensor_index;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<int*>(buffer);
+}
+
+// Check that input tensor dimensions matches with each other.
+TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
+                                        TfLiteNode* node, int n_input,
+                                        int n_output, int n_cell) {
+  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+
+  // Making sure clipping parameters have valid values.
+  // == 0 means no clipping
+  //  > 0 means clipping
+  TF_LITE_ENSURE(context, params->cell_clip >= 0);
+  TF_LITE_ENSURE(context, params->proj_clip >= 0);
+
+  const TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  if (input_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
+  }
+
+  const TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
+
+  const TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
+
+  const TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  if (recurrent_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
+                      n_cell);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
+                      n_output);
+  }
+
+  const TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
+                    n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
+                    n_output);
+
+  const TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
+                    n_output);
+
+  // We make sure the input-gate's parameters are either both present (regular
+  // LSTM) or not at all (CIFG-LSTM).
+  const bool cifg_weights_all_or_none =
+      ((input_to_input_weights != nullptr) &&
+       (recurrent_to_input_weights != nullptr)) ||
+      ((input_to_input_weights == nullptr) &&
+       (recurrent_to_input_weights == nullptr));
+  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
+
+  const TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  if (cell_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
+  }
+
+  const TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  if (cell_to_forget_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
+  }
+
+  const TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+  if (cell_to_output_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
+  }
+
+  // Making sure the peephole weights are there all or none.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool peephole_weights_all_or_none =
+      ((cell_to_input_weights != nullptr || use_cifg) &&
+       (cell_to_forget_weights != nullptr) &&
+       (cell_to_output_weights != nullptr)) ||
+      ((cell_to_input_weights == nullptr) &&
+       (cell_to_forget_weights == nullptr) &&
+       (cell_to_output_weights == nullptr));
+  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
+
+  // Make sure the input gate bias is present only when not a CIFG-LSTM.
+  const TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
+  }
+
+  const TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
+
+  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+
+  const TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
+
+  const TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  if (projection_weights) {
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
+  }
+
+  const TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+  if (projection_bias) {
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
+  }
+
+  // Making sure the projection tensors are consistent:
+  // 1) If projection weight is not present, then projection bias should not be
+  // present.
+  // 2) If projection weight is present, then projection bias is optional.
+  // TODO(ghodrat): make sure this is correct.
+  const bool projecton_tensors_consistent =
+      ((projection_weights != nullptr) || (projection_bias == nullptr));
+  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
+
+  return kTfLiteOk;
+}
+
+// Resize the output and  state tensors based on the sizes of the input tensors.
+// Allocate a temprory scratch tensor. Also check that the sizes of the input
+// tensors match each other.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 20);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+
+  // Inferring batch size, number of outputs and sequence length and
+  // number of cells from the input tensors.
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+  TF_LITE_ENSURE(context, input->dims->size > 1);
+  const auto* params =
+      reinterpret_cast<TfLiteUnidirectionalSequenceLSTMParams*>(
+          node->builtin_data);
+  const bool time_major = params->time_major;
+  const int n_batch = time_major ? input->dims->data[1] : input->dims->data[0];
+  const int n_input = input->dims->data[2];
+
+  const TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+  const int n_cell = input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
+
+  const TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
+                    n_cell);
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Check that input tensor dimensions matches with each other.
+  TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input,
+                                                        n_output, n_cell));
+
+  // Get the pointer to output, activation_state and cell_state buffer tensors.
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TfLiteTensor* activation_state =
+      GetVariableInput(context, node, kInputActivationStateTensor);
+  TfLiteTensor* cell_state =
+      GetVariableInput(context, node, kInputCellStateTensor);
+
+  // Check the shape of input state tensors.
+  // These tensor may be 1D or 2D. It's fine as long as the total size is
+  // correct.
+  TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output);
+  TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell);
+
+  // Resize the output tensors.
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
+  output_size->data[input->dims->size - 1] = n_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size));
+
+  // The weights are of consistent type, so it suffices to check one.
+  // TODO(mirkov): create a utility/macro for this check, so all Ops can use it.
+  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
+                             input->type == kTfLiteFloat32);
+
+  TfLiteIntArrayFree(node->temporaries);
+  if (is_hybrid_op) {
+    node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
+  } else {
+    node->temporaries = TfLiteIntArrayCreate(1);
+  }
+  node->temporaries->data[0] = *scratch_tensor_index;
+
+  // Create a scratch buffer tensor.
+  TfLiteTensor* scratch_buffer = GetTemporary(context, node, kScratchBuffer);
+  scratch_buffer->type = input->type;
+  scratch_buffer->allocation_type = kTfLiteArenaRw;
+
+  const TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+  scratch_buffer_size->data[0] = n_batch;
+  if (use_cifg) {
+    // Reserving space for Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 3;
+  } else {
+    // Reserving space for Input, Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 4;
+  }
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+                                                   scratch_buffer_size));
+
+  if (is_hybrid_op) {
+    // Allocate temporary tensors to store quantized values of input,
+    // activation_state and cell_state tensors.
+    node->temporaries->data[kInputQuantized] =
+        *scratch_tensor_index + kInputQuantized;
+    TfLiteTensor* input_quantized =
+        GetTemporary(context, node, kInputQuantized);
+    input_quantized->type = kTfLiteUInt8;
+    input_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
+      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
+                                                       input_quantized_size));
+    }
+    node->temporaries->data[kOutputStateQuantized] =
+        *scratch_tensor_index + kOutputStateQuantized;
+    TfLiteTensor* activation_state_quantized =
+        GetTemporary(context, node, kOutputStateQuantized);
+    activation_state_quantized->type = kTfLiteUInt8;
+    activation_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
+                             activation_state->dims)) {
+      TfLiteIntArray* activation_state_quantized_size =
+          TfLiteIntArrayCopy(activation_state->dims);
+      TF_LITE_ENSURE_OK(
+          context, context->ResizeTensor(context, activation_state_quantized,
+                                         activation_state_quantized_size));
+    }
+    node->temporaries->data[kCellStateQuantized] =
+        *scratch_tensor_index + kCellStateQuantized;
+    TfLiteTensor* cell_state_quantized =
+        GetTemporary(context, node, kCellStateQuantized);
+    cell_state_quantized->type = kTfLiteUInt8;
+    cell_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
+      TfLiteIntArray* cell_state_quantized_size =
+          TfLiteIntArrayCopy(cell_state->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, cell_state_quantized,
+                                              cell_state_quantized_size));
+    }
+
+    // Allocate temporary tensors to store scaling factors and product scaling
+    // factors. The latter is a convenience storage which allows to quantize
+    // a vector once (which produces the scaling factors) and multiply it with
+    // different matrices (which requires multiplying the scaling factors with
+    // the scaling factor of the matrix).
+    node->temporaries->data[kScalingFactors] =
+        *scratch_tensor_index + kScalingFactors;
+    TfLiteTensor* scaling_factors =
+        GetTemporary(context, node, kScalingFactors);
+    scaling_factors->type = kTfLiteFloat32;
+    scaling_factors->allocation_type = kTfLiteArenaRw;
+    int scaling_dims[1] = {n_batch};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = n_batch;
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
+                                                       scaling_factors_size));
+    }
+    node->temporaries->data[kProductScalingFactors] =
+        *scratch_tensor_index + kProductScalingFactors;
+    TfLiteTensor* prod_scaling_factors =
+        GetTemporary(context, node, kProductScalingFactors);
+    prod_scaling_factors->type = kTfLiteFloat32;
+    prod_scaling_factors->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqualsArray(prod_scaling_factors->dims, 1,
+                                   scaling_dims)) {
+      TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
+      prod_scaling_factors_size->data[0] = n_batch;
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, prod_scaling_factors,
+                                              prod_scaling_factors_size));
+    }
+
+    // Allocate a temporary tensor to store the recovered cell weights. Since
+    // this is used for diagonal matrices, only need to store n_cell values.
+    node->temporaries->data[kRecoveredCellWeights] =
+        *scratch_tensor_index + kRecoveredCellWeights;
+    TfLiteTensor* recovered_cell_weights =
+        GetTemporary(context, node, kRecoveredCellWeights);
+    recovered_cell_weights->type = kTfLiteFloat32;
+    recovered_cell_weights->allocation_type = kTfLiteArenaRw;
+    int recovered_cell_dims[1] = {n_cell};
+    if (!TfLiteIntArrayEqualsArray(recovered_cell_weights->dims, 1,
+                                   recovered_cell_dims)) {
+      TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1);
+      recovered_cell_weights_size->data[0] = n_cell;
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, recovered_cell_weights,
+                                              recovered_cell_weights_size));
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* params =
+      reinterpret_cast<TfLiteUnidirectionalSequenceLSTMParams*>(
+          node->builtin_data);
+  const bool time_major = params->time_major;
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+
+  const TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  const TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  const TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  const TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+
+  const TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  const TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  const TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  const TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+
+  const TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  const TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  const TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+
+  const TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  const TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  const TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+
+  const TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  const TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+
+  // Index the scratch buffers pointers to the global scratch buffer.
+  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
+
+  TfLiteTensor* activation_state =
+      GetVariableInput(context, node, kInputActivationStateTensor);
+  TfLiteTensor* cell_state =
+      GetVariableInput(context, node, kInputCellStateTensor);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Copy out the LSTM specific params so they can be passed in the function.
+  TfLiteLSTMParams lstm_params;
+  lstm_params.activation = params->activation;
+  lstm_params.cell_clip = params->cell_clip;
+  lstm_params.proj_clip = params->proj_clip;
+
+  switch (input_to_output_weights->type) {
+    case kTfLiteFloat32: {
+      return lstm_eval::EvalFloat(
+          input, input_to_input_weights, input_to_forget_weights,
+          input_to_cell_weights, input_to_output_weights,
+          recurrent_to_input_weights, recurrent_to_forget_weights,
+          recurrent_to_cell_weights, recurrent_to_output_weights,
+          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          /*aux_input=*/nullptr,
+          /*aux_input_to_input_weights=*/nullptr,
+          /*aux_input_to_forget_weights=*/nullptr,
+          /*aux_input_to_cell_weights=*/nullptr,
+          /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
+          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
+          projection_bias, &lstm_params, /*forward_sequence=*/true, time_major,
+          /*output_offset=*/0, scratch_buffer, activation_state, cell_state,
+          output);
+    }
+    case kTfLiteUInt8: {
+      TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
+      TfLiteTensor* activation_state_quantized =
+          GetTemporary(context, node, /*index=*/2);
+      TfLiteTensor* cell_state_quantized =
+          GetTemporary(context, node, /*index=*/3);
+      TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
+      TfLiteTensor* prod_scaling_factors =
+          GetTemporary(context, node, /*index=*/5);
+      TfLiteTensor* recovered_cell_weights =
+          GetTemporary(context, node, /*index=*/6);
+      return lstm_eval::EvalHybrid(
+          input, input_to_input_weights, input_to_forget_weights,
+          input_to_cell_weights, input_to_output_weights,
+          recurrent_to_input_weights, recurrent_to_forget_weights,
+          recurrent_to_cell_weights, recurrent_to_output_weights,
+          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          /*aux_input=*/nullptr,
+          /*aux_input_to_input_weights=*/nullptr,
+          /*aux_input_to_forget_weights=*/nullptr,
+          /*aux_input_to_cell_weights=*/nullptr,
+          /*aux_input_to_output_weights=*/nullptr, input_gate_bias,
+          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
+          projection_bias, &lstm_params, /*forward_sequence=*/true, time_major,
+          /*output_offset=*/0, scratch_buffer, scaling_factors,
+          prod_scaling_factors, recovered_cell_weights, input_quantized,
+          /*aux_input_quantized=*/nullptr, activation_state_quantized,
+          cell_state_quantized, activation_state, cell_state, output);
+    }
+    default:
+      context->ReportError(context, "Type %d is not currently supported.",
+                           input_to_output_weights->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+}  // namespace unidirectional_sequence_lstm
+
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM() {
+  static TfLiteRegistration r = {unidirectional_sequence_lstm::Init,
+                                 unidirectional_sequence_lstm::Free,
+                                 unidirectional_sequence_lstm::Prepare,
+                                 unidirectional_sequence_lstm::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
similarity index 89%
rename from tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc
rename to tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
index cd3aac053262c37433c1dafe35f8d2b49c2b76ff..ae7dd6b2bee1da06d9dc48f259585f541c72842f 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
@@ -32,7 +32,7 @@ using ::testing::ElementsAreArray;
 class UnidirectionalLSTMOpModel : public SingleOpModel {
  public:
   UnidirectionalLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
-                            int sequence_length, bool use_cifg,
+                            int sequence_length, bool time_major, bool use_cifg,
                             bool use_peephole, bool use_projection_weights,
                             bool use_projection_bias, float cell_clip,
                             float proj_clip,
@@ -111,78 +111,79 @@ class UnidirectionalLSTMOpModel : public SingleOpModel {
     output_ = AddOutput(TensorType_FLOAT32);
 
     SetBuiltinOp(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
-                 BuiltinOptions_LSTMOptions,
-                 CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
-                                   cell_clip, proj_clip)
+                 BuiltinOptions_UnidirectionalSequenceLSTMOptions,
+                 CreateUnidirectionalSequenceLSTMOptions(
+                     builder_, ActivationFunctionType_TANH, cell_clip,
+                     proj_clip, time_major)
                      .Union());
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::initializer_list<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::initializer_list<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::initializer_list<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::initializer_list<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::initializer_list<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::initializer_list<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::initializer_list<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_output_weights_, f);
   }
 
-  void SetInputGateBias(std::initializer_list<float> f) {
+  void SetInputGateBias(const std::vector<float>& f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::initializer_list<float> f) {
+  void SetForgetGateBias(const std::vector<float>& f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::initializer_list<float> f) {
+  void SetCellBias(const std::vector<float>& f) {
     PopulateTensor(cell_bias_, f);
   }
 
-  void SetOutputGateBias(std::initializer_list<float> f) {
+  void SetOutputGateBias(const std::vector<float>& f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::initializer_list<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     PopulateTensor(projection_weights_, f);
   }
 
-  void SetProjectionBias(std::initializer_list<float> f) {
+  void SetProjectionBias(const std::vector<float>& f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -240,59 +241,59 @@ class HybridUnidirectionalLSTMOpModel : public UnidirectionalLSTMOpModel {
  public:
   HybridUnidirectionalLSTMOpModel(
       int n_batch, int n_input, int n_cell, int n_output, int sequence_length,
-      bool use_cifg, bool use_peephole, bool use_projection_weights,
-      bool use_projection_bias, float cell_clip, float proj_clip,
-      const std::vector<std::vector<int>>& input_shapes)
+      bool time_major, bool use_cifg, bool use_peephole,
+      bool use_projection_weights, bool use_projection_bias, float cell_clip,
+      float proj_clip, const std::vector<std::vector<int>>& input_shapes)
       : UnidirectionalLSTMOpModel(
-            n_batch, n_input, n_cell, n_output, sequence_length, use_cifg,
-            use_peephole, use_projection_weights, use_projection_bias,
+            n_batch, n_input, n_cell, n_output, sequence_length, time_major,
+            use_cifg, use_peephole, use_projection_weights, use_projection_bias,
             cell_clip, proj_clip, input_shapes, TensorType_UINT8) {}
 
-  void SetInputToInputWeights(std::initializer_list<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::initializer_list<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::initializer_list<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::initializer_list<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::initializer_list<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::initializer_list<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::initializer_list<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
   }
 
-  void SetProjectionWeights(std::initializer_list<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     SymmetricQuantizeAndPopulate(projection_weights_, f);
   }
 };
@@ -300,22 +301,22 @@ class HybridUnidirectionalLSTMOpModel : public UnidirectionalLSTMOpModel {
 class BaseLstmTest : public ::testing::Test {
  protected:
   // Weights of the LSTM model. Some are optional.
-  std::initializer_list<float> input_to_input_weights_;
-  std::initializer_list<float> input_to_cell_weights_;
-  std::initializer_list<float> input_to_forget_weights_;
-  std::initializer_list<float> input_to_output_weights_;
-  std::initializer_list<float> input_gate_bias_;
-  std::initializer_list<float> cell_gate_bias_;
-  std::initializer_list<float> forget_gate_bias_;
-  std::initializer_list<float> output_gate_bias_;
-  std::initializer_list<float> recurrent_to_input_weights_;
-  std::initializer_list<float> recurrent_to_cell_weights_;
-  std::initializer_list<float> recurrent_to_forget_weights_;
-  std::initializer_list<float> recurrent_to_output_weights_;
-  std::initializer_list<float> cell_to_input_weights_;
-  std::initializer_list<float> cell_to_forget_weights_;
-  std::initializer_list<float> cell_to_output_weights_;
-  std::initializer_list<float> projection_weights_;
+  std::vector<float> input_to_input_weights_;
+  std::vector<float> input_to_cell_weights_;
+  std::vector<float> input_to_forget_weights_;
+  std::vector<float> input_to_output_weights_;
+  std::vector<float> input_gate_bias_;
+  std::vector<float> cell_gate_bias_;
+  std::vector<float> forget_gate_bias_;
+  std::vector<float> output_gate_bias_;
+  std::vector<float> recurrent_to_input_weights_;
+  std::vector<float> recurrent_to_cell_weights_;
+  std::vector<float> recurrent_to_forget_weights_;
+  std::vector<float> recurrent_to_output_weights_;
+  std::vector<float> cell_to_input_weights_;
+  std::vector<float> cell_to_forget_weights_;
+  std::vector<float> cell_to_output_weights_;
+  std::vector<float> projection_weights_;
 
   // LSTM input is stored as num_batch x num_inputs vector.
   std::vector<std::vector<float>> lstm_input_;
@@ -325,21 +326,32 @@ class BaseLstmTest : public ::testing::Test {
   // Compares output up to tolerance to the result of the lstm given the input.
   void VerifyGoldens(const std::vector<std::vector<float>>& input,
                      const std::vector<std::vector<float>>& output,
-                     UnidirectionalLSTMOpModel* lstm, float tolerance = 1e-5) {
+                     UnidirectionalLSTMOpModel* lstm, float tolerance = 1e-5,
+                     bool time_major = true) {
     const int num_batches = input.size();
     EXPECT_GT(num_batches, 0);
     const int num_inputs = lstm->num_inputs();
     EXPECT_GT(num_inputs, 0);
     const int input_sequence_size = input[0].size() / num_inputs;
     EXPECT_GT(input_sequence_size, 0);
-    // Feed the whole sequence as input.
-    for (int i = 0; i < input_sequence_size; ++i) {
+    if (time_major) {
+      // Feed the whole sequence as input.
+      for (int i = 0; i < input_sequence_size; ++i) {
+        for (int b = 0; b < num_batches; ++b) {
+          const float* batch_start = input[b].data() + i * num_inputs;
+          const float* batch_end = batch_start + num_inputs;
+
+          lstm->SetInput(((i * num_batches) + b) * num_inputs, batch_start,
+                         batch_end);
+        }
+      }
+    } else {
       for (int b = 0; b < num_batches; ++b) {
-        const float* batch_start = input[b].data() + i * num_inputs;
-        const float* batch_end = batch_start + num_inputs;
+        const float* batch_start = input[b].data();
+        const float* batch_end = batch_start + input_sequence_size * num_inputs;
 
-        lstm->SetInput(((i * num_batches) + b) * lstm->num_inputs(),
-                       batch_start, batch_end);
+        lstm->SetInput(b * input_sequence_size * num_inputs, batch_start,
+                       batch_end);
       }
     }
 
@@ -348,15 +360,25 @@ class BaseLstmTest : public ::testing::Test {
     const int num_outputs = lstm->num_outputs();
     EXPECT_GT(num_outputs, 0);
     std::vector<float> expected;
-    for (int i = 0; i < input_sequence_size; ++i) {
+
+    if (time_major) {
+      for (int i = 0; i < input_sequence_size; ++i) {
+        for (int b = 0; b < num_batches; ++b) {
+          const float* golden_start_batch = output[b].data() + i * num_outputs;
+          const float* golden_end_batch = golden_start_batch + num_outputs;
+
+          expected.insert(expected.end(), golden_start_batch, golden_end_batch);
+        }
+      }
+    } else {
       for (int b = 0; b < num_batches; ++b) {
-        const float* golden_start_batch = output[b].data() + i * num_outputs;
-        const float* golden_end_batch = golden_start_batch + num_outputs;
+        const float* golden_batch_start = output[b].data();
+        const float* golden_batch_end =
+            golden_batch_start + input_sequence_size * num_outputs;
 
-        expected.insert(expected.end(), golden_start_batch, golden_end_batch);
+        expected.insert(expected.end(), golden_batch_start, golden_batch_end);
       }
     }
-
     EXPECT_THAT(lstm->GetOutput(),
                 ElementsAreArray(ArrayFloatNear(expected, tolerance)));
   }
@@ -421,7 +443,7 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
 
   UnidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length,
-      /*use_cifg=*/false, /*use_peephole=*/false,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/false,
       /*use_projection_weights=*/false,
       /*use_projection_bias=*/false,
       /*cell_clip=*/0.0, /*proj_clip=*/0.0,
@@ -472,6 +494,73 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       LstmBlackBoxTestBatchMajor) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  UnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/false,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+      });
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  // Reshuffle input and output to batch major format.
+  std::vector<std::vector<float>> input;
+  std::vector<std::vector<float>> output;
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/1e-5,
+                /*time_major=*/false);
+}
+
 TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
   const int n_batch = 1;
   const int n_input = 2;
@@ -482,7 +571,7 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
   HybridUnidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length,
-      /*use_cifg=*/false, /*use_peephole=*/false,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/false,
       /*use_projection_weights=*/false,
       /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
       {
@@ -590,7 +679,7 @@ TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
 
   UnidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length,
-      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*time_major=*/true, /*use_cifg=*/true, /*use_peephole=*/true,
       /*use_projection_weights=*/false,
       /*use_projection_bias=*/false,
       /*cell_clip=*/0.0, /*proj_clip=*/0.0,
@@ -651,7 +740,7 @@ TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
   HybridUnidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length,
-      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*time_major=*/true, /*use_cifg=*/true, /*use_peephole=*/true,
       /*use_projection_weights=*/false,
       /*use_projection_bias=*/false,
       /*cell_clip=*/0.0, /*proj_clip=*/0.0,
@@ -1310,7 +1399,7 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) {
 
   UnidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length,
-      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/true,
       /*use_projection_weights=*/true,
       /*use_projection_bias=*/false,
       /*cell_clip=*/0.0, /*proj_clip=*/0.0,
@@ -1376,7 +1465,7 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTest) {
 
   HybridUnidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length,
-      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/true,
       /*use_projection_weights=*/true,
       /*use_projection_bias=*/false,
       /*cell_clip=*/0.0, /*proj_clip=*/0.0,
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
similarity index 88%
rename from tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
rename to tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
index 744ee7c109adcb71dbb1d0dc9604d08354add570..4c0fe00272a04ef3edc0787839f235f12aa546cb 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
@@ -19,12 +19,12 @@ limitations under the License.
 #include <iostream>
 #include <limits>
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/activation_functor.h"
-#include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
 
 namespace tflite {
 namespace ops {
@@ -73,10 +73,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const int max_time =
       (time_major) ? input->dims->data[0] : input->dims->data[1];
   const int num_units = input_weights->dims->data[0];
-  TF_LITE_ASSERT_EQ(input->dims->data[2], input_weights->dims->data[1]);
-  TF_LITE_ASSERT_EQ(input_weights->dims->data[0], bias->dims->data[0]);
-  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[0], bias->dims->data[0]);
-  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, input->dims->data[2],
+                    input_weights->dims->data[1]);
+  TF_LITE_ENSURE_EQ(context, input_weights->dims->data[0], bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, recurrent_weights->dims->data[0],
+                    bias->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, recurrent_weights->dims->data[1],
+                    bias->dims->data[0]);
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
   TF_LITE_ENSURE_EQ(context, input_weights->type, recurrent_weights->type);
   TF_LITE_ENSURE_EQ(context, NumDimensions(hidden_state), 2);
@@ -125,9 +128,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2);
     scaling_factors->type = kTfLiteFloat32;
     scaling_factors->allocation_type = kTfLiteArenaRw;
-    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-    scaling_factors_size->data[0] = batch_size;
-    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+    int scaling_dims[1] = {batch_size};
+    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
+      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+      scaling_factors_size->data[0] = batch_size;
       TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
                                                        scaling_factors_size));
     }
@@ -166,10 +170,10 @@ TfLiteStatus EvalFloat(const TfLiteTensor* input,
           input->data.f + s * input_size * batch_size;
       float* output_ptr_batch = output->data.f + s * num_units * batch_size;
 
-      kernel_utils::RnnBatchStep(input_ptr_batch, input_weights_ptr,
-                                 recurrent_weights_ptr, bias_ptr, input_size,
-                                 num_units, batch_size, params->activation,
-                                 hidden_state_ptr_batch, output_ptr_batch);
+      kernel_utils::RnnBatchStep(
+          input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, bias_ptr,
+          input_size, num_units, batch_size, num_units, params->activation,
+          hidden_state_ptr_batch, output_ptr_batch);
     }
   } else {
     // For each batch
@@ -185,8 +189,8 @@ TfLiteStatus EvalFloat(const TfLiteTensor* input,
 
         kernel_utils::RnnBatchStep(
             input_ptr_batch, input_weights_ptr, recurrent_weights_ptr, bias_ptr,
-            input_size, num_units, /*batch_size=*/1, params->activation,
-            hidden_state_ptr_batch, output_ptr_batch);
+            input_size, num_units, /*batch_size=*/1, num_units,
+            params->activation, hidden_state_ptr_batch, output_ptr_batch);
       }
     }
   }
@@ -237,8 +241,8 @@ TfLiteStatus EvalHybrid(
       kernel_utils::RnnBatchStep(
           input_ptr_batch, input_weights_ptr, input_weights_scale,
           recurrent_weights_ptr, recurrent_weights_scale, bias_ptr, input_size,
-          num_units, batch_size, params->activation, quantized_input_ptr,
-          quantized_hidden_state_ptr, scaling_factors_ptr,
+          num_units, batch_size, num_units, params->activation,
+          quantized_input_ptr, quantized_hidden_state_ptr, scaling_factors_ptr,
           hidden_state_ptr_batch, output_ptr_batch);
     }
   } else {
@@ -256,8 +260,8 @@ TfLiteStatus EvalHybrid(
         kernel_utils::RnnBatchStep(
             input_ptr_batch, input_weights_ptr, input_weights_scale,
             recurrent_weights_ptr, recurrent_weights_scale, bias_ptr,
-            input_size, num_units, /*batch_size=*/1, params->activation,
-            quantized_input_ptr, quantized_hidden_state_ptr,
+            input_size, num_units, /*batch_size=*/1, num_units,
+            params->activation, quantized_input_ptr, quantized_hidden_state_ptr,
             scaling_factors_ptr, hidden_state_ptr_batch, output_ptr_batch);
       }
     }
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
rename to tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
index 6b48e3fff7a9db3f54b6b3308354c0c263d63568..a2f82ac67b1b22b226e7046af7158ed6095dcc8e 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/unpack.cc b/tensorflow/lite/kernels/unpack.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/unpack.cc
rename to tensorflow/lite/kernels/unpack.cc
index a7d3a9bc7672be897a174edfc8f056ff6a9f37d4..1caffe14f90b8ce9d13d8c781e87bf918c02b9f4 100644
--- a/tensorflow/contrib/lite/kernels/unpack.cc
+++ b/tensorflow/lite/kernels/unpack.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/unpack_test.cc b/tensorflow/lite/kernels/unpack_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/kernels/unpack_test.cc
rename to tensorflow/lite/kernels/unpack_test.cc
index 4efc92a0fdd68082164c5788f99226f81717f91c..9b60cce549804a59e343f3e26f978679a1624c00 100644
--- a/tensorflow/contrib/lite/kernels/unpack_test.cc
+++ b/tensorflow/lite/kernels/unpack_test.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/kernels/zeros_like.cc b/tensorflow/lite/kernels/zeros_like.cc
similarity index 93%
rename from tensorflow/contrib/lite/kernels/zeros_like.cc
rename to tensorflow/lite/kernels/zeros_like.cc
index cce5240a9bdc9c1da8185db6b72627c73d898062..a187306fa251c4402b63ed657c9f27e34d3d2bc9 100644
--- a/tensorflow/contrib/lite/kernels/zeros_like.cc
+++ b/tensorflow/lite/kernels/zeros_like.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/kernels/zeros_like_test.cc b/tensorflow/lite/kernels/zeros_like_test.cc
similarity index 92%
rename from tensorflow/contrib/lite/kernels/zeros_like_test.cc
rename to tensorflow/lite/kernels/zeros_like_test.cc
index d3382d1d5b865ee811f235f8fd8abf78d5b66f38..0a1d9afe33f897e2a49cd1548bf34a69c96a7a91 100644
--- a/tensorflow/contrib/lite/kernels/zeros_like_test.cc
+++ b/tensorflow/lite/kernels/zeros_like_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/lib_package/BUILD b/tensorflow/lite/lib_package/BUILD
similarity index 100%
rename from tensorflow/contrib/lite/lib_package/BUILD
rename to tensorflow/lite/lib_package/BUILD
diff --git a/tensorflow/contrib/lite/lib_package/concat_licenses.sh b/tensorflow/lite/lib_package/concat_licenses.sh
similarity index 100%
rename from tensorflow/contrib/lite/lib_package/concat_licenses.sh
rename to tensorflow/lite/lib_package/concat_licenses.sh
diff --git a/tensorflow/lite/lib_package/create_ios_frameworks.sh b/tensorflow/lite/lib_package/create_ios_frameworks.sh
new file mode 100755
index 0000000000000000000000000000000000000000..abf40e7dec6c3f14ba38cb3491be5d2d0acc7caa
--- /dev/null
+++ b/tensorflow/lite/lib_package/create_ios_frameworks.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# TODO(ycling): Refactoring - Move this script into `tools/make`.
+set -e
+
+echo "Starting"
+TFLITE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.."
+
+usage() {
+  echo "Usage: $(basename "$0") [-a]"
+  echo "-g build with GPU delegate"
+  exit 1
+}
+
+USE_GPU_DELEGATE="false"
+FRAMEWORK_NAME="tensorflow_lite"
+while getopts "g" opt_name; do
+  case "$opt_name" in
+    g)
+        USE_GPU_DELEGATE="true"
+        FRAMEWORK_NAME="tensorflow_lite_gpu"
+        ;;
+    *) usage;;
+  esac
+done
+shift $((OPTIND - 1))
+readonly USE_GPU_DELEGATE
+readonly FRAMEWORK_NAME
+
+if [ $USE_GPU_DELEGATE == "true" ] ; then
+  for filename in metal_delegate.h libmetal_delegate.a ; do
+    if [[ ! -f "${TFLITE_DIR}/delegates/gpu/${filename}" ]] ; then
+      echo "File ${TFLITE_DIR}/delegates/gpu/${filename} doesn't exist."
+      echo "It's requried for building TFLite Framework with GPU. Aborting."
+      exit 1
+    fi
+  done
+fi
+
+TMP_DIR=$(mktemp -d)
+echo "Package dir: " $TMP_DIR
+FW_DIR=$TMP_DIR/tensorflow_lite_ios_frameworks
+FW_DIR_TFLITE=$FW_DIR/$FRAMEWORK_NAME.framework
+FW_DIR_TFLITE_HDRS=$FW_DIR_TFLITE/Headers
+
+echo "Creating target Headers directories"
+mkdir -p $FW_DIR_TFLITE_HDRS
+
+echo "Headers, populating: TensorFlow Lite"
+cd $TFLITE_DIR/../..
+
+find tensorflow/lite -name '*.h' \
+    -not -path 'tensorflow/lite/tools/*' \
+    -not -path 'tensorflow/lite/examples/*' \
+    -not -path 'tensorflow/lite/gen/*' \
+    -not -path 'tensorflow/lite/toco/*' \
+    -not -path 'tensorflow/lite/nnapi/*' \
+    -not -path 'tensorflow/lite/java/*' \
+    | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
+cd $FW_DIR_TFLITE_HDRS
+tar xf tmp.tar
+rm -f tmp.tar
+
+echo "Headers, populating: Flatbuffer"
+cd $TFLITE_DIR/tools/make/downloads/flatbuffers/include/
+find . -name '*.h' | tar -cf $FW_DIR_TFLITE_HDRS/tmp.tar -T -
+cd $FW_DIR_TFLITE_HDRS
+tar xf tmp.tar
+rm -f tmp.tar
+
+cd $TFLITE_DIR/../..
+echo "Generate master LICENSE file and copy to target"
+bazel build //tensorflow/tools/lib_package:clicenses_generate
+cp $TFLITE_DIR/../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \
+   $FW_DIR_TFLITE
+
+echo "Copying static libraries"
+# Note: There must be a static library with the same name
+# as the framework name.
+cp $TFLITE_DIR/tools/make/gen/lib/libtensorflow-lite.a \
+    $FW_DIR_TFLITE/$FRAMEWORK_NAME
+if [ $USE_GPU_DELEGATE == "true" ] ; then
+  cp "${TFLITE_DIR}/delegates/gpu/libmetal_delegate.a" \
+      $FW_DIR_TFLITE/libmetal_delegate.a
+fi
+
+# This is required, otherwise they interfere with the documentation of the
+# pod at cocoapods.org.
+echo "Remove all README files"
+cd $FW_DIR_TFLITE_HDRS
+find . -type f -name README\* -exec rm -f {} \;
+find . -type f -name readme\* -exec rm -f {} \;
+
+TARGET_GEN_LOCATION="$TFLITE_DIR/gen/ios_frameworks"
+echo "Moving results to target: " $TARGET_GEN_LOCATION
+cd $FW_DIR
+zip -q -r $FRAMEWORK_NAME.framework.zip $FRAMEWORK_NAME.framework -x .DS_Store
+rm -rf $TARGET_GEN_LOCATION
+mkdir -p $TARGET_GEN_LOCATION
+cp -r $FRAMEWORK_NAME.framework.zip $TARGET_GEN_LOCATION
+
+echo "Cleaning up"
+rm -rf $TMP_DIR
+
+echo "Finished"
diff --git a/tensorflow/contrib/lite/memory_planner.h b/tensorflow/lite/memory_planner.h
similarity index 88%
rename from tensorflow/contrib/lite/memory_planner.h
rename to tensorflow/lite/memory_planner.h
index 2d4707f849f5d1a7a8393d9fb7286f3c714e8a36..fa2a44a1c89d70da33d87d9f1b92209254b96c02 100644
--- a/tensorflow/contrib/lite/memory_planner.h
+++ b/tensorflow/lite/memory_planner.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
-#define TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
+#ifndef TENSORFLOW_LITE_MEMORY_PLANNER_H_
+#define TENSORFLOW_LITE_MEMORY_PLANNER_H_
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 
@@ -42,4 +42,4 @@ class MemoryPlanner {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_MEMORY_PLANNER_H_
+#endif  // TENSORFLOW_LITE_MEMORY_PLANNER_H_
diff --git a/tensorflow/contrib/lite/mmap_allocation.cc b/tensorflow/lite/mmap_allocation.cc
similarity index 94%
rename from tensorflow/contrib/lite/mmap_allocation.cc
rename to tensorflow/lite/mmap_allocation.cc
index 92934d1fd15777b51c66fe9c0e4f94a11fe26044..11e59956996f262a63288c4daa3a7835fe7b3244 100644
--- a/tensorflow/contrib/lite/mmap_allocation.cc
+++ b/tensorflow/lite/mmap_allocation.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <sys/types.h>
 #include <unistd.h>
 
-#include "tensorflow/contrib/lite/allocation.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/allocation.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/mmap_allocation_disabled.cc b/tensorflow/lite/mmap_allocation_disabled.cc
similarity index 96%
rename from tensorflow/contrib/lite/mmap_allocation_disabled.cc
rename to tensorflow/lite/mmap_allocation_disabled.cc
index f3d4cf1a257d43ebd56cc9b8831de0bb1994d40c..efb0991b5941f15a76e89db23eac2e4e690faa23 100644
--- a/tensorflow/contrib/lite/mmap_allocation_disabled.cc
+++ b/tensorflow/lite/mmap_allocation_disabled.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/lite/allocation.h"
 
 #include <cassert>
 
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/lite/model.cc
similarity index 93%
rename from tensorflow/contrib/lite/model.cc
rename to tensorflow/lite/model.cc
index d7b109ac1a68ddd83e5c0a38698ad1cdc9715ee4..5ac0532afeffc0801a207c385be9816fa459b416 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/lite/model.cc
@@ -19,15 +19,15 @@ limitations under the License.
 #include <sys/stat.h>
 #include <sys/types.h>
 
-#include "tensorflow/contrib/lite/allocation.h"
-#include "tensorflow/contrib/lite/c/builtin_op_data.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
-#include "tensorflow/contrib/lite/core/api/flatbuffer_conversions.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/allocation.h"
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/flatbuffer_conversions.h"
+#include "tensorflow/lite/model.h"
 #ifndef TFLITE_MCU
-#include "tensorflow/contrib/lite/nnapi_delegate.h"
+#include "tensorflow/lite/nnapi_delegate.h"
 #endif
-#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/lite/version.h"
 
 namespace tflite {
 
@@ -384,6 +384,32 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
   return status;
 }
 
+TfLiteStatus InterpreterBuilder::ApplyDelegates(Interpreter* interpreter) {
+  // TODO(b/117561550): Move flex delegate application to the OpResolver.
+  if (AcquireFlexDelegate == nullptr) {
+    return kTfLiteOk;
+  }
+
+  bool has_flex_op = false;
+  for (const auto* registration : flatbuffer_op_index_to_registration_) {
+    if ((registration->builtin_code == BuiltinOperator_CUSTOM) &&
+        IsFlexOp(registration->custom_name)) {
+      has_flex_op = true;
+      break;
+    }
+  }
+
+  if (!has_flex_op) {
+    return kTfLiteOk;
+  }
+
+  if (auto flex_delegate = AcquireFlexDelegate()) {
+    return interpreter->ModifyGraphWithDelegate(std::move(flex_delegate));
+  }
+
+  return kTfLiteOk;
+}
+
 TfLiteStatus InterpreterBuilder::operator()(
     std::unique_ptr<Interpreter>* interpreter) {
   return operator()(interpreter, /*num_threads=*/-1);
@@ -466,14 +492,8 @@ TfLiteStatus InterpreterBuilder::operator()(
   }
   (**interpreter).SetVariables(std::move(variables));
 
-  // TODO(b/116667551): Only create the flex delegate if the model has flex ops.
-  if (AcquireFlexDelegate != nullptr) {
-    if (auto flex_delegate = AcquireFlexDelegate()) {
-      (**interpreter)
-          .ModifyGraphWithDelegate(std::move(flex_delegate),
-                                   /*allow_dynamic_tensors=*/true);
-    }
-  }
+  if (ApplyDelegates(interpreter->get()) != kTfLiteOk)
+    return cleanup_and_error();
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/lite/model.h b/tensorflow/lite/model.h
new file mode 100644
index 0000000000000000000000000000000000000000..01e7c682056b2b14155394f978545470c7748c2d
--- /dev/null
+++ b/tensorflow/lite/model.h
@@ -0,0 +1,189 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Deserialization infrastructure for tflite. Provides functionality
+// to go from a serialized tflite model in flatbuffer format to an
+// interpreter.
+//
+// using namespace tflite;
+// StderrReporter error_reporter;
+// auto model = FlatBufferModel::BuildFromFile("interesting_model.tflite",
+//                                             &error_reporter);
+// MyOpResolver resolver;  // You need to subclass OpResolver to provide
+//                         // implementations.
+// InterpreterBuilder builder(*model, resolver);
+// std::unique_ptr<Interpreter> interpreter;
+// if(builder(&interpreter) == kTfLiteOk) {
+//   .. run model inference with interpreter
+// }
+//
+// OpResolver must be defined to provide your kernel implementations to the
+// interpreter. This is environment specific and may consist of just the builtin
+// ops, or some custom operators you defined to extend tflite.
+#ifndef TENSORFLOW_LITE_MODEL_H_
+#define TENSORFLOW_LITE_MODEL_H_
+
+#include <memory>
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+// Abstract interface that verifies whether a given model is legit.
+// It facilitates the use-case to verify and build a model without loading it
+// twice.
+class TfLiteVerifier {
+ public:
+  // Returns true if the model is legit.
+  virtual bool Verify(const char* data, int length,
+                      ErrorReporter* reporter) = 0;
+  virtual ~TfLiteVerifier() {}
+};
+
+// An RAII object that represents a read-only tflite model, copied from disk,
+// or mmapped. This uses flatbuffers as the serialization format.
+class FlatBufferModel {
+ public:
+  // Builds a model based on a file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromFile(
+      const char* filename,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Verifies whether the content of the file is legit, then builds a model
+  // based on the file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromFile(
+      const char* filename, TfLiteVerifier* verifier = nullptr,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Builds a model based on a pre-loaded flatbuffer. The caller retains
+  // ownership of the buffer and should keep it alive until the returned object
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
+      const char* buffer, size_t buffer_size,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Builds a model directly from a flatbuffer pointer. The caller retains
+  // ownership of the buffer and should keep it alive until the returned object
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromModel(
+      const tflite::Model* model_spec,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Releases memory or unmaps mmaped memory.
+  ~FlatBufferModel();
+
+  // Copying or assignment is disallowed to simplify ownership semantics.
+  FlatBufferModel(const FlatBufferModel&) = delete;
+  FlatBufferModel& operator=(const FlatBufferModel&) = delete;
+
+  bool initialized() const { return model_ != nullptr; }
+  const tflite::Model* operator->() const { return model_; }
+  const tflite::Model* GetModel() const { return model_; }
+  ErrorReporter* error_reporter() const { return error_reporter_; }
+  const Allocation* allocation() const { return allocation_; }
+
+  // Returns true if the model identifier is correct (otherwise false and
+  // reports an error).
+  bool CheckModelIdentifier() const;
+
+ private:
+  // Loads a model from a given allocation. FlatBufferModel will take over the
+  // ownership of `allocation`, and delete it in destructor. The ownership of
+  // `error_reporter`remains with the caller and must have lifetime at least
+  // as much as FlatBufferModel. This is to allow multiple models to use the
+  // same ErrorReporter instance.
+  FlatBufferModel(Allocation* allocation,
+                  ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Loads a model from Model flatbuffer. The `model` has to remain alive and
+  // unchanged until the end of this flatbuffermodel's lifetime.
+  FlatBufferModel(const Model* model, ErrorReporter* error_reporter);
+
+  // Flatbuffer traverser pointer. (Model* is a pointer that is within the
+  // allocated memory of the data allocated by allocation's internals.
+  const tflite::Model* model_ = nullptr;
+  // The error reporter to use for model errors and subsequent errors when
+  // the interpreter is created
+  ErrorReporter* error_reporter_;
+  // The allocator used for holding memory of the model.
+  Allocation* allocation_ = nullptr;
+};
+
+// Build an interpreter capable of interpreting `model`.
+//
+// model: a scoped model whose lifetime must be at least as long as
+//   the interpreter. In principle multiple interpreters can be made from
+//   a single model.
+// op_resolver: An instance that implements the Resolver interface which maps
+//   custom op names and builtin op codes to op registrations.
+// reportError: a functor that is called to report errors that handles
+//   printf var arg semantics. The lifetime of the reportError object must
+//   be greater than or equal to the Interpreter created by operator().
+//
+// Returns a kTfLiteOk when successful and sets interpreter to a valid
+// Interpreter. Note: the user must ensure the model lifetime is at least as
+// long as interpreter's lifetime.
+class InterpreterBuilder {
+ public:
+  InterpreterBuilder(const FlatBufferModel& model,
+                     const OpResolver& op_resolver);
+  // Builds an interpreter given only the raw flatbuffer Model object (instead
+  // of a FlatBufferModel). Mostly used for testing.
+  // If `error_reporter` is null, then DefaultErrorReporter() is used.
+  InterpreterBuilder(const ::tflite::Model* model,
+                     const OpResolver& op_resolver,
+                     ErrorReporter* error_reporter = DefaultErrorReporter());
+  ~InterpreterBuilder();
+  InterpreterBuilder(const InterpreterBuilder&) = delete;
+  InterpreterBuilder& operator=(const InterpreterBuilder&) = delete;
+  TfLiteStatus operator()(std::unique_ptr<Interpreter>* interpreter);
+  TfLiteStatus operator()(std::unique_ptr<Interpreter>* interpreter,
+                          int num_threads);
+
+ private:
+  TfLiteStatus BuildLocalIndexToRegistrationMapping();
+  TfLiteStatus ParseNodes(
+      const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators,
+      Interpreter* interpreter);
+  TfLiteStatus ParseTensors(
+      const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
+      const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
+      Interpreter* interpreter);
+  TfLiteStatus ApplyDelegates(Interpreter* interpreter);
+
+  const ::tflite::Model* model_;
+  const OpResolver& op_resolver_;
+  ErrorReporter* error_reporter_;
+
+  std::vector<const TfLiteRegistration*> flatbuffer_op_index_to_registration_;
+  std::vector<BuiltinOperator> flatbuffer_op_index_to_registration_types_;
+  const Allocation* allocation_ = nullptr;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_MODEL_H_
diff --git a/tensorflow/contrib/lite/model_flex_test.cc b/tensorflow/lite/model_flex_test.cc
similarity index 86%
rename from tensorflow/contrib/lite/model_flex_test.cc
rename to tensorflow/lite/model_flex_test.cc
index 52e76bee4941c412434853d49b69affbb1ef895a..88b3c886b21d160f0e99d8ce627866241f6d98e2 100644
--- a/tensorflow/contrib/lite/model_flex_test.cc
+++ b/tensorflow/lite/model_flex_test.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/model.h"
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 
@@ -24,7 +24,7 @@ namespace tflite {
 // appropriate delegate is linked into the client.
 TEST(FlexModel, WithFlexDelegate) {
   auto model = FlatBufferModel::BuildFromFile(
-      "tensorflow/contrib/lite/testdata/multi_add_flex.bin");
+      "tensorflow/lite/testdata/multi_add_flex.bin");
   ASSERT_TRUE(model);
 
   std::unique_ptr<Interpreter> interpreter;
diff --git a/tensorflow/lite/model_test.cc b/tensorflow/lite/model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e677ea94a71b979a01fd4b56e331d592cef76cd5
--- /dev/null
+++ b/tensorflow/lite/model_test.cc
@@ -0,0 +1,327 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "tensorflow/lite/model.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/testing/util.h"
+
+// Comparison for TfLiteRegistration. Since TfLiteRegistration is a C object,
+// we must declare this in global namespace, so argument-dependent operator
+// lookup works.
+inline bool operator==(const TfLiteRegistration& a,
+                       const TfLiteRegistration& b) {
+  return a.invoke == b.invoke && a.init == b.init && a.prepare == b.prepare &&
+         a.free == b.free;
+}
+
+namespace tflite {
+
+// Provide a dummy operation that does nothing.
+namespace {
+void* dummy_init(TfLiteContext*, const char*, size_t) { return nullptr; }
+void dummy_free(TfLiteContext*, void*) {}
+TfLiteStatus dummy_resize(TfLiteContext*, TfLiteNode*) { return kTfLiteOk; }
+TfLiteStatus dummy_invoke(TfLiteContext*, TfLiteNode*) { return kTfLiteOk; }
+TfLiteRegistration dummy_reg = {dummy_init, dummy_free, dummy_resize,
+                                dummy_invoke};
+}  // namespace
+
+// Provide a trivial resolver that returns a constant value no matter what
+// op is asked for.
+class TrivialResolver : public OpResolver {
+ public:
+  explicit TrivialResolver(TfLiteRegistration* constant_return = nullptr)
+      : constant_return_(constant_return) {}
+  // Find the op registration of a custom operator by op name.
+  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
+                                   int version) const override {
+    return constant_return_;
+  }
+  // Find the op registration of a custom operator by op name.
+  const TfLiteRegistration* FindOp(const char* op, int version) const override {
+    return constant_return_;
+  }
+
+ private:
+  TfLiteRegistration* constant_return_;
+};
+
+TEST(BasicFlatBufferModel, TestNonExistantFiles) {
+  ASSERT_TRUE(!FlatBufferModel::BuildFromFile("/tmp/tflite_model_1234"));
+}
+
+// Make sure a model with nothing in it loads properly.
+TEST(BasicFlatBufferModel, TestEmptyModelsAndNullDestination) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/lite/testdata/empty_model.bin");
+  ASSERT_TRUE(model);
+  // Now try to build it into a model.
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model, TrivialResolver())(&interpreter),
+            kTfLiteOk);
+  ASSERT_NE(interpreter, nullptr);
+  ASSERT_NE(InterpreterBuilder(*model, TrivialResolver())(nullptr), kTfLiteOk);
+}
+
+// Make sure currently unsupported # of subgraphs are checked
+// TODO(aselle): Replace this test when multiple subgraphs are supported.
+TEST(BasicFlatBufferModel, TestZeroAndMultipleSubgraphs) {
+  auto m1 = FlatBufferModel::BuildFromFile(
+      "tensorflow/lite/testdata/0_subgraphs.bin");
+  ASSERT_TRUE(m1);
+  std::unique_ptr<Interpreter> interpreter1;
+  ASSERT_NE(InterpreterBuilder(*m1, TrivialResolver())(&interpreter1),
+            kTfLiteOk);
+
+  auto m2 = FlatBufferModel::BuildFromFile(
+      "tensorflow/lite/testdata/2_subgraphs.bin");
+  ASSERT_TRUE(m2);
+  std::unique_ptr<Interpreter> interpreter2;
+  ASSERT_NE(InterpreterBuilder(*m2, TrivialResolver())(&interpreter2),
+            kTfLiteOk);
+}
+
+// Test what happens if we cannot bind any of the ops.
+TEST(BasicFlatBufferModel, TestModelWithoutNullRegistrations) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/lite/testdata/test_model.bin");
+  ASSERT_TRUE(model);
+  // Check that we get an error code and interpreter pointer is reset.
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+  ASSERT_NE(InterpreterBuilder(*model, TrivialResolver(nullptr))(&interpreter),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter, nullptr);
+}
+
+// Make sure model is read to interpreter propelrly
+TEST(BasicFlatBufferModel, TestModelInInterpreter) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/lite/testdata/test_model.bin");
+  ASSERT_TRUE(model);
+  // Check that we get an error code and interpreter pointer is reset.
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+  ASSERT_EQ(
+      InterpreterBuilder(*model, TrivialResolver(&dummy_reg))(&interpreter),
+      kTfLiteOk);
+  ASSERT_NE(interpreter, nullptr);
+  ASSERT_EQ(interpreter->tensors_size(), 4);
+  ASSERT_EQ(interpreter->nodes_size(), 2);
+  std::vector<int> inputs = {0, 1};
+  std::vector<int> outputs = {2, 3};
+  ASSERT_EQ(interpreter->inputs(), inputs);
+  ASSERT_EQ(interpreter->outputs(), outputs);
+
+  EXPECT_EQ(std::string(interpreter->GetInputName(0)), "input0");
+  EXPECT_EQ(std::string(interpreter->GetInputName(1)), "input1");
+  EXPECT_EQ(std::string(interpreter->GetOutputName(0)), "out1");
+  EXPECT_EQ(std::string(interpreter->GetOutputName(1)), "out2");
+
+  // Make sure all input tensors are correct
+  TfLiteTensor* i0 = interpreter->tensor(0);
+  ASSERT_EQ(i0->type, kTfLiteFloat32);
+  ASSERT_NE(i0->data.raw, nullptr);  // mmapped
+  ASSERT_EQ(i0->allocation_type, kTfLiteMmapRo);
+  TfLiteTensor* i1 = interpreter->tensor(1);
+  ASSERT_EQ(i1->type, kTfLiteFloat32);
+  ASSERT_EQ(i1->data.raw, nullptr);
+  ASSERT_EQ(i1->allocation_type, kTfLiteArenaRw);
+  TfLiteTensor* o0 = interpreter->tensor(2);
+  ASSERT_EQ(o0->type, kTfLiteFloat32);
+  ASSERT_EQ(o0->data.raw, nullptr);
+  ASSERT_EQ(o0->allocation_type, kTfLiteArenaRw);
+  TfLiteTensor* o1 = interpreter->tensor(3);
+  ASSERT_EQ(o1->type, kTfLiteFloat32);
+  ASSERT_EQ(o1->data.raw, nullptr);
+  ASSERT_EQ(o1->allocation_type, kTfLiteArenaRw);
+
+  // Check op 0 which has inputs {0, 1} outputs {2}.
+  {
+    const std::pair<TfLiteNode, TfLiteRegistration>* node_and_reg0 =
+        interpreter->node_and_registration(0);
+    ASSERT_NE(node_and_reg0, nullptr);
+    const TfLiteNode& node0 = node_and_reg0->first;
+    const TfLiteRegistration& reg0 = node_and_reg0->second;
+    TfLiteIntArray* desired_inputs = TfLiteIntArrayCreate(2);
+    desired_inputs->data[0] = 0;
+    desired_inputs->data[1] = 1;
+    TfLiteIntArray* desired_outputs = TfLiteIntArrayCreate(1);
+    desired_outputs->data[0] = 2;
+    ASSERT_TRUE(TfLiteIntArrayEqual(node0.inputs, desired_inputs));
+    ASSERT_TRUE(TfLiteIntArrayEqual(node0.outputs, desired_outputs));
+    TfLiteIntArrayFree(desired_inputs);
+    TfLiteIntArrayFree(desired_outputs);
+    ASSERT_EQ(reg0, dummy_reg);
+  }
+
+  // Check op 1 which has inputs {2} outputs {3}.
+  {
+    const std::pair<TfLiteNode, TfLiteRegistration>* node_and_reg1 =
+        interpreter->node_and_registration(1);
+    ASSERT_NE(node_and_reg1, nullptr);
+    const TfLiteNode& node1 = node_and_reg1->first;
+    const TfLiteRegistration& reg1 = node_and_reg1->second;
+    TfLiteIntArray* desired_inputs = TfLiteIntArrayCreate(1);
+    TfLiteIntArray* desired_outputs = TfLiteIntArrayCreate(1);
+    desired_inputs->data[0] = 2;
+    desired_outputs->data[0] = 3;
+    ASSERT_TRUE(TfLiteIntArrayEqual(node1.inputs, desired_inputs));
+    ASSERT_TRUE(TfLiteIntArrayEqual(node1.outputs, desired_outputs));
+    TfLiteIntArrayFree(desired_inputs);
+    TfLiteIntArrayFree(desired_outputs);
+    ASSERT_EQ(reg1, dummy_reg);
+  }
+}
+
+// Test that loading a model with TensorFlow ops fails when the flex delegate is
+// not linked into the target.
+TEST(FlexModel, FailureWithoutFlexDelegate) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/lite/testdata/multi_add_flex.bin");
+  ASSERT_TRUE(model);
+
+  // Note that creation will succeed when using the BuiltinOpResolver, but
+  // unless the appropriate delegate is linked into the target or the client
+  // explicitly installs the delegate, execution will fail.
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+
+  // As the flex ops weren't resolved implicitly by the flex delegate, runtime
+  // allocation and execution will fail.
+  ASSERT_EQ(interpreter->AllocateTensors(), kTfLiteError);
+}
+
+// This tests on a flatbuffer that defines a shape of 2 to be a memory mapped
+// buffer. But the buffer is provided to be only 1 element.
+TEST(BasicFlatBufferModel, TestBrokenMmap) {
+  ASSERT_FALSE(FlatBufferModel::BuildFromFile(
+      "tensorflow/lite/testdata/test_model_broken.bin"));
+}
+
+TEST(BasicFlatBufferModel, TestNullModel) {
+  // Check that we get an error code and interpreter pointer is reset.
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+  ASSERT_NE(
+      InterpreterBuilder(nullptr, TrivialResolver(&dummy_reg))(&interpreter),
+      kTfLiteOk);
+  ASSERT_EQ(interpreter.get(), nullptr);
+}
+
+// Mocks the verifier by setting the result in ctor.
+class FakeVerifier : public tflite::TfLiteVerifier {
+ public:
+  explicit FakeVerifier(bool result) : result_(result) {}
+  bool Verify(const char* data, int length,
+              tflite::ErrorReporter* reporter) override {
+    return result_;
+  }
+
+ private:
+  bool result_;
+};
+
+TEST(BasicFlatBufferModel, TestWithTrueVerifier) {
+  FakeVerifier verifier(true);
+  ASSERT_TRUE(FlatBufferModel::VerifyAndBuildFromFile(
+      "tensorflow/lite/testdata/test_model.bin",
+      &verifier));
+}
+
+TEST(BasicFlatBufferModel, TestWithFalseVerifier) {
+  FakeVerifier verifier(false);
+  ASSERT_FALSE(FlatBufferModel::VerifyAndBuildFromFile(
+      "tensorflow/lite/testdata/test_model.bin",
+      &verifier));
+}
+
+TEST(BasicFlatBufferModel, TestWithNullVerifier) {
+  ASSERT_TRUE(FlatBufferModel::VerifyAndBuildFromFile(
+      "tensorflow/lite/testdata/test_model.bin", nullptr));
+}
+
+// This makes sure the ErrorReporter is marshalled from FlatBufferModel to
+// the Interpreter.
+TEST(BasicFlatBufferModel, TestCustomErrorReporter) {
+  TestErrorReporter reporter;
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/lite/testdata/empty_model.bin",
+      &reporter);
+  ASSERT_TRUE(model);
+
+  std::unique_ptr<Interpreter> interpreter;
+  TrivialResolver resolver;
+  InterpreterBuilder(*model, resolver)(&interpreter);
+  ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
+  ASSERT_EQ(reporter.num_calls(), 1);
+}
+
+// This makes sure the ErrorReporter is marshalled from FlatBufferModel to
+// the Interpreter.
+TEST(BasicFlatBufferModel, TestNullErrorReporter) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/lite/testdata/empty_model.bin", nullptr);
+  ASSERT_TRUE(model);
+
+  std::unique_ptr<Interpreter> interpreter;
+  TrivialResolver resolver;
+  InterpreterBuilder(*model, resolver)(&interpreter);
+  ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
+}
+
+// Test that loading model directly from a Model flatbuffer works.
+TEST(BasicFlatBufferModel, TestBuildFromModel) {
+  TestErrorReporter reporter;
+  FileCopyAllocation model_allocation(
+      "tensorflow/lite/testdata/test_model.bin", &reporter);
+  ASSERT_TRUE(model_allocation.valid());
+  ::flatbuffers::Verifier verifier(
+      reinterpret_cast<const uint8_t*>(model_allocation.base()),
+      model_allocation.bytes());
+  ASSERT_TRUE(VerifyModelBuffer(verifier));
+  const Model* model_fb = ::tflite::GetModel(model_allocation.base());
+
+  auto model = FlatBufferModel::BuildFromModel(model_fb);
+  ASSERT_TRUE(model);
+
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(*model, TrivialResolver(&dummy_reg))(&interpreter),
+      kTfLiteOk);
+  ASSERT_NE(interpreter, nullptr);
+}
+
+// TODO(aselle): Add tests for serialization of builtin op data types.
+// These tests will occur with the evaluation tests of individual operators,
+// not here.
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/models/BUILD b/tensorflow/lite/models/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8730160e4005df8846cf14d9186fb424d67f5934
--- /dev/null
+++ b/tensorflow/lite/models/BUILD
@@ -0,0 +1,14 @@
+# Model tests
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+
+exports_files(glob([
+    "testdata/*",
+]))
diff --git a/tensorflow/lite/models/smartreply/BUILD b/tensorflow/lite/models/smartreply/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..078b8e6bc6a288542575293be66c19f7bb733fc4
--- /dev/null
+++ b/tensorflow/lite/models/smartreply/BUILD
@@ -0,0 +1,89 @@
+package(default_visibility = ["//visibility:public"])
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
+
+licenses(["notice"])  # Apache 2.0
+
+gen_selected_ops(
+    name = "smartreply_ops",
+    model = "@tflite_smartreply//:smartreply.tflite",
+)
+
+cc_library(
+    name = "custom_ops",
+    srcs = [
+        "ops/extract_feature.cc",
+        "ops/normalize.cc",
+        "ops/predict.cc",
+        ":smartreply_ops",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_absl//absl/strings",
+        "@com_googlesource_code_re2//:re2",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
+cc_library(
+    name = "predictor_lib",
+    srcs = ["predictor.cc"],
+    hdrs = ["predictor.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":custom_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_absl//absl/strings",
+        "@com_googlesource_code_re2//:re2",
+    ],
+)
+
+cc_test(
+    name = "extract_feature_op_test",
+    size = "small",
+    srcs = ["ops/extract_feature_test.cc"],
+    tags = ["no_oss"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
+cc_test(
+    name = "normalize_op_test",
+    size = "small",
+    srcs = ["ops/normalize_test.cc"],
+    tags = ["no_oss"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "predict_op_test",
+    size = "small",
+    srcs = ["ops/predict_test.cc"],
+    tags = ["no_oss"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml b/tensorflow/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml
similarity index 100%
rename from tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml
rename to tensorflow/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml
diff --git a/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD b/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..73326e994bcd1bcbbea13e438b7be3ff26d378e6
--- /dev/null
+++ b/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD
@@ -0,0 +1,69 @@
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/lite:build_def.bzl",
+    "tflite_copts",
+    "tflite_jni_binary",
+)
+
+filegroup(
+    name = "assets",
+    srcs = [
+        "@tflite_smartreply//:model_files",
+    ],
+)
+
+android_binary(
+    name = "SmartReplyDemo",
+    srcs = glob(["java/**/*.java"]),
+    aapt_version = "aapt",
+    assets = [":assets"],
+    assets_dir = "",
+    custom_package = "com.example.android.smartreply",
+    manifest = "AndroidManifest.xml",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    tags = ["manual"],
+    deps = [
+        ":smartreply_runtime",
+        "@androidsdk//com.android.support:support-v13-25.2.0",
+        "@androidsdk//com.android.support:support-v4-25.2.0",
+    ],
+)
+
+cc_library(
+    name = "smartreply_runtime",
+    srcs = ["libsmartreply_jni.so"],
+    visibility = ["//visibility:public"],
+)
+
+tflite_jni_binary(
+    name = "libsmartreply_jni.so",
+    deps = [
+        ":smartreply_jni_lib",
+    ],
+)
+
+cc_library(
+    name = "smartreply_jni_lib",
+    srcs = [
+        "smartreply_jni.cc",
+    ],
+    copts = tflite_copts(),
+    linkopts = [
+        "-lm",
+        "-ldl",
+    ],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/java/jni",
+        "//tensorflow/lite/models/smartreply:predictor_lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD b/tensorflow/lite/models/smartreply/demo/app/src/main/assets/BUILD
similarity index 100%
rename from tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD
rename to tensorflow/lite/models/smartreply/demo/app/src/main/assets/BUILD
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt b/tensorflow/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt
similarity index 100%
rename from tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt
rename to tensorflow/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java b/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java
similarity index 100%
rename from tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java
rename to tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java b/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java
similarity index 100%
rename from tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java
rename to tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java b/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
similarity index 89%
rename from tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
rename to tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
index d5b1ac0ffbc47283aa0c1bf68c0a85ad6228cdcc..fbd75051e714c011ba0cd747905b4ac8aec6ad75 100644
--- a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
+++ b/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
@@ -90,29 +90,26 @@ public class SmartReplyClient implements AutoCloseable {
   }
 
   private MappedByteBuffer loadModelFile() throws IOException {
-    AssetFileDescriptor fileDescriptor = context.getAssets().openFd(MODEL_PATH);
-    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
-    try {
+    try (AssetFileDescriptor fileDescriptor = context.getAssets().openFd(MODEL_PATH);
+        FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor())) {
       FileChannel fileChannel = inputStream.getChannel();
       long startOffset = fileDescriptor.getStartOffset();
       long declaredLength = fileDescriptor.getDeclaredLength();
       return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
-    } finally {
-      inputStream.close();
     }
   }
 
   private String[] loadBackoffList() throws IOException {
     List<String> labelList = new ArrayList<String>();
-    BufferedReader reader =
-        new BufferedReader(new InputStreamReader(context.getAssets().open(BACKOFF_PATH)));
-    String line;
-    while ((line = reader.readLine()) != null) {
-      if (!line.isEmpty()) {
-        labelList.add(line);
+    try (BufferedReader reader =
+        new BufferedReader(new InputStreamReader(context.getAssets().open(BACKOFF_PATH)))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        if (!line.isEmpty()) {
+          labelList.add(line);
+        }
       }
     }
-    reader.close();
     String[] ans = new String[labelList.size()];
     labelList.toArray(ans);
     return ans;
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml b/tensorflow/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml
similarity index 100%
rename from tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml
rename to tensorflow/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc b/tensorflow/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
similarity index 97%
rename from tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
rename to tensorflow/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
index f158cc511a9bee0710aee13cd04f77b6f95fb868..9b5df36c37a1d20b977762a9bcfd480d684997ac 100644
--- a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
+++ b/tensorflow/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/models/smartreply/predictor.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/models/smartreply/predictor.h"
 
 const char kIllegalStateException[] = "java/lang/IllegalStateException";
 
diff --git a/tensorflow/lite/models/smartreply/g3doc/README.md b/tensorflow/lite/models/smartreply/g3doc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1b8ff15196cd4d524ec9963d40c35761386c29fe
--- /dev/null
+++ b/tensorflow/lite/models/smartreply/g3doc/README.md
@@ -0,0 +1,146 @@
+# Smart Reply Model
+
+## What is On-Device Smart Reply Model?
+
+Smart Replies are contextually relevant, one-touch responses that help the user
+to reply to an incoming text message (or email) efficiently and effortlessly.
+Smart Replies have been highly successful across several Google products
+including
+[Gmail](https://www.blog.google/products/gmail/save-time-with-smart-reply-in-gmail/),
+[Inbox](https://www.blog.google/products/gmail/computer-respond-to-this-email/)
+and
+[Allo](https://blog.google/products/allo/google-allo-smarter-messaging-app/).
+
+The On-device Smart Reply model is targeted towards text chat use cases. It has
+a completely different architecture from its cloud-based counterparts, and is
+built specifically for memory constraints devices such as phones & watches. It
+has been successfully used to provide [Smart Replies on Android
+Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+to all first- & third-party apps.
+
+The on-device model comes with several benefits. It is:
+
+*   **Faster**: The model resides on the device and does not require internet
+    connectivity. Thus, the inference is very fast and has an average latency of
+    only a few milliseconds.
+*   **Resource efficient**: The model has a small memory footprint on
+    the device.
+*   **Privacy-friendly**: The user data never leaves the device and this
+    eliminates any privacy restrictions.
+
+A caveat, though, is that the on-device model has lower triggering rate than its
+cloud counterparts (triggering rate is the percentage of times the model
+suggests a response for an incoming message).
+
+## When to use this Model?
+
+The On-Device Smart Reply model is aimed towards improving the messaging
+experience for day-to-day conversational chat messages. We recommend using this
+model for similar use cases. Some sample messages on which the model does well
+are provided in this [tsv
+file](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/testdata/smartreply_samples.tsv)
+for reference. The file format is:
+
+```
+   {incoming_message  smart_reply1   [smart_reply2]   [smart_reply3]}
+```
+
+For the current model, we see a triggering rate of about 30-40% for messages
+which are similar to those provided in the tsv file above.
+
+In case the model does not trigger any response, the system falls back to
+suggesting replies from a fixed back-off set that was compiled from popular
+response intents observed in chat conversations. Some of the fallback responses
+are `Ok, Yes, No, 👍, ☺`.
+
+The model can only be used for inference at this time (i.e. it cannot be custom
+trained). If you are interested to know how the model was trained, please refer
+to this [blog
+post](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+and [research paper](https://arxiv.org/pdf/1708.00630).
+
+## How to use this Model?
+
+We have provided a pre-built demo APK that you can download, install and test on
+your phone ([demo APK
+here](http://download.tensorflow.org/deps/tflite/SmartReplyDemo.apk)).
+
+The On-Device Smart Reply demo App works in the following way:
+
+1.  Android app links to the JNI binary with a predictor library.
+
+2.  In the predictor library, `GetSegmentPredictions` is called with a list of input
+    strings.
+
+    2.1 The input string can be 1-3 most recent messages of the conversations in
+    form of string vector. The model will run on these input sentences and
+    provide Smart Replies corresponding to them.
+
+    2.2 The function performs some preprocessing on input data which includes:
+
+    *   Sentence splitting: The input message will be split into sentences if
+        message has more than one sentence. Eg: a message like “How are you?
+        Want to grab lunch?” will be broken down into 2 different sentences.
+    *   Normalization: The individual sentences will be normalized by converting
+        them into lower cases, removing unnecessary punctuations, etc. Eg: “how
+        are you????” will be converted to “how are you?” (refer for NORMALIZE op
+        for more details).
+
+        The input string content will be converted to tensors.
+
+    2.3 The function then runs the prediction model on the input tensors.
+
+    2.4 The function also performs some post-processing which includes
+    aggregating the model predictions for the input sentences from 2.2 and
+    returning the appropriate responses.
+
+3.  Finally, it gets response(s) from `std::vector<PredictorResponse>`, and
+    returns back to Android app. Responses are sorted in descending order of
+    confidence score.
+
+## Ops and Functionality Supported
+
+Following are the ops supported for using On-Device Smart Reply model:
+
+*   **NORMALIZE**
+
+    This is a custom op which normalizes the sentences by:
+
+    *   Converting all sentences into lower case.
+    *   Removing unnecessary punctuations (eg: “how are you????” → “how are
+        you?”).
+    *   Expanding sentences wherever necessary (eg: “ I’m home” → “I am home”).
+
+*   **SKIP_GRAM**
+
+    This is an op inside TensorFlow Lite that converts sentences into a list of
+    skip grams. The configurable parameters are `ngram_size` and
+    `max_skip_size`. For the model provided, the values for these parameters are
+    set to 3 & 2 respectively.
+
+*   **EXTRACT_FEATURES**
+
+    This is a custom op that hashes skip grams to features represented as
+    integers. Longer skip-grams are allocated higher weights.
+
+*   **LSH_PROJECTION**
+
+    This is an op inside TensorFlow Lite that projects input features to a
+    corresponding bit vector space using Locality Sensitive Hashing (LSH).
+
+*   **PREDICT**
+
+    This is a custom op that runs the input features through the projection
+    model (details [here](https://arxiv.org/pdf/1708.00630.pdf)), computes the
+    appropriate response labels along with weights for the projected features,
+    and aggregates the response labels and weights together.
+
+*   **HASHTABLE_LOOKUP**
+
+    This is an op inside TensorFlow Lite that uses label id from predict op and
+    looks up the response text from the given label id.
+
+## Further Information
+
+*   Open source code
+    [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/).
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc b/tensorflow/lite/models/smartreply/ops/extract_feature.cc
similarity index 95%
rename from tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
rename to tensorflow/lite/models/smartreply/ops/extract_feature.cc
index 29c8ad2286d705ea60fcd258e7283f6e1c3b70b8..f9d29229457c402e7eb989008a3b5d85bac709fa 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
+++ b/tensorflow/lite/models/smartreply/ops/extract_feature.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #include <algorithm>
 #include <map>
 
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/string_util.h"
 #include <farmhash.h>
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature_test.cc b/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/models/smartreply/ops/extract_feature_test.cc
rename to tensorflow/lite/models/smartreply/ops/extract_feature_test.cc
index 9b8676bab6e81109b01809e7e332448b05a9fbb5..efe59eeb4667cc55fb0a70d3005c1f9c2aaa73ce 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature_test.cc
+++ b/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc
@@ -16,10 +16,10 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
 #include <farmhash.h>
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc b/tensorflow/lite/models/smartreply/ops/normalize.cc
similarity index 94%
rename from tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
rename to tensorflow/lite/models/smartreply/ops/normalize.cc
index c55ac9f52f7293a8ba5baf17f2052e11a7422074..3cb11cc055b269a6230a593617a86055e9d34139 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
+++ b/tensorflow/lite/models/smartreply/ops/normalize.cc
@@ -28,9 +28,9 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/strip.h"
 #include "re2/re2.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 namespace ops {
@@ -92,7 +92,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   tflite::DynamicBuffer buf;
   buf.AddString(result.data(), result.length());
-  buf.WriteToTensor(GetOutput(context, node, 0));
+  buf.WriteToTensorAsVector(GetOutput(context, node, 0));
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/normalize_test.cc b/tensorflow/lite/models/smartreply/ops/normalize_test.cc
similarity index 90%
rename from tensorflow/contrib/lite/models/smartreply/ops/normalize_test.cc
rename to tensorflow/lite/models/smartreply/ops/normalize_test.cc
index 4d35dba9a64a849d0321c3aa89d89f5bb61b0764..8c5131565d5892be946a9a115bb7c6cad8733214 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/normalize_test.cc
+++ b/tensorflow/lite/models/smartreply/ops/normalize_test.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/predict.cc b/tensorflow/lite/models/smartreply/ops/predict.cc
similarity index 99%
rename from tensorflow/contrib/lite/models/smartreply/ops/predict.cc
rename to tensorflow/lite/models/smartreply/ops/predict.cc
index 7b23adb990cf10d4f0cd5b66cfa40eaa0cc46c41..bb2ed4a3153ceb2ef2e6b6d7f8c640f41616d4b0 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/predict.cc
+++ b/tensorflow/lite/models/smartreply/ops/predict.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/lite/context.h"
 
 namespace tflite {
 namespace ops {
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/predict_test.cc b/tensorflow/lite/models/smartreply/ops/predict_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/models/smartreply/ops/predict_test.cc
rename to tensorflow/lite/models/smartreply/ops/predict_test.cc
index e97c58cbd185023e59c21c93057fd0f094585bf9..ca64dcaad47108e346bd03f0b7b15edfbd6a50dc 100644
--- a/tensorflow/contrib/lite/models/smartreply/ops/predict_test.cc
+++ b/tensorflow/lite/models/smartreply/ops/predict_test.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.cc b/tensorflow/lite/models/smartreply/predictor.cc
similarity index 91%
rename from tensorflow/contrib/lite/models/smartreply/predictor.cc
rename to tensorflow/lite/models/smartreply/predictor.cc
index 5d6c47dce8d90192d35a3a51fe6d0beb11f3b23f..59bf4a3cf1ed964e58a3b3dc9c6fb62139fcd56e 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor.cc
+++ b/tensorflow/lite/models/smartreply/predictor.cc
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/models/smartreply/predictor.h"
+#include "tensorflow/lite/models/smartreply/predictor.h"
 
 #include "absl/strings/str_split.h"
 #include "re2/re2.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/op_resolver.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/string_util.h"
 
 void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
 
@@ -49,7 +49,7 @@ void ExecuteTfLite(const std::string& sentence,
     TfLiteTensor* input = interpreter->tensor(interpreter->inputs()[0]);
     tflite::DynamicBuffer buf;
     buf.AddString(sentence.data(), sentence.length());
-    buf.WriteToTensor(input);
+    buf.WriteToTensorAsVector(input);
     interpreter->AllocateTensors();
 
     interpreter->Invoke();
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.h b/tensorflow/lite/models/smartreply/predictor.h
similarity index 91%
rename from tensorflow/contrib/lite/models/smartreply/predictor.h
rename to tensorflow/lite/models/smartreply/predictor.h
index 3151192d9277b6df513a76afb08af30d0379b7b1..6b8f9298a36f6f72813519b5fc7e15ae6a041f08 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor.h
+++ b/tensorflow/lite/models/smartreply/predictor.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
-#define TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#ifndef TENSORFLOW_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#define TENSORFLOW_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
 
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 namespace custom {
@@ -77,4 +77,4 @@ struct SmartReplyConfig {
 }  // namespace custom
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#endif  // TENSORFLOW_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc b/tensorflow/lite/models/smartreply/predictor_test.cc
similarity index 94%
rename from tensorflow/contrib/lite/models/smartreply/predictor_test.cc
rename to tensorflow/lite/models/smartreply/predictor_test.cc
index c7e08814fdf502f1ecfea60af3385fc7aa6055fa..7eba26993e59172d8ae85a8961b6f3b171057a48 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
+++ b/tensorflow/lite/models/smartreply/predictor_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/models/smartreply/predictor.h"
+#include "tensorflow/lite/models/smartreply/predictor.h"
 
 #include <fstream>
 #include <unordered_set>
@@ -22,8 +22,8 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
-//#include "tensorflow/contrib/lite/models/test_utils.h"
-#include "tensorflow/contrib/lite/string_util.h"
+//#include "tensorflow/lite/models/test_utils.h"
+#include "tensorflow/lite/string_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tflite {
@@ -36,7 +36,7 @@ const char kSamples[] = "smartreply_samples.tsv";
 
 string TestDataPath() {
   return string(absl::StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
-                             "contrib/lite/models/testdata/"));
+                             "lite/models/testdata/"));
 }
 
 MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
diff --git a/tensorflow/contrib/lite/models/speech_test.cc b/tensorflow/lite/models/speech_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/models/speech_test.cc
rename to tensorflow/lite/models/speech_test.cc
index 8ecf0b6154a622fa355c060ba7f2d61e6c670de2..17b7e8f28e8fb0988ee2269d9d833626c2aec701 100644
--- a/tensorflow/contrib/lite/models/speech_test.cc
+++ b/tensorflow/lite/models/speech_test.cc
@@ -21,14 +21,14 @@ limitations under the License.
 
 #include "testing/base/public/googletest.h"
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/testing/parse_testdata.h"
-#include "tensorflow/contrib/lite/testing/split.h"
-#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+#include "tensorflow/lite/testing/parse_testdata.h"
+#include "tensorflow/lite/testing/split.h"
+#include "tensorflow/lite/testing/tflite_driver.h"
 
 namespace tflite {
 namespace {
 
-const char kDataPath[] = "third_party/tensorflow/contrib/lite/models/testdata/";
+const char kDataPath[] = "third_party/tensorflow/lite/models/testdata/";
 
 bool Init(const string& in_file_name, testing::TfLiteDriver* driver,
           std::ifstream* in_file) {
diff --git a/tensorflow/lite/models/testdata/g3doc/README.md b/tensorflow/lite/models/testdata/g3doc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2a4f1c143a21722945e8e396b81bd23e3312e87e
--- /dev/null
+++ b/tensorflow/lite/models/testdata/g3doc/README.md
@@ -0,0 +1,143 @@
+## Speech Model Tests
+
+Sample test data has been provided for speech related models in Tensorflow Lite
+to help users working with speech models to verify and test their models.
+
+For the hotword, speaker-id and automatic speech recognition sample models, the
+architecture assumes that the models receive their input from a speech
+pre-processing module. The speech pre-processing module receives the audio
+signal and produces features for the encoder neural network and uses some
+typical signal processing algorithms, like FFT and spectral subtraction, and
+ultimately produces a log-mel filterbank (the log of the triangular mel filters
+applied to the power spectra). The text-to-speech model assumes that the inputs
+are linguistic features describing characteristics of phonemes, syllables,
+words, phrases, and sentence. The outputs are acoustic features including
+mel-cepstral coefficients, log fundamental frequency, and band aperiodicity.
+The pre-processing modules for these models are not provided in the open source
+version of TensorFlow Lite.
+
+The following sections describe the architecture of the sample models at a high
+level:
+
+### Hotword Model
+
+The hotword model is the neural network model we use for keyphrase/hotword
+spotting (i.e. "okgoogle" detection). It is the entry point for voice
+interaction (e.g. Google search app on Android devices or Google Home, etc.).
+The speech hotword model block diagram is shown in Figure below. It has an input
+size of 40 (float), an output size of 7 (float), one Svdf layer, and four fully
+connected layers with the corresponding parameters as shown in figure below.
+
+![hotword_model](hotword.svg "Hotword model")
+
+### Speaker-id Model
+
+The speaker-id model is the neural network model we use for speaker
+verification. It runs after the hotword triggers. The speech speaker-id model
+block diagram is shown in Figure below. It has an input size of 80 (float), an
+output size of 64 (float), three Lstm layers, and one fully connected layers
+with the corresponding parameters as shown in figure below.
+
+![speakerid_model](speakerid.svg "Speaker-id model")
+
+### Text-to-speech (TTS) Model
+
+The text-to-speech model is the neural network model used to generate speech
+from text. The speech text-to-speech model’s block diagram is shown
+in Figure below. It has and input size of 334 (float), an output size of 196
+(float), two fully connected layers, three Lstm layers, and one recurrent layer
+with the corresponding parameters as shown in the figure.
+
+![tts_model](tts.svg "TTS model")
+
+### Automatic Speech Recognizer (ASR) Acoustic Model (AM)
+
+The acoustic model for automatic speech recognition is the neural network model
+for matching phonemes to the input audio features. It generates posterior
+probabilities of phonemes from speech frontend features (log-mel filterbanks).
+It has an input size of 320 (float), an output size of 42 (float), five LSTM
+layers and one fully connected layers with a Softmax activation function, with
+the corresponding parameters as shown in the figure.
+
+![asr_am_model](asr_am.svg "ASR AM model")
+
+### Automatic Speech Recognizer (ASR) Language Model (LM)
+
+The language model for automatic speech recognition is the neural network model
+for predicting the probability of a word given previous words in a sentence.
+It generates posterior probabilities of the next word based from a sequence of
+words. The words are encoded as indices in a fixed size dictionary.
+The model has two inputs both of size one (integer): the current word index and
+next word index, an output size of one (float): the log probability. It consists
+of three embedding layer, three LSTM layers, followed by a multiplication, a
+fully connected layers and an addition.
+The corresponding parameters as shown in the figure.
+
+![asr_lm_model](asr_lm.svg "ASR LM model")
+
+### Endpointer Model
+
+The endpointer model is the neural network model for predicting end of speech
+in an utterance. More precisely, it generates posterior probabilities of various
+events that allow detection of speech start and end events.
+It has an input size of 40 (float) which are speech frontend features
+(log-mel filterbanks), and an output size of four corresponding to:
+speech, intermediate non-speech, initial non-speech, and final non-speech.
+The model consists of a convolutional layer, followed by a fully-connected
+layer, two LSTM layers, and two additional fully-connected layers.
+The corresponding parameters as shown in the figure.
+![endpointer_model](endpointer.svg "Endpointer model")
+
+
+## Speech models test input/output generation
+
+As mentioned above the input to models are generated from a pre-processing
+module (output of a log-mel filterbank, or linguistic features), and the outputs
+are generated by running the equivalent TensorFlow model by feeding them the
+same input.
+
+## Link to the open source code
+
+### Models:
+
+[Speech hotword model (Svdf
+rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
+
+[Speech hotword model (Svdf
+rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
+
+[Speaker-id
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
+
+[TTS
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
+
+[ASR AM
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
+
+### Test benches
+
+[Speech hotword model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_hotword_model_test.cc)
+
+[Speaker-id model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_speakerid_model_test.cc)
+
+[TTS model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_tts_model_test.cc)
+
+[ASR AM model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_asr_am_model_test.cc)
+
+[ASR LM model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_asr_lm_model_test.cc)
+
+[Endpointer model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_endpointer_model_test.cc)
+
+## Android Support
+The models have been tested on Android phones, using the following tests:
+
+[Hotword] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/android/BUILD?rcl=172930882&l=25)
+
+[Speaker-id] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/android/BUILD?rcl=172930882&l=36)
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/asr_am.svg b/tensorflow/lite/models/testdata/g3doc/asr_am.svg
similarity index 100%
rename from tensorflow/contrib/lite/models/testdata/g3doc/asr_am.svg
rename to tensorflow/lite/models/testdata/g3doc/asr_am.svg
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/asr_lm.svg b/tensorflow/lite/models/testdata/g3doc/asr_lm.svg
similarity index 100%
rename from tensorflow/contrib/lite/models/testdata/g3doc/asr_lm.svg
rename to tensorflow/lite/models/testdata/g3doc/asr_lm.svg
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/endpointer.svg b/tensorflow/lite/models/testdata/g3doc/endpointer.svg
similarity index 100%
rename from tensorflow/contrib/lite/models/testdata/g3doc/endpointer.svg
rename to tensorflow/lite/models/testdata/g3doc/endpointer.svg
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/hotword.svg b/tensorflow/lite/models/testdata/g3doc/hotword.svg
similarity index 100%
rename from tensorflow/contrib/lite/models/testdata/g3doc/hotword.svg
rename to tensorflow/lite/models/testdata/g3doc/hotword.svg
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/speakerid.svg b/tensorflow/lite/models/testdata/g3doc/speakerid.svg
similarity index 100%
rename from tensorflow/contrib/lite/models/testdata/g3doc/speakerid.svg
rename to tensorflow/lite/models/testdata/g3doc/speakerid.svg
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/tts.svg b/tensorflow/lite/models/testdata/g3doc/tts.svg
similarity index 100%
rename from tensorflow/contrib/lite/models/testdata/g3doc/tts.svg
rename to tensorflow/lite/models/testdata/g3doc/tts.svg
diff --git a/tensorflow/contrib/lite/models/testdata/smartreply_samples.tsv b/tensorflow/lite/models/testdata/smartreply_samples.tsv
similarity index 100%
rename from tensorflow/contrib/lite/models/testdata/smartreply_samples.tsv
rename to tensorflow/lite/models/testdata/smartreply_samples.tsv
diff --git a/tensorflow/contrib/lite/models/testdata/speech_asr_lm_model.test_spec b/tensorflow/lite/models/testdata/speech_asr_lm_model.test_spec
similarity index 100%
rename from tensorflow/contrib/lite/models/testdata/speech_asr_lm_model.test_spec
rename to tensorflow/lite/models/testdata/speech_asr_lm_model.test_spec
diff --git a/tensorflow/contrib/lite/mutable_op_resolver.cc b/tensorflow/lite/mutable_op_resolver.cc
similarity index 97%
rename from tensorflow/contrib/lite/mutable_op_resolver.cc
rename to tensorflow/lite/mutable_op_resolver.cc
index a36404399bb3e060ec9712532ad1c3bf4d8955e4..36c512dcaacef95282afeef7ef11d0f136c20f90 100644
--- a/tensorflow/contrib/lite/mutable_op_resolver.cc
+++ b/tensorflow/lite/mutable_op_resolver.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/mutable_op_resolver.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/mutable_op_resolver.h b/tensorflow/lite/mutable_op_resolver.h
similarity index 91%
rename from tensorflow/contrib/lite/mutable_op_resolver.h
rename to tensorflow/lite/mutable_op_resolver.h
index efd6cfac2ac899c13156e5a9290eb625bb9a2bb8..b5700595499714d30c70b9226942b69609037b99 100644
--- a/tensorflow/contrib/lite/mutable_op_resolver.h
+++ b/tensorflow/lite/mutable_op_resolver.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_MUTABLE_OP_RESOLVER_H_
-#define TENSORFLOW_CONTRIB_LITE_MUTABLE_OP_RESOLVER_H_
+#ifndef TENSORFLOW_LITE_MUTABLE_OP_RESOLVER_H_
+#define TENSORFLOW_LITE_MUTABLE_OP_RESOLVER_H_
 
 #include <unordered_map>
-#include "tensorflow/contrib/lite/core/api/op_resolver.h"
-#include "tensorflow/contrib/lite/util.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/util.h"
 
 namespace tflite {
 
@@ -78,4 +78,4 @@ class MutableOpResolver : public OpResolver {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_MUTABLE_OP_RESOLVER_H_
+#endif  // TENSORFLOW_LITE_MUTABLE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/mutable_op_resolver_test.cc b/tensorflow/lite/mutable_op_resolver_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/mutable_op_resolver_test.cc
rename to tensorflow/lite/mutable_op_resolver_test.cc
index b70c7038396782d9a8bfda75d08cc8d4f535d100..64fc68a16ca62d55d3633ac2e8bdbf7836e67ba4 100644
--- a/tensorflow/contrib/lite/mutable_op_resolver_test.cc
+++ b/tensorflow/lite/mutable_op_resolver_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/mutable_op_resolver.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/contrib/lite/nnapi/BUILD b/tensorflow/lite/nnapi/BUILD
similarity index 100%
rename from tensorflow/contrib/lite/nnapi/BUILD
rename to tensorflow/lite/nnapi/BUILD
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/lite/nnapi/NeuralNetworksShim.h
similarity index 99%
rename from tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
rename to tensorflow/lite/nnapi/NeuralNetworksShim.h
index eccf4aefb6372b71c3b87dc0cdea24fec22ff625..c39502f4acc5dc6262746a61688cd075861e6135 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/lite/nnapi/NeuralNetworksShim.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_NNAPI_NEURALNETWORKSSHIM_H_
-#define TENSORFLOW_CONTRIB_LITE_NNAPI_NEURALNETWORKSSHIM_H_
+#ifndef TENSORFLOW_LITE_NNAPI_NEURALNETWORKSSHIM_H_
+#define TENSORFLOW_LITE_NNAPI_NEURALNETWORKSSHIM_H_
 
 #include <dlfcn.h>
 #include <stdint.h>
@@ -1009,4 +1009,4 @@ inline void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
 
 /**/
 
-#endif  // TENSORFLOW_CONTRIB_LITE_NNAPI_NEURALNETWORKSSHIM_H_
+#endif  // TENSORFLOW_LITE_NNAPI_NEURALNETWORKSSHIM_H_
diff --git a/tensorflow/contrib/lite/nnapi/README.md b/tensorflow/lite/nnapi/README.md
similarity index 100%
rename from tensorflow/contrib/lite/nnapi/README.md
rename to tensorflow/lite/nnapi/README.md
diff --git a/tensorflow/lite/nnapi_delegate.cc b/tensorflow/lite/nnapi_delegate.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26d75696a1c889d752f9715358701da6300f49df
--- /dev/null
+++ b/tensorflow/lite/nnapi_delegate.cc
@@ -0,0 +1,858 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/nnapi_delegate.h"
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+
+#ifdef __ANDROID__
+#include <android/log.h>
+#include <sys/system_properties.h>
+#endif
+
+namespace tflite {
+
+void logError(const char* format, ...) {
+  // stderr is convenient for native tests, but is not captured for apps
+  va_list args_for_stderr;
+  va_start(args_for_stderr, format);
+  vfprintf(stderr, format, args_for_stderr);
+  va_end(args_for_stderr);
+  fprintf(stderr, "\n");
+  fflush(stderr);
+#ifdef __ANDROID__
+  // produce logcat output for general consumption
+  va_list args_for_log;
+  va_start(args_for_log, format);
+  __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
+  va_end(args_for_log);
+#endif
+}
+
+#define FATAL(...)       \
+  logError(__VA_ARGS__); \
+  exit(1);
+
+// TODO(aselle): Change the error model to use status codes.
+#define CHECK_TFLITE_SUCCESS(x)                                           \
+  if (x != kTfLiteOk) {                                                   \
+    FATAL("Aborting since tflite returned failure nnapi_delegate.cc:%d.", \
+          __LINE__);                                                      \
+  }
+
+#define CHECK_NN(x)                                                     \
+  if (x != ANEURALNETWORKS_NO_ERROR) {                                  \
+    FATAL("Aborting since NNAPI returned failure nnapi_delegate.cc:%d", \
+          __LINE__);                                                    \
+  }
+
+#define RETURN_ERROR_IF_TFLITE_FAILED(x)                                       \
+  if (x != kTfLiteOk) {                                                        \
+    logError(                                                                  \
+        "Returning error since TFLite returned failure nnapi_delegate.cc:%d.", \
+        __LINE__);                                                             \
+    return kTfLiteError;                                                       \
+  }
+
+#define RETURN_ERROR_IF_NN_FAILED(x)                                          \
+  if (x != ANEURALNETWORKS_NO_ERROR) {                                        \
+    logError(                                                                 \
+        "Returning error since NNAPI returned failure nnapi_delegate.cc:%d.", \
+        __LINE__);                                                            \
+    return kTfLiteError;                                                      \
+  }
+
+// Tracking of NNAPI operand ids
+static const int64_t kOperandIdNotSet = -1;
+static const int64_t kOperandNotNeeded = -2;
+
+namespace {
+
+int32_t GetAndroidSdkVersion() {
+#ifdef __ANDROID__
+  const char* sdkProp = "ro.build.version.sdk";
+  char sdkVersion[PROP_VALUE_MAX];
+  int length = __system_property_get(sdkProp, sdkVersion);
+  if (length != 0) {
+    for (int i = 0; i < length; ++i) {
+      int digit = sdkVersion[i] - '0';
+      if (digit < 0 || digit > 9) {
+        // Non-numeric SDK version, assume it's higher then expected;
+        return 0xFFFF;
+      }
+    }
+    return atoi(sdkVersion);
+  }
+  FATAL("No %s prop", sdkProp);
+#endif  // __ANDROID__
+  return 0;
+}
+
+int32_t GetAndroidSdkVersionCached() {
+  static int32_t androidSdkVersion = GetAndroidSdkVersion();
+  return androidSdkVersion;
+}
+
+}  // namespace
+
+NNAPIAllocation::NNAPIAllocation(const char* filename,
+                                 ErrorReporter* error_reporter)
+    : MMAPAllocation(filename, error_reporter) {
+  if (mmapped_buffer_ != MAP_FAILED)
+    CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ,
+                                                mmap_fd_, 0, &handle_));
+}
+
+NNAPIAllocation::~NNAPIAllocation() {
+  if (handle_) {
+    ANeuralNetworksMemory_free(handle_);
+  }
+}
+
+NNAPIDelegate::~NNAPIDelegate() {
+  if (nn_compiled_model_) {
+    ANeuralNetworksCompilation_free(nn_compiled_model_);
+    nn_compiled_model_ = nullptr;
+  }
+  if (nn_model_) {
+    ANeuralNetworksModel_free(nn_model_);
+    nn_model_ = nullptr;
+    // TODO(aselle): Is this thread-safe and callable multiple times?
+  }
+  // ANeuralNetworksShutdown();
+}
+
+// Adds the tensors of the subgraph to the NN API model.
+TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
+                               ANeuralNetworksModel* nn_model,
+                               uint32_t* no_of_operands_added,
+                               std::vector<int64_t>* nnapi_ids) {
+  uint32_t next_id = 0;
+  for (size_t i = 0; i < subgraph->tensors_size(); i++) {
+    // Skip temporaries and RNN back-edges.
+    if ((*nnapi_ids)[i] == kOperandNotNeeded) continue;
+
+    (*nnapi_ids)[i] = int64_t(next_id);
+
+    int32_t nn_type = 0;
+    // NNAPI requires 32-bit float scale to be zero, tflite doesn't care
+    float scale = 0.0f;
+    int32_t zeroPoint = 0;
+    TfLiteTensor* tensor = subgraph->tensor(i);
+    switch (tensor->type) {
+      case kTfLiteNoType:
+        // Tensors added during initialization of Ops don't have a type yet and
+        // should not be registered with the NNAPI.
+        continue;
+      case kTfLiteFloat32:
+        nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
+        break;
+      case kTfLiteUInt8:
+        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
+        scale = tensor->params.scale;
+        zeroPoint = tensor->params.zero_point;
+        break;
+      case kTfLiteInt32:
+        nn_type = ANEURALNETWORKS_TENSOR_INT32;
+        scale = tensor->params.scale;
+        zeroPoint = tensor->params.zero_point;
+        break;
+      default:
+        logError("Unsupported tensor type %d", tensor->type);
+        return kTfLiteError;
+    }
+    if (tensor->dims->size == 0) {
+      logError("NNAPI doesn't support tensors with rank 0 (index %d name %s)",
+               i, tensor->name);
+      return kTfLiteError;
+    }
+    if (tensor->dims->size > 4) {
+      logError("NNAPI doesn't support tensors with rank > 4 (index %d name %s)",
+               i, tensor->name);
+      return kTfLiteError;
+    }
+    // TODO(aselle): Note, many of these are intermediate results. Do I need
+    // to ever specify these sizes. I am currently below doing setValue
+    // on all of them, but I shouldn't in the future.
+    // Answer(jeanluc): If all the operators can set the dimension correctly,
+    // you won't need to.
+    ANeuralNetworksOperandType operand_type{
+        nn_type, static_cast<uint32_t>(tensor->dims->size),
+        reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
+    RETURN_ERROR_IF_NN_FAILED(
+        ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+    // TODO(aselle): Based on Michael's suggestion, limiting this to read
+    // only memory
+    if (tensor->allocation_type == kTfLiteMmapRo) {
+      if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
+              static_cast<const Allocation*>(tensor->allocation))) {
+        RETURN_ERROR_IF_NN_FAILED(
+            ANeuralNetworksModel_setOperandValueFromMemory(
+                nn_model, next_id, alloc->memory(),
+                alloc->offset(tensor->data.raw), tensor->bytes));
+      } else {
+        RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_setOperandValue(
+            nn_model, next_id, tensor->data.raw, tensor->bytes));
+      }
+    } else if (tensor->bytes == 0) {
+      // These size 0 tensors are optional tensors reserved.
+      RETURN_ERROR_IF_NN_FAILED(
+          ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0));
+    }
+
+    ++next_id;
+  }
+  *no_of_operands_added = next_id;
+  return kTfLiteOk;
+}
+
+void MapAndAddTensorIds(const int* from_ids_buf, size_t from_ids_count,
+                        std::vector<uint32_t>* into,
+                        const std::vector<int64_t>& map) {
+  for (size_t i = 0; i < from_ids_count; i++) {
+    int from_id = from_ids_buf[i];
+    if (from_id == kOptionalTensor) {
+      into->push_back(from_id);
+    } else {
+      into->push_back(map[from_id]);
+    }
+  }
+}
+
+// Adds the operations and their parameters to the NN API model.
+// 'next-id' is the operand ID of the next operand of the model.
+TfLiteStatus AddOpsAndParams(
+    tflite::Subgraph* subgraph, ANeuralNetworksModel* nn_model,
+    uint32_t next_id, std::vector<int>* model_state_inputs,
+    std::vector<int>* model_state_outputs,
+    const std::vector<int64_t>& tensor_id_to_nnapi_id) {
+  for (size_t i = 0; i < subgraph->nodes_size(); i++) {
+    const auto* node_and_registration = subgraph->node_and_registration(i);
+    const TfLiteNode& node = node_and_registration->first;
+    const TfLiteRegistration& registration = node_and_registration->second;
+    tflite::BuiltinOperator builtin =
+        static_cast<tflite::BuiltinOperator>(registration.builtin_code);
+
+    // Add the parameters.
+    std::vector<uint32_t> augmented_inputs, augmented_outputs;
+    MapAndAddTensorIds(node.inputs->data, node.inputs->size, &augmented_inputs,
+                       tensor_id_to_nnapi_id);
+    MapAndAddTensorIds(node.outputs->data, node.outputs->size,
+                       &augmented_outputs, tensor_id_to_nnapi_id);
+
+    auto add_scalar_int32 = [&nn_model, &augmented_inputs,
+                             &next_id](int value) {
+      ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
+      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
+                                                    sizeof(int32_t)))
+      augmented_inputs.push_back(next_id++);
+    };
+
+    auto add_scalar_float32 = [&nn_model, &augmented_inputs,
+                               &next_id](float value) {
+      ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
+      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
+                                                    sizeof(float)))
+      augmented_inputs.push_back(next_id++);
+    };
+
+    auto add_vector_int32 = [&](const int* values, uint32_t num_values) {
+      ANeuralNetworksOperandType operand_type{
+          .type = ANEURALNETWORKS_TENSOR_INT32,
+          .dimensionCount = 1,
+          .dimensions = &num_values};
+      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, values, sizeof(int32_t) * num_values));
+      augmented_inputs.push_back(next_id++);
+    };
+
+    // Handle state tensors of RNN, LSTM, SVDF.
+    // For each state_out tensor, a corresponding state_in operand needs to be
+    // created for NNAPI.
+    auto duplicate_state_tensor_float32 =
+        [subgraph, &nn_model, &next_id, &augmented_inputs, &model_state_inputs,
+         &model_state_outputs](int tensor_id) {
+          const TfLiteTensor* tensor = subgraph->tensor(tensor_id);
+          ANeuralNetworksOperandType operand_type{
+              ANEURALNETWORKS_TENSOR_FLOAT32,
+              static_cast<uint32_t>(tensor->dims->size),
+              reinterpret_cast<uint32_t*>(tensor->dims->data),
+              tensor->params.scale, tensor->params.zero_point};
+          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+          augmented_inputs.push_back(next_id);
+          model_state_inputs->push_back(next_id);
+          model_state_outputs->push_back(tensor_id);
+          next_id++;
+        };
+    auto check_and_add_activation = [&add_scalar_int32](int activation) {
+      if (activation > kTfLiteActRelu6) {
+        logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
+        return kTfLiteError;
+      }
+      add_scalar_int32(activation);
+      return kTfLiteOk;
+    };
+
+    auto add_add_params = [&add_scalar_int32](void* data) {
+      auto* builtin = reinterpret_cast<TfLiteAddParams*>(data);
+      if (builtin->activation > kTfLiteActRelu6) {
+        logError("NNAPI only supports RELU, RELU1 and RELU6 activations");
+        return kTfLiteError;
+      }
+      add_scalar_int32(builtin->activation);
+      return kTfLiteOk;
+    };
+
+    auto add_pooling_params = [&add_scalar_int32,
+                               &check_and_add_activation](void* data) {
+      auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
+      add_scalar_int32(builtin->padding);
+      add_scalar_int32(builtin->stride_width);
+      add_scalar_int32(builtin->stride_height);
+      add_scalar_int32(builtin->filter_width);
+      add_scalar_int32(builtin->filter_height);
+      return check_and_add_activation(builtin->activation);
+    };
+
+    auto add_convolution_params = [&add_scalar_int32,
+                                   &check_and_add_activation](void* data) {
+      auto builtin = reinterpret_cast<TfLiteConvParams*>(data);
+      add_scalar_int32(builtin->padding);
+      add_scalar_int32(builtin->stride_width);
+      add_scalar_int32(builtin->stride_height);
+      return check_and_add_activation(builtin->activation);
+    };
+
+    auto add_depthwise_conv_params = [&add_scalar_int32,
+                                      &check_and_add_activation](void* data) {
+      auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data);
+      add_scalar_int32(builtin->padding);
+      add_scalar_int32(builtin->stride_width);
+      add_scalar_int32(builtin->stride_height);
+      add_scalar_int32(builtin->depth_multiplier);
+      return check_and_add_activation(builtin->activation);
+    };
+
+    auto add_fully_connected_params = [&check_and_add_activation](void* data) {
+      auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data);
+      return check_and_add_activation(builtin->activation);
+    };
+
+    auto add_concatenation_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data);
+      add_scalar_int32(builtin->axis);
+      if (builtin->activation != kTfLiteActNone) {
+        logError("Concatenation does not support fused activation in NNAPI");
+        return kTfLiteError;
+      }
+      return kTfLiteOk;
+    };
+
+    auto add_softmax_params = [&add_scalar_float32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data);
+      add_scalar_float32(builtin->beta);
+    };
+
+    auto add_space_to_depth_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data);
+      add_scalar_int32(builtin->block_size);
+    };
+
+    auto add_lstm_params = [&add_scalar_int32,
+                            &add_scalar_float32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data);
+      add_scalar_int32(builtin->activation);
+      add_scalar_float32(builtin->cell_clip);
+      add_scalar_float32(builtin->proj_clip);
+    };
+
+    // LSTM in NNAPI requires scratch tensor as an output operand.
+    auto add_lstm_scratch_tensor_float32 = [subgraph, &node, &nn_model,
+                                            &next_id, &augmented_outputs]() {
+      if (node.temporaries->size == 0) return;
+      int scratch_buffer_index = node.temporaries->data[0];
+      const TfLiteTensor* tensor = subgraph->tensor(scratch_buffer_index);
+      ANeuralNetworksOperandType operand_type{
+          ANEURALNETWORKS_TENSOR_FLOAT32,
+          static_cast<uint32_t>(tensor->dims->size),
+          reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
+          tensor->params.zero_point};
+      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+      augmented_outputs.insert(augmented_outputs.begin(), next_id++);
+    };
+
+    auto add_mean_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteReducerParams*>(data);
+      add_scalar_int32(builtin->keep_dims);
+    };
+
+    auto add_svdf_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteSVDFParams*>(data);
+      add_scalar_int32(builtin->rank);
+      add_scalar_int32(builtin->activation);
+    };
+
+    auto add_rnn_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteRNNParams*>(data);
+      add_scalar_int32(builtin->activation);
+    };
+
+    auto add_squeeze_params = [&](void* data) {
+      const auto* builtin = reinterpret_cast<TfLiteSqueezeParams*>(data);
+      // Note that we add the squeeze dimensions even if the dimensions were
+      // unspecified (empty), as NNAPI requires the operand.
+      add_vector_int32(builtin->squeeze_dims,
+                       static_cast<uint32_t>(builtin->num_squeeze_dims));
+    };
+
+    // Handle optional input tensors.
+    auto add_optional_tensors = [&nn_model, &augmented_inputs,
+                                 &next_id](int nn_type) {
+      for (size_t idx = 0; idx < augmented_inputs.size(); idx++) {
+        if (augmented_inputs[idx] == kOptionalTensor) {
+          const std::vector<uint32_t> dim = {0, 0};
+          ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0};
+          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+          CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id,
+                                                        nullptr, 0))
+          augmented_inputs[idx] = next_id++;
+        }
+      }
+    };
+
+    int nnapi_version = 10;
+    ANeuralNetworksOperationType nn_op_type;
+
+    switch (builtin) {
+      case tflite::BuiltinOperator_ADD:
+        nn_op_type = ANEURALNETWORKS_ADD;
+        RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
+        break;
+      case tflite::BuiltinOperator_MUL:
+        nn_op_type = ANEURALNETWORKS_MUL;
+        RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data));
+        break;
+      case tflite::BuiltinOperator_AVERAGE_POOL_2D:
+        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
+        nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D;
+        break;
+      case tflite::BuiltinOperator_MAX_POOL_2D:
+        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
+        nn_op_type = ANEURALNETWORKS_MAX_POOL_2D;
+        break;
+      case tflite::BuiltinOperator_L2_POOL_2D:
+        RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data));
+        nn_op_type = ANEURALNETWORKS_L2_POOL_2D;
+        break;
+      case tflite::BuiltinOperator_CONV_2D: {
+        auto builtin = reinterpret_cast<TfLiteConvParams*>(node.builtin_data);
+        if (builtin->dilation_width_factor != 1 ||
+            builtin->dilation_height_factor != 1 || node.inputs->size != 3) {
+          logError("NNAPI does not support dilated Conv2D.");
+          return kTfLiteError;
+        }
+      }
+        RETURN_ERROR_IF_TFLITE_FAILED(
+            add_convolution_params(node.builtin_data));
+        nn_op_type = ANEURALNETWORKS_CONV_2D;
+        break;
+      case tflite::BuiltinOperator_RELU:
+        nn_op_type = ANEURALNETWORKS_RELU;
+        break;
+      case tflite::BuiltinOperator_RELU6:
+        nn_op_type = ANEURALNETWORKS_RELU6;
+        break;
+      case tflite::BuiltinOperator_TANH:
+        nn_op_type = ANEURALNETWORKS_TANH;
+        break;
+      case tflite::BuiltinOperator_FLOOR:
+        nn_op_type = ANEURALNETWORKS_FLOOR;
+        break;
+      case tflite::BuiltinOperator_LOGISTIC:
+        nn_op_type = ANEURALNETWORKS_LOGISTIC;
+        break;
+      case tflite::BuiltinOperator_DEPTHWISE_CONV_2D:
+        RETURN_ERROR_IF_TFLITE_FAILED(
+            add_depthwise_conv_params(node.builtin_data));
+        nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
+        break;
+      case tflite::BuiltinOperator_CONCATENATION:
+        RETURN_ERROR_IF_TFLITE_FAILED(
+            add_concatenation_params(node.builtin_data));
+        nn_op_type = ANEURALNETWORKS_CONCATENATION;
+        break;
+      case tflite::BuiltinOperator_SOFTMAX:
+        add_softmax_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_SOFTMAX;
+        break;
+      case tflite::BuiltinOperator_FULLY_CONNECTED:
+        RETURN_ERROR_IF_TFLITE_FAILED(
+            add_fully_connected_params(node.builtin_data));
+        nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED;
+        break;
+      case tflite::BuiltinOperator_RESHAPE:
+        if (node.inputs->size != 2) {
+          logError("NNAPI only supports 2-input RESHAPE");
+          return kTfLiteError;
+        }
+        nn_op_type = ANEURALNETWORKS_RESHAPE;
+        // add_reshape_params(node.builtin_data);
+        break;
+      case tflite::BuiltinOperator_SPACE_TO_DEPTH:
+        add_space_to_depth_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
+        break;
+      case tflite::BuiltinOperator_LSTM: {
+        if (node.inputs->size + /* no of params */ 3 != 21) {
+          logError("NNAPI only supports 21-input LSTMs");
+          return kTfLiteError;
+        }
+        duplicate_state_tensor_float32(
+            node.outputs->data[/*kOutputStateTensor*/ 0]);
+        duplicate_state_tensor_float32(
+            node.outputs->data[/*kCellStateTensor*/ 1]);
+        add_lstm_params(node.builtin_data);
+        add_lstm_scratch_tensor_float32();
+        add_optional_tensors(ANEURALNETWORKS_TENSOR_FLOAT32);
+        nn_op_type = ANEURALNETWORKS_LSTM;
+        break;
+      }
+      case tflite::BuiltinOperator_SVDF: {
+        duplicate_state_tensor_float32(node.outputs->data[/*kStateTensor*/ 0]);
+        add_svdf_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_SVDF;
+        break;
+      }
+      case tflite::BuiltinOperator_RNN: {
+        duplicate_state_tensor_float32(
+            node.outputs->data[/*kHiddenStateTensor*/ 0]);
+        add_rnn_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_RNN;
+        break;
+      }
+      case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
+        nn_op_type = ANEURALNETWORKS_EMBEDDING_LOOKUP;
+        break;
+      case tflite::BuiltinOperator_PAD:
+        nnapi_version = 11;  // require NNAPI 1.1
+        nn_op_type = ANEURALNETWORKS_PAD;
+        break;
+      case tflite::BuiltinOperator_MEAN:
+        nnapi_version = 11;  // require NNAPI 1.1
+        add_mean_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_MEAN;
+        break;
+      case tflite::BuiltinOperator_DIV:
+        nnapi_version = 11;  // require NNAPI 1.1
+        nn_op_type = ANEURALNETWORKS_DIV;
+        RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
+            reinterpret_cast<TfLiteDivParams*>(node.builtin_data)->activation));
+        break;
+      case tflite::BuiltinOperator_SUB:
+        nnapi_version = 11;  // require NNAPI 1.1
+        nn_op_type = ANEURALNETWORKS_SUB;
+        RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation(
+            reinterpret_cast<TfLiteSubParams*>(node.builtin_data)->activation));
+        break;
+      case tflite::BuiltinOperator_SQUEEZE:
+        nnapi_version = 11;  // requires NNAPI 1.1
+        add_squeeze_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_SQUEEZE;
+        break;
+      case tflite::BuiltinOperator_TRANSPOSE:
+        // The permutation input tensor value dictates the output dimensions.
+        // TODO(b/110888333): Support dynamically-sized tensors in delegates.
+        if ((node.inputs->size > 1) &&
+            (subgraph->tensor(node.inputs->data[1])->allocation_type !=
+             kTfLiteMmapRo)) {
+          logError("NNAPI does not yet support dynamic tensors.");
+          return kTfLiteError;
+        }
+        nnapi_version = 11;  // require NNAPI 1.1
+        nn_op_type = ANEURALNETWORKS_TRANSPOSE;
+        break;
+      case tflite::BuiltinOperator_L2_NORMALIZATION:
+        nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION;
+        if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data)
+                ->activation != kTfLiteActNone) {
+          logError(
+              "NNAPI does not support L2Normalization with fused activations");
+          return kTfLiteError;
+        }
+        if ((node.inputs->size > 0) &&
+            (subgraph->tensor(node.inputs->data[0])->dims->size != 4)) {
+          logError("NNAPI only supports input rank 4 for L2Normalization");
+          return kTfLiteError;
+        }
+        break;
+      case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
+        if (subgraph->tensor(node.outputs->data[0])->type != kTfLiteFloat32) {
+          logError("NNAPI only support HASHTABLE_LOOKUP with float32 output",
+                   builtin);
+          return kTfLiteError;
+        }
+        nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP;
+        break;
+      case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
+      case tflite::BuiltinOperator_LSH_PROJECTION:
+      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
+      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN:
+      case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
+      case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
+      case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
+      case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
+      case tflite::BuiltinOperator_PADV2:
+      case tflite::BuiltinOperator_RESIZE_BILINEAR:
+      case tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR:
+      case tflite::BuiltinOperator_CALL:
+      case tflite::BuiltinOperator_SKIP_GRAM:
+      case tflite::BuiltinOperator_RELU_N1_TO_1:
+      case tflite::BuiltinOperator_GATHER:
+      case tflite::BuiltinOperator_SPACE_TO_BATCH_ND:
+      case tflite::BuiltinOperator_BATCH_TO_SPACE_ND:
+      case tflite::BuiltinOperator_TOPK_V2:
+      case tflite::BuiltinOperator_SPLIT:
+      case tflite::BuiltinOperator_STRIDED_SLICE:
+      case tflite::BuiltinOperator_EXP:
+      case tflite::BuiltinOperator_LOG_SOFTMAX:
+      case tflite::BuiltinOperator_DEQUANTIZE:
+      case tflite::BuiltinOperator_DELEGATE:
+      case tflite::BuiltinOperator_CAST:
+      case tflite::BuiltinOperator_PRELU:
+      case tflite::BuiltinOperator_MAXIMUM:
+      case tflite::BuiltinOperator_MINIMUM:
+      case tflite::BuiltinOperator_ARG_MAX:
+      case tflite::BuiltinOperator_ARG_MIN:
+      case tflite::BuiltinOperator_GREATER:
+      case tflite::BuiltinOperator_GREATER_EQUAL:
+      case tflite::BuiltinOperator_LESS:
+      case tflite::BuiltinOperator_LESS_EQUAL:
+      case tflite::BuiltinOperator_NEG:
+      case tflite::BuiltinOperator_SELECT:
+      case tflite::BuiltinOperator_SLICE:
+      case tflite::BuiltinOperator_SIN:
+      case tflite::BuiltinOperator_LOG:
+      case tflite::BuiltinOperator_TRANSPOSE_CONV:
+      case tflite::BuiltinOperator_TILE:
+      case tflite::BuiltinOperator_EXPAND_DIMS:
+      case tflite::BuiltinOperator_SPARSE_TO_DENSE:
+      case tflite::BuiltinOperator_EQUAL:
+      case tflite::BuiltinOperator_NOT_EQUAL:
+      case tflite::BuiltinOperator_SUM:
+      case tflite::BuiltinOperator_REDUCE_MAX:
+      case tflite::BuiltinOperator_REDUCE_MIN:
+      case tflite::BuiltinOperator_REDUCE_PROD:
+      case tflite::BuiltinOperator_SQRT:
+      case tflite::BuiltinOperator_RSQRT:
+      case tflite::BuiltinOperator_SHAPE:
+      case tflite::BuiltinOperator_POW:
+      case tflite::BuiltinOperator_FAKE_QUANT:
+      case tflite::BuiltinOperator_PACK:
+      case tflite::BuiltinOperator_LOGICAL_OR:
+      case tflite::BuiltinOperator_ONE_HOT:
+      case tflite::BuiltinOperator_LOGICAL_AND:
+      case tflite::BuiltinOperator_LOGICAL_NOT:
+      case tflite::BuiltinOperator_UNPACK:
+      case tflite::BuiltinOperator_FLOOR_DIV:
+      case tflite::BuiltinOperator_REDUCE_ANY:
+      case tflite::BuiltinOperator_SQUARE:
+      case tflite::BuiltinOperator_ZEROS_LIKE:
+      case tflite::BuiltinOperator_FILL:
+      case tflite::BuiltinOperator_FLOOR_MOD:
+      case tflite::BuiltinOperator_RANGE:
+      case tflite::BuiltinOperator_LEAKY_RELU:
+      case tflite::BuiltinOperator_SQUARED_DIFFERENCE:
+      case tflite::BuiltinOperator_MIRROR_PAD:
+      case tflite::BuiltinOperator_ABS:
+      case tflite::BuiltinOperator_SPLIT_V:
+        logError("Op code %d is currently not delegated to NNAPI", builtin);
+        return kTfLiteError;
+        break;
+      case tflite::BuiltinOperator_CUSTOM:
+        logError("Custom operations are not supported when using NNAPI.");
+        return kTfLiteError;
+        break;
+    }
+
+    if (nnapi_version == 11 && GetAndroidSdkVersionCached() < 28) {
+      logError("Op %d needs NNAPI1.1", builtin);
+      return kTfLiteError;
+    }
+
+    // Add the operation.
+    RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_addOperation(
+        nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
+        augmented_inputs.data(),
+        static_cast<uint32_t>(augmented_outputs.size()),
+        reinterpret_cast<uint32_t*>(augmented_outputs.data())));
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
+  if (nn_model_ && nn_compiled_model_) return model_status_;
+
+  // TODO(aselle): This is not correct. need to handle resize invalidation.
+  if (!nn_model_) {
+    CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
+
+    // Find which tensors should be added to NNAPI. TFLite has temporaries
+    // and RNN back-edges which are are not valid for NNAPI. We look through all
+    // inputs and outputs and mark the mapping in tensor_id_to_nnapi_id with
+    // kOperandIdNotSet. addTensorOperands will replace those with the
+    // corresponding NNAPI operand ids and skip kOperandNotNeeded entries.
+    std::vector<int64_t> tensor_id_to_nnapi_id(subgraph->tensors_size(),
+                                               kOperandNotNeeded);
+    auto set_ids_to_not_set = [&tensor_id_to_nnapi_id](const int* buf,
+                                                       size_t count) {
+      for (int j = 0; j < count; j++) {
+        auto tensor_id = buf[j];
+        if (tensor_id != kOptionalTensor) {
+          tensor_id_to_nnapi_id[tensor_id] = kOperandIdNotSet;
+        }
+      }
+    };
+    for (size_t i = 0; i < subgraph->nodes_size(); i++) {
+      const auto* node_and_registration = subgraph->node_and_registration(i);
+      const TfLiteNode& node = node_and_registration->first;
+      set_ids_to_not_set(node.inputs->data, node.inputs->size);
+      set_ids_to_not_set(node.outputs->data, node.outputs->size);
+    }
+    set_ids_to_not_set(subgraph->inputs().data(), subgraph->inputs().size());
+    set_ids_to_not_set(subgraph->outputs().data(), subgraph->outputs().size());
+
+    uint32_t next_id = 0;
+    RETURN_ERROR_IF_TFLITE_FAILED(addTensorOperands(
+        subgraph, nn_model_, &next_id, &tensor_id_to_nnapi_id));
+    RETURN_ERROR_IF_TFLITE_FAILED(
+        AddOpsAndParams(subgraph, nn_model_, next_id, &model_states_inputs_,
+                        &model_states_outputs_, tensor_id_to_nnapi_id));
+
+    std::vector<uint32_t> augmented_inputs;
+    MapAndAddTensorIds(subgraph->inputs().data(), subgraph->inputs().size(),
+                       &augmented_inputs, tensor_id_to_nnapi_id);
+    augmented_inputs.insert(augmented_inputs.end(),
+                            model_states_inputs_.begin(),
+                            model_states_inputs_.end());
+    std::vector<uint32_t> augmented_outputs;
+    MapAndAddTensorIds(subgraph->outputs().data(), subgraph->outputs().size(),
+                       &augmented_outputs, tensor_id_to_nnapi_id);
+    MapAndAddTensorIds(model_states_outputs_.data(),
+                       model_states_outputs_.size(), &augmented_outputs,
+                       tensor_id_to_nnapi_id);
+
+    CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
+        nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
+        reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
+        static_cast<uint32_t>(augmented_outputs.size()),
+        reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
+
+    if (GetAndroidSdkVersionCached() >= 28) {
+      CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+          nn_model_, subgraph->GetAllowFp16PrecisionForFp32()));
+    }
+    CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
+  }
+  if (!nn_compiled_model_) {
+    CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_));
+    CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_));
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
+  if (!nn_model_) {
+    model_status_ = BuildGraph(subgraph);
+    if (model_status_ != kTfLiteOk) {
+      logError("Failed to build graph for NNAPI");
+    }
+  }
+  if (model_status_ != kTfLiteOk) {
+    return model_status_;
+  }
+
+  ANeuralNetworksExecution* execution = nullptr;
+  CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
+
+  // Currently perform deep copy of input buffer
+  for (size_t i = 0; i < subgraph->inputs().size(); i++) {
+    int input = subgraph->inputs()[i];
+    // TODO(aselle): Is this what we want or do we want input instead?
+    // TODO(aselle): This should be called setInputValue maybe to be cons.
+    TfLiteTensor* tensor = subgraph->tensor(input);
+    CHECK_NN(ANeuralNetworksExecution_setInput(
+        execution, i, nullptr, tensor->data.raw, tensor->bytes));
+  }
+
+  // Tell nn api where to place final data.
+  for (size_t i = 0; i < subgraph->outputs().size(); i++) {
+    int output = subgraph->outputs()[i];
+    TfLiteTensor* tensor = subgraph->tensor(output);
+    CHECK_NN(ANeuralNetworksExecution_setOutput(
+        execution, i, nullptr, tensor->data.raw, tensor->bytes));
+  }
+
+  // The state_out of previous invocation need to be mapped to state_in of
+  // current invocation.
+  for (size_t i = 0; i < model_states_outputs_.size(); i++) {
+    int state_tensor_idx = model_states_outputs_[i];
+    TfLiteTensor* tensor = subgraph->tensor(state_tensor_idx);
+    // Here we are using a deep copy for state_in tensors so that we are not
+    // reading and writing into the same buffer during a invocation.
+    // TODO(miaowang): using double shared buffer to minimize the copies.
+    CHECK_NN(ANeuralNetworksExecution_setInput(
+        execution, i + subgraph->inputs().size(), nullptr, tensor->data.raw,
+        tensor->bytes));
+    // Tell NNAPI where to output the state_out.
+    CHECK_NN(ANeuralNetworksExecution_setOutput(
+        execution, i + subgraph->outputs().size(), nullptr, tensor->data.raw,
+        tensor->bytes));
+  }
+
+  // Currently use blocking compute.
+  ANeuralNetworksEvent* event = nullptr;
+  CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
+  CHECK_NN(ANeuralNetworksEvent_wait(event));
+  ANeuralNetworksEvent_free(event);
+  ANeuralNetworksExecution_free(execution);
+
+#if 0
+  printf("From the NN API:\n");
+  TfLiteTensor* tensor = subgraph->tensor(subgraph->outputs()[0]);
+  if (float* data =
+          subgraph->typed_tensor<float>(subgraph->outputs()[0])) {
+    size_t num = tensor->bytes / sizeof(float);
+    for (float* p = data; p < data + num; p++) {
+      printf(" %f", *p);
+    }
+    printf("\n");
+  }
+#endif
+
+  return kTfLiteOk;
+}
+
+bool NNAPIDelegate::IsSupported() { return NNAPIExists(); }
+
+}  // namespace tflite
diff --git a/tensorflow/lite/nnapi_delegate.h b/tensorflow/lite/nnapi_delegate.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4f8e4ecf3935c41346c78647e631651dbcccb3e
--- /dev/null
+++ b/tensorflow/lite/nnapi_delegate.h
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_NNAPI_DELEGATE_H_
+#define TENSORFLOW_LITE_NNAPI_DELEGATE_H_
+
+#include "tensorflow/lite/allocation.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/interpreter.h"
+
+class ANeuralNetworksModel;
+class ANeuralNetworksMemory;
+class ANeuralNetworksCompilation;
+
+namespace tflite {
+
+class NNAPIAllocation : public MMAPAllocation {
+ public:
+  NNAPIAllocation(const char* filename, ErrorReporter* error_reporter);
+  ~NNAPIAllocation();
+
+  size_t offset(const void* ptr) const {
+    auto signed_offset = reinterpret_cast<const uint8_t*>(ptr) -
+                         reinterpret_cast<const uint8_t*>(mmapped_buffer_);
+
+    return static_cast<size_t>(signed_offset);
+  }
+
+  ANeuralNetworksMemory* memory() const { return handle_; }
+  bool valid() const override { return handle_ != nullptr; }
+
+ private:
+  mutable ANeuralNetworksMemory* handle_ = nullptr;
+};
+
+class NNAPIDelegate {
+ public:
+  ~NNAPIDelegate();
+
+  // Convert a tflite graph to NNAPI
+  TfLiteStatus BuildGraph(Subgraph* subgraph);
+
+  // Run
+  TfLiteStatus Invoke(Subgraph* subgraph);
+
+  // Whether the current platform supports NNAPI delegation.
+  static bool IsSupported();
+
+ private:
+  // The NN API model handle
+  ANeuralNetworksModel* nn_model_ = nullptr;
+  // The NN API compilation handle
+  ANeuralNetworksCompilation* nn_compiled_model_ = nullptr;
+  // Model status
+  TfLiteStatus model_status_ = kTfLiteOk;
+
+  // List of state tensors for LSTM, RNN, SVDF.
+  // NN API does not allow ops to maintain states across multiple
+  // invocations. We need to manually create state input tensors from
+  // corresponding state output tensors of TFLite operations, and map them
+  // correctly.
+  std::vector<int> model_states_inputs_;   // holds NNAPI operand ids
+  std::vector<int> model_states_outputs_;  // holds TFLite tensor ids
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_NNAPI_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/nnapi_delegate_disabled.cc b/tensorflow/lite/nnapi_delegate_disabled.cc
similarity index 86%
rename from tensorflow/contrib/lite/nnapi_delegate_disabled.cc
rename to tensorflow/lite/nnapi_delegate_disabled.cc
index e3536d3db6c59fa241dfc1cd866005b94172f779..a8f2c0bfe386f1339c17e34a199cf929c43ecc33 100644
--- a/tensorflow/contrib/lite/nnapi_delegate_disabled.cc
+++ b/tensorflow/lite/nnapi_delegate_disabled.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/nnapi_delegate.h"
+#include "tensorflow/lite/nnapi_delegate.h"
 
 #include <cassert>
 
@@ -35,13 +35,11 @@ NNAPIDelegate::~NNAPIDelegate() {
 #undef UNUSED_MEMBER
 }
 
-TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
+TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
   return kTfLiteError;
 }
 
-TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
-  return kTfLiteError;
-}
+TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) { return kTfLiteError; }
 
 bool NNAPIDelegate::IsSupported() { return false; }
 
diff --git a/tensorflow/lite/op_resolver.h b/tensorflow/lite/op_resolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..96490d44b91c101647f3ed2c7cebbec523d687c3
--- /dev/null
+++ b/tensorflow/lite/op_resolver.h
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Compatibility shim for moved header location.
+#ifndef TENSORFLOW_LITE_OP_RESOLVER_H_
+#define TENSORFLOW_LITE_OP_RESOLVER_H_
+
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
+
+#endif  // TENSORFLOW_LITE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc
similarity index 82%
rename from tensorflow/contrib/lite/optional_debug_tools.cc
rename to tensorflow/lite/optional_debug_tools.cc
index 64ba2d8baa2ea22bdc9d6ded57c5086ef4781623..1113bf01b175d93d849dbd51abf2f6c677f450d4 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.cc
+++ b/tensorflow/lite/optional_debug_tools.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/optional_debug_tools.h"
+#include "tensorflow/lite/optional_debug_tools.h"
 
 namespace tflite {
 
@@ -44,6 +44,8 @@ const char* TensorTypeName(TfLiteType type) {
       return "kTfLiteInt32";
     case kTfLiteUInt8:
       return "kTfLiteUInt8";
+    case kTfLiteInt8:
+      return "kTfLiteInt8";
     case kTfLiteInt64:
       return "kTfLiteInt64";
     case kTfLiteString:
@@ -83,26 +85,27 @@ void PrintInterpreterState(Interpreter* interpreter) {
   printf("Outputs:");
   PrintIntVector(interpreter->outputs());
   printf("\n");
-  for (int tensor_index = 0; tensor_index < interpreter->tensors_size();
+  for (size_t tensor_index = 0; tensor_index < interpreter->tensors_size();
        tensor_index++) {
-    TfLiteTensor* tensor = interpreter->tensor(tensor_index);
-    printf("Tensor %3d %-20s %10s %15s %10zu bytes (%4.1f MB) ", tensor_index,
+    TfLiteTensor* tensor = interpreter->tensor(static_cast<int>(tensor_index));
+    printf("Tensor %3zu %-20s %10s %15s %10zu bytes (%4.1f MB) ", tensor_index,
            tensor->name, TensorTypeName(tensor->type),
            AllocTypeName(tensor->allocation_type), tensor->bytes,
            (static_cast<float>(tensor->bytes) / (1 << 20)));
     PrintTfLiteIntVector(tensor->dims);
   }
   printf("\n");
-  for (int node_index = 0; node_index < interpreter->nodes_size();
+  for (size_t node_index = 0; node_index < interpreter->nodes_size();
        node_index++) {
     const std::pair<TfLiteNode, TfLiteRegistration>* node_and_reg =
-        interpreter->node_and_registration(node_index);
+        interpreter->node_and_registration(static_cast<int>(node_index));
     const TfLiteNode& node = node_and_reg->first;
     const TfLiteRegistration& reg = node_and_reg->second;
     if (reg.custom_name != nullptr) {
-      printf("Node %3d Operator Custom Name %s\n", node_index, reg.custom_name);
+      printf("Node %3zu Operator Custom Name %s\n", node_index,
+             reg.custom_name);
     } else {
-      printf("Node %3d Operator Builtin Code %3d\n", node_index,
+      printf("Node %3zu Operator Builtin Code %3d\n", node_index,
              reg.builtin_code);
     }
     printf("  Inputs:");
diff --git a/tensorflow/contrib/lite/optional_debug_tools.h b/tensorflow/lite/optional_debug_tools.h
similarity index 80%
rename from tensorflow/contrib/lite/optional_debug_tools.h
rename to tensorflow/lite/optional_debug_tools.h
index 82a6e114a66eb3865da6f09a634ccb6367454bdb..fb2f78e5ae42abdad955752335d476250e370f62 100644
--- a/tensorflow/contrib/lite/optional_debug_tools.h
+++ b/tensorflow/lite/optional_debug_tools.h
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 // Optional debugging functionality. For small sized binaries, these are not
 // needed.
-#ifndef TENSORFLOW_CONTRIB_LITE_OPTIONAL_DEBUG_TOOLS_H_
-#define TENSORFLOW_CONTRIB_LITE_OPTIONAL_DEBUG_TOOLS_H_
+#ifndef TENSORFLOW_LITE_OPTIONAL_DEBUG_TOOLS_H_
+#define TENSORFLOW_LITE_OPTIONAL_DEBUG_TOOLS_H_
 
-#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/lite/interpreter.h"
 
 namespace tflite {
 
@@ -26,4 +26,4 @@ void PrintInterpreterState(Interpreter* interpreter);
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_OPTIONAL_DEBUG_TOOLS_H_
+#endif  // TENSORFLOW_LITE_OPTIONAL_DEBUG_TOOLS_H_
diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..52ea6fe636247ec0a4d5fedb41c56fc095e6ac61
--- /dev/null
+++ b/tensorflow/lite/profiling/BUILD
@@ -0,0 +1,83 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+
+common_copts = [
+    "-Wall",
+] + tflite_copts()
+
+cc_library(
+    name = "profiler",
+    hdrs = ["profiler.h"],
+    copts = common_copts,
+    deps = [":profile_buffer"],
+)
+
+cc_test(
+    name = "profiler_test",
+    srcs = ["profiler_test.cc"],
+    copts = ["-DTFLITE_PROFILING_ENABLED"],
+    defines = ["TFLITE_PROFILING_ENABLED"],
+    deps = [
+        ":profiler",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "profile_buffer",
+    hdrs = ["profile_buffer.h"],
+    copts = common_copts,
+    deps = [":time"],
+)
+
+cc_library(
+    name = "time",
+    srcs = ["time.cc"],
+    hdrs = ["time.h"],
+    copts = common_copts,
+)
+
+cc_library(
+    name = "profile_summarizer",
+    srcs = ["profile_summarizer.cc"],
+    hdrs = ["profile_summarizer.h"],
+    copts = common_copts,
+    deps = [
+        ":profiler",
+        "//tensorflow/core:stats_calculator_portable",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+cc_test(
+    name = "profile_summarizer_test",
+    srcs = ["profile_summarizer_test.cc"],
+    copts = common_copts,
+    deps = [
+        ":profile_summarizer",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "profile_buffer_test",
+    srcs = ["profile_buffer_test.cc"],
+    copts = ["-DTFLITE_PROFILING_ENABLED"],
+    defines = ["TFLITE_PROFILING_ENABLED"],
+    deps = [
+        ":profile_buffer",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/profiling/profile_buffer.h b/tensorflow/lite/profiling/profile_buffer.h
similarity index 95%
rename from tensorflow/contrib/lite/profiling/profile_buffer.h
rename to tensorflow/lite/profiling/profile_buffer.h
index 65d86dce47f397c7dad6cc2beb8ffa1f95b29d45..247ebb37c53e7a1a7197155c0f63c877857289e1 100644
--- a/tensorflow/contrib/lite/profiling/profile_buffer.h
+++ b/tensorflow/lite/profiling/profile_buffer.h
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
-#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
+#ifndef TENSORFLOW_LITE_PROFILING_PROFILE_BUFFER_H_
+#define TENSORFLOW_LITE_PROFILING_PROFILE_BUFFER_H_
 
 #include <cstddef>
 #include <cstdint>
 
-#include "tensorflow/contrib/lite/profiling/time.h"
+#include "tensorflow/lite/profiling/time.h"
 
 namespace tflite {
 namespace profiling {
@@ -143,4 +143,4 @@ class ProfileBuffer {
 }  // namespace profiling
 }  // namespace tflite
 #endif  // TFLITE_PROFILING_ENABLED
-#endif  // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_BUFFER_H_
+#endif  // TENSORFLOW_LITE_PROFILING_PROFILE_BUFFER_H_
diff --git a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc b/tensorflow/lite/profiling/profile_buffer_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/profiling/profile_buffer_test.cc
rename to tensorflow/lite/profiling/profile_buffer_test.cc
index b8784cca455cfc301f2cc30c9c6d031b7174f829..6642a15884fdf57cb385e186fd75620183098375 100644
--- a/tensorflow/contrib/lite/profiling/profile_buffer_test.cc
+++ b/tensorflow/lite/profiling/profile_buffer_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/profiling/profile_buffer.h"
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/profiling/profile_buffer.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace profiling {
diff --git a/tensorflow/contrib/lite/profiling/profile_summarizer.cc b/tensorflow/lite/profiling/profile_summarizer.cc
similarity index 97%
rename from tensorflow/contrib/lite/profiling/profile_summarizer.cc
rename to tensorflow/lite/profiling/profile_summarizer.cc
index 720bd717b9e3b0c45cbdbaaad2b6900edacc3051..64b1bd7ad771c11412a2558bf4454ad2e06c0096 100644
--- a/tensorflow/contrib/lite/profiling/profile_summarizer.cc
+++ b/tensorflow/lite/profiling/profile_summarizer.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/profiling/profile_summarizer.h"
+#include "tensorflow/lite/profiling/profile_summarizer.h"
 
 #include <sstream>
 
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace profiling {
diff --git a/tensorflow/contrib/lite/profiling/profile_summarizer.h b/tensorflow/lite/profiling/profile_summarizer.h
similarity index 84%
rename from tensorflow/contrib/lite/profiling/profile_summarizer.h
rename to tensorflow/lite/profiling/profile_summarizer.h
index a529ff87428d70d002241311d7f70f185521020f..d4f5da7be96adcf8068526686b035f0e7a04f27a 100644
--- a/tensorflow/contrib/lite/profiling/profile_summarizer.h
+++ b/tensorflow/lite/profiling/profile_summarizer.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_SUMMARIZER_H_
-#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_SUMMARIZER_H_
+#ifndef TENSORFLOW_LITE_PROFILING_PROFILE_SUMMARIZER_H_
+#define TENSORFLOW_LITE_PROFILING_PROFILE_SUMMARIZER_H_
 
 #include <vector>
 
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/profiling/profiler.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/core/util/stats_calculator.h"
 
 namespace tflite {
@@ -52,4 +52,4 @@ class ProfileSummarizer {
 }  // namespace profiling
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILE_SUMMARIZER_H_
+#endif  // TENSORFLOW_LITE_PROFILING_PROFILE_SUMMARIZER_H_
diff --git a/tensorflow/contrib/lite/profiling/profile_summarizer_test.cc b/tensorflow/lite/profiling/profile_summarizer_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/profiling/profile_summarizer_test.cc
rename to tensorflow/lite/profiling/profile_summarizer_test.cc
index 465c294962df77a84f3c71d41f51f855773c3614..bbb64b832aecae2c51f0f9563c8c5d001f7651c7 100644
--- a/tensorflow/contrib/lite/profiling/profile_summarizer_test.cc
+++ b/tensorflow/lite/profiling/profile_summarizer_test.cc
@@ -18,13 +18,13 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/kernels/kernel_util.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/profiling/profile_summarizer.h"
-#include "tensorflow/contrib/lite/testing/util.h"
-#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/profiling/profile_summarizer.h"
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/version.h"
 
 namespace tflite {
 namespace profiling {
diff --git a/tensorflow/contrib/lite/profiling/profiler.h b/tensorflow/lite/profiling/profiler.h
similarity index 95%
rename from tensorflow/contrib/lite/profiling/profiler.h
rename to tensorflow/lite/profiling/profiler.h
index 8c3e4dc76d8061fdd9d238b7647dc07a5ecdf0e2..89c05cba37b37a88b9d91db8f997e1fcecf43174 100644
--- a/tensorflow/contrib/lite/profiling/profiler.h
+++ b/tensorflow/lite/profiling/profiler.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
-#define TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
+#ifndef TENSORFLOW_LITE_PROFILING_PROFILER_H_
+#define TENSORFLOW_LITE_PROFILING_PROFILER_H_
 
 #include <vector>
 
-#include "tensorflow/contrib/lite/profiling/profile_buffer.h"
+#include "tensorflow/lite/profiling/profile_buffer.h"
 
 #ifdef TFLITE_PROFILING_ENABLED
 
@@ -176,4 +176,4 @@ class Profiler {
 
 #endif  // TFLITE_PROFILING_ENABLED
 
-#endif  // TENSORFLOW_CONTRIB_LITE_PROFILING_PROFILER_H_
+#endif  // TENSORFLOW_LITE_PROFILING_PROFILER_H_
diff --git a/tensorflow/contrib/lite/profiling/profiler_test.cc b/tensorflow/lite/profiling/profiler_test.cc
similarity index 79%
rename from tensorflow/contrib/lite/profiling/profiler_test.cc
rename to tensorflow/lite/profiling/profiler_test.cc
index cf56eed2a4643e0e5ab918bd06cbb85fd3e94a83..addebabe1b1556e3853eb0a2bec65132f743d012 100644
--- a/tensorflow/contrib/lite/profiling/profiler_test.cc
+++ b/tensorflow/lite/profiling/profiler_test.cc
@@ -20,18 +20,15 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/profiling/profiler.h"
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/profiling/profiler.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace profiling {
 namespace {
 
-void AssertDurationOfEventAroundMs(const ProfileEvent* event,
-                                   double expected_ms, double eps_ms) {
-  double duration_ms =
-      (event->end_timestamp_us - event->begin_timestamp_us) / 1e3;
-  EXPECT_NEAR(expected_ms, duration_ms, eps_ms);
+double GetDurationOfEventMs(const ProfileEvent* event) {
+  return (event->end_timestamp_us - event->begin_timestamp_us) / 1e3;
 }
 
 void SleepForQuarterSecond(Profiler* profiler) {
@@ -84,12 +81,17 @@ TEST(ProfilingTest, ProfilesAreCollected) {
 
 #ifndef ADDRESS_SANITIZER
   // ASAN build is sometimes very slow. Set a large epsilon to avoid flakiness.
+  // Due to flakiness, just verify relative values match.
   const int eps_ms = 50;
-  AssertDurationOfEventAroundMs(profile_events[0], /*expected_ms*/ 500, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[1], /*expected_ms*/ 250, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[2], /*expected_ms*/ 250, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[3], /*expected_ms*/ 250, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[4], /*expected_ms*/ 250, eps_ms);
+  auto parent_ms = GetDurationOfEventMs(profile_events[0]);
+  double child_ms[2], sleep_for_quarter_ms[2];
+  child_ms[0] = GetDurationOfEventMs(profile_events[1]);
+  child_ms[1] = GetDurationOfEventMs(profile_events[3]);
+  sleep_for_quarter_ms[0] = GetDurationOfEventMs(profile_events[2]);
+  sleep_for_quarter_ms[1] = GetDurationOfEventMs(profile_events[4]);
+  EXPECT_NEAR(parent_ms, child_ms[0] + child_ms[1], eps_ms);
+  EXPECT_NEAR(child_ms[0], sleep_for_quarter_ms[0], eps_ms);
+  EXPECT_NEAR(child_ms[1], sleep_for_quarter_ms[1], eps_ms);
 #endif
 }
 
diff --git a/tensorflow/contrib/lite/profiling/time.cc b/tensorflow/lite/profiling/time.cc
similarity index 96%
rename from tensorflow/contrib/lite/profiling/time.cc
rename to tensorflow/lite/profiling/time.cc
index 875ddb02bcfc30f4c2ef543fe1c15bec467e5410..3e7db03d9d8df1eeb0c82d388324716c5e7d7896 100644
--- a/tensorflow/contrib/lite/profiling/time.cc
+++ b/tensorflow/lite/profiling/time.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/profiling/time.h"
+#include "tensorflow/lite/profiling/time.h"
 
 #if defined(_MSC_VER)
 #include <chrono>  // NOLINT(build/c++11)
diff --git a/tensorflow/contrib/lite/profiling/time.h b/tensorflow/lite/profiling/time.h
similarity index 84%
rename from tensorflow/contrib/lite/profiling/time.h
rename to tensorflow/lite/profiling/time.h
index cc2ec319b8a95b3efa0aab0ac9f97a88bf7b5536..66233a480fd390619629e26a05284202057e0f4a 100644
--- a/tensorflow/contrib/lite/profiling/time.h
+++ b/tensorflow/lite/profiling/time.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_
-#define TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_
+#ifndef TENSORFLOW_LITE_PROFILING_TIME_H_
+#define TENSORFLOW_LITE_PROFILING_TIME_H_
 
 #include <cstdint>
 
@@ -24,4 +24,4 @@ uint64_t NowMicros();
 }  // namespace time
 }  // namespace profiling
 }  // namespace tflite
-#endif  // TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_
+#endif  // TENSORFLOW_LITE_PROFILING_TIME_H_
diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..acf827892bfd0081f1bbc7d0c3fa4f65af3a0817
--- /dev/null
+++ b/tensorflow/lite/python/BUILD
@@ -0,0 +1,191 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "interpreter_test_data",
+    srcs = glob(["**/testdata/*"]),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "interpreter",
+    srcs = [
+        "interpreter.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/lite/python/interpreter_wrapper:tensorflow_wrap_interpreter_wrapper",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "interpreter_test",
+    srcs = ["interpreter_test.py"],
+    data = [":interpreter_test_data"],
+    srcs_version = "PY2AND3",
+    tags = ["no_oss"],
+    deps = [
+        ":interpreter",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_binary(
+    name = "tflite_convert",
+    srcs = ["tflite_convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lite",
+    ],
+)
+
+py_library(
+    name = "lite",
+    srcs = ["lite.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":convert",
+        ":convert_saved_model",
+        ":interpreter",
+        ":lite_constants",
+        ":op_hint",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/saved_model:constants",
+        "//tensorflow/python/saved_model:loader",
+    ],
+)
+
+py_test(
+    name = "lite_test",
+    srcs = ["lite_test.py"],
+    data = ["@tflite_mobilenet_ssd_quant_protobuf//:tflite_graph.pb"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_windows",
+    ],
+    deps = [
+        ":lite",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_library(
+    name = "lite_constants",
+    srcs = ["lite_constants.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/lite/toco:toco_flags_proto_py",
+        "//tensorflow/python:dtypes",
+    ],
+)
+
+py_library(
+    name = "convert",
+    srcs = ["convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lite_constants",
+        "//tensorflow/lite/toco:model_flags_proto_py",
+        "//tensorflow/lite/toco:toco_flags_proto_py",
+        "//tensorflow/lite/toco/python:tensorflow_wrap_toco",
+        "//tensorflow/lite/toco/python:toco_from_protos",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_library(
+    name = "op_hint",
+    srcs = ["op_hint.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/graph_editor:graph_editor_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_test(
+    name = "convert_test",
+    srcs = ["convert_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convert",
+        ":interpreter",
+        ":op_hint",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+    ],
+)
+
+py_library(
+    name = "convert_saved_model",
+    srcs = ["convert_saved_model.py"],
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//tensorflow/contrib/lite:__subpackages__",
+        "//tensorflow/lite:__subpackages__",
+    ],
+    deps = [
+        ":convert",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/saved_model",
+    ],
+)
+
+py_binary(
+    name = "create_custom_op",
+    srcs = ["create_custom_op.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+        "@absl_py//absl/flags",
+    ],
+)
+
+py_test(
+    name = "convert_saved_model_test",
+    srcs = ["convert_saved_model_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":convert_saved_model",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/saved_model",
+    ],
+)
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c603998717019ac8624868b16d720e300a30efd
--- /dev/null
+++ b/tensorflow/lite/python/convert.py
@@ -0,0 +1,472 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts a frozen graph into a TFLite FlatBuffer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import enum  # pylint: disable=g-bad-import-order
+
+import os as _os
+import platform as _platform
+import subprocess as _subprocess
+import tempfile as _tempfile
+
+from tensorflow.lite.python import lite_constants
+from tensorflow.lite.toco import model_flags_pb2 as _model_flags_pb2
+from tensorflow.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.lite.toco import types_pb2 as _types_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import resource_loader as _resource_loader
+from tensorflow.python.util import deprecation
+from tensorflow.python.util.lazy_loader import LazyLoader
+from tensorflow.python.util.tf_export import tf_export as _tf_export
+
+# Lazy load since some of the performance benchmark skylark rules
+# break dependencies.
+_toco_python = LazyLoader(
+    "tensorflow_wrap_toco", globals(),
+    "tensorflow.lite.toco.python."
+    "tensorflow_wrap_toco")
+del LazyLoader
+
+# Find the toco_from_protos binary using the resource loader if using from
+# bazel, otherwise we are in a pip where console_scripts already has
+# the toco_from_protos tool.
+if lite_constants.EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
+  _toco_from_proto_bin = ""
+else:
+  _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
+      "../toco/python/toco_from_protos")
+
+if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin):
+  _toco_from_proto_bin = "toco_from_protos"
+
+
+# Map of tf.dtypes to TFLite types_flag_pb2.
+_MAP_TF_TO_TFLITE_TYPES = {
+    dtypes.float32: _types_pb2.FLOAT,
+    dtypes.int32: _types_pb2.INT32,
+    dtypes.int64: _types_pb2.INT64,
+    dtypes.string: _types_pb2.STRING,
+    dtypes.uint8: _types_pb2.QUANTIZED_UINT8,
+    dtypes.complex64: _types_pb2.COMPLEX64
+}
+
+
+def _try_convert_to_unicode(output):
+  if output is None:
+    return u""
+
+  if isinstance(output, bytes):
+    try:
+      return output.decode()
+    except UnicodeDecodeError:
+      pass
+  return output
+
+
+def convert_dtype_to_tflite_type(tf_dtype):
+  """Converts tf.dtype to TFLite proto type.
+
+  Args:
+    tf_dtype: tf.dtype
+
+  Raises:
+    ValueError: Unsupported tf.dtype.
+
+  Returns:
+    types_flag_pb2.
+  """
+  result = _MAP_TF_TO_TFLITE_TYPES.get(tf_dtype)
+  if result is None:
+    raise ValueError("Unsupported tf.dtype {0}".format(tf_dtype))
+  return result
+
+
+@_tf_export("lite.OpsSet")
+class OpsSet(enum.Enum):
+  """Enum class defining the sets of ops available to generate TFLite models.
+
+  WARNING: Experimental interface, subject to change.
+  """
+  # Convert model using TensorFlow Lite builtin ops.
+  TFLITE_BUILTINS = "TFLITE_BUILTINS"
+
+  # Convert model using TensorFlow ops. Not all TensorFlow ops are available.
+  # WARNING: Experimental interface, subject to change.
+  SELECT_TF_OPS = "SELECT_TF_OPS"
+
+  def __str__(self):
+    return self.value
+
+  @staticmethod
+  def get_options():
+    """Returns a list of OpsSet options as a list of strings."""
+    return [str(option) for option in list(OpsSet)]
+
+
+class ConverterError(Exception):
+  """Raised when an error occurs during model conversion."""
+  pass
+
+
+# Don't expose these for now.
+#  @_tf_export("lite.toco_convert_protos")
+def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
+  """Convert `input_data_str` according to model and toco parameters.
+
+  Unless you know what you are doing consider using
+  the more friendly `tf.lite.toco_convert`.
+
+  Args:
+    model_flags_str: Serialized proto describing model properties, see
+      `toco/model_flags.proto`.
+    toco_flags_str: Serialized proto describing conversion properties, see
+      `toco/toco_flags.proto`.
+    input_data_str: Input data in serialized form (e.g. a graphdef is common)
+  Returns:
+    Converted model in serialized form (e.g. a TFLITE model is common).
+  Raises:
+    ConverterError: When conversion fails in TFLiteConverter, usually due to
+      ops not being supported.
+    RuntimeError: When conversion fails, an exception is raised with the error
+      message embedded.
+  """
+  # TODO(aselle): When toco does not use fatal errors for failure, we can
+  # switch this on.
+  if not _toco_from_proto_bin:
+    try:
+      model_str = _toco_python.TocoConvert(model_flags_str, toco_flags_str,
+                                           input_data_str)
+      return model_str
+    except Exception as e:
+      raise ConverterError("TOCO failed: %s" % e)
+
+  # Windows and TemporaryFile are not that useful together,
+  # since you cannot have two readers/writers. So we have to
+  # make the temporaries and close and delete them explicitly.
+  toco_filename, model_filename, input_filename, output_filename = (
+      None, None, None, None)
+  try:
+    # Build all input files
+    with _tempfile.NamedTemporaryFile(delete=False) as fp_toco, \
+             _tempfile.NamedTemporaryFile(delete=False) as fp_model, \
+             _tempfile.NamedTemporaryFile(delete=False) as fp_input:
+      toco_filename = fp_toco.name
+      input_filename = fp_input.name
+      model_filename = fp_model.name
+      fp_model.write(model_flags_str)
+      fp_toco.write(toco_flags_str)
+      fp_input.write(input_data_str)
+      fp_model.flush()
+      fp_toco.flush()
+      fp_input.flush()
+
+    # Reserve an output file
+    with _tempfile.NamedTemporaryFile(delete=False) as fp:
+      output_filename = fp.name
+
+    # Run
+    cmd = [
+        _toco_from_proto_bin, model_filename, toco_filename, input_filename,
+        output_filename
+    ]
+    cmdline = " ".join(cmd)
+    is_windows = _platform.system() == "Windows"
+    proc = _subprocess.Popen(
+        cmdline,
+        shell=True,
+        stdout=_subprocess.PIPE,
+        stderr=_subprocess.STDOUT,
+        close_fds=not is_windows)
+    stdout, stderr = proc.communicate()
+    exitcode = proc.returncode
+    if exitcode == 0:
+      with open(output_filename, "rb") as fp:
+        return fp.read()
+    else:
+      stdout = _try_convert_to_unicode(stdout)
+      stderr = _try_convert_to_unicode(stderr)
+      raise ConverterError(
+          "TOCO failed. See console for info.\n%s\n%s\n" % (stdout, stderr))
+  finally:
+    # Must manually cleanup files.
+    for filename in [
+        toco_filename, input_filename, model_filename, output_filename]:
+      try:
+        _os.unlink(filename)
+      except (OSError, TypeError):
+        pass
+
+
+def tensor_name(x):
+  return x.name.split(":")[0]
+
+
+# Don't expose these for now.
+# @_tf_export("lite.build_toco_convert_protos")
+def build_toco_convert_protos(input_tensors,
+                              output_tensors,
+                              inference_type=lite_constants.FLOAT,
+                              inference_input_type=None,
+                              input_format=lite_constants.TENSORFLOW_GRAPHDEF,
+                              input_shapes=None,
+                              output_format=lite_constants.TFLITE,
+                              quantized_input_stats=None,
+                              default_ranges_stats=None,
+                              drop_control_dependency=True,
+                              reorder_across_fake_quant=False,
+                              allow_custom_ops=False,
+                              change_concat_input_ranges=False,
+                              post_training_quantize=False,
+                              dump_graphviz_dir=None,
+                              dump_graphviz_video=False,
+                              target_ops=None,
+                              allow_nonexistent_arrays=False):
+  """Builds protocol buffers describing a conversion of a model using TOCO.
+
+  Typically this is to convert from TensorFlow GraphDef to TFLite, in which
+  case the default `input_format` and `output_format` are sufficient.
+
+  Args:
+    input_tensors: List of input tensors. Type and shape are computed using
+      `foo.get_shape()` and `foo.dtype`.
+    output_tensors: List of output tensors (only .name is used from this).
+    inference_type: Target data type of real-number arrays in the output file.
+      Must be `{tf.float32, tf.uint8}`.  (default tf.float32)
+    inference_input_type: Target data type of real-number input arrays. Allows
+      for a different type for input arrays in the case of quantization.
+      Must be `{tf.float32, tf.uint8}`. (default `inference_type`)
+    input_format: Type of data to read Currently must be
+      `{TENSORFLOW_GRAPHDEF}`. (default TENSORFLOW_GRAPHDEF)
+    input_shapes: Input array shape. It needs to be a list of the same length
+      as `input_tensors`, or None. (default None)
+    output_format: Output file format. Currently must be `{TFLITE,
+      GRAPHVIZ_DOT}`. (default TFLITE)
+    quantized_input_stats: List of tuples of floats representing the mean and
+      standard deviation. Each tuple maps to the corresponding input tensor.
+      Only need if `inference_input_type` is `QUANTIZED_UINT8`.
+      real_input_value = (quantized_input_value - mean_value) / std_dev_value.
+      (default None)
+    default_ranges_stats: Tuple of integers representing (min, max) range values
+      for all arrays without a specified range. Intended for experimenting with
+      quantization via "dummy quantization". (default None)
+    drop_control_dependency: Boolean indicating whether to drop control
+      dependencies silently. This is due to TFLite not supporting control
+      dependencies. (default True)
+    reorder_across_fake_quant: Boolean indicating whether to reorder FakeQuant
+      nodes in unexpected locations. Used when the location of the FakeQuant
+      nodes is preventing graph transformations necessary to convert the graph.
+      Results in a graph that differs from the quantized training graph,
+      potentially causing differing arithmetic behavior. (default False)
+    allow_custom_ops: Boolean indicating whether to allow custom operations.
+      When false any unknown operation is an error. When true, custom ops are
+      created for any op that is unknown. The developer will need to provide
+      these to the TensorFlow Lite runtime with a custom resolver.
+      (default False)
+    change_concat_input_ranges: Boolean to change behavior of min/max ranges for
+      inputs and outputs of the concat operator for quantized models. Changes
+      the ranges of concat operator overlap when true. (default False)
+    post_training_quantize: Boolean indicating whether to quantize the weights
+      of the converted float model. Model size will be reduced and there will be
+      latency improvements (at the cost of accuracy).
+      (default False)
+    dump_graphviz_dir: Full filepath of folder to dump the graphs at various
+      stages of processing GraphViz .dot files. Preferred over
+      --output_format=GRAPHVIZ_DOT in order to keep the requirements of the
+      output file. (default None)
+    dump_graphviz_video: Boolean indicating whether to dump the graph after
+      every graph transformation. (default False)
+    target_ops: Experimental flag, subject to change. Set of OpsSet
+      options indicating which converter to use.
+      (default set([OpsSet.TFLITE_BUILTINS]))
+    allow_nonexistent_arrays: Allow specifying array names that don't exist
+      or are unused in the final graph. (default False)
+
+  Returns:
+    model_flags, toco_flags: two protocol buffers describing the conversion
+    process.
+
+  Raises:
+    ValueError:
+      If the input tensor type is unknown
+      Missing mean_values or std_dev_values
+    RuntimeError: If TOCO fails to convert (in which case the runtime error's
+      error text will contain the TOCO error log)
+  """
+  toco = _toco_flags_pb2.TocoFlags()
+  toco.input_format = input_format
+  toco.output_format = output_format
+  toco.inference_type = convert_dtype_to_tflite_type(inference_type)
+  if inference_input_type:
+    toco.inference_input_type = convert_dtype_to_tflite_type(
+        inference_input_type)
+  else:
+    toco.inference_input_type = toco.inference_type
+  toco.drop_control_dependency = drop_control_dependency
+  toco.reorder_across_fake_quant = reorder_across_fake_quant
+  toco.allow_custom_ops = allow_custom_ops
+  toco.post_training_quantize = post_training_quantize
+  if default_ranges_stats:
+    toco.default_ranges_min = default_ranges_stats[0]
+    toco.default_ranges_max = default_ranges_stats[1]
+  if dump_graphviz_dir:
+    toco.dump_graphviz_dir = dump_graphviz_dir
+  toco.dump_graphviz_include_video = dump_graphviz_video
+  if target_ops:
+    if set(target_ops) == set([OpsSet.TFLITE_BUILTINS, OpsSet.SELECT_TF_OPS]):
+      toco.enable_select_tf_ops = True
+    elif set(target_ops) == set([OpsSet.SELECT_TF_OPS]):
+      toco.enable_select_tf_ops = True
+      toco.force_select_tf_ops = True
+
+  model = _model_flags_pb2.ModelFlags()
+  model.change_concat_input_ranges = change_concat_input_ranges
+  for idx, input_tensor in enumerate(input_tensors):
+    input_array = model.input_arrays.add()
+    input_array.name = tensor_name(input_tensor)
+    input_array.data_type = convert_dtype_to_tflite_type(input_tensor.dtype)
+
+    if toco.inference_input_type == _types_pb2.QUANTIZED_UINT8:
+      if not quantized_input_stats:
+        raise ValueError("std_dev and mean must be defined when "
+                         "inference_input_type is QUANTIZED_UINT8.")
+      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
+    if input_shapes is None:
+      shape = input_tensor.get_shape()
+    else:
+      shape = input_shapes[idx]
+    input_array.shape.dims.extend(map(int, shape))
+
+  for output_tensor in output_tensors:
+    model.output_arrays.append(tensor_name(output_tensor))
+
+  model.allow_nonexistent_arrays = allow_nonexistent_arrays
+
+  return model, toco
+
+
+def toco_convert_graph_def(input_data, input_arrays_with_shape, output_arrays,
+                           *args, **kwargs):
+  """"Convert a model using TOCO.
+
+  This function is used to convert GraphDefs that cannot be loaded into
+  TensorFlow to TFLite. Conversion can be customized by providing arguments
+  that are forwarded to `build_toco_convert_protos` (see documentation for
+  details).
+
+  Args:
+    input_data: Input data (i.e. often `sess.graph_def`),
+    input_arrays_with_shape: Tuple of strings representing input tensor names
+      and list of integers representing input shapes
+      (e.g., [("foo" : [1, 16, 16, 3])]). Use only when graph cannot be loaded
+      into TensorFlow and when `input_tensors` is None. (default None)
+    output_arrays: List of output tensors to freeze graph with. Use only when
+      graph cannot be loaded into TensorFlow and when `output_tensors` is None.
+      (default None)
+    *args: See `build_toco_convert_protos`,
+    **kwargs: See `build_toco_convert_protos`.
+
+  Returns:
+    The converted data. For example if TFLite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    Defined in `build_toco_convert_protos`.
+  """
+  model_flags, toco_flags = build_toco_convert_protos(
+      input_tensors=[], output_tensors=[], *args, **kwargs)
+
+  for idx, (name, shape) in enumerate(input_arrays_with_shape):
+    input_array = model_flags.input_arrays.add()
+    if toco_flags.inference_input_type == _types_pb2.QUANTIZED_UINT8:
+      if (("quantized_input_stats" not in kwargs) or
+          (not kwargs["quantized_input_stats"])):
+        raise ValueError("std_dev and mean must be defined when "
+                         "inference_input_type is QUANTIZED_UINT8.")
+      input_array.mean_value, input_array.std_value = kwargs[
+          "quantized_input_stats"][idx]
+    input_array.name = name
+    input_array.shape.dims.extend(map(int, shape))
+
+  for name in output_arrays:
+    model_flags.output_arrays.append(name)
+
+  data = toco_convert_protos(model_flags.SerializeToString(),
+                             toco_flags.SerializeToString(),
+                             input_data.SerializeToString())
+  return data
+
+
+def toco_convert_impl(input_data, input_tensors, output_tensors, *args,
+                      **kwargs):
+  """"Convert a model using TOCO.
+
+  Typically this function is used to convert from TensorFlow GraphDef to TFLite.
+  Conversion can be customized by providing arguments that are forwarded to
+  `build_toco_convert_protos` (see documentation for details).
+
+  Args:
+    input_data: Input data (i.e. often `sess.graph_def`),
+    input_tensors: List of input tensors. Type and shape are computed using
+      `foo.get_shape()` and `foo.dtype`.
+    output_tensors: List of output tensors (only .name is used from this).
+    *args: See `build_toco_convert_protos`,
+    **kwargs: See `build_toco_convert_protos`.
+
+  Returns:
+    The converted data. For example if TFLite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    Defined in `build_toco_convert_protos`.
+  """
+  model_flags, toco_flags = build_toco_convert_protos(
+      input_tensors, output_tensors, *args, **kwargs)
+  data = toco_convert_protos(model_flags.SerializeToString(),
+                             toco_flags.SerializeToString(),
+                             input_data.SerializeToString())
+  return data
+
+
+@_tf_export("lite.toco_convert")
+@deprecation.deprecated(None, "Use `lite.TFLiteConverter` instead.")
+def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
+  """Convert a model using TOCO.
+
+  Typically this function is used to convert from TensorFlow GraphDef to TFLite.
+  Conversion can be customized by providing arguments that are forwarded to
+  `build_toco_convert_protos` (see documentation for details). This function has
+  been deprecated. Please use `lite.TFLiteConverter` instead.
+
+  Args:
+    input_data: Input data (i.e. often `sess.graph_def`),
+    input_tensors: List of input tensors. Type and shape are computed using
+      `foo.get_shape()` and `foo.dtype`.
+    output_tensors: List of output tensors (only .name is used from this).
+    *args: See `build_toco_convert_protos`,
+    **kwargs: See `build_toco_convert_protos`.
+
+  Returns:
+    The converted data. For example if TFLite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    Defined in `build_toco_convert_protos`.
+  """
+  return toco_convert_impl(input_data, input_tensors, output_tensors, *args,
+                           **kwargs)
diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/lite/python/convert_saved_model.py
similarity index 91%
rename from tensorflow/contrib/lite/python/convert_saved_model.py
rename to tensorflow/lite/python/convert_saved_model.py
index d18b60d0ea04ee47e83ea6ea4e773a2f86358d11..f8d986b746911c68e0589b587ce0beceafc0c534 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model.py
+++ b/tensorflow/lite/python/convert_saved_model.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.lite.python.convert import tensor_name
+from tensorflow.lite.python.convert import tensor_name
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import graph_util as tf_graph_util
@@ -197,12 +197,27 @@ def set_tensor_shapes(tensors, shapes):
     tensors: TensorFlow ops.Tensor.
     shapes: Dict of strings representing input tensor names to list of
       integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}).
+
+  Raises:
+    ValueError:
+      `shapes` contains an invalid tensor.
+      `shapes` contains an invalid shape for a valid tensor.
   """
   if shapes:
-    for tensor in tensors:
-      shape = shapes.get(tensor_name(tensor))
+    tensor_names_to_tensor = {tensor_name(tensor): tensor for tensor in tensors}
+    for name, shape in shapes.items():
+      if name not in tensor_names_to_tensor:
+        raise ValueError("Invalid tensor \'{}\' found in tensor shapes "
+                         "map.".format(name))
       if shape is not None:
-        tensor.set_shape(shape)
+        tensor = tensor_names_to_tensor[name]
+        try:
+          tensor.set_shape(shape)
+        except ValueError as error:
+          message = ("The shape of tensor '{0}' cannot be changed from {1} to "
+                     "{2}. {3}".format(name, tensor.get_shape(), shape,
+                                       str(error)))
+          raise ValueError(message)
 
 
 def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
diff --git a/tensorflow/lite/python/convert_saved_model_test.py b/tensorflow/lite/python/convert_saved_model_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..11bfcdc79548378a0cec8d13a089a8d505ccf7b0
--- /dev/null
+++ b/tensorflow/lite/python/convert_saved_model_test.py
@@ -0,0 +1,331 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TFLite SavedModel conversion test cases.
+
+  - Tests converting simple SavedModel graph to TFLite FlatBuffer.
+  - Tests converting simple SavedModel graph to frozen graph.
+  - Tests converting MNIST SavedModel to TFLite FlatBuffer.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from tensorflow.lite.python import convert_saved_model
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import saved_model
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
+
+
+class TensorFunctionsTest(test_util.TensorFlowTestCase):
+
+  @test_util.run_v1_only("b/120545219")
+  def testGetTensorsValid(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    _ = in_tensor + in_tensor
+    sess = session.Session()
+
+    tensors = convert_saved_model.get_tensors_from_tensor_names(
+        sess.graph, ["Placeholder"])
+    self.assertEqual("Placeholder:0", tensors[0].name)
+
+  @test_util.run_v1_only("b/120545219")
+  def testGetTensorsInvalid(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    _ = in_tensor + in_tensor
+    sess = session.Session()
+
+    with self.assertRaises(ValueError) as error:
+      convert_saved_model.get_tensors_from_tensor_names(sess.graph,
+                                                        ["invalid-input"])
+    self.assertEqual("Invalid tensors 'invalid-input' were found.",
+                     str(error.exception))
+
+  @test_util.run_v1_only("b/120545219")
+  def testSetTensorShapeValid(self):
+    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+    convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [5, 3, 5]})
+    self.assertEqual([5, 3, 5], tensor.shape.as_list())
+
+  @test_util.run_v1_only("b/120545219")
+  def testSetTensorShapeNoneValid(self):
+    tensor = array_ops.placeholder(dtype=dtypes.float32)
+    self.assertEqual(None, tensor.shape)
+
+    convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [1, 3, 5]})
+    self.assertEqual([1, 3, 5], tensor.shape.as_list())
+
+  @test_util.run_v1_only("b/120545219")
+  def testSetTensorShapeArrayInvalid(self):
+    # Tests set_tensor_shape where the tensor name passed in doesn't exist.
+    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+    with self.assertRaises(ValueError) as error:
+      convert_saved_model.set_tensor_shapes([tensor],
+                                            {"invalid-input": [5, 3, 5]})
+    self.assertEqual(
+        "Invalid tensor 'invalid-input' found in tensor shapes map.",
+        str(error.exception))
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+  @test_util.run_v1_only("b/120545219")
+  def testSetTensorShapeDimensionInvalid(self):
+    # Tests set_tensor_shape where the shape passed in is incompatiable.
+    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+    with self.assertRaises(ValueError) as error:
+      convert_saved_model.set_tensor_shapes([tensor],
+                                            {"Placeholder": [1, 5, 5]})
+    self.assertIn(
+        "The shape of tensor 'Placeholder' cannot be changed from "
+        "(?, 3, 5) to [1, 5, 5].", str(error.exception))
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+  @test_util.run_v1_only("b/120545219")
+  def testSetTensorShapeEmpty(self):
+    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+    convert_saved_model.set_tensor_shapes([tensor], {})
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+
+class FreezeSavedModelTest(test_util.TensorFlowTestCase):
+
+  def _createSimpleSavedModel(self, shape):
+    """Create a simple SavedModel on the fly."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
+    with session.Session() as sess:
+      in_tensor = array_ops.placeholder(shape=shape, dtype=dtypes.float32)
+      out_tensor = in_tensor + in_tensor
+      inputs = {"x": in_tensor}
+      outputs = {"y": out_tensor}
+      saved_model.simple_save(sess, saved_model_dir, inputs, outputs)
+    return saved_model_dir
+
+  def _createSavedModelTwoInputArrays(self, shape):
+    """Create a simple SavedModel."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "simple_savedmodel")
+    with session.Session() as sess:
+      in_tensor_1 = array_ops.placeholder(
+          shape=shape, dtype=dtypes.float32, name="inputB")
+      in_tensor_2 = array_ops.placeholder(
+          shape=shape, dtype=dtypes.float32, name="inputA")
+      out_tensor = in_tensor_1 + in_tensor_2
+      inputs = {"x": in_tensor_1, "y": in_tensor_2}
+      outputs = {"z": out_tensor}
+      saved_model.simple_save(sess, saved_model_dir, inputs, outputs)
+    return saved_model_dir
+
+  def _getArrayNames(self, tensors):
+    return [tensor.name for tensor in tensors]
+
+  def _getArrayShapes(self, tensors):
+    dims = []
+    for tensor in tensors:
+      dim_tensor = []
+      for dim in tensor.shape:
+        if isinstance(dim, tensor_shape.Dimension):
+          dim_tensor.append(dim.value)
+        else:
+          dim_tensor.append(dim)
+      dims.append(dim_tensor)
+    return dims
+
+  def _convertSavedModel(self,
+                         saved_model_dir,
+                         input_arrays=None,
+                         input_shapes=None,
+                         output_arrays=None,
+                         tag_set=None,
+                         signature_key=None):
+    if tag_set is None:
+      tag_set = set([tag_constants.SERVING])
+    if signature_key is None:
+      signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    graph_def, in_tensors, out_tensors = convert_saved_model.freeze_saved_model(
+        saved_model_dir=saved_model_dir,
+        input_arrays=input_arrays,
+        input_shapes=input_shapes,
+        output_arrays=output_arrays,
+        tag_set=tag_set,
+        signature_key=signature_key)
+    return graph_def, in_tensors, out_tensors
+
+  def testSimpleSavedModel(self):
+    """Test a SavedModel."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    _, in_tensors, out_tensors = self._convertSavedModel(saved_model_dir)
+
+    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
+    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
+    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 16, 16, 3]])
+
+  def testSimpleSavedModelWithNoneBatchSizeInShape(self):
+    """Test a SavedModel with None in input tensor's shape."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
+    _, in_tensors, out_tensors = self._convertSavedModel(saved_model_dir)
+
+    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
+    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
+    self.assertEqual(self._getArrayShapes(in_tensors), [[None, 16, 16, 3]])
+
+  def testSimpleSavedModelWithInvalidSignatureKey(self):
+    """Test a SavedModel that fails due to an invalid signature_key."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    with self.assertRaises(ValueError) as error:
+      self._convertSavedModel(saved_model_dir, signature_key="invalid-key")
+    self.assertEqual(
+        "No 'invalid-key' in the SavedModel's SignatureDefs. "
+        "Possible values are 'serving_default'.", str(error.exception))
+
+  def testSimpleSavedModelWithInvalidOutputArray(self):
+    """Test a SavedModel that fails due to invalid output arrays."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    with self.assertRaises(ValueError) as error:
+      self._convertSavedModel(saved_model_dir, output_arrays=["invalid-output"])
+    self.assertEqual("Invalid tensors 'invalid-output' were found.",
+                     str(error.exception))
+
+  def testSimpleSavedModelWithWrongInputArrays(self):
+    """Test a SavedModel that fails due to invalid input arrays."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+
+    # Check invalid input_arrays.
+    with self.assertRaises(ValueError) as error:
+      self._convertSavedModel(saved_model_dir, input_arrays=["invalid-input"])
+    self.assertEqual("Invalid tensors 'invalid-input' were found.",
+                     str(error.exception))
+
+    # Check valid and invalid input_arrays.
+    with self.assertRaises(ValueError) as error:
+      self._convertSavedModel(
+          saved_model_dir, input_arrays=["Placeholder", "invalid-input"])
+    self.assertEqual("Invalid tensors 'invalid-input' were found.",
+                     str(error.exception))
+
+  def testSimpleSavedModelWithCorrectArrays(self):
+    """Test a SavedModel with correct input_arrays and output_arrays."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[None, 16, 16, 3])
+    _, in_tensors, out_tensors = self._convertSavedModel(
+        saved_model_dir=saved_model_dir,
+        input_arrays=["Placeholder"],
+        output_arrays=["add"])
+
+    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
+    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
+    self.assertEqual(self._getArrayShapes(in_tensors), [[None, 16, 16, 3]])
+
+  def testSimpleSavedModelWithCorrectInputArrays(self):
+    """Test a SavedModel with correct input_arrays and input_shapes."""
+    saved_model_dir = self._createSimpleSavedModel(shape=[1, 16, 16, 3])
+    _, in_tensors, out_tensors = self._convertSavedModel(
+        saved_model_dir=saved_model_dir,
+        input_arrays=["Placeholder"],
+        input_shapes={"Placeholder": [1, 16, 16, 3]})
+
+    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
+    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
+    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 16, 16, 3]])
+
+  def testTwoInputArrays(self):
+    """Test a simple SavedModel."""
+    saved_model_dir = self._createSavedModelTwoInputArrays(shape=[1, 16, 16, 3])
+
+    _, in_tensors, out_tensors = self._convertSavedModel(
+        saved_model_dir=saved_model_dir, input_arrays=["inputB", "inputA"])
+
+    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
+    self.assertEqual(self._getArrayNames(in_tensors), ["inputA:0", "inputB:0"])
+    self.assertEqual(
+        self._getArrayShapes(in_tensors), [[1, 16, 16, 3], [1, 16, 16, 3]])
+
+  def testSubsetInputArrays(self):
+    """Test a SavedModel with a subset of the input array names of the model."""
+    saved_model_dir = self._createSavedModelTwoInputArrays(shape=[1, 16, 16, 3])
+
+    # Check case where input shape is given.
+    _, in_tensors, out_tensors = self._convertSavedModel(
+        saved_model_dir=saved_model_dir,
+        input_arrays=["inputA"],
+        input_shapes={"inputA": [1, 16, 16, 3]})
+
+    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
+    self.assertEqual(self._getArrayNames(in_tensors), ["inputA:0"])
+    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 16, 16, 3]])
+
+    # Check case where input shape is None.
+    _, in_tensors, out_tensors = self._convertSavedModel(
+        saved_model_dir=saved_model_dir, input_arrays=["inputA"])
+
+    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
+    self.assertEqual(self._getArrayNames(in_tensors), ["inputA:0"])
+    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 16, 16, 3]])
+
+  def testMultipleMetaGraphDef(self):
+    """Test saved model with multiple MetaGraphDefs."""
+    saved_model_dir = os.path.join(self.get_temp_dir(), "savedmodel_two_mgd")
+    builder = saved_model.builder.SavedModelBuilder(saved_model_dir)
+    with session.Session(graph=ops.Graph()) as sess:
+      # MetaGraphDef 1
+      in_tensor = array_ops.placeholder(shape=[1, 28, 28], dtype=dtypes.float32)
+      out_tensor = in_tensor + in_tensor
+      sig_input_tensor = saved_model.utils.build_tensor_info(in_tensor)
+      sig_input_tensor_signature = {"x": sig_input_tensor}
+      sig_output_tensor = saved_model.utils.build_tensor_info(out_tensor)
+      sig_output_tensor_signature = {"y": sig_output_tensor}
+      predict_signature_def = (
+          saved_model.signature_def_utils.build_signature_def(
+              sig_input_tensor_signature, sig_output_tensor_signature,
+              saved_model.signature_constants.PREDICT_METHOD_NAME))
+      signature_def_map = {
+          saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+              predict_signature_def
+      }
+      builder.add_meta_graph_and_variables(
+          sess,
+          tags=[saved_model.tag_constants.SERVING, "additional_test_tag"],
+          signature_def_map=signature_def_map)
+
+      # MetaGraphDef 2
+      builder.add_meta_graph(tags=["tflite"])
+      builder.save(True)
+
+    # Convert to tflite
+    _, in_tensors, out_tensors = self._convertSavedModel(
+        saved_model_dir=saved_model_dir,
+        tag_set=set([saved_model.tag_constants.SERVING, "additional_test_tag"]))
+
+    self.assertEqual(self._getArrayNames(out_tensors), ["add:0"])
+    self.assertEqual(self._getArrayNames(in_tensors), ["Placeholder:0"])
+    self.assertEqual(self._getArrayShapes(in_tensors), [[1, 28, 28]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/lite/python/convert_test.py b/tensorflow/lite/python/convert_test.py
similarity index 81%
rename from tensorflow/contrib/lite/python/convert_test.py
rename to tensorflow/lite/python/convert_test.py
index 40a8b5fafb2dbf3b30dfae4ad307737b18782480..cf49ee2b472d2c6617811cde0978eb8ae3a16f8e 100644
--- a/tensorflow/contrib/lite/python/convert_test.py
+++ b/tensorflow/lite/python/convert_test.py
@@ -19,10 +19,11 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.lite.python import convert
-from tensorflow.contrib.lite.python import lite_constants
-from tensorflow.contrib.lite.python import op_hint
-from tensorflow.contrib.lite.python.interpreter import Interpreter
+from tensorflow.lite.python import convert
+from tensorflow.lite.python import lite_constants
+from tensorflow.lite.python import op_hint
+from tensorflow.lite.python.interpreter import Interpreter
+from tensorflow.lite.toco import types_pb2 as _types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -33,6 +34,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class ConvertTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
@@ -65,6 +67,21 @@ class ConvertTest(test_util.TensorFlowTestCase):
         quantized_input_stats=[(0., 1.)])
     self.assertTrue(tflite_model)
 
+  def testQuantizationInvalid(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = array_ops.fake_quant_with_min_max_args(
+        in_tensor + in_tensor, min=0., max=1.)
+    sess = session.Session()
+
+    with self.assertRaises(ValueError) as error:
+      convert.toco_convert(
+          sess.graph_def, [in_tensor], [out_tensor],
+          inference_type=lite_constants.QUANTIZED_UINT8)
+    self.assertEqual(
+        "std_dev and mean must be defined when inference_input_type is "
+        "QUANTIZED_UINT8.", str(error.exception))
+
   def testGraphDefBasic(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32, name="input")
@@ -138,7 +155,29 @@ class ConvertTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]["shape"]).all())
     self.assertTrue(output_details[0]["quantization"][0] > 0)  # scale
 
+  def testGraphDefQuantizationInvalid(self):
+    in_tensor_1 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name="inputA")
+    in_tensor_2 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name="inputB")
+    _ = array_ops.fake_quant_with_min_max_args(
+        in_tensor_1 + in_tensor_2, min=0., max=1., name="output")
+    sess = session.Session()
 
+    input_arrays_map = [("inputA", [1, 16, 16, 3]), ("inputB", [1, 16, 16, 3])]
+    output_arrays = ["output"]
+    with self.assertRaises(ValueError) as error:
+      convert.toco_convert_graph_def(
+          sess.graph_def,
+          input_arrays_map,
+          output_arrays,
+          inference_type=lite_constants.QUANTIZED_UINT8)
+    self.assertEqual(
+        "std_dev and mean must be defined when inference_input_type is "
+        "QUANTIZED_UINT8.", str(error.exception))
+
+
+@test_util.run_v1_only("b/120545219")
 class ConvertTestOpHint(test_util.TensorFlowTestCase):
   """Test the hint to stub functionality."""
 
@@ -146,7 +185,7 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
     """Returns used op types in `graphdef` reachable from `output_nodes`.
 
     This is used to check that after the stub transformation the expected
-    nodes are there. Typically use this with self.assertCountEqual(...).
+    nodes are there.
 
     NOTE: this is not a exact test that the graph is the correct output, but
       it balances compact expressibility of test with sanity checking.
@@ -196,11 +235,11 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
       stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
           graph_def=sess.graph_def)
 
-      self.assertCountEqual(
+      self.assertEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
               output_nodes=[op_hint._tensor_name_base(output.name)]),
-          ["cool_activation", "Const", "Identity"])
+          set(["cool_activation", "Const", "Identity"]))
 
   def testScaleAndBiasAndIdentity(self):
     """This tests a scaled add which has 3 inputs and 2 outputs."""
@@ -223,11 +262,11 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
       stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
           graph_def=sess.graph_def)
 
-      self.assertCountEqual(
+      self.assertEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
               output_nodes=[op_hint._tensor_name_base(output.name)]),
-          ["scale_and_bias_and_identity", "Const", "Identity", "Pack"])
+          set(["scale_and_bias_and_identity", "Const", "Identity", "Pack"]))
 
   def testTwoFunctions(self):
     """Tests if two functions are converted correctly."""
@@ -248,11 +287,11 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
       self.assertEqual(self._countIdentities(sess.graph_def.node), 5)
       stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
           graph_def=sess.graph_def)
-      self.assertCountEqual(
+      self.assertEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
               output_nodes=[op_hint._tensor_name_base(output.name)]),
-          ["add_test", "Const", "Identity", "Add"])
+          set(["add_test", "Const", "Identity", "Add"]))
 
   def _get_input_index(self, x):
     return x.op.node_def.attr[op_hint.OpHint.FUNCTION_INPUT_INDEX_ATTR].i
@@ -323,11 +362,32 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
     with self.cached_session() as sess:
       stubbed_graphdef = op_hint.convert_op_hints_to_stubs(
           graph_def=sess.graph_def)
-      self.assertCountEqual(
+      self.assertEqual(
           self._getGraphOpTypes(
               stubbed_graphdef,
               output_nodes=[op_hint._tensor_name_base(output.name)]),
-          ["agg", "Const", "Identity"])
+          set(["agg", "Const", "Identity"]))
+
+  def testConvertDtype(self):
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(lite_constants.FLOAT),
+        _types_pb2.FLOAT)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.float32), _types_pb2.FLOAT)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.int32), _types_pb2.INT32)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.int64), _types_pb2.INT64)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.string), _types_pb2.STRING)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.uint8),
+        _types_pb2.QUANTIZED_UINT8)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.complex64),
+        _types_pb2.COMPLEX64)
+    with self.assertRaises(ValueError):
+      convert.convert_dtype_to_tflite_type(dtypes.bool)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/lite/python/create_custom_op.py b/tensorflow/lite/python/create_custom_op.py
similarity index 98%
rename from tensorflow/contrib/lite/python/create_custom_op.py
rename to tensorflow/lite/python/create_custom_op.py
index 830f95358c455047db2cbad15cfed8c221e95dca..344cd28d160f2d3d4f277bbfb41aa21087659af5 100644
--- a/tensorflow/contrib/lite/python/create_custom_op.py
+++ b/tensorflow/lite/python/create_custom_op.py
@@ -19,7 +19,7 @@ portions of a TensorFlow GraphDef to be executed by custom code.
 
 Example:
 
-bazel run tensorflow/contrib/lite/python:create_custom_op  -- \
+bazel run tensorflow/lite/python:create_custom_op  -- \
   --input_graph=/tmp/input.pb \
   --output_graph=/tmp/output.pb \
   --inputs=concat,concat_1 \
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py
similarity index 98%
rename from tensorflow/contrib/lite/python/interpreter.py
rename to tensorflow/lite/python/interpreter.py
index 6300552cbe3129bb8fded6c37c8a1d30052d3489..a6183d13b56c787aac0d9d9fc190eff277eb4c8e 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/lite/python/interpreter.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import sys
 import numpy as np
 from tensorflow.python.util.lazy_loader import LazyLoader
+from tensorflow.python.util.tf_export import tf_export as _tf_export
 
 # Lazy load since some of the performance benchmark skylark rules
 # break dependencies. Must use double quotes to match code internal rewrite
@@ -27,13 +28,14 @@ from tensorflow.python.util.lazy_loader import LazyLoader
 # pylint: disable=g-inconsistent-quotes
 _interpreter_wrapper = LazyLoader(
     "_interpreter_wrapper", globals(),
-    "tensorflow.contrib.lite.python.interpreter_wrapper."
+    "tensorflow.lite.python.interpreter_wrapper."
     "tensorflow_wrap_interpreter_wrapper")
 # pylint: enable=g-inconsistent-quotes
 
 del LazyLoader
 
 
+@_tf_export('lite.Interpreter')
 class Interpreter(object):
   """Interpreter inferace for TF-Lite Models."""
 
@@ -223,6 +225,7 @@ class Interpreter(object):
 
     Usage:
 
+    ```
     interpreter.allocate_tensors()
     input = interpreter.tensor(interpreter.get_input_details()[0]["index"])
     output = interpreter.tensor(interpreter.get_output_details()[0]["index"])
@@ -230,6 +233,7 @@ class Interpreter(object):
       input().fill(3.)
       interpreter.invoke()
       print("inference %s" % output())
+    ```
 
     Notice how this function avoids making a numpy array directly. This is
     because it is important to not hold actual numpy views to the data longer
@@ -240,12 +244,14 @@ class Interpreter(object):
 
     WRONG:
 
+    ```
     input = interpreter.tensor(interpreter.get_input_details()[0]["index"])()
     output = interpreter.tensor(interpreter.get_output_details()[0]["index"])()
     interpreter.allocate_tensors()  # This will throw RuntimeError
     for i in range(10):
       input.fill(3.)
       interpreter.invoke()  # this will throw RuntimeError since input,output
+    ```
 
     Args:
       tensor_index: Tensor index of tensor to get. This value can be gotten from
diff --git a/tensorflow/contrib/lite/python/interpreter_test.py b/tensorflow/lite/python/interpreter_test.py
similarity index 98%
rename from tensorflow/contrib/lite/python/interpreter_test.py
rename to tensorflow/lite/python/interpreter_test.py
index e77d52ca9950ec42300264bb56ebce253d4982b1..7ec56a21c9ffa82e1893d3846d92564539ac34ae 100644
--- a/tensorflow/contrib/lite/python/interpreter_test.py
+++ b/tensorflow/lite/python/interpreter_test.py
@@ -21,7 +21,7 @@ import io
 import numpy as np
 import six
 
-from tensorflow.contrib.lite.python import interpreter as interpreter_wrapper
+from tensorflow.lite.python import interpreter as interpreter_wrapper
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
diff --git a/tensorflow/lite/python/interpreter_wrapper/BUILD b/tensorflow/lite/python/interpreter_wrapper/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..767a9fc476398dd8fb60128f73f8ae7c518d9a21
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/BUILD
@@ -0,0 +1,31 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+
+cc_library(
+    name = "interpreter_wrapper_lib",
+    srcs = ["interpreter_wrapper.cc"],
+    hdrs = ["interpreter_wrapper.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "tensorflow_wrap_interpreter_wrapper",
+    srcs = [
+        "interpreter_wrapper.i",
+    ],
+    deps = [
+        ":interpreter_wrapper_lib",
+        "//third_party/python_runtime:headers",
+    ],
+)
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
similarity index 98%
rename from tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
rename to tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index 1e2384b6d23167026c1e22689581aed113c6d577..d14af439ec0ab600ea260da17ef0041cca25d629 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h"
+#include "tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h"
 
 #include <sstream>
 #include <string>
 
 #include "absl/memory/memory.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
 
 // Disallow Numpy 1.7 deprecated symbols.
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
@@ -124,6 +124,8 @@ int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
       return NPY_INT16;
     case kTfLiteUInt8:
       return NPY_UINT8;
+    case kTfLiteInt8:
+      return NPY_INT8;
     case kTfLiteInt64:
       return NPY_INT64;
     case kTfLiteString:
@@ -150,6 +152,8 @@ TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
       return kTfLiteInt16;
     case NPY_UINT8:
       return kTfLiteUInt8;
+    case NPY_INT8:
+      return kTfLiteInt8;
     case NPY_INT64:
       return kTfLiteInt64;
     case NPY_BOOL:
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h
similarity index 93%
rename from tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
rename to tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h
index b98046fe8a2ce54c5304df770fae548bc5f4db55..ffb02780255e4141ae12e3acf3e157b5b0539b4d 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
-#define TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
 
 #include <memory>
 #include <string>
@@ -104,4 +104,4 @@ class InterpreterWrapper {
 }  // namespace interpreter_wrapper
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_INTERPRETER_WRAPPER_H_
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.i b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
similarity index 88%
rename from tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.i
rename to tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
index afb2092eacab1d8dcccf8c75cee1d8d5c34d7e75..f52ef1eeca7db397d84d249b74445a3276bc65fb 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.i
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
@@ -18,13 +18,13 @@ limitations under the License.
 
 %{
 #define SWIG_FILE_WITH_INIT
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h"
 %}
 
 
-%include "tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h"
+%include "tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h"
 
 namespace tflite {
 namespace interpreter_wrapper {
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/lite/python/lite.py
similarity index 90%
rename from tensorflow/contrib/lite/python/lite.py
rename to tensorflow/lite/python/lite.py
index 09365f101f1b7b38d970f995d28d091944558f75..1b20ff2f92b6a84c21972ccccbc27ec6f999d74b 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -25,8 +25,6 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 @@convert_op_hints_to_stubs
 @@build_toco_convert_protos
 
-@@FLOAT
-@@QUANTIZED_UINT8
 @@TFLITE
 @@GRAPHVIZ_DOT
 
@@ -39,20 +37,21 @@ from six import PY3
 
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
-from tensorflow.contrib.lite.python import lite_constants as constants
-from tensorflow.contrib.lite.python.convert import build_toco_convert_protos  # pylint: disable=unused-import
-from tensorflow.contrib.lite.python.convert import ConverterMode
-from tensorflow.contrib.lite.python.convert import tensor_name as _tensor_name
-from tensorflow.contrib.lite.python.convert import toco_convert  # pylint: disable=unused-import
-from tensorflow.contrib.lite.python.convert import toco_convert_graph_def as _toco_convert_graph_def
-from tensorflow.contrib.lite.python.convert import toco_convert_impl as _toco_convert_impl
-from tensorflow.contrib.lite.python.convert import toco_convert_protos  # pylint: disable=unused-import
-from tensorflow.contrib.lite.python.convert_saved_model import freeze_saved_model as _freeze_saved_model
-from tensorflow.contrib.lite.python.convert_saved_model import get_tensors_from_tensor_names as _get_tensors_from_tensor_names
-from tensorflow.contrib.lite.python.convert_saved_model import set_tensor_shapes as _set_tensor_shapes
-from tensorflow.contrib.lite.python.interpreter import Interpreter  # pylint: disable=unused-import
-from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs  # pylint: disable=unused-import
-from tensorflow.contrib.lite.python.op_hint import OpHint  # pylint: disable=unused-import
+from tensorflow.lite.python import lite_constants as constants
+from tensorflow.lite.python.convert import build_toco_convert_protos  # pylint: disable=unused-import
+from tensorflow.lite.python.convert import ConverterError  # pylint: disable=unused-import
+from tensorflow.lite.python.convert import OpsSet
+from tensorflow.lite.python.convert import tensor_name as _tensor_name
+from tensorflow.lite.python.convert import toco_convert  # pylint: disable=unused-import
+from tensorflow.lite.python.convert import toco_convert_graph_def as _toco_convert_graph_def
+from tensorflow.lite.python.convert import toco_convert_impl as _toco_convert_impl
+from tensorflow.lite.python.convert import toco_convert_protos  # pylint: disable=unused-import
+from tensorflow.lite.python.convert_saved_model import freeze_saved_model as _freeze_saved_model
+from tensorflow.lite.python.convert_saved_model import get_tensors_from_tensor_names as _get_tensors_from_tensor_names
+from tensorflow.lite.python.convert_saved_model import set_tensor_shapes as _set_tensor_shapes
+from tensorflow.lite.python.interpreter import Interpreter  # pylint: disable=unused-import
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs  # pylint: disable=unused-import
+from tensorflow.lite.python.op_hint import OpHint  # pylint: disable=unused-import
 from tensorflow.core.framework import graph_pb2 as _graph_pb2
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
@@ -64,8 +63,10 @@ from tensorflow.python.lib.io import file_io as _file_io
 from tensorflow.python.saved_model import signature_constants as _signature_constants
 from tensorflow.python.saved_model import tag_constants as _tag_constants
 from tensorflow.python.util import deprecation as _deprecation
+from tensorflow.python.util.tf_export import tf_export as _tf_export
 
 
+@_tf_export("lite.TFLiteConverter")
 class TFLiteConverter(object):
   """Convert a TensorFlow model into `output_format` using TOCO.
 
@@ -75,10 +76,10 @@ class TFLiteConverter(object):
   Attributes:
 
     inference_type: Target data type of real-number arrays in the output file.
-      Must be `{FLOAT, QUANTIZED_UINT8}`.  (default FLOAT)
+      Must be `{tf.float32, tf.uint8}`. (default tf.float32)
     inference_input_type: Target data type of real-number input arrays. Allows
       for a different type for input arrays in the case of quantization.
-      Must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
+      Must be `{tf.float32, tf.uint8}`. (default `inference_type`)
     output_format: Output file format. Currently must be `{TFLITE,
       GRAPHVIZ_DOT}`. (default TFLITE)
     quantized_input_stats: Dict of strings representing input tensor names
@@ -116,8 +117,9 @@ class TFLiteConverter(object):
       output file. (default None)
     dump_graphviz_video: Boolean indicating whether to dump the graph after
       every graph transformation. (default False)
-    converter_mode: Experimental flag, subject to change. ConverterMode
-      indicating which converter to use. (default ConverterMode.DEFAULT)
+    target_ops: Experimental flag, subject to change. Set of OpsSet
+      options indicating which converter to use.
+      (default set([OpsSet.TFLITE_BUILTINS]))
 
   Example usage:
 
@@ -183,7 +185,7 @@ class TFLiteConverter(object):
     self.post_training_quantize = False
     self.dump_graphviz_dir = None
     self.dump_graphviz_video = False
-    self.converter_mode = ConverterMode.DEFAULT
+    self.target_ops = set([OpsSet.TFLITE_BUILTINS])
 
     # Attributes are used by models that cannot be loaded into TensorFlow.
     if not self._has_valid_tensors():
@@ -394,20 +396,20 @@ class TFLiteConverter(object):
       ValueError:
         Input shape is not specified.
         None value for dimension in input_tensor.
-        ConverterMode option is unsupported for the model.
     """
     # Checks dimensions in input tensor.
     if self._has_valid_tensors():
       for tensor in self._input_tensors:
-        if not tensor.get_shape():
+        shape = tensor.get_shape()
+        if not shape or not shape.as_list():
           raise ValueError("Provide an input shape for input array "
                            "'{0}'.".format(_tensor_name(tensor)))
-        shape = tensor.get_shape().as_list()
-        if None in shape[1:]:
+        shape_list = shape.as_list()
+        if None in shape_list[1:]:
           raise ValueError(
               "None is only supported in the 1st dimension. Tensor '{0}' has "
-              "invalid shape '{1}'.".format(_tensor_name(tensor), shape))
-        elif shape[0] is None:
+              "invalid shape '{1}'.".format(_tensor_name(tensor), shape_list))
+        elif shape_list[0] is None:
           self._set_batch_size(batch_size=1)
 
     # Get quantization stats. Ensures there is one stat per name if the stats
@@ -439,24 +441,19 @@ class TFLiteConverter(object):
         "change_concat_input_ranges": self.change_concat_input_ranges,
         "allow_custom_ops": self.allow_custom_ops,
         "post_training_quantize": self.post_training_quantize,
+        "target_ops": self.target_ops,
         "dump_graphviz_dir": self.dump_graphviz_dir,
         "dump_graphviz_video": self.dump_graphviz_video
     }
 
     # Converts model.
     if self._has_valid_tensors():
-      converter_kwargs["converter_mode"] = self.converter_mode
       result = _toco_convert_impl(
           input_data=self._graph_def,
           input_tensors=self._input_tensors,
           output_tensors=self._output_tensors,
           **converter_kwargs)
     else:
-      # Graphs without valid tensors cannot be loaded into tf.Session since they
-      # contain TFLite operation(s) that cannot be resolved in TensorFlow.
-      if self.converter_mode != ConverterMode.DEFAULT:
-        raise ValueError("This model can only be converted with the default "
-                         "converter.")
       result = _toco_convert_graph_def(
           input_data=self._graph_def,
           input_arrays_with_shape=self._input_arrays_with_shape,
@@ -503,6 +500,7 @@ class TFLiteConverter(object):
       tensor.set_shape(shape)
 
 
+@_tf_export("lite.TocoConverter")
 class TocoConverter(object):
   """Convert a TensorFlow model into `output_format` using TOCO.
 
diff --git a/tensorflow/lite/python/lite_constants.py b/tensorflow/lite/python/lite_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5d6d1037952bed73ffa5adff13b4bdbf264185c
--- /dev/null
+++ b/tensorflow/lite/python/lite_constants.py
@@ -0,0 +1,65 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Constants for TFLite."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export as _tf_export
+
+FLOAT = dtypes.float32
+INT32 = dtypes.int32
+INT64 = dtypes.int64
+STRING = dtypes.string
+QUANTIZED_UINT8 = dtypes.uint8
+COMPLEX64 = dtypes.complex64
+TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
+TFLITE = _toco_flags_pb2.TFLITE
+GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
+
+_tf_export(v1=["lite.constants.FLOAT"]).export_constant(__name__, "FLOAT")
+_tf_export(v1=["lite.constants.INT32"]).export_constant(__name__, "INT32")
+_tf_export(v1=["lite.constants.INT64"]).export_constant(__name__, "INT64")
+_tf_export(v1=["lite.constants.STRING"]).export_constant(__name__, "STRING")
+_tf_export(v1=["lite.constants.QUANTIZED_UINT8"]).export_constant(
+    __name__, "QUANTIZED_UINT8")
+_tf_export("lite.constants.TFLITE").export_constant(__name__, "TFLITE")
+_tf_export("lite.constants.GRAPHVIZ_DOT").export_constant(
+    __name__, "GRAPHVIZ_DOT")
+
+# Currently the default mode of operation is to shell to another python process
+# to protect against crashes. However, it breaks some dependent targets because
+# it forces us to depend on an external py_binary. The experimental API doesn't
+# have that drawback.
+EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
+
+
+_allowed_symbols = [
+    "FLOAT",
+    "INT32",
+    "INT64",
+    "STRING",
+    "QUANTIZED_UINT8",
+    "COMPLEX64",
+    "TENSORFLOW_GRAPHDEF",
+    "TFLITE",
+    "GRAPHVIZ_DOT",
+    "EXPERIMENTAL_USE_TOCO_API_DIRECTLY",
+]
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py
similarity index 93%
rename from tensorflow/contrib/lite/python/lite_test.py
rename to tensorflow/lite/python/lite_test.py
index d243a494f6e57d92864a988cd8ffa3cd87587db6..1f9c768b4441cc1385d93285d26eeee9b651ca83 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/lite/python/lite_test.py
@@ -22,9 +22,9 @@ import os
 import tempfile
 import numpy as np
 
-from tensorflow.contrib.lite.python import lite
-from tensorflow.contrib.lite.python import lite_constants
-from tensorflow.contrib.lite.python.interpreter import Interpreter
+from tensorflow.lite.python import lite
+from tensorflow.lite.python import lite_constants
+from tensorflow.lite.python.interpreter import Interpreter
 from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
@@ -80,6 +80,7 @@ class FromConstructor(test_util.TensorFlowTestCase):
     self.assertTrue(converter._has_valid_tensors())
 
 
+@test_util.run_v1_only('b/120545219')
 class FromSessionTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -177,12 +178,57 @@ class FromSessionTest(test_util.TensorFlowTestCase):
         'Quantization input stats are not available for input tensors '
         '\'inputB\'.', str(error.exception))
 
+  def testIntermediateInputArray(self):
+    """Convert a model from an intermediate input array."""
+    in_tensor_init = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    in_tensor_final = in_tensor_init + in_tensor_init
+    out_tensor = in_tensor_final + in_tensor_final
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor_final],
+                                                  [out_tensor])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('add', input_details[0]['name'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+    self.assertEqual((0., 0.), input_details[0]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add_1', output_details[0]['name'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+    self.assertEqual((0., 0.), output_details[0]['quantization'])
+
   def testSizeNoneInvalid(self):
     in_tensor = array_ops.placeholder(dtype=dtypes.float32)
     out_tensor = in_tensor + in_tensor
     sess = session.Session()
 
-    # Test invalid shape. None after 1st dimension.
+    # Test None as shape.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
+                                                  [out_tensor])
+    with self.assertRaises(ValueError) as error:
+      converter.convert()
+    self.assertEqual('Provide an input shape for input array \'Placeholder\'.',
+                     str(error.exception))
+
+  def testSizeEmptyInvalid(self):
+    in_tensor = array_ops.placeholder(dtype=dtypes.float32, shape=[])
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
+
+    # Test empty shape.
     converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
                                                   [out_tensor])
     with self.assertRaises(ValueError) as error:
@@ -190,7 +236,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual('Provide an input shape for input array \'Placeholder\'.',
                      str(error.exception))
 
-  def testBatchSizeInvalid(self):
+  def testSizeInvalid(self):
     in_tensor = array_ops.placeholder(
         shape=[1, None, 16, 3], dtype=dtypes.float32)
     out_tensor = in_tensor + in_tensor
@@ -421,7 +467,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     # Convert model and ensure model is not None.
     converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
                                                   [out_tensor])
-    converter.converter_mode = lite.ConverterMode.TOCO_FLEX_ALL
+    converter.target_ops = set([lite.OpsSet.SELECT_TF_OPS])
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
@@ -452,6 +498,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
 
+@test_util.run_v1_only('b/120545219')
 class FromFrozenGraphFile(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -594,8 +641,17 @@ class FromFrozenGraphFile(test_util.TensorFlowTestCase):
   # TODO(nupurgarg): Test model loading in open source.
   def _initObjectDetectionArgs(self):
     # Initializes the arguments required for the object detection model.
-    self._graph_def_file = resource_loader.get_path_to_datafile(
-        'testdata/tflite_graph.pb')
+    # Looks for the model file which is saved in a different location interally
+    # and externally.
+    filename = resource_loader.get_path_to_datafile('testdata/tflite_graph.pb')
+    if not os.path.exists(filename):
+      filename = os.path.join(
+          resource_loader.get_root_dir_with_all_resources(),
+          '../tflite_mobilenet_ssd_quant_protobuf/tflite_graph.pb')
+      if not os.path.exists(filename):
+        raise IOError("File '{0}' does not exist.".format(filename))
+
+    self._graph_def_file = filename
     self._input_arrays = ['normalized_input_image_tensor']
     self._output_arrays = [
         'TFLite_Detection_PostProcess', 'TFLite_Detection_PostProcess:1',
@@ -690,6 +746,7 @@ class FromFrozenGraphFile(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
 
+@test_util.run_v1_only('b/120545219')
 class FromSavedModelTest(test_util.TensorFlowTestCase):
 
   def _createSavedModel(self, shape):
@@ -834,6 +891,7 @@ class FromSavedModelTest(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
 
+@test_util.run_v1_only('b/120545219')
 class FromKerasFile(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -922,12 +980,13 @@ class FromKerasFile(test_util.TensorFlowTestCase):
     """Test a Sequential tf.keras model testing input shapes argument."""
     keras_file = self._getSequentialModel()
 
-    # Passing in shape of invalid input array has no impact as long as all input
-    # arrays have a shape.
-    converter = lite.TFLiteConverter.from_keras_model_file(
-        keras_file, input_shapes={'invalid-input': [2, 3]})
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
+    # Passing in shape of invalid input array raises error.
+    with self.assertRaises(ValueError) as error:
+      converter = lite.TFLiteConverter.from_keras_model_file(
+          keras_file, input_shapes={'invalid-input': [2, 3]})
+    self.assertEqual(
+        "Invalid tensor 'invalid-input' found in tensor shapes map.",
+        str(error.exception))
 
     # Passing in shape of valid input array.
     converter = lite.TFLiteConverter.from_keras_model_file(
diff --git a/tensorflow/contrib/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py
similarity index 98%
rename from tensorflow/contrib/lite/python/op_hint.py
rename to tensorflow/lite/python/op_hint.py
index 8c920132e5c2dd33b61904b83fda1368dc7bfa2e..8d7f9316bfe81255510fc5aca9ffdf9671cd64df 100644
--- a/tensorflow/contrib/lite/python/op_hint.py
+++ b/tensorflow/lite/python/op_hint.py
@@ -24,7 +24,7 @@ accelerated tflite op.
 Example:
   def tflite_cool_activation(input):
     # A cool activation function.
-    custom = tf.contrib.lite.OpHint("cool_activation")
+    custom = tf.lite.OpHint("cool_activation")
     input, = custom.add_inputs(input)
     output = tf.sigmoid(input) * input
     output, = custom.add_outputs(output)
@@ -35,8 +35,8 @@ Example:
 
   session = tf.Session()
 
-  graphdef_to_convert = tf.contrib.lite.convert_op_hints_to_stubs(session)
-  tflite_graph = tf.contrib.lite.toco_convert(graphdef_to_convert,
+  graphdef_to_convert = tf.lite.convert_op_hints_to_stubs(session)
+  tflite_graph = tf.lite.toco_convert(graphdef_to_convert,
                                               [image], [output])
                                               [image], [output])
   with open("/tmp/graph.fb", "wb") as fp:
@@ -86,8 +86,10 @@ from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
 from tensorflow.python.ops import array_ops as _array_ops
 from tensorflow.python.util import compat as _compat
 from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.util.tf_export import tf_export as _tf_export
 
 
+@_tf_export("lite.OpHint")
 class OpHint(object):
   """A class that helps build tflite function invocations.
 
@@ -102,9 +104,9 @@ class OpHint(object):
   that make up the pseudo op. A similar process is done to any output that
   is to be exported from the current op.
 
-  TODO(aselle): When TensorFlow functions functionality works for arbitrary
-  constructs, this mechanism can be retired and changed to use python defun's.
   """
+  # TODO(aselle): When TensorFlow functions functionality works for arbitrary
+  # constructs, this mechanism can be retired and changed to use python defun's.
 
   # Attr constants that are used for representation in the GraphDef. These
   # will be used on every Identity op that is involved in a total OpHint.
@@ -136,14 +138,14 @@ class OpHint(object):
   # Types of aggregations
   #  stack: stacks all ophints with matching tags. i.e. for a static rnn.
   #   specifically, this is good for an input or output to a static rnn cell.
-  AGGREGATE_STACK = _compat.as_bytes("stack")
+  AGGREGATE_STACK = "stack"
   # first: only takes the first output (one with lowest sort index)
   # of matching tags. This is good for the input state to an RNN.
-  AGGREGATE_FIRST = _compat.as_bytes("first")
+  AGGREGATE_FIRST = "first"
   # aggregation last takes only the last tag (one with highest sort index).
   # This is good for an output value on the last stack item of a
   # static rnn.
-  AGGREGATE_LAST = _compat.as_bytes("last")
+  AGGREGATE_LAST = "last"
 
   class OpHintArgumentTracker(object):
     """Conceptually tracks indices of arguments of "OpHint functions".
@@ -401,7 +403,7 @@ class _LiteOperand(object):
       out_graphdef: A graphdef that is ready to have this input added.
 
     Returns:
-      The the output that the stub should use as an input for this operand.
+      The output that the stub should use as an input for this operand.
 
     Raises:
       RuntimeError: if the method is not implemented.
@@ -656,7 +658,7 @@ def _find_all_hints_in_graph_def(graphdef):
     if sort == -1: sort = None
     aggregation = None
     if OpHint.FUNCTION_AGGREGATE_ATTR in attr:
-      aggregation = attr[OpHint.FUNCTION_AGGREGATE_ATTR].s
+      aggregation = _compat.as_text(attr[OpHint.FUNCTION_AGGREGATE_ATTR].s)
 
     # Add the input or output
     def put_operand(stuff, index, sort, operand, aggregation):
@@ -936,6 +938,7 @@ def _remove_redundant_stack_unstack(graph_def):
   return curr
 
 
+@_tf_export("lite.convert_op_hints_to_stubs")
 def _convert_op_hints_to_stubs_helper(
     graph_def, write_callback=lambda sess, graph_def: None):
   """Converts a graph_def to a new graph_def where all op hints are stubbed.
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py
similarity index 88%
rename from tensorflow/contrib/lite/python/tflite_convert.py
rename to tensorflow/lite/python/tflite_convert.py
index d6d9052a4ec7b33ec52751f1ab13991dd6caf159..341b539bead296ca28c1f5f8c17928e553ebabc4 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/lite/python/tflite_convert.py
@@ -22,21 +22,43 @@ import argparse
 import os
 import sys
 
-from tensorflow.contrib.lite.python import lite
-from tensorflow.contrib.lite.python import lite_constants
-from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
-from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
+from tensorflow.lite.python import lite
+from tensorflow.lite.python import lite_constants
+from tensorflow.lite.toco import toco_flags_pb2 as _toco_flags_pb2
 from tensorflow.python.platform import app
 
 
 def _parse_array(values, type_fn=str):
-  if values:
+  if values is not None:
     return [type_fn(val) for val in values.split(",") if val]
+  return None
 
 
 def _parse_set(values):
-  if values:
-    return set(values.split(","))
+  if values is not None:
+    return set([item for item in values.split(",") if item])
+  return None
+
+
+def _parse_inference_type(value, flag):
+  """Converts the inference type to the value of the constant.
+
+  Args:
+    value: str representing the inference type.
+    flag: str representing the flag name.
+
+  Returns:
+    tf.dtype.
+
+  Raises:
+    ValueError: Unsupported value.
+  """
+  if value == "FLOAT":
+    return lite_constants.FLOAT
+  if value == "QUANTIZED_UINT8":
+    return lite_constants.QUANTIZED_UINT8
+  raise ValueError("Unsupported value for --{0}. Only FLOAT and "
+                   "QUANTIZED_UINT8 are supported.".format(flag))
 
 
 def _get_toco_converter(flags):
@@ -99,10 +121,11 @@ def _convert_model(flags):
   # Create converter.
   converter = _get_toco_converter(flags)
   if flags.inference_type:
-    converter.inference_type = _types_pb2.IODataType.Value(flags.inference_type)
+    converter.inference_type = _parse_inference_type(flags.inference_type,
+                                                     "inference_type")
   if flags.inference_input_type:
-    converter.inference_input_type = _types_pb2.IODataType.Value(
-        flags.inference_input_type)
+    converter.inference_input_type = _parse_inference_type(
+        flags.inference_input_type, "inference_input_type")
   if flags.output_format:
     converter.output_format = _toco_flags_pb2.FileFormat.Value(
         flags.output_format)
@@ -113,7 +136,7 @@ def _convert_model(flags):
 
     # In quantized inference, mean_value has to be integer so that the real
     # value 0.0 is exactly representable.
-    if flags.inference_type == lite_constants.QUANTIZED_UINT8:
+    if converter.inference_type == lite_constants.QUANTIZED_UINT8:
       mean_values = _parse_array(flags.mean_values, type_fn=int)
     else:
       mean_values = _parse_array(flags.mean_values, type_fn=float)
@@ -143,12 +166,18 @@ def _convert_model(flags):
 
   if flags.allow_custom_ops:
     converter.allow_custom_ops = flags.allow_custom_ops
-  if flags.converter_mode:
-    converter.converter_mode = flags.converter_mode
+  if flags.target_ops:
+    ops_set_options = lite.OpsSet.get_options()
+    converter.target_ops = set()
+    for option in flags.target_ops.split(","):
+      if option not in ops_set_options:
+        raise ValueError("Invalid value for --target_ops. Options: "
+                         "{0}".format(",".join(ops_set_options)))
+      converter.target_ops.add(lite.OpsSet(option))
 
   if flags.post_training_quantize:
     converter.post_training_quantize = flags.post_training_quantize
-    if flags.inference_type == lite_constants.QUANTIZED_UINT8:
+    if converter.inference_type == lite_constants.QUANTIZED_UINT8:
       print("--post_training_quantize quantizes a graph of inference_type "
             "FLOAT. Overriding inference type QUANTIZED_UINT8 to FLOAT.")
       converter.inference_type = lite_constants.FLOAT
@@ -292,8 +321,8 @@ def run_main(_):
       "--saved_model_tag_set",
       type=str,
       help=("Comma-separated set of tags identifying the MetaGraphDef within "
-            "the SavedModel to analyze. All tags must be present. "
-            "(default \"serve\")"))
+            "the SavedModel to analyze. All tags must be present. In order to "
+            "pass in an empty tag set, pass in \"\". (default \"serve\")"))
   parser.add_argument(
       "--saved_model_signature_key",
       type=str,
@@ -377,11 +406,12 @@ def run_main(_):
             "provide these to the TensorFlow Lite runtime with a custom "
             "resolver. (default False)"))
   parser.add_argument(
-      "--converter_mode",
-      type=lite.ConverterMode,
-      choices=list(lite.ConverterMode),
-      help=("Experimental flag, subject to change. ConverterMode indicating "
-            "which converter to use. (default ConverterMode.DEFAULT)"))
+      "--target_ops",
+      type=str,
+      help=("Experimental flag, subject to change. Set of OpsSet options "
+            "indicating which converter to use. Options: {0}. One or more "
+            "option may be specified. (default set([OpsSet.TFLITE_BUILTINS]))"
+            "".format(",".join(lite.OpsSet.get_options()))))
 
   # Logging flags.
   parser.add_argument(
diff --git a/tensorflow/lite/schema/BUILD b/tensorflow/lite/schema/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..69d5458c6e432a2370a2ca4998a5d4664398c528
--- /dev/null
+++ b/tensorflow/lite/schema/BUILD
@@ -0,0 +1,99 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+
+py_binary(
+    name = "upgrade_schema",
+    srcs = [
+        "upgrade_schema.py",
+    ],
+    data = [
+        "schema_v0.fbs",
+        "schema_v1.fbs",
+        "schema_v2.fbs",
+        "schema_v3.fbs",
+        "@flatbuffers//:flatc",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
+# TODO(wvo): re-enable this test once latest FlatBuffers has landed.
+
+py_test(
+    name = "upgrade_schema_test",
+    size = "small",
+    srcs = ["upgrade_schema_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        "no_oss",
+        "no_pip",
+        "notap",
+    ],
+    deps = [
+        ":upgrade_schema",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+exports_files([
+    "schema_v0.fbs",
+    "schema_v1.fbs",
+    "schema_v2.fbs",
+    "schema_v3.fbs",
+])
+
+load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
+
+# Generic schema for inference on device.
+flatbuffer_cc_library(
+    name = "schema_fbs",
+    srcs = ["schema.fbs"],
+)
+
+# Generic schema for inference on device (but with reflections makes bigger).
+flatbuffer_cc_library(
+    name = "schema_fbs_with_reflection",
+    srcs = ["schema.fbs"],
+    flatc_args = [
+        "--reflect-types",
+        "--reflect-names",
+        "--no-union-value-namespacing",
+        "--gen-object-api",
+    ],
+    gen_reflections = True,
+    out_prefix = "reflection/",
+)
+
+# Schema test to make sure we don't introduce backward incompatible changes
+# to schemas.
+cc_test(
+    name = "flatbuffer_compatibility_test",
+    size = "small",
+    srcs = ["flatbuffer_compatibility_test.cc"],
+    data = [
+        "schema.fbs",
+        "schema_v3.fbs",
+    ],
+    tags = [
+        "no_oss",
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        "//tensorflow/core:lib_platform",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers//:flatc_library",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/schema/builtin_ops_header/BUILD b/tensorflow/lite/schema/builtin_ops_header/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..52cbd052d6aa8cafcf562eb483638915be297cf7
--- /dev/null
+++ b/tensorflow/lite/schema/builtin_ops_header/BUILD
@@ -0,0 +1,43 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "generator",
+    srcs = ["generator.cc"],
+    hdrs = ["generator.h"],
+    deps = [
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+cc_binary(
+    name = "generate",
+    srcs = ["generate.cc"],
+    deps = [
+        ":generator",
+    ],
+)
+
+cc_test(
+    name = "generator_test",
+    srcs = ["generator_test.cc"],
+    deps = [
+        ":generator",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "consistency_test",
+    srcs = ["consistency_test.cc"],
+    data = [
+        "//tensorflow/lite:builtin_ops.h",
+    ],
+    deps = [
+        ":generator",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/schema/builtin_ops_header/README.md b/tensorflow/lite/schema/builtin_ops_header/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e34a30b8182560d1c5d49b75010f2427a1ff0ad6
--- /dev/null
+++ b/tensorflow/lite/schema/builtin_ops_header/README.md
@@ -0,0 +1,12 @@
+# Builtin Ops Header Generator.
+
+This directory contains a code generator to generate a pure C header for
+builtin op definition.
+
+Whenever you add a new builtin op, please execute:
+
+```sh
+bazel run \
+  //tensorflow/lite/schema/builtin_ops_header:generate > \
+  tensorflow/lite/builtin_ops.h
+```
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/consistency_test.cc b/tensorflow/lite/schema/builtin_ops_header/consistency_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/schema/builtin_ops_header/consistency_test.cc
rename to tensorflow/lite/schema/builtin_ops_header/consistency_test.cc
index d55c125c117db3c1b8d67ab0b674abe2e7c39d94..f62dcda2e82851bcd57105ad6e2a65ccec1086b4 100644
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/consistency_test.cc
+++ b/tensorflow/lite/schema/builtin_ops_header/consistency_test.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 #include <fstream>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/schema/builtin_ops_header/generator.h"
+#include "tensorflow/lite/schema/builtin_ops_header/generator.h"
 
 namespace {
 
 const char* kHeaderFileName =
-    "tensorflow/contrib/lite/builtin_ops.h";
+    "tensorflow/lite/builtin_ops.h";
 
 // The test ensures that `builtin_ops.h` is consistent with the FlatBuffer
 // schema definition. When the schema is modified, it's required to run the
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/generate.cc b/tensorflow/lite/schema/builtin_ops_header/generate.cc
similarity index 92%
rename from tensorflow/contrib/lite/schema/builtin_ops_header/generate.cc
rename to tensorflow/lite/schema/builtin_ops_header/generate.cc
index 72a28987b8d4863b0f03f7861177940177edd884..125dcd485be0fb2eb889c621f7a0cb608dc28688 100644
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/generate.cc
+++ b/tensorflow/lite/schema/builtin_ops_header/generate.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <iostream>
-#include "tensorflow/contrib/lite/schema/builtin_ops_header/generator.h"
+#include "tensorflow/lite/schema/builtin_ops_header/generator.h"
 
 // This executable is used to generate builtin_ops.h in TensorFlow Lite.
 // Please see README.md for more details.
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc b/tensorflow/lite/schema/builtin_ops_header/generator.cc
similarity index 92%
rename from tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
rename to tensorflow/lite/schema/builtin_ops_header/generator.cc
index 9dc8daa227dd68ccde2efa4013ac4465a72e6bb0..e2967aee0ff4cba42888c89215dabbf2aaf4469a 100644
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
+++ b/tensorflow/lite/schema/builtin_ops_header/generator.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/schema/builtin_ops_header/generator.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/lite/schema/builtin_ops_header/generator.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace builtin_ops_header {
@@ -35,8 +35,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
-#define TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
+#ifndef TENSORFLOW_LITE_BUILTIN_OPS_H_
+#define TENSORFLOW_LITE_BUILTIN_OPS_H_
 
 // DO NOT EDIT MANUALLY: This file is automatically generated by
 // `schema/builtin_ops_header/generator.cc`.
@@ -56,7 +56,7 @@ const char* kFileFooter =
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
-#endif  // TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
+#endif  // TENSORFLOW_LITE_BUILTIN_OPS_H_
 )";
 }  // anonymous namespace
 
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.h b/tensorflow/lite/schema/builtin_ops_header/generator.h
similarity index 86%
rename from tensorflow/contrib/lite/schema/builtin_ops_header/generator.h
rename to tensorflow/lite/schema/builtin_ops_header/generator.h
index 3241ff83d599ed8a476fc1d5a88c26143ebfbaf2..8c9383a992daa542908cb9802a206f306fe3aba0 100644
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.h
+++ b/tensorflow/lite/schema/builtin_ops_header/generator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // An utility library to generate pure C header for builtin ops definition.
-#ifndef TENSORFLOW_CONTRIB_LITE_SCHEMA_BUILTIN_OPS_HEADER_GENERATOR_H_
-#define TENSORFLOW_CONTRIB_LITE_SCHEMA_BUILTIN_OPS_HEADER_GENERATOR_H_
+#ifndef TENSORFLOW_LITE_SCHEMA_BUILTIN_OPS_HEADER_GENERATOR_H_
+#define TENSORFLOW_LITE_SCHEMA_BUILTIN_OPS_HEADER_GENERATOR_H_
 
 #include <iostream>
 
@@ -35,4 +35,4 @@ bool GenerateHeader(std::ostream& os);
 }  // namespace builtin_ops_header
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_SCHEMA_BUILTIN_OPS_HEADER_GENERATOR_H_
+#endif  // TENSORFLOW_LITE_SCHEMA_BUILTIN_OPS_HEADER_GENERATOR_H_
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/generator_test.cc b/tensorflow/lite/schema/builtin_ops_header/generator_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/schema/builtin_ops_header/generator_test.cc
rename to tensorflow/lite/schema/builtin_ops_header/generator_test.cc
index a7dc8e1b0486eda6e09f38a209dca95c0317a1fb..c508c981bb3a75760059131a983881aa48c2fbc2 100644
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/generator_test.cc
+++ b/tensorflow/lite/schema/builtin_ops_header/generator_test.cc
@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/schema/builtin_ops_header/generator.h"
+#include "tensorflow/lite/schema/builtin_ops_header/generator.h"
 #include <fstream>
 #include <gtest/gtest.h>
 
diff --git a/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc b/tensorflow/lite/schema/flatbuffer_compatibility_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
rename to tensorflow/lite/schema/flatbuffer_compatibility_test.cc
index 22b4616ccbb75658ae781ec73b87cd4db771e5bf..86177aeb1272469865918bfb2f3fc627bdcdd1cb 100644
--- a/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
+++ b/tensorflow/lite/schema/flatbuffer_compatibility_test.cc
@@ -62,9 +62,9 @@ TEST(SchemaTest, TestCompatibility) {
   // TODO(aselle): Need a reliable way to load files.
   std::string base_contents, current_contents;
   const char *base_filename =
-      TFLITE_TF_PREFIX "contrib/lite/schema/schema_v3.fbs";
+      TFLITE_TF_PREFIX "lite/schema/schema_v3.fbs";
   const char *current_filename =
-      TFLITE_TF_PREFIX "contrib/lite/schema/schema.fbs";
+      TFLITE_TF_PREFIX "lite/schema/schema.fbs";
 
   ASSERT_TRUE(LoadFileRaw(base_filename, &base_contents));
   ASSERT_TRUE(LoadFileRaw(current_filename, &current_contents));
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
similarity index 88%
rename from tensorflow/contrib/lite/schema/schema.fbs
rename to tensorflow/lite/schema/schema.fbs
index ff8430827c7849408d17d482f6e065d832d8522a..980f13b19b4f6a32fe8b693c560be2b4f4f95fd9 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -25,6 +25,9 @@ file_identifier "TFL3";
 // File extension of any written files.
 file_extension "tflite";
 
+// IMPORTANT: All new members of tables, enums and unions must be added at the
+// end to ensure backwards compatibility.
+
 // The type of data stored in a tensor.
 enum TensorType : byte {
   FLOAT32 = 0,
@@ -36,16 +39,34 @@ enum TensorType : byte {
   BOOL = 6,
   INT16 = 7,
   COMPLEX64 = 8,
+  INT8 = 9,
+}
+
+// Custom quantization parameters for experimenting with new quantization
+// techniques.
+table CustomQuantization {
+  custom:[ubyte] (force_align: 16);
+}
+
+// Represents a specific quantization technique's parameters.
+union QuantizationDetails {
+  CustomQuantization,
 }
 
-// Parameters for converting a quantized tensor back to float. Given a
-// quantized value q, the corresponding float value f should be:
-//   f = scale * (q - zero_point)
+// Parameters for converting a quantized tensor back to float.
 table QuantizationParameters {
+  // These four parameters are the asymmetric linear quantization parameters.
+  // Given a quantized value q, the corresponding float value f should be:
+  //   f = scale * (q - zero_point)
+  // For other quantization types, the QuantizationDetails below is used.
   min:[float];  // For importing back into tensorflow.
   max:[float];  // For importing back into tensorflow.
   scale:[float];  // For dequantizing the tensor's values.
   zero_point:[long];
+
+  // If this is not none, the quantization parameters above are ignored and the
+  // value of the QuantizationDetails union below should be used.
+  details:QuantizationDetails;
 }
 
 table Tensor {
@@ -176,6 +197,14 @@ enum BuiltinOperator : byte {
   SQUARE = 92,
   ZEROS_LIKE = 93,
   FILL = 94,
+  FLOOR_MOD = 95,
+  RANGE = 96,
+  RESIZE_NEAREST_NEIGHBOR = 97,
+  LEAKY_RELU = 98,
+  SQUARED_DIFFERENCE = 99,
+  MIRROR_PAD = 100,
+  ABS = 101,
+  SPLIT_V = 102,
 }
 
 // Options for the builtin operators.
@@ -250,6 +279,15 @@ union BuiltinOptions {
   FillOptions,
   BidirectionalSequenceLSTMOptions,
   BidirectionalSequenceRNNOptions,
+  UnidirectionalSequenceLSTMOptions,
+  FloorModOptions,
+  RangeOptions,
+  ResizeNearestNeighborOptions,
+  LeakyReluOptions,
+  SquaredDifferenceOptions,
+  MirrorPadOptions,
+  AbsOptions,
+  SplitVOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -394,6 +432,16 @@ table LSTMOptions {
   kernel_type: LSTMKernelType = FULL;
 }
 
+// An implementation of TensorFlow dynamic_rnn with LSTMCell.
+table UnidirectionalSequenceLSTMOptions {
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+
+  // If true then first dimension is sequence, otherwise batch.
+  time_major:bool;
+}
+
 table BidirectionalSequenceLSTMOptions {
   fused_activation_function:ActivationFunctionType;
   cell_clip: float; // Optional, 0.0 means no clipping
@@ -409,6 +457,10 @@ table ResizeBilinearOptions {
   align_corners: bool;
 }
 
+table ResizeNearestNeighborOptions {
+  align_corners: bool;
+}
+
 // A call operation options
 table CallOptions {
   // The subgraph index that needs to be called.
@@ -484,6 +536,10 @@ table SplitOptions {
   num_splits: int;
 }
 
+table SplitVOptions {
+  num_splits: int;
+}
+
 table StridedSliceOptions {
   begin_mask: int;
   end_mask: int;
@@ -587,6 +643,10 @@ table OneHotOptions {
   axis:int;
 }
 
+table AbsOptions {
+}
+
+
 table LogicalAndOptions {
 }
 
@@ -610,6 +670,30 @@ table ZerosLikeOptions {
 table FillOptions {
 }
 
+table FloorModOptions {
+}
+
+table RangeOptions {
+}
+
+table LeakyReluOptions {
+  alpha:float;
+}
+
+table SquaredDifferenceOptions {
+}
+
+enum MirrorPadMode : byte {
+  // Doesn't include borders.
+  REFLECT = 0,
+  // Includes borders.
+  SYMMETRIC = 1,
+}
+
+table MirrorPadOptions {
+  mode:MirrorPadMode;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
similarity index 87%
rename from tensorflow/contrib/lite/schema/schema_generated.h
rename to tensorflow/lite/schema/schema_generated.h
index f3cb113c9c58f8189fda9fb4bdcdab05887362c6..637cbafabdad47892b1e3f4a93837b44d50a5b46 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -22,6 +22,9 @@ limitations under the License.
 
 namespace tflite {
 
+struct CustomQuantization;
+struct CustomQuantizationT;
+
 struct QuantizationParameters;
 struct QuantizationParametersT;
 
@@ -79,12 +82,18 @@ struct LocalResponseNormalizationOptionsT;
 struct LSTMOptions;
 struct LSTMOptionsT;
 
+struct UnidirectionalSequenceLSTMOptions;
+struct UnidirectionalSequenceLSTMOptionsT;
+
 struct BidirectionalSequenceLSTMOptions;
 struct BidirectionalSequenceLSTMOptionsT;
 
 struct ResizeBilinearOptions;
 struct ResizeBilinearOptionsT;
 
+struct ResizeNearestNeighborOptions;
+struct ResizeNearestNeighborOptionsT;
+
 struct CallOptions;
 struct CallOptionsT;
 
@@ -139,6 +148,9 @@ struct SqueezeOptionsT;
 struct SplitOptions;
 struct SplitOptionsT;
 
+struct SplitVOptions;
+struct SplitVOptionsT;
+
 struct StridedSliceOptions;
 struct StridedSliceOptionsT;
 
@@ -217,6 +229,9 @@ struct LogicalOrOptionsT;
 struct OneHotOptions;
 struct OneHotOptionsT;
 
+struct AbsOptions;
+struct AbsOptionsT;
+
 struct LogicalAndOptions;
 struct LogicalAndOptionsT;
 
@@ -238,6 +253,21 @@ struct ZerosLikeOptionsT;
 struct FillOptions;
 struct FillOptionsT;
 
+struct FloorModOptions;
+struct FloorModOptionsT;
+
+struct RangeOptions;
+struct RangeOptionsT;
+
+struct LeakyReluOptions;
+struct LeakyReluOptionsT;
+
+struct SquaredDifferenceOptions;
+struct SquaredDifferenceOptionsT;
+
+struct MirrorPadOptions;
+struct MirrorPadOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -263,11 +293,12 @@ enum TensorType {
   TensorType_BOOL = 6,
   TensorType_INT16 = 7,
   TensorType_COMPLEX64 = 8,
+  TensorType_INT8 = 9,
   TensorType_MIN = TensorType_FLOAT32,
-  TensorType_MAX = TensorType_COMPLEX64
+  TensorType_MAX = TensorType_INT8
 };
 
-inline const TensorType (&EnumValuesTensorType())[9] {
+inline const TensorType (&EnumValuesTensorType())[10] {
   static const TensorType values[] = {
     TensorType_FLOAT32,
     TensorType_FLOAT16,
@@ -277,7 +308,8 @@ inline const TensorType (&EnumValuesTensorType())[9] {
     TensorType_STRING,
     TensorType_BOOL,
     TensorType_INT16,
-    TensorType_COMPLEX64
+    TensorType_COMPLEX64,
+    TensorType_INT8
   };
   return values;
 }
@@ -293,6 +325,7 @@ inline const char * const *EnumNamesTensorType() {
     "BOOL",
     "INT16",
     "COMPLEX64",
+    "INT8",
     nullptr
   };
   return names;
@@ -303,6 +336,87 @@ inline const char *EnumNameTensorType(TensorType e) {
   return EnumNamesTensorType()[index];
 }
 
+enum QuantizationDetails {
+  QuantizationDetails_NONE = 0,
+  QuantizationDetails_CustomQuantization = 1,
+  QuantizationDetails_MIN = QuantizationDetails_NONE,
+  QuantizationDetails_MAX = QuantizationDetails_CustomQuantization
+};
+
+inline const QuantizationDetails (&EnumValuesQuantizationDetails())[2] {
+  static const QuantizationDetails values[] = {
+    QuantizationDetails_NONE,
+    QuantizationDetails_CustomQuantization
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesQuantizationDetails() {
+  static const char * const names[] = {
+    "NONE",
+    "CustomQuantization",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameQuantizationDetails(QuantizationDetails e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesQuantizationDetails()[index];
+}
+
+template<typename T> struct QuantizationDetailsTraits {
+  static const QuantizationDetails enum_value = QuantizationDetails_NONE;
+};
+
+template<> struct QuantizationDetailsTraits<CustomQuantization> {
+  static const QuantizationDetails enum_value = QuantizationDetails_CustomQuantization;
+};
+
+struct QuantizationDetailsUnion {
+  QuantizationDetails type;
+  void *value;
+
+  QuantizationDetailsUnion() : type(QuantizationDetails_NONE), value(nullptr) {}
+  QuantizationDetailsUnion(QuantizationDetailsUnion&& u) FLATBUFFERS_NOEXCEPT :
+    type(QuantizationDetails_NONE), value(nullptr)
+    { std::swap(type, u.type); std::swap(value, u.value); }
+  QuantizationDetailsUnion(const QuantizationDetailsUnion &) FLATBUFFERS_NOEXCEPT;
+  QuantizationDetailsUnion &operator=(const QuantizationDetailsUnion &u) FLATBUFFERS_NOEXCEPT
+    { QuantizationDetailsUnion t(u); std::swap(type, t.type); std::swap(value, t.value); return *this; }
+  QuantizationDetailsUnion &operator=(QuantizationDetailsUnion &&u) FLATBUFFERS_NOEXCEPT
+    { std::swap(type, u.type); std::swap(value, u.value); return *this; }
+  ~QuantizationDetailsUnion() { Reset(); }
+
+  void Reset();
+
+#ifndef FLATBUFFERS_CPP98_STL
+  template <typename T>
+  void Set(T&& val) {
+    Reset();
+    type = QuantizationDetailsTraits<typename T::TableType>::enum_value;
+    if (type != QuantizationDetails_NONE) {
+      value = new T(std::forward<T>(val));
+    }
+  }
+#endif  // FLATBUFFERS_CPP98_STL
+
+  static void *UnPack(const void *obj, QuantizationDetails type, const flatbuffers::resolver_function_t *resolver);
+  flatbuffers::Offset<void> Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
+
+  CustomQuantizationT *AsCustomQuantization() {
+    return type == QuantizationDetails_CustomQuantization ?
+      reinterpret_cast<CustomQuantizationT *>(value) : nullptr;
+  }
+  const CustomQuantizationT *AsCustomQuantization() const {
+    return type == QuantizationDetails_CustomQuantization ?
+      reinterpret_cast<const CustomQuantizationT *>(value) : nullptr;
+  }
+};
+
+bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type);
+bool VerifyQuantizationDetailsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types);
+
 enum BuiltinOperator {
   BuiltinOperator_ADD = 0,
   BuiltinOperator_AVERAGE_POOL_2D = 1,
@@ -398,11 +512,19 @@ enum BuiltinOperator {
   BuiltinOperator_SQUARE = 92,
   BuiltinOperator_ZEROS_LIKE = 93,
   BuiltinOperator_FILL = 94,
+  BuiltinOperator_FLOOR_MOD = 95,
+  BuiltinOperator_RANGE = 96,
+  BuiltinOperator_RESIZE_NEAREST_NEIGHBOR = 97,
+  BuiltinOperator_LEAKY_RELU = 98,
+  BuiltinOperator_SQUARED_DIFFERENCE = 99,
+  BuiltinOperator_MIRROR_PAD = 100,
+  BuiltinOperator_ABS = 101,
+  BuiltinOperator_SPLIT_V = 102,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_FILL
+  BuiltinOperator_MAX = BuiltinOperator_SPLIT_V
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[94] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[102] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -497,7 +619,15 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[94] {
     BuiltinOperator_REDUCE_ANY,
     BuiltinOperator_SQUARE,
     BuiltinOperator_ZEROS_LIKE,
-    BuiltinOperator_FILL
+    BuiltinOperator_FILL,
+    BuiltinOperator_FLOOR_MOD,
+    BuiltinOperator_RANGE,
+    BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
+    BuiltinOperator_LEAKY_RELU,
+    BuiltinOperator_SQUARED_DIFFERENCE,
+    BuiltinOperator_MIRROR_PAD,
+    BuiltinOperator_ABS,
+    BuiltinOperator_SPLIT_V
   };
   return values;
 }
@@ -599,6 +729,14 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "SQUARE",
     "ZEROS_LIKE",
     "FILL",
+    "FLOOR_MOD",
+    "RANGE",
+    "RESIZE_NEAREST_NEIGHBOR",
+    "LEAKY_RELU",
+    "SQUARED_DIFFERENCE",
+    "MIRROR_PAD",
+    "ABS",
+    "SPLIT_V",
     nullptr
   };
   return names;
@@ -681,11 +819,20 @@ enum BuiltinOptions {
   BuiltinOptions_FillOptions = 68,
   BuiltinOptions_BidirectionalSequenceLSTMOptions = 69,
   BuiltinOptions_BidirectionalSequenceRNNOptions = 70,
+  BuiltinOptions_UnidirectionalSequenceLSTMOptions = 71,
+  BuiltinOptions_FloorModOptions = 72,
+  BuiltinOptions_RangeOptions = 73,
+  BuiltinOptions_ResizeNearestNeighborOptions = 74,
+  BuiltinOptions_LeakyReluOptions = 75,
+  BuiltinOptions_SquaredDifferenceOptions = 76,
+  BuiltinOptions_MirrorPadOptions = 77,
+  BuiltinOptions_AbsOptions = 78,
+  BuiltinOptions_SplitVOptions = 79,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_BidirectionalSequenceRNNOptions
+  BuiltinOptions_MAX = BuiltinOptions_SplitVOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[71] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[80] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -757,7 +904,16 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[71] {
     BuiltinOptions_ZerosLikeOptions,
     BuiltinOptions_FillOptions,
     BuiltinOptions_BidirectionalSequenceLSTMOptions,
-    BuiltinOptions_BidirectionalSequenceRNNOptions
+    BuiltinOptions_BidirectionalSequenceRNNOptions,
+    BuiltinOptions_UnidirectionalSequenceLSTMOptions,
+    BuiltinOptions_FloorModOptions,
+    BuiltinOptions_RangeOptions,
+    BuiltinOptions_ResizeNearestNeighborOptions,
+    BuiltinOptions_LeakyReluOptions,
+    BuiltinOptions_SquaredDifferenceOptions,
+    BuiltinOptions_MirrorPadOptions,
+    BuiltinOptions_AbsOptions,
+    BuiltinOptions_SplitVOptions
   };
   return values;
 }
@@ -835,6 +991,15 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "FillOptions",
     "BidirectionalSequenceLSTMOptions",
     "BidirectionalSequenceRNNOptions",
+    "UnidirectionalSequenceLSTMOptions",
+    "FloorModOptions",
+    "RangeOptions",
+    "ResizeNearestNeighborOptions",
+    "LeakyReluOptions",
+    "SquaredDifferenceOptions",
+    "MirrorPadOptions",
+    "AbsOptions",
+    "SplitVOptions",
     nullptr
   };
   return names;
@@ -1129,6 +1294,42 @@ template<> struct BuiltinOptionsTraits<BidirectionalSequenceRNNOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_BidirectionalSequenceRNNOptions;
 };
 
+template<> struct BuiltinOptionsTraits<UnidirectionalSequenceLSTMOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UnidirectionalSequenceLSTMOptions;
+};
+
+template<> struct BuiltinOptionsTraits<FloorModOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_FloorModOptions;
+};
+
+template<> struct BuiltinOptionsTraits<RangeOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_RangeOptions;
+};
+
+template<> struct BuiltinOptionsTraits<ResizeNearestNeighborOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ResizeNearestNeighborOptions;
+};
+
+template<> struct BuiltinOptionsTraits<LeakyReluOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LeakyReluOptions;
+};
+
+template<> struct BuiltinOptionsTraits<SquaredDifferenceOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SquaredDifferenceOptions;
+};
+
+template<> struct BuiltinOptionsTraits<MirrorPadOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MirrorPadOptions;
+};
+
+template<> struct BuiltinOptionsTraits<AbsOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_AbsOptions;
+};
+
+template<> struct BuiltinOptionsTraits<SplitVOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SplitVOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1720,6 +1921,78 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_BidirectionalSequenceRNNOptions ?
       reinterpret_cast<const BidirectionalSequenceRNNOptionsT *>(value) : nullptr;
   }
+  UnidirectionalSequenceLSTMOptionsT *AsUnidirectionalSequenceLSTMOptions() {
+    return type == BuiltinOptions_UnidirectionalSequenceLSTMOptions ?
+      reinterpret_cast<UnidirectionalSequenceLSTMOptionsT *>(value) : nullptr;
+  }
+  const UnidirectionalSequenceLSTMOptionsT *AsUnidirectionalSequenceLSTMOptions() const {
+    return type == BuiltinOptions_UnidirectionalSequenceLSTMOptions ?
+      reinterpret_cast<const UnidirectionalSequenceLSTMOptionsT *>(value) : nullptr;
+  }
+  FloorModOptionsT *AsFloorModOptions() {
+    return type == BuiltinOptions_FloorModOptions ?
+      reinterpret_cast<FloorModOptionsT *>(value) : nullptr;
+  }
+  const FloorModOptionsT *AsFloorModOptions() const {
+    return type == BuiltinOptions_FloorModOptions ?
+      reinterpret_cast<const FloorModOptionsT *>(value) : nullptr;
+  }
+  RangeOptionsT *AsRangeOptions() {
+    return type == BuiltinOptions_RangeOptions ?
+      reinterpret_cast<RangeOptionsT *>(value) : nullptr;
+  }
+  const RangeOptionsT *AsRangeOptions() const {
+    return type == BuiltinOptions_RangeOptions ?
+      reinterpret_cast<const RangeOptionsT *>(value) : nullptr;
+  }
+  ResizeNearestNeighborOptionsT *AsResizeNearestNeighborOptions() {
+    return type == BuiltinOptions_ResizeNearestNeighborOptions ?
+      reinterpret_cast<ResizeNearestNeighborOptionsT *>(value) : nullptr;
+  }
+  const ResizeNearestNeighborOptionsT *AsResizeNearestNeighborOptions() const {
+    return type == BuiltinOptions_ResizeNearestNeighborOptions ?
+      reinterpret_cast<const ResizeNearestNeighborOptionsT *>(value) : nullptr;
+  }
+  LeakyReluOptionsT *AsLeakyReluOptions() {
+    return type == BuiltinOptions_LeakyReluOptions ?
+      reinterpret_cast<LeakyReluOptionsT *>(value) : nullptr;
+  }
+  const LeakyReluOptionsT *AsLeakyReluOptions() const {
+    return type == BuiltinOptions_LeakyReluOptions ?
+      reinterpret_cast<const LeakyReluOptionsT *>(value) : nullptr;
+  }
+  SquaredDifferenceOptionsT *AsSquaredDifferenceOptions() {
+    return type == BuiltinOptions_SquaredDifferenceOptions ?
+      reinterpret_cast<SquaredDifferenceOptionsT *>(value) : nullptr;
+  }
+  const SquaredDifferenceOptionsT *AsSquaredDifferenceOptions() const {
+    return type == BuiltinOptions_SquaredDifferenceOptions ?
+      reinterpret_cast<const SquaredDifferenceOptionsT *>(value) : nullptr;
+  }
+  MirrorPadOptionsT *AsMirrorPadOptions() {
+    return type == BuiltinOptions_MirrorPadOptions ?
+      reinterpret_cast<MirrorPadOptionsT *>(value) : nullptr;
+  }
+  const MirrorPadOptionsT *AsMirrorPadOptions() const {
+    return type == BuiltinOptions_MirrorPadOptions ?
+      reinterpret_cast<const MirrorPadOptionsT *>(value) : nullptr;
+  }
+  AbsOptionsT *AsAbsOptions() {
+    return type == BuiltinOptions_AbsOptions ?
+      reinterpret_cast<AbsOptionsT *>(value) : nullptr;
+  }
+  const AbsOptionsT *AsAbsOptions() const {
+    return type == BuiltinOptions_AbsOptions ?
+      reinterpret_cast<const AbsOptionsT *>(value) : nullptr;
+  }
+  SplitVOptionsT *AsSplitVOptions() {
+    return type == BuiltinOptions_SplitVOptions ?
+      reinterpret_cast<SplitVOptionsT *>(value) : nullptr;
+  }
+  const SplitVOptionsT *AsSplitVOptions() const {
+    return type == BuiltinOptions_SplitVOptions ?
+      reinterpret_cast<const SplitVOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -1917,6 +2190,35 @@ inline const char *EnumNameCombinerType(CombinerType e) {
   return EnumNamesCombinerType()[index];
 }
 
+enum MirrorPadMode {
+  MirrorPadMode_REFLECT = 0,
+  MirrorPadMode_SYMMETRIC = 1,
+  MirrorPadMode_MIN = MirrorPadMode_REFLECT,
+  MirrorPadMode_MAX = MirrorPadMode_SYMMETRIC
+};
+
+inline const MirrorPadMode (&EnumValuesMirrorPadMode())[2] {
+  static const MirrorPadMode values[] = {
+    MirrorPadMode_REFLECT,
+    MirrorPadMode_SYMMETRIC
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesMirrorPadMode() {
+  static const char * const names[] = {
+    "REFLECT",
+    "SYMMETRIC",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameMirrorPadMode(MirrorPadMode e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesMirrorPadMode()[index];
+}
+
 enum CustomOptionsFormat {
   CustomOptionsFormat_FLEXBUFFERS = 0,
   CustomOptionsFormat_MIN = CustomOptionsFormat_FLEXBUFFERS,
@@ -1943,12 +2245,75 @@ inline const char *EnumNameCustomOptionsFormat(CustomOptionsFormat e) {
   return EnumNamesCustomOptionsFormat()[index];
 }
 
+struct CustomQuantizationT : public flatbuffers::NativeTable {
+  typedef CustomQuantization TableType;
+  std::vector<uint8_t> custom;
+  CustomQuantizationT() {
+  }
+};
+
+struct CustomQuantization FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef CustomQuantizationT NativeTableType;
+  enum {
+    VT_CUSTOM = 4
+  };
+  const flatbuffers::Vector<uint8_t> *custom() const {
+    return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyOffset(verifier, VT_CUSTOM) &&
+           verifier.VerifyVector(custom()) &&
+           verifier.EndTable();
+  }
+  CustomQuantizationT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(CustomQuantizationT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<CustomQuantization> Pack(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct CustomQuantizationBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_custom(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom) {
+    fbb_.AddOffset(CustomQuantization::VT_CUSTOM, custom);
+  }
+  explicit CustomQuantizationBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  CustomQuantizationBuilder &operator=(const CustomQuantizationBuilder &);
+  flatbuffers::Offset<CustomQuantization> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<CustomQuantization>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom = 0) {
+  CustomQuantizationBuilder builder_(_fbb);
+  builder_.add_custom(custom);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantizationDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<uint8_t> *custom = nullptr) {
+  return tflite::CreateCustomQuantization(
+      _fbb,
+      custom ? _fbb.CreateVector<uint8_t>(*custom) : 0);
+}
+
+flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct QuantizationParametersT : public flatbuffers::NativeTable {
   typedef QuantizationParameters TableType;
   std::vector<float> min;
   std::vector<float> max;
   std::vector<float> scale;
   std::vector<int64_t> zero_point;
+  QuantizationDetailsUnion details;
   QuantizationParametersT() {
   }
 };
@@ -1959,7 +2324,9 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
     VT_MIN = 4,
     VT_MAX = 6,
     VT_SCALE = 8,
-    VT_ZERO_POINT = 10
+    VT_ZERO_POINT = 10,
+    VT_DETAILS_TYPE = 12,
+    VT_DETAILS = 14
   };
   const flatbuffers::Vector<float> *min() const {
     return GetPointer<const flatbuffers::Vector<float> *>(VT_MIN);
@@ -1973,6 +2340,16 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
   const flatbuffers::Vector<int64_t> *zero_point() const {
     return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_ZERO_POINT);
   }
+  QuantizationDetails details_type() const {
+    return static_cast<QuantizationDetails>(GetField<uint8_t>(VT_DETAILS_TYPE, 0));
+  }
+  const void *details() const {
+    return GetPointer<const void *>(VT_DETAILS);
+  }
+  template<typename T> const T *details_as() const;
+  const CustomQuantization *details_as_CustomQuantization() const {
+    return details_type() == QuantizationDetails_CustomQuantization ? static_cast<const CustomQuantization *>(details()) : nullptr;
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyOffset(verifier, VT_MIN) &&
@@ -1983,6 +2360,9 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
            verifier.VerifyVector(scale()) &&
            VerifyOffset(verifier, VT_ZERO_POINT) &&
            verifier.VerifyVector(zero_point()) &&
+           VerifyField<uint8_t>(verifier, VT_DETAILS_TYPE) &&
+           VerifyOffset(verifier, VT_DETAILS) &&
+           VerifyQuantizationDetails(verifier, details(), details_type()) &&
            verifier.EndTable();
   }
   QuantizationParametersT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -1990,6 +2370,10 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
   static flatbuffers::Offset<QuantizationParameters> Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
+template<> inline const CustomQuantization *QuantizationParameters::details_as<CustomQuantization>() const {
+  return details_as_CustomQuantization();
+}
+
 struct QuantizationParametersBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -2005,6 +2389,12 @@ struct QuantizationParametersBuilder {
   void add_zero_point(flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point) {
     fbb_.AddOffset(QuantizationParameters::VT_ZERO_POINT, zero_point);
   }
+  void add_details_type(QuantizationDetails details_type) {
+    fbb_.AddElement<uint8_t>(QuantizationParameters::VT_DETAILS_TYPE, static_cast<uint8_t>(details_type), 0);
+  }
+  void add_details(flatbuffers::Offset<void> details) {
+    fbb_.AddOffset(QuantizationParameters::VT_DETAILS, details);
+  }
   explicit QuantizationParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2022,12 +2412,16 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
     flatbuffers::Offset<flatbuffers::Vector<float>> min = 0,
     flatbuffers::Offset<flatbuffers::Vector<float>> max = 0,
     flatbuffers::Offset<flatbuffers::Vector<float>> scale = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point = 0) {
+    flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point = 0,
+    QuantizationDetails details_type = QuantizationDetails_NONE,
+    flatbuffers::Offset<void> details = 0) {
   QuantizationParametersBuilder builder_(_fbb);
+  builder_.add_details(details);
   builder_.add_zero_point(zero_point);
   builder_.add_scale(scale);
   builder_.add_max(max);
   builder_.add_min(min);
+  builder_.add_details_type(details_type);
   return builder_.Finish();
 }
 
@@ -2036,13 +2430,17 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParametersD
     const std::vector<float> *min = nullptr,
     const std::vector<float> *max = nullptr,
     const std::vector<float> *scale = nullptr,
-    const std::vector<int64_t> *zero_point = nullptr) {
+    const std::vector<int64_t> *zero_point = nullptr,
+    QuantizationDetails details_type = QuantizationDetails_NONE,
+    flatbuffers::Offset<void> details = 0) {
   return tflite::CreateQuantizationParameters(
       _fbb,
       min ? _fbb.CreateVector<float>(*min) : 0,
       max ? _fbb.CreateVector<float>(*max) : 0,
       scale ? _fbb.CreateVector<float>(*scale) : 0,
-      zero_point ? _fbb.CreateVector<int64_t>(*zero_point) : 0);
+      zero_point ? _fbb.CreateVector<int64_t>(*zero_point) : 0,
+      details_type,
+      details);
 }
 
 flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -3469,6 +3867,96 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
 
 flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct UnidirectionalSequenceLSTMOptionsT : public flatbuffers::NativeTable {
+  typedef UnidirectionalSequenceLSTMOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  float cell_clip;
+  float proj_clip;
+  bool time_major;
+  UnidirectionalSequenceLSTMOptionsT()
+      : fused_activation_function(ActivationFunctionType_NONE),
+        cell_clip(0.0f),
+        proj_clip(0.0f),
+        time_major(false) {
+  }
+};
+
+struct UnidirectionalSequenceLSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef UnidirectionalSequenceLSTMOptionsT NativeTableType;
+  enum {
+    VT_FUSED_ACTIVATION_FUNCTION = 4,
+    VT_CELL_CLIP = 6,
+    VT_PROJ_CLIP = 8,
+    VT_TIME_MAJOR = 10
+  };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  float cell_clip() const {
+    return GetField<float>(VT_CELL_CLIP, 0.0f);
+  }
+  float proj_clip() const {
+    return GetField<float>(VT_PROJ_CLIP, 0.0f);
+  }
+  bool time_major() const {
+    return GetField<uint8_t>(VT_TIME_MAJOR, 0) != 0;
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           VerifyField<float>(verifier, VT_CELL_CLIP) &&
+           VerifyField<float>(verifier, VT_PROJ_CLIP) &&
+           VerifyField<uint8_t>(verifier, VT_TIME_MAJOR) &&
+           verifier.EndTable();
+  }
+  UnidirectionalSequenceLSTMOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(UnidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct UnidirectionalSequenceLSTMOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(UnidirectionalSequenceLSTMOptions::VT_FUSED_ACTIVATION_FUNCTION, static_cast<int8_t>(fused_activation_function), 0);
+  }
+  void add_cell_clip(float cell_clip) {
+    fbb_.AddElement<float>(UnidirectionalSequenceLSTMOptions::VT_CELL_CLIP, cell_clip, 0.0f);
+  }
+  void add_proj_clip(float proj_clip) {
+    fbb_.AddElement<float>(UnidirectionalSequenceLSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f);
+  }
+  void add_time_major(bool time_major) {
+    fbb_.AddElement<uint8_t>(UnidirectionalSequenceLSTMOptions::VT_TIME_MAJOR, static_cast<uint8_t>(time_major), 0);
+  }
+  explicit UnidirectionalSequenceLSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  UnidirectionalSequenceLSTMOptionsBuilder &operator=(const UnidirectionalSequenceLSTMOptionsBuilder &);
+  flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<UnidirectionalSequenceLSTMOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> CreateUnidirectionalSequenceLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
+    float cell_clip = 0.0f,
+    float proj_clip = 0.0f,
+    bool time_major = false) {
+  UnidirectionalSequenceLSTMOptionsBuilder builder_(_fbb);
+  builder_.add_proj_clip(proj_clip);
+  builder_.add_cell_clip(cell_clip);
+  builder_.add_time_major(time_major);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> CreateUnidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct BidirectionalSequenceLSTMOptionsT : public flatbuffers::NativeTable {
   typedef BidirectionalSequenceLSTMOptions TableType;
   ActivationFunctionType fused_activation_function;
@@ -3613,6 +4101,60 @@ inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
 
 flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct ResizeNearestNeighborOptionsT : public flatbuffers::NativeTable {
+  typedef ResizeNearestNeighborOptions TableType;
+  bool align_corners;
+  ResizeNearestNeighborOptionsT()
+      : align_corners(false) {
+  }
+};
+
+struct ResizeNearestNeighborOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ResizeNearestNeighborOptionsT NativeTableType;
+  enum {
+    VT_ALIGN_CORNERS = 4
+  };
+  bool align_corners() const {
+    return GetField<uint8_t>(VT_ALIGN_CORNERS, 0) != 0;
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_ALIGN_CORNERS) &&
+           verifier.EndTable();
+  }
+  ResizeNearestNeighborOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ResizeNearestNeighborOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ResizeNearestNeighborOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ResizeNearestNeighborOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ResizeNearestNeighborOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_align_corners(bool align_corners) {
+    fbb_.AddElement<uint8_t>(ResizeNearestNeighborOptions::VT_ALIGN_CORNERS, static_cast<uint8_t>(align_corners), 0);
+  }
+  explicit ResizeNearestNeighborOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ResizeNearestNeighborOptionsBuilder &operator=(const ResizeNearestNeighborOptionsBuilder &);
+  flatbuffers::Offset<ResizeNearestNeighborOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ResizeNearestNeighborOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ResizeNearestNeighborOptions> CreateResizeNearestNeighborOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    bool align_corners = false) {
+  ResizeNearestNeighborOptionsBuilder builder_(_fbb);
+  builder_.add_align_corners(align_corners);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ResizeNearestNeighborOptions> CreateResizeNearestNeighborOptions(flatbuffers::FlatBufferBuilder &_fbb, const ResizeNearestNeighborOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct CallOptionsT : public flatbuffers::NativeTable {
   typedef CallOptions TableType;
   uint32_t subgraph;
@@ -4527,16 +5069,70 @@ inline flatbuffers::Offset<SplitOptions> CreateSplitOptions(
 
 flatbuffers::Offset<SplitOptions> CreateSplitOptions(flatbuffers::FlatBufferBuilder &_fbb, const SplitOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
-struct StridedSliceOptionsT : public flatbuffers::NativeTable {
-  typedef StridedSliceOptions TableType;
-  int32_t begin_mask;
-  int32_t end_mask;
-  int32_t ellipsis_mask;
-  int32_t new_axis_mask;
-  int32_t shrink_axis_mask;
-  StridedSliceOptionsT()
-      : begin_mask(0),
-        end_mask(0),
+struct SplitVOptionsT : public flatbuffers::NativeTable {
+  typedef SplitVOptions TableType;
+  int32_t num_splits;
+  SplitVOptionsT()
+      : num_splits(0) {
+  }
+};
+
+struct SplitVOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SplitVOptionsT NativeTableType;
+  enum {
+    VT_NUM_SPLITS = 4
+  };
+  int32_t num_splits() const {
+    return GetField<int32_t>(VT_NUM_SPLITS, 0);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_NUM_SPLITS) &&
+           verifier.EndTable();
+  }
+  SplitVOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(SplitVOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SplitVOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SplitVOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_num_splits(int32_t num_splits) {
+    fbb_.AddElement<int32_t>(SplitVOptions::VT_NUM_SPLITS, num_splits, 0);
+  }
+  explicit SplitVOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SplitVOptionsBuilder &operator=(const SplitVOptionsBuilder &);
+  flatbuffers::Offset<SplitVOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SplitVOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SplitVOptions> CreateSplitVOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    int32_t num_splits = 0) {
+  SplitVOptionsBuilder builder_(_fbb);
+  builder_.add_num_splits(num_splits);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SplitVOptions> CreateSplitVOptions(flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct StridedSliceOptionsT : public flatbuffers::NativeTable {
+  typedef StridedSliceOptions TableType;
+  int32_t begin_mask;
+  int32_t end_mask;
+  int32_t ellipsis_mask;
+  int32_t new_axis_mask;
+  int32_t shrink_axis_mask;
+  StridedSliceOptionsT()
+      : begin_mask(0),
+        end_mask(0),
         ellipsis_mask(0),
         new_axis_mask(0),
         shrink_axis_mask(0) {
@@ -5839,6 +6435,46 @@ inline flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(
 
 flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct AbsOptionsT : public flatbuffers::NativeTable {
+  typedef AbsOptions TableType;
+  AbsOptionsT() {
+  }
+};
+
+struct AbsOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef AbsOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  AbsOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(AbsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<AbsOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct AbsOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit AbsOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  AbsOptionsBuilder &operator=(const AbsOptionsBuilder &);
+  flatbuffers::Offset<AbsOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<AbsOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<AbsOptions> CreateAbsOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  AbsOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<AbsOptions> CreateAbsOptions(flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct LogicalAndOptionsT : public flatbuffers::NativeTable {
   typedef LogicalAndOptions TableType;
   LogicalAndOptionsT() {
@@ -6145,6 +6781,234 @@ inline flatbuffers::Offset<FillOptions> CreateFillOptions(
 
 flatbuffers::Offset<FillOptions> CreateFillOptions(flatbuffers::FlatBufferBuilder &_fbb, const FillOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct FloorModOptionsT : public flatbuffers::NativeTable {
+  typedef FloorModOptions TableType;
+  FloorModOptionsT() {
+  }
+};
+
+struct FloorModOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef FloorModOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  FloorModOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(FloorModOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<FloorModOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct FloorModOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit FloorModOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  FloorModOptionsBuilder &operator=(const FloorModOptionsBuilder &);
+  flatbuffers::Offset<FloorModOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<FloorModOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<FloorModOptions> CreateFloorModOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  FloorModOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<FloorModOptions> CreateFloorModOptions(flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct RangeOptionsT : public flatbuffers::NativeTable {
+  typedef RangeOptions TableType;
+  RangeOptionsT() {
+  }
+};
+
+struct RangeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef RangeOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  RangeOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(RangeOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<RangeOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct RangeOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit RangeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  RangeOptionsBuilder &operator=(const RangeOptionsBuilder &);
+  flatbuffers::Offset<RangeOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<RangeOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<RangeOptions> CreateRangeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  RangeOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<RangeOptions> CreateRangeOptions(flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct LeakyReluOptionsT : public flatbuffers::NativeTable {
+  typedef LeakyReluOptions TableType;
+  float alpha;
+  LeakyReluOptionsT()
+      : alpha(0.0f) {
+  }
+};
+
+struct LeakyReluOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LeakyReluOptionsT NativeTableType;
+  enum {
+    VT_ALPHA = 4
+  };
+  float alpha() const {
+    return GetField<float>(VT_ALPHA, 0.0f);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<float>(verifier, VT_ALPHA) &&
+           verifier.EndTable();
+  }
+  LeakyReluOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(LeakyReluOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LeakyReluOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LeakyReluOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_alpha(float alpha) {
+    fbb_.AddElement<float>(LeakyReluOptions::VT_ALPHA, alpha, 0.0f);
+  }
+  explicit LeakyReluOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LeakyReluOptionsBuilder &operator=(const LeakyReluOptionsBuilder &);
+  flatbuffers::Offset<LeakyReluOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LeakyReluOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LeakyReluOptions> CreateLeakyReluOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    float alpha = 0.0f) {
+  LeakyReluOptionsBuilder builder_(_fbb);
+  builder_.add_alpha(alpha);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LeakyReluOptions> CreateLeakyReluOptions(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SquaredDifferenceOptionsT : public flatbuffers::NativeTable {
+  typedef SquaredDifferenceOptions TableType;
+  SquaredDifferenceOptionsT() {
+  }
+};
+
+struct SquaredDifferenceOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SquaredDifferenceOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  SquaredDifferenceOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(SquaredDifferenceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SquaredDifferenceOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SquaredDifferenceOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit SquaredDifferenceOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SquaredDifferenceOptionsBuilder &operator=(const SquaredDifferenceOptionsBuilder &);
+  flatbuffers::Offset<SquaredDifferenceOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SquaredDifferenceOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SquaredDifferenceOptions> CreateSquaredDifferenceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  SquaredDifferenceOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SquaredDifferenceOptions> CreateSquaredDifferenceOptions(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct MirrorPadOptionsT : public flatbuffers::NativeTable {
+  typedef MirrorPadOptions TableType;
+  MirrorPadMode mode;
+  MirrorPadOptionsT()
+      : mode(MirrorPadMode_REFLECT) {
+  }
+};
+
+struct MirrorPadOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MirrorPadOptionsT NativeTableType;
+  enum {
+    VT_MODE = 4
+  };
+  MirrorPadMode mode() const {
+    return static_cast<MirrorPadMode>(GetField<int8_t>(VT_MODE, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_MODE) &&
+           verifier.EndTable();
+  }
+  MirrorPadOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(MirrorPadOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MirrorPadOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct MirrorPadOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_mode(MirrorPadMode mode) {
+    fbb_.AddElement<int8_t>(MirrorPadOptions::VT_MODE, static_cast<int8_t>(mode), 0);
+  }
+  explicit MirrorPadOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  MirrorPadOptionsBuilder &operator=(const MirrorPadOptionsBuilder &);
+  flatbuffers::Offset<MirrorPadOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<MirrorPadOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    MirrorPadMode mode = MirrorPadMode_REFLECT) {
+  MirrorPadOptionsBuilder builder_(_fbb);
+  builder_.add_mode(mode);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -6488,6 +7352,33 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const BidirectionalSequenceRNNOptions *builtin_options_as_BidirectionalSequenceRNNOptions() const {
     return builtin_options_type() == BuiltinOptions_BidirectionalSequenceRNNOptions ? static_cast<const BidirectionalSequenceRNNOptions *>(builtin_options()) : nullptr;
   }
+  const UnidirectionalSequenceLSTMOptions *builtin_options_as_UnidirectionalSequenceLSTMOptions() const {
+    return builtin_options_type() == BuiltinOptions_UnidirectionalSequenceLSTMOptions ? static_cast<const UnidirectionalSequenceLSTMOptions *>(builtin_options()) : nullptr;
+  }
+  const FloorModOptions *builtin_options_as_FloorModOptions() const {
+    return builtin_options_type() == BuiltinOptions_FloorModOptions ? static_cast<const FloorModOptions *>(builtin_options()) : nullptr;
+  }
+  const RangeOptions *builtin_options_as_RangeOptions() const {
+    return builtin_options_type() == BuiltinOptions_RangeOptions ? static_cast<const RangeOptions *>(builtin_options()) : nullptr;
+  }
+  const ResizeNearestNeighborOptions *builtin_options_as_ResizeNearestNeighborOptions() const {
+    return builtin_options_type() == BuiltinOptions_ResizeNearestNeighborOptions ? static_cast<const ResizeNearestNeighborOptions *>(builtin_options()) : nullptr;
+  }
+  const LeakyReluOptions *builtin_options_as_LeakyReluOptions() const {
+    return builtin_options_type() == BuiltinOptions_LeakyReluOptions ? static_cast<const LeakyReluOptions *>(builtin_options()) : nullptr;
+  }
+  const SquaredDifferenceOptions *builtin_options_as_SquaredDifferenceOptions() const {
+    return builtin_options_type() == BuiltinOptions_SquaredDifferenceOptions ? static_cast<const SquaredDifferenceOptions *>(builtin_options()) : nullptr;
+  }
+  const MirrorPadOptions *builtin_options_as_MirrorPadOptions() const {
+    return builtin_options_type() == BuiltinOptions_MirrorPadOptions ? static_cast<const MirrorPadOptions *>(builtin_options()) : nullptr;
+  }
+  const AbsOptions *builtin_options_as_AbsOptions() const {
+    return builtin_options_type() == BuiltinOptions_AbsOptions ? static_cast<const AbsOptions *>(builtin_options()) : nullptr;
+  }
+  const SplitVOptions *builtin_options_as_SplitVOptions() const {
+    return builtin_options_type() == BuiltinOptions_SplitVOptions ? static_cast<const SplitVOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -6799,6 +7690,42 @@ template<> inline const BidirectionalSequenceRNNOptions *Operator::builtin_optio
   return builtin_options_as_BidirectionalSequenceRNNOptions();
 }
 
+template<> inline const UnidirectionalSequenceLSTMOptions *Operator::builtin_options_as<UnidirectionalSequenceLSTMOptions>() const {
+  return builtin_options_as_UnidirectionalSequenceLSTMOptions();
+}
+
+template<> inline const FloorModOptions *Operator::builtin_options_as<FloorModOptions>() const {
+  return builtin_options_as_FloorModOptions();
+}
+
+template<> inline const RangeOptions *Operator::builtin_options_as<RangeOptions>() const {
+  return builtin_options_as_RangeOptions();
+}
+
+template<> inline const ResizeNearestNeighborOptions *Operator::builtin_options_as<ResizeNearestNeighborOptions>() const {
+  return builtin_options_as_ResizeNearestNeighborOptions();
+}
+
+template<> inline const LeakyReluOptions *Operator::builtin_options_as<LeakyReluOptions>() const {
+  return builtin_options_as_LeakyReluOptions();
+}
+
+template<> inline const SquaredDifferenceOptions *Operator::builtin_options_as<SquaredDifferenceOptions>() const {
+  return builtin_options_as_SquaredDifferenceOptions();
+}
+
+template<> inline const MirrorPadOptions *Operator::builtin_options_as<MirrorPadOptions>() const {
+  return builtin_options_as_MirrorPadOptions();
+}
+
+template<> inline const AbsOptions *Operator::builtin_options_as<AbsOptions>() const {
+  return builtin_options_as_AbsOptions();
+}
+
+template<> inline const SplitVOptions *Operator::builtin_options_as<SplitVOptions>() const {
+  return builtin_options_as_SplitVOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -7201,6 +8128,32 @@ inline flatbuffers::Offset<Model> CreateModelDirect(
 
 flatbuffers::Offset<Model> CreateModel(flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+inline CustomQuantizationT *CustomQuantization::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new CustomQuantizationT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void CustomQuantization::UnPackTo(CustomQuantizationT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = custom(); if (_e) { _o->custom.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->custom[_i] = _e->Get(_i); } } };
+}
+
+inline flatbuffers::Offset<CustomQuantization> CustomQuantization::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateCustomQuantization(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CustomQuantizationT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _custom = _o->custom.size() ? _fbb.CreateVector(_o->custom) : 0;
+  return tflite::CreateCustomQuantization(
+      _fbb,
+      _custom);
+}
+
 inline QuantizationParametersT *QuantizationParameters::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new QuantizationParametersT();
   UnPackTo(_o, _resolver);
@@ -7214,6 +8167,8 @@ inline void QuantizationParameters::UnPackTo(QuantizationParametersT *_o, const
   { auto _e = max(); if (_e) { _o->max.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->max[_i] = _e->Get(_i); } } };
   { auto _e = scale(); if (_e) { _o->scale.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->scale[_i] = _e->Get(_i); } } };
   { auto _e = zero_point(); if (_e) { _o->zero_point.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->zero_point[_i] = _e->Get(_i); } } };
+  { auto _e = details_type(); _o->details.type = _e; };
+  { auto _e = details(); if (_e) _o->details.value = QuantizationDetailsUnion::UnPack(_e, details_type(), _resolver); };
 }
 
 inline flatbuffers::Offset<QuantizationParameters> QuantizationParameters::Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -7228,12 +8183,16 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
   auto _max = _o->max.size() ? _fbb.CreateVector(_o->max) : 0;
   auto _scale = _o->scale.size() ? _fbb.CreateVector(_o->scale) : 0;
   auto _zero_point = _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0;
+  auto _details_type = _o->details.type;
+  auto _details = _o->details.Pack(_fbb);
   return tflite::CreateQuantizationParameters(
       _fbb,
       _min,
       _max,
       _scale,
-      _zero_point);
+      _zero_point,
+      _details_type,
+      _details);
 }
 
 inline TensorT *Tensor::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -7809,29 +8768,64 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBuffe
       _kernel_type);
 }
 
-inline BidirectionalSequenceLSTMOptionsT *BidirectionalSequenceLSTMOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new BidirectionalSequenceLSTMOptionsT();
+inline UnidirectionalSequenceLSTMOptionsT *UnidirectionalSequenceLSTMOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new UnidirectionalSequenceLSTMOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void BidirectionalSequenceLSTMOptions::UnPackTo(BidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void UnidirectionalSequenceLSTMOptions::UnPackTo(UnidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
   { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
   { auto _e = cell_clip(); _o->cell_clip = _e; };
   { auto _e = proj_clip(); _o->proj_clip = _e; };
-  { auto _e = merge_outputs(); _o->merge_outputs = _e; };
+  { auto _e = time_major(); _o->time_major = _e; };
 }
 
-inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> BidirectionalSequenceLSTMOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateBidirectionalSequenceLSTMOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> UnidirectionalSequenceLSTMOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUnidirectionalSequenceLSTMOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> CreateBidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<UnidirectionalSequenceLSTMOptions> CreateUnidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const UnidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BidirectionalSequenceLSTMOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const UnidirectionalSequenceLSTMOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  auto _cell_clip = _o->cell_clip;
+  auto _proj_clip = _o->proj_clip;
+  auto _time_major = _o->time_major;
+  return tflite::CreateUnidirectionalSequenceLSTMOptions(
+      _fbb,
+      _fused_activation_function,
+      _cell_clip,
+      _proj_clip,
+      _time_major);
+}
+
+inline BidirectionalSequenceLSTMOptionsT *BidirectionalSequenceLSTMOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new BidirectionalSequenceLSTMOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void BidirectionalSequenceLSTMOptions::UnPackTo(BidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
+  { auto _e = cell_clip(); _o->cell_clip = _e; };
+  { auto _e = proj_clip(); _o->proj_clip = _e; };
+  { auto _e = merge_outputs(); _o->merge_outputs = _e; };
+}
+
+inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> BidirectionalSequenceLSTMOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBidirectionalSequenceLSTMOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> CreateBidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const BidirectionalSequenceLSTMOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
   auto _fused_activation_function = _o->fused_activation_function;
   auto _cell_clip = _o->cell_clip;
   auto _proj_clip = _o->proj_clip;
@@ -7870,6 +8864,32 @@ inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(fl
       _align_corners);
 }
 
+inline ResizeNearestNeighborOptionsT *ResizeNearestNeighborOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ResizeNearestNeighborOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ResizeNearestNeighborOptions::UnPackTo(ResizeNearestNeighborOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = align_corners(); _o->align_corners = _e; };
+}
+
+inline flatbuffers::Offset<ResizeNearestNeighborOptions> ResizeNearestNeighborOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ResizeNearestNeighborOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateResizeNearestNeighborOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ResizeNearestNeighborOptions> CreateResizeNearestNeighborOptions(flatbuffers::FlatBufferBuilder &_fbb, const ResizeNearestNeighborOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ResizeNearestNeighborOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _align_corners = _o->align_corners;
+  return tflite::CreateResizeNearestNeighborOptions(
+      _fbb,
+      _align_corners);
+}
+
 inline CallOptionsT *CallOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new CallOptionsT();
   UnPackTo(_o, _resolver);
@@ -8323,6 +9343,32 @@ inline flatbuffers::Offset<SplitOptions> CreateSplitOptions(flatbuffers::FlatBuf
       _num_splits);
 }
 
+inline SplitVOptionsT *SplitVOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SplitVOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SplitVOptions::UnPackTo(SplitVOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = num_splits(); _o->num_splits = _e; };
+}
+
+inline flatbuffers::Offset<SplitVOptions> SplitVOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSplitVOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SplitVOptions> CreateSplitVOptions(flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SplitVOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _num_splits = _o->num_splits;
+  return tflite::CreateSplitVOptions(
+      _fbb,
+      _num_splits);
+}
+
 inline StridedSliceOptionsT *StridedSliceOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new StridedSliceOptionsT();
   UnPackTo(_o, _resolver);
@@ -8984,6 +10030,29 @@ inline flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(flatbuffers::FlatB
       _axis);
 }
 
+inline AbsOptionsT *AbsOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new AbsOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void AbsOptions::UnPackTo(AbsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<AbsOptions> AbsOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateAbsOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<AbsOptions> CreateAbsOptions(flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const AbsOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateAbsOptions(
+      _fbb);
+}
+
 inline LogicalAndOptionsT *LogicalAndOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new LogicalAndOptionsT();
   UnPackTo(_o, _resolver);
@@ -9151,6 +10220,127 @@ inline flatbuffers::Offset<FillOptions> CreateFillOptions(flatbuffers::FlatBuffe
       _fbb);
 }
 
+inline FloorModOptionsT *FloorModOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new FloorModOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void FloorModOptions::UnPackTo(FloorModOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<FloorModOptions> FloorModOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateFloorModOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<FloorModOptions> CreateFloorModOptions(flatbuffers::FlatBufferBuilder &_fbb, const FloorModOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const FloorModOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateFloorModOptions(
+      _fbb);
+}
+
+inline RangeOptionsT *RangeOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new RangeOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void RangeOptions::UnPackTo(RangeOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<RangeOptions> RangeOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateRangeOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<RangeOptions> CreateRangeOptions(flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const RangeOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateRangeOptions(
+      _fbb);
+}
+
+inline LeakyReluOptionsT *LeakyReluOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LeakyReluOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LeakyReluOptions::UnPackTo(LeakyReluOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = alpha(); _o->alpha = _e; };
+}
+
+inline flatbuffers::Offset<LeakyReluOptions> LeakyReluOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLeakyReluOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LeakyReluOptions> CreateLeakyReluOptions(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LeakyReluOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _alpha = _o->alpha;
+  return tflite::CreateLeakyReluOptions(
+      _fbb,
+      _alpha);
+}
+
+inline SquaredDifferenceOptionsT *SquaredDifferenceOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SquaredDifferenceOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SquaredDifferenceOptions::UnPackTo(SquaredDifferenceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<SquaredDifferenceOptions> SquaredDifferenceOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSquaredDifferenceOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SquaredDifferenceOptions> CreateSquaredDifferenceOptions(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SquaredDifferenceOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateSquaredDifferenceOptions(
+      _fbb);
+}
+
+inline MirrorPadOptionsT *MirrorPadOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MirrorPadOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void MirrorPadOptions::UnPackTo(MirrorPadOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = mode(); _o->mode = _e; };
+}
+
+inline flatbuffers::Offset<MirrorPadOptions> MirrorPadOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMirrorPadOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MirrorPadOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _mode = _o->mode;
+  return tflite::CreateMirrorPadOptions(
+      _fbb,
+      _mode);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -9335,6 +10525,75 @@ inline flatbuffers::Offset<Model> CreateModel(flatbuffers::FlatBufferBuilder &_f
       _metadata_buffer);
 }
 
+inline bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type) {
+  switch (type) {
+    case QuantizationDetails_NONE: {
+      return true;
+    }
+    case QuantizationDetails_CustomQuantization: {
+      auto ptr = reinterpret_cast<const CustomQuantization *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    default: return false;
+  }
+}
+
+inline bool VerifyQuantizationDetailsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector<flatbuffers::Offset<void>> *values, const flatbuffers::Vector<uint8_t> *types) {
+  if (!values || !types) return !values && !types;
+  if (values->size() != types->size()) return false;
+  for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+    if (!VerifyQuantizationDetails(
+        verifier,  values->Get(i), types->GetEnum<QuantizationDetails>(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline void *QuantizationDetailsUnion::UnPack(const void *obj, QuantizationDetails type, const flatbuffers::resolver_function_t *resolver) {
+  switch (type) {
+    case QuantizationDetails_CustomQuantization: {
+      auto ptr = reinterpret_cast<const CustomQuantization *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    default: return nullptr;
+  }
+}
+
+inline flatbuffers::Offset<void> QuantizationDetailsUnion::Pack(flatbuffers::FlatBufferBuilder &_fbb, const flatbuffers::rehasher_function_t *_rehasher) const {
+  switch (type) {
+    case QuantizationDetails_CustomQuantization: {
+      auto ptr = reinterpret_cast<const CustomQuantizationT *>(value);
+      return CreateCustomQuantization(_fbb, ptr, _rehasher).Union();
+    }
+    default: return 0;
+  }
+}
+
+inline QuantizationDetailsUnion::QuantizationDetailsUnion(const QuantizationDetailsUnion &u) FLATBUFFERS_NOEXCEPT : type(u.type), value(nullptr) {
+  switch (type) {
+    case QuantizationDetails_CustomQuantization: {
+      value = new CustomQuantizationT(*reinterpret_cast<CustomQuantizationT *>(u.value));
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+inline void QuantizationDetailsUnion::Reset() {
+  switch (type) {
+    case QuantizationDetails_CustomQuantization: {
+      auto ptr = reinterpret_cast<CustomQuantizationT *>(value);
+      delete ptr;
+      break;
+    }
+    default: break;
+  }
+  value = nullptr;
+  type = QuantizationDetails_NONE;
+}
+
 inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type) {
   switch (type) {
     case BuiltinOptions_NONE: {
@@ -9620,6 +10879,42 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const BidirectionalSequenceRNNOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<const UnidirectionalSequenceLSTMOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_FloorModOptions: {
+      auto ptr = reinterpret_cast<const FloorModOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_RangeOptions: {
+      auto ptr = reinterpret_cast<const RangeOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ResizeNearestNeighborOptions: {
+      auto ptr = reinterpret_cast<const ResizeNearestNeighborOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<const LeakyReluOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<const SquaredDifferenceOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<const MirrorPadOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_AbsOptions: {
+      auto ptr = reinterpret_cast<const AbsOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SplitVOptions: {
+      auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -9918,6 +11213,42 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const BidirectionalSequenceRNNOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<const UnidirectionalSequenceLSTMOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_FloorModOptions: {
+      auto ptr = reinterpret_cast<const FloorModOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_RangeOptions: {
+      auto ptr = reinterpret_cast<const RangeOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ResizeNearestNeighborOptions: {
+      auto ptr = reinterpret_cast<const ResizeNearestNeighborOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<const LeakyReluOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<const SquaredDifferenceOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<const MirrorPadOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_AbsOptions: {
+      auto ptr = reinterpret_cast<const AbsOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SplitVOptions: {
+      auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -10204,6 +11535,42 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const BidirectionalSequenceRNNOptionsT *>(value);
       return CreateBidirectionalSequenceRNNOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<const UnidirectionalSequenceLSTMOptionsT *>(value);
+      return CreateUnidirectionalSequenceLSTMOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_FloorModOptions: {
+      auto ptr = reinterpret_cast<const FloorModOptionsT *>(value);
+      return CreateFloorModOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_RangeOptions: {
+      auto ptr = reinterpret_cast<const RangeOptionsT *>(value);
+      return CreateRangeOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ResizeNearestNeighborOptions: {
+      auto ptr = reinterpret_cast<const ResizeNearestNeighborOptionsT *>(value);
+      return CreateResizeNearestNeighborOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<const LeakyReluOptionsT *>(value);
+      return CreateLeakyReluOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<const SquaredDifferenceOptionsT *>(value);
+      return CreateSquaredDifferenceOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<const MirrorPadOptionsT *>(value);
+      return CreateMirrorPadOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_AbsOptions: {
+      auto ptr = reinterpret_cast<const AbsOptionsT *>(value);
+      return CreateAbsOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SplitVOptions: {
+      auto ptr = reinterpret_cast<const SplitVOptionsT *>(value);
+      return CreateSplitVOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -10490,6 +11857,42 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new BidirectionalSequenceRNNOptionsT(*reinterpret_cast<BidirectionalSequenceRNNOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      value = new UnidirectionalSequenceLSTMOptionsT(*reinterpret_cast<UnidirectionalSequenceLSTMOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_FloorModOptions: {
+      value = new FloorModOptionsT(*reinterpret_cast<FloorModOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_RangeOptions: {
+      value = new RangeOptionsT(*reinterpret_cast<RangeOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ResizeNearestNeighborOptions: {
+      value = new ResizeNearestNeighborOptionsT(*reinterpret_cast<ResizeNearestNeighborOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_LeakyReluOptions: {
+      value = new LeakyReluOptionsT(*reinterpret_cast<LeakyReluOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      value = new SquaredDifferenceOptionsT(*reinterpret_cast<SquaredDifferenceOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      value = new MirrorPadOptionsT(*reinterpret_cast<MirrorPadOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_AbsOptions: {
+      value = new AbsOptionsT(*reinterpret_cast<AbsOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SplitVOptions: {
+      value = new SplitVOptionsT(*reinterpret_cast<SplitVOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -10847,6 +12250,51 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_UnidirectionalSequenceLSTMOptions: {
+      auto ptr = reinterpret_cast<UnidirectionalSequenceLSTMOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_FloorModOptions: {
+      auto ptr = reinterpret_cast<FloorModOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_RangeOptions: {
+      auto ptr = reinterpret_cast<RangeOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ResizeNearestNeighborOptions: {
+      auto ptr = reinterpret_cast<ResizeNearestNeighborOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<LeakyReluOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<SquaredDifferenceOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<MirrorPadOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_AbsOptions: {
+      auto ptr = reinterpret_cast<AbsOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SplitVOptions: {
+      auto ptr = reinterpret_cast<SplitVOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/schema/schema_v0.fbs b/tensorflow/lite/schema/schema_v0.fbs
similarity index 100%
rename from tensorflow/contrib/lite/schema/schema_v0.fbs
rename to tensorflow/lite/schema/schema_v0.fbs
diff --git a/tensorflow/contrib/lite/schema/schema_v1.fbs b/tensorflow/lite/schema/schema_v1.fbs
similarity index 100%
rename from tensorflow/contrib/lite/schema/schema_v1.fbs
rename to tensorflow/lite/schema/schema_v1.fbs
diff --git a/tensorflow/contrib/lite/schema/schema_v2.fbs b/tensorflow/lite/schema/schema_v2.fbs
similarity index 100%
rename from tensorflow/contrib/lite/schema/schema_v2.fbs
rename to tensorflow/lite/schema/schema_v2.fbs
diff --git a/tensorflow/contrib/lite/schema/schema_v3.fbs b/tensorflow/lite/schema/schema_v3.fbs
similarity index 100%
rename from tensorflow/contrib/lite/schema/schema_v3.fbs
rename to tensorflow/lite/schema/schema_v3.fbs
diff --git a/tensorflow/contrib/lite/schema/upgrade_schema.py b/tensorflow/lite/schema/upgrade_schema.py
similarity index 97%
rename from tensorflow/contrib/lite/schema/upgrade_schema.py
rename to tensorflow/lite/schema/upgrade_schema.py
index a2ddf6295014f3b29fa584f2bb367a7e0a4399e7..d9220ba10ca2e93732bae2e43c3251b013ecc8d8 100644
--- a/tensorflow/contrib/lite/schema/upgrade_schema.py
+++ b/tensorflow/lite/schema/upgrade_schema.py
@@ -16,11 +16,11 @@
 
 Usage examples:
 
-bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.json out.json
-bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.bin out.bin
-bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.bin out.json
-bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.json out.bin
-bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.tflite out.tflite
+bazel run tensorflow/lite/schema/upgrade_schema -- in.json out.json
+bazel run tensorflow/lite/schema/upgrade_schema -- in.bin out.bin
+bazel run tensorflow/lite/schema/upgrade_schema -- in.bin out.json
+bazel run tensorflow/lite/schema/upgrade_schema -- in.json out.bin
+bazel run tensorflow/lite/schema/upgrade_schema -- in.tflite out.tflite
 """
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/lite/schema/upgrade_schema_test.py b/tensorflow/lite/schema/upgrade_schema_test.py
similarity index 99%
rename from tensorflow/contrib/lite/schema/upgrade_schema_test.py
rename to tensorflow/lite/schema/upgrade_schema_test.py
index b5002e6f7576b6de533046aaad37fe06746d3644..922968c65aa7601a30f7eef7e0d05ab0e932bedd 100644
--- a/tensorflow/contrib/lite/schema/upgrade_schema_test.py
+++ b/tensorflow/lite/schema/upgrade_schema_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import json
 import tempfile
-from tensorflow.contrib.lite.schema import upgrade_schema as upgrade_schema_lib
+from tensorflow.lite.schema import upgrade_schema as upgrade_schema_lib
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test as test_lib
 
diff --git a/tensorflow/contrib/lite/simple_memory_arena.cc b/tensorflow/lite/simple_memory_arena.cc
similarity index 98%
rename from tensorflow/contrib/lite/simple_memory_arena.cc
rename to tensorflow/lite/simple_memory_arena.cc
index cd0f1f7c17a50f6ce61fa2033e5d13580399f5cf..88bdf50c9b64c6de3e7a10bbcd179da825ad2183 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.cc
+++ b/tensorflow/lite/simple_memory_arena.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/simple_memory_arena.h"
+#include "tensorflow/lite/simple_memory_arena.h"
 
 #include <algorithm>
 #include <cstring>
diff --git a/tensorflow/contrib/lite/simple_memory_arena.h b/tensorflow/lite/simple_memory_arena.h
similarity index 92%
rename from tensorflow/contrib/lite/simple_memory_arena.h
rename to tensorflow/lite/simple_memory_arena.h
index 45d0d8735ee10a2a0c7e49fa3ffe56c4d5f5e318..42203c0c0a32d6b512a423359338501ed7816a45 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.h
+++ b/tensorflow/lite/simple_memory_arena.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
-#define TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+#ifndef TENSORFLOW_LITE_SIMPLE_MEMORY_ARENA_H_
+#define TENSORFLOW_LITE_SIMPLE_MEMORY_ARENA_H_
 
 #include <list>
 #include <memory>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 
@@ -86,4 +86,4 @@ class SimpleMemoryArena {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+#endif  // TENSORFLOW_LITE_SIMPLE_MEMORY_ARENA_H_
diff --git a/tensorflow/contrib/lite/simple_memory_arena_test.cc b/tensorflow/lite/simple_memory_arena_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/simple_memory_arena_test.cc
rename to tensorflow/lite/simple_memory_arena_test.cc
index 60d4d5e768aeda958574422e1c36a7cc2f6a1429..caf13db2c1a6e9b0d495609c159640fb36073981 100644
--- a/tensorflow/contrib/lite/simple_memory_arena_test.cc
+++ b/tensorflow/lite/simple_memory_arena_test.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/simple_memory_arena.h"
+#include "tensorflow/lite/simple_memory_arena.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace {
diff --git a/tensorflow/lite/special_rules.bzl b/tensorflow/lite/special_rules.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..e10af3d240eebf2ffac87a67d5a27182c7a516ed
--- /dev/null
+++ b/tensorflow/lite/special_rules.bzl
@@ -0,0 +1,6 @@
+"""External versions of build rules that differ outside of Google."""
+
+def tflite_portable_test_suite(**kwargs):
+    """This is a no-op outside of Google."""
+    _ignore = [kwargs]
+    pass
diff --git a/tensorflow/contrib/lite/stderr_reporter.cc b/tensorflow/lite/stderr_reporter.cc
similarity index 96%
rename from tensorflow/contrib/lite/stderr_reporter.cc
rename to tensorflow/lite/stderr_reporter.cc
index e29a6345fdfe4cf853b79e5ac6da5e9c41600fa6..09eb1d254a608ba2d19c824a323f0b5173afe15f 100644
--- a/tensorflow/contrib/lite/stderr_reporter.cc
+++ b/tensorflow/lite/stderr_reporter.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/stderr_reporter.h"
+#include "tensorflow/lite/stderr_reporter.h"
 #include <cstdarg>
 #include <cstdio>
 
diff --git a/tensorflow/contrib/lite/stderr_reporter.h b/tensorflow/lite/stderr_reporter.h
similarity index 78%
rename from tensorflow/contrib/lite/stderr_reporter.h
rename to tensorflow/lite/stderr_reporter.h
index c6f4ffbdffb4b3ea5059a3e17a612e7f6bfbf5d7..7582b421ee3c9522e7d31e5ac34edf4d5acf2373 100644
--- a/tensorflow/contrib/lite/stderr_reporter.h
+++ b/tensorflow/lite/stderr_reporter.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_STDERR_REPORTER_H_
-#define TENSORFLOW_CONTRIB_LITE_STDERR_REPORTER_H_
+#ifndef TENSORFLOW_LITE_STDERR_REPORTER_H_
+#define TENSORFLOW_LITE_STDERR_REPORTER_H_
 
 #include <cstdarg>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
 
 namespace tflite {
 
@@ -31,4 +31,4 @@ ErrorReporter* DefaultErrorReporter();
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_STDERR_REPORTER_H_
+#endif  // TENSORFLOW_LITE_STDERR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/string.h b/tensorflow/lite/string.h
similarity index 86%
rename from tensorflow/contrib/lite/string.h
rename to tensorflow/lite/string.h
index af3fadfcb35074c0a0457096deb77ac7514586eb..65142b11de389f49de7ad406b1dfbb62dfd7f2b0 100644
--- a/tensorflow/contrib/lite/string.h
+++ b/tensorflow/lite/string.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 // Abstract string. We don't want even absl at this level.
-#ifndef TENSORFLOW_CONTRIB_LITE_STRING_H_
-#define TENSORFLOW_CONTRIB_LITE_STRING_H_
+#ifndef TENSORFLOW_LITE_STRING_H_
+#define TENSORFLOW_LITE_STRING_H_
 
 #include <string>
 
@@ -26,4 +26,4 @@ using std::string;
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_STRING_H_
+#endif  // TENSORFLOW_LITE_STRING_H_
diff --git a/tensorflow/contrib/lite/string_util.cc b/tensorflow/lite/string_util.cc
similarity index 89%
rename from tensorflow/contrib/lite/string_util.cc
rename to tensorflow/lite/string_util.cc
index b991e999b66daa5897047014873fae045ce51dbd..6efa11d60c55540c099fadc33c7756ae8f77b97f 100644
--- a/tensorflow/contrib/lite/string_util.cc
+++ b/tensorflow/lite/string_util.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/string_util.h"
 
+#include <stdlib.h>
 #include <string.h>
 #include <vector>
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 namespace {
@@ -96,14 +96,23 @@ int DynamicBuffer::WriteToBuffer(char** buffer) {
   return bytes;
 }
 
-void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
+void DynamicBuffer::WriteToTensorAsVector(TfLiteTensor* tensor) {
+  auto dims = TfLiteIntArrayCreate(1);
+  dims->data[0] = offset_.size() - 1;  // Store number of strings.
+  WriteToTensor(tensor, dims);
+}
+
+void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor,
+                                  TfLiteIntArray* new_shape) {
   char* tensor_buffer;
   int bytes = WriteToBuffer(&tensor_buffer);
 
+  if (new_shape == nullptr) {
+    new_shape = TfLiteIntArrayCopy(tensor->dims);
+  }
+
   // Set tensor content pointer to tensor_buffer, and release original data.
-  auto dims = TfLiteIntArrayCreate(1);
-  dims->data[0] = offset_.size() - 1;  // Store number of strings.
-  TfLiteTensorReset(tensor->type, tensor->name, dims, tensor->params,
+  TfLiteTensorReset(tensor->type, tensor->name, new_shape, tensor->params,
                     tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation,
                     tensor->is_variable, tensor);
 }
diff --git a/tensorflow/contrib/lite/string_util.h b/tensorflow/lite/string_util.h
similarity index 79%
rename from tensorflow/contrib/lite/string_util.h
rename to tensorflow/lite/string_util.h
index d24627b509558dee815717de14ca114800969b38..f076db76f2d4ef416e5f7ec98ac2ec0aa94d95c2 100644
--- a/tensorflow/contrib/lite/string_util.h
+++ b/tensorflow/lite/string_util.h
@@ -37,13 +37,13 @@ limitations under the License.
 //   # described above.
 //   buf.WriteToTensor(tensor)
 
-#ifndef TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+#ifndef TENSORFLOW_LITE_STRING_UTIL_H_
+#define TENSORFLOW_LITE_STRING_UTIL_H_
 
 #include <vector>
 
-#include "tensorflow/contrib/lite/c/c_api_internal.h"
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 
@@ -74,8 +74,18 @@ class DynamicBuffer {
   // The function allocates space for the buffer but does NOT take ownership.
   int WriteToBuffer(char** buffer);
 
-  // Fill content into a string tensor.
-  void WriteToTensor(TfLiteTensor* tensor);
+  // Fill content into a string tensor, with the given new_shape. The new shape
+  // must match the number of strings in this object. Caller relinquishes
+  // ownership of new_shape. If 'new_shape' is nullptr, keep the tensor's
+  // existing shape.
+  void WriteToTensor(TfLiteTensor* tensor, TfLiteIntArray* new_shape);
+
+  // Fill content into a string tensor. Set shape to {num_strings}.
+  void WriteToTensorAsVector(TfLiteTensor* tensor);
+
+  // Deprecated. Use WriteToTensorAsVector() or pass in the new shpe.
+  // TODO(b/120230709): remove when people migrate away.
+  void WriteToTensor(TfLiteTensor* tensor) { WriteToTensorAsVector(tensor); }
 
  private:
   // Data buffer to store contents of strings, not including headers.
@@ -94,4 +104,4 @@ StringRef GetString(const char* raw_buffer, int string_index);
 StringRef GetString(const TfLiteTensor* tensor, int string_index);
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+#endif  // TENSORFLOW_LITE_STRING_UTIL_H_
diff --git a/tensorflow/lite/string_util_test.cc b/tensorflow/lite/string_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cbf1d7b226af20251d5f70a354a21f1eb40ae1c6
--- /dev/null
+++ b/tensorflow/lite/string_util_test.cc
@@ -0,0 +1,161 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/string_util.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/testing/util.h"
+
+namespace tflite {
+
+TEST(StringUtil, TestStringUtil) {
+  Interpreter interpreter;
+  interpreter.AddTensors(3);
+
+  TfLiteTensor* t0 = interpreter.tensor(0);
+  t0->type = kTfLiteString;
+  t0->allocation_type = kTfLiteDynamic;
+
+  TfLiteTensor* t1 = interpreter.tensor(1);
+  t1->type = kTfLiteString;
+  t1->allocation_type = kTfLiteDynamic;
+
+  char data[] = {1, 0, 0, 0, 12, 0, 0, 0, 15, 0, 0, 0, 'X', 'Y', 'Z'};
+
+  interpreter.SetTensorParametersReadOnly(2, kTfLiteString, "", {1}, {}, data,
+                                          15);
+  TfLiteTensor* t2 = interpreter.tensor(2);
+  interpreter.AllocateTensors();
+
+  char s0[] = "ABC";
+  string s1 = "DEFG";
+  char s2[] = "";
+
+  // Write strings to tensors
+  DynamicBuffer buf0;
+  buf0.AddString(s0, 3);
+  DynamicBuffer buf1;
+  buf1.AddString(s1.data(), s1.length());
+  buf0.AddString(s2, 0);
+
+  auto new_shape = TfLiteIntArrayCreate(2);
+  new_shape->data[0] = 2;
+  new_shape->data[1] = 1;
+  buf0.WriteToTensor(t0, new_shape);
+  buf1.WriteToTensorAsVector(t1);
+
+  // Check tensor shapes.
+  EXPECT_EQ(t0->dims->size, 2);
+  EXPECT_EQ(t0->dims->data[0], 2);
+  EXPECT_EQ(t0->dims->data[1], 1);
+
+  EXPECT_EQ(t1->dims->size, 1);
+  EXPECT_EQ(t1->dims->data[0], 1);
+
+  // Read strings from tensors.
+  ASSERT_EQ(GetStringCount(t0), 2);
+  StringRef str_ref;
+  str_ref = GetString(t0, 0);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "ABC");
+  str_ref = GetString(t0, 1);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "");
+  ASSERT_EQ(t0->bytes, 19);
+
+  ASSERT_EQ(GetStringCount(t1), 1);
+  str_ref = GetString(t1, 0);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "DEFG");
+  ASSERT_EQ(t1->bytes, 16);
+
+  ASSERT_EQ(GetStringCount(t2), 1);
+  str_ref = GetString(t2, 0);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "XYZ");
+  ASSERT_EQ(t2->bytes, 15);
+}
+
+TEST(StringUtil, TestAddJoinedString) {
+  Interpreter interpreter;
+  interpreter.AddTensors(1);
+  TfLiteTensor* t0 = interpreter.tensor(0);
+  t0->type = kTfLiteString;
+  t0->allocation_type = kTfLiteDynamic;
+
+  char s0[] = "ABC";
+  char s1[] = "DEFG";
+  char s2[] = "";
+  char s3[] = "XYZ";
+
+  DynamicBuffer buf;
+  buf.AddJoinedString({{s0, 3}, {s1, 4}, {s2, 0}, {s3, 3}}, ' ');
+  buf.WriteToTensorAsVector(t0);
+
+  ASSERT_EQ(GetStringCount(t0), 1);
+  StringRef str_ref;
+  str_ref = GetString(t0, 0);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "ABC DEFG  XYZ");
+  ASSERT_EQ(t0->bytes, 25);
+}
+
+TEST(StringUtil, TestEmptyList) {
+  Interpreter interpreter;
+  interpreter.AddTensors(1);
+  TfLiteTensor* t0 = interpreter.tensor(0);
+  t0->type = kTfLiteString;
+  t0->allocation_type = kTfLiteDynamic;
+  DynamicBuffer buf;
+  buf.WriteToTensorAsVector(t0);
+
+  ASSERT_EQ(GetStringCount(t0), 0);
+  ASSERT_EQ(t0->bytes, 8);
+}
+
+TEST(StringUtil, TestShapes) {
+  Interpreter interpreter;
+  interpreter.AddTensors(1);
+  TfLiteTensor* t0 = interpreter.tensor(0);
+  t0->type = kTfLiteString;
+  t0->allocation_type = kTfLiteDynamic;
+  t0->dims = TfLiteIntArrayCreate(2);
+  t0->dims->data[0] = 2;
+  t0->dims->data[1] = 1;
+
+  // Not setting a new shape: number of strings must match
+  DynamicBuffer buf;
+  buf.AddString("ABC", 3);
+  buf.AddString("X", 1);
+  buf.WriteToTensor(t0, nullptr);
+
+  ASSERT_EQ(t0->dims->size, 2);
+  EXPECT_EQ(t0->dims->data[0], 2);
+  EXPECT_EQ(t0->dims->data[1], 1);
+
+  auto new_shape = TfLiteIntArrayCreate(2);
+  new_shape->data[0] = 1;
+  new_shape->data[1] = 2;
+
+  buf.WriteToTensor(t0, new_shape);
+
+  ASSERT_EQ(t0->dims->size, 2);
+  EXPECT_EQ(t0->dims->data[0], 1);
+  EXPECT_EQ(t0->dims->data[1], 2);
+}
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/testdata/0_subgraphs.bin b/tensorflow/lite/testdata/0_subgraphs.bin
similarity index 100%
rename from tensorflow/contrib/lite/testdata/0_subgraphs.bin
rename to tensorflow/lite/testdata/0_subgraphs.bin
diff --git a/tensorflow/contrib/lite/testdata/2_subgraphs.bin b/tensorflow/lite/testdata/2_subgraphs.bin
similarity index 100%
rename from tensorflow/contrib/lite/testdata/2_subgraphs.bin
rename to tensorflow/lite/testdata/2_subgraphs.bin
diff --git a/tensorflow/lite/testdata/add.bin b/tensorflow/lite/testdata/add.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b4c02350c09130bd5c940db7e17b8eeaa931230d
Binary files /dev/null and b/tensorflow/lite/testdata/add.bin differ
diff --git a/tensorflow/lite/testdata/add.json b/tensorflow/lite/testdata/add.json
new file mode 100644
index 0000000000000000000000000000000000000000..f589bebfbf257b7087b2058169755a532610d2fb
--- /dev/null
+++ b/tensorflow/lite/testdata/add.json
@@ -0,0 +1,79 @@
+{
+  version: 3,
+  operator_codes: [
+    {
+    }
+  ],
+  subgraphs: [
+    {
+      tensors: [
+        {
+          shape: [
+            1,
+            8,
+            8,
+            3
+          ],
+          name: "add"
+        },
+        {
+          shape: [
+            1,
+            8,
+            8,
+            3
+          ],
+          name: "input"
+        },
+        {
+          shape: [
+            1,
+            8,
+            8,
+            3
+          ],
+          name: "output"
+        }
+      ],
+      inputs: [
+        1
+      ],
+      outputs: [
+        2
+      ],
+      operators: [
+        {
+          inputs: [
+            1,
+            1
+          ],
+          outputs: [
+            0
+          ],
+          builtin_options_type: "AddOptions",
+          builtin_options: {
+          }
+        },
+        {
+          inputs: [
+            0,
+            1
+          ],
+          outputs: [
+            2
+          ],
+          builtin_options_type: "AddOptions",
+          builtin_options: {
+          }
+        }
+      ]
+    }
+  ],
+  buffers: [
+    {
+      data: [
+
+      ]
+    }
+  ]
+}
diff --git a/tensorflow/lite/testdata/add_quantized.bin b/tensorflow/lite/testdata/add_quantized.bin
new file mode 100644
index 0000000000000000000000000000000000000000..07d48b93eb87f9774af154b8cdb4d15ecb3e1499
Binary files /dev/null and b/tensorflow/lite/testdata/add_quantized.bin differ
diff --git a/tensorflow/lite/testdata/add_quantized.json b/tensorflow/lite/testdata/add_quantized.json
new file mode 100644
index 0000000000000000000000000000000000000000..f70ed8143e99c7ab9a59897176e278d8650502b5
--- /dev/null
+++ b/tensorflow/lite/testdata/add_quantized.json
@@ -0,0 +1,123 @@
+{
+  version: 3,
+  operator_codes: [
+    {
+    }
+  ],
+  subgraphs: [
+    {
+      tensors: [
+        {
+          shape: [
+            1,
+            8,
+            8,
+            3
+          ],
+          name: "add",
+          quantization: {
+            min: [
+              0.0
+            ],
+            max: [
+              1.0
+            ],
+            scale: [
+              0.003922
+            ],
+            zero_point: [
+              0
+            ]
+          }
+        },
+        {
+          shape: [
+            1,
+            8,
+            8,
+            3
+          ],
+          type: "UINT8",
+          name: "input",
+          quantization: {
+            min: [
+              0.0
+            ],
+            max: [
+              1.0
+            ],
+            scale: [
+              0.003922
+            ],
+            zero_point: [
+              0
+            ]
+          }
+        },
+        {
+          shape: [
+            1,
+            8,
+            8,
+            3
+          ],
+          type: "UINT8",
+          name: "output",
+          quantization: {
+            min: [
+              0.0
+            ],
+            max: [
+              1.0
+            ],
+            scale: [
+              0.003922
+            ],
+            zero_point: [
+              0
+            ]
+          }
+        }
+      ],
+      inputs: [
+        1
+      ],
+      outputs: [
+        2
+      ],
+      operators: [
+        {
+          inputs: [
+            1,
+            1
+          ],
+          outputs: [
+            0
+          ],
+          builtin_options_type: "AddOptions",
+          builtin_options: {
+          }
+        },
+        {
+          inputs: [
+            0,
+            1
+          ],
+          outputs: [
+            2
+          ],
+          builtin_options_type: "AddOptions",
+          builtin_options: {
+          }
+        }
+      ]
+    }
+  ],
+  buffers: [
+    {
+      data: [
+
+      ]
+    }
+  ]
+}
diff --git a/tensorflow/contrib/lite/testdata/empty_model.bin b/tensorflow/lite/testdata/empty_model.bin
similarity index 100%
rename from tensorflow/contrib/lite/testdata/empty_model.bin
rename to tensorflow/lite/testdata/empty_model.bin
diff --git a/tensorflow/contrib/lite/testdata/multi_add.bin b/tensorflow/lite/testdata/multi_add.bin
similarity index 100%
rename from tensorflow/contrib/lite/testdata/multi_add.bin
rename to tensorflow/lite/testdata/multi_add.bin
diff --git a/tensorflow/contrib/lite/testdata/multi_add.json b/tensorflow/lite/testdata/multi_add.json
similarity index 100%
rename from tensorflow/contrib/lite/testdata/multi_add.json
rename to tensorflow/lite/testdata/multi_add.json
diff --git a/tensorflow/contrib/lite/testdata/multi_add.pb b/tensorflow/lite/testdata/multi_add.pb
similarity index 100%
rename from tensorflow/contrib/lite/testdata/multi_add.pb
rename to tensorflow/lite/testdata/multi_add.pb
diff --git a/tensorflow/contrib/lite/testdata/multi_add_flex.bin b/tensorflow/lite/testdata/multi_add_flex.bin
similarity index 100%
rename from tensorflow/contrib/lite/testdata/multi_add_flex.bin
rename to tensorflow/lite/testdata/multi_add_flex.bin
diff --git a/tensorflow/contrib/lite/testdata/no_subgraphs.bin b/tensorflow/lite/testdata/no_subgraphs.bin
similarity index 100%
rename from tensorflow/contrib/lite/testdata/no_subgraphs.bin
rename to tensorflow/lite/testdata/no_subgraphs.bin
diff --git a/tensorflow/contrib/lite/testdata/test_model.bin b/tensorflow/lite/testdata/test_model.bin
similarity index 100%
rename from tensorflow/contrib/lite/testdata/test_model.bin
rename to tensorflow/lite/testdata/test_model.bin
diff --git a/tensorflow/contrib/lite/testdata/test_model_broken.bin b/tensorflow/lite/testdata/test_model_broken.bin
similarity index 100%
rename from tensorflow/contrib/lite/testdata/test_model_broken.bin
rename to tensorflow/lite/testdata/test_model_broken.bin
diff --git a/tensorflow/contrib/lite/testdata/test_model_broken.json b/tensorflow/lite/testdata/test_model_broken.json
similarity index 100%
rename from tensorflow/contrib/lite/testdata/test_model_broken.json
rename to tensorflow/lite/testdata/test_model_broken.json
diff --git a/tensorflow/contrib/lite/testdata/two_subgraphs.bin b/tensorflow/lite/testdata/two_subgraphs.bin
similarity index 100%
rename from tensorflow/contrib/lite/testdata/two_subgraphs.bin
rename to tensorflow/lite/testdata/two_subgraphs.bin
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..22ffed43cc0e08ac45a9a07077450d2642ba7f26
--- /dev/null
+++ b/tensorflow/lite/testing/BUILD
@@ -0,0 +1,392 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/lite:build_def.bzl",
+    "gen_zip_test",
+    "generated_test_models_all",
+)
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "py_test",
+)
+
+[gen_zip_test(
+    name = "zip_test_%s" % test_name,
+    size = "large",
+    srcs = ["generated_examples_zip_test.cc"],
+    args = args + select({
+        "//tensorflow:android": [],
+        "//conditions:default": [
+            "--zip_file_path=$(location :zip_%s)" % test_name,
+            # TODO(angerson) We may be able to add an external unzip binary instead
+            # of relying on an existing one for OSS builds.
+            "--unzip_binary_path=/usr/bin/unzip",
+        ],
+    }),
+    conversion_mode = conversion_mode,
+    data = [
+        ":zip_%s" % test_name,
+    ],
+    shard_count = 20,
+    tags = tags + [
+        "gen_zip_test",
+        "tflite_not_portable_intentional",
+    ],
+    test_name = test_name,
+    deps = [
+        ":parse_testdata_lib",
+        ":tflite_driver",
+        ":util",
+        "@com_google_googletest//:gtest",
+        "@com_googlesource_code_re2//:re2",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ] + select({
+        "//conditions:default": [
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:test",
+        ],
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+            "//tensorflow/core:android_tensorflow_test_lib",
+        ],
+    }),
+) for conversion_mode, test_name, tags, args in generated_test_models_all()]
+
+test_suite(
+    name = "generated_zip_tests",
+    tags = [
+        "gen_zip_test",
+    ],
+)
+
+py_binary(
+    name = "generate_examples",
+    srcs = ["generate_examples.py"],
+    data = [
+        "//tensorflow/lite/toco",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":generate_examples_report",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:graph_util",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "generate_examples_report",
+    srcs = ["generate_examples_report.py"],
+    srcs_version = "PY2AND3",
+)
+
+cc_library(
+    name = "parse_testdata_lib",
+    srcs = ["parse_testdata.cc"],
+    hdrs = ["parse_testdata.h"],
+    deps = [
+        ":message",
+        ":split",
+        ":test_runner",
+        "//tensorflow/lite:framework",
+    ],
+)
+
+cc_library(
+    name = "message",
+    srcs = ["message.cc"],
+    hdrs = ["message.h"],
+    deps = [":tokenize"],
+)
+
+cc_test(
+    name = "message_test",
+    srcs = ["message_test.cc"],
+    deps = [
+        ":message",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "split",
+    srcs = ["split.cc"],
+    hdrs = ["split.h"],
+    deps = [
+        "//tensorflow/lite:string",
+    ],
+)
+
+cc_test(
+    name = "split_test",
+    size = "small",
+    srcs = ["split_test.cc"],
+    deps = [
+        ":split",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "join",
+    hdrs = ["join.h"],
+    deps = ["//tensorflow/lite:string"],
+)
+
+cc_test(
+    name = "join_test",
+    size = "small",
+    srcs = ["join_test.cc"],
+    deps = [
+        ":join",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "tflite_driver",
+    srcs = ["tflite_driver.cc"],
+    hdrs = ["tflite_driver.h"],
+    deps = [
+        ":split",
+        ":test_runner",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/delegates/flex:delegate",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "tflite_driver_test",
+    size = "small",
+    srcs = ["tflite_driver_test.cc"],
+    data = ["//tensorflow/lite:testdata/multi_add.bin"],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":tflite_driver",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "tokenize",
+    srcs = ["tokenize.cc"],
+    hdrs = ["tokenize.h"],
+    deps = [
+        "//tensorflow/lite:string",
+    ],
+)
+
+cc_test(
+    name = "tokenize_test",
+    srcs = ["tokenize_test.cc"],
+    deps = [
+        ":tokenize",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "test_runner",
+    hdrs = ["test_runner.h"],
+    deps = [
+        "//tensorflow/lite:string",
+    ],
+)
+
+cc_library(
+    name = "util",
+    hdrs = ["util.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/core/api",
+    ],
+)
+
+cc_test(
+    name = "test_runner_test",
+    srcs = ["test_runner_test.cc"],
+    deps = [
+        ":test_runner",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_binary(
+    name = "nnapi_example",
+    srcs = ["nnapi_example.cc"],
+    deps = [
+        ":parse_testdata_lib",
+        ":tflite_driver",
+        "//tensorflow/lite/nnapi:nnapi_lib",
+    ],
+)
+
+cc_library(
+    name = "tf_driver",
+    srcs = ["tf_driver.cc"],
+    hdrs = ["tf_driver.h"],
+    deps = [
+        ":join",
+        ":split",
+        ":test_runner",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/lite:string_util",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_test(
+    name = "tf_driver_test",
+    size = "small",
+    srcs = ["tf_driver_test.cc"],
+    data = ["//tensorflow/lite:testdata/multi_add.pb"],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":tf_driver",
+        "//tensorflow/lite:string_util",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "generate_testspec",
+    srcs = ["generate_testspec.cc"],
+    hdrs = ["generate_testspec.h"],
+    deps = [
+        ":join",
+        ":split",
+        ":tf_driver",
+        "//tensorflow/core:framework",
+        "//tensorflow/lite:string",
+    ],
+)
+
+cc_test(
+    name = "generate_testspec_test",
+    size = "small",
+    srcs = ["generate_testspec_test.cc"],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":generate_testspec",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "init_tensorflow",
+    srcs = [
+        "init_tensorflow.cc",
+    ],
+    hdrs = [
+        "init_tensorflow.h",
+    ],
+    visibility = [
+        "//tensorflow/lite/java/src/main/native:__subpackages__",
+        "//tensorflow/lite/testing:__subpackages__",
+        "//tensorflow/lite/tools/benchmark:__subpackages__",
+    ],
+    deps = select({
+        "//conditions:default": [
+            "//tensorflow/core:lib",
+        ],
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+    }),
+)
+
+cc_library(
+    name = "tflite_diff_util",
+    srcs = ["tflite_diff_util.cc"],
+    hdrs = ["tflite_diff_util.h"],
+    deps = [
+        ":generate_testspec",
+        ":parse_testdata_lib",
+        ":tflite_driver",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string",
+    ],
+)
+
+cc_library(
+    name = "tflite_diff_flags",
+    hdrs = ["tflite_diff_flags.h"],
+    deps = [
+        ":split",
+        ":tflite_diff_util",
+    ] + select({
+        "//conditions:default": [
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+        ],
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "tflite_diff_example_test",
+    size = "medium",
+    srcs = ["tflite_diff_example_test.cc"],
+    args = [
+        "--tensorflow_model=third_party/tensorflow/lite/testdata/multi_add.pb",
+        "--tflite_model=third_party/tensorflow/lite/testdata/multi_add.bin",
+        "--input_layer=a,b,c,d",
+        "--input_layer_type=float,float,float,float",
+        "--input_layer_shape=1,3,4,3:1,3,4,3:1,3,4,3:1,3,4,3",
+        "--output_layer=x,y",
+    ],
+    data = [
+        "//tensorflow/lite:testdata/multi_add.bin",
+        "//tensorflow/lite:testdata/multi_add.pb",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_oss",  # needs test data
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":init_tensorflow",
+        ":tflite_diff_flags",
+        ":tflite_diff_util",
+    ],
+)
+
+cc_binary(
+    name = "tflite_diff",
+    srcs = ["tflite_diff_example_test.cc"],
+    deps = [
+        ":init_tensorflow",
+        ":tflite_diff_flags",
+        ":tflite_diff_util",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py
similarity index 87%
rename from tensorflow/contrib/lite/testing/generate_examples.py
rename to tensorflow/lite/testing/generate_examples.py
index 3f2255c4548f641c188fcf8a7a35a8a784e44286..dd7b3d07456fbd9943e9f45b815e6015f4973a94 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/lite/testing/generate_examples.py
@@ -19,7 +19,7 @@ Usage:
 
 generate_examples <output directory>
 
-bazel run //tensorflow/contrib/lite/testing:generate_examples
+bazel run //tensorflow/lite/testing:generate_examples
 
 To more easily debug failures use (or override) the --save_graphdefs flag to
 place text proto graphdefs into the generated zip files.
@@ -51,7 +51,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 import tensorflow as tf
 from google.protobuf import text_format
 # TODO(aselle): switch to TensorFlow's resource_loader
-from tensorflow.contrib.lite.testing import generate_examples_report as report_lib
+from tensorflow.lite.testing import generate_examples_report as report_lib
 from tensorflow.python.framework import graph_util as tf_graph_util
 from tensorflow.python.ops import rnn
 
@@ -103,8 +103,6 @@ KNOWN_BUGS = {
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
     # Div will use floordiv.
     r"div.*int32": "72051395",
-    # No support for SplitV
-    r"split.*num_or_size_splits=\[2,2\]": "73377559",
 }
 
 
@@ -343,7 +341,7 @@ def toco_convert(graph_def_str, input_tensors, output_tensors,
       opts = ("--input_arrays={0} --output_arrays={1}".format(
           ",".join(input_arrays), ",".join(output_tensors)))
     elif FLAGS.run_with_flex:
-      opts += " --allow_flex_ops --force_flex_ops"
+      opts += " --enable_select_tf_ops --force_select_tf_ops"
     cmd = ("%s --input_file=%s --output_file=%s %s > %s 2>&1" %
            (bin_path, graphdef_file.name, output_file.name, opts,
             stdout_file.name))
@@ -370,7 +368,8 @@ def make_zip_of_tests(zip_path,
                       make_graph,
                       make_test_inputs,
                       extra_toco_options=ExtraTocoOptions(),
-                      use_frozen_graph=False):
+                      use_frozen_graph=False,
+                      expected_tf_success=None):
   """Helper to make a zip file of a bunch of TensorFlow models.
 
   This does a cartestian product of the dictionary of test_parameters and
@@ -390,6 +389,8 @@ def make_zip_of_tests(zip_path,
       `output_tensors` and returns tuple `(input_values, output_values)`.
     extra_toco_options: Additional toco options.
     use_frozen_graph: Whether or not freeze graph before toco converter.
+    expected_tf_success: Number of times tensorflow is supposed to succeed in
+      executing the input graphs. `None` means "unknown".
 
   Raises:
     RuntimeError: if there are toco errors that can't be ignored.
@@ -550,6 +551,11 @@ def make_zip_of_tests(zip_path,
                    " and %d TOCO converted graphs (%.1f%%"), zip_path,
                   total_conversions, tf_success, toco_success, percent)
 
+  if expected_tf_success is not None and tf_success != expected_tf_success:
+    raise RuntimeError(
+        "Expected TF to succeed %d times, but that happened %d times" %
+        (expected_tf_success, tf_success))
+
   if not FLAGS.ignore_toco_errors and toco_errors > 0:
     raise RuntimeError(
         "Found %d errors while generating toco models" % toco_errors)
@@ -616,6 +622,30 @@ def make_max_pool_tests(zip_path):
   make_pool_tests(tf.nn.max_pool)(zip_path)
 
 
+def make_abs_tests(zip_path):
+  """Make a set of tests to do relu."""
+
+  # Chose a set of parameters
+  test_parameters = [{
+      "input_shape": [[], [1], [2, 3], [1, 1, 1, 1], [1, 3, 4, 3],
+                      [3, 15, 14, 3], [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.abs(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-10, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_relu_tests(zip_path):
   """Make a set of tests to do relu."""
 
@@ -747,6 +777,34 @@ def make_prelu_tests(zip_path):
       use_frozen_graph=True)
 
 
+def make_leaky_relu_tests(zip_path):
+  """Make a set of tests to do LeakyRelu."""
+
+  test_parameters = [
+      {
+          "input_shape": [[], [1], [5], [1, 10, 10, 3], [3, 3, 3, 3]],
+          "alpha": [0.1, 1.0, 2.0, -0.1, -1.0, -2.0],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.nn.leaky_relu(input_tensor, alpha=parameters["alpha"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build the inputs for the test case."""
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-3, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # This function tests various TensorFLow functions that generates Const op,
 # including `tf.ones`, `tf.zeros` and random functions.
 def make_constant_tests(zip_path):
@@ -755,6 +813,7 @@ def make_constant_tests(zip_path):
   test_parameters = [{
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
+      "constant_is_also_output": [True, False],
   }]
 
   def build_graph(parameters):
@@ -764,49 +823,63 @@ def make_constant_tests(zip_path):
         shape=parameters["input_shape"])
     constant = tf.constant(
         create_tensor_data(parameters["dtype"], parameters["input_shape"]))
-    # This maximum node is here to avoid the situation where a graph output is
-    # a constant, which is an error in toco.
-    out = tf.maximum(dummy_input, constant)
-    return [dummy_input], [out]
+    out = [tf.maximum(dummy_input, constant)]
+    if parameters["constant_is_also_output"]:
+      out.append(constant)
+
+    return [dummy_input], out
 
   def build_inputs(parameters, sess, inputs, outputs):
     dummy_input = np.zeros(
         parameters["input_shape"], dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
     return [dummy_input], sess.run(outputs, feed_dict={inputs[0]: dummy_input})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
+                    expected_tf_success=20)
 
 
 def make_binary_op_tests(zip_path, binary_operator):
   """Make a set of tests to do binary ops with and without broadcast."""
 
-  # These parameters are split because we don't support broadcasting.
-  test_parameters = [{
-      "dtype": [tf.float32, tf.int32],
-      "input_shape_1": [[1, 3, 4, 3]],
-      "input_shape_2": [[1, 3, 4, 3]],
-      "activation": [True]
-  }, {
-      "dtype": [tf.float32],
-      "input_shape_1": [[5]],
-      "input_shape_2": [[5]],
-      "activation": [False, True]
-  }, {
-      "dtype": [tf.float32, tf.int32],
-      "input_shape_1": [[1, 3, 4, 3]],
-      "input_shape_2": [[3]],
-      "activation": [True, False]
-  }, {
-      "dtype": [tf.float32, tf.int32],
-      "input_shape_1": [[3]],
-      "input_shape_2": [[1, 3, 4, 3]],
-      "activation": [True, False]
-  }, {
-      "dtype": [tf.float32],
-      "input_shape_1": [[]],
-      "input_shape_2": [[]],
-      "activation": [False]
-  }]
+  test_parameters = [
+      # Avoid creating all combinations to keep the test size small.
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape_1": [[1, 3, 4, 3]],
+          "input_shape_2": [[1, 3, 4, 3]],
+          "activation": [True],
+      },
+      {
+          "dtype": [tf.float32],
+          "input_shape_1": [[5]],
+          "input_shape_2": [[5]],
+          "activation": [False, True],
+      },
+      {
+          "dtype": [tf.float32, tf.int32, tf.int64],
+          "input_shape_1": [[1, 3, 4, 3]],
+          "input_shape_2": [[3]],
+          "activation": [True, False],
+      },
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape_1": [[3]],
+          "input_shape_2": [[1, 3, 4, 3]],
+          "activation": [True, False],
+      },
+      {
+          "dtype": [tf.float32],
+          "input_shape_1": [[]],
+          "input_shape_2": [[]],
+          "activation": [False],
+      },
+      {
+          "dtype": [tf.float32],
+          "input_shape_1": [[0]],
+          "input_shape_2": [[1]],
+          "activation": [False],
+      }
+  ]
 
   def build_graph(parameters):
     """Builds the graph given the current parameters."""
@@ -857,34 +930,46 @@ def make_reduce_tests(reduce_op,
   def f(zip_path):
     """Actual function that generates examples."""
 
-    test_parameters = [{
-        "input_dtype": [tf.float32, tf.int32, tf.int64],
-        "input_shape": [[3, 2, 4]],
-        "axis": [
-            0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
-            [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0],
-            [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
-        ],
-        "const_axis": [True, False],
-        "keepdims": [True, False],
-    }, {
-        "input_dtype": [tf.float32],
-        "input_shape": [[1, 8, 8, 3]],
-        "axis": [
-            0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
-            [3, 2, 1, 0], [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2,
-            -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
-            [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
-        ],
-        "const_axis": [True, False],
-        "keepdims": [True, False],
-    }, {
-        "input_dtype": [tf.float32],
-        "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
-        "axis": [None],
-        "const_axis": [True],
-        "keepdims": [True, False],
-    }]
+    test_parameters = [
+        {
+            "input_dtype": [tf.float32, tf.int32, tf.int64],
+            "input_shape": [[3, 3, 2, 4]],
+            "axis": [
+                0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
+                [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1],
+                [-1, 0], [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
+            ],
+            "const_axis": [True, False],
+            "keepdims": [True, False],
+        },
+        {
+            "input_dtype": [tf.float32],
+            "input_shape": [[1, 8, 8, 3]],
+            "axis": [
+                0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2,
+                                                        3], [3, 2, 1, 0],
+                [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2, -3, -4,
+                [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
+                [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
+            ],
+            "const_axis": [True, False],
+            "keepdims": [True, False],
+        },
+        {
+            "input_dtype": [tf.float32],
+            "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
+            "axis": [[]],  # shape is: [0]
+            "const_axis": [False],
+            "keepdims": [True, False],
+        },
+        {
+            "input_dtype": [tf.float32],
+            "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
+            "axis": [None],  # shape is: []
+            "const_axis": [True],
+            "keepdims": [True, False],
+        }
+    ]
 
     def build_graph(parameters):
       """Build the mean op testing graph."""
@@ -1119,6 +1204,14 @@ def make_floor_div_tests(zip_path):
   make_binary_op_tests(zip_path, tf.floor_div)
 
 
+def make_floor_mod_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.floormod)
+
+
+def make_squared_difference_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.squared_difference)
+
+
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
@@ -1126,11 +1219,11 @@ def make_gather_tests(zip_path):
       # TODO(mgubin): add string tests when they are supported by Toco.
       # TODO(mgubin): add tests for Nd indices when they are supported by
       # TfLite.
-      "params_dtype": [tf.float32, tf.int32],
+      "params_dtype": [tf.float32, tf.int32, tf.int64],
       "params_shape": [[10], [1, 2, 20]],
-      "indices_dtype": [tf.int32],
+      "indices_dtype": [tf.int32, tf.int64],
       "indices_shape": [[3], [5]],
-      "axis": [0, 1],
+      "axis": [-1, 0, 1],
   }]
 
   def build_graph(parameters):
@@ -1143,7 +1236,8 @@ def make_gather_tests(zip_path):
         dtype=parameters["indices_dtype"],
         name="indices",
         shape=parameters["indices_shape"])
-    out = tf.gather(params, indices, axis=parameters["axis"])
+    axis = min(len(parameters["params_shape"]), parameters["axis"])
+    out = tf.gather(params, indices, axis=axis)
     return [params, indices], [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
@@ -1155,7 +1249,43 @@ def make_gather_tests(zip_path):
     return [params, indices], sess.run(
         outputs, feed_dict=dict(zip(inputs, [params, indices])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  # Note that TF can't execute with index=1 and params_shape=[10].
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_success=60)
+
+
+def make_gather_with_constant_tests(zip_path):
+  """Make a set of test which feed a constant to gather toco."""
+
+  test_parameters = [{
+      "input_shape": [[3]],
+      "reference_shape": [[2]],
+  }, {
+      "input_shape": [[2, 3]],
+      "reference_shape": [[2, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build a graph where the inputs to Gather are constants."""
+    reference = tf.placeholder(
+        dtype=tf.int32, shape=parameters["reference_shape"])
+    gather_input = tf.constant(
+        create_tensor_data(tf.int32, parameters["input_shape"]))
+    gather_indices = tf.constant([0, 1], tf.int32)
+    out = tf.equal(reference, tf.gather(gather_input, gather_indices))
+    return [reference], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    reference_values = np.zeros(parameters["reference_shape"], dtype=np.int32)
+    return [reference_values], sess.run(
+        outputs, feed_dict={inputs[0]: reference_values})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
+                    expected_tf_success=2)
 
 
 def make_global_batch_norm_tests(zip_path):
@@ -1323,23 +1453,27 @@ def make_conv_with_shared_weights_tests(zip_path):
     input_shape, filter_shape = get_tensor_shapes(parameters)
     input_tensor = tf.placeholder(
         dtype=tf.float32, name="input", shape=input_shape)
+    input_tensors = [input_tensor]
 
     # Construct a constant weights tensor which will be used by both Conv2D.
     filter_tensor = tf.constant(
         create_tensor_data(np.float32, filter_shape), dtype=tf.float32)
-    input_tensors = [input_tensor]
+
+    # Ensure that FuseBinaryIntoFollowingAffine works with an input which
+    # is shared by multiple affine ops.
+    conv_input = input_tensor + 0.1
 
     # Construct 2 Conv2D operations which use exactly the same input and
     # weights.
     result1 = tf.nn.conv2d(
-        input_tensor,
+        conv_input,
         filter_tensor,
         strides=parameters["strides"],
         dilations=parameters["dilations"],
         padding=parameters["padding"],
         data_format=parameters["data_format"])
     result2 = tf.nn.conv2d(
-        input_tensor,
+        conv_input,
         filter_tensor,
         strides=parameters["strides"],
         dilations=parameters["dilations"],
@@ -1507,7 +1641,7 @@ def make_split_tests(zip_path):
 
   test_parameters = [{
       "input_shape": [[1, 3, 4, 6], [2, 4, 1], [6, 4], [8]],
-      "num_or_size_splits": [1, 2, 3, 4, 5, [2, 2]],
+      "num_or_size_splits": [1, 2, 3, 4, 5],
       "axis": [0, 1, 2, 3, -4, -3, -2, -1],
   }]
 
@@ -1525,6 +1659,29 @@ def make_split_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_splitv_tests(zip_path):
+  """Make a set of tests to do tf.split_v."""
+
+  test_parameters = [{
+      "input_shape": [[1, 3, 4, 6], [2, 4, 1], [6, 4], [8]],
+      "size_splits": [[2, 2], [1, 3], [4, 2], [5, 3],
+                      [-1, 1], [-1, 2], [-1, 4]],
+      "axis": [0, 1, 2, 3, -4, -3, -2, -1],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.split(input_tensor, parameters["size_splits"], parameters["axis"])
+    return [input_tensor], [out[0]]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [create_tensor_data(np.float32, parameters["input_shape"])]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_concat_tests(zip_path):
   """Make a set of tests to do concatenation."""
 
@@ -1966,6 +2123,36 @@ def make_resize_bilinear_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_resize_nearest_neighbor_tests(zip_path):
+  """Make a set of tests to do resize_nearest_neighbor."""
+
+  test_parameters = [{
+      "dtype": [tf.float32, tf.int32],
+      "input_shape": [[1, 3, 4, 3], [1, 10, 2, 1]],
+      "size": [[1, 1], [4, 3], [2, 2], [5, 6]],
+      "align_corners": [False],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    out = tf.image.resize_nearest_neighbor(
+        input_tensor,
+        size=parameters["size"],
+        align_corners=parameters["align_corners"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_sigmoid_tests(zip_path):
   """Make a set of tests to do sigmoid."""
 
@@ -2421,6 +2608,32 @@ def make_strided_slice_1d_exhaustive_tests(zip_path):
   _make_strided_slice_tests(zip_path, test_parameters)
 
 
+def make_strided_slice_buggy_tests(zip_path):
+  """Make a set of tests to show strided_slice yields incorrect results."""
+
+  test_parameters = [{
+      "unused_iteration_counter": [1],
+  }]
+
+  def build_graph(parameters):
+    """Build the strided_slice op testing graph."""
+    del parameters
+    input_values = tf.placeholder(dtype=tf.float32, shape=[4, 2])
+    data = tf.constant([[0, 1, 2, 3],
+                        [4, 5, 6, 7],
+                        [8, 9, 10, 11],
+                        [12, 13, 14, 15]], tf.float32)
+    return [input_values], [input_values + data[:, :2]]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    del parameters
+    input_values = np.zeros([4, 2], dtype=np.float32)
+    return [input_values], sess.run(
+        outputs, feed_dict={inputs[0]: input_values})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_lstm_tests(zip_path):
   """Make a set of tests to do basic Lstm cell."""
 
@@ -2552,7 +2765,6 @@ def make_arg_min_max_tests(zip_path):
       "input_dtype": [tf.float32, tf.int32],
       "input_shape": [[], [1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
       "output_type": [tf.int32, tf.int64],
-      "axis_is_last_dim": [True, False],
       "is_arg_max": [True],
   }]
 
@@ -2562,10 +2774,7 @@ def make_arg_min_max_tests(zip_path):
         dtype=parameters["input_dtype"],
         name="input",
         shape=parameters["input_shape"])
-    if parameters["axis_is_last_dim"]:
-      axis = len(parameters["input_shape"]) - 1
-    else:
-      axis = random.randint(0, max(len(parameters["input_shape"]) - 2, 0))
+    axis = random.randint(0, max(len(parameters["input_shape"]) - 1, 0))
     if parameters["is_arg_max"]:
       out = tf.arg_max(input_value, axis, output_type=parameters["output_type"])
     else:
@@ -3078,7 +3287,7 @@ def make_transpose_conv_tests(zip_path):
 def make_tile_tests(zip_path):
   """Make a set of tests to do tile."""
   test_parameters = [{
-      "input_dtype": [tf.float32, tf.int32],
+      "input_dtype": [tf.float32, tf.int32, tf.bool],
       "input_shape": [[3, 2, 1], [2, 2, 2]],
       "multiplier_dtype": [tf.int32, tf.int64],
       "multiplier_shape": [[3]]
@@ -3100,8 +3309,10 @@ def make_tile_tests(zip_path):
   def build_inputs(parameters, sess, inputs, outputs):
     input_value = create_tensor_data(parameters["input_dtype"],
                                      parameters["input_shape"])
-    multipliers_value = create_tensor_data(parameters["multiplier_dtype"],
-                                           parameters["multiplier_shape"])
+    multipliers_value = create_tensor_data(
+        parameters["multiplier_dtype"],
+        parameters["multiplier_shape"],
+        min_value=0)
     return [input_value, multipliers_value], sess.run(
         outputs,
         feed_dict={
@@ -3206,12 +3417,30 @@ def make_sparse_to_dense_tests(zip_path):
 def make_pack_tests(zip_path):
   """Make a set of tests to do stack."""
 
-  test_parameters = [{
-      "base_shape": [[3, 4, 3], [3, 4], [5]],
-      "num_tensors": [1, 2, 3, 4, 5, 6],
-      "axis": [0, 1, 2, 3],
-      "additional_shape": [1, 2, 3],
-  }]
+  test_parameters = [
+      # Avoid creating all combinations to keep the test size small.
+      {
+          "dtype": [tf.float32],
+          "base_shape": [[3, 4, 3], [3, 4], [5]],
+          "num_tensors": [1, 2, 3, 4, 5, 6],
+          "axis": [0, 1, 2, 3],
+          "additional_shape": [1, 2, 3],
+      },
+      {
+          "dtype": [tf.int32],
+          "base_shape": [[3, 4, 3], [3, 4], [5]],
+          "num_tensors": [6],
+          "axis": [0, 1, 2, 3],
+          "additional_shape": [1, 2, 3],
+      },
+      {
+          "dtype": [tf.int64],
+          "base_shape": [[3, 4, 3], [3, 4], [5]],
+          "num_tensors": [5],
+          "axis": [0, 1, 2, 3],
+          "additional_shape": [1, 2, 3],
+      }
+  ]
 
   def get_shape(parameters):
     """Return a tweaked version of 'base_shape'."""
@@ -3225,7 +3454,9 @@ def make_pack_tests(zip_path):
     all_tensors = []
     for n in range(0, parameters["num_tensors"]):
       input_tensor = tf.placeholder(
-          dtype=tf.float32, name=("input%d" % n), shape=get_shape(parameters))
+          dtype=parameters["dtype"],
+          name=("input%d" % n),
+          shape=get_shape(parameters))
       all_tensors.append(input_tensor)
     out = tf.stack(all_tensors, parameters["axis"])
     return all_tensors, [out]
@@ -3271,6 +3502,67 @@ def make_unpack_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_range_tests(zip_path):
+  """Make a set of tests to do range."""
+
+  test_parameters = [{
+      "dtype": [tf.int32],
+      "offset": [10, 100, 1000],
+      "delta": [1, 2, 3, 4, -1, -2, -3, -4],
+  }]
+
+  def build_graph(parameters):
+    """Build the range op testing graph."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["dtype"], name=("start"), shape=[])
+    if parameters["delta"] < 0:
+      offset = parameters["offset"] * -1
+    else:
+      offset = parameters["offset"]
+    delta = parameters["delta"]
+    limit_tensor = input_tensor + offset
+    delta_tensor = tf.constant(delta, dtype=tf.int32)
+    out = tf.range(input_tensor, limit_tensor, delta_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_scalar_data(parameters["dtype"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_fill_tests(zip_path):
+  """Make a set of tests to do fill."""
+
+  test_parameters = [{
+      "dims_dtype": [tf.int32, tf.int64],
+      "dims_shape": [[], [1], [3], [3, 3]],
+      "value_dtype": [tf.int32, tf.int64, tf.float32],
+  }]
+
+  def build_graph(parameters):
+    """Build the fill op testing graph."""
+    input1 = tf.placeholder(
+        dtype=parameters["dims_dtype"],
+        name="dims",
+        shape=parameters["dims_shape"])
+    input2 = tf.placeholder(
+        dtype=parameters["value_dtype"], name="value", shape=[])
+    out = tf.fill(input1, input2)
+    return [input1, input2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input1 = create_tensor_data(parameters["dims_dtype"],
+                                parameters["dims_shape"], 1)
+    input2 = create_scalar_data(parameters["value_dtype"])
+    return [input1, input2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input1, input2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def _make_logical_tests(op):
   """Make a set of tests to do logical operations."""
 
@@ -3322,6 +3614,141 @@ def make_logical_xor_tests(zip_path):
   return _make_logical_tests(tf.logical_xor)(zip_path)
 
 
+def make_mirror_pad_tests(zip_path):
+  """Make a set of tests to do mirror_pad."""
+
+  test_parameters = [
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [2, 1]]],
+          "mode": ["REFLECT"],
+          "type": ["const"]
+      },
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [1, 1]]],
+          "mode": ["REFLECT"],
+          "type": ["const"]
+      },
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [2, 1]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["placeholder"]
+      },
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [2, 1]]],
+          "mode": ["REFLECT"],
+          "type": ["placeholder"]
+      },
+      {
+          "input_shape": [[3]],
+          "padding_matrix": [[[0, 2]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["placeholder"]
+      },
+      {
+          "input_shape": [[3]],
+          "padding_matrix": [[[0, 2]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["const"]
+      },
+      {
+          "input_shape": [[3]],
+          "padding_matrix": [[[0, 2]]],
+          "mode": ["REFLECT"],
+          "type": ["const"]
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.int32, name="input", shape=parameters["input_shape"])
+    if parameters["type"] != "const":
+      padding_matrix = tf.placeholder(
+          dtype=tf.int32,
+          name="padding",
+          shape=[len(parameters["input_shape"]), 2])
+      input_tensors = [input_tensor, padding_matrix]
+    else:
+      padding_matrix = tf.constant(np.array(parameters["padding_matrix"]))
+      input_tensors = [input_tensor]
+    output = tf.pad(
+        input_tensor, paddings=padding_matrix, mode=parameters["mode"])
+
+    return input_tensors, [output]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = [create_tensor_data(tf.int32, parameters["input_shape"])]
+    if parameters["type"] != "const":
+      input_values.append(np.array(parameters["padding_matrix"]))
+    return input_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, input_values)))
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_success=7)
+
+
+def make_unroll_batch_matmul_tests(zip_path):
+  """Make a set of tests to test unroll_batch_matmul."""
+
+  test_parameters = [{"dtype": [tf.float32], "shape": [[(2, 2, 3), (2, 3, 2)]]}]
+
+  def build_graph(parameters):
+    """Build the batch_matmul op testing graph."""
+    input_tensor1 = tf.placeholder(
+        dtype=parameters["dtype"], shape=parameters["shape"][0])
+    input_tensor2 = tf.placeholder(
+        dtype=parameters["dtype"], shape=parameters["shape"][1])
+    # Should be unrolled and replaced with fully_connected ops in the end.
+    out = tf.matmul(input_tensor1, input_tensor2)
+    return [input_tensor1, input_tensor2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(
+        parameters["dtype"], shape=parameters["shape"][0])
+    input_value2 = create_tensor_data(
+        parameters["dtype"], shape=parameters["shape"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_placeholder_with_default_tests(zip_path):
+  """Make a set of tests to test placeholder_with_default."""
+
+  test_parameters = [{
+      "dtype": [tf.float32, tf.int32, tf.int64],
+  }]
+
+  def build_graph(parameters):
+    """Build the placeholder_with_default testing graph."""
+    const_node = tf.constant(
+        [1, 2, 2, 0], shape=[2, 2], dtype=parameters["dtype"])
+    input_tensor = tf.placeholder_with_default(
+        const_node, shape=[2, 2], name="input")
+    out = tf.equal(input_tensor, const_node, name="output")
+
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    numpy_type = _TF_TYPE_INFO[parameters["dtype"]][0]
+    input_value = np.array([[1, 0], [2, 1]], numpy_type)
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
+                    expected_tf_success=3)
+
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/contrib/lite/testing/generate_examples_report.py b/tensorflow/lite/testing/generate_examples_report.py
similarity index 100%
rename from tensorflow/contrib/lite/testing/generate_examples_report.py
rename to tensorflow/lite/testing/generate_examples_report.py
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/lite/testing/generate_testspec.cc
similarity index 95%
rename from tensorflow/contrib/lite/testing/generate_testspec.cc
rename to tensorflow/lite/testing/generate_testspec.cc
index 62cbeccd3315f2a51be73c3488e76444ddd0c927..74e4d2549830f404421e17f111488a2d181d5888 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.cc
+++ b/tensorflow/lite/testing/generate_testspec.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #include <iostream>
 
-#include "tensorflow/contrib/lite/testing/generate_testspec.h"
-#include "tensorflow/contrib/lite/testing/join.h"
-#include "tensorflow/contrib/lite/testing/split.h"
-#include "tensorflow/contrib/lite/testing/tf_driver.h"
+#include "tensorflow/lite/testing/generate_testspec.h"
+#include "tensorflow/lite/testing/join.h"
+#include "tensorflow/lite/testing/split.h"
+#include "tensorflow/lite/testing/tf_driver.h"
 #include "tensorflow/core/framework/types.h"
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.h b/tensorflow/lite/testing/generate_testspec.h
similarity index 91%
rename from tensorflow/contrib/lite/testing/generate_testspec.h
rename to tensorflow/lite/testing/generate_testspec.h
index b3d0db31c01a8cb1b8f34ff6dbb00c77de29b131..bda636f2c8081f7b2d0d1062ce899fde35c414a4 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.h
+++ b/tensorflow/lite/testing/generate_testspec.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_GENERATE_TESTSPEC_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_GENERATE_TESTSPEC_H_
+#ifndef TENSORFLOW_LITE_TESTING_GENERATE_TESTSPEC_H_
+#define TENSORFLOW_LITE_TESTING_GENERATE_TESTSPEC_H_
 
 #include <functional>
 #include <iostream>
 #include <vector>
 
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace testing {
@@ -65,4 +65,4 @@ std::vector<T> GenerateRandomTensor(const std::vector<int>& shape,
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_GENERATE_TESTSPEC_H_
+#endif  // TENSORFLOW_LITE_TESTING_GENERATE_TESTSPEC_H_
diff --git a/tensorflow/contrib/lite/testing/generate_testspec_test.cc b/tensorflow/lite/testing/generate_testspec_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/testing/generate_testspec_test.cc
rename to tensorflow/lite/testing/generate_testspec_test.cc
index 2a97b757a413246c9ad9b5f453741b13e381c903..4450da289d2e33ac0dd32dff7e5372afc6764db5 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec_test.cc
+++ b/tensorflow/lite/testing/generate_testspec_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/testing/generate_testspec.h"
+#include "tensorflow/lite/testing/generate_testspec.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc
similarity index 94%
rename from tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
rename to tensorflow/lite/testing/generated_examples_zip_test.cc
index 349aa5a3b4c2bbc0072136ca5d978b23a237430a..a9a31ad088e6f4b0297ba313c585abbe6189728b 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/lite/testing/generated_examples_zip_test.cc
@@ -20,9 +20,9 @@ limitations under the License.
 #include <sstream>
 #include <gtest/gtest.h>
 #include "re2/re2.h"
-#include "tensorflow/contrib/lite/testing/parse_testdata.h"
-#include "tensorflow/contrib/lite/testing/tflite_driver.h"
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/testing/parse_testdata.h"
+#include "tensorflow/lite/testing/tflite_driver.h"
+#include "tensorflow/lite/testing/util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/subprocess.h"
@@ -85,17 +85,6 @@ std::map<string, string> kBrokenTests = {
     // Transpose only supports 1D-4D input tensors.
     {R"(^\/transpose.*input_shape=\[.,.,.,.,.\])", "71545879"},
 
-    // No support for axis!=0 in GatherV2.
-    {R"(^\/gather.*axis=1)", "76910444"},
-
-    // No support for arbitrary dimensions in ArgMax.
-    {R"(^\/arg_min_max.*axis_is_last_dim=False.*input_shape=\[.,.,.,.\])",
-     "77546240"},
-    {R"(^\/arg_min_max.*axis_is_last_dim=False.*input_shape=\[.,.,.\])",
-     "77546240"},
-    {R"(^\/arg_min_max.*axis_is_last_dim=False.*input_shape=\[.,.\])",
-     "77546240"},
-
     // No Support for float.
     {R"(^\/floor_div.*dtype=tf\.float32)", "112859002"},
 
@@ -104,6 +93,18 @@ std::map<string, string> kBrokenTests = {
     // activation=True. The tests are failing since Relu doesn't support int32.
     {R"(^\/div.*activation=True.*dtype=tf\.int32)", "112968789"},
     {R"(^\/floor_div.*activation=True.*dtype=tf\.int32)", "112968789"},
+    {R"(^\/floor_mod.*activation=True.*dtype=tf\.int32)", "112968789"},
+    {R"(^\/floor_mod.*activation=True.*dtype=tf\.int64)", "112968789"},
+
+    {R"(^\/sub.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/div.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/mul.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/add.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/floor_div.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/squared_difference.*dtype=tf\.int64)", "119126484"},
+
+    // Strided Slice chooses the wrong dimension.
+    {R"(^\/strided_slice_buggy)", "119786029"},
 };
 
 // Allows test data to be unarchived into a temporary directory and makes
@@ -204,7 +205,7 @@ tensorflow::Status ReadManifest(const string& original_file, const string& dir,
   }
   if (!added) {
     string message = "Test had no examples: " + original_file;
-    return tensorflow::Status(tensorflow::error::UNKNOWN, message.c_str());
+    return tensorflow::Status(tensorflow::error::UNKNOWN, message);
   }
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/lite/testing/init_tensorflow.cc b/tensorflow/lite/testing/init_tensorflow.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ed4d12374489ed33b9dd6edec5c60865d216f491
--- /dev/null
+++ b/tensorflow/lite/testing/init_tensorflow.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/init_tensorflow.h"
+
+#include <cstdlib>
+#include <cstring>
+
+#include "tensorflow/core/platform/init_main.h"
+
+namespace tflite {
+void InitTensorFlow() {
+  static const char* kFakeName = "fake program name";
+  int argc = 1;
+  char* fake_name_copy = strdup(kFakeName);
+  char** argv = &fake_name_copy;
+  ::tensorflow::port::InitMain(kFakeName, &argc, &argv);
+  free(fake_name_copy);
+}
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/init_tensorflow.h b/tensorflow/lite/testing/init_tensorflow.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c36a247912b71de2d5f5c7fbffdad60f0060bfa
--- /dev/null
+++ b/tensorflow/lite/testing/init_tensorflow.h
@@ -0,0 +1,26 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_INIT_TENSORFLOW_H_
+#define TENSORFLOW_LITE_TESTING_INIT_TENSORFLOW_H_
+
+namespace tflite {
+
+// Initializes tensorflow's libraries. Note that this simulates an empty
+// command line, so flags are not initialized.
+void InitTensorFlow();
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_INIT_TENSORFLOW_H_
diff --git a/tensorflow/contrib/lite/testing/join.h b/tensorflow/lite/testing/join.h
similarity index 84%
rename from tensorflow/contrib/lite/testing/join.h
rename to tensorflow/lite/testing/join.h
index 4be19ad7569c3333b6647b91adbc6e77ff088f10..d1c314608687f045b346cc5526ea46c8149c2755 100644
--- a/tensorflow/contrib/lite/testing/join.h
+++ b/tensorflow/lite/testing/join.h
@@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_JOIN_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_JOIN_H_
+#ifndef TENSORFLOW_LITE_TESTING_JOIN_H_
+#define TENSORFLOW_LITE_TESTING_JOIN_H_
 
 #include <cstdlib>
+#include <iomanip>
 #include <sstream>
 
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace testing {
@@ -30,9 +31,9 @@ string Join(T* data, size_t len, const string& delimiter) {
     return "";
   }
   std::stringstream result;
-  result << data[0];
+  result << std::setprecision(9) << data[0];
   for (int i = 1; i < len; i++) {
-    result << delimiter << data[i];
+    result << std::setprecision(9) << delimiter << data[i];
   }
   return result.str();
 }
@@ -56,4 +57,4 @@ inline string Join<uint8_t>(uint8_t* data, size_t len,
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_JOIN_H_
+#endif  // TENSORFLOW_LITE_TESTING_JOIN_H_
diff --git a/tensorflow/contrib/lite/testing/join_test.cc b/tensorflow/lite/testing/join_test.cc
similarity index 91%
rename from tensorflow/contrib/lite/testing/join_test.cc
rename to tensorflow/lite/testing/join_test.cc
index bd04528381f6d31164728a5cabbf8753e9b8d2b8..0b3c07f37e14e3815ac1eb4acd0aefac3515064c 100644
--- a/tensorflow/contrib/lite/testing/join_test.cc
+++ b/tensorflow/lite/testing/join_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/testing/join.h"
+#include "tensorflow/lite/testing/join.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -28,7 +28,7 @@ TEST(JoinTest, JoinInt) {
 
 TEST(JoinTest, JoinFloat) {
   float data[] = {1.0, -3, 2.3, 1e-5};
-  EXPECT_EQ(Join(data, 4, " "), "1 -3 2.3 1e-05");
+  EXPECT_EQ(Join(data, 4, " "), "1 -3 2.29999995 9.99999975e-06");
 }
 
 TEST(JoinTest, JoinNullData) { EXPECT_THAT(Join<int>(nullptr, 3, ","), ""); }
diff --git a/tensorflow/contrib/lite/testing/message.cc b/tensorflow/lite/testing/message.cc
similarity index 96%
rename from tensorflow/contrib/lite/testing/message.cc
rename to tensorflow/lite/testing/message.cc
index 03fae4bb86a30e692dbc7f38bede6154c3a9a303..08aac6f6aa192c1aad8dce65f17c3ad4993a16bd 100644
--- a/tensorflow/contrib/lite/testing/message.cc
+++ b/tensorflow/lite/testing/message.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/testing/message.h"
+#include "tensorflow/lite/testing/message.h"
 
 #include <stack>
 
-#include "tensorflow/contrib/lite/testing/tokenize.h"
+#include "tensorflow/lite/testing/tokenize.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/contrib/lite/testing/message.h b/tensorflow/lite/testing/message.h
similarity index 94%
rename from tensorflow/contrib/lite/testing/message.h
rename to tensorflow/lite/testing/message.h
index e2bc4082141f0601c141a193fbea75f8f759146a..e6566ab11ca7dd6efc5a5cf9df2d10201e28f232 100644
--- a/tensorflow/contrib/lite/testing/message.h
+++ b/tensorflow/lite/testing/message.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+#ifndef TENSORFLOW_LITE_TESTING_MESSAGE_H_
+#define TENSORFLOW_LITE_TESTING_MESSAGE_H_
 
 #include <memory>
 #include <string>
@@ -79,4 +79,4 @@ class Message {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+#endif  // TENSORFLOW_LITE_TESTING_MESSAGE_H_
diff --git a/tensorflow/contrib/lite/testing/message_test.cc b/tensorflow/lite/testing/message_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/testing/message_test.cc
rename to tensorflow/lite/testing/message_test.cc
index fb6a49bd6f1ea88f1b48c03dfb08a54626bda2eb..bec4915e5853d67ea4cced2b66d0cfe26f4efcc7 100644
--- a/tensorflow/contrib/lite/testing/message_test.cc
+++ b/tensorflow/lite/testing/message_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/testing/message.h"
+#include "tensorflow/lite/testing/message.h"
 
 #include <map>
 
diff --git a/tensorflow/lite/testing/model_coverage/BUILD b/tensorflow/lite/testing/model_coverage/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7e6a65997d38d17576e76893ede8c791af2520dc
--- /dev/null
+++ b/tensorflow/lite/testing/model_coverage/BUILD
@@ -0,0 +1,33 @@
+package(default_visibility = [
+    "//tensorflow/lite:__subpackages__",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+py_binary(
+    name = "model_coverage_lib",
+    srcs = ["model_coverage_lib.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_test(
+    name = "model_coverage_lib_test",
+    srcs = ["model_coverage_lib_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        "no_oss",
+        "no_pip",
+        "no_windows",
+        "notap",
+    ],
+    deps = [
+        ":model_coverage_lib",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py b/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
similarity index 80%
rename from tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py
rename to tensorflow/lite/testing/model_coverage/model_coverage_lib.py
index 72029ed03ce72d2bbfcca176edec08f8c86f8dbe..804e328d9da248859e806bd070de26a8f5aa37b4 100644
--- a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib.py
+++ b/tensorflow/lite/testing/model_coverage/model_coverage_lib.py
@@ -20,13 +20,14 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.lite.python import convert_saved_model as _convert_saved_model
-from tensorflow.contrib.lite.python import lite as _lite
 from tensorflow.core.framework import graph_pb2 as _graph_pb2
+from tensorflow.lite.python import convert_saved_model as _convert_saved_model
+from tensorflow.lite.python import lite as _lite
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
 from tensorflow.python.framework.importer import import_graph_def as _import_graph_def
 from tensorflow.python.lib.io import file_io as _file_io
+from tensorflow.python.saved_model import loader as _loader
 from tensorflow.python.saved_model import signature_constants as _signature_constants
 from tensorflow.python.saved_model import tag_constants as _tag_constants
 
@@ -37,13 +38,13 @@ def _convert(converter, **kwargs):
   Args:
     converter: TFLiteConverter object.
     **kwargs: Additional arguments to be passed into the converter. Supported
-      flags are {"converter_mode", "post_training_quantize"}.
+      flags are {"target_ops", "post_training_quantize"}.
 
   Returns:
     The converted TFLite model in serialized format.
   """
-  if "converter_mode" in kwargs:
-    converter.converter_mode = kwargs["converter_mode"]
+  if "target_ops" in kwargs:
+    converter.target_ops = kwargs["target_ops"]
   if "post_training_quantize" in kwargs:
     converter.post_training_quantize = kwargs["post_training_quantize"]
   return converter.convert()
@@ -144,7 +145,7 @@ def evaluate_saved_model(directory, tag_set, signature_key):
     if signature_key is None:
       signature_key = _signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
-    meta_graph = _convert_saved_model.get_meta_graph_def(directory, tag_set)
+    meta_graph = _loader.load(sess, tag_set, directory)
     signature_def = _convert_saved_model.get_signature_def(
         meta_graph, signature_key)
     inputs, outputs = _convert_saved_model.get_inputs_outputs(signature_def)
@@ -165,18 +166,20 @@ def evaluate_keras_model(filename):
   return lambda input_data: [keras_model.predict(input_data)]
 
 
-# TODO(nupurgarg): Make this function a parameter to test_frozen_graph (and
-# related functions) in order to make it easy to use different data generators.
-def compare_models_random_data(tflite_model, tf_eval_func, tolerance=5):
-  """Compares TensorFlow and TFLite models with random data.
+def compare_models(tflite_model, tf_eval_func, input_data=None, tolerance=5):
+  """Compares TensorFlow and TFLite models.
+
+  Unless the input data is provided, the models are compared with random data.
 
   Args:
     tflite_model: Serialized TensorFlow Lite model.
     tf_eval_func: Lambda function that takes in input data and outputs the
       results of the TensorFlow model ([np.ndarray data] : [np.ndarray result]).
+    input_data: np.ndarray to pass into models during inference. (default None)
     tolerance: Decimal place to check accuracy to. (default 5)
   """
-  input_data = _generate_random_input_data(tflite_model)
+  if input_data is None:
+    input_data = _generate_random_input_data(tflite_model)
   tf_results = tf_eval_func(input_data)
   tflite_results = _evaluate_tflite_model(tflite_model, input_data)
   for tf_result, tflite_result in zip(tf_results, tflite_results):
@@ -238,8 +241,8 @@ def test_frozen_graph_quant(filename,
       for float_tensor in float_tensors)
   has_quant_tensor = num_tensors_float != num_tensors_same_dtypes
 
-  if ("converter_mode" in kwargs and
-      kwargs["converter_mode"] == _lite.ConverterMode.TOCO_FLEX_ALL):
+  if ("target_ops" in kwargs and
+      set(kwargs["target_ops"]) == set([_lite.OpsSet.SELECT_TF_OPS])):
     if has_quant_tensor:
       raise ValueError("--post_training_quantize flag unexpectedly altered the "
                        "full Flex mode graph.")
@@ -252,6 +255,7 @@ def test_frozen_graph(filename,
                       input_arrays,
                       output_arrays,
                       input_shapes=None,
+                      input_data=None,
                       **kwargs):
   """Validates the TensorFlow frozen graph converts to a TFLite model.
 
@@ -266,6 +270,7 @@ def test_frozen_graph(filename,
       integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
       Automatically determined when input shapes is None (e.g., {"foo" : None}).
         (default None)
+    input_data: np.ndarray to pass into models during inference. (default None)
     **kwargs: Additional arguments to be passed into the converter.
   """
   converter = _lite.TFLiteConverter.from_frozen_graph(
@@ -273,10 +278,15 @@ def test_frozen_graph(filename,
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_frozen_graph(filename, input_arrays, output_arrays)
-  compare_models_random_data(tflite_model, tf_eval_func)
+  compare_models(tflite_model, tf_eval_func, input_data=input_data)
 
 
-def test_saved_model(directory, tag_set=None, signature_key=None, **kwargs):
+def test_saved_model(directory,
+                     input_shapes=None,
+                     tag_set=None,
+                     signature_key=None,
+                     input_data=None,
+                     **kwargs):
   """Validates the TensorFlow SavedModel converts to a TFLite model.
 
   Converts the TensorFlow SavedModel to TFLite and checks the accuracy of the
@@ -284,20 +294,32 @@ def test_saved_model(directory, tag_set=None, signature_key=None, **kwargs):
 
   Args:
     directory: SavedModel directory to convert.
+    input_shapes: Dict of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+        (default None)
     tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
       analyze. All tags in the tag set must be present.
     signature_key: Key identifying SignatureDef containing inputs and outputs.
+    input_data: np.ndarray to pass into models during inference. (default None)
     **kwargs: Additional arguments to be passed into the converter.
   """
-  converter = _lite.TFLiteConverter.from_saved_model(directory, tag_set,
-                                                     signature_key)
+  converter = _lite.TFLiteConverter.from_saved_model(
+      directory,
+      input_shapes=input_shapes,
+      tag_set=tag_set,
+      signature_key=signature_key)
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_saved_model(directory, tag_set, signature_key)
-  compare_models_random_data(tflite_model, tf_eval_func)
+  compare_models(tflite_model, tf_eval_func, input_data=input_data)
 
 
-def test_keras_model(filename, **kwargs):
+def test_keras_model(filename,
+                     input_arrays=None,
+                     input_shapes=None,
+                     input_data=None,
+                     **kwargs):
   """Validates the tf.keras model converts to a TFLite model.
 
   Converts the tf.keras model to TFLite and checks the accuracy of the model on
@@ -305,10 +327,17 @@ def test_keras_model(filename, **kwargs):
 
   Args:
     filename: Full filepath of HDF5 file containing the tf.keras model.
+    input_arrays: List of input tensors to freeze graph with.
+    input_shapes: Dict of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+      Automatically determined when input shapes is None (e.g., {"foo" : None}).
+        (default None)
+    input_data: np.ndarray to pass into models during inference. (default None)
     **kwargs: Additional arguments to be passed into the converter.
   """
-  converter = _lite.TFLiteConverter.from_keras_model_file(filename)
+  converter = _lite.TFLiteConverter.from_keras_model_file(
+      filename, input_arrays=input_arrays, input_shapes=input_shapes)
   tflite_model = _convert(converter, **kwargs)
 
   tf_eval_func = evaluate_keras_model(filename)
-  compare_models_random_data(tflite_model, tf_eval_func)
+  compare_models(tflite_model, tf_eval_func, input_data=input_data)
diff --git a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
similarity index 85%
rename from tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py
rename to tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
index e07202b1a671f88d64b0dc5d25f412b8daaea809..4e329ac97d7358edf068329b21f0194c94c57cb0 100644
--- a/tensorflow/contrib/lite/testing/model_coverage/model_coverage_lib_test.py
+++ b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
@@ -22,12 +22,13 @@ import os
 import tempfile
 import numpy as np
 
-from tensorflow.contrib.lite.python import lite
-from tensorflow.contrib.lite.testing.model_coverage import model_coverage_lib as model_coverage
+from tensorflow.lite.python import lite
+from tensorflow.lite.testing.model_coverage import model_coverage_lib as model_coverage
 from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -42,6 +43,7 @@ class EvaluateFrozenGraph(test.TestCase):
     write_graph(sess.graph_def, '', graph_def_file, False)
     return graph_def_file
 
+  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     with session.Session().as_default() as sess:
       in_tensor = array_ops.placeholder(
@@ -51,6 +53,7 @@ class EvaluateFrozenGraph(test.TestCase):
 
     model_coverage.test_frozen_graph(filename, ['Placeholder'], ['add'])
 
+  @test_util.run_v1_only('b/120545219')
   def testMultipleOutputs(self):
     with session.Session().as_default() as sess:
       in_tensor_1 = array_ops.placeholder(
@@ -84,29 +87,28 @@ class EvaluateFrozenGraph(test.TestCase):
     filename = self._saveFrozenGraph(sess)
     return filename
 
+  @test_util.run_v1_only('b/120545219')
   def testQuantized(self):
     filename = self._getQuantizedModel()
-    model_coverage.test_frozen_graph_quant(filename, ['inputA', 'inputB'],
-                                           ['output'])
+    model_coverage.test_frozen_graph_quant(filename, ['inputA'], ['output'])
 
+  @test_util.run_v1_only('b/120545219')
   def testQuantizedInputShapes(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(
-        filename, ['inputA', 'inputB'], ['output'],
-        input_shapes={
-            'inputA': [33, 33],
-            'inputB': [33, 33],
-        })
+        filename, ['inputA'], ['output'], input_shapes={'inputA': [33, 33]})
 
+  @test_util.run_v1_only('b/120545219')
   def testQuantizedFlexAll(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(
-        filename, ['inputA', 'inputB'], ['output'],
-        converter_mode=lite.ConverterMode.TOCO_FLEX_ALL)
+        filename, ['inputA'], ['output'],
+        target_ops=set([lite.OpsSet.SELECT_TF_OPS]))
 
 
 class EvaluateSavedModel(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     saved_model_dir = os.path.join(self.get_temp_dir(), 'simple_savedmodel')
     with session.Session().as_default() as sess:
@@ -144,24 +146,29 @@ class EvaluateKerasModel(test.TestCase):
       os.close(fd)
     return keras_file
 
+  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(keras_file)
 
+  @test_util.run_v1_only('b/120545219')
   def testPostTrainingQuantize(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(keras_file, post_training_quantize=True)
 
-  def testConverterMode(self):
+  @test_util.run_v1_only('b/120545219')
+  def testTargetOps(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(
-        keras_file, converter_mode=lite.ConverterMode.TOCO_FLEX)
+        keras_file,
+        target_ops=set([lite.OpsSet.TFLITE_BUILTINS,
+                        lite.OpsSet.SELECT_TF_OPS]))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg b/tensorflow/lite/testing/model_coverage/testdata/grace_hopper.jpg
similarity index 100%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg
rename to tensorflow/lite/testing/model_coverage/testdata/grace_hopper.jpg
diff --git a/tensorflow/contrib/lite/testing/nnapi_example.cc b/tensorflow/lite/testing/nnapi_example.cc
similarity index 91%
rename from tensorflow/contrib/lite/testing/nnapi_example.cc
rename to tensorflow/lite/testing/nnapi_example.cc
index 5870782b69217f292fe60821ea8ce4ea1174c495..22df8dbd8821436ab9a960d0acd4423278c078d8 100644
--- a/tensorflow/contrib/lite/testing/nnapi_example.cc
+++ b/tensorflow/lite/testing/nnapi_example.cc
@@ -17,7 +17,7 @@ limitations under the License.
 // the future.
 //
 // Usage: bazel run -c opt \
-// tensorflow/contrib/lite/nnapi:nnapi_example -- <filename>
+// tensorflow/lite/nnapi:nnapi_example -- <filename>
 //
 #include <dirent.h>
 #include <cstdarg>
@@ -25,9 +25,9 @@ limitations under the License.
 #include <fstream>
 #include <iostream>
 #include <sstream>
-#include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
-#include "tensorflow/contrib/lite/testing/parse_testdata.h"
-#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include "tensorflow/lite/testing/parse_testdata.h"
+#include "tensorflow/lite/testing/tflite_driver.h"
 
 string dirname(const string& s) { return s.substr(0, s.find_last_of("/")); }
 
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.cc b/tensorflow/lite/testing/parse_testdata.cc
similarity index 98%
rename from tensorflow/contrib/lite/testing/parse_testdata.cc
rename to tensorflow/lite/testing/parse_testdata.cc
index 389688d552051ea735ce71533943af33df5059ef..bb540087942b7e7b5a13c899fde19a1dc38b75ea 100644
--- a/tensorflow/contrib/lite/testing/parse_testdata.cc
+++ b/tensorflow/lite/testing/parse_testdata.cc
@@ -16,7 +16,7 @@ limitations under the License.
 // Format is ASCII
 // TODO(aselle): Switch to protobuf, but the android team requested a simple
 // ASCII file.
-#include "tensorflow/contrib/lite/testing/parse_testdata.h"
+#include "tensorflow/lite/testing/parse_testdata.h"
 
 #include <cinttypes>
 #include <cmath>
@@ -26,9 +26,9 @@ limitations under the License.
 #include <iostream>
 #include <streambuf>
 
-#include "tensorflow/contrib/lite/error_reporter.h"
-#include "tensorflow/contrib/lite/testing/message.h"
-#include "tensorflow/contrib/lite/testing/split.h"
+#include "tensorflow/lite/error_reporter.h"
+#include "tensorflow/lite/testing/message.h"
+#include "tensorflow/lite/testing/split.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.h b/tensorflow/lite/testing/parse_testdata.h
similarity index 89%
rename from tensorflow/contrib/lite/testing/parse_testdata.h
rename to tensorflow/lite/testing/parse_testdata.h
index 26ee8258662e68fe4b509e537ac07ec8154f3311..0f3dc32afca97464f4b4d1fa99c7fc0ade804843 100644
--- a/tensorflow/contrib/lite/testing/parse_testdata.h
+++ b/tensorflow/lite/testing/parse_testdata.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_PARSE_TESTDATA_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_PARSE_TESTDATA_H_
+#ifndef TENSORFLOW_LITE_TESTING_PARSE_TESTDATA_H_
+#define TENSORFLOW_LITE_TESTING_PARSE_TESTDATA_H_
 
 #include <vector>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/testing/test_runner.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/testing/test_runner.h"
 
 namespace tflite {
 namespace testing {
@@ -72,4 +72,4 @@ bool ParseAndRunTests(std::istream* input, TestRunner* test_runner,
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_PARSE_TESTDATA_H_
+#endif  // TENSORFLOW_LITE_TESTING_PARSE_TESTDATA_H_
diff --git a/tensorflow/lite/testing/split.cc b/tensorflow/lite/testing/split.cc
new file mode 100644
index 0000000000000000000000000000000000000000..594b0d3f8a2af81c255603f1d67fb69f4e6f23bc
--- /dev/null
+++ b/tensorflow/lite/testing/split.cc
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+std::vector<std::pair<size_t, size_t>> SplitToPos(const string& s,
+                                                  const string& delimiter) {
+  std::vector<std::pair<size_t, size_t>> fields;
+  if (delimiter.length() == 0) {
+    fields.emplace_back(0, s.length());
+    return fields;
+  }
+  size_t pos = 0;
+  size_t start = 0;
+  while ((pos = s.find(delimiter, start)) != string::npos) {
+    if (pos != start) {
+      fields.emplace_back(start, pos);
+    }
+    start = pos + delimiter.length();
+  }
+  if (start != s.length()) {
+    fields.emplace_back(start, s.length());
+  }
+  return fields;
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/split.h b/tensorflow/lite/testing/split.h
similarity index 93%
rename from tensorflow/contrib/lite/testing/split.h
rename to tensorflow/lite/testing/split.h
index 896f2949efa6aeda76940bae18a11dccf3c2f01b..c33738997cae584ac04abff3f8e6e95a0eb5536a 100644
--- a/tensorflow/contrib/lite/testing/split.h
+++ b/tensorflow/lite/testing/split.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+#ifndef TENSORFLOW_LITE_TESTING_SPLIT_H_
+#define TENSORFLOW_LITE_TESTING_SPLIT_H_
 
 #include <cstdlib>
 #include <string>
 #include <utility>
 #include <vector>
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace testing {
@@ -93,4 +93,4 @@ inline std::vector<bool> Split(const string& s, const string& delimiter) {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+#endif  // TENSORFLOW_LITE_TESTING_SPLIT_H_
diff --git a/tensorflow/lite/testing/split_test.cc b/tensorflow/lite/testing/split_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..77de485d5710eab2050bf5a88ceff2343cf58643
--- /dev/null
+++ b/tensorflow/lite/testing/split_test.cc
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/split.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Pair;
+
+TEST(SplitTest, SplitToPos) {
+  EXPECT_THAT(SplitToPos("test;:1-2-3 ;: test", ";:"),
+              ElementsAre(Pair(0, 4), Pair(6, 12), Pair(14, 19)));
+  EXPECT_THAT(SplitToPos("test;:1-2-3 ;: test", ":"),
+              ElementsAre(Pair(0, 5), Pair(6, 13), Pair(14, 19)));
+  EXPECT_THAT(SplitToPos("test", ":"), ElementsAre(Pair(0, 4)));
+  EXPECT_THAT(SplitToPos("test ", ":"), ElementsAre(Pair(0, 5)));
+  EXPECT_THAT(SplitToPos("", ":"), ElementsAre());
+  EXPECT_THAT(SplitToPos("test ", ""), ElementsAre(Pair(0, 5)));
+  EXPECT_THAT(SplitToPos("::::", ":"), ElementsAre());
+}
+
+TEST(SplitTest, SplitString) {
+  EXPECT_THAT(Split<string>("A;B;C", ";"), ElementsAre("A", "B", "C"));
+}
+
+TEST(SplitTest, SplitFloat) {
+  EXPECT_THAT(Split<float>("1.0 B 1e-5", " "), ElementsAre(1.0, 0.0, 1e-5));
+}
+
+TEST(SplitTest, SplitInt) {
+  EXPECT_THAT(Split<int>("1,-1,258", ","), ElementsAre(1, -1, 258));
+}
+
+TEST(SplitTest, SplitUint8) {
+  EXPECT_THAT(Split<uint8_t>("1,-1,258", ","), ElementsAre(1, 255, 2));
+}
+
+TEST(SplitTest, SplitBool) {
+  EXPECT_THAT(Split<bool>("1, 0, 0, 1", ","),
+              ElementsAre(true, false, false, true));
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/test_runner.h b/tensorflow/lite/testing/test_runner.h
similarity index 92%
rename from tensorflow/contrib/lite/testing/test_runner.h
rename to tensorflow/lite/testing/test_runner.h
index fac7d01aab4b1e4c251213041eb4b823cd7d66aa..7cda8b5ec1366f962080e0198dbb7f7f0856707e 100644
--- a/tensorflow/contrib/lite/testing/test_runner.h
+++ b/tensorflow/lite/testing/test_runner.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+#ifndef TENSORFLOW_LITE_TESTING_TEST_RUNNER_H_
+#define TENSORFLOW_LITE_TESTING_TEST_RUNNER_H_
 
 #include <iostream>
 #include <memory>
 #include <string>
 #include <vector>
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace testing {
@@ -54,12 +54,12 @@ class TestRunner {
 
   // Define the contents of the given input tensor. The given 'id' is
   // guaranteed to be one of the ids returned by GetInputs().
-  virtual void SetInput(int id, const string& csv_values) = 0;
+  virtual void SetInput(int id, const string& values_as_string) = 0;
 
   // Define what should be expected for an output tensor after Invoke() runs.
   // The given 'id' is guaranteed to be one of the ids returned by
   // GetOutputs().
-  virtual void SetExpectation(int id, const string& csv_values) = 0;
+  virtual void SetExpectation(int id, const string& values_as_string) = 0;
 
   // Run the model.
   virtual void Invoke() = 0;
@@ -127,4 +127,4 @@ class TestRunner {
 
 }  // namespace testing
 }  // namespace tflite
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+#endif  // TENSORFLOW_LITE_TESTING_TEST_RUNNER_H_
diff --git a/tensorflow/contrib/lite/testing/test_runner_test.cc b/tensorflow/lite/testing/test_runner_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/testing/test_runner_test.cc
rename to tensorflow/lite/testing/test_runner_test.cc
index 3f04aa20bd7de813f0acd3f5897d5ab2df6c0fd7..39ec81582bcd8ff97904ddc50553c617e6b63468 100644
--- a/tensorflow/contrib/lite/testing/test_runner_test.cc
+++ b/tensorflow/lite/testing/test_runner_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/testing/test_runner.h"
+#include "tensorflow/lite/testing/test_runner.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
diff --git a/tensorflow/lite/testing/tf_driver.cc b/tensorflow/lite/testing/tf_driver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffd76e8dc7eeb46404838ba29789ad5f446de2bf
--- /dev/null
+++ b/tensorflow/lite/testing/tf_driver.cc
@@ -0,0 +1,259 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/tf_driver.h"
+
+#include <fstream>
+#include <iostream>
+
+#include "absl/strings/escaping.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/join.h"
+#include "tensorflow/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+tensorflow::Tensor CreateTensor(const tensorflow::DataType type,
+                                const std::vector<int64_t>& dim) {
+  tensorflow::TensorShape shape{tensorflow::gtl::ArraySlice<tensorflow::int64>{
+      reinterpret_cast<const tensorflow::int64*>(dim.data()), dim.size()}};
+  return {type, shape};
+}
+
+template <typename T>
+int FillTensorWithData(tensorflow::Tensor* tensor,
+                       const string& values_as_string) {
+  const auto& values = testing::Split<T>(values_as_string, ",");
+
+  if (values.size() == tensor->NumElements()) {
+    auto data = tensor->flat<T>();
+    for (int i = 0; i < values.size(); i++) {
+      data(i) = values[i];
+    }
+  }
+
+  return values.size();
+}
+
+// Assumes 'values_as_string' is a hex string that gets converted into a
+// TF Lite DynamicBuffer. Strings are then extracted and copied into the
+// TensorFlow tensor.
+int FillTensorWithTfLiteHexString(tensorflow::Tensor* tensor,
+                                  const string& values_as_string) {
+  string s = absl::HexStringToBytes(values_as_string);
+
+  int num_strings = values_as_string.empty() ? 0 : GetStringCount(s.data());
+
+  if (num_strings == tensor->NumElements()) {
+    auto data = tensor->flat<string>();
+    for (size_t i = 0; i < num_strings; ++i) {
+      auto ref = GetString(s.data(), i);
+      data(i).assign(ref.str, ref.len);
+    }
+  }
+
+  return num_strings;
+}
+
+template <typename T>
+void FillTensorWithZeros(tensorflow::Tensor* tensor) {
+  auto data = tensor->flat<T>();
+  for (int i = 0; i < tensor->NumElements(); i++) {
+    data(i) = 0;
+  }
+}
+
+template <typename T>
+string TensorDataToCsvString(const tensorflow::Tensor& tensor) {
+  const auto& data = tensor.flat<T>();
+  return Join(data.data(), data.size(), ",");
+}
+
+string TensorDataToTfLiteHexString(const tensorflow::Tensor& tensor) {
+  DynamicBuffer dynamic_buffer;
+
+  auto data = tensor.flat<string>();
+  for (int i = 0; i < tensor.NumElements(); ++i) {
+    dynamic_buffer.AddString(data(i).data(), data(i).size());
+  }
+
+  char* char_buffer = nullptr;
+  size_t size = dynamic_buffer.WriteToBuffer(&char_buffer);
+  string s = absl::BytesToHexString({char_buffer, size});
+  free(char_buffer);
+
+  return s;
+}
+
+}  // namespace
+
+TfDriver::TfDriver(const std::vector<string>& input_layer,
+                   const std::vector<string>& input_layer_type,
+                   const std::vector<string>& input_layer_shape,
+                   const std::vector<string>& output_layer)
+    : input_names_(input_layer), output_names_(output_layer) {
+  CHECK_EQ(input_layer.size(), input_layer_type.size());
+  CHECK_EQ(input_layer.size(), input_layer_shape.size());
+
+  input_ids_.resize(input_layer.size());
+  input_tensors_.reserve(input_layer.size());
+  input_types_.resize(input_layer.size());
+  input_shapes_.resize(input_layer.size());
+  for (int i = 0; i < input_layer.size(); i++) {
+    input_ids_[i] = i;
+    input_tensors_[input_layer[i]] = {};
+    CHECK(DataTypeFromString(input_layer_type[i], &input_types_[i]));
+    input_shapes_[i] = Split<int64_t>(input_layer_shape[i], ",");
+  }
+
+  output_ids_.resize(output_layer.size());
+  output_tensors_.reserve(output_layer.size());
+  for (int i = 0; i < output_layer.size(); i++) {
+    output_ids_[i] = i;
+  }
+}
+
+void TfDriver::LoadModel(const string& bin_file_path) {
+  if (!IsValid()) return;
+  std::ifstream model(bin_file_path);
+  if (model.fail()) {
+    Invalidate("Failed to find the model " + bin_file_path);
+    return;
+  }
+
+  tensorflow::GraphDef graphdef;
+  if (!graphdef.ParseFromIstream(&model)) {
+    Invalidate("Failed to parse tensorflow graphdef");
+    return;
+  }
+
+  tensorflow::SessionOptions options;
+  session_.reset(tensorflow::NewSession(options));
+  auto status = session_->Create(graphdef);
+  if (!status.ok()) {
+    Invalidate("Failed to create session. " + status.error_message());
+  }
+}
+
+void TfDriver::SetInput(const string& values_as_string,
+                        tensorflow::Tensor* tensor) {
+  int num_values_available = 0;
+  switch (tensor->dtype()) {
+    case tensorflow::DT_FLOAT:
+      num_values_available =
+          FillTensorWithData<float>(tensor, values_as_string);
+      break;
+    case tensorflow::DT_INT32:
+      num_values_available =
+          FillTensorWithData<int32_t>(tensor, values_as_string);
+      break;
+    case tensorflow::DT_UINT8:
+      num_values_available =
+          FillTensorWithData<uint8_t>(tensor, values_as_string);
+      break;
+    case tensorflow::DT_STRING:
+      num_values_available =
+          FillTensorWithTfLiteHexString(tensor, values_as_string);
+      break;
+    default:
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              tensorflow::DataType_Name(tensor->dtype()),
+                              " in SetInput"));
+      return;
+  }
+
+  if (tensor->NumElements() != num_values_available) {
+    Invalidate(absl::StrCat("Needed ", tensor->NumElements(),
+                            " values for input tensor, but was given ",
+                            num_values_available, " instead."));
+  }
+}
+
+void TfDriver::SetInput(int id, const string& values_as_string) {
+  if (!IsValid()) return;
+  auto tensor = CreateTensor(input_types_[id], input_shapes_[id]);
+  SetInput(values_as_string, &tensor);
+  input_tensors_[input_names_[id]] = tensor;
+}
+
+void TfDriver::ResetTensor(int id) {
+  if (!IsValid()) return;
+  auto tensor = input_tensors_[input_names_[id]];
+  switch (input_types_[id]) {
+    case tensorflow::DT_FLOAT: {
+      FillTensorWithZeros<float>(&tensor);
+      break;
+    }
+    case tensorflow::DT_INT32: {
+      FillTensorWithZeros<int32_t>(&tensor);
+      break;
+    }
+    default:
+      Invalidate(absl::StrCat("Unsupported tensor type ", input_types_[id],
+                              tensorflow::DataType_Name(input_types_[id]),
+                              " in ResetInput"));
+      return;
+  }
+}
+
+void TfDriver::ReshapeTensor(int id, const string& values_as_string) {
+  input_shapes_[id] = Split<int64_t>(values_as_string, ",");
+  input_tensors_[input_names_[id]] =
+      CreateTensor(input_types_[id], input_shapes_[id]);
+  ResetTensor(id);
+}
+
+string TfDriver::ReadOutput(const tensorflow::Tensor& tensor) {
+  switch (tensor.dtype()) {
+    case tensorflow::DT_FLOAT:
+      return TensorDataToCsvString<float>(tensor);
+    case tensorflow::DT_INT32:
+      return TensorDataToCsvString<int32_t>(tensor);
+    case tensorflow::DT_INT64:
+      return TensorDataToCsvString<tensorflow::int64>(tensor);
+    case tensorflow::DT_UINT8:
+      return TensorDataToCsvString<uint8_t>(tensor);
+    case tensorflow::DT_STRING:
+      return TensorDataToTfLiteHexString(tensor);
+    case tensorflow::DT_BOOL:
+      return TensorDataToCsvString<bool>(tensor);
+    default:
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              tensorflow::DataType_Name(tensor.dtype()),
+                              " in ReadOutput"));
+      return "";
+  }
+}
+
+string TfDriver::ReadOutput(int id) {
+  if (!IsValid()) return "";
+  return ReadOutput(output_tensors_[id]);
+}
+
+void TfDriver::Invoke() {
+  if (!IsValid()) return;
+  auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
+                              output_names_, {}, &output_tensors_);
+  if (!status.ok()) {
+    Invalidate(absl::StrCat("TensorFlow failed to run graph:",
+                            status.error_message()));
+  }
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tf_driver.h b/tensorflow/lite/testing/tf_driver.h
similarity index 80%
rename from tensorflow/contrib/lite/testing/tf_driver.h
rename to tensorflow/lite/testing/tf_driver.h
index b766f85c4ddee9fb7b1513c264d4159e694770ca..46b18980b95fd109fbfe17c0c221cf1bf02dbac6 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.h
+++ b/tensorflow/lite/testing/tf_driver.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TF_DRIVER_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_TF_DRIVER_H_
+#ifndef TENSORFLOW_LITE_TESTING_TF_DRIVER_H_
+#define TENSORFLOW_LITE_TESTING_TF_DRIVER_H_
 
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/testing/split.h"
-#include "tensorflow/contrib/lite/testing/test_runner.h"
+#include "tensorflow/lite/testing/split.h"
+#include "tensorflow/lite/testing/test_runner.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
@@ -39,23 +39,27 @@ class TfDriver : public TestRunner {
   ~TfDriver() override {}
 
   void LoadModel(const string& bin_file_path) override;
-  void SetInput(int id, const string& csv_values) override;
+  void SetInput(int id, const string& values_as_string) override;
   void Invoke() override;
   string ReadOutput(int id) override;
 
   const std::vector<int>& GetInputs() override { return input_ids_; }
   const std::vector<int>& GetOutputs() override { return output_ids_; }
-  void ReshapeTensor(int id, const string& csv_values) override;
+  void ReshapeTensor(int id, const string& values_as_string) override;
   // Note: ResetTensor only works for input tensor.
   void ResetTensor(int id) override;
 
   // no-op. SetInput will overwrite existing data .
   void AllocateTensors() override {}
   // no-op. Tf driver is not supposed to check the results.
-  void SetExpectation(int id, const string& csv_values) override {}
+  void SetExpectation(int id, const string& values_as_string) override {}
   // tf driver is not supposed to check the results.
   bool CheckResults() override { return false; }
 
+ protected:
+  void SetInput(const string& values_as_string, tensorflow::Tensor*);
+  string ReadOutput(const tensorflow::Tensor& tensor);
+
  private:
   std::unique_ptr<tensorflow::Session> session_;
   std::vector<int> input_ids_;
@@ -72,4 +76,4 @@ class TfDriver : public TestRunner {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TF_DRIVER_H_
+#endif  // TENSORFLOW_LITE_TESTING_TF_DRIVER_H_
diff --git a/tensorflow/lite/testing/tf_driver_test.cc b/tensorflow/lite/testing/tf_driver_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..363d162d56a1670821d29768bc36411bf22d61e9
--- /dev/null
+++ b/tensorflow/lite/testing/tf_driver_test.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/tf_driver.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/strings/escaping.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+using ::testing::ElementsAre;
+
+class TestDriver : public TfDriver {
+ public:
+  // No need for a full TfDriver. We just want to test the read/write methods.
+  TestDriver() : TfDriver({}, {}, {}, {}) {}
+  string WriteAndReadBack(tensorflow::DataType type,
+                          const std::vector<int64_t>& shape,
+                          const string& values) {
+    tensorflow::Tensor t = {
+        type,
+        tensorflow::TensorShape{tensorflow::gtl::ArraySlice<tensorflow::int64>{
+            reinterpret_cast<const tensorflow::int64*>(shape.data()),
+            shape.size()}}};
+    SetInput(values, &t);
+    return ReadOutput(t);
+  }
+};
+
+TEST(TfDriverTest, ReadingAndWrintingValues) {
+  TestDriver driver;
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_FLOAT, {1, 2, 2},
+                                    "0.10,0.20,0.30,0.40"),
+            "0.100000001,0.200000003,0.300000012,0.400000006");
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_INT32, {1, 2, 2},
+                                    "10,40,100,-100"),
+            "10,40,100,-100");
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_UINT8, {1, 2, 2},
+                                    "48,49,121, 122"),
+            "0,1,y,z");
+}
+
+TEST(TfDriverTest, ReadingAndWrintingValuesStrings) {
+  TestDriver driver;
+
+  auto set_buffer = [](const std::vector<string>& values, string* buffer) {
+    DynamicBuffer dynamic_buffer;
+    for (const string& s : values) {
+      dynamic_buffer.AddString(s.data(), s.size());
+    }
+
+    char* char_b = nullptr;
+    int size = dynamic_buffer.WriteToBuffer(&char_b);
+    *buffer = absl::BytesToHexString(absl::string_view(char_b, size));
+    free(char_b);
+  };
+
+  string buffer;
+
+  set_buffer({"", "", "", ""}, &buffer);
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_STRING, {1, 2, 2}, buffer),
+            buffer);
+
+  // Note that if we pass the empty string we get the "empty" buffer (where all
+  // the strings are empty).
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_STRING, {1, 2, 2}, ""),
+            buffer);
+
+  set_buffer({"AB", "ABC", "X", "YZ"}, &buffer);
+
+  ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_STRING, {1, 2, 2}, buffer),
+            buffer);
+}
+
+TEST(TfDriverTest, SimpleTest) {
+  std::unique_ptr<TfDriver> runner(
+      new TfDriver({"a", "b", "c", "d"}, {"float", "float", "float", "float"},
+                   {"1,8,8,3", "1,8,8,3", "1,8,8,3", "1,8,8,3"}, {"x", "y"}));
+
+  runner->LoadModel(
+      "third_party/tensorflow/lite/testdata/multi_add.pb");
+  EXPECT_TRUE(runner->IsValid()) << runner->GetErrorMessage();
+
+  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
+  ASSERT_THAT(runner->GetOutputs(), ElementsAre(0, 1));
+
+  for (int i : {0, 1, 2, 3}) {
+    runner->ReshapeTensor(i, "1,2,2,1");
+  }
+  ASSERT_TRUE(runner->IsValid());
+
+  runner->SetInput(0, "0.1,0.2,0.3,0.4");
+  runner->SetInput(1, "0.001,0.002,0.003,0.004");
+  runner->SetInput(2, "0.001,0.002,0.003,0.004");
+  runner->SetInput(3, "0.01,0.02,0.03,0.04");
+  runner->ResetTensor(2);
+  runner->Invoke();
+
+  ASSERT_EQ(runner->ReadOutput(0),
+            "0.101000004,0.202000007,0.303000003,0.404000014");
+  ASSERT_EQ(runner->ReadOutput(1),
+            "0.0109999999,0.0219999999,0.0329999998,0.0439999998");
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc b/tensorflow/lite/testing/tflite_diff_example_test.cc
similarity index 88%
rename from tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
rename to tensorflow/lite/testing/tflite_diff_example_test.cc
index f2c49fe389763110279b3dd1e4f13b1522de0460..cb61cd4e94214050218c4ebb3d6ffbe1b703f486 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
+++ b/tensorflow/lite/testing/tflite_diff_example_test.cc
@@ -13,10 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/testing/tflite_diff_flags.h"
-#include "tensorflow/contrib/lite/testing/tflite_diff_util.h"
+#include "tensorflow/lite/testing/init_tensorflow.h"
+#include "tensorflow/lite/testing/tflite_diff_flags.h"
+#include "tensorflow/lite/testing/tflite_diff_util.h"
 
 int main(int argc, char** argv) {
+  ::tflite::InitTensorFlow();  // For Flex support.
+
   ::tflite::testing::DiffOptions options =
       ::tflite::testing::ParseTfliteDiffFlags(&argc, argv);
   if (options.tensorflow_model.empty()) return 1;
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/lite/testing/tflite_diff_flags.h
similarity index 92%
rename from tensorflow/contrib/lite/testing/tflite_diff_flags.h
rename to tensorflow/lite/testing/tflite_diff_flags.h
index ad889a2f198644b01feffb397a717ec5882de04b..2fe068eb20f1fb8d6d28fa46f43f096588708ffa 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_flags.h
+++ b/tensorflow/lite/testing/tflite_diff_flags.h
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
+#ifndef TENSORFLOW_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
+#define TENSORFLOW_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
 
 #include <cstring>
 
-#include "tensorflow/contrib/lite/testing/split.h"
-#include "tensorflow/contrib/lite/testing/tflite_diff_util.h"
+#include "tensorflow/lite/testing/split.h"
+#include "tensorflow/lite/testing/tflite_diff_util.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace tflite {
@@ -88,4 +88,4 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
+#endif  // TENSORFLOW_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.cc b/tensorflow/lite/testing/tflite_diff_util.cc
similarity index 85%
rename from tensorflow/contrib/lite/testing/tflite_diff_util.cc
rename to tensorflow/lite/testing/tflite_diff_util.cc
index c6ca796ac25d2ce9d6cb66200cd800f14869f69b..0142ae4217eaea02e8760c91df54477f7d496a83 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.cc
+++ b/tensorflow/lite/testing/tflite_diff_util.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <cstdlib>
 #include <sstream>
 
-#include "tensorflow/contrib/lite/testing/generate_testspec.h"
-#include "tensorflow/contrib/lite/testing/parse_testdata.h"
-#include "tensorflow/contrib/lite/testing/tflite_diff_util.h"
-#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+#include "tensorflow/lite/testing/generate_testspec.h"
+#include "tensorflow/lite/testing/parse_testdata.h"
+#include "tensorflow/lite/testing/tflite_diff_util.h"
+#include "tensorflow/lite/testing/tflite_driver.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.h b/tensorflow/lite/testing/tflite_diff_util.h
similarity index 89%
rename from tensorflow/contrib/lite/testing/tflite_diff_util.h
rename to tensorflow/lite/testing/tflite_diff_util.h
index 28b14bd143ab0e9ec9513fa04c21c111a51cfacc..3f9f10892db2878333dce0457a98068fa460cdd9 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.h
+++ b/tensorflow/lite/testing/tflite_diff_util.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_UTIL_H_
+#ifndef TENSORFLOW_LITE_TESTING_TFLITE_DIFF_UTIL_H_
+#define TENSORFLOW_LITE_TESTING_TFLITE_DIFF_UTIL_H_
 
 #include <vector>
 
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace testing {
@@ -55,4 +55,4 @@ bool RunDiffTest(const DiffOptions& options, int num_invocations);
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_UTIL_H_
+#endif  // TENSORFLOW_LITE_TESTING_TFLITE_DIFF_UTIL_H_
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e11d49f252818f9f7024b8bbafa8b17ad77ad48
--- /dev/null
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -0,0 +1,376 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/tflite_driver.h"
+
+#include <iostream>
+
+#include "absl/strings/escaping.h"
+#include "tensorflow/lite/builtin_op_data.h"
+#include "tensorflow/lite/delegates/flex/delegate.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+// Returns the value in the given position in a tensor.
+template <typename T>
+T Value(const TfLitePtrUnion& data, int index);
+template <>
+float Value(const TfLitePtrUnion& data, int index) {
+  return data.f[index];
+}
+template <>
+int32_t Value(const TfLitePtrUnion& data, int index) {
+  return data.i32[index];
+}
+template <>
+int64_t Value(const TfLitePtrUnion& data, int index) {
+  return data.i64[index];
+}
+template <>
+uint8_t Value(const TfLitePtrUnion& data, int index) {
+  return data.uint8[index];
+}
+template <>
+bool Value(const TfLitePtrUnion& data, int index) {
+  return data.b[index];
+}
+
+template <typename T>
+void SetTensorData(const std::vector<T>& values, TfLitePtrUnion* data) {
+  T* input_ptr = reinterpret_cast<T*>(data->raw);
+  for (const T& v : values) {
+    *input_ptr = v;
+    ++input_ptr;
+  }
+}
+
+}  // namespace
+
+class TfLiteDriver::Expectation {
+ public:
+  Expectation() {
+    data_.raw = nullptr;
+    num_elements_ = 0;
+  }
+  ~Expectation() { delete[] data_.raw; }
+  template <typename T>
+  void SetData(const string& csv_values) {
+    const auto& values = testing::Split<T>(csv_values, ",");
+    num_elements_ = values.size();
+    data_.raw = new char[num_elements_ * sizeof(T)];
+    SetTensorData(values, &data_);
+  }
+
+  template <>
+  void SetData<string>(const string& csv_values) {
+    string s = absl::HexStringToBytes(csv_values);
+    data_.raw = new char[s.size()];
+    memcpy(data_.raw, s.data(), s.size());
+  }
+
+  bool Check(bool verbose, const TfLiteTensor& tensor) {
+    switch (tensor.type) {
+      case kTfLiteFloat32:
+        return TypedCheck<float>(verbose, tensor);
+      case kTfLiteInt32:
+        return TypedCheck<int32_t>(verbose, tensor);
+      case kTfLiteInt64:
+        return TypedCheck<int64_t>(verbose, tensor);
+      case kTfLiteUInt8:
+        return TypedCheck<uint8_t>(verbose, tensor);
+      case kTfLiteBool:
+        return TypedCheck<bool>(verbose, tensor);
+      case kTfLiteString:
+        return TypedCheck<string>(verbose, tensor);
+      default:
+        fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
+        return false;
+    }
+  }
+
+ private:
+  template <typename T>
+  bool TypedCheck(bool verbose, const TfLiteTensor& tensor) {
+    // TODO(ahentz): must find a way to configure the tolerance.
+    constexpr double kRelativeThreshold = 1e-2f;
+    constexpr double kAbsoluteThreshold = 1e-4f;
+
+    size_t tensor_size = tensor.bytes / sizeof(T);
+
+    if (tensor_size != num_elements_) {
+      std::cerr << "Expected a tensor with " << num_elements_
+                << " elements, got " << tensor_size << std::endl;
+      std::cerr << "while checking tensor " << tensor.name << std::endl;
+      return false;
+    }
+
+    bool good_output = true;
+    for (int i = 0; i < tensor_size; ++i) {
+      float computed = Value<T>(tensor.data, i);
+      float reference = Value<T>(data_, i);
+      float diff = std::abs(computed - reference);
+      bool error_is_large = false;
+      // For very small numbers, try absolute error, otherwise go with
+      // relative.
+      if (std::abs(reference) < kRelativeThreshold) {
+        error_is_large = (diff > kAbsoluteThreshold);
+      } else {
+        error_is_large = (diff > kRelativeThreshold * std::abs(reference));
+      }
+      if (error_is_large) {
+        good_output = false;
+        if (verbose) {
+          std::cerr << "  index " << i << ": got " << computed
+                    << ", but expected " << reference << std::endl;
+        }
+      }
+    }
+    return good_output;
+  }
+
+  template <>
+  bool TypedCheck<string>(bool verbose, const TfLiteTensor& tensor) {
+    if (tensor.data.raw == nullptr) {
+      if (verbose) {
+        std::cerr << "  got empty string" << std::endl;
+      }
+      return false;
+    }
+    int expected_num_strings = GetStringCount(data_.raw);
+    int returned_num_strings = GetStringCount(tensor.data.raw);
+    if (expected_num_strings != returned_num_strings) {
+      if (verbose) {
+        std::cerr << "  string count differ: got " << returned_num_strings
+                  << ", but expected " << expected_num_strings << std::endl;
+      }
+      return false;
+    }
+    for (int i = 0; i < returned_num_strings; ++i) {
+      auto expected_ref = GetString(data_.raw, i);
+      auto returned_ref = GetString(tensor.data.raw, i);
+      if (expected_ref.len != returned_ref.len) {
+        if (verbose) {
+          std::cerr << "  index " << i << ": got string of size "
+                    << returned_ref.len << ", but expected size "
+                    << expected_ref.len << std::endl;
+        }
+        return false;
+      }
+      if (strncmp(expected_ref.str, returned_ref.str, returned_ref.len) != 0) {
+        if (verbose) {
+          std::cerr << "  index " << i << ": strings are different"
+                    << std::endl;
+        }
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  TfLitePtrUnion data_;
+  size_t num_elements_;
+};
+
+TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name)
+    : use_nnapi_(use_nnapi) {
+  if (delegate_name == "FLEX") {
+    delegate_ = FlexDelegate::Create();
+  }
+}
+
+TfLiteDriver::~TfLiteDriver() {
+  for (auto t : tensors_to_deallocate_) {
+    DeallocateStringTensor(t.second);
+  }
+  interpreter_.reset();
+}
+
+void TfLiteDriver::AllocateTensors() {
+  if (must_allocate_tensors_) {
+    if (interpreter_->AllocateTensors() != kTfLiteOk) {
+      Invalidate("Failed to allocate tensors");
+      return;
+    }
+    ResetLSTMStateTensors();
+    must_allocate_tensors_ = false;
+  }
+}
+
+void TfLiteDriver::LoadModel(const string& bin_file_path) {
+  if (!IsValid()) return;
+
+  model_ = FlatBufferModel::BuildFromFile(GetFullPath(bin_file_path).c_str());
+  if (!model_) {
+    Invalidate("Failed to mmap model " + bin_file_path);
+    return;
+  }
+  ops::builtin::BuiltinOpResolver builtins;
+  InterpreterBuilder(*model_, builtins)(&interpreter_);
+  if (!interpreter_) {
+    Invalidate("Failed build interpreter");
+    return;
+  }
+  interpreter_->UseNNAPI(use_nnapi_);
+
+  if (delegate_) {
+    if (interpreter_->ModifyGraphWithDelegate(delegate_.get()) != kTfLiteOk) {
+      Invalidate("Unable to the build graph using the delegate");
+      return;
+    }
+  }
+
+  must_allocate_tensors_ = true;
+}
+
+void TfLiteDriver::ResetTensor(int id) {
+  if (!IsValid()) return;
+  auto* tensor = interpreter_->tensor(id);
+  memset(tensor->data.raw, 0, tensor->bytes);
+}
+
+void TfLiteDriver::ReshapeTensor(int id, const string& csv_values) {
+  if (!IsValid()) return;
+  if (interpreter_->ResizeInputTensor(
+          id, testing::Split<int>(csv_values, ",")) != kTfLiteOk) {
+    Invalidate("Failed to resize input tensor " + std::to_string(id));
+    return;
+  }
+  must_allocate_tensors_ = true;
+}
+
+void TfLiteDriver::SetInput(int id, const string& csv_values) {
+  if (!IsValid()) return;
+  auto* tensor = interpreter_->tensor(id);
+  switch (tensor->type) {
+    case kTfLiteFloat32: {
+      const auto& values = testing::Split<float>(csv_values, ",");
+      if (!CheckSizes<float>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
+    case kTfLiteInt32: {
+      const auto& values = testing::Split<int32_t>(csv_values, ",");
+      if (!CheckSizes<int32_t>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
+    case kTfLiteInt64: {
+      const auto& values = testing::Split<int64_t>(csv_values, ",");
+      if (!CheckSizes<int64_t>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
+    case kTfLiteUInt8: {
+      const auto& values = testing::Split<uint8_t>(csv_values, ",");
+      if (!CheckSizes<uint8_t>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
+    case kTfLiteBool: {
+      const auto& values = testing::Split<bool>(csv_values, ",");
+      if (!CheckSizes<bool>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
+    case kTfLiteString: {
+      string s = absl::HexStringToBytes(csv_values);
+
+      DeallocateStringTensor(tensors_to_deallocate_[id]);
+      AllocateStringTensor(id, s.size(), tensor);
+      memcpy(tensor->data.raw, s.data(), s.size());
+
+      break;
+    }
+    default:
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              TfLiteTypeGetName(tensor->type),
+                              " in TfLiteDriver::SetInput"));
+      return;
+  }
+}
+
+void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
+  if (!IsValid()) return;
+  auto* tensor = interpreter_->tensor(id);
+  if (expected_output_.count(id) != 0) {
+    Invalidate(absl::StrCat("Overridden expectation for tensor '", id, "'"));
+  }
+  expected_output_[id].reset(new Expectation);
+  switch (tensor->type) {
+    case kTfLiteFloat32:
+      expected_output_[id]->SetData<float>(csv_values);
+      break;
+    case kTfLiteInt32:
+      expected_output_[id]->SetData<int32_t>(csv_values);
+      break;
+    case kTfLiteInt64:
+      expected_output_[id]->SetData<int64_t>(csv_values);
+      break;
+    case kTfLiteUInt8:
+      expected_output_[id]->SetData<uint8_t>(csv_values);
+      break;
+    case kTfLiteBool:
+      expected_output_[id]->SetData<bool>(csv_values);
+      break;
+    case kTfLiteString:
+      expected_output_[id]->SetData<string>(csv_values);
+      break;
+    default:
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              TfLiteTypeGetName(tensor->type),
+                              " in TfLiteDriver::SetExpectation"));
+      return;
+  }
+}
+
+void TfLiteDriver::Invoke() {
+  if (!IsValid()) return;
+  if (interpreter_->Invoke() != kTfLiteOk) {
+    Invalidate("Failed to invoke interpreter");
+  }
+}
+
+bool TfLiteDriver::CheckResults() {
+  if (!IsValid()) return false;
+  bool success = true;
+  for (const auto& p : expected_output_) {
+    int id = p.first;
+    auto* tensor = interpreter_->tensor(id);
+    if (!p.second->Check(/*verbose=*/false, *tensor)) {
+      // Do not invalidate anything here. Instead, simply output the
+      // differences and return false. Invalidating would prevent all
+      // subsequent invocations from running..
+      std::cerr << "There were errors in invocation '" << GetInvocationId()
+                << "', output tensor '" << id << "':" << std::endl;
+      p.second->Check(/*verbose=*/true, *tensor);
+      success = false;
+      SetOverallSuccess(false);
+    }
+  }
+  expected_output_.clear();
+  return success;
+}
+
+void TfLiteDriver::ResetLSTMStateTensors() {
+  interpreter_->ResetVariableTensors();
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/tflite_driver.h b/tensorflow/lite/testing/tflite_driver.h
new file mode 100644
index 0000000000000000000000000000000000000000..1da0533c57cf51f442253f28b6d9ba13078ef9a7
--- /dev/null
+++ b/tensorflow/lite/testing/tflite_driver.h
@@ -0,0 +1,80 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_TFLITE_DRIVER_H_
+#define TENSORFLOW_LITE_TESTING_TFLITE_DRIVER_H_
+
+#include <map>
+
+#include "tensorflow/lite/delegates/flex/delegate.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/testing/test_runner.h"
+
+namespace tflite {
+namespace testing {
+
+// A test runner that feeds inputs into TF Lite and verifies its outputs.
+class TfLiteDriver : public TestRunner {
+ public:
+  explicit TfLiteDriver(bool use_nnapi, const string& delegate = "");
+  ~TfLiteDriver() override;
+
+  void LoadModel(const string& bin_file_path) override;
+  const std::vector<int>& GetInputs() override {
+    return interpreter_->inputs();
+  }
+  const std::vector<int>& GetOutputs() override {
+    return interpreter_->outputs();
+  }
+  void ReshapeTensor(int id, const string& csv_values) override;
+  void AllocateTensors() override;
+  void ResetTensor(int id) override;
+  void SetInput(int id, const string& csv_values) override;
+  void SetExpectation(int id, const string& csv_values) override;
+  void Invoke() override;
+  bool CheckResults() override;
+  string ReadOutput(int id) override { return "no-op"; }
+
+ private:
+  void DeallocateStringTensor(TfLiteTensor* t) {
+    if (t) {
+      free(t->data.raw);
+      t->data.raw = nullptr;
+    }
+  }
+  void AllocateStringTensor(int id, size_t num_bytes, TfLiteTensor* t) {
+    t->data.raw = reinterpret_cast<char*>(malloc(num_bytes));
+    t->bytes = num_bytes;
+    tensors_to_deallocate_[id] = t;
+  }
+
+  void ResetLSTMStateTensors();
+
+  class Expectation;
+
+  std::unique_ptr<FlexDelegate> delegate_;
+  bool use_nnapi_ = false;
+  std::unique_ptr<FlatBufferModel> model_;
+  std::unique_ptr<Interpreter> interpreter_;
+  std::map<int, std::unique_ptr<Expectation>> expected_output_;
+  bool must_allocate_tensors_ = true;
+  std::map<int, TfLiteTensor*> tensors_to_deallocate_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_TFLITE_DRIVER_H_
diff --git a/tensorflow/contrib/lite/testing/tflite_driver_test.cc b/tensorflow/lite/testing/tflite_driver_test.cc
similarity index 93%
rename from tensorflow/contrib/lite/testing/tflite_driver_test.cc
rename to tensorflow/lite/testing/tflite_driver_test.cc
index 37010c468f250fdf4ef958b23a38aa38b7a533db..6e953e5e19b8f6cac1a4349145b03a7f8b5e1969 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver_test.cc
+++ b/tensorflow/lite/testing/tflite_driver_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+#include "tensorflow/lite/testing/tflite_driver.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -26,7 +26,7 @@ using ::testing::ElementsAre;
 TEST(TfliteDriverTest, SimpleTest) {
   std::unique_ptr<TestRunner> runner(new TfLiteDriver(/*use_nnapi=*/false));
 
-  runner->SetModelBaseDir("tensorflow/contrib/lite");
+  runner->SetModelBaseDir("tensorflow/lite");
   runner->LoadModel("testdata/multi_add.bin");
   ASSERT_TRUE(runner->IsValid());
 
diff --git a/tensorflow/contrib/lite/testing/tokenize.cc b/tensorflow/lite/testing/tokenize.cc
similarity index 96%
rename from tensorflow/contrib/lite/testing/tokenize.cc
rename to tensorflow/lite/testing/tokenize.cc
index 2e84ea475cae60b197a243953517f401f77e2e46..bb4753580131adfdf488164358fcf45d99525f65 100644
--- a/tensorflow/contrib/lite/testing/tokenize.cc
+++ b/tensorflow/lite/testing/tokenize.cc
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/testing/tokenize.h"
+#include "tensorflow/lite/testing/tokenize.h"
 #include <istream>
 #include <string>
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/contrib/lite/testing/tokenize.h b/tensorflow/lite/testing/tokenize.h
similarity index 89%
rename from tensorflow/contrib/lite/testing/tokenize.h
rename to tensorflow/lite/testing/tokenize.h
index 819539185168dfbc8ac7782ab42890a230476310..7bd2783337a763cafd4576f8f1e4e4b9e4829c6f 100644
--- a/tensorflow/contrib/lite/testing/tokenize.h
+++ b/tensorflow/lite/testing/tokenize.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZE_H_
-#define TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZE_H_
+#ifndef TENSORFLOW_LITE_TESTING_TOKENIZE_H_
+#define TENSORFLOW_LITE_TESTING_TOKENIZE_H_
 
 #include <istream>
 #include <string>
@@ -39,4 +39,4 @@ void Tokenize(std::istream* input, TokenProcessor* processor);
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZE_H_
+#endif  // TENSORFLOW_LITE_TESTING_TOKENIZE_H_
diff --git a/tensorflow/contrib/lite/testing/tokenize_test.cc b/tensorflow/lite/testing/tokenize_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/testing/tokenize_test.cc
rename to tensorflow/lite/testing/tokenize_test.cc
index 80f44aacca7e90efb3a6c8967c7175eada35734b..302ae589d02c3e8f167a97e6f02459c833d4d65d 100644
--- a/tensorflow/contrib/lite/testing/tokenize_test.cc
+++ b/tensorflow/lite/testing/tokenize_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/testing/tokenize.h"
+#include "tensorflow/lite/testing/tokenize.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
diff --git a/tensorflow/lite/testing/util.h b/tensorflow/lite/testing/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..45751497de47bc30d33bafdf00f389232fef14f8
--- /dev/null
+++ b/tensorflow/lite/testing/util.h
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_UTIL_H_
+#define TENSORFLOW_LITE_TESTING_UTIL_H_
+
+#include <cstdio>
+
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/string.h"
+
+namespace tflite {
+
+// An ErrorReporter that collects error message in a string, in addition
+// to printing to stderr.
+class TestErrorReporter : public ErrorReporter {
+ public:
+  int Report(const char* format, va_list args) override {
+    char buffer[1024];
+    int size = vsnprintf(buffer, sizeof(buffer), format, args);
+    fprintf(stderr, "%s", buffer);
+    error_messages_ += buffer;
+    num_calls_++;
+    return size;
+  }
+
+  void Reset() {
+    num_calls_ = 0;
+    error_messages_.clear();
+  }
+
+  int num_calls() const { return num_calls_; }
+  const string& error_messages() const { return error_messages_; }
+
+ private:
+  int num_calls_ = 0;
+  string error_messages_;
+};
+
+inline void LogToStderr() {
+#ifdef PLATFORM_GOOGLE
+  FLAGS_logtostderr = true;
+#endif
+}
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_UTIL_H_
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..93d41fcae14c8130de87471bdce64edad131c11f
--- /dev/null
+++ b/tensorflow/lite/toco/BUILD
@@ -0,0 +1,473 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_cc",
+    "tf_proto_library_py",
+)
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
+    "tf_copts",
+)
+
+tf_proto_library_cc(
+    name = "types_proto",
+    srcs = ["types.proto"],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_cc(
+    name = "toco_flags_proto",
+    srcs = ["toco_flags.proto"],
+    protodeps = [":types_proto"],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_cc(
+    name = "model_flags_proto",
+    srcs = ["model_flags.proto"],
+    protodeps = [":types_proto"],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_py(
+    name = "types_proto",
+    srcs = [
+        "types.proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_py(
+    name = "toco_flags_proto",
+    srcs = [
+        "toco_flags.proto",
+    ],
+    protodeps = [":types_proto"],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_py(
+    name = "model_flags_proto",
+    srcs = [
+        "model_flags.proto",
+    ],
+    protodeps = [":types_proto"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "tensorflow_core_cc_protos_all",
+    deps = ["//tensorflow/core:protos_all_cc"],
+)
+
+cc_library(
+    name = "runtime",
+    hdrs = [
+        "runtime/common.h",
+        "runtime/types.h",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/lite/kernels/internal:reference_base",
+        "//tensorflow/lite/kernels/internal:types",
+    ],
+)
+
+# :model offers the core data structures representing a model (a.k.a. "graph")
+# for tooling purposes (not needed at inference runtime).
+# That includes the top-level Model structure, and the lower-level Operator,
+# Array, Buffer structures, etc.
+cc_library(
+    name = "model",
+    hdrs = [
+        "model.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model_flags_proto_cc",
+        ":runtime",
+        ":toco_port",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+cc_library(
+    name = "toco_graphviz_dump_options",
+    srcs = [
+        "toco_graphviz_dump_options.cc",
+    ],
+    hdrs = [
+        "toco_graphviz_dump_options.h",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "toco_cmdline_flags",
+    srcs = [
+        "toco_cmdline_flags.cc",
+    ],
+    hdrs = [
+        "toco_cmdline_flags.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model_cmdline_flags",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":types_proto_cc",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+cc_library(
+    name = "model_cmdline_flags",
+    srcs = [
+        "model_cmdline_flags.cc",
+    ],
+    hdrs = [
+        "args.h",
+        "model_cmdline_flags.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model_flags_proto_cc",
+        ":toco_graphviz_dump_options",
+        ":toco_port",
+        ":types_proto_cc",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "toco_port",
+    srcs = [
+        "toco_port.cc",
+    ],
+    hdrs = [
+        "format_port.h",
+        "toco_port.h",
+        "toco_types.h",
+    ],
+    deps = [
+        # Placeholder for internal file dependency.
+        "@protobuf_archive//:protobuf_headers",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "graph_transformations",
+    srcs = [
+        "graph_transformations/convert_expanddims_to_reshape.cc",
+        "graph_transformations/convert_pure_conv_to_depthwise.cc",
+        "graph_transformations/convert_reorder_axes.cc",
+        "graph_transformations/convert_squeeze_to_reshape.cc",
+        "graph_transformations/convert_trivial_addn_to_add.cc",
+        "graph_transformations/convert_trivial_pack_to_reshape.cc",
+        "graph_transformations/convert_trivial_tile_to_concat.cc",
+        "graph_transformations/convert_trivial_transpose_to_reshape.cc",
+        "graph_transformations/create_im2col_arrays.cc",
+        "graph_transformations/dequantize.cc",
+        "graph_transformations/drop_fake_quant.cc",
+        "graph_transformations/drop_im2col_arrays.cc",
+        "graph_transformations/ensure_bias_vectors.cc",
+        "graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc",
+        "graph_transformations/fuse_activation_functions.cc",
+        "graph_transformations/fuse_binary_into_following_affine.cc",
+        "graph_transformations/fuse_binary_into_preceding_affine.cc",
+        "graph_transformations/fuse_broadcast_into_following_binary.cc",
+        "graph_transformations/graph_transformations.cc",
+        "graph_transformations/hardcode_min_max.cc",
+        "graph_transformations/identify_dilated_conv.cc",
+        "graph_transformations/identify_l2_normalization.cc",
+        "graph_transformations/identify_l2_pool.cc",
+        "graph_transformations/identify_lstm.cc",
+        "graph_transformations/identify_lstm_merge_inputs.cc",
+        "graph_transformations/identify_lstm_split_inputs.cc",
+        "graph_transformations/identify_prelu.cc",
+        "graph_transformations/identify_relu1.cc",
+        "graph_transformations/lstm_utils.cc",
+        "graph_transformations/make_initial_dequantize_operator.cc",
+        "graph_transformations/merge_reshape_into_preceding_transpose.cc",
+        "graph_transformations/move_binary_operator_before_reshape.cc",
+        "graph_transformations/propagate_activation_function_into_constants.cc",
+        "graph_transformations/propagate_array_data_types.cc",
+        "graph_transformations/propagate_default_min_max.cc",
+        "graph_transformations/propagate_fake_quant_num_bits.cc",
+        "graph_transformations/propagate_fixed_sizes.cc",
+        "graph_transformations/quantization_util.cc",
+        "graph_transformations/quantization_util.h",
+        "graph_transformations/quantize.cc",
+        "graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc",
+        "graph_transformations/remove_final_dequantize_op.cc",
+        "graph_transformations/remove_tensorflow_assert.cc",
+        "graph_transformations/remove_tensorflow_identity.cc",
+        "graph_transformations/remove_trivial_binary.cc",
+        "graph_transformations/remove_trivial_concatenation.cc",
+        "graph_transformations/remove_trivial_concatenation_input.cc",
+        "graph_transformations/remove_trivial_fake_quant.cc",
+        "graph_transformations/remove_trivial_passthrough.cc",
+        "graph_transformations/remove_trivial_passthrough.h",
+        "graph_transformations/remove_trivial_quantized_activation_func.cc",
+        "graph_transformations/remove_trivial_quantized_min_max.cc",
+        "graph_transformations/remove_trivial_reshape.cc",
+        "graph_transformations/remove_trivial_slice.cc",
+        "graph_transformations/remove_unused_op.cc",
+        "graph_transformations/reorder_elementwise_unary.cc",
+        "graph_transformations/reorder_reshape_transpose.cc",
+        "graph_transformations/resolve_batch_normalization.cc",
+        "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
+        "graph_transformations/resolve_constant_binary.cc",
+        "graph_transformations/resolve_constant_concatenation.cc",
+        "graph_transformations/resolve_constant_fake_quant.cc",
+        "graph_transformations/resolve_constant_fill.cc",
+        "graph_transformations/resolve_constant_gather.cc",
+        "graph_transformations/resolve_constant_pack.cc",
+        "graph_transformations/resolve_constant_random_uniform.cc",
+        "graph_transformations/resolve_constant_range.cc",
+        "graph_transformations/resolve_constant_reshape.cc",
+        "graph_transformations/resolve_constant_select.cc",
+        "graph_transformations/resolve_constant_shape_or_rank.cc",
+        "graph_transformations/resolve_constant_slice.cc",
+        "graph_transformations/resolve_constant_strided_slice.cc",
+        "graph_transformations/resolve_constant_tile.cc",
+        "graph_transformations/resolve_constant_transpose.cc",
+        "graph_transformations/resolve_constant_unary.cc",
+        "graph_transformations/resolve_fake_quant_args_from_vars.cc",
+        "graph_transformations/resolve_gather_attributes.cc",
+        "graph_transformations/resolve_multiply_by_zero.cc",
+        "graph_transformations/resolve_pad_attributes.cc",
+        "graph_transformations/resolve_padv2_attributes.cc",
+        "graph_transformations/resolve_reduce_attributes.cc",
+        "graph_transformations/resolve_reorder_axes.cc",
+        "graph_transformations/resolve_reshape_attributes.cc",
+        "graph_transformations/resolve_slice_attributes.cc",
+        "graph_transformations/resolve_space_to_batch_nd_attributes.cc",
+        "graph_transformations/resolve_squeeze_attributes.cc",
+        "graph_transformations/resolve_strided_slice_attributes.cc",
+        "graph_transformations/resolve_tensorflow_concat.cc",
+        "graph_transformations/resolve_tensorflow_matmul.cc",
+        "graph_transformations/resolve_tensorflow_merge.cc",
+        "graph_transformations/resolve_tensorflow_switch.cc",
+        "graph_transformations/resolve_transpose_attributes.cc",
+        "graph_transformations/shuffle_fc_weights.cc",
+        "graph_transformations/unfuse_activation_functions.cc",
+        "graph_transformations/unpartition_embedding_lookup.cc",
+        "graph_transformations/unroll_batch_matmul.cc",
+    ],
+    hdrs = [
+        "graph_transformations/graph_transformations.h",
+        "graph_transformations/lstm_utils.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_flags_proto_cc",
+        ":runtime",
+        ":toco_port",
+        ":tooling_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite/kernels/internal:quantization_util",
+        "//tensorflow/lite/kernels/internal:strided_slice_logic",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+# :toco_tooling is the library providing the offline tooling functionality
+# exposed by the :toco command-line tool.
+cc_library(
+    name = "toco_tooling",
+    srcs = [
+        "allocate_transient_arrays.cc",
+        "export_tensorflow.cc",
+        "import_tensorflow.cc",
+        "tensorflow_util.cc",
+        "toco_tooling.cc",
+    ],
+    hdrs = [
+        "allocate_transient_arrays.h",
+        "export_tensorflow.h",
+        "import_tensorflow.h",
+        "tensorflow_util.h",
+        "toco_tooling.h",
+    ],
+    copts = tf_copts() + select({
+        "//tensorflow:darwin": ["-DTOCO_SUPPORT_PORTABLE_PROTOS=0"],
+        "//conditions:default": [],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_transformations",
+        ":model",
+        ":model_flags_proto_cc",
+        ":types_proto_cc",
+        ":runtime",
+        ":toco_graphviz_dump_options",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":tooling_util",
+        "@protobuf_archive//:protobuf_headers",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/lite/toco/tensorflow_graph_matching:resolve_cluster",
+        "//tensorflow/lite/toco/tflite:export",
+        "//tensorflow/lite/toco/tflite:import",
+    ] + select({
+        # Placeholder for internal darwin rule.
+        "//conditions:default": [],
+    }),
+)
+
+tf_cc_test(
+    name = "import_tensorflow_test",
+    srcs = ["import_tensorflow_test.cc"],
+    deps = [
+        ":toco_tooling",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "tooling_util",
+    srcs = [
+        "dump_graphviz.cc",
+        "tooling_util.cc",
+    ],
+    hdrs = [
+        "dump_graphviz.h",
+        "tooling_util.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_flags_proto_cc",
+        ":runtime",
+        ":toco_flags_proto_cc",
+        ":toco_graphviz_dump_options",
+        ":toco_port",
+        ":types_proto_cc",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite/kernels/internal:types",
+        "@com_google_absl//absl/strings",
+        "@com_googlesource_code_re2//:re2",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+tf_cc_test(
+    name = "tooling_util_test",
+    srcs = ["tooling_util_test.cc"],
+    deps = [
+        ":model",
+        ":tooling_util",
+        "//tensorflow/core:lib",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+# :toco is the main public command-line tool exposing the functionality
+# of the :toco_tooling library.
+cc_library(
+    name = "toco_convert",
+    srcs = ["toco_convert.cc"],
+    hdrs = ["toco_convert.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_cmdline_flags",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":toco_tooling",
+        ":types_proto_cc",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:lib",
+        # We cannot embed the core:ops dependency directly into :toco_tooling as
+        # it can conflict with downstream deps when toco is used as a library.
+        "//tensorflow/core:ops",
+    ],
+)
+
+tf_cc_binary(
+    name = "toco",
+    srcs = ["toco.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_cmdline_flags",
+        ":toco_convert",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":toco_tooling",
+        ":types_proto_cc",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:lib",
+        # We cannot embed the core:ops dependency directly into :toco_tooling as
+        # it can conflict with downstream deps when toco is used as a library.
+        "//tensorflow/core:ops",
+    ],
+)
+
+tf_cc_test(
+    name = "toco_convert_test",
+    srcs = ["toco_convert_test.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_cmdline_flags",
+        ":toco_convert",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":toco_tooling",
+        ":types_proto_cc",
+        "@com_google_googletest//:gtest_main",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:lib",
+        # We cannot embed the core:ops dependency directly into :toco_tooling as
+        # it can conflict with downstream deps when toco is used as a library.
+        "//tensorflow/core:ops",
+    ],
+)
+
+tf_cc_test(
+    name = "toco_port_test",
+    srcs = ["toco_port_test.cc"],
+    data = [
+        "toco_port_test.cc",
+    ],
+    deps = [
+        ":toco_port",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/toco/README.md b/tensorflow/lite/toco/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fe98a90b38583a368b02bd4b422943f6f6b16c9b
--- /dev/null
+++ b/tensorflow/lite/toco/README.md
@@ -0,0 +1,29 @@
+# TensorFlow Lite Converter
+
+The TensorFlow Lite Converter converts TensorFlow graphs into
+TensorFlow Lite graphs. There are additional usages that are also detailed in
+the usage documentation.
+
+## Usage documentation
+
+Usage information is given in these documents:
+
+*   [Command-line glossary](../g3doc/convert/cmdline_reference.md)
+*   [Command-line examples](../g3doc/convert/cmdline_examples.md)
+*   [Python API examples](../g3doc/convert/python_api.md)
+
+## Where the converter fits in the TensorFlow landscape
+
+Once an application developer has a trained TensorFlow model, the TensorFlow
+Lite Converter will accept
+that model and generate a TensorFlow Lite
+[FlatBuffer](https://google.github.io/flatbuffers/) file. The converter currently supports
+[SavedModels](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators),
+frozen graphs (models generated via
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)),
+and `tf.Keras` model files.  The TensorFlow Lite FlatBuffer file can be shipped
+to client devices, generally mobile devices, where the TensorFlow Lite
+interpreter handles them on-device.  This flow is represented in the diagram
+below.
+
+![drawing](../g3doc/images/convert/workflow.svg)
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/lite/toco/allocate_transient_arrays.cc
similarity index 98%
rename from tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
rename to tensorflow/lite/toco/allocate_transient_arrays.cc
index 18c904c6d4e8ad45420d507326d7948e1c296596..3ec53c9c2d63ee5e2a79ac0cbe87c2b0f3925cd0 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/lite/toco/allocate_transient_arrays.cc
@@ -20,10 +20,10 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/allocate_transient_arrays.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/allocate_transient_arrays.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.h b/tensorflow/lite/toco/allocate_transient_arrays.h
similarity index 87%
rename from tensorflow/contrib/lite/toco/allocate_transient_arrays.h
rename to tensorflow/lite/toco/allocate_transient_arrays.h
index 59d8ada1e9bb985f2eaa7ff6d29bc4f1b054a070..5d43d4cc3fa8029061e774b74120ee27c5a1f5e8 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.h
+++ b/tensorflow/lite/toco/allocate_transient_arrays.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#ifndef TENSORFLOW_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#define TENSORFLOW_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
 
-#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/lite/toco/model.h"
 
 namespace toco {
 
@@ -41,4 +41,4 @@ void AllocateTransientArrays(Model* model,
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#endif  // TENSORFLOW_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/lite/toco/args.h
similarity index 96%
rename from tensorflow/contrib/lite/toco/args.h
rename to tensorflow/lite/toco/args.h
index 2699ac76e1d2c3416d3694211220d7ffc7e02c16..188f2f7e7af61c6c9e94da42d528d3fcff4b5e39 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/lite/toco/args.h
@@ -15,20 +15,20 @@ limitations under the License.
 // This abstracts command line arguments in toco.
 // Arg<T> is a parseable type that can register a default value, be able to
 // parse itself, and keep track of whether it was specified.
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+#ifndef TENSORFLOW_LITE_TOCO_ARGS_H_
+#define TENSORFLOW_LITE_TOCO_ARGS_H_
 
 #include <functional>
 #include <unordered_map>
 #include <vector>
-#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_port.h"
 #if defined(PLATFORM_GOOGLE)
 #include "strings/split.h"
 #include "strings/strip.h"
 #endif
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
-#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/lite/toco/toco_types.h"
 
 namespace toco {
 
@@ -248,10 +248,10 @@ struct ParsedTocoFlags {
   Arg<int64> dedupe_array_min_size_bytes = Arg<int64>(64);
   Arg<bool> split_tflite_lstm_inputs = Arg<bool>(true);
   // WARNING: Experimental interface, subject to change
-  Arg<bool> allow_flex_ops = Arg<bool>(false);
+  Arg<bool> enable_select_tf_ops = Arg<bool>(false);
   // WARNING: Experimental interface, subject to change
-  Arg<bool> force_flex_ops = Arg<bool>(false);
+  Arg<bool> force_select_tf_ops = Arg<bool>(false);
 };
 
 }  // namespace toco
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+#endif  // TENSORFLOW_LITE_TOCO_ARGS_H_
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/lite/toco/dump_graphviz.cc
similarity index 97%
rename from tensorflow/contrib/lite/toco/dump_graphviz.cc
rename to tensorflow/lite/toco/dump_graphviz.cc
index 30525efd2391bb63afd7035b8134e5858add45f2..8896893f3579abcefa87e3411f9b186ca7a45a1b 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/lite/toco/dump_graphviz.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/dump_graphviz.h"
+#include "tensorflow/lite/toco/dump_graphviz.h"
 
 #include <cmath>
 #include <memory>
@@ -20,11 +20,11 @@ limitations under the License.
 
 #include "absl/strings/str_replace.h"
 #include "absl/strings/strip.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
-#include "tensorflow/contrib/lite/toco/toco_port.h"
-#include "tensorflow/contrib/lite/toco/toco_types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/toco_graphviz_dump_options.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 using toco::port::AppendF;
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.h b/tensorflow/lite/toco/dump_graphviz.h
similarity index 78%
rename from tensorflow/contrib/lite/toco/dump_graphviz.h
rename to tensorflow/lite/toco/dump_graphviz.h
index ea5a4031c39580be00130a2fd3a89c61da2acf01..9697bd6f0dc434aaf98762698c64fb60cb97f2ee 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.h
+++ b/tensorflow/lite/toco/dump_graphviz.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#ifndef TENSORFLOW_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#define TENSORFLOW_LITE_TOCO_DUMP_GRAPHVIZ_H_
 
 #include <string>
 
-#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/lite/toco/model.h"
 
 namespace toco {
 
@@ -25,4 +25,4 @@ void DumpGraphviz(const Model& model, string* output_file_contents);
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#endif  // TENSORFLOW_LITE_TOCO_DUMP_GRAPHVIZ_H_
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/lite/toco/export_tensorflow.cc
similarity index 95%
rename from tensorflow/contrib/lite/toco/export_tensorflow.cc
rename to tensorflow/lite/toco/export_tensorflow.cc
index 61e9106783f644d0555f71243e5f082ae1950a47..9fff0015527ebadf501f571bdd5ed0a7643d66e0 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/lite/toco/export_tensorflow.cc
@@ -22,11 +22,11 @@ limitations under the License.
 #include "google/protobuf/text_format.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/string_view.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_util.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tensorflow_util.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 using tensorflow::DT_BOOL;
+using tensorflow::DT_COMPLEX64;
 using tensorflow::DT_FLOAT;
 using tensorflow::DT_INT16;
 using tensorflow::DT_INT32;
@@ -47,7 +48,8 @@ using tensorflow::TensorProto;
 namespace toco {
 namespace {
 
-tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type) {
+tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type,
+                                           const string& error_location) {
   switch (data_type) {
     case ArrayDataType::kBool:
       return tensorflow::DT_BOOL;
@@ -61,16 +63,25 @@ tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type) {
       return tensorflow::DT_INT64;
     case ArrayDataType::kString:
       return tensorflow::DT_STRING;
+    case ArrayDataType::kComplex64:
+      return tensorflow::DT_COMPLEX64;
     default:
     case ArrayDataType::kNone:
-      LOG(FATAL) << "Unsupported data type: " << static_cast<int>(data_type);
+      LOG(FATAL) << "Unsupported data type '" << ArrayDataTypeName(data_type)
+                 << "' in " << error_location;
       return tensorflow::DT_INVALID;
   }
 }
 
+tensorflow::DataType GetTensorFlowDataTypeForOp(ArrayDataType data_type,
+                                                const string& op_name) {
+  return GetTensorFlowDataType(data_type, "op '" + op_name + "'");
+}
+
 tensorflow::DataType GetTensorFlowDataType(const Model& model,
                                            const string& array_name) {
-  return GetTensorFlowDataType(model.GetArray(array_name).data_type);
+  return GetTensorFlowDataType(model.GetArray(array_name).data_type,
+                               "array '" + array_name + "'");
 }
 
 // TensorFlow sometimes forbids what it calls "legacy scalars",
@@ -159,16 +170,9 @@ void ConvertFloatTensorConst(const string& name, const Shape& input_shape,
                              AxesOrder input_axes_order,
                              AxesOrder output_axes_order,
                              GraphDef* tensorflow_graph) {
-  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
-    return;
-  }
-  tensorflow::NodeDef* const_op = tensorflow_graph->add_node();
-  const_op->set_op("Const");
-  const_op->set_name(name);
-  (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
-  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
-  ExportFloatArray(input_axes_order, input_shape, input_data, output_axes_order,
-                   tensor, LegacyScalarPolicy::kAvoidLegacyScalars);
+  ConvertFloatTensorConst(name, input_shape, input_data, input_axes_order,
+                          output_axes_order, tensorflow_graph,
+                          LegacyScalarPolicy::kAvoidLegacyScalars);
 }
 
 void ConvertFloatTensorConst(const Model& model, const string& name,
@@ -178,11 +182,6 @@ void ConvertFloatTensorConst(const Model& model, const string& name,
   if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
     return;
   }
-  tensorflow::NodeDef* const_op = tensorflow_graph->add_node();
-  const_op->set_op("Const");
-  const_op->set_name(name);
-  (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
-  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
   CHECK(model.HasArray(name));
   const auto& input_array = model.GetArray(name);
   const auto& input_shape = input_array.shape();
@@ -190,8 +189,8 @@ void ConvertFloatTensorConst(const Model& model, const string& name,
   CHECK(input_array.buffer->type == ArrayDataType::kFloat);
   const float* input_data =
       input_array.GetBuffer<ArrayDataType::kFloat>().data.data();
-  ExportFloatArray(input_axes_order, input_shape, input_data, output_axes_order,
-                   tensor, LegacyScalarPolicy::kAvoidLegacyScalars);
+  ConvertFloatTensorConst(name, input_shape, input_data, input_axes_order,
+                          output_axes_order, tensorflow_graph);
 }
 
 void ConvertFloatTensorConst(const Model& model, const string& name,
@@ -287,6 +286,31 @@ void CreateIntTensorConst(const string& name, const std::vector<int32>& data,
   CHECK_EQ(num_elements, data.size());
 }
 
+void ConvertComplex64TensorConst(const Model& model, const string& name,
+                                 GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  CHECK(model.HasArray(name));
+  const auto& array = model.GetArray(name);
+  tensorflow::NodeDef* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_COMPLEX64);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_COMPLEX64);
+  const auto& data = array.GetBuffer<ArrayDataType::kComplex64>().data;
+  for (auto index : data) {
+    tensor->add_scomplex_val(std::real(index));
+    tensor->add_scomplex_val(std::imag(index));
+  }
+  const auto& array_shape = array.shape();
+  auto* shape = tensor->mutable_tensor_shape();
+  for (int i = 0; i < array_shape.dimensions_count(); i++) {
+    shape->add_dim()->set_size(array_shape.dims(i));
+  }
+}
+
 void CreateMatrixShapeTensorConst(const string& name, int rows, int cols,
                                   GraphDef* tensorflow_graph) {
   if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
@@ -1134,6 +1158,29 @@ void ConvertSplitOperator(const Model& model,
                                   tensorflow_graph);
 }
 
+void ConvertSplitVOperator(const Model& model,
+                           const TensorFlowSplitVOperator& src_op,
+                           GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* split_v_op = tensorflow_graph->add_node();
+  split_v_op->set_op("SplitV");
+  split_v_op->set_name(src_op.outputs[0]);
+  for (const auto& input : src_op.inputs) {
+    *split_v_op->add_input() = input;
+  }
+  (*split_v_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*split_v_op->mutable_attr())["num_split"].set_i(src_op.num_split);
+  const auto& split_dim_array = model.GetArray(src_op.inputs[1]);
+  CHECK(split_dim_array.buffer);
+  CHECK(split_dim_array.data_type == ArrayDataType::kInt32);
+  const auto& split_dim_data =
+      split_dim_array.GetBuffer<ArrayDataType::kInt32>().data;
+  CHECK_EQ(split_dim_data.size(), 1);
+  const int split_dim = split_dim_data[0];
+  CreateDummyConcatDimTensorConst(src_op.inputs[0], split_dim,
+                                  tensorflow_graph);
+}
+
 void ConvertCastOperator(const Model& model, const CastOperator& src_op,
                          GraphDef* tensorflow_graph) {
   tensorflow::NodeDef* cast_op = tensorflow_graph->add_node();
@@ -1269,7 +1316,7 @@ void ConvertRangeOperator(const Model& model, const RangeOperator& src_op,
   *range_op->add_input() = src_op.inputs[1];
   *range_op->add_input() = src_op.inputs[2];
   (*range_op->mutable_attr())["Tidx"].set_type(
-      GetTensorFlowDataType(src_op.dtype));
+      GetTensorFlowDataTypeForOp(src_op.dtype, /*op_name=*/src_op.outputs[0]));
 }
 
 void ConvertPackOperator(const Model& model, const PackOperator& src_op,
@@ -1282,7 +1329,8 @@ void ConvertPackOperator(const Model& model, const PackOperator& src_op,
   }
   (*pack_op->mutable_attr())["axis"].set_i(src_op.axis);
   (*pack_op->mutable_attr())["N"].set_i(src_op.inputs.size());
-  (*pack_op->mutable_attr())["T"].set_type(GetTensorFlowDataType(src_op.dtype));
+  (*pack_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataTypeForOp(src_op.dtype, src_op.outputs[0]));
 }
 
 void ConvertFillOperator(const Model& model, const FillOperator& src_op,
@@ -1311,6 +1359,18 @@ void ConvertFloorDivOperator(const Model& model, const FloorDivOperator& src_op,
       GetTensorFlowDataType(model, src_op.inputs[0]));
 }
 
+void ConvertFloorModOperator(const Model& model, const FloorModOperator& src_op,
+                             GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* floor_mod_op = tensorflow_graph->add_node();
+  floor_mod_op->set_op("FloorMod");
+  floor_mod_op->set_name(src_op.outputs[0]);
+  DCHECK_EQ(src_op.inputs.size(), 2);
+  *floor_mod_op->add_input() = src_op.inputs[0];
+  *floor_mod_op->add_input() = src_op.inputs[1];
+  (*floor_mod_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+}
+
 void ConvertExpandDimsOperator(const Model& model,
                                const ExpandDimsOperator& src_op,
                                GraphDef* tensorflow_graph) {
@@ -1859,7 +1919,7 @@ void ConvertRandomUniformOperator(const Model& model,
       GetTensorFlowDataType(model, src_op.inputs[0]);
   (*new_op->mutable_attr())["T"].set_type(shape_type);
   (*new_op->mutable_attr())["dtype"].set_type(
-      GetTensorFlowDataType(src_op.dtype));
+      GetTensorFlowDataTypeForOp(src_op.dtype, src_op.outputs[0]));
   (*new_op->mutable_attr())["seed"].set_i(src_op.seed);
   (*new_op->mutable_attr())["seed2"].set_i(src_op.seed2);
 }
@@ -2096,6 +2156,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertSplitOperator(model,
                          static_cast<const TensorFlowSplitOperator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kSplitV) {
+    ConvertSplitVOperator(model,
+                          static_cast<const TensorFlowSplitVOperator&>(src_op),
+                          tensorflow_graph);
   } else if (src_op.type == OperatorType::kFakeQuant) {
     ConvertFakeQuantOperator(static_cast<const FakeQuantOperator&>(src_op),
                              tensorflow_graph);
@@ -2197,6 +2261,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kFloorDiv) {
     ConvertFloorDivOperator(model, static_cast<const FloorDivOperator&>(src_op),
                             tensorflow_graph);
+  } else if (src_op.type == OperatorType::kFloorMod) {
+    ConvertFloorModOperator(model, static_cast<const FloorModOperator&>(src_op),
+                            tensorflow_graph);
   } else if (src_op.type == OperatorType::kExpandDims) {
     ConvertExpandDimsOperator(model,
                               static_cast<const ExpandDimsOperator&>(src_op),
@@ -2289,6 +2356,9 @@ void AddPlaceholder(const string& name, ArrayDataType type,
     case ArrayDataType::kInt16:
       (*placeholder->mutable_attr())["dtype"].set_type(DT_INT16);
       break;
+    case ArrayDataType::kComplex64:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_COMPLEX64);
+      break;
     default:
       LOG(FATAL) << "Unexpected data type in array \"" << name << "\"";
   }
@@ -2348,6 +2418,9 @@ void ExportTensorFlowGraphDefImplementation(const Model& model,
         case ArrayDataType::kInt32:
           ConvertIntTensorConst(model, array_name, tensorflow_graph);
           break;
+        case ArrayDataType::kComplex64:
+          ConvertComplex64TensorConst(model, array_name, tensorflow_graph);
+          break;
         default:
           break;
       }
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.h b/tensorflow/lite/toco/export_tensorflow.h
similarity index 79%
rename from tensorflow/contrib/lite/toco/export_tensorflow.h
rename to tensorflow/lite/toco/export_tensorflow.h
index d7310bb75f258cde25236da2a9269f18234784e4..09c966ded621d4331bf9eb3e5bb82d1ea911fe0c 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.h
+++ b/tensorflow/lite/toco/export_tensorflow.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#ifndef TENSORFLOW_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#define TENSORFLOW_LITE_TOCO_EXPORT_TENSORFLOW_H_
 
 #include <string>
-#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/lite/toco/model.h"
 
 namespace toco {
 
@@ -26,4 +26,4 @@ void EncodeConstantArraysMinMaxByWrappingThemInFakeQuantNodes(Model* model);
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#endif  // TENSORFLOW_LITE_TOCO_EXPORT_TENSORFLOW_H_
diff --git a/tensorflow/contrib/lite/toco/format_port.h b/tensorflow/lite/toco/format_port.h
similarity index 92%
rename from tensorflow/contrib/lite/toco/format_port.h
rename to tensorflow/lite/toco/format_port.h
index 44e668457152376fd8b2e2fa063301468090c3f0..69833d965c57d3941d6685dc0d9d2a4b90d2b98d 100644
--- a/tensorflow/contrib/lite/toco/format_port.h
+++ b/tensorflow/lite/toco/format_port.h
@@ -16,10 +16,10 @@ limitations under the License.
 // and absl::StrAppendFormat. Unfortunately, type safety is not as good as a
 // a full C++ example.
 // TODO(aselle): When absl adds support for StrFormat, use that instead.
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+#ifndef TENSORFLOW_LITE_TOCO_FORMAT_PORT_H_
+#define TENSORFLOW_LITE_TOCO_FORMAT_PORT_H_
 
-#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/lite/toco/toco_types.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace toco {
@@ -74,4 +74,4 @@ inline string StringF(const char* fmt, Args&&... args) {
 }  // namespace port
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+#endif  // TENSORFLOW_LITE_TOCO_FORMAT_PORT_H_
diff --git a/tensorflow/lite/toco/g3doc/README.md b/tensorflow/lite/toco/g3doc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e1be8fab3ad39e682a942490bf4af2674c3cf9b3
--- /dev/null
+++ b/tensorflow/lite/toco/g3doc/README.md
@@ -0,0 +1,3 @@
+# TOCO
+
+These files have moved to [../../g3doc/convert](../../g3doc/convert)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc b/tensorflow/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
similarity index 83%
rename from tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
rename to tensorflow/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
index 310a88484c246b8035aa73b5e04ad677d575e4c4..e3b0de5555729190617ce867a906da6786b2634a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
+++ b/tensorflow/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
@@ -18,17 +18,20 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ConvertExpandDimsToReshape::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ConvertExpandDimsToReshape::Run(Model* model,
+                                                     std::size_t op_index,
+                                                     bool* modified) {
+  *modified = false;
   auto expand_it = model->operators.begin() + op_index;
   if (expand_it->get()->type != OperatorType::kExpandDims) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   ExpandDimsOperator* expand_op =
       static_cast<ExpandDimsOperator*>(expand_it->get());
@@ -38,18 +41,18 @@ bool ConvertExpandDimsToReshape::Run(Model* model, std::size_t op_index) {
   const auto& input_array = model->GetArray(expand_op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto& axis_array = model->GetArray(expand_op->inputs[1]);
   if (!axis_array.has_shape()) {
     // Yield until input axis array shape has been resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1);
   if (!axis_array.buffer) {
     // Yield until the input axis array is constant
-    return false;
+    return ::tensorflow::Status::OK();
   }
   int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
   std::vector<int> reshape_dims(input_array.shape().dims());
@@ -90,7 +93,8 @@ bool ConvertExpandDimsToReshape::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(expand_it->get(), expand_op);
   model->operators.erase(expand_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
similarity index 84%
rename from tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
rename to tensorflow/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
index e88839be5d43670dec45d3a5da5e1d6b9000ac63..a707a906a815cd2ba12306daeab5d20fedd7ca88 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
+++ b/tensorflow/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -17,36 +17,39 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ConvertPureConvToDepthwise::Run(Model* model,
+                                                     std::size_t op_index,
+                                                     bool* modified) {
+  *modified = false;
   auto conv_it = model->operators.begin() + op_index;
   if (conv_it->get()->type != OperatorType::kConv) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto* conv_op = static_cast<ConvOperator*>(conv_it->get());
   if (conv_op->stride_width != conv_op->stride_height) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if ((conv_op->dilation_width_factor != 1) ||
       (conv_op->dilation_height_factor != 1)) {
     // Depthwise conv does not support dilation
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto& input_array = model->GetArray(conv_op->inputs[0]);
   if (!input_array.has_shape()) {
     // Shapes not propagated yet
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (input_array.shape().dims(3) != 1) {
     // Not a pure convolution: Conv does accumulation across the depth
     // dimension.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto& weights_name = conv_op->inputs[1];
@@ -56,15 +59,15 @@ bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
         "Not changing %s to DepthwiseConv because the weights is consumed by "
         "another op.",
         LogName(*conv_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto& weights_array = model->GetArray(weights_name);
   if (!weights_array.buffer) {
     // Yield until the weights are resolved as a constant array.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (weights_array.data_type != ArrayDataType::kFloat) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // At this point we know we have a pure conv. Rewrite it as DepthwiseConv.
   AddMessageF(
@@ -112,7 +115,8 @@ bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
   }
   *weights_array.mutable_shape()->mutable_dims() = {1, width, height, depth};
   weights_buffer.data = depthwise_conv_weights_data;
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc b/tensorflow/lite/toco/graph_transformations/convert_reorder_axes.cc
similarity index 90%
rename from tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc
rename to tensorflow/lite/toco/graph_transformations/convert_reorder_axes.cc
index 0d274fc687c8d42d47ddb5beb4f9c6f39b417097..b4cd4635982fd4eb22e1b4dff74d0eae391ceaef 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_reorder_axes.cc
+++ b/tensorflow/lite/toco/graph_transformations/convert_reorder_axes.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -86,9 +86,12 @@ TransposeOperator* CreateTransposeFromReorderAxes(
 
 // Converts ReorderAxes into Transpose and Reshape which are compatible with the
 // TFLite interpreter.
-bool ConvertReorderAxes::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ConvertReorderAxes::Run(Model* model, std::size_t op_index,
+                                             bool* modified) {
+  *modified = false;
   auto reorder_it = model->operators.begin() + op_index;
-  if (reorder_it->get()->type != OperatorType::kReorderAxes) return false;
+  if (reorder_it->get()->type != OperatorType::kReorderAxes)
+    return ::tensorflow::Status::OK();
 
   auto* reorder_op = static_cast<ReorderAxesOperator*>(reorder_it->get());
   CHECK_EQ(reorder_op->inputs.size(), 1);
@@ -113,8 +116,9 @@ bool ConvertReorderAxes::Run(Model* model, std::size_t op_index) {
   // Yield if input array contains constants or if output array size has not
   // been adjusted to reflect the permutations in ReorderAxes. ReorderAxes will
   // be merged into a constant array when possible.
-  if (IsConstantParameterArray(*model, constant_input_array_name)) return false;
-  if (!output_array.has_shape()) return false;
+  if (IsConstantParameterArray(*model, constant_input_array_name))
+    return ::tensorflow::Status::OK();
+  if (!output_array.has_shape()) return ::tensorflow::Status::OK();
 
   const auto input_axes_order = reorder_op->input_axes_order;
   const auto output_axes_order = reorder_op->output_axes_order;
@@ -143,7 +147,8 @@ bool ConvertReorderAxes::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(reorder_it->get(), reorder_op);
   model->operators.erase(reorder_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_squeeze_to_reshape.cc b/tensorflow/lite/toco/graph_transformations/convert_squeeze_to_reshape.cc
similarity index 81%
rename from tensorflow/contrib/lite/toco/graph_transformations/convert_squeeze_to_reshape.cc
rename to tensorflow/lite/toco/graph_transformations/convert_squeeze_to_reshape.cc
index 81cedb5dad751aacbbb32326db73de386aba282d..52aaefb3d74e4e1d17bbb6a11838d2bd17d29d24 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_squeeze_to_reshape.cc
+++ b/tensorflow/lite/toco/graph_transformations/convert_squeeze_to_reshape.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -30,10 +30,13 @@ namespace toco {
 // means that the data layout will never change with this op, just the shape.
 // By converting these to reshapes once we have run shape propagation we allow
 // standard reshape optimization transforms to do their magic.
-bool ConvertSqueezeToReshape::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ConvertSqueezeToReshape::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   auto squeeze_it = model->operators.begin() + op_index;
   if (squeeze_it->get()->type != OperatorType::kSqueeze) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto squeeze_op = static_cast<SqueezeOperator*>(squeeze_it->get());
   CHECK_EQ(squeeze_op->inputs.size(), 1);
@@ -42,16 +45,16 @@ bool ConvertSqueezeToReshape::Run(Model* model, std::size_t op_index) {
   const auto& input_array = model->GetArray(squeeze_op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (input_array.shape().dimensions_count() == 0) {
     // Input array cannot be 0-D.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (!model->HasArray(squeeze_op->outputs[0]) ||
       !model->GetArray(squeeze_op->outputs[0]).has_shape()) {
     // Yield until shape propagation has set the output shape for us.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // We use the output shape that has been calculated by shape propagation.
@@ -59,7 +62,7 @@ bool ConvertSqueezeToReshape::Run(Model* model, std::size_t op_index) {
 
   // Empty shapes will not work as empty data arrays.
   if (output_shape.dimensions_count() == 0) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   auto* reshape_op = new TensorFlowReshapeOperator;
@@ -79,7 +82,8 @@ bool ConvertSqueezeToReshape::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(squeeze_it->get(), squeeze_op);
   model->operators.erase(squeeze_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc b/tensorflow/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
similarity index 77%
rename from tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
rename to tensorflow/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
index dcaaddbf3b5409f0fc3ddaf32e23b1e5eefb6565..130fe58a9d13ff4c3f79874200dcc2e2224a5ae0 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
+++ b/tensorflow/lite/toco/graph_transformations/convert_trivial_addn_to_add.cc
@@ -12,18 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
 // This pass will convert an AddN operator with only 2 inputs into a regular Add
 // operator, to which more optimizations may apply.
-bool ConvertTrivialAddNToAdd::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ConvertTrivialAddNToAdd::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   auto addn_it = model->operators.begin() + op_index;
   if (addn_it->get()->type != OperatorType::kAddN) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   AddNOperator* addn_op = static_cast<AddNOperator*>(addn_it->get());
   CHECK_GE(addn_op->inputs.size(), 2);
@@ -31,7 +34,7 @@ bool ConvertTrivialAddNToAdd::Run(Model* model, std::size_t op_index) {
 
   // We only reduce AddN with N=2 to a regular Add.
   if (addn_op->inputs.size() != 2) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Copy inputs & outputs to regular Add.
@@ -45,7 +48,8 @@ bool ConvertTrivialAddNToAdd::Run(Model* model, std::size_t op_index) {
   addn_it = add_it + 1;
   CHECK_EQ(addn_it->get(), addn_op);
   model->operators.erase(addn_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_pack_to_reshape.cc b/tensorflow/lite/toco/graph_transformations/convert_trivial_pack_to_reshape.cc
similarity index 80%
rename from tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_pack_to_reshape.cc
rename to tensorflow/lite/toco/graph_transformations/convert_trivial_pack_to_reshape.cc
index 75113a2a8c7c446bd13de8b5c1a8d8ef3cf7fdd6..27c503f5ddd14da6d72f27a8ced4a822917bbdc3 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_pack_to_reshape.cc
+++ b/tensorflow/lite/toco/graph_transformations/convert_trivial_pack_to_reshape.cc
@@ -18,34 +18,37 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ConvertTrivialPackToReshape::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ConvertTrivialPackToReshape::Run(Model* model,
+                                                      std::size_t op_index,
+                                                      bool* modified) {
+  *modified = false;
   auto pack_it = model->operators.begin() + op_index;
   if (pack_it->get()->type != OperatorType::kPack) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* pack_op = static_cast<PackOperator*>(pack_it->get());
   if (pack_op->inputs.size() > 1) {
     // Not trivial.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   CHECK_EQ(pack_op->outputs.size(), 1);
 
   const auto& input_array = model->GetArray(pack_op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until input dims have been resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (input_array.shape().dimensions_count() == 0) {
     // Input array cannot be 0-D.
     // (Unsure if this is TF behavior, but was required to get a test to pass.)
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Converting trivial %s to a reshape", LogName(*pack_op));
@@ -75,7 +78,8 @@ bool ConvertTrivialPackToReshape::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(pack_it->get(), pack_op);
   model->operators.erase(pack_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc b/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc
similarity index 83%
rename from tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc
rename to tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc
index b689be07926ecd9be4cc317735dc88eb90950e13..fb416cabededf53cb3783a41ba38d49e8abe5c58 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc
+++ b/tensorflow/lite/toco/graph_transformations/convert_trivial_tile_to_concat.cc
@@ -14,17 +14,20 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ConvertTrivialTileToConcat::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ConvertTrivialTileToConcat::Run(Model* model,
+                                                     std::size_t op_index,
+                                                     bool* modified) {
+  *modified = false;
   auto tile_it = model->operators.begin() + op_index;
   if (tile_it->get()->type != OperatorType::kTile) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* tile_op = static_cast<TransposeOperator*>(tile_it->get());
 
@@ -34,13 +37,13 @@ bool ConvertTrivialTileToConcat::Run(Model* model, std::size_t op_index) {
   if (!input_array.has_shape() || !multiples_array.has_shape() ||
       !output_array.has_shape()) {
     // Yield until PropagateFixedSizes has been run on this op.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Note: We can assume we have error checked inputs in PropagateFixedSizes.
 
   if (!multiples_array.buffer) {
     // Yield until the multiples is constant.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   std::vector<int32> const& multiples =
       multiples_array.GetBuffer<ArrayDataType::kInt32>().data;
@@ -59,7 +62,7 @@ bool ConvertTrivialTileToConcat::Run(Model* model, std::size_t op_index) {
     // The tile is non-trivial. Good luck.
     AddMessageF("Tile %s is non-trivial (has more than one multiply dimension)",
                 LogName(*tile_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // The tile is like a concat.
@@ -88,7 +91,8 @@ bool ConvertTrivialTileToConcat::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(tile_it->get(), tile_op);
   model->operators.erase(tile_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc b/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
similarity index 86%
rename from tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
rename to tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
index 5a36a90b3841504d6f018832777e50bac95218d7..ae97cef520e0f2ea1bd30ff32832978e9de1a44c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
+++ b/tensorflow/lite/toco/graph_transformations/convert_trivial_transpose_to_reshape.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -48,10 +48,13 @@ bool TransposeAffectsMemoryOrder(std::vector<int> perm,
 
 }  // namespace
 
-bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ConvertTrivialTransposeToReshape::Run(Model* model,
+                                                           std::size_t op_index,
+                                                           bool* modified) {
+  *modified = false;
   auto transpose_it = model->operators.begin() + op_index;
   if (transpose_it->get()->type != OperatorType::kTranspose) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   TransposeOperator* transpose_op =
       static_cast<TransposeOperator*>(transpose_it->get());
@@ -60,14 +63,14 @@ bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
   const auto& output_array = model->GetArray(transpose_op->outputs[0]);
   if (!input_array.has_shape() || !output_array.has_shape()) {
     // Yield until PropagateFixedSizes has been run on this op.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Note: We can assume we have error checked inputs in PropagateFixedSizes.
 
   // Check that the permutation has propogated.
   std::vector<int> const& perm = transpose_op->perm;
   if (perm.empty()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // This transpose is trivial if non-unitary dimensions remain in the same
@@ -76,7 +79,7 @@ bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
   std::vector<int> const& output_dims = output_array.shape().dims();
 
   if (TransposeAffectsMemoryOrder(perm, input_dims)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // This transpose is trivial. Replace it with a Reshape op.
@@ -109,7 +112,8 @@ bool ConvertTrivialTransposeToReshape::Run(Model* model, std::size_t op_index) {
   CHECK_EQ(transpose_it->get(), transpose_op);
   model->operators.erase(transpose_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc b/tensorflow/lite/toco/graph_transformations/create_im2col_arrays.cc
similarity index 82%
rename from tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
rename to tensorflow/lite/toco/graph_transformations/create_im2col_arrays.cc
index 1e68cd678bce6c27f1852a5ae0c13362d8938cdd..8e93bc237897b62ff46228b7f1cc38d961371106 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
+++ b/tensorflow/lite/toco/graph_transformations/create_im2col_arrays.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -73,18 +73,22 @@ bool ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) {
   return true;
 }
 
-bool CreateIm2colArrays::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status CreateIm2colArrays::Run(Model* model, std::size_t op_index,
+                                             bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   auto* op = it->get();
 
   switch (op->type) {
     case OperatorType::kConv:
-      return ProcessConvOperator(model, static_cast<ConvOperator*>(op));
+      *modified = ProcessConvOperator(model, static_cast<ConvOperator*>(op));
+      return ::tensorflow::Status::OK();
     case OperatorType::kTransposeConv:
-      return ProcessTransposeConvOperator(
+      *modified = ProcessTransposeConvOperator(
           model, static_cast<TransposeConvOperator*>(op));
+      return ::tensorflow::Status::OK();
     default:
-      return false;
+      return ::tensorflow::Status::OK();
   }
 }
 
diff --git a/tensorflow/lite/toco/graph_transformations/dequantize.cc b/tensorflow/lite/toco/graph_transformations/dequantize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc5dddbb40e7324a034670ede27e0fc6d652ac6b
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/dequantize.cc
@@ -0,0 +1,230 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+template <ArrayDataType A>
+void DequantizeBuffer(Array* array) {
+  const auto old_data = array->GetBuffer<A>().data;
+  array->buffer = nullptr;
+  array->data_type = ArrayDataType::kFloat;
+  auto& new_data = array->GetMutableBuffer<ArrayDataType::kFloat>().data;
+  new_data.resize(old_data.size());
+  const auto& qparams = array->GetQuantizationParams();
+  for (int i = 0; i < old_data.size(); i++) {
+    new_data[i] = qparams.scale * (old_data[i] - qparams.zero_point);
+  }
+}
+
+std::vector<std::unique_ptr<Operator>>::iterator FindFirstOpWithInput(
+    Model* model, const string& array_name) {
+  for (auto it = model->operators.begin(); it != model->operators.end(); ++it) {
+    for (const auto& input : it->get()->inputs) {
+      if (input == array_name) {
+        return it;
+      }
+    }
+  }
+  return model->operators.end();
+}
+
+void ClearArrayQuantizationParams(const string& array_name, Model* model) {
+  auto* array = &model->GetArray(array_name);
+  CHECK(array->quantization_params);
+  for (auto& input_array : *model->flags.mutable_input_arrays()) {
+    if (input_array.name() == array_name) {
+      auto& qparams = *array->quantization_params;
+      const double new_std_value = 1. / qparams.scale;
+      const double new_mean_value = qparams.zero_point;
+      if (input_array.has_std_value()) {
+        CHECK_LE(std::abs(new_std_value - input_array.std_value()), 0.001);
+      } else {
+        input_array.set_std_value(new_std_value);
+      }
+      if (input_array.has_mean_value()) {
+        CHECK_LE(std::abs(new_mean_value - input_array.mean_value()), 0.001);
+      } else {
+        input_array.set_mean_value(new_mean_value);
+      }
+    }
+  }
+  array->quantization_params = nullptr;
+}
+
+bool DequantizeArray(const string& array_name,
+                     GraphTransformation* transformation, Model* model) {
+  auto* array = &model->GetArray(array_name);
+  if (!array->quantization_params) {
+    return false;
+  }
+  transformation->AddMessageF("Dequantizing array: %s", array_name);
+
+  // Dequantize any buffer
+  if (array->buffer) {
+    if (array->data_type == ArrayDataType::kUint8) {
+      DequantizeBuffer<ArrayDataType::kUint8>(array);
+    } else if (array->data_type == ArrayDataType::kInt32) {
+      DequantizeBuffer<ArrayDataType::kInt32>(array);
+    } else {
+      LOG(FATAL) << "Unhandled data type";
+    }
+    CHECK(array->data_type == ArrayDataType::kFloat);
+    CHECK(array->buffer->type == ArrayDataType::kFloat);
+
+    // Clear quantization params, officially makes this a non-quantized array.
+    ClearArrayQuantizationParams(array_name, model);
+    return true;
+  } else {
+    array->data_type = ArrayDataType::kFloat;
+  }
+
+  // Clear quantization params, officially makes this a non-quantized array.
+  ClearArrayQuantizationParams(array_name, model);
+
+  if (array->buffer) {
+    return true;
+  }
+
+  auto* op_outputting_array = GetOpWithOutput(*model, array_name);
+  if (op_outputting_array) {
+    if (op_outputting_array->type == OperatorType::kReshape) {
+      return true;
+    }
+  }
+
+  // If there was no minmax info, we can return now. Indeed,
+  // the below only serves to create a FakeQuant node, but some arrays are
+  // quantized without MinMax (see the CHECK above) and that corresponds to
+  // places where a FakeQuant node is actually not wanted, because the
+  // quantization params are meant to be inferred in another way (e.g. bias
+  // vector for a Conv op, see their special-casing in quantize.cc).
+  if (!array->minmax) {
+    return true;
+  }
+
+  // Determine whether to insert a FakeQuant before or after
+  // this array.
+  bool must_insert_fakequant_before = false;
+  bool must_insert_fakequant_after = false;
+  if (IsInputArray(*model, array_name)) {
+    must_insert_fakequant_after = true;
+  }
+  for (const string& output_array : model->flags.output_arrays()) {
+    if (array_name == output_array) {
+      must_insert_fakequant_before = true;
+    }
+  }
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    if (array_name == rnn_state.state_array()) {
+      must_insert_fakequant_after = true;
+    }
+    if (array_name == rnn_state.back_edge_source_array()) {
+      must_insert_fakequant_before = true;
+    }
+  }
+  CHECK(!(must_insert_fakequant_before && must_insert_fakequant_after));
+
+  // Create and insert the FakeQuant node
+  auto* fakequant_op = new FakeQuantOperator;
+  model->operators.emplace(FindFirstOpWithInput(model, array_name),
+                           fakequant_op);
+  const string& new_array_name = AvailableArrayName(*model, array_name);
+  auto& new_array = model->GetOrCreateArray(new_array_name);
+  new_array.data_type = ArrayDataType::kFloat;
+  new_array.copy_shape(array->shape());
+  new_array.GetOrCreateMinMax() = array->GetMinMax();
+  fakequant_op->minmax.reset(new MinMax);
+  *fakequant_op->minmax = array->GetMinMax();
+  fakequant_op->narrow_range = array->narrow_range;
+  if (must_insert_fakequant_before) {
+    for (const auto& op : model->operators) {
+      for (string& output : op->outputs) {
+        if (output == array_name) {
+          output = new_array_name;
+        }
+      }
+    }
+    fakequant_op->inputs = {new_array_name};
+    fakequant_op->outputs = {array_name};
+  } else {
+    for (const auto& op : model->operators) {
+      for (string& input : op->inputs) {
+        if (input == array_name) {
+          input = new_array_name;
+        }
+      }
+    }
+    fakequant_op->inputs = {array_name};
+    fakequant_op->outputs = {new_array_name};
+  }
+  return true;
+}
+
+}  // namespace
+
+::tensorflow::Status Dequantize::Run(Model* model, std::size_t op_index,
+                                     bool* modified) {
+  *modified = false;
+  const auto op_it = model->operators.begin() + op_index;
+  auto* op = op_it->get();
+
+  if (op->type == OperatorType::kDequantize) {
+    auto& input_array = model->GetArray(op->inputs[0]);
+    if (input_array.data_type == ArrayDataType::kFloat) {
+      return ::tensorflow::Status::OK();
+    }
+    if (input_array.final_data_type != ArrayDataType::kFloat) {
+      return ::tensorflow::Status::OK();
+    }
+    input_array.data_type = ArrayDataType::kFloat;
+    input_array.quantization_params = nullptr;
+    auto& output_array = model->GetArray(op->outputs[0]);
+    output_array.data_type = ArrayDataType::kFloat;
+    output_array.quantization_params = nullptr;
+    *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+    return ::tensorflow::Status::OK();
+  }
+
+  std::vector<string> arrays;
+  for (const string& input : op->inputs) {
+    arrays.push_back(input);
+  }
+  for (const string& output : op->outputs) {
+    arrays.push_back(output);
+  }
+  bool changed = false;
+  for (const string& array : arrays) {
+    if (!model->IsOptionalArray(array)) {
+      changed |= DequantizeArray(array, this, model);
+    }
+  }
+
+  *modified = changed;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc b/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb8679bced8077dcb5bd0f740db2d361f36c4b49
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/drop_fake_quant.cc
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status DropFakeQuant::Run(Model* model, std::size_t op_index,
+                                        bool* modified) {
+  *modified = false;
+  const auto fakequant_it = model->operators.begin() + op_index;
+  auto* fakequant_base_op = fakequant_it->get();
+  if (fakequant_base_op->type != OperatorType::kFakeQuant) {
+    return ::tensorflow::Status::OK();
+  }
+  auto* fakequant_op = static_cast<FakeQuantOperator*>(fakequant_base_op);
+
+  if (!fakequant_op->minmax) {
+    return ::tensorflow::Status::OK();
+  }
+
+  const auto& output_array = model->GetArray(fakequant_op->outputs[0]);
+  if (!output_array.minmax) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Drop min/max inputs
+  for (int i = 1; i < fakequant_op->inputs.size(); i++) {
+    if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) {
+      model->EraseArray(fakequant_op->inputs[i]);
+    }
+  }
+  fakequant_op->inputs.resize(1);
+
+  *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/drop_im2col_arrays.cc b/tensorflow/lite/toco/graph_transformations/drop_im2col_arrays.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c3c95afd967dc3f8fc17379a36aa67602056caf7
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/drop_im2col_arrays.cc
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status DropIm2colArrays::Run(Model* model, std::size_t op_index,
+                                           bool* modified) {
+  *modified = false;
+  auto conv_it = model->operators.begin() + op_index;
+  if (conv_it->get()->type != OperatorType::kConv) {
+    return ::tensorflow::Status::OK();
+  }
+  auto* conv_op = static_cast<ConvOperator*>(conv_it->get());
+  if (conv_op->outputs.size() < 2) {
+    // Conv op does not have im2col.
+    return ::tensorflow::Status::OK();
+  }
+
+  // Drop the im2col array.
+  CHECK_EQ(conv_op->outputs.size(), 2);
+  model->EraseArray(conv_op->outputs[1]);
+  conv_op->outputs.resize(1);
+  AddMessageF("Dropped an im2col array for %s", LogName(*conv_op));
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc b/tensorflow/lite/toco/graph_transformations/ensure_bias_vectors.cc
similarity index 85%
rename from tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
rename to tensorflow/lite/toco/graph_transformations/ensure_bias_vectors.cc
index e80ed036b311cfc586c40ece410ef6a6432a0cd9..62a4b52bbb877be355d00b65c4bc6e2c159d3452 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
+++ b/tensorflow/lite/toco/graph_transformations/ensure_bias_vectors.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -62,17 +62,20 @@ bool ProcessLinearOperator(Model* model, Operator* op) {
 }
 }  // namespace
 
-bool EnsureBiasVectors::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status EnsureBiasVectors::Run(Model* model, std::size_t op_index,
+                                            bool* modified) {
+  *modified = false;
   auto* op = model->operators[op_index].get();
   if (op->type == OperatorType::kConv ||
       op->type == OperatorType::kDepthwiseConv ||
       op->type == OperatorType::kFullyConnected) {
     if (ProcessLinearOperator(model, op)) {
       AddMessageF("Added bias vector to %s as %s", LogName(*op), op->inputs[2]);
-      return true;
+      *modified = true;
+      return ::tensorflow::Status::OK();
     }
   }
-  return false;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
similarity index 94%
rename from tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
rename to tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
index c13fc0de7502a9edc80dc399354708a5b1b96b02..918bb489995cd34db4bf168e14e104f6f6d096bc 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
+++ b/tensorflow/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -108,8 +108,9 @@ namespace toco {
 // we can foresee these 'fast int8 kernels' to remain important to have into
 // the 2020s.
 //
-bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
-                                                   std::size_t op_index) {
+::tensorflow::Status EnsureUint8WeightsSafeForFastInt8Kernels::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  *modified = false;
   const auto& op = *model->operators[op_index];
   int weights_index = 0;
   switch (op.type) {
@@ -148,16 +149,16 @@ bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
       // That's why at the moment we only handle operators that use a GEMM
       // (Conv, fully-connected --- note that LSTM merely wraps a
       // fully-connected operator).
-      return false;
+      return ::tensorflow::Status::OK();
   }
 
   const string& name = op.inputs[weights_index];
   auto& array = model->GetArray(name);
   if (!array.buffer) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (array.data_type != ArrayDataType::kUint8) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto& buffer_data = array.GetMutableBuffer<ArrayDataType::kUint8>().data;
 
@@ -212,7 +213,8 @@ bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
     AddMessageF("Tweaked weights values for %s", LogName(op));
   }
 
-  return changed;
+  *modified = changed;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/lite/toco/graph_transformations/fuse_activation_functions.cc
similarity index 80%
rename from tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
rename to tensorflow/lite/toco/graph_transformations/fuse_activation_functions.cc
index c5ce3fcd95eb0aaf63dcc7f43b96d8a13ed93929..f467a95f3486639897d6a36f8c9f9dc3ef461e36 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
+++ b/tensorflow/lite/toco/graph_transformations/fuse_activation_functions.cc
@@ -17,35 +17,38 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status FuseActivationFunctions::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   const auto ac_it = model->operators.begin() + op_index;
   const auto* ac_op = ac_it->get();
 
   if (ac_op->type != OperatorType::kRelu6 &&
       ac_op->type != OperatorType::kRelu1 &&
       ac_op->type != OperatorType::kRelu) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Find the op producing the array passed to this activation function
   Operator* op = GetOpWithOutput(*model, ac_op->inputs[0]);
 
-  if (!op) return false;
+  if (!op) return ::tensorflow::Status::OK();
 
   if (CountTrueOutputs(*model, *op) > 1) {
     AddMessageF(
         "Not fusing activation function %s into %s because it has more than "
         "one  consumed output",
         LogName(*ac_op), LogName(*op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(op->outputs[0], ac_op->inputs[0]);
@@ -57,7 +60,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
         "Not fusing activation function into %s because it is consumed by more "
         "than 1 other operator",
         LogName(*ac_op), LogName(*op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!IsDiscardableArray(*model, op->outputs[0])) {
@@ -65,7 +68,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
         "Not fusing activation function %s into %s because output %s it is not "
         "discardable",
         LogName(*ac_op), LogName(*op), op->outputs[0]);
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (op->fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -73,7 +76,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
         "Not fusing activation function %s into %s because it already has a "
         "fused activation function",
         LogName(*ac_op), LogName(*op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!OperatorSupportsFusedActivation(op->type)) {
@@ -81,7 +84,7 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
         "Not fusing activation function %s because the %s op doesn't support "
         "it",
         LogName(*ac_op), LogName(*op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Fusing activation function %s into the preceding %s",
@@ -98,7 +101,8 @@ bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
   model->EraseArray(ac_op->inputs[0]);
   op->outputs[0] = ac_op->outputs[0];
   model->operators.erase(ac_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
similarity index 90%
rename from tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
rename to tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
index dcbbead517f26a227363989b5af2a4040c98ff57..436b639253f2e190fcaab895cd077b06796c1ca1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
+++ b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -150,14 +150,17 @@ void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
 
 }  // namespace
 
-bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status FuseBinaryIntoFollowingAffine::Run(Model* model,
+                                                        std::size_t op_index,
+                                                        bool* modified) {
+  *modified = false;
   const auto binary_it = model->operators.begin() + op_index;
   auto* binary_op = binary_it->get();
   if (binary_op->type != OperatorType::kAdd &&
       binary_op->type != OperatorType::kMul &&
       binary_op->type != OperatorType::kSub &&
       binary_op->type != OperatorType::kDiv) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(binary_op->inputs.size(), 2);
@@ -175,12 +178,12 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
   };
   if (!is_input_constant[0] && !is_input_constant[1]) {
     // Neither input is constant, so nothing we can fuse into a constant.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (is_input_constant[0] && is_input_constant[1]) {
     // Both inputs are constants. That's a job for constants
     // propagation, not for us to handle here.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const int index_of_constant_input = is_input_constant[0] ? 0 : 1;
   const int index_of_variable_input = is_input_constant[0] ? 1 : 0;
@@ -192,7 +195,7 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
     if (index_of_constant_input != 1) {
       AddMessageF("Not fusing %s because the denominator is not constant",
                   LogName(*binary_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
 
@@ -204,7 +207,7 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
           "Not fusing %s into the following affine op, because we only know "
           "how to do so when the constant operand is a scalar",
           LogName(*binary_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
 
@@ -212,16 +215,15 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
       FusedActivationFunctionType::kNone) {
     AddMessageF("Not fusing %s because it has a fused activation function",
                 LogName(*binary_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   Operator* following_op = GetOpWithInput(*model, binary_op->outputs[0]);
 
   if (!following_op) {
-    AddMessageF(
-        "Not fusing %s because it is not consumed by exactly one other op",
-        LogName(*binary_op));
-    return false;
+    AddMessageF("Not fusing %s because it is not consumed by any op",
+                LogName(*binary_op));
+    return ::tensorflow::Status::OK();
   }
 
   if (following_op->type != OperatorType::kConv &&
@@ -231,14 +233,14 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
         "Not fusing %s because the following %s is not of one of the supported "
         "types",
         LogName(*binary_op), LogName(*following_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (following_op->inputs.size() < 3) {
     AddMessageF(
         "Not fusing %s because the following %s does not have a bias vector",
         LogName(*following_op), LogName(*binary_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto& weights = model->GetArray(following_op->inputs[1]);
@@ -248,7 +250,7 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
         "Not fusing %s because the following %s has non-constant weights or "
         "bias arrays",
         LogName(*binary_op), LogName(*following_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Try to fuse the binary params into the following op's params
@@ -260,7 +262,7 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
         AddMessageF(
             "Not fusing %s because the following %s does not use VALID padding",
             LogName(*binary_op), LogName(*following_op));
-        return false;
+        return ::tensorflow::Status::OK();
       }
     }
     if (following_op->type == OperatorType::kDepthwiseConv) {
@@ -269,7 +271,7 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
         AddMessageF(
             "Not fusing %s because the following %s does not use VALID padding",
             LogName(*binary_op), LogName(*following_op));
-        return false;
+        return ::tensorflow::Status::OK();
       }
     }
     FuseAddOrSubParamsIntoFollowingAffine(model, following_op, binary_op,
@@ -285,7 +287,10 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
   AddMessageF("Fusing %s into the following %s", LogName(*binary_op),
               LogName(*following_op));
 
-  model->EraseArray(binary_op->outputs[0]);
+  if (CountOpsWithInput(*model, binary_op->outputs[0]) == 1) {
+    model->EraseArray(binary_op->outputs[0]);
+  }
+
   following_op->inputs[0] = binary_op->inputs[index_of_variable_input];
   const auto& old_constant_param_name =
       binary_op->inputs[index_of_constant_input];
@@ -294,7 +299,8 @@ bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
     model->EraseArray(old_constant_param_name);
   }
   model->operators.erase(binary_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
similarity index 89%
rename from tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
rename to tensorflow/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
index b324631579f9ba6d68db034b62727ec1e17e9a76..a19e51fa9437556ca7be219193ab6f133b682ed4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
+++ b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -188,14 +188,17 @@ void FuseMulOrDivParamsIntoPrecedingAffine(Model* model, Operator* preceding_op,
 }
 }  // namespace
 
-bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status FuseBinaryIntoPrecedingAffine::Run(Model* model,
+                                                        std::size_t op_index,
+                                                        bool* modified) {
+  *modified = false;
   const auto binary_it = model->operators.begin() + op_index;
   const auto* binary_op = binary_it->get();
   if (binary_op->type != OperatorType::kAdd &&
       binary_op->type != OperatorType::kMul &&
       binary_op->type != OperatorType::kSub &&
       binary_op->type != OperatorType::kDiv) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(binary_op->inputs.size(), 2);
@@ -213,12 +216,12 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
   };
   if (!is_input_constant[0] && !is_input_constant[1]) {
     // Neither input is constant, so nothing we can fuse into a constant.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (is_input_constant[0] && is_input_constant[1]) {
     // Both inputs are constants. That's a job for constants
     // propagation, not for us to handle here.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const int index_of_constant_input = is_input_constant[0] ? 0 : 1;
   const int index_of_variable_input = is_input_constant[0] ? 1 : 0;
@@ -230,7 +233,7 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
     if (index_of_constant_input != 1) {
       AddMessageF("Not fusing %s because the denominator is not constant",
                   LogName(*binary_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
 
@@ -239,12 +242,12 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
   if (!preceding_op) {
     AddMessageF("Not fusing %s because it is not the output of another op",
                 LogName(*binary_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   for (const string& output_array : model->flags.output_arrays()) {
     if (preceding_op->outputs[0] == output_array) {
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
 
@@ -255,7 +258,7 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
         "Not fusing %s because the preceding %s is not of one of the supported "
         "types",
         LogName(*binary_op), LogName(*preceding_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (preceding_op->fused_activation_function !=
@@ -264,20 +267,32 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
         "Not fusing %s because the preceding %s has a fused activation "
         "function",
         LogName(*binary_op), LogName(*preceding_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (preceding_op->inputs.size() < 3) {
     AddMessageF(
         "Not fusing %s because the preceding %s does not have a bias vector",
         LogName(*binary_op), LogName(*preceding_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto& weights_name = preceding_op->inputs[1];
   const auto& bias_name = preceding_op->inputs[2];
   const auto& weights = model->GetArray(weights_name);
   const auto& bias = model->GetArray(bias_name);
+
+  if (weights.data_type != ArrayDataType::kFloat ||
+      bias.data_type != ArrayDataType::kFloat) {
+    AddMessageF(
+        "Not fusing %s into preceding %s because one of weights or bias array "
+        "is not float (types are %s and %s)",
+        LogName(*binary_op), LogName(*preceding_op),
+        ArrayDataTypeName(weights.data_type),
+        ArrayDataTypeName(bias.data_type));
+    return ::tensorflow::Status::OK();
+  }
+
   const int count_ops_consuming_bias = CountOpsWithInput(*model, bias_name);
   const int count_ops_consuming_weights =
       CountOpsWithInput(*model, weights_name);
@@ -289,14 +304,14 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
           "Not fusing %s because the preceding %s has a non-constant bias "
           "array",
           LogName(*binary_op), LogName(*preceding_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
     if (count_ops_consuming_bias > 1) {
       AddMessageF(
           "Not fusing %s because the bias of the preceding %s is consumed by "
           "another op",
           LogName(*binary_op), LogName(*preceding_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
   } else {
     if (!weights.buffer || !bias.buffer) {
@@ -304,14 +319,14 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
           "Not fusing %s because the preceding %s has non-constant weights or "
           "bias arrays",
           LogName(*binary_op), LogName(*preceding_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
     if (count_ops_consuming_weights > 1 || count_ops_consuming_bias > 1) {
       AddMessageF(
           "Not fusing %s because the weights or bias of the preceding %s is "
           "consumed by another op",
           LogName(*binary_op), LogName(*preceding_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
 
@@ -323,7 +338,7 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
         "Not fusing %s because the output of the preceding %s is consumed by "
         "another op",
         LogName(*binary_op), LogName(*preceding_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Fusing %s into the preceding %s", LogName(*binary_op),
@@ -352,7 +367,8 @@ bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
     model->EraseArray(old_constant_param_name);
   }
   model->operators.erase(binary_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc b/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc
similarity index 84%
rename from tensorflow/contrib/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc
rename to tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc
index 874d8def571fbce4219de15285c8df6fd2487a9a..ba3e277f676ce85e80f1ac28471928f2c20a7aa4 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc
+++ b/tensorflow/lite/toco/graph_transformations/fuse_broadcast_into_following_binary.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -51,19 +51,22 @@ bool IsBroadcastingOp(const Model& model, Operator* op) {
 // Finds an operation that looks like a broadcast (concat of the same sources
 // along the last dimension) and drops it by relying on the ability of certain
 // binary ops to perform an implicit broadcast.
-bool FuseBroadcastIntoFollowingBinary::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status FuseBroadcastIntoFollowingBinary::Run(Model* model,
+                                                           std::size_t op_index,
+                                                           bool* modified) {
+  *modified = false;
   const auto binary_it = model->operators.begin() + op_index;
   auto* binary_op = binary_it->get();
 
   // Test for binary ops of types that we know how to resolve
   if (binary_op->inputs.size() != 2) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (binary_op->type != OperatorType::kAdd &&
       binary_op->type != OperatorType::kMul &&
       binary_op->type != OperatorType::kSub &&
       binary_op->type != OperatorType::kDiv) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // NOTE: either of these ops may be nullptr if the input array is constant.
@@ -78,14 +81,14 @@ bool FuseBroadcastIntoFollowingBinary::Run(Model* model, std::size_t op_index) {
   if (!is_op_0_broadcast && !is_op_1_broadcast) {
     // Neither input is a broadcast-looking thing.
     AddMessageF("Neither input looks broadcasty");
-    return false;
+    return ::tensorflow::Status::OK();
   } else if (is_op_0_broadcast && is_op_1_broadcast) {
     AddMessageF(
         "Unable to fuse broadcast into %s as both inputs (%s, %s) are "
         "broadcasts",
         LogName(*binary_op), op[0] ? LogName(*op[0]) : "(?)",
         op[1] ? LogName(*op[1]) : "(?)");
-    return false;
+    return ::tensorflow::Status::OK();
   }
   int broadcast_index = is_op_0_broadcast ? 0 : 1;
 
@@ -96,7 +99,8 @@ bool FuseBroadcastIntoFollowingBinary::Run(Model* model, std::size_t op_index) {
   binary_op->inputs[broadcast_index] = op[broadcast_index]->inputs[0];
 
   // We leave the broadcast op in; it'll get cleaned up if it's not used later.
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
similarity index 96%
rename from tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
rename to tensorflow/lite/toco/graph_transformations/graph_transformations.cc
index 6961e23690a5e53643f2b2c52bb62ce395d05c95..a0260e24013bfda8718e0dc04052abb49b65debf 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
 
 #include <algorithm>
 #include <memory>
@@ -21,8 +21,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/toco_port.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -142,7 +142,7 @@ bool GraphTransformationsPass(int increment, Model* model,
     for (const auto& transformation : transformations) {
       CHECK(!changed_now);
       CHECK(transformation->Messages().empty());
-      changed_now = transformation->Run(model, op_index);
+      CHECK(transformation->Run(model, op_index, &changed_now).ok());
       const char* made_a_change_msg =
           changed_now ? "made a change" : "did NOT make a change";
       const int log_level =
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
similarity index 88%
rename from tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
rename to tensorflow/lite/toco/graph_transformations/graph_transformations.h
index 4d213b3f9cb930007096dbdd06b1981e9bab2c32..187b584b6989cc55894160fc5508c13474a1d2d3 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
@@ -12,22 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#ifndef TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#define TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
 
 #include <cstddef>
 #include <initializer_list>
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/toco_port.h"
 
 namespace toco {
 
 class GraphTransformation {
  public:
-  virtual bool Run(Model* model, std::size_t op_index) = 0;
+  virtual ::tensorflow::Status Run(Model* model, std::size_t op_index,
+                                   bool* modified) = 0;
   virtual const char* Name() const = 0;
   virtual ~GraphTransformation() {}
   // Returns the list of messages that this graph transformation
@@ -104,11 +105,12 @@ class GraphTransformationsSet {
 void RunGraphTransformations(Model* model, const string& message,
                              const GraphTransformationsSet& transformations);
 
-#define DECLARE_GRAPH_TRANSFORMATION(GTName)               \
-  class GTName : public GraphTransformation {              \
-   public:                                                 \
-    bool Run(Model* model, std::size_t op_index) override; \
-    const char* Name() const override { return #GTName; }  \
+#define DECLARE_GRAPH_TRANSFORMATION(GTName)                     \
+  class GTName : public GraphTransformation {                    \
+   public:                                                       \
+    ::tensorflow::Status Run(Model* model, std::size_t op_index, \
+                             bool* modified) override;           \
+    const char* Name() const override { return #GTName; }        \
   };
 
 // List of all graph transformations
@@ -137,7 +139,7 @@ DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator)
 DECLARE_GRAPH_TRANSFORMATION(MoveBinaryOperatorBeforeReshape)
 DECLARE_GRAPH_TRANSFORMATION(PropagateActivationFunctionIntoConstants)
 DECLARE_GRAPH_TRANSFORMATION(PropagateArrayDataTypes)
-DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits);
+DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits)
 DECLARE_GRAPH_TRANSFORMATION(PropagateFixedSizes)
 DECLARE_GRAPH_TRANSFORMATION(HardcodeMinMax)
 DECLARE_GRAPH_TRANSFORMATION(Quantize)
@@ -200,7 +202,8 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveGatherAttributes)
 
 class PropagateDefaultMinMax : public GraphTransformation {
  public:
-  bool Run(Model* model, std::size_t op_index) override;
+  ::tensorflow::Status Run(Model* model, std::size_t op_index,
+                           bool* modified) override;
   const char* Name() const override { return "PropagateDefaultMinMax"; }
 
   bool has_any_ranges_defined() const { return !type_ranges_.empty(); }
@@ -218,7 +221,8 @@ class PropagateDefaultMinMax : public GraphTransformation {
 
 class RemoveTrivialReshape : public GraphTransformation {
  public:
-  bool Run(Model* model, std::size_t op_index) override;
+  ::tensorflow::Status Run(Model* model, std::size_t op_index,
+                           bool* modified) override;
   const char* Name() const override { return "RemoveTrivialReshape"; }
   bool treat_expand_dims_as_trivial() const {
     return treat_expand_dims_as_trivial_;
@@ -233,7 +237,8 @@ class RemoveTrivialReshape : public GraphTransformation {
 
 class ResolveConstantFakeQuant : public GraphTransformation {
  public:
-  bool Run(Model* model, std::size_t op_index) override;
+  ::tensorflow::Status Run(Model* model, std::size_t op_index,
+                           bool* modified) override;
   const char* Name() const override { return "ResolveConstantFakeQuant"; }
 
   // True if the num_bits should adjust the final data type.
@@ -250,7 +255,8 @@ class ResolveConstantFakeQuant : public GraphTransformation {
 
 class EnsureUint8WeightsSafeForFastInt8Kernels : public GraphTransformation {
  public:
-  bool Run(Model* model, std::size_t op_index) override;
+  ::tensorflow::Status Run(Model* model, std::size_t op_index,
+                           bool* modified) override;
   const char* Name() const override {
     return "EnsureUint8WeightsSafeForFastInt8Kernels";
   }
@@ -267,7 +273,8 @@ class EnsureUint8WeightsSafeForFastInt8Kernels : public GraphTransformation {
 
 class IdentifyDilatedConv : public GraphTransformation {
  public:
-  bool Run(Model* model, std::size_t op_index) override;
+  ::tensorflow::Status Run(Model* model, std::size_t op_index,
+                           bool* modified) override;
   const char* Name() const override { return "IdentifyDilatedConv"; }
   bool identify_depthwise_conv() const { return identify_depthwise_conv_; }
   void set_identify_depthwise_conv(bool val) { identify_depthwise_conv_ = val; }
@@ -280,4 +287,4 @@ class IdentifyDilatedConv : public GraphTransformation {
 
 }  // end namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#endif  // TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
similarity index 92%
rename from tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
rename to tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
index 3114fa93e8e2741e2d288d165085d677a8d2a96d..2e41767095fb3cde09a7fb5d690ac57b1cfcd762 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -208,12 +208,32 @@ bool HardcodeMinMaxForSelect(Model* model, Operator* op) {
   if (output_array.minmax) {
     return false;
   }
-  const auto& input_array_1 = model->GetArray(op->inputs[1]);
-  if (!input_array_1.minmax) {
+
+  auto& input_array_1 = model->GetArray(op->inputs[1]);
+  auto& input_array_2 = model->GetArray(op->inputs[2]);
+
+  if (!input_array_1.minmax && !input_array_2.minmax) {
     return false;
   }
-  const auto& input_array_2 = model->GetArray(op->inputs[2]);
-  if (!input_array_2.minmax) {
+
+  // Propagate up if one input is quantized and the other is constant.
+  if (!input_array_1.minmax &&
+      IsConstantParameterArray(*model, op->inputs[1])) {
+    auto& minmax_1 = input_array_1.GetOrCreateMinMax();
+    const auto& minmax_2 = input_array_2.GetMinMax();
+    minmax_1.min = minmax_2.min;
+    minmax_1.max = minmax_2.max;
+  }
+
+  if (!input_array_2.minmax &&
+      IsConstantParameterArray(*model, op->inputs[2])) {
+    auto& minmax_2 = input_array_2.GetOrCreateMinMax();
+    const auto& minmax_1 = input_array_1.GetMinMax();
+    minmax_2.min = minmax_1.min;
+    minmax_2.max = minmax_1.max;
+  }
+
+  if (!input_array_1.minmax || !input_array_2.minmax) {
     return false;
   }
 
@@ -372,7 +392,9 @@ bool HardcodeMinMaxForLstmCell(Model* model, Operator* op) {
 }
 }  // namespace
 
-bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status HardcodeMinMax::Run(Model* model, std::size_t op_index,
+                                         bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   auto* op = it->get();
   bool changed = false;
@@ -409,6 +431,7 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
       break;
 
     case OperatorType::kResizeBilinear:
+    case OperatorType::kResizeNearestNeighbor:
     case OperatorType::kSlice:
     case OperatorType::kStridedSlice:
     case OperatorType::kSqueeze:
@@ -467,7 +490,8 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
   if (changed) {
     AddMessageF("Hardcoded min-max through %s", LogName(*op));
   }
-  return changed;
+  *modified = changed;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_dilated_conv.cc b/tensorflow/lite/toco/graph_transformations/identify_dilated_conv.cc
similarity index 93%
rename from tensorflow/contrib/lite/toco/graph_transformations/identify_dilated_conv.cc
rename to tensorflow/lite/toco/graph_transformations/identify_dilated_conv.cc
index aac77eb39e4b0650b699d3160f5bbe54aff8cdde..e27f975348b7ecc7ebc3c930859288234e5d506c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_dilated_conv.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_dilated_conv.cc
@@ -15,9 +15,9 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -168,7 +168,10 @@ bool ResolveDilatedConv(Model* model, Operator* conv_base_op, Operator* stb_op,
   return true;
 }
 
-bool IdentifyDilatedConv::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status IdentifyDilatedConv::Run(Model* model,
+                                              std::size_t op_index,
+                                              bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   auto* stb_op = it->get();
 
@@ -176,17 +179,17 @@ bool IdentifyDilatedConv::Run(Model* model, std::size_t op_index) {
   // ***************************************************************************
   // SpaceToBatch Op.
   if (stb_op->type != OperatorType::kSpaceToBatchND) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (stb_op->inputs.size() != 3) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   CHECK_EQ(stb_op->outputs.size(), 1);
   // Extract the dilation factor from Input[1] of SpaceToBatch
   // TODO(mjmatthews): Support 2D dilation factors.
   const auto& block_shape_array = model->GetArray(stb_op->inputs[1]);
   if (!block_shape_array.buffer) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   CHECK_EQ(block_shape_array.shape().dimensions_count(), 1);
   int dilation_factor =
@@ -195,7 +198,7 @@ bool IdentifyDilatedConv::Run(Model* model, std::size_t op_index) {
   // Expand Op
   auto* post_stb_op = GetOpWithInput(*model, stb_op->outputs[0]);
   if (!post_stb_op) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   bool has_expand_op = false;
   if (post_stb_op->type == OperatorType::kExpandDims) {
@@ -229,7 +232,8 @@ bool IdentifyDilatedConv::Run(Model* model, std::size_t op_index) {
     }
   }
 
-  return changed;
+  *modified = changed;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc b/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
similarity index 88%
rename from tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
rename to tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
index b78efd7fc3602dc2d6e03fd28d694c344b61c17c..dabd4bd209f450645d12b76c782b36fa5198f84a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -39,7 +39,10 @@ std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
 }
 }  // namespace
 
-bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status IdentifyL2Normalization::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   const auto div_it = model->operators.begin() + op_index;
   const auto* div_or_mul_op = div_it->get();
   OperatorType expected_op_type_producing_div_or_mul_input;
@@ -48,7 +51,7 @@ bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
   } else if (div_or_mul_op->type == OperatorType::kMul) {
     expected_op_type_producing_div_or_mul_input = OperatorType::kRsqrt;
   } else {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   CHECK_EQ(div_or_mul_op->inputs.size(), 2);
   Operator* op_producing_div_or_mul_input[2] = {
@@ -58,14 +61,14 @@ bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
   if (!op_producing_div_or_mul_input[1] ||
       op_producing_div_or_mul_input[1]->type !=
           expected_op_type_producing_div_or_mul_input) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   Operator* sqrt_or_rsqrt_op = op_producing_div_or_mul_input[1];
   CHECK_EQ(sqrt_or_rsqrt_op->inputs.size(), 1);
   Operator* op_producing_sqrt_or_rsqrt_input =
       GetOpWithOutput(*model, sqrt_or_rsqrt_op->inputs[0]);
   if (!op_producing_sqrt_or_rsqrt_input) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // There may be an Add or a Maximum here, adding or clamping to a "small"
@@ -105,7 +108,7 @@ bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
           " because the operator producing the input to the square root, %s,"
           ", does not match the expected pattern",
           LogName(*op_producing_sqrt_or_rsqrt_input));
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
 
@@ -116,7 +119,7 @@ bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
         "Giving up trying to identify L2Normalization subgraph: "
         "expected Sum op, got %s",
         LogName(*sum_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   Operator* square_op = GetOpWithOutput(*model, sum_op->inputs[0]);
@@ -125,7 +128,7 @@ bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
         "Giving up trying to identify L2Normalization subgraph: "
         "expected Square op, got %s",
         LogName(*square_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(square_op->inputs.size(), 1);
@@ -135,7 +138,7 @@ bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
         "Giving up trying to identify L2Normalization subgraph: %s does not "
         "take the same input as the Mul/Div node",
         LogName(*square_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Create and emplace the new L2Normalization
@@ -162,7 +165,8 @@ bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
   model->operators.erase(FindOperator(model, sqrt_or_rsqrt_op));
   model->EraseArray(div_or_mul_op->inputs[1]);
   model->operators.erase(FindOperator(model, div_or_mul_op));
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc b/tensorflow/lite/toco/graph_transformations/identify_l2_pool.cc
similarity index 86%
rename from tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
rename to tensorflow/lite/toco/graph_transformations/identify_l2_pool.cc
index 705e73779b7f74698149d5e9e56f69a371326ceb..6e0a7cdc31af2bbdb4da6b53f20cef66eeb8e68a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_l2_pool.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -38,11 +38,13 @@ std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
 }
 }  // namespace
 
-bool IdentifyL2Pool::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status IdentifyL2Pool::Run(Model* model, std::size_t op_index,
+                                         bool* modified) {
+  *modified = false;
   const auto sqrt_it = model->operators.begin() + op_index;
   const auto* sqrt_op = sqrt_it->get();
   if (sqrt_op->type != OperatorType::kSqrt) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(sqrt_op->inputs.size(), 1);
@@ -56,7 +58,7 @@ bool IdentifyL2Pool::Run(Model* model, std::size_t op_index) {
     AddMessageF(
         "Giving up trying to identify L2Pool subgraph: "
         "expected AveragePool op, but Sqrt op has no preceding op");
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (prev_to_sqrt_op->type != OperatorType::kAveragePool) {
@@ -64,7 +66,7 @@ bool IdentifyL2Pool::Run(Model* model, std::size_t op_index) {
         "Giving up trying to identify L2Pool subgraph: "
         "expected AveragePool op, got %s",
         LogName(*prev_to_sqrt_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   avpool_op = static_cast<const AveragePoolOperator*>(prev_to_sqrt_op);
@@ -77,7 +79,7 @@ bool IdentifyL2Pool::Run(Model* model, std::size_t op_index) {
         "Giving up trying to identify L2Pool subgraph: "
         "expected Square op, got %s",
         LogName(*square_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Create and emplace L2Pool node.
@@ -107,7 +109,8 @@ bool IdentifyL2Pool::Run(Model* model, std::size_t op_index) {
   model->operators.erase(FindOperator(model, avpool_op));
   model->operators.erase(FindOperator(model, sqrt_op));
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
similarity index 92%
rename from tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
rename to tensorflow/lite/toco/graph_transformations/identify_lstm.cc
index c0b014b45eb1df25173ce3ca3fa488b0655c3c76..089ecee959a3ab80474782a88fa176b7a9f42001 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
@@ -132,7 +132,9 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
 
 }  // namespace
 
-bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status IdentifyLstmCell::Run(Model* model, std::size_t op_index,
+                                           bool* modified) {
+  *modified = false;
   // This LSTM cell identification method is not invariant to commutation of
   // commutative operator inputs. For example, if input[0] and input[1] of the
   // final output multiplication were swapped, this method would not identify it
@@ -143,13 +145,13 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
   auto op_it = model->operators.begin() + op_index;
   Operator* final_output_mul = op_it->get();
   if (final_output_mul->type != OperatorType::kMul) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   Operator *state_output_tanh, *fc_output_sig;
   if (!MatchOperatorInputs(*final_output_mul, *model, OperatorType::kTanh,
                            &state_output_tanh, OperatorType::kLogistic,
                            &fc_output_sig)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // State output TanH
@@ -158,7 +160,7 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
   Operator* state_combine_add;
   if (!MatchOperatorInputs(*state_output_tanh, *model, OperatorType::kAdd,
                            &state_combine_add)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // State forget & remember addition
@@ -166,7 +168,7 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
   if (!MatchOperatorInputs(*state_combine_add, *model, OperatorType::kMul,
                            &state_forget_mul, OperatorType::kMul,
                            &state_remember_mul)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const string prev_state = state_forget_mul->inputs[0];
 
@@ -175,7 +177,7 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
   if (!MatchOperatorInputs(*state_forget_mul, *model, OperatorType::kNone,
                            nullptr, OperatorType::kLogistic,
                            &state_forget_sig)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // State remember gate
@@ -183,40 +185,40 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
   if (!MatchOperatorInputs(*state_remember_mul, *model, OperatorType::kLogistic,
                            &state_remember_sig, OperatorType::kTanh,
                            &state_info_tanh)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // State remember "information" activation function
   Operator* fc_output_split;
   if (!MatchOperatorInputs(*state_info_tanh, *model, OperatorType::kSplit,
                            &fc_output_split)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // State remember gate activation function
   Operator* tmp;
   if (!MatchOperatorInputs(*state_remember_sig, *model, OperatorType::kSplit,
                            &tmp) ||
       (tmp != fc_output_split)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // State forget gate activation function
   if (!MatchOperatorInputs(*state_forget_sig, *model, OperatorType::kSplit,
                            &tmp) ||
       (tmp != fc_output_split)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Fully connected output activation function
   if (!MatchOperatorInputs(*fc_output_sig, *model, OperatorType::kSplit,
                            &tmp) ||
       (tmp != fc_output_split)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Fully connected output split
   Operator* fully_connected;
   if (!MatchOperatorInputs(*fc_output_split, *model, OperatorType::kNone,
                            nullptr, OperatorType::kFullyConnected,
                            &fully_connected)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Fully connected op
@@ -225,13 +227,13 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
                            OperatorType::kConcatenation, &concat_inputs,
                            OperatorType::kNone, nullptr, OperatorType::kNone,
                            nullptr)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (static_cast<FullyConnectedOperator*>(fully_connected)->weights_format !=
       FullyConnectedWeightsFormat::kDefault) {
     // Not yet implemented: experimental shuffled weights in fused LSTM cell.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Emplace a new LSTM cell operator
@@ -300,7 +302,8 @@ bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
   model->operators.erase(FindOperator(model, *fully_connected));
   DeleteArrayIfUnused(concat_inputs->outputs[0], model);
   model->operators.erase(FindOperator(model, *concat_inputs));
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc b/tensorflow/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
similarity index 92%
rename from tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
rename to tensorflow/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
index 5b6a984ee143a6007471b165510030cd3ad3f73c..2fae01a698727708e7669d491902da3c2e556ef2 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
@@ -18,26 +18,29 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "absl/strings/string_view.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
-bool MergeLstmCellInputs::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status MergeLstmCellInputs::Run(Model* model,
+                                              std::size_t op_index,
+                                              bool* modified) {
+  *modified = false;
   // Find lstm cell.
   auto op_it = model->operators.begin() + op_index;
   auto src_op = op_it->get();
   if (src_op->type != OperatorType::kLstmCell) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Already a compact LstmCell. Do not need to merge cell inputs.
   const auto* src_lstm_op = static_cast<LstmCellOperator*>(src_op);
   if (src_lstm_op->kernel_type != LstmCellOperator::KERNEL_FULL ||
       src_lstm_op->inputs.size() != kExtendedLstmInputCount) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Identify prev_activ_input, prev_state_input as required Op inputs,
@@ -45,12 +48,12 @@ bool MergeLstmCellInputs::Run(Model* model, std::size_t op_index) {
   string prev_activ_input;
   if (!GetMatchingRnnArray(model, src_op->outputs[kOutputTensor],
                            &prev_activ_input)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   string prev_state_input;
   if (!GetMatchingRnnArray(model, src_op->outputs[kCellStateTensor],
                            &prev_state_input)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Get LstmCell's cell, input, output size.
@@ -184,7 +187,8 @@ bool MergeLstmCellInputs::Run(Model* model, std::size_t op_index) {
   DeleteArrayIfUnused(src_op->inputs[kOutputGateBiasTensor], model);
   model->operators.erase(FindOp(*model, src_op));
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc b/tensorflow/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
similarity index 91%
rename from tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
rename to tensorflow/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
index 46d1fce50e5d6e2a74cf5461d731e46469dde5bf..c519e654636a55f228c4428a9016d11c44d0c705 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
@@ -18,26 +18,29 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "absl/strings/string_view.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
-bool SplitLstmCellInputs::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status SplitLstmCellInputs::Run(Model* model,
+                                              std::size_t op_index,
+                                              bool* modified) {
+  *modified = false;
   // Find lstm cell.
   auto op_it = model->operators.begin() + op_index;
   auto curr_op = op_it->get();
   if (curr_op->type != OperatorType::kLstmCell) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto* curr_lstm_op = static_cast<LstmCellOperator*>(curr_op);
   // Already an extended LstmCell. Do not need to split cell inputs.
   if (curr_lstm_op->kernel_type != LstmCellOperator::KERNEL_BASIC ||
       curr_lstm_op->inputs.size() != LstmCellOperator::NUM_INPUTS) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Make sure the WEIGHTS_INPUT and BIASES_INPUT are constant arrays,
@@ -46,13 +49,13 @@ bool SplitLstmCellInputs::Run(Model* model, std::size_t op_index) {
           *model, curr_op->inputs[LstmCellOperator::WEIGHTS_INPUT]) ||
       !IsConstantParameterArray(
           *model, curr_op->inputs[LstmCellOperator::BIASES_INPUT])) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Make sure propagate_fixed_sizes has defined the size of the output.
   if (!model->GetArray(curr_op->outputs[LstmCellOperator::ACTIV_OUTPUT])
            .has_shape()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Emplace a new LstmCell operator with extended inputs (kernel/lstm.cc).
@@ -168,7 +171,8 @@ bool SplitLstmCellInputs::Run(Model* model, std::size_t op_index) {
   DeleteArrayIfUnused(curr_op->inputs[LstmCellOperator::BIASES_INPUT], model);
   model->operators.erase(FindOp(*model, curr_op));
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_prelu.cc b/tensorflow/lite/toco/graph_transformations/identify_prelu.cc
similarity index 88%
rename from tensorflow/contrib/lite/toco/graph_transformations/identify_prelu.cc
rename to tensorflow/lite/toco/graph_transformations/identify_prelu.cc
index b90a156a0dcfcd77c3e2b47bb0d77e246f2fc625..1205ddc7304f39fbf893e65d77cbed0db654a452 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_prelu.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_prelu.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 // This transformation rule tries to identify the PRelu structure generated by
@@ -43,13 +43,15 @@ limitations under the License.
 
 namespace toco {
 
-bool IdentifyPRelu::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status IdentifyPRelu::Run(Model* model, std::size_t op_index,
+                                        bool* modified) {
+  *modified = false;
   const auto add_op_it = model->operators.begin() + op_index;
   const auto* add_op = add_op_it->get();
   if (add_op == nullptr || add_op->type != OperatorType::kAdd ||
       add_op->inputs.size() != 2 ||
       add_op->fused_activation_function != FusedActivationFunctionType::kNone) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto* relu_input_op = GetOpWithOutput(*model, add_op->inputs[0]);
@@ -57,7 +59,7 @@ bool IdentifyPRelu::Run(Model* model, std::size_t op_index) {
       relu_input_op->inputs.size() != 1 ||
       relu_input_op->fused_activation_function !=
           FusedActivationFunctionType::kNone) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // TODO(ycling): Both Add and Mul are commutative. Support the case where
@@ -66,7 +68,7 @@ bool IdentifyPRelu::Run(Model* model, std::size_t op_index) {
   if (mul_op == nullptr || mul_op->type != OperatorType::kMul ||
       mul_op->inputs.size() != 2 ||
       mul_op->fused_activation_function != FusedActivationFunctionType::kNone) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto neg_alpha_tensor_name = mul_op->inputs[0];
@@ -75,7 +77,7 @@ bool IdentifyPRelu::Run(Model* model, std::size_t op_index) {
 
   if (relu_neg_input_op == nullptr ||
       relu_neg_input_op->inputs.size() != 1) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const Operator* final_input_op;
@@ -92,13 +94,13 @@ bool IdentifyPRelu::Run(Model* model, std::size_t op_index) {
         relu_neg_input_op->type != OperatorType::kRelu ||
         relu_neg_input_op->fused_activation_function !=
             FusedActivationFunctionType::kNone) {
-      return false;
+      return ::tensorflow::Status::OK();
     }
     final_input_op = neg_input_op;
   }
 
   if (relu_input_op->inputs[0] != final_input_op->inputs[0]) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto input_tensor_name = relu_input_op->inputs[0];
@@ -128,7 +130,8 @@ bool IdentifyPRelu::Run(Model* model, std::size_t op_index) {
   // intermediate tensors aren't used by other ops, those will be removed by
   // other graph transformation rules.
   model->operators.erase(FindOp(*model, add_op));
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc b/tensorflow/lite/toco/graph_transformations/identify_relu1.cc
similarity index 87%
rename from tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
rename to tensorflow/lite/toco/graph_transformations/identify_relu1.cc
index 94820a016622a12654e91967737e05fc91ed404c..bcd5b0ca04a8a2fec1d3d10bb20b483806244861 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_relu1.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -56,13 +56,15 @@ int GetSingleScalarInputIndexOfBinaryOp(Model* model, const Operator* op,
 }
 }  // namespace
 
-bool IdentifyRelu1::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status IdentifyRelu1::Run(Model* model, std::size_t op_index,
+                                        bool* modified) {
+  *modified = false;
   // Follow sequences of min+max and max+min. First get the leading op.
   const auto op_it = model->operators.begin() + op_index;
   const auto* op_0 = op_it->get();
   if (op_0->type != OperatorType::kMinimum &&
       op_0->type != OperatorType::kMaximum) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Get the paired op and ensure it's the counter to the first.
@@ -71,17 +73,17 @@ bool IdentifyRelu1::Run(Model* model, std::size_t op_index) {
       (op_1->type != OperatorType::kMinimum &&
        op_1->type != OperatorType::kMaximum) ||
       op_0->type == op_1->type) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto* min_op = op_0->type == OperatorType::kMinimum ? op_0 : op_1;
   const auto* max_op = op_0->type == OperatorType::kMaximum ? op_0 : op_1;
 
   if (min_op->inputs.size() != 2 || max_op->inputs.size() != 2) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (min_op->outputs.size() != 1 || max_op->outputs.size() != 1) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Get the original input to the min+max pair.
@@ -90,7 +92,7 @@ bool IdentifyRelu1::Run(Model* model, std::size_t op_index) {
   int max_scalar_input_index =
       GetSingleScalarInputIndexOfBinaryOp(model, max_op, -1.0f);
   if (min_scalar_input_index == -1 || max_scalar_input_index == -1) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   int op_0_scalar_input_index =
       op_0 == min_op ? min_scalar_input_index : max_scalar_input_index;
@@ -111,7 +113,8 @@ bool IdentifyRelu1::Run(Model* model, std::size_t op_index) {
   model->operators.erase(FindOperator(model, op_0));
   model->operators.erase(FindOperator(model, op_1));
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc b/tensorflow/lite/toco/graph_transformations/lstm_utils.cc
similarity index 98%
rename from tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc
rename to tensorflow/lite/toco/graph_transformations/lstm_utils.cc
index 910a96058979887972b41f27b2e570e8cb5b4f4c..3414a7fd7fe2d1e56c71927ebcb8cce4ab0b875b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.cc
+++ b/tensorflow/lite/toco/graph_transformations/lstm_utils.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/lite/toco/graph_transformations/lstm_utils.h"
 
 namespace toco {
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h b/tensorflow/lite/toco/graph_transformations/lstm_utils.h
similarity index 92%
rename from tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
rename to tensorflow/lite/toco/graph_transformations/lstm_utils.h
index 6d8603a1133a7478647b8bcc49ea1eceba28df31..949292ee84b2923abac1daa7a1486026f8b4d0d8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h
+++ b/tensorflow/lite/toco/graph_transformations/lstm_utils.h
@@ -12,20 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
+#ifndef TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
+#define TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
 
 #include <iostream>
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
 // For consistency with the parameters defined in extended LstmCell's kernel
-// (tensorflow/contrib/lite/kernels/lstm.cc),
+// (tensorflow/lite/kernels/lstm.cc),
 // use lowercase for these constants.
 
 enum ExtendedLstmCellInputs {
@@ -108,4 +108,4 @@ bool GetMatchingRnnArray(Model* model, const string& back_edge_source_array,
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
+#endif  // TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_LSTM_UTILS_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc b/tensorflow/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
similarity index 88%
rename from tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
rename to tensorflow/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
index f684de08abf72d05d4408bf6341fa5a3c2ed11cd..b914838b91c965da66f7ccaecc01fa2b23eaa883 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
+++ b/tensorflow/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
@@ -17,11 +17,11 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -97,7 +97,10 @@ bool AddDequantizeOperatorToInput(const string& input_name, const Operator* op,
   return true;
 }
 
-bool MakeInitialDequantizeOperator::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status MakeInitialDequantizeOperator::Run(Model* model,
+                                                        std::size_t op_index,
+                                                        bool* modified) {
+  *modified = false;
   // This is effectively a transformation applied to edges.  We iterate over the
   // specified node (op) and proceed for input edges.
   const auto it = model->operators.begin() + op_index;
@@ -114,7 +117,8 @@ bool MakeInitialDequantizeOperator::Run(Model* model, std::size_t op_index) {
       }
     }
   }
-  return change_made;
+  *modified = change_made;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc b/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
similarity index 86%
rename from tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
rename to tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
index 95bc7f7d4b8b517c1cc5a73b3e85bbd985ce460f..80170fe8bcb73e6e3048138632492b96d8b80f85 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
+++ b/tensorflow/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
@@ -18,11 +18,11 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -102,18 +102,19 @@ std::vector<int32> ReshapeToTranspose(const Model& model,
 // to be merged if the reshape does not affect memory ordering and does not
 // affects the number of dimensions. This only occurs when only unary dimensions
 // are shifting position.
-bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
-                                             std::size_t op_index) {
+::tensorflow::Status MergeReshapeIntoPrecedingTranspose::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   auto* reshape_op = ConvertOperator<TensorFlowReshapeOperator*>(
       it->get(), OperatorType::kReshape);
 
   if (reshape_op == nullptr) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const string intermediate_name = reshape_op->inputs[0];
@@ -121,13 +122,13 @@ bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
 
   // Guarantee the input is only consume by the reshape.
   if (CountOpsWithInput(*model, intermediate_name) != 1) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Check for the parent operator.
   const auto& transpose_it = FindOpWithOutput(*model, intermediate_name);
   if (transpose_it == model->operators.end()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Find the parent operator and guarantee it is a transpose.
@@ -135,16 +136,16 @@ bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
       transpose_it->get(), OperatorType::kTranspose);
 
   if (transpose_op == nullptr) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
                                       false /*allow_extra_unary_dimensions*/)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Check that the intermediate is not an output array.
@@ -153,7 +154,7 @@ bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
         "Cannot fuse %s and %s as it would invalidate the transpose "
         "output array.",
         LogName(*transpose_op), LogName(*reshape_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Merging operations %s and %s", LogName(*transpose_op),
@@ -172,7 +173,7 @@ bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
 
   // Remove the reshape as passthrough operation.
   if (!RemoveTrivialPassthroughOp(this, model, op_index)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Update transpose_op's constant buffer to contain the new permutation.
@@ -184,7 +185,8 @@ bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
   // transpose_ops's shape will likely has changed.
   model->GetArray(transpose_op->outputs[0]).clear_shape();
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/move_binary_operator_before_reshape.cc b/tensorflow/lite/toco/graph_transformations/move_binary_operator_before_reshape.cc
similarity index 87%
rename from tensorflow/contrib/lite/toco/graph_transformations/move_binary_operator_before_reshape.cc
rename to tensorflow/lite/toco/graph_transformations/move_binary_operator_before_reshape.cc
index 7f44c65285bdef6ba314b16122fdd550bfa47e6a..0f3c4d34d66ea2cab63ae5e4168d2d84e25399d6 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/move_binary_operator_before_reshape.cc
+++ b/tensorflow/lite/toco/graph_transformations/move_binary_operator_before_reshape.cc
@@ -14,9 +14,9 @@
  ==============================================================================*/
 #include <algorithm>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
@@ -54,7 +54,10 @@ bool IsTailOfShape(const Shape& tail, const Shape& shape) {
 //
 // Note we are testing for one particular case of a broader set of possible
 // binary-reshape op transformations. This transformation could be generalized.
-bool MoveBinaryOperatorBeforeReshape::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status MoveBinaryOperatorBeforeReshape::Run(Model* model,
+                                                          std::size_t op_index,
+                                                          bool* modified) {
+  *modified = false;
   const auto binary_it = model->operators.begin() + op_index;
   Operator* binary_op = binary_it->get();
   if (binary_op->type != OperatorType::kAdd &&
@@ -69,7 +72,7 @@ bool MoveBinaryOperatorBeforeReshape::Run(Model* model, std::size_t op_index) {
       binary_op->type != OperatorType::kLessEqual &&
       binary_op->type != OperatorType::kGreater &&
       binary_op->type != OperatorType::kGreaterEqual) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // BINARY OP INPUT CHECKS
@@ -81,11 +84,11 @@ bool MoveBinaryOperatorBeforeReshape::Run(Model* model, std::size_t op_index) {
   if (!input_is_const[0] && !input_is_const[1]) {
     // To limit our scope, we require one constant input. Though there's no
     // reason this transformation wouldn't work with all variable inputs.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (input_is_const[0] && input_is_const[1]) {
     // Both inputs are constants. Leave this for constants propagation.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const int constant_input_idx = input_is_const[0] ? 0 : 1;
   const int variable_input_idx = input_is_const[0] ? 1 : 0;
@@ -98,13 +101,13 @@ bool MoveBinaryOperatorBeforeReshape::Run(Model* model, std::size_t op_index) {
     AddMessageF(
         "Not moving %s because it's non-constant input shape is not resolved.",
         LogName(*binary_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (!IsTailOfShape(
           model->GetArray(binary_op->inputs[constant_input_idx]).shape(),
           model->GetArray(binary_op->inputs[variable_input_idx]).shape())) {
     // Constant array shape must be the latter part of the variable shape.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // RESHAPE OP CHECKS
@@ -113,13 +116,13 @@ bool MoveBinaryOperatorBeforeReshape::Run(Model* model, std::size_t op_index) {
   if (reshape_it == model->operators.end()) {
     AddMessageF("Not moving %s because it's variable input is not connected.",
                 LogName(*binary_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
   Operator* reshape_op = reshape_it->get();
   if (reshape_op->type != OperatorType::kReshape) {
     AddMessageF("Not moving %s because the preceding %s is not a reshape op",
                 LogName(*binary_op), LogName(*reshape_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto& reshape_input_array = model->GetArray(reshape_op->inputs[0]);
   if (!reshape_input_array.has_shape()) {
@@ -127,14 +130,14 @@ bool MoveBinaryOperatorBeforeReshape::Run(Model* model, std::size_t op_index) {
         "Not moving %s because it's non-constant input shape is not resolved "
         "yet",
         LogName(*binary_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (!IsTailOfShape(
           model->GetArray(binary_op->inputs[constant_input_idx]).shape(),
           model->GetArray(reshape_op->outputs[0]).shape())) {
     // Constant array shape must be the latter part of the binary op output
     // shape.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // EXTRA CHECKS ON CONNECTING ARRAY
@@ -143,7 +146,7 @@ bool MoveBinaryOperatorBeforeReshape::Run(Model* model, std::size_t op_index) {
       AddMessageF(
           "Not moving %s because the output of reshape op %s is an output op.",
           LogName(*binary_op), LogName(*reshape_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
   int count_ops_consuming_output =
@@ -154,7 +157,7 @@ bool MoveBinaryOperatorBeforeReshape::Run(Model* model, std::size_t op_index) {
         "Not moving %s because the output of reshape op %s is consumed by "
         "another op",
         LogName(*binary_op), LogName(*reshape_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // SWAP ORDER OF BINARY AND RESHAPE OPS
@@ -172,7 +175,8 @@ bool MoveBinaryOperatorBeforeReshape::Run(Model* model, std::size_t op_index) {
   // Clear binary output shape so it will be re-propagated
   model->GetArray(binary_op->outputs[0]).clear_shape();
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_activation_function_into_constants.cc b/tensorflow/lite/toco/graph_transformations/propagate_activation_function_into_constants.cc
similarity index 82%
rename from tensorflow/contrib/lite/toco/graph_transformations/propagate_activation_function_into_constants.cc
rename to tensorflow/lite/toco/graph_transformations/propagate_activation_function_into_constants.cc
index cf17c49b1098d02468935aa72d1d1e73b4addbe1..95de60262e754d80e8c5e1822aff9fd92b997bba 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_activation_function_into_constants.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_activation_function_into_constants.cc
@@ -17,29 +17,30 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool PropagateActivationFunctionIntoConstants::Run(Model* model,
-                                                   std::size_t op_index) {
+::tensorflow::Status PropagateActivationFunctionIntoConstants::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  *modified = false;
   const auto ac_it = model->operators.begin() + op_index;
   const auto* ac_op = ac_it->get();
   if (ac_op->type != OperatorType::kRelu6 &&
       ac_op->type != OperatorType::kRelu1 &&
       ac_op->type != OperatorType::kRelu) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Find the op producing the array passed to this activation function.
   auto* src_op = GetOpWithOutput(*model, ac_op->inputs[0]);
   if (!src_op) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Ensure the src_op is not used without the activation function applied.
@@ -57,7 +58,7 @@ bool PropagateActivationFunctionIntoConstants::Run(Model* model,
       src_op_input = src_op->inputs[0];
       break;
     default:
-      return false;
+      return ::tensorflow::Status::OK();
   }
   CHECK_EQ(src_op->outputs[0], ac_op->inputs[0]);
 
@@ -69,7 +70,7 @@ bool PropagateActivationFunctionIntoConstants::Run(Model* model,
         "Not propagating activation function %s into %s:%s because it is not "
         "constant",
         LogName(*ac_op), LogName(*src_op), src_op_input);
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Get the array we'll be working with and ensure it's a compatible type.
@@ -79,7 +80,7 @@ bool PropagateActivationFunctionIntoConstants::Run(Model* model,
         "Not propagating activation function %s into %s:%s because it is "
         "non-float data",
         LogName(*ac_op), LogName(*src_op), src_op_input);
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto& const_array_data =
       const_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
@@ -108,14 +109,15 @@ bool PropagateActivationFunctionIntoConstants::Run(Model* model,
       }
       default:
         LOG(FATAL) << "Unsupported activation function " << LogName(*ac_op);
-        return false;
+        return ::tensorflow::Status::OK();
     }
     const_array_data[i] = new_value;
   }
 
   AddMessageF("Propagated activation function %s into %s:%s", LogName(*ac_op),
               LogName(*src_op), src_op_input);
-  return RemoveTrivialPassthroughOp(this, model, op_index);
+  *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
similarity index 88%
rename from tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
rename to tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
index 323eefcd3a7665a8c01da1bc10d6f8d80da7a15d..cbae6610d7f4703a898d8d6f35351a09cd70173c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -32,7 +32,10 @@ void SetDataTypeForAllOutputs(Model* model, Operator* op,
 }
 }  // namespace
 
-bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status PropagateArrayDataTypes::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   auto* op = it->get();
 
@@ -40,7 +43,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
   for (const auto& input : op->inputs) {
     if (!model->IsOptionalArray(input) &&
         model->GetArray(input).data_type == ArrayDataType::kNone) {
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
   // Record data types of output before processing, so we can see at the
@@ -83,6 +86,13 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
       SetDataTypeForAllOutputs(model, op, data_type);
       break;
     }
+    case OperatorType::kSplitV: {
+      // These operators produce output with the same type as its 1st input
+      CHECK_GE(op->inputs.size(), 3);
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
     case OperatorType::kTransposeConv: {
       // These operators produce an output with the same type as their 3rd input
       CHECK_GE(op->inputs.size(), 3);
@@ -131,7 +141,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
       auto* rand_op = static_cast<RandomUniformOperator*>(op);
       // The output type of RandomUniform is specified with an attribute
       if (rand_op->dtype == ArrayDataType::kNone) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       CHECK_EQ(op->outputs.size(), 1);
       SetDataTypeForAllOutputs(model, op, rand_op->dtype);
@@ -153,7 +163,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
       // This can make unsupported_op->output_data_types have more elements than
       // op->outputs.
       if (unsupported_op->output_data_types.size() < op->outputs.size()) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       for (int i = 0; i < op->outputs.size(); ++i) {
         const string& output = op->outputs[i];
@@ -164,7 +174,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     }
     case OperatorType::kExpandDims: {
       // Yield on ExpandDim until it is converted to Reshape
-      return false;
+      return ::tensorflow::Status::OK();
     }
     case OperatorType::kSelect: {
       // Select produces outputs with the same type as their 2nd input
@@ -236,6 +246,12 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
       }
       break;
     }
+    case OperatorType::kUnidirectionalSequenceLstm: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      if (data_type != ArrayDataType::kFloat) return ::tensorflow::Status::OK();
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
     default: {
       // These operators produce outputs with the same type as their 1st input
       CHECK_GT(op->inputs.size(), 0);
@@ -248,10 +264,11 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
   // Return true if any output data type changed, false if none changed.
   for (const auto& output : op->outputs) {
     if (old_output_data_types[output] != model->GetArray(output).data_type) {
-      return true;
+      *modified = true;
+      return ::tensorflow::Status::OK();
     }
   }
-  return false;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc b/tensorflow/lite/toco/graph_transformations/propagate_default_min_max.cc
similarity index 84%
rename from tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc
rename to tensorflow/lite/toco/graph_transformations/propagate_default_min_max.cc
index cd078ef189e922682098a0ec8dc4743060181aac..d31ba956afd986fff61f9c61d4ac9782e7e69bbe 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_default_min_max.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_default_min_max.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -39,7 +39,10 @@ bool SupportsMinMax(const Array& array) {
 // When provided a set of min/max values for uint8 arrays this will rescale
 // the values for other data types as required and preserving the floating point
 // range within the new type.
-bool PropagateDefaultMinMax::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status PropagateDefaultMinMax::Run(Model* model,
+                                                 std::size_t op_index,
+                                                 bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   const auto* op = it->get();
 
@@ -61,7 +64,8 @@ bool PropagateDefaultMinMax::Run(Model* model, std::size_t op_index) {
     }
   }
 
-  return did_change;
+  *modified = did_change;
+  return ::tensorflow::Status::OK();
 }
 
 // Sets the min/max on the given array, adjusting the reference_minmax for the
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc b/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
similarity index 95%
rename from tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
rename to tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
index 3ad6b0ec6f7a3c4a9a0ab3964c1198ee757ea4b5..04a5a1c1687b4caae2f31548ec549cb95e153df5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fake_quant_num_bits.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -277,11 +277,14 @@ bool RecursivelyForwardPropagateDataType(GraphTransformation* transformation,
 // nice logging and integration with the graphviz video dumping mode.
 // In general you should not copy this style of transformation and stick to
 // local-only changes as seen in the other transformations.
-bool PropagateFakeQuantNumBits::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status PropagateFakeQuantNumBits::Run(Model* model,
+                                                    std::size_t op_index,
+                                                    bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   auto* op = it->get();
   if (op->type != OperatorType::kFakeQuant) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* fakequant_op = static_cast<FakeQuantOperator*>(op);
 
@@ -290,7 +293,7 @@ bool PropagateFakeQuantNumBits::Run(Model* model, std::size_t op_index) {
                                            &quantized_data_type)) {
     AddMessageF("FakeQuant op %s num_bits=%d is out of range, ignoring",
                 LogName(*op), fakequant_op->num_bits);
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto& final_minmax = *fakequant_op->minmax;
 
@@ -311,7 +314,8 @@ bool PropagateFakeQuantNumBits::Run(Model* model, std::size_t op_index) {
   did_change |=
       RecursivelyForwardPropagateDataType(this, model, op, quantized_data_type);
 
-  return did_change;
+  *modified = did_change;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
similarity index 89%
rename from tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
rename to tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index d056a8add7a1875f8274c72b0f37b0fc5239223c..0e653f08a04f237c861038639a1469eb62f35dfa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -15,15 +15,16 @@ limitations under the License.
 #include <algorithm>
 #include <iterator>
 #include <memory>
+#include <numeric>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
 #include "absl/strings/str_join.h"
-#include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -786,6 +787,97 @@ void ProcessTensorFlowSplitOperator(Model* model, TensorFlowSplitOperator* op) {
   }
 }
 
+void ProcessTensorFlowSplitVOperator(Model* model,
+                                     TensorFlowSplitVOperator* op) {
+  CHECK_EQ(op->inputs.size(), 3);
+
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const Shape& input_shape = input_array.shape();
+
+  // Yield until size_splits is constant.
+  if (!IsConstantParameterArray(*model, op->inputs[1])) {
+    return;
+  }
+  const auto& size_array = model->GetArray(op->inputs[1]);
+  // Yield until size_splits dims have been resolved.
+  if (!size_array.has_shape()) {
+    return;
+  }
+  const Shape& size_shape = size_array.shape();
+
+  CHECK(size_array.data_type == ArrayDataType::kInt32 ||
+        size_array.data_type == ArrayDataType::kInt64)
+      << "size_splits must be int32, int64";
+  CHECK_EQ(size_shape.dimensions_count(), 1) << "size_splits must be 1-D";
+
+  std::vector<int64> size_splits_vector;
+  if (size_array.data_type == ArrayDataType::kInt32) {
+    for (const auto each_size :
+         size_array.GetBuffer<ArrayDataType::kInt32>().data) {
+      size_splits_vector.push_back(each_size);
+    }
+  } else {
+    size_splits_vector = size_array.GetBuffer<ArrayDataType::kInt64>().data;
+  }
+
+  // Yield until axis is constant.
+  if (!IsConstantParameterArray(*model, op->inputs[2])) {
+    return;
+  }
+  const auto& axis_array = model->GetArray(op->inputs[2]);
+  // Yield until axis dims have been resolved.
+  if (!axis_array.has_shape()) {
+    return;
+  }
+
+  CHECK(axis_array.data_type == ArrayDataType::kInt32)
+      << "Axis array must be int32.";
+  CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1)
+      << "Axis array must be scalar.";
+
+  int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  if (axis < 0) {
+    axis += input_shape.dimensions_count();
+  }
+
+  CHECK_EQ(op->num_split, size_splits_vector.size());
+
+  int64_t minus_one_count = 0, size_splits_sum = 0;
+  for (auto size : size_splits_vector) {
+    if (size == -1) {
+      ++minus_one_count;
+    } else {
+      size_splits_sum += size;
+    }
+  }
+
+  const int input_size = input_shape.dims(axis);
+
+  CHECK_LE(minus_one_count, 1) << "size_splits can contain at most one -1.";
+
+  if (minus_one_count == 1) {
+    CHECK_LE(size_splits_sum, input_size);
+    auto iter =
+        std::find(size_splits_vector.begin(), size_splits_vector.end(), -1);
+    *iter = input_size - size_splits_sum;
+  } else {
+    CHECK_EQ(size_splits_sum, input_size);
+  }
+
+  CHECK_EQ(op->outputs.size(), op->num_split);
+
+  for (int i = 0; i < op->outputs.size(); ++i) {
+    const auto& output = op->outputs[i];
+    Shape output_shape = input_shape;
+    (*output_shape.mutable_dims())[axis] = size_splits_vector.at(i);
+    model->GetArray(output).copy_shape(output_shape);
+  }
+}
+
 void ProcessAveragePoolOperator(Model* model, AveragePoolOperator* op) {
   const string& input_name = op->inputs[0];
   const auto& input_array = model->GetArray(input_name);
@@ -866,6 +958,34 @@ void ProcessResizeBilinearOperator(Model* model, ResizeBilinearOperator* op) {
                          output_shape[1], input_data_shape.dims(3)}));
 }
 
+void ProcessResizeNearestNeighborOperator(Model* model,
+                                          ResizeNearestNeighborOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  if (!model->GetArray(op->inputs[0]).has_shape() ||
+      !model->GetArray(op->inputs[1]).has_shape()) {
+    return;
+  }
+  const auto& input_data_shape = model->GetArray(op->inputs[0]).shape();
+
+  const string& output_size_name = op->inputs[1];
+  const auto& output_size_array = model->GetArray(output_size_name);
+  CHECK(output_size_array.data_type == ArrayDataType::kInt32);
+  CHECK(output_size_array.has_shape());
+  const auto& output_size_shape = output_size_array.shape();
+  CHECK_EQ(output_size_shape.dimensions_count(), 1);
+  CHECK_EQ(output_size_shape.dims(0), 2);
+  if (!output_size_array.buffer) {
+    return;
+  }
+  std::vector<int32> output_shape =
+      output_size_array.GetBuffer<ArrayDataType::kInt32>().data;
+  model->GetArray(op->outputs[0])
+      .copy_shape(Shape({input_data_shape.dims(0), output_shape[0],
+                         output_shape[1], input_data_shape.dims(3)}));
+}
+
 void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
   // Only required for compact LstmCell with default NUM_INPUTS of inputs.
   if (op->inputs.size() != LstmCellOperator::NUM_INPUTS) return;
@@ -946,6 +1066,49 @@ void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
       .copy_shape(activ_temp_shape);
 }
 
+void ProcessUnidirectionalSequenceLstmOperator(
+    Model* model, UnidirectionalSequenceLstmOperator* op) {
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
+
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
+  // TODO(renjieliu): check the inputs, as well as all kinds of weights.
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const int batch_size = input_shape.dims(1);
+  const int timestamp = input_shape.dims(0);
+
+  const auto& recurrent_to_output_weights_array =
+      model->GetArray(op->inputs[8]);
+  // Yield until input dims have been resolved.
+  if (!recurrent_to_output_weights_array.has_shape()) {
+    return;
+  }
+
+  constexpr int kInputActivationStateTensor = 18;
+  constexpr int kInputCellStateTensor = 19;
+  // b(115961645): This is a hack to work around.
+  model->GetArray(op->inputs[kInputActivationStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kInputCellStateTensor]).buffer.reset();
+
+  const auto& output_weights_shape = recurrent_to_output_weights_array.shape();
+  const int output_size = output_weights_shape.dims(1);
+
+  Shape* output_shape = output_array.mutable_shape();
+  output_shape->ReplaceDims({timestamp, batch_size, output_size});
+}
+
 void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
   const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
@@ -1620,9 +1783,57 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
   }
 }
 
+void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  const auto& padding_matrix = model->GetArray(op->inputs[1]);
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  // If output already computed or padding matrix is non
+  // const then return.
+  if (output_array.has_shape() ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return;
+  }
+  Shape output_shape = input_array.shape();
+  std::vector<int>& dims = *output_shape.mutable_dims();
+
+  std::vector<int64_t> padding;
+  if (padding_matrix.data_type == ArrayDataType::kInt32) {
+    const auto& data = padding_matrix.GetBuffer<ArrayDataType::kInt32>().data;
+    for (auto elem : data) {
+      padding.push_back(static_cast<int64_t>(elem));
+    }
+  } else if (padding_matrix.data_type == ArrayDataType::kInt64) {
+    const auto& data = padding_matrix.GetBuffer<ArrayDataType::kInt64>().data;
+    for (auto elem : data) {
+      padding.push_back(elem);
+    }
+  } else {
+    CHECK(padding_matrix.data_type == ArrayDataType::kInt64 ||
+          padding_matrix.data_type == ArrayDataType::kInt32);
+  }
+  CHECK_EQ(padding_matrix.shape().dimensions_count(), 2);
+  CHECK_EQ(input_array.shape().dimensions_count(),
+           padding_matrix.shape().dims(0));
+  for (int i = 0; i < input_array.shape().dimensions_count(); ++i) {
+    dims[i] += padding[i * 2] + padding[i * 2 + 1];
+  }
+
+  output_array.copy_shape(output_shape);
+}
+
 }  // namespace
 
-bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status PropagateFixedSizes::Run(Model* model,
+                                              std::size_t op_index,
+                                              bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   auto* op = it->get();
   std::unordered_map<string, std::vector<int>> old_output_dims;
@@ -1633,6 +1844,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
   }
 
   switch (op->type) {
+    case OperatorType::kAbs:
     case OperatorType::kBatchNormalization:
     case OperatorType::kL2Normalization:
     case OperatorType::kDequantize:
@@ -1640,6 +1852,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
     case OperatorType::kPRelu:
+    case OperatorType::kLeakyRelu:
     case OperatorType::kSoftmax:
     case OperatorType::kLogSoftmax:
     case OperatorType::kLog:
@@ -1685,6 +1898,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kEqual:
     case OperatorType::kNotEqual:
     case OperatorType::kPow:
+    case OperatorType::kSquaredDifference:
       ProcessSimpleBinaryOperator(model, op);
       break;
     case OperatorType::kAddN:
@@ -1760,6 +1974,10 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       ProcessTensorFlowSplitOperator(model,
                                      static_cast<TensorFlowSplitOperator*>(op));
       break;
+    case OperatorType::kSplitV:
+      ProcessTensorFlowSplitVOperator(
+          model, static_cast<TensorFlowSplitVOperator*>(op));
+      break;
     case OperatorType::kSqueeze:
       ProcessSqueezeOperator(model, static_cast<SqueezeOperator*>(op));
       break;
@@ -1797,6 +2015,14 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       ProcessResizeBilinearOperator(model,
                                     static_cast<ResizeBilinearOperator*>(op));
       break;
+    case OperatorType::kResizeNearestNeighbor:
+      ProcessResizeNearestNeighborOperator(
+          model, static_cast<ResizeNearestNeighborOperator*>(op));
+      break;
+    case OperatorType::kUnidirectionalSequenceLstm:
+      ProcessUnidirectionalSequenceLstmOperator(
+          model, static_cast<UnidirectionalSequenceLstmOperator*>(op));
+      break;
     case OperatorType::kLstmCell:
       ProcessLstmCellOperator(model, static_cast<LstmCellOperator*>(op));
       break;
@@ -1836,7 +2062,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
           static_cast<TensorFlowUnsupportedOperator*>(op);
       // Attribute can be not specified, ignore it.
       if (unsupported_op->output_shapes.size() < op->outputs.size()) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       for (int i = 0; i < op->outputs.size(); ++i) {
         const string& output = op->outputs[i];
@@ -1874,6 +2100,9 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kUnpack:
       ProcessUnpackOperator(model, static_cast<UnpackOperator*>(op));
       break;
+    case OperatorType::kMirrorPad:
+      ProcessMirrorPadOperator(model, static_cast<MirrorPadOperator*>(op));
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
@@ -1886,10 +2115,11 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
         (old_output_dims[output] != model->GetArray(output).shape().dims())) {
       AddMessageF("Set shape of %s to [%s]", output,
                   absl::StrJoin(model->GetArray(output).shape().dims(), ","));
-      return true;
+      *modified = true;
+      return ::tensorflow::Status::OK();
     }
   }
-  return false;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/quantization_util.cc b/tensorflow/lite/toco/graph_transformations/quantization_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..56f83c9793f723baa13e37924fa566d3018aa0d0
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/quantization_util.cc
@@ -0,0 +1,277 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool InferQuantizedDataTypeFromFakeQuant(
+    const FakeQuantOperator& op, ArrayDataType* out_quantized_data_type) {
+  if (op.num_bits <= 8) {
+    *out_quantized_data_type = ArrayDataType::kUint8;
+    return true;
+  } else if (op.num_bits <= 16) {
+    *out_quantized_data_type = ArrayDataType::kInt16;
+    return true;
+  } else {
+    *out_quantized_data_type = ArrayDataType::kNone;
+    return false;
+  }
+}
+
+bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
+                                        double* out_min_value,
+                                        double* out_max_value) {
+  switch (data_type) {
+    case ArrayDataType::kUint8:
+      *out_min_value = 0;
+      *out_max_value = 255;
+      return true;
+    case ArrayDataType::kInt16:
+      *out_min_value = -32768;
+      *out_max_value = 32767;
+      return true;
+    default:
+      return false;
+  }
+}
+
+ArrayDataType GetQuantizedDataType(const Array& array,
+                                   ArrayDataType default_type) {
+  switch (array.final_data_type) {
+    case ArrayDataType::kInt8:
+    case ArrayDataType::kUint8:
+    case ArrayDataType::kInt16:
+    case ArrayDataType::kUint16:
+    case ArrayDataType::kInt32:
+    case ArrayDataType::kUint32:
+    case ArrayDataType::kInt64:
+    case ArrayDataType::kUint64:
+      return array.final_data_type;
+    case ArrayDataType::kFloat:
+    case ArrayDataType::kNone:
+      return default_type;
+    default:
+      LOG(FATAL) << "Unhandled final quantization type "
+                 << static_cast<int>(array.final_data_type);
+  }
+}
+
+template <ArrayDataType A>
+void ChooseQuantizationParamsForArrayAndQuantizedDataType(
+    const Array& array, QuantizationParams* quantization_params) {
+  *quantization_params = ::tflite::ChooseQuantizationParams<DataType<A>>(
+      array.minmax->min, array.minmax->max, array.narrow_range);
+}
+
+void ChooseQuantizationParamsForArrayAndQuantizedDataType(
+    const Array& array, ArrayDataType quantized_data_type,
+    QuantizationParams* quantization_params) {
+  switch (quantized_data_type) {
+    case ArrayDataType::kInt8:
+      ChooseQuantizationParamsForArrayAndQuantizedDataType<
+          ArrayDataType::kInt8>(array, quantization_params);
+      break;
+    case ArrayDataType::kUint8:
+      ChooseQuantizationParamsForArrayAndQuantizedDataType<
+          ArrayDataType::kUint8>(array, quantization_params);
+      break;
+    case ArrayDataType::kInt16:
+      ChooseQuantizationParamsForArrayAndQuantizedDataType<
+          ArrayDataType::kInt16>(array, quantization_params);
+      break;
+    case ArrayDataType::kUint16:
+      ChooseQuantizationParamsForArrayAndQuantizedDataType<
+          ArrayDataType::kUint16>(array, quantization_params);
+      break;
+    case ArrayDataType::kInt32:
+      ChooseQuantizationParamsForArrayAndQuantizedDataType<
+          ArrayDataType::kInt32>(array, quantization_params);
+      break;
+    case ArrayDataType::kUint32:
+      ChooseQuantizationParamsForArrayAndQuantizedDataType<
+          ArrayDataType::kUint32>(array, quantization_params);
+      break;
+    case ArrayDataType::kInt64:
+      ChooseQuantizationParamsForArrayAndQuantizedDataType<
+          ArrayDataType::kInt64>(array, quantization_params);
+      break;
+    case ArrayDataType::kUint64:
+      ChooseQuantizationParamsForArrayAndQuantizedDataType<
+          ArrayDataType::kUint64>(array, quantization_params);
+      break;
+    case ArrayDataType::kFloat:
+    case ArrayDataType::kComplex64:
+    case ArrayDataType::kNone:
+    default:
+      LOG(FATAL) << "Unhandled final quantization type "
+                 << static_cast<int>(quantized_data_type);
+  }
+}
+
+namespace {
+
+template <ArrayDataType A>
+std::unique_ptr<GenericBuffer> QuantizeBuffer(
+    const Array& array, const QuantizationParams& quantization_params) {
+  const GenericBuffer& buffer = *array.buffer;
+  const auto inverse_scale = 1. / quantization_params.scale;
+  CHECK(buffer.type == ArrayDataType::kFloat);
+  const auto& float_buffer =
+      static_cast<const Buffer<ArrayDataType::kFloat>&>(buffer);
+  auto* quantized_buffer = new Buffer<A>;
+  quantized_buffer->data.resize(float_buffer.data.size());
+  for (std::size_t i = 0; i < float_buffer.data.size(); i++) {
+    const float src_val = float_buffer.data[i];
+    double scaled_val;  // Astonishingly, using 'float' degrades accuracy just
+                        // enough to make a few tests fail!
+    if (quantization_params.scale == 0) {
+      CHECK_EQ(src_val, 0) << "The quantization scale for this array is 0, "
+                           << "so all its values should be 0.";
+      scaled_val = quantization_params.zero_point;
+    } else {
+      scaled_val = quantization_params.zero_point + inverse_scale * src_val;
+    }
+    auto integer_val = tflite::SafeCast<DataType<A>>(std::round(scaled_val));
+    // In addition to its effect on the choice of quantization params upstream
+    // of here, narrow_range also means nudge the min quantized value by +1,
+    // so e.g. uint8 values get constrained to [1, 255].
+    if (integer_val == std::numeric_limits<DataType<A>>::min() &&
+        array.narrow_range) {
+      integer_val++;
+    }
+    quantized_buffer->data[i] = integer_val;
+  }
+  return std::unique_ptr<GenericBuffer>(quantized_buffer);
+}
+
+template <ArrayDataType A>
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name,
+                   const QuantizationParams& quantization_params) {
+  auto& array = model->GetArray(name);
+  CHECK(array.data_type == ArrayDataType::kFloat);
+  CHECK(!array.quantization_params);
+  array.GetOrCreateQuantizationParams() = quantization_params;
+  if (array.buffer) {
+    array.buffer = QuantizeBuffer<A>(array, quantization_params);
+  }
+  array.data_type = A;
+  array.final_data_type = A;
+  transformation->AddMessageF(
+      "Quantized array %s to %s zero_point=%g, scale=%g", name,
+      ArrayDataTypeName(array.data_type), quantization_params.zero_point,
+      quantization_params.scale);
+}
+
+}  // namespace
+
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name, ArrayDataType quantized_data_type,
+                   const QuantizationParams& quantization_params) {
+  ArrayDataType adjusted_data_type = quantized_data_type;
+  auto& array = model->GetArray(name);
+  if (array.final_data_type == ArrayDataType::kInt16) {
+    adjusted_data_type = array.final_data_type;
+  }
+
+  switch (adjusted_data_type) {
+    case ArrayDataType::kUint8:
+      return QuantizeArray<ArrayDataType::kUint8>(transformation, model, name,
+                                                  quantization_params);
+    case ArrayDataType::kInt16:
+      return QuantizeArray<ArrayDataType::kInt16>(transformation, model, name,
+                                                  quantization_params);
+    case ArrayDataType::kInt32:
+      return QuantizeArray<ArrayDataType::kInt32>(transformation, model, name,
+                                                  quantization_params);
+    default:
+      LOG(FATAL) << "Unhandled case.";
+  }
+}
+
+bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
+                                 const Array& array, double clamp_min,
+                                 double clamp_max) {
+  ArrayDataType quantized_data_type =
+      GetQuantizedDataType(array, array.data_type);
+  if (quantized_data_type == ArrayDataType::kNone ||
+      quantized_data_type == ArrayDataType::kFloat) {
+    // The array is not (or never will be) quantized.
+    return false;
+  }
+
+  QuantizationParams quantization_params;
+  if (!array.quantization_params) {
+    if (!array.minmax) {
+      transformation->AddMessageF("No quantization params and no minmax");
+      return false;
+    } else {
+      // Work around cases where we are asking for this prior to the Quantize
+      // transformation having added the quantization_params.
+      ChooseQuantizationParamsForArrayAndQuantizedDataType(
+          array, quantized_data_type, &quantization_params);
+      transformation->AddMessageF(
+          "No quantization params - infering from data type %s with minmax "
+          "%g,%g as zero_point=%g, scale=%g",
+          ArrayDataTypeName(quantized_data_type), array.minmax->min,
+          array.minmax->max, quantization_params.zero_point,
+          quantization_params.scale);
+    }
+  } else {
+    quantization_params = array.GetQuantizationParams();
+  }
+
+  double quantized_min, quantized_max;
+  CHECK(GetQuantizedDataTypeNumericalRange(quantized_data_type, &quantized_min,
+                                           &quantized_max))
+      << "Type is not quantized";
+
+  bool has_nontrivial_min_bound = false;
+  bool has_nontrivial_max_bound = false;
+
+  double lowest_representable_output =
+      (quantized_min - quantization_params.zero_point) *
+      quantization_params.scale;
+  if (lowest_representable_output < clamp_min) {
+    has_nontrivial_min_bound = true;
+    transformation->AddMessageF(
+        "Quantized activation function is not trivial: "
+        "the lowest representable output value %g"
+        " less than the clamp min bound %g.",
+        lowest_representable_output, clamp_min);
+  }
+
+  double highest_representable_output =
+      (quantized_max - quantization_params.zero_point) *
+      quantization_params.scale;
+  if (highest_representable_output > clamp_max) {
+    has_nontrivial_max_bound = true;
+    transformation->AddMessageF(
+        "Quantized activation function is not trivial: "
+        "the highest representable output value %g"
+        " is greater than the clamp max bound %g.",
+        highest_representable_output, clamp_max);
+  }
+
+  return !has_nontrivial_min_bound && !has_nontrivial_max_bound;
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/quantization_util.h b/tensorflow/lite/toco/graph_transformations/quantization_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d226aeab8b788c0c52c504a6f60e6d8fdfc2b3fa
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/quantization_util.h
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
+#define TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
+
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+
+namespace toco {
+
+// Gets the target quantized data type of an array based on the fake quant op.
+// For example, if the num_bits is 8 the data type will be kUint8.
+bool InferQuantizedDataTypeFromFakeQuant(
+    const FakeQuantOperator& op, ArrayDataType* out_quantized_data_type);
+
+// Gets the min/max numerical range for the given quantized data type.
+// For example, kUint8 will return [0,255].
+// Returns true if the ranges were set and false if the type is not quantized.
+bool GetQuantizedDataTypeNumericalRange(ArrayDataType data_type,
+                                        double* out_min_value,
+                                        double* out_max_value);
+
+// Returns the quantized data type of an array, falling back to the provided
+// default data type.
+ArrayDataType GetQuantizedDataType(const Array& array,
+                                   ArrayDataType default_type);
+
+// Chooses the quantization params for a given array and a given target
+// quantized data type (which may not be the array's current data type).
+void ChooseQuantizationParamsForArrayAndQuantizedDataType(
+    const Array& array, ArrayDataType quantized_data_type,
+    QuantizationParams* quantization_params);
+
+// Quantizes an array by setting its data type and (if constant) quantizing
+// all values in the array.
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name, ArrayDataType quantized_data_type,
+                   const QuantizationParams& quantization_params);
+
+// Returns true if the given array, when quantized, contains only values between
+// the provided clamp min/max.
+// Either clamp_min or clamp_max may be +/-infinity to indicate that the value
+// is unbounded on that side.
+bool IsArrayQuantizedRangeSubset(GraphTransformation* transformation,
+                                 const Array& array, double clamp_min,
+                                 double clamp_max);
+
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
similarity index 97%
rename from tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
rename to tensorflow/lite/toco/graph_transformations/quantize.cc
index fb299c31b793619c4fb5203211c79f4b32a82af3..1146078c301fd1b880c99da23e5be8223efe31e3 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -20,11 +20,11 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -57,13 +57,15 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kDepthToSpace ||
          type == OperatorType::kLstmCell || type == OperatorType::kGather ||
          type == OperatorType::kTranspose || type == OperatorType::kMean ||
-         type == OperatorType::kGreater ||
+         type == OperatorType::kEqual || type == OperatorType::kGreater ||
          type == OperatorType::kGreaterEqual || type == OperatorType::kLess ||
          type == OperatorType::kLessEqual || type == OperatorType::kSelect ||
          type == OperatorType::kArgMax || type == OperatorType::kRelu ||
          type == OperatorType::kRelu1 || type == OperatorType::kRelu6 ||
          type == OperatorType::kShape || type == OperatorType::kExpandDims ||
-         type == OperatorType::kPack || type == OperatorType::kTopK_V2;
+         type == OperatorType::kPack || type == OperatorType::kTopK_V2 ||
+         type == OperatorType::kResizeNearestNeighbor ||
+         type == OperatorType::kPRelu;
 }
 
 // The quantized op allows output arrays of type float using
@@ -359,7 +361,7 @@ bool ChooseQuantizationForOperatorOutput(
       op.type == OperatorType::kSpaceToDepth ||
       op.type == OperatorType::kReshape || op.type == OperatorType::kSplit ||
       op.type == OperatorType::kRelu || op.type == OperatorType::kRelu1 ||
-      op.type == OperatorType::kRelu6) {
+      op.type == OperatorType::kRelu6 || op.type == OperatorType::kPRelu) {
     int data_input_index = 0;
     if (op.type == OperatorType::kSplit) {
       data_input_index = 1;
@@ -439,7 +441,9 @@ void FixMinMaxPostQuantization(GraphTransformation* transformation,
 
 }  // namespace
 
-bool Quantize::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status Quantize::Run(Model* model, std::size_t op_index,
+                                   bool* modified) {
+  *modified = false;
   // Our general "quantization" graph transformation consists in replacing
   //   QuantizedInputArrays[] ->
   //     DequantizeOperators[] ->
@@ -460,7 +464,7 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
   auto& op = *model->operators[op_index];
   if (op.type == OperatorType::kDequantize ||
       op.type == OperatorType::kFakeQuant) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Our assumption here is that the input arrays are already quantized -
@@ -497,7 +501,7 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
       if (!array.minmax && !array.buffer) {
         LOG(ERROR) << "Can't quantize input array " << input
                    << " because it lacks min/max info";
-        return false;
+        return ::tensorflow::Status::OK();
       }
       const auto* other_op = GetOpWithOutput(*model, input);
       if (other_op && other_op->type != OperatorType::kDequantize) {
@@ -507,7 +511,7 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
             "which means that we should yield and let other ops "
             "get quantized first",
             LogName(op), input);
-        return false;
+        return ::tensorflow::Status::OK();
       }
     }
   }
@@ -672,7 +676,8 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
     }
   }
 
-  return changed;
+  *modified = changed;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc b/tensorflow/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc
similarity index 84%
rename from tensorflow/contrib/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc
rename to tensorflow/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc
index eaa9d3bcda5e25da2e1a31bf37be804cbe15bdd0..4d621018dc3fc58236512bbda943991e74030712 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc
+++ b/tensorflow/lite/toco/graph_transformations/read_array_minmax_and_narrow_range_from_fake_quant.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -51,18 +51,19 @@ bool ApplyAttrsToArray(GraphTransformation* transformation, Model* model,
 
 }  // end namespace
 
-bool ReadArrayMinmaxAndNarrowRangeFromFakeQuant::Run(Model* model,
-                                                     std::size_t op_index) {
+::tensorflow::Status ReadArrayMinmaxAndNarrowRangeFromFakeQuant::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  *modified = false;
   const auto fakequant_it = model->operators.begin() + op_index;
   auto* fakequant_base_op = fakequant_it->get();
   if (fakequant_base_op->type != OperatorType::kFakeQuant) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* fq_op = static_cast<FakeQuantOperator*>(fakequant_base_op);
 
   if (!fq_op->minmax) {
     // Need to be resolved first by ResolveFakeQuantArgsFromVars.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // At this point, this FakeQuantOperator should have a MinMax
@@ -74,7 +75,8 @@ bool ReadArrayMinmaxAndNarrowRangeFromFakeQuant::Run(Model* model,
   bool changed = false;
   changed |= ApplyAttrsToArray(this, model, *fq_op, fq_op->inputs[0]);
   changed |= ApplyAttrsToArray(this, model, *fq_op, fq_op->outputs[0]);
-  return changed;
+  *modified = changed;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc b/tensorflow/lite/toco/graph_transformations/remove_final_dequantize_op.cc
similarity index 76%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
rename to tensorflow/lite/toco/graph_transformations/remove_final_dequantize_op.cc
index c3b2709a33d54213661ba96394b01aa2cfd1a278..ed551d0122348ef4412e9e521bfea89d901e13fc 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_final_dequantize_op.cc
@@ -17,19 +17,22 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool RemoveFinalDequantizeOp::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status RemoveFinalDequantizeOp::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   const auto dequantize_it = model->operators.begin() + op_index;
   const auto* dequantize_op = dequantize_it->get();
   if (dequantize_op->type != OperatorType::kDequantize) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto& output = dequantize_op->outputs[0];
   // We can remove any dequantize op whose output is not consumed by
@@ -38,7 +41,7 @@ bool RemoveFinalDequantizeOp::Run(Model* model, std::size_t op_index) {
   // in the middle of the graph might be designated as an output
   // array.
   if (CountOpsWithInput(*model, output)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // If one of the model's output arrays was actually the Dequantize op's
@@ -53,7 +56,8 @@ bool RemoveFinalDequantizeOp::Run(Model* model, std::size_t op_index) {
   AddMessageF("Removed final %s", LogName(*dequantize_op));
   model->EraseArray(output);
   model->operators.erase(dequantize_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_assert.cc b/tensorflow/lite/toco/graph_transformations/remove_tensorflow_assert.cc
similarity index 77%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_assert.cc
rename to tensorflow/lite/toco/graph_transformations/remove_tensorflow_assert.cc
index 73ad326299bbd929afbb8dda2c41b97a126afbe1..647146b407116a0c1197f07ad2a3379007b7d184 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_assert.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_tensorflow_assert.cc
@@ -16,18 +16,21 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool RemoveTensorFlowAssert::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status RemoveTensorFlowAssert::Run(Model* model,
+                                                 std::size_t op_index,
+                                                 bool* modified) {
+  *modified = false;
   const auto assert_it = model->operators.begin() + op_index;
   const auto* assert_op = assert_it->get();
   if (assert_op->type != OperatorType::kAssert) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   bool changed = false;
@@ -54,7 +57,8 @@ bool RemoveTensorFlowAssert::Run(Model* model, std::size_t op_index) {
 
   // That's it. We can stop here, no need to duplicate the work that
   // RemoveUnusedOp will do removing this now-unused node.
-  return changed;
+  *modified = changed;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/remove_tensorflow_identity.cc b/tensorflow/lite/toco/graph_transformations/remove_tensorflow_identity.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0f7bc9a053b5d8b5a7941221e4383883a4de4f5
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/remove_tensorflow_identity.cc
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status RemoveTensorFlowIdentity::Run(Model* model,
+                                                   std::size_t op_index,
+                                                   bool* modified) {
+  *modified = false;
+  const auto passthru_it = model->operators.begin() + op_index;
+  const auto* passthru_op = passthru_it->get();
+  if (passthru_op->type != OperatorType::kIdentity) {
+    return ::tensorflow::Status::OK();
+  }
+
+  *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc
similarity index 84%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
rename to tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc
index 0dfdc40e4c3410330135736690af4a85b42a0041..8879a7cd2664ed3f32e32435f9d45c0744dfbea2 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_binary.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -46,14 +46,17 @@ bool AreAllBufferElementsEqualTo(const std::vector<Scalar>& buffer_data,
 // For example, an Add operator is trivial if
 // one of its operands is constant 0, a Mul operator is trivial
 // if one of its operands is constant 1, etc.
-bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status RemoveTrivialBinaryOperator::Run(Model* model,
+                                                      std::size_t op_index,
+                                                      bool* modified) {
+  *modified = false;
   const auto binary_it = model->operators.begin() + op_index;
   auto* binary_op = binary_it->get();
   if (binary_op->type != OperatorType::kAdd &&
       binary_op->type != OperatorType::kMul &&
       binary_op->type != OperatorType::kSub &&
       binary_op->type != OperatorType::kDiv) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(binary_op->inputs.size(), 2);
@@ -66,12 +69,12 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
   };
   if (!is_input_constant[0] && !is_input_constant[1]) {
     // Neither input is constant, so nothing we can resolve here.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (is_input_constant[0] && is_input_constant[1]) {
     // Both inputs are constants. That's a job for constants
     // propagation, not for us to handle here.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const int index_of_constant_input = is_input_constant[0] ? 0 : 1;
   const int index_of_variable_input = is_input_constant[0] ? 1 : 0;
@@ -84,7 +87,7 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
   const auto& input_array_1 = model->GetArray(binary_op->inputs[1]);
   if (!input_array_0.has_shape() || !input_array_1.has_shape()) {
     // Both input shapes must be known.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (input_array_0.shape().dimensions_count() ==
           input_array_1.shape().dimensions_count() &&
@@ -94,7 +97,7 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
         "(lhs %s, rhs %s)",
         LogName(*binary_op), ShapeToString(input_array_0.shape()),
         ShapeToString(input_array_1.shape()));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Now check if the constant operand makes this binary
@@ -103,7 +106,7 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
       model->GetArray(binary_op->inputs[index_of_constant_input]);
   // For now, we only handle floats here.
   if (constant_input_array.data_type != ArrayDataType::kFloat) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto& constant_input_float_data =
       constant_input_array.GetBuffer<ArrayDataType::kFloat>().data;
@@ -121,12 +124,13 @@ bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
   }
 
   if (!is_trivial) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Now we know that this node is trivial, so we can remove it.
   AddMessageF("Removing trivial %s", LogName(*binary_op));
-  return RemoveTrivialPassthroughOp(this, model, op_index);
+  *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/remove_trivial_concatenation.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_concatenation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bfa9314a6964f431b4a6ffb7225039c975866da1
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_concatenation.cc
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status RemoveTrivialConcatenation::Run(Model* model,
+                                                     std::size_t op_index,
+                                                     bool* modified) {
+  *modified = false;
+  const auto concat_it = model->operators.begin() + op_index;
+  auto* concat_op = concat_it->get();
+  if (concat_op->type != OperatorType::kConcatenation) {
+    return ::tensorflow::Status::OK();
+  }
+  if (concat_op->inputs.size() != 1) {
+    return ::tensorflow::Status::OK();
+  }
+  *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
similarity index 79%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
rename to tensorflow/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
index 936854a04fd600ea23ab5dda50370f85a311c28c..565ccb663a8008332f41c0ec2565a4ad547ee585 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
@@ -18,14 +18,17 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool RemoveTrivialConcatenationInput::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status RemoveTrivialConcatenationInput::Run(Model* model,
+                                                          std::size_t op_index,
+                                                          bool* modified) {
+  *modified = false;
   // TensorFlow allows Concatenation nodes to have 0-D inputs,
   // and they are then treated as empty i.e. omitted from concatenation,
   // in violation of the notion that 0-D is equivalent to 1x1x1x1.
@@ -36,7 +39,7 @@ bool RemoveTrivialConcatenationInput::Run(Model* model, std::size_t op_index) {
   const auto concat_it = model->operators.begin() + op_index;
   auto* concat_op = concat_it->get();
   if (concat_op->type != OperatorType::kConcatenation) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   std::vector<string> trivial_inputs;
   std::vector<string> nontrivial_inputs;
@@ -52,7 +55,7 @@ bool RemoveTrivialConcatenationInput::Run(Model* model, std::size_t op_index) {
   }
 
   if (trivial_inputs.empty()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Drop trivial inputs.
@@ -63,7 +66,8 @@ bool RemoveTrivialConcatenationInput::Run(Model* model, std::size_t op_index) {
     }
   }
   concat_op->inputs = nontrivial_inputs;
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_fake_quant.cc
similarity index 79%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc
rename to tensorflow/lite/toco/graph_transformations/remove_trivial_fake_quant.cc
index 2c8d04440f251f792d2a09155dd26fc01a732109..2891e41f3072c6ea906336039274aee7b32c6071 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_fake_quant.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_fake_quant.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -64,23 +64,27 @@ bool IsFakeQuantTrivial(GraphTransformation* transformation, const Model& model,
 }  // namespace
 
 // Removes FakeQuant ops that are trivial (have no effect, are redundant, etc).
-bool RemoveTrivialFakeQuant::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status RemoveTrivialFakeQuant::Run(Model* model,
+                                                 std::size_t op_index,
+                                                 bool* modified) {
+  *modified = false;
   const auto op_it = model->operators.begin() + op_index;
   auto* op = op_it->get();
   if (op->type != OperatorType::kFakeQuant) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* fakequant_op = static_cast<FakeQuantOperator*>(op);
 
   if (!IsFakeQuantTrivial(this, *model, *fakequant_op)) {
     AddMessageF("%s is not trivial", LogName(*fakequant_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Removing trivial %s", LogName(*fakequant_op));
 
   CHECK_EQ(fakequant_op->inputs.size(), 1);
-  return RemoveTrivialPassthroughOp(this, model, op_index);
+  *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc
similarity index 97%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
rename to tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc
index d5983a1f12ffbc2b6c06432741f384229461a12d..5239d550762fe3fc76b3d3cd156f63f98ed951ce 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h b/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h
similarity index 86%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
rename to tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h
index 663704e5acf745d3768ad682e0a7888f0a690e6c..315edc0121afc6c0b50fd0c9a64a3ec3c2323492 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#ifndef TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#define TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
 
 namespace toco {
 
@@ -55,4 +55,4 @@ bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#endif  // TENSORFLOW_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
similarity index 82%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
rename to tensorflow/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
index 752560e075a087bcc2b0a3cb19dad484fb582d42..56acf22f7f1de293e9c515c5ecf508f44390535f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
@@ -17,13 +17,13 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/toco_types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/toco_types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -94,12 +94,13 @@ bool IsTrivialFusedActivationFunc(
 // Attempts to remove both fused and unfused activation functions if the
 // quantization params indicate that the representable values fall inside the
 // activation range.
-bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
-                                               std::size_t op_index) {
+::tensorflow::Status RemoveTrivialQuantizedActivationFunc::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   auto* op = it->get();
   if (op->inputs.empty()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (IsTrivialUnfusedActivationFunc(this, *model, op->type, op->inputs[0])) {
@@ -107,7 +108,8 @@ bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
         "Removing trivial unfused activation function %s because the input "
         "minmax imply at least as tight a clamp anyway.",
         LogName(*op));
-    return RemoveTrivialPassthroughOp(this, model, op_index);
+    *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+    return ::tensorflow::Status::OK();
   }
   if (IsTrivialFusedActivationFunc(this, *model, op->fused_activation_function,
                                    op->outputs[0])) {
@@ -117,9 +119,10 @@ bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
         "because the output quantization parameters imply at least as tight "
         "a clamp anyway.",
         LogName(*op));
-    return true;
+    *modified = true;
+    return ::tensorflow::Status::OK();
   }
-  return false;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
similarity index 76%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
rename to tensorflow/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
index 142c876b154755ac9c6b93e560f22ec8d6ec6563..f1037994c976160dcf6bea0af948c417c2ffed7c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_quantized_min_max.cc
@@ -17,13 +17,13 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/toco_types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/toco_types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -69,22 +69,26 @@ bool IsTrivialMinMax(GraphTransformation* transformation, const Model& model,
 
 // Attempts to remove min/max functions if the quantization params indicate that
 // the representable values fall inside the clip range.
-bool RemoveTrivialQuantizedMinMax::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status RemoveTrivialQuantizedMinMax::Run(Model* model,
+                                                       std::size_t op_index,
+                                                       bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   auto* op = it->get();
   if ((op->type != OperatorType::kMinimum &&
        op->type != OperatorType::kMaximum) ||
       op->inputs.size() != 2) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (IsTrivialMinMax(this, *model, op->type, op->inputs[0], op->inputs[1])) {
     AddMessageF(
         "Removing trivial min/max %s because the quantization parameters imply "
         "at least as tight a clamp anyway.",
         LogName(*op));
-    return RemoveTrivialPassthroughOp(this, model, op_index);
+    *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+    return ::tensorflow::Status::OK();
   }
-  return false;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_reshape.cc
similarity index 83%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
rename to tensorflow/lite/toco/graph_transformations/remove_trivial_reshape.cc
index 5295eeccecb05b05232922f4b5e4ef75a2b04672..7dea3c79c57a484e4d65f2421fda2adb99f0a681 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_reshape.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -81,22 +81,26 @@ bool IsReshapeTrivial(const Model& model, const Operator& op,
 
 }  // namespace
 
-bool RemoveTrivialReshape::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status RemoveTrivialReshape::Run(Model* model,
+                                               std::size_t op_index,
+                                               bool* modified) {
+  *modified = false;
   const auto reshape_it = model->operators.begin() + op_index;
   auto* reshape_op = reshape_it->get();
   if (reshape_op->type != OperatorType::kReshape) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!IsReshapeTrivial(*model, *reshape_op, this)) {
     AddMessageF("%s is not trivial", LogName(*reshape_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Removing trivial %s", LogName(*reshape_op));
 
   CHECK_EQ(reshape_op->inputs.size(), 2);
-  return RemoveTrivialPassthroughOp(this, model, op_index);
+  *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc b/tensorflow/lite/toco/graph_transformations/remove_trivial_slice.cc
similarity index 75%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc
rename to tensorflow/lite/toco/graph_transformations/remove_trivial_slice.cc
index 0cbbcd7c814d38e32ee55e9d9271adf532d20924..330e16b3afdc88c1f3dba55b091c20a166946ebb 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_slice.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_trivial_slice.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -49,21 +49,24 @@ bool IsSliceTrivial(const Model& model, const Operator& op,
 
 }  // namespace
 
-bool RemoveTrivialSlice::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status RemoveTrivialSlice::Run(Model* model, std::size_t op_index,
+                                             bool* modified) {
+  *modified = false;
   const auto reshape_it = model->operators.begin() + op_index;
   auto* slice_op = reshape_it->get();
   if (slice_op->type != OperatorType::kSlice) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!IsSliceTrivial(*model, *slice_op, this)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Removing trivial %s", LogName(*slice_op));
 
   CHECK_EQ(slice_op->inputs.size(), 3);
-  return RemoveTrivialPassthroughOp(this, model, op_index);
+  *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc b/tensorflow/lite/toco/graph_transformations/remove_unused_op.cc
similarity index 85%
rename from tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
rename to tensorflow/lite/toco/graph_transformations/remove_unused_op.cc
index dde91234a8240f4518cd105c2cc4e79102735980..ac05afb81947a45855cec3a12c73c928ee210c65 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
+++ b/tensorflow/lite/toco/graph_transformations/remove_unused_op.cc
@@ -17,15 +17,17 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status RemoveUnusedOp::Run(Model* model, std::size_t op_index,
+                                         bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   const auto* op = it->get();
 
@@ -58,7 +60,7 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     }
     for (const string& output_array : model->flags.output_arrays()) {
       if (output == output_array) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
     }
     for (const auto& rnn_state : model->flags.rnn_states()) {
@@ -67,19 +69,19 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
         if (!IsDiscardableArray(*model, rnn_state.back_edge_source_array()) ||
             !IsDiscardableArray(*model, rnn_state.state_array()) ||
             CountOpsWithInput(*model, rnn_state.state_array())) {
-          return false;
+          return ::tensorflow::Status::OK();
         }
       }
     }
     if (CountOpsWithInput(*model, output)) {
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
 
   if (op->unresolved_outputs) {
     AddMessageF("Not discarding %s because it has unresolved outputs.",
                 LogName(*op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Discarding %s because none of its outputs is used.",
@@ -105,7 +107,8 @@ bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
     }
   }
   model->operators.erase(it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
similarity index 87%
rename from tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
rename to tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
index 550de83018f25a7aa4da82707fedb86434615fb0..6a4b9198548956217d24693bceff2bd6b3b8f0a6 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
+++ b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -63,29 +63,32 @@ bool IsMoveOperator(OperatorType optype) {
 
 // Swap elementwise operators such that all value operators occur before all
 // element move operators, e.g. negation then transpose.
-bool ReorderElementwiseUnary::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ReorderElementwiseUnary::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   const auto element_op_it = model->operators.begin() + op_index;
   std::unique_ptr<Operator>& element_op = *element_op_it;
   if (!IsElementwiseOperator(element_op->type)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const string intermediate_name = element_op->inputs[0];
   auto it = FindOpWithOutput(*model, intermediate_name);
   if (it == model->operators.end()) {
     AddMessageF("No preceding operator");
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   std::unique_ptr<Operator>& move_op = *it;
   if (!IsMoveOperator(move_op->type)) {
     AddMessageF("Preceding operator is not a move operator");
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (CountOpsWithInput(*model, intermediate_name) != 1) {
     AddMessageF("Input %s used elsewhere", intermediate_name);
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Check that the intermediate is discardable.
@@ -94,7 +97,7 @@ bool ReorderElementwiseUnary::Run(Model* model, std::size_t op_index) {
         "Cannot swap elementwise as it would invalidate %s which is "
         "an output array.",
         intermediate_name);
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // op->inputs may change so we need to keep a value by copy.
@@ -147,7 +150,8 @@ bool ReorderElementwiseUnary::Run(Model* model, std::size_t op_index) {
   // Swap the order of the operators.
   element_op.swap(move_op);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
similarity index 91%
rename from tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
rename to tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
index c907a597cb719b68dbf36868a75e49a7c5181423..fdd411c84c2678bc483b00849d5142665e706fac 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
+++ b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -101,37 +101,40 @@ std::vector<int> ComputeNewPerm(std::vector<int> input_dims,
 
 // Swaps reshape-transpose to transpose-reshape whenever possible. This is
 // possible when the reshape does not affect memory ordering.
-bool ReorderReshapeTranspose::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ReorderReshapeTranspose::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   auto transpose_it = model->operators.begin() + op_index;
 
   TransposeOperator* transpose_op = ConvertOperator<TransposeOperator*>(
       transpose_it->get(), OperatorType::kTranspose);
 
   if (transpose_op == nullptr) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
     // Wait for values to propagate.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Find the operator that produces the transpose op.
   auto reshape_it = FindOpWithOutput(*model, transpose_op->inputs[0]);
   if (reshape_it == model->operators.end()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   TensorFlowReshapeOperator* reshape_op =
       ConvertOperator<TensorFlowReshapeOperator*>(reshape_it->get(),
                                                   OperatorType::kReshape);
   if (reshape_op == nullptr) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Ignore if the reshape is uninitialized.
   if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Need to copy to keep static if permutated.
@@ -142,7 +145,7 @@ bool ReorderReshapeTranspose::Run(Model* model, std::size_t op_index) {
   // Intermediate should not be consumed by any other operators.
   if (CountOpsWithInput(*model, intermediate_name) != 1) {
     AddMessageF("Input %s used elsewhere", intermediate_name);
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Check that the intermediate is not an output array.
@@ -151,7 +154,7 @@ bool ReorderReshapeTranspose::Run(Model* model, std::size_t op_index) {
         "Cannot reorder reshape-transpose as it would invalidate %s which is "
         "an output array.",
         intermediate_name);
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Get the arrays.
@@ -173,7 +176,7 @@ bool ReorderReshapeTranspose::Run(Model* model, std::size_t op_index) {
   // dimensions then it can be moved between the transpose.
   if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
                                       true /*allow_extra_unary_dims*/)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!IsDiscardableArray(*model, output_name)) {
@@ -242,7 +245,8 @@ bool ReorderReshapeTranspose::Run(Model* model, std::size_t op_index) {
   // Swap the order of the operators.
   transpose_it->swap(*reshape_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc
similarity index 90%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc
index 8f2c1f81628398d8c823d27ff50d59e80497d0e1..e972e5c9014865b4ae8486377a1edd6a6cdd1a3f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_batch_normalization.cc
@@ -17,18 +17,21 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveBatchNormalization::Run(Model* model,
+                                                    std::size_t op_index,
+                                                    bool* modified) {
+  *modified = false;
   auto bn_it = model->operators.begin() + op_index;
   if (bn_it->get()->type != OperatorType::kBatchNormalization) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto* bn_op =
       static_cast<const BatchNormalizationOperator*>(bn_it->get());
@@ -53,7 +56,7 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   // so we need to exit early if these buffers don't exist (i.e. if the params
   // haven't yet been resolved as constants).
   if (!mean_array.buffer || !multiplier_array.buffer || !offset_array.buffer) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Create the new Mul, Add operators
@@ -142,7 +145,8 @@ bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
   DCHECK_EQ(bn_it->get(), bn_op);
   model->operators.erase(bn_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7aa92de4f6f87877e7751b7d6c4177a7d8408e3c
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_batch_to_space_nd_attributes.cc
@@ -0,0 +1,79 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status ResolveBatchToSpaceNDAttributes::Run(Model* model,
+                                                          std::size_t op_index,
+                                                          bool* modified) {
+  *modified = false;
+  const auto op_it = model->operators.begin() + op_index;
+  if (op_it->get()->type != OperatorType::kBatchToSpaceND)
+    return ::tensorflow::Status::OK();
+
+  auto* op = static_cast<BatchToSpaceNDOperator*>(op_it->get());
+
+  // The attributes are resolved only when the 3 attributes (block_shape,
+  // before_crops, after_crops) are all constant.
+  if (!op->block_shape.empty()) {
+    return ::tensorflow::Status::OK();
+  }
+
+  CHECK_EQ(op->inputs.size(), 3);
+  if (!IsConstantParameterArray(*model, op->inputs[1]) ||
+      !IsConstantParameterArray(*model, op->inputs[2]))
+    return ::tensorflow::Status::OK();
+
+  // Handle crops
+  const auto& crops_array = model->GetArray(op->inputs[2]);
+  if (!crops_array.has_shape()) return ::tensorflow::Status::OK();
+  const std::vector<int>& crops_dims = crops_array.shape().dims();
+  if (crops_dims.size() != 2) {
+    // Code only handles crops of 2 dimensions. Perhaps another transformation
+    // will delete this op.
+    return ::tensorflow::Status::OK();
+  }
+  const std::vector<int>& crops_buffer =
+      crops_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < crops_dims[0]; ++i) {
+    op->before_crops.push_back(crops_buffer[i * 2]);
+    op->after_crops.push_back(crops_buffer[i * 2 + 1]);
+  }
+
+  // Handle block_shape
+  const auto& block_shape_array = model->GetArray(op->inputs[1]);
+  if (!block_shape_array.has_shape()) return ::tensorflow::Status::OK();
+  const std::vector<int>& block_shape_dims = block_shape_array.shape().dims();
+  CHECK_EQ(block_shape_dims.size(), 1);
+  const std::vector<int>& block_shape_buffer =
+      block_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < block_shape_dims[0]; ++i) {
+    op->block_shape.push_back(block_shape_buffer[i]);
+  }
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_binary.cc
similarity index 94%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_binary.cc
index f7e5aa6609bd4f7eb2a95750125e30a7803b36e1..0e1671c61c6b89e9099cf7dfcfafcf18f1e195fa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_binary.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -188,7 +188,10 @@ void EvaluateBinaryOperatorOnConstantInputs(Model* model,
 }
 }  // namespace
 
-bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantBinaryOperator::Run(Model* model,
+                                                        std::size_t op_index,
+                                                        bool* modified) {
+  *modified = false;
   const auto binary_it = model->operators.begin() + op_index;
   const auto* binary_op = binary_it->get();
   // Test for binary ops of types that we know how to resolve
@@ -204,7 +207,7 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
       binary_op->type != OperatorType::kLessEqual &&
       binary_op->type != OperatorType::kGreater &&
       binary_op->type != OperatorType::kGreaterEqual) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   CHECK_EQ(binary_op->inputs.size(), 2);
 
@@ -212,13 +215,13 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
   const auto& input1_array = model->GetArray(binary_op->inputs[1]);
   // Check if both inputs are constant parameters.
   if (!input0_array.buffer || !input1_array.buffer) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   auto& output_array = model->GetArray(binary_op->outputs[0]);
   // Yield until the output array dims have been resolved.
   if (!output_array.has_shape()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // At the moment we don't want to care about fused activation functions.
@@ -229,7 +232,7 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
     AddMessageF(
         "Not resolving constant %s because it has a fused activation function",
         LogName(*binary_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Check that input data types agree.
@@ -253,7 +256,8 @@ bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
   AddMessageF("Resolved constant %s to the equivalent constant array",
               LogName(*binary_op));
   model->operators.erase(binary_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc
similarity index 87%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index d916ae0ddf017fe6a2fb2709db6e9de8c258adfc..98ff4ab02ea621583fb93dda7686839afe18b8a5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -135,11 +135,14 @@ void SetMinMaxForConcatenedArray(GraphTransformation* transformation,
 }  // namespace
 
 // Resolves the concatenation operator if all its inputs are constant arrays.
-bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantConcatenation::Run(Model* model,
+                                                       std::size_t op_index,
+                                                       bool* modified) {
+  *modified = false;
   const auto concat_it = model->operators.begin() + op_index;
   const auto* concat_base_op = concat_it->get();
   if (concat_base_op->type != OperatorType::kConcatenation) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto* concat_op =
       static_cast<const ConcatenationOperator*>(concat_base_op);
@@ -149,11 +152,15 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
     // We  also make sure the shapes of the input arrays are known and they are
     // all discardable.
     const Operator* input_op = GetOpWithOutput(*model, input_name);
-    if (input_op) return false;
-    if (!IsConstantParameterArray(*model, input_name)) return false;
-    if (!model->GetArray(input_name).has_shape()) return false;
-    if (model->GetArray(input_name).quantization_params) return false;
-    if (!IsDiscardableArray(*model, input_name)) return false;
+    if (input_op) return ::tensorflow::Status::OK();
+    if (!IsConstantParameterArray(*model, input_name))
+      return ::tensorflow::Status::OK();
+    if (!model->GetArray(input_name).has_shape())
+      return ::tensorflow::Status::OK();
+    if (model->GetArray(input_name).quantization_params)
+      return ::tensorflow::Status::OK();
+    if (!IsDiscardableArray(*model, input_name))
+      return ::tensorflow::Status::OK();
   }
 
   const int concatenation_axis = concat_op->axis;
@@ -191,6 +198,10 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
       ConcatenateTensorBuffers<ArrayDataType::kString>(
           input_arrays, concatenation_axis, &concatenated_array);
       break;
+    case ArrayDataType::kComplex64:
+      ConcatenateTensorBuffers<ArrayDataType::kComplex64>(
+          input_arrays, concatenation_axis, &concatenated_array);
+      break;
     default:
       LOG(FATAL) << "ArrayDataType not supported";
   }
@@ -205,7 +216,8 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
 
   // Remove concatenate operator.
   model->operators.erase(concat_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
similarity index 88%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
index f5f2f77460c7624298d8e49a0ea30527a45bd960..d52f7d49169c7e84e8103cdf80f1baebeb4be946 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -59,11 +59,14 @@ void GetBoundsForQuantizedDataType(ArrayDataType quantized_data_type,
   }
 }
 
-bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantFakeQuant::Run(Model* model,
+                                                   std::size_t op_index,
+                                                   bool* modified) {
+  *modified = false;
   const auto fakequant_it = model->operators.begin() + op_index;
   const auto* fakequant_base_op = fakequant_it->get();
   if (fakequant_base_op->type != OperatorType::kFakeQuant) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto* fakequant_op =
@@ -71,12 +74,12 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
 
   // Yield until the fakequant MinMax has been resolved.
   if (!fakequant_op->minmax) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // This transformation only applies when the input array is constant.
   if (!IsConstantParameterArray(*model, fakequant_op->inputs[0])) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto& input_array = model->GetArray(fakequant_op->inputs[0]);
@@ -87,7 +90,7 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
   if (!InferQuantizedDataTypeFromFakeQuant(*fakequant_op,
                                            &quantized_data_type)) {
     AddMessageF("Unsupported FakeQuant num_bits=%d", fakequant_op->num_bits);
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Resolving constant %s", LogName(*fakequant_op));
@@ -136,7 +139,8 @@ bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
   }
   model->operators.erase(fakequant_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_fill.cc
similarity index 77%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_fill.cc
index f6f95481b57f58f497b119df73d331f13d9705c0..c9021019bf4167147aa89c25691f5c7dabba583f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fill.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_fill.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -41,11 +41,14 @@ bool ComputeFillArray(Model* model, FillOperator* op) {
   return true;
 }
 
-bool ResolveConstantFill::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantFill::Run(Model* model,
+                                              std::size_t op_index,
+                                              bool* modified) {
+  *modified = false;
   const auto fill_it = model->operators.begin() + op_index;
   auto* base_op = fill_it->get();
   if (base_op->type != OperatorType::kFill) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* op = static_cast<FillOperator*>(base_op);
 
@@ -55,44 +58,49 @@ bool ResolveConstantFill::Run(Model* model, std::size_t op_index) {
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!output_array.has_shape()) {
     // Yield until the output shape has been set by PropagateFixedShapes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto& val_array = model->GetArray(op->inputs[1]);
   if (!val_array.has_shape()) {
     // Yield until the value shape has been resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (!IsConstantParameterArray(*model, op->inputs[1])) {
     // Yield until the value is constant.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   CHECK_EQ(RequiredBufferSizeForShape(val_array.shape()), 1);
 
   switch (output_array.data_type) {
     case ArrayDataType::kFloat:
       if (!ComputeFillArray<ArrayDataType::kFloat>(model, op)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       break;
     case ArrayDataType::kUint8:
       if (!ComputeFillArray<ArrayDataType::kUint8>(model, op)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       break;
     case ArrayDataType::kInt32:
       if (!ComputeFillArray<ArrayDataType::kInt32>(model, op)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       break;
     case ArrayDataType::kInt64:
       if (!ComputeFillArray<ArrayDataType::kInt64>(model, op)) {
-        return false;
+        return ::tensorflow::Status::OK();
+      }
+      break;
+    case ArrayDataType::kComplex64:
+      if (!ComputeFillArray<ArrayDataType::kComplex64>(model, op)) {
+        return ::tensorflow::Status::OK();
       }
       break;
     default:
@@ -114,7 +122,8 @@ bool ResolveConstantFill::Run(Model* model, std::size_t op_index) {
   // Erase the operator
   model->operators.erase(fill_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc
new file mode 100644
index 0000000000000000000000000000000000000000..27836efb0b2ff77d72811205617b721cc7106cf1
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc
@@ -0,0 +1,155 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+// Gathers data from axis 0.
+template <ArrayDataType Type>
+inline void Gather(const Array& input_array, const Array& coords_array,
+                   Array* output_array) {
+  const Shape& input_shape = input_array.shape();
+  const std::vector<DataType<Type>>& input_data =
+      input_array.GetBuffer<Type>().data;
+  const Shape& coords_shape = coords_array.shape();
+  const std::vector<int32>& coords_data =
+      coords_array.GetBuffer<ArrayDataType::kInt32>().data;
+
+  const Shape& output_shape = output_array->shape();
+  std::vector<DataType<Type>>& output_data =
+      output_array->GetMutableBuffer<Type>().data;
+  output_data.resize(RequiredBufferSizeForShape(output_shape));
+
+  CHECK_EQ(coords_shape.dims(0), output_array->shape().dims(0));
+
+  int stride = 1;
+  for (int i = 1; i < input_shape.dimensions_count(); ++i) {
+    stride *= input_shape.dims(i);
+  }
+
+  // Let's make sure we have enough space for all element in the memcpy()
+  // below, which writes 'stride' elements starting at 'i * stride'.
+  CHECK_EQ(stride * coords_shape.dims(0), output_data.size());
+
+  for (int i = 0; i < coords_shape.dims(0); ++i) {
+    DCHECK_GE(coords_data[i], 0);
+    DCHECK_LT(coords_data[i], input_shape.dims(0));
+    DataType<Type>* out = output_data.data() + i * stride;
+    const DataType<Type>* in = input_data.data() + coords_data[i] * stride;
+    memcpy(out, in, sizeof(DataType<Type>) * stride);
+  }
+}
+
+}  // namespace
+
+// Resolves a constant Gather operation.
+// This simply performs the gather and produces the output array with the
+// appropriate values.
+::tensorflow::Status ResolveConstantGather::Run(Model* model,
+                                                std::size_t op_index,
+                                                bool* modified) {
+  *modified = false;
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kGather) {
+    return ::tensorflow::Status::OK();
+  }
+  const auto* op = static_cast<const GatherOperator*>(base_op);
+
+  CHECK_GE(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return ::tensorflow::Status::OK();
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!op->axis) {
+    // Yield until axis has been set by ResolveGatherAttributes.
+    return ::tensorflow::Status::OK();
+  }
+  if (op->axis.value() != 0) {
+    // Only handling axis=0 for now.
+    AddMessageF("%s has axis %d; only axis=0 is supported", LogName(*op),
+                op->axis.value());
+    return ::tensorflow::Status::OK();
+  }
+
+  // We require constant inputs.
+  if (!IsConstantParameterArray(*model, op->inputs[0]) ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return ::tensorflow::Status::OK();
+  }
+  const Array& input_array = model->GetArray(op->inputs[0]);
+  const Array& coords_array = model->GetArray(op->inputs[1]);
+  CHECK(coords_array.data_type == ArrayDataType::kInt32)
+      << "Only int32 indices are supported";
+
+  // Copy min/max info if present. The ranges of the selected values may be
+  // a subset of the original range but we want to ensure the quantization
+  // params stay the same.
+  if (input_array.minmax) {
+    const auto& input_minmax = input_array.GetMinMax();
+    auto& output_minmax = output_array.GetOrCreateMinMax();
+    output_minmax.min = input_minmax.min;
+    output_minmax.max = input_minmax.max;
+  }
+
+  CHECK(!output_array.buffer);
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      Gather<ArrayDataType::kFloat>(input_array, coords_array, &output_array);
+      break;
+    case ArrayDataType::kUint8:
+      Gather<ArrayDataType::kUint8>(input_array, coords_array, &output_array);
+      break;
+    case ArrayDataType::kInt32:
+      Gather<ArrayDataType::kInt32>(input_array, coords_array, &output_array);
+      break;
+    case ArrayDataType::kInt64:
+      Gather<ArrayDataType::kInt64>(input_array, coords_array, &output_array);
+      break;
+    case ArrayDataType::kComplex64:
+      Gather<ArrayDataType::kComplex64>(input_array, coords_array,
+                                        &output_array);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported data type given to Gather op with output \""
+                 << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used after we remove the op.
+  DeleteArrayIfUsedOnce(op->inputs[0], model);
+  DeleteArrayIfUsedOnce(op->inputs[1], model);
+
+  // Erase the operator.
+  model->operators.erase(it);
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc
similarity index 83%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc
index e86616574d5a0f1345cde167d4ce0d41665d5a02..168f79bebdaaaaca4c5a4d3d06fa4d41fcc35a8d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_pack.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_pack.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -49,11 +49,14 @@ void Pack(Model* model, PackOperator const& op) {
 
 }  // namespace
 
-bool ResolveConstantPack::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantPack::Run(Model* model,
+                                              std::size_t op_index,
+                                              bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   const auto* base_op = it->get();
   if (base_op->type != OperatorType::kPack) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto* op = static_cast<const PackOperator*>(base_op);
 
@@ -62,18 +65,18 @@ bool ResolveConstantPack::Run(Model* model, std::size_t op_index) {
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!output_array.has_shape()) {
     // Yield until the output shape has been set by PropagateFixedShapes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   for (const auto& input : op->inputs) {
     if (!IsConstantParameterArray(*model, input)) {
       // Yield if any input is mutable
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
 
@@ -98,6 +101,9 @@ bool ResolveConstantPack::Run(Model* model, std::size_t op_index) {
     case ArrayDataType::kInt64:
       Pack<ArrayDataType::kInt64>(model, *op);
       break;
+    case ArrayDataType::kComplex64:
+      Pack<ArrayDataType::kComplex64>(model, *op);
+      break;
     default:
       LOG(FATAL) << "Unsupported data type given to Pack op with output \""
                  << op->outputs[0] << "\"";
@@ -111,7 +117,8 @@ bool ResolveConstantPack::Run(Model* model, std::size_t op_index) {
 
   // Erase the operator
   model->operators.erase(it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
similarity index 84%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
index 88d06d7dc75005c89a69b881aa0064d1162227d5..a8afbb7de542046a4b61a3ac176280636fe30d7e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
@@ -15,9 +15,9 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 #include "tensorflow/core/lib/random/philox_random.h"
@@ -59,11 +59,14 @@ bool ComputeRandomUniformArray(Model* model, RandomUniformOperator* op) {
   return true;
 }
 
-bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantRandomUniform::Run(Model* model,
+                                                       std::size_t op_index,
+                                                       bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   auto* base_op = it->get();
   if (base_op->type != OperatorType::kRandomUniform) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* op = static_cast<RandomUniformOperator*>(base_op);
 
@@ -73,12 +76,12 @@ bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!output_array.has_shape()) {
     // Yield until the output shape has been set by PropagateFixedShapes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if ((op->seed == 0) && (op->seed2 == 0)) {
@@ -86,13 +89,13 @@ bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
                  << "\" is truly random (using /dev/random system entropy). "
                     "Therefore, cannot resolve as constant. Set \"seed\" or "
                     "\"seed2\" attr non-zero to fix this";
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   switch (output_array.data_type) {
     case ArrayDataType::kFloat:
       if (!ComputeRandomUniformArray<ArrayDataType::kFloat>(model, op)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       break;
     // For future support of double or half.
@@ -110,7 +113,8 @@ bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
   // Erase the operator
   model->operators.erase(it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_range.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_range.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4cb27d97ec1c92ad67efe49ccfd38ab98985a1dc
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_range.cc
@@ -0,0 +1,128 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+template <ArrayDataType A, typename T>
+void FillRangeOutput(const Array& start_array, const Array& limit_array,
+                     const Array& delta_array, Array* output_array) {
+  // Compute buffer contents
+  T start = start_array.GetBuffer<A>().data[0];
+  T limit = limit_array.GetBuffer<A>().data[0];
+  T delta = delta_array.GetBuffer<A>().data[0];
+  auto& buffer = output_array->GetMutableBuffer<A>();
+  buffer.data.clear();
+  int size =
+      (std::is_integral<T>::value
+           ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
+           : std::ceil(std::abs((limit - start) / delta)));
+  for (int i = 0; i < size; ++i) {
+    buffer.data.push_back(start + i * delta);
+  }
+  CHECK_EQ(floor((limit - start) / delta), buffer.data.size());
+  CHECK_EQ(buffer.data.size(), output_array->shape().dims()[0]);
+}
+
+::tensorflow::Status ResolveConstantRange::Run(Model* model,
+                                               std::size_t op_index,
+                                               bool* modified) {
+  *modified = false;
+  const auto it = model->operators.begin() + op_index;
+  auto* base_op = it->get();
+  if (base_op->type != OperatorType::kRange) {
+    return ::tensorflow::Status::OK();
+  }
+  auto* op = static_cast<RangeOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 3);
+  const auto& start_array = model->GetArray(op->inputs[0]);
+  if (!start_array.has_shape()) {
+    // Yield until all input dims have been resolved.
+    return ::tensorflow::Status::OK();
+  }
+  const auto& limit_array = model->GetArray(op->inputs[1]);
+  if (!limit_array.has_shape()) {
+    // Yield until all input dims have been resolved.
+    return ::tensorflow::Status::OK();
+  }
+  const auto& delta_array = model->GetArray(op->inputs[2]);
+  if (!delta_array.has_shape()) {
+    // Yield until all input dims have been resolved.
+    return ::tensorflow::Status::OK();
+  }
+
+  for (const auto& input : op->inputs) {
+    if (!IsConstantParameterArray(*model, input)) {
+      // yield if any input is mutable
+      return ::tensorflow::Status::OK();
+    }
+  }
+
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return ::tensorflow::Status::OK();
+  }
+
+  CHECK_EQ(RequiredBufferSizeForShape(start_array.shape()), 1)
+      << "Range op inputs must be scalar.";
+  CHECK_EQ(RequiredBufferSizeForShape(limit_array.shape()), 1)
+      << "Range op inputs must be scalar.";
+  CHECK_EQ(RequiredBufferSizeForShape(delta_array.shape()), 1)
+      << "Range op inputs must be scalar.";
+
+  CHECK(start_array.data_type == ArrayDataType::kInt32 ||
+        start_array.data_type == ArrayDataType::kFloat)
+      << "Range op inputs must be int32 or float.";
+  CHECK(limit_array.data_type == start_array.data_type)
+      << "Range op inputs type must be equal.";
+  CHECK(delta_array.data_type == start_array.data_type)
+      << "Range op inputs type must be equal.";
+
+  if (start_array.data_type == ArrayDataType::kInt32) {
+    FillRangeOutput<ArrayDataType::kInt32, int32_t>(start_array, limit_array,
+                                                    delta_array, &output_array);
+  } else {
+    FillRangeOutput<ArrayDataType::kFloat, float>(start_array, limit_array,
+                                                  delta_array, &output_array);
+  }
+
+  // Delete the input array if no longer used
+  if (IsDiscardableArray(*model, op->inputs[0]) &&
+      CountOpsWithInput(*model, op->inputs[0]) == 1) {
+    model->EraseArray(op->inputs[0]);
+  }
+  if (IsDiscardableArray(*model, op->inputs[1]) &&
+      CountOpsWithInput(*model, op->inputs[1]) == 1) {
+    model->EraseArray(op->inputs[1]);
+  }
+  if (IsDiscardableArray(*model, op->inputs[2]) &&
+      CountOpsWithInput(*model, op->inputs[2]) == 1) {
+    model->EraseArray(op->inputs[2]);
+  }
+
+  // Delete the operator
+  model->operators.erase(it);
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_reshape.cc
similarity index 82%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_reshape.cc
index a6f665b5f00ecc7b39821fa8e0b6170c176e8cf6..9e21fa564e89d49f3932a97247288dff6f47cfc3 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_reshape.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_reshape.cc
@@ -14,19 +14,22 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
 // Resolves a constant reshape operation by copying the buffer.
-bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantReshape::Run(Model* model,
+                                                 std::size_t op_index,
+                                                 bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   const auto* base_op = it->get();
   if (base_op->type != OperatorType::kReshape) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto* op = static_cast<const TensorFlowReshapeOperator*>(base_op);
 
@@ -36,17 +39,17 @@ bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
   // We require constant inputs.
   if (!IsConstantParameterArray(*model, op->inputs[0]) ||
       !IsConstantParameterArray(*model, op->inputs[1])) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (!output_array.has_shape()) {
     // Yield until the output shape has been set by PropagateFixedShapes.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const Array& input_array = model->GetArray(op->inputs[0]);
@@ -54,7 +57,7 @@ bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
     AddMessageF("Constant reshape is non-trivial (%s -> %s)",
                 ShapeToString(input_array.shape()),
                 ShapeToString(output_array.shape()));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK(!output_array.buffer);
@@ -92,10 +95,13 @@ bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
     case ArrayDataType::kString:
       CopyArrayBuffer<ArrayDataType::kString>(input_array, &output_array);
       break;
+    case ArrayDataType::kComplex64:
+      CopyArrayBuffer<ArrayDataType::kComplex64>(input_array, &output_array);
+      break;
     default:
       LOG(FATAL) << "Unsupported data type: "
                  << ArrayDataTypeName(input_array.data_type);
-      return false;
+      return ::tensorflow::Status::OK();
   }
 
   AddMessageF("Resolving constant reshape of %s", LogName(*op));
@@ -112,7 +118,8 @@ bool ResolveConstantReshape::Run(Model* model, std::size_t op_index) {
 
   // Erase the operator.
   model->operators.erase(it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_select.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_select.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82b2f4ab8782d6fbbbc3574f9c753bd10de9ddbb
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_select.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// Resolves a constant Select operation.
+//
+// This implementation is looking strictly for all-or-nothing on the select
+// condition. It's possible to enhance this by looking per-element and possibly
+// producing a Mul op.
+::tensorflow::Status ResolveConstantSelect::Run(Model* model,
+                                                std::size_t op_index,
+                                                bool* modified) {
+  *modified = false;
+  auto it = model->operators.begin() + op_index;
+  const auto* base_op = it->get();
+  if (base_op->type != OperatorType::kSelect) {
+    return ::tensorflow::Status::OK();
+  }
+  const auto* op = static_cast<const SelectOperator*>(base_op);
+
+  CHECK_GE(op->inputs.size(), 3);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes.
+    return ::tensorflow::Status::OK();
+  }
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes.
+    return ::tensorflow::Status::OK();
+  }
+
+  // We require the cond input to be constant.
+  if (!IsConstantParameterArray(*model, op->inputs[0])) {
+    return ::tensorflow::Status::OK();
+  }
+  const Array& cond_array = model->GetArray(op->inputs[0]);
+  CHECK(cond_array.data_type == ArrayDataType::kBool)
+      << "Only bool conditions are supported";
+  const auto& cond_data = cond_array.GetBuffer<ArrayDataType::kBool>().data;
+  if (cond_data.empty()) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Check if the condition is the same for all elements.
+  bool cond_value = cond_data[0];
+  for (size_t i = 1; i < cond_data.size(); ++i) {
+    if (cond_data[i] != cond_value) {
+      AddMessageF(
+          "Cannot resolve %s as constant; cond_array has differing "
+          "per-element values",
+          LogName(*op));
+      return ::tensorflow::Status::OK();
+    }
+  }
+
+  // Pass-through the selected input.
+  *modified =
+      RemoveTrivialPassthroughOp(this, model, op_index, cond_value ? 1 : 2);
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
similarity index 78%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
index 8a0e3e8995839a737b5671701a97b514b0fc7bf1..00ab85882796b850cf0ccf8442095143c35ea59d 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_shape_or_rank.cc
@@ -12,36 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ResolveConstantShapeOrRank::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantShapeOrRank::Run(Model* model,
+                                                     std::size_t op_index,
+                                                     bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   const auto* op = it->get();
   if (!(op->type == OperatorType::kShape || op->type == OperatorType::kRank)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been resolved
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until the input array's shape has been resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!output_array.has_shape()) {
     // Yield until the output shape has been resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Compute the output
@@ -65,7 +68,8 @@ bool ResolveConstantShapeOrRank::Run(Model* model, std::size_t op_index) {
   }
 
   model->operators.erase(it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc
similarity index 82%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc
index b35c3e19c43b1c62e6bdbfe379631480e1d41703..503807f2318c74d6396652e675eff89efb29e04e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_slice.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_slice.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -86,11 +86,14 @@ bool Slice(SliceOperator const& op, Array const& input_array,
 
 }  // namespace
 
-bool ResolveConstantSlice::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantSlice::Run(Model* model,
+                                               std::size_t op_index,
+                                               bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   const auto* base_op = it->get();
   if (base_op->type != OperatorType::kSlice) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const SliceOperator* op = static_cast<const SliceOperator*>(base_op);
@@ -99,49 +102,54 @@ bool ResolveConstantSlice::Run(Model* model, std::size_t op_index) {
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!output_array.has_shape()) {
     // Yield until the output shape has been set by PropagateFixedShapes.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (op->begin.empty() || op->size.empty()) {
     // Attributes have not resolved yet.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until the value shape has been resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (!IsConstantParameterArray(*model, op->inputs[0])) {
     // Yield until the value is constant.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK(!output_array.buffer);
   switch (output_array.data_type) {
     case ArrayDataType::kFloat:
       if (!Slice<ArrayDataType::kFloat>(*op, input_array, &output_array)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       break;
     case ArrayDataType::kUint8:
       if (!Slice<ArrayDataType::kUint8>(*op, input_array, &output_array)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       break;
     case ArrayDataType::kInt32:
       if (!Slice<ArrayDataType::kInt32>(*op, input_array, &output_array)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       break;
     case ArrayDataType::kInt64:
       if (!Slice<ArrayDataType::kInt64>(*op, input_array, &output_array)) {
-        return false;
+        return ::tensorflow::Status::OK();
+      }
+      break;
+    case ArrayDataType::kComplex64:
+      if (!Slice<ArrayDataType::kComplex64>(*op, input_array, &output_array)) {
+        return ::tensorflow::Status::OK();
       }
       break;
     default:
@@ -159,7 +167,8 @@ bool ResolveConstantSlice::Run(Model* model, std::size_t op_index) {
   // Erase the operator
   model->operators.erase(it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
similarity index 86%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index 8853ed87e61eaf66d323dc38630816c3a98abaf7..0c9effee1fd364fa83f61339251e48070f503d1e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
-#include "tensorflow/contrib/lite/kernels/internal/strided_slice_logic.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -103,11 +103,14 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
 
 }  // anonymous namespace
 
-bool ResolveConstantStridedSlice::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantStridedSlice::Run(Model* model,
+                                                      std::size_t op_index,
+                                                      bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   const auto* base_op = it->get();
   if (base_op->type != OperatorType::kStridedSlice) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const StridedSliceOperator* op =
@@ -117,28 +120,28 @@ bool ResolveConstantStridedSlice::Run(Model* model, std::size_t op_index) {
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (!output_array.has_shape()) {
     // Yield until the output shape has been set by PropagateFixedShapes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (op->start_indices.empty() || op->stop_indices.empty() ||
       op->strides.empty()) {
     // Attributes have not resolved yet.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // Yield until the value shape has been resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (!IsConstantParameterArray(*model, op->inputs[0])) {
     // Yield until the value is constant.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK(!output_array.buffer);
@@ -155,6 +158,9 @@ bool ResolveConstantStridedSlice::Run(Model* model, std::size_t op_index) {
     case ArrayDataType::kInt64:
       StridedSlice<ArrayDataType::kInt64>(*op, input_array, &output_array);
       break;
+    case ArrayDataType::kComplex64:
+      StridedSlice<ArrayDataType::kComplex64>(*op, input_array, &output_array);
+      break;
     default:
       LOG(FATAL)
           << "Unsupported data type input to StridedSlice op with output \""
@@ -164,7 +170,8 @@ bool ResolveConstantStridedSlice::Run(Model* model, std::size_t op_index) {
 
   DeleteOpAndArraysIfUnused(model, it->get());
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_tile.cc
similarity index 88%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_tile.cc
index 5cfa1a5582d2b7cd346764bd68f78720c8cca7e3..75631304968e2115fe5a21870819f617e268bb32 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tile.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_tile.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -97,11 +97,14 @@ inline void Tile(const Array& input_array, const Array& multiples_array,
 }  // namespace
 
 // Resolves a constant Tile operation.
-bool ResolveConstantTile::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantTile::Run(Model* model,
+                                              std::size_t op_index,
+                                              bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   const auto* base_op = it->get();
   if (base_op->type != OperatorType::kTile) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto* op = static_cast<const TensorFlowTileOperator*>(base_op);
 
@@ -110,17 +113,17 @@ bool ResolveConstantTile::Run(Model* model, std::size_t op_index) {
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (!output_array.has_shape()) {
     // Yield until the output shape has been set by PropagateFixedShapes.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // We require constant inputs.
   if (!IsConstantParameterArray(*model, op->inputs[0]) ||
       !IsConstantParameterArray(*model, op->inputs[1])) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const Array& input_array = model->GetArray(op->inputs[0]);
   const Array& multiples_array = model->GetArray(op->inputs[1]);
@@ -147,6 +150,10 @@ bool ResolveConstantTile::Run(Model* model, std::size_t op_index) {
     case ArrayDataType::kInt64:
       Tile<ArrayDataType::kInt64>(input_array, multiples_array, &output_array);
       break;
+    case ArrayDataType::kComplex64:
+      Tile<ArrayDataType::kComplex64>(input_array, multiples_array,
+                                      &output_array);
+      break;
     default:
       LOG(FATAL) << "Unsupported data type given to Tile op with output \""
                  << op->outputs[0] << "\"";
@@ -159,7 +166,8 @@ bool ResolveConstantTile::Run(Model* model, std::size_t op_index) {
 
   // Erase the operator.
   model->operators.erase(it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc
similarity index 88%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc
index fe15dfa06f4e4a9407121d6fcc63ac9587fa07cb..9514848682f54d73c4fd55f65a7b505f7ec1413a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_transpose.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_transpose.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -101,11 +101,14 @@ void Transpose(Model* model, const Array& input_array,
 
 }  // namespace
 
-bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveConstantTranspose::Run(Model* model,
+                                                   std::size_t op_index,
+                                                   bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   const auto* base_op = it->get();
   if (base_op->type != OperatorType::kTranspose) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto* op = static_cast<const TransposeOperator*>(base_op);
 
@@ -114,17 +117,17 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (!output_array.has_shape()) {
     // Yield until the output shape has been set by PropagateFixedShapes.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // We require constant inputs.
   if (!IsConstantParameterArray(*model, op->inputs[0]) ||
       !IsConstantParameterArray(*model, op->inputs[1])) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const Array& input_array = model->GetArray(op->inputs[0]);
 
@@ -132,7 +135,7 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
 
   if (op->perm.empty()) {
     // Yield until perm has been populated by ResolveTransposeAttributes.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // We currently only support 1-4 dimensions.
@@ -156,6 +159,10 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
       Transpose<ArrayDataType::kInt64>(model, input_array, op->perm,
                                        &output_array);
       break;
+    case ArrayDataType::kComplex64:
+      Transpose<ArrayDataType::kComplex64>(model, input_array, op->perm,
+                                           &output_array);
+      break;
     default:
       LOG(FATAL) << "Unsupported data type given to Transpose op with output \""
                  << op->outputs[0] << "\"";
@@ -174,7 +181,8 @@ bool ResolveConstantTranspose::Run(Model* model, std::size_t op_index) {
 
   // Erase the operator.
   model->operators.erase(it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc
new file mode 100644
index 0000000000000000000000000000000000000000..43070b063c4a426907e80f444e00da44417c0e18
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -0,0 +1,361 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+namespace {
+
+// Using the function reducer, reduce input along all axes in axes.
+// Put the reduced data in output, which should aleady be appropriately sized.
+// check_output_shape is set to what this code computes the final shape
+// to be, so it can be cross checked with the shape computation logic.
+void ReduceGeneric(bool keep_dims, const std::vector<int>& axes,
+                   const Shape& input_shape, const std::vector<float>& input,
+                   Shape* check_output_shape, std::vector<float>* output,
+                   const std::function<float(float, float)>& reducer) {
+  if (!IsNonEmpty(input_shape)) {
+    // Zero-dimensions will break the NextIndices() logic, so just early out if
+    // we have an empty shape.
+    return;
+  }
+
+  // Set up output_shape to be the same length as input_shape, with
+  // appropriate dimensions squashed to 1.  If keep_dims is false, we'll strip
+  // out the one dimensions at the end, but it's convenient to leave them for
+  // now.  We recompute the shape because we need the output shape to have
+  // 1-dims in all the squashed dimensions; the shape from shape computation may
+  // remove those squashed dimensions, depending on the options used.
+  Shape output_shape = input_shape;
+
+  // Reduction mask will be elementwise multiplied against the input
+  // indices to figure out the output index for the element.
+  std::vector<int> reduction_mask(input_shape.dimensions_count(), 1);
+  for (int axis : axes) {
+    CHECK_GE(axis, 0);
+    CHECK_LT(axis, input_shape.dimensions_count());
+    reduction_mask[axis] = 0;
+    output_shape.mutable_dims()->at(axis) = 1;
+  }
+
+  std::vector<int> output_indices(input_shape.dimensions_count());
+  for (int input_offset = 0; input_offset < input.size(); ++input_offset) {
+    std::vector<int> input_indices = ReverseOffset(input_shape, input_offset);
+    // Calculate the output location by squashing input indices to 0
+    // in reduced axes.
+    for (int i = 0; i < input_shape.dimensions_count(); ++i) {
+      output_indices[i] = input_indices[i] * reduction_mask[i];
+    }
+    int output_offset = Offset(output_shape, output_indices);
+    if (input_indices == output_indices) {
+      // Base element for the reduced axes
+      output->at(output_offset) = input.at(input_offset);
+    } else {
+      // Reduce with existing element.
+      output->at(output_offset) =
+          reducer(output->at(output_offset), input.at(input_offset));
+    }
+  }
+
+  if (!keep_dims) {
+    // Strip out the dims from output_shape.
+    std::vector<int> new_dims;
+    for (int i = 0; i < output_shape.dimensions_count(); ++i) {
+      if (reduction_mask[i]) {
+        new_dims.push_back(output_shape.dims(i));
+      }
+    }
+    output_shape.mutable_dims()->swap(new_dims);
+  }
+  *check_output_shape = output_shape;
+}
+
+}  // namespace
+
+bool CopyMinMaxFromFirstInput(const Operator& op, Model* model) {
+  auto& output_array = model->GetArray(op.outputs[0]);
+  if (output_array.minmax) {
+    return false;
+  }
+  const auto& input_array = model->GetArray(op.inputs[0]);
+  if (!input_array.minmax) {
+    return false;
+  }
+  const auto& input_minmax = input_array.GetMinMax();
+  CHECK(!output_array.minmax);
+  auto& output_minmax = output_array.GetOrCreateMinMax();
+  output_minmax.min = input_minmax.min;
+  output_minmax.max = input_minmax.max;
+  return true;
+}
+
+::tensorflow::Status ResolveConstantUnaryOperator::Run(Model* model,
+                                                       std::size_t op_index,
+                                                       bool* modified) {
+  *modified = false;
+  const auto unary_it = model->operators.begin() + op_index;
+  const auto* unary_op = unary_it->get();
+  // Test for unary ops of types that we know how to resolve.
+  switch (unary_op->type) {
+    case OperatorType::kCast:
+    case OperatorType::kExp:
+    case OperatorType::kLog:
+    case OperatorType::kNeg:
+    case OperatorType::kRsqrt:
+    case OperatorType::kSqrt:
+    case OperatorType::kSquare:
+    case OperatorType::kSum:
+    case OperatorType::kReduceMin:  //  Reduction Min
+    case OperatorType::kReduceMax:  //  Reduction Max
+    case OperatorType::kReshape:
+    case OperatorType::kRelu6:
+    case OperatorType::kRelu1:
+    case OperatorType::kRelu:
+      break;
+    default:
+      return ::tensorflow::Status::OK();
+  }
+
+  // Check if the input is a constant parameter.
+  if (!IsConstantParameterArray(*model, unary_op->inputs[0])) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // if the unary op involves a tensor required by a rnn state, ignore it
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    if (unary_op->inputs[0] == rnn_state.back_edge_source_array()) {
+      return ::tensorflow::Status::OK();
+    }
+    if (unary_op->inputs[0] == rnn_state.state_array()) {
+      return ::tensorflow::Status::OK();
+    }
+  }
+
+  auto& output_array = model->GetArray(unary_op->outputs[0]);
+  if (!output_array.has_shape()) {
+    // Yield until the output array dims have been resolved.
+    return ::tensorflow::Status::OK();
+  }
+
+  // At the moment we don't want to care about fused activation functions.
+  // The idea is that we should do the present constants-propagation before
+  // activation functions get fused.
+  if (unary_op->fused_activation_function !=
+      FusedActivationFunctionType::kNone) {
+    AddMessageF(
+        "Not resolving constant %s "
+        " because it has a fused activation function",
+        LogName(*unary_op));
+    return ::tensorflow::Status::OK();
+  }
+
+  // The min-max is only copied for ops that copy data without arithmetic.
+  // In future trivial transpose, etc, can be handled here.
+  if (unary_op->type == OperatorType::kReshape) {
+    CopyMinMaxFromFirstInput(*unary_op, model);
+  }
+
+  const auto& input_array = model->GetArray(unary_op->inputs[0]);
+  // We have already tested above for existence of buffers (synonymous to being
+  // a constant param).
+  CHECK(input_array.buffer);
+  std::vector<DataType<ArrayDataType::kFloat>> const* input_float_data;
+  if (unary_op->type == OperatorType::kCast) {
+    CastOperator const* cast_op = static_cast<CastOperator const*>(unary_op);
+    if (cast_op->dst_data_type != ArrayDataType::kFloat) {
+      AddMessageF(
+          "Not resolving constant %s because we currently only support casting "
+          "to float",
+          LogName(*unary_op));
+      return ::tensorflow::Status::OK();
+    }
+    if (cast_op->src_data_type != input_array.buffer->type) {
+      AddMessageF(
+          "Not resolving constant %s because cast op source type does not "
+          "match input type",
+          LogName(*unary_op));
+    }
+  } else {
+    if (input_array.buffer->type != ArrayDataType::kFloat) {
+      return ::tensorflow::Status::OK();
+    }
+    input_float_data = &(input_array.GetBuffer<ArrayDataType::kFloat>().data);
+  }
+
+  // Create a float buffer on the output array, which are always constant.
+  const Shape& output_shape = output_array.shape();
+  const int output_dims_count = output_shape.dimensions_count();
+  const int output_buffer_size = RequiredBufferSizeForShape(output_shape);
+  auto& output_float_data =
+      output_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+  output_float_data.resize(output_buffer_size);
+
+  const Shape& input_shape = input_array.shape();
+  const int input_buffer_size = RequiredBufferSizeForShape(input_shape);
+  if (unary_op->type == OperatorType::kCast) {
+    for (int i = 0; i < output_buffer_size; i++) {
+      float outval = 0.0f;
+      if (input_array.buffer->type == ArrayDataType::kFloat) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kFloat>().data[i]);
+      } else if (input_array.buffer->type == ArrayDataType::kUint8) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kUint8>().data[i]);
+      } else if (input_array.buffer->type == ArrayDataType::kInt32) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kInt32>().data[i]);
+      } else if (input_array.buffer->type == ArrayDataType::kInt64) {
+        outval = static_cast<float>(
+            input_array.GetBuffer<ArrayDataType::kInt64>().data[i]);
+      } else {
+        LOG(FATAL) << "Unsupported cast op input type";
+      }
+      output_float_data[i] = outval;
+    }
+  } else if (unary_op->type == OperatorType::kReshape) {
+    CHECK(input_buffer_size == output_buffer_size);
+    output_float_data = *input_float_data;
+  } else if (unary_op->type == OperatorType::kSum) {
+    CHECK_EQ(unary_op->inputs.size(), 2) << "Sum needs 2 inputs";
+    if (!IsConstantParameterArray(*model, unary_op->inputs[1])) {
+      AddMessageF("Axis input is non-constant");
+      return ::tensorflow::Status::OK();
+    }
+    auto& axis_array = model->GetArray(unary_op->inputs[1]);
+    CHECK(axis_array.data_type == ArrayDataType::kInt32);
+
+    // We only support keep_dims=true; shape prop will need to change otherwise.
+    auto sum_op = static_cast<const TensorFlowSumOperator*>(unary_op);
+    Shape check_output_shape;
+
+    ReduceGeneric(
+        sum_op->keep_dims, axis_array.GetBuffer<ArrayDataType::kInt32>().data,
+        input_shape, *input_float_data, &check_output_shape, &output_float_data,
+        [](float existing, float current) -> float {
+          return existing + current;
+        });
+    CHECK(check_output_shape == output_shape)
+        << "Shape propagation output shape doesn't match output shape from op";
+  } else if (unary_op->type == OperatorType::kReduceMin) {
+    // At the moment only full reduction across all dimensions is supported.
+    // TODO(starka): Output should not be padded.
+    for (int i = 0; i < output_dims_count; i++) {
+      CHECK_EQ(output_shape.dims(i), 1);
+    }
+    float min = (*input_float_data)[0];
+    for (int i = 0; i < input_buffer_size; i++) {
+      min = std::min(min, (*input_float_data)[i]);
+    }
+    output_float_data[0] = min;
+  } else if (unary_op->type == OperatorType::kReduceMax) {
+    // At the moment only full reduction across all dimensions is supported.
+    // TODO(starka): Output should not be padded.
+    for (int i = 0; i < output_dims_count; i++) {
+      CHECK_EQ(output_shape.dims(i), 1);
+    }
+    float max = (*input_float_data)[0];
+    for (int i = 0; i < input_buffer_size; i++) {
+      max = std::max(max, (*input_float_data)[i]);
+    }
+    output_float_data[0] = max;
+  } else if (unary_op->type == OperatorType::kExp ||
+             unary_op->type == OperatorType::kNeg ||
+             unary_op->type == OperatorType::kLog ||
+             unary_op->type == OperatorType::kRsqrt ||
+             unary_op->type == OperatorType::kSqrt ||
+             unary_op->type == OperatorType::kSquare) {
+    // Element-wise ops. Should have perfectly matching sizes here.
+    for (int i = 0; i < output_dims_count; i++) {
+      CHECK_EQ(output_shape.dims(i), input_shape.dims(i));
+    }
+
+    for (int i = 0; i < output_buffer_size; i++) {
+      const float val = (*input_float_data)[i];
+      float outval = 0.f;
+      if (unary_op->type == OperatorType::kExp) {
+        outval = std::exp(val);
+      } else if (unary_op->type == OperatorType::kNeg) {
+        outval = -val;
+      } else if (unary_op->type == OperatorType::kLog) {
+        outval = std::log(val);
+      } else if (unary_op->type == OperatorType::kRsqrt) {
+        outval = 1.0f / std::sqrt(val);
+      } else if (unary_op->type == OperatorType::kSqrt) {
+        outval = std::sqrt(val);
+      } else if (unary_op->type == OperatorType::kSquare) {
+        outval = val * val;
+      } else {
+        LOG(FATAL) << "should not get here.";
+      }
+      output_float_data[i] = outval;
+    }
+  } else if (unary_op->type == OperatorType::kRelu6 ||
+             unary_op->type == OperatorType::kRelu1 ||
+             unary_op->type == OperatorType::kRelu) {
+    for (size_t i = 0; i < output_buffer_size; ++i) {
+      const float value = (*input_float_data)[i];
+      float new_value = 0.0f;
+      switch (unary_op->type) {
+        case OperatorType::kRelu: {
+          static constexpr float kLower = 0;
+          new_value = value < kLower ? kLower : value;
+          break;
+        }
+        case OperatorType::kRelu1: {
+          static constexpr float kUpper = 1;
+          static constexpr float kLower = -1;
+          new_value = value > kUpper ? kUpper : value < kLower ? kLower : value;
+          break;
+        }
+        case OperatorType::kRelu6: {
+          static constexpr float kUpper = 6;
+          static constexpr float kLower = 0;
+          new_value = value > kUpper ? kUpper : value < kLower ? kLower : value;
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported activation function "
+                     << LogName(*unary_op);
+          return ::tensorflow::Status::OK();
+      }
+      output_float_data[i] = new_value;
+    }
+  } else {
+    LOG(FATAL) << "should not get here.";
+  }
+  for (const auto& input : unary_op->inputs) {
+    if (CountOpsWithInput(*model, input) == 1) {
+      model->EraseArray(input);
+    }
+  }
+  AddMessageF("Resolved constant %s to the equivalent constant array",
+              LogName(*unary_op));
+  model->operators.erase(unary_it);
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc b/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
similarity index 82%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
index 0dda1fd0b35fb0cdc3c605360df5126c52c05403..c0becaf7d39cdbc01217bbb9b5a6b50017cc2eaa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
@@ -18,24 +18,27 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ResolveFakeQuantArgsFromVars::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveFakeQuantArgsFromVars::Run(Model* model,
+                                                       std::size_t op_index,
+                                                       bool* modified) {
+  *modified = false;
   const auto fakequant_it = model->operators.begin() + op_index;
   auto* fakequant_base_op = fakequant_it->get();
   if (fakequant_base_op->type != OperatorType::kFakeQuant) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* fakequant_op = static_cast<FakeQuantOperator*>(fakequant_base_op);
 
   if (fakequant_op->minmax) {
     // Already resolved.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(fakequant_op->inputs.size(), 3);
@@ -43,7 +46,7 @@ bool ResolveFakeQuantArgsFromVars::Run(Model* model, std::size_t op_index) {
   // resolved to constant arrays.
   for (int i = 1; i <= 2; i++) {
     if (!IsConstantParameterArray(*model, fakequant_op->inputs[i])) {
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
 
@@ -74,7 +77,8 @@ bool ResolveFakeQuantArgsFromVars::Run(Model* model, std::size_t op_index) {
     DeleteArrayIfUsedOnce(fakequant_op->inputs[i], model);
   }
   fakequant_op->inputs.resize(1);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_gather_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_gather_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffad0d0d3151289e2ca2b5bffaacbe87cf089139
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_gather_attributes.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status ResolveGatherAttributes::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
+  auto* gather_op = model->operators[op_index].get();
+  if (gather_op->type != OperatorType::kGather)
+    return ::tensorflow::Status::OK();
+  auto* op = static_cast<GatherOperator*>(gather_op);
+
+  if (op->axis) {
+    // Attributes already resolved
+    return ::tensorflow::Status::OK();
+  }
+  if (op->inputs.size() != 3) return ::tensorflow::Status::OK();
+  if (!IsConstantParameterArray(*model, op->inputs[2]))
+    return ::tensorflow::Status::OK();
+
+  const auto& indices_array = model->GetArray(op->inputs[2]);
+  if (!indices_array.has_shape()) return ::tensorflow::Status::OK();
+  const auto& axis_data = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
+  CHECK_EQ(axis_data.size(), 1)
+      << "Multidimensional gather not supported on " << LogName(*op);
+  op->axis = {axis_data[0]};
+
+  // Drop the axis array as we no longer need it.
+  DeleteArrayIfUsedOnce(op->inputs[2], model);
+  op->inputs.resize(2);
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc b/tensorflow/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
similarity index 78%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
index b2b2ea151bc936d8463d4e4598de5f0d1342edfe..51c724dd1ab058e08a3a29a5408e5d584831a3d9 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_multiply_by_zero.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
@@ -29,7 +29,7 @@ namespace {
 template <typename T>
 bool AreAllBufferElementsZero(const std::vector<T>& buffer_data) {
   for (auto x : buffer_data) {
-    if (x != 0) {
+    if (x != T()) {
       return false;
     }
   }
@@ -42,7 +42,7 @@ void FillArrayWithZeros(Array* array) {
   std::vector<DataType<Type>>& data = array->GetMutableBuffer<Type>().data;
   data.resize(RequiredBufferSizeForShape(array->shape()));
   for (size_t i = 0; i < data.size(); i++) {
-    data[i] = 0;
+    data[i] = DataType<Type>();
   }
 }
 
@@ -51,27 +51,30 @@ void FillArrayWithZeros(Array* array) {
 // Removes a multiplication by array of constant zeros by making the output
 // array an array of constant zeros and removing the input arrays if they are no
 // longer needed.
-bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveMultiplyByZero::Run(Model* model,
+                                                std::size_t op_index,
+                                                bool* modified) {
+  *modified = false;
   const auto mul_it = model->operators.begin() + op_index;
   auto* mul_op = mul_it->get();
   if (mul_op->type != OperatorType::kMul) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto& output_array_name = mul_op->outputs[0];
   auto& output_array = model->GetArray(output_array_name);
 
   if (!IsDiscardableArray(*model, output_array_name)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   if (output_array.data_type == ArrayDataType::kNone) {
     // Yield until the output type has been set by PropagateArrayDataTypes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Yield if the output shape is not known yet.
   if (!output_array.has_shape()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // This transformation only handles the case where one operand is all 0's and
@@ -83,12 +86,12 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
   };
   if (!is_input_constant[0] && !is_input_constant[1]) {
     // Neither input is constant, so nothing we can resolve here.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   if (is_input_constant[0] && is_input_constant[1]) {
     // Both inputs are constants. That's a job for constants propagation, not
     // for us to handle here.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const int index_of_constant_input = is_input_constant[0] ? 0 : 1;
   const int index_of_variable_input = is_input_constant[0] ? 1 : 0;
@@ -105,7 +108,7 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
           constant_input_array.GetBuffer<ArrayDataType::kFloat>().data;
       if (!AreAllBufferElementsZero<DataType<ArrayDataType::kFloat>>(
               constant_input_data)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       FillArrayWithZeros<ArrayDataType::kFloat>(&output_array);
     } break;
@@ -114,7 +117,7 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
           constant_input_array.GetBuffer<ArrayDataType::kUint8>().data;
       if (!AreAllBufferElementsZero<DataType<ArrayDataType::kUint8>>(
               constant_input_data)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       FillArrayWithZeros<ArrayDataType::kUint8>(&output_array);
     } break;
@@ -123,7 +126,7 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
           constant_input_array.GetBuffer<ArrayDataType::kInt32>().data;
       if (!AreAllBufferElementsZero<DataType<ArrayDataType::kInt32>>(
               constant_input_data)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       FillArrayWithZeros<ArrayDataType::kInt32>(&output_array);
     } break;
@@ -132,14 +135,23 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
           constant_input_array.GetBuffer<ArrayDataType::kInt64>().data;
       if (!AreAllBufferElementsZero<DataType<ArrayDataType::kInt64>>(
               constant_input_data)) {
-        return false;
+        return ::tensorflow::Status::OK();
       }
       FillArrayWithZeros<ArrayDataType::kInt64>(&output_array);
     } break;
+    case ArrayDataType::kComplex64: {
+      const auto& constant_input_data =
+          constant_input_array.GetBuffer<ArrayDataType::kComplex64>().data;
+      if (!AreAllBufferElementsZero<DataType<ArrayDataType::kComplex64>>(
+              constant_input_data)) {
+        return ::tensorflow::Status::OK();
+      }
+      FillArrayWithZeros<ArrayDataType::kComplex64>(&output_array);
+    } break;
     default:
       AddMessageF(
           "Cannot resolve multiply by 0 because of unsupported data type\n");
-      return false;
+      return ::tensorflow::Status::OK();
   }
 
   // Erase input arrays to the multiply if no longer used
@@ -149,7 +161,8 @@ bool ResolveMultiplyByZero::Run(Model* model, std::size_t op_index) {
   // Erase the multiply operator.
   model->operators.erase(mul_it);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_pad_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_pad_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..25b823f8483935d274ab9968f91dfc444afa3c75
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_pad_attributes.cc
@@ -0,0 +1,60 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status ResolvePadAttributes::Run(Model* model,
+                                               std::size_t op_index,
+                                               bool* modified) {
+  *modified = false;
+  const auto pad_it = model->operators.begin() + op_index;
+  auto* pad_op = pad_it->get();
+  if (pad_op->type != OperatorType::kPad) return ::tensorflow::Status::OK();
+
+  auto* op = static_cast<PadOperator*>(pad_op);
+  if (!op->left_padding.empty()) return ::tensorflow::Status::OK();
+
+  CHECK_EQ(op->inputs.size(), 2);
+  if (!IsConstantParameterArray(*model, op->inputs[1]))
+    return ::tensorflow::Status::OK();
+
+  const auto& array = model->GetArray(op->inputs[1]);
+  if (!array.has_shape()) return ::tensorflow::Status::OK();
+
+  const std::vector<int>& dims = array.shape().dims();
+  CHECK_EQ(dims.size(), 2);
+
+  std::vector<int> buffer = array.GetBuffer<ArrayDataType::kInt32>().data;
+
+  for (int i = 0; i < dims[0]; ++i) {
+    op->left_padding.push_back(buffer[i * 2]);
+    op->right_padding.push_back(buffer[i * 2 + 1]);
+  }
+
+  // TODO(dkalenichenko): Delete the extra input?
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_padv2_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_padv2_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bcc9f5363ac080b3d14f161a1db476e2105d6b93
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_padv2_attributes.cc
@@ -0,0 +1,60 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status ResolvePadV2Attributes::Run(Model* model,
+                                                 std::size_t op_index,
+                                                 bool* modified) {
+  *modified = false;
+  const auto pad_it = model->operators.begin() + op_index;
+  auto* pad_op = pad_it->get();
+  if (pad_op->type != OperatorType::kPadV2) return ::tensorflow::Status::OK();
+
+  auto* op = static_cast<PadV2Operator*>(pad_op);
+  if (!op->left_padding.empty()) return ::tensorflow::Status::OK();
+
+  CHECK_EQ(op->inputs.size(), 3);
+  if (!IsConstantParameterArray(*model, op->inputs[1]))
+    return ::tensorflow::Status::OK();
+
+  const auto& array = model->GetArray(op->inputs[1]);
+  if (!array.has_shape()) return ::tensorflow::Status::OK();
+
+  const std::vector<int>& dims = array.shape().dims();
+  CHECK_EQ(dims.size(), 2);
+
+  std::vector<int> buffer = array.GetBuffer<ArrayDataType::kInt32>().data;
+
+  for (int i = 0; i < dims[0]; ++i) {
+    op->left_padding.push_back(buffer[i * 2]);
+    op->right_padding.push_back(buffer[i * 2 + 1]);
+  }
+
+  // TODO(dkalenichenko): Delete the extra input?
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ceba45e93fee10c820f2b0ba01a5948be0787b6
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+template <typename T>
+bool ResolveAttributes(Model* model, T* op) {
+  if (!op->axis.empty()) {
+    // Attributes already resolved
+    return false;
+  }
+  if (op->inputs.size() != 2) return false;
+  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
+
+  const Array& indices_array = model->GetArray(op->inputs[1]);
+  if (!indices_array.has_shape()) return false;
+
+  // It is ok for indices_array to have a shape for an empty tensor. In that
+  // case, we don't bother setting 'axis'.
+  if (indices_array.buffer->Length() == 0) return false;
+
+  op->axis = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
+  return true;
+}
+
+::tensorflow::Status ResolveReduceAttributes::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
+  Operator* op = model->operators[op_index].get();
+  switch (op->type) {
+    case OperatorType::kMean:
+      *modified = ResolveAttributes(model, static_cast<MeanOperator*>(op));
+      return ::tensorflow::Status::OK();
+    case OperatorType::kSum:
+      *modified =
+          ResolveAttributes(model, static_cast<TensorFlowSumOperator*>(op));
+      return ::tensorflow::Status::OK();
+    case OperatorType::kReduceProd:
+      *modified =
+          ResolveAttributes(model, static_cast<TensorFlowProdOperator*>(op));
+      return ::tensorflow::Status::OK();
+    case OperatorType::kReduceMin:
+      *modified =
+          ResolveAttributes(model, static_cast<TensorFlowMinOperator*>(op));
+      return ::tensorflow::Status::OK();
+    case OperatorType::kReduceMax:
+      *modified =
+          ResolveAttributes(model, static_cast<TensorFlowMaxOperator*>(op));
+      return ::tensorflow::Status::OK();
+    case OperatorType::kAny:
+      *modified =
+          ResolveAttributes(model, static_cast<TensorFlowMaxOperator*>(op));
+      return ::tensorflow::Status::OK();
+    default:
+      return ::tensorflow::Status::OK();
+  }
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc b/tensorflow/lite/toco/graph_transformations/resolve_reorder_axes.cc
similarity index 89%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_reorder_axes.cc
index 8e150db6fa6e7e1d2a8d92babc0d0736f25b535d..f70e80b8e7702b8ad37ecbaf222063d9e3a5c89b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_reorder_axes.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -78,11 +78,13 @@ void ReorderAxes(AxesOrder input_axes_order, AxesOrder output_axes_order,
   }
 }
 
-bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveReorderAxes::Run(Model* model, std::size_t op_index,
+                                             bool* modified) {
+  *modified = false;
   auto it = model->operators.begin() + op_index;
   auto* op = it->get();
   if (op->type != OperatorType::kReorderAxes) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* reorder_op = static_cast<ReorderAxesOperator*>(op);
 
@@ -93,11 +95,11 @@ bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
   auto& input_array = model->GetArray(input_array_name);
   auto& output_array = model->GetArray(output_array_name);
   if (!input_array.buffer) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Yield until output dims have been resolved.
   if (!output_array.has_shape()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Reorder the input array dims and buffer data
   if (input_array.buffer->type == ArrayDataType::kFloat) {
@@ -120,7 +122,8 @@ bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
   DeleteOpAndArraysIfUnused(model, op);
   RenameArray(model, output_array_name, input_array_name);
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_reshape_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_reshape_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..24a3482a6fe9dd8769d94a748558dc1a9cd1eaa5
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_reshape_attributes.cc
@@ -0,0 +1,53 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status ResolveReshapeAttributes::Run(Model* model,
+                                                   std::size_t op_index,
+                                                   bool* modified) {
+  *modified = false;
+  const auto reshape_it = model->operators.begin() + op_index;
+  auto* reshape_op = reshape_it->get();
+  if (reshape_op->type != OperatorType::kReshape) {
+    return ::tensorflow::Status::OK();
+  }
+
+  auto* op = static_cast<TensorFlowReshapeOperator*>(reshape_op);
+
+  if (!op->shape.empty()) return ::tensorflow::Status::OK();
+
+  if (IsConstantParameterArray(*model, reshape_op->inputs[1])) {
+    const auto& constant_input_array = model->GetArray(reshape_op->inputs[1]);
+    op->shape = constant_input_array.GetBuffer<ArrayDataType::kInt32>().data;
+  }
+
+  if (op->shape.empty()) return ::tensorflow::Status::OK();
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_slice_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_slice_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1f86b35c34cab2d24be861e7fa82acba234ee425
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_slice_attributes.cc
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status ResolveSliceAttributes::Run(Model* model,
+                                                 std::size_t op_index,
+                                                 bool* modified) {
+  *modified = false;
+  const auto slice_it = model->operators.begin() + op_index;
+  auto* slice_op = slice_it->get();
+  if (slice_op->type != OperatorType::kSlice) return ::tensorflow::Status::OK();
+
+  auto* op = static_cast<SliceOperator*>(slice_op);
+  if (!op->begin.empty()) return ::tensorflow::Status::OK();
+
+  CHECK_EQ(op->inputs.size(), 3);
+  if (!IsConstantParameterArray(*model, op->inputs[1]))
+    return ::tensorflow::Status::OK();
+  if (!IsConstantParameterArray(*model, op->inputs[2]))
+    return ::tensorflow::Status::OK();
+
+  const auto& begin_array = model->GetArray(op->inputs[1]);
+  if (!begin_array.has_shape()) return ::tensorflow::Status::OK();
+
+  const auto& size_array = model->GetArray(op->inputs[2]);
+  if (!size_array.has_shape()) return ::tensorflow::Status::OK();
+
+  op->begin = begin_array.GetBuffer<ArrayDataType::kInt32>().data;
+  op->size = size_array.GetBuffer<ArrayDataType::kInt32>().data;
+
+  // TODO(dkalenichenko): Delete the extra inputs?
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dd1e6fccd72aa9c03697237064ba0512b87dcfa9
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_space_to_batch_nd_attributes.cc
@@ -0,0 +1,83 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status ResolveSpaceToBatchNDAttributes::Run(Model* model,
+                                                          std::size_t op_index,
+                                                          bool* modified) {
+  *modified = false;
+  const auto op_it = model->operators.begin() + op_index;
+  if (op_it->get()->type != OperatorType::kSpaceToBatchND)
+    return ::tensorflow::Status::OK();
+
+  auto* op = static_cast<SpaceToBatchNDOperator*>(op_it->get());
+
+  // The attributes are resolved only when the 3 attributes (block_shape,
+  // before_paddings, after_paddings) are all constant.
+  if (!op->block_shape.empty()) {
+    return ::tensorflow::Status::OK();
+  }
+
+  const int block_shape_index = 1;
+  const int paddings_index = 2;
+
+  CHECK_EQ(op->inputs.size(), 3);
+  if (!IsConstantParameterArray(*model, op->inputs[block_shape_index]) ||
+      !IsConstantParameterArray(*model, op->inputs[paddings_index]))
+    return ::tensorflow::Status::OK();
+
+  // Handle paddings.
+  const auto& paddings_array = model->GetArray(op->inputs[paddings_index]);
+  if (!paddings_array.has_shape()) return ::tensorflow::Status::OK();
+  const std::vector<int>& paddings_dims = paddings_array.shape().dims();
+  if (paddings_dims.size() != 2) {
+    // Code only handles padding of 2 dimensions. Perhaps another transformation
+    // will delete this op.
+    return ::tensorflow::Status::OK();
+  }
+  const std::vector<int>& paddings_buffer =
+      paddings_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < paddings_dims[0]; ++i) {
+    op->before_paddings.push_back(paddings_buffer[i * 2]);
+    op->after_paddings.push_back(paddings_buffer[i * 2 + 1]);
+  }
+
+  // Handle block_shape.
+  const auto& block_shape_array =
+      model->GetArray(op->inputs[block_shape_index]);
+  if (!block_shape_array.has_shape()) return ::tensorflow::Status::OK();
+  const std::vector<int>& block_shape_dims = block_shape_array.shape().dims();
+  CHECK_EQ(block_shape_dims.size(), 1);
+  const std::vector<int>& block_shape_buffer =
+      block_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < block_shape_dims[0]; ++i) {
+    op->block_shape.push_back(block_shape_buffer[i]);
+  }
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_squeeze_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_squeeze_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f2ae471a2f10aeb107bdd4c2b9cf2bb70abf88e
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_squeeze_attributes.cc
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status ResolveSqueezeAttributes::Run(Model* model,
+                                                   std::size_t op_index,
+                                                   bool* modified) {
+  *modified = false;
+  auto* squeeze_op = model->operators[op_index].get();
+  if (squeeze_op->type != OperatorType::kSqueeze) {
+    return ::tensorflow::Status::OK();
+  }
+  DCHECK_EQ(squeeze_op->inputs.size(), 1);
+  DCHECK_EQ(squeeze_op->outputs.size(), 1);
+
+  // If the output is consumed by a reshape op, it's a trivial squeeze.
+  if (CountOpsWithInput(*model, squeeze_op->outputs[0]) == 1) {
+    const auto* next_op = GetOpWithInput(*model, squeeze_op->outputs[0]);
+    if (next_op->type == OperatorType::kReshape) {
+      AddMessageF(
+          "%s is trivial because its output is only consumed by a "
+          "Reshape op",
+          LogName(*squeeze_op));
+
+      *modified = RemoveTrivialPassthroughOp(this, model, op_index);
+      return ::tensorflow::Status::OK();
+    }
+  }
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
similarity index 77%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
index 65132d7d1ef0626e0ad41a88b8e7999a1c1cf684..a62e082e836797ec2371a94690cc0534d4065c47 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -37,40 +37,47 @@ int PadAttributeArray(Array* attribute_array, std::vector<int> pad_values,
   return mask;
 }
 
-bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveStridedSliceAttributes::Run(Model* model,
+                                                        std::size_t op_index,
+                                                        bool* modified) {
+  *modified = false;
   const auto slice_it = model->operators.begin() + op_index;
   auto* slice_op = slice_it->get();
-  if (slice_op->type != OperatorType::kStridedSlice) return false;
+  if (slice_op->type != OperatorType::kStridedSlice)
+    return ::tensorflow::Status::OK();
 
   auto* op = static_cast<StridedSliceOperator*>(slice_op);
   if (!op->start_indices.empty()) {
     // We have already resolved these attributes
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(op->inputs.size(), 4);
   const auto& input_array = model->GetArray(op->inputs[0]);
   if (!input_array.has_shape()) {
     // We require the dimensionality of the input to pad the indices
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   auto& start_array = model->GetArray(op->inputs[1]);
-  if (!start_array.has_shape()) return false;
+  if (!start_array.has_shape()) return ::tensorflow::Status::OK();
   if (toco::RequiredBufferSizeForShape(start_array.shape()) > 4) {
     // Only 1-4D arrays are supported for now.
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   auto& stop_array = model->GetArray(op->inputs[2]);
-  if (!stop_array.has_shape()) return false;
+  if (!stop_array.has_shape()) return ::tensorflow::Status::OK();
 
   auto& stride_array = model->GetArray(op->inputs[3]);
-  if (!stride_array.has_shape()) return false;
+  if (!stride_array.has_shape()) return ::tensorflow::Status::OK();
 
-  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
-  if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
-  if (!IsConstantParameterArray(*model, op->inputs[3])) return false;
+  if (!IsConstantParameterArray(*model, op->inputs[1]))
+    return ::tensorflow::Status::OK();
+  if (!IsConstantParameterArray(*model, op->inputs[2]))
+    return ::tensorflow::Status::OK();
+  if (!IsConstantParameterArray(*model, op->inputs[3]))
+    return ::tensorflow::Status::OK();
 
   int num_input_axes = input_array.shape().dimensions_count();
   int start_indices_size = start_array.shape().dims(0);
@@ -112,6 +119,7 @@ bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
   op->stop_indices = stop_array.GetBuffer<ArrayDataType::kInt32>().data;
   op->strides = stride_array.GetBuffer<ArrayDataType::kInt32>().data;
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc b/tensorflow/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
similarity index 84%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
index fa5ee899334bdf2d39a6861b0e0c4548142e9d2a..ce185847cd0dde476cea78d551f8568271927e3e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
@@ -18,19 +18,22 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ResolveTensorFlowConcat::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveTensorFlowConcat::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   auto concat_it = model->operators.begin() + op_index;
   const auto* tf_concat_op = concat_it->get();
   if (tf_concat_op->type != OperatorType::kConcat &&
       tf_concat_op->type != OperatorType::kConcatV2) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_GE(tf_concat_op->inputs.size(), 2);
@@ -54,7 +57,7 @@ bool ResolveTensorFlowConcat::Run(Model* model, std::size_t op_index) {
   if (!axis_array.buffer) {
     AddMessageF("Waiting for the axis of %s to be resolved to a constant",
                 LogName(*tf_concat_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK(axis_array.data_type == ArrayDataType::kInt32);
@@ -79,7 +82,8 @@ bool ResolveTensorFlowConcat::Run(Model* model, std::size_t op_index) {
   }
   // Remove the TensorFlowConcat op
   model->operators.erase(concat_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc b/tensorflow/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
similarity index 93%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
index 65346c4fe4ab027c5a24740a7cb39be9a38fcc2c..637ffda533ae1ab165630322c9b65973018e50d7 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -55,10 +55,13 @@ TransposeOperator* FindTransposeOpWithInput(const Model& model,
 
 }  // namespace
 
-bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveTensorFlowMatMul::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   auto matmul_it = model->operators.begin() + op_index;
   if (matmul_it->get()->type != OperatorType::kMatMul) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto* matmul_op =
       static_cast<const TensorFlowMatMulOperator*>(matmul_it->get());
@@ -73,7 +76,7 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
         "Not replacing %s by a FullyConnected operator, because it has "
         "the transpose_a attribute",
         LogName(*matmul_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Reorder the axes on the second input. TensorFlow uses row-major ordering
@@ -198,7 +201,8 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
 
   // erase the MatMul operator
   model->operators.erase(matmul_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc b/tensorflow/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
similarity index 79%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
index 4edffe3d48fd880c0261b34fc407b8e2ac66ccb9..9ee4e6ec6b7a73836b3886520c7c9bbb8f0559c1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
@@ -17,18 +17,21 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveTensorFlowMerge::Run(Model* model,
+                                                 std::size_t op_index,
+                                                 bool* modified) {
+  *modified = false;
   const auto merge_it = model->operators.begin() + op_index;
   const auto* merge_op = merge_it->get();
   if (merge_op->type != OperatorType::kMerge) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // We need to yield until this Merge node has only 1 input, which will mean
@@ -37,7 +40,7 @@ bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
   // non-selected inputs, so that at some point there will be only 1 input left.
   if (merge_op->inputs.size() > 1) {
     AddMessageF("Waiting for %s to be resolved", LogName(*merge_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // Now that the merge node has 1 input exactly, it is the same as an Identity
@@ -57,7 +60,8 @@ bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
   AddMessageF("Removing already-resolved %s", LogName(*merge_op));
   model->EraseArray(merge_op->outputs[0]);
   model->operators.erase(merge_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc b/tensorflow/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
similarity index 90%
rename from tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
rename to tensorflow/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
index 8bef440afd21572d7014e4f376be3aba2d80127d..f26efacaaeec759d96b1657a05eaf53661f1b45b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
@@ -17,18 +17,21 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ResolveTensorFlowSwitch::Run(Model* model,
+                                                  std::size_t op_index,
+                                                  bool* modified) {
+  *modified = false;
   const auto switch_it = model->operators.begin() + op_index;
   const auto* switch_op = switch_it->get();
   if (switch_op->type != OperatorType::kSwitch) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   CHECK_EQ(switch_op->inputs.size(), 2);
@@ -40,7 +43,7 @@ bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
     AddMessageF(
         "Waiting for the boolean predicate of %s to be resolved to a constant",
         LogName(*switch_op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   // The predicate should be boolean, and should consist of a single value.
@@ -119,7 +122,8 @@ bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
   // Remove the switch node itself.
   AddMessageF("Removing already-resolved %s", LogName(*switch_op));
   model->operators.erase(switch_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_transpose_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_transpose_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..71c0a884da1f73f57128d84ae17302ce19aa0556
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/resolve_transpose_attributes.cc
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+::tensorflow::Status ResolveTransposeAttributes::Run(Model* model,
+                                                     std::size_t op_index,
+                                                     bool* modified) {
+  *modified = false;
+  const auto op_it = model->operators.begin() + op_index;
+  if (op_it->get()->type != OperatorType::kTranspose)
+    return ::tensorflow::Status::OK();
+
+  auto* op = static_cast<TransposeOperator*>(op_it->get());
+  if (!op->perm.empty()) return ::tensorflow::Status::OK();
+
+  CHECK_EQ(op->inputs.size(), 2);
+  if (!IsConstantParameterArray(*model, op->inputs[1]))
+    return ::tensorflow::Status::OK();
+
+  // Handling perm.
+  const auto& perm_array = model->GetArray(op->inputs[1]);
+  if (!perm_array.has_shape()) return ::tensorflow::Status::OK();
+
+  const std::vector<int>& perm_dims = perm_array.shape().dims();
+  CHECK_EQ(perm_dims.size(), 1);
+
+  std::vector<int> perm_buffer =
+      perm_array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (int i = 0; i < perm_dims[0]; ++i) {
+    op->perm.push_back(perm_buffer[i]);
+  }
+
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/shuffle_fc_weights.cc b/tensorflow/lite/toco/graph_transformations/shuffle_fc_weights.cc
similarity index 89%
rename from tensorflow/contrib/lite/toco/graph_transformations/shuffle_fc_weights.cc
rename to tensorflow/lite/toco/graph_transformations/shuffle_fc_weights.cc
index 22c258cec5fde4144c4b048d5ec60a8604362cbb..195ea70e34bf87afd8c4318c33233c8d959d945a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/shuffle_fc_weights.cc
+++ b/tensorflow/lite/toco/graph_transformations/shuffle_fc_weights.cc
@@ -17,22 +17,24 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status ShuffleFCWeights::Run(Model* model, std::size_t op_index,
+                                           bool* modified) {
+  *modified = false;
   Operator* op = model->operators[op_index].get();
   if (op->type != OperatorType::kFullyConnected) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   FullyConnectedOperator* fc_op = static_cast<FullyConnectedOperator*>(op);
   // Exit if this FC op already has shuffled weights
   if (fc_op->weights_format != FullyConnectedWeightsFormat::kDefault) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const Array& input_array = model->GetArray(fc_op->inputs[0]);
   const string& weights_name = fc_op->inputs[1];
@@ -46,11 +48,11 @@ bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) {
       output_array.data_type != ArrayDataType::kInt16 ||
       !input_array.quantization_params || !weights_array.quantization_params ||
       !output_array.quantization_params) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Exit if the shapes aren't known
   if (!input_array.has_shape() || !weights_array.has_shape()) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Exit if, based on the known shapes, this FC op is not a GEMV.
   // The shuffling of FC weights is only useful to enable fast GEMV paths.
@@ -64,7 +66,7 @@ bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) {
           "the input shape is not 1D or 2D (possibly with additional inner "
           "dimensions of size 1)",
           LogName(*op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
   }
   if (input_shape.dims(0) != 1 && input_shape.dims(0) != 4) {
@@ -73,7 +75,7 @@ bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) {
         "the input shape's leading dimension, i.e. the 'batch size', is not "
         "equal to 1 or 4",
         LogName(*op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Exit if the weights shape isn't an integral multiple of the shuffled
   // block shape, 4x16. We don't want to have to write code dealing with
@@ -88,7 +90,7 @@ bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) {
   // two.
   const Shape& weights_shape = weights_array.shape();
   if (weights_shape.dimensions_count() != 2) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const int rows = weights_shape.dims(0);
   const int cols = weights_shape.dims(1);
@@ -97,11 +99,11 @@ bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) {
         "Not applying experimental shuffling to the weights of %s because its "
         "shape isn't a multiple of the shuffling block shape, 4x16",
         LogName(*op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Exit if the weights aren't already a constant array.
   if (!weights_array.buffer) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Exit if the weights are used by more than one op.
   if (CountOpsWithInput(*model, weights_name) != 1) {
@@ -109,7 +111,7 @@ bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) {
         "Not applying experimental shuffling to the weights of %s because that "
         "array is consumed by other operators",
         LogName(*op));
-    return false;
+    return ::tensorflow::Status::OK();
   }
   // Compute the shuffled weights
   auto& weights_data =
@@ -152,7 +154,8 @@ bool ShuffleFCWeights::Run(Model* model, std::size_t op_index) {
   shuffled_input_workspace_array.GetOrCreateQuantizationParams() =
       input_array.GetQuantizationParams();
 
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/tests/BUILD b/tensorflow/lite/toco/graph_transformations/tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bbbedbe3a93065e3a7007073aad7f6e7600e2651
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/tests/BUILD
@@ -0,0 +1,42 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+tf_cc_test(
+    name = "lstm_utils_test",
+    srcs = ["lstm_utils_test.cc"],
+    deps = [
+        "//tensorflow/lite/toco:graph_transformations",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:tooling_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cc_test(
+    name = "resolve_constant_concatenation_test",
+    srcs = ["resolve_constant_concatenation_test.cc"],
+    deps = [
+        "//tensorflow/lite/toco:graph_transformations",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:tooling_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cc_test(
+    name = "resolve_constant_unary_test",
+    srcs = ["resolve_constant_unary_test.cc"],
+    deps = [
+        "//tensorflow/lite/toco:graph_transformations",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:tooling_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc b/tensorflow/lite/toco/graph_transformations/tests/lstm_utils_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc
rename to tensorflow/lite/toco/graph_transformations/tests/lstm_utils_test.cc
index 6aae0775d3445daf7d990bcce09d335c5f686601..bdb27e8af2e359263fa68a95b69b18508a3b5739 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/lstm_utils_test.cc
+++ b/tensorflow/lite/toco/graph_transformations/tests/lstm_utils_test.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc b/tensorflow/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
similarity index 91%
rename from tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
rename to tensorflow/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
index 66cfed4ac26969729d1881f11ba6ae74d9817fb5..00d60b79ca96f6b28b73598a07aebccc0a2b51b5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
+++ b/tensorflow/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
@@ -166,7 +166,10 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis0) {
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
   EXPECT_THAT(model.GetArrayMap().size(), 5);
-  (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
+  bool modified;
+  ASSERT_TRUE((*graph_transformation_set.begin())
+                  ->Run(&model, /*op_index=*/0, &modified)
+                  .ok());
   EXPECT_THAT(model.GetArrayMap().size(), 1);
 
   auto& concatenated_array = (*model.GetArrayMap().begin()).second;
@@ -185,7 +188,10 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis1) {
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
   EXPECT_THAT(model.GetArrayMap().size(), 5);
-  (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
+  bool modified;
+  ASSERT_TRUE((*graph_transformation_set.begin())
+                  ->Run(&model, /*op_index=*/0, &modified)
+                  .ok());
   EXPECT_THAT(model.GetArrayMap().size(), 1);
 
   auto& concatenated_array = (*model.GetArrayMap().begin()).second;
@@ -204,7 +210,10 @@ TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis2) {
   GraphTransformationsSet graph_transformation_set;
   graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
   EXPECT_THAT(model.GetArrayMap().size(), 5);
-  (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
+  bool modified;
+  ASSERT_TRUE((*graph_transformation_set.begin())
+                  ->Run(&model, /*op_index=*/0, &modified)
+                  .ok());
   EXPECT_THAT(model.GetArrayMap().size(), 1);
 
   auto& concatenated_array = (*model.GetArrayMap().begin()).second;
diff --git a/tensorflow/lite/toco/graph_transformations/tests/resolve_constant_unary_test.cc b/tensorflow/lite/toco/graph_transformations/tests/resolve_constant_unary_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..246a13a06102161349bec24de181dd6ca0e711e4
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/tests/resolve_constant_unary_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+
+void RunResolveSum(const std::vector<float>& input,
+                   const std::vector<int>& input_shape,
+                   const std::vector<int>& axis,
+                   const std::vector<int>& output_shape,
+                   const std::vector<float>& expected_output) {
+  Model model;
+  Array& input0 = model.GetOrCreateArray("input0");
+  Array& input1 = model.GetOrCreateArray("input1");
+  Array& output = model.GetOrCreateArray("output");
+
+  *input0.mutable_shape()->mutable_dims() = input_shape;
+  input0.data_type = ArrayDataType::kFloat;
+  input0.GetMutableBuffer<ArrayDataType::kFloat>().data = input;
+
+  *input1.mutable_shape()->mutable_dims() = {static_cast<int>(axis.size())};
+  input1.GetMutableBuffer<ArrayDataType::kInt32>().data = axis;
+  input1.data_type = ArrayDataType::kInt32;
+
+  *output.mutable_shape()->mutable_dims() = output_shape;
+
+  auto sum_op = absl::make_unique<TensorFlowSumOperator>();
+  sum_op->keep_dims = true;
+  sum_op->inputs = {"input0", "input1"};
+  sum_op->outputs = {"output"};
+  model.operators.push_back(std::move(sum_op));
+  bool modified;
+  ASSERT_TRUE(ResolveConstantUnaryOperator().Run(&model, 0, &modified).ok());
+  EXPECT_EQ(model.GetArray("output").GetBuffer<ArrayDataType::kFloat>().data,
+            expected_output);
+  EXPECT_EQ(model.GetArray("output").shape().dims(), output_shape);
+}
+
+// Reduce a 2d array across axis 0
+TEST(ResolveConstantUnary, ResolveSumAxis0_2D) {
+  // clang-format off
+  RunResolveSum(
+      // Input data
+      {3, 1, 4, 1,
+       5, 9, 2, 6,
+       5, 3, 5, 8},
+
+      // Input shape
+      {3, 4},
+
+      // Axes
+      {0},
+
+      // Expected output shape,
+      {1, 4},
+
+      // Expected output
+      {13, 13, 11, 15});
+  // clang-format on
+}
+
+// Reduce a 2d array across axis 1
+TEST(ResolveConstantUnary, ResolveSumAxis1_2D) {
+  // clang-format off
+  RunResolveSum(
+      // Input data
+      {3, 1, 4, 1,
+       5, 9, 2, 6,
+       5, 3, 5, 8},
+
+      // Input shape
+      {3, 4},
+
+      // Axes
+      {1},
+
+      // Expected output shape,
+      {3, 1},
+
+      // Expected output
+      {9, 22, 21});
+  // clang-format on
+}
+
+// Reduce a 3d tensor across axes 0 and 2.
+TEST(ResolveConstantUnary, ResolveSumAxis0_2_3D) {
+  // clang-format off
+  RunResolveSum(
+      // Input data
+      {  0,   1,   2,
+         3,  10,  11,
+        12,  13,  20,
+        21,  22,  23,
+
+       100, 101, 102,
+       103, 110, 111,
+       112, 113, 120,
+       121, 122, 123,
+
+       200, 201, 202,
+       203, 210, 211,
+       212, 213, 220,
+       221, 222, 223 },
+
+      // Input shape
+      {3, 4, 3},
+
+      // Axes
+      {0, 2},
+
+      // Expected output shape,
+      {1, 4, 1},
+
+      // Expected output, generated using octave.
+      { 909, 972, 1035, 1098});
+  // clang-format on
+}
+
+}  // namespace
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc b/tensorflow/lite/toco/graph_transformations/unfuse_activation_functions.cc
similarity index 80%
rename from tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
rename to tensorflow/lite/toco/graph_transformations/unfuse_activation_functions.cc
index 69bad2fa89cb89cd74e3a4bca98da906a322a670..3e36dd5a45c720a73f8d1fcc0d2732533ba5cf31 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
+++ b/tensorflow/lite/toco/graph_transformations/unfuse_activation_functions.cc
@@ -17,21 +17,24 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
-bool UnfuseActivationFunctions::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status UnfuseActivationFunctions::Run(Model* model,
+                                                    std::size_t op_index,
+                                                    bool* modified) {
+  *modified = false;
   const auto it = model->operators.begin() + op_index;
   auto* op = it->get();
 
   // If a conv operation has an im2col array, yield: it should be dropped first.
   if ((op->type == OperatorType::kConv) && (op->outputs.size() == 2)) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
 
   Operator* ac_op = nullptr;
@@ -46,7 +49,7 @@ bool UnfuseActivationFunctions::Run(Model* model, std::size_t op_index) {
       ac_op = new Relu1Operator;
       break;
     default:
-      return false;
+      return ::tensorflow::Status::OK();
   }
 
   // At this point we know that the op has a fused activation function. At the
@@ -74,7 +77,8 @@ bool UnfuseActivationFunctions::Run(Model* model, std::size_t op_index) {
 
   ac_op->inputs = {tmp_array_name};
   op->outputs = {tmp_array_name};
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc b/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
similarity index 92%
rename from tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
rename to tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
index dd9e26e68bd7e6d5cb751fdbf705b861c3f2f188..e57f175812f4c66cf9dbb15910d117a9d27acdf9 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
+++ b/tensorflow/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
@@ -16,13 +16,16 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 namespace toco {
 
-bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status UnpartitionEmbeddingLookup::Run(Model* model,
+                                                     std::size_t op_index,
+                                                     bool* modified) {
+  *modified = false;
   // Collapses a partitioned tf.nn.embedding_lookup back into a single Gather.
   // https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup
   // This transform attempts to identify the len(params) > 1 case and collapse
@@ -47,7 +50,7 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
   // First look for the final DynamicStitch.
   auto op_it = model->operators.begin() + op_index;
   if (op_it->get()->type != OperatorType::kDynamicStitch) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto* stitch_op = static_cast<DynamicStitchOperator*>(op_it->get());
 
@@ -72,7 +75,7 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
           "Skipping because indices input %s into "
           "%s is unexpected",
           LogName(*op), LogName(*stitch_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
     if (!indices_partition_op) {
       indices_partition_op = static_cast<DynamicPartitionOperator*>(op);
@@ -83,7 +86,7 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
             "Skipping because indices input %s into "
             "%s is from a different source op than others",
             LogName(*op), LogName(*stitch_op));
-        return false;
+        return ::tensorflow::Status::OK();
       }
     }
   }
@@ -92,12 +95,12 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
   // The data for the indices must be a constant range of the array shape.
   if (!IsConstantParameterArray(*model, indices_partition_op->inputs[0])) {
     AddMessageF("Skipping because indices partition data is non-constant");
-    return false;
+    return ::tensorflow::Status::OK();
   }
   auto& indices_data_array = model->GetArray(indices_partition_op->inputs[0]);
   if (indices_data_array.data_type == ArrayDataType::kNone) {
     // Yield until data types are propagated.
-    return false;
+    return ::tensorflow::Status::OK();
   }
   CHECK(indices_data_array.data_type == ArrayDataType::kInt32)
       << "Indices partition inputs must be int32";
@@ -117,7 +120,7 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
           "Skipping because data input %s into %s "
           "is unexpected",
           LogName(*op), LogName(*stitch_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
     gather_ops.push_back(static_cast<GatherOperator*>(op));
   }
@@ -132,7 +135,7 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
           "Skipping because data input %s into "
           "%s is unexpected",
           LogName(*op), LogName(*gather_op));
-      return false;
+      return ::tensorflow::Status::OK();
     }
     if (!data_partition_op) {
       data_partition_op = static_cast<DynamicPartitionOperator*>(op);
@@ -143,7 +146,7 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
             "Skipping because data input %s into "
             "%s is from a different source op than others",
             LogName(*op), LogName(*gather_op));
-        return false;
+        return ::tensorflow::Status::OK();
       }
     }
   }
@@ -236,7 +239,8 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
   DeleteOpAndArraysIfUnused(model, indices_partition_op);
   DeleteOpAndArraysIfUnused(model, data_partition_op);
   DeleteOpAndArraysIfUnused(model, stitch_op);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
similarity index 91%
rename from tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc
rename to tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
index fedf4441e2424e9c26c5c1c8a6f07a406c0d937b..41a735394d714b65a4c9fc309927e34a7f610431 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/unroll_batch_matmul.cc
+++ b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -36,10 +36,12 @@ namespace toco {
 //    slice_c = tf.matmul(slice_a, slice_b)
 //    result_slices[bat] = slice_c
 //  result = tf.stack(result_slices)
-bool UnrollBatchMatMul::Run(Model* model, std::size_t op_index) {
+::tensorflow::Status UnrollBatchMatMul::Run(Model* model, std::size_t op_index,
+                                            bool* modified) {
+  *modified = false;
   auto batch_op_it = model->operators.begin() + op_index;
   if (batch_op_it->get()->type != OperatorType::kBatchMatMul) {
-    return false;
+    return ::tensorflow::Status::OK();
   }
   const auto* batch_op =
       static_cast<const BatchMatMulOperator*>(batch_op_it->get());
@@ -47,7 +49,8 @@ bool UnrollBatchMatMul::Run(Model* model, std::size_t op_index) {
   // We must have the shape of at least one input to know our batch size.
   const auto& input_array_a = model->GetArray(batch_op->inputs[0]);
   const auto& input_array_b = model->GetArray(batch_op->inputs[1]);
-  if (!input_array_a.has_shape() || !input_array_b.has_shape()) return false;
+  if (!input_array_a.has_shape() || !input_array_b.has_shape())
+    return ::tensorflow::Status::OK();
 
   // We only support the rank 3 case. If you are batching on rank > 3 you'll
   // have to figure that out.
@@ -66,7 +69,8 @@ bool UnrollBatchMatMul::Run(Model* model, std::size_t op_index) {
     batch_op_it = matmul_op_it + 1;
     CHECK_EQ(batch_op_it->get(), batch_op);
     model->operators.erase(batch_op_it);
-    return true;
+    *modified = true;
+    return ::tensorflow::Status::OK();
   }
   CHECK_EQ(input_array_a.shape().dimensions_count(), 3)
       << "Input arrays must have rank 3";
@@ -113,7 +117,8 @@ bool UnrollBatchMatMul::Run(Model* model, std::size_t op_index) {
     auto* slice_b_op = new SliceOperator;
     slice_b_op->inputs = {
         batch_op->inputs[1],
-        CreateInt32Array(model, batch_name + "/slice_b/slice/begin", {0, 0, 0}),
+        CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
+                         {batch, 0, 0}),
         CreateInt32Array(
             model, batch_name + "/slice_b/slice/size",
             {1, input_array_b.shape().dims(1), input_array_b.shape().dims(2)}),
@@ -167,7 +172,8 @@ bool UnrollBatchMatMul::Run(Model* model, std::size_t op_index) {
   CHECK(batch_op_it != model->operators.end());
   CHECK(batch_op_it->get() == batch_op);
   model->operators.erase(batch_op_it);
-  return true;
+  *modified = true;
+  return ::tensorflow::Status::OK();
 }
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
similarity index 85%
rename from tensorflow/contrib/lite/toco/import_tensorflow.cc
rename to tensorflow/lite/toco/import_tensorflow.cc
index 5eaf6e27fcd5e4b8534f7968fd21a9a55b468603..0b2f810394311a33899b9242e73131e109a2b4c0 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/import_tensorflow.h"
+#include "tensorflow/lite/toco/import_tensorflow.h"
 
 #include <memory>
 #include <string>
@@ -27,11 +27,11 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/strip.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_util.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h"
+#include "tensorflow/lite/toco/tensorflow_util.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
@@ -43,6 +43,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/public/session_options.h"
@@ -50,6 +51,7 @@ limitations under the License.
 
 using tensorflow::AttrValue;
 using tensorflow::DT_BOOL;
+using tensorflow::DT_COMPLEX64;
 using tensorflow::DT_FLOAT;
 using tensorflow::DT_INT32;
 using tensorflow::DT_INT64;
@@ -185,6 +187,8 @@ ArrayDataType ConvertDataType(tensorflow::DataType dtype) {
     return ArrayDataType::kInt64;
   else if (dtype == DT_STRING)
     return ArrayDataType::kString;
+  else if (dtype == DT_COMPLEX64)
+    return ArrayDataType::kComplex64;
   else
     LOG(INFO) << "Unsupported data type in placeholder op: " << dtype;
   return ArrayDataType::kNone;
@@ -195,23 +199,35 @@ tensorflow::Status ImportShape(
         input_dims,
     int* input_flat_size, Shape* shape) {
   std::vector<int> input_dims_only_sizes;
+  bool zero_sized_shape = false;
   for (auto& d : input_dims) {
-    if (d.size() == 0) {
-      // Some TensorFlow shapes contain a 0 dim, effectively making
-      // them of flat size 0 even though they have other nonzero dims.
-      // This breaks our invariant, that array dims can't be 0.
-      // For now, tweaking this to record a 0-D shape instead.
-      shape->mutable_dims()->clear();
-      if (input_flat_size != nullptr) *input_flat_size = 0;
-      return tensorflow::Status::OK();
-    }
     // TensorFlow's shapes use int64s, while TOCO uses ints.
     if (d.size() > std::numeric_limits<int>::max()) {
       return tensorflow::errors::InvalidArgument("Shape element overflows");
     }
-
+    if (d.size() == 0) {
+      zero_sized_shape = true;
+    }
     input_dims_only_sizes.push_back(d.size());
   }
+
+  // Note that up to this point we were OK with the input shape containing
+  // elements valued -1 or 0, which are perfectly legal in tensorflow. However
+  // our CheckValidShapeDimensions() insists on them being >= 1, with the
+  // exception of the "scalar" shape [0]. The main issue with zero-values shape
+  // elements is that the corresponding arrays don't contain any data and the
+  // allocation code gets a bit confused. It seems that the code expects an
+  // empty shape for zero-sized shapes, so we will do just that, except for the
+  // [0] case.
+  // TODO(b/119325030): In order to correctly import the "scalar" shapes the
+  // following test must include "&& input_dims_only_sizes.size() > 1", but
+  // that seems to slow everything down a lot.
+  if (zero_sized_shape) {
+    shape->mutable_dims()->clear();
+    if (input_flat_size != nullptr) *input_flat_size = 0;
+    return tensorflow::Status::OK();
+  }
+
   *shape->mutable_dims() = input_dims_only_sizes;
 
   if (input_flat_size == nullptr) return tensorflow::Status::OK();
@@ -257,6 +273,48 @@ tensorflow::Status ImportFloatArray(const TensorProto& input_tensor,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ImportComplex64Array(const TensorProto& input_tensor,
+                                        Array* output_array) {
+  CHECK_EQ(input_tensor.dtype(), DT_COMPLEX64);
+  const auto& input_shape = input_tensor.tensor_shape();
+  CHECK_LE(input_shape.dim_size(), 4);
+  int input_flat_size;
+  auto status = ImportShape(input_shape.dim(), &input_flat_size,
+                            output_array->mutable_shape());
+  if (!status.ok()) return status;
+
+  auto& output_complex_data =
+      output_array->GetMutableBuffer<ArrayDataType::kComplex64>().data;
+  output_complex_data.resize(RequiredBufferSizeForShape(output_array->shape()),
+                             std::complex<float>(0.f, 0.f));
+  CHECK_GE(output_complex_data.size(), input_flat_size);
+  if (input_tensor.scomplex_val_size() == 2) {
+    for (int i = 0; i < input_flat_size; i++) {
+      output_complex_data[i] = std::complex<float>(
+          input_tensor.scomplex_val(0), input_tensor.scomplex_val(1));
+    }
+  } else if (input_tensor.scomplex_val_size() == 2 * input_flat_size) {
+    for (int i = 0; i < input_flat_size; ++i) {
+      output_complex_data[i] =
+          std::complex<float>(input_tensor.scomplex_val(2 * i),
+                              input_tensor.scomplex_val(2 * i + 1));
+    }
+  } else if (input_tensor.tensor_content().size() ==
+             input_flat_size * sizeof(std::complex<float>)) {
+    toco::port::CopyToBuffer(
+        input_tensor.tensor_content(),
+        reinterpret_cast<char*>(output_complex_data.data()));
+  } else {
+    return tensorflow::errors::InvalidArgument(absl::StrCat(
+        "Neither input_content (",
+        input_tensor.tensor_content().size() / sizeof(std::complex<float>),
+        ") nor scomplex_val (", input_tensor.scomplex_val_size(),
+        ") have the right dimensions (", input_flat_size,
+        ") for this complex64 tensor"));
+  }
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ImportQuint8Array(const TensorProto& input_tensor,
                                      Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_QUINT8);
@@ -477,6 +535,30 @@ string CreateConstArray(Model* model, string const& name,
   return array_name;
 }
 
+// Retain TensorFlow NodeDef in Toco Operator.
+//
+// If an op is supported by Toco but not supported by TFLite, TFLite exporter
+// will use the retained NodeDef to populate a Flex op when Flex mode is
+// enabled.
+//
+// This can't be easily applied to all operations, because a TensorFlow node
+// may become multiple Toco operators. Thus we need to call this function in
+// operator conversion functions one by one whenever feasible.
+//
+// This may cause problems if a graph transformation rule changes parameters
+// of the node. When calling this function, please check if any existing
+// graph transformation rule will change an existing operator with the same
+// type.
+//
+// This provides a route to handle Toco-supported & TFLite-unsupported ops
+// in Flex mode. However it's not a solid solution. Eventually we should
+// get rid of this.
+// TODO(b/117327937): Implement all Toco-supported ops in TFLite, and remove
+// this function.
+void RetainTensorFlowNodeDef(const NodeDef& node, Operator* op) {
+  node.SerializeToString(&op->tensorflow_node_def);
+}
+
 tensorflow::Status ConvertConstOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -512,6 +594,10 @@ tensorflow::Status ConvertConstOperator(
       array.data_type = ArrayDataType::kBool;
       status = ImportBoolArray(tensor, &array);
       break;
+    case DT_COMPLEX64:
+      array.data_type = ArrayDataType::kComplex64;
+      status = ImportComplex64Array(tensor, &array);
+      break;
     default:
       array.data_type = ArrayDataType::kNone;
       // do nothing, silently ignore the Const data.
@@ -538,7 +624,8 @@ tensorflow::Status ConvertConvOperator(
 
   const auto& input_name = node.input(0);
   const auto& weights_name = node.input(1);
-  const auto& reordered_weights_name = weights_name + "_reordered";
+  const auto& reordered_weights_name =
+      AvailableArrayName(*model, weights_name + "_reordered");
   // Check if a ReorderAxesOperator was already created for these weights
   // (that happens when multiple layers share the same weights).
   const Operator* existing_reorder =
@@ -848,6 +935,25 @@ tensorflow::Status ConvertSplitOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertSplitVOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "SplitV");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 3));
+  auto* op = new TensorFlowSplitVOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
+  const int num_split = GetIntAttr(node, "num_split");
+  op->outputs.push_back(node.name());
+  for (int i = 1; i < num_split; i++) {
+    op->outputs.push_back(absl::StrCat(node.name(), ":", i));
+  }
+  op->num_split = num_split;
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertSwitchOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -990,6 +1096,10 @@ tensorflow::Status ConvertBatchMatMulOperator(
   auto* batch_matmul = new BatchMatMulOperator;
   batch_matmul->inputs = {node.input(0), node.input(1)};
   batch_matmul->outputs = {node.name()};
+
+  // For Flex mode. Please read the comments of the function.
+  RetainTensorFlowNodeDef(node, batch_matmul);
+
   model->operators.emplace_back(batch_matmul);
   return tensorflow::Status::OK();
 }
@@ -1043,28 +1153,162 @@ tensorflow::Status ConvertConcatOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertMirrorPadOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  if (node.op() != "MirrorPad") {
+    LOG(FATAL) << "Expected MirrorPad.";
+  }
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  CHECK_EQ(num_inputs, 2);
+  auto* op = new MirrorPadOperator;
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  const auto mode = GetStringAttr(node, "mode");
+  if (mode == "REFLECT") {
+    op->mode = toco::MirrorPadMode::kReflect;
+  } else if (mode == "SYMMETRIC") {
+    op->mode = toco::MirrorPadMode::kSymmetric;
+  }
+
+  model->operators.emplace_back(op);
+
+  return tensorflow::Status::OK();
+}
+
+static constexpr int kAnyNumInputs = -1;
+
+enum FlexSupport { kFlexOk, kFlexNotOk };
+
 // This method supports simple operators without additional attributes.
-template <typename Op>
-tensorflow::Status ConvertSimpleOperator(
+// Converts a simple operator that takes no attributes. The list of inputs is
+// taken from the given NodeDef, and its number must match NumInputs, unless
+// kAnyNumInputs is passed in. If kFlexOk is passed in the resulting operator
+// will be eligible for being exported as a flex op.
+template <typename Op, int NumInputs, FlexSupport flex>
+tensorflow::Status ConvertSimpleOperatorGeneric(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
+  if (NumInputs != kAnyNumInputs) {
+    TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, NumInputs));
+  }
   auto* op = new Op;
   const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
   op->outputs.push_back(node.name());
+
+  if (flex == kFlexOk) {
+    RetainTensorFlowNodeDef(node, op);
+  }
+
   model->operators.emplace_back(op);
   return tensorflow::Status::OK();
 }
 
-// This method supports simple operators without additional attributes.
-template <typename Op, unsigned int NumInputs>
+// Convert a simple operator which is not valid as a flex op.
+template <typename Op, int NumInputs = kAnyNumInputs>
 tensorflow::Status ConvertSimpleOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
-  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, NumInputs));
-  return ConvertSimpleOperator<Op>(node, tf_import_flags, model);
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexNotOk>(
+      node, tf_import_flags, model);
+}
+
+// Convert a simple operator which is valid as a flex op.
+template <typename Op, int NumInputs = kAnyNumInputs>
+tensorflow::Status ConvertSimpleOperatorFlexOk(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexOk>(
+      node, tf_import_flags, model);
+}
+
+void GetOutputNamesFromNodeDef(const NodeDef& node,
+                               const tensorflow::OpDef& op_def,
+                               TensorFlowUnsupportedOperator* op) {
+  int next_output = 0;
+  auto add_output = [&node, &next_output, op]() {
+    if (next_output == 0) {
+      op->outputs.push_back(node.name());  // Implicit :0.
+    } else {
+      op->outputs.push_back(absl::StrCat(node.name(), ":", next_output));
+    }
+    ++next_output;
+  };
+  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    string multiples = op_def.output_arg(i).number_attr();
+    if (!multiples.empty()) {
+      CHECK(HasAttr(node, multiples)) << "No attr named " << multiples;
+      int num_outputs = GetIntAttr(node, multiples);
+      for (int j = 0; j < num_outputs; ++j) {
+        add_output();
+      }
+    } else {
+      string list = op_def.output_arg(i).type_list_attr();
+      if (!list.empty()) {
+        CHECK(HasAttr(node, list)) << "No attr named " << list;
+        const AttrValue::ListValue& list_value = GetListAttr(node, list);
+        for (int j = 0; j < list_value.type_size(); ++j) {
+          add_output();
+        }
+      } else {
+        add_output();
+      }
+    }
+  }
+}
+
+void GetOutputTypesFromNodeDef(const NodeDef& node,
+                               const tensorflow::OpDef& op_def,
+                               TensorFlowUnsupportedOperator* op) {
+  // The given type to the op, or clear the types if invalid.
+  auto add_type = [&node, op](tensorflow::DataType type) {
+    if (type == tensorflow::DT_INVALID) {
+      LOG(WARNING) << "Op node missing output type attribute: " << node.name();
+      op->output_data_types.clear();
+    } else {
+      op->output_data_types.push_back(ConvertDataType(type));
+    }
+  };
+
+  // Retrieve the data type according to the OpDef definition: either the
+  // "type" or "type_attr" field will be set.
+  auto get_type = [&node](const tensorflow::OpDef::ArgDef& a) {
+    if (a.type() != tensorflow::DT_INVALID) {
+      return a.type();
+    } else if (HasAttr(node, a.type_attr())) {
+      return GetDataTypeAttr(node, a.type_attr());
+    } else {
+      return tensorflow::DT_INVALID;
+    }
+  };
+
+  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    string multiples = op_def.output_arg(i).number_attr();
+    if (!multiples.empty()) {
+      CHECK(HasAttr(node, multiples)) << "No attr named " << multiples;
+      int num_outputs = GetIntAttr(node, multiples);
+      auto type = get_type(op_def.output_arg(i));
+      for (int j = 0; j < num_outputs; ++j) {
+        add_type(type);
+      }
+    } else {
+      string list = op_def.output_arg(i).type_list_attr();
+      if (!list.empty()) {
+        CHECK(HasAttr(node, list)) << "No attr named " << list;
+        const AttrValue::ListValue& list_value = GetListAttr(node, list);
+        for (int j = 0; j < list_value.type_size(); ++j) {
+          add_type(list_value.type(j));
+        }
+      } else {
+        add_type(get_type(op_def.output_arg(i)));
+      }
+    }
+  }
 }
 
 tensorflow::Status ConvertUnsupportedOperator(
@@ -1081,7 +1325,10 @@ tensorflow::Status ConvertUnsupportedOperator(
 
   auto* op = new TensorFlowUnsupportedOperator;
   op->tensorflow_op = node.op();
-  node.SerializeToString(&op->tensorflow_node_def);
+
+  // For Flex mode. Please read the comments of the function.
+  RetainTensorFlowNodeDef(node, op);
+
   model->operators.emplace_back(op);
 
   // Parse inputs.
@@ -1090,13 +1337,13 @@ tensorflow::Status ConvertUnsupportedOperator(
     op->inputs.push_back(node.input(i));
   }
 
-  // Parse outputs.
-  op->outputs.push_back(node.name());  // Implicit :0.
+  // Parse outputs. Name them after the node's name, plus an ordinal suffix.
+  // Note that some outputs are to be multipled by a named attribute.
   const tensorflow::OpDef* op_def = nullptr;
   if (tensorflow::OpRegistry::Global()->LookUpOpDef(node.op(), &op_def).ok()) {
-    for (int i = 1; i < op_def->output_arg_size(); ++i) {
-      op->outputs.push_back(absl::StrCat(node.name(), ":", i));
-    }
+    GetOutputNamesFromNodeDef(node, *op_def, op);
+  } else {
+    op->outputs.push_back(node.name());  // Implicit :0.
   }
 
   // Parse if the op supports quantization
@@ -1119,16 +1366,7 @@ tensorflow::Status ConvertUnsupportedOperator(
     const auto& output_type = GetDataTypeAttr(node, "Tout");
     op->output_data_types.push_back(ConvertDataType(output_type));
   } else if (op_def != nullptr) {
-    for (const auto& output_arg : op_def->output_arg()) {
-      if (HasAttr(node, output_arg.type_attr())) {
-        op->output_data_types.push_back(
-            ConvertDataType(GetDataTypeAttr(node, output_arg.type_attr())));
-      } else {
-        LOG(INFO) << "Op node missing output type attribute: " << node.name();
-        op->output_data_types.clear();
-        break;
-      }
-    }
+    GetOutputTypesFromNodeDef(node, *op_def, op);
   } else {
     // TODO(b/113613439): Figure out how to propagate types for custom ops
     // that have no OpDef.
@@ -1336,6 +1574,25 @@ tensorflow::Status ConvertResizeBilinearOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertResizeNearestNeighborOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "ResizeNearestNeighbor");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2));
+  auto* op = new ResizeNearestNeighborOperator;
+
+  op->align_corners = false;
+  if (HasAttr(node, "align_corners")) {
+    op->align_corners = GetBoolAttr(node, "align_corners");
+  }
+
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertBatchNormWithGlobalNormalizationOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -1605,6 +1862,7 @@ tensorflow::Status ConvertRangeOperator(
   op->inputs.push_back(node.input(1));
   op->inputs.push_back(node.input(2));
   op->outputs.push_back(node.name());
+
   model->operators.emplace_back(op);
   return tensorflow::Status::OK();
 }
@@ -1798,13 +2056,13 @@ bool InlineAllFunctions(GraphDef* graphdef) {
   tensorflow::SessionOptions options;
   auto* device_count = options.config.mutable_device_count();
   device_count->insert({"CPU", 1});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   TF_CHECK_OK(tensorflow::DeviceFactory::AddDevices(
       options, "/job:localhost/replica:0/task:0", &devices));
 
   tensorflow::FunctionLibraryDefinition fld(tensorflow::OpRegistry::Global(),
                                             graphdef_copy.library());
-  tensorflow::DeviceMgr device_mgr(devices);
+  tensorflow::DeviceMgr device_mgr(std::move(devices));
   tensorflow::OptimizerOptions o_opts;
   tensorflow::ProcessFunctionLibraryRuntime pflr(
       &device_mgr, tensorflow::Env::Default(), TF_GRAPH_DEF_VERSION, &fld,
@@ -1964,6 +2222,63 @@ tensorflow::Status ConvertCTCBeamSearchDecoderOperator(
   return tensorflow::Status::OK();
 }
 
+// This isn't a TensorFlow builtin op. Currently this node can only be generated
+// with TfLite OpHint API.
+tensorflow::Status ConvertUnidirectionalSequenceLstm(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  DCHECK_EQ(node.op(), "UnidirectionalSequenceLstm");
+
+  auto* op = new UnidirectionalSequenceLstmOperator();
+  const auto& indices = GetListAttr(node, "_tflite_input_indices");
+  if (indices.i_size() != node.input().size()) {
+    return tensorflow::errors::InvalidArgument("Input size does not match.");
+  }
+
+  // The input size needs to be the same as the TfLite UniDirectionalSequence
+  // Lstm implementation.
+  const int kInputsSize = 20;
+
+  op->inputs.resize(kInputsSize);
+  std::vector<bool> done(kInputsSize);
+  int idx = 0;
+  for (const string& input : node.input()) {
+    int real_index = indices.i(idx);
+    op->inputs[real_index] = (input);
+    done[real_index] = true;
+    idx++;
+  }
+
+  for (int idx = 0; idx < done.size(); idx++) {
+    if (!done[idx]) {
+      string optional_name = node.name() + "_" + std::to_string(idx);
+      model->CreateOptionalArray(optional_name);
+      op->inputs[idx] = optional_name;
+    }
+  }
+
+  // There're three outputs, only the last one is required.
+  op->outputs.push_back(node.name() + ":2");
+  model->operators.emplace_back(op);
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertLeakyReluOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "LeakyRelu");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 1));
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  const auto& input_name = node.input(0);
+  auto* op = new LeakyReluOperator;
+  op->inputs.push_back(input_name);
+  op->outputs.push_back(node.name());
+  op->alpha = GetFloatAttr(node, "alpha");
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 }  // namespace
 
 namespace internal {
@@ -1973,10 +2288,20 @@ using ConverterType = tensorflow::Status (*)(
     Model* model);
 using ConverterMapType = std::unordered_map<std::string, ConverterType>;
 
+ConverterMapType GetTensorFlowNodeConverterMapForFlex() {
+  return std::unordered_map<std::string, ConverterType>({
+      // We need to let TCO convert Placeholder information into
+      // array data, so that the data types are correct.
+      {"LegacyFedInput", ConvertPlaceholderOperator},
+      {"Placeholder", ConvertPlaceholderOperator},
+  });
+}
+
 ConverterMapType GetTensorFlowNodeConverterMap() {
   return std::unordered_map<std::string, ConverterType>({
+      {"Abs", ConvertSimpleOperator<AbsOperator>},
       {"Add", ConvertSimpleOperator<AddOperator, 2>},
-      {"AddN", ConvertSimpleOperator<AddNOperator>},
+      {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator>},
       {"All", ConvertSimpleOperator<TensorFlowAllOperator>},
       {"Any", ConvertReduceOperator<TensorFlowAnyOperator>},
       {"ArgMax", ConvertArgMaxOperator},
@@ -2018,6 +2343,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
        ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2>},
       {"Identity", ConvertIdentityOperator},
       {"LRN", ConvertLRNOperator},
+      {"LeakyRelu", ConvertLeakyReluOperator},
       {"LegacyFedInput", ConvertPlaceholderOperator},
       {"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2>},
       {"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2>},
@@ -2056,6 +2382,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"Relu6", ConvertSimpleOperator<Relu6Operator, 1>},
       {"Reshape", ConvertSimpleOperator<TensorFlowReshapeOperator, 2>},
       {"ResizeBilinear", ConvertResizeBilinearOperator},
+      {"ResizeNearestNeighbor", ConvertResizeNearestNeighborOperator},
       {"Rsqrt", ConvertSimpleOperator<TensorFlowRsqrtOperator, 1>},
       {"Select", ConvertSimpleOperator<SelectOperator, 3>},
       {"Shape", ConvertShapeOperator},
@@ -2067,8 +2394,11 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"SpaceToDepth", ConvertSpaceToDepthOperator},
       {"SparseToDense", ConvertSparseToDenseOperator},
       {"Split", ConvertSplitOperator},
+      {"SplitV", ConvertSplitVOperator},
       {"Sqrt", ConvertSimpleOperator<TensorFlowSqrtOperator, 1>},
       {"Square", ConvertSimpleOperator<TensorFlowSquareOperator, 1>},
+      {"SquaredDifference",
+       ConvertSimpleOperator<SquaredDifferenceOperator, 2>},
       {"Squeeze", ConvertSqueezeOperator},
       {"StopGradient", ConvertIdentityOperator},
       {"StridedSlice", ConvertStridedSliceOperator},
@@ -2083,6 +2413,8 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"Transpose", ConvertSimpleOperator<TransposeOperator, 2>},
       {"Unpack", ConvertUnpackOperator},
       {"ZerosLike", ConvertSimpleOperator<TensorFlowZerosLikeOperator, 1>},
+      {"UnidirectionalSequenceLstm", ConvertUnidirectionalSequenceLstm},
+      {"MirrorPad", ConvertMirrorPadOperator},
   });
 }
 
@@ -2128,6 +2460,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
   // converted to TFLite Flex ops.
   if (!tf_import_flags.import_all_ops_as_unsupported) {
     converter_map = internal::GetTensorFlowNodeConverterMap();
+  } else {
+    converter_map = internal::GetTensorFlowNodeConverterMapForFlex();
   }
 
   for (auto node : inlined_graph.node()) {
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.h b/tensorflow/lite/toco/import_tensorflow.h
similarity index 82%
rename from tensorflow/contrib/lite/toco/import_tensorflow.h
rename to tensorflow/lite/toco/import_tensorflow.h
index c5ff96956a748d83027200aaa57d5cb456fac636..5b74ff2bc31a0f9054faf06f4067dfcaad367371 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.h
+++ b/tensorflow/lite/toco/import_tensorflow.h
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#ifndef TENSORFLOW_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#define TENSORFLOW_LITE_TOCO_IMPORT_TENSORFLOW_H_
 
 #include <memory>
 #include <string>
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 
 namespace toco {
@@ -30,7 +30,7 @@ struct TensorFlowImportFlags {
 
   // Do not recognize any op and import all ops as
   // `TensorFlowUnsupportedOperator`. This is used to populated with the
-  // `force_flex_ops` flag.
+  // `force_select_tf_ops` flag.
   bool import_all_ops_as_unsupported = false;
 };
 
@@ -44,4 +44,4 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#endif  // TENSORFLOW_LITE_TOCO_IMPORT_TENSORFLOW_H_
diff --git a/tensorflow/lite/toco/import_tensorflow_test.cc b/tensorflow/lite/toco/import_tensorflow_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0be358b1f7be2cc632322558eda3da86d16688af
--- /dev/null
+++ b/tensorflow/lite/toco/import_tensorflow_test.cc
@@ -0,0 +1,434 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/toco/import_tensorflow.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace toco {
+
+using tensorflow::AttrValue;
+using tensorflow::DT_BOOL;
+using tensorflow::DT_COMPLEX64;
+using tensorflow::DT_FLOAT;
+using tensorflow::DT_INT32;
+using tensorflow::DT_INT64;
+using tensorflow::DT_QUINT8;
+using tensorflow::DT_STRING;
+using tensorflow::NodeDef;
+using tensorflow::Status;
+
+namespace internal {
+using ConverterType = tensorflow::Status (*)(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model);
+using ConverterMapType = std::unordered_map<std::string, ConverterType>;
+
+ConverterMapType GetTensorFlowNodeConverterMap();
+Status ImportTensorFlowNode(const NodeDef&, const TensorFlowImportFlags&,
+                            Model*, const ConverterMapType&);
+}  // namespace internal
+
+namespace {
+
+Status ImportNode(const NodeDef& node, Model* model) {
+  const auto converter = internal::GetTensorFlowNodeConverterMap();
+  return internal::ImportTensorFlowNode(node, TensorFlowImportFlags(), model,
+                                        converter);
+}
+
+Status ImportFlexNode(const NodeDef& node, Model* model) {
+  // Empty converter => all nodes are flex nodes.
+  const auto converter = internal::ConverterMapType();
+  return internal::ImportTensorFlowNode(node, TensorFlowImportFlags(), model,
+                                        converter);
+}
+
+Status ImportNode(const NodeDef& node) {
+  Model model;
+  return ImportNode(node, &model);
+}
+
+NodeDef BuildNode(
+    const std::string& op,
+    const std::vector<std::initializer_list<int>>& output_shapes) {
+  NodeDef node;
+  node.set_op(op);
+  node.set_name("Node1");
+  node.add_input();
+  node.set_input(0, "Node0");
+
+  AttrValue::ListValue* shapes =
+      (*node.mutable_attr())["_output_shapes"].mutable_list();
+  for (const auto& output_shape : output_shapes) {
+    tensorflow::TensorShapeProto* shape = shapes->add_shape();
+    for (int64_t output_shape_dim : output_shape) {
+      auto shape_dim = shape->add_dim();
+      shape_dim->set_size(output_shape_dim);
+    }
+  }
+
+  return node;
+}
+
+namespace {
+void BuildConstNode(std::initializer_list<int64_t> shape,
+                    tensorflow::DataType dtype, int64_t num_elements,
+                    NodeDef* node) {
+  node->set_op("Const");
+  node->set_name("Node1");
+
+  // An attribute describing the type of this const node.
+  AttrValue dtype_attr;
+  SetAttrValue(dtype, &dtype_attr);
+  (*node->mutable_attr())["dtype"] = dtype_attr;
+
+  // An attribute describing the content of this const node.
+  tensorflow::TensorProto t;
+  t.set_dtype(dtype);
+  auto* s = t.mutable_tensor_shape();
+  for (auto d : shape) {
+    s->add_dim()->set_size(d);
+  }
+
+  // TODO(ahentz): also need to test via tensor_content()
+  switch (dtype) {
+    case DT_FLOAT:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_float_val(i / 10000.0);
+      }
+      break;
+    case DT_INT32:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_int_val(i % std::numeric_limits<int>::max());
+      }
+      break;
+    case DT_QUINT8:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_int_val(i % std::numeric_limits<uint8_t>::max());
+      }
+      break;
+    case DT_INT64:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_int64_val(i);
+      }
+      break;
+    case DT_STRING:
+      break;
+    case DT_BOOL:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_bool_val(i % 2);
+      }
+      break;
+    case DT_COMPLEX64:
+      for (int64_t i = 0; i < num_elements; ++i) {
+        t.add_scomplex_val(i / 10000.0);
+        t.add_scomplex_val(-i / 10000.0);
+      }
+      break;
+    default:
+      break;
+  }
+
+  AttrValue value_attr;
+  SetAttrValue(t, &value_attr);
+  (*node->mutable_attr())["value"] = value_attr;
+}
+}  //  namespace
+
+class ShapeImportTest : public ::testing::TestWithParam<tensorflow::DataType> {
+};
+
+TEST_P(ShapeImportTest, ShapeElementIsNegative) {
+  NodeDef node;
+  BuildConstNode({1, -2, 10}, GetParam(), 0, &node);
+  auto status = ImportNode(node);
+  EXPECT_EQ(
+      status.error_message(),
+      "Tensor shape should not include negative values\n\t (while processing "
+      "node 'Node1')");
+}
+
+TEST_P(ShapeImportTest, ShapeElementIsZero) {
+  NodeDef node;
+  // Const nodes with zero-sized, non-scalar shapes are still not importable.
+  BuildConstNode({1, 0, 10}, GetParam(), 0, &node);
+
+  Model model;
+  EXPECT_TRUE(ImportNode(node, &model).ok());
+
+  const auto& array = model.GetArray("Node1");
+  EXPECT_THAT(array.shape().dims(), ::testing::ElementsAre());
+}
+
+// Note how this is subtly different thant ShapeElementIsZero above, where toco
+// removes all shape information after import.
+TEST_P(ShapeImportTest, ShapeIsOneDimZero) {
+  NodeDef node;
+  BuildConstNode({0}, GetParam(), 0, &node);
+
+  Model model;
+  EXPECT_TRUE(ImportNode(node, &model).ok());
+
+  const auto& array = model.GetArray("Node1");
+  // We would like to have [0] shapes actually import correctly, but
+  // for some reason that slows everything down.
+  EXPECT_THAT(array.shape().dims(), ::testing::ElementsAre());
+}
+
+TEST_P(ShapeImportTest, ShapeElementTooLarge) {
+  NodeDef node;
+  BuildConstNode({3000000000}, GetParam(), 0, &node);
+  auto status = ImportNode(node);
+  EXPECT_EQ(status.error_message(),
+            "Shape element overflows\n\t (while processing node 'Node1')");
+}
+
+TEST_P(ShapeImportTest, ShapeTooLarge) {
+  NodeDef node;
+  BuildConstNode({1000000, 2000000, 2000000, 2000000}, GetParam(), 0, &node);
+  auto status = ImportNode(node);
+  EXPECT_EQ(status.error_message(),
+            "Tensor shape is too large\n\t (while processing node 'Node1')");
+}
+
+TEST_P(ShapeImportTest, ValidShapeButZeroElements) {
+  NodeDef node;
+  BuildConstNode({1, 2, 2, 2}, GetParam(), 0, &node);
+  auto status = ImportNode(node);
+  EXPECT_THAT(status.error_message(),
+              ::testing::MatchesRegex(
+                  "Neither input_content .0. nor .*_val .0. have the right "
+                  "dimensions .8. for this .* tensor\n\t .while processing "
+                  "node 'Node1'."));
+}
+
+std::vector<tensorflow::DataType> TestTypes() {
+  return {DT_FLOAT, DT_INT32, DT_INT64, DT_BOOL, DT_QUINT8, DT_COMPLEX64};
+}
+
+INSTANTIATE_TEST_CASE_P(ShapeImportTest, ShapeImportTest,
+                        ::testing::ValuesIn(TestTypes()));
+
+TEST(ImportTest, Complex64ConstNode) {
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_COMPLEX64, 6, &node);
+  Model model;
+  EXPECT_TRUE(ImportNode(node, &model).ok());
+  const auto& array = model.GetArray("Node1");
+  EXPECT_EQ(ArrayDataType::kComplex64, array.data_type);
+  EXPECT_EQ(6, array.GetBuffer<ArrayDataType::kComplex64>().Length());
+  int64_t i = 0;
+  for (const auto& datum : array.GetBuffer<ArrayDataType::kComplex64>().data) {
+    EXPECT_EQ(i / 10000.0f, std::real(datum));
+    EXPECT_EQ(-i / 10000.0f, std::imag(datum));
+    i++;
+  }
+}
+
+std::vector<std::pair<tensorflow::DataType, ArrayDataType>> UnaryTestTypes() {
+  return {{DT_FLOAT, ArrayDataType::kFloat},
+          {DT_INT32, ArrayDataType::kInt32},
+          {DT_INT64, ArrayDataType::kInt64}};
+}
+
+class TypeImportTest : public ::testing::TestWithParam<
+                           std::pair<tensorflow::DataType, ArrayDataType>> {
+ protected:
+  TypeImportTest() {}
+
+  void BuildUnaryNode(const std::string& op_name, tensorflow::DataType dtype,
+                      NodeDef* node) {
+    node->set_op(op_name);
+    node->set_name("Node1");
+
+    node->add_input();
+    node->set_input(0, "Node0");
+
+    AttrValue dtype_attr;
+    SetAttrValue(dtype, &dtype_attr);
+    (*node->mutable_attr())["T"] = dtype_attr;
+  }
+};
+
+TEST_P(TypeImportTest, BasicTypeInference) {
+  NodeDef node;
+  BuildUnaryNode("Atan", GetParam().first, &node);
+
+  Model model;
+  EXPECT_TRUE(ImportNode(node, &model).ok());
+
+  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
+  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
+  const TensorFlowUnsupportedOperator* op =
+      static_cast<const TensorFlowUnsupportedOperator*>(
+          model.operators[0].get());
+  ASSERT_THAT(op->output_data_types, ::testing::ElementsAre(GetParam().second));
+}
+INSTANTIATE_TEST_CASE_P(BasicTypeInference, TypeImportTest,
+                        ::testing::ValuesIn(UnaryTestTypes()));
+
+TEST(ImportTest, TypeInferenceWithFixedOutputType) {
+  // Create an op that has a fixed output type (bool).
+  Model model;
+  EXPECT_TRUE(ImportNode(BuildNode("IsFinite", {{1, 2}, {2, 3}}), &model).ok());
+  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
+  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
+  const TensorFlowUnsupportedOperator* op =
+      static_cast<const TensorFlowUnsupportedOperator*>(
+          model.operators[0].get());
+
+  // The static output type should be indicated in the imported op.
+  ASSERT_THAT(op->output_data_types,
+              ::testing::ElementsAre(ArrayDataType::kBool));
+}
+
+TEST(ImportTest, FailedTypeInference) {
+  // Create a unary op with no Type ("T") annotation.
+  NodeDef node;
+  node.set_op("Atan");
+  node.set_name("Node1");
+  node.add_input();
+  node.set_input(0, "Node0");
+
+  Model model;
+  EXPECT_TRUE(ImportNode(node, &model).ok());
+
+  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
+  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
+  const TensorFlowUnsupportedOperator* op =
+      static_cast<const TensorFlowUnsupportedOperator*>(
+          model.operators[0].get());
+  ASSERT_TRUE(op->output_data_types.empty());
+}
+
+TEST(ImportTest, UnsupportedOpWithOutputShapes) {
+  // Create an unsupported op with output shapes.
+  Model model;
+  EXPECT_TRUE(ImportNode(BuildNode("Atan", {{1, 2}, {2, 3}}), &model).ok());
+  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
+  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
+  const TensorFlowUnsupportedOperator* op =
+      static_cast<const TensorFlowUnsupportedOperator*>(
+          model.operators[0].get());
+
+  // The output shapes should be imported.
+  ASSERT_EQ(op->output_shapes.size(), 2);
+  ASSERT_THAT(op->output_shapes[0].dims(), ::testing::ElementsAre(1, 2));
+  ASSERT_THAT(op->output_shapes[1].dims(), ::testing::ElementsAre(2, 3));
+}
+
+TEST(ImportTest, UnsupportedOpWithWildcardOutputShapes) {
+  // Create an unsupported op with wildcard output shapes.
+  Model model;
+  EXPECT_TRUE(ImportNode(BuildNode("Atan", {{-1, 2}}), &model).ok());
+  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
+  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
+  const TensorFlowUnsupportedOperator* op =
+      static_cast<const TensorFlowUnsupportedOperator*>(
+          model.operators[0].get());
+
+  // Wildcard shapes aren't yet supported.
+  ASSERT_TRUE(op->output_shapes.empty());
+}
+
+TEST(ImportTest, UnsupportedOpWithMultipleOutputs) {
+  // This test needs an existing TensorFlow op to run correctly, because it
+  // read the OpDef from the global registry. The complex output setup of
+  // ParseExample allows us to test all nuances here, but we will need to add
+  // attributes to match the specification in the OpDef.
+  NodeDef node = BuildNode("ParseExample", {});
+
+  // Nsparse defines how many sparse indices and shapes there are. Here we set
+  // Nsparse to 2, meaning there will be 2 INT64 tensors for 'sparse_indices'
+  // and 2 INT64 tensors for 'sparse_shapes. The type of those tensors is
+  // defined in the OpDef.
+  {
+    AttrValue value_attr;
+    SetAttrValue(2, &value_attr);
+    (*node.mutable_attr())["Nsparse"] = value_attr;
+  }
+
+  // The there will be a number of 'sparse_values' tensors, defined by the
+  // attribute 'sparse_types', which is a list of types.
+  {
+    AttrValue value_attr;
+    std::vector<tensorflow::DataType> types;
+    types.push_back(tensorflow::DT_FLOAT);
+    types.push_back(tensorflow::DT_STRING);
+    SetAttrValue(types, &value_attr);
+    (*node.mutable_attr())["sparse_types"] = value_attr;
+  }
+
+  // And finally there will be 'dense_values' tensors, which are controlled by
+  // the 'Tdense' attribute.
+  {
+    AttrValue value_attr;
+    std::vector<tensorflow::DataType> types;
+    types.push_back(tensorflow::DT_STRING);
+    types.push_back(tensorflow::DT_FLOAT);
+    types.push_back(tensorflow::DT_INT64);
+    SetAttrValue(types, &value_attr);
+    (*node.mutable_attr())["Tdense"] = value_attr;
+  }
+
+  Model model;
+  EXPECT_TRUE(ImportFlexNode(node, &model).ok());
+
+  ASSERT_THAT(model.operators.size(), ::testing::Ge(1));
+  ASSERT_EQ(model.operators[0]->type, OperatorType::kUnsupported);
+  const TensorFlowUnsupportedOperator* op =
+      static_cast<const TensorFlowUnsupportedOperator*>(
+          model.operators[0].get());
+
+  ASSERT_EQ(op->outputs.size(), 9);
+  ASSERT_EQ(op->output_data_types.size(), 9);
+
+  // The 'sparse_indices' output tensors.
+  ASSERT_EQ(op->outputs[0], "Node1");
+  ASSERT_EQ(op->outputs[1], "Node1:1");
+  ASSERT_EQ(op->output_data_types[0], ArrayDataType::kInt64);
+  ASSERT_EQ(op->output_data_types[1], ArrayDataType::kInt64);
+
+  // The 'sparse_values' output tensors.
+  ASSERT_EQ(op->outputs[2], "Node1:2");
+  ASSERT_EQ(op->outputs[3], "Node1:3");
+  ASSERT_EQ(op->output_data_types[2], ArrayDataType::kFloat);
+  ASSERT_EQ(op->output_data_types[3], ArrayDataType::kString);
+
+  // The 'sparse_shapes' output tensors.
+  ASSERT_EQ(op->outputs[4], "Node1:4");
+  ASSERT_EQ(op->outputs[5], "Node1:5");
+  ASSERT_EQ(op->output_data_types[4], ArrayDataType::kInt64);
+  ASSERT_EQ(op->output_data_types[5], ArrayDataType::kInt64);
+
+  // The 'dense_shapes' output tensors.
+  ASSERT_EQ(op->outputs[6], "Node1:6");
+  ASSERT_EQ(op->outputs[7], "Node1:7");
+  ASSERT_EQ(op->outputs[8], "Node1:8");
+  ASSERT_EQ(op->output_data_types[6], ArrayDataType::kString);
+  ASSERT_EQ(op->output_data_types[7], ArrayDataType::kFloat);
+  ASSERT_EQ(op->output_data_types[8], ArrayDataType::kInt64);
+}
+
+}  // namespace
+}  // namespace toco
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
new file mode 100644
index 0000000000000000000000000000000000000000..d392535f5c98cdd3532299064f2c6d9305214e71
--- /dev/null
+++ b/tensorflow/lite/toco/model.h
@@ -0,0 +1,2213 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_MODEL_H_
+#define TENSORFLOW_LITE_TOCO_MODEL_H_
+
+#include <complex>
+#include <functional>
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_types.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+using tflite::QuantizationParams;
+
+enum class OperatorType : uint8 {
+  kNone,
+  // General-purpose neural network operators.
+  kAdd,
+  kAddN,
+  kAveragePool,
+  kBatchMatMul,
+  kBatchNormalization,
+  kConv,
+  kConcatenation,
+  kDepthwiseConv,
+  kDepthToSpace,
+  kSpaceToDepth,
+  kDequantize,
+  kDiv,
+  kExp,
+  kExpandDims,
+  kFill,
+  kFloorDiv,
+  kFloorMod,
+  kFullyConnected,
+  kL2Normalization,
+  kL2Pool,
+  kLstmCell,
+  kUnidirectionalSequenceLstm,
+  kLocalResponseNormalization,
+  kLog,
+  kLogistic,
+  kMaxPool,
+  kFakeQuant,
+  kMul,
+  kOneHot,
+  kRandomUniform,
+  kRange,
+  kRank,
+  kRelu,
+  kRelu1,
+  kRelu6,
+  kPRelu,
+  kSoftmax,
+  kLogSoftmax,
+  kSub,
+  kTanh,
+  kTransposeConv,
+  kCast,
+  kFloor,
+  kGather,
+  kResizeBilinear,
+  kSin,
+  kSpaceToBatchND,
+  kPack,
+  kBatchToSpaceND,
+  kPad,
+  kPadV2,
+  kReduceProd,  // Reduction product
+  kStridedSlice,
+  kSlice,
+  kSqueeze,
+  kMean,
+  kArgMax,
+  // The SVDF Op is a decomposition of a densely connected Op into
+  // low rank filters. For details:
+  // https://research.google.com/pubs/pub43813.html
+  kSvdf,
+  // Special operators used for importing TensorFlow nodes.
+  // The general intent is to have some graph transformation either
+  // drop them or rewrite them as general-purpose operators.
+  kAll,
+  kAssert,
+  kConcat,
+  kConcatV2,
+  kGreater,
+  kGreaterEqual,
+  kIdentity,
+  kLess,
+  kLessEqual,
+  kReduceMax,  //  Reduction Max
+  kMaximum,    //  Element-wise Maximum
+  kReduceMin,  //  Reduction Min
+  kMinimum,    //  Element-wise Minimum
+  kMatMul,
+  kMerge,
+  kNeg,
+  kReshape,
+  kRsqrt,
+  kShape,
+  kSplit,
+  kSplitV,
+  kSqrt,
+  kSquare,
+  kSquaredDifference,
+  kSum,
+  kSwitch,
+  kTile,
+  kTranspose,
+  kTopK_V2,
+  kDynamicPartition,
+  kDynamicStitch,
+  // An unsupported TF operation. It's only needed to be able to represent TF
+  // graph internally and is expected to be dropped by graph transformations.
+  kUnsupported,
+  // Finally, TensorFlow uses different conventions for axes ordering,
+  // see AxesOrder, and this cannot always be resolved at the time of importing
+  // nodes, as TensorFlow parameters may be constant-expression subgraphs
+  // instead of being given as plain constant arrays. So we need to insert
+  // special nodes in the graph to shuffle axes.
+  kReorderAxes,
+  kSelect,
+  kSparseToDense,
+  kEqual,
+  kNotEqual,
+  kPow,
+  kArgMin,
+  kAny,
+  kLogicalAnd,
+  kLogicalNot,
+  kLogicalOr,
+  kCTCBeamSearchDecoder,
+  kUnpack,
+  kZerosLike,
+  kResizeNearestNeighbor,
+  kLeakyRelu,
+  kAbs,
+  kMirrorPad
+};
+
+// Helper to deal with TensorFlow arrays using a different ordering of
+// dimensions
+// ("axes") than our own.
+// TODO(benoitjacob): Ultimately, we shouldn't have any "ordering" of axes,
+// we should have associative arrays mapping symbolic axes identifiers (like
+// "output_depth") to dimensions. We would then not need this anymore.
+enum class AxesOrder {
+  kOneAxis,  // one-dimensional array, one unique axis.
+  kCR,       // column-major matrix storage order. Our standard.
+  kRC,       // row-major matrix storage order. TensorFlow default.
+  kOHWI,     // Our standard for conv weights
+  kHWIO,     // TensorFlow conv weights
+  k1HWO,     // Our standard for DepthwiseConv weights
+  kHWIM,     // TensorFlow DepthwiseConv weights
+  kNHWC,     // TensorFlow activations
+  kHWOI,     // TensorFlow back-prop conv weights
+};
+
+// The type of the scalars in an array.
+// Note that the type does not by itself tell whether the values in the array
+// are non-quantized (can be accessed directly) or quantized (must be
+// interpreted in conjunction with QuantizationParams).
+//
+// In practice though:
+//   float values are never quantized
+//   uint8 values are always quantized
+//   int32 values are sometimes quantized (depending on whether
+//   QuantizationParams are present).
+//   complex values are never quantized
+//   other types are never quantized at the moment.
+//
+// kNone means that we don't know the data type yet, or that we don't care
+// because we'll be dropping the array anyway (e.g. some exotic array types
+// may be involved only in debug-only subgraphs that we may not be interested
+// in actually supporting).
+enum class ArrayDataType : uint8 {
+  kNone,  // 0
+  kBool,
+  kFloat,
+  kInt8,
+  kUint8,
+  kInt16,  // 5
+  kUint16,
+  kInt32,
+  kUint32,
+  kInt64,
+  kUint64,  // 10
+  kString,
+  kComplex64,
+};
+
+// Compile-time logic to map ArrayDataType to the corresponding C++ scalar type
+template <ArrayDataType A>
+struct DataTypeImpl {};
+template <>
+struct DataTypeImpl<ArrayDataType::kNone> {
+  typedef int Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kBool> {
+  typedef bool Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kFloat> {
+  typedef float Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kInt8> {
+  typedef int8 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kUint8> {
+  typedef uint8 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kInt16> {
+  typedef int16 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kUint16> {
+  typedef uint16 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kInt32> {
+  typedef int32 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kUint32> {
+  typedef uint32 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kInt64> {
+  typedef int64 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kUint64> {
+  typedef uint64 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kString> {
+  typedef string Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kComplex64> {
+  typedef std::complex<float> Type;
+};
+
+template <ArrayDataType A>
+using DataType = typename DataTypeImpl<A>::Type;
+
+// Base class for type-specific buffer types.
+struct GenericBuffer {
+  // Non-default-constructible: only ArrayDataType-specific subclass
+  // objects may be constructed.
+  GenericBuffer() = delete;
+  // Non-copyable-or-movable: we should only store pointers-to-Buffer
+  // in containers, not Operators themselves, so there should be no
+  // copy or move.
+  GenericBuffer(const GenericBuffer&) = delete;
+  GenericBuffer(const GenericBuffer&&) = delete;
+
+  // We need a virtual destructor so we can store pointers-to-Buffer
+  // in containers and have the containers call the right subclass destructor.
+  virtual ~GenericBuffer() {}
+
+  virtual int Length() const = 0;
+
+  const ArrayDataType type;
+
+ protected:
+  // Constructor used by subclasses for specific ArrayDataType's.
+  explicit GenericBuffer(ArrayDataType t) : type(t) {}
+};
+
+// Type-specific buffer, containing type-specific storage.
+template <ArrayDataType A>
+struct Buffer : GenericBuffer {
+  Buffer() : GenericBuffer(A) {}
+
+  int Length() const override { return data.size(); }
+
+  std::vector<DataType<A>> data;
+};
+
+class Shape {
+ public:
+  // For Shape, we stick to half-way encapsulation for now:
+  // we hide the raw dims_ member, but expose it raw by accessors
+  // because from some brainstorming, it's not at all easy to
+  // anticipate which flavor of more hermetic encapsulation would
+  // actually buy us future-proof-ness without being needlessly
+  // cumbersome.
+  Shape() {}
+  Shape(std::initializer_list<int> dim_list) : dims_(dim_list) {}
+
+  void ReplaceDims(std::initializer_list<int> dim_list) {
+    dims_ = std::vector<int>(dim_list);
+  }
+
+  const std::vector<int>& dims() const { return dims_; }
+  std::vector<int>* mutable_dims() { return &dims_; }
+  const int dimensions_count() const { return dims_.size(); }
+
+  // We still have that one convenience accessor to avoid
+  // the awkward double bracket issue:  shape.dims()[i].
+  int dims(int i) const {
+    // Always check for out-of-bounds accesses, even in optimized builds where
+    // standard assertions are disabled. Out-of-bounds access here is a common
+    // occurrence.
+    CHECK_GE(i, 0);
+    CHECK_GT(dims_.size(), i);
+    return dims_[i];
+  }
+
+  bool operator==(const Shape& comp) const {
+    return (this->dims_ == comp.dims());
+  }
+
+  bool operator!=(const Shape& comp) const { return !((*this) == comp); }
+
+ private:
+  std::vector<int> dims_;
+};
+
+// Base class for all operator classes.
+struct Operator {
+  // Non-default-constructible: only OperatorType-specific subclass
+  // objects may be constructed.
+  Operator() = delete;
+  // Non-copyable-or-movable: we should only store pointers-to-Operator
+  // in containers, not Operators themselves, so there should be no
+  // copy or move.
+  Operator(const Operator&) = delete;
+  Operator(const Operator&&) = delete;
+
+  // We need a virtual destructor so we can store pointers-to-Operator
+  // in containers and have the containers call the right subclass destructor.
+  virtual ~Operator() {}
+
+  // The specific type of operator. Corresponds 1:1 to subclasses.
+  const OperatorType type;
+
+  // The activation function that may be fused into this operator,
+  // or None if no activation function is fused.
+  FusedActivationFunctionType fused_activation_function;
+
+  // Input arrays: either activation arrays or constant array parameters.
+  // We refer to them by their name, not by their address; the mapping of
+  // names to addresses is given by the Model, which owns both Operator's and
+  // Array's. Thus, an Operator on its own doesn't contain much information,
+  // it is meant to be used in conjunction with the Model that owns it.
+  std::vector<string> inputs;
+
+  // Output activation arrays. Same comments as for inputs apply here too.
+  std::vector<string> outputs;
+
+  // If true, the array has more outputs than are listed in the 'outputs'
+  // member. These need to be resolved by some graph transformation.
+  // This flag is only here to indicate that an operator should not be
+  // discarded as unused, even if from its 'outputs' member alone it
+  // looks unused.
+  bool unresolved_outputs = false;
+
+  // A serialized tensorflow::NodeDef string.
+  // The field is filled only when importing from TensorFlow.
+  // It's guaranteed to be filled for `TensorFlowUnsupportedOperator`.
+  // It's not guaranteed to be filled for other ops. Ops created by graph
+  // transformations won't have TensorFlow NodeDef.
+  string tensorflow_node_def;
+
+ protected:
+  // Constructor used by subclasses for specific OperatorType's.
+  explicit Operator(OperatorType t)
+      : type(t),
+        fused_activation_function(FusedActivationFunctionType::kNone) {}
+};
+
+// Padding types for Conv-like operators. This is how padding is typically
+// specified in model files. But for inference, we will need to resolve this
+// to a FixedPadding, see below.
+enum class PaddingType { kNone, kSame, kValid };
+
+// Padding as resolved for a specific layer shape, as needed for inference.
+// For a given layer shape, a given padding type will resolve to a choice of
+// a number of padding rows and columns, which we call the padding height and
+// width respectively.
+struct FixedPadding {
+  int width = 0;
+  int height = 0;
+};
+
+// "Universal" padding struct containing both a generic PaddingType (as
+// represented in a model file), and a FixedPadding (as needed for inference).
+// The latter is resolved during the PropagateFixedSizes pass.
+struct Padding {
+  FixedPadding& GetOrCreateFixedPadding() {
+    if (!fixed) {
+      FixedPadding* ptr = new FixedPadding;
+      fixed = std::unique_ptr<FixedPadding>(ptr);
+    }
+    return *fixed;
+  }
+
+  Padding() : type(PaddingType::kNone) {}
+  PaddingType type;
+  std::unique_ptr<FixedPadding> fixed;
+};
+
+// "Convolutional" layer, as represented in model files.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the Conv weights
+//   inputs[2]: optional: the bias vector, specifying the biases for each output
+//   channel.
+//
+// Outputs:
+//   outputs[0]: required: the output activations array
+//   outputs[1]: optional: the intermediate array of im2col-replicated input
+//                         activations. Present when targeting implementations
+//                         of Conv layers as Im2col+GEMM.
+//
+// TensorFlow equivalent: Conv2D
+struct ConvOperator : Operator {
+  ConvOperator() : Operator(OperatorType::kConv) {}
+  Padding padding;
+  int stride_width = 0;
+  int stride_height = 0;
+  // A dilation_rate of 0 is invalid and this field is an optional attribute.
+  // Thus initializing it to 1 to allow default conv behavior when the
+  // attribute is not present.
+  int dilation_width_factor = 1;
+  int dilation_height_factor = 1;
+};
+
+// CTCBeamSearchDecoder operator:
+//
+// Inputs:
+//   inputs[0]: required: the logits.
+//   inputs[1]: required: sequence length.
+//   inputs[2]: optional: beam width.
+//   inputs[3]: optional: top paths.
+//   inputs[4]: optional: merge repeated.
+//
+//  Outputs:
+//    outputs[0]: deocoded.
+//    outputs[1]: log probability.
+//
+// TensorFlow equivalent: CTCBeamSearchDecoder
+struct CTCBeamSearchDecoderOperator : Operator {
+  CTCBeamSearchDecoderOperator()
+      : Operator(OperatorType::kCTCBeamSearchDecoder) {}
+  int beam_width;
+  int top_paths;
+  bool merge_repeated = true;
+};
+
+// Depthwise-separable convolution operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the DepthwiseConv weights
+//   inputs[2]: optional: the bias vector, specifying the biases for each output
+//   channel.
+//
+// TensorFlow equivalent: DepthwiseConv2dNative
+struct DepthwiseConvOperator : Operator {
+  DepthwiseConvOperator() : Operator(OperatorType::kDepthwiseConv) {}
+  Padding padding;
+  int stride_height = 0;
+  int stride_width = 0;
+  int depth_multiplier = 0;
+  // A dilation_rate of 0 is invalid and this field is an optional attribute.
+  // Thus initializing it to 1 to allow default conv behavior when the
+  // attribute is not present.
+  int dilation_width_factor = 1;
+  int dilation_height_factor = 1;
+};
+
+// Depth-to-space transform operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//
+// TensorFlow equivalent: DepthToSpace
+struct DepthToSpaceOperator : Operator {
+  DepthToSpaceOperator() : Operator(OperatorType::kDepthToSpace) {}
+  int block_size = 0;
+};
+
+// Space-to-depth transform operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//
+// TensorFlow equivalent: SpaceToDepth
+struct SpaceToDepthOperator : Operator {
+  SpaceToDepthOperator() : Operator(OperatorType::kSpaceToDepth) {}
+  int block_size = 0;
+};
+
+// Fully-connected operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the FullyConnected weights
+//   inputs[2]: optional: the bias vector, specifying the biases for each output
+//   channel.
+//
+// TensorFlow equivalent: a pair consisting of a Reshape node reshaping the
+// input activations as a matrix, followed by a MatMul node.
+struct FullyConnectedOperator : Operator {
+  FullyConnectedOperator() : Operator(OperatorType::kFullyConnected) {}
+  FullyConnectedWeightsFormat weights_format =
+      FullyConnectedWeightsFormat::kDefault;
+};
+
+// Dequantization operator, converting a quantized array of integers with
+// quantization parameters specifying how these integers correspond to real
+// numbers
+// (see QuantizationParams) to an output activations array of floating-point
+// values.
+//
+// In floating-point image models, there is typically a Dequantization operator
+// at the very beginning, converting the input image RGB data, consisting of
+// uint8 integer values, to floating-point input activations. That is where
+// image model parameters such as "mean_value" and "std_value" are typically
+// handled.
+//
+// This is the only operator type that converts from quantized to
+// floating-point,
+// and there is at the moment no operator type at all to convert from
+// floating-point
+// to quantized. Every other operator does either float->float or
+// quantized->quantized.
+//
+// Inputs:
+//   inputs[0]: required: the input quantized activations array
+//
+// TensorFlow equivalent: Dequantize
+struct DequantizeOperator : Operator {
+  DequantizeOperator() : Operator(OperatorType::kDequantize) {}
+};
+
+// Batch-normalization operator.
+//
+// We only support batch-normalization using pre-learned moments, so this is
+// just
+// computing (input - mean) * multiplier + offset. As such, this can be
+// expressed as a combination of Add and Mul nodes, and indeed this is how
+// we break it down during tooling for the purpose of fusing it into
+// other operators.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the learned mean array
+//   inputs[2]: required: the learned multiplier array
+//   inputs[3]: required: the learned offset array
+//
+// TensorFlow equivalent: a combination of Add and Mul nodes
+struct BatchNormalizationOperator : Operator {
+  BatchNormalizationOperator()
+      : Operator(OperatorType::kBatchNormalization),
+        global_normalization(false) {}
+  bool global_normalization;
+};
+
+// L2-normalization operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//
+// TensorFlow equivalent: none. In TensorFlow, L2 normalization is implemented
+// by a sub-graph of operators implementing L2-normalization
+// from lower-level arithmetic nodes; during tooling, we identify such
+// sub-graphs
+// and replace them by L2NormalizationOperator's. See IdentifyL2Normalization.
+struct L2NormalizationOperator : Operator {
+  L2NormalizationOperator() : Operator(OperatorType::kL2Normalization) {}
+};
+
+// LSTM Cell operator.
+//
+// Inputs:
+//   inputs[0]: required: the input data array
+//   inputs[1]: required: the previous output activations array
+//   inputs[2]: required: the learned weights array
+//   inputs[3]: required: the learned biases array
+//   inputs[4]: required: the previous output state
+//   outputs[0]: required: the output activations array
+//   outputs[1]: required: the new state array
+//
+// TensorFlow equivalent: none. In TensorFlow, an LSTM is implemented
+// with a sub-graph of lower-level arithmetic nodes; during tooling, we identify
+// such sub-graphs and replace them with LstmCells. See IdentifyLstmCell().
+struct LstmCellOperator : Operator {
+  enum Inputs {
+    DATA_INPUT = 0,
+    PREV_ACTIV_INPUT = 1,
+    WEIGHTS_INPUT = 2,
+    BIASES_INPUT = 3,
+    PREV_STATE_INPUT = 4,
+    NUM_INPUTS = 5
+  };
+  enum Outputs {
+    ACTIV_OUTPUT = 0,
+    STATE_OUTPUT = 1,
+    CONCAT_TEMP = 2,
+    ACTIV_TEMP = 3,
+    NUM_OUTPUTS = 4
+  };
+  enum KernelType {
+    KERNEL_BASIC = 0,
+    KERNEL_FULL = 1,
+  };
+
+  LstmCellOperator()
+      : Operator(OperatorType::kLstmCell), kernel_type(KERNEL_BASIC) {}
+
+  KernelType kernel_type;
+};
+
+struct UnidirectionalSequenceLstmOperator : Operator {
+  UnidirectionalSequenceLstmOperator()
+      : Operator(OperatorType::kUnidirectionalSequenceLstm) {}
+};
+
+// Element-wise multiplication operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Mul
+struct MulOperator : Operator {
+  MulOperator() : Operator(OperatorType::kMul) {}
+};
+
+// Element-wise Abs operator:
+//   x -> abs(x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Relu
+struct AbsOperator : Operator {
+  AbsOperator() : Operator(OperatorType::kAbs) {}
+};
+
+// Element-wise Relu operator:
+//   x -> max(0, x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Relu
+struct ReluOperator : Operator {
+  ReluOperator() : Operator(OperatorType::kRelu) {}
+};
+
+// Element-wise Relu1 operator:
+//   x -> min(max(x, -1), 1)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: none. We can construct the operator with Minimum
+// and Maximum operations
+struct Relu1Operator : Operator {
+  Relu1Operator() : Operator(OperatorType::kRelu1) {}
+};
+
+// Element-wise Relu6 operator:
+//   x -> max(0, min(6, x))
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Relu6
+struct Relu6Operator : Operator {
+  Relu6Operator() : Operator(OperatorType::kRelu6) {}
+};
+
+// PRelu
+//   f(x) = alpha * x for x < 0, f(x) = x for x >= 0.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the alpha array
+//
+// Equivalent to keras.layers.PReLU.
+struct PReluOperator : Operator {
+  PReluOperator() : Operator(OperatorType::kPRelu) {}
+};
+
+// LeakyRelu
+//   x -> max(x, alpha * x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: LeakyRelu
+struct LeakyReluOperator : Operator {
+  LeakyReluOperator() : Operator(OperatorType::kLeakyRelu) {}
+
+  float alpha = 0.2f;  // 0.2 matches the default value for the TF op attribute.
+};
+
+// Element-wise Logistic operator:
+//   x -> Logistic(x) = 1 / (1 + exp(-x))
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Sigmoid
+struct LogisticOperator : Operator {
+  LogisticOperator() : Operator(OperatorType::kLogistic) {}
+};
+
+// Element-wise natural log operator:
+//   x -> ln(x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Log
+struct LogOperator : Operator {
+  LogOperator() : Operator(OperatorType::kLog) {}
+};
+
+// Element-wise Tanh operator:
+//   x -> Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Tanh
+struct TanhOperator : Operator {
+  TanhOperator() : Operator(OperatorType::kTanh) {}
+};
+
+// Element-wise Sin operator:
+//   x -> Sin(x) = sin(x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Sin
+struct SinOperator : Operator {
+  SinOperator() : Operator(OperatorType::kSin) {}
+};
+
+// Element-wise addition operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Add
+struct AddOperator : Operator {
+  AddOperator() : Operator(OperatorType::kAdd) {}
+};
+
+// Element-wise addition operator for N inputs.
+//
+// Inputs:
+//   inputs[i]: The i-th array to add together to form the output.
+//
+// TensorFlow equivalent: AddN
+struct AddNOperator : Operator {
+  AddNOperator() : Operator(OperatorType::kAddN) {}
+};
+
+// Concatenation operator: concatenates its inputs
+// along the axis.
+//
+// Inputs: this operator accepts any number >= 1 of inputs.
+//   inputs[i]: the i-th array to concatenate.
+//
+// TensorFlow equivalent: Concat.
+struct ConcatenationOperator : Operator {
+  ConcatenationOperator() : Operator(OperatorType::kConcatenation) {}
+  int axis = 0;
+};
+
+// Reordering dimensions. Used only during tooling to transform graphs from
+// the TensorFlow format.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: none. This is only useful to convert between formats.
+struct ReorderAxesOperator : Operator {
+  ReorderAxesOperator() : Operator(OperatorType::kReorderAxes) {}
+  AxesOrder input_axes_order;
+  AxesOrder output_axes_order;
+};
+
+// Average-pooling operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: AveragePool
+struct AveragePoolOperator : Operator {
+  AveragePoolOperator() : Operator(OperatorType::kAveragePool) {}
+  Padding padding;
+  int stride_height = 0;
+  int stride_width = 0;
+  int kheight = 0;
+  int kwidth = 0;
+};
+
+// Local response normalization operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: LRN
+struct LocalResponseNormalizationOperator : Operator {
+  LocalResponseNormalizationOperator()
+      : Operator(OperatorType::kLocalResponseNormalization) {}
+
+  int range = 0;
+  float bias = 0.f;
+  float alpha = 0.f;
+  float beta = 0.f;
+};
+
+// Max-pooling operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: MaxPool
+struct MaxPoolOperator : Operator {
+  MaxPoolOperator() : Operator(OperatorType::kMaxPool) {}
+  Padding padding;
+  int stride_height = 0;
+  int stride_width = 0;
+  int kheight = 0;
+  int kwidth = 0;
+};
+
+// L2-pooling operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: none. Can be shimmed by squaring+avgpool+sqrt.
+struct L2PoolOperator : Operator {
+  L2PoolOperator() : Operator(OperatorType::kL2Pool) {}
+  Padding padding;
+  int stride_height = 0;
+  int stride_width = 0;
+  int kheight = 0;
+  int kwidth = 0;
+};
+
+// The expected [min, max] range of values in a given array.
+// Used for quantization only.
+// This information typically comes from special nodes found in quantized
+// models, see FakeQuantOperator, and is used during quantization to resolve
+// actual quantization parameters (see QuantizationParams).
+struct MinMax {
+  double min = 0.;
+  double max = 0.;
+};
+
+inline bool operator==(const MinMax& m1, const MinMax& m2) {
+  return m1.min == m2.min && m1.max == m2.max;
+}
+
+// Fake-quantization operator. This does two things:
+//   - Annotate its input and output arrays with MinMax information,
+//   - Arithmetic-wise, this operator rounds incoming activation values
+//     to the nearest representable value on the scale of 256
+//     values from the min to the max value dictated by its MinMax info.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: optional: the 'min' value, if it has not yet been resolved
+//              to a constant.
+//   inputs[2]: optional: the 'max' value, if it has not yet been resolved
+//              to a constant.
+//
+// TensorFlow equivalent: FakeQuantWithMinMaxVars, FakeQuantWithMinMaxArgs.
+struct FakeQuantOperator : Operator {
+  FakeQuantOperator() : Operator(OperatorType::kFakeQuant) {}
+  std::unique_ptr<MinMax> minmax;
+  int num_bits = 8;
+  bool narrow_range = false;
+};
+
+// Element-wise division operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Div
+struct DivOperator : Operator {
+  DivOperator() : Operator(OperatorType::kDiv) {}
+};
+
+// Element-wise identity (x->x) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Identity
+struct TensorFlowIdentityOperator : Operator {
+  TensorFlowIdentityOperator() : Operator(OperatorType::kIdentity) {}
+};
+
+// Batch matrix multiplication operator. This comes from the (deprecated)
+// tf.batch_matmul or a tf.matmul that has rank 3. dims(0) is the batch count
+// and it can be trivially unrolled into a series of matmuls on each element.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side matrix
+//   inputs[1]: required: the right-hand side matrix
+//
+// TensorFlow equivalent: MatMul
+struct BatchMatMulOperator : Operator {
+  BatchMatMulOperator() : Operator(OperatorType::kBatchMatMul) {}
+};
+
+// General matrix multiplication operator. We don't want to support general
+// matrix multiplication at inference time, so we resolve it during tooling
+// to more specific operator types, namely, FullyConnected.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side matrix
+//   inputs[1]: required: the right-hand side matrix
+//
+// TensorFlow equivalent: MatMul
+struct TensorFlowMatMulOperator : Operator {
+  TensorFlowMatMulOperator() : Operator(OperatorType::kMatMul) {}
+  bool transpose_a = false;
+  bool transpose_b = false;
+};
+
+// Padding operator. Pads a tensor with zeros.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the padding array
+//
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of
+// `input` in that dimension.
+//
+// TensorFlow equivalent: Pad
+struct PadOperator : Operator {
+  PadOperator() : Operator(OperatorType::kPad) {}
+
+  std::vector<int> left_padding;
+  std::vector<int> right_padding;
+};
+
+// PaddingV2 operator. Pads a tensor with the given constant value.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the padding array
+//   inputs[2]: required: the scalar constant_values
+//
+// This operation pads input according to the paddings and constant_values you
+// specify. paddings is an integer tensor with shape [Dn, 2], where n is the
+// rank of input. For each dimension D of input, paddings[D, 0] indicates how
+// many padding values to add before the contents of input in that dimension,
+// and paddings[D, 1] indicates how many padding values to add after the
+// contents of input in that dimension. constant_values is a scalar tensor of
+// the same type as input that indicates the value to use for padding input.
+//
+// TensorFlow equivalent: PadV2
+struct PadV2Operator : Operator {
+  PadV2Operator() : Operator(OperatorType::kPadV2) {}
+
+  std::vector<int> left_padding;
+  std::vector<int> right_padding;
+};
+
+// Strided slice operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the begin array
+//   inputs[2]: required: the end array
+//   inputs[3]: optional: the strides array
+//
+// TensorFlow equivalent: StridedSlice
+struct StridedSliceOperator : Operator {
+  StridedSliceOperator() : Operator(OperatorType::kStridedSlice) {}
+
+  std::vector<int> start_indices;
+  std::vector<int> stop_indices;
+  std::vector<int> strides;
+
+  int begin_mask;
+  int ellipsis_mask;
+  int end_mask;
+  int new_axis_mask;
+  int shrink_axis_mask;
+
+  StridedSliceOperator(const StridedSliceOperator& other)
+      : Operator(OperatorType::kStridedSlice) {
+    inputs = other.inputs;
+    outputs = other.outputs;
+
+    start_indices = other.start_indices;
+    stop_indices = other.stop_indices;
+    strides = other.strides;
+
+    begin_mask = other.begin_mask;
+    ellipsis_mask = other.ellipsis_mask;
+    end_mask = other.end_mask;
+    new_axis_mask = other.new_axis_mask;
+    shrink_axis_mask = other.shrink_axis_mask;
+  }
+
+  void PadIndices(int dim_count) {
+    // Add indices and mask bits to fully include extra dimensions
+    CHECK_GE(dim_count, start_indices.size());
+    CHECK_EQ(start_indices.size(), stop_indices.size());
+    CHECK_EQ(stop_indices.size(), strides.size());
+
+    for (int i = start_indices.size(); i < dim_count; i++) {
+      start_indices.push_back(0);
+      stop_indices.push_back(0);
+      strides.push_back(1);
+      begin_mask |= 1 << i;
+      end_mask |= 1 << i;
+    }
+  }
+
+  void ReverseIndices() {
+    CHECK_EQ(start_indices.size(), stop_indices.size());
+    CHECK_EQ(stop_indices.size(), strides.size());
+
+    std::reverse(start_indices.begin(), start_indices.end());
+    std::reverse(stop_indices.begin(), stop_indices.end());
+    std::reverse(strides.begin(), strides.end());
+
+    begin_mask = toco::port::ReverseBits32(static_cast<uint32>(begin_mask)) >>
+                 (32 - start_indices.size());
+    ellipsis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(ellipsis_mask)) >>
+        (32 - start_indices.size());
+    end_mask = toco::port::ReverseBits32(static_cast<uint32>(end_mask)) >>
+               (32 - start_indices.size());
+    new_axis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(new_axis_mask)) >>
+        (32 - start_indices.size());
+    shrink_axis_mask =
+        toco::port::ReverseBits32(static_cast<uint32>(shrink_axis_mask)) >>
+        (32 - start_indices.size());
+  }
+};
+
+// Reshaping operator, reshaping its input array to a two-dimensional shape
+// (a "matrix"). This is used in the TensorFlow format, in conjunction with
+// MatMul nodes, to implement fully-connected layers.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Reshape --- except that we only support a special case
+// here, where the output shape is a matrix (2D) shape.
+struct TensorFlowReshapeOperator : Operator {
+  TensorFlowReshapeOperator() : Operator(OperatorType::kReshape) {}
+  std::vector<int> shape;
+};
+
+// Removes dimensions of size 1 from the shape of a tensor.
+// https://www.tensorflow.org/api_docs/python/tf/squeeze
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Squeeze
+struct SqueezeOperator : Operator {
+  SqueezeOperator() : Operator(OperatorType::kSqueeze) {}
+
+  std::vector<int> squeeze_dims;
+};
+
+// Inputs:
+//   inputs[0]: required: the output shape
+//   inputs[1]: required: the weights
+//   inputs[2]: required: the input activations array
+//   NOTE: The input activations is NOT the first input.
+//
+//
+// Outputs:
+//   outputs[0]: required: the output activations array
+//
+// TensorFlow equivalent: Conv2DBackpropInput
+struct TransposeConvOperator : Operator {
+  enum Inputs {
+    OUTPUT_SHAPE = 0,
+    WEIGHTS = 1,
+    DATA_INPUT = 2,
+  };
+
+  TransposeConvOperator() : Operator(OperatorType::kTransposeConv) {}
+  Padding padding;
+  int stride_width = 0;
+  int stride_height = 0;
+  // Dilation is possible with transpose convolution, but Tensorflow does not
+  // currently support it, so we omit it.
+};
+
+// Given a tensor input, this operation calculates element-wise exponential
+// (y = e^x).
+//
+// Inputs:
+//   inputs[0]: required: input tensor
+//
+// TensorFlow equivalent: Exp
+struct ExpOperator : Operator {
+  ExpOperator() : Operator(OperatorType::kExp) {}
+};
+
+// Given a tensor input, this operation inserts a dimension of 1 at the
+// dimension index axis of input's shape. The dimension index axis starts at
+// zero; if you specify a negative number for axis it is counted backward from
+// the end.
+//
+// Inputs:
+//   inputs[0]: required: input tensor
+//   inputs[1]: required: 0-D (scalar). Specifies the dimension index at which
+//   to expand the shape of input
+//
+// TensorFlow equivalent: ExpandDims
+struct ExpandDimsOperator : Operator {
+  ExpandDimsOperator() : Operator(OperatorType::kExpandDims) {}
+};
+
+// Ceates a tensor of shape dims and fills it with the given scalar value.
+// Output type will be the same as the given scalar value.
+//
+// Inputs:
+//   inputs[0]: required: 1-D (int32) - the shape of the output tensor
+//   inputs[1]: required: 0-D (scalar) - value to fill the tensor with
+//
+// TensorFlow equivalent: Fill
+struct FillOperator : Operator {
+  FillOperator() : Operator(OperatorType::kFill) {}
+};
+
+// Element-wise floor division operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: FloorDiv
+struct FloorDivOperator : Operator {
+  FloorDivOperator() : Operator(OperatorType::kFloorDiv) {}
+};
+
+// Element-wise floor mod operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: FloorMod
+struct FloorModOperator : Operator {
+  FloorModOperator() : Operator(OperatorType::kFloorMod) {}
+};
+
+struct RandomUniformOperator : Operator {
+  RandomUniformOperator() : Operator(OperatorType::kRandomUniform) {}
+  ArrayDataType dtype = ArrayDataType::kNone;
+  int64 seed;
+  int64 seed2;
+};
+
+// Creates a sequence of numbers that begins at start and extends by increments
+// of delta up to but not including limit.
+//
+// The dtype of the resulting tensor is inferred from the inputs unless it is
+// provided explicitly.
+//
+// Inputs:
+//   inputs[0]: required: the start
+//   inputs[1]: required: the limit
+//   inputs[2]: required: the delta
+//
+// TensorFlow equivalent: Range
+struct RangeOperator : Operator {
+  RangeOperator() : Operator(OperatorType::kRange) {}
+  ArrayDataType dtype = ArrayDataType::kNone;
+};
+
+// Rank operator. Extracts the rank of the tensor.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// This operation outputs a 0-D integer tensor representing the rank of
+// the input.
+//
+// TensorFlow equivalent: Rank.  We currently assume that the output is int32
+// and not int64.  The output type could be stored herein.
+struct RankOperator : Operator {
+  RankOperator() : Operator(OperatorType::kRank) {}
+};
+
+// Element-wise negation (-x) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Neg
+struct NegOperator : Operator {
+  NegOperator() : Operator(OperatorType::kNeg) {}
+};
+
+// Element-wise select operator choosing elements from inputs[1] or input[2]
+//
+// Inputs:
+//  inputs[0]: required: boolean mask per index
+//  inputs[1]: required: tensor of values if true
+//  inputs[2]: required: tensor of values if false
+//
+//  TensorFlow equivalent: Select
+struct SelectOperator : Operator {
+  SelectOperator() : Operator(OperatorType::kSelect) {}
+};
+
+// Element-wise reciprocal-square-root (x^-0.5) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Rsqrt
+struct TensorFlowRsqrtOperator : Operator {
+  TensorFlowRsqrtOperator() : Operator(OperatorType::kRsqrt) {}
+};
+
+// Stacks a list of rank-R tensors into one rank-(R+1) tensor.
+//
+// Packs the list of tensors in values into a tensor with rank one higher than
+// each tensor in values, by packing them along the axis dimension. Given a list
+// of length N of tensors of shape (A, B, C);.
+//
+// Inputs: this operator accepts any number >= 1 of inputs.
+//   inputs[i]: the i-th array to merge.
+//
+// TensorFlow equivalent: Pack
+struct PackOperator : Operator {
+  PackOperator() : Operator(OperatorType::kPack) {}
+  int values_count;
+  int axis = 0;
+  ArrayDataType dtype = ArrayDataType::kNone;
+};
+
+// Shape operator. Extracts the shape of the tensor.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// This operation outputs a 1-D integer tensor representing the shape of
+// the input.
+//
+// TensorFlow equivalent: Shape.
+struct TensorFlowShapeOperator : Operator {
+  TensorFlowShapeOperator() : Operator(OperatorType::kShape) {}
+  ArrayDataType output_data_type = ArrayDataType::kInt32;
+};
+
+// Element-wise square-root (x^0.5) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Sqrt
+struct TensorFlowSqrtOperator : Operator {
+  TensorFlowSqrtOperator() : Operator(OperatorType::kSqrt) {}
+};
+
+// Element-wise square (x*x) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Square
+struct TensorFlowSquareOperator : Operator {
+  TensorFlowSquareOperator() : Operator(OperatorType::kSquare) {}
+};
+
+// Element-wise squared difference ((x-y)*(x-y)) operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: SquaredDifference
+struct SquaredDifferenceOperator : Operator {
+  SquaredDifferenceOperator() : Operator(OperatorType::kSquaredDifference) {}
+};
+
+// Transposes a tensor.
+//
+// By default, this operation performs a regular matrix transpose on 2-D input
+// tensors.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Transpose
+struct TransposeOperator : Operator {
+  TransposeOperator() : Operator(OperatorType::kTranspose) {}
+  std::vector<int> perm;
+};
+
+// Element-wise subtraction operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Sub
+struct SubOperator : Operator {
+  SubOperator() : Operator(OperatorType::kSub) {}
+};
+
+// Sum reduction: computes the sum of all of entries across the axes.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Sum
+struct TensorFlowSumOperator : Operator {
+  TensorFlowSumOperator() : Operator(OperatorType::kSum) {}
+  std::vector<int> axis;
+  bool keep_dims = false;
+};
+
+// Prod reduction: computes the product of all of entries across the axes.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Prod
+struct TensorFlowProdOperator : Operator {
+  TensorFlowProdOperator() : Operator(OperatorType::kReduceProd) {}
+  std::vector<int> axis;
+  bool keep_dims = false;
+};
+
+// TensorFlow Tile equivalent. Refer to TensorFlow documentation for details.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: int array with length of rank(input[0])
+struct TensorFlowTileOperator : Operator {
+  TensorFlowTileOperator() : Operator(OperatorType::kTile) {}
+};
+
+// TensorFlow Slice equivalent. Refer to TensorFlow documentation for details.
+struct SliceOperator : Operator {
+  SliceOperator() : Operator(OperatorType::kSlice) {}
+
+  std::vector<int> begin;
+  std::vector<int> size;
+};
+
+// TensorFlow Split equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+struct TensorFlowSplitOperator : Operator {
+  TensorFlowSplitOperator() : Operator(OperatorType::kSplit) {}
+  int num_split = 0;
+};
+
+// TensorFlow SplitV equivalent. Refer to TensorFlow documentation for details.
+struct TensorFlowSplitVOperator : Operator {
+  TensorFlowSplitVOperator() : Operator(OperatorType::kSplitV) {}
+  int num_split = 0;
+};
+
+// TensorFlow Concat equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Concretely, once the concat dim becomes known, if it is the depth
+// dimension then we can change this op into a DepthConcatenation op.
+// Otherwise, we hope for some other graph transformation to drop this node.
+struct TensorFlowConcatOperator : Operator {
+  TensorFlowConcatOperator() : Operator(OperatorType::kConcat) {}
+};
+
+// TensorFlow ConcatV2 equivalent. Refer to TensorFlow documentation for
+// details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Concretely, once the concat dim becomes known, if it is the depth
+// dimension then we can change this op into a DepthConcatenation op.
+// Otherwise, we hope for some other graph transformation to drop this node.
+struct TensorFlowConcatV2Operator : Operator {
+  TensorFlowConcatV2Operator() : Operator(OperatorType::kConcatV2) {}
+};
+
+// TensorFlow Merge equivalent. Refer to TensorFlow documentation for details.
+//
+// Inputs: this operator accepts any number >= 1 of inputs.
+//   inputs[i]: the i-th array to merge.
+//
+// It is expected that graph transformations will drop all but exactly one
+// of the inputs, at which point the Merge node will be equivalent to an
+// Identity node forwarding the remaining input.
+//
+// Note: We do not currently support runtime control flow: we only support
+// control flow that can be resolved at tooling time (independently of input
+// activations).
+struct TensorFlowMergeOperator : Operator {
+  TensorFlowMergeOperator() : Operator(OperatorType::kMerge) {}
+};
+
+// TensorFlow Switch equivalent. Refer to TensorFlow documentation for details.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the boolean predicate, given as an array of size 1
+//     and of type kBool, will determine which output gets selected.
+//
+// Outputs: a TensorFlow Switch node always has exactly two outputs. Depending
+// on the boolean value that the input predicate resolves to (see note below),
+// one or the other of the outputs will be 'selected': the input array will be
+// forwarded to the 'selected output' as if by a Identity node, while the other
+// output will be discarded, and any graph edge connecting that discarded output
+// will be dropped. The rule for selecting outputs is as follows:
+//   outputs[0] will be selected if the input predicate resolves to 'true'.
+//   outputs[1] will be selected if the input predicate resolves to 'false'.
+//
+// Note: We do not currently support runtime control flow: we only support
+// control flow that can be resolved at tooling time (independently of input
+// activations).
+struct TensorFlowSwitchOperator : Operator {
+  TensorFlowSwitchOperator() : Operator(OperatorType::kSwitch) {}
+};
+
+// TensorFlow All equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowAllOperator : Operator {
+  TensorFlowAllOperator() : Operator(OperatorType::kAll) {}
+};
+
+// TensorFlow Assert equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, we just drop Assert nodes.
+struct TensorFlowAssertOperator : Operator {
+  TensorFlowAssertOperator() : Operator(OperatorType::kAssert) {}
+};
+
+// TensorFlow Less equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowLessOperator : Operator {
+  TensorFlowLessOperator() : Operator(OperatorType::kLess) {}
+};
+
+// TensorFlow LessEqual equivalent. Refer to TensorFlow documentation for
+// details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowLessEqualOperator : Operator {
+  TensorFlowLessEqualOperator() : Operator(OperatorType::kLessEqual) {}
+};
+
+// TensorFlow Less equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowGreaterOperator : Operator {
+  TensorFlowGreaterOperator() : Operator(OperatorType::kGreater) {}
+};
+
+// TensorFlow GreaterEqual equivalent. Refer to TensorFlow documentation for
+// details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowGreaterEqualOperator : Operator {
+  TensorFlowGreaterEqualOperator() : Operator(OperatorType::kGreaterEqual) {}
+};
+
+// TensorFlow Equal equivalent. Refer to TensorFlow documentation for
+// details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowEqualOperator : Operator {
+  TensorFlowEqualOperator() : Operator(OperatorType::kEqual) {}
+};
+
+// TensorFlow Not Equal equivalent. Refer to TensorFlow documentation for
+// details.
+struct TensorFlowNotEqualOperator : Operator {
+  TensorFlowNotEqualOperator() : Operator(OperatorType::kNotEqual) {}
+};
+
+// Max reduction: computes the max of all of entries across the axes.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Max
+struct TensorFlowMaxOperator : Operator {
+  TensorFlowMaxOperator() : Operator(OperatorType::kReduceMax) {}
+  std::vector<int> axis;
+  bool keep_dims = false;
+};
+
+// Min reduction: computes the min of all of entries across the axes.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Min
+struct TensorFlowMinOperator : Operator {
+  TensorFlowMinOperator() : Operator(OperatorType::kReduceMin) {}
+  std::vector<int> axis;
+  bool keep_dims = false;
+};
+
+// Element-wise maximum operator. Currently it only supports scalar as
+// the second operand.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Maximum
+struct TensorFlowMaximumOperator : Operator {
+  TensorFlowMaximumOperator() : Operator(OperatorType::kMaximum) {}
+};
+
+// Element-wise minimum operator. Currently it only supports scalar as
+// the second operand.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Minimum
+struct TensorFlowMinimumOperator : Operator {
+  TensorFlowMinimumOperator() : Operator(OperatorType::kMinimum) {}
+};
+
+// General TF operation, unsupported by tf.mini. Expected to be dropped by
+// graph transformations.
+struct TensorFlowUnsupportedOperator : Operator {
+  TensorFlowUnsupportedOperator() : Operator(OperatorType::kUnsupported) {}
+
+  // The original TF operation type. Used for diagnostic purposes.
+  string tensorflow_op;
+  // A boolean indicating if the unsupported op should be treated as quantized.
+  bool quantized = false;
+  // A boolean indicating if the unsupported op output should allow float values
+  // in quantized mode.
+  bool support_output_type_float_in_quantized_op = false;
+  // Output data types
+  std::vector<ArrayDataType> output_data_types;
+  // Output shapes.
+  std::vector<Shape> output_shapes;
+};
+
+// Softmax activation function.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Softmax
+struct SoftmaxOperator : Operator {
+  SoftmaxOperator() : Operator(OperatorType::kSoftmax) {}
+  float beta = 0.f;
+};
+
+// LogSoftmax activation function.
+//
+// Inputs:
+//   inputs[0]: required: the logits input array
+//
+// TensorFlow equivalent: LogSoftmax
+struct LogSoftmaxOperator : Operator {
+  LogSoftmaxOperator() : Operator(OperatorType::kLogSoftmax) {}
+
+  // LogSoftmax can in principal have very large negative output, depending on
+  // the input size.  However, input x_i that is less than x_max-10 is
+  // accumulated as exp(x_i-x_max), which is truncated to zero.
+  //
+  // Since we effectively disregard smallish inputs in the normalizing factor,
+  // we also drop them in the output (set to minimum output), and in doing so
+  // make better use of the quantization range / resolution.
+  static constexpr float kOutputRangeMin = -16.0;
+};
+
+// Cast operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Cast
+struct CastOperator : Operator {
+  CastOperator() : Operator(OperatorType::kCast) {}
+  ArrayDataType src_data_type = ArrayDataType::kNone;
+  ArrayDataType dst_data_type = ArrayDataType::kNone;
+};
+
+// Floor operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Floor
+struct FloorOperator : Operator {
+  FloorOperator() : Operator(OperatorType::kFloor) {}
+};
+
+// Gather operator. It gathers slices from params according to indices.
+// Only 1-D indices are supported at the moment.
+//
+// Inputs:
+//   inputs[0]: required: the params array
+//   inputs[1]: required: the indices to gather
+//   inputs[2]: optional: axis
+//
+// TensorFlow equivalent: Gather
+struct GatherOperator : Operator {
+  GatherOperator() : Operator(OperatorType::kGather) {}
+  // Axis is populated explicitly or implicitly from the axis input by
+  // ResolveGatherAttributes. An empty axis indicates that the axis has not yet
+  // be resolved.
+  absl::optional<int> axis;
+
+  // This field is not used by the standard TF Lite export but it is still need
+  // for legacy Gather implementations.
+  int input_rank = 0;
+};
+
+// ArgMax operator. It returns the index of the maximum value along axis.
+//
+// Inputs:
+//   inputs[0]: required: the input tensor
+//
+// TensorFlow equivalent: ArgMax
+struct ArgMaxOperator : Operator {
+  ArgMaxOperator() : Operator(OperatorType::kArgMax) {}
+  ArrayDataType output_data_type = ArrayDataType::kInt64;
+};
+
+// ArgMin operator. It returns the index of the minimum value along axis.
+//
+// Inputs:
+//   inputs[0]: required: the input tensor
+//
+// TensorFlow equivalent: ArgMin
+struct ArgMinOperator : Operator {
+  ArgMinOperator() : Operator(OperatorType::kArgMin) {}
+  ArrayDataType output_data_type = ArrayDataType::kInt64;
+};
+
+// ResizeBilinear operator. It resizes input images with bilinear interpolation.
+// It does not support align_corners at the moment.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the new image size
+//
+// TensorFlow equivalent: ResizeBilinear
+struct ResizeBilinearOperator : Operator {
+  ResizeBilinearOperator() : Operator(OperatorType::kResizeBilinear) {}
+
+  bool align_corners = false;
+};
+
+// ResizeNearestNeighborOperator operator. It resizes input images with nearest
+// neighbor interpolation. It does not support align_corners at the moment.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the new image size
+//
+// TensorFlow equivalent: ResizeNearestNeighbor
+struct ResizeNearestNeighborOperator : Operator {
+  ResizeNearestNeighborOperator()
+      : Operator(OperatorType::kResizeNearestNeighbor) {}
+
+  bool align_corners = false;
+};
+
+// SpaceToBatchND operator. It divides spatial dimensions into a grid of
+// blocks and interleaves these blocks with the batch dimension. Currently,
+// only 2-d blocks are supported.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the block shape
+//   inputs[2]: required: the paddings
+//
+// TensorFlow equivalent: SpaceToBatchND
+struct SpaceToBatchNDOperator : Operator {
+  SpaceToBatchNDOperator() : Operator(OperatorType::kSpaceToBatchND) {}
+
+  std::vector<int> block_shape;
+  std::vector<int> before_paddings;
+  std::vector<int> after_paddings;
+};
+
+// BatchToSpaceND operator. Rearranges data from batch into blocks of
+// spatial data. Currently, only 2-d blocks are supported.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the block shape
+//   inputs[2]: required: the crops
+//
+// TensorFlow equivalent: BatchToSpaceND
+struct BatchToSpaceNDOperator : Operator {
+  BatchToSpaceNDOperator() : Operator(OperatorType::kBatchToSpaceND) {}
+
+  std::vector<int> block_shape;
+  std::vector<int> before_crops;
+  std::vector<int> after_crops;
+};
+
+// Mean operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Mean
+struct MeanOperator : Operator {
+  MeanOperator() : Operator(OperatorType::kMean) {}
+
+  std::vector<int> axis;
+  bool keep_dims = false;
+};
+
+// Svdf operator:
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: weights_feature
+//   inputs[2]: required: weights_time
+//   inputs[3]: optional: bias
+struct SvdfOperator : Operator {
+  SvdfOperator() : Operator(OperatorType::kSvdf) {}
+  int rank;
+};
+
+// TopKV2 operator.
+//
+// Inputs:
+//    input tensor and top_k scalar.
+struct TopKV2Operator : Operator {
+  TopKV2Operator() : Operator(OperatorType::kTopK_V2) {}
+};
+
+// DynamicPartition operator:
+//
+// Inputs:
+//  inputs[0]: required: data.
+//  inputs[1]: required: partitions.
+//
+// TensorFlow equivalent: DynamicPartition
+struct DynamicPartitionOperator : Operator {
+  DynamicPartitionOperator() : Operator(OperatorType::kDynamicPartition) {}
+  int num_partitions;
+};
+
+// DynamicStitch operator:
+//
+// Inputs:
+//  inputs[0,N): required: indices.
+//  inputs[N,2N): required: data.
+//
+// TensorFlow equivalent: DynamicStitch/ParallelDynamicStitch
+struct DynamicStitchOperator : Operator {
+  DynamicStitchOperator() : Operator(OperatorType::kDynamicStitch) {}
+  int num_partitions;
+};
+
+// SparseToDense operator:
+//
+// Inputs:
+// Inputs[0]: required: sparse_indices.
+// Inputs[1]: required: output_shape.
+// Inputs[2]: required: sparse_values.
+//
+// TensorFlow equivalent: SparseToDense.
+struct SparseToDenseOperator : Operator {
+  SparseToDenseOperator() : Operator(OperatorType::kSparseToDense) {}
+  bool validate_indices;
+};
+
+// Pow operator:
+//
+// Inputs:
+// Inputs[0]: required: A tensor.
+// Inputs[1]: required: A tensor.
+//
+// TensorFlow equivalent: Pow.
+struct PowOperator : Operator {
+  PowOperator() : Operator(OperatorType::kPow) {}
+};
+
+// Any operator:
+//
+// Inputs:
+// Inputs[0]: required: A boolean input tensor.
+// Inputs[1]: required: reduction_indices.
+//
+// TensorFlow equivalent: tf.reduce_any.
+struct TensorFlowAnyOperator : Operator {
+  TensorFlowAnyOperator() : Operator(OperatorType::kAny) {}
+  std::vector<int> axis;
+  bool keep_dims = false;
+};
+
+// LogicalAnd operator:
+//
+// Inputs:
+// Inputs[0]: required: A boolean tensor.
+// Inputs[1]: required: A boolean tensor.
+//
+// TensorFlow equivalent: tf.logical_and.
+struct LogicalAndOperator : Operator {
+  LogicalAndOperator() : Operator(OperatorType::kLogicalAnd) {}
+};
+
+// LogicalNot operator:
+//
+// Inputs:
+// Inputs[0]: required: A boolean tensor.
+//
+// TensorFlow equivalent: tf.logical_not.
+struct LogicalNotOperator : Operator {
+  LogicalNotOperator() : Operator(OperatorType::kLogicalNot) {}
+};
+
+// OneHot operator:
+//
+// Inputs:
+// Inputs[0]: required: indices.
+// Inputs[1]: required: depth.
+// Inputs[2]: required: on_value.
+// Inputs[3]: required: off_value.
+//
+// TensorFlow equivalent: OneHot.
+struct OneHotOperator : Operator {
+  enum Inputs {
+    INDICES_INPUT = 0,
+    DEPTH_INPUT = 1,
+    ON_VALUE_INPUT = 2,
+    OFF_VALUE_INPUT = 3,
+  };
+
+  OneHotOperator() : Operator(OperatorType::kOneHot) {}
+  int axis = -1;
+};
+
+// LogicalOr operator:
+//
+// Inputs:
+// Inputs[0]: required: A Bool tensor.
+// Inputs[1]: required: A Bool tensor.
+//
+// TensorFlow equivalent: LogicalOr.
+struct LogicalOrOperator : Operator {
+  LogicalOrOperator() : Operator(OperatorType::kLogicalOr) {}
+};
+
+// Unpack operator:
+//
+// Inputs:
+// Inputs[0]: required: A boolean input tensor.
+// Inputs[1]: required: reduction_indices.
+//
+// TensorFlow equivalent: tf.unstack.
+struct UnpackOperator : Operator {
+  UnpackOperator() : Operator(OperatorType::kUnpack) {}
+  int num;
+  int axis;
+  ArrayDataType dtype = ArrayDataType::kNone;
+};
+
+// ZerosLike operator:
+//
+// Inputs:
+// inputs[0]: required: the input array
+//
+// TensorFlow equivalent: tf.zeros_like
+struct TensorFlowZerosLikeOperator : Operator {
+  TensorFlowZerosLikeOperator() : Operator(OperatorType::kZerosLike) {}
+};
+
+enum class MirrorPadMode { kNone, kSymmetric, kReflect };
+
+// MirrorPad Operator:
+//
+// Inputs:
+// Inputs[0]: required: input tensor to be padded.
+// Inputs[1]: required: 2 Column matrix specifying padding sizes. The number of
+// rows must be the same as the rank of the input.
+// Inputs[2]: required: REFLECT or SYMMETRIC.
+//
+// TensorFlow equivalent: MirrorPad.
+struct MirrorPadOperator : Operator {
+  MirrorPadOperator() : Operator(OperatorType::kMirrorPad) {}
+  // mode is either SYMMETRIC or REFLECT.
+  MirrorPadMode mode;
+};
+
+// Alloc's are used for transient arrays only. An Alloc specifies which interval
+// of the "transient_data" workspace buffer passed to inference functions, is to
+// be used for the transient array at hand. The 'start' and 'end' values are
+// offsets from the start of the workspace buffer, expressed in bytes.
+struct Alloc {
+  int64 start = 0;
+  int64 end = 0;
+};
+
+inline bool operator<(const Alloc& a, const Alloc& b) {
+  return a.start < b.start;
+}
+
+// Array represents an array (either a constant parameter array or an
+// activations array) in a Model.
+struct Array {
+  template <ArrayDataType A>
+  const Buffer<A>& GetBuffer() const {
+    DCHECK(buffer);
+    DCHECK(buffer->type == A);
+    return *static_cast<const Buffer<A>*>(buffer.get());
+  }
+  template <ArrayDataType A>
+  Buffer<A>& GetMutableBuffer() {
+    if (!buffer) {
+      Buffer<A>* ptr = new Buffer<A>;
+      buffer = std::unique_ptr<GenericBuffer>(ptr);
+    }
+    DCHECK(buffer);
+    DCHECK(buffer->type == A);
+    return *static_cast<Buffer<A>*>(buffer.get());
+  }
+  Alloc& GetOrCreateAlloc() {
+    if (!alloc) {
+      alloc = std::unique_ptr<Alloc>(new Alloc);
+    }
+    return *alloc;
+  }
+  MinMax& GetOrCreateMinMax() {
+    if (!minmax) {
+      minmax = std::unique_ptr<MinMax>(new MinMax);
+    }
+    return *minmax;
+  }
+  MinMax& GetMinMax() const {
+    DCHECK(minmax);
+    return *minmax;
+  }
+  QuantizationParams& GetOrCreateQuantizationParams() {
+    if (!quantization_params) {
+      quantization_params =
+          std::unique_ptr<QuantizationParams>(new QuantizationParams);
+    }
+    return *quantization_params;
+  }
+  QuantizationParams& GetQuantizationParams() const {
+    DCHECK(quantization_params);
+    return *quantization_params;
+  }
+
+  // The data type of the actual elements of this array, that is:
+  //  - If there is a buffer (see 'buffer' member), it must be of the same
+  //    type.
+  //  - If there is no buffer, meaning that this is a runtime (i.e. activations)
+  //    array, then this specifies the type of elements that there will be
+  //    at runtime.
+  //
+  // Note that this only specifies the storage type of elements; this does
+  // not specify whether these are to be treated as 'real' or 'quantized'
+  // values.
+  // That is decided by whether the 'quantization_params' member is null.
+  ArrayDataType data_type = ArrayDataType::kNone;
+  // The final value that data_type should have at the end of graph
+  // transformations
+  ArrayDataType final_data_type = ArrayDataType::kNone;
+  // The dimensions of this array --- this specifies both sizes and strides
+  // (the storage layout).
+  //
+  // Issues with shape handling that remain include:
+  //   - No way to distinguish between 0-dimensional dims and missing dims.
+  //   - No way to describe dims that may be runtime-variable.
+  //   - Addressing of dims by integer index differs in different graph formats
+  //     (TensorFlow vs. other frameworks vs. what we have informally grown
+  //     within toco).
+  //     This is currently quite messy; see ReorderAxesOperator which is how we
+  //     bridge some of these discrepancies at the moment. This is overdue for
+  //     a redesign; I'm thinking that it would be nice to have more flexible
+  //     dims that allow mapping 1:1, cleanly, dims as they are in various
+  //     formats,
+  //     then explicitly convert between different conventions.
+
+  // Proto-style accessors
+  bool has_shape() const { return array_shape != nullptr; }
+  const Shape& shape() const {
+    CHECK(has_shape());
+    return *array_shape;
+  }
+  Shape* mutable_shape() {
+    if (!array_shape) {
+      array_shape.reset(new Shape);
+    }
+    return array_shape.get();
+  }
+  void copy_shape(const Shape& src_shape) { *mutable_shape() = src_shape; }
+  void clear_shape() { array_shape = nullptr; }
+
+  // The constant buffer backing this array. This is non-null if and only if
+  // this is a constant parameter array. Conversely, this is null for
+  // activations arrays.
+  //
+  // Note that this buffer is pure storage. In the case of quantized values,
+  // it only stores the quantized values, it does not know by itself about the
+  // quantization parameters necessary to interprete these values, that is
+  // in the separate 'quantization_params' field. In fact, this 'buffer' field
+  // does no even know whether values are quantized. It only has a data_type,
+  // which must equal the 'data_type' member here, and which only describes
+  // the storage type of element, does not tell whether they are quantized i.e.
+  // whether they are to be interpreted with quantization_params.
+  std::unique_ptr<GenericBuffer> buffer;
+  // Only for activation arrays (i.e. when 'buffer' is null).
+  // Only for code generation.
+  //
+  // Describes the allocation of this array within the workspace buffer
+  // allocated
+  // for all transient arrays.
+  std::unique_ptr<Alloc> alloc;
+  // Describes the [min, max] range of values
+  // to be assumed when determining quantization_params.
+  //
+  // Only used for quantization. In fact, only used for determining
+  // quantization_params.
+  //
+  // Used for both constant arrays (those having a 'buffer') and non-constant
+  // arrays (activations). Indeed, it is important to use the same min-max range
+  // as was used during training, even if that min-max range is slightly wrong
+  // w.r.t. actual buffer elements. Doing otherwise would defeat the point of
+  // re-training for quantization.
+  std::unique_ptr<MinMax> minmax;
+  // Quantization parameters. The non-null-ness of this pointer is what
+  // defines whether this array is quantized or not.
+  //
+  // If this is non-null, then these quantization parameters are to be used
+  // to assign a meaning as real numbers to the elements of this array.
+  std::unique_ptr<QuantizationParams> quantization_params;
+  // narrow_range is a detail of how toco handles FakeQuant operators with
+  // narrow_range, see
+  // https://www.tensorflow.org/api_docs/python/tf/fake_quant_with_min_max_vars
+  //
+  // For more context about what that is useful for, see the big comment in
+  // graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
+  //
+  // The narrow_range flag applies only to quantized arrays, and changes
+  // their quantization in the following way when it is set to 'true':
+  // 1. The computation of {zero_point, scale} from {min, max} needs to be
+  //    amended so that the real min value will get quantized to
+  //    (min_quantized_value + 1) instead of just (min_quantized_value).
+  //    E.g. for uint8 quantization, the real min value should get quantized to
+  //    the uint8 value 1, not 0.
+  // 2. Quantized values should get clamped to the interval
+  //    [min_quantized_value + 1, max_value]. Equivalently, the
+  //    min_quantized_value should get nudged to (min_quantized_value + 1).
+  // The reason why 1. does not imply 2. is that real values may not belong to
+  // the stated [min, max] interval. Concretely, weights recorded at the last
+  // learning step may not fall in the [min, max] interval recorded over
+  // previous learning steps, as the values evolve across learning steps.
+  //
+  // Rationale why this is directly a field on Array:
+  // - This can't be just a field on FakeQuantOperator, because
+  //   FakeQuantOperators are gone (DropFakeQuant) before we get to using that
+  //   information (Quantize). We need a place to store that bit in the interim.
+  // - This can't be in QuantizationParams because we need to record this
+  //   ahead of quantization, and QuantizationParams are only created during
+  //   quantization.
+  // - This could be in MinMax, but that would be an abuse of what MinMax is
+  //   about, and would break existing code that assumes that a MinMax is just
+  //   a min and a max. Unlike MinMax which is agnostic as to the quantized
+  //   data type, narrow_range refers to values in the quantized data type.
+  bool narrow_range = false;
+
+ private:
+  std::unique_ptr<Shape> array_shape;
+};
+
+// Our Model struct, represents an entire model (our "top-level" struct).
+// Owns everything.
+class Model {
+ public:
+  using ArrayMap = std::unordered_map<string, std::unique_ptr<Array>>;
+
+  bool HasArray(const string& name) const { return arrays.count(name) > 0; }
+  Array& GetArray(const string& name) const {
+    DCHECK(HasArray(name)) << "Array not found: " << name;
+    return *arrays.at(name);
+  }
+  Array& GetOrCreateArray(const string& name) {
+    // Make sure name is not used by an optional array
+    DCHECK(!optional_arrays.count(name));
+    if (!HasArray(name)) {
+      Array* ptr = new Array;
+      arrays[name] = std::unique_ptr<Array>(ptr);
+    }
+    Array& result = GetArray(name);
+    return result;
+  }
+  void CreateOptionalArray(const string& name) {
+    DCHECK(!arrays.count(name) && !optional_arrays.count(name));
+    optional_arrays.insert(name);
+  }
+  bool IsOptionalArray(const string& name) const {
+    return optional_arrays.count(name);
+  }
+
+  // Note that this invalidates all array iterators.
+  void EraseArray(const string& name) { arrays.erase(name); }
+  void EraseArrays(std::function<bool(const string&)> discardable) {
+    for (auto it = arrays.begin(); it != arrays.end();) {
+      if (discardable(it->first)) {
+        it = arrays.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+  const ArrayMap& GetArrayMap() const { return arrays; }
+  ArrayMap& GetMutableArrayMap() { return arrays; }
+
+  int64 ArithmeticOpsCount() const { return ops_count; }
+
+  // Optional arrays are used for optional tensors,
+  // these tensors do not have data, but with reserved names as op inputs.
+  std::set<string> optional_arrays;
+
+  // The list of operators. Notice how it's a list of unique_ptr's, implying
+  // that the Model is what owns Operator's and keeps them alive.
+  std::vector<std::unique_ptr<Operator>> operators;
+
+  // Generic flags, a place where we combine information passed to us via
+  // command-line parameters (e.g. --input_width=N) with information that
+  // we may or may not find in the input model file.
+  ModelFlags flags;
+  // For code-generation only: required size of the transient_data buffer
+  std::size_t transient_data_size = 0;
+  // For code-generation only: required alignment of the transient_data buffer
+  std::size_t transient_data_alignment = 0;
+  // Arithmetic operations performed in the model.
+  int64 ops_count = 0;
+
+ private:
+  // The associative array mapping names to Array's.
+  // Notice how it's a container of unique_ptr's, implying
+  // that the Model is what owns Array's and keeps them alive.
+  // The Operator's refer to these Array's by their name strings, not by their
+  // addresses. See Operator::inputs, Operator::outputs.
+  std::unordered_map<string, std::unique_ptr<Array>> arrays;
+};
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_MODEL_H_
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/lite/toco/model_cmdline_flags.cc
similarity index 98%
rename from tensorflow/contrib/lite/toco/model_cmdline_flags.cc
rename to tensorflow/lite/toco/model_cmdline_flags.cc
index b6a401aaf2f0028f01fd26bb0de08f19450df330..717a28bc615e0a142c41efb3afaa49f64d2a1e14 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/lite/toco/model_cmdline_flags.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/lite/toco/model_cmdline_flags.h"
 
 #include <string>
 #include <vector>
@@ -22,9 +22,9 @@ limitations under the License.
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "absl/strings/strip.h"
-#include "tensorflow/contrib/lite/toco/args.h"
-#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
-#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/args.h"
+#include "tensorflow/lite/toco/toco_graphviz_dump_options.h"
+#include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.h b/tensorflow/lite/toco/model_cmdline_flags.h
similarity index 80%
rename from tensorflow/contrib/lite/toco/model_cmdline_flags.h
rename to tensorflow/lite/toco/model_cmdline_flags.h
index c868d5c7d0b5a6ee81d99423414c87e4e6e7cf66..1642e053199b1aa4fdd41642888a5c15e184f1eb 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.h
+++ b/tensorflow/lite/toco/model_cmdline_flags.h
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#ifndef TENSORFLOW_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#define TENSORFLOW_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
 
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/args.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/types.pb.h"
+#include "tensorflow/lite/toco/args.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/types.pb.h"
 
 namespace toco {
 // Parse and remove arguments for models (in toco). Returns true if parsing
@@ -40,4 +40,4 @@ ParsedModelFlags* GlobalParsedModelFlags();
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#endif  // TENSORFLOW_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/lite/toco/model_flags.proto
similarity index 99%
rename from tensorflow/contrib/lite/toco/model_flags.proto
rename to tensorflow/lite/toco/model_flags.proto
index 6c1c53658c073668357861907d5f01fc627e4707..bcdac295d261c0e7cc04c5a8c3e2e5d88736cd88 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/lite/toco/model_flags.proto
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 syntax = "proto2";
-import "tensorflow/contrib/lite/toco/types.proto";
+import "tensorflow/lite/toco/types.proto";
 
 package toco;
 
diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..07056f66c35536e82b8f1fdd7938161e216b850a
--- /dev/null
+++ b/tensorflow/lite/toco/python/BUILD
@@ -0,0 +1,74 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "py_binary")
+
+config_setting(
+    name = "tflite_convert_with_select_tf_ops",
+    define_values = {"tflite_convert_with_select_tf_ops": "true"},
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "toco_python_api",
+    srcs = ["toco_python_api.cc"],
+    hdrs = ["toco_python_api.h"],
+    deps = [
+        "//third_party/python_runtime:headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite/toco:model_flags_proto_cc",
+        "//tensorflow/lite/toco:toco_flags_proto_cc",
+        "//tensorflow/lite/toco:toco_graphviz_dump_options",
+        "//tensorflow/lite/toco:toco_port",
+        "//tensorflow/lite/toco:toco_tooling",
+    ] + select({
+        # This is required when running `tflite_convert` from `bazel`.
+        # It requires to link with TensorFlow Ops to get the op definitions.
+        ":tflite_convert_with_select_tf_ops": [
+            "//tensorflow/core:ops",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+tf_py_wrap_cc(
+    name = "tensorflow_wrap_toco",
+    srcs = ["toco.i"],
+    deps = [
+        ":toco_python_api",
+        "//tensorflow/lite/toco:model_flags_proto_cc",
+        "//tensorflow/lite/toco:toco_flags_proto_cc",
+        "//third_party/python_runtime:headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+py_binary(
+    name = "toco_from_protos",
+    srcs = ["toco_from_protos.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tensorflow_wrap_toco",
+        "//tensorflow/python:platform",
+    ],
+)
+
+tf_py_test(
+    name = "toco_from_protos_test",
+    srcs = ["toco_from_protos_test.py"],
+    additional_deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/lite/toco:model_flags_proto_py",
+        "//tensorflow/lite/toco:toco_flags_proto_py",
+    ],
+    data = [
+        ":toco_from_protos",
+    ],
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+)
diff --git a/tensorflow/contrib/lite/toco/python/toco.i b/tensorflow/lite/toco/python/toco.i
similarity index 95%
rename from tensorflow/contrib/lite/toco/python/toco.i
rename to tensorflow/lite/toco/python/toco.i
index 0d2fbdd67b3aa59af9d5f32c4f1693fe044a7efa..c7dfdc35ab274ff6fa8c2322b6f353bceea8b11d 100644
--- a/tensorflow/contrib/lite/toco/python/toco.i
+++ b/tensorflow/lite/toco/python/toco.i
@@ -16,7 +16,7 @@ limitations under the License.
 %include "std_string.i"
 
 %{
-#include "tensorflow/contrib/lite/toco/python/toco_python_api.h"
+#include "tensorflow/lite/toco/python/toco_python_api.h"
 %}
 
 namespace toco {
diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos.py b/tensorflow/lite/toco/python/toco_from_protos.py
similarity index 96%
rename from tensorflow/contrib/lite/toco/python/toco_from_protos.py
rename to tensorflow/lite/toco/python/toco_from_protos.py
index c0b032083b2347424b9fd85ab2440e18c0f68e91..152dd241eabba3397f4bed5edc5d2650f40366b1 100644
--- a/tensorflow/contrib/lite/toco/python/toco_from_protos.py
+++ b/tensorflow/lite/toco/python/toco_from_protos.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import argparse
 import sys
-from tensorflow.contrib.lite.toco.python import tensorflow_wrap_toco
+from tensorflow.lite.toco.python import tensorflow_wrap_toco
 from tensorflow.python.platform import app
 
 FLAGS = None
diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py b/tensorflow/lite/toco/python/toco_from_protos_test.py
similarity index 95%
rename from tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
rename to tensorflow/lite/toco/python/toco_from_protos_test.py
index 75c1c8970c9cd552ab86bfe4743634c598e19127..34cfd2c59fdc3aa3c83728f622fbf5b8d02d7e00 100644
--- a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
+++ b/tensorflow/lite/toco/python/toco_from_protos_test.py
@@ -20,9 +20,9 @@ import os
 import tempfile
 
 import tensorflow as tf
-from tensorflow.contrib.lite.toco import model_flags_pb2
-from tensorflow.contrib.lite.toco import toco_flags_pb2
-from tensorflow.contrib.lite.toco import types_pb2
+from tensorflow.lite.toco import model_flags_pb2
+from tensorflow.lite.toco import toco_flags_pb2
+from tensorflow.lite.toco import types_pb2
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import resource_loader
 
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.cc b/tensorflow/lite/toco/python/toco_python_api.cc
similarity index 85%
rename from tensorflow/contrib/lite/toco/python/toco_python_api.cc
rename to tensorflow/lite/toco/python/toco_python_api.cc
index d93e104038741e6e59608f04115854d611f1f9ae..ce8e3c9df88ba511fcca9d9a256896624194463b 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/lite/toco/python/toco_python_api.cc
@@ -16,13 +16,13 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/platform/logging.h"
 
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/python/toco_python_api.h"
-#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
-#include "tensorflow/contrib/lite/toco/toco_port.h"
-#include "tensorflow/contrib/lite/toco/toco_tooling.h"
-#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/python/toco_python_api.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+#include "tensorflow/lite/toco/toco_graphviz_dump_options.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_tooling.h"
+#include "tensorflow/lite/toco/toco_types.h"
 
 namespace toco {
 
@@ -86,9 +86,12 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
       toco::Import(toco_flags, model_flags, input_contents_txt);
   toco::Transform(toco_flags, model.get());
   string output_file_contents_txt;
-  Export(toco_flags, *model, toco_flags.allow_custom_ops(),
-         &output_file_contents_txt);
-
+  auto status = Export(toco_flags, *model, toco_flags.allow_custom_ops(),
+                       &output_file_contents_txt);
+  if (!status.ok()) {
+    PyErr_SetString(PyExc_Exception, status.error_message().c_str());
+    return nullptr;
+  }
   if (extended_return) {
     PyObject* dict = PyDict_New();
     PyDict_SetItemString(
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.h b/tensorflow/lite/toco/python/toco_python_api.h
similarity index 88%
rename from tensorflow/contrib/lite/toco/python/toco_python_api.h
rename to tensorflow/lite/toco/python/toco_python_api.h
index ee054bbed9823d532bcb1f946ba0816cda95e5ea..4ab0961e1276e47ffcdc21f6c45611405c7a6f68 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.h
+++ b/tensorflow/lite/toco/python/toco_python_api.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#ifndef TENSORFLOW_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#define TENSORFLOW_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
 
 #include <Python.h>
 #include <string>
@@ -33,4 +33,4 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#endif  // TENSORFLOW_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
diff --git a/tensorflow/lite/toco/runtime/common.h b/tensorflow/lite/toco/runtime/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f83be8fa81e0f015b51f7e56b78647e3fbef7ea
--- /dev/null
+++ b/tensorflow/lite/toco/runtime/common.h
@@ -0,0 +1,26 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_RUNTIME_COMMON_H_
+#define TENSORFLOW_LITE_TOCO_RUNTIME_COMMON_H_
+
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
+#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif
+#endif
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+#endif  // TENSORFLOW_LITE_TOCO_RUNTIME_COMMON_H_
diff --git a/tensorflow/lite/toco/runtime/types.h b/tensorflow/lite/toco/runtime/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..eac9b8af6e6c786a45b0267158bf88f1a3515ae9
--- /dev/null
+++ b/tensorflow/lite/toco/runtime/types.h
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_RUNTIME_TYPES_H_
+#define TENSORFLOW_LITE_TOCO_RUNTIME_TYPES_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace toco {
+
+// TODO(ahentz): These are just stopgaps for now, untils we move all
+// the code over to tflite.
+using tflite::Dims;
+using tflite::FullyConnectedWeightsFormat;
+using tflite::FusedActivationFunctionType;
+using tflite::RequiredBufferSizeForDims;
+
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_RUNTIME_TYPES_H_
diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/BUILD b/tensorflow/lite/toco/tensorflow_graph_matching/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ae361bf212daeae5cede941111329b2265962ce6
--- /dev/null
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/BUILD
@@ -0,0 +1,90 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "cluster_utils",
+    srcs = [
+        "cluster_utils.cc",
+    ],
+    hdrs = [
+        "cluster_utils.h",
+    ],
+    deps = [
+        "//tensorflow/lite/toco:toco_port",
+    ],
+)
+
+cc_library(
+    name = "cluster",
+    srcs = [
+        "cluster.cc",
+    ],
+    hdrs = [
+        "cluster.h",
+    ],
+    deps = [
+        ":cluster_utils",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:tooling_util",
+    ],
+)
+
+cc_library(
+    name = "resolve_svdf",
+    srcs = [
+        "resolve_svdf.cc",
+    ],
+    hdrs = [
+        "resolve_svdf.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cluster",
+        ":cluster_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:toco_port",
+        "//tensorflow/lite/toco:tooling_util",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+tf_cc_test(
+    name = "resolve_svdf_test",
+    srcs = ["resolve_svdf_test.cc"],
+    deps = [
+        ":cluster",
+        ":cluster_utils",
+        ":resolve_cluster",
+        ":resolve_svdf",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "resolve_cluster",
+    srcs = [
+        "resolve_cluster.cc",
+    ],
+    hdrs = [
+        "resolve_cluster.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cluster",
+        ":cluster_utils",
+        ":resolve_svdf",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/lite/toco:tooling_util",
+    ],
+)
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.cc b/tensorflow/lite/toco/tensorflow_graph_matching/cluster.cc
similarity index 95%
rename from tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.cc
rename to tensorflow/lite/toco/tensorflow_graph_matching/cluster.cc
index 98a130ea39c45c2c8259c87779532a312433c5a7..afce05dc7a932f3c6c9dd5f1619d9708adf5238b 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.cc
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/cluster.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster.h"
 
 namespace toco {
 
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h b/tensorflow/lite/toco/tensorflow_graph_matching/cluster.h
similarity index 89%
rename from tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h
rename to tensorflow/lite/toco/tensorflow_graph_matching/cluster.h
index fda7743a27e79478d54b3708ba85c9b6390d0b0e..af268ddd3703f3e0526db3a75dd03204dfeac6d0 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/cluster.h
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H_
+#ifndef TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H_
+#define TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H_
 
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -98,4 +98,4 @@ class ClusterFactoryInterface {
 
 }  // end namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H_
+#endif  // TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.cc b/tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.cc
similarity index 95%
rename from tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.cc
rename to tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.cc
index 14c3cd6487841d6d79b583d9245c130585324d9d..8a010ef8208ee9c6e21af974640ca75ba41be0b0 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.cc
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <string>
-#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/lite/toco/toco_types.h"
 namespace toco {
 
 bool StrContains(const string& x, const string& search_pattern) {
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h b/tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.h
similarity index 82%
rename from tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h
rename to tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.h
index b57bded305ffbbcb91de880ebac081dcb4e7db82..9b9c4fc20862c0dd2549cbd143477ac6fed915ba 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_UTILS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_UTILS_H_
+#ifndef TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_UTILS_H_
+#define TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_UTILS_H_
 
 #include <string>
 
@@ -30,4 +30,4 @@ void Transpose2DTensor(const float* tensor, int row, int col,
 
 }  // end namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_UTILS_H_
+#endif  // TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_UTILS_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
similarity index 93%
rename from tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
rename to tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
index 5e421ba944cccd9746c66bc33e986b4406dd3bf5..7a1875120788a549f6a56ac5ae4fce3eaa0480b2 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h"
 
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h
similarity index 86%
rename from tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h
rename to tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h
index 3334552afb1becdba7bb980a2a362489c6b3fdaf..d7afcced7b7ac536e00e857b31c89c02d98987b6 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H_
+#ifndef TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H_
+#define TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H_
 
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 
@@ -60,4 +60,4 @@ std::unique_ptr<tensorflow::GraphDef> MaybeReplaceCompositeSubgraph(
 
 }  // end namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H_
+#endif  // TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.cc b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
similarity index 96%
rename from tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
rename to tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
index d6a099817c7b88c7dcd9c3e4e8b131c2a25cffcd..fcd9ee45d984f05eabf5d51c223b45433e801308 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
 
 #include <ctype.h>
 #include <stddef.h>
@@ -22,11 +22,11 @@ limitations under the License.
 #include <vector>
 
 #include "google/protobuf/map.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
-#include "tensorflow/contrib/lite/toco/toco_port.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h
similarity index 85%
rename from tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h
rename to tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h
index 383fd99dff225c65c5094e7bc7a61c77cc17aa38..649cadfa066f941af7c88c2d8956c72e6cca9d23 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H_
+#ifndef TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H_
+#define TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H_
 
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -79,4 +79,4 @@ class SvdfClusterFactory : public ClusterFactoryInterface {
 
 }  // end namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H_
+#endif  // TENSORFLOW_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
rename to tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
index 646d048496c27955aa641fd01a35d8acfbd8dd90..f66b59ccce663fc791ef593e8d2a71443811294f 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
 
 #include <string>
 #include <unordered_map>
@@ -20,9 +20,9 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
-#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/lite/toco/tensorflow_graph_matching/resolve_cluster.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.cc b/tensorflow/lite/toco/tensorflow_util.cc
similarity index 97%
rename from tensorflow/contrib/lite/toco/tensorflow_util.cc
rename to tensorflow/lite/toco/tensorflow_util.cc
index 0e7e9c41a066581b14fe1b78f83d8d57b916be6c..db9388b040c4e922774fa0780a0a0799e5c1361a 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_util.cc
+++ b/tensorflow/lite/toco/tensorflow_util.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tensorflow_util.h"
+#include "tensorflow/lite/toco/tensorflow_util.h"
 
 #include <string.h>
 #include <memory>
@@ -24,8 +24,8 @@ limitations under the License.
 #include "google/protobuf/map.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
-#include "tensorflow/contrib/lite/toco/toco_port.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.h b/tensorflow/lite/toco/tensorflow_util.h
similarity index 81%
rename from tensorflow/contrib/lite/toco/tensorflow_util.h
rename to tensorflow/lite/toco/tensorflow_util.h
index 61f91042685288a48ba19f8c67d4c7c1960a7787..010fbe88b21790cba742645bcbefb28da26f0cf6 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_util.h
+++ b/tensorflow/lite/toco/tensorflow_util.h
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+#ifndef TENSORFLOW_LITE_TOCO_TENSORFLOW_UTIL_H_
+#define TENSORFLOW_LITE_TOCO_TENSORFLOW_UTIL_H_
 
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/lite/toco/model.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 
@@ -29,4 +29,4 @@ void LogDumpGraphDef(int log_level, const string& message,
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+#endif  // TENSORFLOW_LITE_TOCO_TENSORFLOW_UTIL_H_
diff --git a/tensorflow/lite/toco/tflite/BUILD b/tensorflow/lite/toco/tflite/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..36ca638ee8c83f6cc1d887a0efaf2b0676f95bd8
--- /dev/null
+++ b/tensorflow/lite/toco/tflite/BUILD
@@ -0,0 +1,150 @@
+package(
+    # To suppress build cleaner error about inclusion of schema_generate.h.
+    features = ["-layering_check"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "operator",
+    srcs = [
+        "operator.cc",
+        "whitelisted_flex_ops.cc",
+        "whitelisted_flex_ops.h",
+    ],
+    hdrs = [
+        "builtin_operator.h",
+        "custom_operator.h",
+        "operator.h",
+        "simple_operator.h",
+    ],
+    deps = [
+        ":types",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/toco:graph_transformations",
+        "//tensorflow/lite/toco:model",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "operator_test",
+    srcs = [
+        "operator_test.cc",
+    ],
+    deps = [
+        ":operator",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/lite/toco:tooling_util",
+        "@com_google_googletest//:gtest_main",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "types",
+    srcs = [
+        "types.cc",
+    ],
+    hdrs = [
+        "types.h",
+    ],
+    deps = [
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/toco:model",
+    ],
+)
+
+tf_cc_test(
+    name = "types_test",
+    srcs = [
+        "types_test.cc",
+    ],
+    deps = [
+        ":types",
+        "//tensorflow/core:ops",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "export",
+    srcs = [
+        "export.cc",
+    ],
+    hdrs = [
+        "export.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":operator",
+        ":types",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:tooling_util",
+        "//tensorflow/lite/tools/optimize:quantize_weights",
+        "@com_google_absl//absl/strings",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "export_test",
+    srcs = [
+        "export_test.cc",
+    ],
+    deps = [
+        ":export",
+        "//tensorflow/core:ops",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "import",
+    srcs = [
+        "import.cc",
+    ],
+    hdrs = [
+        "import.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":operator",
+        ":types",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:tooling_util",
+        "//tensorflow/lite/tools:verifier",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "import_test",
+    srcs = [
+        "import_test.cc",
+    ],
+    deps = [
+        ":import",
+        "//tensorflow/core:ops",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest_main",
+        "@flatbuffers",
+    ],
+)
diff --git a/tensorflow/contrib/lite/toco/tflite/builtin_operator.h b/tensorflow/lite/toco/tflite/builtin_operator.h
similarity index 90%
rename from tensorflow/contrib/lite/toco/tflite/builtin_operator.h
rename to tensorflow/lite/toco/tflite/builtin_operator.h
index cfe7ecd9f982618dea3b3a5d02e69e3f15434bc2..ea012ff6e706ae0e546d93a1fd9c5cf085e25262 100644
--- a/tensorflow/contrib/lite/toco/tflite/builtin_operator.h
+++ b/tensorflow/lite/toco/tflite/builtin_operator.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#ifndef TENSORFLOW_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#define TENSORFLOW_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
 
 #include "absl/memory/memory.h"
-#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+#include "tensorflow/lite/toco/tflite/operator.h"
 
 namespace toco {
 
@@ -71,4 +71,4 @@ class BuiltinOperator : public BaseOperator {
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#endif  // TENSORFLOW_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/custom_operator.h b/tensorflow/lite/toco/tflite/custom_operator.h
similarity index 90%
rename from tensorflow/contrib/lite/toco/tflite/custom_operator.h
rename to tensorflow/lite/toco/tflite/custom_operator.h
index bd5713618ff379be42fd1b76649cfb2cf55b843d..2ca740bb90d5ad51768637c19542068b14876ede 100644
--- a/tensorflow/contrib/lite/toco/tflite/custom_operator.h
+++ b/tensorflow/lite/toco/tflite/custom_operator.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#ifndef TENSORFLOW_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#define TENSORFLOW_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
 
 #include "flatbuffers/flexbuffers.h"
 #include "absl/memory/memory.h"
-#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+#include "tensorflow/lite/toco/tflite/operator.h"
 
 namespace toco {
 
@@ -71,4 +71,4 @@ class CustomOperator : public BaseOperator {
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#endif  // TENSORFLOW_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8b9448486dfb60695cddda9dc320c4ab616e8217
--- /dev/null
+++ b/tensorflow/lite/toco/tflite/export.cc
@@ -0,0 +1,608 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/toco/tflite/export.h"
+
+#include "flatbuffers/flexbuffers.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/toco/tflite/operator.h"
+#include "tensorflow/lite/toco/tflite/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/lite/tools/optimize/quantize_weights.h"
+#include "tensorflow/lite/version.h"
+
+namespace toco {
+
+namespace tflite {
+
+using flatbuffers::FlatBufferBuilder;
+using flatbuffers::Offset;
+using flatbuffers::Vector;
+using ::tflite::Buffer;
+using ::tflite::BuiltinOperator;
+using ::tflite::BuiltinOperator_CUSTOM;
+using ::tflite::BuiltinOperator_MAX;
+using ::tflite::BuiltinOperator_MIN;
+using ::tflite::CreateBuffer;
+using ::tflite::CreateModel;
+using ::tflite::CreateOperator;
+using ::tflite::CreateTensor;
+using ::tflite::Operator;
+using ::tflite::OperatorCode;
+using ::tflite::SubGraph;
+using ::tflite::Tensor;
+
+namespace {
+
+// Check if a TensorFlow Op is a control flow op by its name.
+bool IsControlFlowOp(const string& tensorflow_op) {
+  // Technically this is equalivent to `::tensorflow::Node::IsControlFlow()`.
+  // It requires to construct a `::tensorflow::Graph` to use that helper
+  // function, so we simply hardcode the list of control flow ops here.
+  if (tensorflow_op == "Switch" || tensorflow_op == "RefSwitch" ||
+      tensorflow_op == "Merge" || tensorflow_op == "RefMerge" ||
+      tensorflow_op == "Enter" || tensorflow_op == "RefEnter" ||
+      tensorflow_op == "Exit" || tensorflow_op == "RefExit" ||
+      tensorflow_op == "NextIteration" || tensorflow_op == "RefNextIteration") {
+    return true;
+  }
+  // TODO(ycling): Also check how to handle Variable ops and Assign ops.
+  return false;
+}
+
+// Check if a TensorFlow Op is unsupportred by the Flex runtime.
+bool IsUnsupportedFlexOp(const string& tensorflow_op) {
+  if (IsControlFlowOp(tensorflow_op)) {
+    return true;
+  }
+  // `HashTableV2` isn't supported for now since it requires an additinonal
+  // initialization step.
+  // TODO(b/117651199): Support `HashTableV2` with Flex runtime.
+  if (tensorflow_op == "HashTableV2") {
+    return true;
+  }
+  return false;
+}
+
+// Map from operator name to TF Lite enum value, for all builtins.
+const std::map<string, BuiltinOperator>& GetBuiltinOpsMap() {
+  static std::map<string, BuiltinOperator>* builtin_ops = nullptr;
+  if (builtin_ops == nullptr) {
+    builtin_ops = new std::map<string, BuiltinOperator>();
+
+    for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
+      BuiltinOperator op = static_cast<BuiltinOperator>(i);
+      string name = EnumNameBuiltinOperator(op);
+      if (op != BuiltinOperator_CUSTOM && !name.empty()) {
+        (*builtin_ops)[name] = op;
+      }
+    }
+  }
+  return *builtin_ops;
+}
+
+void WriteModelToString(const flatbuffers::FlatBufferBuilder& builder,
+                        string* file_contents) {
+  const uint8_t* buffer = builder.GetBufferPointer();
+  int size = builder.GetSize();
+  *file_contents = string(reinterpret_cast<const char*>(buffer), size);
+}
+
+}  // Anonymous namespace.
+
+namespace details {
+
+OperatorKey::OperatorKey(
+    const ::toco::Operator& op,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+    bool enable_select_tf_ops) {
+  // Get the op name (by Toco definition).
+  string name = HelpfulOperatorTypeName(op);
+
+  bool is_builtin = false;
+  const auto& builtin_ops = GetBuiltinOpsMap();
+  if (ops_by_type.count(op.type) != 0) {
+    version_ = ops_by_type.at(op.type)->GetVersion(op);
+    name = ops_by_type.at(op.type)->name();
+    is_builtin = (builtin_ops.count(name) > 0);
+  }
+
+  if (is_builtin) {
+    // For TFLite supported builtin ops, find out its BuiltinOperator enum used
+    // in FlatBuffer.
+    type_ = builtin_ops.at(name);
+    return;
+  }
+  // The logic below is all for custom ops or Flex ops.
+  is_custom_op_ = true;
+  type_ = BuiltinOperator_CUSTOM;
+
+  if (op.type == OperatorType::kUnsupported) {
+    const TensorFlowUnsupportedOperator& unsupported_op =
+        static_cast<const TensorFlowUnsupportedOperator&>(op);
+    const auto tensorflow_op = unsupported_op.tensorflow_op;
+
+    if (ShouldExportAsFlexOp(enable_select_tf_ops,
+                             unsupported_op.tensorflow_op)) {
+      is_custom_op_ = false;
+      is_flex_op_ = true;
+      flex_tensorflow_op_ = tensorflow_op;
+      custom_code_ =
+          string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
+    } else {
+      custom_code_ = tensorflow_op;
+    }
+  } else if (enable_select_tf_ops && !op.tensorflow_node_def.empty()) {
+    // For Toco-supported/TFLite-unsupported ops, if the TensorFlow NodeDef
+    // is retained in the Toco Operator, we produce a Flex op if Flex mode
+    // is enabled.
+    is_custom_op_ = false;
+    is_flex_op_ = true;
+    flex_tensorflow_op_ = name;
+    custom_code_ =
+        string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
+  } else {
+    // If Flex is disabled or the original TensorFlow NodeDef isn't available,
+    // we produce a custom op. This gives developers a chance to implemenr
+    // custom ops.
+    custom_code_ = name;
+  }
+
+  if (is_flex_op_) {
+    if (IsUnsupportedFlexOp(flex_tensorflow_op_)) {
+      is_unsupported_flex_op_ = true;
+    }
+  }
+}
+
+void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
+  // First find a list of unique array names.
+  std::set<string> names;
+  for (const auto& array_pair : model.GetArrayMap()) {
+    names.insert(array_pair.first);
+  }
+
+  // Now assign indices to them and fill in the map.
+  int index = 0;
+  for (const auto& name : names) {
+    (*tensors_map)[name] = index;
+    ++index;
+  }
+}
+
+void LoadOperatorsMap(
+    const Model& model, OperatorsMap* operators_map,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+    bool enable_select_tf_ops) {
+  // First find a list of unique operator types.
+  std::set<OperatorKey> keys;
+  for (const auto& op : model.operators) {
+    keys.insert(OperatorKey(*op, ops_by_type, enable_select_tf_ops));
+  }
+  // Now assign indices to them and fill in the map.
+  int index = 0;
+  for (const auto& key : keys) {
+    (*operators_map)[key] = index;
+    ++index;
+  }
+}
+
+}  // namespace details
+
+Offset<Vector<Offset<Tensor>>> ExportTensors(
+    const Model& model, const details::TensorsMap& tensors_map,
+    FlatBufferBuilder* builder, std::vector<const Array*>* buffers_to_write,
+    const std::set<int32_t>& variable_tensor_indices) {
+  // In the end we will need to produce a vector sorted by the indices of the
+  // tensors in the tensors_map.
+  std::map<int, Offset<Tensor>> ordered_tensors;
+
+  for (const auto& array_pair : model.GetArrayMap()) {
+    const string& tensor_name = array_pair.first;
+    const toco::Array& array = *array_pair.second;
+
+    int buffer_index = buffers_to_write->size();
+    auto type = DataType::Serialize(array.data_type);
+    buffers_to_write->push_back(&array);
+
+    std::vector<int> shape;
+    if (array.has_shape()) {
+      for (int d : array.shape().dims()) {
+        shape.push_back(d);
+      }
+    }
+
+    Offset<Vector<float>> min;
+    Offset<Vector<float>> max;
+    Offset<Vector<float>> scale;
+    Offset<Vector<int64_t>> zero_point;
+    if (array.minmax) {
+      min = builder->CreateVector(
+          std::vector<float>{static_cast<float>(array.minmax->min)});
+      max = builder->CreateVector(
+          std::vector<float>{static_cast<float>(array.minmax->max)});
+    }
+    if (array.quantization_params) {
+      scale = builder->CreateVector(std::vector<float>{
+          static_cast<float>(array.quantization_params->scale)});
+      zero_point = builder->CreateVector(
+          std::vector<int64_t>{array.quantization_params->zero_point});
+    }
+    auto q_param = ::tflite::CreateQuantizationParameters(*builder, min, max,
+                                                          scale, zero_point);
+
+    int index = tensors_map.at(tensor_name);
+    bool is_variable =
+        variable_tensor_indices.find(index) != variable_tensor_indices.end();
+    ordered_tensors[index] =
+        CreateTensor(*builder, builder->CreateVector(shape), type, buffer_index,
+                     builder->CreateString(tensor_name), q_param, is_variable);
+  }
+
+  std::vector<Offset<Tensor>> tensor_vector;
+  tensor_vector.reserve(ordered_tensors.size());
+  for (const auto& tensor : ordered_tensors) {
+    tensor_vector.push_back(tensor.second);
+  }
+
+  return builder->CreateVector(tensor_vector);
+}
+
+Offset<Vector<int32_t>> ExportInputTensors(
+    const Model& model, const details::TensorsMap& tensors_map,
+    FlatBufferBuilder* builder) {
+  std::vector<int32_t> inputs;
+  for (const auto& input : model.flags.input_arrays()) {
+    inputs.push_back(tensors_map.at(input.name()));
+  }
+  return builder->CreateVector<int32_t>(inputs);
+}
+
+Offset<Vector<int32_t>> ExportOutputTensors(
+    const Model& model, const details::TensorsMap& tensors_map,
+    FlatBufferBuilder* builder) {
+  std::vector<int32_t> outputs;
+  for (const string& output : model.flags.output_arrays()) {
+    outputs.push_back(tensors_map.at(output));
+  }
+  return builder->CreateVector<int32_t>(outputs);
+}
+
+Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
+    const Model& model,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+    const details::OperatorsMap& operators_map, FlatBufferBuilder* builder,
+    const ExportParams& params) {
+  // Map from operator name to TF Lite enum value, for all builtins.
+  std::map<string, BuiltinOperator> builtin_ops;
+  for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
+    BuiltinOperator op = static_cast<BuiltinOperator>(i);
+    string name = EnumNameBuiltinOperator(op);
+    if (op != BuiltinOperator_CUSTOM && !name.empty()) {
+      builtin_ops[name] = op;
+    }
+  }
+
+  // We will need to produce a vector of codes in the same order as they
+  // appear in the operators_map.
+  std::map<int, Offset<OperatorCode>> ordered_opcodes;
+
+  for (const auto& op : model.operators) {
+    const details::OperatorKey operator_key =
+        details::OperatorKey(*op, ops_by_type, params.enable_select_tf_ops);
+    int op_index = operators_map.at(operator_key);
+
+    flatbuffers::Offset<flatbuffers::String> custom_code = 0;
+    if (!operator_key.custom_code().empty()) {
+      custom_code = builder->CreateString(operator_key.custom_code());
+    }
+
+    ordered_opcodes[op_index] = CreateOperatorCode(
+        *builder, operator_key.type(), custom_code, operator_key.version());
+  }
+
+  std::vector<Offset<OperatorCode>> opcode_vector;
+  opcode_vector.reserve(ordered_opcodes.size());
+  for (const auto& opcode : ordered_opcodes) {
+    opcode_vector.push_back(opcode.second);
+  }
+
+  return builder->CreateVector(opcode_vector);
+}
+
+Offset<Vector<Offset<Operator>>> ExportOperators(
+    const Model& model,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+    const details::OperatorsMap& operators_map,
+    const details::TensorsMap& tensors_map, FlatBufferBuilder* builder,
+    std::set<int32_t>* variable_tensor_indices, const ExportParams& params) {
+  variable_tensor_indices->clear();
+
+  auto is_tflite_builtin = [](const BaseOperator* op) {
+    const auto& tflite_builtins = GetBuiltinOpsMap();
+    return (op && tflite_builtins.find(op->name()) != tflite_builtins.end());
+  };
+
+  // The operators are in execution order, so we just follow tf.mini order.
+  std::vector<Offset<Operator>> op_vector;
+  for (const auto& op : model.operators) {
+    std::vector<int32_t> inputs;
+    for (const string& input : op->inputs) {
+      // -1 is the ID for optional tensor in TFLite output
+      int id = model.IsOptionalArray(input) ? -1 : tensors_map.at(input);
+      inputs.push_back(id);
+    }
+    std::vector<int32_t> outputs;
+    for (const string& output : op->outputs) {
+      outputs.push_back(tensors_map.at(output));
+    }
+
+    const auto key =
+        details::OperatorKey(*op, ops_by_type, params.enable_select_tf_ops);
+    int op_index = operators_map.at(key);
+
+    auto tflite_op_it = ops_by_type.find(op->type);
+    BaseOperator* tflite_op = tflite_op_it == ops_by_type.end()
+                                  ? nullptr
+                                  : tflite_op_it->second.get();
+
+    // This is a custom op unless we can find it in ops_by_type, and even then
+    // it could be a custom op (such as kUnsupported).
+    auto options = Options::Custom(0);
+
+    std::vector<bool> mutating_input_variables;
+
+    // It is conceivable that an op is exportable via Serialize() but does not
+    // have a corresponding TFLITE builtin. In that case, when flex mode is
+    // enabled we should export it as a flex op, not as a native.
+    bool export_as_flex_op = !is_tflite_builtin(tflite_op) &&
+                             key.is_flex_op() &&
+                             !op->tensorflow_node_def.empty();
+    if (export_as_flex_op) {
+      auto fbb = WriteFlexOpOptions(op->tensorflow_node_def);
+      if (fbb) {
+        options = Options::Custom(builder->CreateVector(fbb->GetBuffer()));
+      }
+    } else if (tflite_op) {
+      options = tflite_op->Serialize(*op, builder);
+      mutating_input_variables = tflite_op->GetMutatingInputVariables(*op);
+
+      if (!mutating_input_variables.empty()) {
+        for (int i = 0; i < op->inputs.size(); ++i) {
+          if (!mutating_input_variables[i]) {
+            continue;
+          }
+          int32_t variable_tensor_index = tensors_map.at(op->inputs[i]);
+          variable_tensor_indices->insert(variable_tensor_index);
+        }
+      }
+    } else {
+      // We don't know much about this op. It doesn't have a serializer and
+      // it is not supposed to be exported as a flex op. We will treat it as
+      // a regular custom op: we will still create an operator for it, but it
+      // will not have any 'options'.
+    }
+
+    // The only supported CustomOptionFormat is FLEXBUFFERS now.
+    op_vector.push_back(CreateOperator(
+        *builder, op_index, builder->CreateVector(inputs),
+        builder->CreateVector(outputs), options.type, options.builtin,
+        options.custom, ::tflite::CustomOptionsFormat_FLEXBUFFERS,
+        builder->CreateVector(mutating_input_variables)));
+  }
+
+  return builder->CreateVector(op_vector);
+}
+
+Offset<Vector<Offset<Buffer>>> ExportBuffers(
+    const Model& model, const std::vector<const Array*>& buffers_to_write,
+    FlatBufferBuilder* builder) {
+  std::vector<Offset<Buffer>> buffer_vector;
+  size_t index = 0;
+  for (const Array* array_ptr : buffers_to_write) {
+    const Array& array = *array_ptr;
+    Offset<Vector<uint8_t>> data_buffer = DataBuffer::Serialize(array, builder);
+    buffer_vector.push_back(CreateBuffer(*builder, data_buffer));
+    index++;
+  }
+  return builder->CreateVector(buffer_vector);
+}
+
+tensorflow::Status Export(const Model& model, string* output_file_contents,
+                          const ExportParams& params) {
+  const auto ops_by_type = BuildOperatorByTypeMap(params.enable_select_tf_ops);
+  return Export(model, output_file_contents, params, ops_by_type);
+}
+
+tensorflow::Status Export(
+    const Model& model, string* output_file_contents,
+    const ExportParams& params,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type) {
+  flatbuffers::FlatBufferBuilder builder(/*initial_size=*/10240);
+
+  details::TensorsMap tensors_map;
+  details::LoadTensorsMap(model, &tensors_map);
+
+  details::OperatorsMap operators_map;
+  details::LoadOperatorsMap(model, &operators_map, ops_by_type,
+                            params.enable_select_tf_ops);
+
+  std::vector<const Array*> buffers_to_write;
+  Array empty_array;
+  buffers_to_write.push_back(&empty_array);
+
+  auto op_codes =
+      ExportOperatorCodes(model, ops_by_type, operators_map, &builder, params);
+
+  for (const auto& op : model.operators) {
+    if (op->type == OperatorType::kFakeQuant) {
+      LOG(WARNING) << "FAKE_QUANT operation " << LogName(*op)
+                   << " was not converted. If running quantized make sure you "
+                      "are passing --inference_type=QUANTIZED_UINT8 and values "
+                      "for --std_values and --mean_values.";
+    }
+  }
+
+  // The set of used builtin ops.
+  std::set<string> builtin_ops;
+  // The set of custom ops (not including Flex ops).
+  std::set<string> custom_ops;
+  // The set of Flex ops which are not supported.
+  std::set<string> unsupported_flex_ops;
+
+  for (const auto& it : operators_map) {
+    const details::OperatorKey& key = it.first;
+    if (key.is_custom_op()) {
+      custom_ops.insert(key.custom_code());
+    }
+    if (key.is_unsupported_flex_op()) {
+      unsupported_flex_ops.insert(key.flex_tensorflow_op());
+    }
+    if (!key.is_custom_op() && !key.is_flex_op() &&
+        !key.is_unsupported_flex_op()) {
+      builtin_ops.insert(EnumNameBuiltinOperator(key.type()));
+    }
+  }
+
+  if (!custom_ops.empty()) {
+    if (!params.allow_custom_ops) {
+      // Remove ExpandDims and ReorderAxes from unimplemented list unless they
+      // compose the list. Both ops are removed during graph transformations.
+      // However, if an op is unimplemented earlier in the model, the graph
+      // transformation is unable to run because the output shape is not
+      // defined. This causes unnecessary confusion during model conversion
+      // time.
+      std::set<string> custom_ops_final;
+      for (const auto& op_type : custom_ops) {
+        if (op_type != "ReorderAxes" && op_type != "ExpandDims") {
+          custom_ops_final.insert(op_type);
+        }
+      }
+      if (custom_ops_final.empty()) {
+        custom_ops_final = custom_ops;
+      }
+
+      auto please_report_bug_message = []() {
+        return "We are continually in the process of adding support to "
+               "TensorFlow Lite for more ops. It would be helpful if you could "
+               "inform us of how this conversion went by opening a github "
+               "issue at "
+               "https://github.com/tensorflow/tensorflow/issues/new?template="
+               "40-tflite-op-request.md\n and pasting the following:\n\n";
+      };
+
+      if (params.enable_select_tf_ops) {
+        return tensorflow::errors::InvalidArgument(absl::StrCat(
+            please_report_bug_message(),
+            "Some of the operators in the model are not supported by "
+            "the standard TensorFlow Lite runtime and are not recognized by "
+            "TensorFlow. If you have a custom "
+            "implementation for them you can disable this error with "
+            "--allow_custom_ops, or by setting allow_custom_ops=True "
+            "when calling tf.lite.TFLiteConverter(). Here is a list "
+            "of builtin operators you are using: ",
+            absl::StrJoin(builtin_ops, ", "),
+            ". Here is a list "
+            "of operators for which you will need custom implementations: ",
+            absl::StrJoin(custom_ops_final, ", "), "."));
+      } else {
+        return tensorflow::errors::InvalidArgument(absl::StrCat(
+            please_report_bug_message(),
+            "Some of the operators in the model are not supported by "
+            "the standard TensorFlow Lite runtime. If those are native "
+            "TensorFlow operators, you might be able to use the extended "
+            "runtime by passing --enable_select_tf_ops, or by setting "
+            "target_ops=TFLITE_BUILTINS,SELECT_TF_OPS when calling "
+            "tf.lite.TFLiteConverter(). Otherwise, if you have a "
+            "custom implementation for them you can disable this error with "
+            "--allow_custom_ops, or by setting allow_custom_ops=True "
+            "when calling tf.lite.TFLiteConverter(). Here is a list "
+            "of builtin operators you are using: ",
+            absl::StrJoin(builtin_ops, ", "),
+            ". Here is a list "
+            "of operators for which you will need custom implementations: ",
+            absl::StrJoin(custom_ops_final, ", "), "."));
+      }
+    }
+
+    std::set<string> unsupported_control_flow_ops;
+    // Check if unsupported ops contains control flow ops. It's impossible
+    // to implement these ops as custom ops at the moment.
+    for (const auto& op : custom_ops) {
+      if (IsControlFlowOp(op)) {
+        unsupported_control_flow_ops.insert(op);
+      }
+    }
+    if (!unsupported_control_flow_ops.empty()) {
+      return tensorflow::errors::InvalidArgument(absl::StrCat(
+          "TensorFlow Lite currently doesn't support control flow ops: ",
+          absl::StrJoin(unsupported_control_flow_ops, ", "), "."));
+    }
+  }
+
+  if (!unsupported_flex_ops.empty()) {
+    return tensorflow::errors::InvalidArgument(
+        absl::StrCat("Some of the operators in the model are not supported by "
+                     "TensorFlow Flex runtime: ",
+                     absl::StrJoin(unsupported_flex_ops, ", "), "."));
+  }
+
+  std::set<int32_t> variable_tensor_indices;
+  auto ops = ExportOperators(model, ops_by_type, operators_map, tensors_map,
+                             &builder, &variable_tensor_indices, params);
+
+  auto tensors = ExportTensors(model, tensors_map, &builder, &buffers_to_write,
+                               variable_tensor_indices);
+  auto inputs = ExportInputTensors(model, tensors_map, &builder);
+  auto outputs = ExportOutputTensors(model, tensors_map, &builder);
+
+  // TODO(aselle): add support to toco for multiple subgraphs.
+  auto subgraph = CreateSubGraph(builder, tensors, inputs, outputs, ops,
+                                 /* name */ 0);
+  std::vector<flatbuffers::Offset<SubGraph>> subgraphs = {subgraph};
+
+  auto buffers = ExportBuffers(model, buffers_to_write, &builder);
+  auto description = builder.CreateString("TOCO Converted.");
+  auto new_model_location =
+      CreateModel(builder, TFLITE_SCHEMA_VERSION, op_codes,
+                  builder.CreateVector(subgraphs), description, buffers);
+  ::tflite::FinishModelBuffer(builder, new_model_location);
+
+  if (params.quantize_weights) {
+    // Call the quantize_weights tool.
+    LOG(INFO) << "Quantizing TFLite model after conversion to flatbuffer. "
+                 "dump_graphviz will only output the model before this "
+                 "transformation. To visualize the output graph use "
+                 "lite/tools/optimize.py.";
+    flatbuffers::FlatBufferBuilder q_builder(/*initial_size=*/10240);
+    const uint8_t* buffer = builder.GetBufferPointer();
+    const ::tflite::Model* input_model = ::tflite::GetModel(buffer);
+    if (::tflite::optimize::QuantizeWeights(&q_builder, input_model) !=
+        kTfLiteOk) {
+      return tensorflow::errors::InvalidArgument(
+          "Quantize weights transformation failed.");
+    }
+    WriteModelToString(q_builder, output_file_contents);
+  } else {
+    WriteModelToString(builder, output_file_contents);
+  }
+
+  return tensorflow::Status();
+}
+
+}  // namespace tflite
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/tflite/export.h b/tensorflow/lite/toco/tflite/export.h
new file mode 100644
index 0000000000000000000000000000000000000000..adf6757a3027e53912af03a51dbdfdcdca6b60e8
--- /dev/null
+++ b/tensorflow/lite/toco/tflite/export.h
@@ -0,0 +1,174 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_TFLITE_EXPORT_H_
+#define TENSORFLOW_LITE_TOCO_TFLITE_EXPORT_H_
+
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tflite/operator.h"
+#include "tensorflow/lite/util.h"
+
+namespace toco {
+
+namespace tflite {
+
+// The parameters for exporting a TFLite model.
+struct ExportParams {
+  bool allow_custom_ops = false;
+  bool enable_select_tf_ops = false;
+  bool quantize_weights = false;
+};
+
+// Transform the given tf.mini model into a TF Lite flatbuffer and deposit the
+// result in the given string.
+tensorflow::Status Export(const Model& model, string* output_file_contents,
+                          const ExportParams& params);
+
+// Export API with custom TFLite operator mapping.
+tensorflow::Status Export(
+    const Model& model, string* output_file_contents,
+    const ExportParams& params,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type);
+
+// This is for backward-compatibility.
+// TODO(ycling): Remove the deprecated entry functions.
+inline void Export(const Model& model, bool allow_custom_ops,
+                   bool quantize_weights, string* output_file_contents) {
+  ExportParams params;
+  params.allow_custom_ops = allow_custom_ops;
+  params.quantize_weights = quantize_weights;
+  auto status = Export(model, output_file_contents, params);
+  if (!status.ok()) LOG(QFATAL) << status.error_message();
+}
+
+// This is for backward-compatibility.
+// TODO(ycling): Remove the deprecated entry functions.
+inline void Export(
+    const Model& model, bool allow_custom_ops, bool quantize_weights,
+    string* output_file_contents,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type) {
+  ExportParams params;
+  params.allow_custom_ops = allow_custom_ops;
+  params.quantize_weights = quantize_weights;
+  auto status = Export(model, output_file_contents, params, ops_by_type);
+  if (!status.ok()) LOG(QFATAL) << status.error_message();
+}
+
+// This is for backward-compatibility.
+// TODO(ycling): Remove the deprecated entry functions.
+inline void Export(const Model& model, string* output_file_contents) {
+  ExportParams params;
+  params.allow_custom_ops = true;
+  auto status = Export(model, output_file_contents, params);
+  if (!status.ok()) LOG(QFATAL) << status.error_message();
+}
+
+namespace details {
+
+// A maps from tensor name to its final position in the TF Lite buffer.
+using TensorsMap = std::unordered_map<string, int>;
+
+// A key to identify an operator.
+// Only when `type` is `kUnsupported`, `custom_code` is filled to
+// identify which operation is used.
+class OperatorKey {
+ public:
+  OperatorKey() {}
+
+  // Construct OperatorKey by Toco op.
+  OperatorKey(
+      const ::toco::Operator& op,
+      const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+      bool enable_select_tf_ops);
+
+  // Construct OperatorKey by type, custom code and version.
+  // Note that this construct doesn't set the additional information including
+  // `is_custom_op`, `is_flex_op`, `is_unsupported_flex_op`.
+  OperatorKey(::tflite::BuiltinOperator type, const std::string& custom_code,
+              int version)
+      : type_(type), custom_code_(custom_code), version_(version) {}
+
+  // Only `type`, `custom_code` and `version` is used to compute hash and
+  // identity.
+  ::tflite::BuiltinOperator type() const { return type_; }
+  const std::string& custom_code() const { return custom_code_; }
+  int version() const { return version_; }
+
+  // The attributes below are not used to compute hash and identity.
+  //
+  // Return true if the op is a custom op. Note it will return false for Flex
+  // ops.
+  bool is_custom_op() const { return is_custom_op_; }
+  // Return true if the op is a Flex op.
+  bool is_flex_op() const { return is_flex_op_; }
+  // Return true if the op is a Flex op but it's knwon that the op is not
+  // supported by Flex runtime.
+  bool is_unsupported_flex_op() const { return is_unsupported_flex_op_; }
+  // Return the original TensorFlow op name for a Flex op.
+  const std::string& flex_tensorflow_op() const { return flex_tensorflow_op_; }
+
+  bool operator<(const OperatorKey& other) const {
+    if (type_ < other.type_)
+      return true;
+    else if (type_ > other.type_)
+      return false;
+    else if (custom_code_ < other.custom_code_)
+      return true;
+    else if (custom_code_ > other.custom_code_)
+      return false;
+    else
+      return version_ < other.version_;
+  }
+
+  bool operator==(const OperatorKey& other) const {
+    return type_ == other.type_ && custom_code_ == other.custom_code_ &&
+           version_ == other.version_;
+  }
+
+  struct Hash {
+    size_t operator()(const OperatorKey& key) const {
+      return ::tflite::CombineHashes(
+          {std::hash<size_t>()(static_cast<size_t>(key.type())),
+           std::hash<std::string>()(key.custom_code()),
+           std::hash<int>()(key.version())});
+    }
+  };
+
+ private:
+  ::tflite::BuiltinOperator type_ = ::tflite::BuiltinOperator_CUSTOM;
+  std::string custom_code_;
+  int version_ = 1;
+
+  bool is_custom_op_ = false;
+  bool is_flex_op_ = false;
+  bool is_unsupported_flex_op_ = false;
+  // The original TensorFlow op name for the flex op. Filled only when
+  // `is_flex_op` is true.
+  std::string flex_tensorflow_op_;
+};
+
+// A maps from operator type to its final position in the TF Lite buffer.
+using OperatorsMap = std::unordered_map<OperatorKey, int, OperatorKey::Hash>;
+
+void LoadTensorsMap(const Model& model, TensorsMap* tensors_map);
+void LoadOperatorsMap(
+    const Model& model, OperatorsMap* operators_map,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+    bool enable_select_tf_ops);
+
+}  // namespace details
+}  // namespace tflite
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_TFLITE_EXPORT_H_
diff --git a/tensorflow/lite/toco/tflite/export_test.cc b/tensorflow/lite/toco/tflite/export_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b371296784a34e081ae9bc5c1497348d9eb925ba
--- /dev/null
+++ b/tensorflow/lite/toco/tflite/export_test.cc
@@ -0,0 +1,572 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/toco/tflite/export.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/toco/tflite/builtin_operator.h"
+#include "tensorflow/lite/toco/tflite/operator.h"
+#include "tensorflow/lite/toco/tflite/types.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace toco {
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+class ExportTest : public ::testing::Test {
+ protected:
+  void ResetOperators() { input_model_.operators.clear(); }
+  void AddTensorsByName(std::initializer_list<string> names) {
+    for (const string& name : names) {
+      input_model_.GetOrCreateArray(name);
+    }
+  }
+  void AddOperatorsByName(std::initializer_list<string> names) {
+    for (const string& name : names) {
+      if (name == "Conv") {
+        auto* op = new ConvOperator;
+        op->padding.type = PaddingType::kSame;
+        input_model_.operators.emplace_back(op);
+      } else if (name == "Add") {
+        input_model_.operators.emplace_back(new AddOperator);
+      } else if (name == "Sub") {
+        input_model_.operators.emplace_back(new SubOperator);
+      } else if (name == "Assert") {
+        auto* op = new TensorFlowAssertOperator;
+
+        // Even though assert is known to TOCO, it doesn't have a tflite
+        // serializer, so it has to be exported as a custom op. If we attach a
+        // NodeDef to it, however, it will be exported as a flex op instead.
+        ::tensorflow::NodeDef node_def;
+        node_def.set_name("Assert");
+        node_def.set_op("Assert");
+        node_def.SerializeToString(&op->tensorflow_node_def);
+
+        input_model_.operators.emplace_back(op);
+      } else {
+        auto* op = new TensorFlowUnsupportedOperator;
+        op->tensorflow_op = name;
+        input_model_.operators.emplace_back(op);
+      }
+    }
+  }
+
+  void BuildQuantizableTestModel() {
+    input_model_.GetOrCreateArray("inputs");
+    Array& weight_array = input_model_.GetOrCreateArray("weights");
+
+    // Make the buffer large enough for QuantizeWeights transformation to take
+    // effect.
+    int buf_size = 1296;
+    auto weight_buf = absl::make_unique<float[]>(buf_size);
+    for (int i = 0; i < buf_size; i++) {
+      // Fill the array with some garbage values.
+      weight_buf[i] = static_cast<float>(i % 128);
+    }
+
+    weight_array.data_type = ArrayDataType::kFloat;
+
+    // Initialize shape for the input array.
+    Shape* weight_array_shape = weight_array.mutable_shape();
+    std::vector<int>* weight_array_shape_dim =
+        weight_array_shape->mutable_dims();
+    weight_array_shape_dim->resize(4, 6);
+    auto& weight_array_buffer =
+        weight_array.GetMutableBuffer<ArrayDataType::kFloat>();
+    weight_array_buffer.data.resize(buf_size);
+    float* buf_ptr =
+        weight_array.GetMutableBuffer<ArrayDataType::kFloat>().data.data();
+    std::copy(weight_buf.get(), weight_buf.get() + buf_size, buf_ptr);
+
+    {
+      auto* op = new ConvOperator;
+      op->padding.type = PaddingType::kSame;
+      op->inputs = {"inputs", "weights"};
+      input_model_.operators.emplace_back(op);
+    }
+    input_model_.operators.emplace_back(new AddOperator);
+  }
+
+  std::vector<string> ExportAndSummarizeOperators(const ExportParams& params) {
+    std::vector<string> names;
+
+    string result;
+    auto status = Export(input_model_, &result, params);
+    if (!status.ok()) {
+      LOG(INFO) << status.error_message();
+      return names;
+    }
+
+    auto* model = ::tflite::GetModel(result.data());
+
+    for (const ::tflite::OperatorCode* opcode : *model->operator_codes()) {
+      if (opcode->builtin_code() != ::tflite::BuiltinOperator_CUSTOM) {
+        names.push_back(string("builtin:") + ::tflite::EnumNameBuiltinOperator(
+                                                 opcode->builtin_code()));
+      } else {
+        names.push_back(string("custom:") + opcode->custom_code()->c_str());
+      }
+    }
+
+    return names;
+  }
+
+  std::vector<uint32_t> ExportAndGetOperatorIndices(
+      const ExportParams& params) {
+    std::vector<uint32_t> indices;
+
+    string result;
+    if (!Export(input_model_, &result, params).ok()) return indices;
+    auto* model = ::tflite::GetModel(result.data());
+
+    auto operators = (*model->subgraphs())[0]->operators();
+    for (const auto* op : *operators) {
+      indices.push_back(op->opcode_index());
+    }
+    return indices;
+  }
+
+  Model input_model_;
+};
+
+TEST_F(ExportTest, LoadTensorsMap) {
+  AddTensorsByName({"tensor_one", "tensor_two"});
+
+  details::TensorsMap tensors;
+  details::LoadTensorsMap(input_model_, &tensors);
+  EXPECT_EQ(0, tensors["tensor_one"]);
+  EXPECT_EQ(1, tensors["tensor_two"]);
+}
+
+TEST_F(ExportTest, LoadOperatorsMap) {
+  AddOperatorsByName({"Conv", "Add", "MyCrazyOp", "Sub"});
+
+  details::OperatorsMap operators;
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
+  EXPECT_EQ(
+      0, operators[details::OperatorKey(::tflite::BuiltinOperator_ADD, "", 1)]);
+  EXPECT_EQ(1, operators[details::OperatorKey(::tflite::BuiltinOperator_CONV_2D,
+                                              "", 1)]);
+  EXPECT_EQ(2, operators[details::OperatorKey(::tflite::BuiltinOperator_CUSTOM,
+                                              "MyCrazyOp", 1)]);
+  EXPECT_EQ(
+      3, operators[details::OperatorKey(::tflite::BuiltinOperator_SUB, "", 1)]);
+}
+
+TEST_F(ExportTest, Export) {
+  AddOperatorsByName({"Conv", "Add", "MyCrazyOp", "Sub"});
+
+  ExportParams params;
+  params.allow_custom_ops = true;
+  params.enable_select_tf_ops = false;
+  params.quantize_weights = false;
+
+  EXPECT_THAT(ExportAndSummarizeOperators(params),
+              ElementsAre("builtin:ADD", "builtin:CONV_2D", "custom:MyCrazyOp",
+                          "builtin:SUB"));
+  EXPECT_THAT(ExportAndGetOperatorIndices(params), ElementsAre(1, 0, 2, 3));
+}
+
+TEST_F(ExportTest, QuantizeWeights) {
+  // Sanity check for quantize_weights parameter.
+  BuildQuantizableTestModel();
+  string unquantized_result;
+  Export(input_model_, true, /*quantize_weights*/ false, &unquantized_result);
+
+  BuildQuantizableTestModel();
+  string quantized_result;
+  Export(input_model_, true, /*quantize_weights*/ true, &quantized_result);
+
+  // The quantized models should be smaller.
+  EXPECT_LT(quantized_result.size(), unquantized_result.size());
+}
+
+class OpSetsTest : public ExportTest {
+ public:
+  enum OpSet { kTfLiteBuiltins, kSelectTfOps, kCustomOps };
+
+  void SetAllowedOpSets(std::initializer_list<OpSet> sets) {
+    import_all_ops_as_unsupported_ = true;
+    params_.allow_custom_ops = false;
+    params_.enable_select_tf_ops = false;
+    params_.quantize_weights = false;
+
+    for (OpSet i : sets) {
+      switch (i) {
+        case kTfLiteBuiltins:
+          import_all_ops_as_unsupported_ = false;
+          break;
+        case kSelectTfOps:
+          params_.enable_select_tf_ops = true;
+          break;
+        case kCustomOps:
+          params_.allow_custom_ops = true;
+          break;
+      }
+    }
+  }
+
+  std::vector<string> ImportExport(std::initializer_list<string> op_names) {
+    ResetOperators();
+    if (!import_all_ops_as_unsupported_) {
+      AddOperatorsByName(op_names);
+    } else {
+      for (const string& name : op_names) {
+        auto* op = new TensorFlowUnsupportedOperator;
+        op->tensorflow_op = name;
+        input_model_.operators.emplace_back(op);
+      }
+    }
+    return ExportAndSummarizeOperators(params_);
+  }
+
+ private:
+  bool import_all_ops_as_unsupported_;
+  ExportParams params_;
+};
+
+TEST_F(OpSetsTest, BuiltinsOnly) {
+  // --target_op_set=TFLITE_BUILTINS
+  SetAllowedOpSets({kTfLiteBuiltins});
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold", "Assert"}),
+              ElementsAre());
+  EXPECT_THAT(ImportExport({"Add"}), ElementsAre("builtin:ADD"));
+
+  // --target_op_set=TFLITE_BUILTINS --allow_custom_ops
+  SetAllowedOpSets({kTfLiteBuiltins, kCustomOps});
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold", "Assert"}),
+              ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:Assert",
+                          "custom:UnrollAndFold"));
+}
+
+TEST_F(OpSetsTest, TfSelectOnly) {
+  // --target_op_set=SELECT_TF_OPS
+  SetAllowedOpSets({kSelectTfOps});
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "RandomUniform",
+                            "UnrollAndFold", "Assert"}),
+              ElementsAre());
+  EXPECT_THAT(ImportExport({"Add"}), ElementsAre("custom:FlexAdd"));
+
+  // --target_op_set=SELECT_TF_OPS --allow_custom_ops
+  SetAllowedOpSets({kSelectTfOps, kCustomOps});
+  EXPECT_THAT(
+      ImportExport(
+          {"Add", "AdjustHue", "RandomUniform", "UnrollAndFold", "Assert"}),
+      ElementsAre("custom:AdjustHue", "custom:FlexAdd", "custom:FlexAssert",
+                  "custom:FlexRandomUniform", "custom:UnrollAndFold"));
+}
+
+TEST_F(OpSetsTest, BuiltinsAndTfSelect) {
+  // --target_op_set=TFLITE_BUILTINS,SELECT_TF_OPS
+  SetAllowedOpSets({kTfLiteBuiltins, kSelectTfOps});
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold", "Assert"}),
+              ElementsAre());
+  EXPECT_THAT(ImportExport({"Add", "RandomUniform"}),
+              ElementsAre("builtin:ADD", "custom:FlexRandomUniform"));
+
+  // --target_op_set=TFLITE_BUILTINS,SELECT_TF_OPS --allow_custom_ops
+  SetAllowedOpSets({kTfLiteBuiltins, kSelectTfOps, kCustomOps});
+  EXPECT_THAT(
+      ImportExport(
+          {"Add", "AdjustHue", "RandomUniform", "UnrollAndFold", "Assert"}),
+      ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:FlexAssert",
+                  "custom:FlexRandomUniform", "custom:UnrollAndFold"));
+}
+
+// This test is based on a hypothetical scenario that dilation is supported
+// only in Conv version 2. So Toco populates version=1 when dialation
+// parameters are all 1, and version=2 otehrwise.
+class FakeConvolutionOperator
+    : public BuiltinOperator<ConvOperator, ::tflite::Conv2DOptions,
+                             ::tflite::BuiltinOptions_Conv2DOptions> {
+ public:
+  FakeConvolutionOperator()
+      : BuiltinOperator(::tflite::BuiltinOperator_CONV_2D,
+                        OperatorType::kConv) {}
+
+  // Returning the op version according to the op parameters.
+  int GetVersion(const Operator& op) const override {
+    const TocoOperator& conv_op = static_cast<const TocoOperator&>(op);
+    if (conv_op.dilation_width_factor != 1 ||
+        conv_op.dilation_height_factor != 1) {
+      // Version 2 if dilation is used.
+      return 2;
+    }
+    return 1;
+  }
+
+  // Note: The read / write code doesn't need to be changed if we stick with
+  // the restrictions:
+  // * Only adding parameters at the bottom of the Flatbuffer tables.
+  // * When the default value of parameters are used, the op works consistently
+  //   with the previous version.
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto padding = Padding::Serialize(op.padding.type);
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateConv2DOptions(*builder, padding, op.stride_width,
+                                         op.stride_height, activation_function,
+                                         op.dilation_width_factor,
+                                         op.dilation_height_factor);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->padding.type = Padding::Deserialize(options.padding());
+    op->stride_width = options.stride_w();
+    op->stride_height = options.stride_h();
+    op->dilation_width_factor = options.dilation_w_factor();
+    op->dilation_height_factor = options.dilation_h_factor();
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class VersionedOpExportTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    input_model_.GetOrCreateArray("input");
+    input_model_.GetOrCreateArray("filter");
+    input_model_.GetOrCreateArray("output");
+  }
+  void AddConvOp(bool use_dialation) {
+    {
+      auto* op = new ConvOperator;
+      op->inputs.push_back("input");
+      op->inputs.push_back("filter");
+      op->inputs.push_back("output");
+
+      op->padding.type = PaddingType::kSame;
+      op->stride_width = 1;
+      op->stride_height = 1;
+      if (use_dialation) {
+        op->dilation_width_factor = 2;
+        op->dilation_height_factor = 2;
+      } else {
+        op->dilation_width_factor = 1;
+        op->dilation_height_factor = 1;
+      }
+      input_model_.operators.emplace_back(op);
+    }
+  }
+
+  std::map<OperatorType, std::unique_ptr<BaseOperator>>
+  BuildFakeOperatorByTypeMap() {
+    std::map<OperatorType, std::unique_ptr<BaseOperator>> result;
+    result[OperatorType::kConv] =
+        std::unique_ptr<BaseOperator>(new FakeConvolutionOperator);
+    return result;
+  }
+
+  Model input_model_;
+};
+
+TEST_F(VersionedOpExportTest, LoadOperatorsMapWithOpV1) {
+  AddConvOp(false);
+
+  details::OperatorsMap operators;
+  const auto ops_by_type = BuildFakeOperatorByTypeMap();
+  details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
+
+  EXPECT_EQ(1, operators.size());
+  EXPECT_EQ(0, operators.at(details::OperatorKey(
+                   ::tflite::BuiltinOperator_CONV_2D, "", 1)));
+}
+
+TEST_F(VersionedOpExportTest, LoadOperatorsMapWithOpV2) {
+  AddConvOp(true);
+
+  details::OperatorsMap operators;
+  const auto ops_by_type = BuildFakeOperatorByTypeMap();
+  details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
+
+  EXPECT_EQ(1, operators.size());
+  EXPECT_EQ(0, operators.at(details::OperatorKey(
+                   ::tflite::BuiltinOperator_CONV_2D, "", 2)));
+}
+
+TEST_F(VersionedOpExportTest, LoadOperatorsMapWithBothVersions) {
+  AddConvOp(false);
+  AddConvOp(true);
+
+  details::OperatorsMap operators;
+  const auto ops_by_type = BuildFakeOperatorByTypeMap();
+  details::LoadOperatorsMap(input_model_, &operators, ops_by_type, false);
+
+  EXPECT_EQ(2, operators.size());
+  EXPECT_EQ(0, operators.at(details::OperatorKey(
+                   ::tflite::BuiltinOperator_CONV_2D, "", 1)));
+  EXPECT_EQ(1, operators.at(details::OperatorKey(
+                   ::tflite::BuiltinOperator_CONV_2D, "", 2)));
+}
+
+TEST_F(VersionedOpExportTest, Export) {
+  AddConvOp(false);
+  AddConvOp(true);
+
+  string result;
+  const auto ops_by_type = BuildFakeOperatorByTypeMap();
+  Export(input_model_, true, false, &result, ops_by_type);
+
+  auto* model = ::tflite::GetModel(result.data());
+  auto operator_codes = model->operator_codes();
+
+  // Verify that 2 operator codes are populdated. Both are CONV_2D but with
+  // different versions.
+  EXPECT_EQ(2, operator_codes->size());
+  EXPECT_EQ(::tflite::BuiltinOperator_CONV_2D,
+            (*operator_codes)[0]->builtin_code());
+  EXPECT_EQ(1, (*operator_codes)[0]->version());
+  EXPECT_EQ(::tflite::BuiltinOperator_CONV_2D,
+            (*operator_codes)[1]->builtin_code());
+  EXPECT_EQ(2, (*operator_codes)[1]->version());
+
+  // Verify that the 2 operators points to the correct indices of the operation
+  // codes.
+  auto operators = (*model->subgraphs())[0]->operators();
+  EXPECT_EQ(2, operators->size());
+  EXPECT_EQ(0, (*operators)[0]->opcode_index());
+  EXPECT_EQ(1, (*operators)[1]->opcode_index());
+}
+
+TEST(OperatorKeyTest, TestBuiltinOp) {
+  auto op = absl::make_unique<ConvOperator>();
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  const auto key = details::OperatorKey(*op, ops_by_type, false);
+
+  EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CONV_2D);
+  EXPECT_EQ(key.custom_code(), "");
+  EXPECT_EQ(key.version(), 1);
+}
+
+TEST(OperatorKeyTest, TestCustomOp) {
+  auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
+  op->tensorflow_op = "MyCrazyCustomOp";
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  const auto key = details::OperatorKey(*op, ops_by_type, false);
+
+  EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
+  EXPECT_EQ(key.custom_code(), "MyCrazyCustomOp");
+  EXPECT_EQ(key.version(), 1);
+}
+
+TEST(OperatorKeyTest, TestFlexOp) {
+  auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
+  op->tensorflow_op = "BatchMatMul";
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  {
+    const auto key = details::OperatorKey(*op, ops_by_type, false);
+    // It shouldn't be converted to Flex op if `allow_flex_op` is false.
+    EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
+    EXPECT_EQ(key.custom_code(), "BatchMatMul");
+    EXPECT_EQ(key.version(), 1);
+    EXPECT_TRUE(key.is_custom_op());
+    EXPECT_FALSE(key.is_flex_op());
+  }
+
+  {
+    // Verify that the custom op name is prefixed by "Flex" and `is_flex_op`
+    // is true.
+    const auto key = details::OperatorKey(*op, ops_by_type, true);
+    EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
+    EXPECT_EQ(key.custom_code(), "FlexBatchMatMul");
+    EXPECT_EQ(key.version(), 1);
+    EXPECT_FALSE(key.is_custom_op());
+    EXPECT_TRUE(key.is_flex_op());
+  }
+}
+
+TEST(OperatorKeyTest, TestFlexWithControlFlowOp) {
+  auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
+  op->tensorflow_op = "Merge";
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  const auto key = details::OperatorKey(*op, ops_by_type, true);
+
+  EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
+  EXPECT_EQ(key.custom_code(), "FlexMerge");
+  EXPECT_EQ(key.version(), 1);
+  EXPECT_FALSE(key.is_custom_op());
+  EXPECT_TRUE(key.is_flex_op());
+  // The control flow ops should be marked as unsupported.
+  EXPECT_TRUE(key.is_unsupported_flex_op());
+}
+
+TEST(OperatorKeyTest, TestFlexWithUnsupportedOp) {
+  auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
+  op->tensorflow_op = "HashTableV2";
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+  const auto key = details::OperatorKey(*op, ops_by_type, true);
+
+  EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
+  EXPECT_EQ(key.custom_code(), "HashTableV2");
+  EXPECT_EQ(key.version(), 1);
+  // While HashTableV2 is excluded from the whitelisted flex op list, eventually
+  // it won't be, and the following expectations will need to change as the op
+  // is explicitly blacklisted due to lack of asset support.
+  EXPECT_FALSE(key.is_flex_op());
+  EXPECT_FALSE(key.is_unsupported_flex_op());
+}
+
+TEST(OperatorKeyTest, TestFlexWithPartiallySupportedOps) {
+  // Test Toco-supported/TFLite-unsupported operators.
+  // TODO(ycling): The test will be broken if TensorFlowAssert is implemented in
+  // TFLite. Find a more robust way to test the fallback logic.
+  auto op = absl::make_unique<TensorFlowAssertOperator>();
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+
+  {
+    // If NodeDef isn't retained in the Toco op, a regular custom op
+    // will be exported.
+    const auto key = details::OperatorKey(*op, ops_by_type, true);
+    EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
+    EXPECT_EQ(key.custom_code(), "Assert");
+    EXPECT_EQ(key.version(), 1);
+    EXPECT_TRUE(key.is_custom_op());
+    EXPECT_FALSE(key.is_flex_op());
+  }
+
+  ::tensorflow::NodeDef node_def;
+  node_def.set_name("TensorFlowAssert");
+  node_def.set_op("TensorFlowAssert");
+  node_def.SerializeToString(&op->tensorflow_node_def);
+
+  {
+    // If NodeDef is retained in the Toco op, a Flex op will be exported.
+    const auto key = details::OperatorKey(*op, ops_by_type, true);
+    EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
+    EXPECT_EQ(key.custom_code(), "FlexAssert");
+    EXPECT_EQ(key.version(), 1);
+    EXPECT_FALSE(key.is_custom_op());
+    EXPECT_TRUE(key.is_flex_op());
+  }
+}
+
+// TODO(ahentz): tests for tensors, inputs, outputs, opcodes and operators.
+
+}  // namespace
+}  // namespace tflite
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/lite/toco/tflite/import.cc
similarity index 86%
rename from tensorflow/contrib/lite/toco/tflite/import.cc
rename to tensorflow/lite/toco/tflite/import.cc
index 1dd4915b31413e5afb04b45ee7c4893a2eded66d..1692f721256090f5a03c4e46dabdbe65be497d16 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.cc
+++ b/tensorflow/lite/toco/tflite/import.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tflite/import.h"
+#include "tensorflow/lite/toco/tflite/import.h"
 
 #include "flatbuffers/flexbuffers.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/toco/tflite/operator.h"
-#include "tensorflow/contrib/lite/toco/tflite/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/contrib/lite/tools/verifier.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/toco/tflite/operator.h"
+#include "tensorflow/lite/toco/tflite/types.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+#include "tensorflow/lite/tools/verifier.h"
 
 namespace toco {
 
@@ -165,21 +165,28 @@ void ImportOperators(
   }
 }
 
-void ImportIOTensors(const ::tflite::Model& input_model,
+void ImportIOTensors(const ModelFlags& model_flags,
+                     const ::tflite::Model& input_model,
                      const details::TensorsTable& tensors_table, Model* model) {
-  auto inputs = (*input_model.subgraphs())[0]->inputs();
-  if (inputs) {
-    for (int input : *inputs) {
-      const string& input_name = tensors_table.at(input);
-      model->flags.add_input_arrays()->set_name(input_name);
+  // Import from the first subgraph if input arrays have not been specified.
+  if (model_flags.input_arrays().empty()) {
+    auto inputs = (*input_model.subgraphs())[0]->inputs();
+    if (inputs) {
+      for (int input : *inputs) {
+        const string& input_name = tensors_table.at(input);
+        model->flags.add_input_arrays()->set_name(input_name);
+      }
     }
   }
 
-  auto outputs = (*input_model.subgraphs())[0]->outputs();
-  if (outputs) {
-    for (int output : *outputs) {
-      const string& output_name = tensors_table.at(output);
-      model->flags.add_output_arrays(output_name);
+  // Import from the first subgraph if output arrays have not been specified.
+  if (model_flags.output_arrays().empty()) {
+    auto outputs = (*input_model.subgraphs())[0]->outputs();
+    if (outputs) {
+      for (int output : *outputs) {
+        const string& output_name = tensors_table.at(output);
+        model->flags.add_output_arrays(output_name);
+      }
     }
   }
 }
@@ -219,7 +226,8 @@ std::unique_ptr<Model> Import(const ModelFlags& model_flags,
   ImportTensors(*input_model, model.get());
   ImportOperators(*input_model, ops_by_name, tensors_table, operators_table,
                   model.get());
-  ImportIOTensors(*input_model, tensors_table, model.get());
+
+  ImportIOTensors(model_flags, *input_model, tensors_table, model.get());
 
   UndoWeightsShuffling(model.get());
 
diff --git a/tensorflow/contrib/lite/toco/tflite/import.h b/tensorflow/lite/toco/tflite/import.h
similarity index 84%
rename from tensorflow/contrib/lite/toco/tflite/import.h
rename to tensorflow/lite/toco/tflite/import.h
index 280677bae189fa345c2e19f6399a7b9ac7629ab5..f5de3b53b5bc24c327caac11b360c237cd766907 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.h
+++ b/tensorflow/lite/toco/tflite/import.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+#ifndef TENSORFLOW_LITE_TOCO_TFLITE_IMPORT_H_
+#define TENSORFLOW_LITE_TOCO_TFLITE_IMPORT_H_
 
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/toco/model.h"
 
 namespace toco {
 
@@ -46,4 +46,4 @@ void LoadOperatorsTable(const ::tflite::Model &input_model,
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+#endif  // TENSORFLOW_LITE_TOCO_TFLITE_IMPORT_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/import_test.cc b/tensorflow/lite/toco/tflite/import_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/toco/tflite/import_test.cc
rename to tensorflow/lite/toco/tflite/import_test.cc
index edd22f783f03b1fbd34039cd7b00f08d34ca9fc6..93ab5141abe81c4ed4c1ff0ac7ca5e89577c71fb 100644
--- a/tensorflow/contrib/lite/toco/tflite/import_test.cc
+++ b/tensorflow/lite/toco/tflite/import_test.cc
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tflite/import.h"
+#include "tensorflow/lite/toco/tflite/import.h"
 
 #include "flatbuffers/flexbuffers.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
 
 namespace toco {
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
similarity index 85%
rename from tensorflow/contrib/lite/toco/tflite/operator.cc
rename to tensorflow/lite/toco/tflite/operator.cc
index 9addbb81e71c7ecbef0ed8daf14a0d277bddf12f..205af23da57b08c8c62367df1c154bea5e50cc57 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -12,17 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+#include "tensorflow/lite/toco/tflite/operator.h"
 
 // TODO(ycling): Consider refactoring to extract the LSTM definition out of
 // graph_transformation module.
-#include "tensorflow/contrib/lite/toco/graph_transformations/lstm_utils.h"
-#include "tensorflow/contrib/lite/toco/tflite/builtin_operator.h"
-#include "tensorflow/contrib/lite/toco/tflite/custom_operator.h"
-#include "tensorflow/contrib/lite/toco/tflite/simple_operator.h"
-#include "tensorflow/contrib/lite/toco/tflite/types.h"
+#include "tensorflow/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/lite/toco/tflite/builtin_operator.h"
+#include "tensorflow/lite/toco/tflite/custom_operator.h"
+#include "tensorflow/lite/toco/tflite/simple_operator.h"
+#include "tensorflow/lite/toco/tflite/types.h"
+#include "tensorflow/lite/toco/tflite/whitelisted_flex_ops.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace toco {
@@ -741,6 +744,43 @@ class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
   }
 };
 
+class UnidirectionalSequenceLstm
+    : public BuiltinOperator<
+          UnidirectionalSequenceLstmOperator,
+          ::tflite::UnidirectionalSequenceLSTMOptions,
+          ::tflite::BuiltinOptions_UnidirectionalSequenceLSTMOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    // Current toco converter only supports tanh, no clip.
+    return ::tflite::CreateUnidirectionalSequenceLSTMOptions(
+        *builder, /*fused_activation_function=*/
+        ::tflite::ActivationFunctionType_TANH,
+        /*cell_clip=*/0.0,
+        /*proj_clip=*/0.0,
+        /*time_major=*/true);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh activation, so check that tflite type is tanh.
+    DCHECK(options.fused_activation_function() ==
+           ::tflite::ActivationFunctionType_TANH);
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+
+  std::vector<bool> GetMutatingInputVariables(
+      const Operator& op) const override {
+    std::vector<bool> mutating_input_variables(op.inputs.size(), false);
+    mutating_input_variables[kInputActivationStateTensor] = true;
+    mutating_input_variables[kInputCellStateTensor] = true;
+    return mutating_input_variables;
+  }
+};
+
 class Mean : public BuiltinOperator<MeanOperator, ::tflite::ReducerOptions,
                                     ::tflite::BuiltinOptions_ReducerOptions> {
  public:
@@ -874,6 +914,27 @@ class ResizeBilinear
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class ResizeNearestNeighbor
+    : public BuiltinOperator<
+          ResizeNearestNeighborOperator, ::tflite::ResizeNearestNeighborOptions,
+          ::tflite::BuiltinOptions_ResizeNearestNeighborOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateResizeNearestNeighborOptions(*builder,
+                                                        op.align_corners);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->align_corners = options.align_corners();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class Squeeze
     : public BuiltinOperator<SqueezeOperator, ::tflite::SqueezeOptions,
                              ::tflite::BuiltinOptions_SqueezeOptions> {
@@ -917,6 +978,26 @@ class Split
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class SplitV
+    : public BuiltinOperator<TensorFlowSplitVOperator, ::tflite::SplitVOptions,
+                             ::tflite::BuiltinOptions_SplitVOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSplitVOptions(*builder, op.num_split);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->num_split = options.num_splits();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class StridedSlice
     : public BuiltinOperator<StridedSliceOperator,
                              ::tflite::StridedSliceOptions,
@@ -1157,11 +1238,90 @@ class Unpack : public BuiltinOperator<UnpackOperator, ::tflite::UnpackOptions,
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class LeakyRelu
+    : public BuiltinOperator<LeakyReluOperator, ::tflite::LeakyReluOptions,
+                             ::tflite::BuiltinOptions_LeakyReluOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateLeakyReluOptions(*builder, op.alpha);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->alpha = options.alpha();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
+class SquaredDifference
+    : public BuiltinOperator<
+          SquaredDifferenceOperator, ::tflite::SquaredDifferenceOptions,
+          ::tflite::BuiltinOptions_SquaredDifferenceOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSquaredDifferenceOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
+class MirrorPad
+    : public BuiltinOperator<MirrorPadOperator, ::tflite::MirrorPadOptions,
+                             ::tflite::BuiltinOptions_MirrorPadOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateMirrorPadOptions(
+        *builder, op.mode == MirrorPadMode::kReflect
+                      ? ::tflite::MirrorPadMode::MirrorPadMode_REFLECT
+                      : ::tflite::MirrorPadMode::MirrorPadMode_SYMMETRIC);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->mode = options.mode() == ::tflite::MirrorPadMode::MirrorPadMode_REFLECT
+                   ? MirrorPadMode::kReflect
+                   : MirrorPadMode::kSymmetric;
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
+std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
+    const string& tensorflow_node_def) {
+  auto fbb = absl::make_unique<flexbuffers::Builder>();
+
+  ::tensorflow::NodeDef node_def;
+  if (!node_def.ParseFromString(tensorflow_node_def)) {
+    LOG(ERROR) << "Failed to parse TensorFlow NodeDef";
+    return {};
+  }
+
+  fbb->Vector([&]() {
+    fbb->String(node_def.op());
+    fbb->String(tensorflow_node_def);
+  });
+  fbb->Finish();
+  LOG(INFO) << "Writing flex op: " << node_def.op();
+  return std::unique_ptr<flexbuffers::Builder>(fbb.release());
+}
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   TensorFlowUnsupported(const string& name, OperatorType type,
-                        bool allow_flex_ops)
-      : BaseOperator(name, type), allow_flex_ops_(allow_flex_ops) {}
+                        bool enable_select_tf_ops)
+      : BaseOperator(name, type), enable_select_tf_ops_(enable_select_tf_ops) {}
 
   Options Serialize(const Operator& op,
                     flatbuffers::FlatBufferBuilder* builder) const override {
@@ -1192,6 +1352,9 @@ class TensorFlowUnsupported : public BaseOperator {
 
   std::unique_ptr<flexbuffers::Builder> WriteOptions(
       const TensorFlowUnsupportedOperator& op) const {
+    if (enable_select_tf_ops_) {
+      return WriteFlexOpOptions(op.tensorflow_node_def);
+    }
     auto fbb = absl::make_unique<flexbuffers::Builder>();
 
     ::tensorflow::NodeDef node_def;
@@ -1200,7 +1363,7 @@ class TensorFlowUnsupported : public BaseOperator {
       return std::unique_ptr<flexbuffers::Builder>();
     }
 
-    if (allow_flex_ops_) {
+    if (ShouldExportAsFlexOp(enable_select_tf_ops_, node_def.op())) {
       fbb->Vector([&]() {
         fbb->String(node_def.op());
         fbb->String(op.tensorflow_node_def);
@@ -1316,13 +1479,37 @@ class TensorFlowUnsupported : public BaseOperator {
   }
 
  private:
-  const bool allow_flex_ops_;
+  const bool enable_select_tf_ops_;
+};
+
+class Dequantize
+    : public BuiltinOperator<DequantizeOperator, ::tflite::DequantizeOptions,
+                             ::tflite::BuiltinOptions_DequantizeOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateDequantizeOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override {
+    // TODO(suharshs): Dequantize now supports INT8 in addition to
+    // QUANTIZED_UINT8. When TOCO can create models with INT8, we need
+    // to find a way to see the type here and return version 2. Right now
+    // version 2 will only be added by post training quantization tools.
+    return 1;
+  }
 };
 
 namespace {
 // Build a vector containing all the known operators.
 std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
-    bool allow_flex_ops = false) {
+    bool enable_select_tf_ops = false) {
   std::vector<std::unique_ptr<BaseOperator>> ops;
   using tensorflow::MakeUnique;
   // Builtin Operators.
@@ -1364,6 +1551,7 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
                                     OperatorType::kMaxPool));
   ops.push_back(
       MakeUnique<Mul>(::tflite::BuiltinOperator_MUL, OperatorType::kMul));
+
   ops.push_back(
       MakeUnique<Pad>(::tflite::BuiltinOperator_PAD, OperatorType::kPad));
   ops.push_back(
@@ -1393,10 +1581,15 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   ops.push_back(
       MakeUnique<ResizeBilinear>(::tflite::BuiltinOperator_RESIZE_BILINEAR,
                                  OperatorType::kResizeBilinear));
+  ops.push_back(MakeUnique<ResizeNearestNeighbor>(
+      ::tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
+      OperatorType::kResizeNearestNeighbor));
   ops.push_back(MakeUnique<Squeeze>(::tflite::BuiltinOperator_SQUEEZE,
                                     OperatorType::kSqueeze));
   ops.push_back(
       MakeUnique<Split>(::tflite::BuiltinOperator_SPLIT, OperatorType::kSplit));
+  ops.push_back(MakeUnique<SplitV>(::tflite::BuiltinOperator_SPLIT_V,
+                                   OperatorType::kSplitV));
   ops.push_back(MakeUnique<StridedSlice>(
       ::tflite::BuiltinOperator_STRIDED_SLICE, OperatorType::kStridedSlice));
   ops.push_back(MakeUnique<TopK_V2>(::tflite::BuiltinOperator_TOPK_V2,
@@ -1423,24 +1616,35 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
                                       OperatorType::kFakeQuant));
   ops.push_back(
       MakeUnique<Pack>(::tflite::BuiltinOperator_PACK, OperatorType::kPack));
+  ops.emplace_back(MakeUnique<UnidirectionalSequenceLstm>(
+      ::tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+      OperatorType::kUnidirectionalSequenceLstm));
   ops.push_back(MakeUnique<OneHot>(::tflite::BuiltinOperator_ONE_HOT,
                                    OperatorType::kOneHot));
   ops.push_back(MakeUnique<Unpack>(::tflite::BuiltinOperator_UNPACK,
                                    OperatorType::kUnpack));
+  ops.push_back(MakeUnique<LeakyRelu>(::tflite::BuiltinOperator_LEAKY_RELU,
+                                      OperatorType::kLeakyRelu));
+  ops.push_back(MakeUnique<SquaredDifference>(
+      ::tflite::BuiltinOperator_SQUARED_DIFFERENCE,
+      OperatorType::kSquaredDifference));
+  ops.push_back(MakeUnique<MirrorPad>(::tflite::BuiltinOperator_MIRROR_PAD,
+                                      OperatorType::kMirrorPad));
 
   // Custom Operators.
   ops.push_back(
       MakeUnique<DepthToSpace>("DEPTH_TO_SPACE", OperatorType::kDepthToSpace));
   ops.push_back(MakeUnique<CTCBeamSearchDecoder>(
       "CTC_BEAM_SEARCH_DECODER", OperatorType::kCTCBeamSearchDecoder));
-  ops.push_back(MakeUnique<TensorFlowUnsupported>(
-      "TENSORFLOW_UNSUPPORTED", OperatorType::kUnsupported, allow_flex_ops));
-
-  // There operators are supported by Toco, but not by TF Lite, and has no
-  // attributes.
-  ops.push_back(
-      MakeUnique<SimpleOperator<AddNOperator>>("ADDN", OperatorType::kAddN));
-  // Simple Operators.
+  ops.push_back(MakeUnique<TensorFlowUnsupported>("TENSORFLOW_UNSUPPORTED",
+                                                  OperatorType::kUnsupported,
+                                                  enable_select_tf_ops));
+
+  // SimpleOperator was designed to export CUSTOM TF Lite ops, but has since
+  // been modified to also export builtins. As TOCO evolved we added warnings
+  // when custom ops are exported but SimpleOperator bypasses thoses. To
+  // prevent user confusion we are settling on using SimpleOperator only for
+  // builtins.
   ops.push_back(MakeUnique<SimpleOperator<DequantizeOperator>>(
       "DEQUANTIZE", OperatorType::kDequantize));
   ops.push_back(
@@ -1493,6 +1697,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       "LOGICAL_NOT", OperatorType::kLogicalNot));
   ops.emplace_back(new SimpleOperator<FloorDivOperator>(
       "FLOOR_DIV", OperatorType::kFloorDiv));
+  ops.emplace_back(new SimpleOperator<FloorModOperator>(
+      "FLOOR_MOD", OperatorType::kFloorMod));
+  ops.emplace_back(
+      new SimpleOperator<RangeOperator>("RANGE", OperatorType::kRange));
   // Element-wise operator
   ops.push_back(
       MakeUnique<SimpleOperator<SinOperator>>("SIN", OperatorType::kSin));
@@ -1506,17 +1714,20 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       "SQUARE", OperatorType::kSquare));
   ops.push_back(MakeUnique<SimpleOperator<TensorFlowZerosLikeOperator>>(
       "ZEROS_LIKE", OperatorType::kZerosLike));
-
+  ops.push_back(
+      MakeUnique<SimpleOperator<AbsOperator>>("ABS", OperatorType::kAbs));
+  ops.push_back(
+      MakeUnique<SimpleOperator<FillOperator>>("FILL", OperatorType::kFill));
   return ops;
 }
 }  // namespace
 
 std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap(
-    bool allow_flex_ops) {
+    bool enable_select_tf_ops) {
   std::map<OperatorType, std::unique_ptr<BaseOperator>> result;
 
   std::vector<std::unique_ptr<BaseOperator>> ops =
-      BuildOperatorList(allow_flex_ops);
+      BuildOperatorList(enable_select_tf_ops);
   for (auto& op : ops) {
     result[op->type()] = std::move(op);
   }
@@ -1525,11 +1736,11 @@ std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap(
 }
 
 std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
-    bool allow_flex_ops) {
+    bool enable_select_tf_ops) {
   std::map<string, std::unique_ptr<BaseOperator>> result;
 
   std::vector<std::unique_ptr<BaseOperator>> ops =
-      BuildOperatorList(allow_flex_ops);
+      BuildOperatorList(enable_select_tf_ops);
   for (auto& op : ops) {
     result[op->name()] = std::move(op);
   }
@@ -1537,6 +1748,32 @@ std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
   return result;
 }
 
+bool ShouldExportAsFlexOp(bool enable_select_tf_ops,
+                          const string& tensorflow_op_name) {
+  // If Flex ops aren't allow at all, simply return false.
+  if (!enable_select_tf_ops) {
+    return false;
+  }
+  // Check if we can find the `OpDef` for the TensorFlow op. If we can find
+  // it and it has been whitelisted, export the op as an Flex op. Otherwise,
+  // export it as a regular custom op.
+  const tensorflow::OpDef* op_def = nullptr;
+  if (!tensorflow::OpRegistry::Global()
+           ->LookUpOpDef(tensorflow_op_name, &op_def)
+           .ok()) {
+    return false;
+  }
+
+  if (!IsWhitelistedFlexOp(tensorflow_op_name)) {
+    LOG(WARNING) << "Op " << tensorflow_op_name
+                 << " is a valid TensorFlow op but has not been whitelisted for"
+                    " the TensorFlow Lite flex op set.";
+    return false;
+  }
+
+  return true;
+}
+
 }  // namespace tflite
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.h b/tensorflow/lite/toco/tflite/operator.h
similarity index 81%
rename from tensorflow/contrib/lite/toco/tflite/operator.h
rename to tensorflow/lite/toco/tflite/operator.h
index 13d9f6c49a3e6b09aeda0ade0a2a613ab6b01179..4ac531579c12c8f9c7e7904cbae261e74235e168 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.h
+++ b/tensorflow/lite/toco/tflite/operator.h
@@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+#ifndef TENSORFLOW_LITE_TOCO_TFLITE_OPERATOR_H_
+#define TENSORFLOW_LITE_TOCO_TFLITE_OPERATOR_H_
 
 #include "flatbuffers/flatbuffers.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/toco/model.h"
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/toco/model.h"
 
 namespace toco {
 
@@ -26,15 +27,20 @@ namespace tflite {
 class BaseOperator;
 
 // Return a map contained all know TF Lite Operators, keyed by their names.
-// TODO(ycling): The pattern to propagate parameters (e.g. allow_flex_ops)
+// TODO(ycling): The pattern to propagate parameters (e.g. enable_select_tf_ops)
 // is ugly here. Consider refactoring.
 std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap(
-    bool allow_flex_ops = false);
+    bool enable_select_tf_ops = false);
 
 // Return a map contained all know TF Lite Operators, keyed by the type of
 // their tf.mini counterparts.
 std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap(
-    bool allow_flex_ops = false);
+    bool enable_select_tf_ops = false);
+
+// Write the custom option FlexBuffer with a serialized TensorFlow NodeDef
+// for a Flex op.
+std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
+    const string& tensorflow_node_def);
 
 // These are the flatbuffer types for custom and builtin options.
 using CustomOptions = flatbuffers::Vector<uint8_t>;
@@ -107,8 +113,13 @@ class BaseOperator {
   OperatorType type_;
 };
 
+// Helper function to determine if a unsupported TensorFlow op should be
+// exported as an Flex op or a regular custom op.
+bool ShouldExportAsFlexOp(bool enable_select_tf_ops,
+                          const string& tensorflow_op_name);
+
 }  // namespace tflite
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+#endif  // TENSORFLOW_LITE_TOCO_TFLITE_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
similarity index 90%
rename from tensorflow/contrib/lite/toco/tflite/operator_test.cc
rename to tensorflow/lite/toco/tflite/operator_test.cc
index 0bc591e64719b819de7457f8b7c214c98c1ae18b..14ec89cd73f19fcd141640bda7bfba6435f59ac7 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+#include "tensorflow/lite/toco/tflite/operator.h"
 
 #include "flatbuffers/flexbuffers.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -148,6 +149,9 @@ TEST_F(OperatorTest, SimpleOperators) {
                                                 OperatorType::kSquare);
   CheckSimpleOperator<TensorFlowZerosLikeOperator>("ZEROS_LIKE",
                                                    OperatorType::kZerosLike);
+  CheckSimpleOperator<FloorModOperator>("FLOOR_MOD", OperatorType::kFloorMod);
+  CheckSimpleOperator<RangeOperator>("RANGE", OperatorType::kRange);
+  CheckSimpleOperator<FillOperator>("FILL", OperatorType::kFill);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -307,6 +311,14 @@ TEST_F(OperatorTest, CustomSplit) {
   EXPECT_EQ(op.num_split, output_toco_op->num_split);
 }
 
+TEST_F(OperatorTest, CustomSplitV) {
+  TensorFlowSplitVOperator op;
+  op.num_split = 123;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("SPLIT_V", OperatorType::kSplitV), op);
+  EXPECT_EQ(op.num_split, output_toco_op->num_split);
+}
+
 TEST_F(OperatorTest, BuiltinAveragePool) {
   AveragePoolOperator op;
   op.fused_activation_function = FusedActivationFunctionType::kRelu6;
@@ -384,6 +396,16 @@ TEST_F(OperatorTest, ResizeBilinear) {
   EXPECT_EQ(op.align_corners, output_toco_op->align_corners);
 }
 
+TEST_F(OperatorTest, ResizeNearestNeighbor) {
+  ResizeNearestNeighborOperator op;
+  op.align_corners = true;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("RESIZE_NEAREST_NEIGHBOR",
+                                          OperatorType::kResizeNearestNeighbor),
+                              op);
+  EXPECT_EQ(op.align_corners, output_toco_op->align_corners);
+}
+
 TEST_F(OperatorTest, Svdf) {
   SvdfOperator op;
   op.fused_activation_function = FusedActivationFunctionType::kRelu;
@@ -504,6 +526,21 @@ TEST_F(OperatorTest, BuiltinUnpack) {
   EXPECT_EQ(op.axis, output_toco_op->axis);
 }
 
+TEST_F(OperatorTest, BuiltinLeakyRelu) {
+  LeakyReluOperator op;
+  op.alpha = 3;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("LEAKY_RELU", OperatorType::kLeakyRelu), op);
+  EXPECT_EQ(op.alpha, output_toco_op->alpha);
+}
+
+TEST_F(OperatorTest, BuiltinSquaredDifference) {
+  SquaredDifferenceOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("SQUARED_DIFFERENCE", OperatorType::kSquaredDifference), op);
+  ASSERT_NE(nullptr, output_toco_op.get());
+}
+
 TEST_F(OperatorTest, CustomCTCBeamSearchDecoder) {
   CTCBeamSearchDecoderOperator op;
   op.beam_width = 3;
@@ -569,6 +606,24 @@ TEST_F(OperatorTest, TensorFlowUnsupportedWithoutAttr) {
   EXPECT_TRUE(output_node_def.attr().empty());
 }
 
+TEST_F(OperatorTest, TestShouldExportAsFlexOp) {
+  EXPECT_FALSE(ShouldExportAsFlexOp(false, "Conv2D"));
+  EXPECT_TRUE(ShouldExportAsFlexOp(true, "Conv2D"));
+  EXPECT_TRUE(ShouldExportAsFlexOp(true, "EluGrad"));
+  EXPECT_FALSE(ShouldExportAsFlexOp(true, "MyAwesomeCustomOp"));
+  // While the RFFT op is available on desktop, it is not in the kernel
+  // set available on mobile and should be excluded.
+  EXPECT_FALSE(ShouldExportAsFlexOp(true, "RFFT"));
+}
+
+TEST_F(OperatorTest, BuiltinMirrorPad) {
+  MirrorPadOperator op;
+  op.mode = MirrorPadMode::kReflect;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("MIRROR_PAD", OperatorType::kMirrorPad), op);
+  EXPECT_EQ(op.mode, output_toco_op->mode);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/toco/tflite/simple_operator.h b/tensorflow/lite/toco/tflite/simple_operator.h
similarity index 86%
rename from tensorflow/contrib/lite/toco/tflite/simple_operator.h
rename to tensorflow/lite/toco/tflite/simple_operator.h
index a7f7e886f61d3bbf221c0ab7a24d6c3e629ec274..e3e4c8551e931ff54f72c130cf1908ffa5e79514 100644
--- a/tensorflow/contrib/lite/toco/tflite/simple_operator.h
+++ b/tensorflow/lite/toco/tflite/simple_operator.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#ifndef TENSORFLOW_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#define TENSORFLOW_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
 
-#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+#include "tensorflow/lite/toco/tflite/operator.h"
 
 namespace toco {
 
@@ -49,4 +49,4 @@ class SimpleOperator : public BaseOperator {
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#endif  // TENSORFLOW_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/types.cc b/tensorflow/lite/toco/tflite/types.cc
similarity index 98%
rename from tensorflow/contrib/lite/toco/tflite/types.cc
rename to tensorflow/lite/toco/tflite/types.cc
index 754f0b4b8c661355c99d9e5a86f2d7844414a303..f878dafc1ed3c85197e6b161290ab4da548090f5 100644
--- a/tensorflow/contrib/lite/toco/tflite/types.cc
+++ b/tensorflow/lite/toco/tflite/types.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tflite/types.h"
-#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/lite/toco/tflite/types.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace toco {
 
diff --git a/tensorflow/lite/toco/tflite/types.h b/tensorflow/lite/toco/tflite/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc2edb742974262257dfb21f24bf9900baeb495c
--- /dev/null
+++ b/tensorflow/lite/toco/tflite/types.h
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_TFLITE_TYPES_H_
+#define TENSORFLOW_LITE_TOCO_TFLITE_TYPES_H_
+
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/toco/model.h"
+
+namespace toco {
+
+namespace tflite {
+
+struct DataType {
+  static ::tflite::TensorType Serialize(ArrayDataType array_data_type);
+  static ArrayDataType Deserialize(int tensor_type);
+};
+
+struct DataBuffer {
+  using FlatBufferOffset = flatbuffers::Offset<flatbuffers::Vector<uint8_t>>;
+
+  // Build the flatbuffer representation of a toco's Array and return the
+  // corresponding offset into the flatbuffer. Note that data from the array
+  // will be copied into the flatbuffer.
+  static FlatBufferOffset Serialize(const Array& array,
+                                    flatbuffers::FlatBufferBuilder* builder);
+  // Copy data from the given tensor into toco's Array.
+  static void Deserialize(const ::tflite::Tensor& tensor,
+                          const ::tflite::Buffer& buffer, Array* array);
+};
+
+struct Padding {
+  static ::tflite::Padding Serialize(PaddingType padding_type);
+  static PaddingType Deserialize(int padding);
+};
+
+struct ActivationFunction {
+  static ::tflite::ActivationFunctionType Serialize(
+      FusedActivationFunctionType faf_type);
+  static FusedActivationFunctionType Deserialize(int activation_function);
+};
+
+}  // namespace tflite
+
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_TFLITE_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/types_test.cc b/tensorflow/lite/toco/tflite/types_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/toco/tflite/types_test.cc
rename to tensorflow/lite/toco/tflite/types_test.cc
index 8e9f30ba3a6e6b98fa9c4237567b0797a5a797aa..efa2911b5b8c25920e8f12c06d370711b4790d7e 100644
--- a/tensorflow/contrib/lite/toco/tflite/types_test.cc
+++ b/tensorflow/lite/toco/tflite/types_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tflite/types.h"
+#include "tensorflow/lite/toco/tflite/types.h"
 
 #include <complex>
 
diff --git a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..039a918af16019292214f982326fba3eb5695c62
--- /dev/null
+++ b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
@@ -0,0 +1,461 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/toco/tflite/whitelisted_flex_ops.h"
+
+#include <set>
+
+namespace toco {
+namespace tflite {
+
+bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
+  static const std::set<std::string>* whitelisted_flex_ops =
+      new std::set<std::string>({
+          "Abort",
+          "Abs",
+          "Add",
+          "AddN",
+          "AddV2",
+          "All",
+          "Any",
+          "ApplyAdadelta",
+          "ApplyAdagrad",
+          "ApplyAdagradDA",
+          "ApplyAdam",
+          "ApplyAdaMax",
+          "ApplyAddSign",
+          "ApplyCenteredRMSProp",
+          "ApplyFtrl",
+          "ApplyFtrlV2",
+          "ApplyGradientDescent",
+          "ApplyMomentum",
+          "ApplyPowerSign",
+          "ApplyProximalAdagrad",
+          "ApplyProximalGradientDescent",
+          "ApplyRMSProp",
+          "ApproximateEqual",
+          "_Arg",
+          "ArgMax",
+          "ArgMin",
+          "_ArrayToList",
+          "Assert",
+          "Assign",
+          "AssignAdd",
+          "AssignSub",
+          "AudioSpectrogram",
+          "AvgPool",
+          "AvgPool3D",
+          "AvgPoolGrad",
+          "BatchMatMul",
+          "BatchNormWithGlobalNormalization",
+          "BatchNormWithGlobalNormalizationGrad",
+          "BatchToSpace",
+          "BatchToSpaceND",
+          "BiasAdd",
+          "BiasAddGrad",
+          "BiasAddV1",
+          "BroadcastArgs",
+          "BroadcastGradientArgs",
+          "Cast",
+          "CheckNumerics",
+          "ComplexAbs",
+          "Concat",
+          "ConcatOffset",
+          "ConcatV2",
+          "ConjugateTranspose",
+          "Const",
+          "ControlTrigger",
+          "Conv2D",
+          "Conv2DBackpropFilter",
+          "Conv2DBackpropInput",
+          "Conv3D",
+          "Cos",
+          "Cosh",
+          "CropAndResize",
+          "CropAndResizeGradBoxes",
+          "CropAndResizeGradImage",
+          "CTCBeamSearchDecoder",
+          "CTCGreedyDecoder",
+          "DataFormatDimMap",
+          "DataFormatVecPermute",
+          "DebugGradientIdentity",
+          "DebugGradientRefIdentity",
+          "DecodeBmp",
+          "DecodeWav",
+          "DeleteSessionTensor",
+          "DepthToSpace",
+          "DepthwiseConv2dNative",
+          "Dequantize",
+          "DestroyTemporaryVariable",
+          "Div",
+          "DivNoNan",
+          "DynamicPartition",
+          "DynamicStitch",
+          "Elu",
+          "EluGrad",
+          "EncodeWav",
+          "EnsureShape",
+          "Enter",
+          "Equal",
+          "Exit",
+          "Exp",
+          "ExpandDims",
+          "FakeQuantWithMinMaxArgs",
+          "FakeQuantWithMinMaxArgsGradient",
+          "FakeQuantWithMinMaxVars",
+          "FakeQuantWithMinMaxVarsGradient",
+          "FakeQuantWithMinMaxVarsPerChannel",
+          "FakeQuantWithMinMaxVarsPerChannelGradient",
+          "FakeQueue",
+          "FIFOQueue",
+          "FIFOQueueV2",
+          "Fill",
+          "Floor",
+          "FloorDiv",
+          "FloorMod",
+          "FusedBatchNorm",
+          "FusedBatchNormGrad",
+          "FusedBatchNormGradV2",
+          "FusedBatchNormV2",
+          "FusedPadConv2D",
+          "FusedResizeAndPadConv2D",
+          "Gather",
+          "GatherNd",
+          "GatherV2",
+          "GetSessionHandle",
+          "GetSessionHandleV2",
+          "GetSessionTensor",
+          "Greater",
+          "GreaterEqual",
+          "_HostCast",
+          "_HostRecv",
+          "_HostSend",
+          "Identity",
+          "IdentityN",
+          "ImmutableConst",
+          "InTopK",
+          "InTopKV2",
+          "Inv",
+          "InvertPermutation",
+          "InvGrad",
+          "IsFinite",
+          "IsNan",
+          "IsVariableInitialized",
+          "LeakyRelu",
+          "LeakyReluGrad",
+          "Less",
+          "LessEqual",
+          "LinSpace",
+          "ListDiff",
+          "_ListToArray",
+          "Log",
+          "LogicalAnd",
+          "LogicalNot",
+          "LogicalOr",
+          "LogSoftmax",
+          "LoopCond",
+          "LRN",
+          "MatMul",
+          "Max",
+          "Maximum",
+          "MaxPool",
+          "MaxPool3D",
+          "MaxPoolGrad",
+          "MaxPoolGradGrad",
+          "MaxPoolGradGradV2",
+          "MaxPoolGradV2",
+          "MaxPoolGradWithArgmax",
+          "MaxPoolV2",
+          "MaxPoolWithArgmax",
+          "Mean",
+          "Merge",
+          "MergeV2Checkpoints",
+          "Mfcc",
+          "Min",
+          "Minimum",
+          "MirrorPad",
+          "MirrorPadGrad",
+          "Mul",
+          "Multinomial",
+          "Neg",
+          "NextIteration",
+          "NonMaxSuppression",
+          "NonMaxSuppressionV2",
+          "NonMaxSuppressionV3",
+          "NonMaxSuppressionV4",
+          "NonMaxSuppressionWithOverlaps",
+          "NoOp",
+          "NotEqual",
+          "OneHot",
+          "OnesLike",
+          "Pack",
+          "Pad",
+          "PaddingFIFOQueue",
+          "PaddingFIFOQueueV2",
+          "PadV2",
+          "ParallelDynamicStitch",
+          "ParseExample",
+          "ParseSequenceExample",
+          "ParseSingleExample",
+          "ParseSingleSequenceExample",
+          "Placeholder",
+          "PlaceholderV2",
+          "PlaceholderWithDefault",
+          "Pow",
+          "PreventGradient",
+          "Print",
+          "PrintV2",
+          "Prod",
+          "QuantizedAdd",
+          "QuantizedAvgPool",
+          "QuantizedBatchNormWithGlobalNormalization",
+          "QuantizedBiasAdd",
+          "QuantizedConcat",
+          "QuantizedConv2D",
+          "QuantizedInstanceNorm",
+          "QuantizedMatMul",
+          "QuantizedMaxPool",
+          "QuantizedMul",
+          "QuantizeDownAndShrinkRange",
+          "QuantizedRelu",
+          "QuantizedRelu6",
+          "QuantizedReshape",
+          "QuantizedResizeBilinear",
+          "QuantizeV2",
+          "QueueClose",
+          "QueueCloseV2",
+          "QueueDequeue",
+          "QueueDequeueMany",
+          "QueueDequeueManyV2",
+          "QueueDequeueUpTo",
+          "QueueDequeueUpToV2",
+          "QueueDequeueV2",
+          "QueueEnqueue",
+          "QueueEnqueueMany",
+          "QueueEnqueueManyV2",
+          "QueueEnqueueV2",
+          "QueueIsClosed",
+          "QueueIsClosedV2",
+          "QueueSize",
+          "QueueSizeV2",
+          "RandomGamma",
+          "RandomStandardNormal",
+          "RandomUniform",
+          "RandomUniformInt",
+          "Range",
+          "Rank",
+          "RealDiv",
+          "Reciprocal",
+          "ReciprocalGrad",
+          "_Recv",
+          "RefEnter",
+          "RefExit",
+          "RefIdentity",
+          "RefMerge",
+          "RefNextIteration",
+          "RefSelect",
+          "RefSwitch",
+          "Relu",
+          "Relu6",
+          "Relu6Grad",
+          "ReluGrad",
+          "RemoteCall",
+          "RequantizationRange",
+          "Requantize",
+          "Reshape",
+          "ResizeBilinear",
+          "ResizeBilinearGrad",
+          "ResizeNearestNeighbor",
+          "ResizeNearestNeighborGrad",
+          "ResourceApplyAdadelta",
+          "ResourceApplyAdagrad",
+          "ResourceApplyAdagradDA",
+          "ResourceApplyAdam",
+          "ResourceApplyAdaMax",
+          "ResourceApplyAddSign",
+          "ResourceApplyCenteredRMSProp",
+          "ResourceApplyFtrl",
+          "ResourceApplyFtrlV2",
+          "ResourceApplyGradientDescent",
+          "ResourceApplyMomentum",
+          "ResourceApplyPowerSign",
+          "ResourceApplyProximalAdagrad",
+          "ResourceApplyProximalGradientDescent",
+          "ResourceApplyRMSProp",
+          "ResourceSparseApplyAdadelta",
+          "ResourceSparseApplyAdagrad",
+          "ResourceSparseApplyAdagradDA",
+          "ResourceSparseApplyCenteredRMSProp",
+          "ResourceSparseApplyFtrl",
+          "ResourceSparseApplyFtrlV2",
+          "ResourceSparseApplyMomentum",
+          "ResourceSparseApplyProximalAdagrad",
+          "ResourceSparseApplyProximalGradientDescent",
+          "ResourceSparseApplyRMSProp",
+          "ResourceStridedSliceAssign",
+          "Restore",
+          "RestoreSlice",
+          "RestoreV2",
+          "_Retval",
+          "Reverse",
+          "ReverseSequence",
+          "ReverseV2",
+          "Round",
+          "Rsqrt",
+          "RsqrtGrad",
+          "Save",
+          "SaveSlices",
+          "SaveV2",
+          "SegmentMax",
+          "SegmentMean",
+          "SegmentMin",
+          "SegmentProd",
+          "SegmentSum",
+          "Select",
+          "Selu",
+          "SeluGrad",
+          "_Send",
+          "Shape",
+          "ShapeN",
+          "ShardedFilename",
+          "ShardedFilespec",
+          "Sigmoid",
+          "SigmoidGrad",
+          "Sign",
+          "Sin",
+          "Sinh",
+          "Size",
+          "Slice",
+          "Softmax",
+          "SoftmaxCrossEntropyWithLogits",
+          "Softplus",
+          "SoftplusGrad",
+          "Softsign",
+          "SoftsignGrad",
+          "SpaceToBatch",
+          "SpaceToBatchND",
+          "SpaceToDepth",
+          "SparseApplyAdadelta",
+          "SparseApplyAdagrad",
+          "SparseApplyAdagradDA",
+          "SparseApplyCenteredRMSProp",
+          "SparseApplyFtrl",
+          "SparseApplyFtrlV2",
+          "SparseApplyMomentum",
+          "SparseApplyProximalAdagrad",
+          "SparseApplyProximalGradientDescent",
+          "SparseApplyRMSProp",
+          "SparseFillEmptyRows",
+          "SparseFillEmptyRowsGrad",
+          "SparseReshape",
+          "SparseSegmentMean",
+          "SparseSegmentMeanGrad",
+          "SparseSegmentMeanWithNumSegments",
+          "SparseSegmentSqrtN",
+          "SparseSegmentSqrtNGrad",
+          "SparseSegmentSqrtNWithNumSegments",
+          "SparseSegmentSum",
+          "SparseSegmentSumWithNumSegments",
+          "SparseToDense",
+          "Split",
+          "SplitV",
+          "Sqrt",
+          "SqrtGrad",
+          "Square",
+          "SquaredDifference",
+          "Squeeze",
+          "Stack",
+          "StackClose",
+          "StackCloseV2",
+          "StackPop",
+          "StackPopV2",
+          "StackPush",
+          "StackPushV2",
+          "StackV2",
+          "StopGradient",
+          "StridedSlice",
+          "StridedSliceAssign",
+          "StridedSliceGrad",
+          "StringJoin",
+          "Sub",
+          "Sum",
+          "Switch",
+          "SymbolicGradient",
+          "Tan",
+          "Tanh",
+          "TanhGrad",
+          "TemporaryVariable",
+          "TensorArray",
+          "TensorArrayClose",
+          "TensorArrayCloseV2",
+          "TensorArrayCloseV3",
+          "TensorArrayConcat",
+          "TensorArrayConcatV2",
+          "TensorArrayConcatV3",
+          "TensorArrayGather",
+          "TensorArrayGatherV2",
+          "TensorArrayGatherV3",
+          "TensorArrayGrad",
+          "TensorArrayGradV2",
+          "TensorArrayGradV3",
+          "TensorArrayGradWithShape",
+          "TensorArrayPack",
+          "TensorArrayRead",
+          "TensorArrayReadV2",
+          "TensorArrayReadV3",
+          "TensorArrayScatter",
+          "TensorArrayScatterV2",
+          "TensorArrayScatterV3",
+          "TensorArraySize",
+          "TensorArraySizeV2",
+          "TensorArraySizeV3",
+          "TensorArraySplit",
+          "TensorArraySplitV2",
+          "TensorArraySplitV3",
+          "TensorArrayUnpack",
+          "TensorArrayV2",
+          "TensorArrayV3",
+          "TensorArrayWrite",
+          "TensorArrayWriteV2",
+          "TensorArrayWriteV3",
+          "Tile",
+          "TileGrad",
+          "Timestamp",
+          "TopK",
+          "TopKV2",
+          "Transpose",
+          "TruncateDiv",
+          "TruncatedNormal",
+          "Unique",
+          "UniqueV2",
+          "UniqueWithCounts",
+          "UniqueWithCountsV2",
+          "Unpack",
+          "UnsortedSegmentMax",
+          "UnsortedSegmentMin",
+          "UnsortedSegmentProd",
+          "UnsortedSegmentSum",
+          "Variable",
+          "VariableV2",
+          "Where",
+          "Xdivy",
+          "Xlogy",
+          "ZerosLike",
+      });
+  return whitelisted_flex_ops->find(tensorflow_op_name) !=
+         whitelisted_flex_ops->end();
+}
+
+}  // namespace tflite
+}  // namespace toco
diff --git a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.h b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..2559a7052852cad01cba57002f09b09ca6103dfd
--- /dev/null
+++ b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.h
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_TFLITE_WHITELISTED_FLEX_OPS_H_
+#define TENSORFLOW_LITE_TOCO_TFLITE_WHITELISTED_FLEX_OPS_H_
+
+#include <string>
+
+namespace toco {
+namespace tflite {
+
+// Whether the given op has been statically whitelisted for flex export.
+//
+// This static whitelist is formed by the intersection of ops supported by
+// TensorFlowMobile on both iOS and Android. As the converter is likely running
+// on a host that has the full suite of TensorFlow ops available, we use this
+// static whitelist to ensure compatibility when deploying to a mobile device.
+// TODO(b/118389105): Automate generation of the whitelisted flex ops.
+bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name);
+
+}  // namespace tflite
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_TFLITE_WHITELISTED_FLEX_OPS_H_
diff --git a/tensorflow/lite/toco/toco.cc b/tensorflow/lite/toco/toco.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a3d6a5848751f4c1d526153bd6f6d08a9f882af
--- /dev/null
+++ b/tensorflow/lite/toco/toco.cc
@@ -0,0 +1,53 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "tensorflow/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/lite/toco/toco_cmdline_flags.h"
+#include "tensorflow/lite/toco/toco_convert.h"
+
+int main(int argc, char** argv) {
+  toco::string msg;
+  toco::ParsedTocoFlags parsed_toco_flags;
+  toco::ParsedModelFlags parsed_model_flags;
+
+  // If no args were specified, give a help string to be helpful.
+  int* effective_argc = &argc;
+  char** effective_argv = argv;
+  if (argc == 1) {
+    // No arguments, so manufacture help argv.
+    static int dummy_argc = 2;
+    static char* dummy_argv[] = {argv[0], const_cast<char*>("--help")};
+    effective_argc = &dummy_argc;
+    effective_argv = dummy_argv;
+  }
+
+  // Parse toco flags and command flags in sequence, each one strips off args,
+  // giving InitGoogle a chance to handle all remaining arguments.
+  bool toco_success = toco::ParseTocoFlagsFromCommandLineFlags(
+      effective_argc, effective_argv, &msg, &parsed_toco_flags);
+  bool model_success = toco::ParseModelFlagsFromCommandLineFlags(
+      effective_argc, effective_argv, &msg, &parsed_model_flags);
+  if (!toco_success || !model_success || !msg.empty()) {
+    fprintf(stderr, "%s", msg.c_str());
+    fflush(stderr);
+    return 1;
+  }
+  toco::port::InitGoogle(argv[0], effective_argc, &effective_argv, true);
+  auto status = toco::Convert(parsed_toco_flags, parsed_model_flags);
+  return status.ok() ? 0 : -1;
+}
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/lite/toco/toco_cmdline_flags.cc
similarity index 94%
rename from tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
rename to tensorflow/lite/toco/toco_cmdline_flags.cc
index cff79776bc787ea8c9b0afa8cce556a175453bd0..7d525ae5583c4f8e18fa4f517561f046ab8cf3bc 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/lite/toco/toco_cmdline_flags.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "absl/strings/str_split.h"
 #include "absl/strings/strip.h"
 #include "absl/types/optional.h"
-#include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
-#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_cmdline_flags.h"
+#include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
@@ -166,12 +166,13 @@ bool ParseTocoFlagsFromCommandLineFlags(
            "Boolean indicating whether to quantize the weights of the "
            "converted float model. Model size will be reduced and there will "
            "be latency improvements (at the cost of accuracy)."),
+      // TODO(b/118822804): Unify the argument definition with `tflite_convert`.
       // WARNING: Experimental interface, subject to change
-      Flag("allow_flex_ops", parsed_flags.allow_flex_ops.bind(),
-           parsed_flags.allow_flex_ops.default_value(), ""),
+      Flag("enable_select_tf_ops", parsed_flags.enable_select_tf_ops.bind(),
+           parsed_flags.enable_select_tf_ops.default_value(), ""),
       // WARNING: Experimental interface, subject to change
-      Flag("force_flex_ops", parsed_flags.force_flex_ops.bind(),
-           parsed_flags.force_flex_ops.default_value(), "")};
+      Flag("force_select_tf_ops", parsed_flags.force_select_tf_ops.bind(),
+           parsed_flags.force_select_tf_ops.default_value(), "")};
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
   if (asked_for_help) {
@@ -266,15 +267,15 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
   READ_TOCO_FLAG(split_tflite_lstm_inputs, FlagRequirement::kNone);
   READ_TOCO_FLAG(quantize_weights, FlagRequirement::kNone);
   READ_TOCO_FLAG(post_training_quantize, FlagRequirement::kNone);
-  READ_TOCO_FLAG(allow_flex_ops, FlagRequirement::kNone);
-  READ_TOCO_FLAG(force_flex_ops, FlagRequirement::kNone);
+  READ_TOCO_FLAG(enable_select_tf_ops, FlagRequirement::kNone);
+  READ_TOCO_FLAG(force_select_tf_ops, FlagRequirement::kNone);
 
-  if (parsed_toco_flags.force_flex_ops.value() &&
-      !parsed_toco_flags.allow_flex_ops.value()) {
-    // TODO(ycling): Consider to enforce `allow_flex_ops` when
-    // `force_flex_ops` is true.
-    LOG(WARNING) << "--force_flex_ops should always be used with "
-                    "--allow_flex_ops.";
+  if (parsed_toco_flags.force_select_tf_ops.value() &&
+      !parsed_toco_flags.enable_select_tf_ops.value()) {
+    // TODO(ycling): Consider to enforce `enable_select_tf_ops` when
+    // `force_select_tf_ops` is true.
+    LOG(WARNING) << "--force_select_tf_ops should always be used with "
+                    "--enable_select_tf_ops.";
   }
 
   // Deprecated flag handling.
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.h b/tensorflow/lite/toco/toco_cmdline_flags.h
similarity index 79%
rename from tensorflow/contrib/lite/toco/toco_cmdline_flags.h
rename to tensorflow/lite/toco/toco_cmdline_flags.h
index 46eb3f57283cc52bf2877f578500f3a4a633be86..cf57055abc26e6a39820f6af3c03c3d49b4e54e3 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.h
+++ b/tensorflow/lite/toco/toco_cmdline_flags.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#ifndef TENSORFLOW_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#define TENSORFLOW_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
 
 #include <string>
 #include <vector>
-#include "tensorflow/contrib/lite/toco/args.h"
-#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/types.pb.h"
+#include "tensorflow/lite/toco/args.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+#include "tensorflow/lite/toco/types.pb.h"
 
 namespace toco {
 // Parse and remove arguments handled from toco. Returns true if parsing
@@ -33,4 +33,4 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#endif  // TENSORFLOW_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
diff --git a/tensorflow/lite/toco/toco_convert.cc b/tensorflow/lite/toco/toco_convert.cc
new file mode 100644
index 0000000000000000000000000000000000000000..28e7b10ecd056815c8ca6d7a74f324a18d307451
--- /dev/null
+++ b/tensorflow/lite/toco/toco_convert.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/toco_cmdline_flags.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_tooling.h"
+#include "tensorflow/lite/toco/toco_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+namespace {
+
+// Checks the permissions of the output file to ensure it is writeable.
+void CheckOutputFilePermissions(const Arg<string>& output_file) {
+  QCHECK(output_file.specified()) << "Missing required flag --output_file.\n";
+  QCHECK(port::file::Writable(output_file.value()).ok())
+      << "Specified output_file is not writable: " << output_file.value()
+      << ".\n";
+}
+
+// Checks the permissions of the frozen model file.
+void CheckFrozenModelPermissions(const Arg<string>& input_file) {
+  QCHECK(input_file.specified()) << "Missing required flag --input_file.\n";
+  QCHECK(port::file::Exists(input_file.value(), port::file::Defaults()).ok())
+      << "Specified input_file does not exist: " << input_file.value() << ".\n";
+  QCHECK(port::file::Readable(input_file.value(), port::file::Defaults()).ok())
+      << "Specified input_file exists, but is not readable: "
+      << input_file.value() << ".\n";
+}
+
+// Reads the contents of the GraphDef from either the frozen graph file or the
+// SavedModel directory. If it reads the SavedModel directory, it updates the
+// ModelFlags and TocoFlags accordingly.
+void ReadInputData(const ParsedTocoFlags& parsed_toco_flags,
+                   const ParsedModelFlags& parsed_model_flags,
+                   TocoFlags* toco_flags, ModelFlags* model_flags,
+                   string* graph_def_contents) {
+  port::CheckInitGoogleIsDone("InitGoogle is not done yet.\n");
+
+  // Ensure savedmodel_directory is not set.
+  QCHECK(!parsed_toco_flags.savedmodel_directory.specified())
+      << "Use `tensorflow/lite/python/tflite_convert` script with "
+      << "SavedModel directories.\n";
+
+  // Checks the input file permissions and reads the contents.
+  CheckFrozenModelPermissions(parsed_toco_flags.input_file);
+  CHECK(port::file::GetContents(parsed_toco_flags.input_file.value(),
+                                graph_def_contents, port::file::Defaults())
+            .ok());
+}
+}  // namespace
+
+tensorflow::Status Convert(const string& graph_def_contents,
+                           const TocoFlags& toco_flags,
+                           const ModelFlags& model_flags,
+                           string* output_file_contents) {
+  std::unique_ptr<Model> model =
+      Import(toco_flags, model_flags, graph_def_contents);
+  Transform(toco_flags, model.get());
+  return Export(toco_flags, *model, toco_flags.allow_custom_ops(),
+                output_file_contents);
+}
+
+tensorflow::Status Convert(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags) {
+  ModelFlags model_flags;
+  ReadModelFlagsFromCommandLineFlags(parsed_model_flags, &model_flags);
+
+  TocoFlags toco_flags;
+  ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags);
+
+  string graph_def_contents;
+  ReadInputData(parsed_toco_flags, parsed_model_flags, &toco_flags,
+                &model_flags, &graph_def_contents);
+  CheckOutputFilePermissions(parsed_toco_flags.output_file);
+
+  string output_file_contents;
+  TF_RETURN_IF_ERROR(Convert(graph_def_contents, toco_flags, model_flags,
+                             &output_file_contents));
+
+  TF_RETURN_IF_ERROR(
+      port::file::SetContents(parsed_toco_flags.output_file.value(),
+                              output_file_contents, port::file::Defaults()));
+  return tensorflow::Status();
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/toco_convert.h b/tensorflow/lite/toco/toco_convert.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebbd336d3f50ae63a106387eadb5888c00ed9064
--- /dev/null
+++ b/tensorflow/lite/toco/toco_convert.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_TOCO_CONVERT_H_
+#define TENSORFLOW_LITE_TOCO_TOCO_CONVERT_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/toco/args.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+
+namespace toco {
+
+tensorflow::Status Convert(const string& graph_def_contents,
+                           const TocoFlags& toco_flags,
+                           const ModelFlags& model_flags,
+                           string* output_file_contents);
+
+tensorflow::Status Convert(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags);
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_TOCO_CONVERT_H_
diff --git a/tensorflow/lite/toco/toco_convert_test.cc b/tensorflow/lite/toco/toco_convert_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c3c440db94396def2f8cfd40242642767d11a63a
--- /dev/null
+++ b/tensorflow/lite/toco/toco_convert_test.cc
@@ -0,0 +1,173 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/toco/toco_convert.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace toco {
+namespace {
+
+TEST(TocoTest, MissingInputFile) {
+  ParsedTocoFlags toco_flags;
+  ParsedModelFlags model_flags;
+  EXPECT_DEATH(Convert(toco_flags, model_flags).ok(),
+               "Missing required flag --input_file");
+}
+
+TEST(TocoTest, BadInputFormat) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  string input;
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "Unhandled input_format='FILE_FORMAT_UNKNOWN'");
+}
+
+TEST(TocoTest, MissingOuputArrays) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  string input;
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "This model does not define output arrays, so a --output_arrays "
+               "flag must be given on the command-line");
+}
+
+TEST(TocoTest, BadOutputArray) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  model_flags.add_output_arrays("output1");
+  string input;
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "Specified output array .output1. is not produced by any op "
+               "in this graph. Is it a typo. To silence this message, pass "
+               "this flag:  allow_nonexistent_arrays");
+}
+
+TEST(TocoTest, BadOutputFormat) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  model_flags.add_output_arrays("output1");
+  string input = R"GraphDef(
+    node {
+      name: "output1"
+      input: "input1"
+      input: "input2"
+      op: "Sub"
+      attr { key: "T" value { type: DT_FLOAT } }
+    }
+  )GraphDef";
+
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "Unhandled output_format='FILE_FORMAT_UNKNOWN'");
+}
+
+TEST(TocoTest, SimpleFloatModel) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  toco_flags.set_output_format(TENSORFLOW_GRAPHDEF);
+
+  // Inputs are automatically selected (but that might not be a good idea).
+  model_flags.add_output_arrays("output1");
+  string input = R"GraphDef(
+    node {
+      name: "input1"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_INT64 } }
+    }
+    node {
+      name: "input2"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_INT64 } }
+    }
+    node {
+      name: "output1"
+      input: "input1"
+      input: "input2"
+      op: "Sub"
+      attr { key: "T" value { type: DT_FLOAT } }
+    }
+  )GraphDef";
+
+  string output;
+  EXPECT_TRUE(Convert(input, toco_flags, model_flags, &output).ok());
+  EXPECT_TRUE(!output.empty());
+}
+
+TEST(TocoTest, TransientStringTensors) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+
+  // We need to do a couple of things to trigger the transient array
+  // initialization code: output format must support memory planning, and the
+  // input array must have a shape.
+  toco_flags.set_output_format(TFLITE);
+
+  model_flags.add_output_arrays("output1");
+  string input = R"GraphDef(
+    node {
+      name: "input1"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_STRING } }
+      attr { key: "shape" value { shape { dim { size:1 }}}}
+    }
+    node {
+      name: "indices1"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_INT64 } }
+    }
+    node {
+      name: "intermediate1"
+      op: "Gather"
+      input: "input1"
+      input: "indices1"
+      attr { key: "Tparams" value { type: DT_STRING } }
+      attr { key: "Tindices" value { type: DT_INT64 } }
+    }
+    node {
+      name: "output1"
+      op: "Gather"
+      input: "intermediate1"
+      input: "indices2"
+      attr { key: "Tparams" value { type: DT_STRING } }
+      attr { key: "Tindices" value { type: DT_INT64 } }
+    }
+  )GraphDef";
+
+  string output;
+
+  EXPECT_TRUE(Convert(input, toco_flags, model_flags, &output).ok());
+  EXPECT_TRUE(!output.empty());
+}
+
+}  // namespace
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/lite/toco/toco_flags.proto
similarity index 93%
rename from tensorflow/contrib/lite/toco/toco_flags.proto
rename to tensorflow/lite/toco/toco_flags.proto
index ca3e64485e7a46c730966db27c1d5190a3632453..cb015ba3d2a74258527ee124c5891c996f416c28 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/lite/toco/toco_flags.proto
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 syntax = "proto2";
-import "tensorflow/contrib/lite/toco/types.proto";
+import "tensorflow/lite/toco/types.proto";
 
 package toco;
 
@@ -190,16 +190,19 @@ message TocoFlags {
   // (at the cost of accuracy).
   optional bool post_training_quantize = 26 [default = false];
 
-  // When enabled, unsupported ops will be converted to TFLite Flex ops.
+  // This flag only works when converting to TensorFlow Lite format.
+  // When enabled, unsupported ops will be converted to select TensorFlow ops.
   // TODO(ycling): Consider to rename the following 2 flags and don't call it
   // "Flex".
-  // `allow_flex_ops` should always be used with `allow_custom_ops`.
+  // `enable_select_tf_ops` should always be used with `allow_custom_ops`.
   // WARNING: Experimental interface, subject to change
-  optional bool allow_flex_ops = 27 [default = false];
+  optional bool enable_select_tf_ops = 27 [default = false];
 
-  // When enabled, all TensorFlow ops will be converted to TFLite Flex
-  // ops directly. This will force `allow_flex_ops` to true.
-  // `force_flex_ops` should always be used with `allow_flex_ops`.
+  // This flag only works when converting to TensorFlow Lite format.
+  // When enabled, all TensorFlow ops will be converted to select TensorFlow
+  // ops.
+  // This will force `enable_select_tf_ops` to true.
+  // `force_select_tf_ops` should always be used with `enable_select_tf_ops`.
   // WARNING: Experimental interface, subject to change
-  optional bool force_flex_ops = 28 [default = false];
+  optional bool force_select_tf_ops = 28 [default = false];
 }
diff --git a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.cc b/tensorflow/lite/toco/toco_graphviz_dump_options.cc
similarity index 92%
rename from tensorflow/contrib/lite/toco/toco_graphviz_dump_options.cc
rename to tensorflow/lite/toco/toco_graphviz_dump_options.cc
index 4e98e7081de4388e5425f0eea9f6bb5f5cdafcd7..449f0f07cec1282711c04785cce31b7d5aca436e 100644
--- a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.cc
+++ b/tensorflow/lite/toco/toco_graphviz_dump_options.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
+#include "tensorflow/lite/toco/toco_graphviz_dump_options.h"
 
 namespace toco {
 GraphVizDumpOptions* GraphVizDumpOptions::singleton() {
diff --git a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h b/tensorflow/lite/toco/toco_graphviz_dump_options.h
similarity index 82%
rename from tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
rename to tensorflow/lite/toco/toco_graphviz_dump_options.h
index 7cdd55e5422589aa000000b82d09b9d8397d7a88..00d9cd13a662722fff49af2edcda8d122864034d 100644
--- a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
+++ b/tensorflow/lite/toco/toco_graphviz_dump_options.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#ifndef TENSORFLOW_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#define TENSORFLOW_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
 
 #include <string>
 
@@ -29,4 +29,4 @@ struct GraphVizDumpOptions {
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#endif  // TENSORFLOW_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/lite/toco/toco_port.cc
similarity index 96%
rename from tensorflow/contrib/lite/toco/toco_port.cc
rename to tensorflow/lite/toco/toco_port.cc
index 204c0d101eac6d37355d49984a38ffd0d4dd27be..fb8c1b8337f1e509ed9c9ee2522e63e84d143927 100644
--- a/tensorflow/contrib/lite/toco/toco_port.cc
+++ b/tensorflow/lite/toco/toco_port.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include <cstring>
 
-#include "tensorflow/contrib/lite/toco/toco_port.h"
-#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
@@ -66,8 +66,9 @@ namespace file {
 // Conversion to our wrapper Status.
 tensorflow::Status ToStatus(const ::util::Status& uts) {
   if (!uts.ok()) {
-    return tensorflow::Status(tensorflow::errors::Code(uts.error_code()),
-                              uts.error_message());
+    return tensorflow::Status(
+        tensorflow::errors::Code(::util::RetrieveErrorCode(uts)),
+        uts.error_message());
   }
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/lite/toco/toco_port.h
similarity index 94%
rename from tensorflow/contrib/lite/toco/toco_port.h
rename to tensorflow/lite/toco/toco_port.h
index 17f82b9dd7dcc633aa204038b6d965f4eb6967bb..2f39e3d6d5c02457e9ade320e7525fbf881b5389 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/lite/toco/toco_port.h
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+#ifndef TENSORFLOW_LITE_TOCO_TOCO_PORT_H_
+#define TENSORFLOW_LITE_TOCO_TOCO_PORT_H_
 
 // Portability layer for toco tool. Mainly, abstract filesystem access so we
 // can build and use on google internal environments and on OSX.
 
 #include <string>
 #include "google/protobuf/text_format.h"
-#include "tensorflow/contrib/lite/toco/format_port.h"
+#include "tensorflow/lite/toco/format_port.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
@@ -110,4 +110,4 @@ bool ParseFromStringEitherTextOrBinary(const std::string& input_file_contents,
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+#endif  // TENSORFLOW_LITE_TOCO_TOCO_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/toco_port_test.cc b/tensorflow/lite/toco/toco_port_test.cc
similarity index 88%
rename from tensorflow/contrib/lite/toco/toco_port_test.cc
rename to tensorflow/lite/toco/toco_port_test.cc
index 650a617aebc053e789f41a56f9bb7fb514740f9a..f5fbb4caeb2882d51c4b586293eb202fcf60a9de 100644
--- a/tensorflow/contrib/lite/toco/toco_port_test.cc
+++ b/tensorflow/lite/toco/toco_port_test.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/toco_port.h"
-#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_types.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -23,9 +23,9 @@ namespace port {
 namespace {
 
 #ifdef PLATFORM_GOOGLE
-#define TFLITE_PREFIX "third_party/tensorflow/contrib/lite/"
+#define TFLITE_PREFIX "third_party/tensorflow/lite/"
 #else
-#define TFLITE_PREFIX "tensorflow/contrib/lite/"
+#define TFLITE_PREFIX "tensorflow/lite/"
 #endif
 
 TEST(TocoPortTest, Exists) {
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc
similarity index 92%
rename from tensorflow/contrib/lite/toco/toco_tooling.cc
rename to tensorflow/lite/toco/toco_tooling.cc
index 106494f3547a9385317c1712de1281931cee7328..55a454e66de4d0afce18421450d875911bea01f4 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/lite/toco/toco_tooling.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/toco_tooling.h"
+#include "tensorflow/lite/toco/toco_tooling.h"
 
 #include <cstdlib>
 #include <memory>
@@ -20,16 +20,16 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "absl/strings/str_join.h"
-#include "tensorflow/contrib/lite/toco/allocate_transient_arrays.h"
-#include "tensorflow/contrib/lite/toco/dump_graphviz.h"
-#include "tensorflow/contrib/lite/toco/export_tensorflow.h"
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/import_tensorflow.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/tflite/export.h"
-#include "tensorflow/contrib/lite/toco/tflite/import.h"
-#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/allocate_transient_arrays.h"
+#include "tensorflow/lite/toco/dump_graphviz.h"
+#include "tensorflow/lite/toco/export_tensorflow.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/import_tensorflow.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/tflite/export.h"
+#include "tensorflow/lite/toco/tflite/import.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace toco {
@@ -198,7 +198,7 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
               : (toco_flags.output_format() != TENSORFLOW_GRAPHDEF);
 
       tf_import_flags.import_all_ops_as_unsupported =
-          toco_flags.force_flex_ops();
+          toco_flags.force_select_tf_ops();
 
       model = ImportTensorFlowGraphDef(model_flags, tf_import_flags,
                                        input_file_contents);
@@ -210,7 +210,8 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
       CheckInvariants(*model);
       break;
     default:
-      LOG(FATAL) << "Unhandled input_format";
+      LOG(FATAL) << "Unhandled input_format='"
+                 << FileFormat_Name(toco_flags.input_format()) << "'";
   }
 
   LogDump(kLogLevelModelChanged, "AT IMPORT", *model);
@@ -308,6 +309,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   // Fix any issues with IO edges. This must happen after any transform that
   // may modify the structure of the edges.
   FixEdgeArrays(model);
+  FixOperatorOrdering(model);
 
   if (quantize_output) {
     // If the user specified default min/max ranges we need to set all arrays
@@ -400,8 +402,8 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   model->ops_count = ops_count;
 }
 
-void Export(const TocoFlags& toco_flags, const Model& model,
-            bool allow_custom_ops, string* output_file_contents) {
+tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
+                          bool allow_custom_ops, string* output_file_contents) {
   switch (toco_flags.output_format()) {
     case TENSORFLOW_GRAPHDEF:
       ExportTensorFlowGraphDef(model, output_file_contents);
@@ -409,24 +411,25 @@ void Export(const TocoFlags& toco_flags, const Model& model,
     case TFLITE: {
       toco::tflite::ExportParams params;
 
-      // Always allow custom ops when flex ops are allowed.
-      if (toco_flags.force_flex_ops() || toco_flags.allow_flex_ops()) {
-        params.allow_flex_ops = true;
-        params.allow_custom_ops = true;
-      } else if (allow_custom_ops) {
-        params.allow_custom_ops = true;
-      }
-
+      params.enable_select_tf_ops =
+          toco_flags.force_select_tf_ops() || toco_flags.enable_select_tf_ops();
+      params.allow_custom_ops = allow_custom_ops;
       params.quantize_weights = toco_flags.post_training_quantize();
 
-      toco::tflite::Export(model, output_file_contents, params);
+      auto status = toco::tflite::Export(model, output_file_contents, params);
+      if (!status.ok()) {
+        LOG(ERROR) << status.error_message();
+      }
+      return status;
     } break;
     case GRAPHVIZ_DOT:
       DumpGraphviz(model, output_file_contents);
       break;
     default:
-      LOG(FATAL) << "Unhandled output_format";
+      LOG(FATAL) << "Unhandled output_format='"
+                 << FileFormat_Name(toco_flags.output_format()) << "'";
   }
+  return tensorflow::Status();
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/toco_tooling.h b/tensorflow/lite/toco/toco_tooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..742e3769269859c62522707ba415cd509e8df629
--- /dev/null
+++ b/tensorflow/lite/toco/toco_tooling.h
@@ -0,0 +1,53 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_TOCO_TOOLING_H_
+#define TENSORFLOW_LITE_TOCO_TOCO_TOOLING_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+
+namespace toco {
+
+// Imports the input file into a Model object.
+std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
+                              const ModelFlags& model_flags,
+                              const string& input_file_contents);
+
+// Transforms a Model. The resulting Model is ready to be passed
+// to Export with the exact same toco_flags.
+void Transform(const TocoFlags& toco_flags, Model* model);
+
+// Exports the Model, which must be of the 'lowered' form returned by
+// Transform, to a file of the format given by
+// toco_flags.output_format().
+tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
+                          bool allow_custom_ops, string* output_file_contents);
+
+// This if for backward-compatibility with internal tools.
+inline void Export(const TocoFlags& toco_flags, const Model& model,
+                   string* output_file_contents) {
+  auto status = Export(toco_flags, model, true, output_file_contents);
+  if (!status.ok()) {
+    LOG(QFATAL) << status.error_message();
+  }
+}
+
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_TOCO_TOOLING_H_
diff --git a/tensorflow/contrib/lite/toco/toco_types.h b/tensorflow/lite/toco/toco_types.h
similarity index 88%
rename from tensorflow/contrib/lite/toco/toco_types.h
rename to tensorflow/lite/toco/toco_types.h
index 319f1066cdb33e60178f6db142712363d9f07f3d..da2efd6724a7042a3498b070f740947d0363f0da 100644
--- a/tensorflow/contrib/lite/toco/toco_types.h
+++ b/tensorflow/lite/toco/toco_types.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TYPES_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TYPES_H_
+#ifndef TENSORFLOW_LITE_TOCO_TOCO_TYPES_H_
+#define TENSORFLOW_LITE_TOCO_TOCO_TYPES_H_
 
 #include <string>
 #include "tensorflow/core/platform/platform.h"
@@ -42,4 +42,4 @@ using tensorflow::uint8;
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TYPES_H_
+#endif  // TENSORFLOW_LITE_TOCO_TOCO_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
similarity index 95%
rename from tensorflow/contrib/lite/toco/tooling_util.cc
rename to tensorflow/lite/toco/tooling_util.cc
index e3f27e9e2a43e0a7f6659c305beb01905c55dee7..af4cd386a209d82cb56a877410abe6fbdbf99c7b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 
 #include <functional>
 #include <iterator>
@@ -27,9 +27,9 @@ limitations under the License.
 #include "absl/strings/str_replace.h"
 #include "absl/strings/str_split.h"
 #include "re2/re2.h"
-#include "tensorflow/contrib/lite/toco/dump_graphviz.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
+#include "tensorflow/lite/toco/dump_graphviz.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/toco_graphviz_dump_options.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -87,6 +87,8 @@ string ArrayDataTypeName(ArrayDataType data_type) {
       return "String";
     case ArrayDataType::kBool:
       return "Bool";
+    case ArrayDataType::kComplex64:
+      return "Complex64";
     case ArrayDataType::kNone:
       return "None";
     default:
@@ -306,6 +308,7 @@ const char* OperatorTypeName(OperatorType type) {
 #define HANDLE_OPERATORTYPENAME_CASE(c) \
   case OperatorType::k##c:              \
     return #c;
+    HANDLE_OPERATORTYPENAME_CASE(Abs)
     HANDLE_OPERATORTYPENAME_CASE(Add)
     HANDLE_OPERATORTYPENAME_CASE(AddN)
     HANDLE_OPERATORTYPENAME_CASE(AveragePool)
@@ -369,6 +372,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Shape)
     HANDLE_OPERATORTYPENAME_CASE(Slice)
     HANDLE_OPERATORTYPENAME_CASE(Split)
+    HANDLE_OPERATORTYPENAME_CASE(SplitV)
     HANDLE_OPERATORTYPENAME_CASE(Sqrt)
     HANDLE_OPERATORTYPENAME_CASE(Square)
     HANDLE_OPERATORTYPENAME_CASE(Switch)
@@ -407,6 +411,11 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(CTCBeamSearchDecoder)
     HANDLE_OPERATORTYPENAME_CASE(Unpack)
     HANDLE_OPERATORTYPENAME_CASE(ZerosLike)
+    HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceLstm)
+    HANDLE_OPERATORTYPENAME_CASE(ResizeNearestNeighbor)
+    HANDLE_OPERATORTYPENAME_CASE(LeakyRelu)
+    HANDLE_OPERATORTYPENAME_CASE(SquaredDifference)
+    HANDLE_OPERATORTYPENAME_CASE(MirrorPad)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -435,6 +444,7 @@ bool OperatorSupportsFusedActivation(OperatorType type) {
     case OperatorType::kMaxPool:
     case OperatorType::kMul:
     case OperatorType::kSub:
+    case OperatorType::kSquaredDifference:
       return true;
     default:
       return false;
@@ -527,12 +537,12 @@ void DumpGraphvizVideoFrame(const Model& model) {
   if (!dump_hashes.count(hash)) {
     LOG(INFO) << "DUMPING GRAPHVIZ VIDEO FRAME: " << dump_id;
     dump_hashes.insert(hash);
-    CHECK(port::file::SetContents(
-              port::file::JoinPath(
-                  dump_options.dump_graphviz,
-                  toco::port::StringF("toco_video_%05d.dot", dump_id)),
-              graphviz_dump, port::file::Defaults())
-              .ok());
+    const auto result = port::file::SetContents(
+        port::file::JoinPath(
+            dump_options.dump_graphviz,
+            toco::port::StringF("toco_video_%05d.dot", dump_id)),
+        graphviz_dump, port::file::Defaults());
+    QCHECK(result.ok()) << result.error_message();
     dump_id++;
   }
 }
@@ -546,14 +556,13 @@ void LogDump(int log_level, const string& message, const Model& model) {
     string graphviz_dump;
 
     DumpGraphviz(model, &graphviz_dump);
-    CHECK(port::file::SetContents(
-              port::file::JoinPath(
-                  dump_options.dump_graphviz,
-                  absl::StrCat("toco_",
-                               absl::StrReplaceAll(message, {{" ", "_"}}),
-                               ".dot")),
-              graphviz_dump, port::file::Defaults())
-              .ok());
+    const auto result = port::file::SetContents(
+        port::file::JoinPath(
+            dump_options.dump_graphviz,
+            absl::StrCat("toco_", absl::StrReplaceAll(message, {{" ", "_"}}),
+                         ".dot")),
+        graphviz_dump, port::file::Defaults());
+    QCHECK(result.ok()) << result.error_message();
   }
 
   if (!VLOG_IS_ON(log_level)) {
@@ -737,15 +746,41 @@ bool CompareArrayBuffers(const Array& lhs_array, const Array& rhs_array) {
   }
   return true;
 }
+
+bool HaveSameMinMax(const Array& lhs_array, const Array& rhs_array) {
+  if (lhs_array.minmax || rhs_array.minmax) {
+    if (!lhs_array.minmax || !rhs_array.minmax) {
+      return false;
+    }
+    if (!(*lhs_array.minmax == *rhs_array.minmax)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool HaveSameQuantizationParams(const Array& lhs_array,
+                                const Array& rhs_array) {
+  if (lhs_array.quantization_params || rhs_array.quantization_params) {
+    if (!lhs_array.quantization_params || !rhs_array.quantization_params) {
+      return false;
+    }
+    if (!(*lhs_array.quantization_params == *rhs_array.quantization_params)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace
 
 bool CompareConstantArrays(const Array& lhs_array, const Array& rhs_array) {
-  bool attrs_equal =
-      lhs_array.shape() == rhs_array.shape() &&
-      lhs_array.data_type == rhs_array.data_type &&
-      lhs_array.final_data_type == rhs_array.final_data_type &&
-      lhs_array.minmax == rhs_array.minmax &&
-      lhs_array.quantization_params == rhs_array.quantization_params;
+  bool attrs_equal = lhs_array.shape() == rhs_array.shape() &&
+                     lhs_array.data_type == rhs_array.data_type &&
+                     lhs_array.final_data_type == rhs_array.final_data_type &&
+                     HaveSameMinMax(lhs_array, rhs_array) &&
+                     HaveSameQuantizationParams(lhs_array, rhs_array) &&
+                     lhs_array.narrow_range == rhs_array.narrow_range;
   if (!attrs_equal) {
     return false;
   }
@@ -772,6 +807,9 @@ bool CompareConstantArrays(const Array& lhs_array, const Array& rhs_array) {
       return CompareArrayBuffers<ArrayDataType::kUint64>(lhs_array, rhs_array);
     case ArrayDataType::kString:
       return CompareArrayBuffers<ArrayDataType::kString>(lhs_array, rhs_array);
+    case ArrayDataType::kComplex64:
+      return CompareArrayBuffers<ArrayDataType::kComplex64>(lhs_array,
+                                                            rhs_array);
     default:
       LOG(FATAL) << "Unsupported data type: "
                  << ArrayDataTypeName(lhs_array.data_type);
@@ -861,6 +899,9 @@ void CheckNonExistentIOArrays(const Model& model) {
         << "\" is not consumed by any op in this graph. " << general_comment;
   }
   for (const string& output_array : model.flags.output_arrays()) {
+    if (IsConstantParameterArray(model, output_array)) {
+      continue;  // It is OK to request that a constant be an output.
+    }
     QCHECK(GetOpWithOutput(model, output_array))
         << "Specified output array \"" << output_array
         << "\" is not produced by any op in this graph. " << general_comment;
@@ -898,12 +939,12 @@ void CheckNoMissingArray(const Model& model) {
 void FixNoMissingArray(Model* model) {
   for (const auto& op : model->operators) {
     for (const auto& input : op->inputs) {
-      if (!model->HasArray(input)) {
+      if (!model->HasArray(input) && !model->IsOptionalArray(input)) {
         model->GetOrCreateArray(input);
       }
     }
     for (const auto& output : op->outputs) {
-      if (!model->HasArray(output)) {
+      if (!model->HasArray(output) && !model->IsOptionalArray(output)) {
         model->GetOrCreateArray(output);
       }
     }
@@ -999,10 +1040,10 @@ void CheckEachArray(const Model& model) {
     if (colon_pos != string::npos) {
       CHECK_EQ(name.substr(colon_pos + 1).find_first_not_of("0123456789"),
                string::npos)
-          << "Array name must only have digits after colon";
+          << "Array '" << name << "' has non-digit characters after colon.";
     }
-    CHECK_GT(colon_pos, 0)
-        << "First character of array name must not be a colon.";
+    CHECK_GT(colon_pos, 0) << "Array '" << name
+                           << "' must not start with a colon.";
   }
 }
 
@@ -1237,11 +1278,15 @@ void DedupeConstantArrays(Model* model, size_t min_size) {
         lhs_array.final_data_type != ArrayDataType::kNone
             ? lhs_array.final_data_type
             : lhs_array.data_type;
-    size_t array_byte_size =
-        lhs_array.buffer->Length() * ElementSize(final_data_type);
-    if (array_byte_size < min_size) {
-      // Too small; skip.
-      continue;
+    // Ignore small arrays, don't check string arrays because it is not possible
+    // to estimate its size.
+    if (final_data_type != ArrayDataType::kString) {
+      size_t array_byte_size =
+          lhs_array.buffer->Length() * ElementSize(final_data_type);
+      if (array_byte_size < min_size) {
+        // Too small; skip.
+        continue;
+      }
     }
 
     auto next_lhs_array_it = lhs_array_it;
@@ -1380,6 +1425,9 @@ void CloneArray(Model* model, const string& source_array_name,
     case ArrayDataType::kString:
       CopyArrayBuffer<ArrayDataType::kString>(source_array, &target_array);
       break;
+    case ArrayDataType::kComplex64:
+      CopyArrayBuffer<ArrayDataType::kComplex64>(source_array, &target_array);
+      break;
     default:
       LOG(FATAL) << "Unsupported data type: "
                  << ArrayDataTypeName(source_array.data_type);
@@ -1687,6 +1735,8 @@ int ElementSize(ArrayDataType data_type) {
       return 8;
     case ArrayDataType::kUint64:
       return 8;
+    case ArrayDataType::kComplex64:
+      return 8;
 
     // Usually not critical limitation because strings are only input and/or
     // output.
@@ -1725,6 +1775,14 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
   if (!array->has_shape()) {
     return false;
   }
+
+  // The size of string tensors is rarely known ahead of time, so all transient
+  // tensors of this type will need to be dynamically allocated.
+  if (array->final_data_type == ArrayDataType::kString ||
+      array->data_type == ArrayDataType::kString) {
+    return false;
+  }
+
   return true;
 }
 
@@ -2165,6 +2223,8 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
       return ArrayDataType::kFloat;
     case QUANTIZED_UINT8:
       return ArrayDataType::kUint8;
+    case INT8:
+      return ArrayDataType::kInt8;
     case QUANTIZED_INT16:
       return ArrayDataType::kInt16;
     case INT32:
@@ -2173,6 +2233,10 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
       return ArrayDataType::kInt64;
     case BOOL:
       return ArrayDataType::kBool;
+    case STRING:
+      return ArrayDataType::kString;
+    case COMPLEX64:
+      return ArrayDataType::kComplex64;
     default:
       return ArrayDataType::kNone;
   }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/lite/toco/tooling_util.h
similarity index 96%
rename from tensorflow/contrib/lite/toco/tooling_util.h
rename to tensorflow/lite/toco/tooling_util.h
index 5f4b8cb66a2c5471158f761f61e4e49c78a95584..53131824b532853afc1660354de92da40db0da86 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/lite/toco/tooling_util.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
-#define TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+#ifndef TENSORFLOW_LITE_TOCO_TOOLING_UTIL_H_
+#define TENSORFLOW_LITE_TOCO_TOOLING_UTIL_H_
 
 #include <algorithm>
 #include <cmath>
@@ -28,12 +28,12 @@ limitations under the License.
 #if TOCO_SUPPORT_PORTABLE_PROTOS
 #include "third_party/protobuf/include/google/protobuf/text_format.h"
 #endif  // TOCO_SUPPORT_PORTABLE_PROTOS
-#include "tensorflow/contrib/lite/kernels/internal/types.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
-#include "tensorflow/contrib/lite/toco/types.pb.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/runtime/types.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+#include "tensorflow/lite/toco/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -338,8 +338,9 @@ tensorflow::Status NumElements(const std::vector<T>& shape, U* num_elements) {
       return tensorflow::errors::InvalidArgument(
           "Tensor shape should not include negative values");
     }
-    if (static_cast<uint64_t>(dim) >
-        std::numeric_limits<U>::max() / *num_elements) {
+    if (*num_elements != 0 &&
+        static_cast<uint64_t>(dim) >
+            std::numeric_limits<U>::max() / *num_elements) {
       *num_elements = 0;
       return tensorflow::errors::InvalidArgument("Tensor shape is too large");
     }
@@ -358,4 +359,4 @@ void CopyMinMaxAndQuantizationRelatedFields(const Array& src, Array* dst);
 
 }  // namespace toco
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+#endif  // TENSORFLOW_LITE_TOCO_TOOLING_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/tooling_util_test.cc b/tensorflow/lite/toco/tooling_util_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/toco/tooling_util_test.cc
rename to tensorflow/lite/toco/tooling_util_test.cc
index eb495646a2df0d0295eab54fcc5a5bf156a59d39..6f1c9c563ada01891b67094caa93cfd1847cdf6b 100644
--- a/tensorflow/contrib/lite/toco/tooling_util_test.cc
+++ b/tensorflow/lite/toco/tooling_util_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace toco {
@@ -109,6 +109,10 @@ TEST(NumElementsTest, Int) {
   EXPECT_TRUE(status.ok());
   EXPECT_EQ(count, 2146435072);
 
+  status = NumElements(std::vector<int>{1024, 0, 2048}, &count);
+  EXPECT_TRUE(status.ok());
+  EXPECT_EQ(count, 0);
+
   status = NumElements(std::vector<int>{1, 2, -3}, &count);
   EXPECT_EQ(status.error_message(), kNegativeValuesMessage);
 
diff --git a/tensorflow/contrib/lite/toco/types.proto b/tensorflow/lite/toco/types.proto
similarity index 89%
rename from tensorflow/contrib/lite/toco/types.proto
rename to tensorflow/lite/toco/types.proto
index 421667a83c14a738231488b1c14eb6071487bafb..fa911b8a4c80d96790fa16e34dbc3f114b522e45 100644
--- a/tensorflow/contrib/lite/toco/types.proto
+++ b/tensorflow/lite/toco/types.proto
@@ -40,4 +40,10 @@ enum IODataType {
 
   // Boolean
   BOOL = 7;
+
+  // Complex64, not quantized
+  COMPLEX64 = 8;
+
+  // Int8, quantized based on QuantizationParameters in schema.
+  INT8 = 9;
 }
diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1d141b5dd01a4a03c65d0c8a119ad62eea224d52
--- /dev/null
+++ b/tensorflow/lite/tools/BUILD
@@ -0,0 +1,96 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+
+common_copts = ["-Wall"]
+
+py_binary(
+    name = "visualize",
+    srcs = ["visualize.py"],
+    data = [
+        "//tensorflow/lite/schema:schema.fbs",
+        "//tensorflow/python:platform",
+        "@flatbuffers//:flatc",
+    ],
+    srcs_version = "PY2AND3",
+)
+
+tf_cc_binary(
+    name = "generate_op_registrations",
+    srcs = ["gen_op_registration_main.cc"],
+    deps = [
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite/tools:gen_op_registration",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "gen_op_registration",
+    srcs = ["gen_op_registration.cc"],
+    hdrs = ["gen_op_registration.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string",
+        "@com_googlesource_code_re2//:re2",
+    ],
+)
+
+cc_test(
+    name = "gen_op_registration_test",
+    srcs = ["gen_op_registration_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/0_subgraphs.bin",
+        "//tensorflow/lite:testdata/2_subgraphs.bin",
+        "//tensorflow/lite:testdata/empty_model.bin",
+        "//tensorflow/lite:testdata/test_model.bin",
+        "//tensorflow/lite:testdata/test_model_broken.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":gen_op_registration",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "verifier",
+    srcs = ["verifier.cc"],
+    hdrs = ["verifier.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+cc_test(
+    name = "verifier_test",
+    size = "small",
+    srcs = ["verifier_test.cc"],
+    tags = [
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":verifier",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/accuracy/BUILD b/tensorflow/lite/tools/accuracy/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..64475e057ae41534891bbce20324e8c588118e3c
--- /dev/null
+++ b/tensorflow/lite/tools/accuracy/BUILD
@@ -0,0 +1,328 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+
+common_linkopts = tflite_linkopts() + select({
+    "//conditions:default": [],
+    "//tensorflow:android": [
+        "-pie",
+        "-llog",
+    ],
+})
+
+cc_library(
+    name = "utils",
+    srcs = ["utils.cc"],
+    hdrs = ["utils.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "utils_test",
+    srcs = ["utils_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite:testdata/multi_add.bin)",
+    ],
+    data = ["//tensorflow/lite:testdata/multi_add.bin"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":utils",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework_internal",
+                "//tensorflow/core:lib",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "run_tflite_model_op",
+    srcs = ["run_tflite_model_op.cc"],
+    copts = tflite_copts(),
+    deps = [
+        ":utils",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:tensorflow",
+                "//tensorflow/core:protos_all_cc",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:framework",
+                "//tensorflow/core:lib",
+                "//tensorflow/core:ops",
+            ],
+        },
+    ),
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "android_required_build_flags",
+    srcs = ["android_required_build_flags.cc"],
+    copts = tflite_copts(),
+)
+
+tf_cc_test(
+    name = "run_tflite_model_op_test",
+    srcs = ["run_tflite_model_op_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite:testdata/multi_add.bin)",
+    ],
+    data = ["//tensorflow/lite:testdata/multi_add.bin"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        ":run_tflite_model_op",
+        ":android_required_build_flags",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:framework",
+                "//tensorflow/core:framework_internal",
+                "//tensorflow/core:lib",
+                "//tensorflow/core:ops",
+                "//tensorflow/core:protos_all_cc",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "stage",
+    hdrs = ["stage.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/cc:scope",
+    ],
+)
+
+cc_library(
+    name = "file_reader_stage",
+    srcs = ["file_reader_stage.cc"],
+    hdrs = ["file_reader_stage.h"],
+    deps = [
+        ":stage",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+    ],
+)
+
+tf_cc_test(
+    name = "file_reader_stage_test",
+    srcs = ["file_reader_stage_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":file_reader_stage",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core/kernels:android_whole_file_read_ops",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "run_tflite_model_stage",
+    srcs = ["run_tflite_model_stage.cc"],
+    hdrs = ["run_tflite_model_stage.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":run_tflite_model_op",
+        ":stage",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+    ],
+)
+
+cc_library(
+    name = "accuracy_eval_stage",
+    hdrs = ["accuracy_eval_stage.h"],
+    copts = tflite_copts(),
+    deps = [
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "eval_pipeline",
+    srcs = ["eval_pipeline.cc"],
+    hdrs = ["eval_pipeline.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":accuracy_eval_stage",
+        ":stage",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:core_cpu",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "eval_pipeline_test",
+    srcs = ["eval_pipeline_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":eval_pipeline",
+        "//tensorflow/cc:cc_ops",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:ops",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "eval_pipeline_builder",
+    srcs = ["eval_pipeline_builder.cc"],
+    hdrs = ["eval_pipeline_builder.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":eval_pipeline",
+        ":accuracy_eval_stage",
+        ":stage",
+        "@com_google_absl//absl/memory",
+        "//tensorflow/cc:cc_ops",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:ops",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "eval_pipeline_builder_test",
+    srcs = ["eval_pipeline_builder_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":eval_pipeline_builder",
+        "//tensorflow/cc:cc_ops",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:ops",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "csv_writer",
+    hdrs = ["csv_writer.h"],
+    copts = tflite_copts(),
+    deps = select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:lib",
+            ],
+        },
+    ),
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/tools/accuracy/README.md b/tensorflow/lite/tools/accuracy/README.md
similarity index 100%
rename from tensorflow/contrib/lite/tools/accuracy/README.md
rename to tensorflow/lite/tools/accuracy/README.md
diff --git a/tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h b/tensorflow/lite/tools/accuracy/accuracy_eval_stage.h
similarity index 88%
rename from tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h
rename to tensorflow/lite/tools/accuracy/accuracy_eval_stage.h
index 9cb843729aa8c127814be23f1183b5a9edcb1702..5a2ba3d2a7a2f13f10b4b8a060c99c4ee63cb6f2 100644
--- a/tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h
+++ b/tensorflow/lite/tools/accuracy/accuracy_eval_stage.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_ACCURACY_EVAL_STAGE_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_ACCURACY_EVAL_STAGE_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_ACCURACY_EVAL_STAGE_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_ACCURACY_EVAL_STAGE_H_
 
 #include <vector>
 
@@ -46,4 +46,4 @@ class AccuracyEval {
 };
 }  //  namespace metrics
 }  //  namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_ACCURACY_EVAL_STAGE_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_ACCURACY_EVAL_STAGE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/android_required_build_flags.cc b/tensorflow/lite/tools/accuracy/android_required_build_flags.cc
similarity index 100%
rename from tensorflow/contrib/lite/tools/accuracy/android_required_build_flags.cc
rename to tensorflow/lite/tools/accuracy/android_required_build_flags.cc
diff --git a/tensorflow/contrib/lite/tools/accuracy/csv_writer.h b/tensorflow/lite/tools/accuracy/csv_writer.h
similarity index 92%
rename from tensorflow/contrib/lite/tools/accuracy/csv_writer.h
rename to tensorflow/lite/tools/accuracy/csv_writer.h
index 806b0d9418e8b03b92c0f33b6d531ce248ae43a6..d74a803ce18766c23c0e6ee0d574bc19daf83361 100644
--- a/tensorflow/contrib/lite/tools/accuracy/csv_writer.h
+++ b/tensorflow/lite/tools/accuracy/csv_writer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_CSV_WRITER_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_CSV_WRITER_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_CSV_WRITER_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_CSV_WRITER_H_
 
 #include <fstream>
 #include <vector>
@@ -76,4 +76,4 @@ class CSVWriter {
 };
 }  // namespace metrics
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_CSV_WRITER_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_CSV_WRITER_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.cc b/tensorflow/lite/tools/accuracy/eval_pipeline.cc
similarity index 95%
rename from tensorflow/contrib/lite/tools/accuracy/eval_pipeline.cc
rename to tensorflow/lite/tools/accuracy/eval_pipeline.cc
index a03aba6a2685db7a535829f98303174e9399b94d..658824a7d03fe6dd514c2ab48d85e341a89e26cc 100644
--- a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.cc
+++ b/tensorflow/lite/tools/accuracy/eval_pipeline.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h"
+#include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
 
 namespace tensorflow {
 namespace metrics {
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h b/tensorflow/lite/tools/accuracy/eval_pipeline.h
similarity index 89%
rename from tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h
rename to tensorflow/lite/tools/accuracy/eval_pipeline.h
index c9cfc866139da86d7de2036a07315e66dfaf60f0..1ec21b07e8bce1b17a63b75ee129b43a0c74d74f 100644
--- a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h
+++ b/tensorflow/lite/tools/accuracy/eval_pipeline.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
 
 #include <string>
 
-#include "tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h"
-#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+#include "tensorflow/lite/tools/accuracy/accuracy_eval_stage.h"
+#include "tensorflow/lite/tools/accuracy/stage.h"
 #include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
@@ -84,4 +84,4 @@ class EvalPipeline {
 };
 }  //  namespace metrics
 }  //  namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.cc b/tensorflow/lite/tools/accuracy/eval_pipeline_builder.cc
similarity index 97%
rename from tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.cc
rename to tensorflow/lite/tools/accuracy/eval_pipeline_builder.cc
index 2e16437e1588b400b915a488e402a52efa3b755c..1b360d31b36e5715ee5ba70b51a536b525343ae0 100644
--- a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.cc
+++ b/tensorflow/lite/tools/accuracy/eval_pipeline_builder.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h"
+#include "tensorflow/lite/tools/accuracy/eval_pipeline_builder.h"
 
 #include "absl/memory/memory.h"
 #include "tensorflow/cc/ops/standard_ops.h"
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h b/tensorflow/lite/tools/accuracy/eval_pipeline_builder.h
similarity index 89%
rename from tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h
rename to tensorflow/lite/tools/accuracy/eval_pipeline_builder.h
index 692db022f8bc747979337dec7f08af9fcb6932fa..18b52ac7bea3617fc682fe290d2d9fe704c0f839 100644
--- a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h
+++ b/tensorflow/lite/tools/accuracy/eval_pipeline_builder.h
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
 
 #include <memory>
 #include <string>
 
-#include "tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h"
-#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h"
-#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+#include "tensorflow/lite/tools/accuracy/accuracy_eval_stage.h"
+#include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
+#include "tensorflow/lite/tools/accuracy/stage.h"
 
 namespace tensorflow {
 namespace metrics {
@@ -96,4 +96,4 @@ class EvalPipelineBuilder {
 
 }  //  namespace metrics
 }  //  namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_EVAL_PIPELINE_BUILDER_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder_test.cc b/tensorflow/lite/tools/accuracy/eval_pipeline_builder_test.cc
similarity index 99%
rename from tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder_test.cc
rename to tensorflow/lite/tools/accuracy/eval_pipeline_builder_test.cc
index 2d41929b7920f403cb6b9858a7c54cb13273fb95..9bf725439c486d776cdd76911ee8072871bc41e9 100644
--- a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder_test.cc
+++ b/tensorflow/lite/tools/accuracy/eval_pipeline_builder_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h"
+#include "tensorflow/lite/tools/accuracy/eval_pipeline_builder.h"
 #include <gtest/gtest.h>
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/public/session.h"
diff --git a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_test.cc b/tensorflow/lite/tools/accuracy/eval_pipeline_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/tools/accuracy/eval_pipeline_test.cc
rename to tensorflow/lite/tools/accuracy/eval_pipeline_test.cc
index ea0f6e19df46d8934dc9eabb1c57a01bb5e91a1f..53cbf8ccd5b7fd10770ccdb3aeebca1414655082 100644
--- a/tensorflow/contrib/lite/tools/accuracy/eval_pipeline_test.cc
+++ b/tensorflow/lite/tools/accuracy/eval_pipeline_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h"
+#include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
 #include <gtest/gtest.h>
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/public/session.h"
diff --git a/tensorflow/contrib/lite/tools/accuracy/file_reader_stage.cc b/tensorflow/lite/tools/accuracy/file_reader_stage.cc
similarity index 93%
rename from tensorflow/contrib/lite/tools/accuracy/file_reader_stage.cc
rename to tensorflow/lite/tools/accuracy/file_reader_stage.cc
index 61bed369f8b4f659ee12834efdc23f6315dd8d42..a106a79a4baedc3fe10876c8efda9ba45c606a67 100644
--- a/tensorflow/contrib/lite/tools/accuracy/file_reader_stage.cc
+++ b/tensorflow/lite/tools/accuracy/file_reader_stage.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h"
+#include "tensorflow/lite/tools/accuracy/file_reader_stage.h"
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
diff --git a/tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h b/tensorflow/lite/tools/accuracy/file_reader_stage.h
similarity index 81%
rename from tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h
rename to tensorflow/lite/tools/accuracy/file_reader_stage.h
index 18db5837c1717ca5be966d8a4d764ea88d2674d3..19655e96973498c5dae1beb38debad0c55612155 100644
--- a/tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h
+++ b/tensorflow/lite/tools/accuracy/file_reader_stage.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
 
 #include <string>
 
-#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+#include "tensorflow/lite/tools/accuracy/stage.h"
 
 namespace tensorflow {
 namespace metrics {
@@ -34,4 +34,4 @@ class FileReaderStage : public Stage {
 };
 }  //  namespace metrics
 }  //  namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_FILE_READER_STAGE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/file_reader_stage_test.cc b/tensorflow/lite/tools/accuracy/file_reader_stage_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/tools/accuracy/file_reader_stage_test.cc
rename to tensorflow/lite/tools/accuracy/file_reader_stage_test.cc
index a75f99187d6ea0918398899ccef1511faa3ee0a6..21be0a766b5ec47da404ee9feaf42cba361f0354 100644
--- a/tensorflow/contrib/lite/tools/accuracy/file_reader_stage_test.cc
+++ b/tensorflow/lite/tools/accuracy/file_reader_stage_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <memory>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h"
+#include "tensorflow/lite/tools/accuracy/file_reader_stage.h"
 #include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/BUILD b/tensorflow/lite/tools/accuracy/ilsvrc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a4d21961a6f00ea288550a0b054519ebe1e7abdd
--- /dev/null
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/BUILD
@@ -0,0 +1,182 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+
+common_linkopts = tflite_linkopts() + select({
+    "//conditions:default": [],
+    "//tensorflow:android": [
+        "-pie",
+        "-llog",
+    ],
+})
+
+cc_library(
+    name = "inception_preprocessing",
+    srcs = ["inception_preprocessing.cc"],
+    hdrs = ["inception_preprocessing.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite/tools/accuracy:android_required_build_flags",
+        "//tensorflow/lite/tools/accuracy:stage",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core/kernels:android_tensorflow_image_op",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:tensorflow",
+                "//tensorflow/core:protos_all_cc",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:framework",
+                "//tensorflow/core:lib",
+                "//tensorflow/core:ops",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "inception_preprocessing_test",
+    srcs = ["inception_preprocessing_test.cc"],
+    args = [
+        "--test_image=$(location :testdata/grace_hopper.jpg)",
+    ],
+    data = [":testdata/grace_hopper.jpg"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    tags = [
+        "no_oss",  # b/114307765
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":inception_preprocessing",
+        "//tensorflow/lite/tools/accuracy:android_required_build_flags",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:framework_internal",
+                "//tensorflow/core:lib",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "imagenet_topk_eval",
+    srcs = ["imagenet_topk_eval.cc"],
+    hdrs = ["imagenet_topk_eval.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite/tools/accuracy:accuracy_eval_stage",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:lib",
+            ],
+        },
+    ),
+)
+
+tf_cc_test(
+    name = "imagenet_topk_eval_test",
+    srcs = ["imagenet_topk_eval_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":imagenet_topk_eval",
+        "@com_google_googletest//:gtest",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+            ],
+        },
+    ),
+)
+
+cc_library(
+    name = "imagenet_model_evaluator",
+    srcs = ["imagenet_model_evaluator.cc"],
+    hdrs = ["imagenet_model_evaluator.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":imagenet_topk_eval",
+        ":inception_preprocessing",
+        "//tensorflow/lite/tools/accuracy:android_required_build_flags",
+        "//tensorflow/lite/tools/accuracy:eval_pipeline",
+        "//tensorflow/lite/tools/accuracy:eval_pipeline_builder",
+        "//tensorflow/lite/tools/accuracy:file_reader_stage",
+        "//tensorflow/lite/tools/accuracy:run_tflite_model_stage",
+        "//tensorflow/lite/tools/accuracy:utils",
+        "@com_google_absl//absl/memory",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core/kernels:android_whole_file_read_ops",
+                "//tensorflow/core/kernels:android_tensorflow_image_op",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:tensorflow",
+                "//tensorflow/core:lib_internal",
+                "//tensorflow/core:framework_internal",
+                "//tensorflow/core:framework",
+                "//tensorflow/core:lib",
+                "//tensorflow/core:core_cpu",
+            ],
+        },
+    ),
+)
+
+tf_cc_binary(
+    name = "imagenet_accuracy_eval",
+    srcs = ["imagenet_accuracy_eval.cc"],
+    copts = tflite_copts(),
+    linkopts = common_linkopts,
+    deps = [
+        ":imagenet_model_evaluator",
+        ":imagenet_topk_eval",
+        "@com_google_absl//absl/memory",
+        "//tensorflow/lite/tools/accuracy:android_required_build_flags",
+        "//tensorflow/lite/tools/accuracy:csv_writer",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:lib",
+                "//tensorflow/core:framework_internal",
+            ],
+        },
+    ),
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/README.md b/tensorflow/lite/tools/accuracy/ilsvrc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac3a1566e2a2c834260acbfbee8908cc13efa42a
--- /dev/null
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/README.md
@@ -0,0 +1,146 @@
+## Accuracy evaluation for ILSVRC 2012 (Imagenet Large Scale Visual Recognition Challenge) image classification task
+
+This binary can evaluate the accuracy of TFLite models trained for the [ILSVRC 2012 image classification task]
+(http://www.image-net.org/challenges/LSVRC/2012/).
+The binary takes the path to validation images and labels as inputs. It outputs the accuracy after running the TFLite model on the validation sets.
+
+To run the binary download the ILSVRC 2012 devkit [see instructions](#downloading-ilsvrc) and run the [`generate_validation_ground_truth` script](#ground-truth-label-generation) to generate the ground truth labels.
+
+## Parameters
+The binary takes the following parameters:
+
+*   `model_file` : `string` \
+    Path to the TFlite model file.
+
+*   `ground_truth_images_path`: `string` \
+    The path to the directory containing ground truth images.
+
+*   `ground_truth_labels`: `string` \
+    Path to ground truth labels file. This file should contain the same number of labels as    the number images in the ground truth directory. The labels are assumed to be in the
+    same order as the sorted filename of images. See [ground truth label generation](#ground-truth-label-generation)
+    section for more information about how to generate labels for images.
+
+*    `model_output_labels`: `string` \
+    Path to the file containing labels, that is used to interpret the output of
+    the model. E.g. in case of mobilenets, this is the path to
+    `mobilenet_labels.txt` where each label is in the same order as the output
+    1001 dimension tensor.
+
+*   `output_path`: `string` \
+    This is the path to the output file. The output is a CSV file that has top-10 accuracies in each row. Each line of output file is the cumulative accuracy after processing images in a sorted order. So first line is accuracy after processing the first image, second line is accuracy after procesing first two images. The last line of the file is accuracy after processing the entire validation set.
+
+and the following optional parameters:
+
+*   `blacklist_file_path`: `string` \
+    Path to blacklist file. This file contains the indices of images that are blacklisted for evaluation. 1762 images are blacklisted in ILSVRC dataset. For details please refer to readme.txt of ILSVRC2014 devkit.
+
+*   `num_images`: `int` (default=0) \
+    The number of images to process, if 0, all images in the directory are processed otherwise only num_images will be processed.
+
+*   `num_threads`: `int` (default=4) \
+    The number of threads to use for evaluation.
+
+
+## Downloading ILSVRC
+In order to use this tool to run evaluation on the full 50K ImageNet dataset,
+download the data set from http://image-net.org/request.
+
+## Ground truth label generation
+The ILSVRC 2012 devkit `validation_ground_truth.txt` contains IDs that correspond to synset of the image. 
+The accuracy binary however expects the ground truth labels to contain the actual name of 
+category instead of synset ids. A conversion script has been provided to convert the validation ground truth to
+category labels. The `validation_ground_truth.txt` can be converted by the following steps:
+
+```
+ILSVRC_2012_DEVKIT_DIR=[set to path to ILSVRC 2012 devkit]
+VALIDATION_LABELS=[set to  path to output]
+
+python generate_validation_labels.py -- \
+--ilsvrc_devkit_dir=${ILSVRC_2012_DEVKIT_DIR} \
+--validation_labels_output=${VALIDATION_LABELS}
+```
+
+## Running the binary
+
+### On Android
+
+(0) Refer to https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android for configuring NDK and SDK.
+
+(1) Build using the following command:
+
+```
+bazel build -c opt \
+  --config=android_arm \
+  --config=monolithic \
+  --cxxopt='--std=c++11' \
+  --copt=-D__ANDROID_TYPES_FULL__ \
+  --copt=-DSUPPORT_SELECTIVE_REGISTRATION \
+  //tensorflow/lite/tools/accuracy/ilsvrc:imagenet_accuracy_eval
+```
+
+(2) Connect your phone. Push the binary to your phone with adb push
+     (make the directory if required):
+
+```
+adb push bazel-bin/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval /data/local/tmp
+```
+
+(3) Make the binary executable.
+
+```
+adb shell chmod +x /data/local/tmp/imagenet_accuracy_eval
+```
+
+(4) Push the TFLite model  that you need to test. For example:
+
+```
+adb push mobilenet_quant_v1_224.tflite /data/local/tmp
+```
+
+(5) Push the imagenet images to device, make sure device has sufficient storage available before pushing the dataset:
+
+```
+adb shell mkdir /data/local/tmp/ilsvrc_images && \
+adb push ${IMAGENET_IMAGES_DIR} /data/local/tmp/ilsvrc_images
+```
+
+(6) Push the generated validation ground labels to device.
+
+```
+adb push ${VALIDATION_LABELS} /data/local/tmp/ilsvrc_validation_labels.txt
+```
+
+(7) Push the model labels text file to device.
+
+```
+adb push ${MODEL_LABELS_TXT} /data/local/tmp/model_output_labels.txt
+```
+
+(8) Run the binary.
+
+```
+adb shell /data/local/tmp/imagenet_accuracy_eval \
+  --model_file=/data/local/tmp/mobilenet_quant_v1_224.tflite \
+  --ground_truth_images_path=/data/local/tmp/ilsvrc_images \
+  --ground_truth_labels=/data/local/tmp/ilsvrc_validation_labels.txt \
+  --model_output_labels=/data/local/tmp/model_output_labels.txt \
+  --output_file_path=/data/local/tmp/accuracy_output.txt \
+  --num_images=0 # Run on all images.
+```
+
+###  On Desktop
+
+(1) Build and run using the following command:
+
+```
+bazel run -c opt \
+  --cxxopt='--std=c++11' \
+  -- \
+  //tensorflow/lite/tools/accuracy/ilsvrc:imagenet_accuracy_eval \
+  --model_file=mobilenet_quant_v1_224.tflite \
+  --ground_truth_images_path=${IMAGENET_IMAGES_DIR} \
+  --ground_truth_labels=${VALIDATION_LABELS} \
+  --model_output_labels=${MODEL_LABELS_TXT} \
+  --output_file_path=/tmp/accuracy_output.txt \
+  --num_images=0 # Run on all images.
+```
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/clsloc_validation_blacklist.txt b/tensorflow/lite/tools/accuracy/ilsvrc/clsloc_validation_blacklist.txt
similarity index 100%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/clsloc_validation_blacklist.txt
rename to tensorflow/lite/tools/accuracy/ilsvrc/clsloc_validation_blacklist.txt
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/generate_validation_labels.py b/tensorflow/lite/tools/accuracy/ilsvrc/generate_validation_labels.py
similarity index 100%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/generate_validation_labels.py
rename to tensorflow/lite/tools/accuracy/ilsvrc/generate_validation_labels.py
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
similarity index 96%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
rename to tensorflow/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
index 2a8a2b9b59db062626d489159de7175a8803d4fc..090a023c02727c3c13fda1391d311cbeffa869a0 100644
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval.cc
@@ -17,9 +17,9 @@ limitations under the License.
 #include <memory>
 
 #include "absl/memory/memory.h"
-#include "tensorflow/contrib/lite/tools/accuracy/csv_writer.h"
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h"
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+#include "tensorflow/lite/tools/accuracy/csv_writer.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
similarity index 95%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
rename to tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
index 63616fc3b4b0666c420200b559636e9568cf3ab0..9a74e221c13e72c286512175a7f633c87f75eedd 100644
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h"
 
 #include <fstream>
 #include <iomanip>
@@ -22,13 +22,13 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline.h"
-#include "tensorflow/contrib/lite/tools/accuracy/eval_pipeline_builder.h"
-#include "tensorflow/contrib/lite/tools/accuracy/file_reader_stage.h"
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
-#include "tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h"
-#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+#include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
+#include "tensorflow/lite/tools/accuracy/eval_pipeline_builder.h"
+#include "tensorflow/lite/tools/accuracy/file_reader_stage.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
+#include "tensorflow/lite/tools/accuracy/run_tflite_model_stage.h"
+#include "tensorflow/lite/tools/accuracy/utils.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/init_main.h"
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
similarity index 91%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
rename to tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
index 97e4232b358cab4f3b60d2a1eb8291e2e7931c8e..c3c49e9a51b525cfe012b55e04fd2d5ef50cf90c 100644
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_MODEL_EVALUATOR_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_MODEL_EVALUATOR_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_IMAGENET_MODEL_EVALUATOR_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_IMAGENET_MODEL_EVALUATOR_H_
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
-#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+#include "tensorflow/lite/tools/accuracy/utils.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -121,4 +121,4 @@ class ImagenetModelEvaluator {
 
 }  // namespace metrics
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_ILSVRC_IMAGENET_MODEL_EVALUATOR_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_IMAGENET_MODEL_EVALUATOR_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
similarity index 98%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
rename to tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
index c75baa82b1d013431b0c9f96c8183b298641e5eb..2b086cdf7075d7e6328ce0a41b17ca611ea3c4e2 100644
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
 
 #include <numeric>
 
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h
similarity index 91%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h
rename to tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h
index cad646a30ca96be011d9c4692904699f24e5bc22..e1fc445abf41b537f49cf5e909b0ec3ce72adb2e 100644
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_TOPK_EVAL_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_IMAGENET_TOPK_EVAL_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_IMAGENET_TOPK_EVAL_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_IMAGENET_TOPK_EVAL_H_
 
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/tools/accuracy/accuracy_eval_stage.h"
+#include "tensorflow/lite/tools/accuracy/accuracy_eval_stage.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/mutex.h"
 
@@ -80,4 +80,4 @@ class ImagenetTopKAccuracy : public AccuracyEval {
 }  //  namespace metrics
 }  //  namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_ILSVRC_IMAGENET_TOPK_EVAL_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_IMAGENET_TOPK_EVAL_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc
rename to tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc
index ff332af5c5e56ec2e14b9e4ee509c6344be22c66..61b7afc552de60ef2a48fa3dfe88158f35108cc5 100644
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.h"
 #include <gtest/gtest.h>
 
 namespace tensorflow {
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
similarity index 97%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
rename to tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
index 7512b39c32f98faed9b41f829666bf1d4d145d82..9a889f0dd88bc4c51b2c060baf0e89c126c98c1f 100644
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 
 #include <memory>
 
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
similarity index 89%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
rename to tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
index 15df71981756f6171b8e12bd9ed2a337c4867b64..4a1d3ce4769d1a7d3f46f39941eb3e9bcde7785c 100644
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
 
 #include <utility>
 
-#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+#include "tensorflow/lite/tools/accuracy/stage.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -72,4 +72,4 @@ class InceptionPreprocessingStage : public Stage {
 }  // namespace metrics
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
rename to tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
index 3587878ba3cadd13eb0af4c004f4f98184daf5de..5d0e01d7d18c451b978edbd08fc27934c8379961 100644
--- a/tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg b/tensorflow/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d2a427810f679db537236c5430873a81a62ef412
Binary files /dev/null and b/tensorflow/lite/tools/accuracy/ilsvrc/testdata/grace_hopper.jpg differ
diff --git a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op.cc b/tensorflow/lite/tools/accuracy/run_tflite_model_op.cc
similarity index 95%
rename from tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op.cc
rename to tensorflow/lite/tools/accuracy/run_tflite_model_op.cc
index da4258f1c131076f564f0002a3cd99b221a18852..5f413b8ee39324ecc4e74007ceb61689065ce9a8 100644
--- a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op.cc
+++ b/tensorflow/lite/tools/accuracy/run_tflite_model_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/op_resolver.h"
-#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/tools/accuracy/utils.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
diff --git a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op_test.cc b/tensorflow/lite/tools/accuracy/run_tflite_model_op_test.cc
similarity index 100%
rename from tensorflow/contrib/lite/tools/accuracy/run_tflite_model_op_test.cc
rename to tensorflow/lite/tools/accuracy/run_tflite_model_op_test.cc
diff --git a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.cc b/tensorflow/lite/tools/accuracy/run_tflite_model_stage.cc
similarity index 95%
rename from tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.cc
rename to tensorflow/lite/tools/accuracy/run_tflite_model_stage.cc
index c96795d4994ae3bee88da6ac6d26033c981b8d6a..6082290c0bc4fb6c7cbd0dc35f7ca7721103a9a5 100644
--- a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.cc
+++ b/tensorflow/lite/tools/accuracy/run_tflite_model_stage.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h"
+#include "tensorflow/lite/tools/accuracy/run_tflite_model_stage.h"
 
 #include <vector>
 
diff --git a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h b/tensorflow/lite/tools/accuracy/run_tflite_model_stage.h
similarity index 85%
rename from tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h
rename to tensorflow/lite/tools/accuracy/run_tflite_model_stage.h
index 90d12d6f424516859d6ca65c162663de44eeb391..61034491777a8d762339144f938d303ef0b8d29c 100644
--- a/tensorflow/contrib/lite/tools/accuracy/run_tflite_model_stage.h
+++ b/tensorflow/lite/tools/accuracy/run_tflite_model_stage.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
 
 #include <string>
 
-#include "tensorflow/contrib/lite/tools/accuracy/stage.h"
+#include "tensorflow/lite/tools/accuracy/stage.h"
 
 namespace tensorflow {
 namespace metrics {
@@ -50,4 +50,4 @@ class RunTFLiteModelStage : public Stage {
 
 }  //  namespace metrics
 }  //  namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_RUN_TFLITE_MODEL_STAGE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/stage.h b/tensorflow/lite/tools/accuracy/stage.h
similarity index 90%
rename from tensorflow/contrib/lite/tools/accuracy/stage.h
rename to tensorflow/lite/tools/accuracy/stage.h
index 8292ea2ec735dc6946a4516483b9b97e685e4949..0a9e3fbd055e67ff7764911df0dddc606ca488d1 100644
--- a/tensorflow/contrib/lite/tools/accuracy/stage.h
+++ b/tensorflow/lite/tools/accuracy/stage.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_STAGE_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_STAGE_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_STAGE_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_STAGE_H_
 
 #include "tensorflow/cc/framework/scope.h"
 
@@ -53,4 +53,4 @@ class Stage {
 }  //  namespace metrics
 }  //  namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_STAGE_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_STAGE_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/utils.cc b/tensorflow/lite/tools/accuracy/utils.cc
similarity index 92%
rename from tensorflow/contrib/lite/tools/accuracy/utils.cc
rename to tensorflow/lite/tools/accuracy/utils.cc
index f5493301fc4d781418cc5c7397bae02ecc155c56..c19dc1ff7cca10745a367c027bef1067d117eb4a 100644
--- a/tensorflow/contrib/lite/tools/accuracy/utils.cc
+++ b/tensorflow/lite/tools/accuracy/utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+#include "tensorflow/lite/tools/accuracy/utils.h"
 
 #include <sys/stat.h>
 
@@ -22,10 +22,10 @@ limitations under the License.
 #include <memory>
 #include <string>
 
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/op_resolver.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
 
 namespace tensorflow {
 namespace metrics {
diff --git a/tensorflow/contrib/lite/tools/accuracy/utils.h b/tensorflow/lite/tools/accuracy/utils.h
similarity index 85%
rename from tensorflow/contrib/lite/tools/accuracy/utils.h
rename to tensorflow/lite/tools/accuracy/utils.h
index 37cbad4d51fd0ddf700b14ead037ae4aeed4d82a..5b7639317eff0b2404dd0de780b992805f659697 100644
--- a/tensorflow/contrib/lite/tools/accuracy/utils.h
+++ b/tensorflow/lite/tools/accuracy/utils.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_UTILS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_UTILS_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_UTILS_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_UTILS_H_
 
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/lite/context.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
@@ -43,4 +43,4 @@ Status ReadFileLines(const string& file_path,
 }  // namespace utils
 }  // namespace metrics
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_ACCURACY_UTILS_H_
+#endif  // TENSORFLOW_LITE_TOOLS_ACCURACY_UTILS_H_
diff --git a/tensorflow/contrib/lite/tools/accuracy/utils_test.cc b/tensorflow/lite/tools/accuracy/utils_test.cc
similarity index 97%
rename from tensorflow/contrib/lite/tools/accuracy/utils_test.cc
rename to tensorflow/lite/tools/accuracy/utils_test.cc
index 727eba21b6c6005d367130b23e31bc223508bc60..401872f18ffb3a5a35d2712b90b4624244b68917 100644
--- a/tensorflow/contrib/lite/tools/accuracy/utils_test.cc
+++ b/tensorflow/lite/tools/accuracy/utils_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/tools/accuracy/utils.h"
+#include "tensorflow/lite/tools/accuracy/utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bc47406cd92d406a0900743986ea67a4ba39240e
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/BUILD
@@ -0,0 +1,151 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow/lite:build_def.bzl", "tflite_linkopts")
+
+common_copts = ["-Wall"] + tflite_copts()
+
+cc_library(
+    name = "logging",
+    hdrs = ["logging.h"],
+    copts = common_copts,
+)
+
+cc_binary(
+    name = "benchmark_model",
+    srcs = [
+        "benchmark_main.cc",
+    ],
+    copts = common_copts,
+    linkopts = tflite_linkopts() + select({
+        "//tensorflow:android": [
+            "-pie",  # Android 5.0 and later supports only PIE
+            "-lm",  # some builtin ops, e.g., tanh, need -lm
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":benchmark_tflite_model_lib",
+        ":logging",
+    ],
+)
+
+cc_binary(
+    name = "benchmark_model_plus_flex",
+    srcs = [
+        "benchmark_plus_flex_main.cc",
+    ],
+    copts = common_copts,
+    linkopts = tflite_linkopts() + select({
+        "//tensorflow:android": [
+            "-pie",  # Android 5.0 and later supports only PIE
+            "-lm",  # some builtin ops, e.g., tanh, need -lm
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":benchmark_tflite_model_lib",
+        ":logging",
+        "//tensorflow/lite/delegates/flex:delegate",
+        "//tensorflow/lite/testing:init_tensorflow",
+    ],
+)
+
+cc_test(
+    name = "benchmark_test",
+    srcs = ["benchmark_test.cc"],
+    args = [
+        "--graph=$(location //tensorflow/lite:testdata/multi_add.bin)",
+    ],
+    data = ["//tensorflow/lite:testdata/multi_add.bin"],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":benchmark_tflite_model_lib",
+        ":command_line_flags",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "command_line_flags",
+    srcs = ["command_line_flags.cc"],
+    hdrs = ["command_line_flags.h"],
+    copts = common_copts,
+)
+
+cc_test(
+    name = "command_line_flags_test",
+    srcs = ["command_line_flags_test.cc"],
+    copts = common_copts,
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
+    visibility = ["//visibility:private"],
+    deps = [
+        ":command_line_flags",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "benchmark_tflite_model_lib",
+    srcs = [
+        "benchmark_tflite_model.cc",
+        "logging.h",
+    ],
+    hdrs = ["benchmark_tflite_model.h"],
+    copts = common_copts,
+    deps = [
+        ":benchmark_model_lib",
+        ":logging",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/profiling:profile_summarizer",
+        "@gemmlowp",
+    ],
+)
+
+cc_library(
+    name = "benchmark_params",
+    srcs = [
+        "benchmark_params.cc",
+    ],
+    hdrs = ["benchmark_params.h"],
+    copts = common_copts,
+    deps = [":logging"],
+)
+
+cc_library(
+    name = "benchmark_model_lib",
+    srcs = [
+        "benchmark_model.cc",
+    ],
+    hdrs = ["benchmark_model.h"],
+    copts = common_copts,
+    deps = [
+        ":benchmark_params",
+        ":command_line_flags",
+        ":logging",
+        "//tensorflow/core:stats_calculator_portable",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/profiling:profile_summarizer",
+        "//tensorflow/lite/profiling:profiler",
+        "//tensorflow/lite/profiling:time",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a4d9c879eb645019a7626502207e9a3f4e89b1c1
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/README.md
@@ -0,0 +1,202 @@
+# TFLite Model Benchmark Tool
+
+## Description
+
+A simple C++ binary to benchmark a TFLite model and its individual operators,
+both on desktop machines and on Android. The binary takes a TFLite model,
+generates random inputs and then repeatedly runs the model for specified number
+of runs. Aggregrate latency statistics are reported after running the benchmark.
+
+The instructions below are for running the binary on Desktop and Android,
+for iOS please use the
+[iOS benchmark app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/ios).
+
+An experimental Android APK wrapper for the benchmark model utility offers more
+faithful execution behavior on Android (via a foreground Activity). It is
+located
+[here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/android).
+
+## Parameters
+
+The binary takes the following required parameters:
+
+*   `graph`: `string` \
+    The path to the TFLite model file.
+
+and the following optional parameters:
+
+*   `num_threads`: `int` (default=1) \
+    The number of threads to use for running TFLite interpreter.
+*   `warmup_runs`: `int` (default=1) \
+    The number of warmup runs to do before starting the benchmark.
+*   `num_runs`: `int` (default=50) \
+    The number of runs. Increase this to reduce variance.
+*   `run_delay`: `float` (default=-1.0) \
+    The delay in seconds between subsequent benchmark runs. Non-positive values
+    mean use no delay.
+*   `use_nnapi`: `bool` (default=false) \
+    Whether to use [Android NNAPI](https://developer.android.com/ndk/guides/neuralnetworks/).
+    This API is available on recent Android devices.
+
+## To build/install/run
+
+### On Android:
+
+(0) Refer to https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android to edit the `WORKSPACE` to configure the android NDK/SDK.
+
+(1) Build for your specific platform, e.g.:
+
+```
+bazel build -c opt \
+  --config=android_arm \
+  --cxxopt='--std=c++11' \
+  tensorflow/lite/tools/benchmark:benchmark_model
+```
+
+(2) Connect your phone. Push the binary to your phone with adb push
+     (make the directory if required):
+
+```
+adb push bazel-bin/tensorflow/lite/tools/benchmark/benchmark_model /data/local/tmp
+```
+
+(3) Make the binary executable.
+
+```
+adb shell chmod +x /data/local/tmp/benchmark_model
+```
+
+(4) Push the compute graph that you need to test. For example:
+
+```
+adb push mobilenet_quant_v1_224.tflite /data/local/tmp
+```
+
+(5) Run the benchmark. For example:
+
+```
+adb shell /data/local/tmp/benchmark_model \
+  --graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \
+  --num_threads=4
+```
+
+### On desktop:
+(1) build the binary
+
+```
+bazel build -c opt tensorflow/lite/tools/benchmark:benchmark_model
+```
+
+(2) Run on your compute graph, similar to the Android case but without the need of adb shell.
+For example:
+
+```
+bazel-bin/tensorflow/lite/tools/benchmark/benchmark_model \
+  --graph=mobilenet_quant_v1_224.tflite \
+  --num_threads=4
+```
+
+The MobileNet graph used as an example here may be downloaded from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip).
+
+
+## Reducing variance between runs on Android.
+
+Most modern Android phones use [ARM big.LITTLE](https://en.wikipedia.org/wiki/ARM_big.LITTLE)
+architecture where some cores are more power hungry but faster than other cores.
+When running benchmarks on these phones there can be significant variance
+between different runs of the benchmark. One way to reduce variance between runs
+is to set the [CPU affinity](https://en.wikipedia.org/wiki/Processor_affinity)
+before running the benchmark. On Android this can be done using the `taskset`
+command.
+E.g. for running the benchmark on big cores on Pixel 2 with a single thread one
+can use the following command:
+
+```
+adb shell taskset f0 /data/local/tmp/benchmark_model \
+  --graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \
+  --num_threads=1
+```
+
+where `f0` is the affinity mask for big cores on Pixel 2.
+Note: The affinity mask varies with the device.
+
+## Profiling model operators
+The benchmark model binary also allows you to profile operators and give execution times of each operator. To do this,
+compile the binary with a compiler flag that enables profiling to be compiled in. Pass **--copt=-DTFLITE_PROFILING_ENABLED**
+to compile benchmark with profiling support.
+For example, to compile with profiling support on Android, add this flag to the previous command:
+
+```
+bazel build -c opt \
+  --config=android_arm \
+  --cxxopt='--std=c++11' \
+  --copt=-DTFLITE_PROFILING_ENABLED \
+  tensorflow/lite/tools/benchmark:benchmark_model
+```
+This compiles TFLite with profiling enabled, now you can run the benchmark binary like before. The binary will produce detailed statistics for each operation similar to those shown below:
+
+```
+
+============================== Run Order ==============================
+	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
+	                 CONV_2D	    0.000	    4.269	    4.269	  0.107%	  0.107%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
+	       DEPTHWISE_CONV_2D	    4.270	    2.150	    2.150	  0.054%	  0.161%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6]
+	                 CONV_2D	    6.421	    6.107	    6.107	  0.153%	  0.314%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   12.528	    1.366	    1.366	  0.034%	  0.348%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_depthwise/Relu6]
+	                 CONV_2D	   13.895	    4.195	    4.195	  0.105%	  0.454%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   18.091	    1.260	    1.260	  0.032%	  0.485%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_depthwise/Relu6]
+	                 CONV_2D	   19.352	    6.652	    6.652	  0.167%	  0.652%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   26.005	    0.698	    0.698	  0.018%	  0.670%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_depthwise/Relu6]
+	                 CONV_2D	   26.703	    3.344	    3.344	  0.084%	  0.754%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   30.047	    0.646	    0.646	  0.016%	  0.770%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_depthwise/Relu6]
+	                 CONV_2D	   30.694	    5.800	    5.800	  0.145%	  0.915%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   36.495	    0.331	    0.331	  0.008%	  0.924%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6]
+	                 CONV_2D	   36.826	    2.838	    2.838	  0.071%	  0.995%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   39.665	    0.439	    0.439	  0.011%	  1.006%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6]
+	                 CONV_2D	   40.105	    5.293	    5.293	  0.133%	  1.139%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   45.399	    0.352	    0.352	  0.009%	  1.147%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6]
+	                 CONV_2D	   45.752	    5.322	    5.322	  0.133%	  1.281%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   51.075	    0.357	    0.357	  0.009%	  1.290%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6]
+	                 CONV_2D	   51.432	    5.693	    5.693	  0.143%	  1.433%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   57.126	    0.366	    0.366	  0.009%	  1.442%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_depthwise/Relu6]
+	                 CONV_2D	   57.493	    5.472	    5.472	  0.137%	  1.579%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   62.966	    0.364	    0.364	  0.009%	  1.588%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_depthwise/Relu6]
+	                 CONV_2D	   63.330	    5.404	    5.404	  0.136%	  1.724%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   68.735	    0.155	    0.155	  0.004%	  1.728%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_depthwise/Relu6]
+	                 CONV_2D	   68.891	    2.970	    2.970	  0.074%	  1.802%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   71.862	    0.206	    0.206	  0.005%	  1.807%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_depthwise/Relu6]
+	                 CONV_2D	   72.069	    5.888	    5.888	  0.148%	  1.955%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
+	         AVERAGE_POOL_2D	   77.958	    0.036	    0.036	  0.001%	  1.956%	     0.000	        0	[MobilenetV1/Logits/AvgPool_1a/AvgPool]
+	                 CONV_2D	   77.994	    1.445	    1.445	  0.036%	  1.992%	     0.000	        0	[MobilenetV1/Logits/Conv2d_1c_1x1/BiasAdd]
+	                 RESHAPE	   79.440	    0.002	    0.002	  0.000%	  1.992%	     0.000	        0	[MobilenetV1/Predictions/Reshape]
+	                 SOFTMAX	   79.443	    0.029	    0.029	  0.001%	  1.993%	     0.000	        0	[MobilenetV1/Predictions/Softmax]
+
+============================== Top by Computation Time ==============================
+	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
+	                 CONV_2D	   19.352	    6.652	    6.652	  0.167%	  0.167%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
+	                 CONV_2D	    6.421	    6.107	    6.107	  0.153%	  0.320%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
+	                 CONV_2D	   72.069	    5.888	    5.888	  0.148%	  0.468%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
+	                 CONV_2D	   30.694	    5.800	    5.800	  0.145%	  0.613%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
+	                 CONV_2D	   51.432	    5.693	    5.693	  0.143%	  0.756%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
+	                 CONV_2D	   57.493	    5.472	    5.472	  0.137%	  0.893%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
+	                 CONV_2D	   63.330	    5.404	    5.404	  0.136%	  1.029%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6]
+	                 CONV_2D	   45.752	    5.322	    5.322	  0.133%	  1.162%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
+	                 CONV_2D	   40.105	    5.293	    5.293	  0.133%	  1.295%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
+	                 CONV_2D	    0.000	    4.269	    4.269	  0.107%	  1.402%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
+
+Number of nodes executed: 31
+============================== Summary by node type ==============================
+	             [Node type]	  [count]	  [avg ms]	    [avg %]	    [cdf %]	  [mem KB]	[times called]
+	                 CONV_2D	       15	     1.406	    89.270%	    89.270%	     0.000	        0
+	       DEPTHWISE_CONV_2D	       13	     0.169	    10.730%	   100.000%	     0.000	        0
+	                 SOFTMAX	        1	     0.000	     0.000%	   100.000%	     0.000	        0
+	                 RESHAPE	        1	     0.000	     0.000%	   100.000%	     0.000	        0
+	         AVERAGE_POOL_2D	        1	     0.000	     0.000%	   100.000%	     0.000	        0
+
+Timings (microseconds): count=50 first=79449 curr=81350 min=77385 max=88213 avg=79732 std=1929
+Memory (bytes): count=0
+31 nodes observed
+
+
+Average inference timings in us: Warmup: 83235, Init: 38467, no stats: 79760.9
+```
diff --git a/tensorflow/lite/tools/benchmark/android/AndroidManifest.xml b/tensorflow/lite/tools/benchmark/android/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..7cdca2885ddabe89bc846f3099dc055d471874b3
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/AndroidManifest.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="org.tensorflow.lite.benchmark">
+
+    <!-- Necessary for loading custom models from disk. -->
+    <uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE"/>
+
+    <!-- Target SDK 21 (<23) to avoid the need for requesting storage
+         permissions. This APK will almost always be used from the command-line
+         anyway, and be expicitly installed by the developer. -->
+    <uses-sdk
+        android:minSdkVersion="21"
+        android:targetSdkVersion="21" />
+
+    <application>
+        <!-- This Activity runs the TensorFlow Lite benchmark at creation, using
+             a provided set of arguments, then immediately terminates. -->
+        <activity android:name="org.tensorflow.lite.benchmark.BenchmarkModelActivity"
+                  android:screenOrientation="portrait"
+                  android:label="TFLite Benchmark"
+                  android:theme="@android:style/Theme.NoDisplay"
+                  android:exported="true"
+                  android:noHistory="true" />
+    </application>
+
+</manifest>
diff --git a/tensorflow/lite/tools/benchmark/android/BUILD b/tensorflow/lite/tools/benchmark/android/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a291effddc91d2abd153e9e8422ec7cbf5725c4b
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/BUILD
@@ -0,0 +1,44 @@
+# Description:
+#   BenchmarkModel Android harness for TensorFlow Lite benchmarks.
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/lite:build_def.bzl", "tflite_jni_binary")
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
+
+# See README.md for details about building and executing this benchmark.
+android_binary(
+    name = "benchmark_model",
+    srcs = glob([
+        "src/**/*.java",
+    ]),
+    custom_package = "org.tensorflow.lite.benchmark",
+    manifest = "AndroidManifest.xml",
+    # In some platforms we don't have an Android SDK/NDK and this target
+    # can't be built. We need to prevent the build system from trying to
+    # use the target in that case.
+    tags = ["manual"],
+    deps = [":tensorflowlite_benchmark_native"],
+)
+
+tflite_jni_binary(
+    name = "libtensorflowlite_benchmark.so",
+    srcs = glob([
+        "jni/**/*.cc",
+        "jni/**/*.h",
+    ]),
+    deps = [
+        "//tensorflow/lite/java/jni",
+        "//tensorflow/lite/tools/benchmark:benchmark_tflite_model_lib",
+        "//tensorflow/lite/tools/benchmark:logging",
+    ],
+)
+
+cc_library(
+    name = "tensorflowlite_benchmark_native",
+    srcs = ["libtensorflowlite_benchmark.so"],
+    visibility = ["//visibility:private"],
+)
diff --git a/tensorflow/lite/tools/benchmark/android/README.md b/tensorflow/lite/tools/benchmark/android/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f5b67e3f79aa669c5424d46c23f053213ad3a101
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/README.md
@@ -0,0 +1,65 @@
+# TFLite Android Model Benchmark Tool
+
+## Description
+
+This Android benchmark app is a simple wrapper around the TensorFlow Lite
+[command-line benchmark utility](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark).
+
+Pushing and executing binaries directly on Android is a valid approach to
+benchmarking, but it can result in subtle (but observable) differences in
+performance relative to execution within an actual Android app. In particular,
+Android's scheduler tailors behavior based on thread and process priorities,
+which differ between a foreground Activity/Application and a regular background
+binary executed via `adb shell ...`. This tailored behavior is most evident when
+enabling multi-threaded CPU execution with TensorFlow Lite.
+
+To that end, this app offers perhaps a more faithful view of runtime performance
+that developers can expected when deploying TensorFlow Lite with their
+application.
+
+## To build/install/run
+
+(0) Refer to
+https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android
+to edit the `WORKSPACE` to configure the android NDK/SDK.
+
+(1) Build for your specific platform, e.g.:
+
+```
+bazel build -c opt \
+  --config=android_arm64 \
+  --cxxopt='--std=c++11' \
+  tensorflow/lite/tools/benchmark/android:benchmark_model
+```
+
+(2) Connect your phone. Install the benchmark APK to your phone with adb:
+
+```
+adb install -r -d bazel-bin/tensorflow/lite/tools/benchmark/android/benchmark_model.apk
+```
+
+(3) Push the compute graph that you need to test.
+
+```
+adb push mobilenet_quant_v1_224.tflite /data/local/tmp
+```
+
+(4) Run the benchmark. Additional command-line flags are documented
+[here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/README.md)
+and can be appended to the `args` string alongside the required `--graph` flag
+(note that all args must be nested in the single quoted string that follows the
+args key).
+
+```
+adb shell am start -S -n
+  org.tensorflow.lite.benchmark/org.tensorflow.lite.benchmark.BenchmarkModelActivity \
+  --es args '"--graph=/data/local/tmp/mobilenet_quant_v1_224.tflite --num_threads=4"'
+```
+
+(5) The results will be available in Android's logcat, e.g.:
+
+```
+adb logcat | grep "Average inference"
+
+... tflite  : Average inference timings in us: Warmup: 91471, Init: 4108, Inference: 80660.1
+```
diff --git a/tensorflow/lite/tools/benchmark/android/jni/benchmark_model_jni.cc b/tensorflow/lite/tools/benchmark/android/jni/benchmark_model_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee67bdafb0d3dd84ca1eaba8062e385887f3eb74
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/jni/benchmark_model_jni.cc
@@ -0,0 +1,92 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <jni.h>
+
+#include <sstream>
+#include <string>
+
+#include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/lite/tools/benchmark/logging.h"
+
+#ifdef __ANDROID__
+#include <android/log.h>
+#endif
+
+namespace tflite {
+namespace benchmark {
+namespace {
+
+class AndroidBenchmarkLoggingListener : public BenchmarkListener {
+  void OnBenchmarkEnd(const BenchmarkResults& results) override {
+    auto inference_us = results.inference_time_us();
+    auto init_us = results.startup_latency_us();
+    auto warmup_us = results.warmup_time_us();
+    std::stringstream results_output;
+    results_output << "Average inference timings in us: "
+                   << "Warmup: " << warmup_us.avg() << ", "
+                   << "Init: " << init_us << ", "
+                   << "Inference: " << inference_us.avg();
+#ifdef __ANDROID__
+    __android_log_print(ANDROID_LOG_ERROR, "tflite", "%s",
+                        results_output.str().c_str());
+#else
+    fprintf(stderr, "%s", results_output.str().c_str());
+#endif
+  }
+};
+
+void Run(int argc, char** argv) {
+  BenchmarkTfLiteModel benchmark;
+  AndroidBenchmarkLoggingListener listener;
+  benchmark.AddListener(&listener);
+  benchmark.Run(argc, argv);
+}
+
+}  // namespace
+}  // namespace benchmark
+}  // namespace tflite
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_benchmark_BenchmarkModel_nativeRun(JNIEnv* env,
+                                                            jclass clazz,
+                                                            jstring args_obj) {
+  const char* args_chars = env->GetStringUTFChars(args_obj, nullptr);
+
+  // Split the args string into individual arg tokens.
+  std::istringstream iss(args_chars);
+  std::vector<std::string> args_split{std::istream_iterator<std::string>(iss),
+                                      {}};
+
+  // Construct a fake argv command-line object for the benchmark.
+  std::vector<char*> argv;
+  std::string arg0 = "(BenchmarkModelAndroid)";
+  argv.push_back(const_cast<char*>(arg0.data()));
+  for (auto& arg : args_split) {
+    argv.push_back(const_cast<char*>(arg.data()));
+  }
+
+  tflite::benchmark::Run(static_cast<int>(argv.size()), argv.data());
+
+  env->ReleaseStringUTFChars(args_obj, args_chars);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModel.java b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModel.java
new file mode 100644
index 0000000000000000000000000000000000000000..a6cf8d78d5703300b3576ab3221326a2335e602e
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModel.java
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite.benchmark;
+
+/** Helper class for running a native TensorFlow Lite benchmark. */
+class BenchmarkModel {
+  static {
+    System.loadLibrary("tensorflowlite_benchmark");
+  }
+
+  // Executes a standard TensorFlow Lite benchmark according to the provided args.
+  //
+  // Note that {@code args} will be split by the native execution code.
+  public static void run(String args) {
+    nativeRun(args);
+  }
+
+  private static native void nativeRun(String args);
+}
diff --git a/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModelActivity.java b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModelActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..12410adf3d6687ffa514c6ba21981fb19286fe62
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModelActivity.java
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite.benchmark;
+
+import android.app.Activity;
+import android.content.Intent;
+import android.os.Bundle;
+import android.util.Log;
+
+/** Main {@code Activity} class for the benchmark app. */
+public class BenchmarkModelActivity extends Activity {
+
+  private static final String TAG = "tflite_BenchmarkModelActivity";
+
+  private static final String ARGS_INTENT_KEY_0 = "args";
+  private static final String ARGS_INTENT_KEY_1 = "--args";
+
+  @Override
+  public void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+
+    Intent intent = getIntent();
+    Bundle bundle = intent.getExtras();
+    String args = bundle.getString(ARGS_INTENT_KEY_0, bundle.getString(ARGS_INTENT_KEY_1));
+    Log.i(TAG, "Running TensorFlow Lite benchmark with args: " + args);
+
+    BenchmarkModel.run(args);
+
+    finish();
+  }
+}
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_main.cc b/tensorflow/lite/tools/benchmark/benchmark_main.cc
similarity index 89%
rename from tensorflow/contrib/lite/tools/benchmark/benchmark_main.cc
rename to tensorflow/lite/tools/benchmark/benchmark_main.cc
index 372d31e838e5666df492ee3156022249a2d97691..dcf82a8b7ec348f118a546f6dd8b640184b8abef 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_main.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_main.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h"
-#include "tensorflow/contrib/lite/tools/benchmark/logging.h"
+#include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/lite/tools/benchmark/logging.h"
 
 namespace tflite {
 namespace benchmark {
diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.cc b/tensorflow/lite/tools/benchmark/benchmark_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9b485efcaa81b011c598d5dfa39d4f253090dc8
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.cc
@@ -0,0 +1,195 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/tools/benchmark/benchmark_model.h"
+
+#include <time.h>
+
+#include <iostream>
+#include <sstream>
+
+#include "tensorflow/lite/profiling/time.h"
+#include "tensorflow/lite/tools/benchmark/logging.h"
+
+namespace {
+void SleepForSeconds(double sleep_seconds) {
+  if (sleep_seconds <= 0.0) {
+    return;
+  }
+  // Convert the run_delay string into a timespec.
+  timespec req;
+  req.tv_sec = static_cast<time_t>(sleep_seconds);
+  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
+  // If requested, sleep between runs for an arbitrary amount of time.
+  // This can be helpful to determine the effect of mobile processor
+  // scaling and thermal throttling.
+#ifdef PLATFORM_WINDOWS
+  Sleep(sleep_seconds * 1000);
+#else
+  nanosleep(&req, nullptr);
+#endif
+}
+
+}  // namespace
+
+namespace tflite {
+namespace benchmark {
+using tensorflow::Stat;
+
+BenchmarkParams BenchmarkModel::DefaultParams() {
+  BenchmarkParams params;
+  params.AddParam("num_runs", BenchmarkParam::Create<int32_t>(50));
+  params.AddParam("min_secs", BenchmarkParam::Create<float>(1.0f));
+  params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
+  params.AddParam("num_threads", BenchmarkParam::Create<int32_t>(1));
+  params.AddParam("benchmark_name", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("output_prefix", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("warmup_runs", BenchmarkParam::Create<int32_t>(1));
+  params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f));
+  return params;
+}
+
+BenchmarkModel::BenchmarkModel() : params_(DefaultParams()) {}
+
+void BenchmarkLoggingListener::OnBenchmarkEnd(const BenchmarkResults &results) {
+  auto inference_us = results.inference_time_us();
+  auto init_us = results.startup_latency_us();
+  auto warmup_us = results.warmup_time_us();
+  TFLITE_LOG(INFO) << "Average inference timings in us: "
+                   << "Warmup: " << warmup_us.avg() << ", "
+                   << "Init: " << init_us << ", "
+                   << "no stats: " << inference_us.avg();
+}
+
+std::vector<Flag> BenchmarkModel::GetFlags() {
+  return {
+      CreateFlag<int32_t>("num_runs", &params_,
+                          "minimum number of runs, see also min_secs"),
+      CreateFlag<float>(
+          "min_secs", &params_,
+          "minimum number of seconds to rerun for, potentially making the "
+          "actual number of runs to be greater than num_runs"),
+      CreateFlag<float>("run_delay", &params_, "delay between runs in seconds"),
+      CreateFlag<int32_t>("num_threads", &params_, "number of threads"),
+      CreateFlag<std::string>("benchmark_name", &params_, "benchmark name"),
+      CreateFlag<std::string>("output_prefix", &params_,
+                              "benchmark output prefix"),
+      CreateFlag<int32_t>(
+          "warmup_runs", &params_,
+          "minimum number of runs performed on initialization, to "
+          "allow performance characteristics to settle, see also "
+          "warmup_min_secs"),
+      CreateFlag<float>(
+          "warmup_min_secs", &params_,
+          "minimum number of seconds to rerun for, potentially making the "
+          "actual number of warm-up runs to be greater than warmup_runs"),
+  };
+}
+
+void BenchmarkModel::LogParams() {
+  TFLITE_LOG(INFO) << "Min num runs: [" << params_.Get<int32_t>("num_runs")
+                   << "]";
+  TFLITE_LOG(INFO) << "Min runs duration (seconds): ["
+                   << params_.Get<float>("min_secs") << "]";
+  TFLITE_LOG(INFO) << "Inter-run delay (seconds): ["
+                   << params_.Get<float>("run_delay") << "]";
+  TFLITE_LOG(INFO) << "Num threads: [" << params_.Get<int32_t>("num_threads")
+                   << "]";
+  TFLITE_LOG(INFO) << "Benchmark name: ["
+                   << params_.Get<std::string>("benchmark_name") << "]";
+  TFLITE_LOG(INFO) << "Output prefix: ["
+                   << params_.Get<std::string>("output_prefix") << "]";
+  TFLITE_LOG(INFO) << "Min warmup runs: ["
+                   << params_.Get<int32_t>("warmup_runs") << "]";
+  TFLITE_LOG(INFO) << "Min warmup runs duration (seconds): ["
+                   << params_.Get<float>("warmup_min_secs") << "]";
+}
+
+void BenchmarkModel::PrepareInputsAndOutputs() {}
+
+Stat<int64_t> BenchmarkModel::Run(int min_num_times, float min_secs,
+                                  RunType run_type) {
+  Stat<int64_t> run_stats;
+  TFLITE_LOG(INFO) << "Running benchmark for at least " << min_num_times
+                   << " iterations and at least " << min_secs << " seconds";
+  int64_t min_finish_us =
+      profiling::time::NowMicros() + static_cast<int64_t>(min_secs * 1.e6f);
+  for (int run = 0;
+       run < min_num_times || profiling::time::NowMicros() < min_finish_us;
+       run++) {
+    PrepareInputsAndOutputs();
+    listeners_.OnSingleRunStart(run_type);
+    int64_t start_us = profiling::time::NowMicros();
+    RunImpl();
+    int64_t end_us = profiling::time::NowMicros();
+    listeners_.OnSingleRunEnd();
+
+    run_stats.UpdateStat(end_us - start_us);
+    SleepForSeconds(params_.Get<float>("run_delay"));
+  }
+
+  std::stringstream stream;
+  run_stats.OutputToStream(&stream);
+  TFLITE_LOG(INFO) << stream.str() << std::endl;
+
+  return run_stats;
+}
+
+bool BenchmarkModel::ValidateParams() { return true; }
+
+void BenchmarkModel::Run(int argc, char **argv) {
+  if (!ParseFlags(argc, argv)) {
+    return;
+  }
+  Run();
+}
+
+void BenchmarkModel::Run() {
+  ValidateParams();
+  LogParams();
+
+  listeners_.OnBenchmarkStart(params_);
+  int64_t initialization_start_us = profiling::time::NowMicros();
+  Init();
+  int64_t initialization_end_us = profiling::time::NowMicros();
+  int64_t startup_latency_us = initialization_end_us - initialization_start_us;
+  TFLITE_LOG(INFO) << "Initialized session in " << startup_latency_us / 1e3
+                   << "ms";
+
+  uint64_t input_bytes = ComputeInputBytes();
+  Stat<int64_t> warmup_time_us =
+      Run(params_.Get<int32_t>("warmup_runs"),
+          params_.Get<float>("warmup_min_secs"), WARMUP);
+  Stat<int64_t> inference_time_us =
+      Run(params_.Get<int32_t>("num_runs"), params_.Get<float>("min_secs"),
+          REGULAR);
+  listeners_.OnBenchmarkEnd(
+      {startup_latency_us, input_bytes, warmup_time_us, inference_time_us});
+}
+
+bool BenchmarkModel::ParseFlags(int argc, char **argv) {
+  auto flag_list = GetFlags();
+  const bool parse_result =
+      Flags::Parse(&argc, const_cast<const char **>(argv), flag_list);
+  if (!parse_result) {
+    std::string usage = Flags::Usage(argv[0], flag_list);
+    TFLITE_LOG(ERROR) << usage;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace benchmark
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h b/tensorflow/lite/tools/benchmark/benchmark_model.h
similarity index 91%
rename from tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
rename to tensorflow/lite/tools/benchmark/benchmark_model.h
index cc215a7b7f08a959ca732773a54efdf928c1fc2e..31ee5c92aa3f1ffb53f1a39fbc6e1344d92a260c 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_MODEL_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_MODEL_H_
+#ifndef TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_MODEL_H_
+#define TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_MODEL_H_
 
 #include <cmath>
 #include <limits>
@@ -23,8 +23,8 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/contrib/lite/tools/benchmark/benchmark_params.h"
-#include "tensorflow/contrib/lite/tools/benchmark/command_line_flags.h"
+#include "tensorflow/lite/tools/benchmark/benchmark_params.h"
+#include "tensorflow/lite/tools/benchmark/command_line_flags.h"
 #include "tensorflow/core/util/stats_calculator.h"
 
 namespace tflite {
@@ -150,7 +150,8 @@ class BenchmarkModel {
   bool ParseFlags(int argc, char** argv);
   virtual std::vector<Flag> GetFlags();
   virtual uint64_t ComputeInputBytes() = 0;
-  virtual tensorflow::Stat<int64_t> Run(int num_times, RunType run_type);
+  virtual tensorflow::Stat<int64_t> Run(int min_num_times, float min_secs,
+                                        RunType run_type);
   virtual void PrepareInputsAndOutputs();
   virtual void RunImpl() = 0;
   BenchmarkParams params_;
@@ -160,4 +161,4 @@ class BenchmarkModel {
 }  // namespace benchmark
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_MODEL_H_
+#endif  // TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_MODEL_H_
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_params.cc b/tensorflow/lite/tools/benchmark/benchmark_params.cc
similarity index 92%
rename from tensorflow/contrib/lite/tools/benchmark/benchmark_params.cc
rename to tensorflow/lite/tools/benchmark/benchmark_params.cc
index 1dcf580a9d4995e6cb3706d3562bc8a2f4670082..5ab3adff553674170563dbdbce17a6759582b5da 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_params.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_params.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/benchmark/benchmark_params.h"
+#include "tensorflow/lite/tools/benchmark/benchmark_params.h"
 
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/tools/benchmark/logging.h"
+#include "tensorflow/lite/tools/benchmark/logging.h"
 
 namespace tflite {
 namespace benchmark {
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_params.h b/tensorflow/lite/tools/benchmark/benchmark_params.h
similarity index 90%
rename from tensorflow/contrib/lite/tools/benchmark/benchmark_params.h
rename to tensorflow/lite/tools/benchmark/benchmark_params.h
index c98f47bb0d89864dff54d7cdebe764e56e4cfda2..594baa5b4ec1ecb3e721689c64698dea8e2ab5b4 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_params.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_params.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_PARAMS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_PARAMS_H_
+#ifndef TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_PARAMS_H_
+#define TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_PARAMS_H_
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/lite/tools/benchmark/logging.h"
+#include "tensorflow/lite/tools/benchmark/logging.h"
 
 namespace tflite {
 namespace benchmark {
@@ -98,4 +98,4 @@ class BenchmarkParams {
 
 }  // namespace benchmark
 }  // namespace tflite
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_PARAMS_H_
+#endif  // TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_PARAMS_H_
diff --git a/tensorflow/lite/tools/benchmark/benchmark_plus_flex_main.cc b/tensorflow/lite/tools/benchmark/benchmark_plus_flex_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e72a293770afd5f47220df15152a69ba7d3a559
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/benchmark_plus_flex_main.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/testing/init_tensorflow.h"
+#include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/lite/tools/benchmark/logging.h"
+
+namespace tflite {
+namespace benchmark {
+
+int Main(int argc, char** argv) {
+  ::tflite::InitTensorFlow();
+#ifdef TFLITE_CUSTOM_OPS_HEADER
+  TFLITE_LOG(INFO) << "STARTING with custom ops!";
+#else
+  TFLITE_LOG(INFO) << "STARTING!";
+#endif
+  BenchmarkTfLiteModel benchmark;
+  BenchmarkLoggingListener listener;
+  benchmark.AddListener(&listener);
+  benchmark.Run(argc, argv);
+  return 0;
+}
+}  // namespace benchmark
+}  // namespace tflite
+
+int main(int argc, char** argv) { return tflite::benchmark::Main(argc, argv); }
diff --git a/tensorflow/lite/tools/benchmark/benchmark_test.cc b/tensorflow/lite/tools/benchmark/benchmark_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4f830122f65bcacb0eae4783998cf8bb5611fb9
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/benchmark_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/lite/tools/benchmark/command_line_flags.h"
+
+namespace {
+const std::string* g_model_path = nullptr;
+}
+
+namespace tflite {
+namespace benchmark {
+namespace {
+
+BenchmarkParams CreateParams() {
+  BenchmarkParams params;
+  params.AddParam("num_runs", BenchmarkParam::Create<int32_t>(2));
+  params.AddParam("min_secs", BenchmarkParam::Create<float>(1.0f));
+  params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
+  params.AddParam("num_threads", BenchmarkParam::Create<int32_t>(1));
+  params.AddParam("benchmark_name", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("output_prefix", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("warmup_runs", BenchmarkParam::Create<int32_t>(1));
+  params.AddParam("graph", BenchmarkParam::Create<std::string>(*g_model_path));
+  params.AddParam("input_layer", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("input_layer_shape", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("use_nnapi", BenchmarkParam::Create<bool>(false));
+  params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f));
+  return params;
+}
+
+class TestBenchmark : public BenchmarkTfLiteModel {
+ public:
+  explicit TestBenchmark(BenchmarkParams params)
+      : BenchmarkTfLiteModel(std::move(params)) {}
+  const tflite::Interpreter* GetInterpreter() { return interpreter.get(); }
+
+  void Prepare() { PrepareInputsAndOutputs(); }
+};
+
+TEST(BenchmarkTest, DoesntCrash) {
+  ASSERT_THAT(g_model_path, testing::NotNull());
+
+  BenchmarkTfLiteModel benchmark(CreateParams());
+  benchmark.Run();
+}
+
+TEST(BenchmarkTest, ParametersArePopulatedWhenInputShapeIsNotSpecified) {
+  ASSERT_THAT(g_model_path, testing::NotNull());
+
+  TestBenchmark benchmark(CreateParams());
+  benchmark.Init();
+  benchmark.Prepare();
+
+  auto interpreter = benchmark.GetInterpreter();
+  auto inputs = interpreter->inputs();
+  ASSERT_GE(inputs.size(), 1);
+  auto input_tensor = interpreter->tensor(inputs[0]);
+
+  std::vector<char> input_bytes;
+  input_bytes.reserve(input_tensor->bytes);
+  for (size_t i = 0; i < input_tensor->bytes; i++) {
+    input_bytes.push_back(input_tensor->data.raw_const[i]);
+  }
+  benchmark.Prepare();
+
+  // Expect data is not the same.
+  EXPECT_EQ(input_bytes.size(), input_tensor->bytes);
+  bool is_same = true;
+  for (size_t i = 0; i < input_tensor->bytes; i++) {
+    if (input_bytes[i] != input_tensor->data.raw_const[i]) {
+      is_same = false;
+      break;
+    }
+  }
+  EXPECT_FALSE(is_same);
+}
+
+}  // namespace
+}  // namespace benchmark
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  std::string model_path;
+  std::vector<tflite::Flag> flags = {
+      tflite::Flag::CreateFlag("graph", &model_path, "Path to model file.")};
+  g_model_path = &model_path;
+  const bool parse_result =
+      tflite::Flags::Parse(&argc, const_cast<const char**>(argv), flags);
+  if (!parse_result) {
+    std::cerr << tflite::Flags::Usage(argv[0], flags);
+    return 1;
+  }
+
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
similarity index 87%
rename from tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
rename to tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index 2a3df7f289ffb7ac1ad3ebc2d6fc42ecf67e2be9..32cf4e4292a57ebb73abfaeb3d73d5c1e5717f43 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
 
 #include <cstdarg>
 #include <cstdlib>
@@ -23,11 +23,15 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/op_resolver.h"
-#include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/tools/benchmark/logging.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/tools/benchmark/logging.h"
+
+#ifdef GEMMLOWP_PROFILING
+#include "gemmlowp/profiling/profiler.h"
+#endif
 
 #ifdef TFLITE_CUSTOM_OPS_HEADER
 void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
@@ -62,6 +66,21 @@ void ProfilingListener::OnSingleRunEnd() {
   summarizer_.ProcessProfiles(profile_events, *interpreter_);
 }
 
+void GemmlowpProfilingListener::OnBenchmarkStart(
+    const BenchmarkParams& params) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::RegisterCurrentThreadForProfiling();
+  gemmlowp::StartProfiling();
+#endif
+}
+
+void GemmlowpProfilingListener::OnBenchmarkEnd(
+    const BenchmarkResults& results) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::FinishProfiling();
+#endif
+}
+
 namespace {
 
 std::vector<std::string> Split(const std::string& str, const char delim) {
@@ -162,7 +181,18 @@ bool PopulateInputLayerInfo(
   return true;
 }
 
-BenchmarkParams GetDefaultParams() {
+std::vector<int> TfLiteIntArrayToVector(const TfLiteIntArray* int_array) {
+  std::vector<int> values;
+  values.reserve(int_array->size);
+  for (size_t i = 0; i < int_array->size; i++) {
+    values.push_back(int_array->data[i]);
+  }
+  return values;
+}
+
+}  // namespace
+
+BenchmarkParams BenchmarkTfLiteModel::DefaultParams() {
   BenchmarkParams default_params = BenchmarkModel::DefaultParams();
   default_params.AddParam("graph", BenchmarkParam::Create<std::string>(""));
   default_params.AddParam("input_layer",
@@ -173,16 +203,13 @@ BenchmarkParams GetDefaultParams() {
   return default_params;
 }
 
-}  // namespace
-
 BenchmarkTfLiteModel::BenchmarkTfLiteModel()
-    : BenchmarkModel(GetDefaultParams()) {
-  AddListener(&profiling_listener_);
-}
+    : BenchmarkTfLiteModel(DefaultParams()) {}
 
 BenchmarkTfLiteModel::BenchmarkTfLiteModel(BenchmarkParams params)
     : BenchmarkModel(std::move(params)) {
   AddListener(&profiling_listener_);
+  AddListener(&gemmlowp_profiling_listener_);
 }
 
 std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
@@ -232,12 +259,10 @@ uint64_t BenchmarkTfLiteModel::ComputeInputBytes() {
 void BenchmarkTfLiteModel::PrepareInputsAndOutputs() {
   auto interpreter_inputs = interpreter->inputs();
   // Set the values of the input tensors.
-  for (int j = 0; j < inputs.size(); ++j) {
-    const InputLayerInfo& input = inputs[j];
+  for (int j = 0; j < interpreter_inputs.size(); ++j) {
     int i = interpreter_inputs[j];
     TfLiteTensor* t = interpreter->tensor(i);
-    std::vector<int> sizes = input.shape;
-
+    std::vector<int> sizes = TfLiteIntArrayToVector(t->dims);
     // TODO(ahentz): below we ignore the O-th dimension (number of batches).
     if (t->type == kTfLiteFloat32) {
       FillRandomValue<float>(
@@ -256,12 +281,17 @@ void BenchmarkTfLiteModel::PrepareInputsAndOutputs() {
           interpreter->typed_tensor<uint8_t>(i),
           std::vector<int>(sizes.begin() + 1, sizes.end()),
           []() { return static_cast<uint8_t>(rand()) % 255; });
+    } else if (t->type == kTfLiteInt8) {
+      FillRandomValue<int8_t>(
+          interpreter->typed_tensor<int8_t>(i),
+          std::vector<int>(sizes.begin() + 1, sizes.end()),
+          []() { return static_cast<int8_t>(rand()) % 255 - 127; });
     } else if (t->type == kTfLiteString) {
       tflite::DynamicBuffer buffer;
       FillRandomString(&buffer, sizes, []() {
         return "we're have some friends over saturday to hang out in the yard";
       });
-      buffer.WriteToTensor(interpreter->tensor(i));
+      buffer.WriteToTensor(interpreter->tensor(i), /*new_shape=*/nullptr);
     } else {
       TFLITE_LOG(FATAL) << "Don't know how to populate tensor " << t->name
                         << " of type " << t->type;
@@ -301,6 +331,7 @@ void BenchmarkTfLiteModel::Init() {
   bool use_nnapi = params_.Get<bool>("use_nnapi");
 
   interpreter->UseNNAPI(use_nnapi);
+  ApplyDelegates();
 
   auto interpreter_inputs = interpreter->inputs();
 
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
new file mode 100644
index 0000000000000000000000000000000000000000..83599e644d1f41f70fd96f3a73f9155d6e62deef
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_TFLITE_MODEL_H_
+#define TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_TFLITE_MODEL_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/profiling/profile_summarizer.h"
+#include "tensorflow/lite/tools/benchmark/benchmark_model.h"
+
+namespace tflite {
+namespace benchmark {
+
+// Dumps profiling events if profiling is enabled.
+class ProfilingListener : public BenchmarkListener {
+ public:
+  explicit ProfilingListener() : interpreter_(nullptr), has_profiles_(false) {}
+
+  void SetInterpreter(Interpreter* interpreter);
+
+  void OnSingleRunStart(RunType run_type) override;
+
+  void OnSingleRunEnd() override;
+
+  void OnBenchmarkEnd(const BenchmarkResults& results) override;
+
+ private:
+  Interpreter* interpreter_;
+  profiling::Profiler profiler_;
+  profiling::ProfileSummarizer summarizer_;
+  bool has_profiles_;
+};
+
+// Dumps gemmlowp profiling events if gemmlowp profiling is enabled.
+class GemmlowpProfilingListener : public BenchmarkListener {
+ public:
+  virtual ~GemmlowpProfilingListener() {}
+
+  void OnBenchmarkStart(const BenchmarkParams& params) override;
+
+  void OnBenchmarkEnd(const BenchmarkResults& results) override;
+};
+
+// Benchmarks a TFLite model by running tflite interpreter.
+class BenchmarkTfLiteModel : public BenchmarkModel {
+ public:
+  BenchmarkTfLiteModel();
+  explicit BenchmarkTfLiteModel(BenchmarkParams params);
+  virtual ~BenchmarkTfLiteModel() {}
+
+  std::vector<Flag> GetFlags() override;
+  void LogParams() override;
+  bool ValidateParams() override;
+  uint64_t ComputeInputBytes() override;
+  void Init() override;
+  void RunImpl() override;
+
+  struct InputLayerInfo {
+    std::string name;
+    std::vector<int> shape;
+  };
+
+ protected:
+  static BenchmarkParams DefaultParams();
+  void PrepareInputsAndOutputs() override;
+
+  // Allows installation of custom delegates during initialization
+  virtual void ApplyDelegates() {}
+
+  std::unique_ptr<tflite::FlatBufferModel> model;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+
+ private:
+  std::vector<InputLayerInfo> inputs;
+  ProfilingListener profiling_listener_;
+  GemmlowpProfilingListener gemmlowp_profiling_listener_;
+};
+
+}  // namespace benchmark
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_TFLITE_MODEL_H_
diff --git a/tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc b/tensorflow/lite/tools/benchmark/command_line_flags.cc
similarity index 96%
rename from tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc
rename to tensorflow/lite/tools/benchmark/command_line_flags.cc
index ff818b9dcb5ee0b58b95c3dceae74083dbd4f0da..2fad780dc8680b5ac6c1d2d77739e495116aa990 100644
--- a/tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc
+++ b/tensorflow/lite/tools/benchmark/command_line_flags.cc
@@ -10,7 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/benchmark/command_line_flags.h"
+#include "tensorflow/lite/tools/benchmark/command_line_flags.h"
 
 #include <cstring>
 #include <sstream>
@@ -59,11 +59,12 @@ bool ParseFlag(const std::string& flag_value,
 
 bool ParseBoolFlag(const std::string& flag_value,
                    const std::function<void(const bool&)>& hook) {
-  if (flag_value != "true" && flag_value != "false") {
+  if (flag_value != "true" && flag_value != "false" && flag_value != "0" &&
+      flag_value != "1") {
     return false;
   }
 
-  hook(flag_value == "true");
+  hook(flag_value == "true" || flag_value == "1");
   return true;
 }
 }  // namespace
diff --git a/tensorflow/contrib/lite/tools/benchmark/command_line_flags.h b/tensorflow/lite/tools/benchmark/command_line_flags.h
similarity index 95%
rename from tensorflow/contrib/lite/tools/benchmark/command_line_flags.h
rename to tensorflow/lite/tools/benchmark/command_line_flags.h
index 6a0affd83449350d6268fc845aa0997f14809525..cc71450053ee8d9006e4c3ab06cfd9a5163e6bee 100644
--- a/tensorflow/contrib/lite/tools/benchmark/command_line_flags.h
+++ b/tensorflow/lite/tools/benchmark/command_line_flags.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_COMMAND_LINE_FLAGS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_COMMAND_LINE_FLAGS_H_
+#ifndef TENSORFLOW_LITE_TOOLS_BENCHMARK_COMMAND_LINE_FLAGS_H_
+#define TENSORFLOW_LITE_TOOLS_BENCHMARK_COMMAND_LINE_FLAGS_H_
 
 #include <functional>
 #include <string>
@@ -120,4 +120,4 @@ class Flags {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_COMMAND_LINE_FLAGS_H_
+#endif  // TENSORFLOW_LITE_TOOLS_BENCHMARK_COMMAND_LINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc b/tensorflow/lite/tools/benchmark/command_line_flags_test.cc
similarity index 90%
rename from tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
rename to tensorflow/lite/tools/benchmark/command_line_flags_test.cc
index 03da8051099899241fa5241374d754adb1aa93c6..afdf2793bf9db61636941e8415934e312d58ed07 100644
--- a/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
+++ b/tensorflow/lite/tools/benchmark/command_line_flags_test.cc
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/benchmark/command_line_flags.h"
+#include "tensorflow/lite/tools/benchmark/command_line_flags.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace {
@@ -27,13 +27,17 @@ TEST(CommandLineFlagsTest, BasicUsage) {
   bool some_switch = false;
   std::string some_name = "something_a";
   float some_float = -23.23f;
+  bool some_bool = false;
+  bool some_numeric_bool = true;
   const char* argv_strings[] = {"program_name",
                                 "--some_int32=20",
                                 "--some_int64=214748364700",
                                 "--some_switch=true",
                                 "--some_name=somethingelse",
-                                "--some_float=42.0"};
-  int argc = 6;
+                                "--some_float=42.0",
+                                "--some_bool=true",
+                                "--some_numeric_bool=0"};
+  int argc = 8;
   bool parsed_ok = Flags::Parse(
       &argc, reinterpret_cast<const char**>(argv_strings),
       {
@@ -42,6 +46,9 @@ TEST(CommandLineFlagsTest, BasicUsage) {
           Flag::CreateFlag("some_switch", &some_switch, "some switch"),
           Flag::CreateFlag("some_name", &some_name, "some name"),
           Flag::CreateFlag("some_float", &some_float, "some float"),
+          Flag::CreateFlag("some_bool", &some_bool, "some bool"),
+          Flag::CreateFlag("some_numeric_bool", &some_numeric_bool,
+                           "some numeric bool"),
       });
 
   EXPECT_EQ(true, parsed_ok);
@@ -50,6 +57,8 @@ TEST(CommandLineFlagsTest, BasicUsage) {
   EXPECT_EQ(true, some_switch);
   EXPECT_EQ("somethingelse", some_name);
   EXPECT_NEAR(42.0f, some_float, 1e-5f);
+  EXPECT_TRUE(some_bool);
+  EXPECT_FALSE(some_numeric_bool);
   EXPECT_EQ(argc, 1);
 }
 
diff --git a/tensorflow/lite/tools/benchmark/ios/README.md b/tensorflow/lite/tools/benchmark/ios/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fed9e7ea7e8633e00413118fa3e9e4f12d5188a4
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/ios/README.md
@@ -0,0 +1,54 @@
+# TFLite iOS benchmark app.
+
+## Description
+
+An iOS app to benchmark TFLite models.
+
+The app reads benchmark parameters from a JSON file named `benchmark_params.json`
+in its `benchmark_data` directory. Any downloaded models for benchmarking should
+also be placed in `benchmark_data` directory.
+
+The JSON file specifies the name of the model file and other benchmarking
+parameters like inputs to the model, type of inputs, number of iterations,
+number of threads. The default values in the JSON file are for the
+Mobilenet_1.0_224 model
+([paper](https://arxiv.org/pdf/1704.04861.pdf),
+[tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz))
+
+## To build/install/run
+
+- Follow instructions at
+[iOS build for TFLite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/ios.md)
+to build TFLite.
+
+Running
+
+```bash
+tensorflow/lite/build_ios_universal_lib.sh
+```
+will also build `tensorflow/lite/gen/lib/benchmark-lib.a` .
+
+- Now copy the downloaded model file to `benchmark_data` directory. 
+
+- Modify `benchmark_params.json` change the `input_layer`, `input_layer_shape`
+and other benchmark parameters.
+
+- Change `Build Phases -> Copy Bundle Resources` and add the model file to the
+resources that need to be copied.
+
+- Ensure that `Build Phases -> Link Binary With Library` contains the 
+`Accelerate framework` and `tensorflow/lite/gen/lib/benchmark-lib.a`.
+
+- Now try running the app. The app has a single button that runs the benchmark
+  on the model and displays results in a text view below.
+
+## Profiling
+
+If you want detailed profiling, use the following command:
+
+```bash
+tensorflow/lite/build_ios_universal_lib.sh -p
+```
+
+Then following the same steps above and run the benchmark app. You will see the
+detailed profiling results in the outputs.
diff --git a/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..a5f5bfbbdafc7c11a1340dc26cc2b29d525cca7a
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj
@@ -0,0 +1,381 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 50;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		6FE7579A20D59CE500F01636 /* benchmark_params.json in Resources */ = {isa = PBXBuildFile; fileRef = 6FE7579920D59CE500F01636 /* benchmark_params.json */; };
+		6FE7579D20D5A5E000F01636 /* benchmark-lib.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 6FE7579C20D5A5E000F01636 /* benchmark-lib.a */; };
+		6FE7579F20D5A6A700F01636 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 6FE7579E20D5A6A700F01636 /* Accelerate.framework */; };
+		6FE757A120D5AB8100F01636 /* mobilenet_v1_1.0_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = 6FE757A020D5AB8000F01636 /* mobilenet_v1_1.0_224.tflite */; };
+		6FE93FFD20D592D8008C9FE4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 6FE93FFC20D592D8008C9FE4 /* AppDelegate.m */; };
+		6FE9400020D592D8008C9FE4 /* BenchmarkViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 6FE93FFF20D592D8008C9FE4 /* BenchmarkViewController.mm */; };
+		6FE9400320D592D8008C9FE4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 6FE9400120D592D8008C9FE4 /* Main.storyboard */; };
+		6FE9400520D592DA008C9FE4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 6FE9400420D592DA008C9FE4 /* Assets.xcassets */; };
+		6FE9400B20D592DA008C9FE4 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 6FE9400A20D592DA008C9FE4 /* main.m */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		6FE7579920D59CE500F01636 /* benchmark_params.json */ = {isa = PBXFileReference; lastKnownFileType = text.json; path = benchmark_params.json; sourceTree = "<group>"; };
+		6FE7579C20D5A5E000F01636 /* benchmark-lib.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "benchmark-lib.a"; path = "$SRCROOT/../../../../../../tensorflow/lite/tools/make/gen/lib/benchmark-lib.a"; sourceTree = "<group>"; };
+		6FE7579E20D5A6A700F01636 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
+		6FE757A020D5AB8000F01636 /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
+		6FE93FF820D592D8008C9FE4 /* TFLiteBenchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = TFLiteBenchmark.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		6FE93FFB20D592D8008C9FE4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
+		6FE93FFC20D592D8008C9FE4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
+		6FE93FFE20D592D8008C9FE4 /* BenchmarkViewController.h */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.h; path = BenchmarkViewController.h; sourceTree = "<group>"; };
+		6FE93FFF20D592D8008C9FE4 /* BenchmarkViewController.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = BenchmarkViewController.mm; sourceTree = "<group>"; };
+		6FE9400220D592D8008C9FE4 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
+		6FE9400420D592DA008C9FE4 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		6FE9400920D592DA008C9FE4 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		6FE9400A20D592DA008C9FE4 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		6FE93FF520D592D8008C9FE4 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				6FE7579F20D5A6A700F01636 /* Accelerate.framework in Frameworks */,
+				6FE7579D20D5A5E000F01636 /* benchmark-lib.a in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		6FE7579820D59C8B00F01636 /* benchmark_data */ = {
+			isa = PBXGroup;
+			children = (
+				6FE757A020D5AB8000F01636 /* mobilenet_v1_1.0_224.tflite */,
+				6FE7579920D59CE500F01636 /* benchmark_params.json */,
+			);
+			path = benchmark_data;
+			sourceTree = "<group>";
+		};
+		6FE7579B20D5A5E000F01636 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				6FE7579E20D5A6A700F01636 /* Accelerate.framework */,
+				6FE7579C20D5A5E000F01636 /* benchmark-lib.a */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+		6FE93FEF20D592D8008C9FE4 = {
+			isa = PBXGroup;
+			children = (
+				6FE93FFA20D592D8008C9FE4 /* TFLiteBenchmark */,
+				6FE93FF920D592D8008C9FE4 /* Products */,
+				6FE7579B20D5A5E000F01636 /* Frameworks */,
+			);
+			sourceTree = "<group>";
+		};
+		6FE93FF920D592D8008C9FE4 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				6FE93FF820D592D8008C9FE4 /* TFLiteBenchmark.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		6FE93FFA20D592D8008C9FE4 /* TFLiteBenchmark */ = {
+			isa = PBXGroup;
+			children = (
+				6FE7579820D59C8B00F01636 /* benchmark_data */,
+				6FE93FFB20D592D8008C9FE4 /* AppDelegate.h */,
+				6FE93FFC20D592D8008C9FE4 /* AppDelegate.m */,
+				6FE93FFE20D592D8008C9FE4 /* BenchmarkViewController.h */,
+				6FE93FFF20D592D8008C9FE4 /* BenchmarkViewController.mm */,
+				6FE9400120D592D8008C9FE4 /* Main.storyboard */,
+				6FE9400420D592DA008C9FE4 /* Assets.xcassets */,
+				6FE9400920D592DA008C9FE4 /* Info.plist */,
+				6FE9400A20D592DA008C9FE4 /* main.m */,
+			);
+			path = TFLiteBenchmark;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		6FE93FF720D592D8008C9FE4 /* TFLiteBenchmark */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 6FE9400E20D592DA008C9FE4 /* Build configuration list for PBXNativeTarget "TFLiteBenchmark" */;
+			buildPhases = (
+				6FE93FF420D592D8008C9FE4 /* Sources */,
+				6FE93FF520D592D8008C9FE4 /* Frameworks */,
+				6FE93FF620D592D8008C9FE4 /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = TFLiteBenchmark;
+			productName = TFLiteBenchmark;
+			productReference = 6FE93FF820D592D8008C9FE4 /* TFLiteBenchmark.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		6FE93FF020D592D8008C9FE4 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastUpgradeCheck = 1000;
+				ORGANIZATIONNAME = Example;
+				TargetAttributes = {
+					6FE93FF720D592D8008C9FE4 = {
+						CreatedOnToolsVersion = 10.0;
+					};
+				};
+			};
+			buildConfigurationList = 6FE93FF320D592D8008C9FE4 /* Build configuration list for PBXProject "TFLiteBenchmark" */;
+			compatibilityVersion = "Xcode 9.3";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 6FE93FEF20D592D8008C9FE4;
+			productRefGroup = 6FE93FF920D592D8008C9FE4 /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				6FE93FF720D592D8008C9FE4 /* TFLiteBenchmark */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		6FE93FF620D592D8008C9FE4 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				6FE757A120D5AB8100F01636 /* mobilenet_v1_1.0_224.tflite in Resources */,
+				6FE9400520D592DA008C9FE4 /* Assets.xcassets in Resources */,
+				6FE9400320D592D8008C9FE4 /* Main.storyboard in Resources */,
+				6FE7579A20D59CE500F01636 /* benchmark_params.json in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		6FE93FF420D592D8008C9FE4 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				6FE9400020D592D8008C9FE4 /* BenchmarkViewController.mm in Sources */,
+				6FE9400B20D592DA008C9FE4 /* main.m in Sources */,
+				6FE93FFD20D592D8008C9FE4 /* AppDelegate.m in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin PBXVariantGroup section */
+		6FE9400120D592D8008C9FE4 /* Main.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				6FE9400220D592D8008C9FE4 /* Base */,
+			);
+			name = Main.storyboard;
+			sourceTree = "<group>";
+		};
+/* End PBXVariantGroup section */
+
+/* Begin XCBuildConfiguration section */
+		6FE9400C20D592DA008C9FE4 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.0;
+				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
+				ONLY_ACTIVE_ARCH = YES;
+				OTHER_CFLAGS = "";
+				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
+				SDKROOT = iphoneos;
+			};
+			name = Debug;
+		};
+		6FE9400D20D592DA008C9FE4 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.0;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				OTHER_CFLAGS = "";
+				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
+				SDKROOT = iphoneos;
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+		6FE9400F20D592DA008C9FE4 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				"HEADER_SEARCH_PATHS[arch=*]" = (
+					$SRCROOT/../../../../../../,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/eigen,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/gemmlowp,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/neon_2_sse,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/farmhash/src,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/flatbuffers/include,
+				);
+				INFOPLIST_FILE = TFLiteBenchmark/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				"LIBRARY_SEARCH_PATHS[arch=*]" = $SRCROOT/../../../../../../tensorflow/lite/tools/make/gen/lib;
+				PRODUCT_BUNDLE_IDENTIFIER = example.TFLiteBenchmark;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				TARGETED_DEVICE_FAMILY = "1,2";
+				"USER_HEADER_SEARCH_PATHS[arch=*]" = "";
+			};
+			name = Debug;
+		};
+		6FE9401020D592DA008C9FE4 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				"HEADER_SEARCH_PATHS[arch=*]" = (
+					$SRCROOT/../../../../../../,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/eigen,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/gemmlowp,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/neon_2_sse,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/farmhash/src,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/flatbuffers/include,
+				);
+				INFOPLIST_FILE = TFLiteBenchmark/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				"LIBRARY_SEARCH_PATHS[arch=*]" = $SRCROOT/../../../../../../tensorflow/lite/tools/make/gen/lib;
+				PRODUCT_BUNDLE_IDENTIFIER = example.TFLiteBenchmark;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		6FE93FF320D592D8008C9FE4 /* Build configuration list for PBXProject "TFLiteBenchmark" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				6FE9400C20D592DA008C9FE4 /* Debug */,
+				6FE9400D20D592DA008C9FE4 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		6FE9400E20D592DA008C9FE4 /* Build configuration list for PBXNativeTarget "TFLiteBenchmark" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				6FE9400F20D592DA008C9FE4 /* Debug */,
+				6FE9401020D592DA008C9FE4 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 6FE93FF020D592D8008C9FE4 /* Project object */;
+}
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/AppDelegate.h b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/AppDelegate.h
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/AppDelegate.h
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/AppDelegate.h
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/AppDelegate.m b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/AppDelegate.m
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/AppDelegate.m
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/AppDelegate.m
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Assets.xcassets/AppIcon.appiconset/Contents.json b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Assets.xcassets/AppIcon.appiconset/Contents.json
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Assets.xcassets/AppIcon.appiconset/Contents.json
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Assets.xcassets/AppIcon.appiconset/Contents.json
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Assets.xcassets/Contents.json b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Assets.xcassets/Contents.json
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Assets.xcassets/Contents.json
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Assets.xcassets/Contents.json
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Base.lproj/LaunchScreen.storyboard b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Base.lproj/LaunchScreen.storyboard
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Base.lproj/LaunchScreen.storyboard
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Base.lproj/LaunchScreen.storyboard
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Base.lproj/Main.storyboard b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Base.lproj/Main.storyboard
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Base.lproj/Main.storyboard
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Base.lproj/Main.storyboard
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.h b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.h
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.h
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.h
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.mm b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.mm
similarity index 97%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.mm
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.mm
index 356d5b0e17abc715de9b8f7a20ec7459f3468da1..590c215f51546f04475f0f84828058b33dfe187d 100644
--- a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.mm
+++ b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/BenchmarkViewController.mm
@@ -18,8 +18,8 @@
 #import <sstream>
 #import <string>
 #import <vector>
-#import "tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h"
-#import "tensorflow/contrib/lite/tools/benchmark/logging.h"
+#import "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
+#import "tensorflow/lite/tools/benchmark/logging.h"
 
 namespace {
 NSString* FilePathForResourceName(NSString* filename) {
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Info.plist b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Info.plist
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Info.plist
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/Info.plist
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/benchmark_data/benchmark_params.json b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/benchmark_data/benchmark_params.json
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/benchmark_data/benchmark_params.json
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/benchmark_data/benchmark_params.json
diff --git a/tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/main.m b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/main.m
similarity index 100%
rename from tensorflow/contrib/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/main.m
rename to tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark/main.m
diff --git a/tensorflow/contrib/lite/tools/benchmark/logging.h b/tensorflow/lite/tools/benchmark/logging.h
similarity index 92%
rename from tensorflow/contrib/lite/tools/benchmark/logging.h
rename to tensorflow/lite/tools/benchmark/logging.h
index 4045d1e7311512ee56f60601b3ddb0560ba1bffa..71dd511a080ecc6297daa9adb863c543a042dd88 100644
--- a/tensorflow/contrib/lite/tools/benchmark/logging.h
+++ b/tensorflow/lite/tools/benchmark/logging.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_LOGGING_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_LOGGING_H_
+#ifndef TENSORFLOW_LITE_TOOLS_BENCHMARK_LOGGING_H_
+#define TENSORFLOW_LITE_TOOLS_BENCHMARK_LOGGING_H_
 
 // LOG and CHECK macros for benchmarks.
 
@@ -73,4 +73,4 @@ class LoggingWrapper {
 
 #define TFLITE_BENCHMARK_CHECK_EQ(a, b) TFLITE_BENCHMARK_CHECK(a == b)
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_LOGGING_H_
+#endif  // TENSORFLOW_LITE_TOOLS_BENCHMARK_LOGGING_H_
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration.cc b/tensorflow/lite/tools/gen_op_registration.cc
similarity index 93%
rename from tensorflow/contrib/lite/tools/gen_op_registration.cc
rename to tensorflow/lite/tools/gen_op_registration.cc
index d80ea59170b4edc67ca45a4410890f60cf5259e7..ca66eef46607ae43c7c1c0ec2de6098ca4c71e14 100644
--- a/tensorflow/contrib/lite/tools/gen_op_registration.cc
+++ b/tensorflow/lite/tools/gen_op_registration.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <vector>
 
 #include "re2/re2.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/tools/gen_op_registration.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/gen_op_registration.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration.h b/tensorflow/lite/tools/gen_op_registration.h
similarity index 82%
rename from tensorflow/contrib/lite/tools/gen_op_registration.h
rename to tensorflow/lite/tools/gen_op_registration.h
index 5f2ac6ca97fde9a2fe6f4bcf20184f6ef6606f0b..a616720c934b9ed302defd139a4e48d94496d61d 100644
--- a/tensorflow/contrib/lite/tools/gen_op_registration.h
+++ b/tensorflow/lite/tools/gen_op_registration.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#ifndef TENSORFLOW_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#define TENSORFLOW_LITE_TOOLS_GEN_OP_REGISTRATION_H_
 
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/string.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 
@@ -36,4 +36,4 @@ void ReadOpsFromModel(const ::tflite::Model* model,
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#endif  // TENSORFLOW_LITE_TOOLS_GEN_OP_REGISTRATION_H_
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration_main.cc b/tensorflow/lite/tools/gen_op_registration_main.cc
similarity index 98%
rename from tensorflow/contrib/lite/tools/gen_op_registration_main.cc
rename to tensorflow/lite/tools/gen_op_registration_main.cc
index f7df80821fc383063c6e19148bfb13801368b334..090b709478d7e73a731a6e87fbdcb1445fa84c8a 100644
--- a/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
+++ b/tensorflow/lite/tools/gen_op_registration_main.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/strip.h"
-#include "tensorflow/contrib/lite/tools/gen_op_registration.h"
+#include "tensorflow/lite/tools/gen_op_registration.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration_test.cc b/tensorflow/lite/tools/gen_op_registration_test.cc
similarity index 88%
rename from tensorflow/contrib/lite/tools/gen_op_registration_test.cc
rename to tensorflow/lite/tools/gen_op_registration_test.cc
index 28a98d68ab23a558a682dd6debb6081f2a1640dc..0ae91018ddf3db0ae0e36fd78545c72cd6dba2ca 100644
--- a/tensorflow/contrib/lite/tools/gen_op_registration_test.cc
+++ b/tensorflow/lite/tools/gen_op_registration_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/gen_op_registration.h"
+#include "tensorflow/lite/tools/gen_op_registration.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
@@ -43,25 +43,25 @@ TEST_F(GenOpRegistrationTest, TestNonExistantFiles) {
 }
 
 TEST_F(GenOpRegistrationTest, TestModels) {
-  ReadOps("tensorflow/contrib/lite/testdata/test_model.bin");
+  ReadOps("tensorflow/lite/testdata/test_model.bin");
   EXPECT_THAT(builtin_ops_, ElementsAreArray({"CONV_2D"}));
   EXPECT_THAT(custom_ops_, ElementsAreArray({"testing_op"}));
 }
 
 TEST_F(GenOpRegistrationTest, TestEmptyModels) {
-  ReadOps("tensorflow/contrib/lite/testdata/empty_model.bin");
+  ReadOps("tensorflow/lite/testdata/empty_model.bin");
   EXPECT_EQ(builtin_ops_.size(), 0);
   EXPECT_EQ(custom_ops_.size(), 0);
 }
 
 TEST_F(GenOpRegistrationTest, TestZeroSubgraphs) {
-  ReadOps("tensorflow/contrib/lite/testdata/0_subgraphs.bin");
+  ReadOps("tensorflow/lite/testdata/0_subgraphs.bin");
   EXPECT_EQ(builtin_ops_.size(), 0);
   EXPECT_EQ(custom_ops_.size(), 0);
 }
 
 TEST_F(GenOpRegistrationTest, TestBrokenMmap) {
-  ReadOps("tensorflow/contrib/lite/testdata/test_model_broken.bin");
+  ReadOps("tensorflow/lite/testdata/test_model_broken.bin");
   EXPECT_EQ(builtin_ops_.size(), 0);
   EXPECT_EQ(custom_ops_.size(), 0);
 }
diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..994f660dba7742de162525dcf6a8c6a288ee71c6
--- /dev/null
+++ b/tensorflow/lite/tools/make/Makefile
@@ -0,0 +1,234 @@
+# Find where we're running from, so we can store generated files here.
+ifeq ($(origin MAKEFILE_DIR), undefined)
+	MAKEFILE_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+endif
+
+# Try to figure out the host system
+HOST_OS :=
+ifeq ($(OS),Windows_NT)
+	HOST_OS = windows
+else
+	UNAME_S := $(shell uname -s)
+	ifeq ($(UNAME_S),Linux)
+		HOST_OS := linux
+	endif
+	ifeq ($(UNAME_S),Darwin)
+		HOST_OS := osx
+	endif
+endif
+
+HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32; else echo $(shell uname -m); fi)
+
+# Override these on the make command line to target a specific architecture. For example:
+# make -f tensorflow/lite/Makefile TARGET=rpi TARGET_ARCH=armv7l
+TARGET := $(HOST_OS)
+TARGET_ARCH := $(HOST_ARCH)
+
+INCLUDES := \
+-I. \
+-I$(MAKEFILE_DIR)/../../../../../ \
+-I$(MAKEFILE_DIR)/../../../../../../ \
+-I$(MAKEFILE_DIR)/downloads/ \
+-I$(MAKEFILE_DIR)/downloads/eigen \
+-I$(MAKEFILE_DIR)/downloads/absl \
+-I$(MAKEFILE_DIR)/downloads/gemmlowp \
+-I$(MAKEFILE_DIR)/downloads/neon_2_sse \
+-I$(MAKEFILE_DIR)/downloads/farmhash/src \
+-I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
+-I$(OBJDIR)
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+INCLUDES += -I/usr/local/include
+
+# These are the default libraries needed, but they can be added to or
+# overridden by the platform-specific settings in target makefiles.
+LIBS := \
+-lstdc++ \
+-lpthread \
+-lm \
+-lz
+
+# There are no rules for compiling objects for the host system (since we don't
+# generate things like the protobuf compiler that require that), so all of
+# these settings are for the target compiler.
+CXXFLAGS := -O3 -DNDEBUG
+CXXFLAGS += $(EXTRA_CXXFLAGS)
+CCFLAGS := ${CXXFLAGS}
+CXXFLAGS += --std=c++11
+CFLAGS :=
+LDOPTS := -L/usr/local/lib
+ARFLAGS := -r
+TARGET_TOOLCHAIN_PREFIX :=
+CC_PREFIX :=
+
+# This library is the main target for this makefile. It will contain a minimal
+# runtime that can be linked in to other programs.
+LIB_NAME := libtensorflow-lite.a
+
+# Benchmark static library and binary
+BENCHMARK_LIB_NAME := benchmark-lib.a
+BENCHMARK_BINARY_NAME := benchmark_model
+
+# A small example program that shows how to link against the library.
+MINIMAL_SRCS := \
+tensorflow/lite/examples/minimal/minimal.cc
+
+# What sources we want to compile, must be kept in sync with the main Bazel
+# build files.
+
+PROFILER_SRCS := \
+	tensorflow/lite/profiling/time.cc
+PROFILE_SUMMARIZER_SRCS := \
+	tensorflow/lite/profiling/profile_summarizer.cc \
+	tensorflow/core/util/stats_calculator.cc
+
+CORE_CC_ALL_SRCS := \
+$(wildcard tensorflow/lite/*.cc) \
+$(wildcard tensorflow/lite/*.c) \
+$(wildcard tensorflow/lite/c/*.c) \
+$(wildcard tensorflow/lite/core/*.cc) \
+$(wildcard tensorflow/lite/core/api/*.cc)
+ifneq ($(BUILD_TYPE),micro)
+CORE_CC_ALL_SRCS += \
+$(wildcard tensorflow/lite/kernels/*.cc) \
+$(wildcard tensorflow/lite/kernels/internal/*.cc) \
+$(wildcard tensorflow/lite/kernels/internal/optimized/*.cc) \
+$(wildcard tensorflow/lite/kernels/internal/reference/*.cc) \
+$(PROFILER_SRCS) \
+$(wildcard tensorflow/lite/kernels/*.c) \
+$(wildcard tensorflow/lite/kernels/internal/*.c) \
+$(wildcard tensorflow/lite/kernels/internal/optimized/*.c) \
+$(wildcard tensorflow/lite/kernels/internal/reference/*.c) \
+$(wildcard tensorflow/lite/tools/make/downloads/farmhash/src/farmhash.cc) \
+$(wildcard tensorflow/lite/tools/make/downloads/fft2d/fftsg.c)
+endif
+# Remove any duplicates.
+CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
+CORE_CC_EXCLUDE_SRCS := \
+$(wildcard tensorflow/lite/*test.cc) \
+$(wildcard tensorflow/lite/*/*test.cc) \
+$(wildcard tensorflow/lite/*/*/*test.cc) \
+$(wildcard tensorflow/lite/*/*/*/*test.cc) \
+$(wildcard tensorflow/lite/kernels/test_util.cc) \
+$(MINIMAL_SRCS)
+ifeq ($(BUILD_TYPE),micro)
+CORE_CC_EXCLUDE_SRCS += \
+tensorflow/lite/mmap_allocation.cc \
+tensorflow/lite/nnapi_delegate.cc
+else
+CORE_CC_EXCLUDE_SRCS += \
+tensorflow/lite/mmap_allocation_disabled.cc \
+tensorflow/lite/nnapi_delegate_disabled.cc
+endif
+# Filter out all the excluded files.
+TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
+
+# Benchmark sources
+BENCHMARK_SRCS_DIR := tensorflow/lite/tools/benchmark
+BENCHMARK_ALL_SRCS := $(TFLITE_CC_SRCS) \
+	$(wildcard $(BENCHMARK_SRCS_DIR)/*.cc) \
+	$(PROFILE_SUMMARIZER_SRCS)
+
+BENCHMARK_SRCS := $(filter-out \
+	$(wildcard $(BENCHMARK_SRCS_DIR)/*_test.cc), \
+    $(BENCHMARK_ALL_SRCS))
+
+# These target-specific makefiles should modify or replace options like
+# CXXFLAGS or LIBS to work for a specific targetted architecture. All logic
+# based on platforms or architectures should happen within these files, to
+# keep this main makefile focused on the sources and dependencies.
+include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
+
+ALL_SRCS := \
+	$(MINIMAL_SRCS) \
+	$(PROFILER_SRCS) \
+	$(PROFILER_SUMMARY_SRCS) \
+	$(TF_LITE_CC_SRCS) \
+	$(BENCHMARK_SRCS)
+
+# Where compiled objects are stored.
+GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
+OBJDIR := $(GENDIR)obj/
+BINDIR := $(GENDIR)bin/
+LIBDIR := $(GENDIR)lib/
+
+LIB_PATH := $(LIBDIR)$(LIB_NAME)
+BENCHMARK_LIB := $(LIBDIR)$(BENCHMARK_LIB_NAME)
+BENCHMARK_BINARY := $(BINDIR)$(BENCHMARK_BINARY_NAME)
+MINIMAL_BINARY := $(BINDIR)minimal
+
+CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
+CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
+AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
+
+MINIMAL_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MINIMAL_SRCS))))
+
+LIB_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(TF_LITE_CC_SRCS))))
+
+BENCHMARK_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(BENCHMARK_SRCS))))
+
+# For normal manually-created TensorFlow C++ source files.
+$(OBJDIR)%.o: %.cc
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+# For normal manually-created TensorFlow C++ source files.
+$(OBJDIR)%.o: %.c
+	@mkdir -p $(dir $@)
+	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
+
+# The target that's compiled if there's no command-line arguments.
+all: $(LIB_PATH)  $(MINIMAL_BINARY) $(BENCHMARK_BINARY)
+
+# The target that's compiled for micro-controllers
+micro: $(LIB_PATH)
+
+# Hack for generating schema file bypassing flatbuffer parsing
+tensorflow/lite/schema/schema_generated.h:
+	@cp -u tensorflow/lite/schema/schema_generated.h.OPENSOURCE tensorflow/lite/schema/schema_generated.h
+
+# Gathers together all the objects we've compiled into a single '.a' archive.
+$(LIB_PATH): tensorflow/lite/schema/schema_generated.h $(LIB_OBJS)
+	@mkdir -p $(dir $@)
+	$(AR) $(ARFLAGS) $(LIB_PATH) $(LIB_OBJS)
+
+$(MINIMAL_BINARY): $(MINIMAL_OBJS) $(LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(MINIMAL_BINARY) $(MINIMAL_OBJS) \
+	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
+
+$(BENCHMARK_LIB) : $(LIB_PATH) $(BENCHMARK_OBJS)
+	@mkdir -p $(dir $@)
+	$(AR) $(ARFLAGS) $(BENCHMARK_LIB) $(LIB_OBJS) $(BENCHMARK_OBJS)
+
+benchmark_lib: $(BENCHMARK_LIB)
+
+$(BENCHMARK_BINARY) : $(BENCHMARK_LIB)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(BENCHMARK_BINARY) \
+	$(LIBFLAGS) $(BENCHMARK_LIB) $(LDFLAGS) $(LIBS)
+
+benchmark: $(BENCHMARK_BINARY)
+
+libdir:
+	@echo $(LIBDIR)
+
+# Gets rid of all generated files.
+clean:
+	rm -rf $(MAKEFILE_DIR)/gen
+
+# Gets rid of target files only, leaving the host alone. Also leaves the lib
+# directory untouched deliberately, so we can persist multiple architectures
+# across builds for iOS and Android.
+cleantarget:
+	rm -rf $(OBJDIR)
+	rm -rf $(BINDIR)
+
+$(DEPDIR)/%.d: ;
+.PRECIOUS: $(DEPDIR)/%.d
+
+-include $(patsubst %,$(DEPDIR)/%.d,$(basename $(ALL_SRCS)))
diff --git a/tensorflow/lite/tools/make/build_ios_universal_lib.sh b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8b617ef5937a062261ee23bed3cfd1f40e6a3995
--- /dev/null
+++ b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
@@ -0,0 +1,56 @@
+#!/bin/bash -x
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR/../../../.."
+
+usage() {
+  echo "Usage: $(basename "$0") [-a]"
+  echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64 arm64)"
+  echo "  default is [x86_64 armv7 armv7s arm64]"
+  echo "-p enable profiling"
+  exit 1
+}
+
+profiling_args=""
+BUILD_ARCHS="x86_64 armv7 armv7s arm64"
+while getopts "a:p" opt_name; do
+  case "$opt_name" in
+    a) BUILD_ARCHS="${OPTARG}";;
+    p) profiling_args='-DGEMMLOWP_PROFILING,-DTFLITE_PROFILING_ENABLED';;
+    *) usage;;
+  esac
+done
+shift $(($OPTIND - 1))
+
+# Build library for supported architectures and packs them in a fat binary.
+make_library() {
+    LIBS=""
+    for arch in $BUILD_ARCHS
+    do
+        make -f tensorflow/lite/tools/make/Makefile TARGET=ios TARGET_ARCH=${arch} \
+            EXTRA_CXXFLAGS=$profiling_args -j 8
+        LIBS="${LIBS} tensorflow/lite/tools/make/gen/ios_${arch}/lib/${1}"
+    done
+    mkdir -p tensorflow/lite/tools/make/gen/lib
+    lipo $LIBS -create \
+    -output tensorflow/lite/tools/make/gen/lib/${1}
+}
+
+make_library libtensorflow-lite.a
+make_library benchmark-lib.a
diff --git a/tensorflow/contrib/lite/tools/make/build_rpi_lib.sh b/tensorflow/lite/tools/make/build_rpi_lib.sh
similarity index 83%
rename from tensorflow/contrib/lite/tools/make/build_rpi_lib.sh
rename to tensorflow/lite/tools/make/build_rpi_lib.sh
index 24ecd4356df12c25dbdbf81684b7de128e8d11f4..1521bb39332bd44ecc6cf0f6a2910c7f0711a123 100755
--- a/tensorflow/contrib/lite/tools/make/build_rpi_lib.sh
+++ b/tensorflow/lite/tools/make/build_rpi_lib.sh
@@ -17,6 +17,6 @@
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR/../../../../.."
+cd "$SCRIPT_DIR/../../../.."
 
-CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/contrib/lite/tools/make/Makefile TARGET=rpi TARGET_ARCH=armv7l
+CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/lite/tools/make/Makefile TARGET=rpi TARGET_ARCH=armv7l
diff --git a/tensorflow/lite/tools/make/download_dependencies.sh b/tensorflow/lite/tools/make/download_dependencies.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fa3d5d3d3b6657ff327dd6ec34bd65823da13cd2
--- /dev/null
+++ b/tensorflow/lite/tools/make/download_dependencies.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$SCRIPT_DIR/../../../.."
+
+DOWNLOADS_DIR=tensorflow/lite/tools/make/downloads
+BZL_FILE_PATH=tensorflow/workspace.bzl
+
+# Ensure it is being run from repo root
+if [ ! -f $BZL_FILE_PATH ]; then
+  echo "Could not find ${BZL_FILE_PATH}":
+  echo "Likely you are not running this from the root directory of the repository.";
+  exit 1;
+fi
+
+EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
+GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
+ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
+NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
+FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
+FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz"
+FFT2D_URL="https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz"
+
+# TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
+#                   so work around it by patching the source.
+replace_by_sed() {
+  local regex="${1}"
+  shift
+  # Detect the version of sed by the return value of "--version" flag. GNU-sed
+  # supports "--version" while BSD-sed doesn't.
+  if ! sed --version >/dev/null 2>&1; then
+    # BSD-sed.
+    sed -i '' -e "${regex}" "$@"
+  else
+    # GNU-sed.
+    sed -i -e "${regex}" "$@"
+  fi
+}
+
+download_and_extract() {
+  local usage="Usage: download_and_extract URL DIR"
+  local url="${1:?${usage}}"
+  local dir="${2:?${usage}}"
+  echo "downloading ${url}" >&2
+  mkdir -p "${dir}"
+  if [[ "${url}" == *gz ]]; then
+    curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
+  elif [[ "${url}" == *zip ]]; then
+    tempdir=$(mktemp -d)
+    tempdir2=$(mktemp -d)
+
+    curl -L ${url} > ${tempdir}/zipped.zip
+    unzip ${tempdir}/zipped.zip -d ${tempdir2}
+
+    # If the zip file contains nested directories, extract the files from the
+    # inner directory.
+    if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
+      # unzip has no strip components, so unzip to a temp dir, and move the
+      # files we want from the tempdir to destination.
+      cp -R ${tempdir2}/*/* ${dir}/
+    else
+      cp -R ${tempdir2}/* ${dir}/
+    fi
+    rm -rf ${tempdir2} ${tempdir}
+  fi
+
+  # Delete any potential BUILD files, which would interfere with Bazel builds.
+  find "${dir}" -type f -name '*BUILD' -delete
+}
+
+download_and_extract "${EIGEN_URL}" "${DOWNLOADS_DIR}/eigen"
+download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
+download_and_extract "${GOOGLETEST_URL}" "${DOWNLOADS_DIR}/googletest"
+download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
+download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
+download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
+download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
+
+replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
+  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
+replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#static uint32x2_t p2ui_CONJ_XOR;// = vld1_u32( conj_XOR_DATA ); - Removed by scripts#' \
+  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
+replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \
+  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
+
+echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/lite/tools/make/targets/ios_makefile.inc b/tensorflow/lite/tools/make/targets/ios_makefile.inc
similarity index 96%
rename from tensorflow/contrib/lite/tools/make/targets/ios_makefile.inc
rename to tensorflow/lite/tools/make/targets/ios_makefile.inc
index 7f36b8ecef4715a4b89e74bd9ef17d28bbf72ae2..ae9276f9a6382b744801b01eec031cf9a6047398 100644
--- a/tensorflow/contrib/lite/tools/make/targets/ios_makefile.inc
+++ b/tensorflow/lite/tools/make/targets/ios_makefile.inc
@@ -22,7 +22,7 @@ ifeq ($(TARGET), ios)
 	TARGET_ARCH := x86_64
 	CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
 		-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
-		-DTFLITE_USE_APPLE_ACCELERATE_FOR_CONV \
+		-DTF_LITE_USE_CBLAS \
 		-fembed-bitcode \
 		-Wno-c++11-narrowing \
 		-mno-thumb \
diff --git a/tensorflow/contrib/lite/tools/make/targets/linux_makefile.inc b/tensorflow/lite/tools/make/targets/linux_makefile.inc
similarity index 100%
rename from tensorflow/contrib/lite/tools/make/targets/linux_makefile.inc
rename to tensorflow/lite/tools/make/targets/linux_makefile.inc
diff --git a/tensorflow/contrib/lite/tools/make/targets/riscv_makefile.inc b/tensorflow/lite/tools/make/targets/riscv_makefile.inc
similarity index 100%
rename from tensorflow/contrib/lite/tools/make/targets/riscv_makefile.inc
rename to tensorflow/lite/tools/make/targets/riscv_makefile.inc
diff --git a/tensorflow/contrib/lite/tools/make/targets/rpi_makefile.inc b/tensorflow/lite/tools/make/targets/rpi_makefile.inc
similarity index 100%
rename from tensorflow/contrib/lite/tools/make/targets/rpi_makefile.inc
rename to tensorflow/lite/tools/make/targets/rpi_makefile.inc
diff --git a/tensorflow/contrib/lite/tools/make/targets/stm32f1_makefile.inc b/tensorflow/lite/tools/make/targets/stm32f1_makefile.inc
similarity index 100%
rename from tensorflow/contrib/lite/tools/make/targets/stm32f1_makefile.inc
rename to tensorflow/lite/tools/make/targets/stm32f1_makefile.inc
diff --git a/tensorflow/contrib/lite/tools/make/targets/stm32f7_makefile.inc b/tensorflow/lite/tools/make/targets/stm32f7_makefile.inc
similarity index 100%
rename from tensorflow/contrib/lite/tools/make/targets/stm32f7_makefile.inc
rename to tensorflow/lite/tools/make/targets/stm32f7_makefile.inc
diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0a0d5cc4123ba64c7208c5e74344248b28af6851
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/BUILD
@@ -0,0 +1,25 @@
+# TODO(suharshs): Write quantize_weights tests that use small exportable files.
+# Then we can remove this file.
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+
+cc_library(
+    name = "quantize_weights",
+    srcs = ["quantize_weights.cc"],
+    hdrs = ["quantize_weights.h"],
+    deps = [
+        "//tensorflow/core:tflite_portable_logging",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
diff --git a/tensorflow/contrib/lite/tools/optimize/g3doc/quantize_weights.md b/tensorflow/lite/tools/optimize/g3doc/quantize_weights.md
similarity index 96%
rename from tensorflow/contrib/lite/tools/optimize/g3doc/quantize_weights.md
rename to tensorflow/lite/tools/optimize/g3doc/quantize_weights.md
index 93fe576583eaaf43e6fae8a63f4480dae59c3568..cea164c38f0d78eb5797a97da51b1e2dee861b29 100644
--- a/tensorflow/contrib/lite/tools/optimize/g3doc/quantize_weights.md
+++ b/tensorflow/lite/tools/optimize/g3doc/quantize_weights.md
@@ -3,7 +3,7 @@
 ## Recommended usage
 
 The Quantize Weights transformation is integrated with
-[tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md#transformation-flags).
+[tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/convert/cmdline_reference.md#transformation-flags).
 
 The recommended way of invoking this tool is by simply adding the
 `--post_training_quantize` flag to your original tflite_convert invocation. For
diff --git a/tensorflow/contrib/lite/tools/optimize/quantize_weights.cc b/tensorflow/lite/tools/optimize/quantize_weights.cc
similarity index 94%
rename from tensorflow/contrib/lite/tools/optimize/quantize_weights.cc
rename to tensorflow/lite/tools/optimize/quantize_weights.cc
index d02d78bf53a145f6ab9cd61cecb4f8be391a31a6..de3c0b03237c1c85d1cfbeafc2ce8db4faf70ff6 100644
--- a/tensorflow/contrib/lite/tools/optimize/quantize_weights.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/tools/optimize/quantize_weights.h"
+#include "tensorflow/lite/tools/optimize/quantize_weights.h"
 
 #include <algorithm>
 #include <memory>
@@ -21,10 +21,10 @@ limitations under the License.
 
 #include "flatbuffers/flexbuffers.h"
 #include "absl/memory/memory.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tflite {
@@ -110,24 +110,24 @@ std::vector<int32_t> GetWeightInputIndices(const BuiltinOperator& op_code) {
       op_code == BuiltinOperator_EMBEDDING_LOOKUP) {
     return {1};
   } else if (op_code == BuiltinOperator_SVDF) {
-    // https://www.tensorflow.org/code/tensorflow/contrib/lite/kernels/svdf.cc
+    // https://www.tensorflow.org/code/tensorflow/lite/kernels/svdf.cc
     return {1, 2};
   } else if (op_code == BuiltinOperator_LSTM ||
              op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM) {
-    // https://www.tensorflow.org/code/tensorflow/contrib/lite/kernels/lstm.cc
-    // https://www.tensorflow.org/code/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
+    // https://www.tensorflow.org/code/tensorflow/lite/kernels/lstm.cc
+    // https://www.tensorflow.org/code/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
     return {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16};
   } else if (op_code == BuiltinOperator_RNN ||
              op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN) {
-    // https://www.tensorflow.org/code/tensorflow/contrib/lite/kernels/basic_rnn.cc
-    // https://www.tensorflow.org/code/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
+    // https://www.tensorflow.org/code/tensorflow/lite/kernels/basic_rnn.cc
+    // https://www.tensorflow.org/code/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
     return {1, 2};
   } else if (op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM) {
-    // https://www.tensorflow.org/code/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
+    // https://www.tensorflow.org/code/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
     return {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 16,
             18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 33};
   } else if (op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN) {
-    // https://www.tensorflow.org/code/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
+    // https://www.tensorflow.org/code/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
     return {1, 2, 4, 5};
   }
   return {};
@@ -182,8 +182,7 @@ std::vector<TensorInfo> GetQuantizableTensorsFromOperator(
     TensorT* tensor = subgraph->tensors[tensor_idx].get();
     // TODO(suharshs): Support shared weights, i.e. If two tensors share the
     // same weight array, things may break. (i.e. SSD object detection)
-    if (!eval_hybrid &&
-        CountTensorConsumers(model, subgraph, tensor_idx) != 1) {
+    if (CountTensorConsumers(model, subgraph, tensor_idx) != 1) {
       LOG(INFO) << "Skipping quantization of tensor " << tensor->name
                 << " that is shared between multiple multiple operations.";
       continue;
diff --git a/tensorflow/contrib/lite/tools/optimize/quantize_weights.h b/tensorflow/lite/tools/optimize/quantize_weights.h
similarity index 85%
rename from tensorflow/contrib/lite/tools/optimize/quantize_weights.h
rename to tensorflow/lite/tools/optimize/quantize_weights.h
index 706f10b87b166c74d747031a982113fd258616da..c2c0b0ce83435dc423a62cea598e35ba45a0561f 100644
--- a/tensorflow/contrib/lite/tools/optimize/quantize_weights.h
+++ b/tensorflow/lite/tools/optimize/quantize_weights.h
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_OPTIMIZE_QUANTIZE_WEIGHTS_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_OPTIMIZE_QUANTIZE_WEIGHTS_H_
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZE_WEIGHTS_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZE_WEIGHTS_H_
 
 #include <memory>
 #include "flatbuffers/flexbuffers.h"
-#include "tensorflow/contrib/lite/context.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace optimize {
@@ -54,4 +54,4 @@ TfLiteStatus QuantizeWeights(flatbuffers::FlatBufferBuilder* builder,
 }  // namespace optimize
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_OPTIMIZE_QUANTIZE_WEIGHTS_H_
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZE_WEIGHTS_H_
diff --git a/tensorflow/contrib/lite/tools/optimize/quantize_weights_test.cc b/tensorflow/lite/tools/optimize/quantize_weights_test.cc
similarity index 95%
rename from tensorflow/contrib/lite/tools/optimize/quantize_weights_test.cc
rename to tensorflow/lite/tools/optimize/quantize_weights_test.cc
index 387b3471c2c4c59948113cd609941577607fa737..32725e5ee29c364d56754c08a2cb1084ef049fdb 100644
--- a/tensorflow/contrib/lite/tools/optimize/quantize_weights_test.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights_test.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/lite/tools/optimize/quantize_weights.h"
+#include "tensorflow/lite/tools/optimize/quantize_weights.h"
 
 #include <memory>
 
 #include "flatbuffers/flexbuffers.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 namespace optimize {
@@ -160,7 +160,7 @@ class QuantizeWeightsTest : public ::testing::Test {
 
 TEST_F(QuantizeWeightsTest, SimpleTestWithHybrid) {
   string model_path =
-      "third_party/tensorflow/contrib/lite/tools/optimize/testdata/"
+      "third_party/tensorflow/lite/tools/optimize/testdata/"
       "mobilenet_v1_0.25_128.tflite";
   std::unique_ptr<FlatBufferModel> input_fb =
       FlatBufferModel::BuildFromFile(model_path.data());
@@ -177,7 +177,7 @@ TEST_F(QuantizeWeightsTest, SimpleTestWithHybrid) {
 
 TEST_F(QuantizeWeightsTest, SimpleTestWithoutHybrid) {
   string model_path =
-      "third_party/tensorflow/contrib/lite/tools/optimize/testdata/"
+      "third_party/tensorflow/lite/tools/optimize/testdata/"
       "mobilenet_v1_0.25_128.tflite";
   std::unique_ptr<FlatBufferModel> input_fb =
       FlatBufferModel::BuildFromFile(model_path.data());
@@ -195,7 +195,7 @@ TEST_F(QuantizeWeightsTest, SimpleTestWithoutHybrid) {
 
 TEST_F(QuantizeWeightsTest, SimpleTestWithWeightsMinNumElements) {
   string model_path =
-      "third_party/tensorflow/contrib/lite/tools/optimize/testdata/"
+      "third_party/tensorflow/lite/tools/optimize/testdata/"
       "mobilenet_v1_0.25_128.tflite";
   std::unique_ptr<FlatBufferModel> input_fb =
       FlatBufferModel::BuildFromFile(model_path.data());
diff --git a/tensorflow/lite/tools/pip_package/MANIFEST.in b/tensorflow/lite/tools/pip_package/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..bb574e63a372da96841efbc70b8e213a943213c6
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include * *.py
diff --git a/tensorflow/lite/tools/pip_package/README.md b/tensorflow/lite/tools/pip_package/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8190782c39fcb910749fb466b7075dd628cdd554
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/README.md
@@ -0,0 +1,33 @@
+# Building TensorFlow Lite Standalone Pip
+
+Many users would like to deploy TensorFlow lite interpreter and use it from
+Python without requiring the rest of TensorFlow.
+
+## Steps
+
+To build a binary wheel run this script:
+```
+sudo apt install swig libjpeg-dev zlib1g-dev python3-dev python3-numpy
+sh tensorflow/lite/tools/pip_package/build_pip_package.sh
+```
+That will print out some output and a .whl file. You can then install that
+```
+pip install --upgrade <wheel>
+```
+
+Note, unlike tensorflow this will be installed to a tflite_runtime namespace.
+You can then use the Tensorflow Lite interpreter as.
+```
+import tflite_runtime as tflr
+interpreter = tflr.lite.Interpreter(model_path="foo.tflite")
+```
+
+This currently works to build on Linux machines including Raspberry Pi. In
+the future, cross compilation to smaller SOCs like Raspberry Pi from
+bigger host will be supported.
+
+## Caveats
+
+* You cannot use TensorFlow Select ops, only TensorFlow Lite builtins.
+* Currently custom ops and delegates cannot be registered.
+
diff --git a/tensorflow/lite/tools/pip_package/build_pip_package.sh b/tensorflow/lite/tools/pip_package/build_pip_package.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2887ce84712aa75168bd2b5ae77240f25deddf57
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/build_pip_package.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+# Find where this script lives and then the Tensorflow root.
+MY_DIRECTORY=`dirname $0`
+export TENSORFLOW_SRC_ROOT=`realpath $MY_DIRECTORY/../../../..`
+
+export TENSORFLOW_VERSION=`grep "_VERSION = " $TENSORFLOW_SRC_ROOT/tensorflow/tools/pip_package/setup.py  | cut -d'=' -f 2 | sed "s/[ '-]//g"`;
+
+
+# Build a pip build tree.
+BUILD_ROOT=/tmp/tflite_pip
+rm -rf $BUILD_ROOT
+mkdir -p $BUILD_ROOT/tflite_runtime/lite
+mkdir -p $BUILD_ROOT/tflite_runtime/lite/python
+
+# Build an importable module tree
+cat > $BUILD_ROOT/tflite_runtime/__init__.py <<EOF;
+import tflite_runtime.lite.interpreter
+EOF
+
+cat > $BUILD_ROOT/tflite_runtime/lite/__init__.py <<EOF;
+from interpreter import Interpreter as Interpreter
+EOF
+
+cat > $BUILD_ROOT/tflite_runtime/lite/python/__init__.py <<EOF;
+# Python module for TensorFlow Lite
+EOF
+
+# Copy necessary source files
+TFLITE_ROOT=$TENSORFLOW_SRC_ROOT/tensorflow/lite
+cp -r  $TFLITE_ROOT/python/interpreter_wrapper $BUILD_ROOT
+cp $TFLITE_ROOT/python/interpreter.py $BUILD_ROOT/tflite_runtime/lite/
+cp $TFLITE_ROOT/tools/pip_package/setup.py $BUILD_ROOT
+cp $TFLITE_ROOT/tools/pip_package/MANIFEST.in $BUILD_ROOT
+
+# Build the Pip
+cd $BUILD_ROOT
+python setup.py bdist_wheel
diff --git a/tensorflow/lite/tools/pip_package/setup.py b/tensorflow/lite/tools/pip_package/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..64d62ee1f2d5d0cc1fa1d1804c637f8220937128
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/setup.py
@@ -0,0 +1,150 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow Lite is for mobile and embedded devices.
+
+TensorFlow Lite is the official solution for running machine learning models on
+mobile and embedded devices. It enables on-device machine learning inference
+with low latency and a small binary size on Android, iOS, and other operating
+systems.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import multiprocessing
+import os
+import subprocess
+
+from distutils.command.build_ext import build_ext
+import numpy
+
+from setuptools import Extension
+from setuptools import find_packages
+from setuptools import setup
+from setuptools.command.build_py import build_py
+PACKAGE_NAME = 'tflite-runtime'
+PACKAGE_VERSION = os.environ['TENSORFLOW_VERSION']
+DOCLINES = __doc__.split('\n')
+PACKAGE = 'tflite_runtime.lite.python'
+TENSORFLOW_DIR = os.environ['TENSORFLOW_SRC_ROOT']
+
+# Setup cross compiling
+TARGET = (
+    os.environ['TENSORFLOW_TARGET'] if 'TENSORFLOW_TARGET' in os.environ
+    else None)
+if TARGET == 'rpi':
+  os.environ['CXX'] = 'arm-linux-gnueabihf-g++'
+  os.environ['CC'] = 'arm-linux-gnueabihf-g++'
+MAKE_CROSS_OPTIONS = ['TARGET=%s' % TARGET]  if TARGET else []
+
+RELATIVE_MAKE_DIR = os.path.join('tensorflow', 'lite', 'tools', 'make')
+MAKE_DIR = os.path.join(TENSORFLOW_DIR, RELATIVE_MAKE_DIR)
+DOWNLOADS_DIR = os.path.join(MAKE_DIR, 'downloads')
+RELATIVE_MAKEFILE_PATH = os.path.join(RELATIVE_MAKE_DIR, 'Makefile')
+DOWNLOAD_SCRIPT_PATH = os.path.join(MAKE_DIR, 'download_dependencies.sh')
+
+
+def make_args(target='', quiet=True):
+  """Construct make command line."""
+  args = (['make', 'SHELL=/bin/bash', '-C', TENSORFLOW_DIR]
+          + MAKE_CROSS_OPTIONS +
+          ['-f', RELATIVE_MAKEFILE_PATH, '-j',
+           str(multiprocessing.cpu_count())])
+  if quiet:
+    args.append('--quiet')
+  if target:
+    args.append(target)
+  return args
+
+
+def make_output(target):
+  """Invoke make on the target and return output."""
+  return subprocess.check_output(make_args(target)).decode('utf-8').strip()
+
+
+def make():
+  """Invoke make to build tflite C++ sources.
+
+  Build dependencies:
+     apt-get install swig libjpeg-dev zlib1g-dev python3-dev python3-nump
+  """
+  subprocess.check_call(make_args(quiet=False))
+
+
+def download_dependencies():
+  """Download build dependencies if haven't done yet."""
+  if not os.path.isdir(DOWNLOADS_DIR) or not os.listdir(DOWNLOADS_DIR):
+    subprocess.check_call(DOWNLOAD_SCRIPT_PATH)
+
+
+class CustomBuildExt(build_ext, object):
+
+  def run(self):
+    download_dependencies()
+    make()
+
+    return super(CustomBuildExt, self).run()
+
+
+class CustomBuildPy(build_py, object):
+
+  def run(self):
+    self.run_command('build_ext')
+    return super(CustomBuildPy, self).run()
+
+
+LIB_TFLITE = 'tensorflow-lite'
+LIB_TFLITE_DIR = make_output('libdir')
+
+ext = Extension(
+    name='%s._interpreter_wrapper' % PACKAGE,
+    language='c++',
+    sources=['interpreter_wrapper/interpreter_wrapper.i',
+             'interpreter_wrapper/interpreter_wrapper.cc'],
+    swig_opts=['-c++',
+               '-I%s' % TENSORFLOW_DIR,
+               '-module', 'interpreter_wrapper',
+               '-outdir', '.'],
+    extra_compile_args=['-std=c++11'],
+    include_dirs=[TENSORFLOW_DIR,
+                  os.path.join(TENSORFLOW_DIR, 'tensorflow', 'lite', 'tools',
+                               'pip_package'),
+                  numpy.get_include(),
+                  os.path.join(DOWNLOADS_DIR, 'flatbuffers', 'include'),
+                  os.path.join(DOWNLOADS_DIR, 'absl')],
+    libraries=[LIB_TFLITE],
+    library_dirs=[LIB_TFLITE_DIR])
+
+
+setup(
+    name=PACKAGE_NAME,
+    version=PACKAGE_VERSION,
+    description=DOCLINES[0],
+    long_description='\n'.join(DOCLINES[2:]),
+    url='https://www.tensorflow.org/lite/',
+    author='Google Inc.',
+    author_email='opensource@google.com',
+    license='Apache 2.0',
+    include_package_data=True,
+    keywords='tflite tensorflow tensor machine learning',
+    packages=find_packages(exclude=[]),
+    ext_modules=[ext],
+    package_dir={PACKAGE: '.'},
+    cmdclass={
+        'build_ext': CustomBuildExt,
+        'build_py': CustomBuildPy,
+    }
+)
diff --git a/tensorflow/contrib/lite/tools/verifier.cc b/tensorflow/lite/tools/verifier.cc
similarity index 97%
rename from tensorflow/contrib/lite/tools/verifier.cc
rename to tensorflow/lite/tools/verifier.cc
index 8d3a7a624265ca6f9933f36949fd6fdbb3c39c40..02d6e6b23cdd66c9dd87700e4be6bb2cfbee407f 100644
--- a/tensorflow/contrib/lite/tools/verifier.cc
+++ b/tensorflow/lite/tools/verifier.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/verifier.h"
+#include "tensorflow/lite/tools/verifier.h"
 #include <climits>
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/version.h"
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/tools/verifier.h b/tensorflow/lite/tools/verifier.h
similarity index 87%
rename from tensorflow/contrib/lite/tools/verifier.h
rename to tensorflow/lite/tools/verifier.h
index a596c650a0c2533b6ece3cc7c692d863c2d3f860..50b6432d4e3d82fa1e7e01096d84e6be6495ee01 100644
--- a/tensorflow/contrib/lite/tools/verifier.h
+++ b/tensorflow/lite/tools/verifier.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
-#define TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
+#ifndef TENSORFLOW_LITE_TOOLS_VERIFIER_H_
+#define TENSORFLOW_LITE_TOOLS_VERIFIER_H_
 
 #include <stdio.h>
 
-#include "tensorflow/contrib/lite/error_reporter.h"
-#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/lite/error_reporter.h"
+#include "tensorflow/lite/model.h"
 
 namespace tflite {
 
@@ -49,4 +49,4 @@ bool Verify(const void* buf, size_t len, const OpResolver& resolver,
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_VERIFIER_H_
+#endif  // TENSORFLOW_LITE_TOOLS_VERIFIER_H_
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/lite/tools/verifier_test.cc
similarity index 96%
rename from tensorflow/contrib/lite/tools/verifier_test.cc
rename to tensorflow/lite/tools/verifier_test.cc
index ad7d59ecb41a0c81a6a4d8edae5fa6b4b5a7bede..98abafad927ae45cd7de428d0011e234f345dd6e 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/lite/tools/verifier_test.cc
@@ -18,13 +18,13 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"
 #include "flatbuffers/util.h"
 #include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/allocation.h"
-#include "tensorflow/contrib/lite/error_reporter.h"
-#include "tensorflow/contrib/lite/op_resolver.h"
-#include "tensorflow/contrib/lite/schema/schema_generated.h"
-#include "tensorflow/contrib/lite/testing/util.h"
-#include "tensorflow/contrib/lite/tools/verifier.h"
-#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/lite/allocation.h"
+#include "tensorflow/lite/error_reporter.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/tools/verifier.h"
+#include "tensorflow/lite/version.h"
 #include "tensorflow/core/framework/numeric_types.h"
 
 namespace tflite {
diff --git a/tensorflow/contrib/lite/tools/visualize.py b/tensorflow/lite/tools/visualize.py
similarity index 97%
rename from tensorflow/contrib/lite/tools/visualize.py
rename to tensorflow/lite/tools/visualize.py
index d7eea7939933a2d8dbf38bed01af444811ef7920..53bb67e3fedbda28edee9834cd820073f62ae5d7 100644
--- a/tensorflow/contrib/lite/tools/visualize.py
+++ b/tensorflow/lite/tools/visualize.py
@@ -31,15 +31,15 @@ import sys
 from tensorflow.python.platform import resource_loader
 
 # Schema to use for flatbuffers
-_SCHEMA = "third_party/tensorflow/contrib/lite/schema/schema.fbs"
+_SCHEMA = "third_party/tensorflow/lite/schema/schema.fbs"
 
 # TODO(angerson): fix later when rules are simplified..
 _SCHEMA = resource_loader.get_path_to_datafile("../schema/schema.fbs")
-_BINARY = resource_loader.get_path_to_datafile("../../../../flatbuffers/flatc")
+_BINARY = resource_loader.get_path_to_datafile("../../../flatbuffers/flatc")
 # Account for different package positioning internal vs. external.
 if not os.path.exists(_BINARY):
   _BINARY = resource_loader.get_path_to_datafile(
-      "../../../../../flatbuffers/flatc")
+      "../../../../flatbuffers/flatc")
 
 if not os.path.exists(_SCHEMA):
   raise RuntimeError("Sorry, schema file cannot be found at %r" % _SCHEMA)
@@ -339,7 +339,7 @@ def CreateHtmlFile(tflite_input, html_output):
 
   # Spec on what keys to display
   buffer_keys_to_display = [("data", DataSizeMapper())]
-  operator_keys_to_display = [("builtin_code", None)]
+  operator_keys_to_display = [("builtin_code", None), ("custom_code", None)]
 
   for subgraph_idx, g in enumerate(data["subgraphs"]):
     # Subgraph local specs on what to display
diff --git a/tensorflow/contrib/lite/tutorials/BUILD b/tensorflow/lite/tutorials/BUILD
similarity index 100%
rename from tensorflow/contrib/lite/tutorials/BUILD
rename to tensorflow/lite/tutorials/BUILD
diff --git a/tensorflow/contrib/lite/tutorials/dataset.py b/tensorflow/lite/tutorials/dataset.py
similarity index 100%
rename from tensorflow/contrib/lite/tutorials/dataset.py
rename to tensorflow/lite/tutorials/dataset.py
diff --git a/tensorflow/contrib/lite/tutorials/mnist_tflite.py b/tensorflow/lite/tutorials/mnist_tflite.py
similarity index 92%
rename from tensorflow/contrib/lite/tutorials/mnist_tflite.py
rename to tensorflow/lite/tutorials/mnist_tflite.py
index 7b8bf5b5dbc8462d859c189af16c461244bfc374..6cc5846163594d74cfcbd95ab99ddb6a7b67bdf1 100644
--- a/tensorflow/contrib/lite/tutorials/mnist_tflite.py
+++ b/tensorflow/lite/tutorials/mnist_tflite.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 import numpy as np
 import tensorflow as tf  # pylint: disable=g-bad-import-order
-from tensorflow.contrib.lite.tutorials import dataset
+from tensorflow.lite.tutorials import dataset
 flags = tf.app.flags
 
 flags.DEFINE_string('data_dir', '/tmp/data_dir',
@@ -34,8 +34,8 @@ flags = flags.FLAGS
 def test_image_generator():
   # Generates an iterator over images
   with tf.Session() as sess:
-    input_data = dataset.test(
-        flags.data_dir).make_one_shot_iterator().get_next()
+    input_data = tf.compat.v1.data.make_one_shot_iterator(dataset.test(
+        flags.data_dir)).get_next()
     try:
       while True:
         yield sess.run(input_data)
@@ -69,7 +69,7 @@ def run_eval(interpreter, input_image):
 
 
 def main(_):
-  interpreter = tf.contrib.lite.Interpreter(model_path=flags.model_file)
+  interpreter = tf.lite.Interpreter(model_path=flags.model_file)
   interpreter.allocate_tensors()
   num_correct, total = 0, 0
   for input_data in test_image_generator():
diff --git a/tensorflow/contrib/lite/tutorials/post_training_quant.ipynb b/tensorflow/lite/tutorials/post_training_quant.ipynb
similarity index 94%
rename from tensorflow/contrib/lite/tutorials/post_training_quant.ipynb
rename to tensorflow/lite/tutorials/post_training_quant.ipynb
index 80cdb2f080ba51c28b8328bb1f524a807e1e04e9..394ab0760b5672978e0638c0ff01a8f00442302c 100644
--- a/tensorflow/contrib/lite/tutorials/post_training_quant.ipynb
+++ b/tensorflow/lite/tutorials/post_training_quant.ipynb
@@ -19,10 +19,10 @@
       "source": [
         "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
         "  \u003ctd\u003e\n",
-        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/tutorials/post_training_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
         "  \u003c/td\u003e\n",
         "  \u003ctd\u003e\n",
-        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/tutorials/post_training_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
+        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
         "  \u003c/td\u003e\n",
         "\u003c/table\u003e"
       ]
@@ -235,9 +235,9 @@
         "id": "AT8BgkKmljOy"
       },
       "source": [
-        "Using the python `TocoConverter`, the saved model can be converted into a TFLite model.\n",
+        "Using the python `TFLiteConverter`, the saved model can be converted into a TFLite model.\n",
         "\n",
-        "First load the model using the `TocoConverter`:"
+        "First load the model using the `TFLiteConverter`:"
       ]
     },
     {
@@ -252,7 +252,7 @@
       "source": [
         "import tensorflow as tf\n",
         "tf.enable_eager_execution()\n",
-        "converter = tf.contrib.lite.TocoConverter.from_saved_model(saved_model_dir)\n",
+        "converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)\n",
         "tflite_model = converter.convert()"
       ]
     },
@@ -386,7 +386,7 @@
         "images, labels = tf.to_float(mnist_test[0])/255.0, mnist_test[1]\n",
         "\n",
         "# Note: If you change the batch size, then use \n",
-        "# `tf.contrib.lite.Interpreter.resize_tensor_input` to also change it for\n",
+        "# `tf.lite.Interpreter.resize_tensor_input` to also change it for\n",
         "# the interpreter.\n",
         "mnist_ds = tf.data.Dataset.from_tensor_slices((images, labels)).batch(1)"
       ]
@@ -411,7 +411,7 @@
       },
       "outputs": [],
       "source": [
-        "interpreter = tf.contrib.lite.Interpreter(model_path=str(tflite_model_file))\n",
+        "interpreter = tf.lite.Interpreter(model_path=str(tflite_model_file))\n",
         "interpreter.allocate_tensors()\n",
         "input_index = interpreter.get_input_details()[0][\"index\"]\n",
         "output_index = interpreter.get_output_details()[0][\"index\"]"
@@ -428,7 +428,7 @@
       "outputs": [],
       "source": [
         "tf.logging.set_verbosity(tf.logging.DEBUG)\n",
-        "interpreter_quant = tf.contrib.lite.Interpreter(model_path=str(tflite_model_quant_file))"
+        "interpreter_quant = tf.lite.Interpreter(model_path=str(tflite_model_quant_file))"
       ]
     },
     {
@@ -592,7 +592,7 @@
         "\n",
         "We now consider another example. Resnets with pre-activation layers (Resnet-v2) are widely used for vision applications.\n",
         "  Pre-trained frozen graph for resnet-v2-101 is available at the\n",
-        "  [Tensorflow Lite model repository](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/models.md).\n",
+        "  [Tensorflow Lite model repository](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/models.md).\n",
         "\n",
         "We can convert the frozen graph to a TFLite flatbuffer with quantization by:\n"
       ]
@@ -648,7 +648,7 @@
         "graph_def_file = pathlib.Path(archive_path).parent/\"resnet_v2_101_299_frozen.pb\"\n",
         "input_arrays = [\"input\"] \n",
         "output_arrays = [\"output\"]\n",
-        "converter = tf.contrib.lite.TocoConverter.from_frozen_graph(\n",
+        "converter = tf.lite.TFLiteConverter.from_frozen_graph(\n",
         "  str(graph_def_file), input_arrays, output_arrays, input_shapes={\"input\":[1,299,299,3]})\n",
         "converter.post_training_quantize = True\n",
         "resnet_tflite_file = graph_def_file.parent/\"resnet_v2_101_quantized.tflite\"\n",
@@ -678,7 +678,7 @@
       "source": [
         "\n",
         "The model size reduces from 171 MB to 43 MB.\n",
-        "The accuracy of this model on imagenet can be evaluated using the scripts provided for [TFLite accuracy measurement](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/accuracy/ilsvrc).\n",
+        "The accuracy of this model on imagenet can be evaluated using the scripts provided for [TFLite accuracy measurement](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/accuracy/ilsvrc).\n",
         "\n",
         "The optimized model top-1 accuracy is 76.8, the same as the floating point model."
       ]
diff --git a/tensorflow/lite/util.cc b/tensorflow/lite/util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..866e4ebb0aa83ac3a6167de69626d0556ec2a9c5
--- /dev/null
+++ b/tensorflow/lite/util.cc
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/util.h"
+
+#include <cstring>
+
+namespace tflite {
+
+bool IsFlexOp(const char* custom_name) {
+  return custom_name && strncmp(custom_name, kFlexCustomCodePrefix,
+                                strlen(kFlexCustomCodePrefix)) == 0;
+}
+
+TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input) {
+  return ConvertArrayToTfLiteIntArray(input.size(), input.data());
+}
+
+TfLiteIntArray* ConvertArrayToTfLiteIntArray(const int rank, const int* dims) {
+  TfLiteIntArray* output = TfLiteIntArrayCreate(rank);
+  for (size_t i = 0; i < rank; i++) {
+    output->data[i] = dims[i];
+  }
+  return output;
+}
+
+bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
+                                 const int* b) {
+  if (!a) return false;
+  if (a->size != b_size) return false;
+  for (int i = 0; i < a->size; ++i) {
+    if (a->data[i] != b[i]) return false;
+  }
+  return true;
+}
+
+size_t CombineHashes(std::initializer_list<size_t> hashes) {
+  size_t result = 0;
+  // Hash combiner used by TensorFlow core.
+  for (size_t hash : hashes) {
+    result = result ^
+             (hash + 0x9e3779b97f4a7800ULL + (result << 10) + (result >> 4));
+  }
+  return result;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/util.h b/tensorflow/lite/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..dbb87528d06b6719a29b364711a7c62c273fdb34
--- /dev/null
+++ b/tensorflow/lite/util.h
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file provides general C++ utility functions in TFLite.
+// For example: Converting between `TfLiteIntArray`, `std::vector` and
+// Flatbuffer vectors. These functions can't live in `context.h` since it's pure
+// C.
+
+#ifndef TENSORFLOW_LITE_UTIL_H_
+#define TENSORFLOW_LITE_UTIL_H_
+
+#include <vector>
+#include "tensorflow/lite/c/c_api_internal.h"
+
+namespace tflite {
+
+// The prefix of Flex op custom code.
+// This will be matched agains the `custom_code` field in `OperatorCode`
+// Flatbuffer Table.
+// WARNING: This is an experimental API and subject to change.
+constexpr char kFlexCustomCodePrefix[] = "Flex";
+
+// Checks whether the prefix of the custom name indicates the operation is an
+// Flex operation.
+bool IsFlexOp(const char* custom_name);
+
+// Converts a `std::vector` to a `TfLiteIntArray`. The caller takes ownership
+// of the returned pointer.
+TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector<int>& input);
+
+// Converts an array (of the given size) to a `TfLiteIntArray`. The caller
+// takes ownership of the returned pointer, and must make sure 'dims' has at
+// least 'rank' elemnts.
+TfLiteIntArray* ConvertArrayToTfLiteIntArray(const int rank, const int* dims);
+
+// Checks whether a `TfLiteIntArray` and an int array have matching elements.
+// The caller must guarantee that 'b' has at least 'b_size' elements.
+bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
+                                 const int* b);
+
+size_t CombineHashes(std::initializer_list<size_t> hashes);
+
+struct TfLiteIntArrayDeleter {
+  void operator()(TfLiteIntArray* a) {
+    if (a) TfLiteIntArrayFree(a);
+  }
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_UTIL_H_
diff --git a/tensorflow/lite/util_test.cc b/tensorflow/lite/util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..606d24274770d2a778eda2a93aba4690f354009e
--- /dev/null
+++ b/tensorflow/lite/util_test.cc
@@ -0,0 +1,60 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/util.h"
+
+namespace tflite {
+namespace {
+
+TEST(ConvertVectorToTfLiteIntArray, TestWithVector) {
+  std::vector<int> input = {1, 2};
+  TfLiteIntArray* output = ConvertVectorToTfLiteIntArray(input);
+  ASSERT_NE(output, nullptr);
+  EXPECT_EQ(output->size, 2);
+  EXPECT_EQ(output->data[0], 1);
+  EXPECT_EQ(output->data[1], 2);
+  TfLiteIntArrayFree(output);
+}
+
+TEST(ConvertVectorToTfLiteIntArray, TestWithEmptyVector) {
+  std::vector<int> input;
+  TfLiteIntArray* output = ConvertVectorToTfLiteIntArray(input);
+  ASSERT_NE(output, nullptr);
+  EXPECT_EQ(output->size, 0);
+  TfLiteIntArrayFree(output);
+}
+
+TEST(UtilTest, IsFlexOp) {
+  EXPECT_TRUE(IsFlexOp("Flex"));
+  EXPECT_TRUE(IsFlexOp("FlexOp"));
+  EXPECT_FALSE(IsFlexOp("flex"));
+  EXPECT_FALSE(IsFlexOp("Fle"));
+  EXPECT_FALSE(IsFlexOp("OpFlex"));
+  EXPECT_FALSE(IsFlexOp(nullptr));
+  EXPECT_FALSE(IsFlexOp(""));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/version.h b/tensorflow/lite/version.h
similarity index 87%
rename from tensorflow/contrib/lite/version.h
rename to tensorflow/lite/version.h
index efd63f4006ae661c6fdbbaa81cb02fa8947271f3..639d5a336a179462bd7922ecdc4970d267b99b24 100644
--- a/tensorflow/contrib/lite/version.h
+++ b/tensorflow/lite/version.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_LITE_VERSION_H_
-#define TENSORFLOW_CONTRIB_LITE_VERSION_H_
+#ifndef TENSORFLOW_LITE_VERSION_H_
+#define TENSORFLOW_LITE_VERSION_H_
 
 // The version number of the Schema. Ideally all changes will be backward
 // compatible. If that ever changes, we must ensure that version is the first
 // entry in the new tflite root so that we can see that version is not 1.
 #define TFLITE_SCHEMA_VERSION (3)
 
-#endif  // TENSORFLOW_CONTRIB_LITE_VERSION_H_
+#endif  // TENSORFLOW_LITE_VERSION_H_
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
new file mode 100644
index 0000000000000000000000000000000000000000..347dc9fc6b998a8dad8c33b68292f40a7b534457
--- /dev/null
+++ b/tensorflow/opensource_only.files
@@ -0,0 +1,243 @@
+tensorflow/contrib/tpu/profiler/pip_package/BUILD
+tensorflow/contrib/tpu/profiler/pip_package/setup.py
+tensorflow/contrib/tpu/profiler/pip_package/README
+tensorflow/contrib/tpu/profiler/pip_package/build_pip_package.sh
+tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/__init__.py
+tensorflow/contrib/mpi/BUILD
+tensorflow/tools/ci_build/remote/BUILD
+tensorflow/tools/pip_package/README
+tensorflow/tools/pip_package/MANIFEST.in
+tensorflow/tools/pip_package/simple_console.py
+tensorflow/tools/pip_package/build_pip_package.sh
+tensorflow/tools/pip_package/check_load_py_test.py
+tensorflow/tools/pip_package/pip_smoke_test.py
+tensorflow/tools/pip_package/simple_console_for_windows.py
+tensorflow/tools/pip_package/setup.py
+tensorflow/tools/pip_package/BUILD
+tensorflow/tools/lib_package/concat_licenses.sh
+tensorflow/tools/lib_package/libtensorflow_test.c
+tensorflow/tools/lib_package/LibTensorFlowTest.java
+tensorflow/tools/lib_package/BUILD
+tensorflow/tools/lib_package/libtensorflow_test.sh
+tensorflow/tools/lib_package/README.md
+tensorflow/tools/lib_package/libtensorflow_java_test.sh
+tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
+tensorflow/tools/def_file_filter/BUILD
+tensorflow/tools/def_file_filter/BUILD.tpl
+tensorflow/tools/def_file_filter/def_file_filter.py.tpl
+tensorflow/third_party/mkl/MKL_LICENSE
+tensorflow/third_party/mkl/LICENSE
+tensorflow/third_party/mkl/BUILD
+tensorflow/third_party/mkl/mkl.BUILD
+tensorflow/third_party/mkl/build_defs.bzl
+tensorflow/third_party/backports_weakref.BUILD
+tensorflow/third_party/toolchains/clang6/BUILD
+tensorflow/third_party/toolchains/clang6/README.md
+tensorflow/third_party/toolchains/clang6/repo.bzl
+tensorflow/third_party/toolchains/clang6/CROSSTOOL.tpl
+tensorflow/third_party/toolchains/clang6/clang.BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
+tensorflow/third_party/toolchains/preconfig/generate/workspace.bzl
+tensorflow/third_party/toolchains/preconfig/generate/containers.bzl
+tensorflow/third_party/toolchains/preconfig/generate/generate.bzl
+tensorflow/third_party/toolchains/preconfig/generate/BUILD
+tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
+tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
+tensorflow/third_party/toolchains/preconfig/win_1803/py36/BUILD
+tensorflow/third_party/toolchains/preconfig/win_1803/BUILD
+tensorflow/third_party/toolchains/gpus/cuda/build_defs.bzl
+tensorflow/third_party/toolchains/gpus/cuda/BUILD
+tensorflow/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
+tensorflow/third_party/toolchains/gpus/crosstool/BUILD
+tensorflow/third_party/toolchains/gpus/crosstool/CROSSTOOL
+tensorflow/third_party/toolchains/gpus/py/BUILD
+tensorflow/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
+tensorflow/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
+tensorflow/third_party/toolchains/cpus/arm/BUILD
+tensorflow/third_party/toolchains/cpus/py3/BUILD
+tensorflow/third_party/toolchains/cpus/py/BUILD
+tensorflow/third_party/toolchains/BUILD
+tensorflow/third_party/nccl/remote.BUILD.tpl
+tensorflow/third_party/nccl/archive.BUILD
+tensorflow/third_party/nccl/LICENSE
+tensorflow/third_party/nccl/system.BUILD.tpl
+tensorflow/third_party/nccl/nccl_configure.bzl
+tensorflow/third_party/nccl/build_defs.bzl.tpl
+tensorflow/third_party/nccl/BUILD
+tensorflow/third_party/gpus/BUILD
+tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
+tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
+tensorflow/third_party/gpus/crosstool/CROSSTOOL.tpl
+tensorflow/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
+tensorflow/third_party/gpus/crosstool/LICENSE
+tensorflow/third_party/gpus/crosstool/remote.BUILD.tpl
+tensorflow/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
+tensorflow/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.bat.tpl
+tensorflow/third_party/gpus/crosstool/BUILD.tpl
+tensorflow/third_party/gpus/crosstool/BUILD
+tensorflow/third_party/gpus/cuda/LICENSE
+tensorflow/third_party/gpus/cuda/BUILD.tpl
+tensorflow/third_party/gpus/cuda/BUILD.windows.tpl
+tensorflow/third_party/gpus/cuda/cuda_config.h.tpl
+tensorflow/third_party/gpus/cuda/remote.BUILD.tpl
+tensorflow/third_party/gpus/cuda/BUILD
+tensorflow/third_party/gpus/cuda/build_defs.bzl.tpl
+tensorflow/third_party/gpus/rocm/rocm_config.h.tpl
+tensorflow/third_party/gpus/rocm/BUILD
+tensorflow/third_party/gpus/rocm/BUILD.tpl
+tensorflow/third_party/gpus/rocm/build_defs.bzl.tpl
+tensorflow/third_party/gpus/cuda_configure.bzl
+tensorflow/third_party/gpus/rocm_configure.bzl
+tensorflow/third_party/snappy.BUILD
+tensorflow/third_party/cython.BUILD
+tensorflow/third_party/farmhash.BUILD
+tensorflow/third_party/eigen3/Eigen/Cholesky
+tensorflow/third_party/eigen3/Eigen/QR
+tensorflow/third_party/eigen3/Eigen/LU
+tensorflow/third_party/eigen3/Eigen/Core
+tensorflow/third_party/eigen3/Eigen/SVD
+tensorflow/third_party/eigen3/Eigen/Eigenvalues
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
+tensorflow/third_party/eigen3/unsupported/Eigen/SpecialFunctions
+tensorflow/third_party/eigen3/unsupported/Eigen/MatrixFunctions
+tensorflow/third_party/eigen3/LICENSE
+tensorflow/third_party/eigen3/BUILD
+tensorflow/third_party/systemlibs/build_defs.bzl.tpl
+tensorflow/third_party/systemlibs/absl_py.BUILD
+tensorflow/third_party/systemlibs/curl.BUILD
+tensorflow/third_party/systemlibs/termcolor.BUILD
+tensorflow/third_party/systemlibs/absl_py.absl.flags.BUILD
+tensorflow/third_party/systemlibs/grpc.BUILD
+tensorflow/third_party/systemlibs/swig.BUILD
+tensorflow/third_party/systemlibs/protobuf.bzl
+tensorflow/third_party/systemlibs/protobuf.BUILD
+tensorflow/third_party/systemlibs/BUILD
+tensorflow/third_party/systemlibs/google_cloud_cpp.BUILD
+tensorflow/third_party/systemlibs/astor.BUILD
+tensorflow/third_party/systemlibs/six.BUILD
+tensorflow/third_party/systemlibs/absl_py.absl.testing.BUILD
+tensorflow/third_party/systemlibs/boringssl.BUILD
+tensorflow/third_party/systemlibs/nsync.BUILD
+tensorflow/third_party/systemlibs/google_cloud_cpp.google.cloud.bigtable.BUILD
+tensorflow/third_party/systemlibs/gif.BUILD
+tensorflow/third_party/systemlibs/pcre.BUILD
+tensorflow/third_party/systemlibs/BUILD.tpl
+tensorflow/third_party/systemlibs/snappy.BUILD
+tensorflow/third_party/systemlibs/gast.BUILD
+tensorflow/third_party/systemlibs/cython.BUILD
+tensorflow/third_party/systemlibs/double_conversion.BUILD
+tensorflow/third_party/systemlibs/zlib.BUILD
+tensorflow/third_party/systemlibs/jsoncpp.BUILD
+tensorflow/third_party/systemlibs/re2.BUILD
+tensorflow/third_party/systemlibs/lmdb.BUILD
+tensorflow/third_party/systemlibs/googleapis.BUILD
+tensorflow/third_party/systemlibs/png.BUILD
+tensorflow/third_party/systemlibs/syslibs_configure.bzl
+tensorflow/third_party/systemlibs/sqlite.BUILD
+tensorflow/third_party/python_runtime/BUILD
+tensorflow/third_party/sycl/crosstool/BUILD
+tensorflow/third_party/ngraph/LICENSE
+tensorflow/third_party/ngraph/tbb.BUILD
+tensorflow/third_party/ngraph/BUILD
+tensorflow/third_party/ngraph/ngraph.BUILD
+tensorflow/third_party/ngraph/build_defs.bzl
+tensorflow/third_party/ngraph/NGRAPH_LICENSE
+tensorflow/third_party/ngraph/ngraph_tf.BUILD
+tensorflow/third_party/ngraph/nlohmann_json.BUILD
+tensorflow/third_party/clang_toolchain/download_clang.bzl
+tensorflow/third_party/clang_toolchain/BUILD
+tensorflow/third_party/clang_toolchain/cc_configure_clang.bzl
+tensorflow/third_party/gast.BUILD
+tensorflow/third_party/llvm/BUILD
+tensorflow/third_party/llvm/expand_cmake_vars.py
+tensorflow/third_party/llvm/llvm.autogenerated.BUILD
+tensorflow/third_party/llvm/llvm.bzl
+tensorflow/third_party/icu/udata.patch
+tensorflow/third_party/fft2d/BUILD
+tensorflow/third_party/fft2d/fft.h
+tensorflow/third_party/fft2d/LICENSE
+tensorflow/third_party/fft2d/fft2d.BUILD
+tensorflow/third_party/boringssl/BUILD
+tensorflow/third_party/mpi/.gitignore
+tensorflow/third_party/mpi/BUILD
+tensorflow/third_party/tensorrt/LICENSE
+tensorflow/third_party/tensorrt/BUILD
+tensorflow/third_party/tensorrt/build_defs.bzl.tpl
+tensorflow/third_party/tensorrt/BUILD.tpl
+tensorflow/third_party/tensorrt/tensorrt_configure.bzl
+tensorflow/third_party/kafka/config.patch
+tensorflow/third_party/kafka/BUILD
+tensorflow/third_party/android/BUILD
+tensorflow/third_party/android/android.bzl.tpl
+tensorflow/third_party/android/android_configure.bzl
+tensorflow/third_party/android/android_configure.BUILD.tpl
+tensorflow/third_party/tflite_smartreply.BUILD
+tensorflow/third_party/mkl_dnn/LICENSE
+tensorflow/third_party/mkl_dnn/mkldnn.BUILD
+tensorflow/third_party/pcre.BUILD
+tensorflow/third_party/linenoise.BUILD
+tensorflow/third_party/sqlite.BUILD
+tensorflow/third_party/common.bzl
+tensorflow/third_party/com_google_absl.BUILD
+tensorflow/third_party/pprof.BUILD
+tensorflow/third_party/BUILD
+tensorflow/third_party/tflite_mobilenet_quant.BUILD
+tensorflow/third_party/lmdb.BUILD
+tensorflow/third_party/git/BUILD.tpl
+tensorflow/third_party/git/BUILD
+tensorflow/third_party/git/git_configure.bzl
+tensorflow/third_party/protobuf/BUILD
+tensorflow/third_party/tflite_mobilenet.BUILD
+tensorflow/third_party/py/BUILD
+tensorflow/third_party/py/BUILD.tpl
+tensorflow/third_party/py/remote.BUILD.tpl
+tensorflow/third_party/py/numpy/BUILD
+tensorflow/third_party/py/python_configure.bzl
+tensorflow/third_party/termcolor.BUILD
+tensorflow/third_party/png_fix_rpi.patch
+tensorflow/third_party/swig.BUILD
+tensorflow/third_party/astor.BUILD
+tensorflow/third_party/grpc/BUILD
+tensorflow/third_party/curl.BUILD
+tensorflow/third_party/arm_neon_2_x86_sse.BUILD
+tensorflow/third_party/png.BUILD
+tensorflow/third_party/googleapis.BUILD
+tensorflow/third_party/mpi_collectives/BUILD
+tensorflow/third_party/nanopb.BUILD
+tensorflow/third_party/gif.BUILD
+tensorflow/third_party/double_conversion.BUILD
+tensorflow/third_party/six.BUILD
+tensorflow/third_party/tflite_mobilenet_float.BUILD
+tensorflow/third_party/repo.bzl
+tensorflow/third_party/codegen.BUILD
+tensorflow/third_party/cub.BUILD
+tensorflow/third_party/jsoncpp.BUILD
+tensorflow/third_party/tflite_ovic_testdata.BUILD
+tensorflow/third_party/libxsmm.BUILD
+tensorflow/third_party/zlib.BUILD
+tensorflow/third_party/eigen.BUILD
+tensorflow/stream_executor/BUILD
+tensorflow/api_template_v1.__init__.py
+tensorflow/compat_template_v1.__init__.py
+tensorflow/api_template.__init__.py
+tensorflow/__init__.py
\ No newline at end of file
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index da3c56db92311501371850ba768757bbdc0c49c0..0a3ee65bc48013971c857fc5fb04f397c3edd2aa 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -7,10 +7,11 @@
 visibility = [
     "//engedu/ml/tf_from_scratch:__pkg__",
     "//tensorflow:internal",
-    "//tensorflow/contrib/lite/toco/python:__pkg__",
+    "//tensorflow/lite/toco/python:__pkg__",
     "//tensorflow_models:__subpackages__",
+    "//tensorflow_model_optimization:__subpackages__",
     # TODO(aselle): to pass open source test.
-    "//bazel_pip/tensorflow/contrib/lite/toco/python:__pkg__",
+    "//bazel_pip/tensorflow/lite/toco/python:__pkg__",
 ]
 
 package(default_visibility = visibility)
@@ -19,6 +20,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+exports_files(["platform/base.i"])
+
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
@@ -58,7 +61,7 @@ py_library(
         "//tensorflow/compiler/aot/tests:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/contrib/learn:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/contrib/learn/python/learn/datasets:__pkg__",  # TODO(b/34059704): remove when fixed
-        "//tensorflow/contrib/lite/toco/python:__pkg__",  # TODO(b/34059704): remove when fixed
+        "//tensorflow/lite/toco/python:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python/debug:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python/tools:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/tools/quantization:__pkg__",  # TODO(b/34059704): remove when fixed
@@ -76,9 +79,11 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = [
         "//tensorflow:__pkg__",
+        "//tensorflow/python/estimator:__subpackages__",
         "//tensorflow/python/tools:__pkg__",
         "//tensorflow/python/tools/api/generator:__pkg__",
         "//tensorflow/tools/api/tests:__pkg__",
+        "//tensorflow/tools/compatibility/update:__pkg__",
     ],
     deps = [
         ":array_ops",
@@ -88,6 +93,7 @@ py_library(
         ":client",
         ":client_testlib",
         ":collective_ops",
+        ":cond_v2",
         ":confusion_matrix",
         ":control_flow_ops",
         ":cudnn_rnn_ops_gen",
@@ -96,6 +102,7 @@ py_library(
         ":framework_for_generated_wrappers",
         ":functional_ops",
         ":gradient_checker",
+        ":gradient_checker_v2",
         ":graph_util",
         ":histogram_ops",
         ":image_ops",
@@ -108,6 +115,7 @@ py_library(
         ":manip_ops",
         ":math_ops",
         ":metrics",
+        ":nccl_ops",
         ":nn",
         ":ops",
         ":platform",
@@ -117,7 +125,6 @@ py_library(
         ":session_ops",
         ":sets",
         ":sparse_ops",
-        ":spectral_ops",
         ":spectral_ops_test_util",
         ":standard_ops",
         ":state_ops",
@@ -125,6 +132,7 @@ py_library(
         ":subscribe",
         ":summary",
         ":tensor_array_ops",
+        ":tensor_forest_ops",
         ":test_ops",  # TODO: Break testing code out into separate rule.
         ":tf_cluster",
         ":tf_item",
@@ -132,16 +140,22 @@ py_library(
         ":training",
         ":util",
         ":weights_broadcast_ops",
+        ":while_v2",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/lite/python:lite",
         "//tensorflow/python/compat",
         "//tensorflow/python/data",
+        "//tensorflow/python/distribute",
         "//tensorflow/python/distribute:estimator_training",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/keras",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
         "//tensorflow/python/ops/losses",
         "//tensorflow/python/ops/parallel_for",
+        "//tensorflow/python/ops/ragged",
+        "//tensorflow/python/ops/signal",
         "//tensorflow/python/profiler",
         "//tensorflow/python/saved_model",
         "//tensorflow/python/tools:component_api_helper",
@@ -466,6 +480,7 @@ py_test(
     tags = [
         "no_pip",  # Path issues due to test environment
         "no_windows",
+        "notap",
     ],
     deps = [
         ":client_testlib",
@@ -511,6 +526,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "dispatch_test",
+    srcs = ["util/dispatch_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":platform",
+        ":util",
+    ],
+)
+
 py_test(
     name = "keyword_args_test",
     srcs = ["util/keyword_args_test.py"],
@@ -742,7 +768,6 @@ py_library(
     srcs = ["framework/graph_to_function_def.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":cond_v2_impl",
         ":op_def_registry",
         "//tensorflow/core:protos_all_py",
     ],
@@ -842,7 +867,6 @@ py_library(
     deps = [
         ":c_api_util",
         ":control_flow_util",
-        ":cpp_shape_inference_proto_py",
         ":device",
         ":dtypes",
         ":error_interpolation",
@@ -850,6 +874,7 @@ py_library(
         ":platform",
         ":registry",
         ":tensor_shape",
+        ":tf2",
         ":traceable_stack",
         ":util",
         ":versions",
@@ -861,6 +886,46 @@ py_library(
     ],
 )
 
+py_library(
+    name = "func_graph",
+    srcs = ["framework/func_graph.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":auto_control_deps",
+        ":framework_ops",
+        ":sparse_tensor",
+        ":tensor_array_ops",
+        "//tensorflow/python/autograph",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:graph_only_ops",
+        "//tensorflow/python/eager:tape",
+    ],
+)
+
+py_library(
+    name = "auto_control_deps",
+    srcs = ["framework/auto_control_deps.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":control_flow_ops",
+        ":framework_ops",
+        ":sparse_tensor",
+        ":tensor_array_ops",
+        ":util",
+    ],
+)
+
+py_test(
+    name = "auto_control_deps_test",
+    size = "small",
+    srcs = ["framework/auto_control_deps_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":auto_control_deps",
+        ":client_testlib",
+    ],
+)
+
 py_library(
     name = "random_seed",
     srcs = ["framework/random_seed.py"],
@@ -933,6 +998,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dtypes",
+        ":tf2",
         ":util",
         "//tensorflow/core:protos_all_py",
     ],
@@ -946,6 +1012,7 @@ py_library(
         ":common_shapes",
         ":dtypes",
         ":tensor_shape",
+        ":util",
         "//third_party/py/numpy",
     ],
 )
@@ -1004,6 +1071,7 @@ py_library(
         ":random_seed",
         ":resource_variable_ops",
         ":session",
+        ":tensor_array_ops",
         ":training",
         ":util",
         ":variables",
@@ -1028,10 +1096,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":client",
+        ":cond_v2",
         ":framework_test_lib",
         ":gradient_checker",
+        ":gradient_checker_v2",
         ":platform_test",
         ":util",
+        ":while_v2",
     ],
 )
 
@@ -1112,7 +1183,10 @@ py_test(
     ],
     main = "platform/build_info_test.py",
     srcs_version = "PY2AND3",
-    tags = ["notap"],
+    tags = [
+        "no_pip",
+        "notap",
+    ],
     deps = [
         ":client_testlib",
         ":platform",
@@ -1154,6 +1228,7 @@ py_library(
         ":tensor_shape",
         ":util",
         ":variable_scope",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -1241,7 +1316,10 @@ py_test(
     data = ["//tensorflow/python:meta_graph_testdata"],
     main = "framework/meta_graph_test.py",
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":array_ops",
         ":client_testlib",
@@ -1329,6 +1407,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_pip"],  # test_ops_2 is not available in pip.
     deps = [
+        ":cond_v2",
         ":control_flow_ops",
         ":errors",
         ":framework",
@@ -1343,6 +1422,7 @@ py_test(
         ":util",
         ":variable_scope",
         ":variables",
+        ":while_v2",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
@@ -1487,6 +1567,7 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -1562,6 +1643,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "tensor_forest_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:tensor_forest_ops_op_lib",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "summary_ops_gen",
     visibility = ["//tensorflow:__subpackages__"],
@@ -1742,7 +1831,6 @@ tf_gen_op_wrapper_private_py(
 tf_gen_op_wrapper_private_py(
     name = "stateless_random_ops_gen",
     visibility = [
-        "//tensorflow/contrib/stateless:__pkg__",
         "//tensorflow/python/data/experimental/ops:__pkg__",
     ],
 )
@@ -1757,7 +1845,10 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "sdca_ops_gen",
-    visibility = ["//tensorflow/contrib/linear_optimizer:__pkg__"],
+    visibility = [
+        "//tensorflow/contrib/linear_optimizer:__pkg__",
+        "//tensorflow_estimator/python/estimator/canned/linear_optimizer:__pkg__",
+    ],
 )
 
 tf_gen_op_wrapper_private_py(
@@ -1779,6 +1870,7 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "spectral_ops_gen",
+    visibility = ["//tensorflow/python/ops/signal:__pkg__"],
 )
 
 tf_gen_op_wrapper_private_py(
@@ -1795,6 +1887,33 @@ tf_gen_op_wrapper_private_py(
     out = "training/gen_training_ops.py",
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "ragged_array_ops_gen",
+    visibility = [
+        "//learning/brain/contrib/text:__pkg__",
+        "//learning/brain/contrib/text/python/ragged:__pkg__",
+        "//tensorflow/python/ops/ragged:__pkg__",
+    ],
+)
+
+tf_gen_op_wrapper_private_py(
+    name = "ragged_math_ops_gen",
+    visibility = [
+        "//learning/brain/contrib/text:__pkg__",
+        "//learning/brain/contrib/text/python/ragged:__pkg__",
+        "//tensorflow/python/ops/ragged:__pkg__",
+    ],
+)
+
+tf_gen_op_wrapper_private_py(
+    name = "ragged_conversion_ops_gen",
+    visibility = [
+        "//learning/brain/contrib/text:__pkg__",
+        "//learning/brain/contrib/text/python/ragged:__pkg__",
+        "//tensorflow/python/ops/ragged:__pkg__",
+    ],
+)
+
 py_library(
     name = "array_grad",
     srcs = ["ops/array_grad.py"],
@@ -1856,6 +1975,28 @@ py_library(
     ],
 )
 
+py_library(
+    name = "tensor_forest_ops",
+    srcs = ["ops/tensor_forest_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework",
+        ":ops",
+        ":tensor_forest_ops_gen",
+        ":training",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+    ],
+)
+
+py_library(
+    name = "optional_grad",
+    srcs = ["ops/optional_grad.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_ops",
+    ],
+)
+
 py_library(
     name = "sets",
     srcs = [
@@ -1964,15 +2105,15 @@ py_library(
     ],
 )
 
+# Note: targets depending on this should also depend on ":cond_v2" and ":while_v2".
+# See b/118513001.
 py_library(
     name = "control_flow_ops",
     srcs = ["ops/control_flow_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "tensor_shape",
         ":array_ops",
         ":array_ops_gen",
-        ":cond_v2_impl",
         ":constant_op",
         ":control_flow_ops_gen",
         ":control_flow_util",
@@ -1985,6 +2126,8 @@ py_library(
         ":resource_variable_ops_gen",
         ":sparse_tensor",
         ":tensor_array_ops",
+        ":tensor_shape",
+        ":tf2",
         ":tf_should_use",
         ":util",
         "//tensorflow/core:protos_all_py",
@@ -2001,6 +2144,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "control_flow_util_v2",
+    srcs = ["ops/control_flow_util_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":control_flow_util",
+        ":framework_ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
+    ],
+)
+
 py_library(
     name = "cond_v2",
     srcs = [
@@ -2008,10 +2164,20 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":cond_v2_impl",
+        ":array_ops",
+        ":c_api_util",
+        ":control_flow_util_v2",
+        ":framework_ops",
         ":function",
         ":function_def_to_graph",
+        ":functional_ops_gen",
         ":gradients",
+        ":gradients_impl",
+        ":graph_to_function_def",
+        ":pywrap_tensorflow",
+        ":util",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:function",
     ],
 )
 
@@ -2023,39 +2189,24 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
-        ":cond_v2_impl",
         ":constant_op",
         ":control_flow_ops",
         ":control_flow_util",
+        ":control_flow_util_v2",
+        ":dtypes",
         ":framework_ops",
         ":function_def_to_graph",
         ":functional_ops_gen",
         ":gradients_impl",
         ":list_ops",
+        ":tensor_array_ops",
         ":tensor_shape",
+        ":tensor_util",
         ":util",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:function",
     ],
 )
 
-py_library(
-    name = "cond_v2_impl",
-    srcs = [
-        "ops/cond_v2_impl.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":array_ops",
-        ":c_api_util",
-        ":framework_ops",
-        ":functional_ops_gen",
-        ":pywrap_tensorflow",
-        ":util",
-        "//tensorflow/core:protos_all_py",
-    ],
-)
-
 py_library(
     name = "ctc_ops",
     srcs = ["ops/ctc_ops.py"],
@@ -2137,6 +2288,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":gradients_impl",
+        ":unconnected_gradients",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:tape",
     ],
@@ -2153,7 +2305,6 @@ py_library(
         ":array_ops",
         ":bitwise_ops",
         ":check_ops",
-        ":cond_v2_impl",
         ":control_flow_grad",
         ":control_flow_ops",
         ":control_flow_util",
@@ -2169,16 +2320,16 @@ py_library(
         ":manip_ops",
         ":math_grad",
         ":math_ops",
+        ":optional_grad",
         ":platform",
         ":random_grad",
         ":resource_variable_ops",
-        ":spectral_grad",
         ":tensor_array_ops",
         ":tensor_util",
+        ":unconnected_gradients",
         ":util",
         ":variable_scope",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:tape",
         "//third_party/py/numpy",
@@ -2186,6 +2337,15 @@ py_library(
     ],
 )
 
+py_library(
+    name = "unconnected_gradients",
+    srcs = ["ops/unconnected_gradients.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":util",
+    ],
+)
+
 py_library(
     name = "histogram_ops",
     srcs = ["ops/histogram_ops.py"],
@@ -2405,7 +2565,6 @@ py_library(
         ":nn_ops_gen",
         ":sparse_ops_gen",
         ":sparse_tensor",
-        ":spectral_ops_gen",
         ":state_ops",
         ":state_ops_gen",
         ":tensor_shape",
@@ -2582,6 +2741,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "stateless_random_ops",
+    srcs = ["ops/stateless_random_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":math_ops",
+        ":random_ops",
+        ":stateless_random_ops_gen",
+    ],
+)
+
 py_library(
     name = "rnn",
     srcs = ["ops/rnn.py"],
@@ -2698,33 +2870,34 @@ py_test(
         ":framework_test_lib",
         ":sparse_ops",
         ":sparse_tensor",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_library(
-    name = "spectral_grad",
-    srcs = ["ops/spectral_grad.py"],
+    name = "sort_ops",
+    srcs = ["ops/sort_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
         ":framework",
-        ":framework_for_generated_wrappers",
         ":math_ops",
-        ":spectral_ops",
+        ":nn_ops",
         "//third_party/py/numpy",
     ],
 )
 
-py_library(
-    name = "spectral_ops",
-    srcs = ["ops/spectral_ops.py"],
+py_test(
+    name = "sort_ops_test",
+    srcs = ["ops/sort_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
-        ":dtypes",
-        ":framework_ops",
-        ":math_ops",
-        ":spectral_ops_gen",
+        ":client_testlib",
+        ":framework",
+        ":random_ops",
+        ":sort_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -2842,12 +3015,13 @@ py_library(
         ":random_ops",
         ":script_ops",
         ":session_ops",
+        ":sort_ops",
         ":sparse_grad",
         ":sparse_ops",
         ":special_math_ops",
-        ":spectral_grad",
         ":state_grad",
         ":state_ops",
+        ":stateless_random_ops",
         ":string_ops",
         ":template",
         ":tensor_array_grad",
@@ -2855,6 +3029,7 @@ py_library(
         ":util",
         ":variable_scope",
         ":variables",
+        "//tensorflow/python/eager:wrap_function",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
     ],
@@ -2894,18 +3069,6 @@ py_library(
     ],
 )
 
-py_library(
-    name = "summary_ops",
-    srcs = ["ops/summary_ops.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":framework",
-        ":framework_for_generated_wrappers",
-        ":logging_ops_gen",
-        ":summary_op_util",
-    ],
-)
-
 py_library(
     name = "summary_ops_v2",
     srcs = ["ops/summary_ops_v2.py"],
@@ -2961,13 +3124,16 @@ py_library(
     deps = [
         ":array_ops",
         ":constant_op",
+        ":control_flow_ops_gen",
         ":data_flow_ops_gen",
         ":dtypes",
         ":errors",
         ":framework_ops",
+        ":list_ops",
         ":math_ops",
         ":tensor_shape",
         ":tensor_util",
+        ":tf2",
         ":tf_should_use",
         "//tensorflow/python/eager:context",
     ],
@@ -2985,6 +3151,7 @@ py_library(
         ":platform",
         ":resource_variable_ops",
         ":tensor_shape",
+        ":tf2",
         ":util",
         ":variables",
         "//tensorflow/python/eager:context",
@@ -3025,6 +3192,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "gradient_checker_v2",
+    srcs = ["ops/gradient_checker_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":framework_for_generated_wrappers",
+        ":gradients",
+        ":platform",
+        "//third_party/py/numpy",
+    ],
+)
+
 # This target is deprecated.
 py_library(
     name = "ops",
@@ -3056,6 +3236,7 @@ cuda_py_test(
     srcs = ["ops/control_flow_ops_test.py"],
     additional_deps = [
         ":array_ops",
+        ":cond_v2",
         ":control_flow_ops",
         ":embedding_ops",
         ":framework_for_generated_wrappers",
@@ -3071,6 +3252,8 @@ cuda_py_test(
         ":util",
         ":variable_scope",
         ":variables",
+        ":while_v2",
+        "//tensorflow/python/eager:def_function",
     ],
 )
 
@@ -3090,6 +3273,22 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "gradient_checker_v2_test",
+    size = "medium",
+    srcs = ["ops/gradient_checker_v2_test.py"],
+    additional_deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":math_ops",
+        ":nn_grad",
+        ":nn_ops",
+        ":platform",
+        "//third_party/py/numpy",
+    ],
+)
+
 cuda_py_test(
     name = "gradients_test",
     size = "medium",
@@ -3119,6 +3318,7 @@ cuda_py_test(
         ":variable_scope",
         "//third_party/py/numpy",
     ],
+    tags = ["no_oss"],  # b/118709825
 )
 
 cuda_py_test(
@@ -3197,6 +3397,9 @@ cuda_py_test(
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:execution_callbacks",
         "//third_party/py/numpy",
     ],
     tags = ["no_windows_gpu"],
@@ -3312,11 +3515,13 @@ py_library(
         exclude = [
             "**/*test*",
             "training/checkpointable/**/*.py",
+            "training/saving/**/*.py",
             # The following targets have their own build rules (same name as the
             # file):
             "training/basic_session_run_hooks.py",
             "training/checkpoint_management.py",
-            "training/saveable_object.py",
+            "training/distribute.py",
+            "training/distribution_strategy_context.py",
             "training/saver.py",
             "training/session_run_hook.py",
             "training/training_util.py",
@@ -3333,6 +3538,7 @@ py_library(
         ":control_flow_ops",
         ":data_flow_ops",
         ":device",
+        ":distribute",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -3365,6 +3571,7 @@ py_library(
         "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:distribute_coordinator_context",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
@@ -3376,10 +3583,17 @@ py_library(
     ],
 )
 
+# Dependency added and used by ClusterResolvers to avoid circular dependency between keras, distribute, and training.
 py_library(
-    name = "saveable_object",
-    srcs = ["training/saveable_object.py"],
+    name = "training_server_lib",
+    srcs = ["training/server_lib.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        ":framework",
+        ":pywrap_tensorflow",
+        ":util",
+        "//tensorflow/core:protos_all_py",
+    ],
 )
 
 py_library(
@@ -3435,7 +3649,6 @@ py_library(
         ":platform",
         ":pywrap_tensorflow",
         ":resource_variable_ops",
-        ":saveable_object",
         ":session",
         ":state_ops",
         ":string_ops",
@@ -3445,22 +3658,13 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/saving:saveable_object",
+        "//tensorflow/python/training/saving:saveable_object_util",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
 
-py_library(
-    name = "device_util",
-    srcs = ["training/device_util.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":device",
-        ":framework_ops",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
 py_library(
     name = "distribute",
     srcs = [
@@ -3469,29 +3673,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":array_ops",
-        ":control_flow_ops",
-        ":device_util",
-        ":framework_ops",
-        ":platform",
-        ":resource_variable_ops",
-        ":state_ops",
-        ":util",
-        ":variable_scope",
-        "//tensorflow/python/data",
-        "//tensorflow/python/ops/losses",
-    ],
-)
-
-py_test(
-    name = "distribute_test",
-    size = "small",
-    srcs = ["training/distribute_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":client_testlib",
-        ":distribute",
-        ":variable_scope",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
@@ -3569,12 +3751,15 @@ py_library(
     ],
 )
 
+# Placeholder for intenal nest_test comments.
 py_test(
     name = "util_nest_test",
     size = "small",
     srcs = ["util/nest_test.py"],
     main = "util/nest_test.py",
     srcs_version = "PY2AND3",
+    visibility = visibility + [
+    ],
     deps = [
         ":array_ops",
         ":client_testlib",
@@ -3648,6 +3833,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":util",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
@@ -3893,7 +4079,6 @@ tf_py_wrap_cc(
         "platform/stacktrace_handler.i",
         "pywrap_tfe.i",
         "training/quantize_training.i",
-        "training/server_lib.i",
         "util/kernel_registry.i",
         "util/port.i",
         "util/py_checkpoint_reader.i",
@@ -3996,11 +4181,24 @@ genrule(
 
 # Get the import library of  _pywrap_tensorflow_internal.dll
 filegroup(
-    name = "pywrap_tensorflow_import_lib_file",
+    name = "get_pywrap_tensorflow_import_lib_file",
     srcs = [":_pywrap_tensorflow_internal.so"],
     output_group = "interface_library",
 )
 
+# Rename the import library for _pywrap_tensorflow_internal.pyd to _pywrap_tensorflow_internal.lib
+# (It was _pywrap_tensorflow_internal.so.if.lib).
+genrule(
+    name = "pywrap_tensorflow_import_lib_file",
+    srcs = [":get_pywrap_tensorflow_import_lib_file"],
+    outs = ["_pywrap_tensorflow_internal.lib"],
+    cmd = select({
+        "//tensorflow:windows": "cp -f $< $@",
+        "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
+    }),
+    visibility = ["//visibility:public"],
+)
+
 # Create a cc_import rule for the import library of _pywrap_tensorflow_internal.dll
 # so that custom ops' dynamic libraries can link against it.
 cc_import(
@@ -4478,7 +4676,6 @@ cuda_py_tests(
         "training/basic_loops_test.py",
         "training/coordinator_test.py",
         "training/device_setter_test.py",
-        "training/device_util_test.py",
         "training/ftrl_test.py",
         "training/gradient_descent_test.py",
         "training/learning_rate_decay_test.py",
@@ -4789,7 +4986,7 @@ py_test(
         ":training",
         ":variable_scope",
         ":variables",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
@@ -4901,7 +5098,6 @@ py_library(
     deps = [
         ":client",
         ":constant_op",
-        ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
         ":lib",
@@ -4910,12 +5106,10 @@ py_library(
         ":protos_all_py",
         ":pywrap_tensorflow",
         ":summary_op_util",
-        ":summary_ops",
         ":summary_ops_gen",
         ":summary_ops_v2",
         ":util",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
@@ -4926,7 +5120,6 @@ py_tests(
     srcs = [
         "summary/plugin_asset_test.py",
         "summary/summary_test.py",
-        "summary/text_summary_test.py",
         "summary/writer/writer_test.py",
     ],
     additional_deps = [
@@ -5196,6 +5389,19 @@ cuda_py_test(
     main = "ops/concat_benchmark.py",
 )
 
+cuda_py_test(
+    name = "control_flow_ops_benchmark",
+    srcs = ["ops/control_flow_ops_benchmark.py"],
+    additional_deps = [
+        ":client_testlib",
+        ":constant_op",
+        ":control_flow_ops",
+        ":framework_ops",
+        "//tensorflow/python/eager:function",
+    ],
+    main = "ops/control_flow_ops_benchmark.py",
+)
+
 cuda_py_test(
     name = "conv2d_benchmark",
     size = "large",
@@ -5640,6 +5846,48 @@ py_test(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "nccl_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:nccl_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "nccl_ops",
+    srcs = ["ops/nccl_ops.py"],
+    srcs_version = "PY2AND3",
+    visibility = visibility + [
+        "//learning/deepmind/tensorflow:__subpackages__",
+    ],
+    deps = [
+        ":framework_for_generated_wrappers",
+        ":nccl_ops_gen",
+    ],
+)
+
+cuda_py_test(
+    name = "nccl_ops_test",
+    size = "small",
+    srcs = ["ops/nccl_ops_test.py"],
+    additional_deps = [
+        ":nccl_ops",
+        ":array_ops",
+        ":client_testlib",
+        ":framework_test_lib",
+        ":platform_test",
+    ],
+    # Disabled on jenkins until errors finding nvmlShutdown are found.
+    tags = [
+        "manual",
+        "multi_gpu",
+        "no_oss",
+        "noguitar",
+        "notap",
+    ],
+)
+
 py_binary(
     name = "graph_analyzer",
     srcs = [
@@ -5658,3 +5906,9 @@ pyx_library(
     py_deps = ["//tensorflow/python:util"],
     deps = ["//third_party/py/numpy:headers"],
 )
+
+py_library(
+    name = "tf2",
+    srcs = ["tf2.py"],
+    srcs_version = "PY2AND3",
+)
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 4921ecc43c04dd5b0a53d8fc7ae1c85a0e75eff0..b2cc63bd1320700801d4aaf0a9b33c8da7821412 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -48,13 +48,6 @@ import numpy as np
 
 from tensorflow.python import pywrap_tensorflow
 
-from tensorflow.python.tools import component_api_helper
-component_api_helper.package_hook(
-    parent_package_str='tensorflow.python',
-    child_package_str=(
-        'tensorflow_estimator.python.estimator'))
-del component_api_helper
-
 # Protocol buffers
 from tensorflow.core.framework.graph_pb2 import *
 from tensorflow.core.framework.node_def_pb2 import *
@@ -85,6 +78,7 @@ from tensorflow.python.ops import initializers_ns as initializers
 
 # Bring in subpackages.
 from tensorflow.python import data
+from tensorflow.python import distribute
 from tensorflow.python import keras
 from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.layers import layers
@@ -93,11 +87,12 @@ from tensorflow.python.ops import image_ops as image
 from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
+from tensorflow.python.ops import ragged
 from tensorflow.python.ops import sets
-from tensorflow.python.ops import spectral_ops as spectral
 from tensorflow.python.ops.distributions import distributions
 from tensorflow.python.ops.linalg import linalg
 from tensorflow.python.ops.losses import losses
+from tensorflow.python.ops.signal import signal
 from tensorflow.python.profiler import profiler
 from tensorflow.python.saved_model import saved_model
 from tensorflow.python.summary import summary
@@ -131,6 +126,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 # Eager execution
 from tensorflow.python.eager.context import executing_eagerly
+from tensorflow.python.eager.def_function import function
 from tensorflow.python.framework.ops import enable_eager_execution
 
 # Necessary for the symbols in this module to be taken into account by
@@ -149,26 +145,26 @@ nn.rnn_cell = rnn_cell
 
 # Export protos
 # pylint: disable=undefined-variable
-tf_export('AttrValue')(AttrValue)
-tf_export('ConfigProto')(ConfigProto)
+tf_export(v1=['AttrValue'])(AttrValue)
+tf_export(v1=['ConfigProto'])(ConfigProto)
 tf_export('Event', 'summary.Event')(Event)
-tf_export('GPUOptions')(GPUOptions)
-tf_export('GraphDef')(GraphDef)
-tf_export('GraphOptions')(GraphOptions)
-tf_export('HistogramProto')(HistogramProto)
-tf_export('LogMessage')(LogMessage)
-tf_export('MetaGraphDef')(MetaGraphDef)
-tf_export('NameAttrList')(NameAttrList)
-tf_export('NodeDef')(NodeDef)
-tf_export('OptimizerOptions')(OptimizerOptions)
-tf_export('RunMetadata')(RunMetadata)
-tf_export('RunOptions')(RunOptions)
-tf_export('SessionLog', 'summary.SessionLog')(SessionLog)
+tf_export(v1=['GPUOptions'])(GPUOptions)
+tf_export(v1=['GraphDef'])(GraphDef)
+tf_export(v1=['GraphOptions'])(GraphOptions)
+tf_export(v1=['HistogramProto'])(HistogramProto)
+tf_export(v1=['LogMessage'])(LogMessage)
+tf_export(v1=['MetaGraphDef'])(MetaGraphDef)
+tf_export(v1=['NameAttrList'])(NameAttrList)
+tf_export(v1=['NodeDef'])(NodeDef)
+tf_export(v1=['OptimizerOptions'])(OptimizerOptions)
+tf_export(v1=['RunMetadata'])(RunMetadata)
+tf_export(v1=['RunOptions'])(RunOptions)
+tf_export(v1=['SessionLog', 'summary.SessionLog'])(SessionLog)
 tf_export('Summary', 'summary.Summary')(Summary)
 tf_export('summary.SummaryDescription')(SummaryDescription)
 tf_export('SummaryMetadata')(SummaryMetadata)
 tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
-tf_export('TensorInfo')(TensorInfo)
+tf_export(v1=['TensorInfo'])(TensorInfo)
 # pylint: enable=undefined-variable
 
 # Special dunders that we choose to export:
diff --git a/tensorflow/python/autograph/__init__.py b/tensorflow/python/autograph/__init__.py
index 5ed5e85158352ec7af0493d739c77c575c715cc2..7252e0d9bf92e430e224fe00d9a9a5ff4254b46f 100644
--- a/tensorflow/python/autograph/__init__.py
+++ b/tensorflow/python/autograph/__init__.py
@@ -24,14 +24,16 @@ from __future__ import print_function
 # TODO(mdan): Bring only the relevant symbols to the top level.
 from tensorflow.python.autograph import operators
 from tensorflow.python.autograph import utils
+from tensorflow.python.autograph.core.converter import ConversionOptions
+from tensorflow.python.autograph.core.converter import Feature
+from tensorflow.python.autograph.core.converter import Verbosity
 from tensorflow.python.autograph.core.errors import GraphConstructionError
-from tensorflow.python.autograph.core.errors import TfRuntimeError
 from tensorflow.python.autograph.core.errors import improved_errors
-from tensorflow.python.autograph.impl.api import ConversionOptions
-from tensorflow.python.autograph.impl.api import RunMode
+from tensorflow.python.autograph.core.errors import TfRuntimeError
 from tensorflow.python.autograph.impl.api import convert
 from tensorflow.python.autograph.impl.api import converted_call
 from tensorflow.python.autograph.impl.api import do_not_convert
+from tensorflow.python.autograph.impl.api import RunMode
 from tensorflow.python.autograph.impl.api import to_code
 from tensorflow.python.autograph.impl.api import to_graph
 from tensorflow.python.autograph.lang.directives import set_element_type
@@ -44,6 +46,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = [
     # Main API
     'ConversionOptions',
+    'Feature',
     'RunMode',
     'convert',
     'converted_call',
@@ -56,6 +59,7 @@ _allowed_symbols = [
     'improved_errors',
     'GraphConstructionError',
     'TfRuntimeError',
+    'Verbosity',
     # Python language "extensions"
     'set_element_type',
     'set_loop_options',
diff --git a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD
index f06dc78f0e0507792453e77333059a9e9b4ab649..3ac446db02c6ef1946e76a8b549a85c67fed2872 100644
--- a/tensorflow/python/autograph/converters/BUILD
+++ b/tensorflow/python/autograph/converters/BUILD
@@ -17,6 +17,7 @@ filegroup(
 py_library(
     name = "converters",
     srcs = [
+        "arg_defaults.py",
         "asserts.py",
         "break_statements.py",
         "builtin_functions.py",
@@ -47,11 +48,21 @@ py_library(
     ],
 )
 
+py_test(
+    name = "arg_defaults_test",
+    srcs = ["arg_defaults_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":converters",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/autograph/core:test_lib",
+    ],
+)
+
 py_test(
     name = "asserts_test",
     srcs = ["asserts_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":converters",
         "//tensorflow/python:client_testlib",
@@ -84,7 +95,6 @@ py_test(
 
 py_test(
     name = "call_trees_test",
-    size = "large",
     srcs = ["call_trees_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_windows"],
diff --git a/tensorflow/python/autograph/converters/arg_defaults.py b/tensorflow/python/autograph/converters/arg_defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1a5efd71c083300e31a3a10b4af2749d9e6b75a
--- /dev/null
+++ b/tensorflow/python/autograph/converters/arg_defaults.py
@@ -0,0 +1,87 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Modifies the signature to allow resolving the value of default arguments.
+
+Normally, function symbols are captured either in a function's globals or
+closure. This is not true for default arguments, which are evaluated when the
+function is defined:
+
+    b = 1
+    c = 2
+    def f(a=b + 1):
+      return a + c
+
+In the above example, the namespace of the function would include `c = 2` but
+not `b`.
+
+If we were to naively generate a new function:
+
+    def new_f(a=b + 1):
+      return a + c
+
+The generated code would fail to load unless we exposed a symbol `b`. Capturing
+the closure of such an expression is difficult. However, we can capture the
+default value of argument `a` with relative ease.
+
+This converter replaces all default argument expressions with a constant so
+that they don't cause loading to fail. This requires that the default values
+are reset after loading the transformed function:
+
+    def new_f(a=None):
+      return a + c
+
+    # ... later, after new_f was loaded ...
+    new_f.__defaults__ = f.__defaults__
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.core import converter
+from tensorflow.python.autograph.pyct import parser
+
+
+class ArgDefaultsTransformer(converter.Base):
+  """Transforms top level argument defaults.
+
+  This transformer modifies self.ctx.arg_defaults directly.
+  """
+
+  def visit_arguments(self, node):
+    for i in range(len(node.defaults)):
+      node.defaults[i] = parser.parse_expression('None')
+
+    for i, d in enumerate(node.kw_defaults):
+      if d is not None:
+        node.kw_defaults[i] = parser.parse_expression('None')
+
+    # Only the top level function is modified - no need to visit the children.
+    return node
+
+
+def transform(node, ctx):
+  """Transform function call to the compiled counterparts.
+
+  Args:
+    node: AST
+    ctx: EntityContext
+  Returns:
+    A tuple (node, new_names):
+        node: The transformed AST
+        new_names: set(string), containing any newly-generated names
+  """
+  return ArgDefaultsTransformer(ctx).visit(node)
diff --git a/tensorflow/python/autograph/converters/arg_defaults_test.py b/tensorflow/python/autograph/converters/arg_defaults_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5acc309e35285845061291e25df6002f3d22d399
--- /dev/null
+++ b/tensorflow/python/autograph/converters/arg_defaults_test.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for arg_defaults module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.converters import arg_defaults
+from tensorflow.python.autograph.core import converter_testing
+from tensorflow.python.autograph.pyct import compiler
+from tensorflow.python.platform import test
+
+
+class ArgDefaultsTransformerTest(converter_testing.TestCase):
+
+  def assertTransformedFirstLineIs(self, node, expected):
+    self.assertEqual(compiler.ast_to_source(node).split('\n')[0], expected)
+
+  def test_no_args(self):
+
+    def test_fn():
+      pass
+
+    node, ctx = self.prepare(test_fn, {})
+    node = arg_defaults.transform(node, ctx)
+    self.assertTransformedFirstLineIs(node, 'def test_fn():')
+
+  def test_no_defaults(self):
+
+    def test_fn(a, b, *c, **e):
+      return a, b, c, e
+
+    node, ctx = self.prepare(test_fn, {})
+    node = arg_defaults.transform(node, ctx)
+    self.assertTransformedFirstLineIs(node, 'def test_fn(a, b, *c, **e):')
+
+  # TODO(mdan): Add kwonly-arg tests when PY2 is no longer supported.
+
+  def test_arg_defaults(self):
+
+    def test_fn(a, b=1, c=2):
+      return a, b, c
+
+    node, ctx = self.prepare(test_fn, {})
+    node = arg_defaults.transform(node, ctx)
+    self.assertTransformedFirstLineIs(node, 'def test_fn(a, b=None, c=None):')
+
+  def test_arg_defaults_with_vararg(self):
+
+    def test_fn(a, b=1, *c):  # pylint: disable=keyword-arg-before-vararg
+      return a, b, c
+
+    node, ctx = self.prepare(test_fn, {})
+    node = arg_defaults.transform(node, ctx)
+    self.assertTransformedFirstLineIs(node, 'def test_fn(a, b=None, *c):')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/autograph/converters/asserts.py b/tensorflow/python/autograph/converters/asserts.py
index 56a97534c429028d592a2c78c50caeaabd8aba68..4ba827c35f71abb038373e6855619b9c14929e88 100644
--- a/tensorflow/python/autograph/converters/asserts.py
+++ b/tensorflow/python/autograph/converters/asserts.py
@@ -33,7 +33,7 @@ class AssertTransformer(converter.Base):
     # Note: The lone tf.Assert call will be wrapped with control_dependencies
     # by side_effect_guards.
     template = """
-      tf.Assert(test, (msg,))
+      ag__.assert_stmt(test, lambda: msg)
     """
 
     if node.msg is None:
diff --git a/tensorflow/python/autograph/converters/asserts_test.py b/tensorflow/python/autograph/converters/asserts_test.py
index 01282f9e6297291c4a5c9d7ffd4598a53bcc4943..9ae448892a030b331adc216052ba22d3ca7533df 100644
--- a/tensorflow/python/autograph/converters/asserts_test.py
+++ b/tensorflow/python/autograph/converters/asserts_test.py
@@ -18,24 +18,32 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import gast
-
 from tensorflow.python.autograph.converters import asserts
+from tensorflow.python.autograph.converters import side_effect_guards
 from tensorflow.python.autograph.core import converter_testing
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.platform import test
 
 
 class AssertsTest(converter_testing.TestCase):
 
-  def test_transform(self):
+  @test_util.run_deprecated_v1
+  def test_basic(self):
 
     def test_fn(a):
-      assert a > 0
-
-    node, ctx = self.prepare(test_fn, {})
-    node = asserts.transform(node, ctx)
-
-    self.assertTrue(isinstance(node.body[0].value, gast.Call))
+      assert a, 'test message'
+      return tf.no_op()  # pylint:disable=undefined-variable
+
+    with self.converted(test_fn, (asserts, side_effect_guards), {},
+                        gen_control_flow_ops.no_op) as result:
+      with self.cached_session() as sess:
+        op = result.test_fn(constant_op.constant(False))
+        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                     'test message'):
+          self.evaluate(op)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/break_statements.py b/tensorflow/python/autograph/converters/break_statements.py
index bd6b0b248c5190435c0843c5273abe22d26158cc..e4e32ab9761aa13b5a7eefbc297ad3ea79412e99 100644
--- a/tensorflow/python/autograph/converters/break_statements.py
+++ b/tensorflow/python/autograph/converters/break_statements.py
@@ -42,7 +42,7 @@ class BreakTransformer(converter.Base):
     var_name = self.state[_Break].control_var_name
     # TODO(mdan): This will fail when expanded inside a top-level else block.
     template = """
-      var_name = tf.constant(True)
+      var_name = True
       continue
     """
     return templates.replace(template, var_name=var_name)
@@ -85,7 +85,7 @@ class BreakTransformer(converter.Base):
       guarded_orelse = self._guard_if_present(node.orelse, break_var)
 
       template = """
-        var_name = tf.constant(False)
+        var_name = False
         while test and not var_name:
           body
         else:
@@ -122,7 +122,7 @@ class BreakTransformer(converter.Base):
       # the control variable is marked as used.
       # TODO(mdan): Use a marker instead, e.g. ag__.condition_loop_on(var_name)
       template = """
-        var_name = tf.constant(False)
+        var_name = False
         for target in iter_:
           (var_name,)
           body
diff --git a/tensorflow/python/autograph/converters/break_statements_test.py b/tensorflow/python/autograph/converters/break_statements_test.py
index 39406a969db660892d082ef1b38a673146000ebe..c52ce508df85435e11d7c8404477c24a71fcd16a 100644
--- a/tensorflow/python/autograph/converters/break_statements_test.py
+++ b/tensorflow/python/autograph/converters/break_statements_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.converters import break_statements
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.eager import context as tfe_ctx
 from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
@@ -43,10 +42,9 @@ class BreakCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 1)
-      self.assertTransformedEquivalent(test_fn, 4)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 4)
 
   def test_for_loop(self):
 
@@ -82,10 +80,9 @@ class BreakCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v, u, w
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 3)
-      self.assertTransformedEquivalent(test_fn, 11)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 11)
 
   def test_nested_loops(self):
 
@@ -105,11 +102,10 @@ class BreakCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v, u
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 2)
-      self.assertTransformedEquivalent(test_fn, 3)
-      self.assertTransformedEquivalent(test_fn, 5)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 5)
 
   def test_loop_orelse(self):
 
@@ -127,10 +123,9 @@ class BreakCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v, u
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 2)
-      self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, 3)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/builtin_functions_test.py b/tensorflow/python/autograph/converters/builtin_functions_test.py
index 2ed14c14e74960980d8e609331c67db02f07755e..2683be16ec7ffa91b1df3cd272336366502d9f4f 100644
--- a/tensorflow/python/autograph/converters/builtin_functions_test.py
+++ b/tensorflow/python/autograph/converters/builtin_functions_test.py
@@ -24,23 +24,26 @@ from tensorflow.python.autograph.converters import builtin_functions
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class BuiltinFunctionsTest(converter_testing.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_len(self):
 
     def test_fn(a):
       return len(a)
 
     with self.converted(test_fn, builtin_functions, {'len': len}) as result:
-      with self.test_session() as sess:
+      with self.session() as sess:
         p = array_ops.placeholder(dtype=dtypes.int32, shape=None)
         ops = result.test_fn(p)
         self.assertEqual(sess.run(ops, {p: [0, 0, 0]}), 3)
 
+  @test_util.run_deprecated_v1
   def test_print(self):
 
     if six.PY2:
@@ -50,10 +53,11 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
       return print(a)
 
     with self.converted(test_fn, builtin_functions, {'print': print}) as result:
-      with self.test_session() as sess:
+      with self.session() as sess:
         with self.assertPrints('a\n'):
           sess.run(result.test_fn('a'))
 
+  @test_util.run_deprecated_v1
   def test_print_multiple_values(self):
 
     if six.PY2:
@@ -63,7 +67,7 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
       return print(a, b, c)
 
     with self.converted(test_fn, builtin_functions, {'print': print}) as result:
-      with self.test_session() as sess:
+      with self.session() as sess:
         with self.assertPrints('a 1 [2, 3]\n'):
           sess.run(
               result.test_fn(
diff --git a/tensorflow/python/autograph/converters/call_trees.py b/tensorflow/python/autograph/converters/call_trees.py
index fc2075b78170b29f0f596974ea2e455d49dd54a6..9b85fc8367ceda77ab656bb889c88922cc52e173 100644
--- a/tensorflow/python/autograph/converters/call_trees.py
+++ b/tensorflow/python/autograph/converters/call_trees.py
@@ -22,7 +22,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import namedtuple
+import collections
 
 import gast
 
@@ -35,11 +35,11 @@ from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.util import tf_inspect
 
 
-class FunctionInfo(namedtuple('FunctionInfo', ('dtype',))):
+class FunctionInfo(collections.namedtuple('FunctionInfo', ('dtype',))):
   pass
 
 
-# TODO(mdan): Move this to config.py.
+# TODO(mdan): Move this to a separate transformer.
 KNOWN_NUMPY_FUNCTIONS = {
     ('numpy', 'random', 'binomial'): FunctionInfo(dtype='tf.int64'),
 }
@@ -85,14 +85,15 @@ class FunctionNamer(object):
 class CallTreeTransformer(converter.Base):
   """Transforms the call tree by renaming transformed symbols."""
 
-  def _resolve_name(self, node):
+  def _resolve_decorator_name(self, node):
     """Used to resolve decorator info."""
     if isinstance(node, gast.Call):
-      return self._resolve_name(node.func)
+      return self._resolve_decorator_name(node.func)
     if isinstance(node, gast.Name):
-      return self.ctx.namespace.get(node.id)
+      # TODO(mdan): Add test coverage for this branch.
+      return self.ctx.info.namespace.get(node.id)
     if isinstance(node, gast.Attribute):
-      parent = self._resolve_name(node.value)
+      parent = self._resolve_decorator_name(node.value)
       if parent is not None:
         return getattr(parent, node.attr)
       return None
@@ -107,14 +108,28 @@ class CallTreeTransformer(converter.Base):
       if hasattr(owner_type, node.attr):
         return getattr(owner_type, node.attr)
       else:
-        raise ValueError('Type "%s" has not attribute "%s". Is it dynamic?' %
+        # TODO(mdan): We should probably return None here rather than an error.
+        raise ValueError('Type "%s" has no attribute "%s". Is it dynamic?' %
                          (owner_type, node.attr))
     return None
 
   def _function_is_compilable(self, target_entity):
     """Determines whether an entity can be compiled at all."""
-    # TODO(mdan): This is just a placeholder. Implement.
-    return not inspect_utils.isbuiltin(target_entity)
+    # TODO(mdan): Expand.
+
+    if target_entity.__module__ is None:
+      # Functions like builtins and NumPy don't expose a module.
+      # Those in general should not be compiled.
+      return False
+
+    if inspect_utils.isbuiltin(target_entity):
+      return False
+
+    if inspect_utils.isnamedtuple(target_entity):
+      # namedtuple doesn't expose its source code, making it uncompilable.
+      return False
+
+    return True
 
   def _should_compile(self, node, fqn):
     """Determines whether an entity should be compiled in the context."""
@@ -128,25 +143,28 @@ class CallTreeTransformer(converter.Base):
       if fqn[:i] in self.ctx.program.uncompiled_modules:
         return False
 
-    # Check for local decorations
-    if anno.hasanno(node, 'graph_ready'):
-      return False
-
-    # The decorators themselves are not to be converted.
-    # If present, the decorators should appear as static functions.
     target_entity = self._try_resolve_target(node.func)
+
     if target_entity is not None:
-      # This attribute is set by the decorator itself.
-      # TODO(mdan): This may not play nicely with other wrapping decorators.
-      if hasattr(target_entity, '__pyct_is_compile_decorator'):
-        return False
 
-      if target_entity in self.ctx.program.autograph_decorators:
-        return False
+      # Currently, lambdas are always converted.
+      # TODO(mdan): Allow markers of the kind f = ag.do_not_convert(lambda: ...)
+      if inspect_utils.islambda(target_entity):
+        return True
+
+      # This may be reached when "calling" a callable attribute of an object.
+      # For example:
+      #
+      #   self.fc = tf.keras.layers.Dense()
+      #   self.fc()
+      #
+      for mod in self.ctx.program.uncompiled_modules:
+        if target_entity.__module__.startswith(mod[0] + '.'):
+          return False
 
       # Inspect the target function decorators. If any include a @convert
-      # or @graph_ready annotation, then they must be called as they are.
-      # TODO(mdan): This may be quite heavy.
+      # or @do_not_convert annotation, then they must be called as they are.
+      # TODO(mdan): This may be quite heavy. Perhaps always dynamically convert?
       # To parse and re-analyze each function for every call site could be quite
       # wasteful. Maybe we could cache the parsed AST?
       try:
@@ -157,10 +175,15 @@ class CallTreeTransformer(converter.Base):
         # to py_func).
         return True
 
+      # This attribute is set when the decorator was applied before the
+      # function was parsed. See api.py.
+      if hasattr(target_entity, '__ag_compiled'):
+        return False
+
       for dec in target_node.decorator_list:
-        decorator_fn = self._resolve_name(dec)
+        decorator_fn = self._resolve_decorator_name(dec)
         if (decorator_fn is not None and
-            decorator_fn in self.ctx.program.autograph_decorators):
+            decorator_fn in self.ctx.program.options.strip_decorators):
           return False
 
     return True
@@ -171,9 +194,6 @@ class CallTreeTransformer(converter.Base):
     target_entity = anno.getanno(node.func, 'live_val')
     target_fqn = anno.getanno(node.func, 'fqn')
 
-    if not self._should_compile(node, target_fqn):
-      return node
-
     if anno.hasanno(node, 'is_constructor'):
       new_name = self.ctx.namer.compiled_class_name(
           target_fqn, live_entity=target_entity)
@@ -193,20 +213,10 @@ class CallTreeTransformer(converter.Base):
           # The renaming process will transform it into a regular function.
           # TODO(mdan): Is this complete? How does it work with nested members?
           node.args = [node.func.value] + node.args
-      node.func = templates.replace('func_name', func_name=new_name)[0]
+      node.func = templates.replace_as_expression(
+          'func_name', func_name=new_name)
     return node
 
-  def _wrap_to_py_func_no_return(self, node):
-    # TODO(mdan): Properly handle varargs, etc.
-    template = """
-      ag__.utils.wrap_py_func(func, None, (args,), kwargs, True)
-    """
-    return templates.replace(
-        template,
-        func=node.func,
-        args=node.args,
-        kwargs=ast_util.keywords_to_dict(node.keywords))
-
   def _wrap_to_py_func_single_return(self, node, dtype):
     # TODO(mdan): Properly handle varargs, etc.
     template = """
@@ -238,88 +248,91 @@ class CallTreeTransformer(converter.Base):
     # Before we could convert all the time though, we'd need a reasonable
     # caching mechanism.
     template = """
-      ag__.converted_call(
-          func,
-          ag__.ConversionOptions.new(recursive=recursive_val),
-          args)
+      ag__.converted_call(func, owner, options, args)
     """
-    call_expr = templates.replace(
+    if isinstance(node.func, gast.Attribute):
+      func = gast.Str(node.func.attr)
+      owner = node.func.value
+    else:
+      func = node.func
+      owner = parser.parse_expression('None')
+    new_call = templates.replace_as_expression(
         template,
-        func=node.func,
-        recursive_val=parser.parse_expression(str(self.ctx.program.recursive)),
+        func=func,
+        owner=owner,
+        options=self.ctx.program.options.to_ast(
+            self.ctx.info.namespace,
+            internal_convert_user_code=self.ctx.program.options.recursive),
         args=node.args)
-    new_call = call_expr[0].value
     # TODO(mdan): Improve the template mechanism to better support this.
     new_call.keywords = node.keywords
     return new_call
 
-  def visit_Expr(self, node):
-    if isinstance(node.value, gast.Call):
-      if anno.hasanno(node.value.func, 'live_val'):
-        target_entity = anno.getanno(node.value.func, 'live_val')
-        if not self._function_is_compilable(target_entity):
-          if anno.hasanno(node.value.func, 'fqn'):
-            target_fqn = anno.getanno(node.value.func, 'fqn')
-            if not self._should_compile(node.value, target_fqn):
-              return node
-            node = self._wrap_to_py_func_no_return(node.value)
-            return node
-      # Only the case of py_func with no return value is special.
-      # Everything else is processed by visit_Call.
-      self.visit(node.value)
-    else:
-      self.generic_visit(node)
+  def _visit_decorators(self, decorator_list):
+    if not self.ctx.program.options.uses(converter.Feature.DECORATORS):
+      # When not processing decorators, strip everything that is encountered.
+      return []
+
+    return self.visit_block(decorator_list)
+
+  def visit_FunctionDef(self, node):
+    node.args = self.visit(node.args)
+    node.body = self.visit_block(node.body)
+    node.decorator_list = self._visit_decorators(node.decorator_list)
+    node.returns = self.visit_block(node.returns)
     return node
 
   def visit_Call(self, node):
-    # If the function call is wrapped by one of the marker decorators,
-    # consider it graph ready.
-    if anno.hasanno(node.func, 'live_val'):
-      target_entity = anno.getanno(node.func, 'live_val')
-      if target_entity in self.ctx.program.autograph_decorators:
-        if len(node.args) < 1:
-          raise ValueError(
-              'Found call to decorator function "%s", but it had no arguments. '
-              'A decorator needs at least one positional argument.' %
-              target_entity)
-        anno.setanno(node.args[0], 'graph_ready', True)
-
-    self.generic_visit(node)
     if anno.hasanno(node.func, 'live_val'):
       target_entity = anno.getanno(node.func, 'live_val')
+
       if anno.hasanno(node.func, 'fqn'):
         target_fqn = anno.getanno(node.func, 'fqn')
       else:
         target_fqn = None
+
       if self._function_is_compilable(target_entity):
-        node = self._rename_compilable_function(node)
+        if self._should_compile(node, target_fqn):
+          node = self._rename_compilable_function(node)
+        else:
+          node = self.generic_visit(node)
+          return node
+
       elif target_fqn and target_fqn in KNOWN_NUMPY_FUNCTIONS:
         # TODO(mdan): Should we replace these with equivalent TF ops instead?
         node = self._wrap_to_py_func_single_return(
             node, KNOWN_NUMPY_FUNCTIONS[target_fqn].dtype)
+
+      elif inspect_utils.isbuiltin(target_entity):
+        # Note: Any builtin that passed the builtins converter is assumed to be
+        # safe for graph mode.
+        return node
+
+      elif inspect_utils.isnamedtuple(target_entity):
+        # Although not compilable, we assume they are safe for graph mode.
+        node = self.generic_visit(node)
+        return node
+
       else:
+        # TODO(mdan): Instert dynamic conversion here instead.
         raise NotImplementedError(
             'py_func with return values (unknown function)')
     else:
-      if anno.hasanno(node.func, anno.Basic.QN):
-        # Special-case a few builtins that otherwise go undetected. This
-        # normally doesn't pose a problem, but the dict built-in doesn't
-        # work with inspect.getargspec which is required for dynamic functions.
-        # Note: expecting this is resilient to aliasing (e.g.
-        # dict = an_evil_dict), because in those cases the regular mechanisms
-        # process a simple user function.
-        qn = anno.getanno(node.func, anno.Basic.QN)
-        # Add items to this list as needed.
-        if str(qn) in ('dict',):
-          return node
+      # Special cases
+      # TODO(mdan): These need a systematic review - there may be more.
 
+      # 1. super() calls - these are preserved. The class conversion mechanism
+      # will ensure that they return the correct value.
       if ast_util.matches(node, 'super(_)'):
-        # super() calls are preserved. The class conversion mechanism will
-        # ensure that they return the correct value.
         return node
 
-      if self.ctx.program.recursive:
-        node = self._insert_dynamic_conversion(node)
+      # 2. super().method calls - these are preserved as well, when the
+      # conversion processes the entire class.
+      if (ast_util.matches(node, 'super(_)._(_)') and
+          self.ctx.info.owner_type is not None):
+        return node
+
+      node = self._insert_dynamic_conversion(node)
     return node
 
 
diff --git a/tensorflow/python/autograph/converters/call_trees_test.py b/tensorflow/python/autograph/converters/call_trees_test.py
index 0e50f42c6a8fa69519f6576857f9f7a17917bf65..454d75d755c7273d11e1f89e4138cd997eb6e49a 100644
--- a/tensorflow/python/autograph/converters/call_trees_test.py
+++ b/tensorflow/python/autograph/converters/call_trees_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import numpy as np
 
 from tensorflow.python.autograph.converters import call_trees
@@ -85,23 +87,33 @@ class CallTreesTest(converter_testing.TestCase):
       tc = TestClass()
       self.assertEquals(3, result.test_fn_2(tc, 1))
 
-  def test_py_func_no_retval(self):
+  def test_known_called_lambda(self):
+
+    l = lambda x: x
 
     def test_fn(a):
-      setattr(a, 'foo', 'bar')
+      return l(a)
 
-    with self.converted(test_fn, call_trees, {'setattr': setattr}) as result:
-      with self.cached_session() as sess:
+    ns = {'l': l}
+    node, ctx = self.prepare(test_fn, ns)
+    node = call_trees.transform(node, ctx)
+
+    with self.compiled(node, ns) as result:
+      self.assertEquals(1, result.test_fn(1))
+
+  def test_known_called_namedtuple(self):
+
+    nt = collections.namedtuple('TestNamedTuple', ['a'])
+
+    def test_fn(a):
+      return nt(a)
 
-        class Dummy(object):
-          pass
+    ns = {'nt': nt}
+    node, ctx = self.prepare(test_fn, ns)
+    node = call_trees.transform(node, ctx)
 
-        a = Dummy()
-        result.test_fn(a)
-        py_func_op, = sess.graph.get_operations()
-        self.assertFalse(hasattr(a, 'foo'))
-        sess.run(py_func_op)
-        self.assertEquals('bar', a.foo)
+    with self.compiled(node, ns) as result:
+      self.assertEquals(nt(1), result.test_fn(1))
 
   def test_py_func_known_function(self):
 
@@ -112,7 +124,7 @@ class CallTreesTest(converter_testing.TestCase):
                         dtypes.int64) as result:
       with self.cached_session() as sess:
         self.assertTrue(isinstance(result.test_fn(), ops.Tensor))
-        self.assertIn(sess.run(result.test_fn()), (0, 1, 2))
+        self.assertIn(self.evaluate(result.test_fn()), (0, 1, 2))
 
   def test_uncompiled_modules(self):
 
@@ -131,7 +143,22 @@ class CallTreesTest(converter_testing.TestCase):
     with self.compiled(node, ns) as result:
       with self.cached_session() as sess:
         result_tensor = result.test_fn(constant_op.constant(1))
-        self.assertEquals(sess.run(result_tensor), 3)
+        self.assertEquals(self.evaluate(result_tensor), 3)
+
+  def test_call_to_decorated_function(self):
+
+    def decorator(f):
+      return f
+
+    @decorator
+    def called_fn(a):
+      return a
+
+    def test_fn(a):
+      return called_fn(a)
+
+    node, ctx = self.prepare(test_fn, {'called_fn': called_fn})
+    node = call_trees.transform(node, ctx)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/conditional_expressions.py b/tensorflow/python/autograph/converters/conditional_expressions.py
index 40728f555da625564c7dfd3f58fd31957fa020af..a4eef7e6a1f7c162f5fa19891a3466c23dc86fe9 100644
--- a/tensorflow/python/autograph/converters/conditional_expressions.py
+++ b/tensorflow/python/autograph/converters/conditional_expressions.py
@@ -19,109 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import templates
-from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno
-
-
-class _FunctionDefs(object):
-
-  def __init__(self):
-    self.nodes = []
-
-
-class _Statement(object):
-
-  def __init__(self):
-    self.scope = None
 
 
 class ConditionalExpressionTransformer(converter.Base):
   """Converts contitional expressions to functional form."""
 
-  def _postprocess_statement(self, node):
-    """Inserts any separate functions that node may use."""
-    replacements = []
-    for def_node in self.state[_FunctionDefs].nodes:
-      replacements.extend(def_node)
-    replacements.append(node)
-    node = replacements
-    # The corresponding enter is called by self.visit_block (see _process_block)
-    self.state[_FunctionDefs].exit()
-    return node, None
-
-  def _create_branch(self, expr, name_stem):
-    scope = self.state[_Statement].scope
-    name = self.ctx.namer.new_symbol(name_stem, scope.referenced)
-    template = """
-      def name():
-        return expr,
-    """
-    node = templates.replace(template, name=name, expr=expr)
-    self.state[_FunctionDefs].nodes.append(node)
-    return name
-
   def visit_IfExp(self, node):
-    if anno.hasanno(node.test, anno.Basic.QN):
-      name_root = anno.getanno(node.test, anno.Basic.QN).ssf()
-    else:
-      name_root = 'ifexp'
-
-    true_fn_name = self._create_branch(node.body, '%s_true' % name_root)
-    false_fn_name = self._create_branch(node.orelse, '%s_false' % name_root)
-
     return templates.replace_as_expression(
-        'ag__.utils.run_cond(test, true_fn_name, false_fn_name)',
+        'ag__.if_stmt(test, lambda: true_expr, lambda: false_expr)',
         test=node.test,
-        true_fn_name=true_fn_name,
-        false_fn_name=false_fn_name)
-
-  def _process_block(self, scope, block):
-    self.state[_Statement].enter()
-    self.state[_Statement].scope = scope
-    block = self.visit_block(
-        block,
-        before_visit=self.state[_FunctionDefs].enter,
-        after_visit=self._postprocess_statement)
-    self.state[_Statement].exit()
-    return block
-
-  def visit_FunctionDef(self, node):
-    node.args = self.generic_visit(node.args)
-    node.decorator_list = self.visit_block(node.decorator_list)
-    node.body = self._process_block(
-        anno.getanno(node, anno.Static.SCOPE), node.body)
-    return node
-
-  def visit_For(self, node):
-    node.target = self.visit(node.target)
-    node.body = self._process_block(
-        anno.getanno(node, NodeAnno.BODY_SCOPE), node.body)
-    node.orelse = self._process_block(
-        anno.getanno(node, NodeAnno.ORELSE_SCOPE), node.orelse)
-    return node
-
-  def visit_While(self, node):
-    node.test = self.visit(node.test)
-    node.body = self._process_block(
-        anno.getanno(node, NodeAnno.BODY_SCOPE), node.body)
-    node.orelse = self._process_block(
-        anno.getanno(node, NodeAnno.ORELSE_SCOPE), node.orelse)
-    return node
-
-  def visit_If(self, node):
-    node.test = self.visit(node.test)
-    node.body = self._process_block(
-        anno.getanno(node, NodeAnno.BODY_SCOPE), node.body)
-    node.orelse = self._process_block(
-        anno.getanno(node, NodeAnno.ORELSE_SCOPE), node.orelse)
-    return node
-
-  def visit_With(self, node):
-    node.items = self.visit_block(node.items)
-    node.body = self._process_block(
-        anno.getanno(node, NodeAnno.BODY_SCOPE), node.body)
-    return node
+        true_expr=node.body,
+        false_expr=node.orelse)
 
 
 def transform(node, ctx):
diff --git a/tensorflow/python/autograph/converters/continue_statements.py b/tensorflow/python/autograph/converters/continue_statements.py
index 584cdc1efd4b2e4327be34e1d9d51de3635fccd5..05e19e59fc6701db618e925e1d305f299b270e33 100644
--- a/tensorflow/python/autograph/converters/continue_statements.py
+++ b/tensorflow/python/autograph/converters/continue_statements.py
@@ -24,94 +24,93 @@ from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno
 
 
-# Tags for local state.
-CONTROL_VAR_NAME = 'control_var_name'
-CONTINUE_USED = 'continue_used'
-GUARD_CREATED = 'guard_created'
-CREATE_GUARD_NEXT = 'create_guard_next'
+class _Continue(object):
+
+  def __init__(self):
+    self.used = False
+    self.control_var_name = None
+    self.create_guard = False
+    self.guard_created = False
+
+  def __repr__(self):
+    return 'used: %s, var: %s' % (self.used, self.control_var_name)
 
 
 class ContinueCanonicalizationTransformer(converter.Base):
   """Canonicalizes continue statements into additional conditionals."""
 
   def visit_Continue(self, node):
-    self.set_local(CONTINUE_USED, True)
+    self.state[_Continue].used = True
     template = """
-      var_name = tf.constant(True)
+      var_name = True
     """
     return templates.replace(
-        template, var_name=self.get_local(CONTROL_VAR_NAME))
+        template, var_name=self.state[_Continue].control_var_name)
 
   def _postprocess_statement(self, node):
     # Example of how the state machine below works:
     #
-    #   1| stmt           # State: CONTINUE_USED = False
+    #   1| stmt           # State: Continue_.used = False
     #    |                # Action: none
     #   2| if cond:
-    #   3|   continue     # State: CONTINUE_USED = True,
-    #    |                #        GUARD_CREATED = False,
-    #    |                #        CREATE_GUARD_NEXT = False
-    #    |                # Action: set CREATE_GUARD_NEXT = True
-    #   4| stmt           # State: CONTINUE_USED = True,
-    #    |                #        GUARD_CREATED = False,
-    #    |                #        CREATE_GUARD_NEXT = True
+    #   3|   continue     # State: Continue_.used = True,
+    #    |                #        Continue_.guard_created = False,
+    #    |                #        Continue_.create_guard = False
+    #    |                # Action: Continue_.create_guard = True
+    #   4| stmt           # State: Continue_.used = True,
+    #    |                #        Continue_.guard_created = False,
+    #    |                #        Continue_.create_guard = True
     #    |                # Action: create `if not continue_used`,
-    #    |                #         set GUARD_CREATED = True
-    #   5| stmt           # State: CONTINUE_USED = True, GUARD_CREATED = True
+    #    |                #         set Continue_.guard_created = True
+    #   5| stmt           # State: Continue_.used = True,
+    #    |                #        Continue_.guard_created = True
     #    |                # Action: none (will be wrapped under previously
     #    |                #         created if node)
 
-    if self.get_local(CONTINUE_USED, False):
-      if self.get_local(GUARD_CREATED, False):
+    if self.state[_Continue].used:
+      if self.state[_Continue].guard_created:
         return node, None
 
-      elif not self.get_local(CREATE_GUARD_NEXT, False):
-        self.set_local(CREATE_GUARD_NEXT, True)
+      elif not self.state[_Continue].create_guard:
+        self.state[_Continue].create_guard = True
         return node, None
 
       else:
-        self.set_local(GUARD_CREATED, True)
+        self.state[_Continue].guard_created = True
         template = """
           if not var_name:
             original_node
         """
         cond, = templates.replace(
             template,
-            var_name=self.get_local(CONTROL_VAR_NAME),
+            var_name=self.state[_Continue].control_var_name,
             original_node=node)
         return cond, cond.body
     return node, None
 
   def _visit_loop_body(self, node, nodes):
-    self.enter_local_scope()
+    self.state[_Continue].enter()
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     continue_var = self.ctx.namer.new_symbol('continue_', scope.referenced)
-    self.set_local(CONTROL_VAR_NAME, continue_var)
+    self.state[_Continue].control_var_name = continue_var
 
     nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
 
-    if self.get_local(CONTINUE_USED, False):
+    if self.state[_Continue].used:
       template = """
-        var_name = tf.constant(False)
+        var_name = False
       """
       control_var_init = templates.replace(template, var_name=continue_var)
       nodes = control_var_init + nodes
 
-    self.exit_local_scope()
+    self.state[_Continue].exit()
     return nodes
 
-  def _visit_non_loop_body(self, nodes):
-    self.enter_local_scope(inherit=(CONTROL_VAR_NAME,))
-    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
-    continue_used = self.get_local(CONTINUE_USED, False)
-    self.exit_local_scope(keep=(CONTINUE_USED,))
-    return nodes, continue_used
-
   def visit_While(self, node):
     node.test = self.visit(node.test)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse, _ = self._visit_non_loop_body(node.orelse)
+    node.orelse = self.visit_block(node.orelse)
     return node
 
   def visit_For(self, node):
@@ -119,21 +118,11 @@ class ContinueCanonicalizationTransformer(converter.Base):
     node.iter = self.generic_visit(node.iter)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse, _ = self._visit_non_loop_body(node.orelse)
-    return node
-
-  def visit_If(self, node):
-    node.test = self.generic_visit(node.test)
-    node.body, continue_used_body = self._visit_non_loop_body(node.body)
-    node.orelse, continue_used_orelse = self._visit_non_loop_body(node.orelse)
-    self.set_local(CONTINUE_USED, continue_used_body or continue_used_orelse)
-    return node
-
-  def visit_With(self, node):
-    node.items = self.visit_block(node.items)
-    node.body, _ = self._visit_non_loop_body(node.body)
+    node.orelse = self.visit_block(node.orelse)
     return node
 
 
 def transform(node, ctx):
-  return ContinueCanonicalizationTransformer(ctx).visit(node)
+  transformer = ContinueCanonicalizationTransformer(ctx)
+  node = transformer.visit(node)
+  return node
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index 416a60d2ee1c408d22d0f8e23a6a28751e8277ad..bef6cae1bb89908bd644115e31ca5662043b060c 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -49,12 +49,23 @@ class ControlFlowTransformer(converter.Base):
 
   def _create_cond_branch(self, body_name, aliased_orig_names,
                           aliased_new_names, body, returns):
+    if len(returns) == 1:
+      template = """
+        return retval
+      """
+      return_stmt = templates.replace(template, retval=returns[0])
+    else:
+      template = """
+        return (retvals,)
+      """
+      return_stmt = templates.replace(template, retvals=returns)
+
     if aliased_orig_names:
       template = """
         def body_name():
           aliased_new_names, = aliased_orig_names,
           body
-          return (returns,)
+          return_stmt
       """
       return templates.replace(
           template,
@@ -62,20 +73,20 @@ class ControlFlowTransformer(converter.Base):
           body=body,
           aliased_orig_names=aliased_orig_names,
           aliased_new_names=aliased_new_names,
-          returns=returns)
+          return_stmt=return_stmt)
     else:
       template = """
         def body_name():
           body
-          return (returns,)
+          return_stmt
       """
       return templates.replace(
-          template, body_name=body_name, body=body, returns=returns)
+          template, body_name=body_name, body=body, return_stmt=return_stmt)
 
   def _create_cond_expr(self, results, test, body_name, orelse_name):
     if results is not None:
       template = """
-        results = ag__.utils.run_cond(test, body_name, orelse_name)
+        results = ag__.if_stmt(test, body_name, orelse_name)
       """
       return templates.replace(
           template,
@@ -85,36 +96,59 @@ class ControlFlowTransformer(converter.Base):
           orelse_name=orelse_name)
     else:
       template = """
-        ag__.utils.run_cond(test, body_name, orelse_name)
+        ag__.if_stmt(test, body_name, orelse_name)
       """
       return templates.replace(
           template, test=test, body_name=body_name, orelse_name=orelse_name)
 
-  def _fmt_symbol_list(self, symbol_set):
+  def _fmt_symbols(self, symbol_set):
     if not symbol_set:
       return 'no variables'
     return ', '.join(map(str, symbol_set))
 
-  def _validate_no_live_vars_created(self, node):
-    body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
-    live_vars_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
-    live_vars_created_in_body = live_vars_out & body_scope.created
-    if live_vars_created_in_body:
-      raise ValueError(
-          'The following variables are created inside the loop and used later:'
-          '\n%s\n'
-          'Variables must be declared outside loops because loops may not'
-          ' necessarily execute.' % self._fmt_symbol_list(
-              live_vars_created_in_body))
+  def _determine_aliased_symbols(self, scope, node_defined_in, block):
+    if block:
+      block_live_in = set(anno.getanno(block[0], anno.Static.LIVE_VARS_IN))
+    else:
+      block_live_in = set()
+
+    # For the purpose of aliasing, composite symbols with live owners are live
+    # as well. Otherwise this would leak tensors from the conditional's body.
+    #
+    # For example:
+    #
+    #   obj = some_obj
+    #   if cond:
+    #     obj.a = val
+    #
+    # Thanslating to the code below would be incorrect:
+    #
+    #   def true_fn():
+    #     obj.a = val()  # Wrong! leaks ops owned by true_fn
+    #     return obj.a
+    for s in scope.modified:
+      if s.is_composite():
+        live_parents = block_live_in & s.owner_set
+        if live_parents:
+          block_live_in.add(s)
+    return scope.modified & node_defined_in & block_live_in
 
   def visit_If(self, node):
-    node = self.generic_visit(node)
-
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
     orelse_scope = anno.getanno(node, annos.NodeAnno.ORELSE_SCOPE)
     defined_in = anno.getanno(node, anno.Static.DEFINED_VARS_IN)
     live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
 
+    # Note: this information needs to be extracted before the body conversion
+    # that happens in the call to generic_visit below, because the conversion
+    # generates nodes that lack static analysis annotations.
+    need_alias_in_body = self._determine_aliased_symbols(
+        body_scope, defined_in, node.body)
+    need_alias_in_orelse = self._determine_aliased_symbols(
+        orelse_scope, defined_in, node.orelse)
+
+    node = self.generic_visit(node)
+
     modified_in_cond = body_scope.modified | orelse_scope.modified
     returned_from_cond = set()
     for s in modified_in_cond:
@@ -123,23 +157,24 @@ class ControlFlowTransformer(converter.Base):
       elif s.is_composite():
         # Special treatment for compound objects: if any of their owner entities
         # are live, then they are outputs as well.
-        if any(owner in live_out for owner in s.owner_set):
+        if live_out & s.owner_set:
           returned_from_cond.add(s)
 
-    need_alias_in_body = body_scope.modified & defined_in
-    need_alias_in_orelse = orelse_scope.modified & defined_in
-
     created_in_body = body_scope.modified & returned_from_cond - defined_in
     created_in_orelse = orelse_scope.modified & returned_from_cond - defined_in
 
-    if created_in_body != created_in_orelse:
+    basic_created_in_body = tuple(
+        s for s in created_in_body if not s.is_composite())
+    basic_created_in_orelse = tuple(
+        s for s in created_in_orelse if not s.is_composite())
+    if basic_created_in_body != basic_created_in_orelse:
       raise ValueError(
           'if statement may not initialize all variables: the true branch'
           ' creates %s, while the false branch creates %s. Make sure all'
           ' these variables are initialized either in both'
           ' branches or before the if statement.' %
-          (self._fmt_symbol_list(created_in_body),
-           self._fmt_symbol_list(created_in_orelse)))
+          (self._fmt_symbols(basic_created_in_body),
+           self._fmt_symbols(basic_created_in_orelse)))
 
     # Alias the closure variables inside the conditional functions, to allow
     # the functions access to the respective variables.
@@ -161,10 +196,13 @@ class ControlFlowTransformer(converter.Base):
     node_body = ast_util.rename_symbols(node.body, alias_body_map)
     node_orelse = ast_util.rename_symbols(node.orelse, alias_orelse_map)
 
+    cond_var_name = self.ctx.namer.new_symbol('cond', body_scope.referenced)
+    body_name = self.ctx.namer.new_symbol('if_true', body_scope.referenced)
+    orelse_name = self.ctx.namer.new_symbol('if_false', orelse_scope.referenced)
+
     returned_from_cond = tuple(returned_from_cond)
     if returned_from_cond:
       if len(returned_from_cond) == 1:
-        # TODO(mdan): Move this quirk into the operator implementation.
         cond_results = returned_from_cond[0]
       else:
         cond_results = gast.Tuple([s.ast() for s in returned_from_cond], None)
@@ -183,12 +221,14 @@ class ControlFlowTransformer(converter.Base):
       # actually has some return value as well.
       cond_results = None
       # TODO(mdan): This doesn't belong here; it's specific to the operator.
-      returned_from_body = templates.replace_as_expression('tf.constant(1)')
-      returned_from_orelse = templates.replace_as_expression('tf.constant(1)')
-
-    body_name = self.ctx.namer.new_symbol('if_true', body_scope.referenced)
-    orelse_name = self.ctx.namer.new_symbol('if_false', orelse_scope.referenced)
-
+      returned_from_body = (templates.replace_as_expression(
+          'ag__.match_staging_level(1, cond_var_name)',
+          cond_var_name=cond_var_name),)
+      returned_from_orelse = (templates.replace_as_expression(
+          'ag__.match_staging_level(1, cond_var_name)',
+          cond_var_name=cond_var_name),)
+
+    cond_assign = self.create_assignment(cond_var_name, node.test)
     body_def = self._create_cond_branch(
         body_name,
         aliased_orig_names=aliased_body_orig_names,
@@ -201,56 +241,102 @@ class ControlFlowTransformer(converter.Base):
         aliased_new_names=aliased_orelse_new_names,
         body=node_orelse,
         returns=returned_from_orelse)
-    cond_expr = self._create_cond_expr(cond_results, node.test, body_name,
+    cond_expr = self._create_cond_expr(cond_results, cond_var_name, body_name,
                                        orelse_name)
 
-    return body_def + orelse_def + cond_expr
-
-  def visit_While(self, node):
-    self.generic_visit(node)
-
-    self._validate_no_live_vars_created(node)
+    return cond_assign + body_def + orelse_def + cond_expr
 
+  def _get_loop_state(self, node):
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
-    body_closure = body_scope.modified - body_scope.created
-    all_referenced = body_scope.referenced
-
-    cond_scope = anno.getanno(node, annos.NodeAnno.COND_SCOPE)
-    cond_closure = set()
-    for s in cond_scope.used:
-      for root in s.support_set:
-        if root not in body_scope.created:
-          cond_closure.add(root)
-
-    state = list(body_closure)
-    if not state:
+    defined_in = anno.getanno(node, anno.Static.DEFINED_VARS_IN)
+    live_in = anno.getanno(node, anno.Static.LIVE_VARS_IN)
+    live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
+    reserved_symbols = body_scope.referenced
+
+    # Note that it doesn't matter whether the variables are live after the loop.
+    # If the loop modifies them nonlocally (e.g. the result of an iteration
+    # depends on the previous iteration), then they need to be included in
+    # the loop state, regardless of whether they are later used or not.
+    loop_state = body_scope.modified & live_in
+
+    undefined_lives = loop_state - defined_in
+    # Only simple variables must be defined. The composite ones will be
+    # implicitly checked at runtime.
+    undefined_simple_lives = {v for v in undefined_lives if v.is_simple()}
+    if undefined_simple_lives:
+      raise NameError(
+          'cannot convert loop: it includes symbols that are undefined'
+          ' when entering the loop: {}'.format(
+              self._fmt_symbols(undefined_simple_lives)))
+
+    live_defs_in_loop = (body_scope.modified - live_in) & live_out
+    if live_defs_in_loop:
+      # TODO(mdan): Include reference to explanation why.
+      raise NotImplementedError(
+          'cannot convert loop: it includes symbols that are defined'
+          ' inside the loop, but used later: {}. To fix, initialize'
+          ' these symbols before the loop'.format(
+              self._fmt_symbols(live_defs_in_loop)))
+
+    if not loop_state:
       # TODO(mdan): Implement this properly.
-      # To complete this statement, we need to check whether any variable
-      # created inside the body scope is used before being modified outside the
-      # scope. This should be done during activity analysis, and in general
-      # should cover the case where variables may not be initialized.
-      raise ValueError('cannot convert while loop: no outputs')
+      # We need to check whether any variable created inside the body scope
+      # is used before being modified outside the scope. This should be done
+      # during activity analysis, and in general should cover the case where
+      # variables may not be initialized.
+      raise ValueError('cannot convert loop: no outputs')
+
+    return loop_state, reserved_symbols
 
+  def _state_constructs(self, loop_state, reserved_symbols):
+    loop_state = list(loop_state)
     state_ssf = [
-        self.ctx.namer.new_symbol(s.ssf(), all_referenced) for s in state
+        self.ctx.namer.new_symbol(s.ssf(), reserved_symbols) for s in loop_state
     ]
     ssf_map = {
         name: ssf
-        for name, ssf in zip(state, state_ssf)
+        for name, ssf in zip(loop_state, state_ssf)
         if str(name) != ssf
     }
 
-    if len(state) == 1:
-      state = state[0]
+    if len(loop_state) == 1:
+      loop_state = loop_state[0]
       state_ssf = state_ssf[0]
-      state_ast_tuple = state
+      state_ast_tuple = loop_state
     else:
-      state_ast_tuple = gast.Tuple([n.ast() for n in state], None)
+      state_ast_tuple = gast.Tuple([n.ast() for n in loop_state], None)
+
+    return loop_state, state_ssf, state_ast_tuple, ssf_map
 
+  def visit_While(self, node):
+    self.generic_visit(node)
+
+    loop_state, reserved_symbols = self._get_loop_state(node)
+
+    # Note: one might expect we can dispatch based on the loop condition.
+    # But because that is dependent on the state, it cannot be evaluated ahead
+    # of time - doing that would risk duplicating any effects the condition has.
+    # Furthermore, we cannot evaluate slices and attributes, because they might
+    # trigger __getitem__ or __getattribute__.
+    #
+    # A case where this fails includes ops with side effects on a stateful
+    # resource captured in an object:
+    #
+    #   while self.v.read() > 0:
+    #     self.v.assign(1)
+    #
+    # TODO(mdan): Handle the case above.
+    cond_scope = anno.getanno(node, annos.NodeAnno.COND_SCOPE)
+    cond_closure = set()
+    for s in cond_scope.read:
+      cond_closure.update(s.support_set)
+    cond_closure -= loop_state
+
+    loop_state, state_ssf, state_ast_tuple, ssf_map = self._state_constructs(
+        loop_state, reserved_symbols)
     node_body = ast_util.rename_symbols(node.body, ssf_map)
     test = ast_util.rename_symbols(node.test, ssf_map)
 
-    # TODO(b/113118541) investigate the need-for and correctness-of extra_deps
     template = """
       def test_name(state_ssf):
         return test
@@ -262,12 +348,12 @@ class ControlFlowTransformer(converter.Base):
     """
     node = templates.replace(
         template,
-        state=state,
+        state=loop_state,
         state_ssf=state_ssf,
         state_ast_tuple=state_ast_tuple,
-        test_name=self.ctx.namer.new_symbol('loop_test', body_scope.referenced),
+        test_name=self.ctx.namer.new_symbol('loop_test', reserved_symbols),
         test=test,
-        body_name=self.ctx.namer.new_symbol('loop_body', body_scope.referenced),
+        body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
         body=node_body,
         extra_deps=tuple(s.ast() for s in cond_closure),
     )
@@ -277,30 +363,9 @@ class ControlFlowTransformer(converter.Base):
   def visit_For(self, node):
     self.generic_visit(node)
 
-    self._validate_no_live_vars_created(node)
-
-    body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
-    body_closure = body_scope.modified - body_scope.created
-    all_referenced = body_scope.referenced
-
-    state = list(body_closure)
-
-    state_ssf = [
-        self.ctx.namer.new_symbol(s.ssf(), all_referenced) for s in state
-    ]
-    ssf_map = {
-        name: ssf
-        for name, ssf in zip(state, state_ssf)
-        if str(name) != ssf
-    }
-
-    if len(state) == 1:
-      state = state[0]
-      state_ssf = state_ssf[0]
-      state_ast_tuple = state
-    else:
-      state_ast_tuple = gast.Tuple([n.ast() for n in state], None)
-
+    loop_state, reserved_symbols = self._get_loop_state(node)
+    loop_state, state_ssf, state_ast_tuple, ssf_map = self._state_constructs(
+        loop_state, reserved_symbols)
     node_body = ast_util.rename_symbols(node.body, ssf_map)
     if anno.hasanno(node, 'extra_test'):
       extra_test = anno.getanno(node, 'extra_test')
@@ -321,14 +386,15 @@ class ControlFlowTransformer(converter.Base):
     """
     node = templates.replace(
         template,
-        state=state,
+        state=loop_state,
         state_ssf=state_ssf,
         state_ast_tuple=state_ast_tuple,
         iter_=node.iter,
         iterate=node.target,
-        extra_test_name=self.ctx.namer.new_symbol('extra_test', all_referenced),
+        extra_test_name=self.ctx.namer.new_symbol('extra_test',
+                                                  reserved_symbols),
         extra_test_expr=extra_test,
-        body_name=self.ctx.namer.new_symbol('loop_body', all_referenced),
+        body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
         body=node_body)
 
     return node
diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py
index cfa0ea920ce8ad50e59eb0327f8486017405dae9..034fcbe3865cdd78cdaad19631da98359cb4690d 100644
--- a/tensorflow/python/autograph/converters/control_flow_test.py
+++ b/tensorflow/python/autograph/converters/control_flow_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -36,6 +37,7 @@ class ControlFlowTest(converter_testing.TestCase):
       with self.cached_session() as sess:
         self.assertEqual(sess.run(result.test_fn(*inputs)), expected)
 
+  @test_util.run_deprecated_v1
   def test_while_basic(self):
 
     def test_fn(n):
@@ -48,6 +50,7 @@ class ControlFlowTest(converter_testing.TestCase):
 
     self.assertTransformedResult(test_fn, constant_op.constant(5), (10, 5, 5))
 
+  @test_util.run_deprecated_v1
   def test_while_nested(self):
 
     def test_fn(n):
@@ -66,6 +69,7 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(5),
                                  (25, 5, 0, 5))
 
+  @test_util.run_deprecated_v1
   def test_while_single_output(self):
 
     def test_fn(n):
@@ -83,9 +87,10 @@ class ControlFlowTest(converter_testing.TestCase):
       return s
 
     node, ctx = self.prepare(bad_while_loop, {})
-    with self.assertRaises(transformer.AutographParseError):
+    with self.assertRaises(NameError):
       control_flow.transform(node, ctx)
 
+  @test_util.run_deprecated_v1
   def test_if_basic(self):
 
     def test_fn(n):
@@ -100,6 +105,7 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(1), (-1, 0))
     self.assertTransformedResult(test_fn, constant_op.constant(-1), (0, -2))
 
+  @test_util.run_deprecated_v1
   def test_if_complex_outputs(self):
 
     class TestClass(object):
@@ -124,6 +130,7 @@ class ControlFlowTest(converter_testing.TestCase):
         res_obj = result.test_fn(constant_op.constant(-1), TestClass(0, 0))
         self.assertEqual(sess.run((res_obj.a, res_obj.b)), (0, -2))
 
+  @test_util.run_deprecated_v1
   def test_if_single_output(self):
 
     def test_fn(n):
@@ -133,6 +140,7 @@ class ControlFlowTest(converter_testing.TestCase):
 
     self.assertTransformedResult(test_fn, constant_op.constant(1), -1)
 
+  @test_util.run_deprecated_v1
   def test_if_semi(self):
 
     def test_fn(n):
@@ -143,6 +151,7 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(2), 3)
     self.assertTransformedResult(test_fn, constant_op.constant(-3), -3)
 
+  @test_util.run_deprecated_v1
   def test_if_local_var(self):
 
     def test_fn(n):
@@ -154,6 +163,7 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(1), 5)
     self.assertTransformedResult(test_fn, constant_op.constant(-1), -1)
 
+  @test_util.run_deprecated_v1
   def test_if_no_outputs(self):
 
     def test_fn(n):
@@ -177,6 +187,7 @@ class ControlFlowTest(converter_testing.TestCase):
     with self.assertRaises(transformer.AutographParseError):
       control_flow.transform(node, ctx)
 
+  @test_util.run_deprecated_v1
   def test_simple_for(self):
 
     def test_fn(l):
@@ -191,6 +202,7 @@ class ControlFlowTest(converter_testing.TestCase):
     empty_vector = constant_op.constant([], shape=(0,), dtype=dtypes.int32)
     self.assertTransformedResult(test_fn, empty_vector, (0, 0))
 
+  @test_util.run_deprecated_v1
   def test_for_single_output(self):
 
     def test_fn(l):
@@ -232,9 +244,10 @@ class ControlFlowTest(converter_testing.TestCase):
       return s
 
     node, ctx = self.prepare(bad_for_loop, {})
-    with self.assertRaises(transformer.AutographParseError):
+    with self.assertRaises(NameError):
       control_flow.transform(node, ctx)
 
+  @test_util.run_deprecated_v1
   def test_for_tuple_unpacking(self):
     def test_fn(x_list):
       z = tf.constant(0)  # pylint:disable=undefined-variable
diff --git a/tensorflow/python/autograph/converters/decorators.py b/tensorflow/python/autograph/converters/decorators.py
index 724f0fe5eda41ad55ea896d4db1d7e2ceeddc713..f0ea51277468499937089c89eedb344149cb1ae7 100644
--- a/tensorflow/python/autograph/converters/decorators.py
+++ b/tensorflow/python/autograph/converters/decorators.py
@@ -53,11 +53,14 @@ class DecoratorsTransformer(converter.Base):
         # This is currently verified by tests.
         continue
 
+      if not anno.hasanno(dec_func, 'live_val'):
+        raise ValueError('could not resolve the decorator "@%s"' %
+                         (anno.getanno(dec_func, anno.Basic.QN)))
+
       original_dec = anno.getanno(dec_func, anno.Basic.QN)
       dec_value = anno.getanno(dec_func, 'live_val')
 
-      if dec_value in self.ctx.program.autograph_decorators:
-        # AutoGraph decorators do not need to be preserved.
+      if dec_value in self.ctx.program.options.strip_decorators:
         continue
 
       # When using foo.bar.baz, we only really need to grab foo and import
diff --git a/tensorflow/python/autograph/converters/decorators_test.py b/tensorflow/python/autograph/converters/decorators_test.py
index fb31c8d583678eeee5b202642b428b831ddacdf5..abd76849d6eafd92c2d7fa540a30d699e3a57e52 100644
--- a/tensorflow/python/autograph/converters/decorators_test.py
+++ b/tensorflow/python/autograph/converters/decorators_test.py
@@ -57,17 +57,14 @@ def self_transform_decorator(transform):
 
 class DecoratorsTest(converter_testing.TestCase):
 
-  def _transform(self, f, autograph_decorators):
+  def _transform(self, f, strip_decorators):
     namespace = {
         'self_transform_decorator': self_transform_decorator,
         'simple_decorator': simple_decorator,
         'converter_testing': converter_testing,
     }
     node, ctx = self.prepare(
-        f,
-        namespace,
-        recursive=False,
-        autograph_decorators=autograph_decorators)
+        f, namespace, recursive=False, strip_decorators=strip_decorators)
     node = decorators.transform(node, ctx)
     import_line = '\n'.join(ctx.program.additional_imports)
     result, _ = compiler.ast_to_object(node, source_prefix=import_line)
diff --git a/tensorflow/python/autograph/converters/error_handlers_test.py b/tensorflow/python/autograph/converters/error_handlers_test.py
index 676ff9e02baef2d54c95b6de1ba8ca03146ec6a9..1f6c5a682172b54dfd6c1c47f2ac94396db11d43 100644
--- a/tensorflow/python/autograph/converters/error_handlers_test.py
+++ b/tensorflow/python/autograph/converters/error_handlers_test.py
@@ -18,11 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import gast
+
 from tensorflow.python.autograph.converters import error_handlers
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.pyct import anno
-from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.platform import test
 
 
@@ -33,26 +34,20 @@ class ErrorHandlersTest(converter_testing.TestCase):
     def test_fn():
       raise ValueError()
 
-    node, ctx = self.prepare(test_fn, {})
-    anno.setanno(
-        node, anno.Basic.ORIGIN,
-        origin_info.OriginInfo(None, 'test_function_name', 'test_code',
-                               'test_comment'))
-    node = error_handlers.transform(node, ctx)
-    with self.compiled(node, {}) as result:
+    with self.converted(test_fn, error_handlers, {}) as result:
       with self.assertRaises(errors.GraphConstructionError):
-        # Here we just assert that the handler works. Its correctness is
-        # verified by errors_test.py.
+        # Here we just assert that the handler works.
         result.test_fn()
 
   def test_no_origin_annotation(self):
 
-    def test_fn():
-      raise ValueError()
+    def test_fn(x):
+      return x + 1
 
-    with self.converted(test_fn, error_handlers, {}) as result:
-      with self.assertRaises(ValueError):
-        result.test_fn()
+    node, ctx = self.prepare(test_fn, {})
+    anno.delanno(node, anno.Basic.ORIGIN)
+    node = error_handlers.transform(node, ctx)
+    self.assertIsInstance(node.body[0], gast.Return)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/function_scopes_test.py b/tensorflow/python/autograph/converters/function_scopes_test.py
index e5ce03a1090a072c413a71eda5643530dff025bd..5a1248c8015c36882136421bfe4efc7d3dd58831 100644
--- a/tensorflow/python/autograph/converters/function_scopes_test.py
+++ b/tensorflow/python/autograph/converters/function_scopes_test.py
@@ -22,11 +22,13 @@ from tensorflow.python.autograph.converters import function_scopes
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class FunctionBodyTransformerTest(converter_testing.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_basic(self):
 
     def test_fn(l):
@@ -40,6 +42,7 @@ class FunctionBodyTransformerTest(converter_testing.TestCase):
       self.assertIn('test_fn/', result_op.op.name)
       self.assertEqual('Docstring.', result.test_fn.__doc__)
 
+  @test_util.run_deprecated_v1
   def test_multiline_docstring(self):
 
     tf = None
@@ -58,6 +61,7 @@ class FunctionBodyTransformerTest(converter_testing.TestCase):
       self.assertIn('First sentence.', result.test_fn.__doc__)
       self.assertIn('Second sentence.', result.test_fn.__doc__)
 
+  @test_util.run_deprecated_v1
   def test_nested_functions(self):
 
     def test_fn(l):
@@ -74,6 +78,7 @@ class FunctionBodyTransformerTest(converter_testing.TestCase):
       self.assertNotIn('inner_fn', first.op.name)
       self.assertIn('test_fn/inner_fn/', second.op.name)
 
+  @test_util.run_deprecated_v1
   def test_method(self):
 
     class TestClass(object):
diff --git a/tensorflow/python/autograph/converters/lists_test.py b/tensorflow/python/autograph/converters/lists_test.py
index f6da845fcc3f19106073deaa094c0479063c02e7..39843c7d74f7f8e3f9c35d74258df4d3df86355b 100644
--- a/tensorflow/python/autograph/converters/lists_test.py
+++ b/tensorflow/python/autograph/converters/lists_test.py
@@ -68,7 +68,7 @@ class ListTest(converter_testing.TestCase):
       with self.cached_session() as sess:
         tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
-        self.assertAllEqual(sess.run(r), [1, 2, 3])
+        self.assertAllEqual(self.evaluate(r), [1, 2, 3])
 
   def test_list_pop(self):
 
@@ -91,8 +91,8 @@ class ListTest(converter_testing.TestCase):
       with self.cached_session() as sess:
         ts, tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
-        self.assertAllEqual(sess.run(r), [1, 2])
-        self.assertAllEqual(sess.run(ts), 3)
+        self.assertAllEqual(self.evaluate(r), [1, 2])
+        self.assertAllEqual(self.evaluate(ts), 3)
 
   def test_double_list_pop(self):
 
@@ -123,7 +123,7 @@ class ListTest(converter_testing.TestCase):
 
     with self.compiled(node, {}, array_ops.stack, dtypes.int32) as result:
       with self.cached_session() as sess:
-        self.assertAllEqual(sess.run(result.test_fn()), [1, 2, 3])
+        self.assertAllEqual(self.evaluate(result.test_fn()), [1, 2, 3])
 
   # TODO(mdan): Add a test with tf.stack with axis kwarg.
 
diff --git a/tensorflow/python/autograph/converters/logical_expressions.py b/tensorflow/python/autograph/converters/logical_expressions.py
index 8c4d53f9a8ac914746d29e4efd7e8315d6c6a63b..dfcaafdc9eba61bcb3c03432eadf309484d48dee 100644
--- a/tensorflow/python/autograph/converters/logical_expressions.py
+++ b/tensorflow/python/autograph/converters/logical_expressions.py
@@ -28,7 +28,6 @@ from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
 
-
 # TODO(mdan): Properly extrack boolean ops according to lazy eval rules.
 # Note that this isn't completely safe either, because tensors may have control
 # dependencies.
@@ -44,19 +43,22 @@ class LogicalExpressionTransformer(converter.Base):
 
   def __init__(self, ctx):
     super(LogicalExpressionTransformer, self).__init__(ctx)
-    # TODO(mdan): Look into replacing with bitwise operators instead.
-    # TODO(mdan): Skip replacing if the function is trivial.
+    # TODO(mdan): For completeness and consistency, overload everything.
     self.op_mapping = {
-        gast.And: 'tf.logical_and',
-        gast.Eq: 'tf.equal',
-        gast.Gt: 'tf.greater',
-        gast.GtE: 'tf.greater_equal',
-        gast.Lt: 'tf.less',
-        gast.LtE: 'tf.less_equal',
-        gast.Not: 'tf.logical_not',
-        gast.NotEq: 'tf.not_equal',
-        gast.Or: 'tf.logical_or',
-        gast.USub: 'tf.negative',
+        gast.And: 'ag__.and_',
+        gast.Eq: 'ag__.eq',
+        gast.NotEq: 'ag__.not_eq',
+        gast.Lt: 'ag__.lt',
+        gast.LtE: 'ag__.lt_e',
+        gast.Gt: 'ag__.gt',
+        gast.GtE: 'ag__.gt_e',
+        gast.Is: 'ag__.is_',
+        gast.IsNot: 'ag__.is_not',
+        gast.In: 'ag__.in_',
+        gast.Not: 'ag__.not_',
+        gast.NotIn: 'ag__.not_in',
+        gast.Or: 'ag__.or_',
+        gast.USub: 'ag__.u_sub',
     }
 
   def _expect_simple_symbol(self, operand):
@@ -78,27 +80,48 @@ class LogicalExpressionTransformer(converter.Base):
     op_type = type(operator)
     return self.op_mapping[op_type]
 
-  def _as_function(self, func_name, args):
-    template = """
-      func_name(args)
-    """
-    replacement = templates.replace_as_expression(
-        template, func_name=parser.parse_expression(func_name), args=args)
+  def _as_function(self, func_name, args, args_as_lambda=False):
+    if args_as_lambda:
+      args_as_lambda = []
+      for arg in args:
+        template = """
+          lambda: arg
+        """
+        args_as_lambda.append(
+            templates.replace_as_expression(template, arg=arg))
+      args = args_as_lambda
+
+    if not args:
+      template = """
+        func_name()
+      """
+      replacement = templates.replace_as_expression(
+          template, func_name=parser.parse_expression(func_name))
+    elif len(args) == 1:
+      template = """
+        func_name(arg)
+      """
+      replacement = templates.replace_as_expression(
+          template, func_name=parser.parse_expression(func_name), arg=args[0])
+    elif len(args) == 2:
+      template = """
+        func_name(arg1, arg2)
+      """
+      replacement = templates.replace_as_expression(
+          template,
+          func_name=parser.parse_expression(func_name),
+          arg1=args[0],
+          arg2=args[1])
+    else:
+      raise NotImplementedError('{} arguments for {}'.format(
+          len(args), func_name))
+
     anno.setanno(replacement, SAFE_BOOLEAN_OPERAND, True)
     return replacement
 
   def visit_Compare(self, node):
     node = self.generic_visit(node)
 
-    if not all(self._has_matching_func(op) for op in node.ops):
-      if len(node.ops) == 1:
-        # Basic expressions are safe to leave as they are.
-        return node
-      else:
-        raise NotImplementedError(
-            'compound expression with at least one unsupported '
-            'operator: {}'.format(node.ops))
-
     ops_and_comps = list(zip(node.ops, node.comparators))
     left = node.left
     op_tree = None
@@ -113,8 +136,8 @@ class LogicalExpressionTransformer(converter.Base):
         anno.setanno(binary_comparison, SAFE_BOOLEAN_OPERAND, True)
       if op_tree:
         self._expect_simple_symbol(right)
-        op_tree = self._as_function('tf.logical_and',
-                                    (binary_comparison, op_tree))
+        op_tree = self._as_function(
+            'ag__.and_', (op_tree, binary_comparison), args_as_lambda=True)
       else:
         op_tree = binary_comparison
       left = right
@@ -123,7 +146,7 @@ class LogicalExpressionTransformer(converter.Base):
 
   def visit_UnaryOp(self, node):
     node = self.generic_visit(node)
-    return self._as_function(self._matching_func(node.op), node.operand)
+    return self._as_function(self._matching_func(node.op), (node.operand,))
 
   def visit_BoolOp(self, node):
     node = self.generic_visit(node)
@@ -133,7 +156,8 @@ class LogicalExpressionTransformer(converter.Base):
     while node_values:
       left = node_values.pop()
       self._expect_simple_symbol(left)
-      right = self._as_function(self._matching_func(node.op), (left, right))
+      right = self._as_function(
+          self._matching_func(node.op), (left, right), args_as_lambda=True)
     return right
 
 
diff --git a/tensorflow/python/autograph/converters/logical_expressions_test.py b/tensorflow/python/autograph/converters/logical_expressions_test.py
index b78b4d3a6a7a05e33ab3484b5ba3040fb1e86d8c..687412750e0b2d3e7db275f6c25e5923ffaaa831 100644
--- a/tensorflow/python/autograph/converters/logical_expressions_test.py
+++ b/tensorflow/python/autograph/converters/logical_expressions_test.py
@@ -20,34 +20,57 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.converters import logical_expressions
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.ops import math_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-class GradientsFunctionTest(converter_testing.TestCase):
+class LogicalExpressionTest(converter_testing.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_equals(self):
 
     def test_fn(a, b):
       return a == b
 
-    with self.converted(test_fn, logical_expressions, {},
-                        math_ops.equal) as result:
+    with self.converted(test_fn, logical_expressions, {}) as result:
       with self.cached_session() as sess:
-        self.assertTrue(sess.run(result.test_fn(1, 1)))
-        self.assertFalse(sess.run(result.test_fn(1, 2)))
+        self.assertTrue(sess.run(result.test_fn(constant_op.constant(1), 1)))
+        self.assertFalse(sess.run(result.test_fn(constant_op.constant(1), 2)))
 
+  @test_util.run_deprecated_v1
   def test_bool_ops(self):
 
     def test_fn(a, b, c):
-      return (a or b) and (a or b or c)
+      return (a or b) and (a or b or c) and not c
+
+    with self.converted(test_fn, logical_expressions, {}) as result:
+      with self.cached_session() as sess:
+        self.assertTrue(
+            sess.run(result.test_fn(constant_op.constant(True), False, False)))
+        self.assertFalse(
+            sess.run(result.test_fn(constant_op.constant(True), False, True)))
 
-    with self.converted(test_fn, logical_expressions, {}, math_ops.logical_or,
-                        math_ops.logical_and) as result:
+  @test_util.run_deprecated_v1
+  def test_comparison(self):
+
+    def test_fn(a, b, c, d):
+      return a < b == c > d
+
+    with self.converted(test_fn, logical_expressions, {}) as result:
       with self.cached_session() as sess:
-        self.assertTrue(sess.run(result.test_fn(True, False, True)))
+        # Note: having just the first constant a tensor tests that the
+        # operations execute in the correct order. If anything other than
+        # a < b executed first, the result would be a Python scalar and not a
+        # Tensor. This is valid as long as the dispat is automatic based on
+        # type.
+        self.assertTrue(
+            sess.run(result.test_fn(constant_op.constant(1), 2, 2, 1)))
+        self.assertFalse(
+            sess.run(result.test_fn(constant_op.constant(1), 2, 2, 3)))
+
+  def test_default_ops(self):
 
-  def test_unsupported_ops(self):
     def test_fn(a, b):
       return a in b
 
diff --git a/tensorflow/python/autograph/converters/side_effect_guards.py b/tensorflow/python/autograph/converters/side_effect_guards.py
index 6e48e57bde0fffab96db40efe840bf067bf11300..98e29ec8e1b27061371f0328402d8cb45a0f69e7 100644
--- a/tensorflow/python/autograph/converters/side_effect_guards.py
+++ b/tensorflow/python/autograph/converters/side_effect_guards.py
@@ -122,11 +122,12 @@ class SideEffectGuardTransformer(converter.Base):
       # possible, gate all remaining statements (and that may fail too, see
       # _visit_and_reindent.
       args_scope = anno.getanno(node.value, NodeAnno.ARGS_SCOPE)
+      live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
       # NOTE: We can't guard object attributes because they may not be writable.
       # In addition, avoid renaming well-known names.
       # TODO(mdan): Move these names into config.
-      unguarded_names = (qual_names.QN('self'), qual_names.QN('tf'))
-      guarded_args = tuple(s for s in args_scope.used
+      unguarded_names = (qual_names.QN('self'), qual_names.QN('ag__'))
+      guarded_args = tuple(s for s in live_out
                            if not s.is_composite() and s not in unguarded_names)
 
       # TODO(mdan): Include all arguments which depended on guarded_args too.
diff --git a/tensorflow/python/autograph/converters/side_effect_guards_test.py b/tensorflow/python/autograph/converters/side_effect_guards_test.py
index cef3199169c387194a95df72c26f353ad8f58873..645267e56002a999cd497f11f7507449ab900be6 100644
--- a/tensorflow/python/autograph/converters/side_effect_guards_test.py
+++ b/tensorflow/python/autograph/converters/side_effect_guards_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
@@ -34,6 +35,7 @@ tf = None  # Will be replaced by a mock.
 
 class SideEffectGuardsTest(converter_testing.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_side_effect_on_return_only_variable(self):
 
     def test_fn(a):
@@ -48,12 +50,12 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Add support for this use case.
         # Right now the variable `a` is not conditioned on the `assign` because
         # there's no way to add control dependencies to a variable object.
-        self.assertEqual(2, sess.run(v))
+        self.assertEqual(2, self.evaluate(v))
 
   def test_side_effect_on_used_variable(self):
 
@@ -69,12 +71,13 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
         # Right now it's 3 or 4 based on whether the read is synchronized.
-        self.assertEqual(3, sess.run(v))
+        self.assertEqual(3, self.evaluate(v))
 
+  @test_util.run_deprecated_v1
   def test_side_effect_on_tensor(self):
 
     def test_fn(a):
@@ -109,10 +112,10 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign_add) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
-        self.assertEqual(4, sess.run(v))
+        self.assertEqual(4, self.evaluate(v))
 
   def test_multiline_nested_block(self):
 
@@ -130,10 +133,10 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign, ops.name_scope) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
-        self.assertEqual(3, sess.run(v))
+        self.assertEqual(3, self.evaluate(v))
 
   def test_multiline_block_unsafe(self):
 
@@ -153,10 +156,10 @@ class SideEffectGuardsTest(converter_testing.TestCase):
                        state_ops.assign_add) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
-        self.assertEqual(4, sess.run(v))
+        self.assertEqual(4, self.evaluate(v))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/slices_test.py b/tensorflow/python/autograph/converters/slices_test.py
index e190a7cfe8492bef5985f128cf553a0fc17b3b96..bd049afdfcef4c839bcb3d9ba5444d885c3061cc 100644
--- a/tensorflow/python/autograph/converters/slices_test.py
+++ b/tensorflow/python/autograph/converters/slices_test.py
@@ -49,7 +49,7 @@ class SliceTest(converter_testing.TestCase):
         tl = list_ops.tensor_list_from_tensor(
             [1, 2], element_shape=constant_op.constant([], dtype=dtypes.int32))
         y = result.test_fn(tl)
-        self.assertEqual(2, sess.run(y))
+        self.assertEqual(2, self.evaluate(y))
 
   def test_index_access_multiple_definitions(self):
 
diff --git a/tensorflow/python/autograph/core/config.py b/tensorflow/python/autograph/core/config.py
index 4fa8489af57cf50a86622b1b55c134c06736c0e5..574f819504e526420dd1956359dc974869d735f3 100644
--- a/tensorflow/python/autograph/core/config.py
+++ b/tensorflow/python/autograph/core/config.py
@@ -45,5 +45,4 @@ NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 # TODO(mdan); Consolidate all internal imports into a single __ag module.
 COMPILED_IMPORT_STATEMENTS = (
     'from __future__ import print_function',
-    'import tensorflow as tf',
 )
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index 408a573ad0aa206cf75e5e399221e44a441a0a10..e88c4674ee24867dec32d62589afdc2e48dfcace 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -40,7 +40,7 @@ converter.ProgramContext contains mutable state across related entities. For
 example, when converting several functions that call one another, the
 ProgramContext should be shared across these entities.
 
-Below is the overal flow at conversion:
+Below is the overall flow at conversion:
 
     program_ctx = ProgramContext(<entities to convert>, <global settings>, ...)
     while <program_ctx has more entities to convert>:
@@ -64,6 +64,7 @@ from __future__ import division
 from __future__ import print_function
 
 from enum import Enum
+from enum import IntEnum
 
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import naming
@@ -71,13 +72,17 @@ from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import cfg
 from tensorflow.python.autograph.pyct import compiler
+from tensorflow.python.autograph.pyct import inspect_utils
+from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import qual_names
+from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.autograph.pyct.static_analysis import activity
 from tensorflow.python.autograph.pyct.static_analysis import live_values
 from tensorflow.python.autograph.pyct.static_analysis import liveness
 from tensorflow.python.autograph.pyct.static_analysis import reaching_definitions
 from tensorflow.python.autograph.pyct.static_analysis import type_info
+from tensorflow.python.eager import function
 
 # TODO(mdan): These contexts can be refactored into first class objects.
 # For example, we could define Program and Entity abstractions that hold on
@@ -86,43 +91,179 @@ from tensorflow.python.autograph.pyct.static_analysis import type_info
 # TODO(mdan): Add a test specific to this converter.
 
 
+class Verbosity(IntEnum):
+  """Different levels of verbosity for printing errors.
+
+  Attributes:
+   * BRIEF: No logging, minimal error messages.
+   * VERBOSE: Detailed logging of generated code, detailed error messages.
+  """
+  BRIEF = 0
+  VERBOSE = 1
+
+
+class Feature(Enum):
+  """Constants to use when selecting AutoGraph features."""
+
+  ALL = 'Enable all features.'
+
+  AUTO_CONTROL_DEPS = (
+      'Insert of control dependencies in the generated code.')
+  DECORATORS = (
+      'Allow decorators in local functions. Note that special decorators,'
+      ' like ag.convert or tf.function are allowed regardless of this toggle.')
+  ERROR_REWRITING = (
+      'Rewrite errors that occur in the generated code to indicate the source'
+      ' code to which the failing code corresponds.')
+  LISTS = 'Convert list idioms, like initializers, slices, append, etc.'
+  NAME_SCOPES = (
+      'Insert name scopes that name ops according to context, like the'
+      ' function they were defined in.')
+
+  def __repr__(self):
+    return self.name
+
+
+class ConversionOptions(object):
+  """Immutable container for global conversion flags.
+
+  Attributes:
+    recursive: bool, whether to recursively convert any user functions or
+      classes that the converted function may use.
+    verbose: Verbosity, the level of verbosity to use.
+    strip_decorators: Tuple[Callable], contains decorators that should be in
+      excluded from the compiled output. By default, when converting a function
+      before the decorators are applied, the compiled output will include those
+      decorators.
+    force_conversion: bool, whether to force convertinng the target entity. When
+      force_conversion is turned off, the converter may decide to return the
+      function as-is.
+    optional_features: Union[Feature, Set[Feature]], controls the use of
+      optional features in the conversion process. See Feature for available
+      options.
+  """
+
+  def __init__(self,
+               recursive=False,
+               verbose=Verbosity.VERBOSE,
+               strip_decorators=None,
+               force_conversion=False,
+               internal_convert_user_code=True,
+               optional_features=Feature.ALL):
+    self.recursive = recursive
+    self.verbose = verbose
+    self._strip_decorators = strip_decorators or ()
+    self.force_conversion = force_conversion
+    # TODO(mdan): Rename to conversion_recursion_depth?
+    self.internal_convert_user_code = internal_convert_user_code
+
+    if isinstance(optional_features, Feature):
+      optional_features = (optional_features,)
+    optional_features = frozenset(optional_features)
+    self.optional_features = optional_features
+
+  @property
+  def strip_decorators(self):
+    # A few decorators are included by default.
+    # TODO(mdan): Revert if function.defun becomes a public symbol.
+    return self._strip_decorators + (function.defun,)
+
+  def uses(self, feature):
+    return (Feature.ALL in self.optional_features or
+            feature in self.optional_features)
+
+  def to_ast(self, namespace, internal_convert_user_code=None):
+    """Returns a representation of this object as an AST node.
+
+    The AST node encodes a constructor that would create an object with the
+    same contents.
+
+    Args:
+      namespace: Dict[str, Any], the namespace to use when serializing values to
+        names.
+      internal_convert_user_code: Optional[bool], allows ovrriding the
+        corresponding value.
+
+    Returns:
+      ast.Node
+    """
+    template = """
+      constructor_name(
+          recursive=recursive_val,
+          verbose=verbose_val,
+          strip_decorators=strip_decorators_val,
+          force_conversion=force_conversion_val,
+          optional_features=optional_features_val,
+          internal_convert_user_code=internal_convert_user_code_val)
+    """
+
+    def as_qualified_name(o):
+      name = inspect_utils.getqualifiedname(namespace, o)
+      if not name:
+        raise ValueError('Could not locate entity {} in {}'.format(
+            o, namespace))
+      return name
+
+    def list_of_names(values):
+      return parser.parse_expression('({})'.format(', '.join(
+          tuple(as_qualified_name(v) for v in values))))
+
+    def list_of_features(values):
+      return parser.parse_expression('({})'.format(', '.join(
+          'ag__.Feature.{}'.format(v)
+          for v in Feature.__members__
+          if v in values)))
+
+    if internal_convert_user_code is not None:
+      internal_convert_user_code = self.internal_convert_user_code
+
+    expr_ast = templates.replace(
+        template,
+        constructor_name=parser.parse_expression(
+            as_qualified_name(ConversionOptions)),
+        recursive_val=parser.parse_expression(str(self.recursive)),
+        verbose_val=parser.parse_expression(str(int(self.verbose))),
+        strip_decorators_val=list_of_names(self._strip_decorators),
+        force_conversion_val=parser.parse_expression(
+            str(self.force_conversion)),
+        internal_convert_user_code_val=parser.parse_expression(
+            str(internal_convert_user_code)),
+        optional_features_val=list_of_features(self.optional_features))
+    return expr_ast[0].value
+
+
 class ProgramContext(object):
   """ProgramContext keeps track of converting function hierarchies.
 
   This object is mutable, and is updated during conversion. Not thread safe.
 
   Attributes:
-    recursive: bool, whether to recursively convert any functions that the
-        decorator function may call.
-    autograph_decorators: Tuple[Callable, ...], decorator functions that belong
-        to AutoGraph. These require special treatment.
+    options: ConversionOptions
     dependency_cache: Dict[Any, ast.AST], the original entities mapped to their
-        converted AST
+      converted AST
     additional_imports: Set[Any], additional entities which for any reason
-        cannot be attached after loading and need to be explicitly imported
-        in the generated code
-    name_map: Dict[str, str], map of original entity name to the name of
-        their converted counterparts
-    autograph_module: Module, a reference to the autograph module. This
-        needs to be specified by the caller to avoid circular dependencies.
+      cannot be attached after loading and need to be explicitly imported in the
+      generated code
+    name_map: Dict[str, str], map of original entity name to the name of their
+      converted counterparts
+    autograph_module: Module, a reference to the autograph module. This needs to
+      be specified by the caller to avoid circular dependencies.
     uncompiled_modules: Set[Tuple[str, ...]], with each tuple representing the
-        fully qualified name of a package containing functions that will not be
-        compiled.
+      fully qualified name of a package containing functions that will not be
+      compiled.
     required_imports: str, containing an import statement on each line. These
-        are all the imports necessary for the compiled code to run, in addition
-        to the closures of each entity, which are attached dynamically.
+      are all the imports necessary for the compiled code to run, in addition to
+      the closures of each entity, which are attached dynamically.
   """
 
   def __init__(
       self,
-      recursive,
-      autograph_decorators,
+      options,
       partial_types,
       autograph_module,
       uncompiled_modules,
   ):
-    self.recursive = recursive
-    self.autograph_decorators = autograph_decorators
+    self.options = options
     self.partial_types = partial_types if partial_types else ()
     self.autograph_module = autograph_module
     self.uncompiled_modules = uncompiled_modules
@@ -140,7 +281,7 @@ class ProgramContext(object):
                      tuple(self.additional_imports))
 
   def new_namer(self, namespace):
-    return naming.Namer(namespace, self.recursive, self.name_map,
+    return naming.Namer(namespace, self.options.recursive, self.name_map,
                         self.partial_types)
 
   def update_name_map(self, namer):
@@ -294,7 +435,7 @@ def standard_analysis(node, context, is_initial=False):
     node: ast.AST
     context: converter.EntityContext
     is_initial: bool, whether this is the initial analysis done on the input
-        source code
+      source code
 
   Returns:
     ast.AST, same as node, with the static analysis annotations added
diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index dc2d419d348d14c2f4b3bd8026f51b0cd020ca49..f1374081d3c6e0dd93c39d331c76404859b2f40a 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -30,7 +30,10 @@ from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import compiler
+from tensorflow.python.autograph.pyct import inspect_utils
+from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import transformer
@@ -41,7 +44,7 @@ def imported_decorator(f):
   return lambda a: f(a) + 1
 
 
-# TODO(mdan): We might be able to use the real namer here.
+# TODO(mdan): We should use the real namer here.
 class FakeNamer(object):
   """A fake namer that uses a global counter to generate unique names."""
 
@@ -59,7 +62,8 @@ class FakeNamer(object):
                              original_fqn,
                              live_entity=None,
                              owner_type=None):
-    del live_entity
+    if inspect_utils.islambda(live_entity):
+      return None, False
     if owner_type is not None:
       return None, False
     return ('renamed_%s' % '_'.join(original_fqn)), True
@@ -94,22 +98,15 @@ class TestCase(test.TestCase):
       self.dynamic_calls.append(args)
       return 7
 
-    class ConversionOptions(object):
-      """Mock version of api.ConversionOptions."""
-
-      def __init__(self, recursive):
-        self.recursive = recursive
-
-      @classmethod
-      def new(cls, recursive):
-        cls(recursive)
-
     try:
       result, source = compiler.ast_to_object(node, include_source_map=True)
 
+      # TODO(mdan): Move this into self.prepare()
       result.tf = self.make_fake_mod('fake_tf', *symbols)
-      fake_ag = self.make_fake_mod('fake_ag', converted_call, ConversionOptions)
+      fake_ag = self.make_fake_mod('fake_ag', converted_call,
+                                   converter.ConversionOptions)
       fake_ag.__dict__.update(operators.__dict__)
+      fake_ag.__dict__.update(special_functions.__dict__)
       fake_ag.__dict__['utils'] = utils
       fake_ag.__dict__['rewrite_graph_construction_error'] = (
           errors.rewrite_graph_construction_error)
@@ -128,7 +125,13 @@ class TestCase(test.TestCase):
   @contextlib.contextmanager
   def converted(self, entity, converter_module, namespace, *tf_symbols):
     node, ctx = self.prepare(entity, namespace)
-    node = converter_module.transform(node, ctx)
+
+    if not isinstance(converter_module, (list, tuple)):
+      converter_module = (converter_module,)
+    for i, m in enumerate(converter_module):
+      node = converter.standard_analysis(node, ctx, is_initial=not i)
+      node = m.transform(node, ctx)
+
     with self.compiled(node, namespace, *tf_symbols) as result:
       yield result
 
@@ -155,14 +158,18 @@ class TestCase(test.TestCase):
               arg_types=None,
               owner_type=None,
               recursive=True,
-              autograph_decorators=()):
+              strip_decorators=()):
+    namespace['ConversionOptions'] = converter.ConversionOptions
+
     node, source = parser.parse_entity(test_fn)
     node = node.body[0]
     if namer is None:
       namer = FakeNamer()
     program_ctx = converter.ProgramContext(
-        recursive=recursive,
-        autograph_decorators=autograph_decorators,
+        options=converter.ConversionOptions(
+            recursive=recursive,
+            strip_decorators=strip_decorators,
+            verbose=True),
         partial_types=None,
         autograph_module=None,
         uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
@@ -174,5 +181,6 @@ class TestCase(test.TestCase):
         arg_types=arg_types,
         owner_type=owner_type)
     ctx = converter.EntityContext(namer, entity_info, program_ctx)
+    origin_info.resolve(node, source, test_fn)
     node = converter.standard_analysis(node, ctx, is_initial=True)
     return node, ctx
diff --git a/tensorflow/python/autograph/core/errors_test.py b/tensorflow/python/autograph/core/errors_test.py
index aa6c293268c86892ea076000a112fc6a3012b2ab..845a28a5222a77d0d2a2ee49f6edb86f57ddb6a6 100644
--- a/tensorflow/python/autograph/core/errors_test.py
+++ b/tensorflow/python/autograph/core/errors_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors as tf_errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
@@ -47,6 +48,7 @@ class RuntimeErrorsTest(test.TestCase):
                                     'test_comment')
     return loc, origin
 
+  @test_util.run_deprecated_v1
   def test_improved_errors_basic(self):
     loc, origin = self.fake_origin(zero_div, 2)
     zero_div_caller.ag_source_map = {loc: origin}
@@ -55,13 +57,14 @@ class RuntimeErrorsTest(test.TestCase):
     with self.assertRaises(errors.TfRuntimeError) as cm:
       with errors.improved_errors(zero_div_caller):
         with self.cached_session() as sess:
-          sess.run(ops)
+          self.evaluate(ops)
 
     for frame in cm.exception.custom_traceback:
       _, _, function_name, _ = frame
       self.assertNotEqual('zero_div', function_name)
     self.assertIn(origin.as_frame(), set(cm.exception.custom_traceback))
 
+  @test_util.run_deprecated_v1
   def test_improved_errors_no_matching_lineno(self):
     loc, origin = self.fake_origin(zero_div, -1)
     zero_div_caller.ag_source_map = {loc: origin}
@@ -70,7 +73,7 @@ class RuntimeErrorsTest(test.TestCase):
     with self.assertRaises(errors.TfRuntimeError) as cm:
       with errors.improved_errors(zero_div_caller):
         with self.cached_session() as sess:
-          sess.run(ops)
+          self.evaluate(ops)
 
     all_function_names = set()
     for frame in cm.exception.custom_traceback:
@@ -79,6 +82,7 @@ class RuntimeErrorsTest(test.TestCase):
       self.assertNotEqual('test_function_name', function_name)
     self.assertIn('zero_div', all_function_names)
 
+  @test_util.run_deprecated_v1
   def test_improved_errors_failures(self):
     loc, _ = self.fake_origin(zero_div, 2)
     zero_div_caller.ag_source_map = {loc: 'bogus object'}
@@ -87,7 +91,7 @@ class RuntimeErrorsTest(test.TestCase):
     with self.assertRaises(tf_errors.InvalidArgumentError):
       with errors.improved_errors(zero_div_caller):
         with self.cached_session() as sess:
-          sess.run(ops)
+          self.evaluate(ops)
 
   def test_improved_errors_validation(self):
     with self.assertRaisesRegexp(
diff --git a/tensorflow/python/autograph/core/function_wrapping_test.py b/tensorflow/python/autograph/core/function_wrapping_test.py
index 5e217055c7154dbcabed06be157a25e4068f2ddb..7e21b979dbcd24f815f2d7ce88ad9ec1f6690507 100644
--- a/tensorflow/python/autograph/core/function_wrapping_test.py
+++ b/tensorflow/python/autograph/core/function_wrapping_test.py
@@ -20,11 +20,13 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.core import function_wrapping
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class FunctionWrappingTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_function_scope_name(self):
     with function_wrapping.function_scope('test_name'):
       t = constant_op.constant(1)
diff --git a/tensorflow/python/autograph/core/naming.py b/tensorflow/python/autograph/core/naming.py
index aecc9e33caaed9e336fedc6fcc5a02cc176c7861..b8d79daebaa6d6dcf5f324f637a3b496f3742b92 100644
--- a/tensorflow/python/autograph/core/naming.py
+++ b/tensorflow/python/autograph/core/naming.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import qual_names
 
 
@@ -76,6 +77,9 @@ class Namer(object):
     if not self.recursive:
       return None, False
 
+    if (live_entity is not None and inspect_utils.islambda(live_entity)):
+      return None, False
+
     if owner_type is not None and owner_type not in self.partial_types:
       # Members are not renamed when part of an entire converted class.
       return None, False
diff --git a/tensorflow/python/autograph/impl/BUILD b/tensorflow/python/autograph/impl/BUILD
index bef62a640384bd5ba2501aa179ae4fd6918f7141..201a88875413982b0f1a791f3408b403a3259eb8 100644
--- a/tensorflow/python/autograph/impl/BUILD
+++ b/tensorflow/python/autograph/impl/BUILD
@@ -31,6 +31,7 @@ py_library(
         "//tensorflow/python/autograph/pyct",
         "//tensorflow/python/autograph/pyct/static_analysis",
         "//tensorflow/python/autograph/utils",
+        "//third_party/py/numpy",
         "@gast_archive//:gast",
         "@six_archive//:six",
     ],
@@ -40,7 +41,6 @@ py_test(
     name = "api_test",
     srcs = ["api_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/python:client_testlib",
@@ -53,7 +53,6 @@ py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index 1dc97d2331b7a823681e398d3602750350e049c6..f7774888c8a5ccb8a64186476d6e78b999e527ba 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -18,11 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import functools
+import sys
 
 from enum import Enum
 
+# pylint:disable=g-bad-import-order
+import numpy as np
+# pylint:enable=g-bad-import-order
+
+
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import conversion
@@ -30,6 +35,8 @@ from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.utils import py_func
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
@@ -39,43 +46,13 @@ from tensorflow.python.util import tf_inspect
 # (currently we require (module + class name, type))
 
 
-class ConversionOptions(
-    collections.namedtuple('ConversionOptions',
-                           ('recursive', 'verbose', 'strip_decorators',
-                            'force_conversion', 'arg_types'))):
-  """Container for conversion flags.
-
-  Attributes:
-    recursive: bool, whether to recursively convert any user functions or
-        classes that the converted function may use.
-    verbose: bool, whether to log the compiled code.
-    strip_decorators: Tuple[Callable], contains decorators that should be in
-        excluded from the compiled output. By default, when converting a
-        function before the decorators are applied, the compiled output will
-        include those decorators.
-    force_conversion: bool, whether to force convertinng the target entity.
-        When force_conversion is turned off, the converter may decide to
-        return the function as-is.
-    arg_types: Optional[Dict[Text, Type]], type hints for symbols including
-        function arguments.
-  """
-
-  @classmethod
-  def new(cls,
-          recursive=False,
-          verbose=False,
-          strip_decorators=None,
-          force_conversion=False,
-          arg_types=None):
-    return cls(recursive=recursive,
-               verbose=verbose,
-               strip_decorators=strip_decorators or (),
-               force_conversion=force_conversion,
-               arg_types=arg_types or {})
-
-
 # TODO(mdan): This should behave like to_graph (e.g. convert statically).
-def convert(recursive=False, verbose=False):
+# TODO(znado): Make an alias so can write Verbosity directly without needing
+# to write converter.
+def convert(
+    recursive=False,
+    verbose=converter.Verbosity.BRIEF,
+    optional_features=converter.Feature.ALL):
   """Decorator that compiles a function to use TensorFlow ops.
 
   The decorator is dynamic - it recompiles the target whenever the decorated
@@ -85,31 +62,36 @@ def convert(recursive=False, verbose=False):
 
   Args:
     recursive: bool, whether to recursively convert any functions or classes
-        that the converted function may use.
-    verbose: bool, whether to output the compiled code in the logs.
+      that the converted function may use.
+    verbose: converter.Verbosity, the level of verbosity.
+    optional_features: converted.Feature, allows toggling optional or
+      experimental features. When set to None, only the core features are
+      enabled.
 
   Returns:
     Callable, a decorator that converts the given function into an equivalent
     function that uses TensorFlow ops.
   """
+
   def decorator(f):
     """Decorator implementation."""
 
     @functools.wraps(f)
     def wrapper(*args, **kwargs):
       return converted_call(
-          f,
-          ConversionOptions.new(
+          f, None,
+          converter.ConversionOptions(
               recursive=recursive,
               verbose=verbose,
               force_conversion=True,
+              optional_features=optional_features,
           ), *args, **kwargs)
 
     wrapper = tf_decorator.make_decorator(f, wrapper)
 
     # Sometimes the decorator is just desugared, making it impossible to detect.
     # This attribute makes detection easier.
-    setattr(wrapper, '__pyct_is_compile_decorator', True)
+    setattr(wrapper, '__ag_compiled', True)
     return wrapper
 
   return decorator
@@ -118,8 +100,7 @@ def convert(recursive=False, verbose=False):
 class RunMode(Enum):
   """Specifies the way a converted function or method should be executed in TF.
 
-  The enum values have the following semantics:
-
+  Attributes:
    * GRAPH: Call this function directly, as-is. This is suitable for functions
        that were already designed for TF graphs and contain ops.
    * PY_FUNC: Wrap this function into a py_func op. This is suitable for code
@@ -137,10 +118,10 @@ def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
 
   Args:
     run_as: RunMode, specifies how to use the function in TensorFlow.
-    return_dtypes: Optional[Iterable[
-        Union[tf.DType, utils.py_func.MatchDType]]], the return data types of
-        the converted function, if run_as is RunMode.PY_FUNC. Ignored otherwise.
-        May be set to None if the function has no return values.
+    return_dtypes: Optional[Iterable[ Union[tf.DType,
+      utils.py_func.MatchDType]]], the return data types of the converted
+      function, if run_as is RunMode.PY_FUNC. Ignored otherwise. May be set to
+      None if the function has no return values.
 
   Returns:
     Callable, a decorator that wraps the original function.
@@ -170,35 +151,96 @@ def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
 
     # Sometimes the decorator is just desugared, making it impossible to detect.
     # This attribute makes detection easier.
-    setattr(wrapper, '__pyct_is_compile_decorator', True)
+    setattr(wrapper, '__ag_compiled', True)
     return wrapper
 
   return decorator
 
 
 # TODO(mdan): Move to a private, undocumented module.
-def converted_call(f, options, *args, **kwargs):
+def converted_call(f, owner, options, *args, **kwargs):
   """Compiles a function call inline. For internal use only."""
+  if options.verbose >= converter.Verbosity.VERBOSE:
+    logging.info('Converted call: {}; owner: {}'.format(f, owner))
+
+  if owner is not None:
+    if not isinstance(f, str):
+      raise ValueError(
+          'When owner is specified, the function name must be specified as'
+          ' a string: {}'.format(f))
+
+    # Special case when the owner is a 'super' object. In that case lookups of
+    # dynamic attributes won't work. See
+    # inspect_utils.SuperWrapperForDynamicAttrs.
+    if isinstance(owner, super):
+      owner = inspect_utils.SuperWrapperForDynamicAttrs(owner)
+
+    f = getattr(owner, f)
+
+  if inspect_utils.isbuiltin(f):
+    return py_builtins.overload_of(f)(*args, **kwargs)
+
   # TODO(mdan): This needs cleanup.
   # In particular, we may want to avoid renaming functions altogether.
   if not options.force_conversion and conversion.is_whitelisted_for_graph(f):
+
+    # Args typically include `self`, as required by the conversion process.
+    # When conversion is skipped, `self` is not necessary, because the
+    # original bound method is being executed. This code removes it.
+    if tf_inspect.ismethod(f) and args:
+      f_class = inspect_utils.getmethodclass(f)
+      if args[0] is f_class:
+        args = args[1:]
+
     return f(*args, **kwargs)
 
-  unknown_arg_value = object()  # Sentinel for arguments of unknown value
+  # internal_convert_user_code is for example turned off when issuing a dynamic
+  # call conversion from generated code while in nonrecursive mode. In that
+  # case we evidently don't want to recurse, but we still have to convert
+  # things like builtins.
+  if not options.internal_convert_user_code:
+    return f(*args, **kwargs)
 
-  if inspect_utils.isbuiltin(f):
-    return py_builtins.overload_of(f)(*args, **kwargs)
+  # Unwrap functools.partial objects
+  # TODO(allenl, mdan): Consider sharing unwrapping logic with tf_inspect.
+  while isinstance(f, functools.partial):
+    args = f.args + args
+    new_kwargs = {}
+    if f.keywords is not None:
+      new_kwargs.update(f.keywords)
+    new_kwargs.update(kwargs)
+    kwargs = new_kwargs
+    f = f.func
 
   if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
     # Regular functions
     target_entity = f
     arg_map_target = f
-    effective_args = args
     f_class = inspect_utils.getmethodclass(f)
 
+    # TODO(b/119246461): This may be more elegantly handled using __get__?
     if f_class is not None:
+      # If this is a method call, it may or may not include self.
+      #
+      # Example when self is included:
+      #   converted_call(to_graph(foo.bar), foo)
+      #
+      # Example when self is not included:
+      #   super(...).foo(args)
+      #
+      if owner is not None and (not args or args[0] is not owner):
+        effective_args = (owner,) + args
+      else:
+        # When the owner is not specified, use the result of
+        # inspect_utils.getmethodclass.
+        # TODO(b/119246461): Make sure an owner is always specified.
+        if not args or args[0] is not f_class:
+          effective_args = (f_class,) + args
+        else:
+          effective_args = (f_class,) + args[1:]
       partial_types = (f_class,)
     else:
+      effective_args = args
       partial_types = ()
 
   elif tf_inspect.isclass(f):
@@ -219,13 +261,10 @@ def converted_call(f, options, *args, **kwargs):
     NotImplementedError('unknown callable type "%s"' % type(f))
 
   arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
+  arg_types = {}
   for name, arg in arg_values.items():
-    if arg is unknown_arg_value:
-      continue
     arg_class = arg.__class__
-    # If arg_value_hints specifies any name, use that instead.
-    if name not in options.arg_types:
-      options.arg_types[name] = (arg_class.__name__, arg_class)
+    arg_types[name] = (arg_class.__name__, arg_class)
 
   # When called from within a decorator, this is the only indication that
   # the function is a method - it appears that the decorator is applied
@@ -243,10 +282,36 @@ def converted_call(f, options, *args, **kwargs):
       recursive=options.recursive,
       verbose=options.verbose,
       arg_values=arg_values,
-      arg_types=options.arg_types,
+      arg_types=arg_types,
       partial_types=partial_types,
-      strip_decorators=options.strip_decorators)
-  return converted_f(*effective_args, **kwargs)
+      strip_decorators=options.strip_decorators,
+      optional_features=options.optional_features)
+
+  result = converted_f(*effective_args, **kwargs)
+
+  # The converted function's closure is simply inserted into the function's
+  # module __dict__. Since modules are permanently cached, that results in
+  # leaking the entire closure.
+  # Normally, it's not safe to delete the module because that may release said
+  # closure as well. However, in the case of converted_call we are certain the
+  # function will not be executed again, so the closure should no longer be
+  # needed so long as the function doesn't return any executable code.
+  # TODO(mdan): Attach the closure properly, using cells.
+  if all(map(_is_not_callable, nest.flatten(result))):
+    del sys.modules[converted_f.__module__]
+
+  return result
+
+
+def _is_not_callable(obj):
+  # TODO(brianklee): Handle case when obj is a tensor dependent on a py_func.
+  if isinstance(obj, (int, float, complex, str, bool)):
+    return True
+  if isinstance(obj, (np.ndarray, np.generic)):
+    return True
+  if tensor_util.is_tensor(obj):
+    return True
+  return False
 
 
 # TODO(mdan): Rename: to_ops?
@@ -254,11 +319,12 @@ def converted_call(f, options, *args, **kwargs):
 # TODO(mdan): Remove partial_types.
 def to_graph(e,
              recursive=True,
-             verbose=False,
+             verbose=converter.Verbosity.VERBOSE,
              arg_values=None,
              arg_types=None,
              partial_types=None,
-             strip_decorators=None):
+             strip_decorators=None,
+             optional_features=converter.Feature.ALL):
   """Converts a Python entity into equivalent code that uses TensorFlow ops.
 
   Supported Python entities include:
@@ -270,15 +336,17 @@ def to_graph(e,
   Args:
     e: Union[Callable, Type], the Python entity to convert.
     recursive: bool, whether to recursively convert any functions that the
-        converted function may call.
-    verbose: bool, whether to output the compiled code in the logs.
+      converted function may call.
+    verbose: converter.Verbosity, the level of printing verbosity to use.
     arg_values: Optional[Dict[Text, Any]], value hints for symbols including
-        function arguments.
+      function arguments.
     arg_types: Optional[Dict[Text, Type]], type hints for symbols including
-        function arguments.
+      function arguments.
     partial_types: Set[Type], reserved for internal use.
     strip_decorators: Tuple[Callable], same as
-        ConversionOptions.strip_decorators.
+      ConversionOptions.strip_decorators.
+    optional_features: Union[Feature, Set[Feature]], same as
+      ConversionOptions.optional_features.
 
   Returns:
     Union[Callable, Type], the converted entity, which is the same kind as e
@@ -293,8 +361,11 @@ def to_graph(e,
   strip_decorators += (convert, do_not_convert, converted_call)
 
   program_ctx = converter.ProgramContext(
-      recursive=recursive,
-      autograph_decorators=strip_decorators,
+      options=converter.ConversionOptions(
+          recursive=recursive,
+          verbose=verbose,
+          strip_decorators=strip_decorators,
+          optional_features=optional_features),
       partial_types=partial_types,
       autograph_module=tf_inspect.getmodule(to_graph),
       uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
@@ -305,7 +376,7 @@ def to_graph(e,
   for dep in reversed(program_ctx.conversion_order):
     nodes.extend(program_ctx.dependency_cache[dep])
 
-  compiled_module, compiled_src = compiler.ast_to_object(
+  compiled_module, _ = compiler.ast_to_object(
       nodes,
       source_prefix=program_ctx.required_imports,
       include_source_map=True)
@@ -318,6 +389,14 @@ def to_graph(e,
       compiled_module.__dict__[key] = val
   compiled = getattr(compiled_module, name)
 
+  if tf_inspect.isfunction(e):
+    compiled.__defaults__ = e.__defaults__
+
+  if hasattr(compiled, '__globals__'):
+    # Remove self to avoid circular references. This will probably only work
+    # so long as the function is not reentrant.
+    del compiled.__globals__[name]
+
   # Need this so the source_mapping attribute is available for the context
   # manager to access for runtime errors.
   #
@@ -333,9 +412,6 @@ def to_graph(e,
   setattr(compiled, source_map_attribute_name,
           compiled_module.__dict__['ag_source_map__'])
 
-  if verbose:
-    logging.info('Compiled output of %s:\n\n%s\n', e, compiled_src)
-
   return compiled
 
 
@@ -352,11 +428,11 @@ def to_code(e,
   Args:
     e: Union[Callable, Type], the Python entity to convert.
     recursive: bool, whether to recursively convert any functions that the
-        converted function may call.
+      converted function may call.
     arg_values: Optional[Dict[Text, Any]], value hints for symbols including
-        function arguments.
+      function arguments.
     arg_types: Optional[Dict[Text, Type]], type hints for symbols including
-        function arguments.
+      function arguments.
     partial_types: Set[Type], reserved for internal use.
     indentation: Text, when to use for each level of indentation.
 
@@ -364,8 +440,9 @@ def to_code(e,
     Text, the converted code.
   """
   program_ctx = converter.ProgramContext(
-      recursive=recursive,
-      autograph_decorators=(convert, do_not_convert, converted_call),
+      options=converter.ConversionOptions(
+          recursive=recursive,
+          strip_decorators=(convert, do_not_convert, converted_call)),
       partial_types=partial_types,
       autograph_module=tf_inspect.getmodule(to_graph),
       uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py
index 8ce5022c0a06011aad3fa36c9a7311a6ce64a784..d5561ba8249f539e720fa1ecb5800b76c61a8c2f 100644
--- a/tensorflow/python/autograph/impl/api_test.py
+++ b/tensorflow/python/autograph/impl/api_test.py
@@ -18,27 +18,34 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import gc
+
 import numpy as np
 
 from tensorflow.python.autograph import utils
-from tensorflow.python.autograph.core import config
+from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import api
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.utils import py_func
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.layers import core
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
 
-
 tf = utils.fake_tf()
 
-class ApiTest(test.TestCase):
 
-  def setUp(self):
-    config.COMPILED_IMPORT_STATEMENTS = (
-        'from __future__ import print_function',
-    )
+class TestResource(str):
+  pass
+
 
+class ApiTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
   def test_decorator_recurses(self):
 
     class TestClass(object):
@@ -59,8 +66,9 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
+  @test_util.run_deprecated_v1
   def test_decorator_does_not_recurse(self):
 
     class TestClass(object):
@@ -79,8 +87,9 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
+  @test_util.run_deprecated_v1
   def test_decorator_calls_unconverted_graph(self):
 
     class TestClass(object):
@@ -100,8 +109,9 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
+  @test_util.run_deprecated_v1
   def test_decorator_calls_unconverted_py_func(self):
 
     class TestClass(object):
@@ -126,8 +136,9 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
+  @test_util.run_deprecated_v1
   def test_decorator_calls_decorated(self):
 
     class TestClass(object):
@@ -149,7 +160,7 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_decorator_preserves_argspec(self):
 
@@ -167,6 +178,7 @@ class ApiTest(test.TestCase):
         list(tf_inspect.getfullargspec(tc.called_member)),
         list(tf_inspect.getfullargspec(tc.called_member_converted)))
 
+  @test_util.run_deprecated_v1
   def test_convert_call_site_decorator(self):
 
     class TestClass(object):
@@ -179,9 +191,8 @@ class ApiTest(test.TestCase):
       @api.convert(recursive=True)
       def test_method(self, x, s, a):
         while tf.reduce_sum(x) > s:
-          x //= api.converted_call(
-              self.called_member,
-              api.ConversionOptions.new(), self, a)
+          x //= api.converted_call(self.called_member, None,
+                                   converter.ConversionOptions(), self, a)
         return x
 
     tc = TestClass()
@@ -189,10 +200,10 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_converted_call_builtin(self):
-    x = api.converted_call(range, api.ConversionOptions.new(), 3)
+    x = api.converted_call(range, None, converter.ConversionOptions(), 3)
     self.assertEqual((0, 1, 2), tuple(x))
 
   def test_converted_call_function(self):
@@ -203,9 +214,38 @@ class ApiTest(test.TestCase):
       return x
 
     with self.cached_session() as sess:
-      x = api.converted_call(test_fn, api.ConversionOptions.new(),
+      x = api.converted_call(test_fn, None, converter.ConversionOptions(),
                              constant_op.constant(-1))
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
+
+  @test_util.run_v1_only('b/120545219')
+  def test_converted_call_functools_partial(self):
+
+    def test_fn(x, y, z):
+      if x < 0:
+        return -x, -y, -z
+      return x, y, z
+
+    x = api.converted_call(
+        functools.partial(test_fn, constant_op.constant(-1), z=-3),
+        None, converter.ConversionOptions(),
+        constant_op.constant(-2))
+    self.assertEqual((1, 2, 3), self.evaluate(x))
+
+    x = api.converted_call(
+        functools.partial(
+            functools.partial(test_fn, constant_op.constant(-1)), z=-3),
+        None, converter.ConversionOptions(),
+        constant_op.constant(-2))
+    self.assertEqual((1, 2, 3), self.evaluate(x))
+
+  def test_converted_call_method_explicit_owner(self):
+    # TODO(mdan): Implement.
+    pass
+
+  def test_converted_call_method_explicit_super_owner(self):
+    # TODO(mdan): Implement.
+    pass
 
   def test_converted_call_method(self):
 
@@ -221,8 +261,9 @@ class ApiTest(test.TestCase):
 
     with self.cached_session() as sess:
       tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(tc.test_method, api.ConversionOptions.new(), tc)
-      self.assertEqual(1, sess.run(x))
+      x = api.converted_call(tc.test_method, None,
+                             converter.ConversionOptions(), tc)
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_method_by_class(self):
 
@@ -238,10 +279,9 @@ class ApiTest(test.TestCase):
 
     with self.cached_session() as sess:
       tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(
-          TestClass.test_method,
-          api.ConversionOptions.new(), tc)
-      self.assertEqual(1, sess.run(x))
+      x = api.converted_call(TestClass.test_method, None,
+                             converter.ConversionOptions(), tc)
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_callable_object(self):
 
@@ -257,8 +297,8 @@ class ApiTest(test.TestCase):
 
     with self.cached_session() as sess:
       tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(tc, api.ConversionOptions.new())
-      self.assertEqual(1, sess.run(x))
+      x = api.converted_call(tc, None, converter.ConversionOptions())
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_constructor(self):
 
@@ -273,11 +313,11 @@ class ApiTest(test.TestCase):
         return self.x
 
     with self.cached_session() as sess:
-      tc = api.converted_call(TestClass, api.ConversionOptions.new(),
+      tc = api.converted_call(TestClass, None, converter.ConversionOptions(),
                               constant_op.constant(-1))
       # tc is now a converted object.
       x = tc.test_method()
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_already_converted(self):
 
@@ -285,15 +325,91 @@ class ApiTest(test.TestCase):
       return x == 0
 
     with self.cached_session() as sess:
-      x = api.converted_call(f, api.ConversionOptions.new(),
+      x = api.converted_call(f, None, converter.ConversionOptions(),
                              constant_op.constant(0))
-      self.assertTrue(sess.run(x))
+      self.assertTrue(self.evaluate(x))
 
       converted_f = api.to_graph(f)
-      x = api.converted_call(converted_f, api.ConversionOptions.new(),
+      x = api.converted_call(converted_f, None, converter.ConversionOptions(),
                              constant_op.constant(0))
-      self.assertTrue(sess.run(x))
+      self.assertTrue(self.evaluate(x))
+
+  @test_util.run_deprecated_v1
+  def test_converted_call_no_user_code(self):
+
+    def f(x):
+      return len(x)
+
+    opts = converter.ConversionOptions(internal_convert_user_code=False)
+
+    # f should not be converted, causing len to error out.
+    with self.assertRaisesRegexp(Exception,
+                                 'object of type \'Tensor\' has no len()'):
+      api.converted_call(f, None, opts, constant_op.constant([0]))
+
+    # len on the other hand should work fine.
+    x = api.converted_call(len, None, opts, constant_op.constant([0]))
+    # The constant has static shape so the result is a primitive not a Tensor.
+    self.assertEqual(x, 1)
+
+  def test_converted_call_whitelisted_method(self):
+
+    opts = converter.ConversionOptions()
+
+    model = sequential.Sequential([
+        core.Dense(2)
+    ])
+
+    x = api.converted_call(model.call, None, opts,
+                           constant_op.constant([[0.0]]), training=True)
+
+    with self.cached_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+
+  def test_converted_call_whitelisted_method_extra_self(self):
+
+    opts = converter.ConversionOptions()
+
+    model = sequential.Sequential([
+        core.Dense(2)
+    ])
+
+    x = api.converted_call(model.call, None, opts,
+                           model, constant_op.constant([[0.0]]), training=True)
+
+    with self.cached_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
+  def test_converted_call_whitelisted_method_via_owner(self):
+
+    opts = converter.ConversionOptions()
+
+    model = sequential.Sequential([
+        core.Dense(2)
+    ])
+
+    x = api.converted_call('call', model, opts,
+                           constant_op.constant([[0.0]]), training=True)
+
+    with self.cached_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+
+  def test_converted_call_lambda(self):
+
+    opts = converter.ConversionOptions()
+
+    l = lambda x: x == 0
+
+    x = api.converted_call(l, None, opts, constant_op.constant(0))
+
+    with self.cached_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(True, self.evaluate(x))
+
+  @test_util.run_deprecated_v1
   def test_to_graph_basic(self):
 
     def test_fn(x, s):
@@ -305,7 +421,23 @@ class ApiTest(test.TestCase):
 
     with self.cached_session() as sess:
       x = compiled_fn(constant_op.constant([4, 8]), 4)
-      self.assertListEqual([1, 2], sess.run(x).tolist())
+      self.assertListEqual([1, 2], self.evaluate(x).tolist())
+
+  @test_util.run_deprecated_v1
+  def test_to_graph_with_defaults(self):
+
+    foo = 4
+
+    def test_fn(x, s=foo):
+      while tf.reduce_sum(x) > s:
+        x //= 2
+      return x
+
+    compiled_fn = api.to_graph(test_fn)
+
+    with self.cached_session() as sess:
+      x = compiled_fn(constant_op.constant([4, 8]))
+      self.assertListEqual([1, 2], self.evaluate(x).tolist())
 
   def test_to_code_basic(self):
 
@@ -326,6 +458,39 @@ class ApiTest(test.TestCase):
 
     self.assertTrue(hasattr(api.to_graph(test_fn), 'ag_source_map'))
 
+  def assertNoMemoryLeaks(self, target_f):
+    refs_before = set(id(obj) for obj in gc.get_objects())
+    target_f()
+    gc.collect()
+    objs_after = [obj for obj in gc.get_objects() if id(obj) not in refs_before]
+    leaked = [obj for obj in objs_after if isinstance(obj, TestResource)]
+    self.assertFalse(leaked,
+                     'Resources {} were leaked by AutoGraph.'.format(leaked))
+
+  def test_no_module_memory_leak(self):
+    def f():
+      resource = TestResource('some-resource')
+      @api.convert()
+      def target(x):
+        return x + resource, 42
+      self.assertEqual(target('foo'), ('foosome-resource', 42))
+
+    self.assertNoMemoryLeaks(f)
+
+  def test_no_module_memory_leak_deferred_call(self):
+    def f():
+      resource = TestResource('some-resource')
+      @api.convert()
+      def target(x):
+        def inner_fn():
+          return x + resource
+        return inner_fn, 42
+      self.assertEqual(target('foo')[0](), 'foosome-resource')
+
+    f()
+    # TODO(brianklee): Reenable when we've revised module loading approach.
+    # self.assertNoMemoryLeaks(f)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index 52abd4062688f706be05f7e1b1c5a3cc7c101489..f8decd24e8e2eb5bcad22ba64d1865e8497363e3 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -18,12 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import imp
 
 import gast
 
 from tensorflow.python.autograph import operators
 from tensorflow.python.autograph import utils
+from tensorflow.python.autograph.converters import arg_defaults
 from tensorflow.python.autograph.converters import asserts
 from tensorflow.python.autograph.converters import break_statements
 from tensorflow.python.autograph.converters import builtin_functions
@@ -44,13 +46,16 @@ from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import ast_util
+from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import qual_names
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct import transformer
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_inspect
 
 
@@ -68,12 +73,31 @@ def is_whitelisted_for_graph(o):
   Returns:
     Boolean
   """
-  m = tf_inspect.getmodule(o)
+  # TODO(b/120224672): Fix this.
+  if isinstance(o, functools.partial):
+    # tf_inspect.getmodule(functools.partial(...)) otherwise returns None since
+    # functools.partial objects do not have a __module__ attribute.
+    m = functools
+  else:
+    m = tf_inspect.getmodule(o)
   for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
     if m.__name__.startswith(prefix):
       return True
+
   if hasattr(o, 'autograph_info__'):
     return True
+
+  if inspect_utils.isnamedtuple(o):
+    # Due to the way they're constructed, namedtuple types cannot be converted
+    # because they don't expose source code. But we assume they are safe for
+    # graph mode since they are just containers.
+    if tf_inspect.isclass(o) and len(o.__bases__) > 1:
+      logging.log_first_n(
+          logging.level_warning(),
+          'Entity {} looks like a namedtuple subclass. If it has any custom'
+          ' methods, they will not be converted by AutoGraph.'.format(o), 1)
+    return True
+
   return False
 
 
@@ -105,18 +129,13 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
   Raises:
     ValueError: if the entity type is not supported.
   """
+  if program_ctx.options.verbose == converter.Verbosity.VERBOSE:
+    logging.info('Converting {}'.format(o))
+
   if tf_inspect.isclass(o):
     node, name, ns = class_to_graph(o, program_ctx)
   elif tf_inspect.isfunction(o):
-    # TODO(mdan): This is not a reliable mechanism.
-    # The most reliable way is to check the source code, the AST will contain
-    # a Lambda node instead of a FunctionDef
-    if o.__name__ == '<lambda>':
-      raise NotImplementedError(
-          'lambda functions are not yet supported; declare the function'
-          ' using def instead: %s' % o)
-    else:
-      node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
+    node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
   elif tf_inspect.ismethod(o):
     node, name, ns = function_to_graph(o, program_ctx, arg_values, arg_types)
   # TODO(mdan,yashkatariya): Remove when object conversion is implemented.
@@ -145,7 +164,11 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
 
   program_ctx.add_to_cache(o, node)
 
-  if program_ctx.recursive:
+  if program_ctx.options.verbose == converter.Verbosity.VERBOSE:
+    logging.info('Compiled output of {}:\n\n{}\n'.format(
+        o, compiler.ast_to_source(node)))
+
+  if program_ctx.options.recursive:
     while True:
       candidate = None
       for obj in program_ctx.name_map.keys():
@@ -182,8 +205,7 @@ def class_to_graph(c, program_ctx):
         program_ctx=program_ctx,
         arg_values={},
         arg_types={'self': (c.__name__, c)},
-        owner_type=c,
-        rewrite_errors=False)
+        owner_type=c)
     if class_namespace is None:
       class_namespace = namespace
     else:
@@ -255,8 +277,7 @@ def _add_self_references(namespace, autograph_module):
     # Craft a module that exposes parts of the external API as well as certain
     # internal modules.
     ag_internal = imp.new_module('autograph')
-    ag_internal.converted_call = autograph_module.converted_call
-    ag_internal.ConversionOptions = autograph_module.ConversionOptions
+    ag_internal.__dict__.update(autograph_module.__dict__)
     ag_internal.utils = utils
     ag_internal.function_scope = function_wrapping.function_scope
     ag_internal.rewrite_graph_construction_error = (
@@ -264,6 +285,7 @@ def _add_self_references(namespace, autograph_module):
     # TODO(mdan): Add safeguards against name clashes.
     # We don't want to create a submodule because we want the operators to be
     # accessible as ag__.<operator>
+    ag_internal.__dict__.update(special_functions.__dict__)
     ag_internal.__dict__.update(operators.__dict__)
 
   _add_reserved_symbol(namespace, 'ag__', ag_internal)
@@ -273,12 +295,33 @@ def function_to_graph(f,
                       program_ctx,
                       arg_values,
                       arg_types,
-                      owner_type=None,
-                      rewrite_errors=True):
+                      owner_type=None):
   """Specialization of `entity_to_graph` for callable functions."""
 
   node, source = parser.parse_entity(f)
   node = node.body[0]
+
+  # In general, the output of inspect.getsource is inexact because it uses
+  # regex matching to adjust the exact location around the line number that
+  # CPython records. This is particularly problematic for lambda functions,
+  # where the entire containing lines are returned.
+  nodes = ast_util.find_matching_definitions(node, f)
+  if len(nodes) != 1:
+    if f.__name__ == '<lambda>':
+      raise ValueError(
+          'Unable to identify source code of lambda function {}. It was'
+          ' defined on this line: {}, which must contain a single lambda with'
+          ' matching signature. To avoid ambiguity, define each lambda'
+          ' in a separate expression.'.format(f, source))
+    else:
+      raise ValueError(
+          'Unable to identify source code of function {}. The source code'
+          ' reported by Python did not include exactly one matching signature:'
+          '\n{}\n. This is an extremely rare occurrence. Please report it to'
+          ' the TensorFlow team.'.format(f, source))
+  node, = nodes
+
+  # TODO(znado): Place inside standard_analysis.
   origin_info.resolve(node, source, f)
   namespace = inspect_utils.getnamespace(f)
   _add_self_references(namespace, program_ctx.autograph_module)
@@ -292,15 +335,22 @@ def function_to_graph(f,
       arg_types=arg_types,
       owner_type=owner_type)
   context = converter.EntityContext(namer, entity_info, program_ctx)
-  node = node_to_graph(node, context, rewrite_errors=rewrite_errors)
+  node = node_to_graph(node, context)
 
-  # TODO(mdan): This somewhat duplicates the call rename logic in call_trees.py
-  new_name, did_rename = namer.compiled_function_name(f.__name__, f, owner_type)
-  if not did_rename:
-    new_name = f.__name__
-    if node.name != f.__name__:
-      raise NotImplementedError('Strange corner case. Send us offending code!')
-  node.name = new_name
+  if isinstance(node, gast.Lambda):
+    new_name = namer.new_symbol('tf__lambda', ())
+    node = gast.Assign(
+        targets=[gast.Name(new_name, gast.Store(), None)], value=node)
+
+  else:
+    # TODO(mdan): This somewhat duplicates the renaming logic in call_trees.py
+    new_name, did_rename = namer.compiled_function_name(f.__name__, f,
+                                                        owner_type)
+    if did_rename:
+      node.name = new_name
+    else:
+      new_name = f.__name__
+      assert node.name == new_name
 
   program_ctx.update_name_map(namer)
   # TODO(mdan): Use this at compilation.
@@ -308,13 +358,12 @@ def function_to_graph(f,
   return [node], new_name, namespace
 
 
-def node_to_graph(node, context, rewrite_errors=True):
+def node_to_graph(node, context):
   """Convert Python code to equivalent TF graph mode code.
 
   Args:
     node: AST, the code to convert.
     context: converter.EntityContext
-    rewrite_errors: Boolean, whether or not to rewrite the error traceback.
 
   Returns:
     A tuple (node, deps):
@@ -330,7 +379,9 @@ def node_to_graph(node, context, rewrite_errors=True):
   # TODO(mdan): Is it feasible to reconstruct intermediate source code?
   context.info.source_code = None
 
-  node = converter.apply_(node, context, decorators)
+  if context.program.options.uses(converter.Feature.DECORATORS):
+    node = converter.apply_(node, context, decorators)
+  node = converter.apply_(node, context, arg_defaults)
   node = converter.apply_(node, context, directives)
   node = converter.apply_(node, context, break_statements)
   node = converter.apply_(node, context, asserts)
@@ -338,17 +389,20 @@ def node_to_graph(node, context, rewrite_errors=True):
   # dealing with the extra loop increment operation that the for
   # canonicalization creates.
   node = converter.apply_(node, context, continue_statements)
-  context.info.namespace['len'] = len
   node = converter.apply_(node, context, return_statements)
-  node = converter.apply_(node, context, lists)
-  node = converter.apply_(node, context, slices)
+  if context.program.options.uses(converter.Feature.LISTS):
+    node = converter.apply_(node, context, lists)
+    node = converter.apply_(node, context, slices)
   node = converter.apply_(node, context, builtin_functions)
   node = converter.apply_(node, context, call_trees)
   node = converter.apply_(node, context, control_flow)
   node = converter.apply_(node, context, conditional_expressions)
   node = converter.apply_(node, context, logical_expressions)
-  node = converter.apply_(node, context, side_effect_guards)
-  node = converter.apply_(node, context, function_scopes)
-  if rewrite_errors:
+  if context.program.options.uses(converter.Feature.AUTO_CONTROL_DEPS):
+    node = converter.apply_(node, context, side_effect_guards)
+  # TODO(mdan): If function scopes ever does more, the toggle will need moving.
+  if context.program.options.uses(converter.Feature.NAME_SCOPES):
+    node = converter.apply_(node, context, function_scopes)
+  if context.program.options.uses(converter.Feature.ERROR_REWRITING):
     node = converter.apply_(node, context, error_handlers)
   return node
diff --git a/tensorflow/python/autograph/impl/conversion_test.py b/tensorflow/python/autograph/impl/conversion_test.py
index 07d0f75129235b2a12ca487cd51d723810631ebe..9a4fbdad8c1994d8c8cc534b6e0b4af45f5c4c80 100644
--- a/tensorflow/python/autograph/impl/conversion_test.py
+++ b/tensorflow/python/autograph/impl/conversion_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import api
+from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.impl import conversion
 from tensorflow.python.framework import constant_op
 from tensorflow.python.keras.engine import training
@@ -34,8 +35,7 @@ class ConversionTest(test.TestCase):
 
   def _simple_program_ctx(self):
     return converter.ProgramContext(
-        recursive=True,
-        autograph_decorators=(),
+        options=converter.ConversionOptions(recursive=True),
         partial_types=(),
         autograph_module=api,
         uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
@@ -66,6 +66,20 @@ class ConversionTest(test.TestCase):
     self.assertEqual('tf__f', name)
     self.assertIs(ns['b'], b)
 
+  def test_entity_to_graph_function_with_defaults(self):
+    b = 2
+    c = 1
+    def f(a, d=c + 1):
+      return a + b + d
+
+    program_ctx = self._simple_program_ctx()
+    nodes, name, _ = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, _ = nodes
+    self.assertIsInstance(fn_node, gast.FunctionDef)
+    self.assertEqual('tf__f', name)
+    self.assertEqual(
+        compiler.ast_to_source(fn_node.args.defaults[0]).strip(), 'None')
+
   def test_entity_to_graph_call_tree(self):
 
     def g(a):
@@ -146,12 +160,67 @@ class ConversionTest(test.TestCase):
                      program_ctx.dependency_cache[TestSubclass][-2].name)
 
   def test_entity_to_graph_lambda(self):
-    f = lambda a: a
+    b = 2
+    f = lambda x: b * x if x > 0 else -x
 
-    with self.assertRaises(NotImplementedError):
-      program_ctx = self._simple_program_ctx()
+    program_ctx = self._simple_program_ctx()
+    nodes, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, _ = nodes
+    self.assertIsInstance(fn_node, gast.Assign)
+    self.assertIsInstance(fn_node.value, gast.Lambda)
+    self.assertEqual('tf__lambda', name)
+    self.assertIs(ns['b'], b)
+
+  def test_entity_to_graph_multiple_lambdas(self):
+    a, b = 1, 2
+    f, _ = (lambda x: a * x, lambda y: b * y)
+
+    program_ctx = self._simple_program_ctx()
+    nodes, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, _ = nodes
+    self.assertIsInstance(fn_node, gast.Assign)
+    self.assertIsInstance(fn_node.value, gast.Lambda)
+    self.assertEqual('tf__lambda', name)
+    self.assertIs(ns['a'], a)
+
+  def test_entity_to_graph_multiple_lambdas_ambiguous_definitions(self):
+    a, b = 1, 2
+    f, _ = (lambda x: a * x, lambda x: b * x)
+
+    program_ctx = self._simple_program_ctx()
+    with self.assertRaises(ValueError):
       conversion.entity_to_graph(f, program_ctx, None, None)
 
+  def test_entity_to_graph_lambda_code_with_garbage(self):
+    # pylint:disable=g-long-lambda
+    f = (  # intentional wrap
+        lambda x: (x  # intentional wrap
+                   + 1),)[0]
+    # pylint:enable=g-long-lambda
+
+    program_ctx = self._simple_program_ctx()
+    nodes, name, _ = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, _ = nodes
+    self.assertIsInstance(fn_node, gast.Assign)
+    self.assertIsInstance(fn_node.value, gast.Lambda)
+    self.assertEqual('tf__lambda', name)
+
+  def test_entity_to_graph_nested_functions(self):
+    b = 2
+
+    def f(x):
+      def g(x):
+        return b * x
+      return g(x)
+
+    program_ctx = self._simple_program_ctx()
+    nodes, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, _ = nodes
+    self.assertIsInstance(fn_node, gast.FunctionDef)
+    self.assertEqual(fn_node.name, 'tf__f')
+    self.assertEqual('tf__f', name)
+    self.assertIs(ns['b'], b)
+
   def test_ag_module_cached(self):
     def callee():
       return range(3)
diff --git a/tensorflow/python/autograph/lang/special_functions.py b/tensorflow/python/autograph/lang/special_functions.py
index 62ac018ac46ffd98e1d8b91d71fe953a0a9f1700..411770692b0d7f35826d6f9e5151dbf2f7e8136d 100644
--- a/tensorflow/python/autograph/lang/special_functions.py
+++ b/tensorflow/python/autograph/lang/special_functions.py
@@ -24,6 +24,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.operators import data_structures
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_util
 
 
@@ -46,6 +47,13 @@ def _validate_list_constructor(elements, element_dtype, element_shape):
       ' allowed'.format(type(elements)))
 
 
+def match_staging_level(value, like_value):
+  """Casts a value to be staged at the same level as another."""
+  if tensor_util.is_tensor(like_value):
+    return constant_op.constant(value)
+  return value
+
+
 def tensor_list(elements,
                 element_dtype=None,
                 element_shape=None,
diff --git a/tensorflow/python/autograph/lang/special_functions_test.py b/tensorflow/python/autograph/lang/special_functions_test.py
index 206a32d07cd2b8b7aa1357fa9be4e23b03276a71..8d40f4036c5a1892afca6e5fb2daf891c9487800 100644
--- a/tensorflow/python/autograph/lang/special_functions_test.py
+++ b/tensorflow/python/autograph/lang/special_functions_test.py
@@ -30,27 +30,36 @@ from tensorflow.python.platform import test
 
 class SpecialFunctionsTest(test.TestCase):
 
+  def test_match_staging_level(self):
+    some_tensor = constant_op.constant(0)
+    tensor_one = special_functions.match_staging_level(1, some_tensor)
+    python_one = special_functions.match_staging_level(1, 1)
+    with self.cached_session() as sess:
+      self.assertTrue(tensor_util.is_tensor(tensor_one))
+      self.assertAllEqual(self.evaluate(tensor_one), 1)
+      self.assertEqual(python_one, 1)
+
   def test_tensor_list_empty_list(self):
     l = special_functions.tensor_list([],
                                       element_dtype=dtypes.int32,
                                       element_shape=())
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [])
 
     l = special_functions.tensor_list((),
                                       element_dtype=dtypes.int32,
                                       element_shape=())
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [])
 
   def test_tensor_list_tensor(self):
     l = special_functions.tensor_list(
         constant_op.constant([], dtype=dtypes.int32))
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [])
 
   def test_tensor_list_unsupported_initializer(self):
     with self.assertRaisesRegexp(ValueError, 'unknown type'):
@@ -66,16 +75,16 @@ class SpecialFunctionsTest(test.TestCase):
 
     l = special_functions.tensor_list(elements)
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [[1, 2], [3, 4]])
 
   def test_tensor_list_array_from_elements(self):
     elements = [constant_op.constant([1, 2]), constant_op.constant([3, 4])]
 
     l = special_functions.tensor_list(elements, use_tensor_array=True)
     sl = l.stack()
-    with self.test_session() as sess:
-      self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+    with self.cached_session() as sess:
+      self.assertAllEqual(self.evaluate(sl), [[1, 2], [3, 4]])
 
   def test_stack(self):
     self.assertEqual(special_functions.stack(1, strict=False), 1)
diff --git a/tensorflow/python/autograph/operators/BUILD b/tensorflow/python/autograph/operators/BUILD
index a116611b6454ff77ad4f640b60dca2a158956c02..aedb901845b97bbee5918902875b5023a8604dcd 100644
--- a/tensorflow/python/autograph/operators/BUILD
+++ b/tensorflow/python/autograph/operators/BUILD
@@ -22,6 +22,8 @@ py_library(
         "__init__.py",
         "control_flow.py",
         "data_structures.py",
+        "exceptions.py",
+        "logical.py",
         "py_builtins.py",
         "slices.py",
     ],
@@ -62,6 +64,26 @@ py_test(
     ],
 )
 
+py_test(
+    name = "exceptions_test",
+    srcs = ["exceptions_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "logical_test",
+    srcs = ["logical_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "py_builtins_test",
     srcs = ["py_builtins_test.py"],
diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py
index 0d3b44b6c4bb69eaa1b309614cdf1201e8ab5104..7a580fe32475cbc32f20a1196c075fbf7f981d27 100644
--- a/tensorflow/python/autograph/operators/__init__.py
+++ b/tensorflow/python/autograph/operators/__init__.py
@@ -38,6 +38,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.operators.control_flow import for_stmt
+from tensorflow.python.autograph.operators.control_flow import if_stmt
 from tensorflow.python.autograph.operators.control_flow import while_stmt
 from tensorflow.python.autograph.operators.data_structures import list_append
 from tensorflow.python.autograph.operators.data_structures import list_pop
@@ -45,6 +46,21 @@ from tensorflow.python.autograph.operators.data_structures import list_stack
 from tensorflow.python.autograph.operators.data_structures import ListPopOpts
 from tensorflow.python.autograph.operators.data_structures import ListStackOpts
 from tensorflow.python.autograph.operators.data_structures import new_list
+from tensorflow.python.autograph.operators.exceptions import assert_stmt
+from tensorflow.python.autograph.operators.logical import and_
+from tensorflow.python.autograph.operators.logical import eq
+from tensorflow.python.autograph.operators.logical import gt
+from tensorflow.python.autograph.operators.logical import gt_e
+from tensorflow.python.autograph.operators.logical import in_
+from tensorflow.python.autograph.operators.logical import is_
+from tensorflow.python.autograph.operators.logical import is_not
+from tensorflow.python.autograph.operators.logical import lt
+from tensorflow.python.autograph.operators.logical import lt_e
+from tensorflow.python.autograph.operators.logical import not_
+from tensorflow.python.autograph.operators.logical import not_eq
+from tensorflow.python.autograph.operators.logical import not_in
+from tensorflow.python.autograph.operators.logical import or_
+from tensorflow.python.autograph.operators.logical import u_sub
 from tensorflow.python.autograph.operators.py_builtins import float_
 from tensorflow.python.autograph.operators.py_builtins import int_
 from tensorflow.python.autograph.operators.py_builtins import len_
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index 6eedd695a74c134ad1c7cc3524bef64ba5b7066a..89f7b8522f569542fa935877cdd9de6a9797c2c4 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -61,7 +60,7 @@ def for_stmt(iter_, extra_test, body, init_state):
   """
   if tensor_util.is_tensor(iter_):
     return _known_len_for_stmt(iter_, extra_test, body, init_state)
-  elif isinstance(iter_, dataset_ops.Dataset):
+  elif isinstance(iter_, dataset_ops.DatasetV2):
     return _dataset_for_stmt(iter_, extra_test, body, init_state)
   else:
     return _py_for_stmt(iter_, extra_test, body, init_state)
@@ -100,6 +99,7 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
       extra_deps=(iter_,),
       opts=dict(maximum_iterations=n))
   # Dropping the iteration index because it's not syntactically visible.
+  # TODO(mdan): Don't.
   results = results[1:]
 
   # TODO(mdan): Remove this special case.
@@ -110,40 +110,15 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
 
 def _dataset_for_stmt(ds, extra_test, body, init_state):
   """Overload of for_stmt that iterates over TF Datasets."""
-  # Because Datsets only expose get_next, in the style of Python iterators,
-  # we are forced to unpack the loop as:
-  #
-  # epoch_number, iterate = ds.get_next()
-  # while epoch_number < 2:
-  #   <body>
-  #   epoch_number, iterate = ds.get_next()
-  epoch_numbers = dataset_ops.Dataset.range(2)
-  def tag_with(ds, tag):
-    return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(tag).repeat(), ds))
-  ds_with_epoch = epoch_numbers.flat_map(lambda i: tag_with(ds, i))
-
-  iterator = ds_with_epoch.make_initializable_iterator()
-  with ops.control_dependencies((iterator.initializer,)):
-    epoch_number, iterate = iterator.get_next()
-
-    def while_body(epoch_number, iterate, *state):
-      new_state = body(iterate, *state)
-      epoch_number, iterate = iterator.get_next()
-      return (epoch_number, iterate) + new_state
-
-    def while_cond(epoch_number, iterate, *state):
-      del iterate
-      return gen_math_ops.logical_and(epoch_number < 1, extra_test(*state))
-
-    results = while_stmt(
-        while_cond,
-        while_body,
-        init_state=(epoch_number, iterate) + init_state,
-        extra_deps=())
-  # Dropping the epoch number and iterate because they are not syntactically
-  # visible.
-  results = results[2:]
+  if extra_test(*init_state) is not True:
+    raise NotImplementedError(
+        'break statements are not yet supported in for/Dataset loops')
+
+  def reduce_body(state, iterate):
+    new_state = body(iterate, *state)
+    return new_state
+
+  results = ds.reduce(init_state, reduce_body)
 
   # TODO(mdan): Remove this special case.
   if len(results) == 1:
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index bb214b6f169b66769e09163ebb20ac095319719b..0a7d4b64022f583bae4effc7d0f7eb04f46cc048 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -22,12 +22,14 @@ from tensorflow.python.autograph.operators import control_flow
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 class ForLoopTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_tensor(self):
     s = control_flow.for_stmt(
         constant_op.constant([1, 2, 3, 4]),
@@ -35,7 +37,7 @@ class ForLoopTest(test.TestCase):
         body=lambda i, s: (s + i,),
         init_state=(0,))
     with self.cached_session() as sess:
-      self.assertEqual((10,), sess.run(s))
+      self.assertEqual((10,), self.evaluate(s))
 
   def test_python(self):
     s = control_flow.for_stmt(
@@ -45,6 +47,7 @@ class ForLoopTest(test.TestCase):
         init_state=(0,))
     self.assertEqual(10, s)
 
+  @test_util.run_deprecated_v1
   def test_dataset(self):
     to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
     s = control_flow.for_stmt(
@@ -53,11 +56,12 @@ class ForLoopTest(test.TestCase):
         body=lambda i, s: (s + i,),
         init_state=(0,))
     with self.cached_session() as sess:
-      self.assertEqual((10,), sess.run(s))
+      self.assertEqual((10,), self.evaluate(s))
 
 
 class WhileLoopTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_tensor(self):
     n = constant_op.constant(5)
     results = control_flow.while_stmt(
@@ -66,7 +70,7 @@ class WhileLoopTest(test.TestCase):
         init_state=(0, 0),
         extra_deps=(n,))
     with self.cached_session() as sess:
-      self.assertEqual((5, 10), sess.run(results))
+      self.assertEqual((5, 10), self.evaluate(results))
 
   def test_python(self):
     n = 5
@@ -80,20 +84,36 @@ class WhileLoopTest(test.TestCase):
 
 class IfStmtTest(test.TestCase):
 
-  def test_tensor(self):
-    def test_if_stmt(cond):
-      return control_flow.if_stmt(
-          cond=cond,
-          body=lambda: 1,
-          orelse=lambda: -1)
+  def single_return_if_stmt(self, cond):
+    return control_flow.if_stmt(cond=cond, body=lambda: 1, orelse=lambda: -1)
+
+  def multi_return_if_stmt(self, cond):
+    return control_flow.if_stmt(
+        cond=cond, body=lambda: (1, 2), orelse=lambda: (-1, -2))
 
+  @test_util.run_deprecated_v1
+  def test_tensor(self):
     with self.cached_session() as sess:
-      self.assertEqual(1, sess.run(test_if_stmt(constant_op.constant(True))))
-      self.assertEqual(-1, sess.run(test_if_stmt(constant_op.constant(False))))
+      t = self.single_return_if_stmt(constant_op.constant(True))
+      self.assertEqual(1, self.evaluate(t))
+      t = self.single_return_if_stmt(constant_op.constant(False))
+      self.assertEqual(-1, self.evaluate(t))
 
   def test_python(self):
-    self.assertEqual(1, control_flow.if_stmt(True, lambda: 1, lambda: -1))
-    self.assertEqual(-1, control_flow.if_stmt(False, lambda: 1, lambda: -1))
+    self.assertEqual(1, self.single_return_if_stmt(True))
+    self.assertEqual(-1, self.single_return_if_stmt(False))
+
+  @test_util.run_deprecated_v1
+  def test_tensor_multiple_returns(self):
+    with self.cached_session() as sess:
+      t = self.multi_return_if_stmt(constant_op.constant(True))
+      self.assertAllEqual([1, 2], self.evaluate(t))
+      t = self.multi_return_if_stmt(constant_op.constant(False))
+      self.assertAllEqual([-1, -2], self.evaluate(t))
+
+  def test_python_multiple_returns(self):
+    self.assertEqual((1, 2), self.multi_return_if_stmt(True))
+    self.assertEqual((-1, -2), self.multi_return_if_stmt(False))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/operators/data_structures_test.py b/tensorflow/python/autograph/operators/data_structures_test.py
index 6039b07982c8e4b820acda059c701b8fdb96e295..c5a3a3d1cac998a0fc59163d73288317bd4a3e30 100644
--- a/tensorflow/python/autograph/operators/data_structures_test.py
+++ b/tensorflow/python/autograph/operators/data_structures_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.autograph.operators import data_structures
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import test
@@ -43,7 +44,7 @@ class ListTest(test.TestCase):
     l = data_structures.tf_tensor_list_new([3, 4, 5])
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(t), [3, 4, 5])
 
   def test_tf_tensor_list_new_empty(self):
     l = data_structures.tf_tensor_list_new([],
@@ -51,14 +52,15 @@ class ListTest(test.TestCase):
                                            element_shape=())
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [])
+      self.assertAllEqual(self.evaluate(t), [])
 
   def test_tf_tensor_list_new_from_tensor(self):
     l = data_structures.tf_tensor_list_new(constant_op.constant([3, 4, 5]))
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(t), [3, 4, 5])
 
+  @test_util.run_deprecated_v1
   def test_tf_tensor_list_new_illegal_input(self):
     with self.assertRaises(ValueError):
       data_structures.tf_tensor_list_new([3, 4.0])
@@ -77,7 +79,7 @@ class ListTest(test.TestCase):
     l = data_structures.tf_tensor_array_new([3, 4, 5])
     t = l.stack()
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(t), [3, 4, 5])
 
   def test_tf_tensor_array_new_illegal_input(self):
     with self.assertRaises(ValueError):
@@ -102,15 +104,16 @@ class ListTest(test.TestCase):
 
     t = list_ops.tensor_list_stack(l, element_dtype=x.dtype)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [[1, 2, 3]])
+      self.assertAllEqual(self.evaluate(t), [[1, 2, 3]])
 
+  @test_util.run_v1_only("b/117943489")
   def test_append_tensorarray(self):
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
     l1 = data_structures.list_append(l, 1)
     l2 = data_structures.list_append(l1, 2)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(l1.stack()), [1])
-      self.assertAllEqual(sess.run(l2.stack()), [1, 2])
+      self.assertAllEqual(self.evaluate(l1.stack()), [1])
+      self.assertAllEqual(self.evaluate(l2.stack()), [1, 2])
 
   def test_append_python(self):
     l = []
@@ -131,10 +134,10 @@ class ListTest(test.TestCase):
 
     with self.cached_session() as sess:
       l, x = data_structures.list_pop(l, None, opts)
-      self.assertAllEqual(sess.run(x), [3, 4])
+      self.assertAllEqual(self.evaluate(x), [3, 4])
 
       t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype)
-      self.assertAllEqual(sess.run(t), [[1, 2]])
+      self.assertAllEqual(self.evaluate(t), [[1, 2]])
 
   def test_pop_python(self):
     l = [1, 2, 3]
@@ -152,12 +155,12 @@ class ListTest(test.TestCase):
 
     with self.cached_session() as sess:
       t = data_structures.list_stack(l, opts)
-      self.assertAllEqual(sess.run(t), sess.run(initial_list))
+      self.assertAllEqual(self.evaluate(t), self.evaluate(initial_list))
 
+  @test_util.run_deprecated_v1
   def test_stack_tensor_list_empty(self):
     l = list_ops.empty_tensor_list(
-        element_shape=-1,
-        element_dtype=dtypes.variant)
+        element_shape=None, element_dtype=dtypes.variant)
 
     opts = data_structures.ListStackOpts(
         element_dtype=dtypes.int32, original_call=None)
diff --git a/tensorflow/python/autograph/operators/exceptions.py b/tensorflow/python/autograph/operators/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..6078160f6851ecb4e00fb58ac506cb20959fae53
--- /dev/null
+++ b/tensorflow/python/autograph/operators/exceptions.py
@@ -0,0 +1,86 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Exception handling statements: assert, etc."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import tf_inspect
+
+
+def assert_stmt(expression1, expression2):
+  """Functional form of an assert statement.
+
+  This follows the semantics of the Python assert statement, however the
+  concrete implementations may deviate from it. See the respective
+  implementation for details.
+
+  In general, the assert statement should not be used for control flow.
+  Furthermore, it is encouraged that the assertion expressions should not have
+  side effects.
+
+  Args:
+    expression1: Any
+    expression2: Callable[[], Any], returns the expression to include in the
+        error message when expression1 evaluates to False. When expression1 is
+        True, the result of expression2 will not be evaluated, however,
+        expression2 itself may be evaluated in some implementations.
+
+  Returns:
+    Any, implementation-dependent.
+
+  Raises:
+    ValueError: if any arguments are illegal.
+  """
+  if not callable(expression2):
+    raise ValueError('{} must be a callable'.format(expression2))
+  args, _, keywords, _ = tf_inspect.getargspec(expression2)
+  if args or keywords:
+    raise ValueError('{} may not have any arguments'.format(expression2))
+
+  if tensor_util.is_tensor(expression1):
+    return _tf_assert_stmt(expression1, expression2)
+  else:
+    return _py_assert_stmt(expression1, expression2)
+
+
+def _tf_assert_stmt(expression1, expression2):
+  """Overload of assert_stmt that stages a TF Assert.
+
+  This implementation deviates from Python semantics as follows:
+    (1) the assertion is verified regardless of the state of __debug__
+    (2) on assertion failure, the graph execution will fail with
+        tensorflow.errors.ValueError, rather than AssertionError.
+
+  Args:
+    expression1: tensorflow.Tensor, must evaluate to a tf.bool scalar
+    expression2: Callable[[], Union[tensorflow.Tensor, List[tensorflow.Tensor]]]
+
+  Returns:
+    tensorflow.Operation
+  """
+  expression2_tensors = expression2()
+  if not isinstance(expression2_tensors, list):
+    expression2_tensors = [expression2_tensors]
+  return control_flow_ops.Assert(expression1, expression2_tensors)
+
+
+def _py_assert_stmt(expression1, expression2):
+  """Overload of assert_stmt that executes a Python assert statement."""
+  assert expression1, expression2()
+  return None
diff --git a/tensorflow/python/autograph/operators/exceptions_test.py b/tensorflow/python/autograph/operators/exceptions_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..21ba76bb9521132ad3a54eb4d6004dc6d725d03f
--- /dev/null
+++ b/tensorflow/python/autograph/operators/exceptions_test.py
@@ -0,0 +1,90 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for exceptions module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.operators import exceptions
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class ExceptionsTest(test.TestCase):
+
+  def test_assert_tf_untriggered(self):
+    with self.cached_session() as sess:
+      t = exceptions.assert_stmt(
+          constant_op.constant(True), lambda: constant_op.constant('ignored'))
+      self.evaluate(t)
+
+  @test_util.run_deprecated_v1
+  def test_assert_tf_triggered(self):
+    with self.cached_session() as sess:
+      t = exceptions.assert_stmt(
+          constant_op.constant(False),
+          lambda: constant_op.constant('test message'))
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   'test message'):
+        self.evaluate(t)
+
+  @test_util.run_deprecated_v1
+  def test_assert_tf_multiple_printed_values(self):
+    two_tensors = [
+        constant_op.constant('test message'),
+        constant_op.constant('another message')
+    ]
+    with self.cached_session() as sess:
+      t = exceptions.assert_stmt(
+          constant_op.constant(False), lambda: two_tensors)
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   'test message.*another message'):
+        self.evaluate(t)
+
+  def test_assert_python_untriggered(self):
+    side_effect_trace = []
+
+    def expression_with_side_effects():
+      side_effect_trace.append(object())
+      return 'test message'
+
+    exceptions.assert_stmt(True, expression_with_side_effects)
+
+    self.assertListEqual(side_effect_trace, [])
+
+  def test_assert_python_triggered(self):
+    if not __debug__:
+      # Python assertions only be tested when in debug mode.
+      return
+
+    side_effect_trace = []
+    tracer = object()
+
+    def expression_with_side_effects():
+      side_effect_trace.append(tracer)
+      return 'test message'
+
+    with self.assertRaisesRegexp(AssertionError, 'test message'):
+      exceptions.assert_stmt(False, expression_with_side_effects)
+    self.assertListEqual(side_effect_trace, [tracer])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/autograph/operators/logical.py b/tensorflow/python/autograph/operators/logical.py
new file mode 100644
index 0000000000000000000000000000000000000000..569db5b91bd7efb92ce2b8a8b8eb6eb773f4abcb
--- /dev/null
+++ b/tensorflow/python/autograph/operators/logical.py
@@ -0,0 +1,139 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logical operators, including comparison and bool operators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+
+
+def not_(a):
+  """Functional form of "not"."""
+  if tensor_util.is_tensor(a):
+    return gen_math_ops.logical_not(a)
+  return not a
+
+
+def and_(a, b):
+  """Functional form of "and". Uses lazy evaluation semantics."""
+  a_val = a()
+  if tensor_util.is_tensor(a_val):
+    return _tf_lazy_and(a_val, b)
+  return _py_lazy_and(a_val, b)
+
+
+def _tf_lazy_and(cond, b):
+  """Lazy-eval equivalent of "and" for Tensors."""
+  # TODO(mdan): Enforce cond is scalar here?
+  return control_flow_ops.cond(cond, b, lambda: cond)
+
+
+def _py_lazy_and(cond, b):
+  """Lazy-eval equivalent of "and" in Python."""
+  return cond and b()
+
+
+def or_(a, b):
+  """Functional form of "or". Uses lazy evaluation semantics."""
+  a_val = a()
+  if tensor_util.is_tensor(a_val):
+    return _tf_lazy_or(a_val, b)
+  return _py_lazy_or(a_val, b)
+
+
+def _tf_lazy_or(cond, b):
+  """Lazy-eval equivalent of "or" for Tensors."""
+  # TODO(mdan): Enforce cond is scalar here?
+  return control_flow_ops.cond(cond, lambda: cond, b)
+
+
+def _py_lazy_or(cond, b):
+  """Lazy-eval equivalent of "or" in Python."""
+  return cond or b()
+
+
+def eq(a, b):
+  """Functional form of "equal"."""
+  if tensor_util.is_tensor(a) or tensor_util.is_tensor(b):
+    return _tf_equal(a, b)
+  return _py_equal(a, b)
+
+
+def _tf_equal(a, b):
+  """Overload of "equal" for Tensors."""
+  return gen_math_ops.equal(a, b)
+
+
+def _py_equal(a, b):
+  """Overload of "equal" that falls back to Python's default implementation."""
+  return a == b
+
+
+def not_eq(a, b):
+  """Functional form of "not-equal"."""
+  return not_(eq(a, b))
+
+
+# Default implementation for the remainings.
+
+
+def gt(a, b):
+  """Functional form of "less-than"."""
+  return a > b
+
+
+def gt_e(a, b):
+  """Functional form of "less-than"."""
+  return a >= b
+
+
+def is_(a, b):
+  """Functional form of "less-than"."""
+  return a is b
+
+
+def is_not(a, b):
+  """Functional form of "less-than"."""
+  return a is not b
+
+
+def in_(a, b):
+  """Functional form of "less-than"."""
+  # TODO(mdan): in and not_in should probably be convertible for some types.
+  return a in b
+
+
+def lt(a, b):
+  """Functional form of "less-than"."""
+  return a < b
+
+
+def lt_e(a, b):
+  """Functional form of "less-than"."""
+  return a <= b
+
+
+def not_in(a, b):
+  """Functional form of "less-than"."""
+  return a not in b
+
+
+def u_sub(a):
+  """Functional form of "unary-sub"."""
+  return -a
diff --git a/tensorflow/python/autograph/operators/logical_test.py b/tensorflow/python/autograph/operators/logical_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e22f39932d17397bca22bff8793e7649580d75d3
--- /dev/null
+++ b/tensorflow/python/autograph/operators/logical_test.py
@@ -0,0 +1,88 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for logical module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.operators import logical
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class LogicalOperatorsTest(test.TestCase):
+
+  def assertNotCalled(self):
+    self.fail('this should not be called')
+
+  def _tf_true(self):
+    return constant_op.constant(True)
+
+  def _tf_false(self):
+    return constant_op.constant(False)
+
+  def test_and_python(self):
+    self.assertTrue(logical.and_(lambda: True, lambda: True))
+    self.assertTrue(logical.and_(lambda: [1], lambda: True))
+    self.assertListEqual(logical.and_(lambda: True, lambda: [1]), [1])
+
+    self.assertFalse(logical.and_(lambda: False, lambda: True))
+    self.assertFalse(logical.and_(lambda: False, self.assertNotCalled))
+
+  @test_util.run_deprecated_v1
+  def test_and_tf(self):
+    with self.cached_session() as sess:
+      t = logical.and_(self._tf_true, self._tf_true)
+      self.assertEqual(self.evaluate(t), True)
+      t = logical.and_(self._tf_true, lambda: True)
+      self.assertEqual(self.evaluate(t), True)
+      t = logical.and_(self._tf_false, lambda: True)
+      self.assertEqual(self.evaluate(t), False)
+      # TODO(mdan): Add a test for ops with side effects.
+
+  def test_or_python(self):
+    self.assertFalse(logical.or_(lambda: False, lambda: False))
+    self.assertFalse(logical.or_(lambda: [], lambda: False))
+    self.assertListEqual(logical.or_(lambda: False, lambda: [1]), [1])
+
+    self.assertTrue(logical.or_(lambda: False, lambda: True))
+    self.assertTrue(logical.or_(lambda: True, self.assertNotCalled))
+
+  @test_util.run_deprecated_v1
+  def test_or_tf(self):
+    with self.cached_session() as sess:
+      t = logical.or_(self._tf_false, self._tf_true)
+      self.assertEqual(self.evaluate(t), True)
+      t = logical.or_(self._tf_false, lambda: True)
+      self.assertEqual(self.evaluate(t), True)
+      t = logical.or_(self._tf_true, lambda: True)
+      self.assertEqual(self.evaluate(t), True)
+      # TODO(mdan): Add a test for ops with side effects.
+
+  def test_not_python(self):
+    self.assertFalse(logical.not_(True))
+    self.assertFalse(logical.not_([1]))
+    self.assertTrue(logical.not_([]))
+
+  def test_not_tf(self):
+    with self.cached_session() as sess:
+      t = logical.not_(self._tf_false())
+      self.assertEqual(self.evaluate(t), True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py
index 91a2a22cc2271d9970686027703ae4a9363b1396..ddf05f73f37821c6ff7e246051cd82a560f370e3 100644
--- a/tensorflow/python/autograph/operators/py_builtins.py
+++ b/tensorflow/python/autograph/operators/py_builtins.py
@@ -120,8 +120,8 @@ def _tf_tensor_list_len(s):
 def _tf_tensor_len(s):
   """Overload of len_ for Tensor arguments."""
   # Statically shaped tensors: length is known ahead of time.
-  if s.shape.ndims and s.shape[0].value is not None:
-    return s.shape[0].value
+  if s.shape.ndims and s.shape.dims[0].value is not None:
+    return s.shape.dims[0].value
 
   # Static shape of unknown dimensions: use dynamic shape but statically
   # chech that it's a scalar.
@@ -133,7 +133,7 @@ def _tf_tensor_len(s):
     raise ValueError(
         'len requires a non-scalar tensor, got one of shape {}'.format(shape))
 
-  if shape.shape[0].value is not None:
+  if shape.shape.dims[0].value is not None:
     return array_ops.shape(s)[0]
 
   # Fully dynamic shape: use ops.
@@ -174,6 +174,7 @@ def _tf_py_func_print(objects, kwargs):
     override_kwargs['flush'] = True
 
   def print_wrapper(*vals):
+    vals = tuple(v.numpy() if tensor_util.is_tensor(v) else v for v in vals)
     if six.PY3:
       # TensorFlow doesn't seem to generate Unicode when passing strings to
       # py_func. This causes the print to add a "b'" wrapper to the output,
@@ -193,6 +194,7 @@ def range_(start_or_stop, stop=UNDEFINED, step=UNDEFINED):
 
 
 def _tf_range(start_or_stop, stop, step):
+  """Overload of range_ that generates a TF range tensor."""
   # Note: for static inputs (e.g. constants), tf.range errors out at graph
   # construction time, instead of returning an empty tensor. Preventing the
   # graph construction error aligns the semantics with Python.
@@ -216,10 +218,10 @@ def _py_range(start_or_stop, stop, step):
   return range(start_or_stop)
 
 
-SUPPORTED_BUILTINS = set((abs, float, int, len, print, range))
+SUPPORTED_BUILTINS = (abs, float, int, len, print, range)
 
 if six.PY2:
-  SUPPORTED_BUILTINS.add(xrange)
+  SUPPORTED_BUILTINS += (xrange,)
 
 BUILTIN_FUINCTIONS_MAP = {
     'abs': abs_,
@@ -228,5 +230,6 @@ BUILTIN_FUINCTIONS_MAP = {
     'len': len_,
     'print': print_,
     'range': range_,
+    # TODO(mdan): This might make more sense as tf.data.range.
     'xrange': range_,
 }
diff --git a/tensorflow/python/autograph/operators/py_builtins_test.py b/tensorflow/python/autograph/operators/py_builtins_test.py
index c94a918d5a95eb02a2f5322db59d74ef73356e85..c856e39d141f8479e2b9409b21d6683618a5e645 100644
--- a/tensorflow/python/autograph/operators/py_builtins_test.py
+++ b/tensorflow/python/autograph/operators/py_builtins_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import test
@@ -38,29 +39,29 @@ class PyBuiltinsTest(test.TestCase):
     self.assertEqual(py_builtins.abs_(-1), 1)
     with self.cached_session() as sess:
       t = py_builtins.abs_(constant_op.constant(-1))
-      self.assertEqual(sess.run(t), 1)
+      self.assertEqual(self.evaluate(t), 1)
       t = py_builtins.abs_(constant_op.constant([-1, 2, -3]))
-      self.assertAllEqual(sess.run(t), [1, 2, 3])
+      self.assertAllEqual(self.evaluate(t), [1, 2, 3])
 
   def test_float(self):
     self.assertEqual(py_builtins.float_(10), 10.0)
     self.assertEqual(py_builtins.float_('10.0'), 10.0)
     with self.cached_session() as sess:
       t = py_builtins.float_(constant_op.constant(1, dtype=dtypes.int64))
-      self.assertEqual(sess.run(t), 1.0)
+      self.assertEqual(self.evaluate(t), 1.0)
       st = py_builtins.float_(constant_op.constant('1.0'))
-      self.assertEqual(sess.run(st), 1.0)
+      self.assertEqual(self.evaluate(st), 1.0)
 
   def test_int(self):
     self.assertEqual(py_builtins.int_(10.0), 10)
     self.assertEqual(py_builtins.int_('11', 2), 3)
     with self.cached_session() as sess:
       t = py_builtins.int_(constant_op.constant(1, dtype=dtypes.float64))
-      self.assertEqual(sess.run(t), 1)
+      self.assertEqual(self.evaluate(t), 1)
       st = py_builtins.int_(constant_op.constant('1'))
-      self.assertEqual(sess.run(st), 1)
+      self.assertEqual(self.evaluate(st), 1)
       st = py_builtins.int_(constant_op.constant('1'), 10)
-      self.assertEqual(sess.run(st), 1)
+      self.assertEqual(self.evaluate(st), 1)
 
   def test_int_unsupported_base(self):
     t = constant_op.constant(1, dtype=dtypes.float64)
@@ -73,14 +74,15 @@ class PyBuiltinsTest(test.TestCase):
       t = py_builtins.len_(constant_op.constant([[1], [2], [3]]))
       self.assertEqual(t, 3)
       ta = py_builtins.len_(tensor_array_ops.TensorArray(dtypes.int32, size=5))
-      self.assertEqual(sess.run(ta), 5)
+      self.assertEqual(self.evaluate(ta), 5)
       tl = py_builtins.len_(data_structures.tf_tensor_list_new([3, 4, 5]))
-      self.assertEqual(sess.run(tl), 3)
+      self.assertEqual(self.evaluate(tl), 3)
 
   def test_len_scalar(self):
     with self.assertRaises(ValueError):
       py_builtins.len_(constant_op.constant(1))
 
+  @test_util.run_deprecated_v1
   def test_len_dynamic_shape(self):
     with self.cached_session() as sess:
       p = array_ops.placeholder(dtype=dtypes.int32, shape=None)
@@ -91,6 +93,7 @@ class PyBuiltinsTest(test.TestCase):
         t = py_builtins.len_(p)
         sess.run(t, {p: 1})
 
+  @test_util.run_deprecated_v1
   def test_print_tensors(self):
     try:
       out_capturer = six.StringIO()
@@ -101,6 +104,7 @@ class PyBuiltinsTest(test.TestCase):
     finally:
       sys.stdout = sys.__stdout__
 
+  @test_util.run_deprecated_v1
   def test_print_complex(self):
     try:
       out_capturer = six.StringIO()
@@ -120,18 +124,18 @@ class PyBuiltinsTest(test.TestCase):
   def test_range_tensor(self):
     with self.cached_session() as sess:
       r = py_builtins.range_(constant_op.constant(3))
-      self.assertAllEqual(sess.run(r), [0, 1, 2])
+      self.assertAllEqual(self.evaluate(r), [0, 1, 2])
       r = py_builtins.range_(1, constant_op.constant(3))
-      self.assertAllEqual(sess.run(r), [1, 2])
+      self.assertAllEqual(self.evaluate(r), [1, 2])
       r = py_builtins.range_(2, 0, constant_op.constant(-1))
-      self.assertAllEqual(sess.run(r), [2, 1])
+      self.assertAllEqual(self.evaluate(r), [2, 1])
 
   def test_range_tensor_empty_range(self):
-    with self.test_session() as sess:
+    with self.session() as sess:
       r = py_builtins.range_(constant_op.constant(-3))
-      self.assertAllEqual(sess.run(r), [])
+      self.assertAllEqual(self.evaluate(r), [])
       r = py_builtins.range_(5, constant_op.constant(2))
-      self.assertAllEqual(sess.run(r), [])
+      self.assertAllEqual(self.evaluate(r), [])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/operators/slices_test.py b/tensorflow/python/autograph/operators/slices_test.py
index 9e4865b3c66923815338e70d4104c42318e56eb3..d444054fd772cf68b2e7c028adc87b6623ccffba 100644
--- a/tensorflow/python/autograph/operators/slices_test.py
+++ b/tensorflow/python/autograph/operators/slices_test.py
@@ -34,7 +34,7 @@ class SlicesTest(test.TestCase):
 
     with self.cached_session() as sess:
       t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype)
-      self.assertAllEqual(sess.run(t), [[5, 6], [3, 4]])
+      self.assertAllEqual(self.evaluate(t), [[5, 6], [3, 4]])
 
   def test_get_item_tensor_list(self):
     initial_list = constant_op.constant([[1, 2], [3, 4]])
@@ -44,7 +44,7 @@ class SlicesTest(test.TestCase):
         l, 1, slices.GetItemOpts(element_dtype=initial_list.dtype))
 
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4])
+      self.assertAllEqual(self.evaluate(t), [3, 4])
 
   def test_get_item_tensor_string(self):
     initial_str = constant_op.constant('abcd')
@@ -52,14 +52,14 @@ class SlicesTest(test.TestCase):
                         slices.GetItemOpts(element_dtype=initial_str.dtype))
 
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(t), b'b')
+      self.assertEqual(self.evaluate(t), b'b')
 
     initial_list_str = constant_op.constant(['abcd', 'bcde'])
     t = slices.get_item(initial_list_str, 1,
                         slices.GetItemOpts(element_dtype=initial_str.dtype))
 
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(t), b'bcde')
+      self.assertEqual(self.evaluate(t), b'bcde')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/pyct/BUILD b/tensorflow/python/autograph/pyct/BUILD
index ddadc6b96e8eb5417bfa1676ae304f7cbdedd92b..ba8ec271394981ec878473205a8dbbd19d255f3b 100644
--- a/tensorflow/python/autograph/pyct/BUILD
+++ b/tensorflow/python/autograph/pyct/BUILD
@@ -80,7 +80,6 @@ py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -154,7 +153,6 @@ py_test(
     name = "transformer_test",
     srcs = ["transformer_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/autograph/pyct/anno.py b/tensorflow/python/autograph/pyct/anno.py
index 1a52110ef36bbc0888e03cc25b3717822cb75c16..e1f4af46cd7c2e7d25a646ee3f73261c59a1f72a 100644
--- a/tensorflow/python/autograph/pyct/anno.py
+++ b/tensorflow/python/autograph/pyct/anno.py
@@ -63,10 +63,8 @@ class Static(NoValue):
   The enum values are used strictly for documentation purposes.
   """
 
-  # Deprecated - use reaching definitions instead.
   # Symbols
   # These flags are boolean.
-  IS_LOCAL = 'Symbol is local to the function scope being analyzed.'
   IS_PARAM = 'Symbol is a parameter to the function being analyzed.'
 
   # Scopes
@@ -91,6 +89,7 @@ class Static(NoValue):
   DEFINED_VARS_IN = (
       'Symbols defined when entering the node. See reaching_definitions.py.')
   LIVE_VARS_OUT = ('Symbols live when exiting the node. See liveness.py.')
+  LIVE_VARS_IN = ('Symbols live when entering the node. See liveness.py.')
 
 
 FAIL = object()
diff --git a/tensorflow/python/autograph/pyct/ast_util.py b/tensorflow/python/autograph/pyct/ast_util.py
index 7df3b8858c0128de64a928d7daf9db081566d9c6..ea7eca6463a17d43f1a3536ebdd1770cfcf265f7 100644
--- a/tensorflow/python/autograph/pyct/ast_util.py
+++ b/tensorflow/python/autograph/pyct/ast_util.py
@@ -24,6 +24,7 @@ import gast
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.util import tf_inspect
 
 
 class CleanCopier(object):
@@ -311,3 +312,64 @@ def parallel_walk(node, other):
         raise ValueError(
             'inconsistent values for field {}: {} and {}'.format(
                 f, n_child, o_child))
+
+
+class FunctionDefMatcher(gast.NodeVisitor):
+  """Finds nodes that match a given function's signature."""
+
+  def __init__(self, fn):
+    self.fn = fn
+    self.matching_nodes = []
+
+  def _arg_name(self, node):
+    if node is None:
+      return None
+    if isinstance(node, gast.Name):
+      return node.id
+    assert isinstance(node, str)
+    return node
+
+  def _argspec_matches(self, node):
+    arg_spec = tf_inspect.getfullargspec(self.fn)
+
+    node_args = tuple(self._arg_name(arg) for arg in node.args.args)
+    if node_args != tuple(arg_spec.args):
+      return False
+
+    if arg_spec.varargs != self._arg_name(node.args.vararg):
+      return False
+
+    if arg_spec.varkw != self._arg_name(node.args.kwarg):
+      return False
+
+    node_kwonlyargs = tuple(self._arg_name(arg) for arg in node.args.kwonlyargs)
+    if node_kwonlyargs != tuple(arg_spec.kwonlyargs):
+      return False
+
+    return True
+
+  def visit_Lambda(self, node):
+    self.generic_visit(node)
+
+    if self.fn.__name__ != '<lambda>':
+      return
+    if not self._argspec_matches(node):
+      return
+
+    self.matching_nodes.append(node)
+
+  def visit_FunctionDef(self, node):
+    self.generic_visit(node)
+
+    if self.fn.__name__ != node.name:
+      return
+    if not self._argspec_matches(node):
+      return
+
+    self.matching_nodes.append(node)
+
+
+def find_matching_definitions(node, f):
+  matcher = FunctionDefMatcher(f)
+  matcher.visit(node)
+  return tuple(matcher.matching_nodes)
diff --git a/tensorflow/python/autograph/pyct/ast_util_test.py b/tensorflow/python/autograph/pyct/ast_util_test.py
index b1577c466e6e67d6429b5f0eef6916efad16f46b..9fcbbe646c6e558b93fdafb6380ae0a46ee1d60a 100644
--- a/tensorflow/python/autograph/pyct/ast_util_test.py
+++ b/tensorflow/python/autograph/pyct/ast_util_test.py
@@ -22,6 +22,8 @@ import ast
 import collections
 import textwrap
 
+import gast
+
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import compiler
@@ -191,6 +193,107 @@ class AstUtilTest(test.TestCase):
       for _ in ast_util.parallel_walk(node_1, node_3):
         pass
 
+  def assertLambdaNodes(self, matching_nodes, expected_bodies):
+    self.assertEqual(len(matching_nodes), len(expected_bodies))
+    for node in matching_nodes:
+      self.assertIsInstance(node, gast.Lambda)
+      self.assertIn(compiler.ast_to_source(node.body).strip(), expected_bodies)
+
+  def test_find_matching_definitions_lambda(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      f = lambda x: 1
+    """))
+    f = lambda x: x
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertLambdaNodes(nodes, ('(1)',))
+
+  def test_find_matching_definitions_lambda_multiple_matches(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      f = lambda x: 1, lambda x: 2
+    """))
+    f = lambda x: x
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertLambdaNodes(nodes, ('(1)', '(2)'))
+
+  def test_find_matching_definitions_lambda_uses_arg_names(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      f = lambda x: 1, lambda y: 2
+    """))
+    f = lambda x: x
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertLambdaNodes(nodes, ('(1)',))
+
+    f = lambda y: y
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertLambdaNodes(nodes, ('(2)',))
+
+  def assertFunctionDefNodes(self, matching_nodes, expected_bodies):
+    self.assertEqual(len(matching_nodes), len(expected_bodies))
+    for node in matching_nodes:
+      self.assertIsInstance(node, gast.FunctionDef)
+      self.assertIn(compiler.ast_to_source(node.body).strip(), expected_bodies)
+
+  def test_find_matching_definitions_function(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      def f(x):
+        return 1
+    """))
+
+    def f(x):
+      return x
+
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertFunctionDefNodes(nodes, ('return 1',))
+
+  def test_find_matching_definitions_nested_functions_same_name(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      def f(x, *args, **kwargs):
+        def f(x, y):
+          return 1
+        return 2
+    """))
+
+    def f(x, y):
+      return x + y
+
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertFunctionDefNodes(nodes, ('return 1',))
+
+  def test_find_matching_definitions_nested_functions_same_args(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      def g(x):
+        def f(x):
+          return 1
+        return 2
+    """))
+
+    def f(x):
+      return x
+
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertFunctionDefNodes(nodes, ('return 1',))
+
+  def test_find_matching_definitions_multiple_matches(self):
+    node = parser.parse_str(
+        textwrap.dedent("""
+      def f(x):
+        return 1
+      def f(x):
+        return 2
+    """))
+
+    def f(x):
+      return x
+
+    nodes = ast_util.find_matching_definitions(node, f)
+    self.assertFunctionDefNodes(nodes, ('return 1', 'return 2'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/cfg.py b/tensorflow/python/autograph/pyct/cfg.py
index fca0eb62e4c701658e257cd7f3f24b92d55c5280..fdfcd4dcc15b0c6238dcdc3fedef60f2984c33a4 100644
--- a/tensorflow/python/autograph/pyct/cfg.py
+++ b/tensorflow/python/autograph/pyct/cfg.py
@@ -22,6 +22,10 @@ Once built, the CFG itself is immutable, but the values it holds need not be;
 they are usually annotated with information extracted by walking the graph.
 """
 
+# TODO(mdan): The notion of 'statements' below is inaccurate.
+# They should rather be called 'block statements', because they include
+# statements that may have a body, e.g. if and while.
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -675,10 +679,6 @@ class AstToCfg(gast.NodeVisitor):
     self.cfgs[node] = self.builder.build()
     self.builder = self.builder_stack.pop()
 
-  def visit_Lambda(self, node):
-    # TODO(mdan): Treat like FunctionDef? That would be a separate CFG.
-    raise NotImplementedError()
-
   def visit_Return(self, node):
     self._process_exit_statement(node, gast.FunctionDef)
 
@@ -763,9 +763,9 @@ class AstToCfg(gast.NodeVisitor):
 
     self.builder.enter_section(node)
 
-    # TODO(mdan): Strictly speaking, this should be node.target + node.iter.
-    # A blind dataflow analysis would have to process both node.target and
-    # node.iter to properly process read and write access.
+    # Note: Strictly speaking, this should be node.target + node.iter.
+    # However, the activity analysis accounts for this inconsistency,
+    # so dataflow analysis produces the correct values.
     self.builder.enter_loop_section(node, node.iter)
     for stmt in node.body:
       self.visit(stmt)
diff --git a/tensorflow/python/autograph/pyct/cfg_test.py b/tensorflow/python/autograph/pyct/cfg_test.py
index bd82e70f7dea20c2658e31bc09fcc2e68a24ec13..d5870124bcec1989af27949b70e490a7a0899461 100644
--- a/tensorflow/python/autograph/pyct/cfg_test.py
+++ b/tensorflow/python/autograph/pyct/cfg_test.py
@@ -964,6 +964,22 @@ class AstToCfgTest(test.TestCase):
         ),
     )
 
+  def test_lambda_basic(self):
+
+    def test_fn(a):
+      a = lambda b: a + b
+      return a
+
+    graph, = self._build_cfg(test_fn).values()
+
+    self.assertGraphMatches(
+        graph,
+        (
+            ('a', 'a = lambda b: a + b', 'return a'),
+            ('a = lambda b: a + b', 'return a', None),
+        ),
+    )
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/common_transformers/BUILD b/tensorflow/python/autograph/pyct/common_transformers/BUILD
index 5e2f8f3ac0ac4d3f6be65fd9c1ae9d7b9e0a18c3..1106a19de1bfa13e05d112974267c92ffe7dc6cb 100644
--- a/tensorflow/python/autograph/pyct/common_transformers/BUILD
+++ b/tensorflow/python/autograph/pyct/common_transformers/BUILD
@@ -34,6 +34,7 @@ py_test(
     name = "anf_test",
     srcs = ["anf_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_oss"],
     deps = [
         ":common_transformers",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
index ccc7e4ca8fcd365ee06eaaaa13832640b313d856..525d4886dee37c79d4087a293fa9ce5424a74c15 100644
--- a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
+++ b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
@@ -43,6 +43,29 @@ class DummyGensym(object):
     return stem + '_' + str(1000 + self._idx)
 
 
+# These two test functions have to be top-level, not nested, for compatibility
+# with some unknown version of Python 2.7 preceding 2.7.15.  Why?  Because
+# `exec` and nested function definitions _incomaptibly_ change the
+# representation of local variables, such that `exec` inside a nested function
+# definition is a syntax error in that version.  The tuple form of `exec` fixes
+# this problem, but apparently that was introduced in some unknown version of
+# Python that's more recent than at least one version that we wish to be
+# compatible with.
+def exec_test_function():
+  # The point is to test A-normal form conversion of exec
+  # pylint: disable=exec-used
+  exec('computed' + 5 + 'stuff', globals(), locals())
+
+
+def exec_expected_result():
+  # pylint: disable=exec-used
+  tmp_1001 = 'computed' + 5
+  tmp_1002 = tmp_1001 + 'stuff'
+  tmp_1003 = globals()
+  tmp_1004 = locals()
+  exec(tmp_1002, tmp_1003, tmp_1004)
+
+
 class AnfTransformerTest(test.TestCase):
 
   def _simple_source_info(self):
@@ -357,21 +380,7 @@ class AnfTransformerTest(test.TestCase):
     self.assert_body_anfs_as_expected(expected_result, test_function)
 
   def test_exec(self):
-
-    def test_function():
-      # The point is to test A-normal form conversion of exec
-      # pylint: disable=exec-used
-      exec('computed' + 5 + 'stuff', globals(), locals())
-
-    def expected_result():
-      # pylint: disable=exec-used
-      tmp_1001 = 'computed' + 5
-      tmp_1002 = tmp_1001 + 'stuff'
-      tmp_1003 = globals()
-      tmp_1004 = locals()
-      exec(tmp_1002, tmp_1003, tmp_1004)
-
-    self.assert_body_anfs_as_expected(expected_result, test_function)
+    self.assert_body_anfs_as_expected(exec_expected_result, exec_test_function)
 
   def test_simple_while_and_assert(self):
 
diff --git a/tensorflow/python/autograph/pyct/compiler.py b/tensorflow/python/autograph/pyct/compiler.py
index 21281aeb561475db1726ab4c3d80a25622a71ae4..06e66c5b5871d5528bccfcc9fe47268207594ea6 100644
--- a/tensorflow/python/autograph/pyct/compiler.py
+++ b/tensorflow/python/autograph/pyct/compiler.py
@@ -123,26 +123,15 @@ def ast_to_object(nodes,
   compiled_nodes = imp.load_source(module_name, f.name)
 
   # TODO(znado): Clean this up so we don't need to attach it to the namespace.
-  # TODO(znado): This does not work for classes because their methods share a
-  # namespace.
-  # This attaches the source map which is needed for error handling.  Note that
-  # api.to_graph copies this source map into an attribute of the function.
-  #
-  # We need this so the ag_source_map__ variable is available to the call to
-  # rewrite_graph_construction_error in the except block inside each function
-  # that handles graph construction errors.
-  #
   # We cannot get the rewritten function name until it is too late so templating
-  # is hard, and this cleanly fixes the
-  # issues encountered with nested functions because this is attached to the
-  # outermost one.
+  # is hard, and this cleanly fixes the issues encountered with nested functions
+  # because this is attached to the outermost one.
   if include_source_map:
     # TODO(mdan): This name should be decided by the caller.
     source_map_name = 'ag_source_map__'
-    if source_map_name in compiled_nodes.__dict__:
-      raise ValueError('cannot convert %s because is has namespace attribute '
-                       '"%s", which is reserved for AutoGraph.' %
-                       (compiled_nodes, source_map_name))
+    assert source_map_name not in compiled_nodes.__dict__, (
+        'cannot convert %s because is has namespace attribute "%s", which is '
+        'reserved for AutoGraph.') % (compiled_nodes, source_map_name)
     compiled_nodes.__dict__[source_map_name] = source_map
 
   return compiled_nodes, source
diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index eef74599a7d5415b4b05d2f05fb094b1dcd33323..7c819f364fa79d40c0fbb080b3b358b36bfd8c0c 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -29,10 +29,48 @@ import six
 from tensorflow.python.util import tf_inspect
 
 
+# These functions test negative for isinstance(*, types.BuiltinFunctionType)
+# and inspect.isbuiltin, and are generally not visible in globals().
+SPECIAL_BUILTINS = {
+    'dict': dict,
+    'float': float,
+    'int': int,
+    'len': len,
+    'list': list,
+    'print': print,
+    'range': range,
+    'tuple': tuple
+}
+
+if six.PY2:
+  SPECIAL_BUILTINS['xrange'] = xrange
+
+
+def islambda(f):
+  if not tf_inspect.isfunction(f):
+    return False
+  if not hasattr(f, '__name__'):
+    return False
+  return f.__name__ == '<lambda>'
+
+
+def isnamedtuple(f):
+  """Returns True if the argument is a namedtuple-like."""
+  if not (tf_inspect.isclass(f) and issubclass(f, tuple)):
+    return False
+  if not hasattr(f, '_fields'):
+    return False
+  fields = getattr(f, '_fields')
+  if not isinstance(fields, tuple):
+    return False
+  if not all(isinstance(f, str) for f in fields):
+    return False
+  return True
+
+
 def isbuiltin(f):
-  # Note these return false for isinstance(f, types.BuiltinFunctionType) so we
-  # need to specifically check for them.
-  if f in (range, int, float):
+  """Returns True if the argument is a built-in function."""
+  if f in SPECIAL_BUILTINS.values():
     return True
   if isinstance(f, types.BuiltinFunctionType):
     return True
@@ -63,6 +101,58 @@ def getnamespace(f):
   return namespace
 
 
+def getqualifiedname(namespace, object_, max_depth=2):
+  """Returns the name by which a value can be referred to in a given namespace.
+
+  If the object defines a parent module, the function attempts to use it to
+  locate the object.
+
+  This function will recurse inside modules, but it will not search objects for
+  attributes. The recursion depth is controlled by max_depth.
+
+  Args:
+    namespace: Dict[str, Any], the namespace to search into.
+    object_: Any, the value to search.
+    max_depth: Optional[int], a limit to the recursion depth when searching
+        inside modules.
+  Returns: Union[str, None], the fully-qualified name that resolves to the value
+      o, or None if it couldn't be found.
+  """
+  for name, value in namespace.items():
+    # The value may be referenced by more than one symbol, case in which
+    # any symbol will be fine. If the program contains symbol aliases that
+    # change over time, this may capture a symbol that will later point to
+    # something else.
+    # TODO(mdan): Prefer the symbol that matches the value type name.
+    if object_ is value:
+      return name
+
+  # If an object is not found, try to search its parent modules.
+  parent = tf_inspect.getmodule(object_)
+  if (parent is not None and parent is not object_ and
+      parent is not namespace):
+    # No limit to recursion depth because of the guard above.
+    parent_name = getqualifiedname(namespace, parent, max_depth=0)
+    if parent_name is not None:
+      name_in_parent = getqualifiedname(parent.__dict__, object_, max_depth=0)
+      assert name_in_parent is not None, (
+          'An object should always be found in its owner module')
+      return '{}.{}'.format(parent_name, name_in_parent)
+
+  # TODO(mdan): Use breadth-first search and avoid visiting modules twice.
+  if max_depth:
+    # Iterating over a copy prevents "changed size due to iteration" errors.
+    # It's unclear why those occur - suspecting new modules may load during
+    # iteration.
+    for name, value in namespace.copy().items():
+      if tf_inspect.ismodule(value):
+        name_in_module = getqualifiedname(value.__dict__, object_,
+                                          max_depth - 1)
+        if name_in_module is not None:
+          return '{}.{}'.format(name, name_in_module)
+  return None
+
+
 def _get_unbound_function(m):
   # TODO(mdan): Figure out why six.get_unbound_function fails in some cases.
   # The failure case is for tf.keras.Model.
@@ -117,12 +207,15 @@ def getmethodclass(m):
       return m.__class__
 
   # Instance method and class methods: should be bound to a non-null "self".
-  # If self is a class, then it's a class method.
   if hasattr(m, '__self__'):
-    if m.__self__:
-      if tf_inspect.isclass(m.__self__):
-        return m.__self__
-      return type(m.__self__)
+    if m.__self__ is not None:
+      # A fallback allowing methods to be actually bound to a type different
+      # than __self__. This is useful when a strong reference from the method
+      # to the object is not desired, for example when caching is involved.
+      if hasattr(m.__self__, 'ag_self_weakref__'):
+        return m.__self__.ag_self_weakref__()
+
+      return m.__self__
 
   # Class, static and unbound methods: search all defined classes in any
   # namespace. This is inefficient but more robust method.
@@ -159,3 +252,59 @@ def getmethodclass(m):
     raise ValueError('Found too many owners of %s: %s' % (m, owners))
 
   return None
+
+
+class SuperWrapperForDynamicAttrs(object):
+  """A wrapper that supports dynamic attribute lookup on the super object.
+
+  For example, in the following code, `super` incorrectly reports that
+  `super(Bar, b)` lacks the `a` attribute:
+
+    class Foo(object):
+      def __init__(self):
+        self.a = lambda: 1
+
+      def bar(self):
+        return hasattr(self, 'a')
+
+    class Bar(Foo):
+      def bar(self):
+        return super(Bar, self).bar()
+
+
+    b = Bar()
+    print(hasattr(super(Bar, b), 'a'))  # False
+    print(super(Bar, b).bar())          # True
+
+  A practical situation when this tends to happen is Keras model hierarchies
+  that hold references to certain layers, like this:
+
+    class MiniModel(keras.Model):
+
+      def __init__(self):
+        super(MiniModel, self).__init__()
+        self.fc = keras.layers.Dense(1)
+
+      def call(self, inputs, training=True):
+        return self.fc(inputs)
+
+    class DefunnedMiniModel(MiniModel):
+
+      def call(self, inputs, training=True):
+        return super(DefunnedMiniModel, self).call(inputs, training=training)
+
+  A side effect of this wrapper is that all attributes become visible, even
+  those created in the subclass.
+  """
+
+  # TODO(mdan): Investigate why that happens - it may be for a reason.
+  # TODO(mdan): Probably need more overrides to make it look like super.
+
+  def __init__(self, target):
+    self._target = target
+
+  def __getattribute__(self, name):
+    target = object.__getattribute__(self, '_target')
+    if hasattr(target, name):
+      return getattr(target, name)
+    return getattr(target.__self__, name)
diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py
index f3eb027822f53feecb5f51bb45e600f4cde81d81..a2c39056d1b09dbae937915cf17de5c6f55d4886 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py
@@ -18,11 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from functools import wraps
+import collections
+import functools
+import imp
+import types
+import weakref
 
 import six
 
+from tensorflow.python import lib
 from tensorflow.python.autograph.pyct import inspect_utils
+from tensorflow.python.framework import constant_op
 from tensorflow.python.platform import test
 
 
@@ -41,7 +47,7 @@ def wrapping_decorator():
     def replacement(*_):
       return None
 
-    @wraps(f)
+    @functools.wraps(f)
     def wrapper(*args, **kwargs):
       return replacement(*args, **kwargs)
     return wrapper
@@ -90,6 +96,38 @@ def free_factory():
 
 class InspectUtilsTest(test.TestCase):
 
+  def test_islambda(self):
+    def test_fn():
+      pass
+
+    self.assertTrue(inspect_utils.islambda(lambda x: x))
+    self.assertFalse(inspect_utils.islambda(test_fn))
+
+  def test_isnamedtuple(self):
+    nt = collections.namedtuple('TestNamedTuple', ['a', 'b'])
+
+    class NotANamedTuple(tuple):
+      pass
+
+    self.assertTrue(inspect_utils.isnamedtuple(nt))
+    self.assertFalse(inspect_utils.isnamedtuple(NotANamedTuple))
+
+  def test_isnamedtuple_confounder(self):
+    """This test highlights false positives when detecting named tuples."""
+
+    class NamedTupleLike(tuple):
+      _fields = ('a', 'b')
+
+    self.assertTrue(inspect_utils.isnamedtuple(NamedTupleLike))
+
+  def test_isnamedtuple_subclass(self):
+    """This test highlights false positives when detecting named tuples."""
+
+    class NamedTupleSubclass(collections.namedtuple('Test', ['a', 'b'])):
+      pass
+
+    self.assertTrue(inspect_utils.isnamedtuple(NamedTupleSubclass))
+
   def test_getnamespace_globals(self):
     ns = inspect_utils.getnamespace(factory)
     self.assertEqual(ns['free_function'], free_function)
@@ -127,6 +165,32 @@ class InspectUtilsTest(test.TestCase):
     self.assertEqual(ns['closed_over_primitive'], closed_over_primitive)
     self.assertTrue('local_var' not in ns)
 
+  def test_getqualifiedname(self):
+    foo = object()
+    qux = imp.new_module('quxmodule')
+    bar = imp.new_module('barmodule')
+    baz = object()
+    bar.baz = baz
+
+    ns = {
+        'foo': foo,
+        'bar': bar,
+        'qux': qux,
+    }
+
+    self.assertIsNone(inspect_utils.getqualifiedname(ns, inspect_utils))
+    self.assertEqual(inspect_utils.getqualifiedname(ns, foo), 'foo')
+    self.assertEqual(inspect_utils.getqualifiedname(ns, bar), 'bar')
+    self.assertEqual(inspect_utils.getqualifiedname(ns, baz), 'bar.baz')
+
+  def test_getqualifiedname_finds_via_parent_module(self):
+    # TODO(mdan): This test is vulnerable to change in the lib module.
+    # A better way to forge modules should be found.
+    self.assertEqual(
+        inspect_utils.getqualifiedname(
+            lib.__dict__, lib.io.file_io.FileIO, max_depth=1),
+        'io.file_io.FileIO')
+
   def test_getmethodclass(self):
 
     self.assertEqual(
@@ -156,16 +220,16 @@ class InspectUtilsTest(test.TestCase):
     test_obj = TestClass()
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.member_function),
-        TestClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.decorated_member),
-        TestClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.fn_decorated_member),
-        TestClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
-        TestClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.static_method),
         TestClass)
@@ -214,16 +278,16 @@ class InspectUtilsTest(test.TestCase):
     test_obj = LocalClass()
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.member_function),
-        LocalClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.decorated_member),
-        LocalClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.fn_decorated_member),
-        LocalClass)
+        test_obj)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
-        LocalClass)
+        test_obj)
 
   def test_getmethodclass_callables(self):
     class TestCallable(object):
@@ -234,6 +298,25 @@ class InspectUtilsTest(test.TestCase):
     c = TestCallable()
     self.assertEqual(inspect_utils.getmethodclass(c), TestCallable)
 
+  def test_getmethodclass_weakref_mechanism(self):
+    test_obj = TestClass()
+
+    class WeakrefWrapper(object):
+
+      def __init__(self):
+        self.ag_self_weakref__ = weakref.ref(test_obj)
+
+    def test_fn(self):
+      return self
+
+    bound_method = types.MethodType(test_fn, WeakrefWrapper())
+    self.assertEqual(inspect_utils.getmethodclass(bound_method), test_obj)
+
+  def test_getmethodclass_no_bool_conversion(self):
+
+    tensor = constant_op.constant([1])
+    self.assertEqual(inspect_utils.getmethodclass(tensor.get_shape), tensor)
+
   def test_getdefiningclass(self):
     class Superclass(object):
 
@@ -272,6 +355,38 @@ class InspectUtilsTest(test.TestCase):
     self.assertTrue(inspect_utils.isbuiltin(len))
     self.assertFalse(inspect_utils.isbuiltin(function_decorator))
 
+  def test_super_wrapper_for_dynamic_attrs(self):
+
+    a = object()
+    b = object()
+
+    class Base(object):
+
+      def __init__(self):
+        self.a = a
+
+    class Subclass(Base):
+
+      def __init__(self):
+        super(Subclass, self).__init__()
+        self.b = b
+
+    base = Base()
+    sub = Subclass()
+
+    sub_super = super(Subclass, sub)
+    sub_super_wrapped = inspect_utils.SuperWrapperForDynamicAttrs(sub_super)
+
+    self.assertIs(base.a, a)
+    self.assertIs(sub.a, a)
+
+    self.assertFalse(hasattr(sub_super, 'a'))
+    self.assertIs(sub_super_wrapped.a, a)
+
+    # TODO(mdan): Is this side effect harmful? Can it be avoided?
+    # Note that `b` was set in `Subclass.__init__`.
+    self.assertIs(sub_super_wrapped.b, b)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py
index 63686350d518d806578bf6c49d108ad764ad0bfe..39fc1a7ed05c06da89efe505e439b307badb4b4e 100644
--- a/tensorflow/python/autograph/pyct/parser.py
+++ b/tensorflow/python/autograph/pyct/parser.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 import textwrap
 
 import gast
+import six
 
 from tensorflow.python.util import tf_inspect
 
@@ -31,27 +32,77 @@ from tensorflow.python.util import tf_inspect
 def parse_entity(entity):
   """Returns the AST of given entity."""
   source = tf_inspect.getsource(entity)
+
+  def fail(comment):
+    raise ValueError(
+        'Failed to parse source code of {}, which Python reported as:\n{}\n'
+        '{}'.format(entity, source, comment))
+
   # Comments and multiline strings can appear at arbitrary indentation levels,
   # causing textwrap.dedent to not correctly dedent source code.
   # TODO(b/115884650): Automatic handling of comments/multiline strings.
   source = textwrap.dedent(source)
+
   try:
     return parse_str(source), source
+
   except IndentationError:
-    # Because we are parsing the source code of entities that have already
-    # successfully parsed once, any IndentationErrors are guaranteed to be
-    # caused by insufficient dedenting.
-    raise ValueError(
-        'Failed to dedent prior to parsing source code. If you have comments '
-        'or multiline strings in your code, try indenting them. '
-        'Multiline strings can be rewritten using textwrap.dedent.\n'
-        'Offending source code: \n %s' % source)
+    # The text below lists the causes of this error known to us. There may
+    # be more.
+    fail('This may be caused by multiline strings or comments not indented at'
+         'the same level as the code.')
+
+  except SyntaxError as e:
+    if not tf_inspect.isfunction(entity) or entity.__name__ != '<lambda>':
+      raise
+
+    # Certain entities, like lambdas, only hold the raw code lines which defined
+    # them, which may include surrounding tokens and may be syntactically
+    # invalid out of context. For example:
+    #
+    #     l = (
+    #         lambda x: x,)[0]
+    #
+    # will have the dedented source "lambda x: x,)[0]"
+    # Here we make an attempt to stip away the garbage by looking at the
+    # information in the syntax error.
+    lines = source.split('\n')
+    lineno, offset = e.lineno, e.offset  # 1-based
+
+    # Give up if there's nothing we can chip away.
+    if len(lines) == lineno and len(lines[-1]) == offset:
+      fail('If this is a lambda function, the error may be avoided by creating'
+           ' the lambda in a standalone statement.')
+
+    # Drop all lines following the error location
+    # TODO(mdan): What's with the pylint errors?
+    lines = lines[:lineno]  # pylint:disable=invalid-slice-index
+    # Drop all characters following the error location
+    lines[-1] = lines[-1][:offset - 1]  # pylint:disable=invalid-slice-index
+    new_source = '\n'.join(lines)
+
+    try:
+      return parse_str(new_source), new_source
+    except SyntaxError as e:
+      fail('If this is a lambda function, the error may be avoided by creating'
+           ' the lambda in a standalone statement. Tried to strip down the'
+           ' source to:\n{}\nBut that did not work.'.format(new_source))
 
 
 def parse_str(src):
   """Returns the AST of given piece of code."""
   # TODO(mdan): This should exclude the module things are autowrapped in.
-  return gast.parse(src)
+
+  if six.PY2 and '.print(' in src:
+    # This special treatment is required because gast.parse is not aware of
+    # whether print_function was present in the original context.
+    src = 'from __future__ import print_function\n' + src
+    parsed_module = gast.parse(src)
+    parsed_module.body = parsed_module.body[1:]
+  else:
+    parsed_module = gast.parse(src)
+
+  return parsed_module
 
 
 def parse_expression(src):
diff --git a/tensorflow/python/autograph/pyct/qual_names.py b/tensorflow/python/autograph/pyct/qual_names.py
index 334cbd7d38ec3d0d7b3468f8f64153ecd2df1038..6ad6199acf70842d7427e59910b27237e88f8654 100644
--- a/tensorflow/python/autograph/pyct/qual_names.py
+++ b/tensorflow/python/autograph/pyct/qual_names.py
@@ -99,6 +99,9 @@ class QN(object):
   def is_symbol(self):
     return isinstance(self.qn[0], str)
 
+  def is_simple(self):
+    return len(self.qn) <= 1
+
   def is_composite(self):
     return len(self.qn) > 1
 
diff --git a/tensorflow/python/autograph/pyct/static_analysis/BUILD b/tensorflow/python/autograph/pyct/static_analysis/BUILD
index 4a4ccdcbd15a592d4a6d2713c192d60e8dc76492..5e260c5730ae855397f3f94664c0ccb409dcbba1 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/BUILD
+++ b/tensorflow/python/autograph/pyct/static_analysis/BUILD
@@ -38,7 +38,6 @@ py_test(
     name = "activity_test",
     srcs = ["activity_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":static_analysis",
         "//tensorflow/python:client_testlib",
@@ -51,7 +50,6 @@ py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":static_analysis",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity.py b/tensorflow/python/autograph/pyct/static_analysis/activity.py
index 086eda7574a4c0846ba35a8fcbe897be6bdccc1a..4359e0a2682f0f6818a0c2e0aaffeaa12718c514 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/activity.py
@@ -42,10 +42,20 @@ class Scope(object):
   Note that scopes do not necessarily align with Python's scopes. For example,
   the body of an if statement may be considered a separate scope.
 
+  Caution - the AST references held by this object are weak.
+
   Attributes:
-    modified: identifiers modified in this scope
-    created: identifiers created in this scope
-    used: identifiers referenced in this scope
+    modified: Set[qual_names.QN], identifiers modified in this scope
+    read: Set[qual_names.QN], identifiers read in this scope
+    deleted: Set[qual_names.QN], identifiers deleted in this scope
+    params: WeakValueDictionary[qual_names.QN, ast.Node], function arguments
+      visible in this scope, mapped to the function node that defines them
+
+  Note - simple statements may never delete and modify a symbol at the same
+  time. However, compound ones like if statements can. In that latter case, it's
+  undefined whether the symbol is actually modified or deleted upon statement
+  exit. Certain analyses like reaching definitions need to be careful about
+  this.
   """
 
   def __init__(self, parent, isolated=True, add_unknown_symbols=False):
@@ -54,7 +64,8 @@ class Scope(object):
     Args:
       parent: A Scope or None.
       isolated: Whether the scope is isolated, that is, whether variables
-          created in this scope should be visible to the parent scope.
+          modified in this scope should be considered modified in the parent
+          scope.
       add_unknown_symbols: Whether to handle attributed and subscripts
           without having first seen the base name.
           E.g., analyzing the statement 'x.y = z' without first having seen 'x'.
@@ -63,22 +74,22 @@ class Scope(object):
     self.parent = parent
     self.add_unknown_symbols = add_unknown_symbols
     self.modified = set()
-    # TODO(mdan): Completely remove this.
-    self.created = set()
-    self.used = set()
-    self.params = {}
-    self.returned = set()
+    self.read = set()
+    self.deleted = set()
+    self.params = weakref.WeakValueDictionary()
+
+  @property
+  def affects_parent(self):
+    return not self.isolated and self.parent is not None
 
-  # TODO(mdan): Rename to `locals`
   @property
   def referenced(self):
-    if not self.isolated and self.parent is not None:
-      return self.used | self.parent.referenced
-    return self.used
+    if self.affects_parent:
+      return self.read | self.parent.referenced
+    return self.read
 
   def __repr__(self):
-    return 'Scope{r=%s, c=%s, w=%s}' % (tuple(self.used), tuple(self.created),
-                                        tuple(self.modified))
+    return 'Scope{r=%s, w=%s}' % (tuple(self.read), tuple(self.modified))
 
   def copy_from(self, other):
     """Recursively copies the contents of this scope from another scope."""
@@ -88,10 +99,8 @@ class Scope(object):
       self.parent.copy_from(other.parent)
     self.isolated = other.isolated
     self.modified = copy.copy(other.modified)
-    self.created = copy.copy(other.created)
-    self.used = copy.copy(other.used)
+    self.read = copy.copy(other.read)
     self.params = copy.copy(other.params)
-    self.returned = copy.copy(other.returned)
 
   @classmethod
   def copy_of(cls, other):
@@ -109,60 +118,35 @@ class Scope(object):
     if other.parent is not None:
       self.parent.merge_from(other.parent)
     self.modified |= other.modified
-    self.created |= other.created
-    self.used |= other.used
+    self.read |= other.read
     self.params.update(other.params)
-    self.returned |= other.returned
-
-  def has(self, name):
-    if name in self.modified:
-      return True
-    elif self.parent is not None:
-      return self.parent.has(name)
-    return False
 
   def mark_read(self, name):
-    self.used.add(name)
-    if self.parent is not None and name not in self.created:
+    self.read.add(name)
+    if self.parent is not None and name not in self.params:
       self.parent.mark_read(name)
 
+  def mark_modified(self, name):
+    self.modified.add(name)
+    if self.affects_parent:
+      self.parent.mark_modified(name)
+
+  def mark_deleted(self, name):
+    self.deleted.add(name)
+
   def mark_param(self, name, owner):
     # Assumption: all AST nodes have the same life span. This lets us use
     # a weak reference to mark the connection between a symbol node and the
     # function node whose argument that symbol is.
-    self.params[name] = weakref.ref(owner)
+    self.params[name] = owner
 
-  def mark_creation(self, name, writes_create_symbol=False):
-    """Mark a qualified name as created."""
-    if name.is_composite():
-      parent = name.parent
-      if not writes_create_symbol:
-        return
-      else:
-        if not self.has(parent):
-          if self.add_unknown_symbols:
-            self.mark_read(parent)
-          else:
-            raise ValueError('Unknown symbol "%s".' % parent)
-    self.created.add(name)
-
-  def mark_write(self, name):
-    """Marks the given symbol as modified in the current scope."""
-    self.modified.add(name)
-    if self.isolated:
-      self.mark_creation(name)
-    else:
-      if self.parent is None:
-        self.mark_creation(name)
-      else:
-        if not self.parent.has(name):
-          self.mark_creation(name)
-        self.parent.mark_write(name)
 
-  def mark_returned(self, name):
-    self.returned.add(name)
-    if not self.isolated and self.parent is not None:
-      self.parent.mark_returned(name)
+class _Lambda(object):
+
+  no_root = True
+
+  def __init__(self):
+    self.args = set()
 
 
 class ActivityAnalyzer(transformer.Base):
@@ -178,8 +162,11 @@ class ActivityAnalyzer(transformer.Base):
   def __init__(self, context, parent_scope=None, add_unknown_symbols=False):
     super(ActivityAnalyzer, self).__init__(context)
     self.scope = Scope(parent_scope, None, add_unknown_symbols)
-    self._in_return_statement = False
+
+    # Note: all these flags crucially rely on the respective nodes are
+    # leaves in the AST, that is, they cannot contain other statements.
     self._in_aug_assign = False
+    self._in_function_def_args = False
 
   @property
   def _in_constructor(self):
@@ -197,38 +184,49 @@ class ActivityAnalyzer(transformer.Base):
         return True
     return False
 
-  def _track_symbol(self,
-                    node,
-                    composite_writes_alter_parent=False,
-                    writes_create_symbol=False):
+  def _track_symbol(self, node, composite_writes_alter_parent=False):
     # A QN may be missing when we have an attribute (or subscript) on a function
     # call. Example: a().b
     if not anno.hasanno(node, anno.Basic.QN):
       return
     qn = anno.getanno(node, anno.Basic.QN)
 
+    # When inside a lambda, ignore any of the lambda's arguments.
+    # This includes attributes or slices of those arguments.
+    for l in self.state[_Lambda]:
+      if qn in l.args:
+        return
+      if qn.owner_set & set(l.args):
+        return
+
     if isinstance(node.ctx, gast.Store):
-      self.scope.mark_write(qn)
+      self.scope.mark_modified(qn)
       if qn.is_composite and composite_writes_alter_parent:
-        self.scope.mark_write(qn.parent)
-      if writes_create_symbol:
-        self.scope.mark_creation(qn, writes_create_symbol=True)
+        self.scope.mark_modified(qn.parent)
       if self._in_aug_assign:
         self.scope.mark_read(qn)
     elif isinstance(node.ctx, gast.Load):
       self.scope.mark_read(qn)
     elif isinstance(node.ctx, gast.Param):
-      # Param contexts appear in function defs, so they have the meaning of
-      # defining a variable.
-      self.scope.mark_write(qn)
-      self.scope.mark_param(qn, self.enclosing_entities[-1])
+      if self._in_function_def_args:
+        # In function defs have the meaning of defining a variable.
+        self.scope.mark_modified(qn)
+        self.scope.mark_param(qn, self.enclosing_entities[-1])
+      elif self.state[_Lambda].level:
+        # In lambdas, they are tracked separately.
+        self.state[_Lambda].args.add(qn)
+      else:
+        # TODO(mdan): Is this case possible at all?
+        raise NotImplementedError(
+            'Param "{}" outside a function arguments or lambda.'.format(qn))
+    elif isinstance(node.ctx, gast.Del):
+      # The read matches the Python semantics - attempting to delete an
+      # undefined symbol is illegal.
+      self.scope.mark_read(qn)
+      self.scope.mark_deleted(qn)
     else:
-      raise ValueError('Unknown context %s for node %s.' % (type(node.ctx), qn))
-
-    anno.setanno(node, NodeAnno.IS_LOCAL, self.scope.has(qn))
-
-    if self._in_return_statement:
-      self.scope.mark_returned(qn)
+      raise ValueError('Unknown context {} for node "{}".'.format(
+          type(node.ctx), qn))
 
   def _enter_scope(self, isolated):
     self.scope = Scope(self.scope, isolated=isolated)
@@ -243,14 +241,17 @@ class ActivityAnalyzer(transformer.Base):
     self._exit_scope()
     return node
 
+  def visit_nonlocal(self, node):
+    raise NotImplementedError()
+
+  def visit_global(self, node):
+    raise NotImplementedError()
+
   def visit_Expr(self, node):
     return self._process_statement(node)
 
   def visit_Return(self, node):
-    self._in_return_statement = True
-    node = self._process_statement(node)
-    self._in_return_statement = False
-    return node
+    return self._process_statement(node)
 
   def visit_Assign(self, node):
     return self._process_statement(node)
@@ -263,6 +264,9 @@ class ActivityAnalyzer(transformer.Base):
     self._in_aug_assign = False
     return node
 
+  def visit_Delete(self, node):
+    return self._process_statement(node)
+
   def visit_Name(self, node):
     node = self.generic_visit(node)
     self._track_symbol(node)
@@ -271,8 +275,7 @@ class ActivityAnalyzer(transformer.Base):
   def visit_Attribute(self, node):
     node = self.generic_visit(node)
     if self._in_constructor and self._node_sets_self_attribute(node):
-      self._track_symbol(
-          node, composite_writes_alter_parent=True, writes_create_symbol=True)
+      self._track_symbol(node, composite_writes_alter_parent=True)
     else:
       self._track_symbol(node)
     return node
@@ -328,6 +331,13 @@ class ActivityAnalyzer(transformer.Base):
       self.scope.merge_from(after_child)
     return parent
 
+  def visit_Lambda(self, node):
+    assert not self._in_function_def_args
+    self.state[_Lambda].enter()
+    node = self.generic_visit(node)
+    self.state[_Lambda].exit()
+    return node
+
   def visit_arguments(self, node):
     return self._process_statement(node)
 
@@ -336,13 +346,16 @@ class ActivityAnalyzer(transformer.Base):
     # of its name, along with the usage of any decorator accompany it.
     self._enter_scope(False)
     node.decorator_list = self.visit_block(node.decorator_list)
-    self.scope.mark_write(qual_names.QN(node.name))
+    self.scope.mark_modified(qual_names.QN(node.name))
     anno.setanno(node, anno.Static.SCOPE, self.scope)
     self._exit_scope()
 
     # A separate Scope tracks the actual function definition.
     self._enter_scope(True)
+    assert not (self._in_function_def_args or self.state[_Lambda].level)
+    self._in_function_def_args = True
     node.args = self.visit(node.args)
+    self._in_function_def_args = False
 
     # Track the body separately. This is for compatibility reasons, it may not
     # be strictly needed.
diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
index d4a6ce8ac3a99c90aeb2d9cdc6d45f1850abcd6d..997d9a8aff111dfb0c223840da642ce8b2f138ce 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
@@ -32,62 +32,63 @@ from tensorflow.python.platform import test
 
 class ScopeTest(test.TestCase):
 
+  def assertMissing(self, qn, scope):
+    self.assertNotIn(qn, scope.read)
+    self.assertNotIn(qn, scope.modified)
+
+  def assertReadOnly(self, qn, scope):
+    self.assertIn(qn, scope.read)
+    self.assertNotIn(qn, scope.modified)
+
+  def assertWriteOnly(self, qn, scope):
+    self.assertNotIn(qn, scope.read)
+    self.assertIn(qn, scope.modified)
+
+  def assertReadWrite(self, qn, scope):
+    self.assertIn(qn, scope.read)
+    self.assertIn(qn, scope.modified)
+
   def test_basic(self):
     scope = activity.Scope(None)
-    self.assertFalse(scope.has(QN('foo')))
+    self.assertMissing(QN('foo'), scope)
 
     scope.mark_read(QN('foo'))
-    self.assertFalse(scope.has(QN('foo')))
-
-    scope.mark_write(QN('foo'))
-    self.assertTrue(scope.has(QN('foo')))
+    self.assertReadOnly(QN('foo'), scope)
 
-    scope.mark_read(QN('bar'))
-    self.assertFalse(scope.has(QN('bar')))
+    scope.mark_modified(QN('foo'))
+    self.assertReadWrite(QN('foo'), scope)
 
   def test_copy_from(self):
     scope = activity.Scope(None)
-    scope.mark_write(QN('foo'))
-
+    scope.mark_modified(QN('foo'))
     other = activity.Scope(None)
     other.copy_from(scope)
 
-    self.assertTrue(QN('foo') in other.modified)
+    self.assertWriteOnly(QN('foo'), other)
 
-    scope.mark_write(QN('bar'))
+    scope.mark_modified(QN('bar'))
     scope.copy_from(other)
 
-    self.assertFalse(QN('bar') in scope.modified)
+    self.assertMissing(QN('bar'), scope)
 
-    scope.mark_write(QN('bar'))
+    scope.mark_modified(QN('bar'))
     scope.merge_from(other)
 
-    self.assertTrue(QN('bar') in scope.modified)
-    self.assertFalse(QN('bar') in other.modified)
+    self.assertWriteOnly(QN('bar'), scope)
+    self.assertMissing(QN('bar'), other)
 
   def test_copy_of(self):
     scope = activity.Scope(None)
     scope.mark_read(QN('foo'))
+    other = activity.Scope.copy_of(scope)
 
-    self.assertTrue(QN('foo') in activity.Scope.copy_of(scope).used)
+    self.assertReadOnly(QN('foo'), other)
 
     child_scope = activity.Scope(scope)
     child_scope.mark_read(QN('bar'))
+    other = activity.Scope.copy_of(child_scope)
 
-    self.assertTrue(QN('bar') in activity.Scope.copy_of(child_scope).used)
-
-  def test_nesting(self):
-    scope = activity.Scope(None)
-    scope.mark_write(QN('foo'))
-    scope.mark_read(QN('bar'))
-
-    child = activity.Scope(scope)
-    self.assertTrue(child.has(QN('foo')))
-    self.assertTrue(scope.has(QN('foo')))
-
-    child.mark_write(QN('bar'))
-    self.assertTrue(child.has(QN('bar')))
-    self.assertFalse(scope.has(QN('bar')))
+    self.assertReadOnly(QN('bar'), other)
 
   def test_referenced(self):
     scope = activity.Scope(None)
@@ -123,25 +124,6 @@ class ActivityAnalyzerTest(test.TestCase):
     node = activity.resolve(node, entity_info)
     return node, entity_info
 
-  def test_local_markers(self):
-
-    def test_fn(a):  # pylint:disable=unused-argument
-      b = c  # pylint:disable=undefined-variable
-      while b > 0:
-        b -= 1
-      return b
-
-    node, _ = self._parse_and_analyze(test_fn)
-    self.assertFalse(
-        anno.getanno(node.body[0].body[0].value,
-                     NodeAnno.IS_LOCAL))  # c in b = c
-    self.assertTrue(
-        anno.getanno(node.body[0].body[1].test.left,
-                     NodeAnno.IS_LOCAL))  # b in b > 0
-    self.assertTrue(
-        anno.getanno(node.body[0].body[2].value,
-                     NodeAnno.IS_LOCAL))  # b in return b
-
   def assertSymbolSetsAre(self, expected, actual, name):
     expected = set(expected)
     actual = set(str(s) for s in actual)
@@ -153,12 +135,10 @@ class ActivityAnalyzerTest(test.TestCase):
         '  Extra:    %s\n' % (name.upper(), expected, actual,
                               expected - actual, actual - expected))
 
-  def assertScopeIsRmc(self, scope, used, modified, created):
+  def assertScopeIs(self, scope, used, modified):
     """Assert the scope contains specific used, modified & created variables."""
-    self.assertSymbolSetsAre(used, scope.used, 'read')
+    self.assertSymbolSetsAre(used, scope.read, 'read')
     self.assertSymbolSetsAre(modified, scope.modified, 'modified')
-    # Created is deprecated, we're no longer verifying it.
-    # self.assertSymbolSetsAre(created, scope.created, 'created')
 
   def test_print_statement(self):
 
@@ -181,7 +161,7 @@ class ActivityAnalyzerTest(test.TestCase):
       print_args_scope = anno.getanno(print_node, NodeAnno.ARGS_SCOPE)
     # We basically need to detect which variables are captured by the call
     # arguments.
-    self.assertScopeIsRmc(print_args_scope, ('a', 'b'), (), ())
+    self.assertScopeIs(print_args_scope, ('a', 'b'), ())
 
   def test_call_args(self):
 
@@ -195,8 +175,8 @@ class ActivityAnalyzerTest(test.TestCase):
     call_node = node.body[0].body[2].value
     # We basically need to detect which variables are captured by the call
     # arguments.
-    self.assertScopeIsRmc(
-        anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'b'), (), ())
+    self.assertScopeIs(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'b'), ())
 
   def test_call_args_attributes(self):
 
@@ -210,12 +190,8 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     call_node = node.body[0].body[1].value
-    self.assertScopeIsRmc(
-        anno.getanno(call_node, NodeAnno.ARGS_SCOPE),
-        ('a', 'a.b', 'a.c'),
-        (),
-        (),
-    )
+    self.assertScopeIs(
+        anno.getanno(call_node, NodeAnno.ARGS_SCOPE), ('a', 'a.b', 'a.c'), ())
 
   def test_call_args_subscripts(self):
 
@@ -230,12 +206,9 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     call_node = node.body[0].body[2].value
-    self.assertScopeIsRmc(
+    self.assertScopeIs(
         anno.getanno(call_node, NodeAnno.ARGS_SCOPE),
-        ('a', 'a[0]', 'a[b]', 'b'),
-        (),
-        (),
-    )
+        ('a', 'a[0]', 'a[b]', 'b'), ())
 
   def test_while(self):
 
@@ -248,14 +221,13 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     while_node = node.body[0].body[1]
-    self.assertScopeIsRmc(
-        anno.getanno(while_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'),
-        ('c',))
-    self.assertScopeIsRmc(
+    self.assertScopeIs(
+        anno.getanno(while_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'))
+    self.assertScopeIs(
         anno.getanno(while_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
-        ('b', 'c'), ('a', 'b', 'c'))
-    self.assertScopeIsRmc(
-        anno.getanno(while_node, NodeAnno.COND_SCOPE), ('b',), (), ())
+        ('b', 'c'))
+    self.assertScopeIs(
+        anno.getanno(while_node, NodeAnno.COND_SCOPE), ('b',), ())
 
   def test_for(self):
 
@@ -268,11 +240,11 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     for_node = node.body[0].body[1]
-    self.assertScopeIsRmc(
-        anno.getanno(for_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'), ('c',))
-    self.assertScopeIsRmc(
+    self.assertScopeIs(
+        anno.getanno(for_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'))
+    self.assertScopeIs(
         anno.getanno(for_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
-        ('b', 'c', '_'), ('a', 'b', 'c', '_'))
+        ('b', 'c', '_'))
 
   def test_if(self):
 
@@ -289,18 +261,16 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     if_node = node.body[0].body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('x', 'y', 'z'),
-        ('y', 'z'))
-    # TODO(mdan): Double check: is it ok to not mark a local symbol as not read?
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.BODY_SCOPE).parent, ('x', 'z', 'u'),
-        ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
-    self.assertScopeIsRmc(
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('x', 'y', 'z'))
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE).parent, ('x', 'y', 'z', 'u'),
+        ('x', 'y', 'z', 'u'))
+    self.assertScopeIs(
         anno.getanno(if_node, NodeAnno.ORELSE_SCOPE), ('x', 'y'),
-        ('x', 'y', 'u'), ('y', 'u'))
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE).parent, ('x', 'z', 'u'),
+        ('x', 'y', 'u'))
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE).parent,
         ('x', 'y', 'z', 'u'), ('x', 'y', 'z', 'u'))
 
   def test_if_attributes(self):
@@ -316,24 +286,14 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     if_node = node.body[0].body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.BODY_SCOPE),
-        ('a', 'a.c'),
-        ('a.b', 'd'),
-        ('d',),
-    )
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE),
-        ('a', 'a.c'),
-        ('a.b', 'd'),
-        ('d',),
-    )
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.BODY_SCOPE).parent,
-        ('a', 'a.c', 'd'),
-        ('a.b', 'd'),
-        ('a', 'd'),
-    )
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('a', 'a.c'), ('a.b', 'd'))
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE), ('a', 'a.c'),
+        ('a.b', 'd'))
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE).parent, ('a', 'a.c', 'd'),
+        ('a.b', 'd'))
 
   def test_if_subscripts(self):
 
@@ -348,25 +308,15 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     if_node = node.body[0].body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.BODY_SCOPE),
-        ('a', 'b', 'c', 'a[c]'),
-        ('a[b]', 'd'),
-        ('d',),
-    )
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('a', 'b', 'c', 'a[c]'),
+        ('a[b]', 'd'))
     # TODO(mdan): Should subscript writes (a[0] = 1) be considered to read "a"?
-    self.assertScopeIsRmc(
-        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE),
-        ('a', 'e'),
-        ('a[0]', 'd'),
-        ('d',),
-    )
-    self.assertScopeIsRmc(
+    self.assertScopeIs(
+        anno.getanno(if_node, NodeAnno.ORELSE_SCOPE), ('a', 'e'), ('a[0]', 'd'))
+    self.assertScopeIs(
         anno.getanno(if_node, NodeAnno.ORELSE_SCOPE).parent,
-        ('a', 'b', 'c', 'd', 'e', 'a[c]'),
-        ('d', 'a[b]', 'a[0]'),
-        ('a', 'b', 'c', 'd', 'e'),
-    )
+        ('a', 'b', 'c', 'd', 'e', 'a[c]'), ('d', 'a[b]', 'a[0]'))
 
   def test_nested_if(self):
 
@@ -380,12 +330,10 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     inner_if_node = node.body[0].body[0].body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(inner_if_node, NodeAnno.BODY_SCOPE), ('b',), ('a',),
-        ('a',))
-    self.assertScopeIsRmc(
-        anno.getanno(inner_if_node, NodeAnno.ORELSE_SCOPE), ('b',), ('a',),
-        ('a',))
+    self.assertScopeIs(
+        anno.getanno(inner_if_node, NodeAnno.BODY_SCOPE), ('b',), ('a',))
+    self.assertScopeIs(
+        anno.getanno(inner_if_node, NodeAnno.ORELSE_SCOPE), ('b',), ('a',))
 
   def test_nested_function(self):
 
@@ -404,11 +352,8 @@ class ActivityAnalyzerTest(test.TestCase):
     node, _ = self._parse_and_analyze(test_fn)
     fn_def_node = node.body[0].body[0]
 
-    self.assertScopeIsRmc(
-        anno.getanno(fn_def_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('y',), (
-            'x',
-            'y',
-        ))
+    self.assertScopeIs(
+        anno.getanno(fn_def_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('y',))
 
   def test_constructor_attributes(self):
 
@@ -420,12 +365,9 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(TestClass)
     init_node = node.body[0].body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(init_node, NodeAnno.BODY_SCOPE),
-        ('self', 'a', 'self.b'),
-        ('self', 'self.b', 'self.b.c'),
-        ('self', 'a', 'self.b'),
-    )
+    self.assertScopeIs(
+        anno.getanno(init_node, NodeAnno.BODY_SCOPE), ('self', 'a', 'self.b'),
+        ('self', 'self.b', 'self.b.c'))
 
   def test_aug_assign_subscripts(self):
 
@@ -434,12 +376,8 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     fn_node = node.body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(fn_node, NodeAnno.BODY_SCOPE),
-        ('a', 'a[0]'),
-        ('a[0]',),
-        ('a',),
-    )
+    self.assertScopeIs(
+        anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('a', 'a[0]'), ('a[0]',))
 
   def test_return_vars_are_read(self):
 
@@ -448,16 +386,7 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     fn_node = node.body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(fn_node, NodeAnno.BODY_SCOPE),
-        ('c',),
-        (),
-        (
-            'a',
-            'b',
-            'c',
-        ),
-    )
+    self.assertScopeIs(anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('c',), ())
 
   def test_aug_assign(self):
 
@@ -466,12 +395,8 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     fn_node = node.body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(fn_node, NodeAnno.BODY_SCOPE),
-        ('a', 'b'),
-        ('a'),
-        ('a', 'b'),
-    )
+    self.assertScopeIs(
+        anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('a', 'b'), ('a'))
 
   def test_aug_assign_rvalues(self):
 
@@ -485,23 +410,67 @@ class ActivityAnalyzerTest(test.TestCase):
 
     node, _ = self._parse_and_analyze(test_fn)
     fn_node = node.body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(fn_node, NodeAnno.BODY_SCOPE),
-        ('foo', 'x'),
-        (),
-        ('x',),
-    )
+    self.assertScopeIs(
+        anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('foo', 'x'), ())
 
-  def test_params_created(self):
+  def test_params(self):
 
     def test_fn(a, b):  # pylint: disable=unused-argument
       return b
 
     node, _ = self._parse_and_analyze(test_fn)
     fn_node = node.body[0]
-    self.assertScopeIsRmc(
-        anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('b',), (('')),
-        (('a', 'b')))
+    body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
+    self.assertScopeIs(body_scope, ('b',), ())
+    self.assertScopeIs(body_scope.parent, ('b',), ('a', 'b'))
+
+    args_scope = anno.getanno(fn_node.args, anno.Static.SCOPE)
+    self.assertSymbolSetsAre(('a', 'b'), args_scope.params.keys(), 'params')
+
+  def test_lambda_captures_reads(self):
+
+    def test_fn(a, b):
+      return lambda: a + b
+
+    node, _ = self._parse_and_analyze(test_fn)
+    fn_node = node.body[0]
+    body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
+    self.assertScopeIs(body_scope, ('a', 'b'), ())
+    # Nothing local to the lambda is tracked.
+    self.assertSymbolSetsAre((), body_scope.params.keys(), 'params')
+
+  def test_lambda_params_are_isolated(self):
+
+    def test_fn(a, b):  # pylint: disable=unused-argument
+      return lambda a: a + b
+
+    node, _ = self._parse_and_analyze(test_fn)
+    fn_node = node.body[0]
+    body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
+    self.assertScopeIs(body_scope, ('b',), ())
+    self.assertSymbolSetsAre((), body_scope.params.keys(), 'params')
+
+  def test_lambda_complex(self):
+
+    def test_fn(a, b, c, d):  # pylint: disable=unused-argument
+      a = (lambda a, b, c: a + b + c)(d, 1, 2) + b
+
+    node, _ = self._parse_and_analyze(test_fn)
+    fn_node = node.body[0]
+    body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
+    self.assertScopeIs(body_scope, ('b', 'd'), ('a',))
+    self.assertSymbolSetsAre((), body_scope.params.keys(), 'params')
+
+  def test_lambda_nested(self):
+
+    def test_fn(a, b, c, d, e):  # pylint: disable=unused-argument
+      a = lambda a, b: d(lambda b: a + b + c)  # pylint: disable=undefined-variable
+
+    node, _ = self._parse_and_analyze(test_fn)
+    fn_node = node.body[0]
+    body_scope = anno.getanno(fn_node, NodeAnno.BODY_SCOPE)
+    self.assertScopeIs(body_scope, ('c', 'd'), ('a',))
+    self.assertSymbolSetsAre((), body_scope.params.keys(), 'params')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/pyct/static_analysis/live_values.py b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
index 36b9e7074dc2088422823a025368378acc729e18..e8e3d229bea4bb505d58cdae24de87377b1b50e6 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/live_values.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
@@ -26,14 +26,8 @@ from __future__ import print_function
 import gast
 
 from tensorflow.python.autograph.pyct import anno
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import transformer
-from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno
-
-
-# TODO(aqj): Do we need this? Do other builtins fail in similar ways
-# See b/114389775 for a related bug in pyct
-# These symbols are legal in Python, but don't appear in the namespace.
-_SPECIAL_SYMBOLS = {'range': range, 'print': print}
 
 
 class LiveValueResolver(transformer.Base):
@@ -72,10 +66,11 @@ class LiveValueResolver(transformer.Base):
             # If the symbol value is for example a primitive, then it will not
             # have a name.
             pass
-        elif node.id in _SPECIAL_SYMBOLS:
+        elif node.id in inspect_utils.SPECIAL_BUILTINS:
           # Note: if the user redefined any of these symbols, then they would
           # be visible in the namespace and we would never reach this branch.
-          anno.setanno(node, 'live_val', _SPECIAL_SYMBOLS[node.id])
+          anno.setanno(
+              node, 'live_val', inspect_utils.SPECIAL_BUILTINS[node.id])
         else:
           pass
           # TODO(mdan): Should we raise an error here?
@@ -129,11 +124,9 @@ class LiveValueResolver(transformer.Base):
         anno.setanno(node, 'fqn',
                      anno.getanno(node.value, 'type_fqn') + (node.attr,))
     elif isinstance(node.value, gast.Name):
-      stem_name = node.value
-      # All nonlocal symbols should be fully resolved.
-      assert anno.hasanno(stem_name, NodeAnno.IS_LOCAL), stem_name
       # TODO(mdan): Figure out what to do when calling attribute on local object
       # Maybe just leave as-is?
+      pass
     return node
 
 
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness.py b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
index 41c903beb9e9d32ddbf7a7b72da3027797bf2aec..f8b8d7fa77c167e0ebf96dd533e3c42b0c30b8e5 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
@@ -14,8 +14,13 @@
 # ==============================================================================
 """Live variable analysis.
 
-This analysis attaches a set containing the live symbols that are live at the
-exit of control flow statements.
+See https://en.wikipedia.org/wiki/Live_variable_analysis for a definition of
+the following idioms: live variable, live in, live out, which are used
+throughout this file.
+
+This analysis attaches the following:
+ * symbols that are live at the exit of control flow statements
+ * symbols that are live at the entry of control flow statements
 
 Requires activity analysis.
 """
@@ -50,11 +55,11 @@ class Analyzer(cfg.GraphVisitor):
     if anno.hasanno(node.ast_node, anno.Static.SCOPE):
       node_scope = anno.getanno(node.ast_node, anno.Static.SCOPE)
 
-      gen = node_scope.used | self.extra_gen.get(node.ast_node, frozenset())
+      gen = node_scope.read | self.extra_gen.get(node.ast_node, frozenset())
       # TODO(mdan): verify whether composites' parents need to be added.
-      # E.g. if x.y is live whether x needs to be added. Theoretically the
+      # E.g. whether x needs to be added if x.y is live. Theoretically the
       # activity analysis should have both so that wouldn't be needed.
-      kill = node_scope.modified
+      kill = node_scope.modified | node_scope.deleted
 
       live_out = set()
       for n in node.next:
@@ -156,6 +161,16 @@ class Annotator(transformer.Base):
     self.cross_function_analyzer = cross_function_analyzer
     self.current_analyzer = None
 
+  def visit(self, node):
+    node = super(Annotator, self).visit(node)
+    if (self.current_analyzer is not None and
+        isinstance(node, gast.stmt) and
+        node in self.current_analyzer.graph.index):
+      cfg_node = self.current_analyzer.graph.index[node]
+      anno.setanno(node, anno.Static.LIVE_VARS_IN,
+                   frozenset(self.current_analyzer.in_[cfg_node]))
+    return node
+
   def visit_FunctionDef(self, node):
     parent_analyzer = self.current_analyzer
     self.current_analyzer = self.cross_function_analyzer.analyzers[node]
@@ -164,23 +179,45 @@ class Annotator(transformer.Base):
     self.current_analyzer = parent_analyzer
     return node
 
-  def _aggregate_successors_live_in(self, node):
+  def _block_statement_live_out(self, node):
     successors = self.current_analyzer.graph.stmt_next[node]
-    node_live_out = set()
+    stmt_live_out = set()
     for s in successors:
-      node_live_out.update(self.current_analyzer.in_[s])
-    anno.setanno(node, anno.Static.LIVE_VARS_OUT, frozenset(node_live_out))
-    node = self.generic_visit(node)
+      stmt_live_out.update(self.current_analyzer.in_[s])
+    anno.setanno(node, anno.Static.LIVE_VARS_OUT, frozenset(stmt_live_out))
+    return node
+
+  def _block_statement_live_in(self, node, entry_node):
+    cfg_node = self.current_analyzer.graph.index[entry_node]
+    stmt_live_in = frozenset(self.current_analyzer.in_[cfg_node])
+    anno.setanno(node, anno.Static.LIVE_VARS_IN, stmt_live_in)
     return node
 
   def visit_If(self, node):
-    return self._aggregate_successors_live_in(node)
+    node = self.generic_visit(node)
+    node = self._block_statement_live_out(node)
+    return self._block_statement_live_in(node, node.test)
 
   def visit_For(self, node):
-    return self._aggregate_successors_live_in(node)
+    node = self.generic_visit(node)
+    node = self._block_statement_live_out(node)
+    return self._block_statement_live_in(node, node.iter)
 
   def visit_While(self, node):
-    return self._aggregate_successors_live_in(node)
+    node = self.generic_visit(node)
+    node = self._block_statement_live_out(node)
+    return self._block_statement_live_in(node, node.test)
+
+  def visit_With(self, node):
+    node = self.generic_visit(node)
+    return self._block_statement_live_in(node, node.items[0])
+
+  def visit_Expr(self, node):
+    node = self.generic_visit(node)
+    cfg_node = self.current_analyzer.graph.index[node]
+    anno.setanno(node, anno.Static.LIVE_VARS_OUT,
+                 frozenset(self.current_analyzer.out[cfg_node]))
+    return node
 
 
 def resolve(node, source_info, graphs):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
index 0d5f369e92e4312e7ed05b3f815a9935158cd45c..4366808d4962394b98cb3d939abed9666899a6d3 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
@@ -47,14 +47,23 @@ class LivenessTest(test.TestCase):
 
   def assertHasLiveOut(self, node, expected):
     live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
-    live_out_str = set(str(v) for v in live_out)
+    live_out_strs = set(str(v) for v in live_out)
     if not expected:
       expected = ()
     if not isinstance(expected, tuple):
       expected = (expected,)
-    self.assertSetEqual(live_out_str, set(expected))
+    self.assertSetEqual(live_out_strs, set(expected))
 
-  def test_stacked_if(self):
+  def assertHasLiveIn(self, node, expected):
+    live_in = anno.getanno(node, anno.Static.LIVE_VARS_IN)
+    live_in_strs = set(str(v) for v in live_in)
+    if not expected:
+      expected = ()
+    if not isinstance(expected, tuple):
+      expected = (expected,)
+    self.assertSetEqual(live_in_strs, set(expected))
+
+  def test_live_out_stacked_if(self):
 
     def test_fn(x, a):
       if a > 0:
@@ -69,7 +78,7 @@ class LivenessTest(test.TestCase):
     self.assertHasLiveOut(fn_body[0], ('a', 'x'))
     self.assertHasLiveOut(fn_body[1], 'x')
 
-  def test_stacked_if_else(self):
+  def test_live_out_stacked_if_else(self):
 
     def test_fn(x, a):
       if a > 0:
@@ -86,7 +95,7 @@ class LivenessTest(test.TestCase):
     self.assertHasLiveOut(fn_body[0], 'a')
     self.assertHasLiveOut(fn_body[1], 'x')
 
-  def test_for_basic(self):
+  def test_live_out_for_basic(self):
 
     def test_fn(x, a):
       for i in range(a):
@@ -98,7 +107,7 @@ class LivenessTest(test.TestCase):
 
     self.assertHasLiveOut(fn_body[0], 'x')
 
-  def test_attributes(self):
+  def test_live_out_attributes(self):
 
     def test_fn(x, a):
       if a > 0:
@@ -110,7 +119,7 @@ class LivenessTest(test.TestCase):
 
     self.assertHasLiveOut(fn_body[0], ('x.y', 'x'))
 
-  def test_nested_functions(self):
+  def test_live_out_nested_functions(self):
 
     def test_fn(a, b):
       if b:
@@ -126,7 +135,7 @@ class LivenessTest(test.TestCase):
 
     self.assertHasLiveOut(fn_body[0], 'a')
 
-  def test_nested_functions_isolation(self):
+  def test_live_out_nested_functions_isolation(self):
 
     def test_fn(b):
       if b:
@@ -144,6 +153,95 @@ class LivenessTest(test.TestCase):
 
     self.assertHasLiveOut(fn_body[0], 'max')
 
+  def test_live_out_deletion(self):
+
+    def test_fn(x, y, a):
+      for _ in a:
+        if x:
+          del y
+        else:
+          y = 0
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    self.assertHasLiveOut(fn_body[0], ())
+
+  def test_live_in_stacked_if(self):
+
+    def test_fn(x, a, b, c):
+      if a > 0:
+        x = b
+      if c > 1:
+        x = 0
+      return x
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    self.assertHasLiveIn(fn_body[0], ('a', 'b', 'c', 'x'))
+    self.assertHasLiveIn(fn_body[1], ('c', 'x'))
+
+  def test_live_in_stacked_if_else(self):
+
+    def test_fn(x, a, b, c, d):
+      if a > 1:
+        x = b
+      else:
+        x = c
+      if d > 0:
+        x = 0
+      return x
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    self.assertHasLiveIn(fn_body[0], ('a', 'b', 'c', 'd'))
+    self.assertHasLiveIn(fn_body[1], ('d', 'x'))
+
+  def test_live_in_for_basic(self):
+
+    def test_fn(x, y, a):
+      for i in a:
+        x = i
+        y += x
+        z = 0
+      return y, z
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    self.assertHasLiveIn(fn_body[0], ('a', 'y', 'z'))
+
+  def test_live_in_for_nested(self):
+
+    def test_fn(x, y, a):
+      for i in a:
+        for j in i:
+          x = i
+          y += x
+          z = j
+      return y, z
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    self.assertHasLiveIn(fn_body[0], ('a', 'y', 'z'))
+
+  def test_live_in_deletion(self):
+
+    def test_fn(x, y, a):
+      for _ in a:
+        if x:
+          del y
+        else:
+          y = 0
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    self.assertHasLiveIn(fn_body[0], ('a', 'x', 'y'))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
index 9aaf318a9f573c486a7392e459b5750cc7a36100..d1587d81780780f56ab0ec1fb0dbb9942a3d4539 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
@@ -28,6 +28,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import weakref
+
 import gast
 
 from tensorflow.python.autograph.pyct import anno
@@ -137,12 +139,12 @@ class Analyzer(cfg.GraphVisitor):
         for s in node_scope.modified:
           def_ = self._definition_factory()
           if s in node_scope.params:
-            def_.param_of = node_scope.params[s]
+            def_.param_of = weakref.ref(node_scope.params[s])
           node_symbols[s] = def_
         self.gen_map[node] = _NodeState(node_symbols)
 
       gen = self.gen_map[node]
-      kill = node_scope.modified
+      kill = node_scope.modified | node_scope.deleted
       defs_out = gen | (defs_in - kill)
 
     else:
diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
index 373a2cb38f54635fa9adbd58f4878e2c904343b9..8c0d51850770e90c6755951e4ca5b01bb0987c51 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
@@ -238,6 +238,41 @@ class DefinitionInfoTest(test.TestCase):
     self.assertSameDef(creation, mutation)
     self.assertSameDef(creation, use)
 
+  def test_deletion_partial(self):
+
+    def test_fn(a):
+      a = 0
+      if a:
+        del a
+      else:
+        a = 1
+      return a
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    first_def = fn_body[0].targets[0]
+    second_def = fn_body[1].orelse[0].targets[0]
+    use = fn_body[2].value
+    self.assertNotSameDef(use, first_def)
+    self.assertSameDef(use, second_def)
+
+  def test_deletion_total(self):
+
+    def test_fn(a):
+      if a:
+        a = 0
+      else:
+        a = 1
+      del a
+      return a
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    use = fn_body[2].value
+    self.assertHasDefs(use, 0)
+
   def test_replacement(self):
 
     def foo(a):
diff --git a/tensorflow/python/autograph/pyct/templates.py b/tensorflow/python/autograph/pyct/templates.py
index 1af8fca59999232b905fcfaebf2c806403ec5743..2272ea42086ff726eaf02f8fccacc6b661d6207e 100644
--- a/tensorflow/python/autograph/pyct/templates.py
+++ b/tensorflow/python/autograph/pyct/templates.py
@@ -32,6 +32,66 @@ from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import qual_names
 
 
+class ContextAdjuster(gast.NodeTransformer):
+  """Adjusts the ctx field of nodes to ensure consistency.
+
+  This transformer can change the ctx fields of a variable, tuple and other
+  AST elements that allow one, based on whether the element is being read or
+  written.
+  """
+
+  def __init__(self, override_value):
+    self._ctx_override = override_value
+
+  def visit(self, node):
+    original_override = self._ctx_override
+    node = super(ContextAdjuster, self).visit(node)
+    if hasattr(node, 'ctx'):
+      assert node.ctx is not None, 'node {} has ctx unset'.format(node)
+    self._ctx_override = original_override
+    return node
+
+  def _apply_override(self, node):
+    if self._ctx_override is not None:
+      node.ctx = self._ctx_override()
+
+  def visit_Attribute(self, node):
+    self._apply_override(node)
+    self._ctx_override = gast.Load
+    node = self.generic_visit(node)
+    return node
+
+  def visit_Tuple(self, node):
+    self._apply_override(node)
+    return self.generic_visit(node)
+
+  def visit_List(self, node):
+    self._apply_override(node)
+    return self.generic_visit(node)
+
+  def visit_Name(self, node):
+    self._apply_override(node)
+    return self.generic_visit(node)
+
+  def visit_Call(self, node):
+    self._apply_override(node)
+    # We may be able to override these to Load(), but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+  def visit_Dict(self, node):
+    # We may be able to override these to Load(), but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+  def visit_Subscript(self, node):
+    node.value = self.visit(node.value)
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+
 class ReplaceTransformer(gast.NodeTransformer):
   """Replace AST nodes."""
 
@@ -90,7 +150,8 @@ class ReplaceTransformer(gast.NodeTransformer):
     # we could allow changing just node arg, so that we end up with bar=baz.
     raise ValueError(
         'a keyword argument may only be replaced by another keyword or a '
-        'non-empty list of keywords. Found: %s' % repl)
+        'non-empty list of keywords. Found: {} for keyword {}'.format(
+            repl, node.arg))
 
   def visit_FunctionDef(self, node):
     node = self.generic_visit(node)
@@ -105,85 +166,6 @@ class ReplaceTransformer(gast.NodeTransformer):
     node.name = repl.id
     return node
 
-  def _check_has_context(self, node):
-    if not node.ctx:
-      raise ValueError('node %s is missing ctx value' % node)
-
-  # TODO(mdan): Rewrite _check and _set using a separate transformer.
-  def _check_inner_children_have_context(self, node):
-    if isinstance(node, gast.Attribute):
-      self._check_inner_children_have_context(node.value)
-      self._check_has_context(node)
-    elif isinstance(node, (gast.Tuple, gast.List)):
-      for e in node.elts:
-        self._check_inner_children_have_context(e)
-      self._check_has_context(node)
-    elif isinstance(node, gast.Dict):
-      for e in node.keys:
-        self._check_inner_children_have_context(e)
-      for e in node.values:
-        self._check_inner_children_have_context(e)
-    elif isinstance(node, gast.Index):
-      self._check_inner_children_have_context(node.value)
-    elif isinstance(node, gast.Subscript):
-      self._check_inner_children_have_context(node.value)
-      self._check_inner_children_have_context(node.slice)
-    elif isinstance(node, gast.Slice):
-      self._check_inner_children_have_context(node.lower)
-      if node.upper:
-        self._check_inner_children_have_context(node.upper)
-      if node.step:
-        self._check_inner_children_have_context(node.step)
-    elif isinstance(node, gast.BinOp):
-      self._check_inner_children_have_context(node.left)
-      self._check_inner_children_have_context(node.right)
-    elif isinstance(node, gast.UnaryOp):
-      self._check_inner_children_have_context(node.operand)
-    elif isinstance(node, gast.Name):
-      self._check_has_context(node)
-    elif isinstance(node, (gast.Str, gast.Num)):
-      pass
-    else:
-      raise ValueError('unexpected node type "%s"' % node)
-
-  def _set_inner_child_context(self, node, ctx):
-    if isinstance(node, gast.Attribute):
-      self._set_inner_child_context(node.value, gast.Load())
-      node.ctx = ctx
-    elif isinstance(node, (gast.Tuple, gast.List)):
-      for e in node.elts:
-        self._set_inner_child_context(e, ctx)
-      node.ctx = ctx
-    elif isinstance(node, gast.Name):
-      node.ctx = ctx
-    elif isinstance(node, gast.Call):
-      self._set_inner_child_context(node.func, ctx)
-      # We may be able to override these to Load(), but for now it's simpler
-      # to just assert that they're set.
-      for a in node.args:
-        self._check_inner_children_have_context(a)
-      for k in node.keywords:
-        self._check_inner_children_have_context(k.value)
-    elif isinstance(node, gast.Dict):
-      # We may be able to override these to Load(), but for now it's simpler
-      # to just assert that they're set.
-      for e in node.keys:
-        self._check_inner_children_have_context(e)
-      for e in node.values:
-        self._check_inner_children_have_context(e)
-    elif isinstance(node, gast.Subscript):
-      self._set_inner_child_context(node.value, ctx)
-      self._check_inner_children_have_context(node.slice)
-    elif isinstance(node, gast.BinOp):
-      self._check_inner_children_have_context(node.left)
-      self._check_inner_children_have_context(node.right)
-    elif isinstance(node, gast.UnaryOp):
-      self._check_inner_children_have_context(node.operand)
-    elif isinstance(node, (gast.Str, gast.Num)):
-      pass
-    else:
-      raise ValueError('unexpected node type "%s"' % node)
-
   def visit_Attribute(self, node):
     node = self.generic_visit(node)
     if node.attr not in self.replacements:
@@ -203,16 +185,10 @@ class ReplaceTransformer(gast.NodeTransformer):
     new_nodes = self._prepare_replacement(node, node.id)
 
     # Preserve the target context.
+    adjuster = ContextAdjuster(type(node.ctx))
     for n in new_nodes:
-      if isinstance(n, (gast.Tuple, gast.List)):
-        for e in n.elts:
-          self._set_inner_child_context(e, node.ctx)
-      if isinstance(n, gast.Attribute):
-        # For attributes, the inner Name node receives the context, while the
-        # outer ones have it set to Load.
-        self._set_inner_child_context(n, node.ctx)
-      else:
-        n.ctx = node.ctx
+      if hasattr(n, 'ctx'):
+        adjuster.visit(n)
 
     if len(new_nodes) == 1:
       new_nodes, = new_nodes
diff --git a/tensorflow/python/autograph/pyct/templates_test.py b/tensorflow/python/autograph/pyct/templates_test.py
index 30322418469e4f3c7e091cd76a0a1726d999cc5b..cdb44b822e84ad5822c78d50c2f958b1fba9ec18 100644
--- a/tensorflow/python/autograph/pyct/templates_test.py
+++ b/tensorflow/python/autograph/pyct/templates_test.py
@@ -134,19 +134,18 @@ class TemplatesTest(test.TestCase):
 
   def test_replace_expression_context(self):
     template = """
-      def test_fn(foo):
+      def test_fn():
         foo
     """
 
     node = templates.replace(
         template, foo=parser.parse_expression('a + 2 * b / -c'))[0]
-    self.assertIsInstance(node.body[0].ctx, gast.Load)
     self.assertIsInstance(node.body[0].left.ctx, gast.Load)
     self.assertIsInstance(node.body[0].right.left.right.ctx, gast.Load)
 
   def test_replace_complex_context(self):
     template = """
-      def test_fn(foo):
+      def test_fn():
         foo = 0
     """
 
@@ -160,7 +159,7 @@ class TemplatesTest(test.TestCase):
 
   def test_replace_index(self):
     template = """
-      def test_fn(foo):
+      def test_fn():
         foo = 0
     """
 
@@ -214,15 +213,15 @@ class TemplatesTest(test.TestCase):
     result, _ = compiler.ast_to_object(node)
     self.assertEquals(3, result.test_fn())
 
-  def replace_as_expression(self):
+  def test_replace_as_expression(self):
     template = """
       foo(a)
     """
 
-    node = templates.replace(template, foo='bar', a='baz')
-    self.assertTrue(node is gast.Call)
+    node = templates.replace_as_expression(template, foo='bar', a='baz')
+    self.assertIsInstance(node, gast.Call)
     self.assertEqual(node.func.id, 'bar')
-    self.assertEqual(node.func.args[0].id, 'baz')
+    self.assertEqual(node.args[0].id, 'baz')
 
   def test_replace_as_expression_restrictions(self):
     template = """
@@ -232,6 +231,13 @@ class TemplatesTest(test.TestCase):
     with self.assertRaises(ValueError):
       templates.replace_as_expression(template)
 
+  def test_function_call_in_list(self):
+    template = """
+        foo(bar)
+    """
+    source = parser.parse_expression('[a(b(1))]')
+    templates.replace_as_expression(template, bar=source)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py
index 520f5038da2dd842f4da111a848bc913c043e89f..b6830534b3dbf2e2815957b26d715d24dc002da7 100644
--- a/tensorflow/python/autograph/pyct/transformer.py
+++ b/tensorflow/python/autograph/pyct/transformer.py
@@ -26,6 +26,7 @@ import six
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import pretty_printer
+from tensorflow.python.autograph.pyct import templates
 
 
 class AutographParseError(SyntaxError):
@@ -92,7 +93,8 @@ class _StateStack(object):
     # the superclass' setattr.
     object.__setattr__(self, 'type', type_)
     object.__setattr__(self, '_stack', [])
-    self.enter()
+    if not hasattr(type_, 'no_root'):
+      self.enter()
 
   def enter(self):
     self._stack.append(self.type())
@@ -108,6 +110,9 @@ class _StateStack(object):
   def value(self):
     return self._stack[-1]
 
+  def __iter__(self):
+    return iter(self._stack)
+
   def __getattr__(self, key):
     return getattr(self._stack[-1], key)
 
@@ -199,8 +204,10 @@ class Base(gast.NodeTransformer):
     Args:
       entity_info: An EntityInfo object.
     """
+    self._current_origin = None
     self._lineno = 0
     self._col_offset = 0
+    # TODO(znado): remove this from the constructor of all Transformers.
     self.entity_info = entity_info
     self._enclosing_entities = []
 
@@ -274,6 +281,12 @@ class Base(gast.NodeTransformer):
       print(pretty_printer.fmt(node))
     return node
 
+  def create_assignment(self, target, expression):
+    template = """
+      target = expression
+    """
+    return templates.replace(template, target=target, expression=expression)
+
   def visit_block(self, nodes, before_visit=None, after_visit=None):
     """A more powerful version of generic_visit for statement blocks.
 
@@ -308,19 +321,23 @@ class Base(gast.NodeTransformer):
             return node, None
 
     Args:
-      nodes: enumerable of AST node objects
+      nodes: enumerable of AST node objects. If None, the function returns None.
       before_visit: optional callable that is called before visiting each item
-          in nodes
-      after_visit: optional callable that takes in an AST node and
-          returns a tuple (new_node, new_destination). It is called after
-          visiting each item in nodes. Is used in the same was as the
+        in nodes
+      after_visit: optional callable that takes in an AST node and returns a
+        tuple (new_node, new_destination). It is called after visiting each item
+        in nodes. Is used in the same was as the
           visit_* methods: new_node will replace the node; if not None,
-          new_destination must be a list, and subsequent nodes will be placed
-          in this list instead of the list returned by visit_block.
+            new_destination must be a list, and subsequent nodes will be placed
+            in this list instead of the list returned by visit_block.
+
     Returns:
       A list of AST node objects containing the transformed items fron nodes,
       except those nodes that have been relocated using after_visit.
     """
+    if nodes is None:
+      return None
+
     results = []
     node_destination = results
     for node in nodes:
@@ -417,13 +434,12 @@ class Base(gast.NodeTransformer):
           ' visit lists of nodes, use "visit_block" instead').format(type(node))
       raise ValueError(msg)
 
-    source_code = self.entity_info.source_code
-    source_file = self.entity_info.source_file
     did_enter_function = False
     local_scope_size_at_entry = len(self._local_scope_state)
     processing_expr_node = False
 
     try:
+      parent_origin = self._current_origin
       if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
         did_enter_function = True
       elif isinstance(node, gast.Expr):
@@ -432,15 +448,15 @@ class Base(gast.NodeTransformer):
       if did_enter_function:
         self._enclosing_entities.append(node)
 
-      if source_code and hasattr(node, 'lineno'):
-        self._lineno = node.lineno
-        self._col_offset = node.col_offset
+      if anno.hasanno(node, anno.Basic.ORIGIN):
+        self._current_origin = anno.getanno(node, anno.Basic.ORIGIN)
 
       if processing_expr_node:
         entry_expr_value = node.value
 
       if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
         result = super(Base, self).visit(node)
+      self._current_origin = parent_origin
 
       # Adjust for consistency: replacing the value of an Expr with
       # an Assign node removes the need for the Expr node.
@@ -462,26 +478,26 @@ class Base(gast.NodeTransformer):
             'Inconsistent local scope stack. Before entering node %s, the'
             ' stack had length %d, after exit it has length %d. This'
             ' indicates enter_local_scope and exit_local_scope are not'
-            ' well paired.' % (
-                node,
-                local_scope_size_at_entry,
-                len(self._local_scope_state)
-            ))
+            ' well paired.' % (node, local_scope_size_at_entry,
+                               len(self._local_scope_state)))
       return result
 
     except (ValueError, AttributeError, KeyError, NotImplementedError) as e:
-      msg = '%s: %s\nOffending source:\n%s\n\nOccurred at node:\n%s' % (
-          e.__class__.__name__, str(e), self._get_source(node),
-          pretty_printer.fmt(node, color=False))
-      if source_code:
-        line = source_code.splitlines()[self._lineno - 1]
-      else:
-        line = '<no source available>'
+      if not self._current_origin:
+        raise e
+      original_file_path = self._current_origin.loc.filename
+      original_line_number = self._current_origin.loc.lineno
+      original_col_offset = self._current_origin.loc.col_offset
+      original_source_line = self._current_origin.source_code_line
+      msg = '%s: %s.' % (e.__class__.__name__, str(e))
+
       # TODO(mdan): Avoid the printing of the original exception.
       # In other words, we need to find how to suppress the "During handling
       # of the above exception, another exception occurred" message.
-      six.reraise(AutographParseError,
-                  AutographParseError(
-                      msg,
-                      (source_file, self._lineno, self._col_offset + 1, line)),
-                  sys.exc_info()[2])
+      six.reraise(
+          AutographParseError,
+          AutographParseError(msg, (original_file_path, original_line_number,
+                                    original_col_offset, original_source_line)),
+          sys.exc_info()[2])
+    finally:
+      self._current_origin = parent_origin
diff --git a/tensorflow/python/autograph/pyct/transformer_test.py b/tensorflow/python/autograph/pyct/transformer_test.py
index 23bf9a8e16617c23a83eac57918165d91c990798..0c68d2a7648ccd3f44fb53db994bd0bb94a813eb 100644
--- a/tensorflow/python/autograph/pyct/transformer_test.py
+++ b/tensorflow/python/autograph/pyct/transformer_test.py
@@ -304,20 +304,11 @@ class TransformerTest(test.TestCase):
     tr = BrokenTransformer(self._simple_source_info())
 
     node, _ = parser.parse_entity(test_function)
-    with self.assertRaises(transformer.AutographParseError) as cm:
+    with self.assertRaises(ValueError) as cm:
       node = tr.visit(node)
     obtained_message = str(cm.exception)
     expected_message = r'expected "ast.AST", got "\<(type|class) \'list\'\>"'
     self.assertRegexpMatches(obtained_message, expected_message)
-    # The exception should point at the if statement, not any place else.  Could
-    # also check the stack trace.
-    self.assertTrue(
-        'Occurred at node:\nIf' in obtained_message, obtained_message)
-    self.assertTrue(
-        'Occurred at node:\nFunctionDef' not in obtained_message,
-        obtained_message)
-    self.assertTrue(
-        'Occurred at node:\nReturn' not in obtained_message, obtained_message)
 
   def test_robust_error_on_ast_corruption(self):
     # A child class should not be able to be so broken that it causes the error
@@ -344,26 +335,13 @@ class TransformerTest(test.TestCase):
     tr = BrokenTransformer(self._simple_source_info())
 
     node, _ = parser.parse_entity(test_function)
-    with self.assertRaises(transformer.AutographParseError) as cm:
+    with self.assertRaises(ValueError) as cm:
       node = tr.visit(node)
     obtained_message = str(cm.exception)
     # The message should reference the exception actually raised, not anything
     # from the exception handler.
     expected_substring = 'I blew up'
     self.assertTrue(expected_substring in obtained_message, obtained_message)
-    # Expect the exception to have failed to parse the corrupted AST
-    self.assertTrue(
-        '<could not convert AST to source>' in obtained_message,
-        obtained_message)
-    # The exception should point at the if statement, not any place else.  Could
-    # also check the stack trace.
-    self.assertTrue(
-        'Occurred at node:\nIf' in obtained_message, obtained_message)
-    self.assertTrue(
-        'Occurred at node:\nFunctionDef' not in obtained_message,
-        obtained_message)
-    self.assertTrue(
-        'Occurred at node:\nReturn' not in obtained_message, obtained_message)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/utils/BUILD b/tensorflow/python/autograph/utils/BUILD
index 22451d4f3fcf7ea1387f65d5e2640021371917aa..790c661661dabab7c5e1d5dd097a60562c8cc358 100644
--- a/tensorflow/python/autograph/utils/BUILD
+++ b/tensorflow/python/autograph/utils/BUILD
@@ -22,7 +22,6 @@ py_library(
         "__init__.py",
         "context_managers.py",
         "misc.py",
-        "multiple_dispatch.py",
         "py_func.py",
         "tensor_list.py",
         "tensors.py",
@@ -61,16 +60,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "multiple_dispatch_test",
-    srcs = ["multiple_dispatch_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":utils",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
 py_test(
     name = "py_func_test",
     srcs = ["py_func_test.py"],
diff --git a/tensorflow/python/autograph/utils/__init__.py b/tensorflow/python/autograph/utils/__init__.py
index c781958481ff73984103d1e7d0dc546d6fa51736..d9031159b11086a9bf8e47adf52ade4bbdae1188 100644
--- a/tensorflow/python/autograph/utils/__init__.py
+++ b/tensorflow/python/autograph/utils/__init__.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.utils.context_managers import control_dependency_on_returns
 from tensorflow.python.autograph.utils.misc import alias_tensors
-from tensorflow.python.autograph.utils.multiple_dispatch import run_cond
 from tensorflow.python.autograph.utils.py_func import wrap_py_func
 from tensorflow.python.autograph.utils.tensor_list import dynamic_list_append
 from tensorflow.python.autograph.utils.testing import fake_tf
diff --git a/tensorflow/python/autograph/utils/misc_test.py b/tensorflow/python/autograph/utils/misc_test.py
index 8d2b0d6e13802313abf6751b0e62b2807a866c2f..c78df48d6263b121076c86198670222441e7fec7 100644
--- a/tensorflow/python/autograph/utils/misc_test.py
+++ b/tensorflow/python/autograph/utils/misc_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.utils.misc import alias_tensors
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.constant_op import constant
 from tensorflow.python.ops.variables import Variable
 from tensorflow.python.platform import test
@@ -26,14 +27,16 @@ from tensorflow.python.platform import test
 
 class MiscTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_alias_single_tensor(self):
     a = constant(1)
 
     new_a = alias_tensors(a)
     self.assertFalse(new_a is a)
     with self.cached_session() as sess:
-      self.assertEqual(1, sess.run(new_a))
+      self.assertEqual(1, self.evaluate(new_a))
 
+  @test_util.run_deprecated_v1
   def test_alias_tensors(self):
     a = constant(1)
     v = Variable(2)
@@ -47,7 +50,7 @@ class MiscTest(test.TestCase):
     self.assertTrue(new_s is s)
     self.assertTrue(new_l is l)
     with self.cached_session() as sess:
-      self.assertEqual(1, sess.run(new_a))
+      self.assertEqual(1, self.evaluate(new_a))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/utils/multiple_dispatch.py b/tensorflow/python/autograph/utils/multiple_dispatch.py
deleted file mode 100644
index 107c8f7a681009b75db73fd5a42803e498a9876a..0000000000000000000000000000000000000000
--- a/tensorflow/python/autograph/utils/multiple_dispatch.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utilities for type-dependent behavior used in autograph-generated code."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.autograph.utils.type_check import is_tensor
-from tensorflow.python.ops import control_flow_ops
-
-
-def run_cond(condition, true_fn, false_fn):
-  """Type-dependent functional conditional.
-
-  Args:
-    condition: A Tensor or Python bool.
-    true_fn: A Python callable implementing the true branch of the conditional.
-    false_fn: A Python callable implementing the false branch of the
-      conditional.
-
-  Returns:
-    result: The result of calling the appropriate branch. If condition is a
-    Tensor, tf.cond will be used. Otherwise, a standard Python if statement will
-    be ran.
-  """
-  if is_tensor(condition):
-    return control_flow_ops.cond(condition, true_fn, false_fn)
-  else:
-    return py_cond(condition, true_fn, false_fn)
-
-
-def py_cond(condition, true_fn, false_fn):
-  """Functional version of Python's conditional."""
-  if condition:
-    results = true_fn()
-  else:
-    results = false_fn()
-
-  # The contract for the branch functions is to return tuples, but they should
-  # be collapsed to a single element when there is only one output.
-  if len(results) == 1:
-    return results[0]
-  return results
diff --git a/tensorflow/python/autograph/utils/multiple_dispatch_test.py b/tensorflow/python/autograph/utils/multiple_dispatch_test.py
deleted file mode 100644
index 2a77c895ce80b817c2e62605921cdd56c344bd0d..0000000000000000000000000000000000000000
--- a/tensorflow/python/autograph/utils/multiple_dispatch_test.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for multiple_dispatch."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.autograph.utils import multiple_dispatch
-from tensorflow.python.client.session import Session
-from tensorflow.python.framework.constant_op import constant
-from tensorflow.python.platform import test
-
-
-class MultipleDispatchTest(test.TestCase):
-
-  def test_run_cond_python(self):
-    true_fn = lambda: (2,)
-    false_fn = lambda: (3,)
-    self.assertEqual(multiple_dispatch.run_cond(True, true_fn, false_fn), 2)
-    self.assertEqual(multiple_dispatch.run_cond(False, true_fn, false_fn), 3)
-
-  def test_run_cond_tf(self):
-    true_fn = lambda: (constant(2),)
-    false_fn = lambda: (constant(3),)
-    with Session() as sess:
-      out = multiple_dispatch.run_cond(constant(True), true_fn, false_fn)
-      self.assertEqual(sess.run(out), 2)
-      out = multiple_dispatch.run_cond(constant(False), true_fn, false_fn)
-      self.assertEqual(sess.run(out), 3)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/autograph/utils/py_func.py b/tensorflow/python/autograph/utils/py_func.py
index 11ebfb2e49f0e762b56ae2cde2b76d2e24032d72..ee8b46b52061f28eacdf2f980cccb07c889e7274 100644
--- a/tensorflow/python/autograph/utils/py_func.py
+++ b/tensorflow/python/autograph/utils/py_func.py
@@ -127,5 +127,6 @@ def wrap_py_func(f, return_dtypes, args, kwargs=None, use_dummy_return=False):
     retval = f(*f_args, **f_kwargs)
     return 1 if use_dummy_return else retval
 
-  return script_ops.py_func(f_wrapper, tensor_args, dtypes.int64
-                            if use_dummy_return else return_dtypes)
+  if use_dummy_return:
+    return_dtypes = dtypes.int32
+  return script_ops.eager_py_func(f_wrapper, tensor_args, return_dtypes)
diff --git a/tensorflow/python/autograph/utils/py_func_test.py b/tensorflow/python/autograph/utils/py_func_test.py
index 1c220d94922be680021bd96c6b7ddbf2593c6125..d17ede77142483208a0954244579b3249f0ffba5 100644
--- a/tensorflow/python/autograph/utils/py_func_test.py
+++ b/tensorflow/python/autograph/utils/py_func_test.py
@@ -32,15 +32,15 @@ class PyFuncTest(test.TestCase):
       return a + b + c
 
     with self.cached_session() as sess:
-      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+      result = py_func.wrap_py_func(test_fn, dtypes.int32,
                                     (1, constant_op.constant(1), 1))
-      self.assertEqual(3, sess.run(result))
-      result = py_func.wrap_py_func(test_fn, dtypes.int64, (1, 1, 1))
-      self.assertEqual(3, sess.run(result))
+      self.assertEqual(3, self.evaluate(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int32, (1, 1, 1))
+      self.assertEqual(3, self.evaluate(result))
       result = py_func.wrap_py_func(
-          test_fn, dtypes.int64,
+          test_fn, dtypes.int32,
           (constant_op.constant(1), 1, constant_op.constant(1)))
-      self.assertEqual(3, sess.run(result))
+      self.assertEqual(3, self.evaluate(result))
 
   def test_wrap_py_func_complex_args(self):
 
@@ -53,11 +53,11 @@ class PyFuncTest(test.TestCase):
       return a * b.foo
 
     with self.cached_session() as sess:
-      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass()))
-      self.assertEqual(35, sess.run(result))
-      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+      result = py_func.wrap_py_func(test_fn, dtypes.int32, (7, TestClass()))
+      self.assertEqual(35, self.evaluate(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int32,
                                     (constant_op.constant(7), TestClass()))
-      self.assertEqual(35, sess.run(result))
+      self.assertEqual(35, self.evaluate(result))
 
   def test_wrap_py_func_kwargs(self):
 
@@ -70,17 +70,17 @@ class PyFuncTest(test.TestCase):
       return a * b.foo + c * d.foo
 
     with self.cached_session() as sess:
-      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass(5)), {
+      result = py_func.wrap_py_func(test_fn, dtypes.int32, (7, TestClass(5)), {
           'c': 11,
           'd': TestClass(13)
       })
-      self.assertEqual(178, sess.run(result))
-      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+      self.assertEqual(178, self.evaluate(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int32,
                                     (constant_op.constant(7), TestClass(5)), {
                                         'c': constant_op.constant(11),
                                         'd': TestClass(13)
                                     })
-      self.assertEqual(178, sess.run(result))
+      self.assertEqual(178, self.evaluate(result))
 
   def test_wrap_py_func_dummy_return(self):
 
@@ -91,11 +91,11 @@ class PyFuncTest(test.TestCase):
 
     with self.cached_session() as sess:
       result = py_func.wrap_py_func(test_fn, None, (5,), use_dummy_return=True)
-      self.assertEqual(1, sess.run(result))
+      self.assertEqual(1, self.evaluate(result))
       self.assertEqual([1], side_counter)
       result = py_func.wrap_py_func(
           test_fn, None, (constant_op.constant(5),), use_dummy_return=True)
-      self.assertEqual(1, sess.run(result))
+      self.assertEqual(1, self.evaluate(result))
       self.assertEqual([2], side_counter)
 
 
diff --git a/tensorflow/python/autograph/utils/tensor_list_test.py b/tensorflow/python/autograph/utils/tensor_list_test.py
index 697c166eb12c0f3e5b3782259795fcf2e366cb5d..bbbc3bf691818d292d53999c563bcc1112d0703f 100644
--- a/tensorflow/python/autograph/utils/tensor_list_test.py
+++ b/tensorflow/python/autograph/utils/tensor_list_test.py
@@ -19,10 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.utils import tensor_list as tl
-from tensorflow.python.client.session import Session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.constant_op import constant
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import tensor_array_ops
@@ -34,6 +34,7 @@ class TensorListTest(test.TestCase):
   def _shape(self, shape_tuple):
     return constant(shape_tuple, dtypes.int32)
 
+  @test_util.run_v1_only("b/117943489")
   def test_dynamic_list_append(self):
     l = []
     l = tl.dynamic_list_append(l, 1)
@@ -42,19 +43,16 @@ class TensorListTest(test.TestCase):
     l = list_ops.empty_tensor_list(self._shape(()), dtypes.int32)
     l = tl.dynamic_list_append(l, 1)
     s = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(s), [1])
+    self.assertAllEqual(s, [1])
 
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
     l = tl.dynamic_list_append(l, 1)
     s = l.stack()
-    with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(s), [1])
+    self.assertAllEqual(s, [1])
 
     l = tl.TensorList(self._shape(()), dtypes.int32)
     l = tl.dynamic_list_append(l, 1)
-    with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(l[0]), 1)
+    self.assertAllEqual(l[0], 1)
 
   def test_list_append_python(self):
     with context.eager_mode():
@@ -80,6 +78,7 @@ class TensorListTest(test.TestCase):
       l[0] = ops.convert_to_tensor(b)
       self.assertEqual(l[0].numpy(), b.numpy())
 
+  @test_util.run_deprecated_v1
   def test_list_append_tf(self):
     a = constant(3.0)
     l = tl.TensorList(a.shape, a.dtype)
@@ -91,13 +90,12 @@ class TensorListTest(test.TestCase):
     c3 = l.count()
     a2 = l.pop()
     c4 = l.count()
-    with Session() as sess:
-      c1, c2, c3, c4, a, a2 = sess.run([c1, c2, c3, c4, a, a2])
-      self.assertEqual(c1, 1)
-      self.assertEqual(c2, 2)
-      self.assertEqual(c3, 1)
-      self.assertEqual(c4, 0)
-      self.assertEqual(a, a2)
+    c1, c2, c3, c4, a, a2 = self.evaluate([c1, c2, c3, c4, a, a2])
+    self.assertEqual(c1, 1)
+    self.assertEqual(c2, 2)
+    self.assertEqual(c3, 1)
+    self.assertEqual(c4, 0)
+    self.assertEqual(a, a2)
 
   def test_list_index_tf(self):
     a = constant(3.0)
@@ -107,10 +105,9 @@ class TensorListTest(test.TestCase):
     l0 = l[0]
     l[0] = b
     l1 = l[0]
-    with self.cached_session() as sess:
-      l0, l1, a, b = sess.run([l0, l1, a, b])
-      self.assertEqual(l0, a)
-      self.assertEqual(l1, b)
+    l0, l1, a, b = self.evaluate([l0, l1, a, b])
+    self.assertEqual(l0, a)
+    self.assertEqual(l1, b)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/utils/testing.py b/tensorflow/python/autograph/utils/testing.py
index cb4785d0dc0f4674b3560418daeb6733364b21e7..dd6bdc8931e9d41de91c918bbe5ba6455ed6eaf4 100644
--- a/tensorflow/python/autograph/utils/testing.py
+++ b/tensorflow/python/autograph/utils/testing.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import imp
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -28,6 +29,7 @@ def fake_tf():
   """Creates a fake module that looks like TensorFlow, for testing."""
   mod = imp.new_module('tensorflow')
   mod_contents = dict()
+  mod_contents.update(gen_math_ops.__dict__)
   mod_contents.update(math_ops.__dict__)
   mod_contents.update(ops.__dict__)
   mod_contents.update(mod.__dict__)
diff --git a/tensorflow/python/autograph/utils/type_check.py b/tensorflow/python/autograph/utils/type_check.py
index 8748abc47bcfb55b4d0b11178a46816249732da9..ccef7dee03982e46a969ec70d4bbd8f61f8ce6d7 100644
--- a/tensorflow/python/autograph/utils/type_check.py
+++ b/tensorflow/python/autograph/utils/type_check.py
@@ -30,4 +30,4 @@ def is_tensor(*args):
   Returns:
     True if any *args are TensorFlow types, False if none are.
   """
-  return any([tensor_util.is_tensor(a) for a in args])
+  return any(tensor_util.is_tensor(a) for a in args)
diff --git a/tensorflow/python/autograph/utils/type_check_test.py b/tensorflow/python/autograph/utils/type_check_test.py
index b3d1304e16ff1f53e3e1686d5973b76f2de91b1a..2521dc9f925625163ffd0caf63a0c6ac17eca969 100644
--- a/tensorflow/python/autograph/utils/type_check_test.py
+++ b/tensorflow/python/autograph/utils/type_check_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.platform import test
 
 class TypeCheckTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_checks(self):
     self.assertTrue(type_check.is_tensor(constant_op.constant([1, 2, 3])))
     self.assertTrue(
diff --git a/tensorflow/python/client/client_lib.py b/tensorflow/python/client/client_lib.py
index 80a256bf7a87032a40bfb3fa19fb0162c6dd2393..6efddba9792533c010707422341022477678512e 100644
--- a/tensorflow/python/client/client_lib.py
+++ b/tensorflow/python/client/client_lib.py
@@ -15,7 +15,7 @@
 
 """Support for launching graphs and executing operations.
 
-See the [Client](https://tensorflow.org/api_guides/python/client) guide.
+See the [Client](https://www.tensorflow.org/guide/graphs) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/client/device_lib.i b/tensorflow/python/client/device_lib.i
index 944e855cee2ab9da7a4a801d1b993bec4d8ebc55..3e579152d5170d8c773136f09add59aaa5b89d98 100644
--- a/tensorflow/python/client/device_lib.i
+++ b/tensorflow/python/client/device_lib.i
@@ -48,17 +48,14 @@ static std::vector<string> ListDevicesWithSessionConfig(
   std::vector<string> output;
   SessionOptions options;
   options.config = config;
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::AddDevices(
       options, "" /* name_prefix */, &devices);
   if (!status.ok()) {
     Set_TF_Status_from_Status(out_status, status);
   }
 
-  std::vector<std::unique_ptr<Device>> device_holder(devices.begin(),
-                                                     devices.end());
-
-  for (const Device* device : devices) {
+  for (const std::unique_ptr<Device>& device : devices) {
     const DeviceAttributes& attr = device->attributes();
     string attr_serialized;
     if (!attr.SerializeToString(&attr_serialized)) {
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index c963cfd3340593afa8e4e1c57f2c2d966600a5cf..87a200ed336735f4b4abd9b0ac2352e36f7b84e4 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -828,7 +828,7 @@ class BaseSession(SessionInterface):
     nested list, tuple, namedtuple, dict, or OrderedDict containing graph
     elements at its leaves.  A graph element can be one of the following types:
 
-    * An `tf.Operation`.
+    * A `tf.Operation`.
       The corresponding fetched value will be `None`.
     * A `tf.Tensor`.
       The corresponding fetched value will be a numpy ndarray containing the
@@ -1097,7 +1097,7 @@ class BaseSession(SessionInterface):
           if isinstance(subfeed_val, ops.Tensor):
             raise TypeError('The value of a feed cannot be a tf.Tensor object. '
                             'Acceptable feed values include Python scalars, '
-                            'strings, lists, numpy ndarrays, or TensorHandles.'
+                            'strings, lists, numpy ndarrays, or TensorHandles. '
                             'For reference, the tensor object was ' +
                             str(feed_val) + ' which was passed to the '
                             'feed with key ' + str(feed) + '.')
@@ -1283,7 +1283,7 @@ class BaseSession(SessionInterface):
   # Old format: [[Node: <node_name> = ...]]
   # New format: [[{{node <node_name>}} = ...]]
   _NODEDEF_NAME_RE = re.compile(
-      r'\[\[(Node: )?(\{\{node )?([^\} ]*)(\}\})?\s*=')
+      r'\[\[(Node: )?(\{\{node )?([^\} ]*)(\}\})?\s*=*')
 
   def _do_run(self, handle, target_list, fetch_list, feed_dict, options,
               run_metadata):
@@ -1471,7 +1471,7 @@ class BaseSession(SessionInterface):
     return BaseSession._Callable(self, callable_options)
 
 
-@tf_export('Session')
+@tf_export(v1=['Session'])
 class Session(BaseSession):
   """A class for running TensorFlow operations.
 
@@ -1626,7 +1626,7 @@ class Session(BaseSession):
     tf_session.TF_Reset(target, containers, config)
 
 
-@tf_export('InteractiveSession')
+@tf_export(v1=['InteractiveSession'])
 class InteractiveSession(BaseSession):
   """A TensorFlow `Session` for use in interactive contexts, such as a shell.
 
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index df020f88a88687ac9616d40618aebb8f7eef2858..224f880ed15f1796b08d1db3ea52c52302a9b83f 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -62,7 +62,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
 
     const = constant_op.constant(17)
     sess = session.Session(server1.target, config=config)
-    output = sess.run(const)
+    output = self.evaluate(const)
     self.assertEqual(17, output)
 
   def testClusterSpecPropagationWorker2Placement(self):
@@ -106,7 +106,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default() as g, ops.device('/job:worker/task:0'):
       const = constant_op.constant(17)
     sess = session.Session(server1.target, config=config, graph=g)
-    output = sess.run(const)
+    output = self.evaluate(const)
     self.assertEqual(17, output)
 
   def testCanonicalDeviceNames(self):
@@ -208,7 +208,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
       with ops.device('/job:worker/task:0/cpu:0'):
         sum3 = sum1 + sum2
     sess = session.Session(server1.target, config=config, graph=g)
-    output = sess.run(sum3)
+    output = self.evaluate(sum3)
     self.assertEqual(40, output)
 
   def testLegacyDeviceNames(self):
diff --git a/tensorflow/python/client/session_partial_run_test.py b/tensorflow/python/client/session_partial_run_test.py
index 92ca47efa9348f4ac77f2b22e684080eccb38617..a97930635af5cee0cea4bcdf6f04a5894d7d3aed 100644
--- a/tensorflow/python/client/session_partial_run_test.py
+++ b/tensorflow/python/client/session_partial_run_test.py
@@ -117,7 +117,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     a = constant_op.constant(2.0, dtypes.float32)
     b = a * 2
     c = b * 3
-    r1 = sess.run([b, c])
+    r1 = self.evaluate([b, c])
     h = sess.partial_run_setup([b, c], [])
     r2 = sess.partial_run(h, [b, c])
     self.assertEqual(r1, r2)
@@ -188,6 +188,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     r = sess.partial_run(h, [b], {})
     self.assertEqual([6.0], r)
 
+  @test_util.run_deprecated_v1
   def testInvalidPartialRunSetup(self):
     sess = session.Session()
     x = array_ops.placeholder(dtypes.float32, shape=[])
@@ -196,6 +197,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
         'specify at least one target to fetch or execute.'):
       sess.partial_run_setup(fetches=[], feeds=[x])
 
+  @test_util.run_deprecated_v1
   def testPartialRunSetupNoFeedsPassed(self):
     sess = session.Session()
     r1 = constant_op.constant([6.0])
@@ -204,80 +206,102 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     result1 = sess.partial_run(h, r1)
     self.assertEqual([6.0], result1)
 
+  @test_util.run_deprecated_v1
   def testPartialRunDirect(self):
     self.RunTestPartialRun(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunIncompleteDirect(self):
     self.RunTestPartialRunIncomplete(session.Session())
 
+  @test_util.run_deprecated_v1
   def testConcurrentPartialRunDirect(self):
     self.RunTestConcurrentPartialRun(session.Session())
 
+  @test_util.run_deprecated_v1
   def testManyPartialRunDirect(self):
     self.RunTestManyPartialRun(session.Session())
 
+  @test_util.run_deprecated_v1
   def testRunAndPartialRunDirect(self):
     self.RunTestRunAndPartialRun(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunMissingPlaceholderFeedExceptionDirect(self):
     self.RunTestPartialRunMissingPlaceholderFeedException(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunUnspecifiedFeedDirect(self):
     self.RunTestPartialRunUnspecifiedFeed(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunUnspecifiedFetchDirect(self):
     self.RunTestPartialRunUnspecifiedFetch(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunAlreadyFedDirect(self):
     self.RunTestPartialRunAlreadyFed(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunAlreadyFetchedDirect(self):
     self.RunTestPartialRunAlreadyFetched(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunEmptyFetchesDirect(self):
     self.RunTestPartialRunEmptyFetches(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRun(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunIncompleteDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunIncomplete(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testConcurrentPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestConcurrentPartialRun(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testManyPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestManyPartialRun(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testRunAndPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestRunAndPartialRun(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunMissingPlaceholderFeedExceptionDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunMissingPlaceholderFeedException(
         session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunUnspecifiedFeedDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunUnspecifiedFeed(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunUnspecifiedFetchDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunUnspecifiedFetch(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunAlreadyFedDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunAlreadyFed(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunAlreadyFetchedDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunAlreadyFetched(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunEmptyFetchesDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunEmptyFetches(session.Session(server.target))
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 347833ce8fd095eb4acdef4a8a7e09046b554ba3..c4a118a41406afc52586553b1d3f0b446005c46d 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -312,6 +312,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(None, res[2])
       self.assertEqual(44.0, res[1])
 
+  @test_util.run_v1_only('b/120545219')
   def testFetchAttrs(self):
     if attr is None:
       self.skipTest('attr module is unavailable.')
@@ -340,6 +341,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(val3, result.field1)
       self.assertAllEqual(val2, result.field2)
 
+  @test_util.run_v1_only('b/120545219')
   def testFetchNestedAttrs(self):
     if attr is None:
       self.skipTest('attr module is unavailable.')
@@ -1024,6 +1026,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       fed_c_val = c.eval(feed_dict={a.name: [[4.0, 4.0]]})
       self.assertAllEqual([[16.0, 16.0, 16.0]], fed_c_val)
 
+  @test_util.run_v1_only('b/120545219')
   def testOperationRunMethod(self):
     with session.Session():
       a = constant_op.constant(1.0, shape=[1, 2])
@@ -1154,6 +1157,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         else:
           importer.import_graph_def(gdef, name='import')
 
+  @test_util.run_v1_only('b/120545219')
   def testParallelRunAndSingleBuild(self):
     with session.Session() as sess:
       c = constant_op.constant(5.0)
@@ -1174,6 +1178,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       for t in threads:
         t.join()
 
+  @test_util.run_v1_only('b/120545219')
   def testParallelRunAndParallelBuild(self):
     with session.Session() as sess:
       c = constant_op.constant(5.0)
@@ -1274,6 +1279,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(RuntimeError, 'The Session graph is empty.'):
         sess.run({})
 
+  @test_util.run_v1_only('b/120545219')
   def testNotEntered(self):
     # pylint: disable=protected-access
     self.assertEqual(ops._default_session_stack.get_default(), None)
@@ -1289,6 +1295,7 @@ class SessionTest(test_util.TensorFlowTestCase):
           ValueError, lambda e: 'No default session is registered.' in str(e)):
         c_2.eval()
 
+  @test_util.run_v1_only('b/120545219')
   def testInteractive(self):
     with ops.device('/cpu:0'):
       sess = session.InteractiveSession()
@@ -1301,6 +1308,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[24.0]], e.eval())
       sess.close()
 
+  @test_util.run_v1_only('b/120545219')
   def testMultipleInteractiveSessionsWarning(self):
     # Reinitialize the global state to ensure that the expected warnings will
     # be emitted.
@@ -1328,6 +1336,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     sess2.close()
     sess.close()
 
+  @test_util.run_v1_only('b/120545219')
   def testInteractivePlacePrunedGraph(self):
     sess = session.InteractiveSession()
 
@@ -1349,6 +1358,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       a.eval()
     sess.close()
 
+  @test_util.run_v1_only('b/120545219')
   def testDefaultSessionPlacePrunedGraph(self):
     sess = session.Session()
 
@@ -1769,9 +1779,11 @@ class SessionTest(test_util.TensorFlowTestCase):
     sess.run(a, run_metadata=run_metadata)
     self.assertEqual(len(run_metadata.partition_graphs), 0)
 
+  @test_util.run_v1_only('b/120545219')
   def testOutputPartitionGraphsDirect(self):
     self.runTestOutputPartitionGraphs(session.Session())
 
+  @test_util.run_v1_only('b/120545219')
   def testOutputPartitionGraphsDistributed(self):
     server = server_lib.Server.create_local_server()
     self.runTestOutputPartitionGraphs(session.Session(server.target))
@@ -1796,6 +1808,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     del sess1
     del sess2
 
+  @test_util.run_v1_only('b/120545219')
   def testAsDefault(self):
     c = constant_op.constant(37)
     sess = session.Session()
@@ -1821,6 +1834,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(TypeError, 'graph must be a tf.Graph'):
       session.Session(graph=37)
 
+  @test_util.run_v1_only('b/120545219')
   def testTimeoutWithShortOperations(self):
     num_epochs = 5
     q = data_flow_ops.FIFOQueue(capacity=50, dtypes=[dtypes.int32], shapes=[()])
@@ -1834,6 +1848,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(enqueue_op)
       self.assertEqual(sess.run(q.size()), num_epochs * 2)
 
+  @test_util.run_v1_only('b/120545219')
   def testRegisterFetchAndFeedConversionFunctions(self):
 
     class SquaredTensor(object):
@@ -1865,6 +1880,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       squared_eval = sess.partial_run(partial_run, squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
 
+  @test_util.run_v1_only('b/120545219')
   def testDefaultLogDevicePlacement(self):
 
     class CaptureStderr(str):
@@ -1914,6 +1930,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertTrue('/job:local/replica:0/task:0/device:CPU:0' in str(log),
                       str(log))
 
+  @test_util.run_v1_only('b/120545219')
   def testLocalMasterSessionTimeout(self):
     # Test that the timeout passed in a config to the session works correctly.
     config = config_pb2.ConfigProto(operation_timeout_in_ms=1000)
@@ -1927,6 +1944,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaises(errors.DeadlineExceededError):
         sess.run(dequeued_t)
 
+  @test_util.run_v1_only('b/120545219')
   def testDefaultServerTimeout(self):
     # Test that the default server config timeout gets used when no Session
     # config is provided.
@@ -1952,9 +1970,11 @@ class SessionTest(test_util.TensorFlowTestCase):
     with self.assertRaisesOpError('has inputs from different frames'):
       sess.run(res, feed_dict={data: 1.0})
 
+  @test_util.run_v1_only('b/120545219')
   def testBuildGraphErrorDirect(self):
     self.runTestBuildGraphError(session.Session())
 
+  @test_util.run_v1_only('b/120545219')
   def testBuildGraphErrorDist(self):
     server = server_lib.Server.create_local_server()
     self.runTestBuildGraphError(session.Session(server.target))
@@ -1993,9 +2013,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       result = sess.run(f)
       self.assertEqual(result, 2.0)
 
+  @test_util.run_v1_only('b/120545219')
   def testAddFunctionToSession(self):
     self.runTestAddFunctionToSession()
 
+  @test_util.run_v1_only('b/120545219')
   def testAddFunctionToGrpcSession(self):
     server = server_lib.Server.create_local_server()
     self.runTestAddFunctionToSession(server.target)
@@ -2009,6 +2031,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with session.Session():
       pass
 
+  @test_util.run_v1_only('b/120545219')
   def testAutoConvertAndCheckData(self):
     with self.cached_session() as sess:
       a = array_ops.placeholder(dtype=dtypes.string)
diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py
index 032bbf7c4eddb373abbdbf54ac79f79d9cbc5a55..61c0da01b836843a756c90fee20fbcb0ee94f59c 100644
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@@ -57,12 +57,13 @@ class TimelineTest(test.TestCase):
     ctf = tl.generate_chrome_trace_format()
     self._validateTrace(ctf)
 
+  @test_util.run_deprecated_v1
   def testTimelineCpu(self):
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
     run_metadata = config_pb2.RunMetadata()
 
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       const1 = constant_op.constant(1.0, name='const1')
       const2 = constant_op.constant(2.0, name='const2')
       result = math_ops.add(const1, const2) + const1 * const2
@@ -93,7 +94,7 @@ class TimelineTest(test.TestCase):
         trace_level=config_pb2.RunOptions.FULL_TRACE)
     run_metadata = config_pb2.RunMetadata()
 
-    with self.test_session(force_gpu=True) as sess:
+    with self.session(force_gpu=True) as sess:
       const1 = constant_op.constant(1.0, name='const1')
       const2 = constant_op.constant(2.0, name='const2')
       result = math_ops.add(const1, const2) + const1 * const2
@@ -147,7 +148,7 @@ class TimelineTest(test.TestCase):
         num2 = variables.Variable(2.0, name='num2')
       with ops.device('/cpu:2'):
         result = num1 + num2 + num1 * num2
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(result, options=run_options, run_metadata=run_metadata)
 
     self.assertTrue(run_metadata.HasField('step_stats'))
@@ -176,7 +177,7 @@ class TimelineTest(test.TestCase):
         num2 = variables.Variable(2.0, name='num2')
       with ops.device('/cpu:2'):
         result = num1 + num2 + num1 * num2
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(result, options=run_options, run_metadata=run_metadata)
     self.assertTrue(run_metadata.HasField('step_stats'))
     step_stats = run_metadata.step_stats
diff --git a/tensorflow/python/client/virtual_gpu_test.py b/tensorflow/python/client/virtual_gpu_test.py
index 52e1b56886f83c3e15bcf918fbc5e1f88ba26ed6..e82ee0666c30f8dcf71d3e6609fc7d7a8ec7eeed 100644
--- a/tensorflow/python/client/virtual_gpu_test.py
+++ b/tensorflow/python/client/virtual_gpu_test.py
@@ -199,7 +199,7 @@ class VirtualGpuTest(test_util.TensorFlowTestCase):
     self._util = VirtualGpuTestUtil()
 
   def testStatsContainAllDeviceNames(self):
-    with self.test_session(config=self._util.config) as sess:
+    with self.session(config=self._util.config) as sess:
       # TODO(laigd): b/70811538. The is_gpu_available() call will invoke
       # DeviceFactory::AddDevices() with a default SessionOption, which prevents
       # adding virtual devices in the future, thus must be called within a
@@ -216,7 +216,7 @@ class VirtualGpuTest(test_util.TensorFlowTestCase):
       for d in self._util.devices:
         with ops.device(d):
           var = variables.Variable(random_ops.random_uniform(mat_shape))
-          sess.run(var.initializer)
+          self.evaluate(var.initializer)
           data.append(var)
       s = data[0]
       for i in range(1, len(data)):
@@ -232,7 +232,7 @@ class VirtualGpuTest(test_util.TensorFlowTestCase):
     self.assertTrue('/job:localhost/replica:0/task:0/device:GPU:2' in devices)
 
   def testLargeRandomGraph(self):
-    with self.test_session(config=self._util.config) as sess:
+    with self.session(config=self._util.config) as sess:
       if not test.is_gpu_available(cuda_only=True):
         self.skipTest('No GPU available')
       for _ in range(5):
diff --git a/tensorflow/python/compat/BUILD b/tensorflow/python/compat/BUILD
index e0a1c8e0571879e9661cdb0714cc6a794b7ea455..9f2ce8c676e77480106c525bdc9c6440c599acec 100644
--- a/tensorflow/python/compat/BUILD
+++ b/tensorflow/python/compat/BUILD
@@ -9,7 +9,10 @@ py_library(
     srcs = ["compat.py"],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
-    deps = ["//tensorflow/python:util"],
+    deps = [
+        "//tensorflow/python:tf2",
+        "//tensorflow/python:util",
+    ],
 )
 
 tf_py_test(
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 76e08610bacbaf500b95527954eca4d4b32f7d0e..f11e97b2112a0e1a70f409d83eb5b5f0c2596de6 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -23,10 +23,16 @@ from __future__ import division
 from __future__ import print_function
 
 import datetime
+
+from tensorflow.python import tf2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import variable_scope
+
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 10, 4)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 12, 10)
 
 
 @tf_export("compat.forward_compatible")
@@ -132,3 +138,40 @@ def forward_compatibility_horizon(year, month, day):
     yield
   finally:
     _FORWARD_COMPATIBILITY_HORIZON = old_compat_date
+
+
+@tf_export(v1=["enable_v2_behavior"])
+def enable_v2_behavior():
+  """Enables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 2.x.
+
+  This function is called in the main TensorFlow `__init__.py` file, user should
+  not need to call it, except during complex migrations.
+  """
+  tf2.enable()  # Switches TensorArrayV2 and control flow V2
+  ops.enable_eager_execution()
+  tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.enable_resource_variables()
+
+
+@tf_export(v1=["disable_v2_behavior"])
+def disable_v2_behavior():
+  """Enables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 1.x.
+
+  User can call this function to disable 2.x behavior during complex migrations.
+  """
+  tf2.disable()  # Switches TensorArrayV2 and control flow V2
+  ops.disable_eager_execution()
+  tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.disable_resource_variables()
+
+
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index 7536ba668abf5f3aa62fb73921d14e7ffe5b8c19..75ba88f3034632bd925c7736fe7af42cd3aa274f 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -24,6 +24,8 @@ from __future__ import print_function
 # pylint: disable=unused-import
 from tensorflow.python.data import experimental
 from tensorflow.python.data.ops.dataset_ops import Dataset
+from tensorflow.python.data.ops.dataset_ops import make_initializable_iterator
+from tensorflow.python.data.ops.dataset_ops import make_one_shot_iterator
 from tensorflow.python.data.ops.iterator_ops import Iterator
 from tensorflow.python.data.ops.readers import FixedLengthRecordDataset
 from tensorflow.python.data.ops.readers import TextLineDataset
diff --git a/tensorflow/python/data/benchmarks/BUILD b/tensorflow/python/data/benchmarks/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5b0500eae1970b4f183737d4fc0cd4171dd1ea15
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/BUILD
@@ -0,0 +1,73 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_test(
+    name = "batch_benchmark",
+    srcs = ["batch_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "filter_benchmark",
+    srcs = ["filter_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "from_tensor_slices_benchmark",
+    srcs = ["from_tensor_slices_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "map_benchmark",
+    srcs = ["map_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "range_benchmark",
+    srcs = ["range_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
diff --git a/tensorflow/python/data/benchmarks/batch_benchmark.py b/tensorflow/python/data/benchmarks/batch_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e063849f70381b8244a8a916353a3cc3be15c230
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/batch_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.Dataset.batch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class BatchBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.Dataset.batch()`."""
+
+  def benchmarkBatchSparse(self):
+    non_zeros_per_row_values = [0, 1, 5, 10, 100]
+    batch_size_values = [1, 32, 64, 128, 1024]
+
+    sparse_placeholder = array_ops.sparse_placeholder(dtype=dtypes.int64)
+    batch_size_placeholder = array_ops.placeholder(dtype=dtypes.int64, shape=[])
+
+    dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat(
+        ).batch(batch_size_placeholder)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    for non_zeros_per_row in non_zeros_per_row_values:
+
+      sparse_value = sparse_tensor.SparseTensorValue(
+          indices=np.arange(non_zeros_per_row, dtype=np.int64)[:, np.newaxis],
+          values=np.arange(non_zeros_per_row, dtype=np.int64),
+          dense_shape=[1000])
+
+      for batch_size in batch_size_values:
+
+        with session.Session() as sess:
+          sess.run(iterator.initializer, feed_dict={
+              sparse_placeholder: sparse_value,
+              batch_size_placeholder: batch_size})
+          # Run five steps to warm up the session caches before taking the
+          # first measurement.
+          for _ in range(5):
+            sess.run(next_element.indices.op)
+          deltas = []
+          for _ in range(100):
+            start = time.time()
+            for _ in range(100):
+              sess.run(next_element.indices.op)
+            end = time.time()
+            deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100.0
+
+        print("Batch sparse dataset non-zeros per row: %d batch_size: %d "
+              "wall time: %f"
+              % (non_zeros_per_row, batch_size, median_wall_time))
+        self.report_benchmark(
+            iters=10000, wall_time=median_wall_time,
+            name="batch_sparse_dataset_nnz_%d_batch_size_%d" % (
+                non_zeros_per_row, batch_size))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/filter_benchmark.py b/tensorflow/python/data/benchmarks/filter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6d86fe2218aec835e4f09f0c8c708596cf511f8
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/filter_benchmark.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.Dataset.filter()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class FilterBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.Dataset.filter()`."""
+
+  def _benchmark(self, predicate, name):
+    with ops.Graph().as_default():
+      dataset = (
+          dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(5):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        print("Filter dataset using %s. Median wall time: %f" %
+              (name, median_wall_time))
+        self.report_benchmark(
+            iters=100,
+            wall_time=median_wall_time,
+            name=name)
+
+  def benchmarkSimpleFunction(self):
+    self._benchmark(array_ops.identity, "simple_function")
+
+  def benchmarkReturnComponentOptimization(self):
+    self._benchmark(lambda x: x, "return_component")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7f1a4e7af5b00569e71900df8f2a7486d7c813b
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
@@ -0,0 +1,188 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.Dataset.from_tensor_slices()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class FromTensorSlicesBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.Dataset.from_tensor_slices()`."""
+
+  def benchmarkSliceRepeatBatch(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data)
+        .repeat(num_epochs + 1).batch(batch_size))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        sess.run(next_element)
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          sess.run(next_element)
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print("Slice/repeat/batch with sess.run() input size: %d batch size: %d "
+          "Median wall time per element: %f" % (input_size, batch_size,
+                                                median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="slice_repeat_batch_input_%d_batch_%d" % (input_size, batch_size))
+
+  def benchmarkSliceRepeatBatchCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data)
+        .repeat(num_epochs + 1).batch(batch_size))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      get_next_element = sess.make_callable(next_element)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        get_next_element()
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          get_next_element()
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print(
+        "Slice/repeat/batch with callable input size: %d batch size: %d Median"
+        " wall time per element: %f" % (input_size, batch_size,
+                                        median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="slice_repeat_batch_callable_input_%d_batch_%d" %
+        (input_size, batch_size))
+
+  def benchmarkReshapeSliceRepeatCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
+        .repeat(num_epochs + 1))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      get_next_element = sess.make_callable(next_element)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        get_next_element()
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          get_next_element()
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print("Reshape/slice/repeat with callable input size: %d batch size: %d "
+          "Median wall time per element: %f" % (input_size, batch_size,
+                                                median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="reshape_slice_repeat_callable_input_%d_batch_%d" %
+        (input_size, batch_size))
+
+  def benchmarkSliceBatchCacheRepeatCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
+        .cache().repeat(num_epochs + 1))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      get_next_element = sess.make_callable(next_element)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        get_next_element()
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          get_next_element()
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print(
+        "Slice/batch/cache/repeat with callable input size: %d batch size: %d "
+        "Median wall time per element: %f"
+        % (input_size, batch_size, median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="slice_batch_cache_repeat_callable_input_%d_batch_%d" %
+        (input_size, batch_size))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/map_benchmark.py b/tensorflow/python/data/benchmarks/map_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..65d945cdae87aedad55351cfb63ad06e3521d570
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/map_benchmark.py
@@ -0,0 +1,135 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bechmarks for `tf.data.Dataset.map()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class MapBenchmark(test.Benchmark):
+  """Bechmarks for `tf.data.Dataset.map()`."""
+
+  def benchmarkChainOfMaps(self):
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      for mode in ["general", "single-threaded", "short-circuit"]:
+        if mode == "general":
+          map_fn = lambda x: x + 1
+          use_inter_op_parallelism = True
+          print_label = ""
+          benchmark_label = ""
+        if mode == "single-threaded":
+          map_fn = lambda x: x + 1
+          use_inter_op_parallelism = False
+          print_label = " (single threaded mode)"
+          benchmark_label = "_single_threaded"
+        if mode == "short-circuit":
+          map_fn = lambda x: x
+          use_inter_op_parallelism = True  # should not have any significance
+          print_label = " (short circuit mode)"
+          benchmark_label = "_short_circuit"
+
+        with ops.Graph().as_default():
+          dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+          for _ in range(chain_length):
+            dataset = dataset_ops.MapDataset(
+                dataset,
+                map_fn,
+                use_inter_op_parallelism=use_inter_op_parallelism)
+          iterator = dataset_ops.make_one_shot_iterator(dataset)
+          next_element = iterator.get_next()
+
+          with session.Session() as sess:
+            for _ in range(5):
+              sess.run(next_element.op)
+            deltas = []
+            for _ in range(100):
+              start = time.time()
+              for _ in range(100):
+                sess.run(next_element.op)
+              end = time.time()
+              deltas.append(end - start)
+
+            median_wall_time = np.median(deltas) / 100
+            print("Map dataset chain length%s: %d Median wall time: %f" %
+                  (print_label, chain_length, median_wall_time))
+            self.report_benchmark(
+                iters=1000,
+                wall_time=median_wall_time,
+                name="map_dataset_chain_length_%d%s" % (chain_length,
+                                                        benchmark_label))
+
+  def benchmarkMapFanOut(self):
+    fan_outs = [1, 2, 5, 10, 20, 50, 100]
+    for fan_out in fan_outs:
+      for mode in ["general", "single-threaded", "short-circuit"]:
+        if mode == "general":
+          map_fn = lambda *xs: [x + 1 for x in xs]
+          use_inter_op_parallelism = True
+          print_label = ""
+          benchmark_label = ""
+        if mode == "single-threaded":
+          map_fn = lambda *xs: [x + 1 for x in xs]
+          use_inter_op_parallelism = False
+          print_label = " (single threaded mode)"
+          benchmark_label = "_single_threaded"
+        if mode == "short-circuit":
+          map_fn = lambda *xs: xs
+          use_inter_op_parallelism = True  # should not have any significance
+          print_label = " (short circuit mode)"
+          benchmark_label = "_short_circuit"
+
+        with ops.Graph().as_default():
+          dataset = dataset_ops.Dataset.from_tensors(
+              tuple(0 for _ in range(fan_out))).repeat(None)
+          dataset = dataset_ops.MapDataset(
+              dataset,
+              map_fn,
+              use_inter_op_parallelism=use_inter_op_parallelism)
+          iterator = dataset_ops.make_one_shot_iterator(dataset)
+          next_element = iterator.get_next()
+
+          with session.Session() as sess:
+            for _ in range(5):
+              sess.run(next_element[0].op)
+            deltas = []
+            for _ in range(100):
+              start = time.time()
+              for _ in range(100):
+                sess.run(next_element[0].op)
+              end = time.time()
+              deltas.append(end - start)
+
+            median_wall_time = np.median(deltas) / 100
+            print("Map dataset fan out%s: %d Median wall time: %f" %
+                  (print_label, fan_out, median_wall_time))
+            self.report_benchmark(
+                iters=1000,
+                wall_time=median_wall_time,
+                name="map_dataset_fan_out_%d%s" % (fan_out, benchmark_label))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/range_benchmark.py b/tensorflow/python/data/benchmarks/range_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5020e2873063ea8b01801c0889a23cb60601ec3
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/range_benchmark.py
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.Dataset.range()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+_NUMPY_RANDOM_SEED = 42
+
+
+class RangeBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.Dataset.range()`."""
+
+  def _benchmarkRangeHelper(self, modeling_enabled):
+    num_elements = 10000000 if modeling_enabled else 50000000
+    options = dataset_ops.Options()
+    options.experimental_autotune = modeling_enabled
+
+    # Use `Dataset.skip()` and `Dataset.take()` to perform the iteration in
+    # C++, and focus on the minimal overheads (excluding Python invocation
+    # costs).
+    dataset = dataset_ops.Dataset.range(num_elements).skip(
+        num_elements - 1).take(1).with_options(options)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      # Run once to warm up the session caches.
+      sess.run(iterator.initializer)
+      sess.run(next_element)
+
+      # Run once for timing.
+      sess.run(iterator.initializer)
+      start = time.time()
+      sess.run(next_element)
+      end = time.time()
+
+      time_per_element = (end - start) / num_elements
+      print("Average time per element (%s modeling): %f nanoseconds" % (
+          "with" if modeling_enabled else "without", time_per_element * 1e9))
+      self.report_benchmark(iters=num_elements, wall_time=time_per_element,
+                            name="benchmark_tf_data_dataset_range%s"
+                            % ("_with_modeling" if modeling_enabled else ""))
+
+  def benchmarkRange(self):
+    for modeling_enabled in [False, True]:
+      self._benchmarkRangeHelper(modeling_enabled)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index 2ac159d38aafdc07f2c8725780448c85c67e3ec4..ffc2e5ef5fa239beada67687ec700437b2fc44ba 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -25,17 +25,29 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@Counter
 @@CheckpointInputPipelineHook
 @@CsvDataset
+@@DatasetStructure
+@@NestedStructure
+@@OptimizationOptions
 @@Optional
+@@OptionalStructure
 @@RandomDataset
 @@Reducer
+@@SparseTensorStructure
 @@SqlDataset
+@@StatsAggregator
+@@StatsOptions
+@@Structure
 @@TFRecordWriter
+@@TensorStructure
+@@ThreadingOptions
 
 @@bucket_by_sequence_length
+@@cardinality
 @@choose_from_datasets
 @@copy_to_device
 @@dense_to_sparse_batch
 @@enumerate_dataset
+@@filter_for_shard
 @@get_next_as_optional
 @@get_single_element
 @@group_by_reducer
@@ -52,11 +64,13 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@rejection_resample
 @@sample_from_datasets
 @@scan
-@@set_stats_aggregator
 @@shuffle_and_repeat
-@@StatsAggregator
 @@unbatch
 @@unique
+
+@@AUTOTUNE
+@@INFINITE_CARDINALITY
+@@UNKNOWN_CARDINALITY
 """
 
 from __future__ import absolute_import
@@ -68,9 +82,13 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.ops.batching import dense_to_sparse_batch
 from tensorflow.python.data.experimental.ops.batching import map_and_batch
 from tensorflow.python.data.experimental.ops.batching import unbatch
+from tensorflow.python.data.experimental.ops.cardinality import cardinality
+from tensorflow.python.data.experimental.ops.cardinality import INFINITE as INFINITE_CARDINALITY
+from tensorflow.python.data.experimental.ops.cardinality import UNKNOWN as UNKNOWN_CARDINALITY
 from tensorflow.python.data.experimental.ops.counter import Counter
 from tensorflow.python.data.experimental.ops.enumerate_ops import enumerate_dataset
 from tensorflow.python.data.experimental.ops.error_ops import ignore_errors
+from tensorflow.python.data.experimental.ops.filter_for_shard_ops import filter_for_shard
 from tensorflow.python.data.experimental.ops.get_single_element import get_single_element
 from tensorflow.python.data.experimental.ops.grouping import bucket_by_sequence_length
 from tensorflow.python.data.experimental.ops.grouping import group_by_reducer
@@ -81,10 +99,8 @@ from tensorflow.python.data.experimental.ops.interleave_ops import parallel_inte
 from tensorflow.python.data.experimental.ops.interleave_ops import sample_from_datasets
 from tensorflow.python.data.experimental.ops.iterator_ops import CheckpointInputPipelineHook
 from tensorflow.python.data.experimental.ops.iterator_ops import make_saveable_from_iterator
-
-# Optimization constant that can be used to enable auto-tuning.
 from tensorflow.python.data.experimental.ops.optimization import AUTOTUNE
-
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.experimental.ops.parsing_ops import parse_example_dataset
 from tensorflow.python.data.experimental.ops.prefetching_ops import copy_to_device
 from tensorflow.python.data.experimental.ops.prefetching_ops import prefetch_to_device
@@ -96,13 +112,20 @@ from tensorflow.python.data.experimental.ops.readers import SqlDataset
 from tensorflow.python.data.experimental.ops.resampling import rejection_resample
 from tensorflow.python.data.experimental.ops.scan_ops import scan
 from tensorflow.python.data.experimental.ops.shuffle_ops import shuffle_and_repeat
+from tensorflow.python.data.experimental.ops.stats_aggregator import StatsAggregator
 from tensorflow.python.data.experimental.ops.stats_ops import latency_stats
-from tensorflow.python.data.experimental.ops.stats_ops import set_stats_aggregator
-from tensorflow.python.data.experimental.ops.stats_ops import StatsAggregator
+from tensorflow.python.data.experimental.ops.stats_options import StatsOptions
+from tensorflow.python.data.experimental.ops.threading_options import ThreadingOptions
 from tensorflow.python.data.experimental.ops.unique import unique
 from tensorflow.python.data.experimental.ops.writers import TFRecordWriter
+from tensorflow.python.data.ops.dataset_ops import DatasetStructure
 from tensorflow.python.data.ops.iterator_ops import get_next_as_optional
 from tensorflow.python.data.ops.optional_ops import Optional
+from tensorflow.python.data.ops.optional_ops import OptionalStructure
+from tensorflow.python.data.util.structure import NestedStructure
+from tensorflow.python.data.util.structure import SparseTensorStructure
+from tensorflow.python.data.util.structure import Structure
+from tensorflow.python.data.util.structure import TensorStructure
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD
index b9398aebe7008bb010ca8ab0eb27d56ea27fdc51..8175116c6eddf4a754202a2fbb22499c79a3f5b8 100644
--- a/tensorflow/python/data/experimental/benchmarks/BUILD
+++ b/tensorflow/python/data/experimental/benchmarks/BUILD
@@ -8,13 +8,11 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_test(
-    name = "map_benchmark",
-    size = "medium",
-    srcs = ["map_benchmark.py"],
+    name = "autotune_benchmark",
+    srcs = ["autotune_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
         "//tensorflow/python/data/experimental/ops:batching",
@@ -23,3 +21,105 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
+
+py_test(
+    name = "csv_dataset_benchmark",
+    srcs = ["csv_dataset_benchmark.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/ops:readers",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "map_and_batch_benchmark",
+    srcs = ["map_and_batch_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "map_vectorization_benchmark",
+    srcs = ["map_vectorization_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "matching_files_benchmark",
+    size = "small",
+    srcs = ["matching_files_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:matching_files",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "optimize_benchmark",
+    srcs = ["optimize_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "unbatch_benchmark",
+    srcs = ["unbatch_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e713494b526320f2c18774c7198406521c373033
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
@@ -0,0 +1,187 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for autotuning performance knobs."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class AutotuneBenchmark(test.Benchmark):
+  """Benchmarks for autotuning performance knobs."""
+
+  def benchmarkMap(self):
+    k = 1024 * 1024
+    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
+                                                np.random.rand(4 * k,
+                                                               1))).repeat()
+    dataset = dataset.map(
+        math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(get_next.op)
+      for _ in range(1000):
+        start = time.time()
+        sess.run(get_next.op)
+        end = time.time()
+        deltas.append(end - start)
+
+    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
+          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
+           np.max(deltas)))
+    self.report_benchmark(
+        iters=1000, wall_time=np.median(deltas), name="map_autotune")
+
+  def benchmarkMapAndBatch(self):
+    self._benchmarkMapAndBatch(numa_aware=False)
+    self._benchmarkMapAndBatch(numa_aware=True)
+
+  def _benchmarkMapAndBatch(self, numa_aware):
+    batch_size = 16
+    k = 1024 * 1024
+    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
+                                                np.random.rand(4 * k,
+                                                               1))).repeat()
+    dataset = dataset.apply(
+        batching.map_and_batch(
+            math_ops.matmul,
+            num_parallel_calls=optimization.AUTOTUNE,
+            batch_size=batch_size))
+    options = dataset_ops.Options()
+    options.experimental_numa_aware = numa_aware
+    dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(get_next.op)
+      for _ in range(100):
+        start = time.time()
+        sess.run(get_next.op)
+        end = time.time()
+        deltas.append(end - start)
+
+    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
+          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
+           np.max(deltas)))
+
+    self.report_benchmark(
+        iters=100,
+        wall_time=np.median(deltas),
+        name=("numa_" if numa_aware else "") + "map_and_batch_autotune")
+
+  def benchmarkInterleave(self):
+    k = 1024 * 1024
+    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
+                                                np.random.rand(4 * k,
+                                                               1))).repeat()
+    dataset = dataset.map(math_ops.matmul)
+    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
+        lambda _: dataset,
+        cycle_length=10,
+        num_parallel_calls=optimization.AUTOTUNE)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(get_next.op)
+      for _ in range(1000):
+        start = time.time()
+        sess.run(get_next.op)
+        end = time.time()
+        deltas.append(end - start)
+
+    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
+          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
+           np.max(deltas)))
+    self.report_benchmark(
+        iters=1000,
+        wall_time=np.median(deltas),
+        name="interleave_autotune")
+
+  def benchmarkMapAndInterleave(self):
+    k = 1024 * 1024
+    a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1))
+    b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))
+    c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1))
+    dataset_a = dataset_ops.Dataset.from_tensors(a).repeat()
+    dataset_b = dataset_ops.Dataset.from_tensors(b).repeat()
+    dataset_c = dataset_ops.Dataset.from_tensors(c).repeat()
+
+    def f1(x, y):
+      return math_ops.matmul(x, y)
+
+    def f2(a, b):
+      x, y = b
+      return a, math_ops.matmul(x, y)
+
+    dataset = dataset_a
+    dataset = dataset.map(f1, num_parallel_calls=optimization.AUTOTUNE)
+    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
+        lambda _: dataset,
+        num_parallel_calls=optimization.AUTOTUNE,
+        cycle_length=2)
+
+    dataset = dataset_ops.Dataset.zip((dataset, dataset_b))
+    dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
+    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
+        lambda _: dataset,
+        num_parallel_calls=optimization.AUTOTUNE,
+        cycle_length=2)
+
+    dataset = dataset_ops.Dataset.zip((dataset, dataset_c))
+    dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(get_next)
+      for _ in range(100):
+        start = time.time()
+        sess.run(get_next)
+        end = time.time()
+        deltas.append(end - start)
+
+    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
+          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
+           np.max(deltas)))
+    self.report_benchmark(
+        iters=100,
+        wall_time=np.median(deltas),
+        name="map_and_interleave_autotune")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..03345ce4e6648fecf47348806c55adba10aeed5a
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
@@ -0,0 +1,130 @@
+#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.experimental.CsvDataset`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import string
+import tempfile
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers as core_readers
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+
+
+class CsvDatasetBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.experimental.CsvDataset`."""
+
+  FLOAT_VAL = '1.23456E12'
+  STR_VAL = string.ascii_letters * 10
+
+  def _setUp(self, str_val):
+    # Since this isn't test.TestCase, have to manually create a test dir
+    gfile.MakeDirs(googletest.GetTempDir())
+    self._temp_dir = tempfile.mkdtemp(dir=googletest.GetTempDir())
+
+    self._num_cols = [4, 64, 256]
+    self._num_per_iter = 5000
+    self._filenames = []
+    for n in self._num_cols:
+      fn = os.path.join(self._temp_dir, 'file%d.csv' % n)
+      with open(fn, 'wb') as f:
+        # Just write 100 rows and use `repeat`... Assumes the cost
+        # of creating an iterator is not significant
+        row = ','.join([str_val for _ in range(n)])
+        f.write('\n'.join([row for _ in range(100)]))
+      self._filenames.append(fn)
+
+  def _tearDown(self):
+    gfile.DeleteRecursively(self._temp_dir)
+
+  def _runBenchmark(self, dataset, num_cols, prefix):
+    dataset = dataset.skip(self._num_per_iter - 1)
+    deltas = []
+    for _ in range(10):
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      with session.Session() as sess:
+        start = time.time()
+        # NOTE: This depends on the underlying implementation of skip, to have
+        # the net effect of calling `GetNext` num_per_iter times on the
+        # input dataset. We do it this way (instead of a python for loop, or
+        # batching N inputs in one iter) so that the overhead from session.run
+        # or batch doesn't dominate. If we eventually optimize skip, this has
+        # to change.
+        sess.run(next_element)
+        end = time.time()
+      deltas.append(end - start)
+    # Median wall time per CSV record read and decoded
+    median_wall_time = np.median(deltas) / self._num_per_iter
+    print('%s num_cols: %d Median wall time: %f' % (prefix, num_cols,
+                                                    median_wall_time))
+    self.report_benchmark(
+        iters=self._num_per_iter,
+        wall_time=median_wall_time,
+        name='%s_with_cols_%d' % (prefix, num_cols))
+
+  def benchmarkMapWithFloats(self):
+    self._setUp(self.FLOAT_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [[0.0]] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = dataset.map(lambda l: parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_float_map_decode_csv')
+    self._tearDown()
+
+  def benchmarkMapWithStrings(self):
+    self._setUp(self.STR_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [['']] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = dataset.map(lambda l: parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_strings_map_decode_csv')
+    self._tearDown()
+
+  def benchmarkCsvDatasetWithFloats(self):
+    self._setUp(self.FLOAT_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [[0.0]] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_float_fused_dataset')
+    self._tearDown()
+
+  def benchmarkCsvDatasetWithStrings(self):
+    self._setUp(self.STR_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [['']] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_strings_fused_dataset')
+    self._tearDown()
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..b17f2bcd12b2b78c97e7c390d919331ac4ef5386
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
@@ -0,0 +1,226 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.experimental.map_and_batch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import hashlib
+import itertools
+import time
+
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+_NUMPY_RANDOM_SEED = 42
+
+
+class MapAndBatchBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.experimental.map_and_batch()`."""
+
+  def benchmarkMapAndBatchDense(self):
+    """Measures the performance of parallelized batching."""
+    shapes = [(), (10,), (10, 10), (10, 10, 10), (224, 224, 3)]
+    batch_size_values = [1, 32, 64, 128, 1024]
+
+    shape_placeholder = array_ops.placeholder(dtypes.int64, shape=[None])
+    batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+
+    dataset = dataset_ops.Dataset.range(1000000000)
+
+    dense_value = random_ops.random_normal(shape=shape_placeholder)
+
+    dataset = dataset.apply(batching.map_and_batch(
+        lambda _: dense_value, batch_size_placeholder))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    for shape in shapes:
+      for batch_size in batch_size_values:
+
+        with session.Session() as sess:
+          sess.run(iterator.initializer, feed_dict={
+              shape_placeholder: shape, batch_size_placeholder: batch_size})
+
+          # Use a C++ callable to minimize the Python overhead in the benchmark.
+          callable_opts = config_pb2.CallableOptions()
+          callable_opts.target.append(next_element.op.name)
+          op_callable = sess._make_callable_from_options(callable_opts)  # pylint: disable=protected-access
+
+          # Run five steps to warm up the session caches before taking the
+          # first measurement.
+          for _ in range(5):
+            op_callable()
+          deltas = []
+          overall_start = time.time()
+          # Run at least five repetitions and for at least five seconds.
+          while len(deltas) < 5 or time.time() - overall_start < 5.0:
+            start = time.time()
+            for _ in range(100):
+              op_callable()
+            end = time.time()
+            deltas.append(end - start)
+          del op_callable
+
+        median_wall_time = np.median(deltas) / 100.0
+        iters = len(deltas) * 100
+
+        print("Map and batch dense dataset shape: %r batch_size: %d "
+              "wall time: %f (%d iters)"
+              % (shape, batch_size, median_wall_time, iters))
+        self.report_benchmark(
+            iters=iters, wall_time=median_wall_time,
+            name="benchmark_batch_dense_dataset_nnz_%d_batch_size_%d" % (
+                np.prod(shape), batch_size))
+
+  def benchmarkMapAndBatchChainingVersusFusing(self):
+    """Compares the performance of chaining and fusing map and batch.
+
+    NOTE: It is recommended to build the benchmark with
+    `-c opt --copt=-mavx --copt=-mavx2 --copt=-mfma --copt=-gmlt`
+    and execute it on a machine with at least 32 CPU cores.
+    """
+
+    # Sequential pipeline configurations.
+    seq_elem_size_series = itertools.product([1], [1], [1, 2, 4, 8], [16])
+    seq_batch_size_series = itertools.product([1], [1], [1], [8, 16, 32, 64])
+
+    # Parallel pipeline configuration.
+    par_elem_size_series = itertools.product([32], [32], [1, 2, 4, 8], [256])
+    par_batch_size_series = itertools.product([32], [32], [1],
+                                              [128, 256, 512, 1024])
+    par_num_calls_series = itertools.product([8, 16, 32, 64], [32], [1], [512])
+    par_inter_op_series = itertools.product([32], [8, 16, 32, 64], [1], [512])
+
+    def name(method, label, num_calls, inter_op, element_size, batch_size):
+      return ("%s_id_%s_num_calls_%d_inter_op_%d_elem_size_%d_batch_size_%d" % (
+          method,
+          hashlib.sha1(label).hexdigest()[:8],
+          num_calls,
+          inter_op,
+          element_size,
+          batch_size,
+      ))
+
+    def benchmark(label, series):
+      """Runs benchmark the given series."""
+
+      print("%s:" % label)
+
+      def make_base_dataset(element_size):
+        k = 1024 * 1024
+        x = constant_op.constant(np.random.rand(element_size, 4 * k))
+        y = constant_op.constant(np.random.rand(4 * k, 1))
+        return dataset_ops.Dataset.range(1000000000000).map(lambda _: (x, y))
+
+      for num_calls, inter_op, element_size, batch_size in series:
+
+        num_iters = 1024 // (
+            (element_size * batch_size) // min(num_calls, inter_op))
+        fused_dataset = make_base_dataset(element_size)
+        fused_dataset = fused_dataset.map(
+            math_ops.matmul,
+            num_parallel_calls=num_calls).batch(batch_size=batch_size)
+
+        fused_iterator = dataset_ops.make_one_shot_iterator(fused_dataset)
+        fused_get_next = fused_iterator.get_next()
+
+        fused_deltas = []
+        with session.Session(
+            config=config_pb2.ConfigProto(
+                inter_op_parallelism_threads=inter_op,
+                use_per_session_threads=True)) as sess:
+
+          for _ in range(5):
+            sess.run(fused_get_next.op)
+          for _ in range(num_iters):
+            start = time.time()
+            sess.run(fused_get_next.op)
+            end = time.time()
+            fused_deltas.append(end - start)
+
+        # `map_and_batch_fusion` is optimized by default. To get the chained
+        # dataset, with have to disable it.
+        options = dataset_ops.Options()
+        options.experimental_optimization = OptimizationOptions()
+        options.experimental_optimization.map_and_batch_fusion = False
+        chained_dataset = fused_dataset.with_options(options)
+        chained_iterator = dataset_ops.make_one_shot_iterator(chained_dataset)
+        chained_get_next = chained_iterator.get_next()
+
+        chained_deltas = []
+        with session.Session(
+            config=config_pb2.ConfigProto(
+                inter_op_parallelism_threads=inter_op,
+                use_per_session_threads=True)) as sess:
+          for _ in range(5):
+            sess.run(chained_get_next.op)
+          for _ in range(num_iters):
+            start = time.time()
+            sess.run(chained_get_next.op)
+            end = time.time()
+            chained_deltas.append(end - start)
+
+        print(
+            "batch size: %d, num parallel calls: %d, inter-op parallelism: %d, "
+            "element size: %d, num iters: %d\nchained wall time: %f (median), "
+            "%f (mean), %f (stddev), %f (min), %f (max)\n  fused wall time: "
+            "%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n    "
+            "chained/fused:    %.2fx (median),    %.2fx (mean)" %
+            (batch_size, num_calls, inter_op, element_size, num_iters,
+             np.median(chained_deltas), np.mean(chained_deltas),
+             np.std(chained_deltas), np.min(chained_deltas),
+             np.max(chained_deltas), np.median(fused_deltas),
+             np.mean(fused_deltas), np.std(fused_deltas), np.min(fused_deltas),
+             np.max(fused_deltas),
+             np.median(chained_deltas) / np.median(fused_deltas),
+             np.mean(chained_deltas) / np.mean(fused_deltas)))
+
+        self.report_benchmark(
+            iters=num_iters,
+            wall_time=np.median(chained_deltas),
+            name=name("chained", label, num_calls, inter_op, element_size,
+                      batch_size))
+
+        self.report_benchmark(
+            iters=num_iters,
+            wall_time=np.median(fused_deltas),
+            name=name("fused", label, num_calls, inter_op, element_size,
+                      batch_size))
+
+      print()
+
+    np.random.seed(_NUMPY_RANDOM_SEED)
+    benchmark("Sequential element size evaluation", seq_elem_size_series)
+    benchmark("Sequential batch size evaluation", seq_batch_size_series)
+    benchmark("Parallel element size evaluation", par_elem_size_series)
+    benchmark("Parallel batch size evaluation", par_batch_size_series)
+    benchmark("Transformation parallelism evaluation", par_num_calls_series)
+    benchmark("Threadpool size evaluation", par_inter_op_series)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/map_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_benchmark.py
deleted file mode 100644
index ad253cffa568b3abe367b661b409348ce9f56fa1..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/benchmarks/map_benchmark.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import hashlib
-import itertools
-import time
-
-import numpy as np
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-_NUMPY_RANDOM_SEED = 42
-
-
-class MapDatasetBenchmark(test.Benchmark):
-
-  # The purpose of this benchmark is to compare the performance of chaining vs
-  # fusing of the map and batch transformations across various configurations.
-  #
-  # NOTE: It is recommended to build the benchmark with
-  # `-c opt --copt=-mavx --copt=-mavx2 --copt=-mfma --copt=-gmlt`
-  # and execute it on a machine with at least 32 CPU cores.
-  def benchmarkMapAndBatch(self):
-
-    # Sequential pipeline configurations.
-    seq_elem_size_series = itertools.product([1], [1], [1, 2, 4, 8], [16])
-    seq_batch_size_series = itertools.product([1], [1], [1], [8, 16, 32, 64])
-
-    # Parallel pipeline configuration.
-    par_elem_size_series = itertools.product([32], [32], [1, 2, 4, 8], [256])
-    par_batch_size_series = itertools.product([32], [32], [1],
-                                              [128, 256, 512, 1024])
-    par_num_calls_series = itertools.product([8, 16, 32, 64], [32], [1], [512])
-    par_inter_op_series = itertools.product([32], [8, 16, 32, 64], [1], [512])
-
-    def name(method, label, num_calls, inter_op, element_size, batch_size):
-      return ("%s_id_%s_num_calls_%d_inter_op_%d_elem_size_%d_batch_size_%d" % (
-          method,
-          hashlib.sha1(label).hexdigest(),
-          num_calls,
-          inter_op,
-          element_size,
-          batch_size,
-      ))
-
-    def benchmark(label, series):
-
-      print("%s:" % label)
-      for num_calls, inter_op, element_size, batch_size in series:
-
-        num_iters = 1024 // (
-            (element_size * batch_size) // min(num_calls, inter_op))
-        k = 1024 * 1024
-        dataset = dataset_ops.Dataset.from_tensors((np.random.rand(
-            element_size, 4 * k), np.random.rand(4 * k, 1))).repeat()
-
-        chained_dataset = dataset.map(
-            math_ops.matmul,
-            num_parallel_calls=num_calls).batch(batch_size=batch_size)
-        chained_iterator = chained_dataset.make_one_shot_iterator()
-        chained_get_next = chained_iterator.get_next()
-
-        chained_deltas = []
-        with session.Session(
-            config=config_pb2.ConfigProto(
-                inter_op_parallelism_threads=inter_op,
-                use_per_session_threads=True)) as sess:
-          for _ in range(5):
-            sess.run(chained_get_next.op)
-          for _ in range(num_iters):
-            start = time.time()
-            sess.run(chained_get_next.op)
-            end = time.time()
-            chained_deltas.append(end - start)
-
-        fused_dataset = dataset.apply(
-            batching.map_and_batch(
-                math_ops.matmul,
-                num_parallel_calls=num_calls,
-                batch_size=batch_size))
-        fused_iterator = fused_dataset.make_one_shot_iterator()
-        fused_get_next = fused_iterator.get_next()
-
-        fused_deltas = []
-        with session.Session(
-            config=config_pb2.ConfigProto(
-                inter_op_parallelism_threads=inter_op,
-                use_per_session_threads=True)) as sess:
-
-          for _ in range(5):
-            sess.run(fused_get_next.op)
-          for _ in range(num_iters):
-            start = time.time()
-            sess.run(fused_get_next.op)
-            end = time.time()
-            fused_deltas.append(end - start)
-
-        print(
-            "batch size: %d, num parallel calls: %d, inter-op parallelism: %d, "
-            "element size: %d, num iters: %d\nchained wall time: %f (median), "
-            "%f (mean), %f (stddev), %f (min), %f (max)\n  fused wall time: "
-            "%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n    "
-            "chained/fused:    %.2fx (median),    %.2fx (mean)" %
-            (batch_size, num_calls, inter_op, element_size, num_iters,
-             np.median(chained_deltas), np.mean(chained_deltas),
-             np.std(chained_deltas), np.min(chained_deltas),
-             np.max(chained_deltas), np.median(fused_deltas),
-             np.mean(fused_deltas), np.std(fused_deltas), np.min(fused_deltas),
-             np.max(fused_deltas),
-             np.median(chained_deltas) / np.median(fused_deltas),
-             np.mean(chained_deltas) / np.mean(fused_deltas)))
-
-        self.report_benchmark(
-            iters=num_iters,
-            wall_time=np.median(chained_deltas),
-            name=name("chained", label, num_calls, inter_op, element_size,
-                      batch_size))
-
-        self.report_benchmark(
-            iters=num_iters,
-            wall_time=np.median(fused_deltas),
-            name=name("fused", label, num_calls, inter_op, element_size,
-                      batch_size))
-
-      print("")
-
-    np.random.seed(_NUMPY_RANDOM_SEED)
-    benchmark("Sequential element size evaluation", seq_elem_size_series)
-    benchmark("Sequential batch size evaluation", seq_batch_size_series)
-    benchmark("Parallel element size evaluation", par_elem_size_series)
-    benchmark("Parallel batch size evaluation", par_batch_size_series)
-    benchmark("Transformation parallelism evaluation", par_num_calls_series)
-    benchmark("Threadpool size evaluation", par_inter_op_series)
-
-  # This benchmark compares the performance of pipeline with multiple chained
-  # maps with and without map fusion.
-  def benchmarkChainOfMaps(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      self._benchmarkChainOfMaps(chain_length, False)
-      self._benchmarkChainOfMaps(chain_length, True)
-
-  def _benchmarkChainOfMaps(self, chain_length, optimize_dataset):
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
-      for _ in range(chain_length):
-        dataset = dataset.map(lambda x: x)
-      if optimize_dataset:
-        dataset = dataset.apply(optimization.optimize(["map_fusion"]))
-
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(5):
-          sess.run(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            sess.run(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        opt_mark = "opt" if optimize_dataset else "no-opt"
-        print("Map dataset {} chain length: {} Median wall time: {}".format(
-            opt_mark, chain_length, median_wall_time))
-        self.report_benchmark(
-            iters=1000,
-            wall_time=median_wall_time,
-            name="benchmark_map_dataset_chain_latency_{}_{}".format(
-                opt_mark, chain_length))
-
-
-class MapAndFilterBenchmark(test.Benchmark):
-
-  # This benchmark compares the performance of pipeline with multiple chained
-  # map + filter with and without map fusion.
-  def benchmarkMapAndFilter(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      self._benchmarkMapAndFilter(chain_length, False)
-      self._benchmarkMapAndFilter(chain_length, True)
-
-  def _benchmarkMapAndFilter(self, chain_length, optimize_dataset):
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
-      for _ in range(chain_length):
-        dataset = dataset.map(lambda x: x + 5).filter(
-            lambda x: math_ops.greater_equal(x - 5, 0))
-      if optimize_dataset:
-        dataset = dataset.apply(
-            optimization.optimize(["map_and_filter_fusion"]))
-
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(10):
-          sess.run(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            sess.run(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        opt_mark = "opt" if optimize_dataset else "no-opt"
-        print("Map and filter dataset {} chain length: {} Median wall time: {}".
-              format(opt_mark, chain_length, median_wall_time))
-        self.report_benchmark(
-            iters=1000,
-            wall_time=median_wall_time,
-            name="benchmark_map_and_filter_dataset_chain_latency_{}_{}".format(
-                opt_mark, chain_length))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a60ba0a857ee18e88e912fc25000a479e4a86e72
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
@@ -0,0 +1,205 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for the `MapVectorization` optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import optimization_options
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+
+
+def _generate_csv_test_case():
+  """Generates a `decode_csv()` test case."""
+
+  def csv_factory():
+    return dataset_ops.Dataset.from_tensor_slices(["1.0:2:a",
+                                                   "2.4:5:c"]).repeat(5)
+
+  def decode_csv_fn(x):
+    return parsing_ops.decode_csv(
+        x,
+        record_defaults=[
+            constant_op.constant([], dtypes.float32),
+            constant_op.constant([], dtypes.int32),
+            constant_op.constant([], dtypes.string)
+        ],
+        field_delim=":")
+
+  return decode_csv_fn, csv_factory
+
+
+def _generate_parse_single_example_test_case():
+  """Generates a `parse_single_example()` test case."""
+
+  def parse_example_factory():
+    """Parse example factory."""
+
+    def _int64_feature(*values):
+      return feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=values))
+
+    def _bytes_feature(*values):
+      return feature_pb2.Feature(
+          bytes_list=feature_pb2.BytesList(
+              value=[v.encode("utf-8") for v in values]))
+
+    return dataset_ops.Dataset.from_tensor_slices(
+        constant_op.constant([
+            example_pb2.Example(
+                features=feature_pb2.Features(
+                    feature={
+                        "dense_int": _int64_feature(i),
+                        "dense_str": _bytes_feature(str(i)),
+                        "sparse_int": _int64_feature(i, i * 2, i * 4, i * 8),
+                        "sparse_str": _bytes_feature(*["abc"] * i)
+                    })).SerializeToString() for i in range(10)
+        ]))
+
+  def parse_single_example_fn(x):
+    features = {
+        "dense_int": parsing_ops.FixedLenFeature((), dtypes.int64, 0),
+        "dense_str": parsing_ops.FixedLenFeature((), dtypes.string, ""),
+        "sparse_int": parsing_ops.VarLenFeature(dtypes.int64),
+        "sparse_str": parsing_ops.VarLenFeature(dtypes.string),
+    }
+    return parsing_ops.parse_single_example(x, features)
+
+  return parse_single_example_fn, parse_example_factory
+
+
+# TODO(rachelim): Add a benchmark for more expensive transformations, such as
+# vgg_preprocessing.
+class MapVectorizationBenchmark(test.Benchmark):
+  """Benchmarks for the `MapVectorization` optimization."""
+
+  def _run(self, x, num_iters=100, name=None):
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        # Warm up session...
+        sess.run(x)
+      for _ in range(num_iters):
+        start = time.time()
+        sess.run(x)
+        end = time.time()
+        deltas.append(end - start)
+    median_time = np.median(deltas)
+    self.report_benchmark(iters=num_iters, wall_time=median_time, name=name)
+    return median_time
+
+  def _compare(self, input_dataset, map_fn, batch_size, input_size, str_id):
+    num_elems = int(np.sum([np.prod(x) for x in input_size]))
+    name_template = "{}__batch_size_{}_input_element_size_{}_{}"
+
+    base_dataset = input_dataset.map(map_fn).batch(batch_size)
+
+    options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    # Disable default map_and_batch_fusion optimization
+    opt_options.map_and_batch_fusion = False
+    options.experimental_optimization = opt_options
+    base_dataset = base_dataset.with_options(options)
+
+    unoptimized_op = dataset_ops.make_one_shot_iterator(base_dataset).get_next()
+
+    optimized_options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.map_vectorization = True
+    optimized_options.experimental_optimization = opt_options
+    optimized = base_dataset.with_options(optimized_options)
+    optimized_op = dataset_ops.make_one_shot_iterator(optimized).get_next()
+
+    unoptimized_time = self._run(
+        unoptimized_op,
+        name=name_template.format(str_id, batch_size, num_elems, "unoptimized"))
+    optimized_time = self._run(
+        optimized_op,
+        name=name_template.format(str_id, batch_size, num_elems, "optimized"))
+
+    print("Batch size: {}\n"
+          "Input element size: {}\n"
+          "Transformation: {}\n"
+          "Speedup: {}\n".format(batch_size, input_size, str_id,
+                                 (unoptimized_time / optimized_time)))
+
+  # Known cheap functions
+  def benchmarkIdentity(self):
+    self._benchmark_helper(lambda *args: [array_ops.identity(x) for x in args],
+                           "identity")
+
+  def benchmarkAddConst(self):
+    self._benchmark_helper(lambda *args: [x + 1 for x in args], "add_const")
+
+  def benchmarkReturnConst(self):
+    self._benchmark_helper(lambda *args: [constant_op.constant(2)], "ret_const")
+
+  def benchmarkSelect(self):
+    self._benchmark_helper(lambda *args: args[0], "select")
+
+  def benchmarkCast(self):
+    self._benchmark_helper(
+        lambda *args: [math_ops.cast(x, dtypes.float64) for x in args], "cast")
+
+  def benchmarkReshape(self):
+    self._benchmark_helper(
+        lambda *args: [array_ops.reshape(x, (-1, 30)) for x in args], "reshape")
+
+  def benchmarkDecodeCSV(self):
+    csv_fn, csv_factory = _generate_csv_test_case()
+    self._benchmark_helper(csv_fn, "decode_csv", lambda: [csv_factory()])
+
+  def benchmarkParseSingleExample(self):
+    # NOTE: Since we haven't implemented a vectorizer for "SerializeSparse",
+    # this function is only naively vectorized.
+    parse_fn, parse_factory = _generate_parse_single_example_test_case()
+
+    self._benchmark_helper(parse_fn, "parse_single_example",
+                           lambda: [parse_factory()])
+
+  def _default_dataset_factory(self):
+    input_sizes = [(10, 10, 3), (10, 100, 300)]
+    for sz in input_sizes:
+      yield dataset_ops.Dataset.from_tensor_slices(np.random.rand(*sz))
+
+  def _benchmark_helper(self, map_fn, str_id, base_dataset_factory=None):
+    if base_dataset_factory is None:
+      base_dataset_factory = self._default_dataset_factory
+
+    batch_size = 1000
+    for base_dataset in base_dataset_factory():
+      base_dataset = base_dataset.repeat()
+      input_size = [
+          tuple(shape.as_list())
+          for shape in nest.flatten(base_dataset.output_shapes)
+      ]
+      self._compare(base_dataset, map_fn, batch_size, input_size, str_id)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c53f8dd7c537fecbfcd551e2a4809aaf5447ff46
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmark for the experimental `MatchingFilesDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import matching_files
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class MatchingFilesBenchmark(test.Benchmark):
+  """Benchmark for the experimental `MatchingFilesDataset`."""
+
+  def benchmarkNestedDirectories(self):
+    tmp_dir = tempfile.mkdtemp()
+    width = 500
+    depth = 10
+    for i in range(width):
+      for j in range(depth):
+        new_base = os.path.join(tmp_dir, str(i),
+                                *[str(dir_name) for dir_name in range(j)])
+        os.makedirs(new_base)
+        child_files = ['a.py', 'b.pyc'] if j < depth - 1 else ['c.txt', 'd.log']
+        for f in child_files:
+          filename = os.path.join(new_base, f)
+          open(filename, 'w').close()
+
+    patterns = [
+        os.path.join(tmp_dir, os.path.join(*['**'
+                                             for _ in range(depth)]), suffix)
+        for suffix in ['*.txt', '*.log']
+    ]
+
+    deltas = []
+    iters = 3
+    for _ in range(iters):
+      with ops.Graph().as_default():
+        dataset = matching_files.MatchingFilesDataset(patterns)
+        next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+
+        with session.Session() as sess:
+          sub_deltas = []
+          while True:
+            try:
+              start = time.time()
+              sess.run(next_element)
+              end = time.time()
+              sub_deltas.append(end - start)
+            except errors.OutOfRangeError:
+              break
+          deltas.append(sub_deltas)
+
+    median_deltas = np.median(deltas, axis=0)
+    print('Nested directory size (width*depth): %d*%d Median wall time: '
+          '%fs (read first filename), %fs (read second filename), avg %fs'
+          ' (read %d more filenames)' %
+          (width, depth, median_deltas[0], median_deltas[1],
+           np.average(median_deltas[2:]), len(median_deltas) - 2))
+    self.report_benchmark(
+        iters=iters,
+        wall_time=np.sum(median_deltas),
+        extras={
+            'read first file:':
+                median_deltas[0],
+            'read second file:':
+                median_deltas[1],
+            'avg time for reading %d more filenames:' %
+            (len(median_deltas) - 2):
+                np.average(median_deltas[2:])
+        },
+        name='dataset_nested_directory(%d*%d)' %
+        (width, depth))
+
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f9b89111fcda9230062a4aa7d3477df5d2f36a5
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
@@ -0,0 +1,120 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for static optimizations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class OptimizationBenchmark(test.Benchmark):
+  """Benchmarks for static optimizations."""
+
+  def benchmarkMapFusion(self):
+    """Evaluates performance map of fusion."""
+
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      self._benchmarkMapFusion(chain_length, False)
+      self._benchmarkMapFusion(chain_length, True)
+
+  def _benchmarkMapFusion(self, chain_length, optimize_dataset):
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset.map(lambda x: x)
+      if optimize_dataset:
+        options = dataset_ops.Options()
+        options.experimental_map_fusion = True
+        dataset = dataset.with_options(options)
+
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(5):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        opt_mark = "opt" if optimize_dataset else "noopt"
+        print("Map dataset {} chain length: {} Median wall time: {}".format(
+            opt_mark, chain_length, median_wall_time))
+        self.report_benchmark(
+            iters=100,
+            wall_time=median_wall_time,
+            name="map_fusion_{}_chain_length_{}".format(
+                opt_mark, chain_length))
+
+  def benchmarkMapAndFilterFusion(self):
+    """Evaluates performance map of fusion."""
+
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      self._benchmarkMapAndFilterFusion(chain_length, False)
+      self._benchmarkMapAndFilterFusion(chain_length, True)
+
+  def _benchmarkMapAndFilterFusion(self, chain_length, optimize_dataset):
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset.map(lambda x: x + 5).filter(
+            lambda x: math_ops.greater_equal(x - 5, 0))
+      if optimize_dataset:
+        options = dataset_ops.Options()
+        options.experimental_map_and_filter_fusion = True
+        dataset = dataset.with_options(options)
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(10):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        opt_mark = "opt" if optimize_dataset else "noopt"
+        print("Map and filter dataset {} chain length: {} Median wall time: {}"
+              .format(opt_mark, chain_length, median_wall_time))
+        self.report_benchmark(
+            iters=100,
+            wall_time=median_wall_time,
+            name="map_and_filter_fusion_{}_chain_length_{}".format(
+                opt_mark, chain_length))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c36a32534dddfc29e5f0d4253508e44f9ae4a899
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
@@ -0,0 +1,107 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.unbatch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class UnbatchBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.experimental.unbatch()`."""
+
+  def benchmarkNativeUnbatch(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.apply(batching.unbatch())
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset_ops.make_initializable_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (native) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="native_batch_size_%d" %
+              batch_size)
+
+  # Include a benchmark of the previous `unbatch()` implementation that uses
+  # a composition of more primitive ops. Eventually we'd hope to generate code
+  # that is as good in both cases.
+  def benchmarkOldUnbatchImplementation(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset_ops.make_initializable_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (unfused) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="unfused_batch_size_%d" %
+              batch_size)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 4eef9580ad14d66a6ba86c1604dc1013a1fbf82e..548eb422ed06de84447494391ad9e54d9b2df0b2 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -38,6 +38,7 @@ cuda_py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python/compat:compat",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
@@ -71,15 +72,11 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:session",
         "//tensorflow/python/data/experimental/ops:error_ops",
         "//tensorflow/python/data/experimental/ops:readers",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:readers",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -152,27 +149,6 @@ py_test(
     ],
 )
 
-cuda_py_test(
-    name = "function_buffering_resource_test",
-    size = "small",
-    srcs = ["function_buffering_resource_test.py"],
-    additional_deps = [
-        "//tensorflow/python/data/experimental/ops:prefetching_ops",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-    tags = ["no_windows_gpu"],
-)
-
 py_test(
     name = "get_single_element_test",
     size = "small",
@@ -279,6 +255,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python/data/experimental/ops:readers",
         "//tensorflow/python/data/ops:readers",
@@ -331,6 +308,7 @@ py_test(
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:cond_v2",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -368,6 +346,37 @@ py_test(
     ],
 )
 
+py_test(
+    name = "matching_files_test",
+    size = "small",
+    srcs = ["matching_files_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:matching_files",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "cardinality_test",
+    srcs = ["cardinality_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/data/experimental/ops:cardinality",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "override_threadpool_test",
     size = "small",
@@ -536,6 +545,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/experimental/ops:scan_ops",
         "//tensorflow/python/data/kernel_tests:test_base",
@@ -565,6 +575,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "sleep_test",
+    srcs = ["sleep_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:sleep",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_library(
     name = "sql_dataset_test_base",
     srcs = ["sql_dataset_test_base.py"],
@@ -602,7 +625,9 @@ py_test(
     size = "medium",
     srcs = ["stats_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+    ],
     deps = [
         ":reader_dataset_ops_test_base",
         ":stats_dataset_test_base",
@@ -610,9 +635,15 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:stats_aggregator",
         "//tensorflow/python/data/experimental/ops:stats_ops",
+        "//tensorflow/python/data/experimental/ops:stats_options",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -685,3 +716,14 @@ py_test(
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
+
+cuda_py_test(
+    name = "wrap_unwrap_test",
+    size = "small",
+    srcs = ["wrap_unwrap_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
index 3903ec49b98447bc69e37107c359be748818f1f1..8264dee3c15da3e1c10751b9c3db3d1e2bc3f1ee 100644
--- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
@@ -105,14 +105,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
               boundaries,
               batch_sizes,
               no_padding=no_padding))
-      batch, = dataset.make_one_shot_iterator().get_next()
+      batch, = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
       with self.cached_session() as sess:
         batches = []
         for _ in range(4):
-          batches.append(sess.run(batch))
+          batches.append(self.evaluate(batch))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(batch)
+          self.evaluate(batch)
       batch_sizes_val = []
       lengths_val = []
       for batch in batches:
@@ -155,14 +155,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
             grouping.bucket_by_sequence_length(
                 element_len, boundaries, batch_sizes,
                 pad_to_bucket_boundary=True))
-    batch, = dataset.make_one_shot_iterator().get_next()
+    batch, = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       batches = []
       for _ in range(3):
-        batches.append(sess.run(batch))
+        batches.append(self.evaluate(batch))
       with self.assertRaisesOpError("bucket_boundaries"):
-        sess.run(batch)
+        self.evaluate(batch)
     batch_sizes_val = []
     lengths_val = []
     for batch in batches:
@@ -192,14 +192,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
             grouping.bucket_by_sequence_length(
                 element_len, boundaries, batch_sizes,
                 pad_to_bucket_boundary=True))
-    batch, = dataset.make_one_shot_iterator().get_next()
+    batch, = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       batches = []
       for _ in range(5):
-        batches.append(sess.run(batch))
+        batches.append(self.evaluate(batch))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(batch)
+        self.evaluate(batch)
 
     self.assertAllEqual(batches[0], [[1, 0],
                                      [1, 1]])
@@ -295,12 +295,12 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
 
     def _compute_batches(dataset):
       """Computes actual batch outputs of dataset and stores in a set."""
-      batch = dataset.make_one_shot_iterator().get_next()
+      batch = dataset_ops.make_one_shot_iterator(dataset).get_next()
       all_sparse_tensors = set()
       with self.cached_session() as sess:
         with self.assertRaises(errors.OutOfRangeError):
           while True:
-            output = sess.run(batch)
+            output = self.evaluate(batch)
             sprs_tensor = (tuple([tuple(idx) for idx in output.indices]),
                            tuple(output.values))
             all_sparse_tensors.add(sprs_tensor)
diff --git a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..943f0f1f81272b334f0011a301636e9927c15b7c
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
@@ -0,0 +1,158 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.cardinality()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import cardinality
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
+  """Tests for `tf.data.experimental.cardinality()`."""
+
+  @parameterized.named_parameters(
+      # pylint: disable=g-long-lambda
+      ("Batch1",
+       lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=True), 2),
+      ("Batch2",
+       lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=False), 3),
+      ("Batch3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).batch(2),
+       cardinality.UNKNOWN),
+      ("Batch4", lambda: dataset_ops.Dataset.range(5).repeat().batch(2),
+       cardinality.INFINITE),
+      ("Cache1", lambda: dataset_ops.Dataset.range(5).cache(), 5),
+      ("Cache2", lambda: dataset_ops.Dataset.range(5).cache("foo"), 5),
+      ("Concatenate1", lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5)), 10),
+      ("Concatenate2",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
+           dataset_ops.Dataset.range(5)), cardinality.UNKNOWN),
+      ("Concatenate3", lambda: dataset_ops.Dataset.range(5).repeat().
+       concatenate(dataset_ops.Dataset.range(5)),
+       cardinality.INFINITE),
+      ("Concatenate4", lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5).filter(lambda _: True)),
+       cardinality.UNKNOWN),
+      ("Concatenate5",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
+           dataset_ops.Dataset.range(5).filter(lambda _: True)),
+       cardinality.UNKNOWN),
+      ("Concatenate6", lambda: dataset_ops.Dataset.range(5).repeat().
+       concatenate(dataset_ops.Dataset.range(5).filter(lambda _: True)),
+       cardinality.INFINITE),
+      ("Concatenate7", lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
+      ("Concatenate8",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
+           dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
+      ("Concatenate9",
+       lambda: dataset_ops.Dataset.range(5).repeat().concatenate(
+           dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
+      ("FlatMap", lambda: dataset_ops.Dataset.range(5).flat_map(
+          lambda _: dataset_ops.Dataset.from_tensors(0)),
+       cardinality.UNKNOWN),
+      ("Filter", lambda: dataset_ops.Dataset.range(5).filter(lambda _: True),
+       cardinality.UNKNOWN),
+      ("FromTensors1", lambda: dataset_ops.Dataset.from_tensors(0), 1),
+      ("FromTensors2", lambda: dataset_ops.Dataset.from_tensors((0, 1)), 1),
+      ("FromTensorSlices1",
+       lambda: dataset_ops.Dataset.from_tensor_slices([0, 0, 0]), 3),
+      ("FromTensorSlices2",
+       lambda: dataset_ops.Dataset.from_tensor_slices(([0, 0, 0], [1, 1, 1])),
+       3),
+      ("Interleave1", lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0), cycle_length=1),
+       cardinality.UNKNOWN),
+      ("Interleave2", lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0),
+          cycle_length=1,
+          num_parallel_calls=1), cardinality.UNKNOWN),
+      ("Map1", lambda: dataset_ops.Dataset.range(5).map(lambda x: x), 5),
+      ("Map2", lambda: dataset_ops.Dataset.range(5).map(
+          lambda x: x, num_parallel_calls=1), 5),
+      ("PaddedBatch1", lambda: dataset_ops.Dataset.range(5).padded_batch(
+          2, [], drop_remainder=True), 2),
+      ("PaddedBatch2", lambda: dataset_ops.Dataset.range(5).padded_batch(
+          2, [], drop_remainder=False), 3),
+      ("PaddedBatch3", lambda: dataset_ops.Dataset.range(5).filter(
+          lambda _: True).padded_batch(2, []), cardinality.UNKNOWN),
+      ("PaddedBatch4",
+       lambda: dataset_ops.Dataset.range(5).repeat().padded_batch(2, []),
+       cardinality.INFINITE),
+      ("Prefetch", lambda: dataset_ops.Dataset.range(5).prefetch(buffer_size=1),
+       5),
+      ("Range1", lambda: dataset_ops.Dataset.range(0), 0),
+      ("Range2", lambda: dataset_ops.Dataset.range(5), 5),
+      ("Range3", lambda: dataset_ops.Dataset.range(5, 10), 5),
+      ("Range4", lambda: dataset_ops.Dataset.range(10, 5), 0),
+      ("Range5", lambda: dataset_ops.Dataset.range(5, 10, 2), 3),
+      ("Range6", lambda: dataset_ops.Dataset.range(10, 5, -2), 3),
+      ("Repeat1", lambda: dataset_ops.Dataset.range(0).repeat(0), 0),
+      ("Repeat2", lambda: dataset_ops.Dataset.range(1).repeat(0), 0),
+      ("Repeat3", lambda: dataset_ops.Dataset.range(0).repeat(5), 0),
+      ("Repeat4", lambda: dataset_ops.Dataset.range(1).repeat(5), 5),
+      ("Repeat5", lambda: dataset_ops.Dataset.range(0).repeat(), 0),
+      ("Repeat6", lambda: dataset_ops.Dataset.range(1).repeat(),
+       cardinality.INFINITE),
+      ("Shuffle", lambda: dataset_ops.Dataset.range(5).shuffle(buffer_size=1),
+       5),
+      ("Skip1", lambda: dataset_ops.Dataset.range(5).skip(2), 3),
+      ("Skip2", lambda: dataset_ops.Dataset.range(5).skip(8), 0),
+      ("Skip3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).skip(2),
+       cardinality.UNKNOWN),
+      ("Skip4", lambda: dataset_ops.Dataset.range(5).repeat().skip(2),
+       cardinality.INFINITE),
+      ("Take1", lambda: dataset_ops.Dataset.range(5).take(2), 2),
+      ("Take2", lambda: dataset_ops.Dataset.range(5).take(8), 5),
+      ("Take3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).take(2),
+       cardinality.UNKNOWN),
+      ("Take4", lambda: dataset_ops.Dataset.range(5).repeat().take(2), 2),
+      ("Window1", lambda: dataset_ops.Dataset.range(5).window(
+          size=2, shift=2, drop_remainder=True), 2),
+      ("Window2", lambda: dataset_ops.Dataset.range(5).window(
+          size=2, shift=2, drop_remainder=False), 3),
+      ("Zip1", lambda: dataset_ops.Dataset.zip(dataset_ops.Dataset.range(5)),
+       5),
+      ("Zip2", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5), dataset_ops.Dataset.range(3))), 3),
+      ("Zip3", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5),
+           dataset_ops.Dataset.range(3).repeat())), 5),
+      ("Zip4", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5).repeat(),
+           dataset_ops.Dataset.range(3).repeat())), cardinality.INFINITE),
+      ("Zip5", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5),
+           dataset_ops.Dataset.range(3).filter(lambda _: True))),
+       cardinality.UNKNOWN),
+      # pylint: enable=g-long-lambda
+  )
+  def testNumElements(self, dataset_fn, expected_result):
+    with self.cached_session() as sess:
+      self.assertEqual(
+          sess.run(cardinality.cardinality(dataset_fn())), expected_result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
index adfacf1c9f856e08d6bc60f1197391e0d57765bb..b8166fe8334a5117005b7194cd582287eac74dd7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
@@ -28,18 +28,21 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat as util_compat
 
 
 class CopyToDeviceTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testCopyToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -53,19 +56,20 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToDeviceInt32(self):
     host_dataset = dataset_ops.Dataset.from_tensors([0, 1, 2, 3])
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -79,18 +83,19 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual((4,), next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      self.assertAllEqual([0, 1, 2, 3], sess.run(next_element))
+    with self.test_session(config=worker_config):
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToSameDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:0"))
 
     with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -104,19 +109,20 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToDeviceWithPrefetch(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -130,19 +136,20 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyDictToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -156,19 +163,20 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element["a"].shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
+        self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyDictToDeviceWithPrefetch(self):
     host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -182,12 +190,13 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element["a"].shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
+        self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopySparseTensorsToDevice(self):
 
     def make_tensor(i):
@@ -200,7 +209,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -213,15 +222,16 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(dtypes.int64, next_element.dtype)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        actual = sess.run(next_element)
+        actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
         self.assertAllEqual([[0, 0]], actual.indices)
         self.assertAllEqual([2, 2], actual.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopySparseTensorsToDeviceWithPrefetch(self):
 
     def make_tensor(i):
@@ -234,7 +244,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -247,14 +257,14 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(dtypes.int64, next_element.dtype)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        actual = sess.run(next_element)
+        actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
         self.assertAllEqual([[0, 0]], actual.indices)
         self.assertAllEqual([2, 2], actual.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpu(self):
     if not test_util.is_gpu_available():
@@ -265,15 +275,16 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuWithPrefetch(self):
     if not test_util.is_gpu_available():
@@ -284,15 +295,53 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0")).prefetch(1)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
+
+  def testCopyToDeviceGpuWithMap(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    def generator():
+      for i in range(10):
+        yield i, float(i), str(i)
+
+    host_dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=(dtypes.int32, dtypes.float32, dtypes.string))
+    device_dataset = host_dataset.apply(
+        prefetching_ops.copy_to_device("/gpu:0"))
+
+    def gpu_map_func(x, y, z):
+      return math_ops.square(x), math_ops.square(y), z
+
+    device_dataset = device_dataset.apply(
+        prefetching_ops.map_on_gpu(gpu_map_func))
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    device_dataset = device_dataset.with_options(options)
+
+    with ops.device("/gpu:0"):
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
+      next_element = iterator.get_next()
+
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      for i in range(10):
+        x, y, z = self.evaluate(next_element)
+        self.assertEqual(i**2, x)
+        self.assertEqual(float(i**2), y)
+        self.assertEqual(util_compat.as_bytes(str(i)), z)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuInt32(self):
     if not test_util.is_gpu_available():
@@ -303,14 +352,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([0, 1, 2, 3], sess.run(next_element))
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuInt32AndPrefetch(self):
     if not test_util.is_gpu_available():
@@ -321,14 +371,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0")).prefetch(1)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([0, 1, 2, 3], sess.run(next_element))
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuStrings(self):
     if not test_util.is_gpu_available():
@@ -339,14 +390,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([b"a", b"b", b"c"], sess.run(next_element))
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([b"a", b"b", b"c"], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuStringsAndPrefetch(self):
     if not test_util.is_gpu_available():
@@ -357,14 +409,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([b"a", b"b", b"c"], sess.run(next_element))
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([b"a", b"b", b"c"], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDevicePingPongCPUGPU(self):
     if not test_util.is_gpu_available():
@@ -378,23 +431,25 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
           prefetching_ops.copy_to_device("/cpu:0", source_device="/gpu:0"))
 
       with ops.device("/cpu:0"):
-        iterator = back_to_cpu_dataset.make_initializable_iterator()
+        iterator = dataset_ops.make_initializable_iterator(back_to_cpu_dataset)
         next_element = iterator.get_next()
 
-      with self.cached_session() as sess:
-        sess.run(iterator.initializer)
+      with self.cached_session(
+          config=config_pb2.ConfigProto(allow_soft_placement=False)):
+        self.evaluate(iterator.initializer)
         for i in range(10):
-          self.assertEqual(i, sess.run(next_element))
+          self.assertEqual(i, self.evaluate(next_element))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToDeviceWithReInit(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -408,23 +463,24 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
+    with self.test_session(config=worker_config):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToDeviceWithReInitAndPrefetch(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -438,15 +494,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
+    with self.test_session(config=worker_config):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuWithReInit(self):
     if not test_util.is_gpu_available():
@@ -457,18 +513,19 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuWithReInitAndPrefetch(self):
     if not test_util.is_gpu_available():
@@ -479,18 +536,19 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0")).prefetch(1)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testIteratorGetNextAsOptionalOnGPU(self):
     if not test_util.is_gpu_available():
@@ -500,33 +558,35 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/gpu:0"))
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_elem = iterator_ops.get_next_as_optional(iterator)
       elem_has_value_t = next_elem.has_value()
       elem_value_t = next_elem.get_value()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       # Before initializing the iterator, evaluating the optional fails with
       # a FailedPreconditionError.
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(elem_has_value_t)
+        self.evaluate(elem_has_value_t)
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(elem_value_t)
+        self.evaluate(elem_value_t)
 
       # For each element of the dataset, assert that the optional evaluates to
       # the expected value.
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(3):
-        elem_has_value, elem_value = sess.run([elem_has_value_t, elem_value_t])
+        elem_has_value, elem_value = self.evaluate(
+            [elem_has_value_t, elem_value_t])
         self.assertTrue(elem_has_value)
         self.assertEqual(i, elem_value)
 
       # After exhausting the iterator, `next_elem.has_value()` will evaluate to
       # false, and attempting to get the value will fail.
       for _ in range(2):
-        self.assertFalse(sess.run(elem_has_value_t))
+        self.assertFalse(self.evaluate(elem_has_value_t))
         with self.assertRaises(errors.InvalidArgumentError):
-          sess.run(elem_value_t)
+          self.evaluate(elem_value_t)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
index 4e114ac47914f89666f35a9fbc3c4a0099f0e6b1..49e1f2272b7bea8f2d245d678711a3879774ba06 100644
--- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
@@ -19,32 +19,35 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import counter
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class CounterTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testCounter(self):
     """Test dataset construction using `count`."""
-    iterator = (counter.Counter(start=3, step=4)
-                .make_one_shot_iterator())
+    iterator = dataset_ops.make_one_shot_iterator(
+        counter.Counter(start=3, step=4))
     get_next = iterator.get_next()
     self.assertEqual([], get_next.shape.as_list())
     self.assertEqual(dtypes.int64, get_next.dtype)
 
-    negative_iterator = (counter.Counter(start=0, step=-1)
-                         .make_one_shot_iterator())
+    negative_iterator = dataset_ops.make_one_shot_iterator(
+        counter.Counter(start=0, step=-1))
     negative_get_next = negative_iterator.get_next()
 
     with self.cached_session() as sess:
-      self.assertEqual(3, sess.run(get_next))
-      self.assertEqual(3 + 4, sess.run(get_next))
-      self.assertEqual(3 + 2 * 4, sess.run(get_next))
+      self.assertEqual(3, self.evaluate(get_next))
+      self.assertEqual(3 + 4, self.evaluate(get_next))
+      self.assertEqual(3 + 2 * 4, self.evaluate(get_next))
 
-      self.assertEqual(0, sess.run(negative_get_next))
-      self.assertEqual(-1, sess.run(negative_get_next))
-      self.assertEqual(-2, sess.run(negative_get_next))
+      self.assertEqual(0, self.evaluate(negative_get_next))
+      self.assertEqual(-1, self.evaluate(negative_get_next))
+      self.assertEqual(-2, self.evaluate(negative_get_next))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
index fb75be1fbcf1478994d25ea8b1084c6883adbf8d..b2f1b43ecf6f82725143c95af4d6f4df58e41903 100644
--- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
@@ -20,14 +20,8 @@ from __future__ import print_function
 
 import gzip
 import os
-import string
-import tempfile
-import time
 import zlib
 
-import numpy as np
-
-from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
@@ -38,8 +32,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
 
 
@@ -537,96 +529,5 @@ class CsvDatasetTest(test_base.DatasetTestBase):
           record_defaults=record_defaults)
 
 
-class CsvDatasetBenchmark(test.Benchmark):
-  """Benchmarks for the various ways of creating a dataset from CSV files.
-  """
-  FLOAT_VAL = '1.23456E12'
-  STR_VAL = string.ascii_letters * 10
-
-  def _setUp(self, str_val):
-    # Since this isn't test.TestCase, have to manually create a test dir
-    gfile.MakeDirs(googletest.GetTempDir())
-    self._temp_dir = tempfile.mkdtemp(dir=googletest.GetTempDir())
-
-    self._num_cols = [4, 64, 256]
-    self._num_per_iter = 5000
-    self._filenames = []
-    for n in self._num_cols:
-      fn = os.path.join(self._temp_dir, 'file%d.csv' % n)
-      with open(fn, 'wb') as f:
-        # Just write 100 rows and use `repeat`... Assumes the cost
-        # of creating an iterator is not significant
-        row = ','.join([str_val for _ in range(n)])
-        f.write('\n'.join([row for _ in range(100)]))
-      self._filenames.append(fn)
-
-  def _tearDown(self):
-    gfile.DeleteRecursively(self._temp_dir)
-
-  def _runBenchmark(self, dataset, num_cols, prefix):
-    dataset = dataset.skip(self._num_per_iter - 1)
-    deltas = []
-    for _ in range(10):
-      next_element = dataset.make_one_shot_iterator().get_next()
-      with session.Session() as sess:
-        start = time.time()
-        # NOTE: This depends on the underlying implementation of skip, to have
-        # the net effect of calling `GetNext` num_per_iter times on the
-        # input dataset. We do it this way (instead of a python for loop, or
-        # batching N inputs in one iter) so that the overhead from session.run
-        # or batch doesn't dominate. If we eventually optimize skip, this has
-        # to change.
-        sess.run(next_element)
-        end = time.time()
-      deltas.append(end - start)
-    # Median wall time per CSV record read and decoded
-    median_wall_time = np.median(deltas) / self._num_per_iter
-    print('%s num_cols: %d Median wall time: %f' % (prefix, num_cols,
-                                                    median_wall_time))
-    self.report_benchmark(
-        iters=self._num_per_iter,
-        wall_time=median_wall_time,
-        name='%s_with_cols_%d' % (prefix, num_cols))
-
-  def benchmarkMapWithFloats(self):
-    self._setUp(self.FLOAT_VAL)
-    for i in range(len(self._filenames)):
-      num_cols = self._num_cols[i]
-      kwargs = {'record_defaults': [[0.0]] * num_cols}
-      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
-      dataset = dataset.map(lambda l: parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
-      self._runBenchmark(dataset, num_cols, 'csv_float_map_decode_csv')
-    self._tearDown()
-
-  def benchmarkMapWithStrings(self):
-    self._setUp(self.STR_VAL)
-    for i in range(len(self._filenames)):
-      num_cols = self._num_cols[i]
-      kwargs = {'record_defaults': [['']] * num_cols}
-      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
-      dataset = dataset.map(lambda l: parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
-      self._runBenchmark(dataset, num_cols, 'csv_strings_map_decode_csv')
-    self._tearDown()
-
-  def benchmarkCsvDatasetWithFloats(self):
-    self._setUp(self.FLOAT_VAL)
-    for i in range(len(self._filenames)):
-      num_cols = self._num_cols[i]
-      kwargs = {'record_defaults': [[0.0]] * num_cols}
-      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
-      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
-      self._runBenchmark(dataset, num_cols, 'csv_float_fused_dataset')
-    self._tearDown()
-
-  def benchmarkCsvDatasetWithStrings(self):
-    self._setUp(self.STR_VAL)
-    for i in range(len(self._filenames)):
-      num_cols = self._num_cols[i]
-      kwargs = {'record_defaults': [['']] * num_cols}
-      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
-      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
-      self._runBenchmark(dataset, num_cols, 'csv_strings_fused_dataset')
-    self._tearDown()
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
index 73be6cbcca8a204ac87cfb6ac8ae87f1d84ffa15..22e057a2848fd154de0ad356f2238fb2028cd647 100644
--- a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
@@ -24,27 +24,28 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class DenseToSparseBatchTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDataset(self):
     components = np.random.randint(12, size=(100,)).astype(np.int32)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.fill([x], x)).apply(
-            batching.dense_to_sparse_batch(4, [12]))
-        .make_initializable_iterator())
+            batching.dense_to_sparse_batch(4, [12])))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       for start in range(0, len(components), 4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual([[i, j]
                              for i, c in enumerate(components[start:start + 4])
                              for j in range(c)], results.indices)
@@ -56,23 +57,23 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase):
                             results.dense_shape)
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDatasetWithUnknownShape(self):
     components = np.random.randint(5, size=(40,)).astype(np.int32)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.fill([x, x], x)).apply(
-            batching.dense_to_sparse_batch(
-                4, [5, None])).make_initializable_iterator())
+            batching.dense_to_sparse_batch(4, [5, None])))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       for start in range(0, len(components), 4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual([[i, j, z]
                              for i, c in enumerate(components[start:start + 4])
                              for j in range(c)
@@ -89,20 +90,22 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase):
         ], results.dense_shape)
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDatasetWithInvalidShape(self):
     input_tensor = array_ops.constant([[1]])
     with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"):
-      dataset_ops.Dataset.from_tensors(input_tensor).apply(
-          batching.dense_to_sparse_batch(4, [-2])).make_initializable_iterator()
+      dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.from_tensors(input_tensor).apply(
+              batching.dense_to_sparse_batch(4, [-2])))
 
+  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDatasetShapeErrors(self):
     input_tensor = array_ops.placeholder(dtypes.int32)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensors(input_tensor).apply(
-            batching.dense_to_sparse_batch(4, [12]))
-        .make_initializable_iterator())
+            batching.dense_to_sparse_batch(4, [12])))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -111,13 +114,13 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase):
       sess.run(init_op, feed_dict={input_tensor: [[1]]})
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "incompatible with the row shape"):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Initialize with an input tensor that is larger than `row_shape`.
       sess.run(init_op, feed_dict={input_tensor: range(13)})
       with self.assertRaisesRegexp(errors.DataLossError,
                                    "larger than the row shape"):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
index 796a692c56ffb3cbd1347270ed31b3abcbef1739..214434206669299cf545d68bdc330b1a548b4710 100644
--- a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
@@ -24,11 +24,13 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     selector_dataset = dataset_ops.Dataset.range(10).repeat(100)
     input_datasets = [
@@ -36,16 +38,16 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
     ]
     dataset = interleave_ops._DirectedInterleaveDataset(selector_dataset,
                                                         input_datasets)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for _ in range(100):
         for i in range(10):
-          self.assertEqual(i, sess.run(next_element))
+          self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def _normalize(self, vec):
     return vec / vec.sum()
@@ -65,18 +67,19 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
         for i in range(num_datasets)
     ], weights)
     dataset = dataset.take(num_samples)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       freqs = np.zeros([num_datasets])
       for _ in range(num_samples):
-        freqs[sess.run(next_element)] += 1
+        freqs[self.evaluate(next_element)] += 1
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
     return freqs
 
+  @test_util.run_deprecated_v1
   def testSampleFromDatasets(self):
     random_seed.set_random_seed(1619)
     num_samples = 5000
@@ -96,20 +99,21 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
       freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples)
       self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2)
 
+  @test_util.run_deprecated_v1
   def testSelectFromDatasets(self):
     words = [b"foo", b"bar", b"baz"]
     datasets = [dataset_ops.Dataset.from_tensors(w).repeat() for w in words]
     choice_array = np.random.randint(3, size=(15,), dtype=np.int64)
     choice_dataset = dataset_ops.Dataset.from_tensor_slices(choice_array)
     dataset = interleave_ops.choose_from_datasets(datasets, choice_dataset)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in choice_array:
-        self.assertEqual(words[i], sess.run(next_element))
+        self.assertEqual(words[i], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testErrors(self):
     with self.assertRaisesRegexp(ValueError,
diff --git a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
index e54235d9f80c2dc0eaf2c30a8e5eda58310b3284..25742098f18787bc1d2e5bfd9c8717a777b8312c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
@@ -24,17 +24,20 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class EnumerateDatasetTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testEnumerateDataset(self):
     components = (["a", "b"], [1, 2], [37.0, 38])
     start = constant_op.constant(20, dtype=dtypes.int64)
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).apply(
-        enumerate_ops.enumerate_dataset(start)).make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensor_slices(components).apply(
+            enumerate_ops.enumerate_dataset(start)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -44,12 +47,12 @@ class EnumerateDatasetTest(test_base.DatasetTestBase):
                      [t.shape for t in get_next[1]])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertEqual((20, (b"a", 1, 37.0)), sess.run(get_next))
-      self.assertEqual((21, (b"b", 2, 38.0)), sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertEqual((20, (b"a", 1, 37.0)), self.evaluate(get_next))
+      self.assertEqual((21, (b"b", 2, 38.0)), self.evaluate(get_next))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
index c6ee88c676df201f022259abe7ed128db3cc2d73..357b5f1b49b9f75e187fc02a5a89907baa445a76 100644
--- a/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
@@ -47,17 +47,17 @@ class FilterBenchmark(test.Benchmark):
       if optimize_dataset:
         dataset = dataset.apply(optimization.optimize(["filter_fusion"]))
 
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
 
       with session.Session() as sess:
         for _ in range(10):
-          sess.run(next_element.op)
+          self.evaluate(next_element.op)
         deltas = []
         for _ in range(100):
           start = time.time()
           for _ in range(100):
-            sess.run(next_element.op)
+            self.evaluate(next_element.op)
           end = time.time()
           deltas.append(end - start)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py b/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
deleted file mode 100644
index 399fd284f4de1a4d25115908c61ba6325cd38ea1..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
+++ /dev/null
@@ -1,247 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the private `FunctionBufferingResource` used in prefetching."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import threading
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.data.experimental.ops import prefetching_ops
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.platform import test
-
-
-class FunctionBufferingResourceTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    self._event = threading.Event()
-
-  def _create_ds_and_iterator(self, device0, initializable=False):
-
-    def gen():
-      for i in range(1, 10):
-        yield [float(i)]
-        if i == 6:
-          self._event.set()
-
-    with ops.device(device0):
-      ds = dataset_ops.Dataset.from_generator(gen, (dtypes.float32))
-      if initializable:
-        ds_iterator = ds.make_initializable_iterator()
-      else:
-        ds_iterator = ds.make_one_shot_iterator()
-      return (ds, ds_iterator)
-
-  def _create_ops(self, ds, ds_iterator, buffer_name, device0, device1):
-    ds_iterator_handle = ds_iterator.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, ds.output_types, ds.output_shapes)
-      return remote_iterator.get_next()
-
-    target = constant_op.constant(device0)
-    with ops.device(device1):
-      buffer_resource_handle = prefetching_ops.function_buffering_resource(
-          f=_remote_fn,
-          output_types=[dtypes.float32],
-          target_device=target,
-          string_arg=ds_iterator_handle,
-          buffer_size=3,
-          shared_name=buffer_name)
-
-    with ops.device(device1):
-      prefetch_op = prefetching_ops.function_buffering_resource_get_next(
-          function_buffer_resource=buffer_resource_handle,
-          output_types=[dtypes.float32])
-      reset_op = prefetching_ops.function_buffering_resource_reset(
-          function_buffer_resource=buffer_resource_handle)
-      destroy_op = resource_variable_ops.destroy_resource_op(
-          buffer_resource_handle, ignore_lookup_error=True)
-
-    return (prefetch_op, reset_op, destroy_op)
-
-  def _prefetch_fn_helper_one_shot(self, buffer_name, device0, device1):
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-
-    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=False)
-    prefetch_op, _, destroy_op = self._create_ops(ds, ds_iterator, buffer_name,
-                                                  device0, device1)
-
-    with self.test_session(config=worker_config) as sess:
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [4.0])
-      self._event.wait()
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [5.0])
-      sess.run(destroy_op)
-
-  def testSameDeviceCPU(self):
-    self._prefetch_fn_helper_one_shot("same_device_cpu",
-                                      "/job:localhost/replica:0/task:0/cpu:0",
-                                      "/job:localhost/replica:0/task:0/cpu:0")
-
-  def testDifferentDeviceCPU(self):
-    self._prefetch_fn_helper_one_shot("diff_device_cpu",
-                                      "/job:localhost/replica:0/task:0/cpu:0",
-                                      "/job:localhost/replica:0/task:0/cpu:1")
-
-  def testDifferentDeviceCPUGPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    self._prefetch_fn_helper_one_shot("cpu_gpu",
-                                      "/job:localhost/replica:0/task:0/cpu:0",
-                                      "/job:localhost/replica:0/task:0/gpu:0")
-
-  def testReinitialization(self):
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-
-    device0 = "/job:localhost/replica:0/task:0/cpu:0"
-    device1 = "/job:localhost/replica:0/task:0/cpu:1"
-    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
-    prefetch_op, reset_op, destroy_op = self._create_ops(
-        ds, ds_iterator, "reinit", device0, device1)
-
-    with self.test_session(config=worker_config) as sess:
-      sess.run(ds_iterator.initializer)
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [4.0])
-      self._event.wait()
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [5.0])
-      # Lets reset the function buffering resource and reinitialize the
-      # iterator. Should be able to go through this again.
-      self._event.clear()
-      sess.run(reset_op)
-      sess.run(ds_iterator.initializer)
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [4.0])
-      self._event.wait()
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [5.0])
-      sess.run(destroy_op)
-
-  def testReinitializationOutOfRange(self):
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-
-    device0 = "/job:localhost/replica:0/task:0/cpu:0"
-    device1 = "/job:localhost/replica:0/task:0/cpu:1"
-    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
-    prefetch_op, reset_op, destroy_op = self._create_ops(
-        ds, ds_iterator, "reinit", device0, device1)
-
-    with self.test_session(config=worker_config) as sess:
-      sess.run(ds_iterator.initializer)
-      for i in range(1, 10):
-        elem = sess.run(prefetch_op)
-        self.assertEqual(elem, [float(i)])
-      # Try fetching after its over twice to test out end of sequence.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-
-      # Now reset everything and try it out again.
-      self._event.clear()
-      sess.run(reset_op)
-      sess.run(ds_iterator.initializer)
-      for i in range(1, 10):
-        elem = sess.run(prefetch_op)
-        self.assertEqual(elem, [float(i)])
-      # Try fetching after its over twice to test out end of sequence.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-
-      sess.run(destroy_op)
-
-  def testStringsGPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    device0 = "/job:localhost/replica:0/task:0/cpu:0"
-    device1 = "/job:localhost/replica:0/task:0/gpu:0"
-
-    ds = dataset_ops.Dataset.from_tensor_slices(["a", "b", "c"])
-    ds_iterator = ds.make_one_shot_iterator()
-    ds_iterator_handle = ds_iterator.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, ds.output_types, ds.output_shapes)
-      return remote_iterator.get_next()
-
-    target = constant_op.constant(device0)
-    with ops.device(device1):
-      buffer_resource_handle = prefetching_ops.function_buffering_resource(
-          f=_remote_fn,
-          output_types=[dtypes.string],
-          target_device=target,
-          string_arg=ds_iterator_handle,
-          buffer_size=3,
-          shared_name="strings")
-
-    with ops.device(device1):
-      prefetch_op = prefetching_ops.function_buffering_resource_get_next(
-          function_buffer_resource=buffer_resource_handle,
-          output_types=[dtypes.string])
-      destroy_op = resource_variable_ops.destroy_resource_op(
-          buffer_resource_handle, ignore_lookup_error=True)
-
-    with self.cached_session() as sess:
-      self.assertEqual([b"a"], sess.run(prefetch_op))
-      self.assertEqual([b"b"], sess.run(prefetch_op))
-      self.assertEqual([b"c"], sess.run(prefetch_op))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-
-      sess.run(destroy_op)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
index 8c07afbac57944593ba48f2116f876dbe7ab9e76..ef576563a15a7385d450e4f254e1cb579f79ce8c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -39,6 +40,7 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("MoreThanOne", 0, 2, errors.InvalidArgumentError,
        "Dataset had more than one element."),
   )
+  @test_util.run_deprecated_v1
   def testGetSingleElement(self, skip, take, error=None, error_msg=None):
     skip_t = array_ops.placeholder(dtypes.int64, shape=[])
     take_t = array_ops.placeholder(dtypes.int64, shape=[])
@@ -67,6 +69,17 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
         with self.assertRaisesRegexp(error, error_msg):
           sess.run(element, feed_dict={skip_t: skip, take_t: take})
 
+  def testWindow(self):
+    """Test that `get_single_element()` can consume a nested dataset."""
+    def flat_map_func(ds):
+      batched = ds.batch(2)
+      element = get_single_element.get_single_element(batched)
+      return dataset_ops.Dataset.from_tensors(element)
+
+    dataset = dataset_ops.Dataset.range(10).window(2).flat_map(flat_map_func)
+    self.assertDatasetProduces(
+        dataset, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
index 9030328593181c15981c889cd7b0c0dc370f060d..8507df3d3a27ea62c9d866c94af589fbc566317e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -36,14 +37,15 @@ class GroupByReducerTest(test_base.DatasetTestBase):
 
   def checkResults(self, dataset, shapes, values):
     self.assertEqual(shapes, dataset.output_shapes)
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
     with self.cached_session() as sess:
       for expected in values:
-        got = sess.run(get_next)
+        got = self.evaluate(get_next)
         self.assertEqual(got, expected)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testSum(self):
     reducer = grouping.Reducer(
         init_func=lambda _: np.int64(0),
@@ -55,6 +57,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       self.checkResults(
           dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
 
+  @test_util.run_deprecated_v1
   def testAverage(self):
 
     def reduce_fn(x, y):
@@ -72,6 +75,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       self.checkResults(
           dataset, shapes=tensor_shape.scalar(), values=[i - 1, i])
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray)
     reducer = grouping.Reducer(
@@ -88,6 +92,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
           shapes=tensor_shape.scalar(),
           values=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]])
 
+  @test_util.run_deprecated_v1
   def testSparseSum(self):
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
@@ -105,6 +110,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       self.checkResults(
           dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
 
+  @test_util.run_deprecated_v1
   def testChangingStateShape(self):
 
     def reduce_fn(x, _):
@@ -124,14 +130,14 @@ class GroupByReducerTest(test_base.DatasetTestBase):
           grouping.group_by_reducer(lambda x: x, reducer))
       self.assertEqual([None], dataset.output_shapes[0].as_list())
       self.assertIs(None, dataset.output_shapes[1].ndims)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       get_next = iterator.get_next()
       with self.cached_session() as sess:
-        x, y = sess.run(get_next)
+        x, y = self.evaluate(get_next)
         self.assertAllEqual([0] * (2**i), x)
         self.assertAllEqual(np.array(1, ndmin=i), y)
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
 
   def testTypeMismatch(self):
     reducer = grouping.Reducer(
@@ -188,9 +194,9 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.zip(
         (dataset_ops.Dataset.range(10), dataset_ops.Dataset.range(10))).apply(
             grouping.group_by_reducer(lambda x, y: np.int64(0), reducer))
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
     with self.cached_session() as sess:
-      x, y = sess.run(get_next)
+      x, y = self.evaluate(get_next)
       self.assertAllEqual(x, np.asarray([x for x in range(10)]))
       self.assertEqual(y, 45)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
index 557d56e8b9a60ec4cd4fb248dd6dfeb1c2ed4589..cbb79e55f507a41c0522163dc0b68c56835891a6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
@@ -49,6 +50,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
              32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape(
                  [None]), tensor_shape.TensorShape([3])))))
 
+  @test_util.run_deprecated_v1
   def testSingleBucket(self):
 
     def _map_fn(v):
@@ -63,14 +65,14 @@ class GroupByWindowTest(test_base.DatasetTestBase):
             lambda x, y, z: 0,
             lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
 
-    iterator = bucketed_dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(bucketed_dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
-      which_bucket, bucketed_values = sess.run(get_next)
+      which_bucket, bucketed_values = self.evaluate(get_next)
 
       self.assertEqual(0, which_bucket)
 
@@ -84,6 +86,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
       self.assertAllEqual(expected_unk_int64, bucketed_values[1])
       self.assertAllEqual(expected_vec3_str, bucketed_values[2])
 
+  @test_util.run_deprecated_v1
   def testEvenOddBuckets(self):
 
     def _map_fn(v):
@@ -98,16 +101,16 @@ class GroupByWindowTest(test_base.DatasetTestBase):
             lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
             lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
 
-    iterator = bucketed_dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(bucketed_dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       # Get two minibatches (one containing even values, one containing odds)
-      which_bucket_even, bucketed_values_even = sess.run(get_next)
-      which_bucket_odd, bucketed_values_odd = sess.run(get_next)
+      which_bucket_even, bucketed_values_even = self.evaluate(get_next)
+      which_bucket_odd, bucketed_values_odd = self.evaluate(get_next)
 
       # Count number of bucket_tensors.
       self.assertEqual(3, len(bucketed_values_even))
@@ -141,6 +144,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
       self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
       self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
 
+  @test_util.run_deprecated_v1
   def testEvenOddBucketsFilterOutAllOdd(self):
 
     def _map_fn(v):
@@ -169,16 +173,16 @@ class GroupByWindowTest(test_base.DatasetTestBase):
             lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
             lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32))
 
-    iterator = bucketed_dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(bucketed_dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       # Get two minibatches ([0, 2, ...] and [64, 66, ...])
-      which_bucket0, bucketed_values_even0 = sess.run(get_next)
-      which_bucket1, bucketed_values_even1 = sess.run(get_next)
+      which_bucket0, bucketed_values_even0 = self.evaluate(get_next)
+      which_bucket1, bucketed_values_even1 = self.evaluate(get_next)
 
       # Ensure that bucket 1 was completely filtered out
       self.assertAllEqual(0, which_bucket0)
@@ -188,6 +192,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
       self.assertAllEqual(
           np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
 
+  @test_util.run_deprecated_v1
   def testDynamicWindowSize(self):
     components = np.arange(100).astype(np.int64)
 
@@ -202,16 +207,16 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
         grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(20),
                                  None, window_size_func))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaises(errors.OutOfRangeError):
         batches = 0
         while True:
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           is_even = all(x % 2 == 0 for x in result)
           is_odd = all(x % 2 == 1 for x in result)
           self.assertTrue(is_even or is_odd)
@@ -221,22 +226,23 @@ class GroupByWindowTest(test_base.DatasetTestBase):
 
       self.assertEqual(batches, 15)
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
         .apply(
             grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
+                                     4)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       counts = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           self.assertTrue(
               all(x % 2 == 0
                   for x in result) or all(x % 2 == 1)
@@ -248,61 +254,64 @@ class GroupByWindowTest(test_base.DatasetTestBase):
       self.assertGreaterEqual(num_full_batches, 24)
       self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
 
+  @test_util.run_deprecated_v1
   def testImmediateOutput(self):
     components = np.array(
         [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
             grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
+                                     4)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       # The input is infinite, so this test demonstrates that:
       # 1. We produce output without having to consume the entire input,
       # 2. Different buckets can produce output at different rates, and
       # 3. For deterministic input, the output is deterministic.
       for _ in range(3):
-        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
-        self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
-        self.assertAllEqual([2, 2, 2, 2], sess.run(get_next))
-        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
+        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
+        self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
+        self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next))
+        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
 
+  @test_util.run_deprecated_v1
   def testSmallGroups(self):
     components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).apply(
             grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
+                                     4)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
-      self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
+      self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
       # The small outputs at the end are deterministically produced in key
       # order.
-      self.assertAllEqual([0, 0, 0], sess.run(get_next))
-      self.assertAllEqual([1], sess.run(get_next))
+      self.assertAllEqual([0, 0, 0], self.evaluate(get_next))
+      self.assertAllEqual([1], self.evaluate(get_next))
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(4).apply(
-            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
-        .make_initializable_iterator())
+            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           "Window size must be greater than zero, but got 0."):
-        print(sess.run(get_next))
+        print(self.evaluate(get_next))
 
+  @test_util.run_deprecated_v1
   def testReduceFuncError(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
 
@@ -314,19 +323,19 @@ class GroupByWindowTest(test_base.DatasetTestBase):
           padded_shapes=(tensor_shape.TensorShape([]),
                          constant_op.constant([5], dtype=dtypes.int64) * -1))
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
-            grouping.group_by_window(lambda x, _: x % 2, reduce_func,
-                                     32)).make_initializable_iterator())
+            grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testConsumeWindowDatasetMoreThanOnce(self):
     components = np.random.randint(50, size=(200,)).astype(np.int64)
 
@@ -340,22 +349,21 @@ class GroupByWindowTest(test_base.DatasetTestBase):
               4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
       ))
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x))
         .apply(grouping.group_by_window(
             lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64),
-            reduce_func, 4))
-        .make_initializable_iterator())
+            reduce_func, 4)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       counts = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
-          tight_result, multiple_of_10_result = sess.run(get_next)
+          tight_result, multiple_of_10_result = self.evaluate(get_next)
           self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
           self.assertAllEqual(tight_result,
                               multiple_of_10_result[:, :tight_result.shape[1]])
diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
index c0ec1486ab8d49e8f1fc3a6ac98fe32cefba605b..81f580fccbd6b0053eaa865408b4f8c5f95ba94f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import test
@@ -35,6 +36,7 @@ _NUMPY_RANDOM_SEED = 42
 
 class IgnoreErrorsTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
@@ -42,17 +44,18 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.check_numerics(x, "message")).apply(
             error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, sess.run(get_next))
+        self.assertEqual(x, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testParallelMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
@@ -60,17 +63,18 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(components).map(
             lambda x: array_ops.check_numerics(x, "message"),
             num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, sess.run(get_next))
+        self.assertEqual(x, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testReadFileIgnoreError(self):
 
     def write_string_to_file(value, filename):
@@ -87,28 +91,28 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(filenames).map(
             io_ops.read_file,
             num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
       # All of the files are present.
-      sess.run(init_op)
+      self.evaluate(init_op)
       for filename in filenames:
-        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Delete one of the files.
       os.remove(filenames[0])
 
       # Attempting to read filenames[0] will fail, but ignore_errors()
       # will catch the error.
-      sess.run(init_op)
+      self.evaluate(init_op)
       for filename in filenames[1:]:
-        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
index c93a8353ce01063f52ecc68253df7d02a7689603..c3c4ccd07708d2c7cfdc57c2a6fcbf320f1dfb36 100644
--- a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.platform import test
@@ -31,6 +32,7 @@ from tensorflow.python.platform import test
 
 class IndexedDatasetOpsTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testLowLevelIndexedDatasetOps(self):
     identity = ged_ops.experimental_identity_indexed_dataset(
         ops.convert_to_tensor(16, dtype=dtypes.uint64))
@@ -46,14 +48,15 @@ class IndexedDatasetOpsTest(test_base.DatasetTestBase):
         handle, index, output_types=[dtypes.uint64], output_shapes=[[]])
 
     with self.cached_session() as sess:
-      sess.run(materialize)
+      self.evaluate(materialize)
       self.assertEqual([3], sess.run(get_op, feed_dict={index: 3}))
 
+  @test_util.run_deprecated_v1
   def testIdentityIndexedDataset(self):
     ds = indexed_dataset_ops.IdentityIndexedDataset(16)
     materialized = ds.materialize()
     with self.cached_session() as sess:
-      sess.run(materialized.initializer)
+      self.evaluate(materialized.initializer)
       placeholder = array_ops.placeholder(dtypes.uint64, shape=[])
       for i in range(16):
         output = sess.run(
@@ -68,12 +71,13 @@ class IndexedDatasetOpsTest(test_base.DatasetTestBase):
     itr = ds.make_initializable_iterator()
     n = itr.get_next()
     with self.cached_session() as sess:
-      sess.run(itr.initializer)
+      self.evaluate(itr.initializer)
       for i in range(16):
-        output = sess.run(n)
+        output = self.evaluate(n)
         self.assertEqual(i, output)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(n)
+        self.evaluate(n)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
index 5ee94e14dcdd77ad4317d5fee022975bb74b9f39..7c78810494866cbd4cac4201d23182e083037e1c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
@@ -20,11 +20,15 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
+from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 
@@ -38,11 +42,12 @@ class MakeBatchedFeaturesDatasetTest(
         with ops.Graph().as_default() as g:
           with self.session(graph=g) as sess:
             # Basic test: read from file 0.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames[0],
-                label_key="label",
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.outputs = dataset_ops.make_one_shot_iterator(
+                self.make_batch_feature(
+                    filenames=self.test_filenames[0],
+                    label_key="label",
+                    num_epochs=num_epochs,
+                    batch_size=batch_size)).get_next()
             self.verify_records(
                 sess,
                 batch_size,
@@ -55,11 +60,12 @@ class MakeBatchedFeaturesDatasetTest(
         with ops.Graph().as_default() as g:
           with self.session(graph=g) as sess:
             # Basic test: read from file 1.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames[1],
-                label_key="label",
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.outputs = dataset_ops.make_one_shot_iterator(
+                self.make_batch_feature(
+                    filenames=self.test_filenames[1],
+                    label_key="label",
+                    num_epochs=num_epochs,
+                    batch_size=batch_size)).get_next()
             self.verify_records(
                 sess,
                 batch_size,
@@ -72,11 +78,12 @@ class MakeBatchedFeaturesDatasetTest(
         with ops.Graph().as_default() as g:
           with self.session(graph=g) as sess:
             # Basic test: read from both files.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames,
-                label_key="label",
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.outputs = dataset_ops.make_one_shot_iterator(
+                self.make_batch_feature(
+                    filenames=self.test_filenames,
+                    label_key="label",
+                    num_epochs=num_epochs,
+                    batch_size=batch_size)).get_next()
             self.verify_records(
                 sess,
                 batch_size,
@@ -88,14 +95,16 @@ class MakeBatchedFeaturesDatasetTest(
         with ops.Graph().as_default() as g:
           with self.session(graph=g) as sess:
             # Basic test: read from both files.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames,
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.outputs = dataset_ops.make_one_shot_iterator(
+                self.make_batch_feature(
+                    filenames=self.test_filenames,
+                    num_epochs=num_epochs,
+                    batch_size=batch_size)).get_next()
             self.verify_records(sess, batch_size, num_epochs=num_epochs)
             with self.assertRaises(errors.OutOfRangeError):
               self._next_actual_batch(sess)
 
+  @test_util.run_deprecated_v1
   def testReadWithEquivalentDataset(self):
     features = {
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
@@ -105,19 +114,19 @@ class MakeBatchedFeaturesDatasetTest(
         core_readers.TFRecordDataset(self.test_filenames)
         .map(lambda x: parsing_ops.parse_single_example(x, features))
         .repeat(10).batch(2))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for file_batch, _, _, _, record_batch, _ in self._next_expected_batch(
           range(self._num_files), 2, 10):
-        actual_batch = sess.run(next_element)
+        actual_batch = self.evaluate(next_element)
         self.assertAllEqual(file_batch, actual_batch["file"])
         self.assertAllEqual(record_batch, actual_batch["record"])
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testReadWithFusedShuffleRepeatDataset(self):
     num_epochs = 5
@@ -126,18 +135,18 @@ class MakeBatchedFeaturesDatasetTest(
       # Test that shuffling with same seed produces the same result.
       with ops.Graph().as_default() as g:
         with self.session(graph=g) as sess:
-          outputs1 = self.make_batch_feature(
+          outputs1 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          outputs2 = self.make_batch_feature(
+              shuffle_seed=5)).get_next()
+          outputs2 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
+              shuffle_seed=5)).get_next()
           for _ in range(total_records // batch_size):
             batch1 = self._run_actual_batch(outputs1, sess)
             batch2 = self._run_actual_batch(outputs2, sess)
@@ -147,18 +156,18 @@ class MakeBatchedFeaturesDatasetTest(
       # Test that shuffling with different seeds produces a different order.
       with ops.Graph().as_default() as g:
         with self.session(graph=g) as sess:
-          outputs1 = self.make_batch_feature(
+          outputs1 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          outputs2 = self.make_batch_feature(
+              shuffle_seed=5)).get_next()
+          outputs2 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=15).make_one_shot_iterator().get_next()
+              shuffle_seed=15)).get_next()
           all_equal = True
           for _ in range(total_records // batch_size):
             batch1 = self._run_actual_batch(outputs1, sess)
@@ -174,14 +183,14 @@ class MakeBatchedFeaturesDatasetTest(
         for parser_num_threads in [2, 4]:
           with ops.Graph().as_default() as g:
             with self.session(graph=g) as sess:
-              self.outputs = self.make_batch_feature(
-                  filenames=self.test_filenames,
-                  label_key="label",
-                  num_epochs=num_epochs,
-                  batch_size=batch_size,
-                  reader_num_threads=reader_num_threads,
-                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
-                  ).get_next()
+              self.outputs = dataset_ops.make_one_shot_iterator(
+                  self.make_batch_feature(
+                      filenames=self.test_filenames,
+                      label_key="label",
+                      num_epochs=num_epochs,
+                      batch_size=batch_size,
+                      reader_num_threads=reader_num_threads,
+                      parser_num_threads=parser_num_threads)).get_next()
               self.verify_records(
                   sess,
                   batch_size,
@@ -193,13 +202,13 @@ class MakeBatchedFeaturesDatasetTest(
 
           with ops.Graph().as_default() as g:
             with self.session(graph=g) as sess:
-              self.outputs = self.make_batch_feature(
-                  filenames=self.test_filenames,
-                  num_epochs=num_epochs,
-                  batch_size=batch_size,
-                  reader_num_threads=reader_num_threads,
-                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
-                  ).get_next()
+              self.outputs = dataset_ops.make_one_shot_iterator(
+                  self.make_batch_feature(
+                      filenames=self.test_filenames,
+                      num_epochs=num_epochs,
+                      batch_size=batch_size,
+                      reader_num_threads=reader_num_threads,
+                      parser_num_threads=parser_num_threads)).get_next()
               self.verify_records(
                   sess,
                   batch_size,
@@ -213,12 +222,12 @@ class MakeBatchedFeaturesDatasetTest(
       for num_epochs in [1, 10]:
         with ops.Graph().as_default():
           # Basic test: read from file 0.
-          outputs = self.make_batch_feature(
+          outputs = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               label_key="label",
               num_epochs=num_epochs,
               batch_size=batch_size,
-              drop_final_batch=True).make_one_shot_iterator().get_next()
+              drop_final_batch=True)).get_next()
           for tensor in nest.flatten(outputs):
             if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
               self.assertEqual(tensor.shape[0], batch_size)
@@ -234,6 +243,20 @@ class MakeBatchedFeaturesDatasetTest(
       if issubclass(clazz, ops.Tensor):
         self.assertEqual(32, shape[0])
 
+  def testOldStyleReader(self):
+    with self.assertRaisesRegexp(
+        TypeError, r"The `reader` argument must return a `Dataset` object. "
+        r"`tf.ReaderBase` subclasses are not supported."):
+      _ = readers.make_batched_features_dataset(
+          file_pattern=self.test_filenames[0], batch_size=32,
+          features={
+              "file": parsing_ops.FixedLenFeature([], dtypes.int64),
+              "record": parsing_ops.FixedLenFeature([], dtypes.int64),
+              "keywords": parsing_ops.VarLenFeature(dtypes.string),
+              "label": parsing_ops.FixedLenFeature([], dtypes.string),
+          },
+          reader=io_ops.TFRecordReader)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
index e4bf08918420b7b63fbb0d3a0ae56c7395ff9e97..3b7b335e7066175fba6ef190b977362bc461ca1d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
@@ -29,10 +29,11 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
   def _make_csv_dataset(self, filenames, batch_size, num_epochs=1, **kwargs):
@@ -74,7 +75,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
   def _verify_output(
       self,
-      sess,
       dataset,
       batch_size,
       num_epochs,
@@ -82,7 +82,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
       expected_output,
       expected_keys,
   ):
-    nxt = dataset.make_one_shot_iterator().get_next()
+    get_next = self.getNext(dataset)
 
     for expected_features in self._next_expected_batch(
         expected_output,
@@ -90,7 +90,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         batch_size,
         num_epochs,
     ):
-      actual_features = sess.run(nxt)
+      actual_features = self.evaluate(get_next())
 
       if label_name is not None:
         expected_labels = expected_features.pop(label_name)
@@ -102,7 +102,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         self.assertAllEqual(expected_features[k], actual_features[k])
 
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(nxt)
+      self.evaluate(get_next())
 
   def _test_dataset(self,
                     inputs,
@@ -116,16 +116,14 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
     # Convert str type because py3 tf strings are bytestrings
     filenames = self._setup_files(
         inputs, compression_type=kwargs.get("compression_type", None))
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g) as sess:
-        dataset = self._make_csv_dataset(
-            filenames,
-            batch_size=batch_size,
-            num_epochs=num_epochs,
-            label_name=label_name,
-            **kwargs)
-        self._verify_output(sess, dataset, batch_size, num_epochs, label_name,
-                            expected_output, expected_keys)
+    dataset = self._make_csv_dataset(
+        filenames,
+        batch_size=batch_size,
+        num_epochs=num_epochs,
+        label_name=label_name,
+        **kwargs)
+    self._verify_output(dataset, batch_size, num_epochs, label_name,
+                        expected_output, expected_keys)
 
   def testMakeCSVDataset(self):
     """Tests making a CSV dataset with keys and defaults provided."""
@@ -581,69 +579,65 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
     total_records = 20
     for batch_size in [1, 2]:
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          # Test that shuffling with the same seed produces the same result
-          dataset1 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=5,
-              num_epochs=2,
-          )
-          dataset2 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=5,
-              num_epochs=2,
-          )
-          outputs1 = dataset1.make_one_shot_iterator().get_next()
-          outputs2 = dataset2.make_one_shot_iterator().get_next()
-          for _ in range(total_records // batch_size):
-            batch1 = nest.flatten(sess.run(outputs1))
-            batch2 = nest.flatten(sess.run(outputs2))
-            for i in range(len(batch1)):
-              self.assertAllEqual(batch1[i], batch2[i])
-
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          # Test that shuffling with a different seed produces different results
-          dataset1 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=5,
-              num_epochs=2,
-          )
-          dataset2 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=6,
-              num_epochs=2,
-          )
-          outputs1 = dataset1.make_one_shot_iterator().get_next()
-          outputs2 = dataset2.make_one_shot_iterator().get_next()
-          all_equal = False
-          for _ in range(total_records // batch_size):
-            batch1 = nest.flatten(sess.run(outputs1))
-            batch2 = nest.flatten(sess.run(outputs2))
-            for i in range(len(batch1)):
-              all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
-          self.assertFalse(all_equal)
+      # Test that shuffling with the same seed produces the same result
+      dataset1 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=5,
+          num_epochs=2,
+      )
+      dataset2 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=5,
+          num_epochs=2,
+      )
+      next1 = self.getNext(dataset1)
+      next2 = self.getNext(dataset2)
+      for _ in range(total_records // batch_size):
+        batch1 = nest.flatten(self.evaluate(next1()))
+        batch2 = nest.flatten(self.evaluate(next2()))
+        for i in range(len(batch1)):
+          self.assertAllEqual(batch1[i], batch2[i])
+
+      # Test that shuffling with a different seed produces different results
+      dataset1 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=5,
+          num_epochs=2,
+      )
+      dataset2 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=6,
+          num_epochs=2,
+      )
+      next1 = self.getNext(dataset1)
+      next2 = self.getNext(dataset2)
+      all_equal = False
+      for _ in range(total_records // batch_size):
+        batch1 = nest.flatten(self.evaluate(next1()))
+        batch2 = nest.flatten(self.evaluate(next2()))
+        for i in range(len(batch1)):
+          all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
+      self.assertFalse(all_equal)
 
   def testIndefiniteRepeatShapeInference(self):
     column_names = ["col%d" % i for i in range(5)]
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
index 657cf3c00ee899a9a5718d808ba3d7ee2454bf6b..ab2feb642629eef098162ca445f54e84fc0389a9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -105,7 +106,7 @@ class MakeTFRecordDatasetTest(
     for expected_batch in self._next_expected_batch(
         file_indices, batch_size, num_epochs, interleave_cycle_length,
         drop_final_batch, use_parser_fn):
-      actual_batch = sess.run(outputs)
+      actual_batch = self.evaluate(outputs)
       self.assertAllEqual(expected_batch, actual_batch)
 
   def _read_test(self, batch_size, num_epochs, file_index=None,
@@ -122,20 +123,21 @@ class MakeTFRecordDatasetTest(
 
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
-        outputs = readers.make_tf_record_dataset(
-            file_pattern=file_pattern,
-            num_epochs=num_epochs,
-            batch_size=batch_size,
-            parser_fn=fn,
-            num_parallel_reads=num_parallel_reads,
-            drop_final_batch=drop_final_batch,
-            shuffle=False).make_one_shot_iterator().get_next()
+        outputs = dataset_ops.make_one_shot_iterator(
+            readers.make_tf_record_dataset(
+                file_pattern=file_pattern,
+                num_epochs=num_epochs,
+                batch_size=batch_size,
+                parser_fn=fn,
+                num_parallel_reads=num_parallel_reads,
+                drop_final_batch=drop_final_batch,
+                shuffle=False)).get_next()
         self._verify_records(
             sess, outputs, batch_size, file_index, num_epochs=num_epochs,
             interleave_cycle_length=num_parallel_reads,
             drop_final_batch=drop_final_batch, use_parser_fn=parser_fn)
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(outputs)
+          self.evaluate(outputs)
 
   def testRead(self):
     for batch_size in [1, 2]:
@@ -185,22 +187,22 @@ class MakeTFRecordDatasetTest(
             num_parallel_reads=num_parallel_reads,
             shuffle=True,
             shuffle_seed=seed)
-        iterator = dataset.make_initializable_iterator()
+        iterator = dataset_ops.make_initializable_iterator(dataset)
         next_element = iterator.get_next()
 
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         first_batches = []
         try:
           while True:
-            first_batches.append(sess.run(next_element))
+            first_batches.append(self.evaluate(next_element))
         except errors.OutOfRangeError:
           pass
 
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         second_batches = []
         try:
           while True:
-            second_batches.append(sess.run(next_element))
+            second_batches.append(self.evaluate(next_element))
         except errors.OutOfRangeError:
           pass
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index afd0fc3abfa4421d27346df58284cadc4e059f85..5c115f7ae311ddabef1ff6d7279d724bb1e18f85 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -29,7 +29,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
@@ -38,12 +40,18 @@ from tensorflow.python.platform import test
 class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
-      ("Default", None, None),
-      ("SequentialCalls", 1, None),
-      ("ParallelCalls", 2, None),
-      ("ParallelBatches", None, 10),
+      ("Default", None, None, False),
+      ("SequentialCalls", 1, None, False),
+      ("ParallelCalls", 2, None, False),
+      ("ParallelBatches", None, 10, False),
+      ("DefaultNUMA", None, None, True),
+      ("SequentialCallsNUMA", 1, None, True),
+      ("ParallelCallsNUMA", 2, None, True),
+      ("ParallelBatchesNUMA", None, 10, True),
   )
-  def testMapAndBatch(self, num_parallel_calls, num_parallel_batches):
+  @test_util.run_deprecated_v1
+  def testMapAndBatch(self, num_parallel_calls, num_parallel_batches,
+                      numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
     # The pipeline is TensorSliceDataset ->
     # RepeatDataset(count) -> MapAndBatchDataset(square_3, batch_size).
@@ -57,14 +65,20 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (
+    dataset = (
         dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
             batching.map_and_batch(
                 map_func=_map_fn,
                 batch_size=batch_size,
                 num_parallel_calls=num_parallel_calls,
-                num_parallel_batches=num_parallel_batches))
-        .make_initializable_iterator())
+                num_parallel_batches=num_parallel_batches)))
+
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -77,13 +91,13 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op, feed_dict={count: 28, batch_size: 14})
       num_batches = (28 * 7) // 14
       for i in range(num_batches):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(14):
             self.assertAllEqual(component[(i * 14 + j) % 7]**2,
                                 result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Batch of a finite input, where the batch_size does not
       # divide the total number of elements.
@@ -92,119 +106,171 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       # We expect (num_batches - 1) full-sized batches.
       num_batches = int(math.ceil((14 * 7) / 8))
       for i in range(num_batches - 1):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(8):
             self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
-      result = sess.run(get_next)
+      result = self.evaluate(get_next)
       for component, result_component in zip(components, result):
         for j in range((14 * 7) % 8):
           self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
                               result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Batch of an empty input should fail straight away.
       sess.run(init_op, feed_dict={count: 0, batch_size: 8})
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Empty batch should be an initialization time error.
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
   @parameterized.named_parameters(
-      ("Even", False),
-      ("Uneven", True),
+      ("Even", False, False),
+      ("Uneven", True, False),
+      ("EvenNUMA", False, True),
+      ("UnevenNUMA", True, True),
   )
-  def testMapAndBatchPartialBatch(self, drop_remainder):
-    iterator = (
+  @test_util.run_deprecated_v1
+  def testMapAndBatchPartialBatch(self, drop_remainder, numa_aware):
+    dataset = (
         dataset_ops.Dataset.range(10).apply(
             batching.map_and_batch(
                 lambda x: array_ops.reshape(x * x, [1]),
                 batch_size=4,
-                drop_remainder=drop_remainder)).make_one_shot_iterator())
+                drop_remainder=drop_remainder)))
+
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+
     if drop_remainder:
       self.assertEqual([4, 1], iterator.output_shapes.as_list())
     else:
       self.assertEqual([None, 1], iterator.output_shapes.as_list())
     next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+    with self.cached_session():
+      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
       if not drop_remainder:
-        self.assertAllEqual([[64], [81]], sess.run(next_element))
+        self.assertAllEqual([[64], [81]], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
+
+  @parameterized.named_parameters(
+      ("Normal", False),
+      ("NUMA", True),
+  )
+  @test_util.run_deprecated_v1
+  def testMapAndBatchYieldsPartialBatch(self, numa_aware):
+    dataset = (
+        dataset_ops.Dataset.range(10).apply(
+            batching.map_and_batch(lambda x: array_ops.reshape(x * x, [1]), 4)))
 
-  def testMapAndBatchYieldsPartialBatch(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .apply(batching.map_and_batch(
-                    lambda x: array_ops.reshape(x * x, [1]), 4))
-                .make_one_shot_iterator())
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     self.assertEqual([None, 1], iterator.output_shapes.as_list())
     next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
-      self.assertAllEqual([[64], [81]], sess.run(next_element))
+    with self.cached_session():
+      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
+      self.assertAllEqual([[64], [81]], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
+
+  @parameterized.named_parameters(
+      ("Normal", False),
+      ("NUMA", True),
+  )
+  @test_util.run_deprecated_v1
+  def testMapAndBatchParallelGetNext(self, numa_aware):
+    dataset = dataset_ops.Dataset.range(50000).apply(
+        batching.map_and_batch(lambda x: x, batch_size=100))
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
-  def testMapAndBatchParallelGetNext(self):
-    iterator = (dataset_ops.Dataset.range(50000)
-                .apply(batching.map_and_batch(lambda x: x, batch_size=100))
-                .make_one_shot_iterator())
     elements = []
     for _ in range(100):
       elements.append(iterator.get_next())
-    with self.cached_session() as sess:
+    with self.cached_session():
       for i in range(5):
-        got = sess.run(elements)
+        got = self.evaluate(elements)
         got.sort(key=lambda x: x[0])
         expected = []
         for j in range(100):
-          expected.append(range(i*10000+j*100, i*10000+(j+1)*100))
+          expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
         self.assertAllEqual(got, expected)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elements)
+        self.evaluate(elements)
+
+  @parameterized.named_parameters(
+      ("Normal", False),
+      ("NUMA", True),
+  )
+  @test_util.run_deprecated_v1
+  def testMapAndBatchParallelGetNextDropRemainder(self, numa_aware):
+    dataset = dataset_ops.Dataset.range(49999).apply(
+        batching.map_and_batch(
+            lambda x: x, batch_size=100, drop_remainder=True))
+
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
-  def testMapAndBatchParallelGetNextDropRemainder(self):
-    iterator = (
-        dataset_ops.Dataset.range(49999).apply(
-            batching.map_and_batch(
-                lambda x: x, batch_size=100, drop_remainder=True))
-        .make_one_shot_iterator())
     elements = []
     for _ in range(100):
       elements.append(iterator.get_next())
-    with self.cached_session() as sess:
+    with self.cached_session():
       for i in range(4):
-        got = sess.run(elements)
+        got = self.evaluate(elements)
         got.sort(key=lambda x: x[0])
         expected = []
         for j in range(100):
-          expected.append(range(i*10000+j*100, i*10000+(j+1)*100))
+          expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
         self.assertAllEqual(got, expected)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elements)
+        self.evaluate(elements)
 
-  def testMapAndBatchSparse(self):
+  @parameterized.named_parameters(
+      ("Normal", False),
+      ("NUMA", True),
+  )
+  @test_util.run_deprecated_v1
+  def testMapAndBatchSparse(self, numa_aware):
 
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
           indices=[[0]], values=(i * [1]), dense_shape=[1])
 
-    iterator = dataset_ops.Dataset.range(10).apply(
-        batching.map_and_batch(_sparse, 5)).make_initializable_iterator()
+    dataset = dataset_ops.Dataset.range(10).apply(
+        batching.map_and_batch(_sparse, 5))
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
+    with self.cached_session():
+      self.evaluate(init_op)
       for i in range(2):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         expected = sparse_tensor.SparseTensorValue(
             indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
             values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
@@ -212,23 +278,37 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
         self.assertTrue(sparse_tensor.is_sparse(actual))
         self.assertSparseValuesEqual(actual, expected)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
-  def testMapAndBatchFails(self):
+  @parameterized.named_parameters(
+      ("Normal", False),
+      ("NUMA", True),
+  )
+  @test_util.run_deprecated_v1
+  def testMapAndBatchFails(self, numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
     dataset = dataset_ops.Dataset.from_tensors(
         array_ops.check_numerics(
             constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
     batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
-        .make_initializable_iterator())
+    dataset = dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+
     init_op = iterator.initializer
     with self.cached_session() as sess:
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
         sess.run(init_op, feed_dict={batch_size: 14})
 
-  def testMapAndBatchShapeMismatch(self):
+  @parameterized.named_parameters(
+      ("Normal", False),
+      ("NUMA", True),
+  )
+  @test_util.run_deprecated_v1
+  def testMapAndBatchShapeMismatch(self, numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
 
     def generator():
@@ -240,18 +320,26 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_generator(
         generator, output_types=dtypes.int32)
     batch_size = 4
-    iterator = (
-        dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
-        .make_initializable_iterator())
+    dataset = dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+
     init_op = iterator.initializer
     get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
+    with self.cached_session():
+      self.evaluate(init_op)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "number of elements does not match"):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
-  def testMapAndBatchImplicitDispose(self):
+  @parameterized.named_parameters(
+      ("Normal", False),
+      ("NUMA", True),
+  )
+  def testMapAndBatchImplicitDispose(self, numa_aware):
     # Tests whether a map and batch dataset will be cleaned up correctly when
     # the pipeline does not run it until exhaustion.
     # The pipeline is TensorSliceDataset -> RepeatDataset(1000) ->
@@ -266,22 +354,33 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
         1000).apply(batching.map_and_batch(_map_fn, batch_size=100))
     dataset = dataset.prefetch(5)
-    iterator = dataset.make_one_shot_iterator()
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       for _ in range(3):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   @parameterized.named_parameters(
-      ("1", 0),
-      ("2", 5),
-      ("3", 10),
-      ("4", 90),
-      ("5", 95),
-      ("6", 99),
+      ("1", 0, False),
+      ("2", 5, False),
+      ("3", 10, False),
+      ("4", 90, False),
+      ("5", 95, False),
+      ("6", 99, False),
+      ("1NUMA", 0, True),
+      ("2NUMA", 5, True),
+      ("3NUMA", 10, True),
+      ("4NUMA", 90, True),
+      ("5NUMA", 95, True),
+      ("6NUMA", 99, True),
   )
-  def testMapAndBatchOutOfRangeError(self, threshold):
+  @test_util.run_deprecated_v1
+  def testMapAndBatchMapError(self, threshold, numa_aware):
 
     def raising_py_fn(i):
       if i >= threshold:
@@ -289,48 +388,143 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       else:
         return i
 
-    iterator = (
-        dataset_ops.Dataset.range(100).apply(
-            batching.map_and_batch(
-                lambda x: script_ops.py_func(raising_py_fn, [x], dtypes.int64),
-                batch_size=10)).make_one_shot_iterator())
+    dataset = dataset_ops.Dataset.range(100).apply(
+        batching.map_and_batch(
+            lambda x: script_ops.py_func(raising_py_fn, [x], dtypes.int64),
+            batch_size=10))
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       for i in range(threshold // 10):
-        self.assertAllEqual([i * 10 + j for j in range(10)], sess.run(get_next))
-      if threshold % 10 != 0:
-        self.assertAllEqual(
-            [threshold // 10 * 10 + j for j in range(threshold % 10)],
-            sess.run(get_next))
+        self.assertAllEqual([i * 10 + j for j in range(10)],
+                            self.evaluate(get_next))
+      if numa_aware:
+        if threshold % 10 != 0:
+          self.assertAllEqual(
+              [threshold // 10 * 10 + j for j in range(threshold % 10)],
+              self.evaluate(get_next))
+      else:
+        for i in range(threshold // 10, 10):
+          with self.assertRaises(errors.InvalidArgumentError):
+            self.evaluate(get_next)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   @parameterized.named_parameters(
-      ("1", False, dtypes.bool),
-      ("2", -42, dtypes.int8),
-      ("3", -42, dtypes.int16),
-      ("4", -42, dtypes.int32),
-      ("5", -42, dtypes.int64),
-      ("6", 42, dtypes.uint8),
-      ("7", 42, dtypes.uint16),
-      ("8", 42.0, dtypes.float16),
-      ("9", 42.0, dtypes.float32),
-      ("10", 42.0, dtypes.float64),
-      ("11", b"hello", dtypes.string),
+      ("1", False, dtypes.bool, False),
+      ("2", -42, dtypes.int8, False),
+      ("3", -42, dtypes.int16, False),
+      ("4", -42, dtypes.int32, False),
+      ("5", -42, dtypes.int64, False),
+      ("6", 42, dtypes.uint8, False),
+      ("7", 42, dtypes.uint16, False),
+      ("8", 42.0, dtypes.float16, False),
+      ("9", 42.0, dtypes.float32, False),
+      ("10", 42.0, dtypes.float64, False),
+      ("11", b"hello", dtypes.string, False),
+      ("1NUMA", False, dtypes.bool, True),
+      ("2NUMA", -42, dtypes.int8, True),
+      ("3NUMA", -42, dtypes.int16, True),
+      ("4NUMA", -42, dtypes.int32, True),
+      ("5NUMA", -42, dtypes.int64, True),
+      ("6NUMA", 42, dtypes.uint8, True),
+      ("7NUMA", 42, dtypes.uint16, True),
+      ("8NUMA", 42.0, dtypes.float16, True),
+      ("9NUMA", 42.0, dtypes.float32, True),
+      ("10NUMA", 42.0, dtypes.float64, True),
+      ("11NUMA", b"hello", dtypes.string, True),
   )
-  def testMapAndBatchTypes(self, element, dtype):
+  def testMapAndBatchTypes(self, element, dtype, numa_aware):
+
     def gen():
       yield element
 
     dataset = dataset_ops.Dataset.from_generator(gen, dtype).repeat(100).apply(
         batching.map_and_batch(lambda x: x, batch_size=10))
 
-    get_next = dataset.make_one_shot_iterator().get_next()
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
 
-    with self.cached_session() as sess:
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
+
+    with self.cached_session():
       for _ in range(10):
-        self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
+        self.assertAllEqual([element for _ in range(10)],
+                            self.evaluate(get_next))
+
+  @parameterized.named_parameters(
+      ("Identity", None, lambda x: x, None),
+      ("Replicate", None, lambda x: (x, x), None),
+      ("Swap", (None, None), lambda x, y: (y, x), None),
+      ("Project", (None, None), lambda x, y: x, None),
+  )
+  @test_util.run_deprecated_v1
+  def testShortCircuit(self, structure, map_fn, num_parallel_calls):
+    dataset = self.structuredDataset(structure).repeat().apply(
+        batching.map_and_batch(map_fn, batch_size=10))
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
+
+    with self.cached_session() as sess:
+      if isinstance(structure, tuple):
+        expected = map_fn(
+            *sess.run(self.structuredElement(structure, shape=[10])))
+      else:
+        expected = map_fn(
+            sess.run(self.structuredElement(structure, shape=[10])))
+      self.assertAllEqual(expected, self.evaluate(get_next))
+
+  @test_util.run_deprecated_v1
+  def testShortCircuitCapturedInput(self):
+    captured_t = array_ops.placeholder(dtypes.int64, shape=[])
+    dataset = self.structuredDataset(None).repeat().apply(
+        batching.map_and_batch(lambda x: captured_t, batch_size=10))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer, feed_dict={captured_t: 42})
+      self.assertAllEqual([42] * 10, self.evaluate(get_next))
+
+  @parameterized.named_parameters(
+      ("Normal", False),
+      ("NUMA", True),
+  )
+  @test_util.run_deprecated_v1
+  def testMapAndBatchControlFlow(self, numa_aware):
+
+    def map_fn(x):
+      previous_cond_v2_value = control_flow_ops.ENABLE_COND_V2
+      control_flow_ops.ENABLE_COND_V2 = True
+      return_value = control_flow_ops.cond(x < 50, lambda: x + 1, lambda: x * x)
+      control_flow_ops.ENABLE_COND_V2 = previous_cond_v2_value
+      return return_value
+
+    dataset = dataset_ops.Dataset.range(100).apply(
+        batching.map_and_batch(map_fn, batch_size=10))
+    if numa_aware:
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+    with self.cached_session():
+      for i in range(10):
+        if i < 5:
+          self.assertAllEqual([i * 10 + j + 1 for j in range(10)],
+                              self.evaluate(get_next))
+        else:
+          self.assertAllEqual(
+              [((i * 10) + j) * ((i * 10) + j) for j in range(10)],
+              self.evaluate(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
index ae9dedb0ab061269f07732d18e85762ccc658420..6042ca1c63f561a20e58e63e7864e13e847d3b35 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
@@ -22,11 +22,12 @@ import time
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import map_defun
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import data_flow_ops
@@ -39,7 +40,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunSimple(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)])
     def simple_fn(x):
       return x * 2 + 3
 
@@ -51,7 +52,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunMismatchedTypes(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
     def fn(x):
       return math_ops.cast(x, dtypes.float64)
 
@@ -64,7 +65,7 @@ class MapDefunTest(test_base.DatasetTestBase):
   def testMapDefunReduceDim(self):
     # Tests where the output has a different rank from the input
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)])
     def fn(x):
       return array_ops.gather(x, 0)
 
@@ -76,7 +77,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunMultipleOutputs(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)])
     def fn(x):
       return (x, math_ops.cast(x * 2 + 3, dtypes.float64))
 
@@ -89,7 +90,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunShapeInference(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)])
     def fn(x):
       return x
 
@@ -100,7 +101,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunPartialShapeInference(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)])
     def fn(x):
       return x
 
@@ -110,7 +111,10 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunRaisesErrorOnRuntimeShapeMismatch(self):
 
-    @function.Defun(dtypes.int32, dtypes.int32)
+    @function.defun(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.int32),
+        tensor_spec.TensorSpec(None, dtypes.int32)
+    ])
     def fn(x, y):
       return x, y
 
@@ -126,7 +130,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunRaisesDefunError(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
     def fn(x):
       with ops.control_dependencies([check_ops.assert_equal(x, 0)]):
         return array_ops.identity(x)
@@ -138,7 +142,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunCancelledCorrectly(self):
 
-    @function.Defun(dtypes.int64)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([5], dtypes.int64)])
     def defun(x):
       # x has leading dimension 5, this will raise an error
       return array_ops.gather(x, 10)
@@ -154,7 +158,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunWithUnspecifiedOutputShape(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)])
     def simple_fn(x):
       res = x * 2 + 3
       return (res, res + 1, res + 2)
@@ -171,7 +175,8 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunWithDifferentOutputShapeEachRun(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.int32)])
     def simple_fn(x):
       return x * 2 + 3
 
@@ -184,7 +189,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunWithWrongOutputShape(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)])
     def simple_fn(x):
       return x * 2 + 3
 
@@ -196,7 +201,8 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunWithInvalidInput(self):
 
-    @function.Defun(dtypes.int32)
+    @function.defun(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.int32)])
     def simple_fn(x):
       return x * 2
 
@@ -212,12 +218,12 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def _assert_op_cancelled(self, sess, map_defun_op):
     with self.assertRaisesRegexp(errors.CancelledError, "was cancelled"):
-      sess.run(map_defun_op)
+      self.evaluate(map_defun_op)
 
   def testMapDefunWithParentCancellation(self):
     # Checks that a cancellation of the parent graph is threaded through to
     # MapDefunOp correctly.
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
     def simple_fn(x):
       del x
       queue = data_flow_ops.FIFOQueue(10, dtypes.int32, ())
@@ -238,7 +244,7 @@ class MapDefunTest(test_base.DatasetTestBase):
   def testMapDefunWithCapturedInputs(self):
     c = constant_op.constant(2)
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
     def fn(x):
       return x + c
 
@@ -254,10 +260,10 @@ class MapDefunBenchmark(test.Benchmark):
     with session.Session() as sess:
       # Warm up the session
       for _ in range(5):
-        sess.run(op)
+        self.evaluate(op)
       start = time.time()
       for _ in range(num_iters):
-        sess.run(op)
+        self.evaluate(op)
       end = time.time()
       mean_us = (end - start) * 1e6 / num_iters
       self.report_benchmark(
@@ -269,7 +275,7 @@ class MapDefunBenchmark(test.Benchmark):
   def benchmarkDefunVsMapFn(self):
     """Benchmarks to compare the performance of MapDefun vs tf.map_fn."""
 
-    @function.Defun(dtypes.int32)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
     def defun(x):
       return array_ops.identity(x)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py b/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ee7616d35e801743167865d8d8097064ef88126
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
@@ -0,0 +1,177 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the private `MatchingFilesDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+from tensorflow.python.data.experimental.ops import matching_files
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class MatchingFilesTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    self.tmp_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+  def _touchTempFiles(self, filenames):
+    for filename in filenames:
+      open(os.path.join(self.tmp_dir, filename), 'a').close()
+
+  @test_util.run_deprecated_v1
+  def testNonExistingDirectory(self):
+    """Test the MatchingFiles dataset with a non-existing directory."""
+
+    self.tmp_dir = os.path.join(self.tmp_dir, 'nonexistingdir')
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, '*'))
+    with self.cached_session() as sess:
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      with self.assertRaises(errors.NotFoundError):
+        sess.run(next_element)
+
+  @test_util.run_deprecated_v1
+  def testEmptyDirectory(self):
+    """Test the MatchingFiles dataset with an empty directory."""
+
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, '*'))
+    with self.cached_session() as sess:
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      with self.assertRaises(errors.NotFoundError):
+        sess.run(next_element)
+
+  @test_util.run_deprecated_v1
+  def testSimpleDirectory(self):
+    """Test the MatchingFiles dataset with a simple directory."""
+
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, '*'))
+    with self.cached_session() as sess:
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+
+      expected_filenames = []
+      actual_filenames = []
+      for filename in filenames:
+        expected_filenames.append(
+            compat.as_bytes(os.path.join(self.tmp_dir, filename)))
+        actual_filenames.append(compat.as_bytes(sess.run(next_element)))
+
+      self.assertItemsEqual(expected_filenames, actual_filenames)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  @test_util.run_deprecated_v1
+  def testFileSuffixes(self):
+    """Test the MatchingFiles dataset using the suffixes of filename."""
+
+    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
+    self._touchTempFiles(filenames)
+
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, '*.py'))
+    with self.cached_session() as sess:
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      expected_filenames = []
+      actual_filenames = []
+      for filename in filenames[1:-1]:
+        expected_filenames.append(
+            compat.as_bytes(os.path.join(self.tmp_dir, filename)))
+        actual_filenames.append(compat.as_bytes(sess.run(next_element)))
+
+      self.assertItemsEqual(expected_filenames, actual_filenames)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  @test_util.run_deprecated_v1
+  def testFileMiddles(self):
+    """Test the MatchingFiles dataset using the middles of filename."""
+
+    filenames = ['aa.txt', 'bb.py', 'bbc.pyc', 'cc.pyc']
+    self._touchTempFiles(filenames)
+
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, 'b*.py*'))
+    with self.cached_session() as sess:
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      expected_filenames = []
+      actual_filenames = []
+      for filename in filenames[1:3]:
+        expected_filenames.append(
+            compat.as_bytes(os.path.join(self.tmp_dir, filename)))
+        actual_filenames.append(compat.as_bytes(sess.run(next_element)))
+
+      self.assertItemsEqual(expected_filenames, actual_filenames)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  @test_util.run_deprecated_v1
+  def testNestedDirectories(self):
+    """Test the MatchingFiles dataset with nested directories."""
+
+    filenames = []
+    width = 8
+    depth = 4
+    for i in range(width):
+      for j in range(depth):
+        new_base = os.path.join(self.tmp_dir, str(i),
+                                *[str(dir_name) for dir_name in range(j)])
+        os.makedirs(new_base)
+        child_files = ['a.py', 'b.pyc'] if j < depth - 1 else ['c.txt', 'd.log']
+        for f in child_files:
+          filename = os.path.join(new_base, f)
+          filenames.append(filename)
+          open(filename, 'w').close()
+
+    patterns = [
+        os.path.join(self.tmp_dir, os.path.join(*['**' for _ in range(depth)]),
+                     suffix) for suffix in ['*.txt', '*.log']
+    ]
+
+    dataset = matching_files.MatchingFilesDataset(patterns)
+    with self.cached_session() as sess:
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      expected_filenames = [
+          compat.as_bytes(filename)
+          for filename in filenames
+          if filename.endswith('.txt') or filename.endswith('.log')
+      ]
+      actual_filenames = []
+      while True:
+        try:
+          actual_filenames.append(compat.as_bytes(sess.run(next_element)))
+        except errors.OutOfRangeError:
+          break
+
+      self.assertItemsEqual(expected_filenames, actual_filenames)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index c92bb8b9bcde838d29409b86dd3e6149c9d60d1a..bf868ebe79339e3c36473711ece064210db5f47f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -7,9 +7,9 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_test(
-    name = "assert_next_dataset_op_test",
+    name = "assert_next_dataset_test",
     size = "medium",
-    srcs = ["assert_next_dataset_op_test.py"],
+    srcs = ["assert_next_dataset_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
@@ -25,6 +25,30 @@ py_test(
     ],
 )
 
+py_test(
+    name = "filter_fusion_test",
+    size = "medium",
+    srcs = ["filter_fusion_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "hoist_random_uniform_test",
     size = "small",
@@ -38,10 +62,14 @@ py_test(
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -63,15 +91,15 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python/data/experimental/kernel_tests:stats_dataset_test_base",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:stats_aggregator",
         "//tensorflow/python/data/experimental/ops:stats_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 py_test(
-    name = "map_vectorization_test",
-    size = "small",
-    srcs = ["map_vectorization_test.py"],
+    name = "make_numa_aware_test",
+    srcs = ["make_numa_aware_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
@@ -79,25 +107,36 @@ py_test(
         "no_windows",
     ],
     deps = [
-        "//tensorflow/python:check_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "map_and_batch_fusion_test",
+    srcs = ["map_and_batch_fusion_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 py_test(
     name = "map_and_filter_fusion_test",
-    size = "medium",
     srcs = ["map_and_filter_fusion_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -112,6 +151,27 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "map_fusion_test",
+    srcs = ["map_fusion_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -130,21 +190,61 @@ py_test(
     ],
     deps = [
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "map_vectorization_test",
+    size = "medium",
+    srcs = ["map_vectorization_test.py"],
+    shard_count = 8,
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:bitwise_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:clip_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_test(
-    name = "model_dataset_op_test",
+    name = "model_dataset_test",
     size = "medium",
-    srcs = ["model_dataset_op_test.py"],
+    srcs = ["model_dataset_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
@@ -155,12 +255,10 @@ py_test(
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python/data/experimental/ops:batching",
-        "//tensorflow/python/data/experimental/ops:interleave_ops",
         "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -176,20 +274,49 @@ py_test(
     ],
     deps = [
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_test(
+    name = "optimize_dataset_test",
+    size = "medium",
+    srcs = ["optimize_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python/data/experimental/ops:batching",
-        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python/data/experimental/ops:grouping",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/experimental/ops:scan_ops",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_test(
-    name = "optimize_dataset_op_test",
-    size = "small",
-    srcs = ["optimize_dataset_op_test.py"],
+    name = "shuffle_and_repeat_fusion_test",
+    srcs = ["shuffle_and_repeat_fusion_test.py"],
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
@@ -200,8 +327,8 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_op_test.py
deleted file mode 100644
index 45b77b5c20e808097b6b8aa3c7b3ad612398af19..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_op_test.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
-
-
-class AssertNextDatasetTest(test_base.DatasetTestBase):
-
-  def testAssertNext(self):
-    dataset = dataset_ops.Dataset.from_tensors(0).apply(
-        optimization.assert_next(["Map"])).map(lambda x: x)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(get_next))
-
-  def testAssertNextInvalid(self):
-    dataset = dataset_ops.Dataset.from_tensors(0).apply(
-        optimization.assert_next(["Whoops"])).map(lambda x: x)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Asserted Whoops transformation at offset 0 but encountered "
-          "Map transformation instead."):
-        sess.run(get_next)
-
-  def testAssertNextShort(self):
-    dataset = dataset_ops.Dataset.from_tensors(0).apply(
-        optimization.assert_next(["Map", "Whoops"])).map(lambda x: x)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Asserted next 2 transformations but encountered only 1."):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b8248a78da11d99e3cf6cd87ab69d30d4d369d6
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
@@ -0,0 +1,60 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.assert_next()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AssertNextDatasetTest(test_base.DatasetTestBase):
+
+  def testAssertNext(self):
+    dataset = dataset_ops.Dataset.from_tensors(0).apply(
+        optimization.assert_next(["Map"])).map(lambda x: x)
+    self.assertDatasetProduces(dataset, expected_output=[0])
+
+  def testAssertNextInvalid(self):
+    dataset = dataset_ops.Dataset.from_tensors(0).apply(
+        optimization.assert_next(["Whoops"])).map(lambda x: x)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            "Asserted Whoops transformation at offset 0 but encountered "
+            "Map transformation instead."))
+
+  def testAssertNextShort(self):
+    dataset = dataset_ops.Dataset.from_tensors(0).apply(
+        optimization.assert_next(["Map", "Whoops"])).map(lambda x: x)
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            "Asserted next 2 transformations but encountered only 1."))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7371cf31dff33a5de18f3268ecdfc91c6a08b29c
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the `FilterFusion` optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def _filter_fusion_test_cases():
+  """Generates test cases for the FilterFusion optimization."""
+
+  take_all = lambda x: constant_op.constant(True)
+  is_zero = lambda x: math_ops.equal(x, 0)
+  greater = lambda x: math_ops.greater(x + 5, 0)
+
+  tests = []
+  filters = [take_all, is_zero, greater]
+  identity = lambda x: x
+  for x, predicate_1 in enumerate(filters):
+    for y, predicate_2 in enumerate(filters):
+      tests.append(("Mixed{}{}".format(x, y), identity,
+                    [predicate_1, predicate_2]))
+      for z, predicate_3 in enumerate(filters):
+        tests.append(("Mixed{}{}{}".format(x, y, z), identity,
+                      [predicate_1, predicate_2, predicate_3]))
+
+  take_all_multiple = lambda x, y: constant_op.constant(True)
+  # Multi output
+  tests.append(("Multi1", lambda x: (x, x),
+                [take_all_multiple, take_all_multiple]))
+  tests.append(("Multi2", lambda x: (x, 2), [
+      take_all_multiple,
+      lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0)
+  ]))
+  return tuple(tests)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(*_filter_fusion_test_cases())
+  def testFilterFusion(self, map_function, predicates):
+    dataset = dataset_ops.Dataset.range(5).apply(
+        optimization.assert_next(["Map", "Filter",
+                                  "MemoryCacheImpl"])).map(map_function)
+    for predicate in predicates:
+      dataset = dataset.filter(predicate)
+
+    dataset = dataset.cache()
+    options = dataset_ops.Options()
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.filter_fusion = True
+    dataset = dataset.with_options(options)
+    expected_output = []
+    for x in range(5):
+      r = map_function(x)
+      filtered = False
+      for predicate in predicates:
+        if isinstance(r, tuple):
+          b = predicate(*r)  # Pass tuple as multiple arguments.
+        else:
+          b = predicate(r)
+        if not self.evaluate(b):
+          filtered = True
+          break
+
+      if not filtered:
+        expected_output.append(r)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
index 81437c0aecd0f5d9705d104472e39bd01cb76039..5f3a8683fbb6cb2b43a41ad6d738b4982755bbff 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for HostState optimization."""
+"""Tests for the `HoistRandomUniform` optimization."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -20,56 +20,84 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
-class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
+def _hoist_random_uniform_test_cases():
+  """Generates test cases for the HoistRandomUniform optimization."""
+
+  plus_one = lambda x: x + 1
 
-  @staticmethod
-  def map_functions():
-    plus_one = lambda x: x + 1
+  def random(_):
+    return random_ops.random_uniform([],
+                                     minval=1,
+                                     maxval=10,
+                                     dtype=dtypes.float32,
+                                     seed=42)
 
-    def random(_):
-      return random_ops.random_uniform([],
-                                       minval=1,
-                                       maxval=10,
-                                       dtype=dtypes.float32,
-                                       seed=42)
+  def random_with_assert(x):
+    y = random(x)
+    assert_op = control_flow_ops.Assert(math_ops.greater_equal(y, 1), [y])
+    with ops.control_dependencies([assert_op]):
+      return y
 
-    def random_with_assert(x):
-      y = random(x)
-      assert_op = control_flow_ops.Assert(math_ops.greater_equal(y, 1), [y])
-      with ops.control_dependencies([assert_op]):
-        return y
+  twice_random = lambda x: (random(x) + random(x)) / 2.
 
-    twice_random = lambda x: (random(x) + random(x)) / 2.
+  tests = [("PlusOne", plus_one, False), ("RandomUniform", random, True),
+           ("RandomWithAssert", random_with_assert, True),
+           ("TwiceRandom", twice_random, False)]
+  return tuple(tests)
 
-    tests = [("PlusOne", plus_one, False), ("RandomUniform", random, True),
-             ("RandomWithAssert", random_with_assert, True),
-             ("TwiceRandom", twice_random, False)]
-    return tuple(tests)
 
-  @parameterized.named_parameters(*map_functions.__func__())
+@test_util.run_all_in_graph_and_eager_modes
+class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def _testDataset(self, dataset):
+    previous_result = 0
+    if context.executing_eagerly():
+      iterator = dataset.__iter__()
+      get_next = iterator._next_internal  # pylint: disable=protected-access
+    else:
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      get_next = iterator.get_next
+    for _ in range(5):
+      result = self.evaluate(get_next())
+      self.assertLessEqual(1, result)
+      self.assertLessEqual(result, 10)
+      # This checks if the result is somehow random by checking if we are not
+      # generating the same values.
+      self.assertNotEqual(previous_result, result)
+      previous_result = result
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @parameterized.named_parameters(*_hoist_random_uniform_test_cases())
   def testHoisting(self, function, will_optimize):
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(
             ["Zip[0]", "Map"] if will_optimize else ["Map"])).map(function)
 
     options = dataset_ops.Options()
-    options.experimental_hoist_random_uniform = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
 
-  def testAdditionalInputs(self):
+  def testCapturedInputs(self):
     a = constant_op.constant(1, dtype=dtypes.float32)
     b = constant_op.constant(0, dtype=dtypes.float32)
     some_tensor = math_ops.mul(a, b)
@@ -81,26 +109,11 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(["Zip[0]", "Map"])).map(random_with_capture)
     options = dataset_ops.Options()
-    options.experimental_hoist_random_uniform = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
 
-  def _testDataset(self, dataset):
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    previous_result = 0
-    with self.cached_session() as sess:
-      for _ in range(5):
-        result = sess.run(get_next)
-        self.assertLessEqual(1, result)
-        self.assertLessEqual(result, 10)
-        # This checks if the result is somehow random by checking if we are not
-        # generating the same values.
-        self.assertNotEqual(previous_result, result)
-        previous_result = result
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
index 26fec0414e40b5e8c45bbf2ca806d5857f1afc0c..fc65f52704c3389a24e9f304cfa1cadd5686c7d6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
@@ -12,47 +12,69 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the LatencyAllEdges optimization."""
+"""Tests for the `LatencyAllEdges` optimization."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops import stats_ops
+from tensorflow.python.data.experimental.ops import stats_aggregator
+from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-class OptimizeStatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
 
   def testLatencyStatsOptimization(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.from_tensors(1).apply(
         optimization.assert_next(
             ["LatencyStats", "Map", "LatencyStats", "Prefetch",
-             "LatencyStats"])).map(lambda x: x * x).prefetch(1).apply(
-                 stats_ops.set_stats_aggregator(stats_aggregator))
+             "LatencyStats"])).map(lambda x: x * x).prefetch(1)
     options = dataset_ops.Options()
-    options.experimental_latency_all_edges = True
+    options.experimental_stats = stats_options.StatsOptions()
+    options.experimental_stats.latency_all_edges = True
+    options.experimental_stats.aggregator = aggregator
     dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[1],
+        requires_initialization=True,
+        num_test_iterations=1)
+    summary_t = aggregator.get_summary()
+    summary_str = self.evaluate(summary_t)
+    self._assertSummaryHasCount(summary_str, "record_latency_TensorDataset/_1",
+                                1)
+    self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", 1)
+    self._assertSummaryHasCount(summary_str,
+                                "record_latency_PrefetchDataset/_6", 1)
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertEqual(1 * 1, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      summary_str = sess.run(summary_t)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_TensorDataset/_1", 1)
-      self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4",
-                                  1)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_PrefetchDataset/_6", 1)
+  def testLatencyStatsOptimizationV2(self):
+    aggregator = stats_aggregator.StatsAggregator()
+    dataset = dataset_ops.Dataset.from_tensors(1).apply(
+        optimization.assert_next(
+            ["LatencyStats", "Map", "LatencyStats", "Prefetch",
+             "LatencyStats"])).map(lambda x: x * x).prefetch(1)
+    options = dataset_ops.Options()
+    options.experimental_stats = stats_options.StatsOptions()
+    options.experimental_stats.aggregator = aggregator
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[1],
+        requires_initialization=True,
+        num_test_iterations=1)
+    summary_t = aggregator.get_summary()
+    summary_str = self.evaluate(summary_t)
+    self._assertSummaryHasCount(summary_str, "record_latency_TensorDataset/_1",
+                                1)
+    self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", 1)
+    self._assertSummaryHasCount(summary_str,
+                                "record_latency_PrefetchDataset/_6", 1)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2386dd5f116d660eb93213c935b662c05d90011d
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
@@ -0,0 +1,43 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the `MakeNumaAware` optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MakeNumaAwareTest(test_base.DatasetTestBase):
+
+  def testMakeNumaAware(self):
+    dataset = dataset_ops.Dataset.range(10).apply(
+        optimization.assert_next(["NumaMapAndBatch"])).apply(
+            batching.map_and_batch(lambda x: x * x, 10))
+    options = dataset_ops.Options()
+    options.experimental_numa_aware = True
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(
+        dataset, expected_output=[[x * x for x in range(10)]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2ff3116eccf2ccfb7ed72085f4727a1e0262164
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
@@ -0,0 +1,39 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the `MapAndBatchFusion` optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MapAndBatchFusionTest(test_base.DatasetTestBase):
+
+  def testMapAndBatchFusion(self):
+    dataset = dataset_ops.Dataset.range(10).apply(
+        optimization.assert_next(
+            ["MapAndBatch"])).map(lambda x: x * x).batch(10)
+    self.assertDatasetProduces(
+        dataset, expected_output=[[x * x for x in range(10)]])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
index 7f8a4e6406ae15ed0d6e956d13e7b0ee252c9a76..db8f214fbfca1389af70df55518c885610984031 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the MapAndFilterFusion optimization."""
+"""Tests for the `MapAndFilterFusion` optimization."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -20,136 +20,76 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+def _map_and_filter_fusion_test_cases():
+  """Generates test cases for the MapAndFilterFusion optimization."""
+
+  identity = lambda x: x
+  increment = lambda x: x + 1
+  minus_five = lambda x: x - 5
+
+  def increment_and_square(x):
+    y = x + 1
+    return y * y
+
+  take_all = lambda x: constant_op.constant(True)
+  is_zero = lambda x: math_ops.equal(x, 0)
+  is_odd = lambda x: math_ops.equal(x % 2, 0)
+  greater = lambda x: math_ops.greater(x + 5, 0)
+
+  functions = [identity, increment, minus_five, increment_and_square]
+  filters = [take_all, is_zero, is_odd, greater]
+  tests = []
+
+  for x, fun in enumerate(functions):
+    for y, predicate in enumerate(filters):
+      tests.append(("Mixed{}{}".format(x, y), fun, predicate))
+
+  # Multi output
+  tests.append(("Multi1", lambda x: (x, x),
+                lambda x, y: constant_op.constant(True)))
+  tests.append(
+      ("Multi2", lambda x: (x, 2),
+       lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0)))
+  return tuple(tests)
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
-  @staticmethod
-  def map_functions():
-    identity = lambda x: x
-    increment = lambda x: x + 1
-
-    def increment_and_square(x):
-      y = x + 1
-      return y * y
-
-    functions = [identity, increment, increment_and_square]
-    tests = []
-    for i, fun1 in enumerate(functions):
-      for j, fun2 in enumerate(functions):
-        tests.append((
-            "Test{}{}".format(i, j),
-            [fun1, fun2],
-        ))
-        for k, fun3 in enumerate(functions):
-          tests.append((
-              "Test{}{}{}".format(i, j, k),
-              [fun1, fun2, fun3],
-          ))
-
-    swap = lambda x, n: (n, x)
-    tests.append((
-        "Swap1",
-        [lambda x: (x, 42), swap],
-    ))
-    tests.append((
-        "Swap2",
-        [lambda x: (x, 42), swap, swap],
-    ))
-    return tuple(tests)
-
-  @parameterized.named_parameters(*map_functions.__func__())
-  def testMapFusion(self, functions):
-    dataset = dataset_ops.Dataset.range(5).apply(
-        optimization.assert_next(["Map", "Prefetch"]))
-    for function in functions:
-      dataset = dataset.map(function)
-
-    dataset = dataset.prefetch(0)
-    options = dataset_ops.Options()
-    options.experimental_map_fusion = True
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        r = x
-        for function in functions:
-          if isinstance(r, tuple):
-            r = function(*r)  # Pass tuple as multiple arguments.
-          else:
-            r = function(r)
-        self.assertAllEqual(r, result)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @staticmethod
-  def map_and_filter_functions():
-    identity = lambda x: x
-    increment = lambda x: x + 1
-    minus_five = lambda x: x - 5
-
-    def increment_and_square(x):
-      y = x + 1
-      return y * y
-
-    take_all = lambda x: constant_op.constant(True)
-    is_zero = lambda x: math_ops.equal(x, 0)
-    is_odd = lambda x: math_ops.equal(x % 2, 0)
-    greater = lambda x: math_ops.greater(x + 5, 0)
-
-    functions = [identity, increment, minus_five, increment_and_square]
-    filters = [take_all, is_zero, is_odd, greater]
-    tests = []
-
-    for x, fun in enumerate(functions):
-      for y, predicate in enumerate(filters):
-        tests.append(("Mixed{}{}".format(x, y), fun, predicate))
-
-    # Multi output
-    tests.append(("Multi1", lambda x: (x, x),
-                  lambda x, y: constant_op.constant(True)))
-    tests.append(
-        ("Multi2", lambda x: (x, 2),
-         lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0)))
-    return tuple(tests)
-
-  @parameterized.named_parameters(*map_and_filter_functions.__func__())
+  def _testMapAndFilter(self, dataset, function, predicate):
+    expected_output = []
+    for x in range(10):
+      r = function(x)
+      if isinstance(r, tuple):
+        b = predicate(*r)  # Pass tuple as multiple arguments.
+      else:
+        b = predicate(r)
+      if self.evaluate(b):
+        expected_output.append(r)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  @parameterized.named_parameters(*_map_and_filter_fusion_test_cases())
   def testMapFilterFusion(self, function, predicate):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(
             ["Map", "FilterByLastComponent"])).map(function).filter(predicate)
     options = dataset_ops.Options()
-    options.experimental_map_and_filter_fusion = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
 
-  def _testMapAndFilter(self, dataset, function, predicate):
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for x in range(10):
-        r = function(x)
-        if isinstance(r, tuple):
-          b = predicate(*r)  # Pass tuple as multiple arguments.
-        else:
-          b = predicate(r)
-        if sess.run(b):
-          result = sess.run(get_next)
-          self.assertAllEqual(r, result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testAdditionalInputs(self):
+  def testCapturedInputs(self):
     a = constant_op.constant(3, dtype=dtypes.int64)
     b = constant_op.constant(4, dtype=dtypes.int64)
     some_tensor = math_ops.mul(a, b)
@@ -158,75 +98,16 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
     def predicate(y):
       return math_ops.less(math_ops.cast(y, dtypes.int64), some_tensor)
 
-    # We are currently not supporting functions with additional inputs.
+    # We are currently not supporting functions with captured inputs.
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(["Map",
                                   "Filter"])).map(function).filter(predicate)
     options = dataset_ops.Options()
-    options.experimental_map_and_filter_fusion = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
 
-  @staticmethod
-  def filter_functions():
-    take_all = lambda x: constant_op.constant(True)
-    is_zero = lambda x: math_ops.equal(x, 0)
-    greater = lambda x: math_ops.greater(x + 5, 0)
-
-    tests = []
-    filters = [take_all, is_zero, greater]
-    identity = lambda x: x
-    for x, predicate_1 in enumerate(filters):
-      for y, predicate_2 in enumerate(filters):
-        tests.append(("Mixed{}{}".format(x, y), identity,
-                      [predicate_1, predicate_2]))
-        for z, predicate_3 in enumerate(filters):
-          tests.append(("Mixed{}{}{}".format(x, y, z), identity,
-                        [predicate_1, predicate_2, predicate_3]))
-
-    take_all_multiple = lambda x, y: constant_op.constant(True)
-    # Multi output
-    tests.append(("Multi1", lambda x: (x, x),
-                  [take_all_multiple, take_all_multiple]))
-    tests.append(("Multi2", lambda x: (x, 2), [
-        take_all_multiple,
-        lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0)
-    ]))
-    return tuple(tests)
-
-  @parameterized.named_parameters(*filter_functions.__func__())
-  def testFilterFusion(self, map_function, predicates):
-    dataset = dataset_ops.Dataset.range(5).apply(
-        optimization.assert_next(["Map", "Filter",
-                                  "Prefetch"])).map(map_function)
-    for predicate in predicates:
-      dataset = dataset.filter(predicate)
-
-    dataset = dataset.prefetch(0)
-    options = dataset_ops.Options()
-    options.experimental_filter_fusion = True
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for x in range(5):
-        r = map_function(x)
-        filtered = False
-        for predicate in predicates:
-          if isinstance(r, tuple):
-            b = predicate(*r)  # Pass tuple as multiple arguments.
-          else:
-            b = predicate(r)
-          if not sess.run(b):
-            filtered = True
-            break
-
-        if not filtered:
-          result = sess.run(get_next)
-          self.assertAllEqual(r, result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8d63903749d13b80f662c996ebf5c95f934a0b1
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
@@ -0,0 +1,94 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the `MapFusion` optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+def _map_fusion_test_cases():
+  """Generates test cases for the MapFusion optimization."""
+
+  identity = lambda x: x
+  increment = lambda x: x + 1
+
+  def increment_and_square(x):
+    y = x + 1
+    return y * y
+
+  functions = [identity, increment, increment_and_square]
+  tests = []
+  for i, fun1 in enumerate(functions):
+    for j, fun2 in enumerate(functions):
+      tests.append((
+          "Test{}{}".format(i, j),
+          [fun1, fun2],
+      ))
+      for k, fun3 in enumerate(functions):
+        tests.append((
+            "Test{}{}{}".format(i, j, k),
+            [fun1, fun2, fun3],
+        ))
+
+  swap = lambda x, n: (n, x)
+  tests.append((
+      "Swap1",
+      [lambda x: (x, 42), swap],
+  ))
+  tests.append((
+      "Swap2",
+      [lambda x: (x, 42), swap, swap],
+  ))
+  return tuple(tests)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(*_map_fusion_test_cases())
+  def testMapFusion(self, functions):
+    dataset = dataset_ops.Dataset.range(5).apply(
+        optimization.assert_next(["Map", "MemoryCacheImpl"]))
+    for function in functions:
+      dataset = dataset.map(function)
+
+    dataset = dataset.cache()
+    options = dataset_ops.Options()
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.map_fusion = True
+    dataset = dataset.with_options(options)
+    expected_output = []
+    for x in range(5):
+      r = x
+      for function in functions:
+        if isinstance(r, tuple):
+          r = function(*r)  # Pass tuple as multiple arguments.
+        else:
+          r = function(r)
+      expected_output.append(r)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
index ce9c9bc47bdac985d9c8770168066a065cacd786..0ff3fff4f8550a4221e54ab2b01ddcaf6c340145 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the MapParallelization optimization."""
+"""Tests for the `MapParallelization` optimization."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -20,67 +20,60 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
-class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
+def _map_parallelization_test_cases():
+  """Generates test cases for the MapParallelization optimization."""
+
+  identity = lambda x: x
+  increment = lambda x: x + 1
+
+  def assert_greater(x):
+    assert_op = control_flow_ops.Assert(math_ops.greater(x, -1), [x])
+    with ops.control_dependencies([assert_op]):
+      return x
 
-  @staticmethod
-  def map_functions():
-    identity = lambda x: x
-    increment = lambda x: x + 1
+  def random(_):
+    return random_ops.random_uniform([],
+                                     minval=0,
+                                     maxval=10,
+                                     dtype=dtypes.int64,
+                                     seed=42)
 
-    def assert_greater(x):
-      assert_op = control_flow_ops.Assert(math_ops.greater(x, -1), [x])
-      with ops.control_dependencies([assert_op]):
-        return x
+  def assert_with_random(x):
+    x = assert_greater(x)
+    return random(x)
 
-    def random(_):
-      return random_ops.random_uniform([],
-                                       minval=0,
-                                       maxval=10,
-                                       dtype=dtypes.int64,
-                                       seed=42)
+  return (("Identity", identity, True), ("Increment", increment, True),
+          ("AssertGreater", assert_greater, True), ("Random", random, False),
+          ("AssertWithRandom", assert_with_random, False))
 
-    def assert_with_random(x):
-      x = assert_greater(x)
-      return random(x)
 
-    return (("Identity", identity, True), ("Increment", increment, True),
-            ("AssertGreater", assert_greater, True), ("Random", random, False),
-            ("AssertWithRandom", assert_with_random, False))
+@test_util.run_all_in_graph_and_eager_modes
+class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
-  @parameterized.named_parameters(*map_functions.__func__())
+  @parameterized.named_parameters(*_map_parallelization_test_cases())
   def testMapParallelization(self, function, should_optimize):
     next_nodes = ["ParallelMap"] if should_optimize else ["Map"]
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(next_nodes)).map(function)
     options = dataset_ops.Options()
-    options.experimental_map_parallelization = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.map_parallelization = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        # No need to run the pipeline if it was not optimized.  Also the results
-        # might be hard to check because of random.
-        if not should_optimize:
-          return
-        r = function(x)
-        self.assertAllEqual(r, result)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    if should_optimize:
+      self.assertDatasetProduces(
+          dataset, expected_output=[function(x) for x in range(5)])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index 971a2d94b9b79da56923909fade5841ce36262af..adc411bfb5996904a92fd5b565eb59a439303500 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -12,30 +12,312 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the MapVectorization optimization."""
+"""Tests for the `MapVectorization` optimization."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.python.client import session
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import bitwise_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 
 
+def _generate_unary_cwise_math_cases():
+  # TODO(rachelim): Consolidate tests with pfor when APIs are somewhat shared.
+  bitwise_cases = [("Invert", bitwise_ops.invert)]
+  logical_cases = [("LogicalNot", math_ops.logical_not)]
+  complex_cases = [
+      ("Angle", math_ops.angle),
+      ("ComplexAbs", math_ops.abs),
+      ("Conj", math_ops.conj),
+      ("Imag", math_ops.imag),
+      ("Real", math_ops.real),
+  ]
+  real_cases = [
+      ("Abs", math_ops.abs),
+      ("Acos", math_ops.acos),
+      ("Acosh", lambda x: math_ops.acosh(1 + math_ops.square(x))),
+      ("Asin", math_ops.asin),
+      ("Asinh", math_ops.asinh),
+      ("Atan", math_ops.atan),
+      ("Atanh", math_ops.atanh),
+      ("BesselI0e", math_ops.bessel_i0e),
+      ("BesselI1e", math_ops.bessel_i1e),
+      ("Ceil", math_ops.ceil),
+      ("Cos", math_ops.cos),
+      ("Cosh", math_ops.cosh),
+      ("Digamma", math_ops.digamma),
+      ("Elu", nn.elu),
+      ("Erf", math_ops.erf),
+      ("Erfc", math_ops.erfc),
+      ("Exp", math_ops.exp),
+      ("Expm1", math_ops.expm1),
+      ("Floor", math_ops.floor),
+      ("Inv", math_ops.inv),
+      ("IsFinite", math_ops.is_finite),
+      ("IsInf", math_ops.is_inf),
+      ("Lgamma", math_ops.lgamma),
+      ("Log", math_ops.log),
+      ("Log1p", math_ops.log1p),
+      ("Neg", math_ops.negative),
+      ("Reciprocal", math_ops.reciprocal),
+      ("Relu", nn.relu),
+      ("Relu6", nn.relu6),
+      ("Rint", math_ops.rint),
+      ("Round", math_ops.round),
+      ("Rsqrt", math_ops.rsqrt),
+      ("Selu", nn.selu),
+      ("Sigmoid", math_ops.sigmoid),
+      ("Sign", math_ops.sign),
+      ("Sin", math_ops.sin),
+      ("Sinh", math_ops.sinh),
+      ("Softplus", nn.softplus),
+      ("Softsign", nn.softsign),
+      ("Sqrt", math_ops.sqrt),
+      ("Square", math_ops.square),
+      ("Tan", math_ops.tan),
+      ("Tanh", math_ops.tanh),
+  ]
+  random_input = np.random.rand(3, 5)
+  complex_component = np.random.rand(3, 5)
+  random_int = np.random.randint(0, 10, (7, 3, 5))
+
+  def bitwise_dataset_factory():
+    return dataset_ops.Dataset.from_tensor_slices(random_int)
+
+  def logical_dataset_factory():
+    return dataset_ops.Dataset.from_tensor_slices(random_input > 0)
+
+  def random_dataset_factory():
+    return dataset_ops.Dataset.from_tensor_slices(random_input)
+
+  def complex_dataset_factory():
+    return dataset_ops.Dataset.from_tensor_slices(
+        math_ops.complex(random_input, complex_component))
+
+  case_factory_pairs = [
+      (bitwise_cases, bitwise_dataset_factory),
+      (logical_cases, logical_dataset_factory),
+      (complex_cases, complex_dataset_factory),
+      (real_cases, random_dataset_factory),
+  ]
+  return [(case[0], case[1], factory)
+          for cases, factory in case_factory_pairs
+          for case in cases]
+
+
+def _generate_binary_cwise_math_cases():
+  bitwise_cases = [("BitwiseAnd", bitwise_ops.bitwise_and),
+                   ("BitwiseOr", bitwise_ops.bitwise_or),
+                   ("BitwiseXor", bitwise_ops.bitwise_xor),
+                   ("LeftShift", bitwise_ops.left_shift),
+                   ("RightShift", bitwise_ops.right_shift)]
+
+  logical_cases = [("LogicalAnd", math_ops.logical_and),
+                   ("LogicalOr", math_ops.logical_or)]
+
+  # Wrapper functions restricting the range of inputs of zeta and polygamma.
+  def safe_polygamma(x, y):
+    return math_ops.polygamma(
+        math_ops.round(clip_ops.clip_by_value(y, 1, 10)), x * x + 1)
+
+  def safe_zeta(x, y):
+    return math_ops.zeta(x * x + 1, y * y)
+
+  real_cases = [
+      ("Add", math_ops.add),
+      ("AddV2", math_ops.add_v2),
+      ("Atan2", math_ops.atan2),
+      ("Complex", math_ops.complex),
+      ("DivNoNan", math_ops.div_no_nan),
+      ("Equal", math_ops.equal),
+      ("FloorDiv", math_ops.floor_div),
+      ("FloorMod", math_ops.floor_mod),
+      ("Greater", math_ops.greater),
+      ("GreaterEqual", math_ops.greater_equal),
+      ("Igamma", math_ops.igamma),
+      ("Igammac", math_ops.igammac),
+      ("IgammaGradA", math_ops.igamma_grad_a),
+      ("Less", math_ops.less),
+      ("LessEqual", math_ops.less_equal),
+      ("Maximum", math_ops.maximum),
+      ("Minimum", math_ops.minimum),
+      ("Mod", math_ops.mod),
+      ("Mul", math_ops.multiply),
+      ("NotEqual", math_ops.not_equal),
+      ("Polygamma", safe_polygamma),
+      ("Pow", math_ops.pow),
+      ("RealDiv", math_ops.divide),
+      ("SquareDifference", math_ops.squared_difference),
+      ("Sub", math_ops.subtract),
+      ("TruncateMod", math_ops.truncate_mod),
+      ("Zeta", safe_zeta),
+  ]
+
+  # Exercises broadcasting capabilities
+  x = np.random.rand(7, 3, 5)
+  y = np.random.rand(3, 5)
+
+  x_int = np.random.randint(0, 10, (7, 3, 5))
+  y_int = np.random.randint(0, 10, (3, 5))
+
+  def bitwise_dataset_factory():
+    return dataset_ops.Dataset.from_tensors((x_int, y_int))
+
+  def logical_dataset_factory():
+    return dataset_ops.Dataset.from_tensors((x > 0, y > 0))
+
+  def random_dataset_factory():
+    return dataset_ops.Dataset.from_tensors((x, y))
+
+  case_factory_pairs = [
+      (bitwise_cases, bitwise_dataset_factory),
+      (logical_cases, logical_dataset_factory),
+      (real_cases, random_dataset_factory),
+  ]
+  return [(case[0], case[1], factory)
+          for cases, factory in case_factory_pairs
+          for case in cases]
+
+
+def _generate_cwise_test_cases():
+  return _generate_unary_cwise_math_cases() + _generate_binary_cwise_math_cases(
+  )
+
+
+def _generate_csv_test_case():
+
+  def csv_factory():
+    return dataset_ops.Dataset.from_tensor_slices(["1.0:2:a",
+                                                   "2.4:5:c"]).repeat(5)
+
+  def decode_csv_fn(x):
+    return parsing_ops.decode_csv(
+        x,
+        record_defaults=[
+            constant_op.constant([], dtypes.float32),
+            constant_op.constant([], dtypes.int32),
+            constant_op.constant([], dtypes.string)
+        ],
+        field_delim=":")
+
+  return decode_csv_fn, csv_factory
+
+
+def _generate_parse_single_example_test_case():
+
+  def parse_example_factory():
+
+    def _int64_feature(*values):
+      return feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=values))
+
+    def _bytes_feature(*values):
+      return feature_pb2.Feature(
+          bytes_list=feature_pb2.BytesList(
+              value=[v.encode("utf-8") for v in values]))
+
+    return dataset_ops.Dataset.from_tensor_slices(
+        constant_op.constant([
+            example_pb2.Example(
+                features=feature_pb2.Features(
+                    feature={
+                        "dense_int": _int64_feature(i),
+                        "dense_str": _bytes_feature(str(i)),
+                        "sparse_int": _int64_feature(i, i * 2, i * 4, i * 8),
+                        "sparse_str": _bytes_feature(*["abc"] * i)
+                    })).SerializeToString() for i in range(10)
+        ]))
+
+  def parse_single_example_fn(x):
+    features = {
+        "dense_int": parsing_ops.FixedLenFeature((), dtypes.int64, 0),
+        "dense_str": parsing_ops.FixedLenFeature((), dtypes.string, ""),
+        "sparse_int": parsing_ops.VarLenFeature(dtypes.int64),
+        "sparse_str": parsing_ops.VarLenFeature(dtypes.string),
+    }
+    return parsing_ops.parse_single_example(x, features)
+
+  return parse_single_example_fn, parse_example_factory
+
+
+def _generate_optimization_test_cases():
+
+  def base_dataset_factory():
+    return dataset_ops.Dataset.from_tensors(np.random.rand(10, 3)).repeat(5)
+
+  rand_val = np.random.rand(1, 1, 1, 1, 1, 1)
+
+  csv_test_case = _generate_csv_test_case()
+  parse_fn, parse_base = _generate_parse_single_example_test_case()
+
+  def dense_output_only_parse_fn(x):
+    # Since we haven't implemented a vectorizer for SerializeSparse, any
+    # function with sparse outputs will only be naively vectorized.
+    parse_result = parse_fn(x)
+    return [
+        y for y in parse_result if not isinstance(y, sparse_tensor.SparseTensor)
+    ]
+
+  def map_fn_with_cycle(x):
+    c = lambda i: math_ops.less(i, 10)
+    b = lambda i: math_ops.add(i, 1)
+    return control_flow_ops.while_loop(c, b, [x])
+
+  # Misc test cases
+  test_cases = [
+      ("Basic", lambda x: (x, x + 1), base_dataset_factory),
+      ("Broadcast", lambda x: x + rand_val, base_dataset_factory),
+      ("Cycle", map_fn_with_cycle, lambda: dataset_ops.Dataset.from_tensors(1)),
+      ("Const", lambda x: 2, base_dataset_factory),
+      ("Cast", lambda x: math_ops.cast(x, dtypes.float64),
+       base_dataset_factory),
+      ("Reshape", lambda x: array_ops.reshape(x, (-1, 30)),
+       base_dataset_factory),
+      ("Transpose", array_ops.transpose, base_dataset_factory),
+      ("Unpack", array_ops.unstack, base_dataset_factory),
+      ("UnpackNegativeAxis", lambda x: array_ops.unstack(x, axis=-1),
+       base_dataset_factory),
+      # Parsing ops
+      ("DecodeCSV", csv_test_case[0], csv_test_case[1]),
+      ("ParseSingleExample", parse_fn, parse_base),
+      ("ParseSingleExampleDenseOutputOnly", dense_output_only_parse_fn,
+       parse_base),
+  ] + _generate_cwise_test_cases()
+
+  return [{
+      "testcase_name":
+          x[0] + "Parallel" if num_parallel_calls is not None else x[0],
+      "map_fn":
+          x[1],
+      "base_dataset_factory":
+          x[2],
+      "num_parallel_calls":
+          num_parallel_calls
+  } for x in test_cases for num_parallel_calls in (None, 12)]
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _get_test_datasets(self,
@@ -45,7 +327,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
                          expect_optimized=True):
     """Given base dataset and map fn, creates test datasets.
 
-    Returns a tuple of (unoptimized, dataset, optimized dataset). The
+    Returns a tuple of (unoptimized dataset, optimized dataset). The
     unoptimized dataset has the assertion that Batch follows Map. The optimized
     dataset has the assertion that Map follows Batch, and has the
     "map_vectorization" optimization applied.
@@ -62,34 +344,37 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
       Tuple of (unoptimized dataset, optimized dataset).
     """
     map_node_name = "Map" if num_parallel_calls is None else "ParallelMap"
-    batch_size = 100
 
     def _make_dataset(node_names):
-      return base_dataset.apply(optimization.assert_next(node_names)).map(
-          map_fn, num_parallel_calls=num_parallel_calls).batch(batch_size)
+      dataset = base_dataset.apply(optimization.assert_next(node_names))
+      dataset = dataset.map(map_fn, num_parallel_calls)
+      dataset = dataset.batch(100)
+      options = dataset_ops.Options()
+      opt_options = optimization_options.OptimizationOptions()
+      opt_options.map_and_batch_fusion = False
+      options.experimental_optimization = opt_options
+      dataset = dataset.with_options(options)
+      return dataset
 
     unoptimized = _make_dataset([map_node_name, "Batch"])
     optimized = _make_dataset(["Batch", map_node_name]
                               if expect_optimized else [map_node_name, "Batch"])
     options = dataset_ops.Options()
-    options.experimental_map_vectorization = True
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.map_vectorization = True
+    options.experimental_optimization = opt_options
     optimized = optimized.with_options(options)
     return unoptimized, optimized
 
-  @parameterized.named_parameters(
-      ("Basic", lambda x: (x, x + 1), None),
-      ("Const", lambda x: 2, 12),
-      ("Parallel", lambda x: (x, x + 1), 12),
-      ("Gather", lambda x: array_ops.gather(x, 0), 12),
-  )
-  def testOptimization(self, map_fn, num_parallel_calls):
-    base_dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2],
-                                                           [3, 4]]).repeat(5)
+  @parameterized.named_parameters(_generate_optimization_test_cases())
+  def testOptimization(self, map_fn, base_dataset_factory, num_parallel_calls):
+    base_dataset = base_dataset_factory()
     unoptimized, optimized = self._get_test_datasets(base_dataset, map_fn,
                                                      num_parallel_calls)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  def testOptimizationBadMapFn(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationBadMapFn(self):
     # Test map functions that give an error
     def map_fn(x):
       # x has leading dimension 5, this will raise an error
@@ -98,25 +383,27 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
         5, drop_remainder=True)
     _, optimized = self._get_test_datasets(base_dataset, map_fn)
-    nxt = optimized.make_one_shot_iterator().get_next()
+    nxt = dataset_ops.make_one_shot_iterator(optimized).get_next()
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"indices = 10 is not in \[0, 5\)"):
       self.evaluate(nxt)
 
   def testOptimizationWithCapturedInputs(self):
     # Tests that vectorization works with captured inputs
+    y = constant_op.constant(1, shape=(2,))
+    z = constant_op.constant(2, shape=(2,))
+
     def map_fn(x):
-      return x + y
+      return x, y, z
 
-    y = constant_op.constant(1, shape=(2,))
     base_dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2],
                                                            [3, 4]]).repeat(5)
-    # TODO(rachelim): when this optimization works, turn on expect_optimized
     unoptimized, optimized = self._get_test_datasets(
-        base_dataset, map_fn, expect_optimized=False)
+        base_dataset, map_fn, expect_optimized=True)
     self.assertDatasetsEqual(optimized, unoptimized)
 
-  def testOptimizationIgnoreStateful(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationIgnoreStateful(self):
 
     def map_fn(x):
       with ops.control_dependencies([check_ops.assert_equal(x, 0)]):
@@ -142,7 +429,8 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=False)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  def testOptimizationIgnoreRaggedMap(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationIgnoreRaggedMap(self):
     # Don't optimize when the output of the map fn shapes are unknown.
     def map_fn(x):
       return array_ops.tile(x, x)
@@ -156,76 +444,5 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
          ("IteratorGetNext", "IteratorGetNext_1", 1)])
 
 
-class MapVectorizationBenchmark(test.Benchmark):
-  # TODO(rachelim): Add a benchmark for more expensive transformations, such as
-  # vgg_preprocessing.
-
-  def _run(self, x, num_iters=100, name=None):
-    deltas = []
-    with session.Session() as sess:
-      for _ in range(5):
-        # Warm up session...
-        sess.run(x)
-      for _ in range(num_iters):
-        start = time.time()
-        sess.run(x)
-        end = time.time()
-        deltas.append(end - start)
-    median_time = np.median(deltas)
-    self.report_benchmark(iters=num_iters, wall_time=median_time, name=name)
-    return median_time
-
-  def _compare(self, input_dataset, map_fn, batch_size, input_size, str_id):
-    num_elems = np.prod(input_size)
-    name_template = "{}__batch_size_{}_input_size_{}_{}"
-    unoptimized = input_dataset.map(map_fn).batch(batch_size)
-    unoptimized_op = unoptimized.make_one_shot_iterator().get_next()
-
-    optimized = input_dataset.map(map_fn).batch(batch_size)
-    options = dataset_ops.Options()
-    options.experimental_map_vectorization = True
-    optimized = optimized.with_options(options)
-    optimized_op = optimized.make_one_shot_iterator().get_next()
-
-    unoptimized_time = self._run(
-        unoptimized_op,
-        name=name_template.format(str_id, batch_size, num_elems, "unoptimized"))
-    optimized_time = self._run(
-        optimized_op,
-        name=name_template.format(str_id, batch_size, num_elems, "optimized"))
-
-    print("Batch size: {}\n"
-          "Input size: {}\n"
-          "Transformation: {}\n"
-          "Speedup: {}\n".format(batch_size, input_size, str_id,
-                                 (unoptimized_time / optimized_time)))
-
-  # Known cheap functions
-  def benchmarkIdentity(self):
-    self._benchmark_helper(lambda *args: [array_ops.identity(x) for x in args],
-                           "identity")
-
-  def benchmarkAddConst(self):
-    self._benchmark_helper(lambda *args: [x + 1 for x in args], "add_const")
-
-  def benchmarkReturnConst(self):
-    self._benchmark_helper(lambda *args: [constant_op.constant(2)], "ret_const")
-
-  def benchmarkSelect(self):
-    self._benchmark_helper(lambda *args: args[0], "select")
-
-  def benchmarkCast(self):
-    self._benchmark_helper(
-        lambda *args: [math_ops.cast(x, dtypes.float64) for x in args], "cast")
-
-  def _benchmark_helper(self, map_fn, str_id):
-    input_sizes = [(10, 10, 3), (10, 100, 300)]
-    batch_size = 1000
-    for input_size in input_sizes:
-      input_dataset = dataset_ops.Dataset.from_tensor_slices(
-          (np.random.rand(*input_size), np.random.rand(*input_size))).repeat()
-      self._compare(input_dataset, map_fn, batch_size, input_size, str_id)
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py
deleted file mode 100644
index 82516356df87518ff20bb6d0f0dcdcdcc814dbe3..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_op_test.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import numpy as np
-
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class ModelDatasetTest(test_base.DatasetTestBase):
-
-  def testModelMap(self):
-    k = 1024 * 1024
-    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
-                                                np.random.rand(4 * k,
-                                                               1))).repeat()
-    dataset = dataset.map(math_ops.matmul)
-    options = dataset_ops.Options()
-    options.experimental_autotune = True
-    iterator = dataset.with_options(options).make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next.op)
-      for _ in range(100):
-        start = time.time()
-        sess.run(get_next.op)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
-  def testModelParallelMap(self):
-    k = 1024 * 1024
-    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
-                                                np.random.rand(4 * k,
-                                                               1))).repeat()
-    dataset = dataset.map(
-        math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
-    options = dataset_ops.Options()
-    options.experimental_autotune = True
-    iterator = dataset.with_options(options).make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next.op)
-      for _ in range(1000):
-        start = time.time()
-        sess.run(get_next.op)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
-  def testModelMapAndBatch(self):
-    batch_size = 16
-    k = 1024 * 1024
-    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
-                                                np.random.rand(4 * k,
-                                                               1))).repeat()
-    dataset = dataset.apply(
-        batching.map_and_batch(
-            math_ops.matmul,
-            num_parallel_calls=optimization.AUTOTUNE,
-            batch_size=batch_size))
-    options = dataset_ops.Options()
-    options.experimental_autotune = True
-    iterator = dataset.with_options(options).make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next.op)
-      for _ in range(10):
-        start = time.time()
-        sess.run(get_next.op)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
-  def testModelParallelInterleave(self):
-    k = 1024 * 1024
-    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
-                                                np.random.rand(4 * k,
-                                                               1))).repeat()
-    dataset = dataset.map(math_ops.matmul)
-    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
-        lambda _: dataset,
-        cycle_length=10,
-        num_parallel_calls=optimization.AUTOTUNE)
-    options = dataset_ops.Options()
-    options.experimental_autotune = True
-    iterator = dataset.with_options(options).make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next.op)
-      for _ in range(1000):
-        start = time.time()
-        sess.run(get_next.op)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
-  def testModelNested(self):
-    k = 1024 * 1024
-    a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1))
-    b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))
-    c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1))
-    dataset = dataset_ops.Dataset.from_tensors((a, b, c)).repeat()
-
-    def f1(a, b, c):
-      x, y = a
-      return math_ops.matmul(x, y), b, c
-
-    def f2(a, b, c):
-      x, y = b
-      return a, math_ops.matmul(x, y), c
-
-    def f3(a, b, c):
-      x, y = c
-      return a, b, math_ops.matmul(x, y)
-
-    dataset = dataset.map(f1, num_parallel_calls=optimization.AUTOTUNE)
-    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
-        lambda _: dataset, cycle_length=2)
-
-    dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
-    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
-        lambda _: dataset, cycle_length=2)
-
-    dataset = dataset.map(f3, num_parallel_calls=optimization.AUTOTUNE)
-    options = dataset_ops.Options()
-    options.experimental_autotune = True
-    iterator = dataset.with_options(options).make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next)
-      for _ in range(100):
-        start = time.time()
-        sess.run(get_next)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f0274b41f2da1add8b2361b54e5c32a5974da41
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the private `_ModelDataset` transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+# TODO(b/117581999): Add eager coverage for the following tests.
+class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def testAutotuneOption(self):
+    dataset = dataset_ops.Dataset.from_tensors(0)
+    dataset = dataset.map(lambda x: x).apply(
+        optimization.assert_next(["Model"]))
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    dataset = dataset.with_options(options)
+
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      self.assertEqual(0, self.evaluate(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
index fb0640fe9f575adfaacfc62e463c6491d1722526..8058f53eea240831545444286fb2c6aa404e240a 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the MapParallelization optimization."""
+"""Tests for the `NoopElimination` optimization."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -22,11 +22,12 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class NoopEliminationTest(test_base.DatasetTestBase):
 
   def testNoopElimination(self):
@@ -37,23 +38,10 @@ class NoopEliminationTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(5)
     dataset = dataset.apply(
         optimization.assert_next(
-            ["FiniteRepeat", "FiniteSkip", "Prefetch", "Prefetch"]))
-    dataset = dataset.repeat(some_tensor).skip(5).prefetch(0).take(-1).skip(
-        0).repeat(1).prefetch(0)
-    options = dataset_ops.Options()
-    options.experimental_noop_elimination = True
-    dataset = dataset.with_options(options)
-
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        self.assertAllEqual(result, x)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+            ["FiniteRepeat", "FiniteSkip", "Prefetch", "MemoryCacheImpl"]))
+    dataset = dataset.repeat(some_tensor).skip(5).take(-1).skip(0).repeat(
+        1).prefetch(0).prefetch(1).cache()
+    self.assertDatasetProduces(dataset, expected_output=range(5))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py
deleted file mode 100644
index 760cd8cc4ec8be7df54b013ff9e55f051c697bf2..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_op_test.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.platform import test
-
-
-class OptimizeDatasetTest(test_base.DatasetTestBase):
-
-  def testOptimizationDefault(self):
-    dataset = dataset_ops.Dataset.range(10).apply(
-        optimization.assert_next(["Map",
-                                  "Batch"])).map(lambda x: x * x).batch(10)
-    iterator = dataset.with_options(
-        dataset_ops.Options()).make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([x * x for x in range(10)], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testOptimizationFusion(self):
-    dataset = dataset_ops.Dataset.range(10).apply(
-        optimization.assert_next(
-            ["MapAndBatch"])).map(lambda x: x * x).batch(10)
-    options = dataset_ops.Options()
-    options.experimental_map_and_batch_fusion = True
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([x * x for x in range(10)], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testOptimizationStatefulFunction(self):
-    dataset = dataset_ops.Dataset.range(10).map(
-        lambda _: random_ops.random_uniform([])).batch(10)
-    options = dataset_ops.Options()
-    options.experimental_map_and_batch_fusion = True
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(get_next)
-
-  def testOptimizationLargeInputFromTensor(self):
-    input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
-    dataset = dataset_ops.Dataset.from_tensors(input_t)
-    options = dataset_ops.Options()
-    options.experimental_map_and_batch_fusion = True
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, {input_t: np.ones([512, 1024, 1025], np.int32)})
-      sess.run(get_next)
-
-  def testOptimizationLargeInputFromTensorSlices(self):
-    input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
-    dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
-    options = dataset_ops.Options()
-    options.experimental_map_and_batch_fusion = True
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, {input_t: np.ones([1, 512, 1024, 1025], np.int32)})
-      sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..230b74e9e8e0e3e26aeabe11faa84c651069c7b8
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
@@ -0,0 +1,283 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the private `_OptimizeDataset` transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import grouping
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization_options
+from tensorflow.python.data.experimental.ops import scan_ops
+from tensorflow.python.data.experimental.ops import threadpool
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+
+
+def _generate_captured_refvar_test_cases():
+  """Generates testcases.
+
+  Returns:
+    A list of tuples of (testcase_name, make_dataset_fn). make_dataset_fn takes
+    a tf.Variable as input and creates a test dataset that uses that variable.
+  """
+
+  def make_map_dataset(var):
+    return dataset_ops.Dataset.from_tensors(0).map(lambda x: x + var)
+
+  def make_flat_map_dataset(var):
+    return dataset_ops.Dataset.from_tensors(
+        0).flat_map(lambda _: dataset_ops.Dataset.from_tensors(var))
+
+  def make_filter_dataset(var):
+    return dataset_ops.Dataset.from_tensors(0).filter(lambda x: x < var)
+
+  def make_map_and_batch_dataset(var):
+
+    def map_fn(x):
+      return x + var
+
+    return dataset_ops.Dataset.from_tensors(0).apply(
+        batching.map_and_batch(map_fn, 1))
+
+  def make_group_by_reducer_dataset(var):
+    reducer = grouping.Reducer(
+        init_func=lambda _: 0,
+        reduce_func=lambda x, y: x,
+        finalize_func=lambda _: var)
+    return dataset_ops.Dataset.range(5).apply(
+        grouping.group_by_reducer(lambda x: x % 2, reducer))
+
+  def make_group_by_window_dataset(var):
+
+    def reduce_fn(key, bucket):
+      del key, bucket
+      return dataset_ops.Dataset.from_tensors(var)
+
+    return dataset_ops.Dataset.from_tensors(0).repeat(10).apply(
+        grouping.group_by_window(lambda _: 0, reduce_fn, 10))
+
+  def make_scan_dataset(var):
+    return dataset_ops.Dataset.from_tensors(0).apply(
+        scan_ops.scan(
+            0, lambda old_state, elem: (old_state + 1, elem + old_state + var)))
+
+  return [
+      # Core datasets
+      ("Map", make_map_dataset),
+      ("FlatMap", make_flat_map_dataset),
+      ("Filter", make_filter_dataset),
+      # Experimental datasets
+      ("MapAndBatch", make_map_and_batch_dataset),
+      ("GroupByReducer", make_group_by_reducer_dataset),
+      ("GroupByWindow", make_group_by_window_dataset),
+      ("Scan", make_scan_dataset)
+  ]
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def testOptimizationStatefulFunction(self):
+    dataset = dataset_ops.Dataset.range(
+        10).map(lambda _: random_ops.random_uniform([])).batch(10)
+    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    get_next = self.getNext(dataset)
+    self.evaluate(get_next())
+
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationLargeInputFromTensor(self):
+    input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
+    dataset = dataset_ops.Dataset.from_tensors(input_t)
+    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op, {input_t: np.ones([512, 1024, 1025], np.int32)})
+      self.evaluate(get_next)
+
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationLargeInputFromTensorSlices(self):
+    input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
+    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op, {input_t: np.ones([1, 512, 1024, 1025], np.int32)})
+      self.evaluate(get_next)
+
+  def testOptimizationNestedDataset(self):
+
+    def flat_map_fn(_):
+      dataset = dataset_ops.Dataset.from_tensors(0)
+      dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
+      dataset = dataset.skip(0)  # Should be removed by noop elimination
+      dataset = dataset.cache()
+      return dataset
+
+    dataset = dataset_ops.Dataset.range(1)
+    dataset = dataset.flat_map(flat_map_fn)
+    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    self.assertDatasetProduces(dataset, expected_output=[0])
+
+  def testOptimizationNestedDatasetWithModifiedRetval(self):
+
+    def flat_map_fn(_):
+      dataset = dataset_ops.Dataset.from_tensors(0)
+      dataset = dataset.apply(optimization.assert_next(["MapAndBatch"]))
+      # Should be fused by map and batch fusion
+      dataset = dataset.map(lambda x: x)
+      dataset = dataset.batch(1)
+      return dataset
+
+    dataset = dataset_ops.Dataset.range(1)
+    dataset = dataset.flat_map(flat_map_fn)
+
+    # TODO(b/120558523): We use Options instead of _OptimizeDataset directly
+    # here because of a bug with chaining _OptimizeDatasets when there are
+    # nested dataset functions
+    options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.map_and_batch_fusion = True
+    options.experimental_optimization = opt_options
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(dataset, expected_output=[[0]])
+
+  def testOptimizationThreadPoolDataset(self):
+    dataset = dataset_ops.Dataset.range(10).batch(10)
+
+    dataset = threadpool.override_threadpool(
+        dataset,
+        threadpool.PrivateThreadPool(
+            2, display_name="private_thread_pool_%d" % 2))
+
+    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[list(range(10))],
+        requires_initialization=True)
+
+  def testOptimizationNonSerializable(self):
+    dataset = dataset_ops.Dataset.from_tensors(0)
+    dataset = dataset.apply(optimization.assert_next(["FiniteSkip"]))
+    dataset = dataset.skip(0)  # Should not be removed by noop elimination
+    dataset = dataset.apply(optimization.non_serializable())
+    dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
+    dataset = dataset.skip(0)  # Should be removed by noop elimination
+    dataset = dataset.cache()
+    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    self.assertDatasetProduces(dataset, expected_output=[0])
+
+  def testOptimizationNonSerializableAsDirectInput(self):
+    """Tests that non-serializable dataset can be OptimizeDataset's input."""
+    dataset = dataset_ops.Dataset.from_tensors(0)
+    dataset = dataset.apply(optimization.non_serializable())
+    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    self.assertDatasetProduces(dataset, expected_output=[0])
+
+  @parameterized.named_parameters(_generate_captured_refvar_test_cases())
+  # Skip eager because RefVariables are not supported in eager mode.
+  def testSkipEagerOptimizationWithCapturedRefVar(self, dataset_fn):
+    """Tests that default optimizations are disabled with ref variables."""
+    variable = variable_scope.get_variable(
+        "v", initializer=0, use_resource=False)
+    assign_op = variable.assign_add(1)
+
+    unoptimized_dataset = dataset_fn(variable)
+
+    options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.noop_elimination = True
+    opt_options.map_and_batch_fusion = True
+    options.experimental_optimization = opt_options
+    optimized_dataset = unoptimized_dataset.with_options(options)
+
+    # Check that warning is logged.
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      optimized_it = optimized_dataset.make_initializable_iterator()
+
+    self.assertGreaterEqual(len(w), 1)
+    expected = ("tf.data static optimizations are not compatible with "
+                "tf.Variable. The following optimizations will be disabled: %s."
+                " To enable optimizations, use resource variables instead by "
+                "calling `tf.enable_resource_variables()` at the start of the "
+                "program." % (", ".join(opt_options._static_optimizations())))
+    self.assertTrue(any([expected in str(warning) for warning in w]))
+
+    # Check that outputs are the same in the optimized and unoptimized cases,
+    # when the variable value is changing.
+    unoptimized_it = unoptimized_dataset.make_initializable_iterator()
+    with ops.control_dependencies([assign_op]):
+      unoptimized_output = unoptimized_it.get_next()
+      optimized_output = optimized_it.get_next()
+
+    self.evaluate(variable.initializer)
+    self.evaluate((unoptimized_it.initializer, optimized_it.initializer))
+    while True:
+      try:
+        unoptimized, optimized = self.evaluate((unoptimized_output,
+                                                optimized_output))
+        self.assertEqual(unoptimized, optimized)
+      except errors.OutOfRangeError:
+        break
+
+  def testOptimizationEnabledByDefault(self):
+    """Tests that some optimizations are applied to datasets by default."""
+    options = dataset_ops.Options()
+    expected_optimizations = [
+        "map_and_batch_fusion",
+        "noop_elimination",
+        "shuffle_and_repeat_fusion",
+    ]
+    self.assertEqual(
+        set(options._static_optimizations()), set(expected_optimizations))
+
+  def testOptimizationDisableDefault(self):
+    """Tests that we can disable all static optimizations enabled by default.
+
+    If the `apply_default_optimizations` optimization options flag is False,
+    only explicitly enabled optimizations will be applied.
+    """
+    options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.hoist_random_uniform = True
+    opt_options.apply_default_optimizations = False
+    options.experimental_optimization = opt_options
+    expected_optimizations = ["hoist_random_uniform"]
+    self.assertEqual(options._static_optimizations(), expected_optimizations)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..594b59375febbba6c939dc5429ff59fe9c971a5f
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
@@ -0,0 +1,48 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the `ShuffleAndRepeatFusion` optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ShuffleAndRepeatFusionTest(test_base.DatasetTestBase):
+
+  def testShuffleAndRepeatFusion(self):
+    dataset = dataset_ops.Dataset.range(10).apply(
+        optimization.assert_next(["ShuffleAndRepeat"])).shuffle(10).repeat(2)
+    get_next = self.getNext(dataset)
+
+    for _ in range(2):
+      results = []
+      for _ in range(10):
+        results.append(self.evaluate(get_next()))
+      self.assertAllEqual([x for x in range(10)], sorted(results))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
index 5e419a9b2f9e9debef63446263dc51b5c079a495..aa81663a188cfee738acaedfd44e239909a4215e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
@@ -22,12 +22,15 @@ import threading
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.data.experimental.ops import threading_options
 from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
@@ -35,18 +38,7 @@ from tensorflow.python.platform import test
 class OverrideThreadpoolTest(test_base.DatasetTestBase,
                              parameterized.TestCase):
 
-  @parameterized.named_parameters(
-      ("1", 1, None),
-      ("2", 2, None),
-      ("3", 4, None),
-      ("4", 8, None),
-      ("5", 16, None),
-      ("6", 4, -1),
-      ("7", 4, 0),
-      ("8", 4, 1),
-      ("9", 4, 4),
-  )
-  def testNumThreads(self, num_threads, max_intra_op_parallelism):
+  def _testNumThreadsHelper(self, num_threads, override_threadpool_fn):
 
     def get_thread_id(_):
       # Python creates a dummy thread object to represent the current
@@ -60,32 +52,86 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
         dataset_ops.Dataset.range(1000).map(
             lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
             num_parallel_calls=32).apply(unique.unique()))
-
-    dataset = threadpool.override_threadpool(
-        dataset,
-        threadpool.PrivateThreadPool(
-            num_threads,
-            max_intra_op_parallelism=max_intra_op_parallelism,
-            display_name="private_thread_pool_%d" % num_threads))
-
-    iterator = dataset.make_initializable_iterator()
+    dataset = override_threadpool_fn(dataset)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      thread_ids = []
-      try:
-        while True:
-          thread_ids.append(sess.run(next_element))
-      except errors.OutOfRangeError:
-        pass
-      self.assertEqual(len(thread_ids), len(set(thread_ids)))
-      self.assertGreater(len(thread_ids), 0)
+    self.evaluate(iterator.initializer)
+    thread_ids = []
+    try:
+      while True:
+        thread_ids.append(self.evaluate(next_element))
+    except errors.OutOfRangeError:
+      pass
+    self.assertLen(thread_ids, len(set(thread_ids)))
+    self.assertNotEmpty(thread_ids)
+    if num_threads:
       # NOTE(mrry): We don't control the thread pool scheduling, and
       # so cannot guarantee that all of the threads in the pool will
       # perform work.
       self.assertLessEqual(len(thread_ids), num_threads)
 
+  @parameterized.named_parameters(
+      ("1", 1, None),
+      ("2", 2, None),
+      ("3", 4, None),
+      ("4", 8, None),
+      ("5", 16, None),
+      ("6", 4, -1),
+      ("7", 4, 0),
+      ("8", 4, 1),
+      ("9", 4, 4),
+  )
+  @test_util.run_deprecated_v1
+  def testNumThreadsDeprecated(self, num_threads, max_intra_op_parallelism):
+
+    def override_threadpool_fn(dataset):
+      return threadpool.override_threadpool(
+          dataset,
+          threadpool.PrivateThreadPool(
+              num_threads,
+              max_intra_op_parallelism=max_intra_op_parallelism,
+              display_name="private_thread_pool_%d" % num_threads))
+
+    self._testNumThreadsHelper(num_threads, override_threadpool_fn)
+
+  @parameterized.named_parameters(
+      ("1", 1, None),
+      ("2", 2, None),
+      ("3", 4, None),
+      ("4", 8, None),
+      ("5", 16, None),
+      ("6", None, 0),
+      ("7", None, 1),
+      ("8", None, 4),
+      ("9", 4, 0),
+      ("10", 4, 1),
+      ("11", 4, 4),
+      ("12", None, None),
+  )
+  @test_util.run_deprecated_v1
+  def testNumThreads(self, num_threads, max_intra_op_parallelism):
+
+    def override_threadpool_fn(dataset):
+      t_options = threading_options.ThreadingOptions()
+      if max_intra_op_parallelism is not None:
+        t_options.max_intra_op_parallelism = max_intra_op_parallelism
+      if num_threads is not None:
+        t_options.private_threadpool_size = num_threads
+      options = dataset_ops.Options()
+      options.experimental_threading = t_options
+      return dataset.with_options(options)
+
+    self._testNumThreadsHelper(num_threads, override_threadpool_fn)
+
+  def testMaxIntraOpParallelismAsGraphDefInternal(self):
+    dataset = dataset_ops.Dataset.from_tensors(0)
+    dataset = dataset_ops._MaxIntraOpParallelismDataset(dataset, 1)
+    graph = graph_pb2.GraphDef().FromString(
+        self.evaluate(dataset._as_serialized_graph()))
+    self.assertTrue(
+        any([node.op != "MaxIntraOpParallelismDataset" for node in graph.node]))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
index 90ac250df70bfac8c0d73836391900cf83a603e5..113326c028a53be5b6aa3889ace5013fc08843a4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
@@ -86,7 +86,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                                                self.block_length, self.sloppy,
                                                self.buffer_output_elements,
                                                self.prefetch_input_elements)))
-    self.iterator = self.dataset.make_initializable_iterator()
+    self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
     self.init_op = self.iterator.initializer
     self.next_element = self.iterator.get_next()
 
@@ -195,9 +195,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 1):
         self.write_coordination_events[expected_element].set()
         self.assertEqual(expected_element * expected_element,
-                         sess.run(self.next_element))
+                         self.evaluate(self.next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testSingleThreaded(self):
     self._testSingleThreaded()
@@ -235,10 +235,10 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       for expected_element in self._interleave(
           [[3] * 3, [7] * 7, [4] * 4] * self.repeat_count, 2, 1):
         self.write_coordination_events[expected_element].set()
-        output = sess.run(self.next_element)
+        output = self.evaluate(self.next_element)
         self.assertEqual(expected_element * expected_element, output)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def _testTwoThreadsNoContention(self, sloppy=False):
     # num_threads > 1.
@@ -262,7 +262,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         self.write_coordination_events[expected_element].set()
         if done_first_event:  # First event starts the worker threads.
           self.read_coordination_events[expected_element].acquire()
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           self.read_coordination_events[expected_element].acquire()
           done_first_event = True
@@ -270,7 +270,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testTwoThreadsNoContention(self):
     self._testTwoThreadsNoContention()
@@ -309,7 +309,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         else:
           self.write_coordination_events[expected_element].set()
         time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           done_first_event = True
           self.assertTrue(
@@ -318,7 +318,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testTwoThreadsNoContentionWithRaces(self):
     self._testTwoThreadsNoContentionWithRaces()
@@ -348,7 +348,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         self.write_coordination_events[expected_element].set()
         if done_first_event:  # First event starts the worker threads.
           self.read_coordination_events[expected_element].acquire()
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           done_first_event = True
           self.read_coordination_events[expected_element].acquire()
@@ -356,7 +356,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testTwoThreadsNoContentionBlockLength(self):
     self._testTwoThreadsNoContentionBlockLength()
@@ -396,7 +396,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         else:
           self.write_coordination_events[expected_element].set()
         time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           done_first_event = True
           self.assertTrue(
@@ -405,7 +405,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testTwoThreadsNoContentionWithRacesAndBlocking(self):
     self._testTwoThreadsNoContentionWithRacesAndBlocking()
@@ -428,7 +428,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
               self.prefetch_input_elements: 0,
           })
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testEmptyInput(self):
     self._testEmptyInput()
@@ -451,7 +451,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
               self.prefetch_input_elements: 0,
           })
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testNonEmptyInputIntoEmptyOutputs(self):
     self._testNonEmptyInputIntoEmptyOutputs()
@@ -484,7 +484,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         # presence of finishing iterators.
         if done_first_event and not (sloppy and (i in race_indices)):
           self.read_coordination_events[expected_element].acquire()
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event or (sloppy and (i in race_indices)):
           done_first_event = True
           self.read_coordination_events[expected_element].acquire()
@@ -520,10 +520,10 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       ]
       for element in mis_ordering:
         self.write_coordination_events[element].set()
-        self.assertEqual(element * element, sess.run(self.next_element))
+        self.assertEqual(element * element, self.evaluate(self.next_element))
         self.assertTrue(self.read_coordination_events[element].acquire(False))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testBlockLengthWithContentionSloppy(self):
     with self.cached_session() as sess:
@@ -549,7 +549,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         self.write_coordination_events[expected_element].set()
         if done_first_event:  # First event starts the worker threads.
           self.read_coordination_events[expected_element].acquire()
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           self.read_coordination_events[expected_element].acquire()
           done_first_event = True
@@ -557,7 +557,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def _testEarlyExit(self, sloppy=False):
     # Exiting without consuming all input should not block
@@ -575,7 +575,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           })
       for i in range(4, 7):
         self.write_coordination_events[i].set()
-      elem = sess.run(self.next_element)  # Start all workers
+      elem = self.evaluate(self.next_element)  # Start all workers
       # Allow the one successful worker to progress beyond the py_func again.
       elem = int(math.sqrt(elem))
       self.write_coordination_events[elem].set()
@@ -603,12 +603,12 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     dataset = dataset.apply(
         interleave_ops.parallel_interleave(
             interleave_fn, cycle_length=16, block_length=2, sloppy=sloppy))
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     with self.cached_session() as sess:
       output_values = []
       for _ in range(30):
-        output_values.append(sess.run(iterator.get_next()))
+        output_values.append(self.evaluate(iterator.get_next()))
 
     expected_values = self._interleave(
         [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2)
@@ -630,20 +630,19 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
 
     dataset = dataset_ops.Dataset.range(10).map(_map_fn)
-    iterator = dataset.apply(
-        interleave_ops.parallel_interleave(
-            _interleave_fn, cycle_length=1)).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset.apply(
+        interleave_ops.parallel_interleave(_interleave_fn, cycle_length=1)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
         for j in range(2):
           expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
+          self.assertAllEqual(expected, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   def testErrorsInOutputFn(self):
     with self.cached_session() as sess:
@@ -668,15 +667,15 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           self.error = ValueError()
           self.write_coordination_events[expected_element].set()
           with self.assertRaises(errors.InvalidArgumentError):
-            sess.run(self.next_element)
+            self.evaluate(self.next_element)
         else:
           self.write_coordination_events[expected_element].set()
-          actual_element = sess.run(self.next_element)
+          actual_element = self.evaluate(self.next_element)
           self.assertEqual(expected_element * expected_element, actual_element,
                            "At index %s: %s expected, got: %s" %
                            (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testErrorsInInputFn(self):
 
@@ -701,7 +700,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                                                self.buffer_output_elements,
                                                self.prefetch_input_elements)))
 
-    self.iterator = self.dataset.make_initializable_iterator()
+    self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
     self.init_op = self.iterator.initializer
     self.next_element = self.iterator.get_next()
 
@@ -720,14 +719,14 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
         if expected_element == 5:
           with self.assertRaises(errors.InvalidArgumentError):
-            sess.run(self.next_element)
+            self.evaluate(self.next_element)
         else:
-          actual_element = sess.run(self.next_element)
+          actual_element = self.evaluate(self.next_element)
           self.assertEqual(expected_element, actual_element,
                            "At index %s: %s expected, got: %s" %
                            (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testErrorsInInterleaveFn(self):
 
@@ -750,7 +749,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                                                self.buffer_output_elements,
                                                self.prefetch_input_elements)))
 
-    self.iterator = self.dataset.make_initializable_iterator()
+    self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
     self.init_op = self.iterator.initializer
     self.next_element = self.iterator.get_next()
 
@@ -769,14 +768,14 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
         if expected_element == 5:
           with self.assertRaises(errors.InvalidArgumentError):
-            sess.run(self.next_element)
+            self.evaluate(self.next_element)
         else:
-          actual_element = sess.run(self.next_element)
+          actual_element = self.evaluate(self.next_element)
           self.assertEqual(expected_element, actual_element,
                            "At index %s: %s expected, got: %s" %
                            (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testShutdownRace(self):
     dataset = dataset_ops.Dataset.range(20)
@@ -789,17 +788,17 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
             buffer_output_elements=1,
             prefetch_input_elements=0))
     dataset = dataset.batch(32)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     results = []
     with self.cached_session() as sess:
       for _ in range(2):
         elements = []
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         try:
           while True:
-            elements.extend(sess.run(next_element))
+            elements.extend(self.evaluate(next_element))
         except errors.OutOfRangeError:
           pass
         results.append(elements)
diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
index 723e709ae8dbb9ad2d27e77780c3da4a5e95c7d0..76e0d4d72a6d22f24da9c762770d1592ba67b737 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
@@ -27,14 +27,13 @@ from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.experimental.ops import parsing_ops as contrib_parsing_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging
 
 # Helpers for creating Example objects
 example = example_pb2.Example
@@ -49,70 +48,63 @@ feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d)
 sequence_example = example_pb2.SequenceExample
 
 
-def _compare_output_to_expected(tester, dict_tensors, expected_tensors,
-                                flat_output):
-  tester.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys()))
-
-  i = 0  # Index into the flattened output of session.run()
-  for k, v in sorted(dict_tensors.items()):
-    # TODO(shivaniagrawal): flat_output is same as v.
-    expected_v = expected_tensors[k]
-    tf_logging.info("Comparing key: %s", k)
-    print("i", i, "flat_output", flat_output[i], "expected_v", expected_v)
-    if sparse_tensor.is_sparse(v):
-      # Three outputs for SparseTensor : indices, values, shape.
-      tester.assertEqual([k, len(expected_v)], [k, 3])
-      print("i", i, "flat_output", flat_output[i].indices, "expected_v",
-            expected_v[0])
-      tester.assertAllEqual(expected_v[0], flat_output[i].indices)
-      tester.assertAllEqual(expected_v[1], flat_output[i].values)
-      tester.assertAllEqual(expected_v[2], flat_output[i].dense_shape)
-    else:
-      # One output for standard Tensor.
-      tester.assertAllEqual(expected_v, flat_output[i])
-    i += 1
+@test_util.run_all_in_graph_and_eager_modes
+class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
+  def _compare_output_to_expected(self, dict_tensors, expected_tensors):
+    self.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys()))
 
-class ParseExampleDatasetTest(test_base.DatasetTestBase):
+    for k, v in sorted(dict_tensors.items()):
+      expected_v = expected_tensors[k]
+      if sparse_tensor.is_sparse(v):
+        self.assertSparseValuesEqual(expected_v, v)
+      else:
+        # One output for standard Tensor.
+        self.assertAllEqual(expected_v, v)
 
   def _test(self,
             input_tensor,
             feature_val,
             expected_values=None,
-            expected_err=None):
-
-    with self.cached_session() as sess:
-      if expected_err:
-        with self.assertRaisesWithPredicateMatch(expected_err[0],
-                                                 expected_err[1]):
-          dataset = dataset_ops.Dataset.from_tensors(input_tensor).apply(
-              contrib_parsing_ops.parse_example_dataset(feature_val))
-          get_next = dataset.make_one_shot_iterator().get_next()
-          sess.run(get_next)
-        return
-      else:
-        # Returns dict w/ Tensors and SparseTensors.
-        # Check values.
+            expected_err=None,
+            create_iterator_twice=False):
+
+    if expected_err:
+      with self.assertRaisesWithPredicateMatch(expected_err[0],
+                                               expected_err[1]):
         dataset = dataset_ops.Dataset.from_tensors(input_tensor).apply(
             contrib_parsing_ops.parse_example_dataset(feature_val))
-        get_next = dataset.make_one_shot_iterator().get_next()
-        result = sess.run(get_next)
-        flattened = nest.flatten(result)
-        print("result", result, "expected_values", expected_values)
-        _compare_output_to_expected(self, result, expected_values, flattened)
-
-      # Check shapes; if serialized is a Tensor we need its size to
-      # properly check.
-      batch_size = (
-          input_tensor.eval().size if isinstance(input_tensor, ops.Tensor) else
-          np.asarray(input_tensor).size)
-      for k, f in feature_val.items():
-        print("output_shapes as list ",
-              tuple(dataset.output_shapes[k].as_list()))
-        if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
-          self.assertEqual(dataset.output_shapes[k].as_list()[0], batch_size)
-        elif isinstance(f, parsing_ops.VarLenFeature):
-          self.assertEqual(dataset.output_shapes[k].as_list()[1], None)
+        get_next = self.getNext(dataset)
+        self.evaluate(get_next())
+      return
+    else:
+      # Returns dict w/ Tensors and SparseTensors.
+      # Check values.
+      dataset = dataset_ops.Dataset.from_tensors(input_tensor).apply(
+          contrib_parsing_ops.parse_example_dataset(feature_val))
+      get_next = self.getNext(dataset)
+      result = self.evaluate(get_next())
+      self._compare_output_to_expected(result, expected_values)
+      with self.assertRaises(errors_impl.OutOfRangeError):
+        self.evaluate(get_next())
+      with self.assertRaises(errors_impl.OutOfRangeError):
+        self.evaluate(get_next())
+      if create_iterator_twice:
+        get_next = self.getNext(dataset)
+        result = self.evaluate(get_next())
+        self._compare_output_to_expected(result, expected_values)
+        with self.assertRaises(errors_impl.OutOfRangeError):
+          self.evaluate(get_next())
+    # Check shapes; if serialized is a Tensor we need its size to
+    # properly check.
+    batch_size = (
+        self.evaluate(input_tensor).size if isinstance(input_tensor, ops.Tensor)
+        else np.asarray(input_tensor).size)
+    for k, f in feature_val.items():
+      if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
+        self.assertEqual(dataset.output_shapes[k].as_list()[0], batch_size)
+      elif isinstance(f, parsing_ops.VarLenFeature):
+        self.assertEqual(dataset.output_shapes[k].as_list()[1], None)
 
   def testEmptySerializedWithAllDefaults(self):
     sparse_name = "st_a"
@@ -123,13 +115,10 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
     b_default = np.random.rand(3, 3).astype(bytes)
     c_default = np.random.rand(2).astype(np.float32)
 
-    expected_st_a = (  # indices, values, shape
-        np.empty(
-            (0, 2), dtype=np.int64),  # indices
-        np.empty(
-            (0,), dtype=np.int64),  # sp_a is DT_INT64
-        np.array(
-            [2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
+    expected_st_a = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.empty((0, 2), dtype=np.int64),  # indices
+        np.empty((0,), dtype=np.int64),  # sp_a is DT_INT64
+        np.array([2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
 
     expected_output = {
         sparse_name: expected_st_a,
@@ -152,8 +141,10 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
                 parsing_ops.FixedLenFeature(
                     (2,), dtypes.float32, default_value=c_default),
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
+  @test_util.run_deprecated_v1
   def testEmptySerializedWithoutDefaultsShouldFail(self):
     input_features = {
         "st_a":
@@ -187,6 +178,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_err=(errors_impl.InvalidArgumentError,
                       "Feature: c \\(data type: float\\) is required"))
 
+  @test_util.run_deprecated_v1
   def testDenseNotMatchingShapeShouldFail(self):
     original = [
         example(features=features({
@@ -233,17 +225,14 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
     serialized = [m.SerializeToString() for m in original]
 
-    expected_st_c = (  # indices, values, shape
-        np.array(
-            [[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64), np.array(
-                [3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32), np.array(
-                    [4, 3], dtype=np.int64))  # batch == 2, max_elems = 3
+    expected_st_c = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 0], [0, 1], [3, 0], [3, 1], [3, 2]], dtype=np.int64),
+        np.array([3.0, 4.0, 1.0, 2.0, -1.0], dtype=np.float32),
+        np.array([4, 3], dtype=np.int64))  # batch == 2, max_elems = 3
 
-    expected_st_d = (  # indices, values, shape
-        np.array(
-            [[3, 0]], dtype=np.int64), np.array(
-                ["hi"], dtype=bytes), np.array(
-                    [4, 1], dtype=np.int64))  # batch == 2, max_elems = 1
+    expected_st_d = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[3, 0]], dtype=np.int64), np.array(["hi"], dtype=bytes),
+        np.array([4, 1], dtype=np.int64))  # batch == 2, max_elems = 1
 
     expected_output = {
         "st_c": expected_st_c,
@@ -255,7 +244,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
             "st_c": parsing_ops.VarLenFeature(dtypes.float32),
             "st_d": parsing_ops.VarLenFeature(dtypes.string)
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
   def testSerializedContainingSparseFeature(self):
     original = [
@@ -280,19 +270,18 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
     serialized = [m.SerializeToString() for m in original]
 
-    expected_sp = (  # indices, values, shape
-        np.array(
-            [[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64),
-        np.array(
-            [3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32), np.array(
-                [4, 13], dtype=np.int64))  # batch == 4, max_elems = 13
+    expected_sp = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 5], [0, 10], [3, 0], [3, 3], [3, 9]], dtype=np.int64),
+        np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32),
+        np.array([4, 13], dtype=np.int64))  # batch == 4, max_elems = 13
 
     expected_output = {"sp": expected_sp,}
 
     self._test(
         ops.convert_to_tensor(serialized),
         {"sp": parsing_ops.SparseFeature(["idx"], "val", dtypes.float32, [13])},
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
   def testSerializedContainingSparseFeatureReuse(self):
     original = [
@@ -309,17 +298,15 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
     serialized = [m.SerializeToString() for m in original]
 
-    expected_sp1 = (  # indices, values, shape
-        np.array(
-            [[0, 5], [0, 10]], dtype=np.int64), np.array(
-                [3.0, 4.0], dtype=np.float32), np.array(
-                    [2, 13], dtype=np.int64))  # batch == 2, max_elems = 13
+    expected_sp1 = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 5], [0, 10]], dtype=np.int64),
+        np.array([3.0, 4.0], dtype=np.float32),
+        np.array([2, 13], dtype=np.int64))  # batch == 2, max_elems = 13
 
-    expected_sp2 = (  # indices, values, shape
-        np.array(
-            [[0, 5], [0, 10]], dtype=np.int64), np.array(
-                [5.0, 6.0], dtype=np.float32), np.array(
-                    [2, 7], dtype=np.int64))  # batch == 2, max_elems = 13
+    expected_sp2 = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 5], [0, 10]], dtype=np.int64),
+        np.array([5.0, 6.0], dtype=np.float32),
+        np.array([2, 7], dtype=np.int64))  # batch == 2, max_elems = 13
 
     expected_output = {
         "sp1": expected_sp1,
@@ -334,7 +321,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
                 parsing_ops.SparseFeature(
                     "idx", "val2", dtypes.float32, size=7, already_sorted=True)
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
   def testSerializedContaining3DSparseFeature(self):
     original = [
@@ -361,11 +349,10 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
 
     serialized = [m.SerializeToString() for m in original]
 
-    expected_sp = (
+    expected_sp = sparse_tensor.SparseTensorValue(
         # indices
-        np.array(
-            [[0, 5, 0], [0, 10, 2], [3, 0, 1], [3, 3, 2], [3, 9, 0]],
-            dtype=np.int64),
+        np.array([[0, 5, 0], [0, 10, 2], [3, 0, 1], [3, 3, 2], [3, 9, 0]],
+                 dtype=np.int64),
         # values
         np.array([3.0, 4.0, 1.0, -1.0, 2.0], dtype=np.float32),
         # shape batch == 4, max_elems = 13
@@ -379,7 +366,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
                 parsing_ops.SparseFeature(["idx0", "idx1"], "val",
                                           dtypes.float32, [13, 3])
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
   def testSerializedContainingDense(self):
     aname = "a"
@@ -413,7 +401,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
             bname:
                 parsing_ops.FixedLenFeature((1, 1, 1, 1), dtype=dtypes.string),
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
   # This test is identical as the previous one except
   # for the creation of 'serialized'.
@@ -459,7 +448,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
             bname:
                 parsing_ops.FixedLenFeature((1, 1, 1, 1), dtype=dtypes.string),
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
   def testSerializedContainingDenseScalar(self):
     original = [
@@ -482,7 +472,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
                 parsing_ops.FixedLenFeature(
                     (1,), dtype=dtypes.float32, default_value=-1),
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
   def testSerializedContainingDenseWithDefaults(self):
     original = [
@@ -519,21 +510,18 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
                 parsing_ops.FixedLenFeature(
                     (1, 1, 1, 1), dtype=dtypes.string, default_value="tmp_str"),
         },
-        expected_values=expected_output)
-
-  def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
-    expected_st_a = (  # indices, values, shape
-        np.empty(
-            (0, 2), dtype=np.int64),  # indices
-        np.empty(
-            (0,), dtype=np.int64),  # sp_a is DT_INT64
-        np.array(
-            [2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
-    expected_sp = (  # indices, values, shape
-        np.array(
-            [[0, 0], [0, 3], [1, 7]], dtype=np.int64), np.array(
-                ["a", "b", "c"], dtype="|S"), np.array(
-                    [2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
+        expected_values=expected_output,
+        create_iterator_twice=True)
+
+  def testSerializedSparseAndSparseFeatureAndDenseWithNoDefault(self):
+    expected_st_a = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.empty((0, 2), dtype=np.int64),  # indices
+        np.empty((0,), dtype=np.int64),  # sp_a is DT_INT64
+        np.array([2, 0], dtype=np.int64))  # batch == 2, max_elems = 0
+    expected_sp = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 0], [0, 3], [1, 7]], dtype=np.int64),
+        np.array(["a", "b", "c"], dtype="|S"),
+        np.array([2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
 
     original = [
         example(features=features({
@@ -577,20 +565,19 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
             "c":
                 parsing_ops.FixedLenFeature((2,), dtypes.float32),
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
-  def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
-    expected_idx = (  # indices, values, shape
-        np.array(
-            [[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64),
-        np.array([0, 3, 7, 1]), np.array(
-            [2, 2], dtype=np.int64))  # batch == 4, max_elems = 2
+  def testerializedContainingSparseAndSparseFeatureWithReuse(self):
+    expected_idx = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64),
+        np.array([0, 3, 7, 1]),
+        np.array([2, 2], dtype=np.int64))  # batch == 4, max_elems = 2
 
-    expected_sp = (  # indices, values, shape
-        np.array(
-            [[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64), np.array(
-                ["a", "b", "d", "c"], dtype="|S"), np.array(
-                    [2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
+    expected_sp = sparse_tensor.SparseTensorValue(  # indices, values, shape
+        np.array([[0, 0], [0, 3], [1, 1], [1, 7]], dtype=np.int64),
+        np.array(["a", "b", "d", "c"], dtype="|S"),
+        np.array([2, 13], dtype=np.int64))  # batch == 4, max_elems = 13
 
     original = [
         example(features=features({
@@ -616,7 +603,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
             "sp":
                 parsing_ops.SparseFeature(["idx"], "val", dtypes.string, [13]),
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
   def _testSerializedContainingVarLenDenseLargerBatch(self, batch_size):
     # During parsing, data read from the serialized proto is stored in buffers.
@@ -675,18 +663,19 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
                     allow_missing=True,
                     default_value="default"),
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
   def testSerializedContainingVarLenDenseLargerBatch(self):
     np.random.seed(3456)
     for batch_size in (1, 10, 20, 100, 256):
       self._testSerializedContainingVarLenDenseLargerBatch(batch_size)
 
-  def testSerializedContainingVarLenDense(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerSerializedShapeMismatch(self):
     aname = "a"
     bname = "b"
     cname = "c"
-    dname = "d"
     original = [
         example(features=features({
             cname: int64_feature([2]),
@@ -705,6 +694,48 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         })),
     ]
 
+    serialized = [m.SerializeToString() for m in original]
+    self._test(
+        ops.convert_to_tensor(serialized), {
+            aname:
+                parsing_ops.FixedLenSequenceFeature((2, 1),
+                                                    dtype=dtypes.float32,
+                                                    allow_missing=True,
+                                                    default_value=[]),
+            bname:
+                parsing_ops.FixedLenSequenceFeature(
+                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+        },
+        expected_err=(ValueError,
+                      "Cannot reshape a tensor with 0 elements to shape"))
+
+  @test_util.run_deprecated_v1
+  def testSerializedContainingVarLenDense(self):
+    aname = "a"
+    bname = "b"
+    cname = "c"
+    dname = "d"
+    original = [
+        example(features=features({
+            cname: int64_feature([2]),
+        })),
+        example(
+            features=features({
+                aname: float_feature([1, 1]),
+                bname: bytes_feature([b"b0_str", b"b1_str"]),
+            })),
+        example(
+            features=features({
+                aname: float_feature([-1, -1, 2, 2]),
+                bname: bytes_feature([b"b1"]),
+            })),
+        example(
+            features=features({
+                aname: float_feature([]),
+                cname: int64_feature([3]),
+            })),
+    ]
+
     serialized = [m.SerializeToString() for m in original]
 
     expected_output = {
@@ -742,7 +773,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
                 parsing_ops.FixedLenSequenceFeature(
                     shape=[], dtype=dtypes.string, allow_missing=True),
         },
-        expected_values=expected_output)
+        expected_values=expected_output,
+        create_iterator_twice=True)
 
     # Test with padding values.
     expected_output_custom_padding = dict(expected_output)
@@ -789,21 +821,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
             errors_impl.OpError, "Key: b, Index: 2.  "
             "Number of bytes values is not a multiple of stride length."))
 
-    self._test(
-        ops.convert_to_tensor(serialized), {
-            aname:
-                parsing_ops.FixedLenSequenceFeature(
-                    (2, 1),
-                    dtype=dtypes.float32,
-                    allow_missing=True,
-                    default_value=[]),
-            bname:
-                parsing_ops.FixedLenSequenceFeature(
-                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
-        },
-        expected_err=(ValueError,
-                      "Cannot reshape a tensor with 0 elements to shape"))
-
     self._test(
         ops.convert_to_tensor(serialized), {
             aname:
diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
index f73725366c46e1b0dca88e3d1b09147a23966eaf..80bd43e9adee52afefc6a6c9866bab671aa4a731 100644
--- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
@@ -31,17 +31,15 @@ from tensorflow.python.platform import test
 
 class PrefetchToDeviceTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testPrefetchToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -50,29 +48,26 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testPrefetchToSameDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device(
             "/job:localhost/replica:0/task:0/device:CPU:0"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -81,27 +76,24 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testPrefetchDictToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -110,17 +102,17 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element["a"].dtype)
     self.assertEqual([], next_element["a"].shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
+        self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testPrefetchSparseTensorsToDevice(self):
     def make_tensor(i):
       return sparse_tensor.SparseTensorValue(
@@ -130,12 +122,9 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -144,18 +133,17 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element.dtype)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        actual = sess.run(next_element)
+        actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
         self.assertAllEqual([[0, 0]], actual.indices)
         self.assertAllEqual([2, 2], actual.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testPrefetchToDeviceGpu(self):
     if not test_util.is_gpu_available():
@@ -165,26 +153,26 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/gpu:0"))
 
-    iterator = device_dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_initializable_iterator(device_dataset)
     next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testPrefetchToDeviceWithReInit(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -193,20 +181,19 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
+    with self.test_session(config=worker_config):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testPrefetchToDeviceGpuWithReInit(self):
     if not test_util.is_gpu_available():
@@ -216,18 +203,19 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/gpu:0"))
 
-    iterator = device_dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(device_dataset)
     next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
index fe0b3b5f3b80acbc69b6dd4285102df76e039b98..77df8310d439b458c691ccbfb1d6015859c7d015 100644
--- a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
@@ -64,7 +64,7 @@ class FixedLengthRecordDatasetTestBase(test_base.DatasetTestBase):
 
 
 class MakeBatchedFeaturesDatasetTestBase(test_base.DatasetTestBase):
-  """Base class for setting up and testing `make_batched_feature_dataset`."""
+  """Base class for setting up and testing `make_batched_features_dataset`."""
 
   def setUp(self):
     super(MakeBatchedFeaturesDatasetTestBase, self).setUp()
diff --git a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
index 4c879dbae68b358e46b2546dc61befe060df4daa..76f68f50c8188e58affc353e62b7ff8c952c4955 100644
--- a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
@@ -47,7 +48,7 @@ def _time_resampling(
           initial_dist=init_dist,
           seed=142))
 
-  get_next = dataset.make_one_shot_iterator().get_next()
+  get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
   with test_obj.test_session() as sess:
     start_time = time.time()
@@ -63,6 +64,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
   @parameterized.named_parameters(
       ("InitialDistributionKnown", True),
       ("InitialDistributionUnknown", False))
+  @test_util.run_deprecated_v1
   def testDistribution(self, initial_known):
     classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
     target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
@@ -71,12 +73,12 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
         200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()
 
-    get_next = dataset.apply(
+    get_next = dataset_ops.make_one_shot_iterator(dataset.apply(
         resampling.rejection_resample(
             target_dist=target_dist,
             initial_dist=initial_dist,
             class_func=lambda c, _: c,
-            seed=27)).make_one_shot_iterator().get_next()
+            seed=27))).get_next()
 
     with self.cached_session() as sess:
       returned = []
@@ -97,6 +99,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
   @parameterized.named_parameters(
       ("OnlyInitial", True),
       ("NotInitial", False))
+  @test_util.run_deprecated_v1
   def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
     init_dist = [0.5, 0.5]
     target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0]
@@ -114,7 +117,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
             target_dist=target_dist,
             initial_dist=init_dist))
 
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       returned = []
@@ -122,6 +125,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
         while True:
           returned.append(sess.run(get_next))
 
+  @test_util.run_deprecated_v1
   def testRandomClasses(self):
     init_dist = [0.25, 0.25, 0.25, 0.25]
     target_dist = [0.0, 0.0, 0.0, 1.0]
@@ -145,7 +149,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
             target_dist=target_dist,
             initial_dist=init_dist))
 
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       returned = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
index 516e489d043ccec513267ae3d51b639540a4fcd6..658e6120cf9e30d7f79e542c8df726d997b1abb9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
@@ -22,12 +22,14 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class RestructuredDatasetTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testRestructureDataset(self):
     components = (array_ops.placeholder(dtypes.int32),
                   (array_ops.placeholder(dtypes.int32, shape=[None]),
diff --git a/tensorflow/python/data/experimental/kernel_tests/scan_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
index 0730455431f9a3faaeb22b62f59d45c04d07c208..bd974b21e301806e5282c8970e091df684c85144 100644
--- a/tensorflow/python/data/experimental/kernel_tests/scan_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
@@ -40,6 +41,7 @@ class ScanTest(test_base.DatasetTestBase):
     return dataset_ops.Dataset.from_tensors(0).repeat().apply(
         scan_ops.scan(start, scan_fn))
 
+  @test_util.run_deprecated_v1
   def testCount(self):
     def make_scan_fn(step):
       return lambda state, _: (state + step, state)
@@ -47,8 +49,8 @@ class ScanTest(test_base.DatasetTestBase):
     start = array_ops.placeholder(dtypes.int32, shape=[])
     step = array_ops.placeholder(dtypes.int32, shape=[])
     take = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = self._counting_dataset(
-        start, make_scan_fn(step)).take(take).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(self._counting_dataset(
+        start, make_scan_fn(step)).take(take))
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -60,15 +62,15 @@ class ScanTest(test_base.DatasetTestBase):
                  feed_dict={start: start_val, step: step_val, take: take_val})
         for expected, _ in zip(
             itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, sess.run(next_element))
+          self.assertEqual(expected, self.evaluate(next_element))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
 
   @test_util.run_in_graph_and_eager_modes
   def testFibonacci(self):
-    iterator = dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
-        scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1]))
-    ).make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
+            scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1]))))
 
     if context.executing_eagerly():
       next_element = iterator.get_next
@@ -83,6 +85,7 @@ class ScanTest(test_base.DatasetTestBase):
     self.assertEqual(5, self.evaluate(next_element()))
     self.assertEqual(8, self.evaluate(next_element()))
 
+  @test_util.run_deprecated_v1
   def testSparseCount(self):
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
@@ -96,9 +99,8 @@ class ScanTest(test_base.DatasetTestBase):
     start = array_ops.placeholder(dtypes.int32, shape=[])
     step = array_ops.placeholder(dtypes.int32, shape=[])
     take = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = self._counting_dataset(
-        _sparse(start),
-        make_scan_fn(step)).take(take).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(self._counting_dataset(
+        _sparse(start), make_scan_fn(step)).take(take))
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -110,10 +112,11 @@ class ScanTest(test_base.DatasetTestBase):
                  feed_dict={start: start_val, step: step_val, take: take_val})
         for expected, _ in zip(
             itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, sess.run(next_element).values[0])
+          self.assertEqual(expected, self.evaluate(next_element).values[0])
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testChangingStateShape(self):
     # Test the fixed-point shape invariant calculations: start with
     # initial values with known shapes, and use a scan function that
@@ -131,16 +134,16 @@ class ScanTest(test_base.DatasetTestBase):
     self.assertIs(None, dataset.output_shapes[0][1].ndims)
     self.assertEqual([], dataset.output_shapes[1].as_list())
 
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(5):
-        (longer_vector_val, larger_rank_val), _ = sess.run(next_element)
+        (longer_vector_val, larger_rank_val), _ = self.evaluate(next_element)
         self.assertAllEqual([0] * (2**i), longer_vector_val)
         self.assertAllEqual(np.array(1, ndmin=i), larger_rank_val)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testIncorrectStateType(self):
 
@@ -167,6 +170,21 @@ class ScanTest(test_base.DatasetTestBase):
       dataset.apply(
           scan_ops.scan(constant_op.constant(1, dtype=dtypes.int32), _scan_fn))
 
+  def testPreserveCardinality(self):
+
+    def scan_fn(state, val):
+
+      def py_fn(_):
+        raise StopIteration()
+
+      return state, script_ops.py_func(py_fn, [val], dtypes.int64)
+
+    dataset = dataset_ops.Dataset.from_tensors(0).apply(
+        scan_ops.scan(constant_op.constant(1), scan_fn))
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index e556b65b7c186908948acb0a0757d05ecb2bffb4..4a2e28f49649ea698e9d426d86dae4bb42cdebf9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -74,7 +74,11 @@ py_test(
     size = "small",
     srcs = ["checkpoint_input_pipeline_hook_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "no_windows",
+        "notsan",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -306,6 +310,25 @@ py_test(
     ],
 )
 
+py_test(
+    name = "numa_map_and_batch_dataset_serialization_test",
+    size = "medium",
+    srcs = ["numa_map_and_batch_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "no_windows",
+        "notap",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "map_dataset_serialization_test",
     size = "medium",
@@ -332,6 +355,23 @@ py_test(
     ],
 )
 
+py_test(
+    name = "matching_files_dataset_serialization_test",
+    size = "small",
+    srcs = ["matching_files_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:matching_files",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "optimize_dataset_serialization_test",
     size = "small",
@@ -620,6 +660,7 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/experimental/ops:stats_aggregator",
         "//tensorflow/python/data/experimental/ops:stats_ops",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
index 94393d6d4ba98eae72a29883ebef9c6d075c6fec..8cc66d0c29392b206015ad886780d854fb2b5d5c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
@@ -21,17 +21,18 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.ops import iterator_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import model_fn
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
+from tensorflow_estimator.python.estimator import estimator
+from tensorflow_estimator.python.estimator import model_fn
 
 
 class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
@@ -68,6 +69,7 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
   def _build_iterator_saver_hook(self, est):
     return iterator_ops.CheckpointInputPipelineHook(est)
 
+  @test_util.run_deprecated_v1
   def testReturnDatasetFromInputFn(self):
 
     def _input_fn():
@@ -80,6 +82,7 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
+  @test_util.run_deprecated_v1
   def testBuildIteratorInInputFn(self):
 
     def _input_fn():
@@ -94,6 +97,7 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
+  @test_util.run_deprecated_v1
   def testDoNotRestore(self):
 
     def _input_fn():
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
index 7f435b823975ad7a12661d909f37cebae67a0018..bdbd8702b7f8d315a730c5cd2b000218ea5e19be 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
@@ -23,6 +23,8 @@ import os
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -73,23 +75,39 @@ class DatasetSerializationTestBase(test.TestCase):
     Raises:
       AssertionError if any test fails.
     """
+    # NOTE: We disable all default optimizations in serialization tests in order
+    # to test the actual dataset in question.
+    options = dataset_ops.Options()
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
+
+    def ds_fn1_no_opt():
+      return ds_fn1().with_options(options)
+
     self.verify_unused_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_fully_used_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_exhausted_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_init_before_restore(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_multiple_breaks(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_reset_restored_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_restore_in_empty_graph(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     if ds_fn2:
+
+      def ds_fn2_no_opt():
+        return ds_fn2().with_options(options)
+
       self.verify_restore_in_modified_graph(
-          ds_fn1, ds_fn2, num_outputs, sparse_tensors=sparse_tensors)
+          ds_fn1_no_opt,
+          ds_fn2_no_opt,
+          num_outputs,
+          sparse_tensors=sparse_tensors)
 
   def verify_unused_iterator(self,
                              ds_fn,
@@ -578,7 +596,7 @@ class DatasetSerializationTestBase(test.TestCase):
     return np.linspace(0, num_outputs, num_samples, dtype=int)
 
   def _build_graph(self, ds_fn, sparse_tensors=False):
-    iterator = ds_fn().make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(ds_fn())
 
     saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py
index 225f6cbac01adb22383a2d682886e1f4810871c8..e3ba8ad231b5c5c534ebc632b5f6cc6bf62451ff 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py
@@ -17,8 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import sparse_tensor
@@ -35,7 +33,7 @@ class FilterDatasetSerializationTest(
 
   def testFilterCore(self):
     div = 3
-    num_outputs = np.sum([x % 3 != 2 for x in range(100)])
+    num_outputs = sum(x % 3 != 2 for x in range(100))
     self.run_core_tests(lambda: self._build_filter_range_graph(div),
                         lambda: self._build_filter_range_graph(div * 2),
                         num_outputs)
@@ -47,7 +45,7 @@ class FilterDatasetSerializationTest(
                 lambda d: d["foo"] + d["bar"])
 
   def testFilterDictCore(self):
-    num_outputs = np.sum([(x**2) % 2 == 0 for x in range(10)])
+    num_outputs = sum((x**2) % 2 == 0 for x in range(10))
     self.run_core_tests(self._build_filter_dict_graph, None, num_outputs)
 
   def _build_sparse_filter(self):
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
index 166ffa99ca02eabe8b8b30ba6f1fa8ed99d8b45c..8bfe6ce2f30e02c78f4a5b760849b92dd0a8fc65 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
@@ -22,6 +22,7 @@ import math
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -83,6 +84,19 @@ class MapAndBatchDatasetSerializationTest(
     self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
                         num_outputs_drop_remainder)
 
+  def testSparse(self):
+
+    def build_dataset():
+
+      def map_fn(i):
+        return sparse_tensor.SparseTensorValue(
+            indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+      return dataset_ops.Dataset.range(10).apply(
+          batching.map_and_batch(map_fn, 5))
+
+    self.run_core_tests(build_dataset, None, 2)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/matching_files_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/matching_files_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c026e97835ccf32d0801a5f6eb1a49d1173dffed
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/matching_files_dataset_serialization_test.py
@@ -0,0 +1,65 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the MatchingFilesDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import matching_files
+from tensorflow.python.platform import test
+
+
+class MatchingFilesDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_iterator_graph(self, test_patterns):
+    return matching_files.MatchingFilesDataset(test_patterns)
+
+  def testMatchingFilesCore(self):
+    tmp_dir = tempfile.mkdtemp()
+    width = 16
+    depth = 8
+    for i in range(width):
+      for j in range(depth):
+        new_base = os.path.join(tmp_dir, str(i),
+                                *[str(dir_name) for dir_name in range(j)])
+        if not os.path.exists(new_base):
+          os.makedirs(new_base)
+        child_files = ['a.py', 'b.pyc'] if j < depth - 1 else ['c.txt', 'd.log']
+        for f in child_files:
+          filename = os.path.join(new_base, f)
+          open(filename, 'w').close()
+
+    patterns = [
+        os.path.join(tmp_dir, os.path.join(*['**'
+                                             for _ in range(depth)]), suffix)
+        for suffix in ['*.txt', '*.log']
+    ]
+
+    num_outputs = width * len(patterns)
+    self.run_core_tests(lambda: self._build_iterator_graph(patterns),
+                        lambda: self._build_iterator_graph(patterns[0:1]),
+                        num_outputs)
+
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/numa_map_and_batch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/numa_map_and_batch_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..04aab329cd81c88c41aef565d3ea622025016564
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/numa_map_and_batch_dataset_serialization_test.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the MapAndBatchDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class MapAndBatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testNumParallelBatches(self):
+    range_size = 11
+    num_repeats = 2
+    batch_size = 5
+    total_outputs = range_size * num_repeats
+    num_outputs_drop_remainder = total_outputs // batch_size
+    num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size))
+    num_parallel_batches = 2
+
+    def build_ds(range_start, drop_remainder=False):
+
+      def _map_fn(x):
+        return math_ops.square(x)
+
+      ds = dataset_ops.Dataset.range(
+          range_start, range_start + range_size).repeat(num_repeats).apply(
+              batching.map_and_batch(
+                  map_func=_map_fn,
+                  batch_size=batch_size,
+                  num_parallel_batches=num_parallel_batches,
+                  drop_remainder=drop_remainder))
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      return ds.with_options(options)
+
+    self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15),
+                        num_outputs_keep_remainder)
+    self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
+                        num_outputs_drop_remainder)
+
+  def testNumParallelCalls(self):
+    range_size = 11
+    num_repeats = 2
+    batch_size = 5
+    total_outputs = range_size * num_repeats
+    num_outputs_drop_remainder = total_outputs // batch_size
+    num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size))
+    num_parallel_calls = 7
+
+    def build_ds(range_start, drop_remainder=False):
+
+      def _map_fn(x):
+        return math_ops.square(x)
+
+      ds = dataset_ops.Dataset.range(
+          range_start, range_start + range_size).repeat(num_repeats).apply(
+              batching.map_and_batch(
+                  map_func=_map_fn,
+                  batch_size=batch_size,
+                  num_parallel_calls=num_parallel_calls,
+                  drop_remainder=drop_remainder))
+      options = dataset_ops.Options()
+      options.experimental_numa_aware = True
+      return ds.with_options(options)
+
+    self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15),
+                        num_outputs_keep_remainder)
+    self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
+                        num_outputs_drop_remainder)
+
+
+if __name__ == "__main__":
+  test.main()
+
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
index ef99d01c73ce164265c06bdf08b76ff67a90dd89..34419a314938560818f3a9f4cdd1979a8dbb44d4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
@@ -56,8 +56,8 @@ class RangeDatasetSerializationTest(
   def testSaveRestore(self):
 
     def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -71,36 +71,36 @@ class RangeDatasetSerializationTest(
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
+        self.evaluate(init_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
 
     # Saving and restoring in same session.
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-        sess.run(restore_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
 
   def _build_range_dataset(self, start, stop):
     return dataset_ops.Dataset.range(start, stop)
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
index 88d5c896c9fd9710e41026b321daa1fc90a7c66f..12fa0989d0778a6e7734413789fbc8a00390937d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
@@ -60,9 +60,9 @@ class SerializationIntegrationTest(test.TestCase):
       init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
                                                         num_outputs)
       with self.session(graph=g) as sess:
-        sess.run(init_ops)
+        self.evaluate(init_ops)
         for _ in range(break_point):
-          output = sess.run(get_next_ops)
+          output = self.evaluate(get_next_ops)
           for i in range(num_pipelines):
             all_outputs[i].append(output[i])
         saver.save(sess, self._ckpt_path())
@@ -73,7 +73,7 @@ class SerializationIntegrationTest(test.TestCase):
       with self.session(graph=g) as sess:
         saver.restore(sess, self._ckpt_path())
         for _ in range(num_outputs - break_point):
-          output = sess.run(get_next_ops)
+          output = self.evaluate(get_next_ops)
           for i in range(num_pipelines):
             all_outputs[i].append(output[i])
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py
index a04f1ddafce2e386b04694adb81061e99c6b8abd..e753a7a15be4ea609ce69568da1c88847bdc5727 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py
@@ -138,9 +138,9 @@ class ShuffleDatasetSerializationTest(
           saver = saver_lib.Saver(allow_empty=True)
           with self.session(graph=g) as sess:
             self._save(sess, saver)
-            expected = [sess.run(get_next_ops) for _ in range(num_outputs)]
+            expected = [self.evaluate(get_next_ops) for _ in range(num_outputs)]
             self._restore(saver, sess)
-            actual = [sess.run(get_next_ops) for _ in range(num_outputs)]
+            actual = [self.evaluate(get_next_ops) for _ in range(num_outputs)]
             self.match(expected, actual)
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py
index ef7061b190473f726673e85fbd19fc8da8584052..662d768b4896f846e7d0cad078838a7c12590c04 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/stats_dataset_serialization_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.experimental.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
@@ -92,9 +93,9 @@ class StatsDatasetSerializationTest(
         None, num_outputs)
 
   def _build_dataset_stats_aggregator(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+    aggregator = stats_aggregator.StatsAggregator()
     return dataset_ops.Dataset.range(10).apply(
-        stats_ops.set_stats_aggregator(stats_aggregator))
+        stats_ops.set_stats_aggregator(aggregator))
 
   def test_set_stats_aggregator_not_support_checkpointing(self):
     with self.assertRaisesRegexp(errors.UnimplementedError,
diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
index c208963a8612228ecf9ff8b91328a2d02c0d3890..9528f83291f9e4b752a266499e9ec6d7e5239f7d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -34,16 +35,17 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
         shuffle_ops.shuffle_and_repeat(buffer_size=5, count=count, seed=seed))
 
   def _gen_outputs(self, ds_fn, num_outputs, verify_exhausted=True):
-    get_next = ds_fn().make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(ds_fn()).get_next()
     outputs = []
     with self.cached_session() as sess:
       for _ in range(num_outputs):
-        outputs.append(sess.run(get_next))
+        outputs.append(self.evaluate(get_next))
       if verify_exhausted:
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
     return outputs
 
+  @test_util.run_deprecated_v1
   def testCorrectOutput(self):
     output = self._gen_outputs(lambda: self._build_ds(10), 100)
     self.assertSequenceEqual(
@@ -52,6 +54,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
     for i in range(5):
       self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20))
 
+  @test_util.run_deprecated_v1
   def testReshuffling(self):
     # Check that the output orders of different epochs are indeed different.
     output = self._gen_outputs(lambda: self._build_ds(10), 100)
@@ -60,17 +63,20 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
       epoch2 = output[(i + 1) * 20:(i + 2) * 20]
       self.assertNotEqual(epoch1, epoch2)
 
+  @test_util.run_deprecated_v1
   def testSameOrderForSameSeeds(self):
     output1 = self._gen_outputs(lambda: self._build_ds(10), 100)
     output2 = self._gen_outputs(lambda: self._build_ds(10), 100)
     self.assertEqual(output1, output2)
 
+  @test_util.run_deprecated_v1
   def testDifferentOrderForDifferentSeeds(self):
     output1 = self._gen_outputs(lambda: self._build_ds(10), 100)
     output2 = self._gen_outputs(lambda: self._build_ds(20), 100)
     self.assertNotEqual(output1, output2)
     self.assertEqual(sorted(output1), sorted(output2))
 
+  @test_util.run_deprecated_v1
   def testCountNone(self):
     output1 = self._gen_outputs(
         lambda: self._build_ds(10, count=None), 100, verify_exhausted=False)
@@ -79,6 +85,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
     self.assertNotEqual(output1, output2)
     self.assertEqual(sorted(output1), sorted(output2))
 
+  @test_util.run_deprecated_v1
   def testCountMinusOne(self):
     output1 = self._gen_outputs(
         lambda: self._build_ds(10, count=-1), 100, verify_exhausted=False)
@@ -108,7 +115,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
           shuffle_ops.shuffle_and_repeat(buffer_size=21))
       get_next_op = ds.make_one_shot_iterator().get_next()
       with self.session(graph=g) as sess:
-        sess.run(get_next_op)
+        self.evaluate(get_next_op)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..46b22f80b6d5f918624dcc98b894fbc37e0e46bc
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.sleep()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from tensorflow.python.data.experimental.ops import sleep
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+_NUMPY_RANDOM_SEED = 42
+
+
+class SleepTest(test_base.DatasetTestBase):
+
+  @test_util.run_deprecated_v1
+  def testSleep(self):
+    sleep_microseconds = 100
+    dataset = dataset_ops.Dataset.range(10).apply(
+        sleep.sleep(sleep_microseconds))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      self.evaluate(iterator.initializer)
+      start_time = time.time()
+      for i in range(10):
+        self.assertEqual(i, self.evaluate(next_element))
+      end_time = time.time()
+      self.assertGreater(end_time - start_time, (10 * sleep_microseconds) / 1e6)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
index a2c11696387ddbf81546765734854897a279adbf..eb66927ee5c73c67325f3764d29d5c8461c05cbb 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
@@ -39,10 +39,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                             "ORDER BY first_name DESC"
             })
         for _ in range(2):  # Dataset is repeated. See setUp.
-          self.assertEqual((b"John", b"Doe", b"Hi!"), sess.run(get_next))
-          self.assertEqual((b"Jane", b"Moe", b"Hi again!"), sess.run(get_next))
+          self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
+          self.assertEqual((b"Jane", b"Moe", b"Hi again!"),
+                           self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
 
   # Test that SqlDataset works on a join query.
   def testReadResultSetJoinQuery(self):
@@ -58,9 +59,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ON students.first_name = people.first_name "
                   "AND students.last_name = people.last_name"
           })
-      self.assertEqual((b"John", b"California", b"Hi!"), sess.run(get_next))
+      self.assertEqual((b"John", b"California", b"Hi!"),
+                       self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that SqlDataset can read a database entry with a null-terminator
   # in the middle of the text and place the entry in a `string` tensor.
@@ -75,10 +77,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT first_name, last_name, favorite_nonsense_word "
                   "FROM students ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", b"Doe", b"n\0nsense"), sess.run(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"nonsense\0"), sess.run(get_next))
+      self.assertEqual((b"John", b"Doe", b"n\0nsense"), self.evaluate(get_next))
+      self.assertEqual((b"Jane", b"Moe", b"nonsense\0"),
+                       self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that SqlDataset works when used on two different queries.
   # Because the output types of the dataset must be determined at graph-creation
@@ -93,21 +96,22 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, last_name, motto FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", b"Doe", b"Hi!"), sess.run(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"Hi again!"), sess.run(get_next))
+      self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
+      self.assertEqual((b"Jane", b"Moe", b"Hi again!"), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
       sess.run(
           init_op,
           feed_dict={
               self.query: "SELECT first_name, last_name, state FROM people "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", b"Doe", b"California"), sess.run(get_next))
+      self.assertEqual((b"John", b"Doe", b"California"),
+                       self.evaluate(get_next))
       self.assertEqual((b"Benjamin", b"Franklin", b"Pennsylvania"),
-                       sess.run(get_next))
+                       self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that an `OutOfRangeError` is raised on the first call to
   # `get_next_str_only` if result set is empty.
@@ -122,7 +126,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "WHERE first_name = 'Nonexistent'"
           })
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that an error is raised when `driver_name` is invalid.
   def testReadResultSetWithInvalidDriverName(self):
@@ -151,7 +155,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ORDER BY first_name DESC"
           })
       with self.assertRaises(errors.UnknownError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that an error is raised when there is a syntax error in `query`.
   def testReadResultSetOfQueryWithSyntaxError(self):
@@ -166,7 +170,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ORDER BY first_name DESC"
           })
       with self.assertRaises(errors.UnknownError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that an error is raised when the number of columns in `query`
   # does not match the length of `output_types`.
@@ -181,7 +185,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that no results are returned when `query` is an insert query rather
   # than a select query. In particular, the error refers to the number of
@@ -199,7 +203,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "VALUES ('Foo', 'Bar', 'Baz'), ('Fizz', 'Buzz', 'Fizzbuzz')"
           })
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int8` tensor.
@@ -212,10 +216,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int8` tensor.
@@ -230,9 +234,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "FROM students "
                           "WHERE first_name = 'John' ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0, -2), sess.run(get_next))
+      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int8` tensor.
@@ -246,11 +250,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT desk_number, favorite_negative_number FROM students "
                   "ORDER BY first_name DESC"
           })
-      self.assertEqual((9, -2), sess.run(get_next))
+      self.assertEqual((9, -2), self.evaluate(get_next))
       # Max and min values of int8
-      self.assertEqual((127, -128), sess.run(get_next))
+      self.assertEqual((127, -128), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int16` tensor.
@@ -263,10 +267,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int16` tensor.
@@ -281,9 +285,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "FROM students "
                           "WHERE first_name = 'John' ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0, -2), sess.run(get_next))
+      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int16` tensor.
@@ -297,11 +301,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "FROM students ORDER BY first_name DESC"
           })
       # Max value of int16
-      self.assertEqual((b"John", 32767), sess.run(get_next))
+      self.assertEqual((b"John", 32767), self.evaluate(get_next))
       # Min value of int16
-      self.assertEqual((b"Jane", -32768), sess.run(get_next))
+      self.assertEqual((b"Jane", -32768), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int32` tensor.
@@ -314,8 +318,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int32` tensor.
@@ -328,10 +332,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, income FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0), sess.run(get_next))
-      self.assertEqual((b"Jane", -20000), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
+      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int32` tensor.
@@ -345,11 +349,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       # Max value of int32
-      self.assertEqual((b"John", 2147483647), sess.run(get_next))
+      self.assertEqual((b"John", 2147483647), self.evaluate(get_next))
       # Min value of int32
-      self.assertEqual((b"Jane", -2147483648), sess.run(get_next))
+      self.assertEqual((b"Jane", -2147483648), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a numeric `varchar` from a SQLite database
   # table and place it in an `int32` tensor.
@@ -362,10 +366,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, school_id FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 123), sess.run(get_next))
-      self.assertEqual((b"Jane", 1000), sess.run(get_next))
+      self.assertEqual((b"John", 123), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 1000), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table
   # and place it in an `int64` tensor.
@@ -378,10 +382,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int64` tensor.
@@ -394,10 +398,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, income FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0), sess.run(get_next))
-      self.assertEqual((b"Jane", -20000), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
+      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int64` tensor.
@@ -412,11 +416,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ORDER BY first_name DESC"
           })
       # Max value of int64
-      self.assertEqual((b"John", 9223372036854775807), sess.run(get_next))
+      self.assertEqual((b"John", 9223372036854775807), self.evaluate(get_next))
       # Min value of int64
-      self.assertEqual((b"Jane", -9223372036854775808), sess.run(get_next))
+      self.assertEqual((b"Jane", -9223372036854775808), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in a `uint8` tensor.
@@ -429,10 +433,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read the minimum and maximum uint8 values from a
   # SQLite database table and place them in `uint8` tensors.
@@ -446,11 +450,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       # Min value of uint8
-      self.assertEqual((b"John", 0), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
       # Max value of uint8
-      self.assertEqual((b"Jane", 255), sess.run(get_next))
+      self.assertEqual((b"Jane", 255), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table
   # and place it in a `uint16` tensor.
@@ -463,10 +467,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read the minimum and maximum uint16 values from a
   # SQLite database table and place them in `uint16` tensors.
@@ -480,11 +484,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       # Min value of uint16
-      self.assertEqual((b"John", 0), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
       # Max value of uint16
-      self.assertEqual((b"Jane", 65535), sess.run(get_next))
+      self.assertEqual((b"Jane", 65535), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a 0-valued and 1-valued integer from a
   # SQLite database table and place them as `True` and `False` respectively
@@ -499,10 +503,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT first_name, registration_complete FROM students "
                   "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", True), sess.run(get_next))
-      self.assertEqual((b"Jane", False), sess.run(get_next))
+      self.assertEqual((b"John", True), self.evaluate(get_next))
+      self.assertEqual((b"Jane", False), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer that is not 0-valued or 1-valued
   # from a SQLite database table and place it as `True` in a `bool` tensor.
@@ -515,10 +519,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, favorite_medium_sized_number "
                           "FROM students ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", True), sess.run(get_next))
-      self.assertEqual((b"Jane", True), sess.run(get_next))
+      self.assertEqual((b"John", True), self.evaluate(get_next))
+      self.assertEqual((b"Jane", True), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a float from a SQLite database table
   # and place it in a `float64` tensor.
@@ -533,10 +537,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT first_name, last_name, victories FROM townspeople "
                   "ORDER BY first_name"
           })
-      self.assertEqual((b"George", b"Washington", 20.0), sess.run(get_next))
-      self.assertEqual((b"John", b"Adams", -19.95), sess.run(get_next))
+      self.assertEqual((b"George", b"Washington", 20.0),
+                       self.evaluate(get_next))
+      self.assertEqual((b"John", b"Adams", -19.95), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a float from a SQLite database table beyond
   # the precision of 64-bit IEEE, without throwing an error. Test that
@@ -555,13 +560,13 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
       self.assertEqual(
           (b"George", b"Washington",
            1331241.321342132321324589798264627463827647382647382643874),
-          sess.run(get_next))
+          self.evaluate(get_next))
       self.assertEqual(
           (b"John", b"Adams",
            1331241321342132321324589798264627463827647382647382643874.0),
-          sess.run(get_next))
+          self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a float from a SQLite database table,
   # representing the largest integer representable as a 64-bit IEEE float
@@ -579,11 +584,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ORDER BY first_name"
           })
       self.assertNotEqual((b"George", b"Washington", 9007199254740992.0),
-                          sess.run(get_next))
+                          self.evaluate(get_next))
       self.assertNotEqual((b"John", b"Adams", 9007199254740991.0),
-                          sess.run(get_next))
+                          self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
index 6aaaa90c651ebab7ce5d98371d45a7f64831e883..809e09c80420979b84dc5e4706398f793466a059 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
@@ -24,6 +24,7 @@ import sqlite3
 
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -35,7 +36,7 @@ class SqlDatasetTestBase(test_base.DatasetTestBase):
   def _createSqlDataset(self, output_types, num_repeats=1):
     dataset = readers.SqlDataset(self.driver_name, self.data_source_name,
                                  self.query, output_types).repeat(num_repeats)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     return init_op, get_next
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index 427654cd7628b9eb134537ce92a44c0d578ac9ff..f19b08a2dde821124b6f5065eed4c825afa9f107 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -17,81 +17,113 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.experimental.ops import stats_ops
+from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+def function_set_stats_aggregator(dataset,
+                                  aggregator,
+                                  prefix="",
+                                  counter_prefix=""):
+  return dataset.apply(
+      stats_ops.set_stats_aggregator(aggregator, prefix, counter_prefix))
+
+
+def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""):
+  options = dataset_ops.Options()
+  options.experimental_stats = stats_options.StatsOptions()
+  options.experimental_stats.aggregator = aggregator
+  options.experimental_stats.prefix = prefix
+  options.experimental_stats.counter_prefix = counter_prefix
+  options.experimental_stats.latency_all_edges = False
+  return dataset.with_options(options)
+
+
+@parameterized.named_parameters(
+    ("SetStatsAggregator", function_set_stats_aggregator),
+    ("StatsOptions", function_apply_options),
+)
 class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
 
-  def testBytesProduced(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  @test_util.run_deprecated_v1
+  def testBytesProduced(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
-            stats_ops.bytes_produced_stats("bytes_produced")).apply(
-                stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator = dataset.make_initializable_iterator()
+            stats_ops.bytes_produced_stats("bytes_produced"))
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       expected_sum = 0.0
       for i in range(100):
         self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), sess.run(next_element))
-        summary_str = sess.run(summary_t)
+            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
+        summary_str = self.evaluate(summary_t)
         self._assertSummaryHasCount(summary_str, "bytes_produced", float(i + 1))
         expected_sum += i * 8.0
         self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      summary_str = sess.run(summary_t)
+        self.evaluate(next_element)
+      summary_str = self.evaluate(summary_t)
       self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
-  def testLatencyStats(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  @test_util.run_deprecated_v1
+  def testLatencyStats(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator = dataset.make_initializable_iterator()
+        stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", float(i + 1))
+            self.evaluate(summary_t), "record_latency", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+        self.evaluate(next_element)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 100.0)
 
-  def testPrefetchBufferUtilization(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  @test_util.run_deprecated_v1
+  def testPrefetchBufferUtilization(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
-        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(
-            -1).apply(stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator = dataset.make_initializable_iterator()
+        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(-1)
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
         self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), sess.run(next_element))
-        summary_str = sess.run(summary_t)
+            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
+        summary_str = self.evaluate(summary_t)
         self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
                                     float(i + 1))
         self._assertSummaryContains(summary_str, "Prefetch::buffer_capacity")
@@ -99,222 +131,331 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         self._assertSummaryHasRange(summary_str, "Prefetch::buffer_utilization",
                                     0, 1)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      summary_str = sess.run(summary_t)
+        self.evaluate(next_element)
+      summary_str = self.evaluate(summary_t)
       self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
                                   100)
 
-  def testPrefetchBufferScalars(self):
-    stats_aggregator = stats_ops.StatsAggregator()
-    dataset = dataset_ops.Dataset.range(10).map(
-        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(
-            0).apply(stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator = dataset.make_initializable_iterator()
+  @test_util.run_deprecated_v1
+  def testPrefetchBufferScalars(self, dataset_transformation):
+    def map_fn(x):
+      return array_ops.tile([x], ops.convert_to_tensor([x]))
+    aggregator = stats_aggregator.StatsAggregator()
+    dataset = dataset_ops.Dataset.range(10).map(map_fn).prefetch(1)
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(10):
         self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), sess.run(next_element))
-        summary_str = sess.run(summary_t)
+            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
+        summary_str = self.evaluate(summary_t)
         self._assertSummaryHasScalarValue(summary_str,
-                                          "Prefetch::buffer_capacity", 0)
+                                          "Prefetch::buffer_capacity", 1)
         self._assertSummaryHasScalarValue(summary_str, "Prefetch::buffer_size",
-                                          0)
+                                          1)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
-  def testFilteredElementsStats(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  @test_util.run_deprecated_v1
+  def testFilteredElementsStats(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(101).filter(
-        lambda x: math_ops.equal(math_ops.mod(x, 3), 0)).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator = dataset.make_initializable_iterator()
+        lambda x: math_ops.equal(math_ops.mod(x, 3), 0))
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(34):
-        self.assertEqual(i * 3, sess.run(next_element))
+        self.assertEqual(i * 3, self.evaluate(next_element))
         if i is not 0:
           self._assertSummaryHasScalarValue(
-              sess.run(summary_t), "Filter::dropped_elements", float(i * 2))
+              self.evaluate(summary_t), "Filter::dropped_elements",
+              float(i * 2))
         self._assertSummaryHasScalarValue(
-            sess.run(summary_t), "Filter::filtered_elements", float(i + 1))
+            self.evaluate(summary_t), "Filter::filtered_elements", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
       self._assertSummaryHasScalarValue(
-          sess.run(summary_t), "Filter::dropped_elements", 67.0)
+          self.evaluate(summary_t), "Filter::dropped_elements", 67.0)
       self._assertSummaryHasScalarValue(
-          sess.run(summary_t), "Filter::filtered_elements", 34.0)
-
-  def testReinitialize(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+          self.evaluate(summary_t), "Filter::filtered_elements", 34.0)
+
+  @test_util.run_deprecated_v1
+  def testMapBufferUtilization(self, dataset_transformation):
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(10).map(
+          lambda x: array_ops.tile([x], ops.convert_to_tensor([x])),
+          num_parallel_calls=4)
+
+    self._testParallelCallsStats(
+        dataset_fn,
+        "ParallelMap",
+        10,
+        dataset_transformation,
+        function_processing_time=True)
+
+  @test_util.run_deprecated_v1
+  def testMapAutoTuneBufferUtilization(self, dataset_transformation):
+
+    def dataset_fn():
+      dataset = dataset_ops.Dataset.range(10).map(
+          lambda x: array_ops.tile([x], ops.convert_to_tensor([x])),
+          num_parallel_calls=optimization.AUTOTUNE)
+      options = dataset_ops.Options()
+      options.experimental_autotune = True
+      return dataset.with_options(options)
+
+    self._testParallelCallsStats(
+        dataset_fn,
+        "ParallelMap",
+        10,
+        dataset_transformation,
+        function_processing_time=True)
+
+  @test_util.run_deprecated_v1
+  def testInterleaveAutoTuneBufferUtilization(self, dataset_transformation):
+
+    def dataset_fn():
+      dataset = dataset_ops.Dataset.range(10).map(
+          lambda x: array_ops.tile([x], ops.convert_to_tensor([x])))
+      dataset = dataset_ops.Dataset.range(1).interleave(
+          lambda _: dataset,
+          cycle_length=1,
+          num_parallel_calls=optimization.AUTOTUNE)
+      options = dataset_ops.Options()
+      options.experimental_autotune = True
+      return dataset.with_options(options)
+
+    self._testParallelCallsStats(dataset_fn, "ParallelInterleaveV2", 10,
+                                 dataset_transformation)
+
+  @test_util.run_deprecated_v1
+  def testMapAndBatchAutoTuneBufferUtilization(self, dataset_transformation):
+
+    def dataset_fn():
+      dataset = dataset_ops.Dataset.range(100).apply(
+          batching.map_and_batch(
+              lambda x: array_ops.tile([x], ops.convert_to_tensor([2])),
+              num_parallel_calls=optimization.AUTOTUNE,
+              batch_size=16))
+      options = dataset_ops.Options()
+      options.experimental_autotune = True
+      return dataset.with_options(options)
+
+    num_output = 100 // 16 + 1
+    self._testParallelCallsStats(
+        dataset_fn,
+        "MapAndBatch",
+        num_output,
+        dataset_transformation,
+        check_elements=False,
+        function_processing_time=True)
+
+  @test_util.run_deprecated_v1
+  def testReinitialize(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator = dataset.make_initializable_iterator()
+        stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
       for j in range(5):
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         for i in range(100):
-          self.assertEqual(i, sess.run(next_element))
+          self.assertEqual(i, self.evaluate(next_element))
           self._assertSummaryHasCount(
-              sess.run(summary_t), "record_latency", float((j * 100) + i + 1))
+              self.evaluate(summary_t), "record_latency",
+              float((j * 100) + i + 1))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", (j + 1) * 100.0)
+            self.evaluate(summary_t), "record_latency", (j + 1) * 100.0)
 
-  def testNoAggregatorRegistered(self):
+  @test_util.run_deprecated_v1
+  def testNoAggregatorRegistered(self, dataset_transformation):
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
-  def testMultipleTags(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  @test_util.run_deprecated_v1
+  def testMultipleTags(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency_2")).apply(
-                stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator = dataset.make_initializable_iterator()
+            stats_ops.latency_stats("record_latency_2"))
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", float(i + 1))
+            self.evaluate(summary_t), "record_latency", float(i + 1))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency_2", float(i + 1))
+            self.evaluate(summary_t), "record_latency_2", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+        self.evaluate(next_element)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "record_latency_2", 100.0)
+          self.evaluate(summary_t), "record_latency", 100.0)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency_2", 100.0)
 
-  def testRepeatedTags(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  @test_util.run_deprecated_v1
+  def testRepeatedTags(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.latency_stats("record_latency")).apply(
-                stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator = dataset.make_initializable_iterator()
+            stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", float(2 * (i + 1)))
+            self.evaluate(summary_t), "record_latency", float(2 * (i + 1)))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+        self.evaluate(next_element)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 200.0)
 
-  def testMultipleIteratorsSameAggregator(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  @test_util.run_deprecated_v1
+  def testMultipleIteratorsSameAggregator(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator))
-    iterator_0 = dataset.make_initializable_iterator()
-    iterator_1 = dataset.make_initializable_iterator()
+        stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator_0 = dataset_ops.make_initializable_iterator(dataset)
+    iterator_1 = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator_0.get_next() + iterator_1.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run([iterator_0.initializer, iterator_1.initializer])
+      self.evaluate([iterator_0.initializer, iterator_1.initializer])
       for i in range(100):
-        self.assertEqual(i * 2, sess.run(next_element))
+        self.assertEqual(i * 2, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", float(2 * (i + 1)))
+            self.evaluate(summary_t), "record_latency", float(2 * (i + 1)))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+        self.evaluate(next_element)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 200.0)
 
-  def testMultipleDatasetWithTags(self):
-    stats_aggregator = stats_ops.StatsAggregator()
+  @test_util.run_deprecated_v1
+  def testMultipleDatasetWithPrefixes(self, dataset_transformation):
+    aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator, "dataset1"))
+        stats_ops.latency_stats("record_latency"))
+    dataset = dataset_transformation(dataset, aggregator, prefix="dataset1")
     dataset2 = dataset_ops.Dataset.range(100).apply(
-        stats_ops.latency_stats("record_latency")).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator, "dataset2"))
-    iterator_0 = dataset.make_initializable_iterator()
-    iterator_1 = dataset2.make_initializable_iterator()
+        stats_ops.latency_stats("record_latency"))
+    dataset2 = dataset_transformation(dataset2, aggregator, prefix="dataset2")
+    iterator_0 = dataset_ops.make_initializable_iterator(dataset)
+    iterator_1 = dataset_ops.make_initializable_iterator(dataset2)
     next_element = iterator_0.get_next() + iterator_1.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator_0.initializer, iterator_1.initializer])
+      self.evaluate([iterator_0.initializer, iterator_1.initializer])
       for i in range(100):
-        self.assertEqual(i * 2, sess.run(next_element))
+        self.assertEqual(i * 2, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "dataset1_record_latency", float(i + 1))
+            self.evaluate(summary_t), "dataset1_record_latency", float(i + 1))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "dataset2_record_latency", float(i + 1))
+            self.evaluate(summary_t), "dataset2_record_latency", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "dataset1_record_latency", 100.0)
+          self.evaluate(summary_t), "dataset1_record_latency", 100.0)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "dataset2_record_latency", 100.0)
+          self.evaluate(summary_t), "dataset2_record_latency", 100.0)
 
 
+@parameterized.named_parameters(
+    dict(
+        testcase_name="SetStatsAggregator",
+        dataset_transformation=function_set_stats_aggregator),
+    dict(
+        testcase_name="StatsOptions",
+        dataset_transformation=function_apply_options))
 class FeatureStatsDatasetTest(
     stats_dataset_test_base.StatsDatasetTestBase,
     reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
 
-  def testFeaturesStats(self):
+  @test_util.run_deprecated_v1
+  def testFeaturesStats(self, dataset_transformation):
     num_epochs = 5
     total_records = num_epochs * self._num_records
     batch_size = 2
-    stats_aggregator = stats_ops.StatsAggregator()
-    dataset = self.make_batch_feature(
-        filenames=self.test_filenames[0],
-        num_epochs=num_epochs,
-        batch_size=batch_size,
-        shuffle=True,
-        shuffle_seed=5,
-        drop_final_batch=False).apply(
-            stats_ops.set_stats_aggregator(stats_aggregator, "record_stats"))
-    iterator = dataset.make_initializable_iterator()
+    aggregator = stats_aggregator.StatsAggregator()
+
+    def dataset_fn():
+      return self.make_batch_feature(
+          filenames=self.test_filenames[0],
+          num_epochs=num_epochs,
+          batch_size=batch_size,
+          shuffle=True,
+          shuffle_seed=5,
+          drop_final_batch=False)
+
+    num_output = total_records // batch_size
+    if total_records % batch_size:
+      num_output = total_records // batch_size + 1
+
+    self._testParallelCallsStats(
+        dataset_fn,
+        "ParseExample",
+        num_output,
+        dataset_transformation,
+        check_elements=False)
+
+    dataset = dataset_transformation(
+        dataset_fn(), aggregator, prefix="record_stats")
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
-    summary_t = stats_aggregator.get_summary()
+    summary_t = aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run(iterator.initializer)
-      for _ in range(total_records // batch_size + 1 if total_records %
-                     batch_size else total_records // batch_size):
-        sess.run(next_element)
+      self.evaluate(iterator.initializer)
+      for _ in range(num_output):
+        self.evaluate(next_element)
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "record_stats_features", total_records)
+          self.evaluate(summary_t), "record_stats_features", total_records)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "record_stats_feature-values", total_records)
+          self.evaluate(summary_t), "record_stats_feature-values",
+          total_records)
       self._assertSummaryHasSum(
-          sess.run(summary_t), "record_stats_features", total_records * 4)
+          self.evaluate(summary_t), "record_stats_features", total_records * 4)
       self._assertSummaryHasSum(
-          sess.run(summary_t), "record_stats_feature-values",
+          self.evaluate(summary_t), "record_stats_feature-values",
           self._sum_keywords(1) * num_epochs + 3 * total_records)
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
index 80f26259272061d4cd7862dc7606731903549835..ab1d1c3028a4ee99b99145c7296b7b0d5b8ea6b9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
@@ -17,9 +17,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
 
 from tensorflow.core.framework import summary_pb2
+from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
 
 
 class StatsDatasetTestBase(test_base.DatasetTestBase):
@@ -42,6 +46,16 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
         return
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
 
+  def _assertSummaryHasCountMoreOrEqualGeneralisedTag(self, summary_str, tag,
+                                                      expected_value):
+    summary_proto = summary_pb2.Summary()
+    summary_proto.ParseFromString(summary_str)
+    for value in summary_proto.value:
+      if tag in value.tag:
+        self.assertGreaterEqual(value.histo.num, expected_value)
+        return
+    self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
+
   def _assertSummaryHasRange(self, summary_str, tag, min_value, max_value):
     summary_proto = summary_pb2.Summary()
     summary_proto.ParseFromString(summary_str)
@@ -69,3 +83,38 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
         self.assertEqual(expected_value, value.simple_value)
         return
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
+
+  def _testParallelCallsStats(self,
+                              dataset_fn,
+                              dataset_name,
+                              num_output,
+                              dataset_transformation,
+                              function_processing_time=False,
+                              check_elements=True):
+    aggregator = stats_aggregator.StatsAggregator()
+    dataset = dataset_fn()
+    dataset = dataset_transformation(dataset, aggregator)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+    summary_t = aggregator.get_summary()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer)
+      for i in range(num_output):
+        next_ = sess.run(next_element)
+        if check_elements:
+          self.assertAllEqual(np.array([i] * i, dtype=np.int64), next_)
+        summary_str = sess.run(summary_t)
+        if function_processing_time:
+          self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
+              summary_str, "::execution_time", float(i + 1))
+        self._assertSummaryContains(summary_str,
+                                    dataset_name + "::num_parallel_calls")
+        self._assertSummaryContains(summary_str,
+                                    dataset_name + "::active_parallel_calls")
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      if function_processing_time:
+        summary_str = sess.run(summary_t)
+        self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
+            summary_str, "::execution_time", float(num_output))
diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
index 0278a208cbba5c84cb19732172277cf6685d5520..cef5e8d269ce8d4db861b97efc1a75a1dbf2ff8e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
@@ -17,20 +17,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
 
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
@@ -40,20 +38,22 @@ from tensorflow.python.util import compat
 
 class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testUnbatchWithUnknownRankInput(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     dataset = dataset_ops.Dataset.from_tensors(placeholder).apply(
         batching.unbatch())
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_elem = iterator.get_next()
 
     with self.cached_session() as sess:
       sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
       for i in range(4):
-        self.assertEqual(i, sess.run(next_elem))
+        self.assertEqual(i, self.evaluate(next_elem))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_elem)
+        self.evaluate(next_elem)
 
+  @test_util.run_deprecated_v1
   def testUnbatchScalarDataset(self):
     data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -63,16 +63,17 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     op = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual((i,) * 3, sess.run(op))
+        self.assertEqual((i,) * 3, self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
+        self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithStrings(self):
     data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -83,16 +84,17 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     op = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
+        self.assertEqual((i, compat.as_bytes(str(i)), i), self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
+        self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
@@ -102,18 +104,19 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        st_row = sess.run(next_element)
+        st_row = self.evaluate(next_element)
         self.assertEqual([i], st_row.indices)
         self.assertEqual([i], st_row.values)
         self.assertEqual([10], st_row.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithDenseAndSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
@@ -123,19 +126,20 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        dense_elem, st_row = sess.run(next_element)
+        dense_elem, st_row = self.evaluate(next_element)
         self.assertEqual(i, dense_elem)
         self.assertEqual([i], st_row.indices)
         self.assertEqual([i], st_row.values)
         self.assertEqual([10], st_row.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testUnbatchSingleElementTupleDataset(self):
     data = tuple([(math_ops.range(10),) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -145,16 +149,17 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     op = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual(((i,),) * 3, sess.run(op))
+        self.assertEqual(((i,),) * 3, self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
+        self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def testUnbatchMultiElementTupleDataset(self):
     data = tuple([(math_ops.range(10 * i, 10 * i + 10),
                    array_ops.fill([10], "hi")) for i in range(3)])
@@ -165,28 +170,29 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertAllEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     op = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
         self.assertEqual(((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")),
-                         sess.run(op))
+                         self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
+        self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def testUnbatchEmpty(self):
     data = dataset_ops.Dataset.from_tensors(
         (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
          constant_op.constant([], shape=[0, 4, 0])))
     data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testUnbatchStaticShapeMismatch(self):
     data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
@@ -194,12 +200,13 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(ValueError):
       data.apply(batching.unbatch())
 
+  @test_util.run_deprecated_v1
   def testUnbatchDynamicShapeMismatch(self):
     ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
     ph2 = array_ops.placeholder(dtypes.int32, shape=None)
     data = dataset_ops.Dataset.from_tensors((ph1, ph2))
     data = data.apply(batching.unbatch())
-    iterator = data.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(data)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -211,7 +218,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
               ph2: np.arange(8).astype(np.int32)
           })
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
       # No 0th dimension (i.e. scalar value) for one component.
       sess.run(
@@ -221,79 +228,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
               ph2: 7
           })
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(next_element)
-
-
-class UnbatchBenchmark(test.Benchmark):
-
-  def benchmarkNativeUnbatch(self):
-    batch_sizes = [1, 2, 5, 10, 20, 50]
-    elems_per_trial = 10000
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
-      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-      dataset = dataset.batch(batch_size_placeholder)
-      dataset = dataset.apply(batching.unbatch())
-      dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for batch_size in batch_sizes:
-          deltas = []
-          for _ in range(5):
-            sess.run(
-                iterator.initializer,
-                feed_dict={batch_size_placeholder: batch_size})
-            start = time.time()
-            sess.run(next_element.op)
-            end = time.time()
-            deltas.append((end - start) / elems_per_trial)
-
-          median_wall_time = np.median(deltas)
-          print("Unbatch (native) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
-          self.report_benchmark(
-              iters=10000,
-              wall_time=median_wall_time,
-              name="benchmark_unbatch_dataset_native_batch_size_%d" %
-              batch_size)
-
-  # Include a benchmark of the previous `unbatch()` implementation that uses
-  # a composition of more primitive ops. Eventually we'd hope to generate code
-  # that is as good in both cases.
-  def benchmarkOldUnbatchImplementation(self):
-    batch_sizes = [1, 2, 5, 10, 20, 50]
-    elems_per_trial = 10000
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
-      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-      dataset = dataset.batch(batch_size_placeholder)
-      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
-      dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for batch_size in batch_sizes:
-          deltas = []
-          for _ in range(5):
-            sess.run(
-                iterator.initializer,
-                feed_dict={batch_size_placeholder: batch_size})
-            start = time.time()
-            sess.run(next_element.op)
-            end = time.time()
-            deltas.append((end - start) / elems_per_trial)
-
-          median_wall_time = np.median(deltas)
-          print("Unbatch (unfused) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
-          self.report_benchmark(
-              iters=10000,
-              wall_time=median_wall_time,
-              name="benchmark_unbatch_dataset_unfused_batch_size_%d" %
-              batch_size)
+        self.evaluate(next_element)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
index 847cff26b0d047f852658344529750b908250a19..1d9941d7f4d0729e5e0f62ebbac80d0d4d385f59 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -43,20 +44,21 @@ class UniqueTest(test_base.DatasetTestBase):
     current_test_case = []
     dataset = dataset_ops.Dataset.from_generator(lambda: current_test_case,
                                                  dtype).apply(unique.unique())
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for test_case, expected in test_cases:
         current_test_case = test_case
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         for element in expected:
           if dtype == dtypes.string:
             element = compat.as_bytes(element)
-          self.assertAllEqual(element, sess.run(next_element))
+          self.assertAllEqual(element, self.evaluate(next_element))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testSimpleInt(self):
     for dtype in [dtypes.int32, dtypes.int64]:
       self._testSimpleHelper(dtype, [
@@ -69,6 +71,7 @@ class UniqueTest(test_base.DatasetTestBase):
           ([[1, 1], [1, 1], [2, 2], [3, 3], [1, 1]], [[1, 1], [2, 2], [3, 3]]),
       ])
 
+  @test_util.run_deprecated_v1
   def testSimpleString(self):
     self._testSimpleHelper(dtypes.string, [
         ([], []),
diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c734b65e056df954a8597ab6f23489353cc057b
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Wrapping / Unwrapping dataset variants."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.platform import test
+
+
+class WrapDatasetVariantTest(test_base.DatasetTestBase):
+
+  def testBasic(self):
+    ds = dataset_ops.Dataset.range(100)
+    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+
+    wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
+    unwrapped_variant = gen_dataset_ops.unwrap_dataset_variant(wrapped_variant)
+
+    variant_ds = dataset_ops._VariantDataset(unwrapped_variant,
+                                             ds._element_structure)
+    iterator = dataset_ops.make_initializable_iterator(variant_ds)
+    get_next = iterator.get_next()
+
+    with self.cached_session():
+      self.evaluate(iterator.initializer)
+      for i in range(100):
+        self.assertEqual(i, self.evaluate(get_next))
+
+  def testGPU(self):
+    ds = dataset_ops.Dataset.range(100)
+    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+    wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
+
+    with ops.device("/gpu:0"):
+      gpu_wrapped_variant = array_ops.identity(wrapped_variant)
+
+    unwrapped_variant = gen_dataset_ops.unwrap_dataset_variant(
+        gpu_wrapped_variant)
+    variant_ds = dataset_ops._VariantDataset(unwrapped_variant,
+                                             ds._element_structure)
+    iterator = dataset_ops.make_initializable_iterator(variant_ds)
+    get_next = iterator.get_next()
+
+    with self.cached_session():
+      self.evaluate(iterator.initializer)
+      for i in range(100):
+        self.assertEqual(i, self.evaluate(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index 915d399f1bb0cb2b2945389845c325381b32cacd..60c20e0bcf2d875a15ffcc4c42d10cb6e0cc25ea 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -4,12 +4,15 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_gen_op_wrapper_py",
-    "tf_kernel_library",
+py_library(
+    name = "cardinality",
+    srcs = ["cardinality.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:tensor_util",
+    ],
 )
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 py_library(
     name = "counter",
@@ -61,14 +64,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:random_seed",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -89,6 +91,7 @@ py_library(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:tensor_shape",
@@ -122,6 +125,7 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_shape",
@@ -130,6 +134,7 @@ py_library(
         "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
         "//third_party/py/numpy",
     ],
 )
@@ -144,6 +149,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "filter_for_shard_ops",
+    srcs = ["filter_for_shard_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
 py_library(
     name = "error_ops",
     srcs = ["error_ops.py"],
@@ -170,7 +187,7 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -193,6 +210,30 @@ py_library(
     ],
 )
 
+py_library(
+    name = "map_defun",
+    srcs = ["map_defun.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "matching_files",
+    srcs = ["matching_files.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:structure",
+    ],
+)
+
 py_library(
     name = "optimization",
     srcs = ["optimization.py"],
@@ -207,29 +248,28 @@ py_library(
 )
 
 py_library(
-    name = "parsing_ops",
-    srcs = ["parsing_ops.py"],
+    name = "optimization_options",
+    srcs = ["optimization_options.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:options",
     ],
 )
 
 py_library(
-    name = "map_defun",
-    srcs = ["map_defun.py"],
+    name = "parsing_ops",
+    srcs = ["parsing_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -258,12 +298,33 @@ py_library(
     srcs = ["scan_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
+    ],
+)
+
+py_library(
+    name = "sleep",
+    srcs = ["sleep.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+py_library(
+    name = "stats_aggregator",
+    srcs = ["stats_aggregator.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -282,6 +343,27 @@ py_library(
     ],
 )
 
+py_library(
+    name = "stats_options",
+    srcs = ["stats_options.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":stats_aggregator",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:options",
+    ],
+)
+
+py_library(
+    name = "threading_options",
+    srcs = ["threading_options.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:options",
+    ],
+)
+
 py_library(
     name = "threadpool",
     srcs = ["threadpool.py"],
@@ -289,9 +371,8 @@ py_library(
     deps = [
         "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -341,9 +422,12 @@ py_library(
     deps = [
         "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_spec",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
 )
 
@@ -351,20 +435,24 @@ py_library(
     name = "dataset_ops",
     deps = [
         ":batching",
+        ":cardinality",
         ":counter",
         ":enumerate_ops",
         ":error_ops",
+        ":filter_for_shard_ops",
         ":get_single_element",
         ":grouping",
         ":indexed_dataset_ops",
         ":interleave_ops",
         ":map_defun",
+        ":matching_files",
         ":optimization",
         ":prefetching_ops",
         ":readers",
         ":resampling",
         ":scan_ops",
         ":shuffle_ops",
+        ":sleep",
         ":stats_ops",
         ":threadpool",
         ":unique",
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
index d42af9e7e97ed612507805a55c43667895857719..29df98f4ea4c90d80f3518684febacc101ec2ba5 100644
--- a/tensorflow/python/data/experimental/ops/batching.py
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -24,17 +24,18 @@ from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -364,23 +365,19 @@ class _UnbatchDataset(dataset_ops.UnaryDataset):
                          "different batch sizes.")
     self._input_dataset = input_dataset
 
+    self._structure = structure.convert_legacy_structure(
+        input_dataset.output_types,
+        nest.map_structure(lambda s: s[1:], input_dataset.output_shapes),
+        input_dataset.output_classes)
+
   def _as_variant_tensor(self):
-    return gen_dataset_ops.unbatch_dataset(
+    return ged_ops.experimental_unbatch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return nest.map_structure(lambda s: s[1:],
-                              self._input_dataset.output_shapes)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 @tf_export("data.experimental.unbatch")
@@ -408,21 +405,19 @@ def unbatch():
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
-    if not sparse.any_sparse(dataset.output_classes):
-      return _UnbatchDataset(dataset)
-
     # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
     # are normalized to the rank-1 dense representation, so that the
     # sparse-oblivious unbatching logic will slice them
     # appropriately. This leads to a somewhat inefficient re-encoding step
     # for all SparseTensor components.
-    # TODO(mrry): Consider optimizing this in future
-    # if it turns out to be a bottleneck.
+    # TODO(mrry): Consider optimizing this in future if it turns out to be
+    # a bottleneck.
     def normalize(arg, *rest):
+      # pylint: disable=protected-access
       if rest:
-        return sparse.serialize_many_sparse_tensors((arg,) + rest)
+        return dataset._element_structure._to_batched_tensor_list((arg,) + rest)
       else:
-        return sparse.serialize_many_sparse_tensors(arg)
+        return dataset._element_structure._to_batched_tensor_list(arg)
 
     normalized_dataset = dataset.map(normalize)
 
@@ -453,25 +448,20 @@ class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
     self._input_dataset = input_dataset
     self._batch_size = batch_size
     self._row_shape = row_shape
+    self._structure = structure.SparseTensorStructure(
+        input_dataset.output_types,
+        tensor_shape.vector(None).concatenate(self._row_shape))
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.dense_to_sparse_batch_dataset(
+    return ged_ops.experimental_dense_to_sparse_batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._batch_size,
         row_shape=convert.partial_shape_to_tensor(self._row_shape),
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return sparse_tensor.SparseTensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.vector(None).concatenate(self._row_shape)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class _RestructuredDataset(dataset_ops.UnaryDataset):
@@ -522,13 +512,10 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
             "Dataset with output types %r cannot be restructured to have "
             "output types %r" % (dataset.output_types, output_types))
 
-    self._output_types = output_types
-
     if output_shapes is None:
       # Inherit shapes from the original `dataset`.
-      self._output_shapes = nest.pack_sequence_as(output_types,
-                                                  nest.flatten(
-                                                      dataset.output_shapes))
+      output_shapes = nest.pack_sequence_as(
+          output_types, nest.flatten(dataset.output_shapes))
     else:
       if not allow_unsafe_cast:
         # Validate that the shapes are compatible.
@@ -543,39 +530,34 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
                 "Dataset with output shapes %r cannot be restructured to have "
                 "incompatible output shapes %r" % (dataset.output_shapes,
                                                    output_shapes))
-      self._output_shapes = nest.map_structure_up_to(
+      output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
     if output_classes is None:
       # Inherit class types from the original `dataset`.
-      self._output_classes = nest.pack_sequence_as(output_types,
-                                                   nest.flatten(
-                                                       dataset.output_classes))
-    else:
-      self._output_classes = output_classes
+      output_classes = nest.pack_sequence_as(
+          output_types, nest.flatten(dataset.output_classes))
+
+    self._structure = structure.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
 
   def _as_variant_tensor(self):
     return self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
+  def _element_structure(self):
+    return self._structure
 
 
-class _MapAndBatchDataset(dataset_ops.MapDataset):
+class _MapAndBatchDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
   def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
                drop_remainder):
     """See `Dataset.map()` for details."""
-    super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
+    super(_MapAndBatchDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._map_func = dataset_ops.StructuredFunctionWrapper(
+        map_func, "tf.data.experimental.map_and_batch()", dataset=input_dataset)
     self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
     self._num_parallel_calls_t = ops.convert_to_tensor(
@@ -583,33 +565,33 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
     self._drop_remainder_t = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
-    self._batch_size = batch_size
-    self._drop_remainder = drop_remainder
+    constant_drop_remainder = tensor_util.constant_value(self._drop_remainder_t)
+    if constant_drop_remainder:
+      # NOTE(mrry): `constant_drop_remainder` may be `None` (unknown statically)
+      # or `False` (explicitly retaining the remainder).
+      self._structure = self._map_func.output_structure._batch(  # pylint: disable=protected-access
+          tensor_util.constant_value(self._batch_size_t))
+    else:
+      self._structure = self._map_func.output_structure._batch(None)  # pylint: disable=protected-access
+
+  def _functions(self):
+    return [self._map_func]
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    input_resource = self._input_dataset._as_variant_tensor()
-    return gen_dataset_ops.map_and_batch_dataset_v2(
-        input_resource,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+    return ged_ops.experimental_map_and_batch_dataset(
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         batch_size=self._batch_size_t,
         num_parallel_calls=self._num_parallel_calls_t,
         drop_remainder=self._drop_remainder_t,
+        preserve_cardinality=True,
         **dataset_ops.flat_structure(self))
-    # pylint: enable=protected-access
-
-  @property
-  def output_shapes(self):
-    dim = self._batch_size if self._drop_remainder else None
-    return nest.pack_sequence_as(self._output_shapes, [
-        tensor_shape.vector(dim).concatenate(s)
-        for s in nest.flatten(self._output_shapes)
-    ])
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 @tf_export("data.experimental.map_and_batch")
@@ -641,9 +623,10 @@ def map_and_batch(map_func,
       whether the last batch should be dropped in case its size is smaller than
       desired; the default behavior is not to drop the smaller batch.
     num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
-        representing the number of elements to process in parallel. If not
-        specified, `batch_size * num_parallel_batches` elements will be
-        processed in parallel.
+      representing the number of elements to process in parallel. If not
+      specified, `batch_size * num_parallel_batches` elements will be processed
+      in parallel. If the value `tf.data.experimental.AUTOTUNE` is used, then
+      the number of parallel calls is set dynamically based on available CPU.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
diff --git a/tensorflow/python/data/experimental/ops/cardinality.py b/tensorflow/python/data/experimental/ops/cardinality.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cf0a8801e8339f233eb61c8e0b1223b8b94358b
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/cardinality.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Cardinality analysis of `Dataset` objects."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+INFINITE = -1
+UNKNOWN = -2
+tf_export("data.experimental.INFINITE_CARDINALITY").export_constant(
+    __name__, "INFINITE")
+tf_export("data.experimental.UNKNOWN_CARDINALITY").export_constant(
+    __name__, "UNKNOWN")
+
+
+@tf_export("data.experimental.cardinality")
+def cardinality(dataset):
+  """Returns the cardinality of `dataset`, if known.
+
+  The operation returns the cardinality of `dataset`. The operation may return
+  `tf.data.experimental.INFINITE_CARDINALITY` if `dataset` contains an infinite
+  number of elements or `tf.data.experimental.UNKNOWN_CARDINALITY` if the
+  analysis fails to determine the number of elements in `dataset` (e.g. when the
+  dataset source is a file).
+
+  Args:
+    dataset: A `tf.data.Dataset` for which to determine cardinality.
+
+  Returns:
+    A scalar `tf.int64` `Tensor` representing the cardinality of `dataset`. If
+    the cardinality is infinite or unknown, the operation returns the named
+    constant `INFINITE_CARDINALITY` and `UNKNOWN_CARDINALITY` respectively.
+  """
+  return ged_ops.experimental_dataset_cardinality(dataset._as_variant_tensor())  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/experimental/ops/counter.py b/tensorflow/python/data/experimental/ops/counter.py
index 42200eaef9cb078afa0a9f598b6fa21e5e91f04b..652eb9d002992a737f3f8f0018db3a7316d0091e 100644
--- a/tensorflow/python/data/experimental/ops/counter.py
+++ b/tensorflow/python/data/experimental/ops/counter.py
@@ -25,8 +25,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.experimental.Counter")
-def Counter(start=0, step=1, dtype=dtypes.int64):
+@tf_export("data.experimental.Counter", v1=[])
+def CounterV2(start=0, step=1, dtype=dtypes.int64):
   """Creates a `Dataset` that counts from `start` in steps of size `step`.
 
   For example:
@@ -53,3 +53,13 @@ def Counter(start=0, step=1, dtype=dtypes.int64):
     step = ops.convert_to_tensor(step, dtype=dtype, name="step")
     return dataset_ops.Dataset.from_tensors(0).repeat(None).apply(
         scan_ops.scan(start, lambda state, _: (state + step, state)))
+
+
+@tf_export(v1=["data.experimental.Counter"])
+def CounterV1(start=0, step=1, dtype=dtypes.int64):
+  return dataset_ops.DatasetV1Adapter(CounterV2(start, step, dtype))
+CounterV1.__doc__ = CounterV2.__doc__
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# this alias in place.
+Counter = CounterV1  # pylint: disable=invalid-name
diff --git a/tensorflow/python/data/experimental/ops/enumerate_ops.py b/tensorflow/python/data/experimental/ops/enumerate_ops.py
index a1af98f552c8e68f458f3e9ab33ff29bc53e6136..04d875c7af238930a673fe744b3912f6ba44b5d2 100644
--- a/tensorflow/python/data/experimental/ops/enumerate_ops.py
+++ b/tensorflow/python/data/experimental/ops/enumerate_ops.py
@@ -26,9 +26,9 @@ from tensorflow.python.util.tf_export import tf_export
 
 @tf_export("data.experimental.enumerate_dataset")
 def enumerate_dataset(start=0):
-  """A transformation that enumerate the elements of a dataset.
+  """A transformation that enumerates the elements of a dataset.
 
-  It is Similar to python's `enumerate`.
+  It is similar to python's `enumerate`.
   For example:
 
   ```python
@@ -44,8 +44,8 @@ def enumerate_dataset(start=0):
   ```
 
   Args:
-    start: A `tf.int64` scalar `tf.Tensor`, representing the start
-      value for enumeration.
+    start: A `tf.int64` scalar `tf.Tensor`, representing the start value for
+      enumeration.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
diff --git a/tensorflow/python/data/experimental/ops/error_ops.py b/tensorflow/python/data/experimental/ops/error_ops.py
index 82e274b70c5b703c62dcc143df371fae3d80065e..879b13ce092f20c2a6cfc911ba4c6e11992e23a8 100644
--- a/tensorflow/python/data/experimental/ops/error_ops.py
+++ b/tensorflow/python/data/experimental/ops/error_ops.py
@@ -52,7 +52,7 @@ def ignore_errors():
   return _apply_fn
 
 
-class _IgnoreErrorsDataset(dataset_ops.UnaryDataset):
+class _IgnoreErrorsDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that silently ignores errors when computing its input."""
 
   def __init__(self, input_dataset):
@@ -64,15 +64,3 @@ class _IgnoreErrorsDataset(dataset_ops.UnaryDataset):
     return gen_experimental_dataset_ops.experimental_ignore_errors_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py b/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..91d3dca3e9a883cf5eeacb368bbbf1af4420f3a1
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py
@@ -0,0 +1,106 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Naive shard dataset transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.filter_for_shard")
+def filter_for_shard(num_shards, shard_index):
+  """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
+
+  This dataset operator is very useful when running distributed training, as
+  it allows each worker to read a unique subset.
+
+  When reading a single input file, you can skip elements as follows:
+
+  ```python
+  d = tf.data.TFRecordDataset(FLAGS.input_file)
+  d = d.apply(tf.data.experimental.naive_shard(FLAGS.num_workers,
+                                               FLAGS.worker_index))
+  d = d.repeat(FLAGS.num_epochs)
+  d = d.shuffle(FLAGS.shuffle_buffer_size)
+  d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
+  ```
+
+  Important caveats:
+
+  - Be sure to shard before you use any randomizing operator (such as
+    shuffle).
+  - Generally it is best if the shard operator is used early in the dataset
+    pipeline. For example, when reading from a set of TFRecord files, shard
+    before converting the dataset to input samples. This avoids reading every
+    file on every worker. The following is an example of an efficient
+    sharding strategy within a complete pipeline:
+
+  ```python
+  d = Dataset.list_files(FLAGS.pattern)
+  d = d.apply(tf.data.experimental.naive_shard(FLAGS.num_workers,
+                                               FLAGS.worker_index))
+  d = d.repeat(FLAGS.num_epochs)
+  d = d.shuffle(FLAGS.shuffle_buffer_size)
+  d = d.interleave(tf.data.TFRecordDataset,
+                   cycle_length=FLAGS.num_readers, block_length=1)
+  d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
+  ```
+
+  Args:
+    num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      shards operating in parallel.
+    shard_index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: if `num_shards` or `shard_index` are illegal values. Note: error
+      checking is done on a best-effort basis, and errors aren't guaranteed to
+      be caught upon dataset creation. (e.g. providing in a placeholder tensor
+      bypasses the early checking, and will instead result in an error during
+      a session.run call.)
+  """
+  num_shards = ops.convert_to_tensor(
+      num_shards, name="num_shards", dtype=dtypes.int64)
+  num_shards_static = tensor_util.constant_value(num_shards)
+  shard_index = ops.convert_to_tensor(shard_index, name="shard_index",
+                                      dtype=dtypes.int64)
+  shard_index_static = tensor_util.constant_value(shard_index)
+
+  if num_shards_static is not None and num_shards_static < 1:
+    raise ValueError("num_shards must be >= 1; got: %s" % num_shards_static)
+  if shard_index_static is not None and shard_index_static < 0:
+    raise ValueError("shard_index must be >= 0; got: %s" % shard_index_static)
+  if (shard_index_static is not None and num_shards_static is not None and
+      shard_index_static >= num_shards_static):
+    raise ValueError("shard_index must be < num_shards; %s is not < %s" %
+                     (shard_index_static, num_shards_static))
+
+  def filter_fn(elem_index, _):
+    mod_result = math_ops.mod(elem_index, num_shards)
+    return math_ops.equal(mod_result, shard_index)
+
+  def _apply_fn(dataset):
+    # pylint: disable=protected-access
+    return dataset._enumerate().filter(filter_fn).map(lambda _, elem: elem)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/get_single_element.py b/tensorflow/python/data/experimental/ops/get_single_element.py
index 132526166cfe49e267b1569b9e7851c8256234dd..d649a0701270c55d399af140f5e2bae79484fec2 100644
--- a/tensorflow/python/data/experimental/ops/get_single_element.py
+++ b/tensorflow/python/data/experimental/ops/get_single_element.py
@@ -18,8 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -60,13 +58,11 @@ def get_single_element(dataset):
     InvalidArgumentError (at runtime): if `dataset` does not contain exactly
       one element.
   """
-  if not isinstance(dataset, dataset_ops.Dataset):
+  if not isinstance(dataset, dataset_ops.DatasetV2):
     raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
 
-  nested_ret = nest.pack_sequence_as(
-      dataset.output_types, gen_dataset_ops.dataset_to_single_element(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+  # pylint: disable=protected-access
+  return dataset._element_structure._from_compatible_tensor_list(
+      gen_dataset_ops.dataset_to_single_element(
+          dataset._as_variant_tensor(),
           **dataset_ops.flat_structure(dataset)))
-  return sparse.deserialize_sparse_tensors(
-      nested_ret, dataset.output_types, dataset.output_shapes,
-      dataset.output_classes)
diff --git a/tensorflow/python/data/experimental/ops/grouping.py b/tensorflow/python/data/experimental/ops/grouping.py
index 18ba583220c211add8fcfe4dee43cf822ad7dba8..ef6b232429b872016842bcf513a851445b4d8a5e 100644
--- a/tensorflow/python/data/experimental/ops/grouping.py
+++ b/tensorflow/python/data/experimental/ops/grouping.py
@@ -21,13 +21,14 @@ import numpy as np
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -198,7 +199,7 @@ def bucket_by_sequence_length(element_length_func,
       for shape in nest.flatten(shapes):
         shape = tensor_shape.TensorShape(shape)
         shape = [
-            none_filler if d.value is None else d
+            none_filler if tensor_shape.dimension_value(d) is None else d
             for d in shape
         ]
         padded.append(shape)
@@ -236,29 +237,6 @@ def bucket_by_sequence_length(element_length_func,
     return _apply_fn
 
 
-def _map_x_dataset(map_func):
-  """A transformation that maps `map_func` across its input.
-
-  This transformation is similar to `tf.data.Dataset.map`, but in addition to
-  supporting dense and sparse tensor inputs, it also supports dataset inputs.
-
-  Args:
-    map_func: A function mapping a nested structure of tensors and/or datasets
-      (having shapes and types defined by `self.output_shapes` and
-     `self.output_types`) to another nested structure of tensors and/or
-     datasets.
-
-  Returns:
-    Dataset: A `Dataset`.
-  """
-
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    return _MapXDataset(dataset, map_func)
-
-  return _apply_fn
-
-
 class _GroupByReducerDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that groups its input and performs a reduction."""
 
@@ -274,51 +252,45 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
     self._make_finalize_func(reducer.finalize_func)
 
   def _make_key_func(self, key_func, input_dataset):
-    """Make wrapping Defun for key_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        key_func, "tf.data.experimental.group_by_reducer()", input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+    """Make wrapping defun for key_func."""
+    self._key_func = dataset_ops.StructuredFunctionWrapper(
+        key_func, self._transformation_name(), dataset=input_dataset)
+    if not self._key_func.output_structure.is_compatible_with(
+        structure.TensorStructure(dtypes.int64, [])):
       raise ValueError(
           "`key_func` must return a single tf.int64 tensor. "
           "Got type=%s and shape=%s"
-          % (wrapped_func.output_types, wrapped_func.output_shapes))
-    self._key_func = wrapped_func.function
-
+          % (self._key_func.output_types, self._key_func.output_shapes))
   def _make_init_func(self, init_func):
-    """Make wrapping Defun for init_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    """Make wrapping defun for init_func."""
+    self._init_func = dataset_ops.StructuredFunctionWrapper(
         init_func,
-        "tf.data.experimental.group_by_reducer()",
-        input_classes=ops.Tensor,
-        input_shapes=tensor_shape.scalar(),
-        input_types=dtypes.int64)
-    self._init_func = wrapped_func.function
-    self._state_classes = wrapped_func.output_classes
-    self._state_shapes = wrapped_func.output_shapes
-    self._state_types = wrapped_func.output_types
+        self._transformation_name(),
+        input_structure=structure.TensorStructure(dtypes.int64, []))
 
   def _make_reduce_func(self, reduce_func, input_dataset):
-    """Make wrapping Defun for reduce_func."""
+    """Make wrapping defun for reduce_func."""
 
     # Iteratively rerun the reduce function until reaching a fixed point on
-    # `self._state_shapes`.
+    # `self._state_structure`.
+    self._state_structure = self._init_func.output_structure
+    state_types = self._init_func.output_types
+    state_shapes = self._init_func.output_shapes
+    state_classes = self._init_func.output_classes
     need_to_rerun = True
     while need_to_rerun:
 
       wrapped_func = dataset_ops.StructuredFunctionWrapper(
           reduce_func,
-          "tf.data.experimental.group_by_reducer()",
-          input_classes=(self._state_classes, input_dataset.output_classes),
-          input_shapes=(self._state_shapes, input_dataset.output_shapes),
-          input_types=(self._state_types, input_dataset.output_types),
+          self._transformation_name(),
+          input_structure=structure.NestedStructure(
+              (self._state_structure, input_dataset._element_structure)),  # pylint: disable=protected-access
           add_to_graph=False)
 
       # Extract and validate class information from the returned values.
       for new_state_class, state_class in zip(
           nest.flatten(wrapped_func.output_classes),
-          nest.flatten(self._state_classes)):
+          nest.flatten(state_classes)):
         if not issubclass(new_state_class, state_class):
           raise TypeError(
               "The element classes for the new state must match the initial "
@@ -327,16 +299,15 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
 
       # Extract and validate type information from the returned values.
       for new_state_type, state_type in zip(
-          nest.flatten(wrapped_func.output_types),
-          nest.flatten(self._state_types)):
+          nest.flatten(wrapped_func.output_types), nest.flatten(state_types)):
         if new_state_type != state_type:
           raise TypeError(
               "The element types for the new state must match the initial "
               "state. Expected %s; got %s." %
-              (self._state_types, wrapped_func.output_types))
+              (self._init_func.output_types, wrapped_func.output_types))
 
       # Extract shape information from the returned values.
-      flat_state_shapes = nest.flatten(self._state_shapes)
+      flat_state_shapes = nest.flatten(state_shapes)
       flat_new_state_shapes = nest.flatten(wrapped_func.output_shapes)
       weakened_state_shapes = [
           original.most_specific_compatible_shape(new)
@@ -353,50 +324,45 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
           break
 
       if need_to_rerun:
-        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
-                                                   weakened_state_shapes)
+        state_shapes = nest.pack_sequence_as(
+            self._init_func.output_shapes, weakened_state_shapes)
+        self._state_structure = structure.convert_legacy_structure(
+            state_types, state_shapes, state_classes)
 
-    self._reduce_func = wrapped_func.function
-    self._reduce_func.add_to_graph(ops.get_default_graph())
+    self._reduce_func = wrapped_func
+    self._reduce_func.function.add_to_graph(ops.get_default_graph())
 
   def _make_finalize_func(self, finalize_func):
-    """Make wrapping Defun for finalize_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        finalize_func,
-        "tf.data.experimental.group_by_reducer()",
-        input_classes=self._state_classes,
-        input_shapes=self._state_shapes,
-        input_types=self._state_types)
-    self._finalize_func = wrapped_func.function
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
+    """Make wrapping defun for finalize_func."""
+    self._finalize_func = dataset_ops.StructuredFunctionWrapper(
+        finalize_func, self._transformation_name(),
+        input_structure=self._state_structure)
 
   @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._finalize_func.output_structure
 
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _functions(self):
+    return [
+        self._key_func, self._init_func, self._reduce_func, self._finalize_func
+    ]
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.group_by_reducer_dataset(
+    return ged_ops.experimental_group_by_reducer_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.captured_inputs,
-        self._init_func.captured_inputs,
-        self._reduce_func.captured_inputs,
-        self._finalize_func.captured_inputs,
-        key_func=self._key_func,
-        init_func=self._init_func,
-        reduce_func=self._reduce_func,
-        finalize_func=self._finalize_func,
+        self._key_func.function.captured_inputs,
+        self._init_func.function.captured_inputs,
+        self._reduce_func.function.captured_inputs,
+        self._finalize_func.function.captured_inputs,
+        key_func=self._key_func.function,
+        init_func=self._init_func.function,
+        reduce_func=self._reduce_func.function,
+        finalize_func=self._finalize_func.function,
         **dataset_ops.flat_structure(self))
 
+  def _transformation_name(self):
+    return "tf.data.experimental.group_by_reducer()"
+
 
 class _GroupByWindowDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that groups its input and performs a windowed reduction."""
@@ -412,77 +378,68 @@ class _GroupByWindowDataset(dataset_ops.UnaryDataset):
     self._make_window_size_func(window_size_func)
 
   def _make_window_size_func(self, window_size_func):
-    """Make wrapping Defun for window_size_func."""
+    """Make wrapping defun for window_size_func."""
+
     def window_size_func_wrapper(key):
       return ops.convert_to_tensor(window_size_func(key), dtype=dtypes.int64)
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._window_size_func = dataset_ops.StructuredFunctionWrapper(
         window_size_func_wrapper,
-        "tf.data.experimental.group_by_window()",
-        input_classes=ops.Tensor,
-        input_shapes=tensor_shape.scalar(),
-        input_types=dtypes.int64)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+        self._transformation_name(),
+        input_structure=structure.TensorStructure(dtypes.int64, []))
+    if not self._window_size_func.output_structure.is_compatible_with(
+        structure.TensorStructure(dtypes.int64, [])):
       raise ValueError(
           "`window_size_func` must return a single tf.int64 scalar tensor.")
-    self._window_size_func = wrapped_func.function
 
   def _make_key_func(self, key_func, input_dataset):
-    """Make wrapping Defun for key_func."""
+    """Make wrapping defun for key_func."""
+
     def key_func_wrapper(*args):
       return ops.convert_to_tensor(key_func(*args), dtype=dtypes.int64)
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        key_func_wrapper, "tf.data.experimental.group_by_window()",
-        input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+    self._key_func = dataset_ops.StructuredFunctionWrapper(
+        key_func_wrapper, self._transformation_name(), dataset=input_dataset)
+    if not self._key_func.output_structure.is_compatible_with(
+        structure.TensorStructure(dtypes.int64, [])):
       raise ValueError(
           "`key_func` must return a single tf.int64 scalar tensor.")
-    self._key_func = wrapped_func.function
 
   def _make_reduce_func(self, reduce_func, input_dataset):
-    """Make wrapping Defun for reduce_func."""
-    nested_dataset = dataset_ops._NestedDatasetComponent(input_dataset)  # pylint: disable=protected-access
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        reduce_func,
-        "tf.data.experimental.reduce_by_window()",
-        input_classes=(ops.Tensor, nested_dataset),
-        input_shapes=(tensor_shape.scalar(), nested_dataset),
-        input_types=(dtypes.int64, nested_dataset),
-        experimental_nested_dataset_support=True)
+    """Make wrapping defun for reduce_func."""
+    nested_dataset = dataset_ops.DatasetStructure(
+        input_dataset._element_structure)  # pylint: disable=protected-access
+    input_structure = structure.NestedStructure(
+        (structure.TensorStructure(dtypes.int64, []), nested_dataset))
+    self._reduce_func = dataset_ops.StructuredFunctionWrapper(
+        reduce_func, self._transformation_name(),
+        input_structure=input_structure)
     if not isinstance(
-        wrapped_func.output_classes, dataset_ops._NestedDatasetComponent):  # pylint: disable=protected-access
+        self._reduce_func.output_structure, dataset_ops.DatasetStructure):
       raise TypeError("`reduce_func` must return a `Dataset` object.")
-    self._output_classes = wrapped_func.output_classes.output_classes
-    self._output_types = wrapped_func.output_types.output_types
-    self._output_shapes = wrapped_func.output_shapes.output_shapes
-    self._reduce_func = wrapped_func.function
+    # pylint: disable=protected-access
+    self._structure = (
+        self._reduce_func.output_structure._element_structure)
 
   @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _functions(self):
+    return [self._key_func, self._reduce_func, self._window_size_func]
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.group_by_window_dataset(
+    return ged_ops.experimental_group_by_window_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.captured_inputs,
-        self._reduce_func.captured_inputs,
-        self._window_size_func.captured_inputs,
-        key_func=self._key_func,
-        reduce_func=self._reduce_func,
-        window_size_func=self._window_size_func,
+        self._key_func.function.captured_inputs,
+        self._reduce_func.function.captured_inputs,
+        self._window_size_func.function.captured_inputs,
+        key_func=self._key_func.function,
+        reduce_func=self._reduce_func.function,
+        window_size_func=self._window_size_func.function,
         **dataset_ops.flat_structure(self))
 
+  def _transformation_name(self):
+    return "tf.data.experimental.group_by_window()"
+
 
 @tf_export("data.experimental.Reducer")
 class Reducer(object):
@@ -510,42 +467,3 @@ class Reducer(object):
   @property
   def finalize_func(self):
     return self._finalize_func
-
-
-class _MapXDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that maps a function over elements in its input."""
-
-  def __init__(self, input_dataset, map_func):
-    """See `map_x_dataset()` for details."""
-    super(_MapXDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        map_func,
-        "tf.data.experimental.map_x_dataset()",
-        input_dataset,
-        experimental_nested_dataset_support=True)
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
-    self._map_func = wrapped_func.function
-
-  def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return gen_dataset_ops.map_dataset(
-        input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
diff --git a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
index 9c06474a2f8076d3ded5fd798665ea05930ecfe5..fdf3692420b1943db0b4ff0de826e6203593e2c7 100644
--- a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
+++ b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
@@ -22,9 +22,9 @@ import abc
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 
 
@@ -65,6 +65,7 @@ class MaterializedIndexedDataset(object):
             sparse.as_dense_types(self._output_shapes, self._output_classes)))
 
 
+# TODO(saeta): Add a `DatasetV1` wrapper if this is exposed via the public API.
 class IndexedDataset(dataset_ops.Dataset):
   """IndexedDataset is highly experimental!
   """
@@ -93,11 +94,7 @@ class IndexedDataset(dataset_ops.Dataset):
         ged_ops.experimental_materialized_index_dataset_handle(
             container=container,
             shared_name=shared_name,
-            output_types=nest.flatten(
-                sparse.as_dense_types(self.output_types, self.output_classes)),
-            output_shapes=nest.flatten(
-                sparse.as_dense_types(self.output_shapes,
-                                      self.output_classes))))
+            **dataset_ops.flat_structure(self)))
 
     with ops.colocate_with(materialized_resource):
       materializer = ged_ops.experimental_indexed_dataset_materialize(
@@ -106,38 +103,6 @@ class IndexedDataset(dataset_ops.Dataset):
                                       self.output_classes, self.output_types,
                                       self.output_shapes)
 
-  @abc.abstractproperty
-  def output_types(self):
-    """Returns the type of each component of an element of this IndexedDataset.
-
-    Returns:
-      A nested structure of `tf.DType` objects corresponding to each component
-      of an element of this IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset.output_types")
-
-  @abc.abstractproperty
-  def output_classes(self):
-    """Returns the class of each component of an element of this IndexedDataset.
-
-    The expected values are `tf.Tensor` and `tf.SparseTensor`.
-
-    Returns:
-      A nested structure of Python `type` objects corresponding to each
-      component of an element of this IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset.output_classes")
-
-  @abc.abstractproperty
-  def output_shapes(self):
-    """Returns the shape of each component of an element of this IndexedDataset.
-
-    Returns:
-      A nested structure of `tf.TensorShape` objects corresponding to each
-      component of an element of this IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset.output_shapes")
-
   @abc.abstractmethod
   def _as_variant_tensor(self):
     """Creates a `tf.variant` `tf.Tensor` representing this IndexedDataset.
@@ -149,6 +114,7 @@ class IndexedDataset(dataset_ops.Dataset):
     raise NotImplementedError("IndexedDataset._as_variant_tensor")
 
 
+# TODO(saeta): Add a `DatasetV1` wrapper if this is exposed via the public API.
 class IdentityIndexedDataset(IndexedDataset):
   """IdentityIndexedDataset is a trivial indexed dataset used for testing.
   """
@@ -159,16 +125,8 @@ class IdentityIndexedDataset(IndexedDataset):
     self._size = ops.convert_to_tensor(size, dtype=dtypes.uint64, name="size")
 
   @property
-  def output_types(self):
-    return dtypes.uint64
-
-  @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.uint64, [])
 
   def _as_variant_tensor(self):
     return ged_ops.experimental_identity_indexed_dataset(self._size)
diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py
index a3c094859efb7586b3ddcf1823ab27bf0a733445..5a719f8ed8f0176f628a89eb1b3e535064d9a72e 100644
--- a/tensorflow/python/data/experimental/ops/interleave_ops.py
+++ b/tensorflow/python/data/experimental/ops/interleave_ops.py
@@ -21,6 +21,7 @@ from tensorflow.python.data.experimental.ops import random_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -101,6 +102,18 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
           data_input.output_classes != data_inputs[0].output_classes):
         raise TypeError("All datasets must have the same type and class.")
 
+    output_shapes = self._data_inputs[0].output_shapes
+    for data_input in self._data_inputs[1:]:
+      output_shapes = nest.pack_sequence_as(output_shapes, [
+          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
+              nest.flatten(output_shapes),
+              nest.flatten(data_input.output_shapes))
+      ])
+
+    self._structure = structure.convert_legacy_structure(
+        data_inputs[0].output_types, output_shapes,
+        data_inputs[0].output_classes)
+
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
     return (
@@ -115,26 +128,12 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
     return [self._selector_input] + self._data_inputs
 
   @property
-  def output_classes(self):
-    return self._data_inputs[0].output_classes
-
-  @property
-  def output_shapes(self):
-    ret = self._data_inputs[0].output_shapes
-    for data_input in self._data_inputs[1:]:
-      ret = nest.pack_sequence_as(ret, [
-          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
-              nest.flatten(ret), nest.flatten(data_input.output_shapes))
-      ])
-    return ret
-
-  @property
-  def output_types(self):
-    return self._data_inputs[0].output_types
+  def _element_structure(self):
+    return self._structure
 
 
-@tf_export("data.experimental.sample_from_datasets")
-def sample_from_datasets(datasets, weights=None, seed=None):
+@tf_export("data.experimental.sample_from_datasets", v1=[])
+def sample_from_datasets_v2(datasets, weights=None, seed=None):
   """Samples elements at random from the datasets in `datasets`.
 
   Args:
@@ -158,7 +157,7 @@ def sample_from_datasets(datasets, weights=None, seed=None):
       length of the `datasets` element.
   """
   num_datasets = len(datasets)
-  if not isinstance(weights, dataset_ops.Dataset):
+  if not isinstance(weights, dataset_ops.DatasetV2):
     if weights is None:
       # Select inputs with uniform probability.
       logits = [[1.0] * num_datasets]
@@ -217,8 +216,15 @@ def sample_from_datasets(datasets, weights=None, seed=None):
   return _DirectedInterleaveDataset(selector_input, datasets)
 
 
-@tf_export("data.experimental.choose_from_datasets")
-def choose_from_datasets(datasets, choice_dataset):
+@tf_export(v1=["data.experimental.sample_from_datasets"])
+def sample_from_datasets_v1(datasets, weights=None, seed=None):
+  return dataset_ops.DatasetV1Adapter(
+      sample_from_datasets_v2(datasets, weights, seed))
+sample_from_datasets_v1.__doc__ = sample_from_datasets_v2.__doc__
+
+
+@tf_export("data.experimental.choose_from_datasets", v1=[])
+def choose_from_datasets_v2(datasets, choice_dataset):
   """Creates a dataset that deterministically chooses elements from `datasets`.
 
   For example, given the following datasets:
@@ -260,3 +266,16 @@ def choose_from_datasets(datasets, choice_dataset):
     raise TypeError("`choice_dataset` must be a dataset of scalar "
                     "`tf.int64` tensors.")
   return _DirectedInterleaveDataset(choice_dataset, datasets)
+
+
+@tf_export(v1=["data.experimental.choose_from_datasets"])
+def choose_from_datasets_v1(datasets, choice_dataset):
+  return dataset_ops.DatasetV1Adapter(
+      choose_from_datasets_v2(datasets, choice_dataset))
+choose_from_datasets_v1.__doc__ = choose_from_datasets_v2.__doc__
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# these aliases in place.
+choose_from_datasets = choose_from_datasets_v1
+sample_from_datasets = sample_from_datasets_v1
diff --git a/tensorflow/python/data/experimental/ops/iterator_ops.py b/tensorflow/python/data/experimental/ops/iterator_ops.py
index 72d7d58f0641b4b8ef32fd55d2ffad395b0da986..5eb256397731d39fafcc8a3e4da11dc5cbb72f35 100644
--- a/tensorflow/python/data/experimental/ops/iterator_ops.py
+++ b/tensorflow/python/data/experimental/ops/iterator_ops.py
@@ -198,7 +198,7 @@ class CheckpointInputPipelineHook(session_run_hook.SessionRunHook):
     # is run *after* this hook. That is troublesome because
     # 1. If a checkpoint exists and this hook restores it, the initializer hook
     #    will override it.
-    # 2. If no checkpoint exists, this hook will try to save an initialized
+    # 2. If no checkpoint exists, this hook will try to save an uninitialized
     #    iterator which will result in an exception.
     #
     # As a temporary fix we enter the following implicit contract between this
diff --git a/tensorflow/python/data/experimental/ops/map_defun.py b/tensorflow/python/data/experimental/ops/map_defun.py
index 3ac1158d8b33d00fff80e59272c4cb414320d509..5d729d392ac5ec9745cbfdd269bc536a74f3e865 100644
--- a/tensorflow/python/data/experimental/ops/map_defun.py
+++ b/tensorflow/python/data/experimental/ops/map_defun.py
@@ -27,7 +27,7 @@ def map_defun(fn, elems, output_dtypes, output_shapes):
   """Map a function on the list of tensors unpacked from `elems` on dimension 0.
 
   Args:
-    fn: A function (`function.Defun`) that takes a list of tensors and returns
+    fn: A function (`function.defun`) that takes a list of tensors and returns
       another list of tensors. The output list has the same types as
       output_dtypes. The elements of the output list have the same dimension 0
       as `elems`, and the remaining dimensions correspond to those of
@@ -52,7 +52,10 @@ def map_defun(fn, elems, output_dtypes, output_shapes):
     raise ValueError("`output_shapes` must be a list of `tf.TensorShape` "
                      "objects.")
 
+  concrete_fn = fn._get_concrete_function_internal()  # pylint: disable=protected-access
+  # TODO(shivaniagrawal/rachelim): what about functions created without
+  # input_signature.
   elems = [ops.convert_to_tensor(e) for e in elems]
   output_shapes = [tensor_shape.TensorShape(s) for s in output_shapes]
-  return gen_dataset_ops.map_defun(elems, fn.captured_inputs, output_dtypes,
-                                   output_shapes, fn)
+  return gen_dataset_ops.map_defun(elems, concrete_fn.captured_inputs,
+                                   output_dtypes, output_shapes, concrete_fn)
diff --git a/tensorflow/python/data/experimental/ops/matching_files.py b/tensorflow/python/data/experimental/ops/matching_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..63b99cb1e4533d165902893918d5aea2c6f02613
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/matching_files.py
@@ -0,0 +1,41 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for matching input filenames."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+
+
+class MatchingFilesDataset(dataset_ops.DatasetSource):
+  """A `Dataset` that list the files according to the input patterns."""
+
+  def __init__(self, patterns):
+    super(MatchingFilesDataset, self).__init__()
+    self._patterns = ops.convert_to_tensor(
+        patterns, dtype=dtypes.string, name="patterns")
+
+  def _as_variant_tensor(self):
+    return ged_ops.experimental_matching_files_dataset(self._patterns)
+
+  @property
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
diff --git a/tensorflow/python/data/experimental/ops/optimization.py b/tensorflow/python/data/experimental/ops/optimization.py
index 276dde8383c9faa9931bf2c28cb96da1d55b01e2..c6c7de9265c32245dfbc348a4e7c4fd06eda653b 100644
--- a/tensorflow/python/data/experimental/ops/optimization.py
+++ b/tensorflow/python/data/experimental/ops/optimization.py
@@ -21,9 +21,12 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
 
 # A constant that can be used to enable auto-tuning.
 AUTOTUNE = -1
+tf_export("data.experimental.AUTOTUNE").export_constant(__name__, "AUTOTUNE")
 
 
 # TODO(jsimsa): Support RE matching for both individual transformation (e.g. to
@@ -62,6 +65,21 @@ def model():
   return _apply_fn
 
 
+def non_serializable():
+  """A non-serializable identity transformation.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    return _NonSerializableDataset(dataset)
+
+  return _apply_fn
+
+
 def optimize(optimizations=None):
   """A transformation that applies optimizations.
 
@@ -82,7 +100,7 @@ def optimize(optimizations=None):
   return _apply_fn
 
 
-class _AssertNextDataset(dataset_ops.UnaryDataset):
+class _AssertNextDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that asserts which transformations happen next."""
 
   def __init__(self, input_dataset, transformations):
@@ -100,15 +118,16 @@ class _AssertNextDataset(dataset_ops.UnaryDataset):
         self._transformations,
         **dataset_ops.flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
+class _NonSerializableDataset(dataset_ops.UnaryUnchangedStructureDataset):
+  """A `Dataset` that performs non-serializable identity transformation."""
 
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def __init__(self, input_dataset):
+    """See `non_serializable()` for details."""
+    super(_NonSerializableDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
 
+  def _as_variant_tensor(self):
+    return gen_experimental_dataset_ops.experimental_non_serializable_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        **dataset_ops.flat_structure(self))
diff --git a/tensorflow/python/data/experimental/ops/optimization_options.py b/tensorflow/python/data/experimental/ops/optimization_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..11b8b86f64b204782030411cc533d57dcc348bd3
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/optimization_options.py
@@ -0,0 +1,118 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for controlling optimizations in `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.data.util import options
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.OptimizationOptions")
+class OptimizationOptions(options.OptionsBase):
+  """Represents options for dataset optimizations.
+
+  You can apply `OptimizationOptions` to a `dataset` object, as follows:
+
+  ```python
+  options = tf.data.Options()
+  options.optimization = tf.data.experimental.OptimizationOptions()
+  options.optimization.map_and_batch_fusion = True
+  dataset = dataset.with_options(options)
+  ```
+  """
+  apply_default_optimizations = options.create_option(
+      name="apply_default_optimizations",
+      ty=bool,
+      docstring=
+      "Whether to apply default static optimizations. If False, only static "
+      "optimizations that have been explicitly enabled will be applied.")
+
+  filter_fusion = options.create_option(
+      name="filter_fusion",
+      ty=bool,
+      docstring="Whether to fuse filter transformations.")
+
+  hoist_random_uniform = options.create_option(
+      name="hoist_random_uniform",
+      ty=bool,
+      docstring=
+      "Whether to hoist `tf.random_uniform()` ops out of map transformations.")
+
+  map_and_batch_fusion = options.create_option(
+      name="map_and_batch_fusion",
+      ty=bool,
+      docstring="Whether to fuse map and batch transformations.")
+
+  map_and_filter_fusion = options.create_option(
+      name="map_and_filter_fusion",
+      ty=bool,
+      docstring="Whether to fuse map and filter transformations.")
+
+  map_fusion = options.create_option(
+      name="map_and_filter_fusion",
+      ty=bool,
+      docstring="Whether to fuse map transformations.")
+
+  map_parallelization = options.create_option(
+      name="map_parallelization",
+      ty=bool,
+      docstring="Whether to parallelize stateless map transformations.")
+
+  map_vectorization = options.create_option(
+      name="map_vectorization",
+      ty=bool,
+      docstring="Whether to vectorize map transformations.")
+
+  noop_elimination = options.create_option(
+      name="noop_elimination",
+      ty=bool,
+      docstring="Whether to eliminate no-op transformations.")
+
+  shuffle_and_repeat_fusion = options.create_option(
+      name="shuffle_and_repeat_fusion",
+      ty=bool,
+      docstring="Whether to fuse shuffle and repeat transformations. If None, "
+      "defaults to True.")
+
+  def _static_optimizations(self):
+    """Produces the list of enabled static optimizations."""
+    result = []
+    optimizations_to_enable = [
+        "filter_fusion",
+        "hoist_random_uniform",
+        "map_and_filter_fusion",
+        "map_fusion",
+        "map_parallelization",
+        "map_vectorization",
+    ]
+    for optimization in optimizations_to_enable:
+      if getattr(self, optimization):
+        result.append(optimization)
+
+    if self.apply_default_optimizations is not False:
+      # The following optimizations are turned on by default, unless the
+      # user explicitly disables them.
+      optimizations_to_disable = [
+          "map_and_batch_fusion",
+          "noop_elimination",
+          "shuffle_and_repeat_fusion",
+      ]
+      for optimization in optimizations_to_disable:
+        if getattr(self, optimization) is not False:
+          result.append(optimization)
+    return result
diff --git a/tensorflow/python/data/experimental/ops/parsing_ops.py b/tensorflow/python/data/experimental/ops/parsing_ops.py
index 6615b9022a23628fb5c37fb51762c429086b983c..deb20d61888adeeff078997fc8adfede604de8eb 100644
--- a/tensorflow/python/data/experimental/ops/parsing_ops.py
+++ b/tensorflow/python/data/experimental/ops/parsing_ops.py
@@ -18,11 +18,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -33,8 +33,8 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
   def __init__(self, input_dataset, features, num_parallel_calls):
     super(_ParseExampleDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
-    if not all(types == dtypes.string
-               for types in nest.flatten(input_dataset.output_types)):
+    if not input_dataset._element_structure.is_compatible_with(  # pylint: disable=protected-access
+        structure.TensorStructure(dtypes.string, [None])):
       raise TypeError("Input dataset should be a dataset of vectors of strings")
     self._num_parallel_calls = num_parallel_calls
     # pylint: disable=protected-access
@@ -67,20 +67,22 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
         for _ in range(len(sparse_keys))
     ]
 
-    self._output_shapes = dict(
+    output_shapes = dict(
         zip(self._dense_keys + self._sparse_keys,
             dense_output_shapes + sparse_output_shapes))
-    self._output_types = dict(
+    output_types = dict(
         zip(self._dense_keys + self._sparse_keys,
             self._dense_types + self._sparse_types))
-    self._output_classes = dict(
+    output_classes = dict(
         zip(self._dense_keys + self._sparse_keys,
             [ops.Tensor for _ in range(len(self._dense_defaults))] +
             [sparse_tensor.SparseTensor for _ in range(len(self._sparse_keys))
             ]))
+    self._structure = structure.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.parse_example_dataset(
+    return gen_experimental_dataset_ops.experimental_parse_example_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._num_parallel_calls,
         self._dense_defaults,
@@ -91,16 +93,8 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
 
 # TODO(b/111553342): add arguments names and example names as well.
@@ -138,10 +132,10 @@ def parse_example_dataset(features, num_parallel_calls=1):
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
     out_dataset = _ParseExampleDataset(dataset, features, num_parallel_calls)
-    if any([
+    if any(
         isinstance(feature, parsing_ops.SparseFeature)
         for _, feature in features.items()
-    ]):
+    ):
       # pylint: disable=protected-access
       # pylint: disable=g-long-lambda
       out_dataset = out_dataset.map(
diff --git a/tensorflow/python/data/experimental/ops/prefetching_ops.py b/tensorflow/python/data/experimental/ops/prefetching_ops.py
index 48d7136f953dc49cfba34a1262276611999e697f..e46dfb6568d5d0c29187c233e503cef98eecece1 100644
--- a/tensorflow/python/data/experimental/ops/prefetching_ops.py
+++ b/tensorflow/python/data/experimental/ops/prefetching_ops.py
@@ -17,17 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
-
+from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import device as framework_device
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_dataset_ops
@@ -36,303 +33,6 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-def function_buffering_resource(string_arg,
-                                target_device,
-                                f,
-                                buffer_size,
-                                output_types,
-                                container="",
-                                shared_name=None,
-                                name=None):
-  """Creates a FunctionBufferingResource.
-
-  A FunctionBufferingResource fills up a buffer by calling a function `f` on
-  `target_device`. `f` should take in only a single string argument as input.
-
-  Args:
-    string_arg: The single string argument to the function.
-    target_device: The device to run `f` on.
-    f: The function to be executed.
-    buffer_size: Size of the buffer to be populated.
-    output_types: The output types generated by the function.
-    container: (Optional) string. Defaults to "".
-    shared_name: (Optional) string.
-    name: (Optional) string to name the op.
-
-  Returns:
-    Handle to a FunctionBufferingResource.
-  """
-  if shared_name is None:
-    shared_name = ""
-  return ged_ops.experimental_function_buffering_resource(
-      string_arg=string_arg,
-      target_device=target_device,
-      shared_name=shared_name,
-      f=f,
-      buffer_size=buffer_size,
-      container=container,
-      name=name,
-      output_types=output_types)
-
-
-def function_buffering_resource_get_next(function_buffer_resource,
-                                         output_types,
-                                         name=None):
-  return ged_ops.experimental_function_buffering_resource_get_next(
-      function_buffer_resource=function_buffer_resource,
-      output_types=output_types,
-      name=name)
-
-
-def function_buffering_resource_reset(function_buffer_resource, name=None):
-  return ged_ops.experimental_function_buffering_resource_reset(
-      function_buffer_resource=function_buffer_resource, name=name)
-
-
-# pylint: disable=protected-access
-class _PrefetchToDeviceIterator(object):
-  """A replacement for `tf.data.Iterator` that prefetches to another device.
-
-  Args:
-    input_dataset: The input dataset
-    one_shot: If true, we make a one shot iterator that's already initialized.
-    device: A fully specified device string where we want to prefetch to
-    buffer_size: Size of the prefetching buffer.
-    shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
-
-  Returns:
-    An Iterator type object.
-  """
-
-  def __init__(self,
-               input_dataset,
-               one_shot,
-               device,
-               buffer_size,
-               shared_name=None):
-    self._input_dataset = input_dataset
-    self._get_next_call_count = 0
-    self._one_shot = one_shot
-    if shared_name is None:
-      shared_name = ""
-
-    if self._one_shot:
-      self._input_iterator = input_dataset.make_one_shot_iterator()
-    else:
-      self._input_iterator = iterator_ops.Iterator.from_structure(
-          self._input_dataset.output_types, self._input_dataset.output_shapes,
-          shared_name, self._input_dataset.output_classes)
-    input_iterator_handle = self._input_iterator.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _prefetch_fn(handle):
-      """Prefetches one element from `input_iterator`."""
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, self._input_iterator.output_types,
-          self._input_iterator.output_shapes,
-          self._input_iterator.output_classes)
-      ret = remote_iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
-
-    iterator_device = ged_ops.experimental_iterator_get_device(
-        self._input_iterator._iterator_resource)
-
-    with ops.device(device):
-      self._buffering_resource = function_buffering_resource(
-          f=_prefetch_fn,
-          target_device=iterator_device,
-          string_arg=input_iterator_handle,
-          buffer_size=buffer_size,
-          shared_name=shared_name,
-          output_types=nest.flatten(
-              sparse.as_dense_types(self._input_dataset.output_types,
-                                    self._input_dataset.output_classes)))
-
-    if not self._one_shot:
-      reset_op = function_buffering_resource_reset(self._buffering_resource)
-      with ops.control_dependencies([reset_op]):
-        self._initializer = self._input_iterator.make_initializer(
-            self._input_dataset)
-
-  def get_next(self, name=None):
-    """See `tf.data.Iterator.get_next`."""
-    self._get_next_call_count += 1
-    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
-      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
-
-    flat_ret = ged_ops.experimental_function_buffering_resource_get_next(
-        self._buffering_resource,
-        output_types=nest.flatten(
-            sparse.as_dense_types(self.output_types, self.output_classes)),
-        name=name)
-
-    ret = sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(self.output_types, flat_ret),
-        self.output_types, self.output_shapes, self.output_classes)
-
-    for tensor, shape in zip(
-        nest.flatten(ret), nest.flatten(self.output_shapes)):
-      if isinstance(tensor, ops.Tensor):
-        tensor.set_shape(shape)
-
-    return ret
-
-  @property
-  def initializer(self):
-    if self._one_shot:
-      raise NotImplementedError("Can't initialize a one_shot_iterator")
-    return self._initializer
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-
-class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
-  """A replacement for `tf.data.Iterator` that prefetches to another device.
-
-  Args:
-    input_dataset: The input dataset
-    one_shot: If true, we make a one shot iterator that's already initialized.
-    device: A fully specified device string where we want to prefetch to
-    buffer_size: Size of the prefetching buffer.
-    shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
-
-  Returns:
-    An Iterator type object.
-  """
-
-  def __init__(self,
-               input_dataset,
-               device,
-               buffer_size):
-    with ops.device("/device:CPU:0"):
-      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
-      input_iterator_handle = gen_dataset_ops.iterator_to_string_handle(
-          self._resource)
-
-    self._device = device
-
-    @function.Defun(dtypes.string)
-    def _prefetch_fn(handle):
-      """Prefetches one element from `input_iterator`."""
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, self.output_types, self.output_shapes, self.output_classes)
-      ret = remote_iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
-
-    _prefetch_fn.add_to_graph(None)
-
-    with ops.device(device):
-      self._buffering_resource = function_buffering_resource(
-          f=_prefetch_fn,
-          output_types=self._flat_output_types,
-          target_device=ged_ops.experimental_iterator_get_device(
-              self._resource),
-          string_arg=input_iterator_handle,
-          buffer_size=buffer_size,
-          shared_name=iterator_ops._generate_shared_name(
-              "function_buffer_resource"))
-
-  def _next_internal(self):
-    """Returns a nested structure of `tf.Tensor`s containing the next element.
-    """
-    # This runs in sync mode as iterators use an error status to communicate
-    # that there is no more data to iterate over.
-    # TODO(b/77291417): Fix
-    with context.execution_mode(context.SYNC):
-      with ops.device(self._device):
-        ret = ged_ops.experimental_function_buffering_resource_get_next(
-            function_buffer_resource=self._buffering_resource,
-            output_types=self._flat_output_types)
-      return sparse.deserialize_sparse_tensors(
-          nest.pack_sequence_as(self._output_types, ret), self._output_types,
-          self._output_shapes, self._output_classes)
-# pylint: enable=protected-access
-
-
-class _PrefetchToDeviceDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` whose iterator prefetches elements to another device."""
-
-  def __init__(self, input_dataset, device, buffer_size):
-    super(_PrefetchToDeviceDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    self._device = device
-    self._buffer_size = buffer_size if buffer_size is not None else 1
-
-  # The static analysis cannot tell that the eager iterator's superclass has
-  # a `next()` method.
-  # pylint: disable=non-iterator-returned
-  def __iter__(self):
-    """Creates an `Iterator` for enumerating the elements of this dataset.
-
-    The returned iterator implements the Python iterator protocol and therefore
-    can only be used in eager mode.
-
-    Returns:
-      An `Iterator` over the elements of this dataset.
-
-    Raises:
-      RuntimeError: If eager execution is enabled.
-    """
-    if context.executing_eagerly():
-      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
-                                            self._buffer_size)
-    else:
-      raise RuntimeError("dataset.__iter__() is only supported when eager "
-                         "execution is enabled.")
-  # pylint: enable=non-iterator-returned
-
-  def make_one_shot_iterator(self):
-    if context.executing_eagerly():
-      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
-                                            self._buffer_size)
-    else:
-      return _PrefetchToDeviceIterator(self._input_dataset, one_shot=True,
-                                       device=self._device,
-                                       buffer_size=self._buffer_size)
-
-  def make_initializable_iterator(self, shared_name=None):
-    return _PrefetchToDeviceIterator(
-        self._input_dataset,
-        one_shot=False,
-        device=self._device,
-        buffer_size=self._buffer_size,
-        shared_name=shared_name)
-
-  def _as_variant_tensor(self):
-    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
-    # transformation methods is called.
-    # TODO(mrry): Investigate support for chaining further transformations after
-    # the prefetch, including GPU support.
-    raise NotImplementedError("`prefetch_to_device()` must be the last "
-                              "transformation in a dataset pipeline.")
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-
 @tf_export("data.experimental.prefetch_to_device")
 def prefetch_to_device(device, buffer_size=None):
   """A transformation that prefetches dataset values to the given `device`.
@@ -350,7 +50,8 @@ def prefetch_to_device(device, buffer_size=None):
     `tf.data.Dataset.apply`.
   """
   def _apply_fn(dataset):
-    return _PrefetchToDeviceDataset(dataset, device, buffer_size)
+    return dataset.apply(
+        copy_to_device(target_device=device)).prefetch(buffer_size)
 
   return _apply_fn
 
@@ -369,8 +70,14 @@ def copy_to_device(target_device, source_device="/cpu:0"):
   """
 
   def _apply_fn(dataset):
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.apply_default_optimizations = False
+    options.experimental_optimization = opt_options
     return _CopyToDeviceDataset(
-        dataset, target_device=target_device, source_device=source_device)
+        dataset, target_device=target_device,
+        source_device=source_device).with_options(options)
 
   return _apply_fn
 
@@ -378,7 +85,7 @@ def copy_to_device(target_device, source_device="/cpu:0"):
 # TODO(rohanj): Use the _input_hostmem attr on the RemoteCall ops to indicate
 # all inputs to the Op are in host memory, thereby avoiding some unnecessary
 # Sends and Recvs.
-class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
+class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that copies elements to another device."""
 
   def __init__(self, input_dataset, target_device, source_device="/cpu:0"):
@@ -397,14 +104,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
     self._source_device_string = source_device
     self._source_device = ops.convert_to_tensor(source_device)
 
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._input_dataset.output_shapes,
-                               self._input_dataset.output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._input_dataset.output_types,
-                              self._input_dataset.output_classes))
-
-    @function.Defun()
+    @function.defun()
     def _init_func():
       """Creates an iterator for the input dataset.
 
@@ -414,24 +114,25 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
       # pylint: disable=protected-access
       ds_variant = self._input_dataset._as_variant_tensor()
       resource = gen_dataset_ops.anonymous_iterator(
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self._input_dataset))
       with ops.control_dependencies(
           [gen_dataset_ops.make_iterator(ds_variant, resource)]):
         return gen_dataset_ops.iterator_to_string_handle(resource)
 
-    @function.Defun()
+    init_func_concrete = _init_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
+    @function.defun()
     def _remote_init_func():
       return functional_ops.remote_call(
           target=self._source_device,
-          args=_init_func.captured_inputs,
+          args=init_func_concrete.captured_inputs,
           Tout=[dtypes.string],
-          f=_init_func)
+          f=init_func_concrete)
 
-    self._init_func = _remote_init_func
-    self._init_captured_args = _remote_init_func.captured_inputs
+    self._init_func = _remote_init_func._get_concrete_function_internal()  # pylint: disable=protected-access
+    self._init_captured_args = self._init_func.captured_inputs
 
-    @function.Defun(dtypes.string)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _next_func(string_handle):
       """Calls get_next for created iterator.
 
@@ -444,21 +145,23 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
         iterator = iterator_ops.Iterator.from_string_handle(
             string_handle, self.output_types, self.output_shapes,
             self.output_classes)
-      ret = iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+      return self._element_structure._to_tensor_list(iterator.get_next())  # pylint: disable=protected-access
+
+    next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
-    @function.Defun(dtypes.string)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _remote_next_func(string_handle):
       return functional_ops.remote_call(
           target=self._source_device,
-          args=[string_handle] + _next_func.captured_inputs,
-          Tout=self._flat_output_types,
-          f=_next_func)
+          args=[string_handle] +
+          next_func_concrete.captured_inputs,
+          Tout=self._input_dataset._element_structure._flat_types,  # pylint: disable=protected-access
+          f=next_func_concrete)
 
-    self._next_func = _remote_next_func
-    self._next_captured_args = _remote_next_func.captured_inputs
+    self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
+    self._next_captured_args = self._next_func.captured_inputs
 
-    @function.Defun(dtypes.string)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _finalize_func(string_handle):
       """Destroys the iterator resource created.
 
@@ -469,28 +172,30 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
       """
       iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
           string_handle,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self._input_dataset))
       with ops.control_dependencies([
           resource_variable_ops.destroy_resource_op(
               iterator_resource, ignore_lookup_error=True)]):
         return array_ops.constant(0, dtypes.int64)
 
-    @function.Defun(dtypes.string)
+    finalize_func_concrete = _finalize_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _remote_finalize_func(string_handle):
       return functional_ops.remote_call(
           target=self._source_device,
-          args=[string_handle] + _finalize_func.captured_inputs,
+          args=[string_handle] + finalize_func_concrete.captured_inputs,
           Tout=[dtypes.int64],
-          f=_finalize_func)
+          f=finalize_func_concrete)
 
-    self._finalize_func = _remote_finalize_func
-    self._finalize_captured_args = _remote_finalize_func.captured_inputs
+    self._finalize_func = _remote_finalize_func._get_concrete_function_internal(  # pylint: disable=protected-access
+    )
+    self._finalize_captured_args = self._finalize_func.captured_inputs
 
     g = ops.get_default_graph()
-    _remote_init_func.add_to_graph(g)
-    _remote_next_func.add_to_graph(g)
-    _remote_finalize_func.add_to_graph(g)
+    self._init_func.add_to_graph(g)
+    self._next_func.add_to_graph(g)
+    self._finalize_func.add_to_graph(g)
     # pylint: enable=protected-scope
 
   # The one_shot_iterator implementation needs a 0 arg _make_dataset function
@@ -515,17 +220,63 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
           init_func=self._init_func,
           next_func=self._next_func,
           finalize_func=self._finalize_func,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self._input_dataset))
 
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
+class _MapOnGpuDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that maps a function over elements in its using a GPU."""
+
+  def __init__(self, input_dataset, map_func, use_inter_op_parallelism=True):
+    """See `Dataset.map()` for details."""
+    super(_MapOnGpuDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._use_inter_op_parallelism = use_inter_op_parallelism
+
+    self._map_func = dataset_ops.StructuredFunctionWrapper(
+        map_func,
+        self._transformation_name(),
+        dataset=input_dataset,
+        defun_kwargs={"experimental_ints_on_device": True})
+
+  def _functions(self):
+    return [self._map_func]
+
+  def _as_variant_tensor(self):
+    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
+    return ged_ops.experimental_map_dataset(
+        input_t,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
+        use_inter_op_parallelism=self._use_inter_op_parallelism,
+        **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
+  def _element_structure(self):
+    return self._map_func.output_structure
+
+  def _transformation_name(self):
+    return "map_on_gpu()"
+
+
+def map_on_gpu(map_func):
+  """Maps `map_func` across the elements of this dataset.
+
+  NOTE: This is a highly experimental version of `tf.data.Dataset.map` that runs
+  `map_func` on GPU. It must be used after applying the
+  `tf.data.experimental.copy_to_device` transformation with a GPU device
+  argument.
+
+  Args:
+    map_func: A function mapping a nested structure of tensors (having shapes
+      and types defined by `self.output_shapes` and `self.output_types`) to
+      another nested structure of tensors.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _MapOnGpuDataset(dataset, map_func)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py
index e3a2aeab31ea35ee9636821e3e8b8db35ed72b65..cbdf367db6bd5b4ce27e636c08a19cd4fedda041 100644
--- a/tensorflow/python/data/experimental/ops/random_ops.py
+++ b/tensorflow/python/data/experimental/ops/random_ops.py
@@ -17,38 +17,46 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import random_seed
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.experimental.RandomDataset")
-class RandomDataset(dataset_ops.DatasetSource):
+@tf_export("data.experimental.RandomDataset", v1=[])
+class RandomDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` of pseudorandom values."""
 
   def __init__(self, seed=None):
     """A `Dataset` of pseudorandom values."""
-    super(RandomDataset, self).__init__()
+    super(RandomDatasetV2, self).__init__()
     self._seed, self._seed2 = random_seed.get_seed(seed)
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.random_dataset(
+    return gen_experimental_dataset_ops.experimental_random_dataset(
         seed=self._seed,
         seed2=self._seed2,
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return ops.Tensor
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.int64, [])
 
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
 
-  @property
-  def output_types(self):
-    return dtypes.int64
+@tf_export(v1=["data.experimental.RandomDataset"])
+class RandomDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` of pseudorandom values."""
+
+  @functools.wraps(RandomDatasetV2.__init__)
+  def __init__(self, seed=None):
+    wrapped = RandomDatasetV2(seed)
+    super(RandomDatasetV1, self).__init__(wrapped)
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# this alias in place.
+RandomDataset = RandomDatasetV1
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
index 3b2d0945148e44a0c800b4a661b88fc921e93507..c2d82aeb59174fb9d35c4cc2c3d850fb351d8a90 100644
--- a/tensorflow/python/data/experimental/ops/readers.py
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import collections
 import csv
+import functools
 
 import numpy as np
 
@@ -31,13 +32,13 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import file_io
-from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.util.tf_export import tf_export
 
@@ -306,8 +307,8 @@ def make_tf_record_dataset(file_pattern,
     return dataset.prefetch(buffer_size=prefetch_buffer_size)
 
 
-@tf_export("data.experimental.make_csv_dataset")
-def make_csv_dataset(
+@tf_export("data.experimental.make_csv_dataset", v1=[])
+def make_csv_dataset_v2(
     file_pattern,
     batch_size,
     column_names=None,
@@ -506,11 +507,42 @@ def make_csv_dataset(
   return dataset
 
 
+@tf_export(v1=["data.experimental.make_csv_dataset"])
+def make_csv_dataset_v1(
+    file_pattern,
+    batch_size,
+    column_names=None,
+    column_defaults=None,
+    label_name=None,
+    select_columns=None,
+    field_delim=",",
+    use_quote_delim=True,
+    na_value="",
+    header=True,
+    num_epochs=None,
+    shuffle=True,
+    shuffle_buffer_size=10000,
+    shuffle_seed=None,
+    prefetch_buffer_size=optimization.AUTOTUNE,
+    num_parallel_reads=1,
+    sloppy=False,
+    num_rows_for_inference=100,
+    compression_type=None,
+):  # pylint: disable=missing-docstring
+  return dataset_ops.DatasetV1Adapter(make_csv_dataset_v2(
+      file_pattern, batch_size, column_names, column_defaults, label_name,
+      select_columns, field_delim, use_quote_delim, na_value, header,
+      num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
+      prefetch_buffer_size, num_parallel_reads, sloppy, num_rows_for_inference,
+      compression_type))
+make_csv_dataset_v1.__doc__ = make_csv_dataset_v2.__doc__
+
+
 _DEFAULT_READER_BUFFER_SIZE_BYTES = 4 * 1024 * 1024  # 4 MB
 
 
-@tf_export("data.experimental.CsvDataset")
-class CsvDataset(dataset_ops.DatasetSource):
+@tf_export("data.experimental.CsvDataset", v1=[])
+class CsvDatasetV2(dataset_ops.DatasetSource):
   """A Dataset comprising lines from one or more CSV files."""
 
   def __init__(self,
@@ -540,7 +572,9 @@ class CsvDataset(dataset_ops.DatasetSource):
 
     We can construct a CsvDataset from it as follows:
     ```python
-    dataset = tf.data.experimental.CsvDataset(
+    tf.enable_eager_execution()
+
+     dataset = tf.data.experimental.CsvDataset(
         "my_file*.csv",
         [tf.float32,  # Required field, use dtype or empty tensor
          tf.constant([0.0], dtype=tf.float32),  # Optional field, default to 0.0
@@ -552,13 +586,8 @@ class CsvDataset(dataset_ops.DatasetSource):
 
     The expected output of its iterations is:
     ```python
-    next_element = dataset.make_one_shot_iterator().get_next()
-    with tf.Session() as sess:
-      while True:
-        try:
-          print(sess.run(next_element))
-        except tf.errors.OutOfRangeError:
-          break
+    for element in dataset:
+      print(element)
 
     >> (4.28e10, 5.55e6, 12)
     >> (-5.3e14, 0.0, 2)
@@ -593,7 +622,7 @@ class CsvDataset(dataset_ops.DatasetSource):
         the input data. If specified, only this subset of columns will be
         parsed. Defaults to parsing all columns.
     """
-    super(CsvDataset, self).__init__()
+    super(CsvDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._compression_type = convert.optional_param_to_tensor(
@@ -623,11 +652,9 @@ class CsvDataset(dataset_ops.DatasetSource):
         argument_default=[],
         argument_dtype=dtypes.int64,
     )
-    self._output_shapes = tuple(
-        tensor_shape.scalar() for _ in range(len(record_defaults)))
-    self._output_types = tuple(d.dtype for d in self._record_defaults)
-    self._output_classes = tuple(
-        ops.Tensor for _ in range(len(record_defaults)))
+    self._structure = structure.NestedStructure(
+        tuple(structure.TensorStructure(d.dtype, [])
+              for d in self._record_defaults))
 
   def _as_variant_tensor(self):
     # Constructs graph node for the dataset op.
@@ -636,7 +663,7 @@ class CsvDataset(dataset_ops.DatasetSource):
         record_defaults=self._record_defaults,
         buffer_size=self._buffer_size,
         header=self._header,
-        output_shapes=self._output_shapes,
+        output_shapes=self._structure._flat_shapes,  # pylint: disable=protected-access
         field_delim=self._field_delim,
         use_quote_delim=self._use_quote_delim,
         na_value=self._na_value,
@@ -645,34 +672,47 @@ class CsvDataset(dataset_ops.DatasetSource):
     )
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_shapes(self):
-    return self._output_shapes
 
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-
-@tf_export("data.experimental.make_batched_features_dataset")
-def make_batched_features_dataset(file_pattern,
-                                  batch_size,
-                                  features,
-                                  reader=core_readers.TFRecordDataset,
-                                  label_key=None,
-                                  reader_args=None,
-                                  num_epochs=None,
-                                  shuffle=True,
-                                  shuffle_buffer_size=10000,
-                                  shuffle_seed=None,
-                                  prefetch_buffer_size=optimization.AUTOTUNE,
-                                  reader_num_threads=1,
-                                  parser_num_threads=2,
-                                  sloppy_ordering=False,
-                                  drop_final_batch=False):
+@tf_export(v1=["data.experimental.CsvDataset"])
+class CsvDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A Dataset comprising lines from one or more CSV files."""
+
+  @functools.wraps(CsvDatasetV2.__init__)
+  def __init__(self,
+               filenames,
+               record_defaults,
+               compression_type=None,
+               buffer_size=None,
+               header=False,
+               field_delim=",",
+               use_quote_delim=True,
+               na_value="",
+               select_cols=None):
+    wrapped = CsvDatasetV2(filenames, record_defaults, compression_type,
+                           buffer_size, header, field_delim, use_quote_delim,
+                           na_value, select_cols)
+    super(CsvDatasetV1, self).__init__(wrapped)
+
+
+@tf_export("data.experimental.make_batched_features_dataset", v1=[])
+def make_batched_features_dataset_v2(file_pattern,
+                                     batch_size,
+                                     features,
+                                     reader=core_readers.TFRecordDataset,
+                                     label_key=None,
+                                     reader_args=None,
+                                     num_epochs=None,
+                                     shuffle=True,
+                                     shuffle_buffer_size=10000,
+                                     shuffle_seed=None,
+                                     prefetch_buffer_size=optimization.AUTOTUNE,
+                                     reader_num_threads=1,
+                                     parser_num_threads=2,
+                                     sloppy_ordering=False,
+                                     drop_final_batch=False):
   """Returns a `Dataset` of feature dictionaries from `Example` protos.
 
   If label_key argument is provided, returns a `Dataset` of tuple
@@ -760,6 +800,7 @@ def make_batched_features_dataset(file_pattern,
     Each `dict` maps feature keys to `Tensor` or `SparseTensor` objects.
 
   Raises:
+    TypeError: If `reader` is a `tf.ReaderBase` subclass.
     ValueError: If `label_key` is not one of the `features` keys.
   """
   # Create dataset of all matching filenames
@@ -768,6 +809,12 @@ def make_batched_features_dataset(file_pattern,
   if shuffle:
     dataset = dataset.shuffle(len(filenames), shuffle_seed)
 
+  if isinstance(reader, type) and issubclass(reader, io_ops.ReaderBase):
+    raise TypeError("The `reader` argument must return a `Dataset` object. "
+                    "`tf.ReaderBase` subclasses are not supported. For "
+                    "example, pass `tf.data.TFRecordDataset` instead of "
+                    "`tf.TFRecordReader`.")
+
   # Read `Example` records from files as tensor objects.
   if reader_args is None:
     reader_args = []
@@ -811,6 +858,31 @@ def make_batched_features_dataset(file_pattern,
   return dataset
 
 
+@tf_export(v1=["data.experimental.make_batched_features_dataset"])
+def make_batched_features_dataset_v1(file_pattern,  # pylint: disable=missing-docstring
+                                     batch_size,
+                                     features,
+                                     reader=core_readers.TFRecordDataset,
+                                     label_key=None,
+                                     reader_args=None,
+                                     num_epochs=None,
+                                     shuffle=True,
+                                     shuffle_buffer_size=10000,
+                                     shuffle_seed=None,
+                                     prefetch_buffer_size=optimization.AUTOTUNE,
+                                     reader_num_threads=1,
+                                     parser_num_threads=2,
+                                     sloppy_ordering=False,
+                                     drop_final_batch=False):
+  return dataset_ops.DatasetV1Adapter(make_batched_features_dataset_v2(
+      file_pattern, batch_size, features, reader, label_key, reader_args,
+      num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
+      prefetch_buffer_size, reader_num_threads, parser_num_threads,
+      sloppy_ordering, drop_final_batch))
+make_batched_features_dataset_v2.__doc__ = (
+    make_batched_features_dataset_v1.__doc__)
+
+
 def _get_file_names(file_pattern, shuffle):
   """Parse list of file names from pattern, optionally shuffled.
 
@@ -842,8 +914,8 @@ def _get_file_names(file_pattern, shuffle):
   return file_names
 
 
-@tf_export("data.experimental.SqlDataset")
-class SqlDataset(dataset_ops.DatasetSource):
+@tf_export("data.experimental.SqlDataset", v1=[])
+class SqlDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` consisting of the results from a SQL query."""
 
   def __init__(self, driver_name, data_source_name, query, output_types):
@@ -853,17 +925,14 @@ class SqlDataset(dataset_ops.DatasetSource):
     For example:
 
     ```python
+    tf.enable_eager_execution()
+
     dataset = tf.data.experimental.SqlDataset("sqlite", "/foo/bar.sqlite3",
                                               "SELECT name, age FROM people",
                                               (tf.string, tf.int32))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
     # Prints the rows of the result set of the above query.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
+    for element in dataset:
+      print(element)
     ```
 
     Args:
@@ -875,30 +944,40 @@ class SqlDataset(dataset_ops.DatasetSource):
       output_types: A tuple of `tf.DType` objects representing the types of the
         columns returned by `query`.
     """
-    super(SqlDataset, self).__init__()
+    super(SqlDatasetV2, self).__init__()
     self._driver_name = ops.convert_to_tensor(
         driver_name, dtype=dtypes.string, name="driver_name")
     self._data_source_name = ops.convert_to_tensor(
         data_source_name, dtype=dtypes.string, name="data_source_name")
     self._query = ops.convert_to_tensor(
         query, dtype=dtypes.string, name="query")
-    self._output_types = output_types
+    self._structure = structure.NestedStructure(
+        nest.map_structure(
+            lambda dtype: structure.TensorStructure(dtype, []), output_types))
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.sql_dataset(self._driver_name,
-                                       self._data_source_name, self._query,
-                                       nest.flatten(self.output_types),
-                                       nest.flatten(self.output_shapes))
+    return gen_experimental_dataset_ops.experimental_sql_dataset(
+        self._driver_name, self._data_source_name, self._query,
+        nest.flatten(self.output_types), nest.flatten(self.output_shapes))
 
   @property
-  def output_classes(self):
-    return nest.map_structure(lambda _: ops.Tensor, self._output_types)
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_shapes(self):
-    return nest.map_structure(lambda _: tensor_shape.TensorShape([]),
-                              self._output_types)
 
-  @property
-  def output_types(self):
-    return self._output_types
+@tf_export(v1=["data.experimental.SqlDataset"])
+class SqlDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` consisting of the results from a SQL query."""
+
+  @functools.wraps(SqlDatasetV2.__init__)
+  def __init__(self, driver_name, data_source_name, query, output_types):
+    wrapped = SqlDatasetV2(driver_name, data_source_name, query, output_types)
+    super(SqlDatasetV1, self).__init__(wrapped)
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# these aliases in place.
+CsvDataset = CsvDatasetV1
+SqlDataset = SqlDatasetV1
+make_batched_features_dataset = make_batched_features_dataset_v1
+make_csv_dataset = make_csv_dataset_v1
diff --git a/tensorflow/python/data/experimental/ops/scan_ops.py b/tensorflow/python/data/experimental/ops/scan_ops.py
index e05e7c5a18755af40b77362ff16abf56714e16ed..5c77ad734348401ed666c562b36ef52ec8c5525b 100644
--- a/tensorflow/python/data/experimental/ops/scan_ops.py
+++ b/tensorflow/python/data/experimental/ops/scan_ops.py
@@ -21,10 +21,10 @@ import collections
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -49,18 +49,7 @@ class _ScanDataset(dataset_ops.UnaryDataset):
     # Compute initial values for the state classes, shapes and types based on
     # the initial state. The shapes may be refined by running `tf_scan_func` one
     # or more times below.
-    self._state_classes = sparse.get_classes(self._initial_state)
-    self._state_shapes = nest.pack_sequence_as(
-        self._initial_state,
-        [t.get_shape() for t in nest.flatten(self._initial_state)])
-    self._state_types = nest.pack_sequence_as(
-        self._initial_state,
-        [t.dtype for t in nest.flatten(self._initial_state)])
-
-    # Will be populated by calling `tf_scan_func`.
-    self._output_classes = None
-    self._output_shapes = None
-    self._output_types = None
+    self._state_structure = structure.Structure.from_value(self._initial_state)
 
     # Iteratively rerun the scan function until reaching a fixed point on
     # `self._state_shapes`.
@@ -69,10 +58,9 @@ class _ScanDataset(dataset_ops.UnaryDataset):
 
       wrapped_func = dataset_ops.StructuredFunctionWrapper(
           scan_func,
-          "tf.data.experimental.scan()",
-          input_classes=(self._state_classes, input_dataset.output_classes),
-          input_shapes=(self._state_shapes, input_dataset.output_shapes),
-          input_types=(self._state_types, input_dataset.output_types),
+          self._transformation_name(),
+          input_structure=structure.NestedStructure(
+              (self._state_structure, input_dataset._element_structure)),  # pylint: disable=protected-access
           add_to_graph=False)
       if not (
           isinstance(wrapped_func.output_types, collections.Sequence) and
@@ -83,29 +71,35 @@ class _ScanDataset(dataset_ops.UnaryDataset):
       new_state_classes, self._output_classes = wrapped_func.output_classes
 
       # Extract and validate class information from the returned values.
-      for new_state_class, state_class in zip(
+      new_state_classes, output_classes = wrapped_func.output_classes
+      old_state_classes = self._state_structure._to_legacy_output_classes()  # pylint: disable=protected-access
+      for new_state_class, old_state_class in zip(
           nest.flatten(new_state_classes),
-          nest.flatten(self._state_classes)):
-        if not issubclass(new_state_class, state_class):
+          nest.flatten(old_state_classes)):
+        if not issubclass(new_state_class, old_state_class):
           raise TypeError(
               "The element classes for the new state must match the initial "
               "state. Expected %s; got %s." %
-              (self._state_classes, new_state_classes))
+              (old_state_classes, new_state_classes))
 
       # Extract and validate type information from the returned values.
-      new_state_types, self._output_types = wrapped_func.output_types
-      for new_state_type, state_type in zip(
-          nest.flatten(new_state_types), nest.flatten(self._state_types)):
-        if new_state_type != state_type:
+      new_state_types, output_types = wrapped_func.output_types
+      old_state_types = self._state_structure._to_legacy_output_types()  # pylint: disable=protected-access
+      for new_state_type, old_state_type in zip(
+          nest.flatten(new_state_types), nest.flatten(old_state_types)):
+        if new_state_type != old_state_type:
           raise TypeError(
               "The element types for the new state must match the initial "
               "state. Expected %s; got %s." %
-              (self._state_types, new_state_types))
+              (old_state_types, new_state_types))
 
       # Extract shape information from the returned values.
-      new_state_shapes, self._output_shapes = wrapped_func.output_shapes
+      new_state_shapes, output_shapes = wrapped_func.output_shapes
+      old_state_shapes = self._state_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
+      self._structure = structure.convert_legacy_structure(
+          output_types, output_shapes, output_classes)
 
-      flat_state_shapes = nest.flatten(self._state_shapes)
+      flat_state_shapes = nest.flatten(old_state_shapes)
       flat_new_state_shapes = nest.flatten(new_state_shapes)
       weakened_state_shapes = [
           original.most_specific_compatible_shape(new)
@@ -122,32 +116,37 @@ class _ScanDataset(dataset_ops.UnaryDataset):
           break
 
       if need_to_rerun:
-        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
-                                                   weakened_state_shapes)
+        # TODO(b/110122868): Support a "most specific compatible structure"
+        # method for combining structures, to avoid using legacy structures
+        # in this method.
+        self._state_structure = structure.convert_legacy_structure(
+            old_state_types,
+            nest.pack_sequence_as(old_state_shapes, weakened_state_shapes),
+            old_state_classes)
 
-    self._scan_func = wrapped_func.function
-    self._scan_func.add_to_graph(ops.get_default_graph())
+    self._scan_func = wrapped_func
+    self._scan_func.function.add_to_graph(ops.get_default_graph())
+
+  def _functions(self):
+    return [self._scan_func]
 
   def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return gen_dataset_ops.scan_dataset(
+    # pylint: disable=protected-access
+    input_t = self._input_dataset._as_variant_tensor()
+    return gen_experimental_dataset_ops.experimental_scan_dataset(
         input_t,
-        nest.flatten(sparse.serialize_sparse_tensors(self._initial_state)),
-        self._scan_func.captured_inputs,
-        f=self._scan_func,
+        self._state_structure._to_tensor_list(self._initial_state),
+        self._scan_func.function.captured_inputs,
+        f=self._scan_func.function,
+        preserve_cardinality=True,
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_types(self):
-    return self._output_types
+  def _transformation_name(self):
+    return "tf.data.experimental.scan()"
 
 
 @tf_export("data.experimental.scan")
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index a4307212daf488deae986073264911fcf778588f..d12328a7145992880aedd939d7a02a8a12c61d4c 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -26,7 +26,7 @@ from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
+class _ShuffleAndRepeatDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that fuses `shuffle` and `repeat`."""
 
   def __init__(self, input_dataset, buffer_size, count=None, seed=None):
@@ -53,18 +53,6 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
         **dataset_ops.flat_structure(self))
     # pylint: enable=protected-access
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 @tf_export("data.experimental.shuffle_and_repeat")
 def shuffle_and_repeat(buffer_size, count=None, seed=None):
diff --git a/tensorflow/python/data/experimental/ops/sleep.py b/tensorflow/python/data/experimental/ops/sleep.py
new file mode 100644
index 0000000000000000000000000000000000000000..2da832395b2e665168c1cd9cd7f52fb13e50c830
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/sleep.py
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for controlling threading in `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
+
+
+class _SleepDataset(dataset_ops.UnaryUnchangedStructureDataset):
+  """A `Dataset` that sleeps before producing each upstream element."""
+
+  def __init__(self, input_dataset, sleep_microseconds):
+    super(_SleepDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._sleep_microseconds = sleep_microseconds
+
+  def _as_variant_tensor(self):
+    return gen_experimental_dataset_ops.experimental_sleep_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._sleep_microseconds,
+        **dataset_ops.flat_structure(self))
+
+
+def sleep(sleep_microseconds):
+  """Sleeps for `sleep_microseconds` before producing each input element.
+
+  Args:
+    sleep_microseconds: The number of microseconds to sleep before producing an
+      input element.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _SleepDataset(dataset, sleep_microseconds)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5fcc033ab7df34369e0680275df744c431ed069
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py
@@ -0,0 +1,83 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""StatsAggregator for aggregating statistics from `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.StatsAggregator")
+class StatsAggregator(object):
+  """A stateful resource that aggregates statistics from one or more iterators.
+
+  To record statistics, use one of the custom transformation functions defined
+  in this module when defining your `tf.data.Dataset`. All statistics will be
+  aggregated by the `StatsAggregator` that is associated with a particular
+  iterator (see below). For example, to record the latency of producing each
+  element by iterating over a dataset:
+
+  ```python
+  dataset = ...
+  dataset = dataset.apply(tf.data.experimental.latency_stats("total_bytes"))
+  ```
+
+  To associate a `StatsAggregator` with a `tf.data.Dataset` object, use
+  the following pattern:
+
+  ```python
+  aggregator = tf.data.experimental.StatsAggregator()
+  dataset = ...
+
+  # Apply `StatsOptions` to associate `dataset` with `aggregator`.
+  options = dataset_ops.Options()
+  options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
+  dataset = dataset.with_options(options)
+  ```
+
+  To get a protocol buffer summary of the currently aggregated statistics,
+  use the `StatsAggregator.get_summary()` tensor. The easiest way to do this
+  is to add the returned tensor to the `tf.GraphKeys.SUMMARIES` collection,
+  so that the summaries will be included with any existing summaries.
+
+  ```python
+  aggregator = tf.data.experimental.StatsAggregator()
+  # ...
+  stats_summary = aggregator.get_summary()
+  tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary)
+  ```
+
+  Note: This interface is experimental and expected to change. In particular,
+  we expect to add other implementations of `StatsAggregator` that provide
+  different ways of exporting statistics, and add more types of statistics.
+  """
+
+  def __init__(self):
+    """Creates a `StatsAggregator`."""
+    self._resource = ged_ops.experimental_stats_aggregator_handle()
+
+  # TODO(b/116314787): Update this/add support for V2 summary API.
+  def get_summary(self):
+    """Returns a string `tf.Tensor` that summarizes the aggregated statistics.
+
+    The returned tensor will contain a serialized `tf.summary.Summary` protocol
+    buffer, which can be used with the standard TensorBoard logging facilities.
+
+    Returns:
+      A scalar string `tf.Tensor` that summarizes the aggregated statistics.
+    """
+    return ged_ops.experimental_stats_aggregator_summary(self._resource)
diff --git a/tensorflow/python/data/experimental/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py
index 54ef6fc3e85711e5b581c850f3a94da2bdd8c2af..15a9d24546e950543cc3274dbead26178620b5ed 100644
--- a/tensorflow/python/data/experimental/ops/stats_ops.py
+++ b/tensorflow/python/data/experimental/ops/stats_ops.py
@@ -20,113 +20,21 @@ from __future__ import print_function
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.experimental.StatsAggregator")
-class StatsAggregator(object):
-  """A stateful resource that aggregates statistics from one or more iterators.
-
-  To record statistics, use one of the custom transformation functions defined
-  in this module when defining your `tf.data.Dataset`. All statistics will be
-  aggregated by the `StatsAggregator` that is associated with a particular
-  iterator (see below). For example, to record the latency of producing each
-  element by iterating over a dataset:
-
-  ```python
-  dataset = ...
-  dataset = dataset.apply(tf.data.experimental.latency_stats("total_bytes"))
-  ```
-
-  To associate a `StatsAggregator` with a `tf.data.Dataset` object, use
-  the following pattern:
-
-  ```python
-  stats_aggregator = stats_ops.StatsAggregator()
-  dataset = ...
-
-  # Apply `set_stats_aggregator` to associate `dataset` with `stats_aggregator`.
-  dataset = dataset.apply(
-      tf.data.experimental.set_stats_aggregator(stats_aggregator))
-  iterator = dataset.make_one_shot_iterator()
-  ```
-
-  To get a protocol buffer summary of the currently aggregated statistics,
-  use the `StatsAggregator.get_summary()` tensor. The easiest way to do this
-  is to add the returned tensor to the `tf.GraphKeys.SUMMARIES` collection,
-  so that the summaries will be included with any existing summaries.
-
-  ```python
-  stats_aggregator = stats_ops.StatsAggregator()
-  # ...
-  stats_summary = stats_aggregator.get_summary()
-  tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary)
-  ```
-
-  Note: This interface is experimental and expected to change. In particular,
-  we expect to add other implementations of `StatsAggregator` that provide
-  different ways of exporting statistics, and add more types of statistics.
-  """
-
-  def __init__(self):
-    """Creates a `StatsAggregator`."""
-    self._resource = gen_dataset_ops.stats_aggregator_handle()
-
-  # TODO(b/116314787): Update this/add support for V2 summary API.
-  def get_summary(self):
-    """Returns a string `tf.Tensor` that summarizes the aggregated statistics.
-
-    The returned tensor will contain a serialized `tf.summary.Summary` protocol
-    buffer, which can be used with the standard TensorBoard logging facilities.
-
-    Returns:
-      A scalar string `tf.Tensor` that summarizes the aggregated statistics.
-    """
-    return gen_dataset_ops.stats_aggregator_summary(self._resource)
-
-
-class _SetStatsAggregatorDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that acts as an identity, and sets given stats_aggregator."""
-
-  def __init__(self, input_dataset, stats_aggregator, tag, prefix):
-    super(_SetStatsAggregatorDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    self._stats_aggregator = stats_aggregator
-    self._tag = tag
-    self._prefix = prefix
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.set_stats_aggregator_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._stats_aggregator._resource,  # pylint: disable=protected-access
-        self._tag,
-        self._prefix,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-
-@tf_export("data.experimental.set_stats_aggregator")
-def set_stats_aggregator(stats_aggregator, tag="", counter_prefix=""):
+@deprecation.deprecated(None, "Use `tf.data.experimental.StatsOptions`.")
+def set_stats_aggregator(stats_aggregator, prefix="", counter_prefix=""):
   """Set the given `stats_aggregator` for aggregating the input dataset stats.
 
   Args:
     stats_aggregator: A `tf.contrib.data.StatsAggregator` object.
-    tag: (Optional) String, all statistics recorded for the input `dataset`
-      will have given `tag` prepend with the name.
+    prefix: (Optional) String, all statistics recorded for the input `dataset`
+      will have given `prefix` prepend with the name.
     counter_prefix: (Optional) String, all statistics recorded as `counters`
-      will have the given `prefix` for the counter. Defaults to "/tesorflow".
+      will have the given `prefix` for the counter. Defaults to "/tensorflow".
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -134,8 +42,8 @@ def set_stats_aggregator(stats_aggregator, tag="", counter_prefix=""):
   """
 
   def _apply_fn(dataset):
-    return _SetStatsAggregatorDataset(dataset, stats_aggregator, tag,
-                                      counter_prefix)
+    return dataset_ops._SetStatsAggregatorDataset(  # pylint: disable=protected-access
+        dataset, stats_aggregator, prefix, counter_prefix)
 
   return _apply_fn
 
@@ -158,8 +66,10 @@ def bytes_produced_stats(tag):
   """
 
   def _apply_fn(dataset):
-    return _StatsDataset(dataset, gen_dataset_ops.bytes_produced_stats_dataset,
-                         tag)
+    return _StatsDataset(
+        dataset,
+        gen_experimental_dataset_ops.experimental_bytes_produced_stats_dataset,
+        tag)
 
   return _apply_fn
 
@@ -181,12 +91,14 @@ def latency_stats(tag):
   """
 
   def _apply_fn(dataset):
-    return _StatsDataset(dataset, gen_dataset_ops.latency_stats_dataset, tag)
+    return _StatsDataset(
+        dataset,
+        gen_experimental_dataset_ops.experimental_latency_stats_dataset, tag)
 
   return _apply_fn
 
 
-class _StatsDataset(dataset_ops.UnaryDataset):
+class _StatsDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and also records statistics."""
 
   def __init__(self, input_dataset, op_function, tag):
@@ -200,15 +112,3 @@ class _StatsDataset(dataset_ops.UnaryDataset):
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._tag,
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
diff --git a/tensorflow/python/data/experimental/ops/stats_options.py b/tensorflow/python/data/experimental/ops/stats_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e884aa08ae9173df0fda0e81e176644cd342bfa
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/stats_options.py
@@ -0,0 +1,79 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""StatsOptions to configure stats aggregation options for `tf.data` pipelines.
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import stats_aggregator
+from tensorflow.python.data.util import options
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.StatsOptions")
+class StatsOptions(options.OptionsBase):
+  """Represents options for collecting dataset stats using `StatsAggregator`.
+
+  To apply `StatsOptions` with a `tf.data.Dataset` object, use the following
+  pattern:
+
+  ```python
+  aggregator = tf.data.experimental.StatsAggregator()
+
+  options = tf.data.Options()
+  options.experimental_stats = tf.data.experimental.StatsOptions()
+  options.experimental_stats.aggregator = aggregator
+  dataset = dataset.with_options(options)
+  ```
+
+  Note: a `StatsAggregator` object can be attached either duing construction or
+  can be provided later like in above example.
+
+  ```python
+  aggretator = tf.data.experimental.StatsAggregator()
+  # attach aggregator during construction
+  options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
+  .....
+  ```
+  """
+
+  aggregator = options.create_option(
+      name="aggregator",
+      ty=stats_aggregator.StatsAggregator,
+      docstring=
+      "Associates the given statistics aggregator with the dataset pipeline.")
+
+  prefix = options.create_option(
+      name="prefix",
+      ty=str,
+      docstring=
+      "Prefix to prepend all statistics recorded for the input `dataset` with.",
+      default="")
+
+  counter_prefix = options.create_option(
+      name="counter_prefix",
+      ty=str,
+      docstring=
+      "Prefix for the statistics recorded as counter.",
+      default="")
+
+  latency_all_edges = options.create_option(
+      name="latency_all_edges",
+      ty=bool,
+      docstring=
+      "Whether to add latency measurements on all edges.",
+      default=True)
diff --git a/tensorflow/python/data/experimental/ops/threading_options.py b/tensorflow/python/data/experimental/ops/threading_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbf662186f818a24a3b19ea678f87351ab45ed6e
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/threading_options.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for controlling threading in `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.data.util import options
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.ThreadingOptions")
+class ThreadingOptions(options.OptionsBase):
+  """Represents options for dataset threading.
+
+  To apply `ThreadingOptions` to a `dataset` object, use the following pattern:
+
+  ```python
+  options = tf.data.Options()
+  options.experimental_threading = tf.data.experimental.ThreadingOptions()
+  options.experimental_threading.private_threadpool_size = 10
+  dataset = dataset.with_options(options)
+  ```
+  """
+
+  max_intra_op_parallelism = options.create_option(
+      name="max_intra_op_parallelism",
+      ty=int,
+      docstring=
+      "If set, it overrides the maximum degree of intra-op parallelism.")
+
+  private_threadpool_size = options.create_option(
+      name="private_threadpool_size",
+      ty=int,
+      docstring=
+      "If set, the dataset will use a private threadpool of the given size.",
+      default=None)
diff --git a/tensorflow/python/data/experimental/ops/threadpool.py b/tensorflow/python/data/experimental/ops/threadpool.py
index 3ea017c6e80a1a22a6bd82770db1952aebd38849..69e8829d687fb54767bca1716c259efa150b4887 100644
--- a/tensorflow/python/data/experimental/ops/threadpool.py
+++ b/tensorflow/python/data/experimental/ops/threadpool.py
@@ -60,7 +60,7 @@ class PrivateThreadPool(object):
           display_name=display_name)
 
 
-class _ThreadPoolDataset(dataset_ops.UnaryDataset):
+class _ThreadPoolDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and sets a custom threadpool."""
 
   def __init__(self, input_dataset, thread_pool):
@@ -74,18 +74,6 @@ class _ThreadPoolDataset(dataset_ops.UnaryDataset):
         self._thread_pool._resource,  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
 
 # TODO(b/73383364): Properly export in the `tf.data.experimental` API when
 # stable or make private / remove.
diff --git a/tensorflow/python/data/experimental/ops/unique.py b/tensorflow/python/data/experimental/ops/unique.py
index 2a7775c456e86a9339cdfccf1e05f545238bb145..55ed98d8542187b1bd353e2ca581ef2fd2180875 100644
--- a/tensorflow/python/data/experimental/ops/unique.py
+++ b/tensorflow/python/data/experimental/ops/unique.py
@@ -48,7 +48,7 @@ def unique():
   return _apply_fn
 
 
-class _UniqueDataset(dataset_ops.UnaryDataset):
+class _UniqueDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` contains the unique elements from its input."""
 
   def __init__(self, input_dataset):
@@ -65,15 +65,3 @@ class _UniqueDataset(dataset_ops.UnaryDataset):
     return gen_experimental_dataset_ops.experimental_unique_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/experimental/ops/writers.py b/tensorflow/python/data/experimental/ops/writers.py
index 994447cb4db352432e6f2a672c45ba8242930126..aef6da51409dbe13f59408b650fc5947f088d89d 100644
--- a/tensorflow/python/data/experimental/ops/writers.py
+++ b/tensorflow/python/data/experimental/ops/writers.py
@@ -22,7 +22,7 @@ from tensorflow.python.data.util import convert
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -48,7 +48,7 @@ class TFRecordWriter(object):
     Returns:
       A `tf.Operation` that, when run, writes contents of `dataset` to a file.
     """
-    if not isinstance(dataset, dataset_ops.Dataset):
+    if not isinstance(dataset, dataset_ops.DatasetV2):
       raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
     if (dataset.output_types != dtypes.string or
         dataset.output_shapes != tensor_shape.scalar()):
@@ -56,5 +56,5 @@ class TFRecordWriter(object):
           "`dataset` must produce scalar `DT_STRING` tensors whereas it "
           "produces shape {0} and types {1}".format(dataset.output_shapes,
                                                     dataset.output_types))
-    return gen_dataset_ops.dataset_to_tf_record(
+    return gen_experimental_dataset_ops.experimental_dataset_to_tf_record(
         dataset._as_variant_tensor(), self._filename, self._compression_type)  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index c7295d6e694ce1e0b672cee905ce1edbc520d351..3390100bed5c6dbe937d26f008d794c0fbf3a753 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -8,51 +8,48 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("//tensorflow:tensorflow.bzl", "sycl_py_test")
 
 tf_py_test(
-    name = "batch_dataset_op_test",
+    name = "batch_test",
     size = "small",
-    srcs = ["batch_dataset_op_test.py"],
+    srcs = ["batch_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:sparse_tensor",
     ],
 )
 
 tf_py_test(
-    name = "cache_dataset_op_test",
+    name = "cache_test",
     size = "small",
-    srcs = ["cache_dataset_op_test.py"],
+    srcs = ["cache_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
 tf_py_test(
-    name = "concatenate_dataset_op_test",
+    name = "concatenate_test",
     size = "small",
-    srcs = ["concatenate_dataset_op_test.py"],
+    srcs = ["concatenate_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
@@ -65,146 +62,267 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "dataset_constructor_op_test",
+    name = "dataset_checkpoint_test",
     size = "small",
-    srcs = ["dataset_constructor_op_test.py"],
+    srcs = ["dataset_checkpoint_test.py"],
     additional_deps = [
         ":test_base",
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variables",
+    ],
+)
+
+tf_py_test(
+    name = "dataset_test",
+    size = "small",
+    srcs = ["dataset_test.py"],
+    additional_deps = [
+        ":test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:readers",
         "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-    ],
-    tags = [
-        "manual",
-        "nomac",  # b/62040583
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_tensor",
     ],
 )
 
 tf_py_test(
-    name = "dataset_from_generator_op_test",
-    size = "medium",
-    srcs = ["dataset_from_generator_op_test.py"],
+    name = "filter_test",
+    size = "small",
+    srcs = ["filter_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:sparse",
     ],
 )
 
 tf_py_test(
-    name = "dataset_ops_test",
+    name = "fixed_length_record_dataset_test",
     size = "small",
-    srcs = ["dataset_ops_test.py"],
+    srcs = ["fixed_length_record_dataset_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+tf_py_test(
+    name = "flat_map_test",
+    size = "medium",
+    srcs = ["flat_map_test.py"],
     additional_deps = [
         ":test_base",
-        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:nest",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
+    grpc_enabled = True,
 )
 
 tf_py_test(
-    name = "filter_dataset_op_test",
+    name = "from_generator_test",
+    size = "medium",
+    srcs = ["from_generator_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:session",
+    ],
+)
+
+tf_py_test(
+    name = "from_sparse_tensor_slices_test",
     size = "small",
-    srcs = ["filter_dataset_op_test.py"],
+    srcs = ["from_sparse_tensor_slices_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
     ],
 )
 
 tf_py_test(
-    name = "flat_map_dataset_op_test",
+    name = "from_tensors_test",
     size = "small",
-    srcs = ["flat_map_dataset_op_test.py"],
+    srcs = ["from_tensors_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:session",
-        "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+    tags = [
+        "nomac",  # b/62040583
     ],
-    grpc_enabled = True,
 )
 
 tf_py_test(
-    name = "list_files_dataset_op_test",
+    name = "from_tensor_slices_test",
     size = "small",
-    srcs = ["list_files_dataset_op_test.py"],
+    srcs = ["from_tensor_slices_test.py"],
     additional_deps = [
         ":test_base",
-        "//tensorflow/python:array_ops",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:util",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 tf_py_test(
-    name = "interleave_dataset_op_test",
-    size = "small",
-    srcs = ["interleave_dataset_op_test.py"],
+    name = "interleave_test",
+    size = "medium",
+    srcs = ["interleave_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:session",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
+    ],
+)
+
+tf_py_test(
+    name = "iterator_checkpoint_test",
+    size = "medium",
+    srcs = ["iterator_checkpoint_test.py"],
+    additional_deps = [
+        ":test_base",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python:checkpoint_management",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
     ],
+    grpc_enabled = True,
 )
 
-cuda_py_test(
-    name = "iterator_ops_test",
+tf_py_test(
+    name = "iterator_cluster_test",
     size = "small",
-    srcs = ["iterator_ops_test.py"],
+    srcs = ["iterator_cluster_test.py"],
+    additional_deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:string_ops",
+    ],
+    grpc_enabled = True,
+    tags = [
+        "no_oss",  # Test flaky due to port collisions.
+        "no_windows",
+    ],
+)
+
+cuda_py_test(
+    name = "iterator_test",
+    size = "medium",
+    srcs = ["iterator_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python/data/ops:readers",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/checkpointable:util",
         "//tensorflow/python:array_ops",
@@ -235,57 +353,49 @@ cuda_py_test(
 )
 
 tf_py_test(
-    name = "iterator_ops_cluster_test",
+    name = "list_files_test",
     size = "small",
-    srcs = ["iterator_ops_cluster_test.py"],
+    srcs = ["list_files_test.py"],
     additional_deps = [
-        "//tensorflow/core:protos_all_py",
+        ":test_base",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:session",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:lookup_ops",
-    ],
-    grpc_enabled = True,
-    tags = [
-        "no_oss",  # Test flaky due to port collisions.
-        "no_windows",
     ],
 )
 
 tf_py_test(
-    name = "map_dataset_op_test",
-    size = "small",
-    srcs = ["map_dataset_op_test.py"],
+    name = "map_test",
+    size = "medium",
+    srcs = ["map_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:session",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -299,6 +409,8 @@ cuda_py_test(
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:multi_device_iterator_ops",
         "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
@@ -306,14 +418,15 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
     ],
     tags = [
+        "no_oss",  # TODO(b/117920141): Investigate breakage and re-enable.
         "no_windows_gpu",
     ],
 )
 
 cuda_py_test(
-    name = "optional_ops_test",
+    name = "optional_test",
     size = "small",
-    srcs = ["optional_ops_test.py"],
+    srcs = ["optional_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
@@ -332,9 +445,30 @@ cuda_py_test(
 )
 
 tf_py_test(
-    name = "prefetch_dataset_op_test",
+    name = "padded_batch_test",
     size = "small",
-    srcs = ["prefetch_dataset_op_test.py"],
+    srcs = ["padded_batch_test.py"],
+    additional_deps = [
+        ":test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+    ],
+)
+
+tf_py_test(
+    name = "prefetch_test",
+    size = "small",
+    srcs = ["prefetch_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
@@ -343,77 +477,105 @@ tf_py_test(
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+    ],
+)
+
+tf_py_test(
+    name = "range_test",
+    size = "small",
+    srcs = ["range_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "reduce_test",
+    size = "small",
+    srcs = ["reduce_test.py"],
+    additional_deps = [
+        ":test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 tf_py_test(
-    name = "range_dataset_op_test",
+    name = "repeat_test",
     size = "small",
-    srcs = ["range_dataset_op_test.py"],
+    srcs = ["repeat_test.py"],
     additional_deps = [
         ":test_base",
+        "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:variables",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
 tf_py_test(
-    name = "reader_dataset_ops_test",
+    name = "shard_test",
+    size = "small",
+    srcs = ["shard_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "shuffle_test",
     size = "small",
-    srcs = ["reader_dataset_ops_test.py"],
+    srcs = ["shuffle_test.py"],
     additional_deps = [
         ":test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
     ],
 )
 
 tf_py_test(
-    name = "reduce_dataset_op_test",
+    name = "skip_test",
     size = "small",
-    srcs = ["reduce_dataset_op_test.py"],
+    srcs = ["skip_test.py"],
     additional_deps = [
         ":test_base",
-        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 tf_py_test(
-    name = "sequence_dataset_op_test",
+    name = "take_test",
     size = "small",
-    srcs = ["sequence_dataset_op_test.py"],
+    srcs = ["take_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
@@ -426,31 +588,38 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "shard_dataset_op_test",
+    name = "text_line_dataset_test",
     size = "small",
-    srcs = ["shard_dataset_op_test.py"],
+    srcs = ["text_line_dataset_test.py"],
     additional_deps = [
         ":test_base",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:util",
     ],
 )
 
 tf_py_test(
-    name = "shuffle_dataset_op_test",
+    name = "tf_record_dataset_test",
     size = "small",
-    srcs = ["shuffle_dataset_op_test.py"],
+    srcs = ["tf_record_dataset_test.py"],
     additional_deps = [
         ":test_base",
-        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -458,17 +627,21 @@ py_library(
     name = "test_base",
     srcs = ["test_base.py"],
     deps = [
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/eager:context",
     ],
 )
 
 tf_py_test(
-    name = "window_dataset_op_test",
-    size = "small",
-    srcs = ["window_dataset_op_test.py"],
+    name = "window_test",
+    size = "medium",
+    srcs = ["window_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
@@ -484,9 +657,9 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "zip_dataset_op_test",
+    name = "zip_test",
     size = "small",
-    srcs = ["zip_dataset_op_test.py"],
+    srcs = ["zip_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
diff --git a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
deleted file mode 100644
index 9cb4daf2844fee251cb1e1b4a2c480bada1dc409..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
+++ /dev/null
@@ -1,514 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      ('even', 28, 14, False),
-      ('uneven_with_remainder', 28, 15, False),
-      ('uneven_without_remainder', 28, 15, True),
-      ('empty', 0, 14, False),
-  )
-  def testBatchDataset(self, count, batch_size, drop_remainder):
-    """Tests the batch dataset logic for various input configurations.
-
-    Args:
-      count: the number of input elements
-      batch_size: the batch size
-      drop_remainder: whether a smaller batch size should be produced if batch
-        size does not divide number of inputs evenly
-    """
-
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(count) -> BatchDataset(batch_size).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    count_t = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    drop_remainder_t = array_ops.placeholder(dtypes.bool, shape=[])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count).batch(batch_size,
-                             drop_remainder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    if drop_remainder:
-      dim0 = batch_size
-    else:
-      dim0 = None
-    self.assertEqual([[dim0] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              count_t: count,
-              batch_size_t: batch_size,
-              drop_remainder_t: drop_remainder
-          })
-      num_full_batches = (count * 7) // batch_size
-      for i in range(num_full_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(batch_size):
-            self.assertAllEqual(component[(i * batch_size + j) % 7]**2,
-                                result_component[j])
-      if not drop_remainder and (count * 7) % batch_size > 0:
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range((count * 7) % batch_size):
-            self.assertAllEqual(
-                component[(num_full_batches * batch_size + j) % 7]**2,
-                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testBatchDatasetInvalidBatchSize(self):
-    iterator = (dataset_ops.Dataset.range(10).batch(0).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-
-  def testBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
-        5).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(2):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testBatchSparseWithDifferentDenseShapes(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=array_ops.expand_dims(
-              math_ops.range(i, dtype=dtypes.int64), 1),
-          values=array_ops.fill([math_ops.to_int32(i)], i),
-          dense_shape=[i])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
-        5).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(2):
-        actual = sess.run(get_next)
-        expected_indices = []
-        expected_values = []
-        for j in range(5):
-          for k in range(i * 5 + j):
-            expected_indices.append([j, k])
-            expected_values.append(i * 5 + j)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=expected_indices,
-            values=expected_values,
-            dense_shape=[5, (i + 1) * 5 - 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(
-        2).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
-                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
-          values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
-          dense_shape=[2, 5, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testBatchShapeError(self):
-
-    def generator():
-      yield [1.0, 2.0, 3.0]
-      yield [4.0, 5.0, 6.0]
-      yield [7.0, 8.0, 9.0, 10.0]
-
-    iterator = (
-        dataset_ops.Dataset.from_generator(
-            generator, dtypes.float32, output_shapes=[None]).batch(3)
-        .make_initializable_iterator())
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'Cannot batch tensors with different shapes in component 0. '
-          r'First element had shape \[3\] and element 2 had shape \[4\].'):
-        sess.run(next_element)
-
-
-def _random_seq_lens(count):
-  return np.random.randint(20, size=(count,)).astype(np.int32)
-
-
-class PaddedBatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      ('default_padding', _random_seq_lens(32), 4, [-1], False),
-      ('constant_padding', _random_seq_lens(32), 4, [25], False),
-      ('uneven_with_remainder', _random_seq_lens(34), 4, [-1], False),
-      ('uneven_without_remainder', _random_seq_lens(34), 4, [-1], True),
-  )
-  def testPaddedBatchDataset(self, seq_lens, batch_size, padded_shapes,
-                             drop_remainder):
-    """Tests the padded batch dataset logic for various input configurations.
-
-    Args:
-      seq_lens: the input sequence lengths
-      batch_size: the batch size
-      padded_shapes: the padded shapes to use
-      drop_remainder: whether a smaller batch size should be produced if batch
-        size does not divide number of inputs evenly
-    """
-
-    seq_lens_t = array_ops.placeholder(dtypes.int32, shape=[None])
-    batch_size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    padded_shapes_t = array_ops.placeholder(dtypes.int64, shape=[1])
-    drop_remainder_t = array_ops.placeholder(dtypes.bool, shape=[])
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(seq_lens_t)
-        .map(lambda x: array_ops.fill([x], x)).padded_batch(
-            batch_size=batch_size_t,
-            drop_remainder=drop_remainder_t,
-            padded_shapes=padded_shapes_t).make_initializable_iterator())
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              seq_lens_t: seq_lens,
-              batch_size_t: batch_size,
-              padded_shapes_t: padded_shapes,
-              drop_remainder_t: drop_remainder,
-          })
-
-      num_full_batches = len(seq_lens) // batch_size
-
-      for i in range(num_full_batches):
-        result = sess.run(get_next)
-        padded_len = padded_shapes[0]
-        if padded_len is None or padded_len == -1:
-          padded_len = np.max(result) if result.size > 0 else 0
-        self.assertEqual((batch_size, padded_len), result.shape)
-        for j in range(batch_size):
-          seq_len = seq_lens[(i * batch_size) + j]
-          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:],
-                              [0] * (padded_len - seq_len))
-
-      if not drop_remainder and len(seq_lens) % batch_size > 0:
-        result = sess.run(get_next)
-        padded_len = np.max(result) if result.size > 0 else 0
-        self.assertEqual((len(seq_lens) % batch_size, padded_len),
-                         result.shape)
-        for j in range(len(seq_lens) % batch_size):
-          seq_len = seq_lens[num_full_batches * batch_size + j]
-          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:],
-                              [0] * (padded_len - seq_len))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchShortPadding(self):
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices([6, 5, 5, 5, 5])
-        .map(lambda x: array_ops.fill([x], x)).padded_batch(
-            batch_size=4, padded_shapes=[5]).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.DataLossError):
-        sess.run(get_next)
-
-  def testPaddedBatchEmptyTensors(self):
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices([0, 0, 0, 0])
-        .map(lambda x: array_ops.fill([x], x)).padded_batch(
-            batch_size=4, padded_shapes=[-1]).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      result = sess.run(get_next)
-      self.assertAllEqual([[], [], [], []], result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchDatasetNonDefaultPadding(self):
-    seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
-    padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
-
-    def fill_tuple(x):
-      filled = array_ops.fill([x], x)
-      return (filled, string_ops.as_string(filled))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
-        .padded_batch(
-            4,
-            padded_shapes=(padded_shape, padded_shape),
-            padding_values=(-1, '<end>')).make_initializable_iterator())
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Test with random sequence lengths, and max padding.
-      random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(
-          init_op, feed_dict={
-              padded_shape: [-1],
-              seq_lens: random_seq_lens
-          })
-      for i in range(8):
-        result = sess.run(get_next)
-        padded_len = np.max(result[0])
-        self.assertEqual((4, padded_len), result[0].shape)
-        self.assertEqual((4, padded_len), result[1].shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
-          self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[0][j, seq_len:],
-                              [-1] * (padded_len - seq_len))
-          self.assertAllEqual(result[1][j, :seq_len],
-                              [compat.as_bytes(str(seq_len))] * seq_len)
-          self.assertAllEqual(result[1][j, seq_len:],
-                              [b'<end>'] * (padded_len - seq_len))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchDatasetUnicode(self):
-    # See GitHub issue 16149
-    def generator():
-      data = [[u'Простой', u'тест', u'юникода'],
-              [u'никогда', u'не', u'бывает', u'простым']]
-
-      for seq in data:
-        yield seq, [0, 1, 2, 3]
-
-    dataset = dataset_ops.Dataset.from_generator(
-        generator, (dtypes.string, dtypes.int32),
-        (tensor_shape.TensorShape([None]), tensor_shape.TensorShape([None])))
-    padded_dataset = dataset.padded_batch(
-        2, padded_shapes=([None], [None]), padding_values=('', 0))
-    with self.cached_session() as sess:
-      next_element = padded_dataset.make_one_shot_iterator().get_next()
-      sess.run(next_element)
-
-  def testPaddedBatchDatasetShapeSpecifications(self):
-    int_placeholder = array_ops.placeholder(dtypes.int32)
-    float_placeholder = array_ops.placeholder(dtypes.float32)
-    string_placeholder = array_ops.placeholder(dtypes.string)
-    input_dataset = dataset_ops.Dataset.from_tensors(
-        (int_placeholder, float_placeholder, string_placeholder))
-
-    # Test different ways of specifying the `padded_shapes` argument.
-    dynamic_padding_from_tensor_shapes = input_dataset.padded_batch(
-        32,
-        padded_shapes=(tensor_shape.TensorShape([None]),
-                       tensor_shape.TensorShape([None, None]),
-                       tensor_shape.TensorShape([37])))
-    dynamic_padding_from_lists = input_dataset.padded_batch(
-        32, padded_shapes=([None], [None, None], [37]))
-    dynamic_padding_from_lists_with_minus_one = input_dataset.padded_batch(
-        32, padded_shapes=([-1], [-1, -1], [37]))
-    dynamic_padding_from_tensors = input_dataset.padded_batch(
-        32,
-        padded_shapes=(constant_op.constant([-1], dtype=dtypes.int64),
-                       constant_op.constant([-1, -1], dtype=dtypes.int64),
-                       constant_op.constant([37], dtype=dtypes.int64)))
-
-    for dataset in [
-        dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
-        dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
-    ]:
-      self.assertEqual([None, None], dataset.output_shapes[0].as_list())
-      self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
-      self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
-
-  def testPaddedBatchSparseError(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
-
-    with self.assertRaises(TypeError):
-      _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
-
-  def testPaddedBatchShapeError(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'The padded shape \(1,\) is not compatible with the '
-        r'corresponding input component shape \(\).'):
-      _ = dataset_ops.Dataset.range(10).padded_batch(5, padded_shapes=[1])
-
-    with self.assertRaisesRegexp(
-        ValueError, r'The padded shape \(1,\) is not compatible with the '
-        r'corresponding input component shape \(3,\).'):
-      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
-          5, padded_shapes=[1])
-
-    with self.assertRaisesRegexp(
-        ValueError, r'Padded shape .* must be a 1-D tensor '
-        r'of tf.int64 values, but its shape was \(2, 2\).'):
-      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
-          5, padded_shapes=[[1, 1], [1, 1]])
-
-    with self.assertRaisesRegexp(
-        TypeError, r'Padded shape .* must be a 1-D tensor '
-        r'of tf.int64 values, but its element type was float32.'):
-      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
-          5, padded_shapes=constant_op.constant([1., 2., 3.]))
-
-    with self.assertRaisesRegexp(
-        ValueError, r'The padded shape \(1,\) is not compatible with the '
-        r'corresponding input component shape \(\).'):
-      shape_as_tensor = constant_op.constant([1], dtype=dtypes.int64)
-      _ = dataset_ops.Dataset.range(10).padded_batch(
-          5, padded_shapes=shape_as_tensor)
-
-    with self.assertRaisesRegexp(
-        ValueError, r'The padded shape \(\?, \?\) is not compatible with the '
-        r'corresponding input component shape \(\).'):
-      shape_as_tensor = array_ops.placeholder(dtypes.int64, shape=[2])
-      _ = dataset_ops.Dataset.range(10).padded_batch(
-          5, padded_shapes=shape_as_tensor)
-
-
-class BatchDatasetBenchmark(test.Benchmark):
-
-  def benchmarkBatchSparse(self):
-    non_zeros_per_row_values = [0, 1, 5, 10, 100]
-    batch_size_values = [1, 32, 64, 128, 1024]
-
-    sparse_placeholder = array_ops.sparse_placeholder(dtype=dtypes.int64)
-    batch_size_placeholder = array_ops.placeholder(dtype=dtypes.int64, shape=[])
-
-    dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat(
-        ).batch(batch_size_placeholder)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    for non_zeros_per_row in non_zeros_per_row_values:
-
-      sparse_value = sparse_tensor.SparseTensorValue(
-          indices=np.arange(non_zeros_per_row, dtype=np.int64)[:, np.newaxis],
-          values=np.arange(non_zeros_per_row, dtype=np.int64),
-          dense_shape=[1000])
-
-      for batch_size in batch_size_values:
-
-        with session.Session() as sess:
-          sess.run(iterator.initializer, feed_dict={
-              sparse_placeholder: sparse_value,
-              batch_size_placeholder: batch_size})
-          # Run five steps to warm up the session caches before taking the
-          # first measurement.
-          for _ in range(5):
-            sess.run(next_element.indices.op)
-          deltas = []
-          for _ in range(100):
-            start = time.time()
-            for _ in range(100):
-              sess.run(next_element.indices.op)
-            end = time.time()
-            deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100.0
-
-        print('Batch sparse dataset non-zeros per row: %d batch_size: %d '
-              'wall time: %f'
-              % (non_zeros_per_row, batch_size, median_wall_time))
-        self.report_benchmark(
-            iters=10000, wall_time=median_wall_time,
-            name='benchmark_batch_sparse_dataset_nnz_%d_batch_size_%d' % (
-                non_zeros_per_row, batch_size))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/batch_test.py b/tensorflow/python/data/kernel_tests/batch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b035e59173e6ee52be8ec0aab21c761093d07ce
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/batch_test.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.batch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BatchTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('even', 28, 14, False),
+      ('uneven_with_remainder', 28, 15, False),
+      ('uneven_without_remainder', 28, 15, True),
+      ('empty', 0, 14, False),
+  )
+  def testBatchDataset(self, count, batch_size, drop_remainder):
+    """Tests the batch dataset logic for various input configurations.
+
+    Args:
+      count: the number of input elements
+      batch_size: the batch size
+      drop_remainder: whether a smaller batch size should be produced if batch
+        size does not divide number of inputs evenly
+    """
+
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(count) -> BatchDataset(batch_size).
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn).repeat(count).batch(batch_size, drop_remainder)
+    get_next = self.getNext(dataset)
+
+    if drop_remainder:
+      dim0 = batch_size
+    else:
+      dim0 = None
+    self.assertEqual(
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)],
+        [[dim0] + list(c.shape[1:]) for c in components])
+
+    num_full_batches = (count * 7) // batch_size
+    for i in range(num_full_batches):
+      result = self.evaluate(get_next())
+      for component, result_component in zip(components, result):
+        for j in range(batch_size):
+          self.assertAllEqual(component[(i * batch_size + j) % 7]**2,
+                              result_component[j])
+    if not drop_remainder and (count * 7) % batch_size > 0:
+      result = self.evaluate(get_next())
+      for component, result_component in zip(components, result):
+        for j in range((count * 7) % batch_size):
+          self.assertAllEqual(
+              component[(num_full_batches * batch_size + j) % 7]**2,
+              result_component[j])
+    with self.assertRaises(errors.OutOfRangeError):
+      result = self.evaluate(get_next())
+
+  def testBatchDatasetInvalidBatchSize(self):
+    dataset = (dataset_ops.Dataset.range(10).batch(0))
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, ''))
+
+  def testBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).batch(5)
+    expected_output = [
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1]) for i in range(2)
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testBatchSparseWithDifferentDenseShapes(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=array_ops.expand_dims(
+              math_ops.range(i, dtype=dtypes.int64), 1),
+          values=array_ops.fill([math_ops.to_int32(i)], i),
+          dense_shape=[i])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).batch(5)
+    expected_output = []
+    for i in range(2):
+      expected_indices = []
+      expected_outputs = []
+      for j in range(5):
+        for k in range(i * 5 + j):
+          expected_indices.append([j, k])
+          expected_outputs.append(i * 5 + j)
+      expected_output.append(
+          sparse_tensor.SparseTensorValue(
+              indices=expected_indices,
+              values=expected_outputs,
+              dense_shape=[5, (i + 1) * 5 - 1]))
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testNestedBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(2)
+    expected_output = [
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
+                     [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
+            values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            dense_shape=[2, 5, 1])
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testBatchShapeError(self):
+
+    def generator():
+      yield [1.0, 2.0, 3.0]
+      yield [4.0, 5.0, 6.0]
+      yield [7.0, 8.0, 9.0, 10.0]
+
+    dataset = (
+        dataset_ops.Dataset.from_generator(
+            generator, dtypes.float32, output_shapes=[None]).batch(3))
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            r'Cannot batch tensors with different shapes in component 0. First '
+            r'element had shape \[3\] and element 2 had shape \[4\].'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
deleted file mode 100644
index 63625fac03beeb3f8756bfa5c8e543fdc3488fc4..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
+++ /dev/null
@@ -1,318 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from os import path
-import shutil
-import tempfile
-
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class FileCacheDatasetTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    self.tmp_dir = tempfile.mkdtemp()
-    self.cache_prefix = path.join(self.tmp_dir, "cache")
-
-  def tearDown(self):
-    if self.tmp_dir:
-      shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
-  def testCacheDatasetPassthrough(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    cache_dataset = repeat_dataset.cache(filename_placeholder)
-
-    self.assertEqual(
-        tuple([c.shape[1:] for c in components]), cache_dataset.output_shapes)
-
-    # Create initialization ops for iterators without and with
-    # caching, respectively.
-    iterator = iterator_ops.Iterator.from_structure(cache_dataset.output_types,
-                                                    cache_dataset.output_shapes)
-    init_fifo_op = iterator.make_initializer(repeat_dataset)
-    init_cache_op = iterator.make_initializer(cache_dataset)
-
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # First run without caching to collect the "ground truth".
-      sess.run(init_fifo_op)
-      elements = []
-      for _ in range(20):
-        elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Assert that the cached dataset has the same elements as the
-      # "ground truth".
-      sess.run(
-          init_cache_op, feed_dict={filename_placeholder: self.cache_prefix})
-      cached_elements = []
-      for _ in range(20):
-        cached_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(elements, cached_elements)
-
-      # Re-initialize with an empty upstream (to throw errors.OutOfRangeError
-      # if we didn't use the cache).
-      sess.run(
-          init_cache_op,
-          feed_dict={
-              count_placeholder: 0,
-              filename_placeholder: self.cache_prefix
-          })
-      replayed_elements = []
-      for _ in range(20):
-        replayed_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(cached_elements, replayed_elements)
-
-      # Re-initialize with an empty upstream and a missing cache file (should
-      # throw errors.OutOfRangeError immediately).
-      sess.run(
-          init_cache_op,
-          feed_dict={
-              count_placeholder: 0,
-              filename_placeholder: self.cache_prefix + "nonsense"
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcurrentWriters(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    cache_dataset1 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-    cache_dataset2 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-
-    iterator1 = cache_dataset1.make_initializable_iterator()
-    iterator2 = cache_dataset2.make_initializable_iterator()
-    init_cache_op1 = iterator1.initializer
-    init_cache_op2 = iterator2.initializer
-
-    get_next1 = iterator1.get_next()
-    get_next2 = iterator2.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      sess.run(get_next1)  # this should succeed
-
-      sess.run(
-          init_cache_op2, feed_dict={filename_placeholder: self.cache_prefix})
-      with self.assertRaises(errors.AlreadyExistsError):
-        sess.run(get_next2)
-
-      sess.run(get_next1)  # this should continue to succeed
-
-  def testConcurrentReaders(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    cache_dataset1 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-    cache_dataset2 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-
-    iterator1 = cache_dataset1.make_initializable_iterator()
-    iterator2 = cache_dataset2.make_initializable_iterator()
-    init_cache_op1 = iterator1.initializer
-    init_cache_op2 = iterator2.initializer
-
-    get_next1 = iterator1.get_next()
-    get_next2 = iterator2.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      elements = []
-      for _ in range(4):
-        elements.append(sess.run(get_next1))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next1)
-
-      # Re-initialize
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      sess.run(
-          init_cache_op2, feed_dict={filename_placeholder: self.cache_prefix})
-
-      # Reading concurrently should succeed.
-      elements_itr1 = []
-      elements_itr2 = []
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      # Intentionally reversing the order
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next2)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next1)
-
-      self.assertAllEqual(elements, elements_itr1)
-      self.assertAllEqual(elements, elements_itr2)
-
-
-class MemoryCacheDatasetTest(test_base.DatasetTestBase):
-
-  def testCacheDatasetPassthrough(self):
-    with ops.device("cpu:0"):
-      repeat_count = variables.Variable(constant_op.constant(10, dtypes.int64))
-      dataset = dataset_ops.Dataset.range(3).flat_map(
-          lambda x: dataset_ops.Dataset.from_tensors(x).repeat(repeat_count))
-
-      cached_dataset = dataset.cache().repeat(2)
-      uncached_dataset = dataset.repeat(2)
-
-      # Needs to be initializable to capture the variable.
-      cached_iterator = cached_dataset.make_initializable_iterator()
-      cached_next = cached_iterator.get_next()
-      uncached_iterator = uncached_dataset.make_initializable_iterator()
-      uncached_next = uncached_iterator.get_next()
-
-      with self.cached_session() as sess:
-
-        sess.run(repeat_count.initializer)
-        sess.run(cached_iterator.initializer)
-        sess.run(uncached_iterator.initializer)
-
-        for i in range(3):
-          for _ in range(10):
-            self.assertEqual(sess.run(cached_next), i)
-            self.assertEqual(sess.run(uncached_next), i)
-
-        sess.run(repeat_count.assign(0))
-
-        # The uncached iterator should now be empty.
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(uncached_next)
-
-        # The cached iterator replays from cache.
-        for i in range(3):
-          for _ in range(10):
-            self.assertEqual(sess.run(cached_next), i)
-
-        # The cached iterator should now be empty.
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(cached_next)
-
-  def testEmptyCacheReading(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    cache_dataset = repeat_dataset.cache()
-
-    # Create initialization ops for iterators without and with
-    # caching, respectively.
-    iterator = cache_dataset.make_initializable_iterator()
-    init_cache_op = iterator.initializer
-
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Initialize with an empty upstream and a missing cache file (should
-      # throw errors.OutOfRangeError immediately).
-      sess.run(init_cache_op, feed_dict={count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcurrentReaders(self):
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    dataset = dataset_ops.Dataset.range(count_placeholder).cache()
-    d1 = dataset.map(lambda x: x + 1)
-    d2 = dataset.map(lambda x: x + 6)
-
-    i1 = d1.make_initializable_iterator()
-    i2 = d2.make_initializable_iterator()
-
-    with self.cached_session() as sess:
-      sess.run(i1.initializer)
-
-      self.assertEqual(1, sess.run(i1.get_next()))
-      self.assertEqual(2, sess.run(i1.get_next()))
-      self.assertEqual(3, sess.run(i1.get_next()))
-
-      sess.run(i2.initializer, feed_dict={count_placeholder: 3})
-
-      self.assertEqual(6, sess.run(i2.get_next()))
-      self.assertEqual(7, sess.run(i2.get_next()))
-      self.assertEqual(4, sess.run(i1.get_next()))  # interleave execution
-      self.assertEqual([8, 5], sess.run([i2.get_next(), i1.get_next()]))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(i1.get_next())
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(i2.get_next())
-
-  def testCacheTakeRepeat(self):
-    dataset = dataset_ops.Dataset.range(10).cache().take(5).repeat(2)
-    itr = dataset.make_one_shot_iterator()
-    n = itr.get_next()
-
-    expected_values = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
-
-    with self.cached_session() as sess:
-      for i, expected in enumerate(expected_values):
-        self.assertEqual(expected, sess.run(n),
-                         "Unexpected value at index %s" % i)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/cache_test.py b/tensorflow/python/data/kernel_tests/cache_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b561cd58baf732f557d518e7eb237ab00512acc1
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/cache_test.py
@@ -0,0 +1,253 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.cache()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+import shutil
+import tempfile
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FileCacheTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    self.tmp_dir = tempfile.mkdtemp()
+    self.cache_prefix = path.join(self.tmp_dir, "cache")
+
+  def tearDown(self):
+    if self.tmp_dir:
+      shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+  def testCacheDatasetPassthrough(self):
+    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+                  np.array([9.0, 10.0, 11.0, 12.0]))
+
+    def dataset_fn(count=5, filename=None):
+      repeat_dataset = (
+          dataset_ops.Dataset.from_tensor_slices(components).repeat(count))
+      if filename:
+        return repeat_dataset.cache(filename)
+      else:
+        return repeat_dataset
+
+    self.assertEqual(
+        tuple([c.shape[1:] for c in components]),
+        dataset_fn().output_shapes)
+
+    get_next = self.getNext(dataset_fn())
+
+    # First run without caching to collect the "ground truth".
+    elements = []
+    for _ in range(20):
+      elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    # Assert that the cached dataset has the same elements as the
+    # "ground truth".
+    get_next = self.getNext(dataset_fn(filename=self.cache_prefix))
+    cached_elements = []
+    for _ in range(20):
+      cached_elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertAllEqual(elements, cached_elements)
+
+    # Re-initialize with an empty upstream (to throw errors.OutOfRangeError
+    # if we didn't use the cache).
+    get_next = self.getNext(dataset_fn(count=0, filename=self.cache_prefix))
+    replayed_elements = []
+    for _ in range(20):
+      replayed_elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertEqual(cached_elements, replayed_elements)
+
+    # Re-initialize with an empty upstream and a missing cache file (should
+    # throw errors.OutOfRangeError immediately).
+    get_next = self.getNext(
+        dataset_fn(count=0, filename=self.cache_prefix + "nonsense"))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testConcurrentWriters(self):
+    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+                  np.array([9.0, 10.0, 11.0, 12.0]))
+
+    cache_dataset1 = (
+        dataset_ops.Dataset.from_tensor_slices(components).cache(
+            self.cache_prefix))
+    cache_dataset2 = (
+        dataset_ops.Dataset.from_tensor_slices(components).cache(
+            self.cache_prefix))
+
+    get_next1 = self.getNext(cache_dataset1)
+    get_next2 = self.getNext(cache_dataset2)
+
+    self.evaluate(get_next1())  # this should succeed
+
+    with self.assertRaises(errors.AlreadyExistsError):
+      self.evaluate(get_next2())
+
+    self.evaluate(get_next1())  # this should continue to succeed
+
+  def testConcurrentReaders(self):
+    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+                  np.array([9.0, 10.0, 11.0, 12.0]))
+
+    cache_dataset1 = (
+        dataset_ops.Dataset.from_tensor_slices(components).cache(
+            self.cache_prefix))
+    cache_dataset2 = (
+        dataset_ops.Dataset.from_tensor_slices(components).cache(
+            self.cache_prefix))
+
+    get_next1 = self.getNext(cache_dataset1)
+    get_next2 = self.getNext(cache_dataset2)
+
+    elements = []
+    for _ in range(4):
+      elements.append(self.evaluate(get_next1()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next1())
+
+    # Re-initialize
+    get_next1 = self.getNext(cache_dataset1)
+    get_next2 = self.getNext(cache_dataset2)
+
+    # Reading concurrently should succeed.
+    elements_itr1 = []
+    elements_itr2 = []
+    elements_itr2.append(self.evaluate(get_next2()))
+    elements_itr1.append(self.evaluate(get_next1()))
+    elements_itr2.append(self.evaluate(get_next2()))
+    elements_itr1.append(self.evaluate(get_next1()))
+    # Intentionally reversing the order
+    elements_itr1.append(self.evaluate(get_next1()))
+    elements_itr2.append(self.evaluate(get_next2()))
+    elements_itr1.append(self.evaluate(get_next1()))
+    elements_itr2.append(self.evaluate(get_next2()))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next2())
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next1())
+
+    self.assertAllEqual(elements, elements_itr1)
+    self.assertAllEqual(elements, elements_itr2)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MemoryCacheTest(test_base.DatasetTestBase):
+
+  def testCacheDatasetPassthrough(self):
+    with ops.device("cpu:0"):
+      repeat_count = variables.Variable(constant_op.constant(10, dtypes.int64))
+      dataset = dataset_ops.Dataset.range(3).flat_map(
+          lambda x: dataset_ops.Dataset.from_tensors(x).repeat(repeat_count))
+
+      cached_dataset = dataset.cache().repeat(2)
+      uncached_dataset = dataset.repeat(2)
+
+      self.evaluate(repeat_count.initializer)
+      # Needs to be initializable to capture the variable.
+      cached_next = self.getNext(cached_dataset, requires_initialization=True)
+      uncached_next = self.getNext(
+          uncached_dataset, requires_initialization=True)
+      for i in range(3):
+        for _ in range(10):
+          self.assertEqual(self.evaluate(cached_next()), i)
+          self.assertEqual(self.evaluate(uncached_next()), i)
+
+      self.evaluate(repeat_count.assign(0))
+
+      # The uncached iterator should now be empty.
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(uncached_next())
+
+      # The cached iterator replays from cache.
+      for i in range(3):
+        for _ in range(10):
+          self.assertEqual(self.evaluate(cached_next()), i)
+
+      # The cached iterator should now be empty.
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(cached_next())
+
+  def testEmptyCacheReading(self):
+    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+                  np.array([9.0, 10.0, 11.0, 12.0]))
+
+    repeat_dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components).repeat(0))
+    cache_dataset = repeat_dataset.cache()
+
+    # Create initialization ops for iterators without and with
+    # caching, respectively.
+    self.assertDatasetProduces(cache_dataset, expected_output=[])
+
+  def testConcurrentReaders(self):
+
+    dataset = dataset_ops.Dataset.range(5).cache()
+    d1 = dataset.map(lambda x: x + 1)
+    d2 = dataset.map(lambda x: x + 6)
+
+    get_next1 = self.getNext(d1)
+
+    self.assertEqual(1, self.evaluate(get_next1()))
+    self.assertEqual(2, self.evaluate(get_next1()))
+    self.assertEqual(3, self.evaluate(get_next1()))
+
+    get_next2 = self.getNext(d2)
+
+    self.assertEqual(6, self.evaluate(get_next2()))
+    self.assertEqual(7, self.evaluate(get_next2()))
+    self.assertEqual(4, self.evaluate(get_next1()))  # interleave execution
+    self.assertEqual([8, 5],
+                     [self.evaluate(get_next2()),
+                      self.evaluate(get_next1())])
+    self.assertEqual(9, self.evaluate(get_next2()))
+    self.assertEqual(10, self.evaluate(get_next2()))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next2())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next1())
+
+  def testCacheTakeRepeat(self):
+    dataset = dataset_ops.Dataset.range(10).cache().take(5).repeat(2)
+
+    expected_output = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
deleted file mode 100644
index 83af31f380efabc0d8654668a9a81d5789b8eeb1..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.platform import test
-
-
-class ConcatenateDatasetTest(test_base.DatasetTestBase):
-
-  def testConcatenateDataset(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20),
-        np.tile(np.array([[12], [13], [14], [15]]), 15),
-        np.array([37.0, 38.0, 39.0, 40.0]))
-    to_concatenate_components = (
-        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
-        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
-        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-    concatenated = input_dataset.concatenate(dataset_to_concatenate)
-    self.assertEqual(concatenated.output_shapes, (tensor_shape.TensorShape(
-        [20]), tensor_shape.TensorShape([15]), tensor_shape.TensorShape([])))
-
-    iterator = concatenated.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(9):
-        result = sess.run(get_next)
-        if i < 4:
-          for component, result_component in zip(input_components, result):
-            self.assertAllEqual(component[i], result_component)
-        else:
-          for component, result_component in zip(to_concatenate_components,
-                                                 result):
-            self.assertAllEqual(component[i - 4], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcatenateDatasetDifferentShape(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20),
-        np.tile(np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (
-        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
-        np.tile(np.array([[12], [13], [14], [15], [16]]), 15))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-    concatenated = input_dataset.concatenate(dataset_to_concatenate)
-    self.assertEqual(
-        [ts.as_list()
-         for ts in nest.flatten(concatenated.output_shapes)], [[20], [None]])
-
-    iterator = concatenated.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(9):
-        result = sess.run(get_next)
-        if i < 4:
-          for component, result_component in zip(input_components, result):
-            self.assertAllEqual(component[i], result_component)
-        else:
-          for component, result_component in zip(to_concatenate_components,
-                                                 result):
-            self.assertAllEqual(component[i - 4], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcatenateDatasetDifferentStructure(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 5),
-        np.tile(np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (
-        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
-        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
-        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-
-    with self.assertRaisesRegexp(TypeError, "have different types"):
-      input_dataset.concatenate(dataset_to_concatenate)
-
-  def testConcatenateDatasetDifferentKeys(self):
-    input_components = {
-        "foo": np.array([[1], [2], [3], [4]]),
-        "bar": np.array([[12], [13], [14], [15]])
-    }
-    to_concatenate_components = {
-        "foo": np.array([[1], [2], [3], [4]]),
-        "baz": np.array([[5], [6], [7], [8]])
-    }
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-
-    with self.assertRaisesRegexp(TypeError, "have different types"):
-      input_dataset.concatenate(dataset_to_concatenate)
-
-  def testConcatenateDatasetDifferentType(self):
-    input_components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 5),
-        np.tile(np.array([[12], [13], [14], [15]]), 4))
-    to_concatenate_components = (
-        np.tile(np.array([[1.0], [2.0], [3.0], [4.0]]), 5),
-        np.tile(np.array([[12], [13], [14], [15]]), 15))
-
-    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
-    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
-        to_concatenate_components)
-
-    with self.assertRaisesRegexp(TypeError, "have different types"):
-      input_dataset.concatenate(dataset_to_concatenate)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/concatenate_test.py b/tensorflow/python/data/kernel_tests/concatenate_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d8bfdc8f3afc2aed265f3907c22ff442ba590c4
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/concatenate_test.py
@@ -0,0 +1,143 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.concatenate()."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ConcatenateTest(test_base.DatasetTestBase):
+
+  def testConcatenateDataset(self):
+    input_components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 20),
+        np.tile(np.array([[12], [13], [14], [15]]), 15),
+        np.array([37.0, 38.0, 39.0, 40.0]))
+    to_concatenate_components = (
+        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
+        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
+        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+    concatenated = input_dataset.concatenate(dataset_to_concatenate)
+    self.assertEqual(concatenated.output_shapes, (tensor_shape.TensorShape(
+        [20]), tensor_shape.TensorShape([15]), tensor_shape.TensorShape([])))
+
+    get_next = self.getNext(concatenated)
+
+    for i in range(9):
+      result = self.evaluate(get_next())
+      if i < 4:
+        for component, result_component in zip(input_components, result):
+          self.assertAllEqual(component[i], result_component)
+      else:
+        for component, result_component in zip(to_concatenate_components,
+                                               result):
+          self.assertAllEqual(component[i - 4], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testConcatenateDatasetDifferentShape(self):
+    input_components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 20),
+        np.tile(np.array([[12], [13], [14], [15]]), 4))
+    to_concatenate_components = (
+        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
+        np.tile(np.array([[12], [13], [14], [15], [16]]), 15))
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+    concatenated = input_dataset.concatenate(dataset_to_concatenate)
+    self.assertEqual(
+        [ts.as_list()
+         for ts in nest.flatten(concatenated.output_shapes)], [[20], [None]])
+    get_next = self.getNext(concatenated)
+    for i in range(9):
+      result = self.evaluate(get_next())
+      if i < 4:
+        for component, result_component in zip(input_components, result):
+          self.assertAllEqual(component[i], result_component)
+      else:
+        for component, result_component in zip(to_concatenate_components,
+                                               result):
+          self.assertAllEqual(component[i - 4], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testConcatenateDatasetDifferentStructure(self):
+    input_components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 5),
+        np.tile(np.array([[12], [13], [14], [15]]), 4))
+    to_concatenate_components = (
+        np.tile(np.array([[1], [2], [3], [4], [5]]), 20),
+        np.tile(np.array([[12], [13], [14], [15], [16]]), 15),
+        np.array([37.0, 38.0, 39.0, 40.0, 41.0]))
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+
+    with self.assertRaisesRegexp(TypeError, "have different types"):
+      input_dataset.concatenate(dataset_to_concatenate)
+
+  def testConcatenateDatasetDifferentKeys(self):
+    input_components = {
+        "foo": np.array([[1], [2], [3], [4]]),
+        "bar": np.array([[12], [13], [14], [15]])
+    }
+    to_concatenate_components = {
+        "foo": np.array([[1], [2], [3], [4]]),
+        "baz": np.array([[5], [6], [7], [8]])
+    }
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+
+    with self.assertRaisesRegexp(TypeError, "have different types"):
+      input_dataset.concatenate(dataset_to_concatenate)
+
+  def testConcatenateDatasetDifferentType(self):
+    input_components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 5),
+        np.tile(np.array([[12], [13], [14], [15]]), 4))
+    to_concatenate_components = (
+        np.tile(np.array([[1.0], [2.0], [3.0], [4.0]]), 5),
+        np.tile(np.array([[12], [13], [14], [15]]), 15))
+
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_components)
+    dataset_to_concatenate = dataset_ops.Dataset.from_tensor_slices(
+        to_concatenate_components)
+
+    with self.assertRaisesRegexp(TypeError, "have different types"):
+      input_dataset.concatenate(dataset_to_concatenate)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dcd94ea0207a53be1e3444db2a3e6643b8841ed
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py
@@ -0,0 +1,361 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Checkpoint tests for `tf.data.Dataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+class DatasetCheckpointTest(test_base.DatasetTestBase):
+
+  def tearDown(self):
+    # Remove all checkpoint files.
+    prefix = self._iterator_checkpoint_prefix()
+    pattern = prefix + "*"
+    files = gfile.Glob(pattern)
+    map(gfile.Remove, files)
+
+  def _iterator_checkpoint_prefix(self):
+    return os.path.join(self.get_temp_dir(), "iterator")
+
+  def _save_op(self, iterator_resource):
+    iterator_state_variant = gen_dataset_ops.serialize_iterator(
+        iterator_resource)
+    save_op = io_ops.write_file(
+        self._iterator_checkpoint_prefix(),
+        parsing_ops.serialize_tensor(iterator_state_variant))
+    return save_op
+
+  def _restore_op(self, iterator_resource):
+    iterator_state_variant = parsing_ops.parse_tensor(
+        io_ops.read_file(self._iterator_checkpoint_prefix()), dtypes.variant)
+    restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
+                                                      iterator_state_variant)
+    return restore_op
+
+  def testSaveRestore(self):
+
+    def _build_graph(start, stop):
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      save_op = self._save_op(iterator._iterator_resource)
+      restore_op = self._restore_op(iterator._iterator_resource)
+      return init_op, get_next, save_op, restore_op
+
+    # Saving and restoring in different sessions.
+    start = 2
+    stop = 10
+    break_point = 5
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, _ = _build_graph(start, stop)
+      with self.session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        for i in range(start, break_point):
+          self.assertEqual(i, sess.run(get_next))
+        sess.run(save_op)
+
+    with ops.Graph().as_default() as g:
+      init_op, get_next, _, restore_op = _build_graph(start, stop)
+      with self.session(graph=g) as sess:
+        sess.run(init_op)
+        sess.run(restore_op)
+        for i in range(break_point, stop):
+          self.assertEqual(i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+    # Saving and restoring in same session.
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, restore_op = _build_graph(start, stop)
+      with self.session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        for i in range(start, break_point):
+          self.assertEqual(i, sess.run(get_next))
+        sess.run(save_op)
+        sess.run(restore_op)
+        for i in range(break_point, stop):
+          self.assertEqual(i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testRestoreWithoutBuildingDatasetGraph(self):
+
+    def _build_graph(start, stop, num_epochs):
+      dataset = dataset_ops.Dataset.range(start, stop).repeat(num_epochs)
+      iterator = dataset_ops.make_initializable_iterator(dataset)
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      save_op = self._save_op(iterator._iterator_resource)
+      restore_op = self._restore_op(iterator._iterator_resource)
+      return init_op, get_next, save_op, restore_op
+
+    # Saving and restoring in different sessions.
+    start = 2
+    stop = 10
+    num_epochs = 5
+    break_point = 5
+    break_epoch = 3
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, _ = _build_graph(start, stop, num_epochs)
+      with self.session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        for _ in range(break_epoch):
+          for i in range(start, stop):
+            self.assertEqual(i, sess.run(get_next))
+        for i in range(start, break_point):
+          self.assertEqual(i, sess.run(get_next))
+        sess.run(save_op)
+
+    with ops.Graph().as_default() as g:
+      # Create an empty IteratorResource and restore the Iterator into it.
+      output_types = dtypes.int64
+      output_shapes = tensor_shape.scalar()
+      iterator = iterator_ops.Iterator.from_structure(output_types,
+                                                      output_shapes)
+      restore_op = self._restore_op(iterator._iterator_resource)
+      get_next = iterator.get_next()
+      with self.session(graph=g) as sess:
+        sess.run(restore_op)
+        for i in range(break_point, stop):
+          self.assertEqual(i, sess.run(get_next))
+        for _ in range(break_epoch + 1, num_epochs):
+          for i in range(start, stop):
+            self.assertEqual(i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testRestoreInModifiedGraph(self):
+
+    def _build_graph(start, stop):
+      dataset = dataset_ops.Dataset.range(start, stop)
+      iterator = dataset_ops.make_initializable_iterator(dataset)
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      save_op = self._save_op(iterator._iterator_resource)
+      restore_op = self._restore_op(iterator._iterator_resource)
+      return init_op, get_next, save_op, restore_op
+
+    # Saving and restoring in different sessions.
+    start = 2
+    stop = 10
+    stop_1 = 8
+    break_point = 5
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, _ = _build_graph(start, stop)
+      with self.session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        for i in range(start, break_point):
+          self.assertEqual(i, sess.run(get_next))
+        sess.run(save_op)
+
+    with ops.Graph().as_default() as g:
+      # Intentionally build a graph with a different value for stop to make sure
+      # the original dataset graph is actually getting loaded.
+      init_op, get_next, _, restore_op = _build_graph(start, stop_1)
+      with self.session(graph=g) as sess:
+        sess.run(restore_op)
+        for i in range(break_point, stop):
+          self.assertEqual(i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testInitThenRestore(self):
+    # Note: Calling init_op before restore_op is redundant. This test just makes
+    # sure we do not fail if restore is called on an already initialized
+    # iterator resource.
+
+    def _build_graph(start, stop):
+      dataset = dataset_ops.Dataset.range(start, stop)
+      iterator = dataset_ops.make_initializable_iterator(dataset)
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      save_op = self._save_op(iterator._iterator_resource)
+      restore_op = self._restore_op(iterator._iterator_resource)
+      return init_op, get_next, save_op, restore_op
+
+    # Saving and restoring in different sessions.
+    start = 2
+    stop = 10
+    break_point = 5
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, _ = _build_graph(start, stop)
+      with self.session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        for i in range(start, break_point):
+          self.assertEqual(i, sess.run(get_next))
+        sess.run(save_op)
+
+    with ops.Graph().as_default() as g:
+      init_op, get_next, _, restore_op = _build_graph(start, stop)
+      with self.session(graph=g) as sess:
+        sess.run(init_op)
+        sess.run(restore_op)
+        for i in range(break_point, stop):
+          self.assertEqual(i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testMultipleSaves(self):
+
+    def _build_graph(start, stop):
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      save_op = self._save_op(iterator._iterator_resource)
+      restore_op = self._restore_op(iterator._iterator_resource)
+      return init_op, get_next, save_op, restore_op
+
+    start = 2
+    stop = 10
+    break_point1 = 5
+    break_point2 = 7
+
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, _ = _build_graph(start, stop)
+      with self.session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        for i in range(start, break_point1):
+          self.assertEqual(i, sess.run(get_next))
+        sess.run(save_op)
+
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, restore_op = _build_graph(start, stop)
+      with self.session(graph=g) as sess:
+        sess.run(restore_op)
+        for i in range(break_point1, break_point2):
+          self.assertEqual(i, sess.run(get_next))
+        sess.run(save_op)
+
+    break_point2 = 7
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, restore_op = _build_graph(start, stop)
+      with self.session(graph=g) as sess:
+        sess.run(restore_op)
+        for i in range(break_point2, stop):
+          self.assertEqual(i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testSaveRestoreWithRepeat(self):
+
+    def _build_graph(start, stop, num_epochs):
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop).repeat(num_epochs))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      save_op = self._save_op(iterator._iterator_resource)
+      restore_op = self._restore_op(iterator._iterator_resource)
+      return init_op, get_next, save_op, restore_op
+
+    start = 2
+    stop = 10
+    num_epochs = 5
+    break_range = 5
+    break_epoch = 3
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, restore_op = _build_graph(
+          start, stop, num_epochs)
+      with self.session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        # Note: There is no checkpoint saved currently so a NotFoundError is
+        # raised.
+        with self.assertRaises(errors.NotFoundError):
+          sess.run(restore_op)
+        for _ in range(break_epoch - 1):
+          for i in range(start, stop):
+            self.assertEqual(i, sess.run(get_next))
+        for i in range(start, break_range):
+          self.assertEqual(i, sess.run(get_next))
+        sess.run(save_op)
+
+    with ops.Graph().as_default() as g:
+      init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
+      with self.session(graph=g) as sess:
+        sess.run(restore_op)
+        for i in range(break_range, stop):
+          self.assertEqual(i, sess.run(get_next))
+        for _ in range(break_epoch, num_epochs):
+          for i in range(start, stop):
+            self.assertEqual(i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testSaveRestoreExhaustedIterator(self):
+
+    def _build_graph(start, stop, num_epochs):
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop).repeat(num_epochs))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      save_op = self._save_op(iterator._iterator_resource)
+      restore_op = self._restore_op(iterator._iterator_resource)
+      return init_op, get_next, save_op, restore_op
+
+    start = 2
+    stop = 10
+    num_epochs = 5
+    with ops.Graph().as_default() as g:
+      init_op, get_next, save_op, restore_op = _build_graph(
+          start, stop, num_epochs)
+      with self.session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        # Note: There is no checkpoint saved currently so a NotFoundError is
+        # raised.
+        with self.assertRaises(errors.NotFoundError):
+          sess.run(restore_op)
+        for _ in range(num_epochs):
+          for i in range(start, stop):
+            self.assertEqual(i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+        sess.run(save_op)
+
+    with ops.Graph().as_default() as g:
+      init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
+      with self.session(graph=g) as sess:
+        sess.run(restore_op)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
deleted file mode 100644
index bc6b36285aa417e6812e44e97e4f3a30ceb8e6a0..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
+++ /dev/null
@@ -1,650 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import numpy as np
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.platform import test
-
-
-class DatasetConstructorTest(test_base.DatasetTestBase):
-
-  def testFromTensors(self):
-    """Test a dataset that represents a single tuple of tensors."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorsSparse(self):
-    """Test a dataset that represents a single tuple of tensors."""
-    components = (sparse_tensor.SparseTensorValue(
-        indices=np.array([[0]]),
-        values=np.array([0]),
-        dense_shape=np.array([1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1]]),
-                      values=np.array([-1, 1]),
-                      dense_shape=np.array([2, 2])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(
-        [tensor_shape.TensorShape(c.dense_shape) for c in components],
-        [shape for shape in iterator.output_shapes])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        self.assertSparseValuesEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorsMixed(self):
-    """Test an dataset that represents a single tuple of tensors."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0]]),
-                      values=np.array([0]),
-                      dense_shape=np.array([1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1]]),
-                      values=np.array([-1, 1]),
-                      dense_shape=np.array([2, 2])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([
-        tensor_shape.TensorShape(c.dense_shape)
-        if sparse_tensor.is_sparse(c) else c.shape for c in components
-    ], [shape for shape in iterator.output_shapes])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        if sparse_tensor.is_sparse(component):
-          self.assertSparseValuesEqual(component, result_component)
-        else:
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlices(self):
-    """Test a dataset that represents the slices from a tuple of tensors."""
-    components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
-            np.array([[12], [13], [14], [15]]), 22),
-        np.array([37.0, 38.0, 39.0, 40.0])
-    )
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(4):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesSparse(self):
-    """Test a dataset that represents the slices from a tuple of tensors."""
-    components = (sparse_tensor.SparseTensorValue(
-        indices=np.array([[0, 0], [1, 0], [2, 0]]),
-        values=np.array([0, 0, 0]),
-        dense_shape=np.array([3, 1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
-                      values=np.array([1, 2, 3]),
-                      dense_shape=np.array([3, 3])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(
-        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
-        [shape for shape in iterator.output_shapes])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[0]]),
-               values=np.array([1]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[1]]),
-               values=np.array([2]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[2]]),
-               values=np.array([3]),
-               dense_shape=np.array([3]))),
-      ]
-      for i in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(expected[i], results):
-          self.assertSparseValuesEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesMixed(self):
-    """Test a dataset that represents the slices from a tuple of tensors."""
-    components = (np.tile(np.array([[1], [2], [3]]), 20),
-                  np.tile(np.array([[12], [13], [14]]), 22),
-                  np.array([37.0, 38.0, 39.0]),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
-                      values=np.array([0, 0, 0]),
-                      dense_shape=np.array([3, 1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
-                      values=np.array([1, 2, 3]),
-                      dense_shape=np.array([3, 3])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([
-        tensor_shape.TensorShape(c.dense_shape[1:])
-        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
-    ], [shape for shape in iterator.output_shapes])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[0]]),
-               values=np.array([1]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[1]]),
-               values=np.array([2]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[2]]),
-               values=np.array([3]),
-               dense_shape=np.array([3]))),
-      ]
-      for i in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            (list(zip(*components[:3]))[i] + expected[i]), results):
-          if sparse_tensor.is_sparse(component):
-            self.assertSparseValuesEqual(component, result_component)
-          else:
-            self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesWithDict(self):
-    components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(dtypes.int32, iterator.output_types["foo"])
-    self.assertEqual(dtypes.float32, iterator.output_types["bar"])
-    self.assertEqual((), iterator.output_shapes["foo"])
-    self.assertEqual((1,), iterator.output_shapes["bar"])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(3):
-        results = sess.run(get_next)
-        self.assertEqual(components["foo"][i], results["foo"])
-        self.assertEqual(components["bar"][i], results["bar"])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromSparseTensorSlices(self):
-    """Test a dataset based on slices of a `tf.SparseTensor`."""
-    st = array_ops.sparse_placeholder(dtypes.float64)
-    iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
-
-    with self.cached_session() as sess:
-      slices = [[1., 2., 3.], [1.], [1.], [1., 2.], [], [1., 2.], [], [], []]
-
-      # Test with sparse tensor in the appropriate order.
-      indices = np.array(
-          [[i, j] for i in range(len(slices)) for j in range(len(slices[i]))])
-      values = np.array([val for s in slices for val in s])
-      dense_shape = np.array([len(slices), max(len(s) for s in slices) + 1])
-      sparse_feed = sparse_tensor.SparseTensorValue(indices, values,
-                                                    dense_shape)
-      sess.run(init_op, feed_dict={st: sparse_feed})
-      for i, s in enumerate(slices):
-        results = sess.run(get_next)
-        self.assertAllEqual(s, results.values)
-        expected_indices = np.array(
-            [[j] for j in range(len(slices[i]))]).reshape([-1, 1])
-        self.assertAllEqual(expected_indices, results.indices)
-        self.assertAllEqual(dense_shape[1:], results.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test with sparse tensor in the reverse order, which is not
-      # currently supported.
-      reverse_order_indices = indices[::-1, :]
-      reverse_order_values = values[::-1]
-      sparse_feed = sparse_tensor.SparseTensorValue(
-          reverse_order_indices, reverse_order_values, dense_shape)
-      with self.assertRaises(errors.UnimplementedError):
-        sess.run(init_op, feed_dict={st: sparse_feed})
-
-      # Test with an empty sparse tensor.
-      empty_indices = np.empty((0, 4), dtype=np.int64)
-      empty_values = np.empty((0,), dtype=np.float64)
-      empty_dense_shape = [0, 4, 37, 9]
-      sparse_feed = sparse_tensor.SparseTensorValue(empty_indices, empty_values,
-                                                    empty_dense_shape)
-      sess.run(init_op, feed_dict={st: sparse_feed})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  # pylint: disable=g-long-lambda,unnecessary-lambda
-  def testNestedStructure(self):
-    components = (np.array([1, 2, 3], dtype=np.int64),
-                  (np.array([4., 5.]), np.array([6., 7.])),
-                  np.array([8, 9, 10], dtype=np.int64))
-
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.shuffle(10, 10)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.repeat(-1)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.filter(lambda x, y, z: True)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.take(5)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
-
-    dataset = dataset.flat_map(
-        lambda x, y: dataset_ops.Dataset.from_tensors(((x[0], x[1]),
-                                                       (y[0], y[1])))
-    )
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
-
-    dataset = dataset.batch(32)
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([None, 3], [None, 3]), ([None, 2], [None, 2])),
-                      nest.pack_sequence_as(dataset.output_shapes, [
-                          s.as_list()
-                          for s in nest.flatten(dataset.output_shapes)
-                      ]))
-
-    iterator = dataset.make_one_shot_iterator()
-    (w, x), (y, z) = iterator.get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
-
-    iterator = dataset.make_initializable_iterator()
-    (w, x), (y, z) = iterator.get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
-
-    # Define a separate set of components with matching leading
-    # dimension for the from-slices constructor.
-    components_for_slices = (np.array([1, 2, 3], dtype=np.int64),
-                             (np.array([4., 5., 6.]),
-                              np.array([7., 8., 9.])),
-                             np.array([10, 11, 12], dtype=np.int64))
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(components_for_slices)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([], ([], []), []), dataset.output_shapes)
-
-  def testNestedDict(self):
-    components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int32, dataset.output_types["a"]["aa"])
-    self.assertEquals(dtypes.float32, dataset.output_types["a"]["ab"])
-    self.assertEquals(dtypes.int32, dataset.output_types["b"])
-    self.assertEquals([], dataset.output_shapes["a"]["aa"])
-    self.assertEquals([2], dataset.output_shapes["a"]["ab"])
-    self.assertEquals([3], dataset.output_shapes["b"])
-
-  def testNonSequenceNestedStructure(self):
-    components = np.array([1, 2, 3], dtype=np.int64)
-
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    dataset = dataset.filter(
-        lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    dataset = dataset.map(lambda x: array_ops.stack([x, x]))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([2, 3], dataset.output_shapes)
-
-    dataset = dataset.flat_map(
-        lambda x: dataset_ops.Dataset.from_tensor_slices(x))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    self.assertEquals(dtypes.int64, get_next.dtype)
-    self.assertEquals([3], get_next.shape)
-
-  def testSplitPipelineFailsWithPlacementError(self):
-    with session.Session(
-        target="",
-        config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
-
-      dataset = dataset_ops.Dataset.from_tensors(0)
-
-      # Define a pipeline that attempts to use variables on two
-      # different devices.
-      #
-      # Initialize the variables before creating to iterator, to avoid the
-      # placement algorithm overriding the DT_RESOURCE colocation constraints.
-      with ops.device("/cpu:0"):
-        var_0 = resource_variable_ops.ResourceVariable(initial_value=0)
-        dataset = dataset.map(lambda x: x + var_0.read_value())
-      sess.run(var_0.initializer)
-
-      with ops.device("/cpu:1"):
-        var_1 = resource_variable_ops.ResourceVariable(initial_value=0)
-        dataset = dataset.map(lambda x: x + var_1.read_value())
-      sess.run(var_1.initializer)
-
-      iterator = dataset.make_initializable_iterator()
-      sess.run(iterator.initializer)
-
-      with self.assertRaisesRegexp(
-          errors.FailedPreconditionError,
-          "Error while reading resource variable Variable"):
-        sess.run(iterator.get_next())
-
-
-class DatasetConstructorBenchmark(test.Benchmark):
-
-  def benchmarkSliceRepeatBatch(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
-
-    input_data = np.random.randn(input_size)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        sess.run(next_element)
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          sess.run(next_element)
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print("Slice/repeat/batch with sess.run() input size: %d batch size: %d "
-          "Median wall time per element: %f" % (input_size, batch_size,
-                                                median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="benchmark_slice_repeat_batch_input_%d_batch_%d" % (input_size,
-                                                                 batch_size))
-
-  def benchmarkSliceRepeatBatchCallable(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
-
-    input_data = np.random.randn(input_size)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print(
-        "Slice/repeat/batch with callable input size: %d batch size: %d Median"
-        " wall time per element: %f" % (input_size, batch_size,
-                                        median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="benchmark_slice_repeat_batch_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkReshapeSliceRepeatCallable(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
-
-    input_data = np.random.randn(input_size)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
-        .repeat(num_epochs + 1))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print("Reshape/slice/repeat with callable input size: %d batch size: %d "
-          "Median wall time per element: %f" % (input_size, batch_size,
-                                                median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="benchmark_reshape_slice_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkSliceBatchCacheRepeatCallable(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
-
-    input_data = np.random.randn(input_size)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
-        .cache().repeat(num_epochs + 1))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print(
-        "Slice/batch/cache/repeat with callable input size: %d batch size: %d "
-        "Median wall time per element: %f"
-        % (input_size, batch_size, median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="benchmark_slice_batch_cache_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
deleted file mode 100644
index cb8cb9a77df0b897a87dfecb96395c1bbee450b0..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
+++ /dev/null
@@ -1,482 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import threading
-
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import script_ops
-from tensorflow.python.platform import test
-
-
-class DatasetConstructorTest(test_base.DatasetTestBase):
-
-  def _testFromGenerator(self, generator, elem_sequence, num_repeats,
-                         output_types=None):
-    if output_types is None:
-      output_types = dtypes.int64
-    iterator = (
-        dataset_ops.Dataset.from_generator(generator, output_types=output_types)
-        .repeat(num_repeats)
-        .prefetch(5)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(2):  # Run twice to test reinitialization.
-        sess.run(init_op)
-        for _ in range(num_repeats):
-          for elem in elem_sequence:
-            self.assertAllEqual(elem, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def _testFromGeneratorOneShot(self, generator, elem_sequence, num_repeats):
-    iterator = (
-        dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
-        .repeat(num_repeats)
-        .prefetch(5)
-        .make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(num_repeats):
-        for elem in elem_sequence:
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorUsingFunction(self):
-    def generator():
-      for i in range(1, 100):
-        yield [i] * i
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-    self._testFromGeneratorOneShot(generator, elem_sequence, 1)
-    self._testFromGeneratorOneShot(generator, elem_sequence, 5)
-
-  def testFromGeneratorUsingList(self):
-    generator = lambda: [[i] * i for i in range(1, 100)]
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-
-  def testFromGeneratorUsingNdarray(self):
-    generator = lambda: np.arange(100, dtype=np.int64)
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1, output_types=np.int64)
-    self._testFromGenerator(generator, elem_sequence, 5, output_types=np.int64)
-
-  def testFromGeneratorUsingGeneratorExpression(self):
-    # NOTE(mrry): Generator *expressions* are not repeatable (or in
-    # general reusable), because they eagerly evaluate the `for`
-    # expression as `iter(range(1, 100))` and discard the means of
-    # reconstructing `range(1, 100)`. Wrapping the generator
-    # expression in a `lambda` makes it repeatable.
-    generator = lambda: ([i] * i for i in range(1, 100))
-    elem_sequence = list(generator())
-    self._testFromGenerator(generator, elem_sequence, 1)
-    self._testFromGenerator(generator, elem_sequence, 5)
-
-  def testFromMultipleConcurrentGenerators(self):
-    num_inner_repeats = 5
-    num_outer_repeats = 100
-
-    def generator():
-      for i in range(1, 10):
-        yield ([i] * i, [i, i ** 2, i ** 3])
-    input_list = list(generator())
-
-    # The interleave transformation is essentially a flat map that
-    # draws from multiple input datasets concurrently (in a cyclic
-    # fashion). By placing `Datsaet.from_generator()` inside an
-    # interleave, we test its behavior when multiple iterators are
-    # active at the same time; by additionally prefetching inside the
-    # interleave, we create the possibility of parallel (modulo GIL)
-    # invocations to several iterators created by the same dataset.
-    def interleave_fn(_):
-      return (dataset_ops.Dataset.from_generator(
-          generator, output_types=(dtypes.int64, dtypes.int64),
-          output_shapes=([None], [3]))
-              .repeat(num_inner_repeats).prefetch(5))
-
-    iterator = (
-        dataset_ops.Dataset.range(num_outer_repeats)
-        .interleave(interleave_fn, cycle_length=10,
-                    block_length=len(input_list))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(num_inner_repeats * num_outer_repeats):
-        for elem in input_list:
-          val0, val1 = sess.run(get_next)
-          self.assertAllEqual(elem[0], val0)
-          self.assertAllEqual(elem[1], val1)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  # TODO(b/67868766): Reenable this when the source of flakiness is discovered.
-  def _testFromGeneratorsRunningInParallel(self):
-    num_parallel_iterators = 3
-
-    # Define shared state that multiple iterator instances will access to
-    # demonstrate their concurrent activity.
-    lock = threading.Lock()
-    condition = threading.Condition(lock)
-    next_ticket = [0]  # GUARDED_BY(lock)
-
-    def generator():
-      # NOTE(mrry): We yield one element before the barrier, because
-      # the current implementation of `Dataset.interleave()` must
-      # fetch one element from each incoming dataset to start the
-      # prefetching.
-      yield 0
-
-      # Define a barrier that `num_parallel_iterators` iterators must enter
-      # before any can proceed. Demonstrates that multiple iterators may be
-      # active at the same time.
-      condition.acquire()
-      ticket = next_ticket[0]
-      next_ticket[0] += 1
-      if ticket == num_parallel_iterators - 1:
-        # The last iterator to join the barrier notifies the others.
-        condition.notify_all()
-      else:
-        # Wait until the last iterator enters the barrier.
-        while next_ticket[0] < num_parallel_iterators:
-          condition.wait()
-      condition.release()
-
-      yield 1
-
-    # As in `testFromMultipleConcurrentGenerators()`, we use a combination of
-    # `Dataset.interleave()` and `Dataset.prefetch()` to cause multiple
-    # iterators to be active concurrently.
-    def interleave_fn(_):
-      return dataset_ops.Dataset.from_generator(
-          generator, output_types=dtypes.int64, output_shapes=[]).prefetch(2)
-
-    iterator = (
-        dataset_ops.Dataset.range(num_parallel_iterators)
-        .interleave(
-            interleave_fn, cycle_length=num_parallel_iterators, block_length=1)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for elem in [0, 1]:
-        for _ in range(num_parallel_iterators):
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorImplicitConversion(self):
-    def generator():
-      yield [1]
-      yield [2]
-      yield [3]
-
-    for dtype in [dtypes.int8, dtypes.int32, dtypes.int64]:
-      iterator = (dataset_ops.Dataset.from_generator(
-          generator, output_types=dtype, output_shapes=[1])
-                  .make_initializable_iterator())
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      self.assertEqual(dtype, get_next.dtype)
-
-      with self.cached_session() as sess:
-        sess.run(init_op)
-        for expected in [[1], [2], [3]]:
-          next_val = sess.run(get_next)
-          self.assertEqual(dtype.as_numpy_dtype, next_val.dtype)
-          self.assertAllEqual(expected, next_val)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testFromGeneratorString(self):
-    def generator():
-      yield "foo"
-      yield b"bar"
-      yield u"baz"
-
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.string, output_shapes=[])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for expected in [b"foo", b"bar", b"baz"]:
-        next_val = sess.run(get_next)
-        self.assertAllEqual(expected, next_val)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorTypeError(self):
-    def generator():
-      yield np.array([1, 2, 3], dtype=np.int64)
-      yield np.array([4, 5, 6], dtype=np.int64)
-      yield "ERROR"
-      yield np.array([7, 8, 9], dtype=np.int64)
-
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError("The expected type was int64"):
-        sess.run(get_next)
-      self.assertAllEqual([7, 8, 9], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorShapeError(self):
-    def generator():
-      yield np.array([1, 2, 3], dtype=np.int64)
-      yield np.array([4, 5, 6], dtype=np.int64)
-      yield np.array([7, 8, 9, 10], dtype=np.int64)
-      yield np.array([11, 12, 13], dtype=np.int64)
-
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError(r"element of shape \(3,\) was expected"):
-        sess.run(get_next)
-      self.assertAllEqual([11, 12, 13], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorStructureError(self):
-    def generator():
-      yield 1, 2
-      yield 3, 4
-      yield 5
-      yield 6, 7, 8
-      yield 9, 10
-
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=(dtypes.int64, dtypes.int64))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertEqual((1, 2), sess.run(get_next))
-      self.assertEqual((3, 4), sess.run(get_next))
-      with self.assertRaisesOpError(
-          r"The expected structure was \(tf\.int64, tf\.int64\)"):
-        sess.run(get_next)
-      with self.assertRaisesOpError(
-          r"The expected structure was \(tf\.int64, tf\.int64\)"):
-        sess.run(get_next)
-      self.assertEqual((9, 10), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorHeterogeneous(self):
-    def generator():
-      yield 1
-      yield [2, 3]
-
-    iterator = (
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(1, sess.run(get_next))
-      self.assertAllEqual([2, 3], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorStopShort(self):
-
-    def generator():
-      yield 0
-      yield 1
-      yield 2
-
-    iterator = (
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(0, sess.run(get_next))
-      self.assertAllEqual(1, sess.run(get_next))
-
-  def testFromGeneratorDestructorCalled(self):
-    # Use an `Event` to signal that the generator has been deleted.
-    event = threading.Event()
-
-    class GeneratorWrapper(object):
-
-      def __iter__(self):
-        return self
-
-      def next(self):
-        return self.__next__()
-
-      def __next__(self):
-        return 42
-
-      def __del__(self):
-        event.set()
-
-    iterator = dataset_ops.Dataset.from_generator(
-        GeneratorWrapper,
-        output_types=dtypes.int64).take(2).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(42, sess.run(get_next))
-      self.assertAllEqual(42, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      # Test that `GeneratorWrapper` object is destroyed when the
-      # iterator terminates (and the generator iterator is deleted).
-      self.assertTrue(event.is_set())
-
-  def testFromGeneratorWithArgs(self):
-
-    def flat_map_fn(elem):
-
-      def generator_with_arg(n):
-        for _ in range(n):
-          yield np.array(n, dtype=np.int64)
-
-      return dataset_ops.Dataset.from_generator(
-          generator_with_arg, output_types=dtypes.int64, output_shapes=(),
-          args=(elem,))
-
-    iterator = (dataset_ops.Dataset
-                .range(5)
-                .flat_map(flat_map_fn)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
-      for x in expected:
-        self.assertEqual(x, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromGeneratorWithTwoArgs(self):
-
-    def flat_map_fn(elem, message):
-
-      def generator_with_arg(n, msg):
-        for i in range(n):
-          yield i, msg
-
-      return dataset_ops.Dataset.from_generator(
-          generator_with_arg, output_types=(dtypes.int64, dtypes.string),
-          output_shapes=((), ()), args=(elem, message))
-
-    iterator = (
-        dataset_ops.Dataset.zip(
-            (dataset_ops.Dataset.range(5),
-             dataset_ops.Dataset.from_tensors("Hi!").repeat(None)))
-        .flat_map(flat_map_fn)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [(0, b"Hi!"),
-                  (0, b"Hi!"), (1, b"Hi!"),
-                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"),
-                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"), (3, b"Hi!")]
-      for x in expected:
-        self.assertEqual(x, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testGeneratorDatasetFinalizeFunctionCalled(self):
-    # NOTE(mrry): This test tests the internal `_GeneratorDataset`,
-    # which affords more control over what the finalize function can do than
-    # the `Dataset.from_generator()` wrapper.
-
-    # Use an `Event` to signal that the generator has been deleted.
-    event = threading.Event()
-
-    def finalize_fn(_):
-      def finalize_py_func():
-        event.set()
-        return 0
-      return script_ops.py_func(finalize_py_func, [], [dtypes.int64],
-                                stateful=True)
-
-    dummy = constant_op.constant(37)
-    iterator = (dataset_ops._GeneratorDataset(dummy, lambda x: x,
-                                              lambda x: x, finalize_fn)
-                .take(2)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(37, sess.run(get_next))
-      self.assertAllEqual(37, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-        self.assertTrue(event.is_set())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/dataset_ops_test.py b/tensorflow/python/data/kernel_tests/dataset_ops_test.py
deleted file mode 100644
index b9f8875b9f65439585ff8b1642054e14d06875cf..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/dataset_ops_test.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the input pipeline ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.core.framework import graph_pb2
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.platform import test
-
-
-class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  def testAsSerializedGraph(self):
-    dataset = dataset_ops.Dataset.range(10)
-    with self.cached_session() as sess:
-      graph = graph_pb2.GraphDef().FromString(
-          sess.run(dataset._as_serialized_graph()))
-      self.assertTrue(any([node.op != "RangeDataset" for node in graph.node]))
-
-  @staticmethod
-  def make_apply_fn(dataset):
-
-    def apply_fn(dataset):
-
-      def _apply_fn(dataset):
-        return dataset.cache()
-
-      return dataset.apply(_apply_fn)
-
-    return apply_fn
-
-  @staticmethod
-  def make_gen():
-
-    def gen():
-      yield 42
-
-    return gen
-
-  @staticmethod
-  def make_interleave_fn(dataset, num_parallel_calls=None):
-
-    def interleave_fn(dataset):
-      return dataset.interleave(
-          lambda x: dataset_ops.Dataset.range(0),
-          cycle_length=2,
-          num_parallel_calls=num_parallel_calls)
-
-    return interleave_fn
-
-  @parameterized.named_parameters(
-      ("FixedLengthRecord", readers.FixedLengthRecordDataset("", 42)),
-      ("FromGenerator",
-       dataset_ops.Dataset.from_generator(make_gen.__func__(), dtypes.int32),
-       1),
-      ("FromSparseTensorSlices",
-       dataset_ops.Dataset.from_sparse_tensor_slices(
-           sparse_tensor.SparseTensor(
-               indices=np.array([[0, 0], [1, 0], [2, 0]]),
-               values=np.array([0, 0, 0]),
-               dense_shape=np.array([3, 1])))),
-      ("FromTensors", dataset_ops.Dataset.from_tensors([42])),
-      ("FromTensorSlices", dataset_ops.Dataset.from_tensors([42])),
-      ("Range", dataset_ops.Dataset.range(10)),
-      ("TextLine", readers.TextLineDataset("")),
-      ("TFRecord", readers.TFRecordDataset(""), 1),
-  )
-  def testDatasetSourceInputs(self, dataset, num_inputs=0):
-    self.assertEqual(num_inputs, len(dataset._inputs()))
-
-  @parameterized.named_parameters(
-      ("Apply", make_apply_fn.__func__(dataset_ops.Dataset.range(0)),
-       dataset_ops.Dataset.range(0)),
-      ("Batch", lambda x: x.batch(10), dataset_ops.Dataset.range(0)),
-      ("Cache", lambda x: x.cache(), dataset_ops.Dataset.range(0)),
-      ("Filter", lambda x: x.filter(lambda x: True),
-       dataset_ops.Dataset.range(0)),
-      ("FlatMap", lambda x: x.flat_map(lambda x: dataset_ops.Dataset.range(0)),
-       dataset_ops.Dataset.range(0)),
-      ("Interleave", make_interleave_fn.__func__(dataset_ops.Dataset.range(0)),
-       dataset_ops.Dataset.range(0)),
-      ("Map", lambda x: x.map(lambda x: x), dataset_ops.Dataset.range(0)),
-      ("PaddedBatch", lambda x: x.padded_batch(10, []),
-       dataset_ops.Dataset.range(0)),
-      ("ParallelInterleave",
-       make_interleave_fn.__func__(dataset_ops.Dataset.range(0), 2),
-       dataset_ops.Dataset.range(0)),
-      ("ParallelMap", lambda x: x.map(lambda x: x, num_parallel_calls=2),
-       dataset_ops.Dataset.range(0)),
-      ("Repeat", lambda x: x.repeat(), dataset_ops.Dataset.range(0)),
-      ("Shuffle", lambda x: x.shuffle(10), dataset_ops.Dataset.range(0)),
-      ("Skip", lambda x: x.skip(1), dataset_ops.Dataset.range(0)),
-      ("Take", lambda x: x.take(1), dataset_ops.Dataset.range(0)),
-      ("Window", lambda x: x.window(10), dataset_ops.Dataset.range(0)),
-  )
-  def testUnaryTransformationInputs(self, dataset_fn, input_dataset):
-    self.assertEqual([input_dataset], dataset_fn(input_dataset)._inputs())
-
-  @parameterized.named_parameters(
-      ("Concatenate", lambda x, y: x.concatenate(y),
-       dataset_ops.Dataset.range(0), dataset_ops.Dataset.range(1)))
-  def testBinaryTransformationInputs(self, dataset_fn, input1, input2):
-    self.assertEqual([input1, input2], dataset_fn(input1, input2)._inputs())
-
-  @parameterized.named_parameters(
-      ("ZipOne", dataset_ops.Dataset.zip, (dataset_ops.Dataset.range(0))),
-      ("ZipNest", dataset_ops.Dataset.zip,
-       (dataset_ops.Dataset.range(0),
-        (dataset_ops.Dataset.range(1), dataset_ops.Dataset.range(2)))),
-      ("ZipTuple", dataset_ops.Dataset.zip,
-       (dataset_ops.Dataset.range(0), dataset_ops.Dataset.range(1))))
-  def testVariadicTransformationInputs(self, dataset_fn, input_datasets):
-    self.assertEqual(
-        nest.flatten(input_datasets),
-        dataset_fn(input_datasets)._inputs())
-
-  def testCollectInputs(self):
-    ds1 = dataset_ops.Dataset.range(0)
-    ds2 = ds1.concatenate(ds1)
-    ds3 = dataset_ops.Dataset.zip((ds2, ds1, ds2))
-
-    inputs = []
-    queue = [ds3]
-    while queue:
-      ds = queue[0]
-      queue = queue[1:]
-      queue.extend(ds._inputs())
-      inputs.append(ds)
-
-    self.assertEqual(5, inputs.count(ds1))
-    self.assertEqual(2, inputs.count(ds2))
-    self.assertEqual(1, inputs.count(ds3))
-
-  def testOptionsDefault(self):
-    ds = dataset_ops.Dataset.range(0)
-    self.assertEqual(dataset_ops.Options(), ds.options())
-
-  def testOptionsOnce(self):
-    options = dataset_ops.Options()
-    ds = dataset_ops.Dataset.range(0).with_options(options).cache()
-    self.assertEqual(options, ds.options())
-
-  def testOptionsTwiceSame(self):
-    options = dataset_ops.Options()
-    options.experimental_autotune = True
-    ds = dataset_ops.Dataset.range(0).with_options(options).with_options(
-        options)
-    self.assertEqual(options, ds.options())
-
-  def testOptionsTwiceDifferent(self):
-    options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
-    options2 = dataset_ops.Options()
-    options2.experimental_filter_fusion = False
-    ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
-        options2)
-    self.assertTrue(ds.options().experimental_autotune)
-    self.assertFalse(ds.options().experimental_filter_fusion)
-
-  def testOptionsTwiceDifferentError(self):
-    options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
-    options2 = dataset_ops.Options()
-    options2.experimental_autotune = False
-    with self.assertRaisesRegexp(ValueError,
-                                 "Cannot merge incompatible values of option"):
-      dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/dataset_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2952c08be02b76fb221ee0f31f4b9fc34a14d659
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/dataset_test.py
@@ -0,0 +1,317 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import optional_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def testAsSerializedGraph(self):
+    dataset = dataset_ops.Dataset.range(10)
+    graph = graph_pb2.GraphDef().FromString(
+        self.evaluate(dataset._as_serialized_graph()))
+    self.assertTrue(any([node.op != "RangeDataset" for node in graph.node]))
+
+  @staticmethod
+  def make_apply_fn(dataset):
+
+    def apply_fn(dataset):
+
+      def _apply_fn(dataset):
+        return dataset.cache()
+
+      return dataset.apply(_apply_fn)
+
+    return apply_fn
+
+  @staticmethod
+  def make_gen():
+
+    def gen():
+      yield 42
+
+    return gen
+
+  @staticmethod
+  def make_interleave_fn(dataset, num_parallel_calls=None):
+
+    def interleave_fn(dataset):
+      return dataset.interleave(
+          lambda x: dataset_ops.Dataset.range(0),
+          cycle_length=2,
+          num_parallel_calls=num_parallel_calls)
+
+    return interleave_fn
+
+  @parameterized.named_parameters(
+      ("FixedLengthRecord",
+       lambda: readers.FixedLengthRecordDataset("", 42)),
+      ("FromGenerator",
+       lambda: dataset_ops.Dataset.from_generator(
+           DatasetTest.make_gen(), dtypes.int32),
+       1),
+      ("FromTensors", lambda: dataset_ops.Dataset.from_tensors([42])),
+      ("FromTensorSlices", lambda: dataset_ops.Dataset.from_tensors([42])),
+      ("Range", lambda: dataset_ops.Dataset.range(10)),
+      ("TextLine", lambda: readers.TextLineDataset("")),
+      ("TFRecord", lambda: readers.TFRecordDataset(""), 1),
+  )
+  def testDatasetSimpleSourceInputs(self, dataset_fn, num_inputs=0):
+    self.assertEqual(num_inputs, len(dataset_fn()._inputs()))
+
+  def testDatasetComplexSourceInputs(self):
+    dataset_fn = dataset_ops.Dataset.from_sparse_tensor_slices(
+        sparse_tensor.SparseTensor(
+            indices=np.array([[0, 0], [1, 0], [2, 0]]),
+            values=np.array([0, 0, 0]),
+            dense_shape=np.array([3, 1])))
+    self.assertEqual(0, len(dataset_fn._inputs()))
+
+  @parameterized.named_parameters(
+      ("Batch",
+       lambda x: x.batch(10),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("Cache",
+       lambda x: x.cache(),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("Filter",
+       lambda x: x.filter(lambda x: True),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("FlatMap",
+       lambda x: x.flat_map(lambda x: dataset_ops.Dataset.range(0)),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("Map",
+       lambda x: x.map(lambda x: x),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("PaddedBatch",
+       lambda x: x.padded_batch(10, []),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("ParallelMap",
+       lambda x: x.map(lambda x: x, num_parallel_calls=2),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("Repeat",
+       lambda x: x.repeat(),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("Shuffle",
+       lambda x: x.shuffle(10),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("Skip",
+       lambda x: x.skip(1),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("Take",
+       lambda x: x.take(1),
+       lambda: dataset_ops.Dataset.range(0)),
+      ("Window",
+       lambda x: x.window(10),
+       lambda: dataset_ops.Dataset.range(0)),
+  )
+  def testUnaryTransformationInputs(self, dataset_fn, input_dataset_fn):
+    input_dataset = input_dataset_fn()
+    self.assertEqual([input_dataset], dataset_fn(input_dataset)._inputs())
+
+  def testUnaryTransformationInputsApply(self):
+    input_dataset = dataset_ops.Dataset.range(0)
+    dataset_fn = self.make_apply_fn(dataset_ops.Dataset.range(0))
+    self.assertEqual([input_dataset], dataset_fn(input_dataset)._inputs())
+
+  @parameterized.named_parameters(
+      ("ParallelInterleave",
+       [lambda: dataset_ops.Dataset.range(0), 2],
+       lambda: dataset_ops.Dataset.range(0)),
+      ("Interleave",
+       [lambda: dataset_ops.Dataset.range(0), None],
+       lambda: dataset_ops.Dataset.range(0)),
+  )
+  def testUnaryTransformationInputsWithInterleaveFn(
+      self, interleave_fn_args, input_dataset_fn):
+    input_dataset = input_dataset_fn()
+    dataset_fn = self.make_interleave_fn(*interleave_fn_args)
+    self.assertEqual([input_dataset], dataset_fn(input_dataset)._inputs())
+
+  @parameterized.named_parameters(
+      ("Concatenate", lambda x, y: x.concatenate(y),
+       lambda: dataset_ops.Dataset.range(0),
+       lambda: dataset_ops.Dataset.range(1)))
+  def testBinaryTransformationInputs(self, dataset_fn, input1_fn, input2_fn):
+    input1 = input1_fn()
+    input2 = input2_fn()
+    self.assertEqual([input1, input2], dataset_fn(input1, input2)._inputs())
+
+  @parameterized.named_parameters(
+      ("ZipOne",
+       dataset_ops.Dataset.zip,
+       lambda: (dataset_ops.Dataset.range(0))),
+      ("ZipNest",
+       dataset_ops.Dataset.zip,
+       lambda: (dataset_ops.Dataset.range(0),
+                (dataset_ops.Dataset.range(1),
+                 dataset_ops.Dataset.range(2)))),
+      ("ZipTuple",
+       dataset_ops.Dataset.zip,
+       lambda: (dataset_ops.Dataset.range(0),
+                dataset_ops.Dataset.range(1))),
+  )
+  def testVariadicTransformationInputs(self, dataset_fn, input_datasets_fn):
+    input_datasets = input_datasets_fn()
+    self.assertEqual(
+        nest.flatten(input_datasets),
+        dataset_fn(input_datasets)._inputs())
+
+  def testCollectInputs(self):
+    ds1 = dataset_ops.Dataset.range(0)
+    ds2 = ds1.concatenate(ds1)
+    ds3 = dataset_ops.Dataset.zip((ds2, ds1, ds2))
+
+    inputs = []
+    queue = [ds3]
+    while queue:
+      ds = queue[0]
+      queue = queue[1:]
+      queue.extend(ds._inputs())
+      inputs.append(ds)
+
+    self.assertEqual(5, inputs.count(ds1))
+    self.assertEqual(2, inputs.count(ds2))
+    self.assertEqual(1, inputs.count(ds3))
+
+  def testOptionsDefault(self):
+    ds = dataset_ops.Dataset.range(0)
+    self.assertEqual(dataset_ops.Options(), ds.options())
+
+  def testOptionsOnce(self):
+    options = dataset_ops.Options()
+    ds = dataset_ops.Dataset.range(0).with_options(options).cache()
+    self.assertEqual(options, ds.options())
+
+  def testOptionsTwiceSame(self):
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    ds = dataset_ops.Dataset.range(0).with_options(options).with_options(
+        options)
+    self.assertEqual(options, ds.options())
+
+  def testOptionsTwiceDifferent(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_deterministic = False
+    ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
+        options2)
+    self.assertTrue(ds.options().experimental_autotune)
+    # Explicitly check that flag is False since assertFalse allows None
+    self.assertIs(ds.options().experimental_deterministic, False)
+
+  def testOptionsTwiceDifferentError(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_autotune = False
+    with self.assertRaisesRegexp(ValueError,
+                                 "Cannot merge incompatible values"):
+      dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
+
+  def testOptionsMergeOptionsFromMultipleInputs(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_deterministic = True
+    ds = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(0).with_options(options1),
+         dataset_ops.Dataset.range(0).with_options(options2)))
+    self.assertTrue(ds.options().experimental_autotune)
+    self.assertTrue(ds.options().experimental_deterministic)
+
+  # TODO(b/119882922): use-after-free bug in eager mode.
+  # pylint: disable=g-long-lambda
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant(37.0),
+       structure.TensorStructure(dtypes.float32, [])),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
+          indices=[[0]], values=constant_op.constant([0], dtype=dtypes.int32),
+          dense_shape=[1]),
+       structure.SparseTensorStructure(dtypes.int32, [1])),
+      ("Nest", lambda: {
+          "a": constant_op.constant(37.0),
+          "b": (constant_op.constant(["Foo"]), constant_op.constant("Bar"))},
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, []),
+           "b": (structure.TensorStructure(dtypes.string, [1]),
+                 structure.TensorStructure(dtypes.string, []))})),
+      ("Dataset", lambda: dataset_ops.Dataset.from_tensor_slices(
+          constant_op.constant([1, 2, 3])),
+       dataset_ops.DatasetStructure(
+           structure.TensorStructure(dtypes.int32, []))),
+      ("Optional", lambda: optional_ops.Optional.from_value(37.0),
+       optional_ops.OptionalStructure(
+           structure.TensorStructure(dtypes.float32, []))),
+  )
+  def testSkipEagerDatasetStructure(self, tf_value_fn,
+                                    expected_element_structure):
+    dataset = dataset_ops.Dataset.from_tensors(0).map(lambda _: tf_value_fn())
+    dataset_structure = structure.Structure.from_value(dataset)
+    self.assertIsInstance(dataset_structure, dataset_ops.DatasetStructure)
+
+    # TODO(b/110122868): Add a public API to `tf.data.Dataset` for accessing
+    # the element structure.
+    self.assertTrue(expected_element_structure.is_compatible_with(
+        dataset_structure._element_structure))
+    self.assertTrue(dataset_structure._element_structure.is_compatible_with(
+        expected_element_structure))
+
+    self.assertEqual([dtypes.variant], dataset_structure._flat_types)
+    self.assertEqual([tensor_shape.scalar()], dataset_structure._flat_shapes)
+
+    # Assert that the `Dataset` survives a round-trip via _from_tensor_list()
+    # and _to_tensor_list().
+    round_trip_dataset = dataset_structure._from_tensor_list(
+        dataset_structure._to_tensor_list(dataset))
+
+    value = tf_value_fn()
+
+    if isinstance(value, dataset_ops.Dataset):
+      self.assertDatasetsEqual(value, dataset.flat_map(lambda x: x))
+    elif isinstance(value, optional_ops.Optional):
+      self.assertDatasetProduces(
+          round_trip_dataset.map(lambda opt: opt.get_value()),
+          [self.evaluate(value.get_value())],
+          requires_initialization=True)
+    else:
+      self.assertDatasetProduces(
+          round_trip_dataset, [self.evaluate(tf_value_fn())],
+          requires_initialization=True)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
deleted file mode 100644
index 6b7afafa5d8d0a1d3d79f1ff90633f697a37045b..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class FilterDatasetTest(test_base.DatasetTestBase):
-
-  def testFilterDataset(self):
-    components = (
-        np.arange(7, dtype=np.int64),
-        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
-            7, dtype=np.int64)[:, np.newaxis],
-        np.array(37.0, dtype=np.float64) * np.arange(7)
-    )
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    modulus = array_ops.placeholder(dtypes.int64)
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count)
-        .filter(lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      # Test that we can dynamically feed a different modulus value for each
-      # iterator.
-      def do_test(count_val, modulus_val):
-        sess.run(init_op, feed_dict={count: count_val, modulus: modulus_val})
-        for _ in range(count_val):
-          for i in [x for x in range(7) if x**2 % modulus_val == 0]:
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-      do_test(14, 2)
-      do_test(4, 18)
-
-      # Test an empty dataset.
-      do_test(0, 1)
-
-  def testFilterRange(self):
-    dataset = dataset_ops.Dataset.range(100).filter(
-        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(get_next))
-      self.assertEqual(1, sess.run(get_next))
-      self.assertEqual(3, sess.run(get_next))
-
-  def testFilterDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .filter(lambda d: math_ops.equal(d["bar"] % 2, 0))
-                .map(lambda d: d["foo"] + d["bar"])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        if (i ** 2) % 2 == 0:
-          self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testUseStepContainerInFilter(self):
-    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
-
-    # Define a predicate that returns true for the first element of
-    # the sequence and not the second, and uses `tf.map_fn()`.
-    def _predicate(xs):
-      squared_xs = functional_ops.map_fn(lambda x: x * x, xs)
-      summed = math_ops.reduce_sum(squared_xs)
-      return math_ops.equal(summed, 1 + 4 + 9)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices([[1, 2, 3], [4, 5, 6]])
-        .filter(_predicate)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(input_data[0], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1])), i
-
-    def _filter_fn(_, i):
-      return math_ops.equal(i % 2, 0)
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
-            lambda x, i: x).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(5):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _map_fn(i * 2)[0])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testReturnComponent(self):
-    iterator = (
-        dataset_ops.Dataset.zip(
-            (dataset_ops.Dataset.range(10),
-             dataset_ops.Dataset.from_tensors(True).repeat(None)))
-        .filter(lambda x, y: y).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, True), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testParallelFilters(self):
-    dataset = dataset_ops.Dataset.range(10).filter(
-        lambda x: math_ops.equal(x % 2, 0))
-    iterators = [dataset.make_one_shot_iterator() for _ in range(10)]
-    next_elements = [iterator.get_next() for iterator in iterators]
-    with self.cached_session() as sess:
-      self.assertEqual([0 for _ in range(10)], sess.run(next_elements))
-
-
-class FilterDatasetBenchmark(test.Benchmark):
-
-  def _benchmark(self, predicate, name):
-    with ops.Graph().as_default():
-      dataset = (
-          dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(5):
-          sess.run(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            sess.run(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        print("Filter dataset using %s. Median wall time: %f" %
-              (name, median_wall_time))
-        self.report_benchmark(
-            iters=100,
-            wall_time=median_wall_time,
-            name="benchmark_filter_dataset_%s" % name)
-
-  def benchmarkSimpleFunction(self):
-    self._benchmark(array_ops.identity, "simple_function")
-
-  def benchmarkReturnComponentOptimization(self):
-    self._benchmark(lambda x: x, "return_component")
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_test.py b/tensorflow/python/data/kernel_tests/filter_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..afaf954cbc6a96984239cb22665bbe1f17d6d40d
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/filter_test.py
@@ -0,0 +1,128 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.filter()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FilterTest(test_base.DatasetTestBase):
+
+  def testFilterDataset(self):
+    components = (
+        np.arange(7, dtype=np.int64),
+        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
+            7, dtype=np.int64)[:, np.newaxis],
+        np.array(37.0, dtype=np.float64) * np.arange(7)
+    )
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    def do_test(count, modulus):
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+          _map_fn).repeat(count).filter(
+              lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
+      self.assertEqual([c.shape[1:] for c in components],
+                       [shape for shape in dataset.output_shapes])
+      get_next = self.getNext(dataset)
+      for _ in range(count):
+        for i in [x for x in range(7) if x**2 % modulus == 0]:
+          result = self.evaluate(get_next())
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+    do_test(14, 2)
+    do_test(4, 18)
+
+    # Test an empty dataset.
+    do_test(0, 1)
+
+  def testFilterRange(self):
+    dataset = dataset_ops.Dataset.range(4).filter(
+        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
+    self.assertDatasetProduces(dataset, expected_output=[0, 1, 3])
+
+  def testFilterDict(self):
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x ** 2}).filter(
+            lambda d: math_ops.equal(d["bar"] % 2, 0)).map(
+                lambda d: d["foo"] + d["bar"])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[(i * 2 + i**2) for i in range(10) if not (i**2) % 2])
+
+  def testUseStepContainerInFilter(self):
+    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
+
+    # Define a predicate that returns true for the first element of
+    # the sequence and not the second, and uses `tf.map_fn()`.
+    def _predicate(xs):
+      squared_xs = functional_ops.map_fn(lambda x: x * x, xs)
+      summed = math_ops.reduce_sum(squared_xs)
+      return math_ops.equal(summed, 1 + 4 + 9)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        [[1, 2, 3], [4, 5, 6]]).filter(_predicate)
+    self.assertDatasetProduces(dataset, expected_output=[input_data[0]])
+
+  def testSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1])), i
+
+    def _filter_fn(_, i):
+      return math_ops.equal(i % 2, 0)
+
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
+        lambda x, i: x)
+    self.assertDatasetProduces(
+        dataset, expected_output=[_map_fn(i * 2)[0] for i in range(5)])
+
+  def testShortCircuit(self):
+    dataset = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(10),
+         dataset_ops.Dataset.from_tensors(True).repeat(None)
+        )).filter(lambda x, y: y)
+    self.assertDatasetProduces(
+        dataset, expected_output=[(i, True) for i in range(10)])
+
+  def testParallelFilters(self):
+    dataset = dataset_ops.Dataset.range(10).filter(
+        lambda x: math_ops.equal(x % 2, 0))
+    next_elements = [self.getNext(dataset) for _ in range(10)]
+    self.assertEqual([0 for _ in range(10)],
+                     self.evaluate(
+                         [next_element() for next_element in next_elements]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/fixed_length_record_dataset_test.py b/tensorflow/python/data/kernel_tests/fixed_length_record_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9503e57ca7c1b3e1823b30c80e7785a25b133a24
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/fixed_length_record_dataset_test.py
@@ -0,0 +1,171 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.FixedLengthRecordDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import zlib
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FixedLengthRecordDatasetTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    super(FixedLengthRecordDatasetTest, self).setUp()
+    self._num_files = 2
+    self._num_records = 7
+    self._header_bytes = 5
+    self._record_bytes = 3
+    self._footer_bytes = 2
+
+  def _record(self, f, r):
+    return compat.as_bytes(str(f * 2 + r) * self._record_bytes)
+
+  def _createFiles(self, compression_type=None):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
+      filenames.append(fn)
+
+      contents = []
+      contents.append(b"H" * self._header_bytes)
+      for j in range(self._num_records):
+        contents.append(self._record(i, j))
+      contents.append(b"F" * self._footer_bytes)
+      contents = b"".join(contents)
+
+      if not compression_type:
+        with open(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "GZIP":
+        with gzip.GzipFile(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "ZLIB":
+        contents = zlib.compress(contents)
+        with open(fn, "wb") as f:
+          f.write(contents)
+      else:
+        raise ValueError("Unsupported compression_type", compression_type)
+
+    return filenames
+
+  def _testFixedLengthRecordDataset(self, compression_type=None):
+    test_filenames = self._createFiles(compression_type=compression_type)
+
+    def dataset_fn(filenames, num_epochs, batch_size=None):
+      repeat_dataset = readers.FixedLengthRecordDataset(
+          filenames,
+          self._record_bytes,
+          self._header_bytes,
+          self._footer_bytes,
+          compression_type=compression_type).repeat(num_epochs)
+      if batch_size:
+        return repeat_dataset.batch(batch_size)
+      return repeat_dataset
+
+    # Basic test: read from file 0.
+    self.assertDatasetProduces(
+        dataset_fn([test_filenames[0]], 1),
+        expected_output=[
+            self._record(0, i) for i in range(self._num_records)
+        ])
+
+    # Basic test: read from file 1.
+    self.assertDatasetProduces(
+        dataset_fn([test_filenames[1]], 1),
+        expected_output=[
+            self._record(1, i) for i in range(self._num_records)
+        ])
+
+    # Basic test: read from both files.
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(
+        dataset_fn(test_filenames, 1), expected_output=expected_output)
+
+    # Test repeated iteration through both files.
+    get_next = self.getNext(dataset_fn(test_filenames, 10))
+    for _ in range(10):
+      for j in range(self._num_files):
+        for i in range(self._num_records):
+          self.assertEqual(self._record(j, i), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    # Test batched and repeated iteration through both files.
+    get_next = self.getNext(dataset_fn(test_filenames, 10, self._num_records))
+    for _ in range(10):
+      for j in range(self._num_files):
+        self.assertAllEqual(
+            [self._record(j, i) for i in range(self._num_records)],
+            self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testFixedLengthRecordDatasetNoCompression(self):
+    self._testFixedLengthRecordDataset()
+
+  def testFixedLengthRecordDatasetGzipCompression(self):
+    self._testFixedLengthRecordDataset(compression_type="GZIP")
+
+  def testFixedLengthRecordDatasetZlibCompression(self):
+    self._testFixedLengthRecordDataset(compression_type="ZLIB")
+
+  def testFixedLengthRecordDatasetBuffering(self):
+    test_filenames = self._createFiles()
+    dataset = readers.FixedLengthRecordDataset(
+        test_filenames,
+        self._record_bytes,
+        self._header_bytes,
+        self._footer_bytes,
+        buffer_size=10)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testFixedLengthRecordDatasetWrongSize(self):
+    test_filenames = self._createFiles()
+    dataset = readers.FixedLengthRecordDataset(
+        test_filenames,
+        self._record_bytes + 1,  # Incorrect record length.
+        self._header_bytes,
+        self._footer_bytes,
+        buffer_size=10)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            r"Excluding the header \(5 bytes\) and footer \(2 bytes\), input "
+            r"file \".*fixed_length_record.0.txt\" has body length 21 bytes, "
+            r"which is not an exact multiple of the record length \(4 bytes\).")
+        )
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
deleted file mode 100644
index 68038f9cfc09efcc08c5fa2d8d8af93a4a3c50db..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
-
-
-class FlatMapDatasetTest(test_base.DatasetTestBase):
-
-  # pylint: disable=g-long-lambda
-  def testFlatMapDataset(self):
-    repeats = [1, 2, 3, 4, 5, 0, 1]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensors([x]).repeat(x))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in repeats:
-        for _ in range(i):
-          self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedFlatMapDataset(self):
-    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for row in repeats:
-        for i in row:
-          for _ in range(i):
-            self.assertEqual(i, sess.run(get_next))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSharedResourceNestedFlatMapDataset(self):
-    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator(
-                                shared_name="shared_flat_map_iterator"))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    # Create two concurrent sessions that share the same iterator
-    # resource on the same server, and verify that a random
-    # interleaving of `Session.run(get_next)` calls on the two
-    # sessions yields the expected result.
-    server = server_lib.Server.create_local_server()
-    with session.Session(server.target) as sess1:
-      with session.Session(server.target) as sess2:
-        for _ in range(3):
-          sess = random.choice([sess1, sess2])
-          sess.run(init_op)
-          for row in repeats:
-            for i in row:
-              for _ in range(i):
-                sess = random.choice([sess1, sess2])
-                self.assertEqual(i, sess.run(get_next))
-
-        with self.assertRaises(errors.OutOfRangeError):
-          sess = random.choice([sess1, sess2])
-          sess.run(get_next)
-
-  def testMapDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .flat_map(lambda d: dataset_ops.Dataset.from_tensors(d["foo"])
-                          .repeat(d["bar"]))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for _ in range(i ** 2):
-          self.assertEqual(i * 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-  # pylint: enable=g-long-lambda
-
-  def testSparse(self):
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
-
-    def _flat_map_fn(x):
-      return dataset_ops.Dataset.from_tensor_slices(
-          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/flat_map_test.py b/tensorflow/python/data/kernel_tests/flat_map_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff52821b10740196286c30d19b0cda3b4b44bae5
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/flat_map_test.py
@@ -0,0 +1,126 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.flat_map()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FlatMapTest(test_base.DatasetTestBase):
+
+  # pylint: disable=g-long-lambda
+  def testFlatMapDataset(self):
+    repeats = [1, 2, 3, 4, 5, 0, 1]
+    components = np.array(repeats, dtype=np.int64)
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).flat_map(
+        lambda x: dataset_ops.Dataset.from_tensors([x]).repeat(x))
+    expected_output = []
+    for i in repeats:
+      expected_output.extend([[i]] * i)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testNestedFlatMapDataset(self):
+    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
+    components = np.array(repeats, dtype=np.int64)
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).flat_map(
+        lambda x: dataset_ops.Dataset.from_tensor_slices(x).flat_map(
+            lambda y: dataset_ops.Dataset.from_tensors(y).repeat(y))
+    )
+    expected_output = []
+    for row in repeats:
+      for i in row:
+        expected_output.extend([i] * i)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  # Note: no eager mode coverage, session specific test.
+  @test_util.run_deprecated_v1
+  def testSkipEagerSharedResourceNestedFlatMapDataset(self):
+    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
+    components = np.array(repeats, dtype=np.int64)
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
+                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
+                            .repeat(y))).make_initializable_iterator(
+                                shared_name="shared_flat_map_iterator"))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    # Create two concurrent sessions that share the same iterator
+    # resource on the same server, and verify that a random
+    # interleaving of `Session.run(get_next)` calls on the two
+    # sessions yields the expected result.
+    server = server_lib.Server.create_local_server()
+    with session.Session(server.target) as sess1:
+      with session.Session(server.target) as sess2:
+        for _ in range(3):
+          sess = random.choice([sess1, sess2])
+          sess.run(init_op)
+          for row in repeats:
+            for i in row:
+              for _ in range(i):
+                sess = random.choice([sess1, sess2])
+                self.assertEqual(i, sess.run(get_next))
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess = random.choice([sess1, sess2])
+          sess.run(get_next)
+
+  def testMapDict(self):
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x ** 2}).flat_map(
+            lambda d: dataset_ops.Dataset.from_tensors(
+                d["foo"]).repeat(d["bar"]))
+    get_next = self.getNext(dataset)
+    for i in range(10):
+      for _ in range(i**2):
+        self.assertEqual(i * 2, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testSparse(self):
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _flat_map_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
+    expected_output = []
+    for i in range(10):
+      for j in range(2):
+        expected_output.append([i, 0] if j % 2 == 0 else [0, -i])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/from_generator_test.py b/tensorflow/python/data/kernel_tests/from_generator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6625534e7a1a0efc5e39dc53ef57666f601c05b
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/from_generator_test.py
@@ -0,0 +1,491 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.data.Dataset.from_generator()."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import script_ops
+from tensorflow.python.platform import test
+
+
+class FromGeneratorTest(test_base.DatasetTestBase):
+
+  def _testFromGenerator(self, generator, elem_sequence, num_repeats,
+                         output_types=None):
+    if output_types is None:
+      output_types = dtypes.int64
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(generator, output_types=output_types)
+        .repeat(num_repeats)
+        .prefetch(5))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for _ in range(2):  # Run twice to test reinitialization.
+        sess.run(init_op)
+        for _ in range(num_repeats):
+          for elem in elem_sequence:
+            self.assertAllEqual(elem, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def _testFromGeneratorOneShot(self, generator, elem_sequence, num_repeats):
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
+        .repeat(num_repeats)
+        .prefetch(5))
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      for _ in range(num_repeats):
+        for elem in elem_sequence:
+          self.assertAllEqual(elem, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorUsingFunction(self):
+    def generator():
+      for i in range(1, 100):
+        yield [i] * i
+    elem_sequence = list(generator())
+    self._testFromGenerator(generator, elem_sequence, 1)
+    self._testFromGenerator(generator, elem_sequence, 5)
+    self._testFromGeneratorOneShot(generator, elem_sequence, 1)
+    self._testFromGeneratorOneShot(generator, elem_sequence, 5)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorUsingList(self):
+    generator = lambda: [[i] * i for i in range(1, 100)]
+    elem_sequence = list(generator())
+    self._testFromGenerator(generator, elem_sequence, 1)
+    self._testFromGenerator(generator, elem_sequence, 5)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorUsingNdarray(self):
+    generator = lambda: np.arange(100, dtype=np.int64)
+    elem_sequence = list(generator())
+    self._testFromGenerator(generator, elem_sequence, 1, output_types=np.int64)
+    self._testFromGenerator(generator, elem_sequence, 5, output_types=np.int64)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorUsingGeneratorExpression(self):
+    # NOTE(mrry): Generator *expressions* are not repeatable (or in
+    # general reusable), because they eagerly evaluate the `for`
+    # expression as `iter(range(1, 100))` and discard the means of
+    # reconstructing `range(1, 100)`. Wrapping the generator
+    # expression in a `lambda` makes it repeatable.
+    generator = lambda: ([i] * i for i in range(1, 100))
+    elem_sequence = list(generator())
+    self._testFromGenerator(generator, elem_sequence, 1)
+    self._testFromGenerator(generator, elem_sequence, 5)
+
+  @test_util.run_deprecated_v1
+  def testFromMultipleConcurrentGenerators(self):
+    num_inner_repeats = 5
+    num_outer_repeats = 100
+
+    def generator():
+      for i in range(1, 10):
+        yield ([i] * i, [i, i ** 2, i ** 3])
+    input_list = list(generator())
+
+    # The interleave transformation is essentially a flat map that
+    # draws from multiple input datasets concurrently (in a cyclic
+    # fashion). By placing `Datsaet.from_generator()` inside an
+    # interleave, we test its behavior when multiple iterators are
+    # active at the same time; by additionally prefetching inside the
+    # interleave, we create the possibility of parallel (modulo GIL)
+    # invocations to several iterators created by the same dataset.
+    def interleave_fn(_):
+      return (dataset_ops.Dataset.from_generator(
+          generator, output_types=(dtypes.int64, dtypes.int64),
+          output_shapes=([None], [3]))
+              .repeat(num_inner_repeats).prefetch(5))
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(num_outer_repeats)
+        .interleave(interleave_fn, cycle_length=10,
+                    block_length=len(input_list)))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for _ in range(num_inner_repeats * num_outer_repeats):
+        for elem in input_list:
+          val0, val1 = sess.run(get_next)
+          self.assertAllEqual(elem[0], val0)
+          self.assertAllEqual(elem[1], val1)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  # TODO(b/67868766): Reenable this when the source of flakiness is discovered.
+  def _testFromGeneratorsRunningInParallel(self):
+    num_parallel_iterators = 3
+
+    # Define shared state that multiple iterator instances will access to
+    # demonstrate their concurrent activity.
+    lock = threading.Lock()
+    condition = threading.Condition(lock)
+    next_ticket = [0]  # GUARDED_BY(lock)
+
+    def generator():
+      # NOTE(mrry): We yield one element before the barrier, because
+      # the current implementation of `Dataset.interleave()` must
+      # fetch one element from each incoming dataset to start the
+      # prefetching.
+      yield 0
+
+      # Define a barrier that `num_parallel_iterators` iterators must enter
+      # before any can proceed. Demonstrates that multiple iterators may be
+      # active at the same time.
+      condition.acquire()
+      ticket = next_ticket[0]
+      next_ticket[0] += 1
+      if ticket == num_parallel_iterators - 1:
+        # The last iterator to join the barrier notifies the others.
+        condition.notify_all()
+      else:
+        # Wait until the last iterator enters the barrier.
+        while next_ticket[0] < num_parallel_iterators:
+          condition.wait()
+      condition.release()
+
+      yield 1
+
+    # As in `testFromMultipleConcurrentGenerators()`, we use a combination of
+    # `Dataset.interleave()` and `Dataset.prefetch()` to cause multiple
+    # iterators to be active concurrently.
+    def interleave_fn(_):
+      return dataset_ops.Dataset.from_generator(
+          generator, output_types=dtypes.int64, output_shapes=[]).prefetch(2)
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(num_parallel_iterators)
+        .interleave(
+            interleave_fn, cycle_length=num_parallel_iterators, block_length=1))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for elem in [0, 1]:
+        for _ in range(num_parallel_iterators):
+          self.assertAllEqual(elem, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorImplicitConversion(self):
+    def generator():
+      yield [1]
+      yield [2]
+      yield [3]
+
+    for dtype in [dtypes.int8, dtypes.int32, dtypes.int64]:
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.from_generator(
+              generator, output_types=dtype, output_shapes=[1]))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+
+      self.assertEqual(dtype, get_next.dtype)
+
+      with self.cached_session() as sess:
+        sess.run(init_op)
+        for expected in [[1], [2], [3]]:
+          next_val = sess.run(get_next)
+          self.assertEqual(dtype.as_numpy_dtype, next_val.dtype)
+          self.assertAllEqual(expected, next_val)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorString(self):
+    def generator():
+      yield "foo"
+      yield b"bar"
+      yield u"baz"
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=dtypes.string, output_shapes=[]))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for expected in [b"foo", b"bar", b"baz"]:
+        next_val = sess.run(get_next)
+        self.assertAllEqual(expected, next_val)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorTypeError(self):
+    def generator():
+      yield np.array([1, 2, 3], dtype=np.int64)
+      yield np.array([4, 5, 6], dtype=np.int64)
+      yield "ERROR"
+      yield np.array([7, 8, 9], dtype=np.int64)
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=dtypes.int64, output_shapes=[3]))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual([1, 2, 3], sess.run(get_next))
+      self.assertAllEqual([4, 5, 6], sess.run(get_next))
+      with self.assertRaisesOpError("The expected type was int64"):
+        sess.run(get_next)
+      self.assertAllEqual([7, 8, 9], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorShapeError(self):
+    def generator():
+      yield np.array([1, 2, 3], dtype=np.int64)
+      yield np.array([4, 5, 6], dtype=np.int64)
+      yield np.array([7, 8, 9, 10], dtype=np.int64)
+      yield np.array([11, 12, 13], dtype=np.int64)
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=dtypes.int64, output_shapes=[3]))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual([1, 2, 3], sess.run(get_next))
+      self.assertAllEqual([4, 5, 6], sess.run(get_next))
+      with self.assertRaisesOpError(r"element of shape \(3,\) was expected"):
+        sess.run(get_next)
+      self.assertAllEqual([11, 12, 13], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorStructureError(self):
+    def generator():
+      yield 1, 2
+      yield 3, 4
+      yield 5
+      yield 6, 7, 8
+      yield 9, 10
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=(dtypes.int64, dtypes.int64)))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      self.assertEqual((1, 2), sess.run(get_next))
+      self.assertEqual((3, 4), sess.run(get_next))
+      with self.assertRaisesOpError(
+          r"The expected structure was \(tf\.int64, tf\.int64\)"):
+        sess.run(get_next)
+      with self.assertRaisesOpError(
+          r"The expected structure was \(tf\.int64, tf\.int64\)"):
+        sess.run(get_next)
+      self.assertEqual((9, 10), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorHeterogeneous(self):
+    def generator():
+      yield 1
+      yield [2, 3]
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=dtypes.int64))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual(1, sess.run(get_next))
+      self.assertAllEqual([2, 3], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorStopShort(self):
+
+    def generator():
+      yield 0
+      yield 1
+      yield 2
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=dtypes.int64))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual(0, sess.run(get_next))
+      self.assertAllEqual(1, sess.run(get_next))
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorDestructorCalled(self):
+    # Use an `Event` to signal that the generator has been deleted.
+    event = threading.Event()
+
+    class GeneratorWrapper(object):
+
+      def __iter__(self):
+        return self
+
+      def next(self):
+        return self.__next__()
+
+      def __next__(self):
+        return 42
+
+      def __del__(self):
+        event.set()
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            GeneratorWrapper, output_types=dtypes.int64).take(2))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual(42, sess.run(get_next))
+      self.assertAllEqual(42, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      # Test that `GeneratorWrapper` object is destroyed when the
+      # iterator terminates (and the generator iterator is deleted).
+      self.assertTrue(event.is_set())
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorWithArgs(self):
+
+    def flat_map_fn(elem):
+
+      def generator_with_arg(n):
+        for _ in range(n):
+          yield np.array(n, dtype=np.int64)
+
+      return dataset_ops.Dataset.from_generator(
+          generator_with_arg, output_types=dtypes.int64, output_shapes=(),
+          args=(elem,))
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(5).flat_map(flat_map_fn))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      expected = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
+      for x in expected:
+        self.assertEqual(x, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testFromGeneratorWithTwoArgs(self):
+
+    def flat_map_fn(elem, message):
+
+      def generator_with_arg(n, msg):
+        for i in range(n):
+          yield i, msg
+
+      return dataset_ops.Dataset.from_generator(
+          generator_with_arg, output_types=(dtypes.int64, dtypes.string),
+          output_shapes=((), ()), args=(elem, message))
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.zip(
+            (dataset_ops.Dataset.range(5),
+             dataset_ops.Dataset.from_tensors("Hi!").repeat(None)))
+        .flat_map(flat_map_fn))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      expected = [(0, b"Hi!"),
+                  (0, b"Hi!"), (1, b"Hi!"),
+                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"),
+                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"), (3, b"Hi!")]
+      for x in expected:
+        self.assertEqual(x, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testGeneratorDatasetFinalizeFunctionCalled(self):
+    # NOTE(mrry): This test tests the internal `_GeneratorDataset`,
+    # which affords more control over what the finalize function can do than
+    # the `Dataset.from_generator()` wrapper.
+
+    # Use an `Event` to signal that the generator has been deleted.
+    event = threading.Event()
+
+    def finalize_fn(_):
+      def finalize_py_func():
+        event.set()
+        return 0
+      return script_ops.py_func(finalize_py_func, [], [dtypes.int64],
+                                stateful=True)
+
+    dummy = constant_op.constant(37)
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops._GeneratorDataset(
+            dummy, lambda x: x, lambda x: x, finalize_fn).take(2))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual(37, sess.run(get_next))
+      self.assertAllEqual(37, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+        self.assertTrue(event.is_set())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef608ebb67007c7605e7bea36058d0cd5c5d146f
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
@@ -0,0 +1,86 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.from_sparse_tensor_slices()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FromSparseTensorSlicesTest(test_base.DatasetTestBase):
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerFromSparseTensorSlices(self):
+    """Test a dataset based on slices of a `tf.SparseTensor`."""
+    st = array_ops.sparse_placeholder(dtypes.float64)
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_sparse_tensor_slices(st))
+    init_op = iterator.initializer
+    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+
+    with self.cached_session() as sess:
+      slices = [[1., 2., 3.], [1.], [1.], [1., 2.], [], [1., 2.], [], [], []]
+
+      # Test with sparse tensor in the appropriate order.
+      indices = np.array(
+          [[i, j] for i in range(len(slices)) for j in range(len(slices[i]))])
+      values = np.array([val for s in slices for val in s])
+      dense_shape = np.array([len(slices), max(len(s) for s in slices) + 1])
+      sparse_feed = sparse_tensor.SparseTensorValue(indices, values,
+                                                    dense_shape)
+      sess.run(init_op, feed_dict={st: sparse_feed})
+      for i, s in enumerate(slices):
+        results = sess.run(get_next)
+        self.assertAllEqual(s, results.values)
+        expected_indices = np.array(
+            [[j] for j in range(len(slices[i]))]).reshape([-1, 1])
+        self.assertAllEqual(expected_indices, results.indices)
+        self.assertAllEqual(dense_shape[1:], results.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test with sparse tensor in the reverse order, which is not
+      # currently supported.
+      reverse_order_indices = indices[::-1, :]
+      reverse_order_values = values[::-1]
+      sparse_feed = sparse_tensor.SparseTensorValue(
+          reverse_order_indices, reverse_order_values, dense_shape)
+      with self.assertRaises(errors.UnimplementedError):
+        sess.run(init_op, feed_dict={st: sparse_feed})
+
+      # Test with an empty sparse tensor.
+      empty_indices = np.empty((0, 4), dtype=np.int64)
+      empty_values = np.empty((0,), dtype=np.float64)
+      empty_dense_shape = [0, 4, 37, 9]
+      sparse_feed = sparse_tensor.SparseTensorValue(empty_indices, empty_values,
+                                                    empty_dense_shape)
+      sess.run(init_op, feed_dict={st: sparse_feed})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a480e56789aee9198fc88201f0eecb2c2eaab52
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
@@ -0,0 +1,177 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.from_tensor_slices()."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FromTensorSlicesTest(test_base.DatasetTestBase):
+
+  def testFromTensorSlices(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
+            np.array([[12], [13], [14], [15]]), 22),
+        np.array([37.0, 38.0, 39.0, 40.0])
+    )
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components)
+    get_next = self.getNext(dataset)
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [shape for shape in dataset.output_shapes])
+
+    for i in range(4):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(components, results):
+        self.assertAllEqual(component[i], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      results = self.evaluate(get_next())
+
+  def testSkipEagerFromTensorSlicesSparse(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0, 0], [1, 0], [2, 0]]),
+        values=np.array([0, 0, 0]),
+        dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components)
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
+        [shape for shape in dataset.output_shapes])
+
+    expected = [
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[0]]),
+             values=np.array([1]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[1]]),
+             values=np.array([2]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[2]]),
+             values=np.array([3]),
+             dense_shape=np.array([3]))),
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected)
+
+  def testFromTensorSlicesMixed(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (np.tile(np.array([[1], [2], [3]]), 20),
+                  np.tile(np.array([[12], [13], [14]]), 22),
+                  np.array([37.0, 38.0, 39.0]),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
+                      values=np.array([0, 0, 0]),
+                      dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components)
+    get_next = self.getNext(dataset)
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape[1:])
+        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
+    ], [shape for shape in dataset.output_shapes])
+
+    expected = [
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[0]]),
+             values=np.array([1]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[1]]),
+             values=np.array([2]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[2]]),
+             values=np.array([3]),
+             dense_shape=np.array([3]))),
+    ]
+    for i in range(3):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(
+          (list(zip(*components[:3]))[i] + expected[i]), results):
+        if sparse_tensor.is_sparse(component):
+          self.assertSparseValuesEqual(component, result_component)
+        else:
+          self.assertAllEqual(component, result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testFromTensorSlicesWithDict(self):
+    components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
+    dataset = dataset_ops.Dataset.from_tensor_slices(components)
+    get_next = self.getNext(dataset)
+
+    self.assertEqual(dtypes.int32, dataset.output_types["foo"])
+    self.assertEqual(dtypes.float32, dataset.output_types["bar"])
+    self.assertEqual((), dataset.output_shapes["foo"])
+    self.assertEqual((1,), dataset.output_shapes["bar"])
+
+    for i in range(3):
+      results = self.evaluate(get_next())
+      self.assertEqual(components["foo"][i], results["foo"])
+      self.assertEqual(components["bar"][i], results["bar"])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/from_tensors_test.py b/tensorflow/python/data/kernel_tests/from_tensors_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab3c15263fdaa0829686f90450e0e79081299a2e
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/from_tensors_test.py
@@ -0,0 +1,259 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.from_tensors()."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FromTensorsTest(test_base.DatasetTestBase):
+
+  def testFromTensors(self):
+    """Test a dataset that represents a single tuple of tensors."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+
+    self.assertEqual([c.shape for c in components],
+                     nest.flatten(dataset.output_shapes))
+
+    self.assertDatasetProduces(dataset, expected_output=[components])
+
+  def testSkipEagerFromTensorsSparse(self):
+    """Test a dataset that represents a single tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0]]),
+        values=np.array([0]),
+        dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape) for c in components],
+        [shape for shape in dataset.output_shapes])
+    self.assertDatasetProduces(dataset, expected_output=[components])
+
+  def testFromTensorsMixed(self):
+    """Test an dataset that represents a single tuple of tensors."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0]]),
+                      values=np.array([0]),
+                      dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape)
+        if sparse_tensor.is_sparse(c) else c.shape for c in components
+    ], [shape for shape in dataset.output_shapes])
+
+    self.assertDatasetProduces(dataset, expected_output=[components])
+
+  # pylint: disable=g-long-lambda,unnecessary-lambda
+  def testNestedStructure(self):
+    components = (np.array([1, 2, 3], dtype=np.int64),
+                  (np.array([4., 5.]), np.array([6., 7.])),
+                  np.array([8, 9, 10], dtype=np.int64))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.shuffle(10, 10)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.repeat(-1)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.filter(lambda x, y, z: True)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.take(5)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
+    self.assertEquals(((dtypes.int64, dtypes.int64),
+                       (dtypes.float64, dtypes.float64)), dataset.output_types)
+    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
+
+    dataset = dataset.flat_map(
+        lambda x, y: dataset_ops.Dataset.from_tensors(((x[0], x[1]),
+                                                       (y[0], y[1])))
+    )
+    self.assertEquals(((dtypes.int64, dtypes.int64),
+                       (dtypes.float64, dtypes.float64)), dataset.output_types)
+    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
+
+    dataset = dataset.batch(32)
+    self.assertEquals(((dtypes.int64, dtypes.int64),
+                       (dtypes.float64, dtypes.float64)), dataset.output_types)
+    self.assertEquals((([None, 3], [None, 3]), ([None, 2], [None, 2])),
+                      nest.pack_sequence_as(dataset.output_shapes, [
+                          s.as_list()
+                          for s in nest.flatten(dataset.output_shapes)
+                      ]))
+
+    # Define a separate set of components with matching leading
+    # dimension for the from-slices constructor.
+    components_for_slices = (np.array([1, 2, 3], dtype=np.int64),
+                             (np.array([4., 5., 6.]), np.array([7., 8., 9.])),
+                             np.array([10, 11, 12], dtype=np.int64))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components_for_slices)
+    self.assertEquals((dtypes.int64,
+                       (dtypes.float64, dtypes.float64), dtypes.int64),
+                      dataset.output_types)
+    self.assertEquals(([], ([], []), []), dataset.output_shapes)
+
+  # TODO(b/117581999): more specific shapes in eager mode.
+  @test_util.run_deprecated_v1
+  def testSkipEagerNestedStructure(self):
+    components = (np.array([1, 2, 3], dtype=np.int64), (np.array([4., 5.]),
+                                                        np.array([6., 7.])),
+                  np.array([8, 9, 10], dtype=np.int64))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
+
+    dataset = dataset.flat_map(
+        lambda x, y: dataset_ops.Dataset.from_tensors(
+            ((x[0], x[1]), (y[0], y[1])))).batch(32)
+
+    get_next = self.getNext(dataset)
+    (w, x), (y, z) = get_next()
+    self.assertEquals(dtypes.int64, w.dtype)
+    self.assertEquals(dtypes.int64, x.dtype)
+    self.assertEquals(dtypes.float64, y.dtype)
+    self.assertEquals(dtypes.float64, z.dtype)
+    self.assertEquals([None, 3], w.shape.as_list())
+    self.assertEquals([None, 3], x.shape.as_list())
+    self.assertEquals([None, 2], y.shape.as_list())
+    self.assertEquals([None, 2], z.shape.as_list())
+
+    get_next = self.getNext(dataset)
+    (w, x), (y, z) = get_next()
+    self.assertEquals(dtypes.int64, w.dtype)
+    self.assertEquals(dtypes.int64, x.dtype)
+    self.assertEquals(dtypes.float64, y.dtype)
+    self.assertEquals(dtypes.float64, z.dtype)
+    self.assertEquals([None, 3], w.shape.as_list())
+    self.assertEquals([None, 3], x.shape.as_list())
+    self.assertEquals([None, 2], y.shape.as_list())
+    self.assertEquals([None, 2], z.shape.as_list())
+
+  def testNestedDict(self):
+    components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEquals(dtypes.int32, dataset.output_types["a"]["aa"])
+    self.assertEquals(dtypes.float32, dataset.output_types["a"]["ab"])
+    self.assertEquals(dtypes.int32, dataset.output_types["b"])
+    self.assertEquals([], dataset.output_shapes["a"]["aa"])
+    self.assertEquals([2], dataset.output_shapes["a"]["ab"])
+    self.assertEquals([3], dataset.output_shapes["b"])
+
+  def testNonSequenceNestedStructure(self):
+    components = np.array([1, 2, 3], dtype=np.int64)
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([3], dataset.output_shapes)
+
+    dataset = dataset.filter(
+        lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([3], dataset.output_shapes)
+
+    dataset = dataset.map(lambda x: array_ops.stack([x, x]))
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([2, 3], dataset.output_shapes)
+
+    dataset = dataset.flat_map(
+        lambda x: dataset_ops.Dataset.from_tensor_slices(x))
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([3], dataset.output_shapes)
+
+    get_next = self.getNext(dataset)
+    self.assertEquals(dtypes.int64, get_next().dtype)
+    self.assertEquals([3], get_next().shape)
+
+  def testSkipEagerSplitPipelineFailsWithPlacementError(self):
+    with session.Session(
+        target="",
+        config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
+
+      dataset = dataset_ops.Dataset.from_tensors(0)
+
+      # Define a pipeline that attempts to use variables on two
+      # different devices.
+      #
+      # Initialize the variables before creating to iterator, to avoid the
+      # placement algorithm overriding the DT_RESOURCE colocation constraints.
+      with ops.device("/cpu:0"):
+        var_0 = resource_variable_ops.ResourceVariable(initial_value=0)
+        dataset = dataset.map(lambda x: x + var_0.read_value())
+      sess.run(var_0.initializer)
+
+      with ops.device("/cpu:1"):
+        var_1 = resource_variable_ops.ResourceVariable(initial_value=0)
+        dataset = dataset.map(lambda x: x + var_1.read_value())
+      sess.run(var_1.initializer)
+
+      iterator = dataset_ops.make_initializable_iterator(dataset)
+      sess.run(iterator.initializer)
+
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          "Error while reading resource variable Variable"):
+        sess.run(iterator.get_next())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/inputs_test.py b/tensorflow/python/data/kernel_tests/inputs_test.py
deleted file mode 100644
index d089b49bcc6f80b734ad5e7cb96dfea321504e6f..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/inputs_test.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.platform import test
-
-
-class InputsTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @staticmethod
-  def make_apply_fn(dataset):
-
-    def apply_fn(dataset):
-
-      def _apply_fn(dataset):
-        return dataset.cache()
-
-      return dataset.apply(_apply_fn)
-
-    return apply_fn
-
-  @staticmethod
-  def make_gen():
-
-    def gen():
-      yield 42
-
-    return gen
-
-  @staticmethod
-  def make_interleave_fn(dataset, num_parallel_calls=None):
-
-    def interleave_fn(dataset):
-      return dataset.interleave(
-          lambda x: dataset_ops.Dataset.range(0),
-          cycle_length=2,
-          num_parallel_calls=num_parallel_calls)
-
-    return interleave_fn
-
-  @parameterized.named_parameters(
-      ("FixedLengthRecord", readers.FixedLengthRecordDataset("", 42)),
-      ("FromGenerator",
-       dataset_ops.Dataset.from_generator(make_gen.__func__(), dtypes.int32),
-       1),
-      ("FromSparseTensorSlices",
-       dataset_ops.Dataset.from_sparse_tensor_slices(
-           sparse_tensor.SparseTensor(
-               indices=np.array([[0, 0], [1, 0], [2, 0]]),
-               values=np.array([0, 0, 0]),
-               dense_shape=np.array([3, 1])))),
-      ("FromTensors", dataset_ops.Dataset.from_tensors([42])),
-      ("FromTensorSlices", dataset_ops.Dataset.from_tensors([42])),
-      ("Range", dataset_ops.Dataset.range(10)),
-      ("TextLine", readers.TextLineDataset("")),
-      ("TFRecord", readers.TFRecordDataset(""), 1),
-  )
-  def testDatasetSourceInputs(self, dataset, num_inputs=0):
-    self.assertEqual(num_inputs, len(dataset._inputs()))
-
-  @parameterized.named_parameters(
-      ("Apply", make_apply_fn.__func__(dataset_ops.Dataset.range(0)),
-       dataset_ops.Dataset.range(0)),
-      ("Batch", lambda x: x.batch(10), dataset_ops.Dataset.range(0)),
-      ("Cache", lambda x: x.cache(), dataset_ops.Dataset.range(0)),
-      ("Filter", lambda x: x.filter(lambda x: True),
-       dataset_ops.Dataset.range(0)),
-      ("FlatMap", lambda x: x.flat_map(lambda x: dataset_ops.Dataset.range(0)),
-       dataset_ops.Dataset.range(0)),
-      ("Interleave", make_interleave_fn.__func__(dataset_ops.Dataset.range(0)),
-       dataset_ops.Dataset.range(0)),
-      ("Map", lambda x: x.map(lambda x: x), dataset_ops.Dataset.range(0)),
-      ("PaddedBatch", lambda x: x.padded_batch(10, []),
-       dataset_ops.Dataset.range(0)),
-      ("ParallelInterleave",
-       make_interleave_fn.__func__(dataset_ops.Dataset.range(0), 2),
-       dataset_ops.Dataset.range(0)),
-      ("ParallelMap", lambda x: x.map(lambda x: x, num_parallel_calls=2),
-       dataset_ops.Dataset.range(0)),
-      ("Repeat", lambda x: x.repeat(), dataset_ops.Dataset.range(0)),
-      ("Shuffle", lambda x: x.shuffle(10), dataset_ops.Dataset.range(0)),
-      ("Skip", lambda x: x.skip(1), dataset_ops.Dataset.range(0)),
-      ("Take", lambda x: x.take(1), dataset_ops.Dataset.range(0)),
-      ("Window", lambda x: x.window(10), dataset_ops.Dataset.range(0)),
-  )
-  def testUnaryTransformationInputs(self, dataset_fn, input_dataset):
-    self.assertEqual([input_dataset], dataset_fn(input_dataset)._inputs())
-
-  @parameterized.named_parameters(
-      ("Concatenate", lambda x, y: x.concatenate(y),
-       dataset_ops.Dataset.range(0), dataset_ops.Dataset.range(1)))
-  def testBinaryTransformationInputs(self, dataset_fn, input1, input2):
-    self.assertEqual([input1, input2], dataset_fn(input1, input2)._inputs())
-
-  @parameterized.named_parameters(
-      ("ZipOne", dataset_ops.Dataset.zip, (dataset_ops.Dataset.range(0))),
-      ("ZipNest", dataset_ops.Dataset.zip,
-       (dataset_ops.Dataset.range(0),
-        (dataset_ops.Dataset.range(1), dataset_ops.Dataset.range(2)))),
-      ("ZipTuple", dataset_ops.Dataset.zip,
-       (dataset_ops.Dataset.range(0), dataset_ops.Dataset.range(1))))
-  def testVariadicTransformationInputs(self, dataset_fn, input_datasets):
-    self.assertEqual(
-        nest.flatten(input_datasets),
-        dataset_fn(input_datasets)._inputs())
-
-  def testCollectInputs(self):
-    ds1 = dataset_ops.Dataset.range(0)
-    ds2 = ds1.concatenate(ds1)
-    ds3 = dataset_ops.Dataset.zip((ds2, ds1, ds2))
-
-    inputs = []
-    queue = [ds3]
-    while queue:
-      ds = queue[0]
-      queue = queue[1:]
-      queue.extend(ds._inputs())
-      inputs.append(ds)
-
-    self.assertEqual(5, inputs.count(ds1))
-    self.assertEqual(2, inputs.count(ds2))
-    self.assertEqual(1, inputs.count(ds3))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py b/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
deleted file mode 100644
index 92bb67b6fff78f96352dce2cd96cc48968249996..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import itertools
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.platform import test
-
-
-class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  def _interleave(self, lists, cycle_length, block_length):
-    num_open = 0
-
-    # `all_iterators` acts as a queue of iterators over each element of `lists`.
-    all_iterators = [iter(l) for l in lists]
-
-    # `open_iterators` are the iterators whose elements are currently being
-    # interleaved.
-    open_iterators = []
-    for i in range(cycle_length):
-      if all_iterators:
-        open_iterators.append(all_iterators.pop(0))
-        num_open += 1
-      else:
-        open_iterators.append(None)
-
-    while num_open or all_iterators:
-      for i in range(cycle_length):
-        if open_iterators[i] is None:
-          if all_iterators:
-            open_iterators[i] = all_iterators.pop(0)
-            num_open += 1
-          else:
-            continue
-        for _ in range(block_length):
-          try:
-            yield next(open_iterators[i])
-          except StopIteration:
-            open_iterators[i] = None
-            num_open -= 1
-            break
-
-  def testPythonImplementation(self):
-    input_lists = [[4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6],
-                   [4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6]]
-
-    # Cycle length 1 acts like `Dataset.flat_map()`.
-    expected_elements = itertools.chain(*input_lists)
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 1, 1)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > 1.
-    expected_elements = [4, 5, 4, 5, 4, 5, 4,
-                         5, 5, 6, 6,  # NOTE(mrry): When we cycle back
-                                      # to a list and are already at
-                                      # the end of that list, we move
-                                      # on to the next element.
-                         4, 6, 4, 6, 4, 6, 4, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 2, 1)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > 1 and block length > 1.
-    expected_elements = [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6,
-                         4, 5, 5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 2, 3)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > len(input_values).
-    expected_elements = [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6,
-                         4, 4, 5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 7, 2)):
-      self.assertEqual(expected, produced)
-
-  @parameterized.named_parameters(
-      ("1", np.int64([4, 5, 6]), 1, 3, None),
-      ("2", np.int64([4, 5, 6]), 1, 3, 1),
-      ("3", np.int64([4, 5, 6]), 2, 1, None),
-      ("4", np.int64([4, 5, 6]), 2, 1, 1),
-      ("5", np.int64([4, 5, 6]), 2, 1, 2),
-      ("6", np.int64([4, 5, 6]), 2, 3, None),
-      ("7", np.int64([4, 5, 6]), 2, 3, 1),
-      ("8", np.int64([4, 5, 6]), 2, 3, 2),
-      ("9", np.int64([4, 5, 6]), 7, 2, None),
-      ("10", np.int64([4, 5, 6]), 7, 2, 1),
-      ("11", np.int64([4, 5, 6]), 7, 2, 3),
-      ("12", np.int64([4, 5, 6]), 7, 2, 5),
-      ("13", np.int64([4, 5, 6]), 7, 2, 7),
-      ("14", np.int64([]), 2, 3, None),
-      ("15", np.int64([0, 0, 0]), 2, 3, None),
-      ("16", np.int64([4, 0, 6]), 2, 3, None),
-      ("17", np.int64([4, 0, 6]), 2, 3, 1),
-      ("18", np.int64([4, 0, 6]), 2, 3, 2),
-  )
-  def testInterleaveDataset(self, input_values, cycle_length, block_length,
-                            num_parallel_calls):
-    count = 2
-    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
-        count).interleave(
-            lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
-            cycle_length, block_length, num_parallel_calls)
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    def repeat(values, count):
-      result = []
-      for value in values:
-        result.append([value] * value)
-      return result * count
-
-    with self.cached_session() as sess:
-      for expected_element in self._interleave(
-          repeat(input_values, count), cycle_length, block_length):
-        self.assertEqual(expected_element, sess.run(get_next))
-
-      for _ in range(2):
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  @parameterized.named_parameters(
-      ("1", np.float32([1., np.nan, 2., np.nan, 3.]), 1, 3, None),
-      ("2", np.float32([1., np.nan, 2., np.nan, 3.]), 1, 3, 1),
-      ("3", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 1, None),
-      ("4", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 1, 1),
-      ("5", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 1, 2),
-      ("6", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 3, None),
-      ("7", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 3, 1),
-      ("8", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 3, 2),
-      ("9", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, None),
-      ("10", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, 1),
-      ("11", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, 3),
-      ("12", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, 5),
-      ("13", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, 7),
-  )
-  def testInterleaveErrorDataset(self,
-                                 input_values,
-                                 cycle_length,
-                                 block_length,
-                                 num_parallel_calls):
-    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
-        lambda x: array_ops.check_numerics(x, "message")).interleave(
-            dataset_ops.Dataset.from_tensors, cycle_length, block_length,
-            num_parallel_calls)
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      for value in input_values:
-        if np.isnan(value):
-          with self.assertRaises(errors.InvalidArgumentError):
-            sess.run(get_next)
-        else:
-          self.assertEqual(value, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
-
-    def _interleave_fn(x):
-      return dataset_ops.Dataset.from_tensor_slices(
-          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).interleave(
-            _interleave_fn, cycle_length=1).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/interleave_test.py b/tensorflow/python/data/kernel_tests/interleave_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..05a211afcc177faaeb1a00ad03d8f117448f8315
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/interleave_test.py
@@ -0,0 +1,314 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.interleave()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+
+
+def _interleave(lists, cycle_length, block_length):
+  """Reference implementation of interleave used for testing.
+
+  Args:
+    lists: a list of lists to interleave
+    cycle_length: the length of the interleave cycle
+    block_length: the length of the interleave block
+
+  Yields:
+    Elements of `lists` interleaved in the order determined by `cycle_length`
+    and `block_length`.
+  """
+  num_open = 0
+
+  # `all_iterators` acts as a queue of iterators over each element of `lists`.
+  all_iterators = [iter(l) for l in lists]
+
+  # `open_iterators` are the iterators whose elements are currently being
+  # interleaved.
+  open_iterators = []
+  for i in range(cycle_length):
+    if all_iterators:
+      open_iterators.append(all_iterators.pop(0))
+      num_open += 1
+    else:
+      open_iterators.append(None)
+
+  while num_open or all_iterators:
+    for i in range(cycle_length):
+      if open_iterators[i] is None:
+        if all_iterators:
+          open_iterators[i] = all_iterators.pop(0)
+          num_open += 1
+        else:
+          continue
+      for _ in range(block_length):
+        try:
+          yield next(open_iterators[i])
+        except StopIteration:
+          open_iterators[i] = None
+          num_open -= 1
+          break
+
+
+def _make_coordinated_sloppy_dataset(input_values, cycle_length, block_length,
+                                     num_parallel_calls):
+  """Produces a dataset iterator and events to control the order of elements.
+
+  Args:
+    input_values: the values to generate lists to interleave from
+    cycle_length: the length of the interleave cycle
+    block_length: the length of the interleave block
+    num_parallel_calls: the degree of interleave parallelism
+
+  Returns:
+    A dataset iterator (represented as `get_next` op) and events that can be
+    used to control the order of output elements.
+  """
+
+  # Set up threading events used to sequence when items are produced that
+  # are subsequently interleaved. These events allow us to deterministically
+  # simulate slowdowns and force sloppiness.
+  coordination_events = {i: threading.Event() for i in input_values}
+
+  def map_py_fn(x):
+    coordination_events[x].wait()
+    coordination_events[x].clear()
+    return x * x
+
+  def map_fn(x):
+    return script_ops.py_func(map_py_fn, [x], x.dtype)
+
+  def interleave_fn(x):
+    dataset = dataset_ops.Dataset.from_tensors(x)
+    dataset = dataset.repeat(x)
+    return dataset.map(map_fn)
+
+  options = dataset_ops.Options()
+  options.experimental_deterministic = False
+  dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+      2).interleave(interleave_fn, cycle_length, block_length,
+                    num_parallel_calls).with_options(options)
+  iterator = dataset_ops.make_one_shot_iterator(dataset)
+  get_next = iterator.get_next()
+  return get_next, coordination_events
+
+
+def _repeat(values, count):
+  """Produces a list of lists suitable for testing interleave.
+
+  Args:
+    values: for each element `x` the result contains `[x] * x`
+    count: determines how many times to repeat `[x] * x` in the result
+
+  Returns:
+    A list of lists of values suitable for testing interleave.
+  """
+  return [[value] * value for value in np.tile(values, count)]
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class InterleaveTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ("1", [4, 5, 6], 1, 1, [
+          4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 5, 5, 5, 5,
+          5, 6, 6, 6, 6, 6, 6
+      ]),
+      ("2", [4, 5, 6], 2, 1, [
+          4, 5, 4, 5, 4, 5, 4, 5, 5, 6, 6, 4, 6, 4, 6, 4, 6, 4, 6, 5, 6, 5, 6,
+          5, 6, 5, 6, 5, 6, 6
+      ]),
+      ("3", [4, 5, 6], 2, 3, [
+          4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 5, 5, 5, 6,
+          6, 6, 5, 5, 6, 6, 6
+      ]),
+      ("4", [4, 5, 6], 7, 2, [
+          4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6,
+          6, 5, 6, 6, 5, 6, 6
+      ]),
+      ("5", [4, 0, 6], 2, 1,
+       [4, 4, 6, 4, 6, 4, 6, 6, 4, 6, 4, 6, 4, 4, 6, 6, 6, 6, 6, 6]),
+  )
+  def testPythonImplementation(self, input_values, cycle_length, block_length,
+                               expected_elements):
+    input_lists = _repeat(input_values, 2)
+
+    for expected, produced in zip(
+        expected_elements, _interleave(input_lists, cycle_length,
+                                       block_length)):
+      self.assertEqual(expected, produced)
+
+  @parameterized.named_parameters(
+      ("1", np.int64([4, 5, 6]), 1, 3, None),
+      ("2", np.int64([4, 5, 6]), 1, 3, 1),
+      ("3", np.int64([4, 5, 6]), 2, 1, None),
+      ("4", np.int64([4, 5, 6]), 2, 1, 1),
+      ("5", np.int64([4, 5, 6]), 2, 1, 2),
+      ("6", np.int64([4, 5, 6]), 2, 3, None),
+      ("7", np.int64([4, 5, 6]), 2, 3, 1),
+      ("8", np.int64([4, 5, 6]), 2, 3, 2),
+      ("9", np.int64([4, 5, 6]), 7, 2, None),
+      ("10", np.int64([4, 5, 6]), 7, 2, 1),
+      ("11", np.int64([4, 5, 6]), 7, 2, 3),
+      ("12", np.int64([4, 5, 6]), 7, 2, 5),
+      ("13", np.int64([4, 5, 6]), 7, 2, 7),
+      ("14", np.int64([]), 2, 3, None),
+      ("15", np.int64([0, 0, 0]), 2, 3, None),
+      ("16", np.int64([4, 0, 6]), 2, 3, None),
+      ("17", np.int64([4, 0, 6]), 2, 3, 1),
+      ("18", np.int64([4, 0, 6]), 2, 3, 2),
+  )
+  def testInterleaveDataset(self, input_values, cycle_length, block_length,
+                            num_parallel_calls):
+    count = 2
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+        count).interleave(
+            lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
+            cycle_length, block_length, num_parallel_calls)
+    expected_output = [
+        element for element in _interleave(
+            _repeat(input_values, count), cycle_length, block_length)
+    ]
+    self.assertDatasetProduces(dataset, expected_output)
+
+  @parameterized.named_parameters(
+      ("1", np.float32([1., np.nan, 2., np.nan, 3.]), 1, 3, None),
+      ("2", np.float32([1., np.nan, 2., np.nan, 3.]), 1, 3, 1),
+      ("3", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 1, None),
+      ("4", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 1, 1),
+      ("5", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 1, 2),
+      ("6", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 3, None),
+      ("7", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 3, 1),
+      ("8", np.float32([1., np.nan, 2., np.nan, 3.]), 2, 3, 2),
+      ("9", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, None),
+      ("10", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, 1),
+      ("11", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, 3),
+      ("12", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, 5),
+      ("13", np.float32([1., np.nan, 2., np.nan, 3.]), 7, 2, 7),
+  )
+  def testInterleaveDatasetError(self, input_values, cycle_length, block_length,
+                                 num_parallel_calls):
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
+        lambda x: array_ops.check_numerics(x, "message")).interleave(
+            dataset_ops.Dataset.from_tensors, cycle_length, block_length,
+            num_parallel_calls)
+    get_next = self.getNext(dataset)
+
+    for value in input_values:
+      if np.isnan(value):
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(get_next())
+      else:
+        self.assertEqual(value, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testInterleaveSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _interleave_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn).interleave(
+        _interleave_fn, cycle_length=1)
+    get_next = self.getNext(dataset)
+    for i in range(10):
+      for j in range(2):
+        expected = [i, 0] if j % 2 == 0 else [0, -i]
+        self.assertAllEqual(expected, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @parameterized.named_parameters(
+      ("1", np.int64([4, 5, 6]), 2, 1, 1),
+      ("2", np.int64([4, 5, 6]), 2, 1, 2),
+      ("3", np.int64([4, 5, 6]), 2, 3, 1),
+      ("4", np.int64([4, 5, 6]), 2, 3, 2),
+      ("5", np.int64([4, 5, 6]), 3, 2, 1),
+      ("6", np.int64([4, 5, 6]), 3, 2, 2),
+      ("7", np.int64([4, 5, 6]), 3, 2, 3),
+      ("8", np.int64([4, 0, 6]), 2, 3, 1),
+      ("9", np.int64([4, 0, 6]), 2, 3, 2),
+  )
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerSloppyInterleaveInOrder(self, input_values, cycle_length,
+                                           block_length, num_parallel_calls):
+    get_next, coordination_events = _make_coordinated_sloppy_dataset(
+        input_values, cycle_length, block_length, num_parallel_calls)
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=num_parallel_calls + 1,
+        use_per_session_threads=True)
+    with self.cached_session(config=config) as sess:
+      for expected_element in _interleave(
+          _repeat(input_values, 2), cycle_length, block_length):
+        coordination_events[expected_element].set()
+        self.assertEqual(expected_element * expected_element,
+                         self.evaluate(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @parameterized.named_parameters(
+      ("1", np.int64([4, 5, 6]), 2, 1, 2),
+      ("2", np.int64([4, 5, 6]), 2, 3, 2),
+      ("3", np.int64([4, 5, 6]), 3, 2, 3),
+      ("4", np.int64([4, 0, 6]), 2, 3, 2),
+  )
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerSloppyInterleaveOutOfOrder(self, input_values, cycle_length,
+                                              block_length, num_parallel_calls):
+    get_next, coordination_events = _make_coordinated_sloppy_dataset(
+        input_values, cycle_length, block_length, num_parallel_calls)
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=num_parallel_calls + 1,
+        use_per_session_threads=True)
+    with self.cached_session(config=config) as sess:
+      elements = [
+          x for x in _interleave(
+              _repeat(input_values, 2), cycle_length, block_length)
+      ]
+      for i in [1, 4, 7]:
+        elements[i], elements[i + 1] = elements[i + 1], elements[i]
+
+      for element in elements:
+        coordination_events[element].set()
+        self.assertEqual(element * element, self.evaluate(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b356691b75eb337ad61643646ba717e4929ab9
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
@@ -0,0 +1,129 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Checkpoint tests for `tf.data.Iterator`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class IteratorCheckpointingTest(test_base.DatasetTestBase):
+
+  def testSaveRestoreOneShotIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6]).map(
+        math_ops.square).batch(2)
+    iterator = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator.get_next())
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    self.assertAllEqual([1, 4], get_next())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual([9, 16], get_next())
+    self.assertAllEqual([25, 36], get_next())
+    checkpoint.restore(save_path).run_restore_ops()
+    self.assertAllEqual([9, 16], get_next())
+    self.assertAllEqual([25, 36], get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      get_next()
+
+  def testSaveRestoreMultipleIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+    dataset = dataset.map(math_ops.square).batch(2)
+    iterator_1 = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset)
+    get_next_1 = iterator_1.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator_1.get_next())
+    iterator_2 = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset)
+    get_next_2 = iterator_2.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator_2.get_next())
+    dataset_2 = dataset_ops.Dataset.range(10)
+    iterator_3 = iter(dataset_2) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset_2)
+    get_next_3 = iterator_3.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator_3.get_next())
+    checkpoint = checkpointable_utils.Checkpoint(
+        iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
+    self.assertAllEqual([1, 4], get_next_1())
+    self.assertAllEqual(0, get_next_3())
+    self.assertAllEqual(1, get_next_3())
+    self.assertAllEqual(2, get_next_3())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual([1, 4], get_next_2())
+    self.assertAllEqual([9, 16], get_next_2())
+    self.assertAllEqual(3, get_next_3())
+    checkpoint.restore(save_path).run_restore_ops()
+    self.assertAllEqual([9, 16], get_next_1())
+    self.assertAllEqual([1, 4], get_next_2())
+    self.assertAllEqual(3, get_next_3())
+
+  def testRestoreExhaustedIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    dataset = dataset_ops.Dataset.range(3)
+    iterator = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator.get_next())
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    self.assertAllEqual(0, get_next())
+    self.assertAllEqual(1, get_next())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual(2, get_next())
+    checkpoint.restore(save_path).run_restore_ops()
+    self.assertAllEqual(2, get_next())
+    save_path = checkpoint.save(checkpoint_prefix)
+    checkpoint.restore(save_path).run_restore_ops()
+    with self.assertRaises(errors.OutOfRangeError):
+      get_next()
+
+  def testRestoreInReconstructedIteratorInitializable(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    dataset = dataset_ops.Dataset.range(10)
+    iterator = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_initializable_iterator(dataset)
+    get_next = iterator.get_next
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    for i in range(5):
+      checkpoint.restore(
+          checkpoint_management.latest_checkpoint(
+              checkpoint_directory)).initialize_or_restore()
+      for j in range(2):
+        self.assertEqual(i * 2 + j, self.evaluate(get_next()))
+      checkpoint.save(file_prefix=checkpoint_prefix)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..20088234953b1cdc8f85381ded45cf22aa93c75a
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
@@ -0,0 +1,180 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Iterator` using distributed sessions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+class IteratorClusterTest(test.TestCase):
+
+  @test_util.run_v1_only("b/120545219")
+  def testRemoteIteratorWithoutRemoteCallFail(self):
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    worker, _ = test_util.create_local_cluster(
+        1, 1, worker_config=worker_config)
+
+    with ops.device("/job:worker/replica:0/task:0/cpu:1"):
+      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+      iterator_3_handle = iterator_3.string_handle()
+
+    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
+      remote_it = iterator_ops.Iterator.from_string_handle(
+          iterator_3_handle, dataset_3.output_types, dataset_3.output_shapes)
+      get_next_op = remote_it.get_next()
+
+    with session.Session(worker[0].target) as sess:
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next_op)
+
+  def _testRemoteIteratorHelper(self, device0, device1, target):
+    with ops.device(device1):
+      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+      iterator_3_handle = iterator_3.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _remote_fn(h):
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          h, dataset_3.output_types, dataset_3.output_shapes)
+      return remote_iterator.get_next()
+
+    with ops.device(device0):
+      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+      remote_op = functional_ops.remote_call(
+          args=[iterator_3_handle],
+          Tout=[dtypes.int32],
+          f=_remote_fn,
+          target=target_placeholder)
+
+    with session.Session(target) as sess:
+      elem = sess.run(remote_op, feed_dict={target_placeholder: device1})
+      self.assertEqual(elem, [1])
+      # Fails when target is cpu:0 where the resource is not located.
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(remote_op, feed_dict={target_placeholder: device0})
+      elem = sess.run(iterator_3.get_next())
+      self.assertEqual(elem, [2])
+      elem = sess.run(remote_op, feed_dict={target_placeholder: device1})
+      self.assertEqual(elem, [3])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(remote_op, feed_dict={target_placeholder: device1})
+
+  @test_util.run_v1_only("b/120545219")
+  def testRemoteIteratorUsingRemoteCallOp(self):
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    worker, _ = test_util.create_local_cluster(
+        1, 1, worker_config=worker_config)
+
+    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
+                                   "/job:worker/replica:0/task:0/cpu:1",
+                                   worker[0].target)
+
+  @test_util.run_v1_only("b/120545219")
+  def testRemoteIteratorUsingRemoteCallOpCrossProcess(self):
+    workers, _ = test_util.create_local_cluster(2, 1)
+
+    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
+                                   "/job:worker/replica:0/task:1/cpu:0",
+                                   workers[0].target)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCaptureHashTableInSharedIterator(self):
+    worker, _ = test_util.create_local_cluster(1, 1)
+
+    # NOTE(mrry): We must use the V2 variants of `HashTable`
+    # etc. because these produce a `tf.resource`-typed output that is
+    # compatible with the in-graph function implementation.
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.HashTable(
+        lookup_ops.KeyValueTensorInitializer(keys, values),
+        default_val,
+        shared_name="shared_table")
+
+    input_sentences = dataset_ops.Dataset.from_tensor_slices(
+        ["brain brain tank salad surgery", "surgery brain"])
+
+    iterator = (
+        input_sentences.map(lambda x: string_ops.string_split([x]).values).map(
+            table.lookup)
+        .make_initializable_iterator(shared_name="shared_iterator"))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with session.Session(worker[0].target) as sess:
+      sess.run(table.initializer)
+      sess.run(init_op)
+      self.assertAllEqual([0, 0, -1, 1, 2], sess.run(get_next))
+
+    with session.Session(worker[0].target) as sess:
+      self.assertAllEqual([2, 0], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_v1_only("b/120545219")
+  def testImplicitDisposeParallelMapDataset(self):
+    # Tests whether a parallel map dataset will be cleaned up correctly when
+    # the pipeline does not run it until exhaustion.
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(None) -> PrefetchDataset(100).
+    worker, _ = test_util.create_local_cluster(1, 1)
+
+    components = (np.arange(1000),
+                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
+                  np.array(37.0) * np.arange(1000))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(None).prefetch(10000))
+
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with session.Session(worker[0].target) as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
deleted file mode 100644
index 25c91b42dc65f849a680e65fc7fc2548c1cea8ea..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops that need test_util."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-
-
-class IteratorClusterTest(test.TestCase):
-
-  def testRemoteIteratorWithoutRemoteCallFail(self):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 2
-    worker, _ = test_util.create_local_cluster(
-        1, 1, worker_config=worker_config)
-
-    with ops.device("/job:worker/replica:0/task:0/cpu:1"):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    with ops.device("/job:worker/replica:0/task:0/cpu:0"):
-      remote_it = iterator_ops.Iterator.from_string_handle(
-          iterator_3_handle, dataset_3.output_types, dataset_3.output_shapes)
-      get_next_op = remote_it.get_next()
-
-    with session.Session(worker[0].target) as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next_op)
-
-  def _testRemoteIteratorHelper(self, device0, device1, target):
-    with ops.device(device1):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
-      return remote_iterator.get_next()
-
-    with ops.device(device0):
-      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      remote_op = functional_ops.remote_call(
-          args=[iterator_3_handle],
-          Tout=[dtypes.int32],
-          f=_remote_fn,
-          target=target_placeholder)
-
-    with session.Session(target) as sess:
-      elem = sess.run(remote_op, feed_dict={target_placeholder: device1})
-      self.assertEqual(elem, [1])
-      # Fails when target is cpu:0 where the resource is not located.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(remote_op, feed_dict={target_placeholder: device0})
-      elem = sess.run(iterator_3.get_next())
-      self.assertEqual(elem, [2])
-      elem = sess.run(remote_op, feed_dict={target_placeholder: device1})
-      self.assertEqual(elem, [3])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(remote_op, feed_dict={target_placeholder: device1})
-
-  def testRemoteIteratorUsingRemoteCallOp(self):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 2
-    worker, _ = test_util.create_local_cluster(
-        1, 1, worker_config=worker_config)
-
-    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
-                                   "/job:worker/replica:0/task:0/cpu:1",
-                                   worker[0].target)
-
-  def testRemoteIteratorUsingRemoteCallOpCrossProcess(self):
-    workers, _ = test_util.create_local_cluster(2, 1)
-
-    self._testRemoteIteratorHelper("/job:worker/replica:0/task:0/cpu:0",
-                                   "/job:worker/replica:0/task:1/cpu:0",
-                                   workers[0].target)
-
-  def testCaptureHashTableInSharedIterator(self):
-    worker, _ = test_util.create_local_cluster(1, 1)
-
-    # NOTE(mrry): We must use the V2 variants of `HashTable`
-    # etc. because these produce a `tf.resource`-typed output that is
-    # compatible with the in-graph function implementation.
-    default_val = -1
-    keys = constant_op.constant(["brain", "salad", "surgery"])
-    values = constant_op.constant([0, 1, 2], dtypes.int64)
-    table = lookup_ops.HashTable(
-        lookup_ops.KeyValueTensorInitializer(keys, values),
-        default_val,
-        shared_name="shared_table")
-
-    input_sentences = dataset_ops.Dataset.from_tensor_slices(
-        ["brain brain tank salad surgery", "surgery brain"])
-
-    iterator = (
-        input_sentences.map(lambda x: string_ops.string_split([x]).values).map(
-            table.lookup)
-        .make_initializable_iterator(shared_name="shared_iterator"))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with session.Session(worker[0].target) as sess:
-      sess.run(table.init)
-      sess.run(init_op)
-      self.assertAllEqual([0, 0, -1, 1, 2], sess.run(get_next))
-
-    with session.Session(worker[0].target) as sess:
-      self.assertAllEqual([2, 0], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testImplicitDisposeParallelMapDataset(self):
-    # Tests whether a parallel map dataset will be cleaned up correctly when
-    # the pipeline does not run it until exhaustion.
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(None) -> PrefetchDataset(100).
-    worker, _ = test_util.create_local_cluster(1, 1)
-
-    components = (np.arange(1000),
-                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
-                  np.array(37.0) * np.arange(1000))
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(None).prefetch(10000))
-
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with session.Session(worker[0].target) as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
deleted file mode 100644
index 671e5d4812cae26ce2ff943f71d3b76ae0a0c1f4..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/iterator_ops_test.py
+++ /dev/null
@@ -1,885 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-import warnings
-
-import numpy as np
-
-from tensorflow.core.protobuf import cluster_pb2
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.compat import compat as forward_compat
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import server_lib
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
-from tensorflow.python.util import compat
-
-
-class IteratorTest(test.TestCase):
-
-  def testNoGradients(self):
-    component = constant_op.constant([1.])
-    side = constant_op.constant(0.)
-    add = lambda x: x + side
-    dataset = dataset_ops.Dataset.from_tensor_slices(component).map(add)
-    value = dataset.make_one_shot_iterator().get_next()
-    self.assertIsNone(gradients_impl.gradients(value, component)[0])
-    self.assertIsNone(gradients_impl.gradients(value, side)[0])
-    self.assertIsNone(gradients_impl.gradients(value, [component, side])[0])
-
-  def testCapturingStateInOneShotRaisesException(self):
-    var = variables.Variable(37.0, name="myvar")
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices([0.0, 1.0, 2.0])
-        .map(lambda x: x + var))
-    with self.assertRaisesRegexp(
-        ValueError, r"`Dataset.make_one_shot_iterator\(\)` does not support "
-        "datasets that capture stateful objects.+myvar"):
-      dataset.make_one_shot_iterator()
-
-  def testOneShotIterator(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(14).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testOneShotIteratorCaptureByValue(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    tensor_components = tuple([ops.convert_to_tensor(c) for c in components])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(tensor_components)
-        .map(_map_fn).repeat(14).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testOneShotIteratorInsideContainer(self):
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    def within_container():
-
-      def _map_fn(x, y, z):
-        return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-      iterator = (
-          dataset_ops.Dataset.from_tensor_slices(components)
-          .map(_map_fn).repeat(14).make_one_shot_iterator())
-      return iterator.get_next()
-
-    server = server_lib.Server.create_local_server()
-
-    # Create two iterators within unique containers, and run them to
-    # make sure that the resources aren't shared.
-    #
-    # The test below would fail if cname were the same across both
-    # sessions.
-    for i in range(2):
-      with session.Session(server.target) as sess:
-        cname = "iteration%d" % i
-        with ops.container(cname):
-          get_next = within_container()
-
-        for _ in range(14):
-          for i in range(7):
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testOneShotIteratorNonBlocking(self):
-    dataset = dataset_ops.Dataset.from_tensors([1, 2, 3]).map(lambda x: x * x)
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    # Create a session with a single thread to ensure that the
-    # one-shot iterator initializer does not deadlock.
-    config = config_pb2.ConfigProto(
-        inter_op_parallelism_threads=1, use_per_session_threads=True)
-    with session.Session(config=config) as sess:
-      self.assertAllEqual([1, 4, 9], sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-    # Test with multiple threads invoking the one-shot iterator concurrently.
-    with session.Session(config=config) as sess:
-      results = []
-
-      def consumer_thread():
-        try:
-          results.append(sess.run(next_element))
-        except errors.OutOfRangeError:
-          results.append(None)
-
-      num_threads = 8
-      threads = [
-          self.checkedThread(consumer_thread) for _ in range(num_threads)
-      ]
-      for t in threads:
-        t.start()
-      for t in threads:
-        t.join()
-
-      self.assertEqual(num_threads, len(results))
-      self.assertEqual(num_threads - 1,
-                       len([None for r in results if r is None]))
-      self.assertAllEqual([[1, 4, 9]], [r for r in results if r is not None])
-
-  def testOneShotIteratorInitializerFails(self):
-    # Define a dataset whose initialization will always fail.
-    dataset = dataset_ops.Dataset.from_tensors(
-        array_ops.check_numerics(
-            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(next_element)
-
-      # Test that subsequent attempts to use the iterator also fail.
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(next_element)
-
-    with self.cached_session() as sess:
-
-      def consumer_thread():
-        with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-          sess.run(next_element)
-
-      num_threads = 8
-      threads = [
-          self.checkedThread(consumer_thread) for _ in range(num_threads)
-      ]
-      for t in threads:
-        t.start()
-      for t in threads:
-        t.join()
-
-  def testSimpleSharedResource(self):
-    components = (np.array(1, dtype=np.int64),
-                  np.array([1, 2, 3], dtype=np.int64),
-                  np.array(37.0, dtype=np.float64))
-
-    server = server_lib.Server.create_local_server()
-
-    # Create two non-overlapping sessions that share the same iterator
-    # resource on the same server, and verify that an action of the
-    # first session (initializing the iterator) is visible in the
-    # second session.
-    with ops.Graph().as_default():
-      iterator = (
-          dataset_ops.Dataset.from_tensors(components)
-          .map(lambda x, y, z: (x, y, z)).make_initializable_iterator(
-              shared_name="shared_iterator"))
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      with session.Session(server.target) as sess:
-        sess.run(init_op)
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-        # Re-initialize the iterator in the first session.
-        sess.run(init_op)
-
-    with ops.Graph().as_default():
-      # Re-define the iterator manually, without defining any of the
-      # functions in this graph, to ensure that we are not
-      # accidentally redefining functions with the same names in the
-      # new graph.
-      iterator = iterator_ops.Iterator.from_structure(
-          shared_name="shared_iterator",
-          output_types=(dtypes.int64, dtypes.int64, dtypes.float64),
-          output_shapes=([], [3], []))
-      get_next = iterator.get_next()
-
-      with session.Session(server.target) as sess:
-        # Use the iterator without re-initializing in the second session.
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testNotInitializedError(self):
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(errors.FailedPreconditionError,
-                                   "iterator has not been initialized"):
-        sess.run(get_next)
-
-  def testReinitializableIterator(self):
-    dataset_3 = dataset_ops.Dataset.from_tensors(
-        constant_op.constant([1, 2, 3]))
-    dataset_4 = dataset_ops.Dataset.from_tensors(
-        constant_op.constant([4, 5, 6, 7]))
-    iterator = iterator_ops.Iterator.from_structure(dataset_3.output_types,
-                                                    [None])
-
-    dataset_3_init_op = iterator.make_initializer(dataset_3)
-    dataset_4_init_op = iterator.make_initializer(dataset_4)
-    get_next = iterator.get_next()
-
-    self.assertEqual(dataset_3.output_types, iterator.output_types)
-    self.assertEqual(dataset_4.output_types, iterator.output_types)
-    self.assertEqual([None], iterator.output_shapes.as_list())
-
-    with self.cached_session() as sess:
-      # The iterator is initially uninitialized.
-      with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(get_next)
-
-      # Initialize with one dataset.
-      sess.run(dataset_3_init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Initialize with a different dataset.
-      sess.run(dataset_4_init_op)
-      self.assertAllEqual([4, 5, 6, 7], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Reinitialize with the first dataset.
-      sess.run(dataset_3_init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testReinitializableIteratorStaticErrors(self):
-    # Non-matching structure for types and shapes.
-    with self.assertRaises(TypeError):
-      iterator = iterator_ops.Iterator.from_structure(
-          (dtypes.int64, dtypes.float64), [None])
-
-    # Test validation of dataset argument.
-    iterator = iterator_ops.Iterator.from_structure((dtypes.int64,
-                                                     dtypes.float64))
-
-    # Incompatible structure.
-    with self.assertRaises(ValueError):
-      iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors(((constant_op.constant(
-              [1, 2, 3], dtype=dtypes.int64),), (constant_op.constant(
-                  [4., 5., 6., 7.], dtype=dtypes.float64),))))
-
-    # Incompatible types.
-    with self.assertRaises(TypeError):
-      iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors(
-              (constant_op.constant([1, 2, 3], dtype=dtypes.int32),
-               constant_op.constant([4., 5., 6., 7.], dtype=dtypes.float32))))
-
-    # Incompatible shapes.
-    iterator = iterator_ops.Iterator.from_structure(
-        (dtypes.int64, dtypes.float64), ([None], []))
-    with self.assertRaises(TypeError):
-      iterator.make_initializer(
-          dataset_ops.Dataset.from_tensors(
-              (constant_op.constant([1, 2, 3], dtype=dtypes.int64),
-               constant_op.constant([4., 5., 6., 7.], dtype=dtypes.float64))))
-
-  def testIteratorStringHandle(self):
-    dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-    dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
-
-    iterator_3 = dataset_3.make_one_shot_iterator()
-    iterator_4 = dataset_4.make_one_shot_iterator()
-
-    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    feedable_iterator = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dataset_3.output_types, dataset_3.output_shapes)
-    next_element = feedable_iterator.get_next()
-
-    self.assertEqual(dataset_3.output_types, feedable_iterator.output_types)
-    self.assertEqual(dataset_4.output_types, feedable_iterator.output_types)
-    self.assertEqual([], feedable_iterator.output_shapes)
-
-    with self.cached_session() as sess:
-      iterator_3_handle = sess.run(iterator_3.string_handle())
-      iterator_4_handle = sess.run(iterator_4.string_handle())
-
-      self.assertEqual(10,
-                       sess.run(
-                           next_element,
-                           feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(1,
-                       sess.run(
-                           next_element,
-                           feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(20,
-                       sess.run(
-                           next_element,
-                           feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(2,
-                       sess.run(
-                           next_element,
-                           feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(30,
-                       sess.run(
-                           next_element,
-                           feed_dict={handle_placeholder: iterator_4_handle}))
-      self.assertEqual(3,
-                       sess.run(
-                           next_element,
-                           feed_dict={handle_placeholder: iterator_3_handle}))
-      self.assertEqual(40,
-                       sess.run(
-                           next_element,
-                           feed_dict={handle_placeholder: iterator_4_handle}))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(
-            next_element, feed_dict={handle_placeholder: iterator_3_handle})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(
-            next_element, feed_dict={handle_placeholder: iterator_4_handle})
-
-  def testIteratorStringHandleFuture(self):
-    with forward_compat.forward_compatibility_horizon(2018, 8, 4):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
-
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_4 = dataset_4.make_one_shot_iterator()
-
-      handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      feedable_iterator = iterator_ops.Iterator.from_string_handle(
-          handle_placeholder, dataset_3.output_types, dataset_3.output_shapes)
-      next_element = feedable_iterator.get_next()
-
-      self.assertEqual(dataset_3.output_types, feedable_iterator.output_types)
-      self.assertEqual(dataset_4.output_types, feedable_iterator.output_types)
-      self.assertEqual([], feedable_iterator.output_shapes)
-
-      with self.cached_session() as sess:
-        iterator_3_handle = sess.run(iterator_3.string_handle())
-        iterator_4_handle = sess.run(iterator_4.string_handle())
-
-        self.assertEqual(
-            10,
-            sess.run(
-                next_element,
-                feed_dict={handle_placeholder: iterator_4_handle}))
-        self.assertEqual(
-            1,
-            sess.run(
-                next_element,
-                feed_dict={handle_placeholder: iterator_3_handle}))
-        self.assertEqual(
-            20,
-            sess.run(
-                next_element,
-                feed_dict={handle_placeholder: iterator_4_handle}))
-        self.assertEqual(
-            2,
-            sess.run(
-                next_element,
-                feed_dict={handle_placeholder: iterator_3_handle}))
-        self.assertEqual(
-            30,
-            sess.run(
-                next_element,
-                feed_dict={handle_placeholder: iterator_4_handle}))
-        self.assertEqual(
-            3,
-            sess.run(
-                next_element,
-                feed_dict={handle_placeholder: iterator_3_handle}))
-        self.assertEqual(
-            40,
-            sess.run(
-                next_element,
-                feed_dict={handle_placeholder: iterator_4_handle}))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(
-              next_element, feed_dict={handle_placeholder: iterator_3_handle})
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(
-              next_element, feed_dict={handle_placeholder: iterator_4_handle})
-
-  def testIteratorStringHandleReuseTensorObject(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-    one_shot_iterator = dataset.make_one_shot_iterator()
-    initializable_iterator = dataset.make_initializable_iterator()
-    structure_iterator = iterator_ops.Iterator.from_structure(
-        dataset.output_types)
-
-    created_ops = len(ops.get_default_graph().get_operations())
-
-    self.assertIs(one_shot_iterator.string_handle(),
-                  one_shot_iterator.string_handle())
-    self.assertIs(initializable_iterator.string_handle(),
-                  initializable_iterator.string_handle())
-    self.assertIs(structure_iterator.string_handle(),
-                  structure_iterator.string_handle())
-
-    # Assert that getting the (default) string handle creates no ops.
-    self.assertEqual(created_ops, len(ops.get_default_graph().get_operations()))
-
-    # Specifying an explicit name will create a new op.
-    handle_with_name = one_shot_iterator.string_handle(name="foo")
-    self.assertEqual("foo", handle_with_name.op.name)
-    self.assertIsNot(one_shot_iterator.string_handle(), handle_with_name)
-
-    handle_with_same_name = one_shot_iterator.string_handle(name="foo")
-    self.assertEqual("foo_1", handle_with_same_name.op.name)
-    self.assertIsNot(handle_with_name, handle_with_same_name)
-
-  def testIteratorStringHandleError(self):
-    dataset_int_scalar = (
-        dataset_ops.Dataset.from_tensor_slices([1, 2, 3]).repeat())
-    dataset_float_vector = (dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0]))
-
-    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    feedable_int_scalar = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dtypes.int32, [])
-    feedable_int_vector = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dtypes.int32, [None])
-    feedable_int_any = iterator_ops.Iterator.from_string_handle(
-        handle_placeholder, dtypes.int32)
-
-    with self.cached_session() as sess:
-      handle_int_scalar = sess.run(
-          dataset_int_scalar.make_one_shot_iterator().string_handle())
-      handle_float_vector = sess.run(
-          dataset_float_vector.make_one_shot_iterator().string_handle())
-
-      self.assertEqual(1,
-                       sess.run(
-                           feedable_int_scalar.get_next(),
-                           feed_dict={handle_placeholder: handle_int_scalar}))
-
-      self.assertEqual(2,
-                       sess.run(
-                           feedable_int_any.get_next(),
-                           feed_dict={handle_placeholder: handle_int_scalar}))
-
-      with self.assertRaises(errors.InvalidArgumentError):
-        print(sess.run(
-            feedable_int_vector.get_next(),
-            feed_dict={handle_placeholder: handle_int_scalar}))
-
-      with self.assertRaises(errors.InvalidArgumentError):
-        print(sess.run(
-            feedable_int_vector.get_next(),
-            feed_dict={handle_placeholder: handle_float_vector}))
-
-  def testRemoteIteratorUsingRemoteCallOpDirectSession(self):
-    worker_config = config_pb2.ConfigProto()
-    worker_config.device_count["CPU"] = 3
-
-    with ops.device("/job:localhost/replica:0/task:0/cpu:1"):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    @function.Defun(dtypes.string)
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, dataset_3.output_types, dataset_3.output_shapes)
-      return remote_iterator.get_next()
-
-    with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
-      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      remote_op = functional_ops.remote_call(
-          args=[iterator_3_handle],
-          Tout=[dtypes.int32],
-          f=_remote_fn,
-          target=target_placeholder)
-
-    with self.test_session(config=worker_config) as sess:
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-          })
-      self.assertEqual(elem, [1])
-      # Fails when target is cpu:2 where the resource is not located.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(
-            remote_op,
-            feed_dict={
-                target_placeholder: "/job:localhost/replica:0/task:0/cpu:2"
-            })
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-          })
-      self.assertEqual(elem, [2])
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-          })
-      self.assertEqual(elem, [3])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(
-            remote_op,
-            feed_dict={
-                target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
-            })
-
-  def testRemoteIteratorUsingRemoteCallOpMultiWorkers(self):
-    s1 = server_lib.Server.create_local_server()
-    s2 = server_lib.Server.create_local_server()
-    s3 = server_lib.Server.create_local_server()
-
-    cluster_def = cluster_pb2.ClusterDef()
-    workers = cluster_def.job.add()
-    workers.name = "worker"
-    workers.tasks[0] = s1.target[len("grpc://"):]
-    workers.tasks[1] = s2.target[len("grpc://"):]
-    client = cluster_def.job.add()
-    client.name = "client"
-    client.tasks[0] = s3.target[len("grpc://"):]
-    config = config_pb2.ConfigProto(cluster_def=cluster_def)
-
-    worker_devices = [
-        "/job:worker/replica:0/task:%d/cpu:0" % i for i in range(2)
-    ]
-    itr_handles = []
-    for device in worker_devices:
-      with ops.device(device):
-        src = dataset_ops.Dataset.from_tensor_slices([device])
-        itr = src.make_one_shot_iterator()
-        itr_handles.append(itr.string_handle())
-
-    targets = dataset_ops.Dataset.from_tensor_slices(worker_devices)
-    handles = dataset_ops.Dataset.from_tensor_slices(itr_handles)
-
-    @function.Defun(dtypes.string)
-    def loading_func(h):
-      remote_itr = iterator_ops.Iterator.from_string_handle(
-          h, itr.output_types, itr.output_shapes)
-      return remote_itr.get_next()
-
-    def map_fn(target, handle):
-      return functional_ops.remote_call(
-          args=[handle], Tout=[dtypes.string], f=loading_func, target=target)
-
-    with ops.device("/job:client"):
-      client_dataset = dataset_ops.Dataset.zip((targets, handles)).map(map_fn)
-      itr = client_dataset.make_initializable_iterator()
-      n = itr.get_next()
-
-    with session.Session(s3.target, config=config) as sess:
-      sess.run(itr.initializer)
-      expected_values = worker_devices
-      for expected in expected_values:
-        self.assertEqual((compat.as_bytes(expected),), sess.run(n))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(n)
-
-  def testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
-      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_3_handle = iterator_3.string_handle()
-
-    def _encode_raw(byte_array):
-      return bytes(bytearray(byte_array))
-
-    @function.Defun(dtypes.uint8)
-    def _remote_fn(h):
-      handle = script_ops.py_func(_encode_raw, [h], dtypes.string)
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, dataset_3.output_types, dataset_3.output_shapes)
-      return remote_iterator.get_next()
-
-    with ops.device("/job:localhost/replica:0/task:0/device:GPU:0"):
-      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-      iterator_3_handle_uint8 = parsing_ops.decode_raw(
-          bytes=iterator_3_handle, out_type=dtypes.uint8)
-      remote_op = functional_ops.remote_call(
-          args=[iterator_3_handle_uint8],
-          Tout=[dtypes.int32],
-          f=_remote_fn,
-          target=target_placeholder)
-
-    with self.cached_session() as sess:
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-          })
-      self.assertEqual(elem, [1])
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-          })
-      self.assertEqual(elem, [2])
-      elem = sess.run(
-          remote_op,
-          feed_dict={
-              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-          })
-      self.assertEqual(elem, [3])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(
-            remote_op,
-            feed_dict={
-                target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
-            })
-
-  def testIncorrectIteratorRestore(self):
-
-    def _path():
-      return os.path.join(self.get_temp_dir(), "iterator")
-
-    def _save_op(iterator_resource):
-      iterator_state_variant = gen_dataset_ops.serialize_iterator(
-          iterator_resource)
-      save_op = io_ops.write_file(
-          _path(), parsing_ops.serialize_tensor(iterator_state_variant))
-      return save_op
-
-    def _restore_op(iterator_resource):
-      iterator_state_variant = parsing_ops.parse_tensor(
-          io_ops.read_file(_path()), dtypes.variant)
-      restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
-                                                        iterator_state_variant)
-      return restore_op
-
-    def _build_range_dataset_graph():
-      start = 1
-      stop = 10
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = _save_op(iterator._iterator_resource)
-      restore_op = _restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    def _build_reader_dataset_graph():
-      filenames = ["test"]  # Does not exist but we don't care in this test.
-      iterator = readers.FixedLengthRecordDataset(
-          filenames, 1, 0, 0).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next_op = iterator.get_next()
-      save_op = _save_op(iterator._iterator_resource)
-      restore_op = _restore_op(iterator._iterator_resource)
-      return init_op, get_next_op, save_op, restore_op
-
-    # Saving iterator for RangeDataset graph.
-    with ops.Graph().as_default() as g:
-      init_op, _, save_op, _ = _build_range_dataset_graph()
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(save_op)
-
-    # Attempt to restore the saved iterator into an IteratorResource of
-    # incompatible type. An iterator of RangeDataset has output type int64,
-    # while an iterator of FixedLengthRecordDataset has output type string.
-    # So an InvalidArgumentError should be raised by
-    # IteratorResource::set_iterator.
-    with ops.Graph().as_default() as g:
-      _, _, _, restore_op = _build_reader_dataset_graph()
-      with self.session(graph=g) as sess:
-        with self.assertRaises(errors.InvalidArgumentError):
-          sess.run(restore_op)
-
-  def testRepeatedGetNextWarning(self):
-    iterator = dataset_ops.Dataset.range(10).make_one_shot_iterator()
-    warnings.simplefilter("always")
-    with warnings.catch_warnings(record=True) as w:
-      for _ in range(100):
-        iterator.get_next()
-    self.assertEqual(100 - iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD, len(w))
-    for warning in w:
-      self.assertTrue(
-          iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE in str(warning.message))
-
-  def testEagerIteratorAsync(self):
-    with context.eager_mode(), context.execution_mode(context.ASYNC):
-      val = 0
-      dataset = dataset_ops.Dataset.range(10)
-      for foo in dataset:
-        self.assertEqual(val, foo.numpy())
-        val += 1
-
-
-class IteratorCheckpointingTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testSaveRestoreOneShotIterator(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6]).map(
-        math_ops.square).batch(2)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
-    self.assertAllEqual([1, 4], get_next())
-    save_path = checkpoint.save(checkpoint_prefix)
-    self.assertAllEqual([9, 16], get_next())
-    self.assertAllEqual([25, 36], get_next())
-    checkpoint.restore(save_path).run_restore_ops()
-    self.assertAllEqual([9, 16], get_next())
-    self.assertAllEqual([25, 36], get_next())
-    with self.assertRaises(errors.OutOfRangeError):
-      get_next()
-
-  @test_util.run_in_graph_and_eager_modes
-  def testSaveRestoreMultipleIterator(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    dataset = dataset_ops.Dataset.from_tensor_slices(
-        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
-    dataset = dataset.map(math_ops.square).batch(2)
-    iterator_1 = dataset.make_one_shot_iterator()
-    get_next_1 = iterator_1.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator_1.get_next())
-    iterator_2 = dataset.make_one_shot_iterator()
-    get_next_2 = iterator_2.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator_2.get_next())
-    dataset_2 = dataset_ops.Dataset.range(10)
-    iterator_3 = dataset_2.make_one_shot_iterator()
-    get_next_3 = iterator_3.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator_3.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(
-        iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
-    self.assertAllEqual([1, 4], get_next_1())
-    self.assertAllEqual(0, get_next_3())
-    self.assertAllEqual(1, get_next_3())
-    self.assertAllEqual(2, get_next_3())
-    save_path = checkpoint.save(checkpoint_prefix)
-    self.assertAllEqual([1, 4], get_next_2())
-    self.assertAllEqual([9, 16], get_next_2())
-    self.assertAllEqual(3, get_next_3())
-    checkpoint.restore(save_path).run_restore_ops()
-    self.assertAllEqual([9, 16], get_next_1())
-    self.assertAllEqual([1, 4], get_next_2())
-    self.assertAllEqual(3, get_next_3())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testRestoreExhaustedIterator(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    dataset = dataset_ops.Dataset.range(3)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
-    self.assertAllEqual(0, get_next())
-    self.assertAllEqual(1, get_next())
-    save_path = checkpoint.save(checkpoint_prefix)
-    self.assertAllEqual(2, get_next())
-    checkpoint.restore(save_path).run_restore_ops()
-    self.assertAllEqual(2, get_next())
-    save_path = checkpoint.save(checkpoint_prefix)
-    checkpoint.restore(save_path).run_restore_ops()
-    with self.assertRaises(errors.OutOfRangeError):
-      get_next()
-
-  def testRestoreInReconstructedIteratorInitializable(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    dataset = dataset_ops.Dataset.range(10)
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
-    for i in range(5):
-      with self.cached_session() as sess:
-        checkpoint.restore(checkpoint_management.latest_checkpoint(
-            checkpoint_directory)).initialize_or_restore(sess)
-        for j in range(2):
-          self.assertEqual(i * 2 + j, sess.run(get_next))
-        checkpoint.save(file_prefix=checkpoint_prefix)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/iterator_test.py b/tensorflow/python/data/kernel_tests/iterator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..916cf8bb45ce7dbf55261d3f67ca17c0cdbb10fd
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/iterator_test.py
@@ -0,0 +1,881 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Iterator`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import warnings
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.compat import compat as forward_compat
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.data.util import structure
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import io_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import compat
+
+
+class IteratorTest(test.TestCase, parameterized.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testNoGradients(self):
+    component = constant_op.constant([1.])
+    side = constant_op.constant(0.)
+    add = lambda x: x + side
+    dataset = dataset_ops.Dataset.from_tensor_slices(component).map(add)
+    value = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    self.assertIsNone(gradients_impl.gradients(value, component)[0])
+    self.assertIsNone(gradients_impl.gradients(value, side)[0])
+    self.assertIsNone(gradients_impl.gradients(value, [component, side])[0])
+
+  @test_util.run_deprecated_v1
+  def testCapturingStateInOneShotRaisesException(self):
+    var = variables.Variable(37.0, name="myvar")
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices([0.0, 1.0, 2.0])
+        .map(lambda x: x + var))
+    with self.assertRaisesRegexp(
+        ValueError, r"`Dataset.make_one_shot_iterator\(\)` does not support "
+        "datasets that capture stateful objects.+myvar"):
+      dataset_ops.make_one_shot_iterator(dataset)
+
+  @test_util.run_deprecated_v1
+  def testOneShotIterator(self):
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(14))
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.cached_session() as sess:
+      for _ in range(14):
+        for i in range(7):
+          result = sess.run(get_next)
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testOneShotIteratorCaptureByValue(self):
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+    tensor_components = tuple([ops.convert_to_tensor(c) for c in components])
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.from_tensor_slices(tensor_components)
+        .map(_map_fn).repeat(14))
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.cached_session() as sess:
+      for _ in range(14):
+        for i in range(7):
+          result = sess.run(get_next)
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testOneShotIteratorInsideContainer(self):
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+
+    def within_container():
+
+      def _map_fn(x, y, z):
+        return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+      iterator = dataset_ops.make_one_shot_iterator(
+          dataset_ops.Dataset.from_tensor_slices(components)
+          .map(_map_fn).repeat(14))
+      return iterator.get_next()
+
+    server = server_lib.Server.create_local_server()
+
+    # Create two iterators within unique containers, and run them to
+    # make sure that the resources aren't shared.
+    #
+    # The test below would fail if cname were the same across both
+    # sessions.
+    for j in range(2):
+      with session.Session(server.target) as sess:
+        cname = "iteration%d" % j
+        with ops.container(cname):
+          get_next = within_container()
+
+        for _ in range(14):
+          for i in range(7):
+            result = sess.run(get_next)
+            for component, result_component in zip(components, result):
+              self.assertAllEqual(component[i]**2, result_component)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testOneShotIteratorNonBlocking(self):
+    dataset = dataset_ops.Dataset.from_tensors([1, 2, 3]).map(lambda x: x * x)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    next_element = iterator.get_next()
+
+    # Create a session with a single thread to ensure that the
+    # one-shot iterator initializer does not deadlock.
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=1, use_per_session_threads=True)
+    with session.Session(config=config) as sess:
+      self.assertAllEqual([1, 4, 9], sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+    # Test with multiple threads invoking the one-shot iterator concurrently.
+    with session.Session(config=config) as sess:
+      results = []
+
+      def consumer_thread():
+        try:
+          results.append(sess.run(next_element))
+        except errors.OutOfRangeError:
+          results.append(None)
+
+      num_threads = 8
+      threads = [
+          self.checkedThread(consumer_thread) for _ in range(num_threads)
+      ]
+      for t in threads:
+        t.start()
+      for t in threads:
+        t.join()
+
+      self.assertEqual(num_threads, len(results))
+      self.assertEqual(num_threads - 1,
+                       len([None for r in results if r is None]))
+      self.assertAllEqual([[1, 4, 9]], [r for r in results if r is not None])
+
+  @test_util.run_deprecated_v1
+  def testOneShotIteratorInitializerFails(self):
+    # Define a dataset whose initialization will always fail.
+    dataset = dataset_ops.Dataset.from_tensors(
+        array_ops.check_numerics(
+            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
+        sess.run(next_element)
+
+      # Test that subsequent attempts to use the iterator also fail.
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
+        sess.run(next_element)
+
+    with self.cached_session() as sess:
+
+      def consumer_thread():
+        with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
+          sess.run(next_element)
+
+      num_threads = 8
+      threads = [
+          self.checkedThread(consumer_thread) for _ in range(num_threads)
+      ]
+      for t in threads:
+        t.start()
+      for t in threads:
+        t.join()
+
+  def testSimpleSharedResource(self):
+    components = (np.array(1, dtype=np.int64),
+                  np.array([1, 2, 3], dtype=np.int64),
+                  np.array(37.0, dtype=np.float64))
+
+    server = server_lib.Server.create_local_server()
+
+    # Create two non-overlapping sessions that share the same iterator
+    # resource on the same server, and verify that an action of the
+    # first session (initializing the iterator) is visible in the
+    # second session.
+    with ops.Graph().as_default():
+      iterator = (
+          dataset_ops.Dataset.from_tensors(components)
+          .map(lambda x, y, z: (x, y, z)).make_initializable_iterator(
+              shared_name="shared_iterator"))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+
+      with session.Session(server.target) as sess:
+        sess.run(init_op)
+        results = sess.run(get_next)
+        for component, result_component in zip(components, results):
+          self.assertAllEqual(component, result_component)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+        # Re-initialize the iterator in the first session.
+        sess.run(init_op)
+
+    with ops.Graph().as_default():
+      # Re-define the iterator manually, without defining any of the
+      # functions in this graph, to ensure that we are not
+      # accidentally redefining functions with the same names in the
+      # new graph.
+      iterator = iterator_ops.Iterator.from_structure(
+          shared_name="shared_iterator",
+          output_types=(dtypes.int64, dtypes.int64, dtypes.float64),
+          output_shapes=([], [3], []))
+      get_next = iterator.get_next()
+
+      with session.Session(server.target) as sess:
+        # Use the iterator without re-initializing in the second session.
+        results = sess.run(get_next)
+        for component, result_component in zip(components, results):
+          self.assertAllEqual(component, result_component)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testNotInitializedError(self):
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(components))
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      with self.assertRaisesRegexp(errors.FailedPreconditionError,
+                                   "iterator has not been initialized"):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testReinitializableIterator(self):
+    dataset_3 = dataset_ops.Dataset.from_tensors(
+        constant_op.constant([1, 2, 3]))
+    dataset_4 = dataset_ops.Dataset.from_tensors(
+        constant_op.constant([4, 5, 6, 7]))
+    iterator = iterator_ops.Iterator.from_structure(dataset_3.output_types,
+                                                    [None])
+
+    dataset_3_init_op = iterator.make_initializer(dataset_3)
+    dataset_4_init_op = iterator.make_initializer(dataset_4)
+    get_next = iterator.get_next()
+
+    self.assertEqual(dataset_3.output_types, iterator.output_types)
+    self.assertEqual(dataset_4.output_types, iterator.output_types)
+    self.assertEqual([None], iterator.output_shapes.as_list())
+
+    with self.cached_session() as sess:
+      # The iterator is initially uninitialized.
+      with self.assertRaises(errors.FailedPreconditionError):
+        sess.run(get_next)
+
+      # Initialize with one dataset.
+      sess.run(dataset_3_init_op)
+      self.assertAllEqual([1, 2, 3], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Initialize with a different dataset.
+      sess.run(dataset_4_init_op)
+      self.assertAllEqual([4, 5, 6, 7], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Reinitialize with the first dataset.
+      sess.run(dataset_3_init_op)
+      self.assertAllEqual([1, 2, 3], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @test_util.run_deprecated_v1
+  def testReinitializableIteratorWithFunctions(self):
+
+    def g():
+      for i in range(10):
+        yield i
+
+    iterator = iterator_ops.Iterator.from_structure(dtypes.int64, [])
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      dataset_1 = dataset_ops.Dataset.from_generator(
+          g, output_types=dtypes.int64)
+      sess.run(iterator.make_initializer(dataset_1))
+      for expected in range(10):
+        self.assertEqual(expected, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      dataset_2 = dataset_ops.Dataset.from_generator(
+          g, output_types=dtypes.int64)
+      sess.run(iterator.make_initializer(dataset_2))
+      for expected in range(10):
+        self.assertEqual(expected, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testReinitializableIteratorStaticErrors(self):
+    # Non-matching structure for types and shapes.
+    with self.assertRaises(TypeError):
+      iterator = iterator_ops.Iterator.from_structure(
+          (dtypes.int64, dtypes.float64), [None])
+
+    # Test validation of dataset argument.
+    iterator = iterator_ops.Iterator.from_structure((dtypes.int64,
+                                                     dtypes.float64))
+
+    # Incompatible structure.
+    with self.assertRaises(ValueError):
+      iterator.make_initializer(
+          dataset_ops.Dataset.from_tensors(((constant_op.constant(
+              [1, 2, 3], dtype=dtypes.int64),), (constant_op.constant(
+                  [4., 5., 6., 7.], dtype=dtypes.float64),))))
+
+    # Incompatible types.
+    with self.assertRaises(TypeError):
+      iterator.make_initializer(
+          dataset_ops.Dataset.from_tensors(
+              (constant_op.constant([1, 2, 3], dtype=dtypes.int32),
+               constant_op.constant([4., 5., 6., 7.], dtype=dtypes.float32))))
+
+    # Incompatible shapes.
+    iterator = iterator_ops.Iterator.from_structure(
+        (dtypes.int64, dtypes.float64), ([None], []))
+    with self.assertRaises(TypeError):
+      iterator.make_initializer(
+          dataset_ops.Dataset.from_tensors(
+              (constant_op.constant([1, 2, 3], dtype=dtypes.int64),
+               constant_op.constant([4., 5., 6., 7.], dtype=dtypes.float64))))
+
+  @test_util.run_deprecated_v1
+  def testIteratorStringHandle(self):
+    dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
+    dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
+
+    iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+    iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)
+
+    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+    feedable_iterator = iterator_ops.Iterator.from_string_handle(
+        handle_placeholder, dataset_3.output_types, dataset_3.output_shapes)
+    next_element = feedable_iterator.get_next()
+
+    self.assertEqual(dataset_3.output_types, feedable_iterator.output_types)
+    self.assertEqual(dataset_4.output_types, feedable_iterator.output_types)
+    self.assertEqual([], feedable_iterator.output_shapes)
+
+    with self.cached_session() as sess:
+      iterator_3_handle = sess.run(iterator_3.string_handle())
+      iterator_4_handle = sess.run(iterator_4.string_handle())
+
+      self.assertEqual(10,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(1,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_3_handle}))
+      self.assertEqual(20,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(2,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_3_handle}))
+      self.assertEqual(30,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_4_handle}))
+      self.assertEqual(3,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_3_handle}))
+      self.assertEqual(40,
+                       sess.run(
+                           next_element,
+                           feed_dict={handle_placeholder: iterator_4_handle}))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(
+            next_element, feed_dict={handle_placeholder: iterator_3_handle})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(
+            next_element, feed_dict={handle_placeholder: iterator_4_handle})
+
+  @test_util.run_deprecated_v1
+  def testIteratorStringHandleFuture(self):
+    with forward_compat.forward_compatibility_horizon(2018, 8, 4):
+      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
+      dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
+
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+      iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)
+
+      handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+      feedable_iterator = iterator_ops.Iterator.from_string_handle(
+          handle_placeholder, dataset_3.output_types, dataset_3.output_shapes)
+      next_element = feedable_iterator.get_next()
+
+      self.assertEqual(dataset_3.output_types, feedable_iterator.output_types)
+      self.assertEqual(dataset_4.output_types, feedable_iterator.output_types)
+      self.assertEqual([], feedable_iterator.output_shapes)
+
+      with self.cached_session() as sess:
+        iterator_3_handle = sess.run(iterator_3.string_handle())
+        iterator_4_handle = sess.run(iterator_4.string_handle())
+
+        self.assertEqual(
+            10,
+            sess.run(
+                next_element,
+                feed_dict={handle_placeholder: iterator_4_handle}))
+        self.assertEqual(
+            1,
+            sess.run(
+                next_element,
+                feed_dict={handle_placeholder: iterator_3_handle}))
+        self.assertEqual(
+            20,
+            sess.run(
+                next_element,
+                feed_dict={handle_placeholder: iterator_4_handle}))
+        self.assertEqual(
+            2,
+            sess.run(
+                next_element,
+                feed_dict={handle_placeholder: iterator_3_handle}))
+        self.assertEqual(
+            30,
+            sess.run(
+                next_element,
+                feed_dict={handle_placeholder: iterator_4_handle}))
+        self.assertEqual(
+            3,
+            sess.run(
+                next_element,
+                feed_dict={handle_placeholder: iterator_3_handle}))
+        self.assertEqual(
+            40,
+            sess.run(
+                next_element,
+                feed_dict={handle_placeholder: iterator_4_handle}))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(
+              next_element, feed_dict={handle_placeholder: iterator_3_handle})
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(
+              next_element, feed_dict={handle_placeholder: iterator_4_handle})
+
+  @test_util.run_deprecated_v1
+  def testIteratorStringHandleReuseTensorObject(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
+    one_shot_iterator = dataset_ops.make_one_shot_iterator(dataset)
+    initializable_iterator = dataset_ops.make_initializable_iterator(dataset)
+    structure_iterator = iterator_ops.Iterator.from_structure(
+        dataset.output_types)
+
+    created_ops = len(ops.get_default_graph().get_operations())
+
+    self.assertIs(one_shot_iterator.string_handle(),
+                  one_shot_iterator.string_handle())
+    self.assertIs(initializable_iterator.string_handle(),
+                  initializable_iterator.string_handle())
+    self.assertIs(structure_iterator.string_handle(),
+                  structure_iterator.string_handle())
+
+    # Assert that getting the (default) string handle creates no ops.
+    self.assertEqual(created_ops, len(ops.get_default_graph().get_operations()))
+
+    # Specifying an explicit name will create a new op.
+    handle_with_name = one_shot_iterator.string_handle(name="foo")
+    self.assertEqual("foo", handle_with_name.op.name)
+    self.assertIsNot(one_shot_iterator.string_handle(), handle_with_name)
+
+    handle_with_same_name = one_shot_iterator.string_handle(name="foo")
+    self.assertEqual("foo_1", handle_with_same_name.op.name)
+    self.assertIsNot(handle_with_name, handle_with_same_name)
+
+  @test_util.run_deprecated_v1
+  def testIteratorStringHandleError(self):
+    dataset_int_scalar = (
+        dataset_ops.Dataset.from_tensor_slices([1, 2, 3]).repeat())
+    dataset_float_vector = (dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0]))
+
+    handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+
+    feedable_int_scalar = iterator_ops.Iterator.from_string_handle(
+        handle_placeholder, dtypes.int32, [])
+    feedable_int_vector = iterator_ops.Iterator.from_string_handle(
+        handle_placeholder, dtypes.int32, [None])
+    feedable_int_any = iterator_ops.Iterator.from_string_handle(
+        handle_placeholder, dtypes.int32)
+
+    with self.cached_session() as sess:
+      handle_int_scalar = sess.run(dataset_ops.make_one_shot_iterator(
+          dataset_int_scalar).string_handle())
+      handle_float_vector = sess.run(dataset_ops.make_one_shot_iterator(
+          dataset_float_vector).string_handle())
+
+      self.assertEqual(1,
+                       sess.run(
+                           feedable_int_scalar.get_next(),
+                           feed_dict={handle_placeholder: handle_int_scalar}))
+
+      self.assertEqual(2,
+                       sess.run(
+                           feedable_int_any.get_next(),
+                           feed_dict={handle_placeholder: handle_int_scalar}))
+
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(
+            feedable_int_vector.get_next(),
+            feed_dict={handle_placeholder: handle_int_scalar}))
+
+      with self.assertRaises(errors.InvalidArgumentError):
+        print(sess.run(
+            feedable_int_vector.get_next(),
+            feed_dict={handle_placeholder: handle_float_vector}))
+
+  @test_util.run_deprecated_v1
+  def testRemoteIteratorUsingRemoteCallOpDirectSession(self):
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 3
+
+    with ops.device("/job:localhost/replica:0/task:0/cpu:1"):
+      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+      iterator_3_handle = iterator_3.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _remote_fn(h):
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          h, dataset_3.output_types, dataset_3.output_shapes)
+      return remote_iterator.get_next()
+
+    with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
+      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+      remote_op = functional_ops.remote_call(
+          args=[iterator_3_handle],
+          Tout=[dtypes.int32],
+          f=_remote_fn,
+          target=target_placeholder)
+
+    with self.session(config=worker_config) as sess:
+      elem = sess.run(
+          remote_op,
+          feed_dict={
+              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
+          })
+      self.assertEqual(elem, [1])
+      # Fails when target is cpu:2 where the resource is not located.
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(
+            remote_op,
+            feed_dict={
+                target_placeholder: "/job:localhost/replica:0/task:0/cpu:2"
+            })
+      elem = sess.run(
+          remote_op,
+          feed_dict={
+              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
+          })
+      self.assertEqual(elem, [2])
+      elem = sess.run(
+          remote_op,
+          feed_dict={
+              target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
+          })
+      self.assertEqual(elem, [3])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(
+            remote_op,
+            feed_dict={
+                target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
+            })
+
+  @test_util.run_deprecated_v1
+  def testRemoteIteratorUsingRemoteCallOpMultiWorkers(self):
+    s1 = server_lib.Server.create_local_server()
+    s2 = server_lib.Server.create_local_server()
+    s3 = server_lib.Server.create_local_server()
+
+    cluster_def = cluster_pb2.ClusterDef()
+    workers = cluster_def.job.add()
+    workers.name = "worker"
+    workers.tasks[0] = s1.target[len("grpc://"):]
+    workers.tasks[1] = s2.target[len("grpc://"):]
+    client = cluster_def.job.add()
+    client.name = "client"
+    client.tasks[0] = s3.target[len("grpc://"):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    worker_devices = [
+        "/job:worker/replica:0/task:%d/cpu:0" % i for i in range(2)
+    ]
+    itr_handles = []
+    for device in worker_devices:
+      with ops.device(device):
+        src = dataset_ops.Dataset.from_tensor_slices([device])
+        itr = dataset_ops.make_one_shot_iterator(src)
+        itr_handles.append(itr.string_handle())
+
+    targets = dataset_ops.Dataset.from_tensor_slices(worker_devices)
+    handles = dataset_ops.Dataset.from_tensor_slices(itr_handles)
+
+    @function.Defun(dtypes.string)
+    def loading_func(h):
+      remote_itr = iterator_ops.Iterator.from_string_handle(
+          h, itr.output_types, itr.output_shapes)
+      return remote_itr.get_next()
+
+    def map_fn(target, handle):
+      return functional_ops.remote_call(
+          args=[handle], Tout=[dtypes.string], f=loading_func, target=target)
+
+    with ops.device("/job:client"):
+      client_dataset = dataset_ops.Dataset.zip((targets, handles)).map(map_fn)
+      itr = dataset_ops.make_initializable_iterator(client_dataset)
+      n = itr.get_next()
+
+    with session.Session(s3.target, config=config) as sess:
+      sess.run(itr.initializer)
+      expected_values = worker_devices
+      for expected in expected_values:
+        self.assertEqual((compat.as_bytes(expected),), sess.run(n))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(n)
+
+  def testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
+      dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+      iterator_3_handle = iterator_3.string_handle()
+
+    def _encode_raw(byte_array):
+      return bytes(bytearray(byte_array))
+
+    @function.Defun(dtypes.uint8)
+    def _remote_fn(h):
+      handle = script_ops.py_func(_encode_raw, [h], dtypes.string)
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, dataset_3.output_types, dataset_3.output_shapes)
+      return remote_iterator.get_next()
+
+    with ops.device("/job:localhost/replica:0/task:0/device:GPU:0"):
+      target_placeholder = array_ops.placeholder(dtypes.string, shape=[])
+      iterator_3_handle_uint8 = parsing_ops.decode_raw(
+          bytes=iterator_3_handle, out_type=dtypes.uint8)
+      remote_op = functional_ops.remote_call(
+          args=[iterator_3_handle_uint8],
+          Tout=[dtypes.int32],
+          f=_remote_fn,
+          target=target_placeholder)
+
+    with self.cached_session() as sess:
+      elem = sess.run(
+          remote_op,
+          feed_dict={
+              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
+          })
+      self.assertEqual(elem, [1])
+      elem = sess.run(
+          remote_op,
+          feed_dict={
+              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
+          })
+      self.assertEqual(elem, [2])
+      elem = sess.run(
+          remote_op,
+          feed_dict={
+              target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
+          })
+      self.assertEqual(elem, [3])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(
+            remote_op,
+            feed_dict={
+                target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
+            })
+
+  @test_util.run_deprecated_v1
+  def testIncorrectIteratorRestore(self):
+
+    def _path():
+      return os.path.join(self.get_temp_dir(), "iterator")
+
+    def _save_op(iterator_resource):
+      iterator_state_variant = gen_dataset_ops.serialize_iterator(
+          iterator_resource)
+      save_op = io_ops.write_file(
+          _path(), parsing_ops.serialize_tensor(iterator_state_variant))
+      return save_op
+
+    def _restore_op(iterator_resource):
+      iterator_state_variant = parsing_ops.parse_tensor(
+          io_ops.read_file(_path()), dtypes.variant)
+      restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
+                                                        iterator_state_variant)
+      return restore_op
+
+    def _build_range_dataset_graph():
+      start = 1
+      stop = 10
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      save_op = _save_op(iterator._iterator_resource)
+      restore_op = _restore_op(iterator._iterator_resource)
+      return init_op, get_next, save_op, restore_op
+
+    def _build_reader_dataset_graph():
+      filenames = ["test"]  # Does not exist but we don't care in this test.
+      iterator = dataset_ops.make_initializable_iterator(
+          readers.FixedLengthRecordDataset(filenames, 1, 0, 0))
+      init_op = iterator.initializer
+      get_next_op = iterator.get_next()
+      save_op = _save_op(iterator._iterator_resource)
+      restore_op = _restore_op(iterator._iterator_resource)
+      return init_op, get_next_op, save_op, restore_op
+
+    # Saving iterator for RangeDataset graph.
+    with ops.Graph().as_default() as g:
+      init_op, _, save_op, _ = _build_range_dataset_graph()
+      with self.session(graph=g) as sess:
+        sess.run(init_op)
+        sess.run(save_op)
+
+    # Attempt to restore the saved iterator into an IteratorResource of
+    # incompatible type. An iterator of RangeDataset has output type int64,
+    # while an iterator of FixedLengthRecordDataset has output type string.
+    # So an InvalidArgumentError should be raised by
+    # IteratorResource::set_iterator.
+    with ops.Graph().as_default() as g:
+      _, _, _, restore_op = _build_reader_dataset_graph()
+      with self.session(graph=g) as sess:
+        with self.assertRaises(errors.InvalidArgumentError):
+          sess.run(restore_op)
+
+  @test_util.run_deprecated_v1
+  def testRepeatedGetNextWarning(self):
+    iterator = dataset_ops.make_one_shot_iterator(dataset_ops.Dataset.range(10))
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      for _ in range(100):
+        iterator.get_next()
+    self.assertEqual(100 - iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD, len(w))
+    for warning in w:
+      self.assertIn(
+          iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE, str(warning.message))
+
+  def testEagerIteratorAsync(self):
+    with context.eager_mode(), context.execution_mode(context.ASYNC):
+      val = 0
+      dataset = dataset_ops.Dataset.range(10)
+      for foo in dataset:
+        self.assertEqual(val, foo.numpy())
+        val += 1
+
+  # pylint: disable=g-long-lambda
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant(37.0),
+       structure.TensorStructure(dtypes.float32, []),
+       ops.Tensor, dtypes.float32, []),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
+          indices=[[0]], values=constant_op.constant([0], dtype=dtypes.int32),
+          dense_shape=[1]),
+       structure.SparseTensorStructure(dtypes.int32, [1]),
+       sparse_tensor.SparseTensor, dtypes.int32, [1]),
+      ("Nest", lambda: {
+          "a": constant_op.constant(37.0),
+          "b": (constant_op.constant(["Foo"]), constant_op.constant("Bar"))},
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, []),
+           "b": (structure.TensorStructure(dtypes.string, [1]),
+                 structure.TensorStructure(dtypes.string, []))}),
+       {"a": ops.Tensor, "b": (ops.Tensor, ops.Tensor)},
+       {"a": dtypes.float32, "b": (dtypes.string, dtypes.string)},
+       {"a": [], "b": ([1], [])}),
+  )
+  def testIteratorStructure(self, tf_value_fn, expected_element_structure,
+                            expected_output_classes, expected_output_types,
+                            expected_output_shapes):
+    tf_value = tf_value_fn()
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.from_tensors(tf_value))
+
+    self.assertTrue(expected_element_structure.is_compatible_with(
+        iterator._element_structure))
+    self.assertTrue(iterator._element_structure.is_compatible_with(
+        expected_element_structure))
+
+    self.assertEqual(expected_output_classes, iterator.output_classes)
+    self.assertEqual(expected_output_types, iterator.output_types)
+    self.assertEqual(expected_output_shapes, iterator.output_shapes)
+
+  def testIteratorGetNextName(self):
+    with ops.Graph().as_default():
+      iterator = dataset_ops.make_one_shot_iterator(
+          dataset_ops.Dataset.from_tensors(37.0))
+      next_element = iterator.get_next(name="overridden_name")
+      self.assertEqual("overridden_name", next_element.op.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
deleted file mode 100644
index 8eb13815d4a5da63386caa7f7519a76f8d9981c2..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
+++ /dev/null
@@ -1,243 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from os import path
-import shutil
-import tempfile
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-class ListFilesDatasetOpTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    self.tmp_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
-  def _touchTempFiles(self, filenames):
-    for filename in filenames:
-      open(path.join(self.tmp_dir, filename), 'a').close()
-
-  def testEmptyDirectory(self):
-    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      next_element = itr.get_next()
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testSimpleDirectory(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      next_element = itr.get_next()
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testSimpleDirectoryNotShuffled(self):
-    filenames = ['b', 'c', 'a']
-    self._touchTempFiles(filenames)
-
-    dataset = dataset_ops.Dataset.list_files(
-        path.join(self.tmp_dir, '*'), shuffle=False)
-    with self.cached_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      next_element = itr.get_next()
-
-      for filename in sorted(filenames):
-        self.assertEqual(compat.as_bytes(path.join(self.tmp_dir, filename)),
-                         sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFixedSeedResultsInRepeatableOrder(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    dataset = dataset_ops.Dataset.list_files(
-        path.join(self.tmp_dir, '*'), shuffle=True, seed=37)
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      next_element = itr.get_next()
-
-      full_filenames = [compat.as_bytes(path.join(self.tmp_dir, filename))
-                        for filename in filenames]
-
-      all_produced_filenames = []
-      for _ in range(3):
-        produced_filenames = []
-        sess.run(itr.initializer)
-        try:
-          while True:
-            produced_filenames.append(sess.run(next_element))
-        except errors.OutOfRangeError:
-          pass
-        all_produced_filenames.append(produced_filenames)
-
-      # Each run should produce the same set of filenames, which may be
-      # different from the order of `full_filenames`.
-      self.assertItemsEqual(full_filenames, all_produced_filenames[0])
-      # However, the different runs should produce filenames in the same order
-      # as each other.
-      self.assertEqual(all_produced_filenames[0], all_produced_filenames[1])
-      self.assertEqual(all_produced_filenames[0], all_produced_filenames[2])
-
-  def testEmptyDirectoryInitializer(self):
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError, 'No files matched pattern: '):
-        sess.run(
-            itr.initializer,
-            feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
-
-  def testSimpleDirectoryInitializer(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      next_element = itr.get_next()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFileSuffixes(self):
-    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      next_element = itr.get_next()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[1:-1]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFileMiddles(self):
-    filenames = ['a.txt', 'b.py', 'c.pyc']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      next_element = itr.get_next()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py*')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[1:]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testNoShuffle(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    # Repeat the list twice and ensure that the order is the same each time.
-    # NOTE(mrry): This depends on an implementation detail of `list_files()`,
-    # which is that the list of files is captured when the iterator is
-    # initialized. Otherwise, or if e.g. the iterator were initialized more than
-    # once, it's possible that the non-determinism of `tf.matching_files()`
-    # would cause this test to fail. However, it serves as a useful confirmation
-    # that the `shuffle=False` argument is working as intended.
-    # TODO(b/73959787): Provide some ordering guarantees so that this test is
-    # more meaningful.
-    dataset = dataset_ops.Dataset.list_files(
-        path.join(self.tmp_dir, '*'), shuffle=False).repeat(2)
-    with self.cached_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      next_element = itr.get_next()
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames * 2:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-      self.assertItemsEqual(full_filenames, produced_filenames)
-      self.assertEqual(produced_filenames[:len(filenames)],
-                       produced_filenames[len(filenames):])
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/list_files_test.py b/tensorflow/python/data/kernel_tests/list_files_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a70c4b081d5c710082eb485a1dbb6179a90da2ce
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/list_files_test.py
@@ -0,0 +1,217 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.list_files()`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+import shutil
+import tempfile
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ListFilesTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    self.tmp_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+  def _touchTempFiles(self, filenames):
+    for filename in filenames:
+      open(path.join(self.tmp_dir, filename), 'a').close()
+
+  # Note: eager mode fails in assertion error same as initializer in graph mode.
+  @test_util.run_deprecated_v1
+  def testSkipEagerEmptyDirectory(self):
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    self.assertDatasetProduces(dataset, expected_output=[])
+
+  def testSimpleDirectory(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames
+        ],
+        assert_items_equal=True)
+
+  def testSimpleDirectoryNotShuffled(self):
+    filenames = ['b', 'c', 'a']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=False)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in sorted(filenames)
+        ])
+
+  def testFixedSeedResultsInRepeatableOrder(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=True, seed=37)
+
+    expected_filenames = [
+        compat.as_bytes(path.join(self.tmp_dir, filename))
+        for filename in filenames
+    ]
+
+    all_actual_filenames = []
+    for _ in range(3):
+      actual_filenames = []
+      next_element = self.getNext(dataset, requires_initialization=True)
+      try:
+        while True:
+          actual_filenames.append(self.evaluate(next_element()))
+      except errors.OutOfRangeError:
+        pass
+      all_actual_filenames.append(actual_filenames)
+
+    # Each run should produce the same set of filenames, which may be
+    # different from the order of `expected_filenames`.
+    self.assertItemsEqual(expected_filenames, all_actual_filenames[0])
+    # However, the different runs should produce filenames in the same order
+    # as each other.
+    self.assertEqual(all_actual_filenames[0], all_actual_filenames[1])
+    self.assertEqual(all_actual_filenames[0], all_actual_filenames[2])
+
+  # TODO(b/117581999): eager mode assertion fail wrapped, debug.
+  def tesSkipEagerEmptyDirectoryInitializer(self):
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(errors.InvalidArgumentError,
+                        'No files matched pattern'),
+        requires_initialization=True)
+
+  def testSimpleDirectoryInitializer(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames
+        ],
+        assert_items_equal=True)
+
+  def testFileSuffixes(self):
+    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*.py'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames[1:-1]
+        ],
+        assert_items_equal=True)
+
+  def testFileMiddles(self):
+    filenames = ['a.txt', 'b.py', 'c.pyc']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*.py*'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames[1:]
+        ],
+        assert_items_equal=True)
+
+  def testNoShuffle(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    # Repeat the list twice and ensure that the order is the same each time.
+    # NOTE(mrry): This depends on an implementation detail of `list_files()`,
+    # which is that the list of files is captured when the iterator is
+    # initialized. Otherwise, or if e.g. the iterator were initialized more than
+    # once, it's possible that the non-determinism of `tf.matching_files()`
+    # would cause this test to fail. However, it serves as a useful confirmation
+    # that the `shuffle=False` argument is working as intended.
+    # TODO(b/73959787): Provide some ordering guarantees so that this test is
+    # more meaningful.
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=False).repeat(2)
+    next_element = self.getNext(dataset)
+
+    expected_filenames = []
+    actual_filenames = []
+    for filename in filenames * 2:
+      expected_filenames.append(
+          compat.as_bytes(path.join(self.tmp_dir, filename)))
+      actual_filenames.append(compat.as_bytes(self.evaluate(next_element())))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self.assertItemsEqual(expected_filenames, actual_filenames)
+    self.assertEqual(actual_filenames[:len(filenames)],
+                     actual_filenames[len(filenames):])
+
+  def testMultiplePatternsAsList(self):
+    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
+    self._touchTempFiles(filenames)
+
+    patterns = [path.join(self.tmp_dir, pat) for pat in ['*.py', '*.txt']]
+    dataset = dataset_ops.Dataset.list_files(patterns)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames[:-1]
+        ],
+        assert_items_equal=True)
+
+  def testMultiplePatternsAsTensor(self):
+    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        [path.join(self.tmp_dir, pat) for pat in ['*.py', '*.txt']])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames[:-1]
+        ],
+        assert_items_equal=True)
+
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
deleted file mode 100644
index 0c372ebb10b4cec2cdc9309eb77304b108200950..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ /dev/null
@@ -1,863 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for `tf.data.Dataset.map()`."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import namedtuple
-import threading
-import time
-import warnings
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import test
-
-
-class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  def _buildMapDataset(self, components, count):
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-    return (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-            .repeat(count))
-
-  def testMapDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(count).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-
-    dataset = self._buildMapDataset(components, count)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      # Test single-threaded access to the iterator.
-      sess.run(init_op, feed_dict={count: 14})
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test multi-threaded access to the same iterator.
-      sess.run(init_op, feed_dict={count: 18})
-      results = []
-      def iterator_thread():
-        while True:
-          try:
-            results.append(sess.run(get_next))
-          except errors.OutOfRangeError:
-            return
-      threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
-      for t in threads:
-        t.start()
-      for t in threads:
-        t.join()
-
-      # `results` will contain the same elements components**2
-      # repeated 18 times, but in a non-deterministic order. Sort the
-      # results, and assert that each element of components**2 is
-      # produced 18 times.
-      results.sort(key=lambda x: x[0])
-      for i in range(7):
-        for j in range(18):
-          for component, result_component in zip(components,
-                                                 results[i * 18 + j]):
-            self.assertAllEqual(component[i]**2, result_component)
-
-  def _buildParallelMapDataset(self, components, count, num_parallel_calls,
-                               output_buffer_size):
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-    return (dataset_ops.Dataset.from_tensor_slices(components)
-            .map(_map_fn, num_parallel_calls=num_parallel_calls)
-            .prefetch(output_buffer_size)
-            .repeat(count))
-
-  def testParallelMapDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset -> ParallelMapDataset(square_3) ->
-    # RepeatDataset(count).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    num_parallel_calls = array_ops.placeholder(dtypes.int32, shape=[])
-    output_buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    dataset = self._buildParallelMapDataset(
-        components, count, num_parallel_calls, output_buffer_size)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-
-      def do_test(num_parallel_calls_val, output_buffer_size_val):
-        # Test single-threaded access to the iterator.
-        sess.run(init_op, feed_dict={
-            count: 14,
-            num_parallel_calls: num_parallel_calls_val,
-            output_buffer_size: output_buffer_size_val})
-        for _ in range(14):
-          for i in range(7):
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-        # Test multi-threaded access to the same iterator.
-        sess.run(init_op, feed_dict={
-            count: 18,
-            num_parallel_calls: num_parallel_calls_val,
-            output_buffer_size: output_buffer_size_val})
-        results = []
-        def iterator_thread():
-          while True:
-            try:
-              results.append(sess.run(get_next))
-            except errors.OutOfRangeError:
-              return
-        threads = [self.checkedThread(target=iterator_thread)
-                   for _ in range(64)]
-        for t in threads:
-          t.start()
-        for t in threads:
-          t.join()
-
-        # `results` will contain the same elements components**2
-        # repeated 18 times, but in a non-deterministic order. Sort the
-        # results, and assert that each element of components**2 is
-        # produced 18 times.
-        results.sort(key=lambda x: x[0])
-        for i in range(7):
-          for j in range(18):
-            for component, result_component in zip(components,
-                                                   results[i * 18 + j]):
-              self.assertAllEqual(component[i]**2, result_component)
-
-      for num_parallel_calls_val, output_buffer_size_val in [
-          (1, 1), (1, 2), (2, 2), (2, 4), (8, 8), (8, 16)]:
-        do_test(num_parallel_calls_val, output_buffer_size_val)
-
-  def testImplicitDisposeParallelMapDataset(self):
-    # Tests whether a parallel map dataset will be cleaned up correctly when
-    # the pipeline does not run it until exhaustion.
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(1000).
-    components = (np.arange(1000),
-                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
-                  np.array(37.0) * np.arange(1000))
-
-    dataset = self._buildParallelMapDataset(components, 1000, 100, 100)
-    # NOTE(mrry): Also test that the prefetching thread is cancelled correctly.
-    dataset = dataset.prefetch(100)
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-
-  def testParallelMapUnspecifiedOutputSize(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-               .map(lambda x: array_ops.check_numerics(x, "message"),
-                    num_parallel_calls=2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-
-  def testParallelMapError(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-               .map(lambda x: array_ops.check_numerics(x, "message"),
-                    num_parallel_calls=2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPrefetchError(self):
-    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
-
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-               .map(lambda x: array_ops.check_numerics(x, "message"))
-               .prefetch(2))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureIterator(self):
-
-    def _build_ds(iterator):
-
-      def _map_fn(x):
-        get_next = iterator.get_next()
-        return x * get_next
-
-      return dataset_ops.Dataset.range(10).map(_map_fn)
-
-    def _build_graph():
-      captured_iterator = dataset_ops.Dataset.range(
-          10).make_initializable_iterator()
-      ds = _build_ds(captured_iterator)
-      iterator = ds.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      return captured_iterator.initializer, init_op, get_next
-
-    with ops.Graph().as_default() as g:
-      captured_init_op, init_op, get_next = _build_graph()
-      with self.session(graph=g) as sess:
-        sess.run(captured_init_op)
-        sess.run(init_op)
-        for i in range(10):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testCaptureHashTable(self):
-    # NOTE(mrry): We must use the V2 variants of `HashTable`
-    # etc. because these produce a `tf.resource`-typed output that is
-    # compatible with the in-graph function implementation.
-    default_val = -1
-    keys = constant_op.constant(["brain", "salad", "surgery"])
-    values = constant_op.constant([0, 1, 2], dtypes.int64)
-    table = lookup_ops.HashTable(
-        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-
-    input_sentences = dataset_ops.Dataset.from_tensor_slices(
-        ["brain brain tank salad surgery", "surgery brain"])
-
-    iterator = (input_sentences
-                .map(lambda x: string_ops.string_split([x]).values)
-                .map(table.lookup)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(table.init)
-      sess.run(init_op)
-      sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureQueue(self):
-    elements = np.random.randint(100, size=[200])
-    queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
-    enqueue_op = queue.enqueue_many(elements)
-    close_op = queue.close()
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1)
-                .map(lambda _: queue.dequeue()).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for element in elements:
-        self.assertEqual(element, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureSameResourceMultipleTimes(self):
-    elements = np.random.randint(100, size=[200])
-    queue = data_flow_ops.FIFOQueue(
-        200, dtypes.int64, shapes=[], shared_name="shared_queue")
-    queue_2 = data_flow_ops.FIFOQueue(
-        200, dtypes.int64, shapes=[], shared_name="shared_queue")
-
-    enqueue_op = queue.enqueue_many(elements)
-    close_op = queue.close()
-
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1)
-                .map(lambda _: (queue.dequeue(), queue_2.dequeue()))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for i in range(100):
-        self.assertEqual(sorted([elements[i * 2], elements[i * 2 + 1]]),
-                         sorted(sess.run(get_next)))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testCaptureVariable(self):
-    counter_var = variable_scope.get_variable(
-        "counter", (), dtypes.int32, use_resource=True)
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: counter_var.assign_add(1))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(counter_var.initializer)
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i, sess.run(counter_var))
-        self.assertEqual(i + 1, sess.run(get_next))
-      self.assertEqual(10, sess.run(counter_var))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(10, sess.run(counter_var))
-
-  def testCaptureUninitializedVariableError(self):
-    counter_var = variable_scope.get_variable(
-        "counter", (), dtypes.int32, use_resource=True)
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: counter_var.assign_add(1))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.NotFoundError):
-        sess.run(get_next)
-
-  def testSeededStatefulOperatorIsProperlyStateful(self):
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      random_values = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values.extend(sess.run(get_next))
-      self.assertEqual(10, len(random_values))
-      self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
-      sess.run(init_op)
-      random_values_2 = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values_2.extend(sess.run(get_next))
-
-      # Randomness is repeatable given same seed
-      self.assertAllClose(random_values, random_values_2)
-
-  def testStatefulMapKeepsStateAcrossIterators(self):
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: random_ops.random_uniform((), seed=11))
-                .repeat(1000)
-                .batch(10)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      random_values = sess.run(get_next)
-
-      # Assert that one of the next 99 batches yielded by the iterator is
-      # different from the first.
-      i = 0
-      while i < 99:
-        if np.any(random_values != sess.run(get_next)):
-          break
-        i += 1
-      self.assertLess(i, 99)
-
-  def testMapDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .map(lambda d: d["foo"] + d["bar"])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testMapNamedtuple(self, count=10):
-    # construct dataset of tuples
-    labels = dataset_ops.Dataset.range(count)
-    images = labels.map(lambda l: -l)
-    dataset_tuple = dataset_ops.Dataset.zip((labels, images))
-
-    # convert dataset of tuples to dataset of namedtuples
-    example = namedtuple("Example", ["label", "image"])
-    dataset_namedtuple = dataset_tuple.map(example)
-
-    def preprocess_tuple(label, image):
-      image = 2 * image
-      return label, image
-
-    def preprocess_namedtuple(example):
-      return example._replace(image=2 * example.image)
-
-    # preprocess both datasets
-    dataset_tuple = dataset_tuple.map(preprocess_tuple)
-    dataset_namedtuple = dataset_namedtuple.map(preprocess_namedtuple)
-
-    next_tuple = dataset_tuple.make_one_shot_iterator().get_next()
-    next_namedtuple = dataset_namedtuple.make_one_shot_iterator().get_next()
-
-    # make sure both datasets contain the same data
-    with self.cached_session() as sess:
-      for i in range(count):
-        tuple_, namedtuple_ = sess.run([next_tuple, next_namedtuple])
-        self.assertEqual(tuple_, namedtuple_)
-        self.assertEqual(tuple_, (i, -2 * i))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_namedtuple)
-
-  def testUseStepContainerInMap(self):
-    row = np.arange(6)
-    iterator = (
-        dataset_ops.Dataset.from_tensors(row)
-        .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(row ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPrefetch(self):
-    # We will use this event to test that `_map_py_func()` has been
-    # invoked a certain number of times (6 times, to be exact) after
-    # consuming fewer elements from the iterator.
-    ev = threading.Event()
-
-    set_event_during_invocation = 5
-
-    def _map_py_func(x):
-      if x == set_event_during_invocation:
-        ev.set()
-      return x * x
-
-    def _map_fn(x):
-      return script_ops.py_func(_map_py_func, [x], x.dtype)
-
-    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset_ops.Dataset.range(100)
-        .map(_map_fn)
-        .prefetch(buffer_size_placeholder)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Simple test that prefetch yields the expected values in the
-      # expected order.
-      for buffer_size in [1, 10, 100, 1000]:
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-      # We can indirectly observe that varying the buffer size has the
-      # intended effect by observing when `ev` is set (on the 6th
-      # invocation of `_map_py_func()`).
-      # NOTE(mrry): We do not test with `buffer_size ==
-      # set_event_during_invocation`, because we must consume at least
-      # one element to start the prefetching.
-      for buffer_size in range(1, set_event_during_invocation):
-        event_will_be_set_after_consuming = (
-            set_event_during_invocation - buffer_size + 1)
-
-        ev.clear()
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(event_will_be_set_after_consuming):
-          self.assertFalse(ev.is_set())
-          self.assertEqual(i * i, sess.run(get_next))
-        ev.wait()
-        for i in range(event_will_be_set_after_consuming, 100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testReturnList(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: [x, constant_op.constant(37.0)])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testMultiOutputPyFunc(self):
-    # The `tf.py_func()` op returns a list of tensors for its outputs.
-    def _map_fn(x_tensor):
-      def _map_py_func(x):
-        return x, np.array(37.0, dtype=np.float64)
-      return script_ops.py_func(
-          _map_py_func, [x_tensor], [dtypes.int64, dtypes.float64])
-
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_map_fn)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1]))
-
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_sparse)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _sparse(i))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSparseChain(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1]))
-
-    def _check(i):
-      self.assertTrue(sparse_tensor.is_sparse(i))
-      return sparse_ops.sparse_concat(0, [i, i])
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_sparse).map(_check)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testParallelMapOutOfRangeError(self):
-    def raising_py_func(i):
-      if i == 100:
-        raise StopIteration()
-      else:
-        return i
-
-    iterator = (
-        dataset_ops.Dataset.range(105)
-        .map(lambda x: script_ops.py_func(raising_py_func, [x], dtypes.int64),
-             num_parallel_calls=2)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(100):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConstantOutput(self):
-    iterator = (
-        dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10])
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, b"hello", 10), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testWarnOnLookupTable(self):
-    def collecting_function(x):
-      _ = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer([], []), 0.0, name="t1")
-      return x
-
-    warnings.simplefilter("always")
-    with warnings.catch_warnings(record=True) as w:
-      _ = dataset_ops.Dataset.range(10).map(collecting_function)
-    # NOTE(mrry): Python 3 prints other warnings in addition to the one we are
-    # testing, so we search for the expected warning.
-    self.assertGreaterEqual(len(w), 1)
-    found_warning = False
-    for warning in w:
-      if ("Creating lookup tables inside a function passed to Dataset.map() is "
-          "not supported." in str(warning)):
-        found_warning = True
-        break
-    self.assertTrue(found_warning)
-
-  def testNestedDatasetError(self):
-    dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0])
-    with self.assertRaisesRegexp(
-        NotImplementedError, r"The Dataset.map\(\) transformation does not "
-        "currently support nested datasets as outputs."):
-      _ = dataset.map(dataset_ops.Dataset.from_tensor_slices)
-
-  def testReturnValueError(self):
-    dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0])
-    with self.assertRaisesRegexp(
-        TypeError, r"Unsupported return value from function passed to "
-        r"Dataset.map\(\): None."):
-      _ = dataset.map(lambda x: None)
-
-  def testBrokenFunctionErrorOnInitialization(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([1.0, 2.0, 3.0])
-
-    def broken_function(_):
-      """A function deliberately designed to fail on instantiation."""
-      value = []
-      tensor_value = attr_value_pb2.AttrValue()
-      tensor_value.tensor.CopyFrom(
-          tensor_util.make_tensor_proto(
-              value, dtype=dtypes.float32, shape=[0], verify_shape=False))
-      dtype_value = attr_value_pb2.AttrValue(type=dtypes.int32.as_datatype_enum)
-
-      # Create a "Const" op with a `tf.float32` value and a `tf.int32` type
-      # attr.
-      const_tensor = ops.get_default_graph().create_op(
-          "Const", [], [dtypes.int32],
-          attrs={
-              "value": tensor_value,
-              "dtype": dtype_value
-          },
-          name="BrokenConst").outputs[0]
-      return const_tensor
-
-    dataset = dataset.map(broken_function)
-    iterator = dataset.make_initializable_iterator()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "BrokenConst"):
-        sess.run(iterator.initializer)
-
-# pylint: disable=g-long-lambda
-  @parameterized.named_parameters(
-      ("Map", lambda dataset, func:
-       dataset_ops.MapDataset(dataset, func, use_inter_op_parallelism=False)),
-      ("ParallelMap", lambda dataset, func:
-       dataset_ops.ParallelMapDataset(dataset, func, num_parallel_calls=1,
-                                      use_inter_op_parallelism=False)),
-  )
-  def testNoInterOpParallelism(self, make_dataset_fn):
-    dataset = dataset_ops.Dataset.from_tensors(0)
-
-    def _get_tid():
-      return np.int64(threading.current_thread().ident)
-
-    def _map_fn(_):
-      tids = []
-      for _ in range(10):
-        tids.append(script_ops.py_func(_get_tid, [], dtypes.int64))
-      return tids
-
-    dataset = make_dataset_fn(dataset, _map_fn)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      tids = sess.run(get_next)
-      self.assertTrue(all(tids[0] == tid for tid in tids))
-# pylint: enable=g-long-lambda
-
-
-class MapDatasetBenchmark(test.Benchmark):
-
-  def benchmarkChainOfMaps(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      for use_inter_op_parallelism in [False, True]:
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
-          for _ in range(chain_length):
-            dataset = dataset_ops.MapDataset(
-                dataset,
-                lambda x: x,
-                use_inter_op_parallelism=use_inter_op_parallelism)
-          iterator = dataset.make_one_shot_iterator()
-          next_element = iterator.get_next()
-
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element.op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element.op)
-              end = time.time()
-              deltas.append(end - start)
-
-            median_wall_time = np.median(deltas) / 100
-            print("Map dataset chain length%s: %d Median wall time: %f" %
-                  (" (single threaded mode)" if not use_inter_op_parallelism
-                   else "", chain_length, median_wall_time))
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="benchmark_map_dataset_chain_latency_%d%s" %
-                (chain_length, "_single_threaded"
-                 if not use_inter_op_parallelism else ""))
-
-  def benchmarkMapFanOut(self):
-    fan_outs = [1, 2, 5, 10, 20, 50, 100]
-    for fan_out in fan_outs:
-      for use_inter_op_parallelism in [False, True]:
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(
-              tuple(0 for _ in range(fan_out))).repeat(None)
-          dataset = dataset_ops.MapDataset(
-              dataset,
-              lambda *xs: xs,
-              use_inter_op_parallelism=use_inter_op_parallelism)
-          iterator = dataset.make_one_shot_iterator()
-          next_element = iterator.get_next()
-
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element[0].op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element[0].op)
-              end = time.time()
-              deltas.append(end - start)
-
-            median_wall_time = np.median(deltas) / 100
-            print("Map dataset fan out%s: %d Median wall time: %f" %
-                  (" (single threaded mode)" if not use_inter_op_parallelism
-                   else "", fan_out, median_wall_time))
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="benchmark_map_dataset_fan_out_%d%s" %
-                (fan_out, "_single_threaded"
-                 if not use_inter_op_parallelism else ""))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e07706413dea9932c0b83f9eaedd62707b57e668
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/map_test.py
@@ -0,0 +1,1069 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.map()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+import threading
+import warnings
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+
+
+def _make_coordinated_sloppy_dataset(num_elements, num_parallel_calls):
+  """Produces a dataset iterator and events to control the order of elements.
+
+  Args:
+    num_elements: the number of input elements
+    num_parallel_calls: the degree of map parallelism
+
+  Returns:
+    A dataset iterator (represented as `get_next` op) and events that can be
+    used to control the order of output elements.
+  """
+
+  # Set up threading events used to sequence when items are produced that
+  # are subsequently interleaved. These events allow us to deterministically
+  # simulate slowdowns and force sloppiness.
+  coordination_events = {i: threading.Event() for i in range(num_elements)}
+
+  def map_py_fn(x):
+    coordination_events[x].wait()
+    coordination_events[x].clear()
+    return x * x
+
+  def map_fn(x):
+    return script_ops.py_func(map_py_fn, [x], x.dtype)
+
+  options = dataset_ops.Options()
+  options.experimental_deterministic = False
+  dataset = dataset_ops.Dataset.range(num_elements).map(
+      map_fn, num_parallel_calls).with_options(options)
+  iterator = dataset_ops.make_one_shot_iterator(dataset)
+  next_element = iterator.get_next()
+  return next_element, coordination_events
+
+
+@test_util.run_v1_only("b/120545219")
+class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def _buildMapDataset(self, components, count):
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+    return (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+            .repeat(count))
+
+  def testMapDataset(self):
+    """Test an dataset that maps a TF function across its input elements."""
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(count).
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+    count = array_ops.placeholder(dtypes.int64, shape=[])
+
+    dataset = self._buildMapDataset(components, count)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.cached_session() as sess:
+      # Test single-threaded access to the iterator.
+      sess.run(init_op, feed_dict={count: 14})
+      for _ in range(14):
+        for i in range(7):
+          result = sess.run(get_next)
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test multi-threaded access to the same iterator.
+      sess.run(init_op, feed_dict={count: 18})
+      results = []
+      def iterator_thread():
+        while True:
+          try:
+            results.append(sess.run(get_next))
+          except errors.OutOfRangeError:
+            return
+      threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
+      for t in threads:
+        t.start()
+      for t in threads:
+        t.join()
+
+      # `results` will contain the same elements components**2
+      # repeated 18 times, but in a non-deterministic order. Sort the
+      # results, and assert that each element of components**2 is
+      # produced 18 times.
+      results.sort(key=lambda x: x[0])
+      for i in range(7):
+        for j in range(18):
+          for component, result_component in zip(components,
+                                                 results[i * 18 + j]):
+            self.assertAllEqual(component[i]**2, result_component)
+
+  def _buildParallelMapDataset(self, components, count, num_parallel_calls,
+                               output_buffer_size):
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+    return (dataset_ops.Dataset.from_tensor_slices(components)
+            .map(_map_fn, num_parallel_calls=num_parallel_calls)
+            .prefetch(output_buffer_size)
+            .repeat(count))
+
+  def testParallelMapDataset(self):
+    """Test an dataset that maps a TF function across its input elements."""
+    # The pipeline is TensorSliceDataset -> ParallelMapDataset(square_3) ->
+    # RepeatDataset(count).
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+    count = array_ops.placeholder(dtypes.int64, shape=[])
+    num_parallel_calls = array_ops.placeholder(dtypes.int32, shape=[])
+    output_buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
+
+    dataset = self._buildParallelMapDataset(
+        components, count, num_parallel_calls, output_buffer_size)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [t.shape for t in get_next])
+
+    with self.cached_session() as sess:
+
+      def do_test(num_parallel_calls_val, output_buffer_size_val):
+        # Test single-threaded access to the iterator.
+        sess.run(init_op, feed_dict={
+            count: 14,
+            num_parallel_calls: num_parallel_calls_val,
+            output_buffer_size: output_buffer_size_val})
+        for _ in range(14):
+          for i in range(7):
+            result = sess.run(get_next)
+            for component, result_component in zip(components, result):
+              self.assertAllEqual(component[i]**2, result_component)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+        # Test multi-threaded access to the same iterator.
+        sess.run(init_op, feed_dict={
+            count: 18,
+            num_parallel_calls: num_parallel_calls_val,
+            output_buffer_size: output_buffer_size_val})
+        results = []
+        def iterator_thread():
+          while True:
+            try:
+              results.append(sess.run(get_next))
+            except errors.OutOfRangeError:
+              return
+        threads = [self.checkedThread(target=iterator_thread)
+                   for _ in range(64)]
+        for t in threads:
+          t.start()
+        for t in threads:
+          t.join()
+
+        # `results` will contain the same elements components**2
+        # repeated 18 times, but in a non-deterministic order. Sort the
+        # results, and assert that each element of components**2 is
+        # produced 18 times.
+        results.sort(key=lambda x: x[0])
+        for i in range(7):
+          for j in range(18):
+            for component, result_component in zip(components,
+                                                   results[i * 18 + j]):
+              self.assertAllEqual(component[i]**2, result_component)
+
+      for num_parallel_calls_val, output_buffer_size_val in [
+          (1, 1), (1, 2), (2, 2), (2, 4), (8, 8), (8, 16)]:
+        do_test(num_parallel_calls_val, output_buffer_size_val)
+
+  def testImplicitDisposeParallelMapDataset(self):
+    # Tests whether a parallel map dataset will be cleaned up correctly when
+    # the pipeline does not run it until exhaustion.
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(1000).
+    components = (np.arange(1000),
+                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
+                  np.array(37.0) * np.arange(1000))
+
+    dataset = self._buildParallelMapDataset(components, 1000, 100, 100)
+    # NOTE(mrry): Also test that the prefetching thread is cancelled correctly.
+    dataset = dataset.prefetch(100)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+
+  def testParallelMapUnspecifiedOutputSize(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+
+    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
+               .map(lambda x: array_ops.check_numerics(x, "message"),
+                    num_parallel_calls=2))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+
+  def testParallelMapError(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+
+    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
+               .map(lambda x: array_ops.check_numerics(x, "message"),
+                    num_parallel_calls=2))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+      sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testPrefetchError(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+
+    dataset = (dataset_ops.Dataset.from_tensor_slices(components)
+               .map(lambda x: array_ops.check_numerics(x, "message"))
+               .prefetch(2))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for _ in range(3):
+        sess.run(get_next)
+      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+      sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testCaptureIterator(self):
+
+    def _build_ds(iterator):
+
+      def _map_fn(x):
+        get_next = iterator.get_next()
+        return x * get_next
+
+      return dataset_ops.Dataset.range(10).map(_map_fn)
+
+    def _build_graph():
+      captured_iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(10))
+      ds = _build_ds(captured_iterator)
+      iterator = ds.make_initializable_iterator()
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      return captured_iterator.initializer, init_op, get_next
+
+    with ops.Graph().as_default() as g:
+      captured_init_op, init_op, get_next = _build_graph()
+      with self.session(graph=g) as sess:
+        sess.run(captured_init_op)
+        sess.run(init_op)
+        for i in range(10):
+          self.assertEqual(i * i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testCaptureHashTable(self):
+    # NOTE(mrry): We must use the V2 variants of `HashTable`
+    # etc. because these produce a `tf.resource`-typed output that is
+    # compatible with the in-graph function implementation.
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.HashTable(
+        lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
+
+    input_sentences = dataset_ops.Dataset.from_tensor_slices(
+        ["brain brain tank salad surgery", "surgery brain"])
+
+    iterator = dataset_ops.make_initializable_iterator(
+        input_sentences
+        .map(lambda x: string_ops.string_split([x]).values).map(table.lookup))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(table.initializer)
+      sess.run(init_op)
+      sess.run(get_next)
+      sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testCaptureQueue(self):
+    elements = np.random.randint(100, size=[200])
+    queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
+    enqueue_op = queue.enqueue_many(elements)
+    close_op = queue.close()
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(-1)
+        .map(lambda _: queue.dequeue()))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(enqueue_op)
+      sess.run(close_op)
+      sess.run(init_op)
+      for element in elements:
+        self.assertEqual(element, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testCaptureSameResourceMultipleTimes(self):
+    elements = np.random.randint(100, size=[200])
+    queue = data_flow_ops.FIFOQueue(
+        200, dtypes.int64, shapes=[], shared_name="shared_queue")
+    queue_2 = data_flow_ops.FIFOQueue(
+        200, dtypes.int64, shapes=[], shared_name="shared_queue")
+
+    enqueue_op = queue.enqueue_many(elements)
+    close_op = queue.close()
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(-1)
+        .map(lambda _: (queue.dequeue(), queue_2.dequeue())))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(enqueue_op)
+      sess.run(close_op)
+      sess.run(init_op)
+      for i in range(100):
+        self.assertEqual(sorted([elements[i * 2], elements[i * 2 + 1]]),
+                         sorted(sess.run(get_next)))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testCaptureVariable(self):
+    counter_var = variable_scope.get_variable(
+        "counter", (), dtypes.int32, use_resource=True)
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(10)
+        .map(lambda _: counter_var.assign_add(1)))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(counter_var.initializer)
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual(i, sess.run(counter_var))
+        self.assertEqual(i + 1, sess.run(get_next))
+      self.assertEqual(10, sess.run(counter_var))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      self.assertEqual(10, sess.run(counter_var))
+
+  def testCaptureUninitializedVariableError(self):
+    counter_var = variable_scope.get_variable(
+        "counter", (), dtypes.int32, use_resource=True)
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(10)
+        .map(lambda _: counter_var.assign_add(1)))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.NotFoundError):
+        sess.run(get_next)
+
+  def testSeededStatefulOperatorIsProperlyStateful(self):
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(10)
+        .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      random_values = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          random_values.extend(sess.run(get_next))
+      self.assertEqual(10, len(random_values))
+      self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
+      sess.run(init_op)
+      random_values_2 = []
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          random_values_2.extend(sess.run(get_next))
+
+      # Randomness is repeatable given same seed
+      self.assertAllClose(random_values, random_values_2)
+
+  def testStatefulMapKeepsStateAcrossIterators(self):
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(10)
+        .map(lambda _: random_ops.random_uniform((), seed=11))
+        .repeat(1000)
+        .batch(10))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      random_values = sess.run(get_next)
+
+      # Assert that one of the next 99 batches yielded by the iterator is
+      # different from the first.
+      i = 0
+      while i < 99:
+        if np.any(random_values != sess.run(get_next)):
+          break
+        i += 1
+      self.assertLess(i, 99)
+
+  def testStatefulOperationInShortCircuit(self):
+    counter_var = variable_scope.get_variable(
+        "counter", (), dtypes.int32, use_resource=True)
+
+    def increment_fn(x):
+      counter_var.assign_add(1)
+      return x
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(increment_fn))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(counter_var.initializer)
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual(i, sess.run(counter_var))
+        self.assertEqual(i, sess.run(get_next))
+      self.assertEqual(10, sess.run(counter_var))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+      self.assertEqual(10, sess.run(counter_var))
+
+  def testMapDict(self):
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10)
+        .map(lambda x: {"foo": x * 2, "bar": x ** 2})
+        .map(lambda d: d["foo"] + d["bar"]))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual(i * 2 + i**2, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testMapNamedtuple(self, count=10):
+    # construct dataset of tuples
+    labels = dataset_ops.Dataset.range(count)
+    images = labels.map(lambda l: -l)
+    dataset_tuple = dataset_ops.Dataset.zip((labels, images))
+
+    # convert dataset of tuples to dataset of namedtuples
+    example = namedtuple("Example", ["label", "image"])
+    dataset_namedtuple = dataset_tuple.map(example)
+
+    def preprocess_tuple(label, image):
+      image = 2 * image
+      return label, image
+
+    def preprocess_namedtuple(example):
+      return example._replace(image=2 * example.image)
+
+    # preprocess both datasets
+    dataset_tuple = dataset_tuple.map(preprocess_tuple)
+    dataset_namedtuple = dataset_namedtuple.map(preprocess_namedtuple)
+
+    next_tuple = dataset_ops.make_one_shot_iterator(dataset_tuple).get_next()
+    next_namedtuple = dataset_ops.make_one_shot_iterator(
+        dataset_namedtuple).get_next()
+
+    # make sure both datasets contain the same data
+    with self.cached_session() as sess:
+      for i in range(count):
+        tuple_, namedtuple_ = sess.run([next_tuple, next_namedtuple])
+        self.assertEqual(tuple_, namedtuple_)
+        self.assertEqual(tuple_, (i, -2 * i))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_namedtuple)
+
+  def testUseStepContainerInMap(self):
+    row = np.arange(6)
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(row)
+        .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems)))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual(row**2, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testCaseAndCondInMap(self):
+
+    def control_map_fn(x, y):
+
+      def multiply():
+        return x * 2
+
+      def divide():
+        return x // 2
+
+      def defaults_two():
+        return control_flow_ops.cond(
+            math_ops.equal(math_ops.mod(x, 2), 0),
+            multiply,
+            divide,
+            name="cond_mult")
+
+      pred_fn_pairs = {
+          math_ops.logical_or(math_ops.equal(y, 2), math_ops.equal(y, 3)):
+              defaults_two,
+      }
+
+      return control_flow_ops.case(
+          pred_fn_pairs, default=multiply, exclusive=True)
+
+    def build_dataset(row, num):
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.from_tensor_slices(row).map(
+              lambda x: control_map_fn(x, num)))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      return init_op, get_next
+
+    with self.cached_session() as sess:
+      row = np.arange(6)
+      for num in [2, 3, 4]:
+        init_op, get_next = build_dataset(row, num)
+        sess.run(init_op)
+        for i in range(6):
+          self.assertEqual(
+              (i // 2 if i % 2 else i * 2) if (num == 2 or num == 3) else i * 2,
+              sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testCaseInWhileInMap(self):
+
+    def control_map_fn(x, y):
+
+      def multiply():
+        return x * 2
+
+      def divide():
+        return x // 2
+
+      pred_fn_pairs = {
+          math_ops.logical_or(math_ops.equal(y, 2), math_ops.equal(y, 3)):
+              divide,
+      }
+
+      return control_flow_ops.case(
+          pred_fn_pairs, default=multiply, exclusive=True)
+
+    def build_dataset(row, num):
+      # pylint: disable=g-long-lambda
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.from_tensors(row).map(
+              lambda elems: functional_ops.map_fn(
+                  lambda x: control_map_fn(x, num), elems)))
+      init_op = iterator.initializer
+      get_next = iterator.get_next()
+      return init_op, get_next
+
+    with self.cached_session() as sess:
+      row = np.arange(6)
+      for num in [2, 3, 4]:
+        init_op, get_next = build_dataset(row, num)
+        sess.run(init_op)
+        self.assertAllEqual(
+            [x // 2 if (num == 2 or num == 3) else x * 2 for x in row],
+            sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testCaseAndCondInWhileInMap(self):
+
+    def control_map_fn(x, y):
+
+      def multiply():
+        return x * 2
+
+      def divide():
+        return x // 2
+
+      def defaults_two():
+        return control_flow_ops.cond(
+            math_ops.equal(math_ops.mod(x, 2), 0),
+            multiply,
+            divide,
+            name="cond_mult")
+
+      pred_fn_pairs = {
+          math_ops.logical_or(math_ops.equal(y, 2), math_ops.equal(y, 3)):
+              defaults_two,
+      }
+
+      return control_flow_ops.case(
+          pred_fn_pairs, default=multiply, exclusive=True)
+
+    row = np.arange(6)
+    num = 2
+    # pylint: disable=g-long-lambda
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(row).map(
+            lambda elems: functional_ops.map_fn(
+                lambda x: control_map_fn(x, num), elems)))
+    # pylint: enable=g-long-lambda
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      self.assertAllEqual([(x // 2 if x % 2 else x * 2) if
+                           (num == 2 or num == 3) else x * 2 for x in row],
+                          sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testPrefetch(self):
+    # We will use this event to test that `_map_py_func()` has been
+    # invoked a certain number of times (6 times, to be exact) after
+    # consuming fewer elements from the iterator.
+    ev = threading.Event()
+
+    set_event_during_invocation = 5
+
+    def _map_py_func(x):
+      if x == set_event_during_invocation:
+        ev.set()
+      return x * x
+
+    def _map_fn(x):
+      return script_ops.py_func(_map_py_func, [x], x.dtype)
+
+    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(100)
+        .map(_map_fn)
+        .prefetch(buffer_size_placeholder))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      # Simple test that prefetch yields the expected values in the
+      # expected order.
+      for buffer_size in [1, 10, 100, 1000]:
+        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
+        for i in range(100):
+          self.assertEqual(i * i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+      # We can indirectly observe that varying the buffer size has the
+      # intended effect by observing when `ev` is set (on the 6th
+      # invocation of `_map_py_func()`).
+      # NOTE(mrry): We do not test with `buffer_size ==
+      # set_event_during_invocation`, because we must consume at least
+      # one element to start the prefetching.
+      for buffer_size in range(1, set_event_during_invocation):
+        event_will_be_set_after_consuming = (
+            set_event_during_invocation - buffer_size + 1)
+
+        ev.clear()
+        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
+        for i in range(event_will_be_set_after_consuming):
+          self.assertFalse(ev.is_set())
+          self.assertEqual(i * i, sess.run(get_next))
+        ev.wait()
+        for i in range(event_will_be_set_after_consuming, 100):
+          self.assertEqual(i * i, sess.run(get_next))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(get_next)
+
+  def testReturnList(self):
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10)
+        .map(lambda x: [x, constant_op.constant(37.0)]))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual((i, 37.0), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testMultiOutputPyFunc(self):
+    # The `tf.py_func()` op returns a list of tensors for its outputs.
+    def _map_fn(x_tensor):
+      def _map_py_func(x):
+        return x, np.array(37.0, dtype=np.float64)
+      return script_ops.py_func(
+          _map_py_func, [x_tensor], [dtypes.int64, dtypes.float64])
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_map_fn))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual((i, 37.0), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        actual = sess.run(get_next)
+        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
+        self.assertSparseValuesEqual(actual, _sparse(i))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSparseChain(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    def _check(i):
+      self.assertTrue(sparse_tensor.is_sparse(i))
+      return sparse_ops.sparse_concat(0, [i, i])
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse).map(_check))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        actual = sess.run(get_next)
+        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
+        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testParallelMapOutOfRangeError(self):
+    def raising_py_func(i):
+      if i == 100:
+        raise StopIteration()
+      else:
+        return i
+
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(105)
+        .map(lambda x: script_ops.py_func(raising_py_func, [x], dtypes.int64),
+             num_parallel_calls=2))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for i in range(100):
+        self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testConstantOutput(self):
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10]))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        self.assertEqual((i, b"hello", 10), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testWarnOnLookupTable(self):
+    def collecting_function(x):
+      _ = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer([], []), 0.0, name="t1")
+      return x
+
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      _ = dataset_ops.Dataset.range(10).map(collecting_function)
+    # NOTE(mrry): Python 3 prints other warnings in addition to the one we are
+    # testing, so we search for the expected warning.
+    self.assertGreaterEqual(len(w), 1)
+    found_warning = False
+    for warning in w:
+      if ("Creating lookup tables inside a function passed to Dataset.map() is "
+          "not supported." in str(warning)):
+        found_warning = True
+        break
+    self.assertTrue(found_warning)
+
+  def testNestedDatasetMap(self):
+    # TODO(b/110122868): When iterators can yield a `tf.data.Dataset`, remove
+    # the `get_single_element()` call.
+    dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0]).map(
+        dataset_ops.Dataset.from_tensor_slices).map(
+            lambda ds: ds.batch(3)).flat_map(lambda x: x)
+
+    self.assertDatasetProduces(dataset, [[1.0, 2.0, 3.0]])
+
+  def testReturnValueError(self):
+    dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0])
+    with self.assertRaisesRegexp(
+        TypeError, r"Unsupported return value from function passed to "
+        r"Dataset.map\(\): None."):
+      _ = dataset.map(lambda x: None)
+
+  def testBrokenFunctionErrorOnInitialization(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([1.0, 2.0, 3.0])
+
+    def broken_function(_):
+      """A function deliberately designed to fail on instantiation."""
+      value = []
+      tensor_value = attr_value_pb2.AttrValue()
+      tensor_value.tensor.CopyFrom(
+          tensor_util.make_tensor_proto(
+              value, dtype=dtypes.float32, shape=[0], verify_shape=False))
+      dtype_value = attr_value_pb2.AttrValue(type=dtypes.int32.as_datatype_enum)
+
+      # Create a "Const" op with a `tf.float32` value and a `tf.int32` type
+      # attr.
+      const_tensor = ops.get_default_graph().create_op(
+          "Const", [], [dtypes.int32],
+          attrs={
+              "value": tensor_value,
+              "dtype": dtype_value
+          },
+          name="BrokenConst").outputs[0]
+      return const_tensor
+
+    dataset = dataset.map(broken_function)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+
+    with self.cached_session() as sess:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "BrokenConst"):
+        sess.run(iterator.initializer)
+
+# pylint: disable=g-long-lambda
+  @parameterized.named_parameters(
+      ("Map", lambda dataset, func:
+       dataset_ops.MapDataset(dataset, func, use_inter_op_parallelism=False)),
+      ("ParallelMap", lambda dataset, func:
+       dataset_ops.ParallelMapDataset(dataset, func, num_parallel_calls=1,
+                                      use_inter_op_parallelism=False)),
+  )
+  def testNoInterOpParallelism(self, make_dataset_fn):
+    dataset = dataset_ops.Dataset.from_tensors(0)
+
+    def _get_tid():
+      return np.int64(threading.current_thread().ident)
+
+    def _map_fn(_):
+      tids = []
+      for _ in range(10):
+        tids.append(script_ops.py_func(_get_tid, [], dtypes.int64))
+      return tids
+
+    dataset = make_dataset_fn(dataset, _map_fn)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      tids = sess.run(get_next)
+      self.assertTrue(all(tids[0] == tid for tid in tids))
+# pylint: enable=g-long-lambda
+
+  @parameterized.named_parameters(
+      ("SequentialIdentity", None, lambda x: x, None),
+      ("SequentialReplicate", None, lambda x: (x, x), None),
+      ("SequentialSwap", (None, None), lambda x, y: (y, x), None),
+      ("SequentialProject", (None, None), lambda x, y: x, None),
+      ("ParallelIdentity", None, lambda x: x, 10),
+      ("ParallelReplicate", None, lambda x: (x, x), 10),
+      ("ParallelSwap", (None, None), lambda x, y: (y, x), 10),
+      ("ParallelProject", (None, None), lambda x, y: x, 10),
+  )
+  def testShortCircuit(self, structure, map_fn, num_parallel_calls):
+    dataset = self.structuredDataset(structure).repeat().map(
+        map_fn, num_parallel_calls=num_parallel_calls)
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
+
+    with self.cached_session() as sess:
+      if isinstance(structure, tuple):
+        expected = map_fn(*sess.run(self.structuredElement(structure)))
+      else:
+        expected = map_fn(sess.run(self.structuredElement(structure)))
+      self.assertEqual(expected, sess.run(get_next))
+
+  @parameterized.named_parameters(
+      ("Sequential", None),
+      ("Parallel", 10),
+  )
+  def testShortCircuitCapturedInput(self, num_parallel_calls):
+    captured_t = array_ops.placeholder(dtypes.int64, shape=[])
+    dataset = self.structuredDataset(None).repeat().map(
+        lambda x: captured_t, num_parallel_calls=num_parallel_calls)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer, feed_dict={captured_t: 42})
+      self.assertEqual(42, sess.run(get_next))
+
+  @parameterized.named_parameters(
+      ("1", 1, 1),
+      ("2", 10, 1),
+      ("3", 10, 10),
+      ("4", 100, 1),
+      ("5", 100, 10),
+      ("6", 100, 100),
+  )
+  def testSloppyInterleaveInOrder(self, num_elements, num_parallel_calls):
+    get_next, coordination_events = _make_coordinated_sloppy_dataset(
+        num_elements, num_parallel_calls)
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=num_parallel_calls + 1,
+        use_per_session_threads=True)
+    with self.cached_session(config=config) as sess:
+      for i in range(num_elements):
+        coordination_events[i].set()
+        self.assertEqual(i * i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @parameterized.named_parameters(
+      ("1", 10, 10),
+      ("2", 100, 10),
+      ("3", 100, 100),
+  )
+  def testSloppyInterleaveOutOfOrder(self, num_elements, num_parallel_calls):
+    get_next, coordination_events = _make_coordinated_sloppy_dataset(
+        num_elements, num_parallel_calls)
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=num_parallel_calls + 1,
+        use_per_session_threads=True)
+    with self.cached_session(config=config) as sess:
+      elements = [x for x in range(num_elements)]
+      for i in [1, 4, 7]:
+        elements[i], elements[i + 1] = elements[i + 1], elements[i]
+
+      for element in elements:
+        coordination_events[element].set()
+        self.assertEqual(element * element, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  @parameterized.named_parameters(
+      ("Map", None),
+      ("ParallelMap", 12),
+  )
+  def testPreserveCardinality(self, num_parallel_calls):
+
+    def py_fn(_):
+      raise StopIteration()
+
+    dataset = dataset_ops.DatasetV2.from_tensors(0).map(
+        lambda x: script_ops.py_func(py_fn, [x], dtypes.int64),
+        num_parallel_calls=num_parallel_calls)
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
+
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
index 1cf6dd1beaf886b46a32a1489e991f7a3efe8b29..0322d1f2c604c3f9588eb8eaa39eb9829bb0a26e 100644
--- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""MultiDeviceIterator tests."""
+"""Tests for `tf.data.MultiDeviceIterator`."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import multi_device_iterator_ops
@@ -30,8 +32,10 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+# TODO(b/117581999): Add eager coverage.
 class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
+  @test_util.run_v1_only("b/120545219")
   def testNoGetNext(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -39,8 +43,9 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -49,14 +54,15 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testOneOnSameDevice(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(10)
@@ -66,14 +72,15 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testRepeatDevices(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(20)
@@ -84,18 +91,19 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 20, 4):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
-        self.assertEqual(i + 2, sess.run(elem_on_3))
-        self.assertEqual(i + 3, sess.run(elem_on_4))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
+        self.assertEqual(i + 2, self.evaluate(elem_on_3))
+        self.assertEqual(i + 3, self.evaluate(elem_on_4))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
-        sess.run(elem_on_3)
-        sess.run(elem_on_4)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
+        self.evaluate(elem_on_3)
+        self.evaluate(elem_on_4)
 
+  @test_util.run_v1_only("b/120545219")
   def testNotFullyDivisible(self):
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -104,15 +112,50 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 8, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
-      self.assertEqual(8, sess.run(elem_on_1))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
+      self.assertEqual(8, self.evaluate(elem_on_1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
+  def testGetNextAsOptional(self):
+    dataset = dataset_ops.Dataset.range(9)
+    multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2"])
+    elem_on_1, elem_on_2 = multi_device_iterator.get_next_as_optional()
+    elem_on_1_has_value_t = elem_on_1.has_value()
+    elem_on_1_t = elem_on_1.get_value()
+    elem_on_2_has_value_t = elem_on_2.has_value()
+    elem_on_2_t = elem_on_2.get_value()
+
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.test_session(config=config) as sess:
+      self.evaluate(multi_device_iterator.initializer)
+      for i in range(0, 8, 2):
+        elem_on_1_has_value, elem_on_1_value = sess.run(
+            [elem_on_1_has_value_t, elem_on_1_t])
+        self.assertTrue(elem_on_1_has_value)
+        self.assertEqual(i, elem_on_1_value)
+        elem_on_2_has_value, elem_on_2_value = sess.run(
+            [elem_on_2_has_value_t, elem_on_2_t])
+        self.assertTrue(elem_on_2_has_value)
+        self.assertEqual(i + 1, elem_on_2_value)
+      elem_on_1_has_value, elem_on_1_value = sess.run(
+          [elem_on_1_has_value_t, elem_on_1_t])
+      self.assertTrue(elem_on_1_has_value)
+      self.assertEqual(8, elem_on_1_value)
+      self.assertFalse(self.evaluate(elem_on_1_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_2_has_value_t))
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(elem_on_1_t)
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(elem_on_2_t)
+
+  @test_util.run_v1_only("b/120545219")
   def testUneven(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -121,15 +164,16 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i, self.evaluate(elem_on_1))
       for i in range(0, 10, 2):
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleInitializations(self):
     with ops.device("/cpu:0"):
       epoch = array_ops.placeholder(dtypes.int64, shape=[])
@@ -145,7 +189,8 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     with self.test_session(config=config) as sess:
       for i in range(1000):
         sess.run(init_op, feed_dict={epoch: i})
-        self.assertEqual([(i, 0), (i, 1)], sess.run([elem_on_1, elem_on_2]))
+        self.assertEqual([(i, 0), (i, 1)], self.evaluate([elem_on_1,
+                                                          elem_on_2]))
 
   def testBasicGpu(self):
     if not test_util.is_gpu_available():
@@ -158,13 +203,13 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
   def testUnevenGpu(self):
     if not test_util.is_gpu_available():
@@ -177,14 +222,76 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+      for i in range(0, 10, 2):
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
+
+  def testGetNextAsOptionalGpu(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    dataset = dataset_ops.Dataset.range(9)
+    multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/gpu:0"])
+    elem_on_1, elem_on_2 = multi_device_iterator.get_next_as_optional()
+    elem_on_1_has_value_t = elem_on_1.has_value()
+    elem_on_1_t = elem_on_1.get_value()
+    elem_on_2_has_value_t = elem_on_2.has_value()
+    elem_on_2_t = elem_on_2.get_value()
+
+    config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
+    with self.test_session(config=config) as sess:
+      self.evaluate(multi_device_iterator.initializer)
+      for i in range(0, 8, 2):
+        elem_on_1_has_value, elem_on_1_value = sess.run(
+            [elem_on_1_has_value_t, elem_on_1_t])
+        self.assertTrue(elem_on_1_has_value)
+        self.assertEqual(i, elem_on_1_value)
+        elem_on_2_has_value, elem_on_2_value = sess.run(
+            [elem_on_2_has_value_t, elem_on_2_t])
+        self.assertTrue(elem_on_2_has_value)
+        self.assertEqual(i + 1, elem_on_2_value)
+      elem_on_1_has_value, elem_on_1_value = sess.run(
+          [elem_on_1_has_value_t, elem_on_1_t])
+      self.assertTrue(elem_on_1_has_value)
+      self.assertEqual(8, elem_on_1_value)
+      self.assertFalse(self.evaluate(elem_on_1_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_2_has_value_t))
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(elem_on_1_t)
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.evaluate(elem_on_2_t)
+
+  @test_util.run_v1_only("b/120545219")
+  def testOptimization(self):
+    dataset = dataset_ops.Dataset.range(10)
+    dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
+    dataset = dataset.skip(0)  # this should be optimized away
+    dataset = dataset.cache()
+
+    options = dataset_ops.Options()
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.noop_elimination = True
+    dataset = dataset.with_options(options)
+
+    multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2"])
+    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.test_session(config=config) as sess:
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/optional_ops_test.py b/tensorflow/python/data/kernel_tests/optional_ops_test.py
deleted file mode 100644
index 604e3ad88ec96233771b475705ecac016ac6978c..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/optional_ops_test.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the Optional data type wrapper."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops import optional_ops
-from tensorflow.python.data.util import structure
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testFromValue(self):
-    opt = optional_ops.Optional.from_value(constant_op.constant(37.0))
-    self.assertTrue(self.evaluate(opt.has_value()))
-    self.assertEqual(37.0, self.evaluate(opt.get_value()))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testFromStructuredValue(self):
-    opt = optional_ops.Optional.from_value({
-        "a": constant_op.constant(37.0),
-        "b": (constant_op.constant(["Foo"]), constant_op.constant("Bar"))
-    })
-    self.assertTrue(self.evaluate(opt.has_value()))
-    self.assertEqual({
-        "a": 37.0,
-        "b": ([b"Foo"], b"Bar")
-    }, self.evaluate(opt.get_value()))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testFromSparseTensor(self):
-    st_0 = sparse_tensor.SparseTensorValue(
-        indices=np.array([[0]]),
-        values=np.array([0], dtype=np.int64),
-        dense_shape=np.array([1]))
-    st_1 = sparse_tensor.SparseTensorValue(
-        indices=np.array([[0, 0], [1, 1]]),
-        values=np.array([-1., 1.], dtype=np.float32),
-        dense_shape=np.array([2, 2]))
-    opt = optional_ops.Optional.from_value((st_0, st_1))
-    self.assertTrue(self.evaluate(opt.has_value()))
-    val_0, val_1 = opt.get_value()
-    for expected, actual in [(st_0, val_0), (st_1, val_1)]:
-      self.assertAllEqual(expected.indices, self.evaluate(actual.indices))
-      self.assertAllEqual(expected.values, self.evaluate(actual.values))
-      self.assertAllEqual(expected.dense_shape,
-                          self.evaluate(actual.dense_shape))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testFromNone(self):
-    value_structure = structure.TensorStructure(dtypes.float32, [])
-    opt = optional_ops.Optional.none_from_structure(value_structure)
-    self.assertTrue(opt.value_structure.is_compatible_with(value_structure))
-    self.assertFalse(
-        opt.value_structure.is_compatible_with(
-            structure.TensorStructure(dtypes.float32, [1])))
-    self.assertFalse(
-        opt.value_structure.is_compatible_with(
-            structure.TensorStructure(dtypes.int32, [])))
-    self.assertFalse(self.evaluate(opt.has_value()))
-    with self.assertRaises(errors.InvalidArgumentError):
-      self.evaluate(opt.get_value())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCopyToGPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    with ops.device("/cpu:0"):
-      optional_with_value = optional_ops.Optional.from_value(
-          (constant_op.constant(37.0), constant_op.constant("Foo"),
-           constant_op.constant(42)))
-      optional_none = optional_ops.Optional.none_from_structure(
-          structure.TensorStructure(dtypes.float32, []))
-
-    with ops.device("/gpu:0"):
-      gpu_optional_with_value = optional_ops._OptionalImpl(
-          array_ops.identity(optional_with_value._variant_tensor),
-          optional_with_value.value_structure)
-      gpu_optional_none = optional_ops._OptionalImpl(
-          array_ops.identity(optional_none._variant_tensor),
-          optional_none.value_structure)
-
-      gpu_optional_with_value_has_value = gpu_optional_with_value.has_value()
-      gpu_optional_with_value_values = gpu_optional_with_value.get_value()
-
-      gpu_optional_none_has_value = gpu_optional_none.has_value()
-
-    self.assertTrue(self.evaluate(gpu_optional_with_value_has_value))
-    self.assertEqual((37.0, b"Foo", 42),
-                     self.evaluate(gpu_optional_with_value_values))
-    self.assertFalse(self.evaluate(gpu_optional_none_has_value))
-
-  def _assertElementValueEqual(self, expected, actual):
-    if isinstance(expected, dict):
-      self.assertItemsEqual(list(expected.keys()), list(actual.keys()))
-      for k in expected.keys():
-        self._assertElementValueEqual(expected[k], actual[k])
-    elif isinstance(expected, sparse_tensor.SparseTensorValue):
-      self.assertAllEqual(expected.indices, actual.indices)
-      self.assertAllEqual(expected.values, actual.values)
-      self.assertAllEqual(expected.dense_shape, actual.dense_shape)
-    else:
-      self.assertAllEqual(expected, actual)
-
-  # pylint: disable=g-long-lambda
-  @parameterized.named_parameters(
-      ("Tensor", lambda: constant_op.constant(37.0),
-       structure.TensorStructure(dtypes.float32, [])),
-      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
-          indices=[[0]], values=constant_op.constant([0], dtype=dtypes.int32),
-          dense_shape=[1]),
-       structure.SparseTensorStructure(dtypes.int32, [1])),
-      ("Nest", lambda: {
-          "a": constant_op.constant(37.0),
-          "b": (constant_op.constant(["Foo"]), constant_op.constant("Bar"))},
-       structure.NestedStructure({
-           "a": structure.TensorStructure(dtypes.float32, []),
-           "b": (structure.TensorStructure(dtypes.string, [1]),
-                 structure.TensorStructure(dtypes.string, []))})),
-      ("Optional", lambda: optional_ops.Optional.from_value(37.0),
-       optional_ops.OptionalStructure(
-           structure.TensorStructure(dtypes.float32, []))),
-  )
-  def testOptionalStructure(self, tf_value_fn, expected_value_structure):
-    tf_value = tf_value_fn()
-    opt = optional_ops.Optional.from_value(tf_value)
-
-    self.assertTrue(
-        expected_value_structure.is_compatible_with(opt.value_structure))
-    self.assertTrue(
-        opt.value_structure.is_compatible_with(expected_value_structure))
-
-    opt_structure = structure.Structure.from_value(opt)
-    self.assertIsInstance(opt_structure, optional_ops.OptionalStructure)
-    self.assertTrue(opt_structure.is_compatible_with(opt_structure))
-    self.assertTrue(opt_structure._value_structure.is_compatible_with(
-        expected_value_structure))
-    self.assertEqual([dtypes.variant], opt_structure._flat_types)
-    self.assertEqual([tensor_shape.scalar()], opt_structure._flat_shapes)
-
-    # All OptionalStructure objects are not compatible with a non-optional
-    # value.
-    non_optional_structure = structure.Structure.from_value(
-        constant_op.constant(42.0))
-    self.assertFalse(opt_structure.is_compatible_with(non_optional_structure))
-
-    # Assert that the optional survives a round-trip via _from_tensor_list()
-    # and _to_tensor_list().
-    round_trip_opt = opt_structure._from_tensor_list(
-        opt_structure._to_tensor_list(opt))
-    if isinstance(tf_value, optional_ops.Optional):
-      self.assertEqual(
-          self.evaluate(tf_value.get_value()),
-          self.evaluate(round_trip_opt.get_value().get_value()))
-    else:
-      self.assertEqual(
-          self.evaluate(tf_value), self.evaluate(round_trip_opt.get_value()))
-
-  @parameterized.named_parameters(
-      ("Tensor", np.array([1, 2, 3], dtype=np.int32),
-       lambda: constant_op.constant([4, 5, 6], dtype=dtypes.int32), True),
-      ("SparseTensor", sparse_tensor.SparseTensorValue(
-          indices=[[0, 0], [1, 1]],
-          values=np.array([-1., 1.], dtype=np.float32), dense_shape=[2, 2]),
-       lambda: sparse_tensor.SparseTensor(
-           indices=[[0, 1], [1, 0]], values=[37.0, 42.0], dense_shape=[2, 2]),
-       False),
-      ("Nest", {"a": np.array([1, 2, 3], dtype=np.int32),
-                "b": sparse_tensor.SparseTensorValue(
-                    indices=[[0, 0], [1, 1]],
-                    values=np.array([-1., 1.], dtype=np.float32),
-                    dense_shape=[2, 2])},
-       lambda: {"a": constant_op.constant([4, 5, 6], dtype=dtypes.int32),
-                "b": sparse_tensor.SparseTensor(
-                    indices=[[0, 1], [1, 0]], values=[37.0, 42.0],
-                    dense_shape=[2, 2])}, False),
-  )
-  def testIteratorGetNextAsOptional(self, np_value, tf_value_fn, works_on_gpu):
-    if not works_on_gpu and test.is_gpu_available():
-      self.skipTest("Test case not yet supported on GPU.")
-    ds = dataset_ops.Dataset.from_tensors(np_value).repeat(3)
-    iterator = ds.make_initializable_iterator()
-    next_elem = iterator_ops.get_next_as_optional(iterator)
-    self.assertIsInstance(next_elem, optional_ops.Optional)
-    self.assertTrue(
-        next_elem.value_structure.is_compatible_with(
-            structure.Structure.from_value(tf_value_fn())))
-    elem_has_value_t = next_elem.has_value()
-    elem_value_t = next_elem.get_value()
-    with self.cached_session() as sess:
-      # Before initializing the iterator, evaluating the optional fails with
-      # a FailedPreconditionError.
-      with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(elem_has_value_t)
-      with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(elem_value_t)
-
-      # For each element of the dataset, assert that the optional evaluates to
-      # the expected value.
-      sess.run(iterator.initializer)
-      for _ in range(3):
-        elem_has_value, elem_value = sess.run([elem_has_value_t, elem_value_t])
-        self.assertTrue(elem_has_value)
-        self._assertElementValueEqual(np_value, elem_value)
-
-      # After exhausting the iterator, `next_elem.has_value()` will evaluate to
-      # false, and attempting to get the value will fail.
-      for _ in range(2):
-        self.assertFalse(sess.run(elem_has_value_t))
-        with self.assertRaises(errors.InvalidArgumentError):
-          sess.run(elem_value_t)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/optional_test.py b/tensorflow/python/data/kernel_tests/optional_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2c62e9423e6e082fd6fc42668e2827cc06246e1
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/optional_test.py
@@ -0,0 +1,366 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Optional`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import optional_ops
+from tensorflow.python.data.util import structure
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def testFromValue(self):
+    opt = optional_ops.Optional.from_value(constant_op.constant(37.0))
+    self.assertTrue(self.evaluate(opt.has_value()))
+    self.assertEqual(37.0, self.evaluate(opt.get_value()))
+
+  def testFromStructuredValue(self):
+    opt = optional_ops.Optional.from_value({
+        "a": constant_op.constant(37.0),
+        "b": (constant_op.constant(["Foo"]), constant_op.constant("Bar"))
+    })
+    self.assertTrue(self.evaluate(opt.has_value()))
+    self.assertEqual({
+        "a": 37.0,
+        "b": ([b"Foo"], b"Bar")
+    }, self.evaluate(opt.get_value()))
+
+  def testFromSparseTensor(self):
+    st_0 = sparse_tensor.SparseTensorValue(
+        indices=np.array([[0]]),
+        values=np.array([0], dtype=np.int64),
+        dense_shape=np.array([1]))
+    st_1 = sparse_tensor.SparseTensorValue(
+        indices=np.array([[0, 0], [1, 1]]),
+        values=np.array([-1., 1.], dtype=np.float32),
+        dense_shape=np.array([2, 2]))
+    opt = optional_ops.Optional.from_value((st_0, st_1))
+    self.assertTrue(self.evaluate(opt.has_value()))
+    val_0, val_1 = opt.get_value()
+    for expected, actual in [(st_0, val_0), (st_1, val_1)]:
+      self.assertAllEqual(expected.indices, self.evaluate(actual.indices))
+      self.assertAllEqual(expected.values, self.evaluate(actual.values))
+      self.assertAllEqual(expected.dense_shape,
+                          self.evaluate(actual.dense_shape))
+
+  @test_util.run_deprecated_v1
+  def testFromNone(self):
+    value_structure = structure.TensorStructure(dtypes.float32, [])
+    opt = optional_ops.Optional.none_from_structure(value_structure)
+    self.assertTrue(opt.value_structure.is_compatible_with(value_structure))
+    self.assertFalse(
+        opt.value_structure.is_compatible_with(
+            structure.TensorStructure(dtypes.float32, [1])))
+    self.assertFalse(
+        opt.value_structure.is_compatible_with(
+            structure.TensorStructure(dtypes.int32, [])))
+    self.assertFalse(self.evaluate(opt.has_value()))
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(opt.get_value())
+
+  def testAddN(self):
+    devices = ["/cpu:0"]
+    if test_util.is_gpu_available():
+      devices.append("/gpu:0")
+    for device in devices:
+      with ops.device(device):
+        # With value
+        opt1 = optional_ops.Optional.from_value((1.0, 2.0))
+        opt2 = optional_ops.Optional.from_value((3.0, 4.0))
+
+        add_tensor = math_ops.add_n([opt1._variant_tensor,
+                                     opt2._variant_tensor])
+        add_opt = optional_ops._OptionalImpl(add_tensor, opt1.value_structure)
+        self.assertAllEqual(self.evaluate(add_opt.get_value()), (4.0, 6.0))
+
+        # Without value
+        opt_none1 = optional_ops.Optional.none_from_structure(
+            opt1.value_structure)
+        opt_none2 = optional_ops.Optional.none_from_structure(
+            opt2.value_structure)
+        add_tensor = math_ops.add_n([opt_none1._variant_tensor,
+                                     opt_none2._variant_tensor])
+        add_opt = optional_ops._OptionalImpl(add_tensor,
+                                             opt_none1.value_structure)
+        self.assertFalse(self.evaluate(add_opt.has_value()))
+
+  def testNestedAddN(self):
+    devices = ["/cpu:0"]
+    if test_util.is_gpu_available():
+      devices.append("/gpu:0")
+    for device in devices:
+      with ops.device(device):
+        opt1 = optional_ops.Optional.from_value([1, 2.0])
+        opt2 = optional_ops.Optional.from_value([3, 4.0])
+        opt3 = optional_ops.Optional.from_value((5.0, opt1._variant_tensor))
+        opt4 = optional_ops.Optional.from_value((6.0, opt2._variant_tensor))
+
+        add_tensor = math_ops.add_n([opt3._variant_tensor,
+                                     opt4._variant_tensor])
+        add_opt = optional_ops._OptionalImpl(add_tensor, opt3.value_structure)
+        self.assertEqual(self.evaluate(add_opt.get_value()[0]), 11.0)
+
+        inner_add_opt = optional_ops._OptionalImpl(add_opt.get_value()[1],
+                                                   opt1.value_structure)
+        self.assertAllEqual(inner_add_opt.get_value(), [4, 6.0])
+
+  def testZerosLike(self):
+    devices = ["/cpu:0"]
+    if test_util.is_gpu_available():
+      devices.append("/gpu:0")
+    for device in devices:
+      with ops.device(device):
+        # With value
+        opt = optional_ops.Optional.from_value((1.0, 2.0))
+        zeros_tensor = array_ops.zeros_like(opt._variant_tensor)
+        zeros_opt = optional_ops._OptionalImpl(zeros_tensor,
+                                               opt.value_structure)
+        self.assertAllEqual(self.evaluate(zeros_opt.get_value()),
+                            (0.0, 0.0))
+
+        # Without value
+        opt_none = optional_ops.Optional.none_from_structure(
+            opt.value_structure)
+        zeros_tensor = array_ops.zeros_like(opt_none._variant_tensor)
+        zeros_opt = optional_ops._OptionalImpl(zeros_tensor,
+                                               opt_none.value_structure)
+        self.assertFalse(self.evaluate(zeros_opt.has_value()))
+
+  def testNestedZerosLike(self):
+    devices = ["/cpu:0"]
+    if test_util.is_gpu_available():
+      devices.append("/gpu:0")
+    for device in devices:
+      with ops.device(device):
+        opt1 = optional_ops.Optional.from_value(1.0)
+        opt2 = optional_ops.Optional.from_value(opt1._variant_tensor)
+
+        zeros_tensor = array_ops.zeros_like(opt2._variant_tensor)
+        zeros_opt = optional_ops._OptionalImpl(zeros_tensor,
+                                               opt2.value_structure)
+        inner_zeros_opt = optional_ops._OptionalImpl(zeros_opt.get_value(),
+                                                     opt1.value_structure)
+        self.assertEqual(self.evaluate(inner_zeros_opt.get_value()), 0.0)
+
+  def testCopyToGPU(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    with ops.device("/cpu:0"):
+      optional_with_value = optional_ops.Optional.from_value(
+          (constant_op.constant(37.0), constant_op.constant("Foo"),
+           constant_op.constant(42)))
+      optional_none = optional_ops.Optional.none_from_structure(
+          structure.TensorStructure(dtypes.float32, []))
+
+    with ops.device("/gpu:0"):
+      gpu_optional_with_value = optional_ops._OptionalImpl(
+          array_ops.identity(optional_with_value._variant_tensor),
+          optional_with_value.value_structure)
+      gpu_optional_none = optional_ops._OptionalImpl(
+          array_ops.identity(optional_none._variant_tensor),
+          optional_none.value_structure)
+
+      gpu_optional_with_value_has_value = gpu_optional_with_value.has_value()
+      gpu_optional_with_value_values = gpu_optional_with_value.get_value()
+
+      gpu_optional_none_has_value = gpu_optional_none.has_value()
+
+    self.assertTrue(self.evaluate(gpu_optional_with_value_has_value))
+    self.assertEqual((37.0, b"Foo", 42),
+                     self.evaluate(gpu_optional_with_value_values))
+    self.assertFalse(self.evaluate(gpu_optional_none_has_value))
+
+  def testNestedCopyToGPU(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    with ops.device("/cpu:0"):
+      optional_with_value = optional_ops.Optional.from_value(
+          (constant_op.constant(37.0), constant_op.constant("Foo"),
+           constant_op.constant(42)))
+      optional_none = optional_ops.Optional.none_from_structure(
+          structure.TensorStructure(dtypes.float32, []))
+      nested_optional = optional_ops.Optional.from_value(
+          (optional_with_value._variant_tensor, optional_none._variant_tensor,
+           1.0))
+
+    with ops.device("/gpu:0"):
+      gpu_nested_optional = optional_ops._OptionalImpl(
+          array_ops.identity(nested_optional._variant_tensor),
+          nested_optional.value_structure)
+
+      gpu_nested_optional_has_value = gpu_nested_optional.has_value()
+      gpu_nested_optional_values = gpu_nested_optional.get_value()
+
+    self.assertTrue(self.evaluate(gpu_nested_optional_has_value))
+
+    inner_with_value = optional_ops._OptionalImpl(
+        gpu_nested_optional_values[0], optional_with_value.value_structure)
+
+    inner_none = optional_ops._OptionalImpl(
+        gpu_nested_optional_values[1], optional_none.value_structure)
+
+    self.assertEqual((37.0, b"Foo", 42),
+                     self.evaluate(inner_with_value.get_value()))
+    self.assertFalse(self.evaluate(inner_none.has_value()))
+    self.assertEqual(1.0, self.evaluate(gpu_nested_optional_values[2]))
+
+  def _assertElementValueEqual(self, expected, actual):
+    if isinstance(expected, dict):
+      self.assertItemsEqual(list(expected.keys()), list(actual.keys()))
+      for k in expected.keys():
+        self._assertElementValueEqual(expected[k], actual[k])
+    elif isinstance(expected, sparse_tensor.SparseTensorValue):
+      self.assertAllEqual(expected.indices, actual.indices)
+      self.assertAllEqual(expected.values, actual.values)
+      self.assertAllEqual(expected.dense_shape, actual.dense_shape)
+    else:
+      self.assertAllEqual(expected, actual)
+
+  # pylint: disable=g-long-lambda
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant(37.0),
+       structure.TensorStructure(dtypes.float32, [])),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
+          indices=[[0]], values=constant_op.constant([0], dtype=dtypes.int32),
+          dense_shape=[1]),
+       structure.SparseTensorStructure(dtypes.int32, [1])),
+      ("Nest", lambda: {
+          "a": constant_op.constant(37.0),
+          "b": (constant_op.constant(["Foo"]), constant_op.constant("Bar"))},
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, []),
+           "b": (structure.TensorStructure(dtypes.string, [1]),
+                 structure.TensorStructure(dtypes.string, []))})),
+      ("Optional", lambda: optional_ops.Optional.from_value(37.0),
+       optional_ops.OptionalStructure(
+           structure.TensorStructure(dtypes.float32, []))),
+  )
+  @test_util.run_deprecated_v1
+  def testSkipEagerOptionalStructure(self, tf_value_fn,
+                                     expected_value_structure):
+    tf_value = tf_value_fn()
+    opt = optional_ops.Optional.from_value(tf_value)
+
+    self.assertTrue(
+        expected_value_structure.is_compatible_with(opt.value_structure))
+    self.assertTrue(
+        opt.value_structure.is_compatible_with(expected_value_structure))
+
+    opt_structure = structure.Structure.from_value(opt)
+    self.assertIsInstance(opt_structure, optional_ops.OptionalStructure)
+    self.assertTrue(opt_structure.is_compatible_with(opt_structure))
+    self.assertTrue(opt_structure._value_structure.is_compatible_with(
+        expected_value_structure))
+    self.assertEqual([dtypes.variant], opt_structure._flat_types)
+    self.assertEqual([tensor_shape.scalar()], opt_structure._flat_shapes)
+
+    # All OptionalStructure objects are not compatible with a non-optional
+    # value.
+    non_optional_structure = structure.Structure.from_value(
+        constant_op.constant(42.0))
+    self.assertFalse(opt_structure.is_compatible_with(non_optional_structure))
+
+    # Assert that the optional survives a round-trip via _from_tensor_list()
+    # and _to_tensor_list().
+    round_trip_opt = opt_structure._from_tensor_list(
+        opt_structure._to_tensor_list(opt))
+    if isinstance(tf_value, optional_ops.Optional):
+      self.assertEqual(
+          self.evaluate(tf_value.get_value()),
+          self.evaluate(round_trip_opt.get_value().get_value()))
+    else:
+      self.assertEqual(
+          self.evaluate(tf_value), self.evaluate(round_trip_opt.get_value()))
+
+  @parameterized.named_parameters(
+      ("Tensor", np.array([1, 2, 3], dtype=np.int32),
+       lambda: constant_op.constant([4, 5, 6], dtype=dtypes.int32), True),
+      ("SparseTensor", sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]],
+          values=np.array([-1., 1.], dtype=np.float32), dense_shape=[2, 2]),
+       lambda: sparse_tensor.SparseTensor(
+           indices=[[0, 1], [1, 0]], values=[37.0, 42.0], dense_shape=[2, 2]),
+       False),
+      ("Nest", {"a": np.array([1, 2, 3], dtype=np.int32),
+                "b": sparse_tensor.SparseTensorValue(
+                    indices=[[0, 0], [1, 1]],
+                    values=np.array([-1., 1.], dtype=np.float32),
+                    dense_shape=[2, 2])},
+       lambda: {"a": constant_op.constant([4, 5, 6], dtype=dtypes.int32),
+                "b": sparse_tensor.SparseTensor(
+                    indices=[[0, 1], [1, 0]], values=[37.0, 42.0],
+                    dense_shape=[2, 2])}, False),
+  )
+  @test_util.run_deprecated_v1
+  def testSkipEagerIteratorGetNextAsOptional(self, np_value, tf_value_fn,
+                                             works_on_gpu):
+    if not works_on_gpu and test.is_gpu_available():
+      self.skipTest("Test case not yet supported on GPU.")
+    ds = dataset_ops.Dataset.from_tensors(np_value).repeat(3)
+    iterator = ds.make_initializable_iterator()
+    next_elem = iterator_ops.get_next_as_optional(iterator)
+    self.assertIsInstance(next_elem, optional_ops.Optional)
+    self.assertTrue(
+        next_elem.value_structure.is_compatible_with(
+            structure.Structure.from_value(tf_value_fn())))
+    elem_has_value_t = next_elem.has_value()
+    elem_value_t = next_elem.get_value()
+    with self.cached_session() as sess:
+      # Before initializing the iterator, evaluating the optional fails with
+      # a FailedPreconditionError.
+      with self.assertRaises(errors.FailedPreconditionError):
+        sess.run(elem_has_value_t)
+      with self.assertRaises(errors.FailedPreconditionError):
+        sess.run(elem_value_t)
+
+      # For each element of the dataset, assert that the optional evaluates to
+      # the expected value.
+      sess.run(iterator.initializer)
+      for _ in range(3):
+        elem_has_value, elem_value = sess.run([elem_has_value_t, elem_value_t])
+        self.assertTrue(elem_has_value)
+        self._assertElementValueEqual(np_value, elem_value)
+
+      # After exhausting the iterator, `next_elem.has_value()` will evaluate to
+      # false, and attempting to get the value will fail.
+      for _ in range(2):
+        self.assertFalse(sess.run(elem_has_value_t))
+        with self.assertRaises(errors.InvalidArgumentError):
+          sess.run(elem_value_t)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/padded_batch_test.py b/tensorflow/python/data/kernel_tests/padded_batch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcfb2f507bf1a7d91041eb5f24c95c6de2c18362
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/padded_batch_test.py
@@ -0,0 +1,243 @@
+# -*- coding: utf-8 -*-
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.padded_batch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+def _random_seq_lens(count):
+  return np.random.randint(20, size=(count,)).astype(np.int32)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('default_padding', _random_seq_lens(32), 4, [-1], False),
+      ('constant_padding', _random_seq_lens(32), 4, [25], False),
+      ('uneven_with_remainder', _random_seq_lens(34), 4, [-1], False),
+      ('uneven_without_remainder', _random_seq_lens(34), 4, [-1], True),
+  )
+  def testPaddedBatchDataset(self, seq_lens, batch_size, padded_shapes,
+                             drop_remainder):
+    """Tests the padded batch dataset logic for various input configurations.
+
+    Args:
+      seq_lens: the input sequence lengths
+      batch_size: the batch size
+      padded_shapes: the padded shapes to use
+      drop_remainder: whether a smaller batch size should be produced if batch
+        size does not divide number of inputs evenly
+    """
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+        lambda x: array_ops.fill([x], x)).padded_batch(
+            batch_size=batch_size,
+            drop_remainder=drop_remainder,
+            padded_shapes=padded_shapes)
+
+    num_full_batches = len(seq_lens) // batch_size
+    get_next = self.getNext(dataset)
+    for i in range(num_full_batches):
+      result = self.evaluate(get_next())
+      padded_len = padded_shapes[0]
+      if padded_len is None or padded_len == -1:
+        padded_len = np.max(result) if result.size > 0 else 0
+      self.assertEqual((batch_size, padded_len), result.shape)
+      for j in range(batch_size):
+        seq_len = seq_lens[(i * batch_size) + j]
+        self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
+        self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
+
+    if not drop_remainder and len(seq_lens) % batch_size > 0:
+      result = self.evaluate(get_next())
+      padded_len = np.max(result) if result.size > 0 else 0
+      self.assertEqual((len(seq_lens) % batch_size, padded_len), result.shape)
+      for j in range(len(seq_lens) % batch_size):
+        seq_len = seq_lens[num_full_batches * batch_size + j]
+        self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
+        self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @test_util.run_deprecated_v1
+  def testPaddedBatchShortPadding(self):
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(
+            [6, 5, 5, 5, 5]).map(lambda x: array_ops.fill([x], x)).padded_batch(
+                batch_size=4, padded_shapes=[5]))
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.DataLossError, ''))
+
+  def testPaddedBatchEmptyTensors(self):
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(
+            [0, 0, 0, 0]).map(lambda x: array_ops.fill([x], x)).padded_batch(
+                batch_size=4, padded_shapes=[-1]))
+    self.assertDatasetProduces(dataset, expected_output=[[[], [], [], []]])
+
+  def testPaddedBatchDatasetNonDefaultPadding(self):
+
+    def fill_tuple(x):
+      filled = array_ops.fill([x], x)
+      return (filled, string_ops.as_string(filled))
+
+    random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(random_seq_lens).map(fill_tuple)
+        .padded_batch(
+            4, padded_shapes=([-1], [-1]), padding_values=(-1, '<end>')))
+
+    get_next = self.getNext(dataset)
+    for i in range(8):
+      result = self.evaluate(get_next())
+      padded_len = np.max(result[0])
+      self.assertEqual((4, padded_len), result[0].shape)
+      self.assertEqual((4, padded_len), result[1].shape)
+      for j in range(4):
+        seq_len = random_seq_lens[(i * 4) + j]
+        self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
+        self.assertAllEqual(result[0][j, seq_len:],
+                            [-1] * (padded_len - seq_len))
+        self.assertAllEqual(result[1][j, :seq_len],
+                            [compat.as_bytes(str(seq_len))] * seq_len)
+        self.assertAllEqual(result[1][j, seq_len:],
+                            [b'<end>'] * (padded_len - seq_len))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testPaddedBatchDatasetUnicode(self):
+    # See GitHub issue 16149
+    def generator():
+      data = [[u'Простой', u'тест', u'юникода'],
+              [u'никогда', u'не', u'бывает', u'простым']]
+
+      for seq in data:
+        yield seq, [0, 1, 2, 3]
+
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, (dtypes.string, dtypes.int32),
+        (tensor_shape.TensorShape([None]), tensor_shape.TensorShape([None])))
+    padded_dataset = dataset.padded_batch(
+        2, padded_shapes=([None], [None]), padding_values=('', 0))
+    next_element = self.getNext(padded_dataset)
+    self.evaluate(next_element())
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerPaddedBatchDatasetShapeSpecifications(self):
+    int_placeholder = array_ops.placeholder(dtypes.int32)
+    float_placeholder = array_ops.placeholder(dtypes.float32)
+    string_placeholder = array_ops.placeholder(dtypes.string)
+    input_dataset = dataset_ops.Dataset.from_tensors(
+        (int_placeholder, float_placeholder, string_placeholder))
+
+    # Test different ways of specifying the `padded_shapes` argument.
+    dynamic_padding_from_tensor_shapes = input_dataset.padded_batch(
+        32,
+        padded_shapes=(tensor_shape.TensorShape([None]),
+                       tensor_shape.TensorShape([None, None]),
+                       tensor_shape.TensorShape([37])))
+    dynamic_padding_from_lists = input_dataset.padded_batch(
+        32, padded_shapes=([None], [None, None], [37]))
+    dynamic_padding_from_lists_with_minus_one = input_dataset.padded_batch(
+        32, padded_shapes=([-1], [-1, -1], [37]))
+    dynamic_padding_from_tensors = input_dataset.padded_batch(
+        32,
+        padded_shapes=(constant_op.constant([-1], dtype=dtypes.int64),
+                       constant_op.constant([-1, -1], dtype=dtypes.int64),
+                       constant_op.constant([37], dtype=dtypes.int64)))
+
+    for dataset in [
+        dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
+        dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
+    ]:
+      self.assertEqual([None, None], dataset.output_shapes[0].as_list())
+      self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
+      self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
+
+  def testPaddedBatchSparseError(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+
+    with self.assertRaises(TypeError):
+      _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
+
+  def testPaddedBatchShapeError(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(1,\) is not compatible with the '
+        r'corresponding input component shape \(\).'):
+      _ = dataset_ops.Dataset.range(10).padded_batch(5, padded_shapes=[1])
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(1,\) is not compatible with the '
+        r'corresponding input component shape \(3,\).'):
+      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
+          5, padded_shapes=[1])
+
+    with self.assertRaisesRegexp(
+        ValueError, r'Padded shape .* must be a 1-D tensor '
+        r'of tf.int64 values, but its shape was \(2, 2\).'):
+      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
+          5, padded_shapes=[[1, 1], [1, 1]])
+
+    with self.assertRaisesRegexp(
+        TypeError, r'Padded shape .* must be a 1-D tensor '
+        r'of tf.int64 values, but its element type was float32.'):
+      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
+          5, padded_shapes=constant_op.constant([1., 2., 3.]))
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(1,\) is not compatible with the '
+        r'corresponding input component shape \(\).'):
+      shape_as_tensor = constant_op.constant([1], dtype=dtypes.int64)
+      _ = dataset_ops.Dataset.range(10).padded_batch(
+          5, padded_shapes=shape_as_tensor)
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerPaddedBatchShapeError(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'The padded shape \((\?|None), (\?|None)\) is not compatible with the '
+        r'corresponding input component shape \(\).'):
+      shape_as_tensor = array_ops.placeholder(dtypes.int64, shape=[2])
+      _ = dataset_ops.Dataset.range(10).padded_batch(
+          5, padded_shapes=shape_as_tensor)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
deleted file mode 100644
index 76e2697b29d368f5607c827fe32d017fbefd5ecd..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test PrefetchDataset."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-class PrefetchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @parameterized.parameters((-1), (0), (5))
-  def testBufferSize(self, buffer_size):
-    buffer_size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(10).prefetch(
-        buffer_size=buffer_size_t).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={buffer_size_t: buffer_size})
-      for m in range(10):
-        self.assertEqual(m, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @parameterized.parameters((-2), (-42))
-  def testInvalidBufferSize(self, buffer_size):
-    buffer_size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(10).prefetch(
-        buffer_size=buffer_size_t).make_initializable_iterator()
-    init_op = iterator.initializer
-
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "buffer_size"):
-      with self.cached_session() as sess:
-        sess.run(init_op, feed_dict={buffer_size_t: buffer_size})
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/prefetch_test.py b/tensorflow/python/data/kernel_tests/prefetch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a143ba0ac63d42667faa4cfdee6fa74cf0a82f57
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/prefetch_test.py
@@ -0,0 +1,44 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.prefetch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PrefetchTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.parameters((-1), (0), (5))
+  def testBufferSize(self, buffer_size):
+    dataset = dataset_ops.Dataset.range(10).prefetch(buffer_size=buffer_size)
+    self.assertDatasetProduces(dataset, expected_output=range(10))
+
+  @parameterized.parameters((-2), (-42))
+  def testInvalidBufferSize(self, buffer_size):
+    dataset = dataset_ops.Dataset.range(10).prefetch(buffer_size=buffer_size)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, "buffer_size"))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
deleted file mode 100644
index b7e2a5f615ea970525c0aa3138ac3567bd5c70bc..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
+++ /dev/null
@@ -1,487 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test RangeDataset."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-
-
-class RangeDatasetTest(test_base.DatasetTestBase):
-
-  def tearDown(self):
-    # Remove all checkpoint files.
-    prefix = self._iterator_checkpoint_prefix()
-    pattern = prefix + "*"
-    files = gfile.Glob(pattern)
-    map(gfile.Remove, files)
-
-  def testStop(self):
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={stop: 5})
-      for i in range(5):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStartStop(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start,
-                                         stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 5})
-      for i in range(2, 5):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStartStopStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 10, step: 2})
-      for i in range(2, 10, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testZeroStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={start: 2, stop: 10, step: 0})
-
-  def testNegativeStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 2, stop: 10, step: -1})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(2, 10, -1):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStopLessThanStart(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start,
-                                         stop).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(10, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStopLessThanStartWithPositiveStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2, step: 2})
-      # This for loop is a no-op but will ensure that the implementation is
-      # consistent with range if it ever changes.
-      for i in range(10, 2, 2):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testStopLessThanStartWithNegativeStep(self):
-    start = array_ops.placeholder(dtypes.int64, shape=[])
-    stop = array_ops.placeholder(dtypes.int64, shape=[])
-    step = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(start, stop,
-                                         step).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={start: 10, stop: 2, step: -1})
-      for i in range(10, 2, -1):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def _iterator_checkpoint_prefix(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def _save_op(self, iterator_resource):
-    iterator_state_variant = gen_dataset_ops.serialize_iterator(
-        iterator_resource)
-    save_op = io_ops.write_file(
-        self._iterator_checkpoint_prefix(),
-        parsing_ops.serialize_tensor(iterator_state_variant))
-    return save_op
-
-  def _restore_op(self, iterator_resource):
-    iterator_state_variant = parsing_ops.parse_tensor(
-        io_ops.read_file(self._iterator_checkpoint_prefix()), dtypes.variant)
-    restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
-                                                      iterator_state_variant)
-    return restore_op
-
-  def testSaveRestore(self):
-
-    def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    # Saving and restoring in different sessions.
-    start = 2
-    stop = 10
-    break_point = 5
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, _, restore_op = _build_graph(start, stop)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-    # Saving and restoring in same session.
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, restore_op = _build_graph(start, stop)
-      with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-        sess.run(restore_op)
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testRestoreWithoutBuildingDatasetGraph(self):
-
-    def _build_graph(start, stop, num_epochs):
-      dataset = dataset_ops.Dataset.range(start, stop).repeat(num_epochs)
-      iterator = dataset.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    # Saving and restoring in different sessions.
-    start = 2
-    stop = 10
-    num_epochs = 5
-    break_point = 5
-    break_epoch = 3
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, _ = _build_graph(start, stop, num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for _ in range(break_epoch):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      # Create an empty IteratorResource and restore the Iterator into it.
-      output_types = dtypes.int64
-      output_shapes = tensor_shape.scalar()
-      iterator = iterator_ops.Iterator.from_structure(output_types,
-                                                      output_shapes)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      get_next = iterator.get_next()
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        for _ in range(break_epoch + 1, num_epochs):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testRestoreInModifiedGraph(self):
-
-    def _build_graph(start, stop):
-      dataset = dataset_ops.Dataset.range(start, stop)
-      iterator = dataset.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    # Saving and restoring in different sessions.
-    start = 2
-    stop = 10
-    stop_1 = 8
-    break_point = 5
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      # Intentionally build a graph with a different value for stop to make sure
-      # the original dataset graph is actually getting loaded.
-      init_op, get_next, _, restore_op = _build_graph(start, stop_1)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testInitThenRestore(self):
-    # Note: Calling init_op before restore_op is redundant. This test just makes
-    # sure we do not fail if restore is called on an already initialized
-    # iterator resource.
-
-    def _build_graph(start, stop):
-      dataset = dataset_ops.Dataset.range(start, stop)
-      iterator = dataset.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    # Saving and restoring in different sessions.
-    start = 2
-    stop = 10
-    break_point = 5
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, _, restore_op = _build_graph(start, stop)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
-        for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testMultipleSaves(self):
-
-    def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    start = 2
-    stop = 10
-    break_point1 = 5
-    break_point2 = 7
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, _ = _build_graph(start, stop)
-      with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        for i in range(start, break_point1):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, restore_op = _build_graph(start, stop)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_point1, break_point2):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    break_point2 = 7
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, restore_op = _build_graph(start, stop)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_point2, stop):
-          self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testSaveRestoreWithRepeat(self):
-
-    def _build_graph(start, stop, num_epochs):
-      iterator = dataset_ops.Dataset.range(
-          start, stop).repeat(num_epochs).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    start = 2
-    stop = 10
-    num_epochs = 5
-    break_range = 5
-    break_epoch = 3
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, restore_op = _build_graph(
-          start, stop, num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for _ in range(break_epoch - 1):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        for i in range(start, break_range):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for i in range(break_range, stop):
-          self.assertEqual(i, sess.run(get_next))
-        for _ in range(break_epoch, num_epochs):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testSaveRestoreExhaustedIterator(self):
-
-    def _build_graph(start, stop, num_epochs):
-      iterator = dataset_ops.Dataset.range(
-          start, stop).repeat(num_epochs).make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      save_op = self._save_op(iterator._iterator_resource)
-      restore_op = self._restore_op(iterator._iterator_resource)
-      return init_op, get_next, save_op, restore_op
-
-    start = 2
-    stop = 10
-    num_epochs = 5
-    with ops.Graph().as_default() as g:
-      init_op, get_next, save_op, restore_op = _build_graph(
-          start, stop, num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for _ in range(num_epochs):
-          for i in range(start, stop):
-            self.assertEqual(i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, _, restore_op = _build_graph(start, stop, num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/range_test.py b/tensorflow/python/data/kernel_tests/range_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f5d25e7f3959eed70754db827052a91fd224dbc
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/range_test.py
@@ -0,0 +1,72 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.range()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RangeTest(test_base.DatasetTestBase):
+
+  def testStop(self):
+    dataset = dataset_ops.Dataset.range(5)
+    self.assertDatasetProduces(dataset, expected_output=range(5))
+
+  def testStartStop(self):
+    start, stop = 2, 5
+    dataset = dataset_ops.Dataset.range(start, stop)
+    self.assertDatasetProduces(dataset, expected_output=range(2, 5))
+
+  def testStartStopStep(self):
+    start, stop, step = 2, 10, 2
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(2, 10, 2))
+
+  def testZeroStep(self):
+    start, stop, step = 2, 10, 0
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, ""))
+
+  def testNegativeStep(self):
+    start, stop, step = 2, 10, -1
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(2, 10, -1))
+
+  def testStopLessThanStart(self):
+    start, stop = 10, 2
+    dataset = dataset_ops.Dataset.range(start, stop)
+    self.assertDatasetProduces(dataset, expected_output=range(10, 2))
+
+  def testStopLessThanStartWithPositiveStep(self):
+    start, stop, step = 10, 2, 2
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(10, 2, 2))
+
+  def testStopLessThanStartWithNegativeStep(self):
+    start, stop, step = 10, 2, -1
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(10, 2, -1))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
deleted file mode 100644
index aef2dd1d9c6a4fc5094bbe79e4714022633c2ed2..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
+++ /dev/null
@@ -1,817 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gzip
-import os
-import zlib
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.lib.io import python_io
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-try:
-  import psutil  # pylint: disable=g-import-not-at-top
-  psutil_import_succeeded = True
-except ImportError:
-  psutil_import_succeeded = False
-
-
-class TextLineDatasetTest(test_base.DatasetTestBase):
-
-  def _lineText(self, f, l):
-    return compat.as_bytes("%d: %d" % (f, l))
-
-  def _createFiles(self,
-                   num_files,
-                   num_lines,
-                   crlf=False,
-                   compression_type=None):
-    filenames = []
-    for i in range(num_files):
-      fn = os.path.join(self.get_temp_dir(), "text_line.%d.txt" % i)
-      filenames.append(fn)
-      contents = []
-      for j in range(num_lines):
-        contents.append(self._lineText(i, j))
-        # Always include a newline after the record unless it is
-        # at the end of the file, in which case we include it
-        if j + 1 != num_lines or i == 0:
-          contents.append(b"\r\n" if crlf else b"\n")
-      contents = b"".join(contents)
-
-      if not compression_type:
-        with open(fn, "wb") as f:
-          f.write(contents)
-      elif compression_type == "GZIP":
-        with gzip.GzipFile(fn, "wb") as f:
-          f.write(contents)
-      elif compression_type == "ZLIB":
-        contents = zlib.compress(contents)
-        with open(fn, "wb") as f:
-          f.write(contents)
-      else:
-        raise ValueError("Unsupported compression_type", compression_type)
-
-    return filenames
-
-  def _testTextLineDataset(self, compression_type=None):
-    test_filenames = self._createFiles(
-        2, 5, crlf=True, compression_type=compression_type)
-    filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = readers.TextLineDataset(
-        filenames, compression_type=compression_type).repeat(num_epochs)
-    batch_dataset = repeat_dataset.batch(batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    init_op = iterator.make_initializer(repeat_dataset)
-    init_batch_op = iterator.make_initializer(batch_dataset)
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[0]],
-                              num_epochs: 1})
-      for i in range(5):
-        self.assertEqual(self._lineText(0, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[1]],
-                              num_epochs: 1})
-      for i in range(5):
-        self.assertEqual(self._lineText(1, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
-      for j in range(2):
-        for i in range(5):
-          self.assertEqual(self._lineText(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test repeated iteration through both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 10})
-      for _ in range(10):
-        for j in range(2):
-          for i in range(5):
-            self.assertEqual(self._lineText(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test batched and repeated iteration through both files.
-      sess.run(
-          init_batch_op,
-          feed_dict={filenames: test_filenames,
-                     num_epochs: 10,
-                     batch_size: 5})
-      for _ in range(10):
-        self.assertAllEqual([self._lineText(0, i) for i in range(5)],
-                            sess.run(get_next))
-        self.assertAllEqual([self._lineText(1, i) for i in range(5)],
-                            sess.run(get_next))
-
-  def testTextLineDatasetNoCompression(self):
-    self._testTextLineDataset()
-
-  def testTextLineDatasetGzipCompression(self):
-    self._testTextLineDataset(compression_type="GZIP")
-
-  def testTextLineDatasetZlibCompression(self):
-    self._testTextLineDataset(compression_type="ZLIB")
-
-  def testTextLineDatasetBuffering(self):
-    test_filenames = self._createFiles(2, 5, crlf=True)
-
-    repeat_dataset = readers.TextLineDataset(test_filenames, buffer_size=10)
-    iterator = repeat_dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      for j in range(2):
-        for i in range(5):
-          self.assertEqual(self._lineText(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testIteratorResourceCleanup(self):
-    filename = os.path.join(self.get_temp_dir(), "text.txt")
-    with open(filename, "wt") as f:
-      for i in range(3):
-        f.write("%d\n" % (i,))
-    with context.eager_mode():
-      first_iterator = iter(readers.TextLineDataset(filename))
-      self.assertEqual(b"0", next(first_iterator).numpy())
-      second_iterator = iter(readers.TextLineDataset(filename))
-      self.assertEqual(b"0", next(second_iterator).numpy())
-      # Eager kernel caching is based on op attributes, which includes the
-      # Dataset's output shape. Create a different kernel to test that they
-      # don't create resources with the same names.
-      different_kernel_iterator = iter(
-          readers.TextLineDataset(filename).repeat().batch(16))
-      self.assertEqual([16], next(different_kernel_iterator).shape)
-      # Remove our references to the Python Iterator objects, which (assuming no
-      # reference cycles) is enough to trigger DestroyResourceOp and close the
-      # partially-read files.
-      del first_iterator
-      del second_iterator
-      del different_kernel_iterator
-      if not psutil_import_succeeded:
-        self.skipTest(
-            "psutil is required to check that we've closed our files.")
-      open_files = psutil.Process().open_files()
-      self.assertNotIn(filename, [open_file.path for open_file in open_files])
-
-
-class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    super(FixedLengthRecordReaderTest, self).setUp()
-    self._num_files = 2
-    self._num_records = 7
-    self._header_bytes = 5
-    self._record_bytes = 3
-    self._footer_bytes = 2
-
-  def _record(self, f, r):
-    return compat.as_bytes(str(f * 2 + r) * self._record_bytes)
-
-  def _createFiles(self):
-    filenames = []
-    for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
-      filenames.append(fn)
-      with open(fn, "wb") as f:
-        f.write(b"H" * self._header_bytes)
-        for j in range(self._num_records):
-          f.write(self._record(i, j))
-        f.write(b"F" * self._footer_bytes)
-    return filenames
-
-  def testFixedLengthRecordDataset(self):
-    test_filenames = self._createFiles()
-    filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = (readers.FixedLengthRecordDataset(
-        filenames, self._record_bytes, self._header_bytes, self._footer_bytes)
-                      .repeat(num_epochs))
-    batch_dataset = repeat_dataset.batch(batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    init_op = iterator.make_initializer(repeat_dataset)
-    init_batch_op = iterator.make_initializer(batch_dataset)
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[0]],
-                              num_epochs: 1})
-      for i in range(self._num_records):
-        self.assertEqual(self._record(0, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[1]],
-                              num_epochs: 1})
-      for i in range(self._num_records):
-        self.assertEqual(self._record(1, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertEqual(self._record(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test repeated iteration through both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 10})
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            self.assertEqual(self._record(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test batched and repeated iteration through both files.
-      sess.run(
-          init_batch_op,
-          feed_dict={
-              filenames: test_filenames,
-              num_epochs: 10,
-              batch_size: self._num_records
-          })
-      for _ in range(10):
-        for j in range(self._num_files):
-          self.assertAllEqual(
-              [self._record(j, i) for i in range(self._num_records)],
-              sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFixedLengthRecordDatasetBuffering(self):
-    test_filenames = self._createFiles()
-    dataset = readers.FixedLengthRecordDataset(
-        test_filenames,
-        self._record_bytes,
-        self._header_bytes,
-        self._footer_bytes,
-        buffer_size=10)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertEqual(self._record(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testFixedLengthRecordDatasetWrongSize(self):
-    test_filenames = self._createFiles()
-    dataset = readers.FixedLengthRecordDataset(
-        test_filenames,
-        self._record_bytes + 1,  # Incorrect record length.
-        self._header_bytes,
-        self._footer_bytes,
-        buffer_size=10)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r"Excluding the header \(5 bytes\) and footer \(2 bytes\), input "
-          r"file \".*fixed_length_record.0.txt\" has body length 21 bytes, "
-          r"which is not an exact multiple of the record length \(4 bytes\)."):
-        sess.run(iterator.get_next())
-
-  def _iterator_checkpoint_path(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def _save_op(self, iterator_resource):
-    iterator_state_variant = gen_dataset_ops.serialize_iterator(
-        iterator_resource)
-    save_op = io_ops.write_file(
-        self._iterator_checkpoint_path(),
-        parsing_ops.serialize_tensor(iterator_state_variant))
-    return save_op
-
-  def _restore_op(self, iterator_resource):
-    iterator_state_variant = parsing_ops.parse_tensor(
-        io_ops.read_file(self._iterator_checkpoint_path()), dtypes.variant)
-    restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
-                                                      iterator_state_variant)
-    return restore_op
-
-  def _build_iterator_graph(self, num_epochs):
-    filenames = self._createFiles()
-    dataset = (readers.FixedLengthRecordDataset(
-        filenames, self._record_bytes, self._header_bytes, self._footer_bytes)
-               .repeat(num_epochs))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next_op = iterator.get_next()
-    save_op = self._save_op(iterator._iterator_resource)
-    restore_op = self._restore_op(iterator._iterator_resource)
-    return init_op, get_next_op, save_op, restore_op
-
-  def _restore_iterator(self):
-    output_types = dtypes.string
-    output_shapes = tensor_shape.scalar()
-    iterator = iterator_ops.Iterator.from_structure(output_types, output_shapes)
-    get_next = iterator.get_next()
-    restore_op = self._restore_op(iterator._iterator_resource)
-    return restore_op, get_next
-
-  def testSaveRestore(self):
-    num_epochs = 10
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testInitThenRestore(self):
-    # Note: Calling init_op before restore_op is redundant. This test just makes
-    # sure we do not fail if restore is called on an already initialized
-    # iterator resource.
-    num_epochs = 10
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreInModifiedGraph(self):
-    num_epochs = 10
-    num_epochs_1 = 20
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs_1)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreWithoutBuildingDatasetGraph(self):
-    num_epochs = 10
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      restore_op, get_next_op = self._restore_iterator()
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreUnusedIterator(self):
-    num_epochs = 10
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        # Save unused iterator.
-        sess.run(save_op)
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for _ in range(num_epochs * self._num_files * self._num_records):
-          sess.run(get_next_op)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreExhaustedIterator(self):
-    num_epochs = 10
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for _ in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-
-class TFRecordDatasetTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    super(TFRecordDatasetTest, self).setUp()
-    self._num_files = 2
-    self._num_records = 7
-
-    self.test_filenames = self._createFiles()
-
-    self.filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    self.num_epochs = array_ops.placeholder_with_default(
-        constant_op.constant(1, dtypes.int64), shape=[])
-    self.compression_type = array_ops.placeholder_with_default("", shape=[])
-    self.batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = readers.TFRecordDataset(self.filenames,
-                                             self.compression_type).repeat(
-                                                 self.num_epochs)
-    batch_dataset = repeat_dataset.batch(self.batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    self.init_op = iterator.make_initializer(repeat_dataset)
-    self.init_batch_op = iterator.make_initializer(batch_dataset)
-    self.get_next = iterator.get_next()
-
-  def _record(self, f, r):
-    return compat.as_bytes("Record %d of file %d" % (r, f))
-
-  def _createFiles(self):
-    filenames = []
-    for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
-      filenames.append(fn)
-      writer = python_io.TFRecordWriter(fn)
-      for j in range(self._num_records):
-        writer.write(self._record(i, j))
-      writer.close()
-    return filenames
-
-  def testReadOneEpoch(self):
-    with self.cached_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.filenames: [self.test_filenames[0]],
-              self.num_epochs: 1
-          })
-      for i in range(self._num_records):
-        self.assertAllEqual(self._record(0, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.filenames: [self.test_filenames[1]],
-              self.num_epochs: 1
-          })
-      for i in range(self._num_records):
-        self.assertAllEqual(self._record(1, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-      # Basic test: read from both files.
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: self.test_filenames,
-                     self.num_epochs: 1})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadTenEpochs(self):
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: self.test_filenames,
-                     self.num_epochs: 10})
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadTenEpochsOfBatches(self):
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_batch_op,
-          feed_dict={
-              self.filenames: self.test_filenames,
-              self.num_epochs: 10,
-              self.batch_size: self._num_records
-          })
-      for _ in range(10):
-        for j in range(self._num_files):
-          values = sess.run(self.get_next)
-          self.assertAllEqual(
-              [self._record(j, i) for i in range(self._num_records)], values)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadZlibFiles(self):
-    zlib_files = []
-    for i, fn in enumerate(self.test_filenames):
-      with open(fn, "rb") as f:
-        cdata = zlib.compress(f.read())
-
-        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
-        with open(zfn, "wb") as f:
-          f.write(cdata)
-        zlib_files.append(zfn)
-
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: zlib_files,
-                     self.compression_type: "ZLIB"})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadGzipFiles(self):
-    gzip_files = []
-    for i, fn in enumerate(self.test_filenames):
-      with open(fn, "rb") as f:
-        gzfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
-        with gzip.GzipFile(gzfn, "wb") as gzf:
-          gzf.write(f.read())
-        gzip_files.append(gzfn)
-
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: gzip_files,
-                     self.compression_type: "GZIP"})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadWithBuffer(self):
-    one_mebibyte = 2**20
-    d = readers.TFRecordDataset(self.test_filenames, buffer_size=one_mebibyte)
-    iterator = d.make_one_shot_iterator()
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testReadFromDatasetOfFiles(self):
-    files = dataset_ops.Dataset.from_tensor_slices(self.test_filenames)
-    d = readers.TFRecordDataset(files)
-    iterator = d.make_one_shot_iterator()
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testReadTenEpochsFromDatasetOfFilesInParallel(self):
-    files = dataset_ops.Dataset.from_tensor_slices(
-        self.test_filenames).repeat(10)
-    d = readers.TFRecordDataset(files, num_parallel_reads=4)
-    iterator = d.make_one_shot_iterator()
-    next_element = iterator.get_next()
-    expected = []
-    actual = []
-    with self.cached_session() as sess:
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            expected.append(self._record(j, i))
-            actual.append(sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self.assertEqual(sorted(expected), sorted(actual))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py b/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py
deleted file mode 100644
index 11e07300b9716d60d0d96587018dd63dce3f9d24..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  def testSum(self):
-    for i in range(10):
-      ds = dataset_ops.Dataset.range(1, i + 1)
-      result = ds.reduce(np.int64(0), lambda x, y: x + y)
-      with self.cached_session() as sess:
-        self.assertEqual(((i + 1) * i) // 2, sess.run(result))
-
-  def testSumTuple(self):
-
-    def reduce_fn(state, value):
-      v1, v2 = value
-      return state + v1 + v2
-
-    for i in range(10):
-      ds = dataset_ops.Dataset.range(1, i + 1)
-      ds = dataset_ops.Dataset.zip((ds, ds))
-      result = ds.reduce(np.int64(0), reduce_fn)
-      with self.cached_session() as sess:
-        self.assertEqual(((i + 1) * i), sess.run(result))
-
-  def testSumAndCount(self):
-
-    def reduce_fn(state, value):
-      s, c = state
-      return s + value, c + 1
-
-    for i in range(10):
-      ds = dataset_ops.Dataset.range(1, i + 1)
-      result = ds.reduce((np.int64(0), np.int64(0)), reduce_fn)
-      with self.cached_session() as sess:
-        s, c = sess.run(result)
-        self.assertEqual(((i + 1) * i) // 2, s)
-        self.assertEqual(i, c)
-
-  def testSquareUsingPlaceholder(self):
-    delta = array_ops.placeholder(dtype=dtypes.int64)
-
-    def reduce_fn(state, _):
-      return state + delta
-
-    for i in range(10):
-      ds = dataset_ops.Dataset.range(1, i + 1)
-      result = ds.reduce(np.int64(0), reduce_fn)
-      with self.cached_session() as sess:
-        square = sess.run(result, feed_dict={delta: i})
-        self.assertEqual(i * i, square)
-
-  def testSparse(self):
-
-    def reduce_fn(_, value):
-      return value
-
-    def make_sparse_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1]))
-
-    for i in range(10):
-      ds = dataset_ops.Dataset.from_tensors(make_sparse_fn(i+1))
-      result = ds.reduce(make_sparse_fn(0), reduce_fn)
-      with self.cached_session() as sess:
-        self.assertSparseValuesEqual(make_sparse_fn(i+1), sess.run(result))
-
-  def testNested(self):
-
-    def reduce_fn(state, value):
-      state["dense"] += value["dense"]
-      state["sparse"] = value["sparse"]
-      return state
-
-    def make_sparse_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1]))
-
-    def map_fn(i):
-      return {"dense": math_ops.cast(i, dtype=dtypes.int64),
-              "sparse": make_sparse_fn(math_ops.cast(i, dtype=dtypes.int64))}
-
-    for i in range(10):
-      ds = dataset_ops.Dataset.range(1, i + 1).map(map_fn)
-      result = ds.reduce(map_fn(0), reduce_fn)
-      with self.cached_session() as sess:
-        result = sess.run(result)
-        self.assertEqual(((i + 1) * i) // 2, result["dense"])
-        self.assertSparseValuesEqual(make_sparse_fn(i), result["sparse"])
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/reduce_test.py b/tensorflow/python/data/kernel_tests/reduce_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..14bbc0bf72caa07445ca7d077845e2bc4569cc01
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/reduce_test.py
@@ -0,0 +1,127 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.reduce()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ReduceTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def testSum(self):
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1)
+      result = ds.reduce(
+          constant_op.constant(0, dtype=dtypes.int64), lambda x, y: x + y)
+      self.assertEqual(((i + 1) * i) // 2, self.evaluate(result))
+
+  def testSumTuple(self):
+
+    def reduce_fn(state, value):
+      v1, v2 = value
+      return state + v1 + v2
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1)
+      ds = dataset_ops.Dataset.zip((ds, ds))
+      result = ds.reduce(constant_op.constant(0, dtype=dtypes.int64), reduce_fn)
+      self.assertEqual(((i + 1) * i), self.evaluate(result))
+
+  def testSumAndCount(self):
+
+    def reduce_fn(state, value):
+      s, c = state
+      return s + value, c + 1
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1)
+      result = ds.reduce((constant_op.constant(0, dtype=dtypes.int64),
+                          constant_op.constant(0, dtype=dtypes.int64)),
+                         reduce_fn)
+      s, c = self.evaluate(result)
+      self.assertEqual(((i + 1) * i) // 2, s)
+      self.assertEqual(i, c)
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSquareUsingPlaceholder(self):
+    delta = array_ops.placeholder(dtype=dtypes.int64)
+
+    def reduce_fn(state, _):
+      return state + delta
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1)
+      result = ds.reduce(np.int64(0), reduce_fn)
+      with self.cached_session() as sess:
+        square = sess.run(result, feed_dict={delta: i})
+        self.assertEqual(i * i, square)
+
+  def testSparse(self):
+
+    def reduce_fn(_, value):
+      return value
+
+    def make_sparse_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.from_tensors(make_sparse_fn(i+1))
+      result = ds.reduce(make_sparse_fn(0), reduce_fn)
+      self.assertSparseValuesEqual(make_sparse_fn(i + 1), self.evaluate(result))
+
+  def testNested(self):
+
+    def reduce_fn(state, value):
+      state["dense"] += value["dense"]
+      state["sparse"] = value["sparse"]
+      return state
+
+    def make_sparse_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    def map_fn(i):
+      return {"dense": math_ops.cast(i, dtype=dtypes.int64),
+              "sparse": make_sparse_fn(math_ops.cast(i, dtype=dtypes.int64))}
+
+    for i in range(10):
+      ds = dataset_ops.Dataset.range(1, i + 1).map(map_fn)
+      result = ds.reduce(map_fn(0), reduce_fn)
+      result = self.evaluate(result)
+      self.assertEqual(((i + 1) * i) // 2, result["dense"])
+      self.assertSparseValuesEqual(make_sparse_fn(i), result["sparse"])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/repeat_test.py b/tensorflow/python/data/kernel_tests/repeat_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ef2fc1bfc8fb139cb855305f4e4f2ec70221ce2
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/repeat_test.py
@@ -0,0 +1,84 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.repeat()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RepeatTest(test_base.DatasetTestBase):
+
+  def testRepeatTensorDataset(self):
+    """Test a dataset that repeats its input multiple times."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
+    # This placeholder can be fed when dataset-definition subgraph
+    # runs (i.e. `init_op` below) to configure the number of
+    # repetitions used in a particular iterator.
+
+    def do_test(count):
+      dataset = dataset_ops.Dataset.from_tensors(components).repeat(count)
+      self.assertEqual([c.shape for c in components],
+                       [shape for shape in dataset.output_shapes])
+      self.assertDatasetProduces(dataset, [components] * count)
+
+    # Test a finite repetition.
+    do_test(3)
+
+    # test a different finite repetition.
+    do_test(7)
+
+    # Test an empty repetition.
+    do_test(0)
+
+    # Test an infinite repetition.
+    # NOTE(mrry): There's not a good way to test that the sequence
+    # actually is infinite.
+    dataset = dataset_ops.Dataset.from_tensors(components).repeat(-1)
+    self.assertEqual([c.shape for c in components],
+                     [shape for shape in dataset.output_shapes])
+    get_next = self.getNext(dataset)
+    for _ in range(17):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(components, results):
+        self.assertAllEqual(component, result_component)
+
+  def testRepeatRepeatTensorDataset(self):
+    """Test the composition of repeat datasets."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
+    inner_count, outer_count = 7, 14
+
+    dataset = dataset_ops.Dataset.from_tensors(components).repeat(
+        inner_count).repeat(outer_count)
+    self.assertEqual([c.shape for c in components],
+                     [shape for shape in dataset.output_shapes])
+    self.assertDatasetProduces(dataset,
+                               [components] * (inner_count * outer_count))
+
+  def testRepeatEmptyDataset(self):
+    """Test that repeating an empty dataset does not hang."""
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(10).skip(10).repeat(-1)
+    self.assertDatasetProduces(dataset, [])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py b/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
deleted file mode 100644
index e86356dee7c63e062c9dfe945246a0461c3e6526..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-class SequenceDatasetTest(test_base.DatasetTestBase):
-
-  def testRepeatTensorDataset(self):
-    """Test a dataset that repeats its input multiple times."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    # This placeholder can be fed when dataset-definition subgraph
-    # runs (i.e. `init_op` below) to configure the number of
-    # repetitions used in a particular iterator.
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .repeat(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      # Test a finite repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 3})
-      for _ in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test a different finite repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 7})
-      for _ in range(7):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test an empty repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test an infinite repetition.
-      # NOTE(mrry): There's not a good way to test that the sequence
-      # actually is infinite.
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      for _ in range(17):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-
-  def testTakeTensorDataset(self):
-    components = (np.arange(10),)
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .take(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      # Take fewer than input size
-      sess.run(init_op, feed_dict={count_placeholder: 4})
-      for i in range(4):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take more than input size
-      sess.run(init_op, feed_dict={count_placeholder: 25})
-      for i in range(10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take all of input
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      for i in range(10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take nothing
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSkipTensorDataset(self):
-    components = (np.arange(10),)
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .skip(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      # Skip fewer than input size, we should skip
-      # the first 4 elements and then read the rest.
-      sess.run(init_op, feed_dict={count_placeholder: 4})
-      for i in range(4, 10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip more than input size: get nothing.
-      sess.run(init_op, feed_dict={count_placeholder: 25})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip exactly input size.
-      sess.run(init_op, feed_dict={count_placeholder: 10})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Set -1 for 'count': skip the entire dataset.
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip nothing
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-      for i in range(0, 10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testRepeatRepeatTensorDataset(self):
-    """Test the composition of repeat datasets."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    inner_count = array_ops.placeholder(dtypes.int64, shape=[])
-    outer_count = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensors(components).repeat(inner_count)
-                .repeat(outer_count).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={inner_count: 7, outer_count: 14})
-      for _ in range(7 * 14):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testRepeatEmptyDataset(self):
-    """Test that repeating an empty dataset does not hang."""
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10).skip(10)
-                .repeat(-1).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/shard_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
deleted file mode 100644
index b9f3c79da56ee20ba3cb96392d97352988089f81..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
-
-
-class ShardDatasetOpTest(test_base.DatasetTestBase):
-
-  def testSimpleCase(self):
-    dataset = dataset_ops.Dataset.range(10).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      self.assertEqual(2, sess.run(iterator.get_next()))
-      self.assertEqual(7, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testNestedData(self):
-    dataset_a = dataset_ops.Dataset.range(10)
-    dataset_b = dataset_ops.Dataset.range(10, 0, -1)
-    dataset = dataset_ops.Dataset.zip((dataset_a, dataset_b)).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      self.assertEqual((2, 8), sess.run(iterator.get_next()))
-      self.assertEqual((7, 3), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testOffsetZero(self):
-    dataset = dataset_ops.Dataset.range(10).shard(5, 0)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(iterator.get_next()))
-      self.assertEqual(5, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testOffsetGreaterNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, 7)
-
-  def testNegativeOffset(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, -3)
-
-  def testNegativeNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(-3, 1)
-
-  def testZeroNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(0, 1)
-
-  def testIteratorEndsBeforeFirstElem(self):
-    dataset = dataset_ops.Dataset.range(1).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testLargerWorkerPool(self):
-    dataset = dataset_ops.Dataset.range(10).shard(7, 5)
-    iterator = dataset.make_one_shot_iterator()
-    with self.cached_session() as sess:
-      self.assertEqual(5, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testIndexEqualsNumShards(self):
-    dataset = dataset_ops.Dataset.range(10).shard(5, 4)
-    iterator = dataset.make_one_shot_iterator()
-    with self.cached_session() as sess:
-      self.assertEqual(4, sess.run(iterator.get_next()))
-      self.assertEqual(9, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testIndexEqualsNumShards2(self):
-    dataset = dataset_ops.Dataset.range(10).shard(4, 3)
-    iterator = dataset.make_one_shot_iterator()
-    with self.cached_session() as sess:
-      self.assertEqual(3, sess.run(iterator.get_next()))
-      self.assertEqual(7, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/shard_test.py b/tensorflow/python/data/kernel_tests/shard_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..928550676d5b05c2e5a459af355acebe2f1f1cc4
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/shard_test.py
@@ -0,0 +1,76 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.shard()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ShardTest(test_base.DatasetTestBase):
+
+  def testSimpleCase(self):
+    dataset = dataset_ops.Dataset.range(10).shard(5, 2)
+    self.assertDatasetProduces(dataset, expected_output=[2, 7])
+
+  def testNestedData(self):
+    dataset_a = dataset_ops.Dataset.range(10)
+    dataset_b = dataset_ops.Dataset.range(10, 0, -1)
+    dataset = dataset_ops.Dataset.zip((dataset_a, dataset_b)).shard(5, 2)
+    self.assertDatasetProduces(dataset, expected_output=[(2, 8), (7, 3)])
+
+  def testOffsetZero(self):
+    dataset = dataset_ops.Dataset.range(10).shard(5, 0)
+    self.assertDatasetProduces(dataset, expected_output=[0, 5])
+
+  def testOffsetGreaterNumShards(self):
+    with self.assertRaises(ValueError):
+      dataset_ops.Dataset.range(10).shard(5, 7)
+
+  def testNegativeOffset(self):
+    with self.assertRaises(ValueError):
+      dataset_ops.Dataset.range(10).shard(5, -3)
+
+  def testNegativeNumShards(self):
+    with self.assertRaises(ValueError):
+      dataset_ops.Dataset.range(10).shard(-3, 1)
+
+  def testZeroNumShards(self):
+    with self.assertRaises(ValueError):
+      dataset_ops.Dataset.range(10).shard(0, 1)
+
+  def testIteratorEndsBeforeFirstElem(self):
+    dataset = dataset_ops.Dataset.range(1).shard(5, 2)
+    self.assertDatasetProduces(dataset, expected_output=[])
+
+  def testLargerWorkerPool(self):
+    dataset = dataset_ops.Dataset.range(10).shard(7, 5)
+    self.assertDatasetProduces(dataset, expected_output=[5])
+
+  def testIndexEqualsNumShards(self):
+    dataset = dataset_ops.Dataset.range(10).shard(5, 4)
+    self.assertDatasetProduces(dataset, expected_output=[4, 9])
+
+  def testIndexEqualsNumShards2(self):
+    dataset = dataset_ops.Dataset.range(10).shard(4, 3)
+    self.assertDatasetProduces(dataset, expected_output=[3, 7])
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
deleted file mode 100644
index 347af18576aed28f534b70e85a390090250fbac6..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-class ShuffleDatasetTest(test_base.DatasetTestBase):
-
-  def testShuffleDataset(self):
-    components = (
-        np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-        np.array([9.0, 10.0, 11.0, 12.0])
-    )
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    shuffle_dataset = repeat_dataset.shuffle(buffer_size_placeholder,
-                                             seed_placeholder)
-
-    self.assertEqual(tuple([c.shape[1:] for c in components]),
-                     shuffle_dataset.output_shapes)
-
-    # Create initialization ops for iterators without and with
-    # shuffling, respectively.
-    iterator = iterator_ops.Iterator.from_structure(
-        shuffle_dataset.output_types, shuffle_dataset.output_shapes)
-    init_fifo_op = iterator.make_initializer(repeat_dataset)
-    init_shuffle_op = iterator.make_initializer(shuffle_dataset)
-
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # First run without shuffling to collect the "ground truth".
-      sess.run(init_fifo_op)
-      unshuffled_elements = []
-      for _ in range(20):
-        unshuffled_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Assert that the shuffled dataset has the same elements as the
-      # "ground truth".
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 37})
-      shuffled_elements = []
-      for _ in range(20):
-        shuffled_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(
-          sorted(unshuffled_elements), sorted(shuffled_elements))
-
-      # Assert that shuffling twice with the same seeds gives the same sequence.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 37})
-      reshuffled_elements_same_seed = []
-      for _ in range(20):
-        reshuffled_elements_same_seed.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(shuffled_elements, reshuffled_elements_same_seed)
-
-      # Assert that shuffling twice with a different seed gives a different
-      # permutation of the same elements.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 1037})
-      reshuffled_elements_different_seed = []
-      for _ in range(20):
-        reshuffled_elements_different_seed.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertNotEqual(shuffled_elements, reshuffled_elements_different_seed)
-      self.assertAllEqual(
-          sorted(shuffled_elements), sorted(reshuffled_elements_different_seed))
-
-      # Assert that the shuffled dataset has the same elements as the
-      # "ground truth" when the buffer size is smaller than the input
-      # dataset.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 2,
-                     seed_placeholder: 37})
-      reshuffled_elements_small_buffer = []
-      for _ in range(20):
-        reshuffled_elements_small_buffer.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(
-          sorted(unshuffled_elements), sorted(reshuffled_elements_small_buffer))
-
-      # Test the case of shuffling an empty dataset.
-      sess.run(init_shuffle_op, feed_dict={buffer_size_placeholder: 2,
-                                           seed_placeholder: 37,
-                                           count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSeedZero(self):
-    """Test for same behavior when the seed is a Python or Tensor zero."""
-    iterator = (
-        dataset_ops.Dataset.range(10).shuffle(10, seed=0)
-        .make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    elems = []
-    with self.cached_session() as sess:
-      for _ in range(10):
-        elems.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset_ops.Dataset.range(10).shuffle(10, seed=seed_placeholder)
-        .make_initializable_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={seed_placeholder: 0})
-      for elem in elems:
-        self.assertEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testDefaultArguments(self):
-    components = [0, 1, 2, 3, 4]
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).shuffle(5)
-                .repeat().make_one_shot_iterator())
-
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      counts = collections.defaultdict(lambda: 0)
-      for _ in range(10):
-        for _ in range(5):
-          counts[sess.run(get_next)] += 1
-
-    for i in range(5):
-      self.assertEqual(10, counts[i])
-
-  def testShuffleNoReshuffleEachIteration(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .shuffle(10, reshuffle_each_iteration=False)
-                .batch(10)
-                .repeat(3)
-                .make_one_shot_iterator())
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      initial_permutation = sess.run(next_element)
-      self.assertAllEqual(initial_permutation, sess.run(next_element))
-      self.assertAllEqual(initial_permutation, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testShuffleReshuffleEachIteration(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .shuffle(10, seed=3, reshuffle_each_iteration=True)
-                .batch(10)
-                .repeat(3)
-                .make_one_shot_iterator())
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      initial_permutation = list(sess.run(next_element))
-      for _ in range(2):
-        next_permutation = list(sess.run(next_element))
-        self.assertNotEqual(initial_permutation, next_permutation)
-        self.assertAllEqual(
-            sorted(initial_permutation), sorted(next_permutation))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/shuffle_test.py b/tensorflow/python/data/kernel_tests/shuffle_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..13df870938d1cee7b29e0189b9b1db1731bb4114
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/shuffle_test.py
@@ -0,0 +1,249 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.shuffle()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ShuffleTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def testShuffleDataset(self):
+    components = (
+        np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+        np.array([9.0, 10.0, 11.0, 12.0])
+    )
+
+    def dataset_fn(count=5, buffer_size=None, seed=0):
+      repeat_dataset = (
+          dataset_ops.Dataset.from_tensor_slices(components).repeat(count))
+      if buffer_size:
+        shuffle_dataset = repeat_dataset.shuffle(buffer_size, seed)
+
+        self.assertEqual(
+            tuple([c.shape[1:] for c in components]),
+            shuffle_dataset.output_shapes)
+        return shuffle_dataset
+      else:
+        return repeat_dataset
+
+    # First run without shuffling to collect the "ground truth".
+    get_next = self.getNext(dataset_fn())
+    unshuffled_elements = []
+    for _ in range(20):
+      unshuffled_elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    # Assert that the shuffled dataset has the same elements as the
+    # "ground truth".
+    get_next = self.getNext(dataset_fn(buffer_size=100, seed=37))
+    shuffled_elements = []
+    for _ in range(20):
+      shuffled_elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertAllEqual(sorted(unshuffled_elements), sorted(shuffled_elements))
+
+    # Assert that shuffling twice with the same seeds gives the same sequence.
+    get_next = self.getNext(dataset_fn(buffer_size=100, seed=37))
+    reshuffled_elements_same_seed = []
+    for _ in range(20):
+      reshuffled_elements_same_seed.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertEqual(shuffled_elements, reshuffled_elements_same_seed)
+
+    # Assert that shuffling twice with a different seed gives a different
+    # permutation of the same elements.
+    get_next = self.getNext(dataset_fn(buffer_size=100, seed=137))
+    reshuffled_elements_different_seed = []
+    for _ in range(20):
+      reshuffled_elements_different_seed.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertNotEqual(shuffled_elements, reshuffled_elements_different_seed)
+    self.assertAllEqual(
+        sorted(shuffled_elements), sorted(reshuffled_elements_different_seed))
+
+    # Assert that the shuffled dataset has the same elements as the
+    # "ground truth" when the buffer size is smaller than the input
+    # dataset.
+    get_next = self.getNext(dataset_fn(buffer_size=2, seed=37))
+    reshuffled_elements_small_buffer = []
+    for _ in range(20):
+      reshuffled_elements_small_buffer.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertAllEqual(
+        sorted(unshuffled_elements), sorted(reshuffled_elements_small_buffer))
+
+    # Test the case of shuffling an empty dataset.
+    get_next = self.getNext(dataset_fn(count=0, buffer_size=100, seed=37))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSeedZero(self):
+    """Test for same behavior when the seed is a Python or Tensor zero."""
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.range(10).shuffle(10, seed=0))
+    get_next = iterator.get_next()
+
+    elems = []
+    with self.cached_session() as sess:
+      for _ in range(10):
+        elems.append(sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).shuffle(10, seed=seed_placeholder))
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer, feed_dict={seed_placeholder: 0})
+      for elem in elems:
+        self.assertEqual(elem, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testDefaultArguments(self):
+    components = [0, 1, 2, 3, 4]
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).shuffle(
+        5).repeat()
+    get_next = self.getNext(dataset)
+    counts = collections.defaultdict(lambda: 0)
+    for _ in range(10):
+      for _ in range(5):
+        counts[self.evaluate(get_next())] += 1
+
+    for i in range(5):
+      self.assertEqual(10, counts[i])
+
+  def testShuffleNoReshuffleEachIteration(self):
+    dataset = dataset_ops.Dataset.range(10).shuffle(
+        10, reshuffle_each_iteration=False).batch(10).repeat(3)
+    next_element = self.getNext(dataset)
+
+    initial_permutation = self.evaluate(next_element())
+    self.assertAllEqual(initial_permutation, self.evaluate(next_element()))
+    self.assertAllEqual(initial_permutation, self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+
+  def testShuffleReshuffleEachIteration(self):
+    dataset = dataset_ops.Dataset.range(10).shuffle(
+        10, seed=3, reshuffle_each_iteration=True).batch(10).repeat(3)
+    next_element = self.getNext(dataset)
+
+    initial_permutation = list(self.evaluate(next_element()))
+    for _ in range(2):
+      next_permutation = list(self.evaluate(next_element()))
+      self.assertNotEqual(initial_permutation, next_permutation)
+      self.assertAllEqual(sorted(initial_permutation), sorted(next_permutation))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+
+  @parameterized.named_parameters(
+      ("ReshuffleGraphLevelSeed", True, 38, None),
+      ("ReshuffleOpLevelSeed", True, None, 42),
+      ("ReshuffleGraphAndOpLevelSeed", True, 38, 42),
+      ("NoReshuffleGraphLevelSeed", False, 38, None),
+      ("NoReshuffleOpLevelSeed", False, None, 42),
+      ("NoReshuffleGraphAndOpLevelSeed", False, 38, 42),
+  )
+  def testSkipEagerShuffleSeed(self, reshuffle, graph_level_seed,
+                               op_level_seed):
+    results = []
+    for _ in range(2):
+      with ops.Graph().as_default() as g:
+        random_seed.set_random_seed(graph_level_seed)
+        dataset = dataset_ops.Dataset.range(10).shuffle(
+            10, seed=op_level_seed, reshuffle_each_iteration=reshuffle).repeat(
+                3)
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
+        next_element = iterator.get_next()
+
+        run_results = []
+        with self.session(graph=g) as sess:
+          for _ in range(30):
+            run_results.append(sess.run(next_element))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(next_element)
+        results.append(run_results)
+
+    self.assertAllEqual(results[0], results[1])
+
+  # TODO(b/117581999): fails for eager mode with result[0] equal to result[1],
+  # debug.
+  @parameterized.named_parameters(
+      ("ReshuffleOneShot", True, False),
+      ("ReshuffleInitializable", True, True),
+      ("NoReshuffleOneShot", False, False),
+      ("NoReshuffleInitializable", False, True),
+  )
+  def testSkipEagerMultipleIterators(self, reshuffle, initializable):
+    with ops.Graph().as_default() as g:
+      dataset = dataset_ops.Dataset.range(100).shuffle(
+          10, reshuffle_each_iteration=reshuffle).repeat(3)
+
+      if initializable:
+        iterators = [dataset_ops.make_initializable_iterator(dataset)
+                     for _ in range(2)]
+      else:
+        iterators = [dataset_ops.make_one_shot_iterator(dataset)
+                     for _ in range(2)]
+
+      results = []
+      with self.session(graph=g) as sess:
+        for iterator in iterators:
+          if initializable:
+            sess.run(iterator.initializer)
+          next_element = iterator.get_next()
+          run_results = []
+          for _ in range(300):
+            run_results.append(sess.run(next_element))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(next_element)
+
+          results.append(run_results)
+
+        self.assertNotEqual(results[0], results[1])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/skip_test.py b/tensorflow/python/data/kernel_tests/skip_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c22be576921c6d8e569ecb60c90925d004a0e5de
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/skip_test.py
@@ -0,0 +1,62 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.skip()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SkipTest(test_base.DatasetTestBase):
+
+  def testSkipTensorDataset(self):
+    components = (np.arange(10),)
+
+    def do_test(count):
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).skip(count)
+      self.assertEqual([c.shape[1:] for c in components],
+                       [shape for shape in dataset.output_shapes])
+      start_range = min(count, 10) if count != -1 else 10
+      self.assertDatasetProduces(
+          dataset,
+          [tuple(components[0][i:i + 1]) for i in range(start_range, 10)])
+
+    # Skip fewer than input size, we should skip
+    # the first 4 elements and then read the rest.
+    do_test(4)
+
+    # Skip more than input size: get nothing.
+    do_test(25)
+
+    # Skip exactly input size.
+    do_test(10)
+
+    # Set -1 for 'count': skip the entire dataset.
+    do_test(-1)
+
+    # Skip nothing
+    do_test(0)
+
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/take_test.py b/tensorflow/python/data/kernel_tests/take_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..03a7ece2d8c8ea88d4504a4341ae3bb13ee2c3bf
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/take_test.py
@@ -0,0 +1,55 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.take()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TakeTest(test_base.DatasetTestBase):
+
+  def testTakeTensorDataset(self):
+    components = (np.arange(10),)
+
+    def do_test(count):
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).take(count)
+      self.assertEqual([c.shape[1:] for c in components],
+                       [shape for shape in dataset.output_shapes])
+      num_output = min(count, 10) if count != -1 else 10
+      self.assertDatasetProduces(
+          dataset, [tuple(components[0][i:i + 1]) for i in range(num_output)])
+
+    # Take fewer than input size
+    do_test(4)
+
+    # Take more than input size
+    do_test(25)
+
+    # Take all of input
+    do_test(-1)
+
+    # Take nothing
+    do_test(0)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index b730e10949160658d841b1a1c7b2b480f3e6e5fc..85f6c9de231a9054a2d7a6f434502dbecce1d601 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -19,10 +19,13 @@ from __future__ import print_function
 
 import re
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -35,34 +38,112 @@ class DatasetTestBase(test.TestCase):
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-  def getNext(self, dataset):
+  def getNext(self, dataset, requires_initialization=False):
     """Returns a callable that returns the next element of the dataset.
 
     Example use:
     ```python
     # In both graph and eager modes
     dataset = ...
-    nxt = self.getNext(dataset)
-    result = self.evaluate(nxt())
+    get_next = self.getNext(dataset)
+    result = self.evaluate(get_next())
     ```
 
     Args:
-      dataset: A dataset whose next element is returned
-
+      dataset: A dataset whose elements will be returned.
+      requires_initialization: Indicates that when the test is executed in graph
+        mode, it should use an initializable iterator to iterate through the
+        dataset (e.g. when it contains stateful nodes). Defaults to False.
     Returns:
-      A callable that returns the next element of `dataset`
+      A callable that returns the next element of `dataset`.
     """
-    it = dataset.make_one_shot_iterator()
     if context.executing_eagerly():
-      return it.get_next
+      iterator = dataset.__iter__()
+      return iterator._next_internal  # pylint: disable=protected-access
     else:
-      nxt = it.get_next()
-      return lambda: nxt
+      if requires_initialization:
+        iterator = dataset_ops.make_initializable_iterator(dataset)
+        self.evaluate(iterator.initializer)
+      else:
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
+      get_next = iterator.get_next()
+      return lambda: get_next
+
+  def _compareOutputToExpected(self, result_values, expected_values,
+                               assert_items_equal):
+    if assert_items_equal:
+      # TODO(shivaniagrawal): add support for nested elements containing sparse
+      # tensors when needed.
+      self.assertItemsEqual(result_values, expected_values)
+      return
+    for i in range(len(result_values)):
+      nest.assert_same_structure(result_values[i], expected_values[i])
+      for result_value, expected_value in zip(
+          nest.flatten(result_values[i]), nest.flatten(expected_values[i])):
+        if sparse_tensor.is_sparse(result_value):
+          self.assertSparseValuesEqual(result_value, expected_value)
+        else:
+          self.assertAllEqual(result_value, expected_value)
+
+  def assertDatasetProduces(self,
+                            dataset,
+                            expected_output=None,
+                            expected_error=None,
+                            requires_initialization=False,
+                            num_test_iterations=1,
+                            assert_items_equal=False):
+    """Asserts that a dataset produces the expected output / error.
+
+    Args:
+      dataset: A dataset to check for the expected output / error.
+      expected_output: A list of elements that the dataset is expected to
+        produce.
+      expected_error: A tuple `(type, predicate)` identifying the expected error
+        `dataset` should raise. The `type` should match the expected exception
+        type, while `predicate` should either be 1) a unary function that inputs
+        the raised exception and returns a boolean indicator of success or 2) a
+        regular expression that is expected to match the error message
+        partially.
+      requires_initialization: Indicates that when the test is executed in graph
+        mode, it should use an initializable iterator to iterate through the
+        dataset (e.g. when it contains stateful nodes). Defaults to False.
+      num_test_iterations: Number of times `dataset` will be iterated. Defaults
+        to 2.
+      assert_items_equal: Tests expected_output has (only) the same elements
+        regardless of order.
+    """
+    self.assertTrue(
+        expected_error is not None or expected_output is not None,
+        "Exactly one of expected_output or expected error should be provided.")
+    if expected_error:
+      self.assertTrue(
+          expected_output is None,
+          "Exactly one of expected_output or expected error should be provided."
+      )
+      with self.assertRaisesWithPredicateMatch(expected_error[0],
+                                               expected_error[1]):
+        get_next = self.getNext(
+            dataset, requires_initialization=requires_initialization)
+        self.evaluate(get_next())
+      return
+    self.assertGreater(num_test_iterations, 0)
+    for _ in range(num_test_iterations):
+      get_next = self.getNext(
+          dataset, requires_initialization=requires_initialization)
+      result = []
+      for _ in range(len(expected_output)):
+        result.append(self.evaluate(get_next()))
+      self._compareOutputToExpected(result, expected_output, assert_items_equal)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
   def assertDatasetsEqual(self, dataset1, dataset2):
     """Checks that datasets are equal. Supports both graph and eager mode."""
     self.assertEqual(dataset1.output_types, dataset2.output_types)
     self.assertEqual(dataset1.output_classes, dataset2.output_classes)
+    flattened_types = nest.flatten(dataset1.output_types)
 
     next1 = self.getNext(dataset1)
     next2 = self.getNext(dataset2)
@@ -79,12 +160,12 @@ class DatasetTestBase(test.TestCase):
       op2 = nest.flatten(op2)
       assert len(op1) == len(op2)
       for i in range(len(op1)):
-        if isinstance(
-            op1[i],
-            (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
+        if sparse_tensor.is_sparse(op1[i]):
           self.assertSparseValuesEqual(op1[i], op2[i])
-        else:
+        elif flattened_types[i] == dtypes.string:
           self.assertAllEqual(op1[i], op2[i])
+        else:
+          self.assertAllClose(op1[i], op2[i])
 
   def assertDatasetsRaiseSameError(self,
                                    dataset1,
@@ -97,7 +178,7 @@ class DatasetTestBase(test.TestCase):
     try:
       self.evaluate(next1())
       raise ValueError(
-          'Expected dataset to raise an error of type %s, but it did not.' %
+          "Expected dataset to raise an error of type %s, but it did not." %
           repr(exception_class))
     except exception_class as e:
       expected_message = e.message
@@ -107,3 +188,29 @@ class DatasetTestBase(test.TestCase):
       with self.assertRaisesRegexp(exception_class,
                                    re.escape(expected_message)):
         self.evaluate(next2())
+
+  def structuredDataset(self, structure, shape=None, dtype=dtypes.int64):
+    """Returns a singleton dataset with the given structure."""
+    if shape is None:
+      shape = []
+    if structure is None:
+      return dataset_ops.Dataset.from_tensors(
+          array_ops.zeros(shape, dtype=dtype))
+    else:
+      return dataset_ops.Dataset.zip(
+          tuple([
+              self.structuredDataset(substructure, shape, dtype)
+              for substructure in structure
+          ]))
+
+  def structuredElement(self, structure, shape=None, dtype=dtypes.int64):
+    """Returns an element with the given structure."""
+    if shape is None:
+      shape = []
+    if structure is None:
+      return array_ops.zeros(shape, dtype=dtype)
+    else:
+      return tuple([
+          self.structuredElement(substructure, shape, dtype)
+          for substructure in structure
+      ])
diff --git a/tensorflow/python/data/kernel_tests/text_line_dataset_test.py b/tensorflow/python/data/kernel_tests/text_line_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db09a98084fb5430a4430da35d8018da3827dae
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/text_line_dataset_test.py
@@ -0,0 +1,165 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.TextLineDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import zlib
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import readers
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+try:
+  import psutil  # pylint: disable=g-import-not-at-top
+  psutil_import_succeeded = True
+except ImportError:
+  psutil_import_succeeded = False
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TextLineDatasetTest(test_base.DatasetTestBase):
+
+  def _lineText(self, f, l):
+    return compat.as_bytes("%d: %d" % (f, l))
+
+  def _createFiles(self,
+                   num_files,
+                   num_lines,
+                   crlf=False,
+                   compression_type=None):
+    filenames = []
+    for i in range(num_files):
+      fn = os.path.join(self.get_temp_dir(), "text_line.%d.txt" % i)
+      filenames.append(fn)
+      contents = []
+      for j in range(num_lines):
+        contents.append(self._lineText(i, j))
+        # Always include a newline after the record unless it is
+        # at the end of the file, in which case we include it
+        if j + 1 != num_lines or i == 0:
+          contents.append(b"\r\n" if crlf else b"\n")
+      contents = b"".join(contents)
+
+      if not compression_type:
+        with open(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "GZIP":
+        with gzip.GzipFile(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "ZLIB":
+        contents = zlib.compress(contents)
+        with open(fn, "wb") as f:
+          f.write(contents)
+      else:
+        raise ValueError("Unsupported compression_type", compression_type)
+
+    return filenames
+
+  def _testTextLineDataset(self, compression_type=None):
+    test_filenames = self._createFiles(
+        2, 5, crlf=True, compression_type=compression_type)
+
+    def dataset_fn(filenames, num_epochs, batch_size=None):
+      repeat_dataset = readers.TextLineDataset(
+          filenames, compression_type=compression_type).repeat(num_epochs)
+      if batch_size:
+        return repeat_dataset.batch(batch_size)
+      return repeat_dataset
+
+    # Basic test: read from file 0.
+    expected_output = [self._lineText(0, i) for i in range(5)]
+    self.assertDatasetProduces(
+        dataset_fn([test_filenames[0]], 1), expected_output=expected_output)
+
+    # Basic test: read from file 1.
+    self.assertDatasetProduces(
+        dataset_fn([test_filenames[1]], 1),
+        expected_output=[self._lineText(1, i) for i in range(5)])
+
+    # Basic test: read from both files.
+    expected_output = [self._lineText(0, i) for i in range(5)]
+    expected_output.extend([self._lineText(1, i) for i in range(5)])
+    self.assertDatasetProduces(
+        dataset_fn(test_filenames, 1), expected_output=expected_output)
+
+    # Test repeated iteration through both files.
+    expected_output = [self._lineText(0, i) for i in range(5)]
+    expected_output.extend([self._lineText(1, i) for i in range(5)])
+    self.assertDatasetProduces(
+        dataset_fn(test_filenames, 10), expected_output=expected_output * 10)
+
+    # Test batched and repeated iteration through both files.
+    self.assertDatasetProduces(
+        dataset_fn(test_filenames, 10, 5),
+        expected_output=[[self._lineText(0, i) for i in range(5)],
+                         [self._lineText(1, i) for i in range(5)]] * 10)
+
+  def testTextLineDatasetNoCompression(self):
+    self._testTextLineDataset()
+
+  def testTextLineDatasetGzipCompression(self):
+    self._testTextLineDataset(compression_type="GZIP")
+
+  def testTextLineDatasetZlibCompression(self):
+    self._testTextLineDataset(compression_type="ZLIB")
+
+  def testTextLineDatasetBuffering(self):
+    test_filenames = self._createFiles(2, 5, crlf=True)
+
+    repeat_dataset = readers.TextLineDataset(test_filenames, buffer_size=10)
+    expected_output = []
+    for j in range(2):
+      expected_output.extend([self._lineText(j, i) for i in range(5)])
+    self.assertDatasetProduces(repeat_dataset, expected_output=expected_output)
+
+  def testIteratorResourceCleanup(self):
+    filename = os.path.join(self.get_temp_dir(), "text.txt")
+    with open(filename, "wt") as f:
+      for i in range(3):
+        f.write("%d\n" % (i,))
+    with context.eager_mode():
+      first_iterator = iter(readers.TextLineDataset(filename))
+      self.assertEqual(b"0", next(first_iterator).numpy())
+      second_iterator = iter(readers.TextLineDataset(filename))
+      self.assertEqual(b"0", next(second_iterator).numpy())
+      # Eager kernel caching is based on op attributes, which includes the
+      # Dataset's output shape. Create a different kernel to test that they
+      # don't create resources with the same names.
+      different_kernel_iterator = iter(
+          readers.TextLineDataset(filename).repeat().batch(16))
+      self.assertEqual([16], next(different_kernel_iterator).shape)
+      # Remove our references to the Python Iterator objects, which (assuming no
+      # reference cycles) is enough to trigger DestroyResourceOp and close the
+      # partially-read files.
+      del first_iterator
+      del second_iterator
+      del different_kernel_iterator
+      if not psutil_import_succeeded:
+        self.skipTest(
+            "psutil is required to check that we've closed our files.")
+      open_files = psutil.Process().open_files()
+      self.assertNotIn(filename, [open_file.path for open_file in open_files])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/tf_record_dataset_test.py b/tensorflow/python/data/kernel_tests/tf_record_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..13a70aa88d0b427cfc19717bc1202a032b564938
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/tf_record_dataset_test.py
@@ -0,0 +1,170 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.TFRecordDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import zlib
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import test_util
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TFRecordDatasetTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    super(TFRecordDatasetTest, self).setUp()
+    self._num_files = 2
+    self._num_records = 7
+
+    self.test_filenames = self._createFiles()
+
+  def dataset_fn(self,
+                 filenames,
+                 compression_type="",
+                 num_epochs=1,
+                 batch_size=None):
+
+    repeat_dataset = readers.TFRecordDataset(
+        filenames, compression_type).repeat(num_epochs)
+    if batch_size:
+      return repeat_dataset.batch(batch_size)
+    return repeat_dataset
+
+  def _record(self, f, r):
+    return compat.as_bytes("Record %d of file %d" % (r, f))
+
+  def _createFiles(self):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
+      filenames.append(fn)
+      writer = python_io.TFRecordWriter(fn)
+      for j in range(self._num_records):
+        writer.write(self._record(i, j))
+      writer.close()
+    return filenames
+
+  def testReadOneEpoch(self):
+    # Basic test: read from file 0.
+    dataset = self.dataset_fn(self.test_filenames[0])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[self._record(0, i) for i in range(self._num_records)])
+
+    # Basic test: read from file 1.
+    dataset = self.dataset_fn(self.test_filenames[1])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[self._record(1, i) for i in range(self._num_records)])
+
+    # Basic test: read from both files.
+    dataset = self.dataset_fn(self.test_filenames)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadTenEpochs(self):
+    dataset = self.dataset_fn(self.test_filenames, num_epochs=10)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output * 10)
+
+  def testReadTenEpochsOfBatches(self):
+    dataset = self.dataset_fn(
+        self.test_filenames, num_epochs=10, batch_size=self._num_records)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.append(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output * 10)
+
+  def testReadZlibFiles(self):
+    zlib_files = []
+    for i, fn in enumerate(self.test_filenames):
+      with open(fn, "rb") as f:
+        cdata = zlib.compress(f.read())
+
+        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
+        with open(zfn, "wb") as f:
+          f.write(cdata)
+        zlib_files.append(zfn)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    dataset = self.dataset_fn(zlib_files, compression_type="ZLIB")
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadGzipFiles(self):
+    gzip_files = []
+    for i, fn in enumerate(self.test_filenames):
+      with open(fn, "rb") as f:
+        gzfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
+        with gzip.GzipFile(gzfn, "wb") as gzf:
+          gzf.write(f.read())
+        gzip_files.append(gzfn)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    dataset = self.dataset_fn(gzip_files, compression_type="GZIP")
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadWithBuffer(self):
+    one_mebibyte = 2**20
+    dataset = readers.TFRecordDataset(
+        self.test_filenames, buffer_size=one_mebibyte)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadFromDatasetOfFiles(self):
+    files = dataset_ops.Dataset.from_tensor_slices(self.test_filenames)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    dataset = readers.TFRecordDataset(files)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadTenEpochsFromDatasetOfFilesInParallel(self):
+    files = dataset_ops.Dataset.from_tensor_slices(
+        self.test_filenames).repeat(10)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    dataset = readers.TFRecordDataset(files, num_parallel_reads=4)
+    self.assertDatasetProduces(
+        dataset, expected_output=expected_output * 10, assert_items_equal=True)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/window_dataset_op_test.py b/tensorflow/python/data/kernel_tests/window_dataset_op_test.py
deleted file mode 100644
index 9d067810944c23a19418a4625dae2997d122d119..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/window_dataset_op_test.py
+++ /dev/null
@@ -1,291 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      ("1", 20, 14, 7, 1),
-      ("2", 20, 17, 9, 1),
-      ("3", 20, 14, 14, 1),
-      ("4", 20, 10, 14, 1),
-      ("5", 20, 14, 19, 1),
-      ("6", 20, 4, 1, 2),
-      ("7", 20, 2, 1, 6),
-      ("8", 20, 4, 7, 2),
-      ("9", 20, 2, 7, 6),
-      ("10", 1, 10, 4, 1),
-      ("11", 0, 10, 4, 1),
-      ("12", 20, 14, 7, 1, False),
-      ("13", 20, 17, 9, 1, False),
-      ("14", 20, 14, 14, 1, False),
-      ("15", 20, 10, 14, 1, False),
-      ("16", 20, 14, 19, 1, False),
-      ("17", 20, 4, 1, 2, False),
-      ("18", 20, 2, 1, 6, False),
-      ("19", 20, 4, 7, 2, False),
-      ("20", 20, 2, 7, 6, False),
-      ("21", 1, 10, 4, 1, False),
-      ("22", 0, 10, 4, 1, False),
-  )
-  def testWindowDataset(self, count, size, shift, stride, drop_remainder=True):
-    """Tests a dataset that slides a window its input elements."""
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    count_t = array_ops.placeholder(dtypes.int64, shape=[])
-    size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    shift_t = array_ops.placeholder(dtypes.int64, shape=[])
-    stride_t = array_ops.placeholder(dtypes.int64, shape=[])
-    drop_remainder_t = array_ops.placeholder(dtypes.bool, shape=[])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    def _flat_map_fn(x, y, z):
-      return dataset_ops.Dataset.zip((x.batch(batch_size=size_t),
-                                      y.batch(batch_size=size_t),
-                                      z.batch(batch_size=size_t)))
-
-    iterator = dataset_ops.Dataset.from_tensor_slices(components).map(
-        _map_fn).repeat(count).window(
-            size=size_t,
-            shift=shift_t,
-            stride=stride_t,
-            drop_remainder=drop_remainder_t).flat_map(
-                _flat_map_fn).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              count_t: count,
-              size_t: size,
-              shift_t: shift,
-              stride_t: stride,
-              drop_remainder_t: drop_remainder
-          })
-      num_full_batches = max(
-          0, (count * 7 - ((size - 1) * stride + 1)) // shift + 1)
-      for i in range(num_full_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(size):
-            self.assertAllEqual(component[(i * shift + j * stride) % 7]**2,
-                                result_component[j])
-      if not drop_remainder:
-        num_partial_batches = (count * 7) // shift + (
-            (count * 7) % shift > 0) - num_full_batches
-        for i in range(num_partial_batches):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            remaining = (count * 7) - ((num_full_batches + i) * shift)
-            num_elements = remaining // stride + ((remaining % stride) > 0)
-            for j in range(num_elements):
-              self.assertAllEqual(
-                  component[((num_full_batches + i) * shift + j * stride) % 7]
-                  **2, result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @parameterized.named_parameters(
-      ("1", 14, 0, 3, 1),
-      ("2", 14, 3, 0, 1),
-      ("3", 14, 3, 3, 0),
-  )
-  def testWindowDatasetInvalid(self, count, size, shift, stride):
-    count_t = array_ops.placeholder(dtypes.int64, shape=[])
-    size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    shift_t = array_ops.placeholder(dtypes.int64, shape=[])
-    stride_t = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = dataset_ops.Dataset.range(10).map(lambda x: x).repeat(
-        count_t).window(
-            size=size_t, shift=shift_t,
-            stride=stride_t).flat_map(lambda x: x.batch(batch_size=size_t)
-                                     ).make_initializable_iterator()
-    init_op = iterator.initializer
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(
-            init_op,
-            feed_dict={
-                count_t: count,
-                size_t: size,
-                shift_t: shift,
-                stride_t: stride
-            })
-
-  def testWindowSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).window(
-        size=5, shift=3, drop_remainder=True).flat_map(
-            lambda x: x.batch(batch_size=5)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      num_batches = (10 - 5) // 3 + 1
-      for i in range(num_batches):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 3, i * 3 + 1, i * 3 + 2, i * 3 + 3, i * 3 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testWindowSparseWithDifferentDenseShapes(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=array_ops.expand_dims(
-              math_ops.range(i, dtype=dtypes.int64), 1),
-          values=array_ops.fill([math_ops.to_int32(i)], i),
-          dense_shape=[i])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).window(
-        size=5, shift=3, drop_remainder=True).flat_map(
-            lambda x: x.batch(batch_size=5)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      num_batches = (10 - 5) // 3 + 1
-      for i in range(num_batches):
-        actual = sess.run(get_next)
-        expected_indices = []
-        expected_values = []
-        for j in range(5):
-          for k in range(i * 3 + j):
-            expected_indices.append([j, k])
-            expected_values.append(i * 3 + j)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=expected_indices,
-            values=expected_values,
-            dense_shape=[5, i * 3 + 5 - 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedWindowSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).window(
-        size=4, shift=2,
-        drop_remainder=True).flat_map(lambda x: x.batch(batch_size=4)).window(
-            size=3, shift=1, drop_remainder=True).flat_map(
-                lambda x: x.batch(batch_size=3)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      # Slide: 1st batch.
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
-                   [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
-                   [2, 2, 0], [2, 3, 0]],
-          values=[0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7],
-          dense_shape=[3, 4, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      # Slide: 2nd batch.
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
-                   [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
-                   [2, 2, 0], [2, 3, 0]],
-          values=[2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9],
-          dense_shape=[3, 4, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testWindowShapeError(self):
-
-    def generator():
-      yield [1.0, 2.0, 3.0]
-      yield [4.0, 5.0, 6.0]
-      yield [7.0, 8.0, 9.0, 10.0]
-
-    iterator = dataset_ops.Dataset.from_generator(
-        generator, dtypes.float32, output_shapes=[None]).window(
-            size=3, shift=1).flat_map(
-                lambda x: x.batch(batch_size=3)).make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r"Cannot batch tensors with different shapes in component 0. "
-          r"First element had shape \[3\] and element 2 had shape \[4\]."):
-        sess.run(next_element)
-
-  def testWindowIgnoreErrors(self):
-    input_values = np.float32([1., np.nan, 2., np.nan, 3.])
-    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
-        lambda x: array_ops.check_numerics(x, "message")).window(
-            size=2, shift=2, stride=2,
-            drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2))
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual(np.float32([1., 2.]), sess.run(get_next))
-      self.assertAllEqual(np.float32([2., 3.]), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/window_test.py b/tensorflow/python/data/kernel_tests/window_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d083142ab6a1f300b9e51b50d0113474053af05e
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/window_test.py
@@ -0,0 +1,231 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.window()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class WindowTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ("1", 20, 14, 7, 1),
+      ("2", 20, 17, 9, 1),
+      ("3", 20, 14, 14, 1),
+      ("4", 20, 10, 14, 1),
+      ("5", 20, 14, 19, 1),
+      ("6", 20, 4, 1, 2),
+      ("7", 20, 2, 1, 6),
+      ("8", 20, 4, 7, 2),
+      ("9", 20, 2, 7, 6),
+      ("10", 1, 10, 4, 1),
+      ("11", 0, 10, 4, 1),
+      ("12", 20, 14, 7, 1, False),
+      ("13", 20, 17, 9, 1, False),
+      ("14", 20, 14, 14, 1, False),
+      ("15", 20, 10, 14, 1, False),
+      ("16", 20, 14, 19, 1, False),
+      ("17", 20, 4, 1, 2, False),
+      ("18", 20, 2, 1, 6, False),
+      ("19", 20, 4, 7, 2, False),
+      ("20", 20, 2, 7, 6, False),
+      ("21", 1, 10, 4, 1, False),
+      ("22", 0, 10, 4, 1, False),
+  )
+  def testWindowDataset(self, count, size, shift, stride, drop_remainder=True):
+    """Tests a dataset that slides a window its input elements."""
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    def _flat_map_fn(x, y, z):
+      return dataset_ops.Dataset.zip((x.batch(batch_size=size),
+                                      y.batch(batch_size=size),
+                                      z.batch(batch_size=size)))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn).repeat(count).window(
+            size=size,
+            shift=shift,
+            stride=stride,
+            drop_remainder=drop_remainder).flat_map(_flat_map_fn)
+    get_next = self.getNext(dataset)
+
+    self.assertEqual(
+        [[None] + list(c.shape[1:]) for c in components],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+
+    num_full_batches = max(0,
+                           (count * 7 - ((size - 1) * stride + 1)) // shift + 1)
+    for i in range(num_full_batches):
+      result = self.evaluate(get_next())
+      for component, result_component in zip(components, result):
+        for j in range(size):
+          self.assertAllEqual(component[(i * shift + j * stride) % 7]**2,
+                              result_component[j])
+    if not drop_remainder:
+      num_partial_batches = (count * 7) // shift + (
+          (count * 7) % shift > 0) - num_full_batches
+      for i in range(num_partial_batches):
+        result = self.evaluate(get_next())
+        for component, result_component in zip(components, result):
+          remaining = (count * 7) - ((num_full_batches + i) * shift)
+          num_elements = remaining // stride + ((remaining % stride) > 0)
+          for j in range(num_elements):
+            self.assertAllEqual(
+                component[((num_full_batches + i) * shift + j * stride) % 7]**2,
+                result_component[j])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @parameterized.named_parameters(
+      ("1", 14, 0, 3, 1),
+      ("2", 14, 3, 0, 1),
+      ("3", 14, 3, 3, 0),
+  )
+  def testWindowDatasetInvalid(self, count, size, shift, stride):
+    dataset = dataset_ops.Dataset.range(10).map(lambda x: x).repeat(
+        count).window(
+            size=size, shift=shift,
+            stride=stride).flat_map(lambda x: x.batch(batch_size=size))
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, ""))
+
+  def testWindowSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).window(
+        size=5, shift=3,
+        drop_remainder=True).flat_map(lambda x: x.batch(batch_size=5))
+
+    num_batches = (10 - 5) // 3 + 1
+    expected_output = [
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 3, i * 3 + 1, i * 3 + 2, i * 3 + 3, i * 3 + 4],
+            dense_shape=[5, 1]) for i in range(num_batches)
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testWindowSparseWithDifferentDenseShapes(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=array_ops.expand_dims(
+              math_ops.range(i, dtype=dtypes.int64), 1),
+          values=array_ops.fill([math_ops.to_int32(i)], i),
+          dense_shape=[i])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).window(
+        size=5, shift=3,
+        drop_remainder=True).flat_map(lambda x: x.batch(batch_size=5))
+
+    expected_output = []
+    num_batches = (10 - 5) // 3 + 1
+    for i in range(num_batches):
+      expected_indices = []
+      expected_values = []
+      for j in range(5):
+        for k in range(i * 3 + j):
+          expected_indices.append([j, k])
+          expected_values.append(i * 3 + j)
+      expected_output.append(
+          sparse_tensor.SparseTensorValue(
+              indices=expected_indices,
+              values=expected_values,
+              dense_shape=[5, i * 3 + 5 - 1]))
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testNestedWindowSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).window(
+        size=4, shift=2,
+        drop_remainder=True).flat_map(lambda x: x.batch(batch_size=4)).window(
+            size=3, shift=1,
+            drop_remainder=True).flat_map(lambda x: x.batch(batch_size=3))
+
+    expected_output = [
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
+                     [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
+                     [2, 2, 0], [2, 3, 0]],
+            values=[0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7],
+            dense_shape=[3, 4, 1]),
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
+                     [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
+                     [2, 2, 0], [2, 3, 0]],
+            values=[2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9],
+            dense_shape=[3, 4, 1])
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testWindowShapeError(self):
+
+    def generator():
+      yield [1.0, 2.0, 3.0]
+      yield [4.0, 5.0, 6.0]
+      yield [7.0, 8.0, 9.0, 10.0]
+
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, dtypes.float32, output_shapes=[None]).window(
+            size=3, shift=1).flat_map(lambda x: x.batch(batch_size=3))
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            r"Cannot batch tensors with different shapes in component 0. "
+            r"First element had shape \[3\] and element 2 had shape \[4\]."))
+
+  def testWindowIgnoreErrors(self):
+    input_values = np.float32([1., np.nan, 2., np.nan, 3.])
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
+        lambda x: array_ops.check_numerics(x, "message")).window(
+            size=2, shift=2, stride=2,
+            drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2))
+    self.assertDatasetProduces(
+        dataset, expected_output=[np.float32([1., 2.]),
+                                  np.float32([2., 3.])])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py b/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
deleted file mode 100644
index 9d76387a343de6e8652dd595c08bf72680a8197e..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-class ZipDatasetTest(test_base.DatasetTestBase):
-
-  def testZipDataset(self):
-    component_placeholders = [
-        array_ops.placeholder(dtypes.int64),
-        array_ops.placeholder(dtypes.int64),
-        array_ops.placeholder(dtypes.float64)
-    ]
-
-    datasets = tuple([
-        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
-        for component_placeholder in component_placeholders
-    ])
-    zipped = dataset_ops.Dataset.zip(datasets)
-
-    iterator = zipped.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      equal_length_components = [
-          np.tile(np.array([[1], [2], [3], [4]]), 20),
-          np.tile(np.array([[12], [13], [14], [15]]), 22),
-          np.array([37.0, 38.0, 39.0, 40.0])
-      ]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, equal_length_components)})
-      for i in range(4):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            equal_length_components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      variable_length_components = [[1, 2, 3, 4], [1, 2, 3, 4, 5], [1.0, 2.0]]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, variable_length_components)})
-      for i in range(2):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            variable_length_components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedZipDataset(self):
-    component_placeholders = [
-        array_ops.placeholder(dtypes.int64, shape=[4, 20]),
-        array_ops.placeholder(dtypes.int64, shape=[4, 22]),
-        array_ops.placeholder(dtypes.float64, shape=[4])
-    ]
-
-    datasets = [
-        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
-        for component_placeholder in component_placeholders
-    ]
-    zipped = dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
-
-    iterator = zipped.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([20], get_next[0].shape)
-    self.assertEqual([22], get_next[1][0].shape)
-    self.assertEqual([], get_next[1][1].shape)
-
-    with self.cached_session() as sess:
-      equal_length_components = [
-          np.tile(np.array([[1], [2], [3], [4]]), 20),
-          np.tile(np.array([[12], [13], [14], [15]]), 22),
-          np.array([37.0, 38.0, 39.0, 40.0])
-      ]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, equal_length_components)})
-      for i in range(4):
-        result1, (result2, result3) = sess.run(get_next)
-        self.assertAllEqual(equal_length_components[0][i], result1)
-        self.assertAllEqual(equal_length_components[1][i], result2)
-        self.assertAllEqual(equal_length_components[2][i], result3)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/zip_test.py b/tensorflow/python/data/kernel_tests/zip_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..477c9fa7da14276f5ad0b503402e24711b139832
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/zip_test.py
@@ -0,0 +1,101 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.zip()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ZipTest(test_base.DatasetTestBase):
+
+  def testZipDataset(self):
+
+    def dataset_fn(components):
+      datasets = tuple([
+          dataset_ops.Dataset.from_tensor_slices(component)
+          for component in components
+      ])
+      return dataset_ops.Dataset.zip(datasets)
+
+    equal_length_components = [
+        np.tile(np.array([[1], [2], [3], [4]]), 20),
+        np.tile(np.array([[12], [13], [14], [15]]), 22),
+        np.array([37.0, 38.0, 39.0, 40.0])
+    ]
+
+    get_next = self.getNext(dataset_fn(equal_length_components))
+    for i in range(4):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(equal_length_components, results):
+        self.assertAllEqual(component[i], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    variable_length_components = [[1, 2, 3, 4], [1, 2, 3, 4, 5], [1.0, 2.0]]
+    get_next = self.getNext(dataset_fn(variable_length_components))
+    for i in range(2):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(variable_length_components,
+                                             results):
+        self.assertAllEqual(component[i], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testNestedZipDataset(self):
+
+    equal_length_components = [
+        np.tile(np.array([[1], [2], [3], [4]]), 20),
+        np.tile(np.array([[12], [13], [14], [15]]), 22),
+        np.array([37.0, 38.0, 39.0, 40.0])
+    ]
+    datasets = [
+        dataset_ops.Dataset.from_tensor_slices(component)
+        for component in equal_length_components
+    ]
+    dataset = dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
+
+    self.assertEqual(
+        dataset.output_shapes,
+        (tensor_shape.TensorShape([20]),
+         (tensor_shape.TensorShape([22]), tensor_shape.TensorShape([]))))
+
+    get_next = self.getNext(dataset)
+    for i in range(4):
+      result1, (result2, result3) = self.evaluate(get_next())
+      self.assertAllEqual(equal_length_components[0][i], result1)
+      self.assertAllEqual(equal_length_components[1][i], result2)
+      self.assertAllEqual(equal_length_components[2][i], result3)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 76bf2470b1c6ae4e9b72bc9bf573f1fe56a84d2d..fbff7df9c379e04a2b12a14ed5f5534339cde543 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -14,6 +14,7 @@ py_library(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
         "//tensorflow/python:math_ops",
@@ -25,7 +26,12 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:filter_for_shard_ops",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/experimental/ops:stats_options",
+        "//tensorflow/python/data/experimental/ops:threading_options",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:options",
         "//tensorflow/python/data/util:random_seed",
         "//tensorflow/python/data/util:sparse",
         "//tensorflow/python/data/util:structure",
@@ -46,6 +52,7 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/compat",
         "//tensorflow/python/data/util:convert",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -80,6 +87,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/util:structure",
     ],
 )
@@ -90,13 +98,17 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dataset_ops",
+        ":iterator_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:functional_ops",
+        "//tensorflow/python:tensor_spec",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
 )
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index b7e19055f24795610219f16ddd67540d87ee6b92..bee04aaef2b382ffce179bf7b44a699bd4c7b778 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import functools
 import threading
 import warnings
 
@@ -25,15 +26,22 @@ import numpy as np
 import six
 
 from tensorflow.python.compat import compat
+from tensorflow.python.data.experimental.ops import filter_for_shard_ops
+from tensorflow.python.data.experimental.ops import optimization_options
+from tensorflow.python.data.experimental.ops import stats_options
+from tensorflow.python.data.experimental.ops import threading_options
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import options as options_lib
 from tensorflow.python.data.util import random_seed
 from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure as structure_lib
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed as core_random_seed
 from tensorflow.python.framework import smart_cond
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
@@ -41,26 +49,28 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.Dataset")
-class Dataset(object):
+ops.NotDifferentiable("ReduceDataset")
+
+
+@tf_export("data.Dataset", v1=[])
+@six.add_metaclass(abc.ABCMeta)
+class DatasetV2(object):
   """Represents a potentially large set of elements.
 
   A `Dataset` can be used to represent an input pipeline as a
   collection of elements (nested structures of tensors) and a "logical
   plan" of transformations that act on those elements.
   """
-  __metaclass__ = abc.ABCMeta
-
-  def __init__(self):
-    pass
 
   def _as_serialized_graph(self):
     """Produces serialized graph representation of the dataset.
@@ -86,68 +96,83 @@ class Dataset(object):
 
     raise NotImplementedError("Dataset._inputs")
 
-  def options(self):
-    """Returns the options for this dataset.
+  def _has_captured_ref(self):
+    """Whether this dataset uses a function that captures ref variables.
 
     Returns:
-      A `tf.data.Options` object representing the dataset options.
+      A boolean, which if true indicates that the dataset or one of its inputs
+      uses a function that captures ref variables.
     """
-    for input_dataset in self._inputs():
-      options = input_dataset.options()
-      if options is not None:
-        return options
-    return Options()
+    if context.executing_eagerly():
+      # RefVariables are not supported in eager mode
+      return False
 
-  def make_initializable_iterator(self, shared_name=None):
-    """Creates an `Iterator` for enumerating the elements of this dataset.
+    def is_tensor_or_parent_ref(tensor):
+      if tensor.dtype._is_ref_dtype:  # pylint: disable=protected-access
+        return True
+      return any([is_tensor_or_parent_ref(x) for x in tensor.op.inputs])
 
-    Note: The returned iterator will be in an uninitialized state,
-    and you must run the `iterator.initializer` operation before using it:
+    for fn in self._functions():
+      if any([is_tensor_or_parent_ref(t) for t in fn.function.captured_inputs]):
+        return True
 
-    ```python
-    dataset = ...
-    iterator = dataset.make_initializable_iterator()
-    # ...
-    sess.run(iterator.initializer)
-    ```
+    return any(
+        [input_dataset._has_captured_ref() for input_dataset in self._inputs()])  # pylint: disable=protected-access
 
-    Args:
-      shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
+  def _functions(self):
+    """Returns a list of functions associated with this dataset.
 
     Returns:
-      An `Iterator` over the elements of this dataset.
+      A list of `StructuredFunctionWrapper` objects.
+    """
+    return []
 
-    Raises:
-      RuntimeError: If eager execution is enabled.
+  def options(self):
+    """Returns the options for this dataset and its inputs.
+
+    Returns:
+      A `tf.data.Options` object representing the dataset options.
     """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "dataset.make_initializable_iterator is not supported when eager "
-          "execution is enabled.")
+    options = Options()
+    for input_dataset in self._inputs():
+      input_options = input_dataset.options()
+      if input_options is not None:
+        options = options.merge(input_options)
+    return options
+
+  def _apply_options(self):
+    """Apply options, such as optimization configuration, to the dataset."""
+
     dataset = self
     options = self.options()
+    if options.experimental_threading is not None:
+      t_options = options.experimental_threading
+      if t_options.private_threadpool_size is not None:
+        dataset = _PrivateThreadPoolDataset(dataset,
+                                            t_options.private_threadpool_size)
+      if t_options.max_intra_op_parallelism is not None:
+        dataset = _MaxIntraOpParallelismDataset(
+            dataset, t_options.max_intra_op_parallelism)
     static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
     if static_optimizations:
-      dataset = _OptimizeDataset(dataset, static_optimizations)
-    if options.experimental_autotune:
+      if self._has_captured_ref():
+        warnings.warn(
+            "tf.data static optimizations are not compatible with tf.Variable. "
+            "The following optimizations will be disabled: %s. To enable "
+            "optimizations, use resource variables instead by calling "
+            "`tf.enable_resource_variables()` at the start of the program." %
+            ", ".join(static_optimizations))
+      else:
+        dataset = _OptimizeDataset(dataset, static_optimizations)
+
+    if options.experimental_autotune is not False:
       dataset = _ModelDataset(dataset)
-    if shared_name is None:
-      shared_name = ""
-    if compat.forward_compatible(2018, 8, 3):
-      iterator_resource = gen_dataset_ops.iterator_v2(
-          container="", shared_name=shared_name, **flat_structure(self))
-    else:
-      iterator_resource = gen_dataset_ops.iterator(
-          container="", shared_name=shared_name, **flat_structure(self))
-    with ops.colocate_with(iterator_resource):
-      initializer = gen_dataset_ops.make_iterator(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          iterator_resource)
-    return iterator_ops.Iterator(iterator_resource, initializer,
-                                 dataset.output_types, dataset.output_shapes,
-                                 dataset.output_classes)
+    if options.experimental_stats and options.experimental_stats.aggregator:  # pylint: disable=line-too-long
+      dataset = _SetStatsAggregatorDataset(  # pylint: disable=protected-access
+          dataset, options.experimental_stats.aggregator,
+          options.experimental_stats.prefix,
+          options.experimental_stats.counter_prefix)
+    return dataset
 
   def __iter__(self):
     """Creates an `Iterator` for enumerating the elements of this dataset.
@@ -167,49 +192,17 @@ class Dataset(object):
       raise RuntimeError("dataset.__iter__() is only supported when eager "
                          "execution is enabled.")
 
-  def make_one_shot_iterator(self):
-    """Creates an `Iterator` for enumerating the elements of this dataset.
-
-    Note: The returned iterator will be initialized automatically.
-    A "one-shot" iterator does not currently support re-initialization.
+  @abc.abstractproperty
+  def _element_structure(self):
+    """The structure of an element of this dataset.
 
     Returns:
-      An `Iterator` over the elements of this dataset.
+      A `Structure` object representing the structure of an element of this
+      dataset.
     """
-    if context.executing_eagerly():
-      return iterator_ops.EagerIterator(self)
-    # NOTE(mrry): We capture by value here to ensure that `_make_dataset()` is
-    # a 0-argument function.
-    @function.Defun(capture_by_value=True)
-    def _make_dataset():
-      dataset = self
-      options = self.options()
-      static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
-      if static_optimizations:
-        dataset = _OptimizeDataset(dataset, static_optimizations)
-      if options.experimental_autotune:
-        dataset = _ModelDataset(dataset)
-      return dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-    try:
-      _make_dataset.add_to_graph(ops.get_default_graph())
-    except ValueError as err:
-      if "Cannot capture a stateful node" in str(err):
-        raise ValueError(
-            "Failed to create a one-shot iterator for a dataset. "
-            "`Dataset.make_one_shot_iterator()` does not support datasets that "
-            "capture stateful objects, such as a `Variable` or `LookupTable`. "
-            "In these cases, use `Dataset.make_initializable_iterator()`. "
-            "(Original error: %s)" % err)
-      else:
-        six.reraise(ValueError, err)
-
-    return iterator_ops.Iterator(
-        gen_dataset_ops.one_shot_iterator(
-            dataset_factory=_make_dataset, **flat_structure(self)),
-        None, self.output_types, self.output_shapes, self.output_classes)
+    raise NotImplementedError("Dataset._element_structure")
 
-  @abc.abstractproperty
+  @property
   def output_classes(self):
     """Returns the class of each component of an element of this dataset.
 
@@ -219,9 +212,9 @@ class Dataset(object):
       A nested structure of Python `type` objects corresponding to each
       component of an element of this dataset.
     """
-    raise NotImplementedError("Dataset.output_classes")
+    return self._element_structure._to_legacy_output_classes()  # pylint: disable=protected-access
 
-  @abc.abstractproperty
+  @property
   def output_shapes(self):
     """Returns the shape of each component of an element of this dataset.
 
@@ -229,9 +222,9 @@ class Dataset(object):
       A nested structure of `tf.TensorShape` objects corresponding to each
       component of an element of this dataset.
     """
-    raise NotImplementedError("Dataset.output_shapes")
+    return self._element_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
 
-  @abc.abstractproperty
+  @property
   def output_types(self):
     """Returns the type of each component of an element of this dataset.
 
@@ -239,7 +232,7 @@ class Dataset(object):
       A nested structure of `tf.DType` objects corresponding to each component
       of an element of this dataset.
     """
-    raise NotImplementedError("Dataset.output_types")
+    return self._element_structure._to_legacy_output_types()  # pylint: disable=protected-access
 
   def __repr__(self):
     output_shapes = nest.map_structure(str, self.output_shapes)
@@ -256,9 +249,10 @@ class Dataset(object):
     Note that if `tensors` contains a NumPy array, and eager execution is not
     enabled, the values will be embedded in the graph as one or more
     `tf.constant` operations. For large datasets (> 1 GB), this can waste
-    memory and run into byte limits of graph serialization.  If tensors contains
-    one or more large NumPy arrays, consider the alternative described in
-    [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
+    memory and run into byte limits of graph serialization. If `tensors`
+    contains one or more large NumPy arrays, consider the alternative described
+    in [this
+    guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors.
@@ -275,9 +269,10 @@ class Dataset(object):
     Note that if `tensors` contains a NumPy array, and eager execution is not
     enabled, the values will be embedded in the graph as one or more
     `tf.constant` operations. For large datasets (> 1 GB), this can waste
-    memory and run into byte limits of graph serialization.  If tensors contains
-    one or more large NumPy arrays, consider the alternative described in
-    [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
+    memory and run into byte limits of graph serialization. If `tensors`
+    contains one or more large NumPy arrays, consider the alternative described
+    in [this guide](
+    https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors, each having the same size in the
@@ -288,19 +283,6 @@ class Dataset(object):
     """
     return TensorSliceDataset(tensors)
 
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
-  def from_sparse_tensor_slices(sparse_tensor):
-    """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
-
-    Args:
-      sparse_tensor: A `tf.SparseTensor`.
-
-    Returns:
-      Dataset: A `Dataset` of rank-(N-1) sparse tensors.
-    """
-    return SparseTensorSliceDataset(sparse_tensor)
-
   class _GeneratorState(object):
     """Stores outstanding iterators created from a Python generator.
 
@@ -350,17 +332,19 @@ class Dataset(object):
 
     ```python
     import itertools
+    tf.enable_eager_execution()
 
     def gen():
       for i in itertools.count(1):
         yield (i, [1] * i)
 
-    ds = Dataset.from_generator(
+    ds = tf.data.Dataset.from_generator(
         gen, (tf.int64, tf.int64), (tf.TensorShape([]), tf.TensorShape([None])))
-    value = ds.make_one_shot_iterator().get_next()
 
-    sess.run(value)  # (1, array([1]))
-    sess.run(value)  # (2, array([1, 1]))
+    for value in ds.take(2):
+      print value
+    # (1, array([1]))
+    # (2, array([1, 1]))
     ```
 
     NOTE: The current implementation of `Dataset.from_generator()` uses
@@ -412,7 +396,7 @@ class Dataset(object):
     flattened_types = [dtypes.as_dtype(dt) for dt in nest.flatten(output_types)]
     flattened_shapes = nest.flatten(output_shapes)
 
-    generator_state = Dataset._GeneratorState(generator)
+    generator_state = DatasetV2._GeneratorState(generator)
 
     def get_iterator_id_fn(unused_dummy):
       """Creates a unique `iterator_id` for each pass over the dataset.
@@ -557,7 +541,7 @@ class Dataset(object):
     ```
 
     Args:
-      *args: follow same semantics as python's xrange.
+      *args: follows the same semantics as python's xrange.
         len(args) == 1 -> start = 0, stop = args[0], step = 1
         len(args) == 2 -> start = args[0], stop = args[1], step = 1
         len(args) == 3 -> start = args[0], stop = args[1, stop = args[2]
@@ -651,7 +635,7 @@ class Dataset(object):
 
   @staticmethod
   def list_files(file_pattern, shuffle=None, seed=None):
-    """A dataset of all files matching a pattern.
+    """A dataset of all files matching one or more glob patterns.
 
     NOTE: The default behavior of this method is to return filenames in
     a non-deterministic random shuffled order. Pass a `seed` or `shuffle=False`
@@ -668,12 +652,13 @@ class Dataset(object):
         - /path/to/dir/c.py
 
     Args:
-      file_pattern: A string or scalar string `tf.Tensor`, representing
-        the filename pattern that will be matched.
+      file_pattern: A string, a list of strings, or a `tf.Tensor` of string type
+        (scalar or vector), representing the filename glob (i.e. shell wildcard)
+        pattern(s) that will be matched.
       shuffle: (Optional.) If `True`, the file names will be shuffled randomly.
         Defaults to `True`.
-      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        random seed that will be used to create the distribution. See
+      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the random
+        seed that will be used to create the distribution. See
         `tf.set_random_seed` for behavior.
 
     Returns:
@@ -733,6 +718,11 @@ class Dataset(object):
   def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None):
     """Randomly shuffles the elements of this dataset.
 
+    This dataset fills a buffer with `buffer_size` elements, then randomly
+    samples elements from this buffer, replacing the selected elements with new
+    elements. For perfect shuffling, a buffer size greater than or equal to the
+    full size of the dataset is required.
+
     Args:
       buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
         number of elements from this dataset from which the new
@@ -791,78 +781,6 @@ class Dataset(object):
     """
     return SkipDataset(self, count)
 
-  def shard(self, num_shards, index):
-    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-
-    This dataset operator is very useful when running distributed training, as
-    it allows each worker to read a unique subset.
-
-    When reading a single input file, you can skip elements as follows:
-
-    ```python
-    d = tf.data.TFRecordDataset(FLAGS.input_file)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Important caveats:
-
-    - Be sure to shard before you use any randomizing operator (such as
-      shuffle).
-    - Generally it is best if the shard operator is used early in the dataset
-      pipeline. For example, when reading from a set of TFRecord files, shard
-      before converting the dataset to input samples. This avoids reading every
-      file on every worker. The following is an example of an efficient
-      sharding strategy within a complete pipeline:
-
-    ```python
-    d = Dataset.list_files(FLAGS.pattern)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.interleave(tf.data.TFRecordDataset,
-                     cycle_length=FLAGS.num_readers, block_length=1)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Args:
-      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel.
-      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-
-    Returns:
-      Dataset: A `Dataset`.
-
-    Raises:
-      ValueError: if `num_shards` or `index` are illegal values. Note: error
-        checking is done on a best-effort basis, and aren't guaranteed to be
-        caught upon dataset creation. (e.g. providing in a placeholder tensor
-        bypasses the early checking, and will instead result in an error during
-        a session.run call.)
-    """
-    num_shards = ops.convert_to_tensor(
-        num_shards, name="num_shards", dtype=dtypes.int64)
-    num_shards_static = tensor_util.constant_value(num_shards)
-    index = ops.convert_to_tensor(index, name="index", dtype=dtypes.int64)
-    index_static = tensor_util.constant_value(index)
-
-    if num_shards_static is not None and num_shards_static < 1:
-      raise ValueError("num_shards must be >= 1; got: %s" % num_shards_static)
-    if index_static is not None and index_static < 0:
-      raise ValueError("index must be >= 0; got: %s" % index_static)
-    if (index_static is not None and num_shards_static is not None and
-        index_static >= num_shards_static):
-      raise ValueError("index must be <= num_shards; %s is not < %s" %
-                       (index_static, num_shards_static))
-
-    def filter_fn(elem_index, _):
-      mod_result = math_ops.mod(elem_index, num_shards)
-      return math_ops.equal(mod_result, index)
-
-    return self._enumerate().filter(filter_fn).map(lambda _, elem: elem)
-
   def batch(self, batch_size, drop_remainder=False):
     """Combines consecutive elements of this dataset into batches.
 
@@ -877,7 +795,7 @@ class Dataset(object):
       batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
         consecutive elements of this dataset to combine in a single batch.
       drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
-        whether the last batch should be dropped in the case its has fewer than
+        whether the last batch should be dropped in the case it has fewer than
         `batch_size` elements; the default behavior is not to drop the smaller
         batch.
 
@@ -934,7 +852,7 @@ class Dataset(object):
         respective components.  Defaults are `0` for numeric types and
         the empty string for string types.
       drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
-        whether the last batch should be dropped in the case its has fewer than
+        whether the last batch should be dropped in the case it has fewer than
         `batch_size` elements; the default behavior is not to drop the smaller
         batch.
 
@@ -1029,15 +947,18 @@ class Dataset(object):
        `self.output_types`) to another nested structure of tensors.
       num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
         representing the number elements to process in parallel. If not
-        specified, elements will be processed sequentially.
+        specified, elements will be processed sequentially. If the value
+        `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
+        calls is set dynamically based on available CPU.
 
     Returns:
       Dataset: A `Dataset`.
     """
     if num_parallel_calls is None:
-      return MapDataset(self, map_func)
+      return MapDataset(self, map_func, preserve_cardinality=True)
     else:
-      return ParallelMapDataset(self, map_func, num_parallel_calls)
+      return ParallelMapDataset(
+          self, map_func, num_parallel_calls, preserve_cardinality=True)
 
   def flat_map(self, map_func):
     """Maps `map_func` across this dataset and flattens the result.
@@ -1139,7 +1060,9 @@ class Dataset(object):
       num_parallel_calls: (Optional.) If specified, the implementation creates
         a threadpool, which is used to fetch inputs from cycle elements
         asynchronously and in parallel. The default behavior is to fetch inputs
-        from cycle elements synchronously with no parallelism.
+        from cycle elements synchronously with no parallelism. If the value
+        `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
+        calls is set dynamically based on available CPU.
 
     Returns:
       Dataset: A `Dataset`.
@@ -1188,7 +1111,7 @@ class Dataset(object):
           dataset.
     """
     dataset = transformation_func(self)
-    if not isinstance(dataset, Dataset):
+    if not isinstance(dataset, DatasetV2):
       raise TypeError("`transformation_func` must return a Dataset.")
     dataset._input_datasets = [self]  # pylint: disable=protected-access
     return dataset
@@ -1272,27 +1195,23 @@ class Dataset(object):
 
     # Compute initial values for the state classes, shapes and types based on
     # the initial state.
-    state_classes = sparse.get_classes(initial_state)
-    state_shapes = nest.pack_sequence_as(
-        initial_state, [t.get_shape() for t in nest.flatten(initial_state)])
-    state_types = nest.pack_sequence_as(
-        initial_state, [t.dtype for t in nest.flatten(initial_state)])
+    state_structure = structure_lib.Structure.from_value(initial_state)
 
     # Iteratively rerun the reduce function until reaching a fixed point on
-    # `self._state_shapes`.
+    # `state_structure`.
     need_to_rerun = True
     while need_to_rerun:
 
       wrapped_func = StructuredFunctionWrapper(
           reduce_func,
           "reduce()",
-          input_classes=(state_classes, self.output_classes),
-          input_shapes=(state_shapes, self.output_shapes),
-          input_types=(state_types, self.output_types),
+          input_structure=structure_lib.NestedStructure(
+              (state_structure, self._element_structure)),
           add_to_graph=False)
 
       # Extract and validate class information from the returned values.
       output_classes = wrapped_func.output_classes
+      state_classes = state_structure._to_legacy_output_classes()  # pylint: disable=protected-access
       for new_state_class, state_class in zip(
           nest.flatten(output_classes), nest.flatten(state_classes)):
         if not issubclass(new_state_class, state_class):
@@ -1303,6 +1222,7 @@ class Dataset(object):
 
       # Extract and validate type information from the returned values.
       output_types = wrapped_func.output_types
+      state_types = state_structure._to_legacy_output_types()  # pylint: disable=protected-access
       for new_state_type, state_type in zip(
           nest.flatten(output_types), nest.flatten(state_types)):
         if new_state_type != state_type:
@@ -1313,6 +1233,7 @@ class Dataset(object):
 
       # Extract shape information from the returned values.
       output_shapes = wrapped_func.output_shapes
+      state_shapes = state_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
       flat_state_shapes = nest.flatten(state_shapes)
       flat_new_state_shapes = nest.flatten(output_shapes)
       weakened_state_shapes = [
@@ -1330,35 +1251,33 @@ class Dataset(object):
           break
 
       if need_to_rerun:
-        state_shapes = nest.pack_sequence_as(state_shapes,
-                                             weakened_state_shapes)
+        # TODO(b/110122868): Support a "most specific compatible structure"
+        # method for combining structures, to avoid using legacy structures
+        # here.
+        state_structure = structure_lib.convert_legacy_structure(
+            state_types,
+            nest.pack_sequence_as(state_shapes, weakened_state_shapes),
+            state_classes)
 
     reduce_func = wrapped_func.function
     reduce_func.add_to_graph(ops.get_default_graph())
 
-    return sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(
-            output_types,
-            gen_dataset_ops.reduce_dataset(
-                self._as_variant_tensor(),  # pylint: disable=protected-access
-                nest.flatten(sparse.serialize_sparse_tensors(initial_state)),
-                reduce_func.captured_inputs,
-                f=reduce_func,
-                output_shapes=nest.flatten(
-                    sparse.as_dense_shapes(output_shapes, output_classes)),
-                output_types=nest.flatten(
-                    sparse.as_dense_types(output_types, output_classes)))),
-        output_types,
-        output_shapes,
-        output_classes)
+    # pylint: disable=protected-access
+    return state_structure._from_compatible_tensor_list(
+        gen_dataset_ops.reduce_dataset(
+            self._as_variant_tensor(),
+            state_structure._to_tensor_list(initial_state),
+            reduce_func.captured_inputs,
+            f=reduce_func,
+            output_shapes=state_structure._flat_shapes,
+            output_types=state_structure._flat_types))
 
   def with_options(self, options):
     """Returns a new `tf.data.Dataset` with the given options set.
 
-    The options are "global" in the sense they apply to the entire input
-    pipeline in which the `with_options` transformation is used. If options are
-    set multiple times, they are merged if possible (see
-    `tf.data.Options.merge()` for details).
+    The options are "global" in the sense they apply to the entire dataset.
+    If options are set multiple times, they are merged as long as different
+    options do not use different non-default values.
 
     Args:
       options: A `tf.data.Options` that identifies the options the use.
@@ -1367,156 +1286,513 @@ class Dataset(object):
       Dataset: A `Dataset` with the given options.
 
     Raises:
-      ValueError: if options are set more than once
+      ValueError: when an option is set more than once to a non-default value
     """
     return _OptionsDataset(self, options)
 
 
-@tf_export("data.Options")
-class Options(object):
-  """Represents options for tf.data.Dataset.
+@tf_export(v1=["data.Dataset"])
+class DatasetV1(DatasetV2):
+  """Represents a potentially large set of elements.
 
-  An `Options` object can be for instance used to control which static
-  optimizations to apply or whether to use performance modeling to dynamically
-  tune the parallelism of operations such as `tf.data.Dataset.map` or
-  `tf.data.Dataset.interleave`.
+  A `Dataset` can be used to represent an input pipeline as a
+  collection of elements (nested structures of tensors) and a "logical
+  plan" of transformations that act on those elements.
   """
-  for _name, _ty, _docstring in [
-      ("experimental_autotune", bool,
-       "Whether to dynamically adjust the values of tunable parameters (e.g. "
-       "degrees of parallelism)."),
-      ("experimental_filter_fusion", bool,
-       "Whether to fuse filter transformations."),
-      ("experimental_hoist_random_uniform", bool,
-       "Whether to hoist `tf.random_uniform()` ops out of map transformations."
-      ),
-      ("experimental_latency_all_edges", bool,
-       "Whether to add latency measurements on all edges."),
-      ("experimental_map_and_batch_fusion", bool,
-       "Whether to fuse map and batch transformations."),
-      ("experimental_map_and_filter_fusion", bool,
-       "Whether to fuse map and filter transformations."),
-      ("experimental_map_fusion", bool, "Whether to fuse map transformations."),
-      ("experimental_map_parallelization", bool,
-       "Whether to parallelize stateless map transformations."),
-      ("experimental_map_vectorization", bool,
-       "Whether to vectorize map transformations."),
-      ("experimental_noop_elimination", bool,
-       "Whether to eliminate no-op transformations."),
-      ("experimental_shuffle_and_repeat_fusion", bool,
-       "Whether to fuse shuffle and repeat transformations."),
-  ]:
-
-    def _make_getter(name):  # pylint: disable=no-self-argument
-
-      def getter(self):
-        return getattr(self, "_" + name)
-
-      return getter
-
-    def _make_setter(name, ty):  # pylint: disable=no-self-argument
-
-      def setter(self, value):
-        if not isinstance(value, ty):
-          raise TypeError(
-              "Attempting to set the option %s to incompatible value: %r" %
-              (name, value))
-        setattr(self, "_" + name, value)
-
-      return setter
-
-    vars()["_" + _name] = None
-    vars()[_name] = property(
-        _make_getter(_name), _make_setter(_name, _ty), None, _docstring)
 
   def __init__(self):
     pass
 
-  def __eq__(self, other):
-    if isinstance(other, self.__class__):
-      return self.__dict__ == other.__dict__
-    else:
-      return False
-
-  def __ne__(self, other):
-    return not self.__eq__(other)
-
-  def _static_optimizations(self):
-    """Produces the list of enabled static optimizations."""
-    experimental_optimizations = [
-        "filter_fusion", "hoist_random_uniform", "latency_all_edges",
-        "map_and_batch_fusion", "map_and_filter_fusion", "map_fusion",
-        "map_parallelization", "map_vectorization", "noop_elimination",
-        "shuffle_and_repeat_fusion"
-    ]
-    result = []
-    for exp_opt in experimental_optimizations:
-      if getattr(self, "experimental_" + exp_opt):
-        result.append(exp_opt)
-    return result
-
-  def merge(self, options):
-    """Merges itself with the given `tf.data.Options`.
-
-    The given `tf.data.Options` can be merged as long as there does not exist an
-    attribute that is set to different values in `self` and `options`.
-
-    Args:
-      options: a `tf.data.Options` to merge with
+  @deprecation.deprecated(
+      None, "Use `for ... in dataset:` to iterate over a dataset. If using "
+      "`tf.estimator`, return the `Dataset` object directly from your input "
+      "function. As a last resort, you can use "
+      "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
+  def make_one_shot_iterator(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
 
-    Raises:
-      ValueError: if the given `tf.data.Options` cannot be merged
+    Note: The returned iterator will be initialized automatically.
+    A "one-shot" iterator does not currently support re-initialization.
 
     Returns:
-      New `tf.data.Options()` object which is the result of merging self with
-      the input `tf.data.Options`.
+      An `Iterator` over the elements of this dataset.
     """
-    result = Options()
-    for other in [self, options]:
-      for name in [
-          "experimental_autotune", "experimental_filter_fusion",
-          "experimental_hoist_random_uniform", "experimental_latency_all_edges",
-          "experimental_map_and_batch_fusion",
-          "experimental_map_and_filter_fusion", "experimental_map_fusion",
-          "experimental_map_parallelization", "experimental_map_vectorization",
-          "experimental_noop_elimination",
-          "experimental_shuffle_and_repeat_fusion"
-      ]:
-        this = getattr(result, name)
-        that = getattr(other, name)
-        if that is not None:
-          if this is None:
-            setattr(result, name, that)
-          elif this != that:
-            raise ValueError(
-                "Cannot merge incompatible values of option: %s" % (name))
-    return result
+    if context.executing_eagerly():
+      return iterator_ops.EagerIterator(self)
 
+    graph_level_seed, op_level_seed = core_random_seed.get_seed(None)
 
-class DatasetSource(Dataset):
-  """Abstract class representing a dataset with no inputs."""
+    # NOTE(mrry): We capture by value here to ensure that `_make_dataset()` is
+    # a 0-argument function.
+    @function.Defun(capture_by_value=True)
+    def _make_dataset():
+      """Factory function for a dataset."""
+      # NOTE(mrry): `Defun` does not capture the graph-level seed from the
+      # enclosing graph, so if a graph-level seed is present we set the local
+      # graph seed based on a combination of the graph- and op-level seeds.
+      if graph_level_seed is not None:
+        assert op_level_seed is not None
+        core_random_seed.set_random_seed(
+            (graph_level_seed + 87654321 * op_level_seed) % (2 ** 63 - 1))
+
+      dataset = self._apply_options()
+      return dataset._as_variant_tensor()  # pylint: disable=protected-access
 
-  def _inputs(self):
-    return []
+    try:
+      _make_dataset.add_to_graph(ops.get_default_graph())
+    except ValueError as err:
+      if "Cannot capture a stateful node" in str(err):
+        raise ValueError(
+            "Failed to create a one-shot iterator for a dataset. "
+            "`Dataset.make_one_shot_iterator()` does not support datasets that "
+            "capture stateful objects, such as a `Variable` or `LookupTable`. "
+            "In these cases, use `Dataset.make_initializable_iterator()`. "
+            "(Original error: %s)" % err)
+      else:
+        six.reraise(ValueError, err)
 
+    return iterator_ops.Iterator(
+        gen_dataset_ops.one_shot_iterator(
+            dataset_factory=_make_dataset, **flat_structure(self)),
+        None, self.output_types, self.output_shapes, self.output_classes)
 
-class UnaryDataset(Dataset):
-  """Abstract class representing a dataset with one input."""
+  @deprecation.deprecated(
+      None, "Use `for ... in dataset:` to iterate over a dataset. If using "
+      "`tf.estimator`, return the `Dataset` object directly from your input "
+      "function. As a last resort, you can use "
+      "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
+  def make_initializable_iterator(self, shared_name=None):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
 
-  def __init__(self, input_dataset):
-    super(UnaryDataset, self).__init__()
-    self._input_dataset = input_dataset
+    Note: The returned iterator will be in an uninitialized state,
+    and you must run the `iterator.initializer` operation before using it:
 
-  def _inputs(self):
-    return [self._input_dataset]
+    ```python
+    dataset = ...
+    iterator = dataset.make_initializable_iterator()
+    # ...
+    sess.run(iterator.initializer)
+    ```
 
+    Args:
+      shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
 
-class TensorDataset(DatasetSource):
-  """A `Dataset` with a single element, viz. a nested structure of tensors."""
+    Returns:
+      An `Iterator` over the elements of this dataset.
 
-  def __init__(self, tensors):
-    """See `Dataset.from_tensors()` for details."""
-    super(TensorDataset, self).__init__()
+    Raises:
+      RuntimeError: If eager execution is enabled.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "dataset.make_initializable_iterator is not supported when eager "
+          "execution is enabled.")
+    dataset = self._apply_options()
+    if shared_name is None:
+      shared_name = ""
+    if compat.forward_compatible(2018, 8, 3):
+      iterator_resource = gen_dataset_ops.iterator_v2(
+          container="", shared_name=shared_name, **flat_structure(self))
+    else:
+      iterator_resource = gen_dataset_ops.iterator(
+          container="", shared_name=shared_name, **flat_structure(self))
+    with ops.colocate_with(iterator_resource):
+      initializer = gen_dataset_ops.make_iterator(
+          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          iterator_resource)
+    return iterator_ops.Iterator(iterator_resource, initializer,
+                                 dataset.output_types, dataset.output_shapes,
+                                 dataset.output_classes)
+
+  @property
+  def _element_structure(self):
+    # TODO(b/110122868): Remove this override once all `Dataset` instances
+    # implement `element_structure`.
+    return structure_lib.convert_legacy_structure(
+        self.output_types, self.output_shapes, self.output_classes)
+
+  @staticmethod
+  @functools.wraps(DatasetV2.from_tensors)
+  def from_tensors(tensors):
+    return DatasetV1Adapter(DatasetV2.from_tensors(tensors))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.from_tensor_slices)
+  def from_tensor_slices(tensors):
+    return DatasetV1Adapter(DatasetV2.from_tensor_slices(tensors))
+
+  @staticmethod
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
+  def from_sparse_tensor_slices(sparse_tensor):
+    """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
+
+    Args:
+      sparse_tensor: A `tf.SparseTensor`.
+
+    Returns:
+      Dataset: A `Dataset` of rank-(N-1) sparse tensors.
+    """
+    return DatasetV1Adapter(SparseTensorSliceDataset(sparse_tensor))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.from_generator)
+  def from_generator(generator, output_types, output_shapes=None, args=None):
+    return DatasetV1Adapter(DatasetV2.from_generator(
+        generator, output_types, output_shapes, args))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.range)
+  def range(*args):
+    return DatasetV1Adapter(DatasetV2.range(*args))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.zip)
+  def zip(datasets):
+    return DatasetV1Adapter(DatasetV2.zip(datasets))
+
+  @functools.wraps(DatasetV2.concatenate)
+  def concatenate(self, dataset):
+    return DatasetV1Adapter(super(DatasetV1, self).concatenate(dataset))
+
+  @functools.wraps(DatasetV2.prefetch)
+  def prefetch(self, buffer_size):
+    return DatasetV1Adapter(super(DatasetV1, self).prefetch(buffer_size))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.list_files)
+  def list_files(file_pattern, shuffle=None, seed=None):
+    return DatasetV1Adapter(DatasetV2.list_files(file_pattern, shuffle, seed))
+
+  @functools.wraps(DatasetV2.repeat)
+  def repeat(self, count=None):
+    return DatasetV1Adapter(super(DatasetV1, self).repeat(count))
+
+  @functools.wraps(DatasetV2.shuffle)
+  def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None):
+    return DatasetV1Adapter(super(DatasetV1, self).shuffle(
+        buffer_size, seed, reshuffle_each_iteration))
+
+  @functools.wraps(DatasetV2.cache)
+  def cache(self, filename=""):
+    return DatasetV1Adapter(super(DatasetV1, self).cache(filename))
+
+  @functools.wraps(DatasetV2.take)
+  def take(self, count):
+    return DatasetV1Adapter(super(DatasetV1, self).take(count))
+
+  @functools.wraps(DatasetV2.skip)
+  def skip(self, count):
+    return DatasetV1Adapter(super(DatasetV1, self).skip(count))
+
+  @deprecation.deprecated(
+      None, "Use `dataset.apply(tf.data.experimental.filter_for_shard(...))`.")
+  def shard(self, num_shards, index):
+    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
+
+    This dataset operator is very useful when running distributed training, as
+    it allows each worker to read a unique subset.
+
+    When reading a single input file, you can skip elements as follows:
+
+    ```python
+    d = tf.data.TFRecordDataset(FLAGS.input_file)
+    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
+    d = d.repeat(FLAGS.num_epochs)
+    d = d.shuffle(FLAGS.shuffle_buffer_size)
+    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
+    ```
+
+    Important caveats:
+
+    - Be sure to shard before you use any randomizing operator (such as
+      shuffle).
+    - Generally it is best if the shard operator is used early in the dataset
+      pipeline. For example, when reading from a set of TFRecord files, shard
+      before converting the dataset to input samples. This avoids reading every
+      file on every worker. The following is an example of an efficient
+      sharding strategy within a complete pipeline:
+
+    ```python
+    d = Dataset.list_files(FLAGS.pattern)
+    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
+    d = d.repeat(FLAGS.num_epochs)
+    d = d.shuffle(FLAGS.shuffle_buffer_size)
+    d = d.interleave(tf.data.TFRecordDataset,
+                     cycle_length=FLAGS.num_readers, block_length=1)
+    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
+    ```
+
+    Args:
+      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        shards operating in parallel.
+      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
+
+    Returns:
+      Dataset: A `Dataset`.
+
+    Raises:
+      ValueError: if `num_shards` or `index` are illegal values. Note: error
+        checking is done on a best-effort basis, and errors aren't guaranteed
+        to be caught upon dataset creation. (e.g. providing in a placeholder
+        tensor bypasses the early checking, and will instead result in an error
+        during a session.run call.)
+    """
+    return self.apply(filter_for_shard_ops.filter_for_shard(num_shards, index))
+
+  @functools.wraps(DatasetV2.batch)
+  def batch(self, batch_size, drop_remainder=False):
+    return DatasetV1Adapter(super(DatasetV1, self).batch(
+        batch_size, drop_remainder))
+
+  @functools.wraps(DatasetV2.padded_batch)
+  def padded_batch(self,
+                   batch_size,
+                   padded_shapes,
+                   padding_values=None,
+                   drop_remainder=False):
+    return DatasetV1Adapter(super(DatasetV1, self).padded_batch(
+        batch_size, padded_shapes, padding_values, drop_remainder))
+
+  @functools.wraps(DatasetV2.map)
+  def map(self, map_func, num_parallel_calls=None):
+    if num_parallel_calls is None:
+      return DatasetV1Adapter(
+          MapDataset(self, map_func, preserve_cardinality=False))
+    else:
+      return DatasetV1Adapter(
+          ParallelMapDataset(
+              self, map_func, num_parallel_calls, preserve_cardinality=False))
+
+  @functools.wraps(DatasetV2.flat_map)
+  def flat_map(self, map_func):
+    return DatasetV1Adapter(super(DatasetV1, self).flat_map(map_func))
+
+  @functools.wraps(DatasetV2.interleave)
+  def interleave(self,
+                 map_func,
+                 cycle_length,
+                 block_length=1,
+                 num_parallel_calls=None):
+    return DatasetV1Adapter(super(DatasetV1, self).interleave(
+        map_func, cycle_length, block_length, num_parallel_calls))
+
+  @functools.wraps(DatasetV2.filter)
+  def filter(self, predicate):
+    return DatasetV1Adapter(super(DatasetV1, self).filter(predicate))
+
+  @functools.wraps(DatasetV2.apply)
+  def apply(self, transformation_func):
+    return DatasetV1Adapter(super(DatasetV1, self).apply(transformation_func))
+
+  @functools.wraps(DatasetV2.window)
+  def window(self, size, shift=None, stride=1, drop_remainder=False):
+    return DatasetV1Adapter(super(DatasetV1, self).window(
+        size, shift, stride, drop_remainder))
+
+  @functools.wraps(DatasetV2.with_options)
+  def with_options(self, options):
+    return DatasetV1Adapter(super(DatasetV1, self).with_options(options))
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# this alias in place.
+Dataset = DatasetV1
+
+
+class DatasetV1Adapter(DatasetV1):
+  """Wraps a V2 `Dataset` object in the `tf.compat.v1.data.Dataset` API."""
+
+  def __init__(self, dataset):
+    super(DatasetV1Adapter, self).__init__()
+    self._dataset = dataset
+
+  def _as_variant_tensor(self):
+    return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
+
+  def _has_captured_ref(self):
+    return self._dataset._has_captured_ref()  # pylint: disable=protected-access
+
+  def _inputs(self):
+    return self._dataset._inputs()  # pylint: disable=protected-access
+
+  def options(self):
+    return self._dataset.options()
+
+  @property
+  def _element_structure(self):
+    return self._dataset._element_structure  # pylint: disable=protected-access
+
+  def __iter__(self):
+    return iter(self._dataset)
+
+
+@tf_export(v1=["data.make_one_shot_iterator"])
+def make_one_shot_iterator(dataset):
+  """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
+
+  Note: The returned iterator will be initialized automatically.
+  A "one-shot" iterator does not support re-initialization.
+
+  Args:
+    dataset: A `tf.data.Dataset`.
+
+  Returns:
+    A `tf.data.Iterator` over the elements of this dataset.
+  """
+  try:
+    # Call the defined `make_one_shot_iterator()` if there is one, because some
+    # datasets (e.g. for prefetching) override its behavior.
+    return dataset.make_one_shot_iterator()
+  except AttributeError:
+    return DatasetV1Adapter(dataset).make_one_shot_iterator()
+
+
+@tf_export(v1=["data.make_initializable_iterator"])
+def make_initializable_iterator(dataset):
+  """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
+
+  Note: The returned iterator will be in an uninitialized state,
+  and you must run the `iterator.initializer` operation before using it:
+
+  ```python
+  dataset = ...
+  iterator = dataset.make_initializable_iterator()
+  # ...
+  sess.run(iterator.initializer)
+  ```
+
+  Args:
+    dataset: A `tf.data.Dataset`.
+
+  Returns:
+    A `tf.data.Iterator` over the elements of `dataset`.
+
+  Raises:
+    RuntimeError: If eager execution is enabled.
+  """
+  try:
+    # Call the defined `make_one_shot_iterator()` if there is one, because some
+    # datasets (e.g. for prefetching) override its behavior.
+    return dataset.make_initializable_iterator()
+  except AttributeError:
+    return DatasetV1Adapter(dataset).make_initializable_iterator()
+
+
+@tf_export("data.Options")
+class Options(options_lib.OptionsBase):
+  """Represents options for tf.data.Dataset.
+
+  An `Options` object can be, for instance, used to control which static
+  optimizations to apply or whether to use performance modeling to dynamically
+  tune the parallelism of operations such as `tf.data.Dataset.map` or
+  `tf.data.Dataset.interleave`.
+  """
+
+  experimental_autotune = options_lib.create_option(
+      name="experimental_autotune",
+      ty=bool,
+      docstring=
+      "Whether to dynamically adjust the values of tunable parameters (e.g. "
+      "degrees of parallelism).")
+
+  experimental_deterministic = options_lib.create_option(
+      name="experimental_deterministic",
+      ty=bool,
+      docstring=
+      "Whether the outputs need to be produced in deterministic order."
+  )
+
+  experimental_numa_aware = options_lib.create_option(
+      name="experimental_numa_aware",
+      ty=bool,
+      docstring="Whether to use NUMA-aware operations.")
+
+  experimental_optimization = options_lib.create_option(
+      name="experimental_optimization",
+      ty=optimization_options.OptimizationOptions,
+      docstring="Associates the given optimization options with the dataset.")
+
+  experimental_stats = options_lib.create_option(
+      name="experimental_stats",
+      ty=stats_options.StatsOptions,
+      docstring="Associates the given statistics options with the dataset.")
+
+  experimental_threading = options_lib.create_option(
+      name="experimental_threading",
+      ty=threading_options.ThreadingOptions,
+      docstring="Associates the given threading options with the dataset.")
+
+  def _static_optimizations(self):
+    """Produces the list of enabled static optimizations."""
+
+    result = []
+    exp_optimization_options = (
+        self.experimental_optimization or
+        optimization_options.OptimizationOptions())  # If not set, use default
+    result.extend(exp_optimization_options._static_optimizations())  # pylint: disable=protected-access
+
+    if self.experimental_numa_aware:
+      result.append("make_numa_aware")
+    if self.experimental_deterministic is False:
+      result.append("make_sloppy")
+    exp_stats_options = self.experimental_stats
+    if exp_stats_options and exp_stats_options.latency_all_edges:
+      result.append("latency_all_edges")
+    return result
+
+  def merge(self, options):
+    """Merges itself with the given `tf.data.Options`.
+
+    The given `tf.data.Options` can be merged as long as there does not exist an
+    attribute that is set to different values in `self` and `options`.
+
+    Args:
+      options: a `tf.data.Options` to merge with
+
+    Raises:
+      ValueError: if the given `tf.data.Options` cannot be merged
+
+    Returns:
+      New `tf.data.Options()` object which is the result of merging self with
+      the input `tf.data.Options`.
+    """
+    return options_lib.merge_options(self, options)
+
+
+class DatasetSource(DatasetV2):
+  """Abstract class representing a dataset with no inputs."""
+
+  def _inputs(self):
+    return []
+
+
+class UnaryDataset(DatasetV2):
+  """Abstract class representing a dataset with one input."""
+
+  def __init__(self, input_dataset):
+    super(UnaryDataset, self).__init__()
+    self._input_dataset = input_dataset
+
+  def _inputs(self):
+    return [self._input_dataset]
+
+
+class UnaryUnchangedStructureDataset(UnaryDataset):
+  """Represents a unary dataset with the same input and output structure."""
+
+  @property
+  def _element_structure(self):
+    return self._input_dataset._element_structure  # pylint: disable=protected-access
+
+
+class TensorDataset(DatasetSource):
+  """A `Dataset` with a single element, viz. a nested structure of tensors."""
+
+  def __init__(self, tensors):
+    """See `Dataset.from_tensors()` for details."""
+    super(TensorDataset, self).__init__()
     with ops.name_scope("tensors"):
       tensors = nest.pack_sequence_as(tensors, [
           sparse_tensor_lib.SparseTensor.from_value(t)
@@ -1524,31 +1800,16 @@ class TensorDataset(DatasetSource):
               t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
       ])
-
-    self._tensors = sparse.serialize_sparse_tensors(tensors)
-    self._output_classes = sparse.get_classes(tensors)
-    self._output_shapes = nest.pack_sequence_as(
-        tensors, [t.get_shape() for t in nest.flatten(tensors)])
-    self._output_types = nest.pack_sequence_as(
-        tensors, [t.dtype for t in nest.flatten(tensors)])
+    self._structure = structure_lib.Structure.from_value(tensors)
+    self._tensors = self._structure._to_tensor_list(tensors)  # pylint: disable=protected-access
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.tensor_dataset(
-        nest.flatten(self._tensors),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+        self._tensors, output_shapes=self._structure._flat_shapes)  # pylint: disable=protected-access
 
   @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class TensorSliceDataset(DatasetSource):
@@ -1564,35 +1825,26 @@ class TensorSliceDataset(DatasetSource):
               t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
       ])
-      flat_tensors = nest.flatten(tensors)
-
-    batch_dim = flat_tensors[0].get_shape()[0]
-    for t in flat_tensors[1:]:
-      batch_dim.assert_is_compatible_with(t.get_shape()[0])
-    self._tensors = sparse.serialize_many_sparse_tensors(tensors)
-    self._output_classes = sparse.get_classes(tensors)
-    self._output_shapes = nest.pack_sequence_as(
-        tensors, [t.get_shape()[1:] for t in nest.flatten(tensors)])
-    self._output_types = nest.pack_sequence_as(
-        tensors, [t.dtype for t in nest.flatten(tensors)])
 
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.tensor_slice_dataset(
-        nest.flatten(self._tensors),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+    batched_structure = structure_lib.Structure.from_value(tensors)
+    # pylint: disable=protected-access
+    self._tensors = batched_structure._to_batched_tensor_list(tensors)
+    self._structure = batched_structure._unbatch()
+    # pylint: enable=protected-access
 
-  @property
-  def output_classes(self):
-    return self._output_classes
+    batch_dim = tensor_shape.Dimension(tensor_shape.dimension_value(
+        self._tensors[0].get_shape()[0]))
+    for t in self._tensors[1:]:
+      batch_dim.assert_is_compatible_with(tensor_shape.Dimension(
+          tensor_shape.dimension_value(t.get_shape()[0])))
 
-  @property
-  def output_shapes(self):
-    return self._output_shapes
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.tensor_slice_dataset(
+        self._tensors, output_shapes=self._structure._flat_shapes)  # pylint: disable=protected-access
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class SparseTensorSliceDataset(DatasetSource):
@@ -1605,120 +1857,121 @@ class SparseTensorSliceDataset(DatasetSource):
       raise TypeError("`sparse_tensor` must be a `tf.SparseTensor` object.")
     self._sparse_tensor = sparse_tensor
 
+    indices_shape = self._sparse_tensor.indices.get_shape()
+    shape_shape = self._sparse_tensor.dense_shape.get_shape()
+    rank = (indices_shape.dims[1] - 1).merge_with(shape_shape.dims[0] - 1)
+    self._structure = structure_lib.NestedStructure(
+        (structure_lib.TensorStructure(dtypes.int64, [None, rank]),
+         structure_lib.TensorStructure(self._sparse_tensor.dtype, [None]),
+         structure_lib.TensorStructure(dtypes.int64, [rank])))
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.sparse_tensor_slice_dataset(
         self._sparse_tensor.indices, self._sparse_tensor.values,
         self._sparse_tensor.dense_shape)
 
   @property
-  def output_classes(self):
-    return (ops.Tensor, ops.Tensor, ops.Tensor)
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_shapes(self):
-    indices_shape = self._sparse_tensor.indices.get_shape()
-    shape_shape = self._sparse_tensor.dense_shape.get_shape()
-    rank = (indices_shape[1] - 1).merge_with(shape_shape[0] - 1)
-    num_values = tensor_shape.Dimension(None)
-    return (tensor_shape.TensorShape([num_values, rank]),
-            tensor_shape.TensorShape([num_values]),
-            tensor_shape.TensorShape([rank]))
 
-  @property
-  def output_types(self):
-    return (dtypes.int64, self._sparse_tensor.dtype, dtypes.int64)
+class _VariantDataset(DatasetV2):
+  """A Dataset wrapper around a `tf.variant`-typed function argument."""
 
+  def __init__(self, dataset_variant, structure):
+    super(_VariantDataset, self).__init__()
+    self._dataset_variant = dataset_variant
+    self._structure = structure
 
-class _NestedDatasetComponent(object):
-  """The structure of a `Dataset` nested in a component of another `Dataset`.
+  def _as_variant_tensor(self):
+    return self._dataset_variant
 
-  A `StructuredFunctionWrapper` around a function that returns a `Dataset` as
-  one of its components will have a `NestedDatasetComponent` in the
-  corresponding position in the `output_classes`, `output_shapes`, and
-  `output_types` properties.
+  def _inputs(self):
+    return []
 
-  NOTE(mrry): This class is not currently exposed via the public API. Support
-  for nested datasets can be enabled on a function-by-function basis by setting
-  `experimental_nested_dataset_support=True` in the `StructuredFunctionWrapper`
-  initializer.
+  @property
+  def _element_structure(self):
+    return self._structure
 
-  TODO(b/110122868): Add this class, or something equivalent, to the public API.
-  We are considering revising the public API for accessing Dataset structure
-  (`output_classes` etc.) based on experience with nested datasets and other
-  custom component types.
-  """
 
-  def __init__(self,
-               dataset=None,
-               output_shapes=None,
-               output_types=None,
-               output_classes=None):
-    if dataset is None:
-      if (output_classes is None or output_shapes is None or
-          output_types is None):
-        raise ValueError(
-            "Either `dataset`, or all of `output_classes`, "
-            "`output_shapes`, and `output_types` must be specified.")
-      self._output_classes = output_classes
-      self._output_shapes = output_shapes
-      self._output_types = output_types
-    else:
-      if not (output_classes is None and output_shapes is None and
-              output_types is None):
-        raise ValueError(
-            "Either `dataset`, or all of `output_classes`, "
-            "`output_shapes`, and `output_types` must be specified.")
-      self._output_classes = dataset.output_classes
-      self._output_shapes = dataset.output_shapes
-      self._output_types = dataset.output_types
+@tf_export("data.experimental.DatasetStructure")
+class DatasetStructure(structure_lib.Structure):
+  """Represents a `Dataset` of structured values."""
 
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def __init__(self, element_structure):
+    self._element_structure = element_structure
 
   @property
-  def output_shapes(self):
-    return self._output_shapes
+  def _flat_shapes(self):
+    return [tensor_shape.scalar()]
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _flat_types(self):
+    return [dtypes.variant]
 
+  def is_compatible_with(self, other):
+    # pylint: disable=protected-access
+    return (isinstance(other, DatasetStructure) and
+            self._element_structure.is_compatible_with(
+                other._element_structure))
 
-class _VariantDataset(Dataset):
-  """A Dataset wrapper around a `tf.variant`-typed function argument."""
+  def _to_tensor_list(self, value):
+    return [value._as_variant_tensor()]  # pylint: disable=protected-access
 
-  def __init__(self, dataset_variant, structure):
-    super(_VariantDataset, self).__init__()
-    self._dataset_variant = dataset_variant
-    self._structure = structure
+  def _to_batched_tensor_list(self, value):
+    raise NotImplementedError("Unbatching for `tf.data.Dataset` objects.")
 
-  def _as_variant_tensor(self):
-    return self._dataset_variant
+  def _from_tensor_list(self, flat_value):
+    if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
+        not flat_value[0].shape.is_compatible_with(tensor_shape.scalar())):
+      raise ValueError(
+          "DatasetStructure corresponds to a single tf.variant scalar.")
+    return self._from_compatible_tensor_list(flat_value)
 
-  def _inputs(self):
-    return []
+  def _from_compatible_tensor_list(self, flat_value):
+    # pylint: disable=protected-access
+    return _VariantDataset(flat_value[0], self._element_structure)
+
+  @staticmethod
+  def from_value(value):
+    return DatasetStructure(value._element_structure)  # pylint: disable=protected-access
+
+  def _to_legacy_output_types(self):
+    return self
+
+  def _to_legacy_output_shapes(self):
+    return self
+
+  def _to_legacy_output_classes(self):
+    return self
 
-  @property
-  def output_classes(self):
-    return self._structure.output_classes
+  def _batch(self, batch_size):
+    raise NotImplementedError("Batching for `tf.data.Dataset` objects.")
 
-  @property
-  def output_shapes(self):
-    return self._structure.output_shapes
+  def _unbatch(self):
+    raise NotImplementedError("Unbatching for `tf.data.Dataset` objects.")
 
-  @property
-  def output_types(self):
-    return self._structure.output_types
+
+# pylint: disable=protected-access
+structure_lib.Structure._register_custom_converter(DatasetV2,
+                                                   DatasetStructure.from_value)
+# pylint: enable=protected-access
 
 
 class StructuredFunctionWrapper(object):
   """A wrapper for `Defun` that supports structured arguments and return values.
   """
 
-  def __init__(self, func, transformation_name, dataset=None,
-               input_classes=None, input_shapes=None, input_types=None,
-               add_to_graph=True, experimental_nested_dataset_support=False):
+  def __init__(self,
+               func,
+               transformation_name,
+               dataset=None,
+               input_classes=None,
+               input_shapes=None,
+               input_types=None,
+               input_structure=None,
+               add_to_graph=True,
+               defun_kwargs=None):
     """Creates a new `StructuredFunctionWrapper` for the given function.
 
     Args:
@@ -1735,59 +1988,60 @@ class StructuredFunctionWrapper(object):
         arguments.
       input_types: (Optional.) A nested structure of `tf.DType`. If given, this
         argument defines the element types and structure for `func` arguments.
+      input_structure: (Optional.) A `Structure` object. If given, this argument
+        defines the element types and structure for `func` arguments.
       add_to_graph: (Optional.) If `True`, the function will be added to the
         default graph.
-      experimental_nested_dataset_support: (Optional.) If `True`, the function
-        will support `tf.data.Dataset` objects as arguments and return values.
+      defun_kwargs: (Optional.) A dictionary mapping string argument names to
+        values. If supplied, will be passed to `function.Defun()` as keyword
+        arguments.
 
     Raises:
       ValueError: If an invalid combination of `dataset`, `input_classes`,
         `input_shapes`, and `input_types` is passed.
     """
-    if dataset is None:
-      if input_classes is None or input_shapes is None or input_types is None:
-        raise ValueError("Either `dataset`, or all of `input_classes`, "
-                         "`input_shapes`, and `input_types` must be specified.")
-      self._input_shapes = input_shapes
-      self._input_types = input_types
-      self._input_classes = input_classes
+    if input_structure is None:
+      if dataset is None:
+        if input_classes is None or input_shapes is None or input_types is None:
+          raise ValueError("Either `dataset`, `input_structure` or all of "
+                           "`input_classes`, `input_shapes`, and `input_types` "
+                           "must be specified.")
+        self._input_structure = structure_lib.convert_legacy_structure(
+            input_types, input_shapes, input_classes)
+      else:
+        if not (input_classes is None and input_shapes is None and
+                input_types is None):
+          raise ValueError("Either `dataset`, `input_structure` or all of "
+                           "`input_classes`, `input_shapes`, and `input_types` "
+                           "must be specified.")
+        self._input_structure = dataset._element_structure  # pylint: disable=protected-access
     else:
-      if not (input_classes is None and input_shapes is None and
-              input_types is None):
-        raise ValueError("Either `dataset`, or all of `input_classes`, "
-                         "`input_shapes`, and `input_types` must be specified.")
-      self._input_shapes = dataset.output_shapes
-      self._input_types = dataset.output_types
-      self._input_classes = dataset.output_classes
+      if not (dataset is None and input_classes is None and input_shapes is None
+              and input_types is None):
+        raise ValueError("Either `dataset`, `input_structure`, or all of "
+                         "`input_classes`, `input_shapes`, and `input_types` "
+                         "must be specified.")
+      self._input_structure = input_structure
 
     self._transformation_name = transformation_name
+    readable_transformation_name = transformation_name.replace(
+        ".", "_")[:-2] if len(transformation_name) > 2 else ""
+    self._func_name = "_".join([
+        readable_transformation_name,
+        function_utils.get_func_name(func),
+        str(ops.uid())
+    ])
 
-    # TODO(b/110122868): Enable this support for all `tf.data` functions.
-    self._nested_dataset_support = experimental_nested_dataset_support
+    if defun_kwargs is None:
+      defun_kwargs = {}
 
-    @function.Defun(*self._defun_args())
+    @function.Defun(
+        *self._input_structure._flat_types, func_name=self._func_name,  # pylint: disable=protected-access
+        **defun_kwargs)
     def tf_data_structured_function_wrapper(*args):
       """Wrapper for passing nested structures to and from tf.data functions."""
-      flat_args = []
-      for arg, arg_class, arg_shape, arg_type in zip(
-          args,
-          nest.flatten(self._input_classes),
-          nest.flatten(self._input_shapes),
-          nest.flatten(self._input_types)):
-        # TODO(b/110122868): Add a registration mechanism for new component
-        # types.
-        if arg_class is sparse_tensor_lib.SparseTensor:
-          arg = sparse.deserialize_sparse_tensors(
-              arg, arg_type, arg_shape, arg_class)
-          arg.indices.set_shape([None, arg_shape.ndims])
-          arg.dense_shape.set_shape([arg_shape.ndims])
-        elif isinstance(arg_class, _NestedDatasetComponent):
-          assert self._nested_dataset_support
-          arg = _VariantDataset(arg, arg_class)
-        else:
-          arg.set_shape(arg_shape)
-        flat_args.append(arg)
-      nested_args = nest.pack_sequence_as(self._input_classes, flat_args)
+      # pylint: disable=protected-access
+      nested_args = self._input_structure._from_compatible_tensor_list(args)
       if not _should_unpack_args(nested_args):
         nested_args = (nested_args,)
 
@@ -1805,55 +2059,14 @@ class StructuredFunctionWrapper(object):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
-      # values to tensors.
-      flat_ret = []
-      flat_classes = []
-      flat_shapes = []
-      flat_types = []
-      for t in nest.flatten(ret):
-        # TODO(b/110122868): Add a registration mechanism for new component
-        # types.
-        if sparse_tensor_lib.is_sparse(t):
-          t = sparse_tensor_lib.SparseTensor.from_value(t)
-          flat_ret.append(sparse.serialize_sparse_tensors(t))
-          flat_classes.append(sparse_tensor_lib.SparseTensor)
-          flat_shapes.append(t.get_shape())
-          flat_types.append(t.dtype)
-        elif isinstance(t, Dataset):
-          if not self._nested_dataset_support:
-            raise NotImplementedError(
-                "The %s transformation does not currently support nested "
-                "datasets as outputs." % self._transformation_name)
-
-          flat_ret.append(t._as_variant_tensor())  # pylint: disable=protected-access
-          component = _NestedDatasetComponent(t)
-          flat_classes.append(component)
-          flat_shapes.append(component)
-          flat_types.append(component)
-          if t.options() != Options():
-            warnings.warn("Encountered a nested dataset with non-default "
-                          "options. These options will not be propagated to "
-                          "the outer dataset.")
-        else:
-          try:
-            t = ops.convert_to_tensor(t)
-          except (ValueError, TypeError):
-            raise TypeError("Unsupported return value from function passed to "
-                            "%s: %s." % (transformation_name, t))
-          flat_ret.append(t)
-          flat_classes.append(ops.Tensor)
-          flat_shapes.append(t.get_shape())
-          flat_types.append(t.dtype)
-
-      ret = nest.pack_sequence_as(ret, flat_ret)
-      self._output_classes = nest.pack_sequence_as(ret, flat_classes)
-      self._output_shapes = nest.pack_sequence_as(ret, flat_shapes)
-      self._output_types = nest.pack_sequence_as(ret, flat_types)
+      try:
+        self._output_structure = structure_lib.Structure.from_value(ret)
+      except (ValueError, TypeError):
+        raise TypeError("Unsupported return value from function passed to "
+                        "%s: %s." % (transformation_name, ret))
 
       _warn_if_collections(transformation_name)
-
-      return flat_ret
+      return self._output_structure._to_tensor_list(ret)
 
     self._function = tf_data_structured_function_wrapper
     if add_to_graph:
@@ -1864,36 +2077,21 @@ class StructuredFunctionWrapper(object):
       # in case (e.g.) we need to rerun the function.
       self._function._create_definition_if_needed()  # pylint: disable=protected-access
 
-  def _defun_args(self):
-    """Returns a flat list of `tf.DType` for the input element structure."""
-    ret = []
-    for input_type, input_class in zip(nest.flatten(self._input_types),
-                                       nest.flatten(self._input_classes)):
-      # TODO(b/110122868): Add a registration mechanism for new component types.
-      if input_class is sparse_tensor_lib.SparseTensor:
-        ret.append(dtypes.variant)
-      elif isinstance(input_class, _NestedDatasetComponent):
-        if not self._nested_dataset_support:
-          raise NotImplementedError(
-              "The %s transformation does not currently support nested "
-              "datasets as inputs." % self._transformation_name)
-        ret.append(dtypes.variant)
-      else:
-        assert isinstance(input_type, dtypes.DType)
-        ret.append(input_type)
-    return ret
+  @property
+  def output_structure(self):
+    return self._output_structure
 
   @property
   def output_classes(self):
-    return self._output_classes
+    return self._output_structure._to_legacy_output_classes()  # pylint: disable=protected-access
 
   @property
   def output_shapes(self):
-    return self._output_shapes
+    return self._output_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
 
   @property
   def output_types(self):
-    return self._output_types
+    return self._output_structure._to_legacy_output_types()  # pylint: disable=protected-access
 
   @property
   def function(self):
@@ -1916,30 +2114,11 @@ def flat_structure(dataset):
     A dictionary of keyword arguments that can be passed to many Dataset op
     constructors.
   """
-  output_classes = []
-  output_shapes = []
-  output_types = []
-  for output_class, output_shape, output_type in zip(
-      nest.flatten(dataset.output_classes), nest.flatten(dataset.output_shapes),
-      nest.flatten(dataset.output_types)):
-    if isinstance(output_class, _NestedDatasetComponent):
-      output_classes.append(output_class.output_classes)
-      output_shapes.append(output_shape.output_shapes)
-      output_types.append(output_type.output_types)
-    else:
-      output_classes.append(output_class)
-      output_shapes.append(output_shape)
-      output_types.append(output_type)
-
-  output_classes = nest.pack_sequence_as(dataset.output_classes, output_classes)
-  output_shapes = nest.pack_sequence_as(dataset.output_shapes, output_shapes)
-  output_types = nest.pack_sequence_as(dataset.output_types, output_types)
-
+  # pylint: disable=protected-access
+  structure = dataset._element_structure
   return {
-      "output_shapes":
-          nest.flatten(sparse.as_dense_shapes(output_shapes, output_classes)),
-      "output_types":
-          nest.flatten(sparse.as_dense_types(output_types, output_classes)),
+      "output_shapes": structure._flat_shapes,
+      "output_types": structure._flat_types,
   }
 
 
@@ -1962,71 +2141,52 @@ class _GeneratorDataset(DatasetSource):
         destroyed. The return value is ignored.
     """
     super(_GeneratorDataset, self).__init__()
-    # These members will be initialized by `tf_init_func`.
-    self._state_classes = None
-    self._state_shapes = None
-    self._state_types = None
-
     self._init_args = init_args
 
-    init_args_classes = sparse.get_classes(init_args)
-    init_args_shapes = nest.pack_sequence_as(
-        init_args, [t.get_shape() for t in nest.flatten(init_args)])
-    init_args_types = nest.pack_sequence_as(
-        init_args, [t.dtype for t in nest.flatten(init_args)])
-
-    wrapped_init_func = StructuredFunctionWrapper(
-        init_func, "GeneratorDataset", input_classes=init_args_classes,
-        input_shapes=init_args_shapes, input_types=init_args_types)
-    self._state_classes = wrapped_init_func.output_classes
-    self._state_shapes = wrapped_init_func.output_shapes
-    self._state_types = wrapped_init_func.output_types
-    self._init_func = wrapped_init_func.function
-
-    wrapped_next_func = StructuredFunctionWrapper(
-        next_func, "GeneratorDataset", input_classes=self._state_classes,
-        input_shapes=self._state_shapes, input_types=self._state_types)
-    self._output_classes = wrapped_next_func.output_classes
-    self._output_shapes = wrapped_next_func.output_shapes
-    self._output_types = wrapped_next_func.output_types
-    self._next_func = wrapped_next_func.function
-
-    wrapped_finalize_func = StructuredFunctionWrapper(
-        finalize_func, "GeneratorDataset", input_classes=self._state_classes,
-        input_shapes=self._state_shapes, input_types=self._state_types)
-    self._finalize_func = wrapped_finalize_func.function
+    self._init_structure = structure_lib.Structure.from_value(init_args)
+
+    self._init_func = StructuredFunctionWrapper(
+        init_func,
+        self._transformation_name(),
+        input_structure=self._init_structure)
+
+    self._next_func = StructuredFunctionWrapper(
+        next_func,
+        self._transformation_name(),
+        input_structure=self._init_func.output_structure)
+
+    self._finalize_func = StructuredFunctionWrapper(
+        finalize_func,
+        self._transformation_name(),
+        input_structure=self._init_func.output_structure)
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.generator_dataset(
-        nest.flatten(self._init_args) + self._init_func.captured_inputs,
-        self._next_func.captured_inputs,
-        self._finalize_func.captured_inputs,
-        init_func=self._init_func,
-        next_func=self._next_func,
-        finalize_func=self._finalize_func,
+        self._init_structure._to_tensor_list(self._init_args)  # pylint: disable=protected-access
+        + self._init_func.function.captured_inputs,
+        self._next_func.function.captured_inputs,
+        self._finalize_func.function.captured_inputs,
+        init_func=self._init_func.function,
+        next_func=self._next_func.function,
+        finalize_func=self._finalize_func.function,
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
+  def _element_structure(self):
+    return self._next_func.output_structure
 
-  @property
-  def output_types(self):
-    return self._output_types
+  def _transformation_name(self):
+    return "Dataset.from_generator()"
 
 
-class ZipDataset(Dataset):
+class ZipDataset(DatasetV2):
   """A `Dataset` that zips its inputs together."""
 
   def __init__(self, datasets):
     """See `Dataset.zip()` for details."""
     super(ZipDataset, self).__init__()
     for ds in nest.flatten(datasets):
-      if not isinstance(ds, Dataset):
+      if not isinstance(ds, DatasetV2):
         if isinstance(ds, list):
           message = ("The argument to `Dataset.zip()` must be a nested "
                      "structure of `Dataset` objects. Nested structures do not "
@@ -2036,6 +2196,10 @@ class ZipDataset(Dataset):
                      "structure of `Dataset` objects.")
         raise TypeError(message)
     self._datasets = datasets
+    self._structure = structure_lib.NestedStructure(
+        nest.pack_sequence_as(
+            self._datasets,
+            [ds._element_structure for ds in nest.flatten(self._datasets)]))  # pylint: disable=protected-access
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
@@ -2048,25 +2212,11 @@ class ZipDataset(Dataset):
     return nest.flatten(self._datasets)
 
   @property
-  def output_classes(self):
-    return nest.pack_sequence_as(
-        self._datasets,
-        [ds.output_classes for ds in nest.flatten(self._datasets)])
-
-  @property
-  def output_shapes(self):
-    return nest.pack_sequence_as(
-        self._datasets,
-        [ds.output_shapes for ds in nest.flatten(self._datasets)])
-
-  @property
-  def output_types(self):
-    return nest.pack_sequence_as(
-        self._datasets,
-        [ds.output_types for ds in nest.flatten(self._datasets)])
+  def _element_structure(self):
+    return self._structure
 
 
-class ConcatenateDataset(Dataset):
+class ConcatenateDataset(DatasetV2):
   """A `Dataset` that concatenates its input with given dataset."""
 
   def __init__(self, input_dataset, dataset_to_concatenate):
@@ -2074,14 +2224,30 @@ class ConcatenateDataset(Dataset):
     super(ConcatenateDataset, self).__init__()
     self._input_dataset = input_dataset
     self._dataset_to_concatenate = dataset_to_concatenate
-    if input_dataset.output_types != dataset_to_concatenate.output_types:
+
+    output_types = input_dataset.output_types
+    if output_types != dataset_to_concatenate.output_types:
       raise TypeError(
           "Two datasets to concatenate have different types %s and %s" %
-          (input_dataset.output_types, dataset_to_concatenate.output_types))
-    if input_dataset.output_classes != dataset_to_concatenate.output_classes:
+          (output_types, dataset_to_concatenate.output_types))
+
+    output_classes = input_dataset.output_classes
+    if output_classes != dataset_to_concatenate.output_classes:
       raise TypeError(
           "Two datasets to concatenate have different classes %s and %s" %
-          (input_dataset.output_classes, dataset_to_concatenate.output_classes))
+          (output_classes, dataset_to_concatenate.output_classes))
+
+    input_shapes = self._input_dataset.output_shapes
+    output_shapes = nest.pack_sequence_as(input_shapes, [
+        ts1.most_specific_compatible_shape(ts2)
+        for (ts1, ts2) in zip(
+            nest.flatten(input_shapes),
+            nest.flatten(self._dataset_to_concatenate.output_shapes))
+    ])
+
+    self._structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
+
     self._input_datasets = [input_dataset, dataset_to_concatenate]
 
   def _as_variant_tensor(self):
@@ -2096,24 +2262,11 @@ class ConcatenateDataset(Dataset):
     return [self._input_dataset, self._dataset_to_concatenate]
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return nest.pack_sequence_as(self._input_dataset.output_shapes, [
-        ts1.most_specific_compatible_shape(ts2)
-        for (ts1, ts2) in zip(
-            nest.flatten(self._input_dataset.output_shapes),
-            nest.flatten(self._dataset_to_concatenate.output_shapes))
-    ])
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
-class RepeatDataset(UnaryDataset):
+class RepeatDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that repeats its input several times."""
 
   def __init__(self, input_dataset, count):
@@ -2132,18 +2285,6 @@ class RepeatDataset(UnaryDataset):
         count=self._count,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 class RangeDataset(DatasetSource):
   """A `Dataset` of a step separated range of values."""
@@ -2181,19 +2322,11 @@ class RangeDataset(DatasetSource):
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.int64
+  def _element_structure(self):
+    return structure_lib.TensorStructure(dtypes.int64, [])
 
 
-class CacheDataset(UnaryDataset):
+class CacheDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that caches elements of its input."""
 
   def __init__(self, input_dataset, filename):
@@ -2209,20 +2342,8 @@ class CacheDataset(UnaryDataset):
         filename=self._filename,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
 
-
-class ShuffleDataset(UnaryDataset):
+class ShuffleDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that randomly shuffles the elements of its input."""
 
   def __init__(self,
@@ -2255,6 +2376,7 @@ class ShuffleDataset(UnaryDataset):
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
     self._seed, self._seed2 = random_seed.get_seed(seed)
+
     if reshuffle_each_iteration is None:
       self._reshuffle_each_iteration = True
     else:
@@ -2269,20 +2391,8 @@ class ShuffleDataset(UnaryDataset):
         reshuffle_each_iteration=self._reshuffle_each_iteration,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
 
-
-class TakeDataset(UnaryDataset):
+class TakeDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` containing the first `count` elements from its input."""
 
   def __init__(self, input_dataset, count):
@@ -2297,20 +2407,8 @@ class TakeDataset(UnaryDataset):
         count=self._count,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class SkipDataset(UnaryDataset):
+class SkipDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` skipping the first `count` elements from its input."""
 
   def __init__(self, input_dataset, count):
@@ -2325,18 +2423,6 @@ class SkipDataset(UnaryDataset):
         count=self._count,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 class BatchDataset(UnaryDataset):
   """A `Dataset` that batches contiguous elements from its input."""
@@ -2350,37 +2436,26 @@ class BatchDataset(UnaryDataset):
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
-  def _as_variant_tensor(self):
-    # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
-    if smart_cond.smart_constant_value(self._drop_remainder) is False:
-      return gen_dataset_ops.batch_dataset(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          batch_size=self._batch_size,
-          **flat_structure(self))
+    constant_drop_remainder = tensor_util.constant_value(self._drop_remainder)
+    # pylint: disable=protected-access
+    if constant_drop_remainder:
+      # NOTE(mrry): `constant_drop_remainder` may be `None` (unknown statically)
+      # or `False` (explicitly retaining the remainder).
+      self._structure = input_dataset._element_structure._batch(
+          tensor_util.constant_value(self._batch_size))
     else:
-      return gen_dataset_ops.batch_dataset_v2(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          batch_size=self._batch_size,
-          drop_remainder=self._drop_remainder,
-          **flat_structure(self))
+      self._structure = input_dataset._element_structure._batch(None)
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    input_shapes = self._input_dataset.output_shapes
-    return nest.pack_sequence_as(input_shapes, [
-        tensor_shape.vector(
-            tensor_util.constant_value(self._batch_size) if smart_cond.
-            smart_constant_value(self._drop_remainder) else None).concatenate(s)
-        for s in nest.flatten(self._input_dataset.output_shapes)
-    ])
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.batch_dataset_v2(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        batch_size=self._batch_size,
+        drop_remainder=self._drop_remainder,
+        **flat_structure(self))
 
   @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 def _is_padded_shape_compatible_with(padded_shape, input_component_shape):
@@ -2529,22 +2604,34 @@ class PaddedBatchDataset(UnaryDataset):
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
+    def _padded_shape_to_batch_shape(s):
+      return tensor_shape.vector(
+          tensor_util.constant_value(self._batch_size) if smart_cond.
+          smart_constant_value(self._drop_remainder) else None).concatenate(
+              tensor_util.constant_value_as_shape(s))
+
+    output_shapes = nest.map_structure(
+        _padded_shape_to_batch_shape, self._padded_shapes)
+    self._structure = structure_lib.convert_legacy_structure(
+        self._input_dataset.output_types, output_shapes,
+        self._input_dataset.output_classes)
+
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
     if smart_cond.smart_constant_value(self._drop_remainder) is False:
       return gen_dataset_ops.padded_batch_dataset(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          self._input_dataset._as_variant_tensor(),
           batch_size=self._batch_size,
           padded_shapes=[
               ops.convert_to_tensor(s, dtype=dtypes.int64)
               for s in nest.flatten(self._padded_shapes)
           ],
           padding_values=nest.flatten(self._padding_values),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+          output_shapes=self._structure._flat_shapes)
     else:
       return gen_dataset_ops.padded_batch_dataset_v2(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          self._input_dataset._as_variant_tensor(),
           batch_size=self._batch_size,
           padded_shapes=[
               ops.convert_to_tensor(s, dtype=dtypes.int64)
@@ -2552,27 +2639,11 @@ class PaddedBatchDataset(UnaryDataset):
           ],
           padding_values=nest.flatten(self._padding_values),
           drop_remainder=self._drop_remainder,
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-
-    def _padded_shape_to_batch_shape(s):
-      return tensor_shape.vector(
-          tensor_util.constant_value(self._batch_size) if smart_cond.
-          smart_constant_value(self._drop_remainder) else None).concatenate(
-              tensor_util.constant_value_as_shape(s))
-
-    return nest.map_structure(_padded_shape_to_batch_shape, self._padded_shapes)
+          output_shapes=self._structure._flat_shapes)
 
   @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 def _should_unpack_args(args):
@@ -2601,39 +2672,38 @@ def _warn_if_collections(transformation_name):
 class MapDataset(UnaryDataset):
   """A `Dataset` that maps a function over elements in its input."""
 
-  def __init__(self, input_dataset, map_func, use_inter_op_parallelism=True):
+  def __init__(self,
+               input_dataset,
+               map_func,
+               use_inter_op_parallelism=True,
+               preserve_cardinality=False):
     """See `Dataset.map()` for details."""
     super(MapDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
-
-    wrapped_func = StructuredFunctionWrapper(
-        map_func, "Dataset.map()", input_dataset)
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
-    self._map_func = wrapped_func.function
+    self._preserve_cardinality = preserve_cardinality
+    self._map_func = StructuredFunctionWrapper(
+        map_func, self._transformation_name(), dataset=input_dataset)
 
   def _as_variant_tensor(self):
     input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
     return gen_dataset_ops.map_dataset(
         input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
+        preserve_cardinality=self._preserve_cardinality,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def _functions(self):
+    return [self._map_func]
 
   @property
-  def output_shapes(self):
-    return self._output_shapes
+  def _element_structure(self):
+    return self._map_func.output_structure
 
-  @property
-  def output_types(self):
-    return self._output_types
+  def _transformation_name(self):
+    return "Dataset.map()"
 
 
 class ParallelMapDataset(MapDataset):
@@ -2643,25 +2713,26 @@ class ParallelMapDataset(MapDataset):
                input_dataset,
                map_func,
                num_parallel_calls,
-               use_inter_op_parallelism=True):
+               use_inter_op_parallelism=True,
+               preserve_cardinality=False):
     """See `Dataset.map()` for details."""
-    super(ParallelMapDataset, self).__init__(input_dataset, map_func,
-                                             use_inter_op_parallelism)
+    super(ParallelMapDataset, self).__init__(
+        input_dataset, map_func, use_inter_op_parallelism, preserve_cardinality)
 
     self._num_parallel_calls = ops.convert_to_tensor(
         num_parallel_calls, dtype=dtypes.int32, name="num_parallel_calls")
 
   def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
     # pylint: disable=protected-access
+    input_t = self._input_dataset._as_variant_tensor()
     return gen_dataset_ops.parallel_map_dataset(
         input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         num_parallel_calls=self._num_parallel_calls,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
+        preserve_cardinality=self._preserve_cardinality,
         **flat_structure(self))
-    # pylint: enable=protected-access
 
 
 class FlatMapDataset(UnaryDataset):
@@ -2672,34 +2743,25 @@ class FlatMapDataset(UnaryDataset):
     super(FlatMapDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
 
-    wrapped_func = StructuredFunctionWrapper(
-        map_func, self._transformation_name(), input_dataset,
-        experimental_nested_dataset_support=True)
-    if not isinstance(wrapped_func.output_classes, _NestedDatasetComponent):
+    self._map_func = StructuredFunctionWrapper(
+        map_func, self._transformation_name(), dataset=input_dataset)
+    if not isinstance(self._map_func.output_structure, DatasetStructure):
       raise TypeError("`map_func` must return a `Dataset` object.")
-    self._output_classes = wrapped_func.output_classes.output_classes
-    self._output_types = wrapped_func.output_types.output_types
-    self._output_shapes = wrapped_func.output_shapes.output_shapes
-    self._map_func = wrapped_func.function
+    self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
+
+  def _functions(self):
+    return [self._map_func]
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.flat_map_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
   def _transformation_name(self):
     return "Dataset.flat_map()"
@@ -2718,12 +2780,13 @@ class InterleaveDataset(FlatMapDataset):
         block_length, dtype=dtypes.int64, name="block_length")
 
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     return gen_dataset_ops.interleave_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,  # pylint: disable=protected-access
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
-        f=self._map_func,  # pylint: disable=protected-access
+        f=self._map_func.function,
         **flat_structure(self))
 
   def _transformation_name(self):
@@ -2747,20 +2810,21 @@ class ParallelInterleaveDataset(FlatMapDataset):
         num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls")
 
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     return gen_dataset_ops.parallel_interleave_dataset_v2(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,  # pylint: disable=protected-access
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
         self._num_parallel_calls,
-        f=self._map_func,  # pylint: disable=protected-access
+        f=self._map_func.function,
         **flat_structure(self))
 
   def _transformation_name(self):
     return "Dataset.interleave()"
 
 
-class FilterDataset(UnaryDataset):
+class FilterDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that filters its input according to a predicate function."""
 
   def __init__(self, input_dataset, predicate):
@@ -2768,34 +2832,27 @@ class FilterDataset(UnaryDataset):
     super(FilterDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     wrapped_func = StructuredFunctionWrapper(
-        predicate, "Dataset.filter()", input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.bool and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+        predicate, self._transformation_name(), dataset=input_dataset)
+    if not wrapped_func.output_structure.is_compatible_with(
+        structure_lib.TensorStructure(dtypes.bool, [])):
       raise ValueError("`predicate` must return a scalar boolean tensor.")
-    self._predicate = wrapped_func.function
+    self._predicate = wrapped_func
+
+  def _functions(self):
+    return [self._predicate]
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.filter_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        other_arguments=self._predicate.captured_inputs,
-        predicate=self._predicate,
+        other_arguments=self._predicate.function.captured_inputs,
+        predicate=self._predicate.function,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _transformation_name(self):
+    return "Dataset.filter()"
 
 
-class PrefetchDataset(UnaryDataset):
+class PrefetchDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that asynchronously prefetches its input."""
 
   def __init__(self, input_dataset, buffer_size):
@@ -2813,18 +2870,6 @@ class PrefetchDataset(UnaryDataset):
         buffer_size=self._buffer_size,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 class WindowDataset(UnaryDataset):
   """A dataset that creates window datasets from the input elements."""
@@ -2839,20 +2884,17 @@ class WindowDataset(UnaryDataset):
         stride, dtype=dtypes.int64, name="stride")
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
-    self._output_classes = nest.pack_sequence_as(
+    nest_of_structures = nest.pack_sequence_as(
         input_dataset.output_classes,
         [
-            _NestedDatasetComponent(  # pylint: disable=protected-access
-                output_classes=output_class,
-                output_shapes=output_shape,
-                output_types=output_type)
+            DatasetStructure(structure_lib.convert_legacy_structure(
+                output_type, output_shape, output_class))
             for output_class, output_shape, output_type in zip(
                 nest.flatten(input_dataset.output_classes),
                 nest.flatten(input_dataset.output_shapes),
                 nest.flatten(input_dataset.output_types))
         ])
-    self._output_shapes = self._output_classes
-    self._output_types = self._output_classes
+    self._structure = structure_lib.NestedStructure(nest_of_structures)
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.window_dataset(
@@ -2864,19 +2906,11 @@ class WindowDataset(UnaryDataset):
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
-class _OptionsDataset(UnaryDataset):
+class _OptionsDataset(UnaryUnchangedStructureDataset):
   """An identity `Dataset` that stores options."""
 
   def __init__(self, input_dataset, options):
@@ -2894,20 +2928,8 @@ class _OptionsDataset(UnaryDataset):
   def options(self):
     return self._options
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class _ModelDataset(UnaryDataset):
+class _ModelDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and models performance."""
 
   def __init__(self, input_dataset):
@@ -2920,20 +2942,8 @@ class _ModelDataset(UnaryDataset):
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class _OptimizeDataset(UnaryDataset):
+class _OptimizeDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and applies optimizations."""
 
   def __init__(self, input_dataset, optimizations):
@@ -2951,14 +2961,55 @@ class _OptimizeDataset(UnaryDataset):
         self._optimizations,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
+class _SetStatsAggregatorDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` that acts as an identity, and sets a stats aggregator."""
 
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def __init__(self, input_dataset, aggregator, prefix, counter_prefix):
+    super(_SetStatsAggregatorDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._stats_aggregator = aggregator
+    self._prefix = prefix
+    self._counter_prefix = counter_prefix
+
+  def _as_variant_tensor(self):
+    return ged_ops.experimental_set_stats_aggregator_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._stats_aggregator._resource,  # pylint: disable=protected-access
+        self._prefix,
+        self._counter_prefix,
+        **flat_structure(self))
+
+
+class _MaxIntraOpParallelismDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` that acts as an identity, overriding intra-op parallelism."""
+
+  def __init__(self, input_dataset, max_intra_op_parallelism):
+    super(_MaxIntraOpParallelismDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._max_intra_op_parallelism = ops.convert_to_tensor(
+        max_intra_op_parallelism,
+        dtype=dtypes.int64,
+        name="max_intra_op_parallelism")
+
+  def _as_variant_tensor(self):
+    return ged_ops.experimental_max_intra_op_parallelism_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._max_intra_op_parallelism,
+        **flat_structure(self))
+
+
+class _PrivateThreadPoolDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` that acts as an identity, setting a private threadpool."""
+
+  def __init__(self, input_dataset, num_threads):
+    super(_PrivateThreadPoolDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._num_threads = ops.convert_to_tensor(
+        num_threads, dtype=dtypes.int64, name="num_threads")
+
+  def _as_variant_tensor(self):
+    return ged_ops.experimental_private_thread_pool_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._num_threads,
+        **flat_structure(self))
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index cae00cdbfc2fa141ad0b008e3b1c82527ff52abe..d0e91b01f9138470cd2a06a8b353149b74af2497 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -23,8 +23,7 @@ import warnings
 from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import optional_ops
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.data.util import structure
+from tensorflow.python.data.util import structure as structure_lib
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -68,7 +67,7 @@ def _device_stack_is_empty():
   return not bool(device_stack)
 
 
-@tf_export("data.Iterator")
+@tf_export(v1=["data.Iterator"])
 class Iterator(checkpointable.CheckpointableBase):
   """Represents the state of iterating through a `Dataset`."""
 
@@ -94,9 +93,15 @@ class Iterator(checkpointable.CheckpointableBase):
     """
     self._iterator_resource = iterator_resource
     self._initializer = initializer
-    self._output_classes = output_classes
-    self._output_types = output_types
-    self._output_shapes = output_shapes
+
+    if (output_types is None or output_shapes is None
+        or output_classes is None):
+      raise ValueError("If `structure` is not specified, all of "
+                       "`output_types`, `output_shapes`, and `output_classes`"
+                       " must be specified.")
+    self._structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
+
     self._string_handle = gen_dataset_ops.iterator_to_string_handle(
         self._iterator_resource)
     self._get_next_call_count = 0
@@ -182,34 +187,32 @@ class Iterator(checkpointable.CheckpointableBase):
     if output_classes is None:
       output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
+    output_structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
     if shared_name is None:
       shared_name = ""
+    # pylint: disable=protected-access
     if compat.forward_compatible(2018, 8, 3):
       if _device_stack_is_empty():
         with ops.device("/cpu:0"):
           iterator_resource = gen_dataset_ops.iterator_v2(
               container="",
               shared_name=shared_name,
-              output_types=nest.flatten(
-                  sparse.as_dense_types(output_types, output_classes)),
-              output_shapes=nest.flatten(
-                  sparse.as_dense_shapes(output_shapes, output_classes)))
+              output_types=output_structure._flat_types,
+              output_shapes=output_structure._flat_shapes)
       else:
         iterator_resource = gen_dataset_ops.iterator_v2(
             container="",
             shared_name=shared_name,
-            output_types=nest.flatten(
-                sparse.as_dense_types(output_types, output_classes)),
-            output_shapes=nest.flatten(
-                sparse.as_dense_shapes(output_shapes, output_classes)))
+            output_types=output_structure._flat_types,
+            output_shapes=output_structure._flat_shapes)
     else:
       iterator_resource = gen_dataset_ops.iterator(
           container="",
           shared_name=shared_name,
-          output_types=nest.flatten(
-              sparse.as_dense_types(output_types, output_classes)),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(output_shapes, output_classes)))
+          output_types=output_structure._flat_types,
+          output_shapes=output_structure._flat_shapes)
+    # pylint: enable=protected-access
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -272,30 +275,28 @@ class Iterator(checkpointable.CheckpointableBase):
     if output_classes is None:
       output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
+    output_structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
     string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
+    # pylint: disable=protected-access
     if compat.forward_compatible(2018, 8, 3):
       if _device_stack_is_empty():
         with ops.device("/cpu:0"):
           iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
               string_handle,
-              output_types=nest.flatten(
-                  sparse.as_dense_types(output_types, output_classes)),
-              output_shapes=nest.flatten(
-                  sparse.as_dense_shapes(output_shapes, output_classes)))
+              output_types=output_structure._flat_types,
+              output_shapes=output_structure._flat_shapes)
       else:
         iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
             string_handle,
-            output_types=nest.flatten(
-                sparse.as_dense_types(output_types, output_classes)),
-            output_shapes=nest.flatten(
-                sparse.as_dense_shapes(output_shapes, output_classes)))
+            output_types=output_structure._flat_types,
+            output_shapes=output_structure._flat_shapes)
     else:
       iterator_resource = gen_dataset_ops.iterator_from_string_handle(
           string_handle,
-          output_types=nest.flatten(
-              sparse.as_dense_types(output_types, output_classes)),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(output_shapes, output_classes)))
+          output_types=output_structure._flat_types,
+          output_shapes=output_structure._flat_shapes)
+    # pylint: enable=protected-access
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -332,28 +333,28 @@ class Iterator(checkpointable.CheckpointableBase):
         element structure.
     """
     with ops.name_scope(name, "make_initializer") as name:
-      nest.assert_same_structure(self._output_types, dataset.output_types)
-      nest.assert_same_structure(self._output_shapes, dataset.output_shapes)
+      nest.assert_same_structure(self.output_types, dataset.output_types)
+      nest.assert_same_structure(self.output_shapes, dataset.output_shapes)
       for iterator_class, dataset_class in zip(
-          nest.flatten(self._output_classes),
+          nest.flatten(self.output_classes),
           nest.flatten(dataset.output_classes)):
         if iterator_class is not dataset_class:
           raise TypeError(
               "Expected output classes %r but got dataset with output class %r."
-              % (self._output_classes, dataset.output_classes))
+              % (self.output_classes, dataset.output_classes))
       for iterator_dtype, dataset_dtype in zip(
-          nest.flatten(self._output_types), nest.flatten(dataset.output_types)):
+          nest.flatten(self.output_types), nest.flatten(dataset.output_types)):
         if iterator_dtype != dataset_dtype:
           raise TypeError(
               "Expected output types %r but got dataset with output types %r." %
-              (self._output_types, dataset.output_types))
+              (self.output_types, dataset.output_types))
       for iterator_shape, dataset_shape in zip(
-          nest.flatten(self._output_shapes), nest.flatten(
+          nest.flatten(self.output_shapes), nest.flatten(
               dataset.output_shapes)):
         if not iterator_shape.is_compatible_with(dataset_shape):
           raise TypeError("Expected output shapes compatible with %r but got "
                           "dataset with output shapes %r." %
-                          (self._output_shapes, dataset.output_shapes))
+                          (self.output_shapes, dataset.output_shapes))
     with ops.colocate_with(self._iterator_resource):
       return gen_dataset_ops.make_iterator(
           dataset._as_variant_tensor(), self._iterator_resource, name=name)  # pylint: disable=protected-access
@@ -406,20 +407,12 @@ class Iterator(checkpointable.CheckpointableBase):
     if self._get_next_call_count > GET_NEXT_CALL_WARNING_THRESHOLD:
       warnings.warn(GET_NEXT_CALL_WARNING_MESSAGE)
 
-    return sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(self._output_types,
-                              gen_dataset_ops.iterator_get_next(
-                                  self._iterator_resource,
-                                  output_types=nest.flatten(
-                                      sparse.as_dense_types(
-                                          self._output_types,
-                                          self._output_classes)),
-                                  output_shapes=nest.flatten(
-                                      sparse.as_dense_shapes(
-                                          self._output_shapes,
-                                          self._output_classes)),
-                                  name=name)), self._output_types,
-        self._output_shapes, self._output_classes)
+    # pylint: disable=protected-access
+    flat_ret = gen_dataset_ops.iterator_get_next(
+        self._iterator_resource,
+        output_types=self._structure._flat_types,
+        output_shapes=self._structure._flat_shapes, name=name)
+    return self._structure._from_tensor_list(flat_ret)
 
   def string_handle(self, name=None):
     """Returns a string-valued `tf.Tensor` that represents this iterator.
@@ -446,7 +439,7 @@ class Iterator(checkpointable.CheckpointableBase):
       A nested structure of Python `type` objects corresponding to each
       component of an element of this dataset.
     """
-    return self._output_classes
+    return self._structure._to_legacy_output_classes()  # pylint: disable=protected-access
 
   @property
   def output_shapes(self):
@@ -456,7 +449,7 @@ class Iterator(checkpointable.CheckpointableBase):
       A nested structure of `tf.TensorShape` objects corresponding to each
       component of an element of this dataset.
     """
-    return self._output_shapes
+    return self._structure._to_legacy_output_shapes()  # pylint: disable=protected-access
 
   @property
   def output_types(self):
@@ -466,7 +459,17 @@ class Iterator(checkpointable.CheckpointableBase):
       A nested structure of `tf.DType` objects corresponding to each component
       of an element of this dataset.
     """
-    return self._output_types
+    return self._structure._to_legacy_output_types()  # pylint: disable=protected-access
+
+  @property
+  def _element_structure(self):
+    """The structure of an element of this iterator.
+
+    Returns:
+      A `Structure` object representing the structure of the components of this
+        optional.
+    """
+    return self._structure
 
   def _gather_saveables_for_checkpoint(self):
 
@@ -519,14 +522,13 @@ class EagerIterator(checkpointable.CheckpointableBase):
           format(type(self)))
     self._device = context.context().device_name
     with ops.device("/cpu:0"):
-      ds_variant = dataset._as_variant_tensor()  # pylint: disable=protected-access
-      self._output_classes = dataset.output_classes
-      self._output_types = dataset.output_types
-      self._output_shapes = dataset.output_shapes
-      self._flat_output_types = nest.flatten(
-          sparse.as_dense_types(self._output_types, self._output_classes))
-      self._flat_output_shapes = nest.flatten(
-          sparse.as_dense_shapes(self._output_shapes, self._output_classes))
+      # pylint: disable=protected-access
+      dataset = dataset._apply_options()
+      ds_variant = dataset._as_variant_tensor()
+      self._structure = structure_lib.convert_legacy_structure(
+          dataset.output_types, dataset.output_shapes, dataset.output_classes)
+      self._flat_output_types = self._structure._flat_types
+      self._flat_output_shapes = self._structure._flat_shapes
       with ops.colocate_with(ds_variant):
         self._resource = gen_dataset_ops.anonymous_iterator(
             output_types=self._flat_output_types,
@@ -535,6 +537,7 @@ class EagerIterator(checkpointable.CheckpointableBase):
         # Delete the resource when this object is deleted
         self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
             handle=self._resource, handle_device=self._device)
+      # pylint: enable=protected-access
 
   def __iter__(self):
     return self
@@ -563,9 +566,7 @@ class EagerIterator(checkpointable.CheckpointableBase):
             output_types=self._flat_output_types,
             output_shapes=self._flat_output_shapes)
 
-      return sparse.deserialize_sparse_tensors(
-          nest.pack_sequence_as(self._output_types, ret), self._output_types,
-          self._output_shapes, self._output_classes)
+      return self._structure._from_compatible_tensor_list(ret)  # pylint: disable=protected-access
 
   def next(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
@@ -585,7 +586,7 @@ class EagerIterator(checkpointable.CheckpointableBase):
       A nested structure of Python `type` objects corresponding to each
       component of an element of this dataset.
     """
-    return self._output_classes
+    return self._structure._to_legacy_output_classes()  # pylint: disable=protected-access
 
   @property
   def output_shapes(self):
@@ -595,7 +596,7 @@ class EagerIterator(checkpointable.CheckpointableBase):
       A nested structure of `tf.TensorShape` objects corresponding to each
       component of an element of this dataset.
     """
-    return self._output_shapes
+    return self._structure._to_legacy_output_shapes()  # pylint: disable=protected-access
 
   @property
   def output_types(self):
@@ -605,7 +606,17 @@ class EagerIterator(checkpointable.CheckpointableBase):
       A nested structure of `tf.DType` objects corresponding to each component
       of an element of this dataset.
     """
-    return self._output_types
+    return self._structure._to_legacy_output_types()  # pylint: disable=protected-access
+
+  @property
+  def _element_structure(self):
+    """The structure of an element of this iterator.
+
+    Returns:
+      A `Structure` object representing the structure of the components of this
+        optional.
+    """
+    return self._structure
 
   def get_next(self, name=None):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
@@ -640,7 +651,6 @@ class _IteratorSaveable(BaseSaverBuilder.SaveableObject):
     specs = [
         BaseSaverBuilder.SaveSpec(serialized_iterator, "", name + "_STATE")
     ]
-    # pylint: disable=protected-access
     super(_IteratorSaveable, self).__init__(iterator_resource, specs, name)
 
   def restore(self, restored_tensors, restored_shapes):
@@ -665,12 +675,6 @@ def get_next_as_optional(iterator):
   return optional_ops._OptionalImpl(
       gen_dataset_ops.iterator_get_next_as_optional(
           iterator._iterator_resource,
-          output_types=nest.flatten(
-              sparse.as_dense_types(iterator.output_types,
-                                    iterator.output_classes)),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(iterator.output_shapes,
-                                     iterator.output_classes))),
-      structure.Structure._from_legacy_structure(iterator.output_types,
-                                                 iterator.output_shapes,
-                                                 iterator.output_classes))
+          output_types=iterator._element_structure._flat_types,
+          output_shapes=iterator._element_structure._flat_shapes),
+      iterator._element_structure)
diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py
index b7d3aac206585f9d3fa74226df6b62d822b7d3d5..7586012574d39d7409e28f0d830a5fdadb25b61c 100644
--- a/tensorflow/python/data/ops/multi_device_iterator_ops.py
+++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py
@@ -17,13 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -34,75 +35,78 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
   """A `dummy` generator dataset."""
 
   def __init__(self, shard_num, multi_device_iterator_resource, incarnation_id,
-               source_device, target_device, output_shapes, output_types,
-               output_classes):
+               source_device, target_device, element_structure):
     self._target_device = target_device
-    self._output_types = output_types
-    self._output_shapes = output_shapes
-    self._output_classes = output_classes
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._output_shapes, self._output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._output_types, self._output_classes))
+    self._structure = element_structure
 
     multi_device_iterator_string_handle = (
         gen_dataset_ops.multi_device_iterator_to_string_handle(
             multi_device_iterator_resource))
 
-    @function.Defun()
+    @function.defun()
     def _init_func():
       return multi_device_iterator_string_handle
 
-    @function.Defun()
+    init_func_concrete = _init_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
+    @function.defun()
     def _remote_init_func():
       return functional_ops.remote_call(
           target=source_device,
-          args=_init_func.captured_inputs,
+          args=init_func_concrete.captured_inputs,
           Tout=[dtypes.string],
-          f=_init_func)
+          f=init_func_concrete)
 
-    self._init_func = _remote_init_func
-    self._init_captured_args = _remote_init_func.captured_inputs
+    self._init_func = _remote_init_func._get_concrete_function_internal()  # pylint: disable=protected-access
+    self._init_captured_args = self._init_func.captured_inputs
 
-    @function.Defun(dtypes.string)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _next_func(string_handle):
+      # pylint: disable=protected-access
       multi_device_iterator = (
           gen_dataset_ops.multi_device_iterator_from_string_handle(
               string_handle=string_handle,
-              output_types=self._flat_output_types,
-              output_shapes=self._flat_output_shapes))
+              output_types=self._structure._flat_types,
+              output_shapes=self._structure._flat_shapes))
       return gen_dataset_ops.multi_device_iterator_get_next_from_shard(
           multi_device_iterator=multi_device_iterator,
           shard_num=shard_num,
           incarnation_id=incarnation_id,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          output_types=self._structure._flat_types,
+          output_shapes=self._structure._flat_shapes)
+
+    next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
-    @function.Defun(dtypes.string)
+    @function.defun_with_attributes(
+        input_signature=[tensor_spec.TensorSpec([], dtypes.string)],
+        attributes={"experimental_ints_on_device": True})
     def _remote_next_func(string_handle):
       return functional_ops.remote_call(
           target=source_device,
-          args=[string_handle] + _next_func.captured_inputs,
-          Tout=self._flat_output_types,
-          f=_next_func)
+          args=[string_handle] + next_func_concrete.captured_inputs,
+          Tout=self._structure._flat_types,  # pylint: disable=protected-access
+          f=next_func_concrete)
 
-    self._next_func = _remote_next_func
-    self._next_captured_args = _remote_next_func.captured_inputs
+    self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
+    self._next_captured_args = self._next_func.captured_inputs
 
-    @function.Defun(dtypes.string)
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _finalize_func(unused_string_handle):
       return array_ops.constant(0, dtypes.int64)
 
-    @function.Defun(dtypes.string)
+    finalize_func_concrete = _finalize_func._get_concrete_function_internal()  # pylint: disable=protected-access
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _remote_finalize_func(string_handle):
       return functional_ops.remote_call(
           target=source_device,
-          args=[string_handle] + _finalize_func.captured_inputs,
+          args=[string_handle] + finalize_func_concrete.captured_inputs,
           Tout=[dtypes.int64],
-          f=_finalize_func)
+          f=finalize_func_concrete)
 
-    self._finalize_func = _remote_finalize_func
-    self._finalize_captured_args = _remote_finalize_func.captured_inputs
+    self._finalize_func = _remote_finalize_func._get_concrete_function_internal(  # pylint: disable=protected-access
+    )
+    self._finalize_captured_args = self._finalize_func.captured_inputs
 
   def _as_variant_tensor(self):
     with ops.device(self._target_device):
@@ -113,24 +117,15 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
           init_func=self._init_func,
           next_func=self._next_func,
           finalize_func=self._finalize_func,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self))
 
   def _inputs(self):
     # TODO(b/116506223): Determine which datasets should be used as inputs here.
     return []
 
   @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
 
 class MultiDeviceIterator(object):
@@ -165,18 +160,11 @@ class MultiDeviceIterator(object):
       # TODO(rohanj): Fix this. Tracking bug: b/116467184
       raise RuntimeError("MultiDeviceIterator is not currently supported in "
                          "Eager mode.")
-    self._dataset = dataset
+    self._dataset = dataset._apply_options()  # pylint: disable=protected-access
     self._devices = devices
     self._source_device = source_device
     self._source_device_tensor = ops.convert_to_tensor(source_device)
 
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._dataset.output_shapes,
-                               self._dataset.output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._dataset.output_types,
-                              self._dataset.output_classes))
-
     # Create the MultiDeviceIterator.
     with ops.device(self._source_device):
       self._multi_device_iterator_resource = (
@@ -184,8 +172,7 @@ class MultiDeviceIterator(object):
               devices=self._devices,
               shared_name="",
               container="",
-              output_types=self._flat_output_types,
-              output_shapes=self._flat_output_shapes))
+              **dataset_ops.flat_structure(dataset)))
 
       # The incarnation ID is used to ensure consistency between the per-device
       # iterators and the multi-device iterator.
@@ -200,17 +187,22 @@ class MultiDeviceIterator(object):
     # into the device side from its input. It might be useful in rewriting.
     # Create the per device iterators.
     self._device_iterators = []
-    i = 0
-    for device in self._devices:
+    for i, device in enumerate(self._devices):
       ds = _PerDeviceGenerator(
           i, self._multi_device_iterator_resource, self._incarnation_id,
-          self._source_device_tensor, device, self._dataset.output_shapes,
-          self._dataset.output_types, self._dataset.output_classes)
+          self._source_device_tensor, device, dataset._element_structure)  # pylint: disable=protected-access
       if prefetch_buffer_size > 0:
         ds = ds.prefetch(prefetch_buffer_size)
+      # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
+      # non-CPU devices.
+      options = dataset_ops.Options()
+      options.experimental_autotune = False
+      opt_options = optimization_options.OptimizationOptions()
+      opt_options.apply_default_optimizations = False
+      options.experimental_optimization = opt_options
+      ds = ds.with_options(options)
       with ops.device(device):
         self._device_iterators.append(ds.make_initializable_iterator())
-      i += 1
 
     device_iterator_initializers = [
         iterator.initializer for iterator in self._device_iterators
@@ -219,13 +211,31 @@ class MultiDeviceIterator(object):
 
   def get_next(self):
     result = []
-    i = 0
-    for device in self._devices:
+    for i, device in enumerate(self._devices):
       with ops.device(device):
         result.append(self._device_iterators[i].get_next())
-      i += 1
+    return result
+
+  def get_next_as_optional(self):
+    result = []
+    for i, device in enumerate(self._devices):
+      with ops.device(device):
+        result.append(iterator_ops.get_next_as_optional(
+            self._device_iterators[i]))
     return result
 
   @property
   def initializer(self):
     return self._initializer
+
+  @property
+  def output_types(self):
+    return self._dataset.output_types
+
+  @property
+  def output_shapes(self):
+    return self._dataset.output_shapes
+
+  @property
+  def output_classes(self):
+    return self._dataset.output_classes
diff --git a/tensorflow/python/data/ops/optional_ops.py b/tensorflow/python/data/ops/optional_ops.py
index aca989e03a0a07c51d40d32042e1ab5e0bae7eff..dcb743bee01964baf06543587661bb73b2225abb 100644
--- a/tensorflow/python/data/ops/optional_ops.py
+++ b/tensorflow/python/data/ops/optional_ops.py
@@ -19,13 +19,17 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@six.add_metaclass(abc.ABCMeta)
 class Optional(object):
   """Wraps a nested structure of tensors that may/may not be present at runtime.
 
@@ -142,6 +146,7 @@ class _OptionalImpl(Optional):
     return self._value_structure
 
 
+@tf_export("data.experimental.OptionalStructure")
 class OptionalStructure(structure.Structure):
   """Represents an optional potentially containing a structured value."""
 
@@ -164,11 +169,18 @@ class OptionalStructure(structure.Structure):
   def _to_tensor_list(self, value):
     return [value._variant_tensor]  # pylint: disable=protected-access
 
+  def _to_batched_tensor_list(self, value):
+    raise NotImplementedError(
+        "Unbatching for `tf.data.experimental.Optional` objects.")
+
   def _from_tensor_list(self, flat_value):
     if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
         not flat_value[0].shape.is_compatible_with(tensor_shape.scalar())):
       raise ValueError(
           "OptionalStructure corresponds to a single tf.variant scalar.")
+    return self._from_compatible_tensor_list(flat_value)
+
+  def _from_compatible_tensor_list(self, flat_value):
     # pylint: disable=protected-access
     return _OptionalImpl(flat_value[0], self._value_structure)
 
@@ -176,6 +188,23 @@ class OptionalStructure(structure.Structure):
   def from_value(value):
     return OptionalStructure(value.value_structure)
 
+  def _to_legacy_output_types(self):
+    return self
+
+  def _to_legacy_output_shapes(self):
+    return self
+
+  def _to_legacy_output_classes(self):
+    return self
+
+  def _batch(self, batch_size):
+    raise NotImplementedError(
+        "Batching for `tf.data.experimental.Optional` objects.")
+
+  def _unbatch(self):
+    raise NotImplementedError(
+        "Unbatching for `tf.data.experimental.Optional` objects.")
+
 
 # pylint: disable=protected-access
 structure.Structure._register_custom_converter(Optional,
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index d08da6704caf8b6c3bc94b49d0fce6ecb8157a75..0d6023dea28e3cefa13b32717e2aee87ac2c2bbf 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -17,13 +17,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -31,8 +34,8 @@ from tensorflow.python.util.tf_export import tf_export
 _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024  # 256 KB
 
 
-@tf_export("data.TextLineDataset")
-class TextLineDataset(dataset_ops.Dataset):
+@tf_export("data.TextLineDataset", v1=[])
+class TextLineDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` comprising lines from one or more text files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None):
@@ -46,7 +49,7 @@ class TextLineDataset(dataset_ops.Dataset):
         to buffer. A value of 0 results in the default buffering values chosen
         based on the compression type.
     """
-    super(TextLineDataset, self).__init__()
+    super(TextLineDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._compression_type = convert.optional_param_to_tensor(
@@ -61,23 +64,30 @@ class TextLineDataset(dataset_ops.Dataset):
     return gen_dataset_ops.text_line_dataset(
         self._filenames, self._compression_type, self._buffer_size)
 
-  def _inputs(self):
-    return []
-
   @property
-  def output_classes(self):
-    return ops.Tensor
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
+
+@tf_export(v1=["data.TextLineDataset"])
+class TextLineDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` comprising lines from one or more text files."""
+
+  def __init__(self, filenames, compression_type=None, buffer_size=None):
+    wrapped = TextLineDatasetV2(filenames, compression_type, buffer_size)
+    super(TextLineDatasetV1, self).__init__(wrapped)
+  __init__.__doc__ = TextLineDatasetV2.__init__.__doc__
 
   @property
-  def output_types(self):
-    return dtypes.string
+  def _filenames(self):
+    return self._dataset._filenames  # pylint: disable=protected-access
 
+  @_filenames.setter
+  def _filenames(self, value):
+    self._dataset._filenames = value  # pylint: disable=protected-access
 
-class _TFRecordDataset(dataset_ops.Dataset):
+
+class _TFRecordDataset(dataset_ops.DatasetSource):
   """A `Dataset` comprising records from one or more TFRecord files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None):
@@ -108,20 +118,9 @@ class _TFRecordDataset(dataset_ops.Dataset):
     return gen_dataset_ops.tf_record_dataset(
         self._filenames, self._compression_type, self._buffer_size)
 
-  def _inputs(self):
-    return []
-
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.TensorShape([])
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
 
 class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
@@ -145,15 +144,15 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    return gen_dataset_ops.parallel_interleave_dataset(
+    return ged_ops.experimental_parallel_interleave_dataset(
         self._input_dataset._as_variant_tensor(),
-        self._map_func.captured_inputs,
+        self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
         self._sloppy,
         self._buffer_output_elements,
         self._prefetch_input_elements,
-        f=self._map_func,
+        f=self._map_func.function,
         **dataset_ops.flat_structure(self))
     # pylint: enable=protected-access
 
@@ -161,13 +160,13 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
     return "tf.data.experimental.parallel_interleave()"
 
 
-@tf_export("data.TFRecordDataset")
-class TFRecordDataset(dataset_ops.Dataset):
+@tf_export("data.TFRecordDataset", v1=[])
+class TFRecordDatasetV2(dataset_ops.DatasetV2):
   """A `Dataset` comprising records from one or more TFRecord files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None,
                num_parallel_reads=None):
-    """Creates a `TFRecordDataset` to read for one or more TFRecord files.
+    """Creates a `TFRecordDataset` to read one or more TFRecord files.
 
     NOTE: The `num_parallel_reads` argument can be used to improve performance
     when reading from a remote filesystem.
@@ -187,8 +186,8 @@ class TFRecordDataset(dataset_ops.Dataset):
       TypeError: If any argument does not have the expected type.
       ValueError: If any argument does not have the expected shape.
     """
-    super(TFRecordDataset, self).__init__()
-    if isinstance(filenames, dataset_ops.Dataset):
+    super(TFRecordDatasetV2, self).__init__()
+    if isinstance(filenames, dataset_ops.DatasetV2):
       if filenames.output_types != dtypes.string:
         raise TypeError(
             "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
@@ -199,7 +198,7 @@ class TFRecordDataset(dataset_ops.Dataset):
     else:
       filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
       filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
-      filenames = dataset_ops.Dataset.from_tensor_slices(filenames)
+      filenames = dataset_ops.DatasetV2.from_tensor_slices(filenames)
 
     self._filenames = filenames
     self._compression_type = compression_type
@@ -222,10 +221,10 @@ class TFRecordDataset(dataset_ops.Dataset):
              compression_type=None,
              buffer_size=None,
              num_parallel_reads=None):
-    return TFRecordDataset(filenames or self._filenames,
-                           compression_type or self._compression_type,
-                           buffer_size or self._buffer_size,
-                           num_parallel_reads or self._num_parallel_reads)
+    return TFRecordDatasetV2(filenames or self._filenames,
+                             compression_type or self._compression_type,
+                             buffer_size or self._buffer_size,
+                             num_parallel_reads or self._num_parallel_reads)
 
   def _as_variant_tensor(self):
     return self._impl._as_variant_tensor()  # pylint: disable=protected-access
@@ -234,20 +233,44 @@ class TFRecordDataset(dataset_ops.Dataset):
     return self._impl._inputs()  # pylint: disable=protected-access
 
   @property
-  def output_classes(self):
-    return self._impl.output_classes
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
-  @property
-  def output_shapes(self):
-    return self._impl.output_shapes
+
+@tf_export(v1=["data.TFRecordDataset"])
+class TFRecordDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` comprising records from one or more TFRecord files."""
+
+  def __init__(self, filenames, compression_type=None, buffer_size=None,
+               num_parallel_reads=None):
+    wrapped = TFRecordDatasetV2(
+        filenames, compression_type, buffer_size, num_parallel_reads)
+    super(TFRecordDatasetV1, self).__init__(wrapped)
+  __init__.__doc__ = TFRecordDatasetV2.__init__.__doc__
+
+  def _clone(self,
+             filenames=None,
+             compression_type=None,
+             buffer_size=None,
+             num_parallel_reads=None):
+    # pylint: disable=protected-access
+    return TFRecordDatasetV1(
+        filenames or self._dataset._filenames,
+        compression_type or self._dataset._compression_type,
+        buffer_size or self._dataset._buffer_size,
+        num_parallel_reads or self._dataset._num_parallel_reads)
 
   @property
-  def output_types(self):
-    return self._impl.output_types
+  def _filenames(self):
+    return self._dataset._filenames  # pylint: disable=protected-access
 
+  @_filenames.setter
+  def _filenames(self, value):
+    self._dataset._filenames = value  # pylint: disable=protected-access
 
-@tf_export("data.FixedLengthRecordDataset")
-class FixedLengthRecordDataset(dataset_ops.Dataset):
+
+@tf_export("data.FixedLengthRecordDataset", v1=[])
+class FixedLengthRecordDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` of fixed-length records from one or more binary files."""
 
   def __init__(self,
@@ -255,7 +278,8 @@ class FixedLengthRecordDataset(dataset_ops.Dataset):
                record_bytes,
                header_bytes=None,
                footer_bytes=None,
-               buffer_size=None):
+               buffer_size=None,
+               compression_type=None):
     """Creates a `FixedLengthRecordDataset`.
 
     Args:
@@ -268,8 +292,10 @@ class FixedLengthRecordDataset(dataset_ops.Dataset):
         bytes to ignore at the end of a file.
       buffer_size: (Optional.) A `tf.int64` scalar representing the number of
         bytes to buffer when reading.
+      compression_type: (Optional.) A `tf.string` scalar evaluating to one of
+        `""` (no compression), `"ZLIB"`, or `"GZIP"`.
     """
-    super(FixedLengthRecordDataset, self).__init__()
+    super(FixedLengthRecordDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._record_bytes = ops.convert_to_tensor(
@@ -281,23 +307,56 @@ class FixedLengthRecordDataset(dataset_ops.Dataset):
         "footer_bytes", footer_bytes)
     self._buffer_size = convert.optional_param_to_tensor(
         "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES)
+    self._compression_type = convert.optional_param_to_tensor(
+        "compression_type",
+        compression_type,
+        argument_default="",
+        argument_dtype=dtypes.string)
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.fixed_length_record_dataset(
-        self._filenames, self._header_bytes, self._record_bytes,
-        self._footer_bytes, self._buffer_size)
-
-  def _inputs(self):
-    return []
+    if (self._compression_type is not None or
+        compat.forward_compatible(2018, 11, 30)):
+      return gen_dataset_ops.fixed_length_record_dataset_v2(
+          self._filenames, self._header_bytes, self._record_bytes,
+          self._footer_bytes, self._buffer_size, self._compression_type)
+    else:
+      return gen_dataset_ops.fixed_length_record_dataset(
+          self._filenames, self._header_bytes, self._record_bytes,
+          self._footer_bytes, self._buffer_size)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
+
+@tf_export(v1=["data.FixedLengthRecordDataset"])
+class FixedLengthRecordDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` of fixed-length records from one or more binary files."""
+
+  def __init__(self,
+               filenames,
+               record_bytes,
+               header_bytes=None,
+               footer_bytes=None,
+               buffer_size=None,
+               compression_type=None):
+    wrapped = FixedLengthRecordDatasetV2(
+        filenames, record_bytes, header_bytes, footer_bytes, buffer_size,
+        compression_type)
+    super(FixedLengthRecordDatasetV1, self).__init__(wrapped)
+  __init__.__doc__ = FixedLengthRecordDatasetV2.__init__.__doc__
 
   @property
-  def output_types(self):
-    return dtypes.string
+  def _filenames(self):
+    return self._dataset._filenames  # pylint: disable=protected-access
+
+  @_filenames.setter
+  def _filenames(self, value):
+    self._dataset._filenames = value  # pylint: disable=protected-access
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# these aliases in place.
+FixedLengthRecordDataset = FixedLengthRecordDatasetV1
+TFRecordDataset = TFRecordDatasetV1
+TextLineDataset = TextLineDatasetV1
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 39082ce3707bb11585694e553b840f94209b1029..04e80299e0d57965c21b88bd94250cb62e76d452 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -93,10 +93,28 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "@absl_py//absl/testing:parameterized",
     ],
 )
 
+py_library(
+    name = "options",
+    srcs = ["options.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_test(
+    name = "options_test",
+    size = "small",
+    srcs = ["options_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":options",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_library(
     name = "convert",
     srcs = ["convert.py"],
diff --git a/tensorflow/python/data/util/convert_test.py b/tensorflow/python/data/util/convert_test.py
index 89c3afb29691f4f24b7cb4208b16663b616515fa..78ca6e951390b8c248e55dcb7f1ce99f9fa1085f 100644
--- a/tensorflow/python/data/util/convert_test.py
+++ b/tensorflow/python/data/util/convert_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.data.util import convert
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -30,47 +31,53 @@ class ConvertTest(test.TestCase):
 
   def testInteger(self):
     resp = convert.optional_param_to_tensor("foo", 3)
-    with self.cached_session() as sess:
-      self.assertEqual(3, sess.run(resp))
+    self.assertEqual(3, self.evaluate(resp))
 
   def testIntegerDefault(self):
     resp = convert.optional_param_to_tensor("foo", None)
-    with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(resp))
+    self.assertEqual(0, self.evaluate(resp))
 
   def testStringDefault(self):
     resp = convert.optional_param_to_tensor("bar", None, "default",
                                             dtypes.string)
-    with self.cached_session() as sess:
-      self.assertEqual(compat.as_bytes("default"), sess.run(resp))
+    self.assertEqual(compat.as_bytes("default"), self.evaluate(resp))
 
   def testString(self):
     resp = convert.optional_param_to_tensor("bar", "value", "default",
                                             dtypes.string)
-    with self.cached_session() as sess:
-      self.assertEqual(compat.as_bytes("value"), sess.run(resp))
+    self.assertEqual(compat.as_bytes("value"), self.evaluate(resp))
 
   def testPartialShapeToTensorKnownDimension(self):
-    with self.cached_session() as sess:
-      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([1]))))
-      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor((1,))))
-      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor([1])))
-      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([1], dtype=dtypes.int64))))
-
+    self.assertAllEqual([1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([1]))))
+    self.assertAllEqual([1], self.evaluate(
+        convert.partial_shape_to_tensor((1,))))
+    self.assertAllEqual([1], self.evaluate(
+        convert.partial_shape_to_tensor([1])))
+    self.assertAllEqual([1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([1], dtype=dtypes.int64))))
+
+  @test_util.run_deprecated_v1
   def testPartialShapeToTensorUnknownDimension(self):
-    with self.cached_session() as sess:
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([None]))))
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          (None,))))
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          [None])))
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          [-1])))
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([-1], dtype=dtypes.int64))))
+    self.assertAllEqual([-1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([None]))))
+    self.assertAllEqual([-1],
+                        self.evaluate(convert.partial_shape_to_tensor((None,))))
+    self.assertAllEqual([-1],
+                        self.evaluate(convert.partial_shape_to_tensor([None])))
+    self.assertAllEqual([-1],
+                        self.evaluate(convert.partial_shape_to_tensor([-1])))
+    self.assertAllEqual([-1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([-1],
+                                                     dtype=dtypes.int64))))
 
     with self.assertRaisesRegexp(
         ValueError, r"The given shape .* must be a 1-D tensor of tf.int64 "
@@ -84,42 +91,63 @@ class ConvertTest(test.TestCase):
       convert.partial_shape_to_tensor(constant_op.constant([1., 1.]))
 
   def testPartialShapeToTensorMultipleDimensions(self):
-    with self.cached_session() as sess:
-      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([3, 6]))))
-      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
-          (3, 6))))
-      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
-          [3, 6])))
-      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([3, 6], dtype=dtypes.int64))))
-
-      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([3, None]))))
-      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
-          (3, None))))
-      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
-          [3, None])))
-      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([3, -1], dtype=dtypes.int64))))
-
-      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([None, None]))))
-      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
-          (None, None))))
-      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
-          [None, None])))
-      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([-1, -1], dtype=dtypes.int64))))
+    self.assertAllEqual([3, 6],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([3, 6]))))
+    self.assertAllEqual([3, 6],
+                        self.evaluate(convert.partial_shape_to_tensor((3, 6))))
+    self.assertAllEqual([3, 6],
+                        self.evaluate(convert.partial_shape_to_tensor([3, 6])))
+    self.assertAllEqual([3, 6],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([3, 6],
+                                                     dtype=dtypes.int64))))
+
+    self.assertAllEqual([3, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([3, None]))))
+    self.assertAllEqual([3, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor((3, None))))
+    self.assertAllEqual([3, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor([3, None])))
+    self.assertAllEqual([3, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([3, -1],
+                                                     dtype=dtypes.int64))))
+
+    self.assertAllEqual([-1, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([None, None]))))
+    self.assertAllEqual([-1, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor((None, None))))
+    self.assertAllEqual([-1, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor([None, None])))
+    self.assertAllEqual([-1, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([-1, -1],
+                                                     dtype=dtypes.int64))))
 
   def testPartialShapeToTensorScalar(self):
-    with self.cached_session() as sess:
-      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([]))))
-      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor(())))
-      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor([])))
-      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([], dtype=dtypes.int64))))
+    self.assertAllEqual([],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([]))))
+    self.assertAllEqual([], self.evaluate(convert.partial_shape_to_tensor(())))
+    self.assertAllEqual([], self.evaluate(convert.partial_shape_to_tensor([])))
+    self.assertAllEqual([],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([], dtype=dtypes.int64))))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/util/options.py b/tensorflow/python/data/util/options.py
new file mode 100644
index 0000000000000000000000000000000000000000..9badba8e5670c749b833da7f1e2094f4f3548098
--- /dev/null
+++ b/tensorflow/python/data/util/options.py
@@ -0,0 +1,131 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for tf.data options."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+def _internal_attr_name(name):
+  return "_" + name
+
+
+class OptionsBase(object):
+  """Base class for representing a set of tf.data options.
+
+  Attributes:
+    _options: Stores the option values.
+  """
+
+  def __init__(self):
+    self._options = {}
+
+  def __eq__(self, other):
+    if not isinstance(other, self.__class__):
+      return NotImplemented
+    for name in set(self._options) | set(other._options):  # pylint: disable=protected-access
+      if getattr(self, name) != getattr(other, name):
+        return False
+    return True
+
+  def __ne__(self, other):
+    if isinstance(other, self.__class__):
+      return not self.__eq__(other)
+    else:
+      return NotImplemented
+
+
+def create_option(name, ty, docstring, default=None):
+  """Creates a type-checked property.
+
+  Args:
+    name: the name to use
+    ty: the type to use
+    docstring: the docstring to use
+    default: the default value to use
+
+  Returns:
+    A type-checked property.
+  """
+
+  def get_fn(self):
+    return self._options.get(name, default)  # pylint: disable=protected-access
+
+  def set_fn(self, value):
+    if not isinstance(value, ty):
+      raise TypeError("Property \"%s\" must be of type %s, got: %r (type: %r)" %
+                      (name, ty, value, type(value)))
+    self._options[name] = value  # pylint: disable=protected-access
+
+  return property(get_fn, set_fn, None, docstring)
+
+
+def merge_options(*options_list):
+  """Merges the given options, returning the result as a new options object.
+
+  The input arguments are expected to have a matching type that derives from
+  `OptionsBase` (and thus each represent a set of options). The method outputs
+  an object of the same type created by merging the sets of options represented
+  by the input arguments.
+
+  The sets of options can be merged as long as there does not exist an option
+  with different non-default values.
+
+  If an option is an instance of `OptionsBase` itself, then this method is
+  applied recursively to the set of options represented by this option.
+
+  Args:
+    *options_list: options to merge
+
+  Raises:
+    TypeError: if the input arguments are incompatible or not derived from
+      `OptionsBase`
+    ValueError: if the given options cannot be merged
+
+  Returns:
+    A new options object which is the result of merging the given options.
+  """
+  if len(options_list) < 1:
+    raise ValueError("At least one options should be provided")
+  result_type = type(options_list[0])
+
+  for options in options_list:
+    if not isinstance(options, result_type):
+      raise TypeError("Incompatible options type: %r vs %r" % (type(options),
+                                                               result_type))
+
+  if not isinstance(options_list[0], OptionsBase):
+    raise TypeError("The inputs should inherit from `OptionsBase`")
+
+  default_options = result_type()
+  result = result_type()
+  for options in options_list:
+    # Iterate over all set options and merge the into the result.
+    for name in options._options:  # pylint: disable=protected-access
+      this = getattr(result, name)
+      that = getattr(options, name)
+      default = getattr(default_options, name)
+      if that == default:
+        continue
+      elif this == default:
+        setattr(result, name, that)
+      elif isinstance(this, OptionsBase):
+        setattr(result, name, merge_options(this, that))
+      elif this != that:
+        raise ValueError(
+            "Cannot merge incompatible values (%r and %r) of option: %s" %
+            (this, that, name))
+  return result
diff --git a/tensorflow/python/data/util/options_test.py b/tensorflow/python/data/util/options_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5169835a322923d7bf2d644717870d87bfab13f
--- /dev/null
+++ b/tensorflow/python/data/util/options_test.py
@@ -0,0 +1,96 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for dataset options utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.util import options
+from tensorflow.python.platform import test
+
+
+class _TestOptions(options.OptionsBase):
+  x = options.create_option(
+      name="x", ty=int, docstring="the answer to everything", default=42)
+  y = options.create_option(
+      name="y", ty=float, docstring="a tasty pie", default=3.14)
+
+
+class _NestedTestOptions(options.OptionsBase):
+  opts = options.create_option(
+      name="opts", ty=_TestOptions, docstring="nested options")
+
+
+class OptionsTest(test.TestCase):
+
+  def testDocumentation(self):
+    self.assertEqual(_TestOptions.x.__doc__, "the answer to everything")
+    self.assertEqual(_TestOptions.y.__doc__, "a tasty pie")
+
+  def testCreateOption(self):
+    opts = _TestOptions()
+    self.assertEqual(opts.x, 42)
+    self.assertEqual(opts.y, 3.14)
+    self.assertIsInstance(opts.x, int)
+    self.assertIsInstance(opts.y, float)
+    opts.x = 0
+    self.assertEqual(opts.x, 0)
+    with self.assertRaises(TypeError):
+      opts.x = 3.14
+    opts.y = 0.0
+    self.assertEqual(opts.y, 0.0)
+    with self.assertRaises(TypeError):
+      opts.y = 42
+
+  def testMergeOptions(self):
+    options1, options2 = _TestOptions(), _TestOptions()
+    with self.assertRaises(ValueError):
+      options.merge_options()
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.x, 42)
+    self.assertEqual(merged_options.y, 3.14)
+    options1.x = 0
+    options2.y = 0.0
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.x, 0)
+    self.assertEqual(merged_options.y, 0.0)
+
+  def testMergeNestedOptions(self):
+    options1, options2 = _NestedTestOptions(), _NestedTestOptions()
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.opts, None)
+    options1.opts = _TestOptions()
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.opts, _TestOptions())
+    options2.opts = _TestOptions()
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.opts, _TestOptions())
+    options1.opts.x = 0
+    options2.opts.y = 0.0
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.opts.x, 0)
+    self.assertEqual(merged_options.opts.y, 0.0)
+
+  def testMergeOptionsInvalid(self):
+    with self.assertRaises(TypeError):
+      options.merge_options(0)
+    options1, options2 = _TestOptions(), _NestedTestOptions()
+    with self.assertRaises(TypeError):
+      options.merge_options(options1, options2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py
index 5e6d22470978d97c5e73640e86d3f8b82cbc1b60..f2e22fefd31749faf52c5db0b967b936c1c76707 100644
--- a/tensorflow/python/data/util/sparse.py
+++ b/tensorflow/python/data/util/sparse.py
@@ -34,7 +34,7 @@ def any_sparse(classes):
   Returns:
     `True` if `classes` contains a sparse tensor type and `False` otherwise.
   """
-  return any([c is sparse_tensor.SparseTensor for c in nest.flatten(classes)])
+  return any(c is sparse_tensor.SparseTensor for c in nest.flatten(classes))
 
 
 def as_dense_shapes(shapes, classes):
diff --git a/tensorflow/python/data/util/sparse_test.py b/tensorflow/python/data/util/sparse_test.py
index 056b32480f3898726940f3c228c9b9eefa28b237..06acf55ab9d1154ec4972b799538948fa76bdb43 100644
--- a/tensorflow/python/data/util/sparse_test.py
+++ b/tensorflow/python/data/util/sparse_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -292,10 +293,11 @@ class SparseTest(test.TestCase):
       return
     self.assertTrue(isinstance(b, sparse_tensor.SparseTensor))
     with self.cached_session():
-      self.assertAllEqual(a.eval().indices, b.eval().indices)
-      self.assertAllEqual(a.eval().values, b.eval().values)
-      self.assertAllEqual(a.eval().dense_shape, b.eval().dense_shape)
+      self.assertAllEqual(a.eval().indices, self.evaluate(b).indices)
+      self.assertAllEqual(a.eval().values, self.evaluate(b).values)
+      self.assertAllEqual(a.eval().dense_shape, self.evaluate(b).dense_shape)
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserialize(self):
     test_cases = (
         (),
@@ -325,6 +327,7 @@ class SparseTest(test.TestCase):
       for a, e in zip(nest.flatten(actual), nest.flatten(expected)):
         self.assertSparseValuesEqual(a, e)
 
+  @test_util.run_deprecated_v1
   def testSerializeManyDeserialize(self):
     test_cases = (
         (),
diff --git a/tensorflow/python/data/util/structure.py b/tensorflow/python/data/util/structure.py
index a90ca258c0c7cd23b08f2038be873bca504634f0..9de0c4da0ebe0beec31aa652397f06d6dc665e63 100644
--- a/tensorflow/python/data/util/structure.py
+++ b/tensorflow/python/data/util/structure.py
@@ -19,6 +19,8 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -26,11 +28,14 @@ from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 _STRUCTURE_CONVERSION_FUNCTION_REGISTRY = {}
 
 
+@tf_export("data.experimental.Structure")
+@six.add_metaclass(abc.ABCMeta)
 class Structure(object):
   """Represents structural information, such as type and shape, about a value.
 
@@ -46,7 +51,6 @@ class Structure(object):
   and `tf.data.Dataset.output_classes`, and similar properties and arguments in
   the `tf.data.Iterator` and `Optional` classes.
   """
-  __metaclass__ = abc.ABCMeta
 
   @abc.abstractproperty
   def _flat_shapes(self):
@@ -109,21 +113,76 @@ class Structure(object):
     """
     raise NotImplementedError("Structure._to_tensor_list()")
 
+  @abc.abstractmethod
+  def _to_batched_tensor_list(self, value):
+    """Returns a flat list of rank >= 1 `tf.Tensor` representing `value`.
+
+    This method can be used, along with `self._flat_shapes` and
+    `self._flat_types` to represent structured values in lower level APIs
+    (such as plain TensorFlow operations) that do not understand structure,
+    *and* that require that the plain tensors have a rank of at least one
+    (e.g. for the purpose of slicing the tensors).
+
+    Requires: `self.is_compatible_with(Structure.from_value(value))`.
+
+    Args:
+      value: A value with compatible structure.
+
+    Returns:
+      A flat list of `tf.Tensor` representing `value`.
+    """
+    raise NotImplementedError("Structure._to_batched_tensor_list()")
+
   @abc.abstractmethod
   def _from_tensor_list(self, flat_value):
     """Builds a flat list of `tf.Tensor` into a value matching this structure.
 
-    Requires: The shapes and types of the tensors in `flat_value` must be
-    compatible with `self._flat_shapes` and `self._flat_types` respectively.
-
     Args:
       flat_value: A list of `tf.Tensor` with compatible flat structure.
 
     Returns:
       A structured object matching this structure.
+
+    Raises:
+      ValueError: If the shapes and types of the tensors in `flat_value` are not
+        compatible with `self._flat_shapes` and `self._flat_types` respectively.
     """
     raise NotImplementedError("Structure._from_tensor_list()")
 
+  def _from_compatible_tensor_list(self, flat_value):
+    """A version of `_from_tensor_list()` that may avoid performing checks.
+
+    NOTE: This method should be used to avoid checks for performance reasons,
+    when the validity of `flat_value` has been validated by other means.
+    The shapes and types of the tensors in `flat_value` must be compatible with
+    `self._flat_shapes` and `self._flat_types` respectively. The behavior is
+    undefined if this requirement is not met.
+
+    Args:
+      flat_value: A list of `tf.Tensor` with compatible flat structure.
+
+    Returns:
+      A structured object matching this structure.
+    """
+    return self._from_tensor_list(flat_value)
+
+  @abc.abstractmethod
+  def _batch(self, batch_size):
+    """Returns a structure representing a batch of objects with this structure.
+
+    Args:
+      batch_size: An `int` representing the number of elements in a batch,
+        or `None` if the batch size may vary.
+
+    Returns:
+      A `Structure` representing a batch of objects with this structure.
+    """
+    raise NotImplementedError("Structure._batch()")
+
+  @abc.abstractmethod
+  def _unbatch(self):
+    raise NotImplementedError("Structure._unbatch()")
+
   @staticmethod
   def from_value(value):
     """Returns a `Structure` that represents the given `value`.
@@ -157,54 +216,6 @@ class Structure(object):
         raise TypeError("Could not build a structure for %r" % value)
       return TensorStructure.from_value(tensor)
 
-  @staticmethod
-  def _from_legacy_structure(output_types, output_shapes, output_classes):
-    """Returns a `Structure` that represents the given legacy structure.
-
-    This method provides a way to convert from the existing `Dataset` and
-    `Iterator` structure-related properties to a `Structure` object.
-
-    TODO(b/110122868): Remove this method once `Structure` is used throughout
-    `tf.data`.
-
-    Args:
-      output_types: A nested structure of `tf.DType` objects corresponding to
-        each component of a structured value.
-      output_shapes: A nested structure of `tf.TensorShape` objects
-        corresponding to each component a structured value.
-      output_classes: A nested structure of Python `type` objects corresponding
-        to each component of a structured value.
-
-    Returns:
-      A `Structure`.
-
-    Raises:
-      TypeError: If a structure cannot be built the arguments, because one of
-        the component classes in `output_classes` is not supported.
-    """
-    flat_types = nest.flatten(output_types)
-    flat_shapes = nest.flatten(output_shapes)
-    flat_classes = nest.flatten(output_classes)
-    flat_ret = []
-    for flat_type, flat_shape, flat_class in zip(flat_types, flat_shapes,
-                                                 flat_classes):
-      if issubclass(flat_class, sparse_tensor_lib.SparseTensor):
-        flat_ret.append(SparseTensorStructure(flat_type, flat_shape))
-      elif issubclass(flat_class, ops.Tensor):
-        flat_ret.append(TensorStructure(flat_type, flat_shape))
-      else:
-        # NOTE(mrry): Since legacy structures produced by iterators only
-        # comprise Tensors, SparseTensors, and nests, we do not need to support
-        # all structure types here.
-        raise TypeError(
-            "Could not build a structure for output class %r" % flat_type)
-
-    ret = nest.pack_sequence_as(output_classes, flat_ret)
-    if isinstance(ret, Structure):
-      return ret
-    else:
-      return NestedStructure(ret)
-
   @staticmethod
   def _register_custom_converter(type_object, converter_fn):
     """Registers `converter_fn` for converting values of the given type.
@@ -217,15 +228,82 @@ class Structure(object):
     """
     _STRUCTURE_CONVERSION_FUNCTION_REGISTRY[type_object] = converter_fn
 
+  @abc.abstractmethod
+  def _to_legacy_output_types(self):
+    raise NotImplementedError("Structure._to_legacy_output_types()")
+
+  @abc.abstractmethod
+  def _to_legacy_output_shapes(self):
+    raise NotImplementedError("Structure._to_legacy_output_shapes()")
+
+  @abc.abstractmethod
+  def _to_legacy_output_classes(self):
+    raise NotImplementedError("Structure._to_legacy_output_classes()")
+
+
+def convert_legacy_structure(output_types, output_shapes, output_classes):
+  """Returns a `Structure` that represents the given legacy structure.
+
+  This method provides a way to convert from the existing `Dataset` and
+  `Iterator` structure-related properties to a `Structure` object. A "legacy"
+  structure is represented by the `tf.data.Dataset.output_types`,
+  `tf.data.Dataset.output_shapes`, and `tf.data.Dataset.output_classes`
+  properties.
+
+  TODO(b/110122868): Remove this function once `Structure` is used throughout
+  `tf.data`.
+
+  Args:
+    output_types: A nested structure of `tf.DType` objects corresponding to
+      each component of a structured value.
+    output_shapes: A nested structure of `tf.TensorShape` objects
+      corresponding to each component a structured value.
+    output_classes: A nested structure of Python `type` objects corresponding
+      to each component of a structured value.
+
+  Returns:
+    A `Structure`.
+
+  Raises:
+    TypeError: If a structure cannot be built from the arguments, because one of
+      the component classes in `output_classes` is not supported.
+  """
+  flat_types = nest.flatten(output_types)
+  flat_shapes = nest.flatten(output_shapes)
+  flat_classes = nest.flatten(output_classes)
+  flat_ret = []
+  for flat_type, flat_shape, flat_class in zip(flat_types, flat_shapes,
+                                               flat_classes):
+    if isinstance(flat_class, Structure):
+      flat_ret.append(flat_class)
+    elif issubclass(flat_class, sparse_tensor_lib.SparseTensor):
+      flat_ret.append(SparseTensorStructure(flat_type, flat_shape))
+    elif issubclass(flat_class, ops.Tensor):
+      flat_ret.append(TensorStructure(flat_type, flat_shape))
+    else:
+      # NOTE(mrry): Since legacy structures produced by iterators only
+      # comprise Tensors, SparseTensors, and nests, we do not need to
+      # support all structure types here.
+      raise TypeError(
+          "Could not build a structure for output class %r" % flat_type)
+
+  ret = nest.pack_sequence_as(output_classes, flat_ret)
+  if isinstance(ret, Structure):
+    return ret
+  else:
+    return NestedStructure(ret)
+
 
 # NOTE(mrry): The following classes make extensive use of non-public methods of
 # their base class, so we disable the protected-access lint warning once here.
 # pylint: disable=protected-access
+@tf_export("data.experimental.NestedStructure")
 class NestedStructure(Structure):
   """Represents a nested structure in which each leaf is a `Structure`."""
 
   def __init__(self, nested_structure):
     self._nested_structure = nested_structure
+    self._flat_nested_structure = nest.flatten(nested_structure)
     self._flat_shapes_list = []
     self._flat_types_list = []
     for s in nest.flatten(nested_structure):
@@ -268,23 +346,52 @@ class NestedStructure(Structure):
       raise ValueError("The value %r is not compatible with the nested "
                        "structure %r." % (value, self._nested_structure))
 
-    for sub_value, structure in zip(flat_value,
-                                    nest.flatten(self._nested_structure)):
+    for sub_value, structure in zip(flat_value, self._flat_nested_structure):
       if not structure.is_compatible_with(Structure.from_value(sub_value)):
         raise ValueError("Component value %r is not compatible with the nested "
                          "structure %r." % (sub_value, structure))
       ret.extend(structure._to_tensor_list(sub_value))
     return ret
 
+  def _to_batched_tensor_list(self, value):
+    ret = []
+
+    try:
+      flat_value = nest.flatten_up_to(self._nested_structure, value)
+    except (ValueError, TypeError):
+      raise ValueError("The value %r is not compatible with the nested "
+                       "structure %r." % (value, self._nested_structure))
+
+    for sub_value, structure in zip(flat_value, self._flat_nested_structure):
+      if not structure.is_compatible_with(Structure.from_value(sub_value)):
+        raise ValueError("Component value %r is not compatible with the nested "
+                         "structure %r." % (sub_value, structure))
+      ret.extend(structure._to_batched_tensor_list(sub_value))
+    return ret
+
   def _from_tensor_list(self, flat_value):
     if len(flat_value) != len(self._flat_types):
       raise ValueError("Expected %d flat values in NestedStructure but got %d."
                        % (len(self._flat_types), len(flat_value)))
 
     flat_ret = []
-    for sub_value, structure in zip(flat_value,
-                                    nest.flatten(self._nested_structure)):
-      flat_ret.append(structure._from_tensor_list([sub_value]))
+    i = 0
+    for structure in self._flat_nested_structure:
+      num_flat_values = len(structure._flat_types)
+      sub_value = flat_value[i:i + num_flat_values]
+      flat_ret.append(structure._from_tensor_list(sub_value))
+      i += num_flat_values
+
+    return nest.pack_sequence_as(self._nested_structure, flat_ret)
+
+  def _from_compatible_tensor_list(self, flat_value):
+    flat_ret = []
+    i = 0
+    for structure in self._flat_nested_structure:
+      num_flat_values = len(structure._flat_types)
+      sub_value = flat_value[i:i + num_flat_values]
+      flat_ret.append(structure._from_compatible_tensor_list(sub_value))
+      i += num_flat_values
 
     return nest.pack_sequence_as(self._nested_structure, flat_ret)
 
@@ -295,7 +402,28 @@ class NestedStructure(Structure):
     ]
     return NestedStructure(nest.pack_sequence_as(value, flat_nested_structure))
 
+  def _to_legacy_output_types(self):
+    return nest.map_structure(
+        lambda s: s._to_legacy_output_types(), self._nested_structure)
+
+  def _to_legacy_output_shapes(self):
+    return nest.map_structure(
+        lambda s: s._to_legacy_output_shapes(), self._nested_structure)
+
+  def _to_legacy_output_classes(self):
+    return nest.map_structure(
+        lambda s: s._to_legacy_output_classes(), self._nested_structure)
+
+  def _batch(self, batch_size):
+    return NestedStructure(nest.map_structure(
+        lambda s: s._batch(batch_size), self._nested_structure))
 
+  def _unbatch(self):
+    return NestedStructure(nest.map_structure(
+        lambda s: s._unbatch(), self._nested_structure))
+
+
+@tf_export("data.experimental.TensorStructure")
 class TensorStructure(Structure):
   """Represents structural information about a `tf.Tensor`."""
 
@@ -322,19 +450,54 @@ class TensorStructure(Structure):
                        "and shape %s." % (value, self._dtype, self._shape))
     return [value]
 
+  def _to_batched_tensor_list(self, value):
+    if self._shape.merge_with(value.shape).ndims == 0:
+      raise ValueError("Unbatching a tensor is only supported for rank >= 1")
+    return [value]
+
   def _from_tensor_list(self, flat_value):
     if len(flat_value) != 1:
       raise ValueError("TensorStructure corresponds to a single tf.Tensor.")
     if not self.is_compatible_with(Structure.from_value(flat_value[0])):
       raise ValueError("Cannot convert %r to a tensor with dtype %s and shape "
                        "%s." % (flat_value[0], self._dtype, self._shape))
+    return self._from_compatible_tensor_list(flat_value)
+
+  def _from_compatible_tensor_list(self, flat_value):
+    # TODO(b/112266545): It would be cleaner to create a new `ensure_shape()`
+    # op here and return that, instead of mutating the input's shape using
+    # `Tensor.set_shape()`. However, that would add extra ops on the arguments
+    # of each `tf.data` function, which could impact performance. When this
+    # bug is resolved, we should be able to add the `ensure_shape()` ops and
+    # optimize them away using contextual shape information.
+    flat_value[0].set_shape(self._shape)
     return flat_value[0]
 
   @staticmethod
   def from_value(value):
     return TensorStructure(value.dtype, value.shape)
 
+  def _to_legacy_output_types(self):
+    return self._dtype
+
+  def _to_legacy_output_shapes(self):
+    return self._shape
+
+  def _to_legacy_output_classes(self):
+    return ops.Tensor
 
+  def _batch(self, batch_size):
+    return TensorStructure(
+        self._dtype,
+        tensor_shape.TensorShape([batch_size]).concatenate(self._shape))
+
+  def _unbatch(self):
+    if self._shape.ndims == 0:
+      raise ValueError("Unbatching a tensor is only supported for rank >= 1")
+    return TensorStructure(self._dtype, self._shape[1:])
+
+
+@tf_export("data.experimental.SparseTensorStructure")
 class SparseTensorStructure(Structure):
   """Represents structural information about a `tf.SparseTensor`."""
 
@@ -344,7 +507,11 @@ class SparseTensorStructure(Structure):
 
   @property
   def _flat_shapes(self):
-    return [tensor_shape.vector(3)]
+    # NOTE(mrry): The default flat shape of a boxed `SparseTensor` is `(3,)`,
+    # but a `SparseTensorStructure` can also represent a batch of boxed
+    # `SparseTensor` objects with shape `(?, 3)` (and batches of batches, etc.),
+    # so the flat shape must be unknown.
+    return [tensor_shape.unknown_shape(None)]
 
   @property
   def _flat_types(self):
@@ -358,13 +525,26 @@ class SparseTensorStructure(Structure):
   def _to_tensor_list(self, value):
     return [sparse_ops.serialize_sparse(value, out_type=dtypes.variant)]
 
+  def _to_batched_tensor_list(self, value):
+    if self._dense_shape.merge_with(
+        tensor_util.constant_value_as_shape(value.dense_shape)).ndims == 0:
+      raise ValueError(
+          "Unbatching a sparse tensor is only supported for rank >= 1")
+    return [sparse_ops.serialize_many_sparse(value, out_type=dtypes.variant)]
+
   def _from_tensor_list(self, flat_value):
     if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
         not flat_value[0].shape.is_compatible_with(tensor_shape.vector(3))):
       raise ValueError("SparseTensorStructure corresponds to a single "
                        "tf.variant vector of length 3.")
-    return sparse_ops.deserialize_sparse(
+    return self._from_compatible_tensor_list(flat_value)
+
+  def _from_compatible_tensor_list(self, flat_value):
+    ret = sparse_ops.deserialize_sparse(
         flat_value[0], dtype=self._dtype, rank=self._dense_shape.ndims)
+    ret.indices.set_shape([None, self._dense_shape.ndims])
+    ret.dense_shape.set_shape([self._dense_shape.ndims])
+    return ret
 
   @staticmethod
   def from_value(value):
@@ -372,3 +552,22 @@ class SparseTensorStructure(Structure):
     return SparseTensorStructure(
         sparse_tensor.dtype,
         tensor_util.constant_value_as_shape(sparse_tensor.dense_shape))
+
+  def _to_legacy_output_types(self):
+    return self._dtype
+
+  def _to_legacy_output_shapes(self):
+    return self._dense_shape
+
+  def _to_legacy_output_classes(self):
+    return sparse_tensor_lib.SparseTensor
+
+  def _batch(self, batch_size):
+    return SparseTensorStructure(
+        self._dtype,
+        tensor_shape.TensorShape([batch_size]).concatenate(self._dense_shape))
+
+  def _unbatch(self):
+    if self._dense_shape.ndims == 0:
+      raise ValueError("Unbatching a tensor is only supported for rank >= 1")
+    return SparseTensorStructure(self._dtype, self._dense_shape[1:])
diff --git a/tensorflow/python/data/util/structure_test.py b/tensorflow/python/data/util/structure_test.py
index 2982763181b97e7badf4c14136e041219639c701..91dcfa6f6089bf052526e17ca8f0e646f7e86d71 100644
--- a/tensorflow/python/data/util/structure_test.py
+++ b/tensorflow/python/data/util/structure_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
@@ -28,58 +29,66 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-class StructureTest(test.TestCase, parameterized.TestCase):
-  # pylint disable=protected-access
+class StructureTest(test_base.DatasetTestBase, parameterized.TestCase):
 
+  # NOTE(mrry): The arguments must be lifted into lambdas because otherwise they
+  # will be executed before the (eager- or graph-mode) test environment has been
+  # set up.
+  # pylint: disable=g-long-lambda,protected-access
   @parameterized.parameters(
-      (constant_op.constant(37.0), structure.TensorStructure, [dtypes.float32],
-       [[]]), (sparse_tensor.SparseTensor(
-           indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
-               structure.SparseTensorStructure, [dtypes.variant], [[3]]),
-      ((constant_op.constant(37.0), constant_op.constant([1, 2, 3])),
-       structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]), ({
-           "a": constant_op.constant(37.0),
-           "b": constant_op.constant([1, 2, 3])
-       }, structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]),
-      ({
-          "a":
-              constant_op.constant(37.0),
+      (lambda: constant_op.constant(37.0), structure.TensorStructure,
+       [dtypes.float32], [[]]),
+      (lambda: sparse_tensor.SparseTensor(
+          indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+       structure.SparseTensorStructure, [dtypes.variant], [None]),
+      (lambda: (constant_op.constant(37.0), constant_op.constant([1, 2, 3])),
+       structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]),
+      (lambda: {
+          "a": constant_op.constant(37.0),
+          "b": constant_op.constant([1, 2, 3])
+      }, structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]),
+      (lambda: {
+          "a": constant_op.constant(37.0),
           "b": (sparse_tensor.SparseTensor(
               indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
                 sparse_tensor.SparseTensor(
                     indices=[[3, 4]], values=[-1], dense_shape=[4, 5]))
       }, structure.NestedStructure,
-       [dtypes.float32, dtypes.variant, dtypes.variant], [[], [3], [3]]))
-  def testFlatStructure(self, value, expected_structure, expected_types,
+       [dtypes.float32, dtypes.variant, dtypes.variant], [[], None, None]))
+  def testFlatStructure(self, value_fn, expected_structure, expected_types,
                         expected_shapes):
+    value = value_fn()
     s = structure.Structure.from_value(value)
     self.assertIsInstance(s, expected_structure)
     self.assertEqual(expected_types, s._flat_types)
-    self.assertEqual(expected_shapes, s._flat_shapes)
+    for expected, actual in zip(expected_shapes, s._flat_shapes):
+      self.assertTrue(actual.is_compatible_with(expected))
+      self.assertTrue(
+          tensor_shape.as_shape(expected).is_compatible_with(actual))
 
   @parameterized.parameters(
-      (constant_op.constant(37.0), [
+      (lambda: constant_op.constant(37.0), lambda: [
           constant_op.constant(38.0),
           array_ops.placeholder(dtypes.float32),
           variables.Variable(100.0), 42.0,
           np.array(42.0, dtype=np.float32)
-      ], [constant_op.constant([1.0, 2.0]),
-          constant_op.constant(37)]),
-      (sparse_tensor.SparseTensor(
+      ], lambda: [constant_op.constant([1.0, 2.0]), constant_op.constant(37)]),
+      (lambda: sparse_tensor.SparseTensor(
           indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
-       [
+       lambda: [
            sparse_tensor.SparseTensor(
                indices=[[1, 1], [3, 4]], values=[10, -1], dense_shape=[4, 5]),
            sparse_tensor.SparseTensorValue(
                indices=[[1, 1], [3, 4]], values=[10, -1], dense_shape=[4, 5]),
            array_ops.sparse_placeholder(dtype=dtypes.int32),
            array_ops.sparse_placeholder(dtype=dtypes.int32, shape=[None, None])
-       ], [
+       ], lambda: [
            constant_op.constant(37, shape=[4, 5]),
            sparse_tensor.SparseTensor(
                indices=[[3, 4]], values=[-1], dense_shape=[5, 6]),
@@ -88,13 +97,13 @@ class StructureTest(test.TestCase, parameterized.TestCase):
            sparse_tensor.SparseTensor(
                indices=[[3, 4]], values=[-1.0], dense_shape=[4, 5])
        ]),
-      ({
+      (lambda: {
           "a": constant_op.constant(37.0),
           "b": constant_op.constant([1, 2, 3])
-      }, [{
+      }, lambda: [{
           "a": constant_op.constant(15.0),
           "b": constant_op.constant([4, 5, 6])
-      }], [{
+      }], lambda: [{
           "a": constant_op.constant(15.0),
           "b": constant_op.constant([4, 5, 6, 7])
       }, {
@@ -108,8 +117,12 @@ class StructureTest(test.TestCase, parameterized.TestCase):
                   indices=[[0], [1], [2]], values=[4, 5, 6], dense_shape=[3])
       }, (constant_op.constant(15.0), constant_op.constant([4, 5, 6]))]),
   )
-  def testIsCompatibleWithStructure(self, original_value, compatible_values,
-                                    incompatible_values):
+  @test_util.run_deprecated_v1
+  def testIsCompatibleWithStructure(
+      self, original_value_fn, compatible_values_fn, incompatible_values_fn):
+    original_value = original_value_fn()
+    compatible_values = compatible_values_fn()
+    incompatible_values = incompatible_values_fn()
     s = structure.Structure.from_value(original_value)
     for compatible_value in compatible_values:
       self.assertTrue(
@@ -120,10 +133,6 @@ class StructureTest(test.TestCase, parameterized.TestCase):
           s.is_compatible_with(
               structure.Structure.from_value(incompatible_value)))
 
-  # NOTE(mrry): The arguments must be lifted into lambdas because otherwise they
-  # will be executed before the (eager- or graph-mode) test environment has been
-  # set up.
-  # pylint: disable=g-long-lambda
   @parameterized.parameters(
       (lambda: constant_op.constant(37.0),),
       (lambda: sparse_tensor.SparseTensor(
@@ -344,12 +353,141 @@ class StructureTest(test.TestCase, parameterized.TestCase):
            "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
                  structure.TensorStructure(dtypes.string, []))})),
   )
-  def testFromLegacyStructure(self, output_types, output_shapes, output_classes,
-                              expected_structure):
-    actual_structure = structure.Structure._from_legacy_structure(
+  def testConvertLegacyStructure(self, output_types, output_shapes,
+                                 output_classes, expected_structure):
+    actual_structure = structure.convert_legacy_structure(
         output_types, output_shapes, output_classes)
     self.assertTrue(expected_structure.is_compatible_with(actual_structure))
     self.assertTrue(actual_structure.is_compatible_with(expected_structure))
 
+  def testNestedNestedStructure(self):
+    # Although `Structure.from_value()` will not construct one, a nested
+    # structure containing nested `NestedStructure` objects can occur if a
+    # structure is constructed manually.
+    s = structure.NestedStructure(
+        (structure.TensorStructure(dtypes.int64, []),
+         structure.NestedStructure(
+             (structure.TensorStructure(dtypes.float32, []),
+              structure.TensorStructure(dtypes.string, [])))))
+
+    int64_t = constant_op.constant(37, dtype=dtypes.int64)
+    float32_t = constant_op.constant(42.0)
+    string_t = constant_op.constant("Foo")
+
+    nested_tensors = (int64_t, (float32_t, string_t))
+
+    tensor_list = s._to_tensor_list(nested_tensors)
+    for expected, actual in zip([int64_t, float32_t, string_t], tensor_list):
+      self.assertIs(expected, actual)
+
+    (actual_int64_t, (actual_float32_t, actual_string_t)) = s._from_tensor_list(
+        tensor_list)
+    self.assertIs(int64_t, actual_int64_t)
+    self.assertIs(float32_t, actual_float32_t)
+    self.assertIs(string_t, actual_string_t)
+
+    (actual_int64_t, (actual_float32_t, actual_string_t)) = (
+        s._from_compatible_tensor_list(tensor_list))
+    self.assertIs(int64_t, actual_int64_t)
+    self.assertIs(float32_t, actual_float32_t)
+    self.assertIs(string_t, actual_string_t)
+
+  @parameterized.named_parameters(
+      ("Tensor", structure.TensorStructure(dtypes.float32, []), 32,
+       structure.TensorStructure(dtypes.float32, [32])),
+      ("TensorUnknown", structure.TensorStructure(dtypes.float32, []), None,
+       structure.TensorStructure(dtypes.float32, [None])),
+      ("SparseTensor", structure.SparseTensorStructure(dtypes.float32, [None]),
+       32, structure.SparseTensorStructure(dtypes.float32, [32, None])),
+      ("SparseTensorUnknown",
+       structure.SparseTensorStructure(dtypes.float32, [4]), None,
+       structure.SparseTensorStructure(dtypes.float32, [None, 4])),
+      ("Nest", structure.NestedStructure({
+          "a": structure.TensorStructure(dtypes.float32, []),
+          "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
+                structure.TensorStructure(dtypes.string, []))}), 128,
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, [128]),
+           "b": (structure.SparseTensorStructure(dtypes.int32, [128, 2, 2]),
+                 structure.TensorStructure(dtypes.string, [128]))})),
+  )
+  def testBatch(self, element_structure, batch_size,
+                expected_batched_structure):
+    batched_structure = element_structure._batch(batch_size)
+    self.assertTrue(
+        batched_structure.is_compatible_with(expected_batched_structure))
+    self.assertTrue(
+        expected_batched_structure.is_compatible_with(batched_structure))
+
+  @parameterized.named_parameters(
+      ("Tensor", structure.TensorStructure(dtypes.float32, [32]),
+       structure.TensorStructure(dtypes.float32, [])),
+      ("TensorUnknown", structure.TensorStructure(dtypes.float32, [None]),
+       structure.TensorStructure(dtypes.float32, [])),
+      ("SparseTensor",
+       structure.SparseTensorStructure(dtypes.float32, [32, None]),
+       structure.SparseTensorStructure(dtypes.float32, [None])),
+      ("SparseTensorUnknown",
+       structure.SparseTensorStructure(dtypes.float32, [None, 4]),
+       structure.SparseTensorStructure(dtypes.float32, [4])),
+      ("Nest", structure.NestedStructure({
+          "a": structure.TensorStructure(dtypes.float32, [128]),
+          "b": (structure.SparseTensorStructure(dtypes.int32, [128, 2, 2]),
+                structure.TensorStructure(dtypes.string, [None]))}),
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, []),
+           "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
+                 structure.TensorStructure(dtypes.string, []))})),
+  )
+  def testUnbatch(self, element_structure, expected_unbatched_structure):
+    unbatched_structure = element_structure._unbatch()
+    self.assertTrue(
+        unbatched_structure.is_compatible_with(expected_unbatched_structure))
+    self.assertTrue(
+        expected_unbatched_structure.is_compatible_with(unbatched_structure))
+
+  # pylint: disable=g-long-lambda
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant([[1.0, 2.0], [3.0, 4.0]]),
+       lambda: constant_op.constant([1.0, 2.0])),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
+          indices=[[0, 0], [1, 1]], values=[13, 27], dense_shape=[2, 2]),
+       lambda: sparse_tensor.SparseTensor(
+           indices=[[0]], values=[13], dense_shape=[2])),
+      ("Nest", lambda: (
+          constant_op.constant([[1.0, 2.0], [3.0, 4.0]]),
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 1]], values=[13, 27], dense_shape=[2, 2])),
+       lambda: (constant_op.constant([1.0, 2.0]), sparse_tensor.SparseTensor(
+           indices=[[0]], values=[13], dense_shape=[2]))),
+  )
+  def testToBatchedTensorList(self, value_fn, element_0_fn):
+    batched_value = value_fn()
+    s = structure.Structure.from_value(batched_value)
+    batched_tensor_list = s._to_batched_tensor_list(batched_value)
+
+    # The batch dimension is 2 for all of the test cases.
+    # NOTE(mrry): `tf.shape()` does not currently work for the DT_VARIANT
+    # tensors in which we store sparse tensors.
+    for t in batched_tensor_list:
+      if t.dtype != dtypes.variant:
+        self.assertEqual(2, self.evaluate(array_ops.shape(t)[0]))
+
+    # Test that the 0th element from the unbatched tensor is equal to the
+    # expected value.
+    expected_element_0 = self.evaluate(element_0_fn())
+    unbatched_s = s._unbatch()
+    actual_element_0 = unbatched_s._from_tensor_list(
+        [t[0] for t in batched_tensor_list])
+
+    for expected, actual in zip(
+        nest.flatten(expected_element_0), nest.flatten(actual_element_0)):
+      if sparse_tensor.is_sparse(expected):
+        self.assertSparseValuesEqual(expected, actual)
+      else:
+        self.assertAllEqual(expected, actual)
+
+  # pylint: enable=g-long-lambda
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index e84482d2b25b3adfcb75922e99d4dc5004107dfc..c6abd476d9d274a3aab270a548f5b0ebd3b6d257 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -557,6 +557,7 @@ py_test(
         ":source_utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
+        "//tensorflow/python:cond_v2",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
@@ -566,6 +567,7 @@ py_test(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//tensorflow/python:while_v2",
         "//third_party/py/numpy",
     ],
 )
@@ -615,7 +617,7 @@ cuda_py_test(
 
 py_test(
     name = "framework_test",
-    size = "small",
+    size = "medium",
     srcs = ["wrappers/framework_test.py"],
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/python/debug/__init__.py b/tensorflow/python/debug/__init__.py
index 242215dccb95c31ab640579486bc2234dfc6b12d..ffbdff8c47b7208279966fdfcf022865c8a09309 100644
--- a/tensorflow/python/debug/__init__.py
+++ b/tensorflow/python/debug/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Public Python API of TensorFlow Debugger (tfdbg).
 
-See the [TFDBG](https://tensorflow.org/api_guides/python/tfdbg) guide.
+See the [TFDBG](https://www.tensorflow.org/guide/debugger) guide.
 
 @@add_debug_tensor_watch
 @@watch_graph
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index f197a9e4dcefdb528a3a843effa95f7311ca007a..586982dc4bf3511925f46268c537ed53d54ed700 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -573,6 +573,7 @@ def create_analyzer_cli(dump):
   return analyzer, registry
 
 
+@test_util.run_v1_only("b/120545219")
 class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
 
   @classmethod
@@ -1583,7 +1584,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
       x = variables.VariableV1([1, 3, 3, 7], name="x")
       _, idx = array_ops.unique(x, name="x_unique")
       idx_times_two = math_ops.multiply(idx, 2, name="idx_times_two")
-      sess.run(x.initializer)
+      self.evaluate(x.initializer)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_utils.watch_graph(
@@ -1668,6 +1669,7 @@ class AnalyzerCLIPrintLargeTensorTest(test_util.TensorFlowTestCase):
     self.assertNotIn("...,", out.lines[4])
 
 
+@test_util.run_v1_only("b/120545219")
 class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
 
   @classmethod
@@ -1995,6 +1997,7 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
                      out.font_attr_segs[0])
 
 
+@test_util.run_v1_only("b/120545219")
 class AnalyzerCLIWhileLoopTest(test_util.TensorFlowTestCase):
 
   @classmethod
diff --git a/tensorflow/python/debug/cli/cli_shared_test.py b/tensorflow/python/debug/cli/cli_shared_test.py
index 07b364db9f2aab9c11ecb769a94f36e0809d70a0..66a12efda53470b33edf4788984e632bfe55f2b9 100644
--- a/tensorflow/python/debug/cli/cli_shared_test.py
+++ b/tensorflow/python/debug/cli/cli_shared_test.py
@@ -105,6 +105,7 @@ class TimeToReadableStrTest(test_util.TensorFlowTestCase):
       cli_shared.time_to_readable_str(100, force_time_unit="ks")
 
 
+@test_util.run_v1_only("b/120545219")
 class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -118,6 +119,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
   def tearDown(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testSingleFetchNoFeeds(self):
     run_start_intro = cli_shared.get_run_start_intro(12, self.const_a, None, {})
 
@@ -181,6 +183,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     run_start_intro = cli_shared.get_run_start_intro(1, self.sparse_d, None, {})
     self.assertEqual(str(self.sparse_d), run_start_intro.lines[4].strip())
 
+  @test_util.run_deprecated_v1
   def testTwoFetchesListNoFeeds(self):
     fetches = [self.const_a, self.const_b]
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -197,6 +200,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 2 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testNestedListAsFetches(self):
     fetches = [self.const_c, [self.const_a, self.const_b]]
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -210,6 +214,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 3 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testNestedDictAsFetches(self):
     fetches = {"c": self.const_c, "ab": {"a": self.const_a, "b": self.const_b}}
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -227,6 +232,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 3 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testTwoFetchesAsTupleNoFeeds(self):
     fetches = (self.const_a, self.const_b)
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -243,6 +249,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 2 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testTwoFetchesAsNamedTupleNoFeeds(self):
     fetches_namedtuple = namedtuple("fetches", "x y")
     fetches = fetches_namedtuple(self.const_b, self.const_c)
@@ -260,6 +267,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 2 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testWithFeedDict(self):
     feed_dict = {
         self.const_a: 10.0,
@@ -283,6 +291,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
                                                        feed_dict)
     self.assertEqual("run #1: 1 fetch (c:0); 2 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testTensorFilters(self):
     feed_dict = {self.const_a: 10.0}
     tensor_filters = {
@@ -313,17 +322,20 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     command_set.add(annot[2].content)
     self.assertEqual({"run -f filter_a", "run -f filter_b"}, command_set)
 
+  @test_util.run_deprecated_v1
   def testGetRunShortDescriptionWorksForTensorFeedKey(self):
     short_description = cli_shared.get_run_short_description(
         1, self.const_a, {self.const_a: 42.0})
     self.assertEqual("run #1: 1 fetch (a:0); 1 feed (a:0)", short_description)
 
+  @test_util.run_deprecated_v1
   def testGetRunShortDescriptionWorksForUnicodeFeedKey(self):
     short_description = cli_shared.get_run_short_description(
         1, self.const_a, {u"foo": 42.0})
     self.assertEqual("run #1: 1 fetch (a:0); 1 feed (foo)", short_description)
 
 
+@test_util.run_v1_only("b/120545219")
 class GetErrorIntroTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/cli/profile_analyzer_cli_test.py b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
index 60b6047970732f7e3f015216cbf2c91f8241e956..d6d2b58b5f8138643bb4b9886da01b72295b5df7 100644
--- a/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
@@ -70,6 +70,7 @@ def _assert_no_lines_match(pattern, lines):
         "%s matched at least one line in %s." % (pattern, str(lines)))
 
 
+@test_util.run_v1_only("b/120545219")
 class ProfileAnalyzerListProfileTest(test_util.TensorFlowTestCase):
 
   def testNodeInfoEmpty(self):
@@ -321,6 +322,7 @@ class ProfileAnalyzerListProfileTest(test_util.TensorFlowTestCase):
     _assert_at_least_one_line_matches(r"Device Total.*0\.009ms", prof_output)
 
 
+@test_util.run_v1_only("b/120545219")
 class ProfileAnalyzerPrintSourceTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/cli/stepper_cli_test.py b/tensorflow/python/debug/cli/stepper_cli_test.py
index 7b8a42c25380dde8bc2ce0d34eb79f2ddd54922f..5cf69d0168b70a4d03162512b5024736c50cf23a 100644
--- a/tensorflow/python/debug/cli/stepper_cli_test.py
+++ b/tensorflow/python/debug/cli/stepper_cli_test.py
@@ -129,6 +129,7 @@ def _parse_updated(lines):
   return updated
 
 
+@test_util.run_v1_only("b/120545219")
 class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/examples/debug_errors.py b/tensorflow/python/debug/examples/debug_errors.py
index 28abc9734370630b864da4f693cbddd88c382502..e3692072cc558fa11a47daafb6fb0834d70ee654 100644
--- a/tensorflow/python/debug/examples/debug_errors.py
+++ b/tensorflow/python/debug/examples/debug_errors.py
@@ -77,4 +77,5 @@ if __name__ == "__main__":
       default=False,
       help="Use debugger to track down bad values during training")
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py
index 3821b393ec6847db71b7c4b7396b1ed448ae9538..777fb089881a069e403eb897f4efabcff815e2bf 100644
--- a/tensorflow/python/debug/examples/debug_fibonacci.py
+++ b/tensorflow/python/debug/examples/debug_fibonacci.py
@@ -100,4 +100,5 @@ if __name__ == "__main__":
       "--debug flag.")
 
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_keras.py b/tensorflow/python/debug/examples/debug_keras.py
index 3272d85ade957b254b2c1a0977156179cd71bb9d..019121fa0a61a4e69ce370bac23c4575a27a72c9 100644
--- a/tensorflow/python/debug/examples/debug_keras.py
+++ b/tensorflow/python/debug/examples/debug_keras.py
@@ -86,4 +86,5 @@ if __name__ == "__main__":
       default=2,
       help="Number of epochs to train the model for.")
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py
index ab1c90371cd18bbaf278b72248bcc7e9e9c34b06..09fb06c9c065f544a4c9bb47b96157704a8306e2 100644
--- a/tensorflow/python/debug/examples/debug_mnist.py
+++ b/tensorflow/python/debug/examples/debug_mnist.py
@@ -190,4 +190,5 @@ if __name__ == "__main__":
       "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
       "--debug flag.")
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/lib/common_test.py b/tensorflow/python/debug/lib/common_test.py
index 5af0dafcf9fd81763b30eb159a3e21ef8b7f9ac9..f6413f6b7b3dee82ea67ca664e8645152fbb5b83 100644
--- a/tensorflow/python/debug/lib/common_test.py
+++ b/tensorflow/python/debug/lib/common_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.platform import googletest
 
 class CommonTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testOnFeedOneFetch(self):
     a = constant_op.constant(10.0, name="a")
     b = constant_op.constant(20.0, name="b")
@@ -35,6 +36,7 @@ class CommonTest(test_util.TensorFlowTestCase):
     self.assertItemsEqual(["a:0"], loaded[0])
     self.assertItemsEqual(["b:0"], loaded[1])
 
+  @test_util.run_deprecated_v1
   def testGetRunKeyFlat(self):
     a = constant_op.constant(10.0, name="a")
     b = constant_op.constant(20.0, name="b")
@@ -43,6 +45,7 @@ class CommonTest(test_util.TensorFlowTestCase):
     self.assertItemsEqual(["a:0"], loaded[0])
     self.assertItemsEqual(["a:0", "b:0"], loaded[1])
 
+  @test_util.run_deprecated_v1
   def testGetRunKeyNestedFetches(self):
     a = constant_op.constant(10.0, name="a")
     b = constant_op.constant(20.0, name="b")
diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
index 01867fc69d0782b34edb1e8eb873b19f5dfc8529..885691c3ef71ba995ec3ab38e2d1bda7e1e30b1a 100644
--- a/tensorflow/python/debug/lib/debug_gradients_test.py
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import gradient_descent
 
 
+@test_util.run_v1_only("b/120545219")
 class IdentifyGradientTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
index 1f67f8a0d4e55c7faf8ca65af51169831e731576..34030c0adcab30647d360260741a8dcbb870cc73 100644
--- a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
+++ b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
@@ -126,8 +126,8 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       u = variables.Variable([12.0], name="u")
       v = variables.Variable([30.0], name="v")
       w = math_ops.add(u, v, name="w")
-      sess.run(u.initializer)
-      sess.run(v.initializer)
+      self.evaluate(u.initializer)
+      self.evaluate(v.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(
           sess, w, expected_output=[42.0])
@@ -139,7 +139,7 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
         b = math_ops.add(a, a, name="b")
       with ops.control_dependencies([a, b]):
         c = math_ops.multiply(b, b, name="c")
-      sess.run(a.initializer)
+      self.evaluate(a.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(
           sess, c, expected_output=400.0)
@@ -150,8 +150,8 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       y = variables.Variable(20.0, name="y")
       cond = control_flow_ops.cond(
           x > y, lambda: math_ops.add(x, 1), lambda: math_ops.add(y, 1))
-      sess.run(x.initializer)
-      sess.run(y.initializer)
+      self.evaluate(x.initializer)
+      self.evaluate(y.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(
           sess, cond, expected_output=21.0)
@@ -173,8 +173,8 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       toy_loss = x * (u - v)
       train_op = gradient_descent.GradientDescentOptimizer(
           learning_rate=0.1).minimize(toy_loss, name="train_op")
-      sess.run(u.initializer)
-      sess.run(v.initializer)
+      self.evaluate(u.initializer)
+      self.evaluate(v.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(sess, train_op)
 
diff --git a/tensorflow/python/debug/lib/debug_utils_test.py b/tensorflow/python/debug/lib/debug_utils_test.py
index 23ab98444cd0777700daaca26ccafe9c68444cb7..9d59cfc1792a8df472998e115dc01387a9ba3cdf 100644
--- a/tensorflow/python/debug/lib/debug_utils_test.py
+++ b/tensorflow/python/debug/lib/debug_utils_test.py
@@ -185,6 +185,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
     self.assertEqual(["file:///tmp/tfdbg_1", "file:///tmp/tfdbg_2"],
                      watch_0.debug_urls)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_allNodes(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -216,6 +217,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
     self.assertTrue("p1" in node_names)
     self.assertTrue("s" in node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameWhitelist(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -230,6 +232,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         sorted(["a1_init", "a1", "a1/Assign", "a1/read", "p1"]),
         sorted(node_names))
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_opTypeWhitelist(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -255,6 +258,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertEqual(["p1"], node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_tensorDTypeWhitelist(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -267,6 +271,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertItemsEqual(["a1", "a1/Assign", "b", "b/Assign"], node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameAndTensorDTypeWhitelists(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -280,6 +285,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertItemsEqual(["a1", "a1/Assign"], node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameBlacklist(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -294,6 +300,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         sorted(["b_init", "b", "b/Assign", "b/read", "c", "s"]),
         sorted(node_names))
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_opTypeBlacklist(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -306,6 +313,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertEqual(sorted(["p1", "s"]), sorted(node_names))
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameAndOpTypeBlacklists(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -319,6 +327,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertEqual(["s"], node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_tensorDTypeBlacklists(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -335,6 +344,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
     self.assertNotIn("b/Assign", node_names)
     self.assertIn("s", node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameAndTensorDTypeBlacklists(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
diff --git a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
index 74498c8ea3dd494cd8fc6237b60b11a202497990..2405e29aaa51c2e0c422fa6f950ec46553ae75c0 100644
--- a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
+@test_util.run_v1_only("b/120545219")
 class DistributedSessionDebugTest(test_util.TensorFlowTestCase):
   """Test the debugging of distributed sessions."""
 
diff --git a/tensorflow/python/debug/lib/grpc_large_data_test.py b/tensorflow/python/debug/lib/grpc_large_data_test.py
index ccc21bcf94fd7d5697cf3ccde0c54d8a32bbe8a9..a7fdbebaf5127211595e9b8322df517be6dc3ce4 100644
--- a/tensorflow/python/debug/lib/grpc_large_data_test.py
+++ b/tensorflow/python/debug/lib/grpc_large_data_test.py
@@ -58,7 +58,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
     self.debug_server.clear_data()
 
   def testSendingLargeGraphDefsWorks(self):
-    with self.test_session(
+    with self.session(
         use_gpu=True,
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
       u = variables.VariableV1(42.0, name="original_u")
@@ -86,7 +86,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
       self.assertGreater(max_graph_def_size, 4 * 1024 * 1024)
 
   def testSendingLargeFloatTensorWorks(self):
-    with self.test_session(
+    with self.session(
         use_gpu=True,
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
       u_init_val_array = list(xrange(1200 * 1024))
@@ -110,7 +110,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
           self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
 
   def testSendingStringTensorWithAlmostTooLargeStringsWorks(self):
-    with self.test_session(
+    with self.session(
         use_gpu=True,
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
       u_init_val = [
@@ -133,7 +133,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
           self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
 
   def testSendingLargeStringTensorWorks(self):
-    with self.test_session(
+    with self.session(
         use_gpu=True,
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
       strs_total_size_threshold = 5000 * 1024
@@ -162,7 +162,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
           self.debug_server.debug_tensor_values["u_init:0:DebugIdentity"][0])
 
   def testSendingEmptyFloatTensorWorks(self):
-    with self.test_session(
+    with self.session(
         use_gpu=True,
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
       u_init = constant_op.constant(
@@ -184,7 +184,7 @@ class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
       self.assertEqual(0, len(u_init_value))
 
   def testSendingEmptyStringTensorWorks(self):
-    with self.test_session(
+    with self.session(
         use_gpu=True,
         config=session_debug_testlib.no_rewrite_session_config()) as sess:
       u_init = constant_op.constant(
diff --git a/tensorflow/python/debug/lib/session_debug_file_test.py b/tensorflow/python/debug/lib/session_debug_file_test.py
index 1874160dd637596ffb8a935148baf1f308de0210..16ab815d92ddffe2108776388f668427fd140f06 100644
--- a/tensorflow/python/debug/lib/session_debug_file_test.py
+++ b/tensorflow/python/debug/lib/session_debug_file_test.py
@@ -28,11 +28,13 @@ from tensorflow.python.debug.lib import debug_utils
 from tensorflow.python.debug.lib import session_debug_testlib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugFileTest(session_debug_testlib.SessionDebugTestBase):
 
   def _debug_urls(self, run_number=None):
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index bfc9a3a382744676fafe9f280ab54f8dee3fedcb..472e2449156fefc2c00bb4079018de224097692e 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -91,6 +91,7 @@ class GrpcDebugServerTest(test_util.TensorFlowTestCase):
     server.stop_server().wait()
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
 
   @classmethod
@@ -353,6 +354,7 @@ class SessionDebugConcurrentTest(
     return urls
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
   """Test server gating of debug ops."""
 
@@ -730,6 +732,7 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
       self.assertEqual("DebugNumericSummary", debug_watch.debug_op)
 
 
+@test_util.run_v1_only("b/120545219")
 class DelayedDebugServerTest(test_util.TensorFlowTestCase):
 
   def testDebuggedSessionRunWorksWithDelayedDebugServerStartup(self):
diff --git a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
index b0dc25851ca3101a48543aeca1325fa155dd29b7..8eef45392f2fb56bc57b6bd6156f9fed8a93cd1f 100644
--- a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
+++ b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
@@ -67,7 +67,7 @@ class SessionDebugMultiGPUTest(test_util.TensorFlowTestCase):
         u1 = math_ops.multiply(v, v, name="u1")
       w = math_ops.subtract(u1, u0, name="w")
 
-      sess.run(v.initializer)
+      self.evaluate(v.initializer)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_utils.watch_graph(run_options, sess.graph,
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 25ef91b575957164691bccd9d15107d9a4812eac..5165febff52506d07e2d3b0aea361c31567cc419 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -84,6 +84,7 @@ class _RNNCellForTest(rnn_cell_impl.RNNCell):
     return (math_ops.multiply(self._w, input_), state)
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugTestBase(test_util.TensorFlowTestCase):
   """Base class for unit tests of tfdbg running with tf.Session."""
 
diff --git a/tensorflow/python/debug/lib/source_utils_test.py b/tensorflow/python/debug/lib/source_utils_test.py
index 4a8d4eaa99f28db26f05a00e7759c79699ca9ab4..4f4aea032132d09f025392587038b79d7f0804c5 100644
--- a/tensorflow/python/debug/lib/source_utils_test.py
+++ b/tensorflow/python/debug/lib/source_utils_test.py
@@ -65,6 +65,7 @@ class GuessIsTensorFlowLibraryTest(test_util.TensorFlowTestCase):
     self.assertTrue(
         source_utils.guess_is_tensorflow_py_library(source_utils.__file__))
 
+  @test_util.run_deprecated_v1
   def testFileInPythonKernelsPathReturnsTrue(self):
     x = constant_op.constant(42.0, name="x")
     self.assertTrue(
@@ -109,8 +110,8 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
       self.w = math_ops.matmul(self.u, self.v, name="w")
       self.w_line_number = line_number_above()
 
-      sess.run(self.u.initializer)
-      sess.run(self.v.initializer)
+      self.evaluate(self.u.initializer)
+      self.evaluate(self.v.initializer)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_utils.watch_graph(
@@ -215,6 +216,7 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
     os.remove(unrelated_source_path)
 
 
+@test_util.run_v1_only("b/120545219")
 class ListSourceAgainstDumpTest(test_util.TensorFlowTestCase):
 
   def createAndRunGraphWithWhileLoop(self):
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 3839c671982f80158273ea40de73ff920306316d..9e78e207b80a99f3812c5909cf3753d90eab3680 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import gradient_descent
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -443,6 +444,7 @@ class StepperTest(test_util.TensorFlowTestCase):
           self.assertAllClose(-4.0, result["fz"]["z"])
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -577,6 +579,7 @@ class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
       self.assertAllClose([[-1.0], [6.0]], stepper.finalize())
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperAssignAddTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -692,6 +695,7 @@ class StepperAssignAddTest(test_util.TensorFlowTestCase):
       self.assertAllClose(12.0, stepper.cont(self.v))
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperBackwardRunTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/wrappers/disk_usage_test.py b/tensorflow/python/debug/wrappers/disk_usage_test.py
index 0874525966ceb34b9cb99df9affd63cf1865b663..88b1cd540de7a6a56db6e5165be53ae8c9c2df26 100644
--- a/tensorflow/python/debug/wrappers/disk_usage_test.py
+++ b/tensorflow/python/debug/wrappers/disk_usage_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import monitored_session
 
 
+@test_util.run_v1_only("b/120545219")
 class DumpingDebugWrapperDiskUsageLimitTest(test_util.TensorFlowTestCase):
 
   @classmethod
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index 11011a5c1342b281ab86c7f861d895f570bd037d..42e3b09382d825840ea12eeaf2baf35f33c17da9 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -41,6 +41,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import monitored_session
 
 
+@test_util.run_v1_only("b/120545219")
 class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index afda1fdc0de73ba52df3cef067998699c4e89fb7..ae403205b7cc087f35453c43ce9e3d3cbbd76ac2 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -115,6 +115,8 @@ import abc
 import re
 import threading
 
+import six
+
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_utils
@@ -329,6 +331,7 @@ class OnRunEndResponse(object):
     pass
 
 
+@six.add_metaclass(abc.ABCMeta)
 class BaseDebugWrapperSession(session.SessionInterface):
   """Base class of debug-wrapper session classes.
 
@@ -788,7 +791,6 @@ class BaseDebugWrapperSession(session.SessionInterface):
   # TODO(cais): Add _node_name_regex_whitelist and
   #   _node_op_type_regex_whitelist.
 
-  @abc.abstractmethod
   def invoke_node_stepper(self,
                           node_stepper,
                           restore_variable_values_on_exit=True):
@@ -805,6 +807,9 @@ class BaseDebugWrapperSession(session.SessionInterface):
       The same return values as the `Session.run()` call on the same fetches as
         the NodeStepper.
     """
+    raise NotImplementedError(
+        self.__class__.__name__ + " does not support node-stepper mode.")
+
 
   def should_stop(self):
     if hasattr(self._sess, "should_stop"):
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index 73e08ce7d5969de2ae54e2505fa7b449bfaf631a..a50fa7cf4b870868a61ea4df173fc24bc8a8e110 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -141,6 +141,7 @@ class TestDebugWrapperSessionBadAction(framework.BaseDebugWrapperSession):
     return framework.OnRunEndResponse()
 
 
+@test_util.run_v1_only("b/120545219")
 class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def _no_rewrite_session_config(self):
@@ -339,7 +340,7 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
     with wrapper.as_default():
       foo = constant_op.constant(42, name="foo")
-      self.assertEqual(42, foo.eval())
+      self.assertEqual(42, self.evaluate(foo))
       self.assertEqual(foo, self._observer["run_fetches"])
 
   def testWrapperShouldSupportSessionClose(self):
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 149a7497df8fecc19a665afc1483ad55c890c335..e38df861f5b633baf94c99e4892e1bd90943337d 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -127,6 +127,7 @@ class LocalCLIDebuggerWrapperSessionForTest(
         return e.exit_token
 
 
+@test_util.run_v1_only("b/120545219")
 class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index bdc869c6437919420bd673b1c9f62e690fdd1bc1..887c61cb8fd81c6be4d20ba6b25c2997cea8cb7f 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -7,15 +7,152 @@ licenses(["notice"])  # Apache 2.0
 exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+py_library(
+    name = "all_reduce",
+    srcs = [
+        "all_reduce.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nccl_ops",
+    ],
+)
+
+tf_py_test(
+    name = "all_reduce_test",
+    srcs = ["all_reduce_test.py"],
+    additional_deps = [
+        ":all_reduce",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:state_ops",
+    ],
+)
+
+py_library(
+    name = "cross_device_ops",
+    srcs = ["cross_device_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cross_device_utils",
+        ":device_util",
+        ":reduce_util",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:device_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "cross_device_utils",
+    srcs = ["cross_device_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":all_reduce",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:collective_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nccl_ops",
+    ],
+)
+
+py_library(
+    name = "device_util",
+    srcs = ["device_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:device",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "device_util_test",
+    srcs = ["device_util_test.py"],
+    additional_deps = [
+        ":device_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+    ],
+)
 
 py_library(
     name = "distribute",
+    srcs = [
+        "__init__.py",
+    ],
     srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
     deps = [
-        ":distribute_config",
-        ":distribute_coordinator",
-        ":distribute_coordinator_context",
+        ":distribute_lib",
+        ":mirrored_strategy",
+    ],
+)
+
+py_library(
+    name = "distribute_lib",
+    srcs = [
+        "distribute_lib.py",
+        "distribution_strategy_context.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":device_util",
+        ":reduce_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/tools/docs:doc_controls",
+    ],
+)
+
+py_test(
+    name = "distribute_lib_test",
+    size = "small",
+    srcs = ["distribute_lib_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":distribute_lib",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:variable_scope",
     ],
 )
 
@@ -45,13 +182,10 @@ py_library(
 
 py_test(
     name = "distribute_coordinator_test",
-    size = "large",
     srcs = ["distribute_coordinator_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "manual",
         "no_pip",
-        "notap",
     ],
     deps = [
         ":distribute_coordinator",
@@ -78,6 +212,35 @@ py_library(
     deps = [],
 )
 
+py_library(
+    name = "mirrored_strategy",
+    srcs = ["mirrored_strategy.py"],
+    deps = [
+        ":cross_device_ops",
+        ":device_util",
+        ":distribute_lib",
+        ":multi_worker_util",
+        ":reduce_util",
+        ":shared_variable_creator",
+        ":values",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
+    ],
+)
+
 py_library(
     name = "multi_worker_util",
     srcs = [
@@ -90,6 +253,35 @@ py_library(
     ],
 )
 
+py_library(
+    name = "input_ops",
+    srcs = ["input_ops.py"],
+    deps = [
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/experimental/ops:filter_for_shard_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+cuda_py_test(
+    name = "input_ops_test",
+    srcs = ["input_ops_test.py"],
+    additional_deps = [
+        ":input_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:util",
+    ],
+    tags = [
+        "no_pip",
+    ],
+)
+
 py_test(
     name = "multi_worker_util_test",
     srcs = ["multi_worker_util_test.py"],
@@ -122,3 +314,49 @@ py_library(
         "//tensorflow/python:training",
     ],
 )
+
+py_library(
+    name = "reduce_util",
+    srcs = ["reduce_util.py"],
+    deps = [
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_library(
+    name = "shared_variable_creator",
+    srcs = ["shared_variable_creator.py"],
+)
+
+py_test(
+    name = "shared_variable_creator_test",
+    srcs = ["shared_variable_creator_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":shared_variable_creator",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "values",
+    srcs = ["values.py"],
+    deps = [
+        ":device_util",
+        ":distribute_lib",
+        ":input_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:multi_device_iterator_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/checkpointable:base",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/python/distribute/__init__.py b/tensorflow/python/distribute/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ff912ae10d8336cfeeb42d060bd0d9c52e24482
--- /dev/null
+++ b/tensorflow/python/distribute/__init__.py
@@ -0,0 +1,25 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Distribution Strategy library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import mirrored_strategy
+# pylint: enable=unused-import
diff --git a/tensorflow/python/distribute/all_reduce.py b/tensorflow/python/distribute/all_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd7c45ae27ac2d093c7feaf9d490ffa074533ddc
--- /dev/null
+++ b/tensorflow/python/distribute/all_reduce.py
@@ -0,0 +1,860 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities to construct a TF subgraph implementing distributed All-Reduce."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+
+from tensorflow.python.framework import device as device_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nccl_ops
+
+
+def _flatten_tensors(tensors):
+  """Check tensors for isomorphism and flatten.
+
+  Args:
+    tensors: list of T `tf.Tensor` which must all have the same shape.
+
+  Returns:
+    tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors
+    shape: the original shape of each element of input tensors
+
+  Raises:
+    ValueError: tensors are empty or non-isomorphic or have unknown shape.
+  """
+  if not tensors:
+    raise ValueError("tensors cannot be empty")
+  shape = tensors[0].shape
+  for tensor in tensors:
+    shape = shape.merge_with(tensor.shape)
+  if not shape.is_fully_defined():
+    raise ValueError("Tensors must have statically known shape.")
+  if len(shape) != 1:
+    reshaped = []
+    for t in tensors:
+      with ops.colocate_with(t):
+        reshaped.append(array_ops.reshape(t, [-1]))
+    tensors = reshaped
+  return tensors, shape
+
+
+def _reshape_tensors(tensors, shape):
+  """Reshape tensors flattened by _flatten_tensors.
+
+  Args:
+    tensors: list of T `tf.Tensor` of identical length 1D tensors.
+    shape: list of integers describing the desired shape.  Product of
+      the elements must equal the length of each tensor.
+
+  Returns:
+    list of T `tf.Tensor` which are the reshaped inputs.
+  """
+  reshaped = []
+  for t in tensors:
+    with ops.colocate_with(t):
+      reshaped.append(array_ops.reshape(t, shape))
+  return reshaped
+
+
+def _padded_split(tensor, pieces):
+  """Like split for 1D tensors but pads-out case where len % pieces != 0.
+
+  Args:
+    tensor: T `tf.Tensor` that must be 1D.
+    pieces: a positive integer specifying the number of pieces into which
+      tensor should be split.
+
+  Returns:
+    list of T `tf.Tensor` of length pieces, which hold the values of
+      thin input tensor, in order.  The final tensor may
+      be zero-padded on the end to make its size equal to those of all
+      of the other tensors.
+
+  Raises:
+    ValueError: The input tensor is not 1D.
+  """
+  shape = tensor.shape
+  if 1 != len(shape):
+    raise ValueError("input tensor must be 1D")
+  tensor_len = shape.dims[0].value
+  with ops.colocate_with(tensor):
+    if tensor_len % pieces != 0:
+      # pad to an even length
+      chunk_size = 1 + tensor_len // pieces
+      if pieces > tensor_len:
+        # This is an edge case that should not come up in practice,
+        # i.e. a different reduction algorithm would be better,
+        # but we'll make it work just for completeness.
+        pad_len = pieces - tensor_len
+        extended_whole = array_ops.concat(
+            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
+        parts = array_ops.split(extended_whole, pieces)
+        return parts, pad_len
+      elif (pieces - 1) * chunk_size >= tensor_len:
+        # Another edge case of limited real interest.
+        pad_len = (pieces * chunk_size) % tensor_len
+        extended_whole = array_ops.concat(
+            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
+        parts = array_ops.split(extended_whole, pieces)
+        return parts, pad_len
+      else:
+        last_chunk_size = tensor_len - (pieces - 1) * chunk_size
+        pad_len = chunk_size - last_chunk_size
+        piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
+        parts = array_ops.split(tensor, piece_lens)
+        parts[-1] = array_ops.concat(
+            [parts[-1], array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
+        return parts, pad_len
+    else:
+      return array_ops.split(tensor, pieces), 0
+
+
+def _strip_padding(tensors, pad_len):
+  """Strip the suffix padding added by _padded_split.
+
+  Args:
+    tensors: list of T `tf.Tensor` of identical length 1D tensors.
+    pad_len: number of elements to be stripped from the end of each tensor.
+
+  Returns:
+    list of T `tf.Tensor` which are the stripped inputs.
+
+  Raises:
+    ValueError: tensors must be a non-empty list of 1D tensors, and
+      each must be longer than pad_len.
+  """
+  if not tensors:
+    raise ValueError("tensors cannot be empty")
+  shape = tensors[0].shape
+  if len(shape) > 1:
+    raise ValueError("tensors must be 1D")
+  prefix_len = int(shape[0] - pad_len)
+  if prefix_len < 0:
+    raise ValueError("pad_len longer than tensor")
+  stripped = []
+  for t in tensors:
+    with ops.colocate_with(t):
+      stripped.append(array_ops.slice(t, [0], [prefix_len]))
+  return stripped
+
+
+def _ragged_split(tensor, pieces):
+  """Like split for 1D tensors but allows case where len % pieces != 0.
+
+  Args:
+    tensor: T `tf.Tensor` that must be 1D.
+    pieces: a positive integer specifying the number of pieces into which
+      tensor should be split.
+
+  Returns:
+    list of T `tf.Tensor` of length pieces, which hold the values of
+      the input tensor, in order.  The final tensor may be shorter
+      than the others, which will all be of equal length.
+
+  Raises:
+    ValueError: input tensor must be 1D.
+  """
+  shape = tensor.shape
+  if 1 != len(shape):
+    raise ValueError("input tensor must be 1D")
+  tensor_len = shape.dims[0].value
+  chunk_size = tensor_len // pieces
+  with ops.colocate_with(tensor):
+    if tensor_len != (pieces * chunk_size):
+      # last piece will be short
+      assert pieces > 1
+      last_chunk_size = tensor_len - ((pieces - 1) * chunk_size)
+      assert last_chunk_size > 0
+      piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
+      return array_ops.split(tensor, piece_lens)
+    else:
+      return array_ops.split(tensor, pieces)
+
+
+def _ring_permutations(num_workers, num_subchunks, gpu_perm):
+  """"Generate an array of device index arrays, one for each subchunk.
+
+  In the basic ring reduction algorithm there are size(T)/num_devices
+  data chunks and each device process one chunk per tick, i.e. sending
+  one chunk and receiving one chunk.  The idea of subchunking is that
+  each device processes num_subchunks smaller data regions per tick,
+  and the ring rank permutation is different for each subchunk index
+  so that a device is potentially sending to and receiving from
+  num_subchunks different other devices at each tick.  Where multiple
+  independent data channels exist between devices, this strategy
+  supplies a method of using them in parallel.
+
+  Args:
+    num_workers: number of worker tasks
+    num_subchunks: number of subchunks into which to divide each per-GPU chunk.
+    gpu_perm: an array of integers in [0, num_gpus-1] giving the default
+      ring order of GPUs at each worker.  Other permutations will be generated
+      by rotating this array and splicing together per-worker instances.
+
+  Raises:
+    ValueError: the number of subchunks may not exceed the number of GPUs.
+
+  Returns:
+    pred_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
+        preceding device in the permutation for that subchunk.  The
+        device index of GPU i at worker j is i + (j * num_gpus).
+    rank_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
+       local rank of device d in the permutation for that subchunk.
+  """
+  num_gpus = len(gpu_perm)
+  devices = num_workers * num_gpus
+  if devices == 0:
+    return [], []
+  if num_subchunks > num_gpus:
+    raise ValueError(
+        "num_subchunks %d must be <= num_gpus %d" % (num_subchunks, num_gpus))
+  rotation_interval = max(1, int(num_gpus / num_subchunks))
+  perms_by_s = []
+  for s in range(0, num_subchunks):
+    full_order = []
+    offset = s * rotation_interval
+    for w in range(0, num_workers):
+      default_order = [(w * num_gpus) + i for i in gpu_perm]
+      dev_order = default_order[offset:] + default_order[:offset]
+      full_order += dev_order
+    perms_by_s.append(full_order)
+  pred_by_s_d = [[-1 for d in range(0, devices)]
+                 for s in range(0, num_subchunks)]
+  rank_by_s_d = [[-1 for d in range(0, devices)]
+                 for s in range(0, num_subchunks)]
+  for s in range(0, num_subchunks):
+    for d in range(0, devices):
+      for t in range(0, devices):
+        if d == perms_by_s[s][t]:
+          rank_by_s_d[s][d] = t
+          pred_by_s_d[s][d] = perms_by_s[s][(t + devices - 1) % devices]
+          break
+  return (pred_by_s_d, rank_by_s_d)
+
+
+def build_ring_all_reduce(input_tensors, num_workers, num_subchunks,
+                          gpu_perm, red_op, un_op=None):
+  """Construct a subgraph performing a ring-style all-reduce of input_tensors.
+
+  Args:
+    input_tensors: a list of T `tf.Tensor` objects, which must all
+      have the same shape and type.
+    num_workers: number of worker tasks spanned by input_tensors.
+    num_subchunks: number of subchunks each device should process in one tick.
+    gpu_perm: a list of ints giving a ring-wise rank ordering of GPUs at
+      each worker.  All workers must have the same number of
+      GPUs with the same rank ordering.  If NVLINK is available, this should
+      be a ring order supported by NVLINK edges.
+    red_op: a binary operator for elementwise reduction.
+    un_op: an optional unary operator to apply to fully reduced values.
+
+  Raises:
+    ValueError: empty input_tensors or they don't all have same
+    size.
+
+  Returns:
+    a list of T `tf.Tensor` identical sum-reductions of input_tensors.
+  """
+  if len(input_tensors) < 2:
+    raise ValueError("input_tensors must be length 2 or longer")
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  devices = [t.device for t in input_tensors]
+  (pred_by_s_d, rank_by_s_d) = _ring_permutations(
+      num_workers, num_subchunks, gpu_perm)
+  chunks_by_dev, pad_len = _build_ring_gather(
+      input_tensors, devices,
+      num_subchunks, pred_by_s_d, rank_by_s_d, red_op)
+  if un_op:
+    chunks_by_dev = _apply_unary_to_chunks(un_op, chunks_by_dev)
+  output_tensors = _build_ring_scatter(pred_by_s_d, rank_by_s_d,
+                                       chunks_by_dev)
+  if pad_len > 0:
+    output_tensors = _strip_padding(output_tensors, pad_len)
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _build_ring_gather(input_tensors, devices, num_subchunks,
+                       pred_by_s_d, rank_by_s_d, red_op):
+  """Construct a subgraph for the first (reduction) pass of ring all-reduce.
+
+  Args:
+    input_tensors: a list of T `tf.Tensor` 1D input tensors of same
+      shape and type.
+    devices: array of device name strings
+    num_subchunks: number of subchunks each device should process in one tick.
+    pred_by_s_d: as produced by _ring_permutations
+    rank_by_s_d: as produced by _ring_permutations
+    red_op: a binary operator for elementwise reduction
+
+  Raises:
+    ValueError: tensors must all be one dimensional.
+
+  Returns:
+    list of list of T `tf.Tensor` of (partially) reduced values where
+    exactly num_subchunks chunks at each device are fully reduced.
+  """
+  num_devices = len(input_tensors)
+  if num_devices == 0:
+    return []
+  if num_devices == 1:
+    return input_tensors
+  shape = input_tensors[0].shape
+  if 1 != len(shape):
+    raise ValueError("input tensors must be 1D")
+  num_chunks = num_devices * num_subchunks
+  num_ticks = num_devices - 1
+  # Initialize chunks_by_dev with splits of the input tensors.
+  chunks_by_dev = []
+  split_pad_len = 0
+  for d in range(0, num_devices):
+    with ops.device(devices[d]):
+      splits, split_pad_len = _padded_split(input_tensors[d], num_chunks)
+      chunks_by_dev.append(splits)
+  # Reduction phase
+  for tick in range(0, num_ticks):
+    # One new partial reduction for every chunk
+    new_partial_reductions = [None for _ in range(0, num_chunks)]
+    # Compute reductions with respect to last tick's values
+    for d in range(0, num_devices):
+      with ops.device(devices[d]):
+        for s in range(0, num_subchunks):
+          rank = rank_by_s_d[s][d]
+          seg_index = (rank + num_devices - (2 + tick)) % num_devices
+          pred_dev = pred_by_s_d[s][d]
+          chunk_index = (seg_index * num_subchunks) + s
+          new_partial_reductions[chunk_index] = red_op(
+              chunks_by_dev[pred_dev][chunk_index],
+              chunks_by_dev[d][chunk_index])
+    # Update chunks_by_dev with the new values at the end of the tick.
+    for d in range(0, num_devices):
+      for s in range(0, num_subchunks):
+        rank = rank_by_s_d[s][d]
+        seg_index = (rank + num_devices - (2 + tick)) % num_devices
+        chunk_index = (seg_index * num_subchunks) + s
+        chunks_by_dev[d][chunk_index] = new_partial_reductions[chunk_index]
+  return chunks_by_dev, split_pad_len
+
+
+def _apply_unary_to_chunks(f, chunks_by_dev):
+  """Apply a unary op to each tensor in chunks_by_dev, on same device.
+
+  Args:
+    f: a unary function over T `tf.Tensor`.
+    chunks_by_dev: list of lists of T `tf.Tensor`.
+
+  Returns:
+    new list of lists of T `tf.Tensor` with the same structure as
+    chunks_by_dev containing the derived tensors.
+  """
+  output = []
+  for x in chunks_by_dev:
+    with ops.colocate_with(x[0]):
+      output.append([f(t) for t in x])
+  return output
+
+
+def _build_ring_scatter(pred_by_s_d, rank_by_s_d,
+                        chunks_by_dev):
+  """Construct subgraph for second (scatter) pass of ring all-reduce.
+
+  Args:
+    pred_by_s_d: as produced by _ring_permutations
+    rank_by_s_d: as produced by _ring_permutations
+    chunks_by_dev: list of list of T `tf.Tensor` indexed by ints
+      (device, chunk)
+
+  Raises:
+    ValueError: chunks_by_dev is not well-formed
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors, one
+    at each device corresponding to the outer dimension of chunks_by_dev.
+  """
+  num_devices = len(chunks_by_dev)
+  num_chunks = len(chunks_by_dev[0])
+  if 0 != num_chunks % num_devices:
+    raise ValueError(
+        "Expect number of chunks per device to be divisible by num_devices")
+  num_subchunks = int(num_chunks / num_devices)
+  num_ticks = num_devices - 1
+  for tick in range(0, num_ticks):
+    passed_values = [None for _ in range(0, num_chunks)]
+    for d in range(0, num_devices):
+      with ops.colocate_with(chunks_by_dev[d][0]):
+        for s in range(0, num_subchunks):
+          rank = rank_by_s_d[s][d]
+          seg_index = (rank + num_devices - (1 + tick)) % num_devices
+          pred_dev = pred_by_s_d[s][d]
+          chunk_index = (seg_index * num_subchunks) + s
+          passed_values[chunk_index] = array_ops.identity(
+              chunks_by_dev[pred_dev][chunk_index])
+    for d in range(0, num_devices):
+      for s in range(0, num_subchunks):
+        rank = rank_by_s_d[s][d]
+        seg_index = (rank + num_devices - (1 + tick)) % num_devices
+        chunk_index = (seg_index * num_subchunks) + s
+        chunks_by_dev[d][chunk_index] = passed_values[chunk_index]
+  # Join chunks at each device.
+  output = []
+  for x in chunks_by_dev:
+    with ops.colocate_with(x[0]):
+      output.append(array_ops.concat(x, 0))
+  return output
+
+
+def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None):
+  """Construct a subgraph for recursive halving-doubling all-reduce.
+
+  The recursive halving-doubling algorithm is described in
+  http://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf
+
+  The concept is to arrange the participating n devices in
+  a linear sequence where devices exchange data pairwise
+  with one other device in each round.  During the gather
+  phase there are lg(n) rounds where devices exchange
+  increasingly smaller sub-tensors with another device
+  at increasingly greater distances, until at the top
+  each device has 1/n of the fully reduced values.  During the
+  scatter phase each device exchanges its fully reduced
+  sub-tensor (which doubles in length at each round)
+  with one other device at increasingly smaller distances
+  until each device has all of the fully reduced values.
+
+  Note: this preliminary version requires that len(input_tensors) be a
+    power of 2.  TODO(tucker): relax this restriction.  Also, the
+    number of elements in each tensor must be divisible by 2^h where h
+    is the number of hops in each phase.  This will also be relaxed in
+    the future with edge-case specific logic.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
+    red_op: a binary elementwise reduction Op.
+    un_op: an optional unary elementwise Op to apply to reduced values.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors, one
+    at each device of input_tensors.
+
+  Raises:
+    ValueError: num_devices not a power of 2, or tensor len not divisible
+    by 2 the proper number of times.
+  """
+  devices = [t.device for t in input_tensors]
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  reduced_shards = _build_recursive_hd_gather(input_tensors, devices, red_op)
+  if un_op:
+    reduced_shards = [un_op(t) for t in reduced_shards]
+  output_tensors = _build_recursive_hd_scatter(reduced_shards, devices)
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _build_recursive_hd_gather(input_tensors, devices, red_op):
+  """Construct the gather phase of recursive halving-doubling all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
+    devices: a list of strings naming the devices hosting input_tensors,
+      which will also be used to host the (partial) reduction values.
+    red_op: a binary elementwise reduction Op.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensor shards.
+
+  Raises:
+    ValueError: num_devices not a power of 2, or tensor len not divisible
+    by 2 the proper number of times.
+  """
+  num_devices = len(devices)
+  num_hops = int(math.log(num_devices, 2))
+  if num_devices != (2 ** num_hops):
+    raise ValueError("num_devices must be a power of 2")
+  chunks = input_tensors
+  for h in range(0, num_hops):
+    span = 2 ** h
+    group_size = span * 2
+    new_chunks = [[] for _ in devices]
+    for d in range(0, num_devices):
+      if (d % group_size) >= (group_size / 2):
+        # skip right half of a pair
+        continue
+      left_dev = devices[d]
+      right_dev = devices[d + span]
+      left_split = array_ops.split(chunks[d], 2)
+      right_split = array_ops.split(chunks[d+span], 2)
+      with ops.device(left_dev):
+        new_chunks[d] = red_op(left_split[0], right_split[0])
+      with ops.device(right_dev):
+        new_chunks[d + span] = red_op(left_split[1], right_split[1])
+    chunks = new_chunks
+  return chunks
+
+
+def _build_recursive_hd_scatter(input_tensors, devices):
+  """Construct the scatter phase of recursive halving-doublng all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` that are fully-reduced shards.
+    devices: a list of strings naming the devices on which the reconstituted
+      full tensors should be placed.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors.
+  """
+  num_devices = len(devices)
+  num_hops = int(math.log(num_devices, 2))
+  assert num_devices == (2 ** num_hops), "num_devices must be a power of 2"
+  chunks = input_tensors
+  for h in reversed(range(0, num_hops)):
+    span = 2 ** h
+    group_size = span * 2
+    new_chunks = [[] for _ in devices]
+    for d in range(0, num_devices):
+      if (d % group_size) >= (group_size / 2):
+        # skip right half of a pair
+        continue
+      left_idx = d
+      right_idx = d + span
+      left_dev = devices[left_idx]
+      right_dev = devices[right_idx]
+      with ops.device(left_dev):
+        new_chunks[left_idx] = array_ops.concat([chunks[left_idx],
+                                                 chunks[right_idx]], 0)
+      with ops.device(right_dev):
+        new_chunks[right_idx] = array_ops.concat([chunks[left_idx],
+                                                  chunks[right_idx]], 0)
+    chunks = new_chunks
+  return chunks
+
+
+def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None):
+  """Construct a subgraph for shuffle all-reduce.
+
+  Shuffle reduce is essentially the algorithm implemented when using
+  parameter servers.  Suppose tensor length is n, there are d devices
+  and g gather shards.  Each device sends a n/g length sub-tensor to
+  each gather shard.  The gather shards perform a reduction across d
+  fragments, then broadcast the result back to each device.  The
+  devices then join the g fully reduced fragments they receive from
+  the shards.  The gather shards could perform d-1 pairwise
+  reductions, or one d-way reduction.  The first is better where
+  reduction Op time is low compared to transmission time, the second
+  better in the other case.
+
+  Args:
+    input_tensors: list of T @(tf.Tensor} values to be reduced.
+    gather_devices: list of names of devices on which reduction shards
+      should be placed.
+    red_op: an n-array elementwise reduction Op
+    un_op: optional elementwise unary Op to be applied to fully-reduced values.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors.
+  """
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  dst_devices = [t.device for t in input_tensors]
+  reduced_shards = _build_shuffle_gather(input_tensors, gather_devices,
+                                         red_op, un_op)
+  output_tensors = _build_shuffle_scatter(reduced_shards, dst_devices)
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None):
+  """Construct the gather (concentrate and reduce) phase of shuffle all-reduce.
+
+  Args:
+    input_tensors: list of T @(tf.Tensor} values to be reduced.
+    gather_devices: list of names of devices on which reduction shards
+      should be placed.
+    red_op: the binary reduction Op
+    un_op: optional elementwise unary Op to be applied to fully-reduced values.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced shards.
+
+  Raises:
+    ValueError: inputs not well-formed.
+  """
+  num_source_devices = len(input_tensors)
+  num_gather_devices = len(gather_devices)
+  shape = input_tensors[0].shape
+  if len(shape) != 1:
+    raise ValueError("input_tensors must be 1D")
+  shards_by_source = []
+  for d in range(0, num_source_devices):
+    with ops.colocate_with(input_tensors[d]):
+      shards_by_source.append(
+          _ragged_split(input_tensors[d], num_gather_devices))
+  reduced_shards = []
+  for d in range(0, num_gather_devices):
+    with ops.device(gather_devices[d]):
+      values = [s[d] for s in shards_by_source]
+      red_shard = red_op(values)
+      if un_op:
+        red_shard = un_op(red_shard)
+      reduced_shards.append(red_shard)
+  return reduced_shards
+
+
+def _build_shuffle_scatter(reduced_shards, dst_devices):
+  """Build the scatter phase of shuffle all-reduce.
+
+  Args:
+    reduced_shards:  list of T @(tf.Tensor} fully reduced shards
+    dst_devices: list of names of devices at which the fully-reduced value
+      should be reconstituted.
+
+  Returns:
+    list of T `tf.Tensor` scattered tensors.
+  """
+  num_devices = len(dst_devices)
+  out_tensors = []
+  for d in range(0, num_devices):
+    with ops.device(dst_devices[d]):
+      out_tensors.append(array_ops.concat(reduced_shards, 0))
+  return out_tensors
+
+
+def _split_by_task(devices, values):
+  """Partition devices and values by common task.
+
+  Args:
+    devices: list of device name strings
+    values: list of T `tf.tensor` of same length as devices.
+
+  Returns:
+    (per_task_devices, per_task_values) where both values are
+    lists of lists with isomorphic structure: the outer list is
+    indexed by task, and the inner list has length of the number
+    of values belonging to that task.  per_task_devices contains
+    the specific devices to which the values are local, and
+    per_task_values contains the corresponding values.
+
+  Raises:
+    ValueError: devices must be same length as values.
+  """
+  num_devices = len(devices)
+  if num_devices != len(values):
+    raise ValueError("len(devices) must equal len(values)")
+  per_task_devices = collections.OrderedDict()
+  per_task_values = collections.OrderedDict()
+  for d in range(num_devices):
+    d_spec = device_lib.DeviceSpec.from_string(devices[d])
+    if not hasattr(d_spec, "task") or d_spec.task is None:
+      assert False, "failed to parse device %s" % devices[d]
+    index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task)
+    if index not in per_task_devices:
+      per_task_devices[index] = []
+      per_task_values[index] = []
+    per_task_devices[index].append(devices[d])
+    per_task_values[index].append(values[d])
+
+  return (list(per_task_devices.values()), list(per_task_values.values()))
+
+
+def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
+  """Build a subgraph that does one full all-reduce, using NCCL.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
+      be reduced.
+    red_op: binary elementwise reduction operator.  Must be one of
+      {tf.add}
+    un_op: optional unary elementwise Op to apply to fully-reduce values.
+
+  Returns:
+    list of T `tf.Tensor` of reduced values.
+
+  Raises:
+    ValueError: red_op not supported.
+  """
+  if red_op == math_ops.add:
+    output_tensors = nccl_ops.all_sum(input_tensors)
+  else:
+    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
+  if un_op:
+    un_op_wrapped = []
+    for t in output_tensors:
+      with ops.colocate_with(t):
+        un_op_wrapped.append(un_op(t))
+    output_tensors = un_op_wrapped
+  return output_tensors
+
+
+def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
+  """Construct a subgraph for NCCL hybrid all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
+      be reduced.
+    red_op: binary elementwise reduction operator.
+    upper_level_f: function for reducing one value per worker, across
+      workers.
+
+  Returns:
+    list of T `tf.Tensor` of reduced values.
+
+  Raises:
+    ValueError: inputs not well-formed.
+  """
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  devices = [t.device for t in input_tensors]
+  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
+  num_workers = len(per_worker_devices)
+  up_values = [None for w in range(0, num_workers)]
+  up_devices = up_values[:]
+  down_values = up_values[:]
+  # First stage: reduce within each worker using NCCL
+  for w in range(0, num_workers):
+    worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
+    # NOTE: these reductions will not run to completion unless
+    # every output value is used.  Since we only need one, we
+    # need to put control dependencies on the rest.
+    with ops.control_dependencies(worker_values):
+      with ops.device(worker_values[0].device):
+        up_values[w] = array_ops.identity(worker_values[0])
+      up_devices[w] = per_worker_devices[w][0]
+  # Second stage: Apply upper_level_f to reduce across first device at
+  # each worker
+  level_2_output = upper_level_f(up_values)
+  # Third stage: propagate within each worker using NCCL Broadcast
+  for w in range(0, num_workers):
+    dst_tensors = []
+    with ops.device(per_worker_devices[w][0]):
+      broadcast_src = nccl_ops.broadcast(array_ops.identity(level_2_output[w]))
+    for d in per_worker_devices[w]:
+      with ops.device(d):
+        dst_tensors.append(array_ops.identity(broadcast_src))
+    down_values[w] = dst_tensors
+  output_tensors = [v for sublist in down_values for v in sublist]
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _reduce_non_singleton(input_tensors, red_f, un_op):
+  """If len(input_tensors) > 1, apply red_f, else apply un_op."""
+  if len(input_tensors) > 1:
+    return red_f(input_tensors)
+  else:
+    if not un_op:
+      return input_tensors
+    output_tensors = []
+    for t in input_tensors:
+      with ops.colocate_with(t):
+        output_tensors.append(un_op(t))
+    return output_tensors
+
+
+def build_nccl_then_ring(input_tensors, subdiv, red_op, un_op=None):
+  """Construct hybrid of NCCL within workers, Ring across workers."""
+  def upper_builder(y):
+    return build_ring_all_reduce(y, len(y), subdiv, [0], red_op, un_op)
+  def upper_level_f(x):
+    return _reduce_non_singleton(x, upper_builder, un_op)
+  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
+
+
+def build_nccl_then_recursive_hd(input_tensors, red_op, un_op=None):
+  """Construct hybrid of NCCL within workers, Recursive-HD across workers."""
+  upper_level_f = lambda x: build_recursive_hd_all_reduce(x, red_op, un_op)
+  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
+
+
+def build_nccl_then_shuffle(input_tensors, gather_devices, nccl_red_op,
+                            shuffle_red_op, un_op=None):
+  """Construct hybrid of NCCL within workers, Shuffle across workers."""
+  def upper_level_f(x):
+    return build_shuffle_all_reduce(x, gather_devices, shuffle_red_op, un_op)
+
+  return _build_nccl_hybrid(input_tensors, nccl_red_op, upper_level_f)
+
+
+def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f):
+  """Construct a subgraph for Shuffle hybrid all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
+      be reduced.
+    gather_devices: list of device names on which to host gather shards.
+    red_op: binary elementwise reduction operator.
+    upper_level_f: function for reducing one value per worker, across
+      workers.
+
+  Returns:
+    list of T `tf.Tensor` of reduced values.
+
+  Raises:
+    ValueError: inputs not well-formed.
+  """
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  # First stage, reduce across each worker using gather_devices.
+  devices = [t.device for t in input_tensors]
+  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
+  num_workers = len(per_worker_devices)
+  up_values = []
+  if len(gather_devices) != num_workers:
+    raise ValueError("For shuffle hybrid, gather_devices must contain one "
+                     "device per worker. ")
+  for w in range(0, num_workers):
+    reduced_shards = _build_shuffle_gather(
+        per_worker_values[w], [gather_devices[w]], red_op)
+    up_values.append(reduced_shards[0])
+  # Second stage, apply upper_level_f.
+  level_2_output = upper_level_f(up_values)
+  # Third stage, apply shuffle scatter at each worker.
+  output_tensors = []
+  for w in range(0, num_workers):
+    output_tensors += _build_shuffle_scatter(
+        [level_2_output[w]], per_worker_devices[w])
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def build_shuffle_then_ring(input_tensors, gather_devices, subdiv,
+                            red_n_op, red_op, un_op=None):
+  """Construct hybrid of Shuffle within workers, Ring across workers."""
+  def upper_builder(tensors):
+    return build_ring_all_reduce(tensors, len(tensors), subdiv, [0],
+                                 red_op, un_op)
+  def upper_level_f(tensors):
+    return _reduce_non_singleton(tensors, upper_builder, un_op)
+  return _build_shuffle_hybrid(
+      input_tensors, gather_devices, red_n_op, upper_level_f)
+
+
+def build_shuffle_then_shuffle(input_tensors, first_gather_devices,
+                               second_gather_devices, red_op, un_op=None):
+  """Construct hybrid of Shuffle within workers, Shuffle across workers."""
+  def upper_builder(tensors):
+    return build_shuffle_all_reduce(tensors, second_gather_devices,
+                                    red_op, un_op)
+  def upper_level_f(tensors):
+    return _reduce_non_singleton(tensors, upper_builder, un_op)
+  return _build_shuffle_hybrid(
+      input_tensors, first_gather_devices, red_op, upper_level_f)
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce_test.py b/tensorflow/python/distribute/all_reduce_test.py
similarity index 95%
rename from tensorflow/contrib/all_reduce/python/all_reduce_test.py
rename to tensorflow/python/distribute/all_reduce_test.py
index 9a8f62b9866bf0ac873ac299c963e2c3fc75b577..2c6b853124cf838d99da0628d8a610b74429e014 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce_test.py
+++ b/tensorflow/python/distribute/all_reduce_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tensorflow.contrib.all_reduce.python..all_reduce."""
+"""Tests for all_reduce."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,10 +22,11 @@ import time
 
 import numpy as np
 
-from tensorflow.contrib.all_reduce.python import all_reduce as ar
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.distribute import all_reduce as ar
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -36,6 +37,7 @@ from tensorflow.python.platform import tf_logging
 
 class AllReduceTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testFlattenTensorsShapesDefined(self):
     x = array_ops.placeholder(types_pb2.DT_FLOAT, [None])
     with self.assertRaisesRegexp(ValueError,
@@ -99,6 +101,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
           input_tensors.append(array_ops.identity(t8))
     return input_tensors, device_names
 
+  @test_util.run_deprecated_v1
   def testBuildRingGatherPassStructure(self):
     # 1 worker, 1 device
     input_tensors, device_names = self._buildInput(1, 1)
@@ -116,7 +119,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
     # same number outputs as inputs
     self.assertEqual(len(output_tensors), len(input_tensors))
     num_chunks = 2 * len(input_tensors)
-    tlen = input_tensors[0].shape[0].value
+    tlen = tensor_shape.dimension_value(input_tensors[0].shape[0])
     for otl in output_tensors:
       self.assertEqual(len(otl), num_chunks)
       for ot in otl:
@@ -158,7 +161,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
       output_tensors = build_f(input_tensors, un_op)
       sum_reduced = math_ops.add_n(output_tensors)
       sum_reduced.op.run()
-      self.assertAllClose(sum_reduced.eval(), simple_sum.eval())
+      self.assertAllClose(sum_reduced.eval(), self.evaluate(simple_sum))
 
   def _testRingAllReduce(self, num_workers, num_gpus, shape, subdiv):
     start_time = time.time()
@@ -169,6 +172,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
                     "subdiv=%d elapsed=%f" %
                     (num_workers, num_gpus, shape, subdiv, elapsed))
 
+  @test_util.run_deprecated_v1
   def testRingAllReduce(self):
     self._testRingAllReduce(1, 2, [], 1)
     self._testRingAllReduce(1, 2, [8], 1)
@@ -198,6 +202,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
     tf_logging.info("ShuffleAllReduce num_workers=%d num_gpus=%d shape=%s "
                     "elapsed=%f" % (num_workers, num_gpus, shape, elapsed))
 
+  @test_util.run_deprecated_v1
   def testShuffleAllReduce(self):
     self._testShuffleAllReduce(1, 2, [], 1)
     self._testShuffleAllReduce(1, 2, [8], 1)
@@ -224,6 +229,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
                     "shape=%s elapsed=%f" %
                     (num_workers, num_gpus, shape, elapsed))
 
+  @test_util.run_deprecated_v1
   def testRecursiveHDAllReduce(self):
     self._testRecursiveHDAllReduce(1, 2, [8])
     self._testRecursiveHDAllReduce(1, 2, [4, 4])
diff --git a/tensorflow/python/distribute/cluster_resolver/BUILD b/tensorflow/python/distribute/cluster_resolver/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..360a2993cd9e01c59551a5a8177c8bec03133c45
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/BUILD
@@ -0,0 +1,180 @@
+# Description: Operations defined for Cluster Resolvers
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+package(
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+py_library(
+    name = "cluster_resolver_lib",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":base_cluster_resolver_py",
+        ":gce_cluster_resolver_py",
+        ":kubernetes_cluster_resolver_py",
+        ":slurm_cluster_resolver_py",
+        ":tfconfig_cluster_resolver_py",
+        ":tpu_cluster_resolver_py",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "base_cluster_resolver_py",
+    srcs = ["cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "gce_cluster_resolver_py",
+    srcs = ["gce_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "tfconfig_cluster_resolver_py",
+    srcs = ["tfconfig_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "tpu_cluster_resolver_py",
+    srcs = ["tpu_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "slurm_cluster_resolver_py",
+    srcs = ["slurm_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "kubernetes_cluster_resolver_py",
+    srcs = ["kubernetes_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+tf_py_test(
+    name = "base_cluster_resolver_py_test",
+    srcs = ["cluster_resolver_test.py"],
+    additional_deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    main = "cluster_resolver_test.py",
+)
+
+tf_py_test(
+    name = "gce_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["gce_cluster_resolver_test.py"],
+    additional_deps = [
+        ":gce_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    main = "gce_cluster_resolver_test.py",
+)
+
+tf_py_test(
+    name = "tfconfig_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["tfconfig_cluster_resolver_test.py"],
+    additional_deps = [
+        ":tfconfig_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    grpc_enabled = True,
+    main = "tfconfig_cluster_resolver_test.py",
+)
+
+tf_py_test(
+    name = "tpu_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["tpu_cluster_resolver_test.py"],
+    additional_deps = [
+        ":tpu_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    grpc_enabled = True,
+    main = "tpu_cluster_resolver_test.py",
+)
+
+tf_py_test(
+    name = "slurm_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["slurm_cluster_resolver_test.py"],
+    additional_deps = [
+        ":slurm_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    main = "slurm_cluster_resolver_test.py",
+    tags = [],
+)
+
+tf_py_test(
+    name = "kubernetes_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["kubernetes_cluster_resolver_test.py"],
+    additional_deps = [
+        ":kubernetes_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    main = "kubernetes_cluster_resolver_test.py",
+)
diff --git a/tensorflow/contrib/cluster_resolver/README.md b/tensorflow/python/distribute/cluster_resolver/README.md
similarity index 100%
rename from tensorflow/contrib/cluster_resolver/README.md
rename to tensorflow/python/distribute/cluster_resolver/README.md
diff --git a/tensorflow/python/distribute/cluster_resolver/README.slurm b/tensorflow/python/distribute/cluster_resolver/README.slurm
new file mode 100644
index 0000000000000000000000000000000000000000..3a7675f250d62cf380125fece8f27a363a978cfe
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/README.slurm
@@ -0,0 +1,50 @@
+# Slurm Cluster Resolver
+
+The Slurm Cluster Resolver resolves cluster specification for distribution TensorFlow work launched on HPC system running on Slurm. This implementation is able to handle homogeneous task allocation on computing nodes with default task distribution plane. The resolution is done by determining job configuration through a number of Slurm output variables and user input. The resolver requires the specification of total number of tasks launched, process ID/rank of the running process, number of tasks launched per node, number of GPUs present on each node and the number of GPUs to allocate for each task.
+
+The process ID/rank is extracted from environment variable ```SLURM_PROCID``` and the total number of tasks launched is extract from ```SLURM_NTASKS```. The number of tasks per node is extracted from ```SLURM_NTASKS_PER_NODE```, unless a value is specified by user. The number of GPUs present on each node and number of GPUs for each task have to be specified by the user. A base port can be specified by user and in case there are more than one task launched per node the port number will be incremented for each additional tasks on that node. The hostnames are resolved by running command ```scontrol show hostname``` through a subprocess and a list of hostnames will be returned. The distribution of rank/process ID by default follows that order. By default allocated GPUs will be automatically exposed to processes according to specification by setting ```CUDA_VISIBLE_DEVICE```.
+
+## Example
+- Slurm allocation in shell  ```salloc --nodes=2 -t 01:30:00 -A <project ID> --ntasks-per-node=2 --gres=gpu:k80:2 --exclusive```
+- Creating cluster in Python
+```
+cluster_resolver = tf.contrib.cluster_resolver.SlurmClusterResolver(
+    {'ps': 1, 'worker': 3},
+    port_base=8888,
+    tasks_per_node=2,
+    gpus_per_node=2,
+    gpus_per_task=1,
+    auto_set_gpu=True)
+
+cluster = cluster_resolver.cluster_spec()
+job_name, task_index = cluster_resolver.get_task_info()
+```
+The above example resolves a cluster specification for a Slurm job allocation with two computing nodes each having two GPUs and two tasks will be launched on each node. The jobs are specified in form of a dictionary where the key is a string representing job name and value is an integer that specifies the number of tasks in that job. ```cluster_resolver.cluster_spec()``` will return a cluster specificaiton object and the cluster specification will have the following specification as protobuf.
+
+```
+job {
+  name: "ps"
+  tasks {
+    value: "t02n13:8888"
+  }
+}
+job {
+  name: "worker"
+  tasks {
+    value: "t02n13:8889"
+  }
+  tasks {
+    key: 1
+    value: "t02n41:8888"
+  }
+  tasks {
+    key: 2
+    value: "t02n41:8889"
+  }
+}
+```
+
+After calling ```cluster_resolver.cluster_spec()``` internal data structions of the resolver will be populated. By looking at the process ID/rank and comparing with cluster specification the task can 'realize' which task it belongs to. This can be retrieved by calling ```cluster_resolver.get_task_info()``` and a string specifying job name and an integer specifying the task index will be returned.
+
+GPUs will be automatically allocated to the processes. For example in the above example ```
+t02n41:8888``` will see GPU 0 and ```t02n41:8889``` will see GPU 1.
diff --git a/tensorflow/python/distribute/cluster_resolver/__init__.py b/tensorflow/python/distribute/cluster_resolver/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef87f59b7fd7ef1774ed97370c75e16f3ec4e295
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/__init__.py
@@ -0,0 +1,57 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library Imports for Cluster Resolvers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute.cluster_resolver import cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import gce_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import kubernetes_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import slurm_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import tfconfig_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
+from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'cluster_resolver',
+    'gce_cluster_resolver',
+    'kubernetes_cluster_resolver',
+    'slurm_cluster_resolver',
+    'tfconfig_cluster_resolver',
+    'tpu_cluster_resolver',
+    'ClusterResolver',
+    'SimpleClusterResolver',
+    'UnionClusterResolver',
+    'GceClusterResolver',
+    'KubernetesClusterResolver',
+    'TFConfigClusterResolver',
+    'TPUClusterResolver',
+    'SlurmClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
+
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca40e60a557d8fb1a5db8565369d1d1ae7e0c136
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
@@ -0,0 +1,403 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Cluster Resolvers are used for dynamic cluster IP/hostname resolution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python.training.server_lib import ClusterSpec
+
+
+def format_master_url(master, rpc_layer=None):
+  if rpc_layer:
+    return '%s://%s' % (rpc_layer, master)
+  else:
+    return master
+
+
+@six.add_metaclass(abc.ABCMeta)
+class ClusterResolver(object):
+  """Abstract class for all implementations of ClusterResolvers.
+
+  This defines the skeleton for all implementations of ClusterResolvers.
+  ClusterResolvers are a way for TensorFlow to communicate with various cluster
+  management systems (e.g. GCE, AWS, etc...).
+
+  By letting TensorFlow communicate with these systems, we will be able to
+  automatically discover and resolve IP addresses for various TensorFlow
+  workers. This will eventually allow us to automatically recover from
+  underlying machine failures and scale TensorFlow worker clusters up and down.
+
+  Note to Implementors: In addition to these abstract methods, you must also
+  implement the task_type, task_index, and rpc_layer attributes. You may choose
+  to implement them either as properties with getters or setters or directly
+  set the attributes.
+
+  - task_type is the name of the server's current named job (e.g. 'worker',
+     'ps' in a distributed parameterized training job).
+  - task_index is the ordinal index of the server within the task type.
+  - rpc_layer is the protocol used by TensorFlow to communicate with other
+      TensorFlow servers in a distributed environment.
+  """
+
+  @abc.abstractmethod
+  def cluster_spec(self):
+    """Retrieve the current state of the cluster and returns a ClusterSpec.
+
+    Returns:
+      A ClusterSpec representing the state of the cluster at the moment this
+      function is called.
+
+    Implementors of this function must take care in ensuring that the
+    ClusterSpec returned is up-to-date at the time of calling this function.
+    This usually means retrieving the information from the underlying cluster
+    management system every time this function is invoked and reconstructing
+    a cluster_spec, rather than attempting to cache anything.
+    """
+    raise NotImplementedError()
+
+  @abc.abstractmethod
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Retrieves the name or URL of the session master.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the master.
+      task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC protocol for the given cluster.
+
+    Returns:
+      The name or URL of the session master.
+
+    Implementors of this function must take care in ensuring that the master
+    returned is up-to-date at the time to calling this function. This usually
+    means retrieving the master every time this function is invoked.
+    """
+    raise NotImplementedError()
+
+  @abc.abstractmethod
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    """Returns the number of accelerator cores per worker.
+
+    This returns the number of accelerator cores (such as GPUs and TPUs)
+    available per worker. If workers only has CPU cores available, then this
+    should return 0. This method will query the master for this information
+    if it is not otherwise known.
+
+    Optionally, we allow callers to specify the task_type, task_index, and
+    rpc_layer, if they want to target a specific TensorFlow process to query
+    the number of accelerators. This is to support heterogenous environments,
+    where the number of accelerators cores per host is different.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the machine we
+        want to query.
+      task_index: (Optional) The index of the TensorFlow task of the machine we
+        want to query.
+      accelerator_type: (Optional) The type of accelerator we are trying to
+        query (defaults to 'GPU').
+      config_proto: (Optional) Configuration for starting a new session to
+        query how many accelerator cores it has.
+    """
+    raise NotImplementedError()
+
+  @abc.abstractproperty
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in."""
+    raise NotImplementedError()
+
+
+class SimpleClusterResolver(ClusterResolver):
+  """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
+
+  def __init__(self, cluster_spec, master='', task_type=None, task_index=None,
+               environment='', num_accelerators=0,
+               rpc_layer=None):
+    """Creates a SimpleClusterResolver from a ClusterSpec."""
+    super(SimpleClusterResolver, self).__init__()
+
+    self._task_type = task_type
+    self._task_index = task_index
+    self._environment = environment
+    self._num_accelerators = num_accelerators
+    self._rpc_layer = rpc_layer
+
+    if not isinstance(cluster_spec, ClusterSpec):
+      raise TypeError('cluster_spec must be a ClusterSpec.')
+    self._cluster_spec = cluster_spec
+
+    if not isinstance(master, str):
+      raise TypeError('master must be a string.')
+    self._master = master
+
+  def cluster_spec(self):
+    """Returns the ClusterSpec passed into the constructor."""
+    return self._cluster_spec
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master address to use when creating a session.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the master.
+      task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC used by distributed TensorFlow.
+
+    Returns:
+      The name or URL of the session master.
+
+    If a task_type and task_index is given, this will override the `master`
+    string passed into the initialization function.
+    """
+    if task_type is not None and task_index is not None:
+      master = self.cluster_spec().task_address(task_type, task_index)
+    else:
+      master = self._master
+
+    return format_master_url(master, rpc_layer=rpc_layer or self._rpc_layer)
+
+  @property
+  def task_type(self):
+    return self._task_type
+
+  @property
+  def task_index(self):
+    return self._task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    self._task_type = task_type
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    return self._environment
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    """Returns the number of accelerator cores per worker.
+
+    The SimpleClusterResolver does not do automatic detection of accelerators,
+    so a TensorFlow session will never be created, and thus all arguments are
+    unused and we simply return whatever was passed in when this object was
+    initialized.
+
+    Args:
+      task_type: Unused.
+      task_index: Unused.
+      accelerator_type: Unused.
+      config_proto: Unused.
+    """
+    # Unused
+    del task_type, task_index, accelerator_type, config_proto
+    return self._num_accelerators
+
+  @property
+  def rpc_layer(self):
+    return self._rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
+
+
+class UnionClusterResolver(ClusterResolver):
+  """Performs a union on underlying ClusterResolvers.
+
+  This class performs a union given two or more existing ClusterResolvers. It
+  merges the underlying ClusterResolvers, and returns one unified ClusterSpec
+  when cluster_spec is called. The details of the merge function is
+  documented in the cluster_spec function.
+
+  For additional Cluster Resolver properties such as task type, task index,
+  rpc layer, environment, etc..., we will return the value from the first
+  ClusterResolver in the union.
+  """
+
+  def __init__(self, *args, **kwargs):
+    """Initializes a UnionClusterResolver with other ClusterResolvers.
+
+    Args:
+      *args: `ClusterResolver` objects to be unionized.
+      **kwargs:
+        rpc_layer - (Optional) Override value for the RPC layer used by
+          TensorFlow.
+        task_type - (Optional) Override value for the current task type.
+        task_index - (Optional) Override value for the current task index.
+
+    Raises:
+      TypeError: If any argument is not a subclass of `ClusterResolvers`.
+      ValueError: If there are no arguments passed.
+    """
+    super(UnionClusterResolver, self).__init__()
+
+    self._rpc_layer = kwargs.pop('rpc_layer', None)
+    self._task_type = kwargs.pop('task_type', None)
+    self._task_index = kwargs.pop('task_index', None)
+
+    if kwargs:
+      raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs))
+
+    if not args:
+      raise ValueError('At least one ClusterResolver is required.')
+
+    for cluster_resolver in args:
+      if not isinstance(cluster_resolver, ClusterResolver):
+        raise TypeError('All arguments must be a sub-class of '
+                        '`ClusterResolver.`')
+    self._cluster_resolvers = args
+
+  def cluster_spec(self):
+    """Returns a union of all the ClusterSpecs from the ClusterResolvers.
+
+    Returns:
+      A ClusterSpec containing host information merged from all the underlying
+      ClusterResolvers.
+
+    Raises:
+      KeyError: If there are conflicting keys detected when merging two or
+      more dictionaries, this exception is raised.
+
+    Note: If there are multiple ClusterResolvers exposing ClusterSpecs with the
+    same job name, we will merge the list/dict of workers.
+
+    If *all* underlying ClusterSpecs expose the set of workers as lists, we will
+    concatenate the lists of workers, starting with the list of workers from
+    the first ClusterResolver passed into the constructor.
+
+    If *any* of the ClusterSpecs expose the set of workers as a dict, we will
+    treat all the sets of workers as dicts (even if they are returned as lists)
+    and will only merge them into a dict if there is no conflicting keys. If
+    there is a conflicting key, we will raise a `KeyError`.
+    """
+
+    merged_cluster = {}
+
+    # We figure out whether it is all lists for a particular job, or whether
+    # there are dicts inside.
+    for cluster_resolver in self._cluster_resolvers:
+      cluster_spec = cluster_resolver.cluster_spec()
+      cluster_dict = cluster_spec.as_dict()
+
+      for job_name, tasks in cluster_dict.items():
+        if job_name in merged_cluster:
+          # If we see a dict, then we write a dict out regardless.
+          if isinstance(tasks, dict):
+            merged_cluster[job_name] = {}
+        else:
+          # We take whichever type is present.
+          if isinstance(tasks, list):
+            merged_cluster[job_name] = []
+          else:
+            merged_cluster[job_name] = {}
+
+    # We then do the merge as appropriate in merged_cluster[job].
+    for cluster_resolver in self._cluster_resolvers:
+      cluster_spec = cluster_resolver.cluster_spec()
+      cluster_dict = cluster_spec.as_dict()
+
+      for job_name, tasks in cluster_dict.items():
+        if isinstance(merged_cluster[job_name], list):
+          # We all have lists, we can just concatenate and be done.
+          merged_cluster[job_name].extend(tasks)
+        else:
+          if isinstance(tasks, list):
+            # We convert to a dictionary if the type is a list.
+            task_dict = dict(zip(range(0, len(tasks)), tasks))
+          else:
+            # We can simply make a copy (for update) and be done.
+            task_dict = tasks.copy()
+
+          # We detect if there are duplicates, and raise an error if so.
+          task_keys = set(task_dict)
+          merged_keys = set(merged_cluster[job_name].keys())
+          intersected_keys = task_keys.intersection(merged_keys)
+          if intersected_keys:
+            raise KeyError('Duplicate keys detected when merging two '
+                           'ClusterSpecs: %s' % repr(intersected_keys))
+
+          # We do the merge after all the processing.
+          merged_cluster[job_name].update(task_dict)
+
+    return ClusterSpec(merged_cluster)
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master address to use when creating a session.
+
+    This usually returns the master from the first ClusterResolver passed in,
+    but you can override this by specifying the task_type and task_index.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the master.
+      task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC protocol for the given cluster.
+
+    Returns:
+      The name or URL of the session master.
+    """
+    if task_type is not None and task_index is not None:
+      master = self.cluster_spec().task_address(task_type, task_index)
+      return format_master_url(master, rpc_layer or self._rpc_layer)
+
+    return self._cluster_resolvers[0].master(rpc_layer=rpc_layer)
+
+  @property
+  def task_type(self):
+    return self._task_type or self._cluster_resolvers[0].task_type
+
+  @property
+  def task_index(self):
+    return self._task_index or self._cluster_resolvers[0].task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    self._task_type = task_type
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    return self._cluster_resolvers[0].environment
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    return self._cluster_resolvers[0].num_accelerators(
+        task_type, task_index, accelerator_type, config_proto)
+
+  @property
+  def rpc_layer(self):
+    return self._rpc_layer or self._cluster_resolvers[0].rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f7b46972746f46ee866a5891ed2ca9ef0722a0c
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
@@ -0,0 +1,369 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Cluster Resolvers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+class UnionClusterResolverTest(test.TestCase):
+  # TODO(frankchn): Transform to parameterized test after it is included in the
+  # TF open source codebase.
+
+  def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
+    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto, server_lib.ClusterSpec(cluster_spec).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_cluster_def()).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_dict()).as_cluster_def())
+
+  def testSingleClusterResolver(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    simple_resolver = SimpleClusterResolver(base_cluster_spec)
+    union_resolver = UnionClusterResolver(simple_resolver)
+
+    expected_proto = """
+    job { name: 'ps' tasks { key: 0 value: 'ps0:2222' }
+                     tasks { key: 1 value: 'ps1:2222' } }
+    job { name: 'worker' tasks { key: 0 value: 'worker0:2222' }
+                         tasks { key: 1 value: 'worker1:2222' }
+                         tasks { key: 2 value: 'worker2:2222' } }
+    """
+    actual_cluster_spec = union_resolver.cluster_spec()
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+  def testInitSimpleClusterResolver(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+
+    simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
+                                            task_index=1, environment="cloud",
+                                            num_accelerators=8,
+                                            rpc_layer="grpc")
+
+    self.assertEqual(simple_resolver.task_type, "ps")
+    self.assertEqual(simple_resolver.task_index, 1)
+    self.assertEqual(simple_resolver.environment, "cloud")
+    self.assertEqual(simple_resolver.num_accelerators(), 8)
+    self.assertEqual(simple_resolver.rpc_layer, "grpc")
+
+  def testOverrideSimpleClusterResolver(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+
+    simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
+                                            task_index=1, environment="cloud",
+                                            num_accelerators=8,
+                                            rpc_layer="grpc")
+
+    simple_resolver.task_type = "worker"
+    simple_resolver.task_index = 2
+    simple_resolver.rpc_layer = "http"
+
+    self.assertEqual(simple_resolver.task_type, "worker")
+    self.assertEqual(simple_resolver.task_index, 2)
+    self.assertEqual(simple_resolver.rpc_layer, "http")
+
+  def testSimpleOverrideMasterWithTaskIndexZero(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+
+    simple_resolver = SimpleClusterResolver(base_cluster_spec)
+    actual_master = simple_resolver.master("worker", 0, rpc_layer="grpc")
+    self.assertEqual(actual_master, "grpc://worker0:2222")
+
+  def testSimpleOverrideMasterWithRpcLayer(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+
+    simple_resolver = SimpleClusterResolver(base_cluster_spec)
+    actual_master = simple_resolver.master("worker", 2, rpc_layer="grpc")
+    self.assertEqual(actual_master, "grpc://worker2:2222")
+
+  def testSimpleOverrideMaster(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+
+    simple_resolver = SimpleClusterResolver(base_cluster_spec)
+    actual_master = simple_resolver.master("worker", 2)
+    self.assertEqual(actual_master, "worker2:2222")
+
+  def testUnionClusterResolverGetProperties(self):
+    cluster_spec_1 = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    resolver1 = SimpleClusterResolver(cluster_spec_1, task_type="ps",
+                                      task_index=1, environment="cloud",
+                                      num_accelerators=8,
+                                      rpc_layer="grpc")
+
+    cluster_spec_2 = server_lib.ClusterSpec({
+        "ps": ["ps2:2222", "ps3:2222"],
+        "worker": ["worker3:2222", "worker4:2222", "worker5:2222"]
+    })
+    resolver2 = SimpleClusterResolver(cluster_spec_2, task_type="worker",
+                                      task_index=2, environment="local",
+                                      num_accelerators=16,
+                                      rpc_layer="http")
+
+    union_resolver = UnionClusterResolver(resolver1, resolver2)
+
+    self.assertEqual(union_resolver.task_type, "ps")
+    self.assertEqual(union_resolver.task_index, 1)
+    self.assertEqual(union_resolver.environment, "cloud")
+    self.assertEqual(union_resolver.num_accelerators(), 8)
+    self.assertEqual(union_resolver.rpc_layer, "grpc")
+
+    union_resolver.task_type = "worker"
+    union_resolver.task_index = 2
+    union_resolver.rpc_layer = "http"
+
+    self.assertEqual(union_resolver.task_type, "worker")
+    self.assertEqual(union_resolver.task_index, 2)
+    self.assertEqual(union_resolver.rpc_layer, "http")
+
+  def testTwoNonOverlappingJobMergedClusterResolver(self):
+    cluster_spec_1 = server_lib.ClusterSpec({
+        "ps": [
+            "ps0:2222",
+            "ps1:2222"
+        ]
+    })
+    cluster_spec_2 = server_lib.ClusterSpec({
+        "worker": [
+            "worker0:2222",
+            "worker1:2222",
+            "worker2:2222"
+        ]
+    })
+    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
+    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
+
+    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
+    cluster_spec = union_cluster.cluster_spec()
+
+    expected_proto = """
+    job { name: 'ps' tasks { key: 0 value: 'ps0:2222' }
+                     tasks { key: 1 value: 'ps1:2222' } }
+    job { name: 'worker' tasks { key: 0 value: 'worker0:2222' }
+                         tasks { key: 1 value: 'worker1:2222' }
+                         tasks { key: 2 value: 'worker2:2222' } }
+    """
+    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
+
+  def testMergedClusterResolverMaster(self):
+    cluster_spec_1 = server_lib.ClusterSpec({
+        "ps": [
+            "ps0:2222",
+            "ps1:2222"
+        ]
+    })
+    cluster_spec_2 = server_lib.ClusterSpec({
+        "worker": [
+            "worker0:2222",
+            "worker1:2222",
+            "worker2:2222"
+        ]
+    })
+    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
+    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
+
+    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
+
+    unspecified_master = union_cluster.master()
+    self.assertEqual(unspecified_master, "")
+
+    specified_master = union_cluster.master("worker", 1)
+    self.assertEqual(specified_master, "worker1:2222")
+
+    rpc_master = union_cluster.master("worker", 1, rpc_layer="grpc")
+    self.assertEqual(rpc_master, "grpc://worker1:2222")
+
+  def testOverlappingJobMergedClusterResolver(self):
+    cluster_spec_1 = server_lib.ClusterSpec({
+        "worker": [
+            "worker4:2222",
+            "worker5:2222"
+        ]
+    })
+    cluster_spec_2 = server_lib.ClusterSpec({
+        "worker": [
+            "worker0:2222",
+            "worker1:2222",
+            "worker2:2222"
+        ]
+    })
+    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
+    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
+
+    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
+    cluster_spec = union_cluster.cluster_spec()
+
+    expected_proto = """
+    job { name: 'worker' tasks { key: 0 value: 'worker4:2222' }
+                         tasks { key: 1 value: 'worker5:2222' }
+                         tasks { key: 2 value: 'worker0:2222' }
+                         tasks { key: 3 value: 'worker1:2222' }
+                         tasks { key: 4 value: 'worker2:2222' } }
+    """
+    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
+
+  def testOverlappingSparseJobMergedClusterResolverThrowError(self):
+    cluster_spec_1 = server_lib.ClusterSpec({
+        "worker": {
+            7: "worker4:2222",
+            9: "worker5:2222"
+        }
+    })
+    cluster_spec_2 = server_lib.ClusterSpec({
+        "worker": {
+            3: "worker0:2222",
+            6: "worker1:2222",
+            7: "worker2:2222"
+        }
+    })
+    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
+    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
+
+    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
+    self.assertRaises(KeyError, union_cluster.cluster_spec)
+
+  def testOverlappingDictAndListThrowError(self):
+    cluster_spec_1 = server_lib.ClusterSpec({
+        "worker": [
+            "worker4:2222",
+            "worker5:2222"
+        ]
+    })
+    cluster_spec_2 = server_lib.ClusterSpec({
+        "worker": {
+            1: "worker0:2222",
+            2: "worker1:2222",
+            3: "worker2:2222"
+        }
+    })
+    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
+    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
+
+    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
+    self.assertRaises(KeyError, union_cluster.cluster_spec)
+
+  def testOverlappingJobNonOverlappingKey(self):
+    cluster_spec_1 = server_lib.ClusterSpec({
+        "worker": {
+            5: "worker4:2222",
+            9: "worker5:2222"
+        }
+    })
+    cluster_spec_2 = server_lib.ClusterSpec({
+        "worker": {
+            3: "worker0:2222",
+            6: "worker1:2222",
+            7: "worker2:2222"
+        }
+    })
+    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
+    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
+
+    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
+    cluster_spec = union_cluster.cluster_spec()
+
+    expected_proto = """
+    job { name: 'worker' tasks { key: 3 value: 'worker0:2222' }
+                         tasks { key: 5 value: 'worker4:2222' }
+                         tasks { key: 6 value: 'worker1:2222' }
+                         tasks { key: 7 value: 'worker2:2222' }
+                         tasks { key: 9 value: 'worker5:2222' }}
+    """
+    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
+
+  def testMixedModeNonOverlappingKey(self):
+    cluster_spec_1 = server_lib.ClusterSpec({
+        "worker": [
+            "worker4:2222",
+            "worker5:2222"
+        ]
+    })
+    cluster_spec_2 = server_lib.ClusterSpec({
+        "worker": {
+            3: "worker0:2222",
+            6: "worker1:2222",
+            7: "worker2:2222"
+        }
+    })
+    cluster_resolver_1 = SimpleClusterResolver(cluster_spec_1)
+    cluster_resolver_2 = SimpleClusterResolver(cluster_spec_2)
+
+    union_cluster = UnionClusterResolver(cluster_resolver_1, cluster_resolver_2)
+    cluster_spec = union_cluster.cluster_spec()
+
+    expected_proto = """
+    job { name: 'worker' tasks { key: 0 value: 'worker4:2222' }
+                         tasks { key: 1 value: 'worker5:2222' }
+                         tasks { key: 3 value: 'worker0:2222' }
+                         tasks { key: 6 value: 'worker1:2222' }
+                         tasks { key: 7 value: 'worker2:2222' }}
+    """
+    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
+
+  def testRetainSparseJobWithNoMerging(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "worker": {
+            1: "worker0:2222",
+            3: "worker1:2222",
+            5: "worker2:2222"
+        }
+    })
+
+    base_cluster_resolver = SimpleClusterResolver(base_cluster_spec)
+    union_cluster = UnionClusterResolver(base_cluster_resolver)
+    cluster_spec = union_cluster.cluster_spec()
+
+    expected_proto = """
+    job { name: 'worker' tasks { key: 1 value: 'worker0:2222' }
+                         tasks { key: 3 value: 'worker1:2222' }
+                         tasks { key: 5 value: 'worker2:2222' } }
+    """
+    self._verifyClusterSpecEquality(cluster_spec, expected_proto)
+
+
+# TODO(saeta): Include tests for master resolution
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..2412f6dad095bb2282ba51b7edb1f293f57d428d
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
@@ -0,0 +1,212 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for GCE Instance Groups."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.training.server_lib import ClusterSpec
+
+_GOOGLE_API_CLIENT_INSTALLED = True
+try:
+  from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
+  from oauth2client.client import GoogleCredentials  # pylint: disable=g-import-not-at-top
+except ImportError:
+  _GOOGLE_API_CLIENT_INSTALLED = False
+
+
+def _format_master_url(master, rpc_layer=None):
+  return '%s://%s' % (rpc_layer, master) if rpc_layer else master
+
+
+class GceClusterResolver(ClusterResolver):
+  """Cluster Resolver for Google Compute Engine.
+
+  This is an implementation of cluster resolvers for the Google Compute Engine
+  instance group platform. By specifying a project, zone, and instance group,
+  this will retrieve the IP address of all the instances within the instance
+  group and return a Cluster Resolver object suitable for use for distributed
+  TensorFlow.
+  """
+
+  def __init__(self,
+               project,
+               zone,
+               instance_group,
+               port,
+               task_type='worker',
+               task_index=0,
+               rpc_layer='grpc',
+               num_accelerators=0,
+               credentials='default',
+               service=None):
+    """Creates a new GceClusterResolver object.
+
+    This takes in a few parameters and creates a GceClusterResolver project. It
+    will then use these parameters to query the GCE API for the IP addresses of
+    each instance in the instance group.
+
+    Args:
+      project: Name of the GCE project.
+      zone: Zone of the GCE instance group.
+      instance_group: Name of the GCE instance group.
+      port: Port of the listening TensorFlow server (default: 8470)
+      task_type: Name of the TensorFlow job this GCE instance group of VM
+        instances belong to.
+      task_index: The task index for this particular VM, within the GCE
+        instance group. In particular, every single instance should be assigned
+        a unique ordinal index within an instance group manually so that they
+        can be distinguished from each other.
+      rpc_layer: The RPC layer TensorFlow should use to communicate across
+        instances.
+      num_accelerators: Number of accelerators (GPUs) present per
+        instance.
+      credentials: GCE Credentials. If nothing is specified, this defaults to
+        GoogleCredentials.get_application_default().
+      service: The GCE API object returned by the googleapiclient.discovery
+        function. (Default: discovery.build('compute', 'v1')). If you specify a
+        custom service object, then the credentials parameter will be ignored.
+
+    Raises:
+      ImportError: If the googleapiclient is not installed.
+    """
+    self._project = project
+    self._zone = zone
+    self._instance_group = instance_group
+    self._task_type = task_type
+    self._task_index = task_index
+    self._rpc_layer = rpc_layer
+    self._num_accelerators = num_accelerators
+    self._port = port
+    self._credentials = credentials
+
+    if credentials == 'default':
+      if _GOOGLE_API_CLIENT_INSTALLED:
+        self._credentials = GoogleCredentials.get_application_default()
+
+    if service is None:
+      if not _GOOGLE_API_CLIENT_INSTALLED:
+        raise ImportError('googleapiclient must be installed before using the '
+                          'GCE cluster resolver')
+      self._service = discovery.build(
+          'compute', 'v1',
+          credentials=self._credentials)
+    else:
+      self._service = service
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec object based on the latest instance group info.
+
+    This returns a ClusterSpec object for use based on information from the
+    specified instance group. We will retrieve the information from the GCE APIs
+    every time this method is called.
+
+    Returns:
+      A ClusterSpec containing host information retrieved from GCE.
+    """
+    request_body = {'instanceState': 'RUNNING'}
+    request = self._service.instanceGroups().listInstances(
+        project=self._project,
+        zone=self._zone,
+        instanceGroups=self._instance_group,
+        body=request_body,
+        orderBy='name')
+
+    worker_list = []
+
+    while request is not None:
+      response = request.execute()
+
+      items = response['items']
+      for instance in items:
+        instance_name = instance['instance'].split('/')[-1]
+
+        instance_request = self._service.instances().get(
+            project=self._project,
+            zone=self._zone,
+            instance=instance_name)
+
+        if instance_request is not None:
+          instance_details = instance_request.execute()
+          ip_address = instance_details['networkInterfaces'][0]['networkIP']
+          instance_url = '%s:%s' % (ip_address, self._port)
+          worker_list.append(instance_url)
+
+      request = self._service.instanceGroups().listInstances_next(
+          previous_request=request,
+          previous_response=response)
+
+    worker_list.sort()
+    return ClusterSpec({self._task_type: worker_list})
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    task_type = task_type if task_type is not None else self._task_type
+    task_index = task_index if task_index is not None else self._task_index
+
+    if task_type is not None and task_index is not None:
+      master = self.cluster_spec().task_address(task_type, task_index)
+      if rpc_layer or self._rpc_layer:
+        return '%s://%s' % (rpc_layer or self._rpc_layer, master)
+      else:
+        return master
+
+    return ''
+
+  @property
+  def task_type(self):
+    return self._task_type
+
+  @property
+  def task_index(self):
+    return self._task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    raise RuntimeError(
+        'You cannot reset the task_type of the GceClusterResolver after it has '
+        'been created.')
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in.
+
+    For users in the GCE environment, the environment property is always an
+    empty string, and Google users will not use this ClusterResolver for running
+    on internal systems.
+    """
+    return ''
+
+  @property
+  def rpc_layer(self):
+    return self._rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    # Unused
+    del task_type, task_index, accelerator_type, config_proto
+    return self._num_accelerators
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
similarity index 75%
rename from tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py
rename to tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
index 87b8303122498992dd24ae06824f7f769357d8f8..d4f0660c922d593d81c0927dea0d6271e89c53e1 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -135,12 +135,86 @@ class GceClusterResolverTest(test.TestCase):
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
+  def testMasterRetrieval(self):
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        task_index=0,
+        port=8470,
+        credentials=None,
+        service=self.standard_mock_service_client())
+    self.assertEqual(gce_cluster_resolver.master(), 'grpc://10.123.45.67:8470')
+
+  def testMasterRetrievalWithCustomTasks(self):
+    name_to_ip = [
+        {'name': 'instance1', 'ip': '10.1.2.3'},
+        {'name': 'instance2', 'ip': '10.2.3.4'},
+        {'name': 'instance3', 'ip': '10.3.4.5'},
+    ]
+
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        port=8470,
+        credentials=None,
+        service=self.gen_standard_mock_service_client(name_to_ip))
+
+    self.assertEqual(
+        gce_cluster_resolver.master('worker', 2, 'test'),
+        'test://10.3.4.5:8470')
+
+  def testOverrideParameters(self):
+    name_to_ip = [
+        {'name': 'instance1', 'ip': '10.1.2.3'},
+        {'name': 'instance2', 'ip': '10.2.3.4'},
+        {'name': 'instance3', 'ip': '10.3.4.5'},
+    ]
+
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        task_type='testworker',
+        port=8470,
+        credentials=None,
+        service=self.gen_standard_mock_service_client(name_to_ip))
+
+    gce_cluster_resolver.task_index = 1
+    gce_cluster_resolver.rpc_layer = 'test'
+
+    self.assertEqual(gce_cluster_resolver.task_type, 'testworker')
+    self.assertEqual(gce_cluster_resolver.task_index, 1)
+    self.assertEqual(gce_cluster_resolver.rpc_layer, 'test')
+    self.assertEqual(gce_cluster_resolver.master(), 'test://10.2.3.4:8470')
+
+  def testOverrideParametersWithZeroOrEmpty(self):
+    name_to_ip = [
+        {'name': 'instance1', 'ip': '10.1.2.3'},
+        {'name': 'instance2', 'ip': '10.2.3.4'},
+        {'name': 'instance3', 'ip': '10.3.4.5'},
+    ]
+
+    gce_cluster_resolver = GceClusterResolver(
+        project='test-project',
+        zone='us-east1-d',
+        instance_group='test-instance-group',
+        task_type='',
+        task_index=1,
+        port=8470,
+        credentials=None,
+        service=self.gen_standard_mock_service_client(name_to_ip))
+
+    self.assertEqual(gce_cluster_resolver.master(
+        task_type='', task_index=0), 'grpc://10.1.2.3:8470')
+
   def testCustomJobNameAndPortRetrieval(self):
     gce_cluster_resolver = GceClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        job_name='custom',
+        task_type='custom',
         port=2222,
         credentials=None,
         service=self.standard_mock_service_client())
@@ -196,7 +270,7 @@ class GceClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        job_name='worker',
+        task_type='worker',
         port=8470,
         credentials=None,
         service=self.gen_standard_mock_service_client(worker1_name_to_ip))
@@ -205,7 +279,7 @@ class GceClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        job_name='worker',
+        task_type='worker',
         port=8470,
         credentials=None,
         service=self.gen_standard_mock_service_client(worker2_name_to_ip))
@@ -214,7 +288,7 @@ class GceClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        job_name='ps',
+        task_type='ps',
         port=2222,
         credentials=None,
         service=self.gen_standard_mock_service_client(ps_name_to_ip))
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..b21c3676bee53e785474308435021885dc93377c
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
@@ -0,0 +1,182 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for Kubernetes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.client import device_lib
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
+from tensorflow.python.training import server_lib
+
+_KUBERNETES_API_CLIENT_INSTALLED = True
+try:
+  from kubernetes import client as k8sclient  # pylint: disable=g-import-not-at-top
+  from kubernetes import config as k8sconfig  # pylint: disable=g-import-not-at-top
+except ImportError:
+  _KUBERNETES_API_CLIENT_INSTALLED = False
+
+
+class KubernetesClusterResolver(ClusterResolver):
+  """Cluster Resolver for Kubernetes.
+
+  This is an implementation of cluster resolvers for Kubernetes. When given the
+  the Kubernetes namespace and label selector for pods, we will retrieve the
+  pod IP addresses of all running pods matching the selector, and return a
+  ClusterSpec based on that information.
+  """
+
+  def __init__(self,
+               job_to_label_mapping=None,
+               tf_server_port=8470,
+               rpc_layer='grpc',
+               override_client=None):
+    """Initializes a new KubernetesClusterResolver.
+
+    This initializes a new Kubernetes Cluster Resolver. The Cluster Resolver
+    will attempt to talk to the Kubernetes master to retrieve all the instances
+    of pods matching a label selector.
+
+    Args:
+      job_to_label_mapping: A mapping of TensorFlow jobs to label selectors.
+        This allows users to specify many TensorFlow jobs in one Cluster
+        Resolver, and each job can have pods belong with different label
+        selectors. For example, a sample mapping might be
+        ```
+        {'worker': ['job-name=worker-cluster-a', 'job-name=worker-cluster-b'],
+         'ps': ['job-name=ps-1', 'job-name=ps-2']}
+        ```
+      tf_server_port: The port the TensorFlow server is listening on.
+      rpc_layer: (Optional) The RPC layer TensorFlow should use to communicate
+        between tasks in Kubernetes. Defaults to 'grpc'.
+      override_client: The Kubernetes client (usually automatically retrieved
+        using `from kubernetes import client as k8sclient`). If you pass this
+        in, you are responsible for setting Kubernetes credentials manually.
+
+    Raises:
+      ImportError: If the Kubernetes Python client is not installed and no
+        `override_client` is passed in.
+      RuntimeError: If autoresolve_task is not a boolean or a callable.
+    """
+    if _KUBERNETES_API_CLIENT_INSTALLED:
+      k8sconfig.load_kube_config()
+
+    if not job_to_label_mapping:
+      job_to_label_mapping = {'worker': ['job-name=tensorflow']}
+
+    if not override_client and not _KUBERNETES_API_CLIENT_INSTALLED:
+      raise ImportError('The Kubernetes Python client must be installed before'
+                        'using the Kubernetes Cluster Resolver. To install the'
+                        'Kubernetes Python client, run `pip install '
+                        'kubernetes` on your command line.')
+
+    self._job_to_label_mapping = job_to_label_mapping
+    self._tf_server_port = tf_server_port
+    self._override_client = override_client
+
+    self.task_type = None
+    self.task_index = None
+    self.rpc_layer = rpc_layer
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master address to use when creating a session.
+
+    You must have set the task_type and task_index object properties before
+    calling this function, or pass in the `task_type` and `task_index`
+    parameters when using this function. If you do both, the function parameters
+    will override the object properties.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the master.
+      task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC protocol for the given cluster.
+
+    Returns:
+      The name or URL of the session master.
+    """
+    if task_type is not None and task_index is not None:
+      return format_master_url(
+          self.cluster_spec().task_address(task_type, task_index),
+          rpc_layer or self.rpc_layer)
+
+    if self.task_type is not None and self.task_index is not None:
+      return format_master_url(
+          self.cluster_spec().task_address(self.task_type, self.task_index),
+          rpc_layer or self.rpc_layer)
+
+    return ''
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec object based on the latest info from Kubernetes.
+
+    We retrieve the information from the Kubernetes master every time this
+    method is called.
+
+    Returns:
+      A ClusterSpec containing host information returned from Kubernetes.
+
+    Raises:
+      RuntimeError: If any of the pods returned by the master is not in the
+        `Running` phase.
+    """
+    if not self._override_client:
+      k8sconfig.load_kube_config()
+
+    client = self._override_client or k8sclient.CoreV1Api()
+    cluster_map = {}
+
+    for tf_job in self._job_to_label_mapping:
+      all_pods = []
+      for selector in self._job_to_label_mapping[tf_job]:
+        ret = client.list_pod_for_all_namespaces(label_selector=selector)
+        selected_pods = []
+
+        # Sort the list by the name to make sure it doesn't change call to call.
+        for pod in sorted(ret.items, key=lambda x: x.metadata.name):
+          if pod.status.phase == 'Running':
+            selected_pods.append(
+                '%s:%s' % (pod.status.host_ip, self._tf_server_port))
+          else:
+            raise RuntimeError('Pod "%s" is not running; phase: "%s"' %
+                               (pod.metadata.name, pod.status.phase))
+        all_pods.extend(selected_pods)
+      cluster_map[tf_job] = all_pods
+
+    return server_lib.ClusterSpec(cluster_map)
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in.
+
+    For users in the Cloud environment, the environment property is always an
+    empty string, and Google users will not use this ClusterResolver for running
+    on internal systems.
+    """
+    return ''
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    # TODO(frankchn): Make querying non-local accelerators work
+    if task_type is not None or task_index is not None:
+      raise NotImplementedError('Querying non-local accelerators is not yet'
+                                'implemented.')
+
+    local_devices = device_lib.list_local_devices(config_proto)
+    return sum(d.device_type == accelerator_type for d in local_devices)
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9750fa60b993a3504bbd01f0663cfdf868a2f01
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
@@ -0,0 +1,185 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for K8sClusterResolver."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute.cluster_resolver import KubernetesClusterResolver
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+mock = test.mock
+
+
+def _mock_kubernetes_client(ret):
+  mock_client = mock.MagicMock()
+  mock_client.list_pod_for_all_namespaces.side_effect = (
+      lambda *args, **kwargs: ret[kwargs['label_selector']])
+  return mock_client
+
+
+def _get_mock_pod_item(name, phase, host_ip):
+  mock_status = mock.Mock()
+  mock_status.configure_mock(phase=phase, host_ip=host_ip)
+
+  mock_metadata = mock.Mock()
+  mock_metadata.configure_mock(name=name)
+
+  mock_item = mock.Mock()
+  mock_item.configure_mock(status=mock_status, metadata=mock_metadata)
+  return mock_item
+
+
+def _create_pod_list(*args):
+  return mock.MagicMock(items=[_get_mock_pod_item(*x) for x in args])
+
+
+class KubernetesClusterResolverTest(test.TestCase):
+
+  def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
+    """Verifies that the ClusterSpec generates the correct proto.
+
+    We are testing this four different ways to ensure that the ClusterSpec
+    returned by the TPUClusterResolver behaves identically to a normal
+    ClusterSpec when passed into the generic ClusterSpec libraries.
+
+    Args:
+      cluster_spec: ClusterSpec returned by the TPUClusterResolver
+      expected_proto: Expected protobuf
+    """
+    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec).as_cluster_def())
+    self.assertProtoEquals(expected_proto,
+                           server_lib.ClusterSpec(
+                               cluster_spec.as_cluster_def()).as_cluster_def())
+    self.assertProtoEquals(expected_proto,
+                           server_lib.ClusterSpec(
+                               cluster_spec.as_dict()).as_cluster_def())
+
+  def testSingleItemSuccessfulRetrieval(self):
+    ret = _create_pod_list(('tensorflow-abc123', 'Running', '10.1.2.3'),)
+
+    cluster_resolver = KubernetesClusterResolver(
+        override_client=_mock_kubernetes_client(
+            {'job-name=tensorflow': ret}))
+
+    actual_cluster_spec = cluster_resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.1.2.3:8470' }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+
+  def testSuccessfulRetrievalWithSort(self):
+    ret = _create_pod_list(
+        ('tensorflow-abc123', 'Running', '10.1.2.3'),
+        ('tensorflow-def456', 'Running', '10.1.2.4'),
+        ('tensorflow-999999', 'Running', '10.1.2.5'))
+
+    cluster_resolver = KubernetesClusterResolver(
+        override_client=_mock_kubernetes_client(
+            {'job-name=tensorflow': ret}))
+
+    actual_cluster_spec = cluster_resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.1.2.5:8470' }
+      tasks { key: 1 value: '10.1.2.3:8470' }
+      tasks { key: 2 value: '10.1.2.4:8470' }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+
+  def testGetMasterWithOverrideParameters(self):
+    ret = _create_pod_list(
+        ('worker-0', 'Running', '10.1.2.3'),
+        ('worker-1', 'Running', '10.1.2.4'),
+        ('worker-2', 'Running', '10.1.2.5'))
+
+    cluster_resolver = KubernetesClusterResolver(
+        override_client=_mock_kubernetes_client(
+            {'job-name=tensorflow': ret}))
+    cluster_resolver.task_type = 'worker'
+    cluster_resolver.task_index = 0
+    self.assertEqual(cluster_resolver.task_type, 'worker')
+    self.assertEqual(cluster_resolver.task_index, 0)
+    self.assertEqual(cluster_resolver.master(), 'grpc://10.1.2.3:8470')
+    self.assertEqual(cluster_resolver.master('worker', 2),
+                     'grpc://10.1.2.5:8470')
+
+  def testNonRunningPod(self):
+    ret = _create_pod_list(('tensorflow-abc123', 'Failed', '10.1.2.3'),)
+
+    cluster_resolver = KubernetesClusterResolver(
+        override_client=_mock_kubernetes_client(
+            {'job-name=tensorflow': ret}))
+
+    error_msg = 'Pod "tensorflow-abc123" is not running; phase: "Failed"'
+    with self.assertRaisesRegexp(RuntimeError, error_msg):
+      cluster_resolver.cluster_spec()
+
+  def testMultiplePodSelectorsAndWorkers(self):
+    worker1 = _create_pod_list(
+        ('tensorflow-abc123', 'Running', '10.1.2.3'),
+        ('tensorflow-def456', 'Running', '10.1.2.4'),
+        ('tensorflow-999999', 'Running', '10.1.2.5'))
+    worker2 = _create_pod_list(
+        ('tensorflow-abc124', 'Running', '10.1.2.6'),
+        ('tensorflow-def457', 'Running', '10.1.2.7'),
+        ('tensorflow-999990', 'Running', '10.1.2.8'))
+    ps = _create_pod_list(
+        ('tensorflow-ps-1', 'Running', '10.1.2.1'),
+        ('tensorflow-ps-2', 'Running', '10.1.2.2'))
+
+    cluster_resolver = KubernetesClusterResolver(
+        job_to_label_mapping={
+            'worker': ['job-name=worker1', 'job-name=worker2'],
+            'ps': ['job-name=ps']
+        },
+        override_client=_mock_kubernetes_client({
+            'job-name=worker1': worker1,
+            'job-name=worker2': worker2,
+            'job-name=ps': ps
+        }))
+
+    actual_cluster_spec = cluster_resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'ps'
+      tasks { key: 0 value: '10.1.2.1:8470' }
+      tasks { key: 1 value: '10.1.2.2:8470' }
+    }
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.1.2.5:8470' }
+      tasks { key: 1 value: '10.1.2.3:8470' }
+      tasks { key: 2 value: '10.1.2.4:8470' }
+      tasks { key: 3 value: '10.1.2.8:8470' }
+      tasks { key: 4 value: '10.1.2.6:8470' }
+      tasks { key: 5 value: '10.1.2.7:8470' }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ab81731b7a111848608068220488a368d9b86ec
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
@@ -0,0 +1,231 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for Slurm workload manager."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import subprocess
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.training.server_lib import ClusterSpec
+
+
+class SlurmClusterResolver(ClusterResolver):
+  """Cluster Resolver for system with Slurm workload manager.
+
+  This is an implementation of cluster resolvers for Slurm clusters. This allows
+  the specification of jobs and task counts, number of tasks per node, number of
+  GPUs on each node and number of GPUs for each task, It retrieves system
+  attributes by Slurm environment variables, resolves allocated computing node
+  names, construct a cluster and return a Cluster Resolver object which an be
+  use for distributed TensorFlow.
+  """
+
+  def _resolve_hostnames(self):
+    """Resolve host names of nodes allocated in current jobs.
+
+    Returns:
+      A list of node names as strings.
+    """
+    hostlist = (subprocess.check_output(['scontrol', 'show', 'hostname']).
+                decode('utf-8').strip().split('\n'))
+    return hostlist
+
+  def __init__(self,
+               jobs,
+               port_base=8888,
+               gpus_per_node=1,
+               gpus_per_task=1,
+               tasks_per_node=None,
+               auto_set_gpu=True,
+               rpc_layer='grpc'):
+    """Creates a new SlurmClusterResolver object.
+
+    This takes in parameters and creates a SlurmClusterResolver object. It uses
+    those parameters to check which nodes will processes reside and resolves
+    their hostnames. With the number of the GPUs on each node and number of GPUs
+    for each task it offsets the port number for each processes and allocate
+    GPUs to tasks by setting environment variables. The resolver currently
+    supports homogeneous tasks and default Slurm process allocation.
+
+    Args:
+      jobs: Dictionary with job names as key and number of tasks in the job as
+        value
+      port_base: The first port number to start with for processes on a node.
+      gpus_per_node: Number of GPUs available on each node.
+      gpus_per_task: Number of GPUs to be used for each task.
+      tasks_per_node: Number of tasks to run on each node, if not set defaults
+        to Slurm's output environment variable SLURM_NTASKS_PER_NODE.
+      auto_set_gpu: Set the visible CUDA devices automatically while resolving
+        the cluster by setting CUDA_VISIBLE_DEVICES environment variable.
+        Defaults to True.
+      rpc_layer: (Optional) The protocol TensorFlow uses to communicate between
+        nodes. Defaults to 'grpc'.
+
+    Returns:
+      A ClusterResolver object which can be used with distributed TensorFlow.
+
+    Raises:
+      RuntimeError: If requested more GPUs per node then available or requested
+      more tasks then assigned tasks.
+    """
+
+    # check if launched by mpirun
+    if 'OMPI_COMM_WORLD_RANK' in os.environ:
+      self._rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
+      num_tasks = int(os.environ['OMPI_COMM_WORLD_SIZE'])
+    else:
+      self._rank = int(os.environ['SLURM_PROCID'])
+      num_tasks = int(os.environ['SLURM_NTASKS'])
+
+    self._jobs = collections.OrderedDict(sorted(jobs.items()))
+    self._port_base = port_base
+
+    # user specification overrides SLURM specification
+    if tasks_per_node is not None:
+      self._tasks_per_node = tasks_per_node
+    elif tasks_per_node is None and 'SLURM_NTASKS_PER_NODE' in os.environ:
+      self._tasks_per_node = int(os.environ['SLURM_NTASKS_PER_NODE'])
+    else:
+      raise RuntimeError('Neither `tasks_per_node` or '
+                         'SLURM_NTASKS_PER_NODE is set.')
+
+    self._gpus_per_node = gpus_per_node
+    self._gpus_per_task = gpus_per_task
+
+    self._auto_set_gpu = auto_set_gpu
+    self.task_type = None
+    self.task_index = None
+    self.rpc_layer = rpc_layer
+
+    self._gpu_allocation = []
+    self._cluster_allocation = {}
+
+    if self._tasks_per_node * self._gpus_per_task > self._gpus_per_node:
+      raise RuntimeError('Requested more GPUs per node then available.')
+
+    if sum(self._jobs.values()) != num_tasks:
+      raise RuntimeError('Requested more tasks then assigned tasks.')
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec object based on the latest instance group info.
+
+    This returns a ClusterSpec object for use based on information from the
+    specified initialization parameters and Slurm environment variables. The
+    cluster specification is resolved each time this function is called. The
+    resolver extract hostnames of nodes by scontrol and pack tasks in that
+    order until a node a has number of tasks that is equal to specification.
+    GPUs on nodes are allocated to tasks by specification through setting
+    CUDA_VISIBLE_DEVICES environment variable.
+
+    Returns:
+      A ClusterSpec containing host information retrieved from Slurm's
+        environment variables.
+    """
+    hostlist = self._resolve_hostnames()
+
+    task_list = []
+    self._gpu_allocation = []
+    self._cluster_allocation = {}
+
+    for host in hostlist:
+      for port_offset, gpu_offset in zip(
+          range(self._tasks_per_node),
+          range(0, self._gpus_per_node, self._gpus_per_task)):
+
+        host_addr = '%s:%d' % (host, self._port_base + port_offset)
+        task_list.append(host_addr)
+        gpu_id_list = []
+
+        for gpu_id in range(gpu_offset, gpu_offset + self._gpus_per_task):
+          gpu_id_list.append(str(gpu_id))
+
+        self._gpu_allocation.append(','.join(gpu_id_list))
+
+    cluster_rank_offset_start = 0
+    cluster_rank_offset_end = 0
+
+    for task_type, num_tasks in self._jobs.items():
+      cluster_rank_offset_end = cluster_rank_offset_start + num_tasks
+
+      self._cluster_allocation[task_type] = (
+          task_list[cluster_rank_offset_start:cluster_rank_offset_end])
+
+      if cluster_rank_offset_start <= self._rank < cluster_rank_offset_end:
+        self.task_type = task_type
+        self.task_index = self._rank - cluster_rank_offset_start
+
+      cluster_rank_offset_start = cluster_rank_offset_end
+
+    if self._auto_set_gpu is True:
+      os.environ['CUDA_VISIBLE_DEVICES'] = self._gpu_allocation[self._rank]
+
+    return ClusterSpec(self._cluster_allocation)
+
+  def get_task_info(self):
+    """Returns job name and task_index for the process which calls this.
+
+    This returns the job name and task index for the process which calls this
+    function according to its rank and cluster specification. The job name and
+    task index are set after a cluster is constructed by cluster_spec otherwise
+    defaults to None.
+
+    Returns:
+      A string specifying job name the process belongs to and an integner
+        specifying the task index the process belongs to in that job.
+    """
+    return self.task_type, self.task_index
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master string for connecting to a TensorFlow master.
+
+    Args:
+      task_type: (Optional) Overrides the default auto-selected task type.
+      task_index: (Optional) Overrides the default auto-slected task index.
+      rpc_layer: (Optional) Overrides the default RPC protocol TensorFlow uses
+        to communicate across nodes.
+
+    Returns:
+      A connection string for connecting to a TensorFlow master.
+    """
+    task_type = task_type if task_type is not None else self.task_type
+    task_index = task_index if task_index is not None else self.task_index
+    rpc_layer = rpc_layer or self.rpc_layer
+    master = self.cluster_spec().task_address(task_type, task_index)
+
+    return '%s://%s' % (rpc_layer, master) if rpc_layer else master
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in.
+
+    For users in the Slurm environment, the environment property is always an
+    empty string, and Google users will not use this ClusterResolver for running
+    on internal systems.
+    """
+    return ''
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    # Unused, since this is set in __init__ manually.
+    del task_type, task_index, accelerator_type, config_proto
+    return self._gpus_per_node
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..076539d16f17d64a9a28052960b61a5b99a7c9c6
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
@@ -0,0 +1,187 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SlurmClusterResolver."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.distribute.cluster_resolver import SlurmClusterResolver
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+mock = test.mock
+
+
+class SlurmClusterResolverTest(test.TestCase):
+
+  def mock_resolve_hostnames_output(self):
+    return ['t02n13', 't02n41', 't02n43', 't02n44']
+
+  def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
+    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_cluster_def()).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_dict()).as_cluster_def())
+
+  @mock.patch.dict(os.environ, {'SLURM_PROCID': '0', 'SLURM_NTASKS': '3'})
+  @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames',
+                     mock_resolve_hostnames_output)
+  def testSimpleSuccessfulRetrieval(self):
+    slurm_cluster_resolver = SlurmClusterResolver(
+        jobs={
+            'ps': 1,
+            'worker': 2
+        },
+        port_base=8888,
+        tasks_per_node=1,
+        gpus_per_node=1,
+        gpus_per_task=1,
+        auto_set_gpu=False)
+
+    actual_cluster_spec = slurm_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'ps' tasks { value: 't02n13:8888' } }
+    job { name: 'worker' tasks { key: 0 value: 't02n41:8888' }
+                         tasks { key: 1 value: 't02n43:8888' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+  @mock.patch.dict(os.environ, {'SLURM_PROCID': '0', 'SLURM_NTASKS': '3'})
+  @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames',
+                     mock_resolve_hostnames_output)
+  def testSimpleMasterRetrieval(self):
+    slurm_cluster_resolver = SlurmClusterResolver(
+        jobs={
+            'ps': 1,
+            'worker': 2
+        },
+        port_base=8888,
+        tasks_per_node=1,
+        gpus_per_node=1,
+        gpus_per_task=1,
+        auto_set_gpu=False)
+
+    slurm_cluster_resolver.task_type = 'worker'
+    slurm_cluster_resolver.task_index = 1
+    self.assertEqual(slurm_cluster_resolver.master(), 'grpc://t02n43:8888')
+
+    slurm_cluster_resolver.rpc_layer = 'ab'
+    self.assertEqual(slurm_cluster_resolver.master('ps', 0), 'ab://t02n13:8888')
+    self.assertEqual(
+        slurm_cluster_resolver.master('ps', 0, rpc_layer='test'),
+        'test://t02n13:8888')
+
+  @mock.patch.dict(os.environ, {
+      'SLURM_PROCID': '0',
+      'SLURM_NTASKS': '3',
+      'SLURM_NTASKS_PER_NODE': '1'
+  })
+  @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames',
+                     mock_resolve_hostnames_output)
+  def testTaskPerNodeNotSetRetrieval(self):
+    slurm_cluster_resolver = SlurmClusterResolver(
+        jobs={
+            'ps': 1,
+            'worker': 2
+        },
+        port_base=8888,
+        gpus_per_node=1,
+        gpus_per_task=1,
+        auto_set_gpu=False)
+
+    actual_cluster_spec = slurm_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'ps' tasks { value: 't02n13:8888' } }
+    job { name: 'worker' tasks { key: 0 value: 't02n41:8888' }
+                         tasks { key: 1 value: 't02n43:8888' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+  @mock.patch.dict(
+      os.environ, {
+          'SLURM_PROCID': '1',
+          'SLURM_NTASKS': '5',
+          'SLURM_NTASKS_PER_NODE': '2',
+          'CUDA_VISIBLE_DEVICES': ''
+      })
+  @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames',
+                     mock_resolve_hostnames_output)
+  def testMultiTaskPerNodeRetrieval(self):
+    slurm_cluster_resolver = SlurmClusterResolver(
+        jobs={
+            'ps': 1,
+            'worker': 4
+        },
+        port_base=8888,
+        gpus_per_node=2,
+        gpus_per_task=1,
+        auto_set_gpu=True)
+
+    actual_cluster_spec = slurm_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'ps' tasks { value: 't02n13:8888' } }
+    job { name: 'worker' tasks { key: 0 value: 't02n13:8889' }
+                         tasks { key: 1 value: 't02n41:8888' }
+                         tasks { key: 2 value: 't02n41:8889' }
+                         tasks { key: 3 value: 't02n43:8888' } }
+    """
+
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    assert os.environ['CUDA_VISIBLE_DEVICES'] == '1'
+
+  @mock.patch.dict(
+      os.environ, {
+          'SLURM_PROCID': '1',
+          'SLURM_NTASKS': '5',
+          'SLURM_NTASKS_PER_NODE': '2',
+          'CUDA_VISIBLE_DEVICES': ''
+      })
+  @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames',
+                     mock_resolve_hostnames_output)
+  def testMultipleGpusPerTaskRetrieval(self):
+    slurm_cluster_resolver = SlurmClusterResolver(
+        jobs={
+            'ps': 1,
+            'worker': 4
+        },
+        port_base=8888,
+        gpus_per_node=4,
+        gpus_per_task=2,
+        auto_set_gpu=True)
+
+    actual_cluster_spec = slurm_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'ps' tasks { value: 't02n13:8888' } }
+    job { name: 'worker' tasks { key: 0 value: 't02n13:8889' }
+                         tasks { key: 1 value: 't02n41:8888' }
+                         tasks { key: 2 value: 't02n41:8889' }
+                         tasks { key: 3 value: 't02n43:8888' } }
+    """
+
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    assert os.environ['CUDA_VISIBLE_DEVICES'] == '2,3'
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4465714b2679f616d8730205c7ad7c020b04da6
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
@@ -0,0 +1,178 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for TF_CONFIG Environment Variables."""
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.training.server_lib import ClusterSpec
+
+_TF_CONFIG_ENV = 'TF_CONFIG'
+_SESSION_MASTER_KEY = 'session_master'
+_RPC_LAYER_KEY = 'rpc_layer'
+_TASK_KEY = 'task'
+
+
+def format_master_url(master, rpc_layer=None):
+  if rpc_layer:
+    return '%s://%s' % (rpc_layer, master)
+  else:
+    return master
+
+
+def _load_tf_config():
+  return json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
+
+
+def _get_value_in_tfconfig(key, default=None):
+  tf_config = _load_tf_config()
+  return tf_config[key] if key in tf_config else default
+
+
+class TFConfigClusterResolver(ClusterResolver):
+  """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar."""
+
+  def __init__(self,
+               task_type=None,
+               task_index=None,
+               rpc_layer=None,
+               environment=None,
+               num_accelerators=0):
+    """Creates a new TFConfigClusterResolver.
+
+    Args:
+      task_type: (String, optional) Overrides the task type specified in the
+        TF_CONFIG environment variable.
+      task_index: (Integer, optional) Overrides the task index specified in the
+        TF_CONFIG environment variable.
+      rpc_layer: (String, optional) Overrides the rpc layer TensorFlow uses.
+      environment: (String, optional) Overrides the environment TensorFlow
+        operates in.
+      num_accelerators: (Integer, optional) Specifies the number of
+        accelerators (e.g. GPUs, TPUs, others) that each node has.
+    """
+    # TODO(frankchn): num_accelerators is a stop-gap and will be removed
+    # in favor of autodetection of devices soon.
+
+    self._task_type = task_type
+    self._task_index = task_index
+    self._rpc_layer = rpc_layer
+    self._environment = environment
+    self._num_accelerators = num_accelerators
+
+  @property
+  def task_type(self):
+    if self._task_type is None:
+      task_info = _get_value_in_tfconfig(_TASK_KEY, {})
+      return task_info['type'] if 'type' in task_info else None
+    else:
+      return self._task_type
+
+  @property
+  def task_index(self):
+    if self._task_type is None:
+      task_info = _get_value_in_tfconfig(_TASK_KEY, {})
+      return task_info['index'] if 'index' in task_info else None
+    else:
+      return self._task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    self._task_type = task_type
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    return self._environment
+
+  @property
+  def rpc_layer(self):
+    if self._rpc_layer is None:
+      return _get_value_in_tfconfig(_RPC_LAYER_KEY)
+    else:
+      return self._rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    # TODO(frankchn): Connect to server (w/ session_config) in the future.
+    # Unused, we do not connect to another server here right now.
+    del task_type, task_index, accelerator_type, config_proto
+    return self._num_accelerators
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec based on the TF_CONFIG environment variable.
+
+    Returns:
+      A ClusterSpec with information from the TF_CONFIG environment variable.
+    """
+    tf_config = _load_tf_config()
+    if 'cluster' not in tf_config:
+      return ClusterSpec({})
+    return ClusterSpec(tf_config['cluster'])
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master address to use when creating a TensorFlow session.
+
+    Args:
+      task_type: (String, optional) Overrides and sets the task_type of the
+        master.
+      task_index: (Integer, optional) Overrides and sets the task id of the
+        master.
+      rpc_layer: (String, optional) Overrides and sets the protocol over which
+        TensorFlow nodes communicate with each other.
+
+    Returns:
+      The address of the master.
+
+    Raises:
+      RuntimeError: If the task_type or task_id is not specified and the
+        `TF_CONFIG` environment variable does not contain a task section.
+    """
+
+    # If `session_master` is set, just use that.
+    session_master = _get_value_in_tfconfig(_SESSION_MASTER_KEY)
+    if session_master is not None:
+      return session_master
+
+    # Return an empty string if we are the only job in the ClusterSpec.
+    cluster_spec = self.cluster_spec()
+    if (not cluster_spec.jobs or
+        (len(cluster_spec.jobs) == 1 and
+         len(cluster_spec.job_tasks(cluster_spec.jobs[0])) == 1)):
+      return ''
+
+    # We try to auto-detect the task type and id, but uses the user-supplied one
+    # where available
+    task_type = task_type if task_type is not None else self.task_type
+    task_index = task_index if task_index is not None else self.task_index
+
+    return format_master_url(cluster_spec.task_address(task_type, task_index),
+                             self.rpc_layer)
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..197eba1739017e8665588618e6b64297b310b513
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
@@ -0,0 +1,210 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TFCONFIGClusterResolver."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+class TFConfigClusterResolverTest(test.TestCase):
+
+  def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
+    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto, server_lib.ClusterSpec(cluster_spec).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_cluster_def()).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_dict()).as_cluster_def())
+
+  def testNormalClusterSpecRead(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "task": {
+        "type": "ps",
+        "index": 0
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    expected_proto = """
+    job { name: 'ps' tasks { key: 0 value: 'ps0:2222' }
+                     tasks { key: 1 value: 'ps1:2222' } }
+    job { name: 'worker' tasks { key: 0 value: 'worker0:2222' }
+                         tasks { key: 1 value: 'worker1:2222' }
+                         tasks { key: 2 value: 'worker2:2222' } }
+    """
+    actual_cluster_spec = cluster_resolver.cluster_spec()
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+  def testAutomaticMasterRead(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "task": {
+        "type": "ps",
+        "index": 0
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('ps0:2222', cluster_resolver.master())
+
+  def testSpecifiedTaskTypeAndIndexMasterRead(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "task": {
+        "type": "ps",
+        "index": 0
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('worker1:2222', cluster_resolver.master('worker', 1))
+
+  def testSessionMasterRead(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "session_master": "sessionmaster:2222",
+      "task": {
+        "type": "ps",
+        "index": 0
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('sessionmaster:2222', cluster_resolver.master())
+
+  def testRpcLayerRead(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": 0
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
+
+  def testTaskTypeIndexRpcRead(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": 0
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('ps', cluster_resolver.task_type)
+    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual('grpc', cluster_resolver.rpc_layer)
+
+  def testParameterOverrides(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": 1
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_index=0,
+                                               num_accelerators=8)
+
+    self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
+    self.assertEqual('ps', cluster_resolver.task_type)
+    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual(8, cluster_resolver.num_accelerators())
+
+    cluster_resolver.task_type = 'worker'
+    cluster_resolver.task_index = 1
+    cluster_resolver.rpc_layer = 'test'
+
+    self.assertEqual('test://worker1:2222', cluster_resolver.master())
+    self.assertEqual('worker', cluster_resolver.task_type)
+    self.assertEqual(1, cluster_resolver.task_index)
+    self.assertEqual('test', cluster_resolver.rpc_layer)
+
+  def testZeroItemsInClusterSpecMasterRead(self):
+    os.environ['TF_CONFIG'] = """
+    {}
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('', cluster_resolver.master())
+
+  def testOneItemInClusterSpecMasterRead(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "worker": ["worker0:2222"]
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('', cluster_resolver.master())
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..e907d6fde4f7bb63553b85c580149a8cb51c9c3b
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
@@ -0,0 +1,502 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for Cloud TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import re
+
+from six.moves.urllib.request import Request
+from six.moves.urllib.request import urlopen
+
+from tensorflow.python.client import session
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import compat
+
+_GOOGLE_API_CLIENT_INSTALLED = True
+try:
+  from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
+  from oauth2client.client import GoogleCredentials  # pylint: disable=g-import-not-at-top
+except ImportError:
+  _GOOGLE_API_CLIENT_INSTALLED = False
+
+
+_GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
+_ENDPOINTS_SEPARATOR = ','
+_DEFAULT_ENV_VARIABLE = 'TPU_NAME'
+_DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL'
+
+_TPU_DEVICE_REGEX = re.compile(
+    r'.*task:(?P<host_id>\d+)/.*device:TPU:(?P<core_id>\d+)$')
+_TPU_CONN_RETRIES = 120
+
+DeviceDetails = collections.namedtuple(
+    'DeviceDetails', ['device_map', 'total_cores'])
+
+
+def _get_device_dict_and_cores(devices):
+  """Returns a dict of hosts to cores and total cores given devices names.
+
+  Returns a namedtuple with two attributes:
+    device_map: A map of host_ids to a list of core_ids.
+    total_cores: The total number of cores within the TPU system.
+
+  Args:
+    devices: A list of devices returned by session.list_devices()
+  """
+  device_map = collections.defaultdict(list)
+  num_cores = 0
+  for device in devices:
+    match = _TPU_DEVICE_REGEX.match(device.name)
+    if match:
+      host_id = match.group('host_id')
+      core_id = match.group('core_id')
+      device_map[host_id].append(core_id)
+      num_cores += 1
+  return DeviceDetails(device_map, num_cores)
+
+
+def _verify_and_return_same_core_count(device_dict):
+  """Verifies that every device in device_dict has the same number of cores."""
+  num_cores_per_host_set = (
+      {len(core_ids) for core_ids in device_dict.values()})
+  if len(num_cores_per_host_set) != 1:
+    raise RuntimeError('TPU cores on each device is not the same. This '
+                       'should never happen. Devices: {}'.format(device_dict))
+  return num_cores_per_host_set.pop()
+
+
+class TPUClusterResolver(ClusterResolver):
+  """Cluster Resolver for Google Cloud TPUs.
+
+  This is an implementation of cluster resolvers for the Google Cloud TPU
+  service. As Cloud TPUs are in alpha, you will need to specify a API definition
+  file for this to consume, in addition to a list of Cloud TPUs in your Google
+  Cloud Platform project.
+  """
+
+  def _tpuService(self):
+    """Creates a new Cloud TPU API object.
+
+    This works around an issue where the underlying HTTP connection sometimes
+    times out when the script has been running for too long. Other methods in
+    this object calls this method to get a new API object whenever they need
+    to communicate with the Cloud API.
+
+    Returns:
+      A Google Cloud TPU API object.
+    """
+    if self._service:
+      return self._service
+
+    credentials = self._credentials
+    if credentials is None or credentials == 'default':
+      credentials = GoogleCredentials.get_application_default()
+
+    if self._discovery_url:
+      return discovery.build(
+          'tpu', 'v1alpha1',
+          credentials=credentials,
+          discoveryServiceUrl=self._discovery_url)
+    else:
+      return discovery.build(
+          'tpu', 'v1alpha1',
+          credentials=credentials)
+
+  def _requestComputeMetadata(self, path):
+    req = Request('http://metadata/computeMetadata/v1/%s' % path,
+                  headers={'Metadata-Flavor': 'Google'})
+    resp = urlopen(req)
+    return compat.as_bytes(resp.read())
+
+  def _shouldResolve(self):
+    if isinstance(self._should_resolve_override, bool):
+      return self._should_resolve_override
+    if (self._tpu == compat.as_bytes('') or
+        self._tpu == compat.as_bytes('local') or
+        self._tpu.startswith(compat.as_bytes('/bns')) or
+        self._tpu.startswith(compat.as_bytes('localhost:')) or
+        self._tpu.startswith(compat.as_bytes('grpc://')) or
+        self._tpu.startswith(compat.as_bytes('uptc://'))):
+      return False
+    return True
+
+  @staticmethod
+  def _inGke():
+    """When running in GKE, the environment variable will be set."""
+    return _GKE_ENV_VARIABLE in os.environ
+
+  @staticmethod
+  def _gkeEndpoints():
+    return os.environ[_GKE_ENV_VARIABLE]
+
+  @staticmethod
+  def _envVarFallback():
+    if _DEFAULT_ENV_VARIABLE in os.environ:
+      return os.environ[_DEFAULT_ENV_VARIABLE]
+    return None
+
+  @staticmethod
+  def _environmentDiscoveryUrl():
+    return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE)
+
+  def __init__(self,
+               tpu=None,
+               zone=None,
+               project=None,
+               job_name='worker',
+               coordinator_name=None,
+               coordinator_address=None,
+               credentials='default',
+               service=None,
+               discovery_url=None):
+    """Creates a new TPUClusterResolver object.
+
+    The ClusterResolver will then use the parameters to query the Cloud TPU APIs
+    for the IP addresses and ports of each Cloud TPU listed.
+
+    Args:
+      tpu: Either a string, or a list of strings corresponding to the TPUs to
+        use. If the single string is the empty string, the string 'local', or a
+        string that begins with 'grpc://' or '/bns', then it is assumed to not
+        correspond with a Cloud TPU and will instead be passed as the session
+        master and no ClusterSpec propagation will be done.
+      zone: Zone where the TPUs are located. If omitted or empty, we will assume
+        that the zone of the TPU is the same as the zone of the GCE VM, which we
+        will try to discover from the GCE metadata service.
+      project: Name of the GCP project containing Cloud TPUs. If omitted or
+        empty, we will try to discover the project name of the GCE VM from the
+        GCE metadata service.
+      job_name: Name of the TensorFlow job the TPUs belong to.
+      coordinator_name: The name to use for the coordinator. Set to None if the
+        coordinator should not be included in the computed ClusterSpec.
+      coordinator_address: The address of the coordinator (typically an ip:port
+        pair). If set to None, a TF server will be started. If coordinator_name
+        is None, a TF server will not be started even if coordinator_address is
+        None.
+      credentials: GCE Credentials. If None, then we use default credentials
+        from the oauth2client
+      service: The GCE API object returned by the googleapiclient.discovery
+        function. If you specify a custom service object, then the credentials
+        parameter will be ignored.
+      discovery_url: A URL template that points to the location of
+        the discovery service. It should have two parameters {api} and
+        {apiVersion} that when filled in produce an absolute URL to the
+        discovery document for that service. The environment variable
+        'TPU_API_DISCOVERY_URL' will override this.
+
+    Raises:
+      ImportError: If the googleapiclient is not installed.
+      ValueError: If no TPUs are specified.
+    """
+    if isinstance(tpu, list):
+      if not tpu:
+        raise ValueError('At least one TPU must be specified.')
+      if len(tpu) != 1:
+        raise NotImplementedError(
+            'Using multiple TPUs in a single session is not yet implemented')
+      tpu = tpu[0]
+
+    in_gke = self._inGke()
+    # When using GKE with Cloud TPUs, the env variable will be set.
+    if tpu is None:
+      if in_gke:
+        tpu = self._gkeEndpoints()
+      else:
+        tpu = self._envVarFallback()
+
+    if tpu is None:
+      raise ValueError('Please provide a TPU Name to connect to.')
+
+    self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
+
+    # By default the task_type is 'worker` and the task_index is 0 (which is the
+    # first worker in the task).
+    self.task_type = job_name
+    self.task_index = 0
+
+    if tpu.startswith('grpc://'):
+      # Cloud environment, where we are using GRPC to communicate to TPUs.
+      self._environment = ''
+    elif tpu == 'local' or not tpu:
+      # Google environment, where the TPU is attached to the host.
+      self._environment = 'google'
+    elif tpu.startswith('/bns') or tpu.startswith('uptc://'):
+      # Google environment, where we reach the TPU through BNS.
+      self._environment = 'google'
+
+    # If TPU is in the Google environment or exists locally, we don't use any
+    # RPC layer.
+    if tpu.startswith('/bns') or tpu.startswith(
+        'uptc://') or tpu == 'local' or not tpu:
+      self.rpc_layer = None
+    else:
+      self.rpc_layer = 'grpc'
+
+    # Setting this overrides the return value of self._shouldResolve()
+    self._should_resolve_override = None
+
+    # We strip out the protocol if it is included, and override the
+    # shouldResolve function to never resolve. We are adding the protocol back
+    # in later in self.master().
+    if self.rpc_layer is not None and tpu.startswith(self.rpc_layer + '://'):
+      tpu = tpu[len(self.rpc_layer + '://'):]
+      self._tpu = tpu
+      self._should_resolve_override = False
+
+    # Whether we should actually attempt to contact Cloud APIs
+    should_resolve = self._shouldResolve()
+
+    # We error out if we are in a non-Cloud environment which cannot talk to the
+    # Cloud APIs using the standard class and a special object is not passed in.
+    self._service = service
+    if (self._service is None and should_resolve and
+        not _GOOGLE_API_CLIENT_INSTALLED):
+      raise ImportError('googleapiclient and oauth2client must be installed '
+                        'before using the TPU cluster resolver. Execute: '
+                        '`pip install --upgrade google-api-python-client` '
+                        'and `pip install --upgrade oauth2client` to '
+                        'install with pip.')
+
+    # We save user-passed credentials, unless the user didn't pass in anything.
+    self._credentials = credentials
+    if (credentials == 'default' and should_resolve and
+        _GOOGLE_API_CLIENT_INSTALLED):
+      self._credentials = None
+
+    # Automatically detect project and zone if unspecified.
+    if not project and should_resolve:
+      project = compat.as_str(
+          self._requestComputeMetadata('project/project-id'))
+    if not zone and should_resolve:
+      zone_path = compat.as_str(self._requestComputeMetadata('instance/zone'))
+      zone = zone_path.split('/')[-1]
+    self._project = project
+    self._zone = zone
+
+    self._discovery_url = self._environmentDiscoveryUrl() or discovery_url
+
+    self._coordinator_name = coordinator_name
+    if (coordinator_name and not coordinator_address and
+        (should_resolve or in_gke)):
+      self._start_local_server()
+    else:
+      self._coordinator_address = coordinator_address
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Get the Master string to be used for the session.
+
+    In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of
+    first instance in the ClusterSpec returned by the cluster_spec function.
+
+    If a non-TPU name is used when constructing a TPUClusterResolver, that will
+    be returned instead (e.g. If the tpus argument's value when constructing
+    this TPUClusterResolver was 'grpc://10.240.1.2:8470',
+    'grpc://10.240.1.2:8470' will be returned).
+
+    Args:
+      task_type: (Optional, string) The type of the TensorFlow task of the
+        master.
+      task_index: (Optional, integer) The index of the TensorFlow task of the
+        master.
+      rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to
+        communicate with TPUs.
+
+    Returns:
+      string, the connection string to use when creating a session.
+
+    Raises:
+      ValueError: If none of the TPUs specified exists.
+    """
+    if self._shouldResolve():
+      # We are going to communicate with the Cloud TPU APIs to get a Cluster.
+      cluster_spec = self.cluster_spec()
+      if task_type is not None and task_index is not None:
+        # task_type and task_index is from the function parameter
+        master = cluster_spec.task_address(task_type, task_index)
+      elif self.task_type is not None and self.task_index is not None:
+        # task_type and task_index is from the object
+        master = cluster_spec.task_address(self.task_type, self.task_index)
+      else:
+        # by default we take the first item in the cluster with the right name
+        job_tasks = cluster_spec.job_tasks(self.task_type)
+        if not job_tasks:
+          raise ValueError('No TPUs with the specified names exist.')
+        master = job_tasks[0]
+    else:
+      if isinstance(self._tpu, (bytes, bytearray)):
+        master = self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
+      else:
+        master = self._tpu.split(_ENDPOINTS_SEPARATOR)[0]
+    return format_master_url(master, rpc_layer or self.rpc_layer)
+
+  def get_master(self):
+    return self.master()
+
+  def get_job_name(self):
+    if self._shouldResolve():
+      return self.task_type
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec object based on the latest TPU information.
+
+    We retrieve the information from the GCE APIs every time this method is
+    called.
+
+    Returns:
+      A ClusterSpec containing host information returned from Cloud TPUs.
+
+    Raises:
+      RuntimeError: If the provided TPU is not healthy.
+    """
+    ############################################################################
+    # There are 5 potential cases this code must handle:
+    #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
+    #      a. Create a ClusterSpec that includes the coordinator job
+    #      b. Create a ClusterSpec without the coordinator job.
+    #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
+    #     tasks and
+    #      a. Create a ClusterSpec with the coordinator
+    #      b. Create a ClusterSpec without the coordinator
+    #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
+    ############################################################################
+
+    if self._shouldResolve():
+      # Case 1.
+      full_name = 'projects/%s/locations/%s/nodes/%s' % (
+          self._project, self._zone, compat.as_text(self._tpu))
+      service = self._tpuService()
+      request = service.projects().locations().nodes().get(name=full_name)
+      response = request.execute()
+
+      if 'state' in response and response['state'] != 'READY':
+        raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' %
+                           (compat.as_text(self._tpu), response['state']))
+
+      if 'health' in response and response['health'] != 'HEALTHY':
+        raise RuntimeError('TPU "%s" is unhealthy: "%s"' %
+                           (compat.as_text(self._tpu), response['health']))
+
+      if 'networkEndpoints' in response:
+        worker_list = [
+            '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
+            for endpoint in response['networkEndpoints']
+        ]
+      else:
+        # Fall back to the deprecated response format
+        instance_url = '%s:%s' % (response['ipAddress'], response['port'])
+        worker_list = [instance_url]
+
+      cluster_spec = {self.task_type: worker_list}
+    else:
+      if self.rpc_layer is None:
+        # Case 3.
+        return None
+      # Case 2.
+      tpus = []
+      for tpu in self._tpu.split(_ENDPOINTS_SEPARATOR):
+        # We are working around the fact that GKE environment variable that is
+        # supplied to us has the protocol string embedded in it, but we want
+        # to strip it out for the ClusterSpec.
+        if (self.rpc_layer is not None and
+            tpu.startswith(self.rpc_layer + '://')):
+          tpus.append(tpu[len(self.rpc_layer + '://'):])
+        else:
+          tpus.append(tpu)
+      cluster_spec = {self.task_type: tpus}
+
+    if self._coordinator_address:
+      # {1, 2}.a
+      cluster_spec[self._coordinator_name] = [self._coordinator_address]
+
+    return server_lib.ClusterSpec(cluster_spec)
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='TPU',
+                       config_proto=None):
+    """Returns the number of TPU cores per worker.
+
+    Connects to the master and list all the devices present in the master,
+    and counts them up. Also verifies that the device counts per host in the
+    cluster is the same before returning the number of TPU cores per host.
+
+    Args:
+      task_type: Unused.
+      task_index: Unused.
+      accelerator_type: Unused.
+      config_proto: Used to create a connection to a TPU master in order to
+        retrieve the system metadata.
+
+    Raises:
+      RuntimeError: If this is used with a non-TPU accelerator_type.
+    """
+    retry_count = 1
+    # TODO(b/120564445): Replace with standard library for retries.
+    while True:
+      try:
+        with ops.Graph().as_default():
+          with session.Session(self.master(), config=config_proto) as s:
+            devices = s.list_devices()
+            device_details = _get_device_dict_and_cores(devices)
+            break
+      except errors.DeadlineExceededError:
+        error_message = ('Failed to connect to master. The TPU might not be '
+                         'ready (e.g. still scheduling) or the master '
+                         'address is incorrect: got (%s)' % self.master())
+        if retry_count <= _TPU_CONN_RETRIES:
+          logging.warning(error_message)
+          logging.warning('Retrying (%d/%d)...', retry_count, _TPU_CONN_RETRIES)
+          retry_count += 1
+        else:
+          raise RuntimeError(error_message)
+
+    if device_details.total_cores:
+      return _verify_and_return_same_core_count(device_details.device_map)
+    return 0
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in."""
+    return self._environment
+
+  def _start_local_server(self):
+    address = self._requestComputeMetadata('instance/network-interfaces/0/ip')
+    self._server = server_lib.Server(
+        {
+            'local': ['0.0.0.0:0']
+        }, protocol='grpc', config=None, start=True)
+    # self._server.target is of the form: grpc://ipaddress:port
+    target = compat.as_bytes(self._server.target)
+    splits = target.split(compat.as_bytes(':'))
+    assert len(splits) == 3, self._server.target
+    assert splits[0] == compat.as_bytes('grpc'), self._server.target
+    self._coordinator_port = compat.as_text(splits[2])
+    self._coordinator_address = '%s:%s' % (
+        address, compat.as_text(self._coordinator_port))
+
+  def __deepcopy__(self, memo):
+    # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy.
+    return self
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..27d92608fa2db95944c94160d716a033ab2f78a2
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
@@ -0,0 +1,628 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TPUClusterResolver."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.client import session
+from tensorflow.python.distribute import cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import compat
+
+mock = test.mock
+
+
+class MockRequestClass(object):
+
+  def __init__(self, name, tpu_map):
+    self._name = name
+    self._tpu_map = tpu_map
+
+  def execute(self):
+    if self._name in self._tpu_map:
+      return self._tpu_map[self._name]
+    else:
+      raise KeyError('Resource %s was not found' % self._name)
+
+
+class MockNodeClass(object):
+
+  def __init__(self, tpu_map):
+    self._tpu_map = tpu_map
+
+  def get(self, name):
+    return MockRequestClass(name, self._tpu_map)
+
+
+def mock_request_compute_metadata(cls, *args, **kwargs):
+  del cls, kwargs  # Unused.
+  if args[0] == 'project/project-id':
+    return 'test-project'
+  elif args[0] == 'instance/zone':
+    return 'projects/test-project/locations/us-central1-c'
+  elif args[0] == 'instance/network-interfaces/0/ip':
+    return '10.128.1.2'
+  return ''
+
+
+class TPUClusterResolverTest(test.TestCase):
+
+  def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
+    """Verifies that the ClusterSpec generates the correct proto.
+
+    We are testing this four different ways to ensure that the ClusterSpec
+    returned by the TPUClusterResolver behaves identically to a normal
+    ClusterSpec when passed into the generic ClusterSpec libraries.
+
+    Args:
+      cluster_spec: ClusterSpec returned by the TPUClusterResolver
+      expected_proto: Expected protobuf
+    """
+    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec).as_cluster_def())
+    self.assertProtoEquals(expected_proto,
+                           server_lib.ClusterSpec(
+                               cluster_spec.as_cluster_def()).as_cluster_def())
+    self.assertProtoEquals(expected_proto,
+                           server_lib.ClusterSpec(
+                               cluster_spec.as_dict()).as_cluster_def())
+
+  def mock_service_client(self, tpu_map=None):
+
+    if tpu_map is None:
+      tpu_map = {}
+
+    mock_locations = mock.MagicMock()
+    mock_locations.nodes.return_value = MockNodeClass(tpu_map)
+
+    mock_project = mock.MagicMock()
+    mock_project.locations.return_value = mock_locations
+
+    mock_client = mock.MagicMock()
+    mock_client.projects.return_value = mock_project
+
+    return mock_client
+
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testRetrieveProjectAndZoneFromMetadata(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'health': 'HEALTHY'
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        project=None,
+        zone=None,
+        tpu=['test-tpu-1'],
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
+
+    actual_cluster_spec = resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'coordinator'
+      tasks { key: 0 value: '10.128.1.2:%s' }
+    }
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.1.2.3:8470' }
+    }
+    """ % resolver._coordinator_port
+    self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
+
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testRetrieveProjectAndZoneFromMetadataNoCoordinator(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'health': 'HEALTHY'
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        project=None,
+        zone=None,
+        tpu=['test-tpu-1'],
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    actual_cluster_spec = resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
+
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testUnhealthyCloudTpu(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'health': 'UNHEALTHY'
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        project=None,
+        zone=None,
+        tpu='test-tpu-1',
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    with self.assertRaises(RuntimeError):
+      resolver.cluster_spec()
+
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testNotReadyCloudTpu(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'state': 'CREATING'
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        project=None,
+        zone=None,
+        tpu='test-tpu-1',
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    with self.assertRaises(RuntimeError):
+      resolver.cluster_spec()
+
+  def testSimpleSuccessfulRetrieval(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'health': 'HEALTHY'
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu=['test-tpu-1'],
+        coordinator_name='coordinator',
+        coordinator_address='10.128.1.5:10203',
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    actual_cluster_spec = resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
+    job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
+
+  def testNewNetworkEndpointFormat(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'health': 'HEALTHY',
+            'networkEndpoints': [{
+                'ipAddress': '10.2.3.4',
+                'port': 8470,
+            }]
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu='test-tpu-1',
+        coordinator_name='coordinator',
+        coordinator_address='10.128.1.5:10203',
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    actual_cluster_spec = resolver.cluster_spec()
+    expected_proto = """
+    job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
+    job { name: 'worker' tasks { key: 0 value: '10.2.3.4:8470' } }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual('grpc://10.2.3.4:8470', resolver.master())
+
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testPodResolution(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'health':
+                'HEALTHY',
+            'networkEndpoints': [
+                {
+                    'ipAddress': '10.2.3.4',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.5',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.6',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.7',
+                    'port': 8470,
+                },
+            ]
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='test-tpu-1',
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
+
+    actual_cluster_spec = resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'coordinator',
+      tasks { key: 0 value: '10.128.1.2:%s'}
+    }
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.2.3.4:8470' }
+      tasks { key: 1 value: '10.2.3.5:8470' }
+      tasks { key: 2 value: '10.2.3.6:8470' }
+      tasks { key: 3 value: '10.2.3.7:8470' }
+    }
+    """ % resolver._coordinator_port
+    self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
+
+  def testPodResolutionNoCoordinator(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'health':
+                'HEALTHY',
+            'networkEndpoints': [
+                {
+                    'ipAddress': '10.2.3.4',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.5',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.6',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.7',
+                    'port': 8470,
+                },
+            ]
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu='test-tpu-1',
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    actual_cluster_spec = resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.2.3.4:8470' }
+      tasks { key: 1 value: '10.2.3.5:8470' }
+      tasks { key: 2 value: '10.2.3.6:8470' }
+      tasks { key: 3 value: '10.2.3.7:8470' }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
+
+  def testGetMasterNoEntries(self):
+    tpu_map = {}
+
+    with self.assertRaises(ValueError):
+      cluster_resolver.TPUClusterResolver(
+          project='test-project',
+          zone='us-central1-c',
+          tpu=[],
+          coordinator_name=None,
+          credentials=None,
+          service=self.mock_service_client(tpu_map=tpu_map))
+
+  # TODO(saeta): Convert to parameterized test when included in OSS TF.
+  def verifyShouldResolve(self, tpu, should_resolve):
+    resolver = cluster_resolver.TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu=tpu,
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map={}))
+    self.assertEqual(should_resolve, resolver._shouldResolve(),
+                     "TPU: '%s'" % tpu)
+
+  def testShouldResolveNoName(self):
+    self.verifyShouldResolve('', False)
+
+  def testShouldResolveLocal(self):
+    self.verifyShouldResolve('local', False)
+
+  def testShouldResolveGrpc(self):
+    self.verifyShouldResolve('grpc://10.1.2.3:8470', False)
+
+  def testShouldResolveBns(self):
+    self.verifyShouldResolve('/bns/foo/bar', False)
+
+  def testShouldResolveName(self):
+    self.verifyShouldResolve('mytpu', True)
+
+  def testShouldResolveList(self):
+    self.verifyShouldResolve(['myothertpu'], True)
+
+  def testShouldResolveGrpcPrefix(self):
+    self.verifyShouldResolve('grpctpu', True)
+
+  def testNoCallComputeMetadata(self):
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='/bns/foo/bar')
+    self.assertEqual(
+        compat.as_bytes('/bns/foo/bar'), resolver.master())
+    self.assertEqual(None, resolver.cluster_spec())
+
+  def testGkeEnvironmentForDonut(self):
+    os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
+
+    self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
+    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
+    self.assertEqual(
+        compat.as_bytes('grpc://10.120.27.5:8470'),
+        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
+
+    resolver = cluster_resolver.TPUClusterResolver()
+    self.assertEqual(
+        compat.as_bytes('grpc://10.120.27.5:8470'),
+        compat.as_bytes(resolver.master()))
+    actual_cluster_spec = resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.120.27.5:8470' }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+    del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
+
+  def testGkeEnvironmentForPod(self):
+    os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = ('grpc://10.120.27.5:8470,'
+                                                     'grpc://10.120.27.6:8470,'
+                                                     'grpc://10.120.27.7:8470,'
+                                                     'grpc://10.120.27.8:8470')
+
+    self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
+    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
+    self.assertEqual(
+        compat.as_bytes('grpc://10.120.27.5:8470,'
+                        'grpc://10.120.27.6:8470,'
+                        'grpc://10.120.27.7:8470,'
+                        'grpc://10.120.27.8:8470'),
+        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
+
+    resolver = cluster_resolver.TPUClusterResolver()
+    self.assertEqual(
+        compat.as_bytes('grpc://10.120.27.5:8470'),
+        compat.as_bytes(resolver.master()))
+    actual_cluster_spec = resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.120.27.5:8470' }
+      tasks { key: 1 value: '10.120.27.6:8470' }
+      tasks { key: 2 value: '10.120.27.7:8470' }
+      tasks { key: 3 value: '10.120.27.8:8470' }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+    del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
+
+  def testEnvironmentDiscoveryUrl(self):
+    os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
+    self.assertEqual('https://{api}.internal/{apiVersion}',
+                     (cluster_resolver.TPUClusterResolver.
+                      _environmentDiscoveryUrl()))
+
+  def testEnvironmentAndRpcDetectionForGoogle(self):
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='/bns/ab/cd/ef')
+    self.assertEqual(resolver.environment, 'google')
+    self.assertEqual(resolver.rpc_layer, None)
+
+  def testEnvironmentAndRpcDetectionForGrpcString(self):
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='grpc://10.1.2.3:8470')
+    self.assertEqual(resolver.environment, '')
+    self.assertEqual(resolver.rpc_layer, 'grpc')
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
+
+  def testOverrideTaskTypeAndIndexAndGetMaster(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'health':
+                'HEALTHY',
+            'networkEndpoints': [
+                {
+                    'ipAddress': '10.2.3.4',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.5',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.6',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.7',
+                    'port': 8470,
+                },
+            ]
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu='test-tpu-1',
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
+
+    resolver.task_type = 'worker'
+    resolver.task_index = 3
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.7:8470')
+
+    self.assertEqual(
+        resolver.master(
+            task_type='worker', task_index=2, rpc_layer='test'),
+        'test://10.2.3.6:8470')
+
+  def testGetDeviceDictAndCoresWithTPUs(self):
+    device_names = [
+        '/job:tpu_worker/task:0/device:TPU:0',
+        '/job:tpu_worker/task:1/device:TPU:1',
+        '/job:tpu_worker/task:2/device:TPU:0',
+        '/job:tpu_worker/task:3/device:TPU:1',
+        '/job:tpu_worker/task:0/device:TPU:4',
+        '/job:tpu_worker/task:1/device:TPU:5',
+        '/job:tpu_worker/task:2/device:TPU:4',
+        '/job:tpu_worker/task:3/device:TPU:5',
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, 'TPU', 1024, 0) for name in device_names
+    ]
+
+    device_details = tpu_cluster_resolver._get_device_dict_and_cores(
+        device_list)
+    self.assertEqual(device_details.total_cores, 8)
+    self.assertEqual(device_details.device_map,
+                     {'0': ['0', '4'],
+                      '1': ['1', '5'],
+                      '2': ['0', '4'],
+                      '3': ['1', '5']})
+
+  def testGetDeviceDictAndCoresWithCPUsAndGPUs(self):
+    device_names = [
+        '/job:tpu_worker/task:0/device:CPU:0',
+        '/job:tpu_worker/task:1/device:CPU:0',
+        '/job:tpu_worker/task:2/device:CPU:0',
+        '/job:tpu_worker/task:3/device:CPU:0',
+        '/job:tpu_worker/task:0/device:GPU:1',
+        '/job:tpu_worker/task:1/device:GPU:1',
+        '/job:tpu_worker/task:2/device:GPU:1',
+        '/job:tpu_worker/task:3/device:GPU:1',
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, 'XLA', 1024, 0) for name in device_names
+    ]
+
+    device_dict, num_cores = tpu_cluster_resolver._get_device_dict_and_cores(
+        device_list)
+    self.assertEqual(num_cores, 0)
+    self.assertEqual(device_dict, {})
+
+  def testVerifySameCoreCount(self):
+    self.assertEqual(
+        tpu_cluster_resolver._verify_and_return_same_core_count(
+            {0: [0, 1, 2, 3, 4, 5, 6, 7]}), 8)
+    self.assertEqual(
+        tpu_cluster_resolver._verify_and_return_same_core_count(
+            {0: [0, 1], 1: [2, 3]}), 2)
+    with self.assertRaises(RuntimeError):
+      tpu_cluster_resolver._verify_and_return_same_core_count(
+          {0: [0], 1: [1, 2]})
+
+  @mock.patch.object(session.BaseSession, 'list_devices')
+  def testNumAcceleratorsSuccess(self, mock_list_devices):
+    device_names = [
+        '/job:tpu_worker/task:0/device:TPU:0',
+        '/job:tpu_worker/task:1/device:TPU:1',
+        '/job:tpu_worker/task:2/device:TPU:0',
+        '/job:tpu_worker/task:3/device:TPU:1',
+        '/job:tpu_worker/task:0/device:TPU:4',
+        '/job:tpu_worker/task:1/device:TPU:5',
+        '/job:tpu_worker/task:2/device:TPU:4',
+        '/job:tpu_worker/task:3/device:TPU:5',
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, 'TPU', 1024, 0) for name in device_names
+    ]
+    mock_list_devices.return_value = device_list
+
+    resolver = cluster_resolver.TPUClusterResolver(tpu='')
+    self.assertEqual(resolver.num_accelerators(), 2)
+
+  @mock.patch.object(session.BaseSession, 'list_devices')
+  def testNumAcceleratorsRetryFailure(self, mock_list_devices):
+    resolver = cluster_resolver.TPUClusterResolver(tpu='')
+    mock_list_devices.side_effect = errors.DeadlineExceededError(
+        None, None, 'timeout')
+    with self.assertRaises(RuntimeError):
+      resolver.num_accelerators()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..57c552ca8f0abd36466932d800d9f1f802d9664c
--- /dev/null
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -0,0 +1,954 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes for different algorithms of reduction and broadcasting."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import six
+
+from tensorflow.python.client import device_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values as value_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import tf_logging as logging
+
+
+def check_destinations(destinations):
+  """Checks whether `destinations` is not empty.
+
+  Args:
+    destinations: a DistributedValues, Variable, string or a list of strings.
+
+  Returns:
+    Boolean which is True if `destinations` is not empty.
+  """
+  # Calling bool() on a ResourceVariable is not allowed.
+  if isinstance(destinations, resource_variable_ops.ResourceVariable):
+    return bool(destinations.device)
+  return bool(destinations)
+
+
+def validate_destinations(destinations):
+  if not isinstance(
+      destinations,
+      (value_lib.DistributedValues, resource_variable_ops.ResourceVariable,
+       value_lib.AggregatingVariable, six.string_types, list, tuple)):
+    raise ValueError("destinations must be one of a `DistributedValues` object,"
+                     " a tf.Variable object, a device string, a list or tuple "
+                     "of device strings")
+
+  if not check_destinations(destinations):
+    raise ValueError("destinations can not be empty")
+
+
+def _make_tensor_into_per_replica(input_tensor):
+  """Converts a single tensor into a PerReplica object."""
+  if isinstance(input_tensor, (tuple, list)):
+    raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object, "
+                     "got %r but expected a object that is not a tuple or list."
+                     % (input_tensor,))
+  if isinstance(input_tensor, value_lib.PerReplica):
+    return input_tensor
+
+  try:
+    device = input_tensor.device
+  except AttributeError:
+    raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object "
+                     "because it doesn't have device set.")
+
+  return value_lib.PerReplica({device: input_tensor})
+
+
+def _normalize_value_destination_pairs(value_destination_pairs):
+  """Converts each tensor into a PerReplica object in the input list."""
+  result = []
+  if not isinstance(value_destination_pairs, (list, tuple)):
+    raise ValueError("`value_destination_pairs` should be a list or tuple")
+  for pair in value_destination_pairs:
+    if not isinstance(pair, tuple):
+      raise ValueError(
+          "Each element of `value_destination_pairs` should be a tuple.")
+    if len(pair) != 2:
+      raise ValueError("Each element of `value_destination_pairs` should be a "
+                       "tuple of size 2.")
+
+    per_replica = _make_tensor_into_per_replica(pair[0])
+    result.append((per_replica, pair[1]))
+  return result
+
+
+def _validate_value_destination_pairs(value_destination_pairs):
+  # TODO(yuefengz): raise exceptions instead of returning False.
+  # pylint: disable=g-missing-docstring
+  if not value_destination_pairs: return False
+  if not isinstance(value_destination_pairs, (list, tuple)): return False
+  if not all(isinstance(pair, tuple) for pair in value_destination_pairs):
+    return False
+  if not all(isinstance(v[0], value_lib.PerReplica)
+             for v in value_destination_pairs):
+    return False
+  return True
+
+
+# TODO(yuefengz): consider calling this function in the caller of
+# CrossDeviceOps.
+def get_devices_from(destinations):
+  if isinstance(destinations, value_lib.DistributedValues):
+    return list(destinations.devices)
+  elif isinstance(destinations, (resource_variable_ops.ResourceVariable,
+                                 value_lib.AggregatingVariable)):
+    return [destinations.device]
+  elif isinstance(destinations, six.string_types):
+    return [device_util.resolve(destinations)]
+  elif isinstance(destinations, (list, tuple)):
+    return [device_util.resolve(destination) for destination in destinations]
+  else:
+    return [destinations.device]
+
+
+def _devices_match(left, right):
+  return set(get_devices_from(left)) == set(get_devices_from(right))
+
+
+def _all_devices_match(value_destination_pairs):
+  if not all(_devices_match(v, d) for v, d in value_destination_pairs):
+    return False
+  if not all(_devices_match(v, value_destination_pairs[0][0])
+             for v, _ in value_destination_pairs[1:]):
+    return False
+  return True
+
+
+def _simple_broadcast(value, destinations):
+  index = {}
+  devices = get_devices_from(destinations)
+  for d in devices:
+    index[d] = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
+        value, d)
+  return value_lib.Mirrored(index)
+
+
+def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn,
+                   reduce_op):
+  # pylint: disable=g-missing-docstring
+  all_values = []
+  count = 0
+  for v in per_replica_value._index.values():  # pylint: disable=protected-access
+    count += 1
+    all_values.append(v)
+  if not all_values:
+    raise ValueError("`per_replica_value` must be non-empty")
+
+  with ops.device(reduce_to_device):
+    with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+      reduced = cross_device_utils.aggregate_tensors_or_indexed_slices(
+          all_values, accumulation_fn)
+      if reduce_op == reduce_util.ReduceOp.MEAN:
+        reduced = cross_device_utils.divide_by_n_tensors_or_indexed_slices(
+            reduced, count)
+      elif reduce_op != reduce_util.ReduceOp.SUM:
+        raise ValueError("`reduce_op` must be Reduce.SUM or Reduce.MEAN.")
+  return reduced
+
+
+class CrossDeviceOps(object):
+  """Base class for cross-device reduction and broadcasting algorithms."""
+
+  def __init__(self):
+    pass
+
+  def reduce(self, reduce_op, per_replica_value, destinations):
+    """Reduce `per_replica_value` to `destinations`.
+
+    It runs the reduction operation defined by `reduce_op` and put the
+    result on `destinations`.
+
+    Args:
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      per_replica_value: a PerReplica object or a tensor with device set.
+      destinations: the reduction destinations.
+
+    Returns:
+      a Mirrored object.
+
+    Raises:
+      ValueError: if per_replica_value is not a PerReplica object.
+    """
+    if not isinstance(per_replica_value, value_lib.PerReplica):
+      per_replica_value = _make_tensor_into_per_replica(per_replica_value)
+
+    validate_destinations(destinations)
+    return self._reduce(reduce_op, per_replica_value, destinations)
+
+  def batch_reduce(self, reduce_op, value_destination_pairs):
+    """Reduce PerReplica objects in a batch.
+
+    Reduce each first element in `value_destination_pairs` to each second
+    element which indicates the destinations.
+
+    Args:
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      value_destination_pairs: a list or a tuple of tuples of PerReplica objects
+        (or tensors with device set if there is one device) and destinations.
+
+    Returns:
+      a list of Mirrored objects.
+
+    Raises:
+      ValueError: if `value_destination_pairs` is not a list or a tuple of
+        tuples of PerReplica objects and destinations
+    """
+    if not _validate_value_destination_pairs(value_destination_pairs):
+      # If the first element of each pair is a tensor, we try to turn it into a
+      # PerReplica object.
+      value_destination_pairs = _normalize_value_destination_pairs(
+          value_destination_pairs)
+
+    for _, d in value_destination_pairs:
+      validate_destinations(d)
+
+    return self._batch_reduce(reduce_op, value_destination_pairs)
+
+  def broadcast(self, tensor, destinations):
+    """Broadcast the `tensor` to destinations.
+
+    Args:
+      tensor: the tensor to broadcast.
+      destinations: the broadcast destinations.
+
+    Returns:
+      a Mirrored object.
+    """
+    validate_destinations(destinations)
+    return self._broadcast(tensor, destinations)
+
+  def _reduce(self, reduce_op, per_replica_value, destinations):
+    raise NotImplementedError(
+        "_reduce method must be implemented in descendants.")
+
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
+    raise NotImplementedError(
+        "_batch_reduce method must be implemented in descendants.")
+
+  def _broadcast(self, tensor, destinations):
+    return _simple_broadcast(tensor, destinations)
+
+
+class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
+  """Always do reduction to one device first and then do broadcasting.
+
+    Batch reduction is done by reduction on each element one by one.
+  """
+
+  def __init__(self, reduce_to_device=None, accumulation_fn=math_ops.add_n):
+    """Constructor.
+
+    Args:
+      reduce_to_device: the intermediate device to reduce to. If None, reduce
+        to the first device in `destinations` of the reduce() method.
+      accumulation_fn: a function that does accumulation.
+    """
+    self.reduce_to_device = reduce_to_device
+    self.accumulation_fn = accumulation_fn
+    super(ReductionToOneDeviceCrossDeviceOps, self).__init__()
+
+  def _reduce(self, reduce_op, per_replica_value, destinations):
+    if check_destinations(destinations):
+      devices = get_devices_from(destinations)
+    else:
+      devices = get_devices_from(per_replica_value)
+    reduce_to_device = self.reduce_to_device or devices[0]
+    reduced = _simple_reduce(per_replica_value, reduce_to_device,
+                             self.accumulation_fn, reduce_op)
+    return self.broadcast(reduced, devices)
+
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
+    return [
+        self._reduce(reduce_op, t, destinations=v)
+        for t, v in value_destination_pairs
+    ]
+
+
+def _group_value_by_device(per_replica_values):
+  """Group values into sublists by their devices.
+
+  This grouping is needed to call the all-reduce library because it expects a
+  list of the following form:
+    [[(grad0_gpu0, v0_gpu0), (grad1_gpu0, v1_gpu0), (grad2_gpu0, v2_gpu0) ...],
+     [(grad0_gpu1, v0_gpu1), (grad1_gpu1, v1_gpu1), (grad2_gpu1, v2_gpu1) ...],
+     [(grad0_gpu2, v0_gpu2), (grad1_gpu0, v1_gpu2), (grad2_gpu0, v2_gpu2) ...],
+     ...
+    ]
+
+  Args:
+    per_replica_values: a list of PerReplica obejcts.
+
+  Returns:
+    a list of lists, each sublist has components for its corresponding device of
+      PerReplica objects, paired with a None.
+  """
+  destinations = per_replica_values[0].devices
+  grouped = [[] for _ in range(len(destinations))]
+  for per_replica_value in per_replica_values:
+    # pylint: disable=protected-access
+    for i, v in enumerate(per_replica_value._index.values()):
+      assert per_replica_value.devices == destinations
+      grouped[i].append((v, None))
+  return grouped
+
+
+def _ungroup_and_make_mirrored(grouped_reduced,
+                               destinations,
+                               reduce_op,
+                               num_between_graph_workers=1):
+  """Ungroup results from all-reduce and make Mirrored objects.
+
+  Each all-reduce result will be divided by the number of destinations before
+  Mirrored objects are created if reduce_op is "mean".
+
+  Args:
+    grouped_reduced: a list of lists, each sublist has components for each
+      device, paired with a None. It is the result from
+      cross_device_utils.aggregate_gradients_using*.
+    destinations: a list of device strings for returned Mirrored objects.
+    reduce_op: Indicates how values will be aggregated. Accepted values
+      are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+    num_between_graph_workers: number of workers in the between-graph
+      replication.
+
+  Returns:
+    a list of Mirrored objects.
+  """
+  index = [{} for _ in range(len(grouped_reduced[0]))]
+  for d, per_replica_reduced in enumerate(grouped_reduced):
+    for i, (v, _) in enumerate(per_replica_reduced):
+      if reduce_op == reduce_util.ReduceOp.MEAN:
+        index[i][destinations[d]] = v / (
+            len(destinations) * num_between_graph_workers)
+      else:
+        index[i][destinations[d]] = v
+  return [value_lib.Mirrored(v) for v in index]
+
+
+class ConcatAndSplitPacker(object):
+  """Concatenate and split tensors for reduction."""
+
+  def __init__(self, num_packs=1):
+    """Initialize the ConcatAndSplitPacker object.
+
+    Args:
+      num_packs: specifies the number of split packs that will be
+        formed.
+
+    Raises:
+      ValueError: if num_packs is not greater than 0.
+    """
+    if num_packs <= 0:
+      raise ValueError("num_packs must be greater than zero.")
+    self.num_packs = num_packs
+
+  def pack(self, grouped_grads_and_vars):
+    """Pack tensors."""
+    self.grouped_grads_and_vars = grouped_grads_and_vars
+    self.all_device_shapes = []
+    self.all_device_sizes = []
+
+    device_grad_packs = []
+    for device_grads_and_vars in grouped_grads_and_vars:
+      with ops.colocate_with(device_grads_and_vars[0][0]):
+        # Flatten all the grads.
+        flat_grads = [
+            array_ops.reshape(g, [-1]) for g, _ in device_grads_and_vars
+        ]
+        # Remember the original shape of all the grads.
+        device_shapes = [array_ops.shape(g) for g, _ in device_grads_and_vars]
+        # Remember the original sizes of all the grads.
+        device_sizes = [array_ops.size(g) for g, _ in device_grads_and_vars]
+        # Concat all the flat grads into a big flat tensor.
+        concat_grads = array_ops.concat(flat_grads, 0)
+
+        # Split the big tensor into num_splits packs. In cases where the
+        # total size is not divisible num_splits, the last pack gets
+        # more elements.
+        # TODO(zhengxq): it is also possible to optimize away all the concat
+        # as well.
+        num_splits = self.num_packs
+
+        # The array_ops.size function will sometimes remove static shapes. So if
+        # all gradient shapes are defined, we use another method to get the
+        # total size.
+        # TODO(yuefengz): move this logic to array_ops.size.
+        if all(g.shape.is_fully_defined() for g, _ in device_grads_and_vars):
+          total_grad_size = sum(
+              [g.shape.num_elements() for g, _ in device_grads_and_vars])
+        else:
+          total_grad_size = array_ops.size(concat_grads)
+
+        split_size = total_grad_size // num_splits
+        split_size_last = total_grad_size - split_size * (num_splits - 1)
+        split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
+        grad_packs = array_ops.split(concat_grads, split_sizes)
+
+        # Ready to aggregate the repacked gradients, with fake variables.
+        # TODO(zhengxq): It is hacky to have to use fake variables.
+        # We should remove the need for variables in
+        # aggregate_gradients_using*.
+        device_grad_packs.append(zip(grad_packs, [None] * num_splits))
+        self.all_device_shapes.append(device_shapes)
+        self.all_device_sizes.append(device_sizes)
+
+    return device_grad_packs
+
+  def unpack(self, summed_device_grad_packs):
+    """Reverse the pack."""
+    aggregated_device_grads = []
+    for (summed_device_grad_packs,
+         device_grads_and_vars, device_shapes, device_sizes) in zip(
+             summed_device_grad_packs, self.grouped_grads_and_vars,
+             self.all_device_shapes, self.all_device_sizes):
+      # pylint: enable=line-too-long
+      # Reverse the packing operations in the previous steps. Form the
+      # summed gradients back into their original shapes.
+      with ops.colocate_with(summed_device_grad_packs[0][0]):
+        # Form a list of the summed grad packs.
+        device_grad_packs = [g for g, _ in summed_device_grad_packs]
+
+        # Concat them back into a big flat tensor.
+        device_grads_concat = array_ops.concat(device_grad_packs, 0)
+
+        # Split the tensors back into their original sizes.
+        grads_with_sizes = array_ops.split(device_grads_concat, device_sizes)
+
+        # Reshape the tensors back into their original shapes.
+        grads_with_shapes = [
+            array_ops.reshape(grad, shape)
+            for shape, grad in zip(device_shapes, grads_with_sizes)
+        ]
+
+        # Form the list with the original list of variables.
+        summed_device_grads = [
+            (g, v) for g, (_, v) in zip(grads_with_shapes,
+                                        device_grads_and_vars)
+        ]
+        aggregated_device_grads.append(summed_device_grads)
+    return aggregated_device_grads
+
+
+class AggregateSmallTensorPacker(object):
+  """Concatenate small gradient tensors together for reduction."""
+
+  def __init__(self,
+               agg_small_grads_max_bytes=1048576,
+               agg_small_grads_max_group=16):
+    """Initialize the AggregateSmallTensorPacker object.
+
+    Args:
+      agg_small_grads_max_bytes: largest tensor eligible for aggregation,
+        in number of bytes.
+      agg_small_grads_max_group: largest permitted aggregation of small
+        tensors.
+
+    Raises:
+      ValueError: if `agg_small_grads_max_bytes` or `agg_small_grads_max_group`
+        is not greater than 0.
+    """
+    if agg_small_grads_max_bytes <= 0 or agg_small_grads_max_group <= 0:
+      raise ValueError("agg_small_grads_max_bytes and agg_small_grads_max_group"
+                       " should both be greater than zero.")
+    self.agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self.agg_small_grads_max_group = agg_small_grads_max_group
+
+  def pack(self, grouped_grads_and_vars):
+    """Aggregate small tensors."""
+    if (self.agg_small_grads_max_bytes > 0 and
+        self.agg_small_grads_max_group > 0):
+      device_grads, self.packing = cross_device_utils.pack_small_tensors(
+          grouped_grads_and_vars,
+          max_bytes=self.agg_small_grads_max_bytes,
+          max_group=self.agg_small_grads_max_group)
+    return device_grads
+
+  def unpack(self, summed_device_grad_packs):
+    """Reverse the aggregation process."""
+    return cross_device_utils.unpack_small_tensors(summed_device_grad_packs,
+                                                   self.packing)
+
+
+def _pack_tensors(device_grads,
+                  num_packs=0,
+                  agg_small_grads_max_bytes=0,
+                  agg_small_grads_max_group=0):
+  """Pack tensors if specified."""
+  if num_packs > 0:
+    tensor_packer = ConcatAndSplitPacker(num_packs)
+    device_grad_packs = tensor_packer.pack(device_grads)
+  elif agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
+    tensor_packer = AggregateSmallTensorPacker(agg_small_grads_max_bytes,
+                                               agg_small_grads_max_group)
+    device_grad_packs = tensor_packer.pack(device_grads)
+  else:
+    tensor_packer = None
+    device_grad_packs = device_grads
+  return device_grad_packs, tensor_packer
+
+
+def _unpack_tensors(reduced, tensor_packer=None):
+  """Unpack tensors if they are packed before all-reduce."""
+  if tensor_packer:
+    return tensor_packer.unpack(reduced)
+  return reduced
+
+
+class AllReduceCrossDeviceOps(CrossDeviceOps):
+  """Reduction using all reduce."""
+
+  def __init__(self,
+               all_reduce_alg="nccl",
+               num_packs=1,
+               agg_small_grads_max_bytes=0,
+               agg_small_grads_max_group=10):
+    """All-reduce implementation of CrossDeviceOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation:
+      1) If `num_packs` is non-zero, pack values into
+        `num_packs` splits.
+      2) Otherwise, if `agg_small_grads_max_bytes` > 0 and
+        `agg_small_grads_max_group` > 0, aggregate values smaller than
+        `agg_small_grads_max_bytes` into groups with at most
+        `agg_small_grads_max_group` values.
+      3) Otherwise, no repacking or grouping will happen.
+
+    Args:
+      all_reduce_alg: the all-reduce algorithm to use, currently only "nccl" or
+        "hierarchical_copy" are supported.
+      num_packs: see above.
+      agg_small_grads_max_bytes: see above.
+      agg_small_grads_max_group: see above.
+        tensors.
+    """
+    self._all_reduce_alg = all_reduce_alg
+    self._num_packs = num_packs
+    self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self._agg_small_grads_max_group = agg_small_grads_max_group
+    super(AllReduceCrossDeviceOps, self).__init__()
+
+  def _reduce(self, reduce_op, per_replica_value, destinations):
+    contains_indexed_slices = cross_device_utils.contains_indexed_slices(
+        per_replica_value)
+    if (_devices_match(per_replica_value, destinations)
+        and not context.executing_eagerly()
+        and not contains_indexed_slices):
+      return self._batch_all_reduce(reduce_op, [per_replica_value])[0]
+    else:
+      if contains_indexed_slices:
+        logging.log_first_n(
+            logging.WARN,
+            "Efficient allreduce is not supported for IndexedSlices.", 10)
+
+      if check_destinations(destinations):
+        devices = get_devices_from(destinations)
+      else:
+        devices = get_devices_from(per_replica_value)
+      reduce_to_device = devices[0]
+      reduced = _simple_reduce(per_replica_value, reduce_to_device,
+                               math_ops.add_n, reduce_op)
+      return self.broadcast(reduced, devices)
+
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
+    all_devices_match = _all_devices_match(value_destination_pairs)
+    contains_indexed_slices = cross_device_utils.contains_indexed_slices(
+        value_destination_pairs)
+    if (all_devices_match and not context.executing_eagerly()
+        and not contains_indexed_slices):
+      return self._batch_all_reduce(reduce_op,
+                                    [v[0] for v in value_destination_pairs])
+    else:
+      if not all_devices_match:
+        logging.log_first_n(logging.WARN,
+                            "Efficient batch_reduce is not supported if "
+                            "destinations are different.",
+                            10)
+
+      return [
+          self._reduce(reduce_op, t, destinations=v)
+          for t, v in value_destination_pairs
+      ]
+
+  def _batch_all_reduce(self, reduce_op, per_replica_values):
+    """All reduce algorithm in a batch."""
+    logging.log_first_n(
+        logging.INFO, "batch_all_reduce invoked for batches size = %d with "
+        "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
+        "agg_small_grads_max_group = %d" %
+        (len(per_replica_values), self._all_reduce_alg, self._num_packs,
+         self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
+    destinations = per_replica_values[0].devices
+    grouped = _group_value_by_device(per_replica_values)
+
+    device_grad_packs, tensor_packer = _pack_tensors(
+        grouped, self._num_packs, self._agg_small_grads_max_bytes,
+        self._agg_small_grads_max_group)
+
+    # The actual aggregation of the repacked gradients. Note that they are
+    # sharded among different aggregation trees. So it is important to strike
+    # the balance on num_splits.
+    if self._all_reduce_alg == "nccl":
+      # TODO(yuefengz): merge this into the all-reduce library.
+      reduced = cross_device_utils.aggregate_gradients_using_nccl(
+          device_grad_packs)
+    else:
+      # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
+      # order.
+      reduced = (
+          cross_device_utils.aggregate_gradients_using_hierarchical_copy(
+              destinations, device_grad_packs))
+
+    reduced = _unpack_tensors(reduced, tensor_packer)
+    return _ungroup_and_make_mirrored(reduced, per_replica_values[0].devices,
+                                      reduce_op)
+
+
+# For compatibility with code using the old name of `AllReduceCrossDeviceOps`.
+AllReduceCrossTowerOps = AllReduceCrossDeviceOps
+
+
+AllReduceSpecTuple = collections.namedtuple("AllReduceSpecTuple",
+                                            "alg shards limit")
+
+
+class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
+  """All-reduce algorithms for distributed TensorFlow."""
+
+  def __init__(self,
+               worker_devices,
+               num_gpus_per_worker,
+               all_reduce_spec=("pscpu/pscpu", 2, -1),
+               num_packs=0,
+               agg_small_grads_max_bytes=0,
+               agg_small_grads_max_group=10):
+    """Initialize the all-reduce algorithm.
+
+    Args:
+      worker_devices: a list of device strings for workers participating in
+        all-reduce.
+      num_gpus_per_worker: number of GPU devices per worker.
+      all_reduce_spec: a tuple or a named tuple or a list of tuples specifying
+        the all-reduce algorithm.
+        1. The first element of a tuple is the name of the all-reduce algorithm.
+        Valid algorithm names are: "nccl", "nccl/xring", "nccl/rechd",
+        "nccl/pscpu", "xring", "pscpu", "psgpu", "pscpu/pscpu". Algorithms with
+        a "/" are hierarchical, so two all-reduces are executed, the first one
+        aggregates tensors within a worker and the second aggregates across
+        workers.
+        2. The second element of a tuple is the number of shards when doing
+        all-reduce. Let's say its values is M, each tensor after packing will be
+        split into M shards and then M parallel all-reduces would be performed
+        before finally they are concatenated backed into a complete tensor.
+        3. The third element is the maximum size of tensors that will be
+        applicable for the algorithm specified by the first element. For
+        example, if all_reduce_spec=[("nccl", 2, 1024), ("pscpu/pscpu", 2, -1)],
+        tensors with size not larger than 1024 bytes will be applied a 2-shard
+        "nccl" all-reduce and other tensors will be applied a 2-shard
+        "pscpu/pscpu" algorithm. The third elements should be in increasing
+        order across tuples and end with -1 which indicates infinity.
+      num_packs: see AllReduceCrossDeviceOps.
+      agg_small_grads_max_bytes: see AllReduceCrossDeviceOps.
+      agg_small_grads_max_group: see AllReduceCrossDeviceOps.
+    """
+    self._worker_devices = worker_devices
+    self._num_gpus_per_worker = num_gpus_per_worker
+    super(MultiWorkerAllReduce, self).__init__(
+        num_packs=num_packs,
+        agg_small_grads_max_bytes=agg_small_grads_max_bytes,
+        agg_small_grads_max_group=agg_small_grads_max_group)
+
+    def validate_and_complete_spec(spec):
+      """Validate and complete the all-reduce spec."""
+      # TODO(yuefengz): support namedtuple.
+      if not isinstance(spec, tuple):
+        raise ValueError(
+            "A tuple is expected for all-reduce spec: %r" % all_reduce_spec)
+      if not spec or len(spec) > 3:
+        raise ValueError(
+            "Too many elements in the all-reduce spec tuple: %r" % spec)
+      if len(spec) == 1:
+        return AllReduceSpecTuple(spec[0], 1, -1)
+      elif len(spec) == 2:
+        return AllReduceSpecTuple(spec[0], spec[1], -1)
+      else:
+        return AllReduceSpecTuple(*spec)
+
+    self._all_reduce_spec = []
+    if isinstance(all_reduce_spec, six.string_types):
+      self._all_reduce_spec.append(AllReduceSpecTuple(all_reduce_spec, 1, -1))
+    elif isinstance(all_reduce_spec, tuple):
+      self._all_reduce_spec.append(validate_and_complete_spec(all_reduce_spec))
+    elif isinstance(all_reduce_spec, list):
+      self._all_reduce_spec = [
+          validate_and_complete_spec(spec) for spec in all_reduce_spec
+      ]
+
+  def _batch_all_reduce(self, reduce_op, per_replica_values):
+    """All reduce algorithm in a batch."""
+    logging.log_first_n(
+        logging.INFO,
+        "distributed batch_all_reduce invoked for batches size = %d with "
+        "allreduce_spec = %r, num_packs = %d, agg_small_grads_max_bytes = %d "
+        "and agg_small_grads_max_group = %d" %
+        (len(per_replica_values), self._all_reduce_spec, self._num_packs,
+         self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
+
+    destinations = sorted(per_replica_values[0].devices)
+    device_grads = _group_value_by_device(per_replica_values)
+
+    # The all reduce library requires fully defined shapes.
+    # TODO(yuefengz): when tensor sharding is not needed, static shapes are not
+    # required as well.
+    for device_grad in device_grads:
+      for grad, _ in device_grad:
+        if not grad.shape.is_fully_defined():
+          raise ValueError("Shape is unknown for node %r" % grad)
+
+    remaining_grads = device_grads
+    aggregated_grads = []
+    for spec_tuple in self._all_reduce_spec:
+      if spec_tuple.limit < 0:
+        this_grads = remaining_grads
+        remaining_grads = []
+      else:
+        (this_grads, remaining_grads) = cross_device_utils.split_grads_by_size(
+            spec_tuple.limit, remaining_grads)
+      if this_grads:
+        device_grad_packs, tensor_packer = _pack_tensors(
+            this_grads, self._num_packs, self._agg_small_grads_max_bytes,
+            self._agg_small_grads_max_group)
+        range_agg_grads = cross_device_utils.sum_gradients_all_reduce(
+            self._worker_devices, device_grad_packs, len(self._worker_devices),
+            spec_tuple.alg, spec_tuple.shards, range(self._num_gpus_per_worker))
+        range_agg_grads = _unpack_tensors(range_agg_grads, tensor_packer)
+
+        if not aggregated_grads:
+          aggregated_grads = range_agg_grads
+        else:
+          assert len(aggregated_grads) == len(range_agg_grads)
+          for i in range(len(aggregated_grads)):
+            aggregated_grads[i] += range_agg_grads[i]
+    assert not remaining_grads
+
+    return _ungroup_and_make_mirrored(aggregated_grads, destinations,
+                                      reduce_op)
+
+
+# TODO(yuefengz): support in-graph collective all-reduce.
+class CollectiveAllReduce(CrossDeviceOps):
+  """All-reduce cross device ops using collective ops.
+
+  In the between-graph replicated training, it will still do all-reduces across
+  all workers and then put results on the right destinations.
+  """
+
+  def __init__(self,
+               num_workers=1,
+               num_gpus_per_worker=0,
+               all_reduce_merge_scope=32,
+               collective_keys=None):
+    """Initializes the object.
+
+    Args:
+      num_workers: number of workers in the between-graph replicated training.
+      num_gpus_per_worker: number of GPUs per worker.
+      all_reduce_merge_scope: size of groups into which to partition consecutive
+        gradients grouped under a common 'allreduce' name scope. This is useful
+        for some optimization of collective ops.
+      collective_keys: an optional CollectiveKey object.
+    """
+    self._num_workers = num_workers
+    self._num_gpus_per_worker = num_gpus_per_worker
+    self._all_reduce_merge_scope = all_reduce_merge_scope
+    self._collective_keys = (collective_keys or
+                             cross_device_utils.CollectiveKeys())
+    super(CollectiveAllReduce, self).__init__()
+
+  # TODO(yuefengz, tucker): is indexed slices supported by collective ops?
+  def _reduce(self, reduce_op, per_replica_value, destinations):
+    if cross_device_utils.contains_indexed_slices(per_replica_value):
+      raise ValueError(
+          "`IndexSlices` is not supported for Collective All-Reduce.")
+    if context.executing_eagerly():
+      raise ValueError(
+          "Eager execution is not supported for Collective All-Reduce")
+
+    all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
+    if _devices_match(per_replica_value, destinations):
+      return all_reduced
+    else:
+      index = {}
+      for d in get_devices_from(destinations):
+        # pylint: disable=protected-access
+        if d in all_reduced._index:
+          index[d] = all_reduced._index[d]
+        else:
+          with ops.control_dependencies(list(
+              all_reduced._index.values())), ops.device(d):
+            index[d] = array_ops.identity(list(all_reduced._index.values())[0])
+
+      return value_lib.Mirrored(index)
+
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
+    if cross_device_utils.contains_indexed_slices(value_destination_pairs):
+      raise ValueError(
+          "`IndexSlices` is not supported for Collective All-Reduce.")
+    if context.executing_eagerly():
+      raise ValueError(
+          "Eager execution is not supported for Collective All-Reduce")
+
+    all_devices_match = _all_devices_match(value_destination_pairs)
+    if all_devices_match:
+      return self._batch_all_reduce(reduce_op,
+                                    [v[0] for v in value_destination_pairs])
+    else:
+      if not all_devices_match:
+        logging.log_first_n(
+            logging.WARN, "Efficient batch_reduce is not supported if "
+            "destinations are different.", 10)
+
+      return [
+          self._reduce(reduce_op, t, destinations=v)
+          for t, v in value_destination_pairs
+      ]
+
+  def _batch_all_reduce(self, reduce_op, per_replica_values):
+    """All-reduce across all workers in a batch."""
+    if context.executing_eagerly():
+      raise ValueError(
+          "Eager execution with collective ops is not supported yet.")
+
+    logging.log_first_n(
+        logging.INFO, "Collective All-reduce invoked with batches size = %d, "
+        "num_workers = %d" % (len(per_replica_values), self._num_workers), 10)
+
+    grouped_by_device = _group_value_by_device(per_replica_values)
+
+    grouped_by_var = list(zip(*grouped_by_device))
+    # grouped_by_var is grouped by variables and takes the following format:
+    # [((grad0_gpu0, v0_gpu0), (grad0_gpu1, v0_gpu1), (grad0_gpu2, v0_gpu2) ..),
+    #  ((grad1_gpu0, v1_gpu0), (grad1_gpu1, v1_gpu1), (grad1_gpu0, v1_gpu2) ..),
+    #  ((grad2_gpu0, v2_gpu0), (grad2_gpu1, v2_gpu1), (grad2_gpu0, v2_gpu2) ..),
+    #  ...
+    # ]
+    chunked_gv = [
+        grouped_by_var[x:x + self._all_reduce_merge_scope]
+        for x in range(0, len(grouped_by_var), self._all_reduce_merge_scope)
+    ]
+
+    reduced_gv_list = []
+    for chunk in chunked_gv:
+      with ops.name_scope("allreduce"):
+        for grad_and_vars in chunk:
+          scaled_grads = [g for g, _ in grad_and_vars]
+          collective_reduced = cross_device_utils.build_collective_reduce(
+              scaled_grads, self._num_workers, self._collective_keys, "Add",
+              "Id")
+          result = []
+          for (_, v), g in zip(grad_and_vars, collective_reduced):
+            result.append([g, v])
+          reduced_gv_list.append(result)
+
+    new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
+    return _ungroup_and_make_mirrored(
+        new_device_grads,
+        per_replica_values[0].devices,
+        reduce_op,
+        num_between_graph_workers=self._num_workers)
+
+
+_dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
+               [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
+
+
+def _has_dgx1_like_links(gpu_links):
+  if not gpu_links:
+    return False
+  # TODO(yuefengz): figure out the right topology for hierarchial copy if
+  # number of gpus are less than 8.
+  if len(gpu_links) < 8:
+    return False
+  for i, (gpu_link, dgx1_link) in enumerate(zip(gpu_links, _dgx1_links)):
+    if (set(gpu_link) != set(dgx1_link) and
+        set(gpu_link) != set(dgx1_link + [i])):
+      return False
+  return True
+
+
+def _choose_all_reduce_algorithm(device_links):
+  if _has_dgx1_like_links(device_links):
+    logging.info("Configured hierarchical_copy with num_packs=%d",
+                 len(device_links))
+    return AllReduceCrossDeviceOps(
+        "hierarchical_copy", num_packs=len(device_links))
+  else:
+    logging.info("Configured nccl all-reduce.")
+    return AllReduceCrossDeviceOps("nccl", num_packs=1)
+
+
+def choose_the_best(devices, session_config=None):
+  """Find the best subclass of CrossDeviceOps given a session config.
+
+  Args:
+    devices: a list of devices passed to `tf.distribute.Strategy`.
+    session_config: a `tf.ConfigProto` or `None`. If `None`, it will make
+      decision based on all local devices.
+
+  Returns:
+    A subclass of `CrossDeviceOps`.
+  """
+  requested_devices = set([device_util.canonicalize(d) for d in devices])
+  machine_devices = device_lib.list_local_devices(session_config=session_config)
+  using_devices = []
+  for d in machine_devices:
+    if device_util.canonicalize(d.name) in requested_devices:
+      using_devices.append(d)
+    else:
+      logging.info(
+          "Device is available but not used by distribute strategy: %s", d.name)
+
+  if len(using_devices) != len(requested_devices):
+    logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
+                    "to TensorFlow.")
+    return ReductionToOneDeviceCrossDeviceOps()
+
+  if any(d.device_type.lower() != "gpu" for d in using_devices):
+    logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
+                    "to TensorFlow.")
+    return ReductionToOneDeviceCrossDeviceOps()
+
+  device_links = [[] for _ in range(len(using_devices))]
+  for i, device in enumerate(using_devices):
+    for link in device.locality.links.link:
+      device_links[i].append(link.device_id)
+
+  return _choose_all_reduce_algorithm(device_links)
diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0faadd7e0cfe69bf8c80399574dd67be53ebcfe0
--- /dev/null
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -0,0 +1,671 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for cross_device_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as pycoll
+import threading
+
+from tensorflow.python.distribute import all_reduce
+from tensorflow.python.distribute import values as value_lib
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import collective_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nccl_ops
+
+
+def aggregate_gradients_using_nccl(replica_grads):
+  """Aggregate gradients using nccl allreduce."""
+  agg_all_g_and_v = []
+  for single_g_and_v in zip(*replica_grads):
+    single_grads = [g for g, _ in single_g_and_v]
+    agg_grads = nccl_ops.all_sum(single_grads)
+    agg_all_g_and_v.append(
+        [(g, v) for g, (_, v) in zip(agg_grads, single_g_and_v)])
+
+  agg_all_g_and_v = list(zip(*agg_all_g_and_v))
+
+  return agg_all_g_and_v
+
+
+def aggregate_gradients_using_hierarchical_copy(avail_devices, replica_grads):
+  """Aggregate gradients using hierarchical copies.
+
+  Args:
+    avail_devices: available GPU devices.
+    replica_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over replicas. The inner list is over individual gradients.
+
+  Returns:
+    The list of (aggregated_gradient, variable), where the gradient has been
+      summed across all replicas and the variable is chosen from the first
+      replica.
+  """
+  # This only works for DGX-1 type of machine topology
+  # Device peer to peer matrix
+  # DMA: 0 1 2 3 4 5 6 7
+  # 0:   Y Y Y Y Y N N N
+  # 1:   Y Y Y Y N Y N N
+  # 2:   Y Y Y Y N N Y N
+  # 3:   Y Y Y Y N N N Y
+  # 4:   Y N N N Y Y Y Y
+  # 5:   N Y N N Y Y Y Y
+  # 6:   N N Y N Y Y Y Y
+  # 7:   N N N Y Y Y Y Y
+  agg_grads = []
+  num_devices = len(avail_devices)
+  # In the special case of DGX-1 machine topology, the two groups have equal
+  # size.
+  group_size = num_devices // 2
+  for i, single_grads in enumerate(zip(*replica_grads)):
+    group_0_main_device = i % num_devices
+    group_1_main_device = (group_0_main_device + group_size) % num_devices
+    if group_0_main_device < group_size:
+      group_0_begin = 0
+      group_1_begin = group_size
+    else:
+      group_0_begin = group_size
+      group_1_begin = 0
+
+    # Aggregate the first group.
+    group_0_device_grads = single_grads[group_0_begin:
+                                        group_0_begin + group_size]
+    with ops.device(avail_devices[group_0_main_device]):
+      group_0_agg_grads, _ = aggregate_single_gradient_using_copy(
+          group_0_device_grads, False, False)
+
+    # Aggregate the second group.
+    group_1_device_grads = single_grads[group_1_begin:
+                                        group_1_begin + group_size]
+    with ops.device(avail_devices[group_1_main_device]):
+      group_1_agg_grads, _ = aggregate_single_gradient_using_copy(
+          group_1_device_grads, False, False)
+
+    # Aggregate between the groups.
+    with ops.device(avail_devices[group_0_main_device]):
+      (agg_total_grads, _), _ = aggregate_single_gradient_using_copy(
+          [group_0_agg_grads, group_1_agg_grads], False, False)
+
+    # Broadcast the result back into the root of each group.
+    with ops.device(avail_devices[group_0_main_device]):
+      group_0_agg_grads_bcast = array_ops.identity(agg_total_grads)
+    with ops.device(avail_devices[group_1_main_device]):
+      group_1_agg_grads_bcast = array_ops.identity(agg_total_grads)
+
+    agg_grads_bcast = []
+    for j in range(len(single_grads)):
+      with ops.device(avail_devices[j]):
+        # Broadcast the result back to each member in the group from the root.
+        if (group_0_main_device < group_size) == (j < group_size):
+          src_device_grad = group_0_agg_grads_bcast
+        else:
+          src_device_grad = group_1_agg_grads_bcast
+        agg_grads_bcast.append(array_ops.identity(src_device_grad))
+
+    agg_grads.append(
+        [(g, v) for g, (_, v) in zip(agg_grads_bcast, single_grads)])
+
+  agg_grads = list(zip(*agg_grads))
+
+  return agg_grads
+
+
+def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
+                                         check_inf_nan):
+  """Calculate the average gradient for a shared variable across all replicas.
+
+  Note that this function provides a synchronization point across all replicas.
+
+  Args:
+    grad_and_vars: A list or tuple of (gradient, variable) tuples. Each
+      (gradient, variable) pair within the outer list represents the gradient
+      of the variable calculated for a single replica, and the number of pairs
+      equals the number of replicas.
+    use_mean: if True, mean is taken, else sum of gradients is taken.
+    check_inf_nan: check grads for nans and infs.
+
+  Returns:
+    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
+      gradient has been averaged across all replicas. The variable is chosen
+      from the first replica. The has_nan_or_inf indicates the grads has nan or
+      inf.
+  """
+  grads = [g for g, _ in grad_and_vars]
+  grad = math_ops.add_n(grads)
+
+  if use_mean and len(grads) > 1:
+    grad = array_ops.multiply(grad, 1.0 / len(grads))
+
+  v = grad_and_vars[0][1]
+  if check_inf_nan:
+    has_nan_or_inf = array_ops.logical_not(
+        array_ops.reduce_all(array_ops.is_finite(grads)))
+    return (grad, v), has_nan_or_inf
+  else:
+    return (grad, v), None
+
+
+def group_device_names(devices, group_size):
+  """Group device names into groups of group_size.
+
+  Args:
+    devices: a list of canonical device strings.
+    group_size: integer which is equal to or greater than 1.
+
+  Returns:
+    list of lists of devices, where each inner list is group_size long,
+      and each device appears at least once in an inner list.  If
+      len(devices) % group_size == 0 then each device will appear exactly once.
+
+  Raises:
+    ValueError: if group_size > len(devices)
+  """
+  num_devices = len(devices)
+  if group_size > num_devices:
+    raise ValueError(
+        'only %d devices, but group_size=%d' % (num_devices, group_size))
+  num_groups = (
+      num_devices // group_size + (1 if (num_devices % group_size != 0) else 0))
+  groups = [[] for i in range(num_groups)]
+  for i in range(num_groups * group_size):
+    groups[i % num_groups].append(devices[i % num_devices])
+  return groups
+
+
+def split_grads_by_size(threshold_size, device_grads):
+  """Break gradients into two sets according to tensor size.
+
+  Args:
+    threshold_size: int size cutoff for small vs large tensor.
+    device_grads: List of lists of (gradient, variable) tuples.  The outer
+        list is over devices. The inner list is over individual gradients.
+
+  Returns:
+    small_grads: Subset of device_grads where shape is <= threshold_size
+       elements.
+    large_grads: Subset of device_grads where shape is > threshold_size
+       elements.
+  """
+  small_grads = []
+  large_grads = []
+  for dl in device_grads:
+    small_dl = []
+    large_dl = []
+    for (g, v) in dl:
+      tensor_size = g.get_shape().num_elements()
+      if tensor_size <= threshold_size:
+        small_dl.append([g, v])
+      else:
+        large_dl.append([g, v])
+    if small_dl:
+      small_grads.append(small_dl)
+    if large_dl:
+      large_grads.append(large_dl)
+  return small_grads, large_grads
+
+
+# threading.Lock() and threading.local() cannot be pickled and therefore cannot
+# be a field of CollectiveKeys. Right now _thread_local is not necessary to be
+# an instance member of CollectiveKeys since we always create a new thread for
+# each replica.
+_lock = threading.Lock()
+_thread_local = threading.local()
+
+
+# TODO(yuefengz): use random key starts to avoid reusing keys?
+class CollectiveKeys(object):
+  """Class that manages collective keys.
+
+  We need to manage three different keys for collective:
+
+  *Group key*: an integer key to identify the set of cooperative devices.
+  Collective ops work under the same set of devices must using the same group
+  key.
+
+  *Instance key*: an integer key to identify the set of same counterpart of
+  tensors on different devices in a device group that need to be all-reduced.
+
+  "Graph key": an integer key that is unique key graph. This is used to support
+  multiple graphs per client session. It must be non-zero and set in the
+  `config` argument of each call to `session.run`.
+  """
+
+  def __init__(self,
+               group_key_start=1,
+               instance_key_start=100,
+               instance_key_with_id_start=10000):
+    """Initializes the object.
+
+    Args:
+      group_key_start: the starting integer of group key.
+      instance_key_start: the starting integer of instance key.
+      instance_key_with_id_start: the starting integer of instance key that is
+        recorded with an id.
+    """
+    self._group_key = group_key_start
+    self._group_key_table = dict()
+
+    # For instance keys with ids
+    self._instance_key_id_to_key_table = dict()
+    self._instance_key_with_id_counter = instance_key_with_id_start
+
+    # For instance keys without ids
+    self._instance_key_start = instance_key_start
+
+  def _get_thread_local_object(self):
+    # We make instance key without key ids thread local so that it will work
+    # with MirroredStrategy and distribute coordinator.
+    if not hasattr(_thread_local, 'instance_key'):
+      _thread_local.instance_key = self._instance_key_start
+    return _thread_local
+
+  def get_group_key(self, devices):
+    """Returns a group key for the set of devices.
+
+    Args:
+      devices: list of strings naming devices in a collective group.
+
+    Returns:
+      int key uniquely identifying the set of device names.
+    """
+    parsed = [pydev.DeviceSpec.from_string(d) for d in devices]
+    # In the between-graph replicated training, different workers need to get
+    # the same device key. So we remove the task_type and task_id from the
+    # devices.
+    # TODO(yuefengz): in the in-graph replicated training, we need to include
+    # task_type and task_id.
+    names = sorted(['%s:%d' % (d.device_type, d.device_index) for d in parsed])
+    key_id = ','.join(names)
+    with _lock:
+      if key_id not in self._group_key_table:
+        new_key = self._group_key
+        self._group_key += 1
+        self._group_key_table[key_id] = new_key
+    return self._group_key_table[key_id]
+
+  def get_instance_key(self, key_id=None):
+    """Returns a new instance key for use in defining a collective op.
+
+    Args:
+      key_id: optional string. If set, key will be recorded and the same key
+        will be returned when the same key_id is provided. If not, an increasing
+        instance key will be returned.
+    """
+    if key_id:
+      with _lock:
+        if key_id not in self._instance_key_id_to_key_table:
+          self._instance_key_with_id_counter += 1
+          self._instance_key_id_to_key_table[key_id] = (
+              self._instance_key_with_id_counter)
+      return self._instance_key_id_to_key_table[key_id]
+    else:
+      v = self._get_thread_local_object().instance_key
+      self._get_thread_local_object().instance_key += 1
+      return v
+
+
+def build_collective_reduce(input_tensors,
+                            num_workers,
+                            collective_keys,
+                            reduction_op='Add',
+                            unary_op='Id'):
+  """Build a subgraph that does one full all-reduce, using the collective Op.
+
+  Args:
+    input_tensors: tensors within a single worker graph that are to be reduced
+      together; must be one per device.
+    num_workers: total number of workers with identical independent graphs that
+      will be doing this same reduction.  The reduction will actually include
+      the corresponding tensors at all these workers.
+    collective_keys: a CollectiveKeys object.
+    reduction_op: string naming the reduction op.
+    unary_op: string naming the unary final op.
+
+  Returns:
+    An array of final tensors, one per device, computed by the full reduction.
+
+  Raises:
+    ValueError: There must be at least two tensors over all the workers.
+  """
+  group_size = len(input_tensors) * num_workers
+  if group_size < 2:
+    raise ValueError('num_workers * len(input_tensors) must be 2 or greater')
+  devices = [t.device for t in input_tensors]
+  num_devices = len(devices)
+  group_key = collective_keys.get_group_key(devices)
+  instance_key = collective_keys.get_instance_key()
+  out_tensors = []
+  subdiv_offsets = [0]  # TODO(tucker): maybe support non-default subdiv spec
+  for d in range(num_devices):
+    with ops.device(devices[d]):
+      reduce_op = collective_ops.all_reduce(
+          input_tensors[d], group_size, group_key, instance_key, reduction_op,
+          unary_op, subdiv_offsets)
+      out_tensors.append(reduce_op)
+  return out_tensors
+
+
+def sum_grad_and_var_all_reduce(grad_and_vars,
+                                num_workers,
+                                alg,
+                                gpu_indices,
+                                aux_devices=None,
+                                num_shards=1):
+  """Apply all-reduce algorithm over specified gradient tensors."""
+  with ops.name_scope('allreduce'):
+    # Note that each grad_and_vars looks like the following:
+    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+    scaled_grads = [g for g, _ in grad_and_vars]
+    if alg == 'nccl':
+      summed_grads = nccl_ops.all_sum(scaled_grads)
+    elif alg == 'xring':
+      summed_grads = all_reduce.build_ring_all_reduce(
+          scaled_grads, num_workers, num_shards, gpu_indices, math_ops.add)
+    elif alg == 'nccl/xring':
+      summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards,
+                                                     math_ops.add)
+    elif alg == 'nccl/rechd':
+      summed_grads = all_reduce.build_nccl_then_recursive_hd(
+          scaled_grads, math_ops.add)
+    elif alg == 'nccl/pscpu':
+      summed_grads = all_reduce.build_nccl_then_shuffle(
+          scaled_grads, aux_devices, math_ops.add, math_ops.add_n)
+    elif alg == 'pscpu/pscpu':
+      second_gather_devices = aux_devices[:num_shards]
+      summed_grads = all_reduce.build_shuffle_then_shuffle(
+          scaled_grads, aux_devices, second_gather_devices, math_ops.add_n)
+    elif alg in ['pscpu', 'psgpu']:
+      summed_grads = all_reduce.build_shuffle_all_reduce(
+          scaled_grads, aux_devices, math_ops.add_n)
+    else:
+      raise ValueError('unsupported all_reduce alg: ', alg)
+
+  result = []
+  for (_, v), g in zip(grad_and_vars, summed_grads):
+    result.append([g, v])
+  return result
+
+
+def sum_gradients_all_reduce(dev_prefixes, replica_grads, num_workers, alg,
+                             num_shards, gpu_indices):
+  """Apply all-reduce algorithm over specified gradient tensors.
+
+  Args:
+    dev_prefixes: list of prefix strings to use to generate PS device names.
+    replica_grads: the gradients to reduce.
+    num_workers: number of worker processes across entire job.
+    alg: the all-reduce algorithm to apply.
+    num_shards: alg-specific sharding factor.
+    gpu_indices: indices of local GPUs in order usable for ring-reduce.
+
+  Returns:
+    list of reduced tensors
+  """
+  alg_contains_shuffle = any(n in alg for n in ['pscpu', 'psgpu'])
+  is_hierarchical = '/' in alg
+  if 'pscpu' in alg:
+    aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes]
+  elif 'psgpu' in alg:
+    aux_devices = [
+        prefix + '/gpu:%d' % i
+        for i in range(len(gpu_indices))
+        for prefix in dev_prefixes
+    ]
+  else:
+    aux_devices = ['/job:localhost/cpu:0']
+  # Auxiliary devices for hierarchical all-reduces.
+  aux_device_groups = group_device_names(
+      aux_devices, num_shards if alg_contains_shuffle else 1)
+  group_index = 0
+  reduced_gv_list = []
+  for grad_and_vars in zip(*replica_grads):
+    reduced_gv_list.append(
+        sum_grad_and_var_all_reduce(
+            grad_and_vars, num_workers, alg, gpu_indices, aux_devices
+            if is_hierarchical else aux_device_groups[group_index], num_shards))
+    group_index = (group_index + 1) % len(aux_device_groups)
+  new_replica_grads = [list(x) for x in zip(*reduced_gv_list)]
+  return new_replica_grads
+
+
+def extract_ranges(index_list, range_size_limit=32):
+  """Extract consecutive ranges and singles from index_list.
+
+  Args:
+    index_list: List of monotone increasing non-negative integers.
+    range_size_limit: Largest size range to return.  If a larger
+      consecutive range exists, it will be returned as multiple
+      ranges.
+
+  Returns:
+    (ranges, singles) where ranges is a list of [first, last] pairs of
+      consecutive elements in index_list, and singles is all of the
+      other elements, in original order.
+  """
+  if not index_list:
+    return [], []
+  first = index_list[0]
+  last = first
+  ranges = []
+  singles = []
+  for i in index_list[1:]:
+    if i == last + 1 and (last - first) <= range_size_limit:
+      last = i
+    else:
+      if last > first:
+        ranges.append([first, last])
+      else:
+        singles.append(first)
+      first = i
+      last = i
+  if last > first:
+    ranges.append([first, last])
+  else:
+    singles.append(first)
+  return ranges, singles
+
+
+GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
+
+
+def pack_range(key, packing, grad_vars, rng):
+  """Form the concatenation of a specified range of gradient tensors.
+
+  Args:
+    key: Value under which to store meta-data in packing that will be used
+      later to restore the grad_var list structure.
+    packing: Dict holding data describing packed ranges of small tensors.
+    grad_vars: List of (grad, var) pairs for one replica.
+    rng: A pair of integers giving the first, last indices of a consecutive
+      range of tensors to be packed.
+
+  Returns:
+    A tensor that is the concatenation of all the specified small tensors.
+  """
+  to_pack = grad_vars[rng[0]:rng[1] + 1]
+  members = []
+  variables = []
+  restore_shapes = []
+  with ops.name_scope('pack'):
+    for g, v in to_pack:
+      variables.append(v)
+      restore_shapes.append(g.shape)
+      with ops.device(g.device):
+        members.append(array_ops.reshape(g, [-1]))
+    packing[key] = GradPackTuple(
+        indices=range(rng[0], rng[1] + 1),
+        vars=variables,
+        shapes=restore_shapes)
+    with ops.device(members[0].device):
+      return array_ops.concat(members, 0)
+
+
+def unpack_grad_tuple(gv, gpt):
+  """Unpack a previously packed collection of gradient tensors.
+
+  Args:
+    gv: A (grad, var) pair to be unpacked.
+    gpt: A GradPackTuple describing the packing operation that produced gv.
+
+  Returns:
+    A list of (grad, var) pairs corresponding to the values that were
+     originally packed into gv, maybe following subsequent operations like
+     reduction.
+  """
+  elt_widths = [x.num_elements() for x in gpt.shapes]
+  with ops.device(gv[0][0].device):
+    with ops.name_scope('unpack'):
+      splits = array_ops.split(gv[0], elt_widths)
+      unpacked_gv = []
+      for idx, s in enumerate(splits):
+        unpacked_gv.append((array_ops.reshape(s, gpt.shapes[idx]),
+                            gpt.vars[idx]))
+  return unpacked_gv
+
+
+def pack_small_tensors(replica_grads, max_bytes=0, max_group=0):
+  """Concatenate small gradient tensors together for reduction.
+
+  Args:
+    replica_grads: List of lists of (gradient, variable) tuples.
+    max_bytes: Int giving max number of bytes in a tensor that
+      may be considered small.
+    max_group: Int giving max number of small tensors that may be
+      concatenated into one new tensor.
+
+  Returns:
+    new_replica_grads, packing where new_replica_grads is identical to
+      replica_grads except that all feasible small_tensors have been removed
+      from their places and concatenated into larger tensors that are
+      now in the front of the list for each replica, and packing contains
+      the data necessary to restore the replica_grads structure.
+
+  Look through the first replica for gradients of the same type (float),
+  and small size, that are all sequential.  For each such group,
+  replace by a new tensor that is a flattened concatenation.  Note
+  that the corresponding variable will be absent, which doesn't matter
+  because it isn't used during all-reduce.
+
+  Requires:
+    Every gv_list in replicas must have isomorphic structure including identical
+      tensor sizes and types.
+  """
+  small_indices = []
+  large_indices = []
+  for idx, (g, _) in enumerate(replica_grads[0]):
+    if g.dtype == dtypes.float32 and (4 * g.shape.num_elements()) <= max_bytes:
+      small_indices.append(idx)
+    else:
+      large_indices.append(idx)
+  small_ranges, small_singles = extract_ranges(
+      small_indices, range_size_limit=max_group)
+  large_indices = sorted(large_indices + small_singles)
+  num_gv = len(replica_grads[0])
+  packing = {}
+  if small_ranges:
+    new_replica_grads = []
+    for dev_idx, gv_list in enumerate(replica_grads):
+      assert len(gv_list) == num_gv
+      new_gv_list = []
+      for r in small_ranges:
+        key = '%d:%d' % (dev_idx, len(new_gv_list))
+        new_gv_list.append((pack_range(key, packing, gv_list, r),
+                            'packing_var_placeholder'))
+      for i in large_indices:
+        new_gv_list.append(gv_list[i])
+      new_replica_grads.append(new_gv_list)
+    return new_replica_grads, packing
+  else:
+    return replica_grads, None
+
+
+def unpack_small_tensors(replica_grads, packing):
+  """Undo the structure alterations to replica_grads done by pack_small_tensors.
+
+  Args:
+    replica_grads: List of List of (grad, var) tuples.
+    packing: A dict generated by pack_small_tensors describing the changes
+      it made to replica_grads.
+
+  Returns:
+    new_replica_grads: identical to replica_grads except that concatenations
+      of small tensors have been split apart and returned to their original
+      positions, paired with their original variables.
+  """
+  if not packing:
+    return replica_grads
+  new_replica_grads = []
+  num_devices = len(replica_grads)
+  num_packed = len(packing.keys()) // num_devices
+  for dev_idx, gv_list in enumerate(replica_grads):
+    gv_list = list(gv_list)
+    new_gv_list = gv_list[num_packed:]
+    for i in range(num_packed):
+      k = '%d:%d' % (dev_idx, i)
+      gpt = packing[k]
+      gv = unpack_grad_tuple(gv_list[i], gpt)
+      for gi, idx in enumerate(gpt.indices):
+        assert idx == gpt.indices[gi]
+        new_gv_list.insert(idx, gv[gi])
+    new_replica_grads.append(new_gv_list)
+  return new_replica_grads
+
+
+def aggregate_tensors_or_indexed_slices(values, accumulation_fn=math_ops.add_n):
+  """Aggregate tensors using `accumulation_fn` and IndexedSlices via concat."""
+  if any(isinstance(v, ops.IndexedSlices) for v in values):
+    return gradients_impl._AggregateIndexedSlicesGradients(values)  # pylint: disable=protected-access
+  else:
+    return accumulation_fn(values)
+
+
+def divide_by_n_tensors_or_indexed_slices(value, n):
+  if isinstance(value, ops.IndexedSlices):
+    value = gradients_impl._HandleNestedIndexedSlices(value)  # pylint: disable=protected-access
+    return ops.IndexedSlices(
+        value.values / n, value.indices, value.dense_shape)
+  else:
+    return value / n
+
+
+def copy_tensor_or_indexed_slices_to_device(value, device):
+  with ops.device(device):
+    if isinstance(value, ops.IndexedSlices):
+      copied_values = array_ops.identity(value.values)
+      copied_indices = array_ops.identity(value.indices)
+      copied_shape = array_ops.identity(value.dense_shape)
+      result = ops.IndexedSlices(copied_values, copied_indices, copied_shape)
+    else:
+      result = array_ops.identity(value)
+  return result
+
+
+def contains_indexed_slices(value):
+  """Check whether the value is `IndexedSlices` or contains `IndexedSlices`."""
+  if isinstance(value, ops.IndexedSlices):
+    return True
+  elif isinstance(value, (list, tuple)) and value:
+    return any(contains_indexed_slices(v) for v in value)
+  elif isinstance(value, value_lib.DistributedValues):
+    return contains_indexed_slices(list(value._index.values()))  # pylint: disable=protected-access
+  else:
+    return False
diff --git a/tensorflow/python/training/device_util.py b/tensorflow/python/distribute/device_util.py
similarity index 98%
rename from tensorflow/python/training/device_util.py
rename to tensorflow/python/distribute/device_util.py
index 70e1ca4b5d77e5e7529cb0d06a9ffb4657dc74fe..34474582adfa8c73c4a7bbbe130dcf6faf88ce0b 100644
--- a/tensorflow/python/training/device_util.py
+++ b/tensorflow/python/distribute/device_util.py
@@ -50,7 +50,7 @@ def canonicalize(d, default=None):
   # Fill in missing device fields using defaults.
   result = tf_device.DeviceSpec(
       replica=0, task=0, device_type="CPU", device_index=0)
-  if context.executing_eagerly():
+  if ops.executing_eagerly_outside_functions():
     result.job = "localhost"
   if default:
     result.merge_from(tf_device.DeviceSpec.from_string(default))
diff --git a/tensorflow/python/training/device_util_test.py b/tensorflow/python/distribute/device_util_test.py
similarity index 95%
rename from tensorflow/python/training/device_util_test.py
rename to tensorflow/python/distribute/device_util_test.py
index cdbb08229d2f06c2cfeeb855b32665f7c03ea969..2f0d7ed3b317f59e314148c583a8f1f69240b37b 100644
--- a/tensorflow/python/training/device_util_test.py
+++ b/tensorflow/python/distribute/device_util_test.py
@@ -18,14 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import device_util
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
-from tensorflow.python.training import device_util
 
 
 class DeviceUtilTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCurrentDeviceWithGlobalGraph(self):
     with ops.device("/cpu:0"):
       self.assertEqual(device_util.current(), "/device:CPU:0")
@@ -49,6 +51,7 @@ class DeviceUtilTest(test.TestCase):
         self.assertEqual(device_util.current(),
                          "/job:localhost/replica:0/task:0/device:CPU:0")
 
+  @test_util.run_deprecated_v1
   def testCanonicalizeWithoutDefaultDevice(self):
     self.assertEqual(
         device_util.canonicalize("/cpu:0"),
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index b9b77d4a5b1ad16c9411c0b20966a5b70c6a8967..c0f9b8a1fdfdf8bd95375f489058cadcd63c9cb9 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -245,7 +245,7 @@ class _WorkerContext(object):
     else:
       session_config = self._session_config
 
-    if not self._strategy or self._strategy.should_init:
+    if not self._strategy or self._strategy.extended.experimental_should_init:
       logging.info("Creating chief session creator with config: %r", config)
       return monitored_session.ChiefSessionCreator(
           scaffold,
@@ -261,6 +261,10 @@ class _WorkerContext(object):
           config=session_config,
           max_wait_secs=max_wait_secs)
 
+  @property
+  def session_config(self):
+    return copy.deepcopy(self._session_config)
+
   @property
   def has_barrier(self):
     """Whether the barrier is set or not."""
@@ -301,15 +305,20 @@ class _WorkerContext(object):
     """Returns number of workers in the cluster, including chief."""
     return self._num_workers
 
+  @property
+  def experimental_should_init(self):
+    """Whether to run init ops."""
+    return self._strategy.extended.experimental_should_init
+
   @property
   def should_checkpoint(self):
     """Whether to save checkpoint."""
-    return self._strategy.should_checkpoint
+    return self._strategy.extended.should_checkpoint
 
   @property
   def should_save_summary(self):
     """Whether to save summaries."""
-    return self._strategy.should_save_summary
+    return self._strategy.extended.should_save_summary
 
 
 def _run_single_worker(worker_fn,
@@ -341,7 +350,7 @@ def _run_single_worker(worker_fn,
       rpc_layer=rpc_layer,
       worker_barrier=worker_barrier)
   with context:
-    worker_fn(strategy)
+    return worker_fn(strategy)
 
 
 def _split_cluster_for_evaluator(cluster_spec, task_type):
@@ -448,6 +457,9 @@ def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
   if eval_thread:
     eval_thread.join()
 
+  # TODO(yuefengz): we probably want to return results from all workers?
+  return None
+
 
 def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
                          cluster_spec, session_config, rpc_layer):
@@ -463,7 +475,7 @@ def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
         })
     eval_thread.start()
 
-  _run_single_worker(
+  worker_result = _run_single_worker(
       worker_fn,
       strategy,
       cluster_spec,
@@ -473,6 +485,7 @@ def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
       rpc_layer=rpc_layer)
   if eval_thread:
     eval_thread.join()
+  return worker_result
 
 
 def _configure_session_config_for_std_servers(
@@ -619,10 +632,10 @@ def run_distribute_coordinator(worker_fn,
   The `strategy` object is expected to be a DistributionStrategy object which
   has implemented methods needed by distributed coordinator such as
   `configure(session_config, cluster_spec, task_type, task_id)` which configures
-  the strategy object for a specific task and `should_init` property which
-  instructs the distribute coordinator whether to run init ops for a task. The
-  distribute coordinator will make a copy of the `strategy` object, call its
-  `configure` method and pass it to `worker_fn` as an argument.
+  the strategy object for a specific task and `experimental_should_init`
+  property which instructs the distribute coordinator whether to run init ops
+  for a task. The distribute coordinator will make a copy of the `strategy`
+  object, call its `configure` method and pass it to `worker_fn` as an argument.
 
   The `worker_fn` defines the training logic and is called under a its own
   worker context which can be accessed to via `get_current_worker_context`. A
@@ -692,6 +705,10 @@ def run_distribute_coordinator(worker_fn,
   Raises:
     ValueError: if `cluster_spec` is supplied but not a dict or a ClusterDef or
       a ClusterSpec.
+
+  Returns:
+    In the client job, return the value returned by `worker_fn` if
+    it is in-graph replication; return None otherwise.
   """
   tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
   if not cluster_spec:
@@ -741,12 +758,13 @@ def run_distribute_coordinator(worker_fn,
     # The client must know the cluster but servers in the cluster don't have to
     # know the client.
     if task_type in [_TaskType.CLIENT, None]:
-      if strategy.between_graph:
-        _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
-                                  cluster_spec, session_config, rpc_layer)
+      if strategy.extended.experimental_between_graph:
+        return _run_between_graph_client(worker_fn, strategy, eval_fn,
+                                         eval_strategy, cluster_spec,
+                                         session_config, rpc_layer)
       else:
-        _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
-                             cluster_spec, session_config, rpc_layer)
+        return _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
+                                    cluster_spec, session_config, rpc_layer)
     else:
       # If not a client job, run the standard server.
       _configure_session_config_for_std_servers(strategy, eval_strategy,
@@ -786,7 +804,7 @@ def run_distribute_coordinator(worker_fn,
         environment=environment)
 
     if task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
-      if strategy.between_graph:
+      if strategy.extended.experimental_between_graph:
         # All jobs run `worker_fn` if between-graph.
         _run_single_worker(worker_fn, strategy, cluster_spec, task_type,
                            task_id, session_config, rpc_layer)
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
index b07308a1b5dafdd89d43a9fb11689c124bbff3fe..7598c105c2dd763c524e50e139fdd9984f1bd0c0 100644
--- a/tensorflow/python/distribute/distribute_coordinator_test.py
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -47,6 +47,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
+from tensorflow.python.training import session_manager
 
 
 CHIEF = distribute_coordinator._TaskType.CHIEF
@@ -78,46 +79,53 @@ def _strip_protocol(target):
     return target
 
 
-class MockStrategy(object):
+class MockExtended(object):
 
   def __init__(self,
                between_graph=False,
                should_init=None,
                should_checkpoint=None,
                should_save_summary=None):
-    self._between_graph = between_graph
-    self._should_init = should_init
-    self._should_checkpoint = should_checkpoint
-    self._should_save_summary = should_save_summary
+    self.experimental_between_graph = between_graph
+    self.experimental_should_init = should_init
+    self.should_checkpoint = should_checkpoint
+    self.should_save_summary = should_save_summary
 
-  @property
-  def between_graph(self):
-    return self._between_graph
+
+class MockStrategy(object):
+
+  def __init__(self,
+               between_graph=False,
+               should_init=None,
+               should_checkpoint=None,
+               should_save_summary=None):
+    self.extended = MockExtended(between_graph, should_init, should_checkpoint,
+                                 should_save_summary)
 
   def configure(self,
                 session_config=None,
                 cluster_spec=None,
                 task_type=None,
                 task_id=None):
-    if self._should_init is None:
+    if self.extended.experimental_should_init is None:
       if task_id == 0:
-        self._should_init = True
+        self.extended.experimental_should_init = True
       else:
-        self._should_init = False
-    if self._should_checkpoint is None:
+        self.extended.experimental_should_init = False
+    if self.extended.should_checkpoint is None:
       if task_id == 0:
-        self._should_checkpoint = True
+        self.extended.should_checkpoint = True
       else:
-        self._should_checkpoint = False
-    if self._should_save_summary is None:
+        self.extended.should_checkpoint = False
+    if self.extended.should_save_summary is None:
       if task_id == 0:
-        self._should_save_summary = True
+        self.extended.should_save_summary = True
       else:
-        self._should_save_summary = False
+        self.extended.should_save_summary = False
 
     if session_config:
       if (cluster_spec and task_type and task_id is not None and
-          self._between_graph):
+          self.extended.experimental_between_graph):
         session_config.intra_op_parallelism_threads += 1
         if task_type in ["chief", "worker"]:
           session_config.device_filters.extend(
@@ -126,18 +134,6 @@ class MockStrategy(object):
         session_config.inter_op_parallelism_threads += 1
         session_config.device_filters.append("/job:somejob")
 
-  @property
-  def should_init(self):
-    return self._should_init
-
-  @property
-  def should_checkpoint(self):
-    return self._should_checkpoint
-
-  @property
-  def should_save_summary(self):
-    return self._should_save_summary
-
 
 class MockServer(object):
 
@@ -317,9 +313,9 @@ class DistributeCoordinatorTestBase(test.TestCase):
     with ops.device("/job:ps/task:0"):
       # TODO(yuefengz): investigate why not using resource variable will make
       # the test flaky.
-      x = variable_scope.get_variable("x", initializer=10.0, use_resource=True)
+      x = variable_scope.get_variable("xx", initializer=10.0, use_resource=True)
     with ops.device("/job:ps/task:1"):
-      y = variable_scope.get_variable("y", initializer=20.0, use_resource=True)
+      y = variable_scope.get_variable("yy", initializer=20.0, use_resource=True)
 
     x_add = x.assign_add(2.0)
     y_sub = y.assign_sub(2.0)
@@ -372,9 +368,12 @@ class DistributeCoordinatorTestBase(test.TestCase):
     context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
 
-    self.assertEqual(context._strategy.should_init, strategy.should_init)
-    self.assertEqual(context.should_checkpoint, strategy.should_checkpoint)
-    self.assertEqual(context.should_save_summary, strategy.should_save_summary)
+    self.assertEqual(context._strategy.extended.experimental_should_init,
+                     strategy.extended.experimental_should_init)
+    self.assertEqual(context.should_checkpoint,
+                     strategy.extended.should_checkpoint)
+    self.assertEqual(context.should_save_summary,
+                     strategy.extended.should_save_summary)
 
     task_type = str(context.task_type)
     task_id = context.task_id or 0
@@ -384,7 +383,8 @@ class DistributeCoordinatorTestBase(test.TestCase):
       while len(self._strategy_property[task_type]) <= task_id:
         self._strategy_property[task_type].append(None)
       self._strategy_property[task_type][task_id] = (
-          context._strategy.should_init, context.should_checkpoint,
+          context._strategy.extended.experimental_should_init,
+          context.should_checkpoint,
           context.should_save_summary)
 
   def _run_mock_std_server(self,
@@ -427,6 +427,7 @@ class DistributeCoordinatorTestStandaloneMode(DistributeCoordinatorTestBase):
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
 
+  @test_util.run_v1_only("b/120545219")
   def testBetweenGraphWithMonitoredSession(self):
     """Test monitored session in standalone client mode."""
     distribute_coordinator.run_distribute_coordinator(
@@ -600,6 +601,7 @@ class DistributeCoordinatorTestInpendentWorkerMode(
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
 
+  @test_util.run_v1_only("b/120545219")
   def testBetweenGraphWithMonitoredSession(self):
     cluster_spec = self._create_cluster_spec(
         num_workers=NUM_WORKERS, num_ps=NUM_PS)
@@ -930,4 +932,14 @@ class RunStandardTensorflowServerTest(test.TestCase):
 if __name__ == "__main__":
   # TODO(yuefengz): find a smart way to terminite std server threads.
   with test.mock.patch.object(sys, "exit", os._exit):
+    # Reduce `recovery_wait_secs` from 30 seconds so the test completes quickly.
+    orig_init = session_manager.SessionManager.__init__
+
+    def new_init(*args, **kwargs):
+      kwargs.pop("recovery_wait_secs", None)
+      kwargs["recovery_wait_secs"] = 0.5
+      orig_init(*args, **kwargs)
+
+    session_manager.SessionManager.__init__ = new_init
+
     test.main()
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..87bf510ec549f6bf1ccabfba438d2c64fd5a88d9
--- /dev/null
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -0,0 +1,1682 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library for running a computation across multiple devices."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import threading
+import weakref
+import enum
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.eager import context as eager_context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+from tensorflow.tools.docs import doc_controls
+
+
+# ------------------------------------------------------------------------------
+# Context tracking whether in a strategy.update() or .update_non_slot() call.
+
+
+_update_device = threading.local()
+
+
+def get_update_device():
+  """Get the current device if in a `tf.distribute.Strategy.update()` call."""
+  try:
+    return _update_device.current
+  except AttributeError:
+    return None
+
+
+class UpdateContext(object):
+  """Context manager when you are in `update()` or `update_non_slot()`."""
+
+  def __init__(self, device):
+    self._device = device
+    self._old_device = None
+
+  def __enter__(self):
+    self._old_device = get_update_device()
+    _update_device.current = self._device
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    del exception_type, exception_value, traceback
+    _update_device.current = self._old_device
+
+
+# ------------------------------------------------------------------------------
+# Public utility functions.
+
+
+@tf_export("distribute.get_loss_reduction")
+def get_loss_reduction():
+  """`tf.distribute.ReduceOp` corresponding to the last loss reduction."""
+  loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
+  if (loss_reduction == losses_impl.Reduction.SUM or
+      loss_reduction == losses_impl.ReductionV2.SUM):
+    return reduce_util.ReduceOp.SUM
+  return reduce_util.ReduceOp.MEAN
+
+
+# ------------------------------------------------------------------------------
+# Internal API for validating the current thread mode
+
+
+def _require_cross_replica_context_extended(extended):
+  """Verify in cross-replica context."""
+  context = _get_per_thread_mode()
+  cross_replica = context.cross_replica_context
+  if cross_replica is not None and cross_replica.extended is extended:
+    return
+  strategy = extended._container_strategy()  # pylint: disable=protected-access
+  # We have an error to report, figure out the right message.
+  if context.distribution_strategy is not strategy:
+    _wrong_strategy_scope(strategy, context)
+  assert cross_replica is None
+  raise RuntimeError("Method requires being in cross-replica context, use "
+                     "get_replica_context().merge_call()")
+
+
+def _wrong_strategy_scope(strategy, context):
+  # Figure out the right error message.
+  if not distribution_strategy_context.has_distribution_strategy():
+    raise RuntimeError(
+        'Need to be inside "with strategy.scope()" for %s' %
+        (strategy,))
+  else:
+    raise RuntimeError(
+        "Mixing different tf.distribute.Strategy objects: %s is not %s" %
+        (context.distribution_strategy, strategy))
+
+
+def require_replica_context(replica_ctx):
+  """Verify in `replica_ctx` replica context."""
+  context = _get_per_thread_mode()
+  if context.replica_context is replica_ctx: return
+  # We have an error to report, figure out the right message.
+  if context.replica_context is None:
+    raise RuntimeError("Need to be inside `call_for_each_replica()`")
+  if context.distribution_strategy is replica_ctx.distribution_strategy:
+    # Two different ReplicaContexts with the same tf.distribute.Strategy.
+    raise RuntimeError("Mismatching ReplicaContext.")
+  raise RuntimeError(
+      "Mismatching tf.distribute.Strategy objects: %s is not %s." %
+      (context.distribution_strategy, replica_ctx.distribution_strategy))
+
+
+def _require_distribution_strategy_scope_strategy(strategy):
+  """Verify in a `strategy.scope()` in this thread."""
+  context = _get_per_thread_mode()
+  if context.distribution_strategy is strategy: return
+  _wrong_strategy_scope(strategy, context)
+
+
+def _require_distribution_strategy_scope_extended(extended):
+  """Verify in a `distribution_strategy.scope()` in this thread."""
+  context = _get_per_thread_mode()
+  if context.distribution_strategy.extended is extended: return
+  # Report error.
+  strategy = extended._container_strategy()  # pylint: disable=protected-access
+  _wrong_strategy_scope(strategy, context)
+
+
+# ------------------------------------------------------------------------------
+# Internal context managers used to implement the DistributionStrategy
+# base class
+
+
+class _CurrentDistributionContext(object):
+  """Context manager setting the current `tf.distribute.Strategy`.
+
+  Also: overrides the variable creator and optionally the current device.
+  """
+
+  def __init__(self,
+               strategy,
+               var_creator_scope,
+               var_scope=None,
+               default_device=None):
+    self._context = distribution_strategy_context._CrossReplicaThreadMode(  # pylint: disable=protected-access
+        strategy)
+    self._var_creator_scope = var_creator_scope
+    self._var_scope = var_scope
+    if default_device:
+      self._device_scope = ops.device(default_device)
+    else:
+      self._device_scope = None
+
+  def __enter__(self):
+    _push_per_thread_mode(self._context)
+    if self._var_scope:
+      self._var_scope.__enter__()
+    self._var_creator_scope.__enter__()
+    if self._device_scope:
+      self._device_scope.__enter__()
+    return self._context.distribution_strategy
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    if self._device_scope:
+      self._device_scope.__exit__(exception_type, exception_value, traceback)
+    self._var_creator_scope.__exit__(exception_type, exception_value, traceback)
+    if self._var_scope:
+      self._var_scope.__exit__(exception_type, exception_value, traceback)
+    _pop_per_thread_mode()
+
+
+class _SameScopeAgainContext(object):
+  """Trivial context manager when you are already in `scope()`."""
+
+  def __init__(self, strategy):
+    self._distribution_strategy = strategy
+
+  def __enter__(self):
+    return self._distribution_strategy
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    del exception_type, exception_value, traceback
+
+
+# TODO(yuefengz): add more replication modes.
+@tf_export("distribute.InputReplicationMode")
+class InputReplicationMode(enum.Enum):
+  """Replication mode for input function."""
+
+  # The input function will be called on each worker independently, creating as
+  # many input pipelines as number of workers. Replicas will dequeue from the
+  # local Dataset on their worker. Distribution Strategy doesn't manage any
+  # state sharing between such separate input pipelines.
+  PER_WORKER = "PER_WORKER"
+
+
+@tf_export("distribute.InputContext")
+class InputContext(object):
+  """A class wrapping information needed by an input function.
+
+  This is a context class that is passed to the user's input fn and contains
+  information about the compute replicas and input pipelines. The number of
+  compute replicas (in sync training) helps compute per input pipeline batch
+  size from the desired global batch size. Input pipeline information can be
+  used to return a different subset of the input in each input pipeline (for
+  e.g. shard the input pipeline, use a different input source etc).
+  """
+
+  def __init__(self,
+               num_input_pipelines=1,
+               input_pipeline_id=0,
+               num_replicas_in_sync=1):
+    """Initializes an InputContext object.
+
+    Args:
+      num_input_pipelines: the number of input pipelines in a cluster.
+      input_pipeline_id: the current input pipeline id, should be an int in
+        [0,`num_input_pipelines`).
+      num_replicas_in_sync: the number of replicas that are in sync.
+    """
+    self._num_input_pipelines = num_input_pipelines
+    self._input_pipeline_id = input_pipeline_id
+    self._num_replicas_in_sync = num_replicas_in_sync
+
+  @property
+  def num_replicas_in_sync(self):
+    """Returns the number of compute replicas in sync."""
+    return self._num_replicas_in_sync
+
+  @property
+  def input_pipeline_id(self):
+    """Returns the input pipeline ID."""
+    return self._input_pipeline_id
+
+  @property
+  def num_input_pipelines(self):
+    """Returns the number of input pipelines."""
+    return self._num_input_pipelines
+
+  def get_per_replica_batch_size(self, global_batch_size):
+    """Returns the per-replica batch size.
+
+    Args:
+      global_batch_size: the global batch size which should be divisible by
+        `num_replicas_in_sync`.
+
+    Returns:
+      the per-replica batch size.
+
+    Raises:
+      ValueError: if `global_batch_size` not divisible by
+        `num_replicas_in_sync`.
+    """
+    if global_batch_size % self._num_replicas_in_sync != 0:
+      raise ValueError("The `global_batch_size` %r is not divisible by "
+                       "`num_replicas_in_sync` %r " %
+                       (global_batch_size, self._num_replicas_in_sync))
+    return global_batch_size // self._num_replicas_in_sync
+
+
+# ------------------------------------------------------------------------------
+# Base classes for all distribution strategies.
+
+
+@tf_export("distribute.Strategy")
+class DistributionStrategy(object):
+  """A list of devices with a state & compute distribution policy.
+
+  See [tensorflow/contrib/distribute/README.md](
+  https://www.tensorflow.org/code/tensorflow/contrib/distribute/README.md)
+  for overview and examples.
+  """
+
+  # TODO(josh11b): Raise an exception if variable partitioning requested before
+  #   we add support.
+  # TODO(josh11b): Also `parameter_device_index` property?
+  # TODO(josh11b): `map()`
+  # TODO(josh11b): ClusterSpec/ClusterResolver
+  # TODO(josh11b): Partitioned computations, state; sharding
+  # TODO(josh11b): Model parallelism: "replicas" with multiple devices; shuffling
+  # TODO(josh11b): List of replicas with their worker and parameter devices
+  #   (where the parameter devices may overlap in the ps case).
+
+  def __init__(self, extended):
+    self._extended = extended
+
+  @property
+  def extended(self):
+    """`tf.distribute.StrategyExtended` with additional methods."""
+    return self._extended
+
+  def scope(self):
+    """Returns a context manager selecting this Strategy as current.
+
+    Inside a `with strategy.scope():` code block, this thread
+    will use a variable creator set by `strategy`, and will
+    enter its "cross-replica context".
+
+    Returns:
+      A context manager.
+    """
+    return self._extended._scope(self)  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def read_var(self, v):
+    """DEPRECATED: use extended.read_var() instead."""
+    return self._extended.read_var(v)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def colocate_vars_with(self, colocate_with_variable):
+    """DEPRECATED: use extended.colocate_vars_with() instead."""
+    return self._extended.colocate_vars_with(colocate_with_variable)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED
+  def distribute_dataset(self, dataset_fn):
+    """Return a `dataset` split across all replicas.  DEPRECATED.
+
+    DEPRECATED: Please use `make_dataset_iterator` or
+    `make_input_fn_iterator` instead.
+
+    Suitable for providing input to `extended.call_for_each_replica()` by
+    creating an iterator:
+
+    ```
+    def dataset_fn():
+      return tf.data.Dataset.from_tensors([[1.]]).repeat()
+
+    with strategy.scope():
+      distributed_dataset = strategy.distribute_dataset(dataset_fn)
+      iterator = distributed_dataset.make_initializable_iterator()
+      replica_results = strategy.extended.call_for_each_replica(
+          replica_fn, args=(iterator.get_next(),))
+    ```
+
+    Args:
+      dataset_fn: A function that returns a `tf.data.Dataset`.
+
+    Returns:
+      A `PerReplicaDataset` that will produce data for each replica.
+    """
+    return self._extended._distribute_dataset(dataset_fn)  # pylint: disable=protected-access
+
+  def make_dataset_iterator(self, dataset):
+    """Makes an iterator for input provided via input_dataset.
+
+    Data from the given dataset will be distributed evenly across all the
+    compute replicas. We will assume that the input dataset is batched by the
+    global batch size. With this assumption, we will make a best effort to
+    divide each batch across all the replicas (one or more workers).
+    If this effort fails, an error will be thrown, and the user should instead
+    use `make_input_fn_iterator` which provides more control to the user, and
+    does not try to divide a batch across replicas.
+
+    The user could also use `make_input_fn_iterator` if they want to
+    customize which input is fed to which replica/worker etc.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be distributed evenly across all
+        replicas.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    return self._extended._make_dataset_iterator(dataset)  # pylint: disable=protected-access
+
+  def make_input_fn_iterator(self,
+                             input_fn,
+                             replication_mode=InputReplicationMode.PER_WORKER):
+    """Returns an iterator split across replicas created from an input function.
+
+    The `input_fn` should take an `tf.distribute.InputContext` object where
+    information about input sharding can be accessed:
+
+    ```
+    def input_fn(input_context):
+      d = tf.data.Dataset.from_tensors([[1.]]).repeat()
+      return d.shard(input_context.num_input_pipelines,
+                     input_context.input_pipeline_id)
+    with strategy.scope():
+      iterator = strategy.make_input_fn_iterator(
+          input_fn)
+      replica_results = strategy.extended.call_for_each_replica(
+          replica_fn, iterator.get_next())
+    ```
+
+    Args:
+      input_fn: A function that returns a `tf.data.Dataset`. This function is
+        expected to take an `tf.distribute.InputContext` object.
+      replication_mode: an enum value of `tf.distribute.InputReplicationMode`.
+        Only `PER_WORKER` is supported currently.
+
+    Returns:
+      An iterator object that can be initialized and fetched next element.
+    """
+    if replication_mode != InputReplicationMode.PER_WORKER:
+      raise ValueError(
+          "Input replication mode not supported: %r" % replication_mode)
+    return self.extended._make_input_fn_iterator(  # pylint: disable=protected-access
+        input_fn, replication_mode=replication_mode)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def broadcast(self, tensor, destinations=None):
+    """DEPRECATED: use extended.broadcast_to() instead."""
+    return self._extended.broadcast_to(tensor, destinations)
+
+  @doc_controls.do_not_generate_docs  # Use experimental_initialize() instead.
+  def initialize(self):
+    """DEPRECATED: Use `experimental_initialize()` instead."""
+    return self._extended._initialize()  # pylint: disable=protected-access
+
+  def experimental_initialize(self):
+    """Any initialization to be done before running any computations.
+
+    In eager mode, it executes any initialization as a side effect.
+    In graph mode, it creates the initialization ops and returns them.
+
+    For example, TPU initialize_system ops.
+
+    Returns:
+      A list of ops to execute.
+    """
+    return self._extended._initialize()  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # Use experimental_finalize() instead.
+  def finalize(self):
+    """DEPRECATED: Use `experimental_finalize()` instead."""
+    return self._extended._finalize()  # pylint: disable=protected-access
+
+  def experimental_finalize(self):
+    """Any final actions to be done at the end of all computations.
+
+    In eager mode, it executes any finalize actions as a side effect.
+    In graph mode, it creates the finalize ops and returns them.
+
+    For example, TPU shutdown ops.
+
+    Returns:
+      A list of ops to execute.
+    """
+    return self._extended._finalize()  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def run_steps_on_dataset(self, fn, iterator, iterations=1,
+                           initial_loop_values=None):
+    """DEPRECATED: use extended.experimental_run_steps_on_iterator() instead."""
+    return self._extended.experimental_run_steps_on_iterator(
+        fn, iterator, iterations, initial_loop_values)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def call_for_each_replica(self, fn, *args, **kwargs):
+    """DEPRECATED: use extended.call_for_each_replica() instead."""
+    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
+    # allow transition.
+    a = kwargs.pop("args", None)
+    if a is not None:
+      if args:
+        raise ValueError(
+            "Can't pass *args and args=... to call_for_each_replica")
+      args = a
+    k = kwargs.pop("kwargs", None)
+    if k is not None:
+      if kwargs:
+        raise ValueError(
+            "Can't pass **kwargs and kwargs=... to call_for_each_replica")
+      kwargs = k
+    kwargs.pop("run_concurrently", None)  # Ignore old option.
+    return self._extended.call_for_each_replica(fn, args, kwargs)
+
+  def reduce(self, reduce_op, value):
+    """Reduce `value` across replicas.
+
+    Args:
+      reduce_op: A `tf.distribute.ReduceOp` value specifying how values should
+        be combined.
+      value: A "per replica" value to be combined into a single tensor.
+
+    Returns:
+      A `Tensor`.
+    """
+    _require_cross_replica_context_extended(self._extended)
+    return self._extended._reduce(reduce_op, value)  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def batch_reduce(self, aggregation, value_destination_pairs):
+    """DEPRECATED: use extended.batch_reduce_to() instead."""
+    return self._extended.batch_reduce_to(aggregation, value_destination_pairs)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def update(self, var, fn, *args, **kwargs):
+    """DEPRECATED: use extended.update() instead."""
+    group = kwargs.pop("group", True)
+    # We temporarily support "grouped" in addition to "group" for backward-
+    # compatibility.
+    group = kwargs.pop("grouped", True) and group
+    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
+    # allow transition.
+    a = kwargs.pop("args", None)
+    if a is not None:
+      if args:
+        raise ValueError(
+            "Can't pass *args and args=... to update")
+      args = a
+    k = kwargs.pop("kwargs", None)
+    if k is not None:
+      if kwargs:
+        raise ValueError(
+            "Can't pass **kwargs and kwargs=... to update")
+      kwargs = k
+    return self._extended.update(var, fn, args, kwargs, group)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    """DEPRECATED: use extended.update_non_slot() instead."""
+    group = kwargs.pop("group", True)
+    # We temporarily support "grouped" in addition to "group" for backward-
+    # compatibility.
+    group = kwargs.pop("grouped", True) and group
+    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
+    # allow transition.
+    a = kwargs.pop("args", None)
+    if a is not None:
+      if args:
+        raise ValueError(
+            "Can't pass *args and args=... to update_non_slot")
+      args = a
+    k = kwargs.pop("kwargs", None)
+    if k is not None:
+      if kwargs:
+        raise ValueError(
+            "Can't pass **kwargs and kwargs=... to update_non_slot")
+      kwargs = k
+    return self._extended.update_non_slot(
+        colocate_with, fn, args, kwargs, group)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
+  def unwrap(self, value):
+    """Returns the list of all per-replica values contained in `value`.
+
+    Args:
+      value: A value returned by `extended.call_for_each_replica()` or a
+        variable created in `scope`.
+
+    Returns:
+      A tuple of values contained in `value`. If `value` represents a single
+      value, this returns `(value,).`
+    """
+    return self._extended._unwrap(value)  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def value_container(self, value):
+    """DEPRECATED: use extended.value_container() instead."""
+    return self._extended.value_container(value)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
+  def group(self, value, name=None):
+    """Shortcut for `tf.group(self.unwrap(value))`."""
+    return self._extended._group(value, name)  # pylint: disable=protected-access
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def require_static_shapes(self):
+    """DEPRECATED: use extended.require_static_shapes instead."""
+    return self._extended.experimental_require_static_shapes
+
+  @property
+  def num_replicas_in_sync(self):
+    """Returns number of replicas over which gradients are aggregated."""
+    return self._extended._num_replicas_in_sync  # pylint: disable=protected-access
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def worker_devices(self):
+    """DEPRECATED: use extended.worker_devices instead."""
+    return self._extended.worker_devices
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def parameter_devices(self):
+    """DEPRECATED: use extended.parameter_devices instead."""
+    return self._extended.parameter_devices
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def non_slot_devices(self, var_list):
+    """DEPRECATED: use extended.non_slot_devices instead."""
+    return self._extended.non_slot_devices(var_list)
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def between_graph(self):
+    """DEPRECATED: use extended.experimental_between_graph instead."""
+    return self._extended.experimental_between_graph
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, being replaced by a new API.
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    # pylint: disable=g-doc-return-or-yield,g-doc-args
+    """DEPRECATED: use `update_config_proto` instead.
+
+    Configures the strategy class.
+
+    DEPRECATED: This method's functionality has been split into the strategy
+    constructor and `update_config_proto`. In the future, we will allow passing
+    cluster and config_proto to the constructor to configure the strategy. And
+    `update_config_proto` can be used to update the config_proto based on the
+    specific strategy.
+    """
+    return self._extended._configure(  # pylint: disable=protected-access
+        session_config, cluster_spec, task_type, task_id)
+
+  def update_config_proto(self, config_proto):
+    """Returns a copy of `config_proto` modified for use with this strategy.
+
+    The updated config has something needed to run a strategy, e.g.
+    configuration to run collective ops, or device filters to improve
+    distributed training performance.
+
+    Args:
+      config_proto: a `tf.ConfigProto` object.
+
+    Returns:
+      The updated copy of the `config_proto`.
+    """
+    return self._extended._update_config_proto(config_proto)  # pylint: disable=protected-access
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def should_init(self):
+    """DEPRECATED: use extended.should_init instead."""
+    return self._extended.experimental_should_init
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def should_checkpoint(self):
+    """DEPRECATED: use extended.should_checkpoint instead."""
+    return self._extended.should_checkpoint
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def should_save_summary(self):
+    """DEPRECATED: use extended.should_save_summary instead."""
+    return self._extended.should_save_summary
+
+  def __deepcopy__(self, memo):
+    # First do a regular deepcopy of `self`.
+    cls = self.__class__
+    result = cls.__new__(cls)
+    memo[id(self)] = result
+    for k, v in self.__dict__.items():
+      setattr(result, k, copy.deepcopy(v, memo))
+    # One little fix-up: we want `result._extended` to reference `result`
+    # instead of `self`.
+    result._extended._container_strategy_weakref = weakref.ref(result)  # pylint: disable=protected-access
+    return result
+
+  def __copy__(self):
+    raise RuntimeError("Must only deepcopy DistributionStrategy.")
+
+
+@tf_export("distribute.StrategyExtended")
+class DistributionStrategyExtended(object):
+  """Additional APIs for algorithms that need to be distribution-aware.
+
+  The intent is that you can write an algorithm in a stylized way and
+  it will be usable with a variety of different
+  `tf.distribute.Strategy`
+  implementations. Each descendant will implement a different strategy
+  for distributing the algorithm across multiple devices/machines.
+  Furthermore, these changes can be hidden inside the specific layers
+  and other library classes that need special treatment to run in a
+  distributed setting, so that most users' model definition code can
+  run unchanged. The `tf.distribute.Strategy` API works the same way
+  with eager and graph execution.
+
+  First let's introduce a few high-level concepts:
+
+  * _Data parallelism_ is where we run multiple copies of the model
+    on different slices of the input data. This is in contrast to
+    _model parallelism_ where we divide up a single copy of a model
+    across multiple devices.
+    Note: we only support data parallelism for now, but
+    hope to add support for model parallelism in the future.
+  * A _replica_ is one copy of the model, running on one slice of the
+    input data.
+  * _Synchronous_, or more commonly _sync_, training is where the
+    updates from each replica are aggregated together before updating
+    the model variables. This is in contrast to _asynchronous_, or
+    _async_ training, where each replica updates the model variables
+    independently.
+  * Furthermore you might run your computation on multiple devices
+    on one machine (or "host"), or on multiple machines/hosts.
+    If you are running on multiple machines, you might have a
+    single master host that drives computation across all of them,
+    or you might have multiple clients driving the computation
+    asynchronously.
+
+  To distribute an algorithm, we might use some of these ingredients:
+
+  * Parameter servers: These are hosts that hold a single copy of
+    parameters/variables. All replicas that want to operate on a variable
+    retrieve it at the beginning of a step and send an update to be
+    applied at the end of the step. Can support either sync or async
+    training.
+  * Mirrored variables: These are variables that are copied to multiple
+    devices, where we keep the copies in sync by applying the same
+    updates to every copy. Normally would only be used with sync training.
+  * Reductions and Allreduce: A _reduction_ is some method of
+    aggregating multiple values into one value, like "sum" or
+    "mean". If doing sync training, we will perform a reduction on the
+    gradients to a parameter from all replicas before applying the
+    update. Allreduce is an algorithm for performing a reduction on
+    values from multiple devices and making the result available on
+    all of those devices.
+  * In the future we will have support for TensorFlow's partitioned
+    variables, where a single variable is split across multiple
+    devices.
+
+  We have then a few approaches we want to support:
+
+  * Code written (as if) with no knowledge of class `tf.distribute.Strategy`.
+    This code should work as before, even if some of the layers, etc.
+    used by that code are written to be distribution-aware. This is done
+    by having a default `tf.distribute.Strategy` that gives ordinary behavior,
+    and by default being in a single replica context.
+  * Ordinary model code that you want to run using a specific
+    `tf.distribute.Strategy`. This can be as simple as:
+
+    ```
+    with my_strategy.scope():
+      iterator = my_strategy.make_dataset_iterator(dataset)
+      session.run(iterator.initialize())
+      replica_train_ops = my_strategy.extended.call_for_each_replica(
+          replica_fn, args=(iterator.get_next(),))
+      train_op = my_strategy.group(replica_train_ops)
+    ```
+
+    This takes an ordinary `dataset` and `replica_fn` and runs it
+    distributed using a particular `tf.distribute.Strategy` in
+    `my_strategy`. Any variables created in `replica_fn` are created
+    using `my_strategy`'s policy, and library functions called by
+    `replica_fn` can use the `get_replica_context()` API to get enhanced
+    behavior in this case.
+
+  * If you want to write a distributed algorithm, you may use any of
+    the `tf.distribute.Strategy` APIs inside a
+    `with my_strategy.scope():` block of code.
+
+  Lower-level concepts:
+
+  * Wrapped values: In order to represent values parallel across devices
+    (either replicas or the devices associated with a particular value), we
+    wrap them in a "PerReplica" or "Mirrored" object that contains a map
+    from device to values. "PerReplica" is used when the value may be
+    different across replicas, and "Mirrored" when the value are the same.
+  * Unwrapping and merging: Consider calling a function `fn` on multiple
+    replicas, like `extended.call_for_each_replica(fn, args=[w])` with an
+    argument `w` that is a wrapped value. This means `w` will have a map taking
+    replica device `d0` to `w0`, replica device `d1` to `w1`,
+    etc. `extended.call_for_each_replica()` unwraps `w` before calling `fn`, so
+    it calls `fn(w0)` on `d0`, `fn(w1)` on `d1`, etc.  It then merges the return
+    values from `fn()`, which can possibly result in wrapped values. For
+    example, let's say `fn()` returns a tuple with three components: `(x, a,
+    v0)` from replica 0, `(x, b, v1)` on replica 1, etc. If the first component
+    is the same object `x` from every replica, then the first component of the
+    merged result will also be `x`. If the second component is different (`a`,
+    `b`, ...)  from each replica, then the merged value will have a wrapped map
+    from replica device to the different values. If the third component is the
+    members of a mirrored variable (`v` maps `d0` to `v0`, `d1` to `v1`, etc.),
+    then the merged result will be that mirrored variable (`v`).
+  * Replica context vs. Cross-replica context: _replica context_ is when we
+    are in some function that is being called once for each replica.
+    Otherwise we are in cross-replica context, which is useful for
+    calling `tf.distribute.Strategy` methods which operate across the
+    replicas (like `reduce_to()`). By default you start in a replica context
+    (the default "single replica context") and then some methods can
+    switch you back and forth, as described below.
+  * Worker devices vs. parameter devices: Most replica computations will
+    happen on worker devices. Since we don't yet support model
+    parallelism, there will be one worker device per replica. When using
+    parameter servers (see above), the set of devices holding
+    variables may be different, otherwise the parameter devices might
+    match the worker devices.
+  * Non-slot devices are some subset of the parameter devices where we
+    put all the non-slot variables. We need to ensure that all
+    non-slot variables are allocated on the same device, or mirrored
+    across the same set of devices. If you have some variable you want
+    to colocate all the non-slot variables with, you can use
+    `colocate_vars_with()` to get the remaining non-slot variables on
+    the same device.  Otherwise you can use `non_slot_devices()` to
+    pick a consistent set of devices to pass to both
+    `colocate_vars_with()` and `update_non_slot()`.
+
+  When using a `tf.distribute.Strategy`, we have a new type dimension
+  called _locality_ that says what values are compatible with which
+  APIs:
+
+  * T: different value for each replica (e.g. a PerReplica-wrapped value).
+  * M: value is "mirrored" across replicas, i.e. there are copies with the
+    same value on each replica (e.g. a Mirrored-wrapped value).
+  * V(`v`): value is "mirrored" across all the devices which have a
+    copy of variable `v` (also a Mirrored-wrapped value, but over
+    parameter devices instead of worker devices).
+  * N: value is "mirrored" across all the "non-slot" devices
+
+  Rules for methods with respect to locality and single-replica vs.
+  cross-replica context:
+
+  * `with d.scope()`: default single-replica context -> cross-replica context
+    for `d`
+  * `with d.extended.colocate_vars_with(v)`: in replica/cross-replica context,
+    variables will be created with locality V(`v`). That is, if we write
+    `with d.extended.colocate_vars_with(v1): v2 = tf.get_variable(...)`,
+    then `v2` will have locality V(`v1`), i.e. locality V(`v2`) will equal
+    V(`v1`).
+  * `with d.extended.colocate_vars_with(d.extended.non_slot_devices(...))`: in
+    replica/cross-replica context, variables will be created with locality N
+  * `v = tf.get_variable(...)`: in replica/cross-replica context, creates
+    a variable (which by definition will have locality V(`v`), though
+    will match another locality if inside a `colocate_vars_with`
+    scope).
+  * `d.make_dataset_iterator(dataset)` (or the deprecated
+    `d.distribute_dataset(dataset).make_one_shot_iterator()`): in cross-replica
+    context, produces an iterator with locality T
+  * `d.extended.broadcast_to(t)`: in cross-replica context, produces a value
+    with locality M
+  * `d.extended.broadcast_to(t, v)`: in cross-replica context, produces a value
+    with locality V(`v`)
+  * `d.extended.call_for_each_replica(fn, ...)`: in cross-replica context, runs
+    `fn()` in a replica context (and so may call `get_replica_context()` and
+    use its API, including `merge_call()` to get back to cross-replica
+    context), once for each replica. May use values with locality T or
+    M, and any variable.
+  * `d.extended.reduce_to(m, t, t)`: in cross-replica context, accepts t with
+    locality T and produces a value with locality M.
+  * `d.extended.reduce_to(m, t, v)`: in cross-replica context, accepts t with
+    locality T and produces a value with locality V(`v`).
+  * `d.extended.batch_reduce_to(m, [(t, v)]): see `d.extended.reduce_to()`
+  * `d.extended.update(v, fn, ...)`: in cross-replica context, runs `fn()` once
+    for each device `v` is copied to, all inputs should have locality
+    V(`v`), output will have locality V(`v`) as well.
+  * `d.extended.update_non_slot(d.extended.non_slot_devices(), fn)`: in
+    cross-replica context, like `d.extended.update()` except with locality N.
+  * `d.extended.read_var(v)`: Gets the (read-only) value of the variable `v` (on
+    the device determined by the current device scope), aggregating
+    across replicas for replica-local variables. Frequently, this will be
+    done automatically when using `v` in an expression or fetching it in
+    a cross-replica context, but this function can be used to force that
+    conversion happens at a particular point in time (for example, to
+    add the result of the conversion to a graph collection).
+
+  The standard pattern for updating variables is to:
+
+  1. Create an input iterator with `d.make_dataset_iterator()`.
+  2. Define each replica `d.extended.call_for_each_replica()` up to the point of
+     getting a list of gradient, variable pairs.
+  3. Call `d.extended.reduce_to(VariableAggregation.SUM, t, v)` or
+     `d.extended.batch_reduce_to()` to sum the gradients (with locality T)
+     into values with locality V(`v`).
+  4. Call `d.extended.update(v)` for each variable to update its value.
+
+  Steps 3 and 4 are done automatically by class `Optimizer` if you call
+  its `apply_gradients` method in a replica context. Otherwise you can
+  manually call its `_distributed_apply` method in a cross-replica context.
+
+  Another thing you might want to do in the middle of your replica function is
+  an all-reduce of some intermediate value, using `d.extended.reduce_to()` or
+  `d.extended.batch_reduce_to()`. You simply provide the same tensor as the
+  input and destination.
+
+  Layers should expect to be called in a replica context, and can use
+  the `tf.distribute.get_replica_context` function to get a
+  `tf.distribute.ReplicaContext` object. The
+  `ReplicaContext` object has a `merge_call()` method for entering
+  cross-replica context where you can use `reduce_to()` (or
+  `batch_reduce_to()`) and then optionally `update()` to update state.
+
+  You may use this API whether or not a `tf.distribute.Strategy` is
+  being used, since there is a default implementation of
+  `ReplicaContext` and `tf.distribute.Strategy`.
+
+  NOTE for new `tf.distribute.Strategy` implementations: Please put all logic
+  in a subclass of `tf.distribute.StrategyExtended`. The only code needed for
+  the `tf.distribute.Strategy` subclass is for instantiating your subclass of
+  `tf.distribute.StrategyExtended` in the `__init__` method.
+  """
+
+  def __init__(self, container_strategy):
+    self._container_strategy_weakref = weakref.ref(container_strategy)
+    self._default_device = None
+    # This property is used to determine if we should set drop_remainder=True
+    # when creating Datasets from numpy array inputs.
+    self._require_static_shapes = False
+
+  def _container_strategy(self):
+    """Get the containing `DistributionStrategy`.
+
+    This should not generally be needed except when creating a new
+    `ReplicaContext` and to validate that the caller is in the correct
+    `scope()`.
+
+    Returns:
+      The `DistributionStrategy` such that `strategy.extended` is `self`.
+    """
+    container_strategy = self._container_strategy_weakref()
+    assert container_strategy is not None
+    return container_strategy
+
+  def _scope(self, strategy):
+    """Implementation of DistributionStrategy.scope()."""
+    if distribution_strategy_context.has_distribution_strategy():
+      _require_cross_replica_context_extended(self)
+      return _SameScopeAgainContext(strategy)
+
+    def creator_with_resource_vars(*args, **kwargs):
+      _require_distribution_strategy_scope_extended(self)
+      kwargs["use_resource"] = True
+      return self._create_variable(*args, **kwargs)
+
+    def distributed_getter(getter, *args, **kwargs):
+      if not self._allow_variable_partition():
+        if kwargs.pop("partitioner", None) is not None:
+          tf_logging.log_first_n(
+              tf_logging.WARN, "Partitioned variables are disabled when using "
+              "current tf.distribute.Strategy.", 1)
+      return getter(*args, **kwargs)
+
+    return _CurrentDistributionContext(
+        strategy,
+        variable_scope.variable_creator_scope(creator_with_resource_vars),
+        variable_scope.variable_scope(
+            variable_scope.get_variable_scope(),
+            custom_getter=distributed_getter), self._default_device)
+
+  def _allow_variable_partition(self):
+    return False
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    # Note: should support "colocate_with" argument.
+    raise NotImplementedError("must be implemented in descendants")
+
+  def read_var(self, v):
+    """Reads the value of a variable.
+
+    Returns the aggregate value of a replica-local variable, or the
+    (read-only) value of any other variable.
+
+    Args:
+      v: A variable allocated within the scope of this `tf.distribute.Strategy`.
+
+    Returns:
+      A tensor representing the value of `v`, aggregated across replicas if
+      necessary.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def colocate_vars_with(self, colocate_with_variable):
+    """Scope that controls which devices variables will be created on.
+
+    No operations should be added to the graph inside this scope, it
+    should only be used when creating variables (some implementations
+    work by changing variable creation, others work by using a
+    tf.colocate_with() scope).
+
+    This may only be used inside `self.scope()`.
+
+    Example usage:
+
+    ```
+    with strategy.scope():
+      var1 = tf.get_variable(...)
+      with strategy.extended.colocate_vars_with(v1):
+        # var2 and var3 will be created on the same device(s) as var1
+        var2 = tf.get_variable(...)
+        var3 = tf.get_variable(...)
+
+      def fn(v1, v2, v3):
+        # operates on v1 from var1, v2 from var2, and v3 from var3
+
+      # `fn` runs on every device `v1` is on, `v2` and `v3` will be there too.
+      strategy.extended.update(v1, fn, args=(v2, v3))
+    ```
+
+    Args:
+      colocate_with_variable: A created in `self.scope()`. Variables created
+        while in the returned context manager will be on the same set of
+        devices as `colocate_with_variable`.
+
+    Returns:
+      A context manager.
+    """
+    def create_colocated_variable(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope_extended(self)
+      kwargs["use_resource"] = True
+      kwargs["colocate_with"] = colocate_with_variable
+      return next_creator(*args, **kwargs)
+
+    _require_distribution_strategy_scope_extended(self)
+    return variable_scope.variable_creator_scope(create_colocated_variable)
+
+  def _call_dataset_fn(self, dataset_fn):
+    """Call the `dataset_fn` with `input_context` as argument."""
+    result = dataset_fn()
+    if not isinstance(result, dataset_ops.DatasetV2):
+      raise ValueError(
+          "dataset_fn() must return a tf.data.Dataset when using a "
+          "tf.distribute.Strategy.")
+    return result
+
+  # TODO(josh11b): `PerReplicaDataset` currently only implements a few methods of
+  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
+  # Extend to implement more functionality of datasets.
+  def _distribute_dataset(self, dataset_fn):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _make_dataset_iterator(self, dataset):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _make_input_fn_iterator(self, input_fn, replication_mode):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def broadcast_to(self, tensor, destinations):
+    """Mirror a tensor on one device to all worker devices.
+
+    Args:
+      tensor: A Tensor value to broadcast.
+      destinations: A mirrored variable or device string specifying the
+        destination devices to copy `tensor` to.
+
+    Returns:
+      A value mirrored to `destinations` devices.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_replica_context_extended(self)
+    assert not isinstance(destinations, (list, tuple))
+    return self._broadcast_to(tensor, destinations)
+
+  def _broadcast_to(self, tensor, destinations):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _initialize(self):
+    return []
+
+  def _finalize(self):
+    return []
+
+  def experimental_run_steps_on_iterator(self, fn, iterator, iterations=1,
+                                         initial_loop_values=None):
+    """Run `fn` with input from `iterator` for `iterations` times.
+
+    This method can be used to run a step function for training a number of
+    times using input from a dataset.
+
+    Args:
+      fn: function to run using this distribution strategy. The function must
+        have the following signature: `def fn(context, inputs)`.
+        `context` is an instance of `MultiStepContext` that will be passed when
+        `fn` is run. `context` can be used to specify the outputs to be returned
+        from `fn` by calling `context.set_last_step_output`. It can also be used
+        to capture non tensor outputs by `context.set_non_tensor_output`.
+        See `MultiStepContext` documentation for more information.
+        `inputs` will have same type/structure as `iterator.get_next()`.
+        Typically, `fn` will use `call_for_each_replica` method of the strategy
+        to distribute the computation over multiple replicas.
+      iterator: Iterator of a dataset that represents the input for `fn`. The
+        caller is responsible for initializing the iterator as needed.
+      iterations: (Optional) Number of iterations that `fn` should be run.
+        Defaults to 1.
+      initial_loop_values: (Optional) Initial values to be passed into the
+        loop that runs `fn`. Defaults to `None`. # TODO(priyag): Remove
+        initial_loop_values argument when we have a mechanism to infer the
+        outputs of `fn`.
+
+    Returns:
+      Returns the `MultiStepContext` object which has the following properties,
+      among other things:
+        - run_op: An op that runs `fn` `iterations` times.
+        - last_step_outputs: A dictionary containing tensors set using
+        `context.set_last_step_output`. Evaluating this returns the value of
+        the tensors after the last iteration.
+        - non_tensor_outputs: A dictionatry containing anything that was set by
+          `fn` by calling `context.set_non_tensor_output`.
+    """
+    _require_cross_replica_context_extended(self)
+    return self._experimental_run_steps_on_iterator(
+        fn, iterator, iterations, initial_loop_values)
+
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def call_for_each_replica(self, fn, args=(), kwargs=None):
+    """Run `fn` once per replica.
+
+    `fn` may call `tf.get_replica_context()` to access methods such as
+    `replica_id_in_sync_group` and `merge_call()`.
+
+    `merge_call()` is used to communicate between the replicas and
+    re-enter the cross-replica context. All replicas pause their execution
+    having encountered a `merge_call()` call. After that the
+    `merge_fn`-function is executed. Its results are then unwrapped and
+    given back to each replica call. After that execution resumes until
+    `fn` is complete or encounters another `merge_call()`.  Example:
+
+    ```python
+    # Called once in "cross-replica" context.
+    def merge_fn(distribution, three_plus_replica_id):
+      # sum the values across replicas
+      return sum(distribution.unwrap(three_plus_replica_id))
+
+    # Called once per replica in `distribution`, in a "replica" context.
+    def fn(three):
+      replica_ctx = tf.get_replica_context()
+      v = three + replica_ctx.replica_id_in_sync_group
+      # Computes the sum of the `v` values across all replicas.
+      s = replica_ctx.merge_call(merge_fn, args=(v,))
+      return s + v
+
+    with distribution.scope():
+      # in "cross-replica" context
+      ...
+      merged_results = distribution.call_for_each_replica(fn, args=[3])
+      # merged_results has the values from every replica execution of `fn`.
+      print(distribution.unwrap(merged_results))  # Prints a list
+    ```
+
+    Args:
+      fn: function to run (will be run once per replica).
+      args: Tuple or list with positional arguments for `fn`.
+      kwargs: Dict with keyword arguments for `fn`.
+
+    Returns:
+      Merged return value of `fn` across all replicas.
+    """
+    _require_cross_replica_context_extended(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._call_for_each_replica(fn, args, kwargs)
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _reduce(self, reduce_op, value):
+    # Default implementation until we have an implementation for each strategy.
+    return self._unwrap(self._reduce_to(
+        reduce_op, value, device_util.current() or "/device:CPU:0"))[0]
+
+  def reduce_to(self, reduce_op, value, destinations):
+    """Combine (via e.g. sum or mean) values across replicas.
+
+    Args:
+      reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
+        DEPRECATED but still accepted values:
+        `tf.VariableAggregation.SUM`,
+        `tf.VariableAggregation.MEAN`,
+      value: A per-replica value with one value per replica.
+      destinations: A mirrored variable, a per-replica tensor, or a device
+        string. The return value will be copied to all destination devices (or
+        all the devices where the `destinations` value resides). To perform an
+        all-reduction, pass `value` to `destinations`.
+
+    Returns:
+      A value mirrored to `destinations`.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_replica_context_extended(self)
+    assert not isinstance(destinations, (list, tuple))
+
+    # TODO(priyag): Remove this when all callers have been updated.
+    if isinstance(reduce_op, variable_scope.VariableAggregation):
+      assert reduce_op in (
+          variable_scope.VariableAggregation.SUM,
+          variable_scope.VariableAggregation.MEAN,
+      )
+      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    assert (reduce_op == reduce_util.ReduceOp.SUM or
+            reduce_op == reduce_util.ReduceOp.MEAN)
+    return self._reduce_to(reduce_op, value, destinations)
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def batch_reduce_to(self, reduce_op, value_destination_pairs):
+    """Combine multiple `reduce_to` calls into one for faster execution.
+
+    Args:
+      reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
+        DEPRECATED but still accepted values:
+        `tf.VariableAggregation.SUM`,
+        `tf.VariableAggregation.MEAN`,
+      value_destination_pairs: A sequence of (value, destinations)
+        pairs. See `reduce_to()` for a description.
+
+    Returns:
+      A list of mirrored values, one per pair in `value_destination_pairs`.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_replica_context_extended(self)
+
+    # TODO(priyag): Remove this when all callers have been updated.
+    if isinstance(reduce_op, variable_scope.VariableAggregation):
+      assert reduce_op in [
+          variable_scope.VariableAggregation.SUM,
+          variable_scope.VariableAggregation.MEAN,
+      ]
+      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    return self._batch_reduce_to(reduce_op, value_destination_pairs)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
+    return [
+        self.reduce_to(reduce_op, t, destinations=v)
+        for t, v in value_destination_pairs
+    ]
+
+  def update(self, var, fn, args=(), kwargs=None, group=True):
+    """Run `fn` to update `var` using inputs mirrored to the same devices.
+
+    If `var` is mirrored across multiple devices, then this implements
+    logic like:
+
+    ```
+    results = {}
+    for device, v in var:
+      with tf.device(device):
+        # args and kwargs will be unwrapped if they are mirrored.
+        results[device] = fn(v, *args, **kwargs)
+    return merged(results)
+    ```
+
+    Otherwise this returns `fn(var, *args, **kwargs)` colocated with `var`.
+
+    Neither `args` nor `kwargs` may contain per-replica values.
+    If they contain mirrored values, they will be unwrapped before
+    calling `fn`.
+
+    Args:
+      var: Variable, possibly mirrored to multiple devices, to operate on.
+      fn: Function to call. Should take the variable as the first argument.
+      args: Tuple or list. Additional positional arguments to pass to `fn()`.
+      kwargs: Dict with keyword arguments to pass to `fn()`.
+      group: Boolean. Defaults to True. If False, the return value will be
+        unwrapped.
+
+    Returns:
+      By default, the merged return value of `fn` across all replicas.  The
+      merged result has dependencies to make sure that if it is evaluated at
+      all, the side effects (updates) will happen on every replica. If instead
+      "group=False" is specified, this function will return a nest of lists
+      where each list has an element per replica, and the caller is responsible
+      for ensuring all elements are executed.
+    """
+    _require_cross_replica_context_extended(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._update(var, fn, args, kwargs, group)
+
+  def _update(self, var, fn, args, kwargs, group):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def update_non_slot(
+      self, colocate_with, fn, args=(), kwargs=None, group=True):
+    """Runs `fn(*args, **kwargs)` on `colocate_with` devices.
+
+    Args:
+      colocate_with: The return value of `non_slot_devices()`.
+      fn: Function to execute.
+      args: Tuple or list. Positional arguments to pass to `fn()`.
+      kwargs: Dict with keyword arguments to pass to `fn()`.
+      group: Boolean. Defaults to True. If False, the return value will be
+        unwrapped.
+
+    Returns:
+      Return value of `fn`, possibly merged across devices.
+    """
+    _require_cross_replica_context_extended(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._update_non_slot(colocate_with, fn, args, kwargs, group)
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _unwrap(self, distributed_value):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def value_container(self, value):
+    """Returns the container that this per-replica `value` belongs to.
+
+    Args:
+      value: A value returned by `call_for_each_replica()` or a variable
+        created in `scope()`.
+
+    Returns:
+      A container that `value` belongs to.
+      If value does not belong to any container (including the case of
+      container having been destroyed), returns the value itself.
+      `value in unwrap(value_container(value))` will always be true.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _group(self, value, name=None):
+    """Shortcut for `tf.group(distribution.unwrap(value))`."""
+    value = nest.flatten(self._unwrap(value))
+
+    if len(value) != 1 or name is not None:
+      return control_flow_ops.group(value, name=name)
+    # Special handling for the common case of one op.
+    v, = value
+    if hasattr(v, "op"):
+      v = v.op
+    return v
+
+  @property
+  def experimental_require_static_shapes(self):
+    return self._require_static_shapes
+
+  @property
+  def _num_replicas_in_sync(self):
+    """Returns number of replicas over which gradients are aggregated."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def worker_devices(self):
+    """Returns the tuple of all devices used to for compute replica execution.
+    """
+    # TODO(josh11b): More docstring
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def parameter_devices(self):
+    """Returns the tuple of all devices used to place variables."""
+    # TODO(josh11b): More docstring
+    raise NotImplementedError("must be implemented in descendants")
+
+  def non_slot_devices(self, var_list):
+    """Device(s) for non-slot variables.
+
+    Create variables on these devices in a
+    `with colocate_vars_with(non_slot_devices(...)):` block.
+    Update those using `update_non_slot()`.
+
+    Args:
+      var_list: The list of variables being optimized, needed with the
+        default `tf.distribute.Strategy`.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def experimental_between_graph(self):
+    """Whether the strategy uses between-graph replication or not.
+
+      This is expected to return a constant value that will not be changed
+      throughout its life cycle.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    """Configures the strategy class."""
+    del session_config, cluster_spec, task_type, task_id
+
+  def _update_config_proto(self, config_proto):
+    return copy.deepcopy(config_proto)
+
+  @property
+  def experimental_should_init(self):
+    """Whether initialization is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def should_checkpoint(self):
+    """Whether checkpointing is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def should_save_summary(self):
+    """Whether saving summaries is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+
+# A note about the difference between the context managers
+# `ReplicaContext` (defined here) and `_CurrentDistributionContext`
+# (defined above) used by `DistributionStrategy.scope()`:
+#
+# * a ReplicaContext is only present during a `call_for_each_replica()`
+#   call (except during a `merge_run` call) and in such a scope it
+#   will be returned by calls to `get_replica_context()`.  Implementers of new
+#   DistributionStrategy descendants will frequently also need to
+#   define a descendant of ReplicaContext, and are responsible for
+#   entering and exiting this context.
+#
+# * DistributionStrategy.scope() sets up a variable_creator scope that
+#   changes variable creation calls (e.g. to make mirrored
+#   variables). This is intended as an outer scope that users enter once
+#   around their model creation and graph definition. There is no
+#   anticipated need to define descendants of _CurrentDistributionContext.
+#   It sets the current DistributionStrategy for purposes of
+#   `get_strategy()` and `has_strategy()`
+#   and switches the thread mode to a "cross-replica context".
+@tf_export("distribute.ReplicaContext")
+class ReplicaContext(object):
+  """`tf.distribute.Strategy` API when in a replica context.
+
+  To be used inside your replicated step function, such as in a
+  `tf.distribute.StrategyExtended.call_for_each_replica` call.
+  """
+
+  def __init__(self, strategy, replica_id_in_sync_group):
+    self._distribution_strategy = strategy
+    self._thread_context = distribution_strategy_context._InReplicaThreadMode(  # pylint: disable=protected-access
+        self)
+    self._replica_id_in_sync_group = replica_id_in_sync_group
+
+  def __enter__(self):
+    _push_per_thread_mode(self._thread_context)
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    _pop_per_thread_mode()
+
+  def merge_call(self, merge_fn, args=(), kwargs=None):
+    """Merge args across replicas and run `merge_fn` in a cross-replica context.
+
+    This allows communication and coordination when there are multiple calls
+    to a model function triggered by a call to
+    `strategy.extended.call_for_each_replica(model_fn, ...)`.
+
+    See `tf.distribute.StrategyExtended.call_for_each_replica` for an
+    explanation.
+
+    If not inside a distributed scope, this is equivalent to:
+
+    ```
+    strategy = tf.distribute.get_strategy()
+    with cross-replica-context(strategy):
+      return merge_fn(strategy, *args, **kwargs)
+    ```
+
+    Args:
+      merge_fn: function that joins arguments from threads that are given as
+        PerReplica. It accepts `tf.distribute.Strategy` object as
+        the first argument.
+      args: List or tuple with positional per-thread arguments for `merge_fn`.
+      kwargs: Dict with keyword per-thread arguments for `merge_fn`.
+
+    Returns:
+      The return value of `merge_fn`, except for `PerReplica` values which are
+      unpacked.
+    """
+    require_replica_context(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._merge_call(merge_fn, args, kwargs)
+
+  def _merge_call(self, merge_fn, args, kwargs):
+    """Default implementation for single replica."""
+    _push_per_thread_mode(  # thread-local, so not needed with multiple threads
+        distribution_strategy_context._CrossReplicaThreadMode(  # pylint: disable=protected-access
+            self._distribution_strategy))
+    try:
+      return merge_fn(self._distribution_strategy, *args, **kwargs)
+    finally:
+      _pop_per_thread_mode()
+
+  @property
+  def num_replicas_in_sync(self):
+    """Returns number of replicas over which gradients are aggregated."""
+    return self._distribution_strategy.num_replicas_in_sync
+
+  @property
+  def replica_id_in_sync_group(self):
+    """Which replica is being defined, from 0 to `num_replicas_in_sync - 1`."""
+    require_replica_context(self)
+    return self._replica_id_in_sync_group
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, use `strategy`
+  def distribution_strategy(self):
+    """DEPRECATED: use `self.stratgey` instead."""
+    return self._distribution_strategy
+
+  @property
+  def strategy(self):
+    """The current `tf.distribute.Strategy` object."""
+    return self._distribution_strategy
+
+  @property
+  def devices(self):
+    """The devices this replica is to be executed on, as a tuple of strings."""
+    require_replica_context(self)
+    return (device_util.current(),)
+
+  # TODO(josh11b): Implement `start_all_reduce(method, t)` for efficient
+  # all-reduce. It would return a function returning the result of reducing `t`
+  # across all replicas. The caller would wait to call this function until they
+  # needed the reduce result, allowing an efficient implementation:
+  # * With eager execution, the reduction could be performed asynchronously
+  #   in the background, not blocking until the result was needed.
+  # * When constructing a graph, it could batch up all reduction requests up
+  #   to that point that the first result is needed. Most likely this can be
+  #   implemented in terms of `merge_call()` and `batch_reduce_to()`.
+
+# ------------------------------------------------------------------------------
+
+
+class _DefaultDistributionStrategy(DistributionStrategy):
+  """Default `tf.distribute.Strategy` if none is explicitly selected."""
+
+  def __init__(self):
+    super(_DefaultDistributionStrategy, self).__init__(
+        _DefaultDistributionExtended(self))
+
+
+class _DefaultDistributionExtended(DistributionStrategyExtended):
+  """Implementation of _DefaultDistributionStrategy."""
+
+  def _scope(self, strategy):
+    """Context manager setting a variable creator and `self` as current."""
+    if distribution_strategy_context.has_distribution_strategy():
+      raise RuntimeError("Must not nest tf.distribute.Strategy scopes.")
+
+    def creator(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope_strategy(strategy)
+      return next_creator(*args, **kwargs)
+
+    return _CurrentDistributionContext(
+        strategy, variable_scope.variable_creator_scope(creator))
+
+  def colocate_vars_with(self, colocate_with_variable):
+    """Does not require `self.scope`."""
+    _require_distribution_strategy_scope_extended(self)
+    return ops.colocate_with(colocate_with_variable)
+
+  def _distribute_dataset(self, dataset_fn):
+    return self._call_dataset_fn(dataset_fn)
+
+  def _make_dataset_iterator(self, dataset):
+    return _DefaultDistributionExtended.DefaultInputIterator(dataset)
+
+  def _make_input_fn_iterator(self,
+                              input_fn,
+                              replication_mode=InputReplicationMode.PER_WORKER):
+    return input_fn(InputContext()).make_initializable_iterator()
+
+  def _broadcast_to(self, tensor, destinations):
+    if destinations is None:
+      return tensor
+    else:
+      raise NotImplementedError("TODO")
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    with ReplicaContext(
+        self._container_strategy(),
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)):
+      return fn(*args, **kwargs)
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    # TODO(josh11b): Use destinations?
+    del reduce_op, destinations
+    return value
+
+  def _update(self, var, fn, args, kwargs, group):
+    # The implementations of _update() and _update_non_slot() are identical
+    # except _update() passes `var` as the first argument to `fn()`.
+    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, should_group):
+    # TODO(josh11b): Figure out what we should be passing to UpdateContext()
+    # once that value is used for something.
+    with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
+      result = fn(*args, **kwargs)
+      if should_group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  def read_var(self, replica_local_var):
+    return array_ops.identity(replica_local_var)
+
+  def _unwrap(self, distributed_value):
+    return (distributed_value,)
+
+  def value_container(self, value):
+    return value
+
+  @property
+  def _num_replicas_in_sync(self):
+    return 1
+
+  @property
+  def worker_devices(self):
+    raise RuntimeError("worker_devices() method unsupported by default "
+                       "tf.distribute.Strategy.")
+
+  @property
+  def parameter_devices(self):
+    raise RuntimeError("parameter_devices() method unsupported by default "
+                       "tf.distribute.Strategy.")
+
+  def non_slot_devices(self, var_list):
+    return min(var_list, key=lambda x: x.name)
+
+  # TODO(priyag): This should inherit from `InputIterator`, once dependency
+  # issues have been resolved.
+  class DefaultInputIterator(object):
+    """Default implementation of `InputIterator` for default strategy."""
+
+    def __init__(self, dataset):
+      self._dataset = dataset
+      if eager_context.executing_eagerly():
+        self._iterator = dataset.make_one_shot_iterator()
+      else:
+        self._iterator = dataset.make_initializable_iterator()
+
+    def get_next(self):
+      return self._iterator.get_next()
+
+    def initialize(self):
+      if eager_context.executing_eagerly():
+        self._iterator = self._dataset.make_one_shot_iterator()
+        return []
+      else:
+        return [self._iterator.initializer]
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
+
+
+# ------------------------------------------------------------------------------
+# We haven't yet implemented deserialization for DistributedVariables.
+# So here we catch any attempts to deserialize variables
+# when using distribution strategies.
+# pylint: disable=protected-access
+_original_from_proto = resource_variable_ops._from_proto_fn
+
+
+def _from_proto_fn(v, import_scope=None):
+  if distribution_strategy_context.has_distribution_strategy():
+    raise NotImplementedError(
+        "Deserialization of variables is not yet supported when using a "
+        "tf.distribute.Strategy.")
+  else:
+    return _original_from_proto(v, import_scope=import_scope)
+
+resource_variable_ops._from_proto_fn = _from_proto_fn
+# pylint: enable=protected-access
+
+
+#-------------------------------------------------------------------------------
+# Shorthand for some methods from distribution_strategy_context.
+_push_per_thread_mode = distribution_strategy_context._push_per_thread_mode  # pylint: disable=protected-access
+_get_per_thread_mode = distribution_strategy_context._get_per_thread_mode  # pylint: disable=protected-access
+_pop_per_thread_mode = distribution_strategy_context._pop_per_thread_mode  # pylint: disable=protected-access
diff --git a/tensorflow/python/distribute/distribute_lib_test.py b/tensorflow/python/distribute/distribute_lib_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d63d1fe3c323ac1e98afee52cf544c7c7da5fc65
--- /dev/null
+++ b/tensorflow/python/distribute/distribute_lib_test.py
@@ -0,0 +1,179 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test DistributionStrategy, ReplicaContext, and supporting APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+
+
+class _TestReplicaContext(distribute_lib.ReplicaContext):
+
+  def merge_call(self, fn, *args, **kwargs):
+    return kwargs["test_arg"]
+
+
+def _get_test_variable(name, synchronization, aggregation):
+  return {
+      "name": name,
+      "synchronization": synchronization,
+      "aggregation": aggregation
+  }
+
+
+class _TestStrategy(distribute_lib.DistributionStrategy):
+
+  def __init__(self):
+    super(_TestStrategy, self).__init__(_TestExtended(self))
+
+
+class _TestExtended(distribute_lib.DistributionStrategyExtended):
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    with _TestReplicaContext(
+        self._container_strategy(),
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)):
+      return fn(*args, **kwargs)
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    return _get_test_variable(kwargs["name"], kwargs["synchronization"],
+                              kwargs["aggregation"])
+
+
+def _assert_in_default_state(t):
+  t.assertIs(distribution_strategy_context._get_default_replica_context(),
+             distribution_strategy_context.get_replica_context())
+  t.assertIs(None, distribution_strategy_context.get_cross_replica_context())
+  t.assertFalse(distribution_strategy_context.in_cross_replica_context())
+  t.assertIs(distribution_strategy_context._get_default_distribution_strategy(),
+             distribution_strategy_context.get_distribution_strategy())
+  t.assertFalse(distribution_strategy_context.has_distribution_strategy())
+
+
+class TestStrategyTest(test.TestCase):
+
+  def testCallForEachReplica(self):
+    _assert_in_default_state(self)
+    dist = _TestStrategy()
+
+    def run_fn():
+      replica_context = distribution_strategy_context.get_replica_context()
+      self.assertTrue(replica_context is not None)
+      self.assertIs(None,
+                    distribution_strategy_context.get_cross_replica_context())
+      self.assertFalse(distribution_strategy_context.in_cross_replica_context())
+      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_distribution_strategy())
+      self.assertEqual("foo", replica_context.merge_call(None, test_arg="foo"))
+      expected_value = _get_test_variable(
+          "bar", variable_scope.VariableSynchronization.AUTO,
+          variable_scope.VariableAggregation.NONE)
+      self.assertDictEqual(expected_value,
+                           variable_scope.variable(1.0, name="bar"))
+
+    with self.assertRaises(RuntimeError):
+      dist.extended.call_for_each_replica(run_fn)
+    with dist.scope():
+      dist.extended.call_for_each_replica(run_fn)
+    _assert_in_default_state(self)
+
+  def testScope(self):
+    _assert_in_default_state(self)
+    dist = _TestStrategy()
+    with dist.scope():
+      self.assertIs(None, distribution_strategy_context.get_replica_context())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_cross_replica_context())
+      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
+      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_distribution_strategy())
+      expected_value = _get_test_variable(
+          "baz", variable_scope.VariableSynchronization.AUTO,
+          variable_scope.VariableAggregation.NONE)
+      self.assertDictEqual(expected_value,
+                           variable_scope.variable(1.0, name="baz"))
+    _assert_in_default_state(self)
+
+  def testSettingSynchronizationAndAggregation(self):
+    _assert_in_default_state(self)
+    dist = _TestStrategy()
+    with dist.scope():
+      expected_value = _get_test_variable(
+          "baz", variable_scope.VariableSynchronization.ON_WRITE,
+          variable_scope.VariableAggregation.MEAN)
+      self.assertDictEqual(
+          expected_value,
+          variable_scope.variable(
+              1.0,
+              name="baz",
+              synchronization=variable_scope.VariableSynchronization.ON_WRITE,
+              aggregation=variable_scope.VariableAggregation.MEAN))
+    _assert_in_default_state(self)
+
+
+class DefaultDistributionStrategyTest(test.TestCase):
+
+  def testMergeCall(self):
+    _assert_in_default_state(self)
+
+    def merge_fn(dist, s):
+      self.assertIs(
+          distribution_strategy_context._get_default_distribution_strategy(),
+          dist)
+      self.assertIs(None, distribution_strategy_context.get_replica_context())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_cross_replica_context())
+      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
+      self.assertIs(dist,
+                    distribution_strategy_context.get_distribution_strategy())
+      self.assertFalse(
+          distribution_strategy_context.has_distribution_strategy())
+      return "foo_" + s
+
+    replica_ctx = distribution_strategy_context.get_replica_context()
+    self.assertIs(distribution_strategy_context._get_default_replica_context(),
+                  replica_ctx)
+    self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, args=("bar",)))
+    _assert_in_default_state(self)
+
+
+class InputContextTest(test.TestCase):
+
+  def testProperties(self):
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=2, input_pipeline_id=1, num_replicas_in_sync=6)
+    self.assertEqual(6, input_context.num_replicas_in_sync)
+    self.assertEqual(1, input_context.input_pipeline_id)
+    self.assertEqual(2, input_context.num_input_pipelines)
+
+  def testPerReplicaBatchSize(self):
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=2, input_pipeline_id=1, num_replicas_in_sync=6)
+    self.assertEqual(2, input_context.get_per_replica_batch_size(12))
+    with self.assertRaises(ValueError):
+      input_context.get_per_replica_batch_size(13)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/distribute/distribution_strategy_context.py b/tensorflow/python/distribute/distribution_strategy_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..78e096e286727664830f18ac0236c3626c5733d9
--- /dev/null
+++ b/tensorflow/python/distribute/distribution_strategy_context.py
@@ -0,0 +1,236 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility to get distribution strategy related contexts."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.util.lazy_loader import LazyLoader
+from tensorflow.python.util.tf_export import tf_export
+
+
+# There is a circular dependency between this and `distribute` module. So we
+# load it lazily to workaround this.
+distribute_lib = LazyLoader(
+    "distribute_lib", globals(),
+    "tensorflow.python.distribute.distribute_lib")
+
+# ------------------------------------------------------------------------------
+# Internal API for setting the current thread mode as being either in a
+# replica or cross-replica context for a particular distribution strategy.
+
+
+class _ThreadMode(object):
+
+  def __init__(self, dist, cross, replica):
+    self.distribution_strategy = dist
+    self.cross_replica_context = cross
+    self.replica_context = replica
+
+
+class _CrossReplicaThreadMode(_ThreadMode):
+
+  def __init__(self, distribution_strategy):
+    _ThreadMode.__init__(
+        self, distribution_strategy, distribution_strategy, None)
+
+
+class _InReplicaThreadMode(_ThreadMode):
+
+  def __init__(self, replica_ctx):
+    _ThreadMode.__init__(
+        self, replica_ctx.distribution_strategy, None, replica_ctx)
+
+
+def _push_per_thread_mode(context):
+  ops.get_default_graph()._distribution_strategy_stack.append(context)  # pylint: disable=protected-access
+
+
+def _pop_per_thread_mode():
+  ops.get_default_graph()._distribution_strategy_stack.pop(-1)  # pylint: disable=protected-access
+
+
+class _DefaultReplicaThreadMode(_ThreadMode):
+  """Type of default value returned by `_get_per_thread_mode()`.
+
+  Used when the thread-local stack is empty.
+  """
+
+  def __init__(self):
+    _ThreadMode.__init__(self, _get_default_distribution_strategy(), None,
+                         _get_default_replica_context())
+
+
+def _get_per_thread_mode():
+  try:
+    return ops.get_default_graph()._distribution_strategy_stack[-1]  # pylint: disable=protected-access
+  except (AttributeError, IndexError):
+    return _get_default_replica_mode()
+
+
+# ------------------------------------------------------------------------------
+# Public API for accessing the current thread mode
+
+
+@tf_export("distribute.get_replica_context")
+def get_replica_context():
+  """Returns the current `tf.distribute.ReplicaContext` or `None`.
+
+  Returns `None` if in a cross-replica context.
+
+  Note that execution:
+
+  1. starts in the default (single-replica) replica context (this function
+     will return the default `ReplicaContext` object);
+  2. switches to cross-replica context (in which case this will return
+     `None`) when entering a `with tf.distribute.Strategy.scope():` block;
+  3. switches to a (non-default) replica context inside
+     `extended.call_for_each_replica(fn, ...)`;
+  4. if `fn` calls `get_replica_context().merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-replica context (and again
+     this function will return `None`).
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-replica context for the default `tf.distribute.Strategy`. You may
+  also switch from the cross-replica context of 4 to a replica context by
+  calling `extended.call_for_each_replica()`, jumping back to step 3.
+
+  Most `tf.distribute.Strategy` methods may only be executed in
+  a cross-replica context, in a replica context you should use the
+  `ReplicaContext` API instead.
+
+  Returns:
+    The current `ReplicaContext` object when in a replica context scope,
+    else `None`.
+
+    Within a particular block, exactly one of these two things will be true:
+
+    * `get_replica_context()` returns non-`None`, or
+    * `tf.distribute.is_cross_replica_context()` returns True.
+  """
+  return _get_per_thread_mode().replica_context
+
+
+def get_cross_replica_context():
+  """Returns the current tf.distribute.Strategy if in a cross-replica context.
+
+  DEPRECATED: Please use `in_cross_replica_context()` and
+  `get_distribution_strategy()` instead.
+
+  Note that execution:
+
+  1. starts in the default (single-replica) replica context;
+  2. switches to cross-replica context when entering a
+     `with tf.distribute.Strategy.scope():` block;
+  3. switches to a (non-default) replica context inside
+     `call_for_each_replica(fn, ...)`;
+  4. if `fn` calls `get_replica_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-replica context.
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-replica context for the default `tf.distribute.Strategy`. You may
+  also switch from the cross-replica context of 4 to a replica context by
+  calling `call_for_each_replica()`, jumping back to step 3.
+
+  Most `tf.distribute.Strategy` methods may only be executed in
+  a cross-replica context.
+
+  Returns:
+    Returns the current `tf.distribute.Strategy` object in a cross-replica
+    context, or `None`.
+
+    Exactly one of `get_replica_context()` and `get_cross_replica_context()`
+    will return `None` in a particular block.
+  """
+  return _get_per_thread_mode().cross_replica_context
+
+
+@tf_export("distribute.in_cross_replica_context")
+def in_cross_replica_context():
+  """Returns True if in a cross-replica context.
+
+  See `tf.distribute.get_replica_context` for details.
+
+  Returns:
+    True if in a cross-replica context (`get_replica_context()` returns
+    `None`), or False if in a replica context (`get_replica_context()` returns
+    non-`None`).
+  """
+  return _get_per_thread_mode().cross_replica_context is not None
+
+
+@tf_export("distribute.get_strategy")
+def get_distribution_strategy():
+  """Returns the current `tf.distribute.Strategy` object.
+
+  Typically only used in a cross-replica context:
+
+  ```
+  if tf.distribute.in_cross_replica_context():
+    strategy = tf.distribute.get_strategy()
+    ...
+  ```
+
+  Returns:
+    A `tf.distribute.Strategy` object. Inside a
+    `with distribution_strategy.scope()` block, it returns
+    `distribution_strategy`, otherwise it returns the default
+    (single-replica) `tf.distribute.Strategy` object.
+  """
+  return _get_per_thread_mode().distribution_strategy
+
+
+@tf_export("distribute.has_strategy")
+def has_distribution_strategy():
+  """Return if there is a current non-default `tf.distribute.Strategy`.
+
+  Returns:
+    True if inside a `with strategy.scope():`.
+  """
+  return get_distribution_strategy() is not _get_default_distribution_strategy()
+
+
+# ------------------------------------------------------------------------------
+# Defaults that are used when no distribution strategy is explicitly created.
+# We create them lazily in a function so that we can workaround the circular
+# dependency on distribute_lib. See lazy loader at the top of this file.
+
+_defaults = {
+    "distribution_strategy": None,
+    "replica_context": None,
+    "replica_mode": None
+}
+
+
+def _get_default_distribution_strategy():
+  if _defaults["distribution_strategy"] is None:
+    _defaults["distribution_strategy"] = (
+        distribute_lib._DefaultDistributionStrategy())  # pylint: disable=protected-access
+  return _defaults["distribution_strategy"]
+
+
+def _get_default_replica_context():
+  if _defaults["replica_context"] is None:
+    _defaults["replica_context"] = distribute_lib.ReplicaContext(
+        _get_default_distribution_strategy(), replica_id_in_sync_group=0)
+  return _defaults["replica_context"]
+
+
+def _get_default_replica_mode():
+  if _defaults["replica_mode"] is None:
+    _defaults["replica_mode"] = _DefaultReplicaThreadMode()
+  return _defaults["replica_mode"]
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
index 0289689134e79d4c7637f544620e30e9540e6d16..7d5f231c37da41f10f945adc468f40ffd0ecc743 100644
--- a/tensorflow/python/distribute/estimator_training.py
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -240,6 +240,11 @@ def train_and_evaluate(estimator, train_spec, eval_spec, executor_cls):
       hooks = list(train_spec.hooks)
     else:
       hooks = []
+
+    # Prevent estimator.train from calling distribute coordinator again. This
+    # function calls estimator.train which will use distribute coordinator path
+    # again if `_distribute_coordinator_mode` is set.
+    local_estimator._config._distribute_coordinator_mode = None  # pylint: disable=protected-access
     local_estimator.train(
         input_fn=train_spec.input_fn,
         max_steps=train_spec.max_steps,
@@ -255,6 +260,11 @@ def train_and_evaluate(estimator, train_spec, eval_spec, executor_cls):
     logging.info('Updated config: %s', str(vars(local_estimator._config)))
     local_estimator._eval_distribution = strategy
 
+    # Prevent estimator.evaluate from calling distribute coordinator again. This
+    # function calls estimator.evaluate which will use distribute coordinator
+    # path again if `_distribute_coordinator_mode` is set.
+    local_estimator._config._distribute_coordinator_mode = None  # pylint: disable=protected-access
+
     executor = executor_cls(local_estimator, train_spec, eval_spec)
     executor._start_continuous_evaluation()
     # pylint: enable=protected-access
@@ -277,3 +287,102 @@ def train_and_evaluate(estimator, train_spec, eval_spec, executor_cls):
       mode=run_config._distribute_coordinator_mode,
       cluster_spec=cluster_spec,
       session_config=run_config.session_config)
+
+
+# TODO(yuefengz): maybe merge the following two functions?
+# pylint: disable=protected-access
+def estimator_train(estimator, train_distributed_fn, hooks):
+  """Run distribute coordinator for Estimator's `train` method."""
+  assert estimator._config._distribute_coordinator_mode
+  run_config = estimator._config
+  assert estimator._config.cluster_spec
+  cluster_spec = estimator._config.cluster_spec
+  assert estimator._config._train_distribute
+
+  if 'evaluator' in cluster_spec:
+    raise ValueError("'evaluator' job is not supported if you don't use "
+                     '`train_and_evaluate`')
+
+  if (estimator._config._distribute_coordinator_mode !=  # pylint: disable=protected-access
+      dc.CoordinatorMode.STANDALONE_CLIENT):
+    raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
+                     '`estimator.train`')
+
+  if estimator._config._train_distribute.extended.experimental_between_graph:
+    # TODO(yuefengz): remove this limitation once we figure out how to merge
+    # return values from `_worker_fn`s.
+    raise ValueError('`Estimator.train` API is not supported for %s with '
+                     '`STANDALONE_CLIENT` mode.' %
+                     estimator._config._train_distribute.__class__.__name__)
+
+  def _worker_fn(strategy):
+    """Function for worker task."""
+    local_estimator = copy.deepcopy(estimator)
+    local_estimator._config._train_distribute = strategy
+    context = dc_context.get_current_worker_context()
+    _init_run_config_from_worker_context(local_estimator._config, context)
+    logging.info('Updated config: %s', str(vars(local_estimator._config)))
+    local_estimator._train_distribution = strategy
+
+    if context.is_chief:
+      chief_hooks = hooks
+    else:
+      chief_hooks = []
+    train_distributed_fn(local_estimator, strategy, chief_hooks)
+    return local_estimator
+
+  return dc.run_distribute_coordinator(
+      _worker_fn,
+      estimator._config.train_distribute,
+      mode=run_config._distribute_coordinator_mode,
+      cluster_spec=cluster_spec,
+      session_config=run_config.session_config)
+
+
+def estimator_evaluate(estimator, evaluate_distributed_fn, hooks):
+  """Run distribute coordinator for Estimator's `evaluate` method."""
+  assert estimator._config._distribute_coordinator_mode
+  run_config = estimator._config
+  assert estimator._config.cluster_spec
+  cluster_spec = estimator._config.cluster_spec
+  assert estimator._config._eval_distribute
+
+  if 'evaluator' in cluster_spec:
+    raise ValueError("'evaluator' job is not supported if you don't use "
+                     '`train_and_evaluate`')
+
+  if (estimator._config._distribute_coordinator_mode !=
+      dc.CoordinatorMode.STANDALONE_CLIENT):
+    raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
+                     '`Estimator.evaluate`')
+
+  if estimator._config._eval_distribute.extended.experimental_between_graph:
+    # TODO(yuefengz): remove this limitation once we figure out how to merge
+    # return values from `_worker_fn`s.
+    raise ValueError('`Estimator.evaluate` API is not supported for %s with '
+                     '`STANDALONE_CLIENT` mode.' %
+                     estimator._config._eval_distribute.__class__.__name__)
+
+  def _worker_fn(strategy):
+    """Function for evaluation."""
+    local_estimator = copy.deepcopy(estimator)
+    local_estimator._config._eval_distribute = strategy
+    context = dc_context.get_current_worker_context()
+    _init_run_config_from_worker_context(local_estimator._config, context)
+    logging.info('Updated config: %s', str(vars(local_estimator._config)))
+    local_estimator._eval_distribution = strategy
+
+    if context.is_chief:
+      chief_hooks = hooks
+    else:
+      chief_hooks = []
+    return evaluate_distributed_fn(local_estimator, strategy, chief_hooks)
+
+  return dc.run_distribute_coordinator(
+      _worker_fn,
+      estimator._config.eval_distribute,
+      mode=run_config._distribute_coordinator_mode,
+      cluster_spec=cluster_spec,
+      session_config=run_config.session_config)
+
+# pylint: enable=protected-access
diff --git a/tensorflow/contrib/distribute/python/input_ops.py b/tensorflow/python/distribute/input_ops.py
similarity index 86%
rename from tensorflow/contrib/distribute/python/input_ops.py
rename to tensorflow/python/distribute/input_ops.py
index f07ec8234dfe87f2869cd7c2dd6a64c477712d15..2ded209701e74afe45fc96d66fab65b3ae250596 100644
--- a/tensorflow/contrib/distribute/python/input_ops.py
+++ b/tensorflow/python/distribute/input_ops.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.data.experimental.ops import filter_for_shard_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import ops
@@ -27,9 +29,8 @@ from tensorflow.python.platform import tf_logging
 
 # TODO(priyag): Any other reader datasets to consider here?
 _READER_DATASET_OPS = [
-    "TextLineDataset",
-    "TFRecordDataset",
-    "FixedLengthRecordDataset"
+    "TextLineDataset", "TFRecordDataset", "FixedLengthRecordDataset",
+    "FixedLengthRecordDatasetV2"
 ]
 
 
@@ -41,7 +42,8 @@ def auto_shard_dataset(dataset, num_shards, index):
     dataset: A `tf.data.Dataset` instance, typically the result of a bunch of
       dataset transformations.
     num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel. Same usage as in `Dataset.shard`.
+        shards operating in parallel. Same usage as in
+        `tf.data.experimental.filter_for_shard`.
     index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
       Same usage as in `Dataset.shard`.
 
@@ -74,11 +76,15 @@ def auto_shard_dataset(dataset, num_shards, index):
         # constructor. Eventually we will change all cases to clone datasets
         # instead of updating in-place.
         return dataset._clone(
-            filenames=dataset._filenames.shard(num_shards, index))
+            filenames=dataset._filenames.apply(
+                filter_for_shard_ops.filter_for_shard(num_shards, index)))
+      elif isinstance(dataset, dataset_ops.RangeDataset):
+        return dataset.apply(
+            filter_for_shard_ops.filter_for_shard(num_shards, index))
       elif hasattr(dataset, "_map_func"):
         # TODO(priyag): Make this check more robust by enforcing some common
         # property on all map/flatmap/interleave datasets.
-        map_func_def = dataset._map_func.definition
+        map_func_def = dataset._map_func.function.definition
         for node in map_func_def.node_def:
           if node.op in _READER_DATASET_OPS:
             found_reader_op = True
@@ -100,6 +106,11 @@ def auto_shard_dataset(dataset, num_shards, index):
               dataset._input_dataset, found_reader_op)
           return dataset
 
+    if isinstance(dataset, dataset_ops.DatasetV1Adapter):
+      dataset._dataset = _auto_shard_impl(
+          dataset._dataset, found_reader_op)
+      return dataset
+
     # TODO(priyag): Make _input_dataset(s) a common property of all datasets to
     # make this check more robust.
     if hasattr(dataset, "_input_dataset"):
@@ -135,6 +146,7 @@ def auto_shard_dataset(dataset, num_shards, index):
     # TODO(priyag): This will shard the filenames before any shuffling of the
     # filename dataset. It might be desirable to shard after shuffling
     # filenames? If so, how do we achieve that?
-    return dataset.shard(num_shards, index)
+    return dataset.apply(
+        filter_for_shard_ops.filter_for_shard(num_shards, index))
 
   return _auto_shard_impl(dataset=dataset, found_reader_op=False)
diff --git a/tensorflow/contrib/distribute/python/input_ops_test.py b/tensorflow/python/distribute/input_ops_test.py
similarity index 89%
rename from tensorflow/contrib/distribute/python/input_ops_test.py
rename to tensorflow/python/distribute/input_ops_test.py
index 559de97bb1f93f990ddaf775d9203d5a2d46aa99..dcf946ba477635cda5ee3299abf163a2bb9e5bff 100644
--- a/tensorflow/contrib/distribute/python/input_ops_test.py
+++ b/tensorflow/python/distribute/input_ops_test.py
@@ -20,10 +20,11 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.distribute.python import input_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.distribute import input_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
@@ -92,10 +93,11 @@ class AutoShardDatasetTest(test.TestCase):
     with self.cached_session() as sess:
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          self.assertAllEqual(record_fn(r, f), sess.run(next_element))
+          self.assertAllEqual(record_fn(r, f), self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testTFRecordDataset(self):
     dataset = readers.TFRecordDataset(self._createTFRecordFiles())
     dataset = input_ops.auto_shard_dataset(
@@ -103,6 +105,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._record)
 
+  @test_util.run_deprecated_v1
   def testFlatMap(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(
         self._createTFRecordFiles())
@@ -112,6 +115,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._record)
 
+  @test_util.run_deprecated_v1
   def testInterleave(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(
         self._createTFRecordFiles())
@@ -124,9 +128,10 @@ class AutoShardDatasetTest(test.TestCase):
     # contain records in order of files.
     self._verifySimpleShardingOutput(dataset, self._record)
 
+  @test_util.run_deprecated_v1
   def testListfiles(self):
     filenames = self._createTFRecordFiles()
-    file_pattern = filenames[0].rsplit("/", 1)[0] + "/tf_record.*.txt"
+    file_pattern = filenames[0].rsplit(os.sep, 1)[0] + "/tf_record.*.txt"
     dataset = dataset_ops.Dataset.list_files(file_pattern, shuffle=False)
     dataset = dataset.flat_map(readers.TFRecordDataset)
     dataset = input_ops.auto_shard_dataset(
@@ -138,12 +143,13 @@ class AutoShardDatasetTest(test.TestCase):
       actual, expected = [], []
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          actual.append(sess.run(next_element))
+          actual.append(self.evaluate(next_element))
           expected.append(self._record(r, f))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
       self.assertAllEqual(expected, actual)
 
+  @test_util.run_deprecated_v1
   def testComplexPipeline(self):
     # Setup a complex input pipeline.
     batch_size = 2
@@ -171,9 +177,9 @@ class AutoShardDatasetTest(test.TestCase):
       num_iterations = (self._num_files * self._num_records * num_epochs) // (
           self._num_shards * batch_size)
       for _ in range(num_iterations):
-        actual.extend(sess.run(next_element))
+        actual.extend(self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
       expected = []
       for f in range(0, self._num_files, self._num_shards):
@@ -183,6 +189,7 @@ class AutoShardDatasetTest(test.TestCase):
 
       self.assertAllEqual(sorted(expected), sorted(actual))
 
+  @test_util.run_deprecated_v1
   def testZip(self):
     dataset1 = readers.TFRecordDataset(self._createTFRecordFiles())
     dataset2 = readers.TextLineDataset(self._createTextFiles())
@@ -193,6 +200,7 @@ class AutoShardDatasetTest(test.TestCase):
     record_fn = lambda r, f: (self._record(r, f), self._text_line(r, f))
     self._verifySimpleShardingOutput(dataset, record_fn)
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     dataset1 = readers.TFRecordDataset(self._createTFRecordFiles())
     dataset2 = readers.TextLineDataset(self._createTextFiles())
@@ -205,13 +213,15 @@ class AutoShardDatasetTest(test.TestCase):
     with self.cached_session() as sess:
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          self.assertAllEqual(self._record(r, f), sess.run(next_element))
+          self.assertAllEqual(self._record(r, f), self.evaluate(next_element))
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          self.assertAllEqual(self._text_line(r, f), sess.run(next_element))
+          self.assertAllEqual(
+              self._text_line(r, f), self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testTextLineReader(self):
     dataset = readers.TextLineDataset(self._createTextFiles())
     dataset = input_ops.auto_shard_dataset(
@@ -219,6 +229,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._text_line)
 
+  @test_util.run_deprecated_v1
   def testTextLineReaderWithFlatMap(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(self._createTextFiles())
     dataset = dataset.flat_map(readers.TextLineDataset)
@@ -227,6 +238,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._text_line)
 
+  @test_util.run_deprecated_v1
   def testFixedLengthReader(self):
     dataset = readers.FixedLengthRecordDataset(
         self._createFixedLengthRecordFiles(), self._record_bytes)
@@ -235,6 +247,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._fixed_length_record)
 
+  @test_util.run_deprecated_v1
   def testFixedLengthReaderWithFlatMap(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(
         self._createFixedLengthRecordFiles())
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb94dfcfbd206eb81bbb76b36ded23a4f3bc2515
--- /dev/null
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -0,0 +1,919 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class MirroredStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import copy
+import functools
+import threading
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import shared_variable_creator
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import coordinator
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+@contextlib.contextmanager
+def _enter_graph(g):
+  if context.executing_eagerly():
+    with g.as_default(), context.eager_mode():
+      yield
+  else:
+    with g.as_default():
+      yield
+
+
+def _cpu_device(device):
+  cpu_device = tf_device.DeviceSpec.from_string(device)
+  cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0))
+  return cpu_device.to_string()
+
+
+class _RequestedStop(Exception):  # pylint: disable=g-bad-exception-name
+  pass
+
+
+# _call_for_each_replica and _reduce_non_distributed_value are not members of
+# MirroredStrategy so that they are generally not allowed to use anything
+# specific to MirroredStrategy and thus can be shared with other distribution
+# strategies.
+
+
+# TODO(yuefengz): maybe create a common class for those who need to call this
+# _call_for_each_replica.
+def _call_for_each_replica(distribution, fn, args, kwargs):
+  """Run `fn` in separate threads, once per replica/worker device.
+
+  Args:
+    distribution: the DistributionStrategy object.
+    fn: function to run (will be run once per device, each in its own thread).
+    args: positional arguments for `fn`
+    kwargs: keyword arguments for `fn`.
+
+  Returns:
+    Merged return value of `fn` across all replicas.
+
+  Raises:
+    RuntimeError: If fn() calls get_replica_context().merge_call() a different
+        number of times from the available devices.
+  """
+  # TODO(josh11b): Add this option once we add synchronization to variable
+  # creation. Until then, this is pretty unsafe to use.
+  run_concurrently = False
+  if not context.executing_eagerly():
+    # Needed for per-thread device, etc. contexts in graph mode.
+    ops.get_default_graph().switch_to_thread_local()
+
+  coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,))
+
+  shared_variable_store = {}
+
+  # TODO(isaprykin): Create these threads once instead of during every run()
+  # call.
+  threads = []
+  for index, d in enumerate(distribution.extended.worker_devices):
+    variable_creator_fn = shared_variable_creator.make_fn(
+        shared_variable_store, index)
+    t = MirroredExtended._MirroredReplicaThread(  # pylint: disable=protected-access
+        distribution, coord, d, variable_creator_fn, fn,
+        *values.select_device(d, args), **values.select_device(d, kwargs))
+    threads.append(t)
+
+  for t in threads:
+    t.start()
+
+  # When `fn` starts `should_run` event is set on _MirroredReplicaThread
+  # (`MRT`) threads. The execution waits until
+  # `MRT.has_paused` is set, which indicates that either `fn` is
+  # complete or a `get_replica_context().merge_call()` is called.  If `fn` is
+  # complete, then `MRT.done` is set to True.  Otherwise, arguments
+  # of `get_replica_context().merge_call` from all paused threads are grouped
+  # and the `merge_fn` is performed.  Results of the
+  # `get_replica_context().merge_call` are then set to `MRT.merge_result`.
+  # Each such `get_replica_context().merge_call` call returns the
+  # `MRT.merge_result` for that thread when `MRT.should_run` event
+  # is reset again. Execution of `fn` resumes.
+
+  try:
+    with coord.stop_on_exception():
+      all_done = False
+      while not all_done and not coord.should_stop():
+        done = []
+        if run_concurrently:
+          for t in threads:
+            t.should_run.set()
+          for t in threads:
+            t.has_paused.wait()
+            t.has_paused.clear()
+            if coord.should_stop():
+              return None
+            done.append(t.done)
+        else:
+          for t in threads:
+            t.should_run.set()
+            t.has_paused.wait()
+            t.has_paused.clear()
+            if coord.should_stop():
+              return None
+            done.append(t.done)
+        if coord.should_stop():
+          return None
+        all_done = all(done)
+        if not all_done:
+          if any(done):
+            raise RuntimeError("Some replicas made a different number of "
+                               "replica_context().merge_call() calls.")
+          # get_replica_context().merge_call() case
+          merge_args = values.regroup({t.device: t.merge_args for t in threads})
+          merge_kwargs = values.regroup(
+              {t.device: t.merge_kwargs for t in threads})
+          # We capture the name_scope of the MRT when we call merge_fn
+          # to ensure that if we have opened a name scope in the MRT,
+          # it will be respected when executing the merge function. We only
+          # capture the name_scope from the first MRT and assume it is
+          # the same for all other MRTs.
+          mtt_captured_name_scope = threads[0].captured_name_scope
+          # Capture and merge the control dependencies from all the threads.
+          mtt_captured_control_deps = set()
+          for t in threads:
+            mtt_captured_control_deps.update(t.captured_control_deps)
+          with ops.name_scope(mtt_captured_name_scope),\
+              ops.control_dependencies(mtt_captured_control_deps):
+            merge_result = threads[0].merge_fn(distribution, *merge_args,
+                                               **merge_kwargs)
+          for t in threads:
+            t.merge_result = values.select_device(t.device, merge_result)
+  finally:
+    for t in threads:
+      t.should_run.set()
+    coord.join(threads)
+
+  return values.regroup({t.device: t.main_result for t in threads})
+
+
+def _reduce_non_distributed_value(extended, reduce_op, value, destinations):
+  """Reduce a non-DistributedValue `value` to `destinations`."""
+  if isinstance(value, values.DistributedValues):
+    raise ValueError("You are passing a `DistributedValue` to "
+                     "`_reduce_non_distributed_value`, which is not allowed.")
+
+  # If the same value is present on all replicas then the PerReplica value will
+  # be a single value. We also handle the case when `value` is a single value
+  # and equal to 0.
+  if value == 0:
+    return 0
+  # If there is only a single value and the reduce op is MEAN,
+  # that value should be on all destinations.
+  if reduce_op == reduce_util.ReduceOp.MEAN:
+    return value
+
+  cross_device_ops_lib.validate_destinations(destinations)
+  # We do not support a reduce op of SUM if the value is the same across
+  # all replicas. We call this as part of assign functions for MirroredVariables
+  # and summing up identical values across replicas is not clearly defined.
+  if (len(extended.worker_devices) != 1 or
+      not cross_device_ops_lib.check_destinations(destinations)):
+    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
+                     "the given reduce op %s." % (value, reduce_op))
+  # TODO(anjalisridhar): Moves these methods to a device utility file?
+  devices = cross_device_ops_lib.get_devices_from(destinations)
+  if len(devices) == 1:
+    with ops.device(devices[0]):
+      return array_ops.identity(value)
+  else:
+    value_updates = {}
+    for d in devices:
+      with ops.device(d):
+        value_updates[d] = array_ops.identity(value)
+    return values.Mirrored(value_updates)
+
+
+def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):  # pylint: disable=g-missing-docstring
+  # Figure out what collections this variable should be added to.
+  # We'll add the MirroredVariable to those collections instead.
+  collections = kwargs.pop("collections", None)
+  if collections is None:
+    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+  kwargs["collections"] = []
+
+  # Get synchronization value
+  synchronization = kwargs.get("synchronization",
+                               variable_scope.VariableSynchronization.ON_WRITE)
+  if synchronization == variable_scope.VariableSynchronization.NONE:
+    raise ValueError("`NONE` variable synchronization mode is not "
+                     "supported with `Mirrored` distribution strategy. Please"
+                     " change the `synchronization` for variable: " +
+                     kwargs["name"])
+  elif synchronization == variable_scope.VariableSynchronization.ON_READ:
+    # Variables that are to be synced on read are replica local.
+    is_replica_local = True
+    kwargs["trainable"] = False
+  elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or
+        synchronization == variable_scope.VariableSynchronization.AUTO):
+    # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`.
+    is_replica_local = False
+  else:
+    raise ValueError("Invalid variable synchronization mode: " +
+                     synchronization + " for variable: " + kwargs["name"])
+
+  # Get aggregation value
+  aggregation = kwargs.pop("aggregation",
+                           variable_scope.VariableAggregation.NONE)
+  if aggregation not in (
+      variable_scope.VariableAggregation.NONE,
+      variable_scope.VariableAggregation.SUM,
+      variable_scope.VariableAggregation.MEAN,
+      variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
+  ):
+    raise ValueError("Invalid variable aggregation mode: " + aggregation +
+                     " for variable: " + kwargs["name"])
+
+  # Ignore user-specified caching device, not needed for mirrored variables.
+  kwargs.pop("caching_device", None)
+
+  # TODO(josh11b,apassos): It would be better if variable initialization
+  # was never recorded on the tape instead of having to do this manually
+  # here.
+  with tape.stop_recording():
+    index = real_mirrored_creator(devices, *args, **kwargs)
+
+    if is_replica_local:
+      result = values.ReplicaLocalVariable(
+          index, index[devices[0]], aggregation)
+    else:
+      result = values.MirroredVariable(index, index[devices[0]], aggregation)
+
+  # Add the wrapped variable to the requested collections.
+  # The handling of eager mode and the global step matches
+  # ResourceVariable._init_from_args().
+  if not context.executing_eagerly():
+    g = ops.get_default_graph()
+    # If "trainable" is True, next_creator() will add the member variables
+    # to the TRAINABLE_VARIABLES collection, so we manually remove
+    # them and replace with the MirroredVariable. We can't set
+    # "trainable" to False for next_creator() since that causes functions
+    # like implicit_gradients to skip those variables.
+    if kwargs.get("trainable", True):
+      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+      l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+      for v in index.values():
+        if v in l:
+          l.remove(v)
+    g.add_to_collections(collections, result)
+  elif ops.GraphKeys.GLOBAL_STEP in collections:
+    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, result)
+
+  return result
+
+
+def _is_device_list_local(devices):
+  """Checks whether the devices list is for local or multi-worker.
+
+  Args:
+    devices: a list of device strings, either local for remote devices.
+
+  Returns:
+    a boolean indicating whether these device strings are for local or for
+    remote.
+
+  Raises:
+    ValueError: if device strings are not consistent.
+  """
+  all_local = None
+  for d in devices:
+    d_spec = tf_device.DeviceSpec().parse_from_string(d)
+    is_local = d_spec.job in (None, "localhost")
+
+    if all_local is None:  # Determine all_local from first device.
+      all_local = is_local
+
+    if all_local:
+      if not is_local:
+        raise ValueError("Local device string cannot have job specified other "
+                         "than 'localhost'")
+    else:
+      if is_local:
+        raise ValueError("Remote device string must have job specified.")
+      if d_spec.task is None:
+        raise ValueError("Remote device string must have task specified.")
+  return all_local
+
+
+def _cluster_spec_to_device_list(cluster_spec, num_gpus_per_worker):
+  """Returns a device list given a cluster spec."""
+  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
+  devices = []
+  for task_type in ("chief", "worker"):
+    for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
+      if num_gpus_per_worker is 0:
+        devices.append("/job:%s/task:%d" % (task_type, task_id))
+      else:
+        devices.extend([
+            "/job:%s/task:%d/device:GPU:%i" % (task_type, task_id, gpu_id)
+            for gpu_id in range(num_gpus_per_worker)
+        ])
+  return devices
+
+
+def _group_device_list(devices):
+  """Groups the devices list by task_type and task_id.
+
+  Args:
+    devices: a list of device strings for remote devices.
+
+  Returns:
+    a dict of list of device strings mapping from task_type to a list of devices
+    for the task_type in the asceding order of task_id.
+  """
+  assert not _is_device_list_local(devices)
+  device_dict = {}
+
+  for d in devices:
+    d_spec = tf_device.DeviceSpec().parse_from_string(d)
+
+    # Create an entry for the task_type.
+    if d_spec.job not in device_dict:
+      device_dict[d_spec.job] = []
+
+    # Fill the device list for task_type until it covers the task_id.
+    while len(device_dict[d_spec.job]) <= d_spec.task:
+      device_dict[d_spec.job].append([])
+
+    device_dict[d_spec.job][d_spec.task].append(d)
+
+  return device_dict
+
+
+def _infer_num_gpus_per_worker(devices):
+  """Infers the number of GPUs on each worker.
+
+  Currently to make multi-worker cross device ops work, we need all workers to
+  have the same number of GPUs.
+
+  Args:
+    devices: a list of device strings, can be either local devices or remote
+      devices.
+
+  Returns:
+    number of GPUs per worker.
+
+  Raises:
+    ValueError if workers have different number of GPUs or GPU indices are not
+    consecutive and starting from 0.
+  """
+  if _is_device_list_local(devices):
+    return len([d for d in devices if "GPU" in d.upper()])
+  else:
+    device_dict = _group_device_list(devices)
+    num_gpus = None
+    for _, devices_in_task in device_dict.items():
+      for device_in_task in devices_in_task:
+        if num_gpus is None:
+          num_gpus = len([d for d in device_in_task if "GPU" in d.upper()])
+
+        # Verify other workers have the same number of GPUs.
+        elif (
+            num_gpus != len([d for d in device_in_task if "GPU" in d.upper()])):
+          raise ValueError("All workers should have the same number of GPUs.")
+
+        for d in device_in_task:
+          d_spec = tf_device.DeviceSpec().parse_from_string(d)
+          if (d_spec.device_type.upper() == "GPU" and
+              d_spec.device_index >= num_gpus):
+            raise ValueError("Device_index on a worker should be consecutive "
+                             "and start from 0.")
+    return num_gpus
+
+
+def all_local_devices(num_gpus=None):
+  if num_gpus is None:
+    num_gpus = context.num_gpus()
+  return (tuple("/device:GPU:%d" % i for i in range(num_gpus)) or
+          ("/device:CPU:0",))
+
+
+@tf_export("distribute.MirroredStrategy")
+class MirroredStrategy(distribute_lib.DistributionStrategy):
+  """Mirrors vars to distribute across multiple devices and machines.
+
+  This strategy uses one replica per device and sync replication for its
+  multi-GPU version.
+
+  The multi-worker version will be added in the fture.
+
+  Args:
+    devices: a list of device strings.
+    cross_device_ops: optional, a descedant of `CrossDeviceOps`. If this is not
+      set, nccl will be use by default.
+  """
+
+  def __init__(self, devices=None, cross_device_ops=None):
+    extended = MirroredExtended(
+        self, devices=devices, cross_device_ops=cross_device_ops)
+    super(MirroredStrategy, self).__init__(extended)
+
+
+class MirroredExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of MirroredStrategy."""
+
+  def __init__(self, container_strategy, devices=None, cross_device_ops=None):
+    super(MirroredExtended, self).__init__(container_strategy)
+    if devices is None:
+      devices = all_local_devices()
+    if not devices:
+      raise ValueError("Got an empty `devices` list. Please make sure the "
+                       "`devices` you pass in is not empty.")
+    self._cross_device_ops = cross_device_ops
+    self._initialize_strategy(devices)
+
+  def _initialize_strategy(self, devices):
+    # The _initialize_strategy method is intended to be used by distribute
+    # coordinator as well.
+    if _is_device_list_local(devices):
+      self._initialize_local(devices)
+    else:
+      self._initialize_multi_worker(devices)
+
+  def _initialize_local(self, devices):
+    """Initializes the object for local training."""
+    self._local_mode = True
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = tuple(device_util.resolve(d) for d in devices)
+    self._canonical_device_set = set(self._devices)
+    self._device_index = values.PerReplica(
+        {d: i for i, d in enumerate(devices)})
+
+    self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
+        devices)
+
+  def _initialize_multi_worker(self, devices):
+    """Initializes the object for multi-worker training."""
+    self._local_mode = False
+
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = tuple(device_util.resolve(d) for d in devices)
+    self._canonical_device_set = set(self._devices)
+    self._device_index = values.PerReplica(
+        {d: i for i, d in enumerate(devices)})
+
+    device_dict = _group_device_list(devices)
+    self._workers = []
+    self._worker_devices = []
+    for job in ["chief", "worker"]:
+      for task in range(len(device_dict.get(job, []))):
+        worker = "/job:%s/task:%d" % (job, task)
+        self._workers.append(worker)
+        self._worker_devices.append((worker, device_dict[job][task]))
+
+    # Setting `_default_device` will add a device scope in the
+    # distribution.scope. We set the default device to the first worker. When
+    # users specify device under distribution.scope by
+    #   with tf.device("/cpu:0"):
+    #     ...
+    # their ops will end up on the cpu device of its first worker, e.g.
+    # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
+    self._default_device = self._workers[0]
+
+    self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
+        self._workers, _infer_num_gpus_per_worker(self._devices))
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    """Create a mirrored variable. See `DistributionStrategy.scope`."""
+    colocate_with = kwargs.pop("colocate_with", None)
+    devices = self._get_devices_from(colocate_with)
+
+    def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
+      index = {}
+      for i, d in enumerate(devices):
+        with ops.init_scope(), ops.device(d):
+          if i > 0:
+            # Give replicas meaningful distinct names:
+            var0name = index[devices[0]].name.split(":")[0]
+            # We append a / to variable names created on replicas with id > 0 to
+            # ensure that we ignore the name scope and instead use the given
+            # name as the absolute name of the variable.
+            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
+            # Initialize replicas with the same value:
+            def initial_value_fn(device=d):
+              if context.executing_eagerly():
+                init_value = index[devices[0]].value()
+                return array_ops.identity(init_value)
+              else:
+                with ops.device(device):
+                  init_value = index[devices[0]].initial_value
+                  return array_ops.identity(init_value)
+            kwargs["initial_value"] = initial_value_fn
+          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+            # Don't record operations (e.g. other variable reads) during
+            # variable creation.
+            with tape.stop_recording():
+              v = next_creator(*args, **kwargs)
+          assert not isinstance(v, values.DistributedVariable)
+          index[d] = v
+      return index
+
+    return _create_mirrored_variable(devices, _real_mirrored_creator, *args,
+                                     **kwargs)
+
+  def _distribute_dataset(self, dataset_fn):
+    if self._local_mode:
+      return values.PerReplicaDataset(
+          self._call_dataset_fn(dataset_fn), self._devices)
+    else:
+      return values.MultiWorkerDataset(
+          functools.partial(self._call_dataset_fn, dataset_fn),
+          self._worker_devices,
+          auto_shard=False)
+
+  def _make_dataset_iterator(self, dataset):
+    if self._local_mode:
+      worker = device_util.canonicalize("/device:CPU:0")
+      worker_device_pairs = [(worker, self._devices)]
+    else:
+      worker_device_pairs = self._worker_devices
+
+    return values.DatasetIterator(dataset, worker_device_pairs,
+                                  self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    input_contexts = []
+    if self._local_mode:
+      num_workers = 1
+      worker = device_util.canonicalize("/device:CPU:0")
+      worker_device_pairs = [(worker, self._devices)]
+    else:
+      num_workers = len(self._worker_devices)
+      worker_device_pairs = self._worker_devices
+
+    for i in range(num_workers):
+      input_contexts.append(distribute_lib.InputContext(
+          num_input_pipelines=num_workers,
+          input_pipeline_id=i,
+          num_replicas_in_sync=self._num_replicas_in_sync))
+    return values.InputFunctionIterator(
+        input_fn, worker_device_pairs, input_contexts)
+
+  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values=None):
+    if initial_loop_values is None:
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+
+    ctx = values.MultiStepContext()
+    def body(i, *args):
+      """A wrapper around `fn` to create the while loop body."""
+      del args
+      fn_inputs = iterator.get_next()
+      if not isinstance(fn_inputs, tuple):
+        fn_inputs = (fn_inputs,)
+      fn_result = fn(ctx, fn_inputs)
+      for (name, output) in ctx.last_step_outputs.items():
+        # Convert all outputs to tensors, potentially from `DistributedValues`.
+        ctx.last_step_outputs[name] = self._unwrap(output)
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      with ops.control_dependencies([fn_result]):
+        return [i + 1] + flat_last_step_outputs
+
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop. This is useful in cases where we might need to exit
+    # these contexts and get back to the outer context to do some things, for
+    # e.g. create an op which should be evaluated only once at the end of the
+    # loop on the host. One such usage is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    cond = lambda i, *args: i < iterations
+    i = constant_op.constant(0)
+    loop_result = control_flow_ops.while_loop(
+        cond, body, [i] + initial_loop_values, name="",
+        parallel_iterations=1, back_prop=False, swap_memory=False,
+        return_same_structure=True)
+    del self._outer_control_flow_context
+
+    ctx.run_op = control_flow_ops.group(loop_result)
+
+    # Convert the last_step_outputs from a list to the original dict structure
+    # of last_step_outputs.
+    last_step_tensor_outputs = loop_result[1:]
+    last_step_tensor_outputs_dict = nest.pack_sequence_as(
+        ctx.last_step_outputs, last_step_tensor_outputs)
+
+    for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
+      output = last_step_tensor_outputs_dict[name]
+      # For outputs that have already been reduced, wrap them in a Mirrored
+      # container, else in a PerReplica container.
+      if reduce_op is None:
+        last_step_tensor_outputs_dict[name] = values.regroup(
+            {d: t for d, t in zip(self._devices, output)}, values.PerReplica)
+      else:
+        assert len(output) == 1
+        last_step_tensor_outputs_dict[name] = output[0]
+
+    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
+    return ctx
+
+  def _broadcast_to(self, tensor, destinations):
+    # This is both a fast path for Python constants, and a way to delay
+    # converting Python values to a tensor until we know what type it
+    # should be converted to. Otherwise we have trouble with:
+    #   global_step.assign_add(1)
+    # since the `1` gets broadcast as an int32 but global_step is int64.
+    if isinstance(tensor, (float, int)):
+      return tensor
+    # TODO(josh11b): In eager mode, use one thread per device, or async mode.
+    return self._get_cross_device_ops().broadcast(
+        tensor, destinations or self._devices)
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    return _call_for_each_replica(self._container_strategy(), fn, args, kwargs)
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    del task_type, task_id
+
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+    if cluster_spec:
+      # TODO(yuefengz): remove the following code once cluster_resolver is
+      # added.
+      num_gpus_per_worker = _infer_num_gpus_per_worker(self._devices)
+      multi_worker_devices = _cluster_spec_to_device_list(
+          cluster_spec, num_gpus_per_worker)
+      self._initialize_multi_worker(multi_worker_devices)
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    updated_config.isolate_session_state = True
+    return updated_config
+
+  def _get_cross_device_ops(self):
+    return self._cross_device_ops or self._inferred_cross_device_ops
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    if (isinstance(value, values.Mirrored) and
+        reduce_op == reduce_util.ReduceOp.MEAN):
+      return value
+    assert not isinstance(value, values.Mirrored)
+    if not isinstance(value, values.DistributedValues):
+      # This function handles reducing values that are not PerReplica or
+      # Mirrored values. For example, the same value could be present on all
+      # replicas in which case `value` would be a single value or value could
+      # be 0.
+      return _reduce_non_distributed_value(self, reduce_op, value,
+                                           destinations)
+    return self._get_cross_device_ops().reduce(
+        reduce_op, value, destinations=destinations)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
+    return self._get_cross_device_ops().batch_reduce(reduce_op,
+                                                     value_destination_pairs)
+
+  def _update(self, var, fn, args, kwargs, group):
+    # TODO(josh11b): In eager mode, use one thread per device.
+    assert isinstance(var, values.DistributedVariable)
+    updates = {}
+    for d, v in var._index.items():  # pylint: disable=protected-access
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        # If args and kwargs are not mirrored, the value is returned as is.
+        updates[d] = fn(v,
+                        *values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.update_regroup(self, updates, group)
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    assert isinstance(colocate_with, tuple)
+    # TODO(josh11b): In eager mode, use one thread per device.
+    updates = {}
+    for d in colocate_with:
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        updates[d] = fn(*values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.update_regroup(self, updates, group)
+
+  def read_var(self, replica_local_var):
+    """Read the aggregate value of a replica-local variable."""
+    if isinstance(replica_local_var, values.ReplicaLocalVariable):
+      return replica_local_var._get_cross_replica()  # pylint: disable=protected-access
+    assert isinstance(replica_local_var, values.Mirrored)
+    return array_ops.identity(replica_local_var.get())
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      # Return in a deterministic order.
+      if set(val.devices) == self._canonical_device_set:
+        return tuple(val.get(device=d) for d in self._devices)
+      return tuple(val.get(device=d) for d in sorted(val.devices))
+    return (val,)
+
+  def value_container(self, val):
+    return values.value_container(val)
+
+  @property
+  def _num_replicas_in_sync(self):
+    return len(self._devices)
+
+  @property
+  def worker_devices(self):
+    return self._devices
+
+  @property
+  def parameter_devices(self):
+    return self._devices
+
+  @property
+  def experimental_between_graph(self):
+    return False
+
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
+
+  @property
+  def should_save_summary(self):
+    return True
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return tuple(self._devices)
+
+  def _get_devices_from(self, colocate_with=None):
+    if colocate_with is None:
+      return self._devices
+    else:
+      return cross_device_ops_lib.get_devices_from(colocate_with)
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
+
+  class _MirroredReplicaThread(threading.Thread):
+    """A thread that runs() a function on a device."""
+
+    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
+                 **kwargs):
+      super(MirroredExtended._MirroredReplicaThread, self).__init__()  # pylint: disable=protected-access
+      self.coord = coord
+      self.distribution = dist
+      self.device = device
+      self.replica_id = dist.extended.worker_devices.index(device)
+      self.variable_creator_fn = variable_creator_fn
+      # State needed to run and return the results of `fn`.
+      self.main_fn = fn
+      self.main_args = args
+      self.main_kwargs = kwargs
+      self.main_result = None
+      self.done = False
+      # State needed to run the next merge_call() (if any) requested via
+      # ReplicaContext.
+      self.merge_fn = None
+      self.merge_args = None
+      self.merge_kwargs = None
+      self.merge_result = None
+      self.captured_name_scope = None
+      # We use a thread.Event for the main thread to signal when this
+      # thread should start running (`should_run`), and another for
+      # this thread to transfer control back to the main thread
+      # (`has_paused`, either when it gets to a
+      # `get_replica_context().merge_call` or when `fn` returns). In
+      # either case the event starts cleared, is signaled by calling
+      # set(). The receiving thread waits for the signal by calling
+      # wait() and then immediately clearing the event using clear().
+      self.should_run = threading.Event()
+      self.has_paused = threading.Event()
+      # These fields have to do with inheriting various contexts from the
+      # parent thread:
+      # pylint: disable=protected-access
+      self.context_mode = context.context()._eager_context.mode
+      if not context.context()._context_handle:
+        context.context()._initialize_handle_and_devices()
+      self.context_device_policy = (
+          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
+              context.context()._context_handle))
+      self.graph = ops.get_default_graph()
+      self._variable_creator_stack = self.graph._variable_creator_stack[:]
+      self._captured_var_scope = variable_scope.get_variable_scope()
+      # Adding a "/" at end lets us re-enter this scope later.
+      self._name_scope = self.graph.get_name_scope()
+      if self._name_scope:
+        self._name_scope += "/"
+      if self.replica_id > 0:
+        if not self._name_scope:
+          self._name_scope = ""
+        self._name_scope += "replica_%d/" % self.replica_id
+
+    def run(self):
+      # pylint: disable=protected-access
+      self.graph._variable_creator_stack = self._variable_creator_stack
+      self.should_run.wait()
+      self.should_run.clear()
+      try:
+        if self.coord.should_stop():
+          return
+        with self.coord.stop_on_exception(), \
+            context.context()._mode(self.context_mode), \
+            context.context().device_policy(self.context_device_policy), \
+            _enter_graph(self.graph), \
+            MirroredReplicaContext(self.distribution, constant_op.constant(
+                self.replica_id, dtypes.int32)), \
+            ops.device(self.device), \
+            ops.name_scope(self._name_scope), \
+            variable_scope.variable_scope(
+                self._captured_var_scope, reuse=self.replica_id > 0), \
+            variable_scope.variable_creator_scope(self.variable_creator_fn):
+          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
+          self.done = True
+      finally:
+        self.has_paused.set()
+
+
+class MirroredReplicaContext(distribute_lib.ReplicaContext):
+  """ReplicaContext used in MirroredStrategy.call_for_each_replica().
+
+  Opened in `_MirroredReplicaThread`, to allow the user to invoke
+  `MirroredStrategy`'s specific implementation of `merge_call()`,
+  which works by delegating the function and its arguments to
+  the main thread (the one that invoked
+  `MirroredStrategy.call_for_each_replica()`).
+  """
+
+  def _merge_call(self, fn, args, kwargs):
+    """Delegate to the main thread to actually perform merge_call()."""
+    t = threading.current_thread()  # a _MirroredReplicaThread
+    t.merge_fn = fn
+    t.merge_args = args
+    t.merge_kwargs = kwargs
+    t.captured_name_scope = t.graph.get_name_scope()
+    # Adding a "/" at end lets us re-enter this scope later.
+    if t.captured_name_scope:
+      t.captured_name_scope += "/"
+
+    t.captured_control_deps = t.graph._current_control_dependencies()  # pylint: disable=protected-access
+    t.has_paused.set()
+    t.should_run.wait()
+    t.should_run.clear()
+    if t.coord.should_stop():
+      raise _RequestedStop()
+    return t.merge_result
+
+  @property
+  def devices(self):
+    distribute_lib.require_replica_context(self)
+    replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
+    return [self._distribution_strategy.extended.worker_devices[replica_id]]
diff --git a/tensorflow/python/distribute/multi_worker_util.py b/tensorflow/python/distribute/multi_worker_util.py
index 360733eff64606db2c4bde1a83351fb414ff2068..2986a6726a5bc2c837a554892f5aebd09da43c91 100644
--- a/tensorflow/python/distribute/multi_worker_util.py
+++ b/tensorflow/python/distribute/multi_worker_util.py
@@ -45,6 +45,33 @@ def normalize_cluster_spec(cluster_spec):
   return cluster_spec
 
 
+# TODO(yuefengz): add more validations.
+def _validate_cluster_spec(cluster_spec, task_type, task_id):
+  """Validates `cluster_spec`.
+
+  It checks
+  1) whether there is such a task type as `task_type` in the
+  `cluster_spec`.
+  2) whether there is at most one "chief" job.
+  3) whether the `task_id` is smaller than the number of `task_type`.
+
+  Args:
+    cluster_spec: a dict, `ClusterDef` or `ClusterSpec` object to be validated.
+    task_type: string indicating the type of the task.
+    task_id: task_id: the id of the `task_type` in this cluster.
+  Throws:
+    ValueError: if `cluster_spec` fails any check.
+  """
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
+  if task_type and task_type not in cluster_spec:
+    raise ValueError("`task_type` %r not found in cluster_spec." % task_type)
+  if len(cluster_spec.get("chief", [])) > 1:
+    raise ValueError("There must be at most one 'chief' job.")
+  if task_id >= len(cluster_spec[task_type]):
+    raise ValueError(
+        "The `task_id` %d exceeds the maximum id of %s." % (task_id, task_type))
+
+
 def is_chief(cluster_spec, task_type, task_id):
   """Returns whether the given task is chief in the cluster.
 
@@ -61,20 +88,73 @@ def is_chief(cluster_spec, task_type, task_id):
     ValueError: if `task_type` is not in the `cluster_spec` or `task_id` exceeds
       the maximum id of the `task_type`.
   """
-  cluster_spec = normalize_cluster_spec(cluster_spec)
-  if task_type not in cluster_spec.jobs:
-    raise ValueError(
-        "The task_type \"%s\" is not in the `cluster_spec`." % task_type)
-  if task_id >= cluster_spec.num_tasks(task_type):
-    raise ValueError("The `task_id` %d exceeds the maximum id of %s." % (
-        task_id, task_type))
+  _validate_cluster_spec(cluster_spec, task_type, task_id)
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
 
   if task_type == "chief":
     return True
 
   # If chief not in the cluster_spec, use the first worker as chief. This is
   # common in CollectiveAllReduceStrategy.
-  if ("chief" not in cluster_spec.jobs and task_type == "worker" and
-      task_id == 0):
+  if ("chief" not in cluster_spec and task_type == "worker" and task_id == 0):
     return True
   return False
+
+
+def worker_count(cluster_spec, task_type):
+  """Returns the number of workers in the cluster."""
+  _validate_cluster_spec(cluster_spec, task_type, task_id=0)
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
+
+  # Other jobs such as "ps" shouldn't call this function.
+  if task_type not in ["chief", "worker", "evaluator"]:
+    raise ValueError("Unexpected `task_type` %r" % task_type)
+
+  if task_type == "evaluator":
+    # The "evaluator" is in its own cluster or its own partition of a cluster.
+    # So we don't have to count "chief" or "worker" if the current task is an
+    # "evaluator".
+    return len(cluster_spec["evaluator"])
+  else:
+    # In the non-evaluator case, we return the total number of "chief" and
+    # "worker" tasks as the "chief" is also a worker.
+    return (len(cluster_spec.get("chief", [])) + len(
+        cluster_spec.get("worker", [])))
+
+
+def id_in_cluster(cluster_spec, task_type, task_id):
+  """Returns a unique id for the task in the `task_type`'s cluster.
+
+  It returns an id ranging from [0, `worker_count(task_type, task_id)`).
+
+  Note: this function assumes that "evaluate" job is in its own cluster or its
+  own partition of a cluster.
+
+  Args:
+    cluster_spec: a dict, `ClusterDef` or `ClusterSpec` object to be validated.
+    task_type: string indicating the type of the task.
+    task_id: the id of the `task_type` in this cluster.
+
+  Returns:
+    an int indicating the unique id.
+
+  Throws:
+    ValueError: if `task_type` is not "chief", "worker" or "evaluator".
+  """
+  _validate_cluster_spec(cluster_spec, task_type, task_id)
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
+
+  # The "chief" job has always id 0 and there is at most one and "worker" jobs
+  # come after it.
+  if task_type == "chief":
+    return 0
+
+  if task_type == "worker":
+    return task_id + len(cluster_spec.get("chief", []))
+
+  # The "evaluator" is in its own cluster or its own partition of a cluster.
+  if task_type == "evaluator":
+    return task_id
+
+  # We currently don't assign ids to other tasks.
+  raise ValueError("There is no id for task_type %r" % task_type)
diff --git a/tensorflow/python/distribute/multi_worker_util_test.py b/tensorflow/python/distribute/multi_worker_util_test.py
index bdc49725c7751873bed665abd3b24b1722b00525..9e1596eefdf6ee83c3b31ef2ccbf1d0637a6027e 100644
--- a/tensorflow/python/distribute/multi_worker_util_test.py
+++ b/tensorflow/python/distribute/multi_worker_util_test.py
@@ -95,7 +95,7 @@ class IsChiefTest(test.TestCase):
     self.assertFalse(multi_worker_util.is_chief(cluster_spec, "worker", 1))
 
     with self.assertRaisesRegexp(
-        ValueError, "The task_type \"chief\" is not in the `cluster_spec`."):
+        ValueError, "`task_type` 'chief' not found in cluster_spec."):
       multi_worker_util.is_chief(cluster_spec, "chief", 0)
 
     with self.assertRaisesRegexp(
@@ -103,5 +103,94 @@ class IsChiefTest(test.TestCase):
       multi_worker_util.is_chief(cluster_spec, "worker", 2)
 
 
+class NumWorkersTest(test.TestCase):
+
+  def testCountWorker(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.worker_count(cluster_spec, task_type="chief"), 3)
+    self.assertEqual(
+        multi_worker_util.worker_count(cluster_spec, task_type="worker"), 3)
+
+  def testCountEvaluator(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "evaluator": ["127.0.0.1:7566"]
+    }
+    self.assertEqual(
+        multi_worker_util.worker_count(cluster_spec, task_type="evaluator"), 1)
+
+  def testTaskTypeNotFound(self):
+    cluster_spec = {}
+    with self.assertRaisesRegexp(
+        ValueError, "`task_type` 'worker' not found in cluster_spec."):
+      multi_worker_util.worker_count(cluster_spec, task_type="worker")
+
+  def testCountPs(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    # A "ps" job shouldn't call this method.
+    with self.assertRaisesRegexp(ValueError, "Unexpected `task_type` 'ps'"):
+      multi_worker_util.worker_count(cluster_spec, task_type="ps")
+
+
+class IdInClusterTest(test.TestCase):
+
+  def testChiefId(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "chief", 0), 0)
+
+  def testWorkerId(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "worker", 1), 2)
+
+    cluster_spec = {
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "worker", 1), 1)
+
+  def testEvaluatorId(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "evaluator": ["127.0.0.1:7566"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "evaluator", 0), 0)
+
+  def testPsId(self):
+    cluster_spec = {"chief": ["127.0.0.1:1234"], "ps": ["127.0.0.1:7566"]}
+    with self.assertRaisesRegexp(ValueError,
+                                 "There is no id for task_type 'ps'"):
+      multi_worker_util.id_in_cluster(cluster_spec, "ps", 0)
+
+  def testMultipleChiefs(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:8258", "127.0.0.1:7566"],
+    }
+    with self.assertRaisesRegexp(ValueError,
+                                 "There must be at most one 'chief' job."):
+      multi_worker_util.id_in_cluster(cluster_spec, "chief", 0)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/distribute/reduce_util.py b/tensorflow/python/distribute/reduce_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b2a4e9dba81e38e6bb3ea970e390628fe3cb540
--- /dev/null
+++ b/tensorflow/python/distribute/reduce_util.py
@@ -0,0 +1,53 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilites for reduce operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import enum
+
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("distribute.ReduceOp")
+class ReduceOp(enum.Enum):
+  """Indicates how a set of values should be reduced.
+
+  * `SUM`: Add all the values.
+  * `MEAN`: Take the arithmetic mean ("average") of the values.
+
+  TODO(priyag): Add the following types:
+  * `MIN`: Return the minimum of all values.
+  * `MAX`: Return the maximum of all values.
+  """
+
+  SUM = "SUM"
+  MEAN = "MEAN"
+
+  @staticmethod
+  def from_variable_aggregation(aggregation):
+    mapping = {
+        variable_scope.VariableAggregation.SUM: ReduceOp.SUM,
+        variable_scope.VariableAggregation.MEAN: ReduceOp.MEAN,
+    }
+
+    reduce_op = mapping.get(aggregation)
+    if not reduce_op:
+      raise ValueError("Could not convert from `tf.VariableAggregation` %s to"
+                       "`tf.distribute.ReduceOp` type" % aggregation)
+    return reduce_op
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator.py b/tensorflow/python/distribute/shared_variable_creator.py
similarity index 100%
rename from tensorflow/contrib/distribute/python/shared_variable_creator.py
rename to tensorflow/python/distribute/shared_variable_creator.py
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py b/tensorflow/python/distribute/shared_variable_creator_test.py
similarity index 97%
rename from tensorflow/contrib/distribute/python/shared_variable_creator_test.py
rename to tensorflow/python/distribute/shared_variable_creator_test.py
index 2a9ab51fcfd29a8ae5b37b5c513415af29b277dc..4ddc29f256761c2359f0a49415932b53eda066f4 100644
--- a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
+++ b/tensorflow/python/distribute/shared_variable_creator_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import shared_variable_creator
+from tensorflow.python.distribute import shared_variable_creator
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variable_scope
diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py
new file mode 100644
index 0000000000000000000000000000000000000000..01a1680a246b9beb34c4c5c1b6b3dfe6494c33f3
--- /dev/null
+++ b/tensorflow/python/distribute/values.py
@@ -0,0 +1,1892 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various classes representing distributed values.
+
+See go/tf-distribution-strategy.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import contextlib
+import operator
+import weakref
+import six
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import input_ops
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_resource_variable_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.util import nest
+
+
+# pylint: disable=line-too-long
+# TODO(josh11b): Should device values be strings or DeviceSpec objects?
+# Not sure DeviceSpec objects are usable as a dict key.
+class DistributedValues(object):
+  """Holds a map from device to values. Either PerReplica or Mirrored."""
+
+  def __init__(self, index):
+    self._index = {device_util.canonicalize(key): value
+                   for key, value in six.iteritems(index)}
+
+  def get(self, device=None):
+    """Returns the value for the current device or raises a ValueError."""
+    if device is None:
+      replica_context = distribution_strategy_context.get_replica_context()
+      if replica_context:
+        # TODO(josh11b): support model parallelism better here
+        device = replica_context.devices[0]
+      else:
+        device = distribute_lib.get_update_device()
+        if device is None:
+          return self._get_cross_replica()
+    device = device_util.canonicalize(device)
+    try:
+      return self._index[device]
+    except KeyError as e:
+      six.raise_from(
+          ValueError("Device %s not found in %s (current device %s)" %
+                     (device, self._index.keys(), device_util.current())), e)
+
+  @property
+  def devices(self):
+    return list(self._index.keys())
+
+  @property
+  def is_tensor_like(self):
+    for v in self._index.values():
+      if not tensor_util.is_tensor(v):
+        return False
+    return True
+
+  def __str__(self):
+    return "%s:%s" % (self.__class__.__name__, self._index)
+
+  def __repr__(self):
+    return "%s(%r)" % (self.__class__.__name__, self._index)
+
+  # TODO(josh11b): Possibly make an accessor for _index for use by
+  # DistributionStrategy implementations.
+
+
+# NOTE(josh11b,apassos): It would be great if we could inspect the values this was
+# initialized with and use that to generate the overloaded operators here.
+# Unfortunately, Python's rules for special methods don't allow this, see
+# https://docs.python.org/3/reference/datamodel.html#special-method-names
+# "if a class defines a method named __getitem__(), and x is an instance of
+# this class, then x[i] is roughly equivalent to type(x).__getitem__(x, i)."
+# In particular, these special methods don't go through __getattr__, and
+# it will only use those methods if they are defined in the class, not the
+# object.
+class DistributedDelegate(DistributedValues):
+  """A map from device to values; acts as the same type as the values."""
+
+  def __getattr__(self, name):
+    # TODO(priyag): This needs to be made robust against pitfalls from mix use
+    # __getattr__ and @property. See b/120402273.
+    return getattr(self.get(), name)
+
+  # pylint: disable=multiple-statements
+  def __add__(self, o): return self.get() + o
+  def __radd__(self, o): return o + self.get()
+  def __sub__(self, o): return self.get() - o
+  def __rsub__(self, o): return o - self.get()
+  def __mul__(self, o): return self.get() * o
+  def __rmul__(self, o): return o * self.get()
+  def __truediv__(self, o): return self.get() / o
+  def __rtruediv__(self, o): return o / self.get()
+  def __floordiv__(self, o): return self.get() // o
+  def __rfloordiv__(self, o): return o // self.get()
+  def __mod__(self, o): return self.get() % o
+  def __rmod__(self, o): return o % self.get()
+  def __lt__(self, o): return self.get() < o
+  def __le__(self, o): return self.get() <= o
+  def __gt__(self, o): return self.get() > o
+  def __ge__(self, o): return self.get() >= o
+  def __and__(self, o): return self.get() & o
+  def __rand__(self, o): return o & self.get()
+  def __or__(self, o): return self.get() | o
+  def __ror__(self, o): return o | self.get()
+  def __xor__(self, o): return self.get() ^ o
+  def __rxor__(self, o): return o ^ self.get()
+  def __getitem__(self, o): return self.get()[o]
+  def __pow__(self, o, modulo=None): return pow(self.get(), o, modulo)
+  def __rpow__(self, o): return pow(o, self.get())
+  def __invert__(self): return ~self.get()
+  def __neg__(self): return -self.get()
+  def __abs__(self): return abs(self.get())
+
+  def __div__(self, o):
+    try:
+      return self.get().__div__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rdiv__(self, o):
+    try:
+      return self.get().__rdiv__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __matmul__(self, o):
+    try:
+      return self.get().__matmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rmatmul__(self, o):
+    try:
+      return self.get().__rmatmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  # TODO(josh11b): Even more operator overloads.
+
+
+class PerReplica(DistributedValues):
+  """Holds a map from device to unsynchronized values."""
+  pass
+
+
+# Note that unlike PerReplica, Mirrored values inherit from
+# DistributedDelegate and so can be used directly in cross-replica mode.
+class Mirrored(DistributedDelegate):
+  """Holds a map from device to values which are kept in sync."""
+
+  def _get_cross_replica(self):
+    device = device_util.canonicalize(device_util.current())
+    if device in self._index:
+      return self._index[device]
+    return list(self._index.values())[0]
+
+  def _as_graph_element(self):
+    obj = self.get()
+    conv_fn = getattr(obj, "_as_graph_element", None)
+    if conv_fn and callable(conv_fn):
+      return conv_fn()
+    return obj
+
+
+def _assign_on_device(device, variable, tensor):
+  with ops.device(device):
+    return variable.assign(array_ops.identity(tensor))
+
+
+DistributedVarOp = collections.namedtuple(
+    "DistributedVarOp", ["name", "graph", "type"])
+
+
+class DistributedVariable(DistributedDelegate):
+  """Holds a map from device to variables."""
+  # TODO(josh11b): Support changing the set of variables if e.g. if new
+  # devices are joining or a device is to leave.
+
+  def __init__(self, index):
+    # Child class must set self._primary_var before calling
+    # super(...).__init__(index).
+    self._common_name = self._primary_var.name.split(":")[0]
+    # Use a weakref to make it easy to map from the contained values
+    # to the container without introducing a reference cycle.
+    for v in six.itervalues(index):
+      v._distributed_container = weakref.ref(self)  # pylint: disable=protected-access
+    # tf.keras keeps track of variables initialized using this attribute. When
+    # tf.keras gets the default session, it initializes all uninitialized vars.
+    # We need to make _keras_initialized a member of DistributedVariable because
+    # without this it will use `__getattr__` which will delegate to a component
+    # variable.
+    self._keras_initialized = False
+    # Typically, a `DistributedVariable`'s initializer is composed of the
+    # initializers of the components variables. However, in some cases, such as
+    # when restoring from a checkpoint, we may set the _initializer_op
+    # property on the entire `DistributedVariable`.
+    self._initializer_op = None
+    super(DistributedVariable, self).__init__(index)
+
+  def is_initialized(self, name=None):
+    """Identifies if all the component variables are initialized.
+
+    Args:
+      name: Name of the final `logical_and` op.
+
+    Returns:
+      The op that evaluates to True or False depending on if all the
+      component variables are initialized.
+    """
+    # We have to cast the self._index.values() to a `list` because when we
+    # use `model_to_estimator` to run tf.keras models, self._index.values() is
+    # of type `dict_values` and not `list`.
+    values_list = list(self._index.values())
+    result = values_list[0].is_initialized()
+    # We iterate through the list of values except the last one to allow us to
+    # name the final `logical_and` op the same name that is passed by the user
+    # to the `is_initialized` op. For distributed variables, the
+    # `is_initialized` op is a `logical_and` op.
+    for v in values_list[1:-1]:
+      result = math_ops.logical_and(result, v.is_initialized())
+    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
+                                  name=name)
+    return result
+
+  @property
+  def initializer(self):
+    if self._initializer_op:
+      init_op = self._initializer_op
+    else:
+      # return grouped ops of all the var initializations of component values of
+      # the mirrored variable
+      init_op = control_flow_ops.group(
+          [v.initializer for v in self._index.values()])
+    return init_op
+
+  @property
+  def graph(self):
+    return self._primary_var.graph
+
+  @property
+  def _shared_name(self):
+    return self._common_name
+
+  @property
+  def _unique_id(self):
+    return self._primary_var._unique_id   # pylint: disable=protected-access
+
+  @property
+  def name(self):
+    return self._primary_var.name
+
+  @property
+  def dtype(self):
+    return self._primary_var.dtype
+
+  @property
+  def shape(self):
+    return self._primary_var.shape
+
+  def get_shape(self):
+    return self._primary_var.get_shape()
+
+  def to_proto(self, export_scope=None):
+    return self._primary_var.to_proto(export_scope=export_scope)
+
+  @property
+  def op(self):
+    # We want cross-replica code that does some var.op.X calls
+    # to work (even if the current device isn't in self.devices), but
+    # other uses of var.op in a cross-replica context to fail.
+    if distribution_strategy_context.get_cross_replica_context():
+      return DistributedVarOp(self._primary_var.op.name,
+                              self._primary_var.op.graph,
+                              self._primary_var.op.type)
+    return self.get().op
+
+  @property
+  def _in_graph_mode(self):
+    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
+
+  def read_value(self):
+    return distribution_strategy_context.get_distribution_strategy().read_var(
+        self)
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+
+ops.register_dense_tensor_like_type(DistributedVariable)
+
+
+def _apply_aggregation(strategy, value, aggregation, destinations):
+  if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
+    return strategy.broadcast(strategy.unwrap(value)[0],
+                              destinations=destinations)
+  reduce_op = reduce_util.ReduceOp.from_variable_aggregation(aggregation)
+  return strategy.extended.reduce_to(reduce_op, value, destinations)
+
+
+class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
+  """Class for defining how to restore a MirroredVariable."""
+
+  def __init__(self, mirrored_variable, primary_variable, name):
+    self._mirrored_variable = mirrored_variable
+    super(_MirroredSaveable, self).__init__(primary_variable, "", name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into all variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group([
+        _assign_on_device(d, v, tensor)
+        for d, v in six.iteritems(self._mirrored_variable._index)])  # pylint: disable=protected-access
+
+
+class MirroredVariable(DistributedVariable, Mirrored,
+                       checkpointable.CheckpointableBase):
+  """Holds a map from device to variables whose values are kept in sync."""
+
+  def __init__(self, index, primary_var, aggregation):
+    self._primary_var = primary_var
+    self._aggregation = aggregation
+    super(MirroredVariable, self).__init__(index)
+
+  # The arguments to update() are automatically unwrapped so the update()
+  # function would normally see regular variables, not MirroredVariables.
+  # However, the update function can still operate on wrapped MirroredVariables
+  # through object members, captured arguments, etc. This is more likely in an
+  # update_non_slot() function (like OptimizerV2._finish), which can
+  # update several non-slot variables in one call.
+  def _assign_func(self, *args, **kwargs):
+    f = kwargs.pop("f")
+    if distribution_strategy_context.get_cross_replica_context():
+      update_device = distribute_lib.get_update_device()
+      if update_device is not None:
+        # We are calling an assign function on the mirrored variable in an
+        # update context.
+        v = self.get(device=update_device)
+        return f(v, *args, **kwargs)
+
+      # We are calling assign on the mirrored variable in cross replica context,
+      # use update to update the variable.
+      strategy = distribution_strategy_context.get_distribution_strategy()
+      return strategy.update(self, f, *args, **kwargs)
+    else:
+      _assert_replica_context()
+      # We are calling an assign function on the mirrored variable in replica
+      # context.
+      # We reduce the value we want to assign/add/sub. More details about how we
+      # handle the different use cases can be found in the _reduce method.
+      # We call the function on each of the mirrored variables with the reduced
+      # value.
+      if self._aggregation == vs.VariableAggregation.NONE:
+        raise ValueError("You must specify an aggregation method to update a "
+                         "MirroredVariable in Replica Context.")
+
+      def merge_fn(strategy, value, *other_args, **other_kwargs):
+        v = _apply_aggregation(strategy, value, self._aggregation, self)
+        return strategy.update(self, f, v, *other_args, **other_kwargs)
+
+      return distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=args, kwargs=kwargs)
+
+  def assign_sub(self, *args, **kwargs):
+    assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
+    return self._assign_func(f=assign_sub_fn, *args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    assign_add_fn = lambda var, *a, **kw: var.assign_add(*a, **kw)
+    return self._assign_func(f=assign_add_fn, *args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    assign_fn = lambda var, *a, **kw: var.assign(*a, **kw)
+    return self._assign_func(f=assign_fn, *args, **kwargs)
+
+  @property
+  def aggregation(self):
+    return self._aggregation
+
+  def _get_cross_replica(self):
+    device = device_util.canonicalize(device_util.current())
+    if device in self._index:
+      return array_ops.identity(self._index[device])
+    return array_ops.identity(self._primary_var)
+
+  def _as_graph_element(self):
+    # pylint: disable=protected-access
+    if distribution_strategy_context.get_cross_replica_context():
+      return self._primary_var._as_graph_element()
+    return self.get()._as_graph_element()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    MirroredVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _MirroredSaveable(self, self._primary_var, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+def _tensor_conversion_mirrored(var, dtype=None, name=None, as_ref=False):
+  # Try to avoid assignments to and other mutations of MirroredVariable
+  # state except through a DistributionStrategy.update() call.
+  assert not as_ref
+  return ops.internal_convert_to_tensor(
+      var.get(), dtype=dtype, name=name, as_ref=as_ref)
+
+
+ops.register_tensor_conversion_function(MirroredVariable,
+                                        _tensor_conversion_mirrored)
+
+
+def _enclosing_tpu_context():
+  # pylint: disable=protected-access
+  tpu_context = ops.get_default_graph()._get_control_flow_context()
+  # pylint: enable=protected-access
+  while tpu_context is not None and not isinstance(
+      tpu_context, control_flow_ops.XLAControlFlowContext):
+    tpu_context = tpu_context.outer_context
+  return tpu_context
+
+
+# TODO(jhseu): Deduplicate code. We copy code because we don't want to
+# inherit from DistributedDelegate. DistributedDelegate will not work in a
+# tpu.replicate() because it assumes that you're in a device context where you
+# can operate on a single version of the variable, but a tpu.replicate()
+# operates on all variables and is replicated during a rewrite pass.
+class TPUMirroredVariable(checkpointable.CheckpointableBase):
+  """Holds a map from device to TPU variables whose values are kept in sync."""
+
+  def __init__(self, index, primary_var, aggregation):
+    # Use a weakref to make it easy to map from the contained values
+    # to the container without introducing a reference cycle.
+    for v in six.itervalues(index):
+      v._mirrored_container = weakref.ref(self)  # pylint: disable=protected-access
+    self._index = {device_util.canonicalize(key): value
+                   for key, value in six.iteritems(index)}
+    self._primary_var = primary_var
+    self._common_name = self._primary_var.name.split(":")[0]
+    self._aggregation = aggregation
+    # Needed for GradientTape
+    self._trainable = self._primary_var.trainable
+    # Typically like `DistributedVariable`, a `TPUMirroredVariable`'s
+    # initializer is composed of the initializers of the components variables.
+    # However, in some cases, such as when restoring from a checkpoint, we may
+    # set the _initializer_op property on the entire `TPUMirroredVariable`.
+    self._initializer_op = None
+
+  def _get(self, device=None):
+    """Returns the value for the current device or raises a ValueError."""
+    if device is None:
+      replica_context = distribution_strategy_context.get_replica_context()
+      if replica_context:
+        # TODO(josh11b): support model parallelism better here
+        device = replica_context.devices[0]
+      else:
+        device = distribute_lib.get_update_device()
+        if device is None:
+          return self._get_cross_replica()
+    device = device_util.canonicalize(device)
+    try:
+      return self._index[device]
+    except KeyError as e:
+      six.raise_from(
+          ValueError("Device %s not found in %s (current device %s)" %
+                     (device, self._index.keys(), device_util.current())), e)
+
+  # pylint: disable=multiple-statements
+  def __add__(self, o): return self.read_value() + o
+  def __radd__(self, o): return o + self.read_value()
+  def __sub__(self, o): return self.read_value() - o
+  def __rsub__(self, o): return o - self.read_value()
+  def __mul__(self, o): return self.read_value() * o
+  def __rmul__(self, o): return o * self.read_value()
+  def __truediv__(self, o): return self.read_value() / o
+  def __rtruediv__(self, o): return o / self.read_value()
+  def __floordiv__(self, o): return self.read_value() // o
+  def __rfloordiv__(self, o): return o // self.read_value()
+  def __mod__(self, o): return self.read_value() % o
+  def __rmod__(self, o): return o % self.read_value()
+  def __lt__(self, o): return self.read_value() < o
+  def __le__(self, o): return self.read_value() <= o
+  def __gt__(self, o): return self.read_value() > o
+  def __ge__(self, o): return self.read_value() >= o
+  def __and__(self, o): return self.read_value() & o
+  def __rand__(self, o): return o & self.read_value()
+  def __or__(self, o): return self.read_value() | o
+  def __ror__(self, o): return o | self.read_value()
+  def __xor__(self, o): return self.read_value() ^ o
+  def __rxor__(self, o): return o ^ self.read_value()
+  def __getitem__(self, o): return self.read_value()[o]
+  def __pow__(self, o, modulo=None): return pow(self.read_value(), o, modulo)
+  def __rpow__(self, o): return pow(o, self.read_value())
+  def __invert__(self): return ~self.read_value()
+  def __neg__(self): return -self.read_value()
+  def __abs__(self): return abs(self.read_value())
+
+  def __div__(self, o):
+    try:
+      return self.read_value().__div__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rdiv__(self, o):
+    try:
+      return self.read_value().__rdiv__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __matmul__(self, o):
+    try:
+      return self.read_value().__matmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rmatmul__(self, o):
+    try:
+      return self.read_value().__rmatmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  @property
+  def handle(self):
+    # If we're in a tpu.rewrite(), return the replicated handle.
+    tpu_context = _enclosing_tpu_context()
+    if tpu_context is not None:
+      return tpu_context.get_replicated_var_handle(
+          self._common_name, nest.flatten(self._index))
+
+    device = distribute_lib.get_update_device()
+    if device is None:
+      return self._primary_var.handle
+    device = device_util.canonicalize(device)
+    try:
+      return self._index[device].handle
+    except KeyError as e:
+      six.raise_from(
+          ValueError("Device %s not found in %s (current device %s)" %
+                     (device, self._index.keys(), device_util.current())), e)
+
+  @property
+  def device(self):
+    return self._get().device
+
+  # The arguments to update() are automatically unwrapped so the update()
+  # function would normally see regular variables, not MirroredVariables.
+  # However, the update function can still operate on wrapped MirroredVariables
+  # through object members, captured arguments, etc. This is more likely in an
+  # update_non_slot() function (like OptimizerV2._finish), which can
+  # update several non-slot variables in one call.
+  def _assign_func(self, *args, **kwargs):
+    strategy = distribution_strategy_context.get_distribution_strategy()
+    if strategy.__class__.__name__ != "TPUStrategy":
+      raise ValueError("You may only assign to a TPUMirroredVariable within a "
+                       "TPUStrategy.")
+    f = kwargs.pop("f")
+    if distribution_strategy_context.get_cross_replica_context():
+      if _enclosing_tpu_context() is not None:
+        return distribution_strategy_context.get_distribution_strategy().update(
+            self, f, *args, **kwargs)
+
+      update_device = distribute_lib.get_update_device()
+      # We are calling update on the mirrored variable in cross replica context.
+      if update_device is not None:
+        # We are calling an assign function on the mirrored variable in cross
+        # replica context.
+        v = self._get(device=update_device)
+        return f(v, *args, **kwargs)
+
+      return distribution_strategy_context.get_distribution_strategy().update(
+          self, f, *args, **kwargs)
+    else:
+      _assert_replica_context()
+      # We are calling an assign function on the mirrored variable in replica
+      # context.
+      # We reduce the value we want to assign/add/sub. More details about how we
+      # handle the different use cases can be found in the _reduce method.
+      # We call the function on each of the mirrored variables with the reduced
+      # value.
+      if self._aggregation == vs.VariableAggregation.NONE:
+        raise ValueError("You must specify an aggregation method to update a "
+                         "TPUMirroredVariable in Replica Context.")
+
+      def merge_fn(strategy, value, *other_args, **other_kwargs):
+        v = _apply_aggregation(strategy, value, self._aggregation, self)
+        return strategy.update(self, f, v, *other_args, **other_kwargs)
+
+      return distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=args, kwargs=kwargs)
+
+  @contextlib.contextmanager
+  def _handle_graph(self, handle):
+    # Note: might have an eager tensor but not be executing eagerly when
+    # building functions.
+    if (context.executing_eagerly() or isinstance(handle, ops.EagerTensor)
+        or ops.has_default_graph()):
+      yield
+    else:
+      with handle.graph.as_default():
+        yield
+
+  @property
+  def trainable(self):
+    return self._trainable
+
+  def _read_variable_op(self, parent_op=None):
+    if self.trainable:
+      tape.variable_accessed(self)
+    if parent_op is not None:
+      with ops.control_dependencies([parent_op]):
+        return gen_resource_variable_ops.read_variable_op(
+            self.handle, self.dtype)
+
+    return gen_resource_variable_ops.read_variable_op(
+        self.handle, self.dtype)
+
+  def read_value(self):
+    return self._read_variable_op()
+
+  def assign_sub(self, *args, **kwargs):
+    def assign_sub_fn(var, delta, **kw):
+      name = kw.pop("name", None)
+      read_value = kw.pop("read_value", True)
+      with self._handle_graph(var.handle):
+        op = gen_resource_variable_ops.assign_sub_variable_op(
+            var.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
+            name=name)
+      if read_value:
+        return self._read_variable_op(parent_op=op)
+      return op
+
+    return self._assign_func(f=assign_sub_fn, *args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    def assign_add_fn(var, delta, **kw):
+      name = kw.pop("name", None)
+      read_value = kw.pop("read_value", True)
+      with self._handle_graph(var.handle):
+        op = gen_resource_variable_ops.assign_add_variable_op(
+            var.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
+            name=name)
+      if read_value:
+        return self._read_variable_op(parent_op=op)
+      return op
+
+    return self._assign_func(f=assign_add_fn, *args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    def assign_fn(var, value, **kw):
+      name = kw.pop("name", None)
+      read_value = kw.pop("read_value", True)
+      with self._handle_graph(var.handle):
+        op = gen_resource_variable_ops.assign_variable_op(
+            var.handle, ops.convert_to_tensor(value, dtype=self.dtype),
+            name=name)
+      if read_value:
+        return self._read_variable_op(parent_op=op)
+      return op
+
+    return self._assign_func(f=assign_fn, *args, **kwargs)
+
+  @property
+  def aggregation(self):
+    return self._aggregation
+
+  @property
+  def constraint(self):
+    return None
+
+  @property
+  def initializer(self):
+    if self._initializer_op:
+      init_op = self._initializer_op
+    else:
+      init_op = control_flow_ops.group(
+          [v.initializer for v in self._index.values()])
+    return init_op
+
+  @property
+  def graph(self):
+    return self._primary_var.graph
+
+  @property
+  def _shared_name(self):
+    return self._common_name
+
+  @property
+  def _unique_id(self):
+    return self._primary_var._unique_id  # pylint: disable=protected-access
+
+  @property
+  def name(self):
+    return self._primary_var.name
+
+  @property
+  def dtype(self):
+    return self._primary_var.dtype
+
+  @property
+  def shape(self):
+    return self._primary_var.shape
+
+  def get_shape(self):
+    return self._primary_var.get_shape()
+
+  def to_proto(self, export_scope=None):
+    return self._primary_var.to_proto(export_scope=export_scope)
+
+  def _get_cross_replica(self):
+    device = device_util.canonicalize(device_util.current())
+    if device in self._index:
+      return self._index[device]
+    return self._primary_var
+
+  def _as_graph_element(self):
+    # pylint: disable=protected-access
+    if distribution_strategy_context.get_cross_replica_context():
+      return self._primary_var._as_graph_element()
+    return self._read_variable_op()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    MirroredVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _MirroredSaveable(self, self._primary_var, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+  # Needed to pass ResourceVariable checks.
+  @property
+  def op(self):
+    return self._primary_var.op
+
+  # pylint: disable=protected-access
+  @property
+  def _save_slice_info(self):
+    return self._primary_var._save_slice_info
+
+  def _get_save_slice_info(self):
+    return self._primary_var._get_save_slice_info()
+
+  def _set_save_slice_info(self, save_slice_info):
+    return self._primary_var._set_save_slice_info(save_slice_info)
+  # pylint: enable=protected-access
+
+  @property
+  def _in_graph_mode(self):
+    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
+
+  def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
+    """Converts a variable to a tensor."""
+    # pylint: disable=protected-access
+    if _enclosing_tpu_context() is None:
+      return self._get()._dense_var_to_tensor(dtype, name, as_ref)
+    # pylint: enable=protected-access
+    if dtype is not None and dtype != self.dtype:
+      return math_ops.cast(self.read_value(), dtype)
+    if as_ref:
+      return self.handle
+    else:
+      return self.read_value()
+
+  def is_initialized(self, name=None):
+    """Identifies if all the component variables are initialized.
+
+    Args:
+      name: Name of the final `logical_and` op.
+
+    Returns:
+      The op that evaluates to True or False depending on if all the
+      component variables are initialized.
+    """
+    # TODO(jhseu): Do we need TPU context implementation?
+
+    # We have to cast the self._index.values() to a `list` because when we
+    # use `model_to_estimator` to run tf.keras models, self._index.values() is
+    # of type `dict_values` and not `list`.
+    values_list = nest.flatten(self._index)
+    result = values_list[0].is_initialized()
+    # We iterate through the list of values except the last one to allow us to
+    # name the final `logical_and` op the same name that is passed by the user
+    # to the `is_initialized` op. For distributed variables, the
+    # `is_initialized` op is a `logical_and` op.
+    for v in values_list[1:-1]:
+      result = math_ops.logical_and(result, v.is_initialized())
+    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
+                                  name=name)
+    return result
+
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+def _tensor_conversion_tpu_mirrored(var, dtype=None, name=None, as_ref=False):
+  return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
+
+
+ops.register_tensor_conversion_function(TPUMirroredVariable,
+                                        _tensor_conversion_tpu_mirrored)
+ops.register_dense_tensor_like_type(TPUMirroredVariable)
+
+
+class _ReplicaLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Class for defining how to restore a ReplicaLocalVariable."""
+
+  def __init__(self, replica_local_variable, name):
+    self._replica_local_variable = replica_local_variable
+    # We use a callable so that we don't have to evaluate this expression
+    # in the case where we are trying to restore instead of save.
+    def tensor():
+      return distribution_strategy_context.get_distribution_strategy().read_var(
+          replica_local_variable)
+    spec = saver.BaseSaverBuilder.SaveSpec(
+        tensor=tensor,
+        slice_spec="",
+        name=name,
+        dtype=replica_local_variable.dtype)
+    super(_ReplicaLocalSaveable, self).__init__(tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into all variables."""
+    tensor, = restored_tensors
+    return self._replica_local_variable.assign(tensor)
+
+
+def _assert_replica_context():
+  if not distribution_strategy_context.get_replica_context():
+    raise RuntimeError(
+        "Replica-local variables may only be assigned in a replica context.")
+
+
+class ReplicaLocalVariable(DistributedVariable, PerReplica,
+                           checkpointable.CheckpointableBase):
+  """Holds a map from device to variables whose values are reduced on save."""
+
+  def __init__(self, index, primary_var, aggregation):
+    self._primary_var = primary_var
+    self._aggregation = aggregation
+    super(ReplicaLocalVariable, self).__init__(index)
+
+  def assign_sub(self, *args, **kwargs):
+    _assert_replica_context()
+    return self.get().assign_sub(*args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    _assert_replica_context()
+    return self.get().assign_add(*args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    if distribution_strategy_context.get_cross_replica_context():
+      # To preserve the sum across save and restore, we have to divide the
+      # total across all devices when restoring a variable that was summed
+      # when saving.
+      tensor = args[0]
+      if self._aggregation == vs.VariableAggregation.SUM:
+        tensor *= 1. / len(self.devices)
+      return control_flow_ops.group(
+          [_assign_on_device(d, v, tensor)
+           for d, v in six.iteritems(self._index)])
+    else:
+      _assert_replica_context()
+      return self.get().assign(*args, **kwargs)
+
+  @property
+  def aggregation(self):
+    return self._aggregation
+
+  def _get_cross_replica(self):
+    if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
+      return self._primary_var
+    all_components = tuple(self._index.values())
+    # TODO(josh11b): Use a strategy-specific method.
+    total = math_ops.add_n(all_components)
+    if self._aggregation == vs.VariableAggregation.MEAN:
+      return total * (1./ len(all_components))
+    return total
+
+  def _as_graph_element(self):
+    # pylint: disable=protected-access
+    if distribution_strategy_context.get_cross_replica_context():
+      return self._get_cross_replica()
+    return self.get()._as_graph_element()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    ReplicaLocalVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _ReplicaLocalSaveable(self, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+
+# Register a conversion function for ReplicaLocalVariable which allows as_ref to
+# be true.
+def _tensor_conversion_replica_local(var, dtype=None, name=None, as_ref=False):
+  return ops.internal_convert_to_tensor(
+      var.get(), dtype=dtype, name=name, as_ref=as_ref)
+
+
+ops.register_tensor_conversion_function(ReplicaLocalVariable,
+                                        _tensor_conversion_replica_local)
+
+
+def _devices_match(d1, d2):
+  return device_util.canonicalize(d1) == device_util.canonicalize(d2)
+
+
+def regroup(per_replica, wrap_class=PerReplica):
+  """Makes device->nest map into a nest of PerReplica/Mirrored values."""
+  items = list(per_replica.items())
+  assert items
+  v0 = items[0][1]  # First value
+
+  if isinstance(v0, list):
+    for _, v in items[1:]:
+      assert isinstance(v, list)
+      assert len(v) == len(v0), ("len(v) == %d, len(v0) == %d, v: %s, v0: %s" %
+                                 (len(v), len(v0), v, v0))
+    return [regroup({k: v[i] for k, v in items}, wrap_class)
+            for i in range(len(v0))]
+
+  if isinstance(v0, tuple):
+    for _, v in items[1:]:
+      assert isinstance(v, tuple)
+      assert len(v) == len(v0)
+    regrouped_tuple = tuple(regroup({k: v[i] for k, v in items}, wrap_class)
+                            for i in range(len(v0)))
+    if hasattr(v0, "_fields"):
+      # This tuple is in fact a namedtuple! Create a new namedtuple instance
+      # and initialize it with the regrouped values:
+      assert hasattr(type(v0), "_make")
+      return type(v0)._make(regrouped_tuple)
+    else:
+      return regrouped_tuple
+
+  if isinstance(v0, dict):
+    v0keys = set(v0.keys())
+    for _, v in items[1:]:
+      assert isinstance(v, dict)
+      assert set(v.keys()) == v0keys
+    return {key: regroup({k: v[key] for k, v in items}, wrap_class)
+            for key in v0keys}
+
+  # If exactly the same object across all devices, return it unwrapped.
+  same_id = True
+  for _, v in items[1:]:
+    if v is not v0:
+      same_id = False
+      break
+  # Consider three cases where same_id is true:
+  # * If v0 is a DistributedVariable (a MirroredVariable or
+  #   ReplicaLocalVariable, and same_id means it is the same across all
+  #   devices), we want to return it. We check DistributedVariable
+  #   specifically since it can look like it has a
+  #   _distributed_container member since its members do.
+  # * If v0 is a member of a distributed variable, in which case
+  #   hasattr(v0, "_distributed_container") is true, we want to
+  #   return the DistributedVariable that contains it using the
+  #   _distributed_container logic below. This case can trigger
+  #   same_id when there is only one device.
+  # * In any other situation, same_id means we return v0.
+  if same_id and (isinstance(v0, DistributedVariable) or
+                  not hasattr(v0, "_distributed_container")):
+    return v0
+
+  # Detect the case where each device has a parallel component of the
+  # same MirroredVariable (or ReplicaLocalVariable). In this case we
+  # want to return the containing MirroredVariable, after a bunch of
+  # sanity checking. In particular, each component should have the
+  # same container, and the devices of the variables should match the
+  # keys of the per-replica dictionary.
+  if hasattr(v0, "_distributed_container"):
+    # pylint: disable=protected-access
+    assert not isinstance(v0, MirroredVariable), (
+        "ids = %s, items = %s" % ([id(v[1]) for v in items], items))
+    assert _devices_match(v0.device, items[0][0]), (
+        "v0.device = %s, items = %s" % (v0.device, items))
+    distributed_container = v0._distributed_container()
+    assert distributed_container is not None
+    for d, v in items[1:]:
+      assert _devices_match(v.device, d), (
+          "v.device = %s, d = %s, items = %s" % (v.device, d, items))
+      assert distributed_container is v._distributed_container()
+    return distributed_container
+  # pylint: enable=protected-access
+
+  return wrap_class(per_replica)
+
+
+def select_device(device, structured):
+  """Specialize a nest of regular & per-replica values for one device."""
+  def _get(x):
+    return x.get(device) if isinstance(x, DistributedValues) else x
+
+  return nest.map_structure(_get, structured)
+
+
+def select_device_mirrored(device, structured):
+  """Specialize a nest of regular & mirrored values for one device."""
+  def _get_mirrored(x):
+    if isinstance(x, DistributedValues):
+      if not isinstance(x, Mirrored):
+        raise TypeError(
+            "Expected value to be mirrored across replicas: %s in %s." %
+            (x, structured))
+      return x.get(device)
+    else:
+      return x
+
+  return nest.map_structure(_get_mirrored, structured)
+
+
+def update_regroup(extended, updates, group):
+  """Regroup for an update, with dependencies to ensure all updates execute."""
+  regrouped = regroup(updates, Mirrored)
+  if not group:
+    return nest.map_structure(extended._unwrap, regrouped)  # pylint: disable=protected-access
+  grouped_flat = []
+  for u in nest.flatten(regrouped):
+    if isinstance(u, DistributedValues):
+      g = extended._group(u)  # pylint: disable=protected-access
+      if u.is_tensor_like:
+        # Make sure we run all updates. Without this, something like
+        # session.run(extended.update(...)) may only update one replica.
+        index = {}
+        for d in u.devices:
+          with ops.device(d), ops.control_dependencies([g]):
+            index[d] = array_ops.identity(u.get(d))
+        g = Mirrored(index)
+    else:
+      g = u
+    grouped_flat.append(g)
+  return nest.pack_sequence_as(regrouped, grouped_flat)
+
+
+class PerReplicaDataIterator(object):
+  """An iterator (like `tf.data.Iterator`) into a `PerReplicaDataset`."""
+
+  def __init__(self, iterator, devices, prefetch_on_device=None):
+    self._iterator = iterator
+    self._devices = devices
+    self._prefetch_on_device = prefetch_on_device
+
+  @property
+  def initializer(self):
+    return self._iterator.initializer
+
+  def get_next(self, name=None):
+    """Scatter the input across devices."""
+    if self._prefetch_on_device:
+      data_list = self._iterator.get_next()
+      index = dict(zip(self._devices, data_list))
+    else:
+      batch = self._iterator.get_next(name=name)
+      index = {}
+      def get_ith(i):
+        return lambda x: x[i]
+
+      for i, d in enumerate(self._devices):
+        index[d] = nest.map_structure(get_ith(i), batch)
+        if context.executing_eagerly():
+          with ops.device(d):
+            index[d] = nest.map_structure(array_ops.identity, index[d])
+
+    return regroup(index)
+
+  @property
+  def output_classes(self):
+    return self._iterator.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._iterator.output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterator.output_types
+
+
+class PerReplicaDataset(object):
+  """Like `tf.data.Dataset` split devices, producing `PerReplica` data."""
+
+  def __init__(self, dataset, devices, prefetch_on_device=None):
+    self._devices = devices
+
+    # Default to using prefetching in graph mode, unless specified.
+    # TODO(rohanj): Enable prefetching in eager mode.
+    self._prefetch_on_device = prefetch_on_device
+    if self._prefetch_on_device is None:
+      self._prefetch_on_device = not context.executing_eagerly()
+    assert not (self._prefetch_on_device and context.executing_eagerly()), (
+        "Prefetching is only supported in graph mode currently")
+
+    self._dataset = dataset
+    if not self._prefetch_on_device:
+      # TODO(priyag): If dropping remainder is not appropriate, find another
+      # approach to distributing the dataset when not possible to divide evenly.
+      # Possibly not an issue when we start using PartitionedDataset.
+      self._dataset = dataset.batch(len(devices), drop_remainder=True)
+
+  def make_one_shot_iterator(self):
+    """Get a one time use iterator for the distributed PerReplicaDataset."""
+    # Graph mode with one shot iterator is disabled.
+    if not context.executing_eagerly():
+      raise ValueError("Cannot create a one shot iterator. Please use "
+                       "`make_initializable_iterator()` instead.")
+    # Eager mode prefetching would error out in constructor. Only remaining
+    # case is non-prefetching in eager mode. We delegate to
+    # PerReplicaDataIterator to handle that case.
+    dataset_iterator = dataset_ops.make_one_shot_iterator(self._dataset)
+    return PerReplicaDataIterator(
+        dataset_iterator, self._devices, prefetch_on_device=False)
+
+  def make_initializable_iterator(self):
+    """Get an initializable iterator for the distributed PerReplicaDataset."""
+    # Eager mode generates already initialized iterators. Hence we cannot create
+    # an initializable iterator.
+    if context.executing_eagerly():
+      raise ValueError("Cannot create initializable iterator in Eager mode. "
+                       "Please use `make_one_shot_iterator` instead.")
+    if self._prefetch_on_device:
+      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          self._dataset, self._devices)
+    else:
+      dataset_iterator = dataset_ops.make_initializable_iterator(self._dataset)
+    return PerReplicaDataIterator(
+        dataset_iterator,
+        self._devices,
+        prefetch_on_device=self._prefetch_on_device)
+
+
+class MultiWorkerDataIterator(object):
+  """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`."""
+
+  def __init__(self, iterators, worker_device_pairs):
+    """Initialize the MultiWorkerDataIterator object.
+
+    Args:
+      iterators: a list of worker, iterator pairs.
+      worker_device_pairs: a list of (worker's devices, a list of
+        devices that belong to this worker) pairs.
+
+    Raises:
+      ValueError: if iterators and worker_device_pairs are not compatible.
+    """
+    if [d for d, _ in iterators] != [d for d, _ in worker_device_pairs]:
+      raise ValueError("iterators and worker_device_pairs are not compatible.")
+    self._workers = [d for d, _ in iterators]
+    self._iterators = [i for _, i in iterators]
+    self._worker_devices = [l for _, l in worker_device_pairs]
+
+  @property
+  def initializer(self):
+    return control_flow_ops.group(
+        [iterator.initializer for iterator in self._iterators])
+
+  def get_iterator(self, worker):
+    for i, w in enumerate(self._workers):
+      if worker == w:
+        return self._iterators[i]
+    return None
+
+  @property
+  def output_shapes(self):
+    return self._iterators[0].output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterators[0].output_types
+
+  def get_next(self, name=None):
+    """Scatter the input across hosts and devices."""
+    index = {}
+    worker_info = zip(self._workers, self._iterators, self._worker_devices)
+    for worker, iterator, worker_devices in worker_info:
+      if name is not None:
+        d = tf_device.DeviceSpec.from_string(worker)
+        new_name = "%s_%s_%d" % (name, d.job, d.task)
+      else:
+        new_name = None
+      with ops.device(worker):
+        data_per_worker = iterator.get_next(name=new_name)
+
+      # Ungroup these per-replica value so as to get a flat map from devices to
+      # values.
+      for d in worker_devices:
+        v = select_device(d, data_per_worker)
+        if d in index:
+          raise ValueError("Duplicated devices in worker_device_pairs: %r" % v)
+        index[d] = v
+
+    return regroup(index)
+
+
+class MultiWorkerDataset(object):
+  """Like a `tf.data.Dataset` that distributes data to different workers.
+
+  Each worker gets one shard of the input dataset. This currently does not work
+  in eager mode.
+  """
+
+  def __init__(self, dataset_fn, worker_device_pairs, prefetch_on_device=None,
+               auto_shard=False):
+    """Initialize the MultiWorkerDataset object.
+
+    Args:
+      dataset_fn: a function or a list of functions that returns a
+        `tf.data.Dataset`.
+      worker_device_pairs: a list of (worker, list of devices on that worker)
+        pairs; it must have same length with `dataset_fn` if `dataset_fn` is a
+        list.
+      prefetch_on_device: whether to prefetch to devices.
+      auto_shard: whether to auto-shard the dataset.
+    """
+    if isinstance(dataset_fn, list):
+      if len(dataset_fn) != len(worker_device_pairs):
+        raise ValueError("If `dataset_fn` is a list, it must have same length "
+                         "as `worker_device_pairs`")
+      if auto_shard:
+        raise ValueError(
+            "If `dataset_fn` is a list, `auto_shard` is not supported.")
+    self._worker_device_pairs = worker_device_pairs
+    self._datasets = []
+    # TODO(yuefengz, priyag): support different set of jobs for input
+    # processing.
+    for i, (worker, worker_devices) in enumerate(worker_device_pairs):
+      with ops.device(worker):
+        if isinstance(dataset_fn, list):
+          worker_input = dataset_fn[i]()
+        else:
+          worker_input = dataset_fn()
+          if auto_shard:
+            worker_input = input_ops.auto_shard_dataset(
+                worker_input, len(worker_device_pairs), i)
+        dataset = PerReplicaDataset(
+            worker_input, worker_devices, prefetch_on_device=prefetch_on_device)
+        self._datasets.append((worker, dataset))
+
+  def make_one_shot_iterator(self):
+    iterators = []
+    for worker, dataset in self._datasets:
+      with ops.device(worker):
+        iterators.append((worker, dataset_ops.make_one_shot_iterator(dataset)))
+    return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
+
+  def make_initializable_iterator(self):
+    iterators = []
+    for worker, dataset in self._datasets:
+      with ops.device(worker):
+        iterators.append(
+            (worker, dataset_ops.make_initializable_iterator(dataset)))
+    return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
+
+
+class InputIterator(object):
+  """An input iterator, intended to be passed to `DistributionStrategy.run`."""
+
+  def get_next(self):
+    """Returns the next inputs for all replicas."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  def initialize(self):
+    """Initialize the underlying input dataset, when applicable.
+
+    In eager mode, this will create a new iterator and return it.
+    In graph mode, this will initialize the same underlying iterator(s).
+
+    Users are required to call this if
+    - This iterator was returned from a call to `make_input_fn_iterator` with an
+      input function that returns a dataset.
+    - Or this iterator was returned from a call to `make_dataset_iterator`.
+
+    Returns:
+      A list of initialization ops to be executed.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+
+class InputIteratorImpl(InputIterator):
+  """Common implementation for all input iterators."""
+
+  def __init__(self, worker_device_pairs, iterators):
+    if not worker_device_pairs:
+      raise ValueError("Should have at least one worker for input iterator.")
+
+    self._iterators = iterators
+    self._worker_device_pairs = worker_device_pairs
+    self._is_eager = context.executing_eagerly()
+
+  def get_next(self, name=None):
+    """Returns the next input from the iterator for all replicas."""
+    assert self._is_eager == context.executing_eagerly(), (
+        "Iterator should be created and used in same execution mode.")
+
+    index = {}
+    for i, (worker, worker_devices) in enumerate(self._worker_device_pairs):
+      if name is not None:
+        d = tf_device.DeviceSpec.from_string(worker)
+        new_name = "%s_%s_%d" % (name, d.job, d.task)
+      else:
+        new_name = None
+      with ops.device(worker):
+        data_per_worker = self._iterators[i].get_next(new_name)
+
+      # Ungroup these per-replica value so as to get a flat map from devices to
+      # values.
+      for d in worker_devices:
+        v = select_device(d, data_per_worker)
+        if d in index:
+          raise ValueError("Duplicated devices in worker_device_pairs: %r" % v)
+        index[d] = v
+
+    return regroup(index)
+
+  def initialize(self):
+    """Initialze underlying iterators.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    assert self._is_eager == context.executing_eagerly(), (
+        "Iterator should be created and used in same execution mode.")
+
+    init_ops = []
+    for it in self._iterators:
+      init_ops.extend(it.initialize())
+    return init_ops
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_classes(self):
+    return self._iterators[0].output_classes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_shapes(self):
+    return self._iterators[0].output_shapes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_types(self):
+    return self._iterators[0].output_types
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  def get_iterator(self, worker):
+    for i, (w, _) in enumerate(self._worker_device_pairs):
+      if worker == w:
+        return self._iterators[i]
+    return None
+
+
+class InputFunctionIterator(InputIteratorImpl):
+  """Iterator created from input function."""
+
+  def __init__(self, input_fn, worker_device_pairs, input_contexts):
+    """Make an iterator for input provided via an input function.
+
+    Currently implements PER_WORKER mode, in which the `input_fn` is called
+    once on each worker.
+
+    TODO(priyag): Add other replication modes.
+    TODO(priyag): Allow taking input function that returns a callable that
+    returns nest of tensors.
+
+    Args:
+      input_fn: Input function that returns a `tf.data.Dataset` object.
+      worker_device_pairs: A list of (worker, list of devices on that worker)
+        pairs.
+      input_contexts: A list of `InputContext` instances to be passed to call(s)
+        to `input_fn`. Length and order should match worker order in
+        `worker_device_pairs`.
+    """
+    if len(worker_device_pairs) != len(input_contexts):
+      raise ValueError(
+          "Number of worker_device_pairs (%d) is not same as number of"
+          "input_contexts (%d)" % (
+              len(worker_device_pairs), len(input_contexts)))
+
+    iterators = []
+    for (worker, devices), ctx in zip(worker_device_pairs, input_contexts):
+      # TODO(priyag): We should probably explicitly specify CPU device on worker.
+      with ops.device(worker):
+        result = input_fn(ctx)
+        if not isinstance(result, dataset_ops.DatasetV2):
+          raise ValueError("input_fn must return a tf.data.Dataset.")
+        iterator = _SingleWorkerDatasetIterator(result, worker, devices)
+        iterators.append(iterator)
+
+    super(InputFunctionIterator, self).__init__(
+        worker_device_pairs, iterators)
+
+
+class DatasetIterator(InputIteratorImpl):
+  """Iterator created from input dataset."""
+
+  def __init__(self, dataset, worker_device_pairs, split_batch_by=None):
+    """Make an iterator for the dataset on given devices.
+
+    If `split_batch_by` is not None, we "split" each batch of the
+    dataset by `split_batch_by` value. To achieve this, we first unbatch the
+    input dataset and then rebatch it with the per replica batch size that is
+    calculated using `global_batch_size // split_batch_by`.
+    The currently supported datasets are as follows:
+    `dataset.batch()` is the last operation on the dataset OR
+    `dataset.apply(map_and_batch)` is the last operation on the dataset OR
+    `dataset.batch().prefetch()` are the last 2 operations on the dataset OR
+    `dataset.apply(map_and_batch).prefetch()` are the last 2 operations.
+
+    TODO(priyag): Support multi worker / host cases properly by cloning
+    and sharding the dataset on each worker. Current setup will only work in
+    some cases, such as in-graph multi worker GPU case. If the input pipeline
+    has random shuffling (with a different seed on each worker), each worker
+    will see random input from the same overall dataset in each step. Otherwise,
+    each worker will see the same input in each step.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be used as the input source.
+      worker_device_pairs: A list of (worker, list of devices on that worker)
+        pairs.
+      split_batch_by: Optional integer. If present, we "split" each batch of the
+        dataset by `split_batch_by` value.
+    """
+    if split_batch_by:
+      dataset = _split_dataset_batch(dataset, split_batch_by)
+
+    iterators = []
+    for worker, worker_devices in worker_device_pairs:
+      with ops.device(worker):
+        iterator = _SingleWorkerDatasetIterator(dataset, worker, worker_devices)
+        iterators.append(iterator)
+
+    super(DatasetIterator, self).__init__(worker_device_pairs, iterators)
+
+
+class _SingleWorkerDatasetIterator(object):
+  """Iterator for a single `tf.data.Dataset`."""
+
+  def __init__(self, dataset, worker, devices):
+    """Create iterator for the `dataset` to fetch data to worker's `devices` .
+
+    `MultiDeviceIterator` is used to prefetch input to the devices on the
+    given worker. `MultiDeviceIterator` doesn't work in eager mode yet.
+
+    Args:
+      dataset: A `tf.data.Dataset` instance.
+      worker: Worker on which ops should be created.
+      devices: Distribute data from `dataset` to these devices.
+    """
+    self._dataset = dataset
+    self._worker = worker
+    self._devices = devices
+    self._is_eager = context.executing_eagerly()
+    self._make_iterator()
+
+  def _make_iterator(self):
+    """Make appropriate iterator on the dataset."""
+    with ops.device(self._worker):
+      if self._is_eager:
+        # TODO(rohanj): Enable prefetching in eager mode.
+        # TODO(priyag): Measure the performance of this approach vs calling
+        # get_next on the original dataset N times.
+        dataset = self._dataset.batch(len(self._devices), drop_remainder=True)
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
+      else:
+        iterator = multi_device_iterator_ops.MultiDeviceIterator(
+            self._dataset, self._devices)
+    self._iterator = iterator
+
+  def get_next(self, name=None):
+    """Get next element from the underlying iterator."""
+    with ops.device(self._worker):
+      if self._is_eager:
+        # Batched dataset case.
+        batch = self._iterator.get_next(name=name)
+        index = {}
+        for i, d in enumerate(self._devices):
+          index[d] = nest.map_structure(operator.itemgetter(i), batch)
+          with ops.device(d):
+            index[d] = nest.map_structure(array_ops.identity, index[d])
+      else:
+        # MultiDeviceIterator case.
+        data_list = self._iterator.get_next()
+        index = dict(zip(self._devices, data_list))
+
+      return regroup(index)
+
+  def initialize(self):
+    """Initialze underlying iterator.
+
+    In eager execution, this simply recreates the underlying iterator.
+    In graph execution, it returns the initializer ops for the underlying
+    iterator.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    if self._is_eager:
+      self._make_iterator()
+      return []
+    else:
+      return [self._iterator.initializer]
+
+  @property
+  def output_classes(self):
+    return self._iterator.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._iterator.output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterator.output_types
+
+
+def _split_dataset_batch(dataset, split_batch_by):
+  """Divide a batch-ed dataset's batches into smaller batches."""
+  # TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+  # pylint: disable=protected-access
+  def _get_batch_dataset(d):
+    """Get the underlying batch dataset from the dataset object."""
+    if isinstance(d, dataset_ops.DatasetV1Adapter):
+      d = d._dataset
+
+    if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
+      return d
+    elif isinstance(d, dataset_ops.PrefetchDataset):
+      return _get_batch_dataset(d._input_dataset)
+    raise ValueError(
+        "Unable to get batched dataset from the input dataset. `batch` "
+        "`map_and_batch` need to be the last operations on the dataset. "
+        "The batch operations can be followed by a prefetch.")
+
+  batched_dataset = _get_batch_dataset(dataset)
+  batch_size = batched_dataset._batch_size
+  drop_remainder = batched_dataset._drop_remainder
+  # pylint: enable=protected-access
+
+  if tensor_util.is_tensor(batch_size):
+    batch_size = tensor_util.constant_value(batch_size)
+
+  if tensor_util.is_tensor(drop_remainder):
+    drop_remainder = tensor_util.constant_value(drop_remainder)
+
+  if batch_size % split_batch_by:
+    raise ValueError(
+        "Batch size %s cannot be sharded evenly across replicas %s" % (
+            batch_size, split_batch_by))
+  new_batch_size = batch_size // split_batch_by
+
+  dataset = dataset.apply(batching.unbatch())
+  return dataset.batch(new_batch_size, drop_remainder=drop_remainder)
+
+
+class MultiStepContext(object):
+  """A context object that can be used to capture things when running steps.
+
+  This context object is useful when running multiple steps at a time using the
+  `experimental_run_steps_on_iterator` API. For e.g. it allows the user's step
+  function to specify which outputs to emit at what frequency. Currently it
+  supports capturing output from the last step, as well as capturing non tensor
+  outputs.  In the future it will be augmented to support other use cases such
+  as output each N steps.
+  """
+
+  def __init__(self):
+    """Initialize an output context.
+
+    Returns:
+      A context object.
+    """
+    self._last_step_outputs = {}
+    self._last_step_outputs_reduce_ops = {}
+    self._non_tensor_outputs = {}
+
+  @property
+  def last_step_outputs(self):
+    """A dictionary consisting of outputs to be captured on last step.
+
+    Keys in the dictionary are names of tensors to be captured, as specified
+    when `set_last_step_output` is called.
+    Values in the dictionary are the tensors themselves. If
+    `set_last_step_output` was called with a `reduce_op` for this output,
+    then the value is the reduced value.
+
+    Returns:
+      A dictionary with last step outputs.
+    """
+    return self._last_step_outputs
+
+  def _set_last_step_outputs(self, outputs):
+    """Replace the entire dictionary of last step outputs."""
+    if not isinstance(outputs, dict):
+      raise ValueError("Need a dictionary to set last_step_outputs.")
+    self._last_step_outputs = outputs
+
+  def set_last_step_output(self, name, output, reduce_op=None):
+    """Set `output` with `name` to be outputted from the last step.
+
+    Args:
+      name: String, name to identify the output. Doesn't need to match tensor
+        name.
+      output: The tensors that should be outputted with `name`. See below for
+        actual types supported.
+      reduce_op: Reduction method to use to reduce outputs from multiple
+        replicas. Required if `set_last_step_output` is called in a replica
+        context. Optional in cross_replica_context.
+        When present, the outputs from all the replicas are reduced using the
+        current distribution strategy's `reduce` method. Hence, the type of
+        `output` must be what's supported by the corresponding `reduce` method.
+        For e.g. if using MirroredStrategy and reduction is set, output
+        must be a `PerReplica` value.
+        The reduce method is also recorded in a dictionary
+        `_last_step_outputs_reduce_ops` for later interpreting of the
+        outputs as already reduced or not.
+    """
+    if distribution_strategy_context.get_cross_replica_context():
+      self._last_step_outputs_reduce_ops[name] = reduce_op
+      if reduce_op is None:
+        self._last_step_outputs[name] = output
+      else:
+        distribution = distribution_strategy_context.get_distribution_strategy()
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, output)
+    else:
+      assert reduce_op is not None
+      def merge_fn(distribution, value):
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, value)
+        # Setting this inside the `merge_fn` because all replicas share the same
+        # context object, so it's more robust to set it only once (even if all
+        # the replicas are trying to set the same value).
+        self._last_step_outputs_reduce_ops[name] = reduce_op
+
+      distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=(output,))
+
+  @property
+  def non_tensor_outputs(self):
+    """A dictionary consisting of any non tensor outputs to be captured."""
+    return self._non_tensor_outputs
+
+  def set_non_tensor_output(self, name, output):
+    """Set `output` with `name` to be captured as a non tensor output."""
+    if distribution_strategy_context.get_cross_replica_context():
+      self._non_tensor_outputs[name] = output
+    else:
+      def merge_fn(distribution, value):
+        # NOTE(priyag): For non tensor outputs, we simply return all the values
+        # in a list as reduction doesn't make sense on non tensors.
+        self._non_tensor_outputs[name] = distribution.unwrap(value)
+      distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=(output,))
+
+
+def value_container(val):
+  """Returns the container that this per-replica `value` belongs to.
+
+  Args:
+    val: A value returned by `call_for_each_replica()` or a variable
+      created in `scope()`.
+
+  Returns:
+    A container that `value` belongs to.
+    If value does not belong to any container (including the case of
+    container having been destroyed), returns the value itself.
+  """
+  if (hasattr(val, "_distributed_container") and
+      # DistributedVariable has _distributed_container defined
+      # but we don't want to return it.
+      not isinstance(val, DistributedVariable)):
+    container = val._distributed_container()  # pylint: disable=protected-access
+    if container is not None:
+      return container
+  return val
+
+
+# TODO(josh11b): Descend from Variable.
+class AggregatingVariable(checkpointable.CheckpointableBase):
+  """A wrapper around a variable that aggregates updates across replicas."""
+
+  def __init__(self, v, aggregation):
+    self._v = v
+    # NOTE: We don't use "_distributed_container" here because we don't want
+    # to trigger that code path in regroup().
+    v._aggregating_container = weakref.ref(self)  # pylint: disable=protected-access
+    self._aggregation = aggregation
+
+  def get(self):
+    return self._v
+
+  def __getattr__(self, name):
+    return getattr(self._v, name)
+
+  def _assign_func(self, *args, **kwargs):
+    f = kwargs.pop("f")
+    if distribution_strategy_context.get_cross_replica_context():
+      update_device = distribute_lib.get_update_device()
+      if update_device is not None:
+        # We are calling an assign function in an update context.
+        return f(self._v, *args, **kwargs)
+
+      # We are calling an assign function in cross replica context, wrap it in
+      # an update call.
+      return distribution_strategy_context.get_distribution_strategy().update(
+          self, f, *args, **kwargs)
+    else:
+      assert distribution_strategy_context.get_replica_context()
+      # We are calling an assign function in replica context.
+      # We reduce the value we want to assign/add/sub. More details about how we
+      # handle the different use cases can be found in the _reduce method.
+      # We call the function with the reduced value.
+      if self._aggregation == vs.VariableAggregation.NONE:
+        raise ValueError("You must specify an aggregation method to update a "
+                         "a variable in Replica Context.")
+
+      def merge_fn(strategy, value, *other_args, **other_kwargs):
+        v = _apply_aggregation(strategy, value, self._aggregation, self)
+        return strategy.update(self, f, v, *other_args, **other_kwargs)
+
+      return distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=args, kwargs=kwargs)
+
+  def assign_sub(self, *args, **kwargs):
+    assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
+    return self._assign_func(f=assign_sub_fn, *args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    assign_add_fn = lambda var, *a, **kw: var.assign_add(*a, **kw)
+    return self._assign_func(f=assign_add_fn, *args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    assign_fn = lambda var, *a, **kw: var.assign(*a, **kw)
+    return self._assign_func(f=assign_fn, *args, **kwargs)
+
+  @property
+  def aggregation(self):
+    return self._aggregation
+
+  @property
+  def name(self):
+    return self._v.name
+
+  @property
+  def dtype(self):
+    return self._v.dtype
+
+  # TODO(josh11b): Test saving & restoring.
+  def _gather_saveables_for_checkpoint(self):
+    return {checkpointable.VARIABLE_VALUE_KEY: self._v}
+
+  # pylint: disable=multiple-statements
+  def __add__(self, o): return self._v + o
+  def __radd__(self, o): return o + self._v
+  def __sub__(self, o): return self._v - o
+  def __rsub__(self, o): return o - self._v
+  def __mul__(self, o): return self._v * o
+  def __rmul__(self, o): return o * self._v
+  def __truediv__(self, o): return self._v / o
+  def __rtruediv__(self, o): return o / self._v
+  def __floordiv__(self, o): return self._v // o
+  def __rfloordiv__(self, o): return o // self._v
+  def __mod__(self, o): return self._v % o
+  def __rmod__(self, o): return o % self._v
+  def __lt__(self, o): return self._v < o
+  def __le__(self, o): return self._v <= o
+  def __gt__(self, o): return self._v > o
+  def __ge__(self, o): return self._v >= o
+  def __and__(self, o): return self._v & o
+  def __rand__(self, o): return o & self._v
+  def __or__(self, o): return self._v | o
+  def __ror__(self, o): return o | self._v
+  def __xor__(self, o): return self._v ^ o
+  def __rxor__(self, o): return o ^ self._v
+  def __getitem__(self, o): return self._v[o]
+  def __pow__(self, o, modulo=None): return pow(self._v, o, modulo)
+  def __rpow__(self, o): return pow(o, self._v)
+  def __invert__(self): return ~self._v
+  def __neg__(self): return -self._v
+  def __abs__(self): return abs(self._v)
+
+  def __div__(self, o):
+    try:
+      return self._v.__div__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rdiv__(self, o):
+    try:
+      return self._v.__rdiv__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __matmul__(self, o):
+    try:
+      return self._v.__matmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rmatmul__(self, o):
+    try:
+      return self._v.__rmatmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __str__(self):
+    return str(self._v)
+
+  def __repr__(self):
+    return repr(self._v)
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+def _tensor_conversion_aggregate(var, dtype=None, name=None, as_ref=False):
+  return ops.internal_convert_to_tensor(
+      var.get(), dtype=dtype, name=name, as_ref=as_ref)
+
+
+ops.register_tensor_conversion_function(
+    AggregatingVariable, _tensor_conversion_aggregate)
+ops.register_dense_tensor_like_type(AggregatingVariable)
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index d0c1a93118dc51aca9d3b1140f546f61f355bc0a..f43cf9327a1ad6b2b83ebcb2482ad3fc27515251 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -18,7 +18,7 @@ cc_library(
         "pywrap_tfe.h",
     ],
     visibility = [
-        "//learning/deepmind/courier:__pkg__",
+        "//learning/deepmind/courier:__subpackages__",
         "//tensorflow:internal",
     ],
     deps = [
@@ -27,6 +27,7 @@ cc_library(
         "//tensorflow/c/eager:c_api",
         "//tensorflow/c/eager:c_api_internal",
         "//tensorflow/c/eager:tape",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/python:cpp_python_util",
@@ -56,6 +57,7 @@ py_library(
         ":graph_only_ops",
         ":tape",
         ":test",
+        ":wrap_function",
         "//tensorflow/python:pywrap_tensorflow",
     ],
 )
@@ -82,6 +84,7 @@ py_library(
         "//tensorflow/python:errors",
         "//tensorflow/python:platform",
         "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:tf2",
         "//tensorflow/python:util",
     ],
 )
@@ -111,9 +114,11 @@ cuda_py_test(
         ":backprop",
         ":context",
         ":test",
+        "//third_party/py/numpy",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:resource_variable_ops",
@@ -140,15 +145,64 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "function_argument_naming_test",
+    size = "medium",
+    srcs = ["function_argument_naming_test.py"],
+    additional_deps = [
+        ":backprop",
+        ":def_function",
+        ":function",
+        ":test",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "function_defun_collection_test",
+    size = "medium",
+    srcs = ["function_defun_collection_test.py"],
+    additional_deps = [
+        ":backprop",
+        ":def_function",
+        ":function",
+        ":test",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "function_gradients_test",
+    size = "medium",
+    srcs = ["function_gradients_test.py"],
+    additional_deps = [
+        ":backprop",
+        ":context",
+        ":def_function",
+        ":function",
+        ":test",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+    ],
+    shard_count = 5,
+)
+
 cuda_py_test(
     name = "function_test",
+    size = "medium",
     srcs = ["function_test.py"],
     additional_deps = [
         ":backprop",
         ":context",
+        ":def_function",
         ":function",
-        ":tape",
         ":test",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:test_ops",
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:layers",
@@ -156,6 +210,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
     ],
+    shard_count = 15,
 )
 
 py_library(
@@ -200,6 +255,18 @@ py_library(
     ],
 )
 
+py_test(
+    name = "execution_callbacks_test",
+    srcs = ["execution_callbacks_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":execution_callbacks",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
 py_library(
     name = "graph_only_ops",
     srcs = ["graph_only_ops.py"],
@@ -244,10 +311,10 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":graph_only_ops",
-        "//tensorflow/python:cond_v2_impl",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:func_graph",
         "//tensorflow/python:gradients_impl",
         "//tensorflow/python:graph_to_function_def",
         "//tensorflow/python:util",
@@ -275,10 +342,12 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:unconnected_gradients",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:execute",
         "//tensorflow/python/eager:tape",
+        "//tensorflow/python/ops/parallel_for:control_flow_ops",
         "@six_archive//:six",
     ],
 )
@@ -366,6 +435,11 @@ py_library(
     name = "imperative_grad",
     srcs = ["imperative_grad.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:unconnected_gradients",
+        "//tensorflow/python:util",
+    ],
 )
 
 cuda_py_test(
@@ -390,17 +464,33 @@ py_library(
     name = "def_function",
     srcs = ["def_function.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
     deps = [
         ":context",
         ":function",
+        ":lift_to_graph",
+        "//tensorflow/python:cond_v2",  # TODO(b/118513001): Imported via control_flow_ops; remove.
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python:while_v2",  # TODO(b/118513001): Imported via control_flow_ops; remove.
         "//tensorflow/python/training/checkpointable:base",
     ],
 )
 
+py_library(
+    name = "lift_to_graph",
+    srcs = ["lift_to_graph.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
 py_test(
     name = "def_function_test",
     srcs = ["def_function_test.py"],
@@ -412,3 +502,30 @@ py_test(
         "//tensorflow/python:framework_ops",
     ],
 )
+
+py_library(
+    name = "wrap_function",
+    srcs = ["wrap_function.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        ":function",
+        ":lift_to_graph",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:template",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/training/checkpointable:base",
+    ],
+)
+
+py_test(
+    name = "wrap_function_test",
+    srcs = ["wrap_function_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":wrap_function",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+    ],
+)
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index deac29111f6a613b450b185511ba75b709881021..29f9b2cda3aa2c6e7fff6c6df10fed81779d02c7 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import functools
 import operator
+import sys
 
 import six
 
@@ -33,17 +34,30 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export
 
 
+# Note that we need to lazy load the following two modules to avoid creating
+# circular dependencies.
+# TODO(b/119775953): fix the circular dependencies.
+pfor_ops = LazyLoader(
+    "pfor_ops", globals(),
+    "tensorflow.python.ops.parallel_for.control_flow_ops")
+
+function = LazyLoader("function", globals(),
+                      "tensorflow.python.eager.function")
+
 _op_attr_type_cache = {}
 
 
@@ -535,11 +549,11 @@ def _aggregate_grads(gradients):
 
   if len(gradients) == 1:
     return gradients[0]
-  if all([isinstance(g, ops.Tensor) for g in gradients]):
+  if all(isinstance(g, ops.Tensor) for g in gradients):
     return gen_math_ops.add_n(gradients)
   else:
-    assert all([isinstance(g, (ops.Tensor, ops.IndexedSlices))
-                for g in gradients])
+    assert all(isinstance(g, (ops.Tensor, ops.IndexedSlices))
+               for g in gradients)
     indexed_slices_list = []
     for grad in gradients:
       # TODO(xpan): Support nested IndexedSlices and core IndexedSlices
@@ -594,7 +608,11 @@ def _zeros(shape, dtype):
   cache_key = shape, dtype, device
   cached = ctx.zeros_cache().get(cache_key)
   if cached is None:
-    cached = _fast_fill(0, shape, dtype)
+    if dtypes.as_dtype(dtype).is_bool:
+      value = False
+    else:
+      value = 0
+    cached = _fast_fill(value, shape, dtype)
     ctx.zeros_cache().put(cache_key, cached)
   return cached
 
@@ -603,9 +621,14 @@ def _ones(shape, dtype):
   if not context.context().executing_eagerly():
     return array_ops.ones(shape, dtype)
 
+  if dtypes.as_dtype(dtype).is_bool:
+    value = True
+  else:
+    value = 1
+
   if shape == ():  # pylint: disable=g-explicit-bool-comparison
-    return constant_op.constant(1, dtype=dtype)
-  return _fast_fill(1, shape, dtype)
+    return constant_op.constant(value, dtype=dtype)
+  return _fast_fill(value, shape, dtype)
 
 
 _default_vspace = imperative_grad.VSpace(
@@ -762,7 +785,12 @@ class GradientTape(object):
 
   def __del__(self):
     if self._created_eagerly:
-      context.context().end_step()
+      try:
+        context.context().end_step()
+      except AttributeError:
+        pass
+      except TypeError:
+        pass
 
   def watch(self, tensor):
     """Ensures that `tensor` is being traced by this tape.
@@ -850,7 +878,11 @@ class GradientTape(object):
     """Returns variables watched by this tape in order of construction."""
     return self._tape.watched_variables()
 
-  def gradient(self, target, sources, output_gradients=None):
+  def gradient(self,
+               target,
+               sources,
+               output_gradients=None,
+               unconnected_gradients=UnconnectedGradients.NONE):
     """Computes the gradient using operations recorded in context of this tape.
 
     Args:
@@ -859,6 +891,10 @@ class GradientTape(object):
         will be differentiated against elements in `sources`.
       output_gradients: a list of gradients, one for each element of
         target. Defaults to None.
+      unconnected_gradients: a value which can either hold 'none' or 'zero' and
+        alters the value which will be returned if the target and sources are
+        unconnected. The possible values and effects are detailed in
+        'UnconnectedGradients' and it defaults to 'none'.
 
     Returns:
       a list or nested structure of Tensors (or IndexedSlices, or None),
@@ -868,6 +904,8 @@ class GradientTape(object):
     Raises:
       RuntimeError: if called inside the context of the tape, or if called more
        than once on a non-persistent tape.
+      ValueError: if the target is a variable or if unconnected gradients is
+       called with an unknown value.
     """
     if self._tape is None:
       raise RuntimeError("GradientTape.gradient can only be called once on "
@@ -887,6 +925,12 @@ class GradientTape(object):
                             "gradient in order to compute higher order "
                             "derrivatives.", 1)
 
+    flat_targets = nest.flatten(target)
+    for t in flat_targets:
+      if resource_variable_ops.is_resource_variable(t):
+        raise ValueError("GradientTape.gradient is not supported for variable "
+                         "targets.")
+
     flat_sources = nest.flatten(sources)
     flat_sources = [_handle_or_self(x) for x in flat_sources]
 
@@ -896,12 +940,223 @@ class GradientTape(object):
 
     flat_grad = imperative_grad.imperative_grad(
         self._tape,
-        nest.flatten(target),
+        flat_targets,
         flat_sources,
-        output_gradients=output_gradients)
+        output_gradients=output_gradients,
+        unconnected_gradients=unconnected_gradients)
 
     if not self._persistent:
       self._tape = None
 
     grad = nest.pack_sequence_as(sources, flat_grad)
     return grad
+
+  def jacobian(self,
+               target,
+               sources,
+               unconnected_gradients=UnconnectedGradients.NONE,
+               parallel_iterations=None,
+               experimental_use_pfor=True):
+    """Computes the jacobian using operations recorded in context of this tape.
+
+    See http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant for the
+    definition of a Jacobian.
+
+    Example usage:
+
+    with tf.GradientTape() as g:
+      x  = tf.constant([1.0, 2.0])
+      g.watch(x)
+      y = x * x
+    jacobian = g.jacobian(y, x)
+    # jacobian value is [[2., 0.], [0., 4.]]
+
+    Args:
+      target: Tensor to be differentiated.
+      sources: a list or nested structure of Tensors or Variables. `target`
+        will be differentiated against elements in `sources`.
+      unconnected_gradients: a value which can either hold 'none' or 'zero' and
+        alters the value which will be returned if the target and sources are
+        unconnected. The possible values and effects are detailed in
+        'UnconnectedGradients' and it defaults to 'none'.
+      parallel_iterations: A knob to control how many iterations are dispatched
+        in parallel. This knob can be used to control the total memory usage.
+      experimental_use_pfor: If true, vectorizes the jacobian computation. Else
+        falls back to a sequential while_loop. Vectorization can sometimes fail
+        or lead to excessive memory usage. This option can be used to disable
+        vectorization in such cases.
+
+    Returns:
+      a list or nested structure of Tensors (or IndexedSlices, or None),
+      one for each element in `sources`. Returned structure is the same as
+      the structure of `sources`.
+
+    Raises:
+      RuntimeError: If called on a non-persistent tape with eager execution
+        enabled and without enabling experimental_use_pfor.
+      ValueError: If vectorization of jacobian computation fails.
+    """
+    flat_sources = nest.flatten(sources)
+    target_static_shape = target.shape
+    target_shape = array_ops.shape(target)
+    # Note that we push and pop the tape here and below. This is needed since we
+    # need gradients through the enclosed operations.
+    self._push_tape()
+    target = array_ops.reshape(target, [-1])
+    self._pop_tape()
+
+    def loop_fn(i):
+      self._push_tape()
+      y = array_ops.gather(target, i)
+      self._pop_tape()
+      return self.gradient(y, flat_sources,
+                           unconnected_gradients=unconnected_gradients)
+
+    try:
+      target_size = int(target.shape[0])
+    except TypeError:
+      target_size = array_ops.shape(target)[0]
+
+    if experimental_use_pfor:
+      try:
+        output = pfor_ops.pfor(loop_fn, target_size,
+                               parallel_iterations=parallel_iterations)
+      except ValueError as err:
+        six.reraise(
+            ValueError,
+            ValueError(
+                str(err) + "\nEncountered an exception while vectorizing the "
+                "jacobian computation. Vectorization can be disabled by setting"
+                " experimental_use_pfor to False."),
+            sys.exc_info()[2])
+    else:
+      if context.executing_eagerly() and not self._persistent:
+        raise RuntimeError(
+            "GradientTape must be created with persistent=True"
+            " to compute the jacobian with eager execution enabled and with "
+            " experimental_use_pfor set to False.")
+      output = pfor_ops.for_loop(
+          loop_fn, [target.dtype] * len(flat_sources), target_size,
+          parallel_iterations=parallel_iterations)
+
+    for i, out in enumerate(output):
+      if out is not None:
+        new_shape = array_ops.concat(
+            [target_shape, array_ops.shape(out)[1:]], axis=0)
+        out = array_ops.reshape(out, new_shape)
+        if context.executing_eagerly():
+          out.set_shape(target_static_shape.concatenate(flat_sources[i].shape))
+      output[i] = out
+
+    return nest.pack_sequence_as(sources, output)
+
+  def batch_jacobian(self,
+                     target,
+                     source,
+                     unconnected_gradients=UnconnectedGradients.NONE,
+                     parallel_iterations=None,
+                     experimental_use_pfor=True):
+    """Computes and stacks per-example jacobians.
+
+    See http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant for the
+    definition of a Jacobian.  This function is essentially an efficient
+    implementation of the following:
+    `tf.stack([self.jacobian(y[i], x[i]) for i in range(x.shape[0])])`.
+
+    Note that compared to `GradientTape.jacobian` which computes gradient of
+    each output value w.r.t each input value, this function is useful when
+    `target[i,...] is independent of `source[j,...]` for `j != i`. This
+    independence assumption allows more efficient computation as compared to
+    `GradientTape.jacobian`. The output, as well as intermediate activations,
+    are lower dimensional and avoid a bunch of redundant zeros which would
+    result in the jacobian computation given the independence assumption.
+
+    Example usage:
+    with tf.GradientTape() as g:
+      x = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
+      g.watch(x)
+      y = x * x
+    batch_jacobian = g.batch_jacobian(y, x)
+    # batch_jacobian is [[[2,  0], [0,  4]], [[6,  0], [0,  8]]]
+
+    Args:
+      target: A tensor with rank 2 or higher and with shape [b, y1, ..., y_n].
+        `target[i,...]` should only depend on `source[i,...]`.
+      source: A tensor with rank 2 or higher and with shape [b, x1, ..., x_m].
+      unconnected_gradients: a value which can either hold 'none' or 'zero' and
+        alters the value which will be returned if the target and sources are
+        unconnected. The possible values and effects are detailed in
+        'UnconnectedGradients' and it defaults to 'none'.
+      parallel_iterations: A knob to control how many iterations are dispatched
+        in parallel. This knob can be used to control the total memory usage.
+      experimental_use_pfor: If true, uses pfor for computing the Jacobian. Else
+        uses a tf.while_loop.
+
+    Returns:
+      A tensor `t` with shape [b, y_1, ..., y_n, x1, ..., x_m] where `t[i, ...]`
+      is the jacobian of `target[i, ...]` w.r.t. `source[i, ...]`, i.e. stacked
+      per-example jacobians.
+
+    Raises:
+      RuntimeError: If called on a non-persistent tape with eager execution
+        enabled and without enabling experimental_use_pfor.
+      ValueError: If vectorization of jacobian computation fails or if first
+        dimension of `target` and `source` do not match.
+    """
+    target_shape = target.shape
+    if not target_shape.with_rank_at_least(2)[0].is_compatible_with(
+        source.shape.with_rank_at_least(2)[0]):
+      raise ValueError(
+          "Need first dimension of target shape (%s) and "
+          "source shape (%s) to match." % (target.shape, source.shape))
+    if target_shape.is_fully_defined():
+      batch_size = int(target_shape[0])
+      target_row_size = target_shape.num_elements() // batch_size
+    else:
+      target_shape = array_ops.shape(target)
+      batch_size = target_shape[0]
+      target_row_size = array_ops.size(target) // batch_size
+    source_shape = array_ops.shape(source)
+    # Flatten target to 2-D.
+    # Note that we push and pop the tape here and below. This is needed since we
+    # need gradients through the enclosed operations.
+    self._push_tape()
+    with ops.control_dependencies(
+        [check_ops.assert_equal(batch_size, source_shape[0])]):
+      target = array_ops.reshape(target, [batch_size, target_row_size])
+    self._pop_tape()
+
+    def loop_fn(i):
+      self._push_tape()
+      y = array_ops.gather(target, i, axis=1)
+      self._pop_tape()
+      return self.gradient(y, source,
+                           unconnected_gradients=unconnected_gradients)
+
+    if experimental_use_pfor:
+      try:
+        output = pfor_ops.pfor(loop_fn, target_row_size,
+                               parallel_iterations=parallel_iterations)
+      except ValueError as err:
+        six.reraise(
+            ValueError,
+            ValueError(
+                str(err) + "\nEncountered an exception while vectorizing the "
+                "batch_jacobian computation. Vectorization can be disabled by "
+                "setting experimental_use_pfor to False."),
+            sys.exc_info()[2])
+    else:
+      if context.executing_eagerly() and not self._persistent:
+        raise RuntimeError(
+            "GradientTape must be created with persistent=True"
+            " to compute the batch_jacobian with eager execution enabled and "
+            " with experimental_use_pfor set to False.")
+      output = pfor_ops.for_loop(loop_fn, target.dtype, target_row_size,
+                                 parallel_iterations=parallel_iterations)
+    if output is None:
+      return None
+    output = array_ops.reshape(output,
+                               [target_row_size, batch_size, -1])
+    output = array_ops.transpose(output, [1, 0, 2])
+    new_shape = array_ops.concat([target_shape, source_shape[1:]], axis=0)
+    return array_ops.reshape(output, new_shape)
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 32731747b735d0d14dd1a65e18e00365a016ec83..61c47a29fd2427850006cbe2dfe1e6bb69d988ab 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -23,12 +23,14 @@ import numpy as np
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers.pooling import max_pooling3d
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import custom_gradient
@@ -64,7 +66,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.gradients_function(fn, [0])(var)[0]
     grad = self.evaluate(ops.convert_to_tensor(grad))
 
-    with context.graph_mode():
+    if not context.executing_eagerly():
       tf_var = array_ops.constant(var_np, dtypes.float32)
       tf_ind1 = array_ops.constant([0, 1])
       tf_ind2 = array_ops.constant([2, 3])
@@ -72,7 +74,7 @@ class BackpropTest(test.TestCase):
       tf_g1 = embedding_ops.embedding_lookup(tf_var, tf_ind1)
       tf_g2 = embedding_ops.embedding_lookup(tf_var, tf_ind2)
       tf_g3 = embedding_ops.embedding_lookup(tf_var, tf_ind3)
-      tf_g4 = math_ops.reduce_sum(tf_var * 2.0, reduction_indices=(0, 1))
+      tf_g4 = math_ops.reduce_sum(tf_var * 2.0, axis=(0, 1))
       tf_y = tf_g1 * tf_g2 * tf_g3 * tf_g4
       tf_grad = gradients.gradients(tf_y, [tf_var])[0]
 
@@ -198,7 +200,7 @@ class BackpropTest(test.TestCase):
     grad = backprop.implicit_grad(f)()[0][0]
     opt = training.GradientDescentOptimizer(lrn_rate)
 
-    with context.graph_mode(), self.cached_session():
+    with ops.Graph().as_default(), self.cached_session():
       tf_x = array_ops.ones((batch_size), dtypes.int64)
       # TODO(ashankar,apassos): Change to ResourceVariable.
       tf_embedding = variables.Variable(
@@ -213,7 +215,7 @@ class BackpropTest(test.TestCase):
       self.assertAllClose(tf_grad.values.eval(), grad.values)
 
       tf_opt.apply_gradients([(tf_grad, tf_embedding)]).run()
-      expected = tf_embedding.eval()
+      expected = self.evaluate(tf_embedding)
     opt.apply_gradients([(grad, embedding)])
     self.assertAllClose(expected, embedding.read_value())
 
@@ -231,6 +233,68 @@ class BackpropTest(test.TestCase):
     self.assertTrue(ordered_variables[0] is v0)
     self.assertTrue(ordered_variables[1] is v1)
 
+  def testTapeNoOpGradient(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = x
+    self.assertEqual(t.gradient(y, x).numpy(), 1.0)
+
+  def testTapeIdentityGradientIsIdentity(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = array_ops.identity(x)
+    self.assertEqual(t.gradient(y, x).numpy(), 1.0)
+
+  def testTapeGradientMultiTargetOneIsSource(self):
+    x = constant_op.constant(2.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = x*x
+    self.assertEqual(t.gradient([x, y], x).numpy(), 5.0)
+
+  def testTapeNoOpGradientWithMultiTargetAllSource(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = x
+    self.assertEqual(t.gradient([y, y], x).numpy(), 2.0)
+
+  def testTapeNoOpGradientWithMultiTargetMultiSource(self):
+    x = constant_op.constant(3.0)
+    y = constant_op.constant(5.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      t.watch(y)
+      z = y * y
+    self.assertAllEqual(t.gradient([x, y, z], [x, y]), [1.0, 11.0])
+
+  def testTapeNoOpOnVariableIsIdentity(self):
+    v0 = resource_variable_ops.ResourceVariable(1.0)
+    with backprop.GradientTape() as t:
+      y = v0.read_value()
+    self.assertEqual(t.gradient(y, v0).numpy(), 1.0)
+
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testTapeNoOpGradient2By2(self):
+    a_2_by_2 = constant_op.constant(2.0, shape=[2, 2])
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+    dy_dy = tape.gradient(a_2_by_2, [a_2_by_2])[0]
+    self.assertAllEqual(dy_dy.numpy(),
+                        constant_op.constant(1.0, shape=[2, 2]).numpy())
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testTapeNoOpGradientMultiTarget2By2(self):
+    a_2_by_2 = constant_op.constant(2.0, shape=[2, 2])
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+    dy_dy = tape.gradient([a_2_by_2, a_2_by_2], [a_2_by_2])[0]
+    self.assertAllEqual(dy_dy.numpy(),
+                        constant_op.constant(2.0, shape=[2, 2]).numpy())
+
   def testTapeStopRecording(self):
     with backprop.GradientTape() as t:
       x = resource_variable_ops.ResourceVariable(1.0)
@@ -258,6 +322,30 @@ class BackpropTest(test.TestCase):
       loss += v * v
     self.assertAllEqual(t.gradient(loss, v), 2.0)
 
+  def testAutomaticWatchedVariables(self):
+    with backprop.GradientTape() as t:
+      self.assertEqual(0, len(t.watched_variables()))
+      v = resource_variable_ops.ResourceVariable(1.0)
+      loss = v * v
+      self.assertAllEqual([v], t.watched_variables())
+
+      t.reset()
+      self.assertEqual(0, len(t.watched_variables()))
+      loss += v * v
+      self.assertAllEqual([v], t.watched_variables())
+
+  def testExplicitWatchedVariables(self):
+    with backprop.GradientTape() as t:
+      self.assertEqual(0, len(t.watched_variables()))
+      v = resource_variable_ops.ResourceVariable(1.0)
+      t.watch(v)
+      self.assertAllEqual([v], t.watched_variables())
+
+      t.reset()
+      self.assertEqual(0, len(t.watched_variables()))
+      t.watch(v)
+      self.assertAllEqual([v], t.watched_variables())
+
   @test_util.assert_no_new_tensors
   def testGradientNone(self):
 
@@ -548,7 +636,19 @@ class BackpropTest(test.TestCase):
     grad = g.gradient(y, [x])[0]
     self.assertEqual(self.evaluate(grad), 6.0)
 
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes
+  def testGadientTapeCalledOnConstantTarget(self):
+    with backprop.GradientTape() as g:
+      x = variables.Variable([3.0])
+      y = variables.Variable([2.0])
+    with self.assertRaisesRegexp(
+        ValueError,
+        'GradientTape.gradient is not supported for variable targets.'):
+      g.gradient(x, y)
+
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only('b/120545219')
   def testGradientTapeWithCond(self):
     x = constant_op.constant(3.0)
 
@@ -570,6 +670,7 @@ class BackpropTest(test.TestCase):
       self.assertEqual(self.evaluate(dy), 6.0)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only('b/120545219')
   def testGradientTapeWithWhileLoop(self):
     i = constant_op.constant(1)
     x = constant_op.constant(2.)
@@ -605,6 +706,7 @@ class BackpropTest(test.TestCase):
 
   @test_util.assert_no_new_tensors
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only('b/120545219')
   def testPersistentTape(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant(3.0)
@@ -677,6 +779,60 @@ class BackpropTest(test.TestCase):
     self.assertEqual(self.evaluate(dz_dx), 108.0)
     self.assertEqual(self.evaluate(dz_dy), 18.0)
 
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes
+  def testUnconnectedGradientsDefault(self):
+    x = constant_op.constant(1.0)
+    y = constant_op.constant(3.0)
+    with backprop.GradientTape() as g:
+      g.watch([x, y])
+      z = y * 2
+    dz_dx = g.gradient(z, x)
+    self.assertEqual(dz_dx, None)
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes
+  def testUnconnectedGradientsZeros(self):
+    x = constant_op.constant(1.0, shape=[2, 2])
+    y = constant_op.constant(3.0)
+    with backprop.GradientTape() as g:
+      g.watch([x, y])
+      z = y * 2
+    dz_dx = g.gradient(z, x, unconnected_gradients='zero')
+    self.assertAllEqual([[0.0, 0.0], [0.0, 0.0]], self.evaluate(dz_dx))
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes
+  def testUnknownUnconnectedGradientsValueGiven(self):
+    x = constant_op.constant(1.0)
+    y = constant_op.constant(1.0)
+    with backprop.GradientTape() as g:
+      g.watch([x, y])
+      z = y * 2
+    with self.assertRaisesRegexp(
+        ValueError, "Unknown value for unconnected_gradients: 'nonsense'"):
+      g.gradient(z, x, unconnected_gradients='nonsense')
+
+  @test_util.run_in_graph_and_eager_modes
+  def testUnconnectedGradientsNestedDefunZeros(self):
+
+    @function.defun
+    def f(x):
+      return x * x
+
+    @function.defun
+    def h(y):
+      z = f(y)
+      return array_ops.stop_gradient(z)
+
+    x = constant_op.constant(1.0)
+    with backprop.GradientTape() as g:
+      g.watch(x)
+      y = h(x)
+
+    dy_dx = g.gradient(y, x, unconnected_gradients='zero')
+    self.assertEqual(0.0, self.evaluate(dy_dx))
+
   @test_util.assert_no_new_tensors
   def testEmptyParamsForValueAndGradFunction(self):
     def fn(a, b):
@@ -951,7 +1107,7 @@ class BackpropTest(test.TestCase):
       val_and_grads_fn(x, y)
 
   def testZerosCacheDoesntLeakAcrossGraphs(self):
-    with context.graph_mode():
+    with ops.Graph().as_default():
       def get_grad():
         with ops.Graph().as_default(), self.cached_session():
           t = constant_op.constant(1, dtype=dtypes.float32, shape=(10, 4))
@@ -982,7 +1138,6 @@ class BackpropTest(test.TestCase):
     self.assertIsNone(dy)
     self.assertEqual(self.evaluate(dz), 3.0)
 
-
   @test_util.run_in_graph_and_eager_modes
   def testDifferentiatingScalarCache(self):
     # In the following test, if x2 = x1 (i.e the objects are the exact same),
@@ -1023,7 +1178,7 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(gradients_constants, gradients_variables)
 
   def testUnknownShapes(self):
-    with context.graph_mode():
+    with ops.Graph().as_default():
       with backprop.GradientTape() as tape:
         a = array_ops.placeholder(dtype=dtypes.float32, shape=None)
         tape.watch(a)
@@ -1034,6 +1189,249 @@ class BackpropTest(test.TestCase):
       with self.cached_session() as sess:
         self.assertEqual((8.0, 12.0), sess.run((b, db_da), feed_dict={a: 2.0}))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testCustomGradientInEagerAndGraph(self):
+    @custom_gradient.custom_gradient
+    def f(x):
+      y = x * x
+
+      def grad(dy):
+        return [4 * dy]
+
+      return y, grad
+
+    with backprop.GradientTape() as t:
+      c = constant_op.constant(1.0)
+      t.watch(c)
+      g = f(c)
+    self.assertAllEqual(self.evaluate(t.gradient(g, c)), 4.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMaxPooling3DGradient(self):
+
+    def forward(a):
+      r = max_pooling3d(a, pool_size=pool_size, strides=strides, padding='SAME')
+      return r
+
+    input_sizes = [1, 3, 2, 4, 1]
+    pool_size = (2, 2, 1)
+    strides = (1, 1, 1)
+
+    total_size = np.prod(input_sizes)
+    x = np.arange(1, total_size + 1, dtype=np.float32)
+    aa = constant_op.constant(x, shape=input_sizes, dtype=dtypes.float32)
+    da = backprop.gradients_function(forward)(aa)
+
+    if not context.executing_eagerly():
+      tf_aa = constant_op.constant(x, shape=input_sizes, dtype=dtypes.float32)
+      tf_max = max_pooling3d(
+          tf_aa, pool_size=pool_size, strides=strides, padding='SAME')
+      tf_da = gradients.gradients(tf_max, [tf_aa])
+      self.assertAllEqual(da[0], tf_da[0].eval())
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class JacobianTest(test.TestCase):
+
+  def _jacobian(self, experimental_use_pfor):
+    persistent = context.executing_eagerly and not experimental_use_pfor
+    with backprop.GradientTape(persistent=persistent) as g:
+      x = constant_op.constant([1., 2.])
+      y = constant_op.constant([3., 4.])
+      g.watch(x)
+      g.watch(y)
+      z = x * x * y
+    jacobian = g.jacobian(z, [x, y],
+                          experimental_use_pfor=experimental_use_pfor)
+    answer = [array_ops.diag(2 * x * y), array_ops.diag(x * x)]
+    return jacobian, answer
+
+  @test_util.run_v1_only('b/120545219')
+  def testPfor(self):
+    jacobian, answer = self._jacobian(experimental_use_pfor=True)
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  @test_util.run_v1_only('b/120545219')
+  def testWhileLoop(self):
+    jacobian, answer = self._jacobian(experimental_use_pfor=False)
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPforDefun(self):
+
+    @function.defun
+    def _f():
+      return self._jacobian(experimental_use_pfor=True)
+
+    jacobian, answer = _f()
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  @test_util.run_v1_only('b/120545219')
+  def testWhileLoopDefun(self):
+
+    @function.defun
+    def _f():
+      return self._jacobian(experimental_use_pfor=False)
+
+    jacobian, answer = _f()
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPersistentTape(self):
+    if not context.executing_eagerly():
+      return
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([1.0, 2.0])
+      g.watch(x)
+      y = x * x
+    with self.assertRaisesRegexp(RuntimeError, 'persistent'):
+      g.jacobian(y, x, experimental_use_pfor=False)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPforException(self):
+    var = variables.Variable([1.])
+
+    @custom_gradient.custom_gradient
+    def op(x):
+      def grad(_):
+        # Note that we perform a stateful operation here that will not be
+        # compatible with parallel for construct.
+        with ops.control_dependencies(
+            [var.assign(random_ops.random_uniform([1]))]):
+          return constant_op.constant(1.)
+      return x, grad
+
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([1., 2.])
+      g.watch(x)
+      y = op(x)
+    with self.assertRaisesRegexp(ValueError, 'No converter'):
+      g.jacobian(y, x, experimental_use_pfor=True)
+
+  @test_util.run_v1_only('b/120545219')
+  def test_parallel_iterations(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant([[1., 2], [3, 4]])
+      g.watch(x)
+      y = math_ops.matmul(x, x)
+    self.assertAllClose(g.jacobian(y, x, parallel_iterations=2),
+                        g.jacobian(y, x, parallel_iterations=3))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BatchJacobianTest(test.TestCase):
+
+  def _batch_jacobian(self, experimental_use_pfor):
+    persistent = context.executing_eagerly and not experimental_use_pfor
+    with backprop.GradientTape(persistent=persistent) as g:
+      x = constant_op.constant([[1., 2.], [3., 4.]])
+      y = constant_op.constant([[3., 4.], [5., 6.]])
+      g.watch(x)
+      z = x * x * y
+    batch_jacobian = g.batch_jacobian(
+        z, x, experimental_use_pfor=experimental_use_pfor)
+    answer = array_ops.stack([array_ops.diag(2 * x[0] * y[0]),
+                              array_ops.diag(2 * x[1] * y[1])])
+    return batch_jacobian, answer
+
+  @test_util.run_v1_only('b/120545219')
+  def testPfor(self):
+    batch_jacobian, answer = self._batch_jacobian(experimental_use_pfor=True)
+    self.assertAllEqual(answer, batch_jacobian)
+
+  @test_util.run_v1_only('b/120545219')
+  def testWhileLoop(self):
+    batch_jacobian, answer = self._batch_jacobian(experimental_use_pfor=False)
+    self.assertAllEqual(answer, batch_jacobian)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPforDefun(self):
+
+    @function.defun
+    def _f():
+      return self._batch_jacobian(experimental_use_pfor=True)
+
+    batch_jacobian, answer = _f()
+    self.assertAllEqual(answer, batch_jacobian)
+
+  @test_util.run_v1_only('b/120545219')
+  def testWhileLoopDefun(self):
+
+    @function.defun
+    def _f():
+      return self._batch_jacobian(experimental_use_pfor=False)
+
+    batch_jacobian, answer = _f()
+    self.assertAllEqual(answer, batch_jacobian)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPersistentTape(self):
+    if not context.executing_eagerly():
+      return
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([[1.0, 2.0]])
+      g.watch(x)
+      y = x * x
+    with self.assertRaisesRegexp(RuntimeError, 'persistent'):
+      g.batch_jacobian(y, x, experimental_use_pfor=False)
+
+  @test_util.run_v1_only('b/120545219')
+  def testBadShape(self):
+    x = random_ops.random_uniform([2, 3])
+    with backprop.GradientTape() as g:
+      y = array_ops.concat([x, x], axis=0)
+    with self.assertRaisesRegexp(ValueError, 'Need first dimension'):
+      g.batch_jacobian(y, x)
+
+  @test_util.run_v1_only('b/120545219')
+  def testBadInputRank(self):
+    x = random_ops.random_uniform([2])
+    with backprop.GradientTape() as g:
+      y = random_ops.random_uniform([2, 2])
+    with self.assertRaisesRegexp(ValueError, 'must have rank at least 2'):
+      g.batch_jacobian(y, x)
+
+  def testBadOutputRank(self):
+    x = random_ops.random_uniform([2, 2])
+    with backprop.GradientTape() as g:
+      y = random_ops.random_uniform([2])
+    with self.assertRaisesRegexp(ValueError, 'must have rank at least 2'):
+      g.batch_jacobian(y, x)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPforException(self):
+    var = variables.Variable([1.])
+
+    @custom_gradient.custom_gradient
+    def op(x):
+      def grad(_):
+        # Note that we perform a stateful operation here that will not be
+        # compatible with parallel for construct.
+        with ops.control_dependencies(
+            [var.assign(random_ops.random_uniform([1]))]):
+          return constant_op.constant(1.)
+      return x, grad
+
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([[1.], [2.]])
+      g.watch(x)
+      y = op(x)
+    with self.assertRaisesRegexp(ValueError, 'No converter'):
+      g.batch_jacobian(y, x, experimental_use_pfor=True)
+
+  @test_util.run_v1_only('b/120545219')
+  def test_parallel_iterations(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant([[1., 2], [3, 4]])
+      g.watch(x)
+      w = constant_op.constant([[1., 2, 3, 4], [5, 6, 7, 8]])
+      y = math_ops.matmul(x, w)
+    self.assertAllClose(g.batch_jacobian(y, x, parallel_iterations=2),
+                        g.batch_jacobian(y, x, parallel_iterations=3))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 3fe79ef244329243a575b14ef3b424465dbc36b2..31a7efca82b016bc193ab9985ea7603897edc7ac 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -33,6 +33,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python import keras
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import backprop  # pylint: disable=unused-import
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
@@ -48,6 +49,7 @@ from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import gradient_descent
 
 CPU = "/device:CPU:0"
 GPU = "/device:GPU:0"
@@ -76,18 +78,18 @@ def c_tfe_py_fastpath_execute(a,
 
 class SubclassedKerasModel(keras.Model):
 
-  def __init__(self):
+  def __init__(self, initializer="ones"):
     super(SubclassedKerasModel, self).__init__()
     self.layer_a = keras.layers.Dense(
-        64, kernel_initializer="ones", bias_initializer="zeros")
+        64, kernel_initializer=initializer, bias_initializer="zeros")
     self.layer_b = keras.layers.Dense(
-        128, kernel_initializer="ones", bias_initializer="zeros")
+        128, kernel_initializer=initializer, bias_initializer="zeros")
     self.layer_c = keras.layers.Dense(
-        256, kernel_initializer="ones", bias_initializer="zeros")
+        256, kernel_initializer=initializer, bias_initializer="zeros")
     self.layer_d = keras.layers.Dense(
-        256, kernel_initializer="ones", bias_initializer="zeros")
+        256, kernel_initializer=initializer, bias_initializer="zeros")
     self.layer_e = keras.layers.Dense(
-        10, kernel_initializer="ones", bias_initializer="zeros")
+        10, kernel_initializer=initializer, bias_initializer="zeros")
 
   def call(self, x):
     x = self.layer_a(x)
@@ -97,34 +99,34 @@ class SubclassedKerasModel(keras.Model):
     return self.layer_e(x)
 
 
-def make_keras_model():
+def make_keras_model(initializer="ones"):
   model_input = keras.Input(shape=(10,))
   x = keras.layers.Dense(
-      64, kernel_initializer="ones", bias_initializer="zeros")(model_input)
+      64, kernel_initializer=initializer, bias_initializer="zeros")(model_input)
   x = keras.layers.Dense(
-      128, kernel_initializer="ones", bias_initializer="zeros")(x)
+      128, kernel_initializer=initializer, bias_initializer="zeros")(x)
   x = keras.layers.Dense(
-      256, kernel_initializer="ones", bias_initializer="zeros")(x)
+      256, kernel_initializer=initializer, bias_initializer="zeros")(x)
   x = keras.layers.Dense(
-      256, kernel_initializer="ones", bias_initializer="zeros")(x)
+      256, kernel_initializer=initializer, bias_initializer="zeros")(x)
   x = keras.layers.Dense(
-      10, kernel_initializer="ones", bias_initializer="zeros")(x)
+      10, kernel_initializer=initializer, bias_initializer="zeros")(x)
   return keras.Model(inputs=model_input, outputs=x)
 
 
-def make_sequential_keras_model():
+def make_sequential_keras_model(initializer="ones"):
   model = keras.models.Sequential()
   model.add(keras.layers.Dense(
-      64, kernel_initializer="ones", bias_initializer="zeros",
+      64, kernel_initializer=initializer, bias_initializer="zeros",
       input_shape=(10,)))
   model.add(keras.layers.Dense(
-      128, kernel_initializer="ones", bias_initializer="zeros"))
+      128, kernel_initializer=initializer, bias_initializer="zeros"))
   model.add(keras.layers.Dense(
-      256, kernel_initializer="ones", bias_initializer="zeros"))
+      256, kernel_initializer=initializer, bias_initializer="zeros"))
   model.add(keras.layers.Dense(
-      256, kernel_initializer="ones", bias_initializer="zeros"))
+      256, kernel_initializer=initializer, bias_initializer="zeros"))
   model.add(keras.layers.Dense(
-      10, kernel_initializer="ones", bias_initializer="zeros"))
+      10, kernel_initializer=initializer, bias_initializer="zeros"))
   return model
 
 
@@ -221,6 +223,18 @@ class MicroBenchmarks(test.Benchmark):
     self._benchmark_create_tensor(
         np.array([[3]], dtype=np.int32), dtypes.int32.as_datatype_enum, GPU)
 
+  def benchmark_index_tensor_with_literal(self):
+    func = lambda: constant_op.constant([3.0])[0]
+    self._run(func, 30000)
+
+  def benchmark_index_tensor_with_tensor(self):
+    func = lambda idx=constant_op.constant(0): constant_op.constant([3.0])[idx]
+    self._run(func, 30000)
+
+  def benchmark_index_tensor_with_np_array(self):
+    func = lambda idx=np.array(0): constant_op.constant([3.0])[idx]
+    self._run(func, 30000)
+
   def _benchmark_np_multiply(self, m, num_iters):
     a = m.cpu().numpy()
     func = lambda: a * a
@@ -353,7 +367,7 @@ class MicroBenchmarks(test.Benchmark):
                               num_iters,
                               execution_mode=None):
     f = function.defun(math_ops.matmul)
-    func = lambda: f(m, m, transpose_b)
+    func = lambda: f(m, m, transpose_b=transpose_b)
     self._run(func, num_iters, execution_mode=execution_mode)
 
   def _benchmark_defun_matmul_forward_backward(self,
@@ -366,7 +380,7 @@ class MicroBenchmarks(test.Benchmark):
     def func():
       with backprop.GradientTape() as gt:
         gt.watch(m)
-        y = f(m, m, transpose_b)
+        y = f(m, m, transpose_b=transpose_b)
       _ = gt.gradient(y, m)
 
     self._run(func, num_iters, execution_mode=execution_mode)
@@ -718,6 +732,131 @@ class MicroBenchmarks(test.Benchmark):
     assert np.equal(func(), make_keras_model()(data)).all()
     self._run(func, 30000)
 
+  def _benchmark_keras_model_fit(self, model, run_eagerly=False):
+    data = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
+    labels = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
+    dataset = dataset_ops.Dataset.from_tensors((data, labels)).repeat()
+    model.compile(
+        gradient_descent.GradientDescentOptimizer(learning_rate=0.001),
+        loss="mse", run_eagerly=run_eagerly)
+    func = lambda: model.fit(dataset, epochs=1, steps_per_epoch=1000, verbose=0)
+    # First call is more expensive (creates variables etc.), discount that.
+    model.fit(dataset, epochs=1, steps_per_epoch=1, verbose=0)
+
+    self._run(func, 1)
+
+  def _benchmark_keras_model_evaluate(self, model, run_eagerly=False):
+    data = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
+    labels = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
+    dataset = dataset_ops.Dataset.from_tensors((data, labels)).repeat()
+    model.compile(
+        gradient_descent.GradientDescentOptimizer(learning_rate=0.001),
+        loss="mse", run_eagerly=run_eagerly)
+    func = lambda: model.evaluate(dataset, steps=1000, verbose=0)
+    # First call is more expensive (creates variables etc.), discount that.
+    model.evaluate(dataset, steps=1, verbose=0)
+
+    self._run(func, 1)
+
+  def _benchmark_keras_model_predict(self, model, run_eagerly=False):
+    data = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
+    dataset = dataset_ops.Dataset.from_tensors(tuple([data])).repeat()
+    model.compile(
+        gradient_descent.GradientDescentOptimizer(learning_rate=0.001),
+        loss="mse", run_eagerly=run_eagerly)
+    func = lambda: model.predict(dataset, steps=1000, verbose=0)
+    # First call is more expensive (creates variables etc.), discount that.
+    model.predict(dataset, steps=1, verbose=0)
+
+    self._run(func, 1)
+
+  def benchmark_keras_model_subclassed_fit(self):
+    model = SubclassedKerasModel(initializer="glorot_uniform")
+    self._benchmark_keras_model_fit(model)
+
+  def benchmark_keras_model_subclassed_fit_graph_mode(self):
+    with context.graph_mode():
+      model = SubclassedKerasModel(initializer="glorot_uniform")
+      self._benchmark_keras_model_fit(model)
+
+  def benchmark_keras_model_subclassed_fit_run_model_eagerly(self):
+    model = SubclassedKerasModel(initializer="glorot_uniform")
+    self._benchmark_keras_model_fit(model, run_eagerly=True)
+
+  def benchmark_keras_model_functional_fit(self):
+    model = make_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_fit(model)
+
+  def benchmark_keras_model_functional_fit_graph_mode(self):
+    with context.graph_mode():
+      model = make_keras_model(initializer="glorot_uniform")
+      self._benchmark_keras_model_fit(model)
+
+  def benchmark_keras_model_functional_fit_run_model_eagerly(self):
+    model = make_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_fit(model, run_eagerly=True)
+
+  def benchmark_keras_model_sequential_fit(self):
+    model = make_sequential_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_fit(model)
+
+  def benchmark_keras_model_sequential_fit_graph_mode(self):
+    with context.graph_mode():
+      model = make_sequential_keras_model(initializer="glorot_uniform")
+      self._benchmark_keras_model_fit(model)
+
+  def benchmark_keras_model_sequential_fit_run_model_eagerly(self):
+    model = make_sequential_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_fit(model, run_eagerly=True)
+
+  def benchmark_keras_model_subclassed_evaluate(self):
+    model = SubclassedKerasModel(initializer="glorot_uniform")
+    self._benchmark_keras_model_evaluate(model)
+
+  def benchmark_keras_model_subclassed_evaluate_run_model_eagerly(self):
+    model = SubclassedKerasModel(initializer="glorot_uniform")
+    self._benchmark_keras_model_evaluate(model, run_eagerly=True)
+
+  def benchmark_keras_model_functional_evaluate(self):
+    model = make_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_evaluate(model)
+
+  def benchmark_keras_model_functional_evaluate_run_model_eagerly(self):
+    model = make_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_evaluate(model, run_eagerly=True)
+
+  def benchmark_keras_model_sequential_evaluate(self):
+    model = make_sequential_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_evaluate(model)
+
+  def benchmark_keras_model_sequential_evaluate_run_model_eagerly(self):
+    model = make_sequential_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_evaluate(model, run_eagerly=True)
+
+  def benchmark_keras_model_subclassed_predict(self):
+    model = SubclassedKerasModel(initializer="glorot_uniform")
+    self._benchmark_keras_model_predict(model)
+
+  def benchmark_keras_model_subclassed_predict_run_model_eagerly(self):
+    model = SubclassedKerasModel(initializer="glorot_uniform")
+    self._benchmark_keras_model_predict(model, run_eagerly=True)
+
+  def benchmark_keras_model_functional_predict(self):
+    model = make_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_predict(model)
+
+  def benchmark_keras_model_functional_predict_run_model_eagerly(self):
+    model = make_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_predict(model, run_eagerly=True)
+
+  def benchmark_keras_model_sequential_predict(self):
+    model = make_sequential_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_predict(model)
+
+  def benchmark_keras_model_sequential_predict_run_model_eagerly(self):
+    model = make_sequential_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_predict(model, run_eagerly=True)
+
   def benchmarkScan(self):
     elems = math_ops.range(1600)
 
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 778ff85342ddd4c0309e1a0bf92868241b526d8f..cbbe5cf49e20afc63e7710e39dc37ecbc4ac5082 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Experimental API for TensorFlow's "Eager" mode of execution."""
+"""State management for eager execution."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,6 +26,7 @@ import threading
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python import tf2
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.util import compat
@@ -36,8 +37,7 @@ from tensorflow.python.util.tf_export import tf_export
 GRAPH_MODE = 0
 EAGER_MODE = 1
 
-# Default execution mode.
-default_execution_mode = GRAPH_MODE
+default_execution_mode = EAGER_MODE if tf2.enabled() else GRAPH_MODE
 
 # Cache from (old_device_name, partial_new_device_name) -> (new_device_name,
 # new_device_spec).
@@ -80,11 +80,60 @@ class _EagerTensorCache(object):
     self._data = {}
 
 
+class FunctionCallOptions(object):
+  """Options applied at call sites of eager functions.
+  Eager functions are functions decorated with tf.contrib.eager.defun.
+  """
+
+  def __init__(self, executor_type=None, config_proto=None):
+    """Constructor.
+
+    Args:
+      executor_type: (optional) name of the executor to be used to execute the
+        eager function. If None or an empty string, the default Tensorflow
+        executor will be used.
+      config_proto: (optional) a `config_pb2.ConfigProto` proto or
+        a serialized string of that proto.
+        The config used by Grappler when optimizing the function graph.
+        Each concrete function is optimized the first time is called. Changing
+        config_proto after the first call has no effect.
+        If config_proto is None, an empty RewriterConfig will be used.
+    """
+    self.config_proto_serialized = config_proto
+    self.executor_type = executor_type
+
+  @property
+  def executor_type(self):
+    return self._executor_type
+
+  @executor_type.setter
+  def executor_type(self, executor_type):
+    self._executor_type = executor_type
+
+  @property
+  def config_proto_serialized(self):
+    return self._config_proto_serialized
+
+  @config_proto_serialized.setter
+  def config_proto_serialized(self, config):
+    if isinstance(config, config_pb2.ConfigProto):
+      self._config_proto_serialized = config.SerializeToString()
+    elif isinstance(config, str):
+      self._config_proto_serialized = config
+    elif config is None:
+      self._config_proto_serialized = (
+          config_pb2.ConfigProto().SerializeToString())
+    else:
+      raise ValueError("the rewriter config must be either a "
+                       "config_pb2.ConfigProto, or a serialized string of that "
+                       "proto or None. got: {}".format(type(config)))
+
+
 # TODO(agarwal): better name ?
 class _EagerContext(threading.local):
   """Thread local eager context."""
 
-  def __init__(self):
+  def __init__(self, config=None):
     super(_EagerContext, self).__init__()
     self.device_spec = pydev.DeviceSpec.from_string("")
     self.device_name = self.device_spec.to_string()
@@ -98,6 +147,15 @@ class _EagerContext(threading.local):
     self.zeros_cache = _EagerTensorCache()
     self.execution_mode = None
 
+    # Default rewriter config corresponds to turning all default grappler
+    # optimizations on.
+    base_config = config_pb2.ConfigProto()
+
+    if config is not None:
+      base_config.MergeFrom(config)
+
+    self.function_call_options = FunctionCallOptions(config_proto=base_config)
+
 
 ContextSwitch = collections.namedtuple(
     "ContextSwitch", ["is_building_function", "enter_context_fn"])
@@ -191,7 +249,7 @@ class Context(object):
     Raises:
      ValueError: If execution_mode is not valid.
     """
-    self._eager_context = _EagerContext()
+    self._eager_context = _EagerContext(config)
     self._context_switches = _ContextSwitchStack(self.executing_eagerly())
     self._context_handle = None
     self._context_devices = None
@@ -440,8 +498,9 @@ class Context(object):
         if old_device_name:
           new_device_spec = copy.copy(old_device_spec)
         else:
+          self._initialize_handle_and_devices()
           new_device_spec = pydev.DeviceSpec.from_string(
-              "/job:localhost/replica:0/task:0/device:CPU:0")
+              self._context_devices[0])
         new_device_spec.merge_from(device_spec)
       else:
         new_device_spec = pydev.DeviceSpec.from_string("")
@@ -486,6 +545,35 @@ class Context(object):
     finally:
       self.set_execution_mode(old_mode)
 
+  def get_function_call_options(self):
+    """Returns function call options for current thread.
+
+    Note that the returned object is still referenced by the eager context.
+
+    Returns: the FunctionCallOptions for current thread.
+    """
+    return self._eager_context.function_call_options
+
+  @tf_contextlib.contextmanager
+  def function_call_options(self, set_options_func):
+    """Context manager for setting function call options of current thread.
+
+    Args:
+      set_options_func: A callable that takes one argument of type
+        FunctionCallOptions. It should set the properties of that
+        FunctionCallOptions.
+
+    Yields:
+      Nothing.
+    """
+    current_options = self.get_function_call_options()
+    old_options = copy.copy(current_options)
+    try:
+      set_options_func(current_options)
+      yield
+    finally:
+      self._eager_context.function_call_options = old_options
+
   def async_wait(self):
     """Waits for ops dispatched in ASYNC mode to finish."""
     pywrap_tensorflow.TFE_ContextAsyncWait(self._handle)
@@ -738,6 +826,25 @@ def execution_mode(mode):
   return context().execution_mode(mode)
 
 
+@tf_export("experimental.function_executor_type")
+def function_executor_type(executor_type):
+  """Context manager for setting the executor of eagar defined functions.
+
+  Eager defined functions are functions decorated by tf.contrib.eager.defun.
+
+  Args:
+    executor_type: a string for the name of the executor to be used
+    to execute functions defined by tf.contrib.eager.defun.
+
+  Returns:
+    Context manager for setting the executor of eager defined functions.
+  """
+  def _set_options_func(options):
+    options.executor_type = executor_type
+
+  return context().function_call_options(_set_options_func)
+
+
 def async_wait():
   """Waits for ops dispatched in ASYNC mode to finish."""
   return context().async_wait()
@@ -783,10 +890,34 @@ def export_run_metadata():
   return context().export_run_metadata()
 
 
+def function_config_proto(config_proto):
+  """Context manager for setting the grappler rewrite config.
+
+  This config is used by Grappler when optimizing the function graph.
+
+  Args:
+    config_proto: a `config_pb2.ConfigProto` proto or
+      a serialized string of that proto or None. If None, the default instance
+      of `config_pb2.ConfigProto` will be used.
+
+  Returns:
+    A context manager.
+  """
+  def _set_options_func(options):
+    options.config_proto_serialized = config_proto
+
+  return context().function_call_options(_set_options_func)
+
+
 def set_server_def(server_def):
   context().set_server_def(server_def)
 
 
+def add_function(fdef):
+  """Add a function definition to the context."""
+  context().add_function(fdef)
+
+
 # Not every user creates a Context via context.context()
 # (for example, enable_eager_execution in python/framework/ops.py),
 # but they do all import this file.  Note that IS_IN_GRAPH_MODE and
diff --git a/tensorflow/python/eager/core.py b/tensorflow/python/eager/core.py
index 8fb69300209d74a164c38654d737432cdfb7884a..e168b4bd5ffedc0b7a244a9c190f2c50726105e4 100644
--- a/tensorflow/python/eager/core.py
+++ b/tensorflow/python/eager/core.py
@@ -60,4 +60,15 @@ class _FallbackException(Exception):
   pass
 
 
+class _SymbolicException(Exception):
+  """Exception class to handle use of symbolic tensors when executing eagerly.
+
+  `keras.Input()` creates symbolic tensors (in a FuncGraph managed by the
+  Keras backend) while in eager execution. This exception is used to
+  identify this case (raised in `convert_to_tensor` cause generated functions
+  for ops to construct graphs instead of executing the kernel).
+  """
+  pass
+
+
 pywrap_tensorflow.TFE_Py_RegisterFallbackExceptionClass(_FallbackException)
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index fb5442b6464bdf36d0d3278e90d227ed316bec76..e601aa376fa2ef8e0e240e4da03bfcd9ea227bd9 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -631,6 +631,34 @@ class TFETest(test_util.TensorFlowTestCase):
     for t in tensors:
       self.assertIsInstance(t, ops.EagerTensor)
 
+  def testSmallIntegerOpsForcedToCPU(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+
+    a = constant_op.constant((1, 2, 3, 4, 5), dtype=dtypes.int64)
+    b = constant_op.constant((2, 3, 4, 5, 6), dtype=dtypes.int64)
+    with context.device('gpu:0'):
+      c = a + b
+
+    # Op forced to CPU since all constants are integers and small.
+    self.assertEqual(c.device, '/job:localhost/replica:0/task:0/device:CPU:0')
+
+    a = array_ops.zeros((8, 10), dtype=dtypes.int64)
+    b = array_ops.ones((8, 10), dtype=dtypes.int64)
+
+    with context.device('gpu:0'):
+      c = a + b
+
+    # Op not forced to CPU since the tensors are larger than 64 elements.
+    self.assertEqual(c.device, '/job:localhost/replica:0/task:0/device:GPU:0')
+
+    a = constant_op.constant((1, 2, 3, 4, 5), dtype=dtypes.float32)
+    b = constant_op.constant((2, 3, 4, 5, 6), dtype=dtypes.float32)
+    with context.device('gpu:0'):
+      c = a + b
+
+    # Op not forced to CPU since the constants are not integers.
+    self.assertEqual(c.device, '/job:localhost/replica:0/task:0/device:GPU:0')
 
 class SendRecvTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index 8dcacd5c99a93e6d257bfbbc27620ec901e35c4f..6bacd7a962fdefb8caf11189b0681694d23b97f0 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -13,19 +13,26 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=unidiomatic-typecheck
-"""Prototype decorator for defining graph-mode functions with eager semantics."""
+"""Prototype decorator for defining graph functions with eager semantics."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import weakref
+
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
+from tensorflow.python.eager import function as function_lib
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util.tf_export import tf_export
 
 
 class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
@@ -40,11 +47,12 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
 
   def __init__(self,  # pylint: disable=super-init-not-called
                initial_value=None,
-               trainable=True,
+               trainable=None,
                caching_device=None,
                name=None,
                dtype=None,
                constraint=None,
+               add_initializers_to=None,
                **unused_kwargs):
     """Creates a variable.
 
@@ -75,6 +83,9 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
+      add_initializers_to: if not None and not in legacy graph mode, the
+        initializer tensor will be added to this map instead of adding the
+        assignment to the function.
 
     Raises:
       ValueError: If the initial value is not specified, or does not have a
@@ -82,14 +93,15 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
       RuntimeError: If called outside of a function definition.
     """
     if context.executing_eagerly():
-      raise RuntimeError(
-          "UnliftedInitializerVariable should not be created "
-          "outside of functions.")
+      # If we've been init_scope()d out of the function definition nothing to do
+      # here; we can't really do the capturing or conditional logic.
+      resource_variable_ops.ResourceVariable.__init__(
+          self, initial_value=initial_value, trainable=trainable,
+          caching_device=caching_device, name=name, dtype=dtype,
+          constraint=constraint)
+      return
     with ops.init_scope():
-      if not context.executing_eagerly():
-        raise RuntimeError(
-            "UnliftedInitializerVariable does not support legacy graph mode.")
-    self._in_graph_mode = False
+      self._in_graph_mode = not context.executing_eagerly()
     if initial_value is None:
       raise ValueError("initial_value must be specified.")
     init_from_fn = callable(initial_value)
@@ -102,6 +114,8 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
+    if trainable is None:
+      trainable = True
     self._trainable = trainable
     self._save_slice_info = None
     self._initial_value = None
@@ -116,12 +130,8 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
                         if init_from_fn else [initial_value]) as name:
       # pylint: disable=protected-access
       with ops.init_scope():
-        assert context.executing_eagerly()
         shared_name = ops._name_from_scope_name(name)
         shared_name = "%s_%d" % (shared_name, ops.uid())
-      # Use attr_scope and device(None) to simulate the behavior of
-      # colocate_with when the variable we want to colocate with doesn't
-      # yet exist.
       with ops.name_scope("Initializer"), ops.device(None):
         initial_value = ops.convert_to_tensor(
             initial_value() if init_from_fn else initial_value,
@@ -132,104 +142,590 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
             dtype=initial_value.dtype.base_dtype,
             shared_name=shared_name,
             name=name,
-            graph_mode=False)
+            graph_mode=self._in_graph_mode)
       self._shape = initial_value.shape
       self._unique_id = shared_name
       self._handle_name = shared_name + ":0"
       self._dtype = initial_value.dtype.base_dtype
       self._constraint = constraint
       assert initial_value is not None
-      def assign_fn():
-        with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
-          resource_variable_ops.assign_variable_op(
-              self._handle,
-              initial_value,
-              name=n)
-        # Returning values to keep tf.cond happy.
-        return ops.convert_to_tensor(1)
-      def not_assign_fn():
-        return ops.convert_to_tensor(0)
-      # Note: this cond is always guaranteed to run because we're inside a defun
-      # which will insert automatic control dependencies.
-      control_flow_ops.cond(
-          resource_variable_ops.var_is_initialized_op(self._handle),
-          not_assign_fn, assign_fn)
+      if self._in_graph_mode:
+        with ops.init_scope():
+          outer_graph = ops.get_default_graph()
+        lifted_initializer = lift_to_graph.lift_to_graph(
+            initial_value, outer_graph)[initial_value]
+        with ops.init_scope():
+          self._initial_value = lifted_initializer
+          with ops.name_scope("IsInitialized"):
+            self._is_initialized_op = (
+                resource_variable_ops.var_is_initialized_op(self._handle))
+          if initial_value is not None:
+            with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
+              self._initializer_op = resource_variable_ops.assign_variable_op(
+                  self._handle, lifted_initializer, name=n)
+          with ops.name_scope("Read"), ops.colocate_with(self._handle):
+            # Manually assign reads to the handle's device to avoid log
+            # messages.
+            with ops.device(self._handle.device):
+              value = self._read_variable_op()
+            self._graph_element = value
+          ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, self)
+      else:
+        if add_initializers_to is not None:
+          add_initializers_to[self] = initial_value
+        else:
+          def assign_fn():
+            with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
+              resource_variable_ops.assign_variable_op(
+                  self._handle,
+                  initial_value,
+                  name=n)
+              # Returning values to keep tf.cond happy.
+            return ops.convert_to_tensor(1)
+          def not_assign_fn():
+            return ops.convert_to_tensor(0)
+          # Note: this cond is always guaranteed to run because we're inside a
+          # defun which will insert automatic control dependencies.
+          control_flow_ops.cond(
+              resource_variable_ops.var_is_initialized_op(self._handle),
+              not_assign_fn, assign_fn)
 
     # After the handle has been created, set up a way to clean it up when
     # executing eagerly. We'll hold the only reference to the deleter, so that
     # when this object is garbage collected the deleter will be too. This
     # means ResourceVariables can be part of reference cycles without those
     # cycles being uncollectable.
-    self._handle_deleter = resource_variable_ops.EagerResourceDeleter(
-        handle=self._handle, handle_device=self._handle.device)
+    if not self._in_graph_mode:
+      self._handle_deleter = resource_variable_ops.EagerResourceDeleter(
+          handle=self._handle, handle_device=self._handle.device)
     self._cached_shape_as_list = None
 
 
-def _defun_with_scope(scope, fn):
+class PolymorphicFunction(object):
+  """Wrapper class for the graph functions defined for a Python function.
 
-  def wrapped_fn(*args, **kwds):
-    with variable_scope.variable_creator_scope(scope):
-      return fn(*args, **kwds)
+  See the documentation for `tf.function` for more information on the semantics
+  of defined functions.
 
-  return function.defun(wrapped_fn)
+  PolymorphicFunction is thread-compatible.
+  """
 
+  def __init__(self,
+               python_function,
+               name,
+               input_signature=None,
+               autograph=True,
+               experimental_autograph_options=None):
+    """Initializes a polymorphic function.
 
-def def_function(fn):
-  """Defines a function as per the "functions, not sessions" document."""
+    Args:
+      python_function: the function to be wrapped.
+      name: the name given to it.
+      input_signature: a possibly nested sequence of `TensorSpec` objects
+        specifying the input signature of this function. If `None`, a separate
+        function is instantiated for each inferred input signature.
+      autograph: whether `python_function` should be converted to graph mode.
+        See https://www.tensorflow.org/guide/autograph for more information.
+      experimental_autograph_options: optional tuple of
+        tensorflow.autograph.Feature values. Allows enabling additional
+        conversion options when autograph is set to True.
 
-  # Wrapping the values in lists to bypass python's lack of way to mutate
-  # symbols from an outer scope.
-  first_call = [True]
-  function_to_call = []
+    Raises:
+      ValueError: if `input_signature` is not None and the `python_function`'s
+        argspec has keyword arguments.
+    """
+    self._python_function = python_function
+    self._input_signature = input_signature
+    self._autograph = autograph
+    self._experimental_autograph_options = experimental_autograph_options
+    if self._experimental_autograph_options is not None:
+      raise NotImplementedError()
+    self._created_variables = None
+    self._stateful_fn = None
+    self._descriptor_cache = weakref.WeakKeyDictionary()
+    self._name = name
 
-  # TODO(apassos) represent this as an object and not as a closure.
-  def decorated_fn(*args, **kwds):
-    """Graph function for fn."""
-    if not first_call[0]:
-      return function_to_call[0](*args, **kwds)
+  def _defun_with_scope(self, scope):
+    """Creates a defun wrapped inside a variable creator scope."""
 
-    first_call[0] = False
-    created_variables = []
+    def wrapped_fn(*args, **kwds):
+      with variable_scope.variable_creator_scope(scope):
+        # __wrapped__ allows AutoGraph to swap in a converted function.
+        return wrapped_fn.__wrapped__(*args, **kwds)
 
-    def variable_creator_scope(unused_next_creator, **kwds):
+    # TODO(mdan): Pipe self._experimental_autograph_options through.
+    return function_lib.defun(
+        tf_decorator.make_decorator(self._python_function, wrapped_fn),
+        input_signature=self._input_signature,
+        autograph=self._autograph)
+
+  def _initialize(self, args, kwds, add_initializers_to=None):
+    """Initializes, on the first call."""
+
+    self._created_variables = []
+
+    def variable_capturing_scope(unused_next_creator, **kwds):
       """Creates UnliftedInitializerVariables and saves references to them."""
-      v = UnliftedInitializerVariable(**kwds)
-      created_variables.append(v)
+      v = UnliftedInitializerVariable(
+          add_initializers_to=add_initializers_to, **kwds)
+      self._created_variables.append(weakref.ref(v))
       return v
 
-    first_graph_function = _defun_with_scope(variable_creator_scope, fn)
+    self._stateful_fn = self._defun_with_scope(variable_capturing_scope)
+    self._stateful_fn._name = self._name  # pylint: disable=protected-access
 
     # Force the definition of the function for these arguments
-    first_concrete = first_graph_function.get_concrete_function(*args, **kwds)
+    self._concrete_stateful_fn = (
+        self._stateful_fn._get_concrete_function_internal(*args, **kwds))  # pylint: disable=protected-access
 
     def invalid_creator_scope(*unused_args, **unused_kwds):
       """Disables variable creation."""
       raise ValueError(
-          "def_function-decorated function tried to create "
-          "variables on second call.")
+          "tf.function-decorated function tried to create "
+          "variables on non-first call.")
+
+    self._stateless_fn = self._defun_with_scope(invalid_creator_scope)
+    self._stateless_fn._name = self._name  # pylint: disable=protected-access
+    if self._input_signature is None or args or kwds:
+      return self._stateful_fn._canonicalize_function_inputs(*args, **kwds)  # pylint: disable=protected-access
+    # If an input signature is defined, we may need to fetch a concrete function
+    # without any inputs specified. In this case args and kwds should be ignored
+    # but running _canonicalize_function_inputs would raise an exception.
+    return (), {}
+
+  def __call__(self, *args, **kwds):
+    """Calls the graph function."""
+    if self._created_variables:
+      # In this case we have created variables on the first call, so we run the
+      # defunned version which is guaranteed to never create variables.
+      return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
+    elif self._stateful_fn is not None:
+      # In this case we have not created variables on the first call. So we can
+      # run the first trace but we should fail if variables are created.
+      results = self._stateful_fn(*args, **kwds)
+      if self._created_variables:
+        raise ValueError("Creating variables on a non-first call to a function"
+                         " decorated with tf.function.")
+      return results
 
-    second_graph_function = _defun_with_scope(invalid_creator_scope, fn)
+    canon_args, canon_kwds = self._initialize(args, kwds)
 
-    function_to_call.append(second_graph_function)
-    if not created_variables:
-      # Note: this retracing might be unnecessary, but running the function
-      # forever in the scope which disallows variable creation is safer than not
-      # doing so.
-      return second_graph_function(*args, **kwds)
+    if not self._created_variables:
+      # If we did not create any variables the trace we have is good enough.
+      return self._concrete_stateful_fn._filtered_call(canon_args, canon_kwds)  # pylint: disable=protected-access
 
     def fn_with_cond(*inner_args, **inner_kwds):
       """Conditionally runs initialization if it's needed."""
       condition = True
-      for variable in created_variables:
-        condition = condition and resource_variable_ops.var_is_initialized_op(
-            variable.handle)
-      # We want to call second_graph_function if possible because it avoids
-      # recomputing potentially expensive initializers.
+      for wr in self._created_variables:
+        variable = wr()
+        if variable is None:
+          raise ValueError(
+              "Variable created in a tf.function garbage-collected. Code needs"
+              " to keep python references to variables created in a"
+              " tf.function.")
+        condition = math_ops.logical_and(
+            condition, resource_variable_ops.var_is_initialized_op(
+                variable.handle))
+      # We want to call stateless_fn if possible because it avoids recomputing
+      # potentially expensive initializers.
       return control_flow_ops.cond(
           condition,
-          lambda: second_graph_function(*inner_args, **inner_kwds),
-          lambda: first_concrete(*inner_args, **inner_kwds))
+          lambda: self._stateless_fn(*inner_args, **inner_kwds),
+          functools.partial(self._concrete_stateful_fn._filtered_call,  # pylint: disable=protected-access
+                            inner_args, inner_kwds))
+
+    return function_lib.defun(fn_with_cond)(*canon_args, **canon_kwds)
+
+  @property
+  def python_function(self):
+    """The python function wrapped in this tf.function."""
+    return self._python_function
+
+  def get_initialization_function(self, *args, **kwargs):
+    """Returns a `Function` object which initializes this function's variables.
+
+    Requires that this function hasn't been accessed yet through either calling
+    it or calling get_concrete_function. Fails if we cannot build an initializer
+    function which does not depend on the concrete values of the inputs to this
+    function.
+
+    Args:
+      *args: arguments to the underlying python callable.
+      **kwargs: keyword arguments to the python callable.
+
+    Returns:
+      A `Function` object which initializes the variables of this function.
+
+    Raises:
+      RuntimeError: if called after the variables have been initialized.
+    """
+    if self._stateful_fn is not None:
+      raise RuntimeError(
+          "get_initialization_function cannot be called after the function "
+          "has been used")
+    # Here we trace the function, collect the initializers, and attempt to
+    # extract them and run them eagerly. Fail only if we cannot do so.
+    initializer_map = {}
+    self._initialize(args, kwargs, add_initializers_to=initializer_map)
+
+    # Note: using defun here avoids an infinite recursion.
+    @function_lib.defun
+    def initialize_variables():
+      for v, init in initializer_map.items():
+        v.assign(lift_to_graph.lift_to_graph(
+            init, ops.get_default_graph())[init])
+
+    return initialize_variables.get_concrete_function()
+
+  def get_concrete_function(self, *args, **kwargs):
+    """Returns a `Function` object specialized to inputs and execution context.
+
+    If this `PolymorphicFunction` was created with an `input_signature`, `args`
+    and `kwargs` may be omitted. With an input signature there is only one
+    concrete function associated with this `PolymorphicFunction`.
+
+    If there is no fixed `input_signature` associated with this
+    `PolymorphicFunction`, positional and keyword arguments to
+    `get_concrete_function` follow the same rules as input signature
+    specification, with `tf.TensorSpec` objects describing `tf.Tensor`s which
+    will be passed to the concrete function.
+
+    Each `tf.Tensor` argument to the concrete function must have a unique name,
+    either because it is the only one associated with a named argument of the
+    Python function or because an explicit `name=` was passed to its
+    `tf.TensorSpec` object. These names become the argument names for the
+    concrete function.
+
+    Arguments to the concrete function may always be specified as keyword
+    arguments, naming the Tensor input. Positional arguments may be used instead
+    when each preceding argument to the Python function is a Tensor.
+
+    ```python
+    @tf.function
+    def f(x):
+      return x
+
+    f_concrete = f.get_concrete_function(tf.TensorSpec([], tf.float64))
+    f_concrete(tf.constant(1.))
+    f_concrete(x=tf.constant(1.))
+    ```
+
+    Nested structures containing Tensors may be specified when retrieving
+    concrete functions. Structures with multiple Tensors are expanded into
+    multiple arguments of the concrete function. Since multiple concrete
+    function arguments are associated with one argument to the original
+    function, these Tensors must be named explicitly. Tensors in nested
+    structures may not be passed using positional arguments when calling the
+    concrete function.
+
+    ```python
+    f_concrete2 = f.get_concrete_function(
+        (tf.TensorSpec(None, tf.float64, name="first"),
+         tf.TensorSpec([], tf.float32, name="second")))
+    # Keyword arguments are required when identifying Tensors in nested
+    # structures.
+    f_concrete2(first=tf.constant([1.]), second=tf.constant(0.))
+    ```
+
+    Functions with fixed input signatures have only one concrete function
+    associated with them, which can be retrieved without specifying any
+    arguments. As before Tensors must have unique names, either inferred from
+    the argument names in the original Python function or specified
+    explicitly.
+
+    ```python
+    @tf.function(input_signature=(tf.TensorSpec(None, tf.float32)))
+    def f_sig(y):
+      return y
+
+    f_sig_concrete = f.get_concrete_function()
+    f_sig_concrete(tf.constant(1.))
+    f_sig_concrete(y=tf.constant(1.))
+    ```
+
+    Args:
+      *args: inputs to specialize on.
+      **kwargs: inputs to specialize on.
+
+    Returns:
+      A TensorFlow function which takes exactly one `tf.Tensor` per argument.
+
+    Raises:
+      ValueError: if this object has not yet been called on concrete values.
+    """
+    assert context.executing_eagerly()
+    if self._stateful_fn is None:
+      self.get_initialization_function(*args, **kwargs)()
+
+    if self._created_variables:
+      # In this case we have created variables on the first call, so we run the
+      # defunned version which is guaranteed to never create variables.
+      return self._stateless_fn.get_concrete_function(*args, **kwargs)
+    elif self._stateful_fn is not None:
+      # In this case we have not created variables on the first call. So we can
+      # run the first trace but we should fail if variables are created.
+      concrete = self._stateful_fn.get_concrete_function(*args, **kwargs)
+      if self._created_variables:
+        raise ValueError("Creating variables on a non-first call to a function"
+                         " decorated with tf.function.")
+      return concrete
+
+  def __get__(self, instance, owner):
+    """Makes it possible to defun instance methods."""
+    del owner
+    # `instance` here is the instance that this `PolymorphicFunction` was
+    # accessed through; e.g., for
+    #
+    #   class Foo(object):
+    #
+    #     @function.defun
+    #     def bar(self):
+    #       ...
+    #
+    #   foo = Foo()
+    #   foo.bar()  # `foo.bar` is a `PolymorphicFunction` instance
+    #
+    # then `instance` will be `foo` (and `owner` will be `Foo`).  We create a
+    # new instance of PolymorphicFunction here to allow different instances each
+    # to create variables once, thereby allowing methods to be decorated with
+    # tf.function. Keeps a cache to avoid retracing the function every time the
+    # descriptor is accessed.
+    if instance not in self._descriptor_cache:
+      if instance is None:
+        return self
+      self._descriptor_cache[instance] = (
+          function_lib.class_method_to_instance_method(self, instance))
+    return self._descriptor_cache[instance]
+
+
+# In TensorFlow 1.x, exported as tf.contrib.eager.function
+@tf_export("function", v1=[])
+def function(func=None,
+             input_signature=None,
+             autograph=True,
+             experimental_autograph_options=None):
+  """Creates a callable TensorFlow graph from a Python function.
+
+  `function` constructs a callable that executes a TensorFlow graph
+  (`tf.Graph`) created by tracing the TensorFlow operations in `func`.
+  This allows the TensorFlow runtime to apply optimizations and exploit
+  parallelism in the computation defined by `func`.
+
+  _Example Usage_
+
+  ```python
+  def f(x, y):
+    return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)
+
+  g = tf.function(f)
+
+  x = tf.constant([[2.0, 3.0]])
+  y = tf.constant([[3.0, -2.0]])
+
+  # `f` and `g` will return the same value, but `g` will be executed as a
+  # TensorFlow graph.
+  assert f(x, y).numpy() == g(x, y).numpy()
+
+  # Tensors and tf.Variables used by the Python function are captured in the
+  # traced graph.
+  @tf.function
+  def h():
+    return f(x, y)
+
+  assert (h().numpy() == f(x, y).numpy()).all()
+  ```
+
+  _Referencing `tf.Variable`s_
+
+  The Python function `func` may reference stateful objects (such as
+  `tf.Variable`).
+  These are captured as implicit inputs to the callable returned by `function`.
+  For example:
+
+  ```python
+  c = tf.Variable(0)
+
+  @tf.function
+  def f(x):
+    c.assign_add(1)
+    return x + tf.to_float(c)
+
+  assert int(c) == 0
+  assert f(1.0) == 2.0
+  assert int(c) == 1
+  assert f(1.0) == 3.0
+  assert int(c) == 2
+  ```
+
+  `function` can be applied to methods of an object. For example:
+
+  ```python
+  class Dense(object):
+    def __init__(self):
+      self.W = tf.Variable(tf.glorot_uniform_initializer()((10, 10)))
+      self.b = tf.Variable(tf.zeros(10))
+
+    @tf.function
+    def compute(self, x):
+      return tf.matmul(x, self.W) + self.b
+
+  d1 = Dense()
+  d2 = Dense()
+  x = tf.random_uniform((10, 10))
+  # d1 and d2 are using distinct variables
+  assert not (d1.compute(x).numpy() == d2.compute(x).numpy()).all()
+  ```
+
+  _Usage with `tf.keras`_
+
+  The `call` methods of a `tf.keras.Model` subclass can be decorated with
+  `function` in order to apply graph execution optimizations on it.
+  For example:
+
+  ```python
+  class MyModel(tf.keras.Model):
+    def __init__(self, keep_probability=0.2):
+      super(MyModel, self).__init__()
+      self.dense1 = tf.keras.layers.Dense(4)
+      self.dense2 = tf.keras.layers.Dense(5)
+      self.keep_probability = keep_probability
+
+    @tf.function
+    def call(self, inputs, training=True):
+      y = self.dense2(self.dense1(inputs))
+      if training:
+        return tf.nn.dropout(y, self.keep_probability)
+      else:
+        return y
+
+  model = MyModel()
+  model(x, training=True)  # executes a graph, with dropout
+  model(x, training=False) # executes a graph, without dropout
+  ```
+
+  _Input Signatures_
+  `function` instantiates a separate graph for every unique set of input
+  shapes and datatypes. For example, the following code snippet will result
+  in three distinct graphs being traced, as each input has a different
+  shape.
+
+  ```python
+  @tf.function
+  def f(x): return tf.add(x, 1.)
+
+  scalar = tf.constant(1.0)
+  vector = tf.constant([1.0, 1.0])
+  matrix = tf.constant([[3.0]])
+
+  f(scalar)
+  f(vector)
+  f(matrix)
+  ```
+
+  An "input signature" can be optionally provided to `function` to control
+  the graphs traced. The input signature specifies the shape and type of each
+  `Tensor` argument to the function using a `tf.TensorSpec` object. For example,
+  the following code snippet ensures that a single graph is created where the
+  input `Tensor` is required to be a floating point tensor with no restrictions
+  on shape.
+
+  ```python
+  @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])
+  def f(x): return tf.add(x, 1.)
+  ```
+
+  When an `input_signature` is specified, the callable will only accept `Tensor`
+  (or NumPy `ndarray`) objects as arguments.
+
+  _Tracing_
+  Note that `function` only traces TensorFlow operations, all the other
+  Python code that `func` executes will shape the _construction_ of the graph.
+  For example, consider the following:
+
+  ```python
+  import numpy as np
+
+  def add_noise():
+    return tf.eye(5) + np.random.randn(5, 5)
+
+  traced = tf.function(add_noise)
+  ```
+
+  `add_noise()` will return a different output every time it is invoked.
+  However, `traced` will return the same value every time it is called, since a
+  particular random value generated by the `np.random.randn` call will be
+  inserted in the traced TensorFlow graph as a constant. In this particular
+  example, replacing `np.random.randn(5, 5)` with `tf.random_normal((5, 5))`
+  will result in the same behavior for `add_noise()` and `traced()`.
+
+  _Python Side-Effects_
+  A corollary of the previous discussion on tracing is the following: If a
+  Python function `func` has Python side-effects, then executing `func` multiple
+  times
+  may not be semantically equivalent to executing `F = tf.function(func)`
+  multiple times; this difference is due to the fact that `function` only
+  captures the subgraph of TensorFlow operations that is constructed when `func`
+  is invoked to trace a graph.
+
+  Args:
+    func: function to be compiled. If `func` is None, returns a decorator that
+      can be invoked with a single argument - `func`. The end result is
+      equivalent to providing all the arguments up front. In other words,
+      `tf.function(input_signature=...)(func)` is equivalent to
+      `tf.function(func, input_signature=...)`. The former can be used to
+      decorate Python functions, for example:
+        @tf.function(input_signature=...)
+        def foo(...): ...
+    input_signature: A possibly nested sequence of `tf.TensorSpec` objects
+      specifying the shapes and dtypes of the Tensors that will be supplied to
+      this function. If `None`, a separate function is instantiated for each
+      inferred input signature.  If input_signature is specified, every input to
+      `func` must be a `Tensor`, and `func` cannot accept `**kwargs`.
+    autograph: Whether autograph should be applied on `func` before tracing a
+      graph. This allows for dynamic control flow (Python if's, loops etc.)
+      in the traced graph. See https://www.tensorflow.org/guide/autograph for
+        more information.
+    experimental_autograph_options: Experimental knobs (in the form of a tuple
+      of tensorflow.autograph.Feature values) to control behavior when
+      autograph=True.
+
+  Returns:
+     If `func` is not None, returns a callable that will execute the compiled
+     function (and return zero or more `tf.Tensor` objects).
+     If `func` is None, returns a decorator that, when invoked with a single
+     `func` argument, returns a callable equivalent to the case above.
+
+  Raises:
+    TypeError: If `input_signature` is neither `None` nor a sequence of
+      `TensorSpec` objects.
+  """
+  if input_signature is not None:
+    function_lib.validate_signature(input_signature)
+
+  def decorated(inner_function):
+    try:
+      name = inner_function.__name__
+    except AttributeError:
+      name = "function"
+    return tf_decorator.make_decorator(
+        inner_function,
+        PolymorphicFunction(
+            inner_function,
+            name,
+            input_signature=input_signature,
+            autograph=autograph,
+            experimental_autograph_options=experimental_autograph_options))
 
-    return function.defun(fn_with_cond)(*args, **kwds)
+  # This code path is for the `foo = tf.function(foo, ...)` use case
+  if func is not None:
+    return decorated(func)
 
-  return decorated_fn
+  # This code path is for the
+  #
+  # @tf.function(...)
+  # def foo(...):
+  #    ...
+  #
+  # use case, which is equivalent to `foo = tf.function(...)(foo)`
+  return decorated
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py
index 804436c4bb6f3c5ebb41f7b5ef4eb261e7012c7f..4100a10044c3c39763de8bb3eec645e278d94e19 100644
--- a/tensorflow/python/eager/def_function_test.py
+++ b/tensorflow/python/eager/def_function_test.py
@@ -17,19 +17,47 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+
+
+class _ModelWithOptimizer(training.Model):
+
+  def __init__(self):
+    super(_ModelWithOptimizer, self).__init__()
+    self.dense = core.Dense(1)
+    self.optimizer = adam.AdamOptimizer(0.01)
+
+  @def_function.function(
+      input_signature=(tensor_spec.TensorSpec([None, 2], dtypes.float32),
+                       tensor_spec.TensorSpec([None], dtypes.float32)))
+  def call(self, x, y):
+    with backprop.GradientTape() as tape:
+      loss = math_ops.reduce_mean((self.dense(x) - y) ** 2.)
+    trainable_variables = self.trainable_variables
+    gradients = tape.gradient(loss, trainable_variables)
+    self.optimizer.apply_gradients(zip(gradients, trainable_variables))
+    return {'loss': loss}
 
 
 class DefFunctionTest(test.TestCase):
 
   def testNoVariables(self):
 
-    @def_function.def_function
+    @def_function.function
     def fn(x):
       return 2 * x
 
@@ -37,7 +65,7 @@ class DefFunctionTest(test.TestCase):
 
   def testFailIfVariablesAreCreatedMoreThanOnce(self):
 
-    @def_function.def_function
+    @def_function.function
     def fn(x):
       return variables.Variable(1.0) + x
 
@@ -47,7 +75,7 @@ class DefFunctionTest(test.TestCase):
   def testFailIfVariablesAreCreatedMoreThanOnceNoWeakRef(self):
     state = []
 
-    @def_function.def_function
+    @def_function.function
     def fn(x):
       state.append(variables.Variable(1.0))
       return state[-1] + x
@@ -59,7 +87,7 @@ class DefFunctionTest(test.TestCase):
 
     state = []
 
-    @def_function.def_function
+    @def_function.function
     def fn(x):
       if not state:
         state.append(variables.Variable(2.0))
@@ -68,11 +96,40 @@ class DefFunctionTest(test.TestCase):
     self.assertAllEqual(fn(constant_op.constant(1.0)), 2.0)
     self.assertAllEqual(fn(constant_op.constant(3.0)), 6.0)
 
+  def testFunctionInitializer(self):
+
+    state = []
+
+    @def_function.function
+    def fn(x):
+      if not state:
+        state.append(variables.Variable(lambda: 2.0))
+      return state[0] * x
+
+    self.assertAllEqual(fn(constant_op.constant(1.0)), 2.0)
+
+  def testFunctionInitializationFunction(self):
+
+    state = []
+
+    @def_function.function
+    def fn(x):
+      if not state:
+        state.append(variables.Variable(2.0))
+      return state[0] * x
+
+    init_fn = fn.get_initialization_function(constant_op.constant(1.0))
+    self.assertEqual(len(state), 1)
+    self.assertFalse(
+        resource_variable_ops.var_is_initialized_op(state[0].handle))
+    init_fn()
+    self.assertEqual(state[0].numpy(), 2.0)
+
   def testVariableInitializerNotConstant(self):
 
     state = []
 
-    @def_function.def_function
+    @def_function.function
     def fn(x):
       if not state:
         state.append(variables.Variable(2.0 * x))
@@ -81,6 +138,107 @@ class DefFunctionTest(test.TestCase):
     self.assertAllEqual(fn(constant_op.constant(1.0)), 2.0)
     self.assertAllEqual(fn(constant_op.constant(3.0)), 6.0)
 
+  def testLegacyGraphModeVariables(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      state = []
+
+      @def_function.function
+      def fn(x):
+        if not state:
+          state.append(variables.Variable(2.0))
+        return state[0] * x
+
+      result = fn(3.0)
+
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(sess.run(state[0]), 2.0)
+      self.assertAllEqual(self.evaluate(result), 6.0)
+
+  def testLegacyGraphModeVariablesNonTrivialInitializer(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      state = []
+
+      @def_function.function
+      def fn(x):
+        if not state:
+          two = constant_op.constant(2.0)
+          four = two * two
+          two_again = math_ops.sqrt(four)
+          state.append(variables.Variable(two_again + four))
+        return state[0] * x
+
+      result = fn(3.0)
+
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(sess.run(state[0]), 6.0)
+      self.assertAllEqual(self.evaluate(result), 18.0)
+
+  def testLegacyGraphModeInputDependentInitializerFails(self):
+    with ops.Graph().as_default():
+      state = []
+
+      @def_function.function
+      def fn(x):
+        if not state:
+          state.append(variables.Variable(2.0 * x))
+        return state[0] * x
+
+      with self.assertRaises(ValueError):
+        fn(constant_op.constant(3.0))
+
+  def testMethod(self):
+
+    class MyModel(object):
+
+      def __init__(self):
+        self.var = None
+
+      @def_function.function
+      def apply(self, x):
+        if self.var is None:
+          self.var = variables.Variable(2.0)
+        return self.var * x
+
+    m0 = MyModel()
+    self.assertAllEqual(m0.apply(3.0), 6.0)
+    # Calling twice to exercise that we do not recreate variables.
+    m0.var.assign(3.0)
+    self.assertAllEqual(m0.apply(3.0), 9.0)
+
+    m1 = MyModel()
+    self.assertAllEqual(m1.apply(3.0), 6.0)
+
+  def test_functools_partial(self):
+    self.assertAllClose(
+        3.,
+        def_function.function(functools.partial(lambda x, y: x + y, 1.))(
+            constant_op.constant(2.)))
+
+  def test_unspecified_default_argument(self):
+    wrapped = def_function.function(
+        lambda x, y=2: x + y,
+        input_signature=[tensor_spec.TensorSpec((), dtypes.int32)])
+    self.assertEqual(3, wrapped(constant_op.constant(1)).numpy())
+
+  def test_optimizer(self):
+    x = constant_op.constant([[3., 4.]])
+    y = constant_op.constant([2.])
+    model = _ModelWithOptimizer()
+    model(x, y)
+
+  def test_concrete_function_from_signature(self):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    def compute(x):
+      return 2. * x
+
+    concrete = compute.get_concrete_function()
+    self.assertAllClose(1., concrete(constant_op.constant(0.5)))
+    concrete = compute.get_concrete_function(
+        tensor_spec.TensorSpec(None, dtypes.float32))
+    self.assertAllClose(4., concrete(constant_op.constant(2.)))
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index f9b8d2cb5db9aedcd834afcde00dac3afa4008bb..6f8c780170cc8e3bfe5aa23603c0448e70b5e49c 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -64,6 +64,16 @@ def quick_execute(op_name, num_outputs, inputs, attrs, ctx, name=None):
     else:
       message = e.message
     six.raise_from(core._status_to_exception(e.code, message), None)
+  except TypeError as e:
+    if any(ops._is_keras_symbolic_tensor(x) for x in inputs):
+      if any(isinstance(x, ops.EagerTensor) for x in inputs):
+        raise TypeError("You are attempting to mix computation of symbolic "
+                        "Tensors (computation rooted at tf.keras.Input()) "
+                        "and concrete values. This is not supported. "
+                        "If you need this support, file an issue on the "
+                        "TensorFlow GitHub repository.")
+      raise core._SymbolicException
+    raise e
   # pylint: enable=protected-access
   return tensors
 
@@ -188,7 +198,10 @@ def args_to_matching_eager(l, ctx, default_dtype=None):
     ret = []
     for t in l:
       ret.append(internal_convert_to_tensor(
-          t, dtype, preferred_dtype=default_dtype, ctx=ctx))
+          t, dtype,
+          preferred_dtype=default_dtype,
+          ctx=ctx,
+          accept_symbolic_tensors=False))
       if dtype is None:
         dtype = ret[-1].dtype
   else:
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 80ff4459d60a33d1a02f14acaafb8370a48fb6ca..28b6b84a82c6550cd0e1b893b5002d13b306233d 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
 import functools
 
 import numpy as np
@@ -28,8 +29,13 @@ from tensorflow.python.eager import core
 from tensorflow.python.eager import execute
 from tensorflow.python.platform import tf_logging as logging
 
-_DEFAULT_CALLBACK_ACTION = "raise"
-_VALID_CALLBACK_ACTIONS = (None, "ignore", "print", "raise", "warn")
+IGNORE = "ignore"
+PRINT = "print"
+RAISE = "raise"
+WARN = "warn"
+
+_DEFAULT_CALLBACK_ACTION = RAISE
+_VALID_CALLBACK_ACTIONS = (None, IGNORE, PRINT, RAISE, WARN)
 
 
 # TODO(cais): Consider moving this exception class to errors_impl.py.
@@ -335,3 +341,38 @@ def seterr(inf_or_nan=None):
           functools.partial(inf_nan_callback, action=inf_or_nan))
 
   return old_settings
+
+
+@contextlib.contextmanager
+def errstate(inf_or_nan=None):
+  """Context manager setting error state.
+
+  Example:
+  ```
+  c = tf.log(0.)  # -inf
+
+  with errstate(inf_or_nan="raise"):
+    tf.log(0.)  # <-- Raises InfOrNanError.
+  ```
+
+  Args:
+    inf_or_nan: Set action for infinity (`inf`) and NaN (`nan`) values.
+      Possible values: `{IGNORE, PRINT, RAISE, WARN}`.
+      `IGNORE`: take no action when `inf` values appear.
+      `PRINT`: print a warning to `stdout`.
+      `RAISE`: raise an `InfOrNanError`.
+      `WARN`: print a warning using `tf.logging.warn`.
+      A value of `None` leads to no change in the action of the condition.
+
+  Yields:
+    None.
+
+  Raises:
+    ValueError: If the value of any keyword arguments is invalid.
+  """
+  if not context.executing_eagerly():
+    yield
+  else:
+    old_settings = seterr(inf_or_nan=inf_or_nan)
+    yield
+    seterr(**old_settings)
diff --git a/tensorflow/python/eager/execution_callbacks_test.py b/tensorflow/python/eager/execution_callbacks_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5594ab5f12abffb1e2b3bb4d1d0fa4251eedf809
--- /dev/null
+++ b/tensorflow/python/eager/execution_callbacks_test.py
@@ -0,0 +1,55 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for eager execution_callbacks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import execution_callbacks
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def log_zero():
+  """Computes `log(0.0)`."""
+  return math_ops.log(constant_op.constant(0.))
+
+
+class ExecutionCallbacksTest(test.TestCase):
+
+  def test_errstate_inf_raise(self):
+    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.RAISE):
+      with self.assertRaises(execution_callbacks.InfOrNanError):
+        log_zero()
+
+  def test_errstate_inf_ignore(self):
+    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.IGNORE):
+      self.assertEqual(-float("inf"), log_zero().numpy())
+
+  def test_errstate_nesting(self):
+    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.RAISE):
+      with execution_callbacks.errstate(inf_or_nan=execution_callbacks.IGNORE):
+        self.assertEqual(-float("inf"), log_zero().numpy())
+
+      with self.assertRaises(execution_callbacks.InfOrNanError):
+        log_zero()
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 2750461fb2fb5992eb17de49c04dc2b3c45e42ed..520c85a2c2093436d8d99b4713f0ad5fcc92321d 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -24,6 +24,7 @@ import functools
 import re
 import sys
 import threading
+import types as types_lib
 import weakref
 
 import numpy as np
@@ -40,25 +41,19 @@ from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import cond_v2_impl
-from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribution_strategy_context
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
-# This is to avoid a circular dependency with cond_v2_impl
-# (function -> gradients_impl -> control_flow_ops -> cond_v2_impl).
-cond_v2_impl._function = sys.modules[__name__]  # pylint: disable=protected-access
-
 # This is to avoid a circular dependency with gradients_impl
 gradients_impl._function = sys.modules[__name__]  # pylint: disable=protected-access
 
@@ -72,63 +67,10 @@ WHITELIST_FUNCTION_ATTRIBUTE_REGEX = [
     BACKWARD_FUNCTION_ATTRIBUTE_NAME
 ]
 
-
-def _create_substitute_placeholder(value, name=None, dtype=None):
-  """Creates a placeholder for `value` and propagates shape info to it."""
-  # Note: setting ops.control_dependencies(None) ensures we always put
-  # capturing placeholders outside of any control flow context.
-  with ops.control_dependencies(None):
-    placeholder = graph_placeholder(
-        dtype=dtype or value.dtype, shape=value.shape, name=name)
-  _copy_handle_data(value, placeholder)
-  return placeholder
-
-
-def _copy_handle_data(source_t, target_t):
-  """Copies HandleData for variant and resource type tensors if available.
-
-  The CppShapeInferenceResult::HandleData proto contains information about the
-  shapes and types of the element tensors of resource/variant type tensors.
-  We need to copy this across function boundaries, i.e., when capturing a
-  placeholder or when returning a function tensor as output. If we don't do this
-  the element tensors will have unknown shapes, e.g., if a TensorList variant
-  tensor is captured as a placeholder, elements popped from that list would have
-  unknown shape.
-
-  Args:
-    source_t: The tensor to copy HandleData from.
-    target_t: The tensor to copy HandleData to.
-  """
-  if (target_t.dtype == dtypes_module.resource or
-      target_t.dtype == dtypes_module.variant):
-    if isinstance(source_t, ops.EagerTensor):
-      handle_data = source_t._handle_data  # pylint: disable=protected-access
-    else:
-      handle_data = resource_variable_ops.get_resource_handle_data(source_t)
-    if handle_data is not None and handle_data.is_set:
-      # pylint: disable=protected-access
-      pywrap_tensorflow.SetHandleShapeAndType(target_t.graph._c_graph,
-                                              target_t._as_tf_output(),
-                                              handle_data.SerializeToString())
-      # pylint: enable=protected-access
-      # Ensure that shapes and dtypes are propagated.
-      shapes, types = zip(*[(pair.shape, pair.dtype)
-                            for pair in handle_data.shape_and_type])
-      ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
-      shapes = [[d.size for d in s.dim]
-                if not s.unknown_rank else None for s in shapes]
-      pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
-          target_t._op._graph._c_graph,  # pylint: disable=protected-access
-          target_t._as_tf_output(),  # pylint: disable=protected-access
-          shapes, ranks, types)
-
-
-def _get_device_functions(ctx, graph):
-  """Returns a tuple of device functions representing the device stack."""
-  if ctx.executing_eagerly():
-    return (pydev.merge_device(ctx.device_name),)
-  else:
-    return tuple(graph._device_functions_outer_to_inner)  # pylint: disable=protected-access
+CacheKey = collections.namedtuple("CacheKey", [
+    "input_signature", "parent_graph", "device_functions", "colocation_stack",
+    "uses_xla"
+])
 
 
 def _parse_func_attrs(attributes):
@@ -147,8 +89,8 @@ def _parse_func_attrs(attributes):
   """
   attrs = {}
   for key, value in attributes.items():
-    if not any([re.match(reg, key)
-                for reg in WHITELIST_FUNCTION_ATTRIBUTE_REGEX]):
+    if not any(re.match(reg, key)
+               for reg in WHITELIST_FUNCTION_ATTRIBUTE_REGEX):
       raise ValueError("Attribute name is not whitelisted. "
                        "Whitelisted: prefix %s, got: %s" %
                        (WHITELIST_FUNCTION_ATTRIBUTE_REGEX, key))
@@ -170,218 +112,6 @@ def _parse_func_attrs(attributes):
   return attrs
 
 
-class FuncGraph(ops.Graph):
-  """Graph representing a function body.
-
-  Attributes:
-    name: The name of the function.
-    inputs: Placeholder tensors representing the inputs to this function. The
-      tensors are in this FuncGraph. This represents "regular" inputs as well as
-      captured inputs (i.e. the values of self.captures), with the regular
-      inputs coming first.
-    outputs: Tensors that will be returned by this function. The tensors are in
-      this FuncGraph.
-    structured_outputs: A possibly-nested python object which will be returned
-      by this function. The Tensors in this structure are the same as those of
-      self.outputs. Note that this structure might contain Python `None`s.
-    variables: Variables that should be watched during function execution.
-    outer_graph: The graph this function is defined in. May be another FuncGraph
-      or the global default Graph.
-    captures: Maps external tensor -> internal tensor (i.e. input placeholder).
-      The entries are in the order they were captured.
-    seed: The graph-level random seed.
-  """
-
-  def __init__(self, name):
-    """Construct a new FuncGraph.
-
-    The graph will inherit its graph key, collections, seed, device stack, and
-    distribution strategy stack from the current context or graph.
-
-    Args:
-      name: the name of the function.
-    """
-    super(FuncGraph, self).__init__()
-
-    self.name = name
-    self.inputs = []
-    self.outputs = []
-    self.structured_outputs = None
-    self._weak_variables = []
-    self.outer_graph = ops.get_default_graph()
-    self.captures = collections.OrderedDict()
-
-    self._building_function = True
-    # Map from resource tensor name to last op (in program order) which uses
-    # this tensor. Used to enforce that execution order matches program order
-    # for resource tensors.
-    self._last_op_using_resource_tensor = {}
-
-    graph = self.outer_graph
-
-    if context.executing_eagerly():
-      self.seed = context.global_seed()
-      self._xla_compile = (context.context().device_spec.device_type == "TPU")
-      self._add_device_to_stack(context.context().device_name)
-    else:
-      self.seed = graph.seed
-      self._xla_compile = getattr(graph, "_xla_compile", False)
-      self._device_function_stack = graph._device_function_stack.copy()  # pylint: disable=protected-access
-      self._colocation_stack = graph._colocation_stack.copy()  # pylint: disable=protected-access
-
-    # TODO(b/112165328, b/112906995): summaries depend on inheriting collections
-    # from the default graph even in eager mode. It'd be nice to not have a
-    # default graph with eager execution, so hopefully this will go away when we
-    # remove collections.
-    # pylint: disable=protected-access
-    self._collections = graph._collections
-    # TODO(b/112906995): distribution strategy depends on inheriting this stack
-    # from the default graph even in eager mode. Maybe it should be part of the
-    # eager context?
-    self._distribution_strategy_stack = graph._distribution_strategy_stack
-    # Inherit the graph key, since this is used for matching variables in
-    # optimizers.
-    self._graph_key = graph._graph_key
-    # pylint: enable=protected-access
-
-  @property
-  def variables(self):
-    """A list of variables accessed by this FuncGraph.
-
-    Note that functions keep only weak references to variables. Calling the
-    function after a variable it accesses has been deleted is an error.
-
-    Yields:
-      Strong references to variables accessed by this FuncGraph.
-    """
-    for weak_v in self._weak_variables:
-      v = weak_v()
-      if v is None:
-        raise AssertionError(
-            "Called a function referencing variables which have been deleted. "
-            "This likely means that function-local variables were created and "
-            "not referenced elsewhere in the program. This is generally a "
-            "mistake; consider storing variables in an object attribute on "
-            "first call.")
-      yield v
-
-  @variables.setter
-  def variables(self, var_list):
-    self._weak_variables = [weakref.ref(v) for v in var_list]
-
-  def create_op(
-      self,
-      op_type,
-      inputs,
-      dtypes,
-      input_types=None,
-      name=None,
-      attrs=None,
-      op_def=None,
-      compute_shapes=True,
-      compute_device=True):
-    """Like Graph.create_op, except handles external input tensors.
-
-    This overload adds functionality to create_op to "capture" any external
-    input tensors, i.e. tensors from the eager context or outer function graphs
-    if this is a nested function. See `capture` for more information.
-
-    Args:
-      op_type: The `Operation` type to create. This corresponds to the
-        `OpDef.name` field for the proto that defines the operation.
-      inputs: A list of `Tensor` objects that will be inputs to the `Operation`.
-      dtypes: A list of `DType` objects that will be the types of the tensors
-        that the operation produces.
-      input_types: (Optional.) A list of `DType`s that will be the types of
-        the tensors that the operation consumes. By default, uses the base
-        `DType` of each input in `inputs`. Operations that expect
-        reference-typed inputs must specify `input_types` explicitly.
-      name: (Optional.) A string name for the operation. If not specified, a
-        name is generated based on `op_type`.
-      attrs: (Optional.) A dictionary where the key is the attribute name (a
-        string) and the value is the respective `attr` attribute of the
-        `NodeDef` proto that will represent the operation (an `AttrValue`
-        proto).
-      op_def: (Optional.) The `OpDef` proto that describes the `op_type` that
-        the operation will have.
-      compute_shapes: (Optional.) Deprecated. Has no effect (shapes are always
-        computed).
-      compute_device: (Optional.) If True, device functions will be executed
-        to compute the device property of the Operation.
-
-    Returns:
-      An `Operation` object.
-    """
-    # This capturing logic interacts poorly with control flow contexts which
-    # want to replace inputs of ops far too late in the process. This can lead
-    # the context to get confused and try to create an Enter for an Enter. We
-    # can detect this here and skip the additional Enter which can confuse loop
-    # validation logic.
-    if op_type == "Enter" and inputs[0].op.type == "Enter":
-      if inputs[0].op.get_attr("frame_name") == attrs["frame_name"].s:
-        return inputs[0].op
-    # Calling AddValue on the control flow contexts to force creation of the
-    # backward accumulators in the original graph before we create placeholders
-    # to capture the inputs.
-    ctxt = ops.get_default_graph()._control_flow_context  # pylint: disable=protected-access
-    for i, inp in enumerate(inputs):
-      # TPU Estimator defines a control flow context with no AddValue method.
-      if ctxt is not None and hasattr(ctxt, "AddValue"):
-        inp = ctxt.AddValue(inp)
-      inp = self.capture(inp)
-      inputs[i] = inp
-    return super(FuncGraph, self).create_op(
-        op_type, inputs, dtypes, input_types, name, attrs, op_def,
-        compute_device=compute_device)
-
-  def capture(self, tensor, name=None):
-    """Captures `tensor` if it's external to this graph.
-
-    If `tensor` is from a different graph, returns a placeholder for it.
-    `tensor` and the placeholder will appear in self.captures, and the
-    placeholder will appear in self.inputs.  Multiple calls to this method with
-    the same `tensor` argument will return the same placeholder. If `tensor` is
-    from this graph, returns `tensor`.
-
-    Args:
-      tensor: Tensor. May be from this FuncGraph or a different graph.
-      name: Optional name if a placeholder is created.
-
-    Returns:
-      Tensor from this FuncGraph.
-    """
-    if isinstance(tensor, ops.EagerTensor):
-      if name is None:
-        name = str(ops.uid())
-      return self._capture_helper(tensor, name)
-    if tensor.graph is not self:
-      if name is None:
-        name = tensor.op.name
-      return self._capture_helper(tensor, name)
-    return tensor
-
-  def _capture_helper(self, tensor, name):
-    captured_tensor = self.captures.get(tensor, None)
-    if captured_tensor is None:
-      captured_tensor = _create_substitute_placeholder(tensor, name=name,
-                                                       dtype=tensor.dtype)
-      self.captures[tensor] = captured_tensor
-      self.inputs.append(captured_tensor)
-    tape.record_operation("captured_value", [captured_tensor], [tensor],
-                          lambda x: [x])
-    return captured_tensor
-
-  @property
-  def external_captures(self):
-    """External tensors captured by this function."""
-    return list(self.captures.keys())
-
-  @property
-  def internal_captures(self):
-    """Placeholders in this function corresponding captured tensors."""
-    return list(self.captures.values())
-
-
 def _forward_name(n):
   """The name of a generated forward defun named n."""
   return "__forward_%s_%s" % (n, ops.uid())
@@ -397,11 +127,6 @@ def _inference_name(n):
   return "__inference_%s_%s" % (n, ops.uid())
 
 
-def _register(fn):
-  """Registers the function `fn`."""
-  context.context().add_function(fn)
-
-
 # TODO(apassos) get rid of this by splitting framework.function._DefinedFunction
 # so it doesn't have the definition-generating logic and is just a container for
 # an already-defined function.
@@ -453,8 +178,9 @@ class _EagerDefinedFunction(object):
       proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
     function_def = function_pb2.FunctionDef()
     function_def.ParseFromString(compat.as_bytes(proto_data))
-    if context.executing_eagerly():
-      _register(fn)
+    with ops.init_scope():
+      if context.executing_eagerly():
+        context.add_function(fn)
     self.definition = function_def
     self.name = compat.as_bytes(function_def.signature.name)
     self.signature = function_def.signature
@@ -534,11 +260,14 @@ class _EagerDefinedFunction(object):
         raise ValueError(
             "Arguments and signature arguments do not match: %s %s " %
             (len(args), len(list(self.signature.input_arg))))
+      function_call_options = ctx.get_function_call_options()
       outputs = functional_ops.partitioned_call(
           args=args,
           f=self,
           tout=self._output_types,
-          executing_eagerly=executing_eagerly)
+          executing_eagerly=executing_eagerly,
+          config=function_call_options.config_proto_serialized,
+          executor_type=function_call_options.executor_type)
 
     if executing_eagerly:
       return outputs
@@ -546,26 +275,10 @@ class _EagerDefinedFunction(object):
       for i, shape in enumerate(self._output_shapes):
         outputs[i].set_shape(shape)
       for i, func_graph_output in enumerate(self._func_graph_outputs):
-        _copy_handle_data(func_graph_output, outputs[i])
+        custom_gradient.copy_handle_data(func_graph_output, outputs[i])
       return outputs
 
 
-def _flatten(sequence):
-  """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`."""
-  # TODO(akshayka): Support `SparseTensor` in a similar fashion.
-  flat_sequence = nest.flatten(sequence)
-  outputs = []
-  for item in flat_sequence:
-    if isinstance(item, ops.IndexedSlices):
-      if item.dense_shape is not None:
-        outputs.extend([item.values, item.indices, item.dense_shape])
-      else:
-        outputs.extend([item.values, item.indices])
-    else:
-      outputs.append(item)
-  return outputs
-
-
 class Function(object):
   """Callable object encapsulating a function definition and its gradient.
 
@@ -573,7 +286,7 @@ class Function(object):
   is differentiable under `tf.GradientTape` objects.
   """
 
-  def __init__(self, func_graph, attrs=None):
+  def __init__(self, func_graph, attrs=None, signature=None):
     """Initialize a Function.
 
     Args:
@@ -581,85 +294,199 @@ class Function(object):
       attrs: (optional) dict mapping names of attributes to their AttrValue
         values. Attributes in `attrs` will be included in this function's
         definition.
-
+     signature: a nested sequence of `TensorSpec` objects specifying the input
+       signature of this function.
     Raises:
       ValueError: If number of input_placeholders is not equal to the number
         of function inputs.
     """
+    self._arg_keywords = None
+    self._num_positional_args = None
     self._func_graph = func_graph
     self._captured_inputs = list(self._func_graph.captures.keys())
     self._num_outputs = len(self._func_graph.outputs)
     self._output_shapes = tuple(
         output.shape for output in self._func_graph.outputs)
     self._attrs = _parse_func_attrs(attrs or {})
-    self._device_functions = tuple(
-        self._func_graph._device_functions_outer_to_inner)  # pylint: disable=protected-access
 
     self._inference_function = _EagerDefinedFunction(
         _inference_name(self._func_graph.name), self._func_graph,
         self._func_graph.inputs, self._func_graph.outputs, self._attrs)
     self._backward_graph_function = None
+    self._signature = signature
+    self._gradient_name = None
+
+  def __call__(self, *args, **kwargs):
+    """Executes the wrapped function.
+
+    Args:
+      *args: Tensors or Variables. Positional arguments are only accepted when
+        they correspond one-to-one with arguments of the traced Python function.
+      **kwargs: Tensors or Variables specified by name. When
+        `get_concrete_function` was called to create this `Function`, each
+        Tensor input was given a name, defaulting to the name of the Python
+        function's argument but possibly overridden by the `name=` argument to
+        `tf.TensorSpec`. These names become the argument names for the concrete
+        function.
+
+    Returns:
+      The result of applying the TF function on the given Tensors.
 
-    # Map holding distributed variables, keyed by resource handle tensors.
-    self._distributed_variables = {}
-    strategy = distribution_strategy_context.get_distribution_strategy()
-    for variable in self._func_graph.variables:
-      # If variable is not distributed, unwrap returns [variable].
-      component_variables = strategy.unwrap(variable)
-      # Only update the dictionary when the variable is actually distributed.
-      if (len(component_variables) > 1 or component_variables[0] != variable):
-        for component_variable in component_variables:
-          self._distributed_variables[component_variable.handle] = variable
-
-  def __call__(self, *args):
+    Raises:
+      AssertionError: If this `Function` was not created through
+        `get_concrete_function`.
+      ValueError: If arguments contains anything other than Tensors or
+        Variables.
+      TypeError: For invalid positional/keyword argument combinations.
+    """
+    if self._arg_keywords is None or self._num_positional_args is None:
+      if self._signature:
+        if kwargs:
+          raise NotImplementedError(
+              "Keyword arguments not supported when calling a "
+              "wrap_function-decorated function.")
+        return self._call_flat(args)
+      raise AssertionError(
+          "Tried to call a concrete function obtained from an internal API "
+          "through the public interface. Use get_concrete_function instead.")
+    if len(args) > self._num_positional_args:
+      raise TypeError(
+          ("Expected at most {} positional arguments ({}), got {}. When "
+           "calling a concrete function, positional arguments may not be bound "
+           "to Tensors within nested structures.").format(
+               self._num_positional_args,
+               self._arg_keywords[:self._num_positional_args],
+               args))
+    args = list(args)
+    for keyword in self._arg_keywords[len(args):]:
+      args.append(kwargs.pop(compat.as_str(keyword)))
+    if kwargs:
+      positional_arg_keywords = set(self._arg_keywords[:len(args)])
+      for unused_key in kwargs:
+        if unused_key in positional_arg_keywords:
+          raise TypeError("Got two values for keyword '{}'.".format(unused_key))
+      raise TypeError("Keyword arguments {} unknown.".format(kwargs.keys()))
+    return self._call_flat(args)
+
+  def _filtered_call(self, args, kwargs):
+    """Executes the function, filtering arguments from the Python function.
+
+    Objects aside from Tensors and Variables are ignored.
+
+    Args:
+      args: Canonicalized positional arguments of the Python function.
+      kwargs: Canonicalized keyword arguments of the Python function.
+
+    Returns:
+      The result of applying the function on the Tensors/Variables contained in
+      `args` and `kwargs`.
+    """
+    return self._call_flat(
+        (t for t in nest.flatten((args, kwargs))
+         if isinstance(
+             t, (ops.Tensor, resource_variable_ops.ResourceVariable))))
+
+  def _call_flat(self, args):
     """Executes the wrapped function.
 
     Args:
-      *args: a list of Tensors or Variables.
+      args: a list of Tensors or Variables.
 
     Returns:
       The result of applying the TF function to `args`.
 
     Raises:
-      ValueError: If the current device stack does not match the device stack
-        under which the function was created, or if `args` contains anything
-        other than Tensors or Variables.
+      ValueError: If `args` contains anything other than Tensors or Variables.
     """
     ctx = context.context()
-    device_functions = _get_device_functions(ctx, ops.get_default_graph())
-    if device_functions != self._device_functions:
-      raise ValueError(
-          "The current device stack does not match the device stack under "
-          "which the TensorFlow function '%s' was created.\n"
-          "Current device stack: %s\n%s device stack: %s" %
-          (self._inference_function.name, device_functions,
-           self._inference_function.name, self._device_functions))
 
     for v in self._func_graph.variables:
       if v.trainable:
         tape.variable_accessed(v)
 
-    captures = self._resolve_captured_inputs()
     tensor_inputs = []
-    for i, arg in enumerate(nest.flatten(args)):
+    for i, arg in enumerate(args):
       if isinstance(arg, resource_variable_ops.ResourceVariable):
         if arg.trainable:
           tape.variable_accessed(arg)
         tensor_inputs.append(arg.handle)
       elif isinstance(arg, ops.Tensor):
         tensor_inputs.append(arg)
+      elif (self._signature is not None and
+            isinstance(self._signature[i], tensor_spec.TensorSpec)):
+        tensor_inputs.append(
+            ops.convert_to_tensor(arg, self._signature[i].dtype))
       else:
         raise ValueError("All inputs to `Function`s must be Tensors; "
                          "on invocation of %s, the %d-th input (%s) was not a "
                          "Tensor." % (self._func_graph.name, i, str(arg)))
-    args = tensor_inputs + captures
+    args = tensor_inputs + self._captured_inputs
 
-    if tape.should_record(tensor_inputs) or tape.should_record(captures):
-      return self._backprop_call(args)
+    if (tape.should_record(tensor_inputs) or
+        tape.should_record(self._captured_inputs)):
+      if context.executing_eagerly():
+        return self._eager_backprop_call(args)
+      else:
+        return self._backprop_call_with_delayed_rewrite(args)
 
-    outputs = self._inference_function.call(ctx, args)
+    # Only need to override the gradient in graph mode and when we have outputs.
+    if context.executing_eagerly() or not self.outputs:
+      outputs = self._inference_function.call(ctx, args)
+    else:
+      if not self._gradient_name:
+        self._gradient_name = "PartitionedCall-%s" % ops.uid()
+        self._register_gradient(self._gradient_name)
+      with ops.get_default_graph().gradient_override_map(
+          {"PartitionedCall": self._gradient_name,
+           "StatefulPartitionedCall": self._gradient_name}):
+        outputs = self._inference_function.call(ctx, args)
     return self._build_call_outputs(outputs)
 
+  def _register_gradient(self, name):
+    """Registers the gradient for the current Function under the given name.
+
+    The gradient rewrites an inference call op to a forward call op, but does
+    not modify a pre-existing forward call op. It then computes the gradient
+    from the output's gradients and the side outputs of the forward op.
+
+    Args:
+      name: The name to register the gradient as.
+    """
+    @ops.RegisterGradient(name)
+    def _registered_grad_fn(op, *doutputs):  # pylint: disable=unused-variable
+      return self._grad_fn(op, *doutputs)
+
+  def _grad_fn(self, op, *doutputs):
+    """Gradients of this function."""
+    if self._backward_graph_function is None:
+      self._construct_backprop_function()
+
+    # pylint: disable=protected-access
+    self._forward_function.add_to_graph(op.graph)
+    num_inference_outputs = self._inference_function._num_outputs
+
+    # Rewrite an inference call op to be a forward call op
+    if op.get_attr("f").name.encode() == self._inference_function.name:
+      op._set_func_attr("f", self._forward_function.name)
+      op._set_type_list_attr("Tout", self._forward_function._output_types)
+      op._add_outputs(
+          self._forward_function._output_types[num_inference_outputs:],
+          self._forward_function._output_shapes[num_inference_outputs:])
+      for i in range(num_inference_outputs, len(op.outputs)):
+        func_graph_output = self._forward_function._func_graph_outputs[i]
+        custom_gradient.copy_handle_data(func_graph_output, op.outputs[i])
+    # pylint: enable=protected-access
+    # Compute the gradients using the side outputs
+    side_outputs = op.outputs[num_inference_outputs:]
+    args = list(doutputs[:num_inference_outputs]) + list(side_outputs)
+    return self._backward_graph_function._call_flat(  # pylint: disable=protected-access
+        (a for a in args if a is not None))
+
+  @property
+  def name(self):
+    """Function name."""
+    return self._inference_function.name
+
   @property
   def graph(self):
     """Returns the graph from which this function was constructed."""
@@ -717,16 +544,52 @@ class Function(object):
     return nest.map_structure(lambda x: x.dtype if x is not None else None,
                               self._func_graph.structured_outputs)
 
+  def add_to_graph(self, g=None, register_gradient_functions=False):
+    """Registers the function, adds it to the graph g or default graph."""
+    # If we are not executing eagerly, adds the function to default graph if no
+    # graph is specified.
+    # In case of eager execution, function definition gets added to context
+    # during construction itself.
+
+    # TODO(allel/shivaniagrawal): rename this to register to reflect the
+    # method's functionality better. Remove register_gradient_functions argument
+    # and figure out if these needs to be registered.
+
+    if not context.executing_eagerly() or g:
+      if not g:
+        g = ops.get_default_graph()
+      self._inference_function.add_to_graph(g)  # pylint: disable=protected-access
+
+      # pylint: disable=protected-access
+      if register_gradient_functions:
+        # There are two situations for the actual call of a defun:
+        # 1. If none of the input args are resource variables or watch by any
+        #   tape, and it will run the _inference_function of concrete_func for
+        #   forward pass, the gradient will be generated by standard mechanism.
+        # 2. Otherwise, defun will create two functions, one for forward pass,
+        #   and the backward pass will be created via tape.
+        #   When registering the function, we register both cases.
+        if self._backward_graph_function is None:
+          self._construct_backprop_function()
+        forward_function = self._forward_function
+        backward_function = self._backward_graph_function._inference_function
+        # pylint: enable=protected-access
+        forward_function.add_to_graph(g)
+        backward_function.add_to_graph(g)
+
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
-    backwards_graph = FuncGraph(_backward_name(self._func_graph.name))
+    backwards_graph = func_graph_module.FuncGraph(
+        _backward_name(self._func_graph.name))
     forward_function_name = _forward_name(self._func_graph.name)
+    outputs = [x for x in self._func_graph.outputs
+               if gradients_impl.IsTrainable(x)]
     with backwards_graph.as_default():
       gradients_wrt_outputs = [
-          graph_placeholder(x.dtype, x.shape) for x in self._func_graph.outputs
+          graph_placeholder(x.dtype, x.shape) for x in outputs
       ]
       gradients_wrt_inputs = gradients_impl._GradientsHelper(  # pylint: disable=protected-access
-          self._func_graph.outputs,
+          outputs,
           self._func_graph.inputs,
           grad_ys=gradients_wrt_outputs,
           src_graph=self._func_graph)
@@ -745,7 +608,8 @@ class Function(object):
     # Clear captures, since we pass them in as inputs.
     backwards_graph.captures = {}
     backwards_graph.outputs.extend(
-        grad for grad in _flatten(gradients_wrt_inputs) if grad is not None)
+        grad for grad in func_graph_module.flatten(gradients_wrt_inputs)
+        if grad is not None)
     backwards_graph.structured_outputs = gradients_wrt_inputs
     self._backward_graph_function = Function(
         backwards_graph, attrs=backward_function_attr)
@@ -759,10 +623,13 @@ class Function(object):
         self._func_graph.outputs + backwards_graph_captures,
         forward_function_attr)
 
-  def _backprop_call(self, args):
+  def _eager_backprop_call(self, args):
     """Calls the forward function and records the result on a tape.
 
-    (Only records results on a tape if the function has outputs)
+    This method fully constructs the forward and backward functions before
+    calling the function and recording them on the tape.
+
+    (Only records results on a tape if the function has outputs).
 
     Args:
       args: All inputs to the function, including resolved captured inputs
@@ -774,7 +641,15 @@ class Function(object):
       self._construct_backprop_function()
 
     ctx = context.context()
-    outputs = self._forward_function.call(ctx, args)
+
+    if not self._gradient_name:
+      self._gradient_name = "PartitionedCall-%s" % ops.uid()
+      self._register_gradient(self._gradient_name)
+    with ops.get_default_graph().gradient_override_map(
+        {"PartitionedCall": self._gradient_name,
+         "StatefulPartitionedCall": self._gradient_name}):
+      outputs = self._forward_function.call(ctx, args)
+
     if isinstance(outputs, ops.Operation) or outputs is None:
       return outputs
 
@@ -782,40 +657,59 @@ class Function(object):
     # `side_outputs` are the intermediate Tensors that were added as outputs to
     # the forward graph function so that we can compute its gradient.
     real_outputs = outputs[:self._num_outputs]
+    skip_positions = [i for i, t in enumerate(real_outputs)
+                      if not gradients_impl.IsTrainable(t)]
     side_outputs = outputs[self._num_outputs:]
 
     def backward_function(*args):
-      return self._backward_graph_function(*(list(args) + side_outputs))  # pylint: disable=not-callable
+      args = [a for i, a in enumerate(args)
+              if a is not None and i not in skip_positions]
+      return self._backward_graph_function._call_flat(  # pylint: disable=protected-access
+          list(args) + side_outputs)
 
     tape.record_operation(self._forward_function.signature.name, real_outputs,
                           args, backward_function)
     return self._build_call_outputs(real_outputs)
 
-  def _resolve_captured_inputs(self):
-    """Resolve captured distributed variables to their current values.
+  def _backprop_call_with_delayed_rewrite(self, args):
+    """Calls the inference function and records the result on a tape.
 
-    Some inputs can be distributed variables. Such variables yield a different
-    component (i.e. actual tf.Variable) variables depending on the context of
-    execution.
+    The recorded backwards function will construct the backwards graph and
+    rewrite the inference function to the forward function. This only happens
+    if the recorded backwards function ends up being used to compute gradients.
+
+    This approach avoids constructing unnecessary graphs, but it only works if
+    we are calling this function when not executing eagerly.
+
+    (Only records results on a tape if the function has outputs)
+
+    Args:
+      args: All inputs to the function, including resolved captured inputs
 
     Returns:
-      a list of resolved captured input tensors.
+      The call output.
     """
-    if self._distributed_variables:
-      # Loop over each captured input and check if it corresponds to something
-      # distributed. If so, get its _distributed_container and fetch the
-      # component appropriate for the current execution context.
-      resolved_captured_inputs = self._captured_inputs[:]
-      for i, captured_input in enumerate(self._captured_inputs):
-        distributed_var = self._distributed_variables.get(captured_input, None)
-        if distributed_var is not None:
-          # distributed variables override __getattr__ and substitute the
-          # right component variable. In here, `distributed_var.handle`
-          # actually does the equivalent of
-          # distributed_var.get_current_component_var().handle.
-          resolved_captured_inputs[i] = distributed_var.handle
-      return resolved_captured_inputs
-    return self._captured_inputs
+    ctx = context.context()
+
+    if not self._gradient_name:
+      self._gradient_name = "PartitionedCall-%s" % ops.uid()
+      self._register_gradient(self._gradient_name)
+    with ops.get_default_graph().gradient_override_map(
+        {"PartitionedCall": self._gradient_name,
+         "StatefulPartitionedCall": self._gradient_name}):
+      outputs = self._inference_function.call(ctx, args)
+
+    if isinstance(outputs, ops.Operation) or outputs is None:
+      return outputs
+
+    call_op = outputs[0].op
+
+    def backward_function(*args):
+      return self._grad_fn(call_op, *args)
+
+    tape.record_operation(self._inference_function.signature.name, outputs,
+                          args, backward_function)
+    return self._build_call_outputs(outputs)
 
   def _build_call_outputs(self, result):
     """Maps the fdef output list to actual output structure.
@@ -828,8 +722,8 @@ class Function(object):
     if self._func_graph.structured_outputs is None:
       return result
 
-    # Use `nest.flatten` instead of `_flatten` in order to preserve any
-    # IndexedSlices in `self._func_graph.structured_outputs`.
+    # Use `nest.flatten` instead of `func_graph_module.flatten` in order to
+    # preserve any IndexedSlices in `self._func_graph.structured_outputs`.
     outputs_list = nest.flatten(self._func_graph.structured_outputs)
     j = 0
     for i, o in enumerate(outputs_list):
@@ -854,163 +748,6 @@ class Function(object):
     return ret
 
 
-def _get_defun_inputs_from_signature(signature):
-  """Maps a signature to graph-construction inputs."""
-  function_inputs = [
-      graph_placeholder(spec.dtype, spec.shape)
-      for spec in nest.flatten(signature)
-  ]
-  return nest.pack_sequence_as(signature, function_inputs)
-
-
-def _get_defun_inputs_from_args(args):
-  """Maps python function args to graph-construction inputs."""
-  function_inputs = [
-      graph_placeholder(arg.dtype, arg.shape)
-      if isinstance(arg, ops.Tensor) else arg for arg in nest.flatten(args)
-  ]
-  return nest.pack_sequence_as(args, function_inputs)
-
-
-def func_graph_from_py_func(name,
-                            python_func,
-                            args,
-                            kwargs,
-                            signature=None,
-                            func_graph=None):
-  """Returns a `FuncGraph` generated from `python_func`.
-
-  Args:
-    name: an identifier for the function.
-    python_func: the Python function to trace.
-    args: the positional args with which the Python function should be called;
-      ignored if a signature is provided.
-    kwargs: the keyword args with which the Python function should be called;
-      ignored if a signature is provided.
-    signature: a possibly nested sequence of `TensorSpecs` specifying the shapes
-      and dtypes of the arguments. When a signature is provided, `args` and
-      `kwargs` are ignored, and `python_func` is traced with Tensors conforming
-      to `signature`. If `None`, the shapes and dtypes are inferred from the
-      inputs.
-    func_graph: Optional. An instance of FuncGraph. If provided, we will use
-      this graph else a new one is built and returned.
-
-  Returns:
-    A FuncGraph.
-
-  Raises:
-    TypeError: If any of `python_func`'s return values is neither `None` nor a
-      `Tensor`.
-  """
-  if func_graph is None:
-    func_graph = FuncGraph(name)
-  assert isinstance(func_graph, FuncGraph)
-  with func_graph.as_default(), AutomaticControlDependencies() as a:
-    variable_scope.get_variable_scope().set_use_resource(True)
-
-    if signature is None:
-      func_args = _get_defun_inputs_from_args(args)
-      func_kwargs = _get_defun_inputs_from_args(kwargs)
-    else:
-      func_args = _get_defun_inputs_from_signature(signature)
-      func_kwargs = {}
-
-    # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
-    # Variables to help check whether mutation happens in calling the function
-    # Copy the recursive list, tuple and map structure, but not base objects
-    func_args_before = nest.pack_sequence_as(func_args, nest.flatten(func_args))
-    func_kwargs_before = nest.pack_sequence_as(
-        func_kwargs, nest.flatten(func_kwargs))
-
-    def convert(x):
-      """Converts an argument to a Tensor."""
-      if x is None:
-        return None
-      try:
-        x = ops.convert_to_tensor_or_indexed_slices(x)
-      except (ValueError, TypeError):
-        raise TypeError(
-            "To be compatible with tf.contrib.eager.defun, Python functions "
-            "must return zero or more Tensors; in compilation of %s, found "
-            "return value of type %s, which is not a Tensor." %
-            (str(python_func), type(x)))
-      x = a.mark_as_return(x)
-      return x
-
-    this_tape = tape.push_new_tape()
-    try:
-      func_outputs = python_func(*func_args, **func_kwargs)
-      # invariant: `func_outputs` contains only Tensors and `None`s.
-      func_outputs = nest.map_structure(convert, func_outputs)
-
-      def check_mutation(n1, n2):
-        """Check if two list of arguments are exactly the same."""
-        errmsg = ("Function to be traced should not modify structure of input "
-                  "arguments. Check if your function has list and dictionary "
-                  "operations that alter input arguments, "
-                  "such as `list.pop`, `list.append`")
-        try:
-          nest.assert_same_structure(n1, n2)
-        except ValueError:
-          raise ValueError(errmsg)
-
-        for arg1, arg2 in zip(nest.flatten(n1), nest.flatten(n2)):
-          if arg1 is not arg2:
-            raise ValueError(errmsg)
-
-      check_mutation(func_args_before, func_args)
-      check_mutation(func_kwargs_before, func_kwargs)
-    finally:
-      tape.pop_tape(this_tape)
-
-    # Variables in `func_args`, `func_kwargs` should be explicit inputs
-    # to the function, not captured inputs.
-    tape_variables = this_tape.watched_variables()
-    arg_variables = set()
-    inputs = []
-    for arg in nest.flatten(func_args) + nest.flatten(func_kwargs):
-      if isinstance(arg, resource_variable_ops.ResourceVariable):
-        try:
-          resource_placeholder = func_graph.captures.pop(arg.handle)
-          arg_variables.add(arg)
-        except KeyError:
-          # This case occurs if a Variable among the inputs is not actually
-          # used by the function; we still add an explicit input for it
-          # because the user should presumably pass the Variable as an input
-          # to the corresponding graph function.
-          resource_placeholder = _create_substitute_placeholder(arg.handle)
-        inputs.append(resource_placeholder)
-      elif isinstance(arg, ops.Tensor):
-        inputs.append(arg)
-    variables = [v for v in tape_variables if v not in arg_variables]
-    func_graph.inputs = inputs + list(func_graph.captures.values())
-
-    func_graph.structured_outputs = func_outputs
-    # Returning a closed-over tensor does not trigger convert_to_tensor.
-    func_graph.outputs.extend(
-        func_graph.capture(x)
-        for x in _flatten(func_graph.structured_outputs)
-        if x is not None)
-
-    # Some captured variables might be components of DistributedValues.
-    # Instead of storing non-distributed component variables, we
-    # store their distributed containers so we can retrieve the correct
-    # component variables at call-time.
-    strategy = distribution_strategy_context.get_distribution_strategy()
-    for i, variable in enumerate(variables):
-      # If variable is not distributed value_container returns itself.
-      variables[i] = strategy.value_container(variable)
-    func_graph.variables = variables
-
-  # Register any other functions defined in the graph.
-  if context.executing_eagerly():
-    for f in func_graph._functions.values():  # pylint: disable=protected-access
-      # TODO(ashankar): What about the gradient registry?
-      _register(f._c_func.func)  # pylint: disable=protected-access
-
-  return func_graph
-
-
 pywrap_tensorflow.RegisterType("Tensor", ops.Tensor)
 pywrap_tensorflow.RegisterType("IndexedSlices", ops.IndexedSlices)
 
@@ -1035,7 +772,8 @@ class PolymorphicFunction(object):
                python_function,
                name,
                input_signature=None,
-               attributes=None):
+               attributes=None,
+               autograph=True):
     """Initializes a polymorphic function.
 
     Args:
@@ -1045,7 +783,10 @@ class PolymorphicFunction(object):
         specifying the input signature of this function. If `None`, a separate
         function is instantiated for each inferred input signature.
       attributes: dict, extra keyword arguments that will be added as attribute
-         of the function.
+        of the function.
+      autograph: whether to use autograph to compile
+        `python_function`. See https://www.tensorflow.org/guide/autograph for
+        more information.
 
     Raises:
       ValueError: if `input_signature` is not None and the `python_function`'s
@@ -1061,10 +802,15 @@ class PolymorphicFunction(object):
       self._args_to_prepend = tuple()
       self._kwargs_to_include = {}
     self._name = name
+    self._autograph = autograph
     self._function_cache = collections.OrderedDict()
     self._function_attributes = attributes or {}
 
     self._lock = threading.Lock()
+    # _descriptor_cache is a of instance of a class to an instance-specific
+    # PolymorphicFunction, used to make sure defun-decorated methods create
+    # different functions for each instance.
+    self._descriptor_cache = weakref.WeakKeyDictionary()
 
     fullargspec = tf_inspect.getfullargspec(self._python_function)
     if tf_inspect.ismethod(self._python_function):
@@ -1076,6 +822,8 @@ class PolymorphicFunction(object):
     # A cache mapping from argument name to index, for canonicalizing
     # arguments that are called in a keyword-like fashion.
     self._args_to_indices = {arg: i for i, arg in enumerate(args)}
+    self._arg_names = args
+    self._vararg_name = fullargspec.varargs
     # A cache mapping from arg index to default value, for canonicalization.
     offset = len(args) - len(fullargspec.defaults or [])
     self._arg_indices_to_default_values = {
@@ -1101,25 +849,92 @@ class PolymorphicFunction(object):
 
   def __call__(self, *args, **kwargs):
     """Calls a graph function specialized to the inputs."""
-    graph_function, inputs = self._maybe_define_function(args, kwargs)
-    return graph_function(*inputs)
+    graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
+    return graph_function._filtered_call(args, kwargs)  # pylint: disable=protected-access
 
   @property
   def python_function(self):
     """Returns the wrapped Python function."""
     return self._python_function
 
+  def _get_concrete_function_internal(self, *args, **kwargs):
+    """Bypasses error checking when getting a graph function."""
+    if self._input_signature:
+      args, kwargs = None, None
+    graph_function, _, _ = self._maybe_define_function(args, kwargs)
+    return graph_function
+
   def get_concrete_function(self, *args, **kwargs):
     """Returns a `Function` object specialized to inputs and execution context.
 
-    `args` and `kwargs` are ignored if this `PolymorphicFunction` was created
-    with an `input_signature`.
-
     Args:
       *args: inputs to specialize on.
       **kwargs: inputs to specialize on.
     """
-    graph_function, _ = self._maybe_define_function(args, kwargs)
+    if self._input_signature:
+      if kwargs:
+        raise ValueError("Cannot define a TensorFlow function from a Python "
+                         "function with keyword arguments when "
+                         "input_signature is provided.")
+      if args:
+        # If args are provided, they must match the input signature.
+        try:
+          nest.assert_same_structure(self._input_signature, args)
+        except (ValueError, TypeError):
+          raise ValueError("Structure of Python function inputs does not match "
+                           "input_signature.")
+        flat_inputs = nest.flatten(args)
+        if any(not isinstance(arg, (ops.Tensor, tensor_spec.TensorSpec))
+               for arg in flat_inputs):
+          raise ValueError("When input_signature is provided, all inputs to "
+                           "the Python function must be Tensors or "
+                           "tf.TensorSpec objects.")
+        if any(not spec.is_compatible_with(other)
+               for spec, other in zip(self._flat_input_signature, flat_inputs)):
+          raise ValueError("Python inputs incompatible with input_signature: "
+                           "inputs (%s), input_signature (%s)" %
+                           (str(args), str(self._input_signature)))
+      args, kwargs = None, None
+    graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
+    if self._input_signature:
+      args = self._input_signature
+      kwargs = {}
+    seen_names = set()
+    captured = frozenset(graph_function.graph.internal_captures)
+    allowed_positional = 0
+    if args:
+      for outer_arg in args:
+        # TODO(allenl): Consider allowing arguments with defaults in the Python
+        # function's signature to be passed as positional arguments to the
+        # concrete function.
+        if not isinstance(
+            outer_arg,
+            (ops.Tensor, resource_variable_ops.ResourceVariable,
+             tensor_spec.TensorSpec)):
+          break
+        allowed_positional += 1
+    # pylint: disable=protected-access
+    graph_function._num_positional_args = allowed_positional
+    graph_function._arg_keywords = []
+    # pylint: enable=protected-access
+    for arg in graph_function.graph.inputs:
+      if arg in captured:
+        break
+      user_arg_name = arg.op.get_attr("_user_specified_name")
+      if user_arg_name in seen_names:
+        raise ValueError(
+            ("Unable to construct a concrete function for {} since some "
+             "arguments do not have unique names. Got two arguments named "
+             "'{}'. When constructing a concrete TensorFlow function from a "
+             "Python function which takes nested structures or variadic "
+             "positional arguments, pass unique names to tf.TensorSpec objects "
+             "used to identify these Tensor inputs. These names may then be "
+             "used as keyword arguments to the concrete function.")
+            .format(
+                self._python_function,
+                compat.as_str(arg.op.get_attr("_user_specified_name"))))
+      seen_names.add(user_arg_name)
+      graph_function._arg_keywords.append(user_arg_name)  # pylint: disable=protected-access
     return graph_function
 
   def __get__(self, instance, owner):
@@ -1137,36 +952,77 @@ class PolymorphicFunction(object):
     #   foo = Foo()
     #   foo.bar()  # `foo.bar` is a `PolymorphicFunction` instance
     #
-    # then `instance` will be `foo` (and `owner` will be `Foo`).
-    return functools.partial(self.__call__, instance)
+    # then `instance` will be `foo` (and `owner` will be `Foo`).  We create a
+    # new instance of PolymorphicFunction here to allow different instances each
+    # to create variables once, thereby allowing methods to be decorated with
+    # defun. Keeps a cache to avoid retracing the function every time the
+    # descriptor is accessed.
+    if instance not in self._descriptor_cache:
+      if instance is None:
+        return self
+      # If there is no instance-specific polymorphic func in the cache,
+      # we construct an instance-specific polymorphic function
+      # that uses a weak reference to the instance (so that the instance will
+      # be correctly gc'd).
+
+      # And finally add the wrapped function to the description cache
+      self._descriptor_cache[instance] = class_method_to_instance_method(
+          self, instance)
+
+    # Return the cached polymorphic function for the instance
+    return self._descriptor_cache[instance]
 
   def _cache_key(self, args, kwargs):
     """Computes the cache key given inputs and execution context."""
     if self._input_signature is None:
       inputs = (args, kwargs) if kwargs else args
-      cache_key = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
+      input_signature = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
     else:
       del args, kwargs
-      cache_key = self._flat_input_signature
+      input_signature = self._flat_input_signature
 
     ctx = context.context()
-    with ops.init_scope():
-      # The graph, or whether we're executing eagerly, should be a part of the
-      # cache key so we don't improperly capture tensors such as variables.
-      executing_eagerly = ctx.executing_eagerly()
-      execution_context = executing_eagerly or ops.get_default_graph()
 
+    # Don't need to open an init_scope if the _cache_key call is in eager mode
+    # already.
+    executing_eagerly = ctx.executing_eagerly()
+    parent_graph = None
+    if not executing_eagerly:
+      with ops.init_scope():
+        # The graph, or whether we're executing eagerly, should be a part of the
+        # cache key so we don't improperly capture tensors such as variables.
+        executing_eagerly = ctx.executing_eagerly()
+        parent_graph = None if executing_eagerly else ops.get_default_graph()
+
+    # pylint: disable=protected-access
+    default_graph = ops.get_default_graph()
+    # TODO(b/117617952): The current distribution strategy will affect graph
+    # building (e.g. accessing different variables from different devices) and
+    # so requires retracing for each device.
+    uses_distribution_strategy = bool(
+        default_graph._distribution_strategy_stack)
     if executing_eagerly:
-      device_functions = (pydev.merge_device(ctx.device_name),)
       colocation_stack = ()
+      uses_xla = ctx.device_spec.device_type == "TPU"
+      if uses_distribution_strategy or uses_xla:
+        device_functions = (pydev.merge_device(ctx.device_name),)
+      else:
+        device_functions = ()
     else:
-      default_graph = ops.get_default_graph()
-      # Putting the device in the cache key ensures that call-site device
-      # annotations are respected.
-      device_functions = tuple(default_graph._device_functions_outer_to_inner)  # pylint: disable=protected-access
-      colocation_stack = tuple(default_graph._colocation_stack.peek_objs())  # pylint: disable=protected-access
-
-    return (cache_key, execution_context, device_functions, colocation_stack)
+      colocation_stack = tuple(default_graph._colocation_stack.peek_objs())
+      uses_xla = getattr(default_graph, "_xla_compile", False)
+      if (uses_distribution_strategy
+          or uses_xla
+          or func_graph_module.device_stack_has_callable(
+              default_graph._device_function_stack)):
+        # Putting the device in the cache key ensures that call-site device
+        # annotations are respected.
+        device_functions = tuple(default_graph._device_functions_outer_to_inner)
+      else:
+        device_functions = ()
+    # pylint: enable=protected-access
+    return CacheKey(input_signature, parent_graph, device_functions,
+                    colocation_stack, uses_xla)
 
   def _canonicalize_function_inputs(self, *args, **kwargs):
     """Canonicalizes `args` and `kwargs`.
@@ -1238,19 +1094,21 @@ class PolymorphicFunction(object):
       return inputs, kwargs
     else:
       assert not kwargs
+      signature_relevant_inputs = inputs[:len(self._input_signature)]
       try:
-        nest.assert_same_structure(self._input_signature, inputs)
+        nest.assert_same_structure(self._input_signature,
+                                   signature_relevant_inputs)
       except (ValueError, TypeError):
         raise ValueError("Structure of Python function inputs does not match "
                          "input_signature.")
-      if any(not isinstance(arg, ops.Tensor) for arg in flat_inputs):
+      signature_inputs_flat = nest.flatten(signature_relevant_inputs)
+      if any(not pywrap_tensorflow.IsTensor(arg)
+             for arg in signature_inputs_flat):
         raise ValueError("When input_signature is provided, all inputs to "
                          "the Python function must be Tensors.")
-      tensor_specs = [
-          tensor_spec.TensorSpec.from_tensor(tensor) for tensor in flat_inputs
-      ]
       if any(not spec.is_compatible_with(other)
-             for spec, other in zip(self._flat_input_signature, tensor_specs)):
+             for spec, other in zip(self._flat_input_signature,
+                                    signature_inputs_flat)):
         raise ValueError("Python inputs incompatible with input_signature: "
                          "inputs (%s), input_signature (%s)" %
                          (str(inputs), str(self._input_signature)))
@@ -1285,19 +1143,32 @@ class PolymorphicFunction(object):
                         "must be hashable.")
 
       if graph_function is None:
+        logging.vlog(1,
+                     "Creating new FuncGraph for Python function %r (key: %r)",
+                     self._python_function, cache_key)
+        if self._input_signature is None:
+          arglen = len(args)
+        else:
+          arglen = len(self._input_signature)
+        arg_names = (
+            self._arg_names[:arglen]
+            + [self._vararg_name] * (arglen - len(self._arg_names)))
         graph_function = Function(
-            func_graph_from_py_func(self._name, self._python_function, args,
-                                    kwargs, self._input_signature),
+            func_graph_module.func_graph_from_py_func(
+                self._name,
+                self._python_function,
+                args,
+                kwargs,
+                self._input_signature,
+                autograph=self._autograph,
+                arg_names=arg_names),
             self._function_attributes)
         self._function_cache[cache_key] = graph_function
-      return graph_function, [
-          t for t in nest.flatten((args, kwargs))
-          if isinstance(t, (ops.Tensor, resource_variable_ops.ResourceVariable))
-      ]
+      return graph_function, args, kwargs
 
 
 def register(func, *args, **kwargs):
-  """Register the defun function into the graph.
+  """Register a specialization of a PolymorphicFunction into the graph.
 
   This won't actually call the function with the inputs, and only put the
   function definition into graph. Register function with different input param
@@ -1318,37 +1189,18 @@ def register(func, *args, **kwargs):
     raise ValueError("Only defun function is allowed to be registered. "
                      "Got type: %s" % type(func))
   concrete_func = func.get_concrete_function(*args, **kwargs)
-  graph = ops.get_default_graph()
-
-  # There are two situations for the actual call of a defun:
-  # 1. If none of the input args are resource variables or watch by any tape,
-  #   it will run the _inference_function of concrete_func for forward pass, and
-  #   the gradient will be generated by standard mechanism.
-  # 2. Otherwise, defun will create two functions, one for forward pass, and the
-  #   backward pass will be created via tape.
-  # When registering the function, we put both cases into graph.
-  # pylint: disable=protected-access
-  concrete_func._inference_function.add_to_graph(graph)
-
-  if concrete_func._backward_graph_function is None:
-    concrete_func._construct_backprop_function()
-  forward_function = concrete_func._forward_function
-  backward_function = concrete_func._backward_graph_function._inference_function
-  forward_function.add_to_graph(graph)
-  backward_function.add_to_graph(graph)
-  # pylint: enable=protected-access
-
+  concrete_func.add_to_graph(register_gradient_functions=True)
   return concrete_func
 
 
-def _validate_signature(signature):
+def validate_signature(signature):
   if any(not isinstance(arg, tensor_spec.TensorSpec)
          for arg in nest.flatten(signature)):
     raise TypeError("Invalid input_signature %s; input_signature must be "
                     "a possibly nested sequence of TensorSpec objects.")
 
 
-def defun(func=None, input_signature=None):
+def defun(func=None, input_signature=None, autograph=True):
   """Compiles a Python function into a callable TensorFlow graph.
 
   `defun` (short for "define function") trace-compiles a Python function
@@ -1657,6 +1509,10 @@ def defun(func=None, input_signature=None):
       function is instantiated for each inferred input signature.  If a
       signature is specified, every input to `func` must be a `Tensor`, and
       `func` cannot accept `**kwargs`.
+    autograph: Whether `func` should be compiled before
+      constructing the graph. See https://www.tensorflow.org/guide/autograph
+      for more information.
+
 
   Returns:
      If `func` is not None, returns a callable that will execute the compiled
@@ -1668,10 +1524,16 @@ def defun(func=None, input_signature=None):
     TypeError: If `input_signature` is neither `None` nor a sequence of
       `tf.contrib.eager.TensorSpec` objects.
   """
-  return defun_with_attributes(func=func, input_signature=input_signature)
+  return defun_with_attributes(
+      func=func,
+      input_signature=input_signature,
+      autograph=autograph)
 
 
-def defun_with_attributes(func=None, input_signature=None, attributes=None):
+def defun_with_attributes(func=None,
+                          input_signature=None,
+                          attributes=None,
+                          autograph=True):
   """Compiles a Python function into a callable TensorFlow graph.
 
   This function supports adding extra function attributes. See detailed
@@ -1685,25 +1547,35 @@ def defun_with_attributes(func=None, input_signature=None, attributes=None):
     attributes: A dictionary of arguments which will be added to function def as
       attributes. Currently only support primitive types as value, and only
       whitelisted attribute name is allowed. Unwhitelisted attribute name or
-      unsupported value will result into ValueError.
+      unsupported value will result into ValueError. `func_name` is also one of
+      the whitelisted argument which is a python string, and sets the name for
+      this `Function` in the graph.
+    autograph: same as defun()'s autograph.
 
   Returns:
     Same as the return value of defun, with attributes added to the function in
     graph.
   """
   if input_signature is not None:
-    _validate_signature(input_signature)
+    validate_signature(input_signature)
 
   # TODO(apassos): deal with captured global state. Deal with control flow.
   def decorated(function):
     try:
-      name = function.__name__
+      if attributes:
+        name = attributes.pop("func_name", function.__name__)
+      else:
+        name = function.__name__
     except AttributeError:
       name = "function"
     return tf_decorator.make_decorator(
         function,
-        PolymorphicFunction(function, name, input_signature=input_signature,
-                            attributes=attributes))
+        PolymorphicFunction(
+            function,
+            name,
+            input_signature=input_signature,
+            attributes=attributes,
+            autograph=autograph))
 
   # This code path is for the `foo = tfe.defun(foo, ...)` use case
   if func is not None:
@@ -1719,239 +1591,53 @@ def defun_with_attributes(func=None, input_signature=None, attributes=None):
   return decorated
 
 
-class AutomaticControlDependencies(object):
-  """Context manager to automatically add control dependencies.
+# When a method is bound to objects of this type, it allows AutoGraph to
+# recover a weak reference the original method's self pointer. This uses the
+# mechanism from pyct.inspect_utils.getmethodclass.
+# TODO(b/119246461): This is not pretty. Use a descriptor instead?
+class _WeakrefSelf(object):
 
-  Code under this context manager will act as if a sensible set of control
-  dependencies were present. More specifically:
-    1. All stateful ops in the scope will execute
-    2. Stateful ops which modify the same resource will execute in program order
+  def __init__(self, target):
+    self.ag_self_weakref__ = target
 
-  Note: creating variables in an automatic control dependencies context is not
-  supported (the value of the variables will never change as they will keep
-  getting reinitialized).
 
-  NOT THREAD SAFE
-  """
-
-  def __init__(self):
-    self._returned_tensors = set()
-
-  def mark_as_return(self, tensor):
-    """Acts like identity but marks the `Tensor` as a return value.
+def class_method_to_instance_method(original_function, instance):
+  """Constructs a new PolymorphicFunction with `self` bound."""
+  weak_instance = weakref.ref(instance)
 
-    This will possibly return a copy of the `Tensor`. Usage:
-
-    ```
-      with AutomaticControlDependencies() as a:
-       ...
-       t = a.mark_as_return(t)
-      _ = ...(t...)  # i.e. it's safe to use t here
-    ```
-
-    Args:
-      tensor: the `Tensor` to be marked
-
-    Returns:
-      a copy of the `Tensor`.
-    """
-    if isinstance(tensor, ops.IndexedSlices):
-      values = array_ops.identity(tensor.values)
-      indices = array_ops.identity(tensor.indices)
-      self._returned_tensors.add(indices)
-      self._returned_tensors.add(values)
-      return ops.IndexedSlices(values, indices, dense_shape=tensor.dense_shape)
-    # We want to make the return values depend on the stateful operations, but
-    # we don't want to introduce a cycle, so we make the return value the result
-    # of a new identity operation that the stateful operations definitely don't
-    # depend on.
-    tensor = array_ops.identity(tensor)
-    self._returned_tensors.add(tensor)
-    return tensor
-
-  def __enter__(self):
-    if context.executing_eagerly():
-      return self
-    # This code assumes no other thread is adding ops to the graph while
-    # we're adding ops to the graph.
-    # TODO(apassos): Fix this by locking the graph or using a temporary
-    # graph (but that would mess up devices and collections at least,
-    # probably other things as well).
-    self._graph = ops.get_default_graph()
-    self._n_operations = len(self._graph.get_operations())
-    return self
-
-  def _process_switch(self, switch_op, ops_which_must_run,
-                      last_op_using_resource_tensor, merge_for_resource):
-    """Processes a switch node for a resource input.
-
-    When tensorflow creates a cond, it creates a control flow context for each
-    branch of the cond. Each external tensor accessed by that branch is routed
-    through a switch op, which gets created in the graph _after_ the op which
-    uses that tensor get created.
-
-    If the resource comes from another switch op we process that one first.
-
-    _process_switch creates a corresponding merge node for the switch node. This
-    merge node is added to the outer control flow context of the switch
-    node. We also ensure that:
-
-      1. The switch node executes after the previous op which used the resource
-         tensor
-
-      2. Any op which uses a resource output of the switch node executes before
-         the merge for the switch node.
-
-      3. The next op which uses the input resource to the switch node (which
-         might be another switch node for the other branch of the conditional)
-         will execute after the merge node is done.
-
-      4. The merge node is marked as must_run so it will run even if no
-         subsequent operation uses the resource.
-
-    Args:
-      switch_op: the switch op to be processed
-      ops_which_must_run: the set of ops which must run
-      last_op_using_resource_tensor: map from resource tensor to last op using
-        it
-      merge_for_resource: map from resource tensor to merge which must follow
-        all usages of it.
-    """
-    inp = switch_op.inputs[0]
-    if inp.dtype == dtypes_module.resource and inp.op.type == "Switch":
-      self._process_switch(inp.op, ops_which_must_run,
-                           last_op_using_resource_tensor, merge_for_resource)
-    if switch_op.outputs[0] in merge_for_resource:
-      return
-    new_merge = control_flow_ops.merge(switch_op.outputs,
-                                       name="artificial_merge")
-    new_merge[0].op._control_flow_context = (  # pylint: disable=protected-access
-        switch_op._control_flow_context.outer_context)  # pylint: disable=protected-access
-    # Ensures the merge always runs
-    ops_which_must_run.add(new_merge[0].op)
-    if inp in last_op_using_resource_tensor:
-      # Ensures the switch executes after the previous op using the resource.
-      switch_op._add_control_input(last_op_using_resource_tensor[inp])  # pylint: disable=protected-access
-    # Ensure the next op outside the cond happens after the merge.
-    last_op_using_resource_tensor[inp] = new_merge[0].op
-    if inp in merge_for_resource:
-      merge_for_resource[inp]._add_control_input(new_merge[0].op)  # pylint: disable=protected-access
-    for o in switch_op.outputs:
-      # Ensures the merge will execute after all ops inside the cond
-      merge_for_resource[o] = new_merge[0].op
-
-  def __exit__(self, unused_type, unused_value, unused_traceback):
-    if context.executing_eagerly():
-      return
-
-    if self._graph is not ops.get_default_graph():
-      raise RuntimeError(
-          "Graph changed while trying to add control dependencies.")
-
-    # map from resource tensor to the last op which used it
-    last_op_using_resource_tensor = {}
-    # set of conditional and loop exits
-    ops_which_must_run = set()
-    # merge which must depend on ops which use this resource
-    merge_for_resource = {}
-
-    new_operations = self._graph.get_operations()[self._n_operations:]
-
-    # Ensures that uses of resource tensors get serialized properly and all
-    # execute. This is done by keeping a map from resource tensor to the last op
-    # in graph-construction order which used it (last_op_using_resource_tensor).
-    #
-    # Conditionals are written in TensorFlow such that every external tensor
-    # accessed in the conditional goes through a switch op and every return
-    # tensor (it's guaranteed that there will be at least one) goes through a
-    # merge op.
-    #
-    # To handle conditionals, switches are handled in a special way (see
-    # comments for _process_switch). Merge nodes created by TF's conditional
-    # logic (as opposed to by _process_switch) are forced to run and also get a
-    # control dependency added to them to ensure all stateful ops inside their
-    # control flow context run.
-    #
-    # We also ensure that if an op is using a resource output by a switch node
-    # (that is, a resource tensor for which there's a value in
-    # merge_for_resource) this op will run before the merge for that resource.
-    #
-    # We try to add control inputs to nodes respecting their control flow
-    # contexts to avoid dead nodes propagating everywhere and leading to
-    # "retval[0] doesn't have value" errors. If a node gets a control dependency
-    # on a dead node (i.e. a note from an untaken control flow branch) that node
-    # will be marked as dead unless it's a merge node.
-    #
-    # TODO(apassos): serialize non-resource-taking stateful ops as well, and
-    # test that it works. Support while loops. Support init_scope escaping from
-    # this.
-    for op in new_operations:
-      # TODO(apassos) make this code safely support while loops.
-      if isinstance(op._control_flow_context, control_flow_ops.WhileContext):  # pylint: disable=protected-access
-        continue
-      control_inputs = set()
-      # Ensure stateful ops run
-      if (op.type not in self._graph._registered_ops  # pylint: disable=protected-access
-          or self._graph._registered_ops[op.type].is_stateful):  # pylint: disable=protected-access
-        ops_which_must_run.add(op)
-      # Ignore switches (they're handled separately)
-      if op.type == "Switch" and op.inputs[0].dtype == dtypes_module.resource:
-        continue
-      # Make merges trigger all other computation which must run
-      if op.type == "Merge":
-        for o in ops_which_must_run:
-          op._add_control_input(o)  # pylint: disable=protected-access
-          for inp in o.inputs:
-            if inp in last_op_using_resource_tensor:
-              last_op_using_resource_tensor[inp] = op
-        ops_which_must_run = set([op])
-        continue
-      for inp in op.inputs:
-        if inp.dtype == dtypes_module.resource:
-          # Deal with switches, finally.
-          if inp.op.type == "Switch":
-            self._process_switch(inp.op, ops_which_must_run,
-                                 last_op_using_resource_tensor,
-                                 merge_for_resource)
-          # Ensure uses of resources are serialized
-          if inp in last_op_using_resource_tensor:
-            if (last_op_using_resource_tensor[inp]._control_flow_context  # pylint: disable=protected-access
-                is op._control_flow_context):  # pylint: disable=protected-access
-              control_inputs.add(last_op_using_resource_tensor[inp])
-          # Ensure merges happen after the closing of a cond block
-          if inp in merge_for_resource:
-            merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
-          last_op_using_resource_tensor[inp] = op
-      control_inputs = [c for c in control_inputs
-                        if c._control_flow_context is op._control_flow_context]  # pylint: disable=protected-access
-      op._add_control_inputs(control_inputs)  # pylint: disable=protected-access
-
-    # Ensure all ops which must run do run
-    for r in self._returned_tensors:
-      if ops_which_must_run:
-        r.op._add_control_inputs(  # pylint: disable=protected-access
-            [o for o in ops_which_must_run
-             if o._control_flow_context is r.op._control_flow_context])  # pylint: disable=protected-access
-
-
-def automatic_control_dependencies(f):
-  """Wraps f to automatically insert control dependencies.
-
-  The inserted dependencies ensure that:
-    1. All stateful ops in f run when the result of f runs
-    2. Updates to the same resources happen in order.
+  # Note: while we could bind to a weakref proxy instead, that causes the
+  # bound method to be unhashable.
+  bound_method = types_lib.MethodType(original_function.python_function,
+                                      _WeakrefSelf(weak_instance))
 
-  Args:
-    f: the function to be wrapped.
+  # original_function is expected to be of one of the two PolymorphicFunction
+  # types (defined either in function.py or def_function.py).
+  assert hasattr(original_function, "_name")
+  assert hasattr(original_function, "_autograph")
+  assert hasattr(original_function, "_input_signature")
+  assert hasattr(original_function, "python_function")
 
-  Returns:
-    The wrapped function.
-  """
+  def bound_method_wrapper(*args, **kwargs):
+    # __wrapped__ allows AutoGraph to swap in a converted function.
+    wrapped_fn = bound_method_wrapper.__wrapped__
+    # If __wrapped__ was not replaced, then call original_function.
+    # TODO(b/119246461): This needs to be simplified.
+    if tf_inspect.ismethod(wrapped_fn):
+      wrapped_fn = original_function.python_function
+    return wrapped_fn(weak_instance(), *args, **kwargs)
 
-  def wrapper(*args, **kwargs):
-    with AutomaticControlDependencies() as a:
-      result = f(*args, **kwargs)
-      result_flat = [a.mark_as_return(t) for t in nest.flatten(result)]
-      return nest.pack_sequence_as(result, result_flat)
+  # pylint: disable=protected-access
+  # We make a dummy MethodType object to generate the correct bound method
+  # signature. The actual call is to a function with a weak reference to
+  # `instance`.
+  instance_func = type(original_function)(
+      tf_decorator.make_decorator(bound_method, bound_method_wrapper),
+      name=original_function._name,
+      autograph=original_function._autograph,
+      input_signature=original_function._input_signature)
+  # pylint: enable=protected-access
 
-  return tf_decorator.make_decorator(f, wrapper)
+  # And we wrap the function with tf_decorator so inspection works correctly
+  wrapped_instance_func = tf_decorator.make_decorator(
+      original_function.python_function, instance_func)
+  return wrapped_instance_func
diff --git a/tensorflow/python/eager/function_argument_naming_test.py b/tensorflow/python/eager/function_argument_naming_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9358c4fd07111f7adfbf60241727215f978b2a36
--- /dev/null
+++ b/tensorflow/python/eager/function_argument_naming_test.py
@@ -0,0 +1,258 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@parameterized.named_parameters(
+    dict(testcase_name='Defun', function_decorator=function.defun),
+    dict(testcase_name='DefFunction', function_decorator=def_function.function))
+class ArgumentNamingTests(test.TestCase, parameterized.TestCase):
+  """Tests for recognizable export signatures from concrete functions."""
+
+  def testBasic(self, function_decorator):
+    @function_decorator
+    def fn(a, b):
+      return a + b, a * b
+    # Call the function to make def_function happy
+    fn(array_ops.ones([]), array_ops.ones([]))
+
+    fn_op = fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['a', 'b'],
+        [inp.op.name for inp in fn_op.inputs])
+    self.assertEqual(
+        [b'a', b'b'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op.inputs])
+    self.assertEqual(2, len(fn_op.graph.structured_outputs))
+    self.assertAllClose(
+        [3., 2.],
+        fn_op(constant_op.constant(1.), constant_op.constant(2.)))
+    self.assertAllClose(
+        [3., 2.],
+        fn_op(a=constant_op.constant(1.), b=constant_op.constant(2.)))
+
+  def testVariable(self, function_decorator):
+    @function_decorator
+    def fn(a, b):
+      return a + b, a * b
+    # Call the function to make def_function happy
+    fn(array_ops.ones([]), array_ops.ones([]))
+
+    fn_op = fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
+        variables.Variable(1.))
+    self.assertEqual(
+        ['a', 'b'],
+        [inp.op.name for inp in fn_op.inputs])
+    self.assertEqual(
+        [b'a', b'b'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op.inputs])
+    self.assertEqual(2, len(fn_op.graph.structured_outputs))
+
+  def testDictReturned(self, function_decorator):
+    @function_decorator
+    def fn(x, z=(1., 2.), y=3.):
+      z1, z2 = z
+      return {'alpha': x + y + z1, 'beta': x * y + z2}
+    # Call the function to make def_function happy
+    fn(array_ops.ones([]))
+
+    fn_op = fn.get_concrete_function(
+        x=tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
+        y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['x', 'y'],
+        [inp.op.name for inp in fn_op.inputs])
+    self.assertEqual(
+        [b'x', b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op.inputs])
+    self.assertEqual({'alpha', 'beta'},
+                     set(fn_op.graph.structured_outputs.keys()))
+
+    with self.assertRaisesRegexp(ValueError, "two arguments named 'z'"):
+      fn.get_concrete_function(
+          z=(tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32),
+             tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32)),
+          y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32,
+                                   name='custom'),
+          x=4.)
+    fn_op2 = fn.get_concrete_function(
+        z=(tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32,
+                                  name='z_first'),
+           tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32,
+                                  name='z_second')),
+        y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='custom'),
+        x=4.)
+    self.assertEqual(
+        ['z_first', 'z_second', 'custom'],
+        [inp.op.name for inp in fn_op2.inputs])
+    self.assertEqual(
+        [b'z_first', b'z_second', b'custom'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op2.inputs])
+
+    fn_op3 = fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='custom'),
+        z=(tensor_spec.TensorSpec(shape=(None,), dtype=dtypes.float32,
+                                  name='z1'),
+           tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='z2')),
+        y=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['custom', 'z1', 'z2', 'y'],
+        [inp.op.name for inp in fn_op3.inputs])
+    self.assertEqual(
+        [b'custom', b'z1', b'z2', b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in fn_op3.inputs])
+
+  def testMethod(self, function_decorator):
+    class HasMethod(object):
+
+      @function_decorator
+      def method(self, x):
+        return x
+
+    has_method = HasMethod()
+    # Call the function to make def_function happy
+    HasMethod.method(has_method, array_ops.ones([]))
+    class_op = HasMethod.method.get_concrete_function(
+        has_method, tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['x'],
+        [inp.op.name for inp in class_op.inputs])
+    self.assertEqual(
+        [b'x'],
+        [inp.op.get_attr('_user_specified_name') for inp in class_op.inputs])
+    # Call the function to make def_function happy
+    has_method.method(array_ops.ones([]))
+    method_op = has_method.method.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32))
+    self.assertEqual(
+        ['x'],
+        [inp.op.name for inp in method_op.inputs])
+    self.assertEqual(
+        [b'x'],
+        [inp.op.get_attr('_user_specified_name') for inp in method_op.inputs])
+    # TODO(allenl): It should be possible to override names when exporting. Do
+    # TensorSpec names need to go in cache keys? Or maybe get_concrete_function
+    # should always retrace?
+    self.skipTest('Not working')
+    method_op = has_method.method.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='y'))
+    self.assertEqual(
+        ['y'],
+        [inp.op.name for inp in method_op.inputs])
+    self.assertEqual(
+        [b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in method_op.inputs])
+
+  def testMethodSignature(self, function_decorator):
+
+    class HasMethod(object):
+
+      @function_decorator(
+          input_signature=(tensor_spec.TensorSpec(
+              shape=None, dtype=dtypes.float64, name='y'),))
+      def method(self, x):
+        hash(self)  # No weak proxies passed as `self`
+        return x
+
+    has_method = HasMethod()
+    # Call the function to make def_function happy
+    has_method.method(array_ops.ones([], dtype=dtypes.float64))
+    method_op = has_method.method.get_concrete_function()
+    self.assertEqual(
+        ['y'],
+        [inp.op.name for inp in method_op.inputs])
+    self.assertEqual(
+        [b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in method_op.inputs])
+    method_op2 = has_method.method.get_concrete_function()
+    self.assertEqual(
+        ['y'],
+        [inp.op.name for inp in method_op2.inputs])
+    self.assertEqual(
+        [b'y'],
+        [inp.op.get_attr('_user_specified_name') for inp in method_op2.inputs])
+
+  def testVariadic(self, function_decorator):
+    @function_decorator
+    def variadic_fn(x, *args, **kwargs):
+      return x + math_ops.add_n(list(args) + list(kwargs.values()))
+
+    # Call the function to make def_function happy
+    variadic_fn(array_ops.ones([]), array_ops.ones([]))
+    variadic_op = variadic_fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
+        tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32, name='y'),
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32,
+                               name='second_variadic'),
+        z=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
+        zz=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='cust'))
+    self.assertEqual(
+        ['x', 'y', 'args', 'second_variadic', 'z', 'cust'],
+        [inp.op.name for inp in variadic_op.inputs])
+    self.assertEqual(
+        [b'x', b'y', b'args', b'second_variadic', b'z', b'cust'],
+        [inp.op.get_attr('_user_specified_name')
+         for inp in variadic_op.inputs])
+
+  def testVariadicInputSignature(self, function_decorator):
+    @function_decorator(
+        input_signature=(
+            tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32),
+            tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32, name='y'),
+            tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
+            tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='z'),
+        ))
+    def variadic_fn(x, *args):
+      return x + math_ops.add_n(list(args))
+
+    # Call the function to make def_function happy
+    variadic_fn(array_ops.ones([]), array_ops.ones([]),
+                array_ops.ones([]), array_ops.ones([]))
+    variadic_op = variadic_fn.get_concrete_function()
+    self.assertIn(b'variadic_fn', variadic_op.name)
+    self.assertEqual(
+        ['x', 'y', 'args', 'z'],
+        [inp.op.name for inp in variadic_op.inputs])
+    self.assertEqual(
+        [b'x', b'y', b'args', b'z'],
+        [inp.op.get_attr('_user_specified_name')
+         for inp in variadic_op.inputs])
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  test.main()
diff --git a/tensorflow/python/eager/function_defun_collection_test.py b/tensorflow/python/eager/function_defun_collection_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..53478ad121ce689650a9ef9e81215817af605be5
--- /dev/null
+++ b/tensorflow/python/eager/function_defun_collection_test.py
@@ -0,0 +1,102 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class DefunCollectionTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      dict(testcase_name='Defun', function_decorator=function.defun),
+      dict(
+          testcase_name='DefFunction',
+          function_decorator=def_function.function))
+  def testCollectionValueAccess(self, function_decorator):
+    """Read values from graph collections inside of defun."""
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g):
+        x = 2
+        y = 5
+        ops.add_to_collection('x', x)
+        ops.add_to_collection('y', y)
+
+        @function_decorator
+        def fn():
+          x_const = constant_op.constant(ops.get_collection('x')[0])
+          y_const = constant_op.constant(ops.get_collection('y')[0])
+          z = math_ops.add(x_const, y_const)
+          ops.add_to_collection('z', 7)
+          return z
+
+        self.assertEqual(7, int(self.evaluate(fn())))
+        self.assertEquals(ops.get_collection('x'), [2])
+        self.assertEquals(ops.get_collection('y'), [5])
+        self.assertEquals(ops.get_collection('z'), [])
+
+  @parameterized.named_parameters(
+      dict(testcase_name='Defun', function_decorator=function.defun),
+      dict(
+          testcase_name='DefFunction',
+          function_decorator=def_function.function))
+  def testCollectionVariableValueAccess(self, function_decorator):
+    """Read variable value from graph collections inside of defun."""
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g):
+        v = resource_variable_ops.ResourceVariable(1.0)
+
+        @function_decorator
+        def f():
+          return v.read_value()
+
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(1.0, float(self.evaluate(f())))
+        self.assertEquals(
+            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 1)
+
+  def testCollectionVariableValueWrite(self):
+    """Write variable value inside defun."""
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g):
+
+        @function.defun
+        def f():
+          v = resource_variable_ops.ResourceVariable(2.0)
+          return v
+
+        _ = f.get_concrete_function()
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(2.0, float(self.evaluate(f())))
+        self.assertEquals(
+            len(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)), 1)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  test.main()
diff --git a/tensorflow/python/eager/function_gradients_test.py b/tensorflow/python/eager/function_gradients_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..98dec0b361b76eadbb107a7cd42e4deba6f2ea25
--- /dev/null
+++ b/tensorflow/python/eager/function_gradients_test.py
@@ -0,0 +1,756 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class FunctionGradientsTest(test.TestCase, parameterized.TestCase):
+
+  def testGraphModeWithGradients(self):
+    v = resource_variable_ops.ResourceVariable(1.0, name='v')
+
+    @def_function.function
+    def step():
+      def inner():
+        return v * v
+
+      return backprop.implicit_grad(inner)()[0][0]
+
+    self.assertAllEqual(step(), 2.0)
+
+  def testGraphGradientVariable(self):
+    with ops.Graph().as_default(), self.cached_session():
+      v = variables.Variable(1.0)
+
+      @def_function.function
+      def f():
+        return 2.0 * v
+
+      node = f()
+      grads, = gradients_impl.gradients(node, v)
+      v.initializer.run()
+      self.assertAllEqual(grads.eval(), 2.0)
+      self.assertEqual(grads.shape, v.shape)
+
+  def testSymGradGatherNd(self):
+    with ops.Graph().as_default(), self.cached_session() as sess:
+
+      @def_function.function
+      def f(x):
+        return array_ops.gather_nd(x, [[0]])
+
+      c = constant_op.constant([[2.]])
+      f_c = f(c)
+      g, = gradients_impl.gradients(f_c, c)
+      self.assertAllEqual(self.evaluate(g).values, [[1.0]])
+
+  def testNoSymGradNestedDefun(self):
+
+    @def_function.function
+    def outer():
+
+      @def_function.function
+      def f(x):
+        return array_ops.gather_nd(x, [[0]])
+
+      c = constant_op.constant([[2.]])
+      f_c = f(c)
+      g, = gradients_impl.gradients(f_c, c)
+      self.assertIsInstance(g, ops.IndexedSlices)
+
+    outer()
+
+  def testGraphFunctionWithGradients(self):
+    v = resource_variable_ops.ResourceVariable(1.0, name='v')
+
+    @def_function.function
+    def step():
+      def inner():
+        return v * v
+
+      return backprop.implicit_grad(inner)()[0][0]
+
+    step_op = step.get_concrete_function()
+    self.assertEqual(step_op.output_dtypes, dtypes.float32)
+    self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape([]))
+    self.assertAllEqual(step_op(), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDefunCondGradient(self):
+
+    @def_function.function
+    def f(x):
+      return control_flow_ops.cond(x > 0.5, lambda: 2 * x, lambda: 3 * x)
+
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(1.0)
+      t.watch(x)
+      y = f(x)
+    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGraphLoopGradient(self):
+
+    @def_function.function
+    def f(x):
+      return control_flow_ops.while_loop(lambda _, i: i < 2,
+                                         lambda x, i: (2*x, i + 1),
+                                         [x, 0])[0]
+
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(1.0)
+      t.watch(x)
+      y = f(x)
+    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 4.0)
+
+  def testDefunDifferentiable(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @def_function.function
+    def f():
+      return v * v
+
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+
+  def testDefunCanBeDifferentiatedTwice(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @def_function.function
+    def f():
+      return v * v
+
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+    # Ensure that v is watched again.
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+
+  def testSymbolicGradientVariableNoneNotZerosLike(self):
+    with ops.Graph().as_default():
+      v = variables.Variable(1.0)
+
+      @def_function.function
+      def f(x, v):
+        v.read_value()
+        return x * x
+
+      x = constant_op.constant(1.0)
+      l = f(x, v)
+      _, dv = gradients_impl.gradients(l, [x, v])
+      with self.cached_session():
+        v.initializer.run()
+        self.assertEqual(dv, None)
+
+  def testDefunCallBackprop(self):
+
+    @def_function.function
+    def f(x):
+      return math_ops.add(x, x)
+
+    @def_function.function
+    def g(x):
+      return backprop.gradients_function(f, [0])(x)[0]
+
+    self.assertAllEqual(2, g(constant_op.constant(2.)))
+
+  @test_util.run_v1_only('b/120545219')
+  def testGraphModeEagerGradError(self):
+    with context.graph_mode():
+      def f():
+        x = variable_scope.get_variable(
+            'v', initializer=constant_op.constant(1.0))
+        return x * constant_op.constant(2.0)
+
+      with self.assertRaisesRegexp(ValueError,
+                                   'No trainable variables were accessed'):
+        backprop.implicit_val_and_grad(f)()
+
+  def testDefunCallBackpropUsingSameObjectForMultipleArguments(self):
+
+    @def_function.function
+    def g(x):
+      return backprop.gradients_function(math_ops.multiply, [0, 1])(x, x)
+
+    def np_g(x):
+      return [d.numpy() for d in g(x)]
+
+    x = constant_op.constant(1.)
+    self.assertAllEqual([1., 1.], np_g(x))
+    self.assertAllEqual([1., 1.], np_g(1.))
+
+  def testGradientTensorConversionWithDefun(self):
+    three = resource_variable_ops.ResourceVariable(3.0, name='v')
+
+    @def_function.function
+    def f(x):
+      return math_ops.add(x, three)
+
+    def g(x):
+      return f(x)
+
+    g = backprop.implicit_grad(g)(constant_op.constant(1.0))[0][0]
+    self.assertAllEqual(g, 1.0)
+
+  def testGradient(self):
+    matmul = def_function.function(math_ops.matmul)
+
+    def sq(x):
+      return matmul(x, x, transpose_a=True)
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    grad_t, = backprop.gradients_function(sq, [0])(t)
+    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
+
+  def testGradientInFunction(self):
+
+    @def_function.function
+    def f(x):
+      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
+
+    self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
+
+  def testGradientOfGatherWithDefun(self):
+    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
+
+    def sum_gather():
+      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
+
+    grad_fn = backprop.implicit_grad(sum_gather)
+    gradient = grad_fn()
+    defun_grad_fn = backprop.implicit_grad(def_function.function(sum_gather))
+    defun_gradient = defun_grad_fn()
+    self.assertEqual(len(gradient), len(defun_gradient))
+
+    gradient = gradient[0][0]
+    defun_gradient = defun_gradient[0][0]
+    self.assertAllEqual(gradient.values, defun_gradient.values)
+    self.assertAllEqual(gradient.indices, defun_gradient.indices)
+    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
+
+  def testDifferentiableFunctionNoneOutputs(self):
+
+    @def_function.function
+    def my_function(x):
+      return x, None
+
+    def wrapper(x):
+      return my_function(x)[0]
+
+    g = backprop.gradients_function(wrapper, [0])(constant_op.constant(0.0))
+    self.assertAllEqual(g[0], 1.)
+
+    @def_function.function
+    def foo(a):
+      return None, a * a
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      none, r = foo(x)
+    g = tp.gradient(r, x)
+
+    self.assertIs(none, None)
+    self.assertAllEqual(r, 25.0)
+    self.assertAllEqual(g, 2 * 5.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNestedDifferentiableFunction(self):
+    @def_function.function
+    def inner_fn(a, b):
+      return a * math_ops.add(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      return inner_fn(x, 1.0)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunction(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, 1.0)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionWithMultipleGradCalls(self):
+    @def_function.function
+    def inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return math_ops.mul(a, inner_fn(a, b))
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, 3.0)
+
+    x = constant_op.constant(5.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+    self.assertAllEqual(middle_fn(3.0, x), 3.0 * (3.0 + 5.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+
+    y = constant_op.constant(4.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = outer_fn(y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
+
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = inner_fn(y, y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionGradientTapeInDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      with backprop.GradientTape() as tp:
+        tp.watch(x)
+        result = middle_fn(x, 1.0)
+      grad = tp.gradient(result, x)
+      return grad
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionGradientTapeInNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      with backprop.GradientTape() as tp:
+        tp.watch(x)
+        result = middle_fn(x, 1.0)
+      grad = tp.gradient(result, x)
+      return grad
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionGradientTapeInMultNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      with backprop.GradientTape() as tp:
+        tp.watch(x)
+        result = middle_fn(x, 1.0)
+      grad = tp.gradient(result, x)
+      return grad
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    @def_function.function
+    def outer_outer_fn(x):
+      return outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionTFGradientInDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      result = middle_fn(x, 1.0)
+      return gradients_impl.gradients(result, [x])[0]
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionTFGradientInNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      result = middle_fn(x, 1.0)
+      return gradients_impl.gradients(result, [x])[0]
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionTFGradientInMultNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      result = middle_fn(x, 1.0)
+      return gradients_impl.gradients(result, [x])[0]
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    @def_function.function
+    def outer_outer_fn(x):
+      return outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  def testDeeplyNestedDifferentiableFunctionWithVariable(self):
+    var = variables.Variable(constant_op.constant(1.0))
+
+    @def_function.function
+    def inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, var)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleGradCalls(self):
+    v = variables.Variable(constant_op.constant(3.0))
+
+    @def_function.function
+    def inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return math_ops.mul(a, inner_fn(a, b))
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, v)
+
+    x = constant_op.constant(5.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+    self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+
+    y = constant_op.constant(4.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = outer_fn(y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
+
+    v.assign(constant_op.constant(1.5))
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = outer_fn(y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2 * 4.0 + 1.5)
+
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = inner_fn(y, v)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 1.0)
+
+  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleTFGrads(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(3.0)
+      v.initializer.run()
+
+      @def_function.function
+      def inner_fn(a, b):
+        return math_ops.add(a, b)
+
+      @def_function.function
+      def middle_fn(a, b):
+        return math_ops.mul(a, inner_fn(a, b))
+
+      @def_function.function
+      def outer_fn(x):
+        return middle_fn(x, v)
+
+      x = constant_op.constant(5.0)
+      self.assertAllEqual(outer_fn(x).eval(), 5.0 * (5.0 + 3.0))
+
+      grad, = gradients_impl.gradients(outer_fn(x), x)
+
+      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+      self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+      self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
+
+      grad, = gradients_impl.gradients(outer_fn(x), x)
+
+      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+
+      y = constant_op.constant(4.0)
+      grad, = gradients_impl.gradients(outer_fn(y), y)
+      self.assertAllEqual(grad, 2 * 4.0 + 3.0)
+
+      self.evaluate(v.assign(constant_op.constant(1.5)))
+      grad, = gradients_impl.gradients(outer_fn(y), y)
+
+      self.assertAllEqual(grad, 2 * 4.0 + 1.5)
+
+      grad, = gradients_impl.gradients(inner_fn(y, v), y)
+      self.assertAllEqual(grad, 1.0)
+
+  def testNestedDifferentiableFunctionNoneOutputs(self):
+    @def_function.function
+    def foo(a, b):
+      return None, a * math_ops.add(a, b), None, 2*a
+
+    @def_function.function
+    def bar(x):
+      return foo(x, 1.0)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape(persistent=True) as tp:
+      tp.watch(x)
+      none1, r1, none2, r2 = bar(x)
+    g1 = tp.gradient(r1, x)
+    g2 = tp.gradient(r2, x)
+
+    self.assertAllEqual(r1, 30.0)
+    self.assertAllEqual(r2, 10.0)
+    self.assertIs(none1, None)
+    self.assertIs(none2, None)
+    self.assertAllEqual(g1, 2 * 5.0 + 1.0)
+    self.assertAllEqual(g2, 2.0)
+
+  def testGradientWithKeywordArguments(self):
+    matmul = def_function.function(math_ops.matmul)
+
+    def sq(x):
+      return matmul(a=x, b=x, transpose_a=True)
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    grad_t, = backprop.gradients_function(sq, [0])(t)
+    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
+
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(t)
+      one = matmul(t, b=t, transpose_a=True)
+      two = matmul(b=t, a=t, transpose_a=True)
+      three = matmul(a=t, b=t, transpose_a=True)
+
+    for output in [one, two, three]:
+      self.assertAllEqual(tape.gradient(output, t), [[6, 6], [14, 14]])
+
+  def testGradientInFunctionWithKeywordArguments(self):
+
+    @def_function.function
+    def f(x):
+      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
+
+    self.assertAllEqual(f(x=constant_op.constant(1.0)), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBackwardNone(self):
+    model = variables.Variable(1.0, name='model')
+    count = variables.Variable(0)
+
+    @function.defun
+    def forward_pass(value):
+      count.assign_add(1)
+      residuals = value - model
+      loss = 0.5 * math_ops.reduce_mean(math_ops.pow(residuals, 2))
+      # Note: count is an integer, so its doutput will be None
+      return loss, count
+
+    def reduce_fn(x):
+      if context.executing_eagerly():
+        with backprop.GradientTape() as t:
+          loss, count = forward_pass(x)
+        return t.gradient(loss, model), count
+      loss, count = forward_pass(x)
+      grad_only = gradients_impl.gradients(loss, model)
+      return grad_only, count
+
+    g, _ = reduce_fn(constant_op.constant([7.0]))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual(nest.flatten(self.evaluate(g)), [-6.0])
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  test.main()
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index a2cfb4b4762bd4b3c3ddfd5ca9c7815409c48542..50d1b4b6f77e203e1d9ebb278f1c356024a4226f 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -23,14 +23,14 @@ from multiprocessing.pool import ThreadPool
 import sys
 import weakref
 
+from absl.testing import parameterized
 import numpy
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import keras
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -40,13 +40,13 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
@@ -55,11 +55,10 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training import adam
-from tensorflow.python.training import momentum
 from tensorflow.python.training import training_ops
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_inspect
 
 
 class MiniModel(keras_training.Model):
@@ -84,21 +83,49 @@ class DefunnedMiniModel(MiniModel):
     return super(DefunnedMiniModel, self).call(inputs, training=training)
 
 
-@test_util.with_c_shapes
-class FunctionTest(test.TestCase):
+class FunctionTest(test.TestCase, parameterized.TestCase):
 
   def testBasic(self):
-    matmul = function.defun(math_ops.matmul)
+    matmul = def_function.function(math_ops.matmul)
     t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
     sq = matmul(t, t, transpose_a=True)
     sq2 = matmul(sq, t, transpose_a=True)
     self.assertAllEqual(sq.numpy().reshape(-1), [10, 14, 14, 20])
     self.assertAllEqual(sq2.numpy().reshape(-1), [52, 76, 74, 108])
 
-  def testBasicGraphMode(self):
-    matmul = function.defun(math_ops.matmul)
+  def testWastedAdd(self):
+
+    @def_function.function()
+    def add(x, y):
+      _ = x * y
+      return x + y
+
+    # The default config allows all rewrites.
+    config_proto = config_pb2.ConfigProto()
+
+    with context.function_config_proto(config_proto):
+      t = constant_op.constant(1.0)
+      self.assertAllEqual(add(t, t).numpy(), 2.0)
+
+  def testFuncName(self):
+
+    @function.defun_with_attributes(attributes={'func_name': 'multiply'})
+    def add(x, y):
+      _ = x * y
+      return x + y
 
     @function.defun
+    def add_2(x, y):
+      _ = x * y
+      return x + y
+
+    self.assertEqual(add._name, 'multiply')
+    self.assertEqual(add_2._name, 'add_2')
+
+  def testBasicGraphMode(self):
+    matmul = def_function.function(math_ops.matmul)
+
+    @def_function.function
     def sq(a):
       return matmul(a, a)
 
@@ -107,11 +134,11 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testNestedInputsGraphMode(self):
-    matmul = function.defun(math_ops.matmul)
+    matmul = def_function.function(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
-    @function.defun
+    @def_function.function
     def a_times_b(inputs):
       return matmul(inputs.a['a'], inputs.b['b'])
 
@@ -120,37 +147,28 @@ class FunctionTest(test.TestCase):
     out = a_times_b(pair({'a': t}, {'b': t}))
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
-  def testGraphModeWithGradients(self):
-    v = resource_variable_ops.ResourceVariable(1.0, name='v')
+  def testNestedOutputsGraphMode(self):
+    matmul = def_function.function(math_ops.matmul)
 
-    @function.defun
-    def step():
-      def inner():
-        return v * v
+    pair = collections.namedtuple('pair', ['a', 'b'])
 
-      return backprop.implicit_grad(inner)()[0][0]
+    @def_function.function()
+    def pairs_mul(pair_a, pair_b):
+      return pair(matmul(pair_a.a, pair_b.a), matmul(pair_a.b, pair_b.b))
 
-    self.assertAllEqual(step(), 2.0)
+    a = constant_op.constant([[1.0, 2.0], [1.0, 2.0]])
+    b = constant_op.constant([[3.0, 4.0], [3.0, 4.0]])
 
-  def testGraphGradientVariable(self):
-    with ops.Graph().as_default(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(1.0)
-
-      @function.defun
-      def f():
-        return 2.0 * v
-
-      node = f()
-      grads, = gradients_impl.gradients(node, v)
-      v.initializer.run()
-      self.assertAllEqual(grads.eval(), 2.0)
-      self.assertEqual(grads.shape, v.shape)
+    out = pairs_mul(pair(a, b), pair(b, a))
+    expected = pair(math_ops.matmul(a, b).numpy(),
+                    math_ops.matmul(b, a).numpy())
+    self.assertAllClose(out, expected)
 
   def testGraphEagerIsolation(self):
 
     @function.defun
     def f():
-      self.v = resource_variable_ops.ResourceVariable(1.0)
+      self.v = variables.Variable(1.0)
       return self.v.read_value()
 
     self.assertAllEqual(f(), 1.0)
@@ -159,9 +177,9 @@ class FunctionTest(test.TestCase):
       self.assertEqual(f().shape, ())
 
   def testBasicGraphFunction(self):
-    matmul = function.defun(math_ops.matmul)
+    matmul = def_function.function(math_ops.matmul)
 
-    @function.defun
+    @def_function.function
     def sq(a):
       return matmul(a, a)
 
@@ -172,9 +190,55 @@ class FunctionTest(test.TestCase):
     out = sq_op(t)
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
+  def testInputSpecGraphFunction(self):
+    matmul = def_function.function(math_ops.matmul)
+
+    @def_function.function
+    def sq(a):
+      return matmul(a, a)
+
+    sq_op = sq.get_concrete_function(
+        tensor_spec.TensorSpec((None, None), dtypes.float32))
+    self.assertEqual([None, None], sq_op.output_shapes.as_list())
+
+    t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    out1 = sq_op(t1)
+    self.assertAllEqual(out1, math_ops.matmul(t1, t1).numpy())
+
+    t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    out2 = sq_op(t2)
+    self.assertAllEqual(out2, math_ops.matmul(t2, t2).numpy())
+
+  def testNestedInputSpecGraphFunction(self):
+    matmul = def_function.function(math_ops.matmul)
+
+    @def_function.function
+    def sq(mats):
+      ((a, b),) = mats
+      return matmul(a, b)
+
+    with self.assertRaisesRegexp(ValueError, "two arguments named 'mats'"):
+      sq.get_concrete_function(
+          [(tensor_spec.TensorSpec((None, None), dtypes.float32),
+            tensor_spec.TensorSpec((None, None), dtypes.float32))])
+    sq_op = sq.get_concrete_function(
+        [(tensor_spec.TensorSpec((None, None), dtypes.float32,
+                                 name='first_mat'),
+          tensor_spec.TensorSpec((None, None), dtypes.float32,
+                                 name='second_mat'))])
+    self.assertEqual([None, None], sq_op.output_shapes.as_list())
+
+    t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    t2 = constant_op.constant([[1.4, 2.4], [3.4, 4.4]])
+    with self.assertRaisesRegexp(
+        TypeError, 'bound to Tensors within nested structures'):
+      sq_op(t1, t2)
+    out = sq_op(first_mat=t1, second_mat=t2)
+    self.assertAllEqual(out, math_ops.matmul(t1, t2).numpy())
+
   def testExecutingStatelessDefunConcurrently(self):
 
-    @function.defun
+    @def_function.function
     def stateless(x):
       return math_ops.multiply(2.0, x)
 
@@ -186,7 +250,7 @@ class FunctionTest(test.TestCase):
 
   def testExecutingManyStatelessDefunsConcurrently(self):
 
-    @function.defun
+    @def_function.function
     def stateless(x):
       del x
       return math_ops.multiply(2.0, 2.0)
@@ -204,7 +268,7 @@ class FunctionTest(test.TestCase):
 
     v = resource_variable_ops.ResourceVariable(1.0)
 
-    @function.defun
+    @def_function.function
     def stateful(x):
       v.assign(x)
 
@@ -217,7 +281,7 @@ class FunctionTest(test.TestCase):
 
     v = resource_variable_ops.ResourceVariable(1.0)
 
-    @function.defun
+    @def_function.function
     def stateful(x):
       del x
       return v.assign(0.0)
@@ -229,7 +293,7 @@ class FunctionTest(test.TestCase):
 
   def disabled_testRandomSeed(self):
 
-    @function.defun
+    @def_function.function
     def f():
       return random_ops.random_normal(())
 
@@ -239,38 +303,27 @@ class FunctionTest(test.TestCase):
     random_seed.set_random_seed(1)
     self.assertAllEqual(f(), x)
 
-  def testSymGradGatherNd(self):
-    with ops.Graph().as_default(), self.cached_session() as sess:
-
-      @function.defun
-      def f(x):
-        return array_ops.gather_nd(x, [[0]])
-
-      c = constant_op.constant([[2.]])
-      f_c = f(c)
-      g, = gradients_impl.gradients(f_c, c)
-      self.assertAllEqual(sess.run(g), [[1.0]])
-
   def testNestedInputsGraphFunction(self):
-    matmul = function.defun(math_ops.matmul)
+    matmul = def_function.function(math_ops.matmul)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
-    @function.defun
+    @def_function.function
     def a_times_b(inputs):
       return matmul(inputs.a['a'], inputs.b['b'])
 
     t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-    inputs = pair({'a': t}, {'b': t})
-    sq_op = a_times_b.get_concrete_function(inputs)
+    sq_op = a_times_b.get_concrete_function(
+        pair(dict(a=tensor_spec.TensorSpec([2, 2], dtypes.float32, 'a')),
+             dict(b=tensor_spec.TensorSpec([2, 2], dtypes.float32, 'b'))))
     self.assertEqual(sq_op.output_shapes, tensor_shape.TensorShape([2, 2]))
-    out = sq_op(inputs)
+    out = sq_op(a=t, b=t)
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testNestedOutputGraphFunction(self):
-    matmul = function.defun(math_ops.matmul)
+    matmul = def_function.function(math_ops.matmul)
 
-    @function.defun
+    @def_function.function
     def sq(a):
       return (matmul(a, a), {'b': constant_op.constant(1.0)})
 
@@ -286,23 +339,8 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(a, math_ops.matmul(t, t).numpy())
     self.assertAllEqual(b['b'].numpy(), 1.0)
 
-  def testGraphFunctionWithGradients(self):
-    v = resource_variable_ops.ResourceVariable(1.0, name='v')
-
-    @function.defun
-    def step():
-      def inner():
-        return v * v
-
-      return backprop.implicit_grad(inner)()[0][0]
-
-    step_op = step.get_concrete_function()
-    self.assertEqual(step_op.output_dtypes, dtypes.float32)
-    self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape([]))
-    self.assertAllEqual(step_op(), 2.0)
-
   def testGraphFunctionNoneOutput(self):
-    @function.defun
+    @def_function.function
     def fn(unused_a, unused_b):
       return None
 
@@ -312,34 +350,6 @@ class FunctionTest(test.TestCase):
     self.assertEqual(fn_op.output_shapes, None)
     self.assertAllEqual(fn_op(x, x), None)
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testDefunCondGradient(self):
-
-    @function.defun
-    def f(x):
-      return control_flow_ops.cond(x > 0.5, lambda: 2 * x, lambda: 3 * x)
-
-    with backprop.GradientTape() as t:
-      x = constant_op.constant(1.0)
-      t.watch(x)
-      y = f(x)
-    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 2.0)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testGraphLoopGradient(self):
-
-    @function.defun
-    def f(x):
-      return control_flow_ops.while_loop(lambda _, i: i < 2,
-                                         lambda x, i: (2*x, i + 1),
-                                         [x, 0])[0]
-
-    with backprop.GradientTape() as t:
-      x = constant_op.constant(1.0)
-      t.watch(x)
-      y = f(x)
-    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 4.0)
-
   def testDefunNumpyArraysConvertedToTensors(self):
 
     def f(x):
@@ -366,7 +376,7 @@ class FunctionTest(test.TestCase):
   def testDefunCapturedInt32(self):
     x = constant_op.constant(1, dtype=dtypes.int32)
 
-    @function.defun
+    @def_function.function
     def add_int32s():
       return x + x
 
@@ -375,7 +385,7 @@ class FunctionTest(test.TestCase):
   def testDefunReadVariable(self):
     v = resource_variable_ops.ResourceVariable(1.0)
 
-    @function.defun
+    @def_function.function
     def f():
       return v.read_value()
 
@@ -385,7 +395,7 @@ class FunctionTest(test.TestCase):
     v = resource_variable_ops.ResourceVariable(1.0)
     x = constant_op.constant(2.0)
 
-    @function.defun
+    @def_function.function
     def test_assign_add():
       v.assign_add(x)
       return v.read_value()
@@ -397,7 +407,7 @@ class FunctionTest(test.TestCase):
     error_msg = ('Tensor-typed variable initializers must either be '
                  'wrapped in an init_scope or callable.*')
 
-    @function.defun
+    @def_function.function
     def tensor_init():
       with self.assertRaisesRegexp(ValueError, error_msg):
         resource_variable_ops.ResourceVariable(constant_op.constant(2.0))
@@ -407,7 +417,7 @@ class FunctionTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testCallableTensorInitializationInFunction(self):
 
-    @function.defun
+    @def_function.function
     def tensor_init():
       self.v = resource_variable_ops.ResourceVariable(
           lambda: constant_op.constant(2.0))
@@ -418,20 +428,21 @@ class FunctionTest(test.TestCase):
       self.evaluate(variables.global_variables_initializer())
     self.assertEqual(self.evaluate(value), 2.0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.also_run_as_tf_function
   def testInitScopeTensorInitializationInFunction(self):
 
-    @function.defun
+    @def_function.function
     def tensor_init():
       with ops.init_scope():
         const = constant_op.constant(2.0)
+      # Note: this variable bypasses tf.function's variable creation
+      # requirements by bypassing variable_creator_scope by using
+      # ResourceVariable instead of Variable.
       self.v = resource_variable_ops.ResourceVariable(const)
       return self.v.read_value()
 
     value = tensor_init()
-    if not context.executing_eagerly():
-      self.evaluate(variables.global_variables_initializer())
-    self.assertEqual(self.evaluate(value), 2.0)
+    self.assertAllEqual(value, 2.0)
 
   def testDefunShapeInferenceWithCapturedResourceVariable(self):
     v = resource_variable_ops.ResourceVariable([[1, 2], [3, 4]])
@@ -444,7 +455,7 @@ class FunctionTest(test.TestCase):
       # ResourceVariable returns the read value and not the resource itself.
       return v._handle
 
-    compiled = function.defun(f)
+    compiled = def_function.function(f)
     var_handle = compiled()
     self.assertEqual(var_handle.dtype, dtypes.resource)
     self.assertEqual(var_handle.shape, tensor_shape.scalar())
@@ -478,7 +489,7 @@ class FunctionTest(test.TestCase):
         # ResourceVariable returns the read value and not the resource itself.
         return v._handle
 
-      compiled = function.defun(f)
+      compiled = def_function.function(f)
       var_handle = compiled()
       self.assertEqual(var_handle.dtype, dtypes.resource)
       self.assertEqual(var_handle.shape, tensor_shape.scalar())
@@ -495,7 +506,7 @@ class FunctionTest(test.TestCase):
         self.assertEqual(out.shape, tensor_shape.TensorShape([2, 2]))
 
       # Check that shape inference works while creating the defun
-      compiled = function.defun(f)
+      compiled = def_function.function(f)
       compiled()
 
   def testDefunShapeInferenceWithCapturedTensorListInGraphMode(self):
@@ -514,7 +525,7 @@ class FunctionTest(test.TestCase):
         self.assertEqual(value.shape, tensor_shape.scalar())
         return tl
 
-      compiled = function.defun(f)
+      compiled = def_function.function(f)
       output_tensor_list = compiled()
       _, value = list_ops.tensor_list_pop_back(
           output_tensor_list, element_dtype=dtypes.float32)
@@ -533,25 +544,28 @@ class FunctionTest(test.TestCase):
     self.assertIsInstance(
         self.v, resource_variable_ops.ResourceVariable)
 
-  def testDefunDifferentiable(self):
-    v = resource_variable_ops.ResourceVariable(1.0)
+  def disabled_testRunMetadata(self):
 
-    @function.defun
-    def f():
-      return v * v
-
-    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
-
-  def testDefunCanBeDifferentiatedTwice(self):
-    v = resource_variable_ops.ResourceVariable(1.0)
-
-    @function.defun
-    def f():
-      return v * v
+    @def_function.function
+    def f(x):
+      return x * x
 
-    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
-    # Ensure that v is watched again.
-    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+    with ops.device('cpu:0'):
+      context.enable_run_metadata()
+      f(constant_op.constant(1.0))
+    run_metadata = context.export_run_metadata()
+    context.disable_run_metadata()
+    step_stats = run_metadata.step_stats
+    self.assertGreater(len(step_stats.dev_stats), 0)
+    cpu_stats = step_stats.dev_stats[0]
+    self.assertEqual('/job:localhost/replica:0/task:0/device:CPU:0',
+                     cpu_stats.device)
+    # Testing for at least 2 because the function call should generate at most
+    # one entry in the step_stats; the ops inside function can generate
+    # arbitrarily many (placeholders, return identities, etc, might be included
+    # or not in the future, so shouldn't be tested for exactly.
+    self.assertGreaterEqual(len(cpu_stats.node_stats), 2)
+    self.assertEqual(len(run_metadata.partition_graphs), 1)
 
   def testGraphModeCaptureVariable(self):
     with context.graph_mode(), self.cached_session() as sess:
@@ -566,34 +580,18 @@ class FunctionTest(test.TestCase):
 
       o = HasAVar()
       variables.global_variables_initializer().run()
-      call = function.defun(o.call)
+      call = def_function.function(o.call)
       op = call()
-      self.assertAllEqual(sess.run(op), 2.0)
-
-  def testSymbolicGradientVariableZerosLike(self):
-    with ops.Graph().as_default():
-      v = resource_variable_ops.ResourceVariable(1.0)
-
-      @function.defun
-      def f(x, v):
-        v.read_value()
-        return x * x
-
-      x = constant_op.constant(1.0)
-      l = f(x, v)
-      _, dv = gradients_impl.gradients(l, [x, v])
-      with self.cached_session():
-        v.initializer.run()
-        self.assertAllEqual(dv.eval(), 0.0)
+      self.assertAllEqual(self.evaluate(op), 2.0)
 
   def testGraphModeManyFunctions(self):
-    with context.graph_mode(), self.cached_session():
+    with ops.Graph().as_default(), self.cached_session():
 
-      @function.defun
+      @def_function.function
       def f(x):
         return x * x
 
-      @function.defun
+      @def_function.function
       def g(x):
         return f(x) + 1
 
@@ -601,7 +599,7 @@ class FunctionTest(test.TestCase):
 
   def testDict(self):
 
-    @function.defun
+    @def_function.function
     def f(x):
       return {'name': x + 1}
 
@@ -609,7 +607,7 @@ class FunctionTest(test.TestCase):
 
   def testTensorConversionWithDefun(self):
 
-    @function.defun
+    @def_function.function
     def f(x):
       return math_ops.add(x, constant_op.constant(3))
 
@@ -617,59 +615,23 @@ class FunctionTest(test.TestCase):
 
   def testTensorConversionCall(self):
 
-    @function.defun
+    @def_function.function
     def f(x):
       return math_ops.add(x, constant_op.constant(3))
 
-    @function.defun
+    @def_function.function
     def g(x):
       return f(f(x))
 
     self.assertAllEqual(8, g(constant_op.constant(2)))
 
-  def testDefunCallBackprop(self):
-
-    @function.defun
-    def f(x):
-      return math_ops.add(x, x)
-
-    @function.defun
-    def g(x):
-      return backprop.gradients_function(f, [0])(x)[0]
-
-    self.assertAllEqual(2, g(constant_op.constant(2.)))
-
-  def testGraphModeEagerGradError(self):
-    with context.graph_mode():
-      def f():
-        x = variable_scope.get_variable(
-            'v', initializer=constant_op.constant(1.0))
-        return x * constant_op.constant(2.0)
-
-      with self.assertRaisesRegexp(ValueError,
-                                   'No trainable variables were accessed'):
-        backprop.implicit_val_and_grad(f)()
-
-  def testDefunCallBackpropUsingSameObjectForMultipleArguments(self):
-
-    @function.defun
-    def g(x):
-      return backprop.gradients_function(math_ops.multiply, [0, 1])(x, x)
-
-    def np_g(x):
-      return [d.numpy() for d in g(x)]
-
-    x = constant_op.constant(1.)
-    self.assertAllEqual([1., 1.], np_g(x))
-    self.assertAllEqual([1., 1.], np_g(1.))
-
   def testCallShape(self):
 
-    @function.defun
+    @def_function.function
     def f(x):
       return x + 1
 
-    @function.defun
+    @def_function.function
     def g(x):
       x = f(x)
       self.assertEqual(x.shape.as_list(), [])
@@ -680,50 +642,19 @@ class FunctionTest(test.TestCase):
   def testNestedDefunWithNoOutputAndTapedInput(self):
     three = resource_variable_ops.ResourceVariable(3.0, name='v')
 
-    @function.defun
+    @def_function.function
     def f(x):
       # This function intentionally takes a taped variable as input,
       # but does not return any values
       math_ops.add(x, three)
 
-    @function.defun
+    @def_function.function
     def g(x):
       y = math_ops.add(x, three)
       f(y)
 
     g(three)
 
-  def testGradientTensorConversionWithDefun(self):
-    three = resource_variable_ops.ResourceVariable(3.0, name='v')
-
-    @function.defun
-    def f(x):
-      return math_ops.add(x, three)
-
-    def g(x):
-      return f(x)
-
-    g = backprop.implicit_grad(g)(constant_op.constant(1.0))[0][0]
-    self.assertAllEqual(g, 1.0)
-
-  def testGradient(self):
-    matmul = function.defun(math_ops.matmul)
-
-    def sq(x):
-      return matmul(x, x, transpose_a=True)
-
-    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-    grad_t, = backprop.gradients_function(sq, [0])(t)
-    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
-
-  def testGradientInFunction(self):
-
-    @function.defun
-    def f(x):
-      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
-
-    self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
-
   def testGatherResourceWithDefun(self):
     with ops.device('cpu:0'):
       v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
@@ -731,36 +662,18 @@ class FunctionTest(test.TestCase):
     def sum_gather():
       return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
 
-    defined = function.defun(sum_gather)
+    defined = def_function.function(sum_gather)
     self.assertAllEqual(sum_gather(), defined())
 
-  def testGradientOfGatherWithDefun(self):
-    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
-
-    def sum_gather():
-      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
-
-    grad_fn = backprop.implicit_grad(sum_gather)
-    gradient = grad_fn()
-    defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
-    defun_gradient = defun_grad_fn()
-    self.assertEqual(len(gradient), len(defun_gradient))
-
-    gradient = gradient[0][0]
-    defun_gradient = defun_gradient[0][0]
-    self.assertAllEqual(gradient.values, defun_gradient.values)
-    self.assertAllEqual(gradient.indices, defun_gradient.indices)
-    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
-
   def testReturningIndexedSlicesWithDefun(self):
 
     def validate(indexed_slice):
-      @function.defun
+      @def_function.function
       def f():
         return indexed_slice
 
       output = f()
-      self.assertTrue(isinstance(output, ops.IndexedSlices))
+      self.assertIsInstance(output, ops.IndexedSlices)
       self.assertAllEqual(indexed_slice.values, output.values)
       self.assertAllEqual(indexed_slice.indices, output.indices)
       self.assertAllEqual(indexed_slice.dense_shape, output.dense_shape)
@@ -783,13 +696,13 @@ class FunctionTest(test.TestCase):
 
   def testIndexedSliceAsArgumentWithDefun(self):
 
-    @function.defun
+    @def_function.function
     def f(indexed_slice):
       return indexed_slice
 
     def validate(arg):
       output = f(arg)
-      self.assertTrue(isinstance(output, ops.IndexedSlices))
+      self.assertIsInstance(output, ops.IndexedSlices)
       self.assertAllEqual(arg.values, output.values)
       self.assertAllEqual(arg.indices, output.indices)
       self.assertAllEqual(arg.dense_shape, output.dense_shape)
@@ -812,7 +725,7 @@ class FunctionTest(test.TestCase):
       self.skipTest('No GPUs found')
 
     x = constant_op.constant([1.]).gpu()
-    f = function.defun(math_ops.add)
+    f = def_function.function(math_ops.add)
     y = f(x, x).cpu()
     self.assertAllEqual(y, [2.])
 
@@ -853,7 +766,7 @@ class FunctionTest(test.TestCase):
       v_gpu = resource_variable_ops.ResourceVariable(
           [0.0, 1.0, 2.0], name='gpu')
 
-    @function.defun
+    @def_function.function
     def resource_apply_adam():
       training_ops.resource_apply_adam(
           v_cpu.handle,
@@ -881,7 +794,7 @@ class FunctionTest(test.TestCase):
       self.skipTest('No GPUs found')
 
     # The Reshape op requires the shape tensor to be placed in host memory.
-    reshape = function.defun(array_ops.reshape)
+    reshape = def_function.function(array_ops.reshape)
     value = constant_op.constant([1., 2.]).gpu()
     shape = constant_op.constant([2, 1])
     reshaped = reshape(value, shape).cpu()
@@ -892,80 +805,14 @@ class FunctionTest(test.TestCase):
       self.skipTest('No GPUs found')
 
     # The Reshape op requires the shape tensor to be placed in host memory.
-    reshape = function.defun(array_ops.reshape)
+    reshape = def_function.function(array_ops.reshape)
     value = constant_op.constant([1., 2.])
     shape = constant_op.constant([2, 1]).gpu()
     reshape(value, shape)  # No error is raised
 
-  def testDifferentiableFunctionNoneOutputs(self):
-
-    @function.defun
-    def my_function(x):
-      return x, None
-
-    def wrapper(x):
-      return my_function(x)[0]
-
-    g = backprop.gradients_function(wrapper, [0])(constant_op.constant(0.0))
-    self.assertAllEqual(g[0], 1.)
-
-    @function.defun
-    def foo(a):
-      return None, a * a
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      none, r = foo(x)
-    g = tp.gradient(r, x)
-
-    self.assertIs(none, None)
-    self.assertAllEqual(r, 25.0)
-    self.assertAllEqual(g, 2 * 5.0)
-
-  def testNestedDifferentiableFunction(self):
-    @function.defun
-    def inner_fn(a, b):
-      return a * math_ops.add(a, b)
-
-    @function.defun
-    def outer_fn(x):
-      return inner_fn(x, 1.0)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  def testNestedDifferentiableFunctionNoneOutputs(self):
-    @function.defun
-    def foo(a, b):
-      return None, a * math_ops.add(a, b), None, 2*a
-
-    @function.defun
-    def bar(x):
-      return foo(x, 1.0)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape(persistent=True) as tp:
-      tp.watch(x)
-      none1, r1, none2, r2 = bar(x)
-    g1 = tp.gradient(r1, x)
-    g2 = tp.gradient(r2, x)
-
-    self.assertAllEqual(r1, 30.0)
-    self.assertAllEqual(r2, 10.0)
-    self.assertIs(none1, None)
-    self.assertIs(none2, None)
-    self.assertAllEqual(g1, 2 * 5.0 + 1.0)
-    self.assertAllEqual(g2, 2.0)
-
   def testNoneOutput(self):
 
-    @function.defun
+    @def_function.function
     def my_function(_):
       return None
 
@@ -978,7 +825,7 @@ class FunctionTest(test.TestCase):
     def add(a, b):
       return math_ops.add(a, b)
 
-    @function.defun
+    @def_function.function
     def add_one(x):
       return add(x, 1)
 
@@ -987,11 +834,11 @@ class FunctionTest(test.TestCase):
   def testVariableCaptureInNestedFunctions(self):
     v = resource_variable_ops.ResourceVariable(1, dtype=dtypes.int32)
 
-    @function.defun
+    @def_function.function
     def inner_read():
       return v.read_value()
 
-    @function.defun
+    @def_function.function
     def outer():
       return inner_read()
 
@@ -1000,7 +847,7 @@ class FunctionTest(test.TestCase):
   def testReturnCapturedEagerTensor(self):
     t = constant_op.constant(1)
 
-    @function.defun
+    @def_function.function
     def read():
       return t
 
@@ -1010,20 +857,20 @@ class FunctionTest(test.TestCase):
     with context.graph_mode(), self.cached_session():
       t = constant_op.constant(1)
 
-      @function.defun
+      @def_function.function
       def read():
         return t
 
       self.assertEqual(1, int(self.evaluate(read())))
 
   def testSequenceInputs(self):
-    clip_by_global_norm = function.defun(clip_ops.clip_by_global_norm)
+    clip_by_global_norm = def_function.function(clip_ops.clip_by_global_norm)
     t_list = [constant_op.constant(1.0), constant_op.constant(2.0)]
     clipped_list, global_norm = clip_by_global_norm(t_list,
                                                     constant_op.constant(.2))
     for t in clipped_list:
-      self.assertTrue(isinstance(t, ops.Tensor))
-    self.assertTrue(isinstance(global_norm, ops.Tensor))
+      self.assertIsInstance(t, ops.Tensor)
+    self.assertIsInstance(global_norm, ops.Tensor)
 
   def testNestedSequenceInputs(self):
 
@@ -1033,7 +880,7 @@ class FunctionTest(test.TestCase):
       g, h = e
       return [a + a, [tuple([f + f, g + g]), h + h], c + c], a + f + g + h + c
 
-    my_eager_op = function.defun(my_op)
+    my_eager_op = def_function.function(my_op)
     ret = my_eager_op([
         constant_op.constant(1), [(constant_op.constant(2),
                                    constant_op.constant(3)),
@@ -1044,13 +891,13 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(ret[0][0], 2)
     self.assertAllEqual(ret[0][1][0][0], 8)
     self.assertAllEqual(ret[0][1][0][1], 4)
-    self.assertTrue(isinstance(ret[0][1][0], tuple))
+    self.assertIsInstance(ret[0][1][0], tuple)
     self.assertAllEqual(ret[0][1][1], 6)
     self.assertAllEqual(ret[0][2], 10)
     self.assertAllEqual(ret[1], 15)
 
   def testVariableNamesRespectNameScopesWithDefun(self):
-    @function.defun
+    @def_function.function
     def create_variable():
       with ops.name_scope('foo'):
         v = resource_variable_ops.ResourceVariable(0.0, name='bar')
@@ -1060,7 +907,7 @@ class FunctionTest(test.TestCase):
 
   def testVariableNamesRespectNameScopesWithDefunInGraph(self):
     with context.graph_mode():
-      @function.defun
+      @def_function.function
       def create_variable():
         with ops.name_scope('foo'):
           v = resource_variable_ops.ResourceVariable([1.0, 2.0], name='bar')
@@ -1087,7 +934,7 @@ class FunctionTest(test.TestCase):
     if not context.executing_eagerly():
       self.evaluate(variables.global_variables_initializer())
 
-    self.assertAllEqual([[[[4.0]]]], self.evaluate(y))
+    self.assertAllClose([[[[4.0]]]], self.evaluate(y))
 
     # Remove reference cycles in model
     test_util.dismantle_polymorphic_function(model)
@@ -1116,20 +963,17 @@ class FunctionTest(test.TestCase):
   # construction. Eager's configuration is controlled in `__main__`.
   @test_util.run_in_graph_and_eager_modes(
       config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  @test_util.run_v1_only('b/120545219')
   def testDeviceAnnotationsRespected(self):
 
     def multi_device_fn():
       with ops.device('/cpu:0'):
-        s0 = iterator_ops.Iterator.from_structure(
-            (dtypes.float32,)).string_handle()
+        s0 = test_ops.device_placement_op()
       with ops.device('/cpu:1'):
-        s1 = iterator_ops.Iterator.from_structure(
-            (dtypes.float32,)).string_handle()
+        s1 = test_ops.device_placement_op()
       with ops.device('/cpu:2'):
-        s2 = iterator_ops.Iterator.from_structure(
-            (dtypes.float32,)).string_handle()
-      s3 = iterator_ops.Iterator.from_structure(
-          (dtypes.float32,)).string_handle()
+        s2 = test_ops.device_placement_op()
+      s3 = test_ops.device_placement_op()
       return s0, s1, s2, s3
 
     defined = function.defun(multi_device_fn)
@@ -1141,24 +985,25 @@ class FunctionTest(test.TestCase):
 
     with ops.device('/cpu:3'):
       outputs = self.evaluate(defined())
-    self.assertEqual(len(defined._function_cache), 2)
+    # All function definitions are agnostic to call site devices.
+    self.assertEqual(len(defined._function_cache), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
     self.assertIn(compat.as_bytes('CPU:3'), outputs[3])
 
-    # This should retrieve the call-site-device agnostic function
-    defined()
-    self.assertEqual(len(defined._function_cache), 2)
-
-    # And this should retrieve the function created for '/cpu:3'
-    with ops.device('/cpu:3'):
-      defined()
-    self.assertEqual(len(defined._function_cache), 2)
+    with ops.device('/cpu:0'):
+      outputs = self.evaluate(defined())
+    self.assertEqual(len(defined._function_cache), 1)
+    self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
+    self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
+    self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
+    self.assertIn(compat.as_bytes('CPU:0'), outputs[3])
 
   @test_util.run_in_graph_and_eager_modes(
       config=config_pb2.ConfigProto(device_count={'CPU': 2}))
-  def testCallingGraphFunctionOnIncompatibleDeviceRaisesError(self):
+  @test_util.run_v1_only('b/120545219')
+  def testCallingGraphFunctionOnDifferentDevice(self):
 
     def func():
       return constant_op.constant(0)
@@ -1171,33 +1016,18 @@ class FunctionTest(test.TestCase):
       self.assertEqual(
           self.evaluate(cpu_graph_function()), self.evaluate(func()))
 
-    with self.assertRaisesRegexp(
-        ValueError,
-        'The current device stack does not match the device stack under '
-        'which the TensorFlow function \'.*func.*\' was created.\n'
-        'Current device stack: .*\n.*func.* device stack.*'):
-      with ops.device('cpu:1'):
-        cpu_graph_function()
+    with ops.device('cpu:1'):
+      self.assertEqual(0., self.evaluate(cpu_graph_function()))
 
-    with self.assertRaisesRegexp(
-        ValueError,
-        'The current device stack does not match the device stack under '
-        'which the TensorFlow function \'.*func.*\' was created.\n'
-        'Current device stack: .*\n.*func.* device stack.*'):
-      with ops.device(None):
-        cpu_graph_function()
+    with ops.device(None):
+      self.assertEqual(0., self.evaluate(cpu_graph_function()))
 
     default_graph_function = defined.get_concrete_function()
     self.assertEqual(
         self.evaluate(default_graph_function()), self.evaluate(func()))
 
-    with self.assertRaisesRegexp(
-        ValueError,
-        'The current device stack does not match the device stack under '
-        'which the TensorFlow function \'.*func.*\' was created.\n'
-        'Current device stack: .*\n.*func.* device stack.*'):
-      with ops.device('cpu:1'):
-        default_graph_function()
+    with ops.device('cpu:1'):
+      self.assertEqual(0., self.evaluate(default_graph_function()))
 
   @test_util.run_in_graph_and_eager_modes
   def testColocateWithRespected(self):
@@ -1211,10 +1041,9 @@ class FunctionTest(test.TestCase):
     with ops.device('gpu:0'):
       y = constant_op.constant(1.0)
 
-    @function.defun
+    @def_function.function
     def foo():
-      return iterator_ops.Iterator.from_structure(
-          (dtypes.float32,)).string_handle()
+      return test_ops.device_placement_op()
 
     with ops.colocate_with(x):
       self.assertIn(compat.as_bytes('CPU:0'), self.evaluate(foo()))
@@ -1228,7 +1057,7 @@ class FunctionTest(test.TestCase):
     def foo(x):
       return v * x
 
-    defined = function.defun(foo)
+    defined = def_function.function(foo)
 
     x = constant_op.constant([1.0])
     self.assertEqual(1., self.evaluate(defined(x)))
@@ -1352,9 +1181,13 @@ class FunctionTest(test.TestCase):
     signature = [tensor_spec.TensorSpec(shape=(2,), dtype=dtypes.float32)]
     defined = function.defun(foo, input_signature=signature)
     a = array_ops.ones([2])
-    out = defined(a)
+    self.assertAllEqual(a, defined(a))
+    self.assertEqual(len(defined._function_cache), 1)
+    self.assertAllEqual(a, defined.get_concrete_function()(a))
+    self.assertAllEqual(a, defined.get_concrete_function(a)(a))
+    self.assertAllEqual(a, defined.get_concrete_function(
+        tensor_spec.TensorSpec((2,), dtype=dtypes.float32))(a))
     self.assertEqual(len(defined._function_cache), 1)
-    self.assertAllEqual(out, a)
 
     def bar(a):
       self.assertEqual(a._shape_tuple(), (2, None))
@@ -1418,7 +1251,7 @@ class FunctionTest(test.TestCase):
     a = array_ops.ones([2, 3])
     b = array_ops.ones([1])
     inputs = {'a': a, 'b': a, 'c': b}
-    defined = function.defun(bar, input_signature=signature)
+    defined = def_function.function(bar, input_signature=signature)
     out = defined(inputs)
     nest.assert_same_structure(out, inputs)
     self.assertAllEqual(out['a'], inputs['a'])
@@ -1434,7 +1267,7 @@ class FunctionTest(test.TestCase):
     # Signatures must consist exclusively of `TensorSpec` objects.
     signature = [(2, 3), tensor_spec.TensorSpec([2, 3], dtypes.float32)]
     with self.assertRaisesRegexp(TypeError, 'Invalid input_signature.*'):
-      function.defun(foo, input_signature=signature)
+      def_function.function(foo, input_signature=signature)
 
     # Signatures must be either lists or tuples on their outermost levels.
     signature = {'t1': tensor_spec.TensorSpec([], dtypes.float32)}
@@ -1448,7 +1281,7 @@ class FunctionTest(test.TestCase):
       return a
 
     signature = [tensor_spec.TensorSpec(shape=(2,), dtype=dtypes.float32)]
-    defined = function.defun(foo, input_signature=signature)
+    defined = def_function.function(foo, input_signature=signature)
 
     # Invalid shapes.
     with self.assertRaisesRegexp(ValueError, 'Python inputs incompatible.*'):
@@ -1458,13 +1291,19 @@ class FunctionTest(test.TestCase):
       defined(array_ops.ones([2, 1]))
 
     # Wrong number of arguments.
-    with self.assertRaisesRegexp(ValueError,
-                                 'Structure of Python function inputs.*'):
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Arguments and signature arguments do not match.*'):
       defined(array_ops.ones([2]), array_ops.ones([2]))
     with self.assertRaisesRegexp(ValueError,
                                  'Structure of Python function inputs.*'):
       defined()
 
+    with self.assertRaisesRegexp(ValueError,
+                                 'inputs incompatible with input_signature'):
+      defined.get_concrete_function(
+          tensor_spec.TensorSpec(shape=(3,), dtype=dtypes.float32))
+
   def testInputSignatureForFunctionWithNonTensorInputsNotAllowed(self):
 
     def foo(a, training=True):
@@ -1473,12 +1312,16 @@ class FunctionTest(test.TestCase):
       else:
         return -1.0 * a
 
-    signature = [tensor_spec.TensorSpec([], dtypes.float32)] * 2
-    defined = function.defun(foo, input_signature=signature)
+    signature = [
+        tensor_spec.TensorSpec([], dtypes.float32),
+        tensor_spec.TensorSpec([], dtypes.bool),
+    ]
+    defined = def_function.function(foo, input_signature=signature)
     a = constant_op.constant(1.0)
     with self.assertRaisesRegexp(
-        ValueError, 'When input_signature is provided, '
-        'all inputs to the Python function must be Tensors.'):
+        ValueError,
+        'When input_signature is provided, all inputs to '
+        'the Python function must be Tensors.'):
       defined(a, training=True)
 
   def testInputSignatureWithKeywordPositionalArgs(self):
@@ -1569,33 +1412,6 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(six, 2.0)
     self.assertAllEqual(seven, 2.0)
 
-  def testGradientWithKeywordArguments(self):
-    matmul = function.defun(math_ops.matmul)
-
-    def sq(x):
-      return matmul(a=x, b=x, transpose_a=True)
-
-    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-    grad_t, = backprop.gradients_function(sq, [0])(t)
-    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
-
-    with backprop.GradientTape(persistent=True) as tape:
-      tape.watch(t)
-      one = matmul(t, b=t, transpose_a=True)
-      two = matmul(b=t, a=t, transpose_a=True)
-      three = matmul(a=t, b=t, transpose_a=True)
-
-    for output in [one, two, three]:
-      self.assertAllEqual(tape.gradient(output, t), [[6, 6], [14, 14]])
-
-  def testGradientInFunctionWithKeywordArguments(self):
-
-    @function.defun
-    def f(x):
-      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
-
-    self.assertAllEqual(f(x=constant_op.constant(1.0)), 2.0)
-
   def testDefuningInstanceMethod(self):
 
     integer = constant_op.constant(2, dtypes.int64)
@@ -1605,7 +1421,7 @@ class FunctionTest(test.TestCase):
       def one(self, tensor):
         return tensor
 
-      @function.defun
+      @def_function.function
       def two(self, tensor, other=integer):
         return self.one(tensor), other
 
@@ -1621,7 +1437,7 @@ class FunctionTest(test.TestCase):
 
     class Foo(object):
 
-      @function.defun
+      @def_function.function
       def func(self, other=integer):
         return other
 
@@ -1631,7 +1447,7 @@ class FunctionTest(test.TestCase):
   def testPythonCallWithSideEffects(self):
     state = []
 
-    @function.defun
+    @def_function.function
     def side_effecting_function():
       state.append(0)
 
@@ -1708,7 +1524,7 @@ class FunctionTest(test.TestCase):
           t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
           add(t, t)
 
-  def testRegisterFunction(self):
+  def testRegisterPolymorphicFunction(self):
     @function.defun
     def add(x, y):
       return math_ops.add(x, y)
@@ -1744,17 +1560,17 @@ class FunctionTest(test.TestCase):
                                    expected_func_name_regex[i])
 
         # Check the forward and backward function has the correct attributes.
-        self.assertEquals(
+        self.assertEqual(
             functions[1].definition.attr['backward_function_name'].s,
             functions[2].name)
-        self.assertEquals(
+        self.assertEqual(
             functions[2].definition.attr['forward_function_name'].s,
             functions[1].name)
 
-        self.assertEquals(
+        self.assertEqual(
             functions[4].definition.attr['backward_function_name'].s,
             functions[5].name)
-        self.assertEquals(
+        self.assertEqual(
             functions[5].definition.attr['forward_function_name'].s,
             functions[4].name)
 
@@ -1767,8 +1583,67 @@ class FunctionTest(test.TestCase):
         self.assertEqual(len(graph._functions), 6)
         functions = list(graph._functions.values())
         for i in range(len(functions)):
-          self.assertEquals(captured_function_names[i],
-                            functions[i].definition.signature.name)
+          self.assertEqual(captured_function_names[i],
+                           functions[i].definition.signature.name)
+
+  @parameterized.named_parameters(
+      dict(testcase_name='Defun',
+           function_decorator=function.defun),
+      dict(testcase_name='DefFunction',
+           function_decorator=def_function.function))
+  def testRegisterConcreteFunction(self, function_decorator):
+    @function_decorator
+    def py_add(x, y):
+      return math_ops.add(x, y)
+
+    py_add(array_ops.ones([]), array_ops.ones([]))
+    add = py_add.get_concrete_function(
+        tensor_spec.TensorSpec(None, dtypes.float32),
+        tensor_spec.TensorSpec(None, dtypes.float32))
+
+    @function_decorator
+    def py_composite(x, y):
+      return x, add(x, y)
+
+    py_composite(array_ops.ones([]), array_ops.ones([]))
+    composite = py_composite.get_concrete_function(
+        tensor_spec.TensorSpec(None, dtypes.float32),
+        tensor_spec.TensorSpec(None, dtypes.float32))
+
+    with context.graph_mode(), self.cached_session():
+      with ops.get_default_graph().as_default():
+        t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+        composite.add_to_graph(register_gradient_functions=True)
+
+        graph = ops.get_default_graph()
+        # pylint: disable=protected-access
+        self.assertEqual(len(graph._functions), 6)
+        # two sets of functions, each of them are (inference, forward, backward)
+        functions = list(graph._functions.values())
+        captured_function_names = [
+            f.definition.signature.name for f in functions
+        ]
+        expected_func_name_regex = [
+            '.*inference.*py_composite.*',
+            '.*inference.*py_add.*',
+            '.*forward.*py_composite.*',
+            '.*forward.*py_add.*',
+            '.*inference.*backward.*py_composite.*',
+            '.*inference.*backward.*py_add.*',
+        ]
+        for expected, found in zip(
+            expected_func_name_regex,
+            captured_function_names):
+          self.assertRegexpMatches(found, expected)
+
+        composite_t, composite_double = composite(t, t)
+        double = add(t, t)
+        self.assertAllEqual([[2, 4], [6, 8]], self.evaluate(double))
+        self.assertAllEqual([[2, 4], [6, 8]], self.evaluate(composite_double))
+        self.assertAllEqual([[1, 2], [3, 4]], self.evaluate(composite_t))
+        # Make sure the pre registered function is used, and no other function
+        # is added.
+        self.assertEqual(len(graph._functions), 6)
 
   def testRegisterFunctionWithInputSignature(self):
     def matmul(x, y):
@@ -1788,11 +1663,10 @@ class FunctionTest(test.TestCase):
         # pylint: disable=protected-access
         self.assertEqual(len(graph._functions), 3)
 
-        # Test input param shape mismatch
-        t2 = constant_op.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
-        with self.assertRaisesRegexp(
-            ValueError, 'Python inputs incompatible with input_signature'):
-          function.register(defun_matmul, t2, t2)
+        # Test register function with cache, note inputs are ignored.
+        function.register(defun_matmul)
+        graph = ops.get_default_graph()
+        self.assertEqual(len(graph._functions), 3)
 
   def testRegisterFunctionWithCache(self):
     def matmul(x, y):
@@ -1848,8 +1722,7 @@ class FunctionTest(test.TestCase):
                                  'be Tensors;.*'):
       graph_function('Not a Tensor.')
 
-  # TODO(scottzhu): Revive the test once the grappler plugin is updated.
-  def disabled_testSwapImplementationWithGrapplerPlugin(self):
+  def testSwapImplementationWithGrapplerPlugin(self):
     rewrites = rewriter_config_pb2.RewriterConfig()
     # function_optimizer has to be turn off, otherwise it will delete the
     # registered function if it does not get called.
@@ -1886,13 +1759,13 @@ class FunctionTest(test.TestCase):
 
       function.register(cpu_boost, x)
       y = gpu_boost(x)
-      y_value = sess.run(y)
+      y_value = self.evaluate(y)
 
       if test.is_gpu_available():
-        self.assertEquals(y_value, 5.0)
+        self.assertEqual(y_value, 5.0)
       else:
         # Grappler fallback to use the CPU impl even called with GPU function.
-        self.assertEquals(y_value, 3.0)
+        self.assertEqual(y_value, 3.0)
 
   def testDefunFunctionSeparateGraphs(self):
     with context.graph_mode():
@@ -1924,248 +1797,53 @@ class FunctionTest(test.TestCase):
         self.assertEqual(len(maybe_add._function_cache), 3)
         self.assertEqual(len(add._function_cache), 2)
 
+  def testDecoratedMethod(self):
+    m = DefunnedMiniModel()
+    instance_call_one = m.call(array_ops.ones([1, 2]), training=True)
+    instance_call_two = m.call(
+        inputs=array_ops.ones([1, 2]), training=True)
+    class_call = DefunnedMiniModel.call(m, array_ops.ones([1, 2]),
+                                        training=True)
+    self.assertAllEqual(instance_call_one, instance_call_two)
+    self.assertAllEqual(instance_call_one, class_call)
+
+  def testDecoratedMethodUniquePolymorphicFuncPerInstance(self):
+    m = DefunnedMiniModel()
+    n = DefunnedMiniModel()
 
-@test_util.with_c_shapes
-class AutomaticControlDependenciesTest(test.TestCase):
-
-  def testBasic(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
-      with function.AutomaticControlDependencies() as c:
-        v.assign(v + 1)
-        v.assign(2 * v)
-        val = v.read_value()
-        val = c.mark_as_return(val)
-      self.assertAllEqual(val.eval(), 4.0)
-
-  def testCondMustRun(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
-      p = array_ops.placeholder(dtype=dtypes.bool)
-      with function.AutomaticControlDependencies() as c:
-
-        def true_fn():
-          v.assign(v + 1)
-          return 0.0
-
-        def false_fn():
-          v.assign(v + 4)
-          return 1.0
-
-        control_flow_ops.cond(p, true_fn, false_fn)
-        val = v.read_value()
-        val = c.mark_as_return(val)
-      self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
-      self.assertAllEqual(val.eval(feed_dict={p: True}), 6.0)
-
-  def testCondMustRunSeparateRead(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
-      p = array_ops.placeholder(dtype=dtypes.bool)
-      with function.AutomaticControlDependencies() as c:
-
-        def true_fn():
-          v.assign(v + 1)
-          return 0.0
-
-        def false_fn():
-          v.assign(v + 4)
-          return 1.0
-
-        control_flow_ops.cond(p, true_fn, false_fn)
-        one = constant_op.constant(1.0)
-        one = c.mark_as_return(one)
-      one.eval(feed_dict={p: False})
-      self.assertAllEqual(v.read_value().eval(), 5.0)
-      one.eval(feed_dict={p: True})
-      self.assertAllEqual(v.read_value().eval(), 6.0)
-
-  def testCondNested(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
-      p = array_ops.placeholder(dtype=dtypes.bool)
-      q = array_ops.placeholder(dtype=dtypes.bool)
-      with function.AutomaticControlDependencies() as c:
-
-        def true_fn():
-          v.assign(v + 1, name='true')
-          return 1.0
-
-        def false_fn():
-
-          def inner_true_fn():
-            v.assign(v * 2, name='false_true')
-            return 2.0
-
-          def inner_false_fn():
-            v.assign(v * 3, name='false_false')
-            return 3.0
-
-          control_flow_ops.cond(q, inner_true_fn, inner_false_fn)
-          return 1.0
-
-        control_flow_ops.cond(p, true_fn, false_fn)
-        with ops.name_scope('final'):
-          val = v.read_value()
-        val = c.mark_as_return(val)
-      self.assertAllEqual(val.eval(feed_dict={p: False, q: False}), 3.0)
-      self.assertAllEqual(val.eval(feed_dict={p: False, q: True}), 6.0)
-      self.assertAllEqual(val.eval(feed_dict={p: True, q: True}), 7.0)
-      self.assertAllEqual(val.eval(feed_dict={p: True, q: False}), 8.0)
-
-  def testCondOneBranch(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
-      p = array_ops.placeholder(dtype=dtypes.bool)
-      with function.AutomaticControlDependencies() as c:
-
-        def true_fn():
-          return 0.0
-
-        def false_fn():
-          v.assign(v + 4)
-          return 1.0
-
-        control_flow_ops.cond(p, true_fn, false_fn)
-        val = v.read_value()
-        val = c.mark_as_return(val)
-      self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
-      self.assertAllEqual(val.eval(feed_dict={p: True}), 5.0)
-
-  def testCondOneBranchUpdateBefore(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
-      p = array_ops.placeholder(dtype=dtypes.bool)
-      with function.AutomaticControlDependencies() as c:
-        v.assign(v * 2)
-
-        def true_fn():
-          return 0.0
-
-        def false_fn():
-          v.assign(v + 4)
-          return 1.0
-
-        control_flow_ops.cond(p, true_fn, false_fn)
-        val = v.read_value()
-        val = c.mark_as_return(val)
-      self.assertAllEqual(val.eval(feed_dict={p: False}), 6.0)
-      self.assertAllEqual(val.eval(feed_dict={p: True}), 12.0)
-
-  def testCondOneBranchUpdateAfter(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
-      p = array_ops.placeholder(dtype=dtypes.bool)
-      with function.AutomaticControlDependencies() as c:
-
-        def true_fn():
-          return 0.0
-
-        def false_fn():
-          v.assign(v + 4)
-          return 1.0
-
-        control_flow_ops.cond(p, true_fn, false_fn)
-        v.assign(v * 2)
-        val = v.read_value()
-        val = c.mark_as_return(val)
-      self.assertAllEqual(val.eval(feed_dict={p: False}), 10.0)
-      self.assertAllEqual(val.eval(feed_dict={p: True}), 20.0)
-
-  def testDefunWhileLoopWithCapturedLoopVars(self):
-    n = 3
-    x = constant_op.constant(list(range(n)))
-
-    @function.defun
-    def loop():
-      c = lambda i, x: i < n
-      b = lambda i, x: (i + 1, x + 1)
-      i, out = control_flow_ops.while_loop(c, b, (0, x))
-      return i, out
-
-    i, out = loop()
-    self.assertEqual(int(i), 3)
-    self.assertAllEqual(out, [3, 4, 5])
-
-  def testDecorator(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
-
-      @function.automatic_control_dependencies
-      def f():
-        v.assign(v + 1)
-        v.assign(2 * v)
-        return v.read_value()
-
-      self.assertAllEqual(f().eval(), 4.0)
-
-  def testOptimizerInDefun(self):
-    def loss(v):
-      return v**2
-
-    optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
-
-    @function.defun
-    def train():
-      self.v = resource_variable_ops.ResourceVariable(1.0)
-      grad = backprop.implicit_grad(loss)(self.v)
-      optimizer.apply_gradients(grad)
-      return self.v.read_value()
-
-    value = train()
-    self.assertEqual(value.numpy(), -1.0)
-
-  def testReturningNonTensorRaisesError(self):
-    optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
-    optimizer.apply_gradients = function.defun(optimizer.apply_gradients)
-    v = resource_variable_ops.ResourceVariable(1.0)
-    grad = backprop.implicit_grad(lambda v: v**2)(v)
-
-    with self.assertRaisesRegexp(TypeError,
-                                 '.*must return zero or more Tensors.*'):
-      # TODO(akshayka): We might want to allow defun-ing Python functions
-      # that return operations (and just execute the op instead of running it).
-      optimizer.apply_gradients(grad)
-
-  # TODO(b/111663004): This should work when the outer context is graph
-  # building.
-  def testOptimizerNonSlotVarsInDefunNoError(self):
-    def loss(v):
-      return v**2
-
-    optimizer = adam.AdamOptimizer(learning_rate=1.0)
-
-    @function.defun
-    def train():
-      self.v = resource_variable_ops.ResourceVariable(1.0)
-      grad = backprop.implicit_grad(loss)(self.v)
-      optimizer.apply_gradients(grad)
-      return self.v.read_value()
+    class_method_one = DefunnedMiniModel.call
+    class_method_two = DefunnedMiniModel.call
 
-    train()
+    m_method_one = m.call
+    m_method_two = m.call
 
-  def testOptimizerInDefunWithCapturedVariable(self):
-    v = resource_variable_ops.ResourceVariable(1.0)
-    def loss():
-      return v**2
+    n_method_one = n.call
+    n_method_two = n.call
 
-    optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
+    self.assertEqual(class_method_one, class_method_two)
+    self.assertEqual(m_method_one, m_method_two)
+    self.assertEqual(n_method_one, n_method_two)
+    self.assertNotEqual(m.call, n.call)
 
-    @function.defun
-    def train():
-      grad = backprop.implicit_grad(loss)()
-      optimizer.apply_gradients(grad)
+  def testDecoratedMethodInspect(self):
+    m = DefunnedMiniModel()
+    fullargspec = tf_inspect.getfullargspec(m.call)
+    self.assertIn('training', fullargspec.args)
 
-    train()
-    self.assertEqual(v.numpy(), -1.0)
+  def testDecoratedMethodGetConcreteFunction(self):
+    m = DefunnedMiniModel()
+    instance_call_one = m.call.get_concrete_function(
+        array_ops.ones([1, 2]), training=False)
+    instance_call_two = m.call.get_concrete_function(
+        inputs=array_ops.ones([1, 2]), training=False)
+    self.assertAllEqual(instance_call_one(array_ops.ones([1, 2])),
+                        instance_call_two(array_ops.ones([1, 2])))
+
+    # Also make sure get_concrete_function works on the class method
+    DefunnedMiniModel.call.get_concrete_function(
+        m, array_ops.ones([1, 2]), training=False)
+    DefunnedMiniModel.call.get_concrete_function(
+        m, inputs=array_ops.ones([1, 2]), training=True)
 
   def testFunctionModifiesInputList(self):
     # Tests on `list` methods that do in place modification, except `list.sort`
@@ -2182,7 +1860,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def append(l):
         l.append(constant_op.constant(0.))
 
@@ -2190,7 +1868,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def extend(l):
         l.extend([constant_op.constant(0.)])
 
@@ -2198,7 +1876,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def insert(l):
         l.insert(0, constant_op.constant(0.))
 
@@ -2206,7 +1884,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def pop(l):
         l.pop()
 
@@ -2214,7 +1892,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def reverse(l):
         l.reverse()
 
@@ -2222,7 +1900,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def remove(l):
         l.remove(l[0])
 
@@ -2233,7 +1911,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
       with self.assertRaisesRegexp(ValueError, expected_msg):
 
-        @function.defun
+        @def_function.function
         def clear(l):
           l.clear()
 
@@ -2242,7 +1920,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
     # One last test for keyword arguments
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def kwdappend(**kwargs):
         l = kwargs['l']
         l.append(constant_op.constant(0.))
@@ -2262,7 +1940,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def clear(m):
         m.clear()
 
@@ -2270,7 +1948,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def pop(m):
         m.pop('t1')
 
@@ -2278,7 +1956,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def popitem(m):
         m.popitem()
 
@@ -2286,7 +1964,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def update(m):
         m.update({'t1': constant_op.constant(3.)})
 
@@ -2294,7 +1972,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def setdefault(m):
         m.setdefault('t3', constant_op.constant(3.))
 
@@ -2310,7 +1988,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
 
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
-      @function.defun
+      @def_function.function
       def modify(n):
         n[0]['t1'].append(constant_op.constant(1.))
 
@@ -2325,7 +2003,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, expected_msg):
 
       # The flat list doesn't change whereas the true structure changes
-      @function.defun
+      @def_function.function
       def modify_same_flat(n):
         n[0].append(n[1].pop(0))
 
@@ -2343,6 +2021,26 @@ class AutomaticControlDependenciesTest(test.TestCase):
     del m
     self.assertEqual([], list(weak_variables))
 
+  def testExecutorType(self):
+    @function.defun
+    def add_five(x):
+      return x + 5
+
+    self.assertEqual(
+        5,
+        add_five(constant_op.constant(0, dtype=dtypes.int32)).numpy())
+
+    with self.assertRaisesRegexp(errors.NotFoundError, 'NON_EXISTENT_EXECUTOR'):
+      with context.function_executor_type('NON_EXISTENT_EXECUTOR'):
+        add_five(constant_op.constant(0, dtype=dtypes.int32))
+
+    for executor_type in ('', 'DEFAULT', None):
+      with context.function_executor_type(executor_type):
+        self.assertAllEqual(
+            5,
+            add_five(constant_op.constant(0, dtype=dtypes.int32)).numpy())
+
+
 if __name__ == '__main__':
   ops.enable_eager_execution(
       config=config_pb2.ConfigProto(device_count={'CPU': 4}))
diff --git a/tensorflow/python/eager/graph_only_ops_test.py b/tensorflow/python/eager/graph_only_ops_test.py
index 3cf3a61a62b1b22f092ad505017fd54f278b3f95..914b4d9a95ab307a41d1a3c0dba453475edc3956 100644
--- a/tensorflow/python/eager/graph_only_ops_test.py
+++ b/tensorflow/python/eager/graph_only_ops_test.py
@@ -29,12 +29,14 @@ from tensorflow.python.platform import test
 
 class GraphOnlyOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testGraphZerosLike(self):
     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
     z_tf = graph_only_ops.graph_zeros_like(x)
     with self.cached_session():
-      self.assertAllClose(np.zeros((2, 3)), z_tf.eval())
+      self.assertAllClose(np.zeros((2, 3)), self.evaluate(z_tf))
 
+  @test_util.run_deprecated_v1
   def testGraphPlaceholder(self):
     x_tf = graph_only_ops.graph_placeholder(dtypes.int32, shape=(1,))
     y_tf = math_ops.square(x_tf)
diff --git a/tensorflow/python/eager/imperative_grad.py b/tensorflow/python/eager/imperative_grad.py
index 5f5af4ab6c809d811f4978eb31c27aa1c6d59153..b9b4355e663c3ca479a03dfb4e241dcaa1e2e6ec 100644
--- a/tensorflow/python/eager/imperative_grad.py
+++ b/tensorflow/python/eager/imperative_grad.py
@@ -21,7 +21,8 @@ from __future__ import print_function
 import collections
 
 from tensorflow.python import pywrap_tensorflow
-
+from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
+from tensorflow.python.util import compat
 
 VSpace = collections.namedtuple("VSpace", [
     "aggregate_fn", "num_elements_fn", "zeros_fn", "ones_fn", "graph_shape_fn"
@@ -32,7 +33,8 @@ def imperative_grad(
     tape,
     target,
     sources,
-    output_gradients=None):
+    output_gradients=None,
+    unconnected_gradients=UnconnectedGradients.NONE):
   """Computes gradients from the imperatively defined tape on top of the stack.
 
   Works by filtering the tape, computing how many downstream usages are of each
@@ -45,20 +47,26 @@ def imperative_grad(
    sources: list of Tensors for which we want gradients
    output_gradients: if not None, a list of gradient provided for each Target,
     or None if we are to use the target's computed downstream gradient.
+   unconnected_gradients: determines the value returned if the target and
+    sources are unconnected. When 'none' the value returned is None wheras when
+    'zero' a zero tensor in the same shape as the sources is returned.
 
   Returns:
    the gradient wrt each of the sources.
 
   Raises:
+    ValueError: if the arguments are invalid.
     RuntimeError: if something goes wrong.
-    ValueError: if there is no sequence of differentiable operations connecting
-     a source and any target Tensor. This can happen either if the target is
-     not computed based on the source, if the tracing was set up incorrectly,
-     or if only non-differentiable functions of the source were used in the
-     computation of target.
   """
+  try:
+    unconnected_gradients = UnconnectedGradients(unconnected_gradients)
+  except ValueError:
+    raise ValueError(
+        "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
+
   return pywrap_tensorflow.TFE_Py_TapeGradient(
       tape._tape,  # pylint: disable=protected-access
       target,
       sources,
-      output_gradients)
+      output_gradients,
+      compat.as_str(unconnected_gradients.value))
diff --git a/tensorflow/python/eager/lift_to_graph.py b/tensorflow/python/eager/lift_to_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..c231264047bedccbb11abf996ff9ac93f15964f9
--- /dev/null
+++ b/tensorflow/python/eager/lift_to_graph.py
@@ -0,0 +1,88 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=unidiomatic-typecheck
+"""Utility to lift subgraphs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+
+
+def _graph_inputs(op):
+  return [x.op for x in op.inputs] + list(op.control_inputs)
+
+
+def lift_to_graph(init_tensor, graph, sources=None):
+  """Copies the tensor and all its inputs recursively to the outer graph."""
+  # Check that the initializer does not depend on any placeholders.
+  if sources is None:
+    sources = set([])
+  visited_ops = set([x.op for x in sources])
+  ops_to_visit = [init_tensor.op]
+  op_outputs = collections.defaultdict(set)
+  while ops_to_visit:
+    op = ops_to_visit.pop()
+    if op in visited_ops:
+      continue
+    visited_ops.add(op)
+    # TODO(apassos) distinguish arg placeholders, capture placeholders,
+    # and placeholders the user might directly use to initialize
+    # variables.
+    if op.type == "Placeholder":
+      raise ValueError(
+          "Unable to lift tensor", init_tensor,
+          "because it depends transitively on placeholder ", op)
+    for inp in _graph_inputs(op):
+      op_outputs[inp].add(op)
+      if inp not in visited_ops and inp not in sources:
+        ops_to_visit.append(inp)
+  # Topologically sort the nodes we've extracted. Now we know how many of their
+  # outputs are part of this subgraph.
+  ops_to_copy = []
+  marked_ops = set([])
+  ops_to_visit = [init_tensor.op]
+  while ops_to_visit:
+    op = ops_to_visit.pop()
+    if op in marked_ops:
+      continue
+    marked_ops.add(op)
+    ops_to_copy.append(op)
+    for inp in _graph_inputs(op):
+      if all(x in marked_ops for x in op_outputs[inp]) and inp not in sources:
+        ops_to_visit.append(inp)
+  assert len(ops_to_copy) == len(visited_ops)
+  # ops_to_copy now holds a reverse topologically sorted list of ops which
+  # ends in the initializer. We copy those to the outermost graph and
+  # build the initialization op there.
+  with graph.as_default():
+    op_map = {}
+    for s in sources:
+      op_map[s] = array_ops.placeholder(dtype=s.dtype, shape=s.shape)
+    for op in reversed(ops_to_copy):
+      copied_inputs = [op_map[x] for x in op.inputs]
+      copied_control_inputs = [op_map[x] for x in op.control_inputs]
+      with ops.control_dependencies(copied_control_inputs):
+        copied_op = graph.create_op(
+            op.type, copied_inputs, [x.dtype for x in op.outputs],
+            attrs=op.node_def.attr)
+      op_map[op] = copied_op
+      for i, o in enumerate(op.outputs):
+        op_map[o] = copied_op.outputs[i]
+    return op_map
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 5f44bd4fecd31cb20dc36579004f588a9de5d614..30a93fb0e421e0b26f517a03302d2e96913d8b9a 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/python/lib/core/ndarray_tensor.h"
 
+#include "tensorflow/core/framework/types.h"
+
 #include "structmember.h"  // NOLINT // For PyMemberDef
 
 // forward declare
@@ -135,6 +137,40 @@ PyObject* PyIntFromDataType(TF_DataType l) {
 }  // namespace
 
 namespace tensorflow {
+// This function checks whether the desired type is "compatible" with the
+// inferred type. At a high level, compatibility means that all integral types
+// are compatible with each other, and all floating types are compatible with
+// each other.
+//
+// Type compatibility doesn't consider overflows (i.e. int64 is *always*
+// compatible with int32). This is intended to match graph behavior.
+bool IsCompatible(int desired_dtype, TF_DataType returned_dtype) {
+  tensorflow::DataType desired =
+      static_cast<tensorflow::DataType>(desired_dtype);
+  tensorflow::DataType returned =
+      static_cast<tensorflow::DataType>(returned_dtype);
+
+  if (desired == returned) return true;
+
+  if (tensorflow::DataTypeIsInteger(desired) &&
+      tensorflow::DataTypeIsInteger(returned)) {
+    return true;
+  } else if (tensorflow::DataTypeIsFloating(desired) &&
+             (tensorflow::DataTypeIsFloating(returned) ||
+              tensorflow::DataTypeIsInteger(returned))) {
+    return true;
+  } else if (tensorflow::DataTypeIsComplex(desired) &&
+             (tensorflow::DataTypeIsComplex(returned) ||
+              tensorflow::DataTypeIsInteger(returned) ||
+              tensorflow::DataTypeIsFloating(returned))) {
+    return true;
+  } else if (tensorflow::DataTypeIsQuantized(desired) &&
+             tensorflow::DataTypeIsInteger(returned)) {
+    return true;
+  }
+  return false;
+}
+
 // Casts data referred to by `handle` from type `src_type_enum` to type
 // `dst_type_enum`.
 TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
@@ -184,6 +220,14 @@ TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype) {
       return nullptr;
     }
   }
+  tensorflow::Safe_PyObjectPtr value_decrefer;
+  if (PyArray_IsScalar(value, Generic)) {
+    // Convert numpy scalars to numpy arrays.
+    value = PyArray_FromScalar(value, nullptr);
+    // The returned value needs to be DECREF'd, but the original value was
+    // created in python code, and doesn't need to be DECREF'd.
+    value_decrefer.reset(value);
+  }
   if (PyArray_Check(value)) {
     int desired_np_dtype = -1;
     if (desired_dtype >= 0) {
@@ -376,20 +420,40 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
   if (handle == nullptr) return -1;
   TF_DataType handle_dtype = TFE_TensorHandleDataType(handle.get());
   if (desired_dtype >= 0 && desired_dtype != handle_dtype) {
-    handle = tensorflow::make_safe(tensorflow::EagerCast(
-        GetContext(context), handle.get(), handle_dtype,
-        static_cast<TF_DataType>(desired_dtype), self->status));
-    if (TF_GetCode(self->status) != TF_OK) {
-      PyErr_SetString(PyExc_TypeError,
-                      tensorflow::strings::StrCat(
-                          "Error while casting from DataType ", handle_dtype,
-                          " to ", desired_dtype, ". ", TF_Message(self->status))
-                          .c_str());
-      // Cleanup self->status before returning.
-      TF_SetStatus(self->status, TF_OK, "");
+    // Check type compatibility.
+    if (tensorflow::IsCompatible(desired_dtype, handle_dtype)) {
+      handle = tensorflow::make_safe(tensorflow::EagerCast(
+          GetContext(context), handle.get(), handle_dtype,
+          static_cast<TF_DataType>(desired_dtype), self->status));
+      if (TF_GetCode(self->status) != TF_OK) {
+        PyErr_SetString(
+            PyExc_TypeError,
+            tensorflow::strings::StrCat(
+                "Error while casting from DataType ",
+                tensorflow::DataTypeString(
+                    static_cast<tensorflow::DataType>(handle_dtype)),
+                " to ",
+                tensorflow::DataTypeString(
+                    static_cast<tensorflow::DataType>(desired_dtype)),
+                ". ", TF_Message(self->status))
+                .c_str());
+        // Cleanup self->status before returning.
+        TF_SetStatus(self->status, TF_OK, "");
+        return -1;
+      }
+      handle_dtype = TFE_TensorHandleDataType(handle.get());
+    } else {
+      tensorflow::Safe_PyObjectPtr value_str(PyObject_Str(value));
+      PyErr_SetString(
+          PyExc_TypeError,
+          tensorflow::strings::StrCat(
+              "Cannot convert provided value to EagerTensor. Provided value: ",
+              TFE_GetPythonString(value_str.get()), " Requested dtype: ",
+              tensorflow::DataTypeString(
+                  static_cast<tensorflow::DataType>(desired_dtype)))
+              .c_str());
       return -1;
     }
-    handle_dtype = TFE_TensorHandleDataType(handle.get());
   }
 
   // Almost all TensorFlow kernels for GPU devices keep int32 tensors in host
@@ -616,11 +680,29 @@ static PyObject* EagerTensor_device(EagerTensor* self) {
 #endif
 }
 
+// Getter `backing_device`.
+static PyObject* EagerTensor_backing_device(EagerTensor* self) {
+  const char* device =
+      TFE_TensorHandleBackingDeviceName(self->handle, self->status);
+  if (MaybeRaiseExceptionFromTFStatus(self->status, PyExc_ValueError)) {
+    // Cleanup self->status before returning.
+    TF_SetStatus(self->status, TF_OK, "");
+    return nullptr;
+  }
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_FromString(device);
+#else
+  return PyBytes_FromString(device);
+#endif
+}
+
 static PyGetSetDef EagerTensor_getseters[] = {
     {const_cast<char*>("_id"), (getter)EagerTensor_getid, nullptr,
      const_cast<char*>("_id"), nullptr},
     {const_cast<char*>("device"), (getter)EagerTensor_device, nullptr,
      const_cast<char*>("device"), nullptr},
+    {const_cast<char*>("backing_device"), (getter)EagerTensor_backing_device,
+     nullptr, const_cast<char*>("backing_device"), nullptr},
     {const_cast<char*>("_handle_data"), (getter)EagerTensor_tensor_handle,
      (setter)EagerTensor_settensor_handle, const_cast<char*>("_tensor_handle"),
      nullptr},
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
index 4eaa1ba5362e912b1a366d7ce27f4a8decf4e2ed..f90fd9bbb68b4bde116378db4df36a42213a0da6 100644
--- a/tensorflow/python/eager/pywrap_tensor.h
+++ b/tensorflow/python/eager/pywrap_tensor.h
@@ -26,6 +26,7 @@ tensorflow::DataType PyEagerTensor_Dtype(const PyObject* tensor);
 tensorflow::int64 PyEagerTensor_NumElements(const PyObject* tensor);
 
 namespace tensorflow {
+bool IsCompatible(int desired_dtype, TF_DataType returned_dtype);
 TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype);
 
 // TODO(nareshmodi): Move EagerCast and ReadVariableOp (which use the C API to
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index decd635b585c92d4143f4ed24b02f192171302ef..8d6f212499f80513eeb2a20cee8b2e0d7be21e3f 100755
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -16,6 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
 #define TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
 
+// Place `<locale>` before <Python.h> to avoid build failure in macOS.
+#include <locale>
+
+// The empty line above is on purpose as otherwise clang-format will
+// automatically move <Python.h> before <locale>.
 #include <Python.h>
 
 #include "tensorflow/c/eager/c_api.h"
@@ -176,6 +181,7 @@ void TFE_Py_TapeWatchVariable(PyObject* tape, PyObject* variable);
 // target.
 PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* target,
                               PyObject* sources, PyObject* output_gradients,
+                              PyObject* unconnected_gradients,
                               TF_Status* status);
 
 // Execute a tensorflow operation assuming that all provided inputs are
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 6193f40ce8e91bcc3604e8018f68083c823b5004..9ce500bc08e478815f2dbe1d5d5353eefa4f17a8 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <cstring>
 #include <thread>
 
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/python/eager/pywrap_tfe.h"
 
 #include "absl/strings/str_cat.h"
@@ -249,6 +251,7 @@ bool ParseTypeValue(const string& key, PyObject* py_value, TF_Status* status,
   tensorflow::Safe_PyObjectPtr py_type_enum(
       PyObject_GetAttrString(py_value, "_type_enum"));
   if (py_type_enum == nullptr) {
+    PyErr_Clear();
     TF_SetStatus(
         status, TF_INVALID_ARGUMENT,
         tensorflow::strings::StrCat("Expecting a DType.dtype for attr ", key,
@@ -794,6 +797,13 @@ int MaybeRaiseExceptionFromTFStatus(TF_Status* status, PyObject* exception) {
     if (exception_class != nullptr) {
       tensorflow::Safe_PyObjectPtr val(
           Py_BuildValue("si", msg, TF_GetCode(status)));
+      if (PyErr_Occurred()) {
+        // NOTE: This hides the actual error (i.e. the reason `status` was not
+        // TF_OK), but there is nothing we can do at this point since we can't
+        // generate a reasonable error from the status.
+        // Consider adding a message explaining this.
+        return -1;
+      }
       PyErr_SetObject(exception_class, val.get());
       return -1;
     } else {
@@ -1228,8 +1238,9 @@ static PyTypeObject TFE_Py_Tape_Type = {
 // GIL, which is always held when any TFE_Py_* methods are called. We should
 // revisit this if/when decide to not hold the GIL while manipulating the tape
 // stack.
-static tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* tape_set = nullptr;
 tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* GetTapeSet() {
+  thread_local tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>* tape_set{
+      nullptr};
   if (tape_set == nullptr) {
     tape_set = new tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*>;
   }
@@ -1264,27 +1275,10 @@ class SafeTapeSet {
   tensorflow::gtl::CompactPointerSet<TFE_Py_Tape*> tape_set_;
 };
 
-// xcode 7 doesn't define thread_local, so for compatibility we implement our
-// own. TODO(apassos) remove once we can deprecate xcode 7.
-#ifndef __APPLE__
 bool* ThreadTapeIsStopped() {
   thread_local bool thread_tape_is_stopped{false};
   return &thread_tape_is_stopped;
 }
-#else
-static std::unordered_map<std::thread::id, bool>* tape_is_stopped = nullptr;
-bool* ThreadTapeIsStopped() {
-  if (tape_is_stopped == nullptr) {
-    tape_is_stopped = new std::unordered_map<std::thread::id, bool>;
-  }
-  auto it = tape_is_stopped->find(std::this_thread::get_id());
-  if (it != tape_is_stopped->end()) {
-    return &(it->second);
-  }
-  return &(tape_is_stopped->emplace(std::this_thread::get_id(), false)
-               .first->second);
-}
-#endif
 
 void TFE_Py_TapeSetStopOnThread() { *ThreadTapeIsStopped() = true; }
 
@@ -1626,6 +1620,7 @@ std::vector<PyObject*> MakeTensorList(PyObject* tensors) {
 
 PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* target,
                               PyObject* sources, PyObject* output_gradients,
+                              PyObject* unconnected_gradients,
                               TF_Status* status) {
   TFE_Py_Tape* tape_obj = reinterpret_cast<TFE_Py_Tape*>(tape);
   if (!tape_obj->tape->IsPersistent()) {
@@ -1650,6 +1645,29 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* target,
   if (PyErr_Occurred()) {
     return nullptr;
   }
+  tensorflow::gtl::FlatSet<tensorflow::int64> sources_set(sources_vec.begin(),
+                                                          sources_vec.end());
+
+  tensorflow::Safe_PyObjectPtr seq =
+      tensorflow::make_safe(PySequence_Fast(target, "expected a sequence"));
+  int len = PySequence_Fast_GET_SIZE(seq.get());
+  tensorflow::gtl::FlatMap<tensorflow::int64, PyTapeTensor>
+      source_tensors_that_are_targets;
+  for (int i = 0; i < len; ++i) {
+    tensorflow::int64 target_id = target_vec[i];
+    if (sources_set.find(target_id) != sources_set.end()) {
+      auto tensor = PySequence_Fast_GET_ITEM(seq.get(), i);
+      source_tensors_that_are_targets.insert(
+          std::make_pair(target_id, TapeTensorFromTensor(tensor)));
+    }
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
+  }
+  if (PyErr_Occurred()) {
+    return nullptr;
+  }
+
   std::vector<PyObject*> outgrad_vec;
   if (output_gradients != Py_None) {
     outgrad_vec = MakeTensorList(output_gradients);
@@ -1664,7 +1682,8 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* target,
   }
   std::vector<PyObject*> result;
   status->status = tape_obj->tape->ComputeGradient(
-      *py_vspace, target_vec, sources_vec, outgrad_vec, &result);
+      *py_vspace, target_vec, sources_vec, source_tensors_that_are_targets,
+      outgrad_vec, &result);
   if (!status->status.ok()) {
     if (PyErr_Occurred()) {
       // Do not propagate the erroneous status as that would swallow the
@@ -1673,13 +1692,29 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* target,
     }
     return nullptr;
   }
+
+  bool unconnected_gradients_zero =
+      strcmp(TFE_GetPythonString(unconnected_gradients), "zero") == 0;
+  std::vector<PyObject*> sources_obj;
+  if (unconnected_gradients_zero) {
+    sources_obj = MakeTensorList(sources);
+  }
+
   if (!result.empty()) {
     PyObject* py_result = PyList_New(result.size());
     tensorflow::gtl::FlatSet<PyObject*> seen_results(result.size());
     for (int i = 0; i < result.size(); ++i) {
       if (result[i] == nullptr) {
-        Py_INCREF(Py_None);
-        result[i] = Py_None;
+        if (unconnected_gradients_zero) {
+          // generate a zeros tensor in the shape of sources[i]
+          tensorflow::DataType dtype = FastTensorDtype(sources_obj[i]);
+          PyTapeTensor tensor =
+              PyTapeTensor(sources_vec[i], dtype, sources_obj[i]);
+          result[i] = py_vspace->Zeros(tensor);
+        } else {
+          Py_INCREF(Py_None);
+          result[i] = Py_None;
+        }
       } else if (seen_results.find(result[i]) != seen_results.end()) {
         Py_INCREF(result[i]);
       }
@@ -1842,7 +1877,7 @@ bool OpGradientDoesntRequireOutputIndices(
           {"Conv3DBackpropInputV2", {true, {}}},
           {"AvgPool3D", {true, {}}},
           {"AvgPool3DGrad", {true, {}}},
-          {"MaxPool3D", {true, {}}},
+          {"MaxPool3D", {false, {}}},
           {"MaxPool3DGrad", {true, {}}},
           {"MaxPool3DGradGrad", {true, {}}},
           {"BiasAdd", {true, {}}},
@@ -1852,6 +1887,8 @@ bool OpGradientDoesntRequireOutputIndices(
           {"SoftplusGrad", {true, {}}},
           {"Softsign", {true, {}}},
           {"ReluGrad", {true, {}}},
+          {"LeakyRelu", {true, {}}},
+          {"LeakyReluGrad", {true, {}}},
           {"Conv2D", {true, {}}},
           {"DepthwiseConv2dNative", {true, {}}},
           {"Dilation2D", {true, {}}},
@@ -2101,7 +2138,9 @@ bool CastTensor(const FastPathOpExecInfo& op_exec_info,
     *handle = tensorflow::make_safe(
         tensorflow::EagerCast(op_exec_info.ctx, handle->get(), input_dtype,
                               static_cast<TF_DataType>(desired_dtype), status));
-    if (!status->status.ok()) return false;
+    if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+      return false;
+    }
     output_dtype = desired_dtype;
   }
 
@@ -2110,7 +2149,9 @@ bool CastTensor(const FastPathOpExecInfo& op_exec_info,
     // if copying to the same device.
     *handle = tensorflow::make_safe(TFE_TensorHandleCopyToDevice(
         handle->get(), op_exec_info.ctx, op_exec_info.device_name, status));
-    if (!status->status.ok()) return false;
+    if (MaybeRaiseExceptionFromTFStatus(status, nullptr)) {
+      return false;
+    }
   }
   return true;
 }
@@ -2193,14 +2234,19 @@ bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
   return true;
 }
 
-// Supports only 2 cases at the moment:
-//  i) input is an EagerTensor
+// Supports 3 cases at the moment:
+//  i) input is an EagerTensor.
 //  ii) input is a ResourceVariable - in this case, the is_variable param is
 //  set to true.
+//  iii) input is an arbitrary python list/tuple (note, this handling doesn't
+//  support packing).
 //
 //  NOTE: dtype_hint_getter must *always* return a PyObject that can be
 //  decref'd. So if no hint is found, Py_RETURN_NONE (which correctly
 //  increfs Py_None).
+//
+//  NOTE: This function sets a python error directly, and returns false.
+//  TF_Status is only passed since we don't want to have to reallocate it.
 bool ConvertToTensor(
     const FastPathOpExecInfo& op_exec_info, PyObject* input,
     tensorflow::Safe_PyObjectPtr* output_handle,
@@ -2227,25 +2273,45 @@ bool ConvertToTensor(
       tensorflow::make_safe(static_cast<TFE_TensorHandle*>(
           tensorflow::ConvertToEagerTensor(input, dtype_hint.get())));
   if (handle == nullptr) {
-    status->status = tensorflow::errors::InvalidArgument(
-        "Unable to convert value to tensor");
-    return false;
+    return MaybeRaiseExceptionFromTFStatus(status, nullptr);
   }
 
   int desired_dtype = -1;
   if (dtype_hint.get() != Py_None) {
     if (!ParseTypeValue("", dtype_hint.get(), status, &desired_dtype)) {
-      status->status = tensorflow::errors::InvalidArgument(
-          "Expecting a DataType value for dtype. Got ",
-          Py_TYPE(dtype_hint.get())->tp_name);
+      PyErr_SetString(PyExc_TypeError,
+                      tensorflow::strings::StrCat(
+                          "Expecting a DataType value for dtype. Got ",
+                          Py_TYPE(dtype_hint.get())->tp_name)
+                          .c_str());
+      return false;
     }
   }
 
-  if (!CastTensor(op_exec_info, static_cast<TF_DataType>(desired_dtype),
-                  &handle, status)) {
-    return false;
-  }
+  // Maybe cast to the desired type. This is intended to match python
+  // convert_to_tensor behavior.
   TF_DataType output_dtype = TFE_TensorHandleDataType(handle.get());
+  if (desired_dtype >= 0 && desired_dtype != output_dtype) {
+    if (tensorflow::IsCompatible(desired_dtype, output_dtype)) {
+      if (!CastTensor(op_exec_info, static_cast<TF_DataType>(desired_dtype),
+                      &handle, status)) {
+        return false;
+      }
+      output_dtype = TFE_TensorHandleDataType(handle.get());
+    } else {
+      tensorflow::Safe_PyObjectPtr input_str(PyObject_Str(input));
+      PyErr_SetString(
+          PyExc_TypeError,
+          tensorflow::strings::StrCat(
+              "Cannot convert provided value to EagerTensor. Provided value: ",
+              TFE_GetPythonString(input_str.get()), " Requested dtype: ",
+              tensorflow::DataTypeString(
+                  static_cast<tensorflow::DataType>(desired_dtype)))
+              .c_str());
+      return false;
+    }
+  }
+
   output_handle->reset(EagerTensorFromHandle(handle.release()));
   dtype_setter(output_dtype);
 
@@ -2561,7 +2627,12 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     if (!input_arg.number_attr().empty()) {
       // The item is a homogeneous list.
       if (!RaiseIfNotPySequence(input, input_arg.number_attr())) return nullptr;
-      Py_ssize_t len = PySequence_Fast_GET_SIZE(input);
+      tensorflow::Safe_PyObjectPtr fast_input(
+          PySequence_Fast(input, "Could not parse sequence."));
+      if (fast_input.get() == nullptr) {
+        return nullptr;
+      }
+      Py_ssize_t len = PySequence_Fast_GET_SIZE(fast_input.get());
 
       TFE_OpSetAttrInt(op, input_arg.number_attr().data(), len);
       if (op_exec_info.run_callbacks) {
@@ -2573,15 +2644,17 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
-                          true, input_arg, flattened_attrs.get(),
+        if (!AddInputToOp(&op_exec_info,
+                          PySequence_Fast_GET_ITEM(fast_input.get(), 0), true,
+                          input_arg, flattened_attrs.get(),
                           flattened_inputs.get(), op, status)) {
           return nullptr;
         }
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(&op_exec_info, PySequence_Fast_GET_ITEM(input, j),
+          if (!AddInputToOp(&op_exec_info,
+                            PySequence_Fast_GET_ITEM(fast_input.get(), j),
                             false, input_arg, nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
             return nullptr;
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index 399d90223c21fa2ed28493059e355ea768a3d796..e501b403a39144a673e8ac5155edf0498425bcd6 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -21,6 +21,15 @@ from __future__ import print_function
 import contextlib
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+# There is a circular dependency between this, ops.py, and
+# distribution_strategy_context.
+# TODO(b/117329403): Remove this circular dependency.
+distribution_strategy_context = LazyLoader(
+    "distribute_lib", globals(),
+    "tensorflow.python.training."
+    "distribution_strategy_context")
 
 
 class Tape(object):
@@ -52,12 +61,28 @@ def watch(tape, tensor):
 
 def watch_variable(tape, variable):
   """Marks this variable to be watched by the given tape."""
-  pywrap_tensorflow.TFE_Py_TapeWatchVariable(tape._tape, variable)  # pylint: disable=protected-access
+  strategy = distribution_strategy_context.get_distribution_strategy()
+  if distribution_strategy_context.get_replica_context():
+    variables = [strategy.extended.value_container(variable)]
+  else:
+    variables = strategy.unwrap(variable)
+  for var in variables:
+    pywrap_tensorflow.TFE_Py_TapeWatchVariable(tape._tape, var)  # pylint: disable=protected-access
 
 
 def variable_accessed(variable):
-  """Notifies all tapes in the stack that a variable has been accessed."""
-  pywrap_tensorflow.TFE_Py_TapeVariableAccessed(variable)
+  """Notifies all tapes in the stack that a variable has been accessed.
+
+  Args:
+    variable: variable to be watched.
+  """
+  strategy = distribution_strategy_context.get_distribution_strategy()
+  if distribution_strategy_context.get_replica_context():
+    variables = [strategy.extended.value_container(variable)]
+  else:
+    variables = strategy.unwrap(variable)
+  for var in variables:
+    pywrap_tensorflow.TFE_Py_TapeVariableAccessed(var)
 
 
 def pop_tape(tape):
diff --git a/tensorflow/python/eager/tape_test.py b/tensorflow/python/eager/tape_test.py
index acd0e569f11a90e2cc53e113f59df6f072a6de42..48d3b8ac6ee0fb5b747caf32b034f82959611292 100644
--- a/tensorflow/python/eager/tape_test.py
+++ b/tensorflow/python/eager/tape_test.py
@@ -80,8 +80,8 @@ class TapeTest(test.TestCase):
       tf_e = tf_d + tf_f
       tf_da, tf_db = gradients_impl.gradients(tf_e, [tf_a, tf_b])
 
-      self.assertAllEqual(da, tf_da.eval())
-      self.assertAllEqual(db, tf_db.eval())
+      self.assertAllEqual(da, self.evaluate(tf_da))
+      self.assertAllEqual(db, self.evaluate(tf_db))
 
   def testBasicFunctional(self):
 
@@ -142,8 +142,8 @@ class TapeTest(test.TestCase):
       tf_rr = 2 * math_ops.reduce_sum(tf_mm)
       tf_da, tf_db = gradients_impl.gradients(tf_rr, [tf_a, tf_b])
 
-      self.assertAllEqual(da, tf_da.eval())
-      self.assertAllEqual(db, tf_db.eval())
+      self.assertAllEqual(da, self.evaluate(tf_da))
+      self.assertAllEqual(db, self.evaluate(tf_db))
 
   def testGcTwoOutputs(self):
 
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 344a9b25bdd254ea3abfdfa94ebd86516a7ffe0f..0ee2ff68c209aa13aaeb32be610302c11616b9d7 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import copy
 import re
+import sys
 
 import numpy as np
 
@@ -32,6 +33,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import io_ops
 
 
 def _create_tensor(value, device=None, dtype=None):
@@ -93,6 +95,18 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     t = _create_tensor(values)
     self.assertAllEqual(values, t)
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testNumpyDtypeSurvivesThroughTensorConversion(self):
+    scalar_creators = [np.int32, np.int64, np.float32, np.float64]
+    conversion_functions = [ops.convert_to_tensor, constant_op.constant]
+
+    for scalar_creator in scalar_creators:
+      for conversion_function in conversion_functions:
+        np_val = scalar_creator(3)
+        tensor_val = conversion_function(np_val)
+        self.assertEqual(tensor_val.numpy().dtype, np_val.dtype)
+        self.assertEqual(tensor_val.numpy(), np_val)
+
   def testNumpyValueWithCast(self):
     values = np.array([3.0], dtype=np.float32)
     t = _create_tensor(values, dtype=dtypes.float64)
@@ -126,6 +140,23 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     tensor = constant_op.constant(numpy_tensor)
     self.assertAllEqual(numpy_tensor.ndim, tensor.ndim)
 
+  def testLenAgreesWithNumpy(self):
+    numpy_tensor = np.asarray(1.0)
+    tensor = constant_op.constant(numpy_tensor)
+    with self.assertRaises(TypeError):
+      len(numpy_tensor)
+    with self.assertRaisesRegexp(
+        TypeError, r"Scalar tensor has no `len[(][)]`"):
+      len(tensor)
+
+    numpy_tensor = np.asarray([1.0, 2.0, 3.0])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(len(numpy_tensor), len(tensor))
+
+    numpy_tensor = np.asarray([[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(len(numpy_tensor), len(tensor))
+
   def testCopy(self):
     t = constant_op.constant(1.0)
     tt = copy.copy(t)
@@ -137,8 +168,8 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(t, 1.0)
 
   def testConstantDtype(self):
-    self.assertEqual(constant_op.constant(1.0, dtype=np.int64).dtype,
-                     dtypes.int64)
+    self.assertEqual(
+        constant_op.constant(1, dtype=np.int64).dtype, dtypes.int64)
 
   def testTensorAndNumpyMatrix(self):
     expected = np.array([[1.0, 2.0], [3.0, 4.0]], np.float32)
@@ -156,9 +187,13 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.float64, t.dtype)
 
   def testBool(self):
-    t = _create_tensor(False)
-    if t:
-      self.assertFalse(True)
+    self.assertFalse(bool(_create_tensor(False)))
+    self.assertFalse(bool(_create_tensor([False])))
+    self.assertFalse(bool(_create_tensor([[False]])))
+    self.assertFalse(bool(_create_tensor([0])))
+    self.assertFalse(bool(_create_tensor([0.])))
+    self.assertTrue(bool(_create_tensor([1])))
+    self.assertTrue(bool(_create_tensor([1.])))
 
   def testIntDowncast(self):
     t = _create_tensor(3)
@@ -242,6 +277,76 @@ class TFETensorTest(test_util.TensorFlowTestCase):
           RuntimeError, "Can't copy Tensor with type string to device"):
         _create_tensor("test string")
 
+  def testInvalidUTF8ProducesReasonableError(self):
+    if sys.version_info[0] < 3:
+      self.skipTest("Test is only valid in python3.")
+    with self.assertRaises(UnicodeDecodeError):
+      io_ops.read_file(b"\xff")
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorPreferredDtypeIsRespected(self):
+    self.assertEqual(
+        ops.convert_to_tensor(0.5, preferred_dtype=dtypes.int32).dtype,
+        dtypes.float32)
+    self.assertEqual(
+        ops.convert_to_tensor(0.5, preferred_dtype=dtypes.float64).dtype,
+        dtypes.float64)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testCompatibility(self):
+    integer_types = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                     dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
+
+    # Floats are not compatible with ints
+    for t in integer_types:
+      with self.assertRaises(TypeError):
+        constant_op.constant(0.5, dtype=t)
+
+    # Ints compatible with floats
+    self.assertEqual(
+        self.evaluate(constant_op.constant(5, dtype=dtypes.float16)), 5.0)
+    self.assertEqual(
+        self.evaluate(constant_op.constant(5, dtype=dtypes.float32)), 5.0)
+    self.assertEqual(
+        self.evaluate(constant_op.constant(5, dtype=dtypes.float64)), 5.0)
+    self.assertEqual(
+        self.evaluate(constant_op.constant(5, dtype=dtypes.bfloat16)), 5.0)
+
+    # Ints and floats are compatible with complex types
+    self.assertEqual(
+        constant_op.constant([[1.0]], dtype=dtypes.complex128).dtype,
+        dtypes.complex128)
+    self.assertEqual(
+        constant_op.constant([[1]], dtype=dtypes.complex128).dtype,
+        dtypes.complex128)
+
+    # Quantized types are not compatible with floats
+    quantized_types = [dtypes.qint16, dtypes.qint32, dtypes.qint8,
+                       dtypes.quint16, dtypes.quint8]
+
+    for t in quantized_types:
+      with self.assertRaises(TypeError):
+        constant_op.constant(0.5, dtype=t)
+
+    # TODO(b/118402529): quantized types are broken in eager.
+
+  @test_util.run_in_graph_and_eager_modes
+  def testCConvertToTensor(self):
+    with self.assertRaises(TypeError):
+      _ = constant_op.constant(0) < 0.5
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorAllowsOverflow(self):
+    _ = ops.convert_to_tensor(123456789, dtype=dtypes.uint8)
+
+  def testEagerTensorError(self):
+    with self.assertRaisesRegexp(
+        TypeError,
+        "Cannot convert provided value to EagerTensor. "
+        "Provided value.*Requested dtype.*"):
+      _ = ops.convert_to_tensor(1., dtype=dtypes.int32)
+
+
 
 class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
@@ -347,6 +452,13 @@ class TFETensorUtilTest(test_util.TensorFlowTestCase):
     instance_dir.remove("test_attr")
     self.assertEqual(instance_dir, type_dir)
 
+  def testNonRectangularPackAsConstant(self):
+    l = [array_ops.zeros((10, 1)).numpy(), array_ops.zeros(1).numpy()]
+
+    with self.assertRaisesRegexp(
+        ValueError, "non-rectangular Python sequence"):
+      constant_op.constant(l)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/test.py b/tensorflow/python/eager/test.py
index 33ee797678ed73c52ebb17723f688cec4feca402..a45deac962de931ebd8a8804cea7fef2b3f97629 100644
--- a/tensorflow/python/eager/test.py
+++ b/tensorflow/python/eager/test.py
@@ -24,6 +24,6 @@ from tensorflow.python.platform.test import *  # pylint: disable=wildcard-import
 
 
 # TODO(akshayka): Do away with this file.
-def main(argv=None):
+def main(argv=None):  # pylint: disable=function-redefined
   _ops.enable_eager_execution()
   _test.main(argv)
diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b39e99a4ea5d145f9bb8cef5c5931c306bcaeea
--- /dev/null
+++ b/tensorflow/python/eager/wrap_function.py
@@ -0,0 +1,147 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=unidiomatic-typecheck
+"""Prototype decorator for defining legacy-graph-mode functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import function
+from tensorflow.python.eager import lift_to_graph
+from tensorflow.python.framework import func_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+class VariableHolder(object):
+  """Holds variables for a python function."""
+
+  def __init__(self, fn):
+    self._fn = fn
+    self._variables = []
+
+  def variable_creator_scope(self, next_creator, **kwargs):
+    v = next_creator(**kwargs)
+    self._variables.append(v)
+    return v
+
+  def __call__(self, *args, **kwargs):
+    with variable_scope.variable_creator_scope(self.variable_creator_scope):
+      return self._fn(*args, **kwargs)
+
+
+# TODO(allenl): make this checkpointable
+class WrappedFunction(function.Function):
+  """Wraps a tf V1 piece of code in a function."""
+
+  def __init__(self, fn_graph, variable_holder, attrs=None, signature=None):
+    super(WrappedFunction, self).__init__(
+        fn_graph, attrs=attrs, signature=signature)
+    self._variable_holder = variable_holder
+
+  def prune(self, feeds, fetches):
+    flat_feeds, flat_fetches = nest.flatten(feeds), nest.flatten(fetches)
+    for f in flat_feeds + flat_fetches:
+      if not isinstance(f, ops.Tensor):
+        raise ValueError("Feeds and fetches must be tensors.")
+      if f.graph is not self._func_graph:
+        raise ValueError(
+            "Can only prune function whose feeds and fetches "
+            "are from this graph (%s). Tensor %s from graph %s" % (
+                self._func_graph, f, f.graph))
+    with self._func_graph.as_default():
+      pruned_graph = func_graph.FuncGraph("pruned")
+      sink_tensor = array_ops.identity_n(flat_fetches)[0]
+    lift_map = lift_to_graph.lift_to_graph(
+        sink_tensor, pruned_graph, sources=flat_feeds)
+    pruned_graph.outputs.extend(lift_map[x] for x in flat_fetches)
+    pruned_graph.inputs.extend(lift_map[x] for x in flat_feeds)
+    pruned_fn = WrappedFunction(
+        pruned_graph, variable_holder=self._variable_holder)
+    pruned_fn._num_positional_args = len(flat_feeds)  # pylint: disable=protected-access
+    pruned_fn._arg_keywords = []  # pylint: disable=protected-access
+    return pruned_fn
+
+
+@tf_export(v1=["wrap_function"])
+def wrap_function(fn, signature, name=None):
+  """Wraps the TF 1.x function fn into a graph function.
+
+  The python function `fn` will be called once with symbolic arguments specified
+  in the `signature`, traced, and turned into a graph function. Any variables
+  created by `fn` will be owned by the object returned by `wrap_function`. The
+  resulting graph function can be called with tensors which match the
+  signature.
+
+  ```python
+  def f(x, do_add):
+    v = tf.Variable(5.0)
+    if do_add:
+      op = v.assign_add(x)
+    else:
+      op = v.assign_sub(x)
+    with tf.control_dependencies([op]):
+      return v.read_value()
+
+  f_add = tf.compat.v1.wrap_function(f, [tf.TensorSpec((), tf.float32), True])
+
+  assert float(f_add(1.0)) == 6.0
+  assert float(f_add(1.0)) == 7.0
+
+  # Can call tf.compat.v1.wrap_function again to get a new trace, a new set
+  # of variables, and possibly different non-template arguments.
+  f_sub= tf.compat.v1.wrap_function(f, [tf.TensorSpec((), tf.float32), False])
+
+  assert float(f_sub(1.0)) == 4.0
+  assert float(f_sub(1.0)) == 3.0
+  ```
+
+  Both `tf.compat.v1.wrap_function` and `tf.function` create a callable
+  TensorFlow graph. But while `tf.function` runs all stateful operations
+  (e.g. `tf.print`) and sequences operations to provide the same semantics as
+  eager execution, `wrap_function` is closer to the behavior of `session.run` in
+  TensorFlow 1.x. It will not run any operations unless they are required to
+  compute the function's outputs, either through a data dependency or a control
+  dependency. Nor will it sequence operations.
+
+  Unlike `tf.function`, `wrap_function` will only trace the Python function
+  once. As with placeholders in TF 1.x, shapes and dtypes must be provided to
+  `wrap_function`'s `signature` argument.
+
+  Since it is only traced once, variables and state may be created inside the
+  function and owned by the function wrapper object.
+
+  Args:
+    fn: python function to be wrapped
+    signature: the placeholder and python arguments to be passed to the
+      wrapped function
+    name: Optional. The name of the function.
+
+  Returns:
+    the wrapped graph function.
+  """
+  holder = VariableHolder(fn)
+  return WrappedFunction(
+      func_graph.func_graph_from_py_func(
+          name,
+          holder,
+          args=None, kwargs=None, signature=signature,
+          add_control_dependencies=False),
+      variable_holder=holder,
+      signature=signature)
diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32b6ca42691a6261576da6b105a0afc97e0ec63
--- /dev/null
+++ b/tensorflow/python/eager/wrap_function_test.py
@@ -0,0 +1,76 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.eager import wrap_function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class WrapFunctionTest(test.TestCase):
+
+  def testDocString(self):
+
+    def f(x, do_add):
+      v = variables.Variable(5.0)
+      if do_add:
+        op = v.assign_add(x)
+      else:
+        op = v.assign_sub(x)
+      with ops.control_dependencies([op]):
+        return v.read_value()
+
+    f_add = wrap_function.wrap_function(
+        f, [tensor_spec.TensorSpec((), dtypes.float32), True])
+
+    self.assertAllEqual(f_add(1.0), 6.0)
+    self.assertAllEqual(f_add(1.0), 7.0)
+
+    # Can call tf.compat.v1.wrap_function again to get a new trace, a new set
+    # of variables, and possibly different non-template arguments.
+    f_sub = wrap_function.wrap_function(
+        f, [tensor_spec.TensorSpec((), dtypes.float32), False])
+
+    self.assertAllEqual(f_sub(1.0), 4.0)
+    self.assertAllEqual(f_sub(1.0), 3.0)
+
+  def testPrune(self):
+
+    x_in = []
+    x_out = []
+
+    def f(x, y):
+      x_in.append(x)
+      xx = x * x
+      x_out.append(xx)
+      return xx, 2 * y*y
+
+    f_wrapped = wrap_function.wrap_function(
+        f, [tensor_spec.TensorSpec((), dtypes.float32)] * 2)
+
+    f_pruned = f_wrapped.prune(x_in[0], [x_out[0]])
+    self.assertAllEqual(f_pruned(ops.convert_to_tensor(2.0)), [4.0])
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 1c4c5951dfe8a451fcfffeb664d3346d2c91fbca..7363a112af8ced4358ccdef1bfa944378ae3b72c 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -2,8 +2,6 @@ package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-
 py_library(
     name = "estimator_py",
     srcs = [
@@ -21,6 +19,7 @@ py_library(
         ":dnn",
         ":dnn_linear_combined",
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":export",
         ":exporter",
         ":inputs",
@@ -39,6 +38,7 @@ py_library(
     srcs = ["exporter.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":gc",
         ":metric_keys",
         ":util",
@@ -46,34 +46,12 @@ py_library(
     ],
 )
 
-py_test(
-    name = "exporter_test",
-    size = "small",
-    srcs = ["exporter_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":estimator",
-        ":exporter",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "gc",
     srcs = ["gc.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
-py_test(
-    name = "gc_test",
-    size = "small",
-    srcs = ["gc_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":gc",
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
     ],
 )
@@ -83,30 +61,20 @@ py_library(
     srcs = ["model_fn.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":export_output",
         "//tensorflow:tensorflow_py_no_contrib",
         "@six_archive//:six",
     ],
 )
 
-py_test(
-    name = "model_fn_test",
-    size = "small",
-    srcs = ["model_fn_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":export_output",
-        ":model_fn",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "training",
     srcs = ["training.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":exporter",
         ":run_config",
         "//tensorflow:tensorflow_py_no_contrib",
@@ -114,51 +82,24 @@ py_library(
     ],
 )
 
-py_test(
-    name = "training_test",
-    size = "medium",
-    srcs = ["training_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
-        ":dnn",
-        ":estimator",
-        ":exporter",
-        ":inputs",
-        ":run_config",
-        ":training",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "run_config",
     srcs = ["run_config.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "@six_archive//:six",
     ],
 )
 
-py_test(
-    name = "run_config_test",
-    size = "small",
-    srcs = ["run_config_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":run_config",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "baseline",
     srcs = ["canned/baseline.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":head",
         ":model_fn",
         ":optimizers",
@@ -167,31 +108,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "baseline_test",
-    size = "medium",
-    srcs = ["canned/baseline_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "noasan",  # test flakily times out in asan mode.
-        "notsan",  # b/67510291
-        "optonly",  # flakily times out in fastbuild
-    ],
-    deps = [
-        ":baseline",
-        ":estimator",
-        ":export_export",
-        ":metric_keys",
-        ":numpy_io",
-        ":pandas_io",
-        ":run_config",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "boosted_trees",
     srcs = ["canned/boosted_trees.py"],
@@ -199,66 +115,33 @@ py_library(
     deps = [
         ":boosted_trees_utils",
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":head",
         ":model_fn",
         "//tensorflow:tensorflow_py_no_contrib",
     ],
 )
 
-py_test(
-    name = "boosted_trees_test",
-    size = "medium",
-    srcs = ["canned/boosted_trees_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_oss",
-        "notap",
-        "optonly",
-    ],
-    deps = [
-        ":boosted_trees",
-        ":inputs",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "boosted_trees_utils",
     srcs = ["canned/boosted_trees_utils.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":head",
         ":model_fn",
         "//tensorflow:tensorflow_py_no_contrib",
     ],
 )
 
-py_test(
-    name = "boosted_trees_utils_test",
-    size = "medium",
-    srcs = ["canned/boosted_trees_utils_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = [
-        "optonly",
-    ],
-    deps = [
-        ":boosted_trees",
-        ":inputs",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_library(
     name = "dnn",
     srcs = ["canned/dnn.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":head",
         ":model_fn",
         ":optimizers",
@@ -274,6 +157,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":head",
         ":metric_keys",
         ":model_fn",
@@ -286,29 +170,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "dnn_test",
-    size = "medium",
-    srcs = ["canned/dnn_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",  # b/67510291
-    ],
-    deps = [
-        ":dnn",
-        ":dnn_testing_utils",
-        ":export_export",
-        ":numpy_io",
-        ":pandas_io",
-        ":prediction_keys",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "@absl_py//absl/testing:parameterized",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "dnn_linear_combined",
     srcs = ["canned/dnn_linear_combined.py"],
@@ -316,6 +177,7 @@ py_library(
     deps = [
         ":dnn",
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":head",
         ":linear",
         ":model_fn",
@@ -325,30 +187,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "dnn_linear_combined_test",
-    size = "medium",
-    srcs = ["canned/dnn_linear_combined_test.py"],
-    shard_count = 8,
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",  # b/67510291
-    ],
-    deps = [
-        ":dnn_linear_combined",
-        ":dnn_testing_utils",
-        ":export_export",
-        ":linear_testing_utils",
-        ":numpy_io",
-        ":pandas_io",
-        ":prediction_keys",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "@absl_py//absl/testing:parameterized",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "util",
     srcs = [
@@ -356,23 +194,11 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
     ],
 )
 
-py_test(
-    name = "util_test",
-    srcs = ["util_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67510291
-    deps = [
-        ":util",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "estimator",
     srcs = [
@@ -380,6 +206,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":export_export",
         ":model_fn",
         ":run_config",
@@ -390,25 +217,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "estimator_test",
-    srcs = ["estimator_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67510291
-    deps = [
-        ":estimator",
-        ":export_export",
-        ":export_output",
-        ":model_fn",
-        ":numpy_io",
-        ":run_config",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "parsing_utils",
     srcs = [
@@ -416,42 +224,23 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "@six_archive//:six",
     ],
 )
 
-py_test(
-    name = "parsing_utils_test",
-    srcs = ["canned/parsing_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":parsing_utils",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "export_output",
     srcs = ["export/export_output.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "@six_archive//:six",
     ],
 )
 
-py_test(
-    name = "export_output_test",
-    size = "small",
-    srcs = ["export/export_output_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":export_output",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "export",
     srcs = [
@@ -459,6 +248,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":export_export",
         ":export_output",
         "//tensorflow:tensorflow_py_no_contrib",
@@ -472,30 +262,19 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":util",
         "//tensorflow:tensorflow_py_no_contrib",
         "@six_archive//:six",
     ],
 )
 
-py_test(
-    name = "export_test",
-    size = "small",
-    srcs = ["export/export_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":export_export",
-        ":export_output",
-        ":util",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "head",
     srcs = ["canned/head.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":export_output",
         ":metric_keys",
         ":model_fn",
@@ -505,31 +284,12 @@ py_library(
     ],
 )
 
-py_test(
-    name = "head_test",
-    size = "medium",
-    srcs = ["canned/head_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
-        ":dnn_testing_utils",
-        ":head",
-        ":metric_keys",
-        ":model_fn",
-        ":numpy_io",
-        ":prediction_keys",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
-    ],
-)
-
 py_library(
     name = "inputs",
     srcs = ["inputs/inputs.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":numpy_io",
         ":pandas_io",
         "//tensorflow:tensorflow_py_no_contrib",
@@ -542,6 +302,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":head",
         ":optimizers",
         "//tensorflow:tensorflow_py_no_contrib",
@@ -556,6 +317,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":export_export",
         ":linear",
         ":metric_keys",
@@ -567,28 +329,12 @@ py_library(
     ],
 )
 
-py_test(
-    name = "linear_test",
-    size = "medium",
-    srcs = ["canned/linear_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",  # b/67510291
-    ],
-    deps = [
-        ":linear",
-        ":linear_testing_utils",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "metric_keys",
     srcs = ["canned/metric_keys.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":model_fn",
     ],
 )
@@ -598,57 +344,29 @@ py_library(
     srcs = ["inputs/numpy_io.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         ":inputs_queues",
     ],
 )
 
-py_test(
-    name = "numpy_io_test",
-    size = "small",
-    srcs = ["inputs/numpy_io_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":numpy_io",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "optimizers",
     srcs = ["canned/optimizers.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "@six_archive//:six",
     ],
 )
 
-py_test(
-    name = "optimizers_test",
-    size = "small",
-    srcs = ["canned/optimizers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":optimizers",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "pandas_io",
     srcs = ["inputs/pandas_io.py"],
     srcs_version = "PY2AND3",
-    deps = [":inputs_queues"],
-)
-
-py_test(
-    name = "pandas_io_test",
-    size = "small",
-    srcs = ["inputs/pandas_io_test.py"],
-    srcs_version = "PY2AND3",
     deps = [
-        ":pandas_io",
-        "//tensorflow:tensorflow_py_no_contrib",
+        ":expect_tensorflow_estimator_installed",
+        ":inputs_queues",
     ],
 )
 
@@ -656,7 +374,9 @@ py_library(
     name = "prediction_keys",
     srcs = ["canned/prediction_keys.py"],
     srcs_version = "PY2AND3",
-    deps = [],
+    deps = [
+        ":expect_tensorflow_estimator_installed",
+    ],
 )
 
 py_library(
@@ -668,41 +388,19 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":expect_tensorflow_estimator_installed",
         "//tensorflow:tensorflow_py_no_contrib",
         "@six_archive//:six",
     ],
 )
 
-py_test(
-    name = "feeding_functions_test",
-    size = "small",
-    srcs = [
-        "inputs/queues/feeding_functions_test.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":inputs_queues",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
-py_test(
-    name = "feeding_queue_runner_test",
-    size = "small",
-    srcs = ["inputs/queues/feeding_queue_runner_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":inputs_queues",
-        "//tensorflow:tensorflow_py_no_contrib",
-    ],
-)
-
 py_library(
     name = "keras",
     srcs = ["keras.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":estimator",
+        ":expect_tensorflow_estimator_installed",
         ":export_export",
         ":model_fn",
         ":run_config",
@@ -710,61 +408,9 @@ py_library(
     ],
 )
 
-py_test(
-    name = "keras_test",
-    size = "large",
-    srcs = ["keras_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",
-        "notsan",  # b/67510291
-    ],
-    deps = [
-        ":keras",
-        ":numpy_io",
-        ":run_config",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//third_party/py/numpy",
-    ],
-)
-
-py_library(
-    name = "expect_numpy_installed",
-    # This is a dummy rule used as a numpy dependency in open-source.
-    # We expect numpy to already be installed on the system, e.g. via
-    # `pip install numpy`
-    visibility = ["//visibility:public"],
-)
-
-py_library(
-    name = "expect_pandas_installed",
-    # This is a dummy rule used as a numpy dependency in open-source.
-    # We expect pandas to already be installed on the system, e.g. via
-    # `pip install pandas`
-    visibility = ["//visibility:public"],
-)
-
-py_library(
-    name = "expect_h5py_installed",
-    # This is a dummy rule used as a numpy dependency in open-source.
-    # We expect h5py to already be installed on the system, e.g. via
-    # `pip install h5py'
-    visibility = ["//visibility:public"],
-)
-
-py_library(
-    name = "expect_six_installed",
-    # This is a dummy rule used as a numpy dependency in open-source.
-    # We expect six to already be installed on the system, e.g. via
-    # `pip install six`
-    visibility = ["//visibility:public"],
-)
-
 py_library(
-    name = "expect_tensorflow_installed",
-    # This is a dummy rule used as a numpy dependency in open-source.
-    # We expect tensorflow to already be installed on the system, e.g. via
-    # `pip install tensorflow` or `pip install tensorflow_gpu`
+    name = "expect_tensorflow_estimator_installed",
+    # This is a dummy rule used as a dependency in open-source.
+    # We expect tensorflow_estimator to already be installed.
     visibility = ["//visibility:public"],
 )
diff --git a/tensorflow/python/estimator/__init__.py b/tensorflow/python/estimator/__init__.py
index 8cf8df567f0e36604b5c3f6fe992b572d6632954..1e32161fbba36a84d71656bc84d7b6c76939d090 100644
--- a/tensorflow/python/estimator/__init__.py
+++ b/tensorflow/python/estimator/__init__.py
@@ -12,14 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Import Estimator APIs.
+"""estimator python module.
 
-Note: This file is imported by the create_estimator_api genrule. It must
-transitively import all Estimator modules/packages for their @estimator_export
-annotations to generate the public Estimator python API.
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
 """
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import tensorflow.python.estimator.estimator_lib
+from tensorflow_estimator.python import estimator
+
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+estimator.__all__ = [s for s in dir(estimator) if not s.startswith('__')]
+
+from tensorflow_estimator.python.estimator import *
diff --git a/tensorflow/python/estimator/api/BUILD b/tensorflow/python/estimator/api/BUILD
deleted file mode 100644
index a75fa7d0aee56c4fd4faccfaf2fa07c399cedcc9..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/api/BUILD
+++ /dev/null
@@ -1,19 +0,0 @@
-package(
-    default_visibility = [
-        "//tensorflow:internal",
-    ],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-load("//tensorflow/python/tools/api/generator:api_gen.bzl", "gen_api_init_files")
-load("//tensorflow/python/tools/api/generator:api_gen.bzl", "ESTIMATOR_API_INIT_FILES")
-
-gen_api_init_files(
-    name = "estimator_python_api_gen",
-    api_name = "estimator",
-    output_files = ESTIMATOR_API_INIT_FILES,
-    output_package = "tensorflow.python.estimator.api",
-    package = "tensorflow.python.estimator",
-    package_dep = "//tensorflow/python/estimator:estimator_py",
-)
diff --git a/tensorflow/python/estimator/canned/__init__.py b/tensorflow/python/estimator/canned/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d640c8c15a507dbf72801136446f137fc610b879 100644
--- a/tensorflow/python/estimator/canned/__init__.py
+++ b/tensorflow/python/estimator/canned/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""canned python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_estimator.python.estimator import canned
+
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+canned.__all__ = [s for s in dir(canned) if not s.startswith('__')]
+
+from tensorflow_estimator.python.estimator.canned import *
diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py
index 20c7a69b7cb071365e5442b512c1a858a7e0b246..2f0e93d911ccf1b96c1ec48be15d63d0e674560f 100644
--- a/tensorflow/python/estimator/canned/baseline.py
+++ b/tensorflow/python/estimator/canned/baseline.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,365 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Baseline estimators.
+"""baseline python module.
 
-Baseline estimators are bias-only estimators that can be used for debugging
-and as simple baselines.
-
-Example:
-
-```
-# Build BaselineClassifier
-classifier = BaselineClassifier(n_classes=3)
-
-# Input builders
-def input_fn_train(): # returns x, y (where y represents label's class index).
-  pass
-
-def input_fn_eval(): # returns x, y (where y represents label's class index).
-  pass
-
-# Fit model.
-classifier.train(input_fn=input_fn_train)
-
-# Evaluate cross entropy between the test and train labels.
-loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
-
-# predict outputs the probability distribution of the classes as seen in
-# training.
-predictions = classifier.predict(new_samples)
-```
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
 """
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.training import training_util
-from tensorflow.python.util.tf_export import estimator_export
-
-# The default learning rate of 0.3 is a historical artifact of the initial
-# implementation, but seems a reasonable choice.
-_LEARNING_RATE = 0.3
-
-
-def _get_weight_column_key(weight_column):
-  if weight_column is None:
-    return None
-  if isinstance(weight_column, six.string_types):
-    return weight_column
-  if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
-    raise TypeError('Weight column must be either a string or _NumericColumn.'
-                    ' Given type: {}.'.format(type(weight_column)))
-  return weight_column.key()
-
-
-def _baseline_logit_fn_builder(num_outputs, weight_column=None):
-  """Function builder for a baseline logit_fn.
-
-  Args:
-    num_outputs: Number of outputs for the model.
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-       weights. It will be multiplied by the loss of the example.
-  Returns:
-    A logit_fn (see below).
-  """
-
-  def baseline_logit_fn(features):
-    """Baseline model logit_fn.
-
-    The baseline model simply learns a bias, so the output logits are a
-    `Variable` with one weight for each output that learns the bias for the
-    corresponding output.
-
-    Args:
-      features: The first item returned from the `input_fn` passed to `train`,
-        `evaluate`, and `predict`. This should be a single `Tensor` or dict with
-        `Tensor` values.
-    Returns:
-      A `Tensor` representing the logits.
-    """
-    size_checks = []
-    batch_size = None
-
-    weight_column_key = _get_weight_column_key(weight_column)
-
-    # The first dimension is assumed to be a batch size and must be consistent
-    # among all of the features.
-    for key, feature in features.items():
-      # Skip weight_column to ensure we don't add size checks to it.
-      # These would introduce a dependency on the weight at serving time.
-      if key == weight_column_key:
-        continue
-      first_dim = array_ops.shape(feature)[0]
-      if batch_size is None:
-        batch_size = first_dim
-      else:
-        size_checks.append(check_ops.assert_equal(batch_size, first_dim))
-
-    with ops.control_dependencies(size_checks):
-      with variable_scope.variable_scope('baseline'):
-        bias = variable_scope.get_variable('bias', shape=[num_outputs],
-                                           initializer=init_ops.Zeros)
-        return math_ops.multiply(bias, array_ops.ones([batch_size,
-                                                       num_outputs]))
-
-  return baseline_logit_fn
-
-
-def _baseline_model_fn(features, labels, mode, head, optimizer,
-                       weight_column=None, config=None):
-  """Model_fn for baseline models.
-
-  Args:
-    features: `Tensor` or dict of `Tensor` (depends on data passed to `train`).
-    labels: `Tensor` of labels that are compatible with the `Head` instance.
-    mode: Defines whether this is training, evaluation or prediction.
-      See `ModeKeys`.
-    head: A `Head` instance.
-    optimizer: String, `tf.Optimizer` object, or callable that creates the
-      optimizer to use for training. If not specified, will use `FtrlOptimizer`
-      with a default learning rate of 0.3.
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-       weights. It will be multiplied by the loss of the example.
-    config: `RunConfig` object to configure the runtime settings.
-
-  Raises:
-    KeyError: If weight column is specified but not present.
-    ValueError: If features is an empty dictionary.
-
-  Returns:
-    An `EstimatorSpec` instance.
-  """
-  del config  # Unused.
-
-  logit_fn = _baseline_logit_fn_builder(head.logits_dimension, weight_column)
-  logits = logit_fn(features)
-
-  def train_op_fn(loss):
-    opt = optimizers.get_optimizer_instance(
-        optimizer, learning_rate=_LEARNING_RATE)
-    return opt.minimize(loss, global_step=training_util.get_global_step())
-
-  return head.create_estimator_spec(
-      features=features,
-      mode=mode,
-      logits=logits,
-      labels=labels,
-      train_op_fn=train_op_fn)
-
-
-@estimator_export('estimator.BaselineClassifier')
-class BaselineClassifier(estimator.Estimator):
-  """A classifier that can establish a simple baseline.
-
-  This classifier ignores feature values and will learn to predict the average
-  value of each label. For single-label problems, this will predict the
-  probability distribution of the classes as seen in the labels. For multi-label
-  problems, this will predict the fraction of examples that are positive for
-  each class.
-
-  Example:
-
-  ```python
-
-  # Build BaselineClassifier
-  classifier = BaselineClassifier(n_classes=3)
-
-  # Input builders
-  def input_fn_train: # returns x, y (where y represents label's class index).
-    pass
-
-  def input_fn_eval: # returns x, y (where y represents label's class index).
-    pass
-
-  # Fit model.
-  classifier.train(input_fn=input_fn_train)
-
-  # Evaluate cross entropy between the test and train labels.
-  loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
-
-  # predict outputs the probability distribution of the classes as seen in
-  # training.
-  predictions = classifier.predict(new_samples)
-
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-    otherwise there will be a `KeyError`:
-
-  * if `weight_column` is not `None`, a feature with
-     `key=weight_column` whose value is a `Tensor`.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               model_dir=None,
-               n_classes=2,
-               weight_column=None,
-               label_vocabulary=None,
-               optimizer='Ftrl',
-               config=None,
-               loss_reduction=losses.Reduction.SUM):
-    """Initializes a BaselineClassifier instance.
-
-    Args:
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model.
-      n_classes: number of label classes. Default is binary classification.
-        It must be greater than 1. Note: Class labels are integers representing
-        the class index (i.e. values from 0 to n_classes-1). For arbitrary
-        label values (e.g. string labels), convert to class indices first.
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-         weights. It will be multiplied by the loss of the example.
-      label_vocabulary: Optional list of strings with size `[n_classes]`
-        defining the label vocabulary. Only supported for `n_classes` > 2.
-      optimizer: String, `tf.Optimizer` object, or callable that creates the
-        optimizer to use for training. If not specified, will use
-        `FtrlOptimizer` with a default learning rate of 0.3.
-      config: `RunConfig` object to configure the runtime settings.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM`.
-    Returns:
-      A `BaselineClassifier` estimator.
-
-    Raises:
-      ValueError: If `n_classes` < 2.
-    """
-    if n_classes == 2:
-      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
-          weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-    else:
-      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
-          n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-    def _model_fn(features, labels, mode, config):
-      return _baseline_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          optimizer=optimizer,
-          weight_column=weight_column,
-          config=config)
-    super(BaselineClassifier, self).__init__(
-        model_fn=_model_fn,
-        model_dir=model_dir,
-        config=config)
-
-
-@estimator_export('estimator.BaselineRegressor')
-class BaselineRegressor(estimator.Estimator):
-  """A regressor that can establish a simple baseline.
-
-  This regressor ignores feature values and will learn to predict the average
-  value of each label.
-
-  Example:
-
-  ```python
-
-  # Build BaselineRegressor
-  regressor = BaselineRegressor()
-
-  # Input builders
-  def input_fn_train: # returns x, y (where y is the label).
-    pass
-
-  def input_fn_eval: # returns x, y (where y is the label).
-    pass
-
-  # Fit model.
-  regressor.train(input_fn=input_fn_train)
-
-  # Evaluate squared-loss between the test and train targets.
-  loss = regressor.evaluate(input_fn=input_fn_eval)["loss"]
-
-  # predict outputs the mean value seen during training.
-  predictions = regressor.predict(new_samples)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-    otherwise there will be a `KeyError`:
-
-  * if `weight_column` is not `None`, a feature with
-     `key=weight_column` whose value is a `Tensor`.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               model_dir=None,
-               label_dimension=1,
-               weight_column=None,
-               optimizer='Ftrl',
-               config=None,
-               loss_reduction=losses.Reduction.SUM):
-    """Initializes a BaselineRegressor instance.
+from tensorflow_estimator.python.estimator.canned import baseline
 
-    Args:
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model.
-      label_dimension: Number of regression targets per example. This is the
-        size of the last dimension of the labels and logits `Tensor` objects
-        (typically, these have shape `[batch_size, label_dimension]`).
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-         weights. It will be multiplied by the loss of the example.
-      optimizer: String, `tf.Optimizer` object, or callable that creates the
-        optimizer to use for training. If not specified, will use
-        `FtrlOptimizer` with a default learning rate of 0.3.
-      config: `RunConfig` object to configure the runtime settings.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM`.
-    Returns:
-      A `BaselineRegressor` estimator.
-    """
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+baseline.__all__ = [s for s in dir(baseline) if not s.startswith('__')]
 
-    head = head_lib._regression_head(  # pylint: disable=protected-access
-        label_dimension=label_dimension,
-        weight_column=weight_column,
-        loss_reduction=loss_reduction)
-    def _model_fn(features, labels, mode, config):
-      return _baseline_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          optimizer=optimizer,
-          config=config)
-    super(BaselineRegressor, self).__init__(
-        model_fn=_model_fn,
-        model_dir=model_dir,
-        config=config)
+from tensorflow_estimator.python.estimator.canned.baseline import *
diff --git a/tensorflow/python/estimator/canned/baseline_test.py b/tensorflow/python/estimator/canned/baseline_test.py
deleted file mode 100644
index 1df7216ba60e64fdae16138922e3c8a276dcf028..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/canned/baseline_test.py
+++ /dev/null
@@ -1,1558 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for baseline.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import os
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.core.example import example_pb2
-from tensorflow.core.example import feature_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.estimator.canned import baseline
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.estimator.inputs import pandas_io
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import input as input_lib
-from tensorflow.python.training import optimizer
-from tensorflow.python.training import queue_runner
-from tensorflow.python.training import saver
-
-
-try:
-  # pylint: disable=g-import-not-at-top
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
-
-# pylint rules which are disabled by default for test files.
-# pylint: disable=invalid-name,protected-access,missing-docstring
-
-# Names of variables created by model.
-BIAS_NAME = 'baseline/bias'
-
-
-def assert_close(expected, actual, rtol=1e-04, name='assert_close'):
-  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
-    expected = ops.convert_to_tensor(expected, name='expected')
-    actual = ops.convert_to_tensor(actual, name='actual')
-    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
-    rtol = ops.convert_to_tensor(rtol, name='rtol')
-    return check_ops.assert_less(
-        rdiff,
-        rtol,
-        data=('Condition expected =~ actual did not hold element-wise:'
-              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
-              'rtol = ', rtol,),
-        name=scope)
-
-
-def save_variables_to_ckpt(model_dir):
-  init_all_op = [variables.global_variables_initializer()]
-  with tf_session.Session() as sess:
-    sess.run(init_all_op)
-    saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
-
-
-def queue_parsed_features(feature_map):
-  tensors_to_enqueue = []
-  keys = []
-  for key, tensor in six.iteritems(feature_map):
-    keys.append(key)
-    tensors_to_enqueue.append(tensor)
-  queue_dtypes = [x.dtype for x in tensors_to_enqueue]
-  input_queue = data_flow_ops.FIFOQueue(capacity=100, dtypes=queue_dtypes)
-  queue_runner.add_queue_runner(
-      queue_runner.QueueRunner(input_queue,
-                               [input_queue.enqueue(tensors_to_enqueue)]))
-  dequeued_tensors = input_queue.dequeue()
-  return {keys[i]: dequeued_tensors[i] for i in range(len(dequeued_tensors))}
-
-
-def sorted_key_dict(unsorted_dict):
-  return {k: unsorted_dict[k] for k in sorted(unsorted_dict)}
-
-
-def sigmoid(x):
-  return 1 / (1 + np.exp(-1.0 * x))
-
-
-def _baseline_regressor_fn(*args, **kwargs):
-  return baseline.BaselineRegressor(*args, **kwargs)
-
-
-def _baseline_classifier_fn(*args, **kwargs):
-  return baseline.BaselineClassifier(*args, **kwargs)
-
-
-# Tests for Baseline Regressor.
-
-
-# TODO(b/36813849): Add tests with dynamic shape inputs using placeholders.
-class BaselineRegressorEvaluationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_evaluation_for_simple_data(self):
-    with ops.Graph().as_default():
-      variables.Variable([13.0], name=BIAS_NAME)
-      variables.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
-    eval_metrics = baseline_regressor.evaluate(
-        input_fn=lambda: ({'age': ((1,),)}, ((10.,),)), steps=1)
-
-    # Logit is bias = 13, while label is 10. Loss is 3**2 = 9.
-    self.assertDictEqual({
-        metric_keys.MetricKeys.LOSS: 9.,
-        metric_keys.MetricKeys.LOSS_MEAN: 9.,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 13.,
-        metric_keys.MetricKeys.LABEL_MEAN: 10.,
-        ops.GraphKeys.GLOBAL_STEP: 100
-    }, eval_metrics)
-
-  def test_evaluation_batch(self):
-    """Tests evaluation for batch_size==2."""
-    with ops.Graph().as_default():
-      variables.Variable([13.0], name=BIAS_NAME)
-      variables.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
-    eval_metrics = baseline_regressor.evaluate(
-        input_fn=lambda: ({'age': ((1,), (1,))}, ((10.,), (10.,))), steps=1)
-
-    # Logit is bias = 13, while label is 10.
-    # Loss per example is 3**2 = 9.
-    # Training loss is the sum over batch = 9 + 9 = 18
-    # Average loss is the average over batch = 9
-    self.assertDictEqual({
-        metric_keys.MetricKeys.LOSS: 18.,
-        metric_keys.MetricKeys.LOSS_MEAN: 9.,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 13.,
-        metric_keys.MetricKeys.LABEL_MEAN: 10.,
-        ops.GraphKeys.GLOBAL_STEP: 100
-    }, eval_metrics)
-
-  def test_evaluation_weights(self):
-    """Tests evaluation with weights."""
-    with ops.Graph().as_default():
-      variables.Variable([13.0], name=BIAS_NAME)
-      variables.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    def _input_fn():
-      features = {'age': ((1,), (1,)), 'weights': ((1.,), (2.,))}
-      labels = ((10.,), (10.,))
-      return features, labels
-
-    baseline_regressor = _baseline_regressor_fn(
-        weight_column='weights',
-        model_dir=self._model_dir)
-    eval_metrics = baseline_regressor.evaluate(input_fn=_input_fn, steps=1)
-
-    # Logit is bias = 13, while label is 10.
-    # Loss per example is 3**2 = 9.
-    # Training loss is the weighted sum over batch = 9 + 2*9 = 27
-    # average loss is the weighted average = 9 + 2*9 / (1 + 2) = 9
-    self.assertDictEqual({
-        metric_keys.MetricKeys.LOSS: 27.,
-        metric_keys.MetricKeys.LOSS_MEAN: 9.,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 13.,
-        metric_keys.MetricKeys.LABEL_MEAN: 10.,
-        ops.GraphKeys.GLOBAL_STEP: 100
-    }, eval_metrics)
-
-  def test_evaluation_for_multi_dimensions(self):
-    label_dim = 2
-    with ops.Graph().as_default():
-      variables.Variable([46.0, 58.0], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    baseline_regressor = _baseline_regressor_fn(
-        label_dimension=label_dim,
-        model_dir=self._model_dir)
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'age': np.array([[2., 4., 5.]]),
-        },
-        y=np.array([[46., 58.]]),
-        batch_size=1,
-        num_epochs=None,
-        shuffle=False)
-    eval_metrics = baseline_regressor.evaluate(input_fn=input_fn, steps=1)
-
-    self.assertItemsEqual(
-        (metric_keys.MetricKeys.LOSS, metric_keys.MetricKeys.LOSS_MEAN,
-         metric_keys.MetricKeys.PREDICTION_MEAN,
-         metric_keys.MetricKeys.LABEL_MEAN, ops.GraphKeys.GLOBAL_STEP),
-        eval_metrics.keys())
-
-    # Logit is bias which is [46, 58]
-    self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS])
-
-
-class BaselineRegressorPredictTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_1d(self):
-    """Tests predict when all variables are one-dimensional."""
-    with ops.Graph().as_default():
-      variables.Variable([.2], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
-
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': np.array([[2.]])},
-        y=None,
-        batch_size=1,
-        num_epochs=1,
-        shuffle=False)
-    predictions = baseline_regressor.predict(input_fn=predict_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    # x * weight + bias = 2. * 10. + .2 = 20.2
-    self.assertAllClose([[.2]], predicted_scores)
-
-  def testMultiDim(self):
-    """Tests predict when all variables are multi-dimenstional."""
-    batch_size = 2
-    label_dimension = 3
-    with ops.Graph().as_default():
-      variables.Variable(  # shape=[label_dimension]
-          [.2, .4, .6], name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    baseline_regressor = _baseline_regressor_fn(
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-
-    predict_input_fn = numpy_io.numpy_input_fn(
-        # x shape=[batch_size, x_dim]
-        x={'x': np.array([[1., 2., 3., 4.], [5., 6., 7., 8.]])},
-        y=None,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-    predictions = baseline_regressor.predict(input_fn=predict_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    # score = bias, shape=[batch_size, label_dimension]
-    self.assertAllClose([[0.2, 0.4, 0.6], [0.2, 0.4, 0.6]],
-                        predicted_scores)
-
-
-class BaselineRegressorIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
-                          input_dimension, label_dimension, prediction_length):
-    feature_columns = [
-        feature_column_lib.numeric_column('x', shape=(input_dimension,))
-    ]
-    est = _baseline_regressor_fn(
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    # learn y = x
-    est.train(train_input_fn, steps=200)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array(
-        [x['predictions'] for x in est.predict(predict_input_fn)])
-    self.assertAllEqual((prediction_length, label_dimension), predictions.shape)
-
-    # EXPORT
-    feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self):
-    """Tests complete flow with numpy_input_fn."""
-    label_dimension = 2
-    input_dimension = label_dimension
-    batch_size = 10
-    prediction_length = batch_size
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=None,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        label_dimension=label_dimension,
-        prediction_length=prediction_length)
-
-  def test_pandas_input_fn(self):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-
-    # Pandas DataFrame natually supports 1 dim data only.
-    label_dimension = 1
-    input_dimension = label_dimension
-    batch_size = 10
-    data = np.array([1., 2., 3., 4.], dtype=np.float32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(data)
-    prediction_length = 4
-
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x, batch_size=batch_size, shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        label_dimension=label_dimension,
-        prediction_length=prediction_length)
-
-  def test_input_fn_from_parse_example(self):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    label_dimension = 2
-    input_dimension = label_dimension
-    batch_size = 10
-    prediction_length = batch_size
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-
-    serialized_examples = []
-    for datum in data:
-      example = example_pb2.Example(features=feature_pb2.Features(
-          feature={
-              'x':
-                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                      value=datum)),
-              'y':
-                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                      value=datum[:label_dimension])),
-          }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
-    }
-
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=input_dimension,
-        label_dimension=label_dimension,
-        prediction_length=prediction_length)
-
-
-class BaselineRegressorTrainingTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _mock_optimizer(self, expected_loss=None):
-    expected_var_names = [
-        '%s:0' % BIAS_NAME
-    ]
-
-    def _minimize(loss, global_step=None, var_list=None):
-      trainable_vars = var_list or ops.get_collection(
-          ops.GraphKeys.TRAINABLE_VARIABLES)
-      self.assertItemsEqual(expected_var_names,
-                            [var.name for var in trainable_vars])
-
-      # Verify loss. We can't check the value directly, so we add an assert op.
-      self.assertEquals(0, loss.shape.ndims)
-      if expected_loss is None:
-        if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
-        return control_flow_ops.no_op()
-      assert_loss = assert_close(
-          math_ops.to_float(expected_loss, name='expected'),
-          loss,
-          name='assert_loss')
-      with ops.control_dependencies((assert_loss,)):
-        if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
-        return control_flow_ops.no_op()
-
-    mock_optimizer = test.mock.NonCallableMock(
-        spec=optimizer.Optimizer,
-        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
-    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
-
-    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
-    # So, return mock_optimizer itself for deepcopy.
-    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
-    return mock_optimizer
-
-  def _assert_checkpoint(self,
-                         label_dimension,
-                         expected_global_step,
-                         expected_bias=None):
-    shapes = {
-        name: shape
-        for (name, shape) in checkpoint_utils.list_variables(self._model_dir)
-    }
-
-    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(self._model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
-
-    self.assertEqual([label_dimension], shapes[BIAS_NAME])
-    if expected_bias is not None:
-      self.assertEqual(expected_bias,
-                       checkpoint_utils.load_variable(self._model_dir,
-                                                      BIAS_NAME))
-
-  def testFromScratchWithDefaultOptimizer(self):
-    # Create BaselineRegressor.
-    label = 5.
-    age = 17
-    baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
-
-    # Train for a few steps, and validate final checkpoint.
-    num_steps = 10
-    baseline_regressor.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self._assert_checkpoint(label_dimension=1, expected_global_step=num_steps)
-
-  def testTrainWithOneDimLabel(self):
-    label_dimension = 1
-    batch_size = 20
-    est = _baseline_regressor_fn(
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-    data_rank_1 = np.linspace(0., 2., batch_size, dtype=np.float32)
-    self.assertEqual((batch_size,), data_rank_1.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1},
-        y=data_rank_1,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(label_dimension=1, expected_global_step=200)
-
-  def testTrainWithOneDimWeight(self):
-    label_dimension = 1
-    batch_size = 20
-    est = _baseline_regressor_fn(
-        label_dimension=label_dimension,
-        weight_column='w',
-        model_dir=self._model_dir)
-
-    data_rank_1 = np.linspace(0., 2., batch_size, dtype=np.float32)
-    self.assertEqual((batch_size,), data_rank_1.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1,
-           'w': data_rank_1},
-        y=data_rank_1,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(label_dimension=1, expected_global_step=200)
-
-  def testFromScratch(self):
-    # Create BaselineRegressor.
-    label = 5.
-    age = 17
-    # loss = (logits - label)^2 = (0 - 5.)^2 = 25.
-    mock_optimizer = self._mock_optimizer(expected_loss=25.)
-    baseline_regressor = _baseline_regressor_fn(
-        model_dir=self._model_dir,
-        optimizer=mock_optimizer)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    baseline_regressor.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        label_dimension=1,
-        expected_global_step=num_steps,
-        expected_bias=[0.])
-
-  def testFromCheckpoint(self):
-    # Create initial checkpoint.
-    bias = 7.0
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables.Variable([bias], name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # logits = bias = 6.
-    # loss = (logits - label)^2 = (7 - 5)^2 = 4
-    mock_optimizer = self._mock_optimizer(expected_loss=4.)
-    baseline_regressor = _baseline_regressor_fn(
-        model_dir=self._model_dir,
-        optimizer=mock_optimizer)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    baseline_regressor.train(
-        input_fn=lambda: ({'age': ((17,),)}, ((5.,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        label_dimension=1,
-        expected_global_step=initial_global_step + num_steps,
-        expected_bias=[bias])
-
-  def testFromCheckpointMultiBatch(self):
-    # Create initial checkpoint.
-    bias = 5.0
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables.Variable([bias], name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # logits = bias
-    # logits[0] = 5.
-    # logits[1] = 5.
-    # loss = sum(logits - label)^2 = (5 - 5)^2 + (5 - 3)^2 = 4
-    mock_optimizer = self._mock_optimizer(expected_loss=4.)
-    baseline_regressor = _baseline_regressor_fn(
-        model_dir=self._model_dir,
-        optimizer=mock_optimizer)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    baseline_regressor.train(
-        input_fn=lambda: ({'age': ((17,), (15,))}, ((5.,), (3.,))),
-        steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        label_dimension=1,
-        expected_global_step=initial_global_step + num_steps,
-        expected_bias=bias)
-
-
-# Tests for Baseline Classifier.
-
-
-class BaselineClassifierTrainingTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _mock_optimizer(self, expected_loss=None):
-    expected_var_names = [
-        '%s:0' % BIAS_NAME
-    ]
-
-    def _minimize(loss, global_step):
-      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      self.assertItemsEqual(
-          expected_var_names,
-          [var.name for var in trainable_vars])
-
-      # Verify loss. We can't check the value directly, so we add an assert op.
-      self.assertEquals(0, loss.shape.ndims)
-      if expected_loss is None:
-        return state_ops.assign_add(global_step, 1).op
-      assert_loss = assert_close(
-          math_ops.to_float(expected_loss, name='expected'),
-          loss,
-          name='assert_loss')
-      with ops.control_dependencies((assert_loss,)):
-        return state_ops.assign_add(global_step, 1).op
-
-    mock_optimizer = test.mock.NonCallableMock(
-        spec=optimizer.Optimizer,
-        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
-    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
-
-    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
-    # So, return mock_optimizer itself for deepcopy.
-    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
-    return mock_optimizer
-
-  def _assert_checkpoint(
-      self, n_classes, expected_global_step, expected_bias=None):
-    logits_dimension = n_classes if n_classes > 2 else 1
-
-    shapes = {
-        name: shape for (name, shape) in
-        checkpoint_utils.list_variables(self._model_dir)
-    }
-
-    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
-    self.assertEqual(
-        expected_global_step,
-        checkpoint_utils.load_variable(
-            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
-
-    self.assertEqual([logits_dimension], shapes[BIAS_NAME])
-    if expected_bias is not None:
-      self.assertAllEqual(expected_bias,
-                          checkpoint_utils.load_variable(
-                              self._model_dir, BIAS_NAME))
-
-  def _testFromScratchWithDefaultOptimizer(self, n_classes):
-    label = 0
-    age = 17
-    est = baseline.BaselineClassifier(
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # Train for a few steps, and validate final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self._assert_checkpoint(n_classes, num_steps)
-
-  def testBinaryClassesFromScratchWithDefaultOptimizer(self):
-    self._testFromScratchWithDefaultOptimizer(n_classes=2)
-
-  def testMultiClassesFromScratchWithDefaultOptimizer(self):
-    self._testFromScratchWithDefaultOptimizer(n_classes=4)
-
-  def _testTrainWithTwoDimsLabel(self, n_classes):
-    batch_size = 20
-
-    est = baseline.BaselineClassifier(
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    data_rank_1 = np.array([0, 1])
-    data_rank_2 = np.array([[0], [1]])
-    self.assertEqual((2,), data_rank_1.shape)
-    self.assertEqual((2, 1), data_rank_2.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1},
-        y=data_rank_2,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(n_classes, 200)
-
-  def testBinaryClassesTrainWithTwoDimsLabel(self):
-    self._testTrainWithTwoDimsLabel(n_classes=2)
-
-  def testMultiClassesTrainWithTwoDimsLabel(self):
-    self._testTrainWithTwoDimsLabel(n_classes=4)
-
-  def _testTrainWithOneDimLabel(self, n_classes):
-    batch_size = 20
-
-    est = baseline.BaselineClassifier(
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    data_rank_1 = np.array([0, 1])
-    self.assertEqual((2,), data_rank_1.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1},
-        y=data_rank_1,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(n_classes, 200)
-
-  def testBinaryClassesTrainWithOneDimLabel(self):
-    self._testTrainWithOneDimLabel(n_classes=2)
-
-  def testMultiClassesTrainWithOneDimLabel(self):
-    self._testTrainWithOneDimLabel(n_classes=4)
-
-  def _testTrainWithTwoDimsWeight(self, n_classes):
-    batch_size = 20
-
-    est = baseline.BaselineClassifier(
-        weight_column='w',
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    data_rank_1 = np.array([0, 1])
-    data_rank_2 = np.array([[0], [1]])
-    self.assertEqual((2,), data_rank_1.shape)
-    self.assertEqual((2, 1), data_rank_2.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1, 'w': data_rank_2}, y=data_rank_1,
-        batch_size=batch_size, num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(n_classes, 200)
-
-  def testBinaryClassesTrainWithTwoDimsWeight(self):
-    self._testTrainWithTwoDimsWeight(n_classes=2)
-
-  def testMultiClassesTrainWithTwoDimsWeight(self):
-    self._testTrainWithTwoDimsWeight(n_classes=4)
-
-  def _testTrainWithOneDimWeight(self, n_classes):
-    batch_size = 20
-
-    est = baseline.BaselineClassifier(
-        weight_column='w',
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    data_rank_1 = np.array([0, 1])
-    self.assertEqual((2,), data_rank_1.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1, 'w': data_rank_1}, y=data_rank_1,
-        batch_size=batch_size, num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(n_classes, 200)
-
-  def testBinaryClassesTrainWithOneDimWeight(self):
-    self._testTrainWithOneDimWeight(n_classes=2)
-
-  def testMultiClassesTrainWithOneDimWeight(self):
-    self._testTrainWithOneDimWeight(n_classes=4)
-
-  def _testFromScratch(self, n_classes):
-    label = 1
-    age = 17
-    # For binary classifier:
-    #   loss = sigmoid_cross_entropy(logits, label) where logits=0 (weights are
-    #   all zero initially) and label = 1 so,
-    #      loss = 1 * -log ( sigmoid(logits) ) = 0.69315
-    # For multi class classifier:
-    #   loss = cross_entropy(logits, label) where logits are all 0s (weights are
-    #   all zero initially) and label = 1 so,
-    #      loss = 1 * -log ( 1.0 / n_classes )
-    # For this particular test case, as logits are same, the formula
-    # 1 * -log ( 1.0 / n_classes ) covers both binary and multi class cases.
-    mock_optimizer = self._mock_optimizer(
-        expected_loss=-1 * math.log(1.0/n_classes))
-
-    est = baseline.BaselineClassifier(
-        n_classes=n_classes,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        n_classes,
-        expected_global_step=num_steps,
-        expected_bias=[0.] if n_classes == 2 else [.0] * n_classes)
-
-  def testBinaryClassesFromScratch(self):
-    self._testFromScratch(n_classes=2)
-
-  def testMultiClassesFromScratch(self):
-    self._testFromScratch(n_classes=4)
-
-  def _testFromCheckpoint(self, n_classes):
-    # Create initial checkpoint.
-    label = 1
-    age = 17
-    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # For binary classifier:
-    #   logits = bias = -1.
-    #   loss = sigmoid_cross_entropy(logits, label)
-    #   so, loss = 1 * -log ( sigmoid(-1) ) = 1.3133
-    # For multi class classifier:
-    #   loss = cross_entropy(logits, label)
-    #   where logits = bias and label = 1
-    #   so, loss = 1 * -log ( softmax(logits)[1] )
-    if n_classes == 2:
-      expected_loss = 1.3133
-    else:
-      logits = bias
-      logits_exp = np.exp(logits)
-      softmax = logits_exp / logits_exp.sum()
-      expected_loss = -1 * math.log(softmax[label])
-
-    mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
-
-    est = baseline.BaselineClassifier(
-        n_classes=n_classes,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        n_classes,
-        expected_global_step=initial_global_step + num_steps,
-        expected_bias=bias)
-
-  def testBinaryClassesFromCheckpoint(self):
-    self._testFromCheckpoint(n_classes=2)
-
-  def testMultiClassesFromCheckpoint(self):
-    self._testFromCheckpoint(n_classes=4)
-
-  def _testFromCheckpointFloatLabels(self, n_classes):
-    """Tests float labels for binary classification."""
-    # Create initial checkpoint.
-    if n_classes > 2:
-      return
-    label = 0.8
-    age = 17
-    bias = [-1.0]
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # logits = bias = -1.
-    # loss = sigmoid_cross_entropy(logits, label)
-    # => loss = -0.8 * log(sigmoid(-1)) -0.2 * log(sigmoid(+1)) = 1.1132617
-    mock_optimizer = self._mock_optimizer(expected_loss=1.1132617)
-
-    est = baseline.BaselineClassifier(
-        n_classes=n_classes,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-
-  def testBinaryClassesFromCheckpointFloatLabels(self):
-    self._testFromCheckpointFloatLabels(n_classes=2)
-
-  def testMultiClassesFromCheckpointFloatLabels(self):
-    self._testFromCheckpointFloatLabels(n_classes=4)
-
-  def _testFromCheckpointMultiBatch(self, n_classes):
-    # Create initial checkpoint.
-    label = [1, 0]
-    age = [17, 18.5]
-    # For binary case, the expected weight has shape (1,1). For multi class
-    # case, the shape is (1, n_classes). In order to test the weights, set
-    # weights as 2.0 * range(n_classes).
-    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # For binary classifier:
-    #   logits = bias
-    #   logits[0] = -1.
-    #   logits[1] = -1.
-    #   loss = sigmoid_cross_entropy(logits, label)
-    #   so, loss[0] = 1 * -log ( sigmoid(-1) ) = 1.3133
-    #       loss[1] = (1 - 0) * -log ( 1- sigmoid(-1) ) = 0.3132
-    # For multi class classifier:
-    #   loss = cross_entropy(logits, label)
-    #   where logits = bias and label = [1, 0]
-    #   so, loss = 1 * -log ( softmax(logits)[label] )
-    if n_classes == 2:
-      expected_loss = (1.3133 + 0.3132)
-    else:
-      # Expand logits since batch_size=2
-      logits = bias * np.ones(shape=(2, 1))
-      logits_exp = np.exp(logits)
-      softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
-      softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
-      expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
-      expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
-      expected_loss = expected_loss_0 + expected_loss_1
-
-    mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
-
-    est = baseline.BaselineClassifier(
-        n_classes=n_classes,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': (age)}, (label)),
-        steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        n_classes,
-        expected_global_step=initial_global_step + num_steps,
-        expected_bias=bias)
-
-  def testBinaryClassesFromCheckpointMultiBatch(self):
-    self._testFromCheckpointMultiBatch(n_classes=2)
-
-  def testMultiClassesFromCheckpointMultiBatch(self):
-    self._testFromCheckpointMultiBatch(n_classes=4)
-
-
-class BaselineClassifierEvaluationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _test_evaluation_for_simple_data(self, n_classes):
-    label = 1
-    age = 1.
-
-    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
-
-    with ops.Graph().as_default():
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    est = _baseline_classifier_fn(
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    eval_metrics = est.evaluate(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=1)
-
-    if n_classes == 2:
-      # Binary classes: loss = -log(sigmoid(-1)) = 1.3133
-      # Prediction = sigmoid(-1) = 0.2689
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: 1.3133,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: 1.3133,
-          metric_keys.MetricKeys.ACCURACY: 0.,
-          metric_keys.MetricKeys.PRECISION: 0.,
-          metric_keys.MetricKeys.RECALL: 0.,
-          metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
-          metric_keys.MetricKeys.LABEL_MEAN: 1.,
-          metric_keys.MetricKeys.ACCURACY_BASELINE: 1,
-          metric_keys.MetricKeys.AUC: 0.,
-          metric_keys.MetricKeys.AUC_PR: 1.,
-      }
-    else:
-      # Multi classes: loss = 1 * -log ( softmax(logits)[label] )
-      logits = bias
-      logits_exp = np.exp(logits)
-      softmax = logits_exp / logits_exp.sum()
-      expected_loss = -1 * math.log(softmax[label])
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
-          metric_keys.MetricKeys.ACCURACY: 0.,
-      }
-
-    self.assertAllClose(sorted_key_dict(expected_metrics),
-                        sorted_key_dict(eval_metrics), rtol=1e-3)
-
-  def test_binary_classes_evaluation_for_simple_data(self):
-    self._test_evaluation_for_simple_data(n_classes=2)
-
-  def test_multi_classes_evaluation_for_simple_data(self):
-    self._test_evaluation_for_simple_data(n_classes=4)
-
-  def _test_evaluation_batch(self, n_classes):
-    """Tests evaluation for batch_size==2."""
-    label = [1, 0]
-    age = [17., 18.]
-    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    est = _baseline_classifier_fn(
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    eval_metrics = est.evaluate(
-        input_fn=lambda: ({'age': (age)}, (label)), steps=1)
-
-    if n_classes == 2:
-      # Logits are (-1., -1.) labels are (1, 0).
-      # Loss is
-      #   loss for row 1: 1 * -log(sigmoid(-1)) = 1.3133
-      #   loss for row 2: (1 - 0) * -log(1 - sigmoid(-1)) = 0.3132
-      # Prediction = sigmoid(-1) = 0.2689
-      expected_loss = 1.3133 + 0.3132
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
-          metric_keys.MetricKeys.ACCURACY: 0.5,
-          metric_keys.MetricKeys.PRECISION: 0.,
-          metric_keys.MetricKeys.RECALL: 0.,
-          metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
-          metric_keys.MetricKeys.LABEL_MEAN: 0.5,
-          metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
-          metric_keys.MetricKeys.AUC: 0.5,
-          metric_keys.MetricKeys.AUC_PR: 0.75,
-      }
-    else:
-      # Expand logits since batch_size=2
-      logits = bias * np.ones(shape=(2, 1))
-      logits_exp = np.exp(logits)
-      softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
-      softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
-      expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
-      expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
-      expected_loss = expected_loss_0 + expected_loss_1
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
-          metric_keys.MetricKeys.ACCURACY: 0.5,
-      }
-
-    self.assertAllClose(sorted_key_dict(expected_metrics),
-                        sorted_key_dict(eval_metrics), rtol=1e-3)
-
-  def test_binary_classes_evaluation_batch(self):
-    self._test_evaluation_batch(n_classes=2)
-
-  def test_multi_classes_evaluation_batch(self):
-    self._test_evaluation_batch(n_classes=4)
-
-  def _test_evaluation_weights(self, n_classes):
-    """Tests evaluation with weights."""
-
-    label = [1, 0]
-    age = [17., 18.]
-    weights = [1., 2.]
-    # For binary case, the expected weight has shape (1,1). For multi class
-    # case, the shape is (1, n_classes). In order to test the weights, set
-    # weights as 2.0 * range(n_classes).
-    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(
-          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    est = _baseline_classifier_fn(
-        n_classes=n_classes,
-        weight_column='w',
-        model_dir=self._model_dir)
-    eval_metrics = est.evaluate(
-        input_fn=lambda: ({'age': (age), 'w': (weights)}, (label)), steps=1)
-
-    if n_classes == 2:
-      # Logits are (-1., -1.) labels are (1, 0).
-      # Loss is
-      #   loss for row 1: 1 * -log(sigmoid(-1)) = 1.3133
-      #   loss for row 2: (1 - 0) * -log(1 - sigmoid(-1)) = 0.3132
-      #   weights = [1., 2.]
-      expected_loss = 1.3133 * 1. + 0.3132 * 2.
-      loss_mean = expected_loss / (1.0 + 2.0)
-      label_mean = np.average(label, weights=weights)
-      logits = [-1, -1]
-      logistics = sigmoid(np.array(logits))
-      predictions_mean = np.average(logistics, weights=weights)
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
-          metric_keys.MetricKeys.ACCURACY: 2. / (1. + 2.),
-          metric_keys.MetricKeys.PRECISION: 0.,
-          metric_keys.MetricKeys.RECALL: 0.,
-          metric_keys.MetricKeys.PREDICTION_MEAN: predictions_mean,
-          metric_keys.MetricKeys.LABEL_MEAN: label_mean,
-          metric_keys.MetricKeys.ACCURACY_BASELINE: (
-              max(label_mean, 1-label_mean)),
-          metric_keys.MetricKeys.AUC: 0.5,
-          metric_keys.MetricKeys.AUC_PR: 2. / (1. + 2.),
-      }
-    else:
-      # Multi classes: unweighted_loss = 1 * -log ( soft_max(logits)[label] )
-      # Expand logits since batch_size=2
-      logits = bias * np.ones(shape=(2, 1))
-      logits_exp = np.exp(logits)
-      softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
-      softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
-      expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
-      expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
-      loss_mean = np.average([expected_loss_0, expected_loss_1],
-                             weights=weights)
-      expected_loss = loss_mean * np.sum(weights)
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
-          metric_keys.MetricKeys.ACCURACY: 2. / (1. + 2.),
-      }
-
-    self.assertAllClose(sorted_key_dict(expected_metrics),
-                        sorted_key_dict(eval_metrics), rtol=1e-3)
-
-  def test_binary_classes_evaluation_weights(self):
-    self._test_evaluation_weights(n_classes=2)
-
-  def test_multi_classes_evaluation_weights(self):
-    self._test_evaluation_weights(n_classes=4)
-
-
-class BaselineClassifierPredictTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _testPredictions(self, n_classes, label_vocabulary, label_output_fn):
-    """Tests predict when all variables are one-dimensional."""
-    age = 1.
-
-    bias = [10.0] if n_classes == 2 else [10.0] * n_classes
-
-    with ops.Graph().as_default():
-      variables.Variable(bias, name=BIAS_NAME)
-      variables.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    est = _baseline_classifier_fn(
-        label_vocabulary=label_vocabulary,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'age': np.array([[age]])},
-        y=None,
-        batch_size=1,
-        num_epochs=1,
-        shuffle=False)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-
-    if n_classes == 2:
-      scalar_logits = bias[0]
-      two_classes_logits = [0, scalar_logits]
-      two_classes_logits_exp = np.exp(two_classes_logits)
-      softmax = two_classes_logits_exp / two_classes_logits_exp.sum()
-
-      expected_predictions = {
-          'class_ids': [1],
-          'classes': [label_output_fn(1)],
-          'logistic': [sigmoid(np.array(scalar_logits))],
-          'logits': [scalar_logits],
-          'probabilities': softmax,
-      }
-    else:
-      onedim_logits = np.array(bias)
-      class_ids = onedim_logits.argmax()
-      logits_exp = np.exp(onedim_logits)
-      softmax = logits_exp / logits_exp.sum()
-      expected_predictions = {
-          'class_ids': [class_ids],
-          'classes': [label_output_fn(class_ids)],
-          'logits': onedim_logits,
-          'probabilities': softmax,
-      }
-
-    self.assertEqual(1, len(predictions))
-    # assertAllClose cannot handle byte type.
-    self.assertEqual(expected_predictions['classes'], predictions[0]['classes'])
-    expected_predictions.pop('classes')
-    predictions[0].pop('classes')
-    self.assertAllClose(sorted_key_dict(expected_predictions),
-                        sorted_key_dict(predictions[0]))
-
-  def testBinaryClassesWithoutLabelVocabulary(self):
-    n_classes = 2
-    self._testPredictions(n_classes,
-                          label_vocabulary=None,
-                          label_output_fn=lambda x: ('%s' % x).encode())
-
-  def testBinaryClassesWithLabelVocabulary(self):
-    n_classes = 2
-    self._testPredictions(
-        n_classes,
-        label_vocabulary=['class_vocab_{}'.format(i)
-                          for i in range(n_classes)],
-        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
-
-  def testMultiClassesWithoutLabelVocabulary(self):
-    n_classes = 4
-    self._testPredictions(
-        n_classes,
-        label_vocabulary=None,
-        label_output_fn=lambda x: ('%s' % x).encode())
-
-  def testMultiClassesWithLabelVocabulary(self):
-    n_classes = 4
-    self._testPredictions(
-        n_classes,
-        label_vocabulary=['class_vocab_{}'.format(i)
-                          for i in range(n_classes)],
-        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
-
-
-class BaselineClassifierIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(self, n_classes, train_input_fn, eval_input_fn,
-                          predict_input_fn, input_dimension, prediction_length):
-    feature_columns = [
-        feature_column_lib.numeric_column('x', shape=(input_dimension,))
-    ]
-    est = _baseline_classifier_fn(
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    # learn y = x
-    est.train(train_input_fn, steps=200)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array(
-        [x['classes'] for x in est.predict(predict_input_fn)])
-    self.assertAllEqual((prediction_length, 1), predictions.shape)
-
-    # EXPORT
-    feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def _test_numpy_input_fn(self, n_classes):
-    """Tests complete flow with numpy_input_fn."""
-    input_dimension = 4
-    batch_size = 10
-    prediction_length = batch_size
-    data = np.linspace(0., 2., batch_size * input_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, input_dimension)
-    target = np.array([1] * batch_size)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=target,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=target,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=None,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-
-    self._test_complete_flow(
-        n_classes=n_classes,
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        prediction_length=prediction_length)
-
-  def test_binary_classes_numpy_input_fn(self):
-    self._test_numpy_input_fn(n_classes=2)
-
-  def test_multi_classes_numpy_input_fn(self):
-    self._test_numpy_input_fn(n_classes=4)
-
-  def _test_pandas_input_fn(self, n_classes):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-
-    # Pandas DataFrame natually supports 1 dim data only.
-    input_dimension = 1
-    batch_size = 10
-    data = np.array([1., 2., 3., 4.], dtype=np.float32)
-    target = np.array([1, 0, 1, 0], dtype=np.int32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(target)
-    prediction_length = 4
-
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x, batch_size=batch_size, shuffle=False)
-
-    self._test_complete_flow(
-        n_classes=n_classes,
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        prediction_length=prediction_length)
-
-  def test_binary_classes_pandas_input_fn(self):
-    self._test_pandas_input_fn(n_classes=2)
-
-  def test_multi_classes_pandas_input_fn(self):
-    self._test_pandas_input_fn(n_classes=4)
-
-  def _test_input_fn_from_parse_example(self, n_classes):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    input_dimension = 2
-    batch_size = 10
-    prediction_length = batch_size
-    data = np.linspace(0., 2., batch_size * input_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, input_dimension)
-    target = np.array([1] * batch_size, dtype=np.int64)
-
-    serialized_examples = []
-    for x, y in zip(data, target):
-      example = example_pb2.Example(features=feature_pb2.Features(
-          feature={
-              'x':
-                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                      value=x)),
-              'y':
-                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                      value=[y])),
-          }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([1], dtypes.int64),
-    }
-
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        n_classes=n_classes,
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=input_dimension,
-        prediction_length=prediction_length)
-
-  def test_binary_classes_input_fn_from_parse_example(self):
-    self._test_input_fn_from_parse_example(n_classes=2)
-
-  def test_multi_classes_input_fn_from_parse_example(self):
-    self._test_input_fn_from_parse_example(n_classes=4)
-
-
-# Tests for Baseline logit_fn.
-
-
-class BaselineLogitFnTest(test.TestCase):
-
-  def test_basic_logit_correctness(self):
-    """baseline_logit_fn simply returns the bias variable."""
-    with ops.Graph().as_default():
-      logit_fn = baseline._baseline_logit_fn_builder(num_outputs=2)
-      logits = logit_fn(features={'age': [[23.], [31.]]})
-      with variable_scope.variable_scope('baseline', reuse=True):
-        bias_var = variable_scope.get_variable('bias')
-      with tf_session.Session() as sess:
-        sess.run([variables.global_variables_initializer()])
-        self.assertAllClose([[0., 0.], [0., 0.]], logits.eval())
-        sess.run(bias_var.assign([10., 5.]))
-        self.assertAllClose([[10., 5.], [10., 5.]], logits.eval())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 0278990cfc9bfcf18912886aa752da262e805573..19c0b1dab73ee602c828ec29f949bcde35aa8645 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -12,1553 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Estimator classes for BoostedTrees."""
+"""boosted_trees python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
-import collections
-import functools
-
-import numpy as np
-
-from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.canned import boosted_trees_utils
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import boosted_trees_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.array_ops import identity as tf_identity
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.summary import summary
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util
-from tensorflow.python.util.tf_export import estimator_export
-
-# TODO(nponomareva): Reveal pruning params here.
-_TreeHParams = collections.namedtuple('TreeHParams', [
-    'n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity',
-    'min_node_weight', 'center_bias', 'pruning_mode'
-])
-
-_HOLD_FOR_MULTI_CLASS_SUPPORT = object()
-_HOLD_FOR_MULTI_DIM_SUPPORT = object()
-_DUMMY_NUM_BUCKETS = -1
-_DUMMY_NODE_ID = -1
-
-
-def _get_transformed_features(features, sorted_feature_columns):
-  """Gets the transformed features from features/feature_columns pair.
-
-  Args:
-    features: a dicionary of name to Tensor.
-    sorted_feature_columns: a list/set of tf.feature_column, sorted by name.
-
-  Returns:
-    result_features: a list of the transformed features, sorted by the name.
-
-  Raises:
-    ValueError: when unsupported features/columns are tried.
-  """
-  # pylint:disable=protected-access
-  transformed_features = feature_column_lib._transform_features(
-      features, sorted_feature_columns)
-  result_features = []
-  for column in sorted_feature_columns:
-    if isinstance(column, feature_column_lib._BucketizedColumn):
-      source_name = column.source_column.name
-      squeezed_tensor = array_ops.squeeze(transformed_features[column], axis=1)
-      if len(squeezed_tensor.shape) > 1:
-        raise ValueError('For now, only supports features equivalent to rank 1 '
-                         'but column `{}` got: {}'.format(
-                             source_name, features[source_name].shape))
-      result_features.append(squeezed_tensor)
-    elif isinstance(column, feature_column_lib._IndicatorColumn):
-      source_name = column.categorical_column.name
-      tensor = math_ops.to_int32(transformed_features[column])
-      if len(tensor.shape) > 2:
-        raise ValueError('Rank of indicator column must be no more than 2, '
-                         'but column `{}` got: {}'.format(
-                             source_name, features[source_name].shape))
-      unstacked = array_ops.unstack(tensor, axis=1)
-      result_features.extend(unstacked)
-    else:
-      raise ValueError(
-          'For now, only bucketized_column and indicator_column is supported '
-          'but got: {}'.format(column))
-    # pylint:enable=protected-access
-
-  return result_features
-
-
-def _local_variable(initial_value, name=None):
-  """Stores a tensor as a local Variable for faster read."""
-  result = variable_scope.variable(
-      initial_value=initial_value,
-      trainable=False,
-      collections=[ops.GraphKeys.LOCAL_VARIABLES],
-      validate_shape=False,
-      name=name)
-  if isinstance(initial_value, ops.Tensor):
-    # Match the resulting variable's shape if the initial_value is a Tensor.
-    result.set_shape(initial_value.shape)
-  return result
-
-
-def _group_features_by_num_buckets(sorted_feature_columns):
-  """Groups feature ids by the number of buckets.
-
-  Derives the feature ids based on iterating through ordered feature columns
-  and groups them by the number of buckets each feature require. Returns a
-  sorted list of buckets and a list of lists of feature ids for each of those
-  buckets.
-
-  Args:
-    sorted_feature_columns: a list/set of tf.feature_column sorted by name.
-
-  Returns:
-    bucket_size_list: a list of required bucket sizes.
-    feature_ids_list: a list of lists of feature ids for each bucket size.
-
-  Raises:
-    ValueError: when unsupported features columns are provided.
-  """
-  bucket_size_to_feature_ids_dict = collections.OrderedDict()
-
-  # TODO(nponomareva) for now we preserve the previous functionality and bucket
-  # all numeric into the same num of buckets. Can be easily changed to using
-  # each numeric's real buckets num, but we need to test that it does not cause
-  # a performance hit.
-
-  # We will replace this dummy key with the real max after we calculate it.
-  bucket_size_to_feature_ids_dict[_DUMMY_NUM_BUCKETS] = []
-
-  max_buckets_for_bucketized = 2
-  max_buckets_for_indicator = 2
-
-  feature_idx = 0
-  # pylint:disable=protected-access
-
-  for column in sorted_feature_columns:
-    if isinstance(column, feature_column_lib._IndicatorColumn):
-      num_categorical_features = column.categorical_column._num_buckets
-      if max_buckets_for_indicator not in bucket_size_to_feature_ids_dict:
-        bucket_size_to_feature_ids_dict[max_buckets_for_indicator] = []
-
-      for _ in range(num_categorical_features):
-        # We use bucket size of 2 for categorical.
-        bucket_size_to_feature_ids_dict[max_buckets_for_indicator].append(
-            feature_idx)
-        feature_idx += 1
-    elif isinstance(column, feature_column_lib._BucketizedColumn):
-      max_buckets_for_bucketized = max(max_buckets_for_bucketized,
-                                       len(column.boundaries) + 1)
-      bucket_size_to_feature_ids_dict[_DUMMY_NUM_BUCKETS].append(feature_idx)
-      feature_idx += 1
-    elif not isinstance(column, feature_column_lib._IndicatorColumn):  # pylint:disable=protected-access
-      raise ValueError(
-          'For now, only bucketized_column and indicator column are supported '
-          'but got: {}'.format(column))
-
-  # pylint:enable=protected-access
-  # Replace the dummy key with the real max num of buckets for all bucketized
-  # columns.
-  if max_buckets_for_bucketized not in bucket_size_to_feature_ids_dict:
-    bucket_size_to_feature_ids_dict[max_buckets_for_bucketized] = []
-  bucket_size_to_feature_ids_dict[max_buckets_for_bucketized].extend(
-      bucket_size_to_feature_ids_dict[_DUMMY_NUM_BUCKETS])
-  del bucket_size_to_feature_ids_dict[_DUMMY_NUM_BUCKETS]
-
-  feature_ids_list = list(bucket_size_to_feature_ids_dict.values())
-  bucket_size_list = list(bucket_size_to_feature_ids_dict.keys())
-  return bucket_size_list, feature_ids_list
-
-
-def _calculate_num_features(sorted_feature_columns):
-  num_features = 0
-  for column in sorted_feature_columns:
-    if isinstance(column, feature_column_lib._IndicatorColumn):  # pylint:disable=protected-access
-      num_features += column.categorical_column._num_buckets  # pylint:disable=protected-access
-    else:
-      num_features += 1
-  return num_features
-
-
-def _generate_feature_name_mapping(sorted_feature_columns):
-  """Return a list of feature name for feature ids.
-
-  Args:
-    sorted_feature_columns: a list/set of tf.feature_column sorted by name.
-
-  Returns:
-    feature_name_mapping: a list of feature names indexed by the feature ids.
-
-  Raises:
-    ValueError: when unsupported features/columns are tried.
-  """
-  names = []
-  for column in sorted_feature_columns:
-    if isinstance(column, feature_column_lib._IndicatorColumn):  # pylint:disable=protected-access
-      categorical_column = column.categorical_column
-      if isinstance(categorical_column,
-                    feature_column_lib._VocabularyListCategoricalColumn):  # pylint:disable=protected-access
-        for value in categorical_column.vocabulary_list:
-          names.append('{}:{}'.format(column.name, value))
-      elif isinstance(categorical_column,
-                      feature_column_lib._BucketizedColumn):  # pylint:disable=protected-access
-        boundaries = [-np.inf] + list(categorical_column.boundaries) + [np.inf]
-        for pair in zip(boundaries[:-1], boundaries[1:]):
-          names.append('{}:{}'.format(column.name, pair))
-      else:
-        for num in range(categorical_column._num_buckets):  # pylint:disable=protected-access
-          names.append('{}:{}'.format(column.name, num))
-    elif isinstance(column, feature_column_lib._BucketizedColumn):
-      names.append(column.name)
-    else:
-      raise ValueError(
-          'For now, only bucketized_column and indicator_column is supported '
-          'but got: {}'.format(column))
-  return names
-
-
-def _cache_transformed_features(features, sorted_feature_columns, batch_size):
-  """Transform features and cache, then returns (cached_features, cache_op)."""
-  num_features = _calculate_num_features(sorted_feature_columns)
-  cached_features = [
-      _local_variable(
-          array_ops.zeros([batch_size], dtype=dtypes.int32),
-          name='cached_feature_{}'.format(i)) for i in range(num_features)
-  ]
-  are_features_cached = _local_variable(False, name='are_features_cached')
-
-  def cache_features_and_return():
-    """Caches transformed features.
-
-    The intention is to hide get_transformed_features() from the graph by
-    caching the result except the first step, since bucketize operation
-    (inside get_transformed_features) is expensive.
-
-    Returns:
-      input_feature_list: a list of input features.
-      cache_flip_op: op to add to graph to make sure cache update is included to
-          the graph.
-    """
-
-    transformed_features = _get_transformed_features(features,
-                                                     sorted_feature_columns)
-    cached = [
-        state_ops.assign(cached_features[i], transformed_features[i])
-        for i in range(num_features)
-    ]
-    # TODO(youngheek): Try other combination of dependencies so that the
-    # function returns a single result, not a tuple.
-    with ops.control_dependencies(cached):
-      cache_flip_op = are_features_cached.assign(True)
-    return cached, cache_flip_op
-
-  input_feature_list, cache_flip_op = control_flow_ops.cond(
-      are_features_cached, lambda: (cached_features, control_flow_ops.no_op()),
-      cache_features_and_return)
-  return input_feature_list, cache_flip_op
-
-
-class _CacheTrainingStatesUsingHashTable(object):
-  """Caching logits, etc. using MutableHashTable."""
-
-  def __init__(self, example_ids, logits_dimension):
-    """Creates a cache with the given configuration.
-
-    It maintains a MutableDenseHashTable for all values.
-    The API lookup() and insert() would have those specs,
-      tree_ids: shape=[batch_size], dtype=int32
-      node_ids: shape=[batch_size], dtype=int32
-      logits: shape=[batch_size, logits_dimension], dtype=float32
-    However in the MutableDenseHashTable, ids are bitcasted into float32 and
-    all values are concatenated as a single tensor (of float32).
-
-    Hence conversion happens internally before inserting to the HashTable and
-    after lookup from it.
-
-    Args:
-      example_ids: a Rank 1 tensor to be used as a key of the cache.
-      logits_dimension: a constant (int) for the dimension of logits.
-
-    Raises:
-      ValueError: if example_ids is other than int64 or string.
-    """
-    if dtypes.as_dtype(dtypes.int64).is_compatible_with(example_ids.dtype):
-      empty_key = -1 << 62
-    elif dtypes.as_dtype(dtypes.string).is_compatible_with(example_ids.dtype):
-      empty_key = ''
-    else:
-      raise ValueError(
-          'Unsupported example_id_feature dtype %s.' % example_ids.dtype)
-    # Cache holds latest <tree_id, node_id, logits> for each example.
-    # tree_id and node_id are both int32 but logits is a float32.
-    # To reduce the overhead, we store all of them together as float32 and
-    # bitcast the ids to int32.
-    self._table_ref = lookup_ops.mutable_dense_hash_table_v2(
-        empty_key=empty_key, value_dtype=dtypes.float32, value_shape=[3])
-    self._example_ids = ops.convert_to_tensor(example_ids)
-    if self._example_ids.shape.ndims not in (None, 1):
-      raise ValueError(
-          'example_id should have rank 1, but got %s' % self._example_ids)
-    self._logits_dimension = logits_dimension
-
-  def lookup(self):
-    """Returns cached_tree_ids, cached_node_ids, cached_logits."""
-    cached_tree_ids, cached_node_ids, cached_logits = array_ops.split(
-        lookup_ops.lookup_table_find_v2(
-            self._table_ref,
-            self._example_ids,
-            default_value=[0.0, _DUMMY_NODE_ID, 0.0]),
-        [1, 1, self._logits_dimension],
-        axis=1)
-    cached_tree_ids = array_ops.squeeze(
-        array_ops.bitcast(cached_tree_ids, dtypes.int32))
-    cached_node_ids = array_ops.squeeze(
-        array_ops.bitcast(cached_node_ids, dtypes.int32))
-    if self._example_ids.shape.ndims is not None:
-      cached_logits.set_shape(
-          [self._example_ids.shape[0], self._logits_dimension])
-    return (cached_tree_ids, cached_node_ids, cached_logits)
-
-  def insert(self, tree_ids, node_ids, logits):
-    """Inserts values and returns the op."""
-    insert_op = lookup_ops.lookup_table_insert_v2(
-        self._table_ref, self._example_ids,
-        array_ops.concat(
-            [
-                array_ops.expand_dims(
-                    array_ops.bitcast(tree_ids, dtypes.float32), 1),
-                array_ops.expand_dims(
-                    array_ops.bitcast(node_ids, dtypes.float32), 1),
-                logits,
-            ],
-            axis=1,
-            name='value_concat_for_cache_insert'))
-    return insert_op
-
-
-class _CacheTrainingStatesUsingVariables(object):
-  """Caching logits, etc. using Variables."""
-
-  def __init__(self, batch_size, logits_dimension):
-    """Creates a cache with the given configuration.
-
-    It maintains three variables, tree_ids, node_ids, logits, for caching.
-      tree_ids: shape=[batch_size], dtype=int32
-      node_ids: shape=[batch_size], dtype=int32
-      logits: shape=[batch_size, logits_dimension], dtype=float32
-
-    Note, this can be used only with in-memory data setting.
-
-    Args:
-      batch_size: `int`, the size of the cache.
-      logits_dimension: a constant (int) for the dimension of logits.
-    """
-    self._logits_dimension = logits_dimension
-    self._tree_ids = _local_variable(
-        array_ops.zeros([batch_size], dtype=dtypes.int32),
-        name='tree_ids_cache')
-    self._node_ids = _local_variable(
-        _DUMMY_NODE_ID * array_ops.ones([batch_size], dtype=dtypes.int32),
-        name='node_ids_cache')
-    self._logits = _local_variable(
-        array_ops.zeros([batch_size, logits_dimension], dtype=dtypes.float32),
-        name='logits_cache')
-
-  def lookup(self):
-    """Returns cached_tree_ids, cached_node_ids, cached_logits."""
-    return (self._tree_ids, self._node_ids, self._logits)
-
-  def insert(self, tree_ids, node_ids, logits):
-    """Inserts values and returns the op."""
-    return control_flow_ops.group(
-        [
-            self._tree_ids.assign(tree_ids),
-            self._node_ids.assign(node_ids),
-            self._logits.assign(logits)
-        ],
-        name='cache_insert')
-
-
-class _StopAtAttemptsHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at the number of attempts."""
-
-  def __init__(self, num_finalized_trees_tensor, num_attempted_layers_tensor,
-               max_trees, max_depth):
-    self._num_finalized_trees_tensor = num_finalized_trees_tensor
-    self._num_attempted_layers_tensor = num_attempted_layers_tensor
-    self._max_trees = max_trees
-    self._max_depth = max_depth
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs(
-        [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
-
-  def after_run(self, run_context, run_values):
-    # num_* tensors should be retrieved by a separate session than the training
-    # one, in order to read the values after growing.
-    # So, if it's approaching to the limit, get the actual value by additional
-    # session.
-    num_finalized_trees, num_attempted_layers = run_values.results
-    if (num_finalized_trees >= self._max_trees - 1 or
-        num_attempted_layers > 2 * self._max_trees * self._max_depth - 1):
-      num_finalized_trees, num_attempted_layers = run_context.session.run(
-          [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
-    if (num_finalized_trees >= self._max_trees or
-        num_attempted_layers > 2 * self._max_trees * self._max_depth):
-      run_context.request_stop()
-
-
-def _get_max_splits(tree_hparams):
-  """Calculates the max possible number of splits based on tree params."""
-  # maximum number of splits possible in the whole tree =2^(D-1)-1
-  max_splits = (1 << tree_hparams.max_depth) - 1
-  return max_splits
-
-
-class _EnsembleGrower(object):
-  """Abstract base class for different types of ensemble growers.
-
-  Use it to receive training ops for growing and centering bias, depending
-  on the implementation (for example, in memory or accumulator-based
-  distributed):
-    grower = ...create subclass grower(tree_ensemble, tree_hparams)
-    grow_op = grower.grow_tree(stats_summaries_list, feature_ids_list,
-                               last_layer_nodes_range)
-    training_ops.append(grow_op)
-  """
-
-  def __init__(self, tree_ensemble, tree_hparams, feature_ids_list):
-    """Initializes a grower object.
-
-    Args:
-      tree_ensemble: A TreeEnsemble variable.
-      tree_hparams: TODO. collections.namedtuple for hyper parameters.
-      feature_ids_list: a list of lists of feature ids for each bucket size.
-
-    Raises:
-      ValueError: when pruning mode is invalid or pruning is used and no tree
-      complexity is set.
-    """
-    self._tree_ensemble = tree_ensemble
-    self._tree_hparams = tree_hparams
-    self._feature_ids_list = feature_ids_list
-    # pylint: disable=protected-access
-    self._pruning_mode_parsed = boosted_trees_ops.PruningMode.from_str(
-        tree_hparams.pruning_mode)
-
-    if tree_hparams.tree_complexity > 0:
-      if self._pruning_mode_parsed == boosted_trees_ops.PruningMode.NO_PRUNING:
-        raise ValueError(
-            'Tree complexity have no effect unless pruning mode is chosen.')
-    else:
-      if self._pruning_mode_parsed != boosted_trees_ops.PruningMode.NO_PRUNING:
-        raise ValueError('For pruning, tree_complexity must be positive.')
-    # pylint: enable=protected-access
-
-  @abc.abstractmethod
-  def center_bias(self, center_bias_var, gradients, hessians):
-    """Centers bias, if ready, based on statistics.
-
-    Args:
-      center_bias_var: A variable that will be updated when bias centering
-        finished.
-      gradients: A rank 2 tensor of gradients.
-      hessians: A rank 2 tensor of hessians.
-
-    Returns:
-      An operation for centering bias.
-    """
-
-  @abc.abstractmethod
-  def grow_tree(self, stats_summaries_list, last_layer_nodes_range):
-    """Grows a tree, if ready, based on provided statistics.
-
-    Args:
-      stats_summaries_list: List of stats summary tensors, representing sums of
-        gradients and hessians for each feature bucket.
-      last_layer_nodes_range: A tensor representing ids of the nodes in the
-        current layer, to be split.
-
-    Returns:
-      An op for growing a tree.
-    """
-
-  def chief_init_op(self):
-    """Ops that chief needs to run to initialize the state."""
-    return control_flow_ops.no_op()
-
-  #  ============= Helper methods ===========
-
-  def _center_bias_fn(self, center_bias_var, mean_gradients, mean_hessians):
-    """Updates the ensembles and cache (if needed) with logits prior."""
-    continue_centering = boosted_trees_ops.center_bias(
-        self._tree_ensemble.resource_handle,
-        mean_gradients=mean_gradients,
-        mean_hessians=mean_hessians,
-        l1=self._tree_hparams.l1,
-        l2=self._tree_hparams.l2)
-    return center_bias_var.assign(continue_centering)
-
-  def _grow_tree_from_stats_summaries(self, stats_summaries_list,
-                                      last_layer_nodes_range):
-    """Updates ensemble based on the best gains from stats summaries."""
-    node_ids_per_feature = []
-    gains_list = []
-    thresholds_list = []
-    left_node_contribs_list = []
-    right_node_contribs_list = []
-    all_feature_ids = []
-    assert len(stats_summaries_list) == len(self._feature_ids_list)
-
-    max_splits = _get_max_splits(self._tree_hparams)
-
-    for i, feature_ids in enumerate(self._feature_ids_list):
-      (numeric_node_ids_per_feature, numeric_gains_list,
-       numeric_thresholds_list, numeric_left_node_contribs_list,
-       numeric_right_node_contribs_list) = (
-           boosted_trees_ops.calculate_best_gains_per_feature(
-               node_id_range=last_layer_nodes_range,
-               stats_summary_list=stats_summaries_list[i],
-               l1=self._tree_hparams.l1,
-               l2=self._tree_hparams.l2,
-               tree_complexity=self._tree_hparams.tree_complexity,
-               min_node_weight=self._tree_hparams.min_node_weight,
-               max_splits=max_splits))
-
-      all_feature_ids += feature_ids
-      node_ids_per_feature += numeric_node_ids_per_feature
-      gains_list += numeric_gains_list
-      thresholds_list += numeric_thresholds_list
-      left_node_contribs_list += numeric_left_node_contribs_list
-      right_node_contribs_list += numeric_right_node_contribs_list
-
-    grow_op = boosted_trees_ops.update_ensemble(
-        # Confirm if local_tree_ensemble or tree_ensemble should be used.
-        self._tree_ensemble.resource_handle,
-        feature_ids=all_feature_ids,
-        node_ids=node_ids_per_feature,
-        gains=gains_list,
-        thresholds=thresholds_list,
-        left_node_contribs=left_node_contribs_list,
-        right_node_contribs=right_node_contribs_list,
-        learning_rate=self._tree_hparams.learning_rate,
-        max_depth=self._tree_hparams.max_depth,
-        pruning_mode=self._pruning_mode_parsed)
-    return grow_op
-
-
-class _InMemoryEnsembleGrower(_EnsembleGrower):
-  """An in-memory ensemble grower."""
-
-  def __init__(self, tree_ensemble, tree_hparams, feature_ids_list):
-
-    super(_InMemoryEnsembleGrower, self).__init__(
-        tree_ensemble=tree_ensemble, tree_hparams=tree_hparams,
-        feature_ids_list=feature_ids_list)
-
-  def center_bias(self, center_bias_var, gradients, hessians):
-    # For in memory, we already have a full batch of gradients and hessians,
-    # so just take a mean and proceed with centering.
-    mean_gradients = array_ops.expand_dims(
-        math_ops.reduce_mean(gradients, 0), 0)
-    mean_heassians = array_ops.expand_dims(math_ops.reduce_mean(hessians, 0), 0)
-    return self._center_bias_fn(center_bias_var, mean_gradients, mean_heassians)
-
-  def grow_tree(self, stats_summaries_list, last_layer_nodes_range):
-    # For in memory, we already have full data in one batch, so we can grow the
-    # tree immediately.
-    return self._grow_tree_from_stats_summaries(
-        stats_summaries_list, last_layer_nodes_range)
-
-
-class _AccumulatorEnsembleGrower(_EnsembleGrower):
-  """An accumulator based ensemble grower."""
-
-  def __init__(self, tree_ensemble, tree_hparams, stamp_token,
-               n_batches_per_layer, bucket_size_list, is_chief, center_bias,
-               feature_ids_list):
-    super(_AccumulatorEnsembleGrower, self).__init__(
-        tree_ensemble=tree_ensemble, tree_hparams=tree_hparams,
-        feature_ids_list=feature_ids_list)
-    self._stamp_token = stamp_token
-    self._n_batches_per_layer = n_batches_per_layer
-    self._bucket_size_list = bucket_size_list
-    self._is_chief = is_chief
-    self._growing_accumulators = []
-    self._chief_init_ops = []
-    max_splits = _get_max_splits(self._tree_hparams)
-    for i, feature_ids in enumerate(self._feature_ids_list):
-      accumulator = data_flow_ops.ConditionalAccumulator(
-          dtype=dtypes.float32,
-          # The stats consist of grads and hessians (the last dimension).
-          shape=[len(feature_ids), max_splits, self._bucket_size_list[i], 2],
-          shared_name='numeric_stats_summary_accumulator_' + str(i))
-      self._chief_init_ops.append(
-          accumulator.set_global_step(self._stamp_token))
-      self._growing_accumulators.append(accumulator)
-    self._center_bias = center_bias
-    if center_bias:
-      self._bias_accumulator = data_flow_ops.ConditionalAccumulator(
-          dtype=dtypes.float32,
-          # The stats consist of grads and hessians means only.
-          # TODO(nponomareva): this will change for a multiclass
-          shape=[2, 1],
-          shared_name='bias_accumulator')
-      self._chief_init_ops.append(
-          self._bias_accumulator.set_global_step(self._stamp_token))
-
-  def center_bias(self, center_bias_var, gradients, hessians):
-    # For not in memory situation, we need to accumulate enough of batches first
-    # before proceeding with centering bias.
-
-    # Create an accumulator.
-    if not self._center_bias:
-      raise RuntimeError('center_bias called but bias centering is disabled.')
-    bias_dependencies = []
-    grads_and_hess = array_ops.stack([gradients, hessians], axis=0)
-    grads_and_hess = math_ops.reduce_mean(grads_and_hess, axis=1)
-
-    apply_grad = self._bias_accumulator.apply_grad(
-        grads_and_hess, self._stamp_token)
-    bias_dependencies.append(apply_grad)
-
-    # Center bias if enough batches were processed.
-    with ops.control_dependencies(bias_dependencies):
-      if not self._is_chief:
-        return control_flow_ops.no_op()
-      def _set_accumulators_stamp():
-        return control_flow_ops.group(
-            [acc.set_global_step(self._stamp_token + 1) for acc in
-             self._growing_accumulators])
-
-      def center_bias_from_accumulator():
-        accumulated = array_ops.unstack(self._bias_accumulator.take_grad(1),
-                                        axis=0)
-        center_bias_op = self._center_bias_fn(
-            center_bias_var,
-            array_ops.expand_dims(accumulated[0], 0),
-            array_ops.expand_dims(accumulated[1], 0))
-        with ops.control_dependencies([center_bias_op]):
-          return control_flow_ops.cond(center_bias_var,
-                                       control_flow_ops.no_op,
-                                       _set_accumulators_stamp)
-
-      center_bias_op = control_flow_ops.cond(
-          math_ops.greater_equal(self._bias_accumulator.num_accumulated(),
-                                 self._n_batches_per_layer),
-          center_bias_from_accumulator,
-          control_flow_ops.no_op,
-          name='wait_until_n_batches_for_bias_accumulated')
-      return center_bias_op
-
-  def grow_tree(self, stats_summaries_list, last_layer_nodes_range):
-    dependencies = []
-    for i in range(len(self._feature_ids_list)):
-      stats_summaries = stats_summaries_list[i]
-      apply_grad = self._growing_accumulators[i].apply_grad(
-          array_ops.stack(stats_summaries, axis=0), self._stamp_token)
-      dependencies.append(apply_grad)
-
-    # Grow the tree if enough batches is accumulated.
-    with ops.control_dependencies(dependencies):
-      if not self._is_chief:
-        return control_flow_ops.no_op()
-
-      min_accumulated = math_ops.reduce_min(
-          array_ops.stack([acc.num_accumulated() for acc in
-                           self._growing_accumulators]))
-
-      def grow_tree_from_accumulated_summaries_fn():
-        """Updates tree with the best layer from accumulated summaries."""
-        # Take out the accumulated summaries from the accumulator and grow.
-        stats_summaries_list = []
-        stats_summaries_list = [
-            array_ops.unstack(accumulator.take_grad(1), axis=0)
-            for accumulator in self._growing_accumulators
-        ]
-        grow_op = self._grow_tree_from_stats_summaries(
-            stats_summaries_list, last_layer_nodes_range
-        )
-        return grow_op
-
-      grow_model = control_flow_ops.cond(
-          math_ops.greater_equal(min_accumulated, self._n_batches_per_layer),
-          grow_tree_from_accumulated_summaries_fn,
-          control_flow_ops.no_op,
-          name='wait_until_n_batches_accumulated')
-      return grow_model
-
-  def chief_init_op(self):
-    """Ops that chief needs to run to initialize the state."""
-    return control_flow_ops.group(self._chief_init_ops)
-
-
-def _bt_model_fn(
-    features,
-    labels,
-    mode,
-    head,
-    feature_columns,
-    tree_hparams,
-    n_batches_per_layer,
-    config,
-    closed_form_grad_and_hess_fn=None,
-    example_id_column_name=None,
-    # TODO(youngheek): replace this later using other options.
-    train_in_memory=False,
-    name='boosted_trees'):
-  """Gradient Boosted Trees model_fn.
-
-  Args:
-    features: dict of `Tensor`.
-    labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of
-      dtype `int32` or `int64` in the range `[0, n_classes)`.
-    mode: Defines whether this is training, evaluation or prediction.
-      See `ModeKeys`.
-    head: A `head_lib._Head` instance.
-    feature_columns: Iterable of `feature_column._FeatureColumn` model inputs.
-    tree_hparams: TODO. collections.namedtuple for hyper parameters.
-    n_batches_per_layer: A `Tensor` of `int64`. Each layer is built after at
-      least n_batches_per_layer accumulations.
-    config: `RunConfig` object to configure the runtime settings.
-    closed_form_grad_and_hess_fn: a function that accepts logits and labels
-      and returns gradients and hessians. By default, they are created by
-      tf.gradients() from the loss.
-    example_id_column_name: Name of the feature for a unique ID per example.
-      Currently experimental -- not exposed to public API.
-    train_in_memory: `bool`, when true, it assumes the dataset is in memory,
-      i.e., input_fn should return the entire dataset as a single batch, and
-      also n_batches_per_layer should be set as 1.
-    name: Name to use for the model.
-
-  Returns:
-      An `EstimatorSpec` instance.
-
-  Raises:
-    ValueError: mode or params are invalid, or features has the wrong type.
-  """
-  sorted_feature_columns = sorted(feature_columns, key=lambda tc: tc.name)
-  with ops.name_scope(name) as name:
-    # Prepare.
-    global_step = training_util.get_or_create_global_step()
-    bucket_size_list, feature_ids_list = _group_features_by_num_buckets(
-        sorted_feature_columns)
-    # Create Ensemble resources.
-    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-
-    # Create logits.
-    if mode != model_fn_lib.ModeKeys.TRAIN:
-      input_feature_list = _get_transformed_features(features,
-                                                     sorted_feature_columns)
-      logits = boosted_trees_ops.predict(
-          # For non-TRAIN mode, ensemble doesn't change after initialization,
-          # so no local copy is needed; using tree_ensemble directly.
-          tree_ensemble_handle=tree_ensemble.resource_handle,
-          bucketized_features=input_feature_list,
-          logits_dimension=head.logits_dimension)
-      return head.create_estimator_spec(
-          features=features,
-          mode=mode,
-          labels=labels,
-          train_op_fn=control_flow_ops.no_op,
-          logits=logits)
-
-    # ============== Training graph ==============
-    center_bias = tree_hparams.center_bias
-    is_single_machine = (config.num_worker_replicas <= 1)
-
-    if train_in_memory:
-      assert n_batches_per_layer == 1, (
-          'When train_in_memory is enabled, input_fn should return the entire '
-          'dataset as a single batch, and n_batches_per_layer should be set as '
-          '1.')
-      if (not config.is_chief or config.num_worker_replicas > 1 or
-          config.num_ps_replicas > 0):
-        raise ValueError('train_in_memory is supported only for '
-                         'non-distributed training.')
-    worker_device = control_flow_ops.no_op().device
-    train_op = []
-    # Extract input features and set up cache for training.
-    training_state_cache = None
-    if train_in_memory:
-      # cache transformed features as well for in-memory training.
-      batch_size = array_ops.shape(labels)[0]
-      input_feature_list, input_cache_op = (
-          _cache_transformed_features(features, sorted_feature_columns,
-                                      batch_size))
-      train_op.append(input_cache_op)
-      training_state_cache = _CacheTrainingStatesUsingVariables(
-          batch_size, head.logits_dimension)
-    else:
-      input_feature_list = _get_transformed_features(features,
-                                                     sorted_feature_columns)
-      if example_id_column_name:
-        example_ids = features[example_id_column_name]
-        training_state_cache = _CacheTrainingStatesUsingHashTable(
-            example_ids, head.logits_dimension)
-    if training_state_cache:
-      cached_tree_ids, cached_node_ids, cached_logits = (
-          training_state_cache.lookup())
-    else:
-      # Always start from the beginning when no cache is set up.
-      batch_size = array_ops.shape(labels)[0]
-      cached_tree_ids, cached_node_ids, cached_logits = (
-          array_ops.zeros([batch_size], dtype=dtypes.int32),
-          _DUMMY_NODE_ID * array_ops.ones([batch_size], dtype=dtypes.int32),
-          array_ops.zeros(
-              [batch_size, head.logits_dimension], dtype=dtypes.float32))
-
-    if is_single_machine:
-      local_tree_ensemble = tree_ensemble
-      ensemble_reload = control_flow_ops.no_op()
-    else:
-      # Have a local copy of ensemble for the distributed setting.
-      with ops.device(worker_device):
-        local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
-            name=name + '_local', is_local=True)
-      # TODO(soroush): Do partial updates if this becomes a bottleneck.
-      ensemble_reload = local_tree_ensemble.deserialize(
-          *tree_ensemble.serialize())
-    with ops.control_dependencies([ensemble_reload]):
-      (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
-       last_layer_nodes_range) = local_tree_ensemble.get_states()
-      partial_logits, tree_ids, node_ids = boosted_trees_ops.training_predict(
-          tree_ensemble_handle=local_tree_ensemble.resource_handle,
-          cached_tree_ids=cached_tree_ids,
-          cached_node_ids=cached_node_ids,
-          bucketized_features=input_feature_list,
-          logits_dimension=head.logits_dimension)
-    logits = cached_logits + partial_logits
-
-    if train_in_memory:
-      grower = _InMemoryEnsembleGrower(tree_ensemble, tree_hparams,
-                                       feature_ids_list=feature_ids_list)
-    else:
-      grower = _AccumulatorEnsembleGrower(tree_ensemble, tree_hparams,
-                                          stamp_token, n_batches_per_layer,
-                                          bucket_size_list, config.is_chief,
-                                          center_bias=center_bias,
-                                          feature_ids_list=feature_ids_list)
-
-    summary.scalar('ensemble/num_trees', num_trees)
-    summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
-    summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
-
-    # Variable that determines whether bias centering is needed.
-    center_bias_var = variable_scope.variable(
-        initial_value=center_bias, name='center_bias_needed', trainable=False,
-        use_resource=True)
-    # Create training graph.
-    def _train_op_fn(loss):
-      """Run one training iteration."""
-      if training_state_cache:
-        # Cache logits only after center_bias is complete, if it's in progress.
-        train_op.append(
-            control_flow_ops.cond(
-                center_bias_var, control_flow_ops.no_op,
-                lambda: training_state_cache.insert(tree_ids, node_ids, logits))
-        )
-
-      if closed_form_grad_and_hess_fn:
-        gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
-      else:
-        gradients = gradients_impl.gradients(loss, logits, name='Gradients')[0]
-        hessians = gradients_impl.gradients(
-            gradients, logits, name='Hessians')[0]
-
-      # TODO(youngheek): perhaps storage could be optimized by storing stats
-      # with the dimension max_splits_per_layer, instead of max_splits (for the
-      # entire tree).
-      max_splits = _get_max_splits(tree_hparams)
-
-      stats_summaries_list = []
-      for i, feature_ids in enumerate(feature_ids_list):
-        num_buckets = bucket_size_list[i]
-        summaries = [
-            array_ops.squeeze(
-                boosted_trees_ops.make_stats_summary(
-                    node_ids=node_ids,
-                    gradients=gradients,
-                    hessians=hessians,
-                    bucketized_features_list=[input_feature_list[f]],
-                    max_splits=max_splits,
-                    num_buckets=num_buckets),
-                axis=0) for f in feature_ids
-        ]
-        stats_summaries_list.append(summaries)
-      if center_bias:
-        update_model = control_flow_ops.cond(
-            center_bias_var,
-            functools.partial(
-                grower.center_bias,
-                center_bias_var,
-                gradients,
-                hessians,
-            ),
-            functools.partial(grower.grow_tree, stats_summaries_list,
-                              last_layer_nodes_range))
-      else:
-        update_model = grower.grow_tree(stats_summaries_list,
-                                        last_layer_nodes_range)
-      train_op.append(update_model)
-
-      with ops.control_dependencies([update_model]):
-        increment_global = state_ops.assign_add(global_step, 1).op
-        train_op.append(increment_global)
-
-      return control_flow_ops.group(train_op, name='train_op')
-
-  estimator_spec = head.create_estimator_spec(
-      features=features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_train_op_fn,
-      logits=logits)
-
-  # Add an early stop hook.
-  estimator_spec = estimator_spec._replace(
-      training_hooks=estimator_spec.training_hooks +
-      (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
-                           tree_hparams.n_trees, tree_hparams.max_depth),),
-      training_chief_hooks=[GrowerInitializationHook(grower.chief_init_op())] +
-      list(estimator_spec.training_chief_hooks))
-  return estimator_spec
-
-
-class GrowerInitializationHook(session_run_hook.SessionRunHook):
-  """A SessionRunHook handles initialization of `_EnsembleGrower`."""
-
-  def __init__(self, init_op):
-    self._init_op = init_op
-
-  def after_create_session(self, session, coord):
-    session.run(self._init_op)
-
-
-def _create_classification_head(n_classes,
-                                weight_column=None,
-                                label_vocabulary=None):
-  """Creates a classification head. Refer to canned.head for details on args."""
-  # TODO(nponomareva): Support multi-class cases.
-  if n_classes == 2:
-    # pylint: disable=protected-access
-    return head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column=weight_column,
-        label_vocabulary=label_vocabulary,
-        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-    # pylint: enable=protected-access
-  else:
-    raise ValueError('For now only binary classification is supported.'
-                     'n_classes given as {}'.format(n_classes))
-
-
-def _create_classification_head_and_closed_form(n_classes, weight_column,
-                                                label_vocabulary):
-  """Creates a head for classifier and the closed form gradients/hessians."""
-  head = _create_classification_head(n_classes, weight_column, label_vocabulary)
-  if (n_classes == 2 and head.logits_dimension == 1 and
-      weight_column is None and label_vocabulary is None):
-    # Use the closed-form gradients/hessians for 2 class.
-    def _grad_and_hess_for_logloss(logits, labels):
-      """A closed form gradient and hessian for logistic loss."""
-      # TODO(youngheek): add weights handling.
-      predictions = math_ops.reciprocal(math_ops.exp(-logits) + 1.0)
-      normalizer = math_ops.reciprocal(
-          math_ops.cast(array_ops.size(predictions), dtypes.float32))
-      labels = math_ops.cast(labels, dtypes.float32)
-      labels = head_lib._check_dense_labels_match_logits_and_reshape(  # pylint: disable=protected-access
-          labels, logits, head.logits_dimension)
-      gradients = (predictions - labels) * normalizer
-      hessians = predictions * (1.0 - predictions) * normalizer
-      return gradients, hessians
-
-    closed_form = _grad_and_hess_for_logloss
-  else:
-    closed_form = None
-  return (head, closed_form)
-
-
-def _create_regression_head(label_dimension, weight_column=None):
-  if label_dimension != 1:
-    raise ValueError('For now only 1 dimension regression is supported.'
-                     'label_dimension given as {}'.format(label_dimension))
-  # pylint: disable=protected-access
-  return head_lib._regression_head(
-      label_dimension=label_dimension,
-      weight_column=weight_column,
-      loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-  # pylint: enable=protected-access
-
-
-def _compute_feature_importances_per_tree(tree, num_features):
-  """Computes the importance of each feature in the tree."""
-  importances = np.zeros(num_features)
-
-  for node in tree.nodes:
-    node_type = node.WhichOneof('node')
-    if node_type == 'bucketized_split':
-      feature_id = node.bucketized_split.feature_id
-      importances[feature_id] += node.metadata.gain
-    elif node_type == 'leaf':
-      assert node.metadata.gain == 0
-    else:
-      raise ValueError('Unexpected split type %s', node_type)
-
-  return importances
-
-
-def _compute_feature_importances(tree_ensemble, num_features, normalize):
-  """Computes gain-based feature importances.
-
-  The higher the value, the more important the feature.
-
-  Args:
-    tree_ensemble: a trained tree ensemble, instance of proto
-      boosted_trees.TreeEnsemble.
-    num_features: The total number of feature ids.
-    normalize: If True, normalize the feature importances.
-
-  Returns:
-    sorted_feature_idx: A list of feature_id which is sorted
-      by its feature importance.
-    feature_importances: A list of corresponding feature importances.
-
-  Raises:
-    AssertionError: When normalize = True, if feature importances
-      contain negative value, or if normalization is not possible
-      (e.g. ensemble is empty or trees contain only a root node).
-  """
-  tree_importances = [_compute_feature_importances_per_tree(tree, num_features)
-                      for tree in tree_ensemble.trees]
-  tree_importances = np.array(tree_importances)
-  tree_weights = np.array(tree_ensemble.tree_weights).reshape(-1, 1)
-  feature_importances = np.sum(tree_importances * tree_weights, axis=0)
-  if normalize:
-    assert np.all(feature_importances >= 0), ('feature_importances '
-                                              'must be non-negative.')
-    normalizer = np.sum(feature_importances)
-    assert normalizer > 0, 'Trees are all empty or contain only a root node.'
-    feature_importances /= normalizer
-
-  sorted_feature_idx = np.argsort(feature_importances)[::-1]
-  return sorted_feature_idx, feature_importances[sorted_feature_idx]
-
-
-def _bt_explanations_fn(features,
-                        head,
-                        sorted_feature_columns,
-                        name='boosted_trees'):
-  """Gradient Boosted Trees predict with explanations model_fn.
-
-  Args:
-    features: dict of `Tensor`.
-    head: A `head_lib._Head` instance.
-    sorted_feature_columns: Sorted iterable of `feature_column._FeatureColumn`
-      model inputs.
-    name: Name used for the model.
-
-  Returns:
-      An `EstimatorSpec` instance.
-
-  Raises:
-    ValueError: mode or params are invalid, or features has the wrong type.
-  """
-  mode = model_fn_lib.ModeKeys.PREDICT
-  with ops.name_scope(name) as name:
-    # Create Ensemble resources.
-    tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-
-    input_feature_list = _get_transformed_features(features,
-                                                   sorted_feature_columns)
-
-    logits = boosted_trees_ops.predict(
-        # For non-TRAIN mode, ensemble doesn't change after initialization,
-        # so no local copy is needed; using tree_ensemble directly.
-        tree_ensemble_handle=tree_ensemble.resource_handle,
-        bucketized_features=input_feature_list,
-        logits_dimension=head.logits_dimension)
-
-    estimator_spec = head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=None,
-        train_op_fn=control_flow_ops.no_op,
-        logits=logits)
-
-    debug_op = boosted_trees_ops.example_debug_outputs(
-        tree_ensemble.resource_handle,
-        bucketized_features=input_feature_list,
-        logits_dimension=head.logits_dimension)
-    estimator_spec.predictions[boosted_trees_utils._DEBUG_PROTO_KEY] = debug_op  # pylint: disable=protected-access
-    return estimator_spec
-
-
-class _BoostedTreesBase(estimator.Estimator):
-  """Base class for boosted trees estimators.
-
-  This class is intended to keep tree-specific functions (E.g., methods for
-  feature importances and directional feature contributions) in one central
-  place.
-
-  It is not a valid (working) Estimator on its own and should only be used as a
-  base class.
-  """
-
-  def __init__(self, model_fn, model_dir, config, feature_columns, head,
-               center_bias, is_classification):
-    """Initializes a `_BoostedTreesBase` instance.
-
-    Args:
-      model_fn: model_fn: Model function. See base class for more detail.
-      model_dir: Directory to save model parameters, graph and etc. See base
-        class for more detail.
-      config: `estimator.RunConfig` configuration object.
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `FeatureColumn`
-      head: A `head_lib._Head` instance.
-      center_bias: Whether bias centering needs to occur. Bias centering refers
-        to the first node in the very first tree returning the prediction that
-        is aligned with the original labels distribution. For example, for
-        regression problems, the first node will return the mean of the labels.
-        For binary classification problems, it will return a logit for a prior
-        probability of label 1.
-      is_classification: If the estimator is for classification.
-    """
-    super(_BoostedTreesBase, self).__init__(
-        model_fn=model_fn, model_dir=model_dir, config=config)
-    self._sorted_feature_columns = sorted(
-        feature_columns, key=lambda tc: tc.name)
-    self._head = head
-    self._n_features = _calculate_num_features(self._sorted_feature_columns)
-    self._names_for_feature_id = np.array(
-        _generate_feature_name_mapping(self._sorted_feature_columns))
-    self._center_bias = center_bias
-    self._is_classification = is_classification
-
-  def experimental_feature_importances(self, normalize=False):
-    """Computes gain-based feature importances.
-
-    The higher the value, the more important the corresponding feature.
-
-    Args:
-      normalize: If True, normalize the feature importances.
-
-    Returns:
-      sorted_feature_names: 1-D array of feature name which is sorted
-        by its feature importance.
-      feature_importances: 1-D array of the corresponding feature importance.
-
-    Raises:
-      ValueError: When attempting to normalize on an empty ensemble
-        or an ensemble of trees which have no splits. Or when attempting
-        to normalize and feature importances have negative values.
-    """
-    reader = checkpoint_utils.load_checkpoint(self._model_dir)
-    serialized = reader.get_tensor('boosted_trees:0_serialized')
-    if not serialized:
-      raise ValueError('Found empty serialized string for TreeEnsemble.'
-                       'You should only call this method after training.')
-    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-    ensemble_proto.ParseFromString(serialized)
-
-    sorted_feature_id, importances = _compute_feature_importances(
-        ensemble_proto, self._n_features, normalize)
-    return self._names_for_feature_id[sorted_feature_id], importances
-
-  def experimental_predict_with_explanations(self,
-                                             input_fn,
-                                             predict_keys=None,
-                                             hooks=None,
-                                             checkpoint_path=None):
-    """Computes model explainability outputs per example along with predictions.
-
-    Currently supports directional feature contributions (DFCs). For each
-    instance, DFCs indicate the aggregate contribution of each feature. See
-    https://arxiv.org/abs/1312.1121 and
-    http://blog.datadive.net/interpreting-random-forests/ for more details.
-    Args:
-      input_fn: A function that provides input data for predicting as
-        minibatches. See [Premade Estimators](
-        https://tensorflow.org/guide/premade_estimators#create_input_functions)
-          for more information. The function should construct and return one of
-        the following:  * A `tf.data.Dataset` object: Outputs of `Dataset`
-          object must be a tuple `(features, labels)` with same constraints as
-        below. * A tuple `(features, labels)`: Where `features` is a `tf.Tensor`
-          or a dictionary of string feature name to `Tensor` and `labels` is a
-          `Tensor` or a dictionary of string label name to `Tensor`. Both
-          `features` and `labels` are consumed by `model_fn`. They should
-          satisfy the expectation of `model_fn` from inputs.
-      predict_keys: list of `str`, name of the keys to predict. It is used if
-        the `tf.estimator.EstimatorSpec.predictions` is a `dict`. If
-        `predict_keys` is used then rest of the predictions will be filtered
-        from the dictionary, with the exception of 'bias' and 'dfc', which will
-        always be in the dictionary. If `None`, returns all keys in prediction
-        dict, as well as two new keys 'dfc' and 'bias'.
-      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
-        callbacks inside the prediction call.
-      checkpoint_path: Path of a specific checkpoint to predict. If `None`, the
-        latest checkpoint in `model_dir` is used.  If there are no checkpoints
-        in `model_dir`, prediction is run with newly initialized `Variables`
-        instead of ones restored from checkpoint.
-
-    Yields:
-      Evaluated values of `predictions` tensors. The `predictions` tensors will
-      contain at least two keys 'dfc' and 'bias' for model explanations. The
-      `dfc` value corresponds to the contribution of each feature to the overall
-      prediction for this instance (positive indicating that the feature makes
-      it more likely to select class 1 and negative less likely). The 'bias'
-      value will be the same across all the instances, corresponding to the
-      probability (classification) or prediction (regression) of the training
-      data distribution.
-
-    Raises:
-      ValueError: when wrong arguments are given or unsupported functionalities
-       are requested.
-    """
-    if not self._center_bias:
-      raise ValueError('center_bias must be enabled during estimator '
-                       'instantiation when using '
-                       'experimental_predict_with_explanations.')
-    # pylint: disable=protected-access
-    if not self._is_classification:
-      identity_inverse_link_fn = self._head._inverse_link_fn in (None,
-                                                                 tf_identity)
-      # pylint:enable=protected-access
-      if not identity_inverse_link_fn:
-        raise ValueError(
-            'For now only identity inverse_link_fn in regression_head is '
-            'supported for experimental_predict_with_explanations.')
-
-    # pylint:disable=unused-argument
-    def new_model_fn(features, labels, mode):
-      return _bt_explanations_fn(features, self._head,
-                                 self._sorted_feature_columns)
-
-    # pylint:enable=unused-argument
-    est = estimator.Estimator(
-        model_fn=new_model_fn,
-        model_dir=self.model_dir,
-        config=self.config,
-        warm_start_from=self._warm_start_settings)
-    # Make sure bias and dfc will be in prediction dict.
-    user_supplied_predict_keys = predict_keys is not None
-    if user_supplied_predict_keys:
-      predict_keys = set(predict_keys)
-      predict_keys.add(boosted_trees_utils._DEBUG_PROTO_KEY)
-    predictions = est.predict(
-        input_fn,
-        predict_keys=predict_keys,
-        hooks=hooks,
-        checkpoint_path=checkpoint_path,
-        yield_single_examples=True)
-    for pred in predictions:
-      bias, dfcs = boosted_trees_utils._parse_explanations_from_prediction(
-          pred[boosted_trees_utils._DEBUG_PROTO_KEY], self._n_features,
-          self._is_classification)
-      pred['bias'] = bias
-      pred['dfc'] = dfcs
-      # Don't need to expose serialized proto to end user.
-      del pred[boosted_trees_utils._DEBUG_PROTO_KEY]
-      yield pred
-
-
-# pylint: disable=protected-access
-@estimator_export('estimator.BoostedTreesClassifier')
-class BoostedTreesClassifier(_BoostedTreesBase):
-  """A Classifier for Tensorflow Boosted Trees models.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               feature_columns,
-               n_batches_per_layer,
-               model_dir=None,
-               n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
-               weight_column=None,
-               label_vocabulary=None,
-               n_trees=100,
-               max_depth=6,
-               learning_rate=0.1,
-               l1_regularization=0.,
-               l2_regularization=0.,
-               tree_complexity=0.,
-               min_node_weight=0.,
-               config=None,
-               center_bias=False,
-               pruning_mode='none'):
-    """Initializes a `BoostedTreesClassifier` instance.
-
-    Example:
-
-    ```python
-    bucketized_feature_1 = bucketized_column(
-      numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
-    bucketized_feature_2 = bucketized_column(
-      numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
-
-    # Need to see a large portion of the data before we can build a layer, for
-    # example half of data n_batches_per_layer = 0.5 * NUM_EXAMPLES / BATCH_SIZE
-    classifier = estimator.BoostedTreesClassifier(
-        feature_columns=[bucketized_feature_1, bucketized_feature_2],
-        n_batches_per_layer=n_batches_per_layer,
-        n_trees=100,
-        ... <some other params>
-    )
-
-    def input_fn_train():
-      ...
-      return dataset
-
-    classifier.train(input_fn=input_fn_train)
-
-    def input_fn_eval():
-      ...
-      return dataset
-
-    metrics = classifier.evaluate(input_fn=input_fn_eval)
-    ```
-
-    Args:
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `FeatureColumn`.
-      n_batches_per_layer: the number of batches to collect statistics per
-        layer. The total number of batches is total number of data divided by
-        batch size.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
-        to continue training a previously saved model.
-      n_classes: number of label classes. Default is binary classification.
-        Multiclass support is not yet implemented.
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to downweight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      label_vocabulary: A list of strings represents possible label values. If
-        given, labels must be string type and have any value in
-        `label_vocabulary`. If it is not given, that means labels are
-        already encoded as integer or float within [0, 1] for `n_classes=2` and
-        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
-        Also there will be errors if vocabulary is not provided and labels are
-        string.
-      n_trees: number trees to be created.
-      max_depth: maximum depth of the tree to grow.
-      learning_rate: shrinkage parameter to be used when a tree added to the
-        model.
-      l1_regularization: regularization multiplier applied to the absolute
-        weights of the tree leafs.
-      l2_regularization: regularization multiplier applied to the square weights
-        of the tree leafs.
-      tree_complexity: regularization factor to penalize trees with more leaves.
-      min_node_weight: min_node_weight: minimum hessian a node must have for a
-        split to be considered. The value will be compared with
-        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
-      config: `RunConfig` object to configure the runtime settings.
-      center_bias: Whether bias centering needs to occur. Bias centering refers
-        to the first node in the very first tree returning the prediction that
-        is aligned with the original labels distribution. For example, for
-        regression problems, the first node will return the mean of the labels.
-        For binary classification problems, it will return a logit for a prior
-        probability of label 1.
-      pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
-        pruning (do not split a node if not enough gain is observed) and post
-        pruning (build the tree up to a max depth and then prune branches with
-        negative gain). For pre and post pruning, you MUST provide
-        tree_complexity >0.
-
-    Raises:
-      ValueError: when wrong arguments are given or unsupported functionalities
-         are requested.
-    """
-    # TODO(nponomareva): Support multi-class cases.
-    if n_classes == _HOLD_FOR_MULTI_CLASS_SUPPORT:
-      n_classes = 2
-    head, closed_form = _create_classification_head_and_closed_form(
-        n_classes, weight_column, label_vocabulary=label_vocabulary)
-    # HParams for the model.
-    tree_hparams = _TreeHParams(
-        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity, min_node_weight, center_bias, pruning_mode)
-
-    def _model_fn(features, labels, mode, config):
-      return _bt_model_fn(
-          features,
-          labels,
-          mode,
-          head,
-          feature_columns,
-          tree_hparams,
-          n_batches_per_layer,
-          config,
-          closed_form_grad_and_hess_fn=closed_form)
-
-    super(BoostedTreesClassifier, self).__init__(
-        model_fn=_model_fn,
-        model_dir=model_dir,
-        config=config,
-        feature_columns=feature_columns,
-        head=head,
-        center_bias=center_bias,
-        is_classification=True)
-
-
-@estimator_export('estimator.BoostedTreesRegressor')
-class BoostedTreesRegressor(_BoostedTreesBase):
-  """A Regressor for Tensorflow Boosted Trees models.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               feature_columns,
-               n_batches_per_layer,
-               model_dir=None,
-               label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
-               weight_column=None,
-               n_trees=100,
-               max_depth=6,
-               learning_rate=0.1,
-               l1_regularization=0.,
-               l2_regularization=0.,
-               tree_complexity=0.,
-               min_node_weight=0.,
-               config=None,
-               center_bias=False,
-               pruning_mode='none'):
-    """Initializes a `BoostedTreesRegressor` instance.
-
-    Example:
-
-    ```python
-    bucketized_feature_1 = bucketized_column(
-      numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
-    bucketized_feature_2 = bucketized_column(
-      numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
-
-    # Need to see a large portion of the data before we can build a layer, for
-    # example half of data n_batches_per_layer = 0.5 * NUM_EXAMPLES / BATCH_SIZE
-    regressor = estimator.BoostedTreesRegressor(
-        feature_columns=[bucketized_feature_1, bucketized_feature_2],
-        n_batches_per_layer=n_batches_per_layer,
-        n_trees=100,
-        ... <some other params>
-    )
-
-    def input_fn_train():
-      ...
-      return dataset
-
-    regressor.train(input_fn=input_fn_train)
-
-    def input_fn_eval():
-      ...
-      return dataset
-
-    metrics = regressor.evaluate(input_fn=input_fn_eval)
-    ```
-
-    Args:
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `FeatureColumn`.
-      n_batches_per_layer: the number of batches to collect statistics per
-        layer. The total number of batches is total number of data divided by
-        batch size.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
-        to continue training a previously saved model.
-      label_dimension: Number of regression targets per example.
-        Multi-dimensional support is not yet implemented.
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to downweight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      n_trees: number trees to be created.
-      max_depth: maximum depth of the tree to grow.
-      learning_rate: shrinkage parameter to be used when a tree added to the
-        model.
-      l1_regularization: regularization multiplier applied to the absolute
-        weights of the tree leafs.
-      l2_regularization: regularization multiplier applied to the square weights
-        of the tree leafs.
-      tree_complexity: regularization factor to penalize trees with more leaves.
-      min_node_weight: min_node_weight: minimum hessian a node must have for a
-        split to be considered. The value will be compared with
-        sum(leaf_hessian)/(batch_size * n_batches_per_layer).
-      config: `RunConfig` object to configure the runtime settings.
-      center_bias: Whether bias centering needs to occur. Bias centering refers
-        to the first node in the very first tree returning the prediction that
-        is aligned with the original labels distribution. For example, for
-        regression problems, the first node will return the mean of the labels.
-        For binary classification problems, it will return a logit for a prior
-        probability of label 1.
-      pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
-        pruning (do not split a node if not enough gain is observed) and post
-        pruning (build the tree up to a max depth and then prune branches with
-        negative gain). For pre and post pruning, you MUST provide
-        tree_complexity >0.
-
-    Raises:
-      ValueError: when wrong arguments are given or unsupported functionalities
-         are requested.
-    """
-    # TODO(nponomareva): Extend it to multi-dimension cases.
-    if label_dimension == _HOLD_FOR_MULTI_DIM_SUPPORT:
-      label_dimension = 1
-    head = _create_regression_head(label_dimension, weight_column)
-
-    # HParams for the model.
-    tree_hparams = _TreeHParams(
-        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity, min_node_weight, center_bias, pruning_mode)
-
-    def _model_fn(features, labels, mode, config):
-      return _bt_model_fn(features, labels, mode, head, feature_columns,
-                          tree_hparams, n_batches_per_layer, config)
-
-    super(BoostedTreesRegressor, self).__init__(
-        model_fn=_model_fn,
-        model_dir=model_dir,
-        config=config,
-        feature_columns=feature_columns,
-        head=head,
-        center_bias=center_bias,
-        is_classification=False)
+from tensorflow_estimator.python.estimator.canned import boosted_trees
 
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+boosted_trees.__all__ = [
+    s for s in dir(boosted_trees) if not s.startswith('__')
+]
 
-# pylint: enable=protected-access
+from tensorflow_estimator.python.estimator.canned.boosted_trees import *
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
deleted file mode 100644
index 23687a738bd17aa2c5fa8855d7f003331ddc1757..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ /dev/null
@@ -1,2549 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests boosted_trees estimators and model_fn."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-from google.protobuf import text_format
-import numpy as np
-
-from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator import run_config
-from tensorflow.python.estimator.canned import boosted_trees
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import gen_boosted_trees_ops
-from tensorflow.python.ops import boosted_trees_ops
-from tensorflow.python.ops import resources
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training import session_run_hook
-
-NUM_FEATURES = 3
-
-BUCKET_BOUNDARIES = [-2., .5, 12.]  # Boundaries for all the features.
-INPUT_FEATURES = np.array(
-    [
-        [12.5, 1.0, -2.001, -2.0001, -1.999],  # feature_0 quantized:[3,2,0,0,1]
-        [2.0, -3.0, 0.5, 0.0, 0.4995],         # feature_1 quantized:[2,0,2,1,1]
-        [3.0, 20.0, 50.0, -100.0, 102.75],     # feature_2 quantized:[2,3,3,0,3]
-    ],
-    dtype=np.float32)
-
-CLASSIFICATION_LABELS = [[0.], [1.], [1.], [0.], [0.]]
-REGRESSION_LABELS = [[1.5], [0.3], [0.2], [2.], [5.]]
-FEATURES_DICT = {'f_%d' % i: INPUT_FEATURES[i] for i in range(NUM_FEATURES)}
-
-# EXAMPLE_ID is not exposed to Estimator yet, but supported at model_fn level.
-EXAMPLE_IDS = np.array([0, 1, 2, 3, 4], dtype=np.int64)
-EXAMPLE_ID_COLUMN = '__example_id__'
-
-
-def _make_train_input_fn(is_classification):
-  """Makes train input_fn for classification/regression."""
-
-  def _input_fn():
-    features_dict = dict(FEATURES_DICT)  # copies the dict to add an entry.
-    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
-    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
-    return features_dict, labels
-
-  return _input_fn
-
-
-def _make_train_input_fn_dataset(is_classification, batch=None, repeat=None):
-  """Makes input_fn using Dataset."""
-
-  def _input_fn():
-    features_dict = dict(FEATURES_DICT)  # copies the dict to add an entry.
-    features_dict[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
-    labels = CLASSIFICATION_LABELS if is_classification else REGRESSION_LABELS
-    if batch:
-      ds = dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.from_tensor_slices(features_dict),
-           dataset_ops.Dataset.from_tensor_slices(labels))).batch(batch)
-    else:
-      ds = dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.from_tensors(features_dict),
-           dataset_ops.Dataset.from_tensors(labels)))
-    # repeat indefinitely by default, or stop at the given step.
-    ds = ds.repeat(repeat)
-    return ds
-
-  return _input_fn
-
-
-class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES)
-        for i in range(NUM_FEATURES)
-    }
-
-  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
-                         attempted_layers):
-    self._assert_checkpoint_and_return_model(model_dir, global_step,
-                                             finalized_trees, attempted_layers)
-
-  def _assert_checkpoint_and_return_model(self, model_dir, global_step,
-                                          finalized_trees, attempted_layers):
-    reader = checkpoint_utils.load_checkpoint(model_dir)
-    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
-    serialized = reader.get_tensor('boosted_trees:0_serialized')
-    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-    ensemble_proto.ParseFromString(serialized)
-
-    self.assertEqual(
-        finalized_trees,
-        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
-    self.assertEqual(attempted_layers,
-                     ensemble_proto.growing_metadata.num_layers_attempted)
-
-    return ensemble_proto
-
-  def testFirstCheckpointWorksFine(self):
-    """Tests that eval/pred doesn't crash with the very first checkpoint.
-
-    The step-0 checkpoint will have only an empty ensemble, and a separate eval
-    job might read from it and crash.
-    This test ensures that prediction/evaluation works fine with it.
-    """
-    input_fn = _make_train_input_fn(is_classification=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-
-    class BailOutWithoutTraining(session_run_hook.SessionRunHook):
-
-      def before_run(self, run_context):
-        raise StopIteration('to bail out.')
-
-    est.train(input_fn, steps=100,  # must stop at 0 anyway.
-              hooks=[BailOutWithoutTraining()])
-    self._assert_checkpoint(
-        est.model_dir, global_step=0, finalized_trees=0, attempted_layers=0)
-    # Empty ensemble returns 0 logits, so that all output labels are 0.
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 0.6)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose([[0], [0], [0], [0], [0]],
-                        [pred['class_ids'] for pred in predictions])
-
-  def testTrainAndEvaluateBinaryClassifier(self):
-    input_fn = _make_train_input_fn(is_classification=True)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-
-    # It will stop after 5 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-
-  def testTrainTwiceAndEvaluateBinaryClassifier(self):
-    input_fn = _make_train_input_fn(is_classification=True)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=5,
-        max_depth=10)
-
-    num_steps = 2
-    # Train for a few steps, and validate final checkpoint.
-    est.train(input_fn, steps=num_steps)
-    est.train(input_fn, steps=num_steps)
-
-    self._assert_checkpoint(
-        est.model_dir, global_step=num_steps * 2,
-        finalized_trees=0, attempted_layers=4)
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-
-  def testInferBinaryClassifier(self):
-    train_input_fn = _make_train_input_fn(is_classification=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-
-    # It will stop after 5 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose([[0], [1], [1], [0], [0]],
-                        [pred['class_ids'] for pred in predictions])
-
-  def testTrainClassifierWithRankOneLabel(self):
-    """Tests that label with rank-1 tensor is also accepted by classifier."""
-    def _input_fn_with_rank_one_label():
-      return FEATURES_DICT, [0., 1., 1., 0., 0.]
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-
-    # It will stop after 5 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(_input_fn_with_rank_one_label, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    eval_res = est.evaluate(input_fn=_input_fn_with_rank_one_label, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-
-  def testTrainClassifierWithLabelVocabulary(self):
-    apple, banana = 'apple', 'banana'
-    def _input_fn_with_label_vocab():
-      return FEATURES_DICT, [[apple], [banana], [banana], [apple], [apple]]
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5,
-        label_vocabulary=[apple, banana])
-    est.train(input_fn=_input_fn_with_label_vocab, steps=5)
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    eval_res = est.evaluate(input_fn=_input_fn_with_label_vocab, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose([[0], [1], [1], [0], [0]],
-                        [pred['class_ids'] for pred in predictions])
-
-  def testTrainClassifierWithIntegerLabel(self):
-    def _input_fn_with_integer_label():
-      return (FEATURES_DICT,
-              constant_op.constant([[0], [1], [1], [0], [0]], dtypes.int32))
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-    est.train(input_fn=_input_fn_with_integer_label, steps=5)
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    eval_res = est.evaluate(input_fn=_input_fn_with_integer_label, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose([[0], [1], [1], [0], [0]],
-                        [pred['class_ids'] for pred in predictions])
-
-  def testTrainClassifierWithDataset(self):
-    train_input_fn = _make_train_input_fn_dataset(is_classification=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['accuracy'], 1.0)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose([[0], [1], [1], [0], [0]],
-                        [pred['class_ids'] for pred in predictions])
-
-  def testTrainAndEvaluateRegressor(self):
-    input_fn = _make_train_input_fn(is_classification=False)
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=2,
-        max_depth=5)
-
-    # It will stop after 10 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(input_fn, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 1.008551)
-
-  def testInferRegressor(self):
-    train_input_fn = _make_train_input_fn(is_classification=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-
-    # It will stop after 5 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(train_input_fn, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose(
-        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
-        [pred['predictions'] for pred in predictions])
-
-  def testTrainRegressorWithRankOneLabel(self):
-    """Tests that label with rank-1 tensor is also accepted by regressor."""
-    def _input_fn_with_rank_one_label():
-      return FEATURES_DICT, [1.5, 0.3, 0.2, 2., 5.]
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-
-    # It will stop after 5 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(_input_fn_with_rank_one_label, steps=num_steps)
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    eval_res = est.evaluate(input_fn=_input_fn_with_rank_one_label, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.478283)
-
-  def testTrainRegressorWithDataset(self):
-    train_input_fn = _make_train_input_fn_dataset(is_classification=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.478283)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose(
-        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
-        [pred['predictions'] for pred in predictions])
-
-  def testTrainRegressorWithDatasetBatch(self):
-    # The batch_size as the entire data size should yield the same result as
-    # dataset without batching.
-    train_input_fn = _make_train_input_fn_dataset(
-        is_classification=False, batch=5)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.478283)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose(
-        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
-        [pred['predictions'] for pred in predictions])
-
-  def testTrainRegressorWithDatasetLargerBatch(self):
-    # The batch_size as the multiple of the entire data size should still yield
-    # the same result.
-    train_input_fn = _make_train_input_fn_dataset(
-        is_classification=False, batch=15)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-    est.train(train_input_fn, steps=100)  # will stop after 5 steps anyway.
-    self._assert_checkpoint(
-        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 2.478283)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose(
-        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
-        [pred['predictions'] for pred in predictions])
-
-  def testTrainRegressorWithDatasetSmallerBatch(self):
-    # Even when using small batches, if (n_batches_per_layer * batch_size) makes
-    # the same entire data size, the result should be the same.
-    train_input_fn = _make_train_input_fn_dataset(
-        is_classification=False, batch=1)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=5,
-        n_trees=1,
-        max_depth=5)
-    # Train stops after (n_batches_per_layer * n_trees * max_depth) steps.
-    est.train(train_input_fn, steps=100)
-    self._assert_checkpoint(
-        est.model_dir, global_step=25, finalized_trees=1, attempted_layers=5)
-    # 5 batches = one epoch.
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=5)
-    self.assertAllClose(eval_res['average_loss'], 2.478283)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose(
-        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
-        [pred['predictions'] for pred in predictions])
-
-  def testTrainRegressorWithDatasetWhenInputIsOverEarlier(self):
-    train_input_fn = _make_train_input_fn_dataset(
-        is_classification=False, repeat=3)  # to stop input after 3 steps.
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-    # Note that training will stop when input exhausts.
-    # This might not be a typical pattern, but dataset.repeat(3) causes
-    # the input stream to cease after 3 steps.
-    est.train(train_input_fn, steps=100)
-    self._assert_checkpoint(
-        est.model_dir, global_step=3, finalized_trees=0, attempted_layers=3)
-    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
-    self.assertAllClose(eval_res['average_loss'], 3.777295)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-    self.assertAllClose(
-        [[0.353850], [0.254100], [0.106850], [0.712100], [1.012100]],
-        [pred['predictions'] for pred in predictions])
-
-  def testTrainEvaluateAndPredictWithIndicatorColumn(self):
-    categorical = feature_column.categorical_column_with_vocabulary_list(
-        key='categorical', vocabulary_list=('bad', 'good', 'ok'))
-    feature_indicator = feature_column.indicator_column(categorical)
-    bucketized_col = feature_column.bucketized_column(
-        feature_column.numeric_column(
-            'an_uninformative_feature', dtype=dtypes.float32),
-        BUCKET_BOUNDARIES)
-
-    labels = np.array([[0.], [5.7], [5.7], [0.], [0.]], dtype=np.float32)
-    # Our categorical feature defines the labels perfectly
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'an_uninformative_feature': np.array([1, 1, 1, 1, 1]),
-            'categorical': np.array(['bad', 'good', 'good', 'ok', 'bad']),
-        },
-        y=labels,
-        batch_size=5,
-        shuffle=False)
-
-    # Train depth 1 tree.
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=[bucketized_col, feature_indicator],
-        n_batches_per_layer=1,
-        n_trees=1,
-        learning_rate=1.0,
-        max_depth=1)
-
-    num_steps = 1
-    est.train(input_fn, steps=num_steps)
-    ensemble = self._assert_checkpoint_and_return_model(
-        est.model_dir, global_step=1, finalized_trees=1, attempted_layers=1)
-
-    # We learnt perfectly.
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['loss'], 0)
-
-    predictions = list(est.predict(input_fn))
-    self.assertAllClose(
-        labels,
-        [pred['predictions'] for pred in predictions])
-
-    self.assertEqual(3, len(ensemble.trees[0].nodes))
-
-    # Check that the split happened on 'good' value, which will be encoded as
-    # feature with index 2 (0-numeric, 1 - 'bad')
-    self.assertEqual(2, ensemble.trees[0].nodes[0].bucketized_split.feature_id)
-    self.assertEqual(0, ensemble.trees[0].nodes[0].bucketized_split.threshold)
-
-  def testTrainEvaluateAndPredictWithOnlyIndicatorColumn(self):
-    categorical = feature_column.categorical_column_with_vocabulary_list(
-        key='categorical', vocabulary_list=('bad', 'good', 'ok'))
-    feature_indicator = feature_column.indicator_column(categorical)
-
-    labels = np.array([[0.], [5.7], [5.7], [0.], [0.]], dtype=np.float32)
-    # Our categorical feature defines the labels perfectly
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'categorical': np.array(['bad', 'good', 'good', 'ok', 'bad']),
-        },
-        y=labels,
-        batch_size=5,
-        shuffle=False)
-
-    # Train depth 1 tree.
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=[feature_indicator],
-        n_batches_per_layer=1,
-        n_trees=1,
-        learning_rate=1.0,
-        max_depth=1)
-
-    num_steps = 1
-    est.train(input_fn, steps=num_steps)
-    ensemble = self._assert_checkpoint_and_return_model(
-        est.model_dir, global_step=1, finalized_trees=1, attempted_layers=1)
-
-    # We learnt perfectly.
-    eval_res = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertAllClose(eval_res['loss'], 0)
-
-    predictions = list(est.predict(input_fn))
-    self.assertAllClose(
-        labels,
-        [pred['predictions'] for pred in predictions])
-
-    self.assertEqual(3, len(ensemble.trees[0].nodes))
-
-    # Check that the split happened on 'good' value, which will be encoded as
-    # feature with index 1 (0 - 'bad', 2 - 'ok')
-    self.assertEqual(1, ensemble.trees[0].nodes[0].bucketized_split.feature_id)
-    self.assertEqual(0, ensemble.trees[0].nodes[0].bucketized_split.threshold)
-
-  def testFeatureImportancesWithTrainedEnsemble(self):
-    input_fn = _make_train_input_fn(is_classification=True)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=2,
-        max_depth=5)
-
-    # It will stop after 5 steps because of the max depth and num trees.
-    num_steps = 100
-    # Train for a few steps, and validate final checkpoint.
-    est.train(input_fn, steps=num_steps)
-
-    feature_names_expected = ['f_0_bucketized',
-                              'f_2_bucketized',
-                              'f_1_bucketized']
-
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=False)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    self.assertAllClose([0.833933, 0.606342, 0.0], importances)
-
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=True)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    self.assertAllClose([0.579010, 0.420990, 0.0], importances)
-
-  def testFeatureImportancesOnEmptyEnsemble(self):
-    input_fn = _make_train_input_fn(is_classification=True)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-
-    class BailOutWithoutTraining(session_run_hook.SessionRunHook):
-
-      def before_run(self, run_context):
-        raise StopIteration('to bail out.')
-
-    # The step-0 checkpoint will have only an empty ensemble.
-    est.train(input_fn,
-              steps=100,  # must stop at 0 anyway.
-              hooks=[BailOutWithoutTraining()])
-
-    with self.assertRaisesRegexp(ValueError, 'empty serialized string'):
-      est.experimental_feature_importances(normalize=False)
-
-    with self.assertRaisesRegexp(ValueError, 'empty serialized string'):
-      est.experimental_feature_importances(normalize=True)
-
-  def _create_fake_checkpoint_with_tree_ensemble_proto(self,
-                                                       est,
-                                                       tree_ensemble_text):
-    with ops.Graph().as_default():
-      with ops.name_scope('boosted_trees') as name:
-        tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
-        tree_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-        text_format.Merge(tree_ensemble_text, tree_ensemble_proto)
-        stamp_token, _ = tree_ensemble.serialize()
-        restore_op = tree_ensemble.deserialize(
-            stamp_token, tree_ensemble_proto.SerializeToString())
-
-        with session.Session() as sess:
-          resources.initialize_resources(resources.shared_resources()).run()
-          restore_op.run()
-          saver = saver_lib.Saver()
-          save_path = os.path.join(est.model_dir, 'model.ckpt')
-          saver.save(sess, save_path)
-
-  def testFeatureImportancesOnNonEmptyEnsemble(self):
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=2,
-        max_depth=5)
-
-    tree_ensemble_text = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 2.0
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 3.0
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              left_id: 5
-              right_id: 6
-            }
-            metadata {
-              gain: 2.0
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.0
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              left_id: 7
-              right_id: 8
-            }
-            metadata {
-              gain: 1.0
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 3.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.34
-            }
-          }
-        }
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 1.0
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 3.34
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 1.0
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 3.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.34
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        """
-    self._create_fake_checkpoint_with_tree_ensemble_proto(
-        est, tree_ensemble_text)
-
-    feature_names_expected = ['f_0_bucketized',
-                              'f_2_bucketized',
-                              'f_1_bucketized']
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=False)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    # Gain sum for each features:
-    # = 1.0 * [3 + 1, 2, 2] + 1.0 * [1, 1, 0]
-    self.assertAllClose([5.0, 3.0, 2.0], importances)
-
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=True)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    self.assertAllClose([0.5, 0.3, 0.2], importances)
-
-  def testFeatureImportancesWithTreeWeights(self):
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=3,
-        max_depth=5)
-
-    tree_ensemble_text = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 12.5
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 5.0
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.0
-            }
-          }
-        }
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 5.0
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.34
-            }
-          }
-        }
-        trees {
-          nodes {
-            leaf {
-              scalar: 0.0
-            }
-          }
-        }
-        tree_weights: 0.4
-        tree_weights: 0.6
-        tree_weights: 1.0
-        """
-    self._create_fake_checkpoint_with_tree_ensemble_proto(
-        est, tree_ensemble_text)
-
-    feature_names_expected = ['f_0_bucketized',
-                              'f_2_bucketized',
-                              'f_1_bucketized']
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=False)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    # Gain sum for each features:
-    # = 0.4 * [12.5, 0, 5] + 0.6 * [0, 5, 0] + 1.0 * [0, 0, 0]
-    self.assertAllClose([5.0, 3.0, 2.0], importances)
-
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=True)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    self.assertAllClose([0.5, 0.3, 0.2], importances)
-
-  def testFeatureImportancesWithAllEmptyTree(self):
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=2,
-        max_depth=5)
-
-    tree_ensemble_text = """
-        trees {
-          nodes {
-            leaf {
-              scalar: 0.0
-            }
-          }
-        }
-        trees {
-          nodes {
-            leaf {
-              scalar: 0.0
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        """
-    self._create_fake_checkpoint_with_tree_ensemble_proto(
-        est, tree_ensemble_text)
-
-    # Reverse order because feature importances are sorted by np.argsort(f)[::-1]
-    feature_names_expected = ['f_2_bucketized',
-                              'f_1_bucketized',
-                              'f_0_bucketized']
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=False)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    self.assertAllClose([0.0, 0.0, 0.0], importances)
-
-    with self.assertRaisesRegexp(AssertionError,
-                                 'all empty or contain only a root node'):
-      est.experimental_feature_importances(normalize=True)
-
-  def testNegativeFeatureImportances(self):
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5)
-
-    # In order to generate a negative feature importances,
-    # We assign an invalid value -1 to tree_weights here.
-    tree_ensemble_text = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 5.0
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.34
-            }
-          }
-        }
-        tree_weights: -1.0
-        """
-    self._create_fake_checkpoint_with_tree_ensemble_proto(
-        est, tree_ensemble_text)
-
-    # Github #21509 (nataliaponomareva):
-    # The gains stored in the splits can be negative
-    # if people are using complexity regularization.
-    feature_names_expected = ['f_2_bucketized',
-                              'f_0_bucketized',
-                              'f_1_bucketized']
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=False)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    self.assertAllClose([0.0, 0.0, -5.0], importances)
-
-    with self.assertRaisesRegexp(AssertionError, 'non-negative'):
-      est.experimental_feature_importances(normalize=True)
-
-  def testFeatureImportancesNamesForCategoricalColumn(self):
-    categorical = feature_column.categorical_column_with_vocabulary_list(
-        key='categorical', vocabulary_list=('bad', 'good', 'ok'))
-    feature_indicator = feature_column.indicator_column(categorical)
-    bucketized_col = feature_column.bucketized_column(
-        feature_column.numeric_column(
-            'continuous', dtype=dtypes.float32),
-        BUCKET_BOUNDARIES)
-    bucketized_indicator = feature_column.indicator_column(bucketized_col)
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=[feature_indicator,
-                         bucketized_col,
-                         bucketized_indicator],
-        n_batches_per_layer=1,
-        n_trees=2,
-        learning_rate=1.0,
-        max_depth=1)
-
-    tree_ensemble_text = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 5.0
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 4
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 2.0
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.0
-            }
-          }
-        }
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 1.0
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 5
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 2.0
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -2.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 3.34
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 4.34
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        """
-    self._create_fake_checkpoint_with_tree_ensemble_proto(
-        est, tree_ensemble_text)
-
-    feature_names_expected = ['categorical_indicator:ok',
-                              'continuous_bucketized_indicator:(-2.0, 0.5)',
-                              'continuous_bucketized_indicator:(-inf, -2.0)',
-                              'categorical_indicator:bad',
-                              # Reverse order because feature importances
-                              # are sorted by np.argsort(f)[::-1]
-                              'continuous_bucketized_indicator:(12.0, inf)',
-                              'continuous_bucketized_indicator:(0.5, 12.0)',
-                              'continuous_bucketized',
-                              'categorical_indicator:good']
-
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=False)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    # Gain sum for each features:
-    # = 1.0 * [5, 0, 2, 0, 0, 0, 0, 0] + 1.0 * [0, 2, 0, 1, 0, 0, 0, 0]
-    self.assertAllClose([5.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0], importances)
-
-    feature_names, importances = est.experimental_feature_importances(
-        normalize=True)
-    self.assertAllEqual(feature_names_expected, feature_names)
-    self.assertAllClose([0.5, 0.2, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0], importances)
-
-  def testFeatureImportancesNamesForUnsupportedColumn(self):
-    numeric_col = feature_column.numeric_column(
-        'continuous', dtype=dtypes.float32)
-
-    with self.assertRaisesRegexp(ValueError,
-                                 'only bucketized_column and indicator_column'):
-      _ = boosted_trees.BoostedTreesRegressor(
-          feature_columns=[numeric_col],
-          n_batches_per_layer=1,
-          n_trees=2,
-          learning_rate=1.0,
-          max_depth=1)
-
-  def testTreeComplexityIsSetCorrectly(self):
-    input_fn = _make_train_input_fn(is_classification=True)
-
-    num_steps = 10
-    # Tree complexity is set but no pruning.
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5,
-        tree_complexity=1e-3)
-    with self.assertRaisesRegexp(ValueError, 'Tree complexity have no effect'):
-      est.train(input_fn, steps=num_steps)
-
-    # Pruning but no tree complexity.
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5,
-        pruning_mode='pre')
-    with self.assertRaisesRegexp(ValueError,
-                                 'tree_complexity must be positive'):
-      est.train(input_fn, steps=num_steps)
-
-    # All is good.
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5,
-        pruning_mode='pre',
-        tree_complexity=1e-3)
-    est.train(input_fn, steps=num_steps)
-
-
-class BoostedTreesDebugOutputsTest(test_util.TensorFlowTestCase):
-  """Test debug/model explainability outputs for individual predictions.
-
-  Includes directional feature contributions (DFC).
-  """
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
-    }
-
-  def testBinaryClassifierThatDFCIsInPredictions(self):
-    train_input_fn = _make_train_input_fn(is_classification=True)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=3, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesClassifier(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5,
-        center_bias=True)
-
-    num_steps = 100
-    # Train for a few steps. Validate debug outputs in prediction dicts.
-    est.train(train_input_fn, steps=num_steps)
-    debug_predictions = est.experimental_predict_with_explanations(
-        predict_input_fn)
-    biases, dfcs = zip(*[(pred['bias'], pred['dfc'])
-                         for pred in debug_predictions])
-    self.assertAllClose([0.4] * 5, biases)
-    self.assertAllClose(({
-        0: -0.12108613453574479,
-        1: 0.0,
-        2: -0.039254929814481143
-    }, {
-        0: 0.19650601422250574,
-        1: 0.0,
-        2: 0.02693827052766018
-    }, {
-        0: 0.16057487356133376,
-        1: 0.0,
-        2: 0.02693827052766018
-    }, {
-        0: -0.12108613453574479,
-        1: 0.0,
-        2: -0.039254929814481143
-    }, {
-        0: -0.10832468554550384,
-        1: 0.0,
-        2: 0.02693827052766018
-    }), dfcs)
-
-    # Assert sum(dfcs) + bias == probabilities.
-    expected_probabilities = [
-        0.23965894, 0.62344426, 0.58751315, 0.23965894, 0.31861359
-    ]
-    probabilities = [
-        sum(dfc.values()) + bias for (dfc, bias) in zip(dfcs, biases)
-    ]
-    self.assertAllClose(expected_probabilities, probabilities)
-
-    # When user doesn't include bias or dfc in predict_keys, make sure to still
-    # include dfc and bias.
-    debug_predictions = est.experimental_predict_with_explanations(
-        predict_input_fn, predict_keys=['probabilities'])
-    for prediction_dict in debug_predictions:
-      self.assertTrue('bias' in prediction_dict)
-      self.assertTrue('dfc' in prediction_dict)
-      self.assertTrue('probabilities' in prediction_dict)
-      self.assertEqual(len(prediction_dict), 3)
-
-  def testRegressorThatDFCIsInPredictions(self):
-    train_input_fn = _make_train_input_fn(is_classification=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
-
-    est = boosted_trees.BoostedTreesRegressor(
-        feature_columns=self._feature_columns,
-        n_batches_per_layer=1,
-        n_trees=1,
-        max_depth=5,
-        center_bias=True)
-
-    num_steps = 100
-    # Train for a few steps. Validate debug outputs in prediction dicts.
-    est.train(train_input_fn, steps=num_steps)
-    debug_predictions = est.experimental_predict_with_explanations(
-        predict_input_fn)
-    biases, dfcs = zip(*[(pred['bias'], pred['dfc'])
-                         for pred in debug_predictions])
-    self.assertAllClose([1.8] * 5, biases)
-    self.assertAllClose(({
-        0: -0.070499420166015625,
-        1: -0.095000028610229492,
-        2: 0.0
-    }, {
-        0: -0.53763031959533691,
-        1: 0.063333392143249512,
-        2: 0.0
-    }, {
-        0: -0.51756942272186279,
-        1: -0.095000028610229492,
-        2: 0.0
-    }, {
-        0: 0.1563495397567749,
-        1: 0.063333392143249512,
-        2: 0.0
-    }, {
-        0: 0.96934974193572998,
-        1: 0.063333392143249512,
-        2: 0.0
-    }), dfcs)
-
-    # Assert sum(dfcs) + bias == predictions.
-    expected_predictions = [[1.6345005], [1.32570302], [1.1874305],
-                            [2.01968288], [2.83268309]]
-    predictions = [
-        [sum(dfc.values()) + bias] for (dfc, bias) in zip(dfcs, biases)
-    ]
-    self.assertAllClose(expected_predictions, predictions)
-
-    # Test when user doesn't include bias or dfc in predict_keys.
-    debug_predictions = est.experimental_predict_with_explanations(
-        predict_input_fn, predict_keys=['predictions'])
-    for prediction_dict in debug_predictions:
-      self.assertTrue('bias' in prediction_dict)
-      self.assertTrue('dfc' in prediction_dict)
-      self.assertTrue('predictions' in prediction_dict)
-      self.assertEqual(len(prediction_dict), 3)
-
-
-class ModelFnTests(test_util.TensorFlowTestCase):
-  """Tests bt_model_fn including unexposed internal functionalities."""
-
-  def setUp(self):
-    self._feature_columns = {
-        feature_column.bucketized_column(
-            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
-            BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
-    }
-
-  def _get_expected_ensembles_for_classification(self):
-    first_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              threshold: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.387675
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.181818
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.0625
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 1
-          is_finalized: false
-        }
-        growing_metadata {
-          num_trees_attempted: 1
-          num_layers_attempted: 1
-          last_layer_node_start: 1
-          last_layer_node_end: 3
-        }
-        """
-    second_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              threshold: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.387675
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 3
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 0.0
-              original_leaf {
-                scalar: -0.181818
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 0
-              left_id: 5
-              right_id: 6
-            }
-            metadata {
-              gain: 0.105518
-              original_leaf {
-                scalar: 0.0625
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.348397
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.181818
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.224091
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.056815
-            }
-          }
-        }
-        trees {
-          nodes {
-            leaf {
-              scalar: 0.0
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 2
-          is_finalized: true
-        }
-        tree_metadata {
-          num_layers_grown: 0
-          is_finalized: false
-        }
-        growing_metadata {
-          num_trees_attempted: 1
-          num_layers_attempted: 2
-          last_layer_node_start: 0
-          last_layer_node_end: 1
-        }
-        """
-    third_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              threshold: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.387675
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 3
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 0.0
-              original_leaf {
-                scalar: -0.181818
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 0
-              left_id: 5
-              right_id: 6
-            }
-            metadata {
-              gain: 0.105518
-              original_leaf {
-                scalar: 0.0625
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.348397
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.181818
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.224091
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.056815
-            }
-          }
-        }
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              threshold: 0
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.287131
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.162042
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.086986
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 2
-          is_finalized: true
-        }
-        tree_metadata {
-          num_layers_grown: 1
-          is_finalized: false
-        }
-        growing_metadata {
-          num_trees_attempted: 2
-          num_layers_attempted: 3
-          last_layer_node_start: 1
-          last_layer_node_end: 3
-        }
-        """
-    return (first_round, second_round, third_round)
-
-  def _get_expected_ensembles_for_classification_with_bias(self):
-    first_round = """
-        trees {
-          nodes {
-            leaf {
-              scalar: -0.405086
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_metadata {
-        }
-        """
-    second_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              threshold: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.407711
-              original_leaf {
-                scalar: -0.405086
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.556054
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.301233
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 1
-          is_finalized: false
-        }
-        growing_metadata {
-          num_trees_attempted: 1
-          num_layers_attempted: 1
-          last_layer_node_start: 1
-          last_layer_node_end: 3
-        }
-        """
-    third_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              threshold: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.407711
-              original_leaf {
-                scalar: -0.405086
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 3
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              original_leaf {
-                scalar: -0.556054
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 0
-              left_id: 5
-              right_id: 6
-            }
-            metadata {
-              gain: 0.09876
-              original_leaf {
-                scalar: -0.301233
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.698072
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.556054
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.106016
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.27349
-            }
-          }
-        }
-        trees {
-          nodes {
-            leaf {
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 2
-          is_finalized: true
-        }
-        tree_metadata {
-        }
-        growing_metadata {
-          num_trees_attempted: 1
-          num_layers_attempted: 2
-          last_layer_node_end: 1
-        }
-        """
-    forth_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              threshold: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.4077113
-              original_leaf {
-                scalar: -0.405086
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              threshold: 3
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              original_leaf {
-                scalar: -0.556054
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              threshold: 0
-              left_id: 5
-              right_id: 6
-            }
-            metadata {
-              gain: 0.09876
-              original_leaf {
-                scalar: -0.301233
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.698072
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.556054
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.106016
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.27349
-            }
-          }
-        }
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 2
-              threshold: 2
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.289927
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.134588
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.083838            
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 2
-          is_finalized: true
-        }
-        tree_metadata {
-          num_layers_grown: 1
-        }
-        growing_metadata {
-          num_trees_attempted: 2
-          num_layers_attempted: 3
-          last_layer_node_start: 1
-          last_layer_node_end: 3
-        }
-        """
-    return (first_round, second_round, third_round, forth_round)
-
-  def _get_expected_ensembles_for_regression(self):
-    first_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              threshold: 1
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 1.169714
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.241322
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.083951
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 1
-          is_finalized: false
-        }
-        growing_metadata {
-          num_trees_attempted: 1
-          num_layers_attempted: 1
-          last_layer_node_start: 1
-          last_layer_node_end: 3
-        }
-        """
-    second_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              threshold: 1
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 1.169714
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 1
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 2.673407
-              original_leaf {
-                scalar: 0.241322
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 0
-              left_id: 5
-              right_id: 6
-            }
-            metadata {
-              gain: 0.324102
-              original_leaf {
-                scalar: 0.083951
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.563167
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.247047
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.095273
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.222102
-            }
-          }
-        }
-        trees {
-          nodes {
-            leaf {
-              scalar: 0.0
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 2
-          is_finalized: true
-        }
-        tree_metadata {
-          num_layers_grown: 0
-          is_finalized: false
-        }
-        growing_metadata {
-          num_trees_attempted: 1
-          num_layers_attempted: 2
-          last_layer_node_start: 0
-          last_layer_node_end: 1
-        }
-        """
-    third_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              threshold: 1
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 1.169714
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 1
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 2.673407
-              original_leaf {
-                scalar: 0.241322
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 0
-              left_id: 5
-              right_id: 6
-            }
-            metadata {
-              gain: 0.324102
-              original_leaf {
-                scalar: 0.083951
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.563167
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.247047
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.095273
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.222102
-            }
-          }
-        }
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              threshold: 0
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.981026
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.005166
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.180281
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 2
-          is_finalized: true
-        }
-        tree_metadata {
-          num_layers_grown: 1
-          is_finalized: false
-        }
-        growing_metadata {
-          num_trees_attempted: 2
-          num_layers_attempted: 3
-          last_layer_node_start: 1
-          last_layer_node_end: 3
-        }
-        """
-    return (first_round, second_round, third_round)
-
-  def _get_expected_ensembles_for_regression_with_bias(self):
-    first_round = """
-        trees {
-          nodes {
-            leaf {
-              scalar: 1.799974
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_metadata {
-        }
-        """
-    second_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              threshold: 1
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 1.190442
-              original_leaf {
-                scalar: 1.799974
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.862786
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.706149
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 1
-          is_finalized: false
-        }
-        growing_metadata {
-          num_trees_attempted: 1
-          num_layers_attempted: 1
-          last_layer_node_start: 1
-          last_layer_node_end: 3
-        }
-        """
-    third_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              threshold: 1
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 1.190442
-              original_leaf {
-                scalar: 1.799974
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 1
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 2.683594
-              original_leaf {
-                scalar: 1.862786
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              feature_id: 0
-              threshold: 0
-              left_id: 5
-              right_id: 6
-            }
-            metadata {
-              gain: 0.322693
-              original_leaf {
-                scalar: 1.706149
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 2.024487
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.710319
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.559208
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.686037
-            }
-          }
-        }
-        trees {
-          nodes {
-            leaf {
-              scalar: 0.0
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 2
-          is_finalized: true
-        }
-        tree_metadata {
-          num_layers_grown: 0
-          is_finalized: false
-        }
-        growing_metadata {
-          num_trees_attempted: 1
-          num_layers_attempted: 2
-          last_layer_node_start: 0
-          last_layer_node_end: 1
-        }
-        """
-    forth_round = """
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              threshold: 1
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 1.190442
-              original_leaf {
-                scalar:  1.799974
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              threshold: 1
-              left_id: 3
-              right_id: 4
-            }
-            metadata {
-              gain: 2.683594
-              original_leaf {
-                scalar: 1.8627863
-              }
-            }
-          }
-          nodes {
-            bucketized_split {
-              left_id: 5
-              right_id: 6
-            }
-            metadata {
-              gain: 0.322693
-              original_leaf {
-                scalar: 1.706149
-              }
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 2.024487
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.710319
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.5592078
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 1.686037
-            }
-          }
-        }
-        trees {
-          nodes {
-            bucketized_split {
-              feature_id: 1
-              left_id: 1
-              right_id: 2
-            }
-            metadata {
-              gain: 0.972589
-            }
-          }
-          nodes {
-            leaf {
-              scalar: -0.137592
-            }
-          }
-          nodes {
-            leaf {
-              scalar: 0.034926
-            }
-          }
-        }
-        tree_weights: 1.0
-        tree_weights: 1.0
-        tree_metadata {
-          num_layers_grown: 2
-          is_finalized: true
-        }
-        tree_metadata {
-          num_layers_grown: 1
-        }
-        growing_metadata {
-          num_trees_attempted: 2
-          num_layers_attempted: 3
-          last_layer_node_start: 1
-          last_layer_node_end: 3
-        }
-        """
-    return (first_round, second_round, third_round, forth_round)
-
-  def _get_train_op_and_ensemble(self,
-                                 head,
-                                 config,
-                                 is_classification,
-                                 train_in_memory,
-                                 center_bias=False):
-    """Calls bt_model_fn() and returns the train_op and ensemble_serialzed."""
-    features, labels = _make_train_input_fn(is_classification)()
-
-    tree_hparams = boosted_trees._TreeHParams(  # pylint:disable=protected-access
-        n_trees=2,
-        max_depth=2,
-        learning_rate=0.1,
-        l1=0.,
-        l2=0.01,
-        tree_complexity=0.,
-        min_node_weight=0.,
-        center_bias=center_bias,
-        pruning_mode='none')
-
-    estimator_spec = boosted_trees._bt_model_fn(  # pylint:disable=protected-access
-        features=features,
-        labels=labels,
-        mode=model_fn.ModeKeys.TRAIN,
-        head=head,
-        feature_columns=self._feature_columns,
-        tree_hparams=tree_hparams,
-        example_id_column_name=EXAMPLE_ID_COLUMN,
-        n_batches_per_layer=1,
-        config=config,
-        train_in_memory=train_in_memory)
-    resources.initialize_resources(resources.shared_resources()).run()
-    variables.global_variables_initializer().run()
-    variables.local_variables_initializer().run()
-
-    # Gets the train_op and serialized proto of the ensemble.
-    shared_resources = resources.shared_resources()
-    self.assertEqual(1, len(shared_resources))
-    train_op = estimator_spec.train_op
-    with ops.control_dependencies([train_op]):
-      _, ensemble_serialized = (
-          gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
-              shared_resources[0].handle))
-    return train_op, ensemble_serialized
-
-  def testTrainClassifierInMemory(self):
-    ops.reset_default_graph()
-    expected_first, expected_second, expected_third = (
-        self._get_expected_ensembles_for_classification())
-    with self.cached_session() as sess:
-      # Train with train_in_memory mode.
-      with sess.graph.as_default():
-        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
-            boosted_trees._create_classification_head(n_classes=2),
-            run_config.RunConfig(),
-            is_classification=True,
-            train_in_memory=True)
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      # Validate the trained ensemble.
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_first, ensemble_proto)
-
-      # Run one more time and validate the trained ensemble.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_second, ensemble_proto)
-
-      # Third round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_third, ensemble_proto)
-
-  def testTrainClassifierWithCenterBiasInMemory(self):
-    ops.reset_default_graph()
-
-    # When bias centering is on, we expect the very first node to have the
-    expected_first, expected_second, expected_third, expected_forth = (
-        self._get_expected_ensembles_for_classification_with_bias())
-
-    with self.cached_session() as sess:
-      with sess.graph.as_default():
-        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
-            boosted_trees._create_classification_head(n_classes=2),
-            run_config.RunConfig(),
-            is_classification=True,
-            train_in_memory=True,
-            center_bias=True)
-
-      # 4 iterations to center bias.
-      for _ in range(4):
-        _, serialized = sess.run([train_op, ensemble_serialized])
-
-      # Validate the trained ensemble.
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_first, ensemble_proto)
-
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_second, ensemble_proto)
-
-      # Third round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_third, ensemble_proto)
-
-      # Forth round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-
-      self.assertProtoEquals(expected_forth, ensemble_proto)
-
-  def testTrainClassifierNonInMemory(self):
-    ops.reset_default_graph()
-    expected_first, expected_second, expected_third = (
-        self._get_expected_ensembles_for_classification())
-    with self.cached_session() as sess:
-      # Train without train_in_memory mode.
-      with sess.graph.as_default():
-        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
-            boosted_trees._create_classification_head(n_classes=2),
-            run_config.RunConfig(),
-            is_classification=True,
-            train_in_memory=False)
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      # Validate the trained ensemble.
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_first, ensemble_proto)
-
-      # Run one more time and validate the trained ensemble.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_second, ensemble_proto)
-
-      # Third round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_third, ensemble_proto)
-
-  def testTrainClassifierWithCenterBiasNonInMemory(self):
-    ops.reset_default_graph()
-
-    # When bias centering is on, we expect the very first node to have the
-    expected_first, expected_second, expected_third, expected_forth = (
-        self._get_expected_ensembles_for_classification_with_bias())
-
-    with self.cached_session() as sess:
-      with sess.graph.as_default():
-        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
-            boosted_trees._create_classification_head(n_classes=2),
-            run_config.RunConfig(),
-            is_classification=True,
-            train_in_memory=False,
-            center_bias=True)
-      # 4 iterations to center bias.
-      for _ in range(4):
-        _, serialized = sess.run([train_op, ensemble_serialized])
-      # Validate the trained ensemble.
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_first, ensemble_proto)
-
-      # Run one more time and validate the trained ensemble.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_second, ensemble_proto)
-
-      # Third round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_third, ensemble_proto)
-
-      # Forth round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_forth, ensemble_proto)
-
-  def testTrainRegressorInMemory(self):
-    ops.reset_default_graph()
-    expected_first, expected_second, expected_third = (
-        self._get_expected_ensembles_for_regression())
-    with self.cached_session() as sess:
-      # Train with train_in_memory mode.
-      with sess.graph.as_default():
-        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
-            boosted_trees._create_regression_head(label_dimension=1),
-            run_config.RunConfig(),
-            is_classification=False,
-            train_in_memory=True)
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      # Validate the trained ensemble.
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_first, ensemble_proto)
-
-      # Run one more time and validate the trained ensemble.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_second, ensemble_proto)
-
-      # Third round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_third, ensemble_proto)
-
-  def testTrainRegressorInMemoryWithCenterBias(self):
-    ops.reset_default_graph()
-    expected_first, expected_second, expected_third, expected_forth = (
-        self._get_expected_ensembles_for_regression_with_bias())
-    with self.cached_session() as sess:
-      # Train with train_in_memory mode.
-      with sess.graph.as_default():
-        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
-            boosted_trees._create_regression_head(label_dimension=1),
-            run_config.RunConfig(),
-            is_classification=False,
-            train_in_memory=True,
-            center_bias=True)
-      # 3 iterations to center bias.
-      for _ in range(3):
-        _, serialized = sess.run([train_op, ensemble_serialized])
-      # Validate the trained ensemble.
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-
-      self.assertProtoEquals(expected_first, ensemble_proto)
-
-      # Run one more time and validate the trained ensemble.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_second, ensemble_proto)
-
-      # Third round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_third, ensemble_proto)
-
-      # Forth round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_forth, ensemble_proto)
-
-  def testTrainRegressorNonInMemory(self):
-    ops.reset_default_graph()
-    expected_first, expected_second, expected_third = (
-        self._get_expected_ensembles_for_regression())
-    with self.cached_session() as sess:
-      # Train without train_in_memory mode.
-      with sess.graph.as_default():
-        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
-            boosted_trees._create_regression_head(label_dimension=1),
-            run_config.RunConfig(),
-            is_classification=False,
-            train_in_memory=False)
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      # Validate the trained ensemble.
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_first, ensemble_proto)
-
-      # Run one more time and validate the trained ensemble.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_second, ensemble_proto)
-
-      # Third round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_third, ensemble_proto)
-
-  def testTrainRegressorNotInMemoryWithCenterBias(self):
-    ops.reset_default_graph()
-    expected_first, expected_second, expected_third, expected_forth = (
-        self._get_expected_ensembles_for_regression_with_bias())
-    with self.cached_session() as sess:
-      # Train with train_in_memory mode.
-      with sess.graph.as_default():
-        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
-            boosted_trees._create_regression_head(label_dimension=1),
-            run_config.RunConfig(),
-            is_classification=False,
-            train_in_memory=False,
-            center_bias=True)
-      # 3 iterations to center the bias (because we are using regularization).
-      for _ in range(3):
-        _, serialized = sess.run([train_op, ensemble_serialized])
-
-      # Validate the trained ensemble.
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_first, ensemble_proto)
-
-      # Run one more time and validate the trained ensemble.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_second, ensemble_proto)
-
-      # Third round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_third, ensemble_proto)
-
-      # Forth round training and validation.
-      _, serialized = sess.run([train_op, ensemble_serialized])
-      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
-      ensemble_proto.ParseFromString(serialized)
-      self.assertProtoEquals(expected_forth, ensemble_proto)
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/python/estimator/canned/boosted_trees_utils.py b/tensorflow/python/estimator/canned/boosted_trees_utils.py
index 85efc2304abc020d8f1e34fb31f3e073ac45e461..db5178b2b7aac5486e534e9482a288b26d38acdb 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_utils.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_utils.py
@@ -12,69 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Debug and model explainability logic for boosted trees."""
+"""boosted_trees_utils python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
-
-# For directional feature contributions.
-_DEBUG_PROTO_KEY = '_serialized_debug_outputs_proto'
-_BIAS_ID = 0
-
-
-def _parse_debug_proto_string(example_proto_serialized):
-  example_debug_outputs = boosted_trees_pb2.DebugOutput()
-  example_debug_outputs.ParseFromString(example_proto_serialized)
-  feature_ids = example_debug_outputs.feature_ids
-  logits_path = example_debug_outputs.logits_path
-  return feature_ids, logits_path
-
-
-def _compute_directional_feature_contributions(example_feature_ids,
-                                               example_logits_paths, activation,
-                                               num_bucketized_features):
-  """Directional feature contributions and bias, per example."""
-  # Initialize contributions to 0.
-  dfcs = {k: 0 for k in range(num_bucketized_features)}
-
-  # Traverse tree subtracting child prediction from parent prediction and
-  # associating change with feature id used to split.
-  predictions = np.array(activation(example_logits_paths))
-  delta_pred = predictions[_BIAS_ID + 1:] - predictions[:-1]
-  # Group by feature id, then sum delta_pred.
-  contribs = np.bincount(
-      example_feature_ids,
-      weights=delta_pred,
-      minlength=num_bucketized_features)
-  for f, dfc in zip(range(num_bucketized_features), contribs):
-    dfcs[f] = dfc
-  return predictions[_BIAS_ID], dfcs
-
-
-def _identity(logits):
-  return logits
-
-
-def _sigmoid(logits):
-  # TODO(crawles): Change to softmax once multiclass support is available.
-  return 1 / (1 + np.exp(-np.array(logits)))
+from tensorflow_estimator.python.estimator.canned import boosted_trees_utils
 
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+boosted_trees_utils.__all__ = [
+    s for s in dir(boosted_trees_utils) if not s.startswith('__')
+]
 
-def _parse_explanations_from_prediction(serialized_debug_proto,
-                                        n_features,
-                                        classification=False):
-  """Parse serialized explanability proto, compute dfc, and return bias, dfc."""
-  feature_ids, logits_path = _parse_debug_proto_string(serialized_debug_proto)
-  if classification:
-    activation = _sigmoid
-  else:
-    activation = _identity
-  bias, dfcs = _compute_directional_feature_contributions(
-      feature_ids, logits_path, activation, n_features)
-  # TODO(crawles): Prediction path and leaf IDs.
-  return bias, dfcs
+from tensorflow_estimator.python.estimator.canned.boosted_trees_utils import *
diff --git a/tensorflow/python/estimator/canned/boosted_trees_utils_test.py b/tensorflow/python/estimator/canned/boosted_trees_utils_test.py
deleted file mode 100644
index 506d4ea6fb26f5c085de24ceaf17891a1944e901..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/canned/boosted_trees_utils_test.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests boosted_trees estimators and model_fn."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.estimator.canned import boosted_trees_utils
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import googletest
-
-
-class BoostedTreesDFCTest(test_util.TensorFlowTestCase):
-  """Test directional feature contributions (DFC) helper functions. """
-
-  def testDirectionalFeatureContributionsCompute(self):
-    """Tests logic to compute DFCs given feature ids and logits paths."""
-    num_bucketized_features = 3  # Includes one unused feature.
-    examples_feature_ids = ((2, 2, 0, 0), (2, 2, 0))
-    e1_feature_ids, e2_feature_ids = examples_feature_ids
-
-    # DFCs are computed by traversing the prediction path and subtracting each
-    # child prediction from its parent prediction and associating the change in
-    # prediction with the respective feature id used for the split.
-    # For each activation function, f, (currently identity or sigmoid), DFCs are
-    # calculated for the two examples as:
-    # example 1:
-    #   feature_0 = (f(1.114) - f(1.214)) + (f(6.114) - f(1.114))
-    #   feature_1 = 0  # Feature not in ensemble, thus zero contrib.
-    #   feature_2 = (f(0.114) - bias_pred) + (f(1.214) - f(0.114))
-    # example 2:
-    #   feature_0 = f(-5.486) - f(1.514)
-    #   feature_1 = 0  # Feature not in ensemble, thus zero contrib.
-    #   feature_2 = (f(0.114) - bias_pred) + (f(1.514) - f(0.114))
-    # where bias_pred is = f(0) or f(0.21), with center_bias = {True, False},
-    # respectively.
-    # Keys are center_bias.
-    expected_dfcs_identity = {
-        False: ({
-            0: 4.9,
-            1: 0,
-            2: 1.214
-        }, {
-            0: -7.0,
-            1: 0,
-            2: 1.514
-        }),
-        True: ({
-            0: 4.9,
-            1: 0,
-            2: 1.0039999999999998
-        }, {
-            0: -7.0,
-            1: 0,
-            2: 1.3039999999999998
-        })
-    }
-    expected_dfcs_sigmoid = {
-        False: ({
-            0: 0.22678725678805578,
-            1: 0,
-            2: 0.2710059376234506
-        }, {
-            0: -0.81552596670046507,
-            1: 0,
-            2: 0.319653250251275
-        }),
-        True: ({
-            0: 0.22678725678805578,
-            1: 0,
-            2: 0.2186980280491253
-        }, {
-            0: -0.81552596670046507,
-            1: 0,
-            2: 0.26734534067694971
-        })
-    }
-    # pylint: disable=protected-access
-    for f, expected_dfcs in zip(
-        (boosted_trees_utils._identity, boosted_trees_utils._sigmoid),
-        (expected_dfcs_identity, expected_dfcs_sigmoid)):
-      for center_bias in [False, True]:
-        # If not center_bias, the bias after activation is 0.
-        if center_bias:
-          bias_logit = 0.21  # Root node of tree_0.
-        else:
-          bias_logit = 0  # 0 is default value when there is no original_leaf.
-        f_bias = f(bias_logit)
-
-        # Logits before and after, as is outputed from
-        # boosted_trees_ops.example_debug_outputs
-        examples_logits_paths = ((bias_logit, 0.114, 1.214, 1.114, 6.114),
-                                 (bias_logit, 0.114, 1.514, -5.486))
-        e1_logits_path, e2_logits_path = examples_logits_paths
-        e1_expected_dfcs, e2_expected_dfcs = expected_dfcs[center_bias]
-        # Check feature contributions are correct for both examples.
-        # Example 1.
-        # pylint:disable=line-too-long
-        e1_bias, e1_dfc = boosted_trees_utils._compute_directional_feature_contributions(
-            e1_feature_ids, e1_logits_path, f, num_bucketized_features)
-        self.assertAllClose(e1_bias, f_bias)
-        self.assertAllClose(e1_dfc, e1_expected_dfcs)
-        # Example 2.
-        e2_bias, e2_dfc = boosted_trees_utils._compute_directional_feature_contributions(
-            e2_feature_ids, e2_logits_path, f, num_bucketized_features)
-        # pylint:enable=line-too-long
-        self.assertAllClose(e2_bias, f_bias)
-        self.assertAllClose(e2_dfc, e2_expected_dfcs)
-        # Check if contributions sum to final prediction.
-        # For each tree, get leaf of last tree.
-        expected_logits = (e1_logits_path[-1], e2_logits_path[-1])
-        # Predictions should be the sum of contributions + bias.
-        expected_preds = [f(logit) for logit in expected_logits]
-        e1_pred = e1_bias + sum(e1_dfc.values())
-        e2_pred = e2_bias + sum(e2_dfc.values())
-        preds = [e1_pred, e2_pred]
-        self.assertAllClose(preds, expected_preds)
-    # pylint: enable=protected-access
-
-  def testDFCComputeComparedToExternalExample(self):
-    """Tests `compute_dfc` compared to external example (regression).
-
-    Example from http://blog.datadive.net/interpreting-random-forests.
-    """
-    # DIS:3, RM: 2, LSTAT:1, NOX:0
-    num_bucketized_features = 4
-    e1_feature_ids = (2, 1, 0)
-    e2_feature_ids = (2, 2, 2)
-    e3_feature_ids = (2, 2, 0)
-
-    bias_logit = 22.60  # Root node of tree_0.
-    activation = boosted_trees_utils._identity
-    f_bias = activation(bias_logit)
-    # Logits before and after, as is outputed from
-    # boosted_trees_ops.example_debug_outputs
-    e1_logits_path = (bias_logit, 19.96, 14.91, 18.11)
-    e2_logits_path = (bias_logit, 37.42, 45.10, 45.90)
-    e3_logits_path = (bias_logit, 37.42, 32.30, 33.58)
-    e1_expected_dfcs = {0: 3.20, 1: -5.05, 2: -2.64, 3: 0}
-    e2_expected_dfcs = {0: 0, 1: 0, 2: 23.3, 3: 0}
-    e3_expected_dfcs = {0: 1.28, 1: 0, 2: 9.7, 3: 0}
-    # Check feature contributions are correct for both examples.
-    # Example 1.
-    # pylint: disable=protected-access
-    # pylint: disable=line-too-long
-    e1_bias, e1_dfc = boosted_trees_utils._compute_directional_feature_contributions(
-        e1_feature_ids, e1_logits_path, activation, num_bucketized_features)
-    self.assertAllClose(e1_bias, f_bias)
-    self.assertAllClose(e1_dfc, e1_expected_dfcs)
-    # Example 2.
-    e2_bias, e2_dfc = boosted_trees_utils._compute_directional_feature_contributions(
-        e2_feature_ids, e2_logits_path, activation, num_bucketized_features)
-    self.assertAllClose(e2_bias, f_bias)
-    self.assertAllClose(e2_dfc, e2_expected_dfcs)
-    # Example 3.
-    e3_bias, e3_dfc = boosted_trees_utils._compute_directional_feature_contributions(
-        e3_feature_ids, e3_logits_path, activation, num_bucketized_features)
-    # pylint: enable=line-too-long
-    self.assertAllClose(e3_bias, f_bias)
-    self.assertAllClose(e3_dfc, e3_expected_dfcs)
-    # pylint: enable=protected-access
-    # Check if contributions sum to final prediction.
-    # For each tree, get leaf of last tree.
-    expected_logits = (18.11, 45.90, 33.58)
-    # Predictions should be the sum of contributions + bias.
-    expected_preds = [activation(logit) for logit in expected_logits]
-    e1_pred = e1_bias + sum(e1_dfc.values())
-    e2_pred = e2_bias + sum(e2_dfc.values())
-    e3_pred = e3_bias + sum(e3_dfc.values())
-    preds = [e1_pred, e2_pred, e3_pred]
-    self.assertAllClose(preds, expected_preds)
-
-
-if __name__ == '__main__':
-  googletest.main()
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index a6c2aaa7d9b556103881273380d10d47e04d73a4..cde0d955dfddd3a9ea810c9a1d333d593cb03465 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,649 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Deep Neural Network estimators."""
+"""dnn python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.feature_column import feature_column_v2
-from tensorflow.python.framework import ops
-from tensorflow.python.keras.engine import training
-from tensorflow.python.layers import core as core_layers
-from tensorflow.python.layers import normalization
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.summary import summary
-from tensorflow.python.util.tf_export import estimator_export
-
-# The default learning rate of 0.05 is a historical artifact of the initial
-# implementation, but seems a reasonable choice.
-_LEARNING_RATE = 0.05
-
-
-def _add_hidden_layer_summary(value, tag):
-  summary.scalar('%s/fraction_of_zero_values' % tag, nn.zero_fraction(value))
-  summary.histogram('%s/activation' % tag, value)
-
-
-def _dnn_logit_fn_builder(units,
-                          hidden_units,
-                          feature_columns,
-                          activation_fn,
-                          dropout,
-                          input_layer_partitioner,
-                          batch_norm,
-                          shared_state_manager=None):
-  """Function builder for a dnn logit_fn.
-
-  Args:
-    units: An int indicating the dimension of the logit layer.  In the
-      MultiHead case, this should be the sum of all component Heads' logit
-      dimensions.
-    hidden_units: Iterable of integer number of hidden units per layer.
-    feature_columns: Iterable of `feature_column._FeatureColumn` model inputs.
-    activation_fn: Activation function applied to each layer.
-    dropout: When not `None`, the probability we will drop out a given
-      coordinate.
-    input_layer_partitioner: Partitioner for input layer.
-    batch_norm: Whether to use batch normalization after each hidden layer.
-    shared_state_manager: A SharedEmbeddingStateManager object to hold the
-      shared state for SharedEmbeddingColumn's.
-
-  Returns:
-    A logit_fn (see below).
-
-  Raises:
-    ValueError: If units is not an int.
-  """
-  if not isinstance(units, int):
-    raise ValueError('units must be an int.  Given type: {}'.format(
-        type(units)))
-
-  def dnn_logit_fn(features, mode):
-    """Deep Neural Network logit_fn.
-
-    Args:
-      features: This is the first item returned from the `input_fn`
-                passed to `train`, `evaluate`, and `predict`. This should be a
-                single `Tensor` or `dict` of same.
-      mode: Optional. Specifies if this training, evaluation or prediction. See
-            `ModeKeys`.
-
-    Returns:
-      A `Tensor` representing the logits, or a list of `Tensor`'s representing
-      multiple logits in the MultiHead case.
-    """
-    dnn_model = _DNNModel(
-        units,
-        hidden_units,
-        feature_columns,
-        activation_fn,
-        dropout,
-        input_layer_partitioner,
-        batch_norm,
-        shared_state_manager,
-        name='dnn')
-    return dnn_model(features, mode)
-
-  return dnn_logit_fn
-
-
-def _get_previous_name_scope():
-  current_name_scope = ops.get_name_scope()
-  return current_name_scope.rsplit('/', 1)[0] + '/'
-
-
-class _DNNModel(training.Model):
-  """A DNN Model."""
-
-  def __init__(self,
-               units,
-               hidden_units,
-               feature_columns,
-               activation_fn,
-               dropout,
-               input_layer_partitioner,
-               batch_norm,
-               shared_state_manager,
-               name=None,
-               **kwargs):
-    super(_DNNModel, self).__init__(name=name, **kwargs)
-    if feature_column_v2.is_feature_column_v2(feature_columns):
-      self._input_layer = feature_column_v2.FeatureLayer(
-          feature_columns=feature_columns,
-          name='input_layer',
-          shared_state_manager=shared_state_manager)
-    else:
-      self._input_layer = feature_column.InputLayer(
-          feature_columns=feature_columns,
-          name='input_layer',
-          create_scope_now=False)
-
-    self._add_layer(self._input_layer, 'input_layer')
-
-    self._dropout = dropout
-    self._batch_norm = batch_norm
-
-    self._hidden_layers = []
-    self._dropout_layers = []
-    self._batch_norm_layers = []
-    self._hidden_layer_scope_names = []
-    for layer_id, num_hidden_units in enumerate(hidden_units):
-      with variable_scope.variable_scope(
-          'hiddenlayer_%d' % layer_id) as hidden_layer_scope:
-        hidden_layer = core_layers.Dense(
-            units=num_hidden_units,
-            activation=activation_fn,
-            kernel_initializer=init_ops.glorot_uniform_initializer(),
-            name=hidden_layer_scope,
-            _scope=hidden_layer_scope)
-        self._add_layer(hidden_layer, hidden_layer_scope.name)
-        self._hidden_layer_scope_names.append(hidden_layer_scope.name)
-        self._hidden_layers.append(hidden_layer)
-        if self._dropout is not None:
-          dropout_layer = core_layers.Dropout(rate=self._dropout)
-          self._add_layer(dropout_layer, dropout_layer.name)
-          self._dropout_layers.append(dropout_layer)
-        if self._batch_norm:
-          batch_norm_layer = normalization.BatchNormalization(
-              # The default momentum 0.99 actually crashes on certain
-              # problem, so here we use 0.999, which is the default of
-              # tf.contrib.layers.batch_norm.
-              momentum=0.999,
-              trainable=True,
-              name='batchnorm_%d' % layer_id,
-              _scope='batchnorm_%d' % layer_id)
-          self._add_layer(batch_norm_layer, batch_norm_layer.name)
-          self._batch_norm_layers.append(batch_norm_layer)
-
-    with variable_scope.variable_scope('logits') as logits_scope:
-      self._logits_layer = core_layers.Dense(
-          units=units,
-          activation=None,
-          kernel_initializer=init_ops.glorot_uniform_initializer(),
-          name=logits_scope,
-          _scope=logits_scope)
-      self._add_layer(self._logits_layer, logits_scope.name)
-      self._logits_scope_name = logits_scope.name
-    self._input_layer_partitioner = input_layer_partitioner
-
-  def call(self, features, mode):
-    is_training = mode == model_fn.ModeKeys.TRAIN
-    # The Keras training.Model adds a name_scope with the name of the model
-    # which modifies the constructed graph. Hence we add another name_scope
-    # here which is the one before the training.Model one was applied.
-    # TODO(rohanj): Remove this in TF 2.0 (b/116728605)
-    with ops.name_scope(name=_get_previous_name_scope()):
-      # TODO(rohanj): Remove dependence on variable scope for partitioning.
-      with variable_scope.variable_scope(
-          'input_from_feature_columns',
-          partitioner=self._input_layer_partitioner):
-        net = self._input_layer(features)
-      for i in range(len(self._hidden_layers)):
-        net = self._hidden_layers[i](net)
-        if self._dropout is not None and is_training:
-          net = self._dropout_layers[i](net, training=True)
-        if self._batch_norm:
-          net = self._batch_norm_layers[i](net, training=is_training)
-        _add_hidden_layer_summary(net, self._hidden_layer_scope_names[i])
-
-      logits = self._logits_layer(net)
-      _add_hidden_layer_summary(logits, self._logits_scope_name)
-      return logits
-
-  def _add_layer(self, layer, layer_name):
-    # "Magic" required for keras.Model classes to track all the variables in
-    # a list of layers.Layer objects.
-    # TODO(ashankar): Figure out API so user code doesn't have to do this.
-    setattr(self, layer_name, layer)
-
-
-def _dnn_model_fn(features,
-                  labels,
-                  mode,
-                  head,
-                  hidden_units,
-                  feature_columns,
-                  optimizer='Adagrad',
-                  activation_fn=nn.relu,
-                  dropout=None,
-                  input_layer_partitioner=None,
-                  config=None,
-                  use_tpu=False,
-                  batch_norm=False,
-                  shared_state_manager=None):
-  """Deep Neural Net model_fn.
-
-  Args:
-    features: dict of `Tensor`.
-    labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of
-      dtype `int32` or `int64` in the range `[0, n_classes)`.
-    mode: Defines whether this is training, evaluation or prediction.
-      See `ModeKeys`.
-    head: A `head_lib._Head` instance.
-    hidden_units: Iterable of integer number of hidden units per layer.
-    feature_columns: Iterable of `feature_column._FeatureColumn` model inputs.
-    optimizer: String, `tf.Optimizer` object, or callable that creates the
-      optimizer to use for training. If not specified, will use the Adagrad
-      optimizer with a default learning rate of 0.05.
-    activation_fn: Activation function applied to each layer.
-    dropout: When not `None`, the probability we will drop out a given
-      coordinate.
-    input_layer_partitioner: Partitioner for input layer. Defaults
-      to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-    config: `RunConfig` object to configure the runtime settings.
-    use_tpu: Whether to make a DNN model able to run on TPU. Will make function
-      return a `_TPUEstimatorSpec` instance and disable variable partitioning.
-    batch_norm: Whether to use batch normalization after each hidden layer.
-    shared_state_manager: A SharedEmbeddingStateManager object to hold the
-      shared state for SharedEmbeddingColumn's.
-
-  Returns:
-    An `EstimatorSpec` instance.
-
-  Raises:
-    ValueError: If features has the wrong type.
-  """
-  if not isinstance(features, dict):
-    raise ValueError('features should be a dictionary of `Tensor`s. '
-                     'Given type: {}'.format(type(features)))
-
-  optimizer = optimizers.get_optimizer_instance(
-      optimizer, learning_rate=_LEARNING_RATE)
-  num_ps_replicas = config.num_ps_replicas if config else 0
-
-  partitioner = (None if use_tpu else
-                 partitioned_variables.min_max_variable_partitioner(
-                     max_partitions=num_ps_replicas))
-  with variable_scope.variable_scope(
-      'dnn',
-      values=tuple(six.itervalues(features)),
-      partitioner=partitioner):
-    input_layer_partitioner = input_layer_partitioner or (
-        None if use_tpu else
-        partitioned_variables.min_max_variable_partitioner(
-            max_partitions=num_ps_replicas,
-            min_slice_size=64 << 20))
-
-    logit_fn = _dnn_logit_fn_builder(
-        units=head.logits_dimension,
-        hidden_units=hidden_units,
-        feature_columns=feature_columns,
-        activation_fn=activation_fn,
-        dropout=dropout,
-        input_layer_partitioner=input_layer_partitioner,
-        batch_norm=batch_norm,
-        shared_state_manager=shared_state_manager)
-    logits = logit_fn(features=features, mode=mode)
-
-    if use_tpu:
-      return head._create_tpu_estimator_spec(  # pylint: disable=protected-access
-          features=features,
-          mode=mode,
-          labels=labels,
-          optimizer=optimizer,
-          logits=logits)
-    else:
-      return head.create_estimator_spec(
-          features=features,
-          mode=mode,
-          labels=labels,
-          optimizer=optimizer,
-          logits=logits)
-
-
-@estimator_export('estimator.DNNClassifier')
-class DNNClassifier(estimator.Estimator):
-  """A classifier for TensorFlow DNN models.
-
-  Example:
-
-  ```python
-  categorical_feature_a = categorical_column_with_hash_bucket(...)
-  categorical_feature_b = categorical_column_with_hash_bucket(...)
-
-  categorical_feature_a_emb = embedding_column(
-      categorical_column=categorical_feature_a, ...)
-  categorical_feature_b_emb = embedding_column(
-      categorical_column=categorical_feature_b, ...)
-
-  estimator = DNNClassifier(
-      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-      hidden_units=[1024, 512, 256])
-
-  # Or estimator using the ProximalAdagradOptimizer optimizer with
-  # regularization.
-  estimator = DNNClassifier(
-      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-      hidden_units=[1024, 512, 256],
-      optimizer=tf.train.ProximalAdagradOptimizer(
-        learning_rate=0.1,
-        l1_regularization_strength=0.001
-      ))
-
-  # Or estimator using an optimizer with a learning rate decay.
-  estimator = DNNClassifier(
-      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-      hidden_units=[1024, 512, 256],
-      optimizer=lambda: tf.AdamOptimizer(
-          learning_rate=tf.exponential_decay(
-              learning_rate=0.1,
-              global_step=tf.get_global_step(),
-              decay_steps=10000,
-              decay_rate=0.96))
-
-  # Or estimator with warm-starting from a previous checkpoint.
-  estimator = DNNClassifier(
-      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-      hidden_units=[1024, 512, 256],
-      warm_start_from="/path/to/checkpoint/dir")
-
-  # Input builders
-  def input_fn_train: # returns x, y
-    pass
-  estimator.train(input_fn=input_fn_train, steps=100)
-
-  def input_fn_eval: # returns x, y
-    pass
-  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
-  def input_fn_predict: # returns x, None
-    pass
-  predictions = estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-  otherwise there will be a `KeyError`:
-
-  * if `weight_column` is not `None`, a feature with
-    `key=weight_column` whose value is a `Tensor`.
-  * for each `column` in `feature_columns`:
-    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
-      with `key` the id column name, the second with `key` the weight column
-      name. Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss is calculated by using softmax cross entropy.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(
-      self,
-      hidden_units,
-      feature_columns,
-      model_dir=None,
-      n_classes=2,
-      weight_column=None,
-      label_vocabulary=None,
-      optimizer='Adagrad',
-      activation_fn=nn.relu,
-      dropout=None,
-      input_layer_partitioner=None,
-      config=None,
-      warm_start_from=None,
-      loss_reduction=losses.Reduction.SUM,
-      batch_norm=False,
-  ):
-    """Initializes a `DNNClassifier` instance.
-
-    Args:
-      hidden_units: Iterable of number hidden units per layer. All layers are
-        fully connected. Ex. `[64, 32]` means first layer has 64 nodes and
-        second one has 32.
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `_FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model.
-      n_classes: Number of label classes. Defaults to 2, namely binary
-        classification. Must be > 1.
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to down weight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      label_vocabulary: A list of strings represents possible label values. If
-        given, labels must be string type and have any value in
-        `label_vocabulary`. If it is not given, that means labels are
-        already encoded as integer or float within [0, 1] for `n_classes=2` and
-        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
-        Also there will be errors if vocabulary is not provided and labels are
-        string.
-      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
-        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
-        callable. Defaults to Adagrad optimizer.
-      activation_fn: Activation function applied to each layer. If `None`, will
-        use `tf.nn.relu`.
-      dropout: When not `None`, the probability we will drop out a given
-        coordinate.
-      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
-        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-      config: `RunConfig` object to configure the runtime settings.
-      warm_start_from: A string filepath to a checkpoint to warm-start from, or
-        a `WarmStartSettings` object to fully configure warm-starting.  If the
-        string filepath is provided instead of a `WarmStartSettings`, then all
-        weights are warm-started, and it is assumed that vocabularies and Tensor
-        names are unchanged.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM`.
-      batch_norm: Whether to use batch normalization after each hidden layer.
-    """
-    head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
-        n_classes, weight_column, label_vocabulary, loss_reduction)
-
-    shared_state_manager = feature_column_v2.maybe_create_shared_state_manager(
-        feature_columns)
-
-    def _model_fn(features, labels, mode, config):
-      """Call the defined shared _dnn_model_fn."""
-      return _dnn_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          hidden_units=hidden_units,
-          feature_columns=tuple(feature_columns or []),
-          optimizer=optimizer,
-          activation_fn=activation_fn,
-          dropout=dropout,
-          input_layer_partitioner=input_layer_partitioner,
-          config=config,
-          batch_norm=batch_norm,
-          shared_state_manager=shared_state_manager)
-
-    super(DNNClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config,
-        warm_start_from=warm_start_from)
-
-
-@estimator_export('estimator.DNNRegressor')
-class DNNRegressor(estimator.Estimator):
-  """A regressor for TensorFlow DNN models.
-
-  Example:
-
-  ```python
-  categorical_feature_a = categorical_column_with_hash_bucket(...)
-  categorical_feature_b = categorical_column_with_hash_bucket(...)
-
-  categorical_feature_a_emb = embedding_column(
-      categorical_column=categorical_feature_a, ...)
-  categorical_feature_b_emb = embedding_column(
-      categorical_column=categorical_feature_b, ...)
-
-  estimator = DNNRegressor(
-      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-      hidden_units=[1024, 512, 256])
-
-  # Or estimator using the ProximalAdagradOptimizer optimizer with
-  # regularization.
-  estimator = DNNRegressor(
-      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-      hidden_units=[1024, 512, 256],
-      optimizer=tf.train.ProximalAdagradOptimizer(
-        learning_rate=0.1,
-        l1_regularization_strength=0.001
-      ))
-
-  # Or estimator using an optimizer with a learning rate decay.
-  estimator = DNNRegressor(
-      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-      hidden_units=[1024, 512, 256],
-      optimizer=lambda: tf.AdamOptimizer(
-          learning_rate=tf.exponential_decay(
-              learning_rate=0.1,
-              global_step=tf.get_global_step(),
-              decay_steps=10000,
-              decay_rate=0.96))
-
-  # Or estimator with warm-starting from a previous checkpoint.
-  estimator = DNNRegressor(
-      feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-      hidden_units=[1024, 512, 256],
-      warm_start_from="/path/to/checkpoint/dir")
-
-  # Input builders
-  def input_fn_train: # returns x, y
-    pass
-  estimator.train(input_fn=input_fn_train, steps=100)
-
-  def input_fn_eval: # returns x, y
-    pass
-  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
-  def input_fn_predict: # returns x, None
-    pass
-  predictions = estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-  otherwise there will be a `KeyError`:
-
-  * if `weight_column` is not `None`, a feature with
-    `key=weight_column` whose value is a `Tensor`.
-  * for each `column` in `feature_columns`:
-    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
-      with `key` the id column name, the second with `key` the weight column
-      name. Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss is calculated by using mean squared error.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(
-      self,
-      hidden_units,
-      feature_columns,
-      model_dir=None,
-      label_dimension=1,
-      weight_column=None,
-      optimizer='Adagrad',
-      activation_fn=nn.relu,
-      dropout=None,
-      input_layer_partitioner=None,
-      config=None,
-      warm_start_from=None,
-      loss_reduction=losses.Reduction.SUM,
-      batch_norm=False,
-  ):
-    """Initializes a `DNNRegressor` instance.
-
-    Args:
-      hidden_units: Iterable of number hidden units per layer. All layers are
-        fully connected. Ex. `[64, 32]` means first layer has 64 nodes and
-        second one has 32.
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `_FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model.
-      label_dimension: Number of regression targets per example. This is the
-        size of the last dimension of the labels and logits `Tensor` objects
-        (typically, these have shape `[batch_size, label_dimension]`).
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to down weight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
-        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
-        callable. Defaults to Adagrad optimizer.
-      activation_fn: Activation function applied to each layer. If `None`, will
-        use `tf.nn.relu`.
-      dropout: When not `None`, the probability we will drop out a given
-        coordinate.
-      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
-        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-      config: `RunConfig` object to configure the runtime settings.
-      warm_start_from: A string filepath to a checkpoint to warm-start from, or
-        a `WarmStartSettings` object to fully configure warm-starting.  If the
-        string filepath is provided instead of a `WarmStartSettings`, then all
-        weights are warm-started, and it is assumed that vocabularies and Tensor
-        names are unchanged.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM`.
-      batch_norm: Whether to use batch normalization after each hidden layer.
-    """
-
-    shared_state_manager = None
-    if feature_column_v2.is_feature_column_v2(feature_columns):
-      shared_state_manager = feature_column_v2.SharedEmbeddingStateManager()
+from tensorflow_estimator.python.estimator.canned import dnn
 
-    def _model_fn(features, labels, mode, config):
-      """Call the defined shared _dnn_model_fn."""
-      return _dnn_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head_lib._regression_head(  # pylint: disable=protected-access
-              label_dimension=label_dimension,
-              weight_column=weight_column,
-              loss_reduction=loss_reduction),
-          hidden_units=hidden_units,
-          feature_columns=tuple(feature_columns or []),
-          optimizer=optimizer,
-          activation_fn=activation_fn,
-          dropout=dropout,
-          input_layer_partitioner=input_layer_partitioner,
-          config=config,
-          batch_norm=batch_norm,
-          shared_state_manager=shared_state_manager)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+dnn.__all__ = [s for s in dir(dnn) if not s.startswith('__')]
 
-    super(DNNRegressor, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config,
-        warm_start_from=warm_start_from)
+from tensorflow_estimator.python.estimator.canned.dnn import *
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index f712244c8d02e199d14ea551a8e72f2a47c8a743..e28499368f1687c4ab4972f04888da66a47b0f5a 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,621 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TensorFlow estimators for Linear and DNN joined training models."""
+"""dnn_linear_combined python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
-import six
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import dnn
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import linear
-from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.feature_column import feature_column_v2
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.summary import summary
-from tensorflow.python.training import sync_replicas_optimizer
-from tensorflow.python.training import training_util
-from tensorflow.python.util.tf_export import estimator_export
-
-# The default learning rates are a historical artifact of the initial
-# implementation.
-_DNN_LEARNING_RATE = 0.001
-_LINEAR_LEARNING_RATE = 0.005
-
-
-def _check_no_sync_replicas_optimizer(optimizer):
-  if isinstance(optimizer, sync_replicas_optimizer.SyncReplicasOptimizer):
-    raise ValueError(
-        'SyncReplicasOptimizer does not support multi optimizers case. '
-        'Therefore, it is not supported in DNNLinearCombined model. '
-        'If you want to use this optimizer, please use either DNN or Linear '
-        'model.')
-
-
-def _linear_learning_rate(num_linear_feature_columns):
-  """Returns the default learning rate of the linear model.
-
-  The calculation is a historical artifact of this initial implementation, but
-  has proven a reasonable choice.
-
-  Args:
-    num_linear_feature_columns: The number of feature columns of the linear
-      model.
-
-  Returns:
-    A float.
-  """
-  default_learning_rate = 1. / math.sqrt(num_linear_feature_columns)
-  return min(_LINEAR_LEARNING_RATE, default_learning_rate)
-
-
-def _add_layer_summary(value, tag):
-  summary.scalar('%s/fraction_of_zero_values' % tag, nn.zero_fraction(value))
-  summary.histogram('%s/activation' % tag, value)
-
-
-def _dnn_linear_combined_model_fn(features,
-                                  labels,
-                                  mode,
-                                  head,
-                                  linear_feature_columns=None,
-                                  linear_optimizer='Ftrl',
-                                  dnn_feature_columns=None,
-                                  dnn_optimizer='Adagrad',
-                                  dnn_hidden_units=None,
-                                  dnn_activation_fn=nn.relu,
-                                  dnn_dropout=None,
-                                  input_layer_partitioner=None,
-                                  config=None,
-                                  batch_norm=False,
-                                  linear_sparse_combiner='sum'):
-  """Deep Neural Net and Linear combined model_fn.
-
-  Args:
-    features: dict of `Tensor`.
-    labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of dtype
-      `int32` or `int64` in the range `[0, n_classes)`.
-    mode: Defines whether this is training, evaluation or prediction.
-      See `ModeKeys`.
-    head: A `Head` instance.
-    linear_feature_columns: An iterable containing all the feature columns used
-      by the Linear model.
-    linear_optimizer: string, `Optimizer` object, or callable that defines the
-      optimizer to use for training the Linear model. Defaults to the Ftrl
-      optimizer.
-    dnn_feature_columns: An iterable containing all the feature columns used by
-      the DNN model.
-    dnn_optimizer: string, `Optimizer` object, or callable that defines the
-      optimizer to use for training the DNN model. Defaults to the Adagrad
-      optimizer.
-    dnn_hidden_units: List of hidden units per DNN layer.
-    dnn_activation_fn: Activation function applied to each DNN layer. If `None`,
-      will use `tf.nn.relu`.
-    dnn_dropout: When not `None`, the probability we will drop out a given DNN
-      coordinate.
-    input_layer_partitioner: Partitioner for input layer.
-    config: `RunConfig` object to configure the runtime settings.
-    batch_norm: Whether to use batch normalization after each hidden layer.
-    linear_sparse_combiner: A string specifying how to reduce the linear model
-      if a categorical column is multivalent.  One of "mean", "sqrtn", and
-      "sum".
-  Returns:
-    An `EstimatorSpec` instance.
-
-  Raises:
-    ValueError: If both `linear_feature_columns` and `dnn_features_columns`
-      are empty at the same time, or `input_layer_partitioner` is missing,
-      or features has the wrong type.
-  """
-  if not isinstance(features, dict):
-    raise ValueError('features should be a dictionary of `Tensor`s. '
-                     'Given type: {}'.format(type(features)))
-  if not linear_feature_columns and not dnn_feature_columns:
-    raise ValueError(
-        'Either linear_feature_columns or dnn_feature_columns must be defined.')
-
-  num_ps_replicas = config.num_ps_replicas if config else 0
-  input_layer_partitioner = input_layer_partitioner or (
-      partitioned_variables.min_max_variable_partitioner(
-          max_partitions=num_ps_replicas,
-          min_slice_size=64 << 20))
-
-  shared_state_manager = feature_column_v2.maybe_create_shared_state_manager(
-      list(linear_feature_columns) + list(dnn_feature_columns))
-
-  # Build DNN Logits.
-  dnn_parent_scope = 'dnn'
-
-  if not dnn_feature_columns:
-    dnn_logits = None
-  else:
-    dnn_optimizer = optimizers.get_optimizer_instance(
-        dnn_optimizer, learning_rate=_DNN_LEARNING_RATE)
-    _check_no_sync_replicas_optimizer(dnn_optimizer)
-    if not dnn_hidden_units:
-      raise ValueError(
-          'dnn_hidden_units must be defined when dnn_feature_columns is '
-          'specified.')
-    dnn_partitioner = (
-        partitioned_variables.min_max_variable_partitioner(
-            max_partitions=num_ps_replicas))
-    with variable_scope.variable_scope(
-        dnn_parent_scope,
-        values=tuple(six.itervalues(features)),
-        partitioner=dnn_partitioner) as scope:
-      dnn_absolute_scope = scope.name
-      dnn_logit_fn = dnn._dnn_logit_fn_builder(  # pylint: disable=protected-access
-          units=head.logits_dimension,
-          hidden_units=dnn_hidden_units,
-          feature_columns=dnn_feature_columns,
-          activation_fn=dnn_activation_fn,
-          dropout=dnn_dropout,
-          batch_norm=batch_norm,
-          input_layer_partitioner=input_layer_partitioner,
-          shared_state_manager=shared_state_manager)
-      dnn_logits = dnn_logit_fn(features=features, mode=mode)
-
-  linear_parent_scope = 'linear'
-
-  if not linear_feature_columns:
-    linear_logits = None
-  else:
-    linear_optimizer = optimizers.get_optimizer_instance(
-        linear_optimizer,
-        learning_rate=_linear_learning_rate(len(linear_feature_columns)))
-    _check_no_sync_replicas_optimizer(linear_optimizer)
-    with variable_scope.variable_scope(
-        linear_parent_scope,
-        values=tuple(six.itervalues(features)),
-        partitioner=input_layer_partitioner) as scope:
-      linear_absolute_scope = scope.name
-      logit_fn = linear._linear_logit_fn_builder(  # pylint: disable=protected-access
-          units=head.logits_dimension,
-          feature_columns=linear_feature_columns,
-          sparse_combiner=linear_sparse_combiner)
-      linear_logits = logit_fn(features=features)
-      _add_layer_summary(linear_logits, scope.name)
-
-  # Combine logits and build full model.
-  if dnn_logits is not None and linear_logits is not None:
-    logits = dnn_logits + linear_logits
-  elif dnn_logits is not None:
-    logits = dnn_logits
-  else:
-    logits = linear_logits
-
-  def _train_op_fn(loss):
-    """Returns the op to optimize the loss."""
-    train_ops = []
-    global_step = training_util.get_global_step()
-    if dnn_logits is not None:
-      train_ops.append(
-          dnn_optimizer.minimize(
-              loss,
-              var_list=ops.get_collection(
-                  ops.GraphKeys.TRAINABLE_VARIABLES,
-                  scope=dnn_absolute_scope)))
-    if linear_logits is not None:
-      train_ops.append(
-          linear_optimizer.minimize(
-              loss,
-              var_list=ops.get_collection(
-                  ops.GraphKeys.TRAINABLE_VARIABLES,
-                  scope=linear_absolute_scope)))
-
-    train_op = control_flow_ops.group(*train_ops)
-    with ops.control_dependencies([train_op]):
-      return state_ops.assign_add(global_step, 1).op
-
-  return head.create_estimator_spec(
-      features=features,
-      mode=mode,
-      labels=labels,
-      train_op_fn=_train_op_fn,
-      logits=logits)
-
-
-@estimator_export('estimator.DNNLinearCombinedClassifier')
-class DNNLinearCombinedClassifier(estimator.Estimator):
-  """An estimator for TensorFlow Linear and DNN joined classification models.
-
-  Note: This estimator is also known as wide-n-deep.
-
-  Example:
-
-  ```python
-  numeric_feature = numeric_column(...)
-  categorical_column_a = categorical_column_with_hash_bucket(...)
-  categorical_column_b = categorical_column_with_hash_bucket(...)
-
-  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
-  categorical_feature_a_emb = embedding_column(
-      categorical_column=categorical_feature_a, ...)
-  categorical_feature_b_emb = embedding_column(
-      categorical_id_column=categorical_feature_b, ...)
-
-  estimator = DNNLinearCombinedClassifier(
-      # wide settings
-      linear_feature_columns=[categorical_feature_a_x_categorical_feature_b],
-      linear_optimizer=tf.train.FtrlOptimizer(...),
-      # deep settings
-      dnn_feature_columns=[
-          categorical_feature_a_emb, categorical_feature_b_emb,
-          numeric_feature],
-      dnn_hidden_units=[1000, 500, 100],
-      dnn_optimizer=tf.train.ProximalAdagradOptimizer(...),
-      # warm-start settings
-      warm_start_from="/path/to/checkpoint/dir")
-
-  # To apply L1 and L2 regularization, you can set dnn_optimizer to:
-  tf.train.ProximalAdagradOptimizer(
-      learning_rate=0.1,
-      l1_regularization_strength=0.001,
-      l2_regularization_strength=0.001)
-  # To apply learning rate decay, you can set dnn_optimizer to a callable:
-  lambda: tf.AdamOptimizer(
-      learning_rate=tf.exponential_decay(
-          learning_rate=0.1,
-          global_step=tf.get_global_step(),
-          decay_steps=10000,
-          decay_rate=0.96)
-  # It is the same for linear_optimizer.
-
-  # Input builders
-  def input_fn_train: # returns x, y
-    pass
-  estimator.train(input_fn=input_fn_train, steps=100)
-
-  def input_fn_eval: # returns x, y
-    pass
-  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
-  def input_fn_predict: # returns x, None
-    pass
-  predictions = estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-  otherwise there will be a `KeyError`:
-
-  * for each `column` in `dnn_feature_columns` + `linear_feature_columns`:
-    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
-      with `key` the id column name, the second with `key` the weight column
-      name. Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss is calculated by using softmax cross entropy.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               model_dir=None,
-               linear_feature_columns=None,
-               linear_optimizer='Ftrl',
-               dnn_feature_columns=None,
-               dnn_optimizer='Adagrad',
-               dnn_hidden_units=None,
-               dnn_activation_fn=nn.relu,
-               dnn_dropout=None,
-               n_classes=2,
-               weight_column=None,
-               label_vocabulary=None,
-               input_layer_partitioner=None,
-               config=None,
-               warm_start_from=None,
-               loss_reduction=losses.Reduction.SUM,
-               batch_norm=False,
-               linear_sparse_combiner='sum'):
-    """Initializes a DNNLinearCombinedClassifier instance.
-
-    Args:
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
-        to continue training a previously saved model.
-      linear_feature_columns: An iterable containing all the feature columns
-        used by linear part of the model. All items in the set must be
-        instances of classes derived from `FeatureColumn`.
-      linear_optimizer: An instance of `tf.Optimizer` used to apply gradients to
-        the linear part of the model. Can also be a string (one of 'Adagrad',
-        'Adam', 'Ftrl', 'RMSProp', 'SGD'), or callable. Defaults to FTRL
-        optimizer.
-      dnn_feature_columns: An iterable containing all the feature columns used
-        by deep part of the model. All items in the set must be instances of
-        classes derived from `FeatureColumn`.
-      dnn_optimizer: An instance of `tf.Optimizer` used to apply gradients to
-        the deep part of the model. Can also be a string (one of 'Adagrad',
-        'Adam', 'Ftrl', 'RMSProp', 'SGD'), or callable. Defaults to Adagrad
-        optimizer.
-      dnn_hidden_units: List of hidden units per layer. All layers are fully
-        connected.
-      dnn_activation_fn: Activation function applied to each layer. If None,
-        will use `tf.nn.relu`.
-      dnn_dropout: When not None, the probability we will drop out
-        a given coordinate.
-      n_classes: Number of label classes. Defaults to 2, namely binary
-        classification. Must be > 1.
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to down weight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      label_vocabulary: A list of strings represents possible label values. If
-        given, labels must be string type and have any value in
-        `label_vocabulary`. If it is not given, that means labels are
-        already encoded as integer or float within [0, 1] for `n_classes=2` and
-        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
-        Also there will be errors if vocabulary is not provided and labels are
-        string.
-      input_layer_partitioner: Partitioner for input layer. Defaults to
-        `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-      config: RunConfig object to configure the runtime settings.
-      warm_start_from: A string filepath to a checkpoint to warm-start from, or
-        a `WarmStartSettings` object to fully configure warm-starting.  If the
-        string filepath is provided instead of a `WarmStartSettings`, then all
-        weights are warm-started, and it is assumed that vocabularies and Tensor
-        names are unchanged.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM`.
-      batch_norm: Whether to use batch normalization after each hidden layer.
-      linear_sparse_combiner: A string specifying how to reduce the linear model
-        if a categorical column is multivalent.  One of "mean", "sqrtn", and
-        "sum" -- these are effectively different ways to do example-level
-        normalization, which can be useful for bag-of-words features.  For more
-        details, see `tf.feature_column.linear_model`.
-
-    Raises:
-      ValueError: If both linear_feature_columns and dnn_features_columns are
-        empty at the same time.
-    """
-    linear_feature_columns = linear_feature_columns or []
-    dnn_feature_columns = dnn_feature_columns or []
-    self._feature_columns = (
-        list(linear_feature_columns) + list(dnn_feature_columns))
-    if not self._feature_columns:
-      raise ValueError('Either linear_feature_columns or dnn_feature_columns '
-                       'must be defined.')
-    if n_classes == 2:
-      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
-          weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-    else:
-      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
-          n_classes,
-          weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-
-    def _model_fn(features, labels, mode, config):
-      """Call the _dnn_linear_combined_model_fn."""
-      return _dnn_linear_combined_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          linear_feature_columns=linear_feature_columns,
-          linear_optimizer=linear_optimizer,
-          dnn_feature_columns=dnn_feature_columns,
-          dnn_optimizer=dnn_optimizer,
-          dnn_hidden_units=dnn_hidden_units,
-          dnn_activation_fn=dnn_activation_fn,
-          dnn_dropout=dnn_dropout,
-          input_layer_partitioner=input_layer_partitioner,
-          config=config,
-          batch_norm=batch_norm,
-          linear_sparse_combiner=linear_sparse_combiner)
-
-    super(DNNLinearCombinedClassifier, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config,
-        warm_start_from=warm_start_from)
-
-
-@estimator_export('estimator.DNNLinearCombinedRegressor')
-class DNNLinearCombinedRegressor(estimator.Estimator):
-  """An estimator for TensorFlow Linear and DNN joined models for regression.
-
-  Note: This estimator is also known as wide-n-deep.
-
-  Example:
-
-  ```python
-  numeric_feature = numeric_column(...)
-  categorical_column_a = categorical_column_with_hash_bucket(...)
-  categorical_column_b = categorical_column_with_hash_bucket(...)
-
-  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
-  categorical_feature_a_emb = embedding_column(
-      categorical_column=categorical_feature_a, ...)
-  categorical_feature_b_emb = embedding_column(
-      categorical_column=categorical_feature_b, ...)
-
-  estimator = DNNLinearCombinedRegressor(
-      # wide settings
-      linear_feature_columns=[categorical_feature_a_x_categorical_feature_b],
-      linear_optimizer=tf.train.FtrlOptimizer(...),
-      # deep settings
-      dnn_feature_columns=[
-          categorical_feature_a_emb, categorical_feature_b_emb,
-          numeric_feature],
-      dnn_hidden_units=[1000, 500, 100],
-      dnn_optimizer=tf.train.ProximalAdagradOptimizer(...),
-      # warm-start settings
-      warm_start_from="/path/to/checkpoint/dir")
-
-  # To apply L1 and L2 regularization, you can set dnn_optimizer to:
-  tf.train.ProximalAdagradOptimizer(
-      learning_rate=0.1,
-      l1_regularization_strength=0.001,
-      l2_regularization_strength=0.001)
-  # To apply learning rate decay, you can set dnn_optimizer to a callable:
-  lambda: tf.AdamOptimizer(
-      learning_rate=tf.exponential_decay(
-          learning_rate=0.1,
-          global_step=tf.get_global_step(),
-          decay_steps=10000,
-          decay_rate=0.96)
-  # It is the same for linear_optimizer.
-
-  # Input builders
-  def input_fn_train: # returns x, y
-    pass
-  estimator.train(input_fn=input_fn_train, steps=100)
-
-  def input_fn_eval: # returns x, y
-    pass
-  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
-  def input_fn_predict: # returns x, None
-    pass
-  predictions = estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-  otherwise there will be a `KeyError`:
-
-  * for each `column` in `dnn_feature_columns` + `linear_feature_columns`:
-    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
-      with `key` the id column name, the second with `key` the weight column
-      name. Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss is calculated by using mean squared error.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               model_dir=None,
-               linear_feature_columns=None,
-               linear_optimizer='Ftrl',
-               dnn_feature_columns=None,
-               dnn_optimizer='Adagrad',
-               dnn_hidden_units=None,
-               dnn_activation_fn=nn.relu,
-               dnn_dropout=None,
-               label_dimension=1,
-               weight_column=None,
-               input_layer_partitioner=None,
-               config=None,
-               warm_start_from=None,
-               loss_reduction=losses.Reduction.SUM,
-               batch_norm=False,
-               linear_sparse_combiner='sum'):
-    """Initializes a DNNLinearCombinedRegressor instance.
-
-    Args:
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
-        to continue training a previously saved model.
-      linear_feature_columns: An iterable containing all the feature columns
-        used by linear part of the model. All items in the set must be
-        instances of classes derived from `FeatureColumn`.
-      linear_optimizer: An instance of `tf.Optimizer` used to apply gradients to
-        the linear part of the model. Can also be a string (one of 'Adagrad',
-        'Adam', 'Ftrl', 'RMSProp', 'SGD'), or callable. Defaults to FTRL
-        optimizer.
-      dnn_feature_columns: An iterable containing all the feature columns used
-        by deep part of the model. All items in the set must be instances of
-        classes derived from `FeatureColumn`.
-      dnn_optimizer: An instance of `tf.Optimizer` used to apply gradients to
-        the deep part of the model. Can also be a string (one of 'Adagrad',
-        'Adam', 'Ftrl', 'RMSProp', 'SGD'), or callable. Defaults to Adagrad
-        optimizer.
-      dnn_hidden_units: List of hidden units per layer. All layers are fully
-        connected.
-      dnn_activation_fn: Activation function applied to each layer. If None,
-        will use `tf.nn.relu`.
-      dnn_dropout: When not None, the probability we will drop out
-        a given coordinate.
-      label_dimension: Number of regression targets per example. This is the
-        size of the last dimension of the labels and logits `Tensor` objects
-        (typically, these have shape `[batch_size, label_dimension]`).
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to down weight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      input_layer_partitioner: Partitioner for input layer. Defaults to
-        `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
-      config: RunConfig object to configure the runtime settings.
-      warm_start_from: A string filepath to a checkpoint to warm-start from, or
-        a `WarmStartSettings` object to fully configure warm-starting.  If the
-        string filepath is provided instead of a `WarmStartSettings`, then all
-        weights are warm-started, and it is assumed that vocabularies and Tensor
-        names are unchanged.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM`.
-      batch_norm: Whether to use batch normalization after each hidden layer.
-      linear_sparse_combiner: A string specifying how to reduce the linear model
-        if a categorical column is multivalent.  One of "mean", "sqrtn", and
-        "sum" -- these are effectively different ways to do example-level
-        normalization, which can be useful for bag-of-words features.  For more
-        details, see `tf.feature_column.linear_model`.
-
-    Raises:
-      ValueError: If both linear_feature_columns and dnn_features_columns are
-        empty at the same time.
-    """
-    linear_feature_columns = linear_feature_columns or []
-    dnn_feature_columns = dnn_feature_columns or []
-    self._feature_columns = (
-        list(linear_feature_columns) + list(dnn_feature_columns))
-    if not self._feature_columns:
-      raise ValueError('Either linear_feature_columns or dnn_feature_columns '
-                       'must be defined.')
+from tensorflow_estimator.python.estimator.canned import dnn_linear_combined
 
-    def _model_fn(features, labels, mode, config):
-      """Call the _dnn_linear_combined_model_fn."""
-      return _dnn_linear_combined_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head_lib._regression_head(  # pylint: disable=protected-access
-              label_dimension=label_dimension, weight_column=weight_column,
-              loss_reduction=loss_reduction),
-          linear_feature_columns=linear_feature_columns,
-          linear_optimizer=linear_optimizer,
-          dnn_feature_columns=dnn_feature_columns,
-          dnn_optimizer=dnn_optimizer,
-          dnn_hidden_units=dnn_hidden_units,
-          dnn_activation_fn=dnn_activation_fn,
-          dnn_dropout=dnn_dropout,
-          input_layer_partitioner=input_layer_partitioner,
-          config=config,
-          batch_norm=batch_norm,
-          linear_sparse_combiner=linear_sparse_combiner)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+dnn_linear_combined.__all__ = [
+    s for s in dir(dnn_linear_combined) if not s.startswith('__')
+]
 
-    super(DNNLinearCombinedRegressor, self).__init__(
-        model_fn=_model_fn, model_dir=model_dir, config=config,
-        warm_start_from=warm_start_from)
+from tensorflow_estimator.python.estimator.canned.dnn_linear_combined import *
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py b/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
deleted file mode 100644
index ae968e717a4c9acba4971b61326c15475f1dfcdb..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/canned/dnn_linear_combined_test.py
+++ /dev/null
@@ -1,1050 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for dnn_linear_combined.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import shutil
-import tempfile
-
-from absl.testing import parameterized
-import numpy as np
-import six
-
-from tensorflow.core.example import example_pb2
-from tensorflow.core.example import feature_pb2
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import dnn_linear_combined
-from tensorflow.python.estimator.canned import dnn_testing_utils
-from tensorflow.python.estimator.canned import linear_testing_utils
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.estimator.inputs import pandas_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.feature_column import feature_column_v2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import gradient_descent
-from tensorflow.python.training import input as input_lib
-from tensorflow.python.training import optimizer as optimizer_lib
-
-
-try:
-  # pylint: disable=g-import-not-at-top
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
-
-
-class DNNOnlyModelFnTest(dnn_testing_utils.BaseDNNModelFnTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNModelFnTest.__init__(self, self._dnn_only_model_fn)
-
-  def _dnn_only_model_fn(self,
-                         features,
-                         labels,
-                         mode,
-                         head,
-                         hidden_units,
-                         feature_columns,
-                         optimizer='Adagrad',
-                         activation_fn=nn.relu,
-                         dropout=None,
-                         input_layer_partitioner=None,
-                         config=None):
-    return dnn_linear_combined._dnn_linear_combined_model_fn(
-        features=features,
-        labels=labels,
-        mode=mode,
-        head=head,
-        linear_feature_columns=[],
-        dnn_hidden_units=hidden_units,
-        dnn_feature_columns=feature_columns,
-        dnn_optimizer=optimizer,
-        dnn_activation_fn=activation_fn,
-        dnn_dropout=dropout,
-        input_layer_partitioner=input_layer_partitioner,
-        config=config)
-
-
-# A function to mimic linear-regressor init reuse same tests.
-def _linear_regressor_fn(feature_columns,
-                         model_dir=None,
-                         label_dimension=1,
-                         weight_column=None,
-                         optimizer='Ftrl',
-                         config=None,
-                         partitioner=None,
-                         sparse_combiner='sum'):
-  return dnn_linear_combined.DNNLinearCombinedRegressor(
-      model_dir=model_dir,
-      linear_feature_columns=feature_columns,
-      linear_optimizer=optimizer,
-      label_dimension=label_dimension,
-      weight_column=weight_column,
-      input_layer_partitioner=partitioner,
-      config=config,
-      linear_sparse_combiner=sparse_combiner)
-
-
-class LinearOnlyRegressorPartitionerTest(
-    linear_testing_utils.BaseLinearRegressorPartitionerTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPartitionerTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearOnlyRegressorPartitionerV2Test(
-    linear_testing_utils.BaseLinearRegressorPartitionerTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPartitionerTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-class LinearOnlyRegressorEvaluationTest(
-    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearOnlyRegressorEvaluationV2Test(
-    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-class LinearOnlyRegressorPredictTest(
-    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearOnlyRegressorPredictV2Test(
-    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-class LinearOnlyRegressorIntegrationTest(
-    linear_testing_utils.BaseLinearRegressorIntegrationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorIntegrationTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearOnlyRegressorIntegrationV2Test(
-    linear_testing_utils.BaseLinearRegressorIntegrationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorIntegrationTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-class LinearOnlyRegressorTrainingTest(
-    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearOnlyRegressorTrainingV2Test(
-    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-def _linear_classifier_fn(feature_columns,
-                          model_dir=None,
-                          n_classes=2,
-                          weight_column=None,
-                          label_vocabulary=None,
-                          optimizer='Ftrl',
-                          config=None,
-                          partitioner=None,
-                          sparse_combiner='sum'):
-  return dnn_linear_combined.DNNLinearCombinedClassifier(
-      model_dir=model_dir,
-      linear_feature_columns=feature_columns,
-      linear_optimizer=optimizer,
-      n_classes=n_classes,
-      weight_column=weight_column,
-      label_vocabulary=label_vocabulary,
-      input_layer_partitioner=partitioner,
-      config=config,
-      linear_sparse_combiner=sparse_combiner)
-
-
-class LinearOnlyClassifierTrainingTest(
-    linear_testing_utils.BaseLinearClassifierTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierTrainingTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
-
-
-class LinearOnlyClassifierTrainingV2Test(
-    linear_testing_utils.BaseLinearClassifierTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierTrainingTest.__init__(
-        self,
-        linear_classifier_fn=_linear_classifier_fn,
-        fc_lib=feature_column_v2)
-
-
-class LinearOnlyClassifierClassesEvaluationTest(
-    linear_testing_utils.BaseLinearClassifierEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierEvaluationTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
-
-
-class LinearOnlyClassifierClassesEvaluationV2Test(
-    linear_testing_utils.BaseLinearClassifierEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierEvaluationTest.__init__(
-        self,
-        linear_classifier_fn=_linear_classifier_fn,
-        fc_lib=feature_column_v2)
-
-
-class LinearOnlyClassifierPredictTest(
-    linear_testing_utils.BaseLinearClassifierPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierPredictTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
-
-
-class LinearOnlyClassifierPredictV2Test(
-    linear_testing_utils.BaseLinearClassifierPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierPredictTest.__init__(
-        self,
-        linear_classifier_fn=_linear_classifier_fn,
-        fc_lib=feature_column_v2)
-
-
-class LinearOnlyClassifierIntegrationTest(
-    linear_testing_utils.BaseLinearClassifierIntegrationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierIntegrationTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
-
-
-class LinearOnlyClassifierIntegrationV2Test(
-    linear_testing_utils.BaseLinearClassifierIntegrationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierIntegrationTest.__init__(
-        self,
-        linear_classifier_fn=_linear_classifier_fn,
-        fc_lib=feature_column_v2)
-
-
-@parameterized.parameters((feature_column,), (feature_column_v2,))
-class DNNLinearCombinedRegressorIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
-                          input_dimension, label_dimension, batch_size,
-                          fc_impl):
-    linear_feature_columns = [
-        fc_impl.numeric_column('x', shape=(input_dimension,))
-    ]
-    dnn_feature_columns = [
-        fc_impl.numeric_column('x', shape=(input_dimension,))
-    ]
-    feature_columns = linear_feature_columns + dnn_feature_columns
-    est = dnn_linear_combined.DNNLinearCombinedRegressor(
-        linear_feature_columns=linear_feature_columns,
-        dnn_hidden_units=(2, 2),
-        dnn_feature_columns=dnn_feature_columns,
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array([
-        x[prediction_keys.PredictionKeys.PREDICTIONS]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
-
-    # EXPORT
-    feature_spec = fc_impl.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self, fc_impl):
-    """Tests complete flow with numpy_input_fn."""
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-  def test_pandas_input_fn(self, fc_impl):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-    label_dimension = 1
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size, dtype=np.float32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(data)
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        y=y,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        y=y,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-  def test_input_fn_from_parse_example(self, fc_impl):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-
-    serialized_examples = []
-    for datum in data:
-      example = example_pb2.Example(features=feature_pb2.Features(
-          feature={
-              'x': feature_pb2.Feature(
-                  float_list=feature_pb2.FloatList(value=datum)),
-              'y': feature_pb2.Feature(
-                  float_list=feature_pb2.FloatList(value=datum)),
-          }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
-    }
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = linear_testing_utils.queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = linear_testing_utils.queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = linear_testing_utils.queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-
-# A function to mimic dnn-classifier init reuse same tests.
-def _dnn_classifier_fn(hidden_units,
-                       feature_columns,
-                       model_dir=None,
-                       n_classes=2,
-                       weight_column=None,
-                       label_vocabulary=None,
-                       optimizer='Adagrad',
-                       config=None,
-                       input_layer_partitioner=None):
-  return dnn_linear_combined.DNNLinearCombinedClassifier(
-      model_dir=model_dir,
-      dnn_hidden_units=hidden_units,
-      dnn_feature_columns=feature_columns,
-      dnn_optimizer=optimizer,
-      n_classes=n_classes,
-      weight_column=weight_column,
-      label_vocabulary=label_vocabulary,
-      input_layer_partitioner=input_layer_partitioner,
-      config=config)
-
-
-class DNNOnlyClassifierEvaluateTest(
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column)
-
-
-class DNNOnlyClassifierEvaluateV2Test(
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column_v2)
-
-
-class DNNOnlyClassifierPredictTest(
-    dnn_testing_utils.BaseDNNClassifierPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierPredictTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column)
-
-
-class DNNOnlyClassifierPredictV2Test(
-    dnn_testing_utils.BaseDNNClassifierPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierPredictTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column_v2)
-
-
-class DNNOnlyClassifierTrainTest(
-    dnn_testing_utils.BaseDNNClassifierTrainTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierTrainTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column)
-
-
-class DNNOnlyClassifierTrainV2Test(dnn_testing_utils.BaseDNNClassifierTrainTest,
-                                   test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierTrainTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column_v2)
-
-
-# A function to mimic dnn-regressor init reuse same tests.
-def _dnn_regressor_fn(hidden_units,
-                      feature_columns,
-                      model_dir=None,
-                      label_dimension=1,
-                      weight_column=None,
-                      optimizer='Adagrad',
-                      config=None,
-                      input_layer_partitioner=None):
-  return dnn_linear_combined.DNNLinearCombinedRegressor(
-      model_dir=model_dir,
-      dnn_hidden_units=hidden_units,
-      dnn_feature_columns=feature_columns,
-      dnn_optimizer=optimizer,
-      label_dimension=label_dimension,
-      weight_column=weight_column,
-      input_layer_partitioner=input_layer_partitioner,
-      config=config)
-
-
-class DNNOnlyRegressorEvaluateTest(
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column)
-
-
-class DNNOnlyRegressorEvaluateV2Test(
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column_v2)
-
-
-class DNNOnlyRegressorPredictTest(
-    dnn_testing_utils.BaseDNNRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column)
-
-
-class DNNOnlyRegressorPredictV2Test(
-    dnn_testing_utils.BaseDNNRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column_v2)
-
-
-class DNNOnlyRegressorTrainTest(
-    dnn_testing_utils.BaseDNNRegressorTrainTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column)
-
-
-class DNNOnlyRegressorTrainV2Test(dnn_testing_utils.BaseDNNRegressorTrainTest,
-                                  test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column_v2)
-
-
-@parameterized.parameters((feature_column,), (feature_column_v2,))
-class DNNLinearCombinedClassifierIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _as_label(self, data_in_float):
-    return np.rint(data_in_float).astype(np.int64)
-
-  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
-                          input_dimension, n_classes, batch_size, fc_impl):
-    linear_feature_columns = [
-        fc_impl.numeric_column('x', shape=(input_dimension,))
-    ]
-    dnn_feature_columns = [
-        fc_impl.numeric_column('x', shape=(input_dimension,))
-    ]
-    feature_columns = linear_feature_columns + dnn_feature_columns
-    est = dnn_linear_combined.DNNLinearCombinedClassifier(
-        linear_feature_columns=linear_feature_columns,
-        dnn_hidden_units=(2, 2),
-        dnn_feature_columns=dnn_feature_columns,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predicted_proba = np.array([
-        x[prediction_keys.PredictionKeys.PROBABILITIES]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
-
-    # EXPORT
-    feature_spec = fc_impl.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self, fc_impl):
-    """Tests complete flow with numpy_input_fn."""
-    n_classes = 3
-    input_dimension = 2
-    batch_size = 10
-    data = np.linspace(
-        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
-    x_data = data.reshape(batch_size, input_dimension)
-    y_data = self._as_label(np.reshape(data[:batch_size], (batch_size, 1)))
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data},
-        y=y_data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data},
-        y=y_data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        n_classes=n_classes,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-  def test_pandas_input_fn(self, fc_impl):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-    input_dimension = 1
-    n_classes = 2
-    batch_size = 10
-    data = np.linspace(0., n_classes - 1., batch_size, dtype=np.float32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(self._as_label(data))
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        y=y,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        y=y,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        n_classes=n_classes,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-  def test_input_fn_from_parse_example(self, fc_impl):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    input_dimension = 2
-    n_classes = 3
-    batch_size = 10
-    data = np.linspace(0., n_classes-1., batch_size * input_dimension,
-                       dtype=np.float32)
-    data = data.reshape(batch_size, input_dimension)
-
-    serialized_examples = []
-    for datum in data:
-      example = example_pb2.Example(features=feature_pb2.Features(
-          feature={
-              'x':
-                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                      value=datum)),
-              'y':
-                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                      value=self._as_label(datum[:1]))),
-          }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([1], dtypes.int64),
-    }
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = linear_testing_utils.queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = linear_testing_utils.queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = linear_testing_utils.queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=input_dimension,
-        n_classes=n_classes,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-
-@parameterized.parameters((feature_column,), (feature_column_v2,))
-class DNNLinearCombinedTests(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _mock_optimizer(self, real_optimizer, var_name_prefix):
-    """Verifies global_step is None and var_names start with given prefix."""
-
-    def _minimize(loss, global_step=None, var_list=None):
-      self.assertIsNone(global_step)
-      trainable_vars = var_list or ops.get_collection(
-          ops.GraphKeys.TRAINABLE_VARIABLES)
-      var_names = [var.name for var in trainable_vars]
-      self.assertTrue(
-          all([name.startswith(var_name_prefix) for name in var_names]))
-      # var is used to check this op called by training.
-      with ops.name_scope(''):
-        var = variables_lib.Variable(0., name=(var_name_prefix + '_called'))
-      with ops.control_dependencies([var.assign(100.)]):
-        return real_optimizer.minimize(loss, global_step, var_list)
-
-    optimizer_mock = test.mock.NonCallableMagicMock(
-        spec=optimizer_lib.Optimizer, wraps=real_optimizer)
-    optimizer_mock.minimize = test.mock.MagicMock(wraps=_minimize)
-
-    return optimizer_mock
-
-  def test_train_op_calls_both_dnn_and_linear(self, fc_impl):
-    opt = gradient_descent.GradientDescentOptimizer(1.)
-    x_column = fc_impl.numeric_column('x')
-    input_fn = numpy_io.numpy_input_fn(
-        x={'x': np.array([[0.], [1.]])},
-        y=np.array([[0.], [1.]]),
-        batch_size=1,
-        shuffle=False)
-    est = dnn_linear_combined.DNNLinearCombinedClassifier(
-        linear_feature_columns=[x_column],
-        # verifies linear_optimizer is used only for linear part.
-        linear_optimizer=self._mock_optimizer(opt, 'linear'),
-        dnn_hidden_units=(2, 2),
-        dnn_feature_columns=[x_column],
-        # verifies dnn_optimizer is used only for linear part.
-        dnn_optimizer=self._mock_optimizer(opt, 'dnn'),
-        model_dir=self._model_dir)
-    est.train(input_fn, steps=1)
-    # verifies train_op fires linear minimize op
-    self.assertEqual(100.,
-                     checkpoint_utils.load_variable(
-                         self._model_dir, 'linear_called'))
-    # verifies train_op fires dnn minimize op
-    self.assertEqual(100.,
-                     checkpoint_utils.load_variable(
-                         self._model_dir, 'dnn_called'))
-
-  def test_dnn_and_linear_logits_are_added(self, fc_impl):
-    with ops.Graph().as_default():
-      variables_lib.Variable([[1.0]], name='linear/linear_model/x/weights')
-      variables_lib.Variable([2.0], name='linear/linear_model/bias_weights')
-      variables_lib.Variable([[3.0]], name='dnn/hiddenlayer_0/kernel')
-      variables_lib.Variable([4.0], name='dnn/hiddenlayer_0/bias')
-      variables_lib.Variable([[5.0]], name='dnn/logits/kernel')
-      variables_lib.Variable([6.0], name='dnn/logits/bias')
-      variables_lib.Variable(1, name='global_step', dtype=dtypes.int64)
-      linear_testing_utils.save_variables_to_ckpt(self._model_dir)
-
-    x_column = fc_impl.numeric_column('x')
-    est = dnn_linear_combined.DNNLinearCombinedRegressor(
-        linear_feature_columns=[x_column],
-        dnn_hidden_units=[1],
-        dnn_feature_columns=[x_column],
-        model_dir=self._model_dir)
-    input_fn = numpy_io.numpy_input_fn(
-        x={'x': np.array([[10.]])}, batch_size=1, shuffle=False)
-    # linear logits = 10*1 + 2 = 12
-    # dnn logits = (10*3 + 4)*5 + 6 = 176
-    # logits = dnn + linear = 176 + 12 = 188
-    self.assertAllClose(
-        {
-            prediction_keys.PredictionKeys.PREDICTIONS: [188.],
-        },
-        next(est.predict(input_fn=input_fn)))
-
-
-@parameterized.parameters((feature_column,), (feature_column_v2,))
-class DNNLinearCombinedWarmStartingTest(test.TestCase):
-
-  def setUp(self):
-    # Create a directory to save our old checkpoint and vocabularies to.
-    self._ckpt_and_vocab_dir = tempfile.mkdtemp()
-
-    # Make a dummy input_fn.
-    def _input_fn():
-      features = {
-          'age': [[23.], [31.]],
-          'city': [['Palo Alto'], ['Mountain View']],
-      }
-      return features, [0, 1]
-
-    self._input_fn = _input_fn
-
-  def tearDown(self):
-    # Clean up checkpoint / vocab dir.
-    writer_cache.FileWriterCache.clear()
-    shutil.rmtree(self._ckpt_and_vocab_dir)
-
-  def test_classifier_basic_warm_starting(self, fc_impl):
-    """Tests correctness of DNNLinearCombinedClassifier default warm-start."""
-    age = fc_impl.numeric_column('age')
-    city = fc_impl.embedding_column(
-        fc_impl.categorical_column_with_vocabulary_list(
-            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
-        dimension=5)
-
-    # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
-    dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
-        linear_feature_columns=[age],
-        dnn_feature_columns=[city],
-        dnn_hidden_units=[256, 128],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        linear_optimizer='SGD',
-        dnn_optimizer='SGD')
-    dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second DNNLinearCombinedClassifier, warm-started from the first.
-    # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
-    # have accumulator values that change).
-    warm_started_dnn_lc_classifier = (
-        dnn_linear_combined.DNNLinearCombinedClassifier(
-            linear_feature_columns=[age],
-            dnn_feature_columns=[city],
-            dnn_hidden_units=[256, 128],
-            n_classes=4,
-            linear_optimizer=gradient_descent.GradientDescentOptimizer(
-                learning_rate=0.0),
-            dnn_optimizer=gradient_descent.GradientDescentOptimizer(
-                learning_rate=0.0),
-            warm_start_from=dnn_lc_classifier.model_dir))
-
-    warm_started_dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
-    for variable_name in warm_started_dnn_lc_classifier.get_variable_names():
-      self.assertAllClose(
-          dnn_lc_classifier.get_variable_value(variable_name),
-          warm_started_dnn_lc_classifier.get_variable_value(variable_name))
-
-  def test_regressor_basic_warm_starting(self, fc_impl):
-    """Tests correctness of DNNLinearCombinedRegressor default warm-start."""
-    age = fc_impl.numeric_column('age')
-    city = fc_impl.embedding_column(
-        fc_impl.categorical_column_with_vocabulary_list(
-            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
-        dimension=5)
-
-    # Create a DNNLinearCombinedRegressor and train to save a checkpoint.
-    dnn_lc_regressor = dnn_linear_combined.DNNLinearCombinedRegressor(
-        linear_feature_columns=[age],
-        dnn_feature_columns=[city],
-        dnn_hidden_units=[256, 128],
-        model_dir=self._ckpt_and_vocab_dir,
-        linear_optimizer='SGD',
-        dnn_optimizer='SGD')
-    dnn_lc_regressor.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second DNNLinearCombinedRegressor, warm-started from the first.
-    # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
-    # have accumulator values that change).
-    warm_started_dnn_lc_regressor = (
-        dnn_linear_combined.DNNLinearCombinedRegressor(
-            linear_feature_columns=[age],
-            dnn_feature_columns=[city],
-            dnn_hidden_units=[256, 128],
-            linear_optimizer=gradient_descent.GradientDescentOptimizer(
-                learning_rate=0.0),
-            dnn_optimizer=gradient_descent.GradientDescentOptimizer(
-                learning_rate=0.0),
-            warm_start_from=dnn_lc_regressor.model_dir))
-
-    warm_started_dnn_lc_regressor.train(input_fn=self._input_fn, max_steps=1)
-    for variable_name in warm_started_dnn_lc_regressor.get_variable_names():
-      self.assertAllClose(
-          dnn_lc_regressor.get_variable_value(variable_name),
-          warm_started_dnn_lc_regressor.get_variable_value(variable_name))
-
-  def test_warm_starting_selective_variables(self, fc_impl):
-    """Tests selecting variables to warm-start."""
-    age = fc_impl.numeric_column('age')
-    city = fc_impl.embedding_column(
-        fc_impl.categorical_column_with_vocabulary_list(
-            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
-        dimension=5)
-
-    # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
-    dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
-        linear_feature_columns=[age],
-        dnn_feature_columns=[city],
-        dnn_hidden_units=[256, 128],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        linear_optimizer='SGD',
-        dnn_optimizer='SGD')
-    dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second DNNLinearCombinedClassifier, warm-started from the first.
-    # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
-    # have accumulator values that change).
-    warm_started_dnn_lc_classifier = (
-        dnn_linear_combined.DNNLinearCombinedClassifier(
-            linear_feature_columns=[age],
-            dnn_feature_columns=[city],
-            dnn_hidden_units=[256, 128],
-            n_classes=4,
-            linear_optimizer=gradient_descent.GradientDescentOptimizer(
-                learning_rate=0.0),
-            dnn_optimizer=gradient_descent.GradientDescentOptimizer(
-                learning_rate=0.0),
-            # The provided regular expression will only warm-start the deep
-            # portion of the model.
-            warm_start_from=estimator.WarmStartSettings(
-                ckpt_to_initialize_from=dnn_lc_classifier.model_dir,
-                vars_to_warm_start='.*(dnn).*')))
-
-    warm_started_dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)
-    for variable_name in warm_started_dnn_lc_classifier.get_variable_names():
-      if 'dnn' in variable_name:
-        self.assertAllClose(
-            dnn_lc_classifier.get_variable_value(variable_name),
-            warm_started_dnn_lc_classifier.get_variable_value(variable_name))
-      elif 'linear' in variable_name:
-        linear_values = warm_started_dnn_lc_classifier.get_variable_value(
-            variable_name)
-        # Since they're not warm-started, the linear weights will be
-        # zero-initialized.
-        self.assertAllClose(np.zeros_like(linear_values), linear_values)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/canned/dnn_test.py b/tensorflow/python/estimator/canned/dnn_test.py
deleted file mode 100644
index 756696cea08944503d7a66bfbc1577ff7839d07e..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/canned/dnn_test.py
+++ /dev/null
@@ -1,580 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for dnn.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import shutil
-import tempfile
-
-from absl.testing import parameterized
-import numpy as np
-import six
-
-from tensorflow.core.example import example_pb2
-from tensorflow.core.example import feature_pb2
-from tensorflow.python.estimator.canned import dnn
-from tensorflow.python.estimator.canned import dnn_testing_utils
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.estimator.inputs import pandas_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.feature_column import feature_column_v2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import input as input_lib
-from tensorflow.python.training import queue_runner
-
-try:
-  # pylint: disable=g-import-not-at-top
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
-
-
-def _dnn_classifier_fn(*args, **kwargs):
-  return dnn.DNNClassifier(*args, **kwargs)
-
-
-class DNNModelFnTest(dnn_testing_utils.BaseDNNModelFnTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNModelFnTest.__init__(
-        self, dnn._dnn_model_fn, fc_impl=feature_column)
-
-
-class DNNModelFnV2Test(dnn_testing_utils.BaseDNNModelFnTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNModelFnTest.__init__(
-        self, dnn._dnn_model_fn, fc_impl=feature_column_v2)
-
-
-class DNNLogitFnTest(dnn_testing_utils.BaseDNNLogitFnTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNLogitFnTest.__init__(
-        self, dnn._dnn_logit_fn_builder, fc_impl=feature_column)
-
-
-class DNNLogitFnV2Test(dnn_testing_utils.BaseDNNLogitFnTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNLogitFnTest.__init__(
-        self, dnn._dnn_logit_fn_builder, fc_impl=feature_column_v2)
-
-
-class DNNWarmStartingTest(dnn_testing_utils.BaseDNNWarmStartingTest,
-                          test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNWarmStartingTest.__init__(
-        self, _dnn_classifier_fn, _dnn_regressor_fn, fc_impl=feature_column)
-
-
-class DNNWarmStartingV2Test(dnn_testing_utils.BaseDNNWarmStartingTest,
-                            test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNWarmStartingTest.__init__(
-        self, _dnn_classifier_fn, _dnn_regressor_fn, fc_impl=feature_column_v2)
-
-
-class DNNClassifierEvaluateTest(
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column)
-
-
-class DNNClassifierEvaluateV2Test(
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierEvaluateTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column_v2)
-
-
-class DNNClassifierPredictTest(
-    dnn_testing_utils.BaseDNNClassifierPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierPredictTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column)
-
-
-class DNNClassifierPredictV2Test(dnn_testing_utils.BaseDNNClassifierPredictTest,
-                                 test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierPredictTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column_v2)
-
-
-class DNNClassifierTrainTest(
-    dnn_testing_utils.BaseDNNClassifierTrainTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierTrainTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column)
-
-
-class DNNClassifierTrainV2Test(dnn_testing_utils.BaseDNNClassifierTrainTest,
-                               test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNClassifierTrainTest.__init__(
-        self, _dnn_classifier_fn, fc_impl=feature_column_v2)
-
-
-def _dnn_regressor_fn(*args, **kwargs):
-  return dnn.DNNRegressor(*args, **kwargs)
-
-
-class DNNRegressorEvaluateTest(
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column)
-
-
-class DNNRegressorEvaluateV2Test(dnn_testing_utils.BaseDNNRegressorEvaluateTest,
-                                 test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column_v2)
-
-
-class DNNRegressorPredictTest(
-    dnn_testing_utils.BaseDNNRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column)
-
-
-class DNNRegressorPredictV2Test(dnn_testing_utils.BaseDNNRegressorPredictTest,
-                                test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column_v2)
-
-
-class DNNRegressorTrainTest(
-    dnn_testing_utils.BaseDNNRegressorTrainTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column)
-
-
-class DNNRegressorTrainV2Test(dnn_testing_utils.BaseDNNRegressorTrainTest,
-                              test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
-        self, _dnn_regressor_fn, fc_impl=feature_column_v2)
-
-
-def _queue_parsed_features(feature_map):
-  tensors_to_enqueue = []
-  keys = []
-  for key, tensor in six.iteritems(feature_map):
-    keys.append(key)
-    tensors_to_enqueue.append(tensor)
-  queue_dtypes = [x.dtype for x in tensors_to_enqueue]
-  input_queue = data_flow_ops.FIFOQueue(capacity=100, dtypes=queue_dtypes)
-  queue_runner.add_queue_runner(
-      queue_runner.QueueRunner(
-          input_queue,
-          [input_queue.enqueue(tensors_to_enqueue)]))
-  dequeued_tensors = input_queue.dequeue()
-  return {keys[i]: dequeued_tensors[i] for i in range(len(dequeued_tensors))}
-
-
-@parameterized.parameters((feature_column,), (feature_column_v2,))
-class DNNRegressorIntegrationTest(test.TestCase, parameterized.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
-                          input_dimension, label_dimension, batch_size,
-                          fc_impl):
-    feature_columns = [fc_impl.numeric_column('x', shape=(input_dimension,))]
-
-    est = dnn.DNNRegressor(
-        hidden_units=(2, 2),
-        feature_columns=feature_columns,
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array([
-        x[prediction_keys.PredictionKeys.PREDICTIONS]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
-
-    # EXPORT
-    feature_spec = fc_impl.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self, fc_impl):
-    """Tests complete flow with numpy_input_fn."""
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-  def test_pandas_input_fn(self, fc_impl):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-    label_dimension = 1
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size, dtype=np.float32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(data)
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        y=y,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        y=y,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-  def test_input_fn_from_parse_example(self, fc_impl):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    label_dimension = 2
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-
-    serialized_examples = []
-    for datum in data:
-      example = example_pb2.Example(features=feature_pb2.Features(
-          feature={
-              'x': feature_pb2.Feature(
-                  float_list=feature_pb2.FloatList(value=datum)),
-              'y': feature_pb2.Feature(
-                  float_list=feature_pb2.FloatList(value=datum)),
-          }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
-    }
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = _queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = _queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = _queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=label_dimension,
-        label_dimension=label_dimension,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-
-@parameterized.parameters((feature_column,), (feature_column_v2,))
-class DNNClassifierIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _as_label(self, data_in_float):
-    return np.rint(data_in_float).astype(np.int64)
-
-  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
-                          input_dimension, n_classes, batch_size, fc_impl):
-    feature_columns = [fc_impl.numeric_column('x', shape=(input_dimension,))]
-
-    est = dnn.DNNClassifier(
-        hidden_units=(2, 2),
-        feature_columns=feature_columns,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    num_steps = 10
-    est.train(train_input_fn, steps=num_steps)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
-
-    # PREDICT
-    predicted_proba = np.array([
-        x[prediction_keys.PredictionKeys.PROBABILITIES]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
-
-    # EXPORT
-    feature_spec = fc_impl.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self, fc_impl):
-    """Tests complete flow with numpy_input_fn."""
-    n_classes = 3
-    input_dimension = 2
-    batch_size = 10
-    data = np.linspace(
-        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
-    x_data = data.reshape(batch_size, input_dimension)
-    y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data},
-        y=y_data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data},
-        y=y_data,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': x_data},
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        n_classes=n_classes,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-  def test_pandas_input_fn(self, fc_impl):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-    input_dimension = 1
-    n_classes = 3
-    batch_size = 10
-    data = np.linspace(0., n_classes - 1., batch_size, dtype=np.float32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(self._as_label(data))
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        y=y,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        y=y,
-        batch_size=batch_size,
-        shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x,
-        batch_size=batch_size,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        n_classes=n_classes,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-  def test_input_fn_from_parse_example(self, fc_impl):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    input_dimension = 2
-    n_classes = 3
-    batch_size = 10
-    data = np.linspace(
-        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, input_dimension)
-
-    serialized_examples = []
-    for datum in data:
-      example = example_pb2.Example(features=feature_pb2.Features(
-          feature={
-              'x':
-                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                      value=datum)),
-              'y':
-                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                      value=self._as_label(datum[:1]))),
-          }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([1], dtypes.int64),
-    }
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = _queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = _queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = _queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=input_dimension,
-        n_classes=n_classes,
-        batch_size=batch_size,
-        fc_impl=fc_impl)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index cd66d0a3bd32eb822ef193f2c22f5e467d50a679..f34b08fb36893bda3170f45897ace6cb2b4bd624 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,1954 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utils to be used in testing DNN estimators."""
+"""dnn_testing_utils python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.platform import test
-from tensorflow.python.summary import summary as summary_lib
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import gradient_descent
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.training import saver
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util
-
-# pylint rules which are disabled by default for test files.
-# pylint: disable=invalid-name,protected-access,missing-docstring
-
-# Names of variables created by model.
-LEARNING_RATE_NAME = 'dnn/regression_head/dnn/learning_rate'
-HIDDEN_WEIGHTS_NAME_PATTERN = 'dnn/hiddenlayer_%d/kernel'
-HIDDEN_BIASES_NAME_PATTERN = 'dnn/hiddenlayer_%d/bias'
-BATCH_NORM_BETA_NAME_PATTERN = 'dnn/hiddenlayer_%d/batchnorm_%d/beta'
-BATCH_NORM_GAMMA_NAME_PATTERN = 'dnn/hiddenlayer_%d/batchnorm_%d/gamma'
-BATCH_NORM_MEAN_NAME_PATTERN = 'dnn/hiddenlayer_%d/batchnorm_%d/moving_mean'
-BATCH_NORM_VARIANCE_NAME_PATTERN = (
-    'dnn/hiddenlayer_%d/batchnorm_%d/moving_variance')
-LOGITS_WEIGHTS_NAME = 'dnn/logits/kernel'
-LOGITS_BIASES_NAME = 'dnn/logits/bias'
-OCCUPATION_EMBEDDING_NAME = ('dnn/input_from_feature_columns/input_layer/'
-                             'occupation_embedding/embedding_weights')
-CITY_EMBEDDING_NAME = ('dnn/input_from_feature_columns/input_layer/'
-                       'city_embedding/embedding_weights')
-
-
-def assert_close(expected, actual, rtol=1e-04, message='', name='assert_close'):
-  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
-    expected = ops.convert_to_tensor(expected, name='expected')
-    actual = ops.convert_to_tensor(actual, name='actual')
-    rdiff = math_ops.abs((expected - actual) / expected, 'diff')
-    rtol = ops.convert_to_tensor(rtol, name='rtol')
-    return check_ops.assert_less(
-        rdiff,
-        rtol,
-        data=(message, 'Condition expected =~ actual did not hold element-wise:'
-              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
-              'rtol = ', rtol,),
-        summarize=expected.get_shape().num_elements(),
-        name=scope)
-
-
-def create_checkpoint(weights_and_biases,
-                      global_step,
-                      model_dir,
-                      batch_norm_vars=None):
-  """Create checkpoint file with provided model weights.
-
-  Args:
-    weights_and_biases: Iterable of tuples of weight and bias values.
-    global_step: Initial global step to save in checkpoint.
-    model_dir: Directory into which checkpoint is saved.
-    batch_norm_vars: Variables used for batch normalization.
-  """
-  weights, biases = zip(*weights_and_biases)
-  if batch_norm_vars:
-    assert len(batch_norm_vars) == len(weights_and_biases) - 1
-    (bn_betas, bn_gammas, bn_means, bn_variances) = zip(*batch_norm_vars)
-  model_weights = {}
-
-  # Hidden layer weights.
-  for i in range(0, len(weights) - 1):
-    model_weights[HIDDEN_WEIGHTS_NAME_PATTERN % i] = weights[i]
-    model_weights[HIDDEN_BIASES_NAME_PATTERN % i] = biases[i]
-    if batch_norm_vars:
-      model_weights[BATCH_NORM_BETA_NAME_PATTERN % (i, i)] = bn_betas[i]
-      model_weights[BATCH_NORM_GAMMA_NAME_PATTERN % (i, i)] = bn_gammas[i]
-      model_weights[BATCH_NORM_MEAN_NAME_PATTERN % (i, i)] = bn_means[i]
-      model_weights[BATCH_NORM_VARIANCE_NAME_PATTERN % (i, i)] = bn_variances[i]
-
-  # Output layer weights.
-  model_weights[LOGITS_WEIGHTS_NAME] = weights[-1]
-  model_weights[LOGITS_BIASES_NAME] = biases[-1]
-
-  with ops.Graph().as_default():
-    # Create model variables.
-    for k, v in six.iteritems(model_weights):
-      variables_lib.Variable(v, name=k, dtype=dtypes.float32)
-
-    # Create non-model variables.
-    global_step_var = training_util.create_global_step()
-
-    # Initialize vars and save checkpoint.
-    with tf_session.Session() as sess:
-      variables_lib.global_variables_initializer().run()
-      global_step_var.assign(global_step).eval()
-      saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
-
-
-def mock_head(testcase, hidden_units, logits_dimension, expected_logits):
-  """Returns a mock head that validates logits values and variable names."""
-  hidden_weights_names = [(HIDDEN_WEIGHTS_NAME_PATTERN + '/part_0:0') % i
-                          for i in range(len(hidden_units))]
-  hidden_biases_names = [(HIDDEN_BIASES_NAME_PATTERN + '/part_0:0') % i
-                         for i in range(len(hidden_units))]
-  expected_var_names = (
-      hidden_weights_names + hidden_biases_names +
-      [LOGITS_WEIGHTS_NAME + '/part_0:0', LOGITS_BIASES_NAME + '/part_0:0'])
-
-  def _create_tpu_estimator_spec(
-      features, mode, logits, labels, train_op_fn=None, optimizer=None):
-    del features, labels  # Not used.
-    trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-    testcase.assertItemsEqual(expected_var_names,
-                              [var.name for var in trainable_vars])
-    loss = constant_op.constant(1.)
-    assert_logits = assert_close(
-        expected_logits, logits, message='Failed for mode={}. '.format(mode))
-    with ops.control_dependencies([assert_logits]):
-      if mode == model_fn.ModeKeys.TRAIN:
-        if train_op_fn is not None:
-          train_op = train_op_fn(loss)
-        elif optimizer is not None:
-          train_op = optimizer.minimize(loss, global_step=None)
-        return model_fn._TPUEstimatorSpec(
-            mode=mode, loss=loss, train_op=train_op)
-      elif mode == model_fn.ModeKeys.EVAL:
-        return model_fn._TPUEstimatorSpec(
-            mode=mode, loss=array_ops.identity(loss))
-      elif mode == model_fn.ModeKeys.PREDICT:
-        return model_fn._TPUEstimatorSpec(
-            mode=mode, predictions={'logits': array_ops.identity(logits)})
-      else:
-        testcase.fail('Invalid mode: {}'.format(mode))
-
-  def _create_estimator_spec(
-      features, mode, logits, labels, train_op_fn=None, optimizer=None):
-    tpu_spec = _create_tpu_estimator_spec(
-        features, mode, logits, labels, train_op_fn, optimizer)
-    return tpu_spec.as_estimator_spec()
-
-  head = test.mock.NonCallableMagicMock(spec=head_lib._Head)
-  head.logits_dimension = logits_dimension
-  head._create_tpu_estimator_spec = test.mock.MagicMock(
-      wraps=_create_tpu_estimator_spec)
-  head.create_estimator_spec = test.mock.MagicMock(
-      wraps=_create_estimator_spec)
-
-  return head
-
-
-def mock_optimizer(testcase, hidden_units, expected_loss=None):
-  """Creates a mock optimizer to test the train method.
-
-  Args:
-    testcase: A TestCase instance.
-    hidden_units: Iterable of integer sizes for the hidden layers.
-    expected_loss: If given, will assert the loss value.
-
-  Returns:
-    A mock Optimizer.
-  """
-  hidden_weights_names = [(HIDDEN_WEIGHTS_NAME_PATTERN + '/part_0:0') % i
-                          for i in range(len(hidden_units))]
-  hidden_biases_names = [(HIDDEN_BIASES_NAME_PATTERN + '/part_0:0') % i
-                         for i in range(len(hidden_units))]
-  expected_var_names = (
-      hidden_weights_names + hidden_biases_names +
-      [LOGITS_WEIGHTS_NAME + '/part_0:0', LOGITS_BIASES_NAME + '/part_0:0'])
-
-  def _minimize(loss, global_step=None, var_list=None):
-    """Mock of optimizer.minimize."""
-    trainable_vars = var_list or ops.get_collection(
-        ops.GraphKeys.TRAINABLE_VARIABLES)
-    testcase.assertItemsEqual(expected_var_names,
-                              [var.name for var in trainable_vars])
-
-    # Verify loss. We can't check the value directly, so we add an assert op.
-    testcase.assertEquals(0, loss.shape.ndims)
-    if expected_loss is None:
-      if global_step is not None:
-        return state_ops.assign_add(global_step, 1).op
-      return control_flow_ops.no_op()
-    assert_loss = assert_close(
-        math_ops.to_float(expected_loss, name='expected'),
-        loss,
-        name='assert_loss')
-    with ops.control_dependencies((assert_loss,)):
-      if global_step is not None:
-        return state_ops.assign_add(global_step, 1).op
-      return control_flow_ops.no_op()
-
-  optimizer_mock = test.mock.NonCallableMagicMock(
-      spec=optimizer_lib.Optimizer,
-      wraps=optimizer_lib.Optimizer(use_locking=False, name='my_optimizer'))
-  optimizer_mock.minimize = test.mock.MagicMock(wraps=_minimize)
-
-  return optimizer_mock
-
-
-class BaseDNNModelFnTest(object):
-  """Tests that _dnn_model_fn passes expected logits to mock head."""
-
-  def __init__(self, dnn_model_fn, fc_impl=feature_column):
-    self._dnn_model_fn = dnn_model_fn
-    self._fc_impl = fc_impl
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_logits(self, mode, hidden_units, logits_dimension, inputs,
-                   expected_logits):
-    """Tests that the expected logits are passed to mock head."""
-    with ops.Graph().as_default():
-      training_util.create_global_step()
-      head = mock_head(
-          self,
-          hidden_units=hidden_units,
-          logits_dimension=logits_dimension,
-          expected_logits=expected_logits)
-      estimator_spec = self._dnn_model_fn(
-          features={'age': constant_op.constant(inputs)},
-          labels=constant_op.constant([[1]]),
-          mode=mode,
-          head=head,
-          hidden_units=hidden_units,
-          feature_columns=[
-              self._fc_impl.numeric_column(
-                  'age', shape=np.array(inputs).shape[1:])
-          ],
-          optimizer=mock_optimizer(self, hidden_units))
-      with monitored_session.MonitoredTrainingSession(
-          checkpoint_dir=self._model_dir) as sess:
-        if mode == model_fn.ModeKeys.TRAIN:
-          sess.run(estimator_spec.train_op)
-        elif mode == model_fn.ModeKeys.EVAL:
-          sess.run(estimator_spec.loss)
-        elif mode == model_fn.ModeKeys.PREDICT:
-          sess.run(estimator_spec.predictions)
-        else:
-          self.fail('Invalid mode: {}'.format(mode))
-
-  def test_one_dim_logits(self):
-    """Tests one-dimensional logits.
-
-    input_layer = [[10]]
-    hidden_layer_0 = [[relu(0.6*10 +0.1), relu(0.5*10 -0.1)]] = [[6.1, 4.9]]
-    hidden_layer_1 = [[relu(1*6.1 -0.8*4.9 +0.2), relu(0.8*6.1 -1*4.9 -0.1)]]
-                   = [[relu(2.38), relu(-0.12)]] = [[2.38, 0]]
-    logits = [[-1*2.38 +1*0 +0.3]] = [[-2.08]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),), base_global_step, self._model_dir)
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=1,
-          inputs=[[10.]],
-          expected_logits=[[-2.08]])
-
-  def test_multi_dim_logits(self):
-    """Tests multi-dimensional logits.
-
-    input_layer = [[10]]
-    hidden_layer_0 = [[relu(0.6*10 +0.1), relu(0.5*10 -0.1)]] = [[6.1, 4.9]]
-    hidden_layer_1 = [[relu(1*6.1 -0.8*4.9 +0.2), relu(0.8*6.1 -1*4.9 -0.1)]]
-                   = [[relu(2.38), relu(-0.12)]] = [[2.38, 0]]
-    logits = [[-1*2.38 +0.3, 1*2.38 -0.3, 0.5*2.38]]
-           = [[-2.08, 2.08, 1.19]]
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]],
-                                                 [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      base_global_step, self._model_dir)
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=3,
-          inputs=[[10.]],
-          expected_logits=[[-2.08, 2.08, 1.19]])
-
-  def test_multi_example_multi_dim_logits(self):
-    """Tests multiple examples and multi-dimensional logits.
-
-    input_layer = [[10], [5]]
-    hidden_layer_0 = [[relu(0.6*10 +0.1), relu(0.5*10 -0.1)],
-                      [relu(0.6*5 +0.1), relu(0.5*5 -0.1)]]
-                   = [[6.1, 4.9], [3.1, 2.4]]
-    hidden_layer_1 = [[relu(1*6.1 -0.8*4.9 +0.2), relu(0.8*6.1 -1*4.9 -0.1)],
-                      [relu(1*3.1 -0.8*2.4 +0.2), relu(0.8*3.1 -1*2.4 -0.1)]]
-                   = [[2.38, 0], [1.38, 0]]
-    logits = [[-1*2.38 +0.3, 1*2.38 -0.3, 0.5*2.38],
-              [-1*1.38 +0.3, 1*1.38 -0.3, 0.5*1.38]]
-           = [[-2.08, 2.08, 1.19], [-1.08, 1.08, 0.69]]
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]],
-                                                 [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      base_global_step, self._model_dir)
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=3,
-          inputs=[[10.], [5.]],
-          expected_logits=[[-2.08, 2.08, 1.19], [-1.08, 1.08, .69]])
-
-  def test_multi_dim_input_one_dim_logits(self):
-    """Tests multi-dimensional inputs and one-dimensional logits.
-
-    input_layer = [[10, 8]]
-    hidden_layer_0 = [[relu(0.6*10 -0.6*8 +0.1), relu(0.5*10 -0.5*8 -0.1)]]
-                   = [[1.3, 0.9]]
-    hidden_layer_1 = [[relu(1*1.3 -0.8*0.9 + 0.2), relu(0.8*1.3 -1*0.9 -0.2)]]
-                   = [[0.78, relu(-0.06)]] = [[0.78, 0]]
-    logits = [[-1*0.78 +1*0 +0.3]] = [[-0.48]]
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5], [-.6, -.5]],
-                        [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-                       ([[-1.], [1.]], [.3]),), base_global_step,
-                      self._model_dir)
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=1,
-          inputs=[[10., 8.]],
-          expected_logits=[[-0.48]])
-
-  def test_multi_dim_input_multi_dim_logits(self):
-    """Tests multi-dimensional inputs and multi-dimensional logits.
-
-    input_layer = [[10, 8]]
-    hidden_layer_0 = [[relu(0.6*10 -0.6*8 +0.1), relu(0.5*10 -0.5*8 -0.1)]]
-                   = [[1.3, 0.9]]
-    hidden_layer_1 = [[relu(1*1.3 -0.8*0.9 + 0.2), relu(0.8*1.3 -1*0.9 -0.2)]]
-                   = [[0.78, relu(-0.06)]] = [[0.78, 0]]
-    logits = [[-1*0.78 + 0.3, 1*0.78 -0.3, 0.5*0.78]] = [[-0.48, 0.48, 0.39]]
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5], [-.6, -.5]],
-                        [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      base_global_step, self._model_dir)
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=3,
-          inputs=[[10., 8.]],
-          expected_logits=[[-0.48, 0.48, 0.39]])
-
-  def test_multi_feature_column_multi_dim_logits(self):
-    """Tests multiple feature columns and multi-dimensional logits.
-
-    All numbers are the same as test_multi_dim_input_multi_dim_logits. The only
-    difference is that the input consists of two 1D feature columns, instead of
-    one 2D feature column.
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5], [-.6, -.5]],
-                        [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      base_global_step, self._model_dir)
-    hidden_units = (2, 2)
-    logits_dimension = 3
-    inputs = ([[10.]], [[8.]])
-    expected_logits = [[-0.48, 0.48, 0.39]]
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      with ops.Graph().as_default():
-        training_util.create_global_step()
-        head = mock_head(
-            self,
-            hidden_units=hidden_units,
-            logits_dimension=logits_dimension,
-            expected_logits=expected_logits)
-        estimator_spec = self._dnn_model_fn(
-            features={
-                'age': constant_op.constant(inputs[0]),
-                'height': constant_op.constant(inputs[1])
-            },
-            labels=constant_op.constant([[1]]),
-            mode=mode,
-            head=head,
-            hidden_units=hidden_units,
-            feature_columns=[
-                self._fc_impl.numeric_column('age'),
-                self._fc_impl.numeric_column('height')
-            ],
-            optimizer=mock_optimizer(self, hidden_units))
-        with monitored_session.MonitoredTrainingSession(
-            checkpoint_dir=self._model_dir) as sess:
-          if mode == model_fn.ModeKeys.TRAIN:
-            sess.run(estimator_spec.train_op)
-          elif mode == model_fn.ModeKeys.EVAL:
-            sess.run(estimator_spec.loss)
-          elif mode == model_fn.ModeKeys.PREDICT:
-            sess.run(estimator_spec.predictions)
-          else:
-            self.fail('Invalid mode: {}'.format(mode))
-
-  def test_features_tensor_raises_value_error(self):
-    """Tests that passing a Tensor for features raises a ValueError."""
-    hidden_units = (2, 2)
-    logits_dimension = 3
-    inputs = ([[10.]], [[8.]])
-    expected_logits = [[0, 0, 0]]
-
-    with ops.Graph().as_default():
-      training_util.create_global_step()
-      head = mock_head(
-          self,
-          hidden_units=hidden_units,
-          logits_dimension=logits_dimension,
-          expected_logits=expected_logits)
-      with self.assertRaisesRegexp(ValueError, 'features should be a dict'):
-        self._dnn_model_fn(
-            features=constant_op.constant(inputs),
-            labels=constant_op.constant([[1]]),
-            mode=model_fn.ModeKeys.TRAIN,
-            head=head,
-            hidden_units=hidden_units,
-            feature_columns=[
-                self._fc_impl.numeric_column(
-                    'age', shape=np.array(inputs).shape[1:])
-            ],
-            optimizer=mock_optimizer(self, hidden_units))
-
-
-class BaseDNNLogitFnTest(object):
-  """Tests correctness of logits calculated from _dnn_logit_fn_builder."""
-
-  def __init__(self, dnn_logit_fn_builder, fc_impl=feature_column):
-    self._dnn_logit_fn_builder = dnn_logit_fn_builder
-    self._fc_impl = fc_impl
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_logits(self,
-                   mode,
-                   hidden_units,
-                   logits_dimension,
-                   inputs,
-                   expected_logits,
-                   batch_norm=False):
-    """Tests that the expected logits are calculated."""
-    with ops.Graph().as_default():
-      # Global step needed for MonitoredSession, which is in turn used to
-      # explicitly set variable weights through a checkpoint.
-      training_util.create_global_step()
-      # Use a variable scope here with 'dnn', emulating the dnn model_fn, so
-      # the checkpoint naming is shared.
-      with variable_scope.variable_scope('dnn'):
-        input_layer_partitioner = (
-            partitioned_variables.min_max_variable_partitioner(
-                max_partitions=0, min_slice_size=64 << 20))
-        logit_fn = self._dnn_logit_fn_builder(
-            units=logits_dimension,
-            hidden_units=hidden_units,
-            feature_columns=[
-                self._fc_impl.numeric_column(
-                    'age', shape=np.array(inputs).shape[1:])
-            ],
-            activation_fn=nn.relu,
-            dropout=None,
-            input_layer_partitioner=input_layer_partitioner,
-            batch_norm=batch_norm)
-        logits = logit_fn(
-            features={'age': constant_op.constant(inputs)}, mode=mode)
-        with monitored_session.MonitoredTrainingSession(
-            checkpoint_dir=self._model_dir) as sess:
-          self.assertAllClose(expected_logits, sess.run(logits))
-
-  def test_one_dim_logits(self):
-    """Tests one-dimensional logits.
-
-    input_layer = [[10]]
-    hidden_layer_0 = [[relu(0.6*10 +0.1), relu(0.5*10 -0.1)]] = [[6.1, 4.9]]
-    hidden_layer_1 = [[relu(1*6.1 -0.8*4.9 +0.2), relu(0.8*6.1 -1*4.9 -0.1)]]
-                   = [[relu(2.38), relu(-0.12)]] = [[2.38, 0]]
-    logits = [[-1*2.38 +1*0 +0.3]] = [[-2.08]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),), base_global_step, self._model_dir)
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=1,
-          inputs=[[10.]],
-          expected_logits=[[-2.08]])
-
-  def test_one_dim_logits_with_batch_norm(self):
-    """Tests one-dimensional logits.
-
-    input_layer = [[10]]
-    hidden_layer_0 = [[relu(0.6*10 +1), relu(0.5*10 -1)]] = [[7, 4]]
-    hidden_layer_0 = [[relu(0.6*20 +1), relu(0.5*20 -1)]] = [[13, 9]]
-
-    batch_norm_0, training (epsilon = 0.001):
-      mean1 = 1/2*(7+13) = 10,
-      variance1 = 1/2*(3^2+3^2) = 9
-      x11 = (7-10)/sqrt(9+0.001) = -0.999944449,
-      x21 = (13-10)/sqrt(9+0.001) = 0.999944449,
-
-      mean2 = 1/2*(4+9) = 6.5,
-      variance2 = 1/2*(2.5^2+.2.5^2) = 6.25
-      x12 = (4-6.5)/sqrt(6.25+0.001) = -0.99992001,
-      x22 = (9-6.5)/sqrt(6.25+0.001) = 0.99992001,
-
-    logits = [[-1*(-0.999944449) + 2*(-0.99992001) + 0.3],
-              [-1*0.999944449 + 2*0.99992001 + 0.3]]
-           = [[-0.699895571],[1.299895571]]
-
-    batch_norm_0, not training (epsilon = 0.001):
-      moving_mean1 = 0, moving_variance1 = 1
-      x11 = (7-0)/sqrt(1+0.001) = 6.996502623,
-      x21 = (13-0)/sqrt(1+0.001) = 12.993504871,
-      moving_mean2 = 0, moving_variance2 = 1
-      x12 = (4-0)/sqrt(1+0.001) = 3.998001499,
-      x22 = (9-0)/sqrt(1+0.001) = 8.995503372,
-
-    logits = [[-1*6.996502623 + 2*3.998001499 + 0.3],
-              [-1*12.993504871 + 2*8.995503372 + 0.3]]
-           = [[1.299500375],[5.297501873]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        (
-            ([[.6, .5]], [1., -1.]),
-            ([[-1.], [2.]], [.3]),
-        ),
-        base_global_step,
-        self._model_dir,
-        batch_norm_vars=([[0, 0],  # beta.
-                          [1, 1],  # gamma.
-                          [0, 0],  # moving mean.
-                          [1, 1],  # moving variance.
-                         ],))
-    self._test_logits(
-        model_fn.ModeKeys.TRAIN,
-        hidden_units=[2],
-        logits_dimension=1,
-        inputs=[[10.], [20.]],
-        expected_logits=[[-0.699895571], [1.299895571]],
-        batch_norm=True)
-    for mode in [model_fn.ModeKeys.EVAL, model_fn.ModeKeys.PREDICT]:
-      self._test_logits(
-          mode,
-          hidden_units=[2],
-          logits_dimension=1,
-          inputs=[[10.], [20.]],
-          expected_logits=[[1.299500375], [5.297501873]],
-          batch_norm=True)
-
-  def test_multi_dim_logits(self):
-    """Tests multi-dimensional logits.
-
-    input_layer = [[10]]
-    hidden_layer_0 = [[relu(0.6*10 +0.1), relu(0.5*10 -0.1)]] = [[6.1, 4.9]]
-    hidden_layer_1 = [[relu(1*6.1 -0.8*4.9 +0.2), relu(0.8*6.1 -1*4.9 -0.1)]]
-                   = [[relu(2.38), relu(-0.12)]] = [[2.38, 0]]
-    logits = [[-1*2.38 +0.3, 1*2.38 -0.3, 0.5*2.38]]
-           = [[-2.08, 2.08, 1.19]]
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]],
-                                                 [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      base_global_step, self._model_dir)
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=3,
-          inputs=[[10.]],
-          expected_logits=[[-2.08, 2.08, 1.19]])
-
-  def test_multi_example_multi_dim_logits(self):
-    """Tests multiple examples and multi-dimensional logits.
-
-    input_layer = [[10], [5]]
-    hidden_layer_0 = [[relu(0.6*10 +0.1), relu(0.5*10 -0.1)],
-                      [relu(0.6*5 +0.1), relu(0.5*5 -0.1)]]
-                   = [[6.1, 4.9], [3.1, 2.4]]
-    hidden_layer_1 = [[relu(1*6.1 -0.8*4.9 +0.2), relu(0.8*6.1 -1*4.9 -0.1)],
-                      [relu(1*3.1 -0.8*2.4 +0.2), relu(0.8*3.1 -1*2.4 -0.1)]]
-                   = [[2.38, 0], [1.38, 0]]
-    logits = [[-1*2.38 +0.3, 1*2.38 -0.3, 0.5*2.38],
-              [-1*1.38 +0.3, 1*1.38 -0.3, 0.5*1.38]]
-           = [[-2.08, 2.08, 1.19], [-1.08, 1.08, 0.69]]
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]],
-                                                 [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      base_global_step, self._model_dir)
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=3,
-          inputs=[[10.], [5.]],
-          expected_logits=[[-2.08, 2.08, 1.19], [-1.08, 1.08, .69]])
-
-  def test_multi_dim_input_one_dim_logits(self):
-    """Tests multi-dimensional inputs and one-dimensional logits.
-
-    input_layer = [[10, 8]]
-    hidden_layer_0 = [[relu(0.6*10 -0.6*8 +0.1), relu(0.5*10 -0.5*8 -0.1)]]
-                   = [[1.3, 0.9]]
-    hidden_layer_1 = [[relu(1*1.3 -0.8*0.9 + 0.2), relu(0.8*1.3 -1*0.9 -0.2)]]
-                   = [[0.78, relu(-0.06)]] = [[0.78, 0]]
-    logits = [[-1*0.78 +1*0 +0.3]] = [[-0.48]]
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5], [-.6, -.5]],
-                        [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-                       ([[-1.], [1.]], [.3]),), base_global_step,
-                      self._model_dir)
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=1,
-          inputs=[[10., 8.]],
-          expected_logits=[[-0.48]])
-
-  def test_multi_dim_input_multi_dim_logits(self):
-    """Tests multi-dimensional inputs and multi-dimensional logits.
-
-    input_layer = [[10, 8]]
-    hidden_layer_0 = [[relu(0.6*10 -0.6*8 +0.1), relu(0.5*10 -0.5*8 -0.1)]]
-                   = [[1.3, 0.9]]
-    hidden_layer_1 = [[relu(1*1.3 -0.8*0.9 + 0.2), relu(0.8*1.3 -1*0.9 -0.2)]]
-                   = [[0.78, relu(-0.06)]] = [[0.78, 0]]
-    logits = [[-1*0.78 + 0.3, 1*0.78 -0.3, 0.5*0.78]] = [[-0.48, 0.48, 0.39]]
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5], [-.6, -.5]],
-                        [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      base_global_step, self._model_dir)
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=3,
-          inputs=[[10., 8.]],
-          expected_logits=[[-0.48, 0.48, 0.39]])
-
-  def test_multi_feature_column_multi_dim_logits(self):
-    """Tests multiple feature columns and multi-dimensional logits.
-
-    All numbers are the same as test_multi_dim_input_multi_dim_logits. The only
-    difference is that the input consists of two 1D feature columns, instead of
-    one 2D feature column.
-    """
-    base_global_step = 100
-    create_checkpoint((([[.6, .5], [-.6, -.5]],
-                        [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      base_global_step, self._model_dir)
-
-    hidden_units = (2, 2)
-    logits_dimension = 3
-    inputs = ([[10.]], [[8.]])
-    expected_logits = [[-0.48, 0.48, 0.39]]
-
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      with ops.Graph().as_default():
-        # Global step needed for MonitoredSession, which is in turn used to
-        # explicitly set variable weights through a checkpoint.
-        training_util.create_global_step()
-        # Use a variable scope here with 'dnn', emulating the dnn model_fn, so
-        # the checkpoint naming is shared.
-        with variable_scope.variable_scope('dnn'):
-          input_layer_partitioner = (
-              partitioned_variables.min_max_variable_partitioner(
-                  max_partitions=0, min_slice_size=64 << 20))
-          logit_fn = self._dnn_logit_fn_builder(
-              units=logits_dimension,
-              hidden_units=hidden_units,
-              feature_columns=[
-                  self._fc_impl.numeric_column('age'),
-                  self._fc_impl.numeric_column('height')
-              ],
-              activation_fn=nn.relu,
-              dropout=None,
-              input_layer_partitioner=input_layer_partitioner,
-              batch_norm=False)
-          logits = logit_fn(
-              features={
-                  'age': constant_op.constant(inputs[0]),
-                  'height': constant_op.constant(inputs[1])
-              },
-              mode=mode)
-          with monitored_session.MonitoredTrainingSession(
-              checkpoint_dir=self._model_dir) as sess:
-            self.assertAllClose(expected_logits, sess.run(logits))
-
-
-class BaseDNNWarmStartingTest(object):
-
-  def __init__(self,
-               _dnn_classifier_fn,
-               _dnn_regressor_fn,
-               fc_impl=feature_column):
-    self._dnn_classifier_fn = _dnn_classifier_fn
-    self._dnn_regressor_fn = _dnn_regressor_fn
-    self._fc_impl = fc_impl
-
-  def setUp(self):
-    # Create a directory to save our old checkpoint and vocabularies to.
-    self._ckpt_and_vocab_dir = tempfile.mkdtemp()
-
-    # Make a dummy input_fn.
-    def _input_fn():
-      features = {
-          'city': [['Palo Alto'], ['Mountain View']],
-          'locality': [['Palo Alto'], ['Mountain View']],
-          'occupation': [['doctor'], ['consultant']]
-      }
-      return features, [0, 1]
-
-    self._input_fn = _input_fn
-
-  def tearDown(self):
-    # Clean up checkpoint / vocab dir.
-    writer_cache.FileWriterCache.clear()
-    shutil.rmtree(self._ckpt_and_vocab_dir)
-
-  def assertAllNotClose(self, t1, t2):
-    """Helper assert for arrays."""
-    sum_of_abs_diff = 0.0
-    for x, y in zip(t1, t2):
-      try:
-        for a, b in zip(x, y):
-          sum_of_abs_diff += abs(b - a)
-      except TypeError:
-        sum_of_abs_diff += abs(y - x)
-    self.assertGreater(sum_of_abs_diff, 0)
-
-  def test_classifier_basic_warm_starting(self):
-    """Tests correctness of DNNClassifier default warm-start."""
-    city = self._fc_impl.embedding_column(
-        self._fc_impl.categorical_column_with_vocabulary_list(
-            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
-        dimension=5)
-
-    # Create a DNNClassifier and train to save a checkpoint.
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=[256, 128],
-        feature_columns=[city],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        optimizer='SGD')
-    dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second DNNClassifier, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).
-    warm_started_dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=[256, 128],
-        feature_columns=[city],
-        n_classes=4,
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        warm_start_from=dnn_classifier.model_dir)
-
-    warm_started_dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
-    for variable_name in warm_started_dnn_classifier.get_variable_names():
-      self.assertAllClose(
-          dnn_classifier.get_variable_value(variable_name),
-          warm_started_dnn_classifier.get_variable_value(variable_name))
-
-  def test_regressor_basic_warm_starting(self):
-    """Tests correctness of DNNRegressor default warm-start."""
-    city = self._fc_impl.embedding_column(
-        self._fc_impl.categorical_column_with_vocabulary_list(
-            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
-        dimension=5)
-
-    # Create a DNNRegressor and train to save a checkpoint.
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=[256, 128],
-        feature_columns=[city],
-        model_dir=self._ckpt_and_vocab_dir,
-        optimizer='SGD')
-    dnn_regressor.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second DNNRegressor, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).
-    warm_started_dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=[256, 128],
-        feature_columns=[city],
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        warm_start_from=dnn_regressor.model_dir)
-
-    warm_started_dnn_regressor.train(input_fn=self._input_fn, max_steps=1)
-    for variable_name in warm_started_dnn_regressor.get_variable_names():
-      self.assertAllClose(
-          dnn_regressor.get_variable_value(variable_name),
-          warm_started_dnn_regressor.get_variable_value(variable_name))
-
-  def test_warm_starting_selective_variables(self):
-    """Tests selecting variables to warm-start."""
-    city = self._fc_impl.embedding_column(
-        self._fc_impl.categorical_column_with_vocabulary_list(
-            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
-        dimension=5)
-
-    # Create a DNNClassifier and train to save a checkpoint.
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=[256, 128],
-        feature_columns=[city],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        optimizer='SGD')
-    dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second DNNClassifier, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).
-    warm_started_dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=[256, 128],
-        feature_columns=[city],
-        n_classes=4,
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        # The provided regular expression will only warm-start the city
-        # embedding, not the kernels and biases of the hidden weights.
-        warm_start_from=estimator.WarmStartSettings(
-            ckpt_to_initialize_from=dnn_classifier.model_dir,
-            vars_to_warm_start='.*(city).*'))
-
-    warm_started_dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
-    for variable_name in warm_started_dnn_classifier.get_variable_names():
-      if 'city' in variable_name:
-        self.assertAllClose(
-            dnn_classifier.get_variable_value(variable_name),
-            warm_started_dnn_classifier.get_variable_value(variable_name))
-      elif 'bias' in variable_name:
-        # Hidden layer biases are zero-initialized.
-        bias_values = warm_started_dnn_classifier.get_variable_value(
-            variable_name)
-        self.assertAllClose(np.zeros_like(bias_values), bias_values)
-      elif 'kernel' in variable_name:
-        # We can't override the glorot uniform initializer used for the kernels
-        # in the dense layers, so just make sure we're not getting the same
-        # values from the old checkpoint.
-        self.assertAllNotClose(
-            dnn_classifier.get_variable_value(variable_name),
-            warm_started_dnn_classifier.get_variable_value(variable_name))
-
-  def test_warm_starting_with_vocab_remapping_and_partitioning(self):
-    """Tests warm-starting with vocab remapping and partitioning."""
-    vocab_list = ['doctor', 'lawyer', 'consultant']
-    vocab_file = os.path.join(self._ckpt_and_vocab_dir, 'occupation_vocab')
-    with open(vocab_file, 'w') as f:
-      f.write('\n'.join(vocab_list))
-    occupation = self._fc_impl.embedding_column(
-        self._fc_impl.categorical_column_with_vocabulary_file(
-            'occupation',
-            vocabulary_file=vocab_file,
-            vocabulary_size=len(vocab_list)),
-        dimension=2)
-
-    # Create a DNNClassifier and train to save a checkpoint.
-    partitioner = partitioned_variables.fixed_size_partitioner(num_shards=2)
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=[256, 128],
-        feature_columns=[occupation],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        optimizer='SGD',
-        input_layer_partitioner=partitioner)
-    dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second DNNClassifier, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).  Use a new FeatureColumn with a
-    # different vocabulary for occupation.
-    new_vocab_list = ['doctor', 'consultant', 'engineer']
-    new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
-                                  'new_occupation_vocab')
-    with open(new_vocab_file, 'w') as f:
-      f.write('\n'.join(new_vocab_list))
-    new_occupation = self._fc_impl.embedding_column(
-        self._fc_impl.categorical_column_with_vocabulary_file(
-            'occupation',
-            vocabulary_file=new_vocab_file,
-            vocabulary_size=len(new_vocab_list)),
-        dimension=2)
-    # We can create our VocabInfo object from the new and old occupation
-    # FeatureColumn's.
-    occupation_vocab_info = estimator.VocabInfo(
-        new_vocab=new_occupation.categorical_column.vocabulary_file,
-        new_vocab_size=new_occupation.categorical_column.vocabulary_size,
-        num_oov_buckets=new_occupation.categorical_column.num_oov_buckets,
-        old_vocab=occupation.categorical_column.vocabulary_file,
-        old_vocab_size=occupation.categorical_column.vocabulary_size,
-        # Can't use constant_initializer with load_and_remap.  In practice,
-        # use a truncated normal initializer.
-        backup_initializer=init_ops.random_uniform_initializer(
-            minval=0.39, maxval=0.39))
-    warm_started_dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=[256, 128],
-        feature_columns=[occupation],
-        n_classes=4,
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        warm_start_from=estimator.WarmStartSettings(
-            ckpt_to_initialize_from=dnn_classifier.model_dir,
-            var_name_to_vocab_info={
-                OCCUPATION_EMBEDDING_NAME: occupation_vocab_info
-            },
-            # Explicitly providing None here will only warm-start variables
-            # referenced in var_name_to_vocab_info (no hidden weights will be
-            # warmstarted).
-            vars_to_warm_start=None),
-        input_layer_partitioner=partitioner)
-
-    warm_started_dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
-    # 'doctor' was ID-0 and still ID-0.
-    self.assertAllClose(
-        dnn_classifier.get_variable_value(OCCUPATION_EMBEDDING_NAME)[0, :],
-        warm_started_dnn_classifier.get_variable_value(
-            OCCUPATION_EMBEDDING_NAME)[0, :])
-    # 'consultant' was ID-2 and now ID-1.
-    self.assertAllClose(
-        dnn_classifier.get_variable_value(OCCUPATION_EMBEDDING_NAME)[2, :],
-        warm_started_dnn_classifier.get_variable_value(
-            OCCUPATION_EMBEDDING_NAME)[1, :])
-    # 'engineer' is a new entry and should be initialized with the
-    # backup_initializer in VocabInfo.
-    self.assertAllClose([0.39] * 2,
-                        warm_started_dnn_classifier.get_variable_value(
-                            OCCUPATION_EMBEDDING_NAME)[2, :])
-    for variable_name in warm_started_dnn_classifier.get_variable_names():
-      if 'bias' in variable_name:
-        # Hidden layer biases are zero-initialized.
-        bias_values = warm_started_dnn_classifier.get_variable_value(
-            variable_name)
-        self.assertAllClose(np.zeros_like(bias_values), bias_values)
-      elif 'kernel' in variable_name:
-        # We can't override the glorot uniform initializer used for the kernels
-        # in the dense layers, so just make sure we're not getting the same
-        # values from the old checkpoint.
-        self.assertAllNotClose(
-            dnn_classifier.get_variable_value(variable_name),
-            warm_started_dnn_classifier.get_variable_value(variable_name))
-
-  def test_warm_starting_with_naming_change(self):
-    """Tests warm-starting with a Tensor name remapping."""
-    locality = self._fc_impl.embedding_column(
-        self._fc_impl.categorical_column_with_vocabulary_list(
-            'locality', vocabulary_list=['Mountain View', 'Palo Alto']),
-        dimension=5)
-
-    # Create a DNNClassifier and train to save a checkpoint.
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=[256, 128],
-        feature_columns=[locality],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        optimizer='SGD')
-    dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second DNNClassifier, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).
-    city = self._fc_impl.embedding_column(
-        self._fc_impl.categorical_column_with_vocabulary_list(
-            'city', vocabulary_list=['Mountain View', 'Palo Alto']),
-        dimension=5)
-    warm_started_dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=[256, 128],
-        feature_columns=[city],
-        n_classes=4,
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        # The 'city' variable correspond to the 'locality' variable in the
-        # previous model.
-        warm_start_from=estimator.WarmStartSettings(
-            ckpt_to_initialize_from=dnn_classifier.model_dir,
-            var_name_to_prev_var_name={
-                CITY_EMBEDDING_NAME:
-                    CITY_EMBEDDING_NAME.replace('city', 'locality')
-            }))
-
-    warm_started_dnn_classifier.train(input_fn=self._input_fn, max_steps=1)
-    for variable_name in warm_started_dnn_classifier.get_variable_names():
-      if 'city' in variable_name:
-        self.assertAllClose(
-            dnn_classifier.get_variable_value(
-                CITY_EMBEDDING_NAME.replace('city', 'locality')),
-            warm_started_dnn_classifier.get_variable_value(CITY_EMBEDDING_NAME))
-      else:
-        self.assertAllClose(
-            dnn_classifier.get_variable_value(variable_name),
-            warm_started_dnn_classifier.get_variable_value(variable_name))
-
-
-class BaseDNNClassifierEvaluateTest(object):
-
-  def __init__(self, dnn_classifier_fn, fc_impl=feature_column):
-    self._dnn_classifier_fn = dnn_classifier_fn
-    self._fc_impl = fc_impl
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_one_dim(self):
-    """Asserts evaluation metrics for one-dimensional input and logits."""
-    global_step = 100
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),), global_step, self._model_dir)
-
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=(2, 2),
-        feature_columns=[self._fc_impl.numeric_column('age')],
-        model_dir=self._model_dir)
-    def _input_fn():
-      # batch_size = 2, one false label, and one true.
-      return {'age': [[10.], [10.]]}, [[1], [0]]
-    # Uses identical numbers as DNNModelTest.test_one_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [[-2.08], [-2.08]] =>
-    # logistic = 1/(1 + exp(-logits)) = [[0.11105597], [0.11105597]]
-    # loss = -1. * log(0.111) -1. * log(0.889) = 2.31544200
-    expected_loss = 2.31544200
-    self.assertAllClose({
-        metric_keys.MetricKeys.LOSS: expected_loss,
-        metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2.,
-        metric_keys.MetricKeys.ACCURACY: 0.5,
-        metric_keys.MetricKeys.PRECISION: 0.0,
-        metric_keys.MetricKeys.RECALL: 0.0,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 0.11105597,
-        metric_keys.MetricKeys.LABEL_MEAN: 0.5,
-        metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
-        # There is no good way to calculate AUC for only two data points. But
-        # that is what the algorithm returns.
-        metric_keys.MetricKeys.AUC: 0.5,
-        metric_keys.MetricKeys.AUC_PR: 0.75,
-
-        ops.GraphKeys.GLOBAL_STEP: global_step
-    }, dnn_classifier.evaluate(input_fn=_input_fn, steps=1))
-
-  def test_multi_dim(self):
-    """Asserts evaluation metrics for multi-dimensional input and logits."""
-    global_step = 100
-    create_checkpoint(
-        (([[.6, .5], [-.6, -.5]], [.1, -.1]), ([[1., .8], [-.8, -1.]],
-                                               [.2, -.2]),
-         ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3,
-                                           .0]),), global_step, self._model_dir)
-    n_classes = 3
-
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=(2, 2),
-        feature_columns=[self._fc_impl.numeric_column('age', shape=[2])],
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    def _input_fn():
-      # batch_size = 2, one false label, and one true.
-      return {'age': [[10., 8.], [10., 8.]]}, [[1], [0]]
-    # Uses identical numbers as
-    # DNNModelFnTest.test_multi_dim_input_multi_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [[-0.48, 0.48, 0.39], [-0.48, 0.48, 0.39]]
-    # probabilities = exp(logits)/sum(exp(logits))
-    #               = [[0.16670536, 0.43538380, 0.39791084],
-    #                  [0.16670536, 0.43538380, 0.39791084]]
-    # loss = -log(0.43538380) - log(0.16670536)
-    expected_loss = 2.62305466
-    self.assertAllClose({
-        metric_keys.MetricKeys.LOSS: expected_loss,
-        metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
-        metric_keys.MetricKeys.ACCURACY: 0.5,
-        ops.GraphKeys.GLOBAL_STEP: global_step
-    }, dnn_classifier.evaluate(input_fn=_input_fn, steps=1))
-
-  def test_float_labels(self):
-    """Asserts evaluation metrics for float labels in binary classification."""
-    global_step = 100
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),), global_step, self._model_dir)
-
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=(2, 2),
-        feature_columns=[self._fc_impl.numeric_column('age')],
-        model_dir=self._model_dir)
-    def _input_fn():
-      # batch_size = 2, one false label, and one true.
-      return {'age': [[10.], [10.]]}, [[0.8], [0.4]]
-    # Uses identical numbers as DNNModelTest.test_one_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [[-2.08], [-2.08]] =>
-    # logistic = 1/(1 + exp(-logits)) = [[0.11105597], [0.11105597]]
-    # loss = -0.8 * log(0.111) -0.2 * log(0.889)
-    #        -0.4 * log(0.111) -0.6 * log(0.889) = 2.7314420
-    metrics = dnn_classifier.evaluate(input_fn=_input_fn, steps=1)
-    self.assertAlmostEqual(2.7314420, metrics[metric_keys.MetricKeys.LOSS])
-
-  def test_multi_dim_weights(self):
-    """Tests evaluation with weights."""
-    # Uses same checkpoint with test_multi_dims
-    global_step = 100
-    create_checkpoint((([[.6, .5], [-.6, -.5]],
-                        [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      global_step, self._model_dir)
-    n_classes = 3
-
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=(2, 2),
-        feature_columns=[self._fc_impl.numeric_column('age', shape=[2])],
-        n_classes=n_classes,
-        weight_column='w',
-        model_dir=self._model_dir)
-
-    def _input_fn():
-      # batch_size = 2, one false label, and one true.
-      return {'age': [[10., 8.], [10., 8.]], 'w': [[10.], [100.]]}, [[1], [0]]
-
-    # Uses identical numbers as test_multi_dims
-    # See that test for calculation of logits.
-    # loss = -log(0.43538380)*10 - log(0.16670536)*100
-    expected_loss = 187.468007
-    metrics = dnn_classifier.evaluate(input_fn=_input_fn, steps=1)
-    self.assertAlmostEqual(
-        expected_loss, metrics[metric_keys.MetricKeys.LOSS], places=3)
-
-
-class BaseDNNRegressorEvaluateTest(object):
-
-  def __init__(self, dnn_regressor_fn, fc_impl=feature_column):
-    self._dnn_regressor_fn = dnn_regressor_fn
-    self._fc_impl = fc_impl
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_one_dim(self):
-    """Asserts evaluation metrics for one-dimensional input and logits."""
-    # Create checkpoint: num_inputs=1, hidden_units=(2, 2), num_outputs=1.
-    global_step = 100
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),), global_step, self._model_dir)
-
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=(2, 2),
-        feature_columns=[self._fc_impl.numeric_column('age')],
-        model_dir=self._model_dir)
-    def _input_fn():
-      return {'age': [[10.]]}, [[1.]]
-    # Uses identical numbers as DNNModelTest.test_one_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [[-2.08]] => predictions = [-2.08].
-    # loss = (1+2.08)^2 = 9.4864
-    expected_loss = 9.4864
-    self.assertAllClose({
-        metric_keys.MetricKeys.LOSS: expected_loss,
-        metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
-        metric_keys.MetricKeys.PREDICTION_MEAN: -2.08,
-        metric_keys.MetricKeys.LABEL_MEAN: 1.0,
-        ops.GraphKeys.GLOBAL_STEP: global_step
-    }, dnn_regressor.evaluate(input_fn=_input_fn, steps=1))
-
-  def test_multi_dim(self):
-    """Asserts evaluation metrics for multi-dimensional input and logits."""
-    # Create checkpoint: num_inputs=2, hidden_units=(2, 2), num_outputs=3.
-    global_step = 100
-    create_checkpoint(
-        (([[.6, .5], [-.6, -.5]], [.1, -.1]), ([[1., .8], [-.8, -1.]],
-                                               [.2, -.2]),
-         ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3,
-                                           .0]),), global_step, self._model_dir)
-    label_dimension = 3
-
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=(2, 2),
-        feature_columns=[self._fc_impl.numeric_column('age', shape=[2])],
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-    def _input_fn():
-      return {'age': [[10., 8.]]}, [[1., -1., 0.5]]
-    # Uses identical numbers as
-    # DNNModelFnTest.test_multi_dim_input_multi_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [[-0.48, 0.48, 0.39]]
-    # loss = (1+0.48)^2 + (-1-0.48)^2 + (0.5-0.39)^2 = 4.3929
-    expected_loss = 4.3929
-    self.assertAllClose({
-        metric_keys.MetricKeys.LOSS: expected_loss,
-        metric_keys.MetricKeys.LOSS_MEAN: expected_loss / label_dimension,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 0.39 / 3.0,
-        metric_keys.MetricKeys.LABEL_MEAN: 0.5 / 3.0,
-        ops.GraphKeys.GLOBAL_STEP: global_step
-    }, dnn_regressor.evaluate(input_fn=_input_fn, steps=1))
-
-  def test_multi_dim_weights(self):
-    """Asserts evaluation metrics for multi-dimensional input and logits."""
-    # same checkpoint with test_multi_dim.
-    global_step = 100
-    create_checkpoint((([[.6, .5], [-.6, -.5]],
-                        [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-                       ([[-1., 1., .5], [-1., 1., .5]], [.3, -.3, .0]),),
-                      global_step, self._model_dir)
-    label_dimension = 3
-
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=(2, 2),
-        feature_columns=[self._fc_impl.numeric_column('age', shape=[2])],
-        label_dimension=label_dimension,
-        weight_column='w',
-        model_dir=self._model_dir)
-
-    def _input_fn():
-      return {'age': [[10., 8.]], 'w': [10.]}, [[1., -1., 0.5]]
-
-    # Uses identical numbers as test_multi_dim.
-    # See that test for calculation of logits.
-    # loss = 4.3929*10
-    expected_loss = 43.929
-    metrics = dnn_regressor.evaluate(input_fn=_input_fn, steps=1)
-    self.assertAlmostEqual(
-        expected_loss, metrics[metric_keys.MetricKeys.LOSS], places=3)
-
-
-class BaseDNNClassifierPredictTest(object):
-
-  def __init__(self, dnn_classifier_fn, fc_impl=feature_column):
-    self._dnn_classifier_fn = dnn_classifier_fn
-    self._fc_impl = fc_impl
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_one_dim(self, label_vocabulary, label_output_fn):
-    """Asserts predictions for one-dimensional input and logits."""
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),),
-        global_step=0,
-        model_dir=self._model_dir)
-
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=(2, 2),
-        label_vocabulary=label_vocabulary,
-        feature_columns=(self._fc_impl.numeric_column('x'),),
-        model_dir=self._model_dir)
-    input_fn = numpy_io.numpy_input_fn(
-        x={'x': np.array([[10.]])}, batch_size=1, shuffle=False)
-    # Uses identical numbers as DNNModelTest.test_one_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [-2.08] =>
-    # logistic = exp(-2.08)/(1 + exp(-2.08)) = 0.11105597
-    # probabilities = [1-logistic, logistic] = [0.88894403, 0.11105597]
-    # class_ids = argmax(probabilities) = [0]
-    predictions = next(dnn_classifier.predict(input_fn=input_fn))
-    self.assertAllClose([-2.08],
-                        predictions[prediction_keys.PredictionKeys.LOGITS])
-    self.assertAllClose([0.11105597],
-                        predictions[prediction_keys.PredictionKeys.LOGISTIC])
-    self.assertAllClose(
-        [0.88894403,
-         0.11105597], predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-    self.assertAllClose([0],
-                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
-    self.assertAllEqual([label_output_fn(0)],
-                        predictions[prediction_keys.PredictionKeys.CLASSES])
-
-  def test_one_dim_without_label_vocabulary(self):
-    self._test_one_dim(label_vocabulary=None,
-                       label_output_fn=lambda x: ('%s' % x).encode())
-
-  def test_one_dim_with_label_vocabulary(self):
-    n_classes = 2
-    self._test_one_dim(
-        label_vocabulary=['class_vocab_{}'.format(i) for i in range(n_classes)],
-        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
-
-  def _test_multi_dim_with_3_classes(self, label_vocabulary, label_output_fn):
-    """Asserts predictions for multi-dimensional input and logits."""
-    create_checkpoint(
-        (([[.6, .5], [-.6, -.5]], [.1, -.1]),
-         ([[1., .8], [-.8, -1.]], [.2, -.2]), ([[-1., 1., .5], [-1., 1., .5]],
-                                               [.3, -.3, .0]),),
-        global_step=0,
-        model_dir=self._model_dir)
-
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=(2, 2),
-        feature_columns=(self._fc_impl.numeric_column('x', shape=(2,)),),
-        label_vocabulary=label_vocabulary,
-        n_classes=3,
-        model_dir=self._model_dir)
-    input_fn = numpy_io.numpy_input_fn(
-        # Inputs shape is (batch_size, num_inputs).
-        x={'x': np.array([[10., 8.]])},
-        batch_size=1,
-        shuffle=False)
-    # Uses identical numbers as
-    # DNNModelFnTest.test_multi_dim_input_multi_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [-0.48, 0.48, 0.39] =>
-    # probabilities[i] = exp(logits[i]) / sum_j exp(logits[j]) =>
-    # probabilities = [0.16670536, 0.43538380, 0.39791084]
-    # class_ids = argmax(probabilities) = [1]
-    predictions = next(dnn_classifier.predict(input_fn=input_fn))
-    self.assertItemsEqual(
-        [prediction_keys.PredictionKeys.LOGITS,
-         prediction_keys.PredictionKeys.PROBABILITIES,
-         prediction_keys.PredictionKeys.CLASS_IDS,
-         prediction_keys.PredictionKeys.CLASSES],
-        six.iterkeys(predictions))
-    self.assertAllClose(
-        [-0.48, 0.48, 0.39], predictions[prediction_keys.PredictionKeys.LOGITS])
-    self.assertAllClose(
-        [0.16670536, 0.43538380, 0.39791084],
-        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-    self.assertAllEqual(
-        [1], predictions[prediction_keys.PredictionKeys.CLASS_IDS])
-    self.assertAllEqual(
-        [label_output_fn(1)],
-        predictions[prediction_keys.PredictionKeys.CLASSES])
-
-  def test_multi_dim_with_3_classes_but_no_label_vocab(self):
-    self._test_multi_dim_with_3_classes(
-        label_vocabulary=None,
-        label_output_fn=lambda x: ('%s' % x).encode())
-
-  def test_multi_dim_with_3_classes_and_label_vocab(self):
-    n_classes = 3
-    self._test_multi_dim_with_3_classes(
-        label_vocabulary=['class_vocab_{}'.format(i) for i in range(n_classes)],
-        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
-
-
-class BaseDNNRegressorPredictTest(object):
-
-  def __init__(self, dnn_regressor_fn, fc_impl=feature_column):
-    self._dnn_regressor_fn = dnn_regressor_fn
-    self._fc_impl = fc_impl
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_one_dim(self):
-    """Asserts predictions for one-dimensional input and logits."""
-    # Create checkpoint: num_inputs=1, hidden_units=(2, 2), num_outputs=1.
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),),
-        global_step=0,
-        model_dir=self._model_dir)
-
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=(2, 2),
-        feature_columns=(self._fc_impl.numeric_column('x'),),
-        model_dir=self._model_dir)
-    input_fn = numpy_io.numpy_input_fn(
-        x={'x': np.array([[10.]])}, batch_size=1, shuffle=False)
-    # Uses identical numbers as DNNModelTest.test_one_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [[-2.08]] => predictions = [-2.08].
-    self.assertAllClose({
-        prediction_keys.PredictionKeys.PREDICTIONS: [-2.08],
-    }, next(dnn_regressor.predict(input_fn=input_fn)))
-
-  def test_multi_dim(self):
-    """Asserts predictions for multi-dimensional input and logits."""
-    # Create checkpoint: num_inputs=2, hidden_units=(2, 2), num_outputs=3.
-    create_checkpoint(
-        (([[.6, .5], [-.6, -.5]], [.1, -.1]),
-         ([[1., .8], [-.8, -1.]], [.2, -.2]), ([[-1., 1., .5], [-1., 1., .5]],
-                                               [.3, -.3,
-                                                .0]),), 100, self._model_dir)
-
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=(2, 2),
-        feature_columns=(self._fc_impl.numeric_column('x', shape=(2,)),),
-        label_dimension=3,
-        model_dir=self._model_dir)
-    input_fn = numpy_io.numpy_input_fn(
-        # Inputs shape is (batch_size, num_inputs).
-        x={'x': np.array([[10., 8.]])},
-        batch_size=1,
-        shuffle=False)
-    # Uses identical numbers as
-    # DNNModelFnTest.test_multi_dim_input_multi_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [[-0.48, 0.48, 0.39]] => predictions = [-0.48, 0.48, 0.39]
-    self.assertAllClose({
-        prediction_keys.PredictionKeys.PREDICTIONS: [-0.48, 0.48, 0.39],
-    }, next(dnn_regressor.predict(input_fn=input_fn)))
-
-
-class _SummaryHook(session_run_hook.SessionRunHook):
-  """Saves summaries every N steps."""
-
-  def __init__(self):
-    self._summaries = []
-
-  def begin(self):
-    self._summary_op = summary_lib.merge_all()
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs({'summary': self._summary_op})
-
-  def after_run(self, run_context, run_values):
-    s = summary_pb2.Summary()
-    s.ParseFromString(run_values.results['summary'])
-    self._summaries.append(s)
-
-  def summaries(self):
-    return tuple(self._summaries)
-
-
-def _assert_checkpoint(
-    testcase, global_step, input_units, hidden_units, output_units, model_dir):
-  """Asserts checkpoint contains expected variables with proper shapes.
-
-  Args:
-    testcase: A TestCase instance.
-    global_step: Expected global step value.
-    input_units: The dimension of input layer.
-    hidden_units: Iterable of integer sizes for the hidden layers.
-    output_units: The dimension of output layer (logits).
-    model_dir: The model directory.
-  """
-  shapes = {
-      name: shape
-      for (name, shape) in checkpoint_utils.list_variables(model_dir)
-  }
-
-  # Global step.
-  testcase.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
-  testcase.assertEqual(
-      global_step,
-      checkpoint_utils.load_variable(
-          model_dir, ops.GraphKeys.GLOBAL_STEP))
-
-  # Hidden layer weights.
-  prev_layer_units = input_units
-  for i in range(len(hidden_units)):
-    layer_units = hidden_units[i]
-    testcase.assertAllEqual(
-        (prev_layer_units, layer_units),
-        shapes[HIDDEN_WEIGHTS_NAME_PATTERN % i])
-    testcase.assertAllEqual(
-        (layer_units,),
-        shapes[HIDDEN_BIASES_NAME_PATTERN % i])
-    prev_layer_units = layer_units
-
-  # Output layer weights.
-  testcase.assertAllEqual((prev_layer_units, output_units),
-                          shapes[LOGITS_WEIGHTS_NAME])
-  testcase.assertAllEqual((output_units,),
-                          shapes[LOGITS_BIASES_NAME])
-
-
-def _assert_simple_summary(testcase, expected_values, actual_summary):
-  """Assert summary the specified simple values.
-
-  Args:
-    testcase: A TestCase instance.
-    expected_values: Dict of expected tags and simple values.
-    actual_summary: `summary_pb2.Summary`.
-  """
-  testcase.assertAllClose(expected_values, {
-      v.tag: v.simple_value
-      for v in actual_summary.value if (v.tag in expected_values)
-  })
-
-
-class BaseDNNClassifierTrainTest(object):
-
-  def __init__(self, dnn_classifier_fn, fc_impl=feature_column):
-    self._dnn_classifier_fn = dnn_classifier_fn
-    self._fc_impl = fc_impl
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_from_scratch_with_default_optimizer_binary(self):
-    hidden_units = (2, 2)
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=hidden_units,
-        feature_columns=(self._fc_impl.numeric_column('age'),),
-        model_dir=self._model_dir)
-
-    # Train for a few steps, then validate final checkpoint.
-    num_steps = 5
-    dnn_classifier.train(
-        input_fn=lambda: ({'age': [[10.]]}, [[1]]), steps=num_steps)
-    _assert_checkpoint(
-        self, num_steps, input_units=1, hidden_units=hidden_units,
-        output_units=1, model_dir=self._model_dir)
-
-  def test_from_scratch_with_default_optimizer_multi_class(self):
-    hidden_units = (2, 2)
-    n_classes = 3
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=hidden_units,
-        feature_columns=(self._fc_impl.numeric_column('age'),),
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # Train for a few steps, then validate final checkpoint.
-    num_steps = 5
-    dnn_classifier.train(
-        input_fn=lambda: ({'age': [[10.]]}, [[2]]), steps=num_steps)
-    _assert_checkpoint(
-        self, num_steps, input_units=1, hidden_units=hidden_units,
-        output_units=n_classes, model_dir=self._model_dir)
-
-  def test_from_scratch_validate_summary(self):
-    hidden_units = (2, 2)
-    opt = mock_optimizer(
-        self, hidden_units=hidden_units)
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=hidden_units,
-        feature_columns=(self._fc_impl.numeric_column('age'),),
-        optimizer=opt,
-        model_dir=self._model_dir)
-    self.assertEqual(0, opt.minimize.call_count)
-
-    # Train for a few steps, then validate optimizer, summaries, and
-    # checkpoint.
-    num_steps = 5
-    summary_hook = _SummaryHook()
-    dnn_classifier.train(
-        input_fn=lambda: ({'age': [[10.]]}, [[1]]), steps=num_steps,
-        hooks=(summary_hook,))
-    self.assertEqual(1, opt.minimize.call_count)
-    _assert_checkpoint(
-        self, num_steps, input_units=1, hidden_units=hidden_units,
-        output_units=1, model_dir=self._model_dir)
-    summaries = summary_hook.summaries()
-    self.assertEqual(num_steps, len(summaries))
-    for summary in summaries:
-      summary_keys = [v.tag for v in summary.value]
-      self.assertIn(metric_keys.MetricKeys.LOSS, summary_keys)
-      self.assertIn(metric_keys.MetricKeys.LOSS_MEAN, summary_keys)
-
-  def test_binary_classification(self):
-    base_global_step = 100
-    hidden_units = (2, 2)
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),), base_global_step, self._model_dir)
-
-    # Uses identical numbers as DNNModelFnTest.test_one_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [-2.08] => probabilities = [0.889, 0.111]
-    # loss = -1. * log(0.111) = 2.19772100
-    expected_loss = 2.19772100
-    opt = mock_optimizer(
-        self, hidden_units=hidden_units, expected_loss=expected_loss)
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=hidden_units,
-        feature_columns=(self._fc_impl.numeric_column('age'),),
-        optimizer=opt,
-        model_dir=self._model_dir)
-    self.assertEqual(0, opt.minimize.call_count)
-
-    # Train for a few steps, then validate optimizer, summaries, and
-    # checkpoint.
-    num_steps = 5
-    summary_hook = _SummaryHook()
-    dnn_classifier.train(
-        input_fn=lambda: ({'age': [[10.]]}, [[1]]), steps=num_steps,
-        hooks=(summary_hook,))
-    self.assertEqual(1, opt.minimize.call_count)
-    summaries = summary_hook.summaries()
-    self.assertEqual(num_steps, len(summaries))
-    for summary in summaries:
-      _assert_simple_summary(
-          self,
-          {
-              metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
-              'dnn/dnn/hiddenlayer_0/fraction_of_zero_values': 0.,
-              'dnn/dnn/hiddenlayer_1/fraction_of_zero_values': .5,
-              'dnn/dnn/logits/fraction_of_zero_values': 0.,
-              metric_keys.MetricKeys.LOSS: expected_loss,
-          },
-          summary)
-    _assert_checkpoint(
-        self, base_global_step + num_steps, input_units=1,
-        hidden_units=hidden_units, output_units=1, model_dir=self._model_dir)
-
-  def test_binary_classification_float_labels(self):
-    base_global_step = 100
-    hidden_units = (2, 2)
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),), base_global_step, self._model_dir)
-
-    # Uses identical numbers as DNNModelFnTest.test_one_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [-2.08] => probabilities = [0.889, 0.111]
-    # loss = -0.8 * log(0.111) -0.2 * log(0.889) = 1.7817210
-    expected_loss = 1.7817210
-    opt = mock_optimizer(
-        self, hidden_units=hidden_units, expected_loss=expected_loss)
-    dnn_classifier = self._dnn_classifier_fn(
-        hidden_units=hidden_units,
-        feature_columns=(self._fc_impl.numeric_column('age'),),
-        optimizer=opt,
-        model_dir=self._model_dir)
-    self.assertEqual(0, opt.minimize.call_count)
-
-    # Train for a few steps, then validate optimizer, summaries, and
-    # checkpoint.
-    num_steps = 5
-    dnn_classifier.train(
-        input_fn=lambda: ({'age': [[10.]]}, [[0.8]]), steps=num_steps)
-    self.assertEqual(1, opt.minimize.call_count)
-
-  def test_multi_class(self):
-    n_classes = 3
-    base_global_step = 100
-    hidden_units = (2, 2)
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1., 1., .5], [-1., 1., .5]],
-          [.3, -.3, .0]),), base_global_step, self._model_dir)
-
-    # Uses identical numbers as DNNModelFnTest.test_multi_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [-2.08, 2.08, 1.19] => probabilities = [0.0109, 0.7011, 0.2879]
-    # loss = -1. * log(0.7011) = 0.35505795
-    expected_loss = 0.35505795
-    opt = mock_optimizer(
-        self, hidden_units=hidden_units, expected_loss=expected_loss)
-    dnn_classifier = self._dnn_classifier_fn(
-        n_classes=n_classes,
-        hidden_units=hidden_units,
-        feature_columns=(self._fc_impl.numeric_column('age'),),
-        optimizer=opt,
-        model_dir=self._model_dir)
-    self.assertEqual(0, opt.minimize.call_count)
-
-    # Train for a few steps, then validate optimizer, summaries, and
-    # checkpoint.
-    num_steps = 5
-    summary_hook = _SummaryHook()
-    dnn_classifier.train(
-        input_fn=lambda: ({'age': [[10.]]}, [[1]]), steps=num_steps,
-        hooks=(summary_hook,))
-    self.assertEqual(1, opt.minimize.call_count)
-    summaries = summary_hook.summaries()
-    self.assertEqual(num_steps, len(summaries))
-    for summary in summaries:
-      _assert_simple_summary(
-          self,
-          {
-              metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
-              'dnn/dnn/hiddenlayer_0/fraction_of_zero_values': 0.,
-              'dnn/dnn/hiddenlayer_1/fraction_of_zero_values': .5,
-              'dnn/dnn/logits/fraction_of_zero_values': 0.,
-              metric_keys.MetricKeys.LOSS: expected_loss,
-          },
-          summary)
-    _assert_checkpoint(
-        self, base_global_step + num_steps, input_units=1,
-        hidden_units=hidden_units, output_units=n_classes,
-        model_dir=self._model_dir)
-
-
-class BaseDNNRegressorTrainTest(object):
-
-  def __init__(self, dnn_regressor_fn, fc_impl=feature_column):
-    self._dnn_regressor_fn = dnn_regressor_fn
-    self._fc_impl = fc_impl
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_from_scratch_with_default_optimizer(self):
-    hidden_units = (2, 2)
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=hidden_units,
-        feature_columns=(self._fc_impl.numeric_column('age'),),
-        model_dir=self._model_dir)
-
-    # Train for a few steps, then validate final checkpoint.
-    num_steps = 5
-    dnn_regressor.train(
-        input_fn=lambda: ({'age': ((1,),)}, ((10,),)), steps=num_steps)
-    _assert_checkpoint(
-        self, num_steps, input_units=1, hidden_units=hidden_units,
-        output_units=1, model_dir=self._model_dir)
-
-  def test_from_scratch(self):
-    hidden_units = (2, 2)
-    opt = mock_optimizer(self, hidden_units=hidden_units)
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=hidden_units,
-        feature_columns=(self._fc_impl.numeric_column('age'),),
-        optimizer=opt,
-        model_dir=self._model_dir)
-    self.assertEqual(0, opt.minimize.call_count)
-
-    # Train for a few steps, then validate optimizer, summaries, and
-    # checkpoint.
-    num_steps = 5
-    summary_hook = _SummaryHook()
-    dnn_regressor.train(
-        input_fn=lambda: ({'age': ((1,),)}, ((5.,),)), steps=num_steps,
-        hooks=(summary_hook,))
-    self.assertEqual(1, opt.minimize.call_count)
-    _assert_checkpoint(
-        self, num_steps, input_units=1, hidden_units=hidden_units,
-        output_units=1, model_dir=self._model_dir)
-    summaries = summary_hook.summaries()
-    self.assertEqual(num_steps, len(summaries))
-    for summary in summaries:
-      summary_keys = [v.tag for v in summary.value]
-      self.assertIn(metric_keys.MetricKeys.LOSS, summary_keys)
-      self.assertIn(metric_keys.MetricKeys.LOSS_MEAN, summary_keys)
-
-  def test_one_dim(self):
-    """Asserts train loss for one-dimensional input and logits."""
-    base_global_step = 100
-    hidden_units = (2, 2)
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),), base_global_step, self._model_dir)
-
-    # Uses identical numbers as DNNModelFnTest.test_one_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [-2.08] => predictions = [-2.08]
-    # loss = (1 + 2.08)^2 = 9.4864
-    expected_loss = 9.4864
-    opt = mock_optimizer(
-        self, hidden_units=hidden_units, expected_loss=expected_loss)
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=hidden_units,
-        feature_columns=(self._fc_impl.numeric_column('age'),),
-        optimizer=opt,
-        model_dir=self._model_dir)
-    self.assertEqual(0, opt.minimize.call_count)
-
-    # Train for a few steps, then validate optimizer, summaries, and
-    # checkpoint.
-    num_steps = 5
-    summary_hook = _SummaryHook()
-    dnn_regressor.train(
-        input_fn=lambda: ({'age': [[10.]]}, [[1.]]), steps=num_steps,
-        hooks=(summary_hook,))
-    self.assertEqual(1, opt.minimize.call_count)
-    summaries = summary_hook.summaries()
-    self.assertEqual(num_steps, len(summaries))
-    for summary in summaries:
-      _assert_simple_summary(
-          self,
-          {
-              metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
-              'dnn/dnn/hiddenlayer_0/fraction_of_zero_values': 0.,
-              'dnn/dnn/hiddenlayer_1/fraction_of_zero_values': 0.5,
-              'dnn/dnn/logits/fraction_of_zero_values': 0.,
-              metric_keys.MetricKeys.LOSS: expected_loss,
-          },
-          summary)
-    _assert_checkpoint(
-        self, base_global_step + num_steps, input_units=1,
-        hidden_units=hidden_units, output_units=1, model_dir=self._model_dir)
-
-  def test_multi_dim(self):
-    """Asserts train loss for multi-dimensional input and logits."""
-    base_global_step = 100
-    hidden_units = (2, 2)
-    create_checkpoint(
-        (([[.6, .5], [-.6, -.5]], [.1, -.1]), ([[1., .8], [-.8, -1.]],
-                                               [.2, -.2]),
-         ([[-1., 1., .5], [-1., 1., .5]],
-          [.3, -.3, .0]),), base_global_step, self._model_dir)
-    input_dimension = 2
-    label_dimension = 3
+from tensorflow_estimator.python.estimator.canned import dnn_testing_utils
 
-    # Uses identical numbers as
-    # DNNModelFnTest.test_multi_dim_input_multi_dim_logits.
-    # See that test for calculation of logits.
-    # logits = [[-0.48, 0.48, 0.39]]
-    # loss = (1+0.48)^2 + (-1-0.48)^2 + (0.5-0.39)^2 = 4.3929
-    expected_loss = 4.3929
-    opt = mock_optimizer(
-        self, hidden_units=hidden_units, expected_loss=expected_loss)
-    dnn_regressor = self._dnn_regressor_fn(
-        hidden_units=hidden_units,
-        feature_columns=[
-            self._fc_impl.numeric_column('age', shape=[input_dimension])
-        ],
-        label_dimension=label_dimension,
-        optimizer=opt,
-        model_dir=self._model_dir)
-    self.assertEqual(0, opt.minimize.call_count)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+dnn_testing_utils.__all__ = [
+    s for s in dir(dnn_testing_utils) if not s.startswith('__')
+]
 
-    # Train for a few steps, then validate optimizer, summaries, and
-    # checkpoint.
-    num_steps = 5
-    summary_hook = _SummaryHook()
-    dnn_regressor.train(
-        input_fn=lambda: ({'age': [[10., 8.]]}, [[1., -1., 0.5]]),
-        steps=num_steps,
-        hooks=(summary_hook,))
-    self.assertEqual(1, opt.minimize.call_count)
-    summaries = summary_hook.summaries()
-    self.assertEqual(num_steps, len(summaries))
-    for summary in summaries:
-      _assert_simple_summary(
-          self,
-          {
-              metric_keys.MetricKeys.LOSS_MEAN: expected_loss / label_dimension,
-              'dnn/dnn/hiddenlayer_0/fraction_of_zero_values': 0.,
-              'dnn/dnn/hiddenlayer_1/fraction_of_zero_values': 0.5,
-              'dnn/dnn/logits/fraction_of_zero_values': 0.,
-              metric_keys.MetricKeys.LOSS: expected_loss,
-          },
-          summary)
-    _assert_checkpoint(
-        self, base_global_step + num_steps, input_units=input_dimension,
-        hidden_units=hidden_units, output_units=label_dimension,
-        model_dir=self._model_dir)
+from tensorflow_estimator.python.estimator.canned.dnn_testing_utils import *
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 06593f95201e23f58a6fd812c0d86ba1ba0b64d5..6b8fd235f704a5727a439da9c8bb577ff72b784e 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,1590 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Abstractions for the head(s) of a model."""
+"""head python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
-import collections
-
-import six
-
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.summary import summary
-from tensorflow.python.training import training_util
-from tensorflow.python.util import function_utils
-
-_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-
-# The above default is defined by TF Serving, but these next three are just
-# a local convention without any special meaning.
-_CLASSIFY_SERVING_KEY = 'classification'
-_REGRESS_SERVING_KEY = 'regression'
-_PREDICT_SERVING_KEY = 'predict'
-
-
-# A LossSpec contains
-# * a scalar `Tensor` representing reduced weighted training loss
-# * a `Tensor` representing the unreduced unweighted loss
-# * a `Tensor` representing the example weights
-# * possibly processed labels (e.g. vocabulary lookup, shape manipulation, etc)
-LossSpec = collections.namedtuple(
-    'LossSpec', ['training_loss', 'unreduced_loss', 'weights',
-                 'processed_labels'])
-
-
-def _summary_key(head_name, val):
-  return '%s/%s' % (val, head_name) if head_name else val
-
-
-def _create_eval_metrics_tuple(fn, kwargs):
-  """Creates TPU eval metrics tuple.
-
-  Helper function to make eval_metric tuple (eval_metric_fn, fn_kwargs) used
-  by `TPUEstimator`. TPUEstimator requires that `eval_metric_fn` take
-  exclusively Tensor arguments. This helper can help create such a function from
-  a more generic function that can take both Tensor and non-Tensor arguments.
-
-  Args:
-    fn: A eval_metric_fn that takes both Tensor and non-Tensor arguments.
-        This function must return a dict of form
-        {'metric name': (metric_tensor, eval_op)}
-    kwargs: Dict of arguments for `fn`.
-
-  Returns:
-    `eval_metric` tuple that can be passed to a `model_fn._TPUEstimatorSpec`.
-  """
-  tensor_kwargs = {}
-  nontensor_kwargs = {}
-  for k, v in six.iteritems(kwargs):
-    if tensor_util.is_tensor(v):
-      tensor_kwargs[k] = v
-    else:
-      nontensor_kwargs[k] = v
-  def _fn(**tensors):
-    return fn(**dict(nontensor_kwargs, **tensors))
-  return (_fn, tensor_kwargs)
-
-
-class _Head(object):
-  """Interface for the head/top of a model.
-
-  Given logits (or output of a hidden layer), a Head knows how to compute
-  predictions, loss, train_op, metrics and export outputs. It is meant to:
-
-  1. Simplify writing model_fn and to make model_fn more configurable
-  2. Support wide range of machine learning models. Since most heads can work
-     with logits, they can support DNN, RNN, Wide, Wide&Deep,
-     Global objectives, Gradient boosted trees and many other types
-     of machine learning models.
-
-  Common usage:
-  Here is simplified model_fn to build a DNN regression model.
-    ```python
-    def _my_dnn_model_fn(features, labels, mode, params, config=None):
-      # Optionally your callers can pass head to model_fn as a param.
-      head = tf.contrib.estimator.regression_head(...)
-      inputs = tf.feature_column.input_layer(features, ...)
-      hidden_layer0 = tf.layers.dense(
-          inputs, units=1000, activation=tf.nn.relu)
-      hidden_layer1 = tf.layers.dense(
-          hidden_layer0, units=500, activation=tf.nn.relu)
-      logits = tf.layers.dense(
-          hidden_layer1, units=head.logits_dimension, activation=None)
-
-      return head.create_estimator_spec(
-          features=features,
-          labels=labels,
-          mode=mode,
-          logits=logits,
-          optimizer=optimizer)
-    ```
-
-  There are cases where computing and applying gradients can not be meaningfully
-  captured with optimizer or train_op_fn we support (for example, with sync
-  optimizer). In such case, you can take the responsibility on your own. Here is
-  a common use case,
-    ```python
-    estimator_spec = head.create_estimator_spec(
-        features=features,
-        labels=labels,
-        mode=mode,
-        logits=logits,
-        train_op_fn=lambda _: tf.no_op())
-    if mode == model_fn.ModeKeys.TRAIN:
-      optimizer = ...
-      sync = tf.train.SyncReplicasOptimizer(opt=optimizer, ...)
-      update_op = sync.minimize(
-          estimator_spec.loss, global_step=tf.get_global_step())
-      hooks = [sync.make_session_run_hook(is_chief)]
-      ... update train_op and hooks in EstimatorSpec and return
-    ```
-  """
-  __metaclass__ = abc.ABCMeta
-
-  @abc.abstractproperty
-  def name(self):
-    """The name of this head.
-
-    Returns:
-      A string.
-    """
-    raise NotImplementedError('Calling an abstract method.')
-
-  @abc.abstractproperty
-  def logits_dimension(self):
-    """Size of the last dimension of the logits `Tensor`.
-
-    Typically, logits is of shape `[batch_size, logits_dimension]`.
-
-    Returns:
-      The expected size of the `logits` tensor.
-    """
-    raise NotImplementedError('Calling an abstract method.')
-
-  @abc.abstractmethod
-  def create_loss(self, features, mode, logits, labels):
-    """Returns a loss Tensor from provided logits.
-
-    This function is designed to be used by framework developers.  Almost all
-    users should use create_estimator_spec(), which calls this internally.
-    `mode` and `features` are most likely not used, but some Head
-    implementations may require them.
-
-    Args:
-      features: Input `dict` of `Tensor` objects.
-      mode: Estimator's `ModeKeys`.
-      logits: logits `Tensor` to be used for loss construction.
-      labels: Labels `Tensor`, or `dict` of same.
-
-    Returns:
-      A LossSpec that contains
-      * the scalar `Tensor` representing reduced weighted training loss
-      * the `Tensor` representing the unreduced unweighted loss
-      * the `Tensor` representing the example weights
-      * possibly processed labels (e.g. vocabulary lookup, shape manipulation,
-        etc.)
-
-      To be extendable in the future.
-    """
-    raise NotImplementedError('Calling an abstract method.')
-
-  # TODO(b/65403806): By default, collect regularization_losses from
-  # GraphKeys.REGULARIZATION_LOSSES collection.
-  def create_estimator_spec(
-      self, features, mode, logits, labels=None, optimizer=None,
-      train_op_fn=None, regularization_losses=None):
-    """Returns `EstimatorSpec` that a model_fn can return.
-
-    Please note that,
-    + All args must be passed via name.
-
-    Args:
-      features: Input `dict` of `Tensor` or `SparseTensor` objects.
-      mode: Estimator's `ModeKeys`.
-      logits: logits `Tensor` to be used by the head.
-      labels: Labels `Tensor`, or `dict` of same.
-      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
-        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
-        updates variables and increments `global_step`.
-      train_op_fn: Function that takes a scalar loss `Tensor` and returns an op
-        to optimize the model with the loss in TRAIN mode. Used if `optimizer`
-        is `None`. Exactly one of `train_op_fn` and `optimizer` must be set in
-        TRAIN mode. None is allowed in other modes. If you want to optimize loss
-        yourself you can pass `lambda _: tf.no_op()` and then use
-        EstimatorSpec.loss to compute and apply gradients.
-      regularization_losses: A list of additional scalar losses to be added to
-        the training loss, such as regularization losses.
-
-    Returns:
-      `EstimatorSpec`.
-    """
-    try:
-      tpu_estimator_spec = (
-          self._create_tpu_estimator_spec(
-              features, mode, logits, labels, optimizer, train_op_fn,
-              regularization_losses))
-      return tpu_estimator_spec.as_estimator_spec()
-    except NotImplementedError:
-      # Not all subclasses of _Head will have implemented
-      # _create_tpu_estimator_spec. If it is implemented, we can use it to
-      # create our `EstimatorSpec` here.
-      raise NotImplementedError(
-          'Subclasses of _Head must implement `create_estimator_spec()` or '
-          '_create_tpu_estimator_spec().')
-
-  def _create_tpu_estimator_spec(
-      self, features, mode, logits, labels=None, optimizer=None,
-      train_op_fn=None, regularization_losses=None):
-    """Returns `model_fn._TPUEstimatorSpec` that a model_fn can return.
-
-    Args:
-      features: Input `dict` of `Tensor` or `SparseTensor` objects.
-      mode: Estimator's `ModeKeys`.
-      logits: logits `Tensor` to be used by the head.
-      labels: Labels `Tensor`, or `dict` of same.
-      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
-        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
-        updates variables and increments `global_step`.
-      train_op_fn: Function that takes a scalar loss `Tensor` and returns an op
-        to optimize the model with the loss in TRAIN mode. Used if `optimizer`
-        is `None`. Exactly one of `train_op_fn` and `optimizer` must be set in
-        TRAIN mode. None is allowed in other modes. If you want to optimize loss
-        yourself you can pass `lambda _: tf.no_op()` and then use
-        EstimatorSpec.loss to compute and apply gradients.
-      regularization_losses: A list of additional scalar losses to be added to
-        the training loss, such as regularization losses.
-
-    Returns:
-      A `model_fn._TPUEstimatorSpec' instance.
-    """
-    raise NotImplementedError(
-        'TPUEstimatorSpec not available for this model head.')
-
-
-def _check_dense_labels_match_logits_and_reshape(
-    labels, logits, expected_labels_dimension):
-  """Checks that labels shape matches logits and reshapes if needed.
-
-  Consider logits of shape [D0, D1, ... DN, logits_dimension]. Then labels
-  shape must be [D0, D1, ... DN, expected_labels_dimension].
-  If expected_labels_dimension=1, labels could be [D0, D1, ... DN] and this
-  method reshapes them to [D0, D1, ... DN, 1].
-
-  Args:
-    labels: labels Tensor.
-    logits: logits Tensor.
-    expected_labels_dimension: Integer.
-  Returns:
-    Validated and reshaped labels Tensor.
-  Raises:
-    ValueError: If labels is a SparseTensor.
-    ValueError: If labels shape is statically defined and fails validation.
-    OpError: If labels shape is not statically defined and fails validation.
-  """
-  if labels is None:
-    raise ValueError(
-        'You must provide a labels Tensor. Given: None. '
-        'Suggested troubleshooting steps: Check that your data contain '
-        'your label feature. Check that your input_fn properly parses and '
-        'returns labels.')
-  with ops.name_scope(None, 'labels', (labels, logits)) as scope:
-    labels = sparse_tensor.convert_to_tensor_or_sparse_tensor(labels)
-    if isinstance(labels, sparse_tensor.SparseTensor):
-      raise ValueError(
-          'SparseTensor labels are not supported. '
-          'labels must be a Tensor of shape [D0, D1, ..., DN, %s], '
-          'e.g. [batch_size, %s]. '
-          'Suggested Fix (1): Check the label feature in your data. '
-          'Each example must contain %s value(s). If not, your choice of label '
-          'was probably incorrect. '
-          'Suggested Fix (2): In your input_fn, use '
-          'tf.sparse_tensor_to_dense() to turn labels into a Tensor.'
-          '' % (expected_labels_dimension, expected_labels_dimension,
-                expected_labels_dimension))
-    if (labels.shape.ndims is not None and logits.shape.ndims is not None and
-        labels.shape.ndims == logits.shape.ndims - 1):
-      labels = array_ops.expand_dims(labels, -1)
-    labels_shape = array_ops.shape(labels)
-    logits_shape = array_ops.shape(logits)
-    err_msg = (
-        'labels shape must be [D0, D1, ... DN, {}]. '
-        'Suggested Fix: check your n_classes argument to the estimator '
-        'and/or the shape of your label.'.format(expected_labels_dimension))
-    assert_rank = check_ops.assert_rank_at_least(labels, 2, message=err_msg)
-    with ops.control_dependencies([assert_rank]):
-      static_shape = labels.shape
-      if static_shape.ndims is not None:
-        dim1 = static_shape[-1]
-        if (dim1 is not None) and (dim1 != expected_labels_dimension):
-          raise ValueError(
-              'Mismatched label shape. '
-              'Expected labels dimension=%s.  Received %s. '
-              'Suggested Fix:'
-              'If your classifier expects one-hot encoding label,'
-              'check your n_classes argument to the estimator '
-              'and/or the shape of your label. '
-              'Otherwise, check the shape of your label.' %
-              (expected_labels_dimension, dim1))
-      expected_labels_shape = array_ops.concat(
-          [logits_shape[:-1], [expected_labels_dimension]], axis=0)
-      assert_dimension = check_ops.assert_equal(
-          expected_labels_shape, labels_shape, message=err_msg,
-          data=['expected_labels_shape: ', expected_labels_shape,
-                'labels_shape: ', labels_shape])
-      with ops.control_dependencies([assert_dimension]):
-        return array_ops.identity(labels, name=scope)
-
-
-def _get_weights_and_check_match_logits(
-    features, weight_column, logits, allow_per_logit_weights=False):
-  """Fetches weights from features and checks that the shape matches logits.
-
-  Consider logits of shape [D0, D1, ... DN, logits_dimension]. Weights shape
-  can be either:
-  * [D0, D1, ... DN, logits_dimension] if `allow_per_logit_weights=True`.
-  * [D0, D1, ... DN, 1]
-  * [D0, D1, ... DN]: In this case, weights is reshaped into
-    [D0, D1, ... DN, 1] to work with weight broadcasting rules.
-
-  Args:
-    features: The features dict that contains weights.
-    weight_column: The weight column. If not given, this method returns 1.
-    logits: logits Tensor.
-    allow_per_logit_weights: Boolean. Whether we allow weights along the logits
-      dimension, namely shape `[D0, D1, ... DN, logits_dimension]`.
-  Returns:
-    Validated and reshaped weights Tensor.
-  Raises:
-    ValueError: If the weights `Tensor` cannot be cast into float.
-  """
-  if allow_per_logit_weights:
-    err_msg = (
-        'weights shape must be [D0, D1, ... DN], [D0, D1, ... DN, 1] or '
-        '[D0, D1, ... DN, logits_dimension]')
-  else:
-    err_msg = (
-        'weights shape must be [D0, D1, ... DN] or [D0, D1, ... DN, 1]')
-  with ops.name_scope(
-      None, 'weights',
-      values=tuple(six.itervalues(features)) + (logits,)) as scope:
-    # Fetch the weights.
-    if weight_column is None:
-      return 1.
-    if isinstance(weight_column, six.string_types):
-      weight_column = feature_column_lib.numeric_column(
-          key=weight_column, shape=(1,))
-    if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
-      raise TypeError('Weight column must be either a string or _NumericColumn.'
-                      ' Given type: {}.'.format(type(weight_column)))
-    weights = weight_column._get_dense_tensor(  # pylint: disable=protected-access
-        feature_column_lib._LazyBuilder(features))  # pylint: disable=protected-access
-    if not (weights.dtype.is_floating or weights.dtype.is_integer):
-      raise ValueError('Weight column should be castable to float. '
-                       'Given dtype: {}'.format(weights.dtype))
-    weights = math_ops.to_float(weights, name='weights')
-
-    # Validate the weights shape.
-    weights_shape = array_ops.shape(weights, name='weights_shape')
-    logits_shape = array_ops.shape(logits, name='logits_shape')
-    if (weights.shape.ndims is not None and logits.shape.ndims is not None and
-        weights.shape.ndims == logits.shape.ndims - 1):
-      assert_dimension = check_ops.assert_equal(
-          logits_shape[:-1], weights_shape, message=err_msg,
-          data=['logits_shape: ', logits_shape,
-                'weights_shape: ', weights_shape])
-      with ops.control_dependencies([assert_dimension]):
-        return array_ops.expand_dims(weights, -1, name=scope)
-    supported_weights_shape = array_ops.concat([logits_shape[:-1], [1]], axis=0)
-    if allow_per_logit_weights:
-      condition = math_ops.reduce_any(
-          [math_ops.reduce_all(math_ops.equal(logits_shape, weights_shape)),
-           math_ops.reduce_all(math_ops.equal(
-               supported_weights_shape, weights_shape))])
-      assert_dimension = control_flow_ops.Assert(
-          condition=condition,
-          data=[err_msg, 'logits_shape: ', logits_shape,
-                'weights_shape: ', weights_shape])
-    else:
-      assert_dimension = check_ops.assert_equal(
-          supported_weights_shape, weights_shape, message=err_msg,
-          data=['logits_shape: ', logits_shape,
-                'weights_shape: ', weights_shape])
-    with ops.control_dependencies([assert_dimension]):
-      return array_ops.identity(weights, name=scope)
-
-
-def _check_logits_final_dim(logits, expected_logits_dimension):
-  """Checks that logits shape is [D0, D1, ... DN, logits_dimension]."""
-  with ops.name_scope(None, 'logits', (logits,)) as scope:
-    logits = math_ops.to_float(logits)
-    logits_shape = array_ops.shape(logits)
-    assert_rank = check_ops.assert_rank_at_least(
-        logits, 2, data=[logits_shape],
-        message='logits shape must be [D0, D1, ... DN, logits_dimension]')
-    with ops.control_dependencies([assert_rank]):
-      static_shape = logits.shape
-      if static_shape.ndims is not None and static_shape[-1] is not None:
-        if static_shape[-1] != expected_logits_dimension:
-          raise ValueError(
-              'logits shape must be [D0, D1, ... DN, logits_dimension], '
-              'got %s.' % (static_shape,))
-        return logits
-      assert_dimension = check_ops.assert_equal(
-          expected_logits_dimension, logits_shape[-1], data=[logits_shape],
-          message='logits shape must be [D0, D1, ... DN, logits_dimension]')
-      with ops.control_dependencies([assert_dimension]):
-        return array_ops.identity(logits, name=scope)
-
-
-def _validate_loss_fn_args(loss_fn):
-  """Validates loss_fn arguments.
-
-  Required arguments: labels, logits.
-  Optional arguments: features.
-
-  Args:
-    loss_fn: The loss function.
-  Raises:
-    ValueError: If the signature is unexpected.
-  """
-  loss_fn_args = function_utils.fn_args(loss_fn)
-  for required_arg in ['labels', 'logits']:
-    if required_arg not in loss_fn_args:
-      raise ValueError(
-          'loss_fn must contain argument: {}. '
-          'Given arguments: {}'.format(required_arg, loss_fn_args))
-  invalid_args = list(set(loss_fn_args) - set(['labels', 'logits', 'features']))
-  if invalid_args:
-    raise ValueError('loss_fn has unexpected args: {}'.format(invalid_args))
-
-
-def _call_loss_fn(loss_fn, labels, logits, features, expected_loss_dim=1):
-  """Calls loss_fn and checks the returned shape.
-
-  Args:
-    loss_fn: The loss function.
-    labels: Processed labels Tensor.
-    logits: Logits Tensor of shape [D0, D1, ... DN, logits_dimension].
-    features: Features dict.
-    expected_loss_dim: The expected last dimension of loss Tensor.
-  Returns:
-    Loss Tensor with shape [D0, D1, ... DN, expected_loss_dim].
-  """
-  loss_fn_args = function_utils.fn_args(loss_fn)
-  kwargs = {}
-  if 'features' in loss_fn_args:
-    kwargs['features'] = features
-  with ops.name_scope(
-      None, 'call_loss_fn',
-      values=[labels, logits] + list(six.itervalues(features))):
-    unweighted_loss = loss_fn(labels=labels, logits=logits, **kwargs)
-    logits_shape = array_ops.shape(logits, name='logits_shape')
-    expected_loss_shape = array_ops.concat(
-        [logits_shape[:-1], [expected_loss_dim]], axis=0,
-        name='expected_loss_shape')
-    loss_shape = array_ops.shape(unweighted_loss, name='loss_shape')
-    check_loss_shape_op = control_flow_ops.Assert(
-        math_ops.reduce_all(math_ops.equal(loss_shape, expected_loss_shape)),
-        data=[
-            'loss_fn must return Tensor of shape '
-            '[D0, D1, ... DN, {}]. '.format(expected_loss_dim),
-            'logits_shape: ', logits_shape, 'loss_shape: ', loss_shape],
-        name='check_loss_shape')
-    with ops.control_dependencies([check_loss_shape_op]):
-      return array_ops.identity(unweighted_loss)
-
-
-def _indicator_labels_mean(labels, weights=None, name=None):
-  with ops.name_scope(name, 'labels_mean', (labels, weights)) as scope:
-    labels = math_ops.to_float(labels, name='labels')
-    if weights is not None:
-      weights = weights_broadcast_ops.broadcast_weights(weights, labels)
-    return metrics_lib.mean(labels, weights=weights, name=scope)
-
-
-def _classification_output(scores, n_classes, label_vocabulary=None):
-  batch_size = array_ops.shape(scores)[0]
-  if label_vocabulary:
-    export_class_list = label_vocabulary
-  else:
-    export_class_list = string_ops.as_string(math_ops.range(n_classes))
-  export_output_classes = array_ops.tile(
-      input=array_ops.expand_dims(input=export_class_list, axis=0),
-      multiples=[batch_size, 1])
-  return export_output.ClassificationOutput(
-      scores=scores,
-      # `ClassificationOutput` requires string classes.
-      classes=export_output_classes)
-
-
-def _accuracy_baseline(labels_mean):
-  """Return accuracy baseline based on labels mean.
-
-  This is the best the model could do by always predicting one class.
-
-  Args:
-    labels_mean: Tuple of value and update op.
-
-  Returns:
-    Tuple of value and update op.
-  """
-  with ops.name_scope(None, 'accuracy_baseline', labels_mean):
-    value, update_op = labels_mean
-    return (
-        math_ops.maximum(value, 1. - value, name='value'),
-        math_ops.maximum(update_op, 1 - update_op, name='update_op'))
-
-
-def _predictions_mean(predictions, weights=None, name=None):
-  with ops.name_scope(
-      name, 'predictions_mean', (predictions, weights)) as scope:
-    predictions = math_ops.to_float(predictions, name='predictions')
-    if weights is not None:
-      weights = weights_broadcast_ops.broadcast_weights(weights, predictions)
-    return metrics_lib.mean(predictions, weights=weights, name=scope)
-
-
-def _auc(labels, predictions, weights=None, curve='ROC', name=None):
-  with ops.name_scope(name, 'auc', (predictions, labels, weights)) as scope:
-    predictions = math_ops.to_float(predictions, name='predictions')
-    if weights is not None:
-      weights = weights_broadcast_ops.broadcast_weights(weights, predictions)
-    return metrics_lib.auc(
-        labels=labels, predictions=predictions, weights=weights, curve=curve,
-        name=scope)
-
-
-def _accuracy_at_threshold(labels, predictions, weights, threshold, name=None):
-  with ops.name_scope(
-      name, 'accuracy_at_%s' % threshold,
-      (predictions, labels, weights, threshold)) as scope:
-    threshold_predictions = math_ops.to_float(
-        math_ops.greater_equal(predictions, threshold))
-    return metrics_lib.accuracy(
-        labels=labels, predictions=threshold_predictions, weights=weights,
-        name=scope)
-
-
-def _precision_at_threshold(labels, predictions, weights, threshold, name=None):
-  with ops.name_scope(
-      name, 'precision_at_%s' % threshold,
-      (predictions, labels, weights, threshold)) as scope:
-    precision_tensor, update_op = metrics_lib.precision_at_thresholds(
-        labels=labels, predictions=predictions, thresholds=(threshold,),
-        weights=weights, name=scope)
-    return array_ops.squeeze(precision_tensor), array_ops.squeeze(update_op)
-
-
-def _recall_at_threshold(labels, predictions, weights, threshold, name=None):
-  with ops.name_scope(
-      name, 'recall_at_%s' % threshold,
-      (predictions, labels, weights, threshold)) as scope:
-    precision_tensor, update_op = metrics_lib.recall_at_thresholds(
-        labels=labels, predictions=predictions, thresholds=(threshold,),
-        weights=weights, name=scope)
-    return array_ops.squeeze(precision_tensor), array_ops.squeeze(update_op)
-
-
-def _multi_class_head_with_softmax_cross_entropy_loss(
-    n_classes,
-    weight_column=None,
-    label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM,
-    loss_fn=None,
-    name=None):
-  """Creates a '_Head' for multi class classification.
-
-  The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`.
-  In many applications, the shape is `[batch_size, n_classes]`.
-
-  `labels` must be a dense `Tensor` with shape matching `logits`, namely
-  `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
-  `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
-  `labels` must be an integer `Tensor` with values specifying the class index.
-
-  If `weight_column` is specified, weights must be of shape
-  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
-
-  The loss is the weighted sum over the input dimensions. Namely, if the input
-  labels have shape `[batch_size, 1]`, the loss is the weighted sum over
-  `batch_size`.
-
-  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
-  `(labels, logits, features)` as arguments and returns unreduced loss with
-  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support integer `labels` with
-  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
-  the input labels before passing them to `loss_fn`.
-
-  Args:
-    n_classes: Number of classes, must be greater than 2 (for 2 classes, use
-      `_BinaryLogisticHeadWithSigmoidCrossEntropyLoss`).
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.
-    label_vocabulary: A list or tuple of strings representing possible label
-      values. If it is not given, that means labels are already encoded as an
-      integer within [0, n_classes). If given, labels must be of string type and
-      have any value in `label_vocabulary`. Note that errors will be raised if
-      `label_vocabulary` is not provided but labels are strings.
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
-    loss_fn: Optional loss function.
-    name: name of the head. If provided, summary and metrics keys will be
-      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-
-  Returns:
-    An instance of `_Head` for multi class classification.
-
-  Raises:
-    ValueError: If `n_classes`, `label_vocabulary` or `loss_reduction` is
-      invalid.
-  """
-  if label_vocabulary is not None and not isinstance(label_vocabulary,
-                                                     (list, tuple)):
-    raise ValueError(
-        'label_vocabulary should be a list or a tuple. Given type: {}'.format(
-            type(label_vocabulary)))
-  if (loss_reduction not in losses.Reduction.all() or
-      loss_reduction == losses.Reduction.NONE):
-    raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
-  if loss_fn:
-    _validate_loss_fn_args(loss_fn)
-  return _MultiClassHeadWithSoftmaxCrossEntropyLoss(
-      n_classes=n_classes,
-      weight_column=weight_column,
-      label_vocabulary=label_vocabulary,
-      loss_reduction=loss_reduction,
-      loss_fn=loss_fn,
-      name=name)
-
-
-class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
-  """See `_multi_class_head_with_softmax_cross_entropy_loss`."""
-
-  def __init__(self,
-               n_classes,
-               weight_column=None,
-               label_vocabulary=None,
-               loss_reduction=losses.Reduction.SUM,
-               loss_fn=None,
-               name=None):
-    if (n_classes is None) or (n_classes <= 2):
-      raise ValueError('n_classes must be > 2: %s.' % n_classes)
-    self._n_classes = n_classes
-    self._weight_column = weight_column
-    self._label_vocabulary = label_vocabulary
-    self._loss_reduction = loss_reduction
-    self._loss_fn = loss_fn
-    self._name = name
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def logits_dimension(self):
-    return self._n_classes
-
-  def _eval_metric_ops(
-      self, labels, class_ids, weights, unreduced_loss, regularization_loss):
-    """Returns the Eval metric ops."""
-    with ops.name_scope(
-        None, 'metrics',
-        (labels, class_ids, weights, unreduced_loss, regularization_loss)):
-      keys = metric_keys.MetricKeys
-      metric_ops = {
-          # Estimator already adds a metric for loss.
-          # TODO(xiejw): Any other metrics?
-          _summary_key(self._name, keys.LOSS_MEAN):
-              metrics_lib.mean(
-                  values=unreduced_loss,
-                  weights=weights,
-                  name=keys.LOSS_MEAN),
-          _summary_key(self._name, keys.ACCURACY):
-              metrics_lib.accuracy(
-                  labels=labels,
-                  predictions=class_ids,
-                  weights=weights,
-                  name=keys.ACCURACY),
-      }
-      if regularization_loss is not None:
-        metric_ops[_summary_key(self._name, keys.LOSS_REGULARIZATION)] = (
-            metrics_lib.mean(
-                values=regularization_loss,
-                name=keys.LOSS_REGULARIZATION))
-    return metric_ops
-
-  def _label_ids(self, labels):
-    """Converts labels to integer id space."""
-    if self._label_vocabulary is None:
-      if not labels.dtype.is_integer:
-        raise ValueError('Labels dtype should be integer. Instead got {}.'.
-                         format(labels.dtype))
-      label_ids = labels
-    else:
-      if labels.dtype != dtypes.string:
-        raise ValueError('Labels dtype should be string if there is a '
-                         'vocabulary. Instead got {}'.format(labels.dtype))
-      label_ids = lookup_ops.index_table_from_tensor(
-          vocabulary_list=tuple(self._label_vocabulary),
-          name='class_id_lookup').lookup(labels)
-    return _assert_range(label_ids, self._n_classes)
-
-  def create_loss(self, features, mode, logits, labels):
-    """See `Head`."""
-    del mode  # Unused for this head.
-    logits = ops.convert_to_tensor(logits)
-    labels = _check_dense_labels_match_logits_and_reshape(
-        labels=labels, logits=logits, expected_labels_dimension=1)
-    label_ids = self._label_ids(labels)
-    if self._loss_fn:
-      unweighted_loss = _call_loss_fn(
-          loss_fn=self._loss_fn, labels=label_ids, logits=logits,
-          features=features, expected_loss_dim=1)
-    else:
-      unweighted_loss = losses.sparse_softmax_cross_entropy(
-          labels=label_ids, logits=logits, reduction=losses.Reduction.NONE)
-      # Restore the squeezed dim, so unweighted_loss matches the weights shape.
-      unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=-1)
-    weights = _get_weights_and_check_match_logits(
-        features=features, weight_column=self._weight_column, logits=logits)
-    training_loss = losses.compute_weighted_loss(
-        unweighted_loss, weights=weights, reduction=self._loss_reduction)
-    return LossSpec(
-        training_loss=training_loss,
-        unreduced_loss=unweighted_loss,
-        weights=weights,
-        processed_labels=label_ids)
-
-  def _create_tpu_estimator_spec(
-      self, features, mode, logits, labels=None, optimizer=None,
-      train_op_fn=None, regularization_losses=None):
-    """Returns a `model_fn._TPUEstimatorSpec`.
-
-    Args:
-      features: Input `dict` of `Tensor` or `SparseTensor` objects.
-      mode: Estimator's `ModeKeys`.
-      logits: logits `Tensor` with shape `[D0, D1, ... DN, logits_dimension]`.
-        For many applications, the shape is `[batch_size, logits_dimension]`.
-      labels: Labels integer or string `Tensor` with shape matching `logits`,
-        namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is
-        required argument when `mode` equals `TRAIN` or `EVAL`.
-      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
-        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
-        updates variables and increments `global_step`.
-      train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Used if `optimizer` is `None`.
-      regularization_losses: A list of additional scalar losses to be added to
-        the training loss, such as regularization losses. These losses are
-        usually expressed as a batch average, so for best results users need to
-        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
-        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
-        avoid scaling errors.
-    Returns:
-      A `model_fn._TPUEstimatorSpec` instance.
-    Raises:
-      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
-        mode, or if both are set.
-    """
-    with ops.name_scope(self._name, 'head'):
-      logits = _check_logits_final_dim(logits, self.logits_dimension)
-
-      # Predict.
-      pred_keys = prediction_keys.PredictionKeys
-      with ops.name_scope(None, 'predictions', (logits,)):
-        # class_ids's shape is [D0, D1, ... DN].
-        class_ids = math_ops.argmax(logits, axis=-1, name=pred_keys.CLASS_IDS)
-        class_ids = array_ops.expand_dims(class_ids, axis=-1)
-        if self._label_vocabulary:
-          table = lookup_ops.index_to_string_table_from_tensor(
-              vocabulary_list=self._label_vocabulary,
-              name='class_string_lookup')
-          classes = table.lookup(class_ids)
-        else:
-          classes = string_ops.as_string(class_ids, name='str_classes')
-
-        probabilities = nn.softmax(logits, name=pred_keys.PROBABILITIES)
-        predictions = {
-            pred_keys.LOGITS: logits,
-            pred_keys.PROBABILITIES: probabilities,
-            # Expand to [batch_size, 1]
-            pred_keys.CLASS_IDS: class_ids,
-            pred_keys.CLASSES: classes,
-        }
-      if mode == model_fn.ModeKeys.PREDICT:
-        classifier_output = _classification_output(
-            scores=probabilities, n_classes=self._n_classes,
-            label_vocabulary=self._label_vocabulary)
-        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
-            mode=model_fn.ModeKeys.PREDICT,
-            predictions=predictions,
-            export_outputs={
-                _DEFAULT_SERVING_KEY: classifier_output,
-                _CLASSIFY_SERVING_KEY: classifier_output,
-                _PREDICT_SERVING_KEY: export_output.PredictOutput(predictions)
-            })
-
-      training_loss, unreduced_loss, weights, label_ids = self.create_loss(
-          features=features, mode=mode, logits=logits, labels=labels)
-      if regularization_losses:
-        regularization_loss = math_ops.add_n(regularization_losses)
-        regularized_training_loss = math_ops.add_n(
-            [training_loss, regularization_loss])
-      else:
-        regularization_loss = None
-        regularized_training_loss = training_loss
-      # Eval.
-      if mode == model_fn.ModeKeys.EVAL:
-        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
-            mode=model_fn.ModeKeys.EVAL,
-            predictions=predictions,
-            loss=regularized_training_loss,
-            eval_metrics=_create_eval_metrics_tuple(self._eval_metric_ops, {
-                'labels': label_ids,
-                'class_ids': class_ids,
-                'weights': weights,
-                'unreduced_loss': unreduced_loss,
-                'regularization_loss': regularization_loss
-            }))
-
-      # Train.
-      if optimizer is not None:
-        if train_op_fn is not None:
-          raise ValueError('train_op_fn and optimizer cannot both be set.')
-        train_op = optimizer.minimize(
-            regularized_training_loss,
-            global_step=training_util.get_global_step())
-      elif train_op_fn is not None:
-        train_op = train_op_fn(regularized_training_loss)
-      else:
-        raise ValueError('train_op_fn and optimizer cannot both be None.')
-      train_op = _append_update_ops(train_op)
-      # Only summarize mean_loss for SUM reduction to preserve backwards
-      # compatibility. Otherwise skip it to avoid unnecessary computation.
-      if self._loss_reduction == losses.Reduction.SUM:
-        example_weight_sum = math_ops.reduce_sum(
-            weights * array_ops.ones_like(unreduced_loss))
-        mean_loss = training_loss / example_weight_sum
-      else:
-        mean_loss = None
-    with ops.name_scope(''):
-      keys = metric_keys.MetricKeys
-      summary.scalar(
-          _summary_key(self._name, keys.LOSS),
-          regularized_training_loss)
-      if mean_loss is not None:
-        summary.scalar(
-            _summary_key(self._name, keys.LOSS_MEAN),
-            mean_loss)
-      if regularization_loss is not None:
-        summary.scalar(
-            _summary_key(self._name, keys.LOSS_REGULARIZATION),
-            regularization_loss)
-    return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
-        mode=model_fn.ModeKeys.TRAIN,
-        predictions=predictions,
-        loss=regularized_training_loss,
-        train_op=train_op)
-
-
-def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
-    weight_column=None,
-    thresholds=None,
-    label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM,
-    loss_fn=None,
-    name=None):
-  """Creates a `_Head` for single label binary classification.
-
-  This head uses `sigmoid_cross_entropy_with_logits` loss.
-
-  The head expects `logits` with shape `[D0, D1, ... DN, 1]`.
-  In many applications, the shape is `[batch_size, 1]`.
-
-  `labels` must be a dense `Tensor` with shape matching `logits`, namely
-  `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
-  `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
-  `labels` must be float `Tensor` with values in the interval `[0, 1]`.
-
-  If `weight_column` is specified, weights must be of shape
-  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
-
-  The loss is the weighted sum over the input dimensions. Namely, if the input
-  labels have shape `[batch_size, 1]`, the loss is the weighted sum over
-  `batch_size`.
-
-  Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
-  `(labels, logits, features)` as arguments and returns unreduced loss with
-  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support float `labels` with
-  shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to
-  the input labels before passing them to `loss_fn`.
-
-  Args:
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.
-    thresholds: Iterable of floats in the range `(0, 1)`. For binary
-      classification metrics such as precision and recall, an eval metric is
-      generated for each threshold value. This threshold is applied to the
-      logistic values to determine the binary classification (i.e., above the
-      threshold is `true`, below is `false`.
-    label_vocabulary: A list or tuple of strings representing possible label
-      values. If it is not given, that means labels are already encoded within
-      [0, 1]. If given, labels must be string type and have any value in
-      `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
-      is not provided but labels are strings.
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
-    loss_fn: Optional loss function.
-    name: name of the head. If provided, summary and metrics keys will be
-      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-
-  Returns:
-    An instance of `_Head` for binary classification.
-
-  Raises:
-    ValueError: If `thresholds` contains a value outside of `(0, 1)`.
-    ValueError: If `loss_reduction` is invalid.
-    TypeError: if `label_vocabulary` has invalid type.
-  """
-  thresholds = tuple(thresholds) if thresholds else tuple()
-  if label_vocabulary is not None and not isinstance(label_vocabulary,
-                                                     (list, tuple)):
-    raise TypeError(
-        'label_vocabulary should be a list or tuple. Given type: {}'.format(
-            type(label_vocabulary)))
-
-  for threshold in thresholds:
-    if (threshold <= 0.0) or (threshold >= 1.0):
-      raise ValueError('thresholds not in (0, 1): {}.'.format((thresholds,)))
-  if (loss_reduction not in losses.Reduction.all() or
-      loss_reduction == losses.Reduction.NONE):
-    raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
-  if loss_fn:
-    _validate_loss_fn_args(loss_fn)
-  return _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(
-      weight_column=weight_column,
-      thresholds=thresholds,
-      label_vocabulary=label_vocabulary,
-      loss_reduction=loss_reduction,
-      loss_fn=loss_fn,
-      name=name)
-
-
-class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
-  """See `_binary_logistic_head_with_sigmoid_cross_entropy_loss`."""
-
-  def __init__(self,
-               weight_column=None,
-               thresholds=None,
-               label_vocabulary=None,
-               loss_reduction=losses.Reduction.SUM,
-               loss_fn=None,
-               name=None):
-    self._weight_column = weight_column
-    self._thresholds = thresholds
-    self._label_vocabulary = label_vocabulary
-    self._loss_reduction = loss_reduction
-    self._loss_fn = loss_fn
-    self._name = name
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def logits_dimension(self):
-    return 1
-
-  def _eval_metric_ops(self, labels, logits, logistic, class_ids, weights,
-                       unreduced_loss, regularization_loss):
-    with ops.name_scope(None, 'metrics',
-                        (labels, logits, logistic, class_ids, weights,
-                         unreduced_loss, regularization_loss)):
-      keys = metric_keys.MetricKeys
-      labels_mean = _indicator_labels_mean(
-          labels=labels, weights=weights, name=keys.LABEL_MEAN)
-      metric_ops = {
-          # Estimator already adds a metric for loss.
-          _summary_key(self._name, keys.LOSS_MEAN):
-              metrics_lib.mean(
-                  values=unreduced_loss,
-                  weights=weights,
-                  name=keys.LOSS_MEAN),
-          _summary_key(self._name, keys.ACCURACY):
-              metrics_lib.accuracy(
-                  labels=labels,
-                  predictions=class_ids,
-                  weights=weights,
-                  name=keys.ACCURACY),
-          _summary_key(self._name, keys.PRECISION):
-              metrics_lib.precision(
-                  labels=labels,
-                  predictions=class_ids,
-                  weights=weights,
-                  name=keys.PRECISION),
-          _summary_key(self._name, keys.RECALL):
-              metrics_lib.recall(
-                  labels=labels,
-                  predictions=class_ids,
-                  weights=weights,
-                  name=keys.RECALL),
-          _summary_key(self._name, keys.PREDICTION_MEAN):
-              _predictions_mean(
-                  predictions=logistic,
-                  weights=weights,
-                  name=keys.PREDICTION_MEAN),
-          _summary_key(self._name, keys.LABEL_MEAN):
-              labels_mean,
-          _summary_key(self._name, keys.ACCURACY_BASELINE):
-              _accuracy_baseline(labels_mean),
-          _summary_key(self._name, keys.AUC):
-              _auc(
-                  labels=labels,
-                  predictions=logistic,
-                  weights=weights,
-                  name=keys.AUC),
-          _summary_key(self._name, keys.AUC_PR):
-              _auc(
-                  labels=labels,
-                  predictions=logistic,
-                  weights=weights,
-                  curve='PR',
-                  name=keys.AUC_PR)
-      }
-      if regularization_loss is not None:
-        metric_ops[_summary_key(self._name, keys.LOSS_REGULARIZATION)] = (
-            metrics_lib.mean(
-                values=regularization_loss,
-                name=keys.LOSS_REGULARIZATION))
-      for threshold in self._thresholds:
-        accuracy_key = keys.ACCURACY_AT_THRESHOLD % threshold
-        metric_ops[_summary_key(self._name,
-                                accuracy_key)] = _accuracy_at_threshold(
-                                    labels=labels,
-                                    predictions=logistic,
-                                    weights=weights,
-                                    threshold=threshold,
-                                    name=accuracy_key)
-        # Precision for positive examples.
-        precision_key = keys.PRECISION_AT_THRESHOLD % threshold
-        metric_ops[_summary_key(self._name,
-                                precision_key)] = _precision_at_threshold(
-                                    labels=labels,
-                                    predictions=logistic,
-                                    weights=weights,
-                                    threshold=threshold,
-                                    name=precision_key)
-        # Recall for positive examples.
-        recall_key = keys.RECALL_AT_THRESHOLD % threshold
-        metric_ops[_summary_key(self._name,
-                                recall_key)] = _recall_at_threshold(
-                                    labels=labels,
-                                    predictions=logistic,
-                                    weights=weights,
-                                    threshold=threshold,
-                                    name=recall_key)
-      return metric_ops
-
-  def create_loss(self, features, mode, logits, labels):
-    """See `Head`."""
-    del mode  # Unused for this head.
-    logits = ops.convert_to_tensor(logits)
-    labels = _check_dense_labels_match_logits_and_reshape(
-        labels=labels, logits=logits, expected_labels_dimension=1)
-    if self._label_vocabulary is not None:
-      labels = lookup_ops.index_table_from_tensor(
-          vocabulary_list=tuple(self._label_vocabulary),
-          name='class_id_lookup').lookup(labels)
-    labels = math_ops.to_float(labels)
-    labels = _assert_range(labels, n_classes=2)
-    if self._loss_fn:
-      unweighted_loss = _call_loss_fn(
-          loss_fn=self._loss_fn, labels=labels, logits=logits,
-          features=features, expected_loss_dim=1)
-    else:
-      unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
-          labels=labels, logits=logits)
-    weights = _get_weights_and_check_match_logits(
-        features=features, weight_column=self._weight_column, logits=logits)
-    training_loss = losses.compute_weighted_loss(
-        unweighted_loss, weights=weights, reduction=self._loss_reduction)
-    return LossSpec(
-        training_loss=training_loss,
-        unreduced_loss=unweighted_loss,
-        weights=weights,
-        processed_labels=labels)
-
-  def _create_tpu_estimator_spec(
-      self, features, mode, logits, labels=None, optimizer=None,
-      train_op_fn=None, regularization_losses=None):
-    """Returns an `EstimatorSpec`.
-
-    Args:
-      features: Input `dict` of `Tensor` or `SparseTensor` objects.
-      mode: Estimator's `ModeKeys`.
-      logits: logits `Tensor` with shape `[D0, D1, ... DN, 1]`. For many
-        applications, the shape is `[batch_size, 1]`.
-      labels: Labels integer or string `Tensor` with shape matching `logits`,
-        namely `[D0, D1, ... DN, 1]` or `[D0, D1, ... DN]`. `labels` is required
-        argument when `mode` equals `TRAIN` or `EVAL`.
-      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
-        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
-        updates variables and increments `global_step`.
-      train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Used if `optimizer` is `None`.
-      regularization_losses: A list of additional scalar losses to be added to
-        the training loss, such as regularization losses. These losses are
-        usually expressed as a batch average, so for best results users need to
-        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
-        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
-        avoid scaling errors.
-    Returns:
-      `EstimatorSpec`.
-    Raises:
-      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
-        mode, or if both are set.
-    """
-    # Predict.
-    with ops.name_scope(self._name, 'head'):
-      with ops.name_scope(None, 'predictions', (logits,)):
-        pred_keys = prediction_keys.PredictionKeys
-        logits = _check_logits_final_dim(logits, self.logits_dimension)
-        logistic = math_ops.sigmoid(logits, name=pred_keys.LOGISTIC)
-        two_class_logits = array_ops.concat(
-            (array_ops.zeros_like(logits), logits),
-            axis=-1, name='two_class_logits')
-        probabilities = nn.softmax(
-            two_class_logits, name=pred_keys.PROBABILITIES)
-        class_ids = math_ops.argmax(
-            two_class_logits, axis=-1, name=pred_keys.CLASS_IDS)
-        class_ids = array_ops.expand_dims(class_ids, axis=-1)
-        if self._label_vocabulary:
-          table = lookup_ops.index_to_string_table_from_tensor(
-              vocabulary_list=self._label_vocabulary,
-              name='class_string_lookup')
-          classes = table.lookup(class_ids)
-        else:
-          classes = string_ops.as_string(class_ids, name='str_classes')
-        predictions = {
-            pred_keys.LOGITS: logits,
-            pred_keys.LOGISTIC: logistic,
-            pred_keys.PROBABILITIES: probabilities,
-            pred_keys.CLASS_IDS: class_ids,
-            pred_keys.CLASSES: classes,
-        }
-      if mode == model_fn.ModeKeys.PREDICT:
-        classifier_output = _classification_output(
-            scores=probabilities, n_classes=2,
-            label_vocabulary=self._label_vocabulary)
-        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
-            mode=model_fn.ModeKeys.PREDICT,
-            predictions=predictions,
-            export_outputs={
-                _DEFAULT_SERVING_KEY: classifier_output,
-                _CLASSIFY_SERVING_KEY: classifier_output,
-                _REGRESS_SERVING_KEY: export_output.RegressionOutput(
-                    value=logistic),
-                _PREDICT_SERVING_KEY: export_output.PredictOutput(predictions)
-            })
-
-      (training_loss, unreduced_loss, weights, processed_labels) = (
-          self.create_loss(
-              features=features, mode=mode, logits=logits, labels=labels))
-      if regularization_losses:
-        regularization_loss = math_ops.add_n(regularization_losses)
-        regularized_training_loss = math_ops.add_n(
-            [training_loss, regularization_loss])
-      else:
-        regularization_loss = None
-        regularized_training_loss = training_loss
-
-      # Eval.
-      if mode == model_fn.ModeKeys.EVAL:
-        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
-            mode=model_fn.ModeKeys.EVAL,
-            predictions=predictions,
-            loss=regularized_training_loss,
-            eval_metrics=_create_eval_metrics_tuple(
-                self._eval_metric_ops,
-                {
-                    'labels': processed_labels,
-                    'logits': logits,
-                    'logistic': logistic,
-                    'class_ids': class_ids,
-                    'weights': weights,
-                    'unreduced_loss': unreduced_loss,
-                    'regularization_loss': regularization_loss
-                }
-            ))
-
-      # Train.
-      if optimizer is not None:
-        if train_op_fn is not None:
-          raise ValueError('train_op_fn and optimizer cannot both be set.')
-        train_op = optimizer.minimize(
-            regularized_training_loss,
-            global_step=training_util.get_global_step())
-      elif train_op_fn is not None:
-        train_op = train_op_fn(regularized_training_loss)
-      else:
-        raise ValueError('train_op_fn and optimizer cannot both be None.')
-      train_op = _append_update_ops(train_op)
-      # Only summarize mean_loss for SUM reduction to preserve backwards
-      # compatibility. Otherwise skip it to avoid unnecessary computation.
-      if self._loss_reduction == losses.Reduction.SUM:
-        example_weight_sum = math_ops.reduce_sum(
-            weights * array_ops.ones_like(unreduced_loss))
-        mean_loss = training_loss / example_weight_sum
-      else:
-        mean_loss = None
-    with ops.name_scope(''):
-      keys = metric_keys.MetricKeys
-      summary.scalar(
-          _summary_key(self._name, keys.LOSS),
-          regularized_training_loss)
-      if mean_loss is not None:
-        summary.scalar(
-            _summary_key(self._name, keys.LOSS_MEAN), mean_loss)
-      if regularization_loss is not None:
-        summary.scalar(
-            _summary_key(self._name, keys.LOSS_REGULARIZATION),
-            regularization_loss)
-    return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
-        mode=model_fn.ModeKeys.TRAIN,
-        predictions=predictions,
-        loss=regularized_training_loss,
-        train_op=train_op)
-
-
-def _regression_head(
-    weight_column=None,
-    label_dimension=1,
-    loss_reduction=losses.Reduction.SUM,
-    loss_fn=None,
-    inverse_link_fn=None,
-    name=None):
-  """Creates a `_Head` for regression using the `mean_squared_error` loss.
-
-  The loss is the weighted sum over all input dimensions. Namely, if the input
-  labels have shape `[batch_size, label_dimension]`, the loss is the weighted
-  sum over both `batch_size` and `label_dimension`.
-
-  The head expects `logits` with shape `[D0, D1, ... DN, label_dimension]`.
-  In many applications, the shape is `[batch_size, label_dimension]`.
-
-  The `labels` shape must match `logits`, namely
-  `[D0, D1, ... DN, label_dimension]`. If `label_dimension=1`, shape
-  `[D0, D1, ... DN]` is also supported.
-
-  If `weight_column` is specified, weights must be of shape
-  `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
-  `[D0, D1, ... DN, label_dimension]`.
-
-  Supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
-  `(labels, logits, features)` as arguments and returns unreduced loss with
-  shape `[D0, D1, ... DN, label_dimension]`.
-
-  Also supports custom `inverse_link_fn`, also known as 'mean function'.
-  `inverse_link_fn` takes `logits` as argument and returns predicted values.
-  This function is the inverse of the link function defined in
-  https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function
-  Namely, for poisson regression, set `inverse_link_fn=tf.exp`.
-
-  Args:
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.
-    label_dimension: Number of regression labels per example. This is the size
-      of the last dimension of the labels `Tensor` (typically, this has shape
-      `[batch_size, label_dimension]`).
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
-    loss_fn: Optional loss function. Defaults to `mean_squared_error`.
-    inverse_link_fn: Optional inverse link function, also known as 'mean
-      function'. Defaults to identity.
-    name: name of the head. If provided, summary and metrics keys will be
-      suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
-
-  Returns:
-    An instance of `_Head` for linear regression.
-
-  Raises:
-    ValueError: If `label_dimension` or `loss_reduction` is invalid.
-  """
-  if (loss_reduction not in losses.Reduction.all() or
-      loss_reduction == losses.Reduction.NONE):
-    raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction))
-  if loss_fn:
-    _validate_loss_fn_args(loss_fn)
-  return _RegressionHeadWithMeanSquaredErrorLoss(
-      weight_column=weight_column,
-      label_dimension=label_dimension,
-      loss_reduction=loss_reduction,
-      loss_fn=loss_fn,
-      inverse_link_fn=inverse_link_fn,
-      name=name)
-
-
-class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
-  """`Head` for regression using the mean squared loss."""
-
-  def __init__(
-      self,
-      label_dimension,
-      weight_column=None,
-      loss_reduction=losses.Reduction.SUM,
-      loss_fn=None,
-      inverse_link_fn=None,
-      name=None):
-    """`Head` for regression."""
-    if label_dimension < 1:
-      raise ValueError('Invalid label_dimension %s.' % label_dimension)
-    self._logits_dimension = label_dimension
-    self._weight_column = weight_column
-    self._loss_reduction = loss_reduction
-    self._loss_fn = loss_fn
-    self._inverse_link_fn = inverse_link_fn
-    self._name = name
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def logits_dimension(self):
-    return self._logits_dimension
-
-  def create_loss(self, features, mode, logits, labels):
-    """See `Head`."""
-    del mode  # Unused for this head.
-    logits = ops.convert_to_tensor(logits)
-    labels = _check_dense_labels_match_logits_and_reshape(
-        labels=labels, logits=logits,
-        expected_labels_dimension=self._logits_dimension)
-    labels = math_ops.to_float(labels)
-    if self._loss_fn:
-      unweighted_loss = _call_loss_fn(
-          loss_fn=self._loss_fn, labels=labels, logits=logits,
-          features=features, expected_loss_dim=self._logits_dimension)
-    else:
-      unweighted_loss = losses.mean_squared_error(
-          labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
-    weights = _get_weights_and_check_match_logits(
-        features=features, weight_column=self._weight_column, logits=logits,
-        allow_per_logit_weights=True)
-    training_loss = losses.compute_weighted_loss(
-        unweighted_loss, weights=weights, reduction=self._loss_reduction)
-    return LossSpec(
-        training_loss=training_loss,
-        unreduced_loss=unweighted_loss,
-        weights=weights,
-        processed_labels=labels)
-
-  def _eval_metric_ops(self, predicted_value, labels, weights, unreduced_loss,
-                       regularization_loss):
-    """Returns the Eval metric ops."""
-    keys = metric_keys.MetricKeys
-    # Estimator already adds a metric for loss.
-    eval_metric_ops = {
-        _summary_key(self._name, keys.LOSS_MEAN):
-            metrics_lib.mean(values=unreduced_loss, weights=weights),
-        _summary_key(self._name, keys.PREDICTION_MEAN):
-            _predictions_mean(
-                predictions=predicted_value,
-                weights=weights,
-                name=keys.PREDICTION_MEAN),
-        _summary_key(self._name, keys.LABEL_MEAN):
-            metrics_lib.mean(values=labels, weights=weights)
-    }
-    if regularization_loss is not None:
-      regularization_loss_key = _summary_key(
-          self._name, keys.LOSS_REGULARIZATION)
-      eval_metric_ops[regularization_loss_key] = metrics_lib.mean(
-          values=regularization_loss,
-          name=keys.LOSS_REGULARIZATION)
-    return eval_metric_ops
-
-  def _create_tpu_estimator_spec(
-      self, features, mode, logits, labels=None, optimizer=None,
-      train_op_fn=None, regularization_losses=None):
-    """Returns an `EstimatorSpec`.
-
-    Args:
-      features: Input `dict` of `Tensor` or `SparseTensor` objects.
-      mode: Estimator's `ModeKeys`.
-      logits: logits `Tensor` with shape `[D0, D1, ... DN, logits_dimension]`.
-        For many applications, the shape is `[batch_size, logits_dimension]`.
-      labels: Labels `Tensor` with shape matching `logits`, namely
-        `[D0, D1, ... DN, logits_dimension]`. When `logits_dimension=1`, shape
-        `[D0, D1, ... DN]` is also supported. `labels` is required argument when
-        `mode` equals `TRAIN` or `EVAL`.
-      optimizer: `Optimizer` instance to optimize the loss in TRAIN mode.
-        Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which
-        updates variables and increments `global_step`.
-      train_op_fn: Function that takes a scalar loss `Tensor` and returns
-        `train_op`. Used if `optimizer` is `None`.
-      regularization_losses: A list of additional scalar losses to be added to
-        the training loss, such as regularization losses. These losses are
-        usually expressed as a batch average, so for best results users need to
-        set `loss_reduction=SUM_OVER_BATCH_SIZE` or
-        `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to
-        avoid scaling errors.
-    Returns:
-      A `model_fn._TPUEstimatorSpec` instance.
-    Raises:
-      ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN
-        mode, or if both are set.
-    """
-    # Predict.
-    with ops.name_scope(self._name, 'head'):
-      logits = _check_logits_final_dim(logits, self._logits_dimension)
-      if self._inverse_link_fn:
-        predicted_value = self._inverse_link_fn(logits)
-        predictions = {
-            prediction_keys.PredictionKeys.PREDICTIONS: predicted_value,
-            prediction_keys.PredictionKeys.LOGITS: logits,
-        }
-      else:
-        predicted_value = logits
-        predictions = {
-            prediction_keys.PredictionKeys.PREDICTIONS: predicted_value}
-      if mode == model_fn.ModeKeys.PREDICT:
-        regression_output = export_output.RegressionOutput(
-            value=predicted_value)
-        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
-            mode=model_fn.ModeKeys.PREDICT,
-            predictions=predictions,
-            export_outputs={
-                _DEFAULT_SERVING_KEY: regression_output,
-                _REGRESS_SERVING_KEY: regression_output,
-                _PREDICT_SERVING_KEY: export_output.PredictOutput(predictions)
-            })
-
-      training_loss, unreduced_loss, weights, _ = self.create_loss(
-          features=features, mode=mode, logits=logits, labels=labels)
-      if regularization_losses:
-        regularization_loss = math_ops.add_n(regularization_losses)
-        regularized_training_loss = math_ops.add_n(
-            [training_loss, regularization_loss])
-      else:
-        regularization_loss = None
-        regularized_training_loss = training_loss
-
-      # Eval.
-      if mode == model_fn.ModeKeys.EVAL:
-        return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
-            mode=model_fn.ModeKeys.EVAL,
-            predictions=predictions,
-            loss=regularized_training_loss,
-            eval_metrics=_create_eval_metrics_tuple(
-                self._eval_metric_ops, {
-                    'predicted_value': predicted_value,
-                    'labels': labels,
-                    'weights': weights,
-                    'unreduced_loss': unreduced_loss,
-                    'regularization_loss': regularization_loss,
-                }))
-
-      # Train.
-      if optimizer is not None:
-        if train_op_fn is not None:
-          raise ValueError('train_op_fn and optimizer cannot both be set.')
-        train_op = optimizer.minimize(
-            regularized_training_loss,
-            global_step=training_util.get_global_step())
-      elif train_op_fn is not None:
-        train_op = train_op_fn(regularized_training_loss)
-      else:
-        raise ValueError('train_op_fn and optimizer cannot both be None.')
-      train_op = _append_update_ops(train_op)
-      # Only summarize mean_loss for SUM reduction to preserve backwards
-      # compatibility. Otherwise skip it to avoid unnecessary computation.
-      if self._loss_reduction == losses.Reduction.SUM:
-        example_weight_sum = math_ops.reduce_sum(
-            weights * array_ops.ones_like(unreduced_loss))
-        mean_loss = training_loss / example_weight_sum
-      else:
-        mean_loss = None
-    with ops.name_scope(''):
-      keys = metric_keys.MetricKeys
-      summary.scalar(
-          _summary_key(self._name, keys.LOSS),
-          regularized_training_loss)
-      if mean_loss is not None:
-        summary.scalar(
-            _summary_key(self._name, keys.LOSS_MEAN), mean_loss)
-      if regularization_loss is not None:
-        summary.scalar(
-            _summary_key(self._name, keys.LOSS_REGULARIZATION),
-            regularization_loss)
-    return model_fn._TPUEstimatorSpec(  # pylint: disable=protected-access
-        mode=model_fn.ModeKeys.TRAIN,
-        predictions=predictions,
-        loss=regularized_training_loss,
-        train_op=train_op)
-
-
-def _append_update_ops(train_op):
-  """Returns `train_op` appending `UPDATE_OPS` collection if present."""
-  update_ops = ops.get_collection(ops.GraphKeys.UPDATE_OPS)
-  if update_ops:
-    return control_flow_ops.group(train_op, *update_ops)
-  return train_op
-
-
-def _assert_range(labels, n_classes, message=None):
-  with ops.name_scope(None, 'assert_range', (labels,)):
-    assert_less = check_ops.assert_less_equal(
-        labels,
-        ops.convert_to_tensor(n_classes - 1, dtype=labels.dtype),
-        message=message or 'Labels must <= n_classes - 1')
-    assert_greater = check_ops.assert_non_negative(
-        labels, message=message or 'Labels must >= 0')
-    with ops.control_dependencies((assert_less, assert_greater)):
-      return array_ops.identity(labels)
-
-
-def _binary_logistic_or_multi_class_head(
-    n_classes, weight_column, label_vocabulary, loss_reduction):
-  """Creates either binary or multi-class head.
+from tensorflow_estimator.python.estimator.canned import head
 
-  Args:
-    n_classes: Number of label classes.
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example. If it is a string, it is
-      used as a key to fetch weight tensor from the `features`. If it is a
-      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-      then weight_column.normalizer_fn is applied on it to get weight tensor.
-    label_vocabulary: A list of strings represents possible label values. If
-      given, labels must be string type and have any value in
-      `label_vocabulary`. If it is not given, that means labels are
-      already encoded as integer or float within [0, 1] for `n_classes=2` and
-      encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
-      Also there will be errors if vocabulary is not provided and labels are
-      string.
-    loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-      to reduce training loss over batch. Defaults to `SUM`.
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+head.__all__ = [s for s in dir(head) if not s.startswith('__')]
 
-  Returns:
-    `head._Head` instance.
-  """
-  if n_classes == 2:
-    head = _binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column=weight_column,
-        label_vocabulary=label_vocabulary,
-        loss_reduction=loss_reduction)
-  else:
-    head = _multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, weight_column=weight_column,
-        label_vocabulary=label_vocabulary,
-        loss_reduction=loss_reduction)
-  return head
+from tensorflow_estimator.python.estimator.canned.head import *
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
deleted file mode 100644
index de9c84d2ef259025817ea61331e1dad6d9617b77..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/canned/head_test.py
+++ /dev/null
@@ -1,4056 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for head.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.canned import dnn_testing_utils
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column as feature_column_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import queue_runner_impl
-
-
-_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-
-
-def _initialize_variables(test_case, scaffold):
-  scaffold.finalize()
-  test_case.assertIsNone(scaffold.init_feed_dict)
-  test_case.assertIsNone(scaffold.init_fn)
-  scaffold.init_op.run()
-  scaffold.ready_for_local_init_op.eval()
-  scaffold.local_init_op.run()
-  scaffold.ready_op.eval()
-  test_case.assertIsNotNone(scaffold.saver)
-
-
-def _assert_simple_summaries(test_case, expected_summaries, summary_str,
-                             tol=1e-6):
-  """Assert summary the specified simple values.
-
-  Args:
-    test_case: test case.
-    expected_summaries: Dict of expected tags and simple values.
-    summary_str: Serialized `summary_pb2.Summary`.
-    tol: Tolerance for relative and absolute.
-  """
-  summary = summary_pb2.Summary()
-  summary.ParseFromString(summary_str)
-  test_case.assertAllClose(expected_summaries, {
-      v.tag: v.simple_value for v in summary.value
-  }, rtol=tol, atol=tol)
-
-
-def _assert_no_hooks(test_case, spec):
-  test_case.assertAllEqual([], spec.training_chief_hooks)
-  test_case.assertAllEqual([], spec.training_hooks)
-
-
-def _sigmoid(logits):
-  return 1 / (1 + np.exp(-logits))
-
-
-class CreateEstimatorSpecTest(test.TestCase):
-
-  class _HeadWithTPUSupport(head_lib._Head):
-    """Head that overrides _create_tpu_estimator_spec."""
-
-    def name(self):
-      return 'HeadWithTPUSupport'
-
-    def logits_dimension(self):
-      return None
-
-    def create_loss(self, features, mode, logits, labels):
-      return None
-
-    def _create_tpu_estimator_spec(self, features, mode, logits, labels=None,
-                                   optimizer=None, train_op_fn=None,
-                                   regularization_losses=None):
-      return model_fn._TPUEstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL,
-          loss=constant_op.constant(0.0, dtype=dtypes.float32))
-
-  class _HeadWithOutTPUSupport(head_lib._Head):
-    """Head that overrides create_estimator_spec."""
-
-    def name(self):
-      return 'HeadWithOutTPUSupport'
-
-    def logits_dimension(self):
-      return None
-
-    def create_loss(self, features, mode, logits, labels):
-      return None
-
-    def create_estimator_spec(self, features, mode, logits, labels=None,
-                              optimizer=None, train_op_fn=None,
-                              regularization_losses=None):
-      return model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL,
-          loss=constant_op.constant(0.0, dtype=dtypes.float32))
-
-  class _InvalidHead(head_lib._Head):
-    """Head that overrides neither estimator_spec functions."""
-
-    def name(self):
-      return 'InvalidHead'
-
-    def logits_dimension(self):
-      return None
-
-    def create_loss(self, features, mode, logits, labels):
-      return None
-
-  def test_head_override_tpu_estimator_spec(self):
-    """Test for `_Head` that overrides _create_tpu_estimator_spec."""
-    head = self._HeadWithTPUSupport()
-
-    tpu_spec = head._create_tpu_estimator_spec(
-        features=None, mode=None, logits=None)
-    self.assertTrue(isinstance(tpu_spec, model_fn._TPUEstimatorSpec))
-    est_spec = head.create_estimator_spec(
-        features=None, mode=None, logits=None)
-    self.assertTrue(isinstance(est_spec, model_fn.EstimatorSpec))
-
-  def test_head_override_estimator_spec(self):
-    """Test for `_Head` that overrides create_estimator_spec."""
-    head = self._HeadWithOutTPUSupport()
-
-    with self.assertRaisesRegexp(
-        NotImplementedError,
-        'TPUEstimatorSpec not available for this model head.'):
-      _ = head._create_tpu_estimator_spec(
-          features=None, mode=None, logits=None)
-    est_spec = head.create_estimator_spec(
-        features=None, mode=None, logits=None)
-    self.assertTrue(isinstance(est_spec, model_fn.EstimatorSpec))
-
-  def test_invalid_head_class(self):
-    head = self._InvalidHead()
-
-    with self.assertRaisesRegexp(
-        NotImplementedError,
-        'TPUEstimatorSpec not available for this model head.'):
-      _ = head._create_tpu_estimator_spec(
-          features=None, mode=None, logits=None)
-    with self.assertRaisesRegexp(
-        NotImplementedError,
-        r'Subclasses of _Head must implement `create_estimator_spec\(\)` or '
-        r'_create_tpu_estimator_spec\(\).'):
-      _ = head.create_estimator_spec(
-          features=None, mode=None, logits=None)
-
-
-class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
-
-  def setUp(self):
-    ops.reset_default_graph()
-
-  def test_n_classes_is_none(self):
-    with self.assertRaisesRegexp(ValueError, 'n_classes must be > 2'):
-      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-          n_classes=None)
-
-  def test_n_classes_is_2(self):
-    with self.assertRaisesRegexp(ValueError, 'n_classes must be > 2'):
-      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-          n_classes=2)
-
-  def test_invalid_loss_reduction(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'Invalid loss_reduction: invalid_loss_reduction'):
-      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-          n_classes=3, loss_reduction='invalid_loss_reduction')
-    with self.assertRaisesRegexp(
-        ValueError, r'Invalid loss_reduction: none'):
-      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-          n_classes=3, loss_reduction=losses.Reduction.NONE)
-
-  def test_loss_fn_arg_labels_missing(self):
-    def _loss_fn(logits):
-      del logits  # Unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn must contain argument: labels\. '
-        r'Given arguments: \(\'logits\',\)'):
-      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-          n_classes=3, loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_logits_missing(self):
-    def _loss_fn(labels):
-      del labels  # unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn must contain argument: logits\. '
-        r'Given arguments: \(\'labels\',\)'):
-      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-          n_classes=3, loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_features_ok(self):
-    def _loss_fn(labels, logits, features):
-      del labels, logits, features  # Unused
-    head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_invalid(self):
-    def _loss_fn(labels, logits, name=None):
-      del labels, logits, name  # Unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn has unexpected args: \[\'name\'\]'):
-      head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-          n_classes=3, loss_fn=_loss_fn)
-
-  def test_invalid_logits_shape(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-    self.assertEqual(n_classes, head.logits_dimension)
-
-    # Logits should be shape (batch_size, 3).
-    logits_2x2 = np.array(((45., 44.), (41., 42.),))
-
-    # Static shape.
-    with self.assertRaisesRegexp(ValueError, 'logits shape'):
-      head.create_estimator_spec(
-          features={'x': np.array(((30.,), (42.,),))},
-          mode=model_fn.ModeKeys.PREDICT,
-          logits=logits_2x2)
-
-    # Dynamic shape.
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((30.,), (42.,),))},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits_placeholder)
-    with self.cached_session():
-      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
-        spec.predictions[prediction_keys.PredictionKeys.PROBABILITIES].eval({
-            logits_placeholder: logits_2x2
-        })
-
-  def test_invalid_labels_shape(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-    self.assertEqual(n_classes, head.logits_dimension)
-
-    # Logits should be shape (batch_size, 3).
-    # Labels should be shape (batch_size, 1).
-    labels_2x2 = np.array(((45, 44), (41, 42),), dtype=np.int)
-    logits_2x3 = np.array(((1., 2., 3.), (1., 2., 3.),))
-    features = {'x': np.array(((42.,),))}
-
-    # Static shape.
-    with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'):
-      head.create_loss(
-          features=features,
-          mode=model_fn.ModeKeys.EVAL,
-          logits=logits_2x3,
-          labels=labels_2x2)
-
-    # Dynamic shape.
-    labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_placeholder,
-        labels=labels_placeholder)[0]
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[2 2\]'):
-        training_loss.eval({
-            logits_placeholder: logits_2x3,
-            labels_placeholder: labels_2x2
-        })
-
-  def test_invalid_labels_type(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-    self.assertEqual(n_classes, head.logits_dimension)
-
-    # Logits should be shape (batch_size, 3).
-    # Labels should be shape (batch_size, 1).
-    labels_2x1 = np.array(((1.,), (1.,),))
-    logits_2x3 = np.array(((1., 2., 3.), (1., 2., 3.),))
-    features = {'x': np.array(((42.,),))}
-
-    # Static shape.
-    with self.assertRaisesRegexp(ValueError, 'Labels dtype'):
-      head.create_loss(
-          features=features,
-          mode=model_fn.ModeKeys.EVAL,
-          logits=logits_2x3,
-          labels=labels_2x1)
-
-    # Dynamic shape.
-    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    with self.assertRaisesRegexp(ValueError, 'Labels dtype'):
-      head.create_loss(
-          features=features,
-          mode=model_fn.ModeKeys.EVAL,
-          logits=logits_placeholder,
-          labels=labels_placeholder)
-
-  def test_invalid_labels_values(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-    self.assertEqual(n_classes, head.logits_dimension)
-
-    labels_2x1_with_large_id = np.array(((45,), (1,),), dtype=np.int)
-    labels_2x1_with_negative_id = np.array(((-5,), (1,),), dtype=np.int)
-    logits_2x3 = np.array(((1., 2., 4.), (1., 2., 3.),))
-
-    labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    training_loss = head.create_loss(
-        features={'x': np.array(((42.,),))},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_placeholder,
-        labels=labels_placeholder)[0]
-    with self.cached_session():
-      with self.assertRaisesOpError('Labels must <= n_classes - 1'):
-        training_loss.eval({
-            labels_placeholder: labels_2x1_with_large_id,
-            logits_placeholder: logits_2x3
-        })
-
-    with self.cached_session():
-      with self.assertRaisesOpError('Labels must >= 0'):
-        training_loss.eval({
-            labels_placeholder: labels_2x1_with_negative_id,
-            logits_placeholder: logits_2x3
-        })
-
-  def test_invalid_labels_sparse_tensor(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-    self.assertEqual(n_classes, head.logits_dimension)
-
-    labels_2x1 = sparse_tensor.SparseTensor(
-        values=['english', 'italian'],
-        indices=[[0, 0], [1, 0]],
-        dense_shape=[2, 1])
-    logits_2x3 = np.array(((1., 2., 4.), (1., 2., 3.),))
-
-    with self.assertRaisesRegexp(
-        ValueError, 'SparseTensor labels are not supported.'):
-      head.create_loss(
-          features={'x': np.array(((42.,),))},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=logits_2x3,
-          labels=labels_2x1)
-
-  def test_incompatible_labels_shape(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-    self.assertEqual(n_classes, head.logits_dimension)
-
-    # Logits should be shape (batch_size, 3).
-    # Labels should be shape (batch_size, 1).
-    # Here batch sizes are different.
-    values_3x1 = np.array(((1,), (1,), (1,),))
-    values_2x3 = np.array(((1., 2., 3.), (1., 2., 3.),))
-    features = {'x': values_2x3}
-
-    # Static shape.
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'Shape mismatch: The shape of labels \(received \(3,\)\) should equal '
-        r'the shape of logits except for the last dimension '
-        r'\(received \(2, 3\)\)\.'
-    ):
-      head.create_loss(
-          features=features,
-          mode=model_fn.ModeKeys.EVAL,
-          logits=values_2x3,
-          labels=values_3x1)
-
-    # Dynamic shape.
-    labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_placeholder,
-        labels=labels_placeholder)[0]
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[3 1\]'):
-        training_loss.eval({
-            labels_placeholder: values_3x1,
-            logits_placeholder: values_2x3
-        })
-
-  def test_name(self):
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, name='foo')
-    self.assertEqual('foo', head.name)
-
-  def test_predict(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-    self.assertEqual(n_classes, head.logits_dimension)
-
-    logits = [[1., 0., 0.], [0., 0., 1.]]
-    expected_probabilities = [[0.576117, 0.2119416, 0.2119416],
-                              [0.2119416, 0.2119416, 0.576117]]
-    expected_class_ids = [[0], [2]]
-    expected_classes = [[b'0'], [b'2']]
-    expected_export_classes = [[b'0', b'1', b'2']] * 2
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'classification'),
-        spec.export_outputs.keys())
-
-    # Assert predictions and export_outputs.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      predictions = sess.run(spec.predictions)
-      self.assertAllClose(logits,
-                          predictions[prediction_keys.PredictionKeys.LOGITS])
-      self.assertAllClose(
-          expected_probabilities,
-          predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-      self.assertAllClose(expected_class_ids,
-                          predictions[prediction_keys.PredictionKeys.CLASS_IDS])
-      self.assertAllEqual(expected_classes,
-                          predictions[prediction_keys.PredictionKeys.CLASSES])
-
-      self.assertAllClose(
-          expected_probabilities,
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].scores))
-      self.assertAllEqual(
-          expected_export_classes,
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].classes))
-
-  def test_predict_with_vocabulary_list(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, label_vocabulary=['aang', 'iroh', 'zuko'])
-
-    logits = [[1., 0., 0.], [0., 0., 1.]]
-    expected_classes = [[b'aang'], [b'zuko']]
-    expected_export_classes = [[b'aang', b'iroh', b'zuko']] * 2
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertAllEqual(
-          expected_classes,
-          sess.run(spec.predictions[prediction_keys.PredictionKeys.CLASSES]))
-      self.assertAllEqual(
-          expected_export_classes,
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].classes))
-
-  def test_weight_should_not_impact_prediction(self):
-    n_classes = 3
-    logits = [[1., 0., 0.], [0., 0., 1.]]
-    expected_probabilities = [[0.576117, 0.2119416, 0.2119416],
-                              [0.2119416, 0.2119416, 0.576117]]
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, weight_column='label_weights')
-
-    weights_2x1 = [[1.], [2.]]
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array(((42,),), dtype=np.int32),
-            'label_weights': weights_2x1,
-        },
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      predictions = sess.run(spec.predictions)
-      self.assertAllClose(logits,
-                          predictions[prediction_keys.PredictionKeys.LOGITS])
-      self.assertAllClose(
-          expected_probabilities,
-          predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-
-  def test_eval_create_loss(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_training_loss = 10.
-    # Create loss.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
-
-  def test_eval_create_loss_loss_fn(self):
-    """Tests head.create_loss for eval mode and custom loss_fn."""
-    loss = np.array([[1.], [2.]], dtype=np.float32)
-    logits_input = np.array([[-10., 10., 0.], [-15., 10., 0]], dtype=np.float32)
-    labels_input = np.array([[1], [2]], dtype=np.int64)
-    def _loss_fn(labels, logits):
-      check_labels = control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
-          data=[labels])
-      check_logits = control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
-          data=[logits])
-      with ops.control_dependencies([check_labels, check_logits]):
-        return constant_op.constant(loss)
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, loss_fn=_loss_fn)
-
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_input,
-        labels=labels_input)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
-
-  def test_eval_create_loss_loss_fn_wrong_shape(self):
-    """Tests custom loss_fn that returns Tensor of unexpected shape."""
-    loss = np.array([1., 2.], dtype=np.float32)
-    def _loss_fn(labels, logits):
-      del labels, logits  # Unused
-      return constant_op.constant(loss)
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, loss_fn=_loss_fn)
-
-    logits = np.array([[-10., 10., 0.], [-15., 10., 0.]], dtype=np.float32)
-    labels = np.array([[1], [2]], dtype=np.int64)
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
-          r'\[logits_shape: \] \[2 3\] \[loss_shape: \] \[2\]'):
-        actual_training_loss.eval()
-
-  def test_eval_labels_none(self):
-    """Tests that error is raised when labels is None."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3)
-
-    with self.assertRaisesRegexp(
-        ValueError, r'You must provide a labels Tensor\. Given: None\.'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32),
-          labels=None)
-
-  def test_eval(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
-    expected_loss = 10.
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_loss / 2,
-        keys.ACCURACY: 0.5,  # 1 of 2 labels is correct.
-    }
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, and metrics.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval()
-                             for k in value_ops},
-          rtol=tol,
-          atol=tol)
-
-  def test_eval_metric_ops_with_head_name(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, name='some_multiclass_head')
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    expected_metric_keys = [
-        '{}/some_multiclass_head'.format(metric_keys.MetricKeys.LOSS_MEAN),
-        '{}/some_multiclass_head'.format(metric_keys.MetricKeys.ACCURACY)
-    ]
-    self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
-
-  def test_eval_with_regularization_losses(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    regularization_losses = [1.5, 0.5]
-    expected_regularization_loss = 2.
-    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
-    #                    = sum(10, 0) / 2 = 5.
-    expected_unregularized_loss = 5.
-    expected_regularized_loss = (
-        expected_unregularized_loss + expected_regularization_loss)
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels,
-        regularization_losses=regularization_losses)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_unregularized_loss,
-        keys.LOSS_REGULARIZATION: expected_regularization_loss,
-        keys.ACCURACY: 0.5,  # 1 of 2 labels is correct.
-    }
-
-    # Assert predictions, loss, and metrics.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_regularized_loss, loss, rtol=tol, atol=tol)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval()
-                             for k in value_ops},
-          rtol=tol,
-          atol=tol)
-
-  def test_eval_with_label_vocabulary_create_loss(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, label_vocabulary=['aang', 'iroh', 'zuko'])
-    logits = [[10., 0, 0], [0, 10, 0]]
-    labels = [[b'iroh'], [b'iroh']]
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_training_loss = 10.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
-
-  def test_eval_with_label_vocabulary(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, label_vocabulary=['aang', 'iroh', 'zuko'])
-
-    logits = [[10., 0, 0], [0, 10, 0]]
-    labels = [[b'iroh'], [b'iroh']]
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
-    expected_loss = 10.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_loss / 2,
-        keys.ACCURACY: 0.5,  # 1 of 2 labels is correct.
-    }
-
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
-          rtol=tol, atol=tol)
-
-  def test_weighted_multi_example_eval(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, weight_column='label_weights')
-
-    # Create estimator spec.
-    logits = np.array(((10, 0, 0), (0, 10, 0), (0, 0, 10),), dtype=np.float32)
-    labels = np.array(((1,), (2,), (2,)), dtype=np.int64)
-    weights_3x1 = np.array(((1.,), (2.,), (3.,)), dtype=np.float64)
-    # loss = sum(cross_entropy(labels, logits) * [1, 2, 3])
-    #      = sum([10, 10, 0] * [1, 2, 3]) = 30
-    expected_loss = 30.
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array(((42,),), dtype=np.int32),
-            'label_weights': weights_3x1,
-        },
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_loss / np.sum(weights_3x1),
-        # Weighted accuracy is 1 * 3.0 / sum weights = 0.5
-        keys.ACCURACY: 0.5,
-    }
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert loss, and metrics.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
-          rtol=tol, atol=tol)
-
-  def test_train_create_loss(self):
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3)
-
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-
-    # unreduced_loss = cross_entropy(labels, logits) = [10, 0].
-    expected_unreduced_loss = [[10.], [0.]]
-    # Weights default to 1.
-    expected_weights = 1.
-    # training_loss = 1 * 10 + 1 * 0
-    expected_training_loss = 10.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    tol = 1e-2
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_unreduced_loss, unreduced_loss.eval(), rtol=tol, atol=tol)
-      self.assertAllClose(expected_weights, actual_weights)
-
-  def test_train_create_loss_loss_reduction(self):
-    """Tests create_loss with loss_reduction."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
-
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-
-    # unreduced_loss = cross_entropy(labels, logits) = [10, 0].
-    expected_unreduced_loss = [[10.], [0.]]
-    # Weights default to 1.
-    expected_weights = 1.
-    # training_loss = 1 * 10 + 1 * 0 / num_nonzero_weights
-    expected_training_loss = 10. / 2.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    tol = 1e-2
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_unreduced_loss, unreduced_loss.eval(), rtol=tol, atol=tol)
-      self.assertAllClose(expected_weights, actual_weights)
-
-  def test_train_labels_none(self):
-    """Tests that error is raised when labels is None."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3)
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    with self.assertRaisesRegexp(
-        ValueError, r'You must provide a labels Tensor\. Given: None\.'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32),
-          labels=None,
-          train_op_fn=_no_op_train_fn)
-
-  def test_train(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=2)])
-
-    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
-    expected_loss = 10.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    self.assertIsNotNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
-          train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
-      }, summary_str, tol)
-
-  def test_train_with_optimizer(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    expected_train_result = 'my_train_op'
-
-    class _Optimizer(object):
-
-      def minimize(self, loss, global_step):
-        del global_step
-        return string_ops.string_join(
-            [constant_op.constant(expected_train_result),
-             string_ops.as_string(loss, precision=2)])
-
-    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
-    expected_loss = 10.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        optimizer=_Optimizer())
-
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run((spec.loss, spec.train_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
-          train_result)
-
-  def test_train_with_update_ops(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
-
-    with ops.Graph().as_default():
-      w = variables.Variable(1)
-      update_op = w.assign_add(1)
-      ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, update_op)
-
-      t = variables.Variable('')
-      expected_train_result = b'my_train_op'
-      def _train_op_fn(loss):
-        del loss
-        return t.assign(expected_train_result)
-
-      spec = head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32),
-          labels=np.array(((1,), (1,)), dtype=np.int64),
-          train_op_fn=_train_op_fn)
-
-      with self.cached_session() as sess:
-        _initialize_variables(self, spec.scaffold)
-        sess.run(spec.train_op)
-        w_value, t_value = sess.run([w, t])
-        self.assertEqual(2, w_value)
-        self.assertEqual(expected_train_result, t_value)
-
-  def test_train_summaries_with_head_name(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, name='some_multiclass_head')
-
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
-    expected_loss = 10.
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-
-    def _train_op_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    # Assert summaries.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      summary_str = sess.run(spec.scaffold.summary_op)
-      _assert_simple_summaries(self, {
-          '{}/some_multiclass_head'.format(metric_keys.MetricKeys.LOSS):
-              expected_loss,
-          '{}/some_multiclass_head'.format(metric_keys.MetricKeys.LOSS_MEAN):
-              expected_loss / 2,
-      }, summary_str, tol)
-
-  def test_train_with_regularization_losses(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-
-    logits = np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=2)])
-
-    regularization_losses = [1.5, 0.5]
-    expected_regularization_loss = 2.
-    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
-    #                    = sum(10, 0) / 2 = 5.
-    # loss = unregularized_loss + regularization_loss = 7.
-    expected_loss = 7.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn,
-        regularization_losses=regularization_losses)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
-          train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          metric_keys.MetricKeys.LOSS_REGULARIZATION: (
-              expected_regularization_loss),
-      }, summary_str, tol)
-
-  def test_train_one_dim_create_loss(self):
-    """Tests create_loss with 1D labels and weights (shape [batch_size])."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, weight_column='label_weights')
-
-    logits = np.array(((10, 0, 0), (0, 10, 0), (0, 0, 10),), dtype=np.float32)
-    labels_rank_1 = np.array((1, 2, 2,), dtype=np.int64)
-    weights_rank_1 = np.array((1., 2., 3.,), dtype=np.float64)
-    features = {
-        'x': np.array(((42,),), dtype=np.float32),
-        'label_weights': weights_rank_1
-    }
-
-    # unreduced_loss = cross_entropy(labels, logits) = [10, 10, 0].
-    expected_unreduced_loss = [[10.], [10.], [0.]]
-    # weights are reshaped to [3, 1] to match logits.
-    expected_weights = [[1.], [2.], [3.]]
-    # training_loss = 1 * 10 + 2 * 10 + 3 * 0 = 30.
-    expected_training_loss = 30.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels_rank_1)
-    tol = 1e-2
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_unreduced_loss, unreduced_loss.eval(), rtol=tol, atol=tol)
-      self.assertAllClose(expected_weights, actual_weights.eval())
-
-  def test_train_one_dim(self):
-    """Tests train with 1D labels and weights (shape [batch_size])."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, weight_column='label_weights')
-
-    logits = np.array(((10, 0, 0), (0, 10, 0), (0, 0, 10),), dtype=np.float32)
-    labels_rank_1 = np.array((1, 2, 2,), dtype=np.int64)
-    weights_rank_1 = np.array((1., 2., 3.,), dtype=np.float64)
-
-    self.assertEqual((3,), labels_rank_1.shape)
-    self.assertEqual((3,), weights_rank_1.shape)
-
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=2)])
-
-    # loss = sum(cross_entropy(labels, logits) * [1, 2, 3])
-    #      = sum([10, 10, 0] * [1, 2, 3]) = 30
-    expected_loss = 30.
-
-    features = {
-        'x': np.array(((42,),), dtype=np.float32),
-        'label_weights': weights_rank_1
-    }
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels_rank_1,
-        train_op_fn=_train_op_fn)
-
-    self.assertIsNotNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
-          train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          metric_keys.MetricKeys.LOSS_MEAN: (
-              expected_loss / np.sum(weights_rank_1)),
-      }, summary_str, tol)
-
-  def test_train_with_vocabulary_create_loss(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, label_vocabulary=['aang', 'iroh', 'zuko'])
-
-    logits = [[10., 0, 0], [0, 10, 0]]
-    labels = [[b'iroh'], [b'iroh']]
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_training_loss = 10.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
-
-  def test_train_with_vocabulary(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, label_vocabulary=['aang', 'iroh', 'zuko'])
-
-    logits = [[10., 0, 0], [0, 10, 0]]
-    labels = [[b'iroh'], [b'iroh']]
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-
-    def _train_op_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    # loss = sum(cross_entropy(labels, logits)) = sum(10, 0) = 10.
-    expected_loss = 10.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss = sess.run(spec.loss)
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-
-  def test_weighted_multi_example_train(self):
-    n_classes = 3
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes, weight_column='label_weights')
-
-    # Create estimator spec.
-    logits = np.array(((10, 0, 0), (0, 10, 0), (0, 0, 10),), dtype=np.float32)
-    labels = np.array(((1,), (2,), (2,)), dtype=np.int64)
-    weights_3x1 = np.array(((1.,), (2.,), (3.,)), dtype=np.float64)
-    expected_train_result = 'my_train_op'
-    # loss = sum(cross_entropy(labels, logits) * [1, 2, 3])
-    #      = sum([10, 10, 0] * [1, 2, 3]) = 30
-    expected_loss = 30.
-
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=2)])
-
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array(((42,),), dtype=np.float32),
-            'label_weights': weights_3x1,
-        },
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    self.assertIsNotNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
-          train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # loss mean = sum(cross_entropy(labels, logits) * [1,2,3]) / (1+2+3)
-          #      = sum([10, 10, 0] * [1, 2, 3]) / 6 = 30 / 6
-          metric_keys.MetricKeys.LOSS_MEAN:
-              expected_loss / np.sum(weights_3x1),
-      }, summary_str, tol)
-
-  def test_multi_dim_weighted_train_create_loss(self):
-    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2]."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, weight_column='weights')
-
-    logits = np.array([[[10, 0, 0], [12, 0, 0]],
-                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
-    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
-    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-
-    # unreduced_loss = cross_entropy(labels, logits) = [[0, 12], [0, 15]].
-    expected_unreduced_loss = [[[0.], [12.]], [[0.], [15.]]]
-    # weights are reshaped to [2, 2, 1] to match logits.
-    expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
-    # training_loss = 1*0 + 1.5*12 + 2*0 + 2.5*15 = 55.5
-    expected_training_loss = 55.5
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    tol = 1e-2
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_unreduced_loss, unreduced_loss.eval(), rtol=tol, atol=tol)
-      self.assertAllClose(expected_weights, actual_weights.eval())
-
-  def test_multi_dim_weighted_train(self):
-    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2]."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, weight_column='weights')
-
-    logits = np.array([[[10, 0, 0], [12, 0, 0]],
-                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
-    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
-    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=2)])
-
-    # loss = cross_entropy(labels, logits) = [[0, 12], [0, 15]].
-    # weighted_sum_loss = 1*0 + 1.5*12 + 2*0 + 2.5*15 = 55.5
-    expected_loss = 55.5
-    spec = head.create_estimator_spec(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run((spec.loss, spec.train_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
-          train_result)
-
-  def test_multi_dim_train_weights_wrong_inner_dim(self):
-    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 1]."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, weight_column='weights')
-    logits = np.array([[[10, 0, 0], [12, 0, 0]],
-                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
-    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
-    weights = np.array([[1.], [2.]], dtype=np.float32)
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_no_op_train_fn)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 1\]'):
-        spec.loss.eval()
-
-  def test_multi_dim_train_weights_wrong_outer_dim(self):
-    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2, 3]."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, weight_column='weights')
-    logits = np.array([[[10, 0, 0], [12, 0, 0]],
-                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
-    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
-    weights = np.array([[[1., 1.1, 1.2], [1.5, 1.6, 1.7]],
-                        [[2., 2.1, 2.2], [2.5, 2.6, 2.7]]])
-    weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features={'weights': weights_placeholder},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_no_op_train_fn)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[logits_shape: \]\s\[2 2 3\]\s\[weights_shape: \]\s\[2 2 3\]'):
-        spec.loss.eval({weights_placeholder: weights})
-
-  def test_multi_dim_weighted_eval(self):
-    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2]."""
-    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-        n_classes=3, weight_column='weights')
-    logits = np.array([[[10, 0, 0], [12, 0, 0]],
-                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
-    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
-    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    # loss = cross_entropy(labels, logits) = [[0, 12], [0, 15]].
-    # weighted_sum_loss = 1*0 + 1.5*12 + 2*0 + 2.5*15 = 55.5
-    expected_loss = 55.5
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_loss / np.sum(weights),
-        keys.ACCURACY: (1.*1. + 1.5*0. + 2.*1. + 2.5*0.) / np.sum(weights),
-    }
-
-    # Assert predictions, loss, and metrics.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
-          rtol=tol, atol=tol)
-
-
-class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
-
-  def setUp(self):
-    ops.reset_default_graph()
-
-  def test_threshold_too_small(self):
-    with self.assertRaisesRegexp(ValueError, r'thresholds not in \(0, 1\)'):
-      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-          thresholds=(0., 0.5))
-
-  def test_threshold_too_large(self):
-    with self.assertRaisesRegexp(ValueError, r'thresholds not in \(0, 1\)'):
-      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-          thresholds=(0.5, 1.))
-
-  def test_invalid_loss_reduction(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'Invalid loss_reduction: invalid_loss_reduction'):
-      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-          loss_reduction='invalid_loss_reduction')
-    with self.assertRaisesRegexp(
-        ValueError, r'Invalid loss_reduction: none'):
-      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-          loss_reduction=losses.Reduction.NONE)
-
-  def test_loss_fn_arg_labels_missing(self):
-    def _loss_fn(logits):
-      del logits  # Unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn must contain argument: labels\. '
-        r'Given arguments: \(\'logits\',\)'):
-      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-          loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_logits_missing(self):
-    def _loss_fn(labels):
-      del labels  # unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn must contain argument: logits\. '
-        r'Given arguments: \(\'labels\',\)'):
-      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-          loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_features_ok(self):
-    def _loss_fn(labels, logits, features):
-      del labels, logits, features  # Unused
-      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-          loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_invalid(self):
-    def _loss_fn(labels, logits, name=None):
-      del labels, logits, name  # Unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn has unexpected args: \[\'name\'\]'):
-      head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-          loss_fn=_loss_fn)
-
-  def test_invalid_logits_shape(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-    self.assertEqual(1, head.logits_dimension)
-
-    # Logits should be shape (batch_size, 1).
-    logits_2x2 = np.array(((45., 44.), (41., 42.),))
-
-    # Static shape.
-    with self.assertRaisesRegexp(ValueError, 'logits shape'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42.,),))},
-          mode=model_fn.ModeKeys.PREDICT,
-          logits=logits_2x2)
-
-    # Dynamic shape.
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),))},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits_placeholder)
-    with self.cached_session():
-      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
-        spec.predictions[prediction_keys.PredictionKeys.PROBABILITIES].eval({
-            logits_placeholder: logits_2x2
-        })
-
-  def test_invalid_labels_shape(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-    self.assertEqual(1, head.logits_dimension)
-
-    # Labels and logits should be shape (batch_size, 1).
-    labels_2x2 = np.array(((45., 44.), (41., 42.),))
-    logits_2x1 = np.array(((45.,), (41.,),))
-
-    # Static shape.
-    with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'):
-      head.create_loss(
-          features={'x': np.array(((42.,),))},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=logits_2x1,
-          labels=labels_2x2)
-
-    # Dynamic shape.
-    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    training_loss = head.create_loss(
-        features={'x': np.array(((42.,),))},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_placeholder,
-        labels=labels_placeholder)[0]
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[2 2\]'):
-        training_loss.eval({
-            logits_placeholder: logits_2x1,
-            labels_placeholder: labels_2x2
-        })
-
-  def test_incompatible_labels_shape(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-    self.assertEqual(1, head.logits_dimension)
-
-    # Both logits and labels should be shape (batch_size, 1).
-    values_2x1 = np.array(((0.,), (1.,),))
-    values_3x1 = np.array(((0.,), (1.,), (0.,),))
-
-    # Static shape.
-    with self.assertRaisesRegexp(
-        ValueError, 'logits and labels must have the same shape'):
-      head.create_loss(
-          features={'x': values_2x1},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=values_2x1,
-          labels=values_3x1)
-    with self.assertRaisesRegexp(
-        ValueError, 'logits and labels must have the same shape'):
-      head.create_loss(
-          features={'x': values_2x1},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=values_3x1,
-          labels=values_2x1)
-
-    # Dynamic shape.
-    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    training_loss = head.create_loss(
-        features={'x': values_2x1},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_placeholder,
-        labels=labels_placeholder)[0]
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[expected_labels_shape: \] \[3 1\] \[labels_shape: \] \[2 1\]'):
-        training_loss.eval({
-            labels_placeholder: values_2x1,
-            logits_placeholder: values_3x1
-        })
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[3 1\]'):
-        training_loss.eval({
-            labels_placeholder: values_3x1,
-            logits_placeholder: values_2x1
-        })
-
-  def test_name(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        name='foo')
-    self.assertEqual('foo', head.name)
-
-  def test_predict(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = [[0.3], [-0.4]]
-    expected_logistics = [[0.574443], [0.401312]]
-    expected_probabilities = [[0.425557, 0.574443], [0.598688, 0.401312]]
-    expected_class_ids = [[1], [0]]
-    expected_classes = [[b'1'], [b'0']]
-    expected_export_classes = [[b'0', b'1']] * 2
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    # Assert spec contains expected tensors.
-    self.assertIsNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNone(spec.train_op)
-    self.assertItemsEqual(('classification', 'regression', 'predict',
-                           _DEFAULT_SERVING_KEY), spec.export_outputs.keys())
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      predictions = sess.run(spec.predictions)
-      self.assertAllClose(logits,
-                          predictions[prediction_keys.PredictionKeys.LOGITS])
-      self.assertAllClose(expected_logistics,
-                          predictions[prediction_keys.PredictionKeys.LOGISTIC])
-      self.assertAllClose(
-          expected_probabilities,
-          predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-      self.assertAllClose(expected_class_ids,
-                          predictions[prediction_keys.PredictionKeys.CLASS_IDS])
-      self.assertAllEqual(expected_classes,
-                          predictions[prediction_keys.PredictionKeys.CLASSES])
-      self.assertAllClose(
-          expected_probabilities,
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].scores))
-      self.assertAllEqual(
-          expected_export_classes,
-          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].classes))
-      self.assertAllClose(expected_logistics,
-                          sess.run(spec.export_outputs['regression'].value))
-
-  def test_predict_with_vocabulary_list(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        label_vocabulary=['aang', 'iroh'])
-
-    logits = [[1.], [0.]]
-    expected_classes = [[b'iroh'], [b'aang']]
-
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertAllEqual(
-          expected_classes,
-          sess.run(spec.predictions[prediction_keys.PredictionKeys.CLASSES]))
-
-  def test_eval_create_loss(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-
-    # loss = cross_entropy(labels, logits) = [0, 41].
-    expected_training_loss = 41.
-    # Create loss.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
-
-  def test_eval_labels_none(self):
-    """Tests that error is raised when labels is None."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    with self.assertRaisesRegexp(
-        ValueError, r'You must provide a labels Tensor\. Given: None\.'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=np.array(((45,), (-41,),), dtype=np.float32),
-          labels=None)
-
-  def test_eval(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # loss = sum(cross_entropy(labels, logits)) = sum(0, 41) = 41
-        # loss_mean = loss/2 = 41./2 = 20.5
-        keys.LOSS_MEAN: 20.5,
-        keys.ACCURACY: 1./2,
-        keys.PRECISION: 1.,
-        keys.RECALL: 1./2,
-        keys.PREDICTION_MEAN: 1./2,
-        keys.LABEL_MEAN: 2./2,
-        keys.ACCURACY_BASELINE: 2./2,
-        keys.AUC: 0.,
-        keys.AUC_PR: 1.,
-    }
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(41., loss)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
-
-  def test_eval_metric_ops_with_head_name(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        name='some_binary_head')
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    expected_metric_keys = [
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.LOSS_MEAN),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.ACCURACY),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.PRECISION),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.RECALL),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.PREDICTION_MEAN),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.LABEL_MEAN),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.ACCURACY_BASELINE),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.AUC),
-        '{}/some_binary_head'.format(metric_keys.MetricKeys.AUC_PR),
-    ]
-    self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
-
-  def test_eval_with_regularization_losses(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    regularization_losses = [1.5, 0.5]
-    expected_regularization_loss = 2.
-    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
-    #                    = sum(0, 41) / 2 = 20.5
-    expected_unregularized_loss = 20.5
-    expected_regularized_loss = (
-        expected_unregularized_loss + expected_regularization_loss)
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels,
-        regularization_losses=regularization_losses)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_unregularized_loss,
-        keys.LOSS_REGULARIZATION: expected_regularization_loss,
-        keys.ACCURACY: 1./2,
-        keys.PRECISION: 1.,
-        keys.RECALL: 1./2,
-        keys.PREDICTION_MEAN: 1./2,
-        keys.LABEL_MEAN: 2./2,
-        keys.ACCURACY_BASELINE: 2./2,
-        keys.AUC: 0.,
-        keys.AUC_PR: 1.,
-    }
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_regularized_loss, loss)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
-
-  def test_eval_with_vocabulary_list_create_loss(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        label_vocabulary=['aang', 'iroh'])
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = [[b'iroh'], [b'iroh']]
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # Create loss.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(41., training_loss.eval())
-
-  def test_eval_with_vocabulary_list(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        label_vocabulary=['aang', 'iroh'])
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = [[b'iroh'], [b'iroh']]
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      sess.run(update_ops)
-      self.assertAllClose(1. / 2,
-                          value_ops[metric_keys.MetricKeys.ACCURACY].eval())
-
-  def test_eval_with_thresholds_create_loss(self):
-    thresholds = [0.25, 0.5, 0.75]
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        thresholds=thresholds)
-    logits = np.array(((-1,), (1,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # probabilities[i] = 1/(1 + exp(-logits[i])) =>
-    # probabilities = [1/(1 + exp(1)), 1/(1 + exp(-1))] = [0.269, 0.731]
-    # loss = -ln(probabilities[label[i]])) = [-ln(0.269), -ln(0.731)]
-    #      = [1.31304389, 0.31334182]
-    # weighted sum loss = 1.62638571
-    expected_training_loss = 1.62638571
-    # Create loss.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
-
-  def test_eval_with_thresholds(self):
-    thresholds = [0.25, 0.5, 0.75]
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        thresholds=thresholds)
-    logits = np.array(((-1,), (1,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    # probabilities[i] = 1/(1 + exp(-logits[i])) =>
-    # probabilities = [1/(1 + exp(1)), 1/(1 + exp(-1))] = [0.269, 0.731]
-    # loss = -sum(ln(probabilities[label[i]])) = -ln(0.269) -ln(0.731)
-    #      = 1.62652338
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: 1.62652338 / 2.,
-        keys.ACCURACY: 1./2,
-        keys.PRECISION: 1.,
-        keys.RECALL: .5,
-        keys.PREDICTION_MEAN: 1./2,
-        keys.LABEL_MEAN: 2./2,
-        keys.ACCURACY_BASELINE: 2./2,
-        keys.AUC: 0.,
-        keys.AUC_PR: 1.,
-        keys.ACCURACY_AT_THRESHOLD % thresholds[0]: 1.,
-        keys.PRECISION_AT_THRESHOLD % thresholds[0]: 1.,
-        keys.RECALL_AT_THRESHOLD % thresholds[0]: 1.,
-        keys.ACCURACY_AT_THRESHOLD % thresholds[1]: .5,
-        keys.PRECISION_AT_THRESHOLD % thresholds[1]: 1.,
-        keys.RECALL_AT_THRESHOLD % thresholds[1]: .5,
-        keys.ACCURACY_AT_THRESHOLD % thresholds[2]: 0.,
-        keys.PRECISION_AT_THRESHOLD % thresholds[2]: 0.,
-        keys.RECALL_AT_THRESHOLD % thresholds[2]: 0.,
-    }
-    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(1.62652338, loss)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval()
-                             for k in value_ops},
-          atol=tol,
-          rtol=tol)
-
-  def test_train_create_loss(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.float64)
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    # unreduced_loss = cross_entropy(labels, logits) = [0, 41]
-    expected_unreduced_loss = [[0.], [41.]]
-    # weights default to 1.
-    expected_weights = 1.
-    # training loss = 1 * 0 + 1 * 41
-    expected_training_loss = 41.
-    # Create loss.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_training_loss, training_loss.eval())
-      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
-      self.assertAllClose(expected_weights, actual_weights)
-
-  def test_train_create_loss_loss_reduction(self):
-    """Tests create_loss with loss_reduction."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
-
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.float64)
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    # unreduced_loss = cross_entropy(labels, logits) = [0, 41]
-    expected_unreduced_loss = [[0.], [41.]]
-    # weights default to 1.
-    expected_weights = 1.
-    # training loss = (1 * 0 + 1 * 41) / num_nonzero_weights
-    expected_training_loss = 41. / 2.
-    # Create loss.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_training_loss, training_loss.eval())
-      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
-      self.assertAllClose(expected_weights, actual_weights)
-
-  def test_eval_create_loss_loss_fn(self):
-    """Tests head.create_loss for eval mode and custom loss_fn."""
-    loss = np.array([[1.], [2.]], dtype=np.float32)
-    logits_input = np.array([[-10.], [10.]], dtype=np.float32)
-    labels_input = np.array([[1], [0]], dtype=np.int64)
-    def _loss_fn(labels, logits):
-      check_labels = control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
-          data=[labels])
-      check_logits = control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
-          data=[logits])
-      with ops.control_dependencies([check_labels, check_logits]):
-        return constant_op.constant(loss)
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        loss_fn=_loss_fn)
-
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_input,
-        labels=labels_input)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
-
-  def test_eval_create_loss_loss_fn_wrong_shape(self):
-    """Tests custom loss_fn that returns Tensor of unexpected shape."""
-    loss = np.array([1., 2.], dtype=np.float32)
-    def _loss_fn(labels, logits):
-      del labels, logits  # Unused
-      return constant_op.constant(loss)
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        loss_fn=_loss_fn)
-
-    logits = np.array([[-10.], [10.]], dtype=np.float32)
-    labels = np.array([[1], [0]], dtype=np.int64)
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 1\]\. \] '
-          r'\[logits_shape: \] \[2 1\] \[loss_shape: \] \[2\]'):
-        actual_training_loss.eval()
-
-  def test_train_labels_none(self):
-    """Tests that error is raised when labels is None."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    with self.assertRaisesRegexp(
-        ValueError, r'You must provide a labels Tensor\. Given: None\.'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=np.array(((45,), (-41,),), dtype=np.float32),
-          labels=None,
-          train_op_fn=_no_op_train_fn)
-
-  def test_train(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.float64)
-    expected_train_result = b'my_train_op'
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    # loss = sum(cross_entropy(labels, logits)) = sum(0, 41) = 41
-    expected_loss = 41.
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # loss_mean = loss/2 = 41/2 = 20.5
-          metric_keys.MetricKeys.LOSS_MEAN: 20.5,
-      }, summary_str)
-
-  def test_train_with_optimizer(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.float64)
-    expected_train_result = b'my_train_op'
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    # loss = sum(cross_entropy(labels, logits)) = sum(0, 41) = 41
-    expected_loss = 41.
-
-    class _Optimizer(object):
-
-      def minimize(self, loss, global_step):
-        del global_step
-        with ops.control_dependencies((check_ops.assert_equal(
-            math_ops.to_float(expected_loss), math_ops.to_float(loss),
-            name='assert_loss'),)):
-          return constant_op.constant(expected_train_result)
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        optimizer=_Optimizer())
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run((spec.loss, spec.train_op))
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-
-  def test_train_with_update_ops(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    with ops.Graph().as_default():
-      w = variables.Variable(1)
-      update_op = w.assign_add(1)
-      ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, update_op)
-
-      t = variables.Variable('')
-      expected_train_result = b'my_train_op'
-      def _train_op_fn(loss):
-        del loss
-        return t.assign(expected_train_result)
-
-      spec = head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=np.array(((45,), (-41,),), dtype=np.float32),
-          labels=np.array(((1,), (1,),), dtype=np.float64),
-          train_op_fn=_train_op_fn)
-
-      with self.cached_session() as sess:
-        _initialize_variables(self, spec.scaffold)
-        sess.run(spec.train_op)
-        w_value, t_value = sess.run([w, t])
-        self.assertEqual(2, w_value)
-        self.assertEqual(expected_train_result, t_value)
-
-  def test_train_summaries_with_head_name(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        name='some_binary_head')
-
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.float64)
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    # loss = sum(cross_entropy(labels, logits)) = sum(0, 41) = 41
-    expected_loss = 41.
-
-    def _train_op_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-    # Assert summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      summary_str = sess.run(spec.scaffold.summary_op)
-      _assert_simple_summaries(
-          self,
-          {
-              '{}/some_binary_head'.format(metric_keys.MetricKeys.LOSS):
-                  expected_loss,
-              # loss_mean = loss/2 = 41/2 = 20.5
-              '{}/some_binary_head'.format(metric_keys.MetricKeys.LOSS_MEAN):
-                  20.5,
-          },
-          summary_str)
-
-  def test_train_with_regularization_losses(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-
-    logits = np.array(((45,), (-41,),), dtype=np.float32)
-    labels = np.array(((1,), (1,),), dtype=np.float64)
-    expected_train_result = b'my_train_op'
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    regularization_losses = [1.5, 0.5]
-    expected_regularization_loss = 2.
-    # unregularized_loss = sum(cross_entropy(labels, logits)) / batch_size
-    #                    = sum(0, 41) / 2 = 20.5
-    # loss = unregularized_loss + regularization_loss = 7.
-    expected_loss = 22.5
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn,
-        regularization_losses=regularization_losses)
-
-    # Assert predictions, loss, train_op, and summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((spec.loss, spec.train_op,
-                                                  spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          metric_keys.MetricKeys.LOSS_REGULARIZATION: (
-              expected_regularization_loss),
-      }, summary_str)
-
-  def test_float_labels_invalid_values(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    logits = np.array([[0.5], [-0.3]], dtype=np.float32)
-    labels = np.array([[1.2], [0.4]], dtype=np.float32)
-    features = {'x': np.array([[42]], dtype=np.float32)}
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)[0]
-    with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        r'Labels must <= n_classes - 1'):
-      with self.cached_session():
-        _initialize_variables(self, monitored_session.Scaffold())
-        training_loss.eval()
-
-  def test_float_labels_train_create_loss(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    logits = np.array([[0.5], [-0.3]], dtype=np.float32)
-    labels = np.array([[0.8], [0.4]], dtype=np.float32)
-    features = {'x': np.array([[42]], dtype=np.float32)}
-    # loss = cross_entropy(labels, logits)
-    #      = -label[i]*sigmoid(logit[i]) -(1-label[i])*sigmoid(-logit[i])
-    #      = [-0.8 * log(sigmoid(0.5)) -0.2 * log(sigmoid(-0.5)),
-    #         -0.4 * log(sigmoid(-0.3)) -0.6 * log(sigmoid(0.3))]
-    #      = [0.57407698418, 0.67435524446]
-    # weighted sum loss = 0.57407698418 + 0.67435524446
-    expected_training_loss = 1.24843222864
-    # Create loss.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
-
-  def test_float_labels_train(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    logits = np.array([[0.5], [-0.3]], dtype=np.float32)
-    labels = np.array([[0.8], [0.4]], dtype=np.float32)
-    expected_train_result = b'my_train_op'
-    features = {'x': np.array([[42]], dtype=np.float32)}
-    # loss = sum(cross_entropy(labels, logits))
-    #      = sum(-label[i]*sigmoid(logit[i]) -(1-label[i])*sigmoid(-logit[i]))
-    #      = -0.8 * log(sigmoid(0.5)) -0.2 * log(sigmoid(-0.5))
-    #        -0.4 * log(sigmoid(-0.3)) -0.6 * log(sigmoid(0.3))
-    #      = 1.2484322
-    expected_loss = 1.2484322
-    def _train_op_fn(loss):
-      with ops.control_dependencies((dnn_testing_utils.assert_close(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss)),)):
-        return constant_op.constant(expected_train_result)
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    # Assert predictions, loss, train_op, and summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run((spec.loss, spec.train_op))
-      self.assertAlmostEqual(expected_loss, loss, delta=1.e-5)
-      self.assertEqual(expected_train_result, train_result)
-
-  def test_float_labels_eval_create_loss(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    logits = np.array([[0.5], [-0.3]], dtype=np.float32)
-    labels = np.array([[0.8], [0.4]], dtype=np.float32)
-    features = {'x': np.array([[42]], dtype=np.float32)}
-    # loss = cross_entropy(labels, logits)
-    #      = -label[i]*sigmoid(logit[i]) -(1-label[i])*sigmoid(-logit[i])
-    #      = [-0.8 * log(sigmoid(0.5)) -0.2 * log(sigmoid(-0.5)),
-    #         -0.4 * log(sigmoid(-0.3)) -0.6 * log(sigmoid(0.3))]
-    #      = [0.57407698418, 0.67435524446]
-    # weighted sum loss = 0.57407698418 + 0.67435524446
-    expected_training_loss = 1.24843222864
-    # Create loss.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(), rtol=1e-2, atol=1e-2)
-
-  def test_float_labels_eval(self):
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
-
-    logits = np.array([[0.5], [-0.3]], dtype=np.float32)
-    labels = np.array([[0.8], [0.4]], dtype=np.float32)
-    features = {'x': np.array([[42]], dtype=np.float32)}
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    # loss = sum(cross_entropy(labels, logits))
-    #      = sum(-label[i]*sigmoid(logit[i]) -(1-label[i])*sigmoid(-logit[i]))
-    #      = -0.8 * log(sigmoid(0.5)) -0.2 * log(sigmoid(-0.5))
-    #        -0.4 * log(sigmoid(-0.3)) -0.6 * log(sigmoid(0.3))
-    #      = 1.2484322
-    expected_loss = 1.2484322
-
-    # Assert loss.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAlmostEqual(expected_loss, loss, delta=1.e-5)
-      self.assertAlmostEqual(
-          expected_loss / 2., metrics[metric_keys.MetricKeys.LOSS_MEAN])
-
-  def test_weighted_multi_example_predict(self):
-    """3 examples, 1 batch."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='label_weights')
-
-    # Create estimator spec.
-    logits = np.array(((45,), (-41,), (44,)), dtype=np.int32)
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array(((42,), (43,), (44,)), dtype=np.int32),
-            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float32),
-        },
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      predictions = sess.run(spec.predictions)
-      self.assertAllClose(
-          logits.astype(np.float32),
-          predictions[prediction_keys.PredictionKeys.LOGITS])
-      self.assertAllClose(
-          _sigmoid(logits).astype(np.float32),
-          predictions[prediction_keys.PredictionKeys.LOGISTIC])
-      self.assertAllClose(
-          [[0., 1.], [1., 0.],
-           [0., 1.]], predictions[prediction_keys.PredictionKeys.PROBABILITIES])
-      self.assertAllClose([[1], [0], [1]],
-                          predictions[prediction_keys.PredictionKeys.CLASS_IDS])
-      self.assertAllEqual([[b'1'], [b'0'], [b'1']],
-                          predictions[prediction_keys.PredictionKeys.CLASSES])
-
-  def test_weighted_multi_example_eval(self):
-    """3 examples, 1 batch."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='label_weights')
-
-    # Create estimator spec.
-    logits = np.array(((45,), (-41,), (44,)), dtype=np.int32)
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array(((42,), (43,), (44,)), dtype=np.int32),
-            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float32),
-        },
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=np.array(((1,), (1,), (0,)), dtype=np.int32))
-
-    # label_mean = (1*1 + .1*1 + 1.5*0)/(1 + .1 + 1.5) = 1.1/2.6
-    #            = .42307692307
-    expected_label_mean = .42307692307
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        # losses = label_weights*cross_entropy(labels, logits)
-        #        = (1*0 + .1*41 + 1.5*44) = (1, 4.1, 66)
-        # loss = sum(losses) = 1 + 4.1 + 66 = 70.1
-        # loss_mean = loss/sum(label_weights) = 70.1/(1 + .1 + 1.5)
-        #           = 70.1/2.6 = 26.9615384615
-        keys.LOSS_MEAN: 26.9615384615,
-        # accuracy = (1*1 + .1*0 + 1.5*0)/(1 + .1 + 1.5) = 1/2.6 = .38461538461
-        keys.ACCURACY: .38461538461,
-        keys.PRECISION: 1./2.5,
-        keys.RECALL: 1./1.1,
-        # prediction_mean = (1*1 + .1*0 + 1.5*1)/(1 + .1 + 1.5) = 2.5/2.6
-        #                 = .96153846153
-        keys.PREDICTION_MEAN: .96153846153,
-        keys.LABEL_MEAN: expected_label_mean,
-        keys.ACCURACY_BASELINE: 1 - expected_label_mean,
-        keys.AUC: .45454565,
-        keys.AUC_PR: .6737757325172424,
-    }
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(70.1, loss)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
-
-  def test_train_one_dim_create_loss(self):
-    """Tests create_loss with 1D labels and weights (shape [batch_size])."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='label_weights')
-
-    # Create estimator spec.
-    logits = np.array(((45,), (-41,), (44,)), dtype=np.float32)
-    labels_rank_1 = np.array((1., 1., 0.,))
-    weights_rank_1 = np.array(((1., .1, 1.5,)), dtype=np.float64)
-    features = {
-        'x': np.array(((42.,), (43.,), (44.,)), dtype=np.float32),
-        'label_weights': weights_rank_1,
-    }
-    # unreduced_loss = cross_entropy(labels, logits) = [0, 41, 44]
-    expected_unreduced_loss = [[0.], [41.], [44.]]
-    # weights are reshaped to [3, 1] to match logits.
-    expected_weights = [[1.], [.1], [1.5]]
-    # training loss = 1 * 0 + .1 * 41 + 1.5 * 44
-    expected_training_loss = 70.1
-    # Create loss.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels_rank_1)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(),
-          rtol=1e-2, atol=1e-2)
-      self.assertAllClose(
-          expected_unreduced_loss, unreduced_loss.eval(),
-          rtol=1e-2, atol=1e-2)
-      self.assertAllClose(expected_weights, actual_weights.eval())
-
-  def test_train_one_dim(self):
-    """Tests train with 1D labels and weights (shape [batch_size])."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='label_weights')
-
-    # Create estimator spec.
-    logits = np.array(((45,), (-41,), (44,)), dtype=np.float32)
-    labels_rank_1 = np.array((1., 1., 0.,))
-    weights_rank_1 = np.array(((1., .1, 1.5,)), dtype=np.float64)
-    self.assertEqual((3,), labels_rank_1.shape)
-    self.assertEqual((3,), weights_rank_1.shape)
-    features = {
-        'x': np.array(((42.,), (43.,), (44.,)), dtype=np.float32),
-        'label_weights': weights_rank_1,
-    }
-    expected_train_result = b'my_train_op'
-    # losses = label_weights*cross_entropy(labels, logits)
-    #        = (1*0 + .1*41 + 1.5*44) = (1, 4.1, 66)
-    # loss = sum(losses) = 1 + 4.1 + 66 = 70.1
-    expected_loss = 70.1
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels_rank_1,
-        train_op_fn=_train_op_fn)
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertIsNotNone(spec.train_op)
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((
-          spec.loss, spec.train_op, spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # loss_mean = loss/sum(label_weights) = 70.1/(1 + .1 + 1.5)
-          #           = 70.1/2.6 = 26.9615384615
-          metric_keys.MetricKeys.LOSS_MEAN: 26.9615384615,
-      }, summary_str)
-
-  def test_weighted_multi_example_train(self):
-    """3 examples, 1 batch."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='label_weights')
-
-    # Create estimator spec.
-    logits = np.array(((45,), (-41,), (44,)), dtype=np.float32)
-    expected_train_result = b'my_train_op'
-    # losses = label_weights*cross_entropy(labels, logits)
-    #        = (1*0 + .1*41 + 1.5*44) = (1, 4.1, 66)
-    # loss = sum(losses) = 1 + 4.1 + 66 = 70.1
-    expected_loss = 70.1
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array(((42.,), (43.,), (44.,)), dtype=np.float32),
-            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float64),
-        },
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=np.array(((1.,), (1.,), (0.,))),
-        train_op_fn=_train_op_fn)
-
-    # Assert spec contains expected tensors.
-    self.assertIsNotNone(spec.loss)
-    self.assertIsNotNone(spec.train_op)
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      loss, train_result, summary_str = sess.run((
-          spec.loss, spec.train_op, spec.scaffold.summary_op))
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # loss_mean = loss/sum(label_weights) = 70.1/(1 + .1 + 1.5)
-          #           = 70.1/2.6 = 26.9615384615
-          metric_keys.MetricKeys.LOSS_MEAN: 26.9615384615,
-      }, summary_str)
-
-  def test_multi_dim_weighted_train_create_loss(self):
-    """Logits and labels of shape [2, 2, 1], weights [2, 2]."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='weights')
-
-    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
-    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
-    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    # unreduced_loss = cross_entropy(labels, logits) = [[10, 0], [0, 12]].
-    expected_unreduced_loss = [[[10.], [0.]], [[0.], [12.]]]
-    # Weights are reshaped to [2, 2, 1] to match logits.
-    expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
-    # training_loss = 1*10 + 1.5*0 + 2*0 + 2.5*12 = 40
-    expected_training_loss = 40.
-    # Create loss.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    tol = 1e-2
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_training_loss, training_loss.eval(),
-          rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_unreduced_loss, unreduced_loss.eval(),
-          rtol=tol, atol=tol)
-      self.assertAllClose(expected_weights, actual_weights.eval())
-
-  def test_multi_dim_weighted_train(self):
-    """Logits and labels of shape [2, 2, 1], weights [2, 2]."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='weights')
-
-    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
-    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
-    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    # loss = cross_entropy(labels, logits) = [[10, 0], [0, 12]].
-    # weighted_sum_loss = 1*10 + 1.5*0 + 2*0 + 2.5*12 = 40
-    expected_loss = 40.
-    expected_train_result = 'my_train_op'
-    def _train_op_fn(loss):
-      return string_ops.string_join(
-          [constant_op.constant(expected_train_result),
-           string_ops.as_string(loss, precision=2)])
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    # Assert predictions, loss, train_op, and summaries.
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run((spec.loss, spec.train_op))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      self.assertEqual(
-          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
-          train_result)
-
-  def test_multi_dim_train_weights_wrong_inner_dim(self):
-    """Logits and labels of shape [2, 2, 1], weights [2, 1]."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='weights')
-
-    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
-    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
-    weights = np.array([[1.], [2.]], dtype=np.float32)
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_no_op_train_fn)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[logits_shape: \] \[2 2 1\] \[weights_shape: \] \[2 1\]'):
-        spec.loss.eval()
-
-  def test_multi_dim_train_weights_wrong_outer_dim(self):
-    """Logits and labels of shape [2, 2, 1], weights [2, 2, 2]."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='weights')
-
-    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
-    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
-    weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features={'weights': weights_placeholder},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_no_op_train_fn)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[logits_shape: \]\s\[2 2 1\]\s\[weights_shape: \]\s\[2 2 2\]'):
-        spec.loss.eval({
-            weights_placeholder: np.array([[[1., 1.1], [1.5, 1.6]],
-                                           [[2., 2.1], [2.5, 2.6]]])})
-
-  def test_multi_dim_weighted_eval(self):
-    """Logits and labels of shape [2, 2, 1], weights [2, 2]."""
-    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        weight_column='weights')
-
-    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
-    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
-    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
-    # loss = cross_entropy(labels, logits) = [[10, 0], [0, 12]].
-    # weighted_sum_loss = 1*10 + 1.5*0 + 2*0 + 2.5*12 = 40
-    expected_loss = 40.
-
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features={'weights': weights},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_loss / np.sum(weights),
-        keys.ACCURACY: (1.*0. + 1.5*1. + 2.*1. + 2.5*0.) / np.sum(weights),
-        keys.PRECISION: 2.0/3.0,
-        keys.RECALL: 2.0/4.5,
-        keys.PREDICTION_MEAN: (1.*1 + 1.5*0 + 2.*1 + 2.5*0) / np.sum(weights),
-        keys.LABEL_MEAN: (1.*0 + 1.5*0 + 2.*1 + 2.5*1) / np.sum(weights),
-        keys.ACCURACY_BASELINE: (1.*0 + 1.5*0 + 2.*1 + 2.5*1) / np.sum(weights),
-        # We cannot reliably calculate AUC with only 4 data points, but the
-        # values should not change because of backwards-compatibility.
-        keys.AUC: 0.5222,
-        keys.AUC_PR: 0.7341,
-    }
-
-    tol = 1e-2
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      loss, metrics = sess.run((spec.loss, update_ops))
-      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
-          rtol=tol, atol=tol)
-
-
-class RegressionHead(test.TestCase):
-
-  def setUp(self):
-    ops.reset_default_graph()
-
-  def test_invalid_label_dimension(self):
-    with self.assertRaisesRegexp(ValueError, r'Invalid label_dimension'):
-      head_lib._regression_head(label_dimension=-1)
-    with self.assertRaisesRegexp(ValueError, r'Invalid label_dimension'):
-      head_lib._regression_head(label_dimension=0)
-
-  def test_invalid_loss_reduction(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'Invalid loss_reduction: invalid_loss_reduction'):
-      head_lib._regression_head(loss_reduction='invalid_loss_reduction')
-    with self.assertRaisesRegexp(
-        ValueError, r'Invalid loss_reduction: none'):
-      head_lib._regression_head(loss_reduction=losses.Reduction.NONE)
-
-  def test_loss_fn_arg_labels_missing(self):
-    def _loss_fn(logits):
-      del logits  # Unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn must contain argument: labels\. '
-        r'Given arguments: \(\'logits\',\)'):
-      head_lib._regression_head(loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_logits_missing(self):
-    def _loss_fn(labels):
-      del labels  # unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn must contain argument: logits\. '
-        r'Given arguments: \(\'labels\',\)'):
-      head_lib._regression_head(loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_features_ok(self):
-    def _loss_fn(labels, logits, features):
-      del labels, logits, features  # Unused
-      head_lib._regression_head(loss_fn=_loss_fn)
-
-  def test_loss_fn_arg_invalid(self):
-    def _loss_fn(labels, logits, name=None):
-      del labels, logits, name  # Unused
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'loss_fn has unexpected args: \[\'name\'\]'):
-      head_lib._regression_head(loss_fn=_loss_fn)
-
-  def test_invalid_logits(self):
-    head = head_lib._regression_head(label_dimension=3)
-    self.assertEqual(3, head.logits_dimension)
-    logits_1d = np.array(((45.,), (41.,),))
-
-    # Static shape.
-    with self.assertRaisesRegexp(ValueError, 'logits shape'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42.,),))},
-          mode=model_fn.ModeKeys.PREDICT,
-          logits=logits_1d)
-
-    # Dynamic shape.
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),))},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits_placeholder)
-    with self.cached_session():
-      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
-        spec.predictions[prediction_keys.PredictionKeys.PREDICTIONS].eval({
-            logits_placeholder: logits_1d
-        })
-
-  def test_incompatible_labels_eval(self):
-    head = head_lib._regression_head(label_dimension=3)
-    self.assertEqual(3, head.logits_dimension)
-    values_3d = np.array(((45., 46., 47.), (41., 42., 43.),))
-    values_1d = np.array(((43.,), (44.,),))
-
-    # Static shape.
-    with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'):
-      head.create_loss(
-          features={'x': values_1d},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=values_3d,
-          labels=values_1d)
-    with self.assertRaisesRegexp(ValueError, 'logits shape'):
-      head.create_estimator_spec(
-          features={'x': values_3d}, labels=values_3d,
-          mode=model_fn.ModeKeys.EVAL, logits=values_1d, train_op_fn=None)
-
-    # Dynamic shape.
-    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    spec = head.create_estimator_spec(
-        features={'x': values_1d},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_placeholder,
-        labels=labels_placeholder)
-    with self.cached_session():
-      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
-        spec.loss.eval({
-            labels_placeholder: values_3d,
-            logits_placeholder: values_1d
-        })
-    training_loss = head.create_loss(
-        features={'x': values_1d},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_placeholder,
-        labels=labels_placeholder)[0]
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[expected_labels_shape: \] \[2 3\] \[labels_shape: \] \[2 1\]'):
-        training_loss.eval({
-            labels_placeholder: values_1d,
-            logits_placeholder: values_3d
-        })
-
-  def test_incompatible_labels_train(self):
-    head = head_lib._regression_head(label_dimension=3)
-    self.assertEqual(3, head.logits_dimension)
-    values_3d = np.array(((45., 46., 47.), (41., 42., 43.),))
-    values_1d = np.array(((43.,), (44.,),))
-
-    # Static shape.
-    with self.assertRaisesRegexp(ValueError, 'Mismatched label shape'):
-      head.create_loss(
-          features={'x': values_1d},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=values_3d,
-          labels=values_1d)
-
-    with self.assertRaisesRegexp(ValueError, 'logits shape'):
-      head.create_estimator_spec(
-          features={'x': values_3d},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=values_1d,
-          labels=values_3d,
-          train_op_fn=lambda x: x)
-
-    # Dynamic shape.
-    labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    spec = head.create_estimator_spec(
-        features={'x': values_1d},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits_placeholder,
-        labels=labels_placeholder,
-        train_op_fn=lambda x: x)
-    with self.cached_session():
-      with self.assertRaisesRegexp(errors.OpError, 'logits shape'):
-        spec.loss.eval({
-            labels_placeholder: values_3d,
-            logits_placeholder: values_1d
-        })
-    training_loss = head.create_loss(
-        features={'x': values_1d},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits_placeholder,
-        labels=labels_placeholder)[0]
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[expected_labels_shape: \] \[2 3\] \[labels_shape: \] \[2 1\]'):
-        training_loss.eval({
-            labels_placeholder: values_1d,
-            logits_placeholder: values_3d
-        })
-
-  def test_name(self):
-    head = head_lib._regression_head(name='foo')
-    self.assertEqual('foo', head.name)
-
-  def test_predict(self):
-    head = head_lib._regression_head()
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,),), dtype=np.int32)
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    # Assert spec contains expected tensors.
-    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
-    self.assertIsNone(spec.loss)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNone(spec.train_op)
-    default_serving_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-    self.assertItemsEqual(
-        (default_serving_key, 'predict', 'regression'),
-        spec.export_outputs.keys())
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions.
-    with self.cached_session():
-      _initialize_variables(self, spec.scaffold)
-      self.assertAllClose(logits, spec.predictions[prediction_key].eval())
-      self.assertAllClose(
-          logits, spec.export_outputs[default_serving_key].value.eval())
-      self.assertAllClose(
-          logits, spec.export_outputs['regression'].value.eval())
-      self.assertAllClose(
-          logits, spec.export_outputs['predict'].outputs['predictions'].eval())
-
-  def test_predict_with_inverse_link_fn(self):
-    def _inverse_link_fn(logits):
-      return logits - 10.
-    head = head_lib._regression_head(inverse_link_fn=_inverse_link_fn)
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,),), dtype=np.int32)
-    expected_predictions = np.array(((35,), (31,),), dtype=np.int32)
-    spec = head.create_estimator_spec(
-        features={'x': np.array(((42.,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.PREDICT,
-        logits=logits)
-
-    # Assert spec contains expected tensors.
-    keys = prediction_keys.PredictionKeys
-    self.assertItemsEqual(
-        (keys.PREDICTIONS, keys.LOGITS), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[keys.PREDICTIONS].dtype)
-    self.assertEqual(dtypes.float32, spec.predictions[keys.LOGITS].dtype)
-    default_serving_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-    self.assertItemsEqual(
-        (default_serving_key, 'predict', 'regression'),
-        spec.export_outputs.keys())
-
-    # Assert predictions.
-    with self.cached_session():
-      _initialize_variables(self, spec.scaffold)
-      self.assertAllClose(
-          expected_predictions, spec.predictions[keys.PREDICTIONS].eval())
-      self.assertAllClose(logits, spec.predictions[keys.LOGITS].eval())
-      self.assertAllClose(
-          expected_predictions,
-          spec.export_outputs[default_serving_key].value.eval())
-      self.assertAllClose(
-          expected_predictions, spec.export_outputs['regression'].value.eval())
-      self.assertAllClose(
-          expected_predictions,
-          spec.export_outputs['predict'].outputs['predictions'].eval())
-      self.assertAllClose(
-          logits, spec.export_outputs['predict'].outputs['logits'].eval())
-
-  def test_eval_create_loss(self):
-    head = head_lib._regression_head()
-    logits = np.array(((45,), (41,),), dtype=np.float32)
-    labels = np.array(((43,), (44,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    # Create loss.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      # loss = [(43-45)^2, (44-41)] = [4, 9]
-      self.assertAllClose(13., training_loss.eval())
-
-  def test_eval_create_loss_loss_fn(self):
-    """Tests head.create_loss for eval mode and custom loss_fn."""
-    loss = np.array([[0., 1.], [2., 3.]], dtype=np.float32)
-    logits_input = np.array([[-1., 1.], [-2., 2.]], dtype=np.float32)
-    labels_input = np.array([[1., 0.], [2., -1.]], dtype=np.float32)
-    def _loss_fn(labels, logits):
-      check_labels = control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(labels, labels_input)),
-          data=[labels])
-      check_logits = control_flow_ops.Assert(
-          math_ops.reduce_all(math_ops.equal(logits, logits_input)),
-          data=[logits])
-      with ops.control_dependencies([check_labels, check_logits]):
-        return constant_op.constant(loss)
-    head = head_lib._regression_head(label_dimension=2, loss_fn=_loss_fn)
-
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits_input,
-        labels=labels_input)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(np.sum(loss), actual_training_loss.eval())
-
-  def test_eval_create_loss_loss_fn_wrong_shape(self):
-    """Tests custom loss_fn that returns Tensor of unexpected shape."""
-    loss = np.array([[1.], [2.]], dtype=np.float32)
-    def _loss_fn(labels, logits):
-      del labels, logits  # Unused
-      return constant_op.constant(loss)
-    head = head_lib._regression_head(label_dimension=2, loss_fn=_loss_fn)
-
-    logits = np.array([[-1., 1.], [-2., 2.]], dtype=np.float32)
-    labels = np.array([[1., 0.], [2., -1.]], dtype=np.float32)
-    actual_training_loss = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[loss_fn must return Tensor of shape \[D0, D1, ... DN, 2\]\. \] '
-          r'\[logits_shape: \] \[2 2\] \[loss_shape: \] \[2 1\]'):
-        actual_training_loss.eval()
-
-  def test_eval_labels_none(self):
-    """Tests that error is raised when labels is None."""
-    head = head_lib._regression_head()
-
-    with self.assertRaisesRegexp(
-        ValueError, r'You must provide a labels Tensor\. Given: None\.'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.EVAL,
-          logits=np.array(((45,), (41,),), dtype=np.float32),
-          labels=None)
-
-  def test_eval(self):
-    head = head_lib._regression_head()
-    self.assertEqual(1, head.logits_dimension)
-
-    logits = np.array(((45,), (41,),), dtype=np.float32)
-    labels = np.array(((43,), (44,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    # Assert spec contains expected tensors.
-    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
-    self.assertEqual(dtypes.float32, spec.loss.dtype)
-    self.assertItemsEqual((metric_keys.MetricKeys.LOSS_MEAN,
-                           metric_keys.MetricKeys.PREDICTION_MEAN,
-                           metric_keys.MetricKeys.LABEL_MEAN),
-                          spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      loss_mean_value_op, loss_mean_update_op = spec.eval_metric_ops[
-          metric_keys.MetricKeys.LOSS_MEAN]
-      predictions, loss, loss_mean = sess.run((
-          spec.predictions[prediction_key], spec.loss, loss_mean_update_op))
-      self.assertAllClose(logits, predictions)
-      # loss = (43-45)^2 + (44-41)^2 = 4+9 = 13
-      self.assertAllClose(13., loss)
-      # loss_mean = loss/2 = 13/2 = 6.5
-      expected_loss_mean = 6.5
-      # Check results of both update (in `loss_mean`) and value ops.
-      self.assertAllClose(expected_loss_mean, loss_mean)
-      self.assertAllClose(expected_loss_mean, loss_mean_value_op.eval())
-
-  def test_eval_metric_ops_with_head_name_for_regression(self):
-    head = head_lib._regression_head(name='some_regression_head')
-    logits = np.array(((1,), (9,)), dtype=np.float32)
-    labels = np.array(((1,), (1,)), dtype=np.int64)
-    features = {'x': np.array(((42,),), dtype=np.int32)}
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    expected_metric_keys = [
-        '{}/some_regression_head'.format(metric_keys.MetricKeys.LOSS_MEAN),
-        '{}/some_regression_head'.format(
-            metric_keys.MetricKeys.PREDICTION_MEAN),
-        '{}/some_regression_head'.format(metric_keys.MetricKeys.LABEL_MEAN),
-    ]
-    self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
-
-  def test_eval_with_regularization_losses(self):
-    head = head_lib._regression_head(
-        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-    self.assertEqual(1, head.logits_dimension)
-
-    logits = np.array(((45,), (41,),), dtype=np.float32)
-    labels = np.array(((43,), (44,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    regularization_losses = [1.5, 0.5]
-    expected_regularization_loss = 2.
-    # unregularized_loss = ((43-45)^2 + (44-41)^2) / batch_size
-    #                    = (4 + 9) / 2 = 6.5
-    expected_unregularized_loss = 6.5
-    expected_regularized_loss = (
-        expected_unregularized_loss + expected_regularization_loss)
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels,
-        regularization_losses=regularization_losses)
-
-    keys = metric_keys.MetricKeys
-    expected_metrics = {
-        keys.LOSS_MEAN: expected_unregularized_loss,
-        keys.LOSS_REGULARIZATION: expected_regularization_loss,
-        keys.PREDICTION_MEAN: (45 + 41) / 2.0,
-        keys.LABEL_MEAN: (43 + 44) / 2.0,
-    }
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
-      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-      prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-      predictions, loss, metrics = sess.run((
-          spec.predictions[prediction_key], spec.loss, update_ops))
-      self.assertAllClose(logits, predictions)
-      self.assertAllClose(expected_regularized_loss, loss)
-      # Check results of both update (in `metrics`) and value ops.
-      self.assertAllClose(expected_metrics, metrics)
-      self.assertAllClose(
-          expected_metrics, {k: value_ops[k].eval() for k in value_ops})
-
-  def test_train_create_loss(self):
-    head = head_lib._regression_head()
-    logits = np.array(((45,), (41,),), dtype=np.float32)
-    labels = np.array(((43,), (44,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    # unreduced_loss = [(43-45)^2, (44-41)] = [4, 9]
-    expected_unreduced_loss = [[4.], [9.]]
-    # weights default to 1.
-    expected_weights = 1
-    # training_loss = 1 * 4 + 1 * 9 = 13
-    expected_training_loss = 13.
-    # Create loss.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_training_loss, training_loss.eval())
-      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
-      self.assertAllClose(expected_weights, actual_weights)
-
-  def test_train_create_loss_loss_reduction(self):
-    """Tests create_loss with loss_reduction."""
-    head = head_lib._regression_head(
-        loss_reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
-    logits = np.array(((45,), (41,),), dtype=np.float32)
-    labels = np.array(((43,), (44,),), dtype=np.int32)
-    features = {'x': np.array(((42,),), dtype=np.float32)}
-    # unreduced_loss = [(43-45)^2, (44-41)] = [4, 9]
-    expected_unreduced_loss = [[4.], [9.]]
-    # weights default to 1.
-    expected_weights = 1
-    # training_loss = (1 * 4 + 1 * 9) / num_nonzero_weights
-    expected_training_loss = 13. / 2.
-    # Create loss.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_training_loss, training_loss.eval())
-      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
-      self.assertAllClose(expected_weights, actual_weights)
-
-  def test_train_labels_none(self):
-    """Tests that error is raised when labels is None."""
-    head = head_lib._regression_head()
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    with self.assertRaisesRegexp(
-        ValueError, r'You must provide a labels Tensor\. Given: None\.'):
-      head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=np.array(((45,), (41,),), dtype=np.float32),
-          labels=None,
-          train_op_fn=_no_op_train_fn)
-
-  def test_train(self):
-    head = head_lib._regression_head()
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,),), dtype=np.float32)
-    labels = np.array(((43.,), (44.,),), dtype=np.float64)
-    expected_train_result = b'my_train_op'
-    features = {'x': np.array(((42.,),), dtype=np.float32)}
-    # loss = (43-45)^2 + (44-41)^2 = 4 + 9 = 13
-    expected_loss = 13
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    # Assert spec contains expected tensors.
-    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
-    self.assertEqual(dtypes.float32, spec.loss.dtype)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      predictions, loss, train_result, summary_str = sess.run((
-          spec.predictions[prediction_key], spec.loss, spec.train_op,
-          spec.scaffold.summary_op))
-      self.assertAllClose(logits, predictions)
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # loss_mean = loss/2 = 13/2 = 6.5
-          metric_keys.MetricKeys.LOSS_MEAN: 6.5,
-      }, summary_str)
-
-  def test_train_with_optimizer(self):
-    head = head_lib._regression_head()
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,),), dtype=np.float32)
-    labels = np.array(((43.,), (44.,),), dtype=np.float64)
-    expected_train_result = b'my_train_op'
-    features = {'x': np.array(((42.,),), dtype=np.float32)}
-    # loss = (43-45)^2 + (44-41)^2 = 4 + 9 = 13
-    expected_loss = 13
-
-    class _Optimizer(object):
-
-      def minimize(self, loss, global_step):
-        del global_step
-        with ops.control_dependencies((check_ops.assert_equal(
-            math_ops.to_float(expected_loss), math_ops.to_float(loss),
-            name='assert_loss'),)):
-          return constant_op.constant(expected_train_result)
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        optimizer=_Optimizer())
-
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss, train_result = sess.run((spec.loss, spec.train_op))
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-
-  def test_train_with_update_ops(self):
-    head = head_lib._regression_head()
-
-    with ops.Graph().as_default():
-      w = variables.Variable(1)
-      update_op = w.assign_add(1)
-      ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, update_op)
-
-      t = variables.Variable('')
-      expected_train_result = b'my_train_op'
-      def _train_op_fn(loss):
-        del loss
-        return t.assign(expected_train_result)
-
-      spec = head.create_estimator_spec(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=np.array(((45,), (41,),), dtype=np.float32),
-          labels=np.array(((43.,), (44.,),), dtype=np.float64),
-          train_op_fn=_train_op_fn)
-
-      with self.cached_session() as sess:
-        _initialize_variables(self, spec.scaffold)
-        sess.run(spec.train_op)
-        w_value, t_value = sess.run([w, t])
-        self.assertEqual(2, w_value)
-        self.assertEqual(expected_train_result, t_value)
-
-  def test_train_summaries_with_head_name(self):
-    head = head_lib._regression_head(name='some_regression_head')
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,),), dtype=np.float32)
-    labels = np.array(((43.,), (44.,),), dtype=np.float64)
-    features = {'x': np.array(((42.,),), dtype=np.float32)}
-    # loss = (43-45)^2 + (44-41)^2 = 4 + 9 = 13
-    expected_loss = 13
-
-    def _train_op_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    # Assert summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      summary_str = sess.run(spec.scaffold.summary_op)
-      _assert_simple_summaries(
-          self,
-          {
-              '{}/some_regression_head'.format(metric_keys.MetricKeys.LOSS):
-                  expected_loss,
-              # loss_mean = loss/2 = 13/2 = 6.5
-              '{}/some_regression_head'
-              .format(metric_keys.MetricKeys.LOSS_MEAN):
-                  6.5,
-          },
-          summary_str)
-
-  def test_train_with_regularization_losses(self):
-    head = head_lib._regression_head(
-        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,),), dtype=np.float32)
-    labels = np.array(((43.,), (44.,),), dtype=np.float64)
-    expected_train_result = b'my_train_op'
-    features = {'x': np.array(((42.,),), dtype=np.float32)}
-    regularization_losses = [1.5, 0.5]
-    expected_regularization_loss = 2.
-    # unregularized_loss = ((43-45)^2 + (44-41)^2) / batch_size
-    #                    = (4 + 9) / 2 = 6.5
-    # loss = unregularized_loss + regularization_loss = 8.5
-    expected_loss = 8.5
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn,
-        regularization_losses=regularization_losses)
-
-    # Assert predictions, loss, train_op, and summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-      predictions, loss, train_result, summary_str = sess.run((
-          spec.predictions[prediction_key], spec.loss, spec.train_op,
-          spec.scaffold.summary_op))
-      self.assertAllClose(logits, predictions)
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          metric_keys.MetricKeys.LOSS_REGULARIZATION: (
-              expected_regularization_loss),
-      }, summary_str)
-
-  def test_weighted_multi_example_eval(self):
-    """1d label, 3 examples, 1 batch."""
-    head = head_lib._regression_head(weight_column='label_weights')
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,), (44,)), dtype=np.int32)
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array(((42,), (43,), (44,)), dtype=np.int32),
-            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float32),
-        },
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=np.array(((35,), (42,), (45,)), dtype=np.int32))
-
-    # Assert spec contains expected tensors.
-    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
-    self.assertEqual(dtypes.float32, spec.loss.dtype)
-    self.assertItemsEqual((metric_keys.MetricKeys.LOSS_MEAN,
-                           metric_keys.MetricKeys.PREDICTION_MEAN,
-                           metric_keys.MetricKeys.LABEL_MEAN),
-                          spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      loss_mean_value_op, loss_mean_update_op = spec.eval_metric_ops[
-          metric_keys.MetricKeys.LOSS_MEAN]
-      predictions, loss, loss_mean = sess.run((
-          spec.predictions[prediction_key], spec.loss, loss_mean_update_op))
-      self.assertAllClose(logits, predictions)
-      # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
-      self.assertAllClose(101.6, loss)
-      # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.0769231
-      expected_loss_mean = 39.0769231
-      # Check results of both update (in `loss_mean`) and value ops.
-      self.assertAllClose(expected_loss_mean, loss_mean)
-      self.assertAllClose(expected_loss_mean, loss_mean_value_op.eval())
-
-  def test_weight_with_numeric_column(self):
-    """1d label, 3 examples, 1 batch."""
-    head = head_lib._regression_head(
-        weight_column=feature_column_lib.numeric_column(
-            'label_weights', normalizer_fn=lambda x: x + 1.))
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,), (44,)), dtype=np.int32)
-    spec = head.create_estimator_spec(
-        features={
-            'x':
-                np.array(((42,), (43,), (44,)), dtype=np.int32),
-            'label_weights':
-                np.array(((0.,), (-0.9,), (0.5,)), dtype=np.float32),
-        },
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=np.array(((35,), (42,), (45,)), dtype=np.int32))
-
-    # Assert loss.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      loss = sess.run(spec.loss)
-      # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
-      self.assertAllClose(101.6, loss)
-
-  def test_weighted_multi_example_train(self):
-    """1d label, 3 examples, 1 batch."""
-    head = head_lib._regression_head(weight_column='label_weights')
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,), (44,)), dtype=np.float32)
-    expected_train_result = b'my_train_op'
-    # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
-    expected_loss = 101.6
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-    spec = head.create_estimator_spec(
-        features={
-            'x': np.array(((42,), (43,), (44,)), dtype=np.float32),
-            'label_weights': np.array(((1.,), (.1,), (1.5,)), dtype=np.float64),
-        },
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=np.array(((35.,), (42.,), (45.,)), dtype=np.float32),
-        train_op_fn=_train_op_fn)
-
-    # Assert spec contains expected tensors.
-    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
-    self.assertEqual(dtypes.float32, spec.loss.dtype)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      predictions, loss, train_result, summary_str = sess.run((
-          spec.predictions[prediction_key], spec.loss, spec.train_op,
-          spec.scaffold.summary_op))
-      self.assertAllClose(logits, predictions)
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.0769231
-          metric_keys.MetricKeys.LOSS_MEAN: 39.0769231,
-      }, summary_str)
-
-  def test_train_one_dim_create_loss(self):
-    """Tests create_loss with 1D labels and weights (shape [batch_size])."""
-    head = head_lib._regression_head(weight_column='label_weights')
-    logits = np.array(((45,), (41,), (44,)), dtype=np.float32)
-    x_feature_rank_1 = np.array((42., 43., 44.,), dtype=np.float32)
-    weight_rank_1 = np.array((1., .1, 1.5,), dtype=np.float64)
-    labels_rank_1 = np.array((35., 42., 45.,))
-    # unreduced_loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
-    expected_unreduced_loss = [[100.], [1.], [1.]]
-    # weights are reshaped to [3, 1] to match logits.
-    expected_weights = [[1.], [.1], [1.5]]
-    # training_loss = 100 * 1 + 1 * .1 + 1.5 * 1 = 101.6
-    expected_training_loss = 101.6
-    features = {'x': x_feature_rank_1, 'label_weights': weight_rank_1}
-    # Create loss.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels_rank_1)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_training_loss, training_loss.eval())
-      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
-      self.assertAllClose(expected_weights, actual_weights.eval())
-
-  def test_train_one_dim(self):
-    """Tests train with 1D labels and weights (shape [batch_size])."""
-    head = head_lib._regression_head(weight_column='label_weights')
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45,), (41,), (44,)), dtype=np.float32)
-    expected_train_result = b'my_train_op'
-    # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
-    expected_loss = 101.6
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    x_feature_rank_1 = np.array((42., 43., 44.,), dtype=np.float32)
-    weight_rank_1 = np.array((1., .1, 1.5,), dtype=np.float64)
-    labels_rank_1 = np.array((35., 42., 45.,))
-    features = {'x': x_feature_rank_1, 'label_weights': weight_rank_1}
-    self.assertEqual((3,), x_feature_rank_1.shape)
-    self.assertEqual((3,), weight_rank_1.shape)
-    self.assertEqual((3,), labels_rank_1.shape)
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels_rank_1,
-        train_op_fn=_train_op_fn)
-
-    # Assert spec contains expected tensors.
-    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
-    self.assertEqual(dtypes.float32, spec.loss.dtype)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, train_op, and summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      predictions, loss, train_result, summary_str = sess.run((
-          spec.predictions[prediction_key], spec.loss, spec.train_op,
-          spec.scaffold.summary_op))
-      self.assertAllClose(logits, predictions)
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.0769231
-          metric_keys.MetricKeys.LOSS_MEAN: 39.0769231,
-      }, summary_str)
-
-  def test_weighted_multi_value_eval_create_loss(self):
-    """3d label, 1 example, 1 batch."""
-    head = head_lib._regression_head(
-        weight_column='label_weights', label_dimension=3)
-    logits = np.array(((45., 41., 44.),))
-    labels = np.array(((35., 42., 45.),))
-    features = {
-        'x': np.array(((42., 43., 44.),)),
-        'label_weights': np.array(((1., .1, 1.5),))
-    }
-    # Create loss.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      # loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
-      # weighted sum loss = 1 * 100 + .1 * 1 + 1.5 * 1 = 101.6
-      self.assertAllClose(101.6, training_loss.eval())
-
-  def test_weighted_multi_value_eval(self):
-    """3d label, 1 example, 1 batch."""
-    head = head_lib._regression_head(
-        weight_column='label_weights', label_dimension=3)
-    self.assertEqual(3, head.logits_dimension)
-
-    logits = np.array(((45., 41., 44.),))
-    labels = np.array(((35., 42., 45.),))
-    features = {
-        'x': np.array(((42., 43., 44.),)),
-        'label_weights': np.array(((1., .1, 1.5),))
-    }
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=logits,
-        labels=labels)
-
-    # Assert spec contains expected tensors.
-    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
-    self.assertEqual(dtypes.float32, spec.loss.dtype)
-    self.assertItemsEqual((metric_keys.MetricKeys.LOSS_MEAN,
-                           metric_keys.MetricKeys.PREDICTION_MEAN,
-                           metric_keys.MetricKeys.LABEL_MEAN),
-                          spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Assert predictions, loss, and metrics.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNone(spec.scaffold.summary_op)
-      loss_mean_value_op, loss_mean_update_op = spec.eval_metric_ops[
-          metric_keys.MetricKeys.LOSS_MEAN]
-      predictions, loss, loss_mean = sess.run((
-          spec.predictions[prediction_key], spec.loss, loss_mean_update_op))
-      self.assertAllClose(logits, predictions)
-      # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
-      self.assertAllClose(101.6, loss)
-      # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.076923
-      expected_loss_mean = 39.076923
-      # Check results of both update (in `loss_mean`) and value ops.
-      self.assertAllClose(expected_loss_mean, loss_mean)
-      self.assertAllClose(expected_loss_mean, loss_mean_value_op.eval())
-
-  def test_weighted_multi_value_train_create_loss(self):
-    """3d label, 1 example, 1 batch."""
-    head = head_lib._regression_head(
-        weight_column='label_weights', label_dimension=3)
-    logits = np.array(((45., 41., 44.),))
-    labels = np.array(((35., 42., 45.),))
-    features = {
-        'x': np.array(((42., 43., 44.),)),
-        'label_weights': np.array(((1., .1, 1.5),))
-    }
-    # Create loss.
-    training_loss = head.create_loss(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)[0]
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      # loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
-      # weighted sum loss = 1 * 100 + .1 * 1 + 1.5 * 1 = 101.6
-      self.assertAllClose(101.6, training_loss.eval())
-
-  def test_weighted_multi_value_train(self):
-    """3d label, 1 example, 1 batch."""
-    head = head_lib._regression_head(
-        weight_column='label_weights', label_dimension=3)
-    self.assertEqual(3, head.logits_dimension)
-
-    logits = np.array(((45., 41., 44.),))
-    labels = np.array(((35., 42., 45.),))
-    expected_train_result = b'my_train_op'
-    # loss = 1*(35-45)^2 + .1*(42-41)^2 + 1.5*(45-44)^2 = 100+.1+1.5 = 101.6
-    expected_loss = 101.6
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    features = {
-        'x': np.array(((42., 43., 44.),)),
-        'label_weights': np.array(((1., .1, 1.5),)),
-    }
-    # Create estimator spec.
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-
-    # Assert spec contains expected tensors.
-    prediction_key = prediction_keys.PredictionKeys.PREDICTIONS
-    self.assertItemsEqual((prediction_key,), spec.predictions.keys())
-    self.assertEqual(dtypes.float32, spec.predictions[prediction_key].dtype)
-    self.assertEqual(dtypes.float32, spec.loss.dtype)
-    self.assertEqual({}, spec.eval_metric_ops)
-    self.assertIsNotNone(spec.train_op)
-    self.assertIsNone(spec.export_outputs)
-    _assert_no_hooks(self, spec)
-
-    # Evaluate predictions, loss, train_op, and summaries.
-    with self.cached_session() as sess:
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      predictions, loss, train_result, summary_str = sess.run((
-          spec.predictions[prediction_key], spec.loss, spec.train_op,
-          spec.scaffold.summary_op))
-      self.assertAllClose(logits, predictions)
-      self.assertAllClose(expected_loss, loss)
-      self.assertEqual(expected_train_result, train_result)
-      _assert_simple_summaries(self, {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.076923
-          metric_keys.MetricKeys.LOSS_MEAN: 39.076923,
-      }, summary_str)
-
-  def test_weighted_multi_batch_eval(self):
-    """1d label, 1 example, 3 batches."""
-    head = head_lib._regression_head(weight_column='label_weights')
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45.,), (41.,), (44.,)))
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'x': np.array(((42.,), (43.,), (44.,))),
-            'label_weights': np.array(((1.,), (.1,), (1.5,))),
-            # 'logits' is not a feature, but we use `numpy_input_fn` to make a
-            # batched version of it, and pop it off before passing to
-            # `create_estimator_spec`.
-            'logits': logits,
-        },
-        y=np.array(((35.,), (42.,), (45.,))),
-        batch_size=1,
-        num_epochs=1,
-        shuffle=False)
-    batched_features, batched_labels = input_fn()
-    batched_logits = batched_features.pop('logits')
-    spec = head.create_estimator_spec(
-        features=batched_features,
-        mode=model_fn.ModeKeys.EVAL,
-        logits=batched_logits,
-        labels=batched_labels,
-        train_op_fn=None)
-
-    # losses = [1*(35-45)^2, .1*(42-41)^2, 1.5*(45-44)^2] = [100, .1, 1.5]
-    # loss = sum(losses) = 100+.1+1.5 = 101.6
-    # loss_mean = loss/(1+.1+1.5) = 101.6/2.6 = 39.076923
-    expected_metrics = {
-        metric_keys.MetricKeys.LOSS_MEAN:
-            39.076923,
-        metric_keys.MetricKeys.PREDICTION_MEAN:
-            (45 + 41 * 0.1 + 44 * 1.5) / 2.6,
-        metric_keys.MetricKeys.LABEL_MEAN: (35 + 42 * 0.1 + 45 * 1.5) / 2.6,
-    }
-
-    # Assert spec contains expected tensors.
-    self.assertEqual(dtypes.float32, spec.loss.dtype)
-    self.assertItemsEqual(expected_metrics.keys(), spec.eval_metric_ops.keys())
-    self.assertIsNone(spec.train_op)
-    _assert_no_hooks(self, spec)
-
-    with self.cached_session() as sess:
-      # Finalize graph and initialize variables.
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      queue_runner_impl.start_queue_runners()
-
-      # Run tensors for `steps` steps.
-      steps = len(logits)
-      results = tuple([
-          sess.run((
-              spec.loss,
-              # The `[1]` gives us the metric update op.
-              {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
-          )) for _ in range(steps)
-      ])
-
-      # Assert losses and metrics.
-      self.assertAllClose((100, .1, 1.5), [r[0] for r in results])
-      # For metrics, check results of both update (in `results`) and value ops.
-      # Note: we only check the result of the last step for streaming metrics.
-      self.assertAllClose(expected_metrics, results[steps - 1][1])
-      self.assertAllClose(expected_metrics, {
-          k: spec.eval_metric_ops[k][0].eval() for k in spec.eval_metric_ops
-      })
-
-  def test_weighted_multi_batch_train(self):
-    """1d label, 1 example, 3 batches."""
-    head = head_lib._regression_head(weight_column='label_weights')
-    self.assertEqual(1, head.logits_dimension)
-
-    # Create estimator spec.
-    logits = np.array(((45.,), (41.,), (44.,)))
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'x': np.array(((42.,), (43.,), (44.,))),
-            'label_weights': np.array(((1.,), (.1,), (1.5,))),
-            # 'logits' is not a feature, but we use `numpy_input_fn` to make a
-            # batched version of it, and pop it off before passing to
-            # `create_estimator_spec`.
-            'logits': logits,
-        },
-        y=np.array(((35.,), (42.,), (45.,))),
-        batch_size=1,
-        num_epochs=1,
-        shuffle=False)
-    batched_features, batched_labels = input_fn()
-    batched_logits = batched_features.pop('logits')
-    spec = head.create_estimator_spec(
-        features=batched_features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=batched_logits,
-        labels=batched_labels,
-        train_op_fn=lambda loss: loss * -7.)
-
-    # Assert spec contains expected tensors.
-    self.assertEqual(dtypes.float32, spec.loss.dtype)
-    self.assertIsNotNone(spec.train_op)
-
-    with self.cached_session() as sess:
-      # Finalize graph and initialize variables.
-      _initialize_variables(self, spec.scaffold)
-      self.assertIsNotNone(spec.scaffold.summary_op)
-      queue_runner_impl.start_queue_runners()
-
-      results = tuple([
-          sess.run((spec.loss, spec.train_op)) for _ in range(len(logits))
-      ])
-
-      # losses = [1*(35-45)^2, .1*(42-41)^2, 1.5*(45-44)^2] = [100, .1, 1.5]
-      expected_losses = np.array((100, .1, 1.5))
-      self.assertAllClose(expected_losses, [r[0] for r in results])
-      self.assertAllClose(expected_losses * -7., [r[1] for r in results])
-
-  def test_multi_dim_weighted_train_create_loss(self):
-    """Logits, labels of shape [2, 2, 3], weight shape [2, 2]."""
-    label_dimension = 3
-    head = head_lib._regression_head(
-        weight_column='label_weights', label_dimension=label_dimension)
-    logits = np.array([[[00., 01., 02.], [10., 11., 12.]],
-                       [[20., 21., 22.], [30., 31., 32.]]])
-    labels = np.array([[[01., 02., 03.], [12., 13., 14.]],
-                       [[23., 24., 25.], [34., 35., 36.]]])
-    weights = np.array([[1., 1.5], [2., 2.5]])
-    expected_unreduced_loss = [[[1., 1., 1.], [4., 4., 4.]],
-                               [[9., 9., 9.], [16., 16., 16.]]]
-    expected_training_loss = np.sum(
-        np.array([[[1. * x for x in [1., 1., 1.]],
-                   [1.5 * x for x in [4., 4., 4.]]],
-                  [[2. * x for x in [9., 9., 9.]],
-                   [2.5 * x for x in [16., 16., 16.]]]]))
-    # Weights are expanded to [2, 2, 1] to match logits.
-    expected_weights = [[[1.], [1.5]], [[2.], [2.5]]]
-    # Create loss.
-    training_loss, unreduced_loss, actual_weights, _ = head.create_loss(
-        features={'label_weights': weights},
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_training_loss, training_loss.eval())
-      self.assertAllClose(expected_unreduced_loss, unreduced_loss.eval())
-      self.assertAllClose(expected_weights, actual_weights.eval())
-
-  def test_multi_dim_weighted_train(self):
-    """Logits, labels of shape [2, 2, 3], weight shape [2, 2]."""
-    head = head_lib._regression_head(
-        weight_column='label_weights', label_dimension=3)
-    logits = np.array([[[00., 01., 02.], [10., 11., 12.]],
-                       [[20., 21., 22.], [30., 31., 32.]]])
-    labels = np.array([[[01., 02., 03.], [12., 13., 14.]],
-                       [[23., 24., 25.], [34., 35., 36.]]])
-    expected_train_result = b'my_train_op'
-    features = {
-        'label_weights': np.array([[1., 1.5], [2., 2.5]]),
-    }
-    # loss = 1*3*1^2 + 1.5*3*2^2 + 2*3*3^2 +2.5*3*4^2 = 195
-    expected_loss = 195.
-    # Create estimator spec.
-    def _train_op_fn(loss):
-      with ops.control_dependencies((check_ops.assert_equal(
-          math_ops.to_float(expected_loss), math_ops.to_float(loss),
-          name='assert_loss'),)):
-        return constant_op.constant(expected_train_result)
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_train_op_fn)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_loss, spec.loss.eval())
-
-  def test_multi_dim_train_weights_wrong_inner_dim(self):
-    """Logits, labels of shape [2, 2, 3], weight shape [2, 1]."""
-    head = head_lib._regression_head(
-        weight_column='label_weights', label_dimension=3)
-    logits = np.array([[[00., 01., 02.], [10., 11., 12.]],
-                       [[20., 21., 22.], [30., 31., 32.]]])
-    labels = np.array([[[01., 02., 03.], [12., 13., 14.]],
-                       [[23., 24., 25.], [34., 35., 36.]]])
-    features = {
-        'label_weights': np.array([[1.], [2]]),
-    }
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_no_op_train_fn)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 1\]'):
-        spec.loss.eval()
-
-  def test_multi_dim_train_weights_wrong_outer_dim(self):
-    """Logits, labels of shape [2, 2, 3], weight shape [2, 2, 2]."""
-    head = head_lib._regression_head(
-        weight_column='label_weights', label_dimension=3)
-    logits = np.array([[[00., 01., 02.], [10., 11., 12.]],
-                       [[20., 21., 22.], [30., 31., 32.]]])
-    labels = np.array([[[01., 02., 03.], [12., 13., 14.]],
-                       [[23., 24., 25.], [34., 35., 36.]]])
-    weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    features = {
-        'label_weights': weights_placeholder,
-    }
-    def _no_op_train_fn(loss):
-      del loss
-      return control_flow_ops.no_op()
-
-    spec = head.create_estimator_spec(
-        features=features,
-        mode=model_fn.ModeKeys.TRAIN,
-        logits=logits,
-        labels=labels,
-        train_op_fn=_no_op_train_fn)
-    with self.cached_session():
-      _initialize_variables(self, monitored_session.Scaffold())
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[logits_shape: \]\s\[2 2 3\]\s\[weights_shape: \]\s\[2 2 2\]'):
-        spec.loss.eval({
-            weights_placeholder: np.array([[[1., 1.1], [1.5, 1.6]],
-                                           [[2., 2.1], [2.5, 2.6]]])})
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 8b96284bd3a5ae42e8990acee779a232e36bb78f..e8a62c380afd893f711ff4c2568637c9fff94737 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,532 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Linear Estimators."""
+"""linear python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
-import six
-
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator.canned import head as head_lib
-from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.feature_column import feature_column_v2
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variable_ops
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.summary import summary
-from tensorflow.python.training import ftrl
-from tensorflow.python.util import nest
-from tensorflow.python.util.tf_export import estimator_export
-
-
-# The default learning rate of 0.2 is a historical artifact of the initial
-# implementation, but seems a reasonable choice.
-_LEARNING_RATE = 0.2
-
-
-def _get_default_optimizer(feature_columns):
-  learning_rate = min(_LEARNING_RATE, 1.0 / math.sqrt(len(feature_columns)))
-  return ftrl.FtrlOptimizer(learning_rate=learning_rate)
-
-
-def _get_expanded_variable_list(var_list):
-  """Given a list of variables, expands them if they are partitioned.
-
-  Args:
-    var_list: A list of variables.
-
-  Returns:
-    A list of variables where each partitioned variable is expanded to its
-    components.
-  """
-  returned_list = []
-  for variable in var_list:
-    if (isinstance(variable, variable_ops.Variable) or
-        resource_variable_ops.is_resource_variable(variable)):
-      returned_list.append(variable)  # Single variable case.
-    else:  # Must be a PartitionedVariable, so convert into a list.
-      returned_list.extend(list(variable))
-  return returned_list
-
-
-# TODO(rohanj): Consider making this a public utility method.
-def _compute_fraction_of_zero(variables):
-  """Given a linear variables list, compute the fraction of zero weights.
-
-  Args:
-    variables: A list or list of list of variables
-
-  Returns:
-    The fraction of zeros (sparsity) in the linear model.
-  """
-  all_weight_vars = []
-  for var_or_var_list in variables:
-    var_list = nest.flatten(var_or_var_list)
-    # Skip empty-lists associated with columns that created no Variables.
-    if var_list:
-      all_weight_vars += [array_ops.reshape(var, [-1]) for var in var_list]
-  return nn.zero_fraction(array_ops.concat(all_weight_vars, axis=0))
-
-
-def _linear_logit_fn_builder(units, feature_columns, sparse_combiner='sum'):
-  """Function builder for a linear logit_fn.
-
-  Args:
-    units: An int indicating the dimension of the logit layer.
-    feature_columns: An iterable containing all the feature columns used by
-      the model.
-    sparse_combiner: A string specifying how to reduce if a categorical column
-      is multivalent.  One of "mean", "sqrtn", and "sum".
-
-  Returns:
-    A logit_fn (see below).
-
-  """
-
-  def linear_logit_fn(features):
-    """Linear model logit_fn.
-
-    Args:
-      features: This is the first item returned from the `input_fn`
-                passed to `train`, `evaluate`, and `predict`. This should be a
-                single `Tensor` or `dict` of same.
-
-    Returns:
-      A `Tensor` representing the logits.
-    """
-    if feature_column_v2.is_feature_column_v2(feature_columns):
-      shared_state_manager = feature_column_v2.SharedEmbeddingStateManager()
-      linear_model = feature_column_v2.LinearModel(
-          feature_columns=feature_columns,
-          units=units,
-          sparse_combiner=sparse_combiner,
-          shared_state_manager=shared_state_manager)
-      logits = linear_model(features)
-      bias = linear_model.bias_variable
-
-      # We'd like to get all the non-bias variables associated with this
-      # LinearModel. This includes the shared embedding variables as well.
-      variables = linear_model.variables
-      variables.remove(bias)
-      variables.extend(shared_state_manager.variables)
-
-      # Expand (potential) Partitioned variables
-      bias = _get_expanded_variable_list([bias])
-      variables = _get_expanded_variable_list(variables)
-    else:
-      linear_model = feature_column._LinearModel(  # pylint: disable=protected-access
-          feature_columns=feature_columns,
-          units=units,
-          sparse_combiner=sparse_combiner,
-          name='linear_model')
-      logits = linear_model(features)
-      cols_to_vars = linear_model.cols_to_vars()
-      bias = cols_to_vars.pop('bias')
-      variables = cols_to_vars.values()
-
-    if units > 1:
-      summary.histogram('bias', bias)
-    else:
-      # If units == 1, the bias value is a length-1 list of a scalar Tensor,
-      # so we should provide a scalar summary.
-      summary.scalar('bias', bias[0][0])
-    summary.scalar('fraction_of_zero_weights',
-                   _compute_fraction_of_zero(variables))
-    return logits
-
-  return linear_logit_fn
-
-
-def _linear_model_fn(features, labels, mode, head, feature_columns, optimizer,
-                     partitioner, config, sparse_combiner='sum'):
-  """A model_fn for linear models that use a gradient-based optimizer.
-
-  Args:
-    features: dict of `Tensor`.
-    labels: `Tensor` of shape `[batch_size, logits_dimension]`.
-    mode: Defines whether this is training, evaluation or prediction.
-      See `ModeKeys`.
-    head: A `Head` instance.
-    feature_columns: An iterable containing all the feature columns used by
-      the model.
-    optimizer: string, `Optimizer` object, or callable that defines the
-      optimizer to use for training. If `None`, will use a FTRL optimizer.
-    partitioner: Partitioner for variables.
-    config: `RunConfig` object to configure the runtime settings.
-    sparse_combiner: A string specifying how to reduce if a categorical column
-      is multivalent.  One of "mean", "sqrtn", and "sum".
-
-  Returns:
-    An `EstimatorSpec` instance.
-
-  Raises:
-    ValueError: mode or params are invalid, or features has the wrong type.
-  """
-  if not isinstance(features, dict):
-    raise ValueError('features should be a dictionary of `Tensor`s. '
-                     'Given type: {}'.format(type(features)))
-
-  optimizer = optimizers.get_optimizer_instance(
-      optimizer or _get_default_optimizer(feature_columns),
-      learning_rate=_LEARNING_RATE)
-  num_ps_replicas = config.num_ps_replicas if config else 0
-
-  partitioner = partitioner or (
-      partitioned_variables.min_max_variable_partitioner(
-          max_partitions=num_ps_replicas,
-          min_slice_size=64 << 20))
-
-  with variable_scope.variable_scope(
-      'linear',
-      values=tuple(six.itervalues(features)),
-      partitioner=partitioner):
-
-    logit_fn = _linear_logit_fn_builder(
-        units=head.logits_dimension, feature_columns=feature_columns,
-        sparse_combiner=sparse_combiner)
-    logits = logit_fn(features=features)
-
-    return head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        optimizer=optimizer,
-        logits=logits)
-
-
-@estimator_export('estimator.LinearClassifier')
-class LinearClassifier(estimator.Estimator):
-  """Linear classifier model.
-
-  Train a linear model to classify instances into one of multiple possible
-  classes. When number of possible classes is 2, this is binary classification.
-
-  Example:
-
-  ```python
-  categorical_column_a = categorical_column_with_hash_bucket(...)
-  categorical_column_b = categorical_column_with_hash_bucket(...)
-
-  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
-
-  # Estimator using the default optimizer.
-  estimator = LinearClassifier(
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b])
-
-  # Or estimator using the FTRL optimizer with regularization.
-  estimator = LinearClassifier(
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b],
-      optimizer=tf.train.FtrlOptimizer(
-        learning_rate=0.1,
-        l1_regularization_strength=0.001
-      ))
-
-  # Or estimator using an optimizer with a learning rate decay.
-  estimator = LinearClassifier(
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b],
-      optimizer=lambda: tf.train.FtrlOptimizer(
-          learning_rate=tf.exponential_decay(
-              learning_rate=0.1,
-              global_step=tf.get_global_step(),
-              decay_steps=10000,
-              decay_rate=0.96))
-
-  # Or estimator with warm-starting from a previous checkpoint.
-  estimator = LinearClassifier(
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b],
-      warm_start_from="/path/to/checkpoint/dir")
-
-
-  # Input builders
-  def input_fn_train: # returns x, y (where y represents label's class index).
-    ...
-  def input_fn_eval: # returns x, y (where y represents label's class index).
-    ...
-  estimator.train(input_fn=input_fn_train)
-  estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-    otherwise there will be a `KeyError`:
-
-  * if `weight_column` is not `None`, a feature with
-    `key=weight_column` whose value is a `Tensor`.
-  * for each `column` in `feature_columns`:
-    - if `column` is a `SparseColumn`, a feature with `key=column.name`
-      whose `value` is a `SparseTensor`.
-    - if `column` is a `WeightedSparseColumn`, two features: the first with
-      `key` the id column name, the second with `key` the weight column name.
-      Both features' `value` must be a `SparseTensor`.
-    - if `column` is a `RealValuedColumn`, a feature with `key=column.name`
-      whose `value` is a `Tensor`.
-
-  Loss is calculated by using softmax cross entropy.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               feature_columns,
-               model_dir=None,
-               n_classes=2,
-               weight_column=None,
-               label_vocabulary=None,
-               optimizer='Ftrl',
-               config=None,
-               partitioner=None,
-               warm_start_from=None,
-               loss_reduction=losses.Reduction.SUM,
-               sparse_combiner='sum'):
-    """Construct a `LinearClassifier` estimator object.
-
-    Args:
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
-        to continue training a previously saved model.
-      n_classes: number of label classes. Default is binary classification.
-        Note that class labels are integers representing the class index (i.e.
-        values from 0 to n_classes-1). For arbitrary label values (e.g. string
-        labels), convert to class indices first.
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to down weight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      label_vocabulary: A list of strings represents possible label values. If
-        given, labels must be string type and have any value in
-        `label_vocabulary`. If it is not given, that means labels are
-        already encoded as integer or float within [0, 1] for `n_classes=2` and
-        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
-        Also there will be errors if vocabulary is not provided and labels are
-        string.
-      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
-        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
-        callable. Defaults to FTRL optimizer.
-      config: `RunConfig` object to configure the runtime settings.
-      partitioner: Optional. Partitioner for input layer.
-      warm_start_from: A string filepath to a checkpoint to warm-start from, or
-        a `WarmStartSettings` object to fully configure warm-starting.  If the
-        string filepath is provided instead of a `WarmStartSettings`, then all
-        weights and biases are warm-started, and it is assumed that vocabularies
-        and Tensor names are unchanged.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM`.
-      sparse_combiner: A string specifying how to reduce if a categorical column
-        is multivalent.  One of "mean", "sqrtn", and "sum" -- these are
-        effectively different ways to do example-level normalization, which can
-        be useful for bag-of-words features. for more details, see
-        `tf.feature_column.linear_model`.
-
-    Returns:
-      A `LinearClassifier` estimator.
-
-    Raises:
-      ValueError: if n_classes < 2.
-    """
-    if n_classes == 2:
-      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
-          weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-    else:
-      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
-          n_classes, weight_column=weight_column,
-          label_vocabulary=label_vocabulary,
-          loss_reduction=loss_reduction)
-
-    def _model_fn(features, labels, mode, config):
-      """Call the defined shared _linear_model_fn."""
-      return _linear_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          feature_columns=tuple(feature_columns or []),
-          optimizer=optimizer,
-          partitioner=partitioner,
-          config=config,
-          sparse_combiner=sparse_combiner)
-
-    super(LinearClassifier, self).__init__(
-        model_fn=_model_fn,
-        model_dir=model_dir,
-        config=config,
-        warm_start_from=warm_start_from)
-
-
-@estimator_export('estimator.LinearRegressor')
-class LinearRegressor(estimator.Estimator):
-  """An estimator for TensorFlow Linear regression problems.
-
-  Train a linear regression model to predict label value given observation of
-  feature values.
-
-  Example:
-
-  ```python
-  categorical_column_a = categorical_column_with_hash_bucket(...)
-  categorical_column_b = categorical_column_with_hash_bucket(...)
-
-  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
-
-  # Estimator using the default optimizer.
-  estimator = LinearRegressor(
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b])
-
-  # Or estimator using the FTRL optimizer with regularization.
-  estimator = LinearRegressor(
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b],
-      optimizer=tf.train.FtrlOptimizer(
-        learning_rate=0.1,
-        l1_regularization_strength=0.001
-      ))
-
-  # Or estimator using an optimizer with a learning rate decay.
-  estimator = LinearRegressor(
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b],
-      optimizer=lambda: tf.train.FtrlOptimizer(
-          learning_rate=tf.exponential_decay(
-              learning_rate=0.1,
-              global_step=tf.get_global_step(),
-              decay_steps=10000,
-              decay_rate=0.96))
-
-  # Or estimator with warm-starting from a previous checkpoint.
-  estimator = LinearRegressor(
-      feature_columns=[categorical_column_a,
-                       categorical_feature_a_x_categorical_feature_b],
-      warm_start_from="/path/to/checkpoint/dir")
-
-
-  # Input builders
-  def input_fn_train: # returns x, y
-    ...
-  def input_fn_eval: # returns x, y
-    ...
-  estimator.train(input_fn=input_fn_train)
-  estimator.evaluate(input_fn=input_fn_eval)
-  estimator.predict(input_fn=input_fn_predict)
-  ```
-
-  Input of `train` and `evaluate` should have following features,
-    otherwise there will be a KeyError:
-
-  * if `weight_column` is not `None`:
-    key=weight_column, value=a `Tensor`
-  * for column in `feature_columns`:
-    - if isinstance(column, `SparseColumn`):
-        key=column.name, value=a `SparseTensor`
-    - if isinstance(column, `WeightedSparseColumn`):
-        {key=id column name, value=a `SparseTensor`,
-         key=weight column name, value=a `SparseTensor`}
-    - if isinstance(column, `RealValuedColumn`):
-        key=column.name, value=a `Tensor`
-
-  Loss is calculated by using mean squared error.
-
-  @compatibility(eager)
-  Estimators can be used while eager execution is enabled. Note that `input_fn`
-  and all hooks are executed inside a graph context, so they have to be written
-  to be compatible with graph mode. Note that `input_fn` code using `tf.data`
-  generally works in both graph and eager modes.
-  @end_compatibility
-  """
-
-  def __init__(self,
-               feature_columns,
-               model_dir=None,
-               label_dimension=1,
-               weight_column=None,
-               optimizer='Ftrl',
-               config=None,
-               partitioner=None,
-               warm_start_from=None,
-               loss_reduction=losses.Reduction.SUM,
-               sparse_combiner='sum'):
-    """Initializes a `LinearRegressor` instance.
-
-    Args:
-      feature_columns: An iterable containing all the feature columns used by
-        the model. All items in the set should be instances of classes derived
-        from `FeatureColumn`.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator
-        to continue training a previously saved model.
-      label_dimension: Number of regression targets per example. This is the
-        size of the last dimension of the labels and logits `Tensor` objects
-        (typically, these have shape `[batch_size, label_dimension]`).
-      weight_column: A string or a `_NumericColumn` created by
-        `tf.feature_column.numeric_column` defining feature column representing
-        weights. It is used to down weight or boost examples during training. It
-        will be multiplied by the loss of the example. If it is a string, it is
-        used as a key to fetch weight tensor from the `features`. If it is a
-        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-        then weight_column.normalizer_fn is applied on it to get weight tensor.
-      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
-        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
-        callable. Defaults to FTRL optimizer.
-      config: `RunConfig` object to configure the runtime settings.
-      partitioner: Optional. Partitioner for input layer.
-      warm_start_from: A string filepath to a checkpoint to warm-start from, or
-        a `WarmStartSettings` object to fully configure warm-starting.  If the
-        string filepath is provided instead of a `WarmStartSettings`, then all
-        weights and biases are warm-started, and it is assumed that vocabularies
-        and Tensor names are unchanged.
-      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
-        to reduce training loss over batch. Defaults to `SUM`.
-      sparse_combiner: A string specifying how to reduce if a categorical column
-        is multivalent.  One of "mean", "sqrtn", and "sum" -- these are
-        effectively different ways to do example-level normalization, which can
-        be useful for bag-of-words features. for more details, see
-        `tf.feature_column.linear_model`.
-    """
-    head = head_lib._regression_head(  # pylint: disable=protected-access
-        label_dimension=label_dimension, weight_column=weight_column,
-        loss_reduction=loss_reduction)
+from tensorflow_estimator.python.estimator.canned import linear
 
-    def _model_fn(features, labels, mode, config):
-      """Call the defined shared _linear_model_fn."""
-      return _linear_model_fn(
-          features=features,
-          labels=labels,
-          mode=mode,
-          head=head,
-          feature_columns=tuple(feature_columns or []),
-          optimizer=optimizer,
-          partitioner=partitioner,
-          config=config,
-          sparse_combiner=sparse_combiner)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+linear.__all__ = [s for s in dir(linear) if not s.startswith('__')]
 
-    super(LinearRegressor, self).__init__(
-        model_fn=_model_fn,
-        model_dir=model_dir,
-        config=config,
-        warm_start_from=warm_start_from)
+from tensorflow_estimator.python.estimator.canned.linear import *
diff --git a/tensorflow/python/estimator/canned/linear_test.py b/tensorflow/python/estimator/canned/linear_test.py
deleted file mode 100644
index 3e6da5de225846790d9170ffcdde08e5ee2ad47f..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/canned/linear_test.py
+++ /dev/null
@@ -1,255 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for linear.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.estimator.canned import linear
-from tensorflow.python.estimator.canned import linear_testing_utils
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.feature_column import feature_column_v2
-from tensorflow.python.platform import test
-
-
-def _linear_regressor_fn(*args, **kwargs):
-  return linear.LinearRegressor(*args, **kwargs)
-
-
-def _linear_classifier_fn(*args, **kwargs):
-  return linear.LinearClassifier(*args, **kwargs)
-
-
-# Tests for Linear Regressor.
-
-
-class LinearRegressorPartitionerTest(
-    linear_testing_utils.BaseLinearRegressorPartitionerTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPartitionerTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearRegressorPartitionerV2Test(
-    linear_testing_utils.BaseLinearRegressorPartitionerTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPartitionerTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-class LinearRegressorEvaluationTest(
-    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearRegressorEvaluationV2Test(
-    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-class LinearRegressorPredictTest(
-    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearRegressorPredictV2Test(
-    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-class LinearRegressorIntegrationTest(
-    linear_testing_utils.BaseLinearRegressorIntegrationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorIntegrationTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearRegressorIntegrationV2Test(
-    linear_testing_utils.BaseLinearRegressorIntegrationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorIntegrationTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-class LinearRegressorTrainingTest(
-    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column)
-
-
-class LinearRegressorTrainingV2Test(
-    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
-        self, _linear_regressor_fn, fc_lib=feature_column_v2)
-
-
-# Tests for Linear Classifier.
-class LinearClassifierTrainingTest(
-    linear_testing_utils.BaseLinearClassifierTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierTrainingTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
-
-
-class LinearClassifierTrainingV2Test(
-    linear_testing_utils.BaseLinearClassifierTrainingTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierTrainingTest.__init__(
-        self,
-        linear_classifier_fn=_linear_classifier_fn,
-        fc_lib=feature_column_v2)
-
-
-class LinearClassifierEvaluationTest(
-    linear_testing_utils.BaseLinearClassifierEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierEvaluationTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
-
-
-class LinearClassifierEvaluationV2Test(
-    linear_testing_utils.BaseLinearClassifierEvaluationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierEvaluationTest.__init__(
-        self,
-        linear_classifier_fn=_linear_classifier_fn,
-        fc_lib=feature_column_v2)
-
-
-class LinearClassifierPredictTest(
-    linear_testing_utils.BaseLinearClassifierPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierPredictTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
-
-
-class LinearClassifierPredictV2Test(
-    linear_testing_utils.BaseLinearClassifierPredictTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierPredictTest.__init__(
-        self,
-        linear_classifier_fn=_linear_classifier_fn,
-        fc_lib=feature_column_v2)
-
-
-class LinearClassifierIntegrationTest(
-    linear_testing_utils.BaseLinearClassifierIntegrationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierIntegrationTest.__init__(
-        self, linear_classifier_fn=_linear_classifier_fn, fc_lib=feature_column)
-
-
-class LinearClassifierIntegrationV2Test(
-    linear_testing_utils.BaseLinearClassifierIntegrationTest, test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearClassifierIntegrationTest.__init__(
-        self,
-        linear_classifier_fn=_linear_classifier_fn,
-        fc_lib=feature_column_v2)
-
-
-# Tests for Linear logit_fn.
-class LinearLogitFnTest(linear_testing_utils.BaseLinearLogitFnTest,
-                        test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearLogitFnTest.__init__(
-        self, fc_lib=feature_column)
-
-
-class LinearLogitFnV2Test(linear_testing_utils.BaseLinearLogitFnTest,
-                          test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearLogitFnTest.__init__(
-        self, fc_lib=feature_column_v2)
-
-
-# Tests for warm-starting with Linear logit_fn.
-class LinearWarmStartingTest(linear_testing_utils.BaseLinearWarmStartingTest,
-                             test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearWarmStartingTest.__init__(
-        self,
-        _linear_classifier_fn,
-        _linear_regressor_fn,
-        fc_lib=feature_column)
-
-
-class LinearWarmStartingV2Test(linear_testing_utils.BaseLinearWarmStartingTest,
-                               test.TestCase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    linear_testing_utils.BaseLinearWarmStartingTest.__init__(
-        self,
-        _linear_classifier_fn,
-        _linear_regressor_fn,
-        fc_lib=feature_column_v2)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index 827352a70bb74e96465d555533fb9f3685b5258b..a9b03e921e6e534f12b0daaa944a1098ea118431 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,2280 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utils for testing linear estimators."""
+"""linear_testing_utils python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-import os
-import shutil
-import tempfile
-
-import numpy as np
-import six
-
-from tensorflow.core.example import example_pb2
-from tensorflow.core.example import feature_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import run_config
-from tensorflow.python.estimator.canned import linear
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.estimator.inputs import pandas_io
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.feature_column import feature_column_v2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import gradient_descent
-from tensorflow.python.training import input as input_lib
-from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.training import queue_runner
-from tensorflow.python.training import saver
-from tensorflow.python.training import session_run_hook
-
-try:
-  # pylint: disable=g-import-not-at-top
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
-
-# pylint rules which are disabled by default for test files.
-# pylint: disable=invalid-name,protected-access,missing-docstring
-
-# Names of variables created by model.
-AGE_WEIGHT_NAME = 'linear/linear_model/age/weights'
-HEIGHT_WEIGHT_NAME = 'linear/linear_model/height/weights'
-OCCUPATION_WEIGHT_NAME = 'linear/linear_model/occupation/weights'
-BIAS_NAME = 'linear/linear_model/bias_weights'
-LANGUAGE_WEIGHT_NAME = 'linear/linear_model/language/weights'
-
-
-def assert_close(expected, actual, rtol=1e-04, name='assert_close'):
-  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
-    expected = ops.convert_to_tensor(expected, name='expected')
-    actual = ops.convert_to_tensor(actual, name='actual')
-    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
-    rtol = ops.convert_to_tensor(rtol, name='rtol')
-    return check_ops.assert_less(
-        rdiff,
-        rtol,
-        data=('Condition expected =~ actual did not hold element-wise:'
-              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
-              'rtol = ', rtol,),
-        name=scope)
-
-
-def save_variables_to_ckpt(model_dir):
-  init_all_op = [variables_lib.global_variables_initializer()]
-  with tf_session.Session() as sess:
-    sess.run(init_all_op)
-    saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
-
-
-def queue_parsed_features(feature_map):
-  tensors_to_enqueue = []
-  keys = []
-  for key, tensor in six.iteritems(feature_map):
-    keys.append(key)
-    tensors_to_enqueue.append(tensor)
-  queue_dtypes = [x.dtype for x in tensors_to_enqueue]
-  input_queue = data_flow_ops.FIFOQueue(capacity=100, dtypes=queue_dtypes)
-  queue_runner.add_queue_runner(
-      queue_runner.QueueRunner(input_queue,
-                               [input_queue.enqueue(tensors_to_enqueue)]))
-  dequeued_tensors = input_queue.dequeue()
-  return {keys[i]: dequeued_tensors[i] for i in range(len(dequeued_tensors))}
-
-
-def sorted_key_dict(unsorted_dict):
-  return {k: unsorted_dict[k] for k in sorted(unsorted_dict)}
-
-
-def sigmoid(x):
-  return 1 / (1 + np.exp(-1.0 * x))
-
-
-class CheckPartitionerVarHook(session_run_hook.SessionRunHook):
-  """A `SessionRunHook` to check a partitioned variable."""
-
-  def __init__(self, test_case, var_name, var_dim, partitions):
-    self._test_case = test_case
-    self._var_name = var_name
-    self._var_dim = var_dim
-    self._partitions = partitions
-
-  def begin(self):
-    with variable_scope.variable_scope(
-        variable_scope.get_variable_scope()) as scope:
-      scope.reuse_variables()
-      partitioned_weight = variable_scope.get_variable(
-          self._var_name, shape=(self._var_dim, 1))
-      self._test_case.assertTrue(
-          isinstance(partitioned_weight, variables_lib.PartitionedVariable))
-      for part in partitioned_weight:
-        self._test_case.assertEqual(self._var_dim // self._partitions,
-                                    part.get_shape()[0])
-
-
-class BaseLinearRegressorPartitionerTest(object):
-
-  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
-    self._linear_regressor_fn = linear_regressor_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def testPartitioner(self):
-    x_dim = 64
-    partitions = 4
-
-    def _partitioner(shape, dtype):
-      del dtype  # unused; required by Fn signature.
-      # Only partition the embedding tensor.
-      return [partitions, 1] if shape[0] == x_dim else [1]
-
-    regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.categorical_column_with_hash_bucket(
-            'language', hash_bucket_size=x_dim),),
-        partitioner=_partitioner,
-        model_dir=self._model_dir)
-
-    def _input_fn():
-      return {
-          'language':
-              sparse_tensor.SparseTensor(
-                  values=['english', 'spanish'],
-                  indices=[[0, 0], [0, 1]],
-                  dense_shape=[1, 2])
-      }, [[10.]]
-
-    hook = CheckPartitionerVarHook(self, LANGUAGE_WEIGHT_NAME, x_dim,
-                                   partitions)
-    regressor.train(input_fn=_input_fn, steps=1, hooks=[hook])
-
-  def testDefaultPartitionerWithMultiplePsReplicas(self):
-    partitions = 2
-    # This results in weights larger than the default partition size of 64M,
-    # so partitioned weights are created (each weight uses 4 bytes).
-    x_dim = 32 << 20
-
-    class FakeRunConfig(run_config.RunConfig):
-
-      @property
-      def num_ps_replicas(self):
-        return partitions
-
-    # Mock the device setter as ps is not available on test machines.
-    with test.mock.patch.object(
-        estimator,
-        '_get_replica_device_setter',
-        return_value=lambda _: '/cpu:0'):
-      linear_regressor = self._linear_regressor_fn(
-          feature_columns=(self._fc_lib.categorical_column_with_hash_bucket(
-              'language', hash_bucket_size=x_dim),),
-          config=FakeRunConfig(),
-          model_dir=self._model_dir)
-
-      def _input_fn():
-        return {
-            'language':
-                sparse_tensor.SparseTensor(
-                    values=['english', 'spanish'],
-                    indices=[[0, 0], [0, 1]],
-                    dense_shape=[1, 2])
-        }, [[10.]]
-
-      hook = CheckPartitionerVarHook(self, LANGUAGE_WEIGHT_NAME, x_dim,
-                                     partitions)
-      linear_regressor.train(input_fn=_input_fn, steps=1, hooks=[hook])
-
-
-# TODO(b/36813849): Add tests with dynamic shape inputs using placeholders.
-class BaseLinearRegressorEvaluationTest(object):
-
-  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
-    self._linear_regressor_fn = linear_regressor_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_evaluation_for_simple_data(self):
-    with ops.Graph().as_default():
-      variables_lib.Variable([[11.0]], name=AGE_WEIGHT_NAME)
-      variables_lib.Variable([2.0], name=BIAS_NAME)
-      variables_lib.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        model_dir=self._model_dir)
-    eval_metrics = linear_regressor.evaluate(
-        input_fn=lambda: ({'age': ((1,),)}, ((10.,),)), steps=1)
-
-    # Logit is (1. * 11.0 + 2.0) = 13, while label is 10. Loss is 3**2 = 9.
-    self.assertDictEqual({
-        metric_keys.MetricKeys.LOSS: 9.,
-        metric_keys.MetricKeys.LOSS_MEAN: 9.,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 13.,
-        metric_keys.MetricKeys.LABEL_MEAN: 10.,
-        ops.GraphKeys.GLOBAL_STEP: 100
-    }, eval_metrics)
-
-  def test_evaluation_batch(self):
-    """Tests evaluation for batch_size==2."""
-    with ops.Graph().as_default():
-      variables_lib.Variable([[11.0]], name=AGE_WEIGHT_NAME)
-      variables_lib.Variable([2.0], name=BIAS_NAME)
-      variables_lib.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        model_dir=self._model_dir)
-    eval_metrics = linear_regressor.evaluate(
-        input_fn=lambda: ({'age': ((1,), (1,))}, ((10.,), (10.,))), steps=1)
-
-    # Logit is (1. * 11.0 + 2.0) = 13, while label is 10.
-    # Loss per example is 3**2 = 9.
-    # Training loss is the sum over batch = 9 + 9 = 18
-    # Average loss is the average over batch = 9
-    self.assertDictEqual({
-        metric_keys.MetricKeys.LOSS: 18.,
-        metric_keys.MetricKeys.LOSS_MEAN: 9.,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 13.,
-        metric_keys.MetricKeys.LABEL_MEAN: 10.,
-        ops.GraphKeys.GLOBAL_STEP: 100
-    }, eval_metrics)
-
-  def test_evaluation_weights(self):
-    """Tests evaluation with weights."""
-    with ops.Graph().as_default():
-      variables_lib.Variable([[11.0]], name=AGE_WEIGHT_NAME)
-      variables_lib.Variable([2.0], name=BIAS_NAME)
-      variables_lib.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    def _input_fn():
-      features = {'age': ((1,), (1,)), 'weights': ((1.,), (2.,))}
-      labels = ((10.,), (10.,))
-      return features, labels
-
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        weight_column='weights',
-        model_dir=self._model_dir)
-    eval_metrics = linear_regressor.evaluate(input_fn=_input_fn, steps=1)
-
-    # Logit is (1. * 11.0 + 2.0) = 13, while label is 10.
-    # Loss per example is 3**2 = 9.
-    # Training loss is the weighted sum over batch = 9 + 2*9 = 27
-    # average loss is the weighted average = 9 + 2*9 / (1 + 2) = 9
-    self.assertDictEqual({
-        metric_keys.MetricKeys.LOSS: 27.,
-        metric_keys.MetricKeys.LOSS_MEAN: 9.,
-        metric_keys.MetricKeys.PREDICTION_MEAN: 13.,
-        metric_keys.MetricKeys.LABEL_MEAN: 10.,
-        ops.GraphKeys.GLOBAL_STEP: 100
-    }, eval_metrics)
-
-  def test_evaluation_for_multi_dimensions(self):
-    x_dim = 3
-    label_dim = 2
-    with ops.Graph().as_default():
-      variables_lib.Variable(
-          [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], name=AGE_WEIGHT_NAME)
-      variables_lib.Variable([7.0, 8.0], name=BIAS_NAME)
-      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('age', shape=(x_dim,)),),
-        label_dimension=label_dim,
-        model_dir=self._model_dir)
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'age': np.array([[2., 4., 5.]]),
-        },
-        y=np.array([[46., 58.]]),
-        batch_size=1,
-        num_epochs=None,
-        shuffle=False)
-    eval_metrics = linear_regressor.evaluate(input_fn=input_fn, steps=1)
-
-    self.assertItemsEqual(
-        (metric_keys.MetricKeys.LOSS, metric_keys.MetricKeys.LOSS_MEAN,
-         metric_keys.MetricKeys.PREDICTION_MEAN,
-         metric_keys.MetricKeys.LABEL_MEAN, ops.GraphKeys.GLOBAL_STEP),
-        eval_metrics.keys())
-
-    # Logit is
-    #   [2., 4., 5.] * [1.0, 2.0] + [7.0, 8.0] = [39, 50] + [7.0, 8.0]
-    #                  [3.0, 4.0]
-    #                  [5.0, 6.0]
-    # which is [46, 58]
-    self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS])
-
-  def test_evaluation_for_multiple_feature_columns(self):
-    with ops.Graph().as_default():
-      variables_lib.Variable([[10.0]], name=AGE_WEIGHT_NAME)
-      variables_lib.Variable([[2.0]], name=HEIGHT_WEIGHT_NAME)
-      variables_lib.Variable([5.0], name=BIAS_NAME)
-      variables_lib.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    batch_size = 2
-    feature_columns = [
-        self._fc_lib.numeric_column('age'),
-        self._fc_lib.numeric_column('height')
-    ]
-    input_fn = numpy_io.numpy_input_fn(
-        x={'age': np.array([20, 40]),
-           'height': np.array([4, 8])},
-        y=np.array([[213.], [421.]]),
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=False)
-
-    est = self._linear_regressor_fn(
-        feature_columns=feature_columns, model_dir=self._model_dir)
-
-    eval_metrics = est.evaluate(input_fn=input_fn, steps=1)
-    self.assertItemsEqual(
-        (metric_keys.MetricKeys.LOSS, metric_keys.MetricKeys.LOSS_MEAN,
-         metric_keys.MetricKeys.PREDICTION_MEAN,
-         metric_keys.MetricKeys.LABEL_MEAN, ops.GraphKeys.GLOBAL_STEP),
-        eval_metrics.keys())
-
-    # Logit is [(20. * 10.0 + 4 * 2.0 + 5.0), (40. * 10.0 + 8 * 2.0 + 5.0)] =
-    # [213.0, 421.0], while label is [213., 421.]. Loss = 0.
-    self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS])
-
-
-class BaseLinearRegressorPredictTest(object):
-
-  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
-    self._linear_regressor_fn = linear_regressor_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def test_1d(self):
-    """Tests predict when all variables are one-dimensional."""
-    with ops.Graph().as_default():
-      variables_lib.Variable([[10.]], name='linear/linear_model/x/weights')
-      variables_lib.Variable([.2], name=BIAS_NAME)
-      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('x'),),
-        model_dir=self._model_dir)
-
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': np.array([[2.]])},
-        y=None,
-        batch_size=1,
-        num_epochs=1,
-        shuffle=False)
-    predictions = linear_regressor.predict(input_fn=predict_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    # x * weight + bias = 2. * 10. + .2 = 20.2
-    self.assertAllClose([[20.2]], predicted_scores)
-
-  def testMultiDim(self):
-    """Tests predict when all variables are multi-dimenstional."""
-    batch_size = 2
-    label_dimension = 3
-    x_dim = 4
-    feature_columns = (self._fc_lib.numeric_column('x', shape=(x_dim,)),)
-    with ops.Graph().as_default():
-      variables_lib.Variable(  # shape=[x_dim, label_dimension]
-          [[1., 2., 3.], [2., 3., 4.], [3., 4., 5.], [4., 5., 6.]],
-          name='linear/linear_model/x/weights')
-      variables_lib.Variable(  # shape=[label_dimension]
-          [.2, .4, .6], name=BIAS_NAME)
-      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=feature_columns,
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-
-    predict_input_fn = numpy_io.numpy_input_fn(
-        # x shape=[batch_size, x_dim]
-        x={'x': np.array([[1., 2., 3., 4.], [5., 6., 7., 8.]])},
-        y=None,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-    predictions = linear_regressor.predict(input_fn=predict_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    # score = x * weight + bias, shape=[batch_size, label_dimension]
-    self.assertAllClose([[30.2, 40.4, 50.6], [70.2, 96.4, 122.6]],
-                        predicted_scores)
-
-  def testTwoFeatureColumns(self):
-    """Tests predict with two feature columns."""
-    with ops.Graph().as_default():
-      variables_lib.Variable([[10.]], name='linear/linear_model/x0/weights')
-      variables_lib.Variable([[20.]], name='linear/linear_model/x1/weights')
-      variables_lib.Variable([.2], name=BIAS_NAME)
-      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('x0'),
-                         self._fc_lib.numeric_column('x1')),
-        model_dir=self._model_dir)
-
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x0': np.array([[2.]]),
-           'x1': np.array([[3.]])},
-        y=None,
-        batch_size=1,
-        num_epochs=1,
-        shuffle=False)
-    predictions = linear_regressor.predict(input_fn=predict_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    # x0 * weight0 + x1 * weight1 + bias = 2. * 10. + 3. * 20 + .2 = 80.2
-    self.assertAllClose([[80.2]], predicted_scores)
-
-  def testSparseCombiner(self):
-    w_a = 2.0
-    w_b = 3.0
-    w_c = 5.0
-    bias = 5.0
-    with ops.Graph().as_default():
-      variables_lib.Variable([[w_a], [w_b], [w_c]], name=LANGUAGE_WEIGHT_NAME)
-      variables_lib.Variable([bias], name=BIAS_NAME)
-      variables_lib.Variable(1, name=ops.GraphKeys.GLOBAL_STEP,
-                             dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    def _input_fn():
-      return dataset_ops.Dataset.from_tensors({
-          'language': sparse_tensor.SparseTensor(
-              values=['a', 'c', 'b', 'c'],
-              indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
-              dense_shape=[2, 2]),
-      })
-
-    feature_columns = (self._fc_lib.categorical_column_with_vocabulary_list(
-        'language', vocabulary_list=['a', 'b', 'c']),)
-
-    # Check prediction for each sparse_combiner.
-    # With sparse_combiner = 'sum', we have
-    # logits_1 = w_a + w_c + bias
-    #          = 2.0 + 5.0 + 5.0 = 12.0
-    # logits_2 = w_b + w_c + bias
-    #          = 3.0 + 5.0 + 5.0 = 13.0
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=feature_columns,
-        model_dir=self._model_dir)
-    predictions = linear_regressor.predict(input_fn=_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    self.assertAllClose([[12.0], [13.0]], predicted_scores)
-
-    # With sparse_combiner = 'mean', we have
-    # logits_1 = 1/2 * (w_a + w_c) + bias
-    #          = 1/2 * (2.0 + 5.0) + 5.0 = 8.5
-    # logits_2 = 1/2 * (w_b + w_c) + bias
-    #          = 1/2 * (3.0 + 5.0) + 5.0 = 9.0
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=feature_columns,
-        model_dir=self._model_dir,
-        sparse_combiner='mean')
-    predictions = linear_regressor.predict(input_fn=_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    self.assertAllClose([[8.5], [9.0]], predicted_scores)
-
-    # With sparse_combiner = 'sqrtn', we have
-    # logits_1 = sqrt(2)/2 * (w_a + w_c) + bias
-    #          = sqrt(2)/2 * (2.0 + 5.0) + 5.0 = 9.94974
-    # logits_2 = sqrt(2)/2 * (w_b + w_c) + bias
-    #          = sqrt(2)/2 * (3.0 + 5.0) + 5.0 = 10.65685
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=feature_columns,
-        model_dir=self._model_dir,
-        sparse_combiner='sqrtn')
-    predictions = linear_regressor.predict(input_fn=_input_fn)
-    predicted_scores = list([x['predictions'] for x in predictions])
-    self.assertAllClose([[9.94974], [10.65685]], predicted_scores)
-
-
-class BaseLinearRegressorIntegrationTest(object):
-
-  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
-    self._linear_regressor_fn = linear_regressor_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
-                          input_dimension, label_dimension, prediction_length):
-    feature_columns = [
-        self._fc_lib.numeric_column('x', shape=(input_dimension,))
-    ]
-    est = self._linear_regressor_fn(
-        feature_columns=feature_columns,
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    # learn y = x
-    est.train(train_input_fn, steps=200)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array(
-        [x['predictions'] for x in est.predict(predict_input_fn)])
-    self.assertAllEqual((prediction_length, label_dimension), predictions.shape)
-
-    # EXPORT
-    feature_spec = self._fc_lib.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def test_numpy_input_fn(self):
-    """Tests complete flow with numpy_input_fn."""
-    label_dimension = 2
-    input_dimension = label_dimension
-    batch_size = 10
-    prediction_length = batch_size
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=None,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        label_dimension=label_dimension,
-        prediction_length=prediction_length)
-
-  def test_pandas_input_fn(self):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-
-    # Pandas DataFrame natually supports 1 dim data only.
-    label_dimension = 1
-    input_dimension = label_dimension
-    batch_size = 10
-    data = np.array([1., 2., 3., 4.], dtype=np.float32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(data)
-    prediction_length = 4
-
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x, batch_size=batch_size, shuffle=False)
-
-    self._test_complete_flow(
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        label_dimension=label_dimension,
-        prediction_length=prediction_length)
-
-  def test_input_fn_from_parse_example(self):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    label_dimension = 2
-    input_dimension = label_dimension
-    batch_size = 10
-    prediction_length = batch_size
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-
-    serialized_examples = []
-    for datum in data:
-      example = example_pb2.Example(features=feature_pb2.Features(
-          feature={
-              'x':
-                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                      value=datum)),
-              'y':
-                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                      value=datum[:label_dimension])),
-          }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
-    }
-
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=input_dimension,
-        label_dimension=label_dimension,
-        prediction_length=prediction_length)
-
-
-class BaseLinearRegressorTrainingTest(object):
-
-  def __init__(self, linear_regressor_fn, fc_lib=feature_column):
-    self._linear_regressor_fn = linear_regressor_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
-
-  def _mock_optimizer(self, expected_loss=None):
-    expected_var_names = [
-        '%s/part_0:0' % AGE_WEIGHT_NAME,
-        '%s/part_0:0' % BIAS_NAME
-    ]
-
-    def _minimize(loss, global_step=None, var_list=None):
-      trainable_vars = var_list or ops.get_collection(
-          ops.GraphKeys.TRAINABLE_VARIABLES)
-      self.assertItemsEqual(expected_var_names,
-                            [var.name for var in trainable_vars])
-
-      # Verify loss. We can't check the value directly, so we add an assert op.
-      self.assertEquals(0, loss.shape.ndims)
-      if expected_loss is None:
-        if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
-        return control_flow_ops.no_op()
-      assert_loss = assert_close(
-          math_ops.to_float(expected_loss, name='expected'),
-          loss,
-          name='assert_loss')
-      with ops.control_dependencies((assert_loss,)):
-        if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
-        return control_flow_ops.no_op()
-
-    mock_optimizer = test.mock.NonCallableMock(
-        spec=optimizer_lib.Optimizer,
-        wraps=optimizer_lib.Optimizer(use_locking=False, name='my_optimizer'))
-    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
-
-    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
-    # So, return mock_optimizer itself for deepcopy.
-    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
-    return mock_optimizer
-
-  def _assert_checkpoint(self,
-                         expected_global_step,
-                         expected_age_weight=None,
-                         expected_bias=None):
-    shapes = {
-        name: shape
-        for (name, shape) in checkpoint_utils.list_variables(self._model_dir)
-    }
-
-    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
-    self.assertEqual(expected_global_step,
-                     checkpoint_utils.load_variable(self._model_dir,
-                                                    ops.GraphKeys.GLOBAL_STEP))
-
-    self.assertEqual([1, 1], shapes[AGE_WEIGHT_NAME])
-    if expected_age_weight is not None:
-      self.assertEqual(expected_age_weight,
-                       checkpoint_utils.load_variable(self._model_dir,
-                                                      AGE_WEIGHT_NAME))
-
-    self.assertEqual([1], shapes[BIAS_NAME])
-    if expected_bias is not None:
-      self.assertEqual(expected_bias,
-                       checkpoint_utils.load_variable(self._model_dir,
-                                                      BIAS_NAME))
-
-  def testFromScratchWithDefaultOptimizer(self):
-    # Create LinearRegressor.
-    label = 5.
-    age = 17
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        model_dir=self._model_dir)
-
-    # Train for a few steps, and validate final checkpoint.
-    num_steps = 10
-    linear_regressor.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self._assert_checkpoint(num_steps)
-
-  def testTrainWithOneDimLabel(self):
-    label_dimension = 1
-    batch_size = 20
-    feature_columns = [self._fc_lib.numeric_column('age', shape=(1,))]
-    est = self._linear_regressor_fn(
-        feature_columns=feature_columns,
-        label_dimension=label_dimension,
-        model_dir=self._model_dir)
-    data_rank_1 = np.linspace(0., 2., batch_size, dtype=np.float32)
-    self.assertEqual((batch_size,), data_rank_1.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1},
-        y=data_rank_1,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(200)
-
-  def testTrainWithOneDimWeight(self):
-    label_dimension = 1
-    batch_size = 20
-    feature_columns = [self._fc_lib.numeric_column('age', shape=(1,))]
-    est = self._linear_regressor_fn(
-        feature_columns=feature_columns,
-        label_dimension=label_dimension,
-        weight_column='w',
-        model_dir=self._model_dir)
-
-    data_rank_1 = np.linspace(0., 2., batch_size, dtype=np.float32)
-    self.assertEqual((batch_size,), data_rank_1.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1,
-           'w': data_rank_1},
-        y=data_rank_1,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(200)
-
-  def testFromScratch(self):
-    # Create LinearRegressor.
-    label = 5.
-    age = 17
-    # loss = (logits - label)^2 = (0 - 5.)^2 = 25.
-    mock_optimizer = self._mock_optimizer(expected_loss=25.)
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        model_dir=self._model_dir,
-        optimizer=mock_optimizer)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    linear_regressor.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        expected_global_step=num_steps,
-        expected_age_weight=0.,
-        expected_bias=0.)
-
-  def testFromCheckpoint(self):
-    # Create initial checkpoint.
-    age_weight = 10.0
-    bias = 5.0
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables_lib.Variable([[age_weight]], name=AGE_WEIGHT_NAME)
-      variables_lib.Variable([bias], name=BIAS_NAME)
-      variables_lib.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # logits = age * age_weight + bias = 17 * 10. + 5. = 175
-    # loss = (logits - label)^2 = (175 - 5)^2 = 28900
-    mock_optimizer = self._mock_optimizer(expected_loss=28900.)
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        model_dir=self._model_dir,
-        optimizer=mock_optimizer)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    linear_regressor.train(
-        input_fn=lambda: ({'age': ((17,),)}, ((5.,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        expected_global_step=initial_global_step + num_steps,
-        expected_age_weight=age_weight,
-        expected_bias=bias)
-
-  def testFromCheckpointMultiBatch(self):
-    # Create initial checkpoint.
-    age_weight = 10.0
-    bias = 5.0
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables_lib.Variable([[age_weight]], name=AGE_WEIGHT_NAME)
-      variables_lib.Variable([bias], name=BIAS_NAME)
-      variables_lib.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # logits = age * age_weight + bias
-    # logits[0] = 17 * 10. + 5. = 175
-    # logits[1] = 15 * 10. + 5. = 155
-    # loss = sum(logits - label)^2 = (175 - 5)^2 + (155 - 3)^2 = 52004
-    mock_optimizer = self._mock_optimizer(expected_loss=52004.)
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        model_dir=self._model_dir,
-        optimizer=mock_optimizer)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    linear_regressor.train(
-        input_fn=lambda: ({'age': ((17,), (15,))}, ((5.,), (3.,))),
-        steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        expected_global_step=initial_global_step + num_steps,
-        expected_age_weight=age_weight,
-        expected_bias=bias)
-
-
-class BaseLinearClassifierTrainingTest(object):
-
-  def __init__(self, linear_classifier_fn, fc_lib=feature_column):
-    self._linear_classifier_fn = linear_classifier_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _mock_optimizer(self, expected_loss=None):
-    expected_var_names = [
-        '%s/part_0:0' % AGE_WEIGHT_NAME,
-        '%s/part_0:0' % BIAS_NAME
-    ]
-
-    def _minimize(loss, global_step):
-      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-      self.assertItemsEqual(
-          expected_var_names,
-          [var.name for var in trainable_vars])
-
-      # Verify loss. We can't check the value directly, so we add an assert op.
-      self.assertEquals(0, loss.shape.ndims)
-      if expected_loss is None:
-        return state_ops.assign_add(global_step, 1).op
-      assert_loss = assert_close(
-          math_ops.to_float(expected_loss, name='expected'),
-          loss,
-          name='assert_loss')
-      with ops.control_dependencies((assert_loss,)):
-        return state_ops.assign_add(global_step, 1).op
-
-    mock_optimizer = test.mock.NonCallableMock(
-        spec=optimizer_lib.Optimizer,
-        wraps=optimizer_lib.Optimizer(use_locking=False, name='my_optimizer'))
-    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
-
-    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
-    # So, return mock_optimizer itself for deepcopy.
-    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
-    return mock_optimizer
-
-  def _assert_checkpoint(
-      self, n_classes, expected_global_step, expected_age_weight=None,
-      expected_bias=None):
-    logits_dimension = n_classes if n_classes > 2 else 1
-
-    shapes = {
-        name: shape for (name, shape) in
-        checkpoint_utils.list_variables(self._model_dir)
-    }
-
-    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
-    self.assertEqual(
-        expected_global_step,
-        checkpoint_utils.load_variable(
-            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
-
-    self.assertEqual([1, logits_dimension],
-                     shapes[AGE_WEIGHT_NAME])
-    if expected_age_weight is not None:
-      self.assertAllEqual(expected_age_weight,
-                          checkpoint_utils.load_variable(
-                              self._model_dir,
-                              AGE_WEIGHT_NAME))
-
-    self.assertEqual([logits_dimension], shapes[BIAS_NAME])
-    if expected_bias is not None:
-      self.assertAllEqual(expected_bias,
-                          checkpoint_utils.load_variable(
-                              self._model_dir, BIAS_NAME))
-
-  def _testFromScratchWithDefaultOptimizer(self, n_classes):
-    label = 0
-    age = 17
-    est = linear.LinearClassifier(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # Train for a few steps, and validate final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self._assert_checkpoint(n_classes, num_steps)
-
-  def testBinaryClassesFromScratchWithDefaultOptimizer(self):
-    self._testFromScratchWithDefaultOptimizer(n_classes=2)
-
-  def testMultiClassesFromScratchWithDefaultOptimizer(self):
-    self._testFromScratchWithDefaultOptimizer(n_classes=4)
-
-  def _testTrainWithTwoDimsLabel(self, n_classes):
-    batch_size = 20
-
-    est = linear.LinearClassifier(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    data_rank_1 = np.array([0, 1])
-    data_rank_2 = np.array([[0], [1]])
-    self.assertEqual((2,), data_rank_1.shape)
-    self.assertEqual((2, 1), data_rank_2.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1},
-        y=data_rank_2,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(n_classes, 200)
-
-  def testBinaryClassesTrainWithTwoDimsLabel(self):
-    self._testTrainWithTwoDimsLabel(n_classes=2)
-
-  def testMultiClassesTrainWithTwoDimsLabel(self):
-    self._testTrainWithTwoDimsLabel(n_classes=4)
-
-  def _testTrainWithOneDimLabel(self, n_classes):
-    batch_size = 20
-
-    est = linear.LinearClassifier(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    data_rank_1 = np.array([0, 1])
-    self.assertEqual((2,), data_rank_1.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1},
-        y=data_rank_1,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(n_classes, 200)
-
-  def testBinaryClassesTrainWithOneDimLabel(self):
-    self._testTrainWithOneDimLabel(n_classes=2)
-
-  def testMultiClassesTrainWithOneDimLabel(self):
-    self._testTrainWithOneDimLabel(n_classes=4)
-
-  def _testTrainWithTwoDimsWeight(self, n_classes):
-    batch_size = 20
-
-    est = linear.LinearClassifier(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        weight_column='w',
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    data_rank_1 = np.array([0, 1])
-    data_rank_2 = np.array([[0], [1]])
-    self.assertEqual((2,), data_rank_1.shape)
-    self.assertEqual((2, 1), data_rank_2.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1, 'w': data_rank_2}, y=data_rank_1,
-        batch_size=batch_size, num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(n_classes, 200)
-
-  def testBinaryClassesTrainWithTwoDimsWeight(self):
-    self._testTrainWithTwoDimsWeight(n_classes=2)
-
-  def testMultiClassesTrainWithTwoDimsWeight(self):
-    self._testTrainWithTwoDimsWeight(n_classes=4)
-
-  def _testTrainWithOneDimWeight(self, n_classes):
-    batch_size = 20
-
-    est = linear.LinearClassifier(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        weight_column='w',
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    data_rank_1 = np.array([0, 1])
-    self.assertEqual((2,), data_rank_1.shape)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'age': data_rank_1, 'w': data_rank_1}, y=data_rank_1,
-        batch_size=batch_size, num_epochs=None,
-        shuffle=True)
-    est.train(train_input_fn, steps=200)
-    self._assert_checkpoint(n_classes, 200)
-
-  def testBinaryClassesTrainWithOneDimWeight(self):
-    self._testTrainWithOneDimWeight(n_classes=2)
-
-  def testMultiClassesTrainWithOneDimWeight(self):
-    self._testTrainWithOneDimWeight(n_classes=4)
-
-  def _testFromScratch(self, n_classes):
-    label = 1
-    age = 17
-    # For binary classifier:
-    #   loss = sigmoid_cross_entropy(logits, label) where logits=0 (weights are
-    #   all zero initially) and label = 1 so,
-    #      loss = 1 * -log ( sigmoid(logits) ) = 0.69315
-    # For multi class classifier:
-    #   loss = cross_entropy(logits, label) where logits are all 0s (weights are
-    #   all zero initially) and label = 1 so,
-    #      loss = 1 * -log ( 1.0 / n_classes )
-    # For this particular test case, as logits are same, the formular
-    # 1 * -log ( 1.0 / n_classes ) covers both binary and multi class cases.
-    mock_optimizer = self._mock_optimizer(
-        expected_loss=-1 * math.log(1.0/n_classes))
-
-    est = linear.LinearClassifier(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        n_classes,
-        expected_global_step=num_steps,
-        expected_age_weight=[[0.]] if n_classes == 2 else [[0.] * n_classes],
-        expected_bias=[0.] if n_classes == 2 else [.0] * n_classes)
-
-  def testBinaryClassesFromScratch(self):
-    self._testFromScratch(n_classes=2)
-
-  def testMultiClassesFromScratch(self):
-    self._testFromScratch(n_classes=4)
-
-  def _testFromCheckpoint(self, n_classes):
-    # Create initial checkpoint.
-    label = 1
-    age = 17
-    # For binary case, the expected weight has shape (1,1). For multi class
-    # case, the shape is (1, n_classes). In order to test the weights, set
-    # weights as 2.0 * range(n_classes).
-    age_weight = [[2.0]] if n_classes == 2 else (
-        np.reshape(2.0 * np.array(list(range(n_classes)), dtype=np.float32),
-                   (1, n_classes)))
-    bias = [-35.0] if n_classes == 2 else [-35.0] * n_classes
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables_lib.Variable(bias, name=BIAS_NAME)
-      variables_lib.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # For binary classifier:
-    #   logits = age * age_weight + bias = 17 * 2. - 35. = -1.
-    #   loss = sigmoid_cross_entropy(logits, label)
-    #   so, loss = 1 * -log ( sigmoid(-1) ) = 1.3133
-    # For multi class classifier:
-    #   loss = cross_entropy(logits, label)
-    #   where logits = 17 * age_weight + bias and label = 1
-    #   so, loss = 1 * -log ( soft_max(logits)[1] )
-    if n_classes == 2:
-      expected_loss = 1.3133
-    else:
-      logits = age_weight * age + bias
-      logits_exp = np.exp(logits)
-      softmax = logits_exp / logits_exp.sum()
-      expected_loss = -1 * math.log(softmax[0, label])
-
-    mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
-
-    est = linear.LinearClassifier(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        n_classes,
-        expected_global_step=initial_global_step + num_steps,
-        expected_age_weight=age_weight,
-        expected_bias=bias)
-
-  def testBinaryClassesFromCheckpoint(self):
-    self._testFromCheckpoint(n_classes=2)
-
-  def testMultiClassesFromCheckpoint(self):
-    self._testFromCheckpoint(n_classes=4)
-
-  def _testFromCheckpointFloatLabels(self, n_classes):
-    """Tests float labels for binary classification."""
-    # Create initial checkpoint.
-    if n_classes > 2:
-      return
-    label = 0.8
-    age = 17
-    age_weight = [[2.0]]
-    bias = [-35.0]
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables_lib.Variable(bias, name=BIAS_NAME)
-      variables_lib.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # logits = age * age_weight + bias = 17 * 2. - 35. = -1.
-    # loss = sigmoid_cross_entropy(logits, label)
-    # => loss = -0.8 * log(sigmoid(-1)) -0.2 * log(sigmoid(+1)) = 1.1132617
-    mock_optimizer = self._mock_optimizer(expected_loss=1.1132617)
-
-    est = linear.LinearClassifier(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-
-  def testBinaryClassesFromCheckpointFloatLabels(self):
-    self._testFromCheckpointFloatLabels(n_classes=2)
-
-  def testMultiClassesFromCheckpointFloatLabels(self):
-    self._testFromCheckpointFloatLabels(n_classes=4)
-
-  def _testFromCheckpointMultiBatch(self, n_classes):
-    # Create initial checkpoint.
-    label = [1, 0]
-    age = [17, 18.5]
-    # For binary case, the expected weight has shape (1,1). For multi class
-    # case, the shape is (1, n_classes). In order to test the weights, set
-    # weights as 2.0 * range(n_classes).
-    age_weight = [[2.0]] if n_classes == 2 else (
-        np.reshape(2.0 * np.array(list(range(n_classes)), dtype=np.float32),
-                   (1, n_classes)))
-    bias = [-35.0] if n_classes == 2 else [-35.0] * n_classes
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables_lib.Variable(bias, name=BIAS_NAME)
-      variables_lib.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    # For binary classifier:
-    #   logits = age * age_weight + bias
-    #   logits[0] = 17 * 2. - 35. = -1.
-    #   logits[1] = 18.5 * 2. - 35. = 2.
-    #   loss = sigmoid_cross_entropy(logits, label)
-    #   so, loss[0] = 1 * -log ( sigmoid(-1) ) = 1.3133
-    #       loss[1] = (1 - 0) * -log ( 1- sigmoid(2) ) = 2.1269
-    # For multi class classifier:
-    #   loss = cross_entropy(logits, label)
-    #   where logits = [17, 18.5] * age_weight + bias and label = [1, 0]
-    #   so, loss = 1 * -log ( soft_max(logits)[label] )
-    if n_classes == 2:
-      expected_loss = (1.3133 + 2.1269)
-    else:
-      logits = age_weight * np.reshape(age, (2, 1)) + bias
-      logits_exp = np.exp(logits)
-      softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
-      softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
-      expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
-      expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
-      expected_loss = expected_loss_0 + expected_loss_1
-
-    mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
-
-    est = linear.LinearClassifier(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        optimizer=mock_optimizer,
-        model_dir=self._model_dir)
-    self.assertEqual(0, mock_optimizer.minimize.call_count)
-
-    # Train for a few steps, and validate optimizer and final checkpoint.
-    num_steps = 10
-    est.train(
-        input_fn=lambda: ({'age': (age)}, (label)),
-        steps=num_steps)
-    self.assertEqual(1, mock_optimizer.minimize.call_count)
-    self._assert_checkpoint(
-        n_classes,
-        expected_global_step=initial_global_step + num_steps,
-        expected_age_weight=age_weight,
-        expected_bias=bias)
-
-  def testBinaryClassesFromCheckpointMultiBatch(self):
-    self._testFromCheckpointMultiBatch(n_classes=2)
-
-  def testMultiClassesFromCheckpointMultiBatch(self):
-    self._testFromCheckpointMultiBatch(n_classes=4)
-
-
-class BaseLinearClassifierEvaluationTest(object):
-
-  def __init__(self, linear_classifier_fn, fc_lib=feature_column):
-    self._linear_classifier_fn = linear_classifier_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _test_evaluation_for_simple_data(self, n_classes):
-    label = 1
-    age = 1.
-
-    # For binary case, the expected weight has shape (1,1). For multi class
-    # case, the shape is (1, n_classes). In order to test the weights, set
-    # weights as 2.0 * range(n_classes).
-    age_weight = [[-11.0]] if n_classes == 2 else (
-        np.reshape(-11.0 * np.array(list(range(n_classes)), dtype=np.float32),
-                   (1, n_classes)))
-    bias = [-30.0] if n_classes == 2 else [-30.0] * n_classes
-
-    with ops.Graph().as_default():
-      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables_lib.Variable(bias, name=BIAS_NAME)
-      variables_lib.Variable(
-          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    est = self._linear_classifier_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    eval_metrics = est.evaluate(
-        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=1)
-
-    if n_classes == 2:
-      # Binary classes: loss = sum(corss_entropy(41)) = 41.
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: 41.,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: 41.,
-          metric_keys.MetricKeys.ACCURACY: 0.,
-          metric_keys.MetricKeys.PRECISION: 0.,
-          metric_keys.MetricKeys.RECALL: 0.,
-          metric_keys.MetricKeys.PREDICTION_MEAN: 0.,
-          metric_keys.MetricKeys.LABEL_MEAN: 1.,
-          metric_keys.MetricKeys.ACCURACY_BASELINE: 1,
-          metric_keys.MetricKeys.AUC: 0.,
-          metric_keys.MetricKeys.AUC_PR: 1.,
-      }
-    else:
-      # Multi classes: loss = 1 * -log ( soft_max(logits)[label] )
-      logits = age_weight * age + bias
-      logits_exp = np.exp(logits)
-      softmax = logits_exp / logits_exp.sum()
-      expected_loss = -1 * math.log(softmax[0, label])
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
-          metric_keys.MetricKeys.ACCURACY: 0.,
-      }
-
-    self.assertAllClose(sorted_key_dict(expected_metrics),
-                        sorted_key_dict(eval_metrics), rtol=1e-3)
-
-  def test_binary_classes_evaluation_for_simple_data(self):
-    self._test_evaluation_for_simple_data(n_classes=2)
-
-  def test_multi_classes_evaluation_for_simple_data(self):
-    self._test_evaluation_for_simple_data(n_classes=4)
-
-  def _test_evaluation_batch(self, n_classes):
-    """Tests evaluation for batch_size==2."""
-    label = [1, 0]
-    age = [17., 18.]
-    # For binary case, the expected weight has shape (1,1). For multi class
-    # case, the shape is (1, n_classes). In order to test the weights, set
-    # weights as 2.0 * range(n_classes).
-    age_weight = [[2.0]] if n_classes == 2 else (
-        np.reshape(2.0 * np.array(list(range(n_classes)), dtype=np.float32),
-                   (1, n_classes)))
-    bias = [-35.0] if n_classes == 2 else [-35.0] * n_classes
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables_lib.Variable(bias, name=BIAS_NAME)
-      variables_lib.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    est = self._linear_classifier_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-    eval_metrics = est.evaluate(
-        input_fn=lambda: ({'age': (age)}, (label)), steps=1)
-
-    if n_classes == 2:
-      # Logits are (-1., 1.) labels are (1, 0).
-      # Loss is
-      #   loss for row 1: 1 * -log(sigmoid(-1)) = 1.3133
-      #   loss for row 2: (1 - 0) * -log(1 - sigmoid(1)) = 1.3133
-      expected_loss = 1.3133 * 2
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
-          metric_keys.MetricKeys.ACCURACY: 0.,
-          metric_keys.MetricKeys.PRECISION: 0.,
-          metric_keys.MetricKeys.RECALL: 0.,
-          metric_keys.MetricKeys.PREDICTION_MEAN: 0.5,
-          metric_keys.MetricKeys.LABEL_MEAN: 0.5,
-          metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
-          metric_keys.MetricKeys.AUC: 0.,
-          metric_keys.MetricKeys.AUC_PR: 0.25,
-      }
-    else:
-      # Multi classes: loss = 1 * -log ( soft_max(logits)[label] )
-      logits = age_weight * np.reshape(age, (2, 1)) + bias
-      logits_exp = np.exp(logits)
-      softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
-      softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
-      expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
-      expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
-      expected_loss = expected_loss_0 + expected_loss_1
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
-          metric_keys.MetricKeys.ACCURACY: 0.,
-      }
-
-    self.assertAllClose(sorted_key_dict(expected_metrics),
-                        sorted_key_dict(eval_metrics), rtol=1e-3)
-
-  def test_binary_classes_evaluation_batch(self):
-    self._test_evaluation_batch(n_classes=2)
-
-  def test_multi_classes_evaluation_batch(self):
-    self._test_evaluation_batch(n_classes=4)
-
-  def _test_evaluation_weights(self, n_classes):
-    """Tests evaluation with weights."""
-
-    label = [1, 0]
-    age = [17., 18.]
-    weights = [1., 2.]
-    # For binary case, the expected weight has shape (1,1). For multi class
-    # case, the shape is (1, n_classes). In order to test the weights, set
-    # weights as 2.0 * range(n_classes).
-    age_weight = [[2.0]] if n_classes == 2 else (
-        np.reshape(2.0 * np.array(list(range(n_classes)), dtype=np.float32),
-                   (1, n_classes)))
-    bias = [-35.0] if n_classes == 2 else [-35.0] * n_classes
-    initial_global_step = 100
-    with ops.Graph().as_default():
-      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables_lib.Variable(bias, name=BIAS_NAME)
-      variables_lib.Variable(
-          initial_global_step,
-          name=ops.GraphKeys.GLOBAL_STEP,
-          dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    est = self._linear_classifier_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        n_classes=n_classes,
-        weight_column='w',
-        model_dir=self._model_dir)
-    eval_metrics = est.evaluate(
-        input_fn=lambda: ({'age': (age), 'w': (weights)}, (label)), steps=1)
-
-    if n_classes == 2:
-      # Logits are (-1., 1.) labels are (1, 0).
-      # Loss is
-      #   loss for row 1: 1 * -log(sigmoid(-1)) = 1.3133
-      #   loss for row 2: (1 - 0) * -log(1 - sigmoid(1)) = 1.3133
-      #   weights = [1., 2.]
-      expected_loss = 1.3133 * (1. + 2.)
-      loss_mean = expected_loss / (1.0 + 2.0)
-      label_mean = np.average(label, weights=weights)
-      logits = [-1, 1]
-      logistics = sigmoid(np.array(logits))
-      predictions_mean = np.average(logistics, weights=weights)
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
-          metric_keys.MetricKeys.ACCURACY: 0.,
-          metric_keys.MetricKeys.PRECISION: 0.,
-          metric_keys.MetricKeys.RECALL: 0.,
-          metric_keys.MetricKeys.PREDICTION_MEAN: predictions_mean,
-          metric_keys.MetricKeys.LABEL_MEAN: label_mean,
-          metric_keys.MetricKeys.ACCURACY_BASELINE: (
-              max(label_mean, 1-label_mean)),
-          metric_keys.MetricKeys.AUC: 0.,
-          metric_keys.MetricKeys.AUC_PR: 0.1668,
-      }
-    else:
-      # Multi classes: unweighted_loss = 1 * -log ( soft_max(logits)[label] )
-      logits = age_weight * np.reshape(age, (2, 1)) + bias
-      logits_exp = np.exp(logits)
-      softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
-      softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
-      expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
-      expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
-      loss_mean = np.average([expected_loss_0, expected_loss_1],
-                             weights=weights)
-      expected_loss = loss_mean * np.sum(weights)
-
-      expected_metrics = {
-          metric_keys.MetricKeys.LOSS: expected_loss,
-          ops.GraphKeys.GLOBAL_STEP: 100,
-          metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
-          metric_keys.MetricKeys.ACCURACY: 0.,
-      }
-
-    self.assertAllClose(sorted_key_dict(expected_metrics),
-                        sorted_key_dict(eval_metrics), rtol=1e-3)
-
-  def test_binary_classes_evaluation_weights(self):
-    self._test_evaluation_weights(n_classes=2)
-
-  def test_multi_classes_evaluation_weights(self):
-    self._test_evaluation_weights(n_classes=4)
-
-
-class BaseLinearClassifierPredictTest(object):
-
-  def __init__(self, linear_classifier_fn, fc_lib=feature_column):
-    self._linear_classifier_fn = linear_classifier_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _testPredictions(self, n_classes, label_vocabulary, label_output_fn):
-    """Tests predict when all variables are one-dimensional."""
-    age = 1.
-
-    # For binary case, the expected weight has shape (1,1). For multi class
-    # case, the shape is (1, n_classes). In order to test the weights, set
-    # weights as 2.0 * range(n_classes).
-    age_weight = [[-11.0]] if n_classes == 2 else (
-        np.reshape(-11.0 * np.array(list(range(n_classes)), dtype=np.float32),
-                   (1, n_classes)))
-    bias = [10.0] if n_classes == 2 else [10.0] * n_classes
-
-    with ops.Graph().as_default():
-      variables_lib.Variable(age_weight, name=AGE_WEIGHT_NAME)
-      variables_lib.Variable(bias, name=BIAS_NAME)
-      variables_lib.Variable(100, name='global_step', dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    est = self._linear_classifier_fn(
-        feature_columns=(self._fc_lib.numeric_column('age'),),
-        label_vocabulary=label_vocabulary,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'age': np.array([[age]])},
-        y=None,
-        batch_size=1,
-        num_epochs=1,
-        shuffle=False)
-    predictions = list(est.predict(input_fn=predict_input_fn))
-
-    if n_classes == 2:
-      scalar_logits = np.asscalar(
-          np.reshape(np.array(age_weight) * age + bias, (1,)))
-      two_classes_logits = [0, scalar_logits]
-      two_classes_logits_exp = np.exp(two_classes_logits)
-      softmax = two_classes_logits_exp / two_classes_logits_exp.sum()
-
-      expected_predictions = {
-          'class_ids': [0],
-          'classes': [label_output_fn(0)],
-          'logistic': [sigmoid(np.array(scalar_logits))],
-          'logits': [scalar_logits],
-          'probabilities': softmax,
-      }
-    else:
-      onedim_logits = np.reshape(np.array(age_weight) * age + bias, (-1,))
-      class_ids = onedim_logits.argmax()
-      logits_exp = np.exp(onedim_logits)
-      softmax = logits_exp / logits_exp.sum()
-      expected_predictions = {
-          'class_ids': [class_ids],
-          'classes': [label_output_fn(class_ids)],
-          'logits': onedim_logits,
-          'probabilities': softmax,
-      }
-
-    self.assertEqual(1, len(predictions))
-    # assertAllClose cannot handle byte type.
-    self.assertEqual(expected_predictions['classes'], predictions[0]['classes'])
-    expected_predictions.pop('classes')
-    predictions[0].pop('classes')
-    self.assertAllClose(sorted_key_dict(expected_predictions),
-                        sorted_key_dict(predictions[0]))
-
-  def testBinaryClassesWithoutLabelVocabulary(self):
-    n_classes = 2
-    self._testPredictions(n_classes,
-                          label_vocabulary=None,
-                          label_output_fn=lambda x: ('%s' % x).encode())
-
-  def testBinaryClassesWithLabelVocabulary(self):
-    n_classes = 2
-    self._testPredictions(
-        n_classes,
-        label_vocabulary=['class_vocab_{}'.format(i)
-                          for i in range(n_classes)],
-        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
-
-  def testMultiClassesWithoutLabelVocabulary(self):
-    n_classes = 4
-    self._testPredictions(
-        n_classes,
-        label_vocabulary=None,
-        label_output_fn=lambda x: ('%s' % x).encode())
-
-  def testMultiClassesWithLabelVocabulary(self):
-    n_classes = 4
-    self._testPredictions(
-        n_classes,
-        label_vocabulary=['class_vocab_{}'.format(i)
-                          for i in range(n_classes)],
-        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
-
-  def testSparseCombiner(self):
-    w_a = 2.0
-    w_b = 3.0
-    w_c = 5.0
-    bias = 5.0
-    with ops.Graph().as_default():
-      variables_lib.Variable([[w_a], [w_b], [w_c]], name=LANGUAGE_WEIGHT_NAME)
-      variables_lib.Variable([bias], name=BIAS_NAME)
-      variables_lib.Variable(1, name=ops.GraphKeys.GLOBAL_STEP,
-                             dtype=dtypes.int64)
-      save_variables_to_ckpt(self._model_dir)
-
-    def _input_fn():
-      return dataset_ops.Dataset.from_tensors({
-          'language': sparse_tensor.SparseTensor(
-              values=['a', 'c', 'b', 'c'],
-              indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
-              dense_shape=[2, 2]),
-      })
-
-    feature_columns = (self._fc_lib.categorical_column_with_vocabulary_list(
-        'language', vocabulary_list=['a', 'b', 'c']),)
-
-    # Check prediction for each sparse_combiner.
-    # With sparse_combiner = 'sum', we have
-    # logits_1 = w_a + w_c + bias
-    #          = 2.0 + 5.0 + 5.0 = 12.0
-    # logits_2 = w_b + w_c + bias
-    #          = 3.0 + 5.0 + 5.0 = 13.0
-    linear_classifier = self._linear_classifier_fn(
-        feature_columns=feature_columns,
-        model_dir=self._model_dir)
-    predictions = linear_classifier.predict(input_fn=_input_fn)
-    predicted_scores = list([x['logits'] for x in predictions])
-    self.assertAllClose([[12.0], [13.0]], predicted_scores)
-
-    # With sparse_combiner = 'mean', we have
-    # logits_1 = 1/2 * (w_a + w_c) + bias
-    #          = 1/2 * (2.0 + 5.0) + 5.0 = 8.5
-    # logits_2 = 1/2 * (w_b + w_c) + bias
-    #          = 1/2 * (3.0 + 5.0) + 5.0 = 9.0
-    linear_classifier = self._linear_classifier_fn(
-        feature_columns=feature_columns,
-        model_dir=self._model_dir,
-        sparse_combiner='mean')
-    predictions = linear_classifier.predict(input_fn=_input_fn)
-    predicted_scores = list([x['logits'] for x in predictions])
-    self.assertAllClose([[8.5], [9.0]], predicted_scores)
-
-    # With sparse_combiner = 'sqrtn', we have
-    # logits_1 = sqrt(2)/2 * (w_a + w_c) + bias
-    #          = sqrt(2)/2 * (2.0 + 5.0) + 5.0 = 9.94974
-    # logits_2 = sqrt(2)/2 * (w_b + w_c) + bias
-    #          = sqrt(2)/2 * (3.0 + 5.0) + 5.0 = 10.65685
-    linear_classifier = self._linear_classifier_fn(
-        feature_columns=feature_columns,
-        model_dir=self._model_dir,
-        sparse_combiner='sqrtn')
-    predictions = linear_classifier.predict(input_fn=_input_fn)
-    predicted_scores = list([x['logits'] for x in predictions])
-    self.assertAllClose([[9.94974], [10.65685]], predicted_scores)
-
-
-class BaseLinearClassifierIntegrationTest(object):
-
-  def __init__(self, linear_classifier_fn, fc_lib=feature_column):
-    self._linear_classifier_fn = linear_classifier_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _test_complete_flow(self, n_classes, train_input_fn, eval_input_fn,
-                          predict_input_fn, input_dimension, prediction_length):
-    feature_columns = [
-        self._fc_lib.numeric_column('x', shape=(input_dimension,))
-    ]
-    est = self._linear_classifier_fn(
-        feature_columns=feature_columns,
-        n_classes=n_classes,
-        model_dir=self._model_dir)
-
-    # TRAIN
-    # learn y = x
-    est.train(train_input_fn, steps=200)
-
-    # EVALUTE
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
-
-    # PREDICT
-    predictions = np.array(
-        [x['classes'] for x in est.predict(predict_input_fn)])
-    self.assertAllEqual((prediction_length, 1), predictions.shape)
-
-    # EXPORT
-    feature_spec = self._fc_lib.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def _test_numpy_input_fn(self, n_classes):
-    """Tests complete flow with numpy_input_fn."""
-    input_dimension = 4
-    batch_size = 10
-    prediction_length = batch_size
-    data = np.linspace(0., 2., batch_size * input_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, input_dimension)
-    target = np.array([1] * batch_size)
-
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=target,
-        batch_size=batch_size,
-        num_epochs=None,
-        shuffle=True)
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=target,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data},
-        y=None,
-        batch_size=batch_size,
-        num_epochs=1,
-        shuffle=False)
-
-    self._test_complete_flow(
-        n_classes=n_classes,
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        prediction_length=prediction_length)
-
-  def test_binary_classes_numpy_input_fn(self):
-    self._test_numpy_input_fn(n_classes=2)
-
-  def test_multi_classes_numpy_input_fn(self):
-    self._test_numpy_input_fn(n_classes=4)
-
-  def _test_pandas_input_fn(self, n_classes):
-    """Tests complete flow with pandas_input_fn."""
-    if not HAS_PANDAS:
-      return
-
-    # Pandas DataFrame natually supports 1 dim data only.
-    input_dimension = 1
-    batch_size = 10
-    data = np.array([1., 2., 3., 4.], dtype=np.float32)
-    target = np.array([1, 0, 1, 0], dtype=np.int32)
-    x = pd.DataFrame({'x': data})
-    y = pd.Series(target)
-    prediction_length = 4
-
-    train_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
-    eval_input_fn = pandas_io.pandas_input_fn(
-        x=x, y=y, batch_size=batch_size, shuffle=False)
-    predict_input_fn = pandas_io.pandas_input_fn(
-        x=x, batch_size=batch_size, shuffle=False)
-
-    self._test_complete_flow(
-        n_classes=n_classes,
-        train_input_fn=train_input_fn,
-        eval_input_fn=eval_input_fn,
-        predict_input_fn=predict_input_fn,
-        input_dimension=input_dimension,
-        prediction_length=prediction_length)
-
-  def test_binary_classes_pandas_input_fn(self):
-    self._test_pandas_input_fn(n_classes=2)
-
-  def test_multi_classes_pandas_input_fn(self):
-    self._test_pandas_input_fn(n_classes=4)
-
-  def _test_input_fn_from_parse_example(self, n_classes):
-    """Tests complete flow with input_fn constructed from parse_example."""
-    input_dimension = 2
-    batch_size = 10
-    prediction_length = batch_size
-    data = np.linspace(0., 2., batch_size * input_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, input_dimension)
-    target = np.array([1] * batch_size, dtype=np.int64)
-
-    serialized_examples = []
-    for x, y in zip(data, target):
-      example = example_pb2.Example(features=feature_pb2.Features(
-          feature={
-              'x':
-                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                      value=x)),
-              'y':
-                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                      value=[y])),
-          }))
-      serialized_examples.append(example.SerializeToString())
-
-    feature_spec = {
-        'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
-        'y': parsing_ops.FixedLenFeature([1], dtypes.int64),
-    }
-
-    def _train_input_fn():
-      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
-      features = queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _eval_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = queue_parsed_features(feature_map)
-      labels = features.pop('y')
-      return features, labels
-
-    def _predict_input_fn():
-      feature_map = parsing_ops.parse_example(
-          input_lib.limit_epochs(serialized_examples, num_epochs=1),
-          feature_spec)
-      features = queue_parsed_features(feature_map)
-      features.pop('y')
-      return features, None
-
-    self._test_complete_flow(
-        n_classes=n_classes,
-        train_input_fn=_train_input_fn,
-        eval_input_fn=_eval_input_fn,
-        predict_input_fn=_predict_input_fn,
-        input_dimension=input_dimension,
-        prediction_length=prediction_length)
-
-  def test_binary_classes_input_fn_from_parse_example(self):
-    self._test_input_fn_from_parse_example(n_classes=2)
-
-  def test_multi_classes_input_fn_from_parse_example(self):
-    self._test_input_fn_from_parse_example(n_classes=4)
-
-
-class BaseLinearLogitFnTest(object):
-
-  def __init__(self, fc_lib=feature_column):
-    self._fc_lib = fc_lib
-
-  def test_basic_logit_correctness(self):
-    """linear_logit_fn simply wraps feature_column_lib.linear_model."""
-    age = self._fc_lib.numeric_column('age')
-    with ops.Graph().as_default():
-      logit_fn = linear._linear_logit_fn_builder(units=2, feature_columns=[age])
-      logits = logit_fn(features={'age': [[23.], [31.]]})
-      bias_var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                                    'linear_model/bias_weights')[0]
-      age_var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                                   'linear_model/age')[0]
-      with tf_session.Session() as sess:
-        sess.run([variables_lib.global_variables_initializer()])
-        self.assertAllClose([[0., 0.], [0., 0.]], logits.eval())
-        sess.run(bias_var.assign([10., 5.]))
-        self.assertAllClose([[10., 5.], [10., 5.]], logits.eval())
-        sess.run(age_var.assign([[2.0, 3.0]]))
-        # [2 * 23 + 10, 3 * 23 + 5] = [56, 74].
-        # [2 * 31 + 10, 3 * 31 + 5] = [72, 98]
-        self.assertAllClose([[56., 74.], [72., 98.]], logits.eval())
-
-  def test_compute_fraction_of_zero(self):
-    """Tests the calculation of sparsity."""
-    if self._fc_lib != feature_column:
-      return
-    age = feature_column.numeric_column('age')
-    occupation = feature_column.categorical_column_with_hash_bucket(
-        'occupation', hash_bucket_size=5)
-    with ops.Graph().as_default():
-      cols_to_vars = {}
-      feature_column.linear_model(
-          features={
-              'age': [[23.], [31.]],
-              'occupation': [['doctor'], ['engineer']]
-          },
-          feature_columns=[age, occupation],
-          units=3,
-          cols_to_vars=cols_to_vars)
-      cols_to_vars.pop('bias')
-      fraction_zero = linear._compute_fraction_of_zero(cols_to_vars.values())
-      age_var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                                   'linear_model/age')[0]
-      with tf_session.Session() as sess:
-        sess.run([variables_lib.global_variables_initializer()])
-        # Upon initialization, all variables will be zero.
-        self.assertAllClose(1, fraction_zero.eval())
-
-        sess.run(age_var.assign([[2.0, 0.0, -1.0]]))
-        # 1 of the 3 age weights are zero, and all of the 15 (5 hash buckets
-        # x 3-dim output) are zero.
-        self.assertAllClose(16. / 18., fraction_zero.eval())
-
-  def test_compute_fraction_of_zero_v2(self):
-    """Tests the calculation of sparsity."""
-    if self._fc_lib != feature_column_v2:
-      return
-
-    age = feature_column_v2.numeric_column('age')
-    occupation = feature_column_v2.categorical_column_with_hash_bucket(
-        'occupation', hash_bucket_size=5)
-    shared_state_manager = feature_column_v2.SharedEmbeddingStateManager()
-    with ops.Graph().as_default():
-      model = feature_column_v2.LinearModel(
-          feature_columns=[age, occupation],
-          units=3,
-          shared_state_manager=shared_state_manager)
-      features = {
-          'age': [[23.], [31.]],
-          'occupation': [['doctor'], ['engineer']]
-      }
-      model(features)
-      variables = model.variables
-      variables.remove(model.bias_variable)
-      variables.extend(shared_state_manager.variables)
-      fraction_zero = linear._compute_fraction_of_zero(variables)
-      age_var = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                                   'linear_model/age')[0]
-      with tf_session.Session() as sess:
-        sess.run([variables_lib.global_variables_initializer()])
-        # Upon initialization, all variables will be zero.
-        self.assertAllClose(1, fraction_zero.eval())
-
-        sess.run(age_var.assign([[2.0, 0.0, -1.0]]))
-        # 1 of the 3 age weights are zero, and all of the 15 (5 hash buckets
-        # x 3-dim output) are zero.
-        self.assertAllClose(16. / 18., fraction_zero.eval())
-
-
-class BaseLinearWarmStartingTest(object):
-
-  def __init__(self,
-               _linear_classifier_fn,
-               _linear_regressor_fn,
-               fc_lib=feature_column):
-    self._linear_classifier_fn = _linear_classifier_fn
-    self._linear_regressor_fn = _linear_regressor_fn
-    self._fc_lib = fc_lib
-
-  def setUp(self):
-    # Create a directory to save our old checkpoint and vocabularies to.
-    self._ckpt_and_vocab_dir = tempfile.mkdtemp()
-
-    # Make a dummy input_fn.
-    def _input_fn():
-      features = {
-          'age': [[23.], [31.]],
-          'age_in_years': [[23.], [31.]],
-          'occupation': [['doctor'], ['consultant']]
-      }
-      return features, [0, 1]
-
-    self._input_fn = _input_fn
-
-  def tearDown(self):
-    # Clean up checkpoint / vocab dir.
-    writer_cache.FileWriterCache.clear()
-    shutil.rmtree(self._ckpt_and_vocab_dir)
-
-  def test_classifier_basic_warm_starting(self):
-    """Tests correctness of LinearClassifier default warm-start."""
-    age = self._fc_lib.numeric_column('age')
-
-    # Create a LinearClassifier and train to save a checkpoint.
-    linear_classifier = self._linear_classifier_fn(
-        feature_columns=[age],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        optimizer='SGD')
-    linear_classifier.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second LinearClassifier, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).
-    warm_started_linear_classifier = self._linear_classifier_fn(
-        feature_columns=[age],
-        n_classes=4,
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        warm_start_from=linear_classifier.model_dir)
-
-    warm_started_linear_classifier.train(input_fn=self._input_fn, max_steps=1)
-    for variable_name in warm_started_linear_classifier.get_variable_names():
-      self.assertAllClose(
-          linear_classifier.get_variable_value(variable_name),
-          warm_started_linear_classifier.get_variable_value(variable_name))
-
-  def test_regressor_basic_warm_starting(self):
-    """Tests correctness of LinearRegressor default warm-start."""
-    age = self._fc_lib.numeric_column('age')
-
-    # Create a LinearRegressor and train to save a checkpoint.
-    linear_regressor = self._linear_regressor_fn(
-        feature_columns=[age],
-        model_dir=self._ckpt_and_vocab_dir,
-        optimizer='SGD')
-    linear_regressor.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second LinearRegressor, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).
-    warm_started_linear_regressor = self._linear_regressor_fn(
-        feature_columns=[age],
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        warm_start_from=linear_regressor.model_dir)
-
-    warm_started_linear_regressor.train(input_fn=self._input_fn, max_steps=1)
-    for variable_name in warm_started_linear_regressor.get_variable_names():
-      self.assertAllClose(
-          linear_regressor.get_variable_value(variable_name),
-          warm_started_linear_regressor.get_variable_value(variable_name))
-
-  def test_warm_starting_selective_variables(self):
-    """Tests selecting variables to warm-start."""
-    age = self._fc_lib.numeric_column('age')
-
-    # Create a LinearClassifier and train to save a checkpoint.
-    linear_classifier = self._linear_classifier_fn(
-        feature_columns=[age],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        optimizer='SGD')
-    linear_classifier.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second LinearClassifier, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).
-    warm_started_linear_classifier = self._linear_classifier_fn(
-        feature_columns=[age],
-        n_classes=4,
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        # The provided regular expression will only warm-start the age variable
-        # and not the bias.
-        warm_start_from=estimator.WarmStartSettings(
-            ckpt_to_initialize_from=linear_classifier.model_dir,
-            vars_to_warm_start='.*(age).*'))
-
-    warm_started_linear_classifier.train(input_fn=self._input_fn, max_steps=1)
-    self.assertAllClose(
-        linear_classifier.get_variable_value(AGE_WEIGHT_NAME),
-        warm_started_linear_classifier.get_variable_value(AGE_WEIGHT_NAME))
-    # Bias should still be zero from initialization.
-    self.assertAllClose(
-        [0.0] * 4, warm_started_linear_classifier.get_variable_value(BIAS_NAME))
-
-  def test_warm_starting_with_vocab_remapping_and_partitioning(self):
-    """Tests warm-starting with vocab remapping and partitioning."""
-    vocab_list = ['doctor', 'lawyer', 'consultant']
-    vocab_file = os.path.join(self._ckpt_and_vocab_dir, 'occupation_vocab')
-    with open(vocab_file, 'w') as f:
-      f.write('\n'.join(vocab_list))
-    occupation = self._fc_lib.categorical_column_with_vocabulary_file(
-        'occupation',
-        vocabulary_file=vocab_file,
-        vocabulary_size=len(vocab_list))
-
-    # Create a LinearClassifier and train to save a checkpoint.
-    partitioner = partitioned_variables.fixed_size_partitioner(num_shards=2)
-    linear_classifier = self._linear_classifier_fn(
-        feature_columns=[occupation],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        optimizer='SGD',
-        partitioner=partitioner)
-    linear_classifier.train(input_fn=self._input_fn, max_steps=1)
-
-    # Create a second LinearClassifier, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).  Use a new FeatureColumn with a
-    # different vocabulary for occupation.
-    new_vocab_list = ['doctor', 'consultant', 'engineer']
-    new_vocab_file = os.path.join(self._ckpt_and_vocab_dir,
-                                  'new_occupation_vocab')
-    with open(new_vocab_file, 'w') as f:
-      f.write('\n'.join(new_vocab_list))
-    new_occupation = self._fc_lib.categorical_column_with_vocabulary_file(
-        'occupation',
-        vocabulary_file=new_vocab_file,
-        vocabulary_size=len(new_vocab_list))
-    # We can create our VocabInfo object from the new and old occupation
-    # FeatureColumn's.
-    occupation_vocab_info = estimator.VocabInfo(
-        new_vocab=new_occupation.vocabulary_file,
-        new_vocab_size=new_occupation.vocabulary_size,
-        num_oov_buckets=new_occupation.num_oov_buckets,
-        old_vocab=occupation.vocabulary_file,
-        old_vocab_size=occupation.vocabulary_size,
-        # Can't use constant_initializer with load_and_remap.  In practice,
-        # use a truncated normal initializer.
-        backup_initializer=init_ops.random_uniform_initializer(
-            minval=0.39, maxval=0.39))
-    warm_started_linear_classifier = self._linear_classifier_fn(
-        feature_columns=[occupation],
-        n_classes=4,
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        warm_start_from=estimator.WarmStartSettings(
-            ckpt_to_initialize_from=linear_classifier.model_dir,
-            var_name_to_vocab_info={
-                OCCUPATION_WEIGHT_NAME: occupation_vocab_info
-            },
-            # Explicitly providing None here will only warm-start variables
-            # referenced in var_name_to_vocab_info (the bias will not be
-            # warm-started).
-            vars_to_warm_start=None),
-        partitioner=partitioner)
-
-    warm_started_linear_classifier.train(input_fn=self._input_fn, max_steps=1)
-    # 'doctor' was ID-0 and still ID-0.
-    self.assertAllClose(
-        linear_classifier.get_variable_value(OCCUPATION_WEIGHT_NAME)[0, :],
-        warm_started_linear_classifier.get_variable_value(
-            OCCUPATION_WEIGHT_NAME)[0, :])
-    # 'consultant' was ID-2 and now ID-1.
-    self.assertAllClose(
-        linear_classifier.get_variable_value(OCCUPATION_WEIGHT_NAME)[2, :],
-        warm_started_linear_classifier.get_variable_value(
-            OCCUPATION_WEIGHT_NAME)[1, :])
-    # 'engineer' is a new entry and should be initialized with the
-    # backup_initializer in VocabInfo.
-    self.assertAllClose([0.39] * 4,
-                        warm_started_linear_classifier.get_variable_value(
-                            OCCUPATION_WEIGHT_NAME)[2, :])
-    # Bias should still be zero (from initialization logic).
-    self.assertAllClose(
-        [0.0] * 4, warm_started_linear_classifier.get_variable_value(BIAS_NAME))
-
-  def test_warm_starting_with_naming_change(self):
-    """Tests warm-starting with a Tensor name remapping."""
-    age_in_years = self._fc_lib.numeric_column('age_in_years')
-
-    # Create a LinearClassifier and train to save a checkpoint.
-    linear_classifier = self._linear_classifier_fn(
-        feature_columns=[age_in_years],
-        model_dir=self._ckpt_and_vocab_dir,
-        n_classes=4,
-        optimizer='SGD')
-    linear_classifier.train(input_fn=self._input_fn, max_steps=1)
+from tensorflow_estimator.python.estimator.canned import linear_testing_utils
 
-    # Create a second LinearClassifier, warm-started from the first.  Use a
-    # learning_rate = 0.0 optimizer to check values (use SGD so we don't have
-    # accumulator values that change).
-    warm_started_linear_classifier = self._linear_classifier_fn(
-        feature_columns=[self._fc_lib.numeric_column('age')],
-        n_classes=4,
-        optimizer=gradient_descent.GradientDescentOptimizer(learning_rate=0.0),
-        # The 'age' variable correspond to the 'age_in_years' variable in the
-        # previous model.
-        warm_start_from=estimator.WarmStartSettings(
-            ckpt_to_initialize_from=linear_classifier.model_dir,
-            var_name_to_prev_var_name={
-                AGE_WEIGHT_NAME: AGE_WEIGHT_NAME.replace('age', 'age_in_years')
-            }))
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+linear_testing_utils.__all__ = [
+    s for s in dir(linear_testing_utils) if not s.startswith('__')
+]
 
-    warm_started_linear_classifier.train(input_fn=self._input_fn, max_steps=1)
-    self.assertAllClose(
-        linear_classifier.get_variable_value(
-            AGE_WEIGHT_NAME.replace('age', 'age_in_years')),
-        warm_started_linear_classifier.get_variable_value(AGE_WEIGHT_NAME))
-    # The bias is also warm-started (with no name remapping).
-    self.assertAllClose(
-        linear_classifier.get_variable_value(BIAS_NAME),
-        warm_started_linear_classifier.get_variable_value(BIAS_NAME))
+from tensorflow_estimator.python.estimator.canned.linear_testing_utils import *
diff --git a/tensorflow/python/estimator/canned/metric_keys.py b/tensorflow/python/estimator/canned/metric_keys.py
index 9d49240fea4579fffe25172092080560ccd1d35d..4acd75f21b7ba5fa24b0eb88a7fecadffefbbfcc 100644
--- a/tensorflow/python/estimator/canned/metric_keys.py
+++ b/tensorflow/python/estimator/canned/metric_keys.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,43 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Enum for model prediction keys."""
+"""metric_keys python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.estimator import model_fn
-
-
-class MetricKeys(object):
-  """Metric key strings."""
-  LOSS = model_fn.LOSS_METRIC_KEY
-  LOSS_MEAN = model_fn.AVERAGE_LOSS_METRIC_KEY
-  LOSS_REGULARIZATION = 'regularization_loss'
-
-  ACCURACY = 'accuracy'
-  PRECISION = 'precision'
-  RECALL = 'recall'
-  # This is the best the model could do by always predicting one class.
-  # Should be < ACCURACY in a trained model.
-  ACCURACY_BASELINE = 'accuracy_baseline'
-  AUC = 'auc'
-  AUC_PR = 'auc_precision_recall'
-  LABEL_MEAN = 'label/mean'
-  PREDICTION_MEAN = 'prediction/mean'
-
-  # The following require a threshold applied, should be float in range (0, 1).
-  ACCURACY_AT_THRESHOLD = 'accuracy/positive_threshold_%g'
-  PRECISION_AT_THRESHOLD = 'precision/positive_threshold_%g'
-  RECALL_AT_THRESHOLD = 'recall/positive_threshold_%g'
+from tensorflow_estimator.python.estimator.canned import metric_keys
 
-  # The following require a class id applied.
-  PROBABILITY_MEAN_AT_CLASS = 'probability_mean/class%d'
-  AUC_AT_CLASS = 'auc/class%d'
-  AUC_PR_AT_CLASS = 'auc_precision_recall/class%d'
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+metric_keys.__all__ = [s for s in dir(metric_keys) if not s.startswith('__')]
 
-  # The following require a class name applied.
-  PROBABILITY_MEAN_AT_NAME = 'probability_mean/%s'
-  AUC_AT_NAME = 'auc/%s'
-  AUC_PR_AT_NAME = 'auc_precision_recall/%s'
+from tensorflow_estimator.python.estimator.canned.metric_keys import *
diff --git a/tensorflow/python/estimator/canned/optimizers.py b/tensorflow/python/estimator/canned/optimizers.py
index 8f51cc3a80dd9b91eb24a83577b7d0614615e008..da31c7af604b4001a50d26cb0d06f9db01490c34 100644
--- a/tensorflow/python/estimator/canned/optimizers.py
+++ b/tensorflow/python/estimator/canned/optimizers.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,69 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Methods related to optimizers used in canned_estimators."""
+"""optimizers python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-
-from tensorflow.python.training import adagrad
-from tensorflow.python.training import adam
-from tensorflow.python.training import ftrl
-from tensorflow.python.training import gradient_descent
-from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.training import rmsprop
-
-
-_OPTIMIZER_CLS_NAMES = {
-    'Adagrad': adagrad.AdagradOptimizer,
-    'Adam': adam.AdamOptimizer,
-    'Ftrl': ftrl.FtrlOptimizer,
-    'RMSProp': rmsprop.RMSPropOptimizer,
-    'SGD': gradient_descent.GradientDescentOptimizer,
-}
-
-
-def get_optimizer_instance(opt, learning_rate=None):
-  """Returns an optimizer instance.
-
-  Supports the following types for the given `opt`:
-  * An `Optimizer` instance: Returns the given `opt`.
-  * A string: Creates an `Optimizer` subclass with the given `learning_rate`.
-    Supported strings:
-    * 'Adagrad': Returns an `AdagradOptimizer`.
-    * 'Adam': Returns an `AdamOptimizer`.
-    * 'Ftrl': Returns an `FtrlOptimizer`.
-    * 'RMSProp': Returns an `RMSPropOptimizer`.
-    * 'SGD': Returns a `GradientDescentOptimizer`.
-
-  Args:
-    opt: An `Optimizer` instance, or string, as discussed above.
-    learning_rate: A float. Only used if `opt` is a string.
+from tensorflow_estimator.python.estimator.canned import optimizers
 
-  Returns:
-    An `Optimizer` instance.
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+optimizers.__all__ = [s for s in dir(optimizers) if not s.startswith('__')]
 
-  Raises:
-    ValueError: If `opt` is an unsupported string.
-    ValueError: If `opt` is a supported string but `learning_rate` was not
-      specified.
-    ValueError: If `opt` is none of the above types.
-  """
-  if isinstance(opt, six.string_types):
-    if opt in six.iterkeys(_OPTIMIZER_CLS_NAMES):
-      if not learning_rate:
-        raise ValueError('learning_rate must be specified when opt is string.')
-      return _OPTIMIZER_CLS_NAMES[opt](learning_rate=learning_rate)
-    raise ValueError(
-        'Unsupported optimizer name: {}. Supported names are: {}'.format(
-            opt, tuple(sorted(six.iterkeys(_OPTIMIZER_CLS_NAMES)))))
-  if callable(opt):
-    opt = opt()
-  if not isinstance(opt, optimizer_lib.Optimizer):
-    raise ValueError(
-        'The given object is not an Optimizer instance. Given: {}'.format(opt))
-  return opt
+from tensorflow_estimator.python.estimator.canned.optimizers import *
diff --git a/tensorflow/python/estimator/canned/optimizers_test.py b/tensorflow/python/estimator/canned/optimizers_test.py
deleted file mode 100644
index eadabdbc496334270cd792f5b8d5ff39a446bcf7..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/canned/optimizers_test.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for optimizers.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.estimator.canned import optimizers
-from tensorflow.python.platform import test
-from tensorflow.python.training import adagrad
-from tensorflow.python.training import adam
-from tensorflow.python.training import ftrl
-from tensorflow.python.training import gradient_descent
-from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.training import rmsprop
-
-
-class _TestOptimizer(optimizer_lib.Optimizer):
-
-  def __init__(self):
-    super(_TestOptimizer, self).__init__(
-        use_locking=False, name='TestOptimizer')
-
-
-class GetOptimizerInstance(test.TestCase):
-
-  def test_unsupported_name(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'Unsupported optimizer name: unsupported_name'):
-      optimizers.get_optimizer_instance('unsupported_name', learning_rate=0.1)
-
-  def test_supported_name_but_learning_rate_none(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'learning_rate must be specified when opt is string'):
-      optimizers.get_optimizer_instance('Adagrad', learning_rate=None)
-
-  def test_adagrad(self):
-    opt = optimizers.get_optimizer_instance('Adagrad', learning_rate=0.1)
-    self.assertIsInstance(opt, adagrad.AdagradOptimizer)
-    self.assertAlmostEqual(0.1, opt._learning_rate)
-
-  def test_adam(self):
-    opt = optimizers.get_optimizer_instance('Adam', learning_rate=0.1)
-    self.assertIsInstance(opt, adam.AdamOptimizer)
-    self.assertAlmostEqual(0.1, opt._lr)
-
-  def test_ftrl(self):
-    opt = optimizers.get_optimizer_instance('Ftrl', learning_rate=0.1)
-    self.assertIsInstance(opt, ftrl.FtrlOptimizer)
-    self.assertAlmostEqual(0.1, opt._learning_rate)
-
-  def test_rmsprop(self):
-    opt = optimizers.get_optimizer_instance('RMSProp', learning_rate=0.1)
-    self.assertIsInstance(opt, rmsprop.RMSPropOptimizer)
-    self.assertAlmostEqual(0.1, opt._learning_rate)
-
-  def test_sgd(self):
-    opt = optimizers.get_optimizer_instance('SGD', learning_rate=0.1)
-    self.assertIsInstance(opt, gradient_descent.GradientDescentOptimizer)
-    self.assertAlmostEqual(0.1, opt._learning_rate)
-
-  def test_object(self):
-    opt = optimizers.get_optimizer_instance(_TestOptimizer())
-    self.assertIsInstance(opt, _TestOptimizer)
-
-  def test_object_invalid(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'The given object is not an Optimizer instance'):
-      optimizers.get_optimizer_instance((1, 2, 3))
-
-  def test_callable(self):
-    def _optimizer_fn():
-      return _TestOptimizer()
-    opt = optimizers.get_optimizer_instance(_optimizer_fn)
-    self.assertIsInstance(opt, _TestOptimizer)
-
-  def test_lambda(self):
-    opt = optimizers.get_optimizer_instance(lambda: _TestOptimizer())  # pylint: disable=unnecessary-lambda
-    self.assertIsInstance(opt, _TestOptimizer)
-
-  def test_callable_returns_invalid(self):
-    def _optimizer_fn():
-      return (1, 2, 3)
-    with self.assertRaisesRegexp(
-        ValueError, 'The given object is not an Optimizer instance'):
-      optimizers.get_optimizer_instance(_optimizer_fn)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/canned/parsing_utils.py b/tensorflow/python/estimator/canned/parsing_utils.py
index 1ae0f1e9f7781be84e71790146a90cf99a5e9831..81effa24303bf0adbdac01385770321c343a699a 100644
--- a/tensorflow/python/estimator/canned/parsing_utils.py
+++ b/tensorflow/python/estimator/canned/parsing_utils.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,291 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Parsing related helper function to be used in `input_fn`."""
+"""parsing_utils python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
-from tensorflow.python.feature_column import feature_column as fc
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.util.tf_export import estimator_export
-
-
-@estimator_export('estimator.classifier_parse_example_spec')
-def classifier_parse_example_spec(feature_columns,
-                                  label_key,
-                                  label_dtype=dtypes.int64,
-                                  label_default=None,
-                                  weight_column=None):
-  """Generates parsing spec for tf.parse_example to be used with classifiers.
-
-  If users keep data in tf.Example format, they need to call tf.parse_example
-  with a proper feature spec. There are two main things that this utility helps:
-
-  * Users need to combine parsing spec of features with labels and weights
-    (if any) since they are all parsed from same tf.Example instance. This
-    utility combines these specs.
-  * It is difficult to map expected label by a classifier such as
-    `DNNClassifier` to corresponding tf.parse_example spec. This utility encodes
-    it by getting related information from users (key, dtype).
-
-  Example output of parsing spec:
-
-  ```python
-  # Define features and transformations
-  feature_b = tf.feature_column.numeric_column(...)
-  feature_c_bucketized = tf.feature_column.bucketized_column(
-    tf.feature_column.numeric_column("feature_c"), ...)
-  feature_a_x_feature_c = tf.feature_column.crossed_column(
-      columns=["feature_a", feature_c_bucketized], ...)
-
-  feature_columns = [feature_b, feature_c_bucketized, feature_a_x_feature_c]
-  parsing_spec = tf.estimator.classifier_parse_example_spec(
-      feature_columns, label_key='my-label', label_dtype=tf.string)
-
-  # For the above example, classifier_parse_example_spec would return the dict:
-  assert parsing_spec == {
-    "feature_a": parsing_ops.VarLenFeature(tf.string),
-    "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
-    "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
-    "my-label" : parsing_ops.FixedLenFeature([1], dtype=tf.string)
-  }
-  ```
-
-  Example usage with a classifier:
-
-  ```python
-  feature_columns = # define features via tf.feature_column
-  estimator = DNNClassifier(
-      n_classes=1000,
-      feature_columns=feature_columns,
-      weight_column='example-weight',
-      label_vocabulary=['photos', 'keep', ...],
-      hidden_units=[256, 64, 16])
-  # This label configuration tells the classifier the following:
-  # * weights are retrieved with key 'example-weight'
-  # * label is string and can be one of the following ['photos', 'keep', ...]
-  # * integer id for label 'photos' is 0, 'keep' is 1, ...
-
-
-  # Input builders
-  def input_fn_train():  # Returns a tuple of features and labels.
-    features = tf.contrib.learn.read_keyed_batch_features(
-        file_pattern=train_files,
-        batch_size=batch_size,
-        # creates parsing configuration for tf.parse_example
-        features=tf.estimator.classifier_parse_example_spec(
-            feature_columns,
-            label_key='my-label',
-            label_dtype=tf.string,
-            weight_column='example-weight'),
-        reader=tf.RecordIOReader)
-     labels = features.pop('my-label')
-     return features, labels
-
-  estimator.train(input_fn=input_fn_train)
-  ```
-
-  Args:
-    feature_columns: An iterable containing all feature columns. All items
-      should be instances of classes derived from `_FeatureColumn`.
-    label_key: A string identifying the label. It means tf.Example stores labels
-      with this key.
-    label_dtype: A `tf.dtype` identifies the type of labels. By default it is
-      `tf.int64`. If user defines a `label_vocabulary`, this should be set as
-      `tf.string`. `tf.float32` labels are only supported for binary
-      classification.
-    label_default: used as label if label_key does not exist in given
-      tf.Example. An example usage: let's say `label_key` is 'clicked' and
-      tf.Example contains clicked data only for positive examples in following
-      format `key:clicked, value:1`. This means that if there is no data with
-      key 'clicked' it should count as negative example by setting
-      `label_deafault=0`. Type of this value should be compatible with
-      `label_dtype`.
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example. If it is a string, it is
-      used as a key to fetch weight tensor from the `features`. If it is a
-      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-      then weight_column.normalizer_fn is applied on it to get weight tensor.
-
-  Returns:
-    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
-    value.
-
-  Raises:
-    ValueError: If label is used in `feature_columns`.
-    ValueError: If weight_column is used in `feature_columns`.
-    ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
-      instance.
-    ValueError: If `weight_column` is not a `_NumericColumn` instance.
-    ValueError: if label_key is None.
-  """
-  parsing_spec = fc.make_parse_example_spec(feature_columns)
-  if label_key in parsing_spec:
-    raise ValueError('label should not be used as feature. '
-                     'label_key: {}, features: {}'.format(
-                         label_key, parsing_spec.keys()))
-  parsing_spec[label_key] = parsing_ops.FixedLenFeature((1,), label_dtype,
-                                                        label_default)
-
-  if weight_column is None:
-    return parsing_spec
-
-  if isinstance(weight_column, six.string_types):
-    weight_column = fc.numeric_column(weight_column)
-
-  if not isinstance(weight_column, fc._NumericColumn):  # pylint: disable=protected-access
-    raise ValueError('weight_column should be an instance of '
-                     'tf.feature_column.numeric_column. '
-                     'Given type: {} value: {}'.format(
-                         type(weight_column), weight_column))
-
-  if weight_column.key in parsing_spec:
-    raise ValueError('weight_column should not be used as feature. '
-                     'weight_column: {}, features: {}'.format(
-                         weight_column.key, parsing_spec.keys()))
-
-  parsing_spec.update(weight_column._parse_example_spec)  # pylint: disable=protected-access
-  return parsing_spec
-
-
-@estimator_export('estimator.regressor_parse_example_spec')
-def regressor_parse_example_spec(feature_columns,
-                                 label_key,
-                                 label_dtype=dtypes.float32,
-                                 label_default=None,
-                                 label_dimension=1,
-                                 weight_column=None):
-  """Generates parsing spec for tf.parse_example to be used with regressors.
-
-  If users keep data in tf.Example format, they need to call tf.parse_example
-  with a proper feature spec. There are two main things that this utility helps:
-
-  * Users need to combine parsing spec of features with labels and weights
-    (if any) since they are all parsed from same tf.Example instance. This
-    utility combines these specs.
-  * It is difficult to map expected label by a regressor such as `DNNRegressor`
-    to corresponding tf.parse_example spec. This utility encodes it by getting
-    related information from users (key, dtype).
-
-  Example output of parsing spec:
-
-  ```python
-  # Define features and transformations
-  feature_b = tf.feature_column.numeric_column(...)
-  feature_c_bucketized = tf.feature_column.bucketized_column(
-    tf.feature_column.numeric_column("feature_c"), ...)
-  feature_a_x_feature_c = tf.feature_column.crossed_column(
-      columns=["feature_a", feature_c_bucketized], ...)
-
-  feature_columns = [feature_b, feature_c_bucketized, feature_a_x_feature_c]
-  parsing_spec = tf.estimator.regressor_parse_example_spec(
-      feature_columns, label_key='my-label')
-
-  # For the above example, regressor_parse_example_spec would return the dict:
-  assert parsing_spec == {
-    "feature_a": parsing_ops.VarLenFeature(tf.string),
-    "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
-    "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
-    "my-label" : parsing_ops.FixedLenFeature([1], dtype=tf.float32)
-  }
-  ```
-
-  Example usage with a regressor:
-
-  ```python
-  feature_columns = # define features via tf.feature_column
-  estimator = DNNRegressor(
-      hidden_units=[256, 64, 16],
-      feature_columns=feature_columns,
-      weight_column='example-weight',
-      label_dimension=3)
-  # This label configuration tells the regressor the following:
-  # * weights are retrieved with key 'example-weight'
-  # * label is a 3 dimension tensor with float32 dtype.
-
-
-  # Input builders
-  def input_fn_train():  # Returns a tuple of features and labels.
-    features = tf.contrib.learn.read_keyed_batch_features(
-        file_pattern=train_files,
-        batch_size=batch_size,
-        # creates parsing configuration for tf.parse_example
-        features=tf.estimator.classifier_parse_example_spec(
-            feature_columns,
-            label_key='my-label',
-            label_dimension=3,
-            weight_column='example-weight'),
-        reader=tf.RecordIOReader)
-     labels = features.pop('my-label')
-     return features, labels
-
-  estimator.train(input_fn=input_fn_train)
-  ```
-
-  Args:
-    feature_columns: An iterable containing all feature columns. All items
-      should be instances of classes derived from `_FeatureColumn`.
-    label_key: A string identifying the label. It means tf.Example stores labels
-      with this key.
-    label_dtype: A `tf.dtype` identifies the type of labels. By default it is
-      `tf.float32`.
-    label_default: used as label if label_key does not exist in given
-      tf.Example. By default default_value is none, which means
-      `tf.parse_example` will error out if there is any missing label.
-    label_dimension: Number of regression targets per example. This is the
-      size of the last dimension of the labels and logits `Tensor` objects
-      (typically, these have shape `[batch_size, label_dimension]`).
-    weight_column: A string or a `_NumericColumn` created by
-      `tf.feature_column.numeric_column` defining feature column representing
-      weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example. If it is a string, it is
-      used as a key to fetch weight tensor from the `features`. If it is a
-      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
-      then weight_column.normalizer_fn is applied on it to get weight tensor.
-
-  Returns:
-    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
-    value.
-
-  Raises:
-    ValueError: If label is used in `feature_columns`.
-    ValueError: If weight_column is used in `feature_columns`.
-    ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
-      instance.
-    ValueError: If `weight_column` is not a `_NumericColumn` instance.
-    ValueError: if label_key is None.
-  """
-  parsing_spec = fc.make_parse_example_spec(feature_columns)
-  if label_key in parsing_spec:
-    raise ValueError('label should not be used as feature. '
-                     'label_key: {}, features: {}'.format(
-                         label_key, parsing_spec.keys()))
-  parsing_spec[label_key] = parsing_ops.FixedLenFeature(
-      (label_dimension,), label_dtype, label_default)
-
-  if weight_column is None:
-    return parsing_spec
-
-  if isinstance(weight_column, six.string_types):
-    weight_column = fc.numeric_column(weight_column)
-
-  if not isinstance(weight_column, fc._NumericColumn):  # pylint: disable=protected-access
-    raise ValueError('weight_column should be an instance of '
-                     'tf.feature_column.numeric_column. '
-                     'Given type: {} value: {}'.format(
-                         type(weight_column), weight_column))
+from tensorflow_estimator.python.estimator.canned import parsing_utils
 
-  if weight_column.key in parsing_spec:
-    raise ValueError('weight_column should not be used as feature. '
-                     'weight_column: {}, features: {}'.format(
-                         weight_column.key, parsing_spec.keys()))
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+parsing_utils.__all__ = [
+    s for s in dir(parsing_utils) if not s.startswith('__')
+]
 
-  parsing_spec.update(weight_column._parse_example_spec)  # pylint: disable=protected-access
-  return parsing_spec
+from tensorflow_estimator.python.estimator.canned.parsing_utils import *
diff --git a/tensorflow/python/estimator/canned/parsing_utils_test.py b/tensorflow/python/estimator/canned/parsing_utils_test.py
deleted file mode 100644
index 366bb104ca574e0ac2f3b80eee5dffa91b010fd1..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/canned/parsing_utils_test.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for parsing_utils.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.estimator.canned import parsing_utils
-from tensorflow.python.feature_column import feature_column as fc
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import test
-
-
-class ClassifierParseExampleSpec(test.TestCase):
-  """Tests tf.estimator.classifier_parse_example_spec."""
-
-  def test_defaults(self):
-    parsing_spec = parsing_utils.classifier_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')], label_key='b')
-    expected_spec = {
-        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.int64),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_string(self):
-    parsing_spec = parsing_utils.classifier_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')],
-        label_key='b',
-        label_dtype=dtypes.string)
-    expected_spec = {
-        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.string),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  # TODO(ispir): test label_default_value compatibility with label_dtype
-  def test_label_default_value(self):
-    parsing_spec = parsing_utils.classifier_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')],
-        label_key='b',
-        label_default=0)
-    expected_spec = {
-        'a':
-            parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b':
-            parsing_ops.FixedLenFeature(
-                (1,), dtype=dtypes.int64, default_value=0),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_weight_column_as_string(self):
-    parsing_spec = parsing_utils.classifier_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')],
-        label_key='b',
-        weight_column='c')
-    expected_spec = {
-        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.int64),
-        'c': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_weight_column_as_numeric_column(self):
-    parsing_spec = parsing_utils.classifier_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')],
-        label_key='b',
-        weight_column=fc.numeric_column('c'))
-    expected_spec = {
-        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.int64),
-        'c': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_label_key_should_not_be_used_as_feature(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'label should not be used as feature'):
-      parsing_utils.classifier_parse_example_spec(
-          feature_columns=[fc.numeric_column('a')], label_key='a')
-
-  def test_weight_column_should_not_be_used_as_feature(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'weight_column should not be used as feature'):
-      parsing_utils.classifier_parse_example_spec(
-          feature_columns=[fc.numeric_column('a')],
-          label_key='b',
-          weight_column=fc.numeric_column('a'))
-
-  def test_weight_column_should_be_a_numeric_column(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'tf.feature_column.numeric_column'):
-      not_a_numeric_column = 3
-      parsing_utils.classifier_parse_example_spec(
-          feature_columns=[fc.numeric_column('a')],
-          label_key='b',
-          weight_column=not_a_numeric_column)
-
-
-class RegressorParseExampleSpec(test.TestCase):
-  """Tests tf.estimator.classifier_parse_example_spec."""
-
-  def test_defaults(self):
-    parsing_spec = parsing_utils.regressor_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')], label_key='b')
-    expected_spec = {
-        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_int64(self):
-    parsing_spec = parsing_utils.regressor_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')],
-        label_key='b',
-        label_dtype=dtypes.int64)
-    expected_spec = {
-        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.int64),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_label_default_value(self):
-    parsing_spec = parsing_utils.regressor_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')],
-        label_key='b',
-        label_default=0.)
-    expected_spec = {
-        'a':
-            parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b':
-            parsing_ops.FixedLenFeature(
-                (1,), dtype=dtypes.float32, default_value=0.),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_label_dimension(self):
-    parsing_spec = parsing_utils.regressor_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')],
-        label_key='b',
-        label_dimension=3)
-    expected_spec = {
-        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b': parsing_ops.FixedLenFeature((3,), dtype=dtypes.float32),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_weight_column_as_string(self):
-    parsing_spec = parsing_utils.regressor_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')],
-        label_key='b',
-        weight_column='c')
-    expected_spec = {
-        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'c': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_weight_column_as_numeric_column(self):
-    parsing_spec = parsing_utils.regressor_parse_example_spec(
-        feature_columns=[fc.numeric_column('a')],
-        label_key='b',
-        weight_column=fc.numeric_column('c'))
-    expected_spec = {
-        'a': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'b': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-        'c': parsing_ops.FixedLenFeature((1,), dtype=dtypes.float32),
-    }
-    self.assertDictEqual(expected_spec, parsing_spec)
-
-  def test_label_key_should_not_be_used_as_feature(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'label should not be used as feature'):
-      parsing_utils.regressor_parse_example_spec(
-          feature_columns=[fc.numeric_column('a')], label_key='a')
-
-  def test_weight_column_should_not_be_used_as_feature(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'weight_column should not be used as feature'):
-      parsing_utils.regressor_parse_example_spec(
-          feature_columns=[fc.numeric_column('a')],
-          label_key='b',
-          weight_column=fc.numeric_column('a'))
-
-  def test_weight_column_should_be_a_numeric_column(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'tf.feature_column.numeric_column'):
-      not_a_numeric_column = 3
-      parsing_utils.regressor_parse_example_spec(
-          feature_columns=[fc.numeric_column('a')],
-          label_key='b',
-          weight_column=not_a_numeric_column)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/canned/prediction_keys.py b/tensorflow/python/estimator/canned/prediction_keys.py
index daa275b46bc77b747add57c302bb31bd38bbb01c..26f5242977cd21382a6475d27a7d4f51e4913c88 100644
--- a/tensorflow/python/estimator/canned/prediction_keys.py
+++ b/tensorflow/python/estimator/canned/prediction_keys.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,24 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Enum for model prediction keys."""
+"""prediction_keys python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow_estimator.python.estimator.canned import prediction_keys
 
-class PredictionKeys(object):
-  """Enum for canonical model prediction keys.
-
-  The following values are defined:
-  PREDICTIONS: Used by models that predict values, such as regressor models.
-  """
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+prediction_keys.__all__ = [
+    s for s in dir(prediction_keys) if not s.startswith('__')
+]
 
-  CLASSES = 'classes'
-  CLASS_IDS = 'class_ids'
-  LOGISTIC = 'logistic'
-  LOGITS = 'logits'
-  PREDICTIONS = 'predictions'
-  PROBABILITIES = 'probabilities'
-  TOP_K = 'top_k'
+from tensorflow_estimator.python.estimator.canned.prediction_keys import *
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index e6d82f0db739f0d8cf02cebd97561cab5963d100..2199cd9fb8a68c1fcf3f6098cd63d07a8f14ab17 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,2166 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""estimator python module.
 
-"""Base Estimator class."""
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import copy
-import os
-import tempfile
+from tensorflow_estimator.python.estimator import estimator
 
-import numpy as np
-import six
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+estimator.__all__ = [s for s in dir(estimator) if not s.startswith('__')]
 
-from google.protobuf import message
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.eager import context
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import run_config
-from tensorflow.python.estimator import util as estimator_util
-from tensorflow.python.estimator.export import export as export_helpers
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import builder as saved_model_builder
-from tensorflow.python.saved_model import utils_impl as saved_model_utils
-from tensorflow.python.summary import summary
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import device_setter
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import evaluation
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import saver
-from tensorflow.python.training import training
-from tensorflow.python.training import training_util
-from tensorflow.python.training import warm_starting_util
-from tensorflow.python.util import compat
-from tensorflow.python.util import compat_internal
-from tensorflow.python.util import function_utils
-from tensorflow.python.util import nest
-from tensorflow.python.util.tf_export import estimator_export
-
-
-_VALID_MODEL_FN_ARGS = set(
-    ['features', 'labels', 'mode', 'params', 'self', 'config'])
-
-
-@estimator_export('estimator.Estimator')
-class Estimator(object):
-  """Estimator class to train and evaluate TensorFlow models.
-
-  The `Estimator` object wraps a model which is specified by a `model_fn`,
-  which, given inputs and a number of other parameters, returns the ops
-  necessary to perform training, evaluation, or predictions.
-
-  All outputs (checkpoints, event files, etc.) are written to `model_dir`, or a
-  subdirectory thereof. If `model_dir` is not set, a temporary directory is
-  used.
-
-  The `config` argument can be passed `tf.estimator.RunConfig` object containing
-  information about the execution environment. It is passed on to the
-  `model_fn`, if the `model_fn` has a parameter named "config" (and input
-  functions in the same manner). If the `config` parameter is not passed, it is
-  instantiated by the `Estimator`. Not passing config means that defaults useful
-  for local execution are used. `Estimator` makes config available to the model
-  (for instance, to allow specialization based on the number of workers
-  available), and also uses some of its fields to control internals, especially
-  regarding checkpointing.
-
-  The `params` argument contains hyperparameters. It is passed to the
-  `model_fn`, if the `model_fn` has a parameter named "params", and to the input
-  functions in the same manner. `Estimator` only passes params along, it does
-  not inspect it. The structure of `params` is therefore entirely up to the
-  developer.
-
-  None of `Estimator`'s methods can be overridden in subclasses (its
-  constructor enforces this). Subclasses should use `model_fn` to configure
-  the base class, and may add methods implementing specialized functionality.
-
-  @compatibility(eager)
-  Calling methods of `Estimator` will work while eager execution is enabled.
-  However, the `model_fn` and `input_fn` is not executed eagerly, `Estimator`
-  will switch to graph model before calling all user-provided functions (incl.
-  hooks), so their code has to be compatible with graph mode execution. Note
-  that `input_fn` code using `tf.data` generally works in both graph and eager
-  modes.
-  @end_compatibility
-  """
-
-  def __init__(self, model_fn, model_dir=None, config=None, params=None,
-               warm_start_from=None):
-    """Constructs an `Estimator` instance.
-
-    See [estimators](https://tensorflow.org/guide/estimators) for more
-    information.
-
-    To warm-start an `Estimator`:
-
-    ```python
-    estimator = tf.estimator.DNNClassifier(
-        feature_columns=[categorical_feature_a_emb, categorical_feature_b_emb],
-        hidden_units=[1024, 512, 256],
-        warm_start_from="/path/to/checkpoint/dir")
-    ```
-
-    For more details on warm-start configuration, see
-    `tf.estimator.WarmStartSettings`.
-
-    Args:
-      model_fn: Model function. Follows the signature:
-
-        * Args:
-
-          * `features`: This is the first item returned from the `input_fn`
-                 passed to `train`, `evaluate`, and `predict`. This should be a
-                 single `tf.Tensor` or `dict` of same.
-          * `labels`: This is the second item returned from the `input_fn`
-                 passed to `train`, `evaluate`, and `predict`. This should be a
-                 single `tf.Tensor` or `dict` of same (for multi-head models).
-                 If mode is `tf.estimator.ModeKeys.PREDICT`, `labels=None` will
-                 be passed. If the `model_fn`'s signature does not accept
-                 `mode`, the `model_fn` must still be able to handle
-                 `labels=None`.
-          * `mode`: Optional. Specifies if this training, evaluation or
-                 prediction. See `tf.estimator.ModeKeys`.
-          * `params`: Optional `dict` of hyperparameters.  Will receive what
-                 is passed to Estimator in `params` parameter. This allows
-                 to configure Estimators from hyper parameter tuning.
-          * `config`: Optional `estimator.RunConfig` object. Will receive what
-                 is passed to Estimator as its `config` parameter, or a default
-                 value. Allows setting up things in your `model_fn` based on
-                 configuration such as `num_ps_replicas`, or `model_dir`.
-
-        * Returns:
-          `tf.estimator.EstimatorSpec`
-
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into an estimator to
-        continue training a previously saved model. If `PathLike` object, the
-        path will be resolved. If `None`, the model_dir in `config` will be used
-        if set. If both are set, they must be same. If both are `None`, a
-        temporary directory will be used.
-      config: `estimator.RunConfig` configuration object.
-      params: `dict` of hyper parameters that will be passed into `model_fn`.
-              Keys are names of parameters, values are basic python types.
-      warm_start_from: Optional string filepath to a checkpoint or SavedModel to
-                       warm-start from, or a `tf.estimator.WarmStartSettings`
-                       object to fully configure warm-starting.  If the string
-                       filepath is provided instead of a
-                       `tf.estimator.WarmStartSettings`, then all variables are
-                       warm-started, and it is assumed that vocabularies
-                       and `tf.Tensor` names are unchanged.
-
-    Raises:
-      ValueError: parameters of `model_fn` don't match `params`.
-      ValueError: if this is called via a subclass and if that class overrides
-        a member of `Estimator`.
-    """
-    Estimator._assert_members_are_not_overridden(self)
-
-    self._config = maybe_overwrite_model_dir_and_session_config(config,
-                                                                model_dir)
-
-    # The distribute field contains an instance of DistributionStrategy.
-    self._train_distribution = self._config.train_distribute
-    self._eval_distribution = self._config.eval_distribute
-    # Model directory.
-    self._model_dir = self._config.model_dir
-    self._session_config = self._config.session_config
-    logging.info('Using config: %s', str(vars(self._config)))
-
-    self._device_fn = (
-        self._config.device_fn or _get_replica_device_setter(self._config))
-
-    if model_fn is None:
-      raise ValueError('model_fn must be provided to Estimator.')
-    _verify_model_fn_args(model_fn, params)
-    self._model_fn = model_fn
-    self._params = copy.deepcopy(params or {})
-
-    # pylint: disable=protected-access
-    self._warm_start_settings = _get_default_warm_start_settings(
-        warm_start_from)
-    # pylint: enable=protected-access
-
-  @property
-  def model_dir(self):
-    return self._model_dir
-
-  @property
-  def config(self):
-    return copy.deepcopy(self._config)
-
-  @property
-  def params(self):
-    return copy.deepcopy(self._params)
-
-  @property
-  def model_fn(self):
-    """Returns the `model_fn` which is bound to `self.params`.
-
-    Returns:
-      The `model_fn` with following signature:
-        `def model_fn(features, labels, mode, config)`
-    """
-
-    def public_model_fn(features, labels, mode, config):
-      return self._call_model_fn(features, labels, mode, config)
-
-    return public_model_fn
-
-  # TODO(ispir): support a list of names
-  def get_variable_value(self, name):
-    """Returns value of the variable given by name.
-
-    Args:
-      name: string or a list of string, name of the tensor.
-
-    Returns:
-      Numpy array - value of the tensor.
-
-    Raises:
-      ValueError: If the `Estimator` has not produced a checkpoint yet.
-    """
-    _check_checkpoint_available(self.model_dir)
-    with context.graph_mode():
-      return training.load_variable(self.model_dir, name)
-
-  def get_variable_names(self):
-    """Returns list of all variable names in this model.
-
-    Returns:
-      List of names.
-
-    Raises:
-      ValueError: If the `Estimator` has not produced a checkpoint yet.
-    """
-    _check_checkpoint_available(self.model_dir)
-    with context.graph_mode():
-      return [name for name, _ in training.list_variables(self.model_dir)]
-
-  def latest_checkpoint(self):
-    """Finds the filename of the latest saved checkpoint file in `model_dir`.
-
-    Returns:
-      The full path to the latest checkpoint or `None` if no checkpoint was
-      found.
-    """
-    with context.graph_mode():
-      return checkpoint_management.latest_checkpoint(self.model_dir)
-
-  def train(self,
-            input_fn,
-            hooks=None,
-            steps=None,
-            max_steps=None,
-            saving_listeners=None):
-    """Trains a model given training data `input_fn`.
-
-    Args:
-      input_fn: A function that provides input data for training as minibatches.
-        See [Premade Estimators](
-        https://tensorflow.org/guide/premade_estimators#create_input_functions)
-        for more information. The function should construct and return one of
-        the following:  * A
-        `tf.data.Dataset` object: Outputs of `Dataset` object must be a tuple
-        `(features, labels)` with same constraints as below. * A tuple
-        `(features, labels)`: Where `features` is a `tf.Tensor` or a dictionary
-        of string feature name to `Tensor` and `labels` is a `Tensor` or a
-        dictionary of string label name to `Tensor`. Both `features` and
-        `labels` are consumed by `model_fn`. They should satisfy the expectation
-        of `model_fn` from inputs.
-      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
-        callbacks inside the training loop.
-      steps: Number of steps for which to train the model. If `None`, train
-        forever or train until `input_fn` generates the `tf.errors.OutOfRange`
-        error or `StopIteration` exception. `steps` works incrementally. If you
-        call two times `train(steps=10)` then training occurs in total 20 steps.
-        If `OutOfRange` or `StopIteration` occurs in the middle, training stops
-        before 20 steps. If you don't want to have incremental behavior please
-        set `max_steps` instead. If set, `max_steps` must be `None`.
-      max_steps: Number of total steps for which to train model. If `None`,
-        train forever or train until `input_fn` generates the
-        `tf.errors.OutOfRange` error or `StopIteration` exception. If set,
-        `steps` must be `None`. If `OutOfRange` or `StopIteration` occurs in the
-        middle, training stops before `max_steps` steps. Two calls to
-        `train(steps=100)` means 200 training iterations. On the other hand, two
-        calls to `train(max_steps=100)` means that the second call will not do
-        any iteration since first call did all 100 steps.
-      saving_listeners: list of `CheckpointSaverListener` objects. Used for
-        callbacks that run immediately before or after checkpoint savings.
-
-    Returns:
-      `self`, for chaining.
-
-    Raises:
-      ValueError: If both `steps` and `max_steps` are not `None`.
-      ValueError: If either `steps` or `max_steps <= 0`.
-    """
-    if self.config.task_type in (run_config.TaskType.EVALUATOR,
-                                 run_config.TaskType.PS):
-      raise ValueError(
-          'Train has been called wrong configuration. Please use '
-          'tf.estimator.train_and_evaluate which calls proper API according '
-          'to given configuration. Current configuration: {}.'.format(
-              self.config))
-
-    with context.graph_mode():
-      if (steps is not None) and (max_steps is not None):
-        raise ValueError('Can not provide both steps and max_steps.')
-      if steps is not None and steps <= 0:
-        raise ValueError('Must specify steps > 0, given: {}'.format(steps))
-      if max_steps is not None and max_steps <= 0:
-        raise ValueError(
-            'Must specify max_steps > 0, given: {}'.format(max_steps))
-
-      if max_steps is not None:
-        start_step = _load_global_step_from_checkpoint_dir(self._model_dir)
-        if max_steps <= start_step:
-          logging.info('Skipping training since max_steps has already saved.')
-          return self
-
-      hooks = _check_hooks_type(hooks)
-      hooks.extend(self._convert_train_steps_to_hooks(steps, max_steps))
-
-      saving_listeners = _check_listeners_type(saving_listeners)
-      loss = self._train_model(input_fn, hooks, saving_listeners)
-      logging.info('Loss for final step: %s.', loss)
-      return self
-
-  def _convert_train_steps_to_hooks(self, steps, max_steps):
-    """Create hooks to run correct number of steps in training.
-
-    Args:
-      steps: number of steps to run during training.
-      max_steps: maximum number of steps to be run during training. It'll be
-        the maximum number of steps the model will train to after restoring
-        from checkpoint even across multiple estimator.train calls.
-
-    Returns:
-      List of hooks to be passed to the estimator.
-    """
-    if steps is not None or max_steps is not None:
-      if self._train_distribution:
-        steps_per_run = getattr(self._train_distribution, 'steps_per_run', 1)
-        if steps_per_run > 1:
-          return [basic_session_run_hooks._MultiStepStopAtStepHook(  # pylint: disable=protected-access
-              steps, max_steps, steps_per_run)]
-      return [training.StopAtStepHook(steps, max_steps)]
-    else:
-      return []
-
-  def eval_dir(self, name=None):
-    """Shows the directory name where evaluation metrics are dumped.
-
-    Args:
-      name: Name of the evaluation if user needs to run multiple evaluations on
-        different data sets, such as on training data vs test data. Metrics for
-        different evaluations are saved in separate folders, and appear
-        separately in tensorboard.
-
-    Returns:
-      A string which is the path of directory contains evaluation metrics.
-    """
-    return os.path.join(self._model_dir, 'eval' if not name else
-                        'eval_' + name)
-
-  def evaluate(self, input_fn, steps=None, hooks=None, checkpoint_path=None,
-               name=None):
-    """Evaluates the model given evaluation data `input_fn`.
-
-    For each step, calls `input_fn`, which returns one batch of data.
-    Evaluates until:
-    - `steps` batches are processed, or
-    - `input_fn` raises an end-of-input exception (`tf.errors.OutOfRangeError`
-    or
-    `StopIteration`).
-
-    Args:
-      input_fn: A function that constructs the input data for evaluation. See
-        [Premade Estimators](
-        https://tensorflow.org/guide/premade#create_input_functions)
-        for more information. The
-        function should construct and return one of the following:  * A
-        `tf.data.Dataset` object: Outputs of `Dataset` object must be a tuple
-        `(features, labels)` with same constraints as below. * A tuple
-        `(features, labels)`: Where `features` is a `tf.Tensor` or a dictionary
-        of string feature name to `Tensor` and `labels` is a `Tensor` or a
-        dictionary of string label name to `Tensor`. Both `features` and
-        `labels` are consumed by `model_fn`. They should satisfy the expectation
-        of `model_fn` from inputs.
-      steps: Number of steps for which to evaluate model. If `None`, evaluates
-        until `input_fn` raises an end-of-input exception.
-      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
-        callbacks inside the evaluation call.
-      checkpoint_path: Path of a specific checkpoint to evaluate. If `None`, the
-        latest checkpoint in `model_dir` is used.  If there are no checkpoints
-        in `model_dir`, evaluation is run with newly initialized `Variables`
-        instead of ones restored from checkpoint.
-      name: Name of the evaluation if user needs to run multiple evaluations on
-        different data sets, such as on training data vs test data. Metrics for
-        different evaluations are saved in separate folders, and appear
-        separately in tensorboard.
-
-    Returns:
-      A dict containing the evaluation metrics specified in `model_fn` keyed by
-      name, as well as an entry `global_step` which contains the value of the
-      global step for which this evaluation was performed. For canned
-      estimators, the dict contains the `loss` (mean loss per mini-batch) and
-      the `average_loss` (mean loss per sample). Canned classifiers also return
-      the `accuracy`. Canned regressors also return the `label/mean` and the
-      `prediction/mean`.
-
-    Raises:
-      ValueError: If `steps <= 0`.
-      ValueError: If no model has been trained, namely `model_dir`, or the
-        given `checkpoint_path` is empty.
-    """
-    with context.graph_mode():
-      hooks = _check_hooks_type(hooks)
-      hooks.extend(self._convert_eval_steps_to_hooks(steps))
-
-      # Check that model has been trained (if nothing has been set explicitly).
-      if not checkpoint_path:
-        latest_path = checkpoint_management.latest_checkpoint(self._model_dir)
-        if not latest_path:
-          logging.info('Could not find trained model in model_dir: {}, running '
-                       'initialization to evaluate.'.format(self._model_dir))
-        checkpoint_path = latest_path
-
-      def _evaluate():
-        (scaffold, update_op, eval_dict, all_hooks) = (
-            self._evaluate_build_graph(input_fn, hooks, checkpoint_path))
-        return self._evaluate_run(
-            checkpoint_path=checkpoint_path,
-            scaffold=scaffold,
-            update_op=update_op,
-            eval_dict=eval_dict,
-            all_hooks=all_hooks,
-            output_dir=self.eval_dir(name))
-
-      with ops.Graph().as_default():
-        if self._eval_distribution:
-          # We want to create the iterations variable outside the distribution
-          # scope as that is just stored on the host and mainly used to drive
-          # the loop and doesn't need to be a Mirrored/Device variable.
-          training.get_or_create_steps_per_run_variable()
-          with self._eval_distribution.scope():
-            return _evaluate()
-        else:
-          return _evaluate()
-
-  def _convert_eval_steps_to_hooks(self, steps):
-    """Create hooks to run correct number of steps in evaluation.
-
-    Args:
-      steps: number of steps to run during evaluation.
-
-    Raises:
-      ValueError: if steps is less than or equal to zero.
-
-    Returns:
-      List of hooks to be passed to the estimator.
-    """
-    if steps is None:
-      return []
-
-    if steps <= 0:
-      raise ValueError('Must specify steps > 0, given: {}'.format(steps))
-
-    # The hooks are declared as private in evaluation.py discourage the use
-    # by other libraries or open source users. This should be the only usage
-    # of the estimator evaluation hooks.
-    if self._eval_distribution:
-      steps_per_run = getattr(self._eval_distribution, 'steps_per_run', 1)
-      if steps_per_run > 1:
-        return [evaluation._MultiStepStopAfterNEvalsHook(  # pylint: disable=protected-access
-            num_evals=steps, steps_per_run=steps_per_run)]
-    return [evaluation._StopAfterNEvalsHook(num_evals=steps)]  # pylint: disable=protected-access
-
-  def predict(self,
-              input_fn,
-              predict_keys=None,
-              hooks=None,
-              checkpoint_path=None,
-              yield_single_examples=True):
-    """Yields predictions for given features.
-
-    Please note that interleaving two predict outputs does not work. See:
-    [issue/20506](
-    https://github.com/tensorflow/tensorflow/issues/20506#issuecomment-422208517)
-
-    Args:
-      input_fn: A function that constructs the features. Prediction continues
-        until `input_fn` raises an end-of-input exception
-        (`tf.errors.OutOfRangeError` or `StopIteration`).
-        See [Premade Estimators](
-        https://tensorflow.org/guide/premade_estimators#create_input_functions)
-        for more information. The function should construct and return one of
-        the following:
-
-          * A `tf.data.Dataset` object: Outputs of `Dataset` object must have
-            same constraints as below.
-          * features: A `tf.Tensor` or a dictionary of string feature name to
-            `Tensor`. features are consumed by `model_fn`. They should satisfy
-            the expectation of `model_fn` from inputs.
-          * A tuple, in which case the first item is extracted as features.
-
-      predict_keys: list of `str`, name of the keys to predict. It is used if
-        the `tf.estimator.EstimatorSpec.predictions` is a `dict`. If
-        `predict_keys` is used then rest of the predictions will be filtered
-        from the dictionary. If `None`, returns all.
-      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
-        callbacks inside the prediction call.
-      checkpoint_path: Path of a specific checkpoint to predict. If `None`, the
-        latest checkpoint in `model_dir` is used.  If there are no checkpoints
-        in `model_dir`, prediction is run with newly initialized `Variables`
-        instead of ones restored from checkpoint.
-      yield_single_examples: If `False`, yields the whole batch as returned by
-        the `model_fn` instead of decomposing the batch into individual
-        elements. This is useful if `model_fn` returns some tensors whose first
-        dimension is not equal to the batch size.
-
-    Yields:
-      Evaluated values of `predictions` tensors.
-
-    Raises:
-      ValueError: Could not find a trained model in `model_dir`.
-      ValueError: If batch length of predictions is not the same and
-        `yield_single_examples` is `True`.
-      ValueError: If there is a conflict between `predict_keys` and
-        `predictions`. For example if `predict_keys` is not `None` but
-        `tf.estimator.EstimatorSpec.predictions` is not a `dict`.
-    """
-    with context.graph_mode():
-      hooks = _check_hooks_type(hooks)
-      # Check that model has been trained.
-      if not checkpoint_path:
-        checkpoint_path = checkpoint_management.latest_checkpoint(
-            self._model_dir)
-      if not checkpoint_path:
-        logging.info('Could not find trained model in model_dir: {}, running '
-                     'initialization to predict.'.format(self._model_dir))
-      with ops.Graph().as_default() as g:
-        random_seed.set_random_seed(self._config.tf_random_seed)
-        self._create_and_assert_global_step(g)
-        features, input_hooks = self._get_features_from_input_fn(
-            input_fn, model_fn_lib.ModeKeys.PREDICT)
-        estimator_spec = self._call_model_fn(
-            features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
-
-        # Call to warm_start has to be after model_fn is called.
-        self._maybe_warm_start(checkpoint_path)
-
-        predictions = self._extract_keys(
-            estimator_spec.predictions, predict_keys)
-        all_hooks = list(input_hooks)
-        all_hooks.extend(hooks)
-        all_hooks.extend(list(estimator_spec.prediction_hooks or []))
-        with training.MonitoredSession(
-            session_creator=training.ChiefSessionCreator(
-                checkpoint_filename_with_path=checkpoint_path,
-                master=self._config.master,
-                scaffold=estimator_spec.scaffold,
-                config=self._session_config),
-            hooks=all_hooks) as mon_sess:
-          while not mon_sess.should_stop():
-            preds_evaluated = mon_sess.run(predictions)
-            if not yield_single_examples:
-              yield preds_evaluated
-            elif not isinstance(predictions, dict):
-              for pred in preds_evaluated:
-                yield pred
-            else:
-              for i in range(self._extract_batch_length(preds_evaluated)):
-                yield {
-                    key: value[i]
-                    for key, value in six.iteritems(preds_evaluated)
-                }
-
-  def _assert_members_are_not_overridden(self):
-    """Asserts members of `Estimator` are not overridden."""
-    # TPUEstimator is special cased (owned by TF).
-    if self.__class__.__name__ == 'TPUEstimator':
-      return
-
-    allowed_overrides = set([
-        '_create_and_assert_global_step',
-        '_tf_api_names', '_tf_api_names_v1', '_estimator_api_names',
-        '_estimator_api_names_v1', '_estimator_api_constants',
-        '_estimator_api_constants_v1',
-    ])
-    estimator_members = set([m for m in Estimator.__dict__.keys()
-                             if not m.startswith('__')])
-    subclass_members = set(self.__class__.__dict__.keys())
-    common_members = estimator_members & subclass_members - allowed_overrides
-    overridden_members = [
-        m for m in common_members
-        if Estimator.__dict__[m] != self.__class__.__dict__[m]]
-    if overridden_members:
-      raise ValueError(
-          'Subclasses of Estimator cannot override members of Estimator. '
-          '{} does override {}'.format(self.__class__, overridden_members))
-
-  def export_savedmodel(
-      self, export_dir_base, serving_input_receiver_fn,
-      assets_extra=None,
-      as_text=False,
-      checkpoint_path=None,
-      strip_default_attrs=False):
-    # pylint: disable=line-too-long,g-doc-args,g-doc-return-or-yield
-    """Exports inference graph as a `SavedModel` into the given dir.
-
-    Note that `export_to_savedmodel` will be renamed to `export_saved_model`
-    in TensorFlow 2.0. At that time, `export_to_savedmodel` without the
-    additional underscore will be available only through tf.compat.v1.
-
-    Please see `tf.estimator.Estimator.export_saved_model` for more information.
-
-    There is one additional arg versus the new method:
-      strip_default_attrs: This parameter is going away in TF 2.0, and
-        the new behavior will automatically strip all default attributes.
-        Boolean. If `True`, default-valued attributes will be
-        removed from the `NodeDef`s. For a detailed guide, see [Stripping
-        Default-Valued Attributes](
-        https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
-    """
-    # pylint: enable=line-too-long,g-doc-args,g-doc-return-or-yield
-    return self._export_saved_model_for_mode(
-        export_dir_base,
-        serving_input_receiver_fn,
-        assets_extra=assets_extra,
-        as_text=as_text,
-        checkpoint_path=checkpoint_path,
-        strip_default_attrs=strip_default_attrs,
-        mode=model_fn_lib.ModeKeys.PREDICT)
-
-  def export_saved_model(
-      self, export_dir_base, serving_input_receiver_fn,
-      assets_extra=None,
-      as_text=False,
-      checkpoint_path=None):
-    # pylint: disable=line-too-long
-    """Exports inference graph as a `SavedModel` into the given dir.
-
-    For a detailed guide, see
-    [Using SavedModel with Estimators](https://tensorflow.org/guide/saved_model#using_savedmodel_with_estimators).
-
-    This method builds a new graph by first calling the
-    `serving_input_receiver_fn` to obtain feature `Tensor`s, and then calling
-    this `Estimator`'s `model_fn` to generate the model graph based on those
-    features. It restores the given checkpoint (or, lacking that, the most
-    recent checkpoint) into this graph in a fresh session.  Finally it creates
-    a timestamped export directory below the given `export_dir_base`, and writes
-    a `SavedModel` into it containing a single `tf.MetaGraphDef` saved from this
-    session.
-
-    The exported `MetaGraphDef` will provide one `SignatureDef` for each
-    element of the `export_outputs` dict returned from the `model_fn`, named
-    using
-    the same keys.  One of these keys is always
-    `tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`,
-    indicating which
-    signature will be served when a serving request does not specify one.
-    For each signature, the outputs are provided by the corresponding
-    `tf.estimator.export.ExportOutput`s, and the inputs are always the input
-    receivers provided by
-    the `serving_input_receiver_fn`.
-
-    Extra assets may be written into the `SavedModel` via the `assets_extra`
-    argument.  This should be a dict, where each key gives a destination path
-    (including the filename) relative to the assets.extra directory.  The
-    corresponding value gives the full path of the source file to be copied.
-    For example, the simple case of copying a single file without renaming it
-    is specified as `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
-
-    Args:
-      export_dir_base: A string containing a directory in which to create
-        timestamped subdirectories containing exported `SavedModel`s.
-      serving_input_receiver_fn: A function that takes no argument and returns a
-        `tf.estimator.export.ServingInputReceiver` or
-        `tf.estimator.export.TensorServingInputReceiver`.
-      assets_extra: A dict specifying how to populate the assets.extra directory
-        within the exported `SavedModel`, or `None` if no extra assets are
-        needed.
-      as_text: whether to write the `SavedModel` proto in text format.
-      checkpoint_path: The checkpoint path to export.  If `None` (the default),
-        the most recent checkpoint found within the model directory is chosen.
-
-    Returns:
-      The string path to the exported directory.
-
-    Raises:
-      ValueError: if no `serving_input_receiver_fn` is provided, no
-      `export_outputs` are provided, or no checkpoint can be found.
-    """
-    # pylint: enable=line-too-long
-    # TODO(b/111442174): `export_to_savedmodel` will be renamed to
-    # `export_saved_model` in TensorFlow 2.0. This function is a wrapper
-    # while staging the new version; do not add any logic here.
-    return self.export_savedmodel(
-        export_dir_base,
-        serving_input_receiver_fn,
-        assets_extra=assets_extra,
-        as_text=as_text,
-        checkpoint_path=checkpoint_path,
-        strip_default_attrs=True)
-
-  def _export_saved_model_for_mode(
-      self, export_dir_base, input_receiver_fn,
-      assets_extra=None,
-      as_text=False,
-      checkpoint_path=None,
-      strip_default_attrs=False,
-      mode=model_fn_lib.ModeKeys.PREDICT):
-    # pylint: disable=line-too-long
-    """Exports a single train/eval/predict graph as a `SavedModel`.
-
-    This method is a wrapper for `_export_all_saved_models`, and wraps a raw
-    `input_receiver_fn` in a dictionary to pass in to that function.
-    See `_export_all_saved_models` for full docs.
-
-    See `tf.contrib.estimator.export_saved_model_for_mode` for the currently
-    exposed version of this function.
-
-    Args:
-      export_dir_base: A string containing a directory in which to create
-        timestamped subdirectories containing exported `SavedModel`s.
-      input_receiver_fn: a function that takes no argument and returns the
-        appropriate subclass of `InputReceiver`.
-      assets_extra: A dict specifying how to populate the assets.extra directory
-        within the exported `SavedModel`, or `None` if no extra assets are
-        needed.
-      as_text: whether to write the `SavedModel` proto in text format.
-      checkpoint_path: The checkpoint path to export.  If `None` (the default),
-        the most recent checkpoint found within the model directory is chosen.
-      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-        removed from the `NodeDef`s. For a detailed guide, see [Stripping
-        Default-Valued
-        Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
-      mode: `tf.estimator.ModeKeys` value indicating with mode will be exported.
-
-    Returns:
-      The string path to the exported directory.
-
-    Raises:
-      ValueError: if `input_receiver_fn` is `None`, no `export_outputs`
-        are provided, or no checkpoint can be found.
-    """
-    # pylint: enable=line-too-long
-    if not input_receiver_fn:
-      raise ValueError('An input_receiver_fn must be defined.')
-
-    input_receiver_fn_map = {mode: input_receiver_fn}
-
-    return self._export_all_saved_models(
-        export_dir_base,
-        input_receiver_fn_map,
-        assets_extra=assets_extra,
-        as_text=as_text,
-        checkpoint_path=checkpoint_path,
-        strip_default_attrs=strip_default_attrs)
-
-  def _export_all_saved_models(
-      self, export_dir_base, input_receiver_fn_map,
-      assets_extra=None,
-      as_text=False,
-      checkpoint_path=None,
-      strip_default_attrs=False):
-    # pylint: disable=line-too-long
-    """Exports a `SavedModel` containing `tf.MetaGraphDefs` for each requested mode.
-
-    See `tf.contrib.estimator.export_all_saved_models` for the currently
-    exposed version of this function.
-
-    For each mode passed in via the `input_receiver_fn_map`,
-    this method builds a new graph by calling the `input_receiver_fn` to obtain
-    feature and label `Tensor`s. Next, this method calls the `Estimator`'s
-    `model_fn` in the passed mode to generate the model graph based on
-    those features and labels, and restores the given checkpoint
-    (or, lacking that, the most recent checkpoint) into the graph.
-    Only one of the modes is used for saving variables to the `SavedModel`
-    (order of preference: `tf.estimator.ModeKeys.TRAIN`,
-    `tf.estimator.ModeKeys.EVAL`, then
-    `tf.estimator.ModeKeys.PREDICT`), such that up to three
-    `tf.MetaGraphDefs` are saved with a single set of variables in a single
-    `SavedModel` directory.
-
-    For the variables and `tf.MetaGraphDefs`, a timestamped export directory
-    below
-    `export_dir_base`, and writes a `SavedModel` into it containing
-    the `tf.MetaGraphDef` for the given mode and its associated signatures.
-
-    For prediction, the exported `MetaGraphDef` will provide one `SignatureDef`
-    for each element of the `export_outputs` dict returned from the `model_fn`,
-    named using the same keys.  One of these keys is always
-    `tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY`,
-    indicating which
-    signature will be served when a serving request does not specify one.
-    For each signature, the outputs are provided by the corresponding
-    `tf.estimator.export.ExportOutput`s, and the inputs are always the input
-    receivers provided by
-    the `serving_input_receiver_fn`.
-
-    For training and evaluation, the `train_op` is stored in an extra
-    collection,
-    and loss, metrics, and predictions are included in a `SignatureDef` for the
-    mode in question.
-
-    Extra assets may be written into the `SavedModel` via the `assets_extra`
-    argument.  This should be a dict, where each key gives a destination path
-    (including the filename) relative to the assets.extra directory.  The
-    corresponding value gives the full path of the source file to be copied.
-    For example, the simple case of copying a single file without renaming it
-    is specified as `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
-
-    Args:
-      export_dir_base: A string containing a directory in which to create
-        timestamped subdirectories containing exported `SavedModel`s.
-      input_receiver_fn_map: dict of `tf.estimator.ModeKeys` to
-        `input_receiver_fn` mappings, where the `input_receiver_fn` is a
-        function that takes no arguments and returns the appropriate subclass of
-        `InputReceiver`.
-      assets_extra: A dict specifying how to populate the assets.extra directory
-        within the exported `SavedModel`, or `None` if no extra assets are
-        needed.
-      as_text: whether to write the `SavedModel` proto in text format.
-      checkpoint_path: The checkpoint path to export.  If `None` (the default),
-        the most recent checkpoint found within the model directory is chosen.
-      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-        removed from the `NodeDef`s. For a detailed guide, see [Stripping
-        Default-Valued
-        Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
-
-    Returns:
-      A dict of `tf.estimator.ModeKeys` value to string path for each exported
-      directory.
-
-    Raises:
-      ValueError: if any `input_receiver_fn` is `None`, no `export_outputs`
-        are provided, or no checkpoint can be found.
-    """
-    # pylint: enable=line-too-long
-    # TODO(b/65561022): Consider allowing multiple input_receiver_fns per mode.
-    with context.graph_mode():
-      if not checkpoint_path:
-        # Locate the latest checkpoint
-        checkpoint_path = checkpoint_management.latest_checkpoint(
-            self._model_dir)
-      if not checkpoint_path:
-        raise ValueError("Couldn't find trained model at %s." % self._model_dir)
-
-      export_dir = export_helpers.get_timestamped_export_dir(export_dir_base)
-      temp_export_dir = export_helpers.get_temp_export_dir(export_dir)
-
-      builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
-
-      save_variables = True
-      # Note that the order in which we run here matters, as the first
-      # mode we pass through will be used to save the variables. We run TRAIN
-      # first, as that is also the mode used for checkpoints, and therefore
-      # we are not likely to have vars in PREDICT that are not in the checkpoint
-      # created by TRAIN.
-      if input_receiver_fn_map.get(model_fn_lib.ModeKeys.TRAIN):
-        self._add_meta_graph_for_mode(
-            builder, input_receiver_fn_map, checkpoint_path,
-            strip_default_attrs, save_variables,
-            mode=model_fn_lib.ModeKeys.TRAIN)
-        save_variables = False
-      if input_receiver_fn_map.get(model_fn_lib.ModeKeys.EVAL):
-        self._add_meta_graph_for_mode(
-            builder, input_receiver_fn_map, checkpoint_path,
-            strip_default_attrs, save_variables,
-            mode=model_fn_lib.ModeKeys.EVAL)
-        save_variables = False
-      if input_receiver_fn_map.get(model_fn_lib.ModeKeys.PREDICT):
-        self._add_meta_graph_for_mode(
-            builder, input_receiver_fn_map, checkpoint_path,
-            strip_default_attrs, save_variables,
-            mode=model_fn_lib.ModeKeys.PREDICT)
-        save_variables = False
-
-      if save_variables:
-        raise ValueError('No valid modes for exporting found. Got {}.'.format(
-            input_receiver_fn_map.keys()))
-
-      builder.save(as_text)
-
-      # Add the extra assets
-      if assets_extra:
-        assets_extra_path = os.path.join(compat.as_bytes(temp_export_dir),
-                                         compat.as_bytes('assets.extra'))
-        for dest_relative, source in assets_extra.items():
-          dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
-                                       compat.as_bytes(dest_relative))
-          dest_path = os.path.dirname(dest_absolute)
-          gfile.MakeDirs(dest_path)
-          gfile.Copy(source, dest_absolute)
-
-      gfile.Rename(temp_export_dir, export_dir)
-      return export_dir
-
-  def _add_meta_graph_for_mode(self,
-                               builder,
-                               input_receiver_fn_map,
-                               checkpoint_path,
-                               strip_default_attrs,
-                               save_variables=True,
-                               mode=model_fn_lib.ModeKeys.PREDICT,
-                               export_tags=None,
-                               check_variables=True):
-    # pylint: disable=line-too-long
-    """Loads variables and adds them along with a `tf.MetaGraphDef` for saving.
-
-    Args:
-      builder: instance of `tf.saved_modle.builder.SavedModelBuilder` that will
-        be used for saving.
-      input_receiver_fn_map: dict of `tf.estimator.ModeKeys` to
-        `input_receiver_fn` mappings, where the `input_receiver_fn` is a
-        function that takes no argument and returns the appropriate subclass of
-        `InputReceiver`.
-      checkpoint_path: The checkpoint path to export.  If `None` (the default),
-        the most recent checkpoint found within the model directory is chosen.
-      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-        removed from the `NodeDef`s. For a detailed guide, see [Stripping
-        Default-Valued
-        Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
-      save_variables: bool, whether variables should be saved. If `False`, just
-        the `tf.MetaGraphDef` will be saved. Note that `save_variables` should
-        only be `True` for the first call to this function, and the
-        `SavedModelBuilder` will raise an error if that is not the case.
-      mode: `tf.estimator.ModeKeys` value indicating which mode will be
-        exported.
-      export_tags: The set of tags with which to save `tf.MetaGraphDef`. If
-        `None`, a default set will be selected to matched the passed mode.
-      check_variables: bool, whether to check the checkpoint has all variables.
-
-    Raises:
-      ValueError: if `save_variables` is `True` and `check_variable` is `False`.
-    """
-    # pylint: enable=line-too-long
-    if export_tags is None:
-      export_tags = model_fn_lib.EXPORT_TAG_MAP[mode]
-    input_receiver_fn = input_receiver_fn_map[mode]
-
-    with ops.Graph().as_default() as g:
-      self._create_and_assert_global_step(g)
-      random_seed.set_random_seed(self._config.tf_random_seed)
-
-      input_receiver = input_receiver_fn()
-
-      # Call the model_fn and collect the export_outputs.
-      estimator_spec = self._call_model_fn(
-          features=input_receiver.features,
-          labels=getattr(input_receiver, 'labels', None),
-          mode=mode,
-          config=self.config)
-
-      export_outputs = model_fn_lib.export_outputs_for_mode(
-          mode=estimator_spec.mode,
-          serving_export_outputs=estimator_spec.export_outputs,
-          predictions=estimator_spec.predictions,
-          loss=estimator_spec.loss,
-          metrics=estimator_spec.eval_metric_ops)
-
-      # Build the SignatureDefs from receivers and all outputs
-      signature_def_map = export_helpers.build_all_signature_defs(
-          input_receiver.receiver_tensors,
-          export_outputs,
-          getattr(input_receiver, 'receiver_tensors_alternatives', None),
-          serving_only=(mode == model_fn_lib.ModeKeys.PREDICT))
-
-      with tf_session.Session(config=self._session_config) as session:
-
-        if estimator_spec.scaffold.local_init_op is not None:
-          local_init_op = estimator_spec.scaffold.local_init_op
-        else:
-          local_init_op = monitored_session.Scaffold.default_local_init_op()
-
-        # This saver will be used both for restoring variables now,
-        # and in saving out the metagraph below. This ensures that any
-        # Custom Savers stored with the Scaffold are passed through to the
-        # SavedModel for restore later.
-        graph_saver = estimator_spec.scaffold.saver or saver.Saver(sharded=True)
-
-        if save_variables and not check_variables:
-          raise ValueError('If `save_variables` is `True, `check_variables`'
-                           'must not be `False`.')
-        if check_variables:
-          try:
-            graph_saver.restore(session, checkpoint_path)
-          except errors.NotFoundError as e:
-            msg = ('Could not load all requested variables from checkpoint. '
-                   'Please make sure your model_fn does not expect variables '
-                   'that were not saved in the checkpoint.\n\n'
-                   'Encountered error with mode `{}` while restoring '
-                   'checkpoint from: `{}`. Full Traceback:\n\n{}').format(
-                       mode, checkpoint_path, e)
-            raise ValueError(msg)
-
-        # We add the train op explicitly for now, so that we don't have to
-        # change the Builder public interface. Note that this is a no-op
-        # for prediction, where train_op is None.
-        builder._add_train_op(estimator_spec.train_op)  # pylint: disable=protected-access
-
-        meta_graph_kwargs = dict(
-            tags=export_tags,
-            signature_def_map=signature_def_map,
-            assets_collection=ops.get_collection(
-                ops.GraphKeys.ASSET_FILEPATHS),
-            strip_default_attrs=strip_default_attrs,
-            legacy_init_op=local_init_op,
-            saver=graph_saver)
-
-        if save_variables:
-          builder.add_meta_graph_and_variables(
-              session, **meta_graph_kwargs)
-        else:
-          builder.add_meta_graph(**meta_graph_kwargs)
-
-  def _get_features_from_input_fn(self, input_fn, mode):
-    """Extracts the `features` from return values of `input_fn`."""
-    result = self._call_input_fn(input_fn, mode)
-    result, _, hooks = estimator_util.parse_input_fn_result(result)
-    self._validate_features_in_predict_input(result)
-    return result, hooks
-
-  def _validate_features_in_predict_input(self, result):
-    if not _has_dataset_or_queue_runner(result):
-      logging.warning('Input graph does not use tf.data.Dataset or contain a '
-                      'QueueRunner. That means predict yields forever. '
-                      'This is probably a mistake.')
-
-  def _get_iterator_from_input_fn(self, input_fn, mode, distribution=None):
-    if distribution is not None:
-      result = distribution.distribute_dataset(
-          lambda: self._call_input_fn(input_fn, mode))
-    else:
-      result = self._call_input_fn(input_fn, mode)
-
-    iterator = result.make_initializable_iterator()
-    input_hooks = [estimator_util._DatasetInitializerHook(iterator)]  # pylint: disable=protected-access
-    return iterator, input_hooks
-
-  def _get_features_and_labels_from_input_fn(self, input_fn, mode):
-    """Extracts the `features` and labels from return values of `input_fn`."""
-    return estimator_util.parse_input_fn_result(
-        self._call_input_fn(input_fn, mode))
-
-  def _extract_batch_length(self, preds_evaluated):
-    """Extracts batch length of predictions."""
-    batch_length = None
-    for key, value in six.iteritems(preds_evaluated):
-      batch_length = batch_length or value.shape[0]
-      if value.shape[0] != batch_length:
-        raise ValueError('Batch length of predictions should be same. %s has '
-                         'different batch length than others.' % key)
-    return batch_length
-
-  def _extract_keys(self, predictions, predict_keys):
-    """Extracts `predict_keys` from `predictions`."""
-    if not predict_keys:
-      return predictions
-    if not isinstance(predictions, dict):
-      raise ValueError(
-          'predict_keys argument is not valid in case of non-dict predictions.')
-    existing_keys = predictions.keys()
-    predictions = {
-        key: value
-        for key, value in six.iteritems(predictions) if key in predict_keys
-    }
-    if not predictions:
-      raise ValueError('Expected to run at least one output from %s, '
-                       'provided %s.' % (existing_keys, predict_keys))
-    return predictions
-
-  def _create_global_step(self, graph):
-    """Creates the global step tensor in graph.
-
-    The global step tensor must be an integer type with name 'global_step' and
-    be added to the collection `tf.GraphKeys.GLOBAL_STEP`.
-
-    Args:
-      graph: The graph in which to create the global step tensor.
-
-    Returns:
-      The global step `tf.Tensor`.
-    """
-    return training.create_global_step(graph)
-
-  def _create_and_assert_global_step(self, graph):
-    """Creates and asserts properties of the global step.
-
-    Args:
-      graph: The graph in which to create the global step tensor.
-
-    Returns:
-      The global step `tf.Tensor`.
-    """
-    step = self._create_global_step(graph)
-    assert step == training.get_global_step()
-    assert step.dtype.is_integer
-    return step
-
-  def _call_input_fn(self, input_fn, mode):
-    """Calls the input function.
-
-    Args:
-      input_fn: The input function.
-      mode: `tf.estimator.ModeKeys`
-
-    Returns:
-      The return value of the passed `input_fn`, which should be one of:
-
-        * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
-            tuple `(features, labels)` with same constraints as below.
-        * A tuple `(features, labels)`: Where `features` is a `Tensor` or a
-          dictionary of string feature name to `Tensor` and `labels` is a
-          `Tensor` or a dictionary of string label name to `Tensor`. Both
-          `features` and `labels` are consumed by `model_fn`. They should
-          satisfy the expectation of `model_fn` from inputs.
-
-    Raises:
-      ValueError: if `input_fn` takes invalid arguments.
-    """
-    input_fn_args = function_utils.fn_args(input_fn)
-    kwargs = {}
-    if 'mode' in input_fn_args:
-      kwargs['mode'] = mode
-    if 'params' in input_fn_args:
-      kwargs['params'] = self.params
-    if 'config' in input_fn_args:
-      kwargs['config'] = self.config
-    with ops.device('/cpu:0'):
-      return input_fn(**kwargs)
-
-  def _call_model_fn(self, features, labels, mode, config):
-    """Calls model function.
-
-    Args:
-      features: features dict.
-      labels: labels dict.
-      mode: `tf.estimator.ModeKeys`
-      config: `tf.estimator.RunConfig`
-
-    Returns:
-      An `tf.estimator.EstimatorSpec` object.
-
-    Raises:
-      ValueError: if `model_fn` returns invalid objects.
-    """
-    model_fn_args = function_utils.fn_args(self._model_fn)
-    kwargs = {}
-    if 'labels' in model_fn_args:
-      kwargs['labels'] = labels
-    else:
-      if labels is not None:
-        raise ValueError(
-            'model_fn does not take labels, but input_fn returns labels.')
-    if 'mode' in model_fn_args:
-      kwargs['mode'] = mode
-    if 'params' in model_fn_args:
-      kwargs['params'] = self.params
-    if 'config' in model_fn_args:
-      kwargs['config'] = config
-
-    logging.info('Calling model_fn.')
-    model_fn_results = self._model_fn(features=features, **kwargs)
-    logging.info('Done calling model_fn.')
-
-    if not isinstance(model_fn_results, model_fn_lib.EstimatorSpec):
-      raise ValueError('model_fn should return an EstimatorSpec.')
-
-    return model_fn_results
-
-  def _train_model(self, input_fn, hooks, saving_listeners):
-    if self._train_distribution:
-      return self._train_model_distributed(input_fn, hooks, saving_listeners)
-    else:
-      return self._train_model_default(input_fn, hooks, saving_listeners)
-
-  def _train_model_default(self, input_fn, hooks, saving_listeners):
-    """Initiate training with `input_fn`, without `DistributionStrategies`.
-
-    Args:
-      input_fn: A function that provides input data for training as minibatches.
-      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
-        callbacks inside the training loop.
-      saving_listeners: list of `tf.train.CheckpointSaverListener` objects. Used
-        for callbacks that run immediately before or after checkpoint savings.
-
-    Returns:
-      Loss from training
-    """
-    worker_hooks = []
-    with ops.Graph().as_default() as g, g.device(self._device_fn):
-      random_seed.set_random_seed(self._config.tf_random_seed)
-      global_step_tensor = self._create_and_assert_global_step(g)
-
-      # Skip creating a read variable if _create_and_assert_global_step
-      # returns None (e.g. tf.contrib.estimator.SavedModelEstimator).
-      if global_step_tensor is not None:
-        training_util._get_or_create_global_step_read(g)  # pylint: disable=protected-access
-
-      features, labels, input_hooks = (
-          self._get_features_and_labels_from_input_fn(
-              input_fn, model_fn_lib.ModeKeys.TRAIN))
-      worker_hooks.extend(input_hooks)
-      estimator_spec = self._call_model_fn(
-          features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
-      global_step_tensor = training_util.get_global_step(g)
-      return self._train_with_estimator_spec(estimator_spec, worker_hooks,
-                                             hooks, global_step_tensor,
-                                             saving_listeners)
-
-  def _train_model_distributed(self, input_fn, hooks, saving_listeners):
-    """Initiate training with `input_fn`, using `DistributionStrategies`.
-
-    Args:
-      input_fn: A function that provides input data for training as minibatches.
-      hooks: List of `tf.train.SessionRunHook` subclass instances. Used for
-        callbacks inside the training loop.
-      saving_listeners: list of `tf.train.CheckpointSaverListener` objects. Used
-        for callbacks that run immediately before or after checkpoint savings.
-
-    Returns:
-      Loss from training
-    """
-    self._train_distribution.configure(self._session_config)
-
-    # TODO(sourabhbajaj): Remove this hack once we migrate the other strategies
-    # to use the new API
-    is_tpu_strategy = (
-        self._train_distribution.__class__.__name__ == 'TPUStrategy')
-
-    worker_hooks = []
-    with ops.Graph().as_default() as g:
-      # We want to create the iterations variable outside the distribution scope
-      # as that is just stored on the host and mainly used to drive the loop
-      # and doesn't need to be a Mirrored/Device variable.
-      if is_tpu_strategy:
-        steps_per_run_variable = training.get_or_create_steps_per_run_variable()
-      with self._train_distribution.scope():
-        random_seed.set_random_seed(self._config.tf_random_seed)
-        iterator, input_hooks = self._get_iterator_from_input_fn(
-            input_fn, model_fn_lib.ModeKeys.TRAIN, self._train_distribution)
-        worker_hooks.extend(input_hooks)
-        global_step_tensor = self._create_and_assert_global_step(g)
-        # we want to add to the global collection in the main thread not the
-        # tower threads.
-        ops.add_to_collection(
-            training_util.GLOBAL_STEP_READ_KEY,
-            self._train_distribution.read_var(global_step_tensor))
-
-        if is_tpu_strategy:
-          # Create a step_fn from the train_op of grouped_estimator_spec
-          def step_fn(ctx, features, labels=None):
-            """A single step that is passed to run_on_dataset."""
-            estimator_spec = self._train_distribution.call_for_each_tower(
-                self._call_model_fn,
-                features,
-                labels,
-                model_fn_lib.ModeKeys.TRAIN,
-                self.config)
-            ctx.set_last_step_output(
-                name='loss',
-                output=estimator_spec.loss,
-                aggregation=distribute_lib.get_loss_reduction())
-            ctx.set_non_tensor_output(
-                name='estimator_spec', output=estimator_spec)
-            return estimator_spec.train_op
-
-          # Create new train_op post graph rewrites
-          initial_training_loss = constant_op.constant(1e7)
-          ctx = self._train_distribution.run_steps_on_dataset(
-              step_fn, iterator, iterations=steps_per_run_variable,
-              initial_loop_values={'loss': initial_training_loss})
-          distributed_train_op = ctx.run_op
-          loss = ctx.last_step_outputs['loss']
-          grouped_estimator_spec = ctx.non_tensor_outputs['estimator_spec']
-        else:
-          features, labels = estimator_util.parse_iterator_result(
-              iterator.get_next())
-          grouped_estimator_spec = self._train_distribution.call_for_each_tower(
-              self._call_model_fn,
-              features,
-              labels,  # although this will be None it seems
-              model_fn_lib.ModeKeys.TRAIN,
-              self.config)
-          loss = self._train_distribution.unwrap(
-              self._train_distribution.reduce(
-                  distribute_lib.get_loss_reduction(),
-                  grouped_estimator_spec.loss,
-                  destinations='/device:CPU:0'))[0]
-          distributed_train_op = grouped_estimator_spec.train_op
-
-        scaffold = _combine_distributed_scaffold(
-            grouped_estimator_spec.scaffold, self._train_distribution)
-
-        # TODO(yuefengz): add a test for unwrapping per_device_hooks.
-        def get_hooks_from_the_first_device(per_device_hooks):
-          return [
-              self._distribution.unwrap(per_device_hook)[0]
-              for per_device_hook in per_device_hooks
-          ]
-
-        training_hooks = get_hooks_from_the_first_device(
-            grouped_estimator_spec.training_hooks)
-        training_chief_hooks = get_hooks_from_the_first_device(
-            grouped_estimator_spec.training_chief_hooks)
-        worker_hooks.append(
-            estimator_util.StrategyInitFinalizeHook(
-                self._train_distribution.initialize,
-                self._train_distribution.finalize))
-
-        estimator_spec = model_fn_lib.EstimatorSpec(
-            mode=grouped_estimator_spec.mode,
-            loss=loss,
-            train_op=self._train_distribution.group(distributed_train_op),
-            training_hooks=training_hooks,
-            training_chief_hooks=training_chief_hooks,
-            scaffold=scaffold)
-        return self._train_with_estimator_spec(estimator_spec, worker_hooks,
-                                               hooks, global_step_tensor,
-                                               saving_listeners)
-
-  def _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks,
-                                 global_step_tensor, saving_listeners):
-    """Train a model with the given Estimator Spec."""
-    if self._warm_start_settings:
-      logging.info('Warm-starting with WarmStartSettings: %s' %
-                   (self._warm_start_settings,))
-      warm_starting_util.warm_start(*self._warm_start_settings)
-    # Check if the user created a loss summary, and add one if they didn't.
-    # We assume here that the summary is called 'loss'. If it is not, we will
-    # make another one with the name 'loss' to ensure it shows up in the right
-    # graph in TensorBoard.
-    if not any([x.op.name == 'loss'
-                for x in ops.get_collection(ops.GraphKeys.SUMMARIES)]):
-      summary.scalar('loss', estimator_spec.loss)
-    ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
-    worker_hooks.extend(hooks)
-    worker_hooks.append(
-        training.NanTensorHook(estimator_spec.loss)
-    )
-    if self._config.log_step_count_steps is not None:
-      worker_hooks.append(
-          training.LoggingTensorHook(
-              {
-                  'loss': estimator_spec.loss,
-                  'step': global_step_tensor
-              },
-              every_n_iter=self._config.log_step_count_steps)
-      )
-    worker_hooks.extend(estimator_spec.training_hooks)
-
-    if not (estimator_spec.scaffold.saver or
-            ops.get_collection(ops.GraphKeys.SAVERS)):
-      ops.add_to_collection(
-          ops.GraphKeys.SAVERS,
-          training.Saver(
-              sharded=True,
-              max_to_keep=self._config.keep_checkpoint_max,
-              keep_checkpoint_every_n_hours=(
-                  self._config.keep_checkpoint_every_n_hours),
-              defer_build=True,
-              save_relative_paths=True))
-
-    chief_hooks = []
-    all_hooks = worker_hooks + list(estimator_spec.training_chief_hooks)
-    saver_hooks = [
-        h for h in all_hooks if isinstance(h, training.CheckpointSaverHook)]
-    if (self._config.save_checkpoints_secs or
-        self._config.save_checkpoints_steps):
-      if not saver_hooks:
-        chief_hooks = [
-            training.CheckpointSaverHook(
-                self._model_dir,
-                save_secs=self._config.save_checkpoints_secs,
-                save_steps=self._config.save_checkpoints_steps,
-                scaffold=estimator_spec.scaffold)
-        ]
-        saver_hooks = [chief_hooks[0]]
-    if saving_listeners:
-      if not saver_hooks:
-        raise ValueError(
-            'There should be a CheckpointSaverHook to use saving_listeners. '
-            'Please set one of the RunConfig.save_checkpoints_steps or '
-            'RunConfig.save_checkpoints_secs.')
-      else:
-        # It is expected to have one CheckpointSaverHook. If multiple, we pick
-        # up the first one to add listener.
-        saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
-
-    # Add summary hooks to worker 0 if we are running with a master, to ensure
-    # that summaries are written at correct intervals even with long-running
-    # evaluations.
-    save_summary_steps = self._config.save_summary_steps
-    log_step_count_steps = self._config.log_step_count_steps
-    if (self._config.cluster_spec and self._config.cluster_spec.jobs and
-        (run_config.TaskType.MASTER in self._config.cluster_spec.jobs)):
-      # Update config values to prevent the default hooks from being created on
-      # the master or other workers.
-      save_summary_steps = 0
-      log_step_count_steps = None
-
-      if (self._config.task_type == run_config.TaskType.WORKER and
-          self._config.task_id == 0):
-        if (self._config.save_summary_steps and
-            self._config.save_summary_steps > 0):
-          worker_hooks.append(
-              training.SummarySaverHook(
-                  save_steps=self._config.save_summary_steps,
-                  output_dir=self._config.model_dir,
-                  scaffold=estimator_spec.scaffold))
-
-        if (self._config.log_step_count_steps and
-            self._config.log_step_count_steps > 0):
-          worker_hooks.append(
-              training.StepCounterHook(
-                  every_n_steps=self._config.log_step_count_steps,
-                  output_dir=self._config.model_dir))
-
-    with training.MonitoredTrainingSession(
-        master=self._config.master,
-        is_chief=self._config.is_chief,
-        checkpoint_dir=self._model_dir,
-        scaffold=estimator_spec.scaffold,
-        hooks=worker_hooks,
-        chief_only_hooks=(
-            tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
-        save_checkpoint_secs=0,  # Saving is handled by a hook.
-        save_summaries_steps=save_summary_steps,
-        config=self._session_config,
-        log_step_count_steps=log_step_count_steps) as mon_sess:
-      loss = None
-      while not mon_sess.should_stop():
-        _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
-    return loss
-
-  def _evaluate_build_graph(self, input_fn, hooks=None, checkpoint_path=None):
-    """Builds the graph and related hooks to run evaluation."""
-    random_seed.set_random_seed(self._config.tf_random_seed)
-    self._create_and_assert_global_step(ops.get_default_graph())
-
-    if self._eval_distribution:
-      (scaffold, evaluation_hooks, input_hooks, update_op, eval_dict) = (
-          self._call_model_fn_eval_distributed(input_fn, self.config))
-    else:
-      (scaffold, evaluation_hooks, input_hooks, update_op, eval_dict) = (
-          self._call_model_fn_eval(input_fn, self.config))
-
-    global_step_tensor = training_util.get_global_step(ops.get_default_graph())
-    # Call to warm_start has to be after model_fn is called.
-    self._maybe_warm_start(checkpoint_path)
-
-    if ops.GraphKeys.GLOBAL_STEP in eval_dict:
-      raise ValueError(
-          'Metric with name `global_step` is not allowed, because Estimator '
-          'already defines a default metric with the same name.')
-    eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor
-
-    all_hooks = list(input_hooks)
-    all_hooks.extend(hooks)
-    all_hooks.extend(list(evaluation_hooks or []))
-    # New local variables have been added, so update the estimator spec's
-    # local init op if it was defined.
-    if scaffold and scaffold.local_init_op:
-      # Ensure that eval step has been created before updating local init op.
-      evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
-
-      scaffold = monitored_session.Scaffold(
-          local_init_op=control_flow_ops.group(
-              scaffold.local_init_op,
-              monitored_session.Scaffold.default_local_init_op()),
-          copy_from_scaffold=scaffold
-      )
-
-    return scaffold, update_op, eval_dict, all_hooks
-
-  def _call_model_fn_eval(self, input_fn, config):
-    """Call model_fn for evaluation and handle return values."""
-    features, labels, input_hooks = self._get_features_and_labels_from_input_fn(
-        input_fn, model_fn_lib.ModeKeys.EVAL)
-
-    estimator_spec = self._call_model_fn(
-        features, labels, model_fn_lib.ModeKeys.EVAL, config)
-    eval_metric_ops = _verify_and_create_loss_metric(
-        estimator_spec.eval_metric_ops, estimator_spec.loss)
-    update_op, eval_dict = _extract_metric_update_ops(eval_metric_ops)
-    return (estimator_spec.scaffold, estimator_spec.evaluation_hooks,
-            input_hooks, update_op, eval_dict)
-
-  def _call_model_fn_eval_distributed(self, input_fn, config):
-    """Call model_fn in distribution mode and handle return values."""
-
-    iterator, input_hooks = self._get_iterator_from_input_fn(
-        input_fn, model_fn_lib.ModeKeys.EVAL, self._eval_distribution)
-
-    is_tpu_strategy = (
-        self._eval_distribution.__class__.__name__ == 'TPUStrategy')
-
-    if is_tpu_strategy:
-      steps_per_run_variable = training.get_or_create_steps_per_run_variable()
-      def step_fn(ctx, features, labels=None):
-        """Runs one step of the eval computation and captures outputs."""
-        estimator_spec = self._eval_distribution.call_for_each_tower(
-            self._call_model_fn, features, labels, model_fn_lib.ModeKeys.EVAL,
-            config)
-        eval_metric_ops = _verify_and_create_loss_metric(
-            estimator_spec.eval_metric_ops, estimator_spec.loss,
-            self._eval_distribution)
-        update_op, eval_dict = _extract_metric_update_ops(
-            eval_metric_ops, self._eval_distribution)
-        ctx.set_non_tensor_output(name='estimator_spec', output=estimator_spec)
-        ctx.set_non_tensor_output(name='eval_dict', output=eval_dict)
-        return update_op
-
-      # TODO(priyag): Fix eval step hook to account for steps_per_run.
-      ctx = self._eval_distribution.run_steps_on_dataset(
-          step_fn, iterator, iterations=steps_per_run_variable)
-      update_op = ctx.run_op
-      eval_dict = ctx.non_tensor_outputs['eval_dict']
-      grouped_estimator_spec = ctx.non_tensor_outputs['estimator_spec']
-    else:
-      features, labels = estimator_util.parse_iterator_result(
-          iterator.get_next())
-      grouped_estimator_spec = self._eval_distribution.call_for_each_tower(
-          self._call_model_fn, features, labels,
-          model_fn_lib.ModeKeys.EVAL, config)
-      eval_metric_ops = _verify_and_create_loss_metric(
-          grouped_estimator_spec.eval_metric_ops, grouped_estimator_spec.loss,
-          self._eval_distribution)
-      update_op, eval_dict = _extract_metric_update_ops(
-          eval_metric_ops, self._eval_distribution)
-
-    scaffold = _combine_distributed_scaffold(
-        grouped_estimator_spec.scaffold, self._eval_distribution)
-    evaluation_hooks = self._eval_distribution.unwrap(
-        grouped_estimator_spec.evaluation_hooks)[0]
-    evaluation_hooks = evaluation_hooks + (
-        estimator_util.StrategyInitFinalizeHook(
-            self._eval_distribution.initialize,
-            self._eval_distribution.finalize),)
-
-    return (scaffold, evaluation_hooks, input_hooks, update_op, eval_dict)
-
-  def _evaluate_run(self, checkpoint_path, scaffold, update_op, eval_dict,
-                    all_hooks, output_dir):
-    """Run evaluation."""
-    eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
-        checkpoint_path=checkpoint_path,
-        master=self._config.evaluation_master,
-        scaffold=scaffold,
-        eval_ops=update_op,
-        final_ops=eval_dict,
-        hooks=all_hooks,
-        config=self._session_config)
-
-    current_global_step = eval_results[ops.GraphKeys.GLOBAL_STEP]
-
-    _write_dict_to_summary(
-        output_dir=output_dir,
-        dictionary=eval_results,
-        current_global_step=current_global_step)
-
-    if checkpoint_path:
-      _write_checkpoint_path_to_summary(
-          output_dir=output_dir,
-          checkpoint_path=checkpoint_path,
-          current_global_step=current_global_step)
-
-    return eval_results
-
-  def _maybe_warm_start(self, checkpoint_path):
-    if not checkpoint_path and self._warm_start_settings:
-      logging.info('Warm-starting with WarmStartSettings: %s' %
-                   (self._warm_start_settings,))
-      warm_starting_util.warm_start(*self._warm_start_settings)
-
-
-def _verify_and_create_loss_metric(eval_metric_ops, loss, distribution=None):
-  """Creates a metric for loss and throws an error if one already exists."""
-  if model_fn_lib.LOSS_METRIC_KEY in eval_metric_ops:
-    raise ValueError(
-        'Metric with name "%s" is not allowed, because Estimator ' %
-        (model_fn_lib.LOSS_METRIC_KEY) +
-        'already defines a default metric with the same name.')
-
-  if distribution is None:
-    loss_metric = metrics_lib.mean(loss)
-  else:
-    loss_metric = distribution.call_for_each_tower(
-        metrics_lib.mean, loss)
-  eval_metric_ops[model_fn_lib.LOSS_METRIC_KEY] = loss_metric
-  return eval_metric_ops
-
-
-def maybe_overwrite_model_dir_and_session_config(config, model_dir):
-  """Overwrite estimator config by `model_dir` and `session_config` if needed.
-
-  Args:
-    config: Original estimator config.
-    model_dir: Estimator model checkpoint directory.
-
-  Returns:
-    Overwritten estimator config.
-
-  Raises:
-    ValueError: Model directory inconsistent between `model_dir` and `config`.
-  """
-
-  if config is None:
-    config = run_config.RunConfig()
-    logging.info('Using default config.')
-  if not isinstance(config, run_config.RunConfig):
-    raise ValueError(
-        'config must be an instance of `RunConfig`, but provided %s.' % config)
-
-  if config.session_config is None:
-    session_config = run_config.get_default_session_config()
-    config = run_config.RunConfig.replace(config, session_config=session_config)
-
-  model_dir = compat_internal.path_to_str(model_dir)
-  if model_dir is not None:
-    if (getattr(config, 'model_dir', None) is not None and
-        config.model_dir != model_dir):
-      raise ValueError(
-          "`model_dir` are set both in constructor and `RunConfig`, but with "
-          "different values. In constructor: '{}', in `RunConfig`: "
-          "'{}' ".format(model_dir, config.model_dir))
-  if model_dir:
-    config = run_config.RunConfig.replace(config, model_dir=model_dir)
-  elif getattr(config, 'model_dir', None) is None:
-    model_dir = tempfile.mkdtemp()
-    logging.warning('Using temporary folder as model directory: %s', model_dir)
-    config = run_config.RunConfig.replace(config, model_dir=model_dir)
-
-  return config
-
-
-def create_per_tower_ready_for_local_init_op(scaffold):
-  """Create a `tf.train.Scaffold.ready_for_local_init_op` inside a tower."""
-  if scaffold.ready_for_local_init_op:
-    return scaffold.ready_for_local_init_op
-
-  def default_ready_for_local_init_op():
-    return variables.report_uninitialized_variables(
-        variables.global_variables())
-
-  return monitored_session.Scaffold.get_or_default(
-      'ready_for_local_init_op', ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
-      default_ready_for_local_init_op)
-
-
-def _combine_distributed_scaffold(grouped_scaffold, distribution):
-  """Combines scaffold(s) returned from `distribution.call_for_each_tower`."""
-
-  # TODO(anjalisridhar): Figure out how to resolve the following scaffold
-  # parameters: init_feed_dict, init_fn.
-  scaffold_list = distribution.unwrap(grouped_scaffold)
-  init_feed_dict = [
-      s.init_feed_dict
-      for s in scaffold_list
-      if s.init_feed_dict is not None
-  ]
-  if init_feed_dict:
-    init_feed_dict = distribution.group(init_feed_dict)
-  else:
-    init_feed_dict = None
-
-  init_fn = [s.init_fn for s in scaffold_list if s.init_fn is not None]
-  if init_fn:
-    init_fn = distribution.group(init_fn)
-  else:
-    init_fn = None
-
-  init_op = [s.init_op for s in scaffold_list if s.init_op is not None]
-  if init_op:
-    init_op = distribution.group(init_op)
-  else:
-    init_op = None
-
-  def _unwrap_and_concat(value):
-    value = nest.flatten(distribution.unwrap(value))
-    if len(value) != 1:
-      return array_ops.concat(value, 0)
-    return value[0]
-
-  ready_op = distribution.call_for_each_tower(
-      lambda scaffold: scaffold.ready_op, grouped_scaffold)
-  if ready_op is not None:
-    ready_op = _unwrap_and_concat(ready_op)
-
-  ready_for_local_init_op = distribution.call_for_each_tower(
-      create_per_tower_ready_for_local_init_op, grouped_scaffold)
-  if ready_for_local_init_op is not None:
-    ready_for_local_init_op = _unwrap_and_concat(ready_for_local_init_op)
-  else:
-    ready_for_local_init_op = None
-
-  local_init_op = [
-      s.local_init_op
-      for s in scaffold_list
-      if s.local_init_op is not None
-  ]
-  if local_init_op:
-    local_init_op = distribution.group(local_init_op)
-  else:
-    local_init_op = None
-
-  summary_op = [
-      s.summary_op for s in scaffold_list if s.summary_op is not None
-  ]
-  if summary_op:
-    summary_op = distribution.group(summary_op)
-  else:
-    summary_op = None
-
-  scaffold = monitored_session.Scaffold(
-      init_op=init_op,
-      ready_op=ready_op,
-      ready_for_local_init_op=ready_for_local_init_op,
-      local_init_op=local_init_op,
-      summary_op=summary_op,
-      init_feed_dict=init_feed_dict,
-      init_fn=init_fn)
-  return scaffold
-
-
-def _check_checkpoint_available(model_dir):
-  latest_path = checkpoint_management.latest_checkpoint(model_dir)
-  if not latest_path:
-    raise ValueError(
-        'Could not find trained model in model_dir: {}.'.format(model_dir))
-
-
-def _check_hooks_type(hooks):
-  """Returns hooks if all are `SessionRunHook`, raises TypeError otherwise."""
-  hooks = list(hooks or [])
-  for h in hooks:
-    if not isinstance(h, training.SessionRunHook):
-      raise TypeError('Hooks must be a SessionRunHook, given: {}'.format(h))
-  return hooks
-
-
-def _check_listeners_type(saving_listeners):
-  """Check listeners type."""
-  listeners = list(saving_listeners or [])
-  for l in listeners:
-    if not isinstance(l, training.CheckpointSaverListener):
-      raise TypeError(
-          'saving_listeners must be a list of CheckpointSaverListener, '
-          'given: {}'.format(l))
-  return listeners
-
-
-def _get_replica_device_setter(config):
-  """Creates a replica device setter if required as a default `device_fn`.
-
-  `Estimator` uses `tf.train.ReplicaDeviceSetter` as a default device placer. It
-  sets the
-  distributed related arguments such as number of `ps_replicas` based on given
-  `config`.
-
-  Args:
-    config: A `tf.estimator.RunConfig` instance.
-
-  Returns:
-    A replica device setter, or `None`.
-  """
-  if config.task_type:
-    worker_device = '/job:%s/task:%d' % (config.task_type, config.task_id)
-  else:
-    worker_device = '/job:worker'
-
-  if config.num_ps_replicas > 0:
-    return training.replica_device_setter(
-        ps_tasks=config.num_ps_replicas,
-        worker_device=worker_device,
-        merge_devices=True,
-        ps_ops=list(device_setter.STANDARD_PS_OPS),
-        cluster=config.cluster_spec)
-  else:
-    return None
-
-
-def _verify_model_fn_args(model_fn, params):
-  """Verifies `model_fn` arguments."""
-  args = set(function_utils.fn_args(model_fn))
-  if 'features' not in args:
-    raise ValueError('model_fn (%s) must include features argument.' % model_fn)
-  if params is not None and 'params' not in args:
-    raise ValueError('model_fn (%s) does not include params argument, '
-                     'but params (%s) is passed to Estimator.' % (model_fn,
-                                                                  params))
-  if params is None and 'params' in args:
-    logging.warning('Estimator\'s model_fn (%s) includes params '
-                    'argument, but params are not passed to Estimator.',
-                    model_fn)
-  non_valid_args = list(args - _VALID_MODEL_FN_ARGS)
-  if non_valid_args:
-    raise ValueError('model_fn (%s) has following not expected args: %s' %
-                     (model_fn, non_valid_args))
-
-
-def _load_global_step_from_checkpoint_dir(checkpoint_dir):
-  try:
-    checkpoint_reader = training.NewCheckpointReader(
-        training.latest_checkpoint(checkpoint_dir))
-    return checkpoint_reader.get_tensor(ops.GraphKeys.GLOBAL_STEP)
-  except:  # pylint: disable=bare-except
-    return 0
-
-
-def _extract_metric_update_ops(eval_dict, distribution=None):
-  """Separate update operations from metric value operations."""
-  update_ops = []
-  value_ops = {}
-  # Sort metrics lexicographically so graph is identical every time.
-  for name, value in sorted(six.iteritems(eval_dict)):
-    value_ops[name] = value[0]
-    update_ops.append(
-        distribution.group(value[1]) if distribution else value[1])
-
-  update_op = control_flow_ops.group(*update_ops) if update_ops else None
-  return update_op, value_ops
-
-
-def _dict_to_str(dictionary):
-  """Get a `str` representation of a `dict`.
-
-  Args:
-    dictionary: The `dict` to be represented as `str`.
-
-  Returns:
-    A `str` representing the `dictionary`.
-  """
-  return ', '.join('%s = %s' % (k, v)
-                   for k, v in sorted(six.iteritems(dictionary))
-                   if not isinstance(v, six.binary_type))
-
-
-def _write_dict_to_summary(output_dir,
-                           dictionary,
-                           current_global_step):
-  """Writes a `dict` into summary file in given output directory.
-
-  Args:
-    output_dir: `str`, directory to write the summary file in.
-    dictionary: the `dict` to be written to summary file.
-    current_global_step: `int`, the current global step.
-  """
-  logging.info('Saving dict for global step %d: %s', current_global_step,
-               _dict_to_str(dictionary))
-  summary_writer = writer_cache.FileWriterCache.get(output_dir)
-  summary_proto = summary_pb2.Summary()
-  for key in dictionary:
-    if dictionary[key] is None:
-      continue
-    if key == 'global_step':
-      continue
-    if (isinstance(dictionary[key], np.float32) or
-        isinstance(dictionary[key], float)):
-      summary_proto.value.add(tag=key, simple_value=float(dictionary[key]))
-    elif (isinstance(dictionary[key], np.int64) or
-          isinstance(dictionary[key], np.int32) or
-          isinstance(dictionary[key], int)):
-      summary_proto.value.add(tag=key, simple_value=int(dictionary[key]))
-    elif isinstance(dictionary[key], six.binary_type):
-      try:
-        summ = summary_pb2.Summary.FromString(dictionary[key])
-        for i, _ in enumerate(summ.value):
-          summ.value[i].tag = '%s/%d' % (key, i)
-        summary_proto.value.extend(summ.value)
-      except message.DecodeError:
-        logging.warn('Skipping summary for %s, cannot parse string to Summary.',
-                     key)
-        continue
-    elif isinstance(dictionary[key], np.ndarray):
-      value = summary_proto.value.add()
-      value.tag = key
-      value.node_name = key
-      tensor_proto = tensor_util.make_tensor_proto(dictionary[key])
-      value.tensor.CopyFrom(tensor_proto)
-      # pylint: disable=line-too-long
-      logging.info(
-          'Summary for np.ndarray is not visible in Tensorboard by default. '
-          'Consider using a Tensorboard plugin for visualization (see '
-          'https://github.com/tensorflow/tensorboard-plugin-example/blob/master/README.md'
-          ' for more information).')
-      # pylint: enable=line-too-long
-    else:
-      logging.warn(
-          'Skipping summary for %s, must be a float, np.float32, np.int64, '
-          'np.int32 or int or np.ndarray or a serialized string of Summary.',
-          key)
-  summary_writer.add_summary(summary_proto, current_global_step)
-  summary_writer.flush()
-
-
-def _write_checkpoint_path_to_summary(output_dir, checkpoint_path,
-                                      current_global_step):
-  """Writes `checkpoint_path` into summary file in the given output directory.
-
-  Args:
-    output_dir: `str`, directory to write the summary file in.
-    checkpoint_path: `str`, checkpoint file path to be written to summary file.
-    current_global_step: `int`, the current global step.
-  """
-
-  checkpoint_path_tag = 'checkpoint_path'
-
-  logging.info('Saving \'%s\' summary for global step %d: %s',
-               checkpoint_path_tag, current_global_step, checkpoint_path)
-  summary_proto = summary_pb2.Summary()
-  summary_proto.value.add(
-      tag=checkpoint_path_tag,
-      tensor=tensor_util.make_tensor_proto(
-          checkpoint_path, dtype=dtypes.string))
-  summary_writer = writer_cache.FileWriterCache.get(output_dir)
-  summary_writer.add_summary(summary_proto, current_global_step)
-  summary_writer.flush()
-
-
-def _has_dataset_or_queue_runner(maybe_tensor):
-  """Returns `True` if `Dataset` or `QueueRunner` has been used."""
-  # Check TF dataset first. Here, we use a simple algorithm to check the top
-  # level Tensors only, which should be sufficient for most users.
-  tensors = [x for x in nest.flatten(maybe_tensor) if isinstance(x, ops.Tensor)]
-  if any([t.op.type == 'IteratorGetNext' for t in tensors]):
-    return True
-
-  # Now, check queue.
-  return ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS)
-
-
-VocabInfo = warm_starting_util.VocabInfo  # pylint: disable=invalid-name
-estimator_export('estimator.VocabInfo')(VocabInfo)
-
-
-@estimator_export('estimator.WarmStartSettings')
-class WarmStartSettings(
-    collections.namedtuple('WarmStartSettings', [
-        'ckpt_to_initialize_from',
-        'vars_to_warm_start',
-        'var_name_to_vocab_info',
-        'var_name_to_prev_var_name',
-    ])):
-  """Settings for warm-starting in `tf.estimator.Estimators`.
-
-  Example Use with canned `tf.estimator.DNNEstimator`:
-
-  ```
-  emb_vocab_file = tf.feature_column.embedding_column(
-      tf.feature_column.categorical_column_with_vocabulary_file(
-          "sc_vocab_file", "new_vocab.txt", vocab_size=100),
-      dimension=8)
-  emb_vocab_list = tf.feature_column.embedding_column(
-      tf.feature_column.categorical_column_with_vocabulary_list(
-          "sc_vocab_list", vocabulary_list=["a", "b"]),
-      dimension=8)
-  estimator = tf.estimator.DNNClassifier(
-    hidden_units=[128, 64], feature_columns=[emb_vocab_file, emb_vocab_list],
-    warm_start_from=ws)
-  ```
-
-  where `ws` could be defined as:
-
-  Warm-start all weights in the model (input layer and hidden weights).
-  Either the directory or a specific checkpoint can be provided (in the case
-  of the former, the latest checkpoint will be used):
-
-  ```
-  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp")
-  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp/model-1000")
-  ```
-
-  Warm-start only the embeddings (input layer):
-
-  ```
-  ws = WarmStartSettings(ckpt_to_initialize_from="/tmp",
-                         vars_to_warm_start=".*input_layer.*")
-  ```
-
-  Warm-start all weights but the embedding parameters corresponding to
-  `sc_vocab_file` have a different vocab from the one used in the current
-  model:
-
-  ```
-  vocab_info = tf.estimator.VocabInfo(
-      new_vocab=sc_vocab_file.vocabulary_file,
-      new_vocab_size=sc_vocab_file.vocabulary_size,
-      num_oov_buckets=sc_vocab_file.num_oov_buckets,
-      old_vocab="old_vocab.txt"
-  )
-  ws = WarmStartSettings(
-      ckpt_to_initialize_from="/tmp",
-      var_name_to_vocab_info={
-          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
-      })
-  ```
-
-  Warm-start only `sc_vocab_file` embeddings (and no other variables), which
-  have a different vocab from the one used in the current model:
-
-  ```
-  vocab_info = tf.estimator.VocabInfo(
-      new_vocab=sc_vocab_file.vocabulary_file,
-      new_vocab_size=sc_vocab_file.vocabulary_size,
-      num_oov_buckets=sc_vocab_file.num_oov_buckets,
-      old_vocab="old_vocab.txt"
-  )
-  ws = WarmStartSettings(
-      ckpt_to_initialize_from="/tmp",
-      vars_to_warm_start=None,
-      var_name_to_vocab_info={
-          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
-      })
-  ```
-
-  Warm-start all weights but the parameters corresponding to `sc_vocab_file`
-  have a different vocab from the one used in current checkpoint, and only
-  100 of those entries were used:
-
-  ```
-  vocab_info = tf.estimator.VocabInfo(
-      new_vocab=sc_vocab_file.vocabulary_file,
-      new_vocab_size=sc_vocab_file.vocabulary_size,
-      num_oov_buckets=sc_vocab_file.num_oov_buckets,
-      old_vocab="old_vocab.txt",
-      old_vocab_size=100
-  )
-  ws = WarmStartSettings(
-      ckpt_to_initialize_from="/tmp",
-      var_name_to_vocab_info={
-          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
-      })
-  ```
-
-  Warm-start all weights but the parameters corresponding to `sc_vocab_file`
-  have a different vocab from the one used in current checkpoint and the
-  parameters corresponding to `sc_vocab_list` have a different name from the
-  current checkpoint:
-
-  ```
-  vocab_info = tf.estimator.VocabInfo(
-      new_vocab=sc_vocab_file.vocabulary_file,
-      new_vocab_size=sc_vocab_file.vocabulary_size,
-      num_oov_buckets=sc_vocab_file.num_oov_buckets,
-      old_vocab="old_vocab.txt",
-      old_vocab_size=100
-  )
-  ws = WarmStartSettings(
-      ckpt_to_initialize_from="/tmp",
-      var_name_to_vocab_info={
-          "input_layer/sc_vocab_file_embedding/embedding_weights": vocab_info
-      },
-      var_name_to_prev_var_name={
-          "input_layer/sc_vocab_list_embedding/embedding_weights":
-              "old_tensor_name"
-      })
-  ```
-
-  Attributes:
-    ckpt_to_initialize_from: [Required] A string specifying the directory with
-      checkpoint file(s) or path to checkpoint from which to warm-start the
-      model parameters.
-    vars_to_warm_start: [Optional] One of the following:  - A regular expression
-      (string) that captures which variables to warm-start (see
-      `tf.get_collection`).  This expression will only consider variables in the
-      `TRAINABLE_VARIABLES` collection. - A list of Variables to warm-start. - A
-      list of strings, each representing a full variable name to warm-start. -
-      `None`, in which case only variables specified in `var_name_to_vocab_info`
-      will be warm-started.  Defaults to `'.*'`, which warm-starts all variables
-      in the `TRAINABLE_VARIABLES` collection.  Note that this excludes
-      variables such as accumulators and moving statistics from batch norm.
-    var_name_to_vocab_info: [Optional] Dict of variable names (strings) to
-      `tf.estimator.VocabInfo`. The variable names should be "full" variables,
-      not the names of the partitions.  If not explicitly provided, the variable
-      is assumed to have no (changes to) vocabulary.
-    var_name_to_prev_var_name: [Optional] Dict of variable names (strings) to
-      name of the previously-trained variable in `ckpt_to_initialize_from`. If
-      not explicitly provided, the name of the variable is assumed to be same
-      between previous checkpoint and current model.
-  """
-
-  def __new__(cls,
-              ckpt_to_initialize_from,
-              vars_to_warm_start='.*',
-              var_name_to_vocab_info=None,
-              var_name_to_prev_var_name=None):
-    if not ckpt_to_initialize_from:
-      raise ValueError(
-          '`ckpt_to_initialize_from` MUST be set in WarmStartSettings')
-    return super(WarmStartSettings, cls).__new__(
-        cls,
-        ckpt_to_initialize_from,
-        vars_to_warm_start,
-        var_name_to_vocab_info or {},
-        var_name_to_prev_var_name or {},
-    )
-
-
-def _get_saved_model_ckpt(saved_model_dir):
-  """Return path to variables checkpoint in a `SavedModel` directory."""
-  if not gfile.Exists(
-      os.path.join(saved_model_utils.get_variables_dir(saved_model_dir),
-                   compat.as_text('variables.index'))):
-    raise ValueError('Directory provided has an invalid SavedModel format: %s'
-                     % saved_model_dir)
-  return saved_model_utils.get_variables_path(saved_model_dir)
-
-
-def _get_default_warm_start_settings(warm_start_from):
-  """Returns default `tf.estimator.WarmStartSettings`.
-
-  Args:
-    warm_start_from: Either a string representing the filepath of a checkpoint
-      or `SavedModel` to initialize from, or an instance of
-      `tf.estimator.WarmStartSettings`.
-
-  Returns:
-    Either None or an instance of `WarmStartSettings`.
-
-  Raises:
-    ValueError: If `warm_start_from` is not `None` but is neither a string nor
-    an
-      instance of `WarmStartSettings`.
-  """
-  if warm_start_from is None:
-    return None
-  if isinstance(warm_start_from, (six.string_types, six.binary_type)):
-    # Infer that this is a SavedModel if export_path +
-    # 'variables/variables.index' exists, and if so, construct the
-    # WarmStartSettings pointing to the variables path
-    # (export_path + 'variables/variables').
-    if gfile.Exists(os.path.join(
-        saved_model_utils.get_variables_dir(warm_start_from),
-        compat.as_text('variables.index'))):
-      logging.info('Warm-starting from a SavedModel')
-      return WarmStartSettings(
-          ckpt_to_initialize_from=saved_model_utils.get_variables_path(
-              warm_start_from))
-    return WarmStartSettings(ckpt_to_initialize_from=warm_start_from)
-  elif isinstance(warm_start_from, WarmStartSettings):
-    return warm_start_from
-  else:
-    raise ValueError('warm_start_from must be a string or a WarmStartSettings, '
-                     'instead got {}'.format(type(warm_start_from)))
+from tensorflow_estimator.python.estimator.estimator import *
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index f188f2d4e6096a968691e94201ba69efe506833f..4c18b552366776a586452c3ab5dd204b4229e4f9 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,40 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Estimator: High level tools for working with models."""
+"""estimator_lib python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,line-too-long,wildcard-import
-from tensorflow.python.estimator.canned.baseline import BaselineClassifier
-from tensorflow.python.estimator.canned.baseline import BaselineRegressor
-from tensorflow.python.estimator.canned.boosted_trees import BoostedTreesClassifier
-from tensorflow.python.estimator.canned.boosted_trees import BoostedTreesRegressor
-from tensorflow.python.estimator.canned.dnn import DNNClassifier
-from tensorflow.python.estimator.canned.dnn import DNNRegressor
-from tensorflow.python.estimator.canned.dnn_linear_combined import DNNLinearCombinedClassifier
-from tensorflow.python.estimator.canned.dnn_linear_combined import DNNLinearCombinedRegressor
-from tensorflow.python.estimator.canned.linear import LinearClassifier
-from tensorflow.python.estimator.canned.linear import LinearRegressor
-from tensorflow.python.estimator.canned.parsing_utils import classifier_parse_example_spec
-from tensorflow.python.estimator.canned.parsing_utils import regressor_parse_example_spec
-from tensorflow.python.estimator.estimator import Estimator
-from tensorflow.python.estimator.estimator import VocabInfo
-from tensorflow.python.estimator.estimator import WarmStartSettings
-from tensorflow.python.estimator.export import export_lib as export
-from tensorflow.python.estimator.exporter import Exporter
-from tensorflow.python.estimator.exporter import FinalExporter
-from tensorflow.python.estimator.exporter import LatestExporter
-from tensorflow.python.estimator.inputs import inputs
-from tensorflow.python.estimator.keras import model_to_estimator
-from tensorflow.python.estimator.model_fn import EstimatorSpec
-from tensorflow.python.estimator.model_fn import ModeKeys
-from tensorflow.python.estimator.run_config import RunConfig
-from tensorflow.python.estimator.training import EvalSpec
-from tensorflow.python.estimator.training import train_and_evaluate
-from tensorflow.python.estimator.training import TrainSpec
+from tensorflow_estimator.python.estimator import estimator_lib
 
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+estimator_lib.__all__ = [
+    s for s in dir(estimator_lib) if not s.startswith('__')
+]
 
-# pylint: enable=unused-import,line-too-long,wildcard-import
+from tensorflow_estimator.python.estimator.estimator_lib import *
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
deleted file mode 100644
index 246dfb1a4bd5171cace3be857678cf9a3f031f58..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/estimator_test.py
+++ /dev/null
@@ -1,3280 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Estimator."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import glob
-import json
-import os
-import tempfile
-
-import numpy as np
-import six
-
-from google.protobuf import text_format
-
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import run_config
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.framework import test_util
-from tensorflow.python.keras import metrics as metrics_module
-from tensorflow.python.layers import layers
-from tensorflow.python.lib.io import file_io
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.ops.random_ops import random_uniform
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import loader
-from tensorflow.python.saved_model import loader_impl
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.summary import summary
-from tensorflow.python.summary import summary_iterator
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import checkpoint_state_pb2
-from tensorflow.python.training import saver
-from tensorflow.python.training import saver_test_utils
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training
-from tensorflow.python.util import compat
-from tensorflow.python.util import function_utils
-
-_TMP_DIR = '/tmp'
-_ANOTHER_TMP_DIR = '/another_tmp'
-
-
-def dummy_model_fn(features, labels, params):
-  _, _, _ = features, labels, params
-
-
-def summaries_with_matching_keyword(keyword, dir_):
-  """Yields summary protos matching given keyword from event file."""
-
-  writer_cache.FileWriterCache.clear()
-
-  event_paths = glob.glob(os.path.join(dir_, 'events*'))
-  for event in summary_iterator.summary_iterator(event_paths[-1]):
-    if event.summary is not None:
-      for value in event.summary.value:
-        if keyword in value.tag:
-          yield event.summary
-
-
-def check_eventfile_for_keyword(keyword, dir_):
-  """Checks event files for the keyword."""
-  return any(summaries_with_matching_keyword(keyword, dir_))
-
-
-def get_mock_saver():
-  real_saver = saver.Saver()
-  return test.mock.Mock(wraps=real_saver, saver_def=real_saver.saver_def)
-
-
-class EstimatorInheritanceConstraintTest(test.TestCase):
-  """Tests that sub classes cannot override methods of Estimator."""
-
-  def test_override_a_method(self):
-    class _Estimator(estimator.Estimator):
-
-      def __init__(self):
-        super(_Estimator, self).__init__(model_fn=dummy_model_fn)
-
-      def predict(self, input_fn, predict_keys=None, hooks=None):
-        pass
-
-    with self.assertRaisesRegexp(
-        ValueError, 'cannot override members of Estimator.*predict'):
-      _Estimator()
-
-  def test_override_a_method_with_tricks(self):
-    class _Estimator(estimator.Estimator):
-
-      def __init__(self):
-        super(_Estimator, self).__init__(model_fn=dummy_model_fn)
-
-      def _assert_members_are_not_overridden(self):
-        pass  # HAHA! I tricked you!
-
-      def predict(self, input_fn, predict_keys=None, hooks=None):
-        pass
-
-    with self.assertRaisesRegexp(
-        ValueError, 'cannot override members of Estimator.*predict'):
-      _Estimator()
-
-  def test_extension_of_api_is_ok(self):
-    class _Estimator(estimator.Estimator):
-
-      def __init__(self):
-        super(_Estimator, self).__init__(model_fn=dummy_model_fn)
-
-      def predict_proba(self, input_fn, predict_keys=None, hooks=None):
-        pass
-
-    _Estimator()
-
-  def test_override_allowed_method(self):
-    class _Estimator(estimator.Estimator):
-
-      def __init__(self):
-        super(_Estimator, self).__init__(model_fn=dummy_model_fn)
-
-      def _tf_api_names(self):
-        pass
-
-    _Estimator()
-
-
-class EstimatorConstructorTest(test.TestCase):
-
-  def test_config_must_be_a_run_config(self):
-    with self.assertRaisesRegexp(ValueError, 'an instance of `RunConfig`'):
-      estimator.Estimator(model_fn=None, config='NotARunConfig')
-
-  def test_model_fn_must_be_provided(self):
-    with self.assertRaisesRegexp(ValueError, 'model_fn.* must be'):
-      estimator.Estimator(model_fn=None)
-
-  def test_property_accessors(self):
-
-    def model_fn(features, labels, params):
-      _, _, _ = features, labels, params
-
-    class FakeConfig(run_config.RunConfig):
-      pass
-
-    params = {'hidden_layers': [3, 4]}
-    est = estimator.Estimator(
-        model_fn=model_fn, model_dir='bla', config=FakeConfig(), params=params)
-    self.assertTrue(isinstance(est.config, FakeConfig))
-    self.assertEqual(params, est.params)
-    self.assertEqual('bla', est.model_dir)
-
-  def test_default_config(self):
-
-    def model_fn(features, labels):
-      _, _ = features, labels
-
-    est = estimator.Estimator(model_fn=model_fn)
-    self.assertTrue(isinstance(est.config, run_config.RunConfig))
-    self.assertTrue(est._session_config.allow_soft_placement)
-    rewrite_options = est._session_config.graph_options.rewrite_options
-    self.assertEqual(rewrite_options.meta_optimizer_iterations,
-                     rewriter_config_pb2.RewriterConfig.ONE)
-
-  def test_default_model_dir(self):
-
-    def model_fn(features, labels):
-      _, _ = features, labels
-
-    with test.mock.patch.object(tempfile, 'mkdtemp', return_value=_TMP_DIR):
-      est = estimator.Estimator(model_fn=model_fn)
-      self.assertEqual(_TMP_DIR, est.config.model_dir)
-      self.assertEqual(_TMP_DIR, est.model_dir)
-
-  def test_model_dir_in_constructor(self):
-
-    def model_fn(features, labels):
-      _, _ = features, labels
-
-    est = estimator.Estimator(model_fn=model_fn, model_dir=_TMP_DIR)
-    self.assertEqual(_TMP_DIR, est.config.model_dir)
-    self.assertEqual(_TMP_DIR, est.model_dir)
-
-  def test_empty_model_dir(self):
-    def model_fn(features, labels):
-      _, _ = features, labels
-
-    with test.mock.patch.object(tempfile, 'mkdtemp', return_value=_TMP_DIR):
-      est = estimator.Estimator(model_fn=model_fn, model_dir='')
-      self.assertEqual(_TMP_DIR, est.config.model_dir)
-      self.assertEqual(_TMP_DIR, est.model_dir)
-
-  def test_model_dir_in_run_config(self):
-
-    class FakeConfig(run_config.RunConfig):
-
-      @property
-      def model_dir(self):
-        return _TMP_DIR
-
-    def model_fn(features, labels):
-      _, _ = features, labels
-
-    est = estimator.Estimator(model_fn=model_fn, config=FakeConfig())
-    self.assertEqual(_TMP_DIR, est.config.model_dir)
-    self.assertEqual(_TMP_DIR, est.model_dir)
-
-  def test_same_model_dir_in_constructor_and_run_config(self):
-
-    class FakeConfig(run_config.RunConfig):
-
-      @property
-      def model_dir(self):
-        return _TMP_DIR
-
-    def model_fn(features, labels):
-      _, _ = features, labels
-
-    est = estimator.Estimator(
-        model_fn=model_fn, config=FakeConfig(), model_dir=_TMP_DIR)
-    self.assertEqual(_TMP_DIR, est.config.model_dir)
-    self.assertEqual(_TMP_DIR, est.model_dir)
-
-  def test_different_model_dir_in_constructor_and_run_config(self):
-
-    class FakeConfig(run_config.RunConfig):
-
-      @property
-      def model_dir(self):
-        return _TMP_DIR
-
-    def model_fn(features, labels):
-      _, _ = features, labels
-
-    with self.assertRaisesRegexp(
-        ValueError,
-        '`model_dir` are set both in constructor and `RunConfig`, but '
-        'with different values'):
-      estimator.Estimator(
-          model_fn=model_fn, config=FakeConfig(), model_dir=_ANOTHER_TMP_DIR)
-
-  def test_model_fn_args_must_include_features(self):
-
-    def model_fn(x, labels):
-      _, _ = x, labels
-
-    with self.assertRaisesRegexp(ValueError, 'features'):
-      estimator.Estimator(model_fn=model_fn)
-
-  def test_model_fn_args_labels_is_optional(self):
-
-    def model_fn(features):
-      _ = features
-
-    estimator.Estimator(model_fn=model_fn)
-
-  def test_if_params_provided_then_model_fn_should_accept_it(self):
-
-    def model_fn(features, labels):
-      _, _ = features, labels
-
-    estimator.Estimator(model_fn=model_fn)
-    with self.assertRaisesRegexp(ValueError, 'params'):
-      estimator.Estimator(model_fn=model_fn, params={'hidden_layers': 4})
-
-  def test_internal_params_is_a_deepcopy(self):
-
-    def model_fn(features, labels, params):
-      _, _, _ = features, labels, params
-
-    params = {'hidden_layers': 4}
-    est = estimator.Estimator(model_fn=model_fn, params=params)
-
-    params['hidden_layers'] = 5
-    self.assertEqual(4, est.params['hidden_layers'])
-
-  def test_not_known_model_fn_args(self):
-
-    def model_fn(features, labels, something):
-      _, _, _ = features, labels, something
-
-    with self.assertRaisesRegexp(ValueError, 'something'):
-      estimator.Estimator(model_fn=model_fn)
-
-  def test_not_known_model_fn_args_handled_by_lambda(self):
-    def model_fn(features, labels, something):
-      _, _, _ = features, labels, something
-
-    new_model_fn = lambda features, labels: model_fn(  # pylint: disable=g-long-lambda
-        features, labels, 'something')
-    estimator.Estimator(model_fn=new_model_fn)
-
-  def test_if_model_fn_is_a_member_function_of_a_class(self):
-
-    class ModelFnClass(object):
-
-      def __init__(self):
-        estimator.Estimator(model_fn=self.model_fn)
-
-      def model_fn(self, features, labels, mode):
-        _, _, _ = features, labels, mode
-
-    ModelFnClass()
-
-  def test_model_fn_property_binds_params(self):
-
-    def model_fn(features, labels, mode, config, params):
-      _, _, _, _, _ = features, labels, mode, config, params
-
-    est = estimator.Estimator(model_fn=model_fn)
-    model_fn_args = function_utils.fn_args(est.model_fn)
-    self.assertEqual(
-        set(['features', 'labels', 'mode', 'config']), set(model_fn_args))
-
-  def test_model_fn_property_returns_fixed_signature(self):
-
-    def model_fn(features, labels):
-      _, _ = features, labels
-
-    est = estimator.Estimator(model_fn=model_fn)
-    model_fn_args = function_utils.fn_args(est.model_fn)
-    self.assertEqual(
-        set(['features', 'labels', 'mode', 'config']), set(model_fn_args))
-
-
-def dummy_input_fn():
-  return ({'x': constant_op.constant([[1], [1]])},
-          constant_op.constant([[1], [1]]))
-
-
-def model_fn_global_step_incrementer(features, labels, mode):
-  _, _ = features, labels
-  global_step = training.get_global_step()
-  return model_fn_lib.EstimatorSpec(
-      mode,
-      loss=constant_op.constant(1.),
-      train_op=state_ops.assign_add(global_step, 1))
-
-
-def assert_features_op(expected_features, actual_features):
-  return [
-      check_ops.assert_equal(
-          expected_features[k], actual_features[k], name='assert_%s' % k)
-      for k in expected_features
-  ]
-
-
-def _estimator_spec(
-    expected_features, expected_labels, actual_features, actual_labels, mode):
-  assert_ops = tuple(
-      assert_features_op(expected_features, actual_features) + [
-          check_ops.assert_equal(
-              expected_labels, actual_labels, name='assert_labels')
-      ])
-  global_step = training.get_global_step()
-  with ops.control_dependencies(assert_ops):
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        predictions=constant_op.constant(0.),
-        loss=constant_op.constant(0.),
-        train_op=state_ops.assign_add(global_step, 1))
-
-
-def _make_input_fn(features, labels):
-  def _input_fn():
-    return {
-        k: constant_op.constant(v)
-        for k, v in six.iteritems(features)
-    }, constant_op.constant(labels)
-  return _input_fn
-
-
-class EstimatorTrainTest(test.TestCase):
-
-  def test_callable_model_fn(self):
-    expected_features = {'x': 42., 'y': 43.}
-    expected_labels = 44.
-
-    model_fn_call_count = [0]
-
-    test_self = self
-
-    class ModelFn(object):
-
-      def __call__(self, features, labels):
-        model_fn_call_count[0] += 1
-        test_self.assertItemsEqual(expected_features.keys(), features.keys())
-        return _estimator_spec(
-            expected_features, expected_labels, features, labels,
-            model_fn_lib.ModeKeys.TRAIN)
-
-    with self.assertRaisesRegexp(ValueError, 'does not include params'):
-      estimator.Estimator(model_fn=ModelFn(), params={'a': 'b'})
-    est = estimator.Estimator(model_fn=ModelFn(), config=run_config.RunConfig())
-    self.assertEqual(0, model_fn_call_count[0])
-    est.train(
-        input_fn=_make_input_fn(expected_features, expected_labels), steps=1)
-    self.assertEqual(1, model_fn_call_count[0])
-
-  def test_callable_input_fn(self):
-    expected_mode = model_fn_lib.ModeKeys.TRAIN
-    expected_params = {'batch_size': 10}
-    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
-    input_fn_call_count = [0]
-
-    def _model_fn(features, labels, mode, params, config):
-      del params, config
-      return model_fn_global_step_incrementer(features, labels, mode)
-
-    test_self = self
-
-    class InputFn(object):
-
-      def __call__(self, mode, params, config):
-        input_fn_call_count[0] += 1
-        test_self.assertEqual(expected_mode, mode)
-        test_self.assertEqual(expected_params, params)
-        test_self.assertEqual(4321, config.tf_random_seed)
-        return dummy_input_fn()
-
-    est = estimator.Estimator(model_fn=_model_fn,
-                              params=expected_params,
-                              config=expected_config)
-    self.assertEqual(0, input_fn_call_count[0])
-    est.train(InputFn(), steps=1)
-    self.assertEqual(1, input_fn_call_count[0])
-
-  def test_nested_input_fn(self):
-    expected_params = {'batch_size': 10}
-
-    def _input_fn():
-      dataset_features = dataset_ops.Dataset.from_tensor_slices(
-          (random_uniform([4]),
-           random_uniform([4, 100], maxval=100, dtype=dtypes.int32)))
-      dataset_labels = dataset_ops.Dataset.from_tensor_slices(
-          random_uniform([4, 10]))
-      dataset = dataset_ops.Dataset.zip((dataset_features, dataset_labels))
-      dataset = dataset.repeat(-1)
-      iterator = dataset.make_initializable_iterator()
-      return iterator.get_next()
-
-    def _model_fn(features, labels, mode, params, config):
-      del params, config
-      return model_fn_global_step_incrementer(features, labels, mode)
-
-    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
-    est = estimator.Estimator(
-        model_fn=_model_fn, params=expected_params, config=expected_config)
-    est.train(_input_fn, steps=4)
-
-  def test_input_fn_args(self):
-    expected_mode = model_fn_lib.ModeKeys.TRAIN
-    expected_params = {'batch_size': 10}
-    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
-    input_fn_call_count = [0]
-
-    def _model_fn(features, labels, mode, params, config):
-      del params, config
-      return model_fn_global_step_incrementer(features, labels, mode)
-
-    def _input_fn(mode, params, config):
-      input_fn_call_count[0] += 1
-      self.assertEqual(expected_mode, mode)
-      self.assertEqual(expected_params, params)
-      self.assertEqual(4321, config.tf_random_seed)
-      return dummy_input_fn()
-
-    est = estimator.Estimator(model_fn=_model_fn,
-                              params=expected_params,
-                              config=expected_config)
-    self.assertEqual(0, input_fn_call_count[0])
-    est.train(_input_fn, steps=1)
-    self.assertEqual(1, input_fn_call_count[0])
-
-  def test_minimal_model_fn_args(self):
-    expected_features = {'x': 4, 'y': 5}
-
-    def _input_fn():
-      return expected_features
-
-    model_fn_call_count = [0]
-    def _model_fn(features):
-      model_fn_call_count[0] += 1
-      self.assertItemsEqual(expected_features.keys(), features.keys())
-      with ops.control_dependencies(
-          assert_features_op(expected_features, features)):
-        return model_fn_lib.EstimatorSpec(
-            mode=None,
-            predictions=constant_op.constant(0.),
-            loss=constant_op.constant(0.),
-            train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    self.assertEqual(0, model_fn_call_count[0])
-    est.train(input_fn=_input_fn, steps=1)
-    self.assertEqual(1, model_fn_call_count[0])
-
-  def test_labels_should_be_none_if_model_fn_does_not_use_labels(self):
-
-    def _input_fn_with_labels():
-      return {'x': 4, 'y': 5}, [4]
-
-    def _model_fn(features):
-      _ = features
-      return model_fn_lib.EstimatorSpec(
-          mode=None,
-          predictions=constant_op.constant(0.),
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    with self.assertRaisesRegexp(ValueError, 'model_fn does not take labels'):
-      est.train(input_fn=_input_fn_with_labels, steps=1)
-
-  def test_input_fn_len_should_be_2_if_tuple_or_list(self):
-
-    def _input_fn():
-      return 4, 5, 6
-
-    def _model_fn(features):
-      _ = features
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    with self.assertRaisesRegexp(ValueError, 'len 2 tuple'):
-      est.train(input_fn=_input_fn, steps=1)
-
-  def test_all_model_fn_args(self):
-    expected_features = {'x': 42., 'y': 43.}
-    expected_labels = 44.
-    expected_params = {'some_param': 'some_value'}
-    expected_config = run_config.RunConfig()
-    expected_config.i_am_test = True
-
-    # TODO(ptucker): We have to roll our own mock since Estimator._get_arguments
-    # doesn't work with mock fns.
-    model_fn_call_count = [0]
-
-    # Note that args are all passed by keyword, so can be in any order.
-    def _model_fn(mode, params, features, labels, config):
-      model_fn_call_count[0] += 1
-      self.assertItemsEqual(expected_features.keys(), features.keys())
-      self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)
-      self.assertEqual(expected_params, params)
-      self.assertTrue(config.i_am_test)
-      return _estimator_spec(
-          expected_features, expected_labels, features, labels, mode)
-
-    est = estimator.Estimator(
-        model_fn=_model_fn, params=expected_params, config=expected_config)
-    self.assertEqual(0, model_fn_call_count[0])
-    est.train(
-        input_fn=_make_input_fn(expected_features, expected_labels), steps=1)
-    self.assertEqual(1, model_fn_call_count[0])
-
-  def test_partial_model_fn_args(self):
-    expected_features = {'x': 42., 'y': 43.}
-    expected_labels = 44.
-    expected_params = {'some_param': 'some_value'}
-    expected_config = run_config.RunConfig()
-    expected_config.i_am_test = True
-    expected_foo = 45.
-    expected_bar = 46.
-
-    # TODO(ptucker): We have to roll our own mock since Estimator._get_arguments
-    # doesn't work with mock fns.
-    model_fn_call_count = [0]
-
-    def _model_fn(features, labels, foo, mode, params, config, bar):
-      model_fn_call_count[0] += 1
-      self.assertEqual(expected_foo, foo)
-      self.assertEqual(expected_bar, bar)
-      self.assertItemsEqual(expected_features.keys(), features.keys())
-      self.assertEqual(model_fn_lib.ModeKeys.TRAIN, mode)
-      self.assertEqual(expected_params, params)
-      self.assertTrue(config.i_am_test)
-      return _estimator_spec(
-          expected_features, expected_labels, features, labels, mode)
-    partial_model_fn = functools.partial(
-        _model_fn, foo=expected_foo, bar=expected_bar)
-
-    est = estimator.Estimator(
-        model_fn=partial_model_fn, params=expected_params,
-        config=expected_config)
-    self.assertEqual(0, model_fn_call_count[0])
-    est.train(
-        input_fn=_make_input_fn(expected_features, expected_labels), steps=1)
-    self.assertEqual(1, model_fn_call_count[0])
-
-  def test_model_fn_must_return_estimator_spec(self):
-
-    def model_fn(features, labels):
-      _, _ = features, labels
-      return 'NotGoodNotGood'
-
-    est = estimator.Estimator(model_fn=model_fn)
-    with self.assertRaisesRegexp(ValueError, 'EstimatorSpec'):
-      est.train(dummy_input_fn, steps=1)
-
-  def test_run_train_op_and_saves_at_the_end(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
-    est.train(dummy_input_fn, steps=5)
-    self.assertEqual(
-        5, estimator._load_global_step_from_checkpoint_dir(est.model_dir))
-
-  def test_loss_summary(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer,
-                              config=run_config.RunConfig(save_summary_steps=1))
-    est.train(dummy_input_fn, steps=1)
-
-    # Make sure nothing is stuck in limbo.
-    writer_cache.FileWriterCache.clear()
-
-    if check_eventfile_for_keyword('loss', est.model_dir):
-      return
-    self.fail('{} should be part of reported summaries.'.format('loss'))
-
-  def test_latest_checkpoint(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
-    self.assertIsNone(est.latest_checkpoint())
-    est.train(dummy_input_fn, steps=5)
-    self.assertIsNotNone(est.latest_checkpoint())
-    self.assertTrue(est.latest_checkpoint().startswith(est.model_dir))
-
-  def test_steps_and_saves_reloads(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
-    est.train(dummy_input_fn, steps=5)
-    self.assertEqual(
-        5, estimator._load_global_step_from_checkpoint_dir(est.model_dir))
-    est.train(dummy_input_fn, steps=5)
-    self.assertEqual(
-        10, estimator._load_global_step_from_checkpoint_dir(est.model_dir))
-
-  def test_warm_starts(self):
-    def _make_model_fn(x):
-      def _variable_creating_model_fn(features, labels, mode):
-        _, _ = features, labels
-        variable_scope.get_variable('x', initializer=x)
-        global_step = training.get_global_step()
-        return model_fn_lib.EstimatorSpec(
-            mode,
-            loss=constant_op.constant(1.),
-            train_op=state_ops.assign_add(global_step, 1))
-      return _variable_creating_model_fn
-
-    est = estimator.Estimator(model_fn=_make_model_fn(42.))
-    est.train(dummy_input_fn, steps=10)
-
-    warm_started_est = estimator.Estimator(
-        model_fn=_make_model_fn(36.),
-        warm_start_from=est.model_dir)
-    warm_started_est.train(dummy_input_fn, steps=5)
-    # warm_start is called after the model_fn, so x should have the value
-    # from the checkpoint.
-    self.assertEqual(42., warm_started_est.get_variable_value('x'))
-    # global_step should not be warm-started.
-    self.assertEqual(
-        5, estimator._load_global_step_from_checkpoint_dir(
-            warm_started_est.model_dir))
-
-  def test_warm_starts_from_savedmodel(self):
-    def _make_model_fn(x):
-      def _variable_creating_and_export_model_fn(features, labels, mode):
-        _, _ = features, labels
-        variable_scope.get_variable('x', initializer=x)
-        global_step = training.get_global_step()
-        return model_fn_lib.EstimatorSpec(
-            mode,
-            predictions={'y': constant_op.constant(1.0)},
-            loss=constant_op.constant(1.),
-            train_op=state_ops.assign_add(global_step, 1),
-            export_outputs={'test': export_output.ClassificationOutput(
-                constant_op.constant([4.2]), constant_op.constant(['label']))})
-      return _variable_creating_and_export_model_fn
-
-    est = estimator.Estimator(model_fn=_make_model_fn(42.))
-    est.train(dummy_input_fn, steps=10)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    tmpdir = tempfile.mkdtemp()
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est.export_savedmodel(
-        export_dir_base, serving_input_receiver_fn)
-
-    warm_started_est = estimator.Estimator(
-        model_fn=_make_model_fn(36.),
-        warm_start_from=export_dir)
-    warm_started_est.train(dummy_input_fn, steps=5)
-    # warm_start is called after the model_fn, so x should have the value
-    # from the SavedModel.
-    self.assertEqual(42., warm_started_est.get_variable_value('x'))
-
-  def test_max_step(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
-    est.train(dummy_input_fn, max_steps=5)
-    self.assertEqual(
-        5, estimator._load_global_step_from_checkpoint_dir(est.model_dir))
-    est.train(dummy_input_fn, max_steps=5)
-    self.assertEqual(
-        5, estimator._load_global_step_from_checkpoint_dir(est.model_dir))
-
-  def test_checkpoint_contains_relative_paths(self):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(
-        model_dir=tmpdir,
-        model_fn=model_fn_global_step_incrementer)
-    est.train(dummy_input_fn, steps=5)
-
-    checkpoint_file_content = file_io.read_file_to_string(
-        os.path.join(tmpdir, 'checkpoint'))
-    ckpt = checkpoint_state_pb2.CheckpointState()
-    text_format.Merge(checkpoint_file_content, ckpt)
-    self.assertEqual(ckpt.model_checkpoint_path, 'model.ckpt-5')
-    # TODO(b/78461127): Please modify tests to not directly rely on names of
-    # checkpoints.
-    self.assertAllEqual(
-        ['model.ckpt-0', 'model.ckpt-5'], ckpt.all_model_checkpoint_paths)
-
-  def test_train_save_copy_reload(self):
-    tmpdir = tempfile.mkdtemp()
-    model_dir1 = os.path.join(tmpdir, 'model_dir1')
-    est1 = estimator.Estimator(
-        model_dir=model_dir1,
-        model_fn=model_fn_global_step_incrementer)
-    est1.train(dummy_input_fn, steps=5)
-
-    # We have to clear the cache before we can rename the directory,
-    # otherwise open file handles will prevent the delete on Windows.
-    writer_cache.FileWriterCache.clear()
-    model_dir2 = os.path.join(tmpdir, 'model_dir2')
-    os.renames(model_dir1, model_dir2)
-
-    est2 = estimator.Estimator(
-        model_dir=model_dir2,
-        model_fn=model_fn_global_step_incrementer)
-    self.assertEqual(
-        5, estimator._load_global_step_from_checkpoint_dir(est2.model_dir))
-    est2.train(dummy_input_fn, steps=5)
-    self.assertEqual(
-        10, estimator._load_global_step_from_checkpoint_dir(est2.model_dir))
-
-  def test_steps0_raises_error(self):
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops)
-    with self.assertRaisesRegexp(ValueError, 'Must specify steps > 0'):
-      est.train(dummy_input_fn, steps=0)
-
-  def test_steps_negative_raises_error(self):
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops)
-    with self.assertRaisesRegexp(ValueError, 'Must specify steps > 0'):
-      est.train(dummy_input_fn, steps=-1)
-
-  def test_max_steps0_raises_error(self):
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops)
-    with self.assertRaisesRegexp(ValueError, 'Must specify max_steps > 0'):
-      est.train(dummy_input_fn, max_steps=0)
-
-  def test_max_steps_negative_raises_error(self):
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops)
-    with self.assertRaisesRegexp(ValueError, 'Must specify max_steps > 0'):
-      est.train(dummy_input_fn, max_steps=-1)
-
-  def test_scaffold_is_used(self):
-    self.is_init_fn_called = False
-
-    def _init_fn(scaffold, sess):
-      _, _ = scaffold, sess
-      self.is_init_fn_called = True
-
-    def _model_fn_scaffold(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          scaffold=training.Scaffold(init_fn=_init_fn))
-
-    est = estimator.Estimator(model_fn=_model_fn_scaffold)
-    est.train(dummy_input_fn, steps=1)
-    self.assertTrue(self.is_init_fn_called)
-
-  def test_hooks_should_be_session_run_hook(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
-    with self.assertRaisesRegexp(TypeError, 'must be a SessionRunHook'):
-      est.train(dummy_input_fn, steps=1, hooks=['NotAHook'])
-
-  def test_training_hooks_are_used(self):
-    chief_hook = test.mock.MagicMock(
-        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
-    hook = test.mock.MagicMock(
-        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
-
-    def _model_fn_hooks(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          training_chief_hooks=[chief_hook],
-          training_hooks=[hook])
-
-    est = estimator.Estimator(model_fn=_model_fn_hooks)
-    self.assertFalse(chief_hook.begin.called)
-    self.assertFalse(hook.begin.called)
-    est.train(dummy_input_fn, steps=1)
-    self.assertTrue(chief_hook.begin.called)
-    self.assertTrue(hook.begin.called)
-
-  def test_saving_listeners_are_used(self):
-    listener = test.mock.Mock(spec=training.CheckpointSaverListener)
-    listener.after_save.return_value = None
-    est = estimator.Estimator(
-        model_fn=model_fn_global_step_incrementer,
-        config=run_config.RunConfig(save_checkpoints_steps=10))
-    est.train(dummy_input_fn, steps=26, saving_listeners=[listener])
-    self.assertEqual(4, listener.before_save.call_count)
-    self.assertEqual(4, listener.after_save.call_count)
-
-  def test_saver_hook_should_exist_to_use_saving_listeners(self):
-    listener = test.mock.Mock(spec=training.CheckpointSaverListener)
-    est = estimator.Estimator(
-        model_fn=model_fn_global_step_incrementer,
-        config=run_config.RunConfig(save_checkpoints_steps=None,
-                                    save_checkpoints_secs=None))
-    with self.assertRaisesRegexp(
-        ValueError, 'CheckpointSaverHook to use saving_listeners'):
-      est.train(dummy_input_fn, steps=1, saving_listeners=[listener])
-
-  def test_listeners_should_be_listeners(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
-    with self.assertRaisesRegexp(
-        TypeError, 'must be a list of CheckpointSaverListener'):
-      est.train(dummy_input_fn, steps=1, saving_listeners=['not-a-listener'])
-
-  def test_chief_only_hook_should_not_be_called_on_non_chief(self):
-    chief_hook = test.mock.MagicMock(
-        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
-    hook = test.mock.MagicMock(
-        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
-
-    def _model_fn_hooks(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          training_chief_hooks=[chief_hook],
-          training_hooks=[hook])
-
-    class NonChiefRunConfig(run_config.RunConfig):
-      @property
-      def is_chief(self):  # pylint: disable=g-wrong-blank-lines
-        return False
-
-    # Mocking the SessionManager.wait_for_session, so that worker doesn't wait
-    # for chief.
-    def get_initialized_session(*args, **kwargs):
-      # Session doesn't take 'max_wait_secs' argument.
-      kwargs.pop('max_wait_secs', None)
-      scaffold = training.Scaffold().finalize()
-      sess = session.Session(*args, **kwargs)
-      sess.run(scaffold.init_op)
-      return sess
-
-    with test.mock.patch.object(
-        training.SessionManager,
-        'wait_for_session',
-        side_effect=get_initialized_session):
-      est = estimator.Estimator(
-          model_fn=_model_fn_hooks, config=NonChiefRunConfig())
-      self.assertFalse(chief_hook.begin.called)
-      self.assertFalse(hook.begin.called)
-      est.train(dummy_input_fn, steps=1)
-      self.assertFalse(chief_hook.begin.called)
-      self.assertTrue(hook.begin.called)
-
-  def test_features_labels_mode(self):
-    given_features = {'test-features': [[1], [1]]}
-    given_labels = {'test-labels': [[1], [1]]}
-
-    def _input_fn():
-      return given_features, given_labels
-
-    def _model_fn(features, labels, mode):
-      self.features, self.labels, self.mode = features, labels, mode
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[0.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(_input_fn, steps=1)
-    self.assertEqual(given_features, self.features)
-    self.assertEqual(given_labels, self.labels)
-    self.assertEqual(model_fn_lib.ModeKeys.TRAIN, self.mode)
-
-  def test_graph_initialization_global_step_and_random_seed(self):
-    expected_random_seed = run_config.RunConfig().tf_random_seed
-    def _model_fn(features, labels, mode):
-      _, _, _ = features, labels, mode
-      self.assertIsNotNone(training.get_global_step())
-      self.assertEqual(expected_random_seed, ops.get_default_graph().seed)
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[0.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-
-  def test_config_should_not_be_evaluator_or_ps(self):
-
-    class FakeEvaluatorConfig(run_config.RunConfig):
-
-      @property
-      def task_type(self):
-        return run_config.TaskType.EVALUATOR
-
-    est = estimator.Estimator(
-        model_fn=dummy_model_fn, config=FakeEvaluatorConfig())
-    with self.assertRaisesRegexp(ValueError, 'train_and_evaluate'):
-      est.train(dummy_input_fn, steps=1)
-
-  def test_master_distributed_hooks(self):
-    tf_config = json.dumps({
-        'cluster': {
-            run_config.TaskType.PS: ['localhost:1234'],
-            run_config.TaskType.WORKER: ['localhost:1235'],
-            run_config.TaskType.MASTER: ['localhost:1236']
-        },
-        'task': {
-            'type': run_config.TaskType.MASTER,
-            'index': 0
-        }
-    })
-    with test.mock.patch.dict('os.environ', {'TF_CONFIG': tf_config}):
-      est = estimator.Estimator(
-          model_fn=model_fn_global_step_incrementer,
-          config=run_config.RunConfig())
-
-    with test.mock.patch.object(training,
-                                'MonitoredTrainingSession') as mock_sess:
-      est.train(dummy_input_fn, steps=1)
-      self.assertFalse(
-          any(
-              isinstance(hook, basic_session_run_hooks.SummarySaverHook)
-              for hook in mock_sess.call_args[1]['hooks']))
-      self.assertFalse(
-          any(
-              isinstance(hook, basic_session_run_hooks.StepCounterHook)
-              for hook in mock_sess.call_args[1]['hooks']))
-      self.assertEqual(0, mock_sess.call_args[1]['save_summaries_steps'])
-      self.assertIsNone(mock_sess.call_args[1]['log_step_count_steps'])
-
-  def test_master_distributed_hooks_for_worker_0(self):
-    tf_config = json.dumps({
-        'cluster': {
-            run_config.TaskType.PS: ['localhost:1234'],
-            run_config.TaskType.WORKER: ['localhost:1235'],
-            run_config.TaskType.MASTER: ['localhost:1236']
-        },
-        'task': {
-            'type': run_config.TaskType.WORKER,
-            'index': 0
-        }
-    })
-    with test.mock.patch.dict('os.environ', {'TF_CONFIG': tf_config}):
-      est = estimator.Estimator(
-          model_fn=model_fn_global_step_incrementer,
-          config=run_config.RunConfig())
-
-    with test.mock.patch.object(training,
-                                'MonitoredTrainingSession') as mock_sess:
-      est.train(dummy_input_fn, steps=1)
-      self.assertTrue(
-          any(
-              isinstance(hook, basic_session_run_hooks.SummarySaverHook)
-              for hook in mock_sess.call_args[1]['hooks']))
-      self.assertTrue(
-          any(
-              isinstance(hook, basic_session_run_hooks.StepCounterHook)
-              for hook in mock_sess.call_args[1]['hooks']))
-      self.assertEqual(0, mock_sess.call_args[1]['save_summaries_steps'])
-      self.assertIsNone(mock_sess.call_args[1]['log_step_count_steps'])
-
-  def test_master_distributed_hooks_for_worker_nonzero(self):
-    tf_config = json.dumps({
-        'cluster': {
-            run_config.TaskType.PS: ['localhost:1234'],
-            run_config.TaskType.WORKER: ['localhost:1235', 'localhost:1237'],
-            run_config.TaskType.MASTER: ['localhost:1236']
-        },
-        'task': {
-            'type': run_config.TaskType.WORKER,
-            'index': 1
-        }
-    })
-    with test.mock.patch.dict('os.environ', {'TF_CONFIG': tf_config}):
-      est = estimator.Estimator(
-          model_fn=model_fn_global_step_incrementer,
-          config=run_config.RunConfig())
-
-    with test.mock.patch.object(training,
-                                'MonitoredTrainingSession') as mock_sess:
-      est.train(dummy_input_fn, steps=1)
-      self.assertFalse(
-          any(
-              isinstance(hook, basic_session_run_hooks.SummarySaverHook)
-              for hook in mock_sess.call_args[1]['hooks']))
-      self.assertFalse(
-          any(
-              isinstance(hook, basic_session_run_hooks.StepCounterHook)
-              for hook in mock_sess.call_args[1]['hooks']))
-      self.assertEqual(0, mock_sess.call_args[1]['save_summaries_steps'])
-      self.assertIsNone(mock_sess.call_args[1]['log_step_count_steps'])
-
-
-def _model_fn_with_eval_metric_ops(features, labels, mode, params):
-  _, _ = features, labels
-  global_step = training.get_global_step()
-  loss = constant_op.constant(1.)
-  metric_name_1 = params.get('metric_name') or 'metric'
-  metric_value_1 = params.get('metric_value') or 2.
-  metric_name_2 = params.get('metric_name_2') or 'metric2'
-  metric_value_2 = params.get('metric_value_2') or 2.
-
-  metric_update_op = loss.op
-  metric_tensor = control_flow_ops.with_dependencies(
-      [metric_update_op], constant_op.constant(metric_value_1))
-
-  mean = metrics_module.Mean()
-  mean.update_state(metric_value_2)
-  return model_fn_lib.EstimatorSpec(
-      mode,
-      loss=loss,
-      predictions={'predictions': constant_op.constant(1.)},
-      train_op=state_ops.assign_add(global_step, 1),
-      eval_metric_ops={
-          metric_name_1: (metric_tensor, metric_update_op),
-          metric_name_2: mean,
-      })
-
-
-class _StepCounterHook(session_run_hook.SessionRunHook):
-  """Hooks that counts the number of times it is called."""
-
-  def __init__(self):
-    self._steps = 0
-
-  def before_run(self, run_context):
-    del run_context
-    self._steps += 1
-
-  @property
-  def steps(self):
-    return self._steps
-
-
-class EstimatorGetVariablesTest(test.TestCase):
-
-  def test_model_should_be_trained(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      variables.VariableV1(1., name='one')
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    with self.assertRaisesRegexp(ValueError, 'not find trained model'):
-      est.get_variable_names()
-    with self.assertRaisesRegexp(ValueError, 'not find trained model'):
-      est.get_variable_value('one')
-
-  def test_get_variable_utils(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      variables.VariableV1(1., name='one')
-      variables.VariableV1(3., name='three')
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(input_fn=dummy_input_fn, steps=1)
-    self.assertEqual(
-        set(['one', 'three', 'global_step']), set(est.get_variable_names()))
-    self.assertEqual(1., est.get_variable_value('one'))
-    self.assertEqual(3., est.get_variable_value('three'))
-
-
-class EstimatorDatasetIntegrationTest(test.TestCase):
-  """Tests dataset integration."""
-
-  def test_returned_by_input_fn(self):
-
-    def _input_fn():
-      return dataset_ops.Dataset.from_tensors(([1.], [2.]))
-
-    def _model_fn(features, labels, mode):
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=features + labels,  # 1 + 2
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(_input_fn, steps=1)
-    scores = est.evaluate(_input_fn, steps=1)
-    self.assertEqual(3., scores[model_fn_lib.LOSS_METRIC_KEY])
-
-  def test_with_none_labels(self):
-
-    def _input_fn():
-      return dataset_ops.Dataset.from_tensors([7.])
-
-    def _model_fn(features, labels, mode):
-      self.assertIsNone(labels)
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=features,  # 7
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(_input_fn, steps=1)
-    scores = est.evaluate(_input_fn, steps=1)
-    self.assertEqual(7., scores[model_fn_lib.LOSS_METRIC_KEY])
-
-  def test_with_predict(self):
-
-    def _input_fn():
-      return dataset_ops.Dataset.from_tensors([10.])
-
-    def _model_fn(features, labels, mode):
-      _ = labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          predictions=features,  # 10
-          loss=features,  # 10
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(_input_fn, steps=1)
-    self.assertEqual([10.], next(est.predict(input_fn=_input_fn)))
-
-  def test_batching(self):
-
-    def _input_fn():
-      return dataset_ops.Dataset.from_tensor_slices(([[1.], [2.]],
-                                                     [[10.], [20.]])).batch(1)
-
-    def _model_fn(features, labels, mode):
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          predictions=features,
-          loss=features + (0 if labels is None else labels),  # 11, 22
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(_input_fn)
-    scores = est.evaluate(_input_fn)
-    # (11 + 22)/2 = 16.5
-    self.assertEqual(16.5, scores[model_fn_lib.LOSS_METRIC_KEY])
-    self.assertEqual([1., 2.], list(est.predict(_input_fn)))
-
-
-class EstimatorEvaluateTest(test.TestCase):
-
-  def test_eval_dir(self):
-    est = estimator.Estimator(
-        model_fn=model_fn_global_step_incrementer,
-        model_dir='some_path')
-    expected_eval_dir = os.path.join('some_path', 'eval')
-    self.assertEqual(expected_eval_dir, est.eval_dir())
-    expected_eval_dir_name = os.path.join('some_path', 'eval_a_name')
-    self.assertEqual(expected_eval_dir_name, est.eval_dir('a_name'))
-
-  def test_input_fn_args(self):
-    expected_mode = model_fn_lib.ModeKeys.EVAL
-    expected_params = {'batch_size': 10}
-    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
-    input_fn_call_count = [0]
-
-    def _model_fn(features, labels, mode, params, config):
-      del params, config
-      return model_fn_global_step_incrementer(features, labels, mode)
-
-    def _input_fn(mode, params, config):
-      input_fn_call_count[0] += 1
-      self.assertEqual(expected_mode, mode)
-      self.assertEqual(expected_params, params)
-      self.assertEqual(4321, config.tf_random_seed)
-      return dummy_input_fn()
-
-    est = estimator.Estimator(model_fn=_model_fn,
-                              params=expected_params,
-                              config=expected_config)
-    est.train(dummy_input_fn, steps=1)
-    self.assertEqual(0, input_fn_call_count[0])
-    est.evaluate(_input_fn, steps=1)
-    self.assertEqual(1, input_fn_call_count[0])
-
-  def test_model_fn_must_return_estimator_spec(self):
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      if mode == model_fn_lib.ModeKeys.EVAL:
-        return 'NotGoodNotGood'
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(1.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    with self.assertRaisesRegexp(
-        ValueError, 'model_fn should return an EstimatorSpec'):
-      est.evaluate(dummy_input_fn, steps=1)
-
-  def test_no_checkpoint_uses_init(self):
-    def _model_fn(features, labels, mode, params):
-      del features, labels, params
-      mean = metrics_module.Mean()
-      mean.update_state(variables.VariableV1(2.) + 1)
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(1.),
-          eval_metric_ops={
-              'mean1': mean,
-              'mean2': metrics_lib.mean(variables.VariableV1(2.) + 1)
-          })
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    scores = est.evaluate(dummy_input_fn, steps=1)
-    # Metric value here is set to 1 + the value of the Variable that is newly
-    # initialized (since there is no checkpoint).
-    self.assertEqual(3., scores['mean1'])
-    self.assertEqual(3., scores['mean2'])
-
-  def test_no_checkpoint_uses_init_with_warm_starting(self):
-    def _make_model_fn(x):
-      def _variable_creating_and_export_model_fn(features, labels, mode):
-        _, _ = features, labels
-        x_var = variable_scope.get_variable('x', initializer=x)
-        global_step = training.get_global_step()
-        mean = metrics_module.Mean()
-        mean.update_state(x_var + 1)
-        return model_fn_lib.EstimatorSpec(
-            mode,
-            predictions={'y': constant_op.constant(1.0)},
-            loss=constant_op.constant(1.),
-            eval_metric_ops={
-                'mean1': mean,
-                'mean2': metrics_lib.mean(x_var + 1)
-            },
-            train_op=state_ops.assign_add(global_step, 1),
-            export_outputs={
-                'test':
-                    export_output.ClassificationOutput(
-                        constant_op.constant([4.2]),
-                        constant_op.constant(['label']))
-            })
-
-      return _variable_creating_and_export_model_fn
-
-    first_est = estimator.Estimator(model_fn=_make_model_fn(42.))
-    first_est.train(dummy_input_fn, steps=10)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    tmpdir = tempfile.mkdtemp()
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    exported_path = first_est.export_savedmodel(export_dir_base,
-                                                serving_input_receiver_fn)
-
-    # Test that we can pass either warm_start_from as an external checkpoint
-    # or an exported SavedModel.
-    est = estimator.Estimator(model_fn=_make_model_fn(52.),
-                              warm_start_from=exported_path)
-    eval_metrics = est.evaluate(dummy_input_fn, steps=1)
-    # Metric value here is set to 1 + the value of the Variable that is
-    # warm-started from the SavedModel of the first model (42.), as opposed to
-    # the initialization in the new model_fn (52.).
-    self.assertEqual(43., eval_metrics['mean1'])
-    self.assertEqual(43., eval_metrics['mean2'])
-
-    est = estimator.Estimator(model_fn=_make_model_fn(62.),
-                              warm_start_from=first_est.model_dir)
-    eval_metrics = est.evaluate(dummy_input_fn, steps=1)
-    # Metric value here is set to 1 + the value of the Variable that is
-    # warm-started from a checkpoint of the first model (42.), as opposed to
-    # the initialization in the new model_fn (52.).
-    self.assertEqual(43., eval_metrics['mean1'])
-    self.assertEqual(43., eval_metrics['mean2'])
-
-  def test_scores(self):
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops,
-        params={
-            'metric_name': 'metric',
-            'metric_value': 2.,
-            'metric_name_2': 'metric2',
-            'metric_value_2': 3.,
-        })
-    est.train(dummy_input_fn, steps=5)
-    scores = est.evaluate(dummy_input_fn, steps=1)
-    self.assertIn('metric', scores)
-    self.assertAlmostEqual(2., scores['metric'])
-    self.assertIn('metric2', scores)
-    self.assertAlmostEqual(3., scores['metric2'])
-
-  def test_tuple_metrics(self):
-    def _model_fn(features, labels, mode):
-      del features  # unused
-      del labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          loss=constant_op.constant(1.),
-          eval_metric_ops={
-              'nested_metric': (
-                  ((constant_op.constant(2.), constant_op.constant(1)),
-                   constant_op.constant(3., dtype=dtypes.float64)),
-                  control_flow_ops.no_op())})
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    evaluation = est.evaluate(dummy_input_fn, steps=1)
-    ((two_float, one_integer), three_double) = evaluation['nested_metric']
-    self.assertAlmostEqual(2., two_float)
-    self.assertEqual(1, one_integer)
-    self.assertAlmostEqual(3., three_double)
-
-  def test_steps0_raises_error(self):
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops)
-    est.train(dummy_input_fn, steps=5)
-    with self.assertRaisesRegexp(ValueError, 'Must specify steps > 0'):
-      est.evaluate(dummy_input_fn, steps=0)
-
-  def test_steps_negative_raises_error(self):
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops)
-    est.train(dummy_input_fn, steps=5)
-    with self.assertRaisesRegexp(ValueError, 'Must specify steps > 0'):
-      est.evaluate(dummy_input_fn, steps=-1)
-
-  def test_global_step_metric_raises_error(self):
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops,
-        params={
-            'metric_name': 'global_step',
-            'metric_value': 2.})
-    est.train(dummy_input_fn, steps=5)
-    with self.assertRaisesRegexp(
-        ValueError, 'Metric with name `global_step` is not allowed'):
-      est.evaluate(dummy_input_fn, steps=1)
-
-  def test_global_step_is_reported(self):
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops,
-        params={
-            'metric_name': 'metric',
-            'metric_value': 2.,
-            'metric_name_2': 'metric2',
-            'metric_value_2': 3.,
-        })
-    est.train(dummy_input_fn, steps=5)
-    scores = est.evaluate(dummy_input_fn, steps=1)
-    self.assertIn('global_step', scores)
-    self.assertEqual(5, scores['global_step'])
-
-  def test_loss_metric_is_reported(self):
-
-    def _model_fn_with_incremental_loss(features, labels, mode):
-      _, _ = features, labels
-      local_weight = variables.VariableV1(
-          0., name='local_weight', collections=[ops.GraphKeys.LOCAL_VARIABLES])
-      # Loss will be 2, 4, 6, ...
-      loss = 2 * state_ops.assign_add(local_weight, 1.)
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=loss,
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    est = estimator.Estimator(model_fn=_model_fn_with_incremental_loss)
-    est.train(dummy_input_fn, steps=1)
-    scores = est.evaluate(dummy_input_fn, steps=5)
-    self.assertIn(model_fn_lib.LOSS_METRIC_KEY, scores)
-    # Average loss will be (2 + 4 + 6 + 8 + 10)/5=6
-    self.assertAlmostEqual(6., scores[model_fn_lib.LOSS_METRIC_KEY])
-
-  def test_hooks_should_be_session_run_hook(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
-    est.train(dummy_input_fn, steps=1)
-    with self.assertRaisesRegexp(TypeError, 'must be a SessionRunHook'):
-      est.evaluate(dummy_input_fn, steps=5, hooks=['NotAHook'])
-
-  def test_hooks_are_used(self):
-    step_counter_hook = _StepCounterHook()
-
-    est = estimator.Estimator(model_fn=_model_fn_with_eval_metric_ops)
-    est.train(dummy_input_fn, steps=1)
-    est.evaluate(dummy_input_fn, steps=5, hooks=[step_counter_hook])
-    self.assertEqual(5, step_counter_hook.steps)
-
-  def test_evaluate_from_checkpoint(self):
-    params = {
-        'metric_name': 'metric',
-        'metric_value': 2.,
-        'metric_name_2': 'metric2',
-        'metric_value_2': 3.,
-    }
-    est1 = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops,
-        params=params)
-    est1.train(dummy_input_fn, steps=5)
-    est2 = estimator.Estimator(
-        model_fn=_model_fn_with_eval_metric_ops,
-        params=params)
-    scores = est2.evaluate(
-        dummy_input_fn, steps=1, checkpoint_path=est1.latest_checkpoint())
-    self.assertEqual(5, scores['global_step'])
-
-  def test_wrong_shape_throws_reasonable_error(self):
-    """Make sure we are helpful when model_fns change. See b/110263146."""
-    def _get_model_fn(val=1):
-      def _model_fn(features, labels, mode):
-        del features, labels  # unused
-        variables.VariableV1(val, name='weight')
-        return model_fn_lib.EstimatorSpec(
-            mode=mode,
-            predictions=constant_op.constant([[1.]]),
-            loss=constant_op.constant(0.),
-            train_op=state_ops.assign_add(training.get_global_step(), 1))
-      return _model_fn
-
-    model_fn_1 = _get_model_fn()
-    model_fn_2 = _get_model_fn(val=[1])
-
-    est1 = estimator.Estimator(model_fn=model_fn_1)
-    est1.train(dummy_input_fn, steps=5)
-    est2 = estimator.Estimator(
-        model_fn=model_fn_2, model_dir=est1.model_dir)
-
-    expected_msg = 'Restoring from checkpoint failed.*a mismatch between'
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, expected_msg):
-      est2.train(dummy_input_fn, steps=1,)
-
-  def test_scaffold_is_used(self):
-
-    def _model_fn_scaffold(features, labels, mode):
-      _, _ = features, labels
-      variables.VariableV1(1., name='weight')
-      self.mock_saver = get_mock_saver()
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          predictions=constant_op.constant([[1.]]),
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          scaffold=training.Scaffold(saver=self.mock_saver))
-
-    est = estimator.Estimator(model_fn=_model_fn_scaffold)
-    est.train(dummy_input_fn, steps=1)
-    est.evaluate(dummy_input_fn, steps=1)
-    self.assertTrue(self.mock_saver.restore.called)
-
-  def test_features_labels_mode(self):
-    given_features = {'test-features': [[1], [1]]}
-    given_labels = {'test-labels': [[1], [1]]}
-
-    def _input_fn():
-      return given_features, given_labels
-
-    def _model_fn(features, labels, mode):
-      self.features, self.labels, self.mode = features, labels, mode
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[0.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(_input_fn, steps=1)
-    est.evaluate(_input_fn, steps=1)
-    self.assertEqual(given_features, self.features)
-    self.assertEqual(given_labels, self.labels)
-    self.assertEqual(model_fn_lib.ModeKeys.EVAL, self.mode)
-
-  def test_graph_initialization_global_step_and_random_seed(self):
-    expected_random_seed = run_config.RunConfig().tf_random_seed
-    def _model_fn(features, labels, mode):
-      _, _, _ = features, labels, mode
-      self.assertIsNotNone(training.get_global_step())
-      self.assertEqual(expected_random_seed, ops.get_default_graph().seed)
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[0.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    est.evaluate(dummy_input_fn, steps=1)
-
-  def test_evaluation_hooks_are_used(self):
-    hook = test.mock.MagicMock(
-        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
-
-    def _model_fn_hooks(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          evaluation_hooks=[hook])
-
-    est = estimator.Estimator(model_fn=_model_fn_hooks)
-    est.train(dummy_input_fn, steps=1)
-    self.assertFalse(hook.begin.called)
-    est.evaluate(dummy_input_fn, steps=1)
-    self.assertTrue(hook.begin.called)
-
-  def test_summary_writing_with_summary_proto(self):
-
-    def model_fn_global_step_incrementer_image(features, labels, mode):
-      _, _ = features, labels
-      global_step = training.get_global_step()
-
-      image = array_ops.zeros([5, 3, 3, 1])
-      eval_metric_ops = {
-          'foo': (summary.image('image', image, max_outputs=3),
-                  constant_op.constant(1))
-      }
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(1.),
-          train_op=state_ops.assign_add(global_step, 1),
-          eval_metric_ops=eval_metric_ops)
-
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer_image,
-                              config=run_config.RunConfig(save_summary_steps=1))
-    est.train(dummy_input_fn, steps=200)
-    est.evaluate(
-        input_fn=dummy_input_fn,
-        steps=200,
-    )
-
-    # Make sure nothing is stuck in limbo.
-    writer_cache.FileWriterCache.clear()
-
-    # Get last evaluation Event written.
-    for key in ['foo/0', 'foo/1', 'foo/2']:
-      self.assertTrue(
-          check_eventfile_for_keyword(key, est.eval_dir()),
-          '{} should be part of reported summaries.'.format(key))
-
-    # Verify that evaluated checkpoint path is written to event file.
-    checkpoint_path_tag = 'checkpoint_path'
-    self.assertTrue(
-        check_eventfile_for_keyword(checkpoint_path_tag, est.eval_dir()),
-        '{} should be part of reported summaries.'.format(checkpoint_path_tag))
-
-    expected_tensor_proto = tensor_util.make_tensor_proto(
-        est.latest_checkpoint(), dtype=dtypes.string)
-    summaries = summaries_with_matching_keyword(checkpoint_path_tag,
-                                                est.eval_dir())
-    self.assertProtoEquals(expected_tensor_proto,
-                           next(summaries).value[0].tensor)
-
-  def test_summary_writing_with_tensor(self):
-
-    def model_fn_with_prediction_mean_tensor_eval_metric_ops(
-        features, labels, mode, params):
-      _, _ = features, labels
-      global_step = training.get_global_step()
-
-      metric_name = params.get('metric_name') or 'metric'
-      predictions = constant_op.constant([1., .5, 0.])
-      eval_metric_ops = {metric_name: metrics_lib.mean_tensor(predictions)}
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(1.),
-          predictions={'predictions': predictions},
-          train_op=state_ops.assign_add(global_step, 1),
-          eval_metric_ops=eval_metric_ops)
-
-    metric_key = 'PMT'
-    params = {
-        'metric_name': metric_key,
-    }
-    est = estimator.Estimator(
-        model_fn=model_fn_with_prediction_mean_tensor_eval_metric_ops,
-        params=params,
-        config=run_config.RunConfig(save_summary_steps=1))
-    est.train(input_fn=dummy_input_fn, steps=10)
-    est.evaluate(
-        input_fn=dummy_input_fn,
-        steps=10,
-    )
-
-    writer_cache.FileWriterCache.clear()
-
-    self.assertTrue(
-        check_eventfile_for_keyword(metric_key, est.eval_dir()),
-        '{} should be part of reported summaries.'.format(metric_key))
-
-    summaries = summaries_with_matching_keyword(metric_key, est.eval_dir())
-    for value in next(summaries).value:
-      if value.tag == metric_key:
-        self.assertTrue(value.HasField('tensor'))
-
-
-class EstimatorPredictTest(test.TestCase):
-
-  def test_input_fn_args(self):
-    expected_mode = model_fn_lib.ModeKeys.PREDICT
-    expected_params = {'batch_size': 10}
-    expected_config = run_config.RunConfig().replace(tf_random_seed=4321)
-    input_fn_call_count = [0]
-
-    def _model_fn(features, labels, mode, params, config):
-      del features, labels, params, config
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.]]))
-
-    def _input_fn(mode, params, config):
-      input_fn_call_count[0] += 1
-      self.assertEqual(expected_mode, mode)
-      self.assertEqual(expected_params, params)
-      self.assertEqual(4321, config.tf_random_seed)
-      return dummy_input_fn()
-
-    est = estimator.Estimator(model_fn=_model_fn,
-                              params=expected_params,
-                              config=expected_config)
-    est.train(dummy_input_fn, steps=1)
-    self.assertEqual(0, input_fn_call_count[0])
-    next(est.predict(_input_fn))
-    self.assertEqual(1, input_fn_call_count[0])
-
-  def test_no_checkpoint_uses_init(self):
-    def _model_fn(features, labels, mode, params, config):
-      del features, labels, params, config
-      x = variables.VariableV1([[3.]], name='x')
-      return model_fn_lib.EstimatorSpec(mode, predictions=math_ops.add(x, 1.))
-    est = estimator.Estimator(model_fn=_model_fn)
-    # Expected prediction value is 1 + the value of the Variable that is newly
-    # initialized (since there is no checkpoint).
-    self.assertEqual(4., next(est.predict(dummy_input_fn)))
-
-  def test_no_checkpoint_uses_init_with_warm_starting(self):
-    def _make_model_fn(x):
-      def _variable_creating_and_export_model_fn(features, labels, mode):
-        _, _ = features, labels
-        x_var = variables.VariableV1([[x]], name='x')
-        return model_fn_lib.EstimatorSpec(
-            mode,
-            predictions=math_ops.add(x_var, 1.),
-            loss=constant_op.constant(1.),
-            train_op=state_ops.assign_add(training.get_global_step(), 1),
-            export_outputs={'test': export_output.ClassificationOutput(
-                constant_op.constant([4.2]),
-                constant_op.constant(['label']))})
-      return _variable_creating_and_export_model_fn
-
-    first_est = estimator.Estimator(model_fn=_make_model_fn(3.))
-    first_est.train(dummy_input_fn, steps=10)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    tmpdir = tempfile.mkdtemp()
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    exported_path = first_est.export_savedmodel(export_dir_base,
-                                                serving_input_receiver_fn)
-
-    # Test that we can pass either warm_start_from as an external checkpoint
-    # or an exported SavedModel.
-    est = estimator.Estimator(model_fn=_make_model_fn(30.),
-                              warm_start_from=exported_path)
-    # Prediction here is set to 1 + the value of the Variable that is
-    # warm-started from the SavedModel of the first model (3.), as opposed to
-    # the initialization in the new model_fn (30.).
-    self.assertEqual(4., next(est.predict(dummy_input_fn)))
-
-    est = estimator.Estimator(model_fn=_make_model_fn(40.),
-                              warm_start_from=first_est.model_dir)
-    # Prediction here is set to 1 + the value of the Variable that is
-    # warm-started from a checkpoint of the first model (3.), as opposed to
-    # the initialization in the new model_fn (40.).
-    self.assertEqual(4., next(est.predict(dummy_input_fn)))
-
-  def test_no_trained_model_invalid_checkpoint_path(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
-    with self.assertRaises(ValueError):
-      next(
-          est.predict(
-              dummy_input_fn,
-              checkpoint_path=
-              checkpoint_management.latest_checkpoint('fakedir')))
-
-  def test_tensor_predictions(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    self.assertEqual(10., next(est.predict(dummy_input_fn)))
-
-  def test_predictionhooks_are_used(self):
-    hook = test.mock.MagicMock(
-        wraps=training.SessionRunHook(), spec=training.SessionRunHook)
-
-    def _model_fn_hooks(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.]]),
-          prediction_hooks=[hook])
-
-    est = estimator.Estimator(model_fn=_model_fn_hooks)
-    est.train(dummy_input_fn, steps=1)
-    self.assertFalse(hook.begin.called)
-    next(est.predict(dummy_input_fn))
-    self.assertTrue(hook.begin.called)
-
-  def test_warn_if_no_queue_runner(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      next(est.predict(dummy_input_fn))
-      self.assertRegexpMatches(
-          str(mock_log.call_args),
-          'Input graph does not.*contain a QueueRunner.')
-
-  def test_skip_warn_if_dataset_returns_features(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.]]))
-
-    def _input_fn():
-      it = dataset_ops.Dataset.from_tensors([1]).make_one_shot_iterator()
-      return it.get_next()
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      next(est.predict(_input_fn))
-      # The warning should not have keyword QueueRunner.
-      self.assertRegexpMatches(str(mock_log.call_args), '^((?!QueueRunner).)*$')
-
-  def test_skip_warn_if_dataset_returns_features_dict(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.]]))
-
-    def _input_fn():
-      it = dataset_ops.Dataset.from_tensors([1]).make_one_shot_iterator()
-      features = {'age': it.get_next()}
-      return features
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      next(est.predict(_input_fn))
-      # The warning should not have keyword QueueRunner.
-      self.assertRegexpMatches(str(mock_log.call_args), '^((?!QueueRunner).)*$')
-
-  def test_input_fn_can_return_just_features(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-
-    def _only_features():
-      return {'x': constant_op.constant([[0.]])}
-
-    self.assertEqual([10.], next(est.predict(_only_features)))
-
-  def test_batch_size_mismatch(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions={
-              'y1': constant_op.constant([[10.]]),
-              'y2': constant_op.constant([[12.], [13]])
-          })
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    with self.assertRaisesRegexp(ValueError,
-                                 'Batch length of predictions should be same'):
-      next(est.predict(dummy_input_fn))
-
-  def test_iterate_batches(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions={
-              # First dim is different but the prediction should still work
-              'y1': array_ops.zeros(shape=[3]),
-              'y2': array_ops.zeros(shape=[5, 3])
-          })
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-
-    predictions = next(est.predict(dummy_input_fn, yield_single_examples=False))
-    self.assertAllEqual(predictions['y1'].shape, [3])
-    self.assertAllEqual(predictions['y2'].shape, [5, 3])
-
-  def test_predict_keys_defined_for_tensor(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    with self.assertRaisesRegexp(
-        ValueError,
-        'predict_keys argument is not valid in case of non-dict predictions'):
-      next(est.predict(dummy_input_fn, predict_keys=['y']))
-
-  def test_predict_keys_does_not_exists(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions={
-              'y1': constant_op.constant([[10.]]),
-              'y2': constant_op.constant([[12.]])
-          })
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    with self.assertRaisesRegexp(ValueError,
-                                 'Expected to run at least one output from'):
-      next(est.predict(dummy_input_fn, predict_keys=['y3']))
-
-  def test_return_given_predict_keys(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions={
-              'y1': constant_op.constant([[10.]]),
-              'y2': constant_op.constant([[12.]])
-          })
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    results = next(est.predict(dummy_input_fn, predict_keys=['y1']))
-    self.assertIn('y1', results)
-    self.assertNotIn('y2', results)
-
-  def test_yield_rows_of_tensor(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.], [12.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    results = est.predict(dummy_input_fn)
-    self.assertEqual([10.], next(results))
-    self.assertEqual([12.], next(results))
-
-  def test_yield_rows_of_dict(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions={
-              'y1': constant_op.constant([[10.], [12]]),
-              'y2': constant_op.constant([[0.], [2.]])
-          })
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    results = est.predict(dummy_input_fn)
-    self.assertDictEqual({'y1': [10.], 'y2': [0.]}, next(results))
-    self.assertDictEqual({'y1': [12.], 'y2': [2.]}, next(results))
-
-  def test_hooks_should_be_session_run_hook(self):
-    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
-    est.train(dummy_input_fn, steps=1)
-    with self.assertRaisesRegexp(TypeError, 'must be a SessionRunHook'):
-      next(est.predict(dummy_input_fn, hooks=['NotAHook']))
-
-  def test_hooks_are_used(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[10.], [12.]]))
-
-    step_counter_hook = _StepCounterHook()
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    results = est.predict(dummy_input_fn, hooks=[step_counter_hook])
-    self.assertEqual(0, step_counter_hook.steps)  # not called yet
-    next(results)
-    self.assertEqual(1, step_counter_hook.steps)  # first call
-    next(results)
-    self.assertEqual(1, step_counter_hook.steps)  # it's in same batch
-    next(results)
-    self.assertEqual(2, step_counter_hook.steps)  # next batch
-
-  def test_predict_from_old_model_dir(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      v = variables.VariableV1([[16.]], name='weight')
-      prediction = v * 2
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=prediction)
-
-    est1 = estimator.Estimator(model_fn=_model_fn)
-    est1.train(dummy_input_fn, steps=1)
-    est2 = estimator.Estimator(model_fn=_model_fn, model_dir=est1.model_dir)
-    self.assertEqual([32.], next(est2.predict(dummy_input_fn)))
-
-  def test_predict_from_checkpoint_path(self):
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      v = variables.VariableV1([[16.]], name='weight')
-      prediction = v * 2
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=prediction)
-
-    est1 = estimator.Estimator(model_fn=_model_fn)
-    est1.train(dummy_input_fn, steps=1)
-    est2 = estimator.Estimator(model_fn=_model_fn, model_dir=est1.model_dir)
-    self.assertEqual([32.],
-                     next(
-                         est2.predict(
-                             dummy_input_fn,
-                             checkpoint_path=est2.latest_checkpoint())))
-
-  def test_scaffold_is_used(self):
-
-    def _model_fn_scaffold(features, labels, mode):
-      _, _ = features, labels
-      variables.VariableV1(1., name='weight')
-      self.mock_saver = get_mock_saver()
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          predictions=constant_op.constant([[1.]]),
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          scaffold=training.Scaffold(saver=self.mock_saver))
-
-    est = estimator.Estimator(model_fn=_model_fn_scaffold)
-    est.train(dummy_input_fn, steps=1)
-    next(est.predict(dummy_input_fn))
-    self.assertTrue(self.mock_saver.restore.called)
-
-  def test_features_labels_mode(self):
-    given_features = {'test-features': [[1], [1]]}
-    given_labels = {'test-labels': [[1], [1]]}
-
-    def _input_fn():
-      return given_features, given_labels
-
-    def _model_fn(features, labels, mode):
-      self.features, self.labels, self.mode = features, labels, mode
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[0.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(_input_fn, steps=1)
-    next(est.predict(_input_fn))
-    self.assertEqual(given_features, self.features)
-    self.assertIsNone(self.labels)
-    self.assertEqual(model_fn_lib.ModeKeys.PREDICT, self.mode)
-
-  def test_graph_initialization_global_step_and_random_seed(self):
-    expected_random_seed = run_config.RunConfig().tf_random_seed
-    def _model_fn(features, labels, mode):
-      _, _, _ = features, labels, mode
-      self.assertIsNotNone(training.get_global_step())
-      self.assertEqual(expected_random_seed, ops.get_default_graph().seed)
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[0.]]))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    next(est.predict(dummy_input_fn))
-
-
-def _model_fn_for_export_tests(features, labels, mode):
-  _, _ = features, labels
-  variables.VariableV1(1., name='weight')
-  scores = constant_op.constant([3.])
-  classes = constant_op.constant(['wumpus'])
-  update_global_step = state_ops.assign_add(training.get_global_step(), 1)
-  with ops.control_dependencies([update_global_step]):
-    train_op = constant_op.constant(2.)
-  return model_fn_lib.EstimatorSpec(
-      mode,
-      predictions=constant_op.constant(10.),
-      loss=constant_op.constant(1.),
-      train_op=train_op,
-      export_outputs={
-          'test': export_output.ClassificationOutput(scores, classes)})
-
-
-def _x_y_input_fn():
-  return ({'x': constant_op.constant([[1], [1]]),
-           'y': constant_op.constant([[2], [2]])},
-          constant_op.constant([[1], [1]]))
-
-
-def _model_fn_with_x_y(features, labels, mode):
-  _ = labels
-  variables.VariableV1(1., name='weight')
-  scores = constant_op.constant([3.])
-  classes = constant_op.constant(['wumpus'])
-  if mode == model_fn_lib.ModeKeys.PREDICT:
-    variables.VariableV1(36., name='name_collision')
-    return model_fn_lib.EstimatorSpec(
-        mode,
-        predictions=constant_op.constant(10.),
-        export_outputs={
-            'test': export_output.ClassificationOutput(scores, classes)})
-  else:
-    prefix = 'eval_' if mode == model_fn_lib.ModeKeys.EVAL else ''
-
-    multiplied = math_ops.multiply(
-        features['x'], features['y'], name='{}multiplied'.format(prefix))
-    mean = metrics_module.Mean(name='{}mean'.format(prefix))
-    mean.update_state(features['x'] - features['y'])
-    eval_metrics = {
-        'mean1':
-            mean,
-        'mean2':
-            metrics_lib.mean(
-                features['x'] - features['y'], name='{}mean'.format(prefix))
-    }
-    variables.VariableV1(1., name='later_var')
-    variables.VariableV1(3., name='name_collision')
-    return model_fn_lib.EstimatorSpec(
-        mode,
-        predictions=multiplied,
-        loss=constant_op.constant(1.),
-        train_op=state_ops.assign_add(training.get_global_step(), 1),
-        eval_metric_ops=eval_metrics)
-
-
-def _model_fn_with_saveables_for_export_tests(features, labels, mode):
-  _, _ = features, labels
-  table = saver_test_utils.CheckpointedOp(name='v2')
-  update_global_step = state_ops.assign_add(training.get_global_step(), 1)
-  with ops.control_dependencies([update_global_step]):
-    train_op = table.insert('k1', 30.0)
-  prediction = table.lookup('k1', 0.0)
-  return model_fn_lib.EstimatorSpec(
-      mode,
-      predictions=prediction,
-      loss=constant_op.constant(1.),
-      train_op=train_op,
-      export_outputs={
-          'test': export_output.PredictOutput({'prediction': prediction})})
-
-
-def _get_serving_input_receiver_fn():
-  feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                  'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-  return export.build_parsing_serving_input_receiver_fn(feature_spec)
-
-
-def _get_supervised_input_receiver_fn():
-  feature_spec = {
-      'x': array_ops.placeholder(
-          dtype=dtypes.int64, shape=(2, 1), name='feature_x'),
-      'y': array_ops.placeholder(
-          dtype=dtypes.int64, shape=(2, 1), name='feature_y')
-      }
-  label_spec = array_ops.placeholder(
-      dtype=dtypes.float32, shape=[1], name='truth')
-
-  return export.build_raw_supervised_input_receiver_fn(feature_spec, label_spec)
-
-
-_VOCAB_FILE_CONTENT = 'emerson\nlake\npalmer\n'
-_EXTRA_FILE_CONTENT = 'kermit\npiggy\nralph\n'
-
-
-class EstimatorExportTest(test.TestCase):
-
-  def test_export_savedmodel_proto_roundtrip_raw_receiver(self):
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
-    est.train(input_fn=dummy_input_fn, steps=1)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est.export_savedmodel(
-        export_dir_base, serving_input_receiver_fn)
-
-    # Check that all the files are in the right places.
-    self.assertTrue(gfile.Exists(export_dir_base))
-    self._validate_exported_files(export_dir)
-
-    # Restore, to validate that the export was well-formed.
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('input_example_tensor' in graph_ops)
-        self.assertTrue('ParseExample/ParseExample' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-  def test_export_saved_model_train(self):
-    self._test_export_saved_model_for_mode(
-        _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.TRAIN)
-
-  def test_export_saved_model_eval(self):
-    self._test_export_saved_model_for_mode(
-        _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.EVAL)
-
-  def test_export_saved_model_predict(self):
-    self._test_export_saved_model_for_mode(
-        _get_serving_input_receiver_fn(), model_fn_lib.ModeKeys.PREDICT)
-
-  def _test_export_saved_model_for_mode(self, input_receiver_fn, mode):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
-    est.train(input_fn=_x_y_input_fn, steps=1)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est._export_saved_model_for_mode(
-        export_dir_base, input_receiver_fn, mode=mode)
-
-    # Check that all the files are in the right places.
-    self.assertTrue(gfile.Exists(export_dir_base))
-    self._validate_exported_files(export_dir)
-
-    # Restore, to validate that the export was well-formed.
-    tag_set = model_fn_lib.EXPORT_TAG_MAP[mode]
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, tag_set, export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertFalse('name_collision_1' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_receiver_map(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('input_example_tensor' in graph_ops)
-        self.assertTrue('ParseExample/ParseExample' in graph_ops)
-        self.assertFalse('feature_x' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_train_only(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('multiplied' in graph_ops)
-        self.assertTrue('mean/update_op' in graph_ops)
-        self.assertFalse('eval_multiplied' in graph_ops)
-        self.assertTrue('feature_x' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_eval_only(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.EVAL], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('eval_multiplied' in graph_ops)
-        self.assertTrue('eval_mean/value' in graph_ops)
-        self.assertFalse('multiplied' in graph_ops)
-        self.assertTrue('feature_x' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_no_serving(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('multiplied' in graph_ops)
-        self.assertFalse('eval_multiplied' in graph_ops)
-        self.assertTrue('feature_x' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.EVAL], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('eval_multiplied' in graph_ops)
-        self.assertFalse('multiplied' in graph_ops)
-        # TODO(karmel): is this the desired behavior when names are shared?
-        self.assertTrue('feature_x_1' in graph_ops)
-        self.assertTrue('feature_y_1' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_three_defs(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    # Restore, to validate that the export was well-formed.
-    for tag_set in model_fn_lib.EXPORT_TAG_MAP.values():
-      with ops.Graph().as_default() as graph:
-        with session.Session(graph=graph) as sess:
-          loader.load(sess, tag_set, export_dir)
-          graph_ops = [x.name for x in graph.get_operations()]
-          self.assertTrue('global_step/Assign' in graph_ops)
-          self.assertTrue('global_step/Initializer/zeros' in graph_ops)
-          self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_proto_roundtrip_all_vars(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('later_var' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertFalse('later_var' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_all_saved_models_name_collision(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-    export_dir, tmpdir = self._test_export_all_saved_models(
-        input_receiver_fn_map)
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('name_collision' in graph_ops)
-        self.assertFalse('name_collision_1' in graph_ops)
-        collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-        self.assertEqual(3, collection_vars[-1].eval())
-
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('name_collision' in graph_ops)
-        self.assertFalse('name_collision_1' in graph_ops)
-        collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-        # This is a non-obvious detail: when we load the estimator spec
-        # for predict, name_collision gets set to 36. However, we then restore
-        # from checkpoint, which should overwrite that var and make it the 3
-        # from training. In practice, this would not be a good way to write
-        # a model_fn, but leaving this check in for now to ensure consistency
-        # with what would happen given our current order of spec, then
-        # checkpoint.
-        self.assertEqual(3, collection_vars[-1].eval())
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def _test_export_all_saved_models(self, input_receiver_fn_map):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_with_x_y)
-    est.train(input_fn=_x_y_input_fn, steps=1)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est._export_all_saved_models(
-        export_dir_base, input_receiver_fn_map)
-
-    # Check that all the files are in the right places.
-    self.assertTrue(gfile.Exists(export_dir_base))
-
-    self._validate_exported_files(export_dir)
-
-    return export_dir, tmpdir
-
-  def _validate_exported_files(self, export_dir):
-    self.assertTrue(gfile.Exists(export_dir))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('saved_model.pb'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables/variables.index'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables/variables.data-00000-of-00001'))))
-
-  def test_export_all_saved_models_var_not_found(self):
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-
-    def _model_fn_with_predict_only_vars(features, labels, mode):
-      _, _ = features, labels
-      if mode == model_fn_lib.ModeKeys.PREDICT:
-        variables.VariableV1(1., name='only_in_predict')
-      else:
-        variables.VariableV1(1., name='otherwise')
-
-      prediction = constant_op.constant(1.)
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          predictions=prediction,
-          loss=constant_op.constant(1.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          export_outputs={
-              'test': export_output.PredictOutput({'prediction': prediction})
-          })
-
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_with_predict_only_vars)
-    est.train(input_fn=_x_y_input_fn, steps=1)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-
-    err_regex = r'Could not load all requested variables[\w\W]*infer'
-    with self.assertRaisesRegexp(ValueError, err_regex):
-      est._export_all_saved_models(export_dir_base, input_receiver_fn_map)
-
-  def test_export_all_saved_models_metric_operation(self):
-    """Ensures metrics ops.Operations can be expoerted (b/109740581)."""
-
-    def _model_fn(features, labels, mode):
-      del features, labels  # Unused
-      metric_obj = metrics_module.Mean()
-      metric_obj.update_state(constant_op.constant([0]))
-      eval_metrics = {
-          'metrics1': (constant_op.constant([0]), control_flow_ops.no_op()),
-          'metrics2': metric_obj,
-      }
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          predictions=constant_op.constant(10.),
-          loss=constant_op.constant(1.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          eval_metric_ops=eval_metrics)
-
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(input_fn=dummy_input_fn, steps=1)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('metric_operation_export'))
-
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn()}
-
-    export_dir = est._export_all_saved_models(
-        export_dir_base, input_receiver_fn_map)
-
-    # Restore, to validate that the export was well-formed.
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        meta_graph = loader.load(sess, [tag_constants.EVAL], export_dir)
-        sig_outputs = meta_graph.signature_def[
-            model_fn_lib.ModeKeys.EVAL].outputs
-        self.assertTrue(sig_outputs['metrics1/update_op'].name.startswith(
-            'metric_op_wrapper'))
-        self.assertTrue(sig_outputs['metrics2/update_op'].name.startswith(
-            'metric_op_wrapper'))
-
-  def test_export_savedmodel_with_saveables_proto_roundtrip(self):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(
-        model_fn=_model_fn_with_saveables_for_export_tests)
-    est.train(input_fn=dummy_input_fn, steps=1)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est.export_savedmodel(
-        export_dir_base, serving_input_receiver_fn)
-
-    # Check that all the files are in the right places.
-    self.assertTrue(gfile.Exists(export_dir_base))
-    self.assertTrue(gfile.Exists(export_dir))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('saved_model.pb'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables/variables.index'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables/variables.data-00000-of-00001'))))
-
-    # Restore, to validate that the export was well-formed.
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('input_example_tensor' in graph_ops)
-        self.assertTrue('ParseExample/ParseExample' in graph_ops)
-        # The original saver is used to restore variables
-        self.assertTrue('save/LookupTableImportV2' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_savedmodel_assets(self):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
-    est.train(input_fn=dummy_input_fn, steps=1)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-
-    # Create a fake asset.
-    vocab_file_name = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('my_vocab_file'))
-    vocab_file = gfile.GFile(vocab_file_name, mode='w')
-    vocab_file.write(_VOCAB_FILE_CONTENT)
-    vocab_file.close()
-
-    # hack in an op that uses the asset, in order to test asset export.
-    # this is not actually valid, of course.
-    def serving_input_receiver_with_asset_fn():
-      features, receiver_tensor, _ = serving_input_receiver_fn()
-      filename = ops.convert_to_tensor(vocab_file_name,
-                                       dtypes.string,
-                                       name='asset_filepath')
-      ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
-      features['bogus_filename'] = filename
-
-      return export.ServingInputReceiver(features, receiver_tensor)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est.export_savedmodel(
-        export_dir_base, serving_input_receiver_with_asset_fn)
-
-    # Check that the asset files are in the right places.
-    expected_vocab_file_name = os.path.join(
-        compat.as_bytes(export_dir), compat.as_bytes('assets/my_vocab_file'))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir), compat.as_bytes('assets'))))
-    self.assertTrue(gfile.Exists(expected_vocab_file_name))
-    self.assertEqual(
-        compat.as_bytes(_VOCAB_FILE_CONTENT),
-        compat.as_bytes(gfile.GFile(expected_vocab_file_name).read()))
-
-    # Restore, to validate that the export was well-formed.
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        assets = [
-            x.eval()
-            for x in graph.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
-        ]
-        self.assertItemsEqual([vocab_file_name], assets)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('input_example_tensor' in graph_ops)
-        self.assertTrue('ParseExample/ParseExample' in graph_ops)
-        self.assertTrue('asset_filepath' in graph_ops)
-        self.assertTrue('weight' in graph_ops)
-
-    # cleanup
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_savedmodel_extra_assets(self):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
-    est.train(input_fn=dummy_input_fn, steps=1)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-
-    # Create a fake asset.
-    extra_file_name = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('my_extra_file'))
-    extra_file = gfile.GFile(extra_file_name, mode='w')
-    extra_file.write(_EXTRA_FILE_CONTENT)
-    extra_file.close()
-
-    # Perform the export.
-    assets_extra = {'some/sub/directory/my_extra_file': extra_file_name}
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est.export_savedmodel(export_dir_base,
-                                       serving_input_receiver_fn,
-                                       assets_extra=assets_extra)
-
-    # Check that the asset files are in the right places.
-    expected_extra_path = os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('assets.extra/some/sub/directory/my_extra_file'))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir), compat.as_bytes('assets.extra'))))
-    self.assertTrue(gfile.Exists(expected_extra_path))
-    self.assertEqual(
-        compat.as_bytes(_EXTRA_FILE_CONTENT),
-        compat.as_bytes(gfile.GFile(expected_extra_path).read()))
-
-    # cleanup
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_savedmodel_tensor_features(self):
-    """Test that models accepting a single raw Tensor can be exported.
-
-    See https://github.com/tensorflow/tensorflow/issues/11674
-
-    If the model_fn and receiver_fn accept raw tensors rather than dictionaries
-    as input, export_savedmodel should be okay with that, too.
-
-    """
-
-    tmpdir = tempfile.mkdtemp()
-
-    def _input_fn_tensor_features():
-      t = array_ops.constant([1, 2, 3], dtype=dtypes.float32, shape=[1, 3])
-      return (t, None)
-
-    def _model_fn_tensor_features(features, labels, mode):
-      _ = labels
-      prediction = math_ops.matmul(features, features, transpose_b=True)
-
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          predictions=prediction,
-          loss=constant_op.constant(1.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          export_outputs={
-              'test': export_output.PredictOutput({'prediction': prediction})
-          })
-
-    def _serving_input_receiver_fn():
-      feat = array_ops.placeholder(dtype=dtypes.float32)
-      return export.TensorServingInputReceiver(
-          features=feat, receiver_tensors=feat)
-
-    est = estimator.Estimator(model_fn=_model_fn_tensor_features)
-    est.train(input_fn=_input_fn_tensor_features, steps=1)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est.export_savedmodel(
-        export_dir_base, _serving_input_receiver_fn)
-
-    # Restore, to validate that the export was well-formed.
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name.lower() for x in graph.get_operations()]
-        self.assertTrue('const' in graph_ops)
-        self.assertTrue('matmul' in graph_ops)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_scaffold_is_used_for_saver(self):
-    tmpdir = tempfile.mkdtemp()
-
-    def _model_fn_scaffold(features, labels, mode):
-      _, _ = features, labels
-      variables.VariableV1(1., name='weight')
-      self.mock_saver = get_mock_saver()
-      scores = constant_op.constant([3.])
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          predictions=constant_op.constant([[1.]]),
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          scaffold=training.Scaffold(saver=self.mock_saver),
-          export_outputs={'test': export_output.ClassificationOutput(scores)})
-
-    est = estimator.Estimator(model_fn=_model_fn_scaffold)
-    est.train(dummy_input_fn, steps=1)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    est.export_savedmodel(export_dir_base, serving_input_receiver_fn)
-
-    self.assertTrue(self.mock_saver.restore.called)
-    self.assertTrue(self.mock_saver.export_meta_graph.called)
-    self.assertTrue(self.mock_saver.save.called)
-
-  def test_scaffold_is_used_for_saver_multiple_modes(self):
-    tmpdir = tempfile.mkdtemp()
-    savers = {'predict_saver': None, 'train_saver': None}
-
-    def _model_fn_scaffold(features, labels, mode):
-      _, _ = features, labels
-      variables.VariableV1(1., name='weight')
-
-      scores = constant_op.constant([3.])
-      if mode == model_fn_lib.ModeKeys.PREDICT:
-        savers['predict_saver'] = get_mock_saver()
-        scaffold = training.Scaffold(saver=savers['predict_saver'])
-      elif mode == model_fn_lib.ModeKeys.TRAIN:
-        savers['train_saver'] = get_mock_saver()
-        scaffold = training.Scaffold(saver=savers['train_saver'])
-      else:
-        scaffold = training.Scaffold()
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          predictions=constant_op.constant([[1.]]),
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          scaffold=scaffold,
-          export_outputs={'test': export_output.ClassificationOutput(scores)})
-
-    est = estimator.Estimator(model_fn=_model_fn_scaffold)
-    est.train(dummy_input_fn, steps=1)
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    est._export_all_saved_models(export_dir_base, input_receiver_fn_map)
-
-    self.assertTrue(savers['train_saver'].restore.called)
-    self.assertEqual(savers['train_saver'].export_meta_graph.call_count, 1)
-    self.assertEqual(savers['train_saver'].save.call_count, 1)
-
-    self.assertTrue(savers['predict_saver'].restore.called)
-    self.assertEqual(savers['predict_saver'].export_meta_graph.call_count, 1)
-    self.assertEqual(savers['predict_saver'].save.call_count, 0)
-
-  def test_scaffold_is_used_for_local_init(self):
-    tmpdir = tempfile.mkdtemp()
-
-    def _model_fn_scaffold(features, labels, mode):
-      _, _ = features, labels
-      my_int = variables.VariableV1(1, name='my_int',
-                                    collections=[ops.GraphKeys.LOCAL_VARIABLES])
-      _ = training.get_or_create_steps_per_run_variable()
-      scores = constant_op.constant([3.])
-      with ops.control_dependencies([
-          variables.local_variables_initializer(),
-          lookup_ops.tables_initializer()
-      ]):
-        assign_op = state_ops.assign(my_int, 12345)
-
-      # local_initSop must be an Operation, not a Tensor.
-      custom_local_init_op = control_flow_ops.group(assign_op)
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          predictions=constant_op.constant([[1.]]),
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          scaffold=training.Scaffold(local_init_op=custom_local_init_op),
-          export_outputs={'test': export_output.ClassificationOutput(scores)})
-
-    est = estimator.Estimator(model_fn=_model_fn_scaffold)
-    est.train(dummy_input_fn, steps=1)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est.export_savedmodel(export_dir_base,
-                                       serving_input_receiver_fn)
-
-    # Restore, to validate that the custom local_init_op runs.
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        my_int = graph.get_tensor_by_name('my_int:0')
-        my_int_value = sess.run(my_int)
-        self.assertEqual(12345, my_int_value)
-
-  def test_scaffold_is_used_for_local_init_multiple_modes(self):
-    tmpdir = tempfile.mkdtemp()
-
-    def _model_fn_scaffold(features, labels, mode):
-      _, _ = features, labels
-      my_int = variables.VariableV1(1, name='my_int',
-                                    collections=[ops.GraphKeys.LOCAL_VARIABLES])
-      scores = constant_op.constant([3.])
-      with ops.control_dependencies([
-          variables.local_variables_initializer(),
-          lookup_ops.tables_initializer()
-      ]):
-        assign_op = state_ops.assign(my_int, 12345)
-
-      custom_local_init_op = None
-      if mode == model_fn_lib.ModeKeys.PREDICT:
-        # local_initSop must be an Operation, not a Tensor.
-        custom_local_init_op = control_flow_ops.group(assign_op)
-
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          predictions=constant_op.constant([[1.]]),
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          scaffold=training.Scaffold(local_init_op=custom_local_init_op),
-          export_outputs={'test': export_output.ClassificationOutput(scores)})
-
-    est = estimator.Estimator(model_fn=_model_fn_scaffold)
-    est.train(dummy_input_fn, steps=1)
-    input_receiver_fn_map = {
-        model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(),
-        model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn()
-    }
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir = est._export_all_saved_models(
-        export_dir_base, input_receiver_fn_map)
-
-    # Restore, to validate that the custom local_init_op runs.
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.SERVING], export_dir)
-        my_int = graph.get_tensor_by_name('my_int:0')
-        my_int_value = sess.run(my_int)
-        self.assertEqual(12345, my_int_value)
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        loader.load(sess, [tag_constants.TRAINING], export_dir)
-        my_int = graph.get_tensor_by_name('my_int:0')
-        my_int_value = sess.run(my_int)
-        self.assertEqual(1, my_int_value)
-
-  def test_features_labels_mode(self):
-    given_features = {'test-features': constant_op.constant([[1], [1]])}
-
-    def serving_input_receiver_fn():
-      return export.ServingInputReceiver(
-          given_features, array_ops.placeholder(dtype=dtypes.string))
-
-    def _model_fn(features, labels, mode):
-      self.features, self.labels, self.mode = features, labels, mode
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[0.]]),
-          export_outputs={
-              'test': export_output.ClassificationOutput(
-                  constant_op.constant([[0.]]))
-          })
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    est.export_savedmodel(tempfile.mkdtemp(), serving_input_receiver_fn)
-    self.assertEqual(given_features, self.features)
-    self.assertIsNone(self.labels)
-    self.assertEqual(model_fn_lib.ModeKeys.PREDICT, self.mode)
-
-  def test_graph_initialization_global_step_and_random_seed(self):
-    expected_random_seed = run_config.RunConfig().tf_random_seed
-    def _model_fn(features, labels, mode):
-      _, _, _ = features, labels, mode
-      self.assertIsNotNone(training.get_global_step())
-      self.assertEqual(expected_random_seed, ops.get_default_graph().seed)
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          loss=constant_op.constant(0.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1),
-          predictions=constant_op.constant([[0.]]),
-          export_outputs={
-              'test': export_output.ClassificationOutput(
-                  constant_op.constant([[0.]]))
-          })
-
-    def serving_input_receiver_fn():
-      return export.ServingInputReceiver(
-          {'test-features': constant_op.constant([[1], [1]])},
-          array_ops.placeholder(dtype=dtypes.string))
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(dummy_input_fn, steps=1)
-    est.export_savedmodel(tempfile.mkdtemp(), serving_input_receiver_fn)
-
-  def test_export_savedmodel_respects_soft_placement(self):
-    def model_fn_with_a_gpu_op_but_no_kernel(features, labels, mode):
-      _, _ = features, labels
-      table = saver_test_utils.CheckpointedOp(name='v2')
-
-      update_global_step = state_ops.assign_add(training.get_global_step(), 1)
-      with ops.control_dependencies([update_global_step]):
-        train_op = table.insert('k1', 30.0)
-
-      #  In this test, there are no GPUs available.  The goal is to verify that
-      #  export_savedmodel executes nevertheless.
-      with ops.device('/gpu:0'):
-        string_op = string_ops.as_string(update_global_step)
-
-      with ops.control_dependencies([string_op]):
-        prediction = table.lookup('k1', 0.0)
-
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          predictions=prediction,
-          loss=constant_op.constant(1.),
-          train_op=train_op,
-          export_outputs={
-              'test': export_output.PredictOutput({
-                  'prediction': prediction
-              })
-          })
-
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(
-        model_fn=model_fn_with_a_gpu_op_but_no_kernel)
-    est.train(input_fn=dummy_input_fn, steps=1)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-
-    export_dir = est.export_savedmodel(
-        export_dir_base, serving_input_receiver_fn)
-
-    # At this point, if export_savedmodel executed with
-    # allow_soft_placement=True, then the GPU-assigned operation was silently
-    # placed on the CPU.  Otherwise, an exception would have been raised
-    # related to the fact that the requested GPU device isn't available.
-
-    # Expectations below assume that export_savedmodel has completed normally.
-    self.assertTrue(gfile.Exists(export_dir_base))
-    self.assertTrue(gfile.Exists(export_dir))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('saved_model.pb'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables/variables.index'))))
-    self.assertTrue(gfile.Exists(os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes('variables/variables.data-00000-of-00001'))))
-
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_savedmodel_proto_strip_default_attrs(self):
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn_for_export_tests)
-    est.train(input_fn=dummy_input_fn, steps=1)
-    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
-                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('export'))
-    export_dir_stripped = est.export_savedmodel(
-        export_dir_base, serving_input_receiver_fn, strip_default_attrs=True)
-    export_dir_not_stripped = est.export_savedmodel(
-        export_dir_base, serving_input_receiver_fn, strip_default_attrs=False)
-
-    # Load the SavedModel from disk as-is to verify default attrs
-    # are stripped. Reimporting the SavedModel via the loader causes the
-    # default attrs to be populated in the NodeDefs.
-
-    # pylint: disable=protected-access
-    saved_model_stripped_pb = loader_impl._parse_saved_model(
-        export_dir_stripped)
-    saved_model_not_stripped_pb = loader_impl._parse_saved_model(
-        export_dir_not_stripped)
-    self.assertIsNotNone(saved_model_stripped_pb)
-    self.assertIsNotNone(saved_model_not_stripped_pb)
-    # pylint: enable=protected-access
-
-    meta_graph_def_stripped = [
-        x for x in saved_model_stripped_pb.meta_graphs
-        if x.meta_info_def.tags == [tag_constants.SERVING]][0]
-    meta_graph_def_not_stripped = [
-        x for x in saved_model_not_stripped_pb.meta_graphs
-        if x.meta_info_def.tags == [tag_constants.SERVING]][0]
-
-    # "weight" node in graph is a "Variable" Op with 2 default valued attrs.
-    #   o "container"    : "".
-    #   o "shared_name"  : "".
-
-    # saved_model_stripped_pb was exported with strip_default_attrs set to True.
-    # "weight" node shouldn't have attributes "container" and "shared_name".
-    node_def = test_util.get_node_def_from_graph(
-        'weight', meta_graph_def_stripped.graph_def)
-    self.assertNotIn('container', node_def.attr)
-    self.assertNotIn('shared_name', node_def.attr)
-
-    # saved_model_not_stripped_pb was exported with strip_default_attrs
-    # disabled. "weight" node should have attributes "container" and
-    # "shared_name".
-    node_def = test_util.get_node_def_from_graph(
-        'weight', meta_graph_def_not_stripped.graph_def)
-    self.assertIn('container', node_def.attr)
-    self.assertIn('shared_name', node_def.attr)
-
-    # Clean up.
-    gfile.DeleteRecursively(tmpdir)
-
-  def test_export_savedmodel_no_export_outputs(self):
-    """Ensure that an EstimatorSpec without outputs defined can be exported."""
-
-    def _model_fn(features, labels, mode):
-      _, _ = features, labels
-      variables.VariableV1(1., name='weight')
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          predictions=constant_op.constant(10.),
-          loss=constant_op.constant(1.),
-          train_op=state_ops.assign_add(training.get_global_step(), 1))
-
-    tmpdir = tempfile.mkdtemp()
-    est = estimator.Estimator(model_fn=_model_fn)
-    est.train(input_fn=dummy_input_fn, steps=1)
-
-    # Perform the export.
-    export_dir_base = os.path.join(
-        compat.as_bytes(tmpdir), compat.as_bytes('no_export_outputs'))
-    export_dir = est.export_savedmodel(
-        export_dir_base, _get_serving_input_receiver_fn())
-
-    # Check that all the files are in the right places.
-    self.assertTrue(gfile.Exists(export_dir_base))
-    self._validate_exported_files(export_dir)
-
-    # Restore, to validate that the export was well-formed.
-    with ops.Graph().as_default() as graph:
-      with session.Session(graph=graph) as sess:
-        meta_graph = loader.load(sess, [tag_constants.SERVING], export_dir)
-        graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('weight' in graph_ops)
-
-        sig_def = meta_graph.signature_def
-        self.assertEqual(len(sig_def), 1)
-        sig_outputs = sig_def[
-            signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs
-        self.assertEqual(sig_outputs['output'].name, 'Const:0')
-
-
-class EstimatorHookOrderingTest(test.TestCase):
-
-  def testCustomHooksAreCalledBeforeNanTensorHook(self):
-
-    def nan_making_model_fn(mode, features, labels):
-      """A graph that generates NaN's for testing."""
-      del features, labels
-
-      global_step = variables.VariableV1(
-          0, dtype=dtypes.int64, name='global_step')
-      inc_global_step = state_ops.assign_add(global_step, 1)
-      nan_const = constant_op.constant(np.nan, dtype=dtypes.float32)
-      loss = control_flow_ops.cond(
-          inc_global_step > 1, lambda: nan_const, lambda: 1.0)
-
-      return model_fn_lib.EstimatorSpec(
-          mode=mode,
-          predictions=global_step.read_value(),
-          loss=loss,
-          train_op=inc_global_step)
-
-    def empty_input_fn():
-      return dict(), None
-
-    class AfterRunCountingHook(session_run_hook.SessionRunHook):
-      """Hooks that counts the number of times after_run() is called."""
-
-      def __init__(self):
-        self.after_run_count = 0
-
-      def after_run(self, run_context, run_values):
-        del run_context, run_values
-        self.after_run_count += 1
-
-    test_hook = AfterRunCountingHook()
-    est = estimator.Estimator(model_fn=nan_making_model_fn)
-    with self.assertRaises(basic_session_run_hooks.NanLossDuringTrainingError):
-      est.train(input_fn=empty_input_fn, steps=2, hooks=[test_hook])
-    self.assertEqual(2, test_hook.after_run_count)
-
-
-class EstimatorIntegrationTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_complete_flow_with_a_simple_linear_model(self):
-
-    def _model_fn(features, labels, mode):
-      predictions = layers.dense(
-          features['x'], 1, kernel_initializer=init_ops.zeros_initializer())
-      export_outputs = {
-          'predictions': export_output.RegressionOutput(predictions)
-      }
-
-      if mode == model_fn_lib.ModeKeys.PREDICT:
-        return model_fn_lib.EstimatorSpec(
-            mode, predictions=predictions, export_outputs=export_outputs)
-
-      loss = losses.mean_squared_error(labels, predictions)
-      train_op = training.GradientDescentOptimizer(learning_rate=0.5).minimize(
-          loss, training.get_global_step())
-      mean = metrics_module.Mean()
-      mean.update_state(loss)
-      eval_metric_ops = {
-          'absolute_error':
-              metrics_lib.mean_absolute_error(labels, predictions),
-          'mean':
-              mean,
-      }
-
-      return model_fn_lib.EstimatorSpec(
-          mode,
-          predictions=predictions,
-          loss=loss,
-          train_op=train_op,
-          eval_metric_ops=eval_metric_ops,
-          export_outputs=export_outputs)
-
-    est = estimator.Estimator(model_fn=_model_fn)
-    data = np.linspace(0., 1., 100, dtype=np.float32).reshape(-1, 1)
-
-    # TRAIN
-    # learn y = x
-    train_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data}, y=data, batch_size=50, num_epochs=None, shuffle=True)
-    est.train(train_input_fn, steps=200)
-
-    # EVALUATE
-    eval_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data}, y=data, batch_size=50, num_epochs=1, shuffle=True)
-    scores = est.evaluate(eval_input_fn)
-    self.assertEqual(200, scores['global_step'])
-    self.assertGreater(0.1, scores['absolute_error'])
-    self.assertAlmostEqual(4.4e-14, scores['mean'], places=2)
-
-    # PREDICT
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data}, y=None, batch_size=10, num_epochs=1, shuffle=False)
-    predictions = list(est.predict(predict_input_fn))
-    self.assertAllClose(data, predictions, atol=0.01)
-
-    # EXPORT
-    feature_spec = {'x': parsing_ops.FixedLenFeature([1], dtypes.float32)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
-                                       serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/export/__init__.py b/tensorflow/python/estimator/export/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..898efd46efb3635aff64097b62be93f1fd12be07 100644
--- a/tensorflow/python/estimator/export/__init__.py
+++ b/tensorflow/python/estimator/export/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""export python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_estimator.python.estimator import export
+
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+export.__all__ = [s for s in dir(export) if not s.startswith('__')]
+
+from tensorflow_estimator.python.estimator.export import *
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 55aace5fa99822b48f65775fea308db006f60f63..835f0c129fc5ab8d928bf58866c8cc2e4b0ac343 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,625 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Configuration and utilities for receiving inputs at serving time."""
+"""export python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import os
-
-import six
-
-from tensorflow.python.estimator import util
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import signature_def_utils
-from tensorflow.python.util import compat
-from tensorflow.python.util.tf_export import estimator_export
-
-_SINGLE_FEATURE_DEFAULT_NAME = 'feature'
-_SINGLE_RECEIVER_DEFAULT_NAME = 'input'
-_SINGLE_LABEL_DEFAULT_NAME = 'label'
-
-_SINGLE_TENSOR_DEFAULT_NAMES = {
-    'feature': _SINGLE_FEATURE_DEFAULT_NAME,
-    'label': _SINGLE_LABEL_DEFAULT_NAME,
-    'receiver_tensor': _SINGLE_RECEIVER_DEFAULT_NAME,
-    'receiver_tensors_alternative': _SINGLE_RECEIVER_DEFAULT_NAME
-}
-
-
-def _wrap_and_check_input_tensors(tensors, field_name):
-  """Ensure that tensors is a dict of str to Tensor mappings.
-
-  Args:
-    tensors: dict of str to Tensors, or a single Tensor.
-    field_name: name of the member field of `ServingInputReceiver`
-      whose value is being passed to `tensors`.
-
-  Returns:
-    dict of str to Tensors; this is the original dict if one was passed, or
-    the original tensor wrapped in a dictionary.
-
-  Raises:
-    ValueError: if tensors is None, or has non-string keys,
-      or non-Tensor values
-  """
-  if tensors is None:
-    raise ValueError('{}s must be defined.'.format(field_name))
-  if not isinstance(tensors, dict):
-    tensors = {_SINGLE_TENSOR_DEFAULT_NAMES[field_name]: tensors}
-  for name, tensor in tensors.items():
-    _check_tensor_key(name, error_label=field_name)
-    _check_tensor(tensor, name, error_label=field_name)
-  return tensors
-
-
-def _check_tensor(tensor, name, error_label='feature'):
-  """Check that passed `tensor` is a Tensor or SparseTensor."""
-  if not (isinstance(tensor, ops.Tensor) or
-          isinstance(tensor, sparse_tensor.SparseTensor)):
-    fmt_name = ' {}'.format(name) if name else ''
-    value_error = ValueError('{}{} must be a Tensor or SparseTensor.'.format(
-        error_label, fmt_name))
-    # NOTE(ericmc): This if-else block is a specific carve-out for
-    # LabeledTensor, which has a `.tensor` attribute and which is
-    # convertible to tf.Tensor via ops.convert_to_tensor.
-    # Allowing all types convertible to tf.Tensor is considered by soergel@
-    # to be too permissive.
-    # TODO(soergel): accept any type convertible to Tensor,
-    # as in cl/193238295 snapshot #6.
-    if hasattr(tensor, 'tensor'):
-      try:
-        ops.convert_to_tensor(tensor)
-      except TypeError:
-        raise value_error
-    else:
-      raise value_error
-
-
-def _check_tensor_key(name, error_label='feature'):
-  if not isinstance(name, six.string_types):
-    raise ValueError('{} keys must be strings: {}.'.format(error_label, name))
-
-
-@estimator_export('estimator.export.ServingInputReceiver')
-class ServingInputReceiver(
-    collections.namedtuple(
-        'ServingInputReceiver',
-        ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
-  """A return type for a serving_input_receiver_fn.
-
-  The expected return values are:
-    features: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
-      `SparseTensor`, specifying the features to be passed to the model. Note:
-      if `features` passed is not a dict, it will be wrapped in a dict with a
-      single entry, using 'feature' as the key.  Consequently, the model must
-      accept a feature dict of the form {'feature': tensor}.  You may use
-      `TensorServingInputReceiver` if you want the tensor to be passed as is.
-    receiver_tensors: A `Tensor`, `SparseTensor`, or dict of string to `Tensor`
-      or `SparseTensor`, specifying input nodes where this receiver expects to
-      be fed by default.  Typically, this is a single placeholder expecting
-      serialized `tf.Example` protos.
-    receiver_tensors_alternatives: a dict of string to additional
-      groups of receiver tensors, each of which may be a `Tensor`,
-      `SparseTensor`, or dict of string to `Tensor` or`SparseTensor`.
-      These named receiver tensor alternatives generate additional serving
-      signatures, which may be used to feed inputs at different points within
-      the input receiver subgraph.  A typical usage is to allow feeding raw
-      feature `Tensor`s *downstream* of the tf.parse_example() op.
-      Defaults to None.
-  """
-
-  def __new__(cls,
-              features,
-              receiver_tensors,
-              receiver_tensors_alternatives=None):
-    features = _wrap_and_check_input_tensors(features, 'feature')
-
-    receiver_tensors = _wrap_and_check_input_tensors(receiver_tensors,
-                                                     'receiver_tensor')
-
-    if receiver_tensors_alternatives is not None:
-      if not isinstance(receiver_tensors_alternatives, dict):
-        raise ValueError(
-            'receiver_tensors_alternatives must be a dict: {}.'.format(
-                receiver_tensors_alternatives))
-      for alternative_name, receiver_tensors_alt in (
-          six.iteritems(receiver_tensors_alternatives)):
-        # Updating dict during iteration is OK in this case.
-        receiver_tensors_alternatives[alternative_name] = (
-            _wrap_and_check_input_tensors(
-                receiver_tensors_alt, 'receiver_tensors_alternative'))
-
-    return super(ServingInputReceiver, cls).__new__(
-        cls,
-        features=features,
-        receiver_tensors=receiver_tensors,
-        receiver_tensors_alternatives=receiver_tensors_alternatives)
-
-
-@estimator_export('estimator.export.TensorServingInputReceiver')
-class TensorServingInputReceiver(
-    collections.namedtuple(
-        'TensorServingInputReceiver',
-        ['features', 'receiver_tensors', 'receiver_tensors_alternatives'])):
-  """A return type for a serving_input_receiver_fn.
-
-  This is for use with models that expect a single `Tensor` or `SparseTensor`
-  as an input feature, as opposed to a dict of features.
-
-  The normal `ServingInputReceiver` always returns a feature dict, even if it
-  contains only one entry, and so can be used only with models that accept such
-  a dict.  For models that accept only a single raw feature, the
-  `serving_input_receiver_fn` provided to `Estimator.export_savedmodel()` should
-  return this `TensorServingInputReceiver` instead.  See:
-  https://github.com/tensorflow/tensorflow/issues/11674
-
-  Note that the receiver_tensors and receiver_tensor_alternatives arguments
-  will be automatically converted to the dict representation in either case,
-  because the SavedModel format requires each input `Tensor` to have a name
-  (provided by the dict key).
-
-  The expected return values are:
-    features: A single `Tensor` or `SparseTensor`, representing the feature
-      to be passed to the model.
-    receiver_tensors: A `Tensor`, `SparseTensor`, or dict of string to `Tensor`
-      or `SparseTensor`, specifying input nodes where this receiver expects to
-      be fed by default.  Typically, this is a single placeholder expecting
-      serialized `tf.Example` protos.
-    receiver_tensors_alternatives: a dict of string to additional
-      groups of receiver tensors, each of which may be a `Tensor`,
-      `SparseTensor`, or dict of string to `Tensor` or`SparseTensor`.
-      These named receiver tensor alternatives generate additional serving
-      signatures, which may be used to feed inputs at different points within
-      the input receiver subgraph.  A typical usage is to allow feeding raw
-      feature `Tensor`s *downstream* of the tf.parse_example() op.
-      Defaults to None.
-  """
-
-  def __new__(cls,
-              features,
-              receiver_tensors,
-              receiver_tensors_alternatives=None):
-    if features is None:
-      raise ValueError('features must be defined.')
-    _check_tensor(features, None)
-
-    receiver = ServingInputReceiver(
-        features=features,
-        receiver_tensors=receiver_tensors,
-        receiver_tensors_alternatives=receiver_tensors_alternatives)
-
-    return super(TensorServingInputReceiver, cls).__new__(
-        cls,
-        features=receiver.features[_SINGLE_FEATURE_DEFAULT_NAME],
-        receiver_tensors=receiver.receiver_tensors,
-        receiver_tensors_alternatives=receiver.receiver_tensors_alternatives)
-
-
-class UnsupervisedInputReceiver(ServingInputReceiver):
-  """A return type for a training_input_receiver_fn or eval_input_receiver_fn.
-
-  This differs from SupervisedInputReceiver in that it does not require a set
-  of labels.
-
-  The expected return values are:
-    features: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
-      `SparseTensor`, specifying the features to be passed to the model.
-    receiver_tensors: A `Tensor`, `SparseTensor`, or dict of string to `Tensor`
-      or `SparseTensor`, specifying input nodes where this receiver expects to
-      be fed by default.  Typically, this is a single placeholder expecting
-      serialized `tf.Example` protos.
-  """
-
-  def __new__(cls, features, receiver_tensors):
-    return super(UnsupervisedInputReceiver, cls).__new__(
-        cls,
-        features=features,
-        receiver_tensors=receiver_tensors,
-        receiver_tensors_alternatives=None)
-
-
-class SupervisedInputReceiver(
-    collections.namedtuple('SupervisedInputReceiver',
-                           ['features', 'labels', 'receiver_tensors'])):
-  """A return type for a training_input_receiver_fn or eval_input_receiver_fn.
-
-  This differs from a ServingInputReceiver in that (1) this receiver expects
-  a set of labels to be passed in with features, and (2) this receiver does
-  not support receiver_tensors_alternatives, which are primarily used for
-  serving.
-
-  The expected return values are:
-    features: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
-      `SparseTensor`, specifying the features to be passed to the model.
-    labels: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
-      `SparseTensor`, specifying the labels to be passed to the model.
-    receiver_tensors: A `Tensor`, `SparseTensor`, or dict of string to `Tensor`
-      or `SparseTensor`, specifying input nodes where this receiver expects to
-      be fed by default.  Typically, this is a single placeholder expecting
-      serialized `tf.Example` protos.
-
-  """
-
-  def __new__(cls, features, labels, receiver_tensors):
-    # Both features and labels can be dicts or raw tensors.
-    for input_vals, error_label in ((features, 'feature'), (labels, 'label')):
-      # _wrap_and_check_input_tensors is called here only to validate the
-      # tensors. The wrapped dict that is returned is deliberately discarded.
-      _wrap_and_check_input_tensors(input_vals, error_label)
-
-    receiver_tensors = _wrap_and_check_input_tensors(receiver_tensors,
-                                                     'receiver_tensor')
-
-    return super(SupervisedInputReceiver, cls).__new__(
-        cls,
-        features=features,
-        labels=labels,
-        receiver_tensors=receiver_tensors)
-
-
-@estimator_export('estimator.export.build_parsing_serving_input_receiver_fn')
-def build_parsing_serving_input_receiver_fn(feature_spec,
-                                            default_batch_size=None):
-  """Build a serving_input_receiver_fn expecting fed tf.Examples.
-
-  Creates a serving_input_receiver_fn that expects a serialized tf.Example fed
-  into a string placeholder.  The function parses the tf.Example according to
-  the provided feature_spec, and returns all parsed Tensors as features.
-
-  Args:
-    feature_spec: a dict of string to `VarLenFeature`/`FixedLenFeature`.
-    default_batch_size: the number of query examples expected per batch.
-        Leave unset for variable batch size (recommended).
-
-  Returns:
-    A serving_input_receiver_fn suitable for use in serving.
-  """
-
-  def serving_input_receiver_fn():
-    """An input_fn that expects a serialized tf.Example."""
-    serialized_tf_example = array_ops.placeholder(
-        dtype=dtypes.string,
-        shape=[default_batch_size],
-        name='input_example_tensor')
-    receiver_tensors = {'examples': serialized_tf_example}
-    features = parsing_ops.parse_example(serialized_tf_example, feature_spec)
-    return ServingInputReceiver(features, receiver_tensors)
-
-  return serving_input_receiver_fn
-
-
-def _placeholder_from_tensor(t, default_batch_size=None):
-  """Creates a placeholder that matches the dtype and shape of passed tensor.
-
-  Args:
-    t: Tensor or EagerTensor
-    default_batch_size: the number of query examples expected per batch.
-        Leave unset for variable batch size (recommended).
-
-  Returns:
-    Placeholder that matches the passed tensor.
-  """
-  batch_shape = tensor_shape.TensorShape([default_batch_size])
-  shape = batch_shape.concatenate(t.get_shape()[1:])
-
-  # Reuse the feature tensor's op name (t.op.name) for the placeholder,
-  # excluding the index from the tensor's name (t.name):
-  # t.name = "%s:%d" % (t.op.name, t._value_index)
-  try:
-    name = t.op.name
-  except AttributeError:
-    # In Eager mode, tensors don't have ops or names, and while they do have
-    # IDs, those are not maintained across runs. The name here is used
-    # primarily for debugging, and is not critical to the placeholder.
-    # So, in order to make this Eager-compatible, continue with an empty
-    # name if none is available.
-    name = None
-
-  return array_ops.placeholder(dtype=t.dtype, shape=shape, name=name)
-
-
-def _placeholders_from_receiver_tensors_dict(input_vals,
-                                             default_batch_size=None):
-  return {
-      name: _placeholder_from_tensor(t, default_batch_size)
-      for name, t in input_vals.items()
-  }
-
-
-@estimator_export('estimator.export.build_raw_serving_input_receiver_fn')
-def build_raw_serving_input_receiver_fn(features, default_batch_size=None):
-  """Build a serving_input_receiver_fn expecting feature Tensors.
-
-  Creates an serving_input_receiver_fn that expects all features to be fed
-  directly.
-
-  Args:
-    features: a dict of string to `Tensor`.
-    default_batch_size: the number of query examples expected per batch.
-        Leave unset for variable batch size (recommended).
-
-  Returns:
-    A serving_input_receiver_fn.
-  """
-
-  def serving_input_receiver_fn():
-    """A serving_input_receiver_fn that expects features to be fed directly."""
-    receiver_tensors = _placeholders_from_receiver_tensors_dict(
-        features, default_batch_size)
-    return ServingInputReceiver(receiver_tensors, receiver_tensors)
-
-  return serving_input_receiver_fn
-
-
-def build_raw_supervised_input_receiver_fn(features,
-                                           labels,
-                                           default_batch_size=None):
-  """Build a supervised_input_receiver_fn for raw features and labels.
-
-  This function wraps tensor placeholders in a supervised_receiver_fn
-  with the expectation that the features and labels appear precisely as
-  the model_fn expects them. Features and labels can therefore be dicts of
-  tensors, or raw tensors.
-
-  Args:
-    features: a dict of string to `Tensor` or `Tensor`.
-    labels: a dict of string to `Tensor` or `Tensor`.
-    default_batch_size: the number of query examples expected per batch.
-        Leave unset for variable batch size (recommended).
-
-  Returns:
-    A supervised_input_receiver_fn.
-
-  Raises:
-    ValueError: if features and labels have overlapping keys.
-  """
-  # Check for overlapping keys before beginning.
-  try:
-    feat_keys = features.keys()
-  except AttributeError:
-    feat_keys = [_SINGLE_RECEIVER_DEFAULT_NAME]
-  try:
-    label_keys = labels.keys()
-  except AttributeError:
-    label_keys = [_SINGLE_LABEL_DEFAULT_NAME]
-
-  overlap_keys = set(feat_keys) & set(label_keys)
-  if overlap_keys:
-    raise ValueError('Features and labels must have distinct keys. '
-                     'Found overlapping keys: {}'.format(overlap_keys))
-
-  def supervised_input_receiver_fn():
-    """A receiver_fn that expects pass-through features and labels."""
-    if not isinstance(features, dict):
-      features_cp = _placeholder_from_tensor(features, default_batch_size)
-      receiver_features = {_SINGLE_RECEIVER_DEFAULT_NAME: features_cp}
-    else:
-      receiver_features = _placeholders_from_receiver_tensors_dict(
-          features, default_batch_size)
-      features_cp = receiver_features
-
-    if not isinstance(labels, dict):
-      labels_cp = _placeholder_from_tensor(labels, default_batch_size)
-      receiver_labels = {_SINGLE_LABEL_DEFAULT_NAME: labels_cp}
-    else:
-      receiver_labels = _placeholders_from_receiver_tensors_dict(
-          labels, default_batch_size)
-      labels_cp = receiver_labels
-
-    receiver_tensors = dict(receiver_features)
-    receiver_tensors.update(receiver_labels)
-    return SupervisedInputReceiver(features_cp, labels_cp, receiver_tensors)
-
-  return supervised_input_receiver_fn
-
-
-def build_supervised_input_receiver_fn_from_input_fn(input_fn, **input_fn_args):
-  """Get a function that returns a SupervisedInputReceiver matching an input_fn.
-
-  Note that this function calls the input_fn in a local graph in order to
-  extract features and labels. Placeholders are then created from those
-  features and labels in the default graph.
-
-  Args:
-    input_fn: An Estimator input_fn, which is a function that returns one of:
-
-      * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
-          tuple (features, labels) with same constraints as below.
-      * A tuple (features, labels): Where `features` is a `Tensor` or a
-        dictionary of string feature name to `Tensor` and `labels` is a
-        `Tensor` or a dictionary of string label name to `Tensor`. Both
-        `features` and `labels` are consumed by `model_fn`. They should
-        satisfy the expectation of `model_fn` from inputs.
-
-    **input_fn_args: set of kwargs to be passed to the input_fn. Note that
-      these will not be checked or validated here, and any errors raised by
-      the input_fn will be thrown to the top.
-
-  Returns:
-    A function taking no arguments that, when called, returns a
-    SupervisedInputReceiver. This function can be passed in as part of the
-    input_receiver_map when exporting SavedModels from Estimator with multiple
-    modes.
-  """
-  # Wrap the input_fn call in a graph to prevent sullying the default namespace
-  with ops.Graph().as_default():
-    result = input_fn(**input_fn_args)
-    features, labels, _ = util.parse_input_fn_result(result)
-  # Placeholders are created back in the default graph.
-  return build_raw_supervised_input_receiver_fn(features, labels)
-
-
-### Below utilities are specific to SavedModel exports.
-
-
-def build_all_signature_defs(receiver_tensors,
-                             export_outputs,
-                             receiver_tensors_alternatives=None,
-                             serving_only=True):
-  """Build `SignatureDef`s for all export outputs.
-
-  Args:
-    receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
-      input nodes where this receiver expects to be fed by default.  Typically,
-      this is a single placeholder expecting serialized `tf.Example` protos.
-    export_outputs: a dict of ExportOutput instances, each of which has
-      an as_signature_def instance method that will be called to retrieve
-      the signature_def for all export output tensors.
-    receiver_tensors_alternatives: a dict of string to additional
-      groups of receiver tensors, each of which may be a `Tensor` or a dict of
-      string to `Tensor`.  These named receiver tensor alternatives generate
-      additional serving signatures, which may be used to feed inputs at
-      different points within the input receiver subgraph.  A typical usage is
-      to allow feeding raw feature `Tensor`s *downstream* of the
-      tf.parse_example() op.  Defaults to None.
-    serving_only: boolean; if true, resulting signature defs will only include
-      valid serving signatures. If false, all requested signatures will be
-      returned.
-
-  Returns:
-    signature_def representing all passed args.
-
-  Raises:
-    ValueError: if export_outputs is not a dict
-  """
-  if not isinstance(receiver_tensors, dict):
-    receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
-  if export_outputs is None or not isinstance(export_outputs, dict):
-    raise ValueError('export_outputs must be a dict and not'
-                     '{}'.format(type(export_outputs)))
-
-  signature_def_map = {}
-  excluded_signatures = {}
-  for output_key, export_output in export_outputs.items():
-    signature_name = '{}'.format(output_key or 'None')
-    try:
-      signature = export_output.as_signature_def(receiver_tensors)
-      signature_def_map[signature_name] = signature
-    except ValueError as e:
-      excluded_signatures[signature_name] = str(e)
-
-  if receiver_tensors_alternatives:
-    for receiver_name, receiver_tensors_alt in (
-        six.iteritems(receiver_tensors_alternatives)):
-      if not isinstance(receiver_tensors_alt, dict):
-        receiver_tensors_alt = {
-            _SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt
-        }
-      for output_key, export_output in export_outputs.items():
-        signature_name = '{}:{}'.format(receiver_name or 'None', output_key or
-                                        'None')
-        try:
-          signature = export_output.as_signature_def(receiver_tensors_alt)
-          signature_def_map[signature_name] = signature
-        except ValueError as e:
-          excluded_signatures[signature_name] = str(e)
-
-  _log_signature_report(signature_def_map, excluded_signatures)
-
-  # The above calls to export_output.as_signature_def should return only
-  # valid signatures; if there is a validity problem, they raise a ValueError,
-  # in which case we exclude that signature from signature_def_map above.
-  # The is_valid_signature check ensures that the signatures produced are
-  # valid for serving, and acts as an additional sanity check for export
-  # signatures produced for serving. We skip this check for training and eval
-  # signatures, which are not intended for serving.
-  if serving_only:
-    signature_def_map = {
-        k: v
-        for k, v in signature_def_map.items()
-        if signature_def_utils.is_valid_signature(v)
-    }
-  return signature_def_map
-
-
-_FRIENDLY_METHOD_NAMES = {
-    signature_constants.CLASSIFY_METHOD_NAME: 'Classify',
-    signature_constants.REGRESS_METHOD_NAME: 'Regress',
-    signature_constants.PREDICT_METHOD_NAME: 'Predict',
-    signature_constants.SUPERVISED_TRAIN_METHOD_NAME: 'Train',
-    signature_constants.SUPERVISED_EVAL_METHOD_NAME: 'Eval',
-}
-
-
-def _log_signature_report(signature_def_map, excluded_signatures):
-  """Log a report of which signatures were produced."""
-  sig_names_by_method_name = collections.defaultdict(list)
-
-  # We'll collect whatever method_names are present, but also we want to make
-  # sure to output a line for each of the three standard methods even if they
-  # have no signatures.
-  for method_name in _FRIENDLY_METHOD_NAMES:
-    sig_names_by_method_name[method_name] = []
-
-  for signature_name, sig in signature_def_map.items():
-    sig_names_by_method_name[sig.method_name].append(signature_name)
-
-  # TODO(b/67733540): consider printing the full signatures, not just names
-  for method_name, sig_names in sig_names_by_method_name.items():
-    if method_name in _FRIENDLY_METHOD_NAMES:
-      method_name = _FRIENDLY_METHOD_NAMES[method_name]
-    logging.info('Signatures INCLUDED in export for {}: {}'.format(
-        method_name, sig_names if sig_names else 'None'))
-
-  if excluded_signatures:
-    logging.info('Signatures EXCLUDED from export because they cannot be '
-                 'be served via TensorFlow Serving APIs:')
-    for signature_name, message in excluded_signatures.items():
-      logging.info('\'{}\' : {}'.format(signature_name, message))
-
-  if not signature_def_map:
-    logging.warn('Export includes no signatures!')
-  elif (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY not in
-        signature_def_map):
-    logging.warn('Export includes no default signature!')
-
-
-def get_timestamped_export_dir(export_dir_base):
-  """Builds a path to a new subdirectory within the base directory.
-
-  Each export is written into a new subdirectory named using the
-  current time.  This guarantees monotonically increasing version
-  numbers even across multiple runs of the pipeline.
-  The timestamp used is the number of seconds since epoch UTC.
-
-  Args:
-    export_dir_base: A string containing a directory to write the exported
-        graph and checkpoints.
-  Returns:
-    The full path of the new subdirectory (which is not actually created yet).
-
-  Raises:
-    RuntimeError: if repeated attempts fail to obtain a unique timestamped
-      directory name.
-  """
-  return util.get_timestamped_dir(export_dir_base)
-
-
-def get_temp_export_dir(timestamped_export_dir):
-  """Builds a directory name based on the argument but starting with 'temp-'.
-
-  This relies on the fact that TensorFlow Serving ignores subdirectories of
-  the base directory that can't be parsed as integers.
+from tensorflow_estimator.python.estimator.export import export
 
-  Args:
-    timestamped_export_dir: the name of the eventual export directory, e.g.
-      /foo/bar/<timestamp>
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+export.__all__ = [s for s in dir(export) if not s.startswith('__')]
 
-  Returns:
-    A sister directory prefixed with 'temp-', e.g. /foo/bar/temp-<timestamp>.
-  """
-  (dirname, basename) = os.path.split(timestamped_export_dir)
-  temp_export_dir = os.path.join(
-      compat.as_bytes(dirname), compat.as_bytes('temp-{}'.format(basename)))
-  return temp_export_dir
+from tensorflow_estimator.python.estimator.export.export import *
diff --git a/tensorflow/python/estimator/export/export_lib.py b/tensorflow/python/estimator/export/export_lib.py
index f4ac8581ea555bfcdf4b714326cb23a16b1f83e5..2e1d7c2474c8920b78522ed8e935c9090056fce2 100644
--- a/tensorflow/python/estimator/export/export_lib.py
+++ b/tensorflow/python/estimator/export/export_lib.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,21 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utility methods for exporting Estimator."""
+"""export_lib python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,line-too-long
-from tensorflow.python.estimator.export.export import build_parsing_serving_input_receiver_fn
-from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn
-from tensorflow.python.estimator.export.export import ServingInputReceiver
-from tensorflow.python.estimator.export.export import TensorServingInputReceiver
-from tensorflow.python.estimator.export.export_output import ClassificationOutput
-from tensorflow.python.estimator.export.export_output import ExportOutput
-from tensorflow.python.estimator.export.export_output import PredictOutput
-from tensorflow.python.estimator.export.export_output import RegressionOutput
+from tensorflow_estimator.python.estimator.export import export_lib
 
-# pylint: enable=unused-import,line-too-long
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+export_lib.__all__ = [s for s in dir(export_lib) if not s.startswith('__')]
 
+from tensorflow_estimator.python.estimator.export.export_lib import *
diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py
index c17fc08f21032efb9a0f190112f86251f06b262a..eed85395c771abf2dddc270bea27153bd3360db0 100644
--- a/tensorflow/python/estimator/export/export_output.py
+++ b/tensorflow/python/estimator/export/export_output.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,402 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Classes for different types of export output."""
+"""export_output python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
-
-import six
-
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.keras import metrics as metrics_module
-from tensorflow.python.saved_model import signature_def_utils
-from tensorflow.python.util.tf_export import estimator_export
-
-
-@estimator_export('estimator.export.ExportOutput')
-class ExportOutput(object):
-  """Represents an output of a model that can be served.
-
-  These typically correspond to model heads.
-  """
-
-  __metaclass__ = abc.ABCMeta
-
-  _SEPARATOR_CHAR = '/'
-
-  @abc.abstractmethod
-  def as_signature_def(self, receiver_tensors):
-    """Generate a SignatureDef proto for inclusion in a MetaGraphDef.
-
-    The SignatureDef will specify outputs as described in this ExportOutput,
-    and will use the provided receiver_tensors as inputs.
-
-    Args:
-      receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
-        input nodes that will be fed.
-    """
-    pass
-
-  def _check_output_key(self, key, error_label):
-    # For multi-head models, the key can be a tuple.
-    if isinstance(key, tuple):
-      key = self._SEPARATOR_CHAR.join(key)
-
-    if not isinstance(key, six.string_types):
-      raise ValueError(
-          '{} output key must be a string; got {}.'.format(error_label, key))
-    return key
-
-  def _wrap_and_check_outputs(
-      self, outputs, single_output_default_name, error_label=None):
-    """Wraps raw tensors as dicts and checks type.
-
-    Note that we create a new dict here so that we can overwrite the keys
-    if necessary.
-
-    Args:
-      outputs: A `Tensor` or a dict of string to `Tensor`.
-      single_output_default_name: A string key for use in the output dict
-        if the provided `outputs` is a raw tensor.
-      error_label: descriptive string for use in error messages. If none,
-        single_output_default_name will be used.
-
-    Returns:
-      A dict of tensors
-
-    Raises:
-      ValueError: if the outputs dict keys are not strings or tuples of strings
-        or the values are not Tensors.
-    """
-    if not isinstance(outputs, dict):
-      outputs = {single_output_default_name: outputs}
-
-    output_dict = {}
-    for key, value in outputs.items():
-      error_name = error_label or single_output_default_name
-      key = self._check_output_key(key, error_name)
-      if not isinstance(value, ops.Tensor):
-        raise ValueError(
-            '{} output value must be a Tensor; got {}.'.format(
-                error_name, value))
-
-      output_dict[key] = value
-    return output_dict
-
-
-@estimator_export('estimator.export.ClassificationOutput')
-class ClassificationOutput(ExportOutput):
-  """Represents the output of a classification head.
-
-  Either classes or scores or both must be set.
-
-  The classes `Tensor` must provide string labels, not integer class IDs.
-
-  If only classes is set, it is interpreted as providing top-k results in
-  descending order.
-
-  If only scores is set, it is interpreted as providing a score for every class
-  in order of class ID.
-
-  If both classes and scores are set, they are interpreted as zipped, so each
-  score corresponds to the class at the same index.  Clients should not depend
-  on the order of the entries.
-  """
-
-  def __init__(self, scores=None, classes=None):
-    """Constructor for `ClassificationOutput`.
-
-    Args:
-      scores: A float `Tensor` giving scores (sometimes but not always
-          interpretable as probabilities) for each class.  May be `None`, but
-          only if `classes` is set.  Interpretation varies-- see class doc.
-      classes: A string `Tensor` giving predicted class labels.  May be `None`,
-          but only if `scores` is set.  Interpretation varies-- see class doc.
-
-    Raises:
-      ValueError: if neither classes nor scores is set, or one of them is not a
-          `Tensor` with the correct dtype.
-    """
-    if (scores is not None
-        and not (isinstance(scores, ops.Tensor)
-                 and scores.dtype.is_floating)):
-      raise ValueError('Classification scores must be a float32 Tensor; '
-                       'got {}'.format(scores))
-    if (classes is not None
-        and not (isinstance(classes, ops.Tensor)
-                 and dtypes.as_dtype(classes.dtype) == dtypes.string)):
-      raise ValueError('Classification classes must be a string Tensor; '
-                       'got {}'.format(classes))
-    if scores is None and classes is None:
-      raise ValueError('At least one of scores and classes must be set.')
-
-    self._scores = scores
-    self._classes = classes
-
-  @property
-  def scores(self):
-    return self._scores
-
-  @property
-  def classes(self):
-    return self._classes
-
-  def as_signature_def(self, receiver_tensors):
-    if len(receiver_tensors) != 1:
-      raise ValueError('Classification input must be a single string Tensor; '
-                       'got {}'.format(receiver_tensors))
-    (_, examples), = receiver_tensors.items()
-    if dtypes.as_dtype(examples.dtype) != dtypes.string:
-      raise ValueError('Classification input must be a single string Tensor; '
-                       'got {}'.format(receiver_tensors))
-    return signature_def_utils.classification_signature_def(
-        examples, self.classes, self.scores)
-
-
-@estimator_export('estimator.export.RegressionOutput')
-class RegressionOutput(ExportOutput):
-  """Represents the output of a regression head."""
-
-  def __init__(self, value):
-    """Constructor for `RegressionOutput`.
-
-    Args:
-      value: a float `Tensor` giving the predicted values.  Required.
-
-    Raises:
-      ValueError: if the value is not a `Tensor` with dtype tf.float32.
-    """
-    if not (isinstance(value, ops.Tensor) and value.dtype.is_floating):
-      raise ValueError('Regression output value must be a float32 Tensor; '
-                       'got {}'.format(value))
-    self._value = value
-
-  @property
-  def value(self):
-    return self._value
-
-  def as_signature_def(self, receiver_tensors):
-    if len(receiver_tensors) != 1:
-      raise ValueError('Regression input must be a single string Tensor; '
-                       'got {}'.format(receiver_tensors))
-    (_, examples), = receiver_tensors.items()
-    if dtypes.as_dtype(examples.dtype) != dtypes.string:
-      raise ValueError('Regression input must be a single string Tensor; '
-                       'got {}'.format(receiver_tensors))
-    return signature_def_utils.regression_signature_def(examples, self.value)
-
-
-@estimator_export('estimator.export.PredictOutput')
-class PredictOutput(ExportOutput):
-  """Represents the output of a generic prediction head.
-
-  A generic prediction need not be either a classification or a regression.
-
-  Named outputs must be provided as a dict from string to `Tensor`,
-  """
-  _SINGLE_OUTPUT_DEFAULT_NAME = 'output'
-
-  def __init__(self, outputs):
-    """Constructor for PredictOutput.
-
-    Args:
-      outputs: A `Tensor` or a dict of string to `Tensor` representing the
-        predictions.
-
-    Raises:
-      ValueError: if the outputs is not dict, or any of its keys are not
-          strings, or any of its values are not `Tensor`s.
-    """
-
-    self._outputs = self._wrap_and_check_outputs(
-        outputs, self._SINGLE_OUTPUT_DEFAULT_NAME, error_label='Prediction')
-
-  @property
-  def outputs(self):
-    return self._outputs
-
-  def as_signature_def(self, receiver_tensors):
-    return signature_def_utils.predict_signature_def(receiver_tensors,
-                                                     self.outputs)
-
-
-class _SupervisedOutput(ExportOutput):
-  """Represents the output of a supervised training or eval process."""
-  __metaclass__ = abc.ABCMeta
-
-  LOSS_NAME = 'loss'
-  PREDICTIONS_NAME = 'predictions'
-  METRICS_NAME = 'metrics'
-
-  METRIC_VALUE_SUFFIX = 'value'
-  METRIC_UPDATE_SUFFIX = 'update_op'
-
-  _loss = None
-  _predictions = None
-  _metrics = None
-
-  def __init__(self, loss=None, predictions=None, metrics=None):
-    """Constructor for SupervisedOutput (ie, Train or Eval output).
-
-    Args:
-      loss: dict of Tensors or single Tensor representing calculated loss.
-      predictions: dict of Tensors or single Tensor representing model
-        predictions.
-      metrics: Dict of metric results keyed by name.
-        The values of the dict can be one of the following:
-        (1) instance of `Metric` class.
-        (2) (metric_value, update_op) tuples, or a single tuple.
-        metric_value must be a Tensor, and update_op must be a Tensor or Op.
-
-    Raises:
-      ValueError: if any of the outputs' dict keys are not strings or tuples of
-        strings or the values are not Tensors (or Operations in the case of
-        update_op).
-    """
-
-    if loss is not None:
-      loss_dict = self._wrap_and_check_outputs(loss, self.LOSS_NAME)
-      self._loss = self._prefix_output_keys(loss_dict, self.LOSS_NAME)
-    if predictions is not None:
-      pred_dict = self._wrap_and_check_outputs(
-          predictions, self.PREDICTIONS_NAME)
-      self._predictions = self._prefix_output_keys(
-          pred_dict, self.PREDICTIONS_NAME)
-    if metrics is not None:
-      self._metrics = self._wrap_and_check_metrics(metrics)
-
-  def _prefix_output_keys(self, output_dict, output_name):
-    """Prepend output_name to the output_dict keys if it doesn't exist.
-
-    This produces predictable prefixes for the pre-determined outputs
-    of SupervisedOutput.
-
-    Args:
-      output_dict: dict of string to Tensor, assumed valid.
-      output_name: prefix string to prepend to existing keys.
-
-    Returns:
-      dict with updated keys and existing values.
-    """
-
-    new_outputs = {}
-    for key, val in output_dict.items():
-      key = self._prefix_key(key, output_name)
-      new_outputs[key] = val
-    return new_outputs
-
-  def _prefix_key(self, key, output_name):
-    if key.find(output_name) != 0:
-      key = output_name + self._SEPARATOR_CHAR + key
-    return key
-
-  def _wrap_and_check_metrics(self, metrics):
-    """Handle the saving of metrics.
-
-    Metrics is either a tuple of (value, update_op), or a dict of such tuples.
-    Here, we separate out the tuples and create a dict with names to tensors.
-
-    Args:
-      metrics: Dict of metric results keyed by name.
-        The values of the dict can be one of the following:
-        (1) instance of `Metric` class.
-        (2) (metric_value, update_op) tuples, or a single tuple.
-        metric_value must be a Tensor, and update_op must be a Tensor or Op.
-
-    Returns:
-      dict of output_names to tensors
-
-    Raises:
-      ValueError: if the dict key is not a string, or the metric values or ops
-        are not tensors.
-    """
-    if not isinstance(metrics, dict):
-      metrics = {self.METRICS_NAME: metrics}
-
-    outputs = {}
-    for key, value in metrics.items():
-      if isinstance(value, metrics_module.Metric):
-        metric_val = value.result()
-        assert len(value.updates) == 1  # We expect only one update op.
-        metric_op = value.updates[0]
-      else:
-        metric_val, metric_op = value
-      key = self._check_output_key(key, self.METRICS_NAME)
-      key = self._prefix_key(key, self.METRICS_NAME)
-
-      val_name = key + self._SEPARATOR_CHAR + self.METRIC_VALUE_SUFFIX
-      op_name = key + self._SEPARATOR_CHAR + self.METRIC_UPDATE_SUFFIX
-      if not isinstance(metric_val, ops.Tensor):
-        raise ValueError(
-            '{} output value must be a Tensor; got {}.'.format(
-                key, metric_val))
-      if (not isinstance(metric_op, ops.Tensor) and
-          not isinstance(metric_op, ops.Operation)):
-        raise ValueError(
-            '{} update_op must be a Tensor or Operation; got {}.'.format(
-                key, metric_op))
-
-      # We must wrap any ops in a Tensor before export, as the SignatureDef
-      # proto expects tensors only. See b/109740581
-      metric_op_tensor = metric_op
-      if isinstance(metric_op, ops.Operation):
-        with ops.control_dependencies([metric_op]):
-          metric_op_tensor = constant_op.constant([], name='metric_op_wrapper')
-
-      outputs[val_name] = metric_val
-      outputs[op_name] = metric_op_tensor
-
-    return outputs
-
-  @property
-  def loss(self):
-    return self._loss
-
-  @property
-  def predictions(self):
-    return self._predictions
-
-  @property
-  def metrics(self):
-    return self._metrics
-
-  @abc.abstractmethod
-  def _get_signature_def_fn(self):
-    """Returns a function that produces a SignatureDef given desired outputs."""
-    pass
-
-  def as_signature_def(self, receiver_tensors):
-    signature_def_fn = self._get_signature_def_fn()
-    return signature_def_fn(
-        receiver_tensors, self.loss, self.predictions, self.metrics)
-
-
-class TrainOutput(_SupervisedOutput):
-  """Represents the output of a supervised training process.
-
-  This class generates the appropriate signature def for exporting
-  training output by type-checking and wrapping loss, predictions, and metrics
-  values.
-  """
-
-  def _get_signature_def_fn(self):
-    return signature_def_utils.supervised_train_signature_def
-
-
-class EvalOutput(_SupervisedOutput):
-  """Represents the output of a supervised eval process.
+from tensorflow_estimator.python.estimator.export import export_output
 
-  This class generates the appropriate signature def for exporting
-  eval output by type-checking and wrapping loss, predictions, and metrics
-  values.
-  """
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+export_output.__all__ = [
+    s for s in dir(export_output) if not s.startswith('__')
+]
 
-  def _get_signature_def_fn(self):
-    return signature_def_utils.supervised_eval_signature_def
+from tensorflow_estimator.python.estimator.export.export_output import *
diff --git a/tensorflow/python/estimator/export/export_output_test.py b/tensorflow/python/estimator/export/export_output_test.py
deleted file mode 100644
index 96ce0e580d7c4d94e3eced7394ce7f138e9e0030..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/export/export_output_test.py
+++ /dev/null
@@ -1,397 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for export."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.core.framework import tensor_shape_pb2
-from tensorflow.core.framework import types_pb2
-from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.python.estimator.export import export_output as export_output_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.keras import metrics as metrics_module
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import signature_constants
-
-
-class ExportOutputTest(test.TestCase):
-
-  def test_regress_value_must_be_float(self):
-    value = array_ops.placeholder(dtypes.string, 1, name="output-tensor-1")
-    with self.assertRaises(ValueError) as e:
-      export_output_lib.RegressionOutput(value)
-    self.assertEqual('Regression output value must be a float32 Tensor; got '
-                     'Tensor("output-tensor-1:0", shape=(1,), dtype=string)',
-                     str(e.exception))
-
-  def test_classify_classes_must_be_strings(self):
-    classes = array_ops.placeholder(dtypes.float32, 1, name="output-tensor-1")
-    with self.assertRaises(ValueError) as e:
-      export_output_lib.ClassificationOutput(classes=classes)
-    self.assertEqual('Classification classes must be a string Tensor; got '
-                     'Tensor("output-tensor-1:0", shape=(1,), dtype=float32)',
-                     str(e.exception))
-
-  def test_classify_scores_must_be_float(self):
-    scores = array_ops.placeholder(dtypes.string, 1, name="output-tensor-1")
-    with self.assertRaises(ValueError) as e:
-      export_output_lib.ClassificationOutput(scores=scores)
-    self.assertEqual('Classification scores must be a float32 Tensor; got '
-                     'Tensor("output-tensor-1:0", shape=(1,), dtype=string)',
-                     str(e.exception))
-
-  def test_classify_requires_classes_or_scores(self):
-    with self.assertRaises(ValueError) as e:
-      export_output_lib.ClassificationOutput()
-    self.assertEqual("At least one of scores and classes must be set.",
-                     str(e.exception))
-
-  def test_build_standardized_signature_def_regression(self):
-    input_tensors = {
-        "input-1":
-            array_ops.placeholder(
-                dtypes.string, 1, name="input-tensor-1")
-    }
-    value = array_ops.placeholder(dtypes.float32, 1, name="output-tensor-1")
-
-    export_output = export_output_lib.RegressionOutput(value)
-    actual_signature_def = export_output.as_signature_def(input_tensors)
-
-    expected_signature_def = meta_graph_pb2.SignatureDef()
-    shape = tensor_shape_pb2.TensorShapeProto(
-        dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
-    dtype_float = types_pb2.DataType.Value("DT_FLOAT")
-    dtype_string = types_pb2.DataType.Value("DT_STRING")
-    expected_signature_def.inputs[
-        signature_constants.REGRESS_INPUTS].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="input-tensor-1:0",
-                                      dtype=dtype_string,
-                                      tensor_shape=shape))
-    expected_signature_def.outputs[
-        signature_constants.REGRESS_OUTPUTS].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="output-tensor-1:0",
-                                      dtype=dtype_float,
-                                      tensor_shape=shape))
-
-    expected_signature_def.method_name = signature_constants.REGRESS_METHOD_NAME
-    self.assertEqual(actual_signature_def, expected_signature_def)
-
-  def test_build_standardized_signature_def_classify_classes_only(self):
-    """Tests classification with one output tensor."""
-    input_tensors = {
-        "input-1":
-            array_ops.placeholder(
-                dtypes.string, 1, name="input-tensor-1")
-    }
-    classes = array_ops.placeholder(dtypes.string, 1, name="output-tensor-1")
-
-    export_output = export_output_lib.ClassificationOutput(classes=classes)
-    actual_signature_def = export_output.as_signature_def(input_tensors)
-
-    expected_signature_def = meta_graph_pb2.SignatureDef()
-    shape = tensor_shape_pb2.TensorShapeProto(
-        dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
-    dtype_string = types_pb2.DataType.Value("DT_STRING")
-    expected_signature_def.inputs[
-        signature_constants.CLASSIFY_INPUTS].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="input-tensor-1:0",
-                                      dtype=dtype_string,
-                                      tensor_shape=shape))
-    expected_signature_def.outputs[
-        signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="output-tensor-1:0",
-                                      dtype=dtype_string,
-                                      tensor_shape=shape))
-
-    expected_signature_def.method_name = (
-        signature_constants.CLASSIFY_METHOD_NAME)
-    self.assertEqual(actual_signature_def, expected_signature_def)
-
-  def test_build_standardized_signature_def_classify_both(self):
-    """Tests multiple output tensors that include classes and scores."""
-    input_tensors = {
-        "input-1":
-            array_ops.placeholder(
-                dtypes.string, 1, name="input-tensor-1")
-    }
-    classes = array_ops.placeholder(dtypes.string, 1,
-                                    name="output-tensor-classes")
-    scores = array_ops.placeholder(dtypes.float32, 1,
-                                   name="output-tensor-scores")
-
-    export_output = export_output_lib.ClassificationOutput(
-        scores=scores, classes=classes)
-    actual_signature_def = export_output.as_signature_def(input_tensors)
-
-    expected_signature_def = meta_graph_pb2.SignatureDef()
-    shape = tensor_shape_pb2.TensorShapeProto(
-        dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
-    dtype_float = types_pb2.DataType.Value("DT_FLOAT")
-    dtype_string = types_pb2.DataType.Value("DT_STRING")
-    expected_signature_def.inputs[
-        signature_constants.CLASSIFY_INPUTS].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="input-tensor-1:0",
-                                      dtype=dtype_string,
-                                      tensor_shape=shape))
-    expected_signature_def.outputs[
-        signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="output-tensor-classes:0",
-                                      dtype=dtype_string,
-                                      tensor_shape=shape))
-    expected_signature_def.outputs[
-        signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="output-tensor-scores:0",
-                                      dtype=dtype_float,
-                                      tensor_shape=shape))
-
-    expected_signature_def.method_name = (
-        signature_constants.CLASSIFY_METHOD_NAME)
-    self.assertEqual(actual_signature_def, expected_signature_def)
-
-  def test_build_standardized_signature_def_classify_scores_only(self):
-    """Tests classification without classes tensor."""
-    input_tensors = {
-        "input-1":
-            array_ops.placeholder(
-                dtypes.string, 1, name="input-tensor-1")
-    }
-
-    scores = array_ops.placeholder(dtypes.float32, 1,
-                                   name="output-tensor-scores")
-
-    export_output = export_output_lib.ClassificationOutput(
-        scores=scores)
-    actual_signature_def = export_output.as_signature_def(input_tensors)
-
-    expected_signature_def = meta_graph_pb2.SignatureDef()
-    shape = tensor_shape_pb2.TensorShapeProto(
-        dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
-    dtype_float = types_pb2.DataType.Value("DT_FLOAT")
-    dtype_string = types_pb2.DataType.Value("DT_STRING")
-    expected_signature_def.inputs[
-        signature_constants.CLASSIFY_INPUTS].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="input-tensor-1:0",
-                                      dtype=dtype_string,
-                                      tensor_shape=shape))
-    expected_signature_def.outputs[
-        signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom(
-            meta_graph_pb2.TensorInfo(name="output-tensor-scores:0",
-                                      dtype=dtype_float,
-                                      tensor_shape=shape))
-
-    expected_signature_def.method_name = (
-        signature_constants.CLASSIFY_METHOD_NAME)
-    self.assertEqual(actual_signature_def, expected_signature_def)
-
-  def test_predict_outputs_valid(self):
-    """Tests that no errors are raised when provided outputs are valid."""
-    outputs = {
-        "output0": constant_op.constant([0]),
-        u"output1": constant_op.constant(["foo"]),
-    }
-    export_output_lib.PredictOutput(outputs)
-
-    # Single Tensor is OK too
-    export_output_lib.PredictOutput(constant_op.constant([0]))
-
-  def test_predict_outputs_invalid(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        "Prediction output key must be a string"):
-      export_output_lib.PredictOutput({1: constant_op.constant([0])})
-
-    with self.assertRaisesRegexp(
-        ValueError,
-        "Prediction output value must be a Tensor"):
-      export_output_lib.PredictOutput({
-          "prediction1": sparse_tensor.SparseTensor(
-              indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
-      })
-
-
-class MockSupervisedOutput(export_output_lib._SupervisedOutput):
-  """So that we can test the abstract class methods directly."""
-
-  def _get_signature_def_fn(self):
-    pass
-
-
-class SupervisedOutputTest(test.TestCase):
-
-  def test_supervised_outputs_valid(self):
-    """Tests that no errors are raised when provided outputs are valid."""
-    loss = {"my_loss": constant_op.constant([0])}
-    predictions = {u"output1": constant_op.constant(["foo"])}
-    metric_obj = metrics_module.Mean()
-    metric_obj.update_state(constant_op.constant([0]))
-    metrics = {
-        "metrics": metric_obj,
-        "metrics2": (constant_op.constant([0]), constant_op.constant([10]))
-    }
-
-    outputter = MockSupervisedOutput(loss, predictions, metrics)
-    self.assertEqual(outputter.loss["loss/my_loss"], loss["my_loss"])
-    self.assertEqual(
-        outputter.predictions["predictions/output1"], predictions["output1"])
-    self.assertEqual(outputter.metrics["metrics/update_op"].name,
-                     "metric_op_wrapper:0")
-    self.assertEqual(
-        outputter.metrics["metrics2/update_op"], metrics["metrics2"][1])
-
-    # Single Tensor is OK too
-    outputter = MockSupervisedOutput(
-        loss["my_loss"], predictions["output1"], metrics["metrics"])
-    self.assertEqual(outputter.loss, {"loss": loss["my_loss"]})
-    self.assertEqual(
-        outputter.predictions, {"predictions": predictions["output1"]})
-    self.assertEqual(outputter.metrics["metrics/update_op"].name,
-                     "metric_op_wrapper_1:0")
-
-  def test_supervised_outputs_none(self):
-    outputter = MockSupervisedOutput(
-        constant_op.constant([0]), None, None)
-    self.assertEqual(len(outputter.loss), 1)
-    self.assertEqual(outputter.predictions, None)
-    self.assertEqual(outputter.metrics, None)
-
-  def test_supervised_outputs_invalid(self):
-    with self.assertRaisesRegexp(ValueError, "predictions output value must"):
-      MockSupervisedOutput(constant_op.constant([0]), [3], None)
-    with self.assertRaisesRegexp(ValueError, "loss output value must"):
-      MockSupervisedOutput("str", None, None)
-    with self.assertRaisesRegexp(ValueError, "metrics output value must"):
-      MockSupervisedOutput(None, None, (15.3, 4))
-    with self.assertRaisesRegexp(ValueError, "loss output key must"):
-      MockSupervisedOutput({25: "Tensor"}, None, None)
-
-  def test_supervised_outputs_tuples(self):
-    """Tests that no errors are raised when provided outputs are valid."""
-    loss = {("my", "loss"): constant_op.constant([0])}
-    predictions = {(u"output1", "2"): constant_op.constant(["foo"])}
-    metric_obj = metrics_module.Mean()
-    metric_obj.update_state(constant_op.constant([0]))
-    metrics = {
-        ("metrics", "1"):
-            metric_obj,
-        ("metrics", "2"): (constant_op.constant([0]),
-                           constant_op.constant([10]))
-    }
-
-    outputter = MockSupervisedOutput(loss, predictions, metrics)
-    self.assertEqual(set(outputter.loss.keys()), set(["loss/my/loss"]))
-    self.assertEqual(set(outputter.predictions.keys()),
-                     set(["predictions/output1/2"]))
-    self.assertEqual(
-        set(outputter.metrics.keys()),
-        set([
-            "metrics/1/value", "metrics/1/update_op", "metrics/2/value",
-            "metrics/2/update_op"
-        ]))
-
-  def test_supervised_outputs_no_prepend(self):
-    """Tests that no errors are raised when provided outputs are valid."""
-    loss = {"loss": constant_op.constant([0])}
-    predictions = {u"predictions": constant_op.constant(["foo"])}
-    metric_obj = metrics_module.Mean()
-    metric_obj.update_state(constant_op.constant([0]))
-    metrics = {
-        "metrics_1": metric_obj,
-        "metrics_2": (constant_op.constant([0]), constant_op.constant([10]))
-    }
-
-    outputter = MockSupervisedOutput(loss, predictions, metrics)
-    self.assertEqual(set(outputter.loss.keys()), set(["loss"]))
-    self.assertEqual(set(outputter.predictions.keys()), set(["predictions"]))
-    self.assertEqual(
-        set(outputter.metrics.keys()),
-        set([
-            "metrics_1/value", "metrics_1/update_op", "metrics_2/update_op",
-            "metrics_2/value"
-        ]))
-
-  def test_train_signature_def(self):
-    loss = {"my_loss": constant_op.constant([0])}
-    predictions = {u"output1": constant_op.constant(["foo"])}
-    metric_obj = metrics_module.Mean()
-    metric_obj.update_state(constant_op.constant([0]))
-    metrics = {
-        "metrics_1": metric_obj,
-        "metrics_2": (constant_op.constant([0]), constant_op.constant([10]))
-    }
-
-    outputter = export_output_lib.TrainOutput(loss, predictions, metrics)
-
-    receiver = {u"features": constant_op.constant(100, shape=(100, 2)),
-                "labels": constant_op.constant(100, shape=(100, 1))}
-    sig_def = outputter.as_signature_def(receiver)
-
-    self.assertTrue("loss/my_loss" in sig_def.outputs)
-    self.assertTrue("metrics_1/value" in sig_def.outputs)
-    self.assertTrue("metrics_2/value" in sig_def.outputs)
-    self.assertTrue("predictions/output1" in sig_def.outputs)
-    self.assertTrue("features" in sig_def.inputs)
-
-  def test_eval_signature_def(self):
-    loss = {"my_loss": constant_op.constant([0])}
-    predictions = {u"output1": constant_op.constant(["foo"])}
-
-    outputter = export_output_lib.EvalOutput(loss, predictions, None)
-
-    receiver = {u"features": constant_op.constant(100, shape=(100, 2)),
-                "labels": constant_op.constant(100, shape=(100, 1))}
-    sig_def = outputter.as_signature_def(receiver)
-
-    self.assertTrue("loss/my_loss" in sig_def.outputs)
-    self.assertFalse("metrics/value" in sig_def.outputs)
-    self.assertTrue("predictions/output1" in sig_def.outputs)
-    self.assertTrue("features" in sig_def.inputs)
-
-  def test_metric_op_is_tensor(self):
-    """Tests that ops.Operation is wrapped by a tensor for metric_ops."""
-    loss = {"my_loss": constant_op.constant([0])}
-    predictions = {u"output1": constant_op.constant(["foo"])}
-    metric_obj = metrics_module.Mean()
-    metric_obj.update_state(constant_op.constant([0]))
-    metrics = {
-        "metrics_1": metric_obj,
-        "metrics_2": (constant_op.constant([0]), control_flow_ops.no_op())
-    }
-
-    outputter = MockSupervisedOutput(loss, predictions, metrics)
-
-    self.assertTrue(outputter.metrics["metrics_1/update_op"].name.startswith(
-        "metric_op_wrapper"))
-    self.assertTrue(
-        isinstance(outputter.metrics["metrics_1/update_op"], ops.Tensor))
-    self.assertTrue(
-        isinstance(outputter.metrics["metrics_1/value"], ops.Tensor))
-
-    self.assertEqual(outputter.metrics["metrics_2/value"],
-                     metrics["metrics_2"][0])
-    self.assertTrue(outputter.metrics["metrics_2/update_op"].name.startswith(
-        "metric_op_wrapper"))
-    self.assertTrue(
-        isinstance(outputter.metrics["metrics_2/update_op"], ops.Tensor))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
deleted file mode 100644
index ed3219c49bf2bb3d08345b81aa798334ab910d0d..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/export/export_test.py
+++ /dev/null
@@ -1,802 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for export."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import tempfile
-import time
-
-from google.protobuf import text_format
-
-from tensorflow.core.example import example_pb2
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import signature_def_utils
-
-
-class LabeledTensorMock(object):
-  """Mock class emulating LabeledTensor."""
-
-  def __init__(self):
-    self.tensor = constant_op.constant([1])
-
-
-def _convert_labeled_tensor_mock_to_tensor(value, *args, **kwargs):
-  return ops.internal_convert_to_tensor(value.tensor, *args, **kwargs)
-
-
-ops.register_tensor_conversion_function(LabeledTensorMock,
-                                        _convert_labeled_tensor_mock_to_tensor)
-
-
-class ServingInputReceiverTest(test_util.TensorFlowTestCase):
-
-  def test_serving_input_receiver_constructor(self):
-    """Tests that no errors are raised when input is expected."""
-    features = {
-        "feature0": constant_op.constant([0]),
-        u"feature1": constant_op.constant([1]),
-        "feature2": sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
-    }
-    receiver_tensors = {
-        "example0": array_ops.placeholder(dtypes.string, name="example0"),
-        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
-    }
-    export.ServingInputReceiver(features, receiver_tensors)
-
-  def test_serving_input_receiver_features_invalid(self):
-    receiver_tensors = {
-        "example0": array_ops.placeholder(dtypes.string, name="example0"),
-        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
-    }
-
-    with self.assertRaisesRegexp(ValueError, "features must be defined"):
-      export.ServingInputReceiver(
-          features=None,
-          receiver_tensors=receiver_tensors)
-
-    with self.assertRaisesRegexp(ValueError, "feature keys must be strings"):
-      export.ServingInputReceiver(
-          features={1: constant_op.constant([1])},
-          receiver_tensors=receiver_tensors)
-
-    with self.assertRaisesRegexp(
-        ValueError, "feature feature1 must be a Tensor or SparseTensor"):
-      export.ServingInputReceiver(
-          features={"feature1": [1]},
-          receiver_tensors=receiver_tensors)
-
-  def test_serving_input_receiver_receiver_tensors_invalid(self):
-    features = {
-        "feature0": constant_op.constant([0]),
-        u"feature1": constant_op.constant([1]),
-        "feature2": sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
-    }
-
-    with self.assertRaisesRegexp(
-        ValueError, "receiver_tensors must be defined"):
-      export.ServingInputReceiver(
-          features=features,
-          receiver_tensors=None)
-
-    with self.assertRaisesRegexp(
-        ValueError, "receiver_tensor keys must be strings"):
-      export.ServingInputReceiver(
-          features=features,
-          receiver_tensors={
-              1: array_ops.placeholder(dtypes.string, name="example0")})
-
-    with self.assertRaisesRegexp(
-        ValueError, "receiver_tensor example1 must be a Tensor"):
-      export.ServingInputReceiver(
-          features=features,
-          receiver_tensors={"example1": [1]})
-
-  def test_single_feature_single_receiver(self):
-    feature = constant_op.constant(5)
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    input_receiver = export.ServingInputReceiver(
-        feature, receiver_tensor)
-    # single feature is automatically named
-    feature_key, = input_receiver.features.keys()
-    self.assertEqual("feature", feature_key)
-    # single receiver is automatically named
-    receiver_key, = input_receiver.receiver_tensors.keys()
-    self.assertEqual("input", receiver_key)
-
-  def test_multi_feature_single_receiver(self):
-    features = {"foo": constant_op.constant(5),
-                "bar": constant_op.constant(6)}
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    _ = export.ServingInputReceiver(features, receiver_tensor)
-
-  def test_multi_feature_multi_receiver(self):
-    features = {"foo": constant_op.constant(5),
-                "bar": constant_op.constant(6)}
-    receiver_tensors = {"baz": array_ops.placeholder(dtypes.int64),
-                        "qux": array_ops.placeholder(dtypes.float32)}
-    _ = export.ServingInputReceiver(features, receiver_tensors)
-
-  def test_feature_wrong_type(self):
-    feature = "not a tensor"
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    with self.assertRaises(ValueError):
-      _ = export.ServingInputReceiver(feature, receiver_tensor)
-
-  def test_feature_labeled_tensor(self):
-    feature = LabeledTensorMock()
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    _ = export.ServingInputReceiver(feature, receiver_tensor)
-
-  def test_receiver_wrong_type(self):
-    feature = constant_op.constant(5)
-    receiver_tensor = "not a tensor"
-    with self.assertRaises(ValueError):
-      _ = export.ServingInputReceiver(feature, receiver_tensor)
-
-
-class UnsupervisedInputReceiverTest(test_util.TensorFlowTestCase):
-
-  # Since this is basically a wrapper around ServingInputReceiver, we only
-  # have a simple sanity check to ensure that it works.
-
-  def test_unsupervised_input_receiver_constructor(self):
-    """Tests that no errors are raised when input is expected."""
-    features = {
-        "feature0":
-            constant_op.constant([0]),
-        u"feature1":
-            constant_op.constant([1]),
-        "feature2":
-            sparse_tensor.SparseTensor(
-                indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
-    }
-    receiver_tensors = {
-        "example0": array_ops.placeholder(dtypes.string, name="example0"),
-        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
-    }
-    export.UnsupervisedInputReceiver(features, receiver_tensors)
-
-
-class SupervisedInputReceiverTest(test_util.TensorFlowTestCase):
-
-  def test_input_receiver_constructor(self):
-    """Tests that no errors are raised when input is expected."""
-    features = {
-        "feature0": constant_op.constant([0]),
-        u"feature1": constant_op.constant([1]),
-        "feature2": sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
-    }
-    labels = {
-        "classes": constant_op.constant([0] * 100),
-    }
-
-    receiver_tensors = {
-        "example0": array_ops.placeholder(dtypes.string, name="example0"),
-        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
-    }
-    export.SupervisedInputReceiver(features, labels, receiver_tensors)
-
-  def test_input_receiver_raw_values(self):
-    """Tests that no errors are raised when input is expected."""
-    features = {
-        "feature0": constant_op.constant([0]),
-        u"feature1": constant_op.constant([1]),
-        "feature2": sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
-    }
-
-    labels = {
-        "classes": constant_op.constant([0] * 100),
-    }
-
-    receiver_tensors = {
-        "example0": array_ops.placeholder(dtypes.string, name="example0"),
-        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
-    }
-    rec = export.SupervisedInputReceiver(
-        features["feature2"], labels, receiver_tensors)
-    self.assertIsInstance(rec.features, sparse_tensor.SparseTensor)
-
-    rec = export.SupervisedInputReceiver(
-        features, labels["classes"], receiver_tensors)
-    self.assertIsInstance(rec.labels, ops.Tensor)
-
-  def test_input_receiver_features_invalid(self):
-    features = constant_op.constant([0] * 100)
-    labels = constant_op.constant([0])
-    receiver_tensors = {
-        "example0": array_ops.placeholder(dtypes.string, name="example0"),
-        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
-    }
-
-    with self.assertRaisesRegexp(ValueError, "features must be defined"):
-      export.SupervisedInputReceiver(
-          features=None,
-          labels=labels,
-          receiver_tensors=receiver_tensors)
-
-    with self.assertRaisesRegexp(ValueError, "feature keys must be strings"):
-      export.SupervisedInputReceiver(
-          features={1: constant_op.constant([1])},
-          labels=labels,
-          receiver_tensors=receiver_tensors)
-
-    with self.assertRaisesRegexp(ValueError, "label keys must be strings"):
-      export.SupervisedInputReceiver(
-          features=features,
-          labels={1: constant_op.constant([1])},
-          receiver_tensors=receiver_tensors)
-
-    with self.assertRaisesRegexp(
-        ValueError, "feature feature1 must be a Tensor or SparseTensor"):
-      export.SupervisedInputReceiver(
-          features={"feature1": [1]},
-          labels=labels,
-          receiver_tensors=receiver_tensors)
-
-    with self.assertRaisesRegexp(
-        ValueError, "feature must be a Tensor or SparseTensor"):
-      export.SupervisedInputReceiver(
-          features=[1],
-          labels=labels,
-          receiver_tensors=receiver_tensors)
-
-    with self.assertRaisesRegexp(
-        ValueError, "label must be a Tensor or SparseTensor"):
-      export.SupervisedInputReceiver(
-          features=features,
-          labels=100,
-          receiver_tensors=receiver_tensors)
-
-  def test_input_receiver_receiver_tensors_invalid(self):
-    features = {
-        "feature0": constant_op.constant([0]),
-        u"feature1": constant_op.constant([1]),
-        "feature2": sparse_tensor.SparseTensor(
-            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
-    }
-    labels = constant_op.constant([0])
-
-    with self.assertRaisesRegexp(
-        ValueError, "receiver_tensors must be defined"):
-      export.SupervisedInputReceiver(
-          features=features,
-          labels=labels,
-          receiver_tensors=None)
-
-    with self.assertRaisesRegexp(
-        ValueError, "receiver_tensor keys must be strings"):
-      export.SupervisedInputReceiver(
-          features=features,
-          labels=labels,
-          receiver_tensors={
-              1: array_ops.placeholder(dtypes.string, name="example0")})
-
-    with self.assertRaisesRegexp(
-        ValueError, "receiver_tensor example1 must be a Tensor"):
-      export.SupervisedInputReceiver(
-          features=features,
-          labels=labels,
-          receiver_tensors={"example1": [1]})
-
-  def test_single_feature_single_receiver(self):
-    feature = constant_op.constant(5)
-    label = constant_op.constant(5)
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    input_receiver = export.SupervisedInputReceiver(
-        feature, label, receiver_tensor)
-
-    # single receiver is automatically named
-    receiver_key, = input_receiver.receiver_tensors.keys()
-    self.assertEqual("input", receiver_key)
-
-  def test_multi_feature_single_receiver(self):
-    features = {"foo": constant_op.constant(5),
-                "bar": constant_op.constant(6)}
-    labels = {"value": constant_op.constant(5)}
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    _ = export.SupervisedInputReceiver(features, labels, receiver_tensor)
-
-  def test_multi_feature_multi_receiver(self):
-    features = {"foo": constant_op.constant(5),
-                "bar": constant_op.constant(6)}
-    labels = {"value": constant_op.constant(5)}
-    receiver_tensors = {"baz": array_ops.placeholder(dtypes.int64),
-                        "qux": array_ops.placeholder(dtypes.float32)}
-    _ = export.SupervisedInputReceiver(features, labels, receiver_tensors)
-
-  def test_feature_labeled_tensor(self):
-    feature = LabeledTensorMock()
-    label = constant_op.constant(5)
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    _ = export.SupervisedInputReceiver(feature, label, receiver_tensor)
-
-
-class ExportTest(test_util.TensorFlowTestCase):
-
-  def test_build_parsing_serving_input_receiver_fn(self):
-    feature_spec = {"int_feature": parsing_ops.VarLenFeature(dtypes.int64),
-                    "float_feature": parsing_ops.VarLenFeature(dtypes.float32)}
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    with ops.Graph().as_default():
-      serving_input_receiver = serving_input_receiver_fn()
-      self.assertEqual(set(["int_feature", "float_feature"]),
-                       set(serving_input_receiver.features.keys()))
-      self.assertEqual(set(["examples"]),
-                       set(serving_input_receiver.receiver_tensors.keys()))
-
-      example = example_pb2.Example()
-      text_format.Parse("features: { "
-                        "  feature: { "
-                        "    key: 'int_feature' "
-                        "    value: { "
-                        "      int64_list: { "
-                        "        value: [ 21, 2, 5 ] "
-                        "      } "
-                        "    } "
-                        "  } "
-                        "  feature: { "
-                        "    key: 'float_feature' "
-                        "    value: { "
-                        "      float_list: { "
-                        "        value: [ 525.25 ] "
-                        "      } "
-                        "    } "
-                        "  } "
-                        "} ", example)
-
-      with self.cached_session() as sess:
-        sparse_result = sess.run(
-            serving_input_receiver.features,
-            feed_dict={
-                serving_input_receiver.receiver_tensors["examples"].name:
-                [example.SerializeToString()]})
-        self.assertAllEqual([[0, 0], [0, 1], [0, 2]],
-                            sparse_result["int_feature"].indices)
-        self.assertAllEqual([21, 2, 5],
-                            sparse_result["int_feature"].values)
-        self.assertAllEqual([[0, 0]],
-                            sparse_result["float_feature"].indices)
-        self.assertAllEqual([525.25],
-                            sparse_result["float_feature"].values)
-
-  def test_build_raw_serving_input_receiver_fn_name(self):
-    """Test case for issue #12755."""
-    f = {
-        "feature":
-            array_ops.placeholder(
-                name="feature", shape=[32], dtype=dtypes.float32)
-    }
-    serving_input_receiver_fn = export.build_raw_serving_input_receiver_fn(f)
-    v = serving_input_receiver_fn()
-    self.assertTrue(isinstance(v, export.ServingInputReceiver))
-
-  def test_build_raw_serving_input_receiver_fn_without_shape(self):
-    """Test case for issue #21178."""
-    f = {"feature_1": array_ops.placeholder(dtypes.float32),
-         "feature_2": array_ops.placeholder(dtypes.int32)}
-    serving_input_receiver_fn = export.build_raw_serving_input_receiver_fn(f)
-    v = serving_input_receiver_fn()
-    self.assertTrue(isinstance(v, export.ServingInputReceiver))
-    self.assertEqual(
-        tensor_shape.unknown_shape(),
-        v.receiver_tensors["feature_1"].shape)
-    self.assertEqual(
-        tensor_shape.unknown_shape(),
-        v.receiver_tensors["feature_2"].shape)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_build_raw_serving_input_receiver_fn(self):
-    features = {"feature_1": constant_op.constant(["hello"]),
-                "feature_2": constant_op.constant([42])}
-    serving_input_receiver_fn = export.build_raw_serving_input_receiver_fn(
-        features)
-    with ops.Graph().as_default():
-      serving_input_receiver = serving_input_receiver_fn()
-      self.assertEqual(set(["feature_1", "feature_2"]),
-                       set(serving_input_receiver.features.keys()))
-      self.assertEqual(set(["feature_1", "feature_2"]),
-                       set(serving_input_receiver.receiver_tensors.keys()))
-      self.assertEqual(
-          dtypes.string,
-          serving_input_receiver.receiver_tensors["feature_1"].dtype)
-      self.assertEqual(
-          dtypes.int32,
-          serving_input_receiver.receiver_tensors["feature_2"].dtype)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_build_raw_supervised_input_receiver_fn(self):
-    features = {"feature_1": constant_op.constant(["hello"]),
-                "feature_2": constant_op.constant([42])}
-    labels = {"foo": constant_op.constant([5]),
-              "bar": constant_op.constant([6])}
-    input_receiver_fn = export.build_raw_supervised_input_receiver_fn(
-        features, labels)
-    with ops.Graph().as_default():
-      input_receiver = input_receiver_fn()
-      self.assertEqual(set(["feature_1", "feature_2"]),
-                       set(input_receiver.features.keys()))
-      self.assertEqual(set(["foo", "bar"]),
-                       set(input_receiver.labels.keys()))
-      self.assertEqual(set(["feature_1", "feature_2", "foo", "bar"]),
-                       set(input_receiver.receiver_tensors.keys()))
-      self.assertEqual(
-          dtypes.string, input_receiver.receiver_tensors["feature_1"].dtype)
-      self.assertEqual(
-          dtypes.int32, input_receiver.receiver_tensors["feature_2"].dtype)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_build_raw_supervised_input_receiver_fn_raw_tensors(self):
-    features = {"feature_1": constant_op.constant(["hello"]),
-                "feature_2": constant_op.constant([42])}
-    labels = {"foo": constant_op.constant([5]),
-              "bar": constant_op.constant([6])}
-    input_receiver_fn1 = export.build_raw_supervised_input_receiver_fn(
-        features["feature_1"], labels)
-    input_receiver_fn2 = export.build_raw_supervised_input_receiver_fn(
-        features["feature_1"], labels["foo"])
-    with ops.Graph().as_default():
-      input_receiver = input_receiver_fn1()
-      self.assertIsInstance(input_receiver.features, ops.Tensor)
-      self.assertEqual(set(["foo", "bar"]),
-                       set(input_receiver.labels.keys()))
-      self.assertEqual(set(["input", "foo", "bar"]),
-                       set(input_receiver.receiver_tensors.keys()))
-
-      input_receiver = input_receiver_fn2()
-      self.assertIsInstance(input_receiver.features, ops.Tensor)
-      self.assertIsInstance(input_receiver.labels, ops.Tensor)
-      self.assertEqual(set(["input", "label"]),
-                       set(input_receiver.receiver_tensors.keys()))
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_build_raw_supervised_input_receiver_fn_batch_size(self):
-    features = {"feature_1": constant_op.constant(["hello"]),
-                "feature_2": constant_op.constant([42])}
-    labels = {"foo": constant_op.constant([5]),
-              "bar": constant_op.constant([6])}
-    input_receiver_fn = export.build_raw_supervised_input_receiver_fn(
-        features, labels, default_batch_size=10)
-    with ops.Graph().as_default():
-      input_receiver = input_receiver_fn()
-      self.assertEqual([10], input_receiver.receiver_tensors["feature_1"].shape)
-      self.assertEqual([10], input_receiver.features["feature_1"].shape)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_build_raw_supervised_input_receiver_fn_overlapping_keys(self):
-    features = {"feature_1": constant_op.constant(["hello"]),
-                "feature_2": constant_op.constant([42])}
-    labels = {"feature_1": constant_op.constant([5]),
-              "bar": constant_op.constant([6])}
-    with self.assertRaises(ValueError):
-      export.build_raw_supervised_input_receiver_fn(features, labels)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_build_supervised_input_receiver_fn_from_input_fn(self):
-    def dummy_input_fn():
-      return ({"x": constant_op.constant([[1], [1]]),
-               "y": constant_op.constant(["hello", "goodbye"])},
-              constant_op.constant([[1], [1]]))
-
-    input_receiver_fn = export.build_supervised_input_receiver_fn_from_input_fn(
-        dummy_input_fn)
-
-    with ops.Graph().as_default():
-      input_receiver = input_receiver_fn()
-      self.assertEqual(set(["x", "y"]),
-                       set(input_receiver.features.keys()))
-      self.assertIsInstance(input_receiver.labels, ops.Tensor)
-      self.assertEqual(set(["x", "y", "label"]),
-                       set(input_receiver.receiver_tensors.keys()))
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_build_supervised_input_receiver_fn_from_input_fn_args(self):
-    def dummy_input_fn(feature_key="x"):
-      return ({feature_key: constant_op.constant([[1], [1]]),
-               "y": constant_op.constant(["hello", "goodbye"])},
-              {"my_label": constant_op.constant([[1], [1]])})
-
-    input_receiver_fn = export.build_supervised_input_receiver_fn_from_input_fn(
-        dummy_input_fn, feature_key="z")
-
-    with ops.Graph().as_default():
-      input_receiver = input_receiver_fn()
-      self.assertEqual(set(["z", "y"]),
-                       set(input_receiver.features.keys()))
-      self.assertEqual(set(["my_label"]),
-                       set(input_receiver.labels.keys()))
-      self.assertEqual(set(["z", "y", "my_label"]),
-                       set(input_receiver.receiver_tensors.keys()))
-
-  def test_build_all_signature_defs_without_receiver_alternatives(self):
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    output_1 = constant_op.constant([1.])
-    output_2 = constant_op.constant(["2"])
-    output_3 = constant_op.constant(["3"])
-    export_outputs = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-            export_output.RegressionOutput(value=output_1),
-        "head-2": export_output.ClassificationOutput(classes=output_2),
-        "head-3": export_output.PredictOutput(outputs={
-            "some_output_3": output_3
-        }),
-    }
-
-    signature_defs = export.build_all_signature_defs(
-        receiver_tensor, export_outputs)
-
-    expected_signature_defs = {
-        "serving_default":
-            signature_def_utils.regression_signature_def(receiver_tensor,
-                                                         output_1),
-        "head-2":
-            signature_def_utils.classification_signature_def(receiver_tensor,
-                                                             output_2, None),
-        "head-3":
-            signature_def_utils.predict_signature_def({
-                "input": receiver_tensor
-            }, {"some_output_3": output_3})
-    }
-
-    self.assertDictEqual(expected_signature_defs, signature_defs)
-
-  def test_build_all_signature_defs_with_dict_alternatives(self):
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    receiver_tensors_alternative_1 = {
-        "foo": array_ops.placeholder(dtypes.int64),
-        "bar": array_ops.sparse_placeholder(dtypes.float32)}
-    receiver_tensors_alternatives = {"other": receiver_tensors_alternative_1}
-    output_1 = constant_op.constant([1.])
-    output_2 = constant_op.constant(["2"])
-    output_3 = constant_op.constant(["3"])
-    export_outputs = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-            export_output.RegressionOutput(value=output_1),
-        "head-2": export_output.ClassificationOutput(classes=output_2),
-        "head-3": export_output.PredictOutput(outputs={
-            "some_output_3": output_3
-        }),
-    }
-
-    signature_defs = export.build_all_signature_defs(
-        receiver_tensor, export_outputs, receiver_tensors_alternatives)
-
-    expected_signature_defs = {
-        "serving_default":
-            signature_def_utils.regression_signature_def(
-                receiver_tensor,
-                output_1),
-        "head-2":
-            signature_def_utils.classification_signature_def(
-                receiver_tensor,
-                output_2, None),
-        "head-3":
-            signature_def_utils.predict_signature_def(
-                {"input": receiver_tensor},
-                {"some_output_3": output_3}),
-        "other:head-3":
-            signature_def_utils.predict_signature_def(
-                receiver_tensors_alternative_1,
-                {"some_output_3": output_3})
-
-        # Note that the alternatives 'other:serving_default' and 'other:head-2'
-        # are invalid, because regession and classification signatures must take
-        # a single string input.  Here we verify that these invalid signatures
-        # are not included in the export.
-    }
-
-    self.assertDictEqual(expected_signature_defs, signature_defs)
-
-  def test_build_all_signature_defs_with_single_alternatives(self):
-    receiver_tensor = array_ops.placeholder(dtypes.string)
-    receiver_tensors_alternative_1 = array_ops.placeholder(dtypes.int64)
-    receiver_tensors_alternative_2 = array_ops.sparse_placeholder(
-        dtypes.float32)
-    # Note we are passing single Tensors as values of
-    # receiver_tensors_alternatives, where normally that is a dict.
-    # In this case a dict will be created using the default receiver tensor
-    # name "input".
-    receiver_tensors_alternatives = {"other1": receiver_tensors_alternative_1,
-                                     "other2": receiver_tensors_alternative_2}
-    output_1 = constant_op.constant([1.])
-    output_2 = constant_op.constant(["2"])
-    output_3 = constant_op.constant(["3"])
-    export_outputs = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-            export_output.RegressionOutput(value=output_1),
-        "head-2": export_output.ClassificationOutput(classes=output_2),
-        "head-3": export_output.PredictOutput(outputs={
-            "some_output_3": output_3
-        }),
-    }
-
-    signature_defs = export.build_all_signature_defs(
-        receiver_tensor, export_outputs, receiver_tensors_alternatives)
-
-    expected_signature_defs = {
-        "serving_default":
-            signature_def_utils.regression_signature_def(
-                receiver_tensor,
-                output_1),
-        "head-2":
-            signature_def_utils.classification_signature_def(
-                receiver_tensor,
-                output_2, None),
-        "head-3":
-            signature_def_utils.predict_signature_def(
-                {"input": receiver_tensor},
-                {"some_output_3": output_3}),
-        "other1:head-3":
-            signature_def_utils.predict_signature_def(
-                {"input": receiver_tensors_alternative_1},
-                {"some_output_3": output_3}),
-        "other2:head-3":
-            signature_def_utils.predict_signature_def(
-                {"input": receiver_tensors_alternative_2},
-                {"some_output_3": output_3})
-
-        # Note that the alternatives 'other:serving_default' and 'other:head-2'
-        # are invalid, because regession and classification signatures must take
-        # a single string input.  Here we verify that these invalid signatures
-        # are not included in the export.
-    }
-
-    self.assertDictEqual(expected_signature_defs, signature_defs)
-
-  def test_build_all_signature_defs_export_outputs_required(self):
-    receiver_tensor = constant_op.constant(["11"])
-
-    with self.assertRaises(ValueError) as e:
-      export.build_all_signature_defs(receiver_tensor, None)
-
-    self.assertTrue(str(e.exception).startswith(
-        "export_outputs must be a dict"))
-
-  def test_get_timestamped_export_dir(self):
-    export_dir_base = tempfile.mkdtemp() + "export/"
-    export_dir_1 = export.get_timestamped_export_dir(
-        export_dir_base)
-    time.sleep(2)
-    export_dir_2 = export.get_timestamped_export_dir(
-        export_dir_base)
-    time.sleep(2)
-    export_dir_3 = export.get_timestamped_export_dir(
-        export_dir_base)
-
-    # Export directories should be named using a timestamp that is seconds
-    # since epoch.  Such a timestamp is 10 digits long.
-    time_1 = os.path.basename(export_dir_1)
-    self.assertEqual(10, len(time_1))
-    time_2 = os.path.basename(export_dir_2)
-    self.assertEqual(10, len(time_2))
-    time_3 = os.path.basename(export_dir_3)
-    self.assertEqual(10, len(time_3))
-
-    self.assertTrue(int(time_1) < int(time_2))
-    self.assertTrue(int(time_2) < int(time_3))
-
-  def test_build_all_signature_defs_serving_only(self):
-    receiver_tensor = {"input": array_ops.placeholder(dtypes.string)}
-    output_1 = constant_op.constant([1.])
-    export_outputs = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-            export_output.PredictOutput(outputs=output_1),
-        "train": export_output.TrainOutput(loss=output_1),
-    }
-
-    signature_defs = export.build_all_signature_defs(
-        receiver_tensor, export_outputs)
-
-    expected_signature_defs = {
-        "serving_default": signature_def_utils.predict_signature_def(
-            receiver_tensor, {"output": output_1})
-    }
-
-    self.assertDictEqual(expected_signature_defs, signature_defs)
-
-    signature_defs = export.build_all_signature_defs(
-        receiver_tensor, export_outputs, serving_only=False)
-
-    expected_signature_defs.update({
-        "train": signature_def_utils.supervised_train_signature_def(
-            receiver_tensor, loss={"loss": output_1})
-    })
-
-    self.assertDictEqual(expected_signature_defs, signature_defs)
-
-
-class TensorServingReceiverTest(test_util.TensorFlowTestCase):
-
-  def test_tensor_serving_input_receiver_constructor(self):
-    features = constant_op.constant([0])
-    receiver_tensors = {
-        "example0": array_ops.placeholder(dtypes.string, name="example0"),
-        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
-    }
-    r = export.TensorServingInputReceiver(features, receiver_tensors)
-    self.assertTrue(isinstance(r.features, ops.Tensor))
-    self.assertTrue(isinstance(r.receiver_tensors, dict))
-
-  def test_tensor_serving_input_receiver_sparse(self):
-    features = sparse_tensor.SparseTensor(
-        indices=[[0, 0]], values=[1], dense_shape=[1, 1])
-    receiver_tensors = {
-        "example0": array_ops.placeholder(dtypes.string, name="example0"),
-        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
-    }
-    r = export.TensorServingInputReceiver(features, receiver_tensors)
-    self.assertTrue(isinstance(r.features, sparse_tensor.SparseTensor))
-    self.assertTrue(isinstance(r.receiver_tensors, dict))
-
-  def test_serving_input_receiver_features_invalid(self):
-    receiver_tensors = {
-        "example0": array_ops.placeholder(dtypes.string, name="example0"),
-        u"example1": array_ops.placeholder(dtypes.string, name="example1"),
-    }
-
-    with self.assertRaisesRegexp(ValueError, "features must be defined"):
-      export.TensorServingInputReceiver(
-          features=None,
-          receiver_tensors=receiver_tensors)
-
-    with self.assertRaisesRegexp(ValueError, "feature must be a Tensor"):
-      export.TensorServingInputReceiver(
-          features={"1": constant_op.constant([1])},
-          receiver_tensors=receiver_tensors)
-
-  def test_serving_input_receiver_receiver_tensors_invalid(self):
-    features = constant_op.constant([0])
-
-    with self.assertRaisesRegexp(
-        ValueError, "receiver_tensors must be defined"):
-      export.TensorServingInputReceiver(
-          features=features,
-          receiver_tensors=None)
-
-    with self.assertRaisesRegexp(
-        ValueError, "receiver_tensor keys must be strings"):
-      export.TensorServingInputReceiver(
-          features=features,
-          receiver_tensors={
-              1: array_ops.placeholder(dtypes.string, name="example0")})
-
-    with self.assertRaisesRegexp(
-        ValueError, "receiver_tensor example1 must be a Tensor"):
-      export.TensorServingInputReceiver(
-          features=features,
-          receiver_tensors={"example1": [1]})
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py
index b18212cfcda8f817f909672007c5b000db718232..f7b8839813e1543433ca048b5bcdd73b93b0882f 100644
--- a/tensorflow/python/estimator/exporter.py
+++ b/tensorflow/python/estimator/exporter.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,495 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""`Exporter` class represents different flavors of model export."""
+"""exporter python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
-import os
-
-from tensorflow.python.estimator import gc
-from tensorflow.python.estimator import util
-from tensorflow.python.estimator.canned import metric_keys
-from tensorflow.python.framework import errors_impl
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.summary import summary_iterator
-from tensorflow.python.util.tf_export import estimator_export
-
-
-@estimator_export('estimator.Exporter')
-class Exporter(object):
-  """A class representing a type of model export."""
-
-  @abc.abstractproperty
-  def name(self):
-    """Directory name.
-
-    A directory name under the export base directory where exports of
-    this type are written.  Should not be `None` nor empty.
-    """
-    pass
-
-  @abc.abstractmethod
-  def export(self, estimator, export_path, checkpoint_path, eval_result,
-             is_the_final_export):
-    """Exports the given `Estimator` to a specific format.
-
-    Args:
-      estimator: the `Estimator` to export.
-      export_path: A string containing a directory where to write the export.
-      checkpoint_path: The checkpoint path to export.
-      eval_result: The output of `Estimator.evaluate` on this checkpoint.
-      is_the_final_export: This boolean is True when this is an export in the
-        end of training.  It is False for the intermediate exports during
-        the training.
-        When passing `Exporter` to `tf.estimator.train_and_evaluate`
-        `is_the_final_export` is always False if `TrainSpec.max_steps` is
-        `None`.
-
-    Returns:
-      The string path to the exported directory or `None` if export is skipped.
-    """
-    pass
-
-
-class _SavedModelExporter(Exporter):
-  """This class exports the serving graph and checkpoints.
-
-     This class provides a basic exporting functionality and serves as a
-     foundation for specialized `Exporter`s.
-  """
-
-  def __init__(self,
-               name,
-               serving_input_receiver_fn,
-               assets_extra=None,
-               as_text=False,
-               strip_default_attrs=True):
-    """Create an `Exporter` to use with `tf.estimator.EvalSpec`.
-
-    Args:
-      name: unique name of this `Exporter` that is going to be used in the
-        export path.
-      serving_input_receiver_fn: a function that takes no arguments and returns
-        a `ServingInputReceiver`.
-      assets_extra: An optional dict specifying how to populate the assets.extra
-        directory within the exported SavedModel.  Each key should give the
-        destination path (including the filename) relative to the assets.extra
-        directory.  The corresponding value gives the full path of the source
-        file to be copied.  For example, the simple case of copying a single
-        file without renaming it is specified as
-        `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
-      as_text: whether to write the SavedModel proto in text format. Defaults to
-        `False`.
-      strip_default_attrs: Boolean. If set, default attrs in the `GraphDef` will
-        be stripped on write. This is the default behavior and recommended for
-        better forward compatibility of the resulting `SavedModel`.
-
-    Raises:
-      ValueError: if any arguments is invalid.
-    """
-    self._name = name
-    self._serving_input_receiver_fn = serving_input_receiver_fn
-    self._assets_extra = assets_extra
-    self._as_text = as_text
-    self._strip_default_attrs = strip_default_attrs
-
-  @property
-  def name(self):
-    return self._name
-
-  def export(self, estimator, export_path, checkpoint_path, eval_result,
-             is_the_final_export):
-    del is_the_final_export
-
-    export_result = estimator.export_savedmodel(
-        export_path,
-        self._serving_input_receiver_fn,
-        assets_extra=self._assets_extra,
-        as_text=self._as_text,
-        checkpoint_path=checkpoint_path,
-        strip_default_attrs=self._strip_default_attrs)
-
-    return export_result
-
-
-def _loss_smaller(best_eval_result, current_eval_result):
-  """Compares two evaluation results and returns true if the 2nd one is smaller.
-
-  Both evaluation results should have the values for MetricKeys.LOSS, which are
-  used for comparison.
-
-  Args:
-    best_eval_result: best eval metrics.
-    current_eval_result: current eval metrics.
-
-  Returns:
-    True if the loss of current_eval_result is smaller; otherwise, False.
-
-  Raises:
-    ValueError: If input eval result is None or no loss is available.
-  """
-  default_key = metric_keys.MetricKeys.LOSS
-  if not best_eval_result or default_key not in best_eval_result:
-    raise ValueError(
-        'best_eval_result cannot be empty or no loss is found in it.')
-
-  if not current_eval_result or default_key not in current_eval_result:
-    raise ValueError(
-        'current_eval_result cannot be empty or no loss is found in it.')
-
-  return best_eval_result[default_key] > current_eval_result[default_key]
-
-
-def _verify_compare_fn_args(compare_fn):
-  """Verifies compare_fn arguments."""
-  args = set(util.fn_args(compare_fn))
-  if 'best_eval_result' not in args:
-    raise ValueError(
-        'compare_fn (%s) must include best_eval_result argument.' % compare_fn)
-  if 'current_eval_result' not in args:
-    raise ValueError(
-        'compare_fn (%s) must include current_eval_result argument.' %
-        compare_fn)
-  non_valid_args = list(args - set(['best_eval_result', 'current_eval_result']))
-  if non_valid_args:
-    raise ValueError('compare_fn (%s) has following not expected args: %s' %
-                     (compare_fn, non_valid_args))
-
-
-@estimator_export('estimator.BestExporter')
-class BestExporter(Exporter):
-  """This class exports the serving graph and checkpoints of the best models.
-
-  This class performs a model export everytime when the new model is better
-  than any exsiting model.
-  """
-
-  def __init__(self,
-               name='best_exporter',
-               serving_input_receiver_fn=None,
-               event_file_pattern='eval/*.tfevents.*',
-               compare_fn=_loss_smaller,
-               assets_extra=None,
-               as_text=False,
-               exports_to_keep=5):
-    """Create an `Exporter` to use with `tf.estimator.EvalSpec`.
-
-    Example of creating a BestExporter for training and evluation:
-    ```python
-    def make_train_and_eval_fn():
-      # Set up feature columns.
-      categorial_feature_a = (
-          tf.feature_column.categorical_column_with_hash_bucket(...))
-      categorial_feature_a_emb = embedding_column(
-          categorical_column=categorial_feature_a, ...)
-      ...  # other feature columns
-
-      estimator = tf.estimator.DNNClassifier(
-          config=tf.estimator.RunConfig(
-              model_dir='/my_model', save_summary_steps=100),
-          feature_columns=[categorial_feature_a_emb, ...],
-          hidden_units=[1024, 512, 256])
-
-      serving_feature_spec = tf.feature_column.make_parse_example_spec(
-          categorial_feature_a_emb)
-      serving_input_receiver_fn = (
-          tf.estimator.export.build_parsing_serving_input_receiver_fn(
-          serving_feature_spec))
-
-      exporter = tf.estimator.BestExporter(
-          name="best_exporter",
-          serving_input_receiver_fn=serving_input_receiver_fn,
-          exports_to_keep=5)
-
-      train_spec = tf.estimator.TrainSpec(...)
-
-      eval_spec = [tf.estimator.EvalSpec(
-        input_fn=eval_input_fn,
-        steps=100,
-        exporters=exporter,
-        start_delay_secs=0,
-        throttle_secs=5)]
-
-      return tf.estimator.DistributedTrainingSpec(estimator, train_spec,
-                                                  eval_spec)
-    ```
-
-    Args:
-      name: unique name of this `Exporter` that is going to be used in the
-        export path.
-      serving_input_receiver_fn: a function that takes no arguments and returns
-        a `ServingInputReceiver`.
-      event_file_pattern: event file name pattern relative to model_dir. If
-        None, however, the exporter would not be preemption-safe. To be
-        preemption-safe, event_file_pattern should be specified.
-      compare_fn: a function that compares two evaluation results and returns
-        true if current evaluation result is better. Follows the signature:
-        * Args:
-          * `best_eval_result`: This is the evaluation result of the best model.
-          * `current_eval_result`: This is the evaluation result of current
-                 candidate model.
-        * Returns:
-          True if current evaluation result is better; otherwise, False.
-      assets_extra: An optional dict specifying how to populate the assets.extra
-        directory within the exported SavedModel.  Each key should give the
-        destination path (including the filename) relative to the assets.extra
-        directory.  The corresponding value gives the full path of the source
-        file to be copied.  For example, the simple case of copying a single
-        file without renaming it is specified as `{'my_asset_file.txt':
-        '/path/to/my_asset_file.txt'}`.
-      as_text: whether to write the SavedModel proto in text format. Defaults to
-        `False`.
-      exports_to_keep: Number of exports to keep.  Older exports will be
-        garbage-collected.  Defaults to 5.  Set to `None` to disable garbage
-        collection.
-
-    Raises:
-      ValueError: if any arguments is invalid.
-    """
-    self._compare_fn = compare_fn
-    if self._compare_fn is None:
-      raise ValueError('`compare_fn` must not be None.')
-    _verify_compare_fn_args(self._compare_fn)
-
-    self._saved_model_exporter = _SavedModelExporter(
-        name, serving_input_receiver_fn, assets_extra, as_text)
-
-    self._event_file_pattern = event_file_pattern
-    self._model_dir = None
-    self._best_eval_result = None
-
-    self._exports_to_keep = exports_to_keep
-    if exports_to_keep is not None and exports_to_keep <= 0:
-      raise ValueError(
-          '`exports_to_keep`, if provided, must be positive number')
-
-  @property
-  def name(self):
-    return self._saved_model_exporter.name
-
-  def export(self, estimator, export_path, checkpoint_path, eval_result,
-             is_the_final_export):
-    export_result = None
-
-    if self._model_dir != estimator.model_dir and self._event_file_pattern:
-      # Loads best metric from event files.
-      tf_logging.info('Loading best metric from event files.')
-
-      self._model_dir = estimator.model_dir
-      full_event_file_pattern = os.path.join(self._model_dir,
-                                             self._event_file_pattern)
-      self._best_eval_result = self._get_best_eval_result(
-          full_event_file_pattern)
-
-    if self._best_eval_result is None or self._compare_fn(
-        best_eval_result=self._best_eval_result,
-        current_eval_result=eval_result):
-      tf_logging.info('Performing best model export.')
-      self._best_eval_result = eval_result
-      export_result = self._saved_model_exporter.export(
-          estimator, export_path, checkpoint_path, eval_result,
-          is_the_final_export)
-      self._garbage_collect_exports(export_path)
-
-    return export_result
-
-  def _garbage_collect_exports(self, export_dir_base):
-    """Deletes older exports, retaining only a given number of the most recent.
-
-    Export subdirectories are assumed to be named with monotonically increasing
-    integers; the most recent are taken to be those with the largest values.
-
-    Args:
-      export_dir_base: the base directory under which each export is in a
-        versioned subdirectory.
-    """
-    if self._exports_to_keep is None:
-      return
-
-    def _export_version_parser(path):
-      # create a simple parser that pulls the export_version from the directory.
-      filename = os.path.basename(path.path)
-      if not (len(filename) == 10 and filename.isdigit()):
-        return None
-      return path._replace(export_version=int(filename))
-
-    # pylint: disable=protected-access
-    keep_filter = gc._largest_export_versions(self._exports_to_keep)
-    delete_filter = gc._negation(keep_filter)
-    for p in delete_filter(
-        gc._get_paths(export_dir_base, parser=_export_version_parser)):
-      try:
-        gfile.DeleteRecursively(p.path)
-      except errors_impl.NotFoundError as e:
-        tf_logging.warn('Can not delete %s recursively: %s', p.path, e)
-    # pylint: enable=protected-access
-
-  def _get_best_eval_result(self, event_files):
-    """Get the best eval result from event files.
-
-    Args:
-      event_files: Absolute pattern of event files.
-
-    Returns:
-      The best eval result.
-    """
-    if not event_files:
-      return None
-
-    best_eval_result = None
-    for event_file in gfile.Glob(os.path.join(event_files)):
-      for event in summary_iterator.summary_iterator(event_file):
-        if event.HasField('summary'):
-          event_eval_result = {}
-          for value in event.summary.value:
-            if value.HasField('simple_value'):
-              event_eval_result[value.tag] = value.simple_value
-          if event_eval_result:
-            if best_eval_result is None or self._compare_fn(
-                best_eval_result, event_eval_result):
-              best_eval_result = event_eval_result
-    return best_eval_result
-
-
-@estimator_export('estimator.FinalExporter')
-class FinalExporter(Exporter):
-  """This class exports the serving graph and checkpoints in the end.
-
-  This class performs a single export in the end of training.
-  """
-
-  def __init__(self,
-               name,
-               serving_input_receiver_fn,
-               assets_extra=None,
-               as_text=False):
-    """Create an `Exporter` to use with `tf.estimator.EvalSpec`.
-
-    Args:
-      name: unique name of this `Exporter` that is going to be used in the
-        export path.
-      serving_input_receiver_fn: a function that takes no arguments and returns
-        a `ServingInputReceiver`.
-      assets_extra: An optional dict specifying how to populate the assets.extra
-        directory within the exported SavedModel.  Each key should give the
-        destination path (including the filename) relative to the assets.extra
-        directory.  The corresponding value gives the full path of the source
-        file to be copied.  For example, the simple case of copying a single
-        file without renaming it is specified as
-        `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
-      as_text: whether to write the SavedModel proto in text format. Defaults to
-        `False`.
-
-    Raises:
-      ValueError: if any arguments is invalid.
-    """
-    self._saved_model_exporter = _SavedModelExporter(
-        name, serving_input_receiver_fn, assets_extra, as_text)
-
-  @property
-  def name(self):
-    return self._saved_model_exporter.name
-
-  def export(self, estimator, export_path, checkpoint_path, eval_result,
-             is_the_final_export):
-    if not is_the_final_export:
-      return None
-
-    tf_logging.info('Performing the final export in the end of training.')
-
-    return self._saved_model_exporter.export(estimator, export_path,
-                                             checkpoint_path, eval_result,
-                                             is_the_final_export)
-
-
-@estimator_export('estimator.LatestExporter')
-class LatestExporter(Exporter):
-  """This class regularly exports the serving graph and checkpoints.
-
-  In addition to exporting, this class also garbage collects stale exports.
-  """
-
-  def __init__(self,
-               name,
-               serving_input_receiver_fn,
-               assets_extra=None,
-               as_text=False,
-               exports_to_keep=5):
-    """Create an `Exporter` to use with `tf.estimator.EvalSpec`.
-
-    Args:
-      name: unique name of this `Exporter` that is going to be used in the
-        export path.
-      serving_input_receiver_fn: a function that takes no arguments and returns
-        a `ServingInputReceiver`.
-      assets_extra: An optional dict specifying how to populate the assets.extra
-        directory within the exported SavedModel.  Each key should give the
-        destination path (including the filename) relative to the assets.extra
-        directory.  The corresponding value gives the full path of the source
-        file to be copied.  For example, the simple case of copying a single
-        file without renaming it is specified as
-        `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`.
-      as_text: whether to write the SavedModel proto in text format. Defaults to
-        `False`.
-      exports_to_keep: Number of exports to keep.  Older exports will be
-        garbage-collected.  Defaults to 5.  Set to `None` to disable garbage
-        collection.
-
-    Raises:
-      ValueError: if any arguments is invalid.
-    """
-    self._saved_model_exporter = _SavedModelExporter(
-        name, serving_input_receiver_fn, assets_extra, as_text)
-    self._exports_to_keep = exports_to_keep
-    if exports_to_keep is not None and exports_to_keep <= 0:
-      raise ValueError(
-          '`exports_to_keep`, if provided, must be positive number')
-
-  @property
-  def name(self):
-    return self._saved_model_exporter.name
-
-  def export(self, estimator, export_path, checkpoint_path, eval_result,
-             is_the_final_export):
-    export_result = self._saved_model_exporter.export(
-        estimator, export_path, checkpoint_path, eval_result,
-        is_the_final_export)
-
-    self._garbage_collect_exports(export_path)
-    return export_result
-
-  def _garbage_collect_exports(self, export_dir_base):
-    """Deletes older exports, retaining only a given number of the most recent.
-
-    Export subdirectories are assumed to be named with monotonically increasing
-    integers; the most recent are taken to be those with the largest values.
-
-    Args:
-      export_dir_base: the base directory under which each export is in a
-        versioned subdirectory.
-    """
-    if self._exports_to_keep is None:
-      return
+from tensorflow_estimator.python.estimator import exporter
 
-    def _export_version_parser(path):
-      # create a simple parser that pulls the export_version from the directory.
-      filename = os.path.basename(path.path)
-      if not (len(filename) == 10 and filename.isdigit()):
-        return None
-      return path._replace(export_version=int(filename))
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+exporter.__all__ = [s for s in dir(exporter) if not s.startswith('__')]
 
-    # pylint: disable=protected-access
-    keep_filter = gc._largest_export_versions(self._exports_to_keep)
-    delete_filter = gc._negation(keep_filter)
-    for p in delete_filter(
-        gc._get_paths(export_dir_base, parser=_export_version_parser)):
-      try:
-        gfile.DeleteRecursively(p.path)
-      except errors_impl.NotFoundError as e:
-        tf_logging.warn('Can not delete %s recursively: %s', p.path, e)
-    # pylint: enable=protected-access
+from tensorflow_estimator.python.estimator.exporter import *
diff --git a/tensorflow/python/estimator/exporter_test.py b/tensorflow/python/estimator/exporter_test.py
deleted file mode 100644
index fcccfbde7a9eaa26cc170ac6f49fba2ca61fef00..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/exporter_test.py
+++ /dev/null
@@ -1,400 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for `Exporter`s."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import tempfile
-import time
-
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import exporter as exporter_lib
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
-
-
-class BestExporterTest(test.TestCase):
-
-  def test_error_out_if_exports_to_keep_is_zero(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    with self.assertRaisesRegexp(ValueError, "positive number"):
-      exporter = exporter_lib.BestExporter(
-          name="best_exporter",
-          serving_input_receiver_fn=_serving_input_receiver_fn,
-          exports_to_keep=0)
-      self.assertEqual("best_exporter", exporter.name)
-
-  def test_best_exporter(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp()
-    gfile.MkDir(export_dir_base)
-    gfile.MkDir(export_dir_base + "/export")
-    gfile.MkDir(export_dir_base + "/eval")
-
-    exporter = exporter_lib.BestExporter(
-        name="best_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        exports_to_keep=5)
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.export_savedmodel.return_value = "export_result_path"
-    estimator.model_dir = export_dir_base
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {}, False)
-
-    self.assertEqual("export_result_path", export_result)
-    estimator.export_savedmodel.assert_called_with(
-        export_dir_base,
-        _serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        checkpoint_path="checkpoint_path",
-        strip_default_attrs=True)
-
-  def test_best_export_is_saved(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp()
-    gfile.MkDir(export_dir_base)
-    gfile.MkDir(export_dir_base + "/export")
-    gfile.MkDir(export_dir_base + "/eval")
-
-    exporter = exporter_lib.BestExporter(
-        name="best_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        exports_to_keep=1)
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.export_savedmodel.return_value = "export_result_path"
-    estimator.model_dir = export_dir_base
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"loss": 0.5}, False)
-
-    self.assertTrue(estimator.export_savedmodel.called)
-    self.assertEqual("export_result_path", export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"loss": 0.6}, False)
-    self.assertEqual(None, export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"loss": 0.4}, False)
-    self.assertEqual("export_result_path", export_result)
-
-  def test_best_exporter_with_preemption(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp()
-    gfile.MkDir(export_dir_base)
-    gfile.MkDir(export_dir_base + "/export")
-    gfile.MkDir(export_dir_base + "/eval")
-
-    eval_dir_base = os.path.join(export_dir_base, "eval_continuous")
-    estimator_lib._write_dict_to_summary(eval_dir_base, {"loss": 50}, 1)
-    estimator_lib._write_dict_to_summary(eval_dir_base, {"loss": 60}, 2)
-
-    exporter = exporter_lib.BestExporter(
-        name="best_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        event_file_pattern="eval_continuous/*.tfevents.*",
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        exports_to_keep=1)
-
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.model_dir = export_dir_base
-    estimator.export_savedmodel.return_value = "export_result_path"
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"loss": 100}, False)
-    self.assertEqual(None, export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"loss": 10}, False)
-    self.assertEqual("export_result_path", export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"loss": 20}, False)
-    self.assertEqual(None, export_result)
-
-  def test_best_exporter_with_empty_event(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp()
-    gfile.MkDir(export_dir_base)
-    gfile.MkDir(export_dir_base + "/export")
-    gfile.MkDir(export_dir_base + "/eval")
-
-    eval_dir_base = os.path.join(export_dir_base, "eval_continuous")
-    estimator_lib._write_dict_to_summary(eval_dir_base, {}, 1)
-    estimator_lib._write_dict_to_summary(eval_dir_base, {"loss": 60}, 2)
-
-    exporter = exporter_lib.BestExporter(
-        name="best_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        event_file_pattern="eval_continuous/*.tfevents.*",
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        exports_to_keep=1)
-
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.model_dir = export_dir_base
-    estimator.export_savedmodel.return_value = "export_result_path"
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"loss": 100}, False)
-    self.assertEqual(None, export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {"loss": 10}, False)
-    self.assertEqual("export_result_path", export_result)
-
-  def test_garbage_collect_exports(self):
-    export_dir_base = tempfile.mkdtemp()
-    gfile.MkDir(export_dir_base)
-    gfile.MkDir(export_dir_base + "/export")
-    gfile.MkDir(export_dir_base + "/eval")
-
-    export_dir_1 = _create_test_export_dir(export_dir_base)
-    export_dir_2 = _create_test_export_dir(export_dir_base)
-    export_dir_3 = _create_test_export_dir(export_dir_base)
-    export_dir_4 = _create_test_export_dir(export_dir_base)
-
-    self.assertTrue(gfile.Exists(export_dir_1))
-    self.assertTrue(gfile.Exists(export_dir_2))
-    self.assertTrue(gfile.Exists(export_dir_3))
-    self.assertTrue(gfile.Exists(export_dir_4))
-
-    def _serving_input_receiver_fn():
-      return array_ops.constant([1]), None
-
-    exporter = exporter_lib.BestExporter(
-        name="best_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        exports_to_keep=2)
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.model_dir = export_dir_base
-    # Garbage collect all but the most recent 2 exports,
-    # where recency is determined based on the timestamp directory names.
-    exporter.export(estimator, export_dir_base, None, None, False)
-
-    self.assertFalse(gfile.Exists(export_dir_1))
-    self.assertFalse(gfile.Exists(export_dir_2))
-    self.assertTrue(gfile.Exists(export_dir_3))
-    self.assertTrue(gfile.Exists(export_dir_4))
-
-
-class LatestExporterTest(test.TestCase):
-
-  def test_error_out_if_exports_to_keep_is_zero(self):
-    def _serving_input_receiver_fn():
-      pass
-
-    with self.assertRaisesRegexp(ValueError, "positive number"):
-      exporter = exporter_lib.LatestExporter(
-          name="latest_exporter",
-          serving_input_receiver_fn=_serving_input_receiver_fn,
-          exports_to_keep=0)
-      self.assertEqual("latest_exporter", exporter.name)
-
-  def test_latest_exporter(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp() + "export/"
-    gfile.MkDir(export_dir_base)
-
-    exporter = exporter_lib.LatestExporter(
-        name="latest_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        exports_to_keep=5)
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.export_savedmodel.return_value = "export_result_path"
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {}, False)
-
-    self.assertEqual("export_result_path", export_result)
-    estimator.export_savedmodel.assert_called_with(
-        export_dir_base,
-        _serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        checkpoint_path="checkpoint_path",
-        strip_default_attrs=True)
-
-  def test_only_the_last_export_is_saved(self):
-
-    def _serving_input_receiver_fn():
-      pass
-
-    export_dir_base = tempfile.mkdtemp() + "export/"
-    gfile.MkDir(export_dir_base)
-
-    exporter = exporter_lib.FinalExporter(
-        name="latest_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False)
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    estimator.export_savedmodel.return_value = "export_result_path"
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {}, False)
-
-    self.assertFalse(estimator.export_savedmodel.called)
-    self.assertEqual(None, export_result)
-
-    export_result = exporter.export(estimator, export_dir_base,
-                                    "checkpoint_path", {}, True)
-
-    self.assertEqual("export_result_path", export_result)
-    estimator.export_savedmodel.assert_called_with(
-        export_dir_base,
-        _serving_input_receiver_fn,
-        assets_extra={"from/path": "to/path"},
-        as_text=False,
-        checkpoint_path="checkpoint_path",
-        strip_default_attrs=True)
-
-  def test_garbage_collect_exports(self):
-    export_dir_base = tempfile.mkdtemp() + "export/"
-    gfile.MkDir(export_dir_base)
-    export_dir_1 = _create_test_export_dir(export_dir_base)
-    export_dir_2 = _create_test_export_dir(export_dir_base)
-    export_dir_3 = _create_test_export_dir(export_dir_base)
-    export_dir_4 = _create_test_export_dir(export_dir_base)
-
-    self.assertTrue(gfile.Exists(export_dir_1))
-    self.assertTrue(gfile.Exists(export_dir_2))
-    self.assertTrue(gfile.Exists(export_dir_3))
-    self.assertTrue(gfile.Exists(export_dir_4))
-
-    def _serving_input_receiver_fn():
-      return array_ops.constant([1]), None
-
-    exporter = exporter_lib.LatestExporter(
-        name="latest_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        exports_to_keep=2)
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    # Garbage collect all but the most recent 2 exports,
-    # where recency is determined based on the timestamp directory names.
-    exporter.export(estimator, export_dir_base, None, None, False)
-
-    self.assertFalse(gfile.Exists(export_dir_1))
-    self.assertFalse(gfile.Exists(export_dir_2))
-    self.assertTrue(gfile.Exists(export_dir_3))
-    self.assertTrue(gfile.Exists(export_dir_4))
-
-  def test_garbage_collect_exports_with_trailing_delimiter(self):
-    export_dir_base = tempfile.mkdtemp() + "export/"
-    gfile.MkDir(export_dir_base)
-    export_dir_1 = _create_test_export_dir(export_dir_base)
-    export_dir_2 = _create_test_export_dir(export_dir_base)
-    export_dir_3 = _create_test_export_dir(export_dir_base)
-    export_dir_4 = _create_test_export_dir(export_dir_base)
-
-    self.assertTrue(gfile.Exists(export_dir_1))
-    self.assertTrue(gfile.Exists(export_dir_2))
-    self.assertTrue(gfile.Exists(export_dir_3))
-    self.assertTrue(gfile.Exists(export_dir_4))
-
-    def _serving_input_receiver_fn():
-      return array_ops.constant([1]), None
-
-    exporter = exporter_lib.LatestExporter(
-        name="latest_exporter",
-        serving_input_receiver_fn=_serving_input_receiver_fn,
-        exports_to_keep=1)
-    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
-    # Garbage collect all but the most recent 2 exports,
-    # where recency is determined based on the timestamp directory names.
-    with test.mock.patch.object(gfile, "ListDirectory") as mock_list_directory:
-      mock_list_directory.return_value = [
-          os.path.basename(export_dir_1) + b"/",
-          os.path.basename(export_dir_2) + b"/",
-          os.path.basename(export_dir_3) + b"/",
-          os.path.basename(export_dir_4) + b"/",
-          ]
-      exporter.export(estimator, export_dir_base, None, None, False)
-
-    self.assertFalse(gfile.Exists(export_dir_1))
-    self.assertFalse(gfile.Exists(export_dir_2))
-    self.assertFalse(gfile.Exists(export_dir_3))
-    self.assertTrue(gfile.Exists(export_dir_4))
-
-
-def _create_test_export_dir(export_dir_base):
-  export_dir = _get_timestamped_export_dir(export_dir_base)
-  gfile.MkDir(export_dir)
-  time.sleep(2)
-  return export_dir
-
-
-def _get_timestamped_export_dir(export_dir_base):
-  # When we create a timestamped directory, there is a small chance that the
-  # directory already exists because another worker is also writing exports.
-  # In this case we just wait one second to get a new timestamp and try again.
-  # If this fails several times in a row, then something is seriously wrong.
-  max_directory_creation_attempts = 10
-
-  attempts = 0
-  while attempts < max_directory_creation_attempts:
-    export_timestamp = int(time.time())
-
-    export_dir = os.path.join(
-        compat.as_bytes(export_dir_base), compat.as_bytes(
-            str(export_timestamp)))
-    if not gfile.Exists(export_dir):
-      # Collisions are still possible (though extremely unlikely): this
-      # directory is not actually created yet, but it will be almost
-      # instantly on return from this function.
-      return export_dir
-    time.sleep(1)
-    attempts += 1
-    logging.warn(
-        "Export directory {} already exists; retrying (attempt {}/{})".format(
-            export_dir, attempts, max_directory_creation_attempts))
-  raise RuntimeError("Failed to obtain a unique export directory name after "
-                     "{} attempts.".format(max_directory_creation_attempts))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/estimator/gc.py b/tensorflow/python/estimator/gc.py
index 03ad33dd6b77e4eaad80bd9090911add92b29730..d324b31b5c1df125c2549077f14994a8163c2879 100644
--- a/tensorflow/python/estimator/gc.py
+++ b/tensorflow/python/estimator/gc.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,200 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""gc python module.
 
-r"""System for specifying garbage collection (GC) of path based data.
-
-This framework allows for GC of data specified by path names, for example files
-on disk.  gc.Path objects each represent a single item stored at a path and may
-be a base directory,
-  /tmp/exports/0/...
-  /tmp/exports/1/...
-  ...
-or a fully qualified file,
-  /tmp/train-1.ckpt
-  /tmp/train-2.ckpt
-  ...
-
-A gc filter function takes and returns a list of gc.Path items.  Filter
-functions are responsible for selecting Path items for preservation or deletion.
-Note that functions should always return a sorted list.
-
-For example,
-  base_dir = "/tmp"
-  # Create the directories.
-  for e in xrange(10):
-    os.mkdir("%s/%d" % (base_dir, e), 0o755)
-
-  # Create a simple parser that pulls the export_version from the directory.
-  path_regex = "^" + re.escape(base_dir) + "/(\\d+)$"
-  def parser(path):
-    match = re.match(path_regex, path.path)
-    if not match:
-      return None
-    return path._replace(export_version=int(match.group(1)))
-
-  path_list = gc._get_paths("/tmp", parser)  # contains all ten Paths
-
-  every_fifth = gc._mod_export_version(5)
-  print(every_fifth(path_list))  # shows ["/tmp/0", "/tmp/5"]
-
-  largest_three = gc.largest_export_versions(3)
-  print(largest_three(all_paths))  # shows ["/tmp/7", "/tmp/8", "/tmp/9"]
-
-  both = gc._union(every_fifth, largest_three)
-  print(both(all_paths))  # shows ["/tmp/0", "/tmp/5",
-                          #        "/tmp/7", "/tmp/8", "/tmp/9"]
-  # Delete everything not in 'both'.
-  to_delete = gc._negation(both)
-  for p in to_delete(all_paths):
-    gfile.DeleteRecursively(p.path)  # deletes:  "/tmp/1", "/tmp/2",
-                                     # "/tmp/3", "/tmp/4", "/tmp/6",
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
 """
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import heapq
-import math
-import os
-
-from tensorflow.python.platform import gfile
-from tensorflow.python.util import compat
-
-Path = collections.namedtuple('Path', 'path export_version')
-
-
-def _largest_export_versions(n):
-  """Creates a filter that keeps the largest n export versions.
-
-  Args:
-    n: number of versions to keep.
-
-  Returns:
-    A filter function that keeps the n largest paths.
-  """
-  def keep(paths):
-    heap = []
-    for idx, path in enumerate(paths):
-      if path.export_version is not None:
-        heapq.heappush(heap, (path.export_version, idx))
-    keepers = [paths[i] for _, i in heapq.nlargest(n, heap)]
-    return sorted(keepers)
-
-  return keep
-
-
-def _one_of_every_n_export_versions(n):
-  """Creates a filter that keeps one of every n export versions.
-
-  Args:
-    n: interval size.
-
-  Returns:
-    A filter function that keeps exactly one path from each interval
-    [0, n], (n, 2n], (2n, 3n], etc...  If more than one path exists in an
-    interval the largest is kept.
-  """
-  def keep(paths):
-    """A filter function that keeps exactly one out of every n paths."""
-
-    keeper_map = {}  # map from interval to largest path seen in that interval
-    for p in paths:
-      if p.export_version is None:
-        # Skip missing export_versions.
-        continue
-      # Find the interval (with a special case to map export_version = 0 to
-      # interval 0.
-      interval = math.floor(
-          (p.export_version - 1) / n) if p.export_version else 0
-      existing = keeper_map.get(interval, None)
-      if (not existing) or (existing.export_version < p.export_version):
-        keeper_map[interval] = p
-    return sorted(keeper_map.values())
-
-  return keep
-
-
-def _mod_export_version(n):
-  """Creates a filter that keeps every export that is a multiple of n.
-
-  Args:
-    n: step size.
-
-  Returns:
-    A filter function that keeps paths where export_version % n == 0.
-  """
-  def keep(paths):
-    keepers = []
-    for p in paths:
-      if p.export_version % n == 0:
-        keepers.append(p)
-    return sorted(keepers)
-  return keep
-
-
-def _union(lf, rf):
-  """Creates a filter that keeps the union of two filters.
-
-  Args:
-    lf: first filter
-    rf: second filter
-
-  Returns:
-    A filter function that keeps the n largest paths.
-  """
-  def keep(paths):
-    l = set(lf(paths))
-    r = set(rf(paths))
-    return sorted(list(l|r))
-  return keep
-
-
-def _negation(f):
-  """Negate a filter.
-
-  Args:
-    f: filter function to invert
-
-  Returns:
-    A filter function that returns the negation of f.
-  """
-  def keep(paths):
-    l = set(paths)
-    r = set(f(paths))
-    return sorted(list(l-r))
-  return keep
-
-
-def _get_paths(base_dir, parser):
-  """Gets a list of Paths in a given directory.
+from tensorflow_estimator.python.estimator import gc
 
-  Args:
-    base_dir: directory.
-    parser: a function which gets the raw Path and can augment it with
-      information such as the export_version, or ignore the path by returning
-      None.  An example parser may extract the export version from a path
-      such as "/tmp/exports/100" an another may extract from a full file
-      name such as "/tmp/checkpoint-99.out".
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+gc.__all__ = [s for s in dir(gc) if not s.startswith('__')]
 
-  Returns:
-    A list of Paths contained in the base directory with the parsing function
-    applied.
-    By default the following fields are populated,
-      - Path.path
-    The parsing function is responsible for populating,
-      - Path.export_version
-  """
-  raw_paths = gfile.ListDirectory(base_dir)
-  paths = []
-  for r in raw_paths:
-    # ListDirectory() return paths with "/" at the last if base_dir was GCS URL
-    r = compat.as_str_any(r)
-    if r[-1] == '/':
-      r = r[0:len(r)-1]
-    p = parser(Path(os.path.join(compat.as_str_any(base_dir), r), None))
-    if p:
-      paths.append(p)
-  return sorted(paths)
+from tensorflow_estimator.python.estimator.gc import *
diff --git a/tensorflow/python/estimator/gc_test.py b/tensorflow/python/estimator/gc_test.py
deleted file mode 100644
index 53c3d4ca2acbdf2e68d9ca65acf08749e58577c9..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/gc_test.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for garbage collection utilities."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import re
-
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.python.estimator import gc
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-def _create_parser(base_dir):
-  # create a simple parser that pulls the export_version from the directory.
-  def parser(path):
-    # Modify the path object for RegEx match for Windows Paths
-    if os.name == "nt":
-      match = re.match(
-          "^" + compat.as_str_any(base_dir).replace("\\", "/") + "/(\\d+)$",
-          compat.as_str_any(path.path).replace("\\", "/"))
-    else:
-      match = re.match("^" + compat.as_str_any(base_dir) + "/(\\d+)$",
-                       compat.as_str_any(path.path))
-    if not match:
-      return None
-    return path._replace(export_version=int(match.group(1)))
-
-  return parser
-
-
-class GcTest(test_util.TensorFlowTestCase):
-
-  def testLargestExportVersions(self):
-    paths = [gc.Path("/foo", 8), gc.Path("/foo", 9), gc.Path("/foo", 10)]
-    newest = gc._largest_export_versions(2)
-    n = newest(paths)
-    self.assertEqual(n, [gc.Path("/foo", 9), gc.Path("/foo", 10)])
-
-  def testLargestExportVersionsDoesNotDeleteZeroFolder(self):
-    paths = [gc.Path("/foo", 0), gc.Path("/foo", 3)]
-    newest = gc._largest_export_versions(2)
-    n = newest(paths)
-    self.assertEqual(n, [gc.Path("/foo", 0), gc.Path("/foo", 3)])
-
-  def testModExportVersion(self):
-    paths = [
-        gc.Path("/foo", 4), gc.Path("/foo", 5), gc.Path("/foo", 6),
-        gc.Path("/foo", 9)
-    ]
-    mod = gc._mod_export_version(2)
-    self.assertEqual(mod(paths), [gc.Path("/foo", 4), gc.Path("/foo", 6)])
-    mod = gc._mod_export_version(3)
-    self.assertEqual(mod(paths), [gc.Path("/foo", 6), gc.Path("/foo", 9)])
-
-  def testOneOfEveryNExportVersions(self):
-    paths = [
-        gc.Path("/foo", 0), gc.Path("/foo", 1), gc.Path("/foo", 3),
-        gc.Path("/foo", 5), gc.Path("/foo", 6), gc.Path("/foo", 7),
-        gc.Path("/foo", 8), gc.Path("/foo", 33)
-    ]
-    one_of = gc._one_of_every_n_export_versions(3)
-    self.assertEqual(
-        one_of(paths), [
-            gc.Path("/foo", 3), gc.Path("/foo", 6), gc.Path("/foo", 8),
-            gc.Path("/foo", 33)
-        ])
-
-  def testOneOfEveryNExportVersionsZero(self):
-    # Zero is a special case since it gets rolled into the first interval.
-    # Test that here.
-    paths = [gc.Path("/foo", 0), gc.Path("/foo", 4), gc.Path("/foo", 5)]
-    one_of = gc._one_of_every_n_export_versions(3)
-    self.assertEqual(one_of(paths), [gc.Path("/foo", 0), gc.Path("/foo", 5)])
-
-  def testUnion(self):
-    paths = []
-    for i in xrange(10):
-      paths.append(gc.Path("/foo", i))
-    f = gc._union(gc._largest_export_versions(3), gc._mod_export_version(3))
-    self.assertEqual(
-        f(paths), [
-            gc.Path("/foo", 0), gc.Path("/foo", 3), gc.Path("/foo", 6),
-            gc.Path("/foo", 7), gc.Path("/foo", 8), gc.Path("/foo", 9)
-        ])
-
-  def testNegation(self):
-    paths = [
-        gc.Path("/foo", 4), gc.Path("/foo", 5), gc.Path("/foo", 6),
-        gc.Path("/foo", 9)
-    ]
-    mod = gc._negation(gc._mod_export_version(2))
-    self.assertEqual(mod(paths), [gc.Path("/foo", 5), gc.Path("/foo", 9)])
-    mod = gc._negation(gc._mod_export_version(3))
-    self.assertEqual(mod(paths), [gc.Path("/foo", 4), gc.Path("/foo", 5)])
-
-  def testPathsWithParse(self):
-    base_dir = os.path.join(test.get_temp_dir(), "paths_parse")
-    self.assertFalse(gfile.Exists(base_dir))
-    for p in xrange(3):
-      gfile.MakeDirs(os.path.join(base_dir, "%d" % p))
-    # add a base_directory to ignore
-    gfile.MakeDirs(os.path.join(base_dir, "ignore"))
-
-    self.assertEqual(
-        gc._get_paths(base_dir, _create_parser(base_dir)),
-        [
-            gc.Path(os.path.join(base_dir, "0"), 0),
-            gc.Path(os.path.join(base_dir, "1"), 1),
-            gc.Path(os.path.join(base_dir, "2"), 2)
-        ])
-
-  def testMixedStrTypes(self):
-    temp_dir = compat.as_bytes(test.get_temp_dir())
-
-    for sub_dir in ["str", b"bytes", u"unicode"]:
-      base_dir = os.path.join(
-          (temp_dir if isinstance(sub_dir, bytes) else temp_dir.decode()),
-          sub_dir)
-      self.assertFalse(gfile.Exists(base_dir))
-      gfile.MakeDirs(os.path.join(compat.as_str_any(base_dir), "42"))
-      gc._get_paths(base_dir, _create_parser(base_dir))
-
-  def testGcsDirWithSeparator(self):
-    base_dir = "gs://bucket/foo"
-    with test.mock.patch.object(gfile, "ListDirectory") as mock_list_directory:
-      # gfile.ListDirectory returns directory names with separator '/'
-      mock_list_directory.return_value = ["0/", "1/"]
-      self.assertEqual(
-          gc._get_paths(base_dir, _create_parser(base_dir)),
-          [
-              gc.Path(os.path.join(base_dir, "0"), 0),
-              gc.Path(os.path.join(base_dir, "1"), 1)
-          ])
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/estimator/inputs/__init__.py b/tensorflow/python/estimator/inputs/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..045ede224de5f1a8d532d65e0eb609861cb3cae9 100644
--- a/tensorflow/python/estimator/inputs/__init__.py
+++ b/tensorflow/python/estimator/inputs/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""inputs python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_estimator.python.estimator import inputs
+
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+inputs.__all__ = [s for s in dir(inputs) if not s.startswith('__')]
+
+from tensorflow_estimator.python.estimator.inputs import *
diff --git a/tensorflow/python/estimator/inputs/inputs.py b/tensorflow/python/estimator/inputs/inputs.py
index 6be168ee08ddf7e4a4a03c3fa75e3de927d2a3a3..25756c54bc66910a3b8b5bb4afd0b921d487962e 100644
--- a/tensorflow/python/estimator/inputs/inputs.py
+++ b/tensorflow/python/estimator/inputs/inputs.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,14 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utility methods to create simple input_fns."""
+"""inputs python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import,line-too-long
-from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn
-from tensorflow.python.estimator.inputs.pandas_io import pandas_input_fn
+from tensorflow_estimator.python.estimator.inputs import inputs
+
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+inputs.__all__ = [s for s in dir(inputs) if not s.startswith('__')]
 
-# pylint: enable=unused-import,line-too-long
+from tensorflow_estimator.python.estimator.inputs.inputs import *
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index a6cefdece21fa8ce944095cb5d3395f2b67142bd..0600a0f35bc408819015e0c1b8bba50455fb95fd 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,214 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Methods to allow dict of numpy arrays."""
+"""numpy_io python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
-import numpy as np
-from six import string_types
-
-from tensorflow.python.estimator.inputs.queues import feeding_functions
-from tensorflow.python.util.tf_export import estimator_export
-
-# Key name to pack the target into dict of `features`. See
-# `_get_unique_target_key` for details.
-_TARGET_KEY = '__target_key__'
-
-
-def _get_unique_target_key(features):
-  """Returns a key not existed in the input dict `features`.
-
-  Caller of `input_fn` usually provides `features` (dict of numpy arrays) and
-  `target`, but the underlying feeding module expects a single dict of numpy
-  arrays as input. So, the `target` needs to be packed into the `features`
-  temporarily and unpacked after calling the feeding function. Toward this goal,
-  this function returns a key not existed in the `features` to pack the
-  `target`.
-
-  Args:
-    features: OrderedDict of numpy arrays
-
-  Returns:
-    A unique key that can be used to insert the subsequent target into
-      features dict.
-  """
-  target_key = _TARGET_KEY
-  while target_key in features:
-    target_key += '_n'
-  return target_key
-
-
-def _validate_and_convert_features(x):
-  """Type check input data and make a shadow copy as an ordered dict.
-
-  Args:
-    x: numpy array object or dict of numpy array objects. If an array,
-      the array will be treated as a single feature.
-
-  Returns:
-    OrderedDict copy of x.
-
-  Raises:
-    ValueError: if x is empty
-    TypeError: if x is an unknown type.
-  """
-  if isinstance(x, dict):
-    if not x:
-      raise ValueError('x cannot be an empty dict')
-    # Make a shadow copy and also ensure the order of iteration is consistent.
-    ordered_dict_data = collections.OrderedDict(
-        sorted(x.items(), key=lambda t: t[0]))
-  elif isinstance(x, np.ndarray):
-    if x.size == 0:
-      raise ValueError('x cannot be an empty array')
-
-    # Make a shadow copy and convert to dict to align with dict processing.
-    ordered_dict_data = collections.OrderedDict({'__direct_np_input__': x})
-  else:
-    x_type = type(x).__name__
-    raise TypeError('x must be a dict or array; got {}'.format(x_type))
-
-  return ordered_dict_data
-
-
-@estimator_export('estimator.inputs.numpy_input_fn')
-def numpy_input_fn(x,
-                   y=None,
-                   batch_size=128,
-                   num_epochs=1,
-                   shuffle=None,
-                   queue_capacity=1000,
-                   num_threads=1):
-  """Returns input function that would feed dict of numpy arrays into the model.
-
-  This returns a function outputting `features` and `targets` based on the dict
-  of numpy arrays. The dict `features` has the same keys as the `x`. The dict
-  `targets` has the same keys as the `y` if `y` is a dict.
-
-  Example:
-
-  ```python
-  age = np.arange(4) * 1.0
-  height = np.arange(32, 36)
-  x = {'age': age, 'height': height}
-  y = np.arange(-32, -28)
-
-  with tf.Session() as session:
-    input_fn = numpy_io.numpy_input_fn(
-        x, y, batch_size=2, shuffle=False, num_epochs=1)
-  ```
-
-  Args:
-    x: numpy array object or dict of numpy array objects. If an array,
-      the array will be treated as a single feature.
-    y: numpy array object or dict of numpy array object. `None` if absent.
-    batch_size: Integer, size of batches to return.
-    num_epochs: Integer, number of epochs to iterate over data. If `None` will
-      run forever.
-    shuffle: Boolean, if True shuffles the queue. Avoid shuffle at prediction
-      time.
-    queue_capacity: Integer, size of queue to accumulate.
-    num_threads: Integer, number of threads used for reading and enqueueing. In
-      order to have predicted and repeatable order of reading and enqueueing,
-      such as in prediction and evaluation mode, `num_threads` should be 1.
-
-  Returns:
-    Function, that has signature of ()->(dict of `features`, `targets`)
-
-  Raises:
-    ValueError: if the shape of `y` mismatches the shape of values in `x` (i.e.,
-      values in `x` have same shape).
-    ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict.
-    ValueError: if x or y is an empty dict.
-    TypeError: `x` is not a dict or array.
-    ValueError: if 'shuffle' is not provided or a bool.
-  """
-  if not isinstance(shuffle, bool):
-    raise ValueError('shuffle must be provided and explicitly set as boolean '
-                     '(it is recommended to set it as True for training); '
-                     'got {}'.format(shuffle))
-
-  def input_fn():
-    """Numpy input function."""
-
-    # Note that `x` should not be used after conversion to ordered_dict_data,
-    # as type could be either dict or array.
-    ordered_dict_data = _validate_and_convert_features(x)
-
-    # Deep copy keys which is a view in python 3
-    feature_keys = list(ordered_dict_data.keys())
-
-    if y is None:
-      target_keys = None
-    elif isinstance(y, dict):
-      if not y:
-        raise ValueError('y cannot be empty dict, use None instead.')
-
-      ordered_dict_y = collections.OrderedDict(
-          sorted(y.items(), key=lambda t: t[0]))
-      target_keys = list(ordered_dict_y.keys())
-
-      duplicate_keys = set(feature_keys).intersection(set(target_keys))
-      if duplicate_keys:
-        raise ValueError('{} duplicate keys are found in both x and y: '
-                         '{}'.format(len(duplicate_keys), duplicate_keys))
-
-      ordered_dict_data.update(ordered_dict_y)
-    else:
-      target_keys = _get_unique_target_key(ordered_dict_data)
-      ordered_dict_data[target_keys] = y
-
-    if len(set(v.shape[0] for v in ordered_dict_data.values())) != 1:
-      shape_dict_of_x = {k: ordered_dict_data[k].shape for k in feature_keys}
-
-      if target_keys is None:
-        shape_of_y = None
-      elif isinstance(target_keys, string_types):
-        shape_of_y = y.shape
-      else:
-        shape_of_y = {k: ordered_dict_data[k].shape for k in target_keys}
-
-      raise ValueError('Length of tensors in x and y is mismatched. All '
-                       'elements in x and y must have the same length.\n'
-                       'Shapes in x: {}\n'
-                       'Shapes in y: {}\n'.format(shape_dict_of_x, shape_of_y))
-
-    queue = feeding_functions._enqueue_data(  # pylint: disable=protected-access
-        ordered_dict_data,
-        queue_capacity,
-        shuffle=shuffle,
-        num_threads=num_threads,
-        enqueue_size=batch_size,
-        num_epochs=num_epochs)
-
-    batch = (
-        queue.dequeue_many(batch_size)
-        if num_epochs is None else queue.dequeue_up_to(batch_size))
-
-    # Remove the first `Tensor` in `batch`, which is the row number.
-    if batch:
-      batch.pop(0)
-
-    if isinstance(x, np.ndarray):
-      # Return as the same type as original array.
-      features = batch[0]
-    else:
-      # Return as the original dict type
-      features = dict(zip(feature_keys, batch[:len(feature_keys)]))
+from tensorflow_estimator.python.estimator.inputs import numpy_io
 
-    if target_keys is None:
-      # TODO(martinwicke), return consistent result
-      return features
-    elif isinstance(target_keys, string_types):
-      target = batch[-1]
-      return features, target
-    else:
-      target = dict(zip(target_keys, batch[-len(target_keys):]))
-      return features, target
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+numpy_io.__all__ = [s for s in dir(numpy_io) if not s.startswith('__')]
 
-  return input_fn
+from tensorflow_estimator.python.estimator.inputs.numpy_io import *
diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py
deleted file mode 100644
index 632908415f78776f1177ed685b8ad7870bc362be..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/inputs/numpy_io_test.py
+++ /dev/null
@@ -1,620 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for numpy_io."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column_lib as fc
-from tensorflow.python.feature_column.feature_column import _LinearModel
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import lookup_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.platform import test
-from tensorflow.python.training import coordinator
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import queue_runner_impl
-
-
-class NumpyIoTest(test.TestCase):
-
-  def testNumpyInputFn(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -28)
-
-    with self.cached_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      session.run([features, target])
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithVeryLargeBatchSizeAndMultipleEpochs(self):
-    a = np.arange(2) * 1.0
-    b = np.arange(32, 34)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -30)
-
-    with self.cached_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=128, shuffle=False, num_epochs=2)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1, 0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33, 32, 33])
-      self.assertAllEqual(res[1], [-32, -31, -32, -31])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithZeroEpochs(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -28)
-
-    with self.cached_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=0)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithBatchSizeNotDividedByDataSize(self):
-    batch_size = 2
-    a = np.arange(5) * 1.0
-    b = np.arange(32, 37)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -27)
-
-    with self.cached_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=batch_size, shuffle=False, num_epochs=1)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [2, 3])
-      self.assertAllEqual(res[0]['b'], [34, 35])
-      self.assertAllEqual(res[1], [-30, -29])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [4])
-      self.assertAllEqual(res[0]['b'], [36])
-      self.assertAllEqual(res[1], [-28])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithBatchSizeNotDividedByDataSizeAndMultipleEpochs(self):
-    batch_size = 2
-    a = np.arange(3) * 1.0
-    b = np.arange(32, 35)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -29)
-
-    with self.cached_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=batch_size, shuffle=False, num_epochs=3)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [2, 0])
-      self.assertAllEqual(res[0]['b'], [34, 32])
-      self.assertAllEqual(res[1], [-30, -32])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [1, 2])
-      self.assertAllEqual(res[0]['b'], [33, 34])
-      self.assertAllEqual(res[1], [-31, -30])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1])
-      self.assertAllEqual(res[0]['b'], [32, 33])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [2])
-      self.assertAllEqual(res[0]['b'], [34])
-      self.assertAllEqual(res[1], [-30])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithBatchSizeLargerThanDataSize(self):
-    batch_size = 10
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -28)
-
-    with self.cached_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=batch_size, shuffle=False, num_epochs=1)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [0, 1, 2, 3])
-      self.assertAllEqual(res[0]['b'], [32, 33, 34, 35])
-      self.assertAllEqual(res[1], [-32, -31, -30, -29])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithDifferentDimensionsOfFeatures(self):
-    a = np.array([[1, 2], [3, 4]])
-    b = np.array([5, 6])
-    x = {'a': a, 'b': b}
-    y = np.arange(-32, -30)
-
-    with self.cached_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-      features, target = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      res = session.run([features, target])
-      self.assertAllEqual(res[0]['a'], [[1, 2], [3, 4]])
-      self.assertAllEqual(res[0]['b'], [5, 6])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithXAsNonDict(self):
-    x = list(range(32, 36))
-    y = np.arange(4)
-    with self.cached_session():
-      with self.assertRaisesRegexp(TypeError, 'x must be a dict or array'):
-        failing_input_fn = numpy_io.numpy_input_fn(
-            x, y, batch_size=2, shuffle=False, num_epochs=1)
-        failing_input_fn()
-
-  def testNumpyInputFnWithXIsEmptyDict(self):
-    x = {}
-    y = np.arange(4)
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'x cannot be an empty'):
-        failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
-        failing_input_fn()
-
-  def testNumpyInputFnWithXIsEmptyArray(self):
-    x = np.array([[], []])
-    y = np.arange(4)
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'x cannot be an empty'):
-        failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
-        failing_input_fn()
-
-  def testNumpyInputFnWithYIsNone(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = None
-
-    with self.cached_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-      features_tensor = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      feature = session.run(features_tensor)
-      self.assertEqual(len(feature), 2)
-      self.assertAllEqual(feature['a'], [0, 1])
-      self.assertAllEqual(feature['b'], [32, 33])
-
-      session.run([features_tensor])
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features_tensor])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithNonBoolShuffle(self):
-    x = np.arange(32, 36)
-    y = np.arange(4)
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError,
-                                   'shuffle must be provided and explicitly '
-                                   'set as boolean'):
-        # Default shuffle is None.
-        numpy_io.numpy_input_fn(x, y)
-
-  def testNumpyInputFnWithTargetKeyAlreadyInX(self):
-    array = np.arange(32, 36)
-    x = {'__target_key__': array}
-    y = np.arange(4)
-
-    with self.cached_session():
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-      input_fn()
-      self.assertAllEqual(x['__target_key__'], array)
-      # The input x should not be mutated.
-      self.assertItemsEqual(x.keys(), ['__target_key__'])
-
-  def testNumpyInputFnWithMismatchLengthOfInputs(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    x_mismatch_length = {'a': np.arange(1), 'b': b}
-    y_longer_length = np.arange(10)
-
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'Length of tensors in x and y is mismatched.'):
-        failing_input_fn = numpy_io.numpy_input_fn(
-            x, y_longer_length, batch_size=2, shuffle=False, num_epochs=1)
-        failing_input_fn()
-
-      with self.assertRaisesRegexp(
-          ValueError, 'Length of tensors in x and y is mismatched.'):
-        failing_input_fn = numpy_io.numpy_input_fn(
-            x=x_mismatch_length,
-            y=None,
-            batch_size=2,
-            shuffle=False,
-            num_epochs=1)
-        failing_input_fn()
-
-  def testNumpyInputFnWithYAsDict(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = {'y1': np.arange(-32, -28), 'y2': np.arange(32, 28, -1)}
-
-    with self.cached_session() as session:
-      input_fn = numpy_io.numpy_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-      features_tensor, targets_tensor = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      features, targets = session.run([features_tensor, targets_tensor])
-      self.assertEqual(len(features), 2)
-      self.assertAllEqual(features['a'], [0, 1])
-      self.assertAllEqual(features['b'], [32, 33])
-      self.assertEqual(len(targets), 2)
-      self.assertAllEqual(targets['y1'], [-32, -31])
-      self.assertAllEqual(targets['y2'], [32, 31])
-
-      session.run([features_tensor, targets_tensor])
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features_tensor, targets_tensor])
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testNumpyInputFnWithYIsEmptyDict(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = {}
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'y cannot be empty'):
-        failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
-        failing_input_fn()
-
-  def testNumpyInputFnWithDuplicateKeysInXAndY(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x = {'a': a, 'b': b}
-    y = {'y1': np.arange(-32, -28), 'a': a, 'y2': np.arange(32, 28, -1), 'b': b}
-    with self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, '2 duplicate keys are found in both x and y'):
-        failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
-        failing_input_fn()
-
-  def testNumpyInputFnWithXIsArray(self):
-    x = np.arange(4) * 1.0
-    y = np.arange(-32, -28)
-
-    input_fn = numpy_io.numpy_input_fn(
-        x, y, batch_size=2, shuffle=False, num_epochs=1)
-    features, target = input_fn()
-
-    with monitored_session.MonitoredSession() as session:
-      res = session.run([features, target])
-      self.assertAllEqual(res[0], [0, 1])
-      self.assertAllEqual(res[1], [-32, -31])
-
-      session.run([features, target])
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-  def testNumpyInputFnWithXIsNDArray(self):
-    x = np.arange(16).reshape(4, 2, 2) * 1.0
-    y = np.arange(-48, -32).reshape(4, 2, 2)
-
-    input_fn = numpy_io.numpy_input_fn(
-        x, y, batch_size=2, shuffle=False, num_epochs=1)
-    features, target = input_fn()
-
-    with monitored_session.MonitoredSession() as session:
-      res = session.run([features, target])
-      self.assertAllEqual(res[0], [[[0, 1], [2, 3]], [[4, 5], [6, 7]]])
-      self.assertAllEqual(
-          res[1], [[[-48, -47], [-46, -45]], [[-44, -43], [-42, -41]]])
-
-      session.run([features, target])
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features, target])
-
-  def testNumpyInputFnWithXIsArrayYIsDict(self):
-    x = np.arange(4) * 1.0
-    y = {'y1': np.arange(-32, -28)}
-
-    input_fn = numpy_io.numpy_input_fn(
-        x, y, batch_size=2, shuffle=False, num_epochs=1)
-    features_tensor, targets_tensor = input_fn()
-
-    with monitored_session.MonitoredSession() as session:
-      features, targets = session.run([features_tensor, targets_tensor])
-      self.assertEqual(len(features), 2)
-      self.assertAllEqual(features, [0, 1])
-      self.assertEqual(len(targets), 1)
-      self.assertAllEqual(targets['y1'], [-32, -31])
-
-      session.run([features_tensor, targets_tensor])
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run([features_tensor, targets_tensor])
-
-  def testArrayAndDictGiveSameOutput(self):
-    a = np.arange(4) * 1.0
-    b = np.arange(32, 36)
-    x_arr = np.vstack((a, b))
-    x_dict = {'feature1': x_arr}
-    y = np.arange(-48, -40).reshape(2, 4)
-
-    input_fn_arr = numpy_io.numpy_input_fn(
-        x_arr, y, batch_size=2, shuffle=False, num_epochs=1)
-    features_arr, targets_arr = input_fn_arr()
-
-    input_fn_dict = numpy_io.numpy_input_fn(
-        x_dict, y, batch_size=2, shuffle=False, num_epochs=1)
-    features_dict, targets_dict = input_fn_dict()
-
-    with monitored_session.MonitoredSession() as session:
-      res_arr, res_dict = session.run([
-          (features_arr, targets_arr), (features_dict, targets_dict)])
-
-      self.assertAllEqual(res_arr[0], res_dict[0]['feature1'])
-      self.assertAllEqual(res_arr[1], res_dict[1])
-
-
-class FeatureColumnIntegrationTest(test.TestCase):
-
-  def _initialized_session(self, config=None):
-    sess = session_lib.Session(config=config)
-    sess.run(variables_lib.global_variables_initializer())
-    sess.run(lookup_ops.tables_initializer())
-    return sess
-
-  def _get_linear_model_bias(self, name='linear_model'):
-    with variable_scope.variable_scope(name, reuse=True):
-      return variable_scope.get_variable('bias_weights')
-
-  def _get_linear_model_column_var(self, column, name='linear_model'):
-    return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
-                              name + '/' + column.name)[0]
-
-  def _get_keras_linear_model_predictions(
-      self,
-      features,
-      feature_columns,
-      units=1,
-      sparse_combiner='sum',
-      weight_collections=None,
-      trainable=True,
-      cols_to_vars=None):
-    keras_linear_model = _LinearModel(
-        feature_columns,
-        units,
-        sparse_combiner,
-        weight_collections,
-        trainable,
-        name='linear_model')
-    retval = keras_linear_model(features)  # pylint: disable=not-callable
-    if cols_to_vars is not None:
-      cols_to_vars.update(keras_linear_model.cols_to_vars())
-    return retval
-
-  def test_linear_model_numpy_input_fn(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
-    body_style = fc.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'price': np.array([-1., 2., 13., 104.]),
-            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
-        },
-        batch_size=2,
-        shuffle=False)
-    features = input_fn()
-    net = fc.linear_model(features, [price_buckets, body_style])
-    # self.assertEqual(1 + 3 + 5, net.shape[1])
-    with self._initialized_session() as sess:
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
-
-      bias = self._get_linear_model_bias()
-      price_buckets_var = self._get_linear_model_column_var(price_buckets)
-      body_style_var = self._get_linear_model_column_var(body_style)
-
-      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
-      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
-      sess.run(bias.assign([5.]))
-
-      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def test_linear_model_impl_numpy_input_fn(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(
-        price, boundaries=[
-            0.,
-            10.,
-            100.,
-        ])
-    body_style = fc.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'price': np.array([-1., 2., 13., 104.]),
-            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
-        },
-        batch_size=2,
-        shuffle=False)
-    features = input_fn()
-    net = self._get_keras_linear_model_predictions(
-        features, [price_buckets, body_style])
-    # self.assertEqual(1 + 3 + 5, net.shape[1])
-    with self._initialized_session() as sess:
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
-
-      bias = self._get_linear_model_bias()
-      price_buckets_var = self._get_linear_model_column_var(price_buckets)
-      body_style_var = self._get_linear_model_column_var(body_style)
-
-      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
-      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
-      sess.run(bias.assign([5.]))
-
-      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def test_functional_input_layer_with_numpy_input_fn(self):
-    embedding_values = (
-        (1., 2., 3., 4., 5.),  # id 0
-        (6., 7., 8., 9., 10.),  # id 1
-        (11., 12., 13., 14., 15.)  # id 2
-    )
-    def _initializer(shape, dtype, partition_info):
-      del shape, dtype, partition_info
-      return embedding_values
-
-    # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
-    body_style = fc.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    # one_hot_body_style has 3 dims in input_layer.
-    one_hot_body_style = fc.indicator_column(body_style)
-    # embedded_body_style has 5 dims in input_layer.
-    embedded_body_style = fc.embedding_column(body_style, dimension=5,
-                                              initializer=_initializer)
-
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'price': np.array([11., 12., 13., 14.]),
-            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
-        },
-        batch_size=2,
-        shuffle=False)
-    features = input_fn()
-    net = fc.input_layer(features,
-                         [price, one_hot_body_style, embedded_body_style])
-    self.assertEqual(1 + 3 + 5, net.shape[1])
-    with self._initialized_session() as sess:
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
-
-      # Each row is formed by concatenating `embedded_body_style`,
-      # `one_hot_body_style`, and `price` in order.
-      self.assertAllEqual(
-          [[11., 12., 13., 14., 15., 0., 0., 1., 11.],
-           [1., 2., 3., 4., 5., 1., 0., 0., 12]],
-          sess.run(net))
-
-      coord.request_stop()
-      coord.join(threads)
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py
index 616bcb410f8119e170e991f8320c5b6448ee85c9..cff91cdc9ac0b09357f3f824c7ceabbaa01ec6b7 100644
--- a/tensorflow/python/estimator/inputs/pandas_io.py
+++ b/tensorflow/python/estimator/inputs/pandas_io.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,146 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""pandas_io python module.
 
-"""Methods to allow pandas.DataFrame."""
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import six
-import uuid
 
-import numpy as np
-from tensorflow.python.estimator.inputs.queues import feeding_functions
-from tensorflow.python.util.tf_export import estimator_export
+from tensorflow_estimator.python.estimator.inputs import pandas_io
 
-try:
-  # pylint: disable=g-import-not-at-top
-  # pylint: disable=unused-import
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+pandas_io.__all__ = [s for s in dir(pandas_io) if not s.startswith('__')]
 
-
-def _get_unique_target_key(features, target_column_name):
-  """Returns a key that does not exist in the input DataFrame `features`.
-
-  Args:
-    features: DataFrame
-    target_column_name: Name of the target column as a `str`
-
-  Returns:
-    A unique key that can be used to insert the target into
-      features.
-  """
-  if target_column_name in features:
-    target_column_name += '_' + str(uuid.uuid4())
-  return target_column_name
-
-
-@estimator_export('estimator.inputs.pandas_input_fn')
-def pandas_input_fn(x,
-                    y=None,
-                    batch_size=128,
-                    num_epochs=1,
-                    shuffle=None,
-                    queue_capacity=1000,
-                    num_threads=1,
-                    target_column='target'):
-  """Returns input function that would feed Pandas DataFrame into the model.
-
-  Note: `y`'s index must match `x`'s index.
-
-  Args:
-    x: pandas `DataFrame` object.
-    y: pandas `Series` object or `DataFrame`. `None` if absent.
-    batch_size: int, size of batches to return.
-    num_epochs: int, number of epochs to iterate over data. If not `None`,
-      read attempts that would exceed this value will raise `OutOfRangeError`.
-    shuffle: bool, whether to read the records in random order.
-    queue_capacity: int, size of the read queue. If `None`, it will be set
-      roughly to the size of `x`.
-    num_threads: Integer, number of threads used for reading and enqueueing. In
-      order to have predicted and repeatable order of reading and enqueueing,
-      such as in prediction and evaluation mode, `num_threads` should be 1.
-    target_column: str, name to give the target column `y`. This parameter
-      is not used when `y` is a `DataFrame`.
-
-  Returns:
-    Function, that has signature of ()->(dict of `features`, `target`)
-
-  Raises:
-    ValueError: if `x` already contains a column with the same name as `y`, or
-      if the indexes of `x` and `y` don't match.
-    ValueError: if 'shuffle' is not provided or a bool.
-  """
-  if not HAS_PANDAS:
-    raise TypeError(
-        'pandas_input_fn should not be called without pandas installed')
-
-  if not isinstance(shuffle, bool):
-    raise ValueError('shuffle must be provided and explicitly set as boolean '
-                     '(it is recommended to set it as True for training); '
-                     'got {}'.format(shuffle))
-
-  if not isinstance(target_column, six.string_types):
-    raise TypeError('target_column must be a string type')
-
-  x = x.copy()
-  if y is not None:
-    if target_column in x:
-      raise ValueError(
-          'Cannot use name %s for target column: DataFrame already has a '
-          'column with that name: %s' % (target_column, x.columns))
-    if not np.array_equal(x.index, y.index):
-      raise ValueError('Index for x and y are mismatched.\nIndex for x: %s\n'
-                       'Index for y: %s\n' % (x.index, y.index))
-    if isinstance(y, pd.DataFrame):
-      y_columns = [(column, _get_unique_target_key(x, column))
-                   for column in list(y)]
-      target_column = [v for _, v in y_columns]
-      x[target_column] = y
-    else:
-      x[target_column] = y
-
-  # TODO(mdan): These are memory copies. We probably don't need 4x slack space.
-  # The sizes below are consistent with what I've seen elsewhere.
-  if queue_capacity is None:
-    if shuffle:
-      queue_capacity = 4 * len(x)
-    else:
-      queue_capacity = len(x)
-  min_after_dequeue = max(queue_capacity / 4, 1)
-
-  def input_fn():
-    """Pandas input function."""
-    queue = feeding_functions._enqueue_data(  # pylint: disable=protected-access
-        x,
-        queue_capacity,
-        shuffle=shuffle,
-        min_after_dequeue=min_after_dequeue,
-        num_threads=num_threads,
-        enqueue_size=batch_size,
-        num_epochs=num_epochs)
-    if num_epochs is None:
-      features = queue.dequeue_many(batch_size)
-    else:
-      features = queue.dequeue_up_to(batch_size)
-    assert len(features) == len(x.columns) + 1, ('Features should have one '
-                                                 'extra element for the index.')
-    features = features[1:]
-    features = dict(zip(list(x.columns), features))
-    if y is not None:
-      if isinstance(target_column, list):
-        keys = [k for k, _ in y_columns]
-        values = [features.pop(column) for column in target_column]
-        target = {k: v for k, v in zip(keys, values)}
-      else:
-        target = features.pop(target_column)
-      return features, target
-    return features
-  return input_fn
+from tensorflow_estimator.python.estimator.inputs.pandas_io import *
diff --git a/tensorflow/python/estimator/inputs/pandas_io_test.py b/tensorflow/python/estimator/inputs/pandas_io_test.py
deleted file mode 100644
index 9e69fc72dc66515616be147aab42d14819ec0151..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/inputs/pandas_io_test.py
+++ /dev/null
@@ -1,320 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for pandas_io."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.estimator.inputs import pandas_io
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
-from tensorflow.python.training import coordinator
-from tensorflow.python.training import queue_runner_impl
-
-try:
-  # pylint: disable=g-import-not-at-top
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
-
-
-class PandasIoTest(test.TestCase):
-
-  def makeTestDataFrame(self):
-    index = np.arange(100, 104)
-    a = np.arange(4)
-    b = np.arange(32, 36)
-    x = pd.DataFrame({'a': a, 'b': b}, index=index)
-    y = pd.Series(np.arange(-32, -28), index=index)
-    return x, y
-
-  def makeTestDataFrameWithYAsDataFrame(self):
-    index = np.arange(100, 104)
-    a = np.arange(4)
-    b = np.arange(32, 36)
-    a_label = np.arange(10, 14)
-    b_label = np.arange(50, 54)
-    x = pd.DataFrame({'a': a, 'b': b}, index=index)
-    y = pd.DataFrame({'a_target': a_label, 'b_target': b_label}, index=index)
-    return x, y
-
-  def callInputFnOnce(self, input_fn, session):
-    results = input_fn()
-    coord = coordinator.Coordinator()
-    threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-    result_values = session.run(results)
-    coord.request_stop()
-    coord.join(threads)
-    return result_values
-
-  def testPandasInputFn_IndexMismatch(self):
-    if not HAS_PANDAS:
-      return
-    x, _ = self.makeTestDataFrame()
-    y_noindex = pd.Series(np.arange(-32, -28))
-    with self.assertRaises(ValueError):
-      pandas_io.pandas_input_fn(
-          x, y_noindex, batch_size=2, shuffle=False, num_epochs=1)
-
-  def testPandasInputFn_RaisesWhenTargetColumnIsAList(self):
-    if not HAS_PANDAS:
-      return
-
-    x, y = self.makeTestDataFrame()
-
-    with self.assertRaisesRegexp(TypeError,
-                                 'target_column must be a string type'):
-      pandas_io.pandas_input_fn(x, y, batch_size=2,
-                                shuffle=False,
-                                num_epochs=1,
-                                target_column=['one', 'two'])
-
-  def testPandasInputFn_NonBoolShuffle(self):
-    if not HAS_PANDAS:
-      return
-    x, _ = self.makeTestDataFrame()
-    y_noindex = pd.Series(np.arange(-32, -28))
-    with self.assertRaisesRegexp(ValueError,
-                                 'shuffle must be provided and explicitly '
-                                 'set as boolean'):
-      # Default shuffle is None
-      pandas_io.pandas_input_fn(x, y_noindex)
-
-  def testPandasInputFn_ProducesExpectedOutputs(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      x, y = self.makeTestDataFrame()
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-
-      features, target = self.callInputFnOnce(input_fn, session)
-
-      self.assertAllEqual(features['a'], [0, 1])
-      self.assertAllEqual(features['b'], [32, 33])
-      self.assertAllEqual(target, [-32, -31])
-
-  def testPandasInputFnWhenYIsDataFrame_ProducesExpectedOutput(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      x, y = self.makeTestDataFrameWithYAsDataFrame()
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-
-      features, targets = self.callInputFnOnce(input_fn, session)
-
-      self.assertAllEqual(features['a'], [0, 1])
-      self.assertAllEqual(features['b'], [32, 33])
-      self.assertAllEqual(targets['a_target'], [10, 11])
-      self.assertAllEqual(targets['b_target'], [50, 51])
-
-  def testPandasInputFnYIsDataFrame_HandlesOverlappingColumns(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      x, y = self.makeTestDataFrameWithYAsDataFrame()
-      y = y.rename(columns={'a_target': 'a', 'b_target': 'b'})
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-
-      features, targets = self.callInputFnOnce(input_fn, session)
-
-      self.assertAllEqual(features['a'], [0, 1])
-      self.assertAllEqual(features['b'], [32, 33])
-      self.assertAllEqual(targets['a'], [10, 11])
-      self.assertAllEqual(targets['b'], [50, 51])
-
-  def testPandasInputFnYIsDataFrame_HandlesOverlappingColumnsInTargets(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      x, y = self.makeTestDataFrameWithYAsDataFrame()
-      y = y.rename(columns={'a_target': 'a', 'b_target': 'a_n'})
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-
-      features, targets = self.callInputFnOnce(input_fn, session)
-
-      self.assertAllEqual(features['a'], [0, 1])
-      self.assertAllEqual(features['b'], [32, 33])
-      self.assertAllEqual(targets['a'], [10, 11])
-      self.assertAllEqual(targets['a_n'], [50, 51])
-
-  def testPandasInputFn_ProducesOutputsForLargeBatchAndMultipleEpochs(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      index = np.arange(100, 102)
-      a = np.arange(2)
-      b = np.arange(32, 34)
-      x = pd.DataFrame({'a': a, 'b': b}, index=index)
-      y = pd.Series(np.arange(-32, -30), index=index)
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=128, shuffle=False, num_epochs=2)
-
-      results = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      features, target = session.run(results)
-      self.assertAllEqual(features['a'], [0, 1, 0, 1])
-      self.assertAllEqual(features['b'], [32, 33, 32, 33])
-      self.assertAllEqual(target, [-32, -31, -32, -31])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run(results)
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testPandasInputFn_ProducesOutputsWhenDataSizeNotDividedByBatchSize(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      index = np.arange(100, 105)
-      a = np.arange(5)
-      b = np.arange(32, 37)
-      x = pd.DataFrame({'a': a, 'b': b}, index=index)
-      y = pd.Series(np.arange(-32, -27), index=index)
-
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-
-      results = input_fn()
-
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-
-      features, target = session.run(results)
-      self.assertAllEqual(features['a'], [0, 1])
-      self.assertAllEqual(features['b'], [32, 33])
-      self.assertAllEqual(target, [-32, -31])
-
-      features, target = session.run(results)
-      self.assertAllEqual(features['a'], [2, 3])
-      self.assertAllEqual(features['b'], [34, 35])
-      self.assertAllEqual(target, [-30, -29])
-
-      features, target = session.run(results)
-      self.assertAllEqual(features['a'], [4])
-      self.assertAllEqual(features['b'], [36])
-      self.assertAllEqual(target, [-28])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        session.run(results)
-
-      coord.request_stop()
-      coord.join(threads)
-
-  def testPandasInputFn_OnlyX(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      x, _ = self.makeTestDataFrame()
-      input_fn = pandas_io.pandas_input_fn(
-          x, y=None, batch_size=2, shuffle=False, num_epochs=1)
-
-      features = self.callInputFnOnce(input_fn, session)
-
-      self.assertAllEqual(features['a'], [0, 1])
-      self.assertAllEqual(features['b'], [32, 33])
-
-  def testPandasInputFn_ExcludesIndex(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      x, y = self.makeTestDataFrame()
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)
-
-      features, _ = self.callInputFnOnce(input_fn, session)
-
-      self.assertFalse('index' in features)
-
-  def assertInputsCallableNTimes(self, input_fn, session, n):
-    inputs = input_fn()
-    coord = coordinator.Coordinator()
-    threads = queue_runner_impl.start_queue_runners(session, coord=coord)
-    for _ in range(n):
-      session.run(inputs)
-    with self.assertRaises(errors.OutOfRangeError):
-      session.run(inputs)
-    coord.request_stop()
-    coord.join(threads)
-
-  def testPandasInputFn_RespectsEpoch_NoShuffle(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      x, y = self.makeTestDataFrame()
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=4, shuffle=False, num_epochs=1)
-
-      self.assertInputsCallableNTimes(input_fn, session, 1)
-
-  def testPandasInputFn_RespectsEpoch_WithShuffle(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      x, y = self.makeTestDataFrame()
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=4, shuffle=True, num_epochs=1)
-
-      self.assertInputsCallableNTimes(input_fn, session, 1)
-
-  def testPandasInputFn_RespectsEpoch_WithShuffleAutosize(self):
-    if not HAS_PANDAS:
-      return
-    with self.cached_session() as session:
-      x, y = self.makeTestDataFrame()
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=2, shuffle=True, queue_capacity=None, num_epochs=2)
-
-      self.assertInputsCallableNTimes(input_fn, session, 4)
-
-  def testPandasInputFn_RespectsEpochUnevenBatches(self):
-    if not HAS_PANDAS:
-      return
-    x, y = self.makeTestDataFrame()
-    with self.cached_session() as session:
-      input_fn = pandas_io.pandas_input_fn(
-          x, y, batch_size=3, shuffle=False, num_epochs=1)
-
-      # Before the last batch, only one element of the epoch should remain.
-      self.assertInputsCallableNTimes(input_fn, session, 2)
-
-  def testPandasInputFn_Idempotent(self):
-    if not HAS_PANDAS:
-      return
-    x, y = self.makeTestDataFrame()
-    for _ in range(2):
-      pandas_io.pandas_input_fn(
-          x, y, batch_size=2, shuffle=False, num_epochs=1)()
-    for _ in range(2):
-      pandas_io.pandas_input_fn(
-          x, y, batch_size=2, shuffle=True, num_epochs=1)()
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/inputs/queues/__init__.py b/tensorflow/python/estimator/inputs/queues/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ecbf199ac0fe67eefaeab7a55db234dde19448eb 100644
--- a/tensorflow/python/estimator/inputs/queues/__init__.py
+++ b/tensorflow/python/estimator/inputs/queues/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""queues python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_estimator.python.estimator.inputs import queues
+
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+queues.__all__ = [s for s in dir(queues) if not s.startswith('__')]
+
+from tensorflow_estimator.python.estimator.inputs.queues import *
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index 51a61adb216c9b019aa01bb7e55c71a8464c01b3..7c9f084e996cac05560141d32e84b2d3966f2369 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,502 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Helper functions for enqueuing data from arrays and pandas `DataFrame`s."""
+"""feeding_functions python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import random
-import types as tp
-import numpy as np
-import six
-
-from tensorflow.python.estimator.inputs.queues import feeding_queue_runner as fqr
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.summary import summary
-from tensorflow.python.training import queue_runner
-
-try:
-  # pylint: disable=g-import-not-at-top
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
-
-
-def _fill_array(arr, seq, fillvalue=0):
-  """Recursively fills padded arr with elements from seq.
-
-  If length of seq is less than arr padded length, fillvalue used.
-  Args:
-    arr: Padded tensor of shape [batch_size, ..., max_padded_dim_len].
-    seq: Non-padded list of data samples of shape
-      [batch_size, ..., padded_dim(None)]
-    fillvalue: Default fillvalue to use.
-  """
-  if arr.ndim == 1:
-    try:
-      len_ = len(seq)
-    except TypeError:
-      len_ = 0
-    arr[:len_] = seq
-    arr[len_:] = fillvalue
-  else:
-    for subarr, subseq in six.moves.zip_longest(arr, seq, fillvalue=()):
-      _fill_array(subarr, subseq, fillvalue)
-
-
-def _pad_if_needed(batch_key_item, fillvalue=0):
-  """ Returns padded batch.
-
-  Args:
-    batch_key_item: List of data samples of any type with shape
-      [batch_size, ..., padded_dim(None)].
-    fillvalue: Default fillvalue to use.
-
-  Returns:
-    Padded with zeros tensor of same type and shape
-      [batch_size, ..., max_padded_dim_len].
-
-  Raises:
-    ValueError if data samples have different shapes (except last padded dim).
-  """
-  shapes = [
-      seq.shape[:-1] if len(seq.shape) > 0 else -1 for seq in batch_key_item
-  ]
-  if not all(shapes[0] == x for x in shapes):
-    raise ValueError("Array shapes must match.")
-
-  last_length = [
-      seq.shape[-1] if len(seq.shape) > 0 else 0 for seq in batch_key_item
-  ]
-  if all([x == last_length[0] for x in last_length]):
-    return batch_key_item
-
-  batch_size = len(batch_key_item)
-  max_sequence_length = max(last_length)
-  result_batch = np.zeros(
-      shape=[batch_size] + list(shapes[0]) + [max_sequence_length],
-      dtype=batch_key_item[0].dtype)
-  _fill_array(result_batch, batch_key_item, fillvalue)
-  return result_batch
-
-
-def _get_integer_indices_for_next_batch(batch_indices_start, batch_size,
-                                        epoch_end, array_length, current_epoch,
-                                        total_epochs):
-  """Returns the integer indices for next batch.
-
-  If total epochs is not None and current epoch is the final epoch, the end
-  index of the next batch should not exceed the `epoch_end` (i.e., the final
-  batch might not have size `batch_size` to avoid overshooting the last epoch).
-
-  Args:
-    batch_indices_start: Integer, the index to start next batch.
-    batch_size: Integer, size of batches to return.
-    epoch_end: Integer, the end index of the epoch. The epoch could start from a
-      random position, so `epoch_end` provides the end index for that.
-    array_length: Integer, the length of the array.
-    current_epoch: Integer, the epoch number has been emitted.
-    total_epochs: Integer or `None`, the total number of epochs to emit. If
-      `None` will run forever.
-
-  Returns:
-    A tuple of a list with integer indices for next batch and `current_epoch`
-    value after the next batch.
-
-  Raises:
-    OutOfRangeError if `current_epoch` is not less than `total_epochs`.
-
-  """
-  if total_epochs is not None and current_epoch >= total_epochs:
-    raise errors.OutOfRangeError(None, None,
-                                 "Already emitted %s epochs." % current_epoch)
-
-  batch_indices_end = batch_indices_start + batch_size
-  batch_indices = [
-      j % array_length for j in range(batch_indices_start, batch_indices_end)
-  ]
-  epoch_end_indices = [i for i, x in enumerate(batch_indices) if x == epoch_end]
-  current_epoch += len(epoch_end_indices)
-
-  if total_epochs is None or current_epoch < total_epochs:
-    return (batch_indices, current_epoch)
-
-  # Now we might have emitted more data for expected epochs. Need to trim.
-  final_epoch_end_inclusive = epoch_end_indices[
-      -(current_epoch - total_epochs + 1)]
-  batch_indices = batch_indices[:final_epoch_end_inclusive + 1]
-
-  return (batch_indices, total_epochs)
-
-
-class _ArrayFeedFn(object):
-  """Creates feed dictionaries from numpy arrays."""
-
-  def __init__(self,
-               placeholders,
-               array,
-               batch_size,
-               random_start=False,
-               seed=None,
-               num_epochs=None):
-    if len(placeholders) != 2:
-      raise ValueError("_array_feed_fn expects 2 placeholders; got {}.".format(
-          len(placeholders)))
-    self._placeholders = placeholders
-    self._array = array
-    self._max = len(array)
-    self._batch_size = batch_size
-    self._num_epochs = num_epochs
-    self._epoch = 0
-    random.seed(seed)
-    self._trav = random.randrange(self._max) if random_start else 0
-    self._epoch_end = (self._trav - 1) % self._max
-
-  def __call__(self):
-    integer_indexes, self._epoch = _get_integer_indices_for_next_batch(
-        batch_indices_start=self._trav,
-        batch_size=self._batch_size,
-        epoch_end=self._epoch_end,
-        array_length=self._max,
-        current_epoch=self._epoch,
-        total_epochs=self._num_epochs)
-
-    self._trav = (integer_indexes[-1] + 1) % self._max
-    return {
-        self._placeholders[0]: integer_indexes,
-        self._placeholders[1]: self._array[integer_indexes]
-    }
-
-
-class _OrderedDictNumpyFeedFn(object):
-  """Creates feed dictionaries from `OrderedDict`s of numpy arrays."""
-
-  def __init__(self,
-               placeholders,
-               ordered_dict_of_arrays,
-               batch_size,
-               random_start=False,
-               seed=None,
-               num_epochs=None):
-    if len(placeholders) != len(ordered_dict_of_arrays) + 1:
-      raise ValueError("Expected {} placeholders; got {}.".format(
-          len(ordered_dict_of_arrays), len(placeholders)))
-    self._index_placeholder = placeholders[0]
-    self._col_placeholders = placeholders[1:]
-    self._ordered_dict_of_arrays = ordered_dict_of_arrays
-    self._max = len(next(iter(ordered_dict_of_arrays.values())))
-    for _, v in ordered_dict_of_arrays.items():
-      if len(v) != self._max:
-        raise ValueError("Array lengths must match.")
-    self._batch_size = batch_size
-    self._num_epochs = num_epochs
-    self._epoch = 0
-    random.seed(seed)
-    self._trav = random.randrange(self._max) if random_start else 0
-    self._epoch_end = (self._trav - 1) % self._max
-
-  def __call__(self):
-    integer_indexes, self._epoch = _get_integer_indices_for_next_batch(
-        batch_indices_start=self._trav,
-        batch_size=self._batch_size,
-        epoch_end=self._epoch_end,
-        array_length=self._max,
-        current_epoch=self._epoch,
-        total_epochs=self._num_epochs)
-
-    self._trav = (integer_indexes[-1] + 1) % self._max
-    feed_dict = {self._index_placeholder: integer_indexes}
-    cols = [
-        column[integer_indexes]
-        for column in self._ordered_dict_of_arrays.values()
-    ]
-    feed_dict.update(dict(zip(self._col_placeholders, cols)))
-    return feed_dict
-
-
-class _PandasFeedFn(object):
-  """Creates feed dictionaries from pandas `DataFrames`."""
-
-  def __init__(self,
-               placeholders,
-               dataframe,
-               batch_size,
-               random_start=False,
-               seed=None,
-               num_epochs=None):
-    if len(placeholders) != len(dataframe.columns) + 1:
-      raise ValueError("Expected {} placeholders; got {}.".format(
-          len(dataframe.columns) + 1, len(placeholders)))
-    self._index_placeholder = placeholders[0]
-    self._col_placeholders = placeholders[1:]
-    self._dataframe = dataframe
-    self._max = len(dataframe)
-    self._batch_size = batch_size
-    self._num_epochs = num_epochs
-    self._epoch = 0
-    random.seed(seed)
-    self._trav = random.randrange(self._max) if random_start else 0
-    self._epoch_end = (self._trav - 1) % self._max
-
-  def __call__(self):
-    integer_indexes, self._epoch = _get_integer_indices_for_next_batch(
-        batch_indices_start=self._trav,
-        batch_size=self._batch_size,
-        epoch_end=self._epoch_end,
-        array_length=self._max,
-        current_epoch=self._epoch,
-        total_epochs=self._num_epochs)
-
-    self._trav = (integer_indexes[-1] + 1) % self._max
-    result = self._dataframe.iloc[integer_indexes]
-    cols = [result[col].values for col in result.columns]
-    feed_dict = dict(zip(self._col_placeholders, cols))
-    feed_dict[self._index_placeholder] = result.index.values
-    return feed_dict
-
-
-class _GeneratorFeedFn(object):
-  """Creates feed dictionaries from `Generator` of `dicts` of numpy arrays."""
-
-  def __init__(self,
-               placeholders,
-               generator,
-               batch_size,
-               random_start=False,
-               seed=None,
-               num_epochs=None,
-               pad_value=None):
-    first_sample = next(generator())
-    if len(placeholders) != len(first_sample):
-      raise ValueError("Expected {} placeholders; got {}.".format(
-          len(first_sample), len(placeholders)))
-    self._keys = sorted(list(first_sample.keys()))
-    self._col_placeholders = placeholders
-    self._generator_function = generator
-    self._iterator = generator()
-    self._batch_size = batch_size
-    self._num_epochs = num_epochs
-    self._epoch = 0
-    self._pad_value = pad_value
-    random.seed(seed)
-
-  def __call__(self):
-    if self._num_epochs and self._epoch >= self._num_epochs:
-      raise errors.OutOfRangeError(None, None,
-                                   "Already emitted %s epochs." % self._epoch)
-    list_dict = {}
-    list_dict_size = 0
-    while list_dict_size < self._batch_size:
-      try:
-        data_row = next(self._iterator)
-      except StopIteration:
-        self._epoch += 1
-        self._iterator = self._generator_function()
-        data_row = next(self._iterator)
-      for index, key in enumerate(self._keys):
-        if key not in data_row.keys():
-          raise KeyError("key mismatch between dicts emitted by GenFun "
-                         "Expected {} keys; got {}".format(
-                             self._keys, data_row.keys()))
-        list_dict.setdefault(self._col_placeholders[index], list()).append(
-            data_row[key])
-        list_dict_size += 1
-
-    if self._pad_value is not None:
-      feed_dict = {
-          key: np.asarray(_pad_if_needed(item, self._pad_value))
-          for key, item in list(list_dict.items())
-      }
-    else:
-      feed_dict = {
-          key: np.asarray(item)
-          for key, item in list(list_dict.items())
-      }
-    return feed_dict
-
-
-def _enqueue_data(data,
-                  capacity,
-                  shuffle=False,
-                  min_after_dequeue=None,
-                  num_threads=1,
-                  seed=None,
-                  name="enqueue_input",
-                  enqueue_size=1,
-                  num_epochs=None,
-                  pad_value=None):
-  """Creates a queue filled from a numpy array or pandas `DataFrame`.
-
-    Returns a queue filled with the rows of the given (`OrderedDict` of) array
-    or `DataFrame`. In the case of a pandas `DataFrame`, the first enqueued
-    `Tensor` corresponds to the index of the `DataFrame`. For (`OrderedDict` of)
-    numpy arrays, the first enqueued `Tensor` contains the row number.
-
-  Args:
-    data: a numpy `ndarray`, `OrderedDict` of numpy arrays, or a generator
-       yielding `dict`s of numpy arrays or pandas `DataFrame` that will be read
-       into the queue.
-    capacity: the capacity of the queue.
-    shuffle: whether or not to shuffle the rows of the array.
-    min_after_dequeue: minimum number of elements that can remain in the queue
-    after a dequeue operation. Only used when `shuffle` is true. If not set,
-    defaults to `capacity` / 4.
-    num_threads: number of threads used for reading and enqueueing.
-    seed: used to seed shuffling and reader starting points.
-    name: a scope name identifying the data.
-    enqueue_size: the number of rows to enqueue per step.
-    num_epochs: limit enqueuing to a specified number of epochs, if provided.
-    pad_value: default value for dynamic padding of data samples, if provided.
-
-  Returns:
-    A queue filled with the rows of the given (`OrderedDict` of) array or
-      `DataFrame`.
-
-  Raises:
-    TypeError: `data` is not a Pandas `DataFrame`, an `OrderedDict` of numpy
-      arrays, a numpy `ndarray`, or a generator producing these.
-    NotImplementedError: padding and shuffling data at the same time.
-    NotImplementedError: padding usage with non generator data type.
-  """
-  with ops.name_scope(name):
-    if isinstance(data, np.ndarray):
-      types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
-      queue_shapes = [(), data.shape[1:]]
-      get_feed_fn = _ArrayFeedFn
-    elif isinstance(data, collections.OrderedDict):
-      types = [dtypes.int64
-              ] + [dtypes.as_dtype(col.dtype) for col in data.values()]
-      queue_shapes = [()] + [col.shape[1:] for col in data.values()]
-      get_feed_fn = _OrderedDictNumpyFeedFn
-    elif isinstance(data, tp.FunctionType):
-      x_first_el = six.next(data())
-      x_first_keys = sorted(x_first_el.keys())
-      x_first_values = [x_first_el[key] for key in x_first_keys]
-      types = [dtypes.as_dtype(col.dtype) for col in x_first_values]
-      queue_shapes = [col.shape for col in x_first_values]
-      get_feed_fn = _GeneratorFeedFn
-    elif HAS_PANDAS and isinstance(data, pd.DataFrame):
-      types = [
-          dtypes.as_dtype(dt) for dt in [data.index.dtype] + list(data.dtypes)
-      ]
-      queue_shapes = [() for _ in types]
-      get_feed_fn = _PandasFeedFn
-    else:
-      raise TypeError(
-          "data must be either a numpy array or pandas DataFrame if pandas is "
-          "installed; got {}".format(type(data).__name__))
-
-    pad_data = pad_value is not None
-    if pad_data and get_feed_fn is not _GeneratorFeedFn:
-      raise NotImplementedError(
-          "padding is only available with generator usage")
-    if shuffle and pad_data:
-      raise NotImplementedError(
-          "padding and shuffling data at the same time is not implemented")
-
-    # TODO(jamieas): TensorBoard warnings for all warnings below once available.
-
-    if num_threads > 1 and num_epochs is not None:
-      logging.warning(
-          "enqueue_data was called with num_epochs and num_threads > 1. "
-          "num_epochs is applied per thread, so this will produce more "
-          "epochs than you probably intend. "
-          "If you want to limit epochs, use one thread.")
-
-    if shuffle and num_threads > 1 and num_epochs is not None:
-      logging.warning(
-          "enqueue_data was called with shuffle=True, num_threads > 1, and "
-          "num_epochs. This will create multiple threads, all reading the "
-          "array/dataframe in order adding to the same shuffling queue; the "
-          "results will likely not be sufficiently shuffled.")
-
-    if not shuffle and num_threads > 1:
-      logging.warning(
-          "enqueue_data was called with shuffle=False and num_threads > 1. "
-          "This will create multiple threads, all reading the "
-          "array/dataframe in order. If you want examples read in order, use"
-          " one thread; if you want multiple threads, enable shuffling.")
-
-    if shuffle:
-      min_after_dequeue = int(capacity / 4 if min_after_dequeue is None else
-                              min_after_dequeue)
-      queue = data_flow_ops.RandomShuffleQueue(
-          capacity,
-          min_after_dequeue,
-          dtypes=types,
-          shapes=queue_shapes,
-          seed=seed)
-    elif pad_data:
-      min_after_dequeue = 0  # just for the summary text
-      queue_shapes = list(
-          map(lambda x: tuple(list(x[:-1]) + [None]) if len(x) > 0 else x,
-              queue_shapes))
-      queue = data_flow_ops.PaddingFIFOQueue(
-          capacity, dtypes=types, shapes=queue_shapes)
-    else:
-      min_after_dequeue = 0  # just for the summary text
-      queue = data_flow_ops.FIFOQueue(
-          capacity, dtypes=types, shapes=queue_shapes)
-
-    enqueue_ops = []
-    feed_fns = []
-
-    for i in range(num_threads):
-      # Note the placeholders have no shapes, so they will accept any
-      # enqueue_size.  enqueue_many below will break them up.
-      placeholders = [array_ops.placeholder(t) for t in types]
-
-      enqueue_ops.append(queue.enqueue_many(placeholders))
-      seed_i = None if seed is None else (i + 1) * seed
-
-      if not pad_data:
-        feed_fns.append(
-            get_feed_fn(
-                placeholders,
-                data,
-                enqueue_size,
-                random_start=shuffle,
-                seed=seed_i,
-                num_epochs=num_epochs))
-      else:
-        feed_fns.append(
-            get_feed_fn(
-                placeholders,
-                data,
-                enqueue_size,
-                random_start=shuffle,
-                seed=seed_i,
-                num_epochs=num_epochs,
-                pad_value=pad_value))
+from tensorflow_estimator.python.estimator.inputs.queues import feeding_functions
 
-    runner = fqr._FeedingQueueRunner(  # pylint: disable=protected-access
-        queue=queue,
-        enqueue_ops=enqueue_ops,
-        feed_fns=feed_fns)
-    queue_runner.add_queue_runner(runner)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+feeding_functions.__all__ = [
+    s for s in dir(feeding_functions) if not s.startswith('__')
+]
 
-    full = (
-        math_ops.cast(
-            math_ops.maximum(0,
-                             queue.size() - min_after_dequeue), dtypes.float32)
-        * (1. / (capacity - min_after_dequeue)))
-    # Note that name contains a '/' at the end so we intentionally do not place
-    # a '/' after %s below.
-    summary_name = ("queue/%sfraction_over_%d_of_%d_full" %
-                    (queue.name, min_after_dequeue,
-                     capacity - min_after_dequeue))
-    summary.scalar(summary_name, full)
-    return queue
+from tensorflow_estimator.python.estimator.inputs.queues.feeding_functions import *
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions_test.py b/tensorflow/python/estimator/inputs/queues/feeding_functions_test.py
deleted file mode 100644
index 30abd82130822c7fcdfa563ff8b289691d2af98f..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions_test.py
+++ /dev/null
@@ -1,391 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests feeding functions using arrays and `DataFrames`."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-import numpy as np
-
-from tensorflow.python.estimator.inputs.queues import feeding_functions as ff
-from tensorflow.python.platform import test
-
-try:
-  # pylint: disable=g-import-not-at-top
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
-
-
-def vals_to_list(a):
-  return {
-      key: val.tolist() if isinstance(val, np.ndarray) else val
-      for key, val in a.items()
-  }
-
-
-class _FeedingFunctionsTestCase(test.TestCase):
-  """Tests for feeding functions."""
-
-  def testArrayFeedFnBatchOne(self):
-    array = np.arange(32).reshape([16, 2])
-    placeholders = ["index_placeholder", "value_placeholder"]
-    aff = ff._ArrayFeedFn(placeholders, array, 1)
-
-    # cycle around a couple times
-    for x in range(0, 100):
-      i = x % 16
-      expected = {
-          "index_placeholder": [i],
-          "value_placeholder": [[2 * i, 2 * i + 1]]
-      }
-      actual = aff()
-      self.assertEqual(expected, vals_to_list(actual))
-
-  def testArrayFeedFnBatchFive(self):
-    array = np.arange(32).reshape([16, 2])
-    placeholders = ["index_placeholder", "value_placeholder"]
-    aff = ff._ArrayFeedFn(placeholders, array, 5)
-
-    # cycle around a couple times
-    for _ in range(0, 101, 2):
-      aff()
-
-    expected = {
-        "index_placeholder": [15, 0, 1, 2, 3],
-        "value_placeholder": [[30, 31], [0, 1], [2, 3], [4, 5], [6, 7]]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testArrayFeedFnBatchTwoWithOneEpoch(self):
-    array = np.arange(5) + 10
-    placeholders = ["index_placeholder", "value_placeholder"]
-    aff = ff._ArrayFeedFn(placeholders, array, batch_size=2, num_epochs=1)
-
-    expected = {
-        "index_placeholder": [0, 1],
-        "value_placeholder": [10, 11]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-    expected = {
-        "index_placeholder": [2, 3],
-        "value_placeholder": [12, 13]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-    expected = {
-        "index_placeholder": [4],
-        "value_placeholder": [14]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testArrayFeedFnBatchOneHundred(self):
-    array = np.arange(32).reshape([16, 2])
-    placeholders = ["index_placeholder", "value_placeholder"]
-    aff = ff._ArrayFeedFn(placeholders, array, 100)
-
-    expected = {
-        "index_placeholder":
-            list(range(0, 16)) * 6 + list(range(0, 4)),
-        "value_placeholder":
-            np.arange(32).reshape([16, 2]).tolist() * 6 +
-            [[0, 1], [2, 3], [4, 5], [6, 7]]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testArrayFeedFnBatchOneHundredWithSmallerArrayAndMultipleEpochs(self):
-    array = np.arange(2) + 10
-    placeholders = ["index_placeholder", "value_placeholder"]
-    aff = ff._ArrayFeedFn(placeholders, array, batch_size=100, num_epochs=2)
-
-    expected = {
-        "index_placeholder": [0, 1, 0, 1],
-        "value_placeholder": [10, 11, 10, 11],
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testPandasFeedFnBatchOne(self):
-    if not HAS_PANDAS:
-      return
-    array1 = np.arange(32, 64)
-    array2 = np.arange(64, 96)
-    df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(96, 128))
-    placeholders = ["index_placeholder", "a_placeholder", "b_placeholder"]
-    aff = ff._PandasFeedFn(placeholders, df, 1)
-
-    # cycle around a couple times
-    for x in range(0, 100):
-      i = x % 32
-      expected = {
-          "index_placeholder": [i + 96],
-          "a_placeholder": [32 + i],
-          "b_placeholder": [64 + i]
-      }
-      actual = aff()
-      self.assertEqual(expected, vals_to_list(actual))
-
-  def testPandasFeedFnBatchFive(self):
-    if not HAS_PANDAS:
-      return
-    array1 = np.arange(32, 64)
-    array2 = np.arange(64, 96)
-    df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(96, 128))
-    placeholders = ["index_placeholder", "a_placeholder", "b_placeholder"]
-    aff = ff._PandasFeedFn(placeholders, df, 5)
-
-    # cycle around a couple times
-    for _ in range(0, 101, 2):
-      aff()
-
-    expected = {
-        "index_placeholder": [127, 96, 97, 98, 99],
-        "a_placeholder": [63, 32, 33, 34, 35],
-        "b_placeholder": [95, 64, 65, 66, 67]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testPandasFeedFnBatchTwoWithOneEpoch(self):
-    if not HAS_PANDAS:
-      return
-    array1 = np.arange(32, 37)
-    array2 = np.arange(64, 69)
-    df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(96, 101))
-    placeholders = ["index_placeholder", "a_placeholder", "b_placeholder"]
-    aff = ff._PandasFeedFn(placeholders, df, batch_size=2, num_epochs=1)
-
-    expected = {
-        "index_placeholder": [96, 97],
-        "a_placeholder": [32, 33],
-        "b_placeholder": [64, 65]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-    expected = {
-        "index_placeholder": [98, 99],
-        "a_placeholder": [34, 35],
-        "b_placeholder": [66, 67]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-    expected = {
-        "index_placeholder": [100],
-        "a_placeholder": [36],
-        "b_placeholder": [68]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testPandasFeedFnBatchOneHundred(self):
-    if not HAS_PANDAS:
-      return
-    array1 = np.arange(32, 64)
-    array2 = np.arange(64, 96)
-    df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(96, 128))
-    placeholders = ["index_placeholder", "a_placeholder", "b_placeholder"]
-    aff = ff._PandasFeedFn(placeholders, df, 100)
-
-    expected = {
-        "index_placeholder": list(range(96, 128)) * 3 + list(range(96, 100)),
-        "a_placeholder": list(range(32, 64)) * 3 + list(range(32, 36)),
-        "b_placeholder": list(range(64, 96)) * 3 + list(range(64, 68))
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testPandasFeedFnBatchOneHundredWithSmallDataArrayAndMultipleEpochs(self):
-    if not HAS_PANDAS:
-      return
-    array1 = np.arange(32, 34)
-    array2 = np.arange(64, 66)
-    df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(96, 98))
-    placeholders = ["index_placeholder", "a_placeholder", "b_placeholder"]
-    aff = ff._PandasFeedFn(placeholders, df, batch_size=100, num_epochs=2)
-
-    expected = {
-        "index_placeholder": [96, 97, 96, 97],
-        "a_placeholder": [32, 33, 32, 33],
-        "b_placeholder": [64, 65, 64, 65]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testOrderedDictNumpyFeedFnBatchTwoWithOneEpoch(self):
-    a = np.arange(32, 37)
-    b = np.arange(64, 69)
-    x = {"a": a, "b": b}
-    ordered_dict_x = collections.OrderedDict(
-        sorted(x.items(), key=lambda t: t[0]))
-    placeholders = ["index_placeholder", "a_placeholder", "b_placeholder"]
-    aff = ff._OrderedDictNumpyFeedFn(
-        placeholders, ordered_dict_x, batch_size=2, num_epochs=1)
-
-    expected = {
-        "index_placeholder": [0, 1],
-        "a_placeholder": [32, 33],
-        "b_placeholder": [64, 65]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-    expected = {
-        "index_placeholder": [2, 3],
-        "a_placeholder": [34, 35],
-        "b_placeholder": [66, 67]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-    expected = {
-        "index_placeholder": [4],
-        "a_placeholder": [36],
-        "b_placeholder": [68]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testOrderedDictNumpyFeedFnLargeBatchWithSmallArrayAndMultipleEpochs(self):
-    a = np.arange(32, 34)
-    b = np.arange(64, 66)
-    x = {"a": a, "b": b}
-    ordered_dict_x = collections.OrderedDict(
-        sorted(x.items(), key=lambda t: t[0]))
-    placeholders = ["index_placeholder", "a_placeholder", "b_placeholder"]
-    aff = ff._OrderedDictNumpyFeedFn(
-        placeholders, ordered_dict_x, batch_size=100, num_epochs=2)
-
-    expected = {
-        "index_placeholder": [0, 1, 0, 1],
-        "a_placeholder": [32, 33, 32, 33],
-        "b_placeholder": [64, 65, 64, 65]
-    }
-    actual = aff()
-    self.assertEqual(expected, vals_to_list(actual))
-
-  def testFillArraySmall(self):
-    a = (np.ones(shape=[32, 32], dtype=np.int32).tolist() +
-         np.ones(shape=[32, 36], dtype=np.int32).tolist())
-    actual = np.ones(shape=[64, 36], dtype=np.int32)
-    ff._fill_array(actual, a)
-    expected = np.ones(shape=[64, 36], dtype=np.int32)
-    expected[:32, 32:] = 0
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-  def testFillArrayLarge(self):
-    a = (np.ones(shape=[8, 8, 8, 8, 32], dtype=np.int32).tolist() +
-         np.ones(shape=[8, 8, 8, 8, 36], dtype=np.int32).tolist())
-    actual = np.ones(shape=[16, 8, 8, 8, 36], dtype=np.int32)
-    ff._fill_array(actual, a)
-    expected = np.ones(shape=[16, 8, 8, 8, 36], dtype=np.int32)
-    expected[:8, ..., 32:] = 0
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-  def testFillArraySmallWithSpecifiedValue(self):
-    fill_value = 8
-    a = (np.ones(shape=[32, 32], dtype=np.int32).tolist() +
-         np.ones(shape=[32, 36], dtype=np.int32).tolist())
-    actual = np.ones(shape=[64, 36], dtype=np.int32)
-    ff._fill_array(actual, a, fill_value)
-    expected = np.ones(shape=[64, 36], dtype=np.int32)
-    expected[:32, 32:] = fill_value
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-  def testFillArrayLargeWithSpecifiedValue(self):
-    fill_value = 8
-    a = (np.ones(shape=[8, 8, 8, 8, 32], dtype=np.int32).tolist() +
-         np.ones(shape=[8, 8, 8, 8, 36], dtype=np.int32).tolist())
-    actual = np.ones(shape=[16, 8, 8, 8, 36], dtype=np.int32)
-    ff._fill_array(actual, a, fill_value)
-    expected = np.ones(shape=[16, 8, 8, 8, 36], dtype=np.int32)
-    expected[:8, ..., 32:] = fill_value
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-  def testPadIfNeededSmall(self):
-    a = (np.ones(shape=[32, 32], dtype=np.int32).tolist() +
-         np.ones(shape=[32, 36], dtype=np.int32).tolist())
-    a = list(map(np.array, a))
-    actual = ff._pad_if_needed(a)
-    expected = np.ones(shape=[64, 36], dtype=np.int32)
-    expected[:32, 32:] = 0
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-  def testPadIfNeededLarge(self):
-    a = (np.ones(shape=[8, 8, 8, 8, 32], dtype=np.int32).tolist() +
-         np.ones(shape=[8, 8, 8, 8, 36], dtype=np.int32).tolist())
-    a = list(map(np.array, a))
-    actual = ff._pad_if_needed(a)
-    expected = np.ones(shape=[16, 8, 8, 8, 36], dtype=np.int32)
-    expected[:8, ..., 32:] = 0
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-  def testPadIfNeededSmallWithSpecifiedValue(self):
-    fill_value = 8
-    a = (np.ones(shape=[32, 32], dtype=np.int32).tolist() +
-         np.ones(shape=[32, 36], dtype=np.int32).tolist())
-    a = list(map(np.array, a))
-    actual = ff._pad_if_needed(a, fill_value)
-    expected = np.ones(shape=[64, 36], dtype=np.int32)
-    expected[:32, 32:] = fill_value
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-  def testPadIfNeededLargeWithSpecifiedValue(self):
-    fill_value = 8
-    a = (np.ones(shape=[8, 8, 8, 8, 32], dtype=np.int32).tolist() +
-         np.ones(shape=[8, 8, 8, 8, 36], dtype=np.int32).tolist())
-    a = list(map(np.array, a))
-    actual = ff._pad_if_needed(a, fill_value)
-    expected = np.ones(shape=[16, 8, 8, 8, 36], dtype=np.int32)
-    expected[:8, ..., 32:] = fill_value
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-  def testPadIfNeededSmallWithSpecifiedNonNumericValue(self):
-    fill_value = False
-    a = (np.ones(shape=[32, 32], dtype=np.bool).tolist() +
-         np.ones(shape=[32, 36], dtype=np.bool).tolist())
-    a = list(map(np.array, a))
-    actual = ff._pad_if_needed(a, fill_value)
-    expected = np.ones(shape=[64, 36], dtype=np.bool)
-    expected[:32, 32:] = fill_value
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-  def testPadIfNeededLargeWithSpecifiedNonNumericValue(self):
-    fill_value = False
-    a = (np.ones(shape=[8, 8, 8, 8, 32], dtype=np.bool).tolist() +
-         np.ones(shape=[8, 8, 8, 8, 36], dtype=np.bool).tolist())
-    a = list(map(np.array, a))
-    actual = ff._pad_if_needed(a, fill_value)
-    expected = np.ones(shape=[16, 8, 8, 8, 36], dtype=np.bool)
-    expected[:8, ..., 32:] = fill_value
-    self.assertEqual(expected.tolist(), actual.tolist())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_queue_runner.py b/tensorflow/python/estimator/inputs/queues/feeding_queue_runner.py
index afbcab596a148edbab39d26e6559f05407d1a96c..2268b0140dfca9b74f241ea97cdf887f1c62075f 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_queue_runner.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_queue_runner.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,169 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""feeding_queue_runner python module.
 
-"""A `QueueRunner` that takes a feed function as an argument."""
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
+from tensorflow_estimator.python.estimator.inputs.queues import feeding_queue_runner
 
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import queue_runner as qr
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+feeding_queue_runner.__all__ = [
+    s for s in dir(feeding_queue_runner) if not s.startswith('__')
+]
 
-
-class _FeedingQueueRunner(qr.QueueRunner):
-  """A queue runner that allows the feeding of values such as numpy arrays."""
-
-  def __init__(self, queue=None, enqueue_ops=None, close_op=None,
-               cancel_op=None, feed_fns=None,
-               queue_closed_exception_types=None):
-    """Initialize the queue runner.
-
-    For further documentation, see `queue_runner.py`. Note that
-    `FeedingQueueRunner` does not support construction from protobuffer nor
-    serialization to protobuffer.
-
-    Args:
-      queue: A `Queue`.
-      enqueue_ops: List of enqueue ops to run in threads later.
-      close_op: Op to close the queue. Pending enqueue ops are preserved.
-      cancel_op: Op to close the queue and cancel pending enqueue ops.
-      feed_fns: a list of functions that return a dictionary mapping fed
-        `Tensor`s to values. Must be the same length as `enqueue_ops`.
-      queue_closed_exception_types: Optional tuple of Exception types that
-        indicate that the queue has been closed when raised during an enqueue
-        operation.  Defaults to
-        `(tf.errors.OutOfRangeError, tf.errors.CancelledError)`.
-
-    Raises:
-      ValueError: `feed_fns` is not `None` and has different length than
-        `enqueue_ops`.
-    """
-    if queue_closed_exception_types is None:
-      queue_closed_exception_types = (
-          errors.OutOfRangeError, errors.CancelledError)
-    super(_FeedingQueueRunner, self).__init__(
-        queue, enqueue_ops, close_op,
-        cancel_op, queue_closed_exception_types=queue_closed_exception_types)
-    if feed_fns is None:
-      self._feed_fns = [None for _ in enqueue_ops]
-    else:
-      if len(feed_fns) != len(enqueue_ops):
-        raise ValueError(
-            "If feed_fns is not None, it must have the same length as "
-            "enqueue_ops.")
-      self._feed_fns = feed_fns
-
-  # pylint: disable=broad-except
-  def _run(self, sess, enqueue_op, feed_fn, coord=None):
-    """Execute the enqueue op in a loop, close the queue in case of error.
-
-    Args:
-      sess: A `Session`.
-      enqueue_op: The `Operation` to run.
-      feed_fn: the feed function to pass to `sess.run`.
-      coord: Optional `Coordinator` object for reporting errors and checking
-        for stop conditions.
-
-    """
-    # TODO(jamieas): Reduce code duplication with `QueueRunner`.
-    if coord:
-      coord.register_thread(threading.current_thread())
-    decremented = False
-    try:
-      while True:
-        if coord and coord.should_stop():
-          break
-        try:
-          feed_dict = None if feed_fn is None else feed_fn()
-          sess.run(enqueue_op, feed_dict=feed_dict)
-        except (errors.OutOfRangeError, errors.CancelledError):
-          # This exception indicates that a queue was closed.
-          with self._lock:
-            self._runs_per_session[sess] -= 1
-            decremented = True
-            if self._runs_per_session[sess] == 0:
-              try:
-                sess.run(self._close_op)
-              except Exception as e:
-                # Intentionally ignore errors from close_op.
-                logging.vlog(1, "Ignored exception: %s", str(e))
-            return
-    except Exception as e:
-      # This catches all other exceptions.
-      if coord:
-        coord.request_stop(e)
-      else:
-        logging.error("Exception in QueueRunner: %s", str(e))
-        with self._lock:
-          self._exceptions_raised.append(e)
-        raise
-    finally:
-      # Make sure we account for all terminations: normal or errors.
-      if not decremented:
-        with self._lock:
-          self._runs_per_session[sess] -= 1
-
-  def create_threads(self, sess, coord=None, daemon=False, start=False):
-    """Create threads to run the enqueue ops for the given session.
-
-    This method requires a session in which the graph was launched.  It creates
-    a list of threads, optionally starting them.  There is one thread for each
-    op passed in `enqueue_ops`.
-
-    The `coord` argument is an optional coordinator, that the threads will use
-    to terminate together and report exceptions.  If a coordinator is given,
-    this method starts an additional thread to close the queue when the
-    coordinator requests a stop.
-
-    If previously created threads for the given session are still running, no
-    new threads will be created.
-
-    Args:
-      sess: A `Session`.
-      coord: Optional `Coordinator` object for reporting errors and checking
-        stop conditions.
-      daemon: Boolean.  If `True` make the threads daemon threads.
-      start: Boolean.  If `True` starts the threads.  If `False` the
-        caller must call the `start()` method of the returned threads.
-
-    Returns:
-      A list of threads.
-    """
-    with self._lock:
-      try:
-        if self._runs_per_session[sess] > 0:
-          # Already started: no new threads to return.
-          return []
-      except KeyError:
-        # We haven't seen this session yet.
-        pass
-      self._runs_per_session[sess] = len(self._enqueue_ops)
-      self._exceptions_raised = []
-
-    ret_threads = [threading.Thread(target=self._run,
-                                    args=(sess, op, feed_fn, coord))
-                   for op, feed_fn in zip(self._enqueue_ops, self._feed_fns)]
-    if coord:
-      ret_threads.append(threading.Thread(target=self._close_on_stop,
-                                          args=(sess, self._cancel_op, coord)))
-    for t in ret_threads:
-      if daemon:
-        t.daemon = True
-      if start:
-        t.start()
-    return ret_threads
-
-  def _init_from_proto(self, queue_runner_def):
-    raise NotImplementedError(
-        "{} does not support initialization from proto.".format(type(
-            self).__name__))
-
-  def to_proto(self):
-    raise NotImplementedError(
-        "{} does not support serialization to proto.".format(type(
-            self).__name__))
+from tensorflow_estimator.python.estimator.inputs.queues.feeding_queue_runner import *
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_queue_runner_test.py b/tensorflow/python/estimator/inputs/queues/feeding_queue_runner_test.py
deleted file mode 100644
index 6292eb7da1b9a52b6b7e11c358b53ab38a9186b6..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/inputs/queues/feeding_queue_runner_test.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests `FeedingQueueRunner` using arrays and `DataFrames`."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.estimator.inputs.queues import feeding_functions as ff
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import test
-from tensorflow.python.training import coordinator
-from tensorflow.python.training import queue_runner_impl
-
-try:
-  # pylint: disable=g-import-not-at-top
-  import pandas as pd
-  HAS_PANDAS = True
-except IOError:
-  # Pandas writes a temporary file during import. If it fails, don't use pandas.
-  HAS_PANDAS = False
-except ImportError:
-  HAS_PANDAS = False
-
-
-def get_rows(array, row_indices):
-  rows = [array[i] for i in row_indices]
-  return np.vstack(rows)
-
-
-class FeedingQueueRunnerTestCase(test.TestCase):
-  """Tests for `FeedingQueueRunner`."""
-
-  def testArrayFeeding(self):
-    with ops.Graph().as_default():
-      array = np.arange(32).reshape([16, 2])
-      q = ff._enqueue_data(array, capacity=100)
-      batch_size = 3
-      dq_op = q.dequeue_many(batch_size)
-      with session.Session() as sess:
-        coord = coordinator.Coordinator()
-        threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
-        for i in range(100):
-          indices = [
-              j % array.shape[0]
-              for j in range(batch_size * i, batch_size * (i + 1))
-          ]
-          expected_dq = get_rows(array, indices)
-          dq = sess.run(dq_op)
-          np.testing.assert_array_equal(indices, dq[0])
-          np.testing.assert_array_equal(expected_dq, dq[1])
-        coord.request_stop()
-        coord.join(threads)
-
-  def testArrayFeedingMultiThread(self):
-    with ops.Graph().as_default():
-      array = np.arange(256).reshape([128, 2])
-      q = ff._enqueue_data(array, capacity=128, num_threads=8, shuffle=True)
-      batch_size = 3
-      dq_op = q.dequeue_many(batch_size)
-      with session.Session() as sess:
-        coord = coordinator.Coordinator()
-        threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
-        for _ in range(100):
-          dq = sess.run(dq_op)
-          indices = dq[0]
-          expected_dq = get_rows(array, indices)
-          np.testing.assert_array_equal(expected_dq, dq[1])
-        coord.request_stop()
-        coord.join(threads)
-
-  def testPandasFeeding(self):
-    if not HAS_PANDAS:
-      return
-    with ops.Graph().as_default():
-      array1 = np.arange(32)
-      array2 = np.arange(32, 64)
-      df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(64, 96))
-      q = ff._enqueue_data(df, capacity=100)
-      batch_size = 5
-      dq_op = q.dequeue_many(5)
-      with session.Session() as sess:
-        coord = coordinator.Coordinator()
-        threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
-        for i in range(100):
-          indices = [
-              j % array1.shape[0]
-              for j in range(batch_size * i, batch_size * (i + 1))
-          ]
-          expected_df_indices = df.index[indices]
-          expected_rows = df.iloc[indices]
-          dq = sess.run(dq_op)
-          np.testing.assert_array_equal(expected_df_indices, dq[0])
-          for col_num, col in enumerate(df.columns):
-            np.testing.assert_array_equal(expected_rows[col].values,
-                                          dq[col_num + 1])
-        coord.request_stop()
-        coord.join(threads)
-
-  def testPandasFeedingMultiThread(self):
-    if not HAS_PANDAS:
-      return
-    with ops.Graph().as_default():
-      array1 = np.arange(128, 256)
-      array2 = 2 * array1
-      df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(128))
-      q = ff._enqueue_data(df, capacity=128, num_threads=8, shuffle=True)
-      batch_size = 5
-      dq_op = q.dequeue_many(batch_size)
-      with session.Session() as sess:
-        coord = coordinator.Coordinator()
-        threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
-        for _ in range(100):
-          dq = sess.run(dq_op)
-          indices = dq[0]
-          expected_rows = df.iloc[indices]
-          for col_num, col in enumerate(df.columns):
-            np.testing.assert_array_equal(expected_rows[col].values,
-                                          dq[col_num + 1])
-        coord.request_stop()
-        coord.join(threads)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py
index 5d5ed81fbbdfe3e437c7bfdb36ba55a4f0902a67..1f471396cc334e3b74fe9595dfb4ae0fe426fd77 100644
--- a/tensorflow/python/estimator/keras.py
+++ b/tensorflow/python/estimator/keras.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,489 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# pylint: disable=protected-access
-"""Home of estimator related functions.
+"""keras python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
 """
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import re
-import six
-
-from tensorflow.python.client import session
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import export as export_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import metrics
-from tensorflow.python.keras import models
-from tensorflow.python.keras import optimizers
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics as metrics_module
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribution_strategy_context
-from tensorflow.python.training import optimizer as tf_optimizer_module
-from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training import training_util
-
-
-_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-
-
-def _cast_tensor_to_floatx(x):
-  """Cast tensor to keras's floatx dtype if it is not already the same dtype."""
-  if x.dtype == K.floatx():
-    return x
-  else:
-    return math_ops.cast(x, K.floatx())
-
-
-def _convert_tensor(x):
-  """Create or cast tensor if needed."""
-  if not tensor_util.is_tensor(x):
-    # x is a numpy array
-    x = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(x)
-  if check_ops.is_numeric_tensor(x):
-    # is_numeric_tensor returns False if provided with a numpy array
-    x = _cast_tensor_to_floatx(x)
-  return x
-
-
-def _any_weight_initialized(keras_model):
-  """Check if any weights has been initialized in the Keras model.
-
-  Args:
-    keras_model: An instance of compiled keras model.
-
-  Returns:
-    boolean, True if at least one weight has been initialized, else False.
-    Currently keras initialize all weights at get_session().
-  """
-  if keras_model is None:
-    return False
-  for layer in keras_model.layers:
-    for weight in layer.weights:
-      if hasattr(weight, '_keras_initialized'):
-        return True
-  return False
-
-
-def _convert_estimator_io_to_keras(keras_model, features, labels):
-  """Converts estimator features and labels to keras input and target tensors.
-
-  Args:
-    keras_model: a compiled `tf.keras.Model` instance, used to determine the
-      order of the returned lists.
-    features: Dict of tensors or `None`.
-    labels: Dict of tensors, a single tensor, or `None`.
-
-  Returns:
-    Tuple of (
-      list of input tensors or `None`,
-      list of target tensors or `None`)
-    The order of tensors is determined by the order set in the keras model.
-  """
-
-  def _to_ordered_tensor_list(obj, key_order, obj_name, order_name):
-    """Convert obj to an ordered list of tensors.
-
-    Args:
-      obj: List, dict, or single tensor. May be `None`.
-      key_order: List of strings with the order to return (used if obj is a
-        dict).
-      obj_name: String name of object (e.g. "features" or "labels")
-      order_name: String name of the key order (e.g. "inputs" or "outputs")
-
-    Returns:
-      List of tensors, or `None`
-
-    Raises:
-      KeyError: If obj has invalid keys.
-    """
-    if obj is None:
-      return None
-    elif isinstance(obj, (list, tuple)):
-      return [_convert_tensor(x) for x in obj]
-    elif isinstance(obj, dict):
-      # Ensure that the obj keys and keys in key_order are exactly the same.
-      different_keys = set(obj.keys()) ^ set(key_order)
-
-      if different_keys:
-        raise KeyError(
-            'The dictionary passed into {obj_name} does not have the expected '
-            '{order_name} keys defined in the keras model.'
-            '\n\tExpected keys: {order_keys}'
-            '\n\t{obj_name} keys: {obj_keys}'
-            '\n\tDifference: {different_keys}'.format(
-                order_name=order_name, order_keys=set(key_order),
-                obj_name=obj_name, obj_keys=set(obj.keys()),
-                different_keys=different_keys))
-
-      return [_convert_tensor(obj[key]) for key in key_order]
-    else:  # Assume obj is a tensor.
-      return [_convert_tensor(obj)]
-
-  input_names = None
-  output_names = None
-  if isinstance(features, dict):
-    input_names = (
-        keras_model.input_names if keras_model._is_graph_network else
-        ['input_%d' % i for i in range(1, len(features) + 1)])
-  if isinstance(labels, dict):
-    output_names = (
-        keras_model.output_names if keras_model._is_graph_network else
-        ['output_%d' % i for i in range(1, len(labels) + 1)])
-
-  input_tensors = _to_ordered_tensor_list(
-      features, input_names, 'features', 'inputs')
-  target_tensors = _to_ordered_tensor_list(
-      labels, output_names, 'labels', 'outputs')
-
-  return input_tensors, target_tensors
-
-
-def _clone_and_build_model(mode,
-                           keras_model,
-                           custom_objects,
-                           features=None,
-                           labels=None):
-  """Clone and build the given keras_model.
-
-  Args:
-    mode: training mode.
-    keras_model: an instance of compiled keras model.
-    custom_objects: Dictionary for custom objects.
-    features: Dict of tensors.
-    labels: Dict of tensors, or single tensor instance.
-
-  Returns:
-    The newly built model.
-  """
-  # Set to True during training, False for inference or testing.
-  K.set_learning_phase(mode == model_fn_lib.ModeKeys.TRAIN)
-  input_tensors, target_tensors = _convert_estimator_io_to_keras(
-      keras_model, features, labels)
-
-  compile_clone = (mode != model_fn_lib.ModeKeys.PREDICT)
-
-  global_step = None
-  if compile_clone:
-    # Set iterations to the global step created by tf.train.create_global_step()
-    # which is automatically run in the estimator framework.
-    global_step = training_util.get_or_create_global_step()
-    K.track_variable(global_step)
-
-  clone = models.clone_and_build_model(
-      keras_model, input_tensors, target_tensors, custom_objects,
-      compile_clone=compile_clone,
-      in_place_reset=(not keras_model._is_graph_network),
-      optimizer_iterations=global_step)
-
-  return clone
-
-
-def _convert_keras_metrics_to_estimator(model):
-  """Convert metrics from a Keras model to ops used by the Estimator framework.
-
-  Args:
-    model: A `tf.keras.Model` object.
-
-  Returns:
-    Dictionary mapping metric names to tuples of (value, update) ops. May return
-    `None` if the model does not contain any metrics.
-  """
-  if not getattr(model, 'metrics', None):
-    return None
-
-  eval_metric_ops = {}
-
-  def get_metric_name(metric):
-    if isinstance(metric, metrics.Metric):
-      return metric.name
-    if callable(metric):
-      return metric.__name__
-    assert isinstance(metric, six.string_types)
-    return metric
-
-  # When each metric maps to an output
-  if isinstance(model.metrics, dict):
-    for i, output_name in enumerate(model.metrics.keys()):
-      # `metric` is the user given metric value in `compile`. This can be
-      # metric name (`acc`), metric function (binary_accuracy) or a metric
-      # object (BinaryAccuracy()).
-      metric = model.metrics[output_name]
-      metric_name = get_metric_name(metric)
-      # When some outputs use the same metric
-      if list(model.metrics.values()).count(metric_name) > 1:
-        metric_name += '_' + output_name
-      if isinstance(metric, metrics.Metric):
-        eval_metric_ops[metric_name] = metric
-      else:
-        eval_metric_ops[metric_name] = metrics_module.mean(
-            model.metrics_tensors[i - len(model.metrics)])
-  else:
-    for i, metric in enumerate(model.metrics):
-      metric_name = get_metric_name(metric)
-      if isinstance(metric, metrics.Metric):
-        eval_metric_ops[metric_name] = metric
-      else:
-        eval_metric_ops[metric_name] = metrics_module.mean(
-            model.metrics_tensors[i])
-  return eval_metric_ops
-
-
-def _create_keras_model_fn(keras_model, custom_objects=None):
-  """Creates model_fn for keras Estimator.
-
-  Args:
-    keras_model: an instance of compiled keras model.
-    custom_objects: Dictionary for custom objects.
-
-  Returns:
-    The model_fn for a keras Estimator.
-  """
-
-  def model_fn(features, labels, mode):
-    """model_fn for keras Estimator."""
-    # Raise an error when users use DistributionStrategy with native Keras
-    # optimizers. Currently we only support native TensorFlow optimizers.
-    if distribution_strategy_context.has_distribution_strategy() and \
-        not isinstance(keras_model.optimizer,
-                       (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
-      raise ValueError('Only TensorFlow native optimizers are supported with '
-                       'DistributionStrategy.')
-
-    model = _clone_and_build_model(mode, keras_model, custom_objects, features,
-                                   labels)
-    model_output_names = []
-    # We need to make sure that the output names of the last layer in the model
-    # is the same for each of the cloned models. This is required for mirrored
-    # strategy when we call regroup.
-    if distribution_strategy_context.has_distribution_strategy():
-      for name in model.output_names:
-        name = re.compile(r'_\d$').sub('', name)
-        model_output_names.append(name)
-    else:
-      model_output_names = model.output_names
-
-    # Get inputs to EstimatorSpec
-    predictions = dict(zip(model_output_names, model.outputs))
-
-    loss = None
-    train_op = None
-    eval_metric_ops = None
-
-    # Set loss and metric only during train and evaluate.
-    if mode is not model_fn_lib.ModeKeys.PREDICT:
-      if mode is model_fn_lib.ModeKeys.TRAIN:
-        model._make_train_function()  # pylint: disable=protected-access
-      else:
-        model._make_test_function()  # pylint: disable=protected-access
-      loss = model.total_loss
-
-      eval_metric_ops = _convert_keras_metrics_to_estimator(model)
-
-    # Set train_op only during train.
-    if mode is model_fn_lib.ModeKeys.TRAIN:
-      train_op = model.train_function.updates_op
-
-    if not model._is_graph_network:
-      # Reset model state to original state,
-      # to avoid `model_fn` being destructive for the initial model argument.
-      models.in_place_subclassed_model_state_restoration(keras_model)
-    return model_fn_lib.EstimatorSpec(
-        mode=mode,
-        predictions=predictions,
-        loss=loss,
-        train_op=train_op,
-        eval_metric_ops=eval_metric_ops,
-        export_outputs={
-            _DEFAULT_SERVING_KEY:
-            export_lib.export_output.PredictOutput(predictions)
-        })
-
-  return model_fn
-
-
-def _save_first_checkpoint(keras_model, custom_objects, config):
-  """Save first checkpoint for the keras Estimator.
-
-  Args:
-    keras_model: an instance of compiled keras model.
-    custom_objects: Dictionary for custom objects.
-    config: Estimator config.
-
-  Returns:
-    The path where keras model checkpoint is saved.
-  """
-  # save checkpoint into subdirectory to allow warm start
-  keras_model_dir = os.path.join(config.model_dir, 'keras')
-  # Load weights and save to checkpoint if there is no checkpoint
-  latest_path = checkpoint_management.latest_checkpoint(keras_model_dir)
-  if not latest_path:
-    keras_weights = None
-    if _any_weight_initialized(keras_model):
-      keras_weights = keras_model.get_weights()
-    if not gfile.IsDirectory(keras_model_dir):
-      gfile.MakeDirs(keras_model_dir)
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(config.tf_random_seed)
-      training_util.create_global_step()
-      model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN, keras_model,
-                                     custom_objects)
-      # save to checkpoint
-      with session.Session(config=config.session_config) as sess:
-        if keras_weights:
-          model.set_weights(keras_weights)
-        # Make update ops and initialize all variables.
-        if not model.train_function:
-          # pylint: disable=protected-access
-          model._make_train_function()
-          K._initialize_variables(sess)
-          # pylint: enable=protected-access
-        saver = saver_lib.Saver()
-        latest_path = os.path.join(keras_model_dir, 'keras_model.ckpt')
-        saver.save(sess, latest_path)
-  return latest_path
-
-
-def _get_file_from_google_storage(keras_model_path, model_dir):
-  """Get file from google storage and download to local file.
-
-  Args:
-    keras_model_path: a google storage path for compiled keras model.
-    model_dir: the directory from estimator config.
-
-  Returns:
-    The path where keras model is saved.
-
-  Raises:
-    ValueError: if storage object name does not end with .h5.
-  """
-  try:
-    from google.cloud import storage  # pylint:disable=g-import-not-at-top
-  except ImportError:
-    raise TypeError('Could not save model to Google cloud storage; please '
-                    'install `google-cloud-storage` via '
-                    '`pip install google-cloud-storage`.')
-  storage_client = storage.Client()
-  path, blob_name = os.path.split(keras_model_path)
-  _, bucket_name = os.path.split(path)
-  keras_model_dir = os.path.join(model_dir, 'keras')
-  if not gfile.Exists(keras_model_dir):
-    gfile.MakeDirs(keras_model_dir)
-  file_name = os.path.join(keras_model_dir, 'keras_model.h5')
-  try:
-    blob = storage_client.get_bucket(bucket_name).blob(blob_name)
-    blob.download_to_filename(file_name)
-  except:
-    raise ValueError('Failed to download keras model, please check '
-                     'environment variable GOOGLE_APPLICATION_CREDENTIALS '
-                     'and model path storage.googleapis.com/{bucket}/{object}.')
-  logging.info('Saving model to {}'.format(file_name))
-  del storage_client
-  return file_name
-
-
-def model_to_estimator(keras_model=None,
-                       keras_model_path=None,
-                       custom_objects=None,
-                       model_dir=None,
-                       config=None):
-  """Constructs an `Estimator` instance from given keras model.
-
-  For usage example, please see:
-  [Creating estimators from Keras
-  Models](https://tensorflow.org/guide/estimators#model_to_estimator).
-
-  Args:
-    keras_model: A compiled Keras model object. This argument is mutually
-      exclusive with `keras_model_path`.
-    keras_model_path: Path to a compiled Keras model saved on disk, in HDF5
-      format, which can be generated with the `save()` method of a Keras model.
-      This argument is mutually exclusive with `keras_model`.
-    custom_objects: Dictionary for custom objects.
-    model_dir: Directory to save `Estimator` model parameters, graph, summary
-      files for TensorBoard, etc.
-    config: `RunConfig` to config `Estimator`.
-
-  Returns:
-    An Estimator from given keras model.
-
-  Raises:
-    ValueError: if neither keras_model nor keras_model_path was given.
-    ValueError: if both keras_model and keras_model_path was given.
-    ValueError: if the keras_model_path is a GCS URI.
-    ValueError: if keras_model has not been compiled.
-  """
-  if not (keras_model or keras_model_path):
-    raise ValueError(
-        'Either `keras_model` or `keras_model_path` needs to be provided.')
-  if keras_model and keras_model_path:
-    raise ValueError(
-        'Please specity either `keras_model` or `keras_model_path`, '
-        'but not both.')
-
-  config = estimator_lib.maybe_overwrite_model_dir_and_session_config(
-      config, model_dir)
-  if not keras_model:
-    if keras_model_path.startswith(
-        'gs://') or 'storage.googleapis.com' in keras_model_path:
-      keras_model_path = _get_file_from_google_storage(keras_model_path,
-                                                       config.model_dir)
-    logging.info('Loading models from %s', keras_model_path)
-    keras_model = models.load_model(keras_model_path)
-  else:
-    logging.info('Using the Keras model provided.')
-    keras_model = keras_model
-
-  if not hasattr(keras_model, 'optimizer') or not keras_model.optimizer:
-    raise ValueError(
-        'The given keras model has not been compiled yet. '
-        'Please compile the model with `model.compile()` '
-        'before calling `model_to_estimator()`.')
-
-  keras_model_fn = _create_keras_model_fn(keras_model, custom_objects)
-  if _any_weight_initialized(keras_model):
-    # Warn if config passed to estimator tries to update GPUOptions. If a
-    # session has already been created, the GPUOptions passed to the first
-    # session sticks.
-    if config.session_config.HasField('gpu_options'):
-      logging.warning(
-          'The Keras backend session has already been set. '
-          'The _session_config passed to model_to_estimator will not be used.')
-  else:
-    # Pass the config into keras backend's default session.
-    sess = session.Session(config=config.session_config)
-    K.set_session(sess)
-
-  warm_start_path = None
-  if keras_model._is_graph_network:
-    warm_start_path = _save_first_checkpoint(keras_model, custom_objects,
-                                             config)
-  elif keras_model.built:
-    logging.warning('You are creating an Estimator from a Keras model manually '
-                    'subclassed from `Model`, that was already called on some '
-                    'inputs (and thus already had weights). We are currently '
-                    'unable to preserve the model\'s state (its weights) as '
-                    'part of the estimator in this case. Be warned that the '
-                    'estimator has been created using a freshly initialized '
-                    'version of your model.\n'
-                    'Note that this doesn\'t affect the state of the model '
-                    'instance you passed as `keras_model` argument.')
+from tensorflow_estimator.python.estimator import keras
 
-  estimator = estimator_lib.Estimator(keras_model_fn,
-                                      config=config,
-                                      warm_start_from=warm_start_path)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+keras.__all__ = [s for s in dir(keras) if not s.startswith('__')]
 
-  return estimator
+from tensorflow_estimator.python.estimator.keras import *
diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py
deleted file mode 100644
index 4e285fa25a662b1375fd790b8dc2ead49f935bb5..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/keras_test.py
+++ /dev/null
@@ -1,805 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for training routines."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import json
-from math import log10
-import os
-import tempfile
-
-import numpy as np
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import keras as keras_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.optimizers import SGD
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.parsing_ops import gen_parsing_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import rmsprop
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util
-
-
-try:
-  import h5py  # pylint:disable=g-import-not-at-top
-except ImportError:
-  h5py = None
-
-_RANDOM_SEED = 1337
-_TRAIN_SIZE = 200
-_INPUT_SIZE = (10,)
-_NUM_CLASS = 2
-
-_TMP_DIR = '/tmp'
-
-
-def simple_sequential_model():
-  model = keras.models.Sequential()
-  model.add(keras.layers.Dense(16, activation='relu', input_shape=_INPUT_SIZE))
-  model.add(keras.layers.Dropout(0.1))
-  model.add(keras.layers.Dense(_NUM_CLASS, activation='softmax'))
-  return model
-
-
-def simple_functional_model(activation='relu'):
-  a = keras.layers.Input(shape=_INPUT_SIZE)
-  b = keras.layers.Dense(16, activation=activation)(a)
-  b = keras.layers.Dropout(0.1)(b)
-  b = keras.layers.Dense(_NUM_CLASS, activation='softmax')(b)
-  model = keras.models.Model(inputs=[a], outputs=[b])
-  return model
-
-
-def simple_subclassed_model():
-
-  class SimpleModel(keras.Model):
-
-    def __init__(self):
-      super(SimpleModel, self).__init__()
-      self.dense1 = keras.layers.Dense(16, activation='relu')
-      self.dp = keras.layers.Dropout(0.1)
-      self.dense2 = keras.layers.Dense(_NUM_CLASS, activation='softmax')
-
-    def call(self, inputs):
-      x = self.dense1(inputs)
-      x = self.dp(x)
-      return self.dense2(x)
-
-  return SimpleModel()
-
-
-def gen_input_fn(x, y=None, batch_size=128, num_epochs=1, shuffle=False):
-  def input_fn():
-    ds = dataset_ops.Dataset.from_tensor_slices((x, y) if y is not None else x)
-    if shuffle:
-      ds = ds.shuffle(1000)
-    return ds.repeat(num_epochs).batch(batch_size)
-  return input_fn
-
-
-def get_multi_inputs_multi_outputs_data():
-  (a_train, c_train), (a_test, c_test) = testing_utils.get_test_data(
-      train_samples=_TRAIN_SIZE,
-      test_samples=50,
-      input_shape=(16,),
-      num_classes=3,
-      random_seed=_RANDOM_SEED)
-  (b_train, d_train), (b_test, d_test) = testing_utils.get_test_data(
-      train_samples=_TRAIN_SIZE,
-      test_samples=50,
-      input_shape=(16,),
-      num_classes=2,
-      random_seed=_RANDOM_SEED)
-  (m_train, _), (m_test, _) = testing_utils.get_test_data(
-      train_samples=_TRAIN_SIZE,
-      test_samples=50,
-      input_shape=(8,),
-      num_classes=2,
-      random_seed=_RANDOM_SEED)
-
-  c_train = keras.utils.to_categorical(c_train)
-  c_test = keras.utils.to_categorical(c_test)
-  d_train = keras.utils.to_categorical(d_train)
-  d_test = keras.utils.to_categorical(d_test)
-
-  train_data = {
-      'input_a': a_train,
-      'input_b': b_train,
-      'input_m': m_train,
-      'output_c': c_train,
-      'output_d': d_train
-  }
-  test_data = {
-      'input_a': a_test,
-      'input_b': b_test,
-      'input_m': m_test,
-      'output_c': c_test,
-      'output_d': d_test
-  }
-
-  return (train_data, test_data)
-
-
-def get_resource_for_simple_model(model_type='sequential',
-                                  is_evaluate=False,):
-  if model_type == 'sequential':
-    model = simple_sequential_model()
-    model.build()
-  elif model_type == 'subclass':
-    model = simple_subclassed_model()
-  else:
-    assert model_type == 'functional'
-    model = simple_functional_model()
-
-  if model_type == 'subclass':
-    input_name = 'input_1'
-    output_name = 'output_1'
-  else:
-    input_name = model.input_names[0]
-    output_name = model.output_names[0]
-
-  np.random.seed(_RANDOM_SEED)
-  (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-      train_samples=_TRAIN_SIZE,
-      test_samples=50,
-      input_shape=_INPUT_SIZE,
-      num_classes=_NUM_CLASS)
-  y_train = keras.utils.to_categorical(y_train)
-  y_test = keras.utils.to_categorical(y_test)
-
-  train_input_fn = gen_input_fn(
-      x=randomize_io_type(x_train, input_name),
-      y=randomize_io_type(y_train, output_name),
-      shuffle=False,
-      num_epochs=None,
-      batch_size=16)
-
-  evaluate_input_fn = gen_input_fn(
-      x=randomize_io_type(x_test, input_name),
-      y=randomize_io_type(y_test, output_name),
-      num_epochs=1, shuffle=False)
-
-  predict_input_fn = gen_input_fn(
-      x=randomize_io_type(x_test, input_name), num_epochs=1, shuffle=False)
-
-  inference_input_fn = evaluate_input_fn if is_evaluate else predict_input_fn
-
-  return model, (x_train, y_train), (x_test,
-                                     y_test), train_input_fn, inference_input_fn
-
-
-def randomize_io_type(array, name):
-  switch = np.random.random()
-  if switch > 0.5:
-    return array
-  else:
-    return {name: array}
-
-
-def multi_inputs_multi_outputs_model():
-  input_a = keras.layers.Input(shape=(16,), name='input_a')
-  input_b = keras.layers.Input(shape=(16,), name='input_b')
-  input_m = keras.layers.Input(shape=(8,), dtype='string', name='input_m')
-  dense = keras.layers.Dense(8, name='dense_1')
-
-  interm_a = dense(input_a)
-  # Read m
-  interm_m = keras.layers.Lambda(gen_parsing_ops.string_to_number)(input_m)
-  interm_s = keras.layers.Lambda(lambda k: k[0] * k[1])([interm_m, interm_a])
-  interm_b = dense(input_b)
-  merged = keras.layers.concatenate([interm_s, interm_b], name='merge')
-  output_c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
-  output_d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)
-  model = keras.models.Model(
-      inputs=[input_a, input_b, input_m], outputs=[output_c, output_d])
-  model.compile(
-      loss='categorical_crossentropy',
-      optimizer='rmsprop',
-      metrics={
-          'dense_2': 'categorical_accuracy',
-          'dense_3': 'categorical_accuracy'
-      })
-  return model
-
-
-class MyHook(session_run_hook.SessionRunHook):
-
-  def begin(self):
-    _ = variable_scope.get_variable('temp', [1])
-
-
-class TestKerasEstimator(test_util.TensorFlowTestCase):
-
-  def setUp(self):
-    self._base_dir = os.path.join(self.get_temp_dir(), 'keras_estimator_test')
-    gfile.MakeDirs(self._base_dir)
-    self._config = run_config_lib.RunConfig(
-        tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)
-    super(TestKerasEstimator, self).setUp()
-
-  def tearDown(self):
-    # Make sure nothing is stuck in limbo.
-    writer_cache.FileWriterCache.clear()
-    if os.path.isdir(self._base_dir):
-      gfile.DeleteRecursively(self._base_dir)
-    super(TestKerasEstimator, self).tearDown()
-
-  def test_train(self):
-    for model_type in ['sequential', 'functional']:
-      keras_model, (_, _), (
-          _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
-              model_type=model_type, is_evaluate=True)
-      keras_model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-      with self.cached_session():
-        est_keras = keras_lib.model_to_estimator(
-            keras_model=keras_model, config=self._config)
-        before_eval_results = est_keras.evaluate(
-            input_fn=eval_input_fn, steps=1)
-        est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
-        after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
-        self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
-
-      writer_cache.FileWriterCache.clear()
-      gfile.DeleteRecursively(self._config.model_dir)
-
-  # see b/109935364
-  @test_util.run_in_graph_and_eager_modes
-  def test_train_with_hooks(self):
-    for model_type in ['sequential', 'functional']:
-      keras_model, (_, _), (
-          _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
-              model_type=model_type, is_evaluate=True)
-      keras_model.compile(
-          loss='categorical_crossentropy',
-          optimizer=rmsprop.RMSPropOptimizer(1e-3),
-          metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-      my_hook = MyHook()
-      with self.cached_session():
-        est_keras = keras_lib.model_to_estimator(
-            keras_model=keras_model, config=self._config)
-        before_eval_results = est_keras.evaluate(
-            input_fn=eval_input_fn, steps=1)
-        est_keras.train(input_fn=train_input_fn, hooks=[my_hook],
-                        steps=_TRAIN_SIZE / 16)
-        after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
-        self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
-
-      writer_cache.FileWriterCache.clear()
-      gfile.DeleteRecursively(self._config.model_dir)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_train_with_model_fit_and_hooks(self):
-    keras_model, (x_train, y_train), _, \
-      train_input_fn, eval_input_fn = get_resource_for_simple_model(
-          model_type='sequential', is_evaluate=True)
-
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=['mse', keras.metrics.CategoricalAccuracy()])
-    my_hook = MyHook()
-    with self.cached_session():
-      keras_model.fit(x_train, y_train, epochs=1)
-
-      keras_est = keras_lib.model_to_estimator(
-          keras_model=keras_model, config=self._config)
-      before_eval_results = keras_est.evaluate(input_fn=eval_input_fn)
-      keras_est.train(input_fn=train_input_fn, hooks=[my_hook],
-                      steps=_TRAIN_SIZE / 16)
-      after_eval_results = keras_est.evaluate(input_fn=eval_input_fn, steps=1)
-      self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_train_with_tf_optimizer(self):
-    for model_type in ['sequential', 'functional']:
-      keras_model, (_, _), (
-          _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
-              model_type=model_type, is_evaluate=True)
-      keras_model.compile(
-          loss='categorical_crossentropy',
-          optimizer=rmsprop.RMSPropOptimizer(1e-3),
-          metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-      with self.cached_session():
-        est_keras = keras_lib.model_to_estimator(
-            keras_model=keras_model,
-            config=self._config)
-        before_eval_results = est_keras.evaluate(
-            input_fn=eval_input_fn, steps=1)
-        est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
-        after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
-        self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
-
-      writer_cache.FileWriterCache.clear()
-      gfile.DeleteRecursively(self._config.model_dir)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_train_with_subclassed_model(self):
-    keras_model, (_, _), (
-        _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
-            model_type='subclass', is_evaluate=True)
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-    with self.cached_session():
-      est_keras = keras_lib.model_to_estimator(
-          keras_model=keras_model, config=self._config)
-      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
-      before_eval_results = est_keras.evaluate(
-          input_fn=eval_input_fn, steps=1)
-      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
-      after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
-      self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
-
-  def test_train_with_subclassed_model_with_existing_state(self):
-    keras_model, (_, _), (
-        _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
-            model_type='subclass', is_evaluate=True)
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-    with self.cached_session():
-      # Create state
-      keras_model.train_on_batch(np.random.random((10,) + _INPUT_SIZE),
-                                 np.random.random((10, _NUM_CLASS)))
-      original_preds = keras_model.predict(np.ones((10,) + _INPUT_SIZE))
-
-      est_keras = keras_lib.model_to_estimator(
-          keras_model=keras_model, config=self._config)
-      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
-      before_eval_results = est_keras.evaluate(
-          input_fn=eval_input_fn, steps=1)
-      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
-      after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
-      self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
-
-      # Check that original model state was not altered
-      preds = keras_model.predict(np.ones((10,) + _INPUT_SIZE))
-      self.assertAllClose(original_preds, preds, atol=1e-5)
-      # Check that the original model compilation did not break
-      keras_model.train_on_batch(np.random.random((10,) + _INPUT_SIZE),
-                                 np.random.random((10, _NUM_CLASS)))
-
-  def test_evaluate(self):
-    keras_model, (x_train, y_train), (
-        x_test, y_test), _, eval_input_fn = get_resource_for_simple_model(
-            model_type='functional', is_evaluate=True)
-
-    with self.cached_session():
-      metrics = [
-          'binary_accuracy', 'binary_crossentropy', 'categorical_accuracy',
-          'categorical_crossentropy', 'cosine_proximity', 'hinge',
-          'kullback_leibler_divergence', 'mean_absolute_error',
-          'mean_absolute_percentage_error', 'mean_squared_error',
-          'mean_squared_logarithmic_error', 'poisson', 'squared_hinge',
-          'top_k_categorical_accuracy'
-      ]
-      keras_model.compile(
-          loss='categorical_crossentropy', optimizer='adam', metrics=metrics)
-      keras_model.fit(x_train, y_train, epochs=1)
-      keras_eval = keras_model.evaluate(x_test, y_test, batch_size=32)
-
-    with self.cached_session():
-      keras_est = keras_lib.model_to_estimator(
-          keras_model=keras_model, config=self._config)
-      est_eval = keras_est.evaluate(input_fn=eval_input_fn)
-
-    metrics = ['loss'] + metrics
-
-    # Check loss and all metrics match between keras and estimator.
-    def shift(val):
-      if val == 0:
-        return 0
-      else:
-        return val / 10**int(log10(abs(val)))
-
-    for i, metric_name in enumerate(metrics):
-      self.assertAlmostEqual(
-          shift(est_eval[metric_name]),
-          shift(keras_eval[i]),
-          places=4,
-          msg='%s mismatch, keras model: %s, estimator: %s' %
-          (metric_name, est_eval[metric_name], keras_eval[i]))
-
-  def test_predict(self):
-    # Check that predict on a pretrained model yield the same result.
-    keras_model, (x_train, y_train), (
-        x_test, _), _, pred_input_fn = get_resource_for_simple_model(
-            model_type='sequential', is_evaluate=False)
-
-    with self.cached_session():
-      keras_model.compile(
-          loss='categorical_crossentropy',
-          optimizer='adam',
-          metrics=['accuracy'])
-      keras_model.fit(x_train, y_train, epochs=1)
-      keras_pred = [np.argmax(y) for y in keras_model.predict(x_test)]
-
-    with self.cached_session():
-      keras_est = keras_lib.model_to_estimator(
-          keras_model=keras_model, config=self._config)
-      est_pred = [
-          np.argmax(y[keras_model.output_names[0]])
-          for y in keras_est.predict(input_fn=pred_input_fn)
-      ]
-    self.assertAllEqual(est_pred, keras_pred)
-
-  def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self):
-    train_data, test_data = get_multi_inputs_multi_outputs_data()
-
-    def train_input_fn():
-      input_dict = {
-          'input_a': train_data['input_a'],
-          'input_b': train_data['input_b'],
-          'input_m': train_data['input_m'].astype(np.str)
-      }
-      output_dict = {
-          'dense_2': train_data['output_c'],
-          'dense_3': train_data['output_d']
-      }
-      return input_dict, output_dict
-
-    def eval_input_fn():
-      input_dict = {
-          'input_a': test_data['input_a'],
-          'input_b': test_data['input_b'],
-          'input_m': test_data['input_m'].astype(np.str)
-      }
-      output_dict = {
-          'dense_2': test_data['output_c'],
-          'dense_3': test_data['output_d']
-      }
-      return input_dict, output_dict
-
-    def pred_input_fn():
-      input_dict = {
-          'input_a': test_data['input_a'],
-          'input_b': test_data['input_b'],
-          'input_m': test_data['input_m'].astype(np.str)
-      }
-      return input_dict
-
-    self.do_test_multi_inputs_multi_outputs_with_input_fn(
-        train_input_fn, eval_input_fn, pred_input_fn)
-
-  def test_multi_inputs_multi_outputs_with_input_fn_as_list(self):
-    train_data, test_data = get_multi_inputs_multi_outputs_data()
-
-    def train_input_fn():
-      input_list = [
-          train_data['input_a'], train_data['input_b'],
-          train_data['input_m'].astype(np.str)
-      ]
-      output_list = [train_data['output_c'], train_data['output_d']]
-      return input_list, output_list
-
-    def eval_input_fn():
-      input_list = [
-          test_data['input_a'], test_data['input_b'],
-          test_data['input_m'].astype(np.str)
-      ]
-      output_list = [test_data['output_c'], test_data['output_d']]
-      return input_list, output_list
-
-    def pred_input_fn():
-      input_list = [
-          test_data['input_a'], test_data['input_b'],
-          test_data['input_m'].astype(np.str)
-      ]
-      return input_list
-
-    self.do_test_multi_inputs_multi_outputs_with_input_fn(
-        train_input_fn, eval_input_fn, pred_input_fn)
-
-  def do_test_multi_inputs_multi_outputs_with_input_fn(
-      self, train_input_fn, eval_input_fn, pred_input_fn):
-    with self.cached_session():
-      model = multi_inputs_multi_outputs_model()
-      est_keras = keras_lib.model_to_estimator(
-          keras_model=model, config=self._config)
-      baseline_eval_results = est_keras.evaluate(
-          input_fn=eval_input_fn, steps=1)
-      est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
-      eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
-      self.assertLess(eval_results['loss'], baseline_eval_results['loss'])
-      est_keras.predict(input_fn=pred_input_fn)
-
-  def test_init_from_file(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
-
-    keras_model, (x_train, y_train), (
-        x_test, _), _, pred_input_fn = get_resource_for_simple_model(
-            model_type='functional', is_evaluate=False)
-
-    with self.cached_session():
-      keras_model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['categorical_accuracy'])
-      keras_model.fit(x_train, y_train, epochs=1)
-      keras_pred = [np.argmax(y) for y in keras_model.predict(x_test)]
-      fname = os.path.join(self._base_dir, 'keras_model.h5')
-      keras.models.save_model(keras_model, fname)
-
-    with self.cached_session():
-      keras_est = keras_lib.model_to_estimator(
-          keras_model_path=fname, config=self._config)
-      est_pred = [
-          np.argmax(y[keras_model.output_names[0]])
-          for y in keras_est.predict(input_fn=pred_input_fn)
-      ]
-    self.assertAllEqual(est_pred, keras_pred)
-
-  def test_keras_model_init_error(self):
-    with self.assertRaisesRegexp(ValueError, 'Either'):
-      keras_lib.model_to_estimator()
-
-    with self.cached_session():
-      keras_model = simple_sequential_model()
-      with self.assertRaisesRegexp(ValueError, 'not both'):
-        keras_lib.model_to_estimator(
-            keras_model=keras_model,
-            keras_model_path=tempfile.mkdtemp(dir=self._base_dir))
-
-    with self.cached_session():
-      keras_model = simple_sequential_model()
-      with self.assertRaisesRegexp(ValueError, 'compiled'):
-        keras_lib.model_to_estimator(keras_model=keras_model)
-
-  def test_invalid_ionames_error(self):
-    (x_train, y_train), (_, _) = testing_utils.get_test_data(
-        train_samples=_TRAIN_SIZE,
-        test_samples=100,
-        input_shape=(10,),
-        num_classes=2)
-    y_train = keras.utils.to_categorical(y_train)
-
-    def invald_input_name_input_fn():
-      input_dict = {'invalid_input_name': x_train}
-      return input_dict, y_train
-
-    def invald_output_name_input_fn():
-      input_dict = {'input_1': x_train}
-      output_dict = {'invalid_output_name': y_train}
-      return input_dict, output_dict
-    model = simple_functional_model()
-    model.compile(
-        loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
-    with self.cached_session():
-      est_keras = keras_lib.model_to_estimator(
-          keras_model=model, config=self._config)
-    with self.cached_session():
-      with self.assertRaisesRegexp(KeyError,
-                                   'Difference: .*invalid_input_name'):
-        est_keras.train(input_fn=invald_input_name_input_fn, steps=100)
-
-      with self.assertRaisesRegexp(KeyError,
-                                   'Difference: .*invalid_output_name'):
-        est_keras.train(input_fn=invald_output_name_input_fn, steps=100)
-
-  def test_custom_objects(self):
-
-    def relu6(x):
-      return keras.backend.relu(x, max_value=6)
-
-    keras_model = simple_functional_model(activation=relu6)
-    keras_model.compile(loss='categorical_crossentropy', optimizer='adam')
-    custom_objects = {
-        'relu6': relu6
-    }
-
-    (x_train, y_train), _ = testing_utils.get_test_data(
-        train_samples=_TRAIN_SIZE,
-        test_samples=50,
-        input_shape=(10,),
-        num_classes=2)
-    y_train = keras.utils.to_categorical(y_train, 2)
-    input_name = keras_model.input_names[0]
-    output_name = keras_model.output_names[0]
-    train_input_fn = gen_input_fn(
-        x=randomize_io_type(x_train, input_name),
-        y=randomize_io_type(y_train, output_name),
-        shuffle=False,
-        num_epochs=None,
-        batch_size=16)
-    with self.assertRaisesRegexp(ValueError, 'relu6'):
-      with self.cached_session():
-        est = keras_lib.model_to_estimator(
-            keras_model=keras_model,
-            model_dir=tempfile.mkdtemp(dir=self._base_dir))
-        est.train(input_fn=train_input_fn, steps=1)
-
-    with self.cached_session():
-      est = keras_lib.model_to_estimator(
-          keras_model=keras_model,
-          model_dir=tempfile.mkdtemp(dir=self._base_dir),
-          custom_objects=custom_objects)
-      est.train(input_fn=train_input_fn, steps=1)
-
-  def test_tf_config(self):
-    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer='rmsprop',
-        metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-    tf_config = json.dumps({
-        'cluster': {
-            run_config_lib.TaskType.PS: ['localhost:1234'],
-            run_config_lib.TaskType.WORKER: ['localhost:1236'],
-            run_config_lib.TaskType.MASTER: ['localhost:1238']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 0
-        }
-    })
-    with test.mock.patch.dict('os.environ', {'TF_CONFIG': tf_config}):
-      with self.cached_session():
-        keras_lib.model_to_estimator(
-            keras_model=keras_model,
-            model_dir=tempfile.mkdtemp(dir=self._base_dir))
-
-  def test_gpu_config(self):
-    with ops.Graph().as_default():
-      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-      keras_model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-      gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
-      sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
-      self._config._session_config = sess_config
-      with self.cached_session():
-        keras_lib.model_to_estimator(
-            keras_model=keras_model, config=self._config)
-        self.assertEqual(
-            keras.backend.get_session()
-            ._config.gpu_options.per_process_gpu_memory_fraction,
-            gpu_options.per_process_gpu_memory_fraction)
-
-  def test_with_empty_config(self):
-    keras_model, _, _, _, _ = get_resource_for_simple_model(
-        model_type='sequential', is_evaluate=True)
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer='rmsprop',
-        metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-    with self.cached_session():
-      est_keras = keras_lib.model_to_estimator(
-          keras_model=keras_model, model_dir=self._base_dir,
-          config=run_config_lib.RunConfig())
-      self.assertEqual(run_config_lib.get_default_session_config(),
-                       est_keras._session_config)
-      self.assertEqual(est_keras._session_config,
-                       est_keras._config.session_config)
-      self.assertEqual(self._base_dir, est_keras._config.model_dir)
-      self.assertEqual(self._base_dir, est_keras._model_dir)
-
-    with self.cached_session():
-      est_keras = keras_lib.model_to_estimator(
-          keras_model=keras_model, model_dir=self._base_dir,
-          config=None)
-      self.assertEqual(run_config_lib.get_default_session_config(),
-                       est_keras._session_config)
-      self.assertEqual(est_keras._session_config,
-                       est_keras._config.session_config)
-      self.assertEqual(self._base_dir, est_keras._config.model_dir)
-      self.assertEqual(self._base_dir, est_keras._model_dir)
-
-  def test_with_empty_config_and_empty_model_dir(self):
-    keras_model, _, _, _, _ = get_resource_for_simple_model(
-        model_type='sequential', is_evaluate=True)
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer='rmsprop',
-        metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-    with self.cached_session():
-      with test.mock.patch.object(tempfile, 'mkdtemp', return_value=_TMP_DIR):
-        est_keras = keras_lib.model_to_estimator(
-            keras_model=keras_model,
-            config=run_config_lib.RunConfig())
-        self.assertEqual(est_keras._model_dir, _TMP_DIR)
-
-  def test_with_conflicting_model_dir_and_config(self):
-    keras_model, _, _, _, _ = get_resource_for_simple_model(
-        model_type='sequential', is_evaluate=True)
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer='rmsprop',
-        metrics=['mse', keras.metrics.CategoricalAccuracy()])
-
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, '`model_dir` are set both in '
-                                   'constructor and `RunConfig`'):
-        keras_lib.model_to_estimator(
-            keras_model=keras_model, model_dir=self._base_dir,
-            config=run_config_lib.RunConfig(model_dir=_TMP_DIR))
-
-  def test_pretrained_weights(self):
-    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=['mse', keras.metrics.CategoricalAccuracy()])
-    with self.cached_session():
-      keras_model.train_on_batch(
-          np.random.random((10,) + _INPUT_SIZE),
-          np.random.random((10, _NUM_CLASS)))
-      weights = keras_model.get_weights()
-      keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
-      keras_model.set_weights(weights)
-      keras_model.compile(
-          loss='categorical_crossentropy',
-          optimizer=SGD(lr=0.0001, momentum=0.9),
-          metrics=['mse', keras.metrics.CategoricalAccuracy()])
-      keras_lib.model_to_estimator(
-          keras_model=keras_model, config=self._config)
-
-  def assert_increasing_global_step(self, optimizer):
-    keras_model, _, _, train_input_fn, _ = get_resource_for_simple_model(
-        model_type='sequential', is_evaluate=True)
-    keras_model.compile(
-        loss='categorical_crossentropy',
-        optimizer=optimizer,
-        metrics=['mse', keras.metrics.CategoricalAccuracy()])
-    with self.cached_session() as sess:
-      keras_model_fn = keras_lib._create_keras_model_fn(keras_model)
-      global_step = training_util.create_global_step()
-      features, labels = train_input_fn().make_one_shot_iterator().get_next()
-      spec = keras_model_fn(features, labels, mode=model_fn_lib.ModeKeys.TRAIN)
-
-      sess.run(variables.global_variables_initializer())
-      sess.run(variables.local_variables_initializer())
-
-      self.assertEqual(global_step.eval(), 0)  # Sanity check
-      sess.run(spec.train_op)
-      self.assertEqual(global_step.eval(), 1)
-
-  def test_model_fn_increments_global_step_tf_optimizer(self):
-    self.assert_increasing_global_step(rmsprop.RMSPropOptimizer(1e-3))
-
-  def test_model_fn_increments_global_step_keras_optimizer(self):
-    self.assert_increasing_global_step('rmsprop')
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index 824789467dcd69b5a6d63bf93d84ad022747550e..a206810f3d57684a2d0fff46e4bb914c07df66b4 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,509 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""model_fn python module.
 
-"""Classes and methods related to model_fn."""
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
+from tensorflow_estimator.python.estimator import model_fn
 
-import six
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+model_fn.__all__ = [s for s in dir(model_fn) if not s.startswith('__')]
 
-from tensorflow.python.estimator.export import export_output as export_output_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras.metrics import Metric
-from tensorflow.python.ops import array_ops
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.util import nest
-from tensorflow.python.util.tf_export import estimator_export
-
-
-@estimator_export('estimator.ModeKeys')
-class ModeKeys(object):
-  """Standard names for model modes.
-
-  The following standard keys are defined:
-
-  * `TRAIN`: training mode.
-  * `EVAL`: evaluation mode.
-  * `PREDICT`: inference mode.
-  """
-
-  TRAIN = 'train'
-  EVAL = 'eval'
-  PREDICT = 'infer'
-
-
-LOSS_METRIC_KEY = 'loss'
-AVERAGE_LOSS_METRIC_KEY = 'average_loss'
-
-# Mapping of the modes to appropriate tag_constants that are used for saving.
-EXPORT_TAG_MAP = {
-    ModeKeys.PREDICT: [tag_constants.SERVING],
-    ModeKeys.TRAIN: [tag_constants.TRAINING],
-    ModeKeys.EVAL: [tag_constants.EVAL],
-}
-
-
-@estimator_export('estimator.EstimatorSpec')
-class EstimatorSpec(
-    collections.namedtuple('EstimatorSpec', [
-        'mode', 'predictions', 'loss', 'train_op', 'eval_metric_ops',
-        'export_outputs', 'training_chief_hooks', 'training_hooks', 'scaffold',
-        'evaluation_hooks', 'prediction_hooks'
-    ])):
-  """Ops and objects returned from a `model_fn` and passed to an `Estimator`.
-
-  `EstimatorSpec` fully defines the model to be run by an `Estimator`.
-  """
-
-  def __new__(cls,
-              mode,
-              predictions=None,
-              loss=None,
-              train_op=None,
-              eval_metric_ops=None,
-              export_outputs=None,
-              training_chief_hooks=None,
-              training_hooks=None,
-              scaffold=None,
-              evaluation_hooks=None,
-              prediction_hooks=None):
-    """Creates a validated `EstimatorSpec` instance.
-
-    Depending on the value of `mode`, different arguments are required. Namely
-
-    * For `mode == ModeKeys.TRAIN`: required fields are `loss` and `train_op`.
-    * For `mode == ModeKeys.EVAL`: required field is `loss`.
-    * For `mode == ModeKeys.PREDICT`: required fields are `predictions`.
-
-    model_fn can populate all arguments independent of mode. In this case, some
-    arguments will be ignored by an `Estimator`. E.g. `train_op` will be
-    ignored in eval and infer modes. Example:
-
-    ```python
-    def my_model_fn(features, labels, mode):
-      predictions = ...
-      loss = ...
-      train_op = ...
-      return tf.estimator.EstimatorSpec(
-          mode=mode,
-          predictions=predictions,
-          loss=loss,
-          train_op=train_op)
-    ```
-
-    Alternatively, model_fn can just populate the arguments appropriate to the
-    given mode. Example:
-
-    ```python
-    def my_model_fn(features, labels, mode):
-      if (mode == tf.estimator.ModeKeys.TRAIN or
-          mode == tf.estimator.ModeKeys.EVAL):
-        loss = ...
-      else:
-        loss = None
-      if mode == tf.estimator.ModeKeys.TRAIN:
-        train_op = ...
-      else:
-        train_op = None
-      if mode == tf.estimator.ModeKeys.PREDICT:
-        predictions = ...
-      else:
-        predictions = None
-
-      return tf.estimator.EstimatorSpec(
-          mode=mode,
-          predictions=predictions,
-          loss=loss,
-          train_op=train_op)
-    ```
-
-    Args:
-      mode: A `ModeKeys`. Specifies if this is training, evaluation or
-        prediction.
-      predictions: Predictions `Tensor` or dict of `Tensor`.
-      loss: Training loss `Tensor`. Must be either scalar, or with shape `[1]`.
-      train_op: Op for the training step.
-      eval_metric_ops: Dict of metric results keyed by name.
-        The values of the dict can be one of the following:
-        (1) instance of `Metric` class.
-        (2) Results of calling a metric function, namely a
-        `(metric_tensor, update_op)` tuple. `metric_tensor` should be
-        evaluated without any impact on state (typically is a pure computation
-        results based on variables.). For example, it should not trigger the
-        `update_op` or requires any input fetching.
-      export_outputs: Describes the output signatures to be exported to
-        `SavedModel` and used during serving.
-        A dict `{name: output}` where:
-        * name: An arbitrary name for this output.
-        * output: an `ExportOutput` object such as `ClassificationOutput`,
-            `RegressionOutput`, or `PredictOutput`.
-        Single-headed models only need to specify one entry in this dictionary.
-        Multi-headed models should specify one entry for each head, one of
-        which must be named using
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY.
-        If no entry is provided, a default `PredictOutput` mapping to
-        `predictions` will be created.
-      training_chief_hooks: Iterable of `tf.train.SessionRunHook` objects to
-        run on the chief worker during training.
-      training_hooks: Iterable of `tf.train.SessionRunHook` objects to run
-        on all workers during training.
-      scaffold: A `tf.train.Scaffold` object that can be used to set
-        initialization, saver, and more to be used in training.
-      evaluation_hooks: Iterable of `tf.train.SessionRunHook` objects to
-        run during evaluation.
-      prediction_hooks: Iterable of `tf.train.SessionRunHook` objects to
-        run during predictions.
-
-    Returns:
-      A validated `EstimatorSpec` object.
-
-    Raises:
-      ValueError: If validation fails.
-      TypeError: If any of the arguments is not the expected type.
-    """
-    # Validate train_op.
-    if train_op is None:
-      if mode == ModeKeys.TRAIN:
-        raise ValueError('Missing train_op.')
-    else:
-      _check_is_tensor_or_operation(train_op, 'train_op')
-
-    # Validate loss.
-    if loss is None:
-      if mode in (ModeKeys.TRAIN, ModeKeys.EVAL):
-        raise ValueError('Missing loss.')
-    else:
-      loss = _check_is_tensor(loss, 'loss')
-      loss_shape = loss.get_shape()
-      if loss_shape.num_elements() not in (None, 1):
-        raise ValueError('Loss must be scalar, given: {}'.format(loss))
-      if not loss_shape.is_compatible_with(tensor_shape.scalar()):
-        loss = array_ops.reshape(loss, [])
-
-    # Validate predictions.
-    if predictions is None:
-      if mode == ModeKeys.PREDICT:
-        raise ValueError('Missing predictions.')
-      predictions = {}
-    else:
-      if isinstance(predictions, dict):
-        predictions = {
-            k: _check_is_tensor(v, 'predictions[{}]'.format(k))
-            for k, v in six.iteritems(predictions)
-        }
-      else:
-        predictions = _check_is_tensor(predictions, 'predictions')
-
-    # Validate eval_metric_ops.
-    if eval_metric_ops is None:
-      eval_metric_ops = {}
-    else:
-      if not isinstance(eval_metric_ops, dict):
-        raise TypeError(
-            'eval_metric_ops must be a dict, given: {}'.format(eval_metric_ops))
-      for key, value in six.iteritems(eval_metric_ops):
-        # TODO(psv): When we deprecate the old metrics, throw an error here if
-        # the value is not an instance of `Metric` class.
-        if isinstance(value, Metric):
-          if not value.updates:  # Check if metrics updates are available.
-            raise ValueError(
-                'Please call update_state(...) on the "{metric_name}" metric'
-                .format(metric_name=value.name))
-        else:
-          if not isinstance(value, tuple) or len(value) != 2:
-            raise TypeError(
-                'Values of eval_metric_ops must be (metric_value, update_op) '
-                'tuples, given: {} for key: {}'.format(value, key))
-          metric_value, metric_update = value
-          for metric_value_member in nest.flatten(metric_value):
-            # Allow (possibly nested) tuples for metric values, but require that
-            # each of them be Tensors or Operations.
-            _check_is_tensor_or_operation(metric_value_member,
-                                          'eval_metric_ops[{}]'.format(key))
-          _check_is_tensor_or_operation(metric_update,
-                                        'eval_metric_ops[{}]'.format(key))
-
-    # Validate the passed export outputs, or generate defaults.
-    if mode == ModeKeys.PREDICT:
-      export_outputs = _get_export_outputs(export_outputs, predictions)
-
-    # Validate that all tensors and ops are from the default graph.
-    default_graph = ops.get_default_graph()
-
-    # We enumerate possible error causes here to aid in debugging.
-    error_message_template = (
-        '{0} with "{1}" must be from the default graph. '
-        'Possible causes of this error include: \n\n'
-        '1) {0} was created outside the context of the default graph.'
-        '\n\n'
-        '2) The object passed through to EstimatorSpec was not created '
-        'in the most recent call to "model_fn".')
-
-    if isinstance(predictions, dict):
-      for key, value in six.iteritems(predictions):
-        if value.graph is not default_graph:
-          raise ValueError(error_message_template.format(
-              'prediction values',
-              '{0}: {1}'.format(key, value.name)))
-    elif predictions is not None:
-      # 'predictions' must be a single Tensor.
-      if predictions.graph is not default_graph:
-        raise ValueError(error_message_template.format(
-            'prediction values', predictions.name))
-
-    if loss is not None and loss.graph is not default_graph:
-      raise ValueError(error_message_template.format('loss', loss.name))
-    if train_op is not None and train_op.graph is not default_graph:
-      raise ValueError(error_message_template.format('train_op', train_op.name))
-    for key, value in list(six.iteritems(eval_metric_ops)):
-      if isinstance(value, Metric):
-        values_to_check = value.updates[:]
-        values_to_check.append(value.result())
-      else:
-        values_to_check = nest.flatten(value)
-      for val in values_to_check:
-        if val.graph is not default_graph:
-          raise ValueError(error_message_template.format(
-              'eval_metric_ops',
-              '{0}: {1}'.format(key, val.name)))
-
-    # Validate hooks.
-    training_chief_hooks = tuple(training_chief_hooks or [])
-    training_hooks = tuple(training_hooks or [])
-    evaluation_hooks = tuple(evaluation_hooks or [])
-    prediction_hooks = tuple(prediction_hooks or [])
-
-    for hook in (training_hooks + training_chief_hooks + evaluation_hooks +
-                 prediction_hooks):
-      if not isinstance(hook, session_run_hook.SessionRunHook):
-        raise TypeError(
-            'All hooks must be SessionRunHook instances, given: {}'.format(
-                hook))
-
-    # Add metric variables to the `LOCAL_VARIABLES` collection. Metric variables
-    # are by default not added to any collections. We are doing this here, so
-    # that metric variables get initialized.
-    local_vars = set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
-    vars_to_add = set()
-    for key, value in six.iteritems(eval_metric_ops):
-      if isinstance(value, Metric):
-        vars_to_add.update(value.variables)
-        # Convert Metric instances to (value_tensor, update_op) tuple.
-        eval_metric_ops[key] = (value.result(), value.updates[0])
-    # Remove variables that are in the local variables collection already.
-    vars_to_add = vars_to_add.difference(local_vars)
-    for v in vars_to_add:
-      ops.add_to_collection(ops.GraphKeys.LOCAL_VARIABLES, v)
-
-    scaffold = scaffold or monitored_session.Scaffold()
-    # Validate scaffold.
-    if not isinstance(scaffold, monitored_session.Scaffold):
-      raise TypeError(
-          'scaffold must be tf.train.Scaffold. Given: {}'.format(scaffold))
-
-    return super(EstimatorSpec, cls).__new__(
-        cls,
-        mode=mode,
-        predictions=predictions,
-        loss=loss,
-        train_op=train_op,
-        eval_metric_ops=eval_metric_ops,
-        export_outputs=export_outputs,
-        training_chief_hooks=training_chief_hooks,
-        training_hooks=training_hooks,
-        scaffold=scaffold,
-        evaluation_hooks=evaluation_hooks,
-        prediction_hooks=prediction_hooks)
-
-  def _replace(self, **kwds):
-    """Return a new EstimatorSpec replacing specified fields with new values."""
-    if 'mode' in kwds:
-      if self.mode != kwds['mode']:
-        raise ValueError('mode of EstimatorSpec cannot be changed.')
-    new_fields = map(kwds.pop, self._fields, list(self))
-    return EstimatorSpec(*new_fields)
-
-
-def _get_export_outputs(export_outputs, predictions):
-  """Validate export_outputs or create default export_outputs.
-
-  Args:
-    export_outputs: Describes the output signatures to be exported to
-      `SavedModel` and used during serving. Should be a dict or None.
-    predictions:  Predictions `Tensor` or dict of `Tensor`.
-
-  Returns:
-    Valid export_outputs dict
-
-  Raises:
-    TypeError: if export_outputs is not a dict or its values are not
-      ExportOutput instances.
-  """
-  if export_outputs is None:
-    default_output = export_output_lib.PredictOutput(predictions)
-    export_outputs = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: default_output}
-
-  if not isinstance(export_outputs, dict):
-    raise TypeError('export_outputs must be dict, given: {}'.format(
-        export_outputs))
-  for v in six.itervalues(export_outputs):
-    if not isinstance(v, export_output_lib.ExportOutput):
-      raise TypeError(
-          'Values in export_outputs must be ExportOutput objects. '
-          'Given: {}'.format(export_outputs))
-
-  _maybe_add_default_serving_output(export_outputs)
-
-  return export_outputs
-
-
-def _maybe_add_default_serving_output(export_outputs):
-  """Add a default serving output to the export_outputs if not present.
-
-  Args:
-    export_outputs: Describes the output signatures to be exported to
-      `SavedModel` and used during serving. Should be a dict.
-
-  Returns:
-    export_outputs dict with default serving signature added if necessary
-
-  Raises:
-    ValueError: if multiple export_outputs were provided without a default
-      serving key.
-  """
-  if len(export_outputs) == 1:
-    (key, value), = export_outputs.items()
-    if key != signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-      export_outputs[
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = value
-  if len(export_outputs) > 1:
-    if (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-        not in export_outputs):
-      raise ValueError(
-          'Multiple export_outputs were provided, but none of them is '
-          'specified as the default.  Do this by naming one of them with '
-          'signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY.')
-
-  return export_outputs
-
-
-class _TPUEstimatorSpec(
-    collections.namedtuple('TPUEstimatorSpec', [
-        'mode', 'predictions', 'loss', 'train_op', 'eval_metrics',
-        'export_outputs', 'scaffold_fn', 'host_call', 'training_hooks',
-        'evaluation_hooks', 'prediction_hooks'
-    ])):
-  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
-
-  This is a simplified implementation of `tf.contrib.tpu.EstimatorSpec`. See
-  tensorflow/contrib/tpu/python/tpu/tpu_estimator.py for more detailed
-  documentation.
-  """
-
-  def __new__(cls,
-              mode,
-              predictions=None,
-              loss=None,
-              train_op=None,
-              eval_metrics=None,
-              export_outputs=None,
-              scaffold_fn=None,
-              host_call=None,
-              training_hooks=None,
-              evaluation_hooks=None,
-              prediction_hooks=None):
-    """Creates a `_TPUEstimatorSpec` instance."""
-    return super(_TPUEstimatorSpec, cls).__new__(
-        cls,
-        mode=mode,
-        predictions=predictions,
-        loss=loss,
-        train_op=train_op,
-        eval_metrics=eval_metrics,
-        export_outputs=export_outputs,
-        scaffold_fn=scaffold_fn,
-        host_call=host_call,
-        training_hooks=training_hooks,
-        evaluation_hooks=evaluation_hooks,
-        prediction_hooks=prediction_hooks)
-
-  def as_estimator_spec(self):
-    """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
-    if not self.eval_metrics:
-      eval_metric_ops = None
-    else:
-      metric_fn, tensors = self.eval_metrics
-      eval_metric_ops = metric_fn(**tensors)
-    return EstimatorSpec(
-        mode=self.mode,
-        predictions=self.predictions,
-        loss=self.loss,
-        train_op=self.train_op,
-        eval_metric_ops=eval_metric_ops,
-        export_outputs=self.export_outputs,
-        training_hooks=self.training_hooks,
-        evaluation_hooks=self.evaluation_hooks,
-        prediction_hooks=self.prediction_hooks)
-
-
-def _check_is_tensor_or_operation(x, name):
-  if not (isinstance(x, ops.Operation) or ops.is_dense_tensor_like(x)):
-    raise TypeError('{} must be Operation or Tensor, given: {}'.format(name, x))
-
-
-def _check_is_tensor(x, tensor_name):
-  """Returns `x` if it is a `Tensor`, raises TypeError otherwise."""
-  if not ops.is_dense_tensor_like(x):
-    raise TypeError('{} must be Tensor, given: {}'.format(tensor_name, x))
-  return x
-
-
-def export_outputs_for_mode(
-    mode, serving_export_outputs=None, predictions=None, loss=None,
-    metrics=None):
-  """Util function for constructing a `ExportOutput` dict given a mode.
-
-  The returned dict can be directly passed to `build_all_signature_defs` helper
-  function as the `export_outputs` argument, used for generating a SignatureDef
-  map.
-
-  Args:
-    mode: A `ModeKeys` specifying the mode.
-    serving_export_outputs: Describes the output signatures to be exported to
-      `SavedModel` and used during serving. Should be a dict or None.
-    predictions: A dict of Tensors or single Tensor representing model
-        predictions. This argument is only used if serving_export_outputs is not
-        set.
-    loss: A dict of Tensors or single Tensor representing calculated loss.
-    metrics: A dict of (metric_value, update_op) tuples, or a single tuple.
-      metric_value must be a Tensor, and update_op must be a Tensor or Op
-
-  Returns:
-    Dictionary mapping the a key to an `tf.estimator.export.ExportOutput` object
-    The key is the expected SignatureDef key for the mode.
-
-  Raises:
-    ValueError: if an appropriate ExportOutput cannot be found for the mode.
-  """
-  # TODO(b/113185250): move all model export helper functions into an util file.
-  if mode == ModeKeys.PREDICT:
-    return _get_export_outputs(serving_export_outputs, predictions)
-  elif mode == ModeKeys.TRAIN:
-    return {mode: export_output_lib.TrainOutput(
-        loss=loss, predictions=predictions, metrics=metrics)}
-  elif mode == ModeKeys.EVAL:
-    return {mode: export_output_lib.EvalOutput(
-        loss=loss, predictions=predictions, metrics=metrics)}
-  else:
-    raise ValueError(
-        'Export output type not found for mode: {}'.format(mode))
+from tensorflow_estimator.python.estimator.model_fn import *
diff --git a/tensorflow/python/estimator/model_fn_test.py b/tensorflow/python/estimator/model_fn_test.py
deleted file mode 100644
index 8a3a9f3f51261369eddeb47d234154b5210895b3..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/model_fn_test.py
+++ /dev/null
@@ -1,661 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tests for model_fn.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.estimator import model_fn
-from tensorflow.python.estimator.export import export_output
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.keras import metrics
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.platform import test
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import session_run_hook
-
-
-class _FakeHook(session_run_hook.SessionRunHook):
-  """Fake implementation of `SessionRunHook`."""
-
-
-class _InvalidHook(object):
-  """Invalid hook (not a subclass of `SessionRunHook`)."""
-
-
-class _InvalidScaffold(object):
-  """Invalid scaffold (not a subclass of `Scaffold`)."""
-
-
-class EstimatorSpecTrainTest(test.TestCase):
-  """Tests EstimatorSpec in train mode."""
-
-  def testRequiredArgumentsSet(self):
-    """Tests that no errors are raised when all required arguments are set."""
-    with ops.Graph().as_default(), self.cached_session():
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.TRAIN,
-          loss=constant_op.constant(1.),
-          train_op=control_flow_ops.no_op())
-
-  def testAllArgumentsSet(self):
-    """Tests that no errors are raised when all arguments are set."""
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      predictions = {'loss': loss}
-      classes = constant_op.constant('hello')
-      metric_obj = metrics.Mean()
-      metric_obj.update_state(loss)
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.TRAIN,
-          predictions=predictions,
-          loss=loss,
-          train_op=control_flow_ops.no_op(),
-          eval_metric_ops={
-              'loss': (control_flow_ops.no_op(), loss),
-              'mean': metric_obj,
-          },
-          export_outputs={
-              'head_name': export_output.ClassificationOutput(classes=classes)
-          },
-          training_chief_hooks=[_FakeHook()],
-          training_hooks=[_FakeHook()],
-          scaffold=monitored_session.Scaffold(),
-          evaluation_hooks=[_FakeHook()],
-          prediction_hooks=[_FakeHook()])
-
-  def testLossNumber(self):
-    """Tests that error is raised when loss is a number (not Tensor)."""
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(TypeError, 'loss must be Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN,
-            loss=1.,
-            train_op=control_flow_ops.no_op())
-
-  def testLoss1DTensor(self):
-    """Tests that no errors are raised when loss is 1D tensor."""
-    with ops.Graph().as_default(), self.cached_session():
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.TRAIN,
-          loss=constant_op.constant([1.]),
-          train_op=control_flow_ops.no_op())
-
-  def testLossMissing(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'Missing loss'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN, train_op=control_flow_ops.no_op())
-
-  def testLossNotScalar(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'Loss must be scalar'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN,
-            loss=constant_op.constant([1., 2.]),
-            train_op=control_flow_ops.no_op())
-
-  def testLossSparseTensor(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = sparse_tensor.SparseTensor(
-          indices=[[0]],
-          values=[0.],
-          dense_shape=[1])
-      with self.assertRaisesRegexp(TypeError, 'loss must be Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN,
-            loss=loss,
-            train_op=control_flow_ops.no_op())
-
-  def testLossFromDifferentGraph(self):
-    with ops.Graph().as_default():
-      loss = constant_op.constant(1.)
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'must be from the default graph'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN,
-            loss=loss,
-            train_op=control_flow_ops.no_op())
-
-  def testTrainOpMissing(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'Missing train_op'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN, loss=constant_op.constant(1.))
-
-  def testTrainOpNotOperationAndTensor(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(TypeError,
-                                   'train_op must be Operation or Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN,
-            loss=constant_op.constant(1.),
-            train_op='Not an Operation or Tensor')
-
-  def testTrainOpFromDifferentGraph(self):
-    with ops.Graph().as_default():
-      train_op = control_flow_ops.no_op()
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'must be from the default graph'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN,
-            loss=constant_op.constant(1.),
-            train_op=train_op)
-
-  def testTrainingChiefHookInvalid(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          TypeError, 'All hooks must be SessionRunHook instances'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN,
-            loss=constant_op.constant(1.),
-            train_op=control_flow_ops.no_op(),
-            training_chief_hooks=[_InvalidHook()])
-
-  def testTrainingHookInvalid(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          TypeError, 'All hooks must be SessionRunHook instances'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN,
-            loss=constant_op.constant(1.),
-            train_op=control_flow_ops.no_op(),
-            training_hooks=[_InvalidHook()])
-
-  def testScaffoldInvalid(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          TypeError, r'scaffold must be tf\.train\.Scaffold'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.TRAIN,
-            loss=constant_op.constant(1.),
-            train_op=control_flow_ops.no_op(),
-            scaffold=_InvalidScaffold())
-
-  def testReturnDefaultScaffold(self):
-    with ops.Graph().as_default(), self.cached_session():
-      estimator_spec = model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.TRAIN,
-          loss=constant_op.constant(1.),
-          train_op=control_flow_ops.no_op())
-      self.assertIsNotNone(estimator_spec.scaffold)
-
-
-class EstimatorSpecEvalTest(test.TestCase):
-  """Tests EstimatorSpec in eval mode."""
-
-  def testRequiredArgumentsSet(self):
-    """Tests that no errors are raised when all required arguments are set."""
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL,
-          predictions={'loss': loss},
-          loss=loss)
-
-  def testAllArgumentsSet(self):
-    """Tests that no errors are raised when all arguments are set."""
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      predictions = {'loss': loss}
-      classes = constant_op.constant('hello')
-      metric_obj = metrics.Mean()
-      metric_obj.update_state(loss)
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL,
-          predictions=predictions,
-          loss=loss,
-          train_op=control_flow_ops.no_op(),
-          eval_metric_ops={
-              'loss': (control_flow_ops.no_op(), loss),
-              'mean': metric_obj,
-          },
-          export_outputs={
-              'head_name': export_output.ClassificationOutput(classes=classes)
-          },
-          training_chief_hooks=[_FakeHook()],
-          training_hooks=[_FakeHook()],
-          scaffold=monitored_session.Scaffold(),
-          evaluation_hooks=[_FakeHook()])
-
-  def testEvaluationHookInvalid(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          TypeError, 'All hooks must be SessionRunHook instances'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            loss=constant_op.constant(1.),
-            evaluation_hooks=[_InvalidHook()])
-
-  def testTupleMetric(self):
-    """Tests that no errors are raised when a metric is tuple-valued."""
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL,
-          loss=loss,
-          eval_metric_ops={
-              'some_metric': ((loss, loss, (constant_op.constant(2), loss)),
-                              control_flow_ops.no_op())})
-
-  def testLoss1DTensor(self):
-    """Tests that no errors are raised when loss is 1D tensor."""
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant([1.])
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL,
-          predictions={'loss': loss},
-          loss=loss)
-
-  def testLossNumber(self):
-    """Tests that error is raised when loss is a number (not Tensor)."""
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(TypeError, 'loss must be Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': constant_op.constant(1.)},
-            loss=1.)
-
-  def testLossMissing(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'Missing loss'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': constant_op.constant(1.)})
-
-  def testLossNotScalar(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant([1., 2.])
-      with self.assertRaisesRegexp(ValueError, 'Loss must be scalar'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': loss},
-            loss=loss)
-
-  def testLossSparseTensor(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = sparse_tensor.SparseTensor(
-          indices=[[0]],
-          values=[0.],
-          dense_shape=[1])
-      with self.assertRaisesRegexp(
-          TypeError, 'loss must be Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'prediction': constant_op.constant(1.)},
-            loss=loss)
-
-  def testLossFromDifferentGraph(self):
-    with ops.Graph().as_default():
-      loss = constant_op.constant(1.)
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'must be from the default graph'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'prediction': constant_op.constant(1.)},
-            loss=loss)
-
-  def testReplaceRaisesConstructorChecks(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      spec = model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL, predictions={'loss': loss}, loss=loss)
-      with self.assertRaisesRegexp(ValueError, 'Loss must be scalar'):
-        spec._replace(loss=constant_op.constant([1., 2.]))
-
-  def testReplaceDoesReplace(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      spec = model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL, predictions={'loss': loss}, loss=loss)
-      new_spec = spec._replace(predictions={'m': loss})
-      self.assertEqual(['m'], list(new_spec.predictions.keys()))
-
-  def testReplaceNotAllowModeChange(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      spec = model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL, predictions={'loss': loss}, loss=loss)
-      spec._replace(mode=model_fn.ModeKeys.EVAL)
-      with self.assertRaisesRegexp(ValueError,
-                                   'mode of EstimatorSpec cannot be changed'):
-        spec._replace(mode=model_fn.ModeKeys.TRAIN)
-
-  def testPredictionsMissingIsOkay(self):
-    with ops.Graph().as_default(), self.cached_session():
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL, loss=constant_op.constant(1.))
-
-  def testPredictionsTensor(self):
-    """Tests that no error is raised when predictions is Tensor (not dict)."""
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.EVAL,
-          predictions=loss,
-          loss=loss)
-
-  def testPredictionsNumber(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          TypeError, r'predictions\[number\] must be Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'number': 1.},
-            loss=constant_op.constant(1.))
-
-  def testPredictionsSparseTensor(self):
-    with ops.Graph().as_default(), self.cached_session():
-      predictions = {
-          'sparse': sparse_tensor.SparseTensor(
-              indices=[[0]],
-              values=[0.],
-              dense_shape=[1])}
-      with self.assertRaisesRegexp(
-          TypeError, r'predictions\[sparse\] must be Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions=predictions,
-            loss=constant_op.constant(1.))
-
-  def testPredictionsFromDifferentGraph(self):
-    with ops.Graph().as_default():
-      predictions = {'loss': constant_op.constant(1.)}
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          ValueError, 'must be from the default graph'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions=predictions,
-            loss=constant_op.constant(1.))
-
-  def testEvalMetricOpsNoDict(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      with self.assertRaisesRegexp(
-          TypeError, 'eval_metric_ops must be a dict'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': loss},
-            loss=loss,
-            eval_metric_ops=loss)
-
-  def testEvalMetricOpsNoTuple(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      with self.assertRaisesRegexp(
-          TypeError,
-          (r'Values of eval_metric_ops must be \(metric_value, update_op\) '
-           'tuples')):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': loss},
-            loss=loss,
-            eval_metric_ops={'loss': loss})
-
-  def testEvalMetricOpsNoTensorOrOperation(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      with self.assertRaisesRegexp(TypeError, 'must be Operation or Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': loss},
-            loss=loss,
-            eval_metric_ops={'loss': ('NonTensor', loss)})
-
-  def testEvalMetricNestedNoTensorOrOperation(self):
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      with self.assertRaisesRegexp(TypeError, 'must be Operation or Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': loss},
-            loss=loss,
-            eval_metric_ops={'loss': ((('NonTensor',),),
-                                      control_flow_ops.no_op())})
-
-  def testEvalMetricOpsFromDifferentGraphWithMetricTuple(self):
-    with ops.Graph().as_default():
-      eval_metric_ops = {
-          'loss': (control_flow_ops.no_op(), constant_op.constant(1.))}
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      with self.assertRaisesRegexp(
-          ValueError, 'must be from the default graph'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': loss},
-            loss=loss,
-            eval_metric_ops=eval_metric_ops)
-
-  def testEvalMetricOpsFromDifferentGraphWithMetricObject(self):
-    with ops.Graph().as_default():
-      metric_obj = metrics.Mean()
-      metric_obj.update_state(constant_op.constant(1.))
-      eval_metric_ops = {'metric': metric_obj}
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      with self.assertRaisesRegexp(
-          ValueError, 'must be from the default graph'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': loss},
-            loss=loss,
-            eval_metric_ops=eval_metric_ops)
-
-  def testEvalMetricOpsWithoutUpdates(self):
-    with ops.Graph().as_default():
-      eval_metric_ops = {'mean': metrics.Mean()}
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      with self.assertRaisesRegexp(ValueError, 'Please call update_state(...)'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.EVAL,
-            predictions={'loss': loss},
-            loss=loss,
-            eval_metric_ops=eval_metric_ops)
-
-
-class EstimatorSpecInferTest(test.TestCase):
-  """Tests EstimatorSpec in infer mode."""
-
-  def testRequiredArgumentsSet(self):
-    """Tests that no errors are raised when all required arguments are set."""
-    with ops.Graph().as_default(), self.cached_session():
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.PREDICT,
-          predictions={'loss': constant_op.constant(1.)})
-
-  def testAllArgumentsSet(self):
-    """Tests that no errors are raised when all arguments are set."""
-    with ops.Graph().as_default(), self.cached_session():
-      loss = constant_op.constant(1.)
-      predictions = {'loss': loss}
-      classes = constant_op.constant('hello')
-      metric_obj = metrics.Mean()
-      metric_obj.update_state(loss)
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.PREDICT,
-          predictions=predictions,
-          loss=loss,
-          train_op=control_flow_ops.no_op(),
-          eval_metric_ops={
-              'loss': (control_flow_ops.no_op(), loss),
-              'mean': metric_obj,
-          },
-          export_outputs={
-              'head_name': export_output.ClassificationOutput(classes=classes)
-          },
-          training_chief_hooks=[_FakeHook()],
-          training_hooks=[_FakeHook()],
-          scaffold=monitored_session.Scaffold(),
-          evaluation_hooks=[_FakeHook()],
-          prediction_hooks=[_FakeHook()])
-
-  def testPredictionHookInvalid(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          TypeError, 'All hooks must be SessionRunHook instances'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.PREDICT,
-            predictions=constant_op.constant(1.),
-            prediction_hooks=[_InvalidHook()])
-
-  def testPredictionsMissing(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'Missing predictions'):
-        model_fn.EstimatorSpec(mode=model_fn.ModeKeys.PREDICT)
-
-  def testPredictionsTensor(self):
-    """Tests that no error is raised when predictions is Tensor (not dict)."""
-    with ops.Graph().as_default(), self.cached_session():
-      model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.PREDICT, predictions=constant_op.constant(1.))
-
-  def testPredictionsNumber(self):
-    with ops.Graph().as_default(), self.cached_session():
-      with self.assertRaisesRegexp(
-          TypeError, r'predictions\[number\] must be Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.PREDICT, predictions={'number': 1.})
-
-  def testPredictionsSparseTensor(self):
-    with ops.Graph().as_default(), self.cached_session():
-      predictions = {
-          'sparse': sparse_tensor.SparseTensor(
-              indices=[[0]],
-              values=[0.],
-              dense_shape=[1])}
-      with self.assertRaisesRegexp(
-          TypeError, r'predictions\[sparse\] must be Tensor'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.PREDICT, predictions=predictions)
-
-  def testExportOutputsNoDict(self):
-    with ops.Graph().as_default(), self.cached_session():
-      predictions = {'loss': constant_op.constant(1.)}
-      classes = constant_op.constant('hello')
-      with self.assertRaisesRegexp(
-          TypeError, 'export_outputs must be dict'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.PREDICT,
-            predictions=predictions,
-            export_outputs=export_output.ClassificationOutput(classes=classes))
-
-  def testExportOutputsValueNotExportOutput(self):
-    with ops.Graph().as_default(), self.cached_session():
-      predictions = {'loss': constant_op.constant(1.)}
-      with self.assertRaisesRegexp(
-          TypeError,
-          r"Values in export_outputs must be ExportOutput objects. "
-          r"Given: {'head_name': {'loss': <tf.Tensor 'Const:0' shape=\(\) "
-          r"dtype=float32>}}"):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.PREDICT,
-            predictions=predictions,
-            export_outputs={'head_name': predictions})
-
-  def testExportOutputsSingleheadMissingDefault(self):
-    with ops.Graph().as_default(), self.cached_session():
-      predictions = {'loss': constant_op.constant(1.)}
-      output_1 = constant_op.constant([1.])
-      regression_output = export_output.RegressionOutput(value=output_1)
-      export_outputs = {
-          'head-1': regression_output,
-          }
-      estimator_spec = model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.PREDICT,
-          predictions=predictions,
-          export_outputs=export_outputs)
-      expected_export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-          regression_output,
-          'head-1': regression_output,
-      }
-      self.assertEqual(expected_export_outputs, estimator_spec.export_outputs)
-
-  def testExportOutputsMultiheadWithDefault(self):
-    with ops.Graph().as_default(), self.cached_session():
-      predictions = {'loss': constant_op.constant(1.)}
-      output_1 = constant_op.constant([1.])
-      output_2 = constant_op.constant(['2'])
-      output_3 = constant_op.constant(['3'])
-      export_outputs = {
-          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-          export_output.RegressionOutput(value=output_1),
-          'head-2': export_output.ClassificationOutput(classes=output_2),
-          'head-3': export_output.PredictOutput(outputs={
-              'some_output_3': output_3
-          })}
-      estimator_spec = model_fn.EstimatorSpec(
-          mode=model_fn.ModeKeys.PREDICT,
-          predictions=predictions,
-          export_outputs=export_outputs)
-      self.assertEqual(export_outputs, estimator_spec.export_outputs)
-
-  def testExportOutputsMultiheadMissingDefault(self):
-    with ops.Graph().as_default(), self.cached_session():
-      predictions = {'loss': constant_op.constant(1.)}
-      output_1 = constant_op.constant([1.])
-      output_2 = constant_op.constant(['2'])
-      output_3 = constant_op.constant(['3'])
-      export_outputs = {
-          'head-1': export_output.RegressionOutput(value=output_1),
-          'head-2': export_output.ClassificationOutput(classes=output_2),
-          'head-3': export_output.PredictOutput(outputs={
-              'some_output_3': output_3
-          })}
-      with self.assertRaisesRegexp(
-          ValueError,
-          'Multiple export_outputs were provided, but none of them is '
-          'specified as the default.  Do this by naming one of them with '
-          'signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY.'):
-        model_fn.EstimatorSpec(
-            mode=model_fn.ModeKeys.PREDICT,
-            predictions=predictions,
-            export_outputs=export_outputs)
-
-  def testDefaultExportOutputCreated(self):
-    """Ensure that a default PredictOutput is created for export."""
-    with ops.Graph().as_default(), self.cached_session():
-      predictions = constant_op.constant(1.)
-      self._assertDefaultExportOutputForPredictions(predictions)
-
-  def testDefaultExportOutputCreatedDict(self):
-    """Ensure that a default PredictOutput is created for export for dicts."""
-    with ops.Graph().as_default(), self.cached_session():
-      predictions = {'loss': constant_op.constant(1.),
-                     'score': constant_op.constant(10.)}
-      self._assertDefaultExportOutputForPredictions(predictions)
-
-  def _assertDefaultExportOutputForPredictions(self, predictions):
-    spec = model_fn.EstimatorSpec(
-        mode=model_fn.ModeKeys.PREDICT, predictions=predictions)
-
-    expected = export_output.PredictOutput(predictions).outputs
-    serving_output = spec.export_outputs[
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
-    self.assertEqual(serving_output.outputs, expected)
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 3773810a0461354c0343bb713551034ef0156fdd..cb3c731d1ae4f69c620f860319b666dfb1f4d76c 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,904 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Environment configuration object for Estimators."""
+"""run_config python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-import json
-import os
-
-import six
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.distribute import estimator_training as distribute_coordinator_training
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import server_lib
-from tensorflow.python.util import compat_internal
-from tensorflow.python.util import function_utils
-from tensorflow.python.util.tf_export import estimator_export
-
-
-_USE_DEFAULT = object()
-_VALID_DEVICE_FN_ARGS = set(['op'])
-
-# A list of the property names in RunConfig that the user is allowed to change.
-_DEFAULT_REPLACEABLE_LIST = [
-    'model_dir',
-    'tf_random_seed',
-    'save_summary_steps',
-    'save_checkpoints_steps',
-    'save_checkpoints_secs',
-    'session_config',
-    'keep_checkpoint_max',
-    'keep_checkpoint_every_n_hours',
-    'log_step_count_steps',
-    'train_distribute',
-    'device_fn',
-    'protocol',
-    'eval_distribute',
-    'experimental_distribute',
-]
-
-_SAVE_CKPT_ERR = (
-    '`save_checkpoints_steps` and `save_checkpoints_secs` cannot be both set.'
-)
-
-_TF_CONFIG_ENV = 'TF_CONFIG'
-_TASK_ENV_KEY = 'task'
-_TASK_TYPE_KEY = 'type'
-_TASK_ID_KEY = 'index'
-_CLUSTER_KEY = 'cluster'
-_SERVICE_KEY = 'service'
-_SESSION_MASTER_KEY = 'session_master'
-_EVAL_SESSION_MASTER_KEY = 'eval_session_master'
-_MODEL_DIR_KEY = 'model_dir'
-_LOCAL_MASTER = ''
-_GRPC_SCHEME = 'grpc://'
-
-
-def _get_session_master(cluster_spec, task_type, task_id, tf_config):
-  """Returns the appropriate address for TensorFlow master.
-
-  The order of precedence to deteremine the TF session master is as follows:
-  1. If `tf_session_master` is set in TF_CONFIG environment variable, takes it.
-  2. If the cluster has only one node, returns empty string ''.
-  3. Returns the grpc address according to the task type and id in the cluster.
-     This is between-graph replication.
-
-  Note: task_type and task_id must be validated. Typically, validated using
-  `_validate_task_type_and_task_id`.
-
-  Args:
-    cluster_spec: A `ClusterSpec` instance.
-    task_type: String. Task type for current node.
-    task_id: Int. Task id for current node.
-    tf_config: Dict. Python dict for the TF_CONFIG environment variable.
-
-  Raises:
-    RuntimeError: If `cluster_spec` is not set.
-
-  """
-  if _SESSION_MASTER_KEY in tf_config:
-    return tf_config[_SESSION_MASTER_KEY]
-
-  if not cluster_spec:
-    raise RuntimeError('Internal error: `_get_session_master` '
-                       'does not expect empty cluster_spec.')
-
-  jobs = cluster_spec.jobs
-
-  # If there is only one node in the cluster, do things locally by setting
-  # master to ''.  If a service or user sets TF_CONFIG with a single node, it's
-  # more performant to use a direct master rather than an RPC service.
-  if len(jobs) == 1 and len(cluster_spec.job_tasks(jobs[0])) == 1:
-    return _LOCAL_MASTER
-
-  # Lookup the master in cluster_spec using task_type and task_id,
-  # if possible.
-  addresses = cluster_spec.job_tasks(task_type)
-  return _GRPC_SCHEME + addresses[task_id]
-
-
-def _get_eval_session_master(task_type, tf_config):
-  """Returns the appropriate address for TensorFlow evaluation master."""
-  if task_type == TaskType.EVALUATOR:
-    return tf_config.get(_EVAL_SESSION_MASTER_KEY, _LOCAL_MASTER)
-
-  if _EVAL_SESSION_MASTER_KEY in tf_config:
-    raise ValueError('Key ({}) should not be set for task type other than {}. '
-                     'Task type: {}'.format(_EVAL_SESSION_MASTER_KEY,
-                                            TaskType.EVALUATOR, task_type))
-  return _LOCAL_MASTER
-
-
-def _count_ps(cluster_spec):
-  """Counts the number of parameter servers in cluster_spec."""
-  if not cluster_spec:
-    raise RuntimeError(
-        'Internal error: `_count_ps` does not expect empty cluster_spec.')
-
-  return len(cluster_spec.as_dict().get(TaskType.PS, []))
-
-
-def _count_worker(cluster_spec, chief_task_type):
-  """Counts the number of workers (including chief) in cluster_spec."""
-  if not cluster_spec:
-    raise RuntimeError(
-        'Internal error: `_count_worker` does not expect empty cluster_spec.')
-
-  return (len(cluster_spec.as_dict().get(TaskType.WORKER, [])) +
-          len(cluster_spec.as_dict().get(chief_task_type, [])))
-
-
-def _validate_service(service):
-  """Validates the service key."""
-  if service is not None and not isinstance(service, dict):
-    raise TypeError(
-        'If "service" is set in TF_CONFIG, it must be a dict. Given %s' %
-        type(service))
-  return service
-
-
-def _validate_task_type_and_task_id(cluster_spec, task_env, chief_task_type):
-  """Validates the task type and index in `task_env` according to cluster."""
-  if chief_task_type not in cluster_spec.jobs:
-    raise ValueError(
-        'If "cluster" is set in TF_CONFIG, it must have one "%s" node.' %
-        chief_task_type)
-  if len(cluster_spec.job_tasks(chief_task_type)) > 1:
-    raise ValueError(
-        'The "cluster" in TF_CONFIG must have only one "%s" node.' %
-        chief_task_type)
-
-  task_type = task_env.get(_TASK_TYPE_KEY, None)
-  task_id = task_env.get(_TASK_ID_KEY, None)
-
-  if not task_type:
-    raise ValueError(
-        'If "cluster" is set in TF_CONFIG, task type must be set.')
-  if task_id is None:
-    raise ValueError(
-        'If "cluster" is set in TF_CONFIG, task index must be set.')
-
-  task_id = int(task_id)
-
-  # Check the task id bounds. Upper bound is not necessary as
-  # - for evaluator, there is no upper bound.
-  # - for non-evaluator, task id is upper bounded by the number of jobs in
-  # cluster spec, which will be checked later (when retrieving the `master`)
-  if task_id < 0:
-    raise ValueError('Task index must be non-negative number.')
-
-  # Evaluator is not part of the training cluster.
-  if task_type == TaskType.EVALUATOR:
-    return task_type, task_id
-
-  if task_type not in cluster_spec.jobs:
-    raise ValueError(
-        '%s is not a valid task_type in the cluster_spec:\n'
-        '%s\n\n'
-        'Note that these values may be coming from the TF_CONFIG environment '
-        'variable.' % (task_type, cluster_spec))
-  addresses = cluster_spec.job_tasks(task_type)
-  if not 0 <= task_id < len(addresses):
-    raise ValueError(
-        '%d is not a valid task_id for task_type %s in the cluster_spec:\n'
-        '%s\n\n'
-        'Note that these values may be coming from the TF_CONFIG environment '
-        'variable.' % (task_id, task_type, cluster_spec))
-
-  return task_type, task_id
-
-
-def _get_global_id_in_cluster(
-    cluster_spec, task_type, task_id, chief_task_type):
-  """Returns the global id in cluster."""
-  # Note: This is implementation details, which user should not rely on.
-  # The first id is 0, which is always for the `chief` node. All other nodes,
-  # except `ps`, are ordered alphabetical based on task type (alphabetically)
-  # and task id (ascendingly). `ps` are ordered last.
-
-  # Sort task names in cluster
-  task_type_ordered_list = [chief_task_type]
-  task_type_ordered_list.extend([
-      t for t in sorted(cluster_spec.jobs)
-      if t != chief_task_type and t != TaskType.PS
-  ])
-  if TaskType.PS in cluster_spec.jobs:
-    task_type_ordered_list.append(TaskType.PS)
-
-  next_global_id = 0
-  for t in task_type_ordered_list:
-    if t == task_type:
-      return next_global_id + task_id
-    next_global_id += len(cluster_spec.job_tasks(t))
-
-  # This should never happen.
-  raise RuntimeError('Internal Error: `task_type` ({}) is not in '
-                     'cluster_spec ({}).'.format(task_type, cluster_spec))
-
-
-def _validate_save_ckpt_with_replaced_keys(new_copy, replaced_keys):
-  """Validates the save ckpt properties."""
-  # Ensure one (and only one) of save_steps and save_secs is not None.
-  # Also, if user sets one save ckpt property, say steps, the other one (secs)
-  # should be set as None to improve usability.
-
-  save_steps = new_copy.save_checkpoints_steps
-  save_secs = new_copy.save_checkpoints_secs
-
-  if ('save_checkpoints_steps' in replaced_keys and
-      'save_checkpoints_secs' in replaced_keys):
-    # If user sets both properties explicitly, we need to error out if both
-    # are set or neither of them are set.
-    if save_steps is not None and save_secs is not None:
-      raise ValueError(_SAVE_CKPT_ERR)
-  elif 'save_checkpoints_steps' in replaced_keys and save_steps is not None:
-    new_copy._save_checkpoints_secs = None  # pylint: disable=protected-access
-  elif 'save_checkpoints_secs' in replaced_keys and save_secs is not None:
-    new_copy._save_checkpoints_steps = None  # pylint: disable=protected-access
-
-
-def _validate_properties(run_config):
-  """Validates the properties."""
-  def _validate(property_name, cond, message):
-    property_value = getattr(run_config, property_name)
-    if property_value is not None and not cond(property_value):
-      raise ValueError(message)
-
-  _validate('model_dir', lambda dir: dir,
-            message='model_dir should be non-empty')
-
-  _validate('save_summary_steps', lambda steps: steps >= 0,
-            message='save_summary_steps should be >= 0')
-
-  _validate('save_checkpoints_steps', lambda steps: steps >= 0,
-            message='save_checkpoints_steps should be >= 0')
-  _validate('save_checkpoints_secs', lambda secs: secs >= 0,
-            message='save_checkpoints_secs should be >= 0')
-
-  _validate('session_config',
-            lambda sc: isinstance(sc, config_pb2.ConfigProto),
-            message='session_config must be instance of ConfigProto')
-
-  _validate('keep_checkpoint_max', lambda keep_max: keep_max >= 0,
-            message='keep_checkpoint_max should be >= 0')
-  _validate('keep_checkpoint_every_n_hours', lambda keep_hours: keep_hours > 0,
-            message='keep_checkpoint_every_n_hours should be > 0')
-  _validate('log_step_count_steps', lambda num_steps: num_steps > 0,
-            message='log_step_count_steps should be > 0')
-
-  _validate('tf_random_seed', lambda seed: isinstance(seed, six.integer_types),
-            message='tf_random_seed must be integer.')
-
-  _validate('device_fn', lambda device_fn: six.callable(device_fn) and
-            set(function_utils.fn_args(device_fn)) == _VALID_DEVICE_FN_ARGS,
-            message='device_fn must be callable with exactly'
-                    ' one argument "op".')
-
-  _validate('protocol',
-            lambda protocol: protocol in (None, "grpc", "grpc+verbs"),
-            message='protocol should be grpc or grpc+verbs')
-
-
-def get_default_session_config():
-  """Returns tf.ConfigProto instance."""
-
-  rewrite_opts = rewriter_config_pb2.RewriterConfig(
-      meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE)
-  graph_opts = config_pb2.GraphOptions(rewrite_options=rewrite_opts)
-
-  return config_pb2.ConfigProto(allow_soft_placement=True,
-                                graph_options=graph_opts)
-
-
-class TaskType(object):
-  MASTER = 'master'
-  PS = 'ps'
-  WORKER = 'worker'
-  CHIEF = 'chief'
-  EVALUATOR = 'evaluator'
-
-
-@estimator_export('estimator.RunConfig')
-class RunConfig(object):
-  """This class specifies the configurations for an `Estimator` run."""
-
-  def __init__(self,
-               model_dir=None,
-               tf_random_seed=None,
-               save_summary_steps=100,
-               save_checkpoints_steps=_USE_DEFAULT,
-               save_checkpoints_secs=_USE_DEFAULT,
-               session_config=None,
-               keep_checkpoint_max=5,
-               keep_checkpoint_every_n_hours=10000,
-               log_step_count_steps=100,
-               train_distribute=None,
-               device_fn=None,
-               protocol=None,
-               eval_distribute=None,
-               experimental_distribute=None):
-    """Constructs a RunConfig.
-
-    All distributed training related properties `cluster_spec`, `is_chief`,
-    `master` , `num_worker_replicas`, `num_ps_replicas`, `task_id`, and
-    `task_type` are set based on the `TF_CONFIG` environment variable, if the
-    pertinent information is present. The `TF_CONFIG` environment variable is a
-    JSON object with attributes: `cluster` and `task`.
-
-    `cluster` is a JSON serialized version of `ClusterSpec`'s Python dict from
-    `server_lib.py`, mapping task types (usually one of the `TaskType` enums) to
-    a list of task addresses.
-
-    `task` has two attributes: `type` and `index`, where `type` can be any of
-    the task types in `cluster`. When `TF_CONFIG` contains said information,
-    the following properties are set on this class:
-
-    * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If
-      present, must have one and only one node in the `chief` attribute of
-      `cluster_spec`.
-    * `task_type` is set to `TF_CONFIG['task']['type']`. Must set if
-      `cluster_spec` is present; must be `worker` (the default value) if
-      `cluster_spec` is not set.
-    * `task_id` is set to `TF_CONFIG['task']['index']`. Must set if
-      `cluster_spec` is present; must be 0 (the default value) if
-      `cluster_spec` is not set.
-    * `master` is determined by looking up `task_type` and `task_id` in the
-      `cluster_spec`. Defaults to ''.
-    * `num_ps_replicas` is set by counting the number of nodes listed
-      in the `ps` attribute of `cluster_spec`. Defaults to 0.
-    * `num_worker_replicas` is set by counting the number of nodes listed
-      in the `worker` and `chief` attributes of `cluster_spec`. Defaults to 1.
-    * `is_chief` is determined based on `task_type` and `cluster`.
-
-    There is a special node with `task_type` as `evaluator`, which is not part
-    of the (training) `cluster_spec`. It handles the distributed evaluation job.
-
-    Example of non-chief node:
-    ```
-      cluster = {'chief': ['host0:2222'],
-                 'ps': ['host1:2222', 'host2:2222'],
-                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
-      os.environ['TF_CONFIG'] = json.dumps(
-          {'cluster': cluster,
-           'task': {'type': 'worker', 'index': 1}})
-      config = RunConfig()
-      assert config.master == 'host4:2222'
-      assert config.task_id == 1
-      assert config.num_ps_replicas == 2
-      assert config.num_worker_replicas == 4
-      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
-      assert config.task_type == 'worker'
-      assert not config.is_chief
-    ```
-
-    Example of chief node:
-    ```
-      cluster = {'chief': ['host0:2222'],
-                 'ps': ['host1:2222', 'host2:2222'],
-                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
-      os.environ['TF_CONFIG'] = json.dumps(
-          {'cluster': cluster,
-           'task': {'type': 'chief', 'index': 0}})
-      config = RunConfig()
-      assert config.master == 'host0:2222'
-      assert config.task_id == 0
-      assert config.num_ps_replicas == 2
-      assert config.num_worker_replicas == 4
-      assert config.cluster_spec == server_lib.ClusterSpec(cluster)
-      assert config.task_type == 'chief'
-      assert config.is_chief
-    ```
-
-    Example of evaluator node (evaluator is not part of training cluster):
-    ```
-      cluster = {'chief': ['host0:2222'],
-                 'ps': ['host1:2222', 'host2:2222'],
-                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
-      os.environ['TF_CONFIG'] = json.dumps(
-          {'cluster': cluster,
-           'task': {'type': 'evaluator', 'index': 0}})
-      config = RunConfig()
-      assert config.master == ''
-      assert config.evaluator_master == ''
-      assert config.task_id == 0
-      assert config.num_ps_replicas == 0
-      assert config.num_worker_replicas == 0
-      assert config.cluster_spec == {}
-      assert config.task_type == 'evaluator'
-      assert not config.is_chief
-    ```
-
-    N.B.: If `save_checkpoints_steps` or `save_checkpoints_secs` is set,
-    `keep_checkpoint_max` might need to be adjusted accordingly, especially in
-    distributed training. For example, setting `save_checkpoints_secs` as 60
-    without adjusting `keep_checkpoint_max` (defaults to 5) leads to situation
-    that checkpoint would be garbage collected after 5 minutes. In distributed
-    training, the evaluation job starts asynchronously and might fail to load or
-    find the checkpoint due to race condition.
-
-    Args:
-      model_dir: directory where model parameters, graph, etc are saved. If
-        `PathLike` object, the path will be resolved. If `None`, will use a
-        default value set by the Estimator.
-      tf_random_seed: Random seed for TensorFlow initializers.
-        Setting this value allows consistency between reruns.
-      save_summary_steps: Save summaries every this many steps.
-      save_checkpoints_steps: Save checkpoints every this many steps. Can not be
-          specified with `save_checkpoints_secs`.
-      save_checkpoints_secs: Save checkpoints every this many seconds. Can not
-          be specified with `save_checkpoints_steps`. Defaults to 600 seconds if
-          both `save_checkpoints_steps` and `save_checkpoints_secs` are not set
-          in constructor.  If both `save_checkpoints_steps` and
-          `save_checkpoints_secs` are None, then checkpoints are disabled.
-      session_config: a ConfigProto used to set session parameters, or None.
-      keep_checkpoint_max: The maximum number of recent checkpoint files to
-        keep. As new files are created, older files are deleted. If None or 0,
-        all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent
-        checkpoint files are kept.)
-      keep_checkpoint_every_n_hours: Number of hours between each checkpoint
-        to be saved. The default value of 10,000 hours effectively disables
-        the feature.
-      log_step_count_steps: The frequency, in number of global steps, that the
-        global step/sec and the loss will be logged during training.
-      train_distribute: An optional instance of
-        `tf.contrib.distribute.DistributionStrategy`. If specified,
-        then Estimator will distribute the user's model during training,
-        according to the policy specified by that strategy. Setting
-        `experimental_distribute.train_distribute` is preferred.
-      device_fn: A callable invoked for every `Operation` that takes the
-        `Operation` and returns the device string. If `None`, defaults to
-        the device function returned by `tf.train.replica_device_setter`
-        with round-robin strategy.
-      protocol: An optional argument which specifies the protocol used when
-        starting server. None means default to grpc.
-      eval_distribute: An optional instance of
-        `tf.contrib.distribute.DistributionStrategy`. If specified,
-        then Estimator will distribute the user's model during evaluation,
-        according to the policy specified by that strategy. Setting
-        `experimental_distribute.eval_distribute` is preferred.
-      experimental_distribute: an optional
-        `tf.contrib.distribute.DistributeConfig` object specifying
-        DistributionStrategy-related configuration. The `train_distribute` and
-        `eval_distribute` can be passed as parameters to `RunConfig` or set in
-        `experimental_distribute` but not both.
-
-    Raises:
-      ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
-      are set.
-    """
-    if (save_checkpoints_steps == _USE_DEFAULT and
-        save_checkpoints_secs == _USE_DEFAULT):
-      save_checkpoints_steps = None
-      save_checkpoints_secs = 600
-    elif save_checkpoints_secs == _USE_DEFAULT:
-      save_checkpoints_secs = None
-    elif save_checkpoints_steps == _USE_DEFAULT:
-      save_checkpoints_steps = None
-    elif (save_checkpoints_steps is not None and
-          save_checkpoints_secs is not None):
-      raise ValueError(_SAVE_CKPT_ERR)
-
-    tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
-    if tf_config:
-      logging.info('TF_CONFIG environment variable: %s', tf_config)
-
-    model_dir = _get_model_dir(tf_config,
-                               compat_internal.path_to_str(model_dir))
-
-    RunConfig._replace(
-        self,
-        allowed_properties_list=_DEFAULT_REPLACEABLE_LIST,
-        model_dir=model_dir,
-        tf_random_seed=tf_random_seed,
-        save_summary_steps=save_summary_steps,
-        save_checkpoints_steps=save_checkpoints_steps,
-        save_checkpoints_secs=save_checkpoints_secs,
-        session_config=session_config,
-        keep_checkpoint_max=keep_checkpoint_max,
-        keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
-        log_step_count_steps=log_step_count_steps,
-        train_distribute=train_distribute,
-        device_fn=device_fn,
-        protocol=protocol,
-        eval_distribute=eval_distribute,
-        experimental_distribute=experimental_distribute)
-
-    # TODO(frankchn,priyag): Eventually use distributed coordinator for TPUs.
-    if ((train_distribute and
-         train_distribute.__class__.__name__ != 'TPUStrategy') or
-        (eval_distribute and
-         eval_distribute.__class__.__name__ != 'TPUStrategy') or
-        experimental_distribute):
-      logging.info('Initializing RunConfig with distribution strategies.')
-      distribute_coordinator_training.init_run_config(self, tf_config)
-    else:
-      self._init_distributed_setting_from_environment_var(tf_config)
-      self._maybe_overwrite_session_config_for_distributed_training()
-
-  def _maybe_overwrite_session_config_for_distributed_training(self):
-    """Overwrites the session_config for distributed training.
-
-    The default overwrite is optimized for between-graph training. Subclass
-    should override this method if necessary.
-    """
-    # Get session_config only for between-graph distributed mode (cluster_spec
-    # is present).
-    if not self._session_config and self._cluster_spec:
-      RunConfig._replace(
-          self,
-          allowed_properties_list=_DEFAULT_REPLACEABLE_LIST,
-          session_config=self._get_default_session_config_distributed())
-
-  def _get_default_session_config_distributed(self):
-    """Returns None or tf.ConfigProto instance with default device_filters set.
-
-    Device filters are set such that chief/master and worker communicates with
-    only ps. session_config=None for evaluators or any other TaskType.
-    """
-
-    rewrite_opts = rewriter_config_pb2.RewriterConfig(
-        meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE)
-    graph_opts = config_pb2.GraphOptions(rewrite_options=rewrite_opts)
-
-    device_filters = None
-    if self._task_type == TaskType.MASTER:
-      device_filters = ['/job:ps', '/job:master']
-    elif self._task_type == TaskType.CHIEF:
-      device_filters = ['/job:ps', '/job:chief']
-    elif self._task_type == TaskType.WORKER:
-      device_filters = ['/job:ps', '/job:worker/task:%d' % self._task_id]
-    elif self._task_type == TaskType.PS:
-      device_filters = ['/job:ps', '/job:worker', '/job:master']
-    else:
-      # If the task_type is `EVALUATOR` or something other than the ones in
-      # TaskType then don't set any device filters.
-      return None
-
-    return config_pb2.ConfigProto(
-        allow_soft_placement=True,
-        graph_options=graph_opts,
-        device_filters=device_filters)
-
-  def _init_distributed_setting_from_environment_var(self, tf_config):
-    """Initialize distributed properties based on `tf_config`."""
-
-    self._service = _validate_service(tf_config.get(_SERVICE_KEY))
-    self._cluster_spec = server_lib.ClusterSpec(tf_config.get(_CLUSTER_KEY, {}))
-    task_env = tf_config.get(_TASK_ENV_KEY, {})
-
-    if self._cluster_spec and TaskType.MASTER in self._cluster_spec.jobs:
-      return self._init_distributed_setting_from_environment_var_with_master(
-          tf_config)
-
-    if self._cluster_spec:
-      # Distributed mode.
-      self._task_type, self._task_id = _validate_task_type_and_task_id(
-          self._cluster_spec, task_env, TaskType.CHIEF)
-
-      self._evaluation_master = _get_eval_session_master(
-          self._task_type, tf_config)
-
-      if self._task_type != TaskType.EVALUATOR:
-        self._master = _get_session_master(self._cluster_spec, self._task_type,
-                                           self._task_id, tf_config)
-        self._num_ps_replicas = _count_ps(self._cluster_spec)
-        self._num_worker_replicas = _count_worker(
-            self._cluster_spec, chief_task_type=TaskType.CHIEF)
-        self._global_id_in_cluster = _get_global_id_in_cluster(
-            self._cluster_spec,
-            self._task_type,
-            self._task_id,
-            chief_task_type=TaskType.CHIEF)
-      else:
-        # Evaluator is not part of the training cluster.
-        self._cluster_spec = server_lib.ClusterSpec({})
-        self._master = _LOCAL_MASTER
-        self._num_ps_replicas = 0
-        self._num_worker_replicas = 0
-        self._global_id_in_cluster = None  # undefined
-
-      self._is_chief = self._task_type == TaskType.CHIEF
-    else:
-      # Local mode.
-      self._task_type = task_env.get(_TASK_TYPE_KEY, TaskType.WORKER)
-      self._task_id = int(task_env.get(_TASK_ID_KEY, 0))
-      self._global_id_in_cluster = 0
-
-      if self._task_type != TaskType.WORKER:
-        raise ValueError(
-            'If "cluster" is not set in TF_CONFIG, task type must be WORKER.')
-      if self._task_id != 0:
-        raise ValueError(
-            'If "cluster" is not set in TF_CONFIG, task index must be 0.')
-
-      self._master = tf_config.get(_SESSION_MASTER_KEY, _LOCAL_MASTER)
-      self._evaluation_master = tf_config.get(_EVAL_SESSION_MASTER_KEY,
-                                              _LOCAL_MASTER)
-      self._is_chief = True
-      self._num_ps_replicas = 0
-      self._num_worker_replicas = 1
-
-  def _init_distributed_setting_from_environment_var_with_master(self,
-                                                                 tf_config):
-    """Initialize distributed properties for legacy cluster with `master`."""
-    # There is no tech reason, why user cannot have chief and master in the same
-    # cluster, but it is super confusing (which is really the chief?). So, block
-    # this case.
-    if TaskType.CHIEF in self._cluster_spec.jobs:
-      raise ValueError('If `master` node exists in `cluster`, job '
-                       '`chief` is not supported.')
-
-    task_env = tf_config.get(_TASK_ENV_KEY, {})
-
-    self._task_type, self._task_id = _validate_task_type_and_task_id(
-        self._cluster_spec, task_env, TaskType.MASTER)
-
-    if self._task_type == TaskType.EVALUATOR:
-      raise ValueError('If `master` node exists in `cluster`, task_type '
-                       '`evaluator` is not supported.')
-
-    self._global_id_in_cluster = _get_global_id_in_cluster(
-        self._cluster_spec,
-        self._task_type,
-        self._task_id,
-        chief_task_type=TaskType.MASTER)
-
-    self._master = _get_session_master(self._cluster_spec, self._task_type,
-                                       self._task_id, tf_config)
-    self._evaluation_master = _get_eval_session_master(self._task_type,
-                                                       tf_config)
-    self._num_ps_replicas = _count_ps(self._cluster_spec)
-    self._num_worker_replicas = _count_worker(
-        self._cluster_spec, chief_task_type=TaskType.MASTER)
-
-    self._is_chief = self._task_type == TaskType.MASTER
-
-  @property
-  def cluster_spec(self):
-    return self._cluster_spec
-
-  @property
-  def device_fn(self):
-    """Returns the device_fn.
-
-    If device_fn is not `None`, it overrides the default
-    device function used in `Estimator`.
-    Otherwise the default one is used.
-    """
-    return self._device_fn
-
-  @property
-  def evaluation_master(self):
-    return self._evaluation_master
-
-  @property
-  def is_chief(self):
-    return self._is_chief
-
-  @property
-  def master(self):
-    return self._master
-
-  @property
-  def num_ps_replicas(self):
-    return self._num_ps_replicas
-
-  @property
-  def num_worker_replicas(self):
-    return self._num_worker_replicas
-
-  @property
-  def task_id(self):
-    return self._task_id
-
-  @property
-  def global_id_in_cluster(self):
-    """The global id in the training cluster.
-
-    All global ids in the training cluster are assigned from an increasing
-    sequence of consecutive integers. The first id is 0.
-
-    Note: Task id (the property field `task_id`) is tracking the index of the
-    node among all nodes with the SAME task type. For example, given the cluster
-    definition as follows:
-
-    ```
-      cluster = {'chief': ['host0:2222'],
-                 'ps': ['host1:2222', 'host2:2222'],
-                 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
-    ```
-
-    Nodes with task type `worker` can have id 0, 1, 2.  Nodes with task type
-    `ps` can have id, 0, 1. So, `task_id` is not unique, but the pair
-    (`task_type`, `task_id`) can uniquely determine a node in the cluster.
-
-    Global id, i.e., this field, is tracking the index of the node among ALL
-    nodes in the cluster. It is uniquely assigned.  For example, for the cluster
-    spec given above, the global ids are assigned as:
-    ```
-      task_type  | task_id  |  global_id
-      --------------------------------
-      chief      | 0        |  0
-      worker     | 0        |  1
-      worker     | 1        |  2
-      worker     | 2        |  3
-      ps         | 0        |  4
-      ps         | 1        |  5
-    ```
-
-    Returns:
-      An integer id.
-    """
-    return self._global_id_in_cluster
-
-  @property
-  def task_type(self):
-    return self._task_type
-
-  @property
-  def tf_random_seed(self):
-    return self._tf_random_seed
-
-  @property
-  def save_summary_steps(self):
-    return self._save_summary_steps
-
-  @property
-  def save_checkpoints_secs(self):
-    return self._save_checkpoints_secs
-
-  @property
-  def session_config(self):
-    return self._session_config
-
-  @property
-  def save_checkpoints_steps(self):
-    return self._save_checkpoints_steps
-
-  @property
-  def keep_checkpoint_max(self):
-    return self._keep_checkpoint_max
-
-  @property
-  def keep_checkpoint_every_n_hours(self):
-    return self._keep_checkpoint_every_n_hours
-
-  @property
-  def log_step_count_steps(self):
-    return self._log_step_count_steps
-
-  @property
-  def model_dir(self):
-    return self._model_dir
-
-  @property
-  def service(self):
-    """Returns the platform defined (in TF_CONFIG) service dict."""
-    return self._service
-
-  @property
-  def train_distribute(self):
-    """Optional `tf.contrib.distribute.DistributionStrategy` for training.
-    """
-    return self._train_distribute
-
-  @property
-  def eval_distribute(self):
-    """Optional `tf.contrib.distribute.DistributionStrategy` for evaluation.
-    """
-    return self._eval_distribute
-
-  @property
-  def protocol(self):
-    """Returns the optional protocol value."""
-    return self._protocol
-
-  def replace(self, **kwargs):
-    """Returns a new instance of `RunConfig` replacing specified properties.
-
-    Only the properties in the following list are allowed to be replaced:
-
-      - `model_dir`,
-      - `tf_random_seed`,
-      - `save_summary_steps`,
-      - `save_checkpoints_steps`,
-      - `save_checkpoints_secs`,
-      - `session_config`,
-      - `keep_checkpoint_max`,
-      - `keep_checkpoint_every_n_hours`,
-      - `log_step_count_steps`,
-      - `train_distribute`,
-      - `device_fn`,
-      - `protocol`.
-      - `eval_distribute`,
-      - `experimental_distribute`,
-
-    In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
-    can be set (should not be both).
-
-    Args:
-      **kwargs: keyword named properties with new values.
-
-    Raises:
-      ValueError: If any property name in `kwargs` does not exist or is not
-        allowed to be replaced, or both `save_checkpoints_steps` and
-        `save_checkpoints_secs` are set.
-
-    Returns:
-      a new instance of `RunConfig`.
-    """
-    return RunConfig._replace(
-        copy.deepcopy(self),
-        allowed_properties_list=_DEFAULT_REPLACEABLE_LIST,
-        **kwargs)
-
-  @staticmethod
-  def _replace(config, allowed_properties_list=None, **kwargs):
-    """See `replace`.
-
-    N.B.: This implementation assumes that for key named "foo", the underlying
-    property the RunConfig holds is "_foo" (with one leading underscore).
-
-    Args:
-      config: The RunConfig to replace the values of.
-      allowed_properties_list: The property name list allowed to be replaced.
-      **kwargs: keyword named properties with new values.
-
-    Raises:
-      ValueError: If any property name in `kwargs` does not exist or is not
-        allowed to be replaced, or both `save_checkpoints_steps` and
-        `save_checkpoints_secs` are set.
-
-    Returns:
-      a new instance of `RunConfig`.
-    """
-
-    allowed_properties_list = allowed_properties_list or []
-
-    for key, new_value in six.iteritems(kwargs):
-      if key in allowed_properties_list:
-        setattr(config, '_' + key, new_value)
-        continue
-
-      raise ValueError(
-          'Replacing {} is not supported. Allowed properties are {}.'.format(
-              key, allowed_properties_list))
-
-    _validate_save_ckpt_with_replaced_keys(config, kwargs.keys())
-    _validate_properties(config)
-    return config
-
-
-def _get_model_dir(tf_config, model_dir):
-  """Returns `model_dir` based user provided `tf_config` or `model_dir`."""
-  # pylint: disable=g-explicit-bool-comparison
-
-  # Empty string is treated as False in Python condition check, which triggers
-  # some confusing error messages. For example, 'a or b' returns None if a is ''
-  # and b is None. `None` is allowed for model_dir but '' is not allowed. Here,
-  # explicitly check empty string to provide clear error message.
-  if model_dir == '':
-    raise ValueError('model_dir should be non-empty.')
-
-  model_dir_in_tf_config = tf_config.get('model_dir')
-  if model_dir_in_tf_config == '':
-    raise ValueError('model_dir in TF_CONFIG should be non-empty.')
-
-  if model_dir_in_tf_config:
-    if model_dir and model_dir_in_tf_config != model_dir:
-      raise ValueError(
-          '`model_dir` provided in RunConfig construct, if set, '
-          'must have the same value as the model_dir in TF_CONFIG. '
-          'model_dir: {}\nTF_CONFIG["model_dir"]: {}.\n'.format(
-              model_dir, model_dir_in_tf_config))
+from tensorflow_estimator.python.estimator import run_config
 
-    logging.info('Using model_dir in TF_CONFIG: %s', model_dir_in_tf_config)
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+run_config.__all__ = [s for s in dir(run_config) if not s.startswith('__')]
 
-  return model_dir or model_dir_in_tf_config
+from tensorflow_estimator.python.estimator.run_config import *
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
deleted file mode 100644
index 06df7cb9dd4ae3d167d622601e551079b64e80a2..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/run_config_test.py
+++ /dev/null
@@ -1,1235 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""RunConfig tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import json
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.platform import test
-
-_TEST_DIR = 'test_dir'
-_MASTER = 'master_'
-_NOT_SUPPORTED_REPLACE_PROPERTY_MSG = 'Replacing .*is not supported'
-_SAVE_CKPT_ERR = (
-    '`save_checkpoints_steps` and `save_checkpoints_secs` cannot be both set.'
-)
-_MODEL_DIR_ERR = 'model_dir should be non-empty'
-_MODEL_DIR_TF_CONFIG_ERR = 'model_dir in TF_CONFIG should be non-empty'
-_MODEL_DIR_MISMATCH_ERR = (
-    '`model_dir` provided in RunConfig construct, if set, '
-    'must have the same value as the model_dir in TF_CONFIG. ')
-_SAVE_SUMMARY_STEPS_ERR = 'save_summary_steps should be >= 0'
-_SAVE_CKPT_STEPS_ERR = 'save_checkpoints_steps should be >= 0'
-_SAVE_CKPT_SECS_ERR = 'save_checkpoints_secs should be >= 0'
-_SESSION_CONFIG_ERR = 'session_config must be instance of ConfigProto'
-_KEEP_CKPT_MAX_ERR = 'keep_checkpoint_max should be >= 0'
-_KEEP_CKPT_HOURS_ERR = 'keep_checkpoint_every_n_hours should be > 0'
-_TF_RANDOM_SEED_ERR = 'tf_random_seed must be integer'
-_DEVICE_FN_ERR = 'device_fn must be callable with exactly one argument "op".'
-_ONE_CHIEF_ERR = 'The "cluster" in TF_CONFIG must have only one "chief" node.'
-_ONE_MASTER_ERR = 'The "cluster" in TF_CONFIG must have only one "master" node.'
-_INVALID_TASK_TYPE_FOR_EVAL_MASTER = (
-    'Key.*eval.*master.*should not be set for task type other than')
-_MISSING_CHIEF_ERR = 'If "cluster" is set .* it must have one "chief" node'
-_MISSING_TASK_TYPE_ERR = 'If "cluster" is set .* task type must be set'
-_MISSING_TASK_ID_ERR = 'If "cluster" is set .* task index must be set'
-_INVALID_TASK_INDEX_ERR = 'is not a valid task_id'
-_NEGATIVE_TASK_INDEX_ERR = 'Task index must be non-negative number.'
-_INVALID_TASK_TYPE_ERR = 'is not a valid task_type'
-_INVALID_TASK_TYPE_FOR_LOCAL_ERR = (
-    'If "cluster" is not set in TF_CONFIG, task type must be WORKER.')
-_INVALID_TASK_INDEX_FOR_LOCAL_ERR = (
-    'If "cluster" is not set in TF_CONFIG, task index must be 0.')
-_INVALID_EVALUATOR_IN_CLUSTER_WITH_MASTER_ERR = (
-    'If `master` node exists in `cluster`, task_type `evaluator` is not '
-    'supported.')
-_INVALID_CHIEF_IN_CLUSTER_WITH_MASTER_ERR = (
-    'If `master` node exists in `cluster`, job `chief` is not supported.')
-_INVALID_SERVICE_TYPE_ERR = (
-    'If "service" is set in TF_CONFIG, it must be a dict. Given')
-
-
-def _create_run_config_with_cluster_spec(tf_config, **kwargs):
-  with test.mock.patch.dict('os.environ', {'TF_CONFIG': json.dumps(tf_config)}):
-    return run_config_lib.RunConfig(**kwargs)
-
-
-class RunConfigTest(test.TestCase):
-
-  def test_default_property_values(self):
-    config = run_config_lib.RunConfig()
-    self.assertIsNone(config.model_dir)
-    self.assertIsNone(config.session_config)
-    self.assertIsNone(config.tf_random_seed)
-    self.assertEqual(100, config.save_summary_steps)
-    self.assertEqual(600, config.save_checkpoints_secs)
-    self.assertIsNone(config.save_checkpoints_steps)
-    self.assertEqual(5, config.keep_checkpoint_max)
-    self.assertEqual(10000, config.keep_checkpoint_every_n_hours)
-    self.assertIsNone(config.service)
-    self.assertIsNone(config.device_fn)
-
-  def test_model_dir(self):
-    empty_config = run_config_lib.RunConfig()
-    self.assertIsNone(empty_config.model_dir)
-
-    new_config = empty_config.replace(model_dir=_TEST_DIR)
-    self.assertEqual(_TEST_DIR, new_config.model_dir)
-
-  def test_replace_with_allowed_properties(self):
-    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-    device_fn = lambda op: "/cpu:0"
-
-    config = run_config_lib.RunConfig().replace(
-        tf_random_seed=11,
-        save_summary_steps=12,
-        save_checkpoints_secs=14,
-        session_config=session_config,
-        keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17,
-        device_fn=device_fn)
-    self.assertEqual(11, config.tf_random_seed)
-    self.assertEqual(12, config.save_summary_steps)
-    self.assertEqual(14, config.save_checkpoints_secs)
-    self.assertEqual(session_config, config.session_config)
-    self.assertEqual(16, config.keep_checkpoint_max)
-    self.assertEqual(17, config.keep_checkpoint_every_n_hours)
-    self.assertEqual(device_fn, config.device_fn)
-
-  def test_replace_none_value(self):
-    config = run_config_lib.RunConfig().replace(
-        tf_random_seed=None,
-        model_dir=None,
-        save_summary_steps=None,
-        save_checkpoints_secs=None,
-        save_checkpoints_steps=None,
-        session_config=None,
-        keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None,
-        device_fn=None)
-    self.assertIsNone(config.tf_random_seed)
-    self.assertIsNone(config.model_dir)
-    self.assertIsNone(config.save_summary_steps)
-    self.assertIsNone(config.save_checkpoints_secs)
-    self.assertIsNone(config.save_checkpoints_steps)
-    self.assertIsNone(config.session_config)
-    self.assertIsNone(config.keep_checkpoint_max)
-    self.assertIsNone(config.keep_checkpoint_every_n_hours)
-    self.assertIsNone(config.device_fn)
-
-  def test_replace_with_disallowallowed_properties(self):
-    config = run_config_lib.RunConfig()
-    with self.assertRaises(ValueError):
-      # tf_random_seed is not allowed to be replaced.
-      config.replace(master='_master')
-    with self.assertRaises(ValueError):
-      config.replace(some_undefined_property=123)
-
-  def test_replace(self):
-    config = run_config_lib.RunConfig()
-
-    with self.assertRaisesRegexp(
-        ValueError, _NOT_SUPPORTED_REPLACE_PROPERTY_MSG):
-      # master is not allowed to be replaced.
-      config.replace(master=_MASTER)
-
-    with self.assertRaisesRegexp(
-        ValueError, _NOT_SUPPORTED_REPLACE_PROPERTY_MSG):
-      config.replace(some_undefined_property=_MASTER)
-
-  def test_replace_invalid_values(self):
-    config = run_config_lib.RunConfig()
-
-    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
-      config.replace(model_dir='')
-    with self.assertRaisesRegexp(ValueError, _SAVE_SUMMARY_STEPS_ERR):
-      config.replace(save_summary_steps=-1)
-    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_STEPS_ERR):
-      config.replace(save_checkpoints_steps=-1)
-    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_SECS_ERR):
-      config.replace(save_checkpoints_secs=-1)
-    with self.assertRaisesRegexp(ValueError, _SESSION_CONFIG_ERR):
-      config.replace(session_config={})
-    with self.assertRaisesRegexp(ValueError, _KEEP_CKPT_MAX_ERR):
-      config.replace(keep_checkpoint_max=-1)
-    with self.assertRaisesRegexp(ValueError, _KEEP_CKPT_HOURS_ERR):
-      config.replace(keep_checkpoint_every_n_hours=0)
-    with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
-      config.replace(tf_random_seed=1.0)
-    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
-      config.replace(device_fn=lambda x, y: 0)
-
-  def test_init_with_allowed_properties(self):
-    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-    device_fn = lambda op: "/cpu:0"
-
-    config = run_config_lib.RunConfig(
-        tf_random_seed=11,
-        save_summary_steps=12,
-        save_checkpoints_secs=14,
-        session_config=session_config,
-        keep_checkpoint_max=16,
-        keep_checkpoint_every_n_hours=17,
-        device_fn=device_fn)
-    self.assertEqual(11, config.tf_random_seed)
-    self.assertEqual(12, config.save_summary_steps)
-    self.assertEqual(14, config.save_checkpoints_secs)
-    self.assertEqual(session_config, config.session_config)
-    self.assertEqual(16, config.keep_checkpoint_max)
-    self.assertEqual(17, config.keep_checkpoint_every_n_hours)
-    self.assertEqual(device_fn, config.device_fn)
-
-  def test_init_none_value(self):
-    config = run_config_lib.RunConfig(
-        tf_random_seed=None,
-        model_dir=None,
-        save_summary_steps=None,
-        save_checkpoints_secs=None,
-        save_checkpoints_steps=None,
-        session_config=None,
-        keep_checkpoint_max=None,
-        keep_checkpoint_every_n_hours=None,
-        device_fn=None)
-    self.assertIsNone(config.tf_random_seed)
-    self.assertIsNone(config.model_dir)
-    self.assertIsNone(config.save_summary_steps)
-    self.assertIsNone(config.save_checkpoints_secs)
-    self.assertIsNone(config.save_checkpoints_steps)
-    self.assertIsNone(config.session_config)
-    self.assertIsNone(config.keep_checkpoint_max)
-    self.assertIsNone(config.keep_checkpoint_every_n_hours)
-    self.assertIsNone(config.device_fn)
-
-  def test_init_invalid_values(self):
-    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
-      run_config_lib.RunConfig(model_dir='')
-    with self.assertRaisesRegexp(ValueError, _SAVE_SUMMARY_STEPS_ERR):
-      run_config_lib.RunConfig(save_summary_steps=-1)
-    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_STEPS_ERR):
-      run_config_lib.RunConfig(save_checkpoints_steps=-1)
-    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_SECS_ERR):
-      run_config_lib.RunConfig(save_checkpoints_secs=-1)
-    with self.assertRaisesRegexp(ValueError, _SESSION_CONFIG_ERR):
-      run_config_lib.RunConfig(session_config={})
-    with self.assertRaisesRegexp(ValueError, _KEEP_CKPT_MAX_ERR):
-      run_config_lib.RunConfig(keep_checkpoint_max=-1)
-    with self.assertRaisesRegexp(ValueError, _KEEP_CKPT_HOURS_ERR):
-      run_config_lib.RunConfig(keep_checkpoint_every_n_hours=0)
-    with self.assertRaisesRegexp(ValueError, _TF_RANDOM_SEED_ERR):
-      run_config_lib.RunConfig(tf_random_seed=1.0)
-    with self.assertRaisesRegexp(ValueError, _DEVICE_FN_ERR):
-      run_config_lib.RunConfig(device_fn=lambda x: "/cpu:0")
-
-
-class RunConfigDistributedSettingTest(test.TestCase):
-
-  def _assert_distributed_properties(self, run_config,
-                                     expected_cluster_spec,
-                                     expected_task_type,
-                                     expected_task_id,
-                                     expected_master,
-                                     expected_evaluation_master,
-                                     expected_is_chief,
-                                     expected_num_worker_replicas,
-                                     expected_num_ps_replicas):
-    self.assertEqual(expected_cluster_spec, run_config.cluster_spec.as_dict())
-    self.assertEqual(expected_task_type, run_config.task_type)
-    self.assertEqual(expected_task_id, run_config.task_id)
-    self.assertEqual(expected_master, run_config.master)
-    self.assertEqual(expected_evaluation_master, run_config.evaluation_master)
-    self.assertEqual(expected_is_chief, run_config.is_chief)
-    self.assertEqual(expected_num_worker_replicas,
-                     run_config.num_worker_replicas)
-    self.assertEqual(expected_num_ps_replicas, run_config.num_ps_replicas)
-
-  def test_default_values(self):
-    self._assert_distributed_properties(
-        run_config=run_config_lib.RunConfig(),
-        expected_cluster_spec={},
-        expected_task_type=run_config_lib.TaskType.WORKER,
-        expected_task_id=0,
-        expected_master='',
-        expected_evaluation_master='',
-        expected_is_chief=True,
-        expected_num_worker_replicas=1,
-        expected_num_ps_replicas=0)
-
-  def test_tf_config_for_local(self):
-    tf_config = {
-        'task': {
-            'type': run_config_lib.TaskType.WORKER,
-            'index': 0
-        }
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self._assert_distributed_properties(
-        run_config=run_config,
-        expected_cluster_spec={},
-        expected_task_type=run_config_lib.TaskType.WORKER,
-        expected_task_id=0,
-        expected_master='',
-        expected_evaluation_master='',
-        expected_is_chief=True,
-        expected_num_worker_replicas=1,
-        expected_num_ps_replicas=0)
-    self.assertEqual(0, run_config.global_id_in_cluster)
-    self.assertIsNone(run_config.session_config, None)
-
-  def test_session_master_for_local(self):
-    tf_config = {'session_master': '_my_master'}
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec={},
-        expected_task_type=run_config_lib.TaskType.WORKER,
-        expected_task_id=0,
-        expected_master='_my_master',
-        expected_evaluation_master='',
-        expected_is_chief=True,
-        expected_num_worker_replicas=1,
-        expected_num_ps_replicas=0)
-
-  def test_eval_session_master_for_local(self):
-    tf_config = {'eval_session_master': '_my_eval_master'}
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec={},
-        expected_task_type=run_config_lib.TaskType.WORKER,
-        expected_task_id=0,
-        expected_master='',
-        expected_evaluation_master='_my_eval_master',
-        expected_is_chief=True,
-        expected_num_worker_replicas=1,
-        expected_num_ps_replicas=0)
-
-  def test_invalid_task_type_for_local(self):
-    tf_config = {
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE_FOR_LOCAL_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_invalid_task_index_for_local(self):
-    tf_config = {
-        'task': {
-            'type': run_config_lib.TaskType.WORKER,
-            'index': 1
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_INDEX_FOR_LOCAL_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_chief_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        }
-    }
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec=tf_config['cluster'],
-        expected_task_type=run_config_lib.TaskType.CHIEF,
-        expected_task_id=0,
-        expected_master='grpc://host0:0',
-        expected_evaluation_master='',
-        expected_is_chief=True,
-        expected_num_worker_replicas=4,
-        expected_num_ps_replicas=2)
-
-  def test_session_master_from_single_node_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        },
-        'session_master': '_my_master'
-    }
-    self.assertEqual('_my_master',
-                     _create_run_config_with_cluster_spec(tf_config).master)
-
-  def test_session_master_from_multiple_nodes_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        },
-        'session_master': '_my_master'
-    }
-    self.assertEqual('_my_master',
-                     _create_run_config_with_cluster_spec(tf_config).master)
-
-  def test_fail_with_eval_session_master_for_non_evaluator(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        },
-        'eval_session_master': 'grpc://123',
-    }
-    with self.assertRaisesRegexp(
-        ValueError, _INVALID_TASK_TYPE_FOR_EVAL_MASTER):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_multiple_chief_nodes(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0', 'host:6:6'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-    }
-    with self.assertRaisesRegexp(ValueError, _ONE_CHIEF_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_missing_chief_node(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-    }
-    with self.assertRaisesRegexp(ValueError, _MISSING_CHIEF_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_single_chief_node(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        }
-    }
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec=tf_config['cluster'],
-        expected_task_type=run_config_lib.TaskType.CHIEF,
-        expected_task_id=0,
-        expected_master='',
-        expected_evaluation_master='',
-        expected_is_chief=True,
-        expected_num_worker_replicas=1,
-        expected_num_ps_replicas=0)
-
-  def test_fail_with_missing_task_type_for_distributed(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host3:3']
-        },
-    }
-    with self.assertRaisesRegexp(ValueError, _MISSING_TASK_TYPE_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_missing_task_index_for_distributed(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host3:3']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _MISSING_TASK_ID_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_index_is_too_large(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host3:3']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 1
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_INDEX_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_invalid_task_index(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host3:3']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': -1
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _NEGATIVE_TASK_INDEX_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_invalid_task_type(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host3:3']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.WORKER,
-            'index': 0
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_worker_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.WORKER,
-            'index': 1
-        }
-    }
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec=tf_config['cluster'],
-        expected_task_type=run_config_lib.TaskType.WORKER,
-        expected_task_id=1,
-        expected_master='grpc://host4:4',
-        expected_evaluation_master='',
-        expected_is_chief=False,
-        expected_num_worker_replicas=4,
-        expected_num_ps_replicas=2)
-
-  def test_ps_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.PS,
-            'index': 0
-        }
-    }
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec=tf_config['cluster'],
-        expected_task_type=run_config_lib.TaskType.PS,
-        expected_task_id=0,
-        expected_master='grpc://host1:1',
-        expected_evaluation_master='',
-        expected_is_chief=False,
-        expected_num_worker_replicas=4,
-        expected_num_ps_replicas=2)
-
-  def test_evaluator_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.EVALUATOR,
-            'index': 12
-        }
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self._assert_distributed_properties(
-        run_config=run_config,
-        expected_cluster_spec={},
-        expected_task_type=run_config_lib.TaskType.EVALUATOR,
-        expected_task_id=12,
-        expected_master='',
-        expected_evaluation_master='',
-        expected_is_chief=False,  # evaluator is never chief.
-        expected_num_worker_replicas=0,  # evaluator is not in training cluster.
-        expected_num_ps_replicas=0)
-    self.assertIsNone(run_config.global_id_in_cluster)
-
-  def test_eval_master_for_evaluator(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.EVALUATOR,
-            'index': 12
-        },
-        'eval_session_master': 'grpc://123',
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual('grpc://123', run_config.evaluation_master)
-
-  def test_fail_with_invalid_task_index_for_evaluator(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host3:3']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.EVALUATOR,
-            'index': -1
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _NEGATIVE_TASK_INDEX_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_global_id_in_cluster_for_chief(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
-            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0,
-        },
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual(0, run_config.global_id_in_cluster)
-
-  def test_global_id_in_cluster_for_worker(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
-            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.WORKER,
-            'index': 2,
-        },
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual(3, run_config.global_id_in_cluster)
-
-  def test_global_id_in_cluster_for_ps(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
-            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.PS,
-            'index': 1,
-        },
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual(5, run_config.global_id_in_cluster)
-
-  def test_global_id_in_cluster_for_multipe_worker_types(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            'worker': ['host3:3', 'host4:4', 'host5:5'],
-            'other_type': ['host3:1', 'host4:2'],
-            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
-        },
-        'task': {
-            'type': 'other_type',
-            'index': 1,
-        },
-    }
-    # Though 'other_type' is defined after 'worker', based on alphabetical
-    # order, the task type order should be 'chief', 'other_type', 'worker',
-    # 'ps', where 'chief' and 'ps' are predefined to be the top and last in the
-    # order list.
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual(2, run_config.global_id_in_cluster)
-
-
-class RunConfigDistributedSettingWithMasterTest(test.TestCase):
-
-  def _assert_distributed_properties(self, run_config,
-                                     expected_cluster_spec,
-                                     expected_task_type,
-                                     expected_task_id,
-                                     expected_master,
-                                     expected_evaluation_master,
-                                     expected_is_chief,
-                                     expected_num_worker_replicas,
-                                     expected_num_ps_replicas):
-    self.assertEqual(expected_cluster_spec, run_config.cluster_spec.as_dict())
-    self.assertEqual(expected_task_type, run_config.task_type)
-    self.assertEqual(expected_task_id, run_config.task_id)
-    self.assertEqual(expected_master, run_config.master)
-    self.assertEqual(expected_evaluation_master, run_config.evaluation_master)
-    self.assertEqual(expected_is_chief, run_config.is_chief)
-    self.assertEqual(expected_num_worker_replicas,
-                     run_config.num_worker_replicas)
-    self.assertEqual(expected_num_ps_replicas, run_config.num_ps_replicas)
-
-  def test_invalid_task_type_for_local(self):
-    tf_config = {
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 0
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE_FOR_LOCAL_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_master_node(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 0
-        }
-    }
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec=tf_config['cluster'],
-        expected_task_type=run_config_lib.TaskType.MASTER,
-        expected_task_id=0,
-        expected_master='grpc://host0:0',
-        expected_evaluation_master='',
-        expected_is_chief=True,
-        expected_num_worker_replicas=4,
-        expected_num_ps_replicas=2)
-
-  def test_session_master_in_single_node_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 0
-        },
-        'session_master': '_my_master'
-    }
-    self.assertEqual('_my_master',
-                     _create_run_config_with_cluster_spec(tf_config).master)
-
-  def test_session_master_in_multiple_nodes_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 0
-        },
-        'session_master': '_my_master'
-    }
-    self.assertEqual('_my_master',
-                     _create_run_config_with_cluster_spec(tf_config).master)
-
-  def test_fail_with_eval_session_master(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 0
-        },
-        'eval_session_master': 'grpc://123',
-    }
-    with self.assertRaisesRegexp(
-        ValueError, _INVALID_TASK_TYPE_FOR_EVAL_MASTER):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_multiple_master_nodes(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0', 'host:6:6'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-    }
-    with self.assertRaisesRegexp(ValueError, _ONE_MASTER_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_single_master_node(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 0
-        }
-    }
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec=tf_config['cluster'],
-        expected_task_type=run_config_lib.TaskType.MASTER,
-        expected_task_id=0,
-        expected_master='',
-        expected_evaluation_master='',
-        expected_is_chief=True,
-        expected_num_worker_replicas=1,
-        expected_num_ps_replicas=0)
-
-  def test_fail_with_missing_task_type_for_distributed(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host3:3']
-        },
-    }
-    with self.assertRaisesRegexp(ValueError, _MISSING_TASK_TYPE_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_missing_task_index_for_distributed(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host3:3']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _MISSING_TASK_ID_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_index_is_too_large(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host3:3']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 1
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_INDEX_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_invalid_task_index(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host3:3']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': -1
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _NEGATIVE_TASK_INDEX_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_invalid_task_type(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host3:3']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.WORKER,
-            'index': 0
-        }
-    }
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_worker_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.WORKER,
-            'index': 1
-        }
-    }
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec=tf_config['cluster'],
-        expected_task_type=run_config_lib.TaskType.WORKER,
-        expected_task_id=1,
-        expected_master='grpc://host4:4',
-        expected_evaluation_master='',
-        expected_is_chief=False,
-        expected_num_worker_replicas=4,
-        expected_num_ps_replicas=2)
-
-  def test_ps_tf_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.PS,
-            'index': 0
-        }
-    }
-    self._assert_distributed_properties(
-        run_config=_create_run_config_with_cluster_spec(tf_config),
-        expected_cluster_spec=tf_config['cluster'],
-        expected_task_type=run_config_lib.TaskType.PS,
-        expected_task_id=0,
-        expected_master='grpc://host1:1',
-        expected_evaluation_master='',
-        expected_is_chief=False,
-        expected_num_worker_replicas=4,
-        expected_num_ps_replicas=2)
-
-  def test_fail_with_evaluator(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.EVALUATOR,
-            'index': 1
-        }
-    }
-    with self.assertRaisesRegexp(ValueError,
-                                 _INVALID_EVALUATOR_IN_CLUSTER_WITH_MASTER_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_fail_with_chief(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.CHIEF: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.PS,
-            'index': 1
-        }
-    }
-    with self.assertRaisesRegexp(ValueError,
-                                 _INVALID_CHIEF_IN_CLUSTER_WITH_MASTER_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-  def test_global_id_in_cluster_for_master(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
-            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 0,
-        },
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual(0, run_config.global_id_in_cluster)
-
-  def test_global_id_in_cluster_for_worker(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
-            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.WORKER,
-            'index': 2,
-        },
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual(3, run_config.global_id_in_cluster)
-
-  def test_global_id_in_cluster_for_ps(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'],
-            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.PS,
-            'index': 1,
-        },
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual(5, run_config.global_id_in_cluster)
-
-  def test_global_id_in_cluster_for_multipe_worker_types(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            'worker': ['host3:3', 'host4:4', 'host5:5'],
-            'other_type': ['host3:1', 'host4:2'],
-            run_config_lib.TaskType.PS: ['host6:3', 'host7:4', 'host8:5']
-        },
-        'task': {
-            'type': 'other_type',
-            'index': 1,
-        },
-    }
-    # Though 'other_type' is defined after 'worker', based on alphabetical
-    # order, the task type order should be 'chief', 'other_type', 'worker',
-    # 'ps', where 'chief' and 'ps' are predefined to be the top and last in the
-    # order list.
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual(2, run_config.global_id_in_cluster)
-
-
-class RunConfigSaveCheckpointsTest(test.TestCase):
-
-  def test_save_checkpoint(self):
-    empty_config = run_config_lib.RunConfig()
-    self.assertEqual(600, empty_config.save_checkpoints_secs)
-    self.assertIsNone(empty_config.save_checkpoints_steps)
-
-    config_with_steps = empty_config.replace(save_checkpoints_steps=100)
-    del empty_config
-    self.assertEqual(100, config_with_steps.save_checkpoints_steps)
-    self.assertIsNone(config_with_steps.save_checkpoints_secs)
-
-    config_with_secs = config_with_steps.replace(save_checkpoints_secs=200)
-    del config_with_steps
-    self.assertEqual(200, config_with_secs.save_checkpoints_secs)
-    self.assertIsNone(config_with_secs.save_checkpoints_steps)
-
-  def test_save_checkpoint_both_steps_and_secs_are_not_none(self):
-    empty_config = run_config_lib.RunConfig()
-    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_ERR):
-      empty_config.replace(save_checkpoints_steps=100,
-                           save_checkpoints_secs=200)
-
-    with self.assertRaisesRegexp(ValueError, _SAVE_CKPT_ERR):
-      run_config_lib.RunConfig(save_checkpoints_steps=100,
-                               save_checkpoints_secs=200)
-
-  def test_save_checkpoint_both_steps_and_secs_are_none(self):
-    config_with_secs = run_config_lib.RunConfig()
-    config_without_ckpt = config_with_secs.replace(
-        save_checkpoints_steps=None, save_checkpoints_secs=None)
-    self.assertIsNone(config_without_ckpt.save_checkpoints_steps)
-    self.assertIsNone(config_without_ckpt.save_checkpoints_secs)
-
-  def test_save_checkpoint_flip_secs_to_none(self):
-    config_with_secs = run_config_lib.RunConfig()
-    config_without_ckpt = config_with_secs.replace(save_checkpoints_secs=None)
-    self.assertIsNone(config_without_ckpt.save_checkpoints_steps)
-    self.assertIsNone(config_without_ckpt.save_checkpoints_secs)
-
-  def test_save_checkpoint_flip_steps_to_none(self):
-    config_with_steps = run_config_lib.RunConfig().replace(
-        save_checkpoints_steps=100)
-    config_without_ckpt = config_with_steps.replace(save_checkpoints_steps=None)
-    self.assertIsNone(config_without_ckpt.save_checkpoints_steps)
-    self.assertIsNone(config_without_ckpt.save_checkpoints_secs)
-
-
-class RunConfigServiceKeyTest(test.TestCase):
-
-  def test_arbitrary_key_value_pairs(self):
-    tf_config = {
-        'service': {
-            'key1': [1, 2],
-            'key2': {'a': 3, 'b': 4},
-            'key3': 789,
-        },
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual(tf_config['service'], run_config.service)
-
-  def test_missing_service_key(self):
-    tf_config = {
-        'model_dir': '/tmp/123',
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertIsNone(run_config.service)
-
-  def test_fail_with_non_dict(self):
-    tf_config = {
-        'service': 789,
-    }
-    with self.assertRaisesRegexp(TypeError, _INVALID_SERVICE_TYPE_ERR):
-      _create_run_config_with_cluster_spec(tf_config)
-
-
-class RunConfigModelDirTest(test.TestCase):
-
-  def test_default(self):
-    run_config = run_config_lib.RunConfig()
-    self.assertIsNone(run_config.model_dir)
-
-  def test_model_dir_in_constructor(self):
-    run_config = run_config_lib.RunConfig(model_dir='/tmp/123')
-    self.assertEqual('/tmp/123', run_config.model_dir)
-
-  def test_model_dir_in_tf_config(self):
-    tf_config = {
-        'model_dir': '/tmp/123',
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertEqual('/tmp/123', run_config.model_dir)
-
-  def test_model_dir_both_set_in_both_constructor_and_tf_config(self):
-    model_dir = '/tmp/123'
-    tf_config = {'model_dir': model_dir}
-    kwargs = {'model_dir': model_dir}
-    run_config = _create_run_config_with_cluster_spec(tf_config, **kwargs)
-    self.assertEqual('/tmp/123', run_config.model_dir)
-
-  def test_model_dir_different_in_both_constructor_and_tf_config(self):
-    tf_config = {'model_dir': '/tmp/123'}
-    kwargs = {'model_dir': '/tmp/456'}
-    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_MISMATCH_ERR):
-      _create_run_config_with_cluster_spec(tf_config, **kwargs)
-
-  def test_fail_with_empty_string_in_constructor(self):
-    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_ERR):
-      run_config_lib.RunConfig(model_dir='')
-
-  def test_fail_with_empty_string_in_tf_config(self):
-    with self.assertRaisesRegexp(ValueError, _MODEL_DIR_TF_CONFIG_ERR):
-      tf_config = {'model_dir': ''}
-      _create_run_config_with_cluster_spec(tf_config)
-
-
-class RunConfigSessionConfigTest(test.TestCase):
-
-  def _assert_equal_session_config(self, session_config,
-                                   expected_device_filters):
-
-    rewrite_opts = rewriter_config_pb2.RewriterConfig(
-        meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE)
-    graph_opts = config_pb2.GraphOptions(rewrite_options=rewrite_opts)
-    expected_session_config = config_pb2.ConfigProto(
-        allow_soft_placement=True,
-        graph_options=graph_opts,
-        device_filters=expected_device_filters)
-    self.assertEqual(session_config, expected_session_config)
-
-  def test_master_session_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.MASTER,
-            'index': 0
-        }
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self._assert_equal_session_config(run_config.session_config,
-                                      ['/job:ps', '/job:master'])
-
-  def test_chief_session_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.CHIEF,
-            'index': 0
-        }
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self._assert_equal_session_config(run_config.session_config,
-                                      ['/job:ps', '/job:chief'])
-
-  def test_worker_session_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.WORKER,
-            'index': 1
-        }
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self._assert_equal_session_config(run_config.session_config,
-                                      ['/job:ps', '/job:worker/task:1'])
-
-  def test_ps_session_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.PS,
-            'index': 1
-        }
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self._assert_equal_session_config(run_config.session_config,
-                                      ['/job:ps', '/job:worker', '/job:master'])
-
-  def test_evaluator_session_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': run_config_lib.TaskType.EVALUATOR,
-            'index': 0
-        }
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertIsNone(run_config.session_config)
-
-  def test_other_type_session_config(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.MASTER: ['host0:0'],
-            run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-            'other_type': ['host3:1', 'host4:2'],
-            run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5']
-        },
-        'task': {
-            'type': 'other_type',
-            'index': 0
-        }
-    }
-    run_config = _create_run_config_with_cluster_spec(tf_config)
-    self.assertIsNone(run_config.session_config)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 240be5dabe80dff1c6f319951fcea012ff0b660f..59ea0429a9aea78b569b81e6315a22edd508a5d7 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,1062 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Classes and functions related to train_and_evaluate."""
+"""training python module.
+
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import json
-import os
-import time
-
-import six
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.distribute import estimator_training as distribute_coordinator_training
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import exporter as exporter_lib
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import server_lib
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.util import compat
-from tensorflow.python.util.tf_export import estimator_export
-
-_MAX_DELAY_SECS = 60
-_DELAY_SECS_PER_WORKER = 5
-_TF_CONFIG_ENV = 'TF_CONFIG'
-_ENVIRONMENT_KEY = 'environment'
-_ENVIRONMENT_GOOGLE_VALUE = 'google'
-_TRAINER_JOBS = (run_config_lib.TaskType.CHIEF, run_config_lib.TaskType.MASTER,
-                 run_config_lib.TaskType.WORKER)
-
-
-def _validate_input_fn(input_fn):
-  """Validates the `input_fn`."""
-  if not callable(input_fn):
-    raise TypeError('`input_fn` must be callable, given: {}'.format(input_fn))
-
-
-def _validate_hooks(hooks):
-  """Validates the `hooks`."""
-  hooks = tuple(hooks or [])
-  for hook in hooks:
-    if not isinstance(hook, session_run_hook.SessionRunHook):
-      raise TypeError(
-          'All hooks must be `SessionRunHook` instances, given: {}'.format(
-              hook))
-  return hooks
-
-
-def _validate_exporters(exporters):
-  """Validates `exporters` and returns them as a tuple."""
-  if not exporters:
-    return ()
-
-  if isinstance(exporters, exporter_lib.Exporter):
-    exporters = [exporters]
-
-  unique_names = []  # `Exporter`s should have unique names.
-  try:
-    for exporter in exporters:
-      if not isinstance(exporter, exporter_lib.Exporter):
-        # Error message will be printed out by the outer try/except.
-        raise TypeError
-
-      if not exporter.name:
-        full_list_of_names = [e.name for e in exporters]
-        raise ValueError('An Exporter cannot have a name that is `None` or'
-                         ' empty. All exporter names:'
-                         ' {}'.format(full_list_of_names))
-
-      if not isinstance(exporter.name, six.string_types):
-        raise ValueError('An Exporter must have a string name. Given: '
-                         '{}'.format(type(exporter.name)))
-
-      if exporter.name in unique_names:
-        full_list_of_names = [e.name for e in exporters]
-        raise ValueError(
-            '`exporters` must have unique names. Such a name cannot be `None`.'
-            ' All exporter names: {}'.format(full_list_of_names))
-      unique_names.append(exporter.name)
-  except TypeError:
-    # Two possibilities:
-    # - `exporters` is neither `Exporter` nor iterable.  Python has
-    #   raised a `TypeError` when iterating over `exporters`.
-    # - an `exporter` was None or not of type `Exporter`, so we raised a
-    #   `TypeError`.
-    raise TypeError('`exporters` must be an Exporter,'
-                    ' an iterable of Exporter, or `None`,'
-                    ' found %s.' % exporters)
-
-  return tuple(exporters)
-
-
-def _is_google_env():
-  """Detects whether current environment is google."""
-  tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV) or '{}')
-  if not tf_config:
-    logging.warn('TF_CONFIG should not be empty in distributed environment.')
-  return tf_config.get(_ENVIRONMENT_KEY) == _ENVIRONMENT_GOOGLE_VALUE
-
-
-@estimator_export('estimator.TrainSpec')
-class TrainSpec(
-    collections.namedtuple('TrainSpec', ['input_fn', 'max_steps', 'hooks'])):
-  """Configuration for the "train" part for the `train_and_evaluate` call.
-
-  `TrainSpec` determines the input data for the training, as well as the
-  duration. Optional hooks run at various stages of training.
-  """
-
-  def __new__(cls, input_fn, max_steps=None, hooks=None):
-    """Creates a validated `TrainSpec` instance.
-
-    Args:
-      input_fn: A function that provides input data for training as minibatches.
-        See [Premade Estimators](https://tensorflow.org/guide/premade_estimators#create_input_functions)
-        for more information. The function should construct and return one of
-        the following:
-          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
-            tuple (features, labels) with same constraints as below.
-          * A tuple (features, labels): Where features is a `Tensor` or a
-            dictionary of string feature name to `Tensor` and labels is a
-            `Tensor` or a dictionary of string label name to `Tensor`.
-
-      max_steps: Int. Positive number of total steps for which to train model.
-        If `None`, train forever. The training `input_fn` is not expected to
-        generate `OutOfRangeError` or `StopIteration` exceptions. See the
-        `train_and_evaluate` stop condition section for details.
-      hooks: Iterable of `tf.train.SessionRunHook` objects to run
-        on all workers (including chief) during training.
-
-    Returns:
-      A validated `TrainSpec` object.
-
-    Raises:
-      ValueError: If any of the input arguments is invalid.
-      TypeError: If any of the arguments is not of the expected type.
-    """
-    # Validate input_fn.
-    _validate_input_fn(input_fn)
-
-    # Validate max_steps.
-    if max_steps is not None and max_steps <= 0:
-      raise ValueError(
-          'Must specify max_steps > 0, given: {}'.format(max_steps))
-
-    # Validate hooks.
-    hooks = _validate_hooks(hooks)
-
-    return super(TrainSpec, cls).__new__(
-        cls, input_fn=input_fn, max_steps=max_steps, hooks=hooks)
-
-
-@estimator_export('estimator.EvalSpec')
-class EvalSpec(
-    collections.namedtuple('EvalSpec', [
-        'input_fn', 'steps', 'name', 'hooks', 'exporters', 'start_delay_secs',
-        'throttle_secs'
-    ])):
-  """Configuration for the "eval" part for the `train_and_evaluate` call.
-
-  `EvalSpec` combines details of evaluation of the trained model as well as its
-  export. Evaluation consists of computing metrics to judge the performance of
-  the trained model.  Export writes out the trained model on to external
-  storage.
-  """
-
-  def __new__(cls,
-              input_fn,
-              steps=100,
-              name=None,
-              hooks=None,
-              exporters=None,
-              start_delay_secs=120,
-              throttle_secs=600):
-    """Creates a validated `EvalSpec` instance.
-
-    Args:
-      input_fn: A function that constructs the input data for evaluation.
-        See [Premade Estimators](https://tensorflow.org/api_guides/premade_estimators#create_input_functions)
-        for more information. The function should construct and return one of
-        the following:
-          * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
-            tuple (features, labels) with same constraints as below.
-          * A tuple (features, labels): Where features is a `Tensor` or a
-            dictionary of string feature name to `Tensor` and labels is a
-            `Tensor` or a dictionary of string label name to `Tensor`.
-
-      steps: Int. Positive number of steps for which to evaluate model. If
-        `None`, evaluates until `input_fn` raises an end-of-input exception.
-        See `Estimator.evaluate` for details.
-      name: String. Name of the evaluation if user needs to run multiple
-        evaluations on different data sets. Metrics for different evaluations
-        are saved in separate folders, and appear separately in tensorboard.
-      hooks: Iterable of `tf.train.SessionRunHook` objects to run
-        during evaluation.
-      exporters: Iterable of `Exporter`s, or a single one, or `None`.
-        `exporters` will be invoked after each evaluation.
-      start_delay_secs: Int. Start evaluating after waiting for this many
-        seconds.
-      throttle_secs: Int. Do not re-evaluate unless the last evaluation was
-        started at least this many seconds ago. Of course, evaluation does not
-        occur if no new checkpoints are available, hence, this is the minimum.
-
-    Returns:
-      A validated `EvalSpec` object.
-
-    Raises:
-      ValueError: If any of the input arguments is invalid.
-      TypeError: If any of the arguments is not of the expected type.
-    """
-    # Validate input_fn.
-    _validate_input_fn(input_fn)
-
-    # Validate steps.
-    if steps is not None and steps <= 0:
-      raise ValueError('Must specify steps > 0, given: {}'.format(steps))
-
-    # Validate name.
-    if name is not None and not isinstance(name, six.string_types):
-      raise TypeError('`name` must be string, given: {}'.format(name))
-
-    # Validate hooks.
-    hooks = _validate_hooks(hooks)
-
-    # Validate exporters.
-    exporters = _validate_exporters(exporters)
-
-    # Validate start_delay_secs.
-    if start_delay_secs < 0:
-      raise ValueError('Must specify start_delay_secs >= 0, given: {}'.format(
-          start_delay_secs))
-
-    # Validate throttle_secs.
-    if throttle_secs < 0:
-      raise ValueError(
-          'Must specify throttle_secs >= 0, given: {}'.format(throttle_secs))
-
-    return super(EvalSpec, cls).__new__(
-        cls,
-        input_fn=input_fn,
-        steps=steps,
-        name=name,
-        hooks=hooks,
-        exporters=exporters,
-        start_delay_secs=start_delay_secs,
-        throttle_secs=throttle_secs)
-
-
-@estimator_export('estimator.train_and_evaluate')
-def train_and_evaluate(estimator, train_spec, eval_spec):
-  """Train and evaluate the `estimator`.
-
-  This utility function trains, evaluates, and (optionally) exports the model by
-  using the given `estimator`. All training related specification is held in
-  `train_spec`, including training `input_fn` and training max steps, etc. All
-  evaluation and export related specification is held in `eval_spec`, including
-  evaluation `input_fn`, steps, etc.
-
-  This utility function provides consistent behavior for both local
-  (non-distributed) and distributed configurations. The default distribution
-  configuration is parameter server-based between-graph replication. For other
-  types of distribution configurations such as all-reduce training, please use
-  [DistributionStrategies](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/distribute).  # pylint: disable=line-too-long
-
-  Overfitting: In order to avoid overfitting, it is recommended to set up the
-  training `input_fn` to shuffle the training data properly.
-
-  Stop condition: In order to support both distributed and non-distributed
-  configuration reliably, the only supported stop condition for model
-  training is `train_spec.max_steps`. If `train_spec.max_steps` is `None`, the
-  model is trained forever. *Use with care* if model stop condition is
-  different. For example, assume that the model is expected to be trained with
-  one epoch of training data, and the training `input_fn` is configured to throw
-  `OutOfRangeError` after going through one epoch, which stops the
-  `Estimator.train`. For a three-training-worker distributed configuration, each
-  training worker is likely to go through the whole epoch independently. So, the
-  model will be trained with three epochs of training data instead of one epoch.
-
-  Example of local (non-distributed) training:
-
-  ```python
-  # Set up feature columns.
-  categorial_feature_a = categorial_column_with_hash_bucket(...)
-  categorial_feature_a_emb = embedding_column(
-      categorical_column=categorial_feature_a, ...)
-  ...  # other feature columns
-
-  estimator = DNNClassifier(
-      feature_columns=[categorial_feature_a_emb, ...],
-      hidden_units=[1024, 512, 256])
-
-  # Or set up the model directory
-  #   estimator = DNNClassifier(
-  #       config=tf.estimator.RunConfig(
-  #           model_dir='/my_model', save_summary_steps=100),
-  #       feature_columns=[categorial_feature_a_emb, ...],
-  #       hidden_units=[1024, 512, 256])
-
-  # Input pipeline for train and evaluate.
-  def train_input_fn(): # returns x, y
-    # please shuffle the data.
-    pass
-  def eval_input_fn(): # returns x, y
-    pass
-
-  train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=1000)
-  eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
-
-  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
-  ```
-  Note that in current implementation `estimator.evaluate` will be called
-  multiple times. This means that evaluation graph (including eval_input_fn)
-  will be re-created for each `evaluate` call. `estimator.train` will be called
-  only once.
-
-  Example of distributed training:
-
-  Regarding the example of distributed training, the code above can be used
-  without a change (Please do make sure that the `RunConfig.model_dir` for all
-  workers is set to the same directory, i.e., a shared file system all workers
-  can read and write). The only extra work to do is setting the environment
-  variable `TF_CONFIG` properly for each worker correspondingly.
-
-  Also see
-  [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed).
-
-  Setting environment variable depends on the platform. For example, on Linux,
-  it can be done as follows (`$` is the shell prompt):
-
-  ```
-  $ TF_CONFIG='<replace_with_real_content>' python train_model.py
-  ```
-
-  For the content in `TF_CONFIG`, assume that the training cluster spec looks
-  like:
-
-  ```
-  cluster = {"chief": ["host0:2222"],
-             "worker": ["host1:2222", "host2:2222", "host3:2222"],
-             "ps": ["host4:2222", "host5:2222"]}
-  ```
-
-  Example of `TF_CONFIG` for chief training worker (must have one and only one):
-
-  ```
-  # This should be a JSON string, which is set as environment variable. Usually
-  # the cluster manager handles that.
-  TF_CONFIG='{
-      "cluster": {
-          "chief": ["host0:2222"],
-          "worker": ["host1:2222", "host2:2222", "host3:2222"],
-          "ps": ["host4:2222", "host5:2222"]
-      },
-      "task": {"type": "chief", "index": 0}
-  }'
-  ```
-  Note that the chief worker also does the model training job, similar to other
-  non-chief training workers (see next paragraph). In addition to the model
-  training, it manages some extra work, e.g., checkpoint saving and restoring,
-  writing summaries, etc.
-
-  Example of `TF_CONFIG` for non-chief training worker (optional, could be
-  multiple):
-
-  ```
-  # This should be a JSON string, which is set as environment variable. Usually
-  # the cluster manager handles that.
-  TF_CONFIG='{
-      "cluster": {
-          "chief": ["host0:2222"],
-          "worker": ["host1:2222", "host2:2222", "host3:2222"],
-          "ps": ["host4:2222", "host5:2222"]
-      },
-      "task": {"type": "worker", "index": 0}
-  }'
-  ```
-  where the `task.index` should be set as 0, 1, 2, in this example, respectively
-  for non-chief training workers.
-
-  Example of `TF_CONFIG` for parameter server, aka ps (could be multiple):
-
-  ```
-  # This should be a JSON string, which is set as environment variable. Usually
-  # the cluster manager handles that.
-  TF_CONFIG='{
-      "cluster": {
-          "chief": ["host0:2222"],
-          "worker": ["host1:2222", "host2:2222", "host3:2222"],
-          "ps": ["host4:2222", "host5:2222"]
-      },
-      "task": {"type": "ps", "index": 0}
-  }'
-  ```
-  where the `task.index` should be set as 0 and 1, in this example, respectively
-  for parameter servers.
-
-  Example of `TF_CONFIG` for evaluator task. Evaluator is a special task that is
-  not part of the training cluster. There could be only one. It is used for
-  model evaluation.
-
-  ```
-  # This should be a JSON string, which is set as environment variable. Usually
-  # the cluster manager handles that.
-  TF_CONFIG='{
-      "cluster": {
-          "chief": ["host0:2222"],
-          "worker": ["host1:2222", "host2:2222", "host3:2222"],
-          "ps": ["host4:2222", "host5:2222"]
-      },
-      "task": {"type": "evaluator", "index": 0}
-  }'
-  ```
-
-  When `distribute` or `experimental_distribute.train_distribute` and
-  `experimental_distribute.remote_cluster` is set, this method will start a
-  client running on the current host which connects to the `remote_cluster` for
-  training and evaluation.
-
-  Args:
-    estimator: An `Estimator` instance to train and evaluate.
-    train_spec: A `TrainSpec` instance to specify the training specification.
-    eval_spec: A `EvalSpec` instance to specify the evaluation and export
-      specification.
-
-  Returns:
-    A tuple of the result of the `evaluate` call to the `Estimator` and the
-    export results using the specified `ExportStrategy`.
-    Currently, the return value is undefined for distributed training mode.
-
-  Raises:
-    ValueError: if environment variable `TF_CONFIG` is incorrectly set.
-  """
-  _assert_eval_spec(eval_spec)  # fail fast if eval_spec is invalid.
-
-  executor = _TrainingExecutor(
-      estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
-  config = estimator.config
-
-  # If `distribute_coordinator_mode` is set and running in distributed
-  # environment, we run `train_and_evaluate` via distribute coordinator.
-  if distribute_coordinator_training.should_run_distribute_coordinator(config):
-    logging.info('Running `train_and_evaluate` with Distribute Coordinator.')
-    distribute_coordinator_training.train_and_evaluate(
-        estimator, train_spec, eval_spec, _TrainingExecutor)
-    return
-
-  if (config.task_type == run_config_lib.TaskType.EVALUATOR and
-      config.task_id > 0):
-    raise ValueError(
-        'For distributed training, there can only be one `evaluator` task '
-        '(with task id 0).  Given task id {}'.format(config.task_id))
-
-  return executor.run()
-
-
-class _StopAtSecsHook(session_run_hook.SessionRunHook):
-  """Stops given secs after begin is called."""
-
-  def __init__(self, stop_after_secs):
-    self._stop_after_secs = stop_after_secs
-    self._start_time = None
-
-  def begin(self):
-    self._start_time = time.time()
-
-  def after_run(self, run_context, run_values):
-    del run_values
-    if time.time() - self._start_time >= self._stop_after_secs:
-      run_context.request_stop()
-
-
-class _NewCheckpointListenerForEvaluate(
-    basic_session_run_hooks.CheckpointSaverListener):
-  """A saver listener to run evaluate with every checkpoint."""
-
-  def __init__(self, evaluator, eval_throttle_secs, continuous_eval_listener):
-    self._evaluator = evaluator
-    self._eval_throttle_secs = eval_throttle_secs
-    self._continuous_eval_listener = continuous_eval_listener
-    self.eval_result, self.export_results = None, None
-
-  def begin(self):
-    self._timer = basic_session_run_hooks.SecondOrStepTimer(
-        every_secs=self._eval_throttle_secs)
-    self._is_first_run = True
-
-  def after_save(self, session, global_step_value):
-    del session  # unused; required by signature.
-    # skip first run model is not trained yet.
-    if self._is_first_run:
-      self._is_first_run = False
-      return
-
-    if not self._continuous_eval_listener.before_eval():
-      logging.info('Exiting training and evaluation loop, as requested by '
-                   '_ContinuousEvalListener.before_eval.')
-      return True
-    if self._timer.should_trigger_for_step(global_step_value):
-      self._evaluate(global_step_value)  # updates self.eval_result
-      if not self._continuous_eval_listener.after_eval(self.eval_result):
-        logging.info('Exiting evaluation, as requested by '
-                     '_ContinuousEvalListener.after_eval.')
-        return True
-    else:
-      # TODO(ispir): add remaining time in the log.
-      logging.info('Skip the current checkpoint eval due to throttle secs '
-                   '({} secs).'.format(self._eval_throttle_secs))
-
-  def end(self, session, global_step_value):
-    # Evaluate if the last step has not been evaluated, yet.
-    if global_step_value != self._timer.last_triggered_step():
-      if self._continuous_eval_listener.before_eval():
-        self._evaluate(global_step_value)
-        self._continuous_eval_listener.after_eval(self.eval_result)
-
-  def _evaluate(self, global_step_value):
-    self._timer.update_last_triggered_step(global_step_value)
-    self.eval_result, self.export_results = (
-        self._evaluator.evaluate_and_export())
-    if self.eval_result.status != _EvalStatus.EVALUATED:
-      #  This is unexpected; should never happen.
-      #  Training should always end with a new checkpoint.
-      raise RuntimeError('There was no new checkpoint after the training. '
-                         'Eval status: {}'.format(self.eval_result.status))
-
-
-class _TrainingExecutor(object):
-  """The executor to run `Estimator` training and evaluation.
-
-  This implementation supports both distributed and non-distributed (aka local)
-  training and evaluation based on the setting in `tf.estimator.RunConfig`.
-  """
-
-  def __init__(self,
-               estimator,
-               train_spec,
-               eval_spec,
-               train_hooks=None,
-               continuous_eval_listener=None):
-    if not isinstance(estimator, estimator_lib.Estimator):
-      raise TypeError(
-          '`estimator` must have type `tf.estimator.Estimator`. '
-          'Got: {}'.format(type(estimator)))
-    self._estimator = estimator
-
-    if not isinstance(train_spec, TrainSpec):
-      raise TypeError(
-          '`train_spec` must have type `tf.estimator.TrainSpec`. '
-          'Got: {}'.format(type(train_spec)))
-    self._train_spec = train_spec
-
-    if eval_spec and not isinstance(eval_spec, EvalSpec):
-      raise TypeError('`eval_spec` must be either `None` or have type '
-                      '`tf.estimator.EvalSpec`. Got: {}'.format(
-                          type(eval_spec)))
-    self._eval_spec = eval_spec
-
-    self._train_hooks = _validate_hooks(train_hooks)
-
-    if (continuous_eval_listener and
-        not isinstance(continuous_eval_listener, _ContinuousEvalListener)):
-      raise TypeError('`continuous_eval_listener` must have type '
-                      '`_ContinuousEvalListener`.')
-    self._continuous_eval_listener = (
-        continuous_eval_listener or _ContinuousEvalListener())
-
-  @property
-  def estimator(self):
-    return self._estimator
-
-  def run(self):
-    """Executes the run_foo for task type `foo`.
-
-    `_TrainingExecutor` predefines the procedure for task type 'chief',
-    'worker', 'ps', and 'evaluator'. For task type `foo`, the corresponding
-    procedure is `run_foo'. This `run` method invoke the procedure base on the
-    `RunConfig.task_type`.
-
-    Returns:
-      A tuple of the result of the `evaluate` call to the `Estimator` and the
-      export results using the specified `ExportStrategy`.
-      Currently undefined for distributed training mode.
-
-    Raises:
-      ValueError: if the estimator.config is mis-configured.
-    """
-    config = self._estimator.config
-
-    if (not config.cluster_spec and
-        config.task_type != run_config_lib.TaskType.EVALUATOR):
-      logging.info('Running training and evaluation locally (non-distributed).')
-      return self.run_local()
-
-    # Distributed case.
-    if not config.task_type:
-      # TODO(xiejw): Improve the error message about how to set the TF_CONFIG
-      # correctly.
-      raise ValueError(
-          '`estimator.config` must have task_type set. This usually means '
-          'TF_CONFIG environment is not set correctly.')
-
-    if config.task_type == 'local':
-      raise ValueError(
-          '`task.type` in TF_CONFIG cannot be `local`. Leaving `cluster` and '
-          '`task` properties in TF_CONFIG absent triggers train and evaluate '
-          '`Estimator` locally (non-distributed).')
-
-    # For task type foo, call executor.run_foo.
-    available_tasks = [
-        x for x in dir(self)
-        if x.startswith('run_') and x != 'run_local' and
-        callable(getattr(self, x))
-    ]
-    task_to_run = 'run_' + config.task_type
-    if task_to_run not in available_tasks:
-      raise ValueError(
-          'Task type {} is not supported. Supported task types are {}'.format(
-              config.task_type, [x[len('run_'):] for x in available_tasks]))
-    getattr(self, task_to_run)()
-
-  def run_chief(self):
-    """Runs task chief."""
-    # TODO(xiejw): To allow execution framework to add train hooks.
-    return self._start_distributed_training()
-
-  def run_worker(self):
-    """Runs task (training) worker."""
-    # TODO(xiejw): To allow execution framework to add train hooks.
-    return self._start_distributed_training()
-
-  def run_master(self):
-    """Runs task master."""
-    _assert_eval_spec(self._eval_spec)
-
-    # Final export signal: For any eval result with global_step >= train
-    # max_steps, the evaluator will send the final export signal. There is a
-    # small chance that the Estimator.train stopping logic sees a different
-    # global_step value (due to global step race condition and the fact the
-    # saver sees a larger value for checkpoint saving), which does not end
-    # the training. When the training ends, a new checkpoint is generated, which
-    # triggers the listener again. So, it could be the case the final export is
-    # triggered twice.
-    #
-    # But here, throttle_secs will skip the next intermediate checkpoint and,
-    # so, the double final export chance is very small.
-    evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec,
-                                             self._train_spec.max_steps)
-
-    # When the underlying `Estimator` object saves a new checkpoint, we would
-    # like this callback to be called so that evaluation and export can trigger.
-    saving_listeners = [
-        _NewCheckpointListenerForEvaluate(evaluator,
-                                          self._eval_spec.throttle_secs,
-                                          _ContinuousEvalListener())
-    ]
-    self._start_distributed_training(saving_listeners=saving_listeners)
-
-  def run_evaluator(self):
-    """Runs task evaluator."""
-    # TODO(xiejw): To allow execution framework to add continuous eval listener.
-    return self._start_continuous_evaluation()
-
-  def run_ps(self):
-    """Runs task parameter server (in training cluster spec)."""
-    config = self._estimator.config
-    server = self._start_std_server(config)
-    server.join()
-
-  def run_local(self):
-    """Runs training and evaluation locally (non-distributed)."""
-    _assert_eval_spec(self._eval_spec)
-
-    train_hooks = list(self._train_spec.hooks) + list(self._train_hooks)
-    logging.info('Start train and evaluate loop. The evaluate will happen '
-                 'after every checkpoint. Checkpoint frequency is determined '
-                 'based on RunConfig arguments: save_checkpoints_steps {} or '
-                 'save_checkpoints_secs {}.'.format(
-                     self._estimator.config.save_checkpoints_steps,
-                     self._estimator.config.save_checkpoints_secs))
-
-    evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec,
-                                             self._train_spec.max_steps)
-
-    listener_for_eval = _NewCheckpointListenerForEvaluate(
-        evaluator, self._eval_spec.throttle_secs,
-        self._continuous_eval_listener)
-    saving_listeners = [listener_for_eval]
-
-    self._estimator.train(
-        input_fn=self._train_spec.input_fn,
-        max_steps=self._train_spec.max_steps,
-        hooks=train_hooks,
-        saving_listeners=saving_listeners)
-
-    eval_result = listener_for_eval.eval_result or _EvalResult(
-        status=_EvalStatus.MISSING_CHECKPOINT)
-    return eval_result.metrics, listener_for_eval.export_results
-
-  def _start_std_server(self, config):
-    """Creates, starts, and returns a server_lib.Server."""
-    if (not config.cluster_spec or not config.task_type or
-        config.task_id is None):
-      raise RuntimeError('Could not start server; be sure to specify '
-                         'cluster_spec, task_type, and task in '
-                         'RunConfig or set the TF_CONFIG environment variable.')
-
-    if not config.master:
-      jobs = config.cluster_spec.jobs
-      if (len(jobs) == 1 and
-          len(config.cluster_spec.job_tasks(jobs[0])) == 1 and
-          config.task_type in _TRAINER_JOBS):
-        # For distributed training, config.master is empty if and only if it has
-        # a single node in the cluster spec. In this case, we should not start
-        # the server.
-        logging.info('Skip starting Tensorflow server as there is only one '
-                     'node in the cluster.')
-        return
-      else:
-        raise RuntimeError(
-            'Could not start server; be sure to specify master in '
-            'RunConfig or set the TF_CONFIG environment variable.')
-
-    logging.info('Start Tensorflow server.')
-
-    if config.session_config is None:
-      session_config = config_pb2.ConfigProto(log_device_placement=False)
-    else:
-      session_config = config_pb2.ConfigProto(
-          log_device_placement=False,
-          gpu_options=config.session_config.gpu_options)
-
-    server = server_lib.Server(
-        config.cluster_spec,
-        job_name=config.task_type,
-        task_index=config.task_id,
-        config=session_config,
-        start=False,
-        protocol=config.protocol)
-    server.start()
-    return server
-
-  def _start_distributed_training(self, saving_listeners=None):
-    """Calls `Estimator` train in a distributed setting."""
-    config = self._estimator.config
-
-    # Start in-process TensorFlow server if needed. It's important to start the
-    # server before we (optionally) sleep. Otherwise, the servers will wait to
-    # connect to each other before starting to train.
-    if not _is_google_env():
-      self._start_std_server(config)
-
-    # Delay worker to start. For asynchronous training, this usually helps model
-    # to converge faster.  Chief starts the training immediately, so, worker
-    # with task id x (0-based) should wait (x+1) * _DELAY_SECS_PER_WORKER.
-    start_delay_secs = 0
-    if config.task_type == run_config_lib.TaskType.WORKER:
-      # TODO(xiejw): Replace the hard code logic (task_id + 1) with unique id in
-      # training cluster.
-      start_delay_secs = min(_MAX_DELAY_SECS,
-                             (config.task_id + 1) * _DELAY_SECS_PER_WORKER)
-    if start_delay_secs > 0:
-      logging.info('Waiting %d secs before starting training.',
-                   start_delay_secs)
-      time.sleep(start_delay_secs)
-
-    self._estimator.train(
-        input_fn=self._train_spec.input_fn,
-        max_steps=self._train_spec.max_steps,
-        hooks=list(self._train_spec.hooks) + list(self._train_hooks),
-        saving_listeners=saving_listeners)
-
-  def _start_continuous_evaluation(self):
-    """Repeatedly calls `Estimator` evaluate and export until training ends."""
-
-    _assert_eval_spec(self._eval_spec)
-
-    start_delay_secs = self._eval_spec.start_delay_secs
-    if start_delay_secs:
-      logging.info('Waiting %f secs before starting eval.', start_delay_secs)
-      time.sleep(start_delay_secs)
-
-    latest_eval_result = None
-    evaluator = _TrainingExecutor._Evaluator(self._estimator, self._eval_spec,
-                                             self._train_spec.max_steps)
-
-    should_early_stop = False
-    while not should_early_stop:
-      if (latest_eval_result and
-          latest_eval_result.status == _EvalStatus.EVALUATED):
-        global_step = latest_eval_result.metrics.get(ops.GraphKeys.GLOBAL_STEP)
-        if (global_step and self._train_spec.max_steps and
-            global_step >= self._train_spec.max_steps):
-          logging.info(
-              'Exiting evaluation, global_step=%s >= train max_steps=%s',
-              global_step, self._train_spec.max_steps)
-          return
-
-      latest_eval_result, should_early_stop = self._execute_evaluator_once(
-          evaluator, self._continuous_eval_listener,
-          self._eval_spec.throttle_secs)
-
-  def _execute_evaluator_once(self, evaluator, continuous_eval_listener,
-                              throttle_secs):
-    """Executes the `evaluator`."""
-
-    _assert_eval_spec(self._eval_spec)
-
-    start = time.time()
-
-    eval_result = None
-    should_early_stop = False
-
-    if not continuous_eval_listener.before_eval():
-      logging.info('Exiting evaluation, as requested by '
-                   '_ContinuousEvalListener.before_eval.')
-      should_early_stop = True
-      return (eval_result, should_early_stop)
-
-    # Final export signal: For any eval result with global_step >= train
-    # max_steps, the evaluator will send the final export signal. The next
-    # iteration of while loop will end the continuous eval as the stopping
-    # condition is satisfied (both checks use the same global_step value,
-    # i.e., no race condition)
-    eval_result, _ = evaluator.evaluate_and_export()
-
-    if not self._continuous_eval_listener.after_eval(eval_result):
-      logging.info('Exiting evaluation, as requested by '
-                   '_ContinuousEvalListener.after_eval.')
-      should_early_stop = True
-      return (eval_result, should_early_stop)
-
-    # Throttle if necessary.
-    elapsed_time = time.time() - start
-    difference = throttle_secs - elapsed_time
-    if difference > 0:
-      logging.info('Waiting %f secs before starting next eval run.', difference)
-      time.sleep(difference)
-    elif (throttle_secs == 0 and
-          eval_result.status != _EvalStatus.EVALUATED):
-      # Prints a user-actionable warning to avoid unnecessary load on evaluator.
-      logging.warning(
-          'EvalSpec.throttle_secs is set as 0. This might overload the job '
-          'before finding (next) new checkpoint. Please consider to increase '
-          'it.')
-
-    return (eval_result, should_early_stop)
-
-  class _Evaluator(object):
-    """A helper class to call `Estimator.evaluate` and export model."""
-
-    def __init__(self, estimator, eval_spec, max_training_steps):
-      self._estimator = estimator
-
-      _assert_eval_spec(eval_spec)
-      self._eval_spec = eval_spec
-
-      self._is_final_export_triggered = False
-      self._previous_ckpt_path = None
-      self._last_warning_time = 0
-      self._max_training_steps = max_training_steps
-
-    @property
-    def is_final_export_triggered(self):
-      return self._is_final_export_triggered
-
-    def evaluate_and_export(self):
-      """Evaluate and (maybe) export the current model.
-
-      Returns:
-        A tuple of `EvalResult` instance and the export results.
-
-      Raises:
-        RuntimeError: for any unexpected internal error.
-        TypeError: if evaluation result has wrong type.
-      """
-      latest_ckpt_path = self._estimator.latest_checkpoint()
-      if not latest_ckpt_path:
-        self._log_err_msg('Estimator is not trained yet. Will start an '
-                          'evaluation when a checkpoint is ready.')
-        return _EvalResult(status=_EvalStatus.MISSING_CHECKPOINT), []
-
-      if latest_ckpt_path == self._previous_ckpt_path:
-        self._log_err_msg(
-            'No new checkpoint ready for evaluation. Skip the current '
-            'evaluation pass as evaluation results are expected to be same '
-            'for the same checkpoint.')
-        return _EvalResult(status=_EvalStatus.NO_NEW_CHECKPOINT), []
-
-      metrics = self._estimator.evaluate(
-          input_fn=self._eval_spec.input_fn,
-          steps=self._eval_spec.steps,
-          name=self._eval_spec.name,
-          checkpoint_path=latest_ckpt_path,
-          hooks=self._eval_spec.hooks)
-
-      # _EvalResult validates the metrics.
-      eval_result = _EvalResult(
-          status=_EvalStatus.EVALUATED,
-          metrics=metrics,
-          checkpoint_path=latest_ckpt_path)
-
-      is_the_final_export = (
-          eval_result.metrics[ops.GraphKeys.GLOBAL_STEP] >=
-          self._max_training_steps if self._max_training_steps else False)
-      export_results = self._export_eval_result(eval_result,
-                                                is_the_final_export)
-
-      if is_the_final_export:
-        logging.debug('Calling exporter with the `is_the_final_export=True`.')
-        self._is_final_export_triggered = True
-
-      self._last_warning_time = 0
-      self._previous_ckpt_path = latest_ckpt_path
-      return eval_result, export_results
-
-    def _log_err_msg(self, message):
-      """Prints warning `message` every 10 mins."""
-      current_time = time.time()
-      if current_time - self._last_warning_time > 600:
-        logging.warning(message)
-        self._last_warning_time = current_time
-
-    def _export_eval_result(self, eval_result, is_the_final_export):
-      """Export `eval_result` according to exporters in `EvalSpec`."""
-      export_dir_base = os.path.join(
-          compat.as_str_any(self._estimator.model_dir),
-          compat.as_str_any('export'))
-
-      export_results = []
-      for exporter in self._eval_spec.exporters:
-        export_results.append(
-            exporter.export(
-                estimator=self._estimator,
-                export_path=os.path.join(
-                    compat.as_str_any(export_dir_base),
-                    compat.as_str_any(exporter.name)),
-                checkpoint_path=eval_result.checkpoint_path,
-                eval_result=eval_result.metrics,
-                is_the_final_export=is_the_final_export))
-      return export_results
-
-
-class _EvalStatus(object):
-  """The status of an evaluation event.
-
-  For local training and evaluation, the status can only be `EVALUATED` as
-  `Estimator.train` always generates a new checkpoint.
-
-  For distributed training and evaluation, a separated evaluator keeps looking
-  for new checkpoint. So, multiple situations might occur:
-
-  - EVALUATED: A new checkpoint is found since last evaluation.
-      `Estimator.evaluate` will be invoked.
-  - MISSING_CHECKPOINT: No checkpoint can be found. Typically, this means
-      the trainer has not yet produced any checkpoint.
-  - NO_NEW_CHECKPOINT: No new checkpoint can be found since last evaluation.
-      Typically, this means the trainer has not yet produced any new checkpoint.
-  """
-
-  EVALUATED = 'evaluated'
-  MISSING_CHECKPOINT = 'missing checkpoint'
-  NO_NEW_CHECKPOINT = 'no new checkpoint'
-
-
-class _EvalResult(
-    collections.namedtuple('EvalResult',
-                           ['status', 'metrics', 'checkpoint_path'])):
-  """_EvalResult holds the result of an evaluation event."""
-
-  def __new__(cls, status, metrics=None, checkpoint_path=None):
-    """Creates a validated `_EvalResult`.
-
-    Args:
-      status: See `_EvalStatus`.
-      metrics: The evaluation results returned by `Estimator.evaluate`. Only set
-          if status is `EVALUATED`.
-      checkpoint_path: The corresponding checkpoint path for the `metrics`. Only
-          set if status is `EVALUATED`.
-    Returns:
-      A validated `_EvalResult` object.
-
-    Raises:
-      ValueError: If validation fails.
-      TypeError: If any of the arguments is not the expected type.
-    """
-
-    if status != _EvalStatus.EVALUATED:
-      if metrics:
-        raise ValueError(
-            'metrics must be `None` if status is not {}; got status {},'
-            ' metrics {}'.format(_EvalStatus.EVALUATED, status, metrics))
-      if checkpoint_path:
-        raise ValueError(
-            'checkpoint must be `None` if status is not {}; got status {}, '
-            'checkpoint_path {}'.format(_EvalStatus.EVALUATED, status,
-                                        checkpoint_path))
-      return super(_EvalResult, cls).__new__(cls, status, metrics,
-                                             checkpoint_path)
-
-    # Now, evaluated case.
-    assert status == _EvalStatus.EVALUATED
-
-    # Validates metrics.
-    if not metrics:
-      raise ValueError(
-          'Internal error: `Estimator.evaluate` should never return empty '
-          'metrics.')
-    if not isinstance(metrics, dict):
-      raise TypeError(
-          '`Estimator.evaluate` should return dict. Given {}.'.format(
-              type(metrics)))
-    if ops.GraphKeys.GLOBAL_STEP not in metrics:
-      raise ValueError(
-          'Internal error: `Estimator.evaluate` result should have '
-          '`global_step` in result. Given {}'.format(metrics))
-
-    # Validates checkpoint_path.
-    if not checkpoint_path:
-      raise ValueError(
-          'Internal error: `checkpoint_path` should never be empty.')
-
-    return super(_EvalResult, cls).__new__(cls, status, metrics,
-                                           checkpoint_path)
-
-
-class _ContinuousEvalListener(object):
-  """Interface for listeners that take action before or after evaluation."""
-
-  def before_eval(self):
-    """Called before evaluation.
-
-    Returns:
-      `False` if you want to skip the current evaluation and early stop the
-      continuous evaluation; `True` otherwise.
-    """
-    return True
-
-  def after_eval(self, eval_result):
-    """Called after the evaluation is executed.
-
-    Args:
-      eval_result: An `_EvalResult` instance.
-
-    Returns:
-      False if you want to early stop continuous evaluation; `True` otherwise.
-    """
-    del eval_result
-    return True
+from tensorflow_estimator.python.estimator import training
 
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+training.__all__ = [s for s in dir(training) if not s.startswith('__')]
 
-def _assert_eval_spec(eval_spec):
-  """Raise error if `eval_spec` is not of the right type."""
-  if not isinstance(eval_spec, EvalSpec):
-    raise TypeError('`eval_spec` must have type `tf.estimator.EvalSpec`. '
-                    'Got: {}'.format(type(eval_spec)))
+from tensorflow_estimator.python.estimator.training import *
diff --git a/tensorflow/python/estimator/training_test.py b/tensorflow/python/estimator/training_test.py
deleted file mode 100644
index 7d46917a6f60da52fffe274f36a5c2954d03e560..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/training_test.py
+++ /dev/null
@@ -1,2198 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tests for training.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import glob
-import json
-import os
-import random
-import shutil
-import tempfile
-import time
-
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import exporter as exporter_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.estimator import training
-from tensorflow.python.estimator.canned import dnn
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export as export_lib
-from tensorflow.python.feature_column import feature_column
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import metrics as metrics_lib
-from tensorflow.python.ops import state_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.summary import summary_iterator
-from tensorflow.python.summary.writer import writer_cache
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import monitored_session
-from tensorflow.python.training import server_lib
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util
-from tensorflow.python.util import compat
-
-_DEFAULT_EVAL_STEPS = 100
-_DEFAULT_EVAL_DELAY_SECS = 120
-_DEFAULT_EVAL_THROTTLE_SECS = 600
-_DELAY_SECS_PER_WORKER = 5
-_GLOBAL_STEP_KEY = ops.GraphKeys.GLOBAL_STEP
-_INVALID_INPUT_FN_MSG = '`input_fn` must be callable'
-_INVALID_HOOK_MSG = 'All hooks must be `SessionRunHook` instances'
-_INVALID_MAX_STEPS_MSG = 'Must specify max_steps > 0'
-_INVALID_STEPS_MSG = 'Must specify steps > 0'
-_INVALID_NAME_MSG = '`name` must be string'
-_INVALID_EVAL_DELAY_SECS_MSG = 'Must specify start_delay_secs >= 0'
-_INVALID_EVAL_THROTTLE_SECS_MSG = 'Must specify throttle_secs >= 0'
-_INVALID_ESTIMATOR_MSG = '`estimator` must have type `tf.estimator.Estimator`'
-_STALE_CHECKPOINT_MSG = 'There was no new checkpoint after the training.'
-_INVALID_EXPORTER_MSG = '`exporters` must be an Exporter'
-_INVALID_EXPORTER_NAME_TYPE_MSG = 'An Exporter must have a string name'
-_DUPLICATE_EXPORTER_NAMES_MSG = '`exporters` must have unique names.'
-_NONE_EXPORTER_NAME_MSG = (
-    'An Exporter cannot have a name that is `None` or empty.')
-_INVALID_TRAIN_SPEC_MSG = '`train_spec` must have type `tf.estimator.TrainSpec`'
-_INVALID_EVAL_SPEC_MSG = '`eval_spec` must have type `tf.estimator.EvalSpec`'
-_EVAL_SPEC_OR_NONE_MSG = (
-    '`eval_spec` must be either `None` or have type `tf.estimator.EvalSpec`')
-_INVALID_EVAL_LISTENER_MSG = 'must have type `_ContinuousEvalListener`'
-_INVALID_CONFIG_FOR_STD_SERVER_MSG = 'Could not start server; .*TF_CONFIG'
-_INVALID_LOCAL_TASK_WITH_CLUSTER = '`task.type` in TF_CONFIG cannot be `local`'
-_INVALID_TASK_TYPE = '`estimator.config` must have task_type set.'
-_INPROPER_THROTTL_SECS = (
-    'EvalSpec.throttle_secs is set as 0.*Please consider to increase')
-
-# The message should NOT have 'local' word as part of it. As (?!word) is looking
-# ahead, so, the $ (ending) check is required; otherwise, it will match
-# partially and return successuful.
-_INVALID_TASK_TO_RUN = (
-    'Task type .* is not supported. Supported task types are ((?!local).)*$')
-_INVALID_EMPTY_EVAL_RESULT_ERR = (
-    'Internal error: `Estimator.evaluate` should never return empty metrics')
-_INVALID_EVAL_RESULT_TYPE_ERR = '`Estimator.evaluate` should return dict.'
-_MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR = (
-    'Internal error: `Estimator.evaluate` result should have `global_step`')
-_INVALID_EVAL_TASK_ID_ERR = (
-    'there can only be one `evaluator` task .*with task id 0')
-
-_TF_CONFIG_FOR_CHIEF = {
-    'cluster': {
-        run_config_lib.TaskType.CHIEF: ['host0:0'],
-        run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-        run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4']
-    },
-    'task': {
-        'type': run_config_lib.TaskType.CHIEF,
-        'index': 0
-    }
-}
-
-_TF_CONFIG_FOR_MASTER = {
-    'cluster': {
-        run_config_lib.TaskType.MASTER: ['host0:0'],
-        run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-        run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4']
-    },
-    'task': {
-        'type': run_config_lib.TaskType.MASTER,
-        'index': 0
-    }
-}
-
-_TF_CONFIG_FOR_WORKER = {
-    'cluster': {
-        run_config_lib.TaskType.CHIEF: ['host0:0'],
-        run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-        run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4']
-    },
-    'task': {
-        'type': run_config_lib.TaskType.WORKER,
-        'index': 1
-    }
-}
-
-_TF_CONFIG_FOR_PS = {
-    'cluster': {
-        run_config_lib.TaskType.CHIEF: ['host0:0'],
-        run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-        run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4']
-    },
-    'task': {
-        'type': run_config_lib.TaskType.PS,
-        'index': 1
-    }
-}
-
-_TF_CONFIG_FOR_EVALUATOR = {
-    'cluster': {
-        run_config_lib.TaskType.CHIEF: ['host0:0'],
-        run_config_lib.TaskType.PS: ['host1:1', 'host2:2'],
-        run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4']
-    },
-    'task': {
-        'type': run_config_lib.TaskType.EVALUATOR,
-        'index': 0
-    }
-}
-
-_TF_CONFIG_FOR_GOOGLE = {'environment': 'google'}
-
-
-class _FakeHook(session_run_hook.SessionRunHook):
-  """Fake implementation of `SessionRunHook`."""
-
-
-class _InvalidHook(object):
-  """Invalid hook (not a subclass of `SessionRunHook`)."""
-
-
-def _create_exporter(name):
-  class FakeExporter(exporter_lib.Exporter):
-
-    def __init__(self, name):
-      self._name = name
-
-    @property
-    def name(self):
-      return self._name
-
-    def export(self, *args, **kwargs):
-      del args, kwargs
-
-  return FakeExporter(name=name)
-
-
-def _create_run_config_with_cluster_spec(tf_config):
-  with test.mock.patch.dict('os.environ', {'TF_CONFIG': json.dumps(tf_config)}):
-    return run_config_lib.RunConfig()
-
-
-class TrainSpecTest(test.TestCase):
-  """Tests TrainSpec."""
-
-  def testRequiredArgumentsSet(self):
-    """Tests that no errors are raised when all required arguments are set."""
-    spec = training.TrainSpec(input_fn=lambda: 1)
-    self.assertEqual(1, spec.input_fn())
-    self.assertIsNone(spec.max_steps)
-    self.assertEqual(0, len(spec.hooks))
-
-  def testAllArgumentsSet(self):
-    """Tests that no errors are raised when all arguments are set."""
-    hooks = [_FakeHook()]
-    spec = training.TrainSpec(input_fn=lambda: 1, max_steps=2, hooks=hooks)
-    self.assertEqual(1, spec.input_fn())
-    self.assertEqual(2, spec.max_steps)
-    self.assertEqual(tuple(hooks), spec.hooks)
-
-  def testInvalidInputFn(self):
-    with self.assertRaisesRegexp(TypeError, _INVALID_INPUT_FN_MSG):
-      training.TrainSpec(input_fn='invalid')
-
-  def testInvalidMaxStep(self):
-    with self.assertRaisesRegexp(ValueError, _INVALID_MAX_STEPS_MSG):
-      training.TrainSpec(input_fn=lambda: 1, max_steps=0)
-
-  def testInvalidHook(self):
-    with self.assertRaisesRegexp(TypeError, _INVALID_HOOK_MSG):
-      training.TrainSpec(input_fn=lambda: 1, hooks=[_InvalidHook()])
-
-
-class EvalSpecTest(test.TestCase):
-  """Tests EvalSpec."""
-
-  def testRequiredArgumentsSet(self):
-    """Tests that no errors are raised when all required arguments are set."""
-    spec = training.EvalSpec(input_fn=lambda: 1)
-    self.assertEqual(1, spec.input_fn())
-    self.assertEqual(_DEFAULT_EVAL_STEPS, spec.steps)
-    self.assertIsNone(spec.name)
-    self.assertEqual(0, len(spec.hooks))
-    self.assertEqual(0, len(spec.exporters))
-    self.assertEqual(_DEFAULT_EVAL_DELAY_SECS, spec.start_delay_secs)
-    self.assertEqual(_DEFAULT_EVAL_THROTTLE_SECS, spec.throttle_secs)
-
-  def testAllArgumentsSet(self):
-    """Tests that no errors are raised when all arguments are set."""
-    hooks = [_FakeHook()]
-    exporter = _create_exporter('a')
-
-    spec = training.EvalSpec(
-        input_fn=lambda: 1,
-        steps=2,
-        name='name',
-        hooks=hooks,
-        exporters=exporter,
-        start_delay_secs=3,
-        throttle_secs=4)
-    self.assertEqual(1, spec.input_fn())
-    self.assertEqual(2, spec.steps)
-    self.assertEqual('name', spec.name)
-    self.assertEqual(tuple(hooks), spec.hooks)
-    self.assertEqual((exporter,), spec.exporters)
-    self.assertEqual(3, spec.start_delay_secs)
-    self.assertEqual(4, spec.throttle_secs)
-
-  def testListOfExporters(self):
-    """Tests that no errors are raised with multiple exporters."""
-    exporters = [_create_exporter('a'), _create_exporter('b')]
-
-    spec = training.EvalSpec(input_fn=lambda: 1, exporters=exporters)
-    self.assertEqual(1, spec.input_fn())
-    self.assertEqual(tuple(exporters), spec.exporters)
-
-  def testInvalidInputFn(self):
-    with self.assertRaisesRegexp(TypeError, _INVALID_INPUT_FN_MSG):
-      training.EvalSpec(input_fn='invalid')
-
-  def testInvalidMaxStep(self):
-    with self.assertRaisesRegexp(ValueError, _INVALID_STEPS_MSG):
-      training.EvalSpec(input_fn=lambda: 1, steps=0)
-
-  def testInvalidName(self):
-    with self.assertRaisesRegexp(TypeError, _INVALID_NAME_MSG):
-      training.EvalSpec(input_fn=lambda: 1, name=123)
-
-  def testInvalidHook(self):
-    with self.assertRaisesRegexp(TypeError, _INVALID_HOOK_MSG):
-      training.EvalSpec(input_fn=lambda: 1, hooks=[_InvalidHook()])
-
-  def testInvalidDelaySecs(self):
-    with self.assertRaisesRegexp(ValueError, _INVALID_EVAL_DELAY_SECS_MSG):
-      training.EvalSpec(input_fn=lambda: 1, start_delay_secs=-1)
-
-  def testInvalidThrottleSecs(self):
-    with self.assertRaisesRegexp(ValueError, _INVALID_EVAL_THROTTLE_SECS_MSG):
-      training.EvalSpec(input_fn=lambda: 1, throttle_secs=-1)
-
-  def testInvalidTypeOfListOfExporters(self):
-    with self.assertRaisesRegexp(TypeError, _INVALID_EXPORTER_MSG):
-      training.EvalSpec(
-          input_fn=lambda: 1, exporters=[_create_exporter('a'),
-                                         _FakeHook()])
-
-  def testInvalidTypeOfIndividualExporter(self):
-    with self.assertRaisesRegexp(TypeError, _INVALID_EXPORTER_MSG):
-      training.EvalSpec(input_fn=lambda: 1, exporters=_FakeHook())
-
-  def testInvalidTypeOfExporterName(self):
-    with self.assertRaisesRegexp(ValueError, _INVALID_EXPORTER_NAME_TYPE_MSG):
-      training.EvalSpec(input_fn=lambda: 1,
-                        exporters=_create_exporter(name=123))
-
-  def testMultipleExportersWithTheSameName(self):
-    with self.assertRaisesRegexp(ValueError, _DUPLICATE_EXPORTER_NAMES_MSG):
-      training.EvalSpec(
-          input_fn=lambda: 1,
-          exporters=[_create_exporter('a'), _create_exporter('a')])
-
-  def testMultipleExportersAndOneWithoutAName(self):
-    with self.assertRaisesRegexp(ValueError, _NONE_EXPORTER_NAME_MSG):
-      training.EvalSpec(
-          input_fn=lambda: 1,
-          exporters=[_create_exporter('a'),
-                     _create_exporter(None)])
-
-  def testSingleExporterWithoutAName(self):
-    with self.assertRaisesRegexp(ValueError, _NONE_EXPORTER_NAME_MSG):
-      training.EvalSpec(input_fn=lambda: 1, exporters=_create_exporter(None))
-
-
-class TrainAndEvaluateTest(test.TestCase):
-
-  def test_run_task(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
-      mock_executor_instance = test.mock.Mock()
-      mock_executor.return_value = mock_executor_instance
-      training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
-      mock_executor.assert_called_with(estimator=mock_est,
-                                       train_spec=mock_train_spec,
-                                       eval_spec=mock_eval_spec)
-      self.assertTrue(mock_executor_instance.run.called)
-
-  def test_error_out_if_evaluator_task_id_is_non_zero(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-        },
-        'task': {
-            'type': run_config_lib.TaskType.EVALUATOR,
-            'index': 1
-        }
-    }
-
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    with self.assertRaisesRegexp(ValueError, _INVALID_EVAL_TASK_ID_ERR):
-      training.train_and_evaluate(mock_est, mock_train_spec, mock_eval_spec)
-
-  def test_invalid_estimator(self):
-    invalid_estimator = object()
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    with self.assertRaisesRegexp(TypeError, _INVALID_ESTIMATOR_MSG):
-      training.train_and_evaluate(invalid_estimator, mock_train_spec,
-                                  mock_eval_spec)
-
-  def test_fail_fast_if_invalid_eval_spec(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    invalid_eval_spec = object()
-
-    with test.mock.patch.object(training, '_TrainingExecutor') as mock_executor:
-      with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
-        training.train_and_evaluate(mock_est, mock_train_spec,
-                                    invalid_eval_spec)
-
-      mock_executor.assert_not_called()
-
-
-class TrainingExecutorConstructorTest(test.TestCase):
-  """Tests constructor of _TrainingExecutor."""
-
-  def test_required_arguments_set(self):
-    estimator = estimator_lib.Estimator(model_fn=lambda features: features)
-    train_spec = training.TrainSpec(input_fn=lambda: 1)
-    eval_spec = training.EvalSpec(input_fn=lambda: 1)
-
-    executor = training._TrainingExecutor(estimator, train_spec, eval_spec)
-    self.assertEqual(estimator, executor.estimator)
-
-  def test_invalid_estimator(self):
-    invalid_estimator = object()
-    train_spec = training.TrainSpec(input_fn=lambda: 1)
-    eval_spec = training.EvalSpec(input_fn=lambda: 1)
-
-    with self.assertRaisesRegexp(TypeError, _INVALID_ESTIMATOR_MSG):
-      training._TrainingExecutor(invalid_estimator, train_spec, eval_spec)
-
-  def test_invalid_train_spec(self):
-    estimator = estimator_lib.Estimator(model_fn=lambda features: features)
-    invalid_train_spec = object()
-    eval_spec = training.EvalSpec(input_fn=lambda: 1)
-
-    with self.assertRaisesRegexp(TypeError, _INVALID_TRAIN_SPEC_MSG):
-      training._TrainingExecutor(estimator, invalid_train_spec, eval_spec)
-
-  def test_invalid_eval_spec(self):
-    estimator = estimator_lib.Estimator(model_fn=lambda features: features)
-    train_spec = training.TrainSpec(input_fn=lambda: 1)
-    invalid_eval_spec = object()
-
-    with self.assertRaisesRegexp(TypeError, _EVAL_SPEC_OR_NONE_MSG):
-      training._TrainingExecutor(estimator, train_spec, invalid_eval_spec)
-
-  def test_eval_spec_none(self):
-    estimator = estimator_lib.Estimator(model_fn=lambda features: features)
-    train_spec = training.TrainSpec(input_fn=lambda: 1)
-    eval_spec = None
-
-    # Tests that no error is raised.
-    training._TrainingExecutor(estimator, train_spec, eval_spec)
-
-  def test_invalid_train_hooks(self):
-    estimator = estimator_lib.Estimator(model_fn=lambda features: features)
-    train_spec = training.TrainSpec(input_fn=lambda: 1)
-    eval_spec = training.EvalSpec(input_fn=lambda: 1)
-    invalid_train_hooks = [object()]
-
-    with self.assertRaisesRegexp(TypeError, _INVALID_HOOK_MSG):
-      training._TrainingExecutor(
-          estimator, train_spec, eval_spec, train_hooks=invalid_train_hooks)
-
-  def test_invalid_continuous_eval_listener(self):
-    estimator = estimator_lib.Estimator(model_fn=lambda features: features)
-    train_spec = training.TrainSpec(input_fn=lambda: 1)
-    eval_spec = training.EvalSpec(input_fn=lambda: 1)
-    invalid_continuous_eval_listener = object()
-
-    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_LISTENER_MSG):
-      training._TrainingExecutor(
-          estimator,
-          train_spec,
-          eval_spec,
-          continuous_eval_listener=invalid_continuous_eval_listener)
-
-
-class _TrainingExecutorTrainingTest(object):
-  """Tests training of _TrainingExecutor."""
-
-  def __init__(self, run_config):
-    self._run_config = run_config
-
-  def _run_task(self, executor):
-    # We should not call executor.run as the test here is intended to test
-    # run_foo explicitly (foo is the task type).
-    return getattr(executor, 'run_' + self._run_config.task_type)()
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_train_with_train_spec(self, mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = self._run_config
-    train_spec = training.TrainSpec(
-        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-    mock_server_instance = mock_server.return_value
-
-    executor = training._TrainingExecutor(mock_est, train_spec, mock_eval_spec)
-    self._run_task(executor)
-
-    mock_server.assert_called_with(
-        mock_est.config.cluster_spec,
-        job_name=mock_est.config.task_type,
-        task_index=mock_est.config.task_id,
-        config=test.mock.ANY,
-        protocol=None,
-        start=False)
-
-    self.assertTrue(mock_server_instance.start.called)
-
-    mock_est.train.assert_called_with(
-        input_fn=train_spec.input_fn,
-        max_steps=train_spec.max_steps,
-        hooks=list(train_spec.hooks),
-        saving_listeners=test.mock.ANY)
-    mock_est.evaluate.assert_not_called()
-    mock_est.export_savedmodel.assert_not_called()
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_train_with_no_eval_spec(self, mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = self._run_config
-    train_spec = training.TrainSpec(
-        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
-    eval_spec = None
-    mock_server_instance = mock_server.return_value
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    self._run_task(executor)
-
-    mock_server.assert_called_with(
-        mock_est.config.cluster_spec,
-        job_name=mock_est.config.task_type,
-        task_index=mock_est.config.task_id,
-        config=test.mock.ANY,
-        protocol=None,
-        start=False)
-
-    self.assertTrue(mock_server_instance.start.called)
-
-    mock_est.train.assert_called_with(
-        input_fn=train_spec.input_fn,
-        max_steps=train_spec.max_steps,
-        hooks=list(train_spec.hooks),
-        saving_listeners=test.mock.ANY)
-    mock_est.evaluate.assert_not_called()
-    mock_est.export_savedmodel.assert_not_called()
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_train_with_train_hooks(self, unused_mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = self._run_config
-    train_spec = training.TrainSpec(
-        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-    extra_hooks = [_FakeHook()]
-
-    executor = training._TrainingExecutor(
-        mock_est, train_spec, mock_eval_spec, train_hooks=extra_hooks)
-    self._run_task(executor)
-
-    mock_est.train.assert_called_with(
-        input_fn=train_spec.input_fn,
-        max_steps=train_spec.max_steps,
-        hooks=list(train_spec.hooks) + extra_hooks,
-        saving_listeners=test.mock.ANY)
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_no_server_startup_in_google(self, mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-    tf_config = {'TF_CONFIG': json.dumps(_TF_CONFIG_FOR_GOOGLE)}
-    with test.mock.patch.dict('os.environ', tf_config):
-      self._run_task(executor)
-      mock_server.assert_not_called()
-
-  def test_fail_with_empty_cluster_spec(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = None
-    mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'worker'
-    mock_est.config.task_id = 2
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
-                                                mock_eval_spec))
-
-  def test_fail_with_empty_master(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec(
-        {'worker': ['dummy', 'dummy1']})
-    mock_est.config.master = ''
-    mock_est.config.task_type = 'worker'
-    mock_est.config.task_id = 2
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
-                                                mock_eval_spec))
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_single_worker_node_with_empty_tf_master(
-      self, mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    # Single node cluster.
-    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
-    mock_est.config.master = ''
-    mock_est.config.task_type = 'worker'
-    mock_est.config.task_id = 2
-
-    self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
-                                              mock_eval_spec))
-    self.assertTrue(mock_est.train.called)
-    mock_server.assert_not_called()
-
-  def test_fail_with_empty_task_type(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
-    mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = ''
-    mock_est.config.task_id = 2
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
-                                                mock_eval_spec))
-
-  def test_fail_with_none_task_id(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
-    mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'worker'
-    mock_est.config.task_id = None
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
-                                                mock_eval_spec))
-
-
-class TrainingExecutorRunWorkerTest(_TrainingExecutorTrainingTest,
-                                    test.TestCase):
-  """Tests run_worker of _TrainingExecutor."""
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    _TrainingExecutorTrainingTest.__init__(
-        self,
-        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_WORKER))
-
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_delay_for_worker(self, _):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-
-    expected_secs = (self._run_config.task_id + 1) * _DELAY_SECS_PER_WORKER
-    with test.mock.patch.object(time, 'sleep') as mock_sleep:
-      mock_sleep.side_effect = lambda s: self.assertEqual(expected_secs, s)
-      self._run_task(executor)
-      self.assertTrue(mock_sleep.called)
-
-
-class TrainingExecutorRunChiefTest(_TrainingExecutorTrainingTest,
-                                   test.TestCase):
-  """Tests run_chief of _TrainingExecutor."""
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    test.TestCase.__init__(self, methodName)
-    _TrainingExecutorTrainingTest.__init__(
-        self,
-        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_CHIEF))
-
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_no_delay_for_chief(self, _):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-
-    with test.mock.patch.object(time, 'sleep') as mock_sleep:
-      self._run_task(executor)
-      mock_sleep.assert_not_called()
-
-
-class TrainingExecutorRunMasterTest(test.TestCase):
-  """Tests run_chief of _TrainingExecutor."""
-
-  def setUp(self):
-    self._run_config = _create_run_config_with_cluster_spec(
-        _TF_CONFIG_FOR_MASTER)
-
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_no_delay_for_master(self, _):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
-    mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(
-        spec=training.TrainSpec, max_steps=123, hooks=[])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-
-    with test.mock.patch.object(time, 'sleep') as mock_sleep:
-      executor.run_master()
-      mock_sleep.assert_not_called()
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_train_with_train_spec(self, mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
-    mock_est.config = self._run_config
-    train_spec = training.TrainSpec(
-        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
-    mock_server_instance = mock_server.return_value
-
-    executor = training._TrainingExecutor(mock_est, train_spec, mock_eval_spec)
-    executor.run_master()
-
-    mock_server.assert_called_with(
-        mock_est.config.cluster_spec,
-        job_name=mock_est.config.task_type,
-        task_index=mock_est.config.task_id,
-        config=test.mock.ANY,
-        protocol=None,
-        start=False)
-
-    self.assertTrue(mock_server_instance.start.called)
-
-    mock_est.train.assert_called_with(
-        input_fn=train_spec.input_fn,
-        max_steps=train_spec.max_steps,
-        hooks=list(train_spec.hooks),
-        saving_listeners=test.mock.ANY)
-    mock_est.export_savedmodel.assert_not_called()
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_train_with_no_eval_spec_fails(self, mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
-    mock_est.config = self._run_config
-    train_spec = training.TrainSpec(
-        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
-    eval_spec = None
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
-      executor.run_master()
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_train_with_train_hooks(self, mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
-    mock_est.config = self._run_config
-    train_spec = training.TrainSpec(
-        input_fn=lambda: 1, max_steps=2, hooks=[_FakeHook()])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
-    extra_hooks = [_FakeHook()]
-
-    executor = training._TrainingExecutor(
-        mock_est, train_spec, mock_eval_spec, train_hooks=extra_hooks)
-    executor.run_master()
-
-    mock_est.train.assert_called_with(
-        input_fn=train_spec.input_fn,
-        max_steps=train_spec.max_steps,
-        hooks=list(train_spec.hooks) + extra_hooks,
-        saving_listeners=test.mock.ANY)
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_no_server_startup_in_google(self, mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
-    mock_est.config = self._run_config
-    mock_train_spec = test.mock.Mock(
-        spec=training.TrainSpec, max_steps=123, hooks=[])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-    tf_config = {'TF_CONFIG': json.dumps(_TF_CONFIG_FOR_GOOGLE)}
-    with test.mock.patch.dict('os.environ', tf_config):
-      executor.run_master()
-      mock_server.assert_not_called()
-
-  def test_fail_with_empty_cluster_spec(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = None
-    mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'master'
-    mock_est.config.task_id = 2
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      training._TrainingExecutor(
-          mock_est, mock_train_spec, mock_eval_spec).run_master()
-
-  def test_fail_with_empty_master(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec(
-        {'master': ['dummy'], 'worker': ['dummy1']})
-    mock_est.config.master = ''
-    mock_est.config.task_type = 'master'
-    mock_est.config.task_id = 0
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      training._TrainingExecutor(
-          mock_est, mock_train_spec, mock_eval_spec).run_master()
-
-  @test.mock.patch.object(time, 'sleep')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_single_master_node_with_empty_tf_master(
-      self, mock_server, unused_mock_sleep):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
-
-    mock_train_spec = test.mock.Mock(
-        spec=training.TrainSpec, max_steps=123, hooks=[])
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec(
-        {'master': ['dummy']})
-    mock_est.config.master = ''
-    mock_est.config.task_type = 'master'
-    mock_est.config.task_id = 0
-
-    executor = training._TrainingExecutor(
-        mock_est, mock_train_spec, mock_eval_spec)
-    executor.run_master()
-
-    mock_server.assert_not_called()
-    self.assertTrue(mock_est.train.called)
-
-  def test_fail_with_empty_task_type(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec({'master': ['dummy']})
-    mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = ''
-    mock_est.config.task_id = 2
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      training._TrainingExecutor(
-          mock_est, mock_train_spec, mock_eval_spec).run_master()
-
-  def test_fail_with_none_task_id(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec({'master': ['dummy']})
-    mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'master'
-    mock_est.config.task_id = None
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      training._TrainingExecutor(
-          mock_est, mock_train_spec, mock_eval_spec).run_master()
-
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_run_master_triggers_evaluate_and_export(self, _):
-
-    def estimator_train(saving_listeners, *args, **kwargs):
-      #  There shalt be a saving_listener.  Estimator is going to call
-      # `after_save`.
-      del args, kwargs
-      saving_listeners[0].begin()
-      saving_listeners[0].after_save(session=None, global_step_value=0)
-      saving_listeners[0].after_save(session=None, global_step_value=10)
-
-    mock_est = test.mock.Mock(
-        spec=estimator_lib.Estimator, model_dir='path/', train=estimator_train)
-    mock_est.latest_checkpoint.return_value = 'checkpoint_path/'
-    mock_est.config = self._run_config
-
-    exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
-    exporter.name = 'see_whether_export_is_called'
-
-    train_spec = training.TrainSpec(input_fn=lambda: 1, max_steps=300)
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, steps=2, exporters=exporter)
-    eval_result = {_GLOBAL_STEP_KEY: train_spec.max_steps}
-    mock_est.evaluate.return_value = eval_result
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    executor.run_master()
-
-    mock_est.evaluate.assert_called_with(
-        name=eval_spec.name,
-        input_fn=eval_spec.input_fn,
-        steps=eval_spec.steps,
-        checkpoint_path='checkpoint_path/',
-        hooks=eval_spec.hooks)
-    self.assertEqual(1, exporter.export.call_count)
-    exporter.export.assert_called_with(
-        estimator=mock_est,
-        export_path=os.path.join('path/', 'export', exporter.name),
-        checkpoint_path='checkpoint_path/',
-        eval_result=eval_result,
-        is_the_final_export=True)
-
-  @test.mock.patch.object(basic_session_run_hooks, 'SecondOrStepTimer')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_run_master_throttle_eval(self, _, mock_timer_class):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, model_dir='path/')
-
-    mock_timer = test.mock.Mock()
-    mock_timer_class.return_value = mock_timer
-
-    def estimator_train(saving_listeners, *args, **kwargs):
-      del args, kwargs
-      saving_listeners[0].begin()
-
-      # Call four times.
-      mock_timer.should_trigger_for_step.return_value = True
-      saving_listeners[0].after_save(session=None, global_step_value=None)
-
-      mock_timer.should_trigger_for_step.return_value = True
-      saving_listeners[0].after_save(session=None, global_step_value=None)
-
-      mock_timer.should_trigger_for_step.return_value = False
-      saving_listeners[0].after_save(session=None, global_step_value=None)
-
-      mock_timer.should_trigger_for_step.return_value = True
-      saving_listeners[0].after_save(session=None, global_step_value=None)
-
-    mock_est.train = estimator_train
-    mock_est.latest_checkpoint.side_effect = ['ckpt1', 'ckpt2']
-    mock_est.config = self._run_config
-
-    exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
-    exporter.name = 'see_whether_export_is_called'
-
-    train_spec = training.TrainSpec(input_fn=lambda: 1, max_steps=300)
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, steps=2, exporters=exporter, throttle_secs=10)
-
-    mock_est.evaluate.side_effect = [
-        {_GLOBAL_STEP_KEY: train_spec.max_steps //2},
-        {_GLOBAL_STEP_KEY: train_spec.max_steps}
-    ]
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    executor.run_master()
-
-    self.assertEqual(2, mock_est.evaluate.call_count)
-    self.assertEqual(2, exporter.export.call_count)
-
-    is_final_export_list = [call[1]['is_the_final_export']
-                            for call in exporter.export.call_args_list]
-    self.assertEqual([False, True], is_final_export_list)
-
-  @test.mock.patch.object(basic_session_run_hooks, 'SecondOrStepTimer')
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_run_master_throttle_eval_which_skips_final_ckpt(
-      self, _, mock_timer_class):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, model_dir='path/')
-
-    mock_timer = test.mock.Mock()
-    mock_timer_class.return_value = mock_timer
-
-    def estimator_train(saving_listeners, *args, **kwargs):
-      del args, kwargs
-      saving_listeners[0].begin()
-
-      # Call tree times (one for first saving).
-      mock_timer.should_trigger_for_step.return_value = True
-      saving_listeners[0].after_save(session=None, global_step_value=0)
-
-      mock_timer.should_trigger_for_step.return_value = True
-      saving_listeners[0].after_save(session=None, global_step_value=125)
-
-      mock_timer.should_trigger_for_step.return_value = False
-      saving_listeners[0].after_save(session=None, global_step_value=250)
-
-      # At the end evaluate should be called even if throttle secs prevents it.
-      mock_timer.should_trigger_for_step.return_value = False
-      saving_listeners[0].end(session=None, global_step_value=300)
-
-    mock_est.train = estimator_train
-    mock_est.latest_checkpoint.side_effect = ['ckpt1', 'ckpt2']
-    mock_est.config = self._run_config
-
-    exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
-    exporter.name = 'see_whether_export_is_called'
-
-    train_spec = training.TrainSpec(input_fn=lambda: 1, max_steps=300)
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, steps=2, exporters=exporter, throttle_secs=10)
-
-    mock_est.evaluate.side_effect = [
-        {_GLOBAL_STEP_KEY: train_spec.max_steps //2},
-        {_GLOBAL_STEP_KEY: train_spec.max_steps}
-    ]
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    executor.run_master()
-
-    self.assertEqual(2, mock_est.evaluate.call_count)
-    self.assertEqual(2, exporter.export.call_count)
-
-    is_final_export_list = [call[1]['is_the_final_export']
-                            for call in exporter.export.call_args_list]
-    self.assertEqual([False, True], is_final_export_list)
-
-
-class TrainingExecutorRunEvaluatorTest(test.TestCase):
-  """Tests run_evaluator of _TrainingExecutor."""
-
-  def _set_up_mock_est_to_train_and_evaluate_once(self, mock_est,
-                                                  mock_train_spec):
-    """Sets global step in eval result to end the while True eval loop."""
-    training_max_step = 200
-    mock_est.evaluate.return_value = {_GLOBAL_STEP_KEY: training_max_step}
-    mock_train_spec.max_steps = training_max_step
-
-  def test_evaluate_with_evaluate_spec(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.latest_checkpoint.return_value = 'latest_it_is'
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, steps=2, hooks=[_FakeHook()], name='cont_eval',
-        start_delay_secs=0, throttle_secs=0)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-    executor.run_evaluator()
-
-    mock_est.evaluate.assert_called_with(
-        name='cont_eval',
-        input_fn=eval_spec.input_fn,
-        steps=eval_spec.steps,
-        checkpoint_path='latest_it_is',
-        hooks=eval_spec.hooks)
-    self.assertFalse(mock_est.train.called)
-
-  def test_evaluate_with_no_eval_spec_fails(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.latest_checkpoint.return_value = 'latest_it_is'
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
-
-    eval_spec = None
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-
-    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
-      executor.run_evaluator()
-
-  def test_evaluate_with_train_hooks(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.latest_checkpoint.return_value = 'latest_it_is'
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1,
-        steps=2,
-        hooks=[_FakeHook()],
-        name='cont_eval',
-        start_delay_secs=0,
-        throttle_secs=0)
-
-    # The train_hooks will not be called during eval.
-    mock_hook = test.mock.Mock(spec=session_run_hook.SessionRunHook)
-    executor = training._TrainingExecutor(
-        mock_est, mock_train_spec, eval_spec, train_hooks=[mock_hook])
-    executor.run_evaluator()
-
-    mock_hook.begin.assert_not_called()
-
-  def test_evaluate_multiple_times(self):
-    training_max_step = 200
-
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.model_dir = compat.as_bytes(test.get_temp_dir())
-    mock_est.evaluate.side_effect = [
-        {_GLOBAL_STEP_KEY: training_max_step // 2},
-        {_GLOBAL_STEP_KEY: training_max_step}
-    ]
-    mock_est.latest_checkpoint.side_effect = ['path_1', 'path_2']
-
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_train_spec.max_steps = training_max_step
-
-    exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
-    exporter.name = 'see_how_many_times_export_is_called'
-
-    mock_est.times_export_was_called = 0
-    mock_est.times_final_export_was_called = 0
-    def export(estimator, export_path, checkpoint_path, eval_result,
-               is_the_final_export):
-      del export_path, checkpoint_path, eval_result
-      estimator.times_export_was_called += 1
-      # final_export is happened at the end.
-      self.assertEqual(0, estimator.times_final_export_was_called)
-      if is_the_final_export:
-        estimator.times_final_export_was_called += 1
-
-    exporter.export = export
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1,
-        start_delay_secs=0,
-        throttle_secs=0,
-        exporters=exporter)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-    executor.run_evaluator()
-
-    self.assertEqual(2, mock_est.evaluate.call_count)
-    self.assertEqual(2, mock_est.times_export_was_called)
-    self.assertEqual(1, mock_est.times_final_export_was_called)
-
-  def test_evaluate_listener_before_eval(self):
-    training_max_step = 200
-
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.model_dir = compat.as_bytes(test.get_temp_dir())
-    # Without early stopping, this eval will be run twice.
-    mock_est.evaluate.side_effect = [{
-        _GLOBAL_STEP_KEY: training_max_step // 2
-    }, {
-        _GLOBAL_STEP_KEY: training_max_step
-    }]
-    mock_est.latest_checkpoint.side_effect = ['path_1', 'path_2']
-
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, hooks=[])
-    mock_train_spec.max_steps = training_max_step
-
-    class _Listener(training._ContinuousEvalListener):
-
-      def __init__(self):
-        self.call_count = 0
-
-      def before_eval(self):
-        self.call_count += 1
-        return  self.call_count == 1
-
-    listener = _Listener()
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
-
-    training._TrainingExecutor(
-        mock_est, mock_train_spec, eval_spec,
-        continuous_eval_listener=listener).run_evaluator()
-
-    # Before_eval returns False during the second time, so, evaluate will be
-    # called once.
-    self.assertEqual(1, mock_est.evaluate.call_count)
-    self.assertEqual(2, listener.call_count)
-
-  def test_evaluate_listener_after_eval(self):
-    training_max_step = 200
-
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.model_dir = compat.as_bytes(test.get_temp_dir())
-    # Without early stopping, this eval will be run twice.
-    expected_eval_metrics = [{
-        _GLOBAL_STEP_KEY: training_max_step // 2
-    }, {
-        _GLOBAL_STEP_KEY: training_max_step
-    }]
-    mock_est.evaluate.side_effect = expected_eval_metrics
-    mock_est.latest_checkpoint.side_effect = ['path_1', 'path_2']
-
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_train_spec.max_steps = training_max_step
-
-    class _Listener(training._ContinuousEvalListener):
-
-      def __init__(self):
-        self.call_count = 0
-
-      def after_eval(self, eval_result):
-        self.call_count += 1
-        self.eval_result = eval_result
-        return False
-
-    listener = _Listener()
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
-
-    training._TrainingExecutor(
-        mock_est, mock_train_spec, eval_spec,
-        continuous_eval_listener=listener).run_evaluator()
-
-    # after_eval returns False during the first time, so, evaluate will be
-    # called once.
-    self.assertEqual(1, mock_est.evaluate.call_count)
-    self.assertEqual(1, listener.call_count)
-    self.assertAllEqual(expected_eval_metrics[0], listener.eval_result.metrics)
-    self.assertEqual('path_1', listener.eval_result.checkpoint_path)
-
-  def test_final_export_is_true_in_the_end(self):
-    training_max_step = 200
-
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.model_dir = compat.as_bytes(test.get_temp_dir())
-    mock_est.evaluate.side_effect = [
-        {_GLOBAL_STEP_KEY: training_max_step // 2},
-        {_GLOBAL_STEP_KEY: training_max_step}
-    ]
-    mock_est.latest_checkpoint.side_effect = ['path_1', 'path_2']
-
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_train_spec.max_steps = training_max_step
-
-    mock_est.times_export_fn_was_called = 0
-    mock_est.times_the_final_export_was_true = 0
-    def export(estimator, export_path, checkpoint_path, eval_result,
-               is_the_final_export):
-      del export_path, checkpoint_path, eval_result
-      estimator.times_export_fn_was_called += 1
-      if is_the_final_export:
-        estimator.times_the_final_export_was_true += 1
-
-    exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
-    exporter.name = 'see_how_many_times_export_is_called'
-    exporter.export = export
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1,
-        start_delay_secs=0,
-        throttle_secs=0,
-        exporters=exporter)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-    executor.run_evaluator()
-
-    self.assertEqual(2, mock_est.evaluate.call_count)
-    self.assertEqual(2, mock_est.times_export_fn_was_called)
-    self.assertEqual(1, mock_est.times_the_final_export_was_true)
-
-  def test_skip_evaluation_due_to_ckpt(self):
-    training_max_step = 200
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.evaluate.side_effect = [
-        {_GLOBAL_STEP_KEY: training_max_step // 2},
-        {_GLOBAL_STEP_KEY: training_max_step}
-    ]
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_train_spec.max_steps = training_max_step
-
-    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
-
-    # First two items are invalid, next two items are same.
-    mock_est.latest_checkpoint.side_effect = [
-        None, '', 'same', 'same', 'path_2'
-    ]
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=2)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      executor.run_evaluator()
-
-    # Three checkpoint paths are invalid.
-    self.assertEqual(5, mock_est.latest_checkpoint.call_count)
-    self.assertEqual(2, mock_est.evaluate.call_count)
-
-    # Two warning logs are expected (last warning time is reset after a
-    # successuful evaluation)
-    self.assertEqual(2, mock_log.call_count)
-
-  def test_warning_if_throttle_secs_is_zero(self):
-    training_max_step = 200
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.evaluate.side_effect = [
-        {_GLOBAL_STEP_KEY: training_max_step}
-    ]
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_train_spec.max_steps = training_max_step
-
-    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
-
-    # We need to make the first one invalid, so it will check the
-    # throttle_secs=0.
-    mock_est.latest_checkpoint.side_effect = [None, 'path']
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      executor.run_evaluator()
-
-    # First ckpt is invalid.
-    self.assertEqual(2, mock_est.latest_checkpoint.call_count)
-    self.assertEqual(1, mock_est.evaluate.call_count)
-
-    self.assertRegexpMatches(str(mock_log.call_args), _INPROPER_THROTTL_SECS)
-
-  def test_continuous_eval_listener_eval_result(self):
-    training_max_step = 200
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    expected_eval_metrics = [{
-        _GLOBAL_STEP_KEY: training_max_step // 2
-    }, {
-        _GLOBAL_STEP_KEY: training_max_step
-    }]
-    mock_est.evaluate.side_effect = expected_eval_metrics
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_train_spec.max_steps = training_max_step
-
-    class _Listener(training._ContinuousEvalListener):
-
-      def __init__(self):
-        self.eval_results = []
-
-      def after_eval(self, eval_result):
-        self.eval_results.append(eval_result)
-        return True
-
-    continuous_eval_listener = _Listener()
-
-    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
-
-    # First two items are invalid, next two items are same.
-    mock_est.latest_checkpoint.side_effect = [
-        None, '', 'same', 'same', 'path_2'
-    ]
-    expected_eval_results = [
-        training._EvalResult(training._EvalStatus.MISSING_CHECKPOINT),
-        training._EvalResult(training._EvalStatus.MISSING_CHECKPOINT),
-        training._EvalResult(
-            training._EvalStatus.EVALUATED,
-            metrics=expected_eval_metrics[0],
-            checkpoint_path='same'),
-        training._EvalResult(training._EvalStatus.NO_NEW_CHECKPOINT),
-        training._EvalResult(
-            training._EvalStatus.EVALUATED,
-            metrics=expected_eval_metrics[1],
-            checkpoint_path='path_2'),
-    ]
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=0)
-
-    executor = training._TrainingExecutor(
-        mock_est,
-        mock_train_spec,
-        eval_spec,
-        continuous_eval_listener=continuous_eval_listener)
-    executor.run_evaluator()
-
-    # Three checkpoint paths are invalid.
-    self.assertEqual(5, mock_est.latest_checkpoint.call_count)
-    self.assertEqual(2, mock_est.evaluate.call_count)
-
-    self.assertEqual(5, len(continuous_eval_listener.eval_results))
-    for i, result in enumerate(continuous_eval_listener.eval_results):
-      self.assertEqual(expected_eval_results[i].status, result.status)
-      self.assertAllEqual(expected_eval_results[i].metrics, result.metrics)
-      self.assertEqual(expected_eval_results[i].checkpoint_path,
-                       result.checkpoint_path)
-
-  def test_sleep_start_delay_secs(self):
-    training_max_step = 200
-    start_delay_secs = 123
-
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.evaluate.return_value = {_GLOBAL_STEP_KEY: training_max_step}
-    mock_est.model_dir = compat.as_bytes(test.get_temp_dir())
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_train_spec.max_steps = training_max_step
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, steps=2, hooks=[_FakeHook()], name='cont_eval',
-        start_delay_secs=start_delay_secs, throttle_secs=0)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-    with test.mock.patch.object(time, 'sleep') as mock_sleep:
-      executor.run_evaluator()
-      mock_sleep.assert_called_with(start_delay_secs)
-      self.assertTrue(mock_est.evaluate.called)
-
-  @test.mock.patch.object(time, 'time')
-  @test.mock.patch.object(time, 'sleep')
-  def test_throttle_secs(self, mock_sleep, mock_time):
-    throttle_secs = 123
-    operation_secs = 12
-
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1, start_delay_secs=0, throttle_secs=throttle_secs)
-
-    mock_time.side_effect = [921, 921 + operation_secs]
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-    # Disable logging as it calls time.time also.
-    with test.mock.patch.object(logging, 'info'):
-      executor.run_evaluator()
-    mock_sleep.assert_called_with(throttle_secs - operation_secs)
-    self.assertTrue(mock_est.evaluate.called)
-
-  def test_that_export_is_called(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    self._set_up_mock_est_to_train_and_evaluate_once(mock_est, mock_train_spec)
-
-    def export(estimator, *args, **kwargs):
-      del args, kwargs
-      estimator.export_was_called = True
-
-    exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
-    exporter.name = 'see_whether_export_is_called'
-    exporter.export = export
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: 1,
-        steps=2,
-        start_delay_secs=0,
-        throttle_secs=0,
-        exporters=exporter)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec, eval_spec)
-    executor.run_evaluator()
-
-    # Verify that export was called on the right estimator.
-    self.assertTrue(mock_est.export_was_called)
-
-  def test_errors_out_if_evaluate_returns_empty_dict(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    train_spec = training.TrainSpec(input_fn=lambda: 1)
-    eval_spec = training.EvalSpec(input_fn=(lambda: 1),
-                                  start_delay_secs=0, throttle_secs=0)
-    mock_est.evaluate.return_value = {}
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(ValueError, _INVALID_EMPTY_EVAL_RESULT_ERR):
-      executor.run_evaluator()
-
-  def test_errors_out_if_evaluate_returns_non_dict(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    train_spec = training.TrainSpec(input_fn=lambda: 1)
-    eval_spec = training.EvalSpec(input_fn=(lambda: 1),
-                                  start_delay_secs=0, throttle_secs=0)
-    mock_est.evaluate.return_value = 123
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_RESULT_TYPE_ERR):
-      executor.run_evaluator()
-
-  def test_errors_out_if_evaluate_returns_dict_without_global_step(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    train_spec = training.TrainSpec(input_fn=lambda: 1)
-    eval_spec = training.EvalSpec(input_fn=(lambda: 1),
-                                  start_delay_secs=0, throttle_secs=0)
-    mock_est.evaluate.return_value = {'loss': 123}
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(ValueError,
-                                 _MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR):
-      executor.run_evaluator()
-
-
-class TrainingExecutorRunPsTest(test.TestCase):
-  """Tests run_ps of _TrainingExecutor."""
-
-  @test.mock.patch.object(server_lib, 'Server')
-  def test_std_server(self, mock_server):
-    mock_server_instance = test.mock.Mock()
-    mock_server.return_value = mock_server_instance
-
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = _create_run_config_with_cluster_spec(_TF_CONFIG_FOR_PS)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-    executor.run_ps()
-
-    mock_server.assert_called_with(
-        mock_est.config.cluster_spec,
-        job_name=mock_est.config.task_type,
-        task_index=mock_est.config.task_id,
-        config=test.mock.ANY,
-        protocol=None,
-        start=False)
-
-    self.assertTrue(mock_server_instance.start.called)
-    self.assertTrue(mock_server_instance.join.called)
-
-  def test_fail_with_empty_cluster_spec(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = None
-    mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'ps'
-    mock_est.config.task_id = 2
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      training._TrainingExecutor(mock_est, mock_train_spec,
-                                 mock_eval_spec).run_ps()
-
-  def test_fail_with_empty_master(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
-    mock_est.config.master = ''
-    mock_est.config.task_type = 'ps'
-    mock_est.config.task_id = 2
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      training._TrainingExecutor(mock_est, mock_train_spec,
-                                 mock_eval_spec).run_ps()
-
-  def test_fail_with_empty_task_type(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
-    mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = ''
-    mock_est.config.task_id = 2
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      training._TrainingExecutor(mock_est, mock_train_spec,
-                                 mock_eval_spec).run_ps()
-
-  def test_fail_with_none_task_id(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
-    mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'ps'
-    mock_est.config.task_id = None
-
-    with self.assertRaisesRegexp(RuntimeError,
-                                 _INVALID_CONFIG_FOR_STD_SERVER_MSG):
-      training._TrainingExecutor(mock_est, mock_train_spec,
-                                 mock_eval_spec).run_ps()
-
-
-class StopAtSecsHookTest(test.TestCase):
-  """Tests StopAtSecsHook."""
-
-  @test.mock.patch.object(time, 'time')
-  def test_stops_after_time(self, mock_time):
-    mock_time.return_value = 1484695987.209386
-    hook = training._StopAtSecsHook(1000)
-    with ops.Graph().as_default():
-      no_op = control_flow_ops.no_op()
-      # some time passed before training starts
-      mock_time.return_value += 250
-      with monitored_session.MonitoredSession(hooks=[hook]) as sess:
-        self.assertFalse(sess.should_stop())
-        sess.run(no_op)
-        self.assertFalse(sess.should_stop())
-        mock_time.return_value += 500
-        sess.run(no_op)
-        self.assertFalse(sess.should_stop())
-        mock_time.return_value += 400
-        sess.run(no_op)
-        self.assertFalse(sess.should_stop())
-        mock_time.return_value += 200
-        sess.run(no_op)
-        self.assertTrue(sess.should_stop())
-
-
-class TrainingExecutorRunLocalTest(test.TestCase):
-  """Tests run_local of _TrainingExecutor."""
-
-  def _model_fn(self, features, labels, mode):
-    del labels
-    with ops.control_dependencies([features]):
-      train_op = state_ops.assign_add(training_util.get_global_step(), 1)
-    return model_fn_lib.EstimatorSpec(
-        mode,
-        loss=constant_op.constant(0.),
-        train_op=train_op,
-        predictions=constant_op.constant([[10.]]),
-        eval_metric_ops={'mean_of_features': metrics_lib.mean(features)})
-
-  def _input_fn(self, repeat=True):
-    ds = dataset_ops.Dataset.from_tensors([1])
-    if repeat:
-      return ds.repeat()
-    return ds
-
-  def unique_checkpoint_every_time_fn(self):
-    return 'checkpoint_path_%s/' % random.random()
-
-  def test_runs_evaluate_with_every_new_checkpoint(self):
-    est = estimator_lib.Estimator(
-        model_fn=self._model_fn,
-        config=run_config_lib.RunConfig(save_checkpoints_steps=10))
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-
-    mock_est.times_export_was_called = 0
-    mock_est.times_final_export_was_called = 0
-    def export(estimator, export_path, checkpoint_path, eval_result,
-               is_the_final_export):
-      del export_path, checkpoint_path, eval_result
-      estimator.times_export_was_called += 1
-      # final_export is happened at the end.
-      self.assertEqual(0, estimator.times_final_export_was_called)
-      if is_the_final_export:
-        estimator.times_final_export_was_called += 1
-
-    exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
-    exporter.name = 'see_how_many_times_export_is_called'
-    exporter.export = export
-
-    train_spec = training.TrainSpec(input_fn=self._input_fn, max_steps=22)
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False),
-        throttle_secs=0,
-        exporters=exporter)
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    executor.run_local()
-
-    self.assertEqual(1, mock_est.train.call_count)
-    self.assertEqual(3, mock_est.evaluate.call_count)
-    self.assertEqual(3, mock_est.times_export_was_called)
-    self.assertEqual(1, mock_est.times_final_export_was_called)
-
-  def test_runs_with_eval_listener_before_eval(self):
-    est = estimator_lib.Estimator(
-        model_fn=self._model_fn,
-        config=run_config_lib.RunConfig(save_checkpoints_steps=10))
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-    mock_est.latest_checkpoint = self.unique_checkpoint_every_time_fn
-
-    train_spec = training.TrainSpec(input_fn=self._input_fn, max_steps=12)
-    eval_spec = training.EvalSpec(input_fn=lambda: self._input_fn(repeat=False))
-    mock_est.evaluate.side_effect = [{_GLOBAL_STEP_KEY: train_spec.max_steps}]
-
-    class _Listener(training._ContinuousEvalListener):
-
-      def __init__(self):
-        self.call_count = 0
-
-      def before_eval(self):
-        self.call_count += 1
-        return False  # Will stop the run_local before first eval.
-
-    listener = _Listener()
-
-    executor = training._TrainingExecutor(
-        mock_est, train_spec, eval_spec, continuous_eval_listener=listener)
-    executor.run_local()
-
-    self.assertEqual(1, mock_est.train.call_count)
-    self.assertEqual(0, mock_est.evaluate.call_count)
-
-  def test_runs_with_eval_listener_after_eval(self):
-    est = estimator_lib.Estimator(
-        model_fn=self._model_fn,
-        config=run_config_lib.RunConfig(save_checkpoints_steps=10))
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-
-    train_spec = training.TrainSpec(input_fn=self._input_fn, max_steps=3000)
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False), throttle_secs=0)
-
-    class _Listener(training._ContinuousEvalListener):
-
-      def __init__(self):
-        self.call_count = 0
-
-      def after_eval(self, eval_result):
-        self.call_count += 1
-        return False  # Will stop the run_local after first eval.
-
-    listener = _Listener()
-
-    executor = training._TrainingExecutor(
-        mock_est, train_spec, eval_spec, continuous_eval_listener=listener)
-    metrics, _ = executor.run_local()  # pylint: disable=assignment-from-no-return
-
-    self.assertEqual(1, mock_est.train.call_count)
-    self.assertEqual(1, mock_est.evaluate.call_count)
-    self.assertEqual(1, listener.call_count)
-    # Should be less than max_steps since listener did early stopping.
-    self.assertLess(metrics[_GLOBAL_STEP_KEY], train_spec.max_steps)
-
-  def test_handles_no_new_checkpoint_found(self):
-    est = estimator_lib.Estimator(
-        model_fn=self._model_fn,
-        # disable saving checkpoint
-        config=run_config_lib.RunConfig(
-            save_checkpoints_steps=None, save_checkpoints_secs=None))
-    train_spec = training.TrainSpec(
-        input_fn=self._input_fn, max_steps=300, hooks=[_FakeHook()])
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False),
-        hooks=[_FakeHook()],
-        throttle_secs=100)
-
-    executor = training._TrainingExecutor(est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(ValueError,
-                                 'There should be a CheckpointSaverHook'):
-      executor.run_local()
-
-  def test_final_export_is_true_in_the_end(self):
-    est = estimator_lib.Estimator(
-        model_fn=self._model_fn,
-        config=run_config_lib.RunConfig(save_checkpoints_steps=10))
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-
-    mock_est.times_export_fn_was_called = 0
-    mock_est.times_the_final_export_was_true = 0
-    def export(estimator, export_path, checkpoint_path, eval_result,
-               is_the_final_export):
-      del export_path, checkpoint_path, eval_result
-      estimator.times_export_fn_was_called += 1
-      if is_the_final_export:
-        estimator.times_the_final_export_was_true += 1
-
-    exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
-    exporter.name = 'see_how_many_times_export_is_called'
-    exporter.export = export
-
-    train_spec = training.TrainSpec(
-        input_fn=self._input_fn, max_steps=12, hooks=[_FakeHook()])
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False),
-        throttle_secs=0,
-        exporters=exporter)
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    executor.run_local()
-
-    self.assertEqual(1, mock_est.train.call_count)
-    self.assertEqual(2, mock_est.evaluate.call_count)
-    self.assertEqual(2, mock_est.times_export_fn_was_called)
-    self.assertEqual(1, mock_est.times_the_final_export_was_true)
-
-  def test_train_and_evaluate_args(self):
-    est = estimator_lib.Estimator(model_fn=self._model_fn)
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-    train_spec = training.TrainSpec(
-        input_fn=self._input_fn, max_steps=300, hooks=[_FakeHook()])
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False),
-        steps=2,
-        hooks=[_FakeHook()],
-        name='local_eval')
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    executor.run_local()
-
-    mock_est.evaluate.assert_called_with(
-        name=eval_spec.name,
-        input_fn=eval_spec.input_fn,
-        steps=eval_spec.steps,
-        checkpoint_path=est.latest_checkpoint(),
-        hooks=eval_spec.hooks)
-
-    train_args = mock_est.train.call_args[1]
-    self.assertEqual(list(train_spec.hooks), list(train_args['hooks']))
-    self.assertEqual(train_spec.input_fn, train_args['input_fn'])
-    self.assertEqual(train_spec.max_steps, train_args['max_steps'])
-
-  def test_train_with_no_eval_spec_fails(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    train_spec = training.TrainSpec(
-        input_fn=lambda: 1, max_steps=300, hooks=[_FakeHook()])
-    eval_spec = None
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-
-    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_SPEC_MSG):
-      executor.run_local()
-
-  def test_train_hooks(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, model_dir='path/')
-    mock_est.latest_checkpoint.return_value = 'checkpoint_path/'
-    train_spec = training.TrainSpec(
-        input_fn=lambda: 1, max_steps=300, hooks=[_FakeHook()])
-    eval_spec = training.EvalSpec(input_fn=lambda: 1, steps=2)
-    mock_est.evaluate.return_value = {_GLOBAL_STEP_KEY: train_spec.max_steps}
-    extra_hooks = [_FakeHook()]
-
-    executor = training._TrainingExecutor(
-        mock_est, train_spec, eval_spec, train_hooks=extra_hooks)
-    executor.run_local()
-
-    train_args = mock_est.train.call_args[1]
-    self.assertEqual(
-        list(train_spec.hooks) + extra_hooks, [
-            h for h in train_args['hooks']
-            if not isinstance(h, training._StopAtSecsHook)
-        ])
-
-  def test_that_export_is_called_with_run_local(self):
-    est = estimator_lib.Estimator(model_fn=self._model_fn)
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-    train_spec = training.TrainSpec(input_fn=self._input_fn, max_steps=12)
-    mock_est.evaluate.return_value = {_GLOBAL_STEP_KEY: train_spec.max_steps}
-
-    def export(estimator, *args, **kwargs):
-      del args, kwargs
-      estimator.export_was_called = True
-      return 'path_to_export'
-
-    exporter = test.mock.PropertyMock(spec=exporter_lib.Exporter)
-    exporter.name = 'see_whether_export_is_called'
-    exporter.export = export
-
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False),
-        steps=2,
-        start_delay_secs=0,
-        throttle_secs=213,
-        exporters=exporter)
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    # pylint: disable=assignment-from-no-return
-    _, export_results = executor.run_local()
-    # pylint: enable=assignment-from-no-return
-
-    self.assertTrue(mock_est.export_was_called)
-    self.assertEqual(export_results, ['path_to_export'])
-
-  def test_errors_out_if_evaluate_returns_empty_dict(self):
-    est = estimator_lib.Estimator(
-        model_fn=self._model_fn,
-        config=run_config_lib.RunConfig(save_checkpoints_steps=2))
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-    train_spec = training.TrainSpec(input_fn=self._input_fn)
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False), throttle_secs=0)
-    mock_est.evaluate.return_value = {}
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(ValueError, _INVALID_EMPTY_EVAL_RESULT_ERR):
-      executor.run_local()
-
-  def test_errors_out_if_evaluate_returns_non_dict(self):
-    est = estimator_lib.Estimator(
-        model_fn=self._model_fn,
-        config=run_config_lib.RunConfig(save_checkpoints_steps=2))
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-    train_spec = training.TrainSpec(input_fn=self._input_fn)
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False), throttle_secs=0)
-    mock_est.evaluate.return_value = 123
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(TypeError, _INVALID_EVAL_RESULT_TYPE_ERR):
-      executor.run_local()
-
-  def test_errors_out_if_evaluate_returns_dict_without_global_step(self):
-    est = estimator_lib.Estimator(
-        model_fn=self._model_fn,
-        config=run_config_lib.RunConfig(save_checkpoints_steps=2))
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-    train_spec = training.TrainSpec(input_fn=self._input_fn)
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False), throttle_secs=0)
-    mock_est.evaluate.return_value = {'loss': 123}
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    with self.assertRaisesRegexp(ValueError,
-                                 _MISSING_GLOBAL_STEP_IN_EVAL_RESULT_ERR):
-      executor.run_local()
-
-  def test_train_and_evaluate_return_metrics(self):
-    est = estimator_lib.Estimator(model_fn=self._model_fn)
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator, wraps=est)
-    train_spec = training.TrainSpec(
-        input_fn=self._input_fn, max_steps=12, hooks=[_FakeHook()])
-    eval_spec = training.EvalSpec(
-        input_fn=lambda: self._input_fn(repeat=False),
-        steps=2,
-        hooks=[_FakeHook()],
-        name='local_eval')
-
-    executor = training._TrainingExecutor(mock_est, train_spec, eval_spec)
-    # pylint: disable=assignment-from-no-return
-    metrics, _ = executor.run_local()
-    # pylint: enable=assignment-from-no-return
-    self.assertEqual(metrics['global_step'], 12)
-
-
-class TrainAndEvaluateRunTest(test.TestCase):
-
-  def _test_run_task_and_executor(self, run_config):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = run_config
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-
-    executor.call_task = {}
-
-    def task_fn(name):
-
-      def _fn():
-        executor.call_task[name] = 1
-
-      return _fn
-
-    executor.run_chief = task_fn('chief')
-    executor.run_master = task_fn('master')
-    executor.run_ps = task_fn('ps')
-    executor.run_evaluator = task_fn('evaluator')
-    executor.run_worker = task_fn('worker')
-    executor.run_local = task_fn('local')
-    return executor
-
-  def test_run_chief(self):
-    executor = self._test_run_task_and_executor(
-        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_CHIEF))
-    executor.run()
-    self.assertEqual(1, executor.call_task['chief'])
-
-  def test_run_worker(self):
-    executor = self._test_run_task_and_executor(
-        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_WORKER))
-    executor.run()
-    self.assertEqual(1, executor.call_task['worker'])
-
-  def test_run_ps(self):
-    executor = self._test_run_task_and_executor(
-        run_config=_create_run_config_with_cluster_spec(_TF_CONFIG_FOR_PS))
-    executor.run()
-    self.assertEqual(1, executor.call_task['ps'])
-
-  def test_run_evaluator(self):
-    executor = self._test_run_task_and_executor(
-        run_config=_create_run_config_with_cluster_spec(
-            _TF_CONFIG_FOR_EVALUATOR))
-    executor.run()
-    self.assertEqual(1, executor.call_task['evaluator'])
-
-  def test_run_local(self):
-    executor = self._test_run_task_and_executor(
-        run_config=run_config_lib.RunConfig())
-    executor.run()
-    self.assertEqual(1, executor.call_task['local'])
-
-  def test_invalid_local_task(self):
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            'local': ['hos1:1'],
-        },
-        'task': {
-            'type': 'local',  # invalid task type.
-            'index': 0
-        }
-    }
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-    with self.assertRaisesRegexp(ValueError, _INVALID_LOCAL_TASK_WITH_CLUSTER):
-      executor.run()
-
-  def test_unsupported_task_due_to_missing_run_task(self):
-    unsupported_task = 'alloc'
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            unsupported_task: ['hos1:1'],
-        },
-        'task': {
-            'type': unsupported_task,
-            'index': 0
-        }
-    }
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TO_RUN):
-      executor.run()
-
-  def test_unsupported_task_due_to_not_callable(self):
-    unsupported_task = 'alloc'
-    tf_config = {
-        'cluster': {
-            run_config_lib.TaskType.CHIEF: ['host0:0'],
-            unsupported_task: ['hos1:1'],
-        },
-        'task': {
-            'type': unsupported_task,
-            'index': 0
-        }
-    }
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = _create_run_config_with_cluster_spec(tf_config)
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-    executor.run_alloc = 123  # not callable
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TO_RUN):
-      executor.run()
-
-  def test_invalid_task_type(self):
-    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
-    mock_est.config = test.mock.Mock()
-    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
-    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
-
-    mock_est.config = test.mock.Mock()
-    mock_est.config.cluster_spec = server_lib.ClusterSpec({'1': ['dummy']})
-    mock_est.config.task_type = ''
-
-    executor = training._TrainingExecutor(mock_est, mock_train_spec,
-                                          mock_eval_spec)
-    with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE):
-      executor.run()
-
-
-class TrainAndEvaluateIntegrationTest(test.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    if self._model_dir:
-      shutil.rmtree(self._model_dir)
-
-  def _as_label(self, data_in_float):
-    return np.rint(data_in_float).astype(np.int64)
-
-  def _get_exporter(self, name, fc):
-    feature_spec = feature_column.make_parse_example_spec(fc)
-    serving_input_receiver_fn = (
-        export_lib.build_parsing_serving_input_receiver_fn(feature_spec))
-    return exporter_lib.LatestExporter(
-        name, serving_input_receiver_fn=serving_input_receiver_fn)
-
-  def _extract_loss_and_global_step(self, event_folder):
-    """Returns the loss and global step in last event."""
-    event_paths = glob.glob(os.path.join(event_folder, 'events*'))
-
-    loss = None
-    global_step_count = None
-
-    for e in summary_iterator.summary_iterator(event_paths[-1]):
-      current_loss = None
-      for v in e.summary.value:
-        if v.tag == 'loss':
-          current_loss = v.simple_value
-
-      # If loss is not found, global step is meaningless.
-      if current_loss is None:
-        continue
-
-      current_global_step = e.step
-      if global_step_count is None or current_global_step > global_step_count:
-        global_step_count = current_global_step
-        loss = current_loss
-
-    return (loss, global_step_count)
-
-  def test_complete_flow_with_non_distributed_configuration(self):
-    n_classes = 3
-    input_dimension = 2
-    batch_size = 10
-
-    eval_name = 'foo'
-    exporter_name = 'saved_model_exporter'
-
-    # max_steps should be larger than save_summary_steps
-    max_steps = 10
-    save_summary_steps = 9
-
-    data = np.linspace(
-        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
-    x_data = data.reshape(batch_size, input_dimension)
-    y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
-
-    # learn y = x
-    def train_input_fn():
-      return dataset_ops.Dataset.from_tensor_slices(({
-          'x': x_data
-      }, y_data)).batch(batch_size).repeat().shuffle(1000)
-
-    def eval_input_fn():
-      return dataset_ops.Dataset.from_tensor_slices(({
-          'x': x_data
-      }, y_data)).batch(batch_size)
-
-    def predict_input_fn():
-      return dataset_ops.Dataset.from_tensor_slices({
-          'x': x_data
-      }).batch(batch_size)
-
-    feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))]
-
-    est = dnn.DNNClassifier(
-        hidden_units=(2, 2),
-        feature_columns=feature_columns,
-        n_classes=n_classes,
-        config=run_config_lib.RunConfig(save_summary_steps=save_summary_steps),
-        model_dir=self._model_dir)
-
-    train_spec = training.TrainSpec(input_fn=train_input_fn,
-                                    max_steps=max_steps)
-
-    eval_spec = training.EvalSpec(
-        name=eval_name,
-        input_fn=eval_input_fn,
-        steps=None,
-        exporters=self._get_exporter(exporter_name, feature_columns),
-        throttle_secs=0)
-
-    training.train_and_evaluate(est, train_spec, eval_spec)
-
-    # Make sure nothing is stuck in limbo.
-    writer_cache.FileWriterCache.clear()
-
-    # Examine the training events. Use a range to check global step to avoid
-    # flakyness due to global step race condition.
-    training_loss, _ = self._extract_loss_and_global_step(est.model_dir)
-    self.assertIsNotNone(training_loss)
-
-    # Examine the eval events. The global step should be accurate.
-    eval_loss, eval_global_step = self._extract_loss_and_global_step(
-        event_folder=est.eval_dir(eval_name))
-    self.assertIsNotNone(eval_loss)
-    self.assertEqual(max_steps, eval_global_step)
-
-    # Examine the export folder.
-    export_dir = os.path.join(os.path.join(est.model_dir, 'export'),
-                              exporter_name)
-    self.assertTrue(gfile.Exists(export_dir))
-
-    # Examine the ckpt for predict.
-    predicted_proba = np.array([
-        x[prediction_keys.PredictionKeys.PROBABILITIES]
-        for x in est.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
index fb110c4b7bbefad83870df682da4c9f3cdaf8242..88f36e572ad0f89655d5696f8316b49f3e2ec04a 100644
--- a/tensorflow/python/estimator/util.py
+++ b/tensorflow/python/estimator/util.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,142 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""util python module.
 
-"""Utilities for Estimators."""
+Importing from tensorflow.python.estimator is unsupported
+and will soon break!
+"""
+# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import time
+from tensorflow_estimator.python.estimator import util
 
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import training
-from tensorflow.python.util import compat
-from tensorflow.python.util import function_utils
+# Include attrs that start with single underscore.
+_HAS_DYNAMIC_ATTRIBUTES = True
+util.__all__ = [s for s in dir(util) if not s.startswith('__')]
 
-fn_args = function_utils.fn_args
-
-# When we create a timestamped directory, there is a small chance that the
-# directory already exists because another process is also creating these
-# directories. In this case we just wait one second to get a new timestamp and
-# try again. If this fails several times in a row, then something is seriously
-# wrong.
-MAX_DIRECTORY_CREATION_ATTEMPTS = 10
-
-
-def get_timestamped_dir(dir_base):
-  """Builds a path to a new subdirectory within the base directory.
-
-  The subdirectory will be named using the current time.
-  This guarantees monotonically increasing directory numbers even across
-  multiple runs of the pipeline.
-  The timestamp used is the number of seconds since epoch UTC.
-
-  Args:
-    dir_base: A string containing a directory to create the subdirectory under.
-
-  Returns:
-    The full path of the new subdirectory (which is not actually created yet).
-
-  Raises:
-    RuntimeError: if repeated attempts fail to obtain a unique timestamped
-      directory name.
-  """
-  attempts = 0
-  while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
-    timestamp = int(time.time())
-
-    result_dir = os.path.join(
-        compat.as_bytes(dir_base), compat.as_bytes(str(timestamp)))
-    if not gfile.Exists(result_dir):
-      # Collisions are still possible (though extremely unlikely): this
-      # directory is not actually created yet, but it will be almost
-      # instantly on return from this function.
-      return result_dir
-    time.sleep(1)
-    attempts += 1
-    logging.warn('Directory {} already exists; retrying (attempt {}/{})'.format(
-        result_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS))
-  raise RuntimeError('Failed to obtain a unique export directory name after '
-                     '{} attempts.'.format(MAX_DIRECTORY_CREATION_ATTEMPTS))
-
-
-def parse_input_fn_result(result):
-  """Gets features, labels, and hooks from the result of an Estimator input_fn.
-
-  Args:
-    result: output of an input_fn to an estimator, which should be one of:
-
-      * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a
-          tuple (features, labels) with same constraints as below.
-      * A tuple (features, labels): Where `features` is a `Tensor` or a
-        dictionary of string feature name to `Tensor` and `labels` is a
-        `Tensor` or a dictionary of string label name to `Tensor`. Both
-        `features` and `labels` are consumed by `model_fn`. They should
-        satisfy the expectation of `model_fn` from inputs.
-
-  Returns:
-    Tuple of features, labels, and input_hooks, where features are as described
-    above, labels are as described above or None, and input_hooks are a list
-    of SessionRunHooks to be included when running.
-
-  Raises:
-    ValueError: if the result is a list or tuple of length != 2.
-  """
-  input_hooks = []
-  try:
-    # We can't just check whether this is a tf.data.Dataset instance here,
-    # as this is plausibly a PerDeviceDataset. Try treating as a dataset first.
-    iterator = result.make_initializable_iterator()
-  except AttributeError:
-    # Not a dataset or dataset-like-object. Move along.
-    pass
-  else:
-    input_hooks.append(_DatasetInitializerHook(iterator))
-    result = iterator.get_next()
-  return parse_iterator_result(result) + (input_hooks,)
-
-
-def parse_iterator_result(result):
-  """Gets features, labels from result."""
-  if isinstance(result, (list, tuple)):
-    if len(result) != 2:
-      raise ValueError(
-          'input_fn should return (features, labels) as a len 2 tuple.')
-    return result[0], result[1]
-  return result, None
-
-
-class _DatasetInitializerHook(training.SessionRunHook):
-  """Creates a SessionRunHook that initializes the passed iterator."""
-
-  def __init__(self, iterator):
-    self._iterator = iterator
-
-  def begin(self):
-    self._initializer = self._iterator.initializer
-
-  def after_create_session(self, session, coord):
-    del coord
-    session.run(self._initializer)
-
-
-class StrategyInitFinalizeHook(training.SessionRunHook):
-  """Creates a SessionRunHook that initializes and shutsdown devices."""
-
-  def __init__(self, initialization_fn, finalize_fn):
-    self._initialization_fn = initialization_fn
-    self._finalize_fn = finalize_fn
-
-  def begin(self):
-    # We only create the init ops, but don't run it. We rely on SessionManager
-    # to run it for us.
-    self._init_ops = self._initialization_fn()
-    self._finalize_ops = self._finalize_fn()
-
-  def end(self, session):
-    logging.info('Finalize system.')
-    session.run(self._finalize_ops)
+from tensorflow_estimator.python.estimator.util import *
diff --git a/tensorflow/python/estimator/util_test.py b/tensorflow/python/estimator/util_test.py
deleted file mode 100644
index d440c454dc7857bf555f441469690864ff0a693d..0000000000000000000000000000000000000000
--- a/tensorflow/python/estimator/util_test.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Tests for util.py."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import util
-from tensorflow.python.framework import constant_op
-from tensorflow.python.platform import test
-from tensorflow.python.training import training
-
-
-class UtilTest(test.TestCase):
-  """Tests for miscellaneous Estimator utils."""
-
-  def test_parse_input_fn_result_tuple(self):
-    def _input_fn():
-      features = constant_op.constant(np.arange(100))
-      labels = constant_op.constant(np.arange(100, 200))
-      return features, labels
-
-    features, labels, hooks = util.parse_input_fn_result(_input_fn())
-
-    with self.cached_session() as sess:
-      vals = sess.run([features, labels])
-
-    self.assertAllEqual(vals[0], np.arange(100))
-    self.assertAllEqual(vals[1], np.arange(100, 200))
-    self.assertEqual(hooks, [])
-
-  def test_parse_input_fn_result_dataset(self):
-    def _input_fn():
-      features = np.expand_dims(np.arange(100), 0)
-      labels = np.expand_dims(np.arange(100, 200), 0)
-      return dataset_ops.Dataset.from_tensor_slices((features, labels))
-
-    features, labels, hooks = util.parse_input_fn_result(_input_fn())
-
-    with training.MonitoredSession(hooks=hooks) as sess:
-      vals = sess.run([features, labels])
-
-    self.assertAllEqual(vals[0], np.arange(100))
-    self.assertAllEqual(vals[1], np.arange(100, 200))
-    self.assertIsInstance(hooks[0], util._DatasetInitializerHook)
-
-  def test_parse_input_fn_result_features_only(self):
-    def _input_fn():
-      return constant_op.constant(np.arange(100))
-
-    features, labels, hooks = util.parse_input_fn_result(_input_fn())
-
-    with self.cached_session() as sess:
-      vals = sess.run([features])
-
-    self.assertAllEqual(vals[0], np.arange(100))
-    self.assertEqual(labels, None)
-    self.assertEqual(hooks, [])
-
-  def test_parse_input_fn_result_features_only_dataset(self):
-    def _input_fn():
-      features = np.expand_dims(np.arange(100), 0)
-      return dataset_ops.Dataset.from_tensor_slices(features)
-
-    features, labels, hooks = util.parse_input_fn_result(_input_fn())
-
-    with training.MonitoredSession(hooks=hooks) as sess:
-      vals = sess.run([features])
-
-    self.assertAllEqual(vals[0], np.arange(100))
-    self.assertEqual(labels, None)
-    self.assertIsInstance(hooks[0], util._DatasetInitializerHook)
-
-  def test_parse_input_fn_result_invalid(self):
-    def _input_fn():
-      features = np.expand_dims(np.arange(100), 0)
-      labels = np.expand_dims(np.arange(100, 200), 0)
-      return dataset_ops.Dataset.from_tensor_slices((features, labels, labels))
-
-    with self.assertRaisesRegexp(ValueError, 'input_fn should return'):
-      util.parse_input_fn_result(_input_fn())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index ac53a84eef8ef1482cf4b3efbaac97a608668ac3..d24a7ae80c86d407ae3bb60ca55fff98be9f27a1 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -54,6 +54,7 @@ py_library(
     srcs = ["feature_column_v2.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":feature_column",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
@@ -101,6 +102,7 @@ py_test(
     tags = [
         "no_cuda_on_cpu_tap",
         "no_pip",
+        "no_windows",
     ],
     deps = [
         ":feature_column",
@@ -123,6 +125,7 @@ py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -130,10 +133,12 @@ py_test(
     name = "feature_column_v2_test",
     srcs = ["feature_column_v2_test.py"],
     data = [":vocabulary_testdata"],
+    shard_count = 5,
     srcs_version = "PY2AND3",
     tags = [
         "no_cuda_on_cpu_tap",
         "no_pip",
+        "no_windows",
     ],
     deps = [
         ":feature_column_py",
@@ -156,7 +161,7 @@ py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:estimator_py",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 5352796174b5f7b465fe470dca9877bb0622cc38..a858d92608db1a0d9d00b34f91860b7d4be01d68 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -121,6 +121,10 @@ Example of building model using FeatureColumns, this can be used in a
 
 NOTE: Functions prefixed with "_" indicate experimental or private parts of
 the API subject to change, and should not be relied upon!
+
+NOTE: The new feature columns are being developed in feature_column_v2.py and
+are a somewhat duplicate of the code here. Please make sure to update logic
+in both places.
 """
 
 from __future__ import absolute_import
@@ -226,7 +230,7 @@ def _internal_input_layer(features,
       return _get_logits()
 
 
-@tf_export('feature_column.input_layer')
+@tf_export(v1=['feature_column.input_layer'])
 def input_layer(features,
                 feature_columns,
                 weight_collections=None,
@@ -361,7 +365,7 @@ class InputLayer(object):
     return self._input_layer_template.weights
 
 
-@tf_export('feature_column.linear_model')
+@tf_export(v1=['feature_column.linear_model'])
 def linear_model(features,
                  feature_columns,
                  units=1,
@@ -441,15 +445,16 @@ def linear_model(features,
             [0, 0]: "d"
             [1, 0]: "e"
             [1, 1]: "f"
-            [1, 2]: "g"
+            [1, 2]: "f"
         }
       ```
-      with `sparse_combiner` as "mean", the linear model outputs conceptly are:
+      with `sparse_combiner` as "mean", the linear model outputs consequently
+      are:
       ```
-        y_0 = 1.0 / 2.0 * ( w_a + w_ b) + w_c + b_0
-        y_1 = w_d + 1.0 / 3.0 * ( w_e + w_ f + w_g) + b_1
+        y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
+        y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
       ```
-      where `y_i` is the output, `b_i` is the bias, and `w_x` is the weight
+      where `y_i` is the output, `b` is the bias, and `w_x` is the weight
       assigned to the presence of `x` in the input features.
     weight_collections: A list of collection names to which the Variable will be
       added. Note that, variables will also be added to collections
@@ -741,7 +746,7 @@ def _transform_features(features, feature_columns):
   return outputs
 
 
-@tf_export('feature_column.make_parse_example_spec')
+@tf_export(v1=['feature_column.make_parse_example_spec'])
 def make_parse_example_spec(feature_columns):
   """Creates parsing spec dictionary from input feature_columns.
 
@@ -802,11 +807,14 @@ def make_parse_example_spec(feature_columns):
   return result
 
 
-@tf_export('feature_column.embedding_column')
-def embedding_column(
-    categorical_column, dimension, combiner='mean', initializer=None,
-    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
-    trainable=True):
+def _embedding_column(categorical_column,
+                      dimension,
+                      combiner='mean',
+                      initializer=None,
+                      ckpt_to_load_from=None,
+                      tensor_name_in_ckpt=None,
+                      max_norm=None,
+                      trainable=True):
   """`_DenseColumn` that converts from sparse, categorical input.
 
   Use this when your inputs are sparse, but you want to convert them to a dense
@@ -914,178 +922,11 @@ def embedding_column(
       trainable=trainable)
 
 
-@tf_export('feature_column.shared_embedding_columns')
-def shared_embedding_columns(
-    categorical_columns, dimension, combiner='mean', initializer=None,
-    shared_embedding_collection_name=None, ckpt_to_load_from=None,
-    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
-  """List of dense columns that convert from sparse, categorical input.
-
-  This is similar to `embedding_column`, except that it produces a list of
-  embedding columns that share the same embedding weights.
-
-  Use this when your inputs are sparse and of the same type (e.g. watched and
-  impression video IDs that share the same vocabulary), and you want to convert
-  them to a dense representation (e.g., to feed to a DNN).
-
-  Inputs must be a list of categorical columns created by any of the
-  `categorical_column_*` function. They must all be of the same type and have
-  the same arguments except `key`. E.g. they can be
-  categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
-  all columns could also be weighted_categorical_column.
-
-  Here is an example embedding of two features for a DNNClassifier model:
-
-  ```python
-  watched_video_id = categorical_column_with_vocabulary_file(
-      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
-  impression_video_id = categorical_column_with_vocabulary_file(
-      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
-  columns = shared_embedding_columns(
-      [watched_video_id, impression_video_id], dimension=10)
-
-  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
-
-  label_column = ...
-  def input_fn():
-    features = tf.parse_example(
-        ..., features=make_parse_example_spec(columns + [label_column]))
-    labels = features.pop(label_column.name)
-    return features, labels
-
-  estimator.train(input_fn=input_fn, steps=100)
-  ```
-
-  Here is an example using `shared_embedding_columns` with model_fn:
-
-  ```python
-  def model_fn(features, ...):
-    watched_video_id = categorical_column_with_vocabulary_file(
-        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
-    impression_video_id = categorical_column_with_vocabulary_file(
-        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
-    columns = shared_embedding_columns(
-        [watched_video_id, impression_video_id], dimension=10)
-    dense_tensor = input_layer(features, columns)
-    # Form DNN layers, calculate loss, and return EstimatorSpec.
-    ...
-  ```
-
-  Args:
-    categorical_columns: List of categorical columns created by a
-      `categorical_column_with_*` function. These columns produce the sparse IDs
-      that are inputs to the embedding lookup. All columns must be of the same
-      type and have the same arguments except `key`. E.g. they can be
-      categorical_column_with_vocabulary_file with the same vocabulary_file.
-      Some or all columns could also be weighted_categorical_column.
-    dimension: An integer specifying dimension of the embedding, must be > 0.
-    combiner: A string specifying how to reduce if there are multiple entries
-      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
-      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
-      with bag-of-words columns. Each of this can be thought as example level
-      normalizations on the column. For more information, see
-      `tf.embedding_lookup_sparse`.
-    initializer: A variable initializer function to be used in embedding
-      variable initialization. If not specified, defaults to
-      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
-      `1/sqrt(dimension)`.
-    shared_embedding_collection_name: Optional name of the collection where
-      shared embedding weights are added. If not given, a reasonable name will
-      be chosen based on the names of `categorical_columns`. This is also used
-      in `variable_scope` when creating shared embedding weights.
-    ckpt_to_load_from: String representing checkpoint name/pattern from which to
-      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
-    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
-      which to restore the column weights. Required if `ckpt_to_load_from` is
-      not `None`.
-    max_norm: If not `None`, each embedding is clipped if its l2-norm is
-      larger than this value, before combining.
-    trainable: Whether or not the embedding is trainable. Default is True.
-
-  Returns:
-    A list of dense columns that converts from sparse input. The order of
-    results follows the ordering of `categorical_columns`.
-
-  Raises:
-    ValueError: if `dimension` not > 0.
-    ValueError: if any of the given `categorical_columns` is of different type
-      or has different arguments than the others.
-    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
-      is specified.
-    ValueError: if `initializer` is specified and is not callable.
-    RuntimeError: if eager execution is enabled.
-  """
-  if context.executing_eagerly():
-    raise RuntimeError('shared_embedding_columns are not supported when eager '
-                       'execution is enabled.')
-
-  if (dimension is None) or (dimension < 1):
-    raise ValueError('Invalid dimension {}.'.format(dimension))
-  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
-    raise ValueError('Must specify both `ckpt_to_load_from` and '
-                     '`tensor_name_in_ckpt` or none of them.')
-
-  if (initializer is not None) and (not callable(initializer)):
-    raise ValueError('initializer must be callable if specified.')
-  if initializer is None:
-    initializer = init_ops.truncated_normal_initializer(
-        mean=0.0, stddev=1. / math.sqrt(dimension))
-
-  # Sort the columns so the default collection name is deterministic even if the
-  # user passes columns from an unsorted collection, such as dict.values().
-  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
-
-  c0 = sorted_columns[0]
-  num_buckets = c0._num_buckets  # pylint: disable=protected-access
-  if not isinstance(c0, _CategoricalColumn):
-    raise ValueError(
-        'All categorical_columns must be subclasses of _CategoricalColumn. '
-        'Given: {}, of type: {}'.format(c0, type(c0)))
-  if isinstance(c0, _WeightedCategoricalColumn):
-    c0 = c0.categorical_column
-  for c in sorted_columns[1:]:
-    if isinstance(c, _WeightedCategoricalColumn):
-      c = c.categorical_column
-    if not isinstance(c, type(c0)):
-      raise ValueError(
-          'To use shared_embedding_column, all categorical_columns must have '
-          'the same type, or be weighted_categorical_column of the same type. '
-          'Given column: {} of type: {} does not match given column: {} of '
-          'type: {}'.format(c0, type(c0), c, type(c)))
-    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
-      raise ValueError(
-          'To use shared_embedding_column, all categorical_columns must have '
-          'the same number of buckets. Given column: {} with buckets: {} does  '
-          'not match column: {} with buckets: {}'.format(
-              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
-
-  if not shared_embedding_collection_name:
-    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
-    shared_embedding_collection_name += '_shared_embedding'
-
-  result = []
-  for column in categorical_columns:
-    result.append(
-        _SharedEmbeddingColumn(
-            categorical_column=column,
-            initializer=initializer,
-            dimension=dimension,
-            combiner=combiner,
-            shared_embedding_collection_name=shared_embedding_collection_name,
-            ckpt_to_load_from=ckpt_to_load_from,
-            tensor_name_in_ckpt=tensor_name_in_ckpt,
-            max_norm=max_norm,
-            trainable=trainable))
-
-  return result
-
-
-@tf_export('feature_column.numeric_column')
-def numeric_column(key,
-                   shape=(1,),
-                   default_value=None,
-                   dtype=dtypes.float32,
-                   normalizer_fn=None):
+def _numeric_column(key,
+                    shape=(1,),
+                    default_value=None,
+                    dtype=dtypes.float32,
+                    normalizer_fn=None):
   """Represents real valued or numerical features.
 
   Example:
@@ -1156,8 +997,7 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
-@tf_export('feature_column.bucketized_column')
-def bucketized_column(source_column, boundaries):
+def _bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
   Buckets include the left boundary, and exclude the right boundary. Namely,
@@ -1253,10 +1093,9 @@ def _assert_key_is_string(key):
             type(key), key))
 
 
-@tf_export('feature_column.categorical_column_with_hash_bucket')
-def categorical_column_with_hash_bucket(key,
-                                        hash_bucket_size,
-                                        dtype=dtypes.string):
+def _categorical_column_with_hash_bucket(key,
+                                         hash_bucket_size,
+                                         dtype=dtypes.string):
   """Represents sparse feature where ids are set by hashing.
 
   Use this when your sparse features are in string or integer format, and you
@@ -1312,13 +1151,12 @@ def categorical_column_with_hash_bucket(key,
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
-@tf_export('feature_column.categorical_column_with_vocabulary_file')
-def categorical_column_with_vocabulary_file(key,
-                                            vocabulary_file,
-                                            vocabulary_size=None,
-                                            num_oov_buckets=0,
-                                            default_value=None,
-                                            dtype=dtypes.string):
+def _categorical_column_with_vocabulary_file(key,
+                                             vocabulary_file,
+                                             vocabulary_size=None,
+                                             num_oov_buckets=0,
+                                             default_value=None,
+                                             dtype=dtypes.string):
   """A `_CategoricalColumn` with a vocabulary file.
 
   Use this when your inputs are in string or integer format, and you have a
@@ -1432,9 +1270,11 @@ def categorical_column_with_vocabulary_file(key,
       dtype=dtype)
 
 
-@tf_export('feature_column.categorical_column_with_vocabulary_list')
-def categorical_column_with_vocabulary_list(
-    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
+def _categorical_column_with_vocabulary_list(key,
+                                             vocabulary_list,
+                                             dtype=None,
+                                             default_value=-1,
+                                             num_oov_buckets=0):
   """A `_CategoricalColumn` with in-memory vocabulary.
 
   Use this when your inputs are in string or integer format, and you have an
@@ -1543,8 +1383,7 @@ def categorical_column_with_vocabulary_list(
       default_value=default_value, num_oov_buckets=num_oov_buckets)
 
 
-@tf_export('feature_column.categorical_column_with_identity')
-def categorical_column_with_identity(key, num_buckets, default_value=None):
+def _categorical_column_with_identity(key, num_buckets, default_value=None):
   """A `_CategoricalColumn` that returns identity values.
 
   Use this when your inputs are integers in the range `[0, num_buckets)`, and
@@ -1611,8 +1450,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
       key=key, num_buckets=num_buckets, default_value=default_value)
 
 
-@tf_export('feature_column.indicator_column')
-def indicator_column(categorical_column):
+def _indicator_column(categorical_column):
   """Represents multi-hot representation of given categorical column.
 
   - For DNN model, `indicator_column` can be used to wrap any
@@ -1646,9 +1484,9 @@ def indicator_column(categorical_column):
   return _IndicatorColumn(categorical_column)
 
 
-@tf_export('feature_column.weighted_categorical_column')
-def weighted_categorical_column(
-    categorical_column, weight_feature_key, dtype=dtypes.float32):
+def _weighted_categorical_column(categorical_column,
+                                 weight_feature_key,
+                                 dtype=dtypes.float32):
   """Applies weight values to a `_CategoricalColumn`.
 
   Use this when each of your sparse inputs has both an ID and a value. For
@@ -1721,8 +1559,7 @@ def weighted_categorical_column(
       dtype=dtype)
 
 
-@tf_export('feature_column.crossed_column')
-def crossed_column(keys, hash_bucket_size, hash_key=None):
+def _crossed_column(keys, hash_bucket_size, hash_key=None):
   """Returns a column for performing crosses of categorical features.
 
   Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
@@ -1904,6 +1741,7 @@ class _EmbeddingColumnLayer(base.Layer):
     return self._embedding_weight_var
 
 
+@six.add_metaclass(abc.ABCMeta)
 class _FeatureColumn(object):
   """Represents a feature column abstraction.
 
@@ -1919,7 +1757,6 @@ class _FeatureColumn(object):
 
   This class is an abstract class. User should not create instances of this.
   """
-  __metaclass__ = abc.ABCMeta
 
   @abc.abstractproperty
   def name(self):
@@ -1996,8 +1833,6 @@ class _DenseColumn(_FeatureColumn):
   indicator_column.
   """
 
-  __metaclass__ = abc.ABCMeta
-
   @abc.abstractproperty
   def _variable_shape(self):
     """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
@@ -2090,7 +1925,6 @@ class _CategoricalColumn(_FeatureColumn):
 
   A categorical feature typically handled with a `tf.SparseTensor` of IDs.
   """
-  __metaclass__ = abc.ABCMeta
 
   IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
       'IdWeightPair', ['id_tensor', 'weight_tensor'])
@@ -2195,8 +2029,6 @@ def _create_categorical_column_weighted_sum(column,
 class _SequenceDenseColumn(_FeatureColumn):
   """Represents dense sequence data."""
 
-  __metaclass__ = abc.ABCMeta
-
   TensorSequenceLengthPair = collections.namedtuple(  # pylint: disable=invalid-name
       'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
 
@@ -2660,6 +2492,7 @@ class _EmbeddingColumn(
         inputs=inputs,
         weight_collections=weight_collections,
         trainable=trainable)
+
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
@@ -3383,19 +3216,29 @@ class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
 
 
 def _verify_static_batch_size_equality(tensors, columns):
+  """Validates that the first dim (batch size) of all tensors are equal or None.
+
+  Args:
+    tensors: list of tensors to check.
+    columns: list of feature columns matching tensors. Will be used for error
+      messaging.
+
+  Raises:
+    ValueError: if one of the tensors has a variant batch size
+  """
   # bath_size is a tf.Dimension object.
   expected_batch_size = None
   for i in range(0, len(tensors)):
-    if tensors[i].shape[0].value is not None:
+    if tensors[i].shape.dims[0].value is not None:
       if expected_batch_size is None:
         bath_size_column_index = i
-        expected_batch_size = tensors[i].shape[0]
-      elif not expected_batch_size.is_compatible_with(tensors[i].shape[0]):
+        expected_batch_size = tensors[i].shape.dims[0]
+      elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
         raise ValueError(
             'Batch size (first dimension) of each feature must be same. '
             'Batch size of columns ({}, {}): ({}, {})'.format(
                 columns[bath_size_column_index].name, columns[i].name,
-                expected_batch_size, tensors[i].shape[0]))
+                expected_batch_size, tensors[i].shape.dims[0]))
 
 
 def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
@@ -3403,9 +3246,18 @@ def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
   with ops.name_scope(None, 'sequence_length') as name_scope:
     row_ids = sp_tensor.indices[:, 0]
     column_ids = sp_tensor.indices[:, 1]
+    # Add one to convert column indices to element length
     column_ids += array_ops.ones_like(column_ids)
-    seq_length = math_ops.to_int64(
-        math_ops.segment_max(column_ids, segment_ids=row_ids) / num_elements)
+    # Get the number of elements we will have per example/row
+    seq_length = math_ops.segment_max(column_ids, segment_ids=row_ids)
+
+    # The raw values are grouped according to num_elements;
+    # how many entities will we have after grouping?
+    # Example: orig tensor [[1, 2], [3]], col_ids = (0, 1, 1),
+    # row_ids = (0, 0, 1), seq_length = [2, 1]. If num_elements = 2,
+    # these will get grouped, and the final seq_length is [1, 1]
+    seq_length = math_ops.to_int64(math_ops.ceil(seq_length / num_elements))
+
     # If the last n rows do not have ids, seq_length will have shape
     # [batch_size - n]. Pad the remaining values with zeros.
     n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
@@ -3439,25 +3291,17 @@ class _SequenceCategoricalColumn(
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
     id_tensor = sparse_tensors.id_tensor
     weight_tensor = sparse_tensors.weight_tensor
-    # Expands final dimension, so that embeddings are not combined during
-    # embedding lookup.
-    check_id_rank = check_ops.assert_equal(
-        array_ops.rank(id_tensor), 2,
-        data=[
-            'Column {} expected ID tensor of rank 2. '.format(self.name),
-            'id_tensor shape: ', array_ops.shape(id_tensor)])
-    with ops.control_dependencies([check_id_rank]):
-      id_tensor = sparse_ops.sparse_reshape(
-          id_tensor,
-          shape=array_ops.concat([id_tensor.dense_shape, [1]], axis=0))
+
+    # Expands third dimension, if necessary so that embeddings are not
+    # combined during embedding lookup. If the tensor is already 3D, leave
+    # as-is.
+    shape = array_ops.shape(id_tensor)
+    # Compute the third dimension explicitly instead of setting it to -1, as
+    # that doesn't work for dynamically shaped tensors with 0-length at runtime.
+    # This happens for empty sequences.
+    target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
+    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
     if weight_tensor is not None:
-      check_weight_rank = check_ops.assert_equal(
-          array_ops.rank(weight_tensor), 2,
-          data=[
-              'Column {} expected weight tensor of rank 2.'.format(self.name),
-              'weight_tensor shape:', array_ops.shape(weight_tensor)])
-      with ops.control_dependencies([check_weight_rank]):
-        weight_tensor = sparse_ops.sparse_reshape(
-            weight_tensor,
-            shape=array_ops.concat([weight_tensor.dense_shape, [1]], axis=0))
+      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
+
     return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 3b818f18b5b0fce99b81e51ce89e58c72cab0b91..68a2712425c56ae4b3e42c6bd7ae497c0358a074 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -20,4 +20,5 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.feature_column.feature_column import *
+from tensorflow.python.feature_column.feature_column_v2 import *
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 1ae510250cfd030d965d0480599d4e333fe30b50..daa0a3b3a4bb5fd067681c5ca91eaccdc64d3144 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -30,7 +30,8 @@ from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_v2 as fc_new
 from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
@@ -169,6 +170,7 @@ class LazyColumnTest(test.TestCase):
         TypeError, '"key" must be either a "str" or "_FeatureColumn".'):
       builder.get(NotAFeatureColumn())
 
+  @test_util.run_deprecated_v1
   def test_expand_dim_rank_1_sparse_tensor_empty_batch(self):
     # empty 1-D sparse tensor:
     builder = _LazyBuilder(features={'a': sparse_tensor.SparseTensor(
@@ -184,8 +186,9 @@ class LazyColumnTest(test.TestCase):
 
 class NumericColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    a = fc.numeric_column('aaa')
+    a = fc._numeric_column('aaa')
     self.assertEqual('aaa', a.key)
     self.assertEqual('aaa', a.name)
     self.assertEqual('aaa', a._var_scope_name)
@@ -196,53 +199,53 @@ class NumericColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.numeric_column(key=('aaa',))
+      fc._numeric_column(key=('aaa',))
 
   def test_shape_saved_as_tuple(self):
-    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    a = fc._numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
     self.assertEqual((1, 2), a.shape)
 
   def test_default_value_saved_as_tuple(self):
-    a = fc.numeric_column('aaa', default_value=4.)
+    a = fc._numeric_column('aaa', default_value=4.)
     self.assertEqual((4.,), a.default_value)
-    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    a = fc._numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
     self.assertEqual(((3., 2.),), a.default_value)
 
   def test_shape_and_default_value_compatibility(self):
-    fc.numeric_column('aaa', shape=[2], default_value=[1, 2.])
+    fc._numeric_column('aaa', shape=[2], default_value=[1, 2.])
     with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
-      fc.numeric_column('aaa', shape=[2], default_value=[1, 2, 3.])
-    fc.numeric_column(
+      fc._numeric_column('aaa', shape=[2], default_value=[1, 2, 3.])
+    fc._numeric_column(
         'aaa', shape=[3, 2], default_value=[[2, 3], [1, 2], [2, 3.]])
     with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[3, 1], default_value=[[2, 3], [1, 2], [2, 3.]])
     with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[3, 3], default_value=[[2, 3], [1, 2], [2, 3.]])
 
   def test_default_value_type_check(self):
-    fc.numeric_column(
+    fc._numeric_column(
         'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.float32)
-    fc.numeric_column(
+    fc._numeric_column(
         'aaa', shape=[2], default_value=[1, 2], dtype=dtypes.int32)
     with self.assertRaisesRegexp(TypeError, 'must be compatible with dtype'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.int32)
     with self.assertRaisesRegexp(TypeError,
                                  'default_value must be compatible with dtype'):
-      fc.numeric_column('aaa', default_value=['string'])
+      fc._numeric_column('aaa', default_value=['string'])
 
   def test_shape_must_be_positive_integer(self):
     with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[
               1.0,
           ])
 
     with self.assertRaisesRegexp(ValueError,
                                  'shape dimensions must be greater than 0'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[
               0,
           ])
@@ -250,20 +253,21 @@ class NumericColumnTest(test.TestCase):
   def test_dtype_is_convertible_to_float(self):
     with self.assertRaisesRegexp(ValueError,
                                  'dtype must be convertible to float'):
-      fc.numeric_column('aaa', dtype=dtypes.string)
+      fc._numeric_column('aaa', dtype=dtypes.string)
 
   def test_scalar_default_value_fills_the_shape(self):
-    a = fc.numeric_column('aaa', shape=[2, 3], default_value=2.)
+    a = fc._numeric_column('aaa', shape=[2, 3], default_value=2.)
     self.assertEqual(((2., 2., 2.), (2., 2., 2.)), a.default_value)
 
   def test_parse_spec(self):
-    a = fc.numeric_column('aaa', shape=[2, 3], dtype=dtypes.int32)
+    a = fc._numeric_column('aaa', shape=[2, 3], dtype=dtypes.int32)
     self.assertEqual({
         'aaa': parsing_ops.FixedLenFeature((2, 3), dtype=dtypes.int32)
     }, a._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_parse_example_no_default_value(self):
-    price = fc.numeric_column('price', shape=[2])
+    price = fc._numeric_column('price', shape=[2])
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -277,8 +281,9 @@ class NumericColumnTest(test.TestCase):
     with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
+  @test_util.run_deprecated_v1
   def test_parse_example_with_default_value(self):
-    price = fc.numeric_column('price', shape=[2], default_value=11.)
+    price = fc._numeric_column('price', shape=[2], default_value=11.)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -301,29 +306,31 @@ class NumericColumnTest(test.TestCase):
 
   def test_normalizer_fn_must_be_callable(self):
     with self.assertRaisesRegexp(TypeError, 'must be a callable'):
-      fc.numeric_column('price', normalizer_fn='NotACallable')
+      fc._numeric_column('price', normalizer_fn='NotACallable')
 
+  @test_util.run_deprecated_v1
   def test_normalizer_fn_transform_feature(self):
 
     def _increment_two(input_tensor):
       return input_tensor + 2.
 
-    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    price = fc._numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     output = _transform_features({'price': [[1., 2.], [5., 6.]]}, [price])
     with self.cached_session():
       self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval())
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
 
     def _increment_two(input_tensor):
       return input_tensor + 2.
 
-    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    price = fc._numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     builder = _LazyBuilder({'price': [[1., 2.], [5., 6.]]})
     self.assertEqual(builder.get(price), price._get_dense_tensor(builder))
 
   def test_sparse_tensor_not_supported(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     builder = _LazyBuilder({
         'price':
             sparse_tensor.SparseTensor(
@@ -332,109 +339,113 @@ class NumericColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
       price._transform_feature(builder)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
+    a = fc._numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
     a_copy = copy.deepcopy(a)
     self.assertEqual(a_copy.name, 'aaa')
     self.assertEqual(a_copy.shape, (1, 2))
     self.assertEqual(a_copy.default_value, ((3., 2.),))
 
   def test_numpy_default_value(self):
-    a = fc.numeric_column(
+    a = fc._numeric_column(
         'aaa', shape=[1, 2], default_value=np.array([[3., 2.]]))
     self.assertEqual(a.default_value, ((3., 2.),))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = fc.linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
 
 
 class BucketizedColumnTest(test.TestCase):
 
   def test_invalid_source_column_type(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', hash_bucket_size=10)
+    a = fc._categorical_column_with_hash_bucket('aaa', hash_bucket_size=10)
     with self.assertRaisesRegexp(
         ValueError,
         'source_column must be a column generated with numeric_column'):
-      fc.bucketized_column(a, boundaries=[0, 1])
+      fc._bucketized_column(a, boundaries=[0, 1])
 
   def test_invalid_source_column_shape(self):
-    a = fc.numeric_column('aaa', shape=[2, 3])
+    a = fc._numeric_column('aaa', shape=[2, 3])
     with self.assertRaisesRegexp(
         ValueError, 'source_column must be one-dimensional column'):
-      fc.bucketized_column(a, boundaries=[0, 1])
+      fc._bucketized_column(a, boundaries=[0, 1])
 
   def test_invalid_boundaries(self):
-    a = fc.numeric_column('aaa')
+    a = fc._numeric_column('aaa')
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=None)
+      fc._bucketized_column(a, boundaries=None)
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=1.)
+      fc._bucketized_column(a, boundaries=1.)
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=[1, 0])
+      fc._bucketized_column(a, boundaries=[1, 0])
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=[1, 1])
+      fc._bucketized_column(a, boundaries=[1, 1])
 
   def test_name(self):
-    a = fc.numeric_column('aaa', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     self.assertEqual('aaa_bucketized', b.name)
 
   def test_var_scope_name(self):
-    a = fc.numeric_column('aaa', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     self.assertEqual('aaa_bucketized', b._var_scope_name)
 
   def test_parse_spec(self):
-    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     self.assertEqual({
         'aaa': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32)
     }, b._parse_example_spec)
 
   def test_variable_shape(self):
-    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     # Column 'aaa` has shape [2] times three buckets -> variable_shape=[2, 3].
     self.assertAllEqual((2, 3), b._variable_shape)
 
   def test_num_buckets(self):
-    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     # Column 'aaa` has shape [2] times three buckets -> num_buckets=6.
     self.assertEqual(6, b._num_buckets)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 50])
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -448,9 +459,10 @@ class BucketizedColumnTest(test.TestCase):
     with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       transformed_tensor = _transform_features({
           'price': [[-1., 1.], [5., 6.]]
@@ -461,24 +473,22 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_get_dense_tensor_one_input_value(self):
     """Tests _get_dense_tensor() for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
       with _initialized_session():
         bucketized_price_tensor = bucketized_price._get_dense_tensor(builder)
         self.assertAllClose(
             # One-hot tensor.
-            [[[1., 0., 0., 0., 0.]],
-             [[0., 1., 0., 0., 0.]],
-             [[0., 0., 0., 1., 0.]],
-             [[0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+            [[[1., 0., 0., 0., 0.]], [[0., 1., 0., 0., 0.]],
+             [[0., 0., 0., 1., 0.]], [[0., 0., 0., 0., 1.]]],
+            self.evaluate(bucketized_price_tensor))
 
   def test_get_dense_tensor_two_input_values(self):
     """Tests _get_dense_tensor() for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
       with _initialized_session():
@@ -487,12 +497,12 @@ class BucketizedColumnTest(test.TestCase):
             # One-hot tensor.
             [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
              [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+            self.evaluate(bucketized_price_tensor))
 
   def test_get_sparse_tensors_one_input_value(self):
     """Tests _get_sparse_tensors() for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
       with _initialized_session() as sess:
@@ -506,8 +516,8 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_get_sparse_tensors_two_input_values(self):
     """Tests _get_sparse_tensors() for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
       with _initialized_session() as sess:
@@ -522,8 +532,8 @@ class BucketizedColumnTest(test.TestCase):
         self.assertAllEqual([2, 2], id_tensor_value.dense_shape)
 
   def test_sparse_tensor_input_not_supported(self):
-    price = fc.numeric_column('price')
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])
+    price = fc._numeric_column('price')
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 1])
     builder = _LazyBuilder({
         'price':
             sparse_tensor.SparseTensor(
@@ -532,9 +542,10 @@ class BucketizedColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
       bucketized_price._transform_feature(builder)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    a = fc.numeric_column('aaa', shape=[2])
-    a_bucketized = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2])
+    a_bucketized = fc._bucketized_column(a, boundaries=[0, 1])
     a_bucketized_copy = copy.deepcopy(a_bucketized)
     self.assertEqual(a_bucketized_copy.name, 'aaa_bucketized')
     self.assertAllEqual(a_bucketized_copy._variable_shape, (2, 3))
@@ -542,45 +553,48 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_linear_model_one_input_value(self):
     """Tests linear_model() for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1.], [1.], [5.], [6.]]}
       predictions = fc.linear_model(features, [bucketized_price])
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
-        self.assertAllClose(
-            [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(bucketized_price_var.assign(
             [[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_linear_model_two_input_values(self):
     """Tests linear_model() for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1., 1.], [5., 6.]]}
       predictions = fc.linear_model(features, [bucketized_price])
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(bucketized_price_var.assign(
             [[10.], [20.], [30.], [40.], [50.],
              [60.], [70.], [80.], [90.], [100.]]))
@@ -590,14 +604,14 @@ class BucketizedColumnTest(test.TestCase):
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
   def test_keras_linear_model_one_input_value(self):
     """Tests _LinearModel for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1.], [1.], [5.], [6.]]}
       predictions = get_keras_linear_model_predictions(features,
@@ -605,25 +619,28 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
         self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
-                            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_keras_linear_model_two_input_values(self):
     """Tests _LinearModel for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1., 1.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(features,
@@ -631,12 +648,12 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
                                          [60.], [70.], [80.], [90.], [100.]]))
@@ -646,15 +663,16 @@ class BucketizedColumnTest(test.TestCase):
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
 
 class HashedCategoricalColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual('aaa', a.name)
     self.assertEqual('aaa', a._var_scope_name)
     self.assertEqual('aaa', a.key)
@@ -663,25 +681,26 @@ class HashedCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_hash_bucket(('key',), 10)
+      fc._categorical_column_with_hash_bucket(('key',), 10)
 
   def test_bucket_size_should_be_given(self):
     with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be set.'):
-      fc.categorical_column_with_hash_bucket('aaa', None)
+      fc._categorical_column_with_hash_bucket('aaa', None)
 
   def test_bucket_size_should_be_positive(self):
     with self.assertRaisesRegexp(ValueError,
                                  'hash_bucket_size must be at least 1'):
-      fc.categorical_column_with_hash_bucket('aaa', 0)
+      fc._categorical_column_with_hash_bucket('aaa', 0)
 
   def test_dtype_should_be_string_or_integer(self):
-    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string)
-    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string)
+    fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
-      fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
+      fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    original = fc.categorical_column_with_hash_bucket('aaa', 10)
+    original = fc._categorical_column_with_hash_bucket('aaa', 10)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(10, column.hash_bucket_size)
@@ -689,19 +708,20 @@ class HashedCategoricalColumnTest(test.TestCase):
       self.assertEqual(dtypes.string, column.dtype)
 
   def test_parse_spec_string(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.string)
     }, a._parse_example_spec)
 
   def test_parse_spec_int(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, a._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -721,8 +741,9 @@ class HashedCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_strings_should_be_hashed(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
@@ -739,11 +760,11 @@ class HashedCategoricalColumnTest(test.TestCase):
                           output.dense_shape.eval())
 
   def test_tensor_dtype_should_be_string_or_integer(self):
-    string_fc = fc.categorical_column_with_hash_bucket(
+    string_fc = fc._categorical_column_with_hash_bucket(
         'a_string', 10, dtype=dtypes.string)
-    int_fc = fc.categorical_column_with_hash_bucket(
+    int_fc = fc._categorical_column_with_hash_bucket(
         'a_int', 10, dtype=dtypes.int32)
-    float_fc = fc.categorical_column_with_hash_bucket(
+    float_fc = fc._categorical_column_with_hash_bucket(
         'a_float', 10, dtype=dtypes.string)
     int_tensor = sparse_tensor.SparseTensor(
         values=[101],
@@ -768,7 +789,7 @@ class HashedCategoricalColumnTest(test.TestCase):
       builder.get(float_fc)
 
   def test_dtype_should_match_with_tensor(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket(
+    hashed_sparse = fc._categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -776,8 +797,9 @@ class HashedCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       builder.get(hashed_sparse)
 
+  @test_util.run_deprecated_v1
   def test_ints_should_be_hashed(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket(
+    hashed_sparse = fc._categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=[101, 201, 301],
@@ -790,8 +812,9 @@ class HashedCategoricalColumnTest(test.TestCase):
     with self.cached_session():
       self.assertAllEqual(expected_values, output.values.eval())
 
+  @test_util.run_deprecated_v1
   def test_int32_64_is_compatible(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket(
+    hashed_sparse = fc._categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=constant_op.constant([101, 201, 301], dtype=dtypes.int32),
@@ -804,8 +827,9 @@ class HashedCategoricalColumnTest(test.TestCase):
     with self.cached_session():
       self.assertAllEqual(expected_values, output.values.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     builder = _LazyBuilder({
         'wire':
             sparse_tensor.SparseTensor(
@@ -818,7 +842,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_hash_bucket('aaa', 10)
+    column = fc._categorical_column_with_hash_bucket('aaa', 10)
     inputs = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
@@ -832,15 +856,17 @@ class HashedCategoricalColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     builder = _LazyBuilder({'wire': (('omar', ''), ('stringer', 'marlo'))})
     id_weight_pair = hashed_sparse._get_sparse_tensors(builder)
     self.assertIsNone(id_weight_pair.weight_tensor)
     self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    wire_column = fc._categorical_column_with_hash_bucket('wire', 4)
     self.assertEqual(4, wire_column._num_buckets)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
@@ -852,16 +878,18 @@ class HashedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 3: wire_var[3] = 4
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+        self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    wire_column = fc._categorical_column_with_hash_bucket('wire', 4)
     self.assertEqual(4, wire_column._num_buckets)
     with ops.Graph().as_default():
       predictions = get_keras_linear_model_predictions({
@@ -874,13 +902,14 @@ class HashedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 3: wire_var[3] = 4
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+        self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
 
 
 class CrossedColumnTest(test.TestCase):
@@ -888,100 +917,102 @@ class CrossedColumnTest(test.TestCase):
   def test_keys_empty(self):
     with self.assertRaisesRegexp(
         ValueError, 'keys must be a list with length > 1'):
-      fc.crossed_column([], 10)
+      fc._crossed_column([], 10)
 
   def test_keys_length_one(self):
     with self.assertRaisesRegexp(
         ValueError, 'keys must be a list with length > 1'):
-      fc.crossed_column(['a'], 10)
+      fc._crossed_column(['a'], 10)
 
   def test_key_type_unsupported(self):
     with self.assertRaisesRegexp(ValueError, 'Unsupported key type'):
-      fc.crossed_column(['a', fc.numeric_column('c')], 10)
+      fc._crossed_column(['a', fc._numeric_column('c')], 10)
 
     with self.assertRaisesRegexp(
         ValueError, 'categorical_column_with_hash_bucket is not supported'):
-      fc.crossed_column(
-          ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10)
+      fc._crossed_column(
+          ['a', fc._categorical_column_with_hash_bucket('c', 10)], 10)
 
   def test_hash_bucket_size_negative(self):
     with self.assertRaisesRegexp(
         ValueError, 'hash_bucket_size must be > 1'):
-      fc.crossed_column(['a', 'c'], -1)
+      fc._crossed_column(['a', 'c'], -1)
 
   def test_hash_bucket_size_zero(self):
     with self.assertRaisesRegexp(
         ValueError, 'hash_bucket_size must be > 1'):
-      fc.crossed_column(['a', 'c'], 0)
+      fc._crossed_column(['a', 'c'], 0)
 
   def test_hash_bucket_size_none(self):
     with self.assertRaisesRegexp(
         ValueError, 'hash_bucket_size must be > 1'):
-      fc.crossed_column(['a', 'c'], None)
+      fc._crossed_column(['a', 'c'], None)
 
   def test_name(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
 
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_name_ordered_alphabetically(self):
     """Tests that the name does not depend on the order of given columns."""
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
 
-    crossed2 = fc.crossed_column([crossed1, 'c', b], 10)
+    crossed2 = fc._crossed_column([crossed1, 'c', b], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_name_leaf_keys_ordered_alphabetically(self):
     """Tests that the name does not depend on the order of given columns."""
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d2', 'c'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d2', 'c'], 10)
 
-    crossed2 = fc.crossed_column([crossed1, 'd1', b], 10)
+    crossed2 = fc._crossed_column([crossed1, 'd1', b], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_var_scope_name(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
 
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2._var_scope_name)
 
   def test_parse_spec(self):
-    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed = fc.crossed_column([b, 'c'], 10)
+    a = fc._numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed = fc._crossed_column([b, 'c'], 10)
     self.assertEqual({
         'a': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32),
         'c': parsing_ops.VarLenFeature(dtypes.string),
     }, crossed._parse_example_spec)
 
   def test_num_buckets(self):
-    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed = fc.crossed_column([b, 'c'], 15)
+    a = fc._numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed = fc._crossed_column([b, 'c'], 15)
     self.assertEqual(15, crossed._num_buckets)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 15, hash_key=5)
     crossed2_copy = copy.deepcopy(crossed2)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name,)
     self.assertEqual(15, crossed2_copy.hash_bucket_size)
     self.assertEqual(5, crossed2_copy.hash_key)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
-    price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10)
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 50])
+    price_cross_wire = fc._crossed_column([bucketized_price, 'wire'], 10)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -1004,12 +1035,13 @@ class CrossedColumnTest(test.TestCase):
       self.assertAllEqual([b'omar', b'stringer'], wire_sparse.values.eval())
       self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 50])
     hash_bucket_size = 10
-    price_cross_wire = fc.crossed_column(
-        [bucketized_price, 'wire'], hash_bucket_size)
+    price_cross_wire = fc._crossed_column([bucketized_price, 'wire'],
+                                          hash_bucket_size)
     features = {
         'price': constant_op.constant([[1., 2.], [5., 6.]]),
         'wire': sparse_tensor.SparseTensor(
@@ -1020,18 +1052,19 @@ class CrossedColumnTest(test.TestCase):
     outputs = _transform_features(features, [price_cross_wire])
     output = outputs[price_cross_wire]
     with self.cached_session() as sess:
-      output_val = sess.run(output)
+      output_val = self.evaluate(output)
       self.assertAllEqual(
           [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
       for val in output_val.values:
         self.assertIn(val, list(range(hash_bucket_size)))
       self.assertAllEqual([2, 4], output_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 15, hash_key=5)
     with ops.Graph().as_default():
       builder = _LazyBuilder({
           'a':
@@ -1069,9 +1102,9 @@ class CrossedColumnTest(test.TestCase):
 
   def test_get_sparse_tensors_simple(self):
     """Same as test_get_sparse_tensors, but with simpler values."""
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed = fc._crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       builder = _LazyBuilder({
           'a':
@@ -1094,14 +1127,15 @@ class CrossedColumnTest(test.TestCase):
         self.assertAllEqual(expected_values, id_tensor_eval.values)
         self.assertAllEqual((2, 4), id_tensor_eval.dense_shape)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     """Tests linear_model.
 
     Uses data from test_get_sparse_tesnsors_simple.
     """
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed = fc._crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
           'a': constant_op.constant(((-1., .5), (.5, 1.))),
@@ -1113,15 +1147,15 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(
-            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_linear_model_with_weights(self):
     class _TestColumnWithWeights(_CategoricalColumn):
@@ -1155,7 +1189,7 @@ class CrossedColumnTest(test.TestCase):
             id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
 
     t = _TestColumnWithWeights()
-    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    crossed = fc._crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       with self.assertRaisesRegexp(
           ValueError,
@@ -1175,14 +1209,15 @@ class CrossedColumnTest(test.TestCase):
                 dense_shape=(2, 2)),
         }, (crossed,))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
     """Tests _LinearModel.
 
     Uses data from test_get_sparse_tesnsors_simple.
     """
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed = fc._crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       predictions = get_keras_linear_model_predictions({
           'a':
@@ -1196,15 +1231,15 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
-                            crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_keras_linear_model_with_weights(self):
 
@@ -1242,7 +1277,7 @@ class CrossedColumnTest(test.TestCase):
             id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
 
     t = _TestColumnWithWeights()
-    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    crossed = fc._crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       with self.assertRaisesRegexp(
           ValueError,
@@ -1331,31 +1366,31 @@ class LinearModelTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
       fc.linear_model(
-          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+          features={'a': [[0]]}, feature_columns={'a': fc._numeric_column('a')})
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
       fc.linear_model(
           features={'a': [[0]]},
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+          feature_columns=[fc._numeric_column('a'),
+                           fc._numeric_column('a')])
 
   def test_dense_bias(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = fc.linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1366,15 +1401,16 @@ class LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    price = fc.numeric_column('price')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1389,7 +1425,7 @@ class LinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
@@ -1442,25 +1478,25 @@ class LinearModelTest(test.TestCase):
         sess.run(dense_and_sparse_column_var.assign(
             [[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = fc.linear_model(features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1471,29 +1507,29 @@ class LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
             wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
                 1000., 1100., 1200.
             ], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = fc.linear_model(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = array_ops.sparse_placeholder(dtypes.string)
       wire_value = sparse_tensor.SparseTensorValue(
@@ -1504,7 +1540,7 @@ class LinearModelTest(test.TestCase):
       predictions = fc.linear_model(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -1514,7 +1550,7 @@ class LinearModelTest(test.TestCase):
             predictions.eval(feed_dict={wire_tensor: wire_value}))
 
   def test_sparse_combiner(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1528,11 +1564,11 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_sparse_combiner_with_negative_weights(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast_weights = fc._weighted_categorical_column(wire_cast, 'weights')
 
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
@@ -1550,25 +1586,25 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+        self.assertAllClose([[1005.], [-9985.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = fc.linear_model(features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
@@ -1577,22 +1613,22 @@ class LinearModelTest(test.TestCase):
         fc.linear_model(features, [price])
 
   def test_dense_reshaping(self):
-    price = fc.numeric_column('price', shape=[1, 2])
+    price = fc._numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       predictions = fc.linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [5., 6.]],
@@ -1603,18 +1639,18 @@ class LinearModelTest(test.TestCase):
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_fills_cols_to_vars(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       cols_to_vars = {}
@@ -1627,8 +1663,8 @@ class LinearModelTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[price2], [price2_var])
 
   def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2', shape=3)
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2', shape=3)
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [6., 7.]],
@@ -1653,13 +1689,13 @@ class LinearModelTest(test.TestCase):
     # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
     # _BucketizedColumn, and an _EmbeddingColumn.  Only the _EmbeddingColumn
     # creates a Variable.
-    apple_numeric_column = fc.numeric_column('apple_numeric_column')
-    banana_dense_feature = fc.numeric_column('banana_dense_feature')
-    banana_dense_feature_bucketized = fc.bucketized_column(
+    apple_numeric_column = fc._numeric_column('apple_numeric_column')
+    banana_dense_feature = fc._numeric_column('banana_dense_feature')
+    banana_dense_feature_bucketized = fc._bucketized_column(
         banana_dense_feature, boundaries=[0.])
-    cherry_sparse_column = fc.categorical_column_with_hash_bucket(
+    cherry_sparse_column = fc._categorical_column_with_hash_bucket(
         'cherry_sparse_feature', hash_bucket_size=5)
-    dragonfruit_embedding_column = fc.embedding_column(
+    dragonfruit_embedding_column = fc._embedding_column(
         cherry_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
@@ -1684,7 +1720,7 @@ class LinearModelTest(test.TestCase):
       self.assertItemsEqual(input_layer_inputs, output_tensors)
 
   def test_dense_collection(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       fc.linear_model(features, [price], weight_collections=['my-vars'])
@@ -1695,7 +1731,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(price_var, my_vars)
 
   def test_sparse_collection(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -1709,7 +1745,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, my_vars)
 
   def test_dense_trainable_default(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       fc.linear_model(features, [price])
@@ -1720,7 +1756,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(price_var, trainable_vars)
 
   def test_sparse_trainable_default(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -1733,7 +1769,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, trainable_vars)
 
   def test_dense_trainable_false(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       fc.linear_model(features, [price], trainable=False)
@@ -1741,7 +1777,7 @@ class LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_sparse_trainable_false(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -1751,9 +1787,9 @@ class LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_column_order(self):
-    price_a = fc.numeric_column('price_a')
-    price_b = fc.numeric_column('price_b')
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price_a = fc._numeric_column('price_a')
+    price_b = fc._numeric_column('price_b')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       features = {
           'price_a': [[1.]],
@@ -1787,8 +1823,8 @@ class LinearModelTest(test.TestCase):
       self.assertIn('wire_cast', my_vars[2].name)
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -1800,9 +1836,9 @@ class LinearModelTest(test.TestCase):
       fc.linear_model(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    price3 = fc.numeric_column('price3')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
+    price3 = fc._numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -1815,8 +1851,8 @@ class LinearModelTest(test.TestCase):
         fc.linear_model(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -1830,8 +1866,8 @@ class LinearModelTest(test.TestCase):
               predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
@@ -1846,10 +1882,16 @@ class LinearModelTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -1873,14 +1915,21 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -1917,8 +1966,9 @@ class LinearModelTest(test.TestCase):
                                   features['country']: country_data
                               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -1939,7 +1989,7 @@ class LinearModelTest(test.TestCase):
         sess.run(net, feed_dict={features['price']: np.array(1)})
 
   def test_multiple_linear_models(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features1 = {'price': [[1.], [5.]]}
       features2 = {'price': [[2.], [10.]]}
@@ -1950,14 +2000,14 @@ class LinearModelTest(test.TestCase):
       price_var1 = get_linear_model_column_var(price, name='linear_model')
       price_var2 = get_linear_model_column_var(price, name='linear_model_1')
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias1.eval())
+        self.assertAllClose([0.], self.evaluate(bias1))
         sess.run(price_var1.assign([[10.]]))
         sess.run(bias1.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions1.eval())
-        self.assertAllClose([0.], bias2.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions1))
+        self.assertAllClose([0.], self.evaluate(bias2))
         sess.run(price_var2.assign([[10.]]))
         sess.run(bias2.assign([5.]))
-        self.assertAllClose([[25.], [105.]], predictions2.eval())
+        self.assertAllClose([[25.], [105.]], self.evaluate(predictions2))
 
 
 class _LinearModelTest(test.TestCase):
@@ -1996,31 +2046,31 @@ class _LinearModelTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
       fc.linear_model(
-          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+          features={'a': [[0]]}, feature_columns={'a': fc._numeric_column('a')})
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
       get_keras_linear_model_predictions(
           features={'a': [[0]]},
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+          feature_columns=[fc._numeric_column('a'),
+                           fc._numeric_column('a')])
 
   def test_dense_bias(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2031,15 +2081,16 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    price = fc.numeric_column('price')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2055,7 +2106,7 @@ class _LinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
@@ -2114,10 +2165,10 @@ class _LinearModelTest(test.TestCase):
             dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
                                                 [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(
@@ -2125,15 +2176,15 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2145,29 +2196,29 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
             wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
                                   [1000., 1100.,
                                    1200.], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = array_ops.sparse_placeholder(dtypes.string)
       wire_value = sparse_tensor.SparseTensorValue(
@@ -2178,7 +2229,7 @@ class _LinearModelTest(test.TestCase):
       predictions = get_keras_linear_model_predictions(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -2188,7 +2239,7 @@ class _LinearModelTest(test.TestCase):
             predictions.eval(feed_dict={wire_tensor: wire_value}))
 
   def test_sparse_combiner(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2202,10 +2253,10 @@ class _LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(
@@ -2213,15 +2264,15 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
@@ -2230,22 +2281,22 @@ class _LinearModelTest(test.TestCase):
         get_keras_linear_model_predictions(features, [price])
 
   def test_dense_reshaping(self):
-    price = fc.numeric_column('price', shape=[1, 2])
+    price = fc._numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       predictions = get_keras_linear_model_predictions(features,
@@ -2254,18 +2305,18 @@ class _LinearModelTest(test.TestCase):
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_fills_cols_to_vars(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       cols_to_vars = {}
@@ -2279,8 +2330,8 @@ class _LinearModelTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[price2], [price2_var])
 
   def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2', shape=3)
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2', shape=3)
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [6., 7.]],
@@ -2303,7 +2354,7 @@ class _LinearModelTest(test.TestCase):
         self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
 
   def test_dense_collection(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(
@@ -2315,7 +2366,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(price_var, my_vars)
 
   def test_sparse_collection(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -2329,7 +2380,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, my_vars)
 
   def test_dense_trainable_default(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(features, [price])
@@ -2340,7 +2391,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(price_var, trainable_vars)
 
   def test_sparse_trainable_default(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -2353,7 +2404,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, trainable_vars)
 
   def test_dense_trainable_false(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(features, [price], trainable=False)
@@ -2361,7 +2412,7 @@ class _LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_sparse_trainable_false(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -2371,9 +2422,9 @@ class _LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_column_order(self):
-    price_a = fc.numeric_column('price_a')
-    price_b = fc.numeric_column('price_b')
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price_a = fc._numeric_column('price_a')
+    price_b = fc._numeric_column('price_b')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       features = {
           'price_a': [[1.]],
@@ -2407,8 +2458,8 @@ class _LinearModelTest(test.TestCase):
       self.assertIn('wire_cast', my_vars[2].name)
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -2420,9 +2471,9 @@ class _LinearModelTest(test.TestCase):
       get_keras_linear_model_predictions(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    price3 = fc.numeric_column('price3')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
+    price3 = fc._numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2435,8 +2486,8 @@ class _LinearModelTest(test.TestCase):
         get_keras_linear_model_predictions(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2451,8 +2502,8 @@ class _LinearModelTest(test.TestCase):
               predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
@@ -2468,15 +2519,16 @@ class _LinearModelTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
         price, boundaries=[
             0.,
             10.,
             100.,
         ])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -2506,19 +2558,21 @@ class _LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
         price, boundaries=[
             0.,
             10.,
             100.,
         ])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -2554,8 +2608,9 @@ class _LinearModelTest(test.TestCase):
                                   features['country']: country_data
                               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -2581,7 +2636,7 @@ class InputLayerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def test_retrieving_input(self):
     features = {'a': [0.]}
-    input_layer = InputLayer(fc.numeric_column('a'))
+    input_layer = InputLayer(fc._numeric_column('a'))
     inputs = self.evaluate(input_layer(features))
     self.assertAllClose([[0.]], inputs)
 
@@ -2593,8 +2648,8 @@ class InputLayerTest(test.TestCase):
           dense_shape=(3, 3))
 
       # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
+      categorical_column = fc._categorical_column_with_identity(
+          key='a', num_buckets=3)
       embedding_dimension = 2
       def _embedding_column_initializer(shape, dtype, partition_info):
         del shape  # unused
@@ -2605,7 +2660,8 @@ class InputLayerTest(test.TestCase):
             (0, 1),  # id 1
             (1, 1))  # id 2
         return embedding_values
-      embedding_column = fc.embedding_column(
+
+      embedding_column = fc._embedding_column(
           categorical_column,
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
@@ -2636,8 +2692,8 @@ class InputLayerTest(test.TestCase):
           dense_shape=(3, 3))
 
       # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
+      categorical_column = fc._categorical_column_with_identity(
+          key='a', num_buckets=3)
       embedding_dimension = 2
 
       def _embedding_column_initializer(shape, dtype, partition_info):
@@ -2650,7 +2706,7 @@ class InputLayerTest(test.TestCase):
             (1, 1))  # id 2
         return embedding_values
 
-      embedding_column = fc.embedding_column(
+      embedding_column = fc._embedding_column(
           categorical_column,
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
@@ -2687,56 +2743,56 @@ class FunctionalInputLayerTest(test.TestCase):
       fc.input_layer(
           features={'a': [[0]]},
           feature_columns=[
-              fc.categorical_column_with_hash_bucket('wire_cast', 4)
+              fc._categorical_column_with_hash_bucket('wire_cast', 4)
           ])
 
   def test_does_not_support_dict_columns(self):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
       fc.input_layer(
-          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+          features={'a': [[0]]}, feature_columns={'a': fc._numeric_column('a')})
 
   def test_bare_column(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.]}
-      net = fc.input_layer(features, fc.numeric_column('a'))
+      net = fc.input_layer(features, fc._numeric_column('a'))
       with _initialized_session():
-        self.assertAllClose([[0.]], net.eval())
+        self.assertAllClose([[0.]], self.evaluate(net))
 
   def test_column_generator(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.], 'b': [1.]}
-      columns = (fc.numeric_column(key) for key in features)
+      columns = (fc._numeric_column(key) for key in features)
       net = fc.input_layer(features, columns)
       with _initialized_session():
-        self.assertAllClose([[0., 1.]], net.eval())
+        self.assertAllClose([[0., 1.]], self.evaluate(net))
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
       fc.input_layer(
           features={'a': [[0]]},
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+          feature_columns=[fc._numeric_column('a'),
+                           fc._numeric_column('a')])
 
   def test_one_column(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       net = fc.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1.], [5.]], net.eval())
+        self.assertAllClose([[1.], [5.]], self.evaluate(net))
 
   def test_multi_dimension(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       net = fc.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_raises_if_shape_mismatch(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
@@ -2745,16 +2801,16 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [price])
 
   def test_reshaping(self):
-    price = fc.numeric_column('price', shape=[1, 2])
+    price = fc._numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       net = fc.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_multi_column(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [5., 6.]],
@@ -2762,19 +2818,19 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       net = fc.input_layer(features, [price1, price2])
       with _initialized_session():
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
 
   def test_fills_cols_to_vars(self):
     # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
     # _BucketizedColumn, and an _EmbeddingColumn.  Only the _EmbeddingColumn
     # creates a Variable.
-    price1 = fc.numeric_column('price1')
-    dense_feature = fc.numeric_column('dense_feature')
-    dense_feature_bucketized = fc.bucketized_column(
+    price1 = fc._numeric_column('price1')
+    dense_feature = fc._numeric_column('dense_feature')
+    dense_feature_bucketized = fc._bucketized_column(
         dense_feature, boundaries=[0.])
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
@@ -2793,24 +2849,25 @@ class FunctionalInputLayerTest(test.TestCase):
                             variables_lib.Variable)
       self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
 
+  @test_util.run_deprecated_v1
   def test_fills_cols_to_vars_shared_embedding(self):
     # Provide 5 DenseColumn's to input_layer: a NumericColumn, a
     # BucketizedColumn, an EmbeddingColumn, two SharedEmbeddingColumns. The
     # EmbeddingColumn creates a Variable and the two SharedEmbeddingColumns
     # shared one variable.
-    price1 = fc.numeric_column('price1')
-    dense_feature = fc.numeric_column('dense_feature')
-    dense_feature_bucketized = fc.bucketized_column(
+    price1 = fc._numeric_column('price1')
+    dense_feature = fc._numeric_column('dense_feature')
+    dense_feature_bucketized = fc._bucketized_column(
         dense_feature, boundaries=[0.])
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
     with ops.Graph().as_default():
       features = {
@@ -2850,13 +2907,13 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[shared_embedding_a][0].shape, [3, 2])
 
   def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1')
-    dense_feature = fc.numeric_column('dense_feature')
-    dense_feature_bucketized = fc.bucketized_column(
+    price1 = fc._numeric_column('price1')
+    dense_feature = fc._numeric_column('dense_feature')
+    dense_feature_bucketized = fc._bucketized_column(
         dense_feature, boundaries=[0.])
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
@@ -2883,8 +2940,8 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[some_embedding_column][2].shape, [1, 10])
 
   def test_column_order(self):
-    price_a = fc.numeric_column('price_a')
-    price_b = fc.numeric_column('price_b')
+    price_a = fc._numeric_column('price_a')
+    price_b = fc._numeric_column('price_b')
     with ops.Graph().as_default():
       features = {
           'price_a': [[1.]],
@@ -2893,11 +2950,11 @@ class FunctionalInputLayerTest(test.TestCase):
       net1 = fc.input_layer(features, [price_a, price_b])
       net2 = fc.input_layer(features, [price_b, price_a])
       with _initialized_session():
-        self.assertAllClose([[1., 3.]], net1.eval())
-        self.assertAllClose([[1., 3.]], net2.eval())
+        self.assertAllClose([[1., 3.]], self.evaluate(net1))
+        self.assertAllClose([[1., 3.]], self.evaluate(net2))
 
   def test_fails_for_categorical_column(self):
-    animal = fc.categorical_column_with_identity('animal', num_buckets=4)
+    animal = fc._categorical_column_with_identity('animal', num_buckets=4)
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -2908,8 +2965,8 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [animal])
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -2921,9 +2978,9 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    price3 = fc.numeric_column('price3')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
+    price3 = fc._numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2936,8 +2993,8 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2950,8 +3007,8 @@ class FunctionalInputLayerTest(test.TestCase):
           sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
@@ -2967,9 +3024,9 @@ class FunctionalInputLayerTest(test.TestCase):
             })
 
   def test_multiple_layers_with_same_embedding_column(self):
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
 
     with ops.Graph().as_default():
@@ -2990,13 +3047,14 @@ class FunctionalInputLayerTest(test.TestCase):
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_multiple_layers_with_same_shared_embedding_column(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc_new.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
 
@@ -3023,13 +3081,14 @@ class FunctionalInputLayerTest(test.TestCase):
           ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc_new.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     all_cols = [embedding_column_a, embedding_column_b]
@@ -3074,6 +3133,7 @@ class FunctionalInputLayerTest(test.TestCase):
           ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
@@ -3085,18 +3145,18 @@ class FunctionalInputLayerTest(test.TestCase):
       return embedding_values
 
     # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
 
     # one_hot_body_style has 3 dims in input_layer.
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    one_hot_body_style = fc.indicator_column(body_style)
+    one_hot_body_style = fc._indicator_column(body_style)
 
     # embedded_body_style has 5 dims in input_layer.
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc.embedding_column(country, dimension=5,
-                                           initializer=_initializer)
+    embedded_country = fc._embedding_column(
+        country, dimension=5, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
     features = {
@@ -3124,6 +3184,7 @@ class FunctionalInputLayerTest(test.TestCase):
            [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
           sess.run(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
     embedding_values = (
         (1., 2.),  # id 0
@@ -3135,17 +3196,17 @@ class FunctionalInputLayerTest(test.TestCase):
       return embedding_values
 
     # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
 
     # one_hot_body_style has 3 dims in input_layer.
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    one_hot_body_style = fc.indicator_column(body_style)
+    one_hot_body_style = fc._indicator_column(body_style)
 
     # embedded_body_style has 5 dims in input_layer.
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc.embedding_column(
+    embedded_country = fc._embedding_column(
         country, dimension=2, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
@@ -3183,9 +3244,10 @@ class FunctionalInputLayerTest(test.TestCase):
                   features['country']: country_data
               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
     # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -3313,8 +3375,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         'python/feature_column/testdata/wire_vocabulary.txt')
     self._wire_vocabulary_size = 3
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column._var_scope_name)
@@ -3326,22 +3389,30 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key=('aaa',), vocabulary_file='path_to_file', vocabulary_size=3)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
-    column = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+    column = fc._categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     self.assertEqual(7, column._num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    original = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+    original = fc._categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(7, column._num_buckets)
@@ -3351,16 +3422,17 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_vocabulary_file_none(self):
     with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file=None, vocabulary_size=3)
 
   def test_vocabulary_file_empty_string(self):
     with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file='', vocabulary_size=3)
 
+  @test_util.run_deprecated_v1
   def test_invalid_vocabulary_file(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='file_does_not_exist', vocabulary_size=10)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
@@ -3373,16 +3445,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_invalid_vocabulary_size(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=-1)
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=0)
 
+  @test_util.run_deprecated_v1
   def test_too_large_vocabulary_size(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size + 1)
@@ -3397,20 +3472,24 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           num_oov_buckets=-1)
 
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           dtype=dtypes.float64)
 
   def test_invalid_buckets_and_default_value(self):
     with self.assertRaisesRegexp(
         ValueError, 'both num_oov_buckets and default_value'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key='aaa',
           vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=self._wire_vocabulary_size,
@@ -3418,7 +3497,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           default_value=2)
 
   def test_invalid_input_dtype_int32(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3431,7 +3510,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
   def test_invalid_input_dtype_string(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3443,8 +3522,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_file(
+    a = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -3465,8 +3545,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3485,8 +3566,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_none_vocabulary_size(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file=self._wire_vocabulary_file_name)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
@@ -3503,8 +3585,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
                                       dense_shape=inputs.dense_shape),
                                   id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3514,16 +3597,15 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     id_tensor = _transform_features({'aaa': inputs}, [column])[column]
     with _initialized_session():
-      _assert_sparse_tensor_value(self,
-                                  sparse_tensor.SparseTensorValue(
-                                      indices=inputs.indices,
-                                      values=np.array(
-                                          (2, -1, 0), dtype=np.int64),
-                                      dense_shape=inputs.dense_shape),
-                                  id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3540,8 +3622,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3559,8 +3642,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3580,8 +3664,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3601,11 +3686,12 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_small_vocabulary_size(self):
     # 'marlo' is the last entry in our vocabulary file, so be setting
     # `vocabulary_size` to 1 less than number of entries in file, we take
     # 'marlo' out of the vocabulary.
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size - 1)
@@ -3624,8 +3710,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3645,9 +3732,10 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3667,8 +3755,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=(3, 3)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3689,8 +3778,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_file(
+    wire_column = fc._categorical_column_with_vocabulary_file(
         key='wire',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3706,16 +3796,18 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_file(
+    wire_column = fc._categorical_column_with_vocabulary_file(
         key='wire',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3732,19 +3824,20 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
 
 class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_defaults_string(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column.key)
@@ -3756,11 +3849,11 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key=('aaa',), vocabulary_list=('omar', 'stringer', 'marlo'))
 
   def test_defaults_int(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36))
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column.key)
@@ -3770,17 +3863,21 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32,
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=(12, 24, 36),
+        dtype=dtypes.int32,
         default_value=-99)
     self.assertEqual(3, column._num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    original = fc.categorical_column_with_vocabulary_list(
+    original = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
@@ -3791,65 +3888,65 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
-      fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+      fc._categorical_column_with_vocabulary_list(
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.float32)
 
   def test_invalid_mapping_dtype(self):
     with self.assertRaisesRegexp(
         ValueError, r'vocabulary dtype must be string or integer'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12., 24., 36.))
 
   def test_mismatched_int_dtype(self):
     with self.assertRaisesRegexp(
         ValueError, r'dtype.*and vocabulary dtype.*do not match'):
-      fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+      fc._categorical_column_with_vocabulary_list(
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.int32)
 
   def test_mismatched_string_dtype(self):
     with self.assertRaisesRegexp(
         ValueError, r'dtype.*and vocabulary dtype.*do not match'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.string)
 
   def test_none_mapping(self):
     with self.assertRaisesRegexp(
         ValueError, r'vocabulary_list.*must be non-empty'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=None)
 
   def test_empty_mapping(self):
     with self.assertRaisesRegexp(
         ValueError, r'vocabulary_list.*must be non-empty'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=tuple([]))
 
   def test_duplicate_mapping(self):
     with self.assertRaisesRegexp(ValueError, 'Duplicate keys'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12, 24, 12))
 
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
-      fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=(12, 24, 36),
-          num_oov_buckets=-1)
+      fc._categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12, 24, 36), num_oov_buckets=-1)
 
   def test_invalid_buckets_and_default_value(self):
     with self.assertRaisesRegexp(
         ValueError, 'both num_oov_buckets and default_value'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa',
           vocabulary_list=(12, 24, 36),
           num_oov_buckets=100,
           default_value=2)
 
   def test_invalid_input_dtype_int32(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(12, 24, 36),
@@ -3858,9 +3955,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
   def test_invalid_input_dtype_string(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=(12, 24, 36))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(12, 24, 36))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
@@ -3868,8 +3964,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
+  @test_util.run_deprecated_v1
   def test_parse_example_string(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -3890,8 +3987,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_parse_example_int(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(11, 21, 31))
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -3912,10 +4010,10 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
@@ -3931,10 +4029,10 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
@@ -3946,13 +4044,11 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
@@ -3966,10 +4062,10 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     id_weight_pair = column._get_sparse_tensors(
         _LazyBuilder({
             'aaa': (('marlo', ''), ('skywalker', 'omar'))
@@ -3984,8 +4080,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         default_value=2)
@@ -4004,8 +4101,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=100)
@@ -4024,8 +4122,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
         dtype=dtypes.int32)
@@ -4044,9 +4143,10 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
         dtype=dtypes.int32,
@@ -4067,8 +4167,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=(3, 3)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
         dtype=dtypes.int32,
@@ -4088,8 +4189,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_list(
+    wire_column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=1)
@@ -4104,16 +4206,18 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_list(
+    wire_column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=1)
@@ -4129,19 +4233,20 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
 
 class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_constructor(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column.key)
     self.assertEqual('aaa', column._var_scope_name)
@@ -4152,10 +4257,11 @@ class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_identity(key=('aaa',), num_buckets=3)
+      fc._categorical_column_with_identity(key=('aaa',), num_buckets=3)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    original = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    original = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(3, column._num_buckets)
@@ -4165,24 +4271,24 @@ class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_invalid_num_buckets_zero(self):
     with self.assertRaisesRegexp(ValueError, 'num_buckets 0 < 1'):
-      fc.categorical_column_with_identity(key='aaa', num_buckets=0)
+      fc._categorical_column_with_identity(key='aaa', num_buckets=0)
 
   def test_invalid_num_buckets_negative(self):
     with self.assertRaisesRegexp(ValueError, 'num_buckets -1 < 1'):
-      fc.categorical_column_with_identity(key='aaa', num_buckets=-1)
+      fc._categorical_column_with_identity(key='aaa', num_buckets=-1)
 
   def test_invalid_default_value_too_small(self):
     with self.assertRaisesRegexp(ValueError, 'default_value -1 not in range'):
-      fc.categorical_column_with_identity(
+      fc._categorical_column_with_identity(
           key='aaa', num_buckets=3, default_value=-1)
 
   def test_invalid_default_value_too_big(self):
     with self.assertRaisesRegexp(ValueError, 'default_value 3 not in range'):
-      fc.categorical_column_with_identity(
+      fc._categorical_column_with_identity(
           key='aaa', num_buckets=3, default_value=3)
 
   def test_invalid_input_dtype(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
@@ -4190,8 +4296,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'Invalid input, not integer'):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=30)
+    a = fc._categorical_column_with_identity(key='aaa', num_buckets=30)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -4211,8 +4318,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
@@ -4228,8 +4336,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
@@ -4241,11 +4350,10 @@ class IdentityCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
@@ -4259,8 +4367,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     id_weight_pair = column._get_sparse_tensors(
         _LazyBuilder({
             'aaa': ((0, -1), (1, 0))
@@ -4275,8 +4384,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_inputs_too_small(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(1, -1, 0),
@@ -4288,8 +4398,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
           errors.OpError, 'assert_greater_or_equal_0'):
         id_weight_pair.id_tensor.eval()
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_inputs_too_big(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(1, 99, 0),
@@ -4301,8 +4412,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
           errors.OpError, 'assert_less_than_num_buckets'):
         id_weight_pair.id_tensor.eval()
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_default_value(self):
-    column = fc.categorical_column_with_identity(
+    column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
@@ -4319,8 +4431,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_default_value_and_placeholder_inputs(self):
-    column = fc.categorical_column_with_identity(
+    column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
     input_indices = array_ops.placeholder(dtype=dtypes.int64)
     input_values = array_ops.placeholder(dtype=dtypes.int32)
@@ -4344,8 +4457,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               input_shape: (2, 2),
           }))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column._num_buckets)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
@@ -4357,16 +4471,17 @@ class IdentityCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] = 1
         # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+        self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column._num_buckets)
     with ops.Graph().as_default():
       predictions = get_keras_linear_model_predictions({
@@ -4379,13 +4494,13 @@ class IdentityCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] = 1
         # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+        self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
 
 
 class TransformFeaturesTest(test.TestCase):
@@ -4393,9 +4508,9 @@ class TransformFeaturesTest(test.TestCase):
   # All transform tests are distributed in column test.
   # Here we only test multi column case and naming
   def transform_multi_column(self):
-    bucketized_price = fc.bucketized_column(
-        fc.numeric_column('price'), boundaries=[0, 2, 4, 6])
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    bucketized_price = fc._bucketized_column(
+        fc._numeric_column('price'), boundaries=[0, 2, 4, 6])
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     with ops.Graph().as_default():
       features = {
           'price': [[-1.], [5.]],
@@ -4452,32 +4567,33 @@ class TransformFeaturesTest(test.TestCase):
 class IndicatorColumnTest(test.TestCase):
 
   def test_indicator_column(self):
-    a = fc.categorical_column_with_hash_bucket('a', 4)
-    indicator_a = fc.indicator_column(a)
+    a = fc._categorical_column_with_hash_bucket('a', 4)
+    indicator_a = fc._indicator_column(a)
     self.assertEqual(indicator_a.categorical_column.name, 'a')
     self.assertEqual(indicator_a.name, 'a_indicator')
     self.assertEqual(indicator_a._var_scope_name, 'a_indicator')
     self.assertEqual(indicator_a._variable_shape, [1, 4])
 
-    b = fc.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
-    indicator_b = fc.indicator_column(b)
+    b = fc._categorical_column_with_hash_bucket('b', hash_bucket_size=100)
+    indicator_b = fc._indicator_column(b)
     self.assertEqual(indicator_b.categorical_column.name, 'b')
     self.assertEqual(indicator_b.name, 'b_indicator')
     self.assertEqual(indicator_b._var_scope_name, 'b_indicator')
     self.assertEqual(indicator_b._variable_shape, [1, 100])
 
   def test_1D_shape_succeeds(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_hash_bucket('animal', 4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_hash_bucket('animal', 4))
     builder = _LazyBuilder({'animal': ['fox', 'fox']})
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                          self.evaluate(output))
 
   def test_2D_shape_succeeds(self):
     # TODO(ispir/cassandrax): Swith to categorical_column_with_keys when ready.
-    animal = fc.indicator_column(
-        fc.categorical_column_with_hash_bucket('animal', 4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_hash_bucket('animal', 4))
     builder = _LazyBuilder({
         'animal':
             sparse_tensor.SparseTensor(
@@ -4487,11 +4603,12 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                          self.evaluate(output))
 
   def test_multi_hot(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
 
     builder = _LazyBuilder({
         'animal':
@@ -4500,11 +4617,11 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
+      self.assertAllEqual([[0., 2., 0., 0.]], self.evaluate(output))
 
   def test_multi_hot2(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     builder = _LazyBuilder({
         'animal':
             sparse_tensor.SparseTensor(
@@ -4512,20 +4629,22 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 1., 1., 0.]], self.evaluate(output))
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    a = fc.categorical_column_with_hash_bucket('a', 4)
-    column = fc.indicator_column(a)
+    a = fc._categorical_column_with_hash_bucket('a', 4)
+    column = fc._indicator_column(a)
     column_copy = copy.deepcopy(column)
     self.assertEqual(column_copy.categorical_column.name, 'a')
     self.assertEqual(column.name, 'a_indicator')
     self.assertEqual(column._variable_shape, [1, 4])
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_indicator = fc.indicator_column(a)
+    a_indicator = fc._indicator_column(a)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -4545,10 +4664,11 @@ class IndicatorColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_indicator = fc.indicator_column(a)
+    a_indicator = fc._indicator_column(a)
     features = {
         'aaa': sparse_tensor.SparseTensorValue(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -4557,51 +4677,56 @@ class IndicatorColumnTest(test.TestCase):
     }
     indicator_tensor = _transform_features(features, [a_indicator])[a_indicator]
     with _initialized_session():
-      self.assertAllEqual([[0, 0, 1], [1, 0, 0]], indicator_tensor.eval())
+      self.assertAllEqual([[0, 0, 1], [1, 0, 0]],
+                          self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_with_weighted_column(self):
     # Github issue 12557
-    ids = fc.categorical_column_with_vocabulary_list(
+    ids = fc._categorical_column_with_vocabulary_list(
         key='ids', vocabulary_list=('a', 'b', 'c'))
-    weights = fc.weighted_categorical_column(ids, 'weights')
-    indicator = fc.indicator_column(weights)
+    weights = fc._weighted_categorical_column(ids, 'weights')
+    indicator = fc._indicator_column(weights)
     features = {
         'ids': constant_op.constant([['c', 'b', 'a', 'c']]),
         'weights': constant_op.constant([[2., 4., 6., 1.]])
     }
     indicator_tensor = _transform_features(features, [indicator])[indicator]
     with _initialized_session():
-      self.assertAllEqual([[6., 4., 3.]], indicator_tensor.eval())
+      self.assertAllEqual([[6., 4., 3.]], self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_with_missing_value_in_weighted_column(self):
     # Github issue 12583
-    ids = fc.categorical_column_with_vocabulary_list(
+    ids = fc._categorical_column_with_vocabulary_list(
         key='ids', vocabulary_list=('a', 'b', 'c'))
-    weights = fc.weighted_categorical_column(ids, 'weights')
-    indicator = fc.indicator_column(weights)
+    weights = fc._weighted_categorical_column(ids, 'weights')
+    indicator = fc._indicator_column(weights)
     features = {
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
         'weights': constant_op.constant([[2., 4., 6.]])
     }
     indicator_tensor = _transform_features(features, [indicator])[indicator]
     with _initialized_session():
-      self.assertAllEqual([[0., 4., 2.]], indicator_tensor.eval())
+      self.assertAllEqual([[0., 4., 2.]], self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_with_missing_value_in_categorical_column(self):
     # Github issue 12583
-    ids = fc.categorical_column_with_vocabulary_list(
+    ids = fc._categorical_column_with_vocabulary_list(
         key='ids', vocabulary_list=('a', 'b', 'c'))
-    indicator = fc.indicator_column(ids)
+    indicator = fc._indicator_column(ids)
     features = {
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
     }
     indicator_tensor = _transform_features(features, [indicator])[indicator]
     with _initialized_session():
-      self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval())
+      self.assertAllEqual([[0., 1., 1.]], self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4613,14 +4738,15 @@ class IndicatorColumnTest(test.TestCase):
       weight_var = get_linear_model_column_var(animal)
       with _initialized_session():
         # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+        self.assertAllClose([[0.]], self.evaluate(predictions))
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+        self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4632,14 +4758,15 @@ class IndicatorColumnTest(test.TestCase):
       weight_var = get_linear_model_column_var(animal)
       with _initialized_session():
         # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+        self.assertAllClose([[0.]], self.evaluate(predictions))
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+        self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_input_layer(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4648,16 +4775,17 @@ class IndicatorColumnTest(test.TestCase):
       }
       net = fc.input_layer(features, [animal])
       with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+        self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
 
 
 class EmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
-    embedding_column = fc.embedding_column(
+    embedding_column = fc._embedding_column(
         categorical_column, dimension=embedding_dimension)
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
@@ -4674,15 +4802,20 @@ class EmbeddingColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
@@ -4698,15 +4831,20 @@ class EmbeddingColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
-    original = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+    original = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     for embedding_column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', embedding_column.categorical_column.name)
       self.assertEqual(3, embedding_column.categorical_column._num_buckets)
@@ -4727,16 +4865,19 @@ class EmbeddingColumnTest(test.TestCase):
           'aaa': parsing_ops.VarLenFeature(dtypes.int64)
       }, embedding_column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_invalid_initializer(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc.embedding_column(categorical_column, dimension=2, initializer='not_fn')
+      fc._embedding_column(
+          categorical_column, dimension=2, initializer='not_fn')
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded = fc.embedding_column(a, dimension=2)
+    a_embedded = fc._embedding_column(a, dimension=2)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -4756,9 +4897,10 @@ class EmbeddingColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
-    a_embedded = fc.embedding_column(a, dimension=2)
+    a = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
+    a_embedded = fc._embedding_column(a, dimension=2)
     features = {
         'aaa': sparse_tensor.SparseTensor(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -4769,9 +4911,10 @@ class EmbeddingColumnTest(test.TestCase):
     output_a = outputs[a]
     output_embedded = outputs[a_embedded]
     with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_embedded.eval())
+      _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                  self.evaluate(output_embedded))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
     # Inputs.
     vocabulary_size = 3
@@ -4810,10 +4953,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -4828,8 +4972,9 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_3d(self):
     # Inputs.
     vocabulary_size = 4
@@ -4870,10 +5015,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -4888,8 +5034,9 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_weight_collections(self):
     sparse_input = sparse_tensor.SparseTensorValue(
         # example 0, ids [2]
@@ -4901,9 +5048,9 @@ class EmbeddingColumnTest(test.TestCase):
         dense_shape=(4, 5))
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    embedding_column = fc.embedding_column(categorical_column, dimension=2)
+    embedding_column = fc._embedding_column(categorical_column, dimension=2)
 
     # Provide sparse input and get dense result.
     embedding_column._get_dense_tensor(
@@ -4919,6 +5066,7 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertItemsEqual(
         ('embedding_weights:0',), tuple([v.name for v in my_vars]))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -4957,10 +5105,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -4989,6 +5138,7 @@ class EmbeddingColumnTest(test.TestCase):
               input_shape: sparse_input.dense_shape,
           }))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_restore_from_ckpt(self):
     # Inputs.
     vocabulary_size = 3
@@ -5025,10 +5175,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         ckpt_to_load_from=ckpt_path,
         tensor_name_in_ckpt=ckpt_tensor)
 
@@ -5044,8 +5195,9 @@ class EmbeddingColumnTest(test.TestCase):
         ('embedding_weights:0',), tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     # Inputs.
     batch_size = 4
@@ -5070,10 +5222,11 @@ class EmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     with ops.Graph().as_default():
@@ -5100,11 +5253,13 @@ class EmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5119,8 +5274,10 @@ class EmbeddingColumnTest(test.TestCase):
         # example 3, ids [1], embedding[3] = [3, 5]
         # sum(embeddings * linear_weights)
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                            self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
     # Inputs.
     batch_size = 4
@@ -5146,9 +5303,9 @@ class EmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
+    embedding_column = fc._embedding_column(
         categorical_column,
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -5176,11 +5333,13 @@ class EmbeddingColumnTest(test.TestCase):
       linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+        self.assertAllClose(
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5195,8 +5354,10 @@ class EmbeddingColumnTest(test.TestCase):
         # example 3, ids [1], embedding[3] = [3, 5]
         # sum(embeddings * linear_weights)
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                            self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_input_layer(self):
     # Inputs.
     vocabulary_size = 3
@@ -5235,10 +5396,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -5255,8 +5417,9 @@ class EmbeddingColumnTest(test.TestCase):
         tuple([v.name for v in trainable_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, trainable_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(input_layer))
 
+  @test_util.run_deprecated_v1
   def test_input_layer_not_trainable(self):
     # Inputs.
     vocabulary_size = 3
@@ -5295,11 +5458,13 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        initializer=_initializer, trainable=False)
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer,
+        trainable=False)
 
     # Provide sparse input and get dense result.
     input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,))
@@ -5313,18 +5478,19 @@ class EmbeddingColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(input_layer))
 
 
 class SharedEmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc_new.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
@@ -5362,13 +5528,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -5413,13 +5580,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    original_a, _ = fc.shared_embedding_columns(
+    original_a, _ = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -5427,7 +5595,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
         shared_embedding_collection_name='shared_embedding_collection_name',
         ckpt_to_load_from='my_ckpt',
         tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        max_norm=42.,
+        trainable=False)
     for embedding_column_a in (original_a, copy.deepcopy(original_a)):
       self.assertEqual('aaa', embedding_column_a.categorical_column.name)
       self.assertEqual(3, embedding_column_a.categorical_column._num_buckets)
@@ -5450,55 +5619,60 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'aaa': parsing_ops.VarLenFeature(dtypes.int64)
       }, embedding_column_a._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_invalid_initializer(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc.shared_embedding_columns(
-          [categorical_column_a, categorical_column_b], dimension=2,
+      fc_new.shared_embedding_columns(
+          [categorical_column_a, categorical_column_b],
+          dimension=2,
           initializer='not_fn')
 
+  @test_util.run_deprecated_v1
   def test_incompatible_column_type(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    categorical_column_c = fc.categorical_column_with_hash_bucket(
+    categorical_column_c = fc._categorical_column_with_hash_bucket(
         key='ccc', hash_bucket_size=3)
     with self.assertRaisesRegexp(
         ValueError,
         'all categorical_columns must have the same type.*'
         '_IdentityCategoricalColumn.*_HashedCategoricalColumn'):
-      fc.shared_embedding_columns(
+      fc_new.shared_embedding_columns(
           [categorical_column_a, categorical_column_b, categorical_column_c],
           dimension=2)
 
+  @test_util.run_deprecated_v1
   def test_weighted_categorical_column_ok(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    weighted_categorical_column_a = fc.weighted_categorical_column(
+    weighted_categorical_column_a = fc._weighted_categorical_column(
         categorical_column_a, weight_feature_key='aaa_weights')
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    weighted_categorical_column_b = fc.weighted_categorical_column(
+    weighted_categorical_column_b = fc._weighted_categorical_column(
         categorical_column_b, weight_feature_key='bbb_weights')
-    fc.shared_embedding_columns(
+    fc_new.shared_embedding_columns(
         [weighted_categorical_column_a, categorical_column_b], dimension=2)
-    fc.shared_embedding_columns(
+    fc_new.shared_embedding_columns(
         [categorical_column_a, weighted_categorical_column_b], dimension=2)
-    fc.shared_embedding_columns(
+    fc_new.shared_embedding_columns(
         [weighted_categorical_column_a, weighted_categorical_column_b],
         dimension=2)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    b = fc.categorical_column_with_vocabulary_list(
+    b = fc._categorical_column_with_vocabulary_list(
         key='bbb', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded, b_embedded = fc.shared_embedding_columns(
-        [a, b], dimension=2)
+    a_embedded, b_embedded = fc_new.shared_embedding_columns([a, b],
+                                                             dimension=2)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -5529,11 +5703,12 @@ class SharedEmbeddingColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['bbb'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
-    b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
-    a_embedded, b_embedded = fc.shared_embedding_columns(
-        [a, b], dimension=2)
+    a = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
+    b = fc._categorical_column_with_identity(key='bbb', num_buckets=3)
+    a_embedded, b_embedded = fc_new.shared_embedding_columns([a, b],
+                                                             dimension=2)
     features = {
         'aaa': sparse_tensor.SparseTensor(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -5550,11 +5725,12 @@ class SharedEmbeddingColumnTest(test.TestCase):
     output_b = outputs[b]
     output_b_embedded = outputs[b_embedded]
     with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_a_embedded.eval())
-      _assert_sparse_tensor_value(
-          self, output_b.eval(), output_b_embedded.eval())
+      _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                  self.evaluate(output_a_embedded))
+      _assert_sparse_tensor_value(self, self.evaluate(output_b),
+                                  self.evaluate(output_b_embedded))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
     # Inputs.
     vocabulary_size = 3
@@ -5598,13 +5774,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a._get_dense_tensor(
@@ -5618,10 +5795,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     embedding_var = global_vars[0]
     with _initialized_session():
-      self.assertAllEqual(embedding_values, embedding_var.eval())
-      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
-      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+      self.assertAllEqual(embedding_values, self.evaluate(embedding_var))
+      self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a))
+      self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_weight_collections(self):
     # Inputs.
     vocabulary_size = 3
@@ -5651,11 +5829,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -5674,6 +5852,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
         tuple(v.name for v in my_vars))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -5712,13 +5891,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a._get_dense_tensor(
@@ -5729,6 +5909,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     with _initialized_session() as sess:
       sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     # Inputs.
     batch_size = 2
@@ -5752,13 +5933,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     with ops.Graph().as_default():
       predictions = fc.linear_model({
@@ -5790,13 +5972,15 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_bbb_shared_embedding_1/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5814,8 +5998,9 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # example 1, ids [], embedding[1] = 0, 0]
         # sum(embeddings * linear_weights)
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+        self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
     # Inputs.
     batch_size = 2
@@ -5842,11 +6027,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -5881,13 +6066,15 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_bbb_shared_embedding_1/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b))
+        self.assertAllClose(
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5905,7 +6092,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # example 1, ids [], embedding[1] = 0, 0]
         # sum(embeddings * linear_weights)
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+        self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions))
 
   def _test_input_layer(self, trainable=True):
     # Inputs.
@@ -5949,13 +6136,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer,
+        dimension=embedding_dimension,
+        initializer=_initializer,
         trainable=trainable)
 
     # Provide sparse input and get dense result.
@@ -5978,20 +6166,23 @@ class SharedEmbeddingColumnTest(test.TestCase):
     shared_embedding_vars = global_vars
     with _initialized_session():
       self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(input_layer))
 
+  @test_util.run_deprecated_v1
   def test_input_layer(self):
     self._test_input_layer()
 
+  @test_util.run_deprecated_v1
   def test_input_layer_no_trainable(self):
     self._test_input_layer(trainable=False)
 
 
 class WeightedCategoricalColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     self.assertEqual('ids_weighted_by_values', column.name)
@@ -6002,10 +6193,11 @@ class WeightedCategoricalColumnTest(test.TestCase):
         'values': parsing_ops.VarLenFeature(dtypes.float32)
     }, column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     """Tests deepcopy of categorical_column_with_hash_bucket."""
-    original = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    original = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     for column in (original, copy.deepcopy(original)):
@@ -6018,23 +6210,23 @@ class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_invalid_dtype_none(self):
     with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
-      fc.weighted_categorical_column(
-          categorical_column=fc.categorical_column_with_identity(
+      fc._weighted_categorical_column(
+          categorical_column=fc._categorical_column_with_identity(
               key='ids', num_buckets=3),
           weight_feature_key='values',
           dtype=None)
 
   def test_invalid_dtype_string(self):
     with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
-      fc.weighted_categorical_column(
-          categorical_column=fc.categorical_column_with_identity(
+      fc._weighted_categorical_column(
+          categorical_column=fc._categorical_column_with_identity(
               key='ids', num_buckets=3),
           weight_feature_key='values',
           dtype=dtypes.string)
 
   def test_invalid_input_dtype(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     strings = sparse_tensor.SparseTensorValue(
@@ -6046,14 +6238,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_column_name_collision(self):
     with self.assertRaisesRegexp(ValueError, r'Parse config.*already exists'):
-      fc.weighted_categorical_column(
-          categorical_column=fc.categorical_column_with_identity(
+      fc._weighted_categorical_column(
+          categorical_column=fc._categorical_column_with_identity(
               key='aaa', num_buckets=3),
           weight_feature_key='aaa')._parse_example_spec()
 
   def test_missing_weights(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
@@ -6064,10 +6256,12 @@ class WeightedCategoricalColumnTest(test.TestCase):
         ValueError, 'values is not in features dictionary'):
       _transform_features({'ids': inputs}, (column,))
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_weighted = fc.weighted_categorical_column(a, weight_feature_key='weights')
+    a_weighted = fc._weighted_categorical_column(
+        a, weight_feature_key='weights')
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -6098,9 +6292,10 @@ class WeightedCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['weights'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_features(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
@@ -6121,19 +6316,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=weights.indices,
               values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
+              dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_features_dense_input(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     weights = sparse_tensor.SparseTensorValue(
@@ -6150,19 +6344,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=((0, 0), (1, 0), (1, 1)),
               values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_tensor.eval())
+              dense_shape=(2, 2)), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=weights.indices,
               values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
+              dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_features_dense_weights(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
@@ -6179,19 +6372,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=((0, 0), (1, 0), (1, 1)),
               values=np.array((.5, 1., .1), dtype=np.float32),
-              dense_shape=(2, 2)),
-          weight_tensor.eval())
+              dense_shape=(2, 2)), self.evaluate(weight_tensor))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6210,18 +6402,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_keras_linear_model_mismatched_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6241,8 +6433,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
         }, (column,))
 
   def test_keras_linear_model_mismatched_dense_values(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6263,11 +6455,11 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_keras_linear_model_mismatched_dense_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6282,18 +6474,19 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6310,18 +6503,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_linear_model_mismatched_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6339,8 +6532,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
         }, (column,))
 
   def test_linear_model_mismatched_dense_values(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6361,11 +6554,11 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_linear_model_mismatched_dense_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6379,14 +6572,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   # TODO(ptucker): Add test with embedding of weighted categorical.
 
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index b79373c4753eb6b2a6b81309481a09456d7a0cd4..6308926494237f3546ddac0b893e4f6a23b116de 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -136,11 +136,16 @@ import six
 
 
 from tensorflow.python.eager import context
+from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.engine.base_layer import Layer
+# TODO(b/118385027): Dependency on keras can be problematic if Keras moves out
+# of the main repo.
+from tensorflow.python.keras import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -157,7 +162,16 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+_FEATURE_COLUMN_DEPRECATION_DATE = '2018-11-30'
+_FEATURE_COLUMN_DEPRECATION = ('The old _FeatureColumn APIs are being '
+                               'deprecated. Please use the new FeatureColumn '
+                               'APIs instead.')
 
 
 class StateManager(object):
@@ -175,6 +189,7 @@ class StateManager(object):
                       shape,
                       dtype=None,
                       trainable=True,
+                      use_resource=True,
                       initializer=None):
     """Creates a new variable.
 
@@ -184,12 +199,14 @@ class StateManager(object):
       shape: variable shape.
       dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
       trainable: Whether this variable is trainable or not.
+      use_resource: If true, we use resource variables. Otherwise we use
+        RefVariable.
       initializer: initializer instance (callable).
 
     Returns:
       The created variable.
     """
-    del feature_column, name, shape, dtype, trainable, initializer
+    del feature_column, name, shape, dtype, trainable, use_resource, initializer
     raise NotImplementedError('StateManager.create_variable')
 
   def add_variable(self, feature_column, var):
@@ -242,7 +259,7 @@ class StateManager(object):
 
 
 class _StateManagerImpl(StateManager):
-  """Manages the state of FeatureLayer and LinearModel."""
+  """Manages the state of DenseFeatures and LinearLayer."""
 
   def __init__(self, layer, trainable):
     """Creates an _StateManagerImpl object.
@@ -261,6 +278,7 @@ class _StateManagerImpl(StateManager):
                       shape,
                       dtype=None,
                       trainable=True,
+                      use_resource=True,
                       initializer=None):
     if name in self._cols_to_vars_map[feature_column]:
       raise ValueError('Variable already exists.')
@@ -271,7 +289,7 @@ class _StateManagerImpl(StateManager):
         dtype=dtype,
         initializer=initializer,
         trainable=self._trainable and trainable,
-        use_resource=True,
+        use_resource=use_resource,
         # TODO(rohanj): Get rid of this hack once we have a mechanism for
         # specifying a default partitioner for an entire layer. In that case,
         # the default getter for Layers should work.
@@ -285,7 +303,8 @@ class _StateManagerImpl(StateManager):
     raise ValueError('Variable does not exist.')
 
 
-class FeatureLayer(Layer):
+@tf_export('keras.layers.DenseFeatures', v1=[])
+class DenseFeatures(Layer):
   """A layer that produces a dense `Tensor` based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
@@ -301,7 +320,7 @@ class FeatureLayer(Layer):
   keywords_embedded = embedding_column(
       categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
   columns = [price, keywords_embedded, ...]
-  feature_layer = FeatureLayer(columns)
+  feature_layer = DenseFeatures(columns)
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   dense_tensor = feature_layer(features)
@@ -315,9 +334,8 @@ class FeatureLayer(Layer):
                feature_columns,
                trainable=True,
                name=None,
-               shared_state_manager=None,
                **kwargs):
-    """Constructs a FeatureLayer.
+    """Constructs a DenseFeatures.
 
     Args:
       feature_columns: An iterable containing the FeatureColumns to use as
@@ -328,47 +346,19 @@ class FeatureLayer(Layer):
         `indicator_column`.
       trainable: If `True` also add the variable to the graph collection
         `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-      name: Name to give to the FeatureLayer.
-      shared_state_manager: SharedEmbeddingStateManager that manages the state
-        of SharedEmbeddingColumns. The state of SharedEmbeddingColumns, unlike
-        regular embedding columns cannot be owned by the InputLayer itself since
-        SharedEmbeddingColumns can be shared across different InputLayers. As a
-        result users are expected to create a SharedEmbeddingStateManager object
-        which would be responsible for managing the shared state and can be
-        passed into different InputLayer objects to share state. For example,
-
-        ```python
-        sc_1, sc_2 = shared_embedding_column_v2(...)
-        sc_3, sc_4 = shared_embedding_column_v2(...)
-        ssm = SharedEmbeddingStateManager()
-        feature_layer1 = FeatureLayer([sc_1, sc_3], ...,
-                                      shared_state_manager=ssm)
-        feature_layer2 = FeatureLayer([sc_2, sc_4], ...,
-                                      shared_state_manager=ssm)
-        ```
-        now input_layer1 and input_layer2 will share variables across. If
-        sharing is not desired, one can create 2 separate
-        SharedEmbeddingStateManager objects
-
-        ```python
-        ssm1 = SharedEmbeddingStateManager()
-        ssm2 = SharedEmbeddingStateManager()
-        feature_layer1 = FeatureLayer([sc_1, sc_3], ...,
-                                      shared_state_manager=ssm1)
-        feature_layer2 = FeatureLayer([sc_2, sc_4], ...,
-                                      shared_state_manager=ssm2)
-        ```
+      name: Name to give to the DenseFeatures.
       **kwargs: Keyword arguments to construct a layer.
 
     Raises:
       ValueError: if an item in `feature_columns` is not a `DenseColumn`.
     """
-    super(FeatureLayer, self).__init__(name=name, trainable=trainable, **kwargs)
+    super(DenseFeatures, self).__init__(
+        name=name, trainable=trainable, **kwargs)
 
     self._feature_columns = _normalize_feature_columns(feature_columns)
+    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
     self._state_manager = _StateManagerImpl(self, self.trainable)
-    self._shared_state_manager = shared_state_manager
-    for column in sorted(self._feature_columns, key=lambda x: x.name):
+    for column in self._feature_columns:
       if not isinstance(column, DenseColumn):
         raise ValueError(
             'Items of feature_columns must be a DenseColumn. '
@@ -380,14 +370,11 @@ class FeatureLayer(Layer):
     return True
 
   def build(self, _):
-    for column in sorted(self._feature_columns, key=lambda x: x.name):
-      if isinstance(column, SharedEmbeddingColumn):
-        column.create_state(self._shared_state_manager)
-      else:
-        with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
-          with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
-            column.create_state(self._state_manager)
-      super(FeatureLayer, self).build(None)
+    for column in self._feature_columns:
+      with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
+        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
+          column.create_state(self._state_manager)
+      super(DenseFeatures, self).build(None)
 
   def call(self, features, cols_to_output_tensors=None):
     """Returns a dense tensor corresponding to the `feature_columns`.
@@ -414,15 +401,11 @@ class FeatureLayer(Layer):
     transformation_cache = FeatureTransformationCache(features)
     output_tensors = []
     ordered_columns = []
-    for column in sorted(self._feature_columns, key=lambda x: x.name):
+    for column in self._feature_columns:
       with ops.name_scope(column.name):
         ordered_columns.append(column)
-        if isinstance(column, SharedEmbeddingColumn):
-          tensor = column.get_dense_tensor(transformation_cache,
-                                           self._shared_state_manager)
-        else:
-          tensor = column.get_dense_tensor(transformation_cache,
-                                           self._state_manager)
+        tensor = column.get_dense_tensor(transformation_cache,
+                                         self._state_manager)
         num_elements = column.variable_shape.num_elements()
         batch_size = array_ops.shape(tensor)[0]
         tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
@@ -435,23 +418,115 @@ class FeatureLayer(Layer):
 
   def compute_output_shape(self, input_shape):
     total_elements = 0
-    for column in sorted(self._feature_columns, key=lambda x: x.name):
+    for column in self._feature_columns:
       total_elements += column.variable_shape.num_elements()
     return (input_shape[0], total_elements)
 
 
-def _strip_leading_slashes(name):
-  return name.rsplit('/', 1)[-1]
+class _LinearModelLayer(Layer):
+  """Layer that contains logic for `LinearModel`."""
+
+  def __init__(self,
+               feature_columns,
+               units=1,
+               sparse_combiner='sum',
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_LinearModelLayer, self).__init__(
+        name=name, trainable=trainable, **kwargs)
+
+    self._feature_columns = _normalize_feature_columns(feature_columns)
+    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
+    for column in self._feature_columns:
+      if not isinstance(column, (DenseColumn, CategoricalColumn)):
+        raise ValueError(
+            'Items of feature_columns must be either a '
+            'DenseColumn or CategoricalColumn. Given: {}'.format(column))
+
+    self._units = units
+    self._sparse_combiner = sparse_combiner
+
+    self._state_manager = _StateManagerImpl(self, self.trainable)
+    self.bias = None
+
+  def build(self, _):
+    # We need variable scopes for now because we want the variable partitioning
+    # information to percolate down. We also use _pure_variable_scope's here
+    # since we want to open up a name_scope in the `call` method while creating
+    # the ops.
+    with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
+      for column in self._feature_columns:
+        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
+          # Create the state for each feature column
+          column.create_state(self._state_manager)
+
+          # Create a weight variable for each column.
+          if isinstance(column, CategoricalColumn):
+            first_dim = column.num_buckets
+          else:
+            first_dim = column.variable_shape.num_elements()
+          self._state_manager.create_variable(
+              column,
+              name='weights',
+              dtype=dtypes.float32,
+              shape=(first_dim, self._units),
+              initializer=init_ops.zeros_initializer(),
+              trainable=self.trainable)
+
+      # Create a bias variable.
+      self.bias = self.add_variable(
+          name='bias_weights',
+          dtype=dtypes.float32,
+          shape=[self._units],
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable,
+          use_resource=True,
+          # TODO(rohanj): Get rid of this hack once we have a mechanism for
+          # specifying a default partitioner for an entire layer. In that case,
+          # the default getter for Layers should work.
+          getter=variable_scope.get_variable)
+
+    super(_LinearModelLayer, self).build(None)
+
+  def call(self, features):
+    if not isinstance(features, dict):
+      raise ValueError('We expected a dictionary here. Instead we got: {}'
+                       .format(features))
+    with ops.name_scope(self.name):
+      transformation_cache = FeatureTransformationCache(features)
+      weighted_sums = []
+      for column in self._feature_columns:
+        with ops.name_scope(column.name):
+          # All the weights used in the linear model are owned by the state
+          # manager associated with this Linear Model.
+          weight_var = self._state_manager.get_variable(column, 'weights')
+
+          weighted_sum = _create_weighted_sum(
+              column=column,
+              transformation_cache=transformation_cache,
+              state_manager=self._state_manager,
+              sparse_combiner=self._sparse_combiner,
+              weight_var=weight_var)
+          weighted_sums.append(weighted_sum)
+
+      _verify_static_batch_size_equality(weighted_sums, self._feature_columns)
+      predictions_no_bias = math_ops.add_n(
+          weighted_sums, name='weighted_sum_no_bias')
+      predictions = nn_ops.bias_add(
+          predictions_no_bias, self.bias, name='weighted_sum')
+      return predictions
 
 
-class LinearModel(Layer):
+@tf_export('keras.layers.LinearModel', v1=[])
+class LinearModel(training.Model):
   """Produces a linear prediction `Tensor` based on given `feature_columns`.
 
   This layer generates a weighted sum based on output dimension `units`.
   Weighted sum refers to logits in classification problems. It refers to the
   prediction itself for linear regression problems.
 
-  Note on supported columns: `LinearModel` treats categorical columns as
+  Note on supported columns: `LinearLayer` treats categorical columns as
   `indicator_column`s. To be specific, assume the input as `SparseTensor` looks
   like:
 
@@ -476,7 +551,7 @@ class LinearModel(Layer):
   keywords = categorical_column_with_hash_bucket("keywords", 10K)
   keywords_price = crossed_column('keywords', price_buckets, ...)
   columns = [price_buckets, keywords, keywords_price ...]
-  linear_model = LinearModel(columns)
+  linear_model = LinearLayer(columns)
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   prediction = linear_model(features)
@@ -489,9 +564,8 @@ class LinearModel(Layer):
                sparse_combiner='sum',
                trainable=True,
                name=None,
-               shared_state_manager=None,
                **kwargs):
-    """Constructs a LinearModel.
+    """Constructs a LinearLayer.
 
     Args:
       feature_columns: An iterable containing the FeatureColumns to use as
@@ -541,75 +615,21 @@ class LinearModel(Layer):
         `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
       name: Name to give to the Linear Model. All variables and ops created will
         be scoped by this name.
-      shared_state_manager: SharedEmbeddingStateManager that manages the state
-        of SharedEmbeddingColumns. For more info, look at `FeatureLayer`.
       **kwargs: Keyword arguments to construct a layer.
 
     Raises:
       ValueError: if an item in `feature_columns` is neither a `DenseColumn`
         nor `CategoricalColumn`.
     """
-    super(LinearModel, self).__init__(name=name, trainable=trainable, **kwargs)
-
-    self._feature_columns = _normalize_feature_columns(feature_columns)
-    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
-    for column in self._feature_columns:
-      if not isinstance(column, (DenseColumn, CategoricalColumn)):
-        raise ValueError(
-            'Items of feature_columns must be either a '
-            'DenseColumn or CategoricalColumn. Given: {}'.format(column))
-
-    self._units = units
-    self._sparse_combiner = sparse_combiner
-
-    self._state_manager = _StateManagerImpl(self, self.trainable)
-    self._shared_state_manager = shared_state_manager
-    self._bias_variable = None
-
-  def build(self, _):
-    # Create state for shared embedding columns.
-    for column in self._feature_columns:
-      if isinstance(column, SharedEmbeddingColumn):
-        column.create_state(self._shared_state_manager)
-
-    # We need variable scopes for now because we want the variable partitioning
-    # information to percolate down. We also use _pure_variable_scope's here
-    # since we want to open up a name_scope in the `call` method while creating
-    # the ops.
-    with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
-      for column in self._feature_columns:
-        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
-          # Create the state for each feature column
-          if not isinstance(column, SharedEmbeddingColumn):
-            column.create_state(self._state_manager)
-
-          # Create a weight variable for each column.
-          if isinstance(column, CategoricalColumn):
-            first_dim = column.num_buckets
-          else:
-            first_dim = column.variable_shape.num_elements()
-          self._state_manager.create_variable(
-              column,
-              name='weights',
-              dtype=dtypes.float32,
-              shape=(first_dim, self._units),
-              initializer=init_ops.zeros_initializer(),
-              trainable=self.trainable)
-
-      # Create a bias variable.
-      self._bias_variable = self.add_variable(
-          name='bias_weights',
-          dtype=dtypes.float32,
-          shape=[self._units],
-          initializer=init_ops.zeros_initializer(),
-          trainable=self.trainable,
-          use_resource=True,
-          # TODO(rohanj): Get rid of this hack once we have a mechanism for
-          # specifying a default partitioner for an entire layer. In that case,
-          # the default getter for Layers should work.
-          getter=variable_scope.get_variable)
 
-    super(LinearModel, self).build(None)
+    super(LinearModel, self).__init__(name=name, **kwargs)
+    self.layer = _LinearModelLayer(
+        feature_columns,
+        units,
+        sparse_combiner,
+        trainable,
+        name=self.name,
+        **kwargs)
 
   def call(self, features):
     """Returns a `Tensor` the represents the predictions of a linear model.
@@ -627,47 +647,14 @@ class LinearModel(Layer):
     Raises:
       ValueError: If features are not a dictionary.
     """
-    if not isinstance(features, dict):
-      raise ValueError('We expected a dictionary here. Instead we got: ',
-                       features)
-    with ops.name_scope(self.name):
-      transformation_cache = FeatureTransformationCache(features)
-      weighted_sums = []
-      for column in self._feature_columns:
-        with ops.name_scope(column.name):
-          # All the weights used in the linear model are owned by the state
-          # manager associated with this Linear Model.
-          weight_var = self._state_manager.get_variable(column, 'weights')
-
-          # The embedding weights for the SharedEmbeddingColumn are owned by
-          # the shared_state_manager and so we need to pass that in while
-          # creating the weighted sum. For all other columns, the state is owned
-          # by the Linear Model's state manager.
-          if isinstance(column, SharedEmbeddingColumn):
-            state_manager = self._shared_state_manager
-          else:
-            state_manager = self._state_manager
-          weighted_sum = _create_weighted_sum(
-              column=column,
-              transformation_cache=transformation_cache,
-              state_manager=state_manager,
-              sparse_combiner=self._sparse_combiner,
-              weight_var=weight_var)
-          weighted_sums.append(weighted_sum)
-
-      _verify_static_batch_size_equality(weighted_sums, self._feature_columns)
-      predictions_no_bias = math_ops.add_n(
-          weighted_sums, name='weighted_sum_no_bias')
-      predictions = nn_ops.bias_add(
-          predictions_no_bias, self._bias_variable, name='weighted_sum')
-      return predictions
+    return self.layer(features)
 
   @property
-  def bias_variable(self):
-    return self._bias_variable
+  def bias(self):
+    return self.layer.bias
 
 
-def _transform_features(features, feature_columns, state_manager):
+def _transform_features_v2(features, feature_columns, state_manager):
   """Returns transformed features based on features columns passed in.
 
   Please note that most probably you would not need to use this function. Please
@@ -712,7 +699,8 @@ def _transform_features(features, feature_columns, state_manager):
   return outputs
 
 
-def make_parse_example_spec(feature_columns):
+@tf_export('feature_column.make_parse_example_spec', v1=[])
+def make_parse_example_spec_v2(feature_columns):
   """Creates parsing spec dictionary from input feature_columns.
 
   The returned dictionary can be used as arg 'features' in `tf.parse_example`.
@@ -771,16 +759,21 @@ def make_parse_example_spec(feature_columns):
   return result
 
 
-def embedding_column(
-    categorical_column, dimension, combiner='mean', initializer=None,
-    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
-    trainable=True):
-  """`_DenseColumn` that converts from sparse, categorical input.
+@tf_export('feature_column.embedding_column')
+def embedding_column(categorical_column,
+                     dimension,
+                     combiner='mean',
+                     initializer=None,
+                     ckpt_to_load_from=None,
+                     tensor_name_in_ckpt=None,
+                     max_norm=None,
+                     trainable=True):
+  """`DenseColumn` that converts from sparse, categorical input.
 
   Use this when your inputs are sparse, but you want to convert them to a dense
   representation (e.g., to feed to a DNN).
 
-  Inputs must be a `_CategoricalColumn` created by any of the
+  Inputs must be a `CategoricalColumn` created by any of the
   `categorical_column_*` function. Here is an example of using
   `embedding_column` with `DNNClassifier`:
 
@@ -814,12 +807,12 @@ def embedding_column(
   ```
 
   Args:
-    categorical_column: A `_CategoricalColumn` created by a
+    categorical_column: A `CategoricalColumn` created by a
       `categorical_column_with_*` function. This column produces the sparse IDs
       that are inputs to the embedding lookup.
     dimension: An integer specifying dimension of the embedding, must be > 0.
-    combiner: A string specifying how to reduce if there are multiple entries
-      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+    combiner: A string specifying how to reduce if there are multiple entries in
+      a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
       'mean' the default. 'sqrtn' often achieves good accuracy, in particular
       with bag-of-words columns. Each of this can be thought as example level
       normalizations on the column. For more information, see
@@ -830,14 +823,14 @@ def embedding_column(
       `1/sqrt(dimension)`.
     ckpt_to_load_from: String representing checkpoint name/pattern from which to
       restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
-    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
-      which to restore the column weights. Required if `ckpt_to_load_from` is
-      not `None`.
+    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from which
+      to restore the column weights. Required if `ckpt_to_load_from` is not
+      `None`.
     max_norm: If not `None`, embedding values are l2-normalized to this value.
     trainable: Whether or not the embedding is trainable. Default is True.
 
   Returns:
-    `_DenseColumn` that converts from sparse input.
+    `DenseColumn` that converts from sparse input.
 
   Raises:
     ValueError: if `dimension` not > 0.
@@ -871,15 +864,16 @@ def embedding_column(
       trainable=trainable)
 
 
-def shared_embedding_columns_v2(categorical_columns,
-                                dimension,
-                                combiner='mean',
-                                initializer=None,
-                                shared_embedding_collection_name=None,
-                                ckpt_to_load_from=None,
-                                tensor_name_in_ckpt=None,
-                                max_norm=None,
-                                trainable=True):
+@tf_export(v1=['feature_column.shared_embedding_columns'])
+def shared_embedding_columns(categorical_columns,
+                             dimension,
+                             combiner='mean',
+                             initializer=None,
+                             shared_embedding_collection_name=None,
+                             ckpt_to_load_from=None,
+                             tensor_name_in_ckpt=None,
+                             max_norm=None,
+                             trainable=True):
   """List of dense columns that convert from sparse, categorical input.
 
   This is similar to `embedding_column`, except that it produces a list of
@@ -940,8 +934,8 @@ def shared_embedding_columns_v2(categorical_columns,
       categorical_column_with_vocabulary_file with the same vocabulary_file.
       Some or all columns could also be weighted_categorical_column.
     dimension: An integer specifying dimension of the embedding, must be > 0.
-    combiner: A string specifying how to reduce if there are multiple entries
-      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+    combiner: A string specifying how to reduce if there are multiple entries in
+      a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
       'mean' the default. 'sqrtn' often achieves good accuracy, in particular
       with bag-of-words columns. Each of this can be thought as example level
       normalizations on the column. For more information, see
@@ -950,16 +944,17 @@ def shared_embedding_columns_v2(categorical_columns,
       variable initialization. If not specified, defaults to
       `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
       `1/sqrt(dimension)`.
-    shared_embedding_collection_name: Optional collective name of these columns.
-      If not given, a reasonable name will be chosen based on the names of
-      `categorical_columns`.
+    shared_embedding_collection_name: Optional name of the collection where
+      shared embedding weights are added. If not given, a reasonable name will
+      be chosen based on the names of `categorical_columns`. This is also used
+      in `variable_scope` when creating shared embedding weights.
     ckpt_to_load_from: String representing checkpoint name/pattern from which to
       restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
-    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
-      which to restore the column weights. Required if `ckpt_to_load_from` is
-      not `None`.
-    max_norm: If not `None`, each embedding is clipped if its l2-norm is
-      larger than this value, before combining.
+    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from which
+      to restore the column weights. Required if `ckpt_to_load_from` is not
+      `None`.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
+      than this value, before combining.
     trainable: Whether or not the embedding is trainable. Default is True.
 
   Returns:
@@ -996,15 +991,17 @@ def shared_embedding_columns_v2(categorical_columns,
   sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
 
   c0 = sorted_columns[0]
-  num_buckets = c0.num_buckets
-  if not isinstance(c0, CategoricalColumn):
+  num_buckets = c0._num_buckets  # pylint: disable=protected-access
+  if not isinstance(c0, fc_old._CategoricalColumn):  # pylint: disable=protected-access
     raise ValueError(
-        'All categorical_columns must be subclasses of CategoricalColumn. '
+        'All categorical_columns must be subclasses of _CategoricalColumn. '
         'Given: {}, of type: {}'.format(c0, type(c0)))
-  if isinstance(c0, WeightedCategoricalColumn):
+  if isinstance(c0,
+                (fc_old._WeightedCategoricalColumn, WeightedCategoricalColumn)):  # pylint: disable=protected-access
     c0 = c0.categorical_column
   for c in sorted_columns[1:]:
-    if isinstance(c, WeightedCategoricalColumn):
+    if isinstance(
+        c, (fc_old._WeightedCategoricalColumn, WeightedCategoricalColumn)):  # pylint: disable=protected-access
       c = c.categorical_column
     if not isinstance(c, type(c0)):
       raise ValueError(
@@ -1012,12 +1009,12 @@ def shared_embedding_columns_v2(categorical_columns,
           'the same type, or be weighted_categorical_column of the same type. '
           'Given column: {} of type: {} does not match given column: {} of '
           'type: {}'.format(c0, type(c0), c, type(c)))
-    if num_buckets != c.num_buckets:
+    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
       raise ValueError(
           'To use shared_embedding_column, all categorical_columns must have '
           'the same number of buckets. Given column: {} with buckets: {} does  '
           'not match column: {} with buckets: {}'.format(
-              c0, num_buckets, c, c.num_buckets))
+              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
 
   if not shared_embedding_collection_name:
     shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
@@ -1026,7 +1023,7 @@ def shared_embedding_columns_v2(categorical_columns,
   result = []
   for column in categorical_columns:
     result.append(
-        SharedEmbeddingColumn(
+        fc_old._SharedEmbeddingColumn(  # pylint: disable=protected-access
             categorical_column=column,
             initializer=initializer,
             dimension=dimension,
@@ -1040,26 +1037,193 @@ def shared_embedding_columns_v2(categorical_columns,
   return result
 
 
-def numeric_column(key,
-                   shape=(1,),
-                   default_value=None,
-                   dtype=dtypes.float32,
-                   normalizer_fn=None):
-  """Represents real valued or numerical features.
+@tf_export('feature_column.shared_embedding_columns', v1=[])
+def shared_embedding_columns_v2(categorical_columns,
+                                dimension,
+                                combiner='mean',
+                                initializer=None,
+                                shared_embedding_collection_name=None,
+                                ckpt_to_load_from=None,
+                                tensor_name_in_ckpt=None,
+                                max_norm=None,
+                                trainable=True):
+  """List of dense columns that convert from sparse, categorical input.
 
-  Example:
+  This is similar to `embedding_column`, except that it produces a list of
+  embedding columns that share the same embedding weights.
+
+  Use this when your inputs are sparse and of the same type (e.g. watched and
+  impression video IDs that share the same vocabulary), and you want to convert
+  them to a dense representation (e.g., to feed to a DNN).
+
+  Inputs must be a list of categorical columns created by any of the
+  `categorical_column_*` function. They must all be of the same type and have
+  the same arguments except `key`. E.g. they can be
+  categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
+  all columns could also be weighted_categorical_column.
+
+  Here is an example embedding of two features for a DNNClassifier model:
 
   ```python
-  price = numeric_column('price')
-  columns = [price, ...]
-  features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  dense_tensor = input_layer(features, columns)
+  watched_video_id = categorical_column_with_vocabulary_file(
+      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
+  impression_video_id = categorical_column_with_vocabulary_file(
+      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
+  columns = shared_embedding_columns(
+      [watched_video_id, impression_video_id], dimension=10)
 
-  # or
-  bucketized_price = bucketized_column(price, boundaries=[...])
-  columns = [bucketized_price, ...]
-  features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  linear_prediction = linear_model(features, columns)
+  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
+
+  label_column = ...
+  def input_fn():
+    features = tf.parse_example(
+        ..., features=make_parse_example_spec(columns + [label_column]))
+    labels = features.pop(label_column.name)
+    return features, labels
+
+  estimator.train(input_fn=input_fn, steps=100)
+  ```
+
+  Here is an example using `shared_embedding_columns` with model_fn:
+
+  ```python
+  def model_fn(features, ...):
+    watched_video_id = categorical_column_with_vocabulary_file(
+        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
+    impression_video_id = categorical_column_with_vocabulary_file(
+        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
+    columns = shared_embedding_columns(
+        [watched_video_id, impression_video_id], dimension=10)
+    dense_tensor = input_layer(features, columns)
+    # Form DNN layers, calculate loss, and return EstimatorSpec.
+    ...
+  ```
+
+  Args:
+    categorical_columns: List of categorical columns created by a
+      `categorical_column_with_*` function. These columns produce the sparse IDs
+      that are inputs to the embedding lookup. All columns must be of the same
+      type and have the same arguments except `key`. E.g. they can be
+      categorical_column_with_vocabulary_file with the same vocabulary_file.
+      Some or all columns could also be weighted_categorical_column.
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries
+      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
+      with bag-of-words columns. Each of this can be thought as example level
+      normalizations on the column. For more information, see
+      `tf.embedding_lookup_sparse`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+    shared_embedding_collection_name: Optional collective name of these columns.
+      If not given, a reasonable name will be chosen based on the names of
+      `categorical_columns`.
+    ckpt_to_load_from: String representing checkpoint name/pattern from which to
+      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
+    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
+      which to restore the column weights. Required if `ckpt_to_load_from` is
+      not `None`.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is
+      larger than this value, before combining.
+    trainable: Whether or not the embedding is trainable. Default is True.
+
+  Returns:
+    A list of dense columns that converts from sparse input. The order of
+    results follows the ordering of `categorical_columns`.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if any of the given `categorical_columns` is of different type
+      or has different arguments than the others.
+    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
+      is specified.
+    ValueError: if `initializer` is specified and is not callable.
+    RuntimeError: if eager execution is enabled.
+  """
+  if context.executing_eagerly():
+    raise RuntimeError('shared_embedding_columns are not supported when eager '
+                       'execution is enabled.')
+
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
+    raise ValueError('Must specify both `ckpt_to_load_from` and '
+                     '`tensor_name_in_ckpt` or none of them.')
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified.')
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1. / math.sqrt(dimension))
+
+  # Sort the columns so the default collection name is deterministic even if the
+  # user passes columns from an unsorted collection, such as dict.values().
+  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
+
+  c0 = sorted_columns[0]
+  num_buckets = c0.num_buckets
+  if not isinstance(c0, CategoricalColumn):
+    raise ValueError(
+        'All categorical_columns must be subclasses of CategoricalColumn. '
+        'Given: {}, of type: {}'.format(c0, type(c0)))
+  if isinstance(c0, WeightedCategoricalColumn):
+    c0 = c0.categorical_column
+  for c in sorted_columns[1:]:
+    if isinstance(c, WeightedCategoricalColumn):
+      c = c.categorical_column
+    if not isinstance(c, type(c0)):
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same type, or be weighted_categorical_column of the same type. '
+          'Given column: {} of type: {} does not match given column: {} of '
+          'type: {}'.format(c0, type(c0), c, type(c)))
+    if num_buckets != c.num_buckets:
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same number of buckets. Given column: {} with buckets: {} does  '
+          'not match column: {} with buckets: {}'.format(
+              c0, num_buckets, c, c.num_buckets))
+
+  if not shared_embedding_collection_name:
+    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
+    shared_embedding_collection_name += '_shared_embedding'
+
+  column_creator = SharedEmbeddingColumnCreator(
+      dimension, initializer, ckpt_to_load_from, tensor_name_in_ckpt,
+      num_buckets, trainable, shared_embedding_collection_name)
+
+  result = []
+  for column in categorical_columns:
+    result.append(
+        column_creator(
+            categorical_column=column, combiner=combiner, max_norm=max_norm))
+
+  return result
+
+
+@tf_export('feature_column.numeric_column')
+def numeric_column(key,
+                   shape=(1,),
+                   default_value=None,
+                   dtype=dtypes.float32,
+                   normalizer_fn=None):
+  """Represents real valued or numerical features.
+
+  Example:
+
+  ```python
+  price = numeric_column('price')
+  columns = [price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+
+  # or
+  bucketized_price = bucketized_column(price, boundaries=[...])
+  columns = [bucketized_price, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
   ```
 
   Args:
@@ -1115,6 +1279,7 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
+@tf_export('feature_column.bucketized_column')
 def bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
@@ -1181,7 +1346,7 @@ def bucketized_column(source_column, boundaries):
       one-dimensional.
     ValueError: If `boundaries` is not a sorted list or tuple.
   """
-  if not isinstance(source_column, NumericColumn):
+  if not isinstance(source_column, (NumericColumn, fc_old._NumericColumn)):  # pylint: disable=protected-access
     raise ValueError(
         'source_column must be a column generated with numeric_column(). '
         'Given: {}'.format(source_column))
@@ -1211,6 +1376,7 @@ def _assert_key_is_string(key):
             type(key), key))
 
 
+@tf_export('feature_column.categorical_column_with_hash_bucket')
 def categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
@@ -1269,6 +1435,7 @@ def categorical_column_with_hash_bucket(key,
   return HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
+@tf_export(v1=['feature_column.categorical_column_with_vocabulary_file'])
 def categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
@@ -1346,6 +1513,97 @@ def categorical_column_with_vocabulary_file(key,
   Returns:
     A `CategoricalColumn` with a vocabulary file.
 
+  Raises:
+    ValueError: `vocabulary_file` is missing or cannot be opened.
+    ValueError: `vocabulary_size` is missing or < 1.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  return categorical_column_with_vocabulary_file_v2(
+      key, vocabulary_file, vocabulary_size,
+      dtype, default_value,
+      num_oov_buckets)
+
+
+@tf_export('feature_column.categorical_column_with_vocabulary_file', v1=[])
+def categorical_column_with_vocabulary_file_v2(key,
+                                               vocabulary_file,
+                                               vocabulary_size=None,
+                                               dtype=dtypes.string,
+                                               default_value=None,
+                                               num_oov_buckets=0):
+  """A `CategoricalColumn` with a vocabulary file.
+
+  Use this when your inputs are in string or integer format, and you have a
+  vocabulary file that maps each value to an integer ID. By default,
+  out-of-vocabulary values are ignored. Use either (but not both) of
+  `num_oov_buckets` and `default_value` to specify how to include
+  out-of-vocabulary values.
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string, which will be dropped by this feature column.
+
+  Example with `num_oov_buckets`:
+  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
+  abbreviation. All inputs with values in that file are assigned an ID 0-49,
+  corresponding to its line number. All other values are hashed and assigned an
+  ID 50-54.
+
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
+      num_oov_buckets=5)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Example with `default_value`:
+  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
+  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
+  in input, and other values missing from the file, will be assigned ID 0. All
+  others are assigned the corresponding line number 1-50.
+
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
+      default_value=0)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  And to make an embedding with either:
+
+  ```python
+  columns = [embedding_column(states, 3),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    vocabulary_file: The vocabulary file name.
+    vocabulary_size: Number of the elements in the vocabulary. This must be no
+      greater than length of `vocabulary_file`, if less than length, later
+      values are ignored. If None, it is set to the length of `vocabulary_file`.
+    dtype: The type of features. Only string and integer types are supported.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
+      the input value. A positive `num_oov_buckets` can not be specified with
+      `default_value`.
+
+  Returns:
+    A `CategoricalColumn` with a vocabulary file.
+
   Raises:
     ValueError: `vocabulary_file` is missing or cannot be opened.
     ValueError: `vocabulary_size` is missing or < 1.
@@ -1388,9 +1646,13 @@ def categorical_column_with_vocabulary_file(key,
       dtype=dtype)
 
 
-def categorical_column_with_vocabulary_list(
-    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
-  """A `_CategoricalColumn` with in-memory vocabulary.
+@tf_export('feature_column.categorical_column_with_vocabulary_list')
+def categorical_column_with_vocabulary_list(key,
+                                            vocabulary_list,
+                                            dtype=None,
+                                            default_value=-1,
+                                            num_oov_buckets=0):
+  """A `CategoricalColumn` with in-memory vocabulary.
 
   Use this when your inputs are in string or integer format, and you have an
   in-memory vocabulary mapping each value to an integer ID. By default,
@@ -1439,14 +1701,14 @@ def categorical_column_with_vocabulary_list(
   ```
 
   Args:
-    key: A unique string identifying the input feature. It is used as the
-      column name and the dictionary key for feature parsing configs, feature
-      `Tensor` objects, and feature columns.
+    key: A unique string identifying the input feature. It is used as the column
+      name and the dictionary key for feature parsing configs, feature `Tensor`
+      objects, and feature columns.
     vocabulary_list: An ordered iterable defining the vocabulary. Each feature
       is mapped to the index of its value (if present) in `vocabulary_list`.
       Must be castable to `dtype`.
-    dtype: The type of features. Only string and integer types are supported.
-      If `None`, it will be inferred from `vocabulary_list`.
+    dtype: The type of features. Only string and integer types are supported. If
+      `None`, it will be inferred from `vocabulary_list`.
     default_value: The integer ID value to return for out-of-vocabulary feature
       values, defaults to `-1`. This can not be specified with a positive
       `num_oov_buckets`.
@@ -1501,6 +1763,7 @@ def categorical_column_with_vocabulary_list(
       num_oov_buckets=num_oov_buckets)
 
 
+@tf_export('feature_column.categorical_column_with_identity')
 def categorical_column_with_identity(key, num_buckets, default_value=None):
   """A `CategoricalColumn` that returns identity values.
 
@@ -1568,6 +1831,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
       key=key, number_buckets=num_buckets, default_value=default_value)
 
 
+@tf_export('feature_column.indicator_column')
 def indicator_column(categorical_column):
   """Represents multi-hot representation of given categorical column.
 
@@ -1602,9 +1866,11 @@ def indicator_column(categorical_column):
   return IndicatorColumn(categorical_column)
 
 
-def weighted_categorical_column(
-    categorical_column, weight_feature_key, dtype=dtypes.float32):
-  """Applies weight values to a `_CategoricalColumn`.
+@tf_export('feature_column.weighted_categorical_column')
+def weighted_categorical_column(categorical_column,
+                                weight_feature_key,
+                                dtype=dtypes.float32):
+  """Applies weight values to a `CategoricalColumn`.
 
   Use this when each of your sparse inputs has both an ID and a value. For
   example, if you're representing text documents as a collection of word
@@ -1655,7 +1921,7 @@ def weighted_categorical_column(
   the same indices and dense shape.
 
   Args:
-    categorical_column: A `_CategoricalColumn` created by
+    categorical_column: A `CategoricalColumn` created by
       `categorical_column_with_*` functions.
     weight_feature_key: String key for weight values.
     dtype: Type of weights, such as `tf.float32`. Only float and integer weights
@@ -1676,6 +1942,7 @@ def weighted_categorical_column(
       dtype=dtype)
 
 
+@tf_export('feature_column.crossed_column')
 def crossed_column(keys, hash_bucket_size, hash_key=None):
   """Returns a column for performing crosses of categorical features.
 
@@ -1788,12 +2055,13 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
         'keys must be a list with length > 1. Given: {}'.format(keys))
   for key in keys:
     if (not isinstance(key, six.string_types) and
-        not isinstance(key, CategoricalColumn)):
+        not isinstance(key, (CategoricalColumn, fc_old._CategoricalColumn))):  # pylint: disable=protected-access
       raise ValueError(
           'Unsupported key type. All keys must be either string, or '
           'categorical column except HashedCategoricalColumn. '
           'Given: {}'.format(key))
-    if isinstance(key, HashedCategoricalColumn):
+    if isinstance(key,
+                  (HashedCategoricalColumn, fc_old._HashedCategoricalColumn)):  # pylint: disable=protected-access
       raise ValueError(
           'categorical_column_with_hash_bucket is not supported for crossing. '
           'Hashing before crossing will increase probability of collision. '
@@ -1802,6 +2070,7 @@ def crossed_column(keys, hash_bucket_size, hash_key=None):
       keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key)
 
 
+@six.add_metaclass(abc.ABCMeta)
 class FeatureColumn(object):
   """Represents a feature column abstraction.
 
@@ -1817,7 +2086,6 @@ class FeatureColumn(object):
 
   This class is an abstract class. Users should not create instances of this.
   """
-  __metaclass__ = abc.ABCMeta
 
   @abc.abstractproperty
   def name(self):
@@ -1882,6 +2150,111 @@ class FeatureColumn(object):
     """
     pass
 
+  @abc.abstractproperty
+  def _is_v2_column(self):
+    """Returns whether this FeatureColumn is fully conformant to the new API.
+
+    This is needed for composition type cases where an EmbeddingColumn etc.
+    might take in old categorical columns as input and then we want to use the
+    old API.
+    """
+    pass
+
+  @abc.abstractproperty
+  def parents(self):
+    """Returns a list of immediate raw feature and FeatureColumn dependencies.
+
+    For example:
+    # For the following feature columns
+    a = numeric_column('f1')
+    c = crossed_column(a, 'f2')
+    # The expected parents are:
+    a.parents = ['f1']
+    c.parents = [a, 'f2']
+    """
+    pass
+
+  @abc.abstractmethod
+  def _get_config(self):
+    """Returns the config of the feature column.
+
+    A FeatureColumn config is a Python dictionary (serializable) containing the
+    configuration of a FeatureColumn. The same FeatureColumn can be
+    reinstantiated later from this configuration.
+
+    The config of a feature column does not include information about feature
+    columns depending on it nor the FeatureColumn class name.
+
+    Example with (de)serialization practices followed in this file:
+    ```python
+    class SerializationExampleFeatureColumn(
+        FeatureColumn, collections.namedtuple(
+            'SerializationExampleFeatureColumn',
+            ('dimension', 'parent', 'dtype', 'normalizer_fn'))):
+
+      def _get_config(self):
+        # Create a dict from the namedtuple.
+        # Python attribute literals can be directly copied from / to the config.
+        # For example 'dimension', assuming it is an integer literal.
+        config = dict(zip(self._fields, self))
+
+        # (De)serialization of parent FeatureColumns should use the provided
+        # (de)serialize_feature_column() methods that take care of de-duping.
+        config['parent'] = serialize_feature_column(self.parent)
+
+        # Many objects provide custom (de)serialization e.g: for tf.DType
+        # tf.DType.name, tf.as_dtype() can be used.
+        config['dtype'] = self.dtype.name
+
+        # Non-trivial dependencies should be Keras-(de)serializable.
+        config['normalizer_fn'] = utils.serialize_keras_object(
+            self.normalizer_fn)
+
+        return config
+
+      @classmethod
+      def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+        # This should do the inverse transform from `_get_config` and construct
+        # the namedtuple.
+        kwargs = config.copy()
+        kwargs['parent'] = deserialize_feature_column(
+            config['parent'], custom_objects, columns_by_name)
+        kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+        kwargs['normalizer_fn'] = utils.deserialize_keras_object(
+          config['normalizer_fn'], custom_objects=custom_objects)
+        return cls(**kwargs)
+
+    ```
+    Returns:
+      A serializable Dict that can be used to deserialize the object with
+      from_config.
+    """
+    pass
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """Creates a FeatureColumn from its config.
+
+    This method should be the reverse of `_get_config`, capable of instantiating
+    the same FeatureColumn from the config dictionary. See `_get_config` for an
+    example of common (de)serialization practices followed in this file.
+
+    TODO(b/118939620): This is a private method until consensus is reached on
+    supporting object deserialization deduping within Keras.
+
+    Args:
+      config: A Dict config acquired with `_get_config`.
+      custom_objects: Optional dictionary mapping names (strings) to custom
+        classes or functions to be considered during deserialization.
+      columns_by_name: A Dict[String, FeatureColumn] of existing columns in
+        order to avoid duplication. Should be passed to any calls to
+        deserialize_feature_column().
+
+    Returns:
+      A FeatureColumn for the input config.
+    """
+    pass
+
 
 class DenseColumn(FeatureColumn):
   """Represents a column which can be represented as `Tensor`.
@@ -1890,8 +2263,6 @@ class DenseColumn(FeatureColumn):
   indicator_column.
   """
 
-  __metaclass__ = abc.ABCMeta
-
   @abc.abstractproperty
   def variable_shape(self):
     """`TensorShape` of `get_dense_tensor`, without batch dimension."""
@@ -1927,6 +2298,8 @@ def is_feature_column_v2(feature_columns):
   for feature_column in feature_columns:
     if not isinstance(feature_column, FeatureColumn):
       return False
+    if not feature_column._is_v2_column:  # pylint: disable=protected-access
+      return False
   return True
 
 
@@ -1963,7 +2336,6 @@ class CategoricalColumn(FeatureColumn):
 
   A categorical feature typically handled with a `tf.SparseTensor` of IDs.
   """
-  __metaclass__ = abc.ABCMeta
 
   IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
       'IdWeightPair', ('id_tensor', 'weight_tensor'))
@@ -2036,7 +2408,7 @@ def _create_categorical_column_weighted_sum(
     weight_tensor = sparse_ops.sparse_reshape(
         weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
-  return _safe_embedding_lookup_sparse(
+  return embedding_ops.safe_embedding_lookup_sparse(
       weight_var,
       id_tensor,
       sparse_weights=weight_tensor,
@@ -2047,8 +2419,6 @@ def _create_categorical_column_weighted_sum(
 class SequenceDenseColumn(FeatureColumn):
   """Represents dense sequence data."""
 
-  __metaclass__ = abc.ABCMeta
-
   TensorSequenceLengthPair = collections.namedtuple(  # pylint: disable=invalid-name
       'TensorSequenceLengthPair', ('dense_tensor', 'sequence_length'))
 
@@ -2201,19 +2571,6 @@ class FeatureTransformationCache(object):
           lambda: feature_tensor)
 
 
-# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
-def _shape_offsets(shape):
-  """Returns moving offset for each dimension given shape."""
-  offsets = []
-  for dim in reversed(shape):
-    if offsets:
-      offsets.append(dim * offsets[-1])
-    else:
-      offsets.append(dim)
-  offsets.reverse()
-  return offsets
-
-
 # TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
 def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
   """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
@@ -2306,14 +2663,19 @@ def _normalize_feature_columns(feature_columns):
 
 class NumericColumn(
     DenseColumn,
+    fc_old._DenseColumn,  # pylint: disable=protected-access
     collections.namedtuple(
         'NumericColumn',
         ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))):
   """see `numeric_column`."""
 
   @property
-  def name(self):
-    """See `FeatureColumn` base class."""
+  def _is_v2_column(self):
+    return True
+
+  @property
+  def name(self):
+    """See `FeatureColumn` base class."""
     return self.key
 
   @property
@@ -2325,6 +2687,27 @@ class NumericColumn(
                                         self.default_value)
     }
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.parse_example_spec
+
+  def _transform_input_tensor(self, input_tensor):
+    if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
+      raise ValueError(
+          'The corresponding Tensor of numerical column must be a Tensor. '
+          'SparseTensor is not supported. key: {}'.format(self.key))
+    if self.normalizer_fn is not None:
+      input_tensor = self.normalizer_fn(input_tensor)
+    return math_ops.to_float(input_tensor)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    input_tensor = inputs.get(self.key)
+    return self._transform_input_tensor(input_tensor)
+
   def transform_feature(self, transformation_cache, state_manager):
     """See `FeatureColumn` base class.
 
@@ -2342,19 +2725,19 @@ class NumericColumn(
       ValueError: If a SparseTensor is passed in.
     """
     input_tensor = transformation_cache.get(self.key, state_manager)
-    if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
-      raise ValueError(
-          'The corresponding Tensor of numerical column must be a Tensor. '
-          'SparseTensor is not supported. key: {}'.format(self.key))
-    if self.normalizer_fn is not None:
-      input_tensor = self.normalizer_fn(input_tensor)
-    return math_ops.to_float(input_tensor)
+    return self._transform_input_tensor(input_tensor)
 
   @property
   def variable_shape(self):
     """See `DenseColumn` base class."""
     return tensor_shape.TensorShape(self.shape)
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _variable_shape(self):
+    return self.variable_shape
+
   def get_dense_tensor(self, transformation_cache, state_manager):
     """Returns dense `Tensor` representing numeric feature.
 
@@ -2371,12 +2754,54 @@ class NumericColumn(
     # representation created by _transform_feature.
     return transformation_cache.get(self, state_manager)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    return inputs.get(self)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['normalizer_fn'] = utils.serialize_keras_object(self.normalizer_fn)
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    # TODO(b/118820158): Simplify if deserialize_keras_object supports None.
+    if config['normalizer_fn']:
+      kwargs['normalizer_fn'] = utils.deserialize_keras_object(
+          config['normalizer_fn'], custom_objects=custom_objects)
+    else:
+      kwargs['normalizer_fn'] = None
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
-class BucketizedColumn(DenseColumn, CategoricalColumn,
-                       collections.namedtuple('BucketizedColumn',
-                                              ('source_column', 'boundaries'))):
+class BucketizedColumn(
+    DenseColumn,
+    CategoricalColumn,
+    fc_old._DenseColumn,  # pylint: disable=protected-access
+    fc_old._CategoricalColumn,  # pylint: disable=protected-access
+    collections.namedtuple('BucketizedColumn',
+                           ('source_column', 'boundaries'))):
   """See `bucketized_column`."""
 
+  @property
+  def _is_v2_column(self):
+    return (isinstance(self.source_column, FeatureColumn) and
+            self.source_column._is_v2_column)  # pylint: disable=protected-access
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
@@ -2387,6 +2812,21 @@ class BucketizedColumn(DenseColumn, CategoricalColumn,
     """See `FeatureColumn` base class."""
     return self.source_column.parse_example_spec
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.source_column._parse_example_spec  # pylint: disable=protected-access
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    """Returns bucketized categorical `source_column` tensor."""
+    source_tensor = inputs.get(self.source_column)
+    return math_ops._bucketize(  # pylint: disable=protected-access
+        source_tensor,
+        boundaries=self.boundaries)
+
   def transform_feature(self, transformation_cache, state_manager):
     """Returns bucketized categorical `source_column` tensor."""
     source_tensor = transformation_cache.get(self.source_column, state_manager)
@@ -2400,24 +2840,45 @@ class BucketizedColumn(DenseColumn, CategoricalColumn,
     return tensor_shape.TensorShape(
         tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
 
-  def get_dense_tensor(self, transformation_cache, state_manager):
-    """Returns one hot encoded dense `Tensor`."""
-    input_tensor = transformation_cache.get(self, state_manager)
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _variable_shape(self):
+    return self.variable_shape
+
+  def _get_dense_tensor_for_input_tensor(self, input_tensor):
     return array_ops.one_hot(
         indices=math_ops.to_int64(input_tensor),
         depth=len(self.boundaries) + 1,
         on_value=1.,
         off_value=0.)
 
+  def get_dense_tensor(self, transformation_cache, state_manager):
+    """Returns one hot encoded dense `Tensor`."""
+    input_tensor = transformation_cache.get(self, state_manager)
+    return self._get_dense_tensor_for_input_tensor(input_tensor)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    input_tensor = inputs.get(self)
+    return self._get_dense_tensor_for_input_tensor(input_tensor)
+
   @property
   def num_buckets(self):
     """See `CategoricalColumn` base class."""
     # By construction, source_column is always one-dimensional.
     return (len(self.boundaries) + 1) * self.source_column.shape[0]
 
-  def get_sparse_tensors(self, transformation_cache, state_manager):
-    """Converts dense inputs to SparseTensor so downstream code can use it."""
-    input_tensor = transformation_cache.get(self, state_manager)
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _num_buckets(self):
+    return self.num_buckets
+
+  def _get_sparse_tensors_for_input_tensor(self, input_tensor):
     batch_size = array_ops.shape(input_tensor)[0]
     # By construction, source_column is always one-dimensional.
     source_dimension = self.source_column.shape[0]
@@ -2443,15 +2904,58 @@ class BucketizedColumn(DenseColumn, CategoricalColumn,
         dense_shape=dense_shape)
     return CategoricalColumn.IdWeightPair(sparse_tensor, None)
 
+  def get_sparse_tensors(self, transformation_cache, state_manager):
+    """Converts dense inputs to SparseTensor so downstream code can use it."""
+    input_tensor = transformation_cache.get(self, state_manager)
+    return self._get_sparse_tensors_for_input_tensor(input_tensor)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    """Converts dense inputs to SparseTensor so downstream code can use it."""
+    del weight_collections
+    del trainable
+    input_tensor = inputs.get(self)
+    return self._get_sparse_tensors_for_input_tensor(input_tensor)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.source_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['source_column'] = serialize_feature_column(self.source_column)
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['source_column'] = deserialize_feature_column(
+        config['source_column'], custom_objects, columns_by_name)
+    return cls(**kwargs)
+
 
 class EmbeddingColumn(
-    DenseColumn, SequenceDenseColumn,
+    DenseColumn,
+    SequenceDenseColumn,
+    fc_old._DenseColumn,  # pylint: disable=protected-access
+    fc_old._SequenceDenseColumn,  # pylint: disable=protected-access
     collections.namedtuple(
         'EmbeddingColumn',
         ('categorical_column', 'dimension', 'combiner', 'initializer',
          'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
   """See `embedding_column`."""
 
+  @property
+  def _is_v2_column(self):
+    return (isinstance(self.categorical_column, FeatureColumn) and
+            self.categorical_column._is_v2_column)  # pylint: disable=protected-access
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
@@ -2462,37 +2966,50 @@ class EmbeddingColumn(
     """See `FeatureColumn` base class."""
     return self.categorical_column.parse_example_spec
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
   def transform_feature(self, transformation_cache, state_manager):
     """Transforms underlying `categorical_column`."""
     return transformation_cache.get(self.categorical_column, state_manager)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    return inputs.get(self.categorical_column)
+
   @property
   def variable_shape(self):
     """See `DenseColumn` base class."""
     return tensor_shape.vector(self.dimension)
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _variable_shape(self):
+    return self.variable_shape
+
   def create_state(self, state_manager):
     """Creates the embedding lookup variable."""
-    embedding_shape = (self.categorical_column.num_buckets, self.dimension)
+    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
     state_manager.create_variable(
         self,
         name='embedding_weights',
         shape=embedding_shape,
         dtype=dtypes.float32,
         trainable=self.trainable,
+        # TODO(rohanj): Make this True when b/118500434 is fixed.
+        use_resource=False,
         initializer=self.initializer)
 
-  def _get_dense_tensor_internal(self, transformation_cache, state_manager):
-    """Private method that follows the signature of _get_dense_tensor."""
-    # Get sparse IDs and weights.
-    sparse_tensors = self.categorical_column.get_sparse_tensors(
-        transformation_cache, state_manager)
+  def _get_dense_tensor_internal_helper(self, sparse_tensors,
+                                        embedding_weights):
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
-    embedding_weights = state_manager.get_variable(
-        self, name='embedding_weights')
-
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
       if isinstance(to_restore, variables.PartitionedVariable):
@@ -2502,7 +3019,7 @@ class EmbeddingColumn(
       })
 
     # Return embedding lookup result.
-    return _safe_embedding_lookup_sparse(
+    return embedding_ops.safe_embedding_lookup_sparse(
         embedding_weights=embedding_weights,
         sparse_ids=sparse_ids,
         sparse_weights=sparse_weights,
@@ -2510,6 +3027,30 @@ class EmbeddingColumn(
         name='%s_weights' % self.name,
         max_norm=self.max_norm)
 
+  def _get_dense_tensor_internal(self, sparse_tensors, state_manager):
+    """Private method that follows the signature of get_dense_tensor."""
+    embedding_weights = state_manager.get_variable(
+        self, name='embedding_weights')
+    return self._get_dense_tensor_internal_helper(sparse_tensors,
+                                                  embedding_weights)
+
+  def _old_get_dense_tensor_internal(self, sparse_tensors, weight_collections,
+                                     trainable):
+    """Private method that follows the signature of _get_dense_tensor."""
+    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
+    if (weight_collections and
+        ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections):
+      weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+    embedding_weights = variable_scope.get_variable(
+        name='embedding_weights',
+        shape=embedding_shape,
+        dtype=dtypes.float32,
+        initializer=self.initializer,
+        trainable=self.trainable and trainable,
+        collections=weight_collections)
+    return self._get_dense_tensor_internal_helper(sparse_tensors,
+                                                  embedding_weights)
+
   def get_dense_tensor(self, transformation_cache, state_manager):
     """Returns tensor after doing the embedding lookup.
 
@@ -2535,7 +3076,30 @@ class EmbeddingColumn(
           'sequence_input_layer instead of input_layer. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
-    return self._get_dense_tensor_internal(transformation_cache, state_manager)
+    # Get sparse IDs and weights.
+    sparse_tensors = self.categorical_column.get_sparse_tensors(
+        transformation_cache, state_manager)
+    return self._get_dense_tensor_internal(sparse_tensors, state_manager)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    if isinstance(
+        self.categorical_column,
+        (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)):  # pylint: disable=protected-access
+      raise ValueError(
+          'In embedding_column: {}. '
+          'categorical_column must not be of type _SequenceCategoricalColumn. '
+          'Suggested fix A: If you wish to use input_layer, use a '
+          'non-sequence categorical_column_with_*. '
+          'Suggested fix B: If you wish to create sequence input, use '
+          'sequence_input_layer instead of input_layer. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
+    sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
+        inputs, weight_collections, trainable)
+    return self._old_get_dense_tensor_internal(sparse_tensors,
+                                               weight_collections, trainable)
 
   def get_sequence_dense_tensor(self, transformation_cache, state_manager):
     """See `SequenceDenseColumn` base class."""
@@ -2547,143 +3111,169 @@ class EmbeddingColumn(
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
-    dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
-        transformation_cache, state_manager)
-    sparse_tensors = self.categorical_column.get_sparse_tensors(
+    sparse_tensors = self.categorical_column.get_sequence_sparse_tensors(
         transformation_cache, state_manager)
-    sequence_length = _sequence_length_from_sparse_tensor(
+    dense_tensor = self._get_dense_tensor_internal(sparse_tensors,
+                                                   state_manager)
+    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sequence_dense_tensor(self,
+                                 inputs,
+                                 weight_collections=None,
+                                 trainable=None):
+    if not isinstance(
+        self.categorical_column,
+        (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)):  # pylint: disable=protected-access
+      raise ValueError(
+          'In embedding_column: {}. '
+          'categorical_column must be of type _SequenceCategoricalColumn '
+          'to use sequence_input_layer. '
+          'Suggested fix: Use one of sequence_categorical_column_with_*. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
+    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+    dense_tensor = self._old_get_dense_tensor_internal(
+        sparse_tensors,
+        weight_collections=weight_collections,
+        trainable=trainable)
+    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+        sparse_tensors.id_tensor)
+    return SequenceDenseColumn.TensorSequenceLengthPair(
+        dense_tensor=dense_tensor, sequence_length=sequence_length)
 
-def _get_graph_for_variable(var):
-  if isinstance(var, variables.PartitionedVariable):
-    return list(var)[0].graph
-  else:
-    return var.graph
-
-
-class SharedEmbeddingStateManager(Layer):
-  """A state manager that handle the state of shared embedding columns.
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['categorical_column'] = serialize_feature_column(
+        self.categorical_column)
+    config['initializer'] = utils.serialize_keras_object(self.initializer)
+    return config
 
-  This can handle multiple sets of columns that share variables."""
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['categorical_column'] = deserialize_feature_column(
+        config['categorical_column'], custom_objects, columns_by_name)
+    # TODO(b/118820158): Simplify if deserialize_keras_object supports None.
+    if config['initializer']:
+      kwargs['initializer'] = utils.deserialize_keras_object(
+          config['initializer'], custom_objects=custom_objects)
+    else:
+      kwargs['initializer'] = None
+    return cls(**kwargs)
 
-  def __init__(self, trainable=True, name=None, **kwargs):
-    """Constructs a `SharedEmbeddingStateManager`.
 
-    Args:
-      trainable: If true, variables created are trainable.
-      name: Name of the State Manager.
-      **kwargs: Keyword arguments.
-    """
-    super(SharedEmbeddingStateManager, self).__init__(
-        name=name, trainable=trainable, **kwargs)
-    self._var_dict = {}
+def _raise_shared_embedding_column_error():
+  raise ValueError('SharedEmbeddingColumns are not supported in '
+                   '`linear_model` or `input_layer`. Please use '
+                   '`DenseFeatures` or `LinearModel` instead.')
 
-  def create_variable(self,
-                      name,
-                      shape,
-                      dtype=None,
-                      trainable=True,
-                      initializer=None):
-    """Creates a variable.
 
-    Makes sure only one var is created per `shared_collection_name`. `name` is
-    ignored here as the variable is named `shared_collection_name` instead.
+class SharedEmbeddingColumnCreator(tracking.Checkpointable):
 
-    Args:
-      name: Name of the variable. Not used.
-      shape: Variable shape.
-      dtype: Variable type.
-      trainable: If variable created should be trainable or not.
-      initializer: Variable initializer.
+  def __init__(self,
+               dimension,
+               initializer,
+               ckpt_to_load_from,
+               tensor_name_in_ckpt,
+               num_buckets,
+               trainable,
+               name='shared_embedding_column_creator'):
+    self._dimension = dimension
+    self._initializer = initializer
+    self._ckpt_to_load_from = ckpt_to_load_from
+    self._tensor_name_in_ckpt = tensor_name_in_ckpt
+    self._num_buckets = num_buckets
+    self._trainable = trainable
+    self._name = name
+    # Map from graph keys to embedding_weight variables.
+    self._embedding_weights = {}
 
-    Returns:
-      A variable or partitioned variable.
-    """
-    if name in self._var_dict:
-      var = self._var_dict[name]
-      return var
-    with variable_scope.variable_scope(
-        self.name, reuse=variable_scope.AUTO_REUSE):
-      var = self.add_variable(
-          name=name,
-          shape=shape,
-          dtype=dtype,
-          trainable=self.trainable and trainable,
-          initializer=initializer,
-          use_resource=True,
-          # TODO(rohanj): Get rid of this hack once we have a mechanism for
-          # specifying a default partitioner for an entire layer. In that case,
-          # the default getter for Layers should work.
-          getter=variable_scope.get_variable)
-    self._var_dict[name] = var
-    return var
+  def __call__(self, categorical_column, combiner, max_norm):
+    return SharedEmbeddingColumn(categorical_column, self, combiner, max_norm)
 
-  def get_variable(self, feature_column, name):
-    if name not in self._var_dict:
-      raise ValueError('Variable name: {} not recognized.'.format(name))
-    return self._var_dict[name]
+  @property
+  def embedding_weights(self):
+    key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+    if key not in self._embedding_weights:
+      embedding_shape = (self._num_buckets, self._dimension)
+      var = variable_scope.get_variable(
+          name=self._name,
+          shape=embedding_shape,
+          dtype=dtypes.float32,
+          initializer=self._initializer,
+          trainable=self._trainable)
 
+      if self._ckpt_to_load_from is not None:
+        to_restore = var
+        if isinstance(to_restore, variables.PartitionedVariable):
+          to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
+        checkpoint_utils.init_from_checkpoint(
+            self._ckpt_to_load_from, {self._tensor_name_in_ckpt: to_restore})
+      self._embedding_weights[key] = var
+    return self._embedding_weights[key]
 
-def maybe_create_shared_state_manager(feature_columns):
-  if is_feature_column_v2(feature_columns):
-    return SharedEmbeddingStateManager()
-  return None
+  @property
+  def dimension(self):
+    return self._dimension
 
 
 class SharedEmbeddingColumn(
-    DenseColumn, SequenceDenseColumn,
+    DenseColumn,
+    SequenceDenseColumn,
+    fc_old._DenseColumn,  # pylint: disable=protected-access
+    fc_old._SequenceDenseColumn,  # pylint: disable=protected-access
     collections.namedtuple(
         'SharedEmbeddingColumn',
-        ('categorical_column', 'dimension', 'combiner', 'initializer',
-         'shared_embedding_collection_name', 'ckpt_to_load_from',
-         'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
+        ('categorical_column', 'shared_embedding_column_creator', 'combiner',
+         'max_norm'))):
   """See `embedding_column`."""
 
+  @property
+  def _is_v2_column(self):
+    return True
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
     return '{}_shared_embedding'.format(self.categorical_column.name)
 
-  @property
-  def shared_collection_name(self):
-    """Returns the shared name of this column.
-
-    A group of columns share an embedding. Each one of those columns would have
-    the same `shared_collection_name` by which they could be collectively
-    referred to.
-    """
-    return self.shared_embedding_collection_name
-
   @property
   def parse_example_spec(self):
     """See `FeatureColumn` base class."""
     return self.categorical_column.parse_example_spec
 
+  @property
+  def _parse_example_spec(self):
+    return _raise_shared_embedding_column_error()
+
   def transform_feature(self, transformation_cache, state_manager):
     """See `FeatureColumn` base class."""
     return transformation_cache.get(self.categorical_column, state_manager)
 
+  def _transform_feature(self, inputs):
+    return _raise_shared_embedding_column_error()
+
   @property
   def variable_shape(self):
     """See `DenseColumn` base class."""
-    return tensor_shape.vector(self.dimension)
+    return tensor_shape.vector(self.shared_embedding_column_creator.dimension)
 
-  def create_state(self, state_manager):
-    """Creates the shared embedding lookup variable."""
-    if not isinstance(state_manager, SharedEmbeddingStateManager):
-      raise ValueError('Expected state_manager to be of type '
-                       'SharedEmbeddingStateManager. Obtained type: {}'.format(
-                           type(state_manager)))
-    embedding_shape = (self.categorical_column.num_buckets, self.dimension)
-    state_manager.create_variable(
-        name=self.shared_collection_name,
-        shape=embedding_shape,
-        dtype=dtypes.float32,
-        trainable=self.trainable,
-        initializer=self.initializer)
+  @property
+  def _variable_shape(self):
+    return _raise_shared_embedding_column_error()
 
   def _get_dense_tensor_internal(self, transformation_cache, state_manager):
     """Private method that follows the signature of _get_dense_tensor."""
@@ -2697,19 +3287,10 @@ class SharedEmbeddingColumn(
       sparse_ids = sparse_tensors.id_tensor
       sparse_weights = sparse_tensors.weight_tensor
 
-      embedding_weights = state_manager.get_variable(
-          self, name=self.shared_collection_name)
-
-      if self.ckpt_to_load_from is not None:
-        to_restore = embedding_weights
-        if isinstance(to_restore, variables.PartitionedVariable):
-          to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
-        checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
-            self.tensor_name_in_ckpt: to_restore
-        })
+      embedding_weights = self.shared_embedding_column_creator.embedding_weights
 
       # Return embedding lookup result.
-      return _safe_embedding_lookup_sparse(
+      return embedding_ops.safe_embedding_lookup_sparse(
           embedding_weights=embedding_weights,
           sparse_ids=sparse_ids,
           sparse_weights=sparse_weights,
@@ -2731,6 +3312,9 @@ class SharedEmbeddingColumn(
                                        self.categorical_column))
     return self._get_dense_tensor_internal(transformation_cache, state_manager)
 
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    return _raise_shared_embedding_column_error()
+
   def get_sequence_dense_tensor(self, transformation_cache, state_manager):
     """See `SequenceDenseColumn` base class."""
     if not isinstance(self.categorical_column, SequenceCategoricalColumn):
@@ -2745,11 +3329,31 @@ class SharedEmbeddingColumn(
                                                   state_manager)
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
-    sequence_length = _sequence_length_from_sparse_tensor(
+    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
 
+  def _get_sequence_dense_tensor(self,
+                                 inputs,
+                                 weight_collections=None,
+                                 trainable=None):
+    return _raise_shared_embedding_column_error()
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    raise NotImplementedError()
+
 
 def _create_tuple(shape, value):
   """Returns a tuple with given shape and filled with value."""
@@ -2858,10 +3462,15 @@ def _check_default_value(shape, default_value, dtype, key):
 
 class HashedCategoricalColumn(
     CategoricalColumn,
+    fc_old._CategoricalColumn,  # pylint: disable=protected-access
     collections.namedtuple('HashedCategoricalColumn',
                            ('key', 'hash_bucket_size', 'dtype'))):
   """see `categorical_column_with_hash_bucket`."""
 
+  @property
+  def _is_v2_column(self):
+    return True
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
@@ -2872,10 +3481,14 @@ class HashedCategoricalColumn(
     """See `FeatureColumn` base class."""
     return {self.key: parsing_ops.VarLenFeature(self.dtype)}
 
-  def transform_feature(self, transformation_cache, state_manager):
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.parse_example_spec
+
+  def _transform_input_tensor(self, input_tensor):
     """Hashes the values in the feature_column."""
-    input_tensor = _to_sparse_input_and_drop_ignore_values(
-        transformation_cache.get(self.key, state_manager))
     if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
       raise ValueError('SparseColumn input must be a SparseTensor.')
 
@@ -2899,24 +3512,74 @@ class HashedCategoricalColumn(
     return sparse_tensor_lib.SparseTensor(
         input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
 
+  def transform_feature(self, transformation_cache, state_manager):
+    """Hashes the values in the feature_column."""
+    input_tensor = _to_sparse_input_and_drop_ignore_values(
+        transformation_cache.get(self.key, state_manager))
+    return self._transform_input_tensor(input_tensor)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
+    return self._transform_input_tensor(input_tensor)
+
   @property
   def num_buckets(self):
     """Returns number of buckets in this sparse feature."""
     return self.hash_bucket_size
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _num_buckets(self):
+    return self.num_buckets
+
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
     return CategoricalColumn.IdWeightPair(
         transformation_cache.get(self, state_manager), None)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    return CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
 class VocabularyFileCategoricalColumn(
     CategoricalColumn,
+    fc_old._CategoricalColumn,  # pylint: disable=protected-access
     collections.namedtuple('VocabularyFileCategoricalColumn',
                            ('key', 'vocabulary_file', 'vocabulary_size',
                             'num_oov_buckets', 'dtype', 'default_value'))):
   """See `categorical_column_with_vocabulary_file`."""
 
+  @property
+  def _is_v2_column(self):
+    return True
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
@@ -2927,11 +3590,14 @@ class VocabularyFileCategoricalColumn(
     """See `FeatureColumn` base class."""
     return {self.key: parsing_ops.VarLenFeature(self.dtype)}
 
-  def transform_feature(self, transformation_cache, state_manager):
-    """Creates a lookup table for the vocabulary."""
-    input_tensor = _to_sparse_input_and_drop_ignore_values(
-        transformation_cache.get(self.key, state_manager))
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.parse_example_spec
 
+  def _transform_input_tensor(self, input_tensor):
+    """Creates a lookup table for the vocabulary."""
     if self.dtype.is_integer != input_tensor.dtype.is_integer:
       raise ValueError(
           'Column dtype and SparseTensors dtype must be compatible. '
@@ -2957,25 +3623,75 @@ class VocabularyFileCategoricalColumn(
         key_dtype=key_dtype,
         name='{}_lookup'.format(self.key)).lookup(input_tensor)
 
+  def transform_feature(self, transformation_cache, state_manager):
+    """Creates a lookup table for the vocabulary."""
+    input_tensor = _to_sparse_input_and_drop_ignore_values(
+        transformation_cache.get(self.key, state_manager))
+    return self._transform_input_tensor(input_tensor)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
+    return self._transform_input_tensor(input_tensor)
+
   @property
   def num_buckets(self):
     """Returns number of buckets in this sparse feature."""
     return self.vocabulary_size + self.num_oov_buckets
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _num_buckets(self):
+    return self.num_buckets
+
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
     return CategoricalColumn.IdWeightPair(
         transformation_cache.get(self, state_manager), None)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    return CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
 class VocabularyListCategoricalColumn(
     CategoricalColumn,
+    fc_old._CategoricalColumn,  # pylint: disable=protected-access
     collections.namedtuple(
         'VocabularyListCategoricalColumn',
         ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
 ):
   """See `categorical_column_with_vocabulary_list`."""
 
+  @property
+  def _is_v2_column(self):
+    return True
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
@@ -2986,11 +3702,14 @@ class VocabularyListCategoricalColumn(
     """See `FeatureColumn` base class."""
     return {self.key: parsing_ops.VarLenFeature(self.dtype)}
 
-  def transform_feature(self, transformation_cache, state_manager):
-    """Creates a lookup table for the vocabulary list."""
-    input_tensor = _to_sparse_input_and_drop_ignore_values(
-        transformation_cache.get(self.key, state_manager))
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.parse_example_spec
 
+  def _transform_input_tensor(self, input_tensor):
+    """Creates a lookup table for the vocabulary list."""
     if self.dtype.is_integer != input_tensor.dtype.is_integer:
       raise ValueError(
           'Column dtype and SparseTensors dtype must be compatible. '
@@ -3015,24 +3734,74 @@ class VocabularyListCategoricalColumn(
         dtype=key_dtype,
         name='{}_lookup'.format(self.key)).lookup(input_tensor)
 
+  def transform_feature(self, transformation_cache, state_manager):
+    """Creates a lookup table for the vocabulary list."""
+    input_tensor = _to_sparse_input_and_drop_ignore_values(
+        transformation_cache.get(self.key, state_manager))
+    return self._transform_input_tensor(input_tensor)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
+    return self._transform_input_tensor(input_tensor)
+
   @property
   def num_buckets(self):
     """Returns number of buckets in this sparse feature."""
     return len(self.vocabulary_list) + self.num_oov_buckets
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _num_buckets(self):
+    return self.num_buckets
+
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
     return CategoricalColumn.IdWeightPair(
         transformation_cache.get(self, state_manager), None)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    return CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
 class IdentityCategoricalColumn(
     CategoricalColumn,
+    fc_old._CategoricalColumn,  # pylint: disable=protected-access
     collections.namedtuple('IdentityCategoricalColumn',
                            ('key', 'number_buckets', 'default_value'))):
 
   """See `categorical_column_with_identity`."""
 
+  @property
+  def _is_v2_column(self):
+    return True
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
@@ -3043,11 +3812,14 @@ class IdentityCategoricalColumn(
     """See `FeatureColumn` base class."""
     return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}
 
-  def transform_feature(self, transformation_cache, state_manager):
-    """Returns a SparseTensor with identity values."""
-    input_tensor = _to_sparse_input_and_drop_ignore_values(
-        transformation_cache.get(self.key, state_manager))
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.parse_example_spec
 
+  def _transform_input_tensor(self, input_tensor):
+    """Returns a SparseTensor with identity values."""
     if not input_tensor.dtype.is_integer:
       raise ValueError(
           'Invalid input, not integer. key: {} dtype: {}'.format(
@@ -3082,24 +3854,71 @@ class IdentityCategoricalColumn(
         values=values,
         dense_shape=input_tensor.dense_shape)
 
+  def transform_feature(self, transformation_cache, state_manager):
+    """Returns a SparseTensor with identity values."""
+    input_tensor = _to_sparse_input_and_drop_ignore_values(
+        transformation_cache.get(self.key, state_manager))
+    return self._transform_input_tensor(input_tensor)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
+    return self._transform_input_tensor(input_tensor)
+
   @property
   def num_buckets(self):
     """Returns number of buckets in this sparse feature."""
     return self.number_buckets
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _num_buckets(self):
+    return self.num_buckets
+
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
     return CategoricalColumn.IdWeightPair(
         transformation_cache.get(self, state_manager), None)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    return CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    return dict(zip(self._fields, self))
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    return cls(**config)
+
 
 class WeightedCategoricalColumn(
     CategoricalColumn,
+    fc_old._CategoricalColumn,  # pylint: disable=protected-access
     collections.namedtuple(
         'WeightedCategoricalColumn',
         ('categorical_column', 'weight_feature_key', 'dtype'))):
   """See `weighted_categorical_column`."""
 
+  @property
+  def _is_v2_column(self):
+    return (isinstance(self.categorical_column, FeatureColumn) and
+            self.categorical_column._is_v2_column)  # pylint: disable=protected-access
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
@@ -3116,15 +3935,29 @@ class WeightedCategoricalColumn(
     config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
     return config
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+    if self.weight_feature_key in config:
+      raise ValueError('Parse config {} already exists for {}.'.format(
+          config[self.weight_feature_key], self.weight_feature_key))
+    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
+    return config
+
   @property
   def num_buckets(self):
     """See `DenseColumn` base class."""
     return self.categorical_column.num_buckets
 
-  def transform_feature(self, transformation_cache, state_manager):
-    """Applies weights to tensor generated from `categorical_column`'."""
-    weight_tensor = transformation_cache.get(self.weight_feature_key,
-                                             state_manager)
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _num_buckets(self):
+    return self.categorical_column._num_buckets  # pylint: disable=protected-access
+
+  def _transform_weight_tensor(self, weight_tensor):
     if weight_tensor is None:
       raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
     weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
@@ -3138,27 +3971,92 @@ class WeightedCategoricalColumn(
           weight_tensor, ignore_value=0.0)
     if not weight_tensor.dtype.is_floating:
       weight_tensor = math_ops.to_float(weight_tensor)
+    return weight_tensor
+
+  def transform_feature(self, transformation_cache, state_manager):
+    """Applies weights to tensor generated from `categorical_column`'."""
+    print('WeightedCategoricalColumn.transform_feature: ', self.name)
+    print('Weight feature key: ', self.weight_feature_key)
+    weight_tensor = transformation_cache.get(self.weight_feature_key,
+                                             state_manager)
+    print('Weight tensor before: ', weight_tensor)
+    weight_tensor = self._transform_weight_tensor(weight_tensor)
+    print('Weight tensor after: ', weight_tensor)
     return (transformation_cache.get(self.categorical_column, state_manager),
             weight_tensor)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    """Applies weights to tensor generated from `categorical_column`'."""
+    weight_tensor = inputs.get(self.weight_feature_key)
+    weight_tensor = self._transform_weight_tensor(weight_tensor)
+    return (inputs.get(self.categorical_column), weight_tensor)
+
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
+    print('WeightedCategoricalColumn.get_sparse_tensors: ', self.name)
     tensors = transformation_cache.get(self, state_manager)
+    print('tensors[1]: ', tensors[1])
     return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    del weight_collections
+    del trainable
+    tensors = inputs.get(self)
+    return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column, self.weight_feature_key]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['categorical_column'] = serialize_feature_column(
+        self.categorical_column)
+    config['dtype'] = self.dtype.name
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['categorical_column'] = deserialize_feature_column(
+        config['categorical_column'], custom_objects, columns_by_name)
+    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
+    return cls(**kwargs)
+
 
 class CrossedColumn(
     CategoricalColumn,
+    fc_old._CategoricalColumn,  # pylint: disable=protected-access
     collections.namedtuple('CrossedColumn',
                            ('keys', 'hash_bucket_size', 'hash_key'))):
   """See `crossed_column`."""
 
+  @property
+  def _is_v2_column(self):
+    for key in _collect_leaf_level_keys(self):
+      if isinstance(key, six.string_types):
+        continue
+      if not isinstance(key, FeatureColumn):
+        return False
+      if not key._is_v2_column:  # pylint: disable=protected-access
+        return False
+    return True
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
     feature_names = []
     for key in _collect_leaf_level_keys(self):
-      if isinstance(key, FeatureColumn):
+      if isinstance(key, (FeatureColumn, fc_old._FeatureColumn)):  # pylint: disable=protected-access
         feature_names.append(key.name)
       else:  # key must be a string
         feature_names.append(key)
@@ -3171,17 +4069,25 @@ class CrossedColumn(
     for key in self.keys:
       if isinstance(key, FeatureColumn):
         config.update(key.parse_example_spec)
+      elif isinstance(key, fc_old._FeatureColumn):  # pylint: disable=protected-access
+        config.update(key._parse_example_spec)  # pylint: disable=protected-access
       else:  # key must be a string
         config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
     return config
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.parse_example_spec
+
   def transform_feature(self, transformation_cache, state_manager):
     """Generates a hashed sparse cross from the input tensors."""
     feature_tensors = []
     for key in _collect_leaf_level_keys(self):
       if isinstance(key, six.string_types):
         feature_tensors.append(transformation_cache.get(key, state_manager))
-      elif isinstance(key, CategoricalColumn):
+      elif isinstance(key, (fc_old._CategoricalColumn, CategoricalColumn)):  # pylint: disable=protected-access
         ids_and_weights = key.get_sparse_tensors(transformation_cache,
                                                  state_manager)
         if ids_and_weights.weight_tensor is not None:
@@ -3197,16 +4103,76 @@ class CrossedColumn(
         num_buckets=self.hash_bucket_size,
         hash_key=self.hash_key)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    """Generates a hashed sparse cross from the input tensors."""
+    feature_tensors = []
+    for key in _collect_leaf_level_keys(self):
+      if isinstance(key, six.string_types):
+        feature_tensors.append(inputs.get(key))
+      elif isinstance(key, (CategoricalColumn, fc_old._CategoricalColumn)):  # pylint: disable=protected-access
+        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+        if ids_and_weights.weight_tensor is not None:
+          raise ValueError(
+              'crossed_column does not support weight_tensor, but the given '
+              'column populates weight_tensor. '
+              'Given column: {}'.format(key.name))
+        feature_tensors.append(ids_and_weights.id_tensor)
+      else:
+        raise ValueError('Unsupported column type. Given: {}'.format(key))
+    return sparse_ops.sparse_cross_hashed(
+        inputs=feature_tensors,
+        num_buckets=self.hash_bucket_size,
+        hash_key=self.hash_key)
+
   @property
   def num_buckets(self):
     """Returns number of buckets in this sparse feature."""
     return self.hash_bucket_size
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _num_buckets(self):
+    return self.num_buckets
+
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
     return CategoricalColumn.IdWeightPair(
         transformation_cache.get(self, state_manager), None)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    """See `CategoricalColumn` base class."""
+    del weight_collections
+    del trainable
+    return CategoricalColumn.IdWeightPair(inputs.get(self), None)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return list(self.keys)
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['keys'] = tuple([serialize_feature_column(fc) for fc in self.keys])
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['keys'] = tuple([
+        deserialize_feature_column(c, custom_objects, columns_by_name)
+        for c in config['keys']
+    ])
+    return cls(**kwargs)
+
 
 def _collect_leaf_level_keys(cross):
   """Collects base keys by expanding all nested crosses.
@@ -3226,140 +4192,6 @@ def _collect_leaf_level_keys(cross):
   return leaf_level_keys
 
 
-# TODO(zakaria): Move this to embedding_ops and make it public.
-def _safe_embedding_lookup_sparse(embedding_weights,
-                                  sparse_ids,
-                                  sparse_weights=None,
-                                  combiner='mean',
-                                  default_id=None,
-                                  name=None,
-                                  partition_strategy='div',
-                                  max_norm=None):
-  """Lookup embedding results, accounting for invalid IDs and empty features.
-
-  The partitioned embedding in `embedding_weights` must all be the same shape
-  except for the first dimension. The first dimension is allowed to vary as the
-  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
-  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
-  partitioner.
-
-  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
-  with non-positive weight. For an entry with no features, the embedding vector
-  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.
-
-  The ids and weights may be multi-dimensional. Embeddings are always aggregated
-  along the last dimension.
-
-  Args:
-    embedding_weights:  A list of `P` float `Tensor`s or values representing
-        partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
-        created by partitioning along dimension 0.  The total unpartitioned
-        shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the
-        vocab size and `e_1, ..., e_m` are the embedding dimensions.
-    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
-        ids. `d_0` is typically batch size.
-    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
-        float weights corresponding to `sparse_ids`, or `None` if all weights
-        are be assumed to be 1.0.
-    combiner: A string specifying how to combine embedding results for each
-        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
-        the default.
-    default_id: The id to use for an entry with no features.
-    name: A name for this operation (optional).
-    partition_strategy: A string specifying the partitioning strategy.
-        Currently `"div"` and `"mod"` are supported. Default is `"div"`.
-    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
-        combining.
-
-
-  Returns:
-    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
-
-  Raises:
-    ValueError: if `embedding_weights` is empty.
-  """
-  if embedding_weights is None:
-    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
-  if isinstance(embedding_weights, variables.PartitionedVariable):
-    embedding_weights = list(embedding_weights)  # get underlying Variables.
-  if not isinstance(embedding_weights, list):
-    embedding_weights = [embedding_weights]
-  if len(embedding_weights) < 1:
-    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
-
-  dtype = sparse_weights.dtype if sparse_weights is not None else None
-  # TODO(rohanj): Look into removing this convert_to_tensor call.
-  embedding_weights = [
-      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
-  ]
-
-  with ops.name_scope(name, 'embedding_lookup',
-                      embedding_weights + [sparse_ids,
-                                           sparse_weights]) as scope:
-    # Reshape higher-rank sparse ids and weights to linear segment ids.
-    original_shape = sparse_ids.dense_shape
-    original_rank_dim = sparse_ids.dense_shape.get_shape()[0]
-    original_rank = (
-        array_ops.size(original_shape)
-        if original_rank_dim.value is None
-        else original_rank_dim.value)
-    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
-        math_ops.reduce_prod(
-            array_ops.slice(original_shape, [0], [original_rank - 1])),
-        array_ops.gather(original_shape, original_rank - 1)])
-    if sparse_weights is not None:
-      sparse_weights = sparse_tensor_lib.SparseTensor(
-          sparse_ids.indices,
-          sparse_weights.values, sparse_ids.dense_shape)
-
-    # Prune invalid ids and weights.
-    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
-    if combiner != 'sum':
-      sparse_ids, sparse_weights = _prune_invalid_weights(
-          sparse_ids, sparse_weights)
-
-    # Fill in dummy values for empty features, if necessary.
-    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
-                                                                 default_id or
-                                                                 0)
-    if sparse_weights is not None:
-      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)
-
-    result = embedding_ops.embedding_lookup_sparse(
-        embedding_weights,
-        sparse_ids,
-        sparse_weights,
-        combiner=combiner,
-        partition_strategy=partition_strategy,
-        name=None if default_id is None else scope,
-        max_norm=max_norm)
-
-    if default_id is None:
-      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
-      # for use in Select.
-      is_row_empty = array_ops.tile(
-          array_ops.reshape(is_row_empty, [-1, 1]),
-          array_ops.stack([1, array_ops.shape(result)[1]]))
-
-      result = array_ops.where(is_row_empty,
-                               array_ops.zeros_like(result),
-                               result,
-                               name=scope)
-
-    # Reshape back from linear ids back into higher-dimensional dense result.
-    final_result = array_ops.reshape(
-        result,
-        array_ops.concat([
-            array_ops.slice(
-                math_ops.cast(original_shape, dtypes.int32), [0],
-                [original_rank - 1]),
-            array_ops.slice(array_ops.shape(result), [1], [-1])
-        ], 0))
-    final_result.set_shape(tensor_shape.unknown_shape(
-        (original_rank_dim - 1).value).concatenate(result.get_shape()[1:]))
-    return final_result
-
-
 def _prune_invalid_ids(sparse_ids, sparse_weights):
   """Prune invalid IDs (< 0) from the input ids and weights."""
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
@@ -3382,9 +4214,12 @@ def _prune_invalid_weights(sparse_ids, sparse_weights):
   return sparse_ids, sparse_weights
 
 
-class IndicatorColumn(DenseColumn, SequenceDenseColumn,
-                      collections.namedtuple('IndicatorColumn',
-                                             ('categorical_column'))):
+class IndicatorColumn(
+    DenseColumn,
+    SequenceDenseColumn,
+    fc_old._DenseColumn,  # pylint: disable=protected-access
+    fc_old._SequenceDenseColumn,  # pylint: disable=protected-access
+    collections.namedtuple('IndicatorColumn', ('categorical_column'))):
   """Represents a one-hot column for use in deep networks.
 
   Args:
@@ -3392,28 +4227,17 @@ class IndicatorColumn(DenseColumn, SequenceDenseColumn,
       `categorical_column_with_*` function.
   """
 
+  @property
+  def _is_v2_column(self):
+    return (isinstance(self.categorical_column, FeatureColumn) and
+            self.categorical_column._is_v2_column)  # pylint: disable=protected-access
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
     return '{}_indicator'.format(self.categorical_column.name)
 
-  def transform_feature(self, transformation_cache, state_manager):
-    """Returns dense `Tensor` representing feature.
-
-    Args:
-      transformation_cache: A `FeatureTransformationCache` object to access
-        features.
-      state_manager: A `StateManager` to create / access resources such as
-        lookup tables.
-
-    Returns:
-      Transformed feature `Tensor`.
-
-    Raises:
-      ValueError: if input rank is not known at graph building time.
-    """
-    id_weight_pair = self.categorical_column.get_sparse_tensors(
-        transformation_cache, state_manager)
+  def _transform_id_weight_pair(self, id_weight_pair):
     id_tensor = id_weight_pair.id_tensor
     weight_tensor = id_weight_pair.weight_tensor
 
@@ -3422,11 +4246,15 @@ class IndicatorColumn(DenseColumn, SequenceDenseColumn,
       weighted_column = sparse_ops.sparse_merge(
           sp_ids=id_tensor,
           sp_values=weight_tensor,
-          vocab_size=int(self.variable_shape[-1]))
-      # Remove (?, -1) index
+          vocab_size=int(self._variable_shape[-1]))
+      # Remove (?, -1) index.
       weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                 weighted_column.dense_shape)
-      return sparse_ops.sparse_tensor_to_dense(weighted_column)
+      # Use scatter_nd to merge duplicated indices if existed,
+      # instead of sparse_tensor_to_dense.
+      return array_ops.scatter_nd(weighted_column.indices,
+                                  weighted_column.values,
+                                  weighted_column.dense_shape)
 
     dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
         id_tensor, default_value=-1)
@@ -3435,22 +4263,62 @@ class IndicatorColumn(DenseColumn, SequenceDenseColumn,
     # input_layer are float32.
     one_hot_id_tensor = array_ops.one_hot(
         dense_id_tensor,
-        depth=self.variable_shape[-1],
+        depth=self._variable_shape[-1],
         on_value=1.0,
         off_value=0.0)
 
     # Reduce to get a multi-hot per example.
     return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])
 
+  def transform_feature(self, transformation_cache, state_manager):
+    """Returns dense `Tensor` representing feature.
+
+    Args:
+      transformation_cache: A `FeatureTransformationCache` object to access
+        features.
+      state_manager: A `StateManager` to create / access resources such as
+        lookup tables.
+
+    Returns:
+      Transformed feature `Tensor`.
+
+    Raises:
+      ValueError: if input rank is not known at graph building time.
+    """
+    id_weight_pair = self.categorical_column.get_sparse_tensors(
+        transformation_cache, state_manager)
+    return self._transform_id_weight_pair(id_weight_pair)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+    return self._transform_id_weight_pair(id_weight_pair)
+
   @property
   def parse_example_spec(self):
     """See `FeatureColumn` base class."""
     return self.categorical_column.parse_example_spec
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
   @property
   def variable_shape(self):
     """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
-    return tensor_shape.TensorShape([1, self.categorical_column.num_buckets])
+    if isinstance(self.categorical_column, FeatureColumn):
+      return tensor_shape.TensorShape([1, self.categorical_column.num_buckets])
+    else:
+      return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access
+
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _variable_shape(self):
+    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access
 
   def get_dense_tensor(self, transformation_cache, state_manager):
     """Returns dense `Tensor` representing feature.
@@ -3481,6 +4349,27 @@ class IndicatorColumn(DenseColumn, SequenceDenseColumn,
     # representation created by transform_feature.
     return transformation_cache.get(self, state_manager)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    del weight_collections
+    del trainable
+    if isinstance(
+        self.categorical_column,
+        (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)):  # pylint: disable=protected-access
+      raise ValueError(
+          'In indicator_column: {}. '
+          'categorical_column must not be of type _SequenceCategoricalColumn. '
+          'Suggested fix A: If you wish to use input_layer, use a '
+          'non-sequence categorical_column_with_*. '
+          'Suggested fix B: If you wish to create sequence input, use '
+          'sequence_input_layer instead of input_layer. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
+    # Feature has been already transformed. Return the intermediate
+    # representation created by transform_feature.
+    return inputs.get(self)
+
   def get_sequence_dense_tensor(self, transformation_cache, state_manager):
     """See `SequenceDenseColumn` base class."""
     if not isinstance(self.categorical_column, SequenceCategoricalColumn):
@@ -3496,49 +4385,101 @@ class IndicatorColumn(DenseColumn, SequenceDenseColumn,
     dense_tensor = transformation_cache.get(self, state_manager)
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
-    sequence_length = _sequence_length_from_sparse_tensor(
+    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+        sparse_tensors.id_tensor)
+    return SequenceDenseColumn.TensorSequenceLengthPair(
+        dense_tensor=dense_tensor, sequence_length=sequence_length)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sequence_dense_tensor(self,
+                                 inputs,
+                                 weight_collections=None,
+                                 trainable=None):
+    # Do nothing with weight_collections and trainable since no variables are
+    # created in this function.
+    del weight_collections
+    del trainable
+    if not isinstance(
+        self.categorical_column,
+        (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)):  # pylint: disable=protected-access
+      raise ValueError(
+          'In indicator_column: {}. '
+          'categorical_column must be of type _SequenceCategoricalColumn '
+          'to use sequence_input_layer. '
+          'Suggested fix: Use one of sequence_categorical_column_with_*. '
+          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
+                                       self.categorical_column))
+    # Feature has been already transformed. Return the intermediate
+    # representation created by _transform_feature.
+    dense_tensor = inputs.get(self)
+    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
 
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['categorical_column'] = serialize_feature_column(
+        self.categorical_column)
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['categorical_column'] = deserialize_feature_column(
+        config['categorical_column'], custom_objects, columns_by_name)
+    return cls(**kwargs)
+
 
 def _verify_static_batch_size_equality(tensors, columns):
+  """Verify equality between static batch sizes.
+
+  Args:
+    tensors: iterable of input tensors.
+    columns: Corresponding feature columns.
+
+  Raises:
+    ValueError: in case of mismatched batch sizes.
+  """
   # bath_size is a tf.Dimension object.
   expected_batch_size = None
   for i in range(0, len(tensors)):
-    if tensors[i].shape[0].value is not None:
+    batch_size = tensor_shape.Dimension(tensor_shape.dimension_value(
+        tensors[i].shape[0]))
+    if batch_size.value is not None:
       if expected_batch_size is None:
         bath_size_column_index = i
-        expected_batch_size = tensors[i].shape[0]
-      elif not expected_batch_size.is_compatible_with(tensors[i].shape[0]):
+        expected_batch_size = batch_size
+      elif not expected_batch_size.is_compatible_with(batch_size):
         raise ValueError(
             'Batch size (first dimension) of each feature must be same. '
             'Batch size of columns ({}, {}): ({}, {})'.format(
                 columns[bath_size_column_index].name, columns[i].name,
-                expected_batch_size, tensors[i].shape[0]))
-
-
-def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
-  """Returns a [batch_size] Tensor with per-example sequence length."""
-  with ops.name_scope(None, 'sequence_length') as name_scope:
-    row_ids = sp_tensor.indices[:, 0]
-    column_ids = sp_tensor.indices[:, 1]
-    column_ids += array_ops.ones_like(column_ids)
-    seq_length = math_ops.to_int64(
-        math_ops.segment_max(column_ids, segment_ids=row_ids) / num_elements)
-    # If the last n rows do not have ids, seq_length will have shape
-    # [batch_size - n]. Pad the remaining values with zeros.
-    n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
-    padding = array_ops.zeros(n_pad, dtype=seq_length.dtype)
-    return array_ops.concat([seq_length, padding], axis=0, name=name_scope)
-
-
-class SequenceCategoricalColumn(FeatureColumn,
-                                collections.namedtuple(
-                                    'SequenceCategoricalColumn',
-                                    ('categorical_column'))):
+                expected_batch_size, batch_size))
+
+
+class SequenceCategoricalColumn(
+    FeatureColumn,
+    fc_old._CategoricalColumn,  # pylint: disable=protected-access
+    collections.namedtuple('SequenceCategoricalColumn',
+                           ('categorical_column'))):
   """Represents sequences of categorical data."""
 
+  @property
+  def _is_v2_column(self):
+    return (isinstance(self.categorical_column, FeatureColumn) and
+            self.categorical_column._is_v2_column)  # pylint: disable=protected-access
+
   @property
   def name(self):
     """See `FeatureColumn` base class."""
@@ -3549,16 +4490,49 @@ class SequenceCategoricalColumn(FeatureColumn,
     """See `FeatureColumn` base class."""
     return self.categorical_column.parse_example_spec
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _parse_example_spec(self):
+    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
+
   def transform_feature(self, transformation_cache, state_manager):
     """See `FeatureColumn` base class."""
     return self.categorical_column.transform_feature(transformation_cache,
                                                      state_manager)
 
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _transform_feature(self, inputs):
+    return self.categorical_column._transform_feature(inputs)  # pylint: disable=protected-access
+
   @property
   def num_buckets(self):
     """Returns number of buckets in this sparse feature."""
     return self.categorical_column.num_buckets
 
+  @property
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _num_buckets(self):
+    return self.categorical_column._num_buckets  # pylint: disable=protected-access
+
+  def _get_sparse_tensors_helper(self, sparse_tensors):
+    id_tensor = sparse_tensors.id_tensor
+    weight_tensor = sparse_tensors.weight_tensor
+    # Expands third dimension, if necessary so that embeddings are not
+    # combined during embedding lookup. If the tensor is already 3D, leave
+    # as-is.
+    shape = array_ops.shape(id_tensor)
+    # Compute the third dimension explicitly instead of setting it to -1, as
+    # that doesn't work for dynamically shaped tensors with 0-length at runtime.
+    # This happens for empty sequences.
+    target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
+    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
+    if weight_tensor is not None:
+      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
+    return CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
+
   def get_sequence_sparse_tensors(self, transformation_cache, state_manager):
     """Returns an IdWeightPair.
 
@@ -3580,27 +4554,197 @@ class SequenceCategoricalColumn(FeatureColumn,
     """
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
-    id_tensor = sparse_tensors.id_tensor
-    weight_tensor = sparse_tensors.weight_tensor
-    # Expands final dimension, so that embeddings are not combined during
-    # embedding lookup.
-    check_id_rank = check_ops.assert_equal(
-        array_ops.rank(id_tensor), 2,
-        data=[
-            'Column {} expected ID tensor of rank 2. '.format(self.name),
-            'id_tensor shape: ', array_ops.shape(id_tensor)])
-    with ops.control_dependencies([check_id_rank]):
-      id_tensor = sparse_ops.sparse_reshape(
-          id_tensor,
-          shape=array_ops.concat([id_tensor.dense_shape, [1]], axis=0))
-    if weight_tensor is not None:
-      check_weight_rank = check_ops.assert_equal(
-          array_ops.rank(weight_tensor), 2,
-          data=[
-              'Column {} expected weight tensor of rank 2.'.format(self.name),
-              'weight_tensor shape:', array_ops.shape(weight_tensor)])
-      with ops.control_dependencies([check_weight_rank]):
-        weight_tensor = sparse_ops.sparse_reshape(
-            weight_tensor,
-            shape=array_ops.concat([weight_tensor.dense_shape, [1]], axis=0))
-    return CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
+    return self._get_sparse_tensors_helper(sparse_tensors)
+
+  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
+                          _FEATURE_COLUMN_DEPRECATION)
+  def _get_sparse_tensors(self, inputs, weight_collections=None,
+                          trainable=None):
+    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
+    return self._get_sparse_tensors_helper(sparse_tensors)
+
+  @property
+  def parents(self):
+    """See 'FeatureColumn` base class."""
+    return [self.categorical_column]
+
+  def _get_config(self):
+    """See 'FeatureColumn` base class."""
+    config = dict(zip(self._fields, self))
+    config['categorical_column'] = serialize_feature_column(
+        self.categorical_column)
+    return config
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    """See 'FeatureColumn` base class."""
+    _check_config_keys(config, cls._fields)
+    kwargs = config.copy()
+    kwargs['categorical_column'] = deserialize_feature_column(
+        config['categorical_column'], custom_objects, columns_by_name)
+    return cls(**kwargs)
+
+
+# FeatureColumn serialization, deserialization logic.
+
+
+def _check_config_keys(config, expected_keys):
+  """Checks that a config has all expected_keys."""
+  if set(config.keys()) != set(expected_keys):
+    raise ValueError('Invalid config: {}, expected keys: {}'.format(
+        config, expected_keys))
+
+
+def serialize_feature_column(fc):
+  """Serializes a FeatureColumn or a raw string key.
+
+  This method should only be used to serialize parent FeatureColumns when
+  implementing FeatureColumn._get_config(), else serialize_feature_columns()
+  is preferable.
+
+  This serialization also keeps information of the FeatureColumn class, so
+  deserialization is possible without knowing the class type. For example:
+
+  a = numeric_column('x')
+  a._get_config() gives:
+  {
+      'key': 'price',
+      'shape': (1,),
+      'default_value': None,
+      'dtype': 'float32',
+      'normalizer_fn': None
+  }
+  While serialize_feature_column(a) gives:
+  {
+      'class_name': 'NumericColumn',
+      'config': {
+          'key': 'price',
+          'shape': (1,),
+          'default_value': None,
+          'dtype': 'float32',
+          'normalizer_fn': None
+      }
+  }
+
+  Args:
+    fc: A FeatureColumn or raw feature key string.
+
+  Returns:
+    Keras serialization for FeatureColumns, leaves string keys unaffected.
+
+  Raises:
+    ValueError if called with input that is not string or FeatureColumn.
+  """
+  if isinstance(fc, six.string_types):
+    return fc
+  elif isinstance(fc, FeatureColumn):
+    return utils.serialize_keras_class_and_config(fc.__class__.__name__,
+                                                  fc._get_config())
+  else:
+    raise ValueError('Instance: {} is not a FeatureColumn'.format(fc))
+
+
+def deserialize_feature_column(config,
+                               custom_objects=None,
+                               columns_by_name=None):
+  """Deserializes a `config` generated with `serialize_feature_column`.
+
+  This method should only be used to deserialize parent FeatureColumns when
+  implementing FeatureColumn._from_config(), else deserialize_feature_columns()
+  is preferable. Returns a FeatureColumn for this config.
+  TODO(b/118939620): Simplify code if Keras utils support object deduping.
+
+  Args:
+    config: A Dict with the serialization of feature columns acquired by
+      `serialize_feature_column`, or a string representing a raw column.
+    custom_objects: A Dict from custom_object name to the associated keras
+      serializable objects (FeatureColumns, classes or functions).
+    columns_by_name: A Dict[String, FeatureColumn] of existing columns in order
+      to avoid duplication.
+
+  Raises:
+    ValueError if `config` has invalid format (e.g: expected keys missing,
+    or refers to unknown classes).
+
+  Returns:
+    A FeatureColumn corresponding to the input `config`.
+  """
+  if isinstance(config, six.string_types):
+    return config
+  # A dict from class_name to class for all FeatureColumns in this module.
+  # FeatureColumns not part of the module can be passed as custom_objects.
+  module_feature_column_classes = {
+      cls.__name__: cls for cls in [
+          BucketizedColumn, EmbeddingColumn, HashedCategoricalColumn,
+          IdentityCategoricalColumn, IndicatorColumn, NumericColumn,
+          SequenceCategoricalColumn, SequenceDenseColumn, SharedEmbeddingColumn,
+          VocabularyFileCategoricalColumn, VocabularyListCategoricalColumn,
+          WeightedCategoricalColumn
+      ]
+  }
+  if columns_by_name is None:
+    columns_by_name = {}
+
+  (cls, cls_config) = utils.class_and_config_for_serialized_keras_object(
+      config,
+      module_objects=module_feature_column_classes,
+      custom_objects=custom_objects,
+      printable_module_name='feature_column_v2')
+
+  if not issubclass(cls, FeatureColumn):
+    raise ValueError(
+        'Expected FeatureColumn class, instead found: {}'.format(cls))
+
+  # Always deserialize the FeatureColumn, in order to get the name.
+  new_instance = cls._from_config(  # pylint: disable=protected-access
+      cls_config,
+      custom_objects=custom_objects,
+      columns_by_name=columns_by_name)
+
+  # If the name already exists, re-use the column from columns_by_name,
+  # (new_instance remains unused).
+  return columns_by_name.setdefault(new_instance.name, new_instance)
+
+
+def serialize_feature_columns(feature_columns):
+  """Serializes a list of FeatureColumns.
+
+  Returns a list of Keras-style config dicts that represent the input
+  FeatureColumns and can be used with `deserialize_feature_columns` for
+  reconstructing the original columns.
+
+  Args:
+    feature_columns: A list of FeatureColumns.
+
+  Returns:
+    Keras serialization for the list of FeatureColumns.
+
+  Raises:
+    ValueError if called with input that is not a list of FeatureColumns.
+  """
+  return [serialize_feature_column(fc) for fc in feature_columns]
+
+
+def deserialize_feature_columns(configs, custom_objects=None):
+  """Deserializes a list of FeatureColumns configs.
+
+  Returns a list of FeatureColumns given a list of config dicts acquired by
+  `serialize_feature_columns`.
+
+  Args:
+    configs: A list of Dicts with the serialization of feature columns acquired
+      by `serialize_feature_columns`.
+    custom_objects: A Dict from custom_object name to the associated keras
+      serializable objects (FeatureColumns, classes or functions).
+
+  Returns:
+    FeatureColumn objects corresponding to the input configs.
+
+  Raises:
+    ValueError if called with input that is not a list of FeatureColumns.
+  """
+  columns_by_name = {}
+  return [
+      deserialize_feature_column(c, custom_objects, columns_by_name)
+      for c in configs
+  ]
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index d3787146edd0c188eb98a3185e0f1bcf4d8420e2..0755c0b6ac23f5ad73df855ab2bcbce11fec2653 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -27,16 +27,12 @@ from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.feature_column import feature_column_v2 as fc
-from tensorflow.python.feature_column.feature_column_v2 import _transform_features
-from tensorflow.python.feature_column.feature_column_v2 import FeatureColumn
-from tensorflow.python.feature_column.feature_column_v2 import FeatureLayer
-from tensorflow.python.feature_column.feature_column_v2 import FeatureTransformationCache
-from tensorflow.python.feature_column.feature_column_v2 import StateManager
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -46,11 +42,14 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.training import coordinator
 from tensorflow.python.training import queue_runner_impl
+from tensorflow.python.training import rmsprop
+from tensorflow_estimator.python.estimator.inputs import numpy_io
 
 
 def _initialized_session(config=None):
@@ -60,15 +59,48 @@ def _initialized_session(config=None):
   return sess
 
 
+def get_linear_model_bias(name='linear_model'):
+  with variable_scope.variable_scope(name, reuse=True):
+    return variable_scope.get_variable('bias_weights')
+
+
+def get_linear_model_column_var(column, name='linear_model'):
+  return ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
+                            name + '/' + column.name)[0]
+
+
+class BaseFeatureColumnForTests(fc.FeatureColumn):
+  """A base FeatureColumn useful to avoid boiler-plate in tests.
+
+  Provides dummy implementations for abstract methods that raise ValueError in
+  order to avoid re-defining all abstract methods for each test sub-class.
+  """
+
+  @property
+  def parents(self):
+    raise ValueError('Should not use this method.')
+
+  @classmethod
+  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
+    raise ValueError('Should not use this method.')
+
+  def _get_config(self):
+    raise ValueError('Should not use this method.')
+
+
 class LazyColumnTest(test.TestCase):
 
   def test_transformations_called_once(self):
 
-    class TransformCounter(FeatureColumn):
+    class TransformCounter(BaseFeatureColumnForTests):
 
       def __init__(self):
         self.num_transform = 0
 
+      @property
+      def _is_v2_column(self):
+        return True
+
       @property
       def name(self):
         return 'TransformCounter'
@@ -81,7 +113,7 @@ class LazyColumnTest(test.TestCase):
       def parse_example_spec(self):
         pass
 
-    transformation_cache = FeatureTransformationCache(
+    transformation_cache = fc.FeatureTransformationCache(
         features={'a': [[2], [3.]]})
     column = TransformCounter()
     self.assertEqual(0, column.num_transform)
@@ -92,7 +124,11 @@ class LazyColumnTest(test.TestCase):
 
   def test_returns_transform_output(self):
 
-    class Transformer(FeatureColumn):
+    class Transformer(BaseFeatureColumnForTests):
+
+      @property
+      def _is_v2_column(self):
+        return True
 
       @property
       def name(self):
@@ -105,7 +141,7 @@ class LazyColumnTest(test.TestCase):
       def parse_example_spec(self):
         pass
 
-    transformation_cache = FeatureTransformationCache(
+    transformation_cache = fc.FeatureTransformationCache(
         features={'a': [[2], [3.]]})
     column = Transformer()
     self.assertEqual('Output', transformation_cache.get(column, None))
@@ -113,7 +149,11 @@ class LazyColumnTest(test.TestCase):
 
   def test_does_not_pollute_given_features_dict(self):
 
-    class Transformer(FeatureColumn):
+    class Transformer(BaseFeatureColumnForTests):
+
+      @property
+      def _is_v2_column(self):
+        return True
 
       @property
       def name(self):
@@ -127,12 +167,12 @@ class LazyColumnTest(test.TestCase):
         pass
 
     features = {'a': [[2], [3.]]}
-    transformation_cache = FeatureTransformationCache(features=features)
+    transformation_cache = fc.FeatureTransformationCache(features=features)
     transformation_cache.get(Transformer(), None)
     self.assertEqual(['a'], list(features.keys()))
 
   def test_error_if_feature_is_not_found(self):
-    transformation_cache = FeatureTransformationCache(
+    transformation_cache = fc.FeatureTransformationCache(
         features={'a': [[2], [3.]]})
     with self.assertRaisesRegexp(ValueError,
                                  'bbb is not in features dictionary'):
@@ -143,7 +183,11 @@ class LazyColumnTest(test.TestCase):
 
   def test_not_supported_feature_column(self):
 
-    class NotAProperColumn(FeatureColumn):
+    class NotAProperColumn(BaseFeatureColumnForTests):
+
+      @property
+      def _is_v2_column(self):
+        return True
 
       @property
       def name(self):
@@ -157,7 +201,7 @@ class LazyColumnTest(test.TestCase):
       def parse_example_spec(self):
         pass
 
-    transformation_cache = FeatureTransformationCache(
+    transformation_cache = fc.FeatureTransformationCache(
         features={'a': [[2], [3.]]})
     with self.assertRaisesRegexp(ValueError,
                                  'NotAProperColumn is not supported'):
@@ -168,15 +212,16 @@ class LazyColumnTest(test.TestCase):
     class NotAFeatureColumn(object):
       pass
 
-    transformation_cache = FeatureTransformationCache(
+    transformation_cache = fc.FeatureTransformationCache(
         features={'a': [[2], [3.]]})
     with self.assertRaisesRegexp(
         TypeError, '"key" must be either a "str" or "FeatureColumn".'):
       transformation_cache.get(NotAFeatureColumn(), None)
 
+  @test_util.run_deprecated_v1
   def test_expand_dim_rank_1_sparse_tensor_empty_batch(self):
     # empty 1-D sparse tensor:
-    transformation_cache = FeatureTransformationCache(
+    transformation_cache = fc.FeatureTransformationCache(
         features={
             'a':
                 sparse_tensor.SparseTensor(
@@ -184,15 +229,16 @@ class LazyColumnTest(test.TestCase):
                     dense_shape=[0],
                     values=np.array([]))
         })
-    with self.cached_session():
-      spv = transformation_cache.get('a', None).eval()
-      self.assertAllEqual(np.array([0, 1], dtype=np.int64), spv.dense_shape)
-      self.assertAllEqual(
-          np.reshape(np.array([], dtype=np.int64), (0, 2)), spv.indices)
+
+    spv = self.evaluate(transformation_cache.get('a', None))
+    self.assertAllEqual(np.array([0, 1], dtype=np.int64), spv.dense_shape)
+    self.assertAllEqual(
+        np.reshape(np.array([], dtype=np.int64), (0, 2)), spv.indices)
 
 
 class NumericColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     a = fc.numeric_column('aaa')
     self.assertEqual('aaa', a.key)
@@ -201,6 +247,7 @@ class NumericColumnTest(test.TestCase):
     self.assertIsNone(a.default_value)
     self.assertEqual(dtypes.float32, a.dtype)
     self.assertIsNone(a.normalizer_fn)
+    self.assertTrue(a._is_v2_column)
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
@@ -270,64 +317,74 @@ class NumericColumnTest(test.TestCase):
         'aaa': parsing_ops.FixedLenFeature((2, 3), dtype=dtypes.int32)
     }, a.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_parse_example_no_default_value(self):
     price = fc.numeric_column('price', shape=[2])
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'price':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'price':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([price]))
+        features=fc.make_parse_example_spec_v2([price]))
     self.assertIn('price', features)
-    with self.cached_session():
-      self.assertAllEqual([[20., 110.]], features['price'].eval())
 
+    self.assertAllEqual([[20., 110.]], self.evaluate(features['price']))
+
+  @test_util.run_deprecated_v1
   def test_parse_example_with_default_value(self):
     price = fc.numeric_column('price', shape=[2], default_value=11.)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'price':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.]))
-        }))
-    no_data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'something_else':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'price':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.]))
+            }))
+    no_data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'something_else':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString(),
                     no_data.SerializeToString()],
-        features=fc.make_parse_example_spec([price]))
+        features=fc.make_parse_example_spec_v2([price]))
     self.assertIn('price', features)
-    with self.cached_session():
-      self.assertAllEqual([[20., 110.], [11., 11.]], features['price'].eval())
+
+    self.assertAllEqual([[20., 110.], [11., 11.]],
+                        self.evaluate(features['price']))
 
   def test_normalizer_fn_must_be_callable(self):
     with self.assertRaisesRegexp(TypeError, 'must be a callable'):
       fc.numeric_column('price', normalizer_fn='NotACallable')
 
+  @test_util.run_deprecated_v1
   def test_normalizer_fn_transform_feature(self):
 
     def _increment_two(input_tensor):
       return input_tensor + 2.
 
     price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
-    output = _transform_features({'price': [[1., 2.], [5., 6.]]}, [price], None)
-    with self.cached_session():
-      self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval())
+    output = fc._transform_features_v2({
+        'price': [[1., 2.], [5., 6.]]
+    }, [price], None)
 
+    self.assertAllEqual([[3., 4.], [7., 8.]], self.evaluate(output[price]))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
 
     def _increment_two(input_tensor):
       return input_tensor + 2.
 
     price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
-    transformation_cache = FeatureTransformationCache({
+    transformation_cache = fc.FeatureTransformationCache({
         'price': [[1., 2.], [5., 6.]]
     })
     self.assertEqual(
@@ -336,7 +393,7 @@ class NumericColumnTest(test.TestCase):
 
   def test_sparse_tensor_not_supported(self):
     price = fc.numeric_column('price')
-    transformation_cache = FeatureTransformationCache({
+    transformation_cache = fc.FeatureTransformationCache({
         'price':
             sparse_tensor.SparseTensor(
                 indices=[[0, 0]], values=[0.3], dense_shape=[1, 1])
@@ -344,6 +401,7 @@ class NumericColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
       price.transform_feature(transformation_cache, None)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
     a_copy = copy.deepcopy(a)
@@ -356,6 +414,7 @@ class NumericColumnTest(test.TestCase):
         'aaa', shape=[1, 2], default_value=np.array([[3., 2.]]))
     self.assertEqual(a.default_value, ((3., 2.),))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
@@ -364,11 +423,48 @@ class NumericColumnTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
+
+  def test_old_linear_model(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc_old.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+
+    def _increment_two(input_tensor):
+      return input_tensor + 2.
+
+    price = fc.numeric_column('price', normalizer_fn=_increment_two)
+    self.assertEqual(['price'], price.parents)
+
+    config = price._get_config()
+    self.assertEqual({
+        'key': 'price',
+        'shape': (1,),
+        'default_value': None,
+        'dtype': 'float32',
+        'normalizer_fn': '_increment_two'
+    }, config)
+
+    self.assertEqual(
+        price,
+        fc.NumericColumn._from_config(
+            config, custom_objects={'_increment_two': _increment_two}))
 
 
 class BucketizedColumnTest(test.TestCase):
@@ -388,22 +484,29 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_invalid_boundaries(self):
     a = fc.numeric_column('aaa')
-    with self.assertRaisesRegexp(
-        ValueError, 'boundaries must be a sorted list'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'boundaries must be a sorted list'):
       fc.bucketized_column(a, boundaries=None)
-    with self.assertRaisesRegexp(
-        ValueError, 'boundaries must be a sorted list'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'boundaries must be a sorted list'):
       fc.bucketized_column(a, boundaries=1.)
-    with self.assertRaisesRegexp(
-        ValueError, 'boundaries must be a sorted list'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'boundaries must be a sorted list'):
       fc.bucketized_column(a, boundaries=[1, 0])
-    with self.assertRaisesRegexp(
-        ValueError, 'boundaries must be a sorted list'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'boundaries must be a sorted list'):
       fc.bucketized_column(a, boundaries=[1, 1])
 
   def test_name(self):
     a = fc.numeric_column('aaa', dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
+    self.assertTrue(b._is_v2_column)
+    self.assertEqual('aaa_bucketized', b.name)
+
+  def test_is_v2_column_old_numeric(self):
+    a = fc_old._numeric_column('aaa', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    self.assertFalse(b._is_v2_column)
     self.assertEqual('aaa_bucketized', b.name)
 
   def test_parse_spec(self):
@@ -425,75 +528,85 @@ class BucketizedColumnTest(test.TestCase):
     # Column 'aaa` has shape [2] times three buckets -> num_buckets=6.
     self.assertEqual(6, b.num_buckets)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'price':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'price':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([bucketized_price]))
+        features=fc.make_parse_example_spec_v2([bucketized_price]))
     self.assertIn('price', features)
-    with self.cached_session():
-      self.assertAllEqual([[20., 110.]], features['price'].eval())
 
+    self.assertAllEqual([[20., 110.]], self.evaluate(features['price']))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
-      transformed_tensor = _transform_features({
+      transformed_tensor = fc._transform_features_v2({
           'price': [[-1., 1.], [5., 6.]]
       }, [bucketized_price], None)
-      with _initialized_session():
-        self.assertAllEqual([[0, 1], [3, 4]],
-                            transformed_tensor[bucketized_price].eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllEqual([[0, 1], [3, 4]],
+                          self.evaluate(transformed_tensor[bucketized_price]))
 
   def test_get_dense_tensor_one_input_value(self):
     """Tests _get_dense_tensor() for input with shape=[1]."""
     price = fc.numeric_column('price', shape=[1])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
-      transformation_cache = FeatureTransformationCache({
+      transformation_cache = fc.FeatureTransformationCache({
           'price': [[-1.], [1.], [5.], [6.]]
       })
-      with _initialized_session():
-        bucketized_price_tensor = bucketized_price.get_dense_tensor(
-            transformation_cache, None)
-        self.assertAllClose(
-            # One-hot tensor.
-            [[[1., 0., 0., 0., 0.]],
-             [[0., 1., 0., 0., 0.]],
-             [[0., 0., 0., 1., 0.]],
-             [[0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      bucketized_price_tensor = bucketized_price.get_dense_tensor(
+          transformation_cache, None)
+      self.assertAllClose(
+          # One-hot tensor.
+          [[[1., 0., 0., 0., 0.]], [[0., 1., 0., 0., 0.]],
+           [[0., 0., 0., 1., 0.]], [[0., 0., 0., 0., 1.]]],
+          self.evaluate(bucketized_price_tensor))
 
   def test_get_dense_tensor_two_input_values(self):
     """Tests _get_dense_tensor() for input with shape=[2]."""
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
-      transformation_cache = FeatureTransformationCache({
+      transformation_cache = fc.FeatureTransformationCache({
           'price': [[-1., 1.], [5., 6.]]
       })
-      with _initialized_session():
-        bucketized_price_tensor = bucketized_price.get_dense_tensor(
-            transformation_cache, None)
-        self.assertAllClose(
-            # One-hot tensor.
-            [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
-             [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      bucketized_price_tensor = bucketized_price.get_dense_tensor(
+          transformation_cache, None)
+      self.assertAllClose(
+          # One-hot tensor.
+          [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
+           [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
+          self.evaluate(bucketized_price_tensor))
 
   def test_get_sparse_tensors_one_input_value(self):
     """Tests _get_sparse_tensors() for input with shape=[1]."""
     price = fc.numeric_column('price', shape=[1])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
-      transformation_cache = FeatureTransformationCache({
+      transformation_cache = fc.FeatureTransformationCache({
           'price': [[-1.], [1.], [5.], [6.]]
       })
       with _initialized_session() as sess:
@@ -501,8 +614,8 @@ class BucketizedColumnTest(test.TestCase):
             transformation_cache, None)
         self.assertIsNone(id_weight_pair.weight_tensor)
         id_tensor_value = sess.run(id_weight_pair.id_tensor)
-        self.assertAllEqual(
-            [[0, 0], [1, 0], [2, 0], [3, 0]], id_tensor_value.indices)
+        self.assertAllEqual([[0, 0], [1, 0], [2, 0], [3, 0]],
+                            id_tensor_value.indices)
         self.assertAllEqual([0, 1, 3, 4], id_tensor_value.values)
         self.assertAllEqual([4, 1], id_tensor_value.dense_shape)
 
@@ -511,7 +624,7 @@ class BucketizedColumnTest(test.TestCase):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
-      transformation_cache = FeatureTransformationCache({
+      transformation_cache = fc.FeatureTransformationCache({
           'price': [[-1., 1.], [5., 6.]]
       })
       with _initialized_session() as sess:
@@ -519,8 +632,8 @@ class BucketizedColumnTest(test.TestCase):
             transformation_cache, None)
         self.assertIsNone(id_weight_pair.weight_tensor)
         id_tensor_value = sess.run(id_weight_pair.id_tensor)
-        self.assertAllEqual(
-            [[0, 0], [0, 1], [1, 0], [1, 1]], id_tensor_value.indices)
+        self.assertAllEqual([[0, 0], [0, 1], [1, 0], [1, 1]],
+                            id_tensor_value.indices)
         # Values 0-4 correspond to the first column of the input price.
         # Values 5-9 correspond to the second column of the input price.
         self.assertAllEqual([0, 6, 3, 9], id_tensor_value.values)
@@ -529,7 +642,7 @@ class BucketizedColumnTest(test.TestCase):
   def test_sparse_tensor_input_not_supported(self):
     price = fc.numeric_column('price')
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])
-    transformation_cache = FeatureTransformationCache({
+    transformation_cache = fc.FeatureTransformationCache({
         'price':
             sparse_tensor.SparseTensor(
                 indices=[[0, 0]], values=[0.3], dense_shape=[1, 1])
@@ -537,6 +650,7 @@ class BucketizedColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
       bucketized_price.transform_feature(transformation_cache, None)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     a = fc.numeric_column('aaa', shape=[2])
     a_bucketized = fc.bucketized_column(a, boundaries=[0, 1])
@@ -555,20 +669,23 @@ class BucketizedColumnTest(test.TestCase):
       predictions = model(features)
       bucketized_price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
-        self.assertAllClose(
-            [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
-        sess.run(bucketized_price_var.assign(
-            [[10.], [20.], [30.], [40.], [50.]]))
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_linear_model_two_input_values(self):
     """Tests linear_model() for input with shape=[2]."""
@@ -580,34 +697,151 @@ class BucketizedColumnTest(test.TestCase):
       predictions = model(features)
       bucketized_price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        # One weight per bucket per input column, all initialized to zero.
+        self.assertAllClose(
+            [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
+                                         [60.], [70.], [80.], [90.], [100.]]))
+        # 1st example:
+        #   price -1. is in the 0th bucket, whose weight is 10.
+        #   price 1. is in the 6th bucket, whose weight is 70.
+        # 2nd example:
+        #   price 5. is in the 3rd bucket, whose weight is 40.
+        #   price 6. is in the 9th bucket, whose weight is 100.
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
+
+  def test_old_linear_model_one_input_value(self):
+    """Tests linear_model() for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1.], [1.], [5.], [6.]]}
+      predictions = fc_old.linear_model(features, [bucketized_price])
+      bias = get_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias))
+        # One weight variable per bucket, all initialized to zero.
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
+        # price -1. is in the 0th bucket, whose weight is 10.
+        # price 1. is in the 1st bucket, whose weight is 20.
+        # price 5. is in the 3rd bucket, whose weight is 40.
+        # price 6. is in the 4th bucket, whose weight is 50.
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
+
+  def test_old_linear_model_two_input_values(self):
+    """Tests linear_model() for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1., 1.], [5., 6.]]}
+      predictions = fc_old.linear_model(features, [bucketized_price])
+      bias = get_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
-        sess.run(bucketized_price_var.assign(
-            [[10.], [20.], [30.], [40.], [50.],
-             [60.], [70.], [80.], [90.], [100.]]))
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
+                                         [60.], [70.], [80.], [90.], [100.]]))
         # 1st example:
         #   price -1. is in the 0th bucket, whose weight is 10.
         #   price 1. is in the 6th bucket, whose weight is 70.
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
+
+  def test_old_linear_model_one_input_value_old_numeric(self):
+    """Tests linear_model() for input with shape=[1]."""
+    price = fc_old._numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1.], [1.], [5.], [6.]]}
+      predictions = fc_old.linear_model(features, [bucketized_price])
+      bias = get_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias))
+        # One weight variable per bucket, all initialized to zero.
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
+        # price -1. is in the 0th bucket, whose weight is 10.
+        # price 1. is in the 1st bucket, whose weight is 20.
+        # price 5. is in the 3rd bucket, whose weight is 40.
+        # price 6. is in the 4th bucket, whose weight is 50.
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    self.assertEqual([price], bucketized_price.parents)
+
+    config = bucketized_price._get_config()
+    self.assertEqual({
+        'source_column': {
+            'class_name': 'NumericColumn',
+            'config': {
+                'key': 'price',
+                'shape': (2,),
+                'default_value': None,
+                'dtype': 'float32',
+                'normalizer_fn': None
+            }
+        },
+        'boundaries': (0, 2, 4, 6)
+    }, config)
+
+    new_bucketized_price = fc.BucketizedColumn._from_config(config)
+    self.assertEqual(bucketized_price, new_bucketized_price)
+    self.assertIsNot(price, new_bucketized_price.source_column)
+
+    new_bucketized_price = fc.BucketizedColumn._from_config(
+        config, columns_by_name={price.name: price})
+    self.assertEqual(bucketized_price, new_bucketized_price)
+    self.assertIs(price, new_bucketized_price.source_column)
 
 
 class HashedCategoricalColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     a = fc.categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual('aaa', a.name)
     self.assertEqual('aaa', a.key)
     self.assertEqual(10, a.hash_bucket_size)
     self.assertEqual(dtypes.string, a.dtype)
+    self.assertTrue(a._is_v2_column)
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
@@ -628,6 +862,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
       fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     original = fc.categorical_column_with_hash_bucket('aaa', 10)
     for column in (original, copy.deepcopy(original)):
@@ -648,43 +883,50 @@ class HashedCategoricalColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, a.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_hash_bucket('aaa', 10)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_strings_should_be_hashed(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
         dense_shape=[2, 2])
-    outputs = _transform_features({'wire': wire_tensor}, [hashed_sparse], None)
+    outputs = fc._transform_features_v2({
+        'wire': wire_tensor
+    }, [hashed_sparse], None)
     output = outputs[hashed_sparse]
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [6, 4, 1]
-    with self.cached_session():
-      self.assertEqual(dtypes.int64, output.values.dtype)
-      self.assertAllEqual(expected_values, output.values.eval())
-      self.assertAllEqual(wire_tensor.indices.eval(), output.indices.eval())
-      self.assertAllEqual(wire_tensor.dense_shape.eval(),
-                          output.dense_shape.eval())
+
+    self.assertEqual(dtypes.int64, output.values.dtype)
+    self.assertAllEqual(expected_values, self.evaluate(output.values))
+    self.assertAllEqual(
+        self.evaluate(wire_tensor.indices), self.evaluate(output.indices))
+    self.assertAllEqual(
+        self.evaluate(wire_tensor.dense_shape),
+        self.evaluate(output.dense_shape))
 
   def test_tensor_dtype_should_be_string_or_integer(self):
     string_fc = fc.categorical_column_with_hash_bucket(
@@ -694,18 +936,12 @@ class HashedCategoricalColumnTest(test.TestCase):
     float_fc = fc.categorical_column_with_hash_bucket(
         'a_float', 10, dtype=dtypes.string)
     int_tensor = sparse_tensor.SparseTensor(
-        values=[101],
-        indices=[[0, 0]],
-        dense_shape=[1, 1])
+        values=[101], indices=[[0, 0]], dense_shape=[1, 1])
     string_tensor = sparse_tensor.SparseTensor(
-        values=['101'],
-        indices=[[0, 0]],
-        dense_shape=[1, 1])
+        values=['101'], indices=[[0, 0]], dense_shape=[1, 1])
     float_tensor = sparse_tensor.SparseTensor(
-        values=[101.],
-        indices=[[0, 0]],
-        dense_shape=[1, 1])
-    transformation_cache = FeatureTransformationCache({
+        values=[101.], indices=[[0, 0]], dense_shape=[1, 1])
+    transformation_cache = fc.FeatureTransformationCache({
         'a_int': int_tensor,
         'a_string': string_tensor,
         'a_float': float_tensor
@@ -720,10 +956,11 @@ class HashedCategoricalColumnTest(test.TestCase):
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
-    transformation_cache = FeatureTransformationCache({'wire': wire_tensor})
+    transformation_cache = fc.FeatureTransformationCache({'wire': wire_tensor})
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       transformation_cache.get(hashed_sparse, None)
 
+  @test_util.run_deprecated_v1
   def test_ints_should_be_hashed(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
@@ -731,13 +968,14 @@ class HashedCategoricalColumnTest(test.TestCase):
         values=[101, 201, 301],
         indices=[[0, 0], [1, 0], [1, 1]],
         dense_shape=[2, 2])
-    transformation_cache = FeatureTransformationCache({'wire': wire_tensor})
+    transformation_cache = fc.FeatureTransformationCache({'wire': wire_tensor})
     output = transformation_cache.get(hashed_sparse, None)
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [3, 7, 5]
-    with self.cached_session():
-      self.assertAllEqual(expected_values, output.values.eval())
 
+    self.assertAllEqual(expected_values, self.evaluate(output.values))
+
+  @test_util.run_deprecated_v1
   def test_int32_64_is_compatible(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
@@ -745,16 +983,17 @@ class HashedCategoricalColumnTest(test.TestCase):
         values=constant_op.constant([101, 201, 301], dtype=dtypes.int32),
         indices=[[0, 0], [1, 0], [1, 1]],
         dense_shape=[2, 2])
-    transformation_cache = FeatureTransformationCache({'wire': wire_tensor})
+    transformation_cache = fc.FeatureTransformationCache({'wire': wire_tensor})
     output = transformation_cache.get(hashed_sparse, None)
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [3, 7, 5]
-    with self.cached_session():
-      self.assertAllEqual(expected_values, output.values.eval())
 
+    self.assertAllEqual(expected_values, self.evaluate(output.values))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
-    transformation_cache = FeatureTransformationCache({
+    transformation_cache = fc.FeatureTransformationCache({
         'wire':
             sparse_tensor.SparseTensor(
                 values=['omar', 'stringer', 'marlo'],
@@ -767,9 +1006,10 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertEqual(
         transformation_cache.get(hashed_sparse, None), id_weight_pair.id_tensor)
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
-    transformation_cache = FeatureTransformationCache({
+    transformation_cache = fc.FeatureTransformationCache({
         'wire': (('omar', ''), ('stringer', 'marlo'))
     })
     id_weight_pair = hashed_sparse.get_sparse_tensors(transformation_cache,
@@ -778,6 +1018,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertEqual(
         transformation_cache.get(hashed_sparse, None), id_weight_pair.id_tensor)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
     self.assertEqual(4, wire_column.num_buckets)
@@ -791,26 +1032,69 @@ class HashedCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2))
       })
       wire_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 3: wire_var[3] = 4
-        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 3: wire_var[3] = 4
+      # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+      self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
+
+  def test_old_linear_model(self):
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(4, wire_column.num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 3: wire_var[3] = 4
+      # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+      self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(['wire'], wire_column.parents)
+
+    config = wire_column._get_config()
+    self.assertEqual({
+        'key': 'wire',
+        'hash_bucket_size': 4,
+        'dtype': 'string'
+    }, config)
+
+    self.assertEqual(wire_column,
+                     fc.HashedCategoricalColumn._from_config(config))
 
 
 class CrossedColumnTest(test.TestCase):
 
   def test_keys_empty(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'keys must be a list with length > 1'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'keys must be a list with length > 1'):
       fc.crossed_column([], 10)
 
   def test_keys_length_one(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'keys must be a list with length > 1'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'keys must be a list with length > 1'):
       fc.crossed_column(['a'], 10)
 
   def test_key_type_unsupported(self):
@@ -823,26 +1107,35 @@ class CrossedColumnTest(test.TestCase):
           ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10)
 
   def test_hash_bucket_size_negative(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'hash_bucket_size must be > 1'):
+    with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be > 1'):
       fc.crossed_column(['a', 'c'], -1)
 
   def test_hash_bucket_size_zero(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'hash_bucket_size must be > 1'):
+    with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be > 1'):
       fc.crossed_column(['a', 'c'], 0)
 
   def test_hash_bucket_size_none(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'hash_bucket_size must be > 1'):
+    with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be > 1'):
       fc.crossed_column(['a', 'c'], None)
 
   def test_name(self):
     a = fc.numeric_column('a', dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
     crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    self.assertTrue(crossed1._is_v2_column)
+
+    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    self.assertTrue(crossed2._is_v2_column)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
+
+  def test_is_v2_column(self):
+    a = fc_old._numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    self.assertTrue(crossed1._is_v2_column)
 
     crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    self.assertFalse(crossed2._is_v2_column)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_name_ordered_alphabetically(self):
@@ -878,72 +1171,83 @@ class CrossedColumnTest(test.TestCase):
     crossed = fc.crossed_column([b, 'c'], 15)
     self.assertEqual(15, crossed.num_buckets)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     a = fc.numeric_column('a', dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
     crossed1 = fc.crossed_column(['d1', 'd2'], 10)
     crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
     crossed2_copy = copy.deepcopy(crossed2)
-    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name,)
+    self.assertEqual(
+        'a_bucketized_X_c_X_d1_X_d2',
+        crossed2_copy.name,
+    )
     self.assertEqual(15, crossed2_copy.hash_bucket_size)
     self.assertEqual(5, crossed2_copy.hash_key)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
     price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'price':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.])),
-            'wire':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer'])),
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'price':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.])),
+                'wire':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer'])),
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([price_cross_wire]))
+        features=fc.make_parse_example_spec_v2([price_cross_wire]))
     self.assertIn('price', features)
     self.assertIn('wire', features)
-    with self.cached_session():
-      self.assertAllEqual([[20., 110.]], features['price'].eval())
-      wire_sparse = features['wire']
-      self.assertAllEqual([[0, 0], [0, 1]], wire_sparse.indices.eval())
-      # Use byte constants to pass the open-source test.
-      self.assertAllEqual([b'omar', b'stringer'], wire_sparse.values.eval())
-      self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval())
 
+    self.assertAllEqual([[20., 110.]], self.evaluate(features['price']))
+    wire_sparse = features['wire']
+    self.assertAllEqual([[0, 0], [0, 1]], self.evaluate(wire_sparse.indices))
+    # Use byte constants to pass the open-source test.
+    self.assertAllEqual([b'omar', b'stringer'],
+                        self.evaluate(wire_sparse.values))
+    self.assertAllEqual([1, 2], self.evaluate(wire_sparse.dense_shape))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
     hash_bucket_size = 10
-    price_cross_wire = fc.crossed_column(
-        [bucketized_price, 'wire'], hash_bucket_size)
+    price_cross_wire = fc.crossed_column([bucketized_price, 'wire'],
+                                         hash_bucket_size)
     features = {
-        'price': constant_op.constant([[1., 2.], [5., 6.]]),
-        'wire': sparse_tensor.SparseTensor(
-            values=['omar', 'stringer', 'marlo'],
-            indices=[[0, 0], [1, 0], [1, 1]],
-            dense_shape=[2, 2]),
+        'price':
+            constant_op.constant([[1., 2.], [5., 6.]]),
+        'wire':
+            sparse_tensor.SparseTensor(
+                values=['omar', 'stringer', 'marlo'],
+                indices=[[0, 0], [1, 0], [1, 1]],
+                dense_shape=[2, 2]),
     }
-    outputs = _transform_features(features, [price_cross_wire], None)
+    outputs = fc._transform_features_v2(features, [price_cross_wire], None)
     output = outputs[price_cross_wire]
-    with self.cached_session() as sess:
-      output_val = sess.run(output)
-      self.assertAllEqual(
-          [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
-      for val in output_val.values:
-        self.assertIn(val, list(range(hash_bucket_size)))
-      self.assertAllEqual([2, 4], output_val.dense_shape)
-
+    output_val = self.evaluate(output)
+    self.assertAllEqual([[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]],
+                        output_val.indices)
+    for val in output_val.values:
+      self.assertIn(val, list(range(hash_bucket_size)))
+    self.assertAllEqual([2, 4], output_val.dense_shape)
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
     b = fc.bucketized_column(a, boundaries=(0, 1))
     crossed1 = fc.crossed_column(['d1', 'd2'], 10)
     crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
     with ops.Graph().as_default():
-      transformation_cache = FeatureTransformationCache({
+      transformation_cache = fc.FeatureTransformationCache({
           'a':
               constant_op.constant(((-1., .5), (.5, 1.))),
           'c':
@@ -963,19 +1267,21 @@ class CrossedColumnTest(test.TestCase):
                   dense_shape=(2, 2)),
       })
       id_weight_pair = crossed2.get_sparse_tensors(transformation_cache, None)
-      with _initialized_session():
-        id_tensor_eval = id_weight_pair.id_tensor.eval()
-        self.assertAllEqual(
-            ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
-             (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13),
-             (1, 14), (1, 15)),
-            id_tensor_eval.indices)
-        # Check exact hashed output. If hashing changes this test will break.
-        # All values are within [0, hash_bucket_size).
-        expected_values = (
-            6, 14, 0, 13, 8, 8, 10, 12, 2, 0, 1, 9, 8, 12, 2, 0, 10, 11)
-        self.assertAllEqual(expected_values, id_tensor_eval.values)
-        self.assertAllEqual((2, 16), id_tensor_eval.dense_shape)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      id_tensor_eval = self.evaluate(id_weight_pair.id_tensor)
+      self.assertAllEqual(
+          ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
+           (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13),
+           (1, 14), (1, 15)), id_tensor_eval.indices)
+      # Check exact hashed output. If hashing changes this test will break.
+      # All values are within [0, hash_bucket_size).
+      expected_values = (6, 14, 0, 13, 8, 8, 10, 12, 2, 0, 1, 9, 8, 12, 2, 0,
+                         10, 11)
+      self.assertAllEqual(expected_values, id_tensor_eval.values)
+      self.assertAllEqual((2, 16), id_tensor_eval.dense_shape)
 
   def test_get_sparse_tensors_simple(self):
     """Same as test_get_sparse_tensors, but with simpler values."""
@@ -983,7 +1289,7 @@ class CrossedColumnTest(test.TestCase):
     b = fc.bucketized_column(a, boundaries=(0, 1))
     crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
-      transformation_cache = FeatureTransformationCache({
+      transformation_cache = fc.FeatureTransformationCache({
           'a':
               constant_op.constant(((-1., .5), (.5, 1.))),
           'c':
@@ -993,17 +1299,20 @@ class CrossedColumnTest(test.TestCase):
                   dense_shape=(2, 2)),
       })
       id_weight_pair = crossed.get_sparse_tensors(transformation_cache, None)
-      with _initialized_session():
-        id_tensor_eval = id_weight_pair.id_tensor.eval()
-        self.assertAllEqual(
-            ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3)),
-            id_tensor_eval.indices)
-        # Check exact hashed output. If hashing changes this test will break.
-        # All values are within [0, hash_bucket_size).
-        expected_values = (1, 0, 1, 3, 4, 2)
-        self.assertAllEqual(expected_values, id_tensor_eval.values)
-        self.assertAllEqual((2, 4), id_tensor_eval.dense_shape)
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      id_tensor_eval = self.evaluate(id_weight_pair.id_tensor)
+      self.assertAllEqual(((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3)),
+                          id_tensor_eval.indices)
+      # Check exact hashed output. If hashing changes this test will break.
+      # All values are within [0, hash_bucket_size).
+      expected_values = (1, 0, 1, 3, 4, 2)
+      self.assertAllEqual(expected_values, id_tensor_eval.values)
+      self.assertAllEqual((2, 4), id_tensor_eval.dense_shape)
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     """Tests linear_model.
 
@@ -1025,21 +1334,26 @@ class CrossedColumnTest(test.TestCase):
       })
       crossed_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(
-            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_linear_model_with_weights(self):
 
-    class _TestColumnWithWeights(fc.CategoricalColumn):
+    class _TestColumnWithWeights(BaseFeatureColumnForTests,
+                                 fc.CategoricalColumn):
       """Produces sparse IDs and sparse weights."""
 
+      @property
+      def _is_v2_column(self):
+        return True
+
       @property
       def name(self):
         return 'test_column'
@@ -1047,10 +1361,11 @@ class CrossedColumnTest(test.TestCase):
       @property
       def parse_example_spec(self):
         return {
-            self.name: parsing_ops.VarLenFeature(dtypes.int32),
-            '{}_weights'.format(self.name): parsing_ops.VarLenFeature(
-                dtypes.float32),
-            }
+            self.name:
+                parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name):
+                parsing_ops.VarLenFeature(dtypes.float32),
+        }
 
       @property
       def num_buckets(self):
@@ -1092,79 +1407,274 @@ class CrossedColumnTest(test.TestCase):
                     dense_shape=(2, 2)),
         })
 
+  def test_old_linear_model(self):
+    """Tests linear_model.
 
-class LinearModelTest(test.TestCase):
-
-  def test_raises_if_empty_feature_columns(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'feature_columns must not be empty'):
-      fc.LinearModel(feature_columns=[])
+    Uses data from test_get_sparse_tesnsors_simple.
+    """
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+      }, (crossed,))
+      bias = get_linear_model_bias()
+      crossed_var = get_linear_model_column_var(crossed)
+      with _initialized_session() as sess:
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
+        sess.run(bias.assign((.1,)))
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
-  def test_should_be_feature_column(self):
-    with self.assertRaisesRegexp(ValueError, 'must be a FeatureColumn'):
-      fc.LinearModel(feature_columns='NotSupported')
+  def test_old_linear_model_with_weights(self):
 
-  def test_should_be_dense_or_categorical_column(self):
+    class _TestColumnWithWeights(BaseFeatureColumnForTests,
+                                 fc.CategoricalColumn,
+                                 fc_old._CategoricalColumn):
+      """Produces sparse IDs and sparse weights."""
 
-    class NotSupportedColumn(fc.FeatureColumn):
+      @property
+      def _is_v2_column(self):
+        return True
 
       @property
       def name(self):
-        return 'NotSupportedColumn'
-
-      def transform_feature(self, transformation_cache, state_manager):
-        pass
+        return 'test_column'
 
       @property
       def parse_example_spec(self):
-        pass
+        return {
+            self.name:
+                parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name):
+                parsing_ops.VarLenFeature(dtypes.float32),
+        }
 
-    with self.assertRaisesRegexp(
-        ValueError, 'must be either a DenseColumn or CategoricalColumn'):
-      fc.LinearModel(feature_columns=[NotSupportedColumn()])
+      @property
+      def _parse_example_spec(self):
+        return self.parse_example_spec
 
-  def test_does_not_support_dict_columns(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'Expected feature_columns to be iterable, found dict.'):
-      fc.LinearModel(feature_columns={'a': fc.numeric_column('a')})
+      @property
+      def num_buckets(self):
+        return 5
 
-  def test_raises_if_duplicate_name(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'Duplicate feature column name found for columns'):
-      fc.LinearModel(
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+      @property
+      def _num_buckets(self):
+        return self.num_buckets
 
-  def test_dense_bias(self):
-    price = fc.numeric_column('price')
-    with ops.Graph().as_default():
-      features = {'price': [[1.], [5.]]}
-      model = fc.LinearModel([price])
-      predictions = model(features)
-      price_var, bias = model.variables
-      with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        sess.run(price_var.assign([[10.]]))
-        sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+      def transform_feature(self, transformation_cache, state_manager):
+        raise ValueError('Should not be called.')
 
-  def test_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    with ops.Graph().as_default():
-      wire_tensor = sparse_tensor.SparseTensor(
-          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
-          indices=[[0, 0], [1, 0], [1, 1]],
-          dense_shape=[2, 2])
-      features = {'wire_cast': wire_tensor}
-      model = fc.LinearModel([wire_cast])
-      predictions = model(features)
+      def _transform_feature(self, inputs):
+        return (inputs.get(self.name),
+                inputs.get('{}_weights'.format(self.name)))
+
+      def get_sparse_tensors(self, transformation_cache, state_manager):
+        raise ValueError('Should not be called.')
+
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
+                              trainable=None):
+        """Populates both id_tensor and weight_tensor."""
+        ids_and_weights = inputs.get(self)
+        return fc.CategoricalColumn.IdWeightPair(
+            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        fc_old.linear_model({
+            t.name:
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[0, 1, 2],
+                    dense_shape=(2, 2)),
+            '{}_weights'.format(t.name):
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[1., 10., 2.],
+                    dense_shape=(2, 2)),
+            'c':
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=['cA', 'cB', 'cC'],
+                    dense_shape=(2, 2)),
+        }, (crossed,))
+
+  def test_old_linear_model_old_numeric(self):
+    """Tests linear_model.
+
+    Uses data from test_get_sparse_tesnsors_simple.
+    """
+    a = fc_old._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+      }, (crossed,))
+      bias = get_linear_model_bias()
+      crossed_var = get_linear_model_column_var(crossed)
+      with _initialized_session() as sess:
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
+        sess.run(bias.assign((.1,)))
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+
+    self.assertEqual([b, 'c'], crossed.parents)
+
+    config = crossed._get_config()
+    self.assertEqual({
+        'hash_bucket_size':
+            5,
+        'hash_key':
+            5,
+        'keys': ({
+            'config': {
+                'boundaries': (0, 1),
+                'source_column': {
+                    'config': {
+                        'dtype': 'int32',
+                        'default_value': None,
+                        'key': 'a',
+                        'normalizer_fn': None,
+                        'shape': (2,)
+                    },
+                    'class_name': 'NumericColumn'
+                }
+            },
+            'class_name': 'BucketizedColumn'
+        }, 'c')
+    }, config)
+
+    new_crossed = fc.CrossedColumn._from_config(config)
+    self.assertEqual(crossed, new_crossed)
+    self.assertIsNot(b, new_crossed.keys[0])
+
+    new_crossed = fc.CrossedColumn._from_config(
+        config, columns_by_name={b.name: b})
+    self.assertEqual(crossed, new_crossed)
+    self.assertIs(b, new_crossed.keys[0])
+
+
+class LinearModelTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.LinearModel(feature_columns=[])
+
+  def test_should_be_feature_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a FeatureColumn'):
+      fc.LinearModel(feature_columns='NotSupported')
+
+  def test_should_be_dense_or_categorical_column(self):
+
+    class NotSupportedColumn(BaseFeatureColumnForTests):
+
+      @property
+      def _is_v2_column(self):
+        return True
+
+      @property
+      def name(self):
+        return 'NotSupportedColumn'
+
+      def transform_feature(self, transformation_cache, state_manager):
+        pass
+
+      @property
+      def parse_example_spec(self):
+        pass
+
+    with self.assertRaisesRegexp(
+        ValueError, 'must be either a DenseColumn or CategoricalColumn'):
+      fc.LinearModel(feature_columns=[NotSupportedColumn()])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.LinearModel(feature_columns={'a': fc.numeric_column('a')})
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.LinearModel(
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_not_dict_input_features(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = [[1.], [5.]]
+      model = fc.LinearModel([price])
+      with self.assertRaisesRegexp(ValueError, 'We expected a dictionary here'):
+        model(features)
+
+  def test_dense_bias(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      model = fc.LinearModel([price])
+      predictions = model(features)
+      price_var, bias = model.variables
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias))
+        sess.run(price_var.assign([[10.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
+
+  def test_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      model = fc.LinearModel([wire_cast])
+      predictions = model(features)
       wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1182,12 +1692,17 @@ class LinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
 
-    class _DenseAndSparseColumn(fc.DenseColumn, fc.CategoricalColumn):
+    class _DenseAndSparseColumn(BaseFeatureColumnForTests, fc.DenseColumn,
+                                fc.CategoricalColumn):
+
+      @property
+      def _is_v2_column(self):
+        return True
 
       @property
       def name(self):
@@ -1229,10 +1744,11 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       dense_and_sparse_column_var, bias = model.variables
       with _initialized_session() as sess:
-        sess.run(dense_and_sparse_column_var.assign(
-            [[10.], [100.], [1000.], [10000.]]))
+        sess.run(
+            dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
+                                                [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
     price = fc.numeric_column('price')
@@ -1242,12 +1758,12 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1261,15 +1777,15 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
-            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
-                1000., 1100., 1200.
-            ], [10000., 11000., 12000.]]))
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
+                                  [1000., 1100., 1200.],
+                                  [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
@@ -1279,9 +1795,9 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, _ = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1296,7 +1812,7 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       wire_cast_var, _ = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -1319,7 +1835,7 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_sparse_combiner_with_negative_weights(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1340,7 +1856,7 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+        self.assertAllClose([[1005.], [-9985.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
     price = fc.numeric_column('price', shape=2)
@@ -1350,12 +1866,12 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -1375,32 +1891,29 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
     price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [5., 6.]],
-          'price2': [[3.], [4.]]
-      }
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       model = fc.LinearModel([price1, price2])
       predictions = model(features)
       price1_var, price2_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_dense_trainable_default(self):
     price = fc.numeric_column('price')
@@ -1501,6 +2014,8 @@ class LinearModelTest(test.TestCase):
           'sparse_feature': [['a'], ['x']],
       }
       model(features)
+      for var in model.variables:
+        self.assertTrue(isinstance(var, variables_lib.RefVariable))
       variable_names = [var.name for var in model.variables]
       self.assertItemsEqual([
           'linear_model/dense_feature_bucketized/weights:0',
@@ -1510,6 +2025,23 @@ class LinearModelTest(test.TestCase):
           'linear_model/bias_weights:0',
       ], variable_names)
 
+  def test_fit_and_predict(self):
+    columns = [fc.numeric_column('a')]
+
+    model = fc.LinearModel(columns)
+    model.compile(
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        loss='categorical_crossentropy',
+        metrics=['accuracy'])
+
+    x = {'a': np.random.random((10, 1))}
+    y = np.random.randint(20, size=(10, 1))
+    y = keras.utils.to_categorical(y, num_classes=20)
+    model.fit(x, y, epochs=1, batch_size=5)
+    model.fit(x, y, epochs=1, batch_size=5)
+    model.evaluate(x, y, batch_size=5)
+    model.predict(x, batch_size=5)
+
   def test_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
     price2 = fc.numeric_column('price2')
@@ -1574,6 +2106,7 @@ class LinearModelTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  @test_util.run_deprecated_v1
   def test_with_numpy_input_fn(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(
@@ -1606,11 +2139,13 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]],
+                          self.evaluate(net))
 
       coord.request_stop()
       coord.join(threads)
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(
@@ -1624,11 +2159,16 @@ class LinearModelTest(test.TestCase):
 
     # Provides 1-dim tensor and dense tensor.
     features = {
-        'price': constant_op.constant([-1., 12.,]),
-        'body-style': sparse_tensor.SparseTensor(
-            indices=((0,), (1,)),
-            values=('sedan', 'hardtop'),
-            dense_shape=(2,)),
+        'price':
+            constant_op.constant([
+                -1.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
     }
     self.assertEqual(1, features['price'].shape.ndims)
     self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
@@ -1642,8 +2182,10 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(
@@ -1668,9 +2210,7 @@ class LinearModelTest(test.TestCase):
 
     price_data = np.array([-1., 12.])
     body_style_data = sparse_tensor.SparseTensorValue(
-        indices=((0,), (1,)),
-        values=('sedan', 'hardtop'),
-        dense_shape=(2,))
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
     country_data = np.array(['US', 'CA'])
 
     model = fc.LinearModel([price_buckets, body_style, country])
@@ -1690,6 +2230,7 @@ class LinearModelTest(test.TestCase):
                                   features['country']: country_data
                               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
     price = fc.numeric_column('price')
     features = {
@@ -1725,158 +2266,1044 @@ class LinearModelTest(test.TestCase):
       price_var1, bias1 = model1.variables
       price_var2, bias2 = model2.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias1.eval())
+        self.assertAllClose([0.], self.evaluate(bias1))
         sess.run(price_var1.assign([[10.]]))
         sess.run(bias1.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions1.eval())
-        self.assertAllClose([0.], bias2.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions1))
+        self.assertAllClose([0.], self.evaluate(bias2))
         sess.run(price_var2.assign([[10.]]))
         sess.run(bias2.assign([5.]))
-        self.assertAllClose([[25.], [105.]], predictions2.eval())
-
-
-class FeatureLayerTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_retrieving_input(self):
-    features = {'a': [0.]}
-    feature_layer = FeatureLayer(fc.numeric_column('a'))
-    inputs = self.evaluate(feature_layer(features))
-    self.assertAllClose([[0.]], inputs)
-
-  def test_reuses_variables(self):
-    with context.eager_mode():
-      sparse_input = sparse_tensor.SparseTensor(
-          indices=((0, 0), (1, 0), (2, 0)),
-          values=(0, 1, 2),
-          dense_shape=(3, 3))
-
-      # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(
-          key='a', num_buckets=3)
-      embedding_dimension = 2
-      def _embedding_column_initializer(shape, dtype, partition_info):
-        del shape  # unused
-        del dtype  # unused
-        del partition_info  # unused
-        embedding_values = (
-            (1, 0),  # id 0
-            (0, 1),  # id 1
-            (1, 1))  # id 2
-        return embedding_values
-
-      embedding_column = fc.embedding_column(
-          categorical_column,
-          dimension=embedding_dimension,
-          initializer=_embedding_column_initializer)
-
-      feature_layer = FeatureLayer([embedding_column])
-      features = {'a': sparse_input}
-
-      inputs = feature_layer(features)
-      variables = feature_layer.variables
-
-      # Sanity check: test that the inputs are correct.
-      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
+        self.assertAllClose([[25.], [105.]], self.evaluate(predictions2))
 
-      # Check that only one variable was created.
-      self.assertEqual(1, len(variables))
 
-      # Check that invoking feature_layer on the same features does not create
-      # additional variables
-      _ = feature_layer(features)
-      self.assertEqual(1, len(variables))
-      self.assertEqual(variables[0], feature_layer.variables[0])
+class OldLinearModelTest(test.TestCase):
 
-  def test_feature_column_feature_layer_gradient(self):
-    with context.eager_mode():
-      sparse_input = sparse_tensor.SparseTensor(
-          indices=((0, 0), (1, 0), (2, 0)),
-          values=(0, 1, 2),
-          dense_shape=(3, 3))
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc_old.linear_model(features={}, feature_columns=[])
 
-      # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(
-          key='a', num_buckets=3)
-      embedding_dimension = 2
+  def test_should_be_feature_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
+      fc_old.linear_model(features={'a': [[0]]}, feature_columns='NotSupported')
 
-      def _embedding_column_initializer(shape, dtype, partition_info):
-        del shape  # unused
-        del dtype  # unused
-        del partition_info  # unused
-        embedding_values = (
-            (1, 0),  # id 0
-            (0, 1),  # id 1
-            (1, 1))  # id 2
-        return embedding_values
+  def test_should_be_dense_or_categorical_column(self):
 
-      embedding_column = fc.embedding_column(
-          categorical_column,
-          dimension=embedding_dimension,
-          initializer=_embedding_column_initializer)
+    class NotSupportedColumn(BaseFeatureColumnForTests, fc.FeatureColumn,
+                             fc_old._FeatureColumn):
 
-      feature_layer = FeatureLayer([embedding_column])
-      features = {'a': sparse_input}
+      @property
+      def _is_v2_column(self):
+        return True
 
-      def scale_matrix():
-        matrix = feature_layer(features)
-        return 2 * matrix
+      @property
+      def name(self):
+        return 'NotSupportedColumn'
 
-      # Sanity check: Verify that scale_matrix returns the correct output.
-      self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())
+      def transform_feature(self, transformation_cache, state_manager):
+        pass
 
-      # Check that the returned gradient is correct.
-      grad_function = backprop.implicit_grad(scale_matrix)
-      grads_and_vars = grad_function()
-      indexed_slice = grads_and_vars[0][0]
-      gradient = grads_and_vars[0][0].values
+      def _transform_feature(self, inputs):
+        pass
 
-      self.assertAllEqual([0, 1, 2], indexed_slice.indices)
-      self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
+      @property
+      def parse_example_spec(self):
+        pass
 
-  def test_raises_if_empty_feature_columns(self):
-    with self.assertRaisesRegexp(ValueError,
-                                 'feature_columns must not be empty'):
-      FeatureLayer(feature_columns=[])(features={})
+      @property
+      def _parse_example_spec(self):
+        pass
 
-  def test_should_be_dense_column(self):
-    with self.assertRaisesRegexp(ValueError, 'must be a DenseColumn'):
-      FeatureLayer(feature_columns=[
-          fc.categorical_column_with_hash_bucket('wire_cast', 4)
-      ])(
-          features={
-              'a': [[0]]
-          })
+    with self.assertRaisesRegexp(
+        ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
+      fc_old.linear_model(
+          features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
 
   def test_does_not_support_dict_columns(self):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
-      FeatureLayer(feature_columns={'a': fc.numeric_column('a')})(
-          features={
-              'a': [[0]]
-          })
-
-  def test_bare_column(self):
-    with ops.Graph().as_default():
-      features = features = {'a': [0.]}
-      net = FeatureLayer(fc.numeric_column('a'))(features)
-      with _initialized_session():
-        self.assertAllClose([[0.]], net.eval())
-
-  def test_column_generator(self):
-    with ops.Graph().as_default():
-      features = features = {'a': [0.], 'b': [1.]}
-      columns = (fc.numeric_column(key) for key in features)
-      net = FeatureLayer(columns)(features)
-      with _initialized_session():
-        self.assertAllClose([[0., 1.]], net.eval())
+      fc_old.linear_model(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
-      FeatureLayer(
+      fc_old.linear_model(
+          features={'a': [[0]]},
           feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])(
+                           fc.numeric_column('a')])
+
+  def test_dense_bias(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc_old.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias))
+        sess.run(price_var.assign([[10.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
+
+  def test_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc_old.linear_model(features, [wire_cast])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
+
+  def test_dense_and_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
+      predictions = fc_old.linear_model(features, [wire_cast, price])
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
+
+  def test_dense_and_sparse_column(self):
+    """When the column is both dense and sparse, uses sparse tensors."""
+
+    class _DenseAndSparseColumn(BaseFeatureColumnForTests, fc.DenseColumn,
+                                fc.CategoricalColumn, fc_old._DenseColumn,
+                                fc_old._CategoricalColumn):
+
+      @property
+      def _is_v2_column(self):
+        return True
+
+      @property
+      def name(self):
+        return 'dense_and_sparse_column'
+
+      @property
+      def parse_example_spec(self):
+        return {self.name: parsing_ops.VarLenFeature(self.dtype)}
+
+      @property
+      def _parse_example_spec(self):
+        return self.parse_example_spec
+
+      def transform_feature(self, transformation_cache, state_manager):
+        raise ValueError('Should not use this method.')
+
+      def _transform_feature(self, inputs):
+        return inputs.get(self.name)
+
+      @property
+      def variable_shape(self):
+        return self.variable_shape
+
+      @property
+      def _variable_shape(self):
+        return self.variable_shape
+
+      def get_dense_tensor(self, transformation_cache, state_manager):
+        raise ValueError('Should not use this method.')
+
+      def _get_dense_tensor(self, inputs):
+        raise ValueError('Should not use this method.')
+
+      @property
+      def num_buckets(self):
+        return 4
+
+      @property
+      def _num_buckets(self):
+        return self.num_buckets
+
+      def get_sparse_tensors(self, transformation_cache, state_manager):
+        raise ValueError('Should not use this method.')
+
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
+                              trainable=None):
+        sp_tensor = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [1, 0], [1, 1]],
+            values=[2, 0, 3],
+            dense_shape=[2, 2])
+        return fc.CategoricalColumn.IdWeightPair(sp_tensor, None)
+
+    dense_and_sparse_column = _DenseAndSparseColumn()
+    with ops.Graph().as_default():
+      sp_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {dense_and_sparse_column.name: sp_tensor}
+      predictions = fc_old.linear_model(features, [dense_and_sparse_column])
+      bias = get_linear_model_bias()
+      dense_and_sparse_column_var = get_linear_model_column_var(
+          dense_and_sparse_column)
+      with _initialized_session() as sess:
+        sess.run(
+            dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
+                                                [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
+
+  def test_dense_multi_output(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = fc_old.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
+        sess.run(price_var.assign([[10., 100., 1000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
+                            self.evaluate(predictions))
+
+  def test_sparse_multi_output(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc_old.linear_model(features, [wire_cast], units=3)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
+        sess.run(
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
+                                  [1000., 1100., 1200.],
+                                  [10000., 11000., 12000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
+                            self.evaluate(predictions))
+
+  def test_dense_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc_old.linear_model(features, [price])
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
+
+  def test_sparse_multi_rank(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = array_ops.sparse_placeholder(dtypes.string)
+      wire_value = sparse_tensor.SparseTensorValue(
+          values=['omar', 'stringer', 'marlo', 'omar'],  # hashed = [2, 0, 3, 2]
+          indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
+          dense_shape=[2, 2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc_old.linear_model(features, [wire_cast])
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
+        self.assertAllClose(
+            np.zeros((2, 1)),
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        self.assertAllClose(
+            [[1010.], [11000.]],
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+
+  def test_sparse_combiner(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = fc_old.linear_model(
+          features, [wire_cast], sparse_combiner='mean')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
+
+  def test_sparse_combiner_with_negative_weights(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
+
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {
+          'wire_cast': wire_tensor,
+          'weights': constant_op.constant([[1., 1., -1.0]])
+      }
+      predictions = fc_old.linear_model(
+          features, [wire_cast_weights], sparse_combiner='sum')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [-9985.]], self.evaluate(predictions))
+
+  def test_dense_multi_dimension_multi_output(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = fc_old.linear_model(features, [price], units=3)
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
+        sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
+        sess.run(bias.assign([2., 3., 4.]))
+        self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
+                            self.evaluate(predictions))
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      with self.assertRaisesRegexp(
+          Exception,
+          r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+        fc_old.linear_model(features, [price])
+
+  def test_dense_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      predictions = fc_old.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
+
+  def test_dense_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      predictions = fc_old.linear_model(features, [price1, price2])
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
+        sess.run(price1_var.assign([[10.], [100.]]))
+        sess.run(price2_var.assign([[1000.]]))
+        sess.run(bias.assign([7.]))
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
+
+  def test_fills_cols_to_vars(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      cols_to_vars = {}
+      fc_old.linear_model(features, [price1, price2], cols_to_vars=cols_to_vars)
+      bias = get_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      self.assertAllEqual(cols_to_vars['bias'], [bias])
+      self.assertAllEqual(cols_to_vars[price1], [price1_var])
+      self.assertAllEqual(cols_to_vars[price2], [price2_var])
+
+  def test_fills_cols_to_vars_partitioned_variables(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2', shape=3)
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [6., 7.]],
+          'price2': [[3., 4., 5.], [8., 9., 10.]]
+      }
+      cols_to_vars = {}
+      with variable_scope.variable_scope(
+          'linear',
+          partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
+        fc_old.linear_model(
+            features, [price1, price2], cols_to_vars=cols_to_vars)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertEqual([0.], self.evaluate(cols_to_vars['bias'][0]))
+      # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
+      self.assertAllEqual([[0.]], self.evaluate(cols_to_vars[price1][0]))
+      self.assertAllEqual([[0.]], self.evaluate(cols_to_vars[price1][1]))
+      # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
+      # a [1, 1] Variable.
+      self.assertAllEqual([[0.], [0.]], self.evaluate(cols_to_vars[price2][0]))
+      self.assertAllEqual([[0.]], self.evaluate(cols_to_vars[price2][1]))
+
+  def test_fills_cols_to_output_tensors(self):
+    # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
+    # _BucketizedColumn, and an _EmbeddingColumn.  Only the _EmbeddingColumn
+    # creates a Variable.
+    apple_numeric_column = fc.numeric_column('apple_numeric_column')
+    banana_dense_feature = fc.numeric_column('banana_dense_feature')
+    banana_dense_feature_bucketized = fc.bucketized_column(
+        banana_dense_feature, boundaries=[0.])
+    cherry_sparse_column = fc.categorical_column_with_hash_bucket(
+        'cherry_sparse_feature', hash_bucket_size=5)
+    dragonfruit_embedding_column = fc.embedding_column(
+        cherry_sparse_column, dimension=10)
+    with ops.Graph().as_default():
+      features = {
+          'apple_numeric_column': [[3.], [4.]],
+          'banana_dense_feature': [[-1.], [4.]],
+          'cherry_sparse_feature': [['a'], ['x']],
+      }
+      cols_to_output_tensors = {}
+      all_cols = [
+          apple_numeric_column, banana_dense_feature_bucketized,
+          dragonfruit_embedding_column
+      ]
+      input_layer = fc_old.input_layer(
+          features, all_cols, cols_to_output_tensors=cols_to_output_tensors)
+
+      # We check the mapping by checking that we have the right keys,
+      # and that the values (output_tensors) were indeed the ones used to
+      # form the input layer.
+      self.assertItemsEqual(all_cols, cols_to_output_tensors.keys())
+      input_layer_inputs = [tensor for tensor in input_layer.op.inputs[:-1]]
+      output_tensors = [tensor for tensor in cols_to_output_tensors.values()]
+      self.assertItemsEqual(input_layer_inputs, output_tensors)
+
+  def test_dense_collection(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc_old.linear_model(features, [price], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      self.assertIn(bias, my_vars)
+      self.assertIn(price_var, my_vars)
+
+  def test_sparse_collection(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc_old.linear_model(features, [wire_cast], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, my_vars)
+      self.assertIn(wire_cast_var, my_vars)
+
+  def test_dense_trainable_default(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc_old.linear_model(features, [price])
+      bias = get_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(price_var, trainable_vars)
+
+  def test_sparse_trainable_default(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc_old.linear_model(features, [wire_cast])
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(wire_cast_var, trainable_vars)
+
+  def test_dense_trainable_false(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      fc_old.linear_model(features, [price], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_sparse_trainable_false(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      fc_old.linear_model(features, [wire_cast], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc_old.linear_model(
+          features, [price_a, wire_cast, price_b],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      fc_old.linear_model(
+          features, [wire_cast, price_b, price_a],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+      fc_old.linear_model(features, [price1, price2])
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc_old.linear_model(features, [price1, price2, price3])
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      predictions = fc_old.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'must have the same size and shape'):
+          sess.run(
+              predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      predictions = fc_old.linear_model(features, [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            predictions,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+  @test_util.run_deprecated_v1
+  def test_with_1d_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price':
+            constant_op.constant([
+                -1.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
+    }
+    self.assertEqual(1, features['price'].shape.ndims)
+    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
+
+    net = fc_old.linear_model(features, [price_buckets, body_style])
+    with _initialized_session() as sess:
+      bias = get_linear_model_bias()
+      price_buckets_var = get_linear_model_column_var(price_buckets)
+      body_style_var = get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
+
+  @test_util.run_deprecated_v1
+  def test_with_1d_unknown_shape_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    country = fc.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+        'body-style': array_ops.sparse_placeholder(dtypes.string),
+        'country': array_ops.placeholder(dtypes.string),
+    }
+    self.assertIsNone(features['price'].shape.ndims)
+    self.assertIsNone(features['body-style'].get_shape().ndims)
+
+    price_data = np.array([-1., 12.])
+    body_style_data = sparse_tensor.SparseTensorValue(
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
+    country_data = np.array(['US', 'CA'])
+
+    net = fc_old.linear_model(features, [price_buckets, body_style, country])
+    bias = get_linear_model_bias()
+    price_buckets_var = get_linear_model_column_var(price_buckets)
+    body_style_var = get_linear_model_column_var(body_style)
+    with _initialized_session() as sess:
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          sess.run(
+                              net,
+                              feed_dict={
+                                  features['price']: price_data,
+                                  features['body-style']: body_style_data,
+                                  features['country']: country_data
+                              }))
+
+  @test_util.run_deprecated_v1
+  def test_with_rank_0_feature(self):
+    price = fc.numeric_column('price')
+    features = {
+        'price': constant_op.constant(0),
+    }
+    self.assertEqual(0, features['price'].shape.ndims)
+
+    # Static rank 0 should fail
+    with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
+      fc_old.linear_model(features, [price])
+
+    # Dynamic rank 0 should fail
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+    }
+    net = fc_old.linear_model(features, [price])
+    self.assertEqual(1, net.shape[1])
+    with _initialized_session() as sess:
+      with self.assertRaisesOpError('Feature .* cannot have rank 0'):
+        sess.run(net, feed_dict={features['price']: np.array(1)})
+
+  def test_multiple_linear_models(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features1 = {'price': [[1.], [5.]]}
+      features2 = {'price': [[2.], [10.]]}
+      predictions1 = fc_old.linear_model(features1, [price])
+      predictions2 = fc_old.linear_model(features2, [price])
+      bias1 = get_linear_model_bias(name='linear_model')
+      bias2 = get_linear_model_bias(name='linear_model_1')
+      price_var1 = get_linear_model_column_var(price, name='linear_model')
+      price_var2 = get_linear_model_column_var(price, name='linear_model_1')
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], self.evaluate(bias1))
+        sess.run(price_var1.assign([[10.]]))
+        sess.run(bias1.assign([5.]))
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions1))
+        self.assertAllClose([0.], self.evaluate(bias2))
+        sess.run(price_var2.assign([[10.]]))
+        sess.run(bias2.assign([5.]))
+        self.assertAllClose([[25.], [105.]], self.evaluate(predictions2))
+
+  @test_util.run_deprecated_v1
+  def test_linear_model_v1_shared_embedding_all_other_v2(self):
+    price = fc.numeric_column('price')  # v2
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)  # v2
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)  # v2
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)  # v2
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)  # v2
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)  # v1
+    all_cols = [
+        price, some_embedding_column, shared_embedding_a, shared_embedding_b
+    ]
+
+    with ops.Graph().as_default():
+      features = {
+          'price': [[3.], [4.]],
+          'sparse_feature': [['a'], ['x']],
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      fc_old.linear_model(features, all_cols)
+      bias = get_linear_model_bias()
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([0.], self.evaluate(bias))
+
+  @test_util.run_deprecated_v1
+  def test_linear_model_v1_shared_embedding_with_v2_cat_all_other_v2(self):
+    price = fc.numeric_column('price')  # v2
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)  # v2
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)  # v2
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)  # v2
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)  # v2
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)  # v1
+    all_cols = [
+        price, some_embedding_column, shared_embedding_a, shared_embedding_b
+    ]
+
+    with ops.Graph().as_default():
+      features = {
+          'price': [[3.], [4.]],
+          'sparse_feature': [['a'], ['x']],
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      fc_old.linear_model(features, all_cols)
+      bias = get_linear_model_bias()
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([0.], self.evaluate(bias))
+
+  @test_util.run_deprecated_v1
+  def test_linear_model_v1_v2_mix(self):
+    price = fc.numeric_column('price')  # v2
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)  # v1
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)  # v1
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)  # v2
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)  # v2
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)  # v1
+    all_cols = [
+        price, some_embedding_column, shared_embedding_a, shared_embedding_b
+    ]
+
+    with ops.Graph().as_default():
+      features = {
+          'price': [[3.], [4.]],
+          'sparse_feature': [['a'], ['x']],
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      fc_old.linear_model(features, all_cols)
+      bias = get_linear_model_bias()
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([0.], self.evaluate(bias))
+
+  @test_util.run_deprecated_v1
+  def test_linear_model_v2_shared_embedding_all_other_v1(self):
+    price = fc.numeric_column('price')  # v1
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)  # v1
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)  # v1
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)  # v2
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)  # v2
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns_v2(
+        [categorical_column_a, categorical_column_b], dimension=2)  # v2
+    all_cols = [
+        price, some_embedding_column, shared_embedding_a, shared_embedding_b
+    ]
+
+    with ops.Graph().as_default():
+      features = {
+          'price': [[3.], [4.]],
+          'sparse_feature': [['a'], ['x']],
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      with self.assertRaisesRegexp(ValueError,
+                                   'SharedEmbeddingColumns are not supported'):
+        fc_old.linear_model(features, all_cols)
+
+
+class DenseFeaturesTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_retrieving_input(self):
+    features = {'a': [0.]}
+    dense_features = fc.DenseFeatures(fc.numeric_column('a'))
+    inputs = self.evaluate(dense_features(features))
+    self.assertAllClose([[0.]], inputs)
+
+  def test_reuses_variables(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(
+          key='a', num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      dense_features = fc.DenseFeatures([embedding_column])
+      features = {'a': sparse_input}
+
+      inputs = dense_features(features)
+      variables = dense_features.variables
+
+      # Sanity check: test that the inputs are correct.
+      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
+
+      # Check that only one variable was created.
+      self.assertEqual(1, len(variables))
+
+      # Check that invoking dense_features on the same features does not create
+      # additional variables
+      _ = dense_features(features)
+      self.assertEqual(1, len(variables))
+      self.assertEqual(variables[0], dense_features.variables[0])
+
+  def test_feature_column_dense_features_gradient(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(
+          key='a', num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      dense_features = fc.DenseFeatures([embedding_column])
+      features = {'a': sparse_input}
+
+      def scale_matrix():
+        matrix = dense_features(features)
+        return 2 * matrix
+
+      # Sanity check: Verify that scale_matrix returns the correct output.
+      self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())
+
+      # Check that the returned gradient is correct.
+      grad_function = backprop.implicit_grad(scale_matrix)
+      grads_and_vars = grad_function()
+      indexed_slice = grads_and_vars[0][0]
+      gradient = grads_and_vars[0][0].values
+
+      self.assertAllEqual([0, 1, 2], indexed_slice.indices)
+      self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc.DenseFeatures(feature_columns=[])(features={})
+
+  def test_should_be_dense_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a DenseColumn'):
+      fc.DenseFeatures(feature_columns=[
+          fc.categorical_column_with_hash_bucket('wire_cast', 4)
+      ])(
+          features={
+              'a': [[0]]
+          })
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.DenseFeatures(feature_columns={'a': fc.numeric_column('a')})(
+          features={
+              'a': [[0]]
+          })
+
+  def test_bare_column(self):
+    with ops.Graph().as_default():
+      features = features = {'a': [0.]}
+      net = fc.DenseFeatures(fc.numeric_column('a'))(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0.]], self.evaluate(net))
+
+  def test_column_generator(self):
+    with ops.Graph().as_default():
+      features = features = {'a': [0.], 'b': [1.]}
+      columns = (fc.numeric_column(key) for key in features)
+      net = fc.DenseFeatures(columns)(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1.]], self.evaluate(net))
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc.DenseFeatures(
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])(
                                features={
                                    'a': [[0]]
                                })
@@ -1885,32 +3312,645 @@ class FeatureLayerTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      net = FeatureLayer([price])(features)
-      with _initialized_session():
-        self.assertAllClose([[1.], [5.]], net.eval())
+      net = fc.DenseFeatures([price])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1.], [5.]], self.evaluate(net))
+
+  def test_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      net = fc.DenseFeatures([price])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
+
+  def test_compute_output_shape(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2', shape=4)
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1., 2.], [5., 6.]],
+          'price2': [[3., 4., 5., 6.], [7., 8., 9., 10.]]
+      }
+      dense_features = fc.DenseFeatures([price1, price2])
+      self.assertEqual((None, 6), dense_features.compute_output_shape((None,)))
+      net = dense_features(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]],
+                          self.evaluate(net))
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      with self.assertRaisesRegexp(
+          Exception,
+          r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+        fc.DenseFeatures([price])(features)
+
+  def test_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      net = fc.DenseFeatures([price])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
+
+  def test_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      net = fc.DenseFeatures([price1, price2])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
+
+  def test_cols_to_output_tensors(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      cols_dict = {}
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      dense_features = fc.DenseFeatures([price1, price2])
+      net = dense_features(features, cols_dict)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]],
+                          self.evaluate(cols_dict[price1]))
+      self.assertAllClose([[3.], [4.]], self.evaluate(cols_dict[price2]))
+      self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    with ops.Graph().as_default():
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+      }
+      net1 = fc.DenseFeatures([price_a, price_b])(features)
+      net2 = fc.DenseFeatures([price_b, price_a])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 3.]], self.evaluate(net1))
+      self.assertAllClose([[1., 3.]], self.evaluate(net2))
+
+  def test_fails_for_categorical_column(self):
+    animal = fc.categorical_column_with_identity('animal', num_buckets=4)
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+      with self.assertRaisesRegexp(Exception, 'must be a DenseColumn'):
+        fc.DenseFeatures([animal])(features)
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.DenseFeatures([price1, price2])(features)
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        fc.DenseFeatures([price1, price2, price3])(features)
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      net = fc.DenseFeatures([price1, price2])(features)
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'Dimensions of inputs should match'):
+          sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      net = fc.DenseFeatures([price1, price2])(features)
+      with _initialized_session() as sess:
+        sess.run(
+            net,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+  def test_multiple_layers_with_same_embedding_column(self):
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
+
+    with ops.Graph().as_default():
+      features = {
+          'sparse_feature': [['a'], ['x']],
+      }
+      all_cols = [some_embedding_column]
+      fc.DenseFeatures(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
+      # Make sure that 2 variables get created in this case.
+      self.assertEqual(2, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      expected_var_names = [
+          'dense_features/sparse_feature_embedding/embedding_weights:0',
+          'dense_features_1/sparse_feature_embedding/embedding_weights:0'
+      ]
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  @test_util.run_deprecated_v1
+  def test_multiple_layers_with_same_shared_embedding_column(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+
+    with ops.Graph().as_default():
+      features = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      all_cols = [embedding_column_a, embedding_column_b]
+      fc.DenseFeatures(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertItemsEqual(
+          ['aaa_bbb_shared_embedding:0'],
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  @test_util.run_deprecated_v1
+  def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+    all_cols = [embedding_column_a, embedding_column_b]
+
+    with ops.Graph().as_default():
+      features = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      fc.DenseFeatures(all_cols)(features)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+
+    with ops.Graph().as_default():
+      features1 = {
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+
+      fc.DenseFeatures(all_cols)(features1)
+      # Make sure that only 1 variable gets created in this case.
+      self.assertEqual(1, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      self.assertItemsEqual(
+          ['aaa_bbb_shared_embedding:0'],
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+
+  @test_util.run_deprecated_v1
+  def test_with_numpy_input_fn(self):
+    embedding_values = (
+        (1., 2., 3., 4., 5.),  # id 0
+        (6., 7., 8., 9., 10.),  # id 1
+        (11., 12., 13., 14., 15.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in dense_features
+    price = fc.numeric_column('price')
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    # one_hot_body_style has 3 dims in dense_features.
+    one_hot_body_style = fc.indicator_column(body_style)
+    # embedded_body_style has 5 dims in dense_features.
+    embedded_body_style = fc.embedding_column(
+        body_style, dimension=5, initializer=_initializer)
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([11., 12., 13., 14.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = fc.DenseFeatures([price, one_hot_body_style, embedded_body_style])(
+        features)
+    self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      # Each row is formed by concatenating `embedded_body_style`,
+      # `one_hot_body_style`, and `price` in order.
+      self.assertAllEqual([[11., 12., 13., 14., 15., 0., 0., 1., 11.],
+                           [1., 2., 3., 4., 5., 1., 0., 0., 12]], sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
+  @test_util.run_deprecated_v1
+  def test_with_1d_sparse_tensor(self):
+    embedding_values = (
+        (1., 2., 3., 4., 5.),  # id 0
+        (6., 7., 8., 9., 10.),  # id 1
+        (11., 12., 13., 14., 15.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in dense_features
+    price = fc.numeric_column('price')
+
+    # one_hot_body_style has 3 dims in dense_features.
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    one_hot_body_style = fc.indicator_column(body_style)
+
+    # embedded_body_style has 5 dims in dense_features.
+    country = fc.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+    embedded_country = fc.embedding_column(
+        country, dimension=5, initializer=_initializer)
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price':
+            constant_op.constant([
+                11.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
+        # This is dense tensor for the categorical_column.
+        'country':
+            constant_op.constant(['CA', 'US']),
+    }
+    self.assertEqual(1, features['price'].shape.ndims)
+    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
+    self.assertEqual(1, features['country'].shape.ndims)
+
+    net = fc.DenseFeatures([price, one_hot_body_style, embedded_country])(
+        features)
+    self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+
+      # Each row is formed by concatenating `embedded_body_style`,
+      # `one_hot_body_style`, and `price` in order.
+      self.assertAllEqual([[0., 0., 1., 11., 12., 13., 14., 15., 11.],
+                           [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
+                          sess.run(net))
+
+  @test_util.run_deprecated_v1
+  def test_with_1d_unknown_shape_sparse_tensor(self):
+    embedding_values = (
+        (1., 2.),  # id 0
+        (6., 7.),  # id 1
+        (11., 12.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return embedding_values
+
+    # price has 1 dimension in dense_features
+    price = fc.numeric_column('price')
+
+    # one_hot_body_style has 3 dims in dense_features.
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    one_hot_body_style = fc.indicator_column(body_style)
+
+    # embedded_body_style has 5 dims in dense_features.
+    country = fc.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+    embedded_country = fc.embedding_column(
+        country, dimension=2, initializer=_initializer)
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+        'body-style': array_ops.sparse_placeholder(dtypes.string),
+        # This is dense tensor for the categorical_column.
+        'country': array_ops.placeholder(dtypes.string),
+    }
+    self.assertIsNone(features['price'].shape.ndims)
+    self.assertIsNone(features['body-style'].get_shape().ndims)
+    self.assertIsNone(features['country'].shape.ndims)
+
+    price_data = np.array([11., 12.])
+    body_style_data = sparse_tensor.SparseTensorValue(
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
+    country_data = np.array([['US'], ['CA']])
+
+    net = fc.DenseFeatures([price, one_hot_body_style, embedded_country])(
+        features)
+    self.assertEqual(1 + 3 + 2, net.shape[1])
+    with _initialized_session() as sess:
+
+      # Each row is formed by concatenating `embedded_body_style`,
+      # `one_hot_body_style`, and `price` in order.
+      self.assertAllEqual(
+          [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]],
+          sess.run(
+              net,
+              feed_dict={
+                  features['price']: price_data,
+                  features['body-style']: body_style_data,
+                  features['country']: country_data
+              }))
+
+  @test_util.run_deprecated_v1
+  def test_with_rank_0_feature(self):
+    # price has 1 dimension in dense_features
+    price = fc.numeric_column('price')
+    features = {
+        'price': constant_op.constant(0),
+    }
+    self.assertEqual(0, features['price'].shape.ndims)
+
+    # Static rank 0 should fail
+    with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
+      fc.DenseFeatures([price])(features)
+
+    # Dynamic rank 0 should fail
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+    }
+    net = fc.DenseFeatures([price])(features)
+    self.assertEqual(1, net.shape[1])
+    with _initialized_session() as sess:
+      with self.assertRaisesOpError('Feature .* cannot have rank 0'):
+        sess.run(net, feed_dict={features['price']: np.array(1)})
+
+
+class InputLayerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_retrieving_input(self):
+    features = {'a': [0.]}
+    input_layer = fc_old.InputLayer(fc.numeric_column('a'))
+    inputs = self.evaluate(input_layer(features))
+    self.assertAllClose([[0.]], inputs)
+
+  def test_reuses_variables(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(
+          key='a', num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = fc_old.InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      inputs = input_layer(features)
+      variables = input_layer.variables
+
+      # Sanity check: test that the inputs are correct.
+      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
+
+      # Check that only one variable was created.
+      self.assertEqual(1, len(variables))
+
+      # Check that invoking input_layer on the same features does not create
+      # additional variables
+      _ = input_layer(features)
+      self.assertEqual(1, len(variables))
+      self.assertEqual(variables[0], input_layer.variables[0])
+
+  def test_feature_column_input_layer_gradient(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(
+          key='a', num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = fc_old.InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      def scale_matrix():
+        matrix = input_layer(features)
+        return 2 * matrix
+
+      # Sanity check: Verify that scale_matrix returns the correct output.
+      self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())
+
+      # Check that the returned gradient is correct.
+      grad_function = backprop.implicit_grad(scale_matrix)
+      grads_and_vars = grad_function()
+      indexed_slice = grads_and_vars[0][0]
+      gradient = grads_and_vars[0][0].values
+
+      self.assertAllEqual([0, 1, 2], indexed_slice.indices)
+      self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
+
+
+class FunctionalInputLayerTest(test.TestCase):
+
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      fc_old.input_layer(features={}, feature_columns=[])
+
+  def test_should_be_dense_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _DenseColumn'):
+      fc_old.input_layer(
+          features={'a': [[0]]},
+          feature_columns=[
+              fc.categorical_column_with_hash_bucket('wire_cast', 4)
+          ])
+
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc_old.input_layer(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+
+  def test_bare_column(self):
+    with ops.Graph().as_default():
+      features = features = {'a': [0.]}
+      net = fc_old.input_layer(features, fc.numeric_column('a'))
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0.]], self.evaluate(net))
+
+  def test_column_generator(self):
+    with ops.Graph().as_default():
+      features = features = {'a': [0.], 'b': [1.]}
+      columns = (fc.numeric_column(key) for key in features)
+      net = fc_old.input_layer(features, columns)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1.]], self.evaluate(net))
+
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      fc_old.input_layer(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
+
+  def test_one_column(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      net = fc_old.input_layer(features, [price])
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1.], [5.]], self.evaluate(net))
 
   def test_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      net = FeatureLayer([price])(features)
-      with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+      net = fc_old.input_layer(features, [price])
 
-  def test_compute_output_shape(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2', shape=4)
-    with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [5., 6.]],
-          'price2': [[3., 4., 5., 6.], [7., 8., 9., 10.]]
-      }
-      feature_layer = FeatureLayer([price1, price2])
-      self.assertEqual((None, 6), feature_layer.compute_output_shape((None,)))
-      net = feature_layer(features)
-      with _initialized_session():
-        self.assertAllClose(
-            [[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]], net.eval())
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -1919,40 +3959,149 @@ class FeatureLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           Exception,
           r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-        FeatureLayer([price])(features)
+        fc_old.input_layer(features, [price])
 
   def test_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
-      net = FeatureLayer([price])(features)
-      with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+      net = fc_old.input_layer(features, [price])
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
     price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      net = fc_old.input_layer(features, [price1, price2])
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
+
+  def test_fills_cols_to_vars(self):
+    # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
+    # _BucketizedColumn, and an _EmbeddingColumn.  Only the _EmbeddingColumn
+    # creates a Variable.
+    price1 = fc.numeric_column('price1')
+    dense_feature = fc.numeric_column('dense_feature')
+    dense_feature_bucketized = fc.bucketized_column(
+        dense_feature, boundaries=[0.])
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
-          'price1': [[1., 2.], [5., 6.]],
-          'price2': [[3.], [4.]]
+          'price1': [[3.], [4.]],
+          'dense_feature': [[-1.], [4.]],
+          'sparse_feature': [['a'], ['x']],
       }
-      net = FeatureLayer([price1, price2])(features)
-      with _initialized_session():
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
-
-  def test_cols_to_output_tensors(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+      cols_to_vars = {}
+      all_cols = [price1, dense_feature_bucketized, some_embedding_column]
+      fc_old.input_layer(features, all_cols, cols_to_vars=cols_to_vars)
+      self.assertItemsEqual(list(cols_to_vars.keys()), all_cols)
+      self.assertEqual(0, len(cols_to_vars[price1]))
+      self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
+      self.assertEqual(1, len(cols_to_vars[some_embedding_column]))
+      self.assertIsInstance(cols_to_vars[some_embedding_column][0],
+                            variables_lib.Variable)
+      self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
+
+  @test_util.run_deprecated_v1
+  def test_fills_cols_to_vars_shared_embedding(self):
+    # Provide 5 DenseColumn's to input_layer: a NumericColumn, a
+    # BucketizedColumn, an EmbeddingColumn, two SharedEmbeddingColumns. The
+    # EmbeddingColumn creates a Variable and the two SharedEmbeddingColumns
+    # shared one variable.
+    price1 = fc.numeric_column('price1')
+    dense_feature = fc.numeric_column('dense_feature')
+    dense_feature_bucketized = fc.bucketized_column(
+        dense_feature, boundaries=[0.])
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b], dimension=2)
     with ops.Graph().as_default():
-      cols_dict = {}
-      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
-      feature_layer = FeatureLayer([price1, price2])
-      net = feature_layer(features, cols_dict)
-      with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], cols_dict[price1].eval())
-        self.assertAllClose([[3.], [4.]], cols_dict[price2].eval())
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+      features = {
+          'price1': [[3.], [4.]],
+          'dense_feature': [[-1.], [4.]],
+          'sparse_feature': [['a'], ['x']],
+          'aaa':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 1, 0),
+                  dense_shape=(2, 2)),
+          'bbb':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(1, 2, 1),
+                  dense_shape=(2, 2)),
+      }
+      cols_to_vars = {}
+      all_cols = [
+          price1, dense_feature_bucketized, some_embedding_column,
+          shared_embedding_a, shared_embedding_b
+      ]
+      fc_old.input_layer(features, all_cols, cols_to_vars=cols_to_vars)
+      self.assertItemsEqual(list(cols_to_vars.keys()), all_cols)
+      self.assertEqual(0, len(cols_to_vars[price1]))
+      self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
+      self.assertEqual(1, len(cols_to_vars[some_embedding_column]))
+      self.assertEqual(1, len(cols_to_vars[shared_embedding_a]))
+      # This is a bug in the current implementation and should be fixed in the
+      # new one.
+      self.assertEqual(0, len(cols_to_vars[shared_embedding_b]))
+      self.assertIsInstance(cols_to_vars[some_embedding_column][0],
+                            variables_lib.Variable)
+      self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
+      self.assertIsInstance(cols_to_vars[shared_embedding_a][0],
+                            variables_lib.Variable)
+      self.assertAllEqual(cols_to_vars[shared_embedding_a][0].shape, [3, 2])
+
+  def test_fills_cols_to_vars_partitioned_variables(self):
+    price1 = fc.numeric_column('price1')
+    dense_feature = fc.numeric_column('dense_feature')
+    dense_feature_bucketized = fc.bucketized_column(
+        dense_feature, boundaries=[0.])
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[3.], [4.]],
+          'dense_feature': [[-1.], [4.]],
+          'sparse_feature': [['a'], ['x']],
+      }
+      cols_to_vars = {}
+      all_cols = [price1, dense_feature_bucketized, some_embedding_column]
+      with variable_scope.variable_scope(
+          'input_from_feature_columns',
+          partitioner=partitioned_variables.fixed_size_partitioner(3, axis=0)):
+        fc_old.input_layer(features, all_cols, cols_to_vars=cols_to_vars)
+      self.assertItemsEqual(list(cols_to_vars.keys()), all_cols)
+      self.assertEqual(0, len(cols_to_vars[price1]))
+      self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
+      self.assertEqual(3, len(cols_to_vars[some_embedding_column]))
+      self.assertEqual(
+          'input_from_feature_columns/input_layer/sparse_feature_embedding/'
+          'embedding_weights/part_0:0',
+          cols_to_vars[some_embedding_column][0].name)
+      self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [2, 10])
+      self.assertAllEqual(cols_to_vars[some_embedding_column][1].shape, [2, 10])
+      self.assertAllEqual(cols_to_vars[some_embedding_column][2].shape, [1, 10])
 
   def test_column_order(self):
     price_a = fc.numeric_column('price_a')
@@ -1962,11 +4111,14 @@ class FeatureLayerTest(test.TestCase):
           'price_a': [[1.]],
           'price_b': [[3.]],
       }
-      net1 = FeatureLayer([price_a, price_b])(features)
-      net2 = FeatureLayer([price_b, price_a])(features)
-      with _initialized_session():
-        self.assertAllClose([[1., 3.]], net1.eval())
-        self.assertAllClose([[1., 3.]], net2.eval())
+      net1 = fc_old.input_layer(features, [price_a, price_b])
+      net2 = fc_old.input_layer(features, [price_b, price_a])
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 3.]], self.evaluate(net1))
+      self.assertAllClose([[1., 3.]], self.evaluate(net2))
 
   def test_fails_for_categorical_column(self):
     animal = fc.categorical_column_with_identity('animal', num_buckets=4)
@@ -1976,8 +4128,8 @@ class FeatureLayerTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
-      with self.assertRaisesRegexp(Exception, 'must be a DenseColumn'):
-        FeatureLayer([animal])(features)
+      with self.assertRaisesRegexp(Exception, 'must be a _DenseColumn'):
+        fc_old.input_layer(features, [animal])
 
   def test_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -1990,7 +4142,7 @@ class FeatureLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        FeatureLayer([price1, price2])(features)
+        fc_old.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -2005,7 +4157,7 @@ class FeatureLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        FeatureLayer([price1, price2, price3])(features)
+        fc_old.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -2015,215 +4167,74 @@ class FeatureLayerTest(test.TestCase):
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
           'price2': [[3.], [4.]]  # batchsize = 2
       }
-      net = FeatureLayer([price1, price2])(features)
+      net = fc_old.input_layer(features, [price1, price2])
       with _initialized_session() as sess:
         with self.assertRaisesRegexp(errors.OpError,
                                      'Dimensions of inputs should match'):
           sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
-  def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    with ops.Graph().as_default():
-      features = {
-          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
-          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
-      }
-      net = FeatureLayer([price1, price2])(features)
-      with _initialized_session() as sess:
-        sess.run(
-            net,
-            feed_dict={
-                features['price1']: [[1.], [5.]],
-                features['price2']: [[1.], [5.]],
-            })
-
-  def test_multiple_layers_with_same_embedding_column(self):
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
-        'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
-        some_sparse_column, dimension=10)
-
-    with ops.Graph().as_default():
-      features = {
-          'sparse_feature': [['a'], ['x']],
-      }
-      all_cols = [some_embedding_column]
-      FeatureLayer(all_cols)(features)
-      FeatureLayer(all_cols)(features)
-      # Make sure that 2 variables get created in this case.
-      self.assertEqual(2, len(
-          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
-      expected_var_names = [
-          'feature_layer/sparse_feature_embedding/embedding_weights:0',
-          'feature_layer_1/sparse_feature_embedding/embedding_weights:0'
-      ]
-      self.assertItemsEqual(
-          expected_var_names,
-          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
-
-  def test_multiple_layers_with_same_shared_embedding_column(self):
-    categorical_column_a = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
-        key='bbb', num_buckets=3)
-    embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2(
-        [categorical_column_b, categorical_column_a],
-        dimension=embedding_dimension)
-    shared_state_manager = fc.SharedEmbeddingStateManager(
-        name='shared_feature_layer')
-
-    with ops.Graph().as_default():
-      features = {
-          'aaa':
-              sparse_tensor.SparseTensor(
-                  indices=((0, 0), (1, 0), (1, 1)),
-                  values=(0, 1, 0),
-                  dense_shape=(2, 2)),
-          'bbb':
-              sparse_tensor.SparseTensor(
-                  indices=((0, 0), (1, 0), (1, 1)),
-                  values=(1, 2, 1),
-                  dense_shape=(2, 2)),
-      }
-      all_cols = [embedding_column_a, embedding_column_b]
-      FeatureLayer(
-          all_cols, shared_state_manager=shared_state_manager)(
-              features)
-      FeatureLayer(
-          all_cols, shared_state_manager=shared_state_manager)(
-              features)
-      # Make sure that only 1 variable gets created in this case.
-      self.assertEqual(1, len(
-          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
-      self.assertItemsEqual(
-          ['shared_feature_layer/aaa_bbb_shared_embedding:0'],
-          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
-
-  def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
-    categorical_column_a = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
-        key='bbb', num_buckets=3)
-    embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns_v2(
-        [categorical_column_b, categorical_column_a],
-        dimension=embedding_dimension)
-    all_cols = [embedding_column_a, embedding_column_b]
-
-    with ops.Graph().as_default():
-      shared_state_manager1 = fc.SharedEmbeddingStateManager(
-          name='shared_feature_layer')
-      features = {
-          'aaa':
-              sparse_tensor.SparseTensor(
-                  indices=((0, 0), (1, 0), (1, 1)),
-                  values=(0, 1, 0),
-                  dense_shape=(2, 2)),
-          'bbb':
-              sparse_tensor.SparseTensor(
-                  indices=((0, 0), (1, 0), (1, 1)),
-                  values=(1, 2, 1),
-                  dense_shape=(2, 2)),
-      }
-      FeatureLayer(
-          all_cols, shared_state_manager=shared_state_manager1)(
-              features)
-      # Make sure that only 1 variable gets created in this case.
-      self.assertEqual(1, len(
-          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
-
-    with ops.Graph().as_default():
-      shared_state_manager2 = fc.SharedEmbeddingStateManager(
-          name='shared_feature_layer')
-      features1 = {
-          'aaa':
-              sparse_tensor.SparseTensor(
-                  indices=((0, 0), (1, 0), (1, 1)),
-                  values=(0, 1, 0),
-                  dense_shape=(2, 2)),
-          'bbb':
-              sparse_tensor.SparseTensor(
-                  indices=((0, 0), (1, 0), (1, 1)),
-                  values=(1, 2, 1),
-                  dense_shape=(2, 2)),
-      }
-
-      FeatureLayer(
-          all_cols, shared_state_manager=shared_state_manager2)(
-              features1)
-      # Make sure that only 1 variable gets created in this case.
-      self.assertEqual(1, len(
-          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
-      self.assertItemsEqual(
-          ['shared_feature_layer/aaa_bbb_shared_embedding:0'],
-          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
-
-  def test_with_numpy_input_fn(self):
-    embedding_values = (
-        (1., 2., 3., 4., 5.),  # id 0
-        (6., 7., 8., 9., 10.),  # id 1
-        (11., 12., 13., 14., 15.)  # id 2
-    )
-    def _initializer(shape, dtype, partition_info):
-      del shape, dtype, partition_info
-      return embedding_values
-
-    # price has 1 dimension in feature_layer
-    price = fc.numeric_column('price')
-    body_style = fc.categorical_column_with_vocabulary_list(
-        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    # one_hot_body_style has 3 dims in feature_layer.
-    one_hot_body_style = fc.indicator_column(body_style)
-    # embedded_body_style has 5 dims in feature_layer.
-    embedded_body_style = fc.embedding_column(
-        body_style, dimension=5, initializer=_initializer)
-
-    input_fn = numpy_io.numpy_input_fn(
-        x={
-            'price': np.array([11., 12., 13., 14.]),
-            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
-        },
-        batch_size=2,
-        shuffle=False)
-    features = input_fn()
-    net = FeatureLayer([price, one_hot_body_style, embedded_body_style])(
-        features)
-    self.assertEqual(1 + 3 + 5, net.shape[1])
-    with _initialized_session() as sess:
-      coord = coordinator.Coordinator()
-      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
-
-      # Each row is formed by concatenating `embedded_body_style`,
-      # `one_hot_body_style`, and `price` in order.
-      self.assertAllEqual(
-          [[11., 12., 13., 14., 15., 0., 0., 1., 11.],
-           [1., 2., 3., 4., 5., 1., 0., 0., 12]],
-          sess.run(net))
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      net = fc_old.input_layer(features, [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            net,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
 
-      coord.request_stop()
-      coord.join(threads)
+  def test_multiple_layers_with_same_embedding_column(self):
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
+        'sparse_feature', hash_bucket_size=5)
+    some_embedding_column = fc.embedding_column(
+        some_sparse_column, dimension=10)
+
+    with ops.Graph().as_default():
+      features = {
+          'sparse_feature': [['a'], ['x']],
+      }
+      all_cols = [some_embedding_column]
+      fc_old.input_layer(features, all_cols)
+      fc_old.input_layer(features, all_cols)
+      # Make sure that 2 variables get created in this case.
+      self.assertEqual(2, len(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
+      expected_var_names = [
+          'input_layer/sparse_feature_embedding/embedding_weights:0',
+          'input_layer_1/sparse_feature_embedding/embedding_weights:0'
+      ]
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
         (6., 7., 8., 9., 10.),  # id 1
         (11., 12., 13., 14., 15.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in input_layer
     price = fc.numeric_column('price')
 
-    # one_hot_body_style has 3 dims in feature_layer.
+    # one_hot_body_style has 3 dims in input_layer.
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
     one_hot_body_style = fc.indicator_column(body_style)
 
-    # embedded_body_style has 5 dims in feature_layer.
+    # embedded_body_style has 5 dims in input_layer.
     country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
     embedded_country = fc.embedding_column(
@@ -2231,48 +4242,56 @@ class FeatureLayerTest(test.TestCase):
 
     # Provides 1-dim tensor and dense tensor.
     features = {
-        'price': constant_op.constant([11., 12.,]),
-        'body-style': sparse_tensor.SparseTensor(
-            indices=((0,), (1,)),
-            values=('sedan', 'hardtop'),
-            dense_shape=(2,)),
+        'price':
+            constant_op.constant([
+                11.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
         # This is dense tensor for the categorical_column.
-        'country': constant_op.constant(['CA', 'US']),
+        'country':
+            constant_op.constant(['CA', 'US']),
     }
     self.assertEqual(1, features['price'].shape.ndims)
     self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
     self.assertEqual(1, features['country'].shape.ndims)
 
-    net = FeatureLayer([price, one_hot_body_style, embedded_country])(features)
+    net = fc_old.input_layer(features,
+                             [price, one_hot_body_style, embedded_country])
     self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
 
       # Each row is formed by concatenating `embedded_body_style`,
       # `one_hot_body_style`, and `price` in order.
-      self.assertAllEqual(
-          [[0., 0., 1., 11., 12., 13., 14., 15., 11.],
-           [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
-          sess.run(net))
+      self.assertAllEqual([[0., 0., 1., 11., 12., 13., 14., 15., 11.],
+                           [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
+                          sess.run(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
     embedding_values = (
         (1., 2.),  # id 0
         (6., 7.),  # id 1
         (11., 12.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in input_layer
     price = fc.numeric_column('price')
 
-    # one_hot_body_style has 3 dims in feature_layer.
+    # one_hot_body_style has 3 dims in input_layer.
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
     one_hot_body_style = fc.indicator_column(body_style)
 
-    # embedded_body_style has 5 dims in feature_layer.
+    # embedded_body_style has 5 dims in input_layer.
     country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
     embedded_country = fc.embedding_column(
@@ -2291,12 +4310,11 @@ class FeatureLayerTest(test.TestCase):
 
     price_data = np.array([11., 12.])
     body_style_data = sparse_tensor.SparseTensorValue(
-        indices=((0,), (1,)),
-        values=('sedan', 'hardtop'),
-        dense_shape=(2,))
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
     country_data = np.array([['US'], ['CA']])
 
-    net = FeatureLayer([price, one_hot_body_style, embedded_country])(features)
+    net = fc_old.input_layer(features,
+                             [price, one_hot_body_style, embedded_country])
     self.assertEqual(1 + 3 + 2, net.shape[1])
     with _initialized_session() as sess:
 
@@ -2312,8 +4330,9 @@ class FeatureLayerTest(test.TestCase):
                   features['country']: country_data
               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in input_layer
     price = fc.numeric_column('price')
     features = {
         'price': constant_op.constant(0),
@@ -2322,13 +4341,13 @@ class FeatureLayerTest(test.TestCase):
 
     # Static rank 0 should fail
     with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
-      FeatureLayer([price])(features)
+      fc_old.input_layer(features, [price])
 
     # Dynamic rank 0 should fail
     features = {
         'price': array_ops.placeholder(dtypes.float32),
     }
-    net = FeatureLayer([price])(features)
+    net = fc_old.input_layer(features, [price])
     self.assertEqual(1, net.shape[1])
     with _initialized_session() as sess:
       with self.assertRaisesOpError('Feature .* cannot have rank 0'):
@@ -2337,10 +4356,14 @@ class FeatureLayerTest(test.TestCase):
 
 class MakeParseExampleSpecTest(test.TestCase):
 
-  class _TestFeatureColumn(FeatureColumn,
+  class _TestFeatureColumn(BaseFeatureColumnForTests,
                            collections.namedtuple('_TestFeatureColumn',
                                                   ('parse_spec'))):
 
+    @property
+    def _is_v2_column(self):
+      return True
+
     @property
     def name(self):
       return '_TestFeatureColumn'
@@ -2348,12 +4371,19 @@ class MakeParseExampleSpecTest(test.TestCase):
     def transform_feature(self, transformation_cache, state_manager):
       pass
 
+    def _transform_feature(self, inputs):
+      pass
+
     @property
     def parse_example_spec(self):
       return self.parse_spec
 
+    @property
+    def _parse_example_spec(self):
+      return self.parse_spec
+
   def test_no_feature_columns(self):
-    actual = fc.make_parse_example_spec([])
+    actual = fc.make_parse_example_spec_v2([])
     self.assertDictEqual({}, actual)
 
   def test_invalid_type(self):
@@ -2363,15 +4393,17 @@ class MakeParseExampleSpecTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         'All feature_columns must be FeatureColumn instances.*invalid_column'):
-      fc.make_parse_example_spec(
-          (self._TestFeatureColumn({key1: parse_spec1}), 'invalid_column'))
+      fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+          key1: parse_spec1
+      }), 'invalid_column'))
 
   def test_one_feature_column(self):
     key1 = 'key1'
     parse_spec1 = parsing_ops.FixedLenFeature(
         shape=(2,), dtype=dtypes.float32, default_value=0.)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }),))
     self.assertDictEqual({key1: parse_spec1}, actual)
 
   def test_two_feature_columns(self):
@@ -2380,9 +4412,11 @@ class MakeParseExampleSpecTest(test.TestCase):
         shape=(2,), dtype=dtypes.float32, default_value=0.)
     key2 = 'key2'
     parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),
-         self._TestFeatureColumn({key2: parse_spec2})))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }), self._TestFeatureColumn({
+        key2: parse_spec2
+    })))
     self.assertDictEqual({key1: parse_spec1, key2: parse_spec2}, actual)
 
   def test_equal_keys_different_parse_spec(self):
@@ -2393,17 +4427,21 @@ class MakeParseExampleSpecTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         'feature_columns contain different parse_spec for key key1'):
-      fc.make_parse_example_spec(
-          (self._TestFeatureColumn({key1: parse_spec1}),
-           self._TestFeatureColumn({key1: parse_spec2})))
+      fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+          key1: parse_spec1
+      }), self._TestFeatureColumn({
+          key1: parse_spec2
+      })))
 
   def test_equal_keys_equal_parse_spec(self):
     key1 = 'key1'
     parse_spec1 = parsing_ops.FixedLenFeature(
         shape=(2,), dtype=dtypes.float32, default_value=0.)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),
-         self._TestFeatureColumn({key1: parse_spec1})))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }), self._TestFeatureColumn({
+        key1: parse_spec1
+    })))
     self.assertDictEqual({key1: parse_spec1}, actual)
 
   def test_multiple_features_dict(self):
@@ -2415,11 +4453,17 @@ class MakeParseExampleSpecTest(test.TestCase):
     parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
     key3 = 'key3'
     parse_spec3 = parsing_ops.VarLenFeature(dtype=dtypes.int32)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),
-         self._TestFeatureColumn({key2: parse_spec2, key3: parse_spec3})))
-    self.assertDictEqual(
-        {key1: parse_spec1, key2: parse_spec2, key3: parse_spec3}, actual)
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }), self._TestFeatureColumn({
+        key2: parse_spec2,
+        key3: parse_spec3
+    })))
+    self.assertDictEqual({
+        key1: parse_spec1,
+        key2: parse_spec2,
+        key3: parse_spec3
+    }, actual)
 
 
 def _assert_sparse_tensor_value(test_case, expected, actual):
@@ -2427,7 +4471,8 @@ def _assert_sparse_tensor_value(test_case, expected, actual):
   test_case.assertAllEqual(expected.indices, actual.indices)
 
   test_case.assertEqual(
-      np.array(expected.values).dtype, np.array(actual.values).dtype)
+      np.array(expected.values).dtype,
+      np.array(actual.values).dtype)
   test_case.assertAllEqual(expected.values, actual.values)
 
   test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
@@ -2449,6 +4494,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         'python/feature_column/testdata/wire_vocabulary.txt')
     self._wire_vocabulary_size = 3
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
@@ -2458,25 +4504,34 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.string)
     }, column.parse_example_spec)
+    self.assertTrue(column._is_v2_column)
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
       fc.categorical_column_with_vocabulary_file(
           key=('aaa',), vocabulary_file='path_to_file', vocabulary_size=3)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
     column = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     self.assertEqual(7, column.num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, column.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     original = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(7, column.num_buckets)
@@ -2494,6 +4549,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       fc.categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file='', vocabulary_size=3)
 
+  @test_util.run_deprecated_v1
   def test_invalid_vocabulary_file(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='file_does_not_exist', vocabulary_size=10)
@@ -2501,21 +4557,26 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
-    column.get_sparse_tensors(FeatureTransformationCache({'aaa': inputs}), None)
+    column.get_sparse_tensors(
+        fc.FeatureTransformationCache({
+            'aaa': inputs
+        }), None)
     with self.assertRaisesRegexp(errors.OpError, 'file_does_not_exist'):
-      with self.cached_session():
-        lookup_ops.tables_initializer().run()
+      self.evaluate(lookup_ops.tables_initializer())
 
   def test_invalid_vocabulary_size(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=-1)
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=0)
 
+  @test_util.run_deprecated_v1
   def test_too_large_vocabulary_size(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -2525,26 +4586,32 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
-    column.get_sparse_tensors(FeatureTransformationCache({'aaa': inputs}), None)
+    column.get_sparse_tensors(
+        fc.FeatureTransformationCache({
+            'aaa': inputs
+        }), None)
     with self.assertRaisesRegexp(errors.OpError, 'Invalid vocab_size'):
-      with self.cached_session():
-        lookup_ops.tables_initializer().run()
+      self.evaluate(lookup_ops.tables_initializer())
 
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           num_oov_buckets=-1)
 
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           dtype=dtypes.float64)
 
   def test_invalid_buckets_and_default_value(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'both num_oov_buckets and default_value'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'both num_oov_buckets and default_value'):
       fc.categorical_column_with_vocabulary_file(
           key='aaa',
           vocabulary_file=self._wire_vocabulary_file_name,
@@ -2564,7 +4631,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       column.get_sparse_tensors(
-          FeatureTransformationCache({
+          fc.FeatureTransformationCache({
               'aaa': inputs
           }), None)
 
@@ -2580,32 +4647,35 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       column.get_sparse_tensors(
-          FeatureTransformationCache({
+          fc.FeatureTransformationCache({
               'aaa': inputs
           }), None)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -2616,19 +4686,23 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_none_vocabulary_size(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file=self._wire_vocabulary_file_name)
@@ -2637,19 +4711,23 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(self,
-                                  sparse_tensor.SparseTensorValue(
-                                      indices=inputs.indices,
-                                      values=np.array(
-                                          (2, -1, 0), dtype=np.int64),
-                                      dense_shape=inputs.dense_shape),
-                                  id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -2659,35 +4737,43 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
-    id_tensor = _transform_features({'aaa': inputs}, [column], None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(self,
-                                  sparse_tensor.SparseTensorValue(
-                                      indices=inputs.indices,
-                                      values=np.array(
-                                          (2, -1, 0), dtype=np.int64),
-                                      dense_shape=inputs.dense_shape),
-                                  id_tensor.eval())
+    id_tensor = fc._transform_features_v2({
+        'aaa': inputs
+    }, [column], None)[column]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': (('marlo', ''), ('skywalker', 'omar'))
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=(2, 2)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -2699,19 +4785,23 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 2, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 2, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_oov_buckets(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -2723,19 +4813,23 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         values=('marlo', 'skywalker', 'omar', 'heisenberg'),
         dense_shape=(2, 3))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 33, 0, 62), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 33, 0, 62), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_small_vocabulary_size(self):
     # 'marlo' is the last entry in our vocabulary file, so be setting
     # `vocabulary_size` to 1 less than number of entries in file, we take
@@ -2749,19 +4843,23 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((-1, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((-1, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -2773,19 +4871,23 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         values=(11, 100, 30, 22),
         dense_shape=(3, 3))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0, 4), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0, 4), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
     column = fc.categorical_column_with_vocabulary_file(
@@ -2795,19 +4897,22 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dtype=dtypes.int32,
         default_value=default_value)
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': ((11, -1, -1), (100, 30, -1), (-1, -1, 22))
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1), (2, 2)),
-              values=np.array((2, default_value, 0, 4), dtype=np.int64),
-              dense_shape=(3, 3)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+            values=np.array((2, default_value, 0, 4), dtype=np.int64),
+            dense_shape=(3, 3)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -2820,19 +4925,23 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         values=(11, 100, 30, 22),
         dense_shape=(3, 3))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 60, 0, 4), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 60, 0, 4), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     wire_column = fc.categorical_column_with_vocabulary_file(
         key='wire',
@@ -2850,14 +4959,69 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2))
       })
       wire_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 2: wire_var[2] = 3
-        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 2: wire_var[2] = 3
+      # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+      self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
+
+  def test_old_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_file(
+        key='wire',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column.num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 2: wire_var[2] = 3
+      # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+      self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+    wire_column = fc.categorical_column_with_vocabulary_file(
+        key='wire',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=1)
+
+    self.assertEqual(['wire'], wire_column.parents)
+
+    config = wire_column._get_config()
+    self.assertEqual({
+        'default_value': -1,
+        'dtype': 'string',
+        'key': 'wire',
+        'num_oov_buckets': 1,
+        'vocabulary_file': self._wire_vocabulary_file_name,
+        'vocabulary_size': 3
+    }, config)
+
+    self.assertEqual(wire_column,
+                     fc.VocabularyFileCategoricalColumn._from_config(config))
 
 
 class VocabularyListCategoricalColumnTest(test.TestCase):
@@ -2871,6 +5035,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.string)
     }, column.parse_example_spec)
+    self.assertTrue(column._is_v2_column)
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
@@ -2887,15 +5052,19 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, column.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32,
+        key='aaa',
+        vocabulary_list=(12, 24, 36),
+        dtype=dtypes.int32,
         default_value=-99)
     self.assertEqual(3, column.num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, column.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     original = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32)
@@ -2909,37 +5078,39 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
       fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.float32)
 
   def test_invalid_mapping_dtype(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'vocabulary dtype must be string or integer'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'vocabulary dtype must be string or integer'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12., 24., 36.))
 
   def test_mismatched_int_dtype(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'dtype.*and vocabulary dtype.*do not match'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'dtype.*and vocabulary dtype.*do not match'):
       fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.int32)
 
   def test_mismatched_string_dtype(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'dtype.*and vocabulary dtype.*do not match'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'dtype.*and vocabulary dtype.*do not match'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.string)
 
   def test_none_mapping(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'vocabulary_list.*must be non-empty'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'vocabulary_list.*must be non-empty'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=None)
 
   def test_empty_mapping(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'vocabulary_list.*must be non-empty'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'vocabulary_list.*must be non-empty'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=tuple([]))
 
@@ -2951,12 +5122,11 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
       fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=(12, 24, 36),
-          num_oov_buckets=-1)
+          key='aaa', vocabulary_list=(12, 24, 36), num_oov_buckets=-1)
 
   def test_invalid_buckets_and_default_value(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'both num_oov_buckets and default_value'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'both num_oov_buckets and default_value'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa',
           vocabulary_list=(12, 24, 36),
@@ -2965,134 +5135,144 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_invalid_input_dtype_int32(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(12, 24, 36),
         dense_shape=(2, 2))
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       column.get_sparse_tensors(
-          FeatureTransformationCache({
+          fc.FeatureTransformationCache({
               'aaa': inputs
           }), None)
 
   def test_invalid_input_dtype_string(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=(12, 24, 36))
+        key='aaa', vocabulary_list=(12, 24, 36))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
         dense_shape=(2, 2))
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       column.get_sparse_tensors(
-          FeatureTransformationCache({
+          fc.FeatureTransformationCache({
               'aaa': inputs
           }), None)
 
+  @test_util.run_deprecated_v1
   def test_parse_example_string(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_parse_example_int(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(11, 21, 31))
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[11, 21]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[11, 21]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=[11, 21],
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]], values=[11, 21], dense_shape=[1, 2]),
+        self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
-    id_tensor = _transform_features({'aaa': inputs}, [column], None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+    id_tensor = fc._transform_features_v2({
+        'aaa': inputs
+    }, [column], None)[column]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': (('marlo', ''), ('skywalker', 'omar'))
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=(2, 2)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -3103,19 +5283,23 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 2, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 2, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_oov_buckets(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -3126,19 +5310,23 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         values=('marlo', 'skywalker', 'omar', 'heisenberg'),
         dense_shape=(2, 3))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 33, 0, 62), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 33, 0, 62), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -3149,19 +5337,23 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         values=np.array((11, 100, 30, 22), dtype=np.int32),
         dense_shape=(3, 3))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0, 4), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0, 4), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
     column = fc.categorical_column_with_vocabulary_list(
@@ -3170,21 +5362,24 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         dtype=dtypes.int32,
         default_value=default_value)
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa':
-                np.array(
-                    ((11, -1, -1), (100, 30, -1), (-1, -1, 22)), dtype=np.int32)
+                np.array(((11, -1, -1), (100, 30, -1), (-1, -1, 22)),
+                         dtype=np.int32)
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1), (2, 2)),
-              values=np.array((2, default_value, 0, 4), dtype=np.int64),
-              dense_shape=(3, 3)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+            values=np.array((2, default_value, 0, 4), dtype=np.int64),
+            dense_shape=(3, 3)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -3196,43 +5391,99 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         values=(11, 100, 30, 22),
         dense_shape=(3, 3))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 60, 0, 4), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
-  def test_linear_model(self):
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 60, 0, 4), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
+  def test_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column.num_buckets)
+    with ops.Graph().as_default():
+      model = fc.LinearModel((wire_column,))
+      predictions = model({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      })
+      wire_var, bias = model.variables
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 2: wire_var[2] = 3
+      # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+      self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
+
+  def test_old_linear_model(self):
     wire_column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=1)
     self.assertEqual(4, wire_column.num_buckets)
     with ops.Graph().as_default():
-      model = fc.LinearModel((wire_column,))
-      predictions = model({
+      predictions = fc_old.linear_model({
           wire_column.name:
               sparse_tensor.SparseTensorValue(
                   indices=((0, 0), (1, 0), (1, 1)),
                   values=('marlo', 'skywalker', 'omar'),
                   dense_shape=(2, 2))
-      })
-      wire_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 2: wire_var[2] = 3
-        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+      }, (wire_column,))
+      bias = get_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 2: wire_var[2] = 3
+      # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+      self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        num_oov_buckets=1)
+
+    self.assertEqual(['aaa'], wire_column.parents)
+
+    config = wire_column._get_config()
+    self.assertEqual({
+        'default_value': -1,
+        'dtype': 'string',
+        'key': 'aaa',
+        'num_oov_buckets': 1,
+        'vocabulary_list': ('omar', 'stringer', 'marlo')
+    }, config)
+
+    self.assertEqual(wire_column,
+                     fc.VocabularyListCategoricalColumn._from_config(config))
 
 
 class IdentityCategoricalColumnTest(test.TestCase):
@@ -3245,11 +5496,13 @@ class IdentityCategoricalColumnTest(test.TestCase):
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, column.parse_example_spec)
+    self.assertTrue(column._is_v2_column)
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
       fc.categorical_column_with_identity(key=('aaa',), num_buckets=3)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     original = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     for column in (original, copy.deepcopy(original)):
@@ -3285,115 +5538,128 @@ class IdentityCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     with self.assertRaisesRegexp(ValueError, 'Invalid input, not integer'):
       column.get_sparse_tensors(
-          FeatureTransformationCache({
+          fc.FeatureTransformationCache({
               'aaa': inputs
           }), None)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_identity(key='aaa', num_buckets=30)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[11, 21]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[11, 21]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([11, 21], dtype=np.int64),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([11, 21], dtype=np.int64),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(0, 1, 0),
-        dense_shape=(2, 2))
+        indices=((0, 0), (1, 0), (1, 1)), values=(0, 1, 0), dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((0, 1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(0, 1, 0),
-        dense_shape=(2, 2))
-    id_tensor = _transform_features({'aaa': inputs}, [column], None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
-
+        indices=((0, 0), (1, 0), (1, 1)), values=(0, 1, 0), dense_shape=(2, 2))
+    id_tensor = fc._transform_features_v2({
+        'aaa': inputs
+    }, [column], None)[column]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((0, 1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': ((0, -1), (1, 0))
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((0, 1, 0), dtype=np.int64),
+            dense_shape=(2, 2)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_inputs_too_small(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, -1, 0),
-        dense_shape=(2, 2))
+        indices=((0, 0), (1, 0), (1, 1)), values=(1, -1, 0), dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      with self.assertRaisesRegexp(
-          errors.OpError, 'assert_greater_or_equal_0'):
-        id_weight_pair.id_tensor.eval()
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    with self.assertRaisesRegexp(errors.OpError, 'assert_greater_or_equal_0'):
+      self.evaluate(id_weight_pair.id_tensor)
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_inputs_too_big(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 99, 0),
-        dense_shape=(2, 2))
+        indices=((0, 0), (1, 0), (1, 1)), values=(1, 99, 0), dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      with self.assertRaisesRegexp(
-          errors.OpError, 'assert_less_than_num_buckets'):
-        id_weight_pair.id_tensor.eval()
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    with self.assertRaisesRegexp(errors.OpError,
+                                 'assert_less_than_num_buckets'):
+      self.evaluate(id_weight_pair.id_tensor)
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_default_value(self):
     column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
@@ -3402,19 +5668,23 @@ class IdentityCategoricalColumnTest(test.TestCase):
         values=(1, -1, 99),
         dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((1, 3, 3), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((1, 3, 3), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_default_value_and_placeholder_inputs(self):
     column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
@@ -3422,14 +5692,15 @@ class IdentityCategoricalColumnTest(test.TestCase):
     input_values = array_ops.placeholder(dtype=dtypes.int32)
     input_shape = array_ops.placeholder(dtype=dtypes.int64)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=input_indices,
-        values=input_values,
-        dense_shape=input_shape)
+        indices=input_indices, values=input_values, dense_shape=input_shape)
     id_weight_pair = column.get_sparse_tensors(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
     with _initialized_session():
       _assert_sparse_tensor_value(
           self,
@@ -3437,12 +5708,14 @@ class IdentityCategoricalColumnTest(test.TestCase):
               indices=np.array(((0, 0), (1, 0), (1, 1)), dtype=np.int64),
               values=np.array((1, 3, 3), dtype=np.int64),
               dense_shape=np.array((2, 2), dtype=np.int64)),
-          id_weight_pair.id_tensor.eval(feed_dict={
-              input_indices: ((0, 0), (1, 0), (1, 1)),
-              input_values: (1, -1, 99),
-              input_shape: (2, 2),
-          }))
+          id_weight_pair.id_tensor.eval(
+              feed_dict={
+                  input_indices: ((0, 0), (1, 0), (1, 1)),
+                  input_values: (1, -1, 99),
+                  input_shape: (2, 2),
+              }))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column.num_buckets)
@@ -3456,14 +5729,57 @@ class IdentityCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2))
       })
       weight_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] = 1
-        # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] = 1
+      # weight_var[2] + weight_var[1] = 3+2 = 5
+      self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
+
+  def test_old_linear_model(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    self.assertEqual(3, column.num_buckets)
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] = 1
+      # weight_var[2] + weight_var[1] = 3+2 = 5
+      self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+
+    self.assertEqual(['aaa'], column.parents)
+
+    config = column._get_config()
+    self.assertEqual({
+        'default_value': None,
+        'key': 'aaa',
+        'number_buckets': 3
+    }, config)
+
+    self.assertEqual(column, fc.IdentityCategoricalColumn._from_config(config))
 
 
 class TransformFeaturesTest(test.TestCase):
@@ -3483,22 +5799,31 @@ class TransformFeaturesTest(test.TestCase):
                   indices=[[0, 0], [1, 0], [1, 1]],
                   dense_shape=[2, 2])
       }
-      transformed = _transform_features(features,
-                                        [bucketized_price, hashed_sparse], None)
-      with _initialized_session():
-        self.assertIn(bucketized_price.name, transformed[bucketized_price].name)
-        self.assertAllEqual([[0], [3]], transformed[bucketized_price].eval())
-        self.assertIn(hashed_sparse.name, transformed[hashed_sparse].name)
-        self.assertAllEqual([6, 4, 1], transformed[hashed_sparse].values.eval())
+      transformed = fc._transform_features_v2(
+          features, [bucketized_price, hashed_sparse], None)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertIn(bucketized_price.name, transformed[bucketized_price].name)
+      self.assertAllEqual([[0], [3]],
+                          self.evaluate(transformed[bucketized_price]))
+      self.assertIn(hashed_sparse.name, transformed[hashed_sparse].name)
+      self.assertAllEqual([6, 4, 1],
+                          self.evaluate(transformed[hashed_sparse].values))
 
   def test_column_order(self):
     """When the column is both dense and sparse, uses sparse tensors."""
 
-    class _LoggerColumn(FeatureColumn):
+    class _LoggerColumn(BaseFeatureColumnForTests):
 
       def __init__(self, name):
         self._name = name
 
+      @property
+      def _is_v2_column(self):
+        return True
+
       @property
       def name(self):
         return self._name
@@ -3516,12 +5841,12 @@ class TransformFeaturesTest(test.TestCase):
       column1 = _LoggerColumn('1')
       column2 = _LoggerColumn('2')
       call_logger = {'count': 0}
-      _transform_features({}, [column1, column2], None)
+      fc._transform_features_v2({}, [column1, column2], None)
       self.assertEqual(0, column1.call_order)
       self.assertEqual(1, column2.call_order)
 
       call_logger = {'count': 0}
-      _transform_features({}, [column2, column1], None)
+      fc._transform_features_v2({}, [column2, column1], None)
       self.assertEqual(0, column1.call_order)
       self.assertEqual(1, column2.call_order)
 
@@ -3534,28 +5859,31 @@ class IndicatorColumnTest(test.TestCase):
     self.assertEqual(indicator_a.categorical_column.name, 'a')
     self.assertEqual(indicator_a.name, 'a_indicator')
     self.assertEqual(indicator_a.variable_shape, [1, 4])
+    self.assertTrue(indicator_a._is_v2_column)
 
-    b = fc.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
+    b = fc_old._categorical_column_with_hash_bucket('b', hash_bucket_size=100)
     indicator_b = fc.indicator_column(b)
     self.assertEqual(indicator_b.categorical_column.name, 'b')
     self.assertEqual(indicator_b.name, 'b_indicator')
     self.assertEqual(indicator_b.variable_shape, [1, 100])
+    self.assertFalse(indicator_b._is_v2_column)
 
   def test_1D_shape_succeeds(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_hash_bucket('animal', 4))
-    transformation_cache = FeatureTransformationCache({
+    transformation_cache = fc.FeatureTransformationCache({
         'animal': ['fox', 'fox']
     })
     output = transformation_cache.get(animal, None)
-    with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+
+    self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                        self.evaluate(output))
 
   def test_2D_shape_succeeds(self):
     # TODO(ispir/cassandrax): Swith to categorical_column_with_keys when ready.
     animal = fc.indicator_column(
         fc.categorical_column_with_hash_bucket('animal', 4))
-    transformation_cache = FeatureTransformationCache({
+    transformation_cache = fc.FeatureTransformationCache({
         'animal':
             sparse_tensor.SparseTensor(
                 indices=[[0, 0], [1, 0]],
@@ -3563,34 +5891,36 @@ class IndicatorColumnTest(test.TestCase):
                 dense_shape=[2, 1])
     })
     output = transformation_cache.get(animal, None)
-    with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+
+    self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                        self.evaluate(output))
 
   def test_multi_hot(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
 
-    transformation_cache = FeatureTransformationCache({
+    transformation_cache = fc.FeatureTransformationCache({
         'animal':
             sparse_tensor.SparseTensor(
                 indices=[[0, 0], [0, 1]], values=[1, 1], dense_shape=[1, 2])
     })
     output = transformation_cache.get(animal, None)
-    with self.cached_session():
-      self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
+
+    self.assertAllEqual([[0., 2., 0., 0.]], self.evaluate(output))
 
   def test_multi_hot2(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
-    transformation_cache = FeatureTransformationCache({
+    transformation_cache = fc.FeatureTransformationCache({
         'animal':
             sparse_tensor.SparseTensor(
                 indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
     })
     output = transformation_cache.get(animal, None)
-    with self.cached_session():
-      self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
 
+    self.assertAllEqual([[0., 1., 1., 0.]], self.evaluate(output))
+
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     a = fc.categorical_column_with_hash_bucket('a', 4)
     column = fc.indicator_column(a)
@@ -3599,44 +5929,52 @@ class IndicatorColumnTest(test.TestCase):
     self.assertEqual(column.name, 'a_indicator')
     self.assertEqual(column.variable_shape, [1, 4])
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     a_indicator = fc.indicator_column(a)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_indicator]))
+        features=fc.make_parse_example_spec_v2([a_indicator]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_transform(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     a_indicator = fc.indicator_column(a)
     features = {
-        'aaa': sparse_tensor.SparseTensorValue(
-            indices=((0, 0), (1, 0), (1, 1)),
-            values=('marlo', 'skywalker', 'omar'),
-            dense_shape=(2, 2))
+        'aaa':
+            sparse_tensor.SparseTensorValue(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=('marlo', 'skywalker', 'omar'),
+                dense_shape=(2, 2))
     }
-    indicator_tensor = _transform_features(features, [a_indicator],
-                                           None)[a_indicator]
-    with _initialized_session():
-      self.assertAllEqual([[0, 0, 1], [1, 0, 0]], indicator_tensor.eval())
+    indicator_tensor = fc._transform_features_v2(features, [a_indicator],
+                                                 None)[a_indicator]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual([[0, 0, 1], [1, 0, 0]], self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_with_weighted_column(self):
     # Github issue 12557
     ids = fc.categorical_column_with_vocabulary_list(
@@ -3644,14 +5982,18 @@ class IndicatorColumnTest(test.TestCase):
     weights = fc.weighted_categorical_column(ids, 'weights')
     indicator = fc.indicator_column(weights)
     features = {
-        'ids': constant_op.constant([['c', 'b', 'a']]),
-        'weights': constant_op.constant([[2., 4., 6.]])
+        'ids': constant_op.constant([['c', 'b', 'a', 'c']]),
+        'weights': constant_op.constant([[2., 4., 6., 1.]])
     }
-    indicator_tensor = _transform_features(features, [indicator],
-                                           None)[indicator]
-    with _initialized_session():
-      self.assertAllEqual([[6., 4., 2.]], indicator_tensor.eval())
+    indicator_tensor = fc._transform_features_v2(features, [indicator],
+                                                 None)[indicator]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual([[6., 4., 3.]], self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_with_missing_value_in_weighted_column(self):
     # Github issue 12583
     ids = fc.categorical_column_with_vocabulary_list(
@@ -3662,11 +6004,15 @@ class IndicatorColumnTest(test.TestCase):
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
         'weights': constant_op.constant([[2., 4., 6.]])
     }
-    indicator_tensor = _transform_features(features, [indicator],
-                                           None)[indicator]
-    with _initialized_session():
-      self.assertAllEqual([[0., 4., 2.]], indicator_tensor.eval())
+    indicator_tensor = fc._transform_features_v2(features, [indicator],
+                                                 None)[indicator]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
 
+    self.assertAllEqual([[0., 4., 2.]], self.evaluate(indicator_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_with_missing_value_in_categorical_column(self):
     # Github issue 12583
     ids = fc.categorical_column_with_vocabulary_list(
@@ -3675,11 +6021,15 @@ class IndicatorColumnTest(test.TestCase):
     features = {
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
     }
-    indicator_tensor = _transform_features(features, [indicator],
-                                           None)[indicator]
-    with _initialized_session():
-      self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval())
+    indicator_tensor = fc._transform_features_v2(features, [indicator],
+                                                 None)[indicator]
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual([[0., 1., 1.]], self.evaluate(indicator_tensor))
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
@@ -3693,14 +6043,79 @@ class IndicatorColumnTest(test.TestCase):
       model = fc.LinearModel([animal])
       predictions = model(features)
       weight_var, _ = model.variables
-      with _initialized_session():
-        # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
-        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
-
-  def test_feature_layer(self):
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # All should be zero-initialized.
+      self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+      self.assertAllClose([[0.]], self.evaluate(predictions))
+      self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]]))
+      self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
+
+  def test_old_linear_model(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+
+      predictions = fc_old.linear_model(features, [animal])
+      weight_var = get_linear_model_column_var(animal)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # All should be zero-initialized.
+      self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+      self.assertAllClose([[0.]], self.evaluate(predictions))
+      self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]]))
+      self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
+
+  def test_old_linear_model_old_categorical(self):
+    animal = fc.indicator_column(
+        fc_old._categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+
+      predictions = fc_old.linear_model(features, [animal])
+      weight_var = get_linear_model_column_var(animal)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # All should be zero-initialized.
+      self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+      self.assertAllClose([[0.]], self.evaluate(predictions))
+      self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]]))
+      self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_dense_features(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+      net = fc.DenseFeatures([animal])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
+
+  @test_util.run_deprecated_v1
+  def test_input_layer(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
@@ -3709,12 +6124,59 @@ class IndicatorColumnTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
-      net = FeatureLayer([animal])(features)
-      with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+      net = fc_old.input_layer(features, [animal])
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
+
+  def test_input_layer_old_categorical(self):
+    animal = fc.indicator_column(
+        fc_old._categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+      net = fc_old.input_layer(features, [animal])
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+    parent = fc.categorical_column_with_identity('animal', num_buckets=4)
+    animal = fc.indicator_column(parent)
 
+    self.assertEqual([parent], animal.parents)
 
-class _TestStateManager(StateManager):
+    config = animal._get_config()
+    self.assertEqual({
+        'categorical_column': {
+            'class_name': 'IdentityCategoricalColumn',
+            'config': {
+                'key': 'animal',
+                'default_value': None,
+                'number_buckets': 4
+            }
+        }
+    }, config)
+
+    new_animal = fc.IndicatorColumn._from_config(config)
+    self.assertEqual(animal, new_animal)
+    self.assertIsNot(parent, new_animal.categorical_column)
+
+    new_animal = fc.IndicatorColumn._from_config(
+        config, columns_by_name={parent.name: parent})
+    self.assertEqual(animal, new_animal)
+    self.assertIs(parent, new_animal.categorical_column)
+
+
+class _TestStateManager(fc.StateManager):
 
   def __init__(self, trainable=True):
     # Dict of feature_column to a dict of variables.
@@ -3727,6 +6189,7 @@ class _TestStateManager(StateManager):
                       shape,
                       dtype=None,
                       trainable=True,
+                      use_resource=True,
                       initializer=None):
     if feature_column not in self._all_variables:
       self._all_variables[feature_column] = {}
@@ -3739,6 +6202,7 @@ class _TestStateManager(StateManager):
           shape=shape,
           dtype=dtype,
           trainable=self._trainable and trainable,
+          use_resource=use_resource,
           initializer=initializer)
       var_dict[name] = var
       return var
@@ -3753,6 +6217,7 @@ class _TestStateManager(StateManager):
 
 class EmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -3771,16 +6236,30 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column.parse_example_spec)
+    self.assertTrue(embedding_column._is_v2_column)
+
+  def test_is_v2_column(self):
+    categorical_column = fc_old._categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension)
+    self.assertFalse(embedding_column._is_v2_column)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
@@ -3794,15 +6273,20 @@ class EmbeddingColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
     original = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     for embedding_column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', embedding_column.categorical_column.name)
       self.assertEqual(3, embedding_column.categorical_column.num_buckets)
@@ -3822,52 +6306,127 @@ class EmbeddingColumnTest(test.TestCase):
           'aaa': parsing_ops.VarLenFeature(dtypes.int64)
       }, embedding_column.parse_example_spec)
 
-  def test_invalid_initializer(self):
-    categorical_column = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=3)
-    with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc.embedding_column(categorical_column, dimension=2, initializer='not_fn')
+  @test_util.run_deprecated_v1
+  def test_invalid_initializer(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
+      fc.embedding_column(categorical_column, dimension=2, initializer='not_fn')
+
+  @test_util.run_deprecated_v1
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_embedded = fc.embedding_column(a, dimension=2)
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec_v2([a_embedded]))
+    self.assertIn('aaa', features)
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
+  def test_transform_feature(self):
+    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    a_embedded = fc.embedding_column(a, dimension=2)
+    features = {
+        'aaa':
+            sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=(0, 1, 0),
+                dense_shape=(2, 2))
+    }
+    outputs = fc._transform_features_v2(features, [a, a_embedded], None)
+    output_a = outputs[a]
+    output_embedded = outputs[a_embedded]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                self.evaluate(output_embedded))
+
+  @test_util.run_deprecated_v1
+  def test_get_dense_tensor(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+    state_manager = _TestStateManager()
+    embedding_column.create_state(state_manager)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column.get_dense_tensor(
+        fc.FeatureTransformationCache({
+            'aaa': sparse_input
+        }), state_manager)
 
-  def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
-        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded = fc.embedding_column(a, dimension=2)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
-    features = parsing_ops.parse_example(
-        serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_embedded]))
-    self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
 
-  def test_transform_feature(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
-    a_embedded = fc.embedding_column(a, dimension=2)
-    features = {
-        'aaa': sparse_tensor.SparseTensor(
-            indices=((0, 0), (1, 0), (1, 1)),
-            values=(0, 1, 0),
-            dense_shape=(2, 2))
-    }
-    outputs = _transform_features(features, [a, a_embedded], None)
-    output_a = outputs[a]
-    output_embedded = outputs[a_embedded]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_embedded.eval())
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
 
-  def test_get_dense_tensor(self):
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
+
+  @test_util.run_deprecated_v1
+  def test_get_dense_tensor_old_categorical(self):
     # Inputs.
     vocabulary_size = 3
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -3886,6 +6445,7 @@ class EmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -3905,28 +6465,31 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
-    state_manager = _TestStateManager()
-    embedding_column.create_state(state_manager)
 
     # Provide sparse input and get dense result.
-    embedding_lookup = embedding_column.get_dense_tensor(
-        FeatureTransformationCache({
+    embedding_lookup = embedding_column._get_dense_tensor(
+        fc_old._LazyBuilder({
             'aaa': sparse_input
-        }), state_manager)
+        }))
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(('embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_3d(self):
     # Inputs.
     vocabulary_size = 4
@@ -3942,11 +6505,12 @@ class EmbeddingColumnTest(test.TestCase):
     # Embedding variable.
     embedding_dimension = 3
     embedding_values = (
-        (1., 2., 4.),   # id 0
-        (3., 5., 1.),   # id 1
+        (1., 2., 4.),  # id 0
+        (3., 5., 1.),  # id 1
         (7., 11., 2.),  # id 2
-        (2., 7., 12.)   # id 3
+        (2., 7., 12.)  # id 3
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -3970,14 +6534,15 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
     embedding_column.create_state(state_manager)
 
     # Provide sparse input and get dense result.
     embedding_lookup = embedding_column.get_dense_tensor(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': sparse_input
         }), state_manager)
 
@@ -3985,10 +6550,14 @@ class EmbeddingColumnTest(test.TestCase):
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(('embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -4008,6 +6577,7 @@ class EmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -4030,7 +6600,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
     embedding_column.create_state(state_manager)
@@ -4040,7 +6611,7 @@ class EmbeddingColumnTest(test.TestCase):
     input_values = array_ops.placeholder(dtype=dtypes.int64)
     input_shape = array_ops.placeholder(dtype=dtypes.int64)
     embedding_lookup = embedding_column.get_dense_tensor(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa':
                 sparse_tensor.SparseTensorValue(
                     indices=input_indices,
@@ -4050,17 +6621,23 @@ class EmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
     with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval(
-          feed_dict={
-              input_indices: sparse_input.indices,
-              input_values: sparse_input.values,
-              input_shape: sparse_input.dense_shape,
-          }))
+      self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+      self.assertAllEqual(
+          expected_lookups,
+          embedding_lookup.eval(
+              feed_dict={
+                  input_indices: sparse_input.indices,
+                  input_values: sparse_input.values,
+                  input_shape: sparse_input.dense_shape,
+              }))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_restore_from_ckpt(self):
     # Inputs.
     vocabulary_size = 3
@@ -4100,7 +6677,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         ckpt_to_load_from=ckpt_path,
         tensor_name_in_ckpt=ckpt_tensor)
     state_manager = _TestStateManager()
@@ -4108,18 +6686,22 @@ class EmbeddingColumnTest(test.TestCase):
 
     # Provide sparse input and get dense result.
     embedding_lookup = embedding_column.get_dense_tensor(
-        FeatureTransformationCache({
+        fc.FeatureTransformationCache({
             'aaa': sparse_input
         }), state_manager)
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     # Inputs.
     batch_size = 4
@@ -4137,6 +6719,7 @@ class EmbeddingColumnTest(test.TestCase):
     embedding_dimension = 2
     embedding_shape = (vocabulary_size, embedding_dimension)
     zeros_embedding_values = np.zeros(embedding_shape)
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual(embedding_shape, shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -4163,39 +6746,45 @@ class EmbeddingColumnTest(test.TestCase):
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
       trainable_vars = {
-          v.name: v for v in ops.get_collection(
-              ops.GraphKeys.TRAINABLE_VARIABLES)
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
       bias = trainable_vars['linear_model/bias_weights:0']
       embedding_weights = trainable_vars[
           'linear_model/aaa_embedding/embedding_weights:0']
-      linear_weights = trainable_vars[
-          'linear_model/aaa_embedding/weights:0']
-      with _initialized_session():
-        # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
-
-        # Predictions with all non-zero weights.
-        embedding_weights.assign((
-            (1., 2.),  # id 0
-            (3., 5.),  # id 1
-            (7., 11.)  # id 2
-        )).eval()
-        linear_weights.assign(((4.,), (6.,))).eval()
-        # example 0, ids [2], embedding[0] = [7, 11]
-        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
-        # example 2, ids [], embedding[2] = [0, 0]
-        # example 3, ids [1], embedding[3] = [3, 5]
-        # sum(embeddings * linear_weights)
-        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
-
-  def test_feature_layer(self):
+      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # Predictions with all zero weights.
+      self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+      self.assertAllClose(zeros_embedding_values,
+                          self.evaluate(embedding_weights))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+      self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions))
+
+      # Predictions with all non-zero weights.
+      self.evaluate(
+          embedding_weights.assign((
+              (1., 2.),  # id 0
+              (3., 5.),  # id 1
+              (7., 11.)  # id 2
+          )))
+      self.evaluate(linear_weights.assign(((4.,), (6.,))))
+      # example 0, ids [2], embedding[0] = [7, 11]
+      # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+      # example 2, ids [], embedding[2] = [0, 0]
+      # example 3, ids [1], embedding[3] = [3, 5]
+      # sum(embeddings * linear_weights)
+      # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+      self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                          self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_dense_features(self):
     # Inputs.
     vocabulary_size = 3
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -4214,6 +6803,7 @@ class EmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -4241,21 +6831,93 @@ class EmbeddingColumnTest(test.TestCase):
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
-    l = FeatureLayer((embedding_column,))
-    feature_layer = l({'aaa': sparse_input})
+    l = fc.DenseFeatures((embedding_column,))
+    dense_features = l({'aaa': sparse_input})
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+    self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
+    for v in global_vars:
+      self.assertTrue(isinstance(v, variables_lib.RefVariable))
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+    self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in trainable_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, trainable_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
 
-  def test_feature_layer_not_trainable(self):
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
+
+  @test_util.run_deprecated_v1
+  def test_dense_features_not_trainable(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer,
+        trainable=False)
+
+    # Provide sparse input and get dense result.
+    dense_features = fc.DenseFeatures((embedding_column,))({
+        'aaa': sparse_input
+    })
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    self.assertItemsEqual([],
+                          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
+
+  @test_util.run_deprecated_v1
+  def test_input_layer(self):
     # Inputs.
     vocabulary_size = 3
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -4274,49 +6936,271 @@ class EmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
       self.assertIsNone(partition_info)
       return embedding_values
 
-    # Expected lookup result, using combiner='mean'.
-    expected_lookups = (
-        # example 0, ids [2], embedding = [7, 11]
-        (7., 11.),
-        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
-        (2., 3.5),
-        # example 2, ids [], embedding = [0, 0]
-        (0., 0.),
-        # example 3, ids [1], embedding = [3, 5]
-        (3., 5.),
-    )
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    feature_layer = fc_old.input_layer({
+        'aaa': sparse_input
+    }, (embedding_column,))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('input_layer/aaa_embedding/embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+    self.assertItemsEqual(('input_layer/aaa_embedding/embedding_weights:0',),
+                          tuple([v.name for v in trainable_vars]))
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(feature_layer))
+
+  def test_old_linear_model(self):
+    # Inputs.
+    batch_size = 4
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(batch_size, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          categorical_column.name: sparse_input
+      }, (embedding_column,))
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_embedding/weights:0',
+          'linear_model/aaa_embedding/embedding_weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_embedding/embedding_weights:0']
+      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # Predictions with all zero weights.
+      self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+      self.assertAllClose(zeros_embedding_values,
+                          self.evaluate(embedding_weights))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+      self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions))
+
+      # Predictions with all non-zero weights.
+      self.evaluate(
+          embedding_weights.assign((
+              (1., 2.),  # id 0
+              (3., 5.),  # id 1
+              (7., 11.)  # id 2
+          )))
+      self.evaluate(linear_weights.assign(((4.,), (6.,))))
+      # example 0, ids [2], embedding[0] = [7, 11]
+      # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+      # example 2, ids [], embedding[2] = [0, 0]
+      # example 3, ids [1], embedding[3] = [3, 5]
+      # sum(embeddings * linear_weights)
+      # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+      self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                          self.evaluate(predictions))
+
+  def test_old_linear_model_old_categorical(self):
+    # Inputs.
+    batch_size = 4
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(batch_size, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column = fc_old._categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          categorical_column.name: sparse_input
+      }, (embedding_column,))
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_embedding/weights:0',
+          'linear_model/aaa_embedding/embedding_weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_embedding/embedding_weights:0']
+      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # Predictions with all zero weights.
+      self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+      self.assertAllClose(zeros_embedding_values,
+                          self.evaluate(embedding_weights))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+      self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions))
+
+      # Predictions with all non-zero weights.
+      self.evaluate(
+          embedding_weights.assign((
+              (1., 2.),  # id 0
+              (3., 5.),  # id 1
+              (7., 11.)  # id 2
+          )))
+      self.evaluate(linear_weights.assign(((4.,), (6.,))))
+      # example 0, ids [2], embedding[0] = [7, 11]
+      # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+      # example 2, ids [], embedding[2] = [0, 0]
+      # example 3, ids [1], embedding[3] = [3, 5]
+      # sum(embeddings * linear_weights)
+      # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+      self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                          self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return ValueError('Not expected to be called')
 
     # Build columns.
     categorical_column = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
+        key='aaa', num_buckets=3)
     embedding_column = fc.embedding_column(
-        categorical_column,
-        dimension=embedding_dimension,
-        initializer=_initializer,
-        trainable=False)
+        categorical_column, dimension=2, initializer=_initializer)
 
-    # Provide sparse input and get dense result.
-    feature_layer = FeatureLayer((embedding_column,))({'aaa': sparse_input})
+    self.assertEqual([categorical_column], embedding_column.parents)
 
-    # Assert expected embedding variable and lookups.
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
-                          tuple([v.name for v in global_vars]))
-    self.assertItemsEqual(
-        [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
+    config = embedding_column._get_config()
+    self.assertEqual({
+        'categorical_column': {
+            'class_name': 'IdentityCategoricalColumn',
+            'config': {
+                'number_buckets': 3,
+                'key': 'aaa',
+                'default_value': None
+            }
+        },
+        'ckpt_to_load_from': None,
+        'combiner': 'mean',
+        'dimension': 2,
+        'initializer': '_initializer',
+        'max_norm': None,
+        'tensor_name_in_ckpt': None,
+        'trainable': True
+    }, config)
+
+    custom_objects = {
+        '_initializer': _initializer,
+    }
+
+    new_embedding_column = fc.EmbeddingColumn._from_config(
+        config, custom_objects=custom_objects)
+    self.assertEqual(embedding_column, new_embedding_column)
+    self.assertIsNot(categorical_column,
+                     new_embedding_column.categorical_column)
+
+    new_embedding_column = fc.EmbeddingColumn._from_config(
+        config,
+        custom_objects=custom_objects,
+        columns_by_name={categorical_column.name: categorical_column})
+    self.assertEqual(embedding_column, new_embedding_column)
+    self.assertIs(categorical_column, new_embedding_column.categorical_column)
 
 
 class SharedEmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -4328,22 +7212,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
         dimension=embedding_dimension)
     self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
     self.assertIs(categorical_column_b, embedding_column_b.categorical_column)
-    self.assertEqual(embedding_dimension, embedding_column_a.dimension)
-    self.assertEqual(embedding_dimension, embedding_column_b.dimension)
-    self.assertEqual('mean', embedding_column_a.combiner)
-    self.assertEqual('mean', embedding_column_b.combiner)
-    self.assertIsNone(embedding_column_a.ckpt_to_load_from)
-    self.assertIsNone(embedding_column_b.ckpt_to_load_from)
-    self.assertEqual('aaa_bbb_shared_embedding',
-                     embedding_column_a.shared_collection_name)
-    self.assertEqual('aaa_bbb_shared_embedding',
-                     embedding_column_b.shared_collection_name)
-    self.assertIsNone(embedding_column_a.tensor_name_in_ckpt)
-    self.assertIsNone(embedding_column_b.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column_a.max_norm)
     self.assertIsNone(embedding_column_b.max_norm)
-    self.assertTrue(embedding_column_a.trainable)
-    self.assertTrue(embedding_column_b.trainable)
     self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
     self.assertEqual('bbb_shared_embedding', embedding_column_b.name)
     self.assertEqual((embedding_dimension,), embedding_column_a.variable_shape)
@@ -4355,6 +7225,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -4373,22 +7244,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
         trainable=False)
     self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
     self.assertIs(categorical_column_b, embedding_column_b.categorical_column)
-    self.assertEqual(embedding_dimension, embedding_column_a.dimension)
-    self.assertEqual(embedding_dimension, embedding_column_b.dimension)
-    self.assertEqual('my_combiner', embedding_column_a.combiner)
-    self.assertEqual('my_combiner', embedding_column_b.combiner)
-    self.assertEqual('shared_embedding_collection_name',
-                     embedding_column_a.shared_collection_name)
-    self.assertEqual('shared_embedding_collection_name',
-                     embedding_column_b.shared_collection_name)
-    self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
-    self.assertEqual('my_ckpt', embedding_column_b.ckpt_to_load_from)
-    self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
-    self.assertEqual('my_ckpt_tensor', embedding_column_b.tensor_name_in_ckpt)
     self.assertEqual(42., embedding_column_a.max_norm)
     self.assertEqual(42., embedding_column_b.max_norm)
-    self.assertFalse(embedding_column_a.trainable)
-    self.assertFalse(embedding_column_b.trainable)
     self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
     self.assertEqual('bbb_shared_embedding', embedding_column_b.name)
     self.assertEqual((embedding_dimension,), embedding_column_a.variable_shape)
@@ -4400,6 +7257,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -4423,14 +7281,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'aaa': parsing_ops.VarLenFeature(dtypes.int64)
       }, embedding_column_a.categorical_column.parse_example_spec)
 
-      self.assertEqual(embedding_dimension, embedding_column_a.dimension)
-      self.assertEqual('my_combiner', embedding_column_a.combiner)
-      self.assertEqual('shared_embedding_collection_name',
-                       embedding_column_a.shared_collection_name)
-      self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
-      self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column_a.max_norm)
-      self.assertFalse(embedding_column_a.trainable)
       self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
       self.assertEqual((embedding_dimension,),
                        embedding_column_a.variable_shape)
@@ -4438,6 +7289,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'aaa': parsing_ops.VarLenFeature(dtypes.int64)
       }, embedding_column_a.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_invalid_initializer(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -4449,6 +7301,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           dimension=2,
           initializer='not_fn')
 
+  @test_util.run_deprecated_v1
   def test_incompatible_column_type(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -4463,6 +7316,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           [categorical_column_a, categorical_column_b, categorical_column_c],
           dimension=2)
 
+  @test_util.run_deprecated_v1
   def test_weighted_categorical_column_ok(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -4480,82 +7334,90 @@ class SharedEmbeddingColumnTest(test.TestCase):
         [weighted_categorical_column_a, weighted_categorical_column_b],
         dimension=2)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     b = fc.categorical_column_with_vocabulary_list(
         key='bbb', vocabulary_list=('omar', 'stringer', 'marlo'))
     a_embedded, b_embedded = fc.shared_embedding_columns_v2([a, b], dimension=2)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer'])),
-            'bbb':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'stringer', b'marlo'])),
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer'])),
+                'bbb':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'stringer', b'marlo'])),
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_embedded, b_embedded]))
+        features=fc.make_parse_example_spec_v2([a_embedded, b_embedded]))
     self.assertIn('aaa', features)
     self.assertIn('bbb', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'stringer', b'marlo'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['bbb'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'stringer', b'marlo'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['bbb']))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
     a_embedded, b_embedded = fc.shared_embedding_columns_v2([a, b], dimension=2)
     features = {
-        'aaa': sparse_tensor.SparseTensor(
-            indices=((0, 0), (1, 0), (1, 1)),
-            values=(0, 1, 0),
-            dense_shape=(2, 2)),
-        'bbb': sparse_tensor.SparseTensor(
-            indices=((0, 0), (1, 0), (1, 1)),
-            values=(1, 2, 1),
-            dense_shape=(2, 2)),
+        'aaa':
+            sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=(0, 1, 0),
+                dense_shape=(2, 2)),
+        'bbb':
+            sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=(1, 2, 1),
+                dense_shape=(2, 2)),
     }
-    outputs = _transform_features(features, [a, a_embedded, b, b_embedded],
-                                  None)
+    outputs = fc._transform_features_v2(features,
+                                        [a, a_embedded, b, b_embedded], None)
     output_a = outputs[a]
     output_a_embedded = outputs[a_embedded]
     output_b = outputs[b]
     output_b_embedded = outputs[b_embedded]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_a_embedded.eval())
-      _assert_sparse_tensor_value(
-          self, output_b.eval(), output_b_embedded.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                self.evaluate(output_a_embedded))
+    _assert_sparse_tensor_value(self, self.evaluate(output_b),
+                                self.evaluate(output_b_embedded))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
     # Inputs.
     vocabulary_size = 3
     # -1 values are ignored.
-    input_a = np.array(
-        [[2, -1, -1],  # example 0, ids [2]
-         [0, 1, -1]])  # example 1, ids [0, 1]
-    input_b = np.array(
-        [[0, -1, -1],  # example 0, ids [0]
-         [-1, -1, -1]])  # example 1, ids []
-    input_features = {
-        'aaa': input_a,
-        'bbb': input_b
-    }
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+    input_features = {'aaa': input_a, 'bbb': input_b}
 
     # Embedding variable.
     embedding_dimension = 2
@@ -4564,6 +7426,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -4593,36 +7456,39 @@ class SharedEmbeddingColumnTest(test.TestCase):
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
-    state_manager = fc.SharedEmbeddingStateManager(name='shared_feature_layer')
-    embedding_column_a.create_state(state_manager)
-    embedding_column_b.create_state(state_manager)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a.get_dense_tensor(
-        FeatureTransformationCache(input_features), state_manager)
+        fc.FeatureTransformationCache(input_features), None)
     embedding_lookup_b = embedding_column_b.get_dense_tensor(
-        FeatureTransformationCache(input_features), state_manager)
+        fc.FeatureTransformationCache(input_features), None)
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('shared_feature_layer/aaa_bbb_shared_embedding:0',),
+    self.assertItemsEqual(('aaa_bbb_shared_embedding:0',),
                           tuple([v.name for v in global_vars]))
     embedding_var = global_vars[0]
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, embedding_var.eval())
-      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
-      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(embedding_var))
+    self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a))
+    self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
     # -1 values are ignored.
-    input_a = np.array(
-        [[2, -1, -1],  # example 0, ids [2]
-         [0, 1, -1]])  # example 1, ids [0, 1]
-    input_b = np.array(
-        [[0, -1, -1],  # example 0, ids [0]
-         [-1, -1, -1]])  # example 1, ids []
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
     # Specify shape, because dense input must have rank specified.
     input_a_placeholder = array_ops.placeholder(
         dtype=dtypes.int64, shape=[None, 3])
@@ -4644,6 +7510,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -4659,35 +7526,36 @@ class SharedEmbeddingColumnTest(test.TestCase):
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
-    state_manager = fc.SharedEmbeddingStateManager()
-    embedding_column_a.create_state(state_manager)
-    embedding_column_b.create_state(state_manager)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a.get_dense_tensor(
-        FeatureTransformationCache(input_features), state_manager)
+        fc.FeatureTransformationCache(input_features), None)
     embedding_lookup_b = embedding_column_b.get_dense_tensor(
-        FeatureTransformationCache(input_features), state_manager)
+        fc.FeatureTransformationCache(input_features), None)
 
     with _initialized_session() as sess:
       sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     # Inputs.
     batch_size = 2
     vocabulary_size = 3
     # -1 values are ignored.
-    input_a = np.array(
-        [[2, -1, -1],  # example 0, ids [2]
-         [0, 1, -1]])  # example 1, ids [0, 1]
-    input_b = np.array(
-        [[0, -1, -1],  # example 0, ids [0]
-         [-1, -1, -1]])  # example 1, ids []
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
 
     # Embedding variable.
     embedding_dimension = 2
     embedding_shape = (vocabulary_size, embedding_dimension)
     zeros_embedding_values = np.zeros(embedding_shape)
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual(embedding_shape, shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -4705,9 +7573,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         initializer=_initializer)
 
     with ops.Graph().as_default():
-      model = fc.LinearModel(
-          (embedding_column_a, embedding_column_b),
-          shared_state_manager=fc.SharedEmbeddingStateManager())
+      model = fc.LinearModel((embedding_column_a, embedding_column_b))
       predictions = model({
           categorical_column_a.name: input_a,
           categorical_column_b.name: input_b
@@ -4718,53 +7584,57 @@ class SharedEmbeddingColumnTest(test.TestCase):
       expected_var_names = (
           'linear_model/bias_weights:0',
           'linear_model/aaa_shared_embedding/weights:0',
-          'shared_embedding_state_manager/aaa_bbb_shared_embedding:0',
+          'aaa_bbb_shared_embedding:0',
           'linear_model/bbb_shared_embedding/weights:0',
       )
       self.assertItemsEqual(
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
       trainable_vars = {
-          v.name: v for v in ops.get_collection(
-              ops.GraphKeys.TRAINABLE_VARIABLES)
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
       bias = trainable_vars['linear_model/bias_weights:0']
-      embedding_weights = trainable_vars[
-          'shared_embedding_state_manager/aaa_bbb_shared_embedding:0']
+      embedding_weights = trainable_vars['aaa_bbb_shared_embedding:0']
       linear_weights_a = trainable_vars[
           'linear_model/aaa_shared_embedding/weights:0']
       linear_weights_b = trainable_vars[
           'linear_model/bbb_shared_embedding/weights:0']
-      with _initialized_session():
-        # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
-
-        # Predictions with all non-zero weights.
-        embedding_weights.assign((
-            (1., 2.),  # id 0
-            (3., 5.),  # id 1
-            (7., 11.)  # id 2
-        )).eval()
-        linear_weights_a.assign(((4.,), (6.,))).eval()
-        # example 0, ids [2], embedding[0] = [7, 11]
-        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
-        # sum(embeddings * linear_weights)
-        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
-        linear_weights_b.assign(((3.,), (5.,))).eval()
-        # example 0, ids [0], embedding[0] = [1, 2]
-        # example 1, ids [], embedding[1] = 0, 0]
-        # sum(embeddings * linear_weights)
-        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
-
-  def _test_feature_layer(self, trainable=True):
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # Predictions with all zero weights.
+      self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+      self.assertAllClose(zeros_embedding_values,
+                          self.evaluate(embedding_weights))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b))
+      self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions))
+
+      # Predictions with all non-zero weights.
+      self.evaluate(
+          embedding_weights.assign((
+              (1., 2.),  # id 0
+              (3., 5.),  # id 1
+              (7., 11.)  # id 2
+          )))
+      self.evaluate(linear_weights_a.assign(((4.,), (6.,))))
+      # example 0, ids [2], embedding[0] = [7, 11]
+      # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+      # sum(embeddings * linear_weights)
+      # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+      self.evaluate(linear_weights_b.assign(((3.,), (5.,))))
+      # example 0, ids [0], embedding[0] = [1, 2]
+      # example 1, ids [], embedding[1] = 0, 0]
+      # sum(embeddings * linear_weights)
+      # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
+      self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions))
+
+  def _test_dense_features(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
     sparse_input_a = sparse_tensor.SparseTensorValue(
@@ -4799,6 +7669,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -4841,8 +7712,6 @@ class SharedEmbeddingColumnTest(test.TestCase):
         dimension=embedding_dimension,
         initializer=_initializer,
         trainable=trainable)
-    shared_state_manager = fc.SharedEmbeddingStateManager(
-        name='shared_feature_layer')
 
     features = {
         'aaa': sparse_input_a,
@@ -4852,89 +7721,66 @@ class SharedEmbeddingColumnTest(test.TestCase):
     }
 
     # Provide sparse input and get dense result.
-    feature_layer = FeatureLayer(
+    dense_features = fc.DenseFeatures(
         feature_columns=(embedding_column_b, embedding_column_a,
-                         embedding_column_c, embedding_column_d),
-        shared_state_manager=shared_state_manager)(
-            features)
+                         embedding_column_c, embedding_column_d))(
+                             features)
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual([
-        'shared_feature_layer/aaa_bbb_shared_embedding:0',
-        'shared_feature_layer/ccc_ddd_shared_embedding:0'
-    ], tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(
+        ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'],
+        tuple([v.name for v in global_vars]))
+    for v in global_vars:
+      self.assertTrue(isinstance(v, variables_lib.RefVariable))
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     if trainable:
-      self.assertItemsEqual([
-          'shared_feature_layer/aaa_bbb_shared_embedding:0',
-          'shared_feature_layer/ccc_ddd_shared_embedding:0'
-      ], tuple([v.name for v in trainable_vars]))
+      self.assertItemsEqual(
+          ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'],
+          tuple([v.name for v in trainable_vars]))
     else:
       self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
     shared_embedding_vars = global_vars
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
 
-  def test_feature_layer(self):
-    self._test_feature_layer()
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
 
-  def test_feature_layer_no_trainable(self):
-    self._test_feature_layer(trainable=False)
+    self.assertAllEqual(embedding_values,
+                        self.evaluate(shared_embedding_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
 
+  @test_util.run_deprecated_v1
+  def test_dense_features(self):
+    self._test_dense_features()
 
-class SharedEmbeddingStateManagerTest(test.TestCase):
+  @test_util.run_deprecated_v1
+  def test_dense_features_no_trainable(self):
+    self._test_dense_features(trainable=False)
+
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+
+    def _initializer(shape, dtype, partition_info):
+      del shape, dtype, partition_info
+      return ValueError('Not expected to be called')
 
-  def test_basic(self):
-    categorical_column_a = fc.categorical_column_with_identity(
-        key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
-        key='bbb', num_buckets=3)
-    fc.shared_embedding_columns_v2(
-        [categorical_column_a, categorical_column_b], dimension=2)
-    shared_state_manager = fc.SharedEmbeddingStateManager(
-        name='shared_feature_layer')
-    var_a = shared_state_manager.create_variable('aaa_bbb_shared_embedding',
-                                                 [5, 10])
-    var_b = shared_state_manager.create_variable('aaa_bbb_shared_embedding',
-                                                 [5, 10])
-    self.assertEqual(var_a, var_b)
-    self.assertEqual('shared_feature_layer/aaa_bbb_shared_embedding:0',
-                     var_a.name)
-    self.assertIsInstance(var_a, variables_lib.Variable)
-
-  def test_multiple_sets(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    categorical_column_c = fc.categorical_column_with_identity(
-        key='ccc', num_buckets=3)
-    categorical_column_d = fc.categorical_column_with_identity(
-        key='ddd', num_buckets=3)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2(
+        [categorical_column_a, categorical_column_b],
+        dimension=2,
+        initializer=_initializer)
 
-    fc.shared_embedding_columns_v2(
-        [categorical_column_a, categorical_column_b], dimension=2)
-    fc.shared_embedding_columns_v2(
-        [categorical_column_c, categorical_column_d], dimension=2)
-    shared_state_manager = fc.SharedEmbeddingStateManager(
-        name='shared_feature_layer')
-    var_a = shared_state_manager.create_variable('aaa_bbb_shared_embedding',
-                                                 [5, 10])
-    var_c = shared_state_manager.create_variable('ccc_ddd_shared_embedding',
-                                                 [5, 10])
-    self.assertIsInstance(var_a, variables_lib.Variable)
-    self.assertIsInstance(var_c, variables_lib.Variable)
-    self.assertNotEquals(var_a, var_c)
-    self.assertEqual('shared_feature_layer/aaa_bbb_shared_embedding:0',
-                     var_a.name)
-    self.assertEqual('shared_feature_layer/ccc_ddd_shared_embedding:0',
-                     var_c.name)
+    self.assertEqual([categorical_column_a], embedding_column_a.parents)
+    self.assertEqual([categorical_column_b], embedding_column_b.parents)
+    # TODO(rohanj): Add tests for (from|get)_config once implemented
 
 
 class WeightedCategoricalColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
@@ -4946,7 +7792,16 @@ class WeightedCategoricalColumnTest(test.TestCase):
         'ids': parsing_ops.VarLenFeature(dtypes.int64),
         'values': parsing_ops.VarLenFeature(dtypes.float32)
     }, column.parse_example_spec)
+    self.assertTrue(column._is_v2_column)
+
+  def test_is_v2_column(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc_old._categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    self.assertFalse(column._is_v2_column)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     """Tests deepcopy of categorical_column_with_hash_bucket."""
     original = fc.weighted_categorical_column(
@@ -4987,7 +7842,10 @@ class WeightedCategoricalColumnTest(test.TestCase):
         values=('omar', 'stringer', 'marlo'),
         dense_shape=(2, 2))
     with self.assertRaisesRegexp(ValueError, 'Bad dtype'):
-      _transform_features({'ids': strings, 'values': strings}, (column,), None)
+      fc._transform_features_v2({
+          'ids': strings,
+          'values': strings
+      }, (column,), None)
 
   def test_column_name_collision(self):
     with self.assertRaisesRegexp(ValueError, r'Parse config.*already exists'):
@@ -5005,77 +7863,79 @@ class WeightedCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
         dense_shape=(2, 2))
-    with self.assertRaisesRegexp(
-        ValueError, 'values is not in features dictionary'):
-      _transform_features({'ids': inputs}, (column,), None)
+    with self.assertRaisesRegexp(ValueError,
+                                 'values is not in features dictionary'):
+      fc._transform_features_v2({'ids': inputs}, (column,), None)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     a_weighted = fc.weighted_categorical_column(a, weight_feature_key='weights')
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer'])),
-            'weights':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[1., 10.]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer'])),
+                'weights':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[1., 10.]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_weighted]))
+        features=fc.make_parse_example_spec_v2([a_weighted]))
     self.assertIn('aaa', features)
     self.assertIn('weights', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([1., 10.], dtype=np.float32),
-              dense_shape=[1, 2]),
-          features['weights'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([1., 10.], dtype=np.float32),
+            dense_shape=[1, 2]), self.evaluate(features['weights']))
+
+  @test_util.run_deprecated_v1
   def test_transform_features(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(0, 1, 0),
-        dense_shape=(2, 2))
+        indices=((0, 0), (1, 0), (1, 1)), values=(0, 1, 0), dense_shape=(2, 2))
     weights = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0.5, 1.0, 0.1),
         dense_shape=(2, 2))
-    id_tensor, weight_tensor = _transform_features({
+    id_tensor, weight_tensor = fc._transform_features_v2({
         'ids': inputs,
         'values': weights,
     }, (column,), None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=weights.indices,
-              values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array(inputs.values, dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=weights.indices,
+            values=np.array(weights.values, dtype=np.float32),
+            dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_features_dense_input(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
@@ -5085,55 +7945,57 @@ class WeightedCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0.5, 1.0, 0.1),
         dense_shape=(2, 2))
-    id_tensor, weight_tensor = _transform_features({
+    id_tensor, weight_tensor = fc._transform_features_v2({
         'ids': ((0, -1), (1, 0)),
         'values': weights,
     }, (column,), None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_tensor.eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=weights.indices,
-              values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((0, 1, 0), dtype=np.int64),
+            dense_shape=(2, 2)), self.evaluate(id_tensor))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=weights.indices,
+            values=np.array(weights.values, dtype=np.float32),
+            dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_features_dense_weights(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 1, 0),
-        dense_shape=(2, 2))
-    id_tensor, weight_tensor = _transform_features({
+        indices=((0, 0), (1, 0), (1, 1)), values=(2, 1, 0), dense_shape=(2, 2))
+    id_tensor, weight_tensor = fc._transform_features_v2({
         'ids': inputs,
         'values': ((.5, 0.), (1., .1)),
     }, (column,), None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((.5, 1., .1), dtype=np.float32),
-              dense_shape=(2, 2)),
-          weight_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array(inputs.values, dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((.5, 1., .1), dtype=np.float32),
+            dense_shape=(2, 2)), self.evaluate(weight_tensor))
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
@@ -5154,15 +8016,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2))
       })
       weight_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
-        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
-        # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_linear_model_mismatched_shape(self):
     column = fc.weighted_categorical_column(
@@ -5208,7 +8073,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_linear_model_mismatched_dense_shape(self):
     column = fc.weighted_categorical_column(
@@ -5226,17 +8091,292 @@ class WeightedCategoricalColumnTest(test.TestCase):
           'values': ((.5,), (1.,), (.1,))
       })
       weight_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
-        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
-        # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
+
+  def test_old_linear_model(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(.5, 1., .1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
+
+  def test_old_linear_model_mismatched_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError,
+                                   r'Dimensions.*are not compatible'):
+        fc_old.linear_model({
+            'ids':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=(0, 2, 1),
+                    dense_shape=(2, 2)),
+            'values':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (0, 1), (1, 0), (1, 1)),
+                    values=(.5, 11., 1., .1),
+                    dense_shape=(2, 2))
+        }, (column,))
+
+  def test_old_linear_model_mismatched_dense_values(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values': ((.5,), (1.,))
+      }, (column,),
+                                        sparse_combiner='mean')
+      # Disabling the constant folding optimizer here since it changes the
+      # error message differently on CPU and GPU.
+      config = config_pb2.ConfigProto()
+      config.graph_options.rewrite_options.constant_folding = (
+          rewriter_config_pb2.RewriterConfig.OFF)
+      with _initialized_session(config):
+        with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+          self.evaluate(predictions)
+
+  def test_old_linear_model_mismatched_dense_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values': ((.5,), (1.,), (.1,))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
+
+  def test_old_linear_model_old_categorical(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc_old._categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = fc_old.linear_model({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(.5, 1., .1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   # TODO(ptucker): Add test with embedding of weighted categorical.
 
+  @test_util.run_deprecated_v1
+  def test_serialization(self):
+    categorical_column = fc.categorical_column_with_identity(
+        key='ids', num_buckets=3)
+    column = fc.weighted_categorical_column(
+        categorical_column=categorical_column, weight_feature_key='weight')
+
+    self.assertEqual([categorical_column, 'weight'], column.parents)
+
+    config = column._get_config()
+    self.assertEqual({
+        'categorical_column': {
+            'config': {
+                'key': 'ids',
+                'number_buckets': 3,
+                'default_value': None
+            },
+            'class_name': 'IdentityCategoricalColumn'
+        },
+        'dtype': 'float32',
+        'weight_feature_key': 'weight'
+    }, config)
+
+    self.assertEqual(column, fc.WeightedCategoricalColumn._from_config(config))
+
+    new_column = fc.WeightedCategoricalColumn._from_config(
+        config, columns_by_name={categorical_column.name: categorical_column})
+    self.assertEqual(column, new_column)
+    self.assertIs(categorical_column, new_column.categorical_column)
+
+
+class FeatureColumnForSerializationTest(BaseFeatureColumnForTests):
+
+  @property
+  def _is_v2_column(self):
+    return True
+
+  @property
+  def name(self):
+    return 'BadParentsFeatureColumn'
+
+  def transform_feature(self, transformation_cache, state_manager):
+    return 'Output'
+
+  @property
+  def parse_example_spec(self):
+    pass
+
+
+class SerializationTest(test.TestCase):
+  """Tests for serialization, deserialization helpers."""
+
+  def test_serialize_non_feature_column(self):
+
+    class NotAFeatureColumn(object):
+      pass
+
+    with self.assertRaisesRegexp(ValueError, 'is not a FeatureColumn'):
+      fc.serialize_feature_column(NotAFeatureColumn())
+
+  def test_deserialize_invalid_config(self):
+    with self.assertRaisesRegexp(ValueError, 'Improper config format: {}'):
+      fc.deserialize_feature_column({})
+
+  def test_deserialize_config_missing_key(self):
+    config_missing_key = {
+        'config': {
+            # Dtype is missing and should cause a failure.
+            # 'dtype': 'int32',
+            'default_value': None,
+            'key': 'a',
+            'normalizer_fn': None,
+            'shape': (2,)
+        },
+        'class_name': 'NumericColumn'
+    }
+    with self.assertRaisesRegexp(ValueError, 'Invalid config:'):
+      fc.deserialize_feature_column(config_missing_key)
+
+  def test_deserialize_invalid_class(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Unknown feature_column_v2: NotExistingFeatureColumnClass'):
+      fc.deserialize_feature_column({
+          'class_name': 'NotExistingFeatureColumnClass',
+          'config': {}
+      })
+
+  def test_deserialization_deduping(self):
+    price = fc.numeric_column('price')
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])
+
+    configs = fc.serialize_feature_columns([price, bucketized_price])
+
+    deserialized_feature_columns = fc.deserialize_feature_columns(configs)
+    self.assertEqual(2, len(deserialized_feature_columns))
+    new_price = deserialized_feature_columns[0]
+    new_bucketized_price = deserialized_feature_columns[1]
+
+    # Ensure these are not the original objects:
+    self.assertIsNot(price, new_price)
+    self.assertIsNot(bucketized_price, new_bucketized_price)
+    # But they are equivalent:
+    self.assertEquals(price, new_price)
+    self.assertEquals(bucketized_price, new_bucketized_price)
+
+    # Check that deduping worked:
+    self.assertIs(new_bucketized_price.source_column, new_price)
+
+  def deserialization_custom_objects(self):
+    # Note that custom_objects is also tested extensively above per class, this
+    # test ensures that the public wrappers also handle it correctly.
+    def _custom_fn(input_tensor):
+      return input_tensor + 42.
+
+    price = fc.numeric_column('price', normalizer_fn=_custom_fn)
+
+    configs = fc.serialize_feature_columns([price])
+
+    deserialized_feature_columns = fc.deserialize_feature_columns(configs)
+
+    self.assertEqual(1, len(deserialized_feature_columns))
+    new_price = deserialized_feature_columns[0]
+
+    # Ensure these are not the original objects:
+    self.assertIsNot(price, new_price)
+    # But they are equivalent:
+    self.assertEquals(price, new_price)
+
+    # Check that normalizer_fn points to the correct function.
+    self.assertIs(new_price.normalizer_fn, _custom_fn)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py
new file mode 100644
index 0000000000000000000000000000000000000000..30dc959e9a9f717bdb5c56bfbdde5ffa9d48c257
--- /dev/null
+++ b/tensorflow/python/framework/auto_control_deps.py
@@ -0,0 +1,286 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""AutomaticControlDependencies and related functionality."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+
+
+class AutomaticControlDependencies(object):
+  """Context manager to automatically add control dependencies.
+
+  Code under this context manager will act as if a sensible set of control
+  dependencies were present. More specifically:
+    1. All stateful ops in the scope will execute
+    2. Stateful ops which modify the same resource will execute in program order
+
+  Note: creating variables in an automatic control dependencies context is not
+  supported (the value of the variables will never change as they will keep
+  getting reinitialized).
+
+  NOT THREAD SAFE
+  """
+
+  def __init__(self):
+    self._returned_tensors = set()
+
+  def mark_as_return(self, tensor):
+    """Acts like identity but marks the `Tensor` as a return value.
+
+    This will possibly return a copy of the `Tensor`. Usage:
+
+    ```
+      with AutomaticControlDependencies() as a:
+       ...
+       t = a.mark_as_return(t)
+      _ = ...(t...)  # i.e. it's safe to use t here
+    ```
+
+    Args:
+      tensor: the `Tensor` to be marked
+
+    Returns:
+      a copy of the `Tensor`.
+    """
+    if isinstance(tensor, ops.IndexedSlices):
+      values = array_ops.identity(tensor.values)
+      indices = array_ops.identity(tensor.indices)
+      self._returned_tensors.add(indices)
+      self._returned_tensors.add(values)
+      return ops.IndexedSlices(values, indices, dense_shape=tensor.dense_shape)
+    elif isinstance(tensor, sparse_tensor.SparseTensor):
+      values = array_ops.identity(tensor.values)
+      indices = array_ops.identity(tensor.indices)
+      self._returned_tensors.add(indices)
+      self._returned_tensors.add(values)
+      return sparse_tensor.SparseTensor(
+          indices, values, dense_shape=tensor.dense_shape)
+    elif isinstance(tensor, tensor_array_ops.TensorArray):
+      flow = array_ops.identity(tensor.flow)
+      self._returned_tensors.add(flow)
+      return tensor_array_ops.build_ta_with_new_flow(tensor, flow)
+    # We want to make the return values depend on the stateful operations, but
+    # we don't want to introduce a cycle, so we make the return value the result
+    # of a new identity operation that the stateful operations definitely don't
+    # depend on.
+    tensor = array_ops.identity(tensor)
+    self._returned_tensors.add(tensor)
+    return tensor
+
+  def __enter__(self):
+    if context.executing_eagerly():
+      return self
+    # This code assumes no other thread is adding ops to the graph while
+    # we're adding ops to the graph.
+    # TODO(apassos): Fix this by locking the graph or using a temporary
+    # graph (but that would mess up devices and collections at least,
+    # probably other things as well).
+    self._graph = ops.get_default_graph()
+    self._n_operations = len(self._graph.get_operations())
+    return self
+
+  def _process_switch(self, switch_op, ops_which_must_run,
+                      last_op_using_resource_tensor, merge_for_resource):
+    """Processes a switch node for a resource input.
+
+    When tensorflow creates a cond, it creates a control flow context for each
+    branch of the cond. Each external tensor accessed by that branch is routed
+    through a switch op, which gets created in the graph _after_ the op which
+    uses that tensor get created.
+
+    If the resource comes from another switch op we process that one first.
+
+    _process_switch creates a corresponding merge node for the switch node. This
+    merge node is added to the outer control flow context of the switch
+    node. We also ensure that:
+
+      1. The switch node executes after the previous op which used the resource
+         tensor
+
+      2. Any op which uses a resource output of the switch node executes before
+         the merge for the switch node.
+
+      3. The next op which uses the input resource to the switch node (which
+         might be another switch node for the other branch of the conditional)
+         will execute after the merge node is done.
+
+      4. The merge node is marked as must_run so it will run even if no
+         subsequent operation uses the resource.
+
+    Args:
+      switch_op: the switch op to be processed
+      ops_which_must_run: the set of ops which must run
+      last_op_using_resource_tensor: map from resource tensor to last op using
+        it
+      merge_for_resource: map from resource tensor to merge which must follow
+        all usages of it.
+    """
+    inp = switch_op.inputs[0]
+    if inp.dtype == dtypes_module.resource and inp.op.type == "Switch":
+      self._process_switch(inp.op, ops_which_must_run,
+                           last_op_using_resource_tensor, merge_for_resource)
+    if switch_op.outputs[0] in merge_for_resource:
+      return
+    new_merge = control_flow_ops.merge(switch_op.outputs,
+                                       name="artificial_merge")
+    new_merge[0].op._control_flow_context = (  # pylint: disable=protected-access
+        switch_op._control_flow_context.outer_context)  # pylint: disable=protected-access
+    # Ensures the merge always runs
+    ops_which_must_run.add(new_merge[0].op)
+    if inp in last_op_using_resource_tensor:
+      # Ensures the switch executes after the previous op using the resource.
+      switch_op._add_control_input(last_op_using_resource_tensor[inp])  # pylint: disable=protected-access
+    # Ensure the next op outside the cond happens after the merge.
+    last_op_using_resource_tensor[inp] = new_merge[0].op
+    if inp in merge_for_resource:
+      merge_for_resource[inp]._add_control_input(new_merge[0].op)  # pylint: disable=protected-access
+    for o in switch_op.outputs:
+      # Ensures the merge will execute after all ops inside the cond
+      merge_for_resource[o] = new_merge[0].op
+
+  def __exit__(self, unused_type, unused_value, unused_traceback):
+    if context.executing_eagerly():
+      return
+
+    if self._graph is not ops.get_default_graph():
+      raise RuntimeError(
+          "Graph changed while trying to add control dependencies.")
+
+    # map from resource tensor to the last op which used it
+    last_op_using_resource_tensor = {}
+    # set of conditional and loop exits
+    ops_which_must_run = set()
+    # merge which must depend on ops which use this resource
+    merge_for_resource = {}
+
+    new_operations = self._graph.get_operations()[self._n_operations:]
+
+    # Ensures that uses of resource tensors get serialized properly and all
+    # execute. This is done by keeping a map from resource tensor to the last op
+    # in graph-construction order which used it (last_op_using_resource_tensor).
+    #
+    # Conditionals are written in TensorFlow such that every external tensor
+    # accessed in the conditional goes through a switch op and every return
+    # tensor (it's guaranteed that there will be at least one) goes through a
+    # merge op.
+    #
+    # To handle conditionals, switches are handled in a special way (see
+    # comments for _process_switch). Merge nodes created by TF's conditional
+    # logic (as opposed to by _process_switch) are forced to run and also get a
+    # control dependency added to them to ensure all stateful ops inside their
+    # control flow context run.
+    #
+    # We also ensure that if an op is using a resource output by a switch node
+    # (that is, a resource tensor for which there's a value in
+    # merge_for_resource) this op will run before the merge for that resource.
+    #
+    # We try to add control inputs to nodes respecting their control flow
+    # contexts to avoid dead nodes propagating everywhere and leading to
+    # "retval[0] doesn't have value" errors. If a node gets a control dependency
+    # on a dead node (i.e. a note from an untaken control flow branch) that node
+    # will be marked as dead unless it's a merge node.
+    #
+    # TODO(apassos): serialize non-resource-taking stateful ops as well, and
+    # test that it works. Support while loops. Support init_scope escaping from
+    # this.
+    for op in new_operations:
+      # TODO(apassos) make this code safely support while loops.
+      if control_flow_util.IsInWhileLoop(op):
+        continue
+      control_inputs = set()
+      # Ensure stateful ops run
+      if (op.type not in self._graph._registered_ops  # pylint: disable=protected-access
+          or self._graph._registered_ops[op.type].is_stateful):  # pylint: disable=protected-access
+        ops_which_must_run.add(op)
+      # Ignore switches (they're handled separately)
+      if op.type == "Switch" and op.inputs[0].dtype == dtypes_module.resource:
+        continue
+      # Make merges trigger all other computation which must run
+      if op.type == "Merge":
+        for o in ops_which_must_run:
+          op._add_control_input(o)  # pylint: disable=protected-access
+          for inp in o.inputs:
+            if inp in last_op_using_resource_tensor:
+              last_op_using_resource_tensor[inp] = op
+        ops_which_must_run = set([op])
+        continue
+      found_resource = False
+      for inp in op.inputs:
+        if inp.dtype == dtypes_module.resource:
+          found_resource = True
+          # Deal with switches, finally.
+          if inp.op.type == "Switch":
+            self._process_switch(inp.op, ops_which_must_run,
+                                 last_op_using_resource_tensor,
+                                 merge_for_resource)
+          # Ensure uses of resources are serialized
+          if inp in last_op_using_resource_tensor:
+            if (last_op_using_resource_tensor[inp]._control_flow_context  # pylint: disable=protected-access
+                is op._control_flow_context):  # pylint: disable=protected-access
+              control_inputs.add(last_op_using_resource_tensor[inp])
+          # Ensure merges happen after the closing of a cond block
+          if inp in merge_for_resource:
+            merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
+          last_op_using_resource_tensor[inp] = op
+      if (op.op_def.is_stateful and not found_resource
+          and op._control_flow_context is None):  # pylint: disable=protected-access
+        if None in last_op_using_resource_tensor:
+          op._add_control_input(last_op_using_resource_tensor[None])  # pylint: disable=protected-access
+        last_op_using_resource_tensor[None] = op
+      control_inputs = [c for c in control_inputs
+                        if c._control_flow_context is op._control_flow_context]  # pylint: disable=protected-access
+      op._add_control_inputs(control_inputs)  # pylint: disable=protected-access
+
+    # Ensure all ops which must run do run
+    for r in self._returned_tensors:
+      if ops_which_must_run:
+        r.op._add_control_inputs(  # pylint: disable=protected-access
+            [o for o in ops_which_must_run
+             if o._control_flow_context is r.op._control_flow_context])  # pylint: disable=protected-access
+
+
+def automatic_control_dependencies(f):
+  """Wraps f to automatically insert control dependencies.
+
+  The inserted dependencies ensure that:
+    1. All stateful ops in f run when the result of f runs
+    2. Updates to the same resources happen in order.
+
+  Args:
+    f: the function to be wrapped.
+
+  Returns:
+    The wrapped function.
+  """
+
+  def wrapper(*args, **kwargs):
+    with AutomaticControlDependencies() as a:
+      result = f(*args, **kwargs)
+      result_flat = [a.mark_as_return(t) for t in nest.flatten(result)]
+      return nest.pack_sequence_as(result, result_flat)
+
+  return tf_decorator.make_decorator(f, wrapper)
diff --git a/tensorflow/python/framework/auto_control_deps_test.py b/tensorflow/python/framework/auto_control_deps_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f5de45b9ee44da8a3440b5f3a5d55fbf7b8a02f
--- /dev/null
+++ b/tensorflow/python/framework/auto_control_deps_test.py
@@ -0,0 +1,287 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import auto_control_deps as acd
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+from tensorflow.python.training import momentum
+
+
+class AutomaticControlDependenciesTest(test.TestCase):
+
+  def testBasic(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      with acd.AutomaticControlDependencies() as c:
+        v.assign(v + 1)
+        v.assign(2 * v)
+        val = v.read_value()
+        val = c.mark_as_return(val)
+      self.assertAllEqual(val.eval(), 4.0)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCondMustRun(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with acd.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          v.assign(v + 1)
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        val = v.read_value()
+        val = c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True}), 6.0)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCondMustRunSeparateRead(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with acd.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          v.assign(v + 1)
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        one = constant_op.constant(1.0)
+        one = c.mark_as_return(one)
+      one.eval(feed_dict={p: False})
+      self.assertAllEqual(v.read_value().eval(), 5.0)
+      one.eval(feed_dict={p: True})
+      self.assertAllEqual(v.read_value().eval(), 6.0)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCondNested(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      q = array_ops.placeholder(dtype=dtypes.bool)
+      with acd.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          v.assign(v + 1, name='true')
+          return 1.0
+
+        def false_fn():
+
+          def inner_true_fn():
+            v.assign(v * 2, name='false_true')
+            return 2.0
+
+          def inner_false_fn():
+            v.assign(v * 3, name='false_false')
+            return 3.0
+
+          control_flow_ops.cond(q, inner_true_fn, inner_false_fn)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        with ops.name_scope('final'):
+          val = v.read_value()
+        val = c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False, q: False}), 3.0)
+      self.assertAllEqual(val.eval(feed_dict={p: False, q: True}), 6.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True, q: True}), 7.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True, q: False}), 8.0)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCondOneBranch(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with acd.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        val = v.read_value()
+        val = c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True}), 5.0)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCondOneBranchUpdateBefore(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with acd.AutomaticControlDependencies() as c:
+        v.assign(v * 2)
+
+        def true_fn():
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        val = v.read_value()
+        val = c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False}), 6.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True}), 12.0)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCondOneBranchUpdateAfter(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+      p = array_ops.placeholder(dtype=dtypes.bool)
+      with acd.AutomaticControlDependencies() as c:
+
+        def true_fn():
+          return 0.0
+
+        def false_fn():
+          v.assign(v + 4)
+          return 1.0
+
+        control_flow_ops.cond(p, true_fn, false_fn)
+        v.assign(v * 2)
+        val = v.read_value()
+        val = c.mark_as_return(val)
+      self.assertAllEqual(val.eval(feed_dict={p: False}), 10.0)
+      self.assertAllEqual(val.eval(feed_dict={p: True}), 20.0)
+
+  def testDefunWhileLoopWithCapturedLoopVars(self):
+    n = 3
+    x = constant_op.constant(list(range(n)))
+
+    @function.defun
+    def loop():
+      c = lambda i, x: i < n
+      b = lambda i, x: (i + 1, x + 1)
+      i, out = control_flow_ops.while_loop(c, b, (0, x))
+      return i, out
+
+    i, out = loop()
+    self.assertEqual(int(i), 3)
+    self.assertAllEqual(out, [3, 4, 5])
+
+  def testDecorator(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      variables.global_variables_initializer().run()
+
+      @acd.automatic_control_dependencies
+      def f():
+        v.assign(v + 1)
+        v.assign(2 * v)
+        return v.read_value()
+
+      self.assertAllEqual(f().eval(), 4.0)
+
+  def testOptimizerInDefun(self):
+    def loss(v):
+      return v**2
+
+    optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
+
+    @function.defun
+    def train():
+      self.v = resource_variable_ops.ResourceVariable(1.0)
+      grad = backprop.implicit_grad(loss)(self.v)
+      optimizer.apply_gradients(grad)
+      return self.v.read_value()
+
+    value = train()
+    self.assertEqual(value.numpy(), -1.0)
+
+  def testReturningNonTensorRaisesError(self):
+    optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
+    optimizer.apply_gradients = function.defun(optimizer.apply_gradients)
+    v = resource_variable_ops.ResourceVariable(1.0)
+    grad = backprop.implicit_grad(lambda v: v**2)(v)
+
+    with self.assertRaisesRegexp(TypeError,
+                                 '.*must return zero or more Tensors.*'):
+      # TODO(akshayka): We might want to allow defun-ing Python functions
+      # that return operations (and just execute the op instead of running it).
+      optimizer.apply_gradients(grad)
+
+  # TODO(b/111663004): This should work when the outer context is graph
+  # building.
+  def testOptimizerNonSlotVarsInDefunNoError(self):
+    def loss(v):
+      return v**2
+
+    optimizer = adam.AdamOptimizer(learning_rate=1.0)
+
+    @function.defun
+    def train():
+      self.v = resource_variable_ops.ResourceVariable(1.0)
+      grad = backprop.implicit_grad(loss)(self.v)
+      optimizer.apply_gradients(grad)
+      return self.v.read_value()
+
+    train()
+
+  def testOptimizerInDefunWithCapturedVariable(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+    def loss():
+      return v**2
+
+    optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
+
+    @function.defun
+    def train():
+      grad = backprop.implicit_grad(loss)()
+      optimizer.apply_gradients(grad)
+
+    train()
+    self.assertEqual(v.numpy(), -1.0)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index 4b2706d4cf8df70818158d18ebf260c52d4f4218..ade0797dcdbac0334a7cc7e657922b2d1139be4c 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -106,16 +106,17 @@ def convert_to_eager_tensor(value, ctx, dtype=None):
     tensor = scalar_cache.get(cache_key, None)
     if tensor is not None:
       return ops.EagerTensor(
-          value, context=handle, device=device, dtype=dtype, other_value=tensor)
-    t = ops.EagerTensor(value, context=handle, device=device, dtype=dtype)
+          value, handle, device, dtype, tensor)
+    t = ops.EagerTensor(value, handle, device, dtype)
     scalar_cache[cache_key] = t
     return t
   else:
-    return ops.EagerTensor(value, context=handle, device=device, dtype=dtype)
+    return ops.EagerTensor(value, handle, device, dtype)
 
 
-@tf_export("constant")
-def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
+@tf_export(v1=["constant"])
+def constant_v1(
+    value, dtype=None, shape=None, name="Const", verify_shape=False):
   """Creates a constant tensor.
 
   The resulting tensor is populated with values of type `dtype`, as
@@ -174,6 +175,79 @@ def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
   Raises:
     TypeError: if shape is incorrectly specified or unsupported.
   """
+  return _constant_impl(value, dtype, shape, name, verify_shape=verify_shape,
+                        allow_broadcast=False)
+
+
+@tf_export("constant", v1=[])
+def constant(value, dtype=None, shape=None, name="Const"):
+  """Creates a constant tensor.
+
+  The resulting tensor is populated with values of type `dtype`, as
+  specified by arguments `value` and (optionally) `shape` (see examples
+  below).
+
+  The argument `value` can be a constant value, or a list of values of type
+  `dtype`. If `value` is a list, then the length of the list must be less
+  than or equal to the number of elements implied by the `shape` argument (if
+  specified). In the case where the list length is less than the number of
+  elements specified by `shape`, the last element in the list will be used
+  to fill the remaining entries.
+
+  The argument `shape` is optional. If present, it specifies the dimensions of
+  the resulting tensor. If not present, the shape of `value` is used.
+
+  If the argument `dtype` is not specified, then the type is inferred from
+  the type of `value`.
+
+  For example:
+
+  ```python
+  # Constant 1-D Tensor populated with value list.
+  tensor = tf.constant([1, 2, 3, 4, 5, 6]) => [1 2 3 4 5 6]
+
+  # Constant 1-D Tensor populated with value list.
+  tensor = tf.constant([1, 2, 3, 4, 5, 6], shape=(2,3))
+       => [[1 2 3], [4 5 6]]
+
+  # Constant 2-D tensor populated with scalar value -1.
+  tensor = tf.constant(-1.0, shape=[2, 3]) => [[-1. -1. -1.]
+                                               [-1. -1. -1.]]
+  ```
+
+  `tf.constant` differs from `tf.fill` in a few ways:
+
+  *   `tf.constant` supports arbitrary constants, not just uniform scalar
+      Tensors like `tf.fill`.
+  *   `tf.constant` creates a `Const` node in the computation graph with the
+      exact value at graph construction time. On the other hand, `tf.fill`
+      creates an Op in the graph that is expanded at runtime.
+  *   Because `tf.constant` only embeds constant values in the graph, it does
+      not support dynamic shapes based on other runtime Tensors, whereas
+      `tf.fill` does.
+
+  Args:
+    value:          A constant value (or list) of output type `dtype`.
+
+    dtype:          The type of the elements of the resulting tensor.
+
+    shape:          Optional dimensions of resulting tensor.
+
+    name:           Optional name for the tensor.
+
+  Returns:
+    A Constant Tensor.
+
+  Raises:
+    TypeError: if shape is incorrectly specified or unsupported.
+  """
+  return _constant_impl(value, dtype, shape, name, verify_shape=False,
+                        allow_broadcast=True)
+
+
+def _constant_impl(
+    value, dtype, shape, name, verify_shape, allow_broadcast):
+  """Implementation of constant."""
   ctx = context.context()
   if ctx.executing_eagerly():
     t = convert_to_eager_tensor(value, ctx, dtype)
@@ -205,7 +279,8 @@ def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
   tensor_value = attr_value_pb2.AttrValue()
   tensor_value.tensor.CopyFrom(
       tensor_util.make_tensor_proto(
-          value, dtype=dtype, shape=shape, verify_shape=verify_shape))
+          value, dtype=dtype, shape=shape, verify_shape=verify_shape,
+          allow_broadcast=allow_broadcast))
   dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
   const_tensor = g.create_op(
       "Const", [], [dtype_value.type],
diff --git a/tensorflow/python/framework/device.py b/tensorflow/python/framework/device.py
index 7f6e0a75a5c508e35ff5bf3c28d4ab31af205715..e7ac6444a4ac1e116675dbb059cd1953df1213ab 100644
--- a/tensorflow/python/framework/device.py
+++ b/tensorflow/python/framework/device.py
@@ -23,7 +23,7 @@ import threading
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("DeviceSpec")
+@tf_export(v1=["DeviceSpec"])
 class DeviceSpec(object):
   """Represents a (possibly partial) specification for a TensorFlow device.
 
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 64d3b42d89d78aeae6f2325f4844379cf27927c8..9a4fe4e93b32aeedcb74cf0f7b2703f64d9db23a 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from six.moves import builtins
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.python import pywrap_tensorflow
@@ -322,57 +323,59 @@ dtype_range = {
 
 # Define standard wrappers for the types_pb2.DataType enum.
 resource = DType(types_pb2.DT_RESOURCE)
-tf_export("resource").export_constant(__name__, "resource")
+tf_export("dtypes.resource", "resource").export_constant(__name__, "resource")
 variant = DType(types_pb2.DT_VARIANT)
-tf_export("variant").export_constant(__name__, "variant")
+tf_export("dtypes.variant", "variant").export_constant(__name__, "variant")
 float16 = DType(types_pb2.DT_HALF)
-tf_export("float16").export_constant(__name__, "float16")
+tf_export("dtypes.float16", "float16").export_constant(__name__, "float16")
 half = float16
-tf_export("half").export_constant(__name__, "half")
+tf_export("dtypes.half", "half").export_constant(__name__, "half")
 float32 = DType(types_pb2.DT_FLOAT)
-tf_export("float32").export_constant(__name__, "float32")
+tf_export("dtypes.float32", "float32").export_constant(__name__, "float32")
 float64 = DType(types_pb2.DT_DOUBLE)
-tf_export("float64").export_constant(__name__, "float64")
+tf_export("dtypes.float64", "float64").export_constant(__name__, "float64")
 double = float64
-tf_export("double").export_constant(__name__, "double")
+tf_export("dtypes.double", "double").export_constant(__name__, "double")
 int32 = DType(types_pb2.DT_INT32)
-tf_export("int32").export_constant(__name__, "int32")
+tf_export("dtypes.int32", "int32").export_constant(__name__, "int32")
 uint8 = DType(types_pb2.DT_UINT8)
-tf_export("uint8").export_constant(__name__, "uint8")
+tf_export("dtypes.uint8", "uint8").export_constant(__name__, "uint8")
 uint16 = DType(types_pb2.DT_UINT16)
-tf_export("uint16").export_constant(__name__, "uint16")
+tf_export("dtypes.uint16", "uint16").export_constant(__name__, "uint16")
 uint32 = DType(types_pb2.DT_UINT32)
-tf_export("uint32").export_constant(__name__, "uint32")
+tf_export("dtypes.uint32", "uint32").export_constant(__name__, "uint32")
 uint64 = DType(types_pb2.DT_UINT64)
-tf_export("uint64").export_constant(__name__, "uint64")
+tf_export("dtypes.uint64", "uint64").export_constant(__name__, "uint64")
 int16 = DType(types_pb2.DT_INT16)
-tf_export("int16").export_constant(__name__, "int16")
+tf_export("dtypes.int16", "int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
-tf_export("int8").export_constant(__name__, "int8")
+tf_export("dtypes.int8", "int8").export_constant(__name__, "int8")
 string = DType(types_pb2.DT_STRING)
-tf_export("string").export_constant(__name__, "string")
+tf_export("dtypes.string", "string").export_constant(__name__, "string")
 complex64 = DType(types_pb2.DT_COMPLEX64)
-tf_export("complex64").export_constant(__name__, "complex64")
+tf_export("dtypes.complex64", "complex64").export_constant(
+    __name__, "complex64")
 complex128 = DType(types_pb2.DT_COMPLEX128)
-tf_export("complex128").export_constant(__name__, "complex128")
+tf_export("dtypes.complex128", "complex128").export_constant(
+    __name__, "complex128")
 int64 = DType(types_pb2.DT_INT64)
-tf_export("int64").export_constant(__name__, "int64")
+tf_export("dtypes.int64", "int64").export_constant(__name__, "int64")
 bool = DType(types_pb2.DT_BOOL)  # pylint: disable=redefined-builtin
-tf_export("bool").export_constant(__name__, "bool")
+tf_export("dtypes.bool", "bool").export_constant(__name__, "bool")
 qint8 = DType(types_pb2.DT_QINT8)
-tf_export("qint8").export_constant(__name__, "qint8")
+tf_export("dtypes.qint8", "qint8").export_constant(__name__, "qint8")
 quint8 = DType(types_pb2.DT_QUINT8)
-tf_export("quint8").export_constant(__name__, "quint8")
+tf_export("dtypes.quint8", "quint8").export_constant(__name__, "quint8")
 qint16 = DType(types_pb2.DT_QINT16)
-tf_export("qint16").export_constant(__name__, "qint16")
+tf_export("dtypes.qint16", "qint16").export_constant(__name__, "qint16")
 quint16 = DType(types_pb2.DT_QUINT16)
-tf_export("quint16").export_constant(__name__, "quint16")
+tf_export("dtypes.quint16", "quint16").export_constant(__name__, "quint16")
 qint32 = DType(types_pb2.DT_QINT32)
-tf_export("qint32").export_constant(__name__, "qint32")
+tf_export("dtypes.qint32", "qint32").export_constant(__name__, "qint32")
 resource_ref = DType(types_pb2.DT_RESOURCE_REF)
 variant_ref = DType(types_pb2.DT_VARIANT_REF)
 bfloat16 = DType(types_pb2.DT_BFLOAT16)
-tf_export("bfloat16").export_constant(__name__, "bfloat16")
+tf_export("dtypes.bfloat16", "bfloat16").export_constant(__name__, "bfloat16")
 float16_ref = DType(types_pb2.DT_HALF_REF)
 half_ref = float16_ref
 float32_ref = DType(types_pb2.DT_FLOAT_REF)
@@ -546,8 +549,8 @@ _NP_TO_TF = frozenset([
     (np.int8, int8),
     (np.complex64, complex64),
     (np.complex128, complex128),
-    (np.object, string),
-    (np.bool, bool),
+    (np.object_, string),
+    (np.bool_, bool),
     (_np_qint8, qint8),
     (_np_quint8, quint8),
     (_np_qint16, qint16),
@@ -650,11 +653,15 @@ _QUANTIZED_DTYPES_NO_REF = frozenset([qint8, quint8, qint16, quint16, qint32])
 _QUANTIZED_DTYPES_REF = frozenset(
     [qint8_ref, quint8_ref, qint16_ref, quint16_ref, qint32_ref])
 QUANTIZED_DTYPES = _QUANTIZED_DTYPES_REF.union(_QUANTIZED_DTYPES_NO_REF)
-tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
+tf_export(
+    "dtypes.QUANTIZED_DTYPES",
+    v1=["dtypes.QUANTIZED_DTYPES", "QUANTIZED_DTYPES"]).export_constant(
+        __name__, "QUANTIZED_DTYPES")
 
 _PYTHON_TO_TF = {
-    float: float32,
-    bool: bool,
+    builtins.float: float32,
+    builtins.bool: bool,
+    builtins.object: string
 }
 
 
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index a873670e0461884d06cde1db4db2cf2db98fde3c..719fdc0953ae4d5bbe016b3dc2730f5601c3494e 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -81,10 +81,10 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertIs(dtypes.int8, dtypes.as_dtype(np.int8))
     self.assertIs(dtypes.complex64, dtypes.as_dtype(np.complex64))
     self.assertIs(dtypes.complex128, dtypes.as_dtype(np.complex128))
-    self.assertIs(dtypes.string, dtypes.as_dtype(np.object))
+    self.assertIs(dtypes.string, dtypes.as_dtype(np.object_))
     self.assertIs(dtypes.string,
                   dtypes.as_dtype(np.array(["foo", "bar"]).dtype))
-    self.assertIs(dtypes.bool, dtypes.as_dtype(np.bool))
+    self.assertIs(dtypes.bool, dtypes.as_dtype(np.bool_))
     with self.assertRaises(TypeError):
       dtypes.as_dtype(np.dtype([("f1", np.uint), ("f2", np.int32)]))
 
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index bc3c81b2a2f19bfa89bb2e2a418ea8239a5075d9..37a634d80679b095d319cabcd29208a35c4fe44f 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -40,6 +40,7 @@ _ParseTag = collections.namedtuple("_ParseTag", ["type", "name"])
 
 _BAD_FILE_SUBSTRINGS = [
     os.path.join("tensorflow", "python"),
+    os.path.join("tensorflow", "contrib"),
     "<embedded",
 ]
 
@@ -267,8 +268,8 @@ def compute_field_dict(op):
 def interpolate(error_message, graph):
   """Interpolates an error message.
 
-  The error message can contain tags of the form ^^type:name^^ which will
-  be replaced.
+  The error message can contain tags of the form `{{type name}}` which will be
+  replaced.
 
   Args:
     error_message: A string to interpolate.
diff --git a/tensorflow/python/framework/error_interpolation_test.py b/tensorflow/python/framework/error_interpolation_test.py
index 1b77548592cec08ff4fadfe2e740b746c6a9d115..9eaa4a5f2d04c8baaf720d4b9a32c5c707d33772 100644
--- a/tensorflow/python/framework/error_interpolation_test.py
+++ b/tensorflow/python/framework/error_interpolation_test.py
@@ -23,6 +23,7 @@ import os
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import traceable_stack
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_stack
@@ -112,6 +113,7 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
     self.assertIn("No node-device colocations", summary)
 
 
+@test_util.run_v1_only("b/120545219")
 class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
 
   def setUp(self):
@@ -193,6 +195,7 @@ class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
     self.assertRegexpMatches(interpolated_string, "constant_op.py:[0-9]+.*")
 
 
+@test_util.run_v1_only("b/120545219")
 class InterpolateDeviceSummaryTest(test.TestCase):
 
   def _fancy_device_function(self, unused_op):
@@ -236,6 +239,7 @@ class InterpolateDeviceSummaryTest(test.TestCase):
     self.assertRegexpMatches(result, expected_re)
 
 
+@test_util.run_v1_only("b/120545219")
 class InterpolateColocationSummaryTest(test.TestCase):
 
   def setUp(self):
@@ -260,11 +264,13 @@ class InterpolateColocationSummaryTest(test.TestCase):
 
     self.graph = node_three.graph
 
+  @test_util.run_v1_only("b/120545219")
   def testNodeThreeHasColocationInterpolation(self):
     message = "{{colocation_node Three_with_one}}"
     result = error_interpolation.interpolate(message, self.graph)
     self.assertIn("colocate_with(One)", result)
 
+  @test_util.run_v1_only("b/120545219")
   def testNodeFourHasColocationInterpolationForNodeThreeOnly(self):
     message = "{{colocation_node Four_with_three}}"
     result = error_interpolation.interpolate(message, self.graph)
@@ -273,12 +279,14 @@ class InterpolateColocationSummaryTest(test.TestCase):
         "One", result,
         "Node One should not appear in Four_with_three's summary:\n%s" % result)
 
+  @test_util.run_v1_only("b/120545219")
   def testNodeFiveHasColocationInterpolationForNodeOneAndTwo(self):
     message = "{{colocation_node Five_with_one_with_two}}"
     result = error_interpolation.interpolate(message, self.graph)
     self.assertIn("colocate_with(One)", result)
     self.assertIn("colocate_with(Two)", result)
 
+  @test_util.run_v1_only("b/120545219")
   def testColocationInterpolationForNodeLackingColocation(self):
     message = "{{colocation_node One}}"
     result = error_interpolation.interpolate(message, self.graph)
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 8b303fa8a9432f6305229ff041bd484686d96cb0..faa4fa7c6fa47f4328c6c04569aacde48b51b6c0 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -30,7 +30,7 @@ from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("errors.OpError", "OpError")
+@tf_export("errors.OpError", v1=["errors.OpError", "OpError"])
 @deprecation.deprecated_endpoints("OpError")
 class OpError(Exception):
   """A generic error that is raised when TensorFlow execution fails.
diff --git a/tensorflow/python/framework/file_system_test.py b/tensorflow/python/framework/file_system_test.py
index 6901715e5d0f40a4cd4c3ba2e2556892210ef8c3..8687bc5a7850b25f363d23451ffeb58a68b5d0ef 100644
--- a/tensorflow/python/framework/file_system_test.py
+++ b/tensorflow/python/framework/file_system_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import load_library
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import io_ops
@@ -36,13 +37,14 @@ class FileSystemTest(test.TestCase):
                                        "test_file_system.so")
     load_library.load_file_system_library(file_system_library)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.cached_session() as sess:
       reader = io_ops.WholeFileReader("test_reader")
       queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
       queue.enqueue_many([["test://foo"]]).run()
       queue.close().run()
-      key, value = sess.run(reader.read(queue))
+      key, value = self.evaluate(reader.read(queue))
     self.assertEqual(key, compat.as_bytes("test://foo"))
     self.assertEqual(value, compat.as_bytes("AAAAAAAAAA"))
 
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd4ed5553e7b0b2445344d5c36c2209e59d64d14
--- /dev/null
+++ b/tensorflow/python/framework/func_graph.py
@@ -0,0 +1,654 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""FuncGraph and related functionality."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.eager.graph_only_ops import graph_placeholder
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework.auto_control_deps import AutomaticControlDependencies
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import custom_gradient
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+# This is to avoid a circular dependency:
+# function -> func_graph
+function = LazyLoader("function", globals(),
+                      "tensorflow.python.eager.function")
+def_function = LazyLoader(
+    "def_function", globals(),
+    "tensorflow.python.eager.def_function")
+
+WHITELIST_COLLECTIONS = [
+    ops.GraphKeys.GLOBAL_VARIABLES,
+    ops.GraphKeys.LOCAL_VARIABLES,
+    ops.GraphKeys.TRAINABLE_VARIABLES,
+    variable_scope._VARSTORE_KEY,  # pylint: disable=protected-access
+    variable_scope._VARSCOPESTORE_KEY  # pylint: disable=protected-access
+]
+
+
+class FuncGraph(ops.Graph):
+  """Graph representing a function body.
+
+  Attributes:
+    name: The name of the function.
+    inputs: Placeholder tensors representing the inputs to this function. The
+      tensors are in this FuncGraph. This represents "regular" inputs as well as
+      captured inputs (i.e. the values of self.captures), with the regular
+      inputs coming first.
+    outputs: Tensors that will be returned by this function. The tensors are in
+      this FuncGraph.
+    structured_outputs: A possibly-nested python object which will be returned
+      by this function. The Tensors in this structure are the same as those of
+      self.outputs. Note that this structure might contain Python `None`s.
+    variables: Variables that should be watched during function execution.
+    outer_graph: The graph this function is defined in. May be another FuncGraph
+      or the global default Graph.
+    captures: Maps external tensor -> internal tensor (i.e. input placeholder).
+      The entries are in the order they were captured.
+    seed: The graph-level random seed.
+  """
+
+  def __init__(self, name, read_only_collections=True):
+    """Construct a new FuncGraph.
+
+    The graph will inherit its graph key, collections, seed, and distribution
+    strategy stack from the current context or graph.
+
+    Args:
+      name: the name of the function.
+      read_only_collections: whether to not write function graph collections
+        back to default graph. Defaults to True.
+    """
+    super(FuncGraph, self).__init__()
+
+    self.name = name
+    self.inputs = []
+    self.outputs = []
+    self.structured_outputs = None
+    self._read_only_collections = read_only_collections
+    self._weak_variables = []
+    self.outer_graph = ops.get_default_graph()
+    self.captures = collections.OrderedDict()
+
+    self._building_function = True
+    # Map from resource tensor name to last op (in program order) which uses
+    # this tensor. Used to enforce that execution order matches program order
+    # for resource tensors.
+    self._last_op_using_resource_tensor = {}
+
+    graph = self.outer_graph
+
+    # pylint: disable=protected-access
+    # TODO(b/112906995, nareshmodi): distribution strategy depends on inheriting
+    # this stack from the default graph even in eager mode. Maybe it should be
+    # part of the eager context? This would also allow us to remove a
+    # get_default_graph() call from the function cache lookup.
+    self._distribution_strategy_stack = list(graph._distribution_strategy_stack)
+    # We ignore device placements from any outer scopes while tracing the
+    # function when possible, to avoid hard-coding them in the function
+    # graph. "Default" placements come from the PartitionedCallOp's placement,
+    # so that the same trace of the Python function may be placed on several
+    # different devices and saved functions may be placed on new devices when
+    # restored.
+    if context.executing_eagerly():
+      self.seed = context.global_seed()
+      device_type = context.context().device_spec.device_type
+      self._xla_compile = (device_type == "TPU" or device_type == "XLA_GPU"
+                           or device_type == "XLA_CPU")
+      if self._distribution_strategy_stack or self._xla_compile:
+        self._add_device_to_stack(context.context().device_name)
+    else:
+      self.seed = graph.seed
+      self._xla_compile = getattr(graph, "_xla_compile", False)
+      # TODO(allenl): Figure out if we can remove colocation stack
+      # specialization (currently used in cond_v2), here and in the cache key.
+      self._colocation_stack = graph._colocation_stack.copy()
+      if (self._distribution_strategy_stack
+          or self._xla_compile
+          or device_stack_has_callable(graph._device_function_stack)):
+        # Hard-code devices from device functions in the function body
+        self._device_function_stack = graph._device_function_stack.copy()
+    if not self._read_only_collections:
+      self._collections = graph._collections
+    else:
+      for collection_name in graph.get_all_collection_keys():
+        if collection_name not in WHITELIST_COLLECTIONS:
+          self._collections[collection_name] = graph.get_collection(
+              collection_name)
+      for collection_name in WHITELIST_COLLECTIONS:
+        self._collections[collection_name] = graph.get_collection_ref(
+            collection_name)
+
+    self._variable_creator_stack = graph._variable_creator_stack
+    # Inherit the graph key, since this is used for matching variables in
+    # optimizers.
+    self._graph_key = graph._graph_key
+    # pylint: enable=protected-access
+
+  @property
+  def output_types(self):
+    return [t.dtype for t in self.outputs]
+
+  @property
+  def output_shapes(self):
+    return [t.shape for t in self.outputs]
+
+  @property
+  def variables(self):
+    """A list of variables accessed by this FuncGraph.
+
+    Note that functions keep only weak references to variables. Calling the
+    function after a variable it accesses has been deleted is an error.
+
+    Yields:
+      Strong references to variables accessed by this FuncGraph.
+    """
+    for weak_v in self._weak_variables:
+      v = weak_v()
+      if v is None:
+        raise AssertionError(
+            "Called a function referencing variables which have been deleted. "
+            "This likely means that function-local variables were created and "
+            "not referenced elsewhere in the program. This is generally a "
+            "mistake; consider storing variables in an object attribute on "
+            "first call.")
+      yield v
+
+  @variables.setter
+  def variables(self, var_list):
+    self._weak_variables = [weakref.ref(v) for v in var_list]
+
+  def create_op(
+      self,
+      op_type,
+      inputs,
+      dtypes,
+      input_types=None,
+      name=None,
+      attrs=None,
+      op_def=None,
+      compute_shapes=True,
+      compute_device=True):
+    """Like Graph.create_op, except handles external input tensors.
+
+    This overload adds functionality to create_op to "capture" any external
+    input tensors, i.e. tensors from the eager context or outer function graphs
+    if this is a nested function. See `capture` for more information.
+
+    Args:
+      op_type: The `Operation` type to create. This corresponds to the
+        `OpDef.name` field for the proto that defines the operation.
+      inputs: A list of `Tensor` objects that will be inputs to the `Operation`.
+      dtypes: A list of `DType` objects that will be the types of the tensors
+        that the operation produces.
+      input_types: (Optional.) A list of `DType`s that will be the types of
+        the tensors that the operation consumes. By default, uses the base
+        `DType` of each input in `inputs`. Operations that expect
+        reference-typed inputs must specify `input_types` explicitly.
+      name: (Optional.) A string name for the operation. If not specified, a
+        name is generated based on `op_type`.
+      attrs: (Optional.) A dictionary where the key is the attribute name (a
+        string) and the value is the respective `attr` attribute of the
+        `NodeDef` proto that will represent the operation (an `AttrValue`
+        proto).
+      op_def: (Optional.) The `OpDef` proto that describes the `op_type` that
+        the operation will have.
+      compute_shapes: (Optional.) Deprecated. Has no effect (shapes are always
+        computed).
+      compute_device: (Optional.) If True, device functions will be executed
+        to compute the device property of the Operation.
+
+    Returns:
+      An `Operation` object.
+    """
+    # This capturing logic interacts poorly with control flow contexts which
+    # want to replace inputs of ops far too late in the process. This can lead
+    # the context to get confused and try to create an Enter for an Enter. We
+    # can detect this here and skip the additional Enter which can confuse loop
+    # validation logic.
+    if op_type == "Enter" and inputs[0].op.type == "Enter":
+      if inputs[0].op.get_attr("frame_name") == attrs["frame_name"].s:
+        return inputs[0].op
+    # Calling AddValue on the control flow contexts to force creation of the
+    # backward accumulators in the original graph before we create placeholders
+    # to capture the inputs.
+    ctxt = ops.get_default_graph()._control_flow_context  # pylint: disable=protected-access
+    for i, inp in enumerate(inputs):
+      # TPU Estimator defines a control flow context with no AddValue method.
+      if ctxt is not None and hasattr(ctxt, "AddValue"):
+        inp = ctxt.AddValue(inp)
+      inp = self.capture(inp)
+      inputs[i] = inp
+    return super(FuncGraph, self).create_op(
+        op_type, inputs, dtypes, input_types, name, attrs, op_def,
+        compute_device=compute_device)
+
+  def capture(self, tensor, name=None):
+    """Captures `tensor` if it's external to this graph.
+
+    If `tensor` is from a different graph, returns a placeholder for it.
+    `tensor` and the placeholder will appear in self.captures, and the
+    placeholder will appear in self.inputs.  Multiple calls to this method with
+    the same `tensor` argument will return the same placeholder. If `tensor` is
+    from this graph, returns `tensor`.
+
+    Args:
+      tensor: Tensor. May be from this FuncGraph or a different graph.
+      name: Optional name if a placeholder is created.
+
+    Returns:
+      Tensor from this FuncGraph.
+    """
+    if isinstance(tensor, ops.EagerTensor):
+      if name is None:
+        name = str(ops.uid())
+      return self._capture_helper(tensor, name)
+    if tensor.graph is not self:
+      if name is None:
+        name = tensor.op.name
+      return self._capture_helper(tensor, name)
+    return tensor
+
+  def _capture_helper(self, tensor, name):
+    captured_tensor = self.captures.get(tensor, None)
+    if captured_tensor is None:
+      captured_tensor = _create_substitute_placeholder(tensor, name=name,
+                                                       dtype=tensor.dtype)
+      self.captures[tensor] = captured_tensor
+      self.inputs.append(captured_tensor)
+    tape.record_operation("captured_value", [captured_tensor], [tensor],
+                          lambda x: [x])
+    return captured_tensor
+
+  @property
+  def external_captures(self):
+    """External tensors captured by this function."""
+    return list(self.captures.keys())
+
+  @property
+  def internal_captures(self):
+    """Placeholders in this function corresponding captured tensors."""
+    return list(self.captures.values())
+
+
+def func_graph_from_py_func(name,
+                            python_func,
+                            args,
+                            kwargs,
+                            signature=None,
+                            func_graph=None,
+                            autograph=False,
+                            add_control_dependencies=True,
+                            arg_names=None,
+                            op_return_value=None):
+  """Returns a `FuncGraph` generated from `python_func`.
+
+  Args:
+    name: an identifier for the function.
+    python_func: the Python function to trace.
+    args: the positional args with which the Python function should be called;
+      ignored if a signature is provided.
+    kwargs: the keyword args with which the Python function should be called;
+      ignored if a signature is provided.
+    signature: a possibly nested sequence of `TensorSpecs` specifying the shapes
+      and dtypes of the arguments. When a signature is provided, `args` and
+      `kwargs` are ignored, and `python_func` is traced with Tensors conforming
+      to `signature`. If `None`, the shapes and dtypes are inferred from the
+      inputs.
+    func_graph: Optional. An instance of FuncGraph. If provided, we will use
+      this graph else a new one is built and returned.
+    autograph: whether to use autograph to compile `python_func`.
+      See https://www.tensorflow.org/guide/autograph for more information.
+    add_control_dependencies: If True, automatically adds control dependencies
+      to ensure program order matches execution order and stateful ops always
+      execute.
+    arg_names: Optional list of argument names, used to give input placeholders
+      recognizable names.
+    op_return_value: Optional. A Tensor. If set and `python_func` returns
+      Operations, those return values will be replaced with this value. If not
+      set, returning an Operation triggers an error.
+
+  Returns:
+    A FuncGraph.
+
+  Raises:
+    TypeError: If any of `python_func`'s return values is neither `None` nor a
+      `Tensor`.
+  """
+  if op_return_value is not None:
+    assert isinstance(op_return_value, ops.Tensor), op_return_value
+  if func_graph is None:
+    func_graph = FuncGraph(name)
+  assert isinstance(func_graph, FuncGraph)
+  if add_control_dependencies:
+    control_manager = AutomaticControlDependencies
+  else:
+    control_manager = ops.NullContextmanager
+  with func_graph.as_default(), control_manager() as a:
+    current_scope = variable_scope.get_variable_scope()
+    default_use_recource = current_scope.use_resource
+    current_scope.set_use_resource(True)
+
+    if signature is not None:
+      args = signature
+      kwargs = {}
+
+    # Creates and names placeholders for all arguments.
+    func_args = _get_defun_inputs_from_args(args, arg_names)
+    func_kwargs = _get_defun_inputs_from_kwargs(kwargs)
+
+    # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
+    # Variables to help check whether mutation happens in calling the function
+    # Copy the recursive list, tuple and map structure, but not base objects
+    func_args_before = nest.pack_sequence_as(func_args, nest.flatten(func_args))
+    func_kwargs_before = nest.pack_sequence_as(
+        func_kwargs, nest.flatten(func_kwargs))
+
+    def convert(x):
+      """Converts a function output to a Tensor."""
+      if x is None:
+        return None
+      if op_return_value is not None and isinstance(x, ops.Operation):
+        # TODO(b/79881896): we currently can't capture external control deps, so
+        # this won't work if x needs to be captured (i.e. if python_func returns
+        # captured Operations).
+        with ops.control_dependencies([x]):
+          x = array_ops.identity(op_return_value)
+      elif not isinstance(x, tensor_array_ops.TensorArray):
+        try:
+          x = ops.convert_to_tensor_or_indexed_slices(x)
+        except (ValueError, TypeError):
+          raise TypeError(
+              "To be compatible with tf.contrib.eager.defun, Python functions "
+              "must return zero or more Tensors; in compilation of %s, found "
+              "return value of type %s, which is not a Tensor." %
+              (str(python_func), type(x)))
+      if add_control_dependencies:
+        x = a.mark_as_return(x)
+      return x
+
+    this_tape = tape.push_new_tape()
+    try:
+      if autograph:
+        from tensorflow.python import autograph  # pylint: disable=g-import-not-at-top
+        _, original_func = tf_decorator.unwrap(python_func)
+
+        def wrapper(*args, **kwargs):
+          return autograph.converted_call(
+              original_func, None,
+              autograph.ConversionOptions(
+                  verbose=autograph.Verbosity.BRIEF,
+                  recursive=True,
+                  strip_decorators=(def_function.function,),
+                  optional_features=(),
+              ), *args, **kwargs)
+
+        # Wrapping around a decorator allows checks like tf_inspect.getargspec
+        # to be accurate.
+        converted_func = tf_decorator.make_decorator(original_func, wrapper)
+        tf_decorator.rewrap(python_func, original_func, converted_func)
+
+      func_outputs = python_func(*func_args, **func_kwargs)
+
+      # invariant: `func_outputs` contains only Tensors, IndexedSlices,
+      # SparseTensors, TensorArrays and `None`s.
+      func_outputs = nest.map_structure(convert, func_outputs)
+
+      check_mutation(func_args_before, func_args)
+      check_mutation(func_kwargs_before, func_kwargs)
+    finally:
+      tape.pop_tape(this_tape)
+      current_scope.set_use_resource(default_use_recource)
+
+    # Variables in `func_args`, `func_kwargs` should be explicit inputs
+    # to the function, not captured inputs.
+    tape_variables = this_tape.watched_variables()
+    arg_variables = set()
+    inputs = []
+    for arg in nest.flatten(func_args) + nest.flatten(func_kwargs):
+      if isinstance(arg, resource_variable_ops.ResourceVariable):
+        # Even if an argument variable was not used in the function, we've
+        # already manually captured the resource Tensor when creating argument
+        # placeholders.
+        resource_placeholder = func_graph.captures.pop(arg.handle)
+        arg_variables.add(arg)
+        inputs.append(resource_placeholder)
+      elif isinstance(arg, ops.Tensor):
+        inputs.append(arg)
+    variables = [v for v in tape_variables if v not in arg_variables]
+    func_graph.inputs = inputs + list(func_graph.captures.values())
+
+    func_graph.structured_outputs = func_outputs
+    # Returning a closed-over tensor does not trigger convert_to_tensor.
+    func_graph.outputs.extend(
+        func_graph.capture(x)
+        for x in flatten(func_graph.structured_outputs)
+        if x is not None)
+
+    func_graph.variables = variables
+
+  # Register any other functions defined in the graph.
+  with ops.init_scope():
+    if context.executing_eagerly():
+      for f in func_graph._functions.values():  # pylint: disable=protected-access
+        # TODO(ashankar): What about the gradient registry?
+        context.add_function(f._c_func.func)  # pylint: disable=protected-access
+
+  return func_graph
+
+
+def maybe_captured(tensor):
+  """If t is a captured value placeholder, returns the original captured value.
+
+  Args:
+    tensor: Tensor.
+
+  Returns:
+    A tensor, potentially from a different Graph/FuncGraph.
+  """
+  if (not isinstance(tensor, ops.EagerTensor) and
+      tensor.op.graph.building_function and tensor.op.type == "Placeholder"):
+    for input_t, placeholder_t in tensor.op.graph.captures.items():
+      if tensor == placeholder_t:
+        return maybe_captured(input_t)
+  # pylint: enable=protected-access
+  return tensor
+
+
+def device_stack_has_callable(device_stack):
+  """Checks whether a device stack contains a callable."""
+  return any(callable(spec._device_name_or_function)  # pylint: disable=protected-access
+             for spec in device_stack.peek_objs())
+
+
+def check_mutation(n1, n2):
+  """Check if two list of arguments are exactly the same."""
+  errmsg = ("Function to be traced should not modify structure of input "
+            "arguments. Check if your function has list and dictionary "
+            "operations that alter input arguments, "
+            "such as `list.pop`, `list.append`")
+  try:
+    nest.assert_same_structure(n1, n2)
+  except ValueError:
+    raise ValueError(errmsg)
+
+  for arg1, arg2 in zip(nest.flatten(n1), nest.flatten(n2)):
+    if arg1 is not arg2:
+      raise ValueError(errmsg)
+
+
+def flatten(sequence):
+  """Like `nest.flatten` but also unpacks other Tensor-like objects.
+
+  Flattens non-tensor objects into their constituent tensors.
+
+  Args:
+    sequence: A nested structure of Tensors, IndexedSlices, SparseTensors and
+      TensorArrays.
+
+  Returns:
+    A list of tensors.
+  """
+  # TODO(akshayka): Support `SparseTensor` in a similar fashion.
+  flat_sequence = nest.flatten(sequence)
+  outputs = []
+  for item in flat_sequence:
+    if isinstance(item, ops.IndexedSlices):
+      if item.dense_shape is not None:
+        outputs.extend([item.values, item.indices, item.dense_shape])
+      else:
+        outputs.extend([item.values, item.indices])
+    elif isinstance(item, sparse_tensor.SparseTensor):
+      outputs.extend([item.indices, item.values, item.dense_shape])
+    elif isinstance(item, tensor_array_ops.TensorArray):
+      outputs.append(item.flow)
+    else:
+      outputs.append(item)
+  return outputs
+
+
+def pack_sequence_as(structure, flat_sequence):
+  """Like `nest.pack_sequence_as` but also packs other Tensor-like objects.
+
+  Args:
+    structure: The structure to pack into. May contain Tensors, IndexedSlices,
+      TensorArrays or SparseTensors.
+    flat_sequence: An iterable containing tensors.
+
+  Returns:
+    A nested structure.
+
+  Raises:
+    AssertionError if `structure` and `flat_sequence` are not compatible.
+  """
+  flattened_structure = nest.flatten(structure)
+  flat_sequence_with_slices_and_tas = []
+  index = 0
+  for t in flattened_structure:
+    if isinstance(t, ops.IndexedSlices):
+      if t.dense_shape is not None:
+        flat_sequence_with_slices_and_tas.append(
+            ops.IndexedSlices(*flat_sequence[index:index + 3]))
+        index += 3
+      else:
+        flat_sequence_with_slices_and_tas.append(
+            ops.IndexedSlices(*flat_sequence[index:index + 2]))
+        index += 2
+    elif isinstance(t, sparse_tensor.SparseTensor):
+      flat_sequence_with_slices_and_tas.append(
+          sparse_tensor.SparseTensor(*flat_sequence[index:index + 3]))
+      index += 3
+    elif isinstance(t, tensor_array_ops.TensorArray):
+      flow = flat_sequence[index]
+      ta = tensor_array_ops.build_ta_with_new_flow(t, flow)
+      flat_sequence_with_slices_and_tas.append(ta)
+      index += 1
+    else:
+      flat_sequence_with_slices_and_tas.append(flat_sequence[index])
+      index += 1
+  assert len(flattened_structure) == len(flat_sequence_with_slices_and_tas)
+  return nest.pack_sequence_as(structure, flat_sequence_with_slices_and_tas)
+
+
+def _create_substitute_placeholder(value, name=None, dtype=None):
+  """Creates a placeholder for `value` and propagates shape info to it."""
+  # Note: setting ops.control_dependencies(None) ensures we always put
+  # capturing placeholders outside of any control flow context.
+  with ops.control_dependencies(None):
+    placeholder = graph_placeholder(
+        dtype=dtype or value.dtype, shape=value.shape, name=name)
+  custom_gradient.copy_handle_data(value, placeholder)
+  return placeholder
+
+
+def _get_defun_inputs_from_args(args, names):
+  """Maps Python function positional args to graph-construction inputs."""
+  return _get_defun_inputs(args, names, structure=args)
+
+
+def _get_defun_inputs(flat_args, names, structure):
+  """Maps python function args to graph-construction inputs.
+
+  Args:
+    flat_args: A flat list of user-specified arguments.
+    names: A list of strings with user-specified argument names, same length as
+      `flat_args`. May be `None`, in which case a generic name is used.
+    structure: The original argument list or dictionary.
+
+  Returns:
+    Placeholders with the same structure as `structure`.
+  """
+  func_graph = ops.get_default_graph()
+  function_inputs = []
+  if names is None:
+    names = [None] * len(flat_args)
+  for arg_value, name in zip(flat_args, names):
+    for arg in nest.flatten(arg_value):
+      if isinstance(arg, (ops.Tensor, tensor_spec.TensorSpec)):
+        if isinstance(arg, tensor_spec.TensorSpec) and arg.name:
+          requested_name = arg.name
+        else:
+          requested_name = name
+        placeholder = graph_placeholder(
+            arg.dtype, arg.shape,
+            name=requested_name)
+        if name is not None:
+          # Record the requested/user-specified name in case it's different than
+          # the uniquified name, for validation when exporting signatures.
+          placeholder.op._set_attr(  # pylint: disable=protected-access
+              "_user_specified_name",
+              attr_value_pb2.AttrValue(s=compat.as_bytes(requested_name)))
+        function_inputs.append(placeholder)
+      elif isinstance(arg, resource_variable_ops.ResourceVariable):
+        # Capture arg variables to create placeholders for them. These will be
+        # removed as captures after the function is traced (since otherwise we'd
+        # just add it back with a new placeholder when the variable was
+        # referenced).
+        placeholder = func_graph.capture(arg.handle, name=name)
+        placeholder.op._set_attr(  # pylint: disable=protected-access
+            "_user_specified_name",
+            attr_value_pb2.AttrValue(s=compat.as_bytes(name)))
+        function_inputs.append(arg)
+      else:
+        function_inputs.append(arg)
+  return nest.pack_sequence_as(structure, function_inputs)
+
+
+def _get_defun_inputs_from_kwargs(kwargs):
+  """Maps Python function keyword args to graph-construction inputs."""
+  if kwargs:
+    names, flat_args = zip(*sorted(kwargs.items()))
+  else:
+    names = []
+    flat_args = []
+  return _get_defun_inputs(flat_args, names, structure=kwargs)
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 225208944ed2dfa516b8bfb0b69ad6c5f0c897a6..cfdc915a1b34930b8f5205550c547d0eec331e52 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -209,6 +209,7 @@ class _DefinedFunction(object):
                out_names=None,
                shape_func=None,
                capture_by_value=False,
+               whitelisted_stateful_ops=None,
                **kwargs):
     """Creates _DefinedFunction.
 
@@ -229,6 +230,8 @@ class _DefinedFunction(object):
         output shapes.
       capture_by_value: Boolean (defaults to False). If True, captured values
         will be copied into the function body.
+      whitelisted_stateful_ops: A set of ops that if stateful we ignore and
+        copy into the function body, when `capture_by_value` is True.
       **kwargs: The keyword arguments. **kwargs is passed to every call
         site of this function.
 
@@ -244,6 +247,9 @@ class _DefinedFunction(object):
     self._out_names = out_names
     self._shape_func = shape_func
     self._capture_by_value = capture_by_value
+    self._whitelisted_stateful_ops = whitelisted_stateful_ops
+    if self._whitelisted_stateful_ops is None:
+      self._whitelisted_stateful_ops = set()
     self._extra_kwargs = kwargs
     # Constructed only when C API is disabled, lazily
     self._definition = None
@@ -340,8 +346,13 @@ class _DefinedFunction(object):
       return
 
     temp_graph = func_graph_from_py_func(
-        self._func, self._arg_names, self._arg_types, self._func_name,
-        self._capture_by_value, self._caller_device)
+        self._func,
+        self._arg_names,
+        self._arg_types,
+        self._func_name,
+        self._capture_by_value,
+        self._caller_device,
+        whitelisted_stateful_ops=self._whitelisted_stateful_ops)
 
     self._extra_inputs = temp_graph.extra_inputs
     # pylint: disable=protected-access
@@ -625,9 +636,11 @@ class _FuncGraph(ops.Graph):
   function argument and the caller passes in the captured tensor.
   """
 
-  def __init__(self, name, capture_by_value, *args, **kwargs):
+  def __init__(self, name, capture_by_value, whitelisted_stateful_ops, *args,
+               **kwargs):
     super(_FuncGraph, self).__init__(*args, **kwargs)
     self._capture_by_value = capture_by_value
+    self._whitelisted_stateful_ops = whitelisted_stateful_ops
     self._building_function = True
     self._outer_graph = ops.get_default_graph()
     self._vscope = vs.get_variable_scope()
@@ -756,20 +769,17 @@ class _FuncGraph(ops.Graph):
       ph = array_ops.placeholder(
           tensor.dtype, shape=tensor.get_shape(), name=name)
     # pylint: disable=protected-access
-    if ops._USE_C_SHAPES:
-      if isinstance(tensor, ops.EagerTensor):
-        handle_data = tensor._handle_data
-        if handle_data:
-          handle_data = handle_data.SerializeToString()
-      else:
-        handle_data = c_api.GetHandleShapeAndType(tensor.graph._c_graph,
-                                                  tensor._as_tf_output())
-
+    if isinstance(tensor, ops.EagerTensor):
+      handle_data = tensor._handle_data
       if handle_data:
-        c_api.SetHandleShapeAndType(ph.graph._c_graph, ph._as_tf_output(),
-                                    compat.as_bytes(handle_data))
+        handle_data = handle_data.SerializeToString()
     else:
-      ph._handle_data = tensor._handle_data
+      handle_data = c_api.GetHandleShapeAndType(tensor.graph._c_graph,
+                                                tensor._as_tf_output())
+
+    if handle_data:
+      c_api.SetHandleShapeAndType(ph.graph._c_graph, ph._as_tf_output(),
+                                  compat.as_bytes(handle_data))
     # pylint: enable=protected-access
     self.inputs.append(ph)
     self._captured[tensor] = ph
@@ -788,7 +798,7 @@ class _FuncGraph(ops.Graph):
     # pylint: disable=protected-access
     op_def = graph_to_function_def._get_op_def(op)
     # pylint: enable=protected-access
-    if op_def.is_stateful:
+    if op_def.is_stateful and op not in self._whitelisted_stateful_ops:
       raise ValueError("Cannot capture a stateful node (name:%s, type:%s) "
                        "by value." % (op.name, op.type))
     elif op.type in ("Placeholder", "PlaceholderV2"):
@@ -810,10 +820,17 @@ class _FuncGraph(ops.Graph):
     return captured_op
 
 
-def func_graph_from_py_func(func, arg_names, arg_types, name=None,
-                            capture_by_value=False, device=None,
-                            colocation_stack=None, container=None,
-                            collections_ref=None, arg_shapes=None):
+def func_graph_from_py_func(func,
+                            arg_names,
+                            arg_types,
+                            name=None,
+                            capture_by_value=False,
+                            device=None,
+                            colocation_stack=None,
+                            container=None,
+                            collections_ref=None,
+                            arg_shapes=None,
+                            whitelisted_stateful_ops=None):
   """Returns a _FuncGraph generated from `func`.
 
   Args:
@@ -831,6 +848,8 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
     collections_ref: A reference to a collections dict the _FuncGraph should
       use internally.
     arg_shapes: A sequence of the function's argument shapes.
+    whitelisted_stateful_ops: A set of ops that if stateful we ignore and
+      re-create.
 
   Returns:
     A _FuncGraph.
@@ -840,7 +859,7 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
   """
   if not name:
     name = function_utils.get_func_name(func)
-  func_graph = _FuncGraph(name, capture_by_value)
+  func_graph = _FuncGraph(name, capture_by_value, whitelisted_stateful_ops)
 
   with func_graph.as_default(), ops.device(device):
     # pylint: disable=protected-access
@@ -877,8 +896,8 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
       # If func only returned one value, make it a tuple.
       if not isinstance(outputs, (list, tuple)):
         outputs = (outputs,)
-      if any([_ is None for _ in outputs]):
-        raise ValueError("Function can not return None.")
+      if any(_ is None for _ in outputs):
+        raise ValueError("Function %s can not return None." % name)
     # Ensures each output is a Tensor in the function graph.
     outputs = [ops.convert_to_tensor(t) for t in outputs]
     outputs = [func_graph.capture(t) if t.graph is not func_graph else t
@@ -1193,7 +1212,7 @@ def get_extra_args():
 
 
 def _type_list_to_str(types):
-  if any([_ not in _DTYPE_TO_STR for _ in types]):
+  if any(_ not in _DTYPE_TO_STR for _ in types):
     raise ValueError("Unsupported dtypes: %s" % types)
   return "".join([_DTYPE_TO_STR[_] for _ in types])
 
diff --git a/tensorflow/python/framework/function_def_to_graph.py b/tensorflow/python/framework/function_def_to_graph.py
index a04fa369ae507722622ca73b560287c881df2cb9..4d1aabde06984ded2a6e04d549538bc0afdbdc75 100644
--- a/tensorflow/python/framework/function_def_to_graph.py
+++ b/tensorflow/python/framework/function_def_to_graph.py
@@ -18,23 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
-
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.framework import versions_pb2
-from tensorflow.python.eager import function
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import versions
-from tensorflow.python.ops import cond_v2_impl
-
-# This is to avoid a circular dependency with cond_v2_impl.
-cond_v2_impl._function_def_to_graph = sys.modules[__name__]  # pylint: disable=protected-access
+from tensorflow.python.framework.func_graph import FuncGraph
 
 
 def function_def_to_graph(fdef, input_shapes=None):
-  """Converts a FunctionDef to a function.FuncGraph (sub-class Graph).
+  """Converts a FunctionDef to a FuncGraph (sub-class Graph).
 
   The returned FuncGraph's `name`, `inputs` and `outputs` fields will be set.
   The input tensors are represented as placeholders.
@@ -52,7 +46,7 @@ def function_def_to_graph(fdef, input_shapes=None):
   Returns:
     A FuncGraph.
   """
-  func_graph = function.FuncGraph(fdef.signature.name)
+  func_graph = FuncGraph(fdef.signature.name)
   graph_def, nested_to_flat_tensor_name = function_def_to_graph_def(
       fdef, input_shapes)
 
diff --git a/tensorflow/python/framework/function_def_to_graph_test.py b/tensorflow/python/framework/function_def_to_graph_test.py
index e013fb6e4dad1a014a90d3c9ccb9f611b4f7cebf..ddf1a6e74d2f7772c94dc5b39034a28ba0d715b2 100644
--- a/tensorflow/python/framework/function_def_to_graph_test.py
+++ b/tensorflow/python/framework/function_def_to_graph_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import test_ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -52,6 +53,7 @@ class FunctionDefToGraphTest(test.TestCase):
     fdef.signature.name = "_whats_in_a_name"
     return fdef
 
+  @test_util.run_deprecated_v1
   def testInputsAndOutputs(self):
     fdef = self._build_function_def()
     g = function_def_to_graph.function_def_to_graph(fdef)
@@ -186,6 +188,7 @@ class FunctionDefToGraphDefTest(test.TestCase):
     self.assertEqual(g.node[0].attr["shape"].shape.unknown_rank, False)
     self.assertFalse("shape" in g.node[2].attr)
 
+  @test_util.run_deprecated_v1
   def testFunctionCallsFromFunction(self):
     x = constant_op.constant(5.0)
     y = constant_op.constant(10.0)
@@ -238,7 +241,7 @@ class FunctionDefToGraphDefTest(test.TestCase):
     op = func_graph.get_operation_by_name("y")
     self.assertEqual(len(op.control_inputs), 2)
     self.assertEqual(op.control_inputs[0].name, "x")
-    self.assertEqual(op.control_inputs[1].name, "placeholder")
+    self.assertEqual(op.control_inputs[1].name, "inp")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 87f567db0eb59c4ac674e6dae1c73cfcb4de25b9..6ec71ba8e9053000629ce0cd0e020494adabfe2d 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -86,7 +86,6 @@ def _OptimizerOptions():
         yield cfg
 
 
-@test_util.with_c_shapes
 class FunctionTest(test.TestCase):
   """Test methods for verifying Function support.
 
@@ -104,8 +103,9 @@ class FunctionTest(test.TestCase):
       call = MyIdentityFunc([18.0])
       self.assertEqual("MyIdentity", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([18.0], sess.run(call))
+        self.assertAllEqual([18.0], self.evaluate(call))
 
+  @test_util.run_deprecated_v1
   def testIdentityImplicitDeref(self):
 
     @function.Defun(dtypes.float32, func_name="MyIdentity")
@@ -118,8 +118,8 @@ class FunctionTest(test.TestCase):
       self.assertEqual("MyIdentity", call.op.name)
       for cfg in _OptimizerOptions():
         with session.Session(config=cfg) as sess:
-          sess.run(var.initializer)
-          self.assertAllEqual([18.0], sess.run(call))
+          self.evaluate(var.initializer)
+          self.assertAllEqual([18.0], self.evaluate(call))
 
   def testIdentityOutputName(self):
 
@@ -132,7 +132,7 @@ class FunctionTest(test.TestCase):
       call = MyIdentityFunc([18.0])
       self.assertEqual("MyIdentity", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([18.0], sess.run(call))
+        self.assertAllEqual([18.0], self.evaluate(call))
 
   def testTooManyOutputNames(self):
 
@@ -160,7 +160,7 @@ class FunctionTest(test.TestCase):
       call = APlus2B([1.0], [2.0])
       self.assertEqual("APlus2B", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([5.0], sess.run(call))
+        self.assertAllEqual([5.0], self.evaluate(call))
 
   def testFunctionWithNoOutput(self):
 
@@ -189,7 +189,7 @@ class FunctionTest(test.TestCase):
       call = APlus2B([1.0], [2.0])
       self.assertEqual("APlus2B", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([5.0], sess.run(call))
+        self.assertAllEqual([5.0], self.evaluate(call))
 
   def testDefineFunctionDuplicateOutputs(self):
 
@@ -226,8 +226,8 @@ class FunctionTest(test.TestCase):
       call_g = XSquarePlusOneGrad([2.0], [0.1])
 
       with session.Session() as sess:
-        self.assertAllClose([5.0], sess.run(call_f))
-        self.assertAllClose([0.4], sess.run(call_g))
+        self.assertAllClose([5.0], self.evaluate(call_f))
+        self.assertAllClose([0.4], self.evaluate(call_g))
 
   def testTanhSymGrad(self):
 
@@ -324,6 +324,7 @@ class FunctionTest(test.TestCase):
       self.assertEqual(x.get_shape(), dx.get_shape())
       self.assertEqual(y.get_shape(), dy.get_shape())
 
+  @test_util.run_deprecated_v1
   def testSymGradAttr(self):
 
     @function.Defun(noinline=True)
@@ -367,7 +368,7 @@ class FunctionTest(test.TestCase):
       else:
         dx, dy = gradients_impl.gradients([z], [x, y])
       with session.Session() as sess:
-        dx_val, dy_val = sess.run([dx, dy])
+        dx_val, dy_val = self.evaluate([dx, dy])
         self.assertEqual([2.0], dx_val)
         self.assertEqual([0.0], dy_val)
 
@@ -389,7 +390,7 @@ class FunctionTest(test.TestCase):
       call = AConstant()
       self.assertEqual("AConstant", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([42], sess.run(call))
+        self.assertAllEqual([42], self.evaluate(call))
 
   def testDefineFunctionNames(self):
 
@@ -440,6 +441,7 @@ class FunctionTest(test.TestCase):
                                    "assertion failed.*-3"):
         self.assertAllEqual(Foo(constant_op.constant(-3.0)).eval(), 6.0)
 
+  @test_util.run_deprecated_v1
   def testAssertWrapper(self):
 
     @function.Defun(dtypes.float32)
@@ -454,8 +456,9 @@ class FunctionTest(test.TestCase):
                                    "assertion"):
         _ = MyFn(100.0).eval()
 
+  @test_util.run_deprecated_v1
   def testWhileLoopCallsFunc(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
 
       @function.Defun(dtypes.float32)
       def Times2(x):
@@ -470,9 +473,10 @@ class FunctionTest(test.TestCase):
 
       loop = control_flow_ops.while_loop(lambda x: x < 1e5, Body, [1.0])
 
-      ans = sess.run(loop)
+      ans = self.evaluate(loop)
       self.assertAllClose(ans, 131072.)
 
+  @test_util.run_deprecated_v1
   def testControlFlowStrictness(self):
     """Inlined functions must not execute in a untaken control flow branch."""
 
@@ -519,6 +523,7 @@ class FunctionTest(test.TestCase):
                                    "assertion"):
         sess.run(loop, {pred: True, x: 3})
 
+  @test_util.run_deprecated_v1
   def testVar(self):
 
     @function.Defun(dtypes.float32)
@@ -534,6 +539,7 @@ class FunctionTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertAllEqual(z.eval(), 101.)
 
+  @test_util.run_deprecated_v1
   def testResourceVarAsImplicitInput(self):
     g = ops.Graph()
     with g.as_default(), ops.device("cpu:0"):
@@ -554,8 +560,8 @@ class FunctionTest(test.TestCase):
 
     with self.session(graph=g):
       v.initializer.run()
-      self.assertAllEqual(expected_val.eval(), actual_val.eval())
-      self.assertAllEqual(expected_shape, actual_shape.eval())
+      self.assertAllEqual(expected_val.eval(), self.evaluate(actual_val))
+      self.assertAllEqual(expected_shape, self.evaluate(actual_shape))
 
   def testDefineErrors(self):
     with ops.Graph().as_default():
@@ -652,8 +658,8 @@ class FunctionTest(test.TestCase):
       # pylint: enable=unexpected-keyword-arg
       self.assertEqual("next", call2.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([1], sess.run(call1))
-        self.assertAllEqual([0], sess.run(call2))
+        self.assertAllEqual([1], self.evaluate(call1))
+        self.assertAllEqual([0], self.evaluate(call2))
 
   def testNestedFunction(self):
 
@@ -709,6 +715,7 @@ class FunctionTest(test.TestCase):
     gdef = g.as_graph_def()
     self.assertEqual(0, len(gdef.library.function))
 
+  @test_util.run_deprecated_v1
   def testReduction(self):
     g = ops.Graph()
 
@@ -737,6 +744,7 @@ class FunctionTest(test.TestCase):
       self.assertAllClose(vals[0], vals[1])
       self.assertAllClose(vals[2], vals[3])
 
+  @test_util.run_deprecated_v1
   def testCapture(self):
     g = ops.Graph()
     with g.as_default():
@@ -783,6 +791,7 @@ class FunctionTest(test.TestCase):
         # NOTE: We still do not support capturing control deps.
         _ = Foo(x)
 
+  @test_util.run_deprecated_v1
   def testCaptureInWhileLoop(self):
     g = ops.Graph()
     with g.as_default():
@@ -796,8 +805,9 @@ class FunctionTest(test.TestCase):
       y = Foo()
 
     with self.session(graph=g) as sess:
-      self.assertEqual(sess.run(y), 10)
+      self.assertEqual(self.evaluate(y), 10)
 
+  @test_util.run_deprecated_v1
   def testCaptureInCond(self):
     g = ops.Graph()
     with g.as_default():
@@ -811,8 +821,8 @@ class FunctionTest(test.TestCase):
       z = Foo(False)
 
     with self.session(graph=g) as sess:
-      self.assertEqual(sess.run(y), 1)
-      self.assertEqual(sess.run(z), 2)
+      self.assertEqual(self.evaluate(y), 1)
+      self.assertEqual(self.evaluate(z), 2)
 
   def testStableName(self):
 
@@ -827,6 +837,7 @@ class FunctionTest(test.TestCase):
       self.assertEqual("Foo_aCYSbwBkR5A",
                        Foo.instantiate([dtypes.float32] * 3).name)
 
+  @test_util.run_deprecated_v1
   def testSignatureHash(self):
     # Foo.Inner and Bar.Inner have identical function body but have
     # different signatures. They should be treated as two different functions.
@@ -856,7 +867,7 @@ class FunctionTest(test.TestCase):
       z = Bar(x)
 
     with self.session(graph=g) as sess:
-      v0, v1 = sess.run([y, z])
+      v0, v1 = self.evaluate([y, z])
       self.assertAllEqual(v0, 20.)
       self.assertAllEqual(v1, 20.)
 
@@ -879,6 +890,7 @@ class FunctionTest(test.TestCase):
       y = Bar(array_ops.zeros([1, 2, 3]))
       self.assertAllEqual(y.get_shape().as_list(), [1, 1, 2, 3])
 
+  @test_util.run_deprecated_v1
   def testVariableReuse(self):
 
     def LinearWithReuse(input_tensor, reuse=None):
@@ -902,11 +914,12 @@ class FunctionTest(test.TestCase):
     self.assertEqual(global_vars[0].name, "linear/w:0")
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       output_val = sess.run(
           output_op, feed_dict={input_op: np.random.rand(32, 100)})
       self.assertEqual(output_val.shape, (32, 100))
 
+  @test_util.run_deprecated_v1
   def testFunctionCallInDifferentVariableScopes(self):
 
     @function.Defun(dtypes.float32)
@@ -930,7 +943,7 @@ class FunctionTest(test.TestCase):
     self.assertEqual(global_vars[0].name, "vs1/var:0")
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       out1, out2 = sess.run(
           [out1_op, out2_op], feed_dict={input_op: np.linspace(1, 10, 10)})
       self.assertAllEqual(out1, np.linspace(2, 11, 10))
@@ -970,6 +983,7 @@ class FunctionTest(test.TestCase):
       self.assertAllClose(
           np.array([1.0, 0.0]).astype(np.float32), sess.run(dinp, {inp: x}))
 
+  @test_util.run_deprecated_v1
   def testFunctionMarkedStateful(self):
 
     @function.Defun(dtypes.int32, dtypes.float32)
@@ -993,10 +1007,11 @@ class FunctionTest(test.TestCase):
     result_2 = Bar(constant_op.constant(100, dtype=dtypes.int64))
 
     with session.Session() as sess:
-      self.assertEqual(4.0, sess.run(result_1))
-      self.assertEqual(100, sess.run(result_2))
+      self.assertEqual(4.0, self.evaluate(result_1))
+      self.assertEqual(100, self.evaluate(result_2))
       self.assertEqual((4.0, 100), sess.run((result_1, result_2)))
 
+  @test_util.run_deprecated_v1
   def testStatefulFunction(self):
 
     @function.Defun()
@@ -1039,6 +1054,29 @@ class FunctionTest(test.TestCase):
         self.assertFalse(all(val3 == val1))
         self.assertFalse(all(val4 == val2))
 
+  def testStatefulFunctionWithWhitelisting(self):
+    t = random_ops.random_uniform([100], maxval=10, dtype=dtypes.int32)
+
+    @function.Defun(capture_by_value=True)
+    def StatefulFn():
+      return t + constant_op.constant(3, dtype=dtypes.int32)
+
+    # First time we try to capture a stateful RandomUniform op.
+    with self.assertRaisesRegexp(ValueError, "Cannot capture a stateful node"):
+      res = StatefulFn()
+
+    # This time we whitelist this op, so that its recreated.
+    @function.Defun(capture_by_value=True, whitelisted_stateful_ops=set([t.op]))
+    def StatefulFn2():
+      return t + constant_op.constant(3, dtype=dtypes.int32)
+
+    res = StatefulFn2()
+    with session.Session() as sess:
+      r = sess.run(res)
+      for i in r:
+        self.assertGreaterEqual(i, 3)
+
+  @test_util.run_deprecated_v1
   def testSameFunctionOnTwoDevices(self):
 
     @function.Defun(dtypes.float32)
@@ -1054,10 +1092,11 @@ class FunctionTest(test.TestCase):
     for config in _OptimizerOptions():
       config.device_count["CPU"] = 2
       with session.Session(config=config) as sess:
-        self.assertEqual(42.0, sess.run(f_0))
-        self.assertEqual(44.0, sess.run(f_1))
+        self.assertEqual(42.0, self.evaluate(f_0))
+        self.assertEqual(44.0, self.evaluate(f_1))
         self.assertEqual((42.0, 44.0), sess.run((f_0, f_1)))
 
+  @test_util.run_deprecated_v1
   def testGuaranteedConstsAreCaptured(self):
     var = variables.Variable(1.0)
     const = array_ops.guarantee_const(var)
@@ -1077,10 +1116,11 @@ class FunctionTest(test.TestCase):
       self.assertNotEqual("GuaranteeConst", fifth.consumers()[0].node_def.op)
       return output
 
-    with self.test_session(use_gpu=False) as sess:
-      sess.run(var.initializer)
+    with self.session(use_gpu=False) as sess:
+      self.evaluate(var.initializer)
       _ = sess.run(CapturesGuaranteedConst(), {also_not_const: 1.0})
 
+  @test_util.run_deprecated_v1
   def testSameFunctionDifferentGrads(self):
 
     def PartOne(x):
@@ -1129,14 +1169,13 @@ class FunctionTest(test.TestCase):
       dx2, = gradients_impl.gradients(ys=[y2], xs=[x2])
 
     with self.session(graph=g) as sess:
-      v0, v1, v2 = sess.run([dx0, dx1, dx2])
+      v0, v1, v2 = self.evaluate([dx0, dx1, dx2])
 
     self.assertAllEqual(v0, 2.)
     self.assertAllEqual(v1, 101.)
     self.assertAllEqual(v2, 50.)
 
 
-@test_util.with_c_shapes
 class FunctionsFromProtos(test.TestCase):
 
   def expectFunctionsEqual(self, func, grad_func=None, new_func=None):
@@ -1153,6 +1192,7 @@ class FunctionsFromProtos(test.TestCase):
     self.assertEqual(func.declared_input_types, new_func.declared_input_types)
     self.assertEqual(func.captured_inputs, new_func.captured_inputs)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
 
     @function.Defun(dtypes.float32, dtypes.float32)
@@ -1360,9 +1400,9 @@ class FunctionsFromProtos(test.TestCase):
                      True)
 
 
-@test_util.with_c_shapes
 class FunctionOverloadTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
 
     @function.Defun()
@@ -1413,9 +1453,9 @@ class FunctionOverloadTest(test.TestCase):
                      "Successor of x.")
 
 
-@test_util.with_c_shapes
 class FunctionCaptureByValueTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCaptureByValue(self):
     g = ops.Graph()
     with g.as_default():
@@ -1443,7 +1483,6 @@ class FunctionCaptureByValueTest(test.TestCase):
       self.assertAllEqual(y.eval(), [[12.0]])
 
 
-@test_util.with_c_shapes
 class UnrollLSTMTest(test.TestCase):
   BATCH_SIZE = 16
   LSTM_DIMS = 32
@@ -1538,7 +1577,7 @@ class UnrollLSTMTest(test.TestCase):
       tf_logging.info("time: %f txt size: %d gdef bin size: %d", finish - start,
                       len(str(gdef)), len(gdef.SerializeToString()))
       with g.as_default(), session.Session(config=cfg) as sess:
-        return sess.run(m)
+        return self.evaluate(m)
 
     mv0 = RunForward("complete")
     for cfg in _OptimizerOptions():
@@ -1567,7 +1606,7 @@ class UnrollLSTMTest(test.TestCase):
       tf_logging.info("time: %f txt size: %d gdef bin size: %d", finish - start,
                       len(str(gdef)), len(gdef.SerializeToString()))
       with g.as_default(), session.Session(config=cfg) as sess:
-        return sess.run(dw)
+        return self.evaluate(dw)
 
     d0 = RunForwardBackward("complete")
     for cfg in _OptimizerOptions():
@@ -1580,7 +1619,6 @@ class UnrollLSTMTest(test.TestCase):
       self.assertAllClose(d0, d3, rtol=1e-4, atol=1e-4)
 
 
-@test_util.with_c_shapes
 class FunctionInlineControlTest(test.TestCase):
 
   def testFoo(self):
@@ -1639,29 +1677,19 @@ class FunctionInlineControlTest(test.TestCase):
       self.assertEqual(MetadataHasCell(run_metadata), noinline)
 
 
-@function.Defun(*[dtypes.float32] * 3)
-def Linear(w, b, x):
-  return nn_ops.relu(math_ops.matmul(x, w) + b)
-
-
-@function.Defun(*[dtypes.float32] * 5)
-def Linear2(w1, b1, w2, b2, x):
-  return Linear(w2, b2, Linear(w1, b1, x))
-
-
-@function.Defun(*[dtypes.float32] * 3)
-def LinearWithCApi(w, b, x):
-  return nn_ops.relu(math_ops.matmul(x, w) + b)
-
+class ModuleFunctionTest(test.TestCase):
 
-@function.Defun(*[dtypes.float32] * 5)
-def Linear2WithCApi(w1, b1, w2, b2, x):
-  return LinearWithCApi(w2, b2, LinearWithCApi(w1, b1, x))
+  @test_util.run_deprecated_v1
+  def testBasic(self):
 
+    @function.Defun(*[dtypes.float32] * 3)
+    def LinearWithCApi(w, b, x):
+      return nn_ops.relu(math_ops.matmul(x, w) + b)
 
-class ModuleFunctionTest(test.TestCase):
+    @function.Defun(*[dtypes.float32] * 5)
+    def Linear2WithCApi(w1, b1, w2, b2, x):
+      return LinearWithCApi(w2, b2, LinearWithCApi(w1, b1, x))
 
-  def testBasic(self):
     with ops.Graph().as_default():
       a, b, c, d, e = [
           constant_op.constant([[_]], dtype=dtypes.float32) for _ in range(5)
@@ -1669,11 +1697,10 @@ class ModuleFunctionTest(test.TestCase):
       y = LinearWithCApi(a, b, c)
       z = Linear2WithCApi(a, b, c, d, e)
       with session.Session() as sess:
-        self.assertAllEqual([[1]], sess.run(y))
-        self.assertAllEqual([[5]], sess.run(z))
+        self.assertAllEqual([[1]], self.evaluate(y))
+        self.assertAllEqual([[5]], self.evaluate(z))
 
 
-@test_util.with_c_shapes
 class VariableHoistingTest(test.TestCase):
 
   def _testSimpleModel(self, use_forward_func, use_resource=False):
@@ -1723,8 +1750,8 @@ class VariableHoistingTest(test.TestCase):
     self.assertEqual("Foo/b", b.op.name)
 
     with self.session(graph=g) as sess:
-      sess.run(variables.global_variables_initializer())
-      w, b, x, y0, loss, dw, db = sess.run([w, b, x, y0, loss, dw, db])
+      self.evaluate(variables.global_variables_initializer())
+      w, b, x, y0, loss, dw, db = self.evaluate([w, b, x, y0, loss, dw, db])
 
     self.assertAllEqual(w.shape, (64, 64))
     self.assertAllClose(np.sum(w), 2050.44)
@@ -1736,10 +1763,12 @@ class VariableHoistingTest(test.TestCase):
     self.assertAllEqual(db.shape, (64,))
     self.assertAllClose(np.sum(db), 0.509, rtol=1e-2)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     self._testSimpleModel(True)
     self._testSimpleModel(False)
 
+  @test_util.run_deprecated_v1
   def testBasicResource(self):
     self._testSimpleModel(True, use_resource=True)
     self._testSimpleModel(False, use_resource=True)
diff --git a/tensorflow/python/framework/graph_io.py b/tensorflow/python/framework/graph_io.py
index 47e1344eaeda7d0cc6a4b0e652071f79f1bc24fa..ee0fd227eec688ec7c48dad241931f6700173ee0 100644
--- a/tensorflow/python/framework/graph_io.py
+++ b/tensorflow/python/framework/graph_io.py
@@ -27,7 +27,7 @@ from tensorflow.python.lib.io import file_io
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('io.write_graph', 'train.write_graph')
+@tf_export('io.write_graph', v1=['io.write_graph', 'train.write_graph'])
 def write_graph(graph_or_graph_def, logdir, name, as_text=True):
   """Writes a graph proto to a file.
 
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 394fac6c856197030f85aab5b11fa881eddf670d..1b61ac925ce3d555525c9086172d43c75a3af10c 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 _VARIABLE_OPS = {
@@ -50,7 +51,10 @@ def _is_variable_op(op):
   return op in _VARIABLE_OPS
 
 
-@tf_export("graph_util.must_run_on_cpu")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.must_run_on_cpu")
+@tf_export(v1=["graph_util.must_run_on_cpu"])
 def must_run_on_cpu(node, pin_variables_on_cpu=False):
   """Returns True if the given node_def must run on CPU, otherwise False.
 
@@ -149,7 +153,10 @@ def _bfs_for_reachable_nodes(target_nodes, name_to_input_name):
   return nodes_to_keep
 
 
-@tf_export("graph_util.extract_sub_graph")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.extract_sub_graph")
+@tf_export(v1=["graph_util.extract_sub_graph"])
 def extract_sub_graph(graph_def, dest_nodes):
   """Extract the subgraph that can reach any of the nodes in 'dest_nodes'.
 
@@ -187,7 +194,10 @@ def extract_sub_graph(graph_def, dest_nodes):
   return out
 
 
-@tf_export("graph_util.tensor_shape_from_node_def_name")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.remove_training_nodes")
+@tf_export(v1=["graph_util.tensor_shape_from_node_def_name"])
 def tensor_shape_from_node_def_name(graph, input_name):
   """Convenience function to get a shape from a NodeDef's input string."""
   # To get a tensor, the name must be in the form <input>:<port>, for example
@@ -202,7 +212,10 @@ def tensor_shape_from_node_def_name(graph, input_name):
   return shape
 
 
-@tf_export("graph_util.convert_variables_to_constants")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.convert_variables_to_constants")
+@tf_export(v1=["graph_util.convert_variables_to_constants"])
 def convert_variables_to_constants(sess,
                                    input_graph_def,
                                    output_node_names,
@@ -289,7 +302,10 @@ def convert_variables_to_constants(sess,
   return output_graph_def
 
 
-@tf_export("graph_util.remove_training_nodes")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.compat.v1.graph_util.remove_training_nodes")
+@tf_export(v1=["graph_util.remove_training_nodes"])
 def remove_training_nodes(input_graph, protected_nodes=None):
   """Prunes out nodes that aren't needed for inference.
 
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index 563a177dd06b3b165335c91c3a92ff8877609efc..dd26b8a78e9d2e13b34770775fcb1219745396e0 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops as math_ops_lib
@@ -102,6 +103,7 @@ class DeviceFunctionsTest(test.TestCase):
     self.assertDeviceEqual(var_5.device, "/device:GPU:0")
     self.assertDeviceEqual(var_6.device, "/device:CPU:0")
 
+  @test_util.run_v1_only("b/120545219")
   def testNestedDeviceFunctions(self):
     with ops.Graph().as_default():
       var_0 = variables.VariableV1(0)
@@ -210,8 +212,8 @@ class DeviceFunctionsTest(test.TestCase):
 
       with session.Session() as sess:
         init = variables.variables_initializer([variable_node])
-        sess.run(init)
-        output = sess.run(output_node)
+        self.evaluate(init)
+        output = self.evaluate(output_node)
         self.assertNear(4.0, output, 0.00001)
         variable_graph_def = sess.graph.as_graph_def()
 
@@ -242,8 +244,8 @@ class DeviceFunctionsTest(test.TestCase):
         output_node = math_ops_lib.multiply(
             variable_node, 2.0, name="output_node")
         with session.Session() as sess:
-          sess.run(variable_node.initializer)
-          output = sess.run(output_node)
+          self.evaluate(variable_node.initializer)
+          output = self.evaluate(output_node)
           self.assertNear(2.0, output, 0.00001)
           variable_graph_def = sess.graph.as_graph_def()
           # First get the constant_graph_def when variable_names_whitelist is
@@ -256,7 +258,7 @@ class DeviceFunctionsTest(test.TestCase):
 
           # Then initialize the unused variable, and get another
           # constant_graph_def when variable_names_whitelist is not set.
-          sess.run(another_variable.initializer)
+          self.evaluate(another_variable.initializer)
           constant_graph_def_without_variable_whitelist = (
               graph_util.convert_variables_to_constants(
                   sess, variable_graph_def, ["output_node"]))
@@ -295,7 +297,7 @@ class DeviceFunctionsTest(test.TestCase):
             ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"])
       with session.Session() as sess:
         output_node = sess.graph.get_tensor_by_name("output_node:0")
-        output = sess.run(output_node)
+        output = self.evaluate(output_node)
         self.assertNear(2.0, output, 0.00001)
 
   def create_node_def(self, op, name, inputs):
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index c6595918ae4ec595f757bc8059e907d2d55c14c9..98c7aeccc4b19edfc433a6556108ef8b77d12aa4 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -21,6 +21,7 @@ import contextlib
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python import tf2
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import errors
@@ -253,7 +254,9 @@ def _ProcessNewOps(graph):
     # Find any device in the list of colocated ops that have a device, if it
     # exists.  We assume that if multiple ops have devices, they refer to the
     # same device.  Otherwise, a runtime error will occur since the colocation
-    # property cannot be guaranteed.
+    # property cannot be guaranteed.  Note in TF2 colocations have been removed
+    # from the public API and will be considered a hint, so there is no runtime
+    # error.
     #
     # One possible improvement is to try to check for compatibility of all
     # devices in this list at import time here, which would require
@@ -262,6 +265,10 @@ def _ProcessNewOps(graph):
       try:
         coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name)  # pylint: disable=protected-access
       except KeyError:
+        # Do not error in TF2 if the colocation cannot be guaranteed
+        if tf2.enabled():
+          continue
+
         raise ValueError('Specified colocation to an op that '
                          'does not exist during import: %s in %s' %
                          (coloc_op_name, op.name))
@@ -370,7 +377,8 @@ def import_graph_def(graph_def,
 
   Returns:
     A list of `Operation` and/or `Tensor` objects from the imported graph,
-    corresponding to the names in `return_elements`.
+    corresponding to the names in `return_elements`,
+    and None if `returns_elements` is None.
 
   Raises:
     TypeError: If `graph_def` is not a `GraphDef` proto,
@@ -430,17 +438,16 @@ def import_graph_def(graph_def,
     #
     # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
     # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
-    # TODO(b/74620627): move this after _ProcessNewOps outside the lock once
-    # _USE_C_SHAPES is removed.
-    if graph_def.library and graph_def.library.function:
-      # pylint: disable=protected-access
-      functions = function._from_library(graph_def.library)
-      for f in functions:
-        f.add_to_graph(graph)
-      # pylint: enable=protected-access
 
     _ProcessNewOps(graph)
 
+  if graph_def.library and graph_def.library.function:
+    # pylint: disable=protected-access
+    functions = function._from_library(graph_def.library)
+    for f in functions:
+      f.add_to_graph(graph)
+    # pylint: enable=protected-access
+
   # Treat input mappings that don't appear in the graph as an error, because
   # they are likely to be due to a typo.
   missing_unused_input_keys = (
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index 2b4d8e7299559b689763e18f204556890a412410..66e80b558523bcab64a1a509aae60d5b9e679e40 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -397,11 +397,11 @@ class ImportGraphDefTest(test.TestCase):
       # Run the imported graph.
       # TODO(b/76173421): make this work (currently DCHECKS)
       # with self.cached_session() as sess:
-      #   sess.run(imported_init)
-      #   self.assertEqual(sess.run(imported_var), 1.0)
-      #   self.assertEqual(sess.run(imported_assign), 2.0)
-      #   self.assertEqual(list(sess.run(imported_shape)), [])
-      #   self.assertEqual(list(sess.run(new_var_shape)), [])
+      #   self.evaluate(imported_init)
+      #   self.assertEqual(self.evaluate(imported_var), 1.0)
+      #   self.assertEqual(self.evaluate(imported_assign), 2.0)
+      #   self.assertEqual(list(self.evaluate(imported_shape)), [])
+      #   self.assertEqual(list(self.evaluate(new_var_shape)), [])
 
   def testWhileLoop(self):
     # Produce GraphDef containing while loop.
@@ -418,7 +418,7 @@ class ImportGraphDefTest(test.TestCase):
                                               return_elements=[r.name])
       self.assertEqual(imported_r.name, "import/" + r.name)
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(imported_r), 10)
+        self.assertEqual(self.evaluate(imported_r), 10)
 
   def testImportWhileLoopInCond(self):
     # Produce GraphDef containing while loop.
@@ -458,7 +458,7 @@ class ImportGraphDefTest(test.TestCase):
           lambda i: i < 2, ImportFn, [0],
           shape_invariants=[tensor_shape.TensorShape(None)])
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(out), 10)
+        self.assertEqual(self.evaluate(out), 10)
 
   def testTypeMismatchInGraphDef(self):
     # TODO(skyewm): improve error message
@@ -930,7 +930,7 @@ class ImportGraphDefTest(test.TestCase):
           name="",
           return_elements=["id:0"])
       with self.cached_session():
-        self.assertEqual(5.0, t.eval())
+        self.assertEqual(5.0, self.evaluate(t))
 
   def testInvalidInputForReturnOperations(self):
     with ops.Graph().as_default():
@@ -1071,7 +1071,7 @@ class ImportGraphDefTest(test.TestCase):
       tensor_input = np.ones(input_shape, dtype=np.float32)
       t = constant_op.constant(tensor_input, shape=input_shape)
       g = array_ops.identity(t)
-      g.eval()
+      self.evaluate(g)
 
   def testVersion(self):
     v0 = versions.GRAPH_DEF_VERSION_MIN_CONSUMER
@@ -1255,7 +1255,7 @@ class ImportGraphDefTest(test.TestCase):
     z = TestFunc()
 
     with self.cached_session():
-      z_val = z.eval()
+      z_val = self.evaluate(z)
       self.assertEqual(z_val, -2.0)
 
   def testImportGraphWithFunctionTwice(self):
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 908a5f521e15690dee0683ee25dea86e43b5f1f0..727f6aa44c2ed11414e805eb635a9adbc5519da6 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -31,6 +31,7 @@ from tensorflow.core.lib.core import error_codes_pb2  # pylint: disable=unused-i
 from tensorflow.python import pywrap_tensorflow as py_tf
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -83,7 +84,8 @@ def load_op_library(library_filename):
   return module
 
 
-@tf_export('load_file_system_library')
+@deprecation.deprecated(date=None, instructions='Use tf.load_library instead.')
+@tf_export(v1=['load_file_system_library'])
 def load_file_system_library(library_filename):
   """Loads a TensorFlow plugin, containing file system implementation.
 
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 33631282bd03a15daddb334e6f40e6b52f84c750..ddf6f66e8ab5e17aa611cce40b01953fb7a5d3b1 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -462,7 +462,7 @@ def _is_default_attr_value(op_def, attr_name, attr_value):
   return False
 
 
-def _strip_graph_default_valued_attrs(meta_graph_def):
+def strip_graph_default_valued_attrs(meta_graph_def):
   """Strips default valued attributes for node defs in given MetaGraphDef.
 
   This method also sets `meta_info_def.stripped_default_attrs` in the given
@@ -587,7 +587,7 @@ def create_meta_graph_def(meta_info_def=None,
 
   # Strip default valued attributes in graph_def.
   if strip_default_attrs:
-    _strip_graph_default_valued_attrs(meta_graph_def)
+    strip_graph_default_valued_attrs(meta_graph_def)
 
   # Adds saver_def.
   if saver_def:
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index fc98b91a016cf40b32607320bb2ebb65cc7d6a63..e6e87881649729ca65db8cba9914e29b5a0d064e 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -63,6 +63,7 @@ def _TestDir(test_name):
 
 class SimpleMetaGraphTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoVariables(self):
     test_dir = _TestDir("no_variables")
     filename = os.path.join(test_dir, "metafile")
@@ -116,6 +117,7 @@ class SimpleMetaGraphTest(test.TestCase):
                                   {new_input_tensor: input_feed_value})
       self.assertEqual(new_output_value, output_value)
 
+  @test_util.run_deprecated_v1
   def testStrippedOpListNestedFunctions(self):
     with self.cached_session():
       # Square two levels deep
@@ -158,6 +160,7 @@ class SimpleMetaGraphTest(test.TestCase):
     op_list = meta_graph.stripped_op_list_for_graph(graph)
     self.assertEqual(["Const"], [op.name for op in op_list.op])
 
+  @test_util.run_deprecated_v1
   def testDefaultAttrStripping(self):
     """Verifies that default attributes are stripped from a graph def."""
 
@@ -210,6 +213,7 @@ class SimpleMetaGraphTest(test.TestCase):
       self.assertEqual(node_def.attr["Tout"].type, dtypes.complex128)
       self.assertTrue(meta_graph_def.meta_info_def.stripped_default_attrs)
 
+  @test_util.run_deprecated_v1
   def testDefaultAttrStrippingNestedFunctions(self):
     """Verifies that default attributes are stripped from function node defs."""
     with self.cached_session():
@@ -261,6 +265,7 @@ class SimpleMetaGraphTest(test.TestCase):
       self.assertEqual(node_def.attr["attr_1"].i, 1)
       self.assertTrue(meta_graph_def.meta_info_def.stripped_default_attrs)
 
+  @test_util.run_deprecated_v1
   def testVariableObjectsAreSharedAmongCollections(self):
     with ops.Graph().as_default() as graph1:
       v = variables.Variable(3.0)
@@ -454,6 +459,7 @@ class ScopedMetaGraphTest(test.TestCase):
 
   # Verifies that we can export the subgraph under each layer and import
   # them into new layers in a new graph.
+  @test_util.run_deprecated_v1
   def testScopedExportAndImport(self):
     test_dir = _TestDir("scoped_export_import")
     filenames = [
@@ -492,8 +498,8 @@ class ScopedMetaGraphTest(test.TestCase):
       init_op = variables.global_variables_initializer()
       grad = gradients_impl.gradients([output], [var])
       with session.Session() as sess:
-        sess.run(init_op)
-        expected_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        expected_grad_value = self.evaluate(grad)
 
     # Restore the MetaGraphDef into a new Graph with an import scope.
     with ops.Graph().as_default():
@@ -518,10 +524,11 @@ class ScopedMetaGraphTest(test.TestCase):
       init_op = variables.global_variables_initializer()
 
       with session.Session() as sess:
-        sess.run(init_op)
-        actual_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        actual_grad_value = self.evaluate(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
+  @test_util.run_v1_only("b/120545219")
   def testImportWhileLoopInWhileLoop(self):
     # Create a simple while loop.
     with ops.Graph().as_default():
@@ -544,9 +551,10 @@ class ScopedMetaGraphTest(test.TestCase):
       _, x = control_flow_ops.while_loop(lambda i, x: i < 2, body, [0, 0.0],
                                          name="")
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(x)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(x)
 
+  @test_util.run_deprecated_v1
   def testScopedImportUnderNameScope(self):
     graph = ops.Graph()
     with graph.as_default():
@@ -562,6 +570,7 @@ class ScopedMetaGraphTest(test.TestCase):
         self.assertEqual(list(imported_variables.values())[0].name,
                          "foo/bar/myvar:0")
 
+  @test_util.run_deprecated_v1
   def testScopedImportUnderNameScopeNoVarScope(self):
     graph = ops.Graph()
     with graph.as_default():
@@ -590,6 +599,7 @@ class ScopedMetaGraphTest(test.TestCase):
         self.assertEqual(list(imported_variables.values())[0].name,
                          "s" + suffix + "/v:0")
 
+  @test_util.run_deprecated_v1
   def testScopedImportWithSelectedCollections(self):
     meta_graph_filename = os.path.join(
         _TestDir("selected_collections_import"), "meta_graph.pb")
@@ -600,11 +610,11 @@ class ScopedMetaGraphTest(test.TestCase):
     with graph.as_default():
       variables.Variable(initial_value=1.0, trainable=True)
     self.assertTrue(
-        all([
+        all(
             graph.get_collection(key)
             for key in
             [ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES]
-        ]))
+        ))
     meta_graph.export_scoped_meta_graph(
         filename=meta_graph_filename, graph=graph)
 
@@ -687,6 +697,7 @@ class ScopedMetaGraphTest(test.TestCase):
 
   # Verifies that we can export the subgraph containing a FIFOQueue under
   # "queue1" and import it into "new_queue1" in a new graph.
+  @test_util.run_deprecated_v1
   def testScopedWithQueue(self):
     test_dir = _TestDir("scoped_with_queue")
     orig_meta_graph = self._testScopedExportWithQueue(test_dir,
@@ -749,12 +760,15 @@ class ScopedMetaGraphTest(test.TestCase):
     for n, e in zip(nodes, expected):
       self.assertEqual([e], graph2.get_operation_by_name(n).get_attr("_class"))
 
+  @test_util.run_deprecated_v1
   def testExportNestedNames(self):
     self.doTestExportNestedNames(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testExportNestedNamesResource(self):
     self.doTestExportNestedNames(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testPotentialCycle(self):
     graph1 = ops.Graph()
     with graph1.as_default():
@@ -783,6 +797,7 @@ class ScopedMetaGraphTest(test.TestCase):
                   4.0, shape=[2, 2])
           })
 
+  @test_util.run_deprecated_v1
   def testClearDevices(self):
     graph1 = ops.Graph()
     with graph1.as_default():
@@ -842,6 +857,7 @@ class ScopedMetaGraphTest(test.TestCase):
 
 class MetaGraphWithVariableScopeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
 
     def _enqueue_vector(sess, queue, values, shape=None):
@@ -868,8 +884,8 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
       _, update_op = metrics.mean(values)
 
       initializer = variables.local_variables_initializer()
-      sess.run(initializer)
-      sess.run(update_op)
+      self.evaluate(initializer)
+      self.evaluate(update_op)
 
     meta_graph.export_scoped_meta_graph(
         filename=meta_graph_filename, graph=graph)
@@ -880,7 +896,7 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
     with self.session(graph=graph) as sess:
       meta_graph.import_scoped_meta_graph(meta_graph_filename)
       initializer = variables.local_variables_initializer()
-      sess.run(initializer)
+      self.evaluate(initializer)
 
     # Verifies that importing an old meta_graph where "local_variables"
     # collection is of node_list type works, but cannot build initializer
@@ -899,6 +915,7 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
 
 class ExportImportAcrossScopesTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testPartionedVariables(self):
 
     def make_graph_with_partitioned_variables(use_resource):
diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py
index e85bba11cd15a0cf4e70d16e981ac14f4308a251..2318b32ef10d67c48950061d2c489f6c7dfb20a0 100644
--- a/tensorflow/python/framework/op_def_library.py
+++ b/tensorflow/python/framework/op_def_library.py
@@ -482,7 +482,8 @@ class OpDefLibrary(object):
               else:
                 raise TypeError("%s that don't all match." % prefix)
             else:
-              raise TypeError("%s that are invalid." % prefix)
+              raise TypeError(
+                  "%s that are invalid. Tensors: %s" % (prefix, values))
 
           types = [x.dtype for x in values]
           inputs.extend(values)
@@ -569,7 +570,7 @@ class OpDefLibrary(object):
                   "than minimum length %d." %
                   (input_name, op_type_name, len(values), num_attr.minimum))
           # All tensors must have the same base type.
-          if any([bt != base_types[0] for bt in base_types]):
+          if any(bt != base_types[0] for bt in base_types):
             raise TypeError(
                 "All tensors passed to '%s' of '%s' Op "
                 "must have the same type." %
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 8bb177939e29716e065dd35bb9d38e3f2b9aba28..fa306936d653b233bba3b54d4f9a03ea202684e6 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -36,14 +36,13 @@ from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.framework import versions_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
-from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import registry
@@ -101,7 +100,7 @@ class _UserDeviceSpec(object):
       self.function = pydev.merge_device(self._device_name_or_function)
 
 
-class _NullContextmanager(object):
+class NullContextmanager(object):
 
   def __enter__(self):
     pass
@@ -318,19 +317,13 @@ class Tensor(_TensorLike):
     self._op = op
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
-
+    # This will be set by self._as_tf_output().
+    self._tf_output = None
     # This will be set by self.shape().
     self._shape_val = None
-
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
     self._consumers = []
-
-    if not _USE_C_SHAPES:
-      # Attributes used for C++ shape inference. Not inspected, only forwarded.
-      # If set, will be a HandleData object from cpp_shape_inference.proto.
-      self._handle_data = None
-
     self._id = uid()
 
   @property
@@ -405,17 +398,7 @@ class Tensor(_TensorLike):
 
     """
     if self._shape_val is None:
-      if _USE_C_SHAPES:
-        self._shape_val = self._c_api_shape()
-      else:
-        # Call set_shape_and_handle_data_for_outputs in topological order on all
-        # ops that are needed to compute self.op's shape. We do this instead of
-        # having set_shape_and_handle_data_for_outputs recursively call
-        # Operation.shape on self.op.inputs to overflowing the call stack.
-        need_shapes = self._get_input_ops_without_shapes(self.op)
-        need_shapes.sort(key=lambda op: op._id)
-        for op in need_shapes:
-          set_shape_and_handle_data_for_outputs(op)
+      self._shape_val = self._c_api_shape()
     return self._shape_val
 
   def _get_input_ops_without_shapes(self, target_op):
@@ -530,14 +513,10 @@ class Tensor(_TensorLike):
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    if _USE_C_SHAPES:  # pylint: disable=protected-access
-      # Reset cached shape.
-      self._shape_val = None
-    else:
-      self._shape_val = self.shape.merge_with(shape)
+    # Reset cached shape.
+    self._shape_val = None
 
-    # Update C shape even if _USE_C_SHAPES = False, since we still want
-    # set_shape to be reflected in the C API graph for when we run it.
+    # We want set_shape to be reflected in the C API graph for when we run it.
     if not isinstance(shape, tensor_shape.TensorShape):
       shape = tensor_shape.TensorShape(shape)
     dim_list = []
@@ -601,7 +580,13 @@ class Tensor(_TensorLike):
 
   def _as_tf_output(self):
     # pylint: disable=protected-access
-    return c_api_util.tf_output(self.op._c_op, self.value_index)
+    # NOTE: Beyond preventing unnecessary (re-)allocation, the cached object
+    # also guarantees that a dictionary of tf_output objects will retain a
+    # deterministic (yet unsorted) order which prevents memory blowup in the
+    # cache of executor(s) stored for every session.
+    if self._tf_output is None:
+      self._tf_output = c_api_util.tf_output(self.op._c_op, self.value_index)
+    return self._tf_output
     # pylint: enable=protected-access
 
   def __str__(self):
@@ -625,10 +610,7 @@ class Tensor(_TensorLike):
     return id(self) == id(other)
 
   def __copy__(self):
-    # Make sure _shape_val is computed before we copy.
     # TODO(b/77597810): get rid of Tensor copies.
-    if self._shape_val is None:
-      set_shape_and_handle_data_for_outputs(self.op)
     cls = self.__class__
     result = cls.__new__(cls)
     result.__dict__.update(self.__dict__)
@@ -765,6 +747,18 @@ class _EagerTensorBase(Tensor):
   def _numpy(self):
     raise NotImplementedError()
 
+  @property
+  def backing_device(self):
+    """Returns the name of the device holding this tensor's memory.
+
+    `.backing_device` is usually the same as `.device`, which returns
+    the device on which the kernel of the operation that produced this tensor
+    ran. However, some operations can produce tensors on a different device
+    (e.g., an operation that executes on the GPU but produces output tensors
+    in host memory).
+    """
+    raise NotImplementedError()
+
   def __copy__(self):
     # Eager Tensors are immutable so it's safe to return themselves as a copy.
     return self
@@ -881,6 +875,12 @@ class _EagerTensorBase(Tensor):
     """Returns the number of Tensor dimensions."""
     return self.shape.ndims
 
+  def __len__(self):
+    """Returns the length of the first dimension in the Tensor."""
+    if not self.shape.ndims:
+      raise TypeError("Scalar tensor has no `len()`")
+    return self._shape_tuple()[0]
+
   def _cpu_nograd(self):
     """A copy of this Tensor with contents backed by host memory.
 
@@ -909,13 +909,7 @@ class _EagerTensorBase(Tensor):
     return self._copy(context.context(), "GPU:" + str(gpu_index))
 
   def __bool__(self):
-    if self._shape_tuple() != ():  # pylint: disable=g-explicit-bool-comparison
-      raise ValueError(
-          "Non-scalar tensor %s cannot be converted to boolean." % repr(self))
-    if self.dtype != dtypes.bool:
-      raise ValueError(
-          "Non-boolean tensor %s cannot be converted to boolean." % repr(self))
-    return bool(self.cpu().numpy())
+    return bool(self.numpy())
 
   def __nonzero__(self):
     return self.__bool__()
@@ -992,7 +986,7 @@ _tensor_conversion_func_lock = threading.Lock()
 register_dense_tensor_like_type(Tensor)
 
 
-@tf_export("convert_to_tensor")
+@tf_export(v1=["convert_to_tensor"])
 def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
   """Converts the given `value` to a `Tensor`.
 
@@ -1035,70 +1029,92 @@ def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
       `preferred_dtype` is not possible, this argument has no effect.
 
   Returns:
-    An `Output` based on `value`.
+    An `Tensor` based on `value`.
 
   Raises:
-    TypeError: If no conversion function is registered for `value`.
+    TypeError: If no conversion function is registered for `value` to `dtype`.
     RuntimeError: If a registered conversion function returns an invalid value.
-
+    ValueError: If the `value` is a tensor not of given `dtype` in graph mode.
   """
-  return internal_convert_to_tensor(
-      value=value,
-      dtype=dtype,
-      name=name,
-      preferred_dtype=preferred_dtype,
-      as_ref=False)
-
-
-def _error_prefix(name):
-  return "" if name is None else "%s: " % name
+  return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
 
 
-def internal_convert_to_tensor(value,
-                               dtype=None,
-                               name=None,
-                               as_ref=False,
-                               preferred_dtype=None,
-                               ctx=None):
-  """Converts the given `value` to an `Tensor`.
+@tf_export("convert_to_tensor", v1=[])
+def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None):
+  """Converts the given `value` to a `Tensor`.
 
   This function converts Python objects of various types to `Tensor`
   objects. It accepts `Tensor` objects, numpy arrays, Python lists,
   and Python scalars. For example:
 
+  ```python
+  import numpy as np
+
+  def my_func(arg):
+    arg = tf.convert_to_tensor(arg, dtype=tf.float32)
+    return tf.matmul(arg, arg) + arg
+
+  # The following calls are equivalent.
+  value_1 = my_func(tf.constant([[1.0, 2.0], [3.0, 4.0]]))
+  value_2 = my_func([[1.0, 2.0], [3.0, 4.0]])
+  value_3 = my_func(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32))
+  ```
+
   This function can be useful when composing a new operation in Python
-  All standard Python op constructors apply this function to each of their
-  Tensor-valued inputs, which allows those ops to accept numpy arrays, Python
-  lists, and scalars in addition to `Tensor` objects.
+  (such as `my_func` in the example above). All standard Python op
+  constructors apply this function to each of their Tensor-valued
+  inputs, which allows those ops to accept numpy arrays, Python lists,
+  and scalars in addition to `Tensor` objects.
+
+  Note: This function diverges from default Numpy behavior for `float` and
+    `string` types when `None` is present in a Python list or scalar. Rather
+    than silently converting `None` values, an error will be thrown.
 
   Args:
     value: An object whose type has a registered `Tensor` conversion function.
     dtype: Optional element type for the returned tensor. If missing, the
       type is inferred from the type of `value`.
-    name: Optional name to use if a new `Tensor` is created.
-    as_ref: True if we want the mutable view of Variables, if applicable.
-    preferred_dtype: Optional element type for the returned tensor,
+    dtype_hint: Optional element type for the returned tensor,
       used when dtype is None. In some cases, a caller may not have a
-      dtype in mind when converting to a tensor, so preferred_dtype
+      dtype in mind when converting to a tensor, so dtype_hint
       can be used as a soft preference.  If the conversion to
-      `preferred_dtype` is not possible, this argument has no effect.
-    ctx: Optional: The value of context.context().
+      `dtype_hint` is not possible, this argument has no effect.
+    name: Optional name to use if a new `Tensor` is created.
 
   Returns:
-    A `Tensor` based on `value`.
+    An `Tensor` based on `value`.
 
   Raises:
-    TypeError: If no conversion function is registered for `value`.
+    TypeError: If no conversion function is registered for `value` to `dtype`.
     RuntimeError: If a registered conversion function returns an invalid value.
-
+    ValueError: If the `value` is a tensor not of given `dtype` in graph mode.
   """
+  return internal_convert_to_tensor(
+      value=value,
+      dtype=dtype,
+      name=name,
+      preferred_dtype=dtype_hint,
+      as_ref=False)
+
+
+def _error_prefix(name):
+  return "" if name is None else "%s: " % name
+
+
+def internal_convert_to_tensor(value,
+                               dtype=None,
+                               name=None,
+                               as_ref=False,
+                               preferred_dtype=None,
+                               ctx=None,
+                               accept_symbolic_tensors=True):
+  """Implementation of the public convert_to_tensor."""
   if ctx is None: ctx = context.context()
   if isinstance(value, EagerTensor):
     if ctx.executing_eagerly():
-      # Fast path for EagerTensors that don't need any conversion.
-      # Note that we don't check that value's dtype matches the dtype
-      # argument.  We expect that the C runtime will do that checking
-      # when we execute the kernel.
+      if dtype is not None:
+        dtype = dtypes.as_dtype(dtype)
+        value = _TensorTensorConversionFunction(value, dtype=dtype)
       return value
     else:
       graph = get_default_graph()
@@ -1106,6 +1122,19 @@ def internal_convert_to_tensor(value,
         raise RuntimeError("Attempting to capture an EagerTensor without "
                            "building a function.")
       return graph.capture(value, name=name)
+  elif ((not accept_symbolic_tensors) and
+        isinstance(value, Tensor) and
+        ctx.executing_eagerly()):
+    # Found a symbolic tensor in an eager context.
+    # This happens when we use the Keras functional API (i.e. calling layers
+    # on the output of `keras.Input()`, which is symbolic) while eager
+    # execution is enabled.
+    if _is_keras_symbolic_tensor(value):
+      # If the graph of the tensor isn't the Keras graph, we should still
+      # fail, for the time being. TODO(fchollet): consider allowing
+      # all symbolic tensors to raise this exception in this case.
+      raise core._SymbolicException(  # pylint: disable=protected-access
+          "Using the symbolic output of a Keras layer during eager execution.")
 
   if dtype is not None:
     dtype = dtypes.as_dtype(dtype)
@@ -1244,7 +1273,7 @@ def convert_n_to_tensor(values, dtype=None, name=None, preferred_dtype=None):
       as_ref=False)
 
 
-@tf_export("convert_to_tensor_or_indexed_slices")
+@tf_export(v1=["convert_to_tensor_or_indexed_slices"])
 def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None):
   """Converts the given object to a `Tensor` or an `IndexedSlices`.
 
@@ -1605,6 +1634,8 @@ def _create_c_op(graph, node_def, inputs, control_inputs):
   op_desc = c_api.TF_NewOperation(graph._c_graph,
                                   compat.as_str(node_def.op),
                                   compat.as_str(node_def.name))
+  if node_def.device:
+    c_api.TF_SetDevice(op_desc, compat.as_str(node_def.device))
   # Add inputs
   for op_input in inputs:
     if isinstance(op_input, (list, tuple)):
@@ -2047,12 +2078,6 @@ class Operation(object):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
 
-    # Make sure output shapes are already computed for this op in case we create
-    # a cycle (we cannot compute shapes for cycles). Usually shapes are computed
-    # lazily upon request.
-    if not _USE_C_SHAPES:
-      set_shape_and_handle_data_for_outputs(self)
-
     # Reset cached inputs.
     self._inputs_val = None
     c_api.UpdateEdge(
@@ -2060,6 +2085,31 @@ class Operation(object):
         tensor._as_tf_output(),  # pylint: disable=protected-access
         self._tf_input(index))
 
+  def _add_while_inputs(self, tensors):
+    """See AddWhileInputHack in python_api.h.
+
+    NOTE: This is for TF internal use only. Please don't use it.
+
+    Args:
+      tensors: list of Tensors
+
+    Raises:
+      TypeError: if tensor is not a Tensor,
+        or if input tensor type is not convertible to dtype.
+      ValueError: if the Tensor is from a different graph.
+    """
+    for tensor in tensors:
+      if not isinstance(tensor, Tensor):
+        raise TypeError("tensor must be a Tensor: %s" % tensor)
+      _assert_same_graph(self, tensor)
+
+      # Reset cached inputs.
+      self._inputs_val = None
+      c_api.AddWhileInputHack(
+          self._graph._c_graph,  # pylint: disable=protected-access
+          tensor._as_tf_output(),  # pylint: disable=protected-access
+          self._c_op)
+
   def _add_control_inputs(self, ops):
     """Add a list of new control inputs to this operation.
 
@@ -2093,6 +2143,23 @@ class Operation(object):
     """Removes any control inputs to this operation."""
     c_api.RemoveAllControlInputs(self._graph._c_graph, self._c_op)  # pylint: disable=protected-access
 
+  def _add_outputs(self, types, shapes):
+    """Adds new Tensors to self.outputs.
+
+    Note: this is generally unsafe to use. This is used in certain situations in
+    conjunction with _set_type_list_attr.
+
+    Arguments:
+      types: list of DTypes
+      shapes: list of TensorShapes
+    """
+    assert len(types) == len(shapes)
+    orig_num_outputs = len(self.outputs)
+    for i in range(len(types)):
+      t = Tensor(self, orig_num_outputs + i, types[i])
+      self._outputs.append(t)
+      t.set_shape(shapes[i])
+
   def __str__(self):
     return str(self.node_def)
 
@@ -2305,6 +2372,25 @@ class Operation(object):
     finally:
       c_api.TF_DeleteBuffer(buf)
 
+  def _set_func_attr(self, attr_name, func_name):
+    """Private method used to set a function attribute in the node_def."""
+    func = attr_value_pb2.NameAttrList(name=func_name)
+    self._set_attr(attr_name, attr_value_pb2.AttrValue(func=func))
+
+  def _set_type_list_attr(self, attr_name, types):
+    """Private method used to set a function attribute in the node_def."""
+    if not types: return
+    if isinstance(types[0], dtypes.DType):
+      types = [dt.as_datatype_enum for dt in types]
+    types_list = attr_value_pb2.AttrValue.ListValue(type=types)
+    self._set_attr(attr_name, attr_value_pb2.AttrValue(list=types_list))
+
+  def _set_shape_list_attr(self, attr_name, shapes):
+    """Private method used to set a function attribute in the node_def."""
+    shapes = [s.as_proto() for s in shapes]
+    shapes_list = attr_value_pb2.AttrValue.ListValue(shape=shapes)
+    self._set_attr(attr_name, attr_value_pb2.AttrValue(list=shapes_list))
+
   def get_attr(self, name):
     """Returns the value of the attr of this op with the given `name`.
 
@@ -2317,7 +2403,7 @@ class Operation(object):
     Raises:
       ValueError: If this op does not have an attr with the given `name`.
     """
-    fields = ["s", "i", "f", "b", "type", "shape", "tensor", "func"]
+    fields = ("s", "i", "f", "b", "type", "shape", "tensor", "func")
     try:
       with c_api_util.tf_buffer() as buf:
         c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
@@ -2328,25 +2414,21 @@ class Operation(object):
     x = attr_value_pb2.AttrValue()
     x.ParseFromString(data)
 
-    # Treat an empty oneof value as an empty list.
-    if not x.WhichOneof("value"):
+    oneof_value = x.WhichOneof("value")
+    if oneof_value is None:
       return []
-    if x.HasField("list"):
+    if oneof_value == "list":
       for f in fields:
         if getattr(x.list, f):
           if f == "type":
-            return [dtypes.as_dtype(x) for x in list(getattr(x.list, f))]
+            return [dtypes.as_dtype(t) for t in x.list.type]
           else:
             return list(getattr(x.list, f))
       return []
-    else:
-      for f in fields:
-        if x.HasField(f):
-          if f == "type":
-            return dtypes.as_dtype(getattr(x, f))
-          else:
-            return getattr(x, f)
-      assert False, "Unsupported field type in " + str(x)
+    if oneof_value == "type":
+      return dtypes.as_dtype(x.type)
+    assert oneof_value in fields, "Unsupported field type in " + str(x)
+    return getattr(x, oneof_value)
 
   def run(self, feed_dict=None, session=None):
     """Runs this operation in a `Session`.
@@ -2413,8 +2495,9 @@ class RegisterGradient(object):
     return f
 
 
-@tf_export("NoGradient", "NotDifferentiable")
-def NotDifferentiable(op_type):
+@deprecation.deprecated_endpoints("NotDifferentiable", "NoGradient")
+@tf_export("no_gradient", v1=["no_gradient", "NotDifferentiable", "NoGradient"])
+def no_gradient(op_type):
   """Specifies that ops of type `op_type` is not differentiable.
 
   This function should *not* be used for operations that have a
@@ -2447,8 +2530,9 @@ def NotDifferentiable(op_type):
   _gradient_registry.register(None, op_type)
 
 
-# Alias for the old name, will be eventually removed.
-NoGradient = NotDifferentiable
+# Aliases for the old names, will be eventually removed.
+NoGradient = no_gradient
+NotDifferentiable = no_gradient
 
 
 def get_gradient_function(op):
@@ -2524,72 +2608,9 @@ class RegisterShape(object):
     return f
 
 
-# TODO(b/74620627): remove when _USE_C_SHAPES is removed
-def _set_shape_and_handle_data_for_outputs_c_api(op):
-  """Set shapes and resource handle data using info from the C API."""
-  assert not _USE_C_SHAPES
-  for output in op.outputs:
-    output._shape_val = output._c_api_shape()
-    # Set the resource handle data for compatibility with the Python shape
-    # inference code.
-    serialized = c_api.GetHandleShapeAndType(op._graph._c_graph,  # pylint: disable=protected-access
-                                             output._as_tf_output())
-    if serialized:
-      output._handle_data = (
-          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData
-          .FromString(compat.as_bytes(serialized)))
-    else:
-      output._handle_data = None
-
-
-# TODO(b/74620627): remove when _USE_C_SHAPES is removed
-def set_shape_and_handle_data_for_outputs(op):
-  """Set the shapes and resource handle data for op's outputs.
-
-  When _USE_C_SHAPES = False, this is lazily called when a tensor's shape is
-  first requested. Usually this should work automatically, but some edge cases
-  may require manually calling this first to make sure Tensor._shape_val and
-  Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
-  Tensor).
-  """
-  if _USE_C_SHAPES: return
-
-  if op.graph._is_function(op.type):
-    for output in op.outputs:
-      output._shape_val = tensor_shape.unknown_shape()
-    return
-
-  try:
-    shape_func = _shape_registry.lookup(op.type)
-  except LookupError:
-    try:
-      shape_func = _default_shape_function_registry.lookup(op.type)
-    except LookupError:
-      shape_func = _call_cpp_shape_fn_and_require_op
-
-  shapes = shape_func(op)
-  if shapes is None:
-    raise RuntimeError(
-        "Shape function for op %s did not return any shapes" % op)
-  elif isinstance(shapes, dict):
-    # Returned by call_cpp_shape_fn
-    shapes_dict = shapes
-    shapes = shapes_dict["shapes"]
-    handle_datas = shapes_dict["handle_data"]
-    for output, handle_data in zip(op.outputs, handle_datas):
-      # Don't override any existing handle data that may have been manually set.
-      # pylint: disable=protected-access
-      if output._handle_data is None:
-        output._handle_data = handle_data
-      # pylint: enable=protected-access
-
-  if len(op.outputs) != len(shapes):
-    raise RuntimeError(
-        "Shape function for op %s returned %d shapes but expected %d %s %s" %
-        (op, len(shapes), len(op.outputs), shape_func.__name__, str(shapes)))
-  for output, s in zip(op.outputs, shapes):
-    output._shape_val = tensor_shape.unknown_shape()
-    output._shape_val = output._shape_val.merge_with(s)
+def set_shape_and_handle_data_for_outputs(_):
+  """No op. TODO(b/74620627): Remove this."""
+  pass
 
 
 class OpStats(object):
@@ -2817,8 +2838,8 @@ class Graph(object):
     self._stack_state_is_thread_local = False
     self._thread_local = threading.local()
     # Functions that will be applied to choose a device if none is specified.
-    # After switch_to_thread_local(), self._thread_local._device_function_stack
-    # is used instead.
+    # In TF2.x or after switch_to_thread_local(),
+    # self._thread_local._device_function_stack is used instead.
     self._graph_device_function_stack = traceable_stack.TraceableStack()
     # Default original_op applied to new ops.
     self._default_original_op = None
@@ -2826,7 +2847,7 @@ class Graph(object):
     # WhileContext defined in ops/control_flow_ops.py
     self._control_flow_context = None
     # A new node will depend of the union of all of the nodes in the stack.
-    # After switch_to_thread_local(),
+    # In TF2.x or after switch_to_thread_local(),
     # self._thread_local._control_dependencies_stack is used instead.
     self._graph_control_dependencies_stack = []
     # Arbitrary collections of objects.
@@ -2850,7 +2871,7 @@ class Graph(object):
         producer=versions.GRAPH_DEF_VERSION,
         min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER)
     self._building_function = False
-    # Stack of colocate_with ops. After switch_to_thread_local(),
+    # Stack of colocate_with ops. In TF2.x or after switch_to_thread_local(),
     # self._thread_local._colocation_stack is used instead.
     self._graph_colocation_stack = traceable_stack.TraceableStack()
     # Set of tensors that are dangerous to feed!
@@ -2883,6 +2904,8 @@ class Graph(object):
     # requirement (many custom ops do not have shape functions, and we don't
     # want to break these existing cases).
     c_api.SetRequireShapeInferenceFns(self._c_graph, False)
+    if tf2.enabled():
+      self.switch_to_thread_local()
 
   # Note: this method is private because the API of tf.Graph() is public and
   # frozen, and this functionality is still not ready for public visibility.
@@ -3307,36 +3330,6 @@ class Graph(object):
     self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
-  def _make_colocation_conflict_message(self, op, colocation_op):
-    """Return detailed error message about device conflict due to colocation."""
-    # Example error message:
-    #   Tried to colocate op 'a' (defined at file1.py:149) having device
-    #   '/device:GPU:0' with op 'b' (defined at file2:96) which had an
-    #   incompatible device '/device:CPU:0'.
-    #
-    #   No node-device colocations were active during op 'a' creation.
-    #   Device assignments active during op 'a' creation:
-    #     with tf.device(/device:GPU:0): file1.py:148>
-    #
-    #   Node-device colocations active during op 'b' creation:
-    #     with tf.colocate_with(a): file2.py:93>
-    #   Device assignments active during op 'b' creation:
-    #     with tf.device(/cpu:0): file2.py:94
-    op_info = error_interpolation.compute_field_dict(op)
-    coloc_op_info = error_interpolation.compute_field_dict(colocation_op)
-    msg = ("Tried to colocate op '{op_name}'{op_loc} having device '{op_dev}' "
-           "with op '{coloc_op_name}'{coloc_op_loc} which had an incompatible "
-           "device '{coloc_op_dev}'.\n\n{op_summary}\n\n{coloc_op_summary}"
-           .format(op_name=op.name,
-                   op_loc=op_info["defined_at"],
-                   op_dev=op.device,
-                   op_summary=op_info["devs_and_colocs"],
-                   coloc_op_name=colocation_op.name,
-                   coloc_op_loc=coloc_op_info["defined_at"],
-                   coloc_op_dev=colocation_op.device,
-                   coloc_op_summary=coloc_op_info["devs_and_colocs"]))
-    return msg
-
   def _create_op_helper(self, op, compute_device=True):
     """Common logic for creating an op in this graph."""
     # Apply any additional attributes requested. Do not overwrite any existing
@@ -3389,12 +3382,9 @@ class Graph(object):
       for colocation_op in self._colocation_stack.peek_objs():
         all_colocation_groups.extend(colocation_op.colocation_groups())
         if colocation_op.device:
-          if (op.device and pydev.canonical_name(op.device) !=
-              pydev.canonical_name(colocation_op.device)):
-            msg = self._make_colocation_conflict_message(op, colocation_op)
-            logging.warning(msg)
-          else:
-            op._set_device(colocation_op.device)  # pylint: disable=protected-access
+          # pylint: disable=protected-access
+          op._set_device(colocation_op.device)
+          # pylint: enable=protected-access
 
       all_colocation_groups = sorted(set(all_colocation_groups))
       # pylint: disable=protected-access
@@ -3442,11 +3432,6 @@ class Graph(object):
 
     # pylint: disable=protected-access
     for op in new_ops:
-      # Operations created by the C API always retrieve shapes from the C API so
-      # we preserve the shapes of ops created in import_graph_def (from the
-      # "_output_shapes" attr of the imported NodeDef).
-      if not _USE_C_SHAPES:
-        _set_shape_and_handle_data_for_outputs_c_api(op)
       new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
       op._add_control_inputs(new_control_inputs)
       op._control_flow_post_processing()
@@ -4140,10 +4125,7 @@ class Graph(object):
     if op is None and not ignore_existing:
       raise ValueError("Trying to reset colocation (op is None) but "
                        "ignore_existing is not True")
-
-    if op is not None and not isinstance(op, Operation):
-      # We always want to colocate with the reference op.
-      op = internal_convert_to_tensor_or_indexed_slices(op, as_ref=True).op
+    op = _op_to_colocate_with(op)
 
     # By default, colocate_with resets the device function stack,
     # since colocate_with is typically used in specific internal
@@ -4905,7 +4887,7 @@ class Graph(object):
 # apply to inner graph mode code. Fix that.
 
 
-@tf_export("device")
+@tf_export(v1=["device"])
 def device(device_name_or_function):
   """Wrapper for `Graph.device()` using the default graph.
 
@@ -4935,7 +4917,41 @@ def device(device_name_or_function):
     return get_default_graph().device(device_name_or_function)
 
 
-@tf_export("container")
+@tf_export("device", v1=[])
+def device_v2(device_name):
+  """Specifies the device for ops created/executed in this context.
+
+  `device_name` can be fully specified, as in "/job:worker/task:1/device:cpu:0",
+  or partially specified, containing only a subset of the "/"-separated
+  fields. Any fields which are specified override device annotations from outer
+  scopes. For example:
+
+  with tf.device('/job:foo'):
+    # ops created here have devices with /job:foo
+    with tf.device('/job:bar/task:0/device:gpu:2'):
+      # ops created here have the fully specified device above
+    with tf.device('/device:gpu:1'):
+      # ops created here have the device '/job:foo/device:gpu:1'
+
+  Args:
+    device_name: The device name to use in the context.
+
+  Returns:
+    A context manager that specifies the default device to use for newly
+    created ops.
+
+  Raises:
+    RuntimeError: If a function is passed in.
+  """
+  if callable(device_name):
+    raise RuntimeError("tf.device does not support functions.")
+  if context.executing_eagerly():
+    return context.device(device_name)
+  else:
+    return get_default_graph().device(device_name)
+
+
+@tf_export(v1=["container"])
 def container(container_name):
   """Wrapper for `Graph.container()` using the default graph.
 
@@ -4952,9 +4968,11 @@ def container(container_name):
 def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
   if context.executing_eagerly():
     if op is not None:
+      if not hasattr(op, "device"):
+        op = internal_convert_to_tensor_or_indexed_slices(op)
       return device(op.device)
     else:
-      return _NullContextmanager()
+      return NullContextmanager()
   else:
     default_graph = get_default_graph()
     if isinstance(op, EagerTensor):
@@ -4967,7 +4985,10 @@ def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
         op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
 
 
-@tf_export("colocate_with")
+@deprecation.deprecated(
+    date=None,
+    instructions="Colocations handled automatically by placer.")
+@tf_export(v1=["colocate_with"])
 def colocate_with(op, ignore_existing=False):
   return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
 
@@ -4999,7 +5020,7 @@ def control_dependencies(control_inputs):
       for control in control_inputs:
         if callable(control):
           control()
-    return _NullContextmanager()
+    return NullContextmanager()
   else:
     return get_default_graph().control_dependencies(control_inputs)
 
@@ -5099,7 +5120,7 @@ def default_session(session):
   return _default_session_stack.get_controller(session)
 
 
-@tf_export("get_default_session")
+@tf_export(v1=["get_default_session"])
 def get_default_session():
   """Returns the default session for the current thread.
 
@@ -5352,7 +5373,21 @@ def init_scope():
         outer_graph._device_function_stack = outer_device_stack  # pylint: disable=protected-access
 
 
-@tf_export("enable_eager_execution")
+def executing_eagerly_outside_functions():
+  """Returns True if executing eagerly, even if inside a graph function."""
+  # Fastpath for when this is called eagerly (its not necessary to init_scope).
+  if context.executing_eagerly():
+    return True
+
+  with init_scope():
+    return context.executing_eagerly()
+
+
+def inside_function():
+  return get_default_graph().building_function
+
+
+@tf_export(v1=["enable_eager_execution"])
 def enable_eager_execution(config=None,
                            device_policy=None,
                            execution_mode=None):
@@ -5423,6 +5458,17 @@ def enable_eager_execution(config=None,
         server_def=None)
 
 
+@tf_export(v1=["disable_eager_execution"])
+def disable_eager_execution():
+  """Disables eager execution.
+
+  This function can only be called before any Graphs, Ops, or Tensors have been
+  created. It can be used at the beginning of the program for complex migration
+  projects from TensorFlow 1.x to 2.x.
+  """
+  context.default_execution_mode = context.GRAPH_MODE
+
+
 def enable_eager_execution_internal(config=None,
                                     device_policy=None,
                                     execution_mode=None,
@@ -5430,6 +5476,7 @@ def enable_eager_execution_internal(config=None,
   """Enables eager execution for the lifetime of this program.
 
   Most of the doc string for enable_eager_execution is relevant here as well.
+
   Args:
     config: See enable_eager_execution doc string
     device_policy: See enable_eager_execution doc string
@@ -5460,8 +5507,7 @@ def enable_eager_execution_internal(config=None,
         "tf.contrib.eager.ASYNC")
   if context.default_execution_mode == context.GRAPH_MODE:
     graph_mode_has_been_used = (
-        _default_session_stack.stack
-        or len(get_default_graph().get_operations()) > 0)  # pylint: disable=g-explicit-length-test
+        _default_graph_stack._global_default_graph is not None) # pylint: disable=protected-access
     if graph_mode_has_been_used:
       raise ValueError(
           "tf.enable_eager_execution must be called at program startup.")
@@ -5523,7 +5569,7 @@ def eager_run(main=None, argv=None):
   app.run(main, argv)
 
 
-@tf_export("reset_default_graph")
+@tf_export(v1=["reset_default_graph"])
 def reset_default_graph():
   """Clears the default graph stack and resets the global default graph.
 
@@ -5542,7 +5588,7 @@ def reset_default_graph():
   _default_graph_stack.reset()
 
 
-@tf_export("get_default_graph")
+@tf_export(v1=["get_default_graph"])
 def get_default_graph():
   """Returns the default graph for the current thread.
 
@@ -5669,7 +5715,7 @@ def _get_graph_from_inputs(op_input_list, graph=None):
   return graph or get_default_graph()
 
 
-@tf_export("GraphKeys")
+@tf_export(v1=["GraphKeys"])
 class GraphKeys(object):
   """Standard names to use for graph collections.
 
@@ -5835,7 +5881,7 @@ def dismantle_graph(graph):
   graph.__dict__ = {}
 
 
-@tf_export("add_to_collection")
+@tf_export(v1=["add_to_collection"])
 def add_to_collection(name, value):
   """Wrapper for `Graph.add_to_collection()` using the default graph.
 
@@ -5854,7 +5900,8 @@ def add_to_collection(name, value):
   """
   get_default_graph().add_to_collection(name, value)
 
-@tf_export("add_to_collections")
+
+@tf_export(v1=["add_to_collections"])
 def add_to_collections(names, value):
   """Wrapper for `Graph.add_to_collections()` using the default graph.
 
@@ -5874,7 +5921,7 @@ def add_to_collections(names, value):
   get_default_graph().add_to_collections(names, value)
 
 
-@tf_export("get_collection_ref")
+@tf_export(v1=["get_collection_ref"])
 def get_collection_ref(key):
   """Wrapper for `Graph.get_collection_ref()` using the default graph.
 
@@ -5898,7 +5945,7 @@ def get_collection_ref(key):
   return get_default_graph().get_collection_ref(key)
 
 
-@tf_export("get_collection")
+@tf_export(v1=["get_collection"])
 def get_collection(key, scope=None):
   """Wrapper for `Graph.get_collection()` using the default graph.
 
@@ -5978,6 +6025,13 @@ class name_scope(object):  # pylint: disable=invalid-name
     self._values = values
     self._ctx = context.context()
     self._in_eager_mode = self._ctx.executing_eagerly()
+    self._has_symbolic_input_in_eager = False
+    if self._values and self._in_eager_mode:
+      # The presence of a graph tensor in `self._values` overrides the context.
+      for value in self._values:
+        if hasattr(value, "graph"):
+          self._has_symbolic_input_in_eager = True
+          self._name_scope = value.graph.name_scope(self._name)
 
   def __enter__(self):
     """Start the scope block.
@@ -5989,6 +6043,9 @@ class name_scope(object):  # pylint: disable=invalid-name
       ValueError: if neither `name` nor `default_name` is provided
         but `values` are.
     """
+    if self._has_symbolic_input_in_eager:
+      return self._name_scope.__enter__()
+
     if self._in_eager_mode:
       self._old_name = self._ctx.scope_name
       if not self._name:
@@ -6031,7 +6088,9 @@ class name_scope(object):  # pylint: disable=invalid-name
         raise
 
   def __exit__(self, type_arg, value_arg, traceback_arg):
-    if self._in_eager_mode:
+    if self._has_symbolic_input_in_eager:
+      self._name_scope.__exit__(type_arg, value_arg, traceback_arg)
+    elif self._in_eager_mode:
       self._ctx.scope_name = self._old_name
     else:
       self._name_scope.__exit__(type_arg, value_arg, traceback_arg)
@@ -6096,7 +6155,7 @@ def prepend_name_scope(name, import_scope):
 
 # pylint: disable=g-doc-return-or-yield
 # pylint: disable=not-context-manager
-@tf_export("op_scope")
+@tf_export(v1=["op_scope"])
 @tf_contextlib.contextmanager
 def op_scope(values, name, default_name=None):
   """DEPRECATED. Same as name_scope above, just different argument order."""
@@ -6168,4 +6227,31 @@ def _operation_conversion_error(op, dtype=None, name=None, as_ref=False):
                                                                name, as_ref))
 
 
+def _op_to_colocate_with(v):
+  """Operation object corresponding to v to use for colocation constraints."""
+  if v is None:
+    return None
+  if isinstance(v, Operation):
+    return v
+  # We always want to colocate with the reference op.
+  # When 'v' is a ResourceVariable, the reference op is the handle creating op.
+  #
+  # What this should be is:
+  # if isinstance(v, ResourceVariable):
+  #   return v.handle.op
+  # However, that would require a circular import dependency.
+  # As of October 2018, there were attempts underway to remove
+  # colocation constraints altogether. Assuming that will
+  # happen soon, perhaps this hack to work around the circular
+  # import dependency is acceptable.
+  if hasattr(v, "handle") and hasattr(v.handle, "op") and isinstance(
+      v.handle.op, Operation):
+    return v.handle.op
+  return internal_convert_to_tensor_or_indexed_slices(v, as_ref=True).op
+
+
+def _is_keras_symbolic_tensor(x):
+  return hasattr(x, "graph") and getattr(x.graph, "name", None) == "keras_graph"
+
+
 register_tensor_conversion_function(Operation, _operation_conversion_error)
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index c3a3437743c9f6dad3ab1c17e367fb57ce85bd59..0fcbcd6ee4dd1f103c599dc4db26432b61879e83 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -57,11 +57,13 @@ ops._set_call_cpp_shape_fn(common_shapes.call_cpp_shape_fn)
 
 class ResourceTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBuildGraph(self):
     with self.cached_session():
       pt = test_ops.stub_resource_handle_op(container="a", shared_name="b")
       test_ops.resource_create_op(pt).run()
 
+  @test_util.run_deprecated_v1
   def testInitialize(self):
     with self.cached_session():
       handle = test_ops.stub_resource_handle_op(container="a", shared_name="b")
@@ -106,6 +108,7 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
       c = a + b
       self.assertEqual([2, 3], c.shape)
 
+  @test_util.run_deprecated_v1
   def testUnknownDim(self):
     with self.cached_session():
       a = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None, 3])
@@ -113,6 +116,7 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
       c = a + b
       self.assertEqual([2, None, 3], c.shape.as_list())
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     with self.cached_session():
       a = array_ops.placeholder(dtype=dtypes.float32, shape=None)
@@ -120,6 +124,7 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
       c = a + b
       self.assertEqual(tensor_shape.unknown_shape(), c.shape)
 
+  @test_util.run_deprecated_v1
   def testScalarShape(self):
     with self.cached_session():
       a = array_ops.placeholder(dtype=dtypes.float32, shape=[])
@@ -127,6 +132,7 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
       c = a + b
       self.assertEqual(tensor_shape.scalar(), c.shape)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionError(self):
     with self.cached_session():
       a = array_ops.ones([1, 2, 3])
@@ -140,15 +146,16 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
 
 class IndexedSlicesTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes
   def testToTensor(self):
-    with self.cached_session():
-      values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
-      indices = constant_op.constant([0, 2])
-      dense_shape = constant_op.constant([3, 2])
-      x = ops.IndexedSlices(values, indices, dense_shape)
-      tensor = ops.convert_to_tensor(x, name="tensor")
-      self.assertAllEqual(tensor.eval(), [[2, 3], [0, 0], [5, 7]])
-
+    values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
+    indices = constant_op.constant([0, 2])
+    dense_shape = constant_op.constant([3, 2])
+    x = ops.IndexedSlices(values, indices, dense_shape)
+    tensor = ops.convert_to_tensor(x, name="tensor")
+    self.assertAllEqual(self.evaluate(tensor), [[2, 3], [0, 0], [5, 7]])
+
+  @test_util.run_deprecated_v1
   def testNegation(self):
     with self.cached_session():
       values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
@@ -157,6 +164,7 @@ class IndexedSlicesTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(x.values.eval(), [[-2, -3], [-5, -7]])
       self.assertAllEqual(x.indices.eval(), [0, 2])
 
+  @test_util.run_deprecated_v1
   def testScalarMul(self):
     with self.cached_session():
       values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
@@ -190,6 +198,7 @@ def _apply_op(g, *args, **kwargs):
 
 class OperationTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testNoInputs(self):
     op = test_ops.float_output_string_output(name="myop").a.op
     self.assertEqual(2, len(op.values()))
@@ -212,6 +221,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertProtoEquals("op:'FloatOutputStringOutput' name:'myop'",
                            op.node_def)
 
+  @test_util.run_deprecated_v1
   def testNoOutputs(self):
     op1 = test_ops.float_output(name="myop1").op
     float_t, = op1.values()
@@ -227,6 +237,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertProtoEquals("op:'FloatInput' name:'myop2' input:'myop1'",
                            op2.node_def)
 
+  @test_util.run_deprecated_v1
   def testInputsAndOutputs(self):
     op1 = test_ops.float_output(name="myop1").op
     self.assertEqual(1, len(op1.values()))
@@ -254,6 +265,12 @@ class OperationTest(test_util.TensorFlowTestCase):
     input:'myop1' input:'myop2:1' input:'myop2:1'
     """, op3.node_def)
 
+  def testDeviceFromNodeDef(self):
+    op = ops.Operation(
+        ops._NodeDef("None", "myop", device="/job:goo/device:GPU:0"),
+        ops.Graph(), [], [])
+    self.assertEqual("/job:goo/device:GPU:0", op.device)
+
   def testDeviceObject(self):
     op = ops.Operation(ops._NodeDef("None", "myop"), ops.Graph(), [], [])
     op._set_device("/job:goo/device:GPU:0")
@@ -302,16 +319,17 @@ class OperationTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       ops.Operation(ops._NodeDef("op", "invalid:0"), g)
 
+  @test_util.run_deprecated_v1
   def testNoShapeFunction(self):
     op = test_ops.a()
     self.assertEqual(tensor_shape.unknown_shape(), op.get_shape())
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedArray(self):
-    with self.cached_session():
-      values = [[2], [3], [5], [7]]
-      tensor = ops.convert_to_tensor(values)
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, tensor.eval())
+    values = [[2], [3], [5], [7]]
+    tensor = ops.convert_to_tensor(values)
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(tensor))
 
   def testShapeTuple(self):
     with self.cached_session():
@@ -327,57 +345,63 @@ class OperationTest(test_util.TensorFlowTestCase):
       converted = ops.convert_to_tensor(1)
       self.assertTrue(isinstance(converted, ops.EagerTensor))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedTuple(self):
-    with self.cached_session():
-      values = ((2,), (3,), (5,), (7,))
-      tensor = ops.convert_to_tensor(values)
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, ops.convert_to_tensor(values).eval())
+    values = ((2,), (3,), (5,), (7,))
+    tensor = ops.convert_to_tensor(values)
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(ops.convert_to_tensor(values)))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedTensors(self):
-    with self.cached_session():
-      values = ((2,), (3,), (5,), (7,))
-      tensor = ops.convert_to_tensor(
-          [constant_op.constant(row) for row in values])
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, tensor.eval())
-      tensor = ops.convert_to_tensor(
-          [[constant_op.constant(v) for v in row] for row in values])
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, tensor.eval())
+    values = ((2,), (3,), (5,), (7,))
+    tensor = ops.convert_to_tensor(
+        [constant_op.constant(row) for row in values])
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(tensor))
+    tensor = ops.convert_to_tensor(
+        [[constant_op.constant(v) for v in row] for row in values])
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(tensor))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedMix(self):
-    with self.cached_session():
-      values = ([2], (3,), [constant_op.constant(5)], constant_op.constant([7]))
-      tensor = ops.convert_to_tensor(values)
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(((2,), (3,), (5,), (7,)), tensor.eval())
+    values = ([2], (3,), [constant_op.constant(5)], constant_op.constant([7]))
+    tensor = ops.convert_to_tensor(values)
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(((2,), (3,), (5,), (7,)), self.evaluate(tensor))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorPreferred(self):
-    with self.cached_session():
-      values = [2, 3, 5, 7]
-      tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.float32)
-      self.assertEqual(dtypes.float32, tensor.dtype)
+    values = [2, 3, 5, 7]
+    tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.float32)
+    self.assertEqual(dtypes.float32, tensor.dtype)
 
-    with self.cached_session():
-      # Convert empty tensor to anything.
-      values = []
-      tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
-      self.assertEqual(dtypes.int64, tensor.dtype)
+    # Convert empty tensor to anything.
+    values = []
+    tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
+    self.assertEqual(dtypes.int64, tensor.dtype)
 
-    with self.cached_session():
-      # The preferred dtype is a type error and will convert to
-      # float32 instead.
-      values = [1.23]
-      tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
-      self.assertEqual(dtypes.float32, tensor.dtype)
+    # The preferred dtype is a type error and will convert to
+    # float32 instead.
+    values = [1.23]
+    tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
+    self.assertEqual(dtypes.float32, tensor.dtype)
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToInvalidTensorType(self):
     with self.assertRaises(TypeError):
       # Forcing an invalid dtype should fail with a type error.
       values = [1.23]
-      _ = ops.convert_to_tensor(values, dtype=dtypes.int64)
+      ops.convert_to_tensor(values, dtype=dtypes.int64)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorFromInvalidTensor(self):
+    tensor = constant_op.constant(42.0, dtype=dtypes.float32)
+    with self.assertRaises(ValueError):
+      ops.convert_to_tensor(tensor, dtype=dtypes.int32)
+
+  @test_util.run_deprecated_v1
   def testNoConvert(self):
     # Operation cannot be converted to Tensor.
     op = control_flow_ops.no_op()
@@ -395,6 +419,7 @@ class OperationTest(test_util.TensorFlowTestCase):
         ops._NodeDef("None", "op1"), ops.Graph(), [], [dtypes.float32])
     self.assertEqual("<tf.Operation 'op1' type=None>", repr(op))
 
+  @test_util.run_deprecated_v1
   def testGetAttr(self):
     op = test_ops.default_attrs()
     self.assertEqual(op.get_attr("string_val"), b"abc")
@@ -440,6 +465,7 @@ class OperationTest(test_util.TensorFlowTestCase):
 
   # TODO(b/65162920): remove this test when users who are directly mutating the
   # node_def have been updated to proper usage.
+  @test_util.run_deprecated_v1
   def testSetAttr(self):
     op = test_ops.int_attr().op
     op._set_attr("foo", attr_value_pb2.AttrValue(i=2))
@@ -460,6 +486,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x, y])
     self.assertEqual(x._control_outputs, [z])
 
+  @test_util.run_deprecated_v1
   def testRemoveAllControlInputs(self):
     a = constant_op.constant(1)
     with ops.control_dependencies([a]):
@@ -484,6 +511,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(f.op.control_inputs, [])
     self.assertEqual(list(f.op.inputs), [d, e])
 
+  @test_util.run_deprecated_v1
   def testControlInputCycle(self):
     graph = ops.Graph()
     with graph.as_default():
@@ -497,7 +525,7 @@ class OperationTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           "Graph is invalid, contains a cycle with 2 nodes"):
-        sess.run(x)
+        self.evaluate(x)
 
   def testUpdateInput(self):
     g = ops.Graph()
@@ -511,21 +539,21 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEquals(x.consumers(), [])
     self.assertEquals(y.consumers(), [z.op, z.op])
     with session.Session(graph=g) as sess:
-      self.assertEquals(sess.run(z), 4)
+      self.assertEquals(self.evaluate(z), 4)
 
     z.op._update_input(0, x)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
     self.assertEquals(x.consumers(), [z.op])
     self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
-      self.assertEquals(sess.run(z), 3)
+      self.assertEquals(self.evaluate(z), 3)
 
     z.op._update_input(1, y)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
     self.assertEquals(x.consumers(), [z.op])
     self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
-      self.assertEquals(sess.run(z), 3)
+      self.assertEquals(self.evaluate(z), 3)
 
   def testUpdateInputGraphError(self):
     g_0 = ops.Graph()
@@ -551,7 +579,7 @@ class OperationTest(test_util.TensorFlowTestCase):
           errors.InvalidArgumentError,
           "Input 0 of node add was passed string from Const_1:0 incompatible "
           "with expected int32"):
-        sess.run(z)
+        self.evaluate(z)
 
   def testUpdateInputShapeError(self):
     g = ops.Graph()
@@ -576,6 +604,33 @@ class OperationTest(test_util.TensorFlowTestCase):
     ):
       x.op._update_input(1, x)  # pylint: disable=protected-access
 
+  @test_util.enable_control_flow_v2
+  @test_util.run_v1_only("b/120545219")
+  def testAddWhileInput(self):
+    @eager_function.defun
+    def test():
+      output = control_flow_ops.while_loop(lambda x: x < 3, lambda x: x + 1,
+                                           [1])
+      while_op = output.op.inputs[0].op
+      self.assertEqual(while_op.type, "While")
+      orig_num_inputs = len(while_op.inputs)
+
+      new_input1 = constant_op.constant(1.0)
+      new_input2 = constant_op.constant(True)
+
+      while_op._set_type_list_attr("T",
+                                   [t.dtype for t in while_op.inputs] +
+                                   [new_input1.dtype, new_input2.dtype])
+
+      while_op._add_while_inputs([new_input1, new_input2])
+      # Can't add an edge beyond what's specified by "T"
+      with self.assertRaises(errors.OutOfRangeError):
+        while_op._add_while_inputs([new_input2])
+      self.assertEqual(len(while_op.inputs), orig_num_inputs + 2)  # pylint: disable=g-deprecated-assert
+
+    test()
+
+  @test_util.run_deprecated_v1
   def testOpDef(self):
     x = constant_op.constant(0)
     y = constant_op.constant(1)
@@ -675,6 +730,7 @@ class CreateOpTest(test_util.TensorFlowTestCase):
 # the low-level behavior.
 class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     g = ops.Graph()
     with g.as_default():
@@ -695,7 +751,6 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(g.get_operation_by_name("myop"), op)
     self.assertEqual(g.get_tensor_by_name("myop:0"), op.outputs[0])
 
-  @test_util.enable_c_shapes
   def testShape(self):
     g = ops.Graph()
     with g.as_default():
@@ -726,6 +781,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(op3.name, "myop_2")
     self.assertEqual(op4.name, "myop_1_1")
 
+  @test_util.run_v1_only("b/120545219")
   def testCond(self):
     g = ops.Graph()
     with g.as_default():
@@ -755,6 +811,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
                      "cond/cond_text")
     # pylint: enable=protected-access
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileLoop(self):
     g = ops.Graph()
     with g.as_default():
@@ -784,6 +841,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
                      "myloop/while_context")
     # pylint: enable=protected-access
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileLoopWithInternalControlDep(self):
     g = ops.Graph()
     with g.as_default():
@@ -807,6 +865,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     # Internal control dep is preserved
     self.assertEqual(op.control_inputs, [c])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileLoopWithExternalControlDep(self):
     g = ops.Graph()
     with g.as_default():
@@ -940,6 +999,7 @@ class NameStackTest(test_util.TensorFlowTestCase):
     self.assertEqual("bar_2", g.unique_name("bar", mark_as_used=False))
     self.assertEqual("bar_2", g.unique_name("bar"))
 
+  @test_util.run_deprecated_v1
   def testNameAndVariableScope(self):
     with self.cached_session() as sess:
       with sess.graph.name_scope("l0"):
@@ -1070,6 +1130,13 @@ class DeviceTest(test_util.TensorFlowTestCase):
       node { name: "FloatOutput" op: "FloatOutput" }
     """, gd)
 
+  def testEagerBackingDevice(self):
+    with context.eager_mode():
+      with ops.device("/device:CPU:0"):
+        t = constant_op.constant(1.0)
+        self.assertRegexpMatches(t.device, "/device:CPU:0")
+        self.assertRegexpMatches(t.backing_device, "/device:CPU:0")
+
   def testDevicePartialString(self):
     g = ops.Graph()
     with g.device("/job:worker/replica:2"):
@@ -1634,8 +1701,8 @@ class CollectionTest(test_util.TensorFlowTestCase):
 
         self.assertEqual(ops.get_collection("int"), [1])
         three = inner_defun()
-        self.assertEqual(ops.get_collection("int"), [1, 2])
-        self.assertEqual(ops.get_collection("foo"), ["bar"])
+        self.assertEqual(ops.get_collection("int"), [1])
+        self.assertEqual(ops.get_collection("foo"), [])
         return three
 
       three = defun()
@@ -1659,6 +1726,7 @@ def _CopyOverrideGrad(op, x_grad):  # pylint: disable=invalid-name
 
 class RegistrationTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testRegisterGradients(self):
     x = test_ops.float_output()
     y = test_ops.copy_op(x)
@@ -1698,6 +1766,7 @@ class ComparisonTest(test_util.TensorFlowTestCase):
 
 class ControlDependenciesTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     g = ops.Graph()
     with g.as_default():
@@ -1941,6 +2010,7 @@ class OpScopeTest(test_util.TensorFlowTestCase):
       with ops.name_scope(None, "default2") as scope2:
         self.assertEqual(scope2, "default/default2/")
 
+  @test_util.run_deprecated_v1
   def testNoScopeName(self):
     g0 = ops.Graph()
     values = [
@@ -1954,6 +2024,7 @@ class OpScopeTest(test_util.TensorFlowTestCase):
       with ops.name_scope(None, None, values):
         pass
 
+  @test_util.run_deprecated_v1
   def testEmptyScopeName(self):
     g0 = ops.Graph()
     a = g0.create_op("A", [], [dtypes.float32])
@@ -1965,6 +2036,7 @@ class OpScopeTest(test_util.TensorFlowTestCase):
       self.assertEqual("", scope)
       self.assertEqual(g0, ops.get_default_graph())
 
+  @test_util.run_deprecated_v1
   def testDefaultScopeName(self):
     g0 = ops.Graph()
     a = g0.create_op("A", [], [dtypes.float32])
@@ -1989,12 +2061,14 @@ class OpScopeTest(test_util.TensorFlowTestCase):
       with ops.name_scope(scope_name, values=graph_elements + [a]):
         pass
 
+  @test_util.run_deprecated_v1
   def testTensor(self):
     g0 = ops.Graph()
     a = g0.create_op("A", [], [dtypes.float32])
     b = g0.create_op("B", [], [dtypes.float32])
     self._testGraphElements([a, b])
 
+  @test_util.run_deprecated_v1
   def testSparseTensor(self):
     g0 = ops.Graph()
     a = g0.create_op("A", [], [dtypes.float32])
@@ -2005,6 +2079,7 @@ class OpScopeTest(test_util.TensorFlowTestCase):
         _apply_op(g0, "Int64Output", [], [dtypes.int64]))
     self._testGraphElements([a, sparse, b])
 
+  @test_util.run_deprecated_v1
   def testVariable(self):
     g0 = ops.Graph()
     with g0.as_default():
@@ -2209,6 +2284,7 @@ class InitScopeTest(test_util.TensorFlowTestCase):
       self.assertEqual(4, int(compiled_outer(inner=compiled_inner)))
       self.assertEqual(7, int(compiled_outer(inner=compiled_inner)))
 
+  @test_util.run_v1_only("b/120545219")
   def testFallsBackToGlobalGraphWhenAllGraphsAreBuildingFunctions(self):
     with context.graph_mode():
       ops.reset_default_graph()
@@ -2286,6 +2362,19 @@ class InitScopeTest(test_util.TensorFlowTestCase):
       foo_compiled()
       self.assertEqual(ops.get_name_scope(), "")
 
+  def testExecutingEagerlyOutsideFunctions(self):
+
+    @eager_function.defun
+    def f():
+      return ops.executing_eagerly_outside_functions()
+
+    with context.eager_mode():
+      self.assertTrue(ops.executing_eagerly_outside_functions())
+      self.assertTrue(f())
+      g = ops.Graph()
+      with g.as_default():
+        self.assertFalse(ops.executing_eagerly_outside_functions())
+
 
 class GraphTest(test_util.TensorFlowTestCase):
 
@@ -2332,6 +2421,7 @@ class GraphTest(test_util.TensorFlowTestCase):
     g.prevent_feeding(a)
     self.assertFalse(g.is_feedable(a))
 
+  @test_util.run_deprecated_v1
   def testPreventFetching(self):
     g = ops.Graph()
     a = constant_op.constant(2.0)
@@ -2372,7 +2462,7 @@ class GraphTest(test_util.TensorFlowTestCase):
       c = math_ops.add(a, b)
     # Create a session we can delete
     with session.Session(graph=g) as sess:
-      sess.run(c)
+      self.evaluate(c)
     # Delete all references and trigger gc
     del g
     del a
@@ -2388,7 +2478,7 @@ class GraphTest(test_util.TensorFlowTestCase):
         math_ops.add([1, 2], [1, 2, 3])
       a = constant_op.constant(1)
       with session.Session() as sess:
-        sess.run(a)
+        self.evaluate(a)
 
   def testRunnableAfterInvalidShapeWithKernelLabelMap(self):
     g = ops.Graph()
@@ -2398,7 +2488,7 @@ class GraphTest(test_util.TensorFlowTestCase):
           test_ops.kernel_label_required(1)
       a = constant_op.constant(1)
       with session.Session() as sess:
-        sess.run(a)
+        self.evaluate(a)
 
 
 class AttrScopeTest(test_util.TensorFlowTestCase):
@@ -2415,10 +2505,12 @@ class AttrScopeTest(test_util.TensorFlowTestCase):
       b = None
     return (a, b)
 
+  @test_util.run_deprecated_v1
   def testNoLabel(self):
     with self.cached_session():
       self.assertAllEqual((None, None), self._get_test_attrs())
 
+  @test_util.run_deprecated_v1
   def testLabelMap(self):
     with self.cached_session() as sess:
       a1 = self._get_test_attrs()
@@ -2453,11 +2545,13 @@ ops.RegisterShape("KernelLabel")(common_shapes.scalar_shape)
 
 class KernelLabelTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testNoLabel(self):
     with self.cached_session():
       self.assertAllEqual(b"My label is: default",
                           test_ops.kernel_label().eval())
 
+  @test_util.run_deprecated_v1
   def testLabelMap(self):
     with self.cached_session() as sess:
       default_1 = test_ops.kernel_label()
@@ -2472,12 +2566,14 @@ class KernelLabelTest(test_util.TensorFlowTestCase):
       # pylint: enable=protected-access
       default_3 = test_ops.kernel_label()
 
-      self.assertAllEqual(b"My label is: default", default_1.eval())
-      self.assertAllEqual(b"My label is: default", default_2.eval())
-      self.assertAllEqual(b"My label is: default", default_3.eval())
-      self.assertAllEqual(b"My label is: overload_1", overload_1_1.eval())
-      self.assertAllEqual(b"My label is: overload_1", overload_1_2.eval())
-      self.assertAllEqual(b"My label is: overload_2", overload_2.eval())
+      self.assertAllEqual(b"My label is: default", self.evaluate(default_1))
+      self.assertAllEqual(b"My label is: default", self.evaluate(default_2))
+      self.assertAllEqual(b"My label is: default", self.evaluate(default_3))
+      self.assertAllEqual(b"My label is: overload_1",
+                          self.evaluate(overload_1_1))
+      self.assertAllEqual(b"My label is: overload_1",
+                          self.evaluate(overload_1_2))
+      self.assertAllEqual(b"My label is: overload_2", self.evaluate(overload_2))
 
 
 class AsGraphDefTest(test_util.TensorFlowTestCase):
@@ -2572,6 +2668,7 @@ class StatisticsTest(test_util.TensorFlowTestCase):
 
 class DeviceStackTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicDeviceAssignmentMetadata(self):
 
     def device_func(unused_op):
@@ -2603,6 +2700,7 @@ class DeviceStackTest(test_util.TensorFlowTestCase):
     expected_regex = r"device_func<.*ops_test.py, [0-9]+"
     self.assertRegexpMatches(func_description, expected_regex)
 
+  @test_util.run_deprecated_v1
   def testDeviceAssignmentMetadataForGraphDeviceAndTfDeviceFunctions(self):
 
     with ops.device("/cpu"):
@@ -2622,6 +2720,7 @@ class DeviceStackTest(test_util.TensorFlowTestCase):
 
 class ColocationGroupTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     a = constant_op.constant([2.0], name="a")
     with ops.colocate_with(a.op):
@@ -2632,6 +2731,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       c.op.get_attr("_class")
 
+  @test_util.run_deprecated_v1
   def testBasicColocationMetadata(self):
     const_two = constant_op.constant([2.0], name="two")
     with ops.colocate_with(const_two.op):
@@ -2644,6 +2744,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     # colocation statement.
     self.assertEqual("ops_test.py", os.path.basename(metadata.filename))
 
+  @test_util.run_deprecated_v1
   def testColocationDeviceInteraction(self):
     with ops.device("/cpu:0"):
       with ops.device("/device:GPU:0"):
@@ -2656,6 +2757,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
     self.assertEqual(a.op.device, b.op.device)
 
+  @test_util.run_deprecated_v1
   def testColocationCanonicalization(self):
     with ops.device("/device:GPU:0"):
       _ = constant_op.constant(2.0)
@@ -2671,6 +2773,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     # inherits B's device name, after canonicalizing the names.
     self.assertEqual(b.op.device, c.op.device)
 
+  @test_util.run_deprecated_v1
   def testLocationOverrides(self):
     with ops.device("/cpu:0"):
       with ops.device("/device:GPU:0"):
@@ -2692,6 +2795,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual("/device:GPU:0", c.op.device)
     self.assertEqual("/device:CPU:0", d.op.device)
 
+  @test_util.run_deprecated_v1
   def testNestedColocateWith(self):
     a = constant_op.constant([2.0], name="a")
     with ops.colocate_with(a.op):
@@ -2701,6 +2805,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
     self.assertEqual([b"loc:@a"], c.op.colocation_groups())
 
+  @test_util.run_deprecated_v1
   def testMultiColocationGroups(self):
     a = constant_op.constant([2.0], name="a")
     b = constant_op.constant(3.0, name="b")
@@ -2709,6 +2814,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
         c = constant_op.constant(4.0)
     self.assertEqual(set([b"loc:@a", b"loc:@b"]), set(c.op.colocation_groups()))
 
+  @test_util.run_deprecated_v1
   def testColocationIgnoreStack(self):
     a = constant_op.constant([2.0], name="a")
     b = constant_op.constant(3.0, name="b")
@@ -2717,6 +2823,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
         c = constant_op.constant(4.0)
     self.assertEqual(set([b"loc:@b"]), set(c.op.colocation_groups()))
 
+  @test_util.run_deprecated_v1
   def testColocateWithReset(self):
     a = constant_op.constant([2.0], name="a")
     with ops.colocate_with(a.op):
@@ -2726,6 +2833,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
     self.assertEqual([b"loc:@c"], c.op.colocation_groups())
 
+  @test_util.run_deprecated_v1
   def testColocateWithInitialNoneThenNested(self):
     a = constant_op.constant([2.0], name="a")
     with ops.colocate_with(a.op):
@@ -2736,47 +2844,13 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@b"], b.op.colocation_groups())
     self.assertEqual([b"loc:@b"], c.op.colocation_groups())
 
+  @test_util.run_deprecated_v1
   def testColocateVariables(self):
     a = variables.Variable([2.0], name="a")
     with ops.colocate_with(a.op):
       b = variables.Variable([3.0], name="b")
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
 
-  def testInconsistentDeviceWithinColocate(self):
-    with ops.device("/device:GPU:0"):
-      a = constant_op.constant([2.0], name="a")
-      with ops.colocate_with(a.op):
-        # This is allowed due to legacy but clearly wrong, since we
-        # should really be colocating with 'a'.  We allow devices to
-        # override colocate_with, but we log warnings to suggest that
-        # this is probably unintentional or misguided.
-        with ops.device("/cpu:0"):
-          b = constant_op.constant([3.0], name="b")
-
-    self.assertEqual("/device:CPU:0", b.device)
-
-  def testMakeColocationConflictMessage(self):
-    """Test that provides an example of a complicated error message."""
-    # We could test the message with any ops, but this test will be more
-    # instructive with a real colocation conflict.
-    with ops.device("/device:GPU:0"):
-      a = constant_op.constant([2.0], name="a")
-      with ops.colocate_with(a.op):
-        with ops.device("/cpu:0"):
-          b = constant_op.constant([3.0], name="b")
-    # The definition-location of the nodes will be wrong because of running
-    # from within a TF unittest.  The rest of the info should be correct.
-    message = ops.get_default_graph()._make_colocation_conflict_message(a.op,
-                                                                        b.op)
-    self.assertRegexpMatches(message,
-                             r"Tried to colocate op 'a' \(defined at.*\)")
-    self.assertRegexpMatches(message, "No node-device.*'a'")
-    self.assertRegexpMatches(message, "Device assignments active.*'a'")
-    self.assertRegexpMatches(message, "GPU:0")
-    self.assertRegexpMatches(message, "Node-device colocations active.*'b'")
-    self.assertRegexpMatches(message, "Device assignments active.*'b'")
-    self.assertRegexpMatches(message, "cpu:0")
-
 
 class DeprecatedTest(test_util.TensorFlowTestCase):
 
@@ -2899,6 +2973,7 @@ class NameScopeTest(test_util.TensorFlowTestCase):
 
 class TracebackTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testTracebackWithStartLines(self):
     with self.cached_session() as sess:
       a = constant_op.constant(2.0)
@@ -2920,6 +2995,7 @@ class TracebackTest(test_util.TensorFlowTestCase):
 
 class EnableEagerExecutionTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBadArgumentsToEnableEagerExecution(self):
     with self.assertRaisesRegexp(TypeError, "config must be a tf.ConfigProto"):
       ops.enable_eager_execution(context.DEVICE_PLACEMENT_SILENT)
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 2022fbcbaad8697c147ae63fbea295270046f7f2..d460168631c3032bb91894c9997b2de29bf026e6 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -142,6 +142,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
   void AddEagerAttrs(const string& indentation);
   void AddEagerExecute(const string& indentation,
                        const string& num_outputs_expr);
+  void AddDispatch(const string& prefix);
 
   void AddAttrForArg(const string& attr, int arg_index) {
     gtl::InsertIfNotPresent(&inferred_attrs_, attr,
@@ -355,15 +356,17 @@ string GenEagerPythonOp::Code() {
 }
 
 void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
-  // Handle graph-mode case
-  strings::StrAppend(&result_,
-                     "  _ctx = _context._context\n"
-                     "  if _ctx is None or not _ctx._eager_context.is_eager:\n",
-                     function_setup,
-                     "    _, _, _op = _op_def_lib._apply_op_helper(\n");
-  AddBodyNoReturn("        ");
+  strings::StrAppend(&result_, "  # Add nodes to the TensorFlow graph.\n");
+  strings::StrAppend(&result_, function_setup);
+  if (api_def_.visibility() == ApiDef::VISIBLE) {
+    strings::StrAppend(&result_, "  try:\n  ");
+  }
+  strings::StrAppend(&result_, "  _, _, _op = _op_def_lib._apply_op_helper(\n");
+  AddBodyNoReturn(strings::StrCat("        \"", op_def_.name(), "\", "));
+  AddDispatch("  ");
+
   if (num_outs_ > 0) {
-    strings::StrAppend(&result_, "    _result = _op.outputs[:]\n");
+    strings::StrAppend(&result_, "  _result = _op.outputs[:]\n");
     // Special case handling for stateful op with single list output
     // that might be empty.
     if (num_outs_ == 1 && op_def_.is_stateful() &&
@@ -372,10 +375,10 @@ void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
       // TODO(josh11b): Can skip this if the number_attr/type_list_attr has
       // a constraint indicating that this can never be empty.
       strings::StrAppend(&result_,
-                         "    if not _result:\n"
-                         "      return _op\n");
+                         "  if not _result:\n"
+                         "    return _op\n");
     }
-    strings::StrAppend(&result_, "    _inputs_flat = _op.inputs\n");
+    strings::StrAppend(&result_, "  _inputs_flat = _op.inputs\n");
 
     // Compute graph-mode attrs.
     if (op_def_.attr_size() > 0) {
@@ -387,14 +390,13 @@ void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
                            attr_name, "\")");
       }
       strings::StrAppend(&attr_values, ")");
-      strings::StrAppend(&result_,
-                         WordWrap("    _attrs = (", attr_values, kRightMargin),
-                         "\n");
+      strings::StrAppend(
+          &result_, WordWrap("  _attrs = (", attr_values, kRightMargin), "\n");
     } else {
-      strings::StrAppend(&result_, "    _attrs = None\n");
+      strings::StrAppend(&result_, "  _attrs = None\n");
     }
   } else {
-    strings::StrAppend(&result_, "    return _op\n");
+    strings::StrAppend(&result_, "  return _op\n");
   }
 }
 
@@ -632,6 +634,9 @@ void GenEagerPythonOp::AddEagerFunctionTeardown(
 bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
     const string& parameters, const std::vector<string>& output_sizes,
     const string& eager_not_allowed_error) {
+  if (api_def_.visibility() == ApiDef::VISIBLE) {
+    strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n");
+  }
   AddExport();
   AddDefLine(function_name_, parameters);
   AddDocStringDescription();
@@ -643,25 +648,26 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
   AddDocStringOutputs();
   strings::StrAppend(&result_, "  \"\"\"\n");
 
+  strings::StrAppend(&result_,
+                     "  _ctx = _context._context\n"
+                     "  if _ctx is not None and _ctx._eager_context.is_eager:",
+                     "\n");
+  if (eager_not_allowed_error.empty()) {
+    AddEagerFastPathExecute();
+  } else {
+    strings::StrAppend(&result_, "    ", eager_not_allowed_error);
+  }
+
   // Handle graph-mode case
   string function_setup;
-  if (!GetEagerFunctionSetup("    ", &function_setup)) {
+  if (!GetEagerFunctionSetup("  ", &function_setup)) {
     result_ = function_setup;
     return false;
   }
   HandleGraphMode(function_setup);
-  AddEagerFunctionTeardown("    ", output_sizes,
+  AddEagerFunctionTeardown("  ", output_sizes,
                            true /* execute_record_gradient */);
 
-  // Handle eager-mode case
-  strings::StrAppend(&result_, "  else:\n");
-
-  if (eager_not_allowed_error.empty()) {
-    AddEagerFastPathExecute();
-  } else {
-    strings::StrAppend(&result_, "    ", eager_not_allowed_error);
-  }
-
   strings::StrAppend(&result_, "\n\n");
   return true;
 }
@@ -669,13 +675,14 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
 bool GenEagerPythonOp::AddEagerFallbackCode(
     const string& parameters, const std::vector<string>& output_sizes,
     const string& num_outputs_expr, const string& eager_not_allowed_error) {
+  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
+             strings::StrCat(parameters, ", ctx=None"));
+
   if (!eager_not_allowed_error.empty()) {
     strings::StrAppend(&result_, "  ", eager_not_allowed_error);
     return true;
   }
 
-  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
-             strings::StrCat(parameters, ", ctx=None"));
   strings::StrAppend(
       &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
   strings::StrAppend(&result_, "  This is for function ", function_name_,
@@ -750,12 +757,17 @@ void GenEagerPythonOp::AddEagerFastPathExecute() {
   if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
   strings::StrAppend(&fallback_params, "ctx=_ctx");
   strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
+  strings::StrAppend(&result_, "      try:\n");
   strings::StrAppend(
-      &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
+      &result_, "        ", "return ", function_name_, kEagerFallbackSuffix,
       "(\n",
-      WordWrap(strings::StrCat("          "),
+      WordWrap(strings::StrCat("            "),
                strings::StrCat(fallback_params, ")"), kRightMargin),
       "\n");
+  strings::StrAppend(&result_, "      except _core._SymbolicException:\n");
+  strings::StrAppend(&result_,
+                     "        pass  # Add nodes to the TensorFlow graph.\n");
+  AddDispatch("      ");
 
   // Any errors thrown from execute need to be unwrapped from
   // _NotOkStatusException.
@@ -896,6 +908,19 @@ void GenEagerPythonOp::AddEagerExecute(const string& indentation,
                      WordWrap(return_prefix, return_args, kRightMargin), "\n");
 }
 
+void GenEagerPythonOp::AddDispatch(const string& prefix) {
+  if (api_def_.visibility() != ApiDef::VISIBLE) return;
+
+  strings::StrAppend(&result_, prefix, "except (TypeError, ValueError):\n");
+  strings::StrAppend(&result_, prefix, "  result = _dispatch.dispatch(\n");
+  AddBodyNoReturn(strings::StrCat(prefix, "        ", function_name_, ", "));
+  strings::StrAppend(&result_, prefix,
+                     "  if result is not "
+                     "_dispatch.OpDispatcher.NOT_SUPPORTED:\n");
+  strings::StrAppend(&result_, prefix, "    return result\n");
+  strings::StrAppend(&result_, prefix, "  raise\n");
+}
+
 string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs,
                     const std::vector<string>& hidden_ops, bool require_shapes,
                     const string& source_file_name = "") {
@@ -935,6 +960,7 @@ from tensorflow.python.framework import op_def_registry as _op_def_registry
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework import op_def_library as _op_def_library
 from tensorflow.python.util.deprecation import deprecated_endpoints
+from tensorflow.python.util import dispatch as _dispatch
 from tensorflow.python.util.tf_export import tf_export
 
 )");
diff --git a/tensorflow/python/framework/python_op_gen_internal.cc b/tensorflow/python/framework/python_op_gen_internal.cc
index f6aef5bc50b57a25016b966a4525fdf81596fd94..cbdeecfbfb93ad776ff9d3db755503c47970d330 100644
--- a/tensorflow/python/framework/python_op_gen_internal.cc
+++ b/tensorflow/python/framework/python_op_gen_internal.cc
@@ -45,6 +45,9 @@ namespace tensorflow {
 namespace python_op_gen_internal {
 
 const int kRightMargin = 78;
+// Names specified in tf_export decorators are exported to
+// TensorFlow 2.0 by default.
+const int kLatestAPIExportVersion = 2;
 
 bool IsPythonReserved(const string& s) {
   static const std::set<string>* const kPythonReserved = new std::set<string>(
@@ -585,28 +588,42 @@ void GenPythonOp::AddExport() {
   if (api_def_.visibility() != ApiDef::VISIBLE) {
     return;
   }
+  // Whether op should be available in latest export version.
+  bool op_available_in_latest =
+      !api_def_.deprecation_version() ||
+      api_def_.deprecation_version() > kLatestAPIExportVersion;
 
-  // Add @tf_export decorator.
-  strings::StrAppend(&result_, "@tf_export(");
+  string names;
+  string names_v1;
+  string deprecated_endpoints;
 
-  // Add all endpoint names to tf_export.
-  bool first_endpoint = true;
-  std::vector<string> deprecated_endpoints;
   for (const auto& endpoint : api_def_.endpoint()) {
-    if (!first_endpoint) {
-      strings::StrAppend(&result_, ", ");
-    } else {
-      first_endpoint = false;
-    }
     string endpoint_name;
     python_op_gen_internal::GenerateLowerCaseOpName(endpoint.name(),
                                                     &endpoint_name);
-    if (endpoint.deprecated()) {
-      deprecated_endpoints.push_back(endpoint_name);
+    if (endpoint.deprecated() || endpoint.deprecation_version() > 0) {
+      AddDelimiter(&deprecated_endpoints, ", ");
+      strings::StrAppend(&deprecated_endpoints, "'", endpoint_name, "'");
+    }
+    // Add all endpoints to TensorFlow 1.* API.
+    AddDelimiter(&names_v1, ", ");
+    strings::StrAppend(&names_v1, "'", endpoint_name, "'");
+    // Add non-deprecated endpoints to TensorFlow 2.* API.
+    if (op_available_in_latest &&
+        (!endpoint.deprecation_version() ||
+         endpoint.deprecation_version() > kLatestAPIExportVersion)) {
+      AddDelimiter(&names, ", ");
+      strings::StrAppend(&names, "'", endpoint_name, "'");
     }
-    strings::StrAppend(&result_, "'", endpoint_name, "'");
   }
-  strings::StrAppend(&result_, ")\n");
+
+  // tf_export decorator has the following format:
+  // @tf_export(v2_name, v2_name, v1=[v1_name, v1_name])
+  if (names != names_v1) {
+    AddDelimiter(&names, ", ");
+    strings::StrAppend(&names, "v1=[", names_v1, "]");
+  }
+  strings::StrAppend(&result_, "@tf_export(", names, ")\n");
 
   // If all endpoints are deprecated, add @deprecated decorator.
   if (!api_def_.deprecation_message().empty()) {
@@ -615,17 +632,8 @@ void GenPythonOp::AddExport() {
   }
   // Add @deprecated_endpoints decorator.
   if (!deprecated_endpoints.empty()) {
-    strings::StrAppend(&result_, "@deprecated_endpoints(");
-    bool first_endpoint = true;
-    for (auto& endpoint_name : deprecated_endpoints) {
-      if (first_endpoint) {
-        first_endpoint = false;
-      } else {
-        strings::StrAppend(&result_, ", ");
-      }
-      strings::StrAppend(&result_, "'", endpoint_name, "'");
-    }
-    strings::StrAppend(&result_, ")\n");
+    strings::StrAppend(&result_, "@deprecated_endpoints(", deprecated_endpoints,
+                       ")\n");
   }
 }
 
@@ -796,8 +804,8 @@ void GenPythonOp::AddDocStringOutputs() {
 }
 
 void GenPythonOp::AddBody(const string& prefix) {
-  const string apply_prefix =
-      strings::StrCat(prefix, "_result = _op_def_lib.apply_op(");
+  const string apply_prefix = strings::StrCat(
+      prefix, "_result = _op_def_lib.apply_op(\"", op_def_.name(), "\", ");
   AddBodyNoReturn(apply_prefix);
   if (num_outs_ > 1) {
     strings::StrAppend(&result_, prefix, "_result = _", op_def_.name(),
@@ -807,7 +815,7 @@ void GenPythonOp::AddBody(const string& prefix) {
 }
 
 void GenPythonOp::AddBodyNoReturn(const string& apply_prefix) {
-  string args = strings::StrCat("\"", op_def_.name(), "\", ");
+  string args;
   for (size_t i = 0; i < param_names_.size(); ++i) {
     strings::StrAppend(&args, AvoidPythonReserved(param_names_[i].GetName()),
                        "=", param_names_[i].GetRenameTo(), ", ");
diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py
index 6f9f347a99d4a537a3a6dc31198cc8bb92ec9131..6b7f56a92cc02fd9f44a541ed3536b35653031d9 100644
--- a/tensorflow/python/framework/random_seed.py
+++ b/tensorflow/python/framework/random_seed.py
@@ -34,7 +34,7 @@ def _truncate_seed(seed):
   return seed % _MAXINT32  # Truncate to fit into 32-bit integer
 
 
-@tf_export('random.get_seed', 'get_seed')
+@tf_export(v1=['random.get_seed', 'get_seed'])
 @deprecation.deprecated_endpoints('get_seed')
 def get_seed(op_seed):
   """Returns the local seeds an operation should use given an op-specific seed.
@@ -45,7 +45,7 @@ def get_seed(op_seed):
   graph, or for only specific operations.
 
   For details on how the graph-level seed interacts with op seeds, see
-  `tf.set_random_seed`.
+  `tf.random.set_random_seed`.
 
   Args:
     op_seed: integer.
@@ -82,7 +82,7 @@ def get_seed(op_seed):
   return seeds
 
 
-@tf_export('random.set_random_seed', 'set_random_seed')
+@tf_export(v1=['random.set_random_seed', 'set_random_seed'])
 def set_random_seed(seed):
   """Sets the graph-level random seed.
 
@@ -154,7 +154,7 @@ def set_random_seed(seed):
   sessions, set a graph-level seed:
 
   ```python
-  tf.set_random_seed(1234)
+  tf.random.set_random_seed(1234)
   a = tf.random_uniform([1])
   b = tf.random_normal([1])
 
@@ -182,3 +182,103 @@ def set_random_seed(seed):
     context.set_global_seed(seed)
   else:
     ops.get_default_graph().seed = seed
+
+
+@tf_export('random.set_seed', v1=[])
+def set_seed(seed):
+  """Sets the graph-level random seed.
+
+  Operations that rely on a random seed actually derive it from two seeds:
+  the graph-level and operation-level seeds. This sets the graph-level seed.
+
+  Its interactions with operation-level seeds is as follows:
+
+    1. If neither the graph-level nor the operation seed is set:
+      A random seed is used for this op.
+    2. If the graph-level seed is set, but the operation seed is not:
+      The system deterministically picks an operation seed in conjunction
+      with the graph-level seed so that it gets a unique random sequence.
+    3. If the graph-level seed is not set, but the operation seed is set:
+      A default graph-level seed and the specified operation seed are used to
+      determine the random sequence.
+    4. If both the graph-level and the operation seed are set:
+      Both seeds are used in conjunction to determine the random sequence.
+
+  To illustrate the user-visible effects, consider these examples:
+
+  To generate different sequences across sessions, set neither
+  graph-level nor op-level seeds:
+
+  ```python
+  a = tf.random_uniform([1])
+  b = tf.random_normal([1])
+
+  print("Session 1")
+  with tf.Session() as sess1:
+    print(sess1.run(a))  # generates 'A1'
+    print(sess1.run(a))  # generates 'A2'
+    print(sess1.run(b))  # generates 'B1'
+    print(sess1.run(b))  # generates 'B2'
+
+  print("Session 2")
+  with tf.Session() as sess2:
+    print(sess2.run(a))  # generates 'A3'
+    print(sess2.run(a))  # generates 'A4'
+    print(sess2.run(b))  # generates 'B3'
+    print(sess2.run(b))  # generates 'B4'
+  ```
+
+  To generate the same repeatable sequence for an op across sessions, set the
+  seed for the op:
+
+  ```python
+  a = tf.random_uniform([1], seed=1)
+  b = tf.random_normal([1])
+
+  # Repeatedly running this block with the same graph will generate the same
+  # sequence of values for 'a', but different sequences of values for 'b'.
+  print("Session 1")
+  with tf.Session() as sess1:
+    print(sess1.run(a))  # generates 'A1'
+    print(sess1.run(a))  # generates 'A2'
+    print(sess1.run(b))  # generates 'B1'
+    print(sess1.run(b))  # generates 'B2'
+
+  print("Session 2")
+  with tf.Session() as sess2:
+    print(sess2.run(a))  # generates 'A1'
+    print(sess2.run(a))  # generates 'A2'
+    print(sess2.run(b))  # generates 'B3'
+    print(sess2.run(b))  # generates 'B4'
+  ```
+
+  To make the random sequences generated by all ops be repeatable across
+  sessions, set a graph-level seed:
+
+  ```python
+  tf.random.set_seed(1234)
+  a = tf.random_uniform([1])
+  b = tf.random_normal([1])
+
+  # Repeatedly running this block with the same graph will generate the same
+  # sequences of 'a' and 'b'.
+  print("Session 1")
+  with tf.Session() as sess1:
+    print(sess1.run(a))  # generates 'A1'
+    print(sess1.run(a))  # generates 'A2'
+    print(sess1.run(b))  # generates 'B1'
+    print(sess1.run(b))  # generates 'B2'
+
+  print("Session 2")
+  with tf.Session() as sess2:
+    print(sess2.run(a))  # generates 'A1'
+    print(sess2.run(a))  # generates 'A2'
+    print(sess2.run(b))  # generates 'B1'
+    print(sess2.run(b))  # generates 'B2'
+  ```
+
+  Args:
+    seed: integer.
+  """
+  # TODO(go/tf2-random): change doc, update to match design doc
+  set_random_seed(seed)
diff --git a/tensorflow/python/framework/registry.py b/tensorflow/python/framework/registry.py
index 2e45acb499581e02c0661aa7cf63187cc213c5cd..4357c76bd6cc8ccac55b5e123fa0ce7cf3c0d19d 100644
--- a/tensorflow/python/framework/registry.py
+++ b/tensorflow/python/framework/registry.py
@@ -23,10 +23,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import traceback
-
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_stack
 
 
 # Registry mechanism below is based on mapreduce.python.mrpython.Register.
@@ -57,15 +56,17 @@ class Registry(object):
     if name in self._registry:
       (filename, line_number, function_name, _) = (
           self._registry[name][_LOCATION_TAG])
-      raise KeyError("Registering two %s with name '%s' !"
+      raise KeyError("Registering two %s with name '%s'! "
                      "(Previous registration was in %s %s:%d)" %
                      (self._name, name, function_name, filename, line_number))
 
     logging.vlog(1, "Registering %s (%s) in %s.", name, candidate, self._name)
     # stack trace is [this_function, Register(), user_function,...]
     # so the user function is #2.
-    stack = traceback.extract_stack()
-    self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: stack[2]}
+    stack = tf_stack.extract_stack()
+    user_function = stack[2]
+    location_tag = tf_stack.convert_stack([user_function])[0]
+    self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: location_tag}
 
   def list(self):
     """Lists registered items.
diff --git a/tensorflow/python/framework/registry_test.py b/tensorflow/python/framework/registry_test.py
index a821e16f26007632886532bfd868dbf8716eafb6..1a0d3f200d9427363ae36c19b6214ac6c9b75bec 100644
--- a/tensorflow/python/framework/registry_test.py
+++ b/tensorflow/python/framework/registry_test.py
@@ -45,7 +45,9 @@ class RegistryTest(test.TestCase):
   def testDuplicate(self):
     myreg = registry.Registry('testbar')
     myreg.register(bar, 'Bar')
-    with self.assertRaises(KeyError):
+    with self.assertRaisesRegexp(
+        KeyError, r'Registering two testbar with name \'Bar\'! '
+        r'\(Previous registration was in [^ ]+ .*.py:[0-9]+\)'):
       myreg.register(bar, 'Bar')
 
 
diff --git a/tensorflow/python/framework/smart_cond_test.py b/tensorflow/python/framework/smart_cond_test.py
index b8a9672b06da9b24d567a9779fb703ac7178d411..f964c87f0243bd00faf44a10f1468680a2fb272d 100644
--- a/tensorflow/python/framework/smart_cond_test.py
+++ b/tensorflow/python/framework/smart_cond_test.py
@@ -35,6 +35,7 @@ def raise_exception():
 
 class SmartCondTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testTrue(self):
     with ops.Graph().as_default():
       with session.Session():
@@ -44,6 +45,7 @@ class SmartCondTest(test_util.TensorFlowTestCase):
                                   lambda: math_ops.multiply(y, 5))
         self.assertEqual(z.eval(), 32)
 
+  @test_util.run_deprecated_v1
   def testFalse(self):
     with ops.Graph().as_default():
       with session.Session():
@@ -99,6 +101,7 @@ class SmartCondTest(test_util.TensorFlowTestCase):
 
 class SmartCaseTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testTrue(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(True, lambda: constant_op.constant(1)),
@@ -109,9 +112,10 @@ class SmartCaseTest(test_util.TensorFlowTestCase):
                               exclusive=True)
     with session.Session() as sess:
       # No feed_dict necessary
-      self.assertEqual(sess.run(y), 1)
-      self.assertEqual(sess.run(z), 1)
+      self.assertEqual(self.evaluate(y), 1)
+      self.assertEqual(self.evaluate(z), 1)
 
+  @test_util.run_deprecated_v1
   def testFalse(self):
     conditions = [(False, raise_exception)]
     y = smart_cond.smart_case(conditions,
@@ -121,9 +125,10 @@ class SmartCaseTest(test_util.TensorFlowTestCase):
                               default=lambda: constant_op.constant(1),
                               exclusive=True)
     with session.Session() as sess:
-      self.assertEqual(sess.run(y), 1)
-      self.assertEqual(sess.run(z), 1)
+      self.assertEqual(self.evaluate(y), 1)
+      self.assertEqual(self.evaluate(z), 1)
 
+  @test_util.run_deprecated_v1
   def testMix(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     y = constant_op.constant(10)
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 440e3a0968cefb8d3bda200545b442de6ce66c60..5e1a95a26be034bff0a1f5eb996ac6f16c61e282 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -134,10 +134,10 @@ class SparseTensor(_TensorLike):
     dense_shape_shape = dense_shape.get_shape().with_rank(1)
 
     # Assert number of rows in indices match the number of elements in values.
-    indices_shape[0].merge_with(values_shape[0])
+    indices_shape.dims[0].merge_with(values_shape.dims[0])
     # Assert number of columns in indices matches the number of elements in
     # dense_shape.
-    indices_shape[1].merge_with(dense_shape_shape[0])
+    indices_shape.dims[1].merge_with(dense_shape_shape.dims[0])
 
   def get_shape(self):
     """Get the `TensorShape` representing the shape of the dense tensor.
@@ -244,11 +244,11 @@ class SparseTensor(_TensorLike):
 
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
-tf_export("SparseTensorValue")(SparseTensorValue)
+tf_export(v1=["SparseTensorValue"])(SparseTensorValue)
 pywrap_tensorflow.RegisterType("SparseTensorValue", SparseTensorValue)
 
 
-@tf_export("convert_to_tensor_or_sparse_tensor")
+@tf_export(v1=["convert_to_tensor_or_sparse_tensor"])
 def convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None):
   """Converts value to a `SparseTensor` or `Tensor`.
 
diff --git a/tensorflow/python/framework/sparse_tensor_test.py b/tensorflow/python/framework/sparse_tensor_test.py
index 22423c4f58ca510a2e247b9cd783d5596ca65e46..a999c12ca89b0c1746751eb04e9abfe380abf336 100644
--- a/tensorflow/python/framework/sparse_tensor_test.py
+++ b/tensorflow/python/framework/sparse_tensor_test.py
@@ -46,11 +46,11 @@ class SparseTensorTest(test_util.TensorFlowTestCase):
       self.assertEqual(sp.get_shape(), (4, 5))
 
       with self.cached_session() as sess:
-        value = sp.eval()
+        value = self.evaluate(sp)
         self.assertAllEqual(indices, value.indices)
         self.assertAllEqual(values, value.values)
         self.assertAllEqual(shape, value.dense_shape)
-        sess_run_value = sess.run(sp)
+        sess_run_value = self.evaluate(sp)
         self.assertAllEqual(sess_run_value.indices, value.indices)
         self.assertAllEqual(sess_run_value.values, value.values)
         self.assertAllEqual(sess_run_value.dense_shape, value.dense_shape)
@@ -65,6 +65,7 @@ class SparseTensorTest(test_util.TensorFlowTestCase):
         sparse_tensor.is_sparse(
             sparse_tensor.SparseTensorValue([[0]], [0], [1])))
 
+  @test_util.run_deprecated_v1
   def testConsumers(self):
     sp = sparse_tensor.SparseTensor([[0, 0], [1, 2]], [1.0, 3.0], [3, 4])
     w = ops.convert_to_tensor(np.ones([4, 1], np.float32))
@@ -85,8 +86,9 @@ class ConvertToTensorOrSparseTensorTest(test_util.TensorFlowTestCase):
       value = [42, 43]
       from_value = sparse_tensor.convert_to_tensor_or_sparse_tensor(
           value)
-      self.assertAllEqual(value, from_value.eval())
+      self.assertAllEqual(value, self.evaluate(from_value))
 
+  @test_util.run_deprecated_v1
   def test_convert_sparse(self):
     with self.cached_session():
       indices = [[0, 1], [1, 0]]
diff --git a/tensorflow/python/framework/subscribe_test.py b/tensorflow/python/framework/subscribe_test.py
index cab426844d4eed1bfdb5a7978cd8d98eab3cf0cc..a74e96f9d9d6469b66426dd85628f926297afcd0 100644
--- a/tensorflow/python/framework/subscribe_test.py
+++ b/tensorflow/python/framework/subscribe_test.py
@@ -43,6 +43,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertTrue(
         all(subscribe._is_subscribed_identity(x) for x in container))
 
+  @test_util.run_deprecated_v1
   def testSideEffect(self):
     a = constant_op.constant(1)
     b = constant_op.constant(1)
@@ -66,15 +67,16 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertTrue(c.op in d.op.control_inputs)
 
     with self.cached_session() as sess:
-      c_out = sess.run([c])
-      n_out = sess.run([n])
-      d_out = sess.run([d])
+      c_out = self.evaluate([c])
+      n_out = self.evaluate([n])
+      d_out = self.evaluate([d])
 
     self.assertEqual(n_out, [-2])
     self.assertEqual(c_out, [2])
     self.assertEqual(d_out, [42])
     self.assertEqual(shared, [2, 2, 2])
 
+  @test_util.run_deprecated_v1
   def testSupportedTypes(self):
     """Confirm that supported types are correctly detected and handled."""
 
@@ -120,6 +122,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
       subscribe.subscribe(c.name,
                           lambda t: script_ops.py_func(sub, [t], [t.dtype]))
 
+  @test_util.run_deprecated_v1
   def testCaching(self):
     """Confirm caching of control output is recalculated between calls."""
     a = constant_op.constant(1)
@@ -145,13 +148,14 @@ class SubscribeTest(test_util.TensorFlowTestCase):
                             lambda t: script_ops.py_func(sub, [t], [t.dtype]))
 
     with self.cached_session() as sess:
-      c_out = sess.run([c])
-      d_out = sess.run([d])
+      c_out = self.evaluate([c])
+      d_out = self.evaluate([d])
 
     self.assertEqual(c_out, [42])
     self.assertEqual(d_out, [11])
     self.assertEqual(shared, {2: 1, 1: 1})
 
+  @test_util.run_deprecated_v1
   def testIsSubscribedIdentity(self):
     """Confirm subscribed identity ops are correctly detected."""
     a = constant_op.constant(1)
@@ -165,6 +169,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertFalse(subscribe._is_subscribed_identity(idop))
     self.assertTrue(subscribe._is_subscribed_identity(c_sub))
 
+  @test_util.run_deprecated_v1
   def testSubscribeExtend(self):
     """Confirm side effect are correctly added for different input types."""
     a = constant_op.constant(1)
@@ -205,11 +210,12 @@ class SubscribeTest(test_util.TensorFlowTestCase):
 
     # Expect the three side effect graphs to have been evaluated.
     with self.cached_session() as sess:
-      sess.run([c_sub])
+      self.evaluate([c_sub])
     self.assertIn('graph1', shared)
     self.assertIn('graph2', shared)
     self.assertIn('graph3', shared)
 
+  @test_util.run_v1_only('b/120545219')
   def testSubscribeVariable(self):
     """Confirm that variables can be subscribed."""
     v1 = variables.VariableV1(0.0)
@@ -229,25 +235,26 @@ class SubscribeTest(test_util.TensorFlowTestCase):
 
     with self.cached_session() as sess:
       # Initialize the variables first.
-      sess.run([v1.initializer])
-      sess.run([v2.initializer])
+      self.evaluate([v1.initializer])
+      self.evaluate([v2.initializer])
 
       # Expect the side effects to be triggered when evaluating the add op as
       # it will read the value of the variable.
-      sess.run([add])
+      self.evaluate([add])
       self.assertEqual(1, len(shared))
 
       # Expect the side effect not to be triggered when evaluating the assign
       # op as it will not access the 'read' output of the variable.
-      sess.run([assign_v1])
+      self.evaluate([assign_v1])
       self.assertEqual(1, len(shared))
 
-      sess.run([add])
+      self.evaluate([add])
       self.assertEqual(2, len(shared))
 
       # Make sure the values read from the variable match the expected ones.
       self.assertEqual([0.0, 3.0], shared)
 
+  @test_util.run_v1_only('b/120545219')
   def testResourceType(self):
     """Confirm that subscribe correctly handles tensors with 'resource' type."""
     tensor_array = tensor_array_ops.TensorArray(
@@ -273,9 +280,10 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertFalse(subscribe._is_subscribed_identity(tensor_array.handle))
 
     with self.cached_session() as sess:
-      sess.run([reader])
+      self.evaluate([reader])
     self.assertEqual(0, len(shared))
 
+  @test_util.run_deprecated_v1
   def testMultipleOutputs(self):
     """Handle subscriptions to multiple outputs from the same op."""
     sparse_tensor_1 = sparse_tensor.SparseTensor(
@@ -304,11 +312,12 @@ class SubscribeTest(test_util.TensorFlowTestCase):
                         lambda t: script_ops.py_func(sub, [t], [t.dtype]))
 
     with self.cached_session() as sess:
-      sess.run([neg])
+      self.evaluate([neg])
 
     # All three ops have been processed.
     self.assertEqual(3, len(shared))
 
+  @test_util.run_deprecated_v1
   def test_subscribe_tensors_on_different_devices(self):
     """Side effect ops are added with the same device of the subscribed op."""
     c1 = constant_op.constant(10)
@@ -335,6 +344,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertEqual(add.device, add_sub.device)
     self.assertEqual(mul.device, mul_sub.device)
 
+  @test_util.run_v1_only('b/120545219')
   def test_subscribe_tensors_within_control_flow_context(self):
     """Side effect ops are added with the same control flow context."""
     c1 = constant_op.constant(10)
@@ -375,7 +385,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertIsNot(context(subscriptions[0]), context(subscriptions[1]))
 
     with self.cached_session() as sess:
-      sess.run(cond)
+      self.evaluate(cond)
 
     self.assertEqual(3, len(results))
 
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 3c2a736fb98915af3048a593ad1908b8afb879b3..960a3dad7389553955c999e444a9f98c1857f588 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -18,12 +18,158 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python import tf2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("Dimension")
+_TENSORSHAPE_V2_OVERRIDE = None
+
+
+@tf_export(v1=["enable_v2_tensorshape"])
+def enable_v2_tensorshape():
+  """In TensorFlow 2.0, iterating over a TensorShape instance returns values.
+
+  This enables the new behavior.
+
+  Concretely, `tensor_shape[i]` returned a Dimension instance in V1, but
+  it V2 it returns either an integer, or None.
+
+  Examples:
+
+  ```
+  #######################
+  # If you had this in V1:
+  value = tensor_shape[i].value
+
+  # Do this in V2 instead:
+  value = tensor_shape[i]
+
+  #######################
+  # If you had this in V1:
+  for dim in tensor_shape:
+    value = dim.value
+    print(value)
+
+  # Do this in V2 instead:
+  for value in tensor_shape:
+    print(value)
+
+  #######################
+  # If you had this in V1:
+  dim = tensor_shape[i]
+  dim.assert_is_compatible_with(other_shape)  # or using any other shape method
+
+  # Do this in V2 instead:
+  if tensor_shape.rank is None:
+    dim = Dimension(None)
+  else:
+    dim = tensor_shape.dims[i]
+  dim.assert_is_compatible_with(other_shape)  # or using any other shape method
+
+  # The V2 suggestion above is more explicit, which will save you from
+  # the following trap (present in V1):
+  # you might do in-place modifications to `dim` and expect them to be reflected
+  # in `tensor_shape[i]`, but they would not be.
+  ```
+  """
+  global _TENSORSHAPE_V2_OVERRIDE, TensorShape  # pylint: disable=invalid-name
+  _TENSORSHAPE_V2_OVERRIDE = True
+  TensorShape = TensorShapeV2
+
+
+@tf_export(v1=["disable_v2_tensorshape"])
+def disable_v2_tensorshape():
+  """Disables the V2 TensorShape behavior and reverts to V1 behavior.
+
+  See docstring for `enable_v2_tensorshape` for details about the new behavior.
+  """
+  global _TENSORSHAPE_V2_OVERRIDE, TensorShape  # pylint: disable=invalid-name
+  _TENSORSHAPE_V2_OVERRIDE = False
+  TensorShape = TensorShapeV1
+
+
+@tf_export(v1=["dimension_value"])
+def dimension_value(dimension):
+  """Compatibility utility required to allow for both V1 and V2 behavior in TF.
+
+  Until the release of TF 2.0, we need the legacy behavior of `TensorShape` to
+  coexist with the new behavior. This utility is a bridge between the two.
+
+  When accessing the value of a TensorShape dimension,
+  use this utility, like this:
+
+  ```
+  # If you had this in your V1 code:
+  value = tensor_shape[i].value
+
+  # Use `dimension_value` as direct replacement compatible with both V1 & V2:
+  value = dimension_value(tensor_shape[i])
+
+  # This would be the V2 equivalent:
+  value = tensor_shape[i]  # Warning: this will return the dim value in V2!
+  ```
+
+  Arguments:
+    dimension: Either a `Dimension` instance, an integer, or None.
+
+  Returns:
+    A plain value, i.e. an integer or None.
+  """
+  if isinstance(dimension, Dimension):
+    return dimension.value
+  return dimension
+
+
+@tf_export(v1=["dimension_at_index"])
+def dimension_at_index(shape, index):
+  """Compatibility utility required to allow for both V1 and V2 behavior in TF.
+
+  Until the release of TF 2.0, we need the legacy behavior of `TensorShape` to
+  coexist with the new behavior. This utility is a bridge between the two.
+
+  If you want to retrieve the Dimension instance corresponding to a certain
+  index in a TensorShape instance, use this utility, like this:
+
+  ```
+  # If you had this in your V1 code:
+  dim = tensor_shape[i]
+
+  # Use `dimension_at_index` as direct replacement compatible with both V1 & V2:
+  dim = dimension_at_index(tensor_shape, i)
+
+  # Another possibility would be this, but WARNING: it only works if the
+  # tensor_shape instance has a defined rank.
+  dim = tensor_shape.dims[i]  # `dims` may be None if the rank is undefined!
+
+  # In native V2 code, we recommend instead being more explicit:
+  if tensor_shape.rank is None:
+    dim = Dimension(None)
+  else:
+    dim = tensor_shape.dims[i]
+
+  # Being more explicit will save you from the following trap (present in V1):
+  # you might do in-place modifications to `dim` and expect them to be reflected
+  # in `tensor_shape[i]`, but they would not be (as the Dimension object was
+  # instantiated on the fly.
+  ```
+
+  Arguments:
+    shape: A TensorShape instance.
+    index: An integer index.
+
+  Returns:
+    A dimension object.
+  """
+  assert isinstance(shape, TensorShape)
+  if shape.rank is None:
+    return Dimension(None)
+  else:
+    return shape.dims[index]
+
+
+@tf_export(v1=["Dimension"])
 class Dimension(object):
   """Represents the value of one dimension in a TensorShape."""
 
@@ -31,6 +177,8 @@ class Dimension(object):
     """Creates a new Dimension with the given value."""
     if value is None:
       self._value = None
+    elif isinstance(value, Dimension):
+      self._value = value.value
     elif isinstance(value, dtypes.DType):
       raise TypeError("Cannot convert %s to Dimension" % value)
     else:
@@ -125,7 +273,9 @@ class Dimension(object):
     tf.Dimension(n)   .merge_with(tf.Dimension(None)) == tf.Dimension(n)
     tf.Dimension(None).merge_with(tf.Dimension(n))    == tf.Dimension(n)
     tf.Dimension(None).merge_with(tf.Dimension(None)) == tf.Dimension(None)
-    tf.Dimension(n)   .merge_with(tf.Dimension(m))  # raises ValueError for n != m
+
+    # raises ValueError for n != m
+    tf.Dimension(n)   .merge_with(tf.Dimension(m))
     ```
 
     Args:
@@ -482,8 +632,8 @@ def as_dimension(value):
     return Dimension(value)
 
 
-@tf_export("TensorShape")
-class TensorShape(object):
+@tf_export(v1=["TensorShape"])
+class TensorShapeV1(object):
   """Represents the shape of a `Tensor`.
 
   A `TensorShape` represents a possibly-partial shape specification for a
@@ -509,12 +659,10 @@ class TensorShape(object):
 
     Args:
       dims: A list of Dimensions, or None if the shape is unspecified.
-        DEPRECATED: A single integer is treated as a singleton list.
 
     Raises:
       TypeError: If dims cannot be converted to a list of dimensions.
     """
-    # TODO(irving): Eliminate the single integer special case.
     if dims is None:
       self._dims = None
     elif isinstance(dims, compat.bytes_or_text_types):
@@ -540,18 +688,42 @@ class TensorShape(object):
       else:
         # Got a list of dimensions
         self._dims = [as_dimension(d) for d in dims_iter]
-    self._ndims = None
+
+  @property
+  def _v2_behavior(self):
+    if _TENSORSHAPE_V2_OVERRIDE is None:
+      return False
+    return _TENSORSHAPE_V2_OVERRIDE
 
   def __repr__(self):
-    return "TensorShape(%r)" % self._dims
+    if self._v2_behavior:
+      if self._dims is not None:
+        return "TensorShape(%r)" % [dim.value for dim in self._dims]
+      else:
+        return "TensorShape(None)"
+    else:
+      return "TensorShape(%r)" % self._dims
 
   def __str__(self):
-    if self.ndims is None:
+    if self.rank is None:
       return "<unknown>"
-    elif self.ndims == 1:
-      return "(%s,)" % self._dims[0]
+    elif self.rank == 1:
+      if self._v2_behavior:
+        return "(%s,)" % self._dims[0].value
+      else:
+        return "(%s,)" % self._dims[0]
     else:
-      return "(%s)" % ", ".join(str(d) for d in self._dims)
+      if self._v2_behavior:
+        return "(%s)" % ", ".join(str(d.value) for d in self._dims)
+      else:
+        return "(%s)" % ", ".join(str(d) for d in self._dims)
+
+  @property
+  def rank(self):
+    """Returns the rank of this shape, or None if it is unspecified."""
+    if self._dims is not None:
+      return len(self._dims)
+    return None
 
   @property
   def dims(self):
@@ -561,23 +733,17 @@ class TensorShape(object):
   @dims.setter
   def dims(self, dims):
     self._dims = dims
-    self._ndims = None
 
   @property
   def ndims(self):
-    """Returns the rank of this shape, or None if it is unspecified."""
-    if self._dims is None:
-      return None
-    else:
-      if self._ndims is None:
-        self._ndims = len(self._dims)
-      return self._ndims
+    """Deprecated accessor for `rank`."""
+    return self.rank
 
   def __len__(self):
     """Returns the rank of this shape, or raises ValueError if unspecified."""
     if self._dims is None:
-      raise ValueError("Cannot take the length of Shape with unknown rank.")
-    return self.ndims
+      raise ValueError("Cannot take the length of shape with unknown rank.")
+    return len(self._dims)
 
   def __bool__(self):
     """Returns True if this shape contains non-zero information."""
@@ -591,7 +757,10 @@ class TensorShape(object):
     if self._dims is None:
       raise ValueError("Cannot iterate over a shape with unknown rank.")
     else:
-      return iter(self._dims)
+      if self._v2_behavior:
+        return iter(d.value for d in self._dims)
+      else:
+        return iter(d for d in self._dims)
 
   def __getitem__(self, key):
     """Returns the value of a dimension or a shape, depending on the key.
@@ -602,7 +771,7 @@ class TensorShape(object):
         dimensions are those selected by the slice from `self`.
 
     Returns:
-      A dimension if `key` is an integer, or a `TensorShape` if `key` is a
+      An integer if `key` is an integer, or a `TensorShape` if `key` is a
       slice.
 
     Raises:
@@ -613,7 +782,10 @@ class TensorShape(object):
       if isinstance(key, slice):
         return TensorShape(self._dims[key])
       else:
-        return self._dims[key]
+        if self._v2_behavior:
+          return self._dims[key].value
+        else:
+          return self._dims[key]
     else:
       if isinstance(key, slice):
         start = key.start if key.start is not None else 0
@@ -633,9 +805,12 @@ class TensorShape(object):
           # suffixes of otherwise unknown shapes.
           return unknown_shape()
         else:
-          return unknown_shape(ndims=stop - start)
+          return unknown_shape(rank=stop - start)
       else:
-        return Dimension(None)
+        if self._v2_behavior:
+          return None
+        else:
+          return Dimension(None)
 
   def num_elements(self):
     """Returns the total number of elements, or none for incomplete shapes."""
@@ -710,8 +885,8 @@ class TensorShape(object):
         same rank.
     """
     other = as_shape(other)
-    if self.ndims is not None and other.ndims is not None:
-      if self.ndims != other.ndims:
+    if self.rank is not None and other.rank is not None:
+      if self.rank != other.rank:
         raise ValueError("Shapes %s and %s must have the same rank" % (self,
                                                                        other))
 
@@ -724,7 +899,7 @@ class TensorShape(object):
     Raises:
       ValueError: If `self` does not represent a shape with the given `rank`.
     """
-    if self.ndims not in (None, rank):
+    if self.rank not in (None, rank):
       raise ValueError("Shape %s must have rank %d" % (self, rank))
 
   def with_rank(self, rank):
@@ -743,7 +918,7 @@ class TensorShape(object):
       ValueError: If `self` does not represent a shape with the given `rank`.
     """
     try:
-      return self.merge_with(unknown_shape(ndims=rank))
+      return self.merge_with(unknown_shape(rank=rank))
     except ValueError:
       raise ValueError("Shape %s must have rank %d" % (self, rank))
 
@@ -761,7 +936,7 @@ class TensorShape(object):
       ValueError: If `self` does not represent a shape with at least the given
         `rank`.
     """
-    if self.ndims is not None and self.ndims < rank:
+    if self.rank is not None and self.rank < rank:
       raise ValueError("Shape %s must have rank at least %d" % (self, rank))
     else:
       return self
@@ -780,7 +955,7 @@ class TensorShape(object):
       ValueError: If `self` does not represent a shape with at most the given
         `rank`.
     """
-    if self.ndims is not None and self.ndims > rank:
+    if self.rank is not None and self.rank > rank:
       raise ValueError("Shape %s must have rank at most %d" % (self, rank))
     else:
       return self
@@ -825,7 +1000,7 @@ class TensorShape(object):
     """
     other = as_shape(other)
     if self._dims is not None and other.dims is not None:
-      if self.ndims != other.ndims:
+      if self.rank != other.rank:
         return False
       for x_dim, y_dim in zip(self._dims, other.dims):
         if not x_dim.is_compatible_with(y_dim):
@@ -868,10 +1043,10 @@ class TensorShape(object):
     """
 
     other = as_shape(other)
-    if self._dims is None or other.dims is None or self.ndims != other.ndims:
+    if self._dims is None or other.dims is None or self.rank != other.rank:
       return unknown_shape()
 
-    dims = [(Dimension(None))] * self.ndims
+    dims = [(Dimension(None))] * self.rank
     for i, (d1, d2) in enumerate(zip(self._dims, other.dims)):
       if d1 is not None and d2 is not None and d1 == d2:
         dims[i] = d1
@@ -929,15 +1104,18 @@ class TensorShape(object):
       other = as_shape(other)
     except TypeError:
       return NotImplemented
-    if self.ndims is None or other.ndims is None:
+    if self.rank is None or other.rank is None:
       raise ValueError("The inequality of unknown TensorShapes is undefined.")
-    if self.ndims != other.ndims:
+    if self.rank != other.rank:
       return True
     return self._dims != other.dims
 
   def __reduce__(self):
     return TensorShape, (self._dims,)
 
+  def __concat__(self, other):
+    return self.concatenate(other)
+
 
 def as_shape(shape):
   """Converts the given object to a TensorShape."""
@@ -947,27 +1125,48 @@ def as_shape(shape):
     return TensorShape(shape)
 
 
-def unknown_shape(ndims=None):
+def unknown_shape(rank=None, **kwargs):
   """Returns an unknown TensorShape, optionally with a known rank.
 
   Args:
-    ndims: (Optional) If specified, the number of dimensions in the shape.
+    rank: (Optional) If specified, the number of dimensions in the shape.
+    **kwargs: For backwards compatibility.
 
   Returns:
     An unknown TensorShape.
+
+  Raises:
+    TypeError: In case of invalid arguments.
   """
-  if ndims is None:
+  if rank is None and "ndims" in kwargs:
+    rank = kwargs.pop("ndims")
+  if kwargs:
+    raise TypeError("Unknown argument: %s" % kwargs)
+  if rank is None:
     return TensorShape(None)
   else:
-    return TensorShape([Dimension(None)] * ndims)
+    return TensorShape([Dimension(None)] * rank)
+
+
+@tf_export("TensorShape", v1=[])
+class TensorShapeV2(TensorShapeV1):
+
+  @property
+  def _v2_behavior(self):
+    if _TENSORSHAPE_V2_OVERRIDE is None:
+      return True
+    return _TENSORSHAPE_V2_OVERRIDE
 
 
-_SCALAR_SHAPE = TensorShape([])
+if tf2.enabled():
+  TensorShape = TensorShapeV2
+else:
+  TensorShape = TensorShapeV1
 
 
 def scalar():
   """Returns a shape representing a scalar."""
-  return _SCALAR_SHAPE
+  return TensorShape([])
 
 
 def vector(length):
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 9232d99a1f932b9e48cd7ddc125e353390064873..7d85e0a99e662512b29e4134091658190a3bc500 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -212,7 +212,7 @@ class ShapeTest(test_util.TensorFlowTestCase):
     s = tensor_shape.TensorShape(None)
     with self.assertRaises(ValueError):
       s.assert_is_fully_defined()
-    self.assertIs(None, s.ndims)
+    self.assertIs(None, s.rank)
     with self.assertRaises(ValueError):
       len(s)
     self.assertFalse(s)
@@ -225,7 +225,7 @@ class ShapeTest(test_util.TensorFlowTestCase):
     s = tensor_shape.TensorShape([tensor_shape.Dimension(
         3), tensor_shape.Dimension(4), tensor_shape.Dimension(7)])
     s.assert_is_fully_defined()
-    self.assertEqual(3, s.ndims)
+    self.assertEqual(3, s.rank)
     self.assertEqual(3, len(s))
     self.assertTrue(s)
     s.assert_has_rank(3)
@@ -239,23 +239,23 @@ class ShapeTest(test_util.TensorFlowTestCase):
     s.assert_is_compatible_with([3, 4, 7])
     s.assert_same_rank([6, 3, 7])
     for d1, d2 in zip(s, [3, 4, 7]):
-      assert d1.value == d2
+      assert tensor_shape.dimension_value(d1) == d2
 
   def testPartiallyDefinedShape(self):
     s = tensor_shape.TensorShape([tensor_shape.Dimension(
         3), tensor_shape.Dimension(None), tensor_shape.Dimension(7)])
     with self.assertRaises(ValueError):
       s.assert_is_fully_defined()
-    self.assertEqual(3, s.ndims)
+    self.assertEqual(3, s.rank)
     self.assertEqual(3, len(s))
     self.assertTrue(s)
     s.assert_has_rank(3)
     self.assertEqual(tensor_shape.Dimension(3), s[0])
-    self.assertEqual(tensor_shape.Dimension(None).value, s[1].value)
-    self.assertEqual(tensor_shape.Dimension(7), s[2])
+    self.assertEqual(tensor_shape.Dimension(None).value, s.dims[1].value)
+    self.assertEqual(tensor_shape.Dimension(7), s.dims[2])
     s.assert_same_rank([6, 3, 7])
     for d1, d2 in zip(s, [3, None, 7]):
-      assert d1.value == d2
+      assert tensor_shape.dimension_value(d1) == d2
 
   def testMergeFullShapes(self):
     self.assertEqual([3, 4, 7],
@@ -283,7 +283,9 @@ class ShapeTest(test_util.TensorFlowTestCase):
     tensor_shape.TensorShape([1, 2, 3]).assert_is_compatible_with(known[1:4])
 
     unknown = tensor_shape.TensorShape(None)
-    self.assertEqual(tensor_shape.Dimension(None).value, unknown[2].value)
+    self.assertEqual(
+        tensor_shape.Dimension(None).value,
+        tensor_shape.dimension_value(unknown[2]))
     tensor_shape.TensorShape(
         [None, None, None]).assert_is_compatible_with(unknown[1:4])
 
@@ -358,29 +360,34 @@ class ShapeTest(test_util.TensorFlowTestCase):
         make_tensor_shape_proto([-1, 37, 42]))
     partial_shape = tensor_shape.TensorShape([None, 37, 42])
     self.assertNotEqual(partial_proto_shape, partial_shape)
-    self.assertEqual(partial_proto_shape[0].value, None)
-    self.assertEqual(partial_proto_shape[1].value, 37)
-    self.assertEqual(partial_proto_shape[2].value, 42)
+    self.assertEqual(tensor_shape.dimension_value(partial_proto_shape[0]), None)
+    self.assertEqual(tensor_shape.dimension_value(partial_proto_shape[1]), 37)
+    self.assertEqual(tensor_shape.dimension_value(partial_proto_shape[2]), 42)
     self.assertTrue(partial_shape.is_compatible_with(partial_proto_shape))
 
   def testStr(self):
     self.assertEqual("<unknown>", str(tensor_shape.unknown_shape()))
-    self.assertEqual("(?,)", str(tensor_shape.unknown_shape(ndims=1)))
-    self.assertEqual("(?, ?)", str(tensor_shape.unknown_shape(ndims=2)))
-    self.assertEqual("(?, ?, ?)", str(tensor_shape.unknown_shape(ndims=3)))
-
+    self.assertEqual(
+        "(None,)",
+        str(tensor_shape.unknown_shape(rank=1)).replace("?", "None"))
+    self.assertEqual(
+        "(None, None)",
+        str(tensor_shape.unknown_shape(rank=2)).replace("?", "None"))
+    self.assertEqual(
+        "(None, None, None)",
+        str(tensor_shape.unknown_shape(rank=3)).replace("?", "None"))
+    self.assertEqual(
+        "(32, None, 1, 9)",
+        str(tensor_shape.TensorShape([32, None, 1, 9])).replace("?", "None"))
     self.assertEqual("()", str(tensor_shape.scalar()))
     self.assertEqual("(7,)", str(tensor_shape.vector(7)))
     self.assertEqual("(3, 8)", str(tensor_shape.matrix(3, 8)))
     self.assertEqual("(4, 5, 2)", str(tensor_shape.TensorShape([4, 5, 2])))
 
-    self.assertEqual("(32, ?, 1, 9)",
-                     str(tensor_shape.TensorShape([32, None, 1, 9])))
-
   def testAsProto(self):
     self.assertTrue(tensor_shape.unknown_shape().as_proto().unknown_rank)
     self.assertFalse(
-        tensor_shape.unknown_shape(ndims=3).as_proto().unknown_rank)
+        tensor_shape.unknown_shape(rank=3).as_proto().unknown_rank)
     self.assertFalse(
         tensor_shape.TensorShape([1, 2, 3]).as_proto().unknown_rank)
     self.assertFalse(
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
index fbea930fe0e6a4545b9a5ac55c0a7684b3cd8e28..c44636edc4ec5101c588766714c98a7da15793e4 100644
--- a/tensorflow/python/framework/tensor_spec.py
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -24,14 +24,15 @@ from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("TensorSpec")
 class TensorSpec(object):
   """Describes a tf.Tensor.
 
-  A TensorSpec allows an API to describe the Tensors that it accepts or
-  returns, before that Tensor exists. This allows dynamic and flexible graph
-  construction and configuration.
+  Metadata for describing the `tf.Tensor` objects accepted or returned
+  by some TensorFlow APIs.
   """
 
   __slots__ = ["_shape", "_shape_tuple", "_dtype", "_name"]
@@ -69,11 +70,6 @@ class TensorSpec(object):
     else:
       raise ValueError("`tensor` should be a tf.Tensor")
 
-  @classmethod
-  def is_bounded(cls):
-    del cls
-    return False
-
   @property
   def shape(self):
     """Returns the `TensorShape` that represents the shape of the tensor."""
@@ -86,21 +82,21 @@ class TensorSpec(object):
 
   @property
   def name(self):
-    """Returns the name of the described tensor."""
+    """Returns the (optionally provided) name of the described tensor."""
     return self._name
 
-  @property
-  def is_discrete(self):
-    """Whether spec is discrete."""
-    return self.dtype.is_integer
+  def is_compatible_with(self, spec_or_tensor):
+    """Returns True if spec_or_tensor is compatible with this TensorSpec.
 
-  @property
-  def is_continuous(self):
-    """Whether spec is continuous."""
-    return self.dtype.is_floating
+    Two tensors are considered compatible if they have the same dtype
+    and their shapes are compatible (see `tf.TensorShape.is_compatible_with`).
 
-  def is_compatible_with(self, spec_or_tensor):
-    """True if the shape and dtype of `spec_or_tensor` are compatible."""
+    Args:
+      spec_or_tensor: A tf.TensorSpec or a tf.Tensor
+
+    Returns:
+      True if spec_or_tensor is compatible with self.
+    """
     return (self._dtype.is_compatible_with(spec_or_tensor.dtype) and
             self._shape.is_compatible_with(spec_or_tensor.shape))
 
@@ -188,11 +184,6 @@ class BoundedTensorSpec(TensorSpec):
     self._maximum = np.array(maximum, dtype=self.dtype.as_numpy_dtype())
     self._maximum.setflags(write=False)
 
-  @classmethod
-  def is_bounded(cls):
-    del cls
-    return True
-
   @classmethod
   def from_spec(cls, spec):
     dtype = dtypes.as_dtype(spec.dtype)
@@ -223,4 +214,3 @@ class BoundedTensorSpec(TensorSpec):
   def __reduce__(self):
     return BoundedTensorSpec, (self._shape, self._dtype, self._minimum,
                                self._maximum, self._name)
-
diff --git a/tensorflow/python/framework/tensor_spec_test.py b/tensorflow/python/framework/tensor_spec_test.py
index 2e9e43e12279fe833d640d4163c5474c398e70cd..75c197df09e97b8e5c9ebf15ffb33206f69a172f 100644
--- a/tensorflow/python/framework/tensor_spec_test.py
+++ b/tensorflow/python/framework/tensor_spec_test.py
@@ -45,6 +45,7 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     desc = tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32)
     self.assertEqual(desc.shape, tensor_shape.TensorShape(None))
 
+  @test_util.run_deprecated_v1
   def testShapeCompatibility(self):
     unknown = array_ops.placeholder(dtypes.int64)
     partial = array_ops.placeholder(dtypes.int64, shape=[None, 1])
@@ -75,6 +76,7 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     self.assertFalse(desc_rank3.is_compatible_with(full))
     self.assertTrue(desc_rank3.is_compatible_with(rank3))
 
+  @test_util.run_deprecated_v1
   def testTypeCompatibility(self):
     floats = array_ops.placeholder(dtypes.float32, shape=[10, 10])
     ints = array_ops.placeholder(dtypes.int32, shape=[10, 10])
@@ -92,15 +94,21 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
         repr(desc1),
         "TensorSpec(shape=(1,), dtype=tf.float32, name='beep')")
     desc2 = tensor_spec.TensorSpec([1, None], dtypes.int32)
-    self.assertEqual(
-        repr(desc2),
-        "TensorSpec(shape=(1, ?), dtype=tf.int32, name=None)")
+    if desc2.shape._v2_behavior:
+      self.assertEqual(
+          repr(desc2),
+          "TensorSpec(shape=(1, None), dtype=tf.int32, name=None)")
+    else:
+      self.assertEqual(
+          repr(desc2),
+          "TensorSpec(shape=(1, ?), dtype=tf.int32, name=None)")
 
   def testFromTensorSpec(self):
     spec_1 = tensor_spec.TensorSpec((1, 2), dtypes.int32)
     spec_2 = tensor_spec.TensorSpec.from_spec(spec_1)
     self.assertEqual(spec_1, spec_2)
 
+  @test_util.run_deprecated_v1
   def testFromTensor(self):
     zero = constant_op.constant(0)
     spec = tensor_spec.TensorSpec.from_tensor(zero)
@@ -108,6 +116,7 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     self.assertEqual(spec.shape, [])
     self.assertEqual(spec.name, "Const")
 
+  @test_util.run_deprecated_v1
   def testFromPlaceholder(self):
     unknown = array_ops.placeholder(dtypes.int64, name="unknown")
     partial = array_ops.placeholder(dtypes.float32,
@@ -129,22 +138,6 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     self.assertEqual(bounded_spec.dtype, spec.dtype)
     self.assertEqual(bounded_spec.name, spec.name)
 
-  def testIsDiscrete(self):
-    discrete_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
-    continuous_spec = tensor_spec.TensorSpec((1, 2), dtypes.float32)
-    self.assertTrue(discrete_spec.is_discrete)
-    self.assertFalse(continuous_spec.is_discrete)
-
-  def testIsContinuous(self):
-    discrete_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
-    continuous_spec = tensor_spec.TensorSpec((1, 2), dtypes.float32)
-    self.assertFalse(discrete_spec.is_continuous)
-    self.assertTrue(continuous_spec.is_continuous)
-
-  def testIsBounded(self):
-    unbounded_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
-    self.assertFalse(unbounded_spec.is_bounded())
-
   def testSerialization(self):
     desc = tensor_spec.TensorSpec([1, 5], dtypes.float32, "test")
     self.assertEqual(pickle.loads(pickle.dumps(desc)), desc)
@@ -160,11 +153,6 @@ class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "not compatible"):
       tensor_spec.BoundedTensorSpec((3, 5), dtypes.uint8, 0, (1, 1, 1))
 
-  def testIsBounded(self):
-    bounded_spec = tensor_spec.BoundedTensorSpec(
-        (1, 2), dtypes.int32, minimum=0, maximum=1)
-    self.assertTrue(bounded_spec.is_bounded())
-
   def testMinimumMaximumAttributes(self):
     spec = tensor_spec.BoundedTensorSpec(
         (1, 2, 3), dtypes.float32, 0, (5, 5, 5))
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 26170b000d75fa05d168cafe081a0b651b5c4618..f98f301b38a946146df3051db9b8d26c8b816b33 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -339,11 +339,29 @@ _TF_TO_IS_OK = {
     dtypes.string: [_FilterStr],
     dtypes.uint16: [_FilterInt],
     dtypes.uint8: [_FilterInt],
+    dtypes.uint32: [_FilterInt],
+    dtypes.uint64: [_FilterInt],
 }
 
 
 def _AssertCompatible(values, dtype):
-  fn_list = _TF_TO_IS_OK.get(dtype, [_FilterNotTensor])
+  if dtype is None:
+    fn_list = [_FilterNotTensor]
+  else:
+    try:
+      fn_list = _TF_TO_IS_OK[dtype]
+    except KeyError:
+      # There isn't a specific fn_list, so we try to do the best possible.
+      if dtype.is_integer:
+        fn_list = [_FilterInt]
+      elif dtype.is_floating:
+        fn_list = [_FilterFloat]
+      elif dtype.is_complex:
+        fn_list = [_FilterComplex]
+      elif dtype.is_quantized:
+        fn_list = [_FilterInt, _FilterTuple]
+      else:
+        fn_list = [_FilterNotTensor]
   mismatch = _FirstNotNone([fn(values) for fn in fn_list])
   if mismatch is not None:
     if dtype is None:
@@ -353,8 +371,10 @@ def _AssertCompatible(values, dtype):
                       (dtype.name, repr(mismatch), type(mismatch).__name__))
 
 
-@tf_export("make_tensor_proto")
-def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
+# pylint: disable=invalid-name
+@tf_export(v1=["make_tensor_proto"])
+def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False,
+                      allow_broadcast=False):
   """Create a TensorProto.
 
   Args:
@@ -362,6 +382,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
     dtype:          Optional tensor_pb2 DataType value.
     shape:          List of integers representing the dimensions of tensor.
     verify_shape:   Boolean that enables verification of a shape of values.
+    allow_broadcast:Boolean that enables allowing scalars and 1 length vector
+        broadcasting. Cannot be true when verify_shape is true.
 
   Returns:
     A `TensorProto`. Depending on the type, it may contain data in the
@@ -398,6 +420,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   can not have more elements than what "shape" specifies.
 
   """
+  if allow_broadcast and verify_shape:
+    raise ValueError("allow_broadcast and verify_shape are not both allowed.")
   if isinstance(values, tensor_pb2.TensorProto):
     return values
 
@@ -486,15 +510,22 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
     shape_size = np.prod(shape, dtype=np.int64)
     is_same_size = shape_size == nparray.size
 
-    if verify_shape:
-      if not nparray.shape == tuple(shape):
+    if allow_broadcast:
+      if nparray.shape == (1,) or nparray.shape == tuple():
+        pass
+      elif nparray.size != shape_size:
         raise TypeError("Expected Tensor's shape: %s, got %s." %
                         (tuple(shape), nparray.shape))
 
-    if nparray.size > shape_size:
-      raise ValueError(
-          "Too many elements provided. Needed at most %d, but received %d" %
-          (shape_size, nparray.size))
+    else:
+      if verify_shape and nparray.shape != tuple(shape):
+        raise TypeError("Expected Tensor's shape: %s, got %s." %
+                        (tuple(shape), nparray.shape))
+
+      if nparray.size > shape_size:
+        raise ValueError(
+            "Too many elements provided. Needed at most %d, but received %d" %
+            (shape_size, nparray.size))
 
   tensor_proto = tensor_pb2.TensorProto(
       dtype=numpy_dtype.as_datatype_enum,
@@ -542,6 +573,7 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   append_fn(tensor_proto, proto_values)
 
   return tensor_proto
+# pylint: enable=invalid-name
 
 
 @tf_export("make_ndarray")
@@ -930,7 +962,7 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
     except TypeError:  # Could come from slicing prev.
       pass
 
-  ret = tensor_shape.unknown_shape(shape[0].value)
+  ret = tensor_shape.unknown_shape(shape.dims[0].value)
   value = constant_value(tensor)
   if value is not None:
     ret = ret.merge_with(
@@ -943,7 +975,7 @@ def is_tensor(x):  # pylint: disable=invalid-name
 
   Check whether an object is a tensor. This check is equivalent to calling
   `isinstance(x, (tf.Tensor, tf.SparseTensor, tf.Variable))` and also checks
-  if all the component variables of a MirroredVariable or a TowerLocalVariable
+  if all the component variables of a MirroredVariable or a ReplicaLocalVariable
   are tensors.
 
   Args:
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index bdf759f22047fe62a7820bc170654fed07f7adc9..00337546186d3a01313a49d11dd266e6dade3227 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -758,6 +758,7 @@ class TensorUtilTest(test.TestCase):
     self.assertFalse(tensor_util.ShapeEquals(t, [1, 4]))
     self.assertFalse(tensor_util.ShapeEquals(t, [4]))
 
+  @test_util.run_deprecated_v1
   def testMockArray(self):
 
     class MockArray(object):
@@ -771,7 +772,7 @@ class TensorUtilTest(test.TestCase):
     with self.cached_session() as sess:
       ma = MockArray(np.array([10, 20, 30]))
       t = ops.convert_to_tensor(ma)
-      a = sess.run(t)
+      a = self.evaluate(t)
       self.assertEquals(np.int64, a.dtype)
       self.assertAllClose(np.array([10, 20, 30], dtype=np.int64), a)
 
@@ -787,6 +788,7 @@ class ConstantValueTest(test.TestCase):
     tf_val = constant_op.constant(np_val)
     self.assertAllClose(np_val, tensor_util.constant_value(tf_val))
 
+  @test_util.run_deprecated_v1
   def testUnknown(self):
     tf_val = gen_state_ops.variable(
         shape=[3, 4, 7],
@@ -815,12 +817,14 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertEqual(6, c_val)
 
+  @test_util.run_deprecated_v1
   def testSizeOfScalar(self):
     tf_val = array_ops.size(constant_op.constant(0.0))
     c_val = tensor_util.constant_value(tf_val)
     self.assertEqual(1, c_val)
     self.assertEqual(np.ndarray, type(c_val))
 
+  @test_util.run_deprecated_v1
   def testRank(self):
     tf_val = array_ops.rank(constant_op.constant(0.0, shape=[1, 2, 3]))
     c_val = tensor_util.constant_value(tf_val)
@@ -852,6 +856,7 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertAllClose(np_val.astype(np.float64), c_val)
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     np_val = np.random.rand(3, 4, 7).astype(np.float32)
     tf_val = array_ops.concat(
@@ -871,6 +876,7 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertIs(None, c_val)
 
+  @test_util.run_deprecated_v1
   def testPack_Axis0(self):
     inputs = [np.random.rand(4, 7) for _ in range(3)]
     np_val = np.array(inputs)
@@ -883,6 +889,7 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertIs(None, c_val)
 
+  @test_util.run_deprecated_v1
   def testPack_Axis1(self):
     inputs = [np.random.rand(4, 7) for _ in range(3)]
     tf_val = array_ops.stack(inputs, axis=1)
@@ -894,6 +901,7 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertIs(None, c_val)
 
+  @test_util.run_deprecated_v1
   def testPack_Partial_Axis0(self):
     input_ = np.random.rand(4, 7)
     tf_val = array_ops.stack([input_, array_ops.placeholder(dtypes.float32)])
@@ -901,6 +909,7 @@ class ConstantValueTest(test.TestCase):
     self.assertAllClose(input_, c_val[0])
     self.assertIsNone(c_val[1])
 
+  @test_util.run_deprecated_v1
   def testPack_Partial_Axis1(self):
     input_ = np.random.rand(4, 7)
     tf_val = array_ops.stack([input_, array_ops.placeholder(dtypes.float32)],
@@ -966,12 +975,14 @@ class ConstantValueAsShapeTest(test.TestCase):
     c_val = tensor_util.constant_value_as_shape(tf_val)
     self.assertEqual([None, 1, None], c_val.as_list())
 
+  @test_util.run_deprecated_v1
   def testPack(self):
     tf_val = array_ops.stack(
         [constant_op.constant(16), 37, array_ops.placeholder(dtypes.int32)])
     c_val = tensor_util.constant_value_as_shape(tf_val)
     self.assertEqual([16, 37, None], c_val.as_list())
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     tf_val = array_ops.concat(
         [[16, 37], array_ops.placeholder(
@@ -985,6 +996,7 @@ class ConstantValueAsShapeTest(test.TestCase):
     c_val = tensor_util.constant_value_as_shape(tf_val)
     self.assertEqual([16, 37, None, 48], c_val.as_list())
 
+  @test_util.run_deprecated_v1
   def testSlice(self):
     tf_val = array_ops.placeholder(dtypes.int32, shape=(4,))[0:2]
     c_val = tensor_util.constant_value_as_shape(tf_val)
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 070b5ac11f563443a97b304ddcdaabd2f4338445..99e184a8acd44012774917c4baaecd48bae6cbe3 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -657,4 +657,27 @@ REGISTER_OP("ComplexStruct")
     .Attr("t_c: list(type) >= 0")
     .SetShapeFn(shape_inference::UnknownShape);
 
+// An op which returns its own device placement as a string, useful for testing
+// where ops get placed.
+REGISTER_OP("DevicePlacementOp")
+    .Output("device: string")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
+
+class DevicePlacementOp : public OpKernel {
+ public:
+  using OpKernel::OpKernel;
+
+  void Compute(OpKernelContext* ctx) override {
+    Tensor* output;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output("device", TensorShape({}), &output));
+    output->scalar<string>()() = ctx->device()->name();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("DevicePlacementOp").Device(DEVICE_CPU),
+                        DevicePlacementOp);
+REGISTER_KERNEL_BUILDER(Name("DevicePlacementOp").Device(DEVICE_GPU),
+                        DevicePlacementOp);
 }  // end namespace tensorflow
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 4ec4b41b5ee9403a1f42770676ca4695dd4a6304..df3cebd2e0c2f37711dc41cf60409c2660bf3e2c 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -50,10 +50,12 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python import tf2
 from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
-from tensorflow.python.eager import tape  # pylint: disable=unused-import
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -61,17 +63,22 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import memory
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.protobuf import compare
 from tensorflow.python.util.tf_export import tf_export
@@ -112,8 +119,28 @@ def assert_ops_in_graph(expected_ops, graph):
   return actual_ops
 
 
-@tf_export("test.assert_equal_graph_def")
-def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
+@tf_export("test.assert_equal_graph_def", v1=[])
+def assert_equal_graph_def_v2(actual, expected):
+  """Asserts that two `GraphDef`s are (mostly) the same.
+
+  Compares two `GraphDef` protos for equality, ignoring versions and ordering of
+  nodes, attrs, and control inputs.  Node names are used to match up nodes
+  between the graphs, so the naming of nodes must be consistent. This function
+  ignores randomized attribute values that may appear in V2 checkpoints.
+
+  Args:
+    actual: The `GraphDef` we have.
+    expected: The `GraphDef` we expected.
+
+  Raises:
+    AssertionError: If the `GraphDef`s do not match.
+    TypeError: If either argument is not a `GraphDef`.
+  """
+  assert_equal_graph_def(actual, expected, checkpoint_v2=True)
+
+
+@tf_export(v1=["test.assert_equal_graph_def"])
+def assert_equal_graph_def_v1(actual, expected, checkpoint_v2=False):
   """Asserts that two `GraphDef`s are (mostly) the same.
 
   Compares two `GraphDef` protos for equality, ignoring versions and ordering of
@@ -130,6 +157,10 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
     AssertionError: If the `GraphDef`s do not match.
     TypeError: If either argument is not a `GraphDef`.
   """
+  assert_equal_graph_def(actual, expected, checkpoint_v2)
+
+
+def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
   if not isinstance(actual, graph_pb2.GraphDef):
     raise TypeError(
         "Expected tf.GraphDef for actual, got %s" % type(actual).__name__)
@@ -352,53 +383,12 @@ def skip_if(condition):
 
 
 def enable_c_shapes(fn):
-  """Decorator for enabling C shapes on a test.
-
-  Note this enables the C shapes after running the test class's setup/teardown
-  methods.
-
-  Args:
-    fn: the function to be wrapped
-
-  Returns:
-    The wrapped function
-  """
-
-  # pylint: disable=protected-access
-  def wrapper(*args, **kwargs):
-    prev_value = ops._USE_C_SHAPES
-    ops._USE_C_SHAPES = True
-    try:
-      fn(*args, **kwargs)
-    finally:
-      ops._USE_C_SHAPES = prev_value
-
-  # pylint: enable=protected-access
-
-  return wrapper
+  """No-op. TODO(b/74620627): Remove this."""
+  return fn
 
 
 def with_c_shapes(cls):
-  """Adds methods that call original methods but with C API shapes enabled.
-
-  Note this enables C shapes in new methods after running the test class's
-  setup method.
-
-  Args:
-    cls: class to decorate
-
-  Returns:
-    cls with new test methods added
-  """
-  # If C shapes are already enabled, don't do anything. Some tests break if the
-  # same test is run twice, so this allows us to turn on the C shapes by default
-  # without breaking these tests.
-  if ops._USE_C_SHAPES:
-    return cls
-
-  for name, value in cls.__dict__.copy().items():
-    if callable(value) and name.startswith("test"):
-      setattr(cls, name + "WithCShapes", enable_c_shapes(value))
+  """No-op. TODO(b/74620627): Remove this."""
   return cls
 
 
@@ -421,13 +411,40 @@ def enable_control_flow_v2(fn):
   def wrapper(*args, **kwargs):
     enable_cond_v2_old = control_flow_ops.ENABLE_COND_V2
     enable_while_v2_old = control_flow_ops.ENABLE_WHILE_V2
+    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
     control_flow_ops.ENABLE_COND_V2 = True
     control_flow_ops.ENABLE_WHILE_V2 = True
+    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
     try:
       fn(*args, **kwargs)
     finally:
       control_flow_ops.ENABLE_COND_V2 = enable_cond_v2_old
       control_flow_ops.ENABLE_WHILE_V2 = enable_while_v2_old
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
+
+  return wrapper
+
+
+def enable_tensor_array_v2(fn):
+  """Decorator for enabling _GraphTensorArrayV2 on a test.
+
+  Note this enables _GraphTensorArrayV2 after running the test class's
+  setup/teardown methods.
+
+  Args:
+    fn: the function to be wrapped
+
+  Returns:
+    The wrapped function
+  """
+
+  def wrapper(*args, **kwargs):
+    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
+    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
+    try:
+      fn(*args, **kwargs)
+    finally:
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
 
   return wrapper
 
@@ -480,7 +497,8 @@ def with_control_flow_v2(cls):
     return cls
 
   for name, value in cls.__dict__.copy().items():
-    if (callable(value) and name.startswith("test") and
+    if (callable(value) and
+        name.startswith(unittest.TestLoader.testMethodPrefix) and
         not getattr(value, "_disable_control_flow_v2", False)):
       setattr(cls, name + "WithControlFlowV2", enable_control_flow_v2(value))
   return cls
@@ -506,9 +524,9 @@ def disable_control_flow_v2(unused_msg):
 def assert_no_new_pyobjects_executing_eagerly(f):
   """Decorator for asserting that no new Python objects persist after a test.
 
-  Runs the test multiple times executing eagerly, first as a warmup and then
-  several times to let objects accumulate. The warmup helps ignore caches which
-  do not grow as the test is run repeatedly.
+  Runs the test multiple times executing eagerly, first as a warmup and then to
+  let objects accumulate. The warmup helps ignore caches which do not grow as
+  the test is run repeatedly.
 
   Useful for checking that there are no missing Py_DECREFs in the C exercised by
   a bit of Python.
@@ -518,7 +536,14 @@ def assert_no_new_pyobjects_executing_eagerly(f):
     """Warms up, gets an object count, runs the test, checks for new objects."""
     with context.eager_mode():
       gc.disable()
-      f(self, **kwargs)
+      # Run the test 2 times as warmup, in an attempt to fill up caches, which
+      # should not grow as the test is run repeatedly below.
+      #
+      # TODO(b/117156879): Running warmup twice is black magic; we have seen
+      # tests that fail with 1 warmup run, and pass with 2, on various versions
+      # of python2.7.x.
+      for _ in range(2):
+        f(self, **kwargs)
       gc.collect()
       previous_count = len(gc.get_objects())
       if ops.has_default_graph():
@@ -621,6 +646,109 @@ def assert_no_new_tensors(f):
   return decorator
 
 
+def _find_reference_cycle(objects, idx):
+
+  def get_ignore_reason(obj, blacklist):
+    """Tests whether an object should be omitted from the dependency graph."""
+    if len(blacklist) > 100:
+      return "<depth limit>"
+    if tf_inspect.isframe(obj):
+      if "test_util.py" in tf_inspect.getframeinfo(obj)[0]:
+        return "<test code>"
+    for b in blacklist:
+      if b is obj:
+        return "<test code>"
+    if obj is blacklist:
+      return "<test code>"
+    return None
+
+  # Note: this function is meant to help with diagnostics. Its output is purely
+  # a human readable representation, so you may freely modify it to suit your
+  # needs.
+  def describe(obj, blacklist, leaves_only=False):
+    """Returns a custom human-readable summary of obj.
+
+    Args:
+      obj: the value to describe.
+      blacklist: same as blacklist in get_ignore_reason.
+      leaves_only: boolean flag used when calling describe recursively. Useful
+        for summarizing collections.
+    """
+    if get_ignore_reason(obj, blacklist):
+      return "{}{}".format(get_ignore_reason(obj, blacklist), type(obj))
+    if tf_inspect.isframe(obj):
+      return "frame: {}".format(tf_inspect.getframeinfo(obj))
+    elif tf_inspect.ismodule(obj):
+      return "module: {}".format(obj.__name__)
+    else:
+      if leaves_only:
+        return "{}, {}".format(type(obj), id(obj))
+      elif isinstance(obj, list):
+        return "list({}): {}".format(
+            id(obj), [describe(e, blacklist, leaves_only=True) for e in obj])
+      elif isinstance(obj, tuple):
+        return "tuple({}): {}".format(
+            id(obj), [describe(e, blacklist, leaves_only=True) for e in obj])
+      elif isinstance(obj, dict):
+        return "dict({}): {} keys".format(id(obj), len(obj.keys()))
+      elif tf_inspect.isfunction(obj):
+        return "function({}) {}; globals ID: {}".format(
+            id(obj), obj.__name__, id(obj.__globals__))
+      else:
+        return "{}, {}".format(type(obj), id(obj))
+
+  def build_ref_graph(obj, graph, reprs, blacklist):
+    """Builds a reference graph as <referrer> -> <list of refferents>.
+
+    Args:
+      obj: The object to start from. The graph will be built by recursively
+        adding its referrers.
+      graph: Dict holding the graph to be built. To avoid creating extra
+        references, the graph holds object IDs rather than actual objects.
+      reprs: Auxiliary structure that maps object IDs to their human-readable
+        description.
+      blacklist: List of objects to ignore.
+    """
+    referrers = gc.get_referrers(obj)
+    blacklist = blacklist + (referrers,)
+
+    obj_id = id(obj)
+    for r in referrers:
+      if get_ignore_reason(r, blacklist) is None:
+        r_id = id(r)
+        if r_id not in graph:
+          graph[r_id] = []
+        if obj_id not in graph[r_id]:
+          graph[r_id].append(obj_id)
+          build_ref_graph(r, graph, reprs, blacklist)
+          reprs[r_id] = describe(r, blacklist)
+
+  def find_cycle(el, graph, reprs, path):
+    """Finds and prints a single cycle in the dependency graph."""
+    if el not in graph:
+      return
+    for r in graph[el]:
+      if r in path:
+        logging.error("Reference cycle sample:")
+        for p in path + (r,):
+          logging.error(reprs.get(p, "unknown object " + str(p)))
+        return True
+      else:
+        if find_cycle(r, graph, reprs, path + (r,)):
+          return True
+    return False
+
+  obj = objects[idx]
+  graph = {}  # referrer ID -> object ID
+  reprs = {}  # object ID -> description
+  build_ref_graph(obj, graph, reprs, (objects, graph, reprs, get_ignore_reason,
+                                      describe, build_ref_graph, find_cycle))
+  for k in graph:
+    if find_cycle(k, graph, reprs, ()):
+      return True
+  return False
+
+
 def assert_no_garbage_created(f):
   """Test method decorator to assert that no garbage has been created.
 
@@ -636,6 +764,10 @@ def assert_no_garbage_created(f):
 
   def decorator(self, **kwargs):
     """Sets DEBUG_SAVEALL, runs the test, and checks for new garbage."""
+    # Force-load `distribution_strategy_context` to prevent GC at
+    # test time when using eager. Remove once b/117329403 is resolved.
+    tape.distribution_strategy_context.get_distribution_strategy()
+
     gc.disable()
     previous_debug_flags = gc.get_debug()
     gc.set_debug(gc.DEBUG_SAVEALL)
@@ -643,7 +775,8 @@ def assert_no_garbage_created(f):
     previous_garbage = len(gc.garbage)
     f(self, **kwargs)
     gc.collect()
-    if len(gc.garbage) > previous_garbage:
+    new_garbage = len(gc.garbage)
+    if new_garbage > previous_garbage:
       logging.error(
           "The decorated test created work for Python's garbage collector, "
           "likely due to a reference cycle. New objects in cycle(s):")
@@ -667,11 +800,19 @@ def assert_no_garbage_created(f):
           logging.error(obj)
           logging.error("  Object __repr__:")
           logging.error(repr(obj))
-        except Exception:
+        except Exception:  # pylint: disable=broad-except
           logging.error("(Exception while printing object)")
+
+    # When garbage is created, this call can help identify reference cycles,
+    # which are typically the cause of such garbage.
+    if new_garbage > previous_garbage:
+      for i in range(previous_garbage, new_garbage):
+        if _find_reference_cycle(gc.garbage, i):
+          break
+
     # This will fail if any garbage has been created, typically because of a
     # reference cycle.
-    self.assertEqual(previous_garbage, len(gc.garbage))
+    self.assertEqual(previous_garbage, new_garbage)
     # TODO(allenl): Figure out why this debug flag reset doesn't work. It would
     # be nice to be able to decorate arbitrary tests in a large test suite and
     # not hold on to every object in other tests.
@@ -756,7 +897,10 @@ def run_all_in_graph_and_eager_modes(cls):
   """Execute all test methods in the given class with and without eager."""
   base_decorator = run_in_graph_and_eager_modes
   for name, value in cls.__dict__.copy().items():
-    if callable(value) and name.startswith("test"):
+    if (callable(value) and
+        name.startswith(unittest.TestLoader.testMethodPrefix) and
+        not (name.startswith("testSkipEager")
+             or name.startswith("test_skip_eager"))):
       setattr(cls, name, base_decorator(value))
   return cls
 
@@ -823,23 +967,23 @@ def run_in_graph_and_eager_modes(func=None,
   def decorator(f):
     if tf_inspect.isclass(f):
       raise ValueError(
-          "`run_test_in_graph_and_eager_modes` only supports test methods. "
-          "Did you mean to use `run_all_tests_in_graph_and_eager_modes`?")
+          "`run_in_graph_and_eager_modes` only supports test methods. "
+          "Did you mean to use `run_all_in_graph_and_eager_modes`?")
 
-    def decorated(self, **kwargs):
+    def decorated(self, *args, **kwargs):
       try:
         with context.graph_mode():
           with self.test_session(use_gpu=use_gpu, config=config):
-            f(self, **kwargs)
+            f(self, *args, **kwargs)
       except unittest.case.SkipTest:
         pass
 
       def run_eagerly(self, **kwargs):
         if not use_gpu:
           with ops.device("/device:CPU:0"):
-            f(self, **kwargs)
+            f(self, *args, **kwargs)
         else:
-          f(self, **kwargs)
+          f(self, *args, **kwargs)
 
       if assert_no_eager_garbage:
         ops.reset_default_graph()
@@ -868,6 +1012,235 @@ def run_in_graph_and_eager_modes(func=None,
   return decorator
 
 
+def py_func_if_in_function(f):
+
+  def decorated(*args, **kwds):
+    if not ops.get_default_graph()._building_function:
+      return f(*args, **kwds)
+
+    tensor_args, tensor_indices = zip(
+        *[(x, i) for i, x in enumerate(args)
+          if isinstance(x, (ops.Tensor, variables.Variable))])
+
+    def inner_f(*inner_tensor_args):
+      my_args = list(args)
+      for i, n in zip(tensor_indices, inner_tensor_args):
+        my_args[i] = n
+      return f(*my_args, **kwds)
+
+    return script_ops.py_func(inner_f, tensor_args, [])
+
+  return tf_decorator.make_decorator(f, decorated)
+
+
+def also_run_as_tf_function(f):
+  """Runs the decorated test twice--once as is, once inside a tf.function.
+
+  This allows you to run a test both in eager execution and inside a
+  tf.function, exercising the two execution modes supported in tf 2.0. The test
+  assertions are automatically done inside tf.py_funcs, and tf.function ensures
+  that they run in the proper order and with the proper side effects.
+
+  Currently variable creation is not supported in tests annotated with this
+  decorator since it's tricky to ensure the variable doesn't get repeatedly
+  created when retracing the tf.function.
+
+  Args:
+    f: the test method to be decorated
+
+  Returns:
+    The decorated test method, which will run both in eager and inside a
+    tf.function.
+  """
+
+  def decorated(*args, **kwds):
+    with context.eager_mode():
+      # Running in eager mode
+      f(*args, **kwds)
+
+      defun_f = def_function.function(f)
+      defun_f(*args, **kwds)
+
+  return decorated
+
+
+def run_deprecated_v1(func=None):
+  """Execute the decorated test in graph mode.
+
+  This function returns a decorator intended to be applied to tests that have
+  not been updated to a style that is compatible with both TensorFlow 1.x and
+  2.x. When this decorated is applied, the test body will be run in
+  an environment where API calls construct graphs instead of executing eagerly.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+  Returns:
+    Returns a decorator that will run the decorated test method in graph mode.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_deprecated_v1` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if tf2.enabled():
+        with context.graph_mode():
+          f(self, *args, **kwargs)
+      else:
+        f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_v1_only(reason, func=None):
+  """Execute the decorated test only if running in v1 mode.
+
+  This function is intended to be applied to tests that exercise v1 only
+  functionality. If the test is run in v2 mode it will simply be skipped.
+
+  Args:
+    reason: string giving a reason for limiting the test to v1 only.
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      setup = f.__dict__.get("setUp")
+      if setup is not None:
+        setattr(f, "setUp", decorator(setup))
+
+      for name, value in f.__dict__.copy().items():
+        if (callable(value) and
+            name.startswith(unittest.TestLoader.testMethodPrefix)):
+          setattr(f, name, decorator(value))
+
+      return f
+
+    def decorated(self, *args, **kwargs):
+      if tf2.enabled():
+        self.skipTest(reason)
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_v2_only(func=None):
+  """Execute the decorated test only if running in v2 mode.
+
+  This function is intended to be applied to tests that exercise v2 only
+  functionality. If the test is run in v1 mode it will simply be skipped.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_v2_only` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if not tf2.enabled():
+        self.skipTest("Test is only comptaible in v2")
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_gpu_only(func=None):
+  """Execute the decorated test only if a GPU is available.
+
+  This function is intended to be applied to tests that require the precense
+  of a GPU. If a GPU is absent, it will simply be skipped.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_gpu_only` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if not is_gpu_available():
+        self.skipTest("Test requires GPU")
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_cuda_only(func=None):
+  """Execute the decorated test only if a GPU is available.
+
+  This function is intended to be applied to tests that require the precense
+  of a CUDA GPU. If a CUDA GPU is absent, it will simply be skipped.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_cuda_only` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if not is_gpu_available(cuda_only=True):
+        self.skipTest("Test requires CUDA GPU")
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
 @tf_export("test.is_gpu_available")
 def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
   """Returns whether TensorFlow can access a GPU.
@@ -907,7 +1280,7 @@ def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
         return True
     return False
   except errors_impl.NotFoundError as e:
-    if not all([x in str(e) for x in ["CUDA", "not find"]]):
+    if not all(x in str(e) for x in ["CUDA", "not find"]):
       raise e
     else:
       logging.error(str(e))
@@ -925,6 +1298,27 @@ def device(use_gpu):
     yield
 
 
+@contextlib.contextmanager
+def use_gpu():
+  """Uses gpu when requested and available."""
+  with device(use_gpu=True):
+    yield
+
+
+@contextlib.contextmanager
+def force_gpu():
+  """Force the gpu to be used."""
+  with ops.device("/device:GPU:0"):
+    yield
+
+
+@contextlib.contextmanager
+def force_cpu():
+  """Force the cpu to be used."""
+  with ops.device("/device:CPU:0"):
+    yield
+
+
 class CapturedWrites(object):
   """A utility class to load the captured writes made to a stream."""
 
@@ -938,6 +1332,63 @@ class CapturedWrites(object):
     return output_data
 
 
+class FakeEagerSession(object):
+  """Fake session so tests that conditionally use placeholders can use eager.
+
+  There are a number of tests that conditionally use placeholders for shape
+  inference. The pattern is demonstrated here:
+
+  ```python
+  with self.cached_session() as sess:
+    if static_shape:
+      y = math_ops.matmul(x, ...)
+      feed_dict = {}
+    else:
+      x_ph = array_ops.placeholder(...)
+      y = math_ops.matmul(x_ph, ...)
+      feed_dict = {x_ph: x}
+    val = sess.run(y, feed_dict=feed_dict)
+  ```
+
+  Since the feed_dict is empty when not using placeholders we should be able to
+  call self.evaluate(), however this requires rewriting the test case.
+  This class shold be considered a stop-gap solution to get tests running with
+  eager with minimal changes to the actual test.
+  """
+
+  def __init__(self, test_case):
+    self._test_case = test_case
+
+  def run(self, fetches, *args, **kwargs):
+    """Evalaute `fetches`.
+
+    Fail if additional args are specified.
+
+    Args:
+      fetches: A Tensor or a nested list/tuple of Tensors.
+      *args: Positional arguments
+      **kwargs: Keyword arguments
+
+    Raises:
+      RuntimeError: If args or kwargs are specified.
+
+    Returns:
+      Tensors as numpy values.
+    """
+    feed_dict = kwargs.pop("feed_dict", {})
+    if feed_dict:
+      raise RuntimeError(
+          "feed_dict is not supported when eager execution is enabled "
+          "(in this case, sess.run(t) is shorthand for t.numpy()")
+
+    if args or kwargs:
+      raise RuntimeError(
+          "Optional args are not supported when eager execution is enabled "
+          "(in this case, sess.run(t) is shorthand for t.numpy()")
+
+    return self._test_case.evaluate(fetches)
+
+
 class ErrorLoggingSession(session.Session):
   """Wrapper around a Session that logs errors in run().
   """
@@ -979,6 +1430,10 @@ class TensorFlowTestCase(googletest.TestCase):
     ops.reset_default_graph()
     random_seed.set_random_seed(random_seed.DEFAULT_GRAPH_SEED)
 
+    # Avoiding calling setUp() for the poorly named test_session method.
+    if self.id().endswith(".test_session"):
+      self.skipTest("Not a test.")
+
   def tearDown(self):
     for thread in self._threads:
       thread.check_termination()
@@ -1126,6 +1581,9 @@ class TensorFlowTestCase(googletest.TestCase):
       return self._eval_helper(tensor())
     else:
       try:
+        if sparse_tensor.is_sparse(tensor):
+          return sparse_tensor.SparseTensorValue(tensor.indices, tensor.values,
+                                                 tensor.dense_shape)
         return tensor.numpy()
       except AttributeError as e:
         six.raise_from(ValueError("Unsupported type %s." % type(tensor)), e)
@@ -1242,7 +1700,7 @@ class TensorFlowTestCase(googletest.TestCase):
       the graph building and execution code in a test case.
     """
     if context.executing_eagerly():
-      yield None
+      yield FakeEagerSession(self)
     else:
       sess = self._get_cached_session(
           graph, config, force_gpu, crash_if_inconsistent_args=True)
@@ -1251,6 +1709,8 @@ class TensorFlowTestCase(googletest.TestCase):
         yield cached
 
   @contextlib.contextmanager
+  @deprecation.deprecated(None, "Use `self.session()` or "
+                          "`self.cached_session()` instead.")
   def test_session(self,
                    graph=None,
                    config=None,
@@ -1259,7 +1719,6 @@ class TensorFlowTestCase(googletest.TestCase):
     """Use cached_session instead."""
     if self.id().endswith(".test_session"):
       self.skipTest("Not a test.")
-
     if context.executing_eagerly():
       yield None
     else:
@@ -1382,8 +1841,8 @@ class TensorFlowTestCase(googletest.TestCase):
     return ret
 
 
-# pylint: enable=invalid-name
-
+  # pylint: enable=invalid-name
+  @py_func_if_in_function
   def assertNear(self, f1, f2, err, msg=None):
     """Asserts that two floats are near each other.
 
@@ -1402,6 +1861,7 @@ class TensorFlowTestCase(googletest.TestCase):
         "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
                                if msg is not None else ""))
 
+  @py_func_if_in_function
   def assertArrayNear(self, farray1, farray2, err, msg=None):
     """Asserts that two float arrays are near each other.
 
@@ -1421,6 +1881,7 @@ class TensorFlowTestCase(googletest.TestCase):
   def _NDArrayNear(self, ndarray1, ndarray2, err):
     return np.linalg.norm(ndarray1 - ndarray2) < err
 
+  @py_func_if_in_function
   def assertNDArrayNear(self, ndarray1, ndarray2, err, msg=None):
     """Asserts that two numpy arrays have near values.
 
@@ -1558,6 +2019,7 @@ class TensorFlowTestCase(googletest.TestCase):
         e.args = ((e.args[0] + " : " + msg,) + e.args[1:])
         raise
 
+  @py_func_if_in_function
   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
     """Asserts that two structures of numpy arrays or Tensors, have near values.
 
@@ -1583,6 +2045,7 @@ class TensorFlowTestCase(googletest.TestCase):
     """
     self._assertAllCloseRecursive(a, b, rtol=rtol, atol=atol, msg=msg)
 
+  @py_func_if_in_function
   def assertAllCloseAccordingToType(self,
                                     a,
                                     b,
@@ -1630,6 +2093,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
     self.assertAllClose(a, b, rtol=rtol, atol=atol, msg=msg)
 
+  @py_func_if_in_function
   def assertNotAllClose(self, a, b, **kwargs):
     """Assert that two numpy arrays, or or Tensors, do not have near values.
 
@@ -1648,6 +2112,7 @@ class TensorFlowTestCase(googletest.TestCase):
       return
     raise AssertionError("The two values are close at all elements")
 
+  @py_func_if_in_function
   def assertAllEqual(self, a, b, msg=None):
     """Asserts that two numpy arrays or Tensors have the same values.
 
@@ -1659,9 +2124,16 @@ class TensorFlowTestCase(googletest.TestCase):
     msg = msg if msg else ""
     a = self._GetNdArray(a)
     b = self._GetNdArray(b)
-    self.assertEqual(
-        a.shape, b.shape, "Shape mismatch: expected %s, got %s."
-        " %s" % (a.shape, b.shape, msg))
+    # Arbitrary bounds so that we don't print giant tensors.
+    if (b.ndim <= 3 or b.size < 500):
+      self.assertEqual(
+          a.shape, b.shape, "Shape mismatch: expected %s, got %s."
+          " Contents: %s. \n%s." % (a.shape, b.shape, b, msg))
+    else:
+      self.assertEqual(
+          a.shape, b.shape, "Shape mismatch: expected %s, got %s."
+          " %s" % (a.shape, b.shape, msg))
+
     same = (a == b)
 
     if (a.dtype in [
@@ -1683,6 +2155,7 @@ class TensorFlowTestCase(googletest.TestCase):
       msgs.append("not equal rhs = {}".format(y))
       np.testing.assert_array_equal(a, b, err_msg="\n".join(msgs))
 
+  @py_func_if_in_function
   def assertAllGreater(self, a, comparison_target):
     """Assert element values are all greater than a target value.
 
@@ -1694,8 +2167,9 @@ class TensorFlowTestCase(googletest.TestCase):
     a = self._GetNdArray(a)
     self.assertGreater(np.min(a), comparison_target)
 
+  @py_func_if_in_function
   def assertAllLess(self, a, comparison_target):
-    """Assert element values are all greater than a target value.
+    """Assert element values are all less than a target value.
 
     Args:
       a: The numpy `ndarray`, or anything that can be converted into a
@@ -1705,8 +2179,9 @@ class TensorFlowTestCase(googletest.TestCase):
     a = self._GetNdArray(a)
     self.assertLess(np.max(a), comparison_target)
 
+  @py_func_if_in_function
   def assertAllGreaterEqual(self, a, comparison_target):
-    """Assert element values are all greater than a target value.
+    """Assert element values are all greater than or equal to a target value.
 
     Args:
       a: The numpy `ndarray`, or anything that can be converted into a
@@ -1716,8 +2191,9 @@ class TensorFlowTestCase(googletest.TestCase):
     a = self._GetNdArray(a)
     self.assertGreaterEqual(np.min(a), comparison_target)
 
+  @py_func_if_in_function
   def assertAllLessEqual(self, a, comparison_target):
-    """Assert element values are all greater than a target value.
+    """Assert element values are all less than or equal to a target value.
 
     Args:
       a: The numpy `ndarray`, or anything that can be converted into a
@@ -1758,6 +2234,7 @@ class TensorFlowTestCase(googletest.TestCase):
       lines.append(prefix + "...")
     return lines
 
+  @py_func_if_in_function
   def assertAllInRange(self,
                        target,
                        lower_bound,
@@ -1816,6 +2293,7 @@ class TensorFlowTestCase(googletest.TestCase):
           "Subscript(s) and value(s) of the offending elements:\n" +
           "\n".join(self._format_subscripts(violation_subscripts, target)))
 
+  @py_func_if_in_function
   def assertAllInSet(self, target, expected_set):
     """Assert that elements of a Tensor are all in a given closed set.
 
@@ -1837,6 +2315,7 @@ class TensorFlowTestCase(googletest.TestCase):
       raise AssertionError("%d unique element(s) are not in the set %s: %s" %
                            (np.size(diff), expected_set, diff))
 
+  @py_func_if_in_function
   def assertDTypeEqual(self, target, expected_dtype):
     """Assert ndarray data type is equal to expected.
 
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 22189afa591628ae5e0d76ca5920832d7668fa11..dfdced5a9886089884fede9dea9b69587499e28f 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -24,6 +24,7 @@ import random
 import threading
 import weakref
 
+from absl.testing import parameterized
 import numpy as np
 
 from google.protobuf import text_format
@@ -46,8 +47,9 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
-class TestUtilTest(test_util.TensorFlowTestCase):
+class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_assert_ops_in_graph(self):
     with self.test_session():
       constant_op.constant(["hello", "taffy"], name="hello")
@@ -59,6 +61,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertRaises(ValueError, test_util.assert_ops_in_graph,
                       {"hello": "Variable"}, ops.get_default_graph())
 
+  @test_util.run_deprecated_v1
   def test_session_functions(self):
     with self.test_session() as sess:
       sess_ref = weakref.ref(sess)
@@ -361,6 +364,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     b = [1, 2]
     self.assertArrayNear(a, b, 0.001)
 
+  @test_util.skip_if(True)  # b/117665998
   def testForceGPU(self):
     with self.assertRaises(errors.InvalidArgumentError):
       with self.test_session(force_gpu=True):
@@ -549,6 +553,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     with self.assertRaises(AssertionError):
       self.assertAllLessEqual(x, 95.0)
 
+  @test_util.run_deprecated_v1
   def testAssertAllInRangeWithNonNumericValuesFails(self):
     s1 = constant_op.constant("Hello, ", name="s1")
     c = constant_op.constant([1 + 2j, -3 + 5j], name="c")
@@ -612,6 +617,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     with self.assertRaises(AssertionError):
       self.assertAllInSet(x, (42,))
 
+  @test_util.run_deprecated_v1
   def testRandomSeed(self):
     # Call setUp again for WithCApi case (since it makes a new defeault graph
     # after setup).
@@ -679,7 +685,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertIsNone(test_util.get_node_def_from_graph("bar", graph_def))
 
   def test_run_in_eager_and_graph_modes_test_class(self):
-    msg = "`run_test_in_graph_and_eager_modes` only supports test methods.*"
+    msg = "`run_in_graph_and_eager_modes` only supports test methods.*"
     with self.assertRaisesRegexp(ValueError, msg):
       @test_util.run_in_graph_and_eager_modes()
       class Foo(object):
@@ -704,6 +710,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     test_util.run_in_graph_and_eager_modes(_test)(self)
     self.assertEqual(modes, ["graph"])
 
+  @test_util.run_deprecated_v1
   def test_run_in_graph_and_eager_modes_setup_in_same_mode(self):
     modes = []
     mode_name = lambda: "eager" if context.executing_eagerly() else "graph"
@@ -727,6 +734,12 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertEqual(modes[0:2], ["setup_graph", "run_graph"])
     self.assertEqual(modes[2:], ["setup_eager", "run_eager"])
 
+  @parameterized.named_parameters(dict(testcase_name="argument",
+                                       arg=True))
+  @test_util.run_in_graph_and_eager_modes
+  def test_run_in_graph_and_eager_works_with_parameterized_keyword(self, arg):
+    self.assertEqual(arg, True)
+
 
 # Its own test case to reproduce variable sharing issues which only pop up when
 # setUp() is overridden and super() is not called.
diff --git a/tensorflow/python/framework/traceable_stack.py b/tensorflow/python/framework/traceable_stack.py
index 7f4d28237ffba80e5aa604b880fccf00482a9ca5..c4e35a83256c2d546ae45d6b8ed9292de1f7ff0b 100644
--- a/tensorflow/python/framework/traceable_stack.py
+++ b/tensorflow/python/framework/traceable_stack.py
@@ -58,7 +58,7 @@ class TraceableObject(object):
     frame_records = tf_stack.extract_stack()
     if not frame_records:
       return self.FAILURE
-    if len(frame_records) >= local_offset:
+    if len(frame_records) > local_offset:
       # Negative indexing is one-indexed instead of zero-indexed.
       negative_offset = -(local_offset + 1)
       self.filename, self.lineno = frame_records[negative_offset][:2]
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index 472ccbcac7a447926989cfbef27ec1ea9d71e91c..37f2b37b31eaa153ab9cd85e83617fdc115fe8f9 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -29,30 +29,59 @@ __cxx11_abi_flag__ = pywrap_tensorflow.__cxx11_abi_flag__
 __monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 
 VERSION = __version__
-tf_export("VERSION", "__version__").export_constant(__name__, "VERSION")
+tf_export(
+    "version.VERSION",
+    "__version__",
+    v1=["version.VERSION", "VERSION", "__version__"]).export_constant(
+        __name__, "VERSION")
 GIT_VERSION = __git_version__
-tf_export("GIT_VERSION", "__git_version__").export_constant(
-    __name__, "GIT_VERSION")
+tf_export(
+    "version.GIT_VERSION",
+    "__git_version__",
+    v1=["version.GIT_VERSION", "GIT_VERSION",
+        "__git_version__"]).export_constant(__name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
-tf_export("COMPILER_VERSION", "__compiler_version__").export_constant(
-    __name__, "COMPILER_VERSION")
+tf_export(
+    "version.COMPILER_VERSION",
+    "__compiler_version__",
+    v1=["version.COMPILER_VERSION", "COMPILER_VERSION",
+        "__compiler_version__"]).export_constant(__name__, "COMPILER_VERSION")
+
 CXX11_ABI_FLAG = __cxx11_abi_flag__
-tf_export("CXX11_ABI_FLAG", "__cxx11_abi_flag__").export_constant(
-    __name__, "CXX11_ABI_FLAG")
+tf_export(
+    "sysconfig.CXX11_ABI_FLAG",
+    "__cxx11_abi_flag__",
+    v1=["sysconfig.CXX11_ABI_FLAG", "CXX11_ABI_FLAG",
+        "__cxx11_abi_flag__"]).export_constant(__name__, "CXX11_ABI_FLAG")
 MONOLITHIC_BUILD = __monolithic_build__
-tf_export("MONOLITHIC_BUILD", "__monolithic_build__").export_constant(
-    __name__, "MONOLITHIC_BUILD")
+tf_export(
+    "sysconfig.MONOLITHIC_BUILD",
+    "__monolithic_build__",
+    v1=[
+        "sysconfig.MONOLITHIC_BUILD", "MONOLITHIC_BUILD", "__monolithic_build__"
+    ]).export_constant(__name__, "MONOLITHIC_BUILD")
 
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
-tf_export("GRAPH_DEF_VERSION").export_constant(__name__, "GRAPH_DEF_VERSION")
+tf_export(
+    "version.GRAPH_DEF_VERSION",
+    v1=["version.GRAPH_DEF_VERSION", "GRAPH_DEF_VERSION"]).export_constant(
+        __name__, "GRAPH_DEF_VERSION")
 GRAPH_DEF_VERSION_MIN_CONSUMER = (
     pywrap_tensorflow.GRAPH_DEF_VERSION_MIN_CONSUMER)
-tf_export("GRAPH_DEF_VERSION_MIN_CONSUMER").export_constant(
-    __name__, "GRAPH_DEF_VERSION_MIN_CONSUMER")
+tf_export(
+    "version.GRAPH_DEF_VERSION_MIN_CONSUMER",
+    v1=[
+        "version.GRAPH_DEF_VERSION_MIN_CONSUMER",
+        "GRAPH_DEF_VERSION_MIN_CONSUMER"
+    ]).export_constant(__name__, "GRAPH_DEF_VERSION_MIN_CONSUMER")
 GRAPH_DEF_VERSION_MIN_PRODUCER = (
     pywrap_tensorflow.GRAPH_DEF_VERSION_MIN_PRODUCER)
-tf_export("GRAPH_DEF_VERSION_MIN_PRODUCER").export_constant(
-    __name__, "GRAPH_DEF_VERSION_MIN_PRODUCER")
+tf_export(
+    "version.GRAPH_DEF_VERSION_MIN_PRODUCER",
+    v1=[
+        "version.GRAPH_DEF_VERSION_MIN_PRODUCER",
+        "GRAPH_DEF_VERSION_MIN_PRODUCER"
+    ]).export_constant(__name__, "GRAPH_DEF_VERSION_MIN_PRODUCER")
 
 __all__ = [
     "__version__",
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 6816e204075bc37c6958efa5b028417078c36b2b..87795ffcfb5d21c408d646e581e19fe23a37b945 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -308,7 +308,7 @@ static PyObject* TF_GetSupportedDevices(GCluster cluster, GItem item) {
 
 static double TF_EstimatePerformance(const tensorflow::NamedDevice& device) {
   tensorflow::grappler::OpLevelCostEstimator estimator;
-  tensorflow::grappler::OpLevelCostEstimator::DeviceInfo info =
+  tensorflow::grappler::DeviceInfo info =
       estimator.GetDeviceInfo(device.properties());
   return info.gigaops;
 }
diff --git a/tensorflow/python/grappler/constant_folding_test.py b/tensorflow/python/grappler/constant_folding_test.py
index ab1d0ed25b9130fabcffbb8da2265c046206da46..30c1e1468146ce58216acbfbb1aef1ab1408027f 100644
--- a/tensorflow/python/grappler/constant_folding_test.py
+++ b/tensorflow/python/grappler/constant_folding_test.py
@@ -61,7 +61,7 @@ class ConstantFoldingTest(test.TestCase):
           back_prop=False,
           parallel_iterations=1)
       with session.Session() as sess:
-        y_v = sess.run(y)
+        y_v = self.evaluate(y)
         self.assertAllEqual(np.zeros([10, 20, 30]), y_v)
 
 
diff --git a/tensorflow/python/grappler/cost_analyzer_test.py b/tensorflow/python/grappler/cost_analyzer_test.py
index b8225b81a52f1a2ee10663544d54f1c9bd7ee785..ee3e289f65d05e96a580a62adb7f39552e6ced1c 100644
--- a/tensorflow/python/grappler/cost_analyzer_test.py
+++ b/tensorflow/python/grappler/cost_analyzer_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.training import adam
 
 class CostAnalysisTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicCost(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant(10, name="a")
@@ -62,6 +63,7 @@ class CostAnalysisTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
+  @test_util.run_deprecated_v1
   def testVerbose(self):
     """Make sure the full report is generated with verbose=True."""
     a = constant_op.constant(10, name="a")
@@ -81,6 +83,7 @@ class CostAnalysisTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
+  @test_util.run_deprecated_v1
   def testSmallNetworkCost(self):
     image = array_ops.placeholder(dtypes.float32, shape=[1, 28, 28, 1])
     label = array_ops.placeholder(dtypes.float32, shape=[1, 10])
@@ -96,8 +99,8 @@ class CostAnalysisTest(test.TestCase):
     b_fc = variables.Variable(random_ops.truncated_normal([10], stddev=0.1))
     y_conv = nn_ops.softmax(math_ops.matmul(h_conv_flat, w_fc) + b_fc)
 
-    cross_entropy = math_ops.reduce_mean(-math_ops.reduce_sum(
-        label * math_ops.log(y_conv), reduction_indices=[1]))
+    cross_entropy = math_ops.reduce_mean(
+        -math_ops.reduce_sum(label * math_ops.log(y_conv), axis=[1]))
     _ = adam.AdamOptimizer(1e-4).minimize(cross_entropy)
 
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
@@ -129,6 +132,7 @@ class CostAnalysisTest(test.TestCase):
       # self.assertTrue(0 < upper)
       # self.assertTrue(lower <= upper)
 
+  @test_util.run_deprecated_v1
   def testBasicMemory(self):
     """Make sure arguments can be passed correctly."""
     with test_util.device(use_gpu=False):
diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
index e6229e18566d7b6431f77ac32118bb56cda615ec..7dbaf449cad6f65fbf84054f9e2d5a631b46d13b 100644
--- a/tensorflow/python/grappler/cost_analyzer_tool.py
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -25,8 +25,8 @@ from google.protobuf import message
 from google.protobuf import text_format
 from tensorflow.contrib.fused_conv.ops import gen_fused_conv2d_bias_activation_op  # pylint: disable=unused-import
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.protobuf import saved_model_pb2
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
@@ -79,10 +79,11 @@ def get_metagraph():
 
 def main(_):
   metagraph = get_metagraph()
-  rewriter_config = rewriter_config_pb2.RewriterConfig()
+  config = config_pb2.ConfigProto()
   if FLAGS.rewriter_config is not None:
-    text_format.Merge(FLAGS.rewriter_config, rewriter_config)
-  optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
+    text_format.Merge(FLAGS.rewriter_config,
+                      config.graph_options.rewrite_options)
+  optimized_graph = tf_optimizer.OptimizeGraph(config, metagraph)
   metagraph.graph_def.CopyFrom(optimized_graph)
 
   report = cost_analyzer.GenerateCostReport(metagraph, FLAGS.per_node_report,
diff --git a/tensorflow/python/grappler/datasets_test.py b/tensorflow/python/grappler/datasets_test.py
index 2d942af597c180576ebe65e26ad39923754092f3..6937301ab255b87fa51444b70bc0e2b20d306ea3 100644
--- a/tensorflow/python/grappler/datasets_test.py
+++ b/tensorflow/python/grappler/datasets_test.py
@@ -48,7 +48,7 @@ class GrapplerTest(test.TestCase):
     for test_case in test_cases:
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -73,7 +73,7 @@ class GrapplerTest(test.TestCase):
     for test_case in test_cases:
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensor_slices(test_case['tensor'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -109,7 +109,7 @@ class GrapplerTest(test.TestCase):
             make_generator(test_case['tensor']),
             dtypes.int64,
             output_shapes=test_case['shape'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -122,7 +122,7 @@ class GrapplerTest(test.TestCase):
   def testRange(self):
     with ops.Graph().as_default() as g:
       dataset = dataset_ops.Dataset.range(42)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       get_next = iterator.get_next()
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(get_next)
@@ -148,7 +148,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = fn(dataset, test_case['tensor'], test_case['shape'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -252,7 +252,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = dataset.batch(42)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -261,7 +261,7 @@ class GrapplerTest(test.TestCase):
         op_properties = grappler_item.GetOpProperties()
         inferred_shape = self.as_tensor_shape(
             op_properties['IteratorGetNext'][0].shape)
-        self.assertTrue(test_case['shape'][0].is_compatible_with(
+        self.assertTrue(test_case['shape'].dims[0].is_compatible_with(
             inferred_shape[0]))
         self.assertEqual(test_case['shape'][1:], inferred_shape[1:])
 
@@ -281,7 +281,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = dataset.padded_batch(42, padded_shapes=test_case['shape'][1:])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -290,7 +290,7 @@ class GrapplerTest(test.TestCase):
         op_properties = grappler_item.GetOpProperties()
         inferred_shape = self.as_tensor_shape(
             op_properties['IteratorGetNext'][0].shape)
-        self.assertTrue(test_case['shape'][0].is_compatible_with(
+        self.assertTrue(test_case['shape'].dims[0].is_compatible_with(
             inferred_shape[0]))
         self.assertEqual(test_case['shape'][1:], inferred_shape[1:])
 
@@ -318,7 +318,7 @@ class GrapplerTest(test.TestCase):
           return dataset_fn
 
         dataset = dataset.flat_map(make_dataset(test_case['tensor']))
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -353,7 +353,7 @@ class GrapplerTest(test.TestCase):
 
         dataset = dataset.interleave(
             make_dataset(test_case['tensor']), cycle_length=42)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -382,7 +382,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = dataset.map(array_ops.transpose)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
diff --git a/tensorflow/python/grappler/graph_placer.py b/tensorflow/python/grappler/graph_placer.py
index 654013b23c5811acbd10633d692e2d214d530b26..9c05ad81790d61fe0d19e5738d64e6502ca88915 100644
--- a/tensorflow/python/grappler/graph_placer.py
+++ b/tensorflow/python/grappler/graph_placer.py
@@ -19,8 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 import time
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.grappler import cluster as gcluster
@@ -54,9 +54,9 @@ def PlaceGraph(metagraph,
     cluster = gcluster.Cluster()
 
   # Optimize the metagraph to speedup the placement
-  rewriter_config = rewriter_config_pb2.RewriterConfig()
+  config = config_pb2.ConfigProto()
   optimized_graph = tf_optimizer.OptimizeGraph(
-      rewriter_config, metagraph, verbose=verbose, cluster=cluster)
+      config, metagraph, verbose=verbose, cluster=cluster)
   optimized_metagraph = meta_graph_pb2.MetaGraphDef()
   optimized_metagraph.CopyFrom(metagraph)
   optimized_metagraph.graph_def.CopyFrom(optimized_graph)
diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py
index d3d96c646cd00ede612ad93cca3975b92389bfa1..c02fd9f55b885c0e8b0647a74547887eff7453f0 100644
--- a/tensorflow/python/grappler/item_test.py
+++ b/tensorflow/python/grappler/item_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import item
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
@@ -107,6 +108,7 @@ class ItemTest(test.TestCase):
     newest_tf_item = grappler_item.tf_item
     self.assertEqual(new_tf_item, newest_tf_item)
 
+  @test_util.run_v1_only('b/120545219')
   def testColocationContraints(self):
     with ops.Graph().as_default() as g:
       c = constant_op.constant([10])
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 8cc971c61d5964d0fad1bfa843c3ef8d3407599f..98f2e6d71816a4b6d8cd3f7fc836b09e5cc058a4 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import cluster as gcluster
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.layers import convolutional as conv_layers
@@ -241,7 +242,7 @@ class LayoutOptimizerTest(test.TestCase):
       if restore:
         saver.restore(sess, checkpoint_path)
       else:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
 
       np.random.seed(0)
       for _ in range(2):
@@ -262,7 +263,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _two_layer_model(x)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -365,7 +366,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(pad)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -396,7 +397,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -425,7 +426,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(cast)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -456,7 +457,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(squeeze)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -486,7 +487,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(squeeze)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -516,7 +517,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(squeeze)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -545,7 +546,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -574,7 +575,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -603,7 +604,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -632,7 +633,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -662,7 +663,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -691,7 +692,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -724,7 +725,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(concat)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -835,7 +836,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reverse)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -905,7 +906,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(select)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -966,7 +967,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(select)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1179,7 +1180,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(s)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1214,7 +1215,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(s)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1347,7 +1348,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _loop()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1374,7 +1375,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _loop_with_branch()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1398,7 +1399,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _loop_with_vec_and_4d()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1422,7 +1423,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _model_with_second_port()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1441,13 +1442,16 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('Add-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     meta_graph = _simple_metagraph()
-    rewrite_options = rewriter_config_pb2.RewriterConfig(
-        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
-        min_graph_nodes=-1)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
+            min_graph_nodes=-1))
     optimized_graph = tf_optimizer.OptimizeGraph(
-        rewrite_options, meta_graph, cluster=_get_cluster())
+        config, meta_graph, cluster=_get_cluster())
 
     found = 0
     for node in optimized_graph.node:
@@ -1456,13 +1460,16 @@ class LayoutOptimizerTest(test.TestCase):
         self.assertEqual(node.attr['data_format'].s, b'NCHW')
     self.assertEqual(found, 5)
 
+  @test_util.run_deprecated_v1
   def testDepthwise(self):
     meta_graph = _simple_metagraph(depthwise=True)
-    rewrite_options = rewriter_config_pb2.RewriterConfig(
-        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
-        min_graph_nodes=-1)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
+            min_graph_nodes=-1))
     optimized_graph = tf_optimizer.OptimizeGraph(
-        rewrite_options, meta_graph, cluster=_get_cluster())
+        config, meta_graph, cluster=_get_cluster())
 
     found = 0
     for node in optimized_graph.node:
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index 03b42f6453975c097810b300324f8ab0a2879329..e2864ebb4df646262456f2d04e4a24bdd06482b7 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -37,6 +38,7 @@ from tensorflow.python.training import training as train
 class MemoryOptimizerSwapTest(test.TestCase):
   """Tests the Grappler memory optimizer."""
 
+  @test_util.run_deprecated_v1
   def testNoSwapping(self):
     """Make sure the graph is preserved when there is nothing to swap."""
     a = variables.VariableV1(10, name='a')
@@ -49,15 +51,18 @@ class MemoryOptimizerSwapTest(test.TestCase):
     graph_size = len(mg.graph_def.node)
     nodes = [node.name for node in mg.graph_def.node]
 
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
-    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            disable_model_pruning=True,
+            constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL))
+    graph = tf_optimizer.OptimizeGraph(config, mg)
 
     self.assertEqual(len(graph.node), graph_size)
     self.assertItemsEqual([node.name for node in graph.node], nodes)
 
+  @test_util.run_v1_only('b/120545219')
   def testSimpleSwap(self):
     """Check that the swap annotations are followed."""
     a = variables.VariableV1(10, name='a')
@@ -72,13 +77,15 @@ class MemoryOptimizerSwapTest(test.TestCase):
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
     graph_size = len(mg.graph_def.node)
 
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
-        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,
-        min_graph_nodes=-1)
-    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            disable_model_pruning=True,
+            meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
+            constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,
+            min_graph_nodes=-1))
+    graph = tf_optimizer.OptimizeGraph(config, mg)
 
     self.assertEqual(len(graph.node), graph_size + 2)
     self.assertTrue(
@@ -127,7 +134,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
   def testRewritingDefaultGradientNames(self):
     """Tests that rewriting occurs with default gradient names."""
     (original_metagraph, _, _, _) = self._GetMetaGraph()
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
@@ -135,8 +143,9 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             min_graph_nodes=-1,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.
-            RECOMPUTATION_HEURISTICS), original_metagraph)
+            memory_optimization=(
+                rewriter_config_pb2.RewriterConfig.RECOMPUTATION_HEURISTICS)))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, original_metagraph)
     self.assertGreater(
         len(rewritten_graph_def.node),
         len(original_metagraph.graph_def.node))
@@ -153,7 +162,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
     """Tests that rewriting occurs with non-standard gradient names."""
     (original_metagraph, _, _, _) = self._GetMetaGraph(
         optimizer_scope_name='optimizer')
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
@@ -161,11 +171,11 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             min_graph_nodes=-1,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.
-            RECOMPUTATION_HEURISTICS,
+            memory_optimization=rewriter_config_pb2.RewriterConfig
+            .RECOMPUTATION_HEURISTICS,
             # Checks that name scope "gradients/" also match sub-scope.
-            memory_optimizer_target_node_name_scope='gradients/'),
-        original_metagraph)
+            memory_optimizer_target_node_name_scope='gradients/'))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, original_metagraph)
     self.assertGreater(
         len(rewritten_graph_def.node),
         len(original_metagraph.graph_def.node))
@@ -182,18 +192,19 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
     """Tests that rewriting occurs with non-standard gradient names."""
     (original_metagraph, _, _,
      _) = self._GetMetaGraph(optimizer_scope_name='foo/bar')
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
             dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.
-            RECOMPUTATION_HEURISTICS,
+            memory_optimization=rewriter_config_pb2.RewriterConfig
+            .RECOMPUTATION_HEURISTICS,
             # This should not match anything.
-            memory_optimizer_target_node_name_scope='r/gradients/'),
-        original_metagraph)
+            memory_optimizer_target_node_name_scope='r/gradients/'))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, original_metagraph)
     self.assertEqual(
         len(rewritten_graph_def.node), len(original_metagraph.graph_def.node))
     self.assertEqual(0,
@@ -223,10 +234,10 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
       train_op = graph.get_operation_by_name(train_op_name)
       loss_op = graph.get_tensor_by_name(loss_op_name)
       with session.Session(config=config, graph=graph) as sess:
-        sess.run(init_op)
-        sess.run(train_op)
-        sess.run(train_op)
-        return sess.run(loss_op)
+        self.evaluate(init_op)
+        self.evaluate(train_op)
+        self.evaluate(train_op)
+        return self.evaluate(loss_op)
 
   def testRecomputationRewritingNoErrors(self):
     """Tests that graph output is not significantly different with rewriting."""
@@ -287,8 +298,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
           rewrite_options=manual_memory_config)
       session_config = config_pb2.ConfigProto(graph_options=graph_options)
       with session.Session(config=session_config) as sess:
-        sess.run(init_op)
-        sess.run(train_op)
+        self.evaluate(init_op)
+        self.evaluate(train_op)
 
   def testHintDoesRewrite(self):
     graph = self._annotated_graph()[0]
@@ -298,11 +309,12 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         0,
         len([node for node in metagraph.graph_def.node
              if 'Recomputed/' in node.name]))
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             min_graph_nodes=-1,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL),
-        metagraph)
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, metagraph)
     self.assertEqual(
         9,
         len([node for node in rewritten_graph_def.node
diff --git a/tensorflow/python/grappler/model_analyzer_test.py b/tensorflow/python/grappler/model_analyzer_test.py
index ec172755f1ae43fc7581e97c6a18471da45f9100..d000cfa1ba2ec6ab2974332b8cc0cae8d6cf821d 100644
--- a/tensorflow/python/grappler/model_analyzer_test.py
+++ b/tensorflow/python/grappler/model_analyzer_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import model_analyzer
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -28,6 +29,7 @@ from tensorflow.python.platform import test
 
 class PyWrapOptimizeGraphTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant([10, 11], name="a")
@@ -49,6 +51,7 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
+  @test_util.run_deprecated_v1
   def testDebugMode(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant([10, 11], name="a")
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index 39ca71e99af06c19fb7fe5bf185c29106729f5e9..b746c3ec261e1bc75f6374d27b52a522a83934b9 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -34,8 +34,8 @@ limitations under the License.
   $1 = &temp;
 }
 
-%typemap(in) const tensorflow::RewriterConfig& (
-    tensorflow::RewriterConfig temp) {
+%typemap(in) const tensorflow::ConfigProto& (
+    tensorflow::ConfigProto temp) {
   char* c_string;
   Py_ssize_t py_size;
   if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
@@ -46,7 +46,7 @@ limitations under the License.
   if (!temp.ParseFromString(string(c_string, py_size))) {
     PyErr_SetString(
         PyExc_TypeError,
-        "The RewriterConfig could not be parsed as a valid protocol buffer");
+        "The ConfigProto could not be parsed as a valid protocol buffer");
     SWIG_fail;
   }
   $1 = &temp;
@@ -67,20 +67,20 @@ limitations under the License.
   #include "tensorflow/core/grappler/clusters/utils.h"
   #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
   #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+  #include "tensorflow/core/protobuf/config.pb.h"
   #include "tensorflow/core/protobuf/meta_graph.pb.h"
-  #include "tensorflow/core/protobuf/rewriter_config.pb.h"
   #include "tensorflow/core/public/session_options.h"
 
 
 void DetectDevices(std::unordered_map<string, tensorflow::DeviceProperties>* device_map) {
   tensorflow::SessionOptions options;
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   tensorflow::Status status = tensorflow::DeviceFactory::AddDevices(options, "", &devices);
   if (!status.ok()) {
     return;
   }
 
-  for (const tensorflow::Device* device : devices) {
+  for (const std::unique_ptr<tensorflow::Device>& device : devices) {
     tensorflow::DeviceProperties& prop = (*device_map)[device->name()];
     prop = tensorflow::grappler::GetDeviceInfo(device->parsed_name());
 
@@ -88,13 +88,12 @@ void DetectDevices(std::unordered_map<string, tensorflow::DeviceProperties>* dev
     // available device memory.
     const tensorflow::DeviceAttributes& attr = device->attributes();
     prop.set_memory_size(attr.memory_limit());
-    delete device;
   }
 }
 
 PyObject* TF_OptimizeGraph(
       GCluster cluster,
-      const tensorflow::RewriterConfig& rewriter_config,
+      const tensorflow::ConfigProto& config_proto,
       const tensorflow::MetaGraphDef& metagraph,
       bool verbose, const string& graph_id, TF_Status* out_status) {
     tensorflow::grappler::ItemConfig item_config;
@@ -110,7 +109,7 @@ PyObject* TF_OptimizeGraph(
 
     tensorflow::DeviceBase* cpu_device = nullptr;
     tensorflow::GraphDef out_graph;
-    tensorflow::grappler::MetaOptimizer optimizer(cpu_device, rewriter_config);
+    tensorflow::grappler::MetaOptimizer optimizer(cpu_device, config_proto);
     tensorflow::Status status = optimizer.Optimize(cluster.get(), *grappler_item, &out_graph);
     if (verbose) {
       optimizer.PrintResult();
@@ -127,7 +126,7 @@ PyObject* TF_OptimizeGraph(
 // Wrap this function
 PyObject* TF_OptimizeGraph(
     GCluster cluster,
-    const tensorflow::RewriterConfig& rewriter_config,
+    const tensorflow::ConfigProto& config_proto,
     const tensorflow::MetaGraphDef& metagraph, bool verbose,
     const string& graph_id, TF_Status* out_status);
 
diff --git a/tensorflow/python/grappler/tf_optimizer.py b/tensorflow/python/grappler/tf_optimizer.py
index a73a4a98fc5a883cf8681a20ca332f16f3b7f0ce..e72667b6f3184c7f2900fb410102a08220c44e2e 100644
--- a/tensorflow/python/grappler/tf_optimizer.py
+++ b/tensorflow/python/grappler/tf_optimizer.py
@@ -19,22 +19,26 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_opt
 from tensorflow.python.framework import errors
 from tensorflow.python.grappler import cluster as gcluster
 
 
-def OptimizeGraph(rewriter_config,
+def OptimizeGraph(config_proto,
                   metagraph,
                   verbose=True,
                   graph_id=b'graph_to_optimize',
                   cluster=None):
   """Optimize the provided metagraph."""
+  if not isinstance(config_proto, config_pb2.ConfigProto):
+    raise TypeError('Expected config_proto to be a ConfigProto, saw type %s' %
+                    type(config_proto))
   with errors.raise_exception_on_not_ok_status() as status:
     if cluster is None:
       cluster = gcluster.Cluster()
     ret_from_swig = tf_opt.TF_OptimizeGraph(cluster.tf_cluster,
-                                            rewriter_config.SerializeToString(),
+                                            config_proto.SerializeToString(),
                                             metagraph.SerializeToString(),
                                             verbose, graph_id, status)
   if ret_from_swig is None:
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
index eca0f679829507212608e75f2c792b4bddf9b1da..8186c81378af7c9fdbd39d4001998d2f959d4dd3 100644
--- a/tensorflow/python/grappler/tf_optimizer_test.py
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -17,12 +17,13 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import item as gitem
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import array_ops
@@ -34,6 +35,7 @@ from tensorflow.python.platform import test
 
 class PyWrapOptimizeGraphTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant(10, name='a')
@@ -45,15 +47,17 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     train_op.append(d)
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
 
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.optimizers.append('constfold')
     rewriter_config.min_graph_nodes = -1
 
-    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    graph = tf_optimizer.OptimizeGraph(config, mg)
 
     self.assertEqual(len(graph.node), 1)
     self.assertItemsEqual([node.name for node in graph.node], ['d'])
 
+  @test_util.run_v1_only('b/120545219')
   def testKeepNodes(self):
     g = ops.Graph()
     with g.as_default():
@@ -68,18 +72,21 @@ class PyWrapOptimizeGraphTest(test.TestCase):
 
     # Optimize the graph.
     mg = meta_graph.create_meta_graph_def(graph=g)
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.min_graph_nodes = -1
-    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    optimized_graph = tf_optimizer.OptimizeGraph(config, mg)
 
     # Check that the nodes referenced in various collections have been preserved
-    self.assertEqual(len(optimized_graph.node), 5)
-    self.assertEqual(d.op.name, optimized_graph.node[0].name)
-    self.assertEqual(a1.op.name, optimized_graph.node[1].name)
-    self.assertEqual('Variable/initial_value', optimized_graph.node[2].name)
-    self.assertEqual(a2.op.name, optimized_graph.node[3].name)
-    self.assertEqual('Variable/Assign', optimized_graph.node[4].name)
-
+    optimized_graph_nodes = [node.name for node in optimized_graph.node]
+    expected_nodes = [
+        d.op.name, a1.op.name, a2.op.name, 'Variable/initial_value',
+        'Variable/Assign'
+    ]
+    self.assertEqual(len(optimized_graph_nodes), len(expected_nodes))
+    self.assertAllInSet(optimized_graph_nodes, expected_nodes)
+
+  @test_util.run_v1_only('b/120545219')
   def testLoops(self):
     g = ops.Graph()
     with g.as_default():
@@ -110,9 +117,10 @@ class PyWrapOptimizeGraphTest(test.TestCase):
 
     # Optimize the graph.
     mg = meta_graph.create_meta_graph_def(graph=g)
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.min_graph_nodes = -1
-    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    optimized_graph = tf_optimizer.OptimizeGraph(config, mg)
     mg.graph_def.CopyFrom(optimized_graph)
 
     # Check that the nodes referenced in various collections have been preserved
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 4a72c4b3f3e7839cb255a1b5f7487e23e410c52f..36fea36389dc15104cca8a0d421ba50906295e9a 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -3,10 +3,10 @@
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
-
 package(default_visibility = ["//visibility:public"])
 
+exports_files(["LICENSE"])
+
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
@@ -41,6 +41,7 @@ py_library(
         "datasets/mnist.py",
         "datasets/reuters.py",
         "estimator/__init__.py",
+        "keras_parameterized.py",
         "preprocessing/__init__.py",
         "preprocessing/image.py",
         "preprocessing/sequence.py",
@@ -55,18 +56,26 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = select({
-        ":empty_condition": [],
-        "//conditions:default": [],
-    }) + [
+    deps = [
         ":backend",
         ":engine",
         ":layers",
-        "//tensorflow/python/saved_model",
+        ":pil_for_keras",
         "//tensorflow/python:training",
+        "//tensorflow/python/keras/optimizer_v2",
+        "//tensorflow/python/saved_model",
+        "@keras_applications_archive//:keras_applications",
     ],
 )
 
+py_library(
+    name = "pil_for_keras",
+    deps = select({
+        ":empty_condition": [],
+        "//conditions:default": [],
+    }),
+)
+
 py_library(
     name = "backend",
     srcs = ["backend.py"],
@@ -114,8 +123,10 @@ py_library(
         "constraints.py",
         "engine/__init__.py",
         "engine/base_layer.py",
+        "engine/base_layer_utils.py",
         "engine/distributed_training_utils.py",
         "engine/input_layer.py",
+        "engine/input_spec.py",
         "engine/network.py",
         "engine/saving.py",
         "engine/sequential.py",
@@ -133,11 +144,14 @@ py_library(
         "regularizers.py",
         "utils/data_utils.py",
         "utils/io_utils.py",
+        "utils/losses_utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":backend",
         "//tensorflow/python/data",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python/training/checkpointable:data_structures",
         "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
@@ -172,7 +186,6 @@ py_library(
         ":engine",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:cudnn_rnn_ops_gen",
-        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
@@ -186,6 +199,7 @@ py_library(
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
@@ -201,6 +215,7 @@ py_test(
         "//tensorflow/python:layers",
         "//tensorflow/python:nn",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -213,6 +228,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -225,6 +241,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -238,6 +255,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:init_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -249,6 +267,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -256,6 +275,7 @@ py_test(
     name = "optimizers_test",
     size = "medium",
     srcs = ["optimizers_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -263,6 +283,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -275,6 +296,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -292,6 +314,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -310,12 +333,13 @@ py_test(
 
 py_test(
     name = "advanced_activations_test",
-    size = "small",
+    size = "medium",
     srcs = ["layers/advanced_activations_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -329,6 +353,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -336,11 +361,13 @@ py_test(
     name = "convolutional_test",
     size = "large",
     srcs = ["layers/convolutional_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -360,12 +387,13 @@ cuda_py_test(
 
 py_test(
     name = "pooling_test",
-    size = "small",
+    size = "large",
     srcs = ["layers/pooling_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -373,11 +401,13 @@ py_test(
     name = "core_test",
     size = "medium",
     srcs = ["layers/core_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -387,19 +417,23 @@ cuda_py_test(
     srcs = ["layers/embeddings_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
     ],
 )
 
 py_test(
     name = "local_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/local_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -412,6 +446,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -423,6 +458,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -436,6 +472,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -449,6 +486,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -462,6 +500,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -479,19 +518,34 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_test(
     name = "recurrent_test",
-    size = "medium",
+    size = "large",
     srcs = ["layers/recurrent_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+cuda_py_test(
+    name = "unified_lstm_test",
+    size = "medium",
+    srcs = ["layers/unified_lstm_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
 py_test(
@@ -502,6 +556,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -519,6 +574,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -532,6 +588,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -551,6 +608,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -559,6 +617,18 @@ py_test(
     size = "small",
     srcs = ["utils/generic_utils_test.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "tf_utils_test",
+    size = "small",
+    srcs = ["utils/tf_utils_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -578,6 +648,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -590,6 +661,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -598,6 +670,7 @@ cuda_py_test(
     srcs = ["utils/multi_gpu_utils_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
@@ -613,6 +686,7 @@ cuda_py_test(
     srcs = ["engine/training_gpu_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
@@ -640,6 +714,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -652,6 +727,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -664,6 +740,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -677,19 +754,59 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_test(
     name = "training_test",
-    size = "enormous",
+    size = "medium",
     srcs = ["engine/training_test.py"],
+    shard_count = 16,
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "manual",  # TODO(b/120560388)
+        "no_oss",  # TODO(b/120560388)
+        "notap",  # TODO(b/120560388)
+        "notsan",
+    ],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "training_dataset_test",
+    size = "medium",
+    srcs = ["engine/training_dataset_test.py"],
+    shard_count = 4,
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "training_generator_test",
+    size = "enormous",
+    srcs = ["engine/training_generator_test.py"],
+    shard_count = 3,
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "notsan",
+    ],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -704,6 +821,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -717,6 +835,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -730,6 +849,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -737,12 +857,14 @@ py_test(
     name = "model_subclassing_test",
     size = "medium",
     srcs = ["model_subclassing_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -758,6 +880,20 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "base_layer_test",
+    size = "small",
+    srcs = ["engine/base_layer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -791,6 +927,7 @@ py_test(
     name = "models_test",
     size = "medium",
     srcs = ["models_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = ["notsan"],  # b/67509773
     deps = [
@@ -798,6 +935,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -815,15 +953,16 @@ py_test(
     ],
 )
 
-py_library(
-    name = "testing_utils",
-    srcs = [
-        "testing_utils.py",
-    ],
+py_test(
+    name = "keras_parameterized_test",
+    size = "small",
+    srcs = ["keras_parameterized_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
-        "//tensorflow/python:util",
+        "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index 198c66d9e184c82423e529540b92ad447b947cf8..be46a894e1b9979ea682aa2b635dc68da35c6097 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -44,7 +44,7 @@ from tensorflow.python.keras.models import Sequential
 
 from tensorflow.python.util.tf_export import tf_export
 
-__version__ = '2.1.6-tf'
+__version__ = '2.2.4-tf'
 
 tf_export('keras.__version__').export_constant(__name__, '__version__')
 
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index 99645de736fc9e3f34c3ea29171cde0f91d8345a..d69791ce8d6b328067610f70c91373da5288d7d6 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -160,6 +160,11 @@ def sigmoid(x):
   return nn.sigmoid(x)
 
 
+@tf_export('keras.activations.exponential')
+def exponential(x):
+  return math_ops.exp(x)
+
+
 @tf_export('keras.activations.hard_sigmoid')
 def hard_sigmoid(x):
   """Hard sigmoid activation function.
diff --git a/tensorflow/python/keras/activations_test.py b/tensorflow/python/keras/activations_test.py
index dd0bbcff3958c703ccc4648af746e8b7272cc1e9..6b7bfb698b8abef4a3e0ac115f2f247103b92abc 100644
--- a/tensorflow/python/keras/activations_test.py
+++ b/tensorflow/python/keras/activations_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -67,6 +68,7 @@ class KerasActivationsTest(test.TestCase):
     expected = _ref_softmax(test_values[0, 0])
     self.assertAllClose(result[0, 0], expected, rtol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_selu(self):
     x = keras.backend.placeholder(ndim=2)
     f = keras.backend.function([x], [keras.activations.selu(x)])
@@ -124,6 +126,7 @@ class KerasActivationsTest(test.TestCase):
     expected = sigmoid(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_hard_sigmoid(self):
     def ref_hard_sigmoid(x):
       x = (x * 0.2) + 0.5
@@ -147,6 +150,7 @@ class KerasActivationsTest(test.TestCase):
     # No negative values in test values...
     self.assertAllClose(result, test_values, rtol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_elu(self):
     with self.cached_session():
       x = keras.backend.placeholder(ndim=2)
@@ -169,6 +173,16 @@ class KerasActivationsTest(test.TestCase):
     expected = np.tanh(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
+  def test_exponential(self):
+    with self.cached_session():
+      test_values = np.random.random((2, 5))
+      x = keras.backend.placeholder(ndim=2)
+      exp = keras.activations.exponential(x)
+      f = keras.backend.function([x], [exp])
+      result = f([test_values])[0]
+    expected = np.exp(test_values)
+    self.assertAllClose(result, expected, rtol=1e-05)
+
   def test_linear(self):
     x = np.random.random((10, 5))
     self.assertAllClose(x, keras.activations.linear(x))
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 8ebf7356cd747b3b386af877db41f884dacaf62e..420c457a0ca2c74c5a0148a98e281b4663ab3226 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -25,6 +25,7 @@ import collections
 import itertools
 import json
 import os
+import threading
 import weakref
 
 import numpy as np
@@ -32,8 +33,10 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as session_module
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
@@ -57,6 +60,7 @@ from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-im
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
 
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
@@ -66,9 +70,13 @@ py_sum = sum
 
 # INTERNAL UTILS
 
-# This is the default internal TF session used by Keras.
-# It can be set manually via `set_session(sess)`.
-_SESSION = None
+# The internal graph maintained by Keras and used by the symbolic Keras APIs
+# while executing eagerly (such as the functional API for model-building).
+_GRAPH = None
+
+# This is a thread local object that will hold the default internal TF session
+# used by Keras. It can be set manually via `set_session(sess)`.
+_SESSION = threading.local()
 
 # This dictionary holds a mapping {graph: learning_phase}.
 # A learning phase is a bool tensor used to run Keras models in
@@ -300,7 +308,7 @@ def get_uid(prefix=''):
     2
   ```
   """
-  graph = ops.get_default_graph()
+  graph = get_graph()
   if graph not in PER_GRAPH_LAYER_NAME_UIDS:
     PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(int)
   layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS[graph]
@@ -330,13 +338,15 @@ def clear_session():
   global _GRAPH_TF_OPTIMIZERS  # pylint: disable=global-variable-not-assigned
   ops.reset_default_graph()
   reset_uids()
-  _SESSION = None
-  phase = array_ops.placeholder_with_default(
-      False, shape=(), name='keras_learning_phase')
-  _GRAPH_LEARNING_PHASES = {}
-  _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = phase
-  _GRAPH_VARIABLES.pop(ops.get_default_graph(), None)
-  _GRAPH_TF_OPTIMIZERS.pop(ops.get_default_graph(), None)
+  _SESSION.session = None
+  graph = get_graph()
+  with graph.as_default():
+    phase = array_ops.placeholder_with_default(
+        False, shape=(), name='keras_learning_phase')
+    _GRAPH_LEARNING_PHASES = {}
+    _GRAPH_LEARNING_PHASES[graph] = phase
+    _GRAPH_VARIABLES.pop(graph, None)
+    _GRAPH_TF_OPTIMIZERS.pop(graph, None)
 
 
 @tf_export('keras.backend.manual_variable_initialization')
@@ -367,21 +377,17 @@ def learning_phase():
   Returns:
       Learning phase (scalar integer tensor or Python integer).
   """
-  with ops.init_scope():
-    # We always check & set the learning phase inside the init_scope,
-    # otherwise the wrong default_graph will be used to look up the learning
-    # phase inside of functions & defuns.
-    #
-    # This is because functions & defuns (both in graph & in eager mode)
-    # will always execute non-eagerly using a function-specific default
-    # subgraph.
-    if context.executing_eagerly():
-      if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
-        # Fallback to inference mode as default.
-        return 0
-      return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+  if context.executing_eagerly():
+    if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
+      # Fallback to inference mode as default.
+      return 0
+    return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+  return symbolic_learning_phase()
 
-    graph = ops.get_default_graph()
+
+def symbolic_learning_phase():
+  graph = get_graph()
+  with graph.as_default():
     if graph not in _GRAPH_LEARNING_PHASES:
       phase = array_ops.placeholder_with_default(
           False, shape=(), name='keras_learning_phase')
@@ -406,7 +412,7 @@ def set_learning_phase(value):
     if context.executing_eagerly():
       _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
     else:
-      _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = value
+      _GRAPH_LEARNING_PHASES[get_graph()] = value
 
 
 @tf_contextlib.contextmanager
@@ -436,10 +442,24 @@ def learning_phase_scope(value):
       if context.executing_eagerly():
         _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_value
       else:
-        _GRAPH_LEARNING_PHASES[ops.get_default_graph()] = previous_value
+        _GRAPH_LEARNING_PHASES[get_graph()] = previous_value
+
+
+def _get_session():
+  """Returns the session object for the current thread."""
+  global _SESSION
+  default_session = ops.get_default_session()
+  if default_session is not None:
+    session = default_session
+  else:
+    if getattr(_SESSION, 'session', None) is None:
+      _SESSION.session = session_module.Session(
+          config=get_default_session_config())
+    session = _SESSION.session
+  return session
 
 
-@tf_export('keras.backend.get_session')
+@tf_export(v1=['keras.backend.get_session'])
 def get_session():
   """Returns the TF session to be used by the backend.
 
@@ -456,20 +476,23 @@ def get_session():
   Returns:
       A TensorFlow session.
   """
-  global _SESSION
-  default_session = ops.get_default_session()
-  if default_session is not None:
-    session = default_session
-  else:
-    if _SESSION is None:
-      _SESSION = session_module.Session(config=get_default_session_config())
-    session = _SESSION
+  session = _get_session()
   if not _MANUAL_VAR_INIT:
     with session.graph.as_default():
       _initialize_variables(session)
   return session
 
 
+def get_graph():
+  if context.executing_eagerly():
+    global _GRAPH
+    if _GRAPH is None:
+      _GRAPH = func_graph.FuncGraph('keras_graph')
+    return _GRAPH
+  else:
+    return ops.get_default_graph()
+
+
 @tf_export('keras.backend.set_session')
 def set_session(session):
   """Sets the global TensorFlow session.
@@ -478,7 +501,7 @@ def set_session(session):
       session: A TF Session.
   """
   global _SESSION
-  _SESSION = session
+  _SESSION.session = session
 
 
 def get_default_session_config():
@@ -513,9 +536,9 @@ def _get_current_tf_device():
       the device (`CPU` or `GPU`). If the scope is not explicitly set, it will
       return `None`.
   """
-  g = ops.get_default_graph()
+  graph = get_graph()
   op = _TfDeviceCaptureOp()
-  g._apply_device_functions(op)
+  graph._apply_device_functions(op)
   return op.device
 
 
@@ -675,7 +698,6 @@ def variable(value, dtype=None, name=None, constraint=None):
     v = sparse_tensor.SparseTensor(
         indices=indices, values=sparse_coo.data, dense_shape=sparse_coo.shape)
     v._keras_shape = sparse_coo.shape
-    v._uses_learning_phase = False
     return v
   v = resource_variable_ops.ResourceVariable(
       value,
@@ -686,7 +708,6 @@ def variable(value, dtype=None, name=None, constraint=None):
     v._keras_shape = value.shape
   elif hasattr(value, 'shape'):
     v._keras_shape = int_shape(value)
-  v._uses_learning_phase = False
   track_variable(v)
   return v
 
@@ -695,15 +716,16 @@ def track_tf_optimizer(tf_optimizer):
   """Tracks the given TF optimizer for initialization of its variables."""
   if context.executing_eagerly():
     return
-  graph = ops.get_default_graph()
+  graph = get_graph()
   optimizers = _GRAPH_TF_OPTIMIZERS.setdefault(graph, weakref.WeakSet())
   optimizers.add(tf_optimizer)
 
+
 def track_variable(v):
   """Tracks the given variable for initialization."""
   if context.executing_eagerly():
     return
-  graph = v.graph if hasattr(v, 'graph') else ops.get_default_graph()
+  graph = v.graph if hasattr(v, 'graph') else get_graph()
   if graph not in _GRAPH_VARIABLES:
     _GRAPH_VARIABLES[graph] = weakref.WeakSet()
   _GRAPH_VARIABLES[graph].add(v)
@@ -720,7 +742,7 @@ def _get_variables(graph=None):
 
 def _initialize_variables(session):
   """Utility to initialize uninitialized variables on the fly."""
-  variables = _get_variables(ops.get_default_graph())
+  variables = _get_variables(get_graph())
   candidate_vars = []
   for v in variables:
     if not getattr(v, '_keras_initialized', False):
@@ -774,6 +796,8 @@ def is_keras_tensor(x):
 
   Examples:
   ```python
+      >>> import tensorflow as tf
+      >>> import numpy
       >>> from keras import backend as K
       >>> from keras.layers import Input, Dense
       >>> np_var = numpy.array([1, 2])
@@ -800,10 +824,9 @@ def is_keras_tensor(x):
       True
   ```
   """
-  if (not isinstance(x, (ops.Tensor,
-                         variables_module.Variable,
-                         sparse_tensor.SparseTensor)) and
-      x.__class__.__name__ != 'DeferredTensor'):
+  if not isinstance(x, (ops.Tensor,
+                        variables_module.Variable,
+                        sparse_tensor.SparseTensor)):
     raise ValueError('Unexpectedly found an instance of type `' + str(type(x)) +
                      '`. Expected a symbolic tensor instance.')
   return hasattr(x, '_keras_history')
@@ -823,6 +846,9 @@ def placeholder(shape=None, ndim=None, dtype=None, sparse=False, name=None):
       sparse: Boolean, whether the placeholder should have a sparse type.
       name: Optional name string for the placeholder.
 
+  Raises:
+      ValueError: If called with eager execution.
+
   Returns:
       Tensor instance (with Keras metadata included).
 
@@ -839,11 +865,11 @@ def placeholder(shape=None, ndim=None, dtype=None, sparse=False, name=None):
   if not shape:
     if ndim:
       shape = tuple([None for _ in range(ndim)])
-  if sparse:
-    x = array_ops.sparse_placeholder(dtype, shape=shape, name=name)
-  else:
-    x = array_ops.placeholder(dtype, shape=shape, name=name)
-  x._uses_learning_phase = False
+  with get_graph().as_default():
+    if sparse:
+      x = array_ops.sparse_placeholder(dtype, shape=shape, name=name)
+    else:
+      x = array_ops.placeholder(dtype, shape=shape, name=name)
   return x
 
 
@@ -1004,7 +1030,7 @@ def eval(x):
              [ 3.,  4.]], dtype=float32)
   ```
   """
-  return to_dense(x).eval(session=get_session())
+  return get_value(to_dense(x))
 
 
 @tf_export('keras.backend.zeros')
@@ -1694,10 +1720,7 @@ def var(x, axis=None, keepdims=False):
   """
   if x.dtype.base_dtype == dtypes_module.bool:
     x = math_ops.cast(x, floatx())
-  m = math_ops.reduce_mean(x, axis, True)
-  devs_squared = math_ops.square(x - m)
-  return math_ops.reduce_mean(
-      devs_squared, axis, keepdims)
+  return math_ops.reduce_variance(x, axis=axis, keepdims=keepdims)
 
 
 @tf_export('keras.backend.std')
@@ -1715,7 +1738,9 @@ def std(x, axis=None, keepdims=False):
   Returns:
       A tensor with the standard deviation of elements of `x`.
   """
-  return math_ops.sqrt(var(x, axis=axis, keepdims=keepdims))
+  if x.dtype.base_dtype == dtypes_module.bool:
+    x = math_ops.cast(x, floatx())
+  return math_ops.reduce_std(x, axis=axis, keepdims=keepdims)
 
 
 @tf_export('keras.backend.mean')
@@ -2221,7 +2246,7 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
 
 
 @tf_export('keras.backend.batch_normalization')
-def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
+def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3):
   """Applies batch normalization on x given mean, var, beta and gamma.
 
   I.e. returns:
@@ -2233,11 +2258,49 @@ def batch_normalization(x, mean, var, beta, gamma, epsilon=1e-3):
       var: Variance of batch.
       beta: Tensor with which to center the input.
       gamma: Tensor by which to scale the input.
+      axis: Integer, the axis that should be normalized.
+          (typically the features axis).
       epsilon: Fuzz factor.
 
   Returns:
       A tensor.
   """
+  if ndim(x) == 4:
+    # The CPU implementation of `fused_batch_norm` only supports NHWC
+    if axis == 1 or axis == -3:
+      tf_data_format = 'NCHW'
+    elif axis == 3 or axis == -1:
+      tf_data_format = 'NHWC'
+    else:
+      tf_data_format = None
+
+    if (tf_data_format == 'NHWC' or
+        tf_data_format == 'NCHW' and _has_nchw_support()):
+      # The mean / var / beta / gamma tensors may be broadcasted
+      # so they may have extra axes of size 1, which should be squeezed.
+      if ndim(mean) > 1:
+        mean = array_ops.reshape(mean, [-1])
+      if ndim(var) > 1:
+        var = array_ops.reshape(var, [-1])
+      if beta is None:
+        beta = zeros_like(mean)
+      elif ndim(beta) > 1:
+        beta = array_ops.reshape(beta, [-1])
+      if gamma is None:
+        gamma = ones_like(mean)
+      elif ndim(gamma) > 1:
+        gamma = array_ops.reshape(gamma, [-1])
+    y, _, _ = nn.fused_batch_norm(
+        x,
+        gamma,
+        beta,
+        epsilon=epsilon,
+        mean=mean,
+        variance=var,
+        data_format=tf_data_format,
+        is_training=False
+    )
+    return y
   return nn.batch_normalization(x, mean, var, beta, gamma, epsilon)
 
 
@@ -2262,7 +2325,7 @@ def concatenate(tensors, axis=-1):
     else:
       axis = 0
 
-  if py_all([is_sparse(x) for x in tensors]):
+  if py_all(is_sparse(x) for x in tensors):
     return sparse_ops.sparse_concat(axis, tensors)
   else:
     return array_ops.concat([to_dense(x) for x in tensors], axis)
@@ -2298,7 +2361,8 @@ def permute_dimensions(x, pattern):
 
 
 @tf_export('keras.backend.resize_images')
-def resize_images(x, height_factor, width_factor, data_format):
+def resize_images(x, height_factor, width_factor, data_format,
+                  interpolation='nearest'):
   """Resizes the images contained in a 4D tensor.
 
   Arguments:
@@ -2306,40 +2370,55 @@ def resize_images(x, height_factor, width_factor, data_format):
       height_factor: Positive integer.
       width_factor: Positive integer.
       data_format: One of `"channels_first"`, `"channels_last"`.
+      interpolation: A string, one of `nearest` or `bilinear`.
 
   Returns:
       A tensor.
 
   Raises:
-      ValueError: if `data_format` is neither
-          `channels_last` or `channels_first`.
+      ValueError: in case of incorrect value for
+        `data_format` or `interpolation`.
   """
   if data_format == 'channels_first':
-    original_shape = int_shape(x)
-    new_shape = array_ops.shape(x)[2:]
-    new_shape *= constant_op.constant(
-        np.array([height_factor, width_factor]).astype('int32'))
+    rows, cols = 2, 3
+  elif data_format == 'channels_last':
+    rows, cols = 1, 2
+  else:
+    raise ValueError('Invalid `data_format` argument: %s' % (data_format,))
+
+  original_shape = int_shape(x)
+  new_shape = array_ops.shape(x)[rows:cols + 1]
+  new_shape *= constant_op.constant(
+      np.array([height_factor, width_factor], dtype='int32'))
+
+  if data_format == 'channels_first':
     x = permute_dimensions(x, [0, 2, 3, 1])
+  if interpolation == 'nearest':
     x = image_ops.resize_nearest_neighbor(x, new_shape)
+  elif interpolation == 'bilinear':
+    x = image_ops.resize_bilinear(x, new_shape)
+  else:
+    raise ValueError('interpolation should be one '
+                     'of "nearest" or "bilinear".')
+  if data_format == 'channels_first':
     x = permute_dimensions(x, [0, 3, 1, 2])
-    x.set_shape((None, None, original_shape[2] * height_factor
-                 if original_shape[2] is not None else None,
-                 original_shape[3] * width_factor
-                 if original_shape[3] is not None else None))
-    return x
-  elif data_format == 'channels_last':
-    original_shape = int_shape(x)
-    new_shape = array_ops.shape(x)[1:3]
-    new_shape *= constant_op.constant(
-        np.array([height_factor, width_factor]).astype('int32'))
-    x = image_ops.resize_nearest_neighbor(x, new_shape)
-    x.set_shape((None, original_shape[1] * height_factor
-                 if original_shape[1] is not None else None,
-                 original_shape[2] * width_factor
-                 if original_shape[2] is not None else None, None))
-    return x
+
+  if original_shape[rows] is None:
+    new_height = None
   else:
-    raise ValueError('Invalid data_format: ' + str(data_format))
+    new_height = original_shape[rows] * height_factor
+
+  if original_shape[cols] is None:
+    new_width = None
+  else:
+    new_width = original_shape[cols] * width_factor
+
+  if data_format == 'channels_first':
+    output_shape = (None, None, new_height, new_width)
+  else:
+    output_shape = (None, new_height, new_width, None)
+  x.set_shape(output_shape)
+  return x
 
 
 @tf_export('keras.backend.resize_volumes')
@@ -2476,7 +2555,7 @@ def arange(start, stop=None, step=1, dtype='int32'):
     result = cast(result, dtype)
   return result
 
-
+@tf_export('keras.backend.tile')
 def tile(x, n):
   """Creates a tensor by tiling `x` by `n`.
 
@@ -2704,9 +2783,14 @@ def get_value(x):
 
   Returns:
       A Numpy array.
+
+  Raises:
+      RuntimeError: If this method is called inside defun.
   """
   if context.executing_eagerly():
     return x.numpy()
+  elif ops.inside_function():
+    raise RuntimeError('Cannot get value inside Tensorflow graph function.')
   return x.eval(session=get_session())
 
 
@@ -2719,9 +2803,14 @@ def batch_get_value(tensors):
 
   Returns:
       A list of Numpy arrays.
+
+  Raises:
+      RuntimeError: If this method is called inside defun.
   """
   if context.executing_eagerly():
     return [x.numpy() for x in tensors]
+  elif ops.inside_function():  # pylint: disable=protected-access
+    raise RuntimeError('Cannot get value inside Tensorflow graph function.')
   if tensors:
     return get_session().run(tensors)
   else:
@@ -2738,19 +2827,20 @@ def set_value(x, value):
           (of the same shape).
   """
   value = np.asarray(value, dtype=dtype(x))
-  if context.executing_eagerly():
+  if ops.executing_eagerly_outside_functions():
     x.assign(value)
   else:
-    tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
-    if hasattr(x, '_assign_placeholder'):
-      assign_placeholder = x._assign_placeholder
-      assign_op = x._assign_op
-    else:
-      assign_placeholder = array_ops.placeholder(tf_dtype, shape=value.shape)
-      assign_op = x.assign(assign_placeholder)
-      x._assign_placeholder = assign_placeholder
-      x._assign_op = assign_op
-    get_session().run(assign_op, feed_dict={assign_placeholder: value})
+    with get_graph().as_default():
+      tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
+      if hasattr(x, '_assign_placeholder'):
+        assign_placeholder = x._assign_placeholder
+        assign_op = x._assign_op
+      else:
+        assign_placeholder = array_ops.placeholder(tf_dtype, shape=value.shape)
+        assign_op = x.assign(assign_placeholder)
+        x._assign_placeholder = assign_placeholder
+        x._assign_op = assign_op
+      get_session().run(assign_op, feed_dict={assign_placeholder: value})
 
 
 @tf_export('keras.backend.batch_set_value')
@@ -2761,28 +2851,29 @@ def batch_set_value(tuples):
       tuples: a list of tuples `(tensor, value)`.
           `value` should be a Numpy array.
   """
-  if context.executing_eagerly():
+  if ops.executing_eagerly_outside_functions():
     for x, value in tuples:
       x.assign(np.asarray(value, dtype=dtype(x)))
   else:
-    if tuples:
-      assign_ops = []
-      feed_dict = {}
-      for x, value in tuples:
-        value = np.asarray(value, dtype=dtype(x))
-        tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
-        if hasattr(x, '_assign_placeholder'):
-          assign_placeholder = x._assign_placeholder
-          assign_op = x._assign_op
-        else:
-          assign_placeholder = array_ops.placeholder(tf_dtype,
-                                                     shape=value.shape)
-          assign_op = x.assign(assign_placeholder)
-          x._assign_placeholder = assign_placeholder
-          x._assign_op = assign_op
-        assign_ops.append(assign_op)
-        feed_dict[assign_placeholder] = value
-      get_session().run(assign_ops, feed_dict=feed_dict)
+    with get_graph().as_default():
+      if tuples:
+        assign_ops = []
+        feed_dict = {}
+        for x, value in tuples:
+          value = np.asarray(value, dtype=dtype(x))
+          tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
+          if hasattr(x, '_assign_placeholder'):
+            assign_placeholder = x._assign_placeholder
+            assign_op = x._assign_op
+          else:
+            assign_placeholder = array_ops.placeholder(tf_dtype,
+                                                       shape=value.shape)
+            assign_op = x.assign(assign_placeholder)
+            x._assign_placeholder = assign_placeholder
+            x._assign_op = assign_op
+          assign_ops.append(assign_op)
+          feed_dict[assign_placeholder] = value
+        get_session().run(assign_ops, feed_dict=feed_dict)
 
 
 @tf_export('keras.backend.print_tensor')
@@ -2812,7 +2903,7 @@ def print_tensor(x, message=''):
 # GRAPH MANIPULATION
 
 
-class Function(object):
+class GraphExecutionFunction(object):
   """Runs a computation graph.
 
   It's possible to pass arguments to `tf.Session.run()` via `session_kwargs`.
@@ -2836,13 +2927,13 @@ class Function(object):
                **session_kwargs):
     updates = updates or []
     if not isinstance(inputs, (list, tuple)):
-      raise TypeError('`inputs` to a TensorFlow backend function '
+      raise TypeError('`inputs` to a Keras backend function '
                       'should be a list or tuple.')
     if not isinstance(outputs, (list, tuple)):
-      raise TypeError('`outputs` of a TensorFlow backend function '
+      raise TypeError('`outputs` of a Keras backend function '
                       'should be a list or tuple.')
     if not isinstance(updates, (list, tuple)):
-      raise TypeError('`updates` in a TensorFlow backend function '
+      raise TypeError('`updates` in a Keras backend function '
                       'should be a list or tuple.')
     self.inputs = list(inputs)
     self.outputs = list(outputs)
@@ -2878,7 +2969,7 @@ class Function(object):
 
     if session_kwargs:
       raise ValueError('Some keys in session_kwargs are not supported at this '
-                       'time: %s', session_kwargs.keys())
+                       'time: %s' % (session_kwargs.keys(),))
 
     self._callable_fn = None
     self._feed_arrays = None
@@ -2989,30 +3080,129 @@ class Function(object):
     return fetched[:len(self.outputs)]
 
 
+class EagerExecutionFunction(object):
+  """Helper class for constructing a TF graph function from the Keras graph.
+
+  Arguments:
+    inputs: Feed placeholders to the computation graph.
+    outputs: Output tensors to fetch.
+    updates: Additional update ops to be run at function call.
+    name: A name to help users identify what this function does.
+    session_kwargs: Unsupported.
+  """
+
+  def __init__(self, inputs, outputs, updates=None, name=None):
+    updates = updates or []
+    if not isinstance(inputs, (list, tuple)):
+      raise TypeError('`inputs` to a Keras backend function '
+                      'should be a list or tuple.')
+    if not isinstance(outputs, (list, tuple)):
+      raise TypeError('`outputs` of a Keras backend function '
+                      'should be a list or tuple.')
+    if not isinstance(updates, (list, tuple)):
+      raise TypeError('`updates` in a Keras backend function '
+                      'should be a list or tuple.')
+    self.inputs = list(inputs)
+    self.outputs = list(outputs)
+    self.name = name
+
+    graph = get_graph()
+    # Consolidate updates
+    with graph.as_default():
+      with ops.control_dependencies(self.outputs):
+        # In general, updates should be run after the outputs have been
+        # computed. However, we can only ensure this when we create
+        # the updates here (i.e. when updates are passed as tuples).
+        # We cannot modify the control dependencies of preexisting update ops.
+        updates_ops = []
+        for update in updates:
+          # For legacy reasons it is allowed to pass an update as a tuple
+          # `(variable, new_value)` (this maps to an assign op).
+          if isinstance(update, tuple):
+            p, new_p = update
+            updates_ops.append(state_ops.assign(p, new_p))
+          else:
+            # Assumed already an op -- we cannot control its execution order.
+            updates_ops.append(update)
+
+      # We set the update ops to run at the end by conditioning it on output[0]
+      if updates and not self.outputs:
+        # Edge case; never happens in practice
+        raise ValueError('Cannot create a Keras backend function with updates'
+                         ' but no outputs during eager execution.')
+      with ops.control_dependencies(updates_ops):
+        self.outputs[0] = array_ops.identity(self.outputs[0])
+
+    # Prepare graph function
+    # TODO(fchollet): can we restrict `captures` to variables actually used in
+    # the relevant subgraph?
+    graph.inputs = self.inputs + list(graph.captures.values())
+    graph.outputs = self.outputs
+    graph_fn = eager_function.Function(graph)
+    graph_fn._num_positional_args = len(self.inputs)
+    graph_fn._arg_keywords = []
+    self._graph_fn = graph_fn
+
+    # Handle placeholders with default
+    # (treated as required placeholder by graph functions)
+    self._placeholder_default_values = {}
+    with graph.as_default():
+      for x in self.inputs:
+        if x.op.type == 'PlaceholderWithDefault':
+          self._placeholder_default_values[x] = tensor_util.constant_value(
+              x.op.inputs[0])
+
+  def __call__(self, inputs):
+    converted_inputs = []
+    for tensor, value in zip(self.inputs, inputs):
+      if value is None:
+        # Assume `value` is a placeholder with default
+        value = self._placeholder_default_values.get(tensor, None)
+        if value is None:
+          raise ValueError(
+              'You must feed a value for placeholder %s' % (tensor,))
+      if not isinstance(value, ops.Tensor):
+        value = ops.convert_to_tensor(value, dtype=tensor.dtype)
+      if value.dtype != tensor.dtype:
+        # Temporary workaround due to `convert_to_tensor` not casting floats.
+        # See b/119637405
+        value = math_ops.cast(value, tensor.dtype)
+      converted_inputs.append(value)
+    outputs = self._graph_fn(*converted_inputs)
+    return [x.numpy() for x in outputs]
+
+
 @tf_export('keras.backend.function')
-def function(inputs, outputs, updates=None, **kwargs):
+def function(inputs, outputs, updates=None, name=None, **kwargs):
   """Instantiates a Keras function.
 
   Arguments:
       inputs: List of placeholder tensors.
       outputs: List of output tensors.
       updates: List of update ops.
+      name: String, name of function.
       **kwargs: Passed to `tf.Session.run`.
 
   Returns:
       Output values as Numpy arrays.
 
   Raises:
-      ValueError: if invalid kwargs are passed in.
+      ValueError: if invalid kwargs are passed in or if in eager execution.
   """
+  if ops.executing_eagerly_outside_functions():
+    if kwargs:
+      raise ValueError('Session keyword arguments are not support during '
+                       'eager execution. You passed: %s' % (kwargs,))
+    return EagerExecutionFunction(inputs, outputs, updates=updates, name=name)
+
   if kwargs:
     for key in kwargs:
       if (key not in tf_inspect.getfullargspec(session_module.Session.run)[0]
-          and key not in tf_inspect.getfullargspec(Function.__init__)[0]):
+          and key not in ['inputs', 'outputs', 'updates', 'name']):
         msg = ('Invalid argument "%s" passed to K.function with TensorFlow '
                'backend') % key
         raise ValueError(msg)
-  return Function(inputs, outputs, updates=updates, **kwargs)
+  return GraphExecutionFunction(inputs, outputs, updates=updates, **kwargs)
 
 
 @tf_export('keras.backend.gradients')
@@ -3060,7 +3250,8 @@ def rnn(step_function,
         constants=None,
         unroll=False,
         input_length=None,
-        time_major=False):
+        time_major=False,
+        zero_output_for_mask=False):
   """Iterates over the time dimension of a tensor.
 
   Arguments:
@@ -3077,11 +3268,13 @@ def rnn(step_function,
                   as 'states'. The first state in the list must be the
                   output tensor at the previous timestep.
       inputs: Tensor of temporal data of shape `(samples, time, ...)`
-          (at least 3D).
-      initial_states: Tensor with shape `(samples, output_dim)`
-          (no time dimension),
-          containing the initial values for the states used in
-          the step function.
+          (at least 3D), or nested tensors, and each of which has shape
+          `(samples, time, ...)`.
+      initial_states: Tensor with shape `(samples, state_size)`
+          (no time dimension), containing the initial values for the states used
+          in the step function. In the case that state_size is in a nested
+          shape, the shape of initial_states will also follow the nested
+          structure.
       go_backwards: Boolean. If True, do the iteration over the time
           dimension in reverse order and return the reversed sequence.
       mask: Binary tensor with shape `(samples, time, 1)`,
@@ -3096,7 +3289,9 @@ def rnn(step_function,
           RNN calculation. However, most TensorFlow data is batch-major, so by
           default this function accepts input and emits output in batch-major
           form.
-
+      zero_output_for_mask: Boolean. If True, the output for masked timestep
+          will be zeros, whereas in the False case, output from previous
+          timestep is returned.
   Returns:
       A tuple, `(last_output, outputs, new_states)`.
           last_output: the latest output of the rnn, of shape `(samples, ...)`
@@ -3113,62 +3308,87 @@ def rnn(step_function,
       ValueError: if `mask` is provided (not `None`) but states is not provided
           (`len(states)` == 0).
   """
-  ndim = len(inputs.shape)
-  if ndim < 3:
-    raise ValueError('Input should be at least 3D.')
-  inputs_shape = inputs.shape
+
+  def swap_batch_timestep(input_t):
+    # Swap the batch and timestep dim for the incoming tensor.
+    axes = list(range(len(input_t.shape)))
+    axes[0], axes[1] = 1, 0
+    return array_ops.transpose(input_t, axes)
+
   if not time_major:
-    axes = [1, 0] + list(range(2, ndim))
-    inputs = array_ops.transpose(inputs, axes)
+    inputs = nest.map_structure(swap_batch_timestep, inputs)
+
+  flatted_inputs = nest.flatten(inputs)
+  time_steps = flatted_inputs[0].shape[0]
+  batch = flatted_inputs[0].shape[1]
+  time_steps_t = array_ops.shape(flatted_inputs[0])[0]
+
+  for input_ in flatted_inputs:
+    input_.get_shape().with_rank_at_least(3)
 
   if mask is not None:
     if mask.dtype != dtypes_module.bool:
       mask = math_ops.cast(mask, dtypes_module.bool)
-    if len(mask.shape) == ndim - 1:
+    if len(mask.shape) == 2:
       mask = expand_dims(mask)
     if not time_major:
-      mask = array_ops.transpose(mask, axes)
+      mask = swap_batch_timestep(mask)
 
   if constants is None:
     constants = []
 
-  global uses_learning_phase  # pylint: disable=global-variable-undefined
-  uses_learning_phase = False
+  # tf.where needs its condition tensor to be the same shape as its two
+  # result tensors, but in our case the condition (mask) tensor is
+  # (nsamples, 1), and inputs are (nsamples, ndimensions) or even more.
+  # So we need to broadcast the mask to match the shape of inputs.
+  # That's what the tile call does, it just repeats the mask along its
+  # second dimension n times.
+  def _expand_mask(mask_t, input_t, fixed_dim=1):
+    assert not nest.is_sequence(mask_t)
+    assert not nest.is_sequence(input_t)
+    rank_diff = len(input_t.shape) - len(mask_t.shape)
+    for _ in range(rank_diff):
+      mask_t = array_ops.expand_dims(mask_t, -1)
+    multiples = [1] * fixed_dim + input_t.shape.as_list()[fixed_dim:]
+    return array_ops.tile(mask_t, multiples)
 
   if unroll:
-    if not inputs.shape[0]:
+    if not time_steps:
       raise ValueError('Unrolling requires a fixed number of timesteps.')
     states = initial_states
     successive_states = []
     successive_outputs = []
 
-    input_list = array_ops.unstack(inputs)
-    if go_backwards:
-      input_list.reverse()
+    # Process the input tensors. The input tensor need to be split on the
+    # time_step dim, and reverse if go_backwards is True. In the case of nested
+    # input, the input is flattened and then transformed individually.
+    # The result of this will be a tuple of lists, each of the item in tuple is
+    # list of the tensor with shape (batch, feature)
+    def _process_single_input_t(input_t):
+      input_t = array_ops.unstack(input_t)  # unstack for time_step dim
+      if go_backwards:
+        input_t.reverse()
+      return input_t
+
+    if nest.is_sequence(inputs):
+      processed_input = nest.map_structure(_process_single_input_t, inputs)
+    else:
+      processed_input = (_process_single_input_t(inputs),)
+
+    def _get_input_tensor(time):
+      inp = [t_[time] for t_ in processed_input]
+      return nest.pack_sequence_as(inputs, inp)
 
     if mask is not None:
       mask_list = array_ops.unstack(mask)
       if go_backwards:
         mask_list.reverse()
 
-      for inp, mask_t in zip(input_list, mask_list):
+      for i in range(time_steps):
+        inp = _get_input_tensor(i)
+        mask_t = mask_list[i]
         output, new_states = step_function(inp, states + constants)
-        if getattr(output, '_uses_learning_phase', False):
-          uses_learning_phase = True
-
-        # tf.where needs its condition tensor
-        # to be the same shape as its two
-        # result tensors, but in our case
-        # the condition (mask) tensor is
-        # (nsamples, 1), and A and B are (nsamples, ndimensions).
-        # So we need to
-        # broadcast the mask to match the shape of A and B.
-        # That's what the tile call does,
-        # it just repeats the mask along its second dimension
-        # n times.
-        tiled_mask_t = array_ops.tile(mask_t,
-                                      array_ops.stack(
-                                          [1, array_ops.shape(output)[1]]))
+        tiled_mask_t = _expand_mask(mask_t, output)
 
         if not successive_outputs:
           prev_output = zeros_like(output)
@@ -3180,10 +3400,7 @@ def rnn(step_function,
         return_states = []
         for state, new_state in zip(states, new_states):
           # (see earlier comment for tile explanation)
-          tiled_mask_t = array_ops.tile(mask_t,
-                                        array_ops.stack(
-                                            [1,
-                                             array_ops.shape(new_state)[1]]))
+          tiled_mask_t = _expand_mask(mask_t, new_state)
           return_states.append(array_ops.where(tiled_mask_t, new_state, state))
         states = return_states
         successive_outputs.append(output)
@@ -3191,11 +3408,21 @@ def rnn(step_function,
       last_output = successive_outputs[-1]
       new_states = successive_states[-1]
       outputs = array_ops.stack(successive_outputs)
+
+      if zero_output_for_mask:
+        last_output = array_ops.where(
+            _expand_mask(mask_list[-1], last_output),
+            last_output,
+            zeros_like(last_output))
+        outputs = array_ops.where(
+            _expand_mask(mask, outputs, fixed_dim=2),
+            outputs,
+            zeros_like(outputs))
+
     else:
-      for inp in input_list:
+      for i in range(time_steps):
+        inp = _get_input_tensor(i)
         output, states = step_function(inp, states + constants)
-        if getattr(output, '_uses_learning_phase', False):
-          uses_learning_phase = True
         successive_outputs.append(output)
         successive_states.append(states)
       last_output = successive_outputs[-1]
@@ -3203,20 +3430,47 @@ def rnn(step_function,
       outputs = array_ops.stack(successive_outputs)
 
   else:
-    if go_backwards:
-      inputs = reverse(inputs, 0)
-
     states = tuple(initial_states)
 
-    time_steps = array_ops.shape(inputs)[0]
-    outputs, _ = step_function(inputs[0], initial_states + constants)
-    output_ta = tensor_array_ops.TensorArray(
-        dtype=outputs.dtype, size=time_steps, tensor_array_name='output_ta')
-    input_ta = tensor_array_ops.TensorArray(
-        dtype=inputs.dtype, size=time_steps, tensor_array_name='input_ta')
-    input_ta = input_ta.unstack(inputs)
+    # Create input tensor array, if the inputs is nested tensors, then it will
+    # be flattened first, and tensor array will be created one per flattened
+    # tensor.
+    input_ta = tuple(
+        tensor_array_ops.TensorArray(
+            dtype=inp.dtype,
+            size=time_steps_t,
+            tensor_array_name='input_ta_%s' % i)
+        for i, inp in enumerate(flatted_inputs))
+    input_ta = tuple(
+        ta.unstack(input_) if not go_backwards else ta
+        .unstack(reverse(input_, 0))
+        for ta, input_ in zip(input_ta, flatted_inputs))
+
+    # Get the time(0) input and compute the output for that, the output will be
+    # used to determine the dtype of output tensor array. Don't read from
+    # input_ta due to TensorArray clear_after_read default to True.
+    input_time_zero = nest.pack_sequence_as(inputs,
+                                            [inp[0] for inp in flatted_inputs])
+    # output_time_zero is used to determine the cell output shape and its dtype.
+    # the value is discarded.
+    output_time_zero, _ = step_function(input_time_zero,
+                                        initial_states + constants)
+    output_ta = tuple(
+        tensor_array_ops.TensorArray(
+            dtype=out.dtype,
+            size=time_steps_t,
+            tensor_array_name='output_ta_%s' % i)
+        for i, out in enumerate(nest.flatten(output_time_zero)))
+
     time = constant_op.constant(0, dtype='int32', name='time')
 
+    while_loop_kwargs = {
+        'cond': lambda time, *_: time < time_steps_t,
+        'maximum_iterations': input_length,
+        'parallel_iterations': 32,
+        'swap_memory': True,
+    }
+
     if mask is not None:
       if not states:
         raise ValueError('No initial states provided! '
@@ -3230,48 +3484,65 @@ def rnn(step_function,
 
       mask_ta = tensor_array_ops.TensorArray(
           dtype=dtypes_module.bool,
-          size=time_steps,
+          size=time_steps_t,
           tensor_array_name='mask_ta')
       mask_ta = mask_ta.unstack(mask)
 
-      def _step(time, output_ta_t, *states):
+      # Mask for the T output will be base on the output of T - 1. In the case
+      # T = 0, a zero filled tensor will be used.
+      flat_zero_output = tuple(array_ops.zeros_like(o)
+                               for o in nest.flatten(output_time_zero))
+      def _step(time, output_ta_t, prev_output, *states):
         """RNN step function.
 
         Arguments:
             time: Current timestep value.
             output_ta_t: TensorArray.
+            prev_output: tuple of outputs from time - 1.
             *states: List of states.
 
         Returns:
-            Tuple: `(time + 1,output_ta_t) + tuple(new_states)`
+            Tuple: `(time + 1, output_ta_t, output) + tuple(new_states)`
         """
-        current_input = input_ta.read(time)
+        current_input = tuple(ta.read(time) for ta in input_ta)
+        # maybe set shape.
+        current_input = nest.pack_sequence_as(inputs, current_input)
         mask_t = mask_ta.read(time)
         output, new_states = step_function(current_input,
                                            tuple(states) + tuple(constants))
-        if getattr(output, '_uses_learning_phase', False):
-          global uses_learning_phase  # pylint: disable=global-variable-undefined
-          uses_learning_phase = True
-        for state, new_state in zip(states, new_states):
+        # mask output
+        flat_output = nest.flatten(output)
+        flat_mask_output = (flat_zero_output if zero_output_for_mask
+                            else nest.flatten(prev_output))
+        tiled_mask_t = tuple(_expand_mask(mask_t, o) for o in flat_output)
+        flat_new_output = tuple(
+            array_ops.where(m, o, zo) for m, o, zo in zip(
+                tiled_mask_t, flat_output, flat_mask_output))
+
+        # mask states
+        flat_state = nest.flatten(states)
+        flat_new_state = nest.flatten(new_states)
+        for state, new_state in zip(flat_state, flat_new_state):
           new_state.set_shape(state.shape)
-        tiled_mask_t = array_ops.tile(mask_t,
-                                      array_ops.stack(
-                                          [1, array_ops.shape(output)[1]]))
-        output = array_ops.where(tiled_mask_t, output, states[0])
-
-        masked_states = []
-        for i in range(len(states)):
-          states_dim = array_ops.shape(new_states[i])[1]
-          stacked_states_dim = array_ops.stack([1, states_dim])
-          tiled_mask = array_ops.tile(mask_t, stacked_states_dim)
-          masked_state = array_ops.where(tiled_mask, new_states[i], states[i])
-          masked_states.append(masked_state)
-        new_states = masked_states
-
-        output_ta_t = output_ta_t.write(time, output)
-        return (time + 1, output_ta_t) + tuple(new_states)
+        tiled_mask_t = tuple(_expand_mask(mask_t, s) for s in flat_state)
+        flat_final_state = tuple(
+            array_ops.where(m, s, ps)
+            for m, s, ps in zip(tiled_mask_t, flat_new_state, flat_state))
+        new_states = nest.pack_sequence_as(new_states, flat_final_state)
+
+        output_ta_t = tuple(
+            ta.write(time, out)
+            for ta, out in zip(output_ta_t, flat_new_output))
+        return (time + 1, output_ta_t,
+                tuple(flat_new_output)) + tuple(new_states)
+
+      final_outputs = control_flow_ops.while_loop(
+          body=_step,
+          loop_vars=(time, output_ta, flat_zero_output) + states,
+          **while_loop_kwargs)
+      # Skip final_outputs[2] which is the output for final timestep.
+      new_states = final_outputs[3:]
     else:
-
       def _step(time, output_ta_t, *states):
         """RNN step function.
 
@@ -3283,43 +3554,48 @@ def rnn(step_function,
         Returns:
             Tuple: `(time + 1,output_ta_t) + tuple(new_states)`
         """
-        current_input = input_ta.read(time)
+        current_input = tuple(ta.read(time) for ta in input_ta)
+        current_input = nest.pack_sequence_as(inputs, current_input)
         output, new_states = step_function(current_input,
                                            tuple(states) + tuple(constants))
-        if getattr(output, '_uses_learning_phase', False):
-          global uses_learning_phase  # pylint: disable=global-variable-undefined
-          uses_learning_phase = True
-        for state, new_state in zip(states, new_states):
+        flat_state = nest.flatten(states)
+        flat_new_state = nest.flatten(new_states)
+        for state, new_state in zip(flat_state, flat_new_state):
           new_state.set_shape(state.shape)
-        output_ta_t = output_ta_t.write(time, output)
+
+        flat_output = nest.flatten(output)
+        output_ta_t = tuple(
+            ta.write(time, out) for ta, out in zip(output_ta_t, flat_output))
+        new_states = nest.pack_sequence_as(initial_states, flat_new_state)
         return (time + 1, output_ta_t) + tuple(new_states)
 
-    final_outputs = control_flow_ops.while_loop(
-        cond=lambda time, *_: time < time_steps,
-        body=_step,
-        loop_vars=(time, output_ta) + states,
-        maximum_iterations=input_length,
-        parallel_iterations=32,
-        swap_memory=True)
-    last_time = final_outputs[0]
+      final_outputs = control_flow_ops.while_loop(
+          body=_step,
+          loop_vars=(time, output_ta) + states,
+          **while_loop_kwargs)
+      new_states = final_outputs[2:]
+
     output_ta = final_outputs[1]
-    new_states = final_outputs[2:]
 
-    outputs = output_ta.stack()
-    last_output = output_ta.read(last_time - 1)
+    outputs = tuple(o.stack() for o in output_ta)
+    last_output = tuple(o[-1] for o in outputs)
 
-  if not time_major:
-    axes = [1, 0] + list(range(2, len(outputs.shape)))
-    outputs = array_ops.transpose(outputs, axes)
+    outputs = nest.pack_sequence_as(output_time_zero, outputs)
+    last_output = nest.pack_sequence_as(output_time_zero, last_output)
 
-  # Static shape inference: (samples, time, ...) or (time, sample, ...)
-  outputs_shape = outputs.shape.as_list()
-  outputs_shape[0] = inputs_shape[0]
-  outputs_shape[1] = inputs_shape[1]
-  outputs.set_shape(outputs_shape)
+  # static shape inference
+  def set_shape(output_):
+    shape = output_.shape.as_list()
+    shape[0] = time_steps
+    shape[1] = batch
+    output_.set_shape(shape)
+    return output_
+
+  outputs = nest.map_structure(set_shape, outputs)
+
+  if not time_major:
+    outputs = nest.map_structure(swap_batch_timestep, outputs)
 
-  if not context.executing_eagerly():
-    last_output._uses_learning_phase = uses_learning_phase
   return last_output, outputs, new_states
 
 
@@ -3408,17 +3684,14 @@ def in_train_phase(x, alt, training=None):
   """
   if training is None:
     training = learning_phase()
-    uses_learning_phase = True
-  else:
-    uses_learning_phase = False
 
-  if training is 1 or training is True:
+  if training == 1 or training is True:
     if callable(x):
       return x()
     else:
       return x
 
-  elif training is 0 or training is False:
+  elif training == 0 or training is False:
     if callable(alt):
       return alt()
     else:
@@ -3426,8 +3699,6 @@ def in_train_phase(x, alt, training=None):
 
   # else: assume learning phase is a placeholder tensor.
   x = switch(training, x, alt)
-  if uses_learning_phase:
-    x._uses_learning_phase = True
   return x
 
 
@@ -3796,19 +4067,23 @@ def _preprocess_conv1d_input(x, data_format):
   return x, tf_data_format
 
 
-def _preprocess_conv2d_input(x, data_format):
+def _preprocess_conv2d_input(x, data_format, force_transpose=False):
   """Transpose and cast the input before the conv2d.
 
   Arguments:
       x: input tensor.
       data_format: string, `"channels_last"` or `"channels_first"`.
+      force_transpose: Boolean. If True, the input will always be transposed
+          from NCHW to NHWC if `data_format` is `"channels_first"`.
+          If False, the transposition only occurs on CPU (GPU ops are
+          assumed to support NCHW).
 
   Returns:
       A tensor.
   """
   tf_data_format = 'NHWC'
   if data_format == 'channels_first':
-    if not _has_nchw_support():
+    if not _has_nchw_support() or force_transpose:
       x = array_ops.transpose(x, (0, 2, 3, 1))  # NCHW -> NHWC
     else:
       tf_data_format = 'NCHW'
@@ -3956,7 +4231,8 @@ def conv2d_transpose(x,
                      output_shape,
                      strides=(1, 1),
                      padding='valid',
-                     data_format=None):
+                     data_format=None,
+                     dilation_rate=(1, 1)):
   """2D deconvolution (i.e.
 
   transposed convolution).
@@ -3970,6 +4246,7 @@ def conv2d_transpose(x,
       data_format: string, `"channels_last"` or `"channels_first"`.
           Whether to use Theano or TensorFlow/CNTK data format
           for inputs/kernels/outputs.
+      dilation_rate: Tuple of 2 integers.
 
   Returns:
       A tensor, result of transposed 2D convolution.
@@ -3985,7 +4262,13 @@ def conv2d_transpose(x,
   if isinstance(output_shape, (tuple, list)):
     output_shape = array_ops.stack(output_shape)
 
-  x, tf_data_format = _preprocess_conv2d_input(x, data_format)
+  # `atrous_conv2d_transpose` only supports NHWC format, even on GPU.
+  if data_format == 'channels_first' and dilation_rate != (1, 1):
+    force_transpose = True
+  else:
+    force_transpose = False
+
+  x, tf_data_format = _preprocess_conv2d_input(x, data_format, force_transpose)
 
   if data_format == 'channels_first' and tf_data_format == 'NHWC':
     output_shape = (output_shape[0], output_shape[2], output_shape[3],
@@ -4000,13 +4283,18 @@ def conv2d_transpose(x,
   else:
     strides = (1, 1) + strides
 
-  x = nn.conv2d_transpose(
-      x,
-      kernel,
-      output_shape,
-      strides,
-      padding=padding,
-      data_format=tf_data_format)
+  if dilation_rate == (1, 1):
+    x = nn.conv2d_transpose(x, kernel, output_shape, strides,
+                            padding=padding,
+                            data_format=tf_data_format)
+  else:
+    assert dilation_rate[0] == dilation_rate[1]
+    x = nn.atrous_conv2d_transpose(
+        x,
+        kernel,
+        output_shape,
+        rate=dilation_rate[0],
+        padding=padding)
   if data_format == 'channels_first' and tf_data_format == 'NHWC':
     x = array_ops.transpose(x, (0, 3, 1, 2))  # NHWC -> NCHW
   return x
@@ -4110,6 +4398,8 @@ def separable_conv2d(x,
     data_format = image_data_format()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('Unknown data_format: ' + str(data_format))
+  if len(strides) != 2:
+    raise ValueError('`strides` must be a tuple of 2 integers.')
 
   x, tf_data_format = _preprocess_conv2d_input(x, data_format)
   padding = _preprocess_padding(padding)
@@ -4316,6 +4606,10 @@ def pool2d(x,
     data_format = image_data_format()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('Unknown data_format: ' + str(data_format))
+  if len(pool_size) != 2:
+    raise ValueError('`pool_size` must be a tuple of 2 integers.')
+  if len(strides) != 2:
+    raise ValueError('`strides` must be a tuple of 2 integers.')
 
   x, tf_data_format = _preprocess_conv2d_input(x, data_format)
   padding = _preprocess_padding(padding)
@@ -4897,7 +5191,6 @@ def foldr(fn, elems, initializer=None, name=None):
   """
   return functional_ops.foldr(fn, elems, initializer=initializer, name=name)
 
-
 # Load Keras default configuration from config file if present.
 # Set Keras base dir path given KERAS_HOME env variable, if applicable.
 # Otherwise either ~/.keras or /tmp.
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index ab7158994060d0d2f074d56d1e44bd926b9ca67d..af01b46fa9a4a45201de930cfb7827ac1d2bafbd 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -23,9 +23,14 @@ import scipy.sparse
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
@@ -88,6 +93,7 @@ def compare_two_inputs_op_to_numpy(keras_op,
                          str(keras_output))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BackendUtilsTest(test.TestCase):
 
   def test_backend(self):
@@ -129,8 +135,9 @@ class BackendUtilsTest(test.TestCase):
       keras.backend.set_learning_phase(0)
       x = keras.Input((3,))
       y = keras.layers.BatchNormalization()(x)
-      sess.run(variables.global_variables_initializer())
-      sess.run(y, feed_dict={x: np.random.random((2, 3))})
+      if not context.executing_eagerly():
+        self.evaluate(variables.global_variables_initializer())
+        sess.run(y, feed_dict={x: np.random.random((2, 3))})
 
   def test_learning_phase_scope(self):
     with self.cached_session():
@@ -149,22 +156,29 @@ class BackendUtilsTest(test.TestCase):
       self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
 
   def test_int_shape(self):
-    x = keras.backend.placeholder(shape=(3, 4))
+    x = keras.backend.ones(shape=(3, 4))
     self.assertEqual(keras.backend.int_shape(x), (3, 4))
 
-    x = keras.backend.placeholder(shape=(None, 4))
-    self.assertEqual(keras.backend.int_shape(x), (None, 4))
+    if not context.executing_eagerly():
+      x = keras.backend.placeholder(shape=(None, 4))
+      self.assertEqual(keras.backend.int_shape(x), (None, 4))
 
   def test_in_train_phase(self):
     with self.cached_session():
       y1 = keras.backend.variable(1)
       y2 = keras.backend.variable(2)
-      y = keras.backend.in_train_phase(y1, y2)
-      f = keras.backend.function([keras.backend.learning_phase()], [y])
-      y_val = f([0])[0]
-      self.assertAllClose(y_val, 2)
-      y_val = f([1])[0]
-      self.assertAllClose(y_val, 1)
+      if context.executing_eagerly():
+        with keras.backend.learning_phase_scope(0):
+          y_val_test = keras.backend.in_train_phase(y1, y2).numpy()
+        with keras.backend.learning_phase_scope(1):
+          y_val_train = keras.backend.in_train_phase(y1, y2).numpy()
+      else:
+        y = keras.backend.in_train_phase(y1, y2)
+        f = keras.backend.function([keras.backend.learning_phase()], [y])
+        y_val_test = f([0])[0]
+        y_val_train = f([1])[0]
+      self.assertAllClose(y_val_test, 2)
+      self.assertAllClose(y_val_train, 1)
 
   def test_is_keras_tensor(self):
     x = keras.backend.variable(1)
@@ -174,164 +188,20 @@ class BackendUtilsTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.backend.is_keras_tensor(0)
 
-  def test_is_placeholder(self):
-    x = keras.backend.placeholder(shape=(1,))
-    self.assertEqual(keras.backend.is_placeholder(x), True)
-    # Test with TF placeholder
-    x = keras.backend.array_ops.placeholder(dtype='float32', shape=(1,))
-    self.assertEqual(keras.backend.is_placeholder(x), True)
-    x = keras.backend.variable(1)
-    self.assertEqual(keras.backend.is_placeholder(x), False)
-
   def test_stop_gradient(self):
     x = keras.backend.variable(1)
     y = keras.backend.stop_gradient(x)
-    self.assertEqual(y.op.name[:12], 'StopGradient')
+    if not context.executing_eagerly():
+      self.assertEqual(y.op.name[:12], 'StopGradient')
 
     xs = [keras.backend.variable(1) for _ in range(3)]
     ys = keras.backend.stop_gradient(xs)
-    for y in ys:
-      self.assertEqual(y.op.name[:12], 'StopGradient')
-
-  def test_function_tf_feed_symbols(self):
-    with self.cached_session():
-      # Test feeding a resource variable to `function`.
-      x1 = keras.backend.placeholder(shape=())
-      x2 = keras.backend.placeholder(shape=())
-      lr = keras.backend.learning_phase()  # Include a placeholder_with_default.
-
-      y1 = keras.backend.variable(10.)
-      y2 = 3
-
-      f = keras.backend.function(
-          inputs=[x1, x2, lr],
-          outputs=[x1 + 1,
-                   keras.backend.in_train_phase(x2 + 2, x2 - 1)])
-      outs = f([y1, y2, None])  # Use default learning_phase value.
-      self.assertEqual(outs, [11., 2.])
-      outs = f([y1, y2, 1])  # Set learning phase value.
-      self.assertEqual(outs, [11., 5.])
-
-      # Test triggering a callable refresh by changing the input.
-      y3 = keras.backend.constant(20.)  # Test with tensor
-      outs = f([y3, y2, None])
-      self.assertEqual(outs, [21., 2.])
-
-      y4 = 4  # Test with non-symbol
-      outs = f([y4, y2, None])
-      self.assertEqual(outs, [5., 2.])
-
-      # Test with a different dtype
-      y5 = keras.backend.constant(10., dtype='float64')
-      outs = f([y5, y2, None])
-      self.assertEqual(outs, [11., 2.])
-
-  def test_function_tf_fetches(self):
-    # Additional operations can be passed to tf.Session().run() via its
-    # `fetches` arguments. In contrast to `updates` argument of
-    # keras.backend.function() these do not have control dependency on `outputs`
-    # so they can run in parallel. Also they should not contribute to output of
-    # keras.backend.function().
-    with self.cached_session():
-      x = keras.backend.variable(0.)
-      y = keras.backend.variable(0.)
-      x_placeholder = keras.backend.placeholder(shape=())
-      y_placeholder = keras.backend.placeholder(shape=())
-
-      f = keras.backend.function(inputs=[x_placeholder, y_placeholder],
-                                 outputs=[x_placeholder + y_placeholder],
-                                 updates=[(x, x_placeholder + 1.)],
-                                 fetches=[keras.backend.update(y, 5.)])
-      output = f([10., 20.])
-      self.assertEqual(output, [30.])
-      self.assertEqual(
-          keras.backend.get_session().run(fetches=[x, y]), [11., 5.])
-
-  def test_function_tf_feed_dict(self):
-    # Additional substitutions can be passed to `tf.Session().run()` via its
-    # `feed_dict` arguments. Note that the feed_dict is passed once in the
-    # constructor but we can modify the values in the dictionary. Through
-    # this feed_dict we can provide additional substitutions besides Keras
-    # inputs.
-    with self.cached_session():
-      x = keras.backend.variable(0.)
-      y = keras.backend.variable(0.)
-      x_placeholder = keras.backend.placeholder(shape=())
-      y_placeholder = keras.backend.placeholder(shape=())
-
-      feed_dict = {y_placeholder: 3.}
-      fetches = [keras.backend.update(y, y_placeholder * 10.)]
-      f = keras.backend.function(inputs=[x_placeholder],
-                                 outputs=[x_placeholder + 1.],
-                                 updates=[(x, x_placeholder + 10.)],
-                                 feed_dict=feed_dict,
-                                 fetches=fetches)
-      output = f([10.])
-      self.assertEqual(output, [11.])
-      self.assertEqual(
-          keras.backend.get_session().run(fetches=[x, y]), [20., 30.])
-
-      # updated value in feed_dict will be modified within the K.function()
-      feed_dict[y_placeholder] = 4.
-      output = f([20.])
-      self.assertEqual(output, [21.])
-      self.assertEqual(
-          keras.backend.get_session().run(fetches=[x, y]), [30., 40.])
-
-  def test_function_tf_run_options_with_run_metadata(self):
-    with self.cached_session():
-      x_placeholder = keras.backend.placeholder(shape=())
-      y_placeholder = keras.backend.placeholder(shape=())
-
-      run_options = config_pb2.RunOptions(output_partition_graphs=True)
-      run_metadata = config_pb2.RunMetadata()
-      # enable run_options.
-      f = keras.backend.function(inputs=[x_placeholder, y_placeholder],
-                                 outputs=[x_placeholder + y_placeholder],
-                                 options=run_options,
-                                 run_metadata=run_metadata)
-      output = f([10., 20.])
-      self.assertEqual(output, [30.])
-      self.assertGreater(len(run_metadata.partition_graphs), 0)
-      # disable run_options.
-      f1 = keras.backend.function(inputs=[x_placeholder, y_placeholder],
-                                  outputs=[x_placeholder + y_placeholder],
-                                  run_metadata=run_metadata)
-      output1 = f1([10., 20.])
-      self.assertEqual(output1, [30.])
-      self.assertEqual(len(run_metadata.partition_graphs), 0)
-
-  def test_function_fetch_callbacks(self):
-
-    class CallbackStub(object):
-
-      def __init__(self):
-        self.times_called = 0
-        self.callback_result = 0
-
-      def _fetch_callback(self, result):
-        self.times_called += 1
-        self.callback_result = result
-
-    with self.cached_session():
-      callback = CallbackStub()
-      x_placeholder = keras.backend.placeholder(shape=())
-      y_placeholder = keras.backend.placeholder(shape=())
-
-      callback_op = x_placeholder * y_placeholder
-
-      f = keras.backend.function(
-          inputs=[x_placeholder, y_placeholder],
-          outputs=[x_placeholder + y_placeholder])
-      f.fetches.append(callback_op)
-      f.fetch_callbacks[callback_op] = callback._fetch_callback
-
-      _ = f([10., 20.])
-
-      self.assertEqual(callback.times_called, 1)
-      self.assertEqual(callback.callback_result, 200)
+    if not context.executing_eagerly():
+      for y in ys:
+        self.assertEqual(y.op.name[:12], 'StopGradient')
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BackendVariableTest(test.TestCase):
 
   def test_zeros(self):
@@ -404,23 +274,18 @@ class BackendVariableTest(test.TestCase):
       y = keras.backend.to_dense(x)
       self.assertFalse(keras.backend.is_sparse(y))
 
-  def test_placeholder(self):
-    x = keras.backend.placeholder(shape=(3, 4))
-    self.assertEqual(x.get_shape().as_list(), [3, 4])
-    x = keras.backend.placeholder(shape=(3, 4), sparse=True)
-    self.assertEqual(x.get_shape().as_list(), [3, 4])
-
 
+@test_util.run_all_in_graph_and_eager_modes
 class BackendLinearAlgebraTest(test.TestCase):
 
   def test_dot(self):
-    x = keras.backend.placeholder(shape=(2, 3))
-    y = keras.backend.placeholder(shape=(3, 4))
+    x = keras.backend.ones(shape=(2, 3))
+    y = keras.backend.ones(shape=(3, 4))
     xy = keras.backend.dot(x, y)
     self.assertEqual(xy.get_shape().as_list(), [2, 4])
 
-    x = keras.backend.placeholder(shape=(32, 28, 3))
-    y = keras.backend.placeholder(shape=(3, 4))
+    x = keras.backend.ones(shape=(32, 28, 3))
+    y = keras.backend.ones(shape=(3, 4))
     xy = keras.backend.dot(x, y)
     self.assertEqual(xy.get_shape().as_list(), [32, 28, 4])
 
@@ -524,7 +389,8 @@ class BackendLinearAlgebraTest(test.TestCase):
 
       # alpha (leaky relu used)
       relu_op = keras.backend.relu(x, alpha=0.5)
-      self.assertTrue('LeakyRelu' in relu_op.name)
+      if not context.executing_eagerly():
+        self.assertTrue('LeakyRelu' in relu_op.name)
       self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
 
       # max_value < some elements
@@ -533,7 +399,8 @@ class BackendLinearAlgebraTest(test.TestCase):
 
       # nn.relu6 used
       relu_op = keras.backend.relu(x, max_value=6)
-      self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
+      if not context.executing_eagerly():
+        self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
       self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
 
       # max value > 6
@@ -577,6 +444,7 @@ class BackendLinearAlgebraTest(test.TestCase):
       self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BackendShapeOpsTest(test.TestCase):
 
   def test_reshape(self):
@@ -662,9 +530,10 @@ class BackendShapeOpsTest(test.TestCase):
     self.assertEqual(y.get_shape().as_list(), [1, 9, 2])
 
     # Use with a dynamic axis:
-    x = keras.backend.placeholder(shape=(2, None, 2))
-    y = keras.backend.repeat_elements(x, 3, axis=1)
-    self.assertEqual(y.get_shape().as_list(), [2, None, 2])
+    if not context.executing_eagerly():
+      x = keras.backend.placeholder(shape=(2, None, 2))
+      y = keras.backend.repeat_elements(x, 3, axis=1)
+      self.assertEqual(y.get_shape().as_list(), [2, None, 2])
 
   def test_repeat(self):
     x = keras.backend.variable(np.ones((1, 3)))
@@ -779,6 +648,7 @@ class BackendShapeOpsTest(test.TestCase):
           np_kwargs={'data_format': 'channels_first'})
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
 
   def test_bias_add(self):
@@ -798,7 +668,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                      input_shape_a=(4, 3, 5, 2, 7),
                                      input_shape_b=(7,))
 
-      with self.assertRaises(ValueError):
+      with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
         x = keras.backend.variable((3, 4))
         b = keras.backend.variable((3, 4))
         keras.backend.bias_add(x, b)
@@ -1199,13 +1069,13 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                                              initial_states,
                                                              **kwargs)
         # check static shape inference
-        self.assertEquals(last_output.get_shape().as_list(),
-                          [num_samples, output_dim])
-        self.assertEquals(outputs.get_shape().as_list(),
-                          [num_samples, timesteps, output_dim])
+        self.assertEqual(last_output.get_shape().as_list(),
+                         [num_samples, output_dim])
+        self.assertEqual(outputs.get_shape().as_list(),
+                         [num_samples, timesteps, output_dim])
         for state in new_states:
-          self.assertEquals(state.get_shape().as_list(),
-                            [num_samples, output_dim])
+          self.assertEqual(state.get_shape().as_list(),
+                           [num_samples, output_dim])
 
         last_output_list[i].append(keras.backend.eval(last_output))
         outputs_list[i].append(keras.backend.eval(outputs))
@@ -1277,8 +1147,11 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
 
     rnn_fn = rnn_step_fn()
     inputs = keras.backend.variable(input_val)
-    initial_states = [keras.backend.variable(init_state_val),
-                      np.concatenate([init_state_val, init_state_val], axis=-1)]
+    initial_states = [
+        keras.backend.variable(init_state_val),
+        ops.convert_to_tensor(
+            np.concatenate([init_state_val, init_state_val], axis=-1))
+    ]
     mask = keras.backend.variable(np_mask)
 
     kwargs_list = [
@@ -1300,7 +1173,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         self.assertEqual(outputs.get_shape().as_list(),
                          [num_samples, timesteps, output_dim])
         # for state in new_states:
-        #   self.assertEquals(state.get_shape().as_list(),
+        #   self.assertEqual(state.get_shape().as_list(),
         #                     [num_samples, output_dim])
         self.assertEqual(new_states[0].get_shape().as_list(),
                          [num_samples, output_dim])
@@ -1350,6 +1223,121 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
       for s, u_s in zip(additional_state_list[2], additional_state_list[3]):
         self.assertAllClose(s, u_s, atol=1e-04)
 
+  def test_rnn_output_and_state_masking_independent(self):
+    num_samples = 2
+    num_timesteps = 4
+    state_and_io_size = 2
+    mask_last_num_timesteps = 2  # for second sample only
+
+    # a step function that just outputs inputs,
+    # but increments states +1 per timestep
+    def step_function(inputs, states):
+      return inputs, [s + 1 for s in states]
+
+    inputs_vals = np.random.random((num_samples, num_timesteps,
+                                    state_and_io_size))
+    initial_state_vals = np.random.random((num_samples, state_and_io_size))
+    # masking of two last timesteps for second sample only
+    mask_vals = np.ones((num_samples, num_timesteps))
+    mask_vals[1, -mask_last_num_timesteps:] = 0
+
+    # outputs expected to be same as inputs for the first sample
+    expected_outputs = inputs_vals.copy()
+    # but for the second sample all outputs in masked region should be the same
+    # as last output before masked region
+    expected_outputs[1, -mask_last_num_timesteps:] = \
+        expected_outputs[1, -(mask_last_num_timesteps + 1)]
+
+    expected_last_state = initial_state_vals.copy()
+    # first state should be incremented for every timestep (no masking)
+    expected_last_state[0] += num_timesteps
+    # second state should not be incremented for last two timesteps
+    expected_last_state[1] += (num_timesteps - mask_last_num_timesteps)
+
+    # verify same expected output for `unroll=true/false`
+    inputs = keras.backend.variable(inputs_vals)
+    initial_states = [keras.backend.variable(initial_state_vals)]
+    mask = keras.backend.variable(mask_vals)
+    for unroll in [True, False]:
+      _, outputs, last_states = keras.backend.rnn(
+          step_function,
+          inputs,
+          initial_states,
+          mask=mask,
+          unroll=unroll,
+          input_length=num_timesteps if unroll else None)
+
+      self.assertAllClose(keras.backend.eval(outputs), expected_outputs)
+      self.assertAllClose(
+          keras.backend.eval(last_states[0]), expected_last_state)
+
+  def test_rnn_output_num_dim_larger_than_2_masking(self):
+    num_samples = 3
+    num_timesteps = 4
+    num_features = 5
+
+    def step_function(inputs, states):
+      outputs = keras.backend.tile(keras.backend.expand_dims(inputs), [1, 1, 2])
+      return outputs, [keras.backend.identity(s) for s in states]
+      # Note: cannot just return states (which can be a problem) ->
+      # tensorflow/python/ops/resource_variable_ops.py", line 824, in set_shape
+      # NotImplementedError: ResourceVariable does not implement set_shape()
+
+    inputs_vals = np.random.random((num_samples, num_timesteps, num_features))
+    initial_state_vals = np.random.random((num_samples, 6))
+    mask_vals = np.ones((num_samples, num_timesteps))
+    mask_vals[-1, -1] = 0  # final timestep masked for last sample
+
+    expected_outputs = np.repeat(inputs_vals[..., None], repeats=2, axis=-1)
+    # for the last sample, the final timestep (in masked region) should be the
+    # same as the second to final output (before masked region)
+    expected_outputs[-1, -1] = expected_outputs[-1, -2]
+
+    inputs = keras.backend.variable(inputs_vals)
+    initial_states = [keras.backend.variable(initial_state_vals)]
+    mask = keras.backend.variable(mask_vals)
+    for unroll in [True, False]:
+      _, outputs, _ = keras.backend.rnn(
+          step_function,
+          inputs,
+          initial_states,
+          mask=mask,
+          unroll=unroll,
+          input_length=num_timesteps if unroll else None)
+
+      self.assertAllClose(keras.backend.eval(outputs), expected_outputs)
+
+  def test_rnn_state_num_dim_larger_than_2_masking(self):
+    num_samples = 3
+    num_timesteps = 4
+
+    def step_function(inputs, states):
+      return inputs, [s + 1 for s in states]
+
+    inputs_vals = np.random.random((num_samples, num_timesteps, 5))
+    initial_state_vals = np.random.random((num_samples, 6, 7))
+    mask_vals = np.ones((num_samples, num_timesteps))
+    mask_vals[0, -2:] = 0  # final two timesteps masked for first sample
+
+    expected_last_state = initial_state_vals.copy()
+    expected_last_state[0] += (num_timesteps - 2)
+    expected_last_state[1:] += num_timesteps
+
+    inputs = keras.backend.variable(inputs_vals)
+    initial_states = [keras.backend.variable(initial_state_vals)]
+    mask = keras.backend.variable(mask_vals)
+    for unroll in [True, False]:
+      _, _, last_states = keras.backend.rnn(
+          step_function,
+          inputs,
+          initial_states,
+          mask=mask,
+          unroll=unroll,
+          input_length=num_timesteps if unroll else None)
+
+      self.assertAllClose(
+          keras.backend.eval(last_states[0]), expected_last_state)
+
   def test_normalize_batch_in_training(self):
     val = np.random.random((10, 3, 10, 10))
     x = keras.backend.variable(val)
@@ -1382,6 +1370,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(var.get_shape().as_list(), [3,])
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class TestCTC(test.TestCase):
 
   def test_ctc_decode(self):
@@ -1433,6 +1422,7 @@ class TestCTC(test.TestCase):
                 decode_truth[i] == keras.backend.eval(decode_pred_tf[i])))
       self.assertAllClose(log_prob_truth, log_prob_pred)
 
+  @test_util.run_v1_only('b/120545219')
   def test_ctc_batch_cost(self):
     with self.cached_session():
       label_lens = np.expand_dims(np.asarray([5, 4]), 1)
@@ -1487,6 +1477,7 @@ class TestCTC(test.TestCase):
       self.assertAllClose(res[:, 0], ref, atol=1e-05)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class TestRandomOps(test.TestCase):
 
   def test_random_binomial(self):
@@ -1506,12 +1497,242 @@ class TestRandomOps(test.TestCase):
       self.assertAllClose(np.min(y), -2., atol=0.1)
 
   def test_string_input(self):
-    seq = keras.Sequential([
-        keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
-        keras.layers.Lambda(lambda x: x[0])
-    ])
-    preds = seq.predict([['tensorflow eager']])
-    self.assertEqual(preds.shape, (1,))
+    with self.cached_session():
+      seq = keras.Sequential([
+          keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
+          keras.layers.Lambda(lambda x: x[0])
+      ])
+      preds = seq.predict([['tensorflow eager']])
+      self.assertEqual(preds.shape, (1,))
+
+
+class BackendGraphTests(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def test_is_placeholder(self):
+    x = keras.backend.placeholder(shape=(1,))
+    self.assertEqual(keras.backend.is_placeholder(x), True)
+    # Test with TF placeholder
+    x = keras.backend.array_ops.placeholder(dtype='float32', shape=(1,))
+    self.assertEqual(keras.backend.is_placeholder(x), True)
+    x = keras.backend.variable(1)
+    self.assertEqual(keras.backend.is_placeholder(x), False)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_basics(self):
+    x1 = keras.backend.placeholder(shape=(), dtype='float32')
+    x2 = keras.backend.placeholder(shape=(), dtype='int32')
+    v = keras.backend.variable(10.)
+    with keras.backend.get_graph().as_default():
+      y1 = x1 + keras.backend.cast(x2, 'float32') + v
+      y2 = x1 * keras.backend.cast(x2, 'float32')
+      with ops.control_dependencies([y1]):
+        u = keras.backend.update(v, 5.)
+    f = keras.backend.function([x1, x2], [y1, y2], updates=[u])
+    output_values = f([2, 3])
+    self.assertEqual(output_values, [15., 6.])
+    self.assertEqual(keras.backend.eval(v), 5.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_placeholder_with_default(self):
+    with keras.backend.get_graph().as_default():
+      x1 = array_ops.placeholder_with_default(
+          np.array(2., dtype='float32'), shape=())
+      x2 = array_ops.placeholder_with_default(
+          np.array(3, dtype='int32'), shape=())
+    y1 = x1 + keras.backend.cast(x2, 'float32')
+    y2 = x1 * keras.backend.cast(x2, 'float32')
+    f = keras.backend.function([x1, x2], [y1, y2])
+    output_values = f([4, 5])
+    self.assertEqual(output_values, [9., 20.])
+    output_values = f([None, None])
+    self.assertEqual(output_values, [5., 6.])
+
+  @test_util.run_deprecated_v1
+  def test_function_tf_feed_symbols(self):
+    # Test Keras backend functions with TF tensor inputs.
+    with self.cached_session():
+      # Test feeding a resource variable to `function`.
+      x1 = keras.backend.placeholder(shape=())
+      x2 = keras.backend.placeholder(shape=())
+      lr = keras.backend.learning_phase()  # Include a placeholder_with_default.
+
+      y1 = keras.backend.variable(10.)
+      y2 = 3
+
+      f = keras.backend.function(
+          inputs=[x1, x2, lr],
+          outputs=[x1 + 1, keras.backend.in_train_phase(x2 + 2, x2 - 1)])
+      outs = f([y1, y2, None])  # Use default learning_phase value.
+      self.assertEqual(outs, [11., 2.])
+      outs = f([y1, y2, 1])  # Set learning phase value.
+      self.assertEqual(outs, [11., 5.])
+
+      # Test triggering a callable refresh by changing the input.
+      y3 = keras.backend.constant(20.)  # Test with tensor
+      outs = f([y3, y2, None])
+      self.assertEqual(outs, [21., 2.])
+
+      y4 = 4  # Test with non-symbol
+      outs = f([y4, y2, None])
+      self.assertEqual(outs, [5., 2.])
+
+      # Test with a different dtype
+      y5 = keras.backend.constant(10., dtype='float64')
+      outs = f([y5, y2, None])
+      self.assertEqual(outs, [11., 2.])
+
+  @test_util.run_deprecated_v1
+  def test_function_tf_fetches(self):
+    # Additional operations can be passed to tf.Session().run() via its
+    # `fetches` arguments. In contrast to `updates` argument of
+    # keras.backend.function() these do not have control dependency on `outputs`
+    # so they can run in parallel. Also they should not contribute to output of
+    # keras.backend.function().
+    with self.cached_session():
+      x = keras.backend.variable(0.)
+      y = keras.backend.variable(0.)
+      x_placeholder = keras.backend.placeholder(shape=())
+      y_placeholder = keras.backend.placeholder(shape=())
+
+      f = keras.backend.function(
+          inputs=[x_placeholder, y_placeholder],
+          outputs=[x_placeholder + y_placeholder],
+          updates=[(x, x_placeholder + 1.)],
+          fetches=[keras.backend.update(y, 5.)])
+      output = f([10., 20.])
+      self.assertEqual(output, [30.])
+      self.assertEqual(keras.backend.get_session().run(fetches=[x, y]),
+                       [11., 5.])
+
+  @test_util.run_deprecated_v1
+  def test_function_tf_feed_dict(self):
+    # Additional substitutions can be passed to `tf.Session().run()` via its
+    # `feed_dict` arguments. Note that the feed_dict is passed once in the
+    # constructor but we can modify the values in the dictionary. Through
+    # this feed_dict we can provide additional substitutions besides Keras
+    # inputs.
+    with self.cached_session():
+      x = keras.backend.variable(0.)
+      y = keras.backend.variable(0.)
+      x_placeholder = keras.backend.placeholder(shape=())
+      y_placeholder = keras.backend.placeholder(shape=())
+
+      feed_dict = {y_placeholder: 3.}
+      fetches = [keras.backend.update(y, y_placeholder * 10.)]
+      f = keras.backend.function(
+          inputs=[x_placeholder],
+          outputs=[x_placeholder + 1.],
+          updates=[(x, x_placeholder + 10.)],
+          feed_dict=feed_dict,
+          fetches=fetches)
+      output = f([10.])
+      self.assertEqual(output, [11.])
+      self.assertEqual(keras.backend.get_session().run(fetches=[x, y]),
+                       [20., 30.])
+
+      # updated value in feed_dict will be modified within the K.function()
+      feed_dict[y_placeholder] = 4.
+      output = f([20.])
+      self.assertEqual(output, [21.])
+      self.assertEqual(keras.backend.get_session().run(fetches=[x, y]),
+                       [30., 40.])
+
+  @test_util.run_deprecated_v1
+  def test_function_tf_run_options_with_run_metadata(self):
+    with self.cached_session():
+      x_placeholder = keras.backend.placeholder(shape=())
+      y_placeholder = keras.backend.placeholder(shape=())
+
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      run_metadata = config_pb2.RunMetadata()
+      # enable run_options.
+      f = keras.backend.function(
+          inputs=[x_placeholder, y_placeholder],
+          outputs=[x_placeholder + y_placeholder],
+          options=run_options,
+          run_metadata=run_metadata)
+      output = f([10., 20.])
+      self.assertEqual(output, [30.])
+      self.assertGreater(len(run_metadata.partition_graphs), 0)
+      # disable run_options.
+      f1 = keras.backend.function(
+          inputs=[x_placeholder, y_placeholder],
+          outputs=[x_placeholder + y_placeholder],
+          run_metadata=run_metadata)
+      output1 = f1([10., 20.])
+      self.assertEqual(output1, [30.])
+      self.assertEqual(len(run_metadata.partition_graphs), 0)
+
+  @test_util.run_deprecated_v1
+  def test_function_fetch_callbacks(self):
+
+    class CallbackStub(object):
+
+      def __init__(self):
+        self.times_called = 0
+        self.callback_result = 0
+
+      def _fetch_callback(self, result):
+        self.times_called += 1
+        self.callback_result = result
+
+    with self.cached_session():
+      callback = CallbackStub()
+      x_placeholder = keras.backend.placeholder(shape=())
+      y_placeholder = keras.backend.placeholder(shape=())
+
+      callback_op = x_placeholder * y_placeholder
+
+      f = keras.backend.function(
+          inputs=[x_placeholder, y_placeholder],
+          outputs=[x_placeholder + y_placeholder])
+      f.fetches.append(callback_op)
+      f.fetch_callbacks[callback_op] = callback._fetch_callback
+
+      _ = f([10., 20.])
+
+      self.assertEqual(callback.times_called, 1)
+      self.assertEqual(callback.callback_result, 200)
+
+  def test_placeholder(self):
+    x = keras.backend.placeholder(shape=(3, 4))
+    self.assertEqual(x.get_shape().as_list(), [3, 4])
+    x = keras.backend.placeholder(shape=(3, 4), sparse=True)
+    self.assertEqual(x.get_shape().as_list(), [3, 4])
+
+  @test_util.run_deprecated_v1
+  def test_batch_normalization(self):
+    # No eager CPU kernel.
+    g_val = np.random.random((3,))
+    b_val = np.random.random((3,))
+    gamma = keras.backend.variable(g_val)
+    beta = keras.backend.variable(b_val)
+
+    # 3D NHC case
+    val = np.random.random((10, 5, 3))
+    x = keras.backend.variable(val)
+    mean, var = nn.moments(x, (0, 1), None, None, False)
+    normed = keras.backend.batch_normalization(
+        x, mean, var, beta, gamma, axis=-1, epsilon=1e-3)
+    self.assertEqual(normed.shape.as_list(), [10, 5, 3])
+
+    # 4D NHWC case
+    val = np.random.random((10, 5, 5, 3))
+    x = keras.backend.variable(val)
+    mean, var = nn.moments(x, (0, 1, 2), None, None, False)
+    normed = keras.backend.batch_normalization(
+        x, mean, var, beta, gamma, axis=-1, epsilon=1e-3)
+    self.assertEqual(normed.shape.as_list(), [10, 5, 5, 3])
+
+    # 4D NCHW case
+    val = np.random.random((10, 3, 5, 5))
+    x = keras.backend.variable(val)
+    mean, var = nn.moments(x, (0, 2, 3), None, None, False)
+    normed = keras.backend.batch_normalization(
+        x, mean, var, beta, gamma, axis=1, epsilon=1e-3)
+    self.assertEqual(normed.shape.as_list(), [10, 3, 5, 5])
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 6dfbbf3694f2afe021b862ac774b2a8234d55a48..2d7d5a415d422cea300ab722ceacdb83803d3db8 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -19,13 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import deque
-from collections import Iterable
-from collections import OrderedDict
+import collections
 import copy
 import csv
+import io
 import json
-import math
 import os
 import time
 
@@ -36,7 +34,6 @@ from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.engine.training_utils import standardize_input_data
 from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
@@ -55,74 +52,67 @@ except ImportError:
   requests = None
 
 
+# pylint: disable=protected-access
 def configure_callbacks(callbacks,
                         model,
                         do_validation=False,
-                        val_inputs=None,
-                        val_targets=None,
-                        val_sample_weights=None,
                         batch_size=None,
                         epochs=None,
                         steps_per_epoch=None,
                         samples=None,
-                        validation_steps=None,
                         verbose=1,
-                        count_mode='steps'):
+                        count_mode='steps',
+                        mode='train'):
   """Configures callbacks for use in various training loops.
 
   Arguments:
       callbacks: List of Callbacks.
       model: Model being trained.
       do_validation: Whether or not validation loop will be run.
-      val_inputs: Inputs to Model for validation loop. Can be any
-        data format Keras accepts.
-      val_targets: Targets for Model for validation loop. Can be any
-        data format Keras accepts.
-      val_sample_weights: Sample weights for Model for validation loop.
-        Can be any data format Keras accepts.
       batch_size: Number of samples per batch.
       epochs: Number of epoch to train.
       steps_per_epoch: Number of batches to run per training epoch.
       samples: Number of training samples.
-      validation_steps: Number of batches to run per validation epoch.
       verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
       count_mode: One of 'steps' or 'samples'. Per-batch or per-sample count.
+      mode: String. One of 'train', 'test', or 'predict'. Which loop mode to
+        configure callbacks for.
 
   Returns:
       Instance of CallbackList used to control all Callbacks.
   """
-
-  # Add additional callbacks
-  model.history = History()
-  stateful_metric_names = None
-  if hasattr(model, 'stateful_metric_names'):
-    stateful_metric_names = model.stateful_metric_names
-  callbacks = [BaseLogger(stateful_metrics=stateful_metric_names)
-              ] + (callbacks or []) + [model.history]
-  if verbose:
-    callbacks.append(
-        ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names))
+  # Check if callbacks have already been configured.
+  if isinstance(callbacks, CallbackList):
+    return callbacks
+
+  if not callbacks:
+    callbacks = []
+
+  # Add additional callbacks during training.
+  if mode == 'train':
+    model.history = History()
+    stateful_metric_names = None
+    if hasattr(model, 'metrics_names'):
+      stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
+    callbacks = [BaseLogger(stateful_metrics=stateful_metric_names)
+                ] + (callbacks or []) + [model.history]
+    if verbose:
+      callbacks.append(
+          ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names))
   callback_list = CallbackList(callbacks)
 
   # Set callback model
-  callback_model = model._get_callback_model()  # pylint: disable=protected-access
-  if do_validation and val_inputs and not context.executing_eagerly():
-    # Need to create the test_function before start of the first epoch
-    # because TensorBoard callback on_epoch_begin adds summary to the
-    # list of fetches of the test_function
-    callback_model._make_test_function()  # pylint: disable=protected-access
+  callback_model = model._get_callback_model()
   callback_list.set_model(callback_model)
 
   # Set callback parameters
   callback_metrics = []
   # When we have deferred build scenario with iterator input, we will compile
   # when we standardize first batch of data.
-  if model._is_compiled:  # pylint: disable=protected-access
+  if mode != 'predict' and hasattr(model, 'metrics_names'):
     callback_metrics = copy.copy(model.metrics_names)
     if do_validation:
       callback_metrics += ['val_' + n for n in model.metrics_names]
-  if validation_steps is None and isinstance(val_inputs, Sequence):
-    validation_steps = len(val_inputs)
   callback_params = {
       'batch_size': batch_size,
       'epochs': epochs,
@@ -131,26 +121,19 @@ def configure_callbacks(callbacks,
       'verbose': verbose,
       'do_validation': do_validation,
       'metrics': callback_metrics,
-      'validation_steps': validation_steps
   }
   callback_list.set_params(callback_params)
 
-  # Pass validation data to callbacks
-  if not val_inputs:
-    val_data = []
-  elif _is_generator_like(val_inputs):
-    val_data = val_inputs
-  else:
-    val_data = val_inputs + val_targets
-    if val_sample_weights:
-      val_data += val_sample_weights
-    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      val_data += [0.]
-  for cbk in callbacks:
-    cbk.validation_data = val_data
+  if (do_validation and not model._distribution_strategy and
+      not model.run_eagerly):
+    # Need to create the eval_function before start of the first epoch
+    # because TensorBoard callback on_epoch_begin adds summary to the
+    # list of fetches of the eval_function
+    callback_model._make_eval_function()
 
   callback_list.model.stop_training = False
   return callback_list
+# pylint: enable=protected-access
 
 
 def _is_generator_like(data):
@@ -174,6 +157,12 @@ class CallbackList(object):
     self.queue_length = queue_length
     self.params = {}
     self.model = None
+    self._reset_batch_timing()
+
+  def _reset_batch_timing(self):
+    self._delta_t_batch = 0.
+    self._delta_ts = collections.defaultdict(
+        lambda: collections.deque([], maxlen=self.queue_length))
 
   def append(self, callback):
     self.callbacks.append(callback)
@@ -188,72 +177,96 @@ class CallbackList(object):
     for callback in self.callbacks:
       callback.set_model(model)
 
-  def on_epoch_begin(self, epoch, logs=None):
+  def _call_batch_hook(self, mode, hook, batch, logs=None):
+    """Helper function for all batch_{begin | end} methods."""
+    # TODO(omalleyt): add batch hooks for test/predict.
+    if mode != 'train':
+      return
+
+    hook_name = 'on_{mode}_batch_{hook}'.format(mode=mode, hook=hook)
+    if hook == 'begin':
+      self._t_enter_batch = time.time()
+    if hook == 'end':
+      # Batch is ending, calculate batch time.
+      self._delta_t_batch = time.time() - self._t_enter_batch
+
+    logs = logs or {}
+    t_before_callbacks = time.time()
+    for callback in self.callbacks:
+      batch_hook = getattr(callback, hook_name)
+      batch_hook(batch, logs)
+    self._delta_ts[hook_name].append(time.time() - t_before_callbacks)
+
+    delta_t_median = np.median(self._delta_ts[hook_name])
+    if (self._delta_t_batch > 0. and
+        delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
+      logging.warning(
+          'Method (%s) is slow compared '
+          'to the batch update (%f). Check your callbacks.', hook_name,
+          delta_t_median)
+
+  def _call_begin_hook(self, mode):
+    """Helper function for on_{train|test|predict}_begin methods."""
+    # TODO(omalleyt): add test/predict methods.
+    if mode == 'train':
+      self.on_train_begin()
+
+  def _call_end_hook(self, mode):
+    """Helper function for on_{train|test|predict}_end methods."""
+    # TODO(omalleyt): add test/predict methods.
+    if mode == 'train':
+      self.on_train_end()
+
+  def on_batch_begin(self, batch, logs=None):
+    self._call_batch_hook('train', 'begin', batch, logs=logs)
+
+  def on_batch_end(self, batch, logs=None):
+    self._call_batch_hook('train', 'end', batch, logs=logs)
+
+  def on_epoch_begin(self, epoch, logs=None, mode='train'):
     """Called at the start of an epoch.
 
     Arguments:
         epoch: integer, index of epoch.
         logs: dictionary of logs.
+        mode: One of 'train'/'test'/'predict'
     """
-    logs = logs or {}
-    for callback in self.callbacks:
-      callback.on_epoch_begin(epoch, logs)
-    self._delta_t_batch = 0.
-    self._delta_ts_batch_begin = deque([], maxlen=self.queue_length)
-    self._delta_ts_batch_end = deque([], maxlen=self.queue_length)
+    if mode == 'train':
+      logs = logs or {}
+      for callback in self.callbacks:
+        callback.on_epoch_begin(epoch, logs)
+    self._reset_batch_timing()
 
-  def on_epoch_end(self, epoch, logs=None):
+  def on_epoch_end(self, epoch, logs=None, mode='train'):
     """Called at the end of an epoch.
 
     Arguments:
         epoch: integer, index of epoch.
         logs: dictionary of logs.
+        mode: One of 'train'/'test'/'predict'
     """
-    logs = logs or {}
-    for callback in self.callbacks:
-      callback.on_epoch_end(epoch, logs)
+    if mode == 'train':
+      logs = logs or {}
+      for callback in self.callbacks:
+        callback.on_epoch_end(epoch, logs)
 
-  def on_batch_begin(self, batch, logs=None):
-    """Called right before processing a batch.
+  def on_train_batch_begin(self, batch, logs=None):
+    """Called at the beginning of a training batch in `fit` methods.
 
     Arguments:
         batch: integer, index of batch within the current epoch.
         logs: dictionary of logs.
     """
-    logs = logs or {}
-    t_before_callbacks = time.time()
-    for callback in self.callbacks:
-      callback.on_batch_begin(batch, logs)
-    self._delta_ts_batch_begin.append(time.time() - t_before_callbacks)
-    delta_t_median = np.median(self._delta_ts_batch_begin)
-    if (self._delta_t_batch > 0. and
-        delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1):
-      logging.warning('Method on_batch_begin() is slow compared '
-                      'to the batch update (%f). Check your callbacks.',
-                      delta_t_median)
-    self._t_enter_batch = time.time()
+    self._call_batch_hook('train', 'begin', batch, logs=logs)
 
-  def on_batch_end(self, batch, logs=None):
-    """Called at the end of a batch.
+  def on_train_batch_end(self, batch, logs=None):
+    """Called at the end of a training batch in `fit` methods.
 
     Arguments:
         batch: integer, index of batch within the current epoch.
         logs: dictionary of logs.
     """
-    logs = logs or {}
-    if not hasattr(self, '_t_enter_batch'):
-      self._t_enter_batch = time.time()
-    self._delta_t_batch = time.time() - self._t_enter_batch
-    t_before_callbacks = time.time()
-    for callback in self.callbacks:
-      callback.on_batch_end(batch, logs)
-    self._delta_ts_batch_end.append(time.time() - t_before_callbacks)
-    delta_t_median = np.median(self._delta_ts_batch_end)
-    if (self._delta_t_batch > 0. and
-        (delta_t_median > 0.95 * self._delta_t_batch and delta_t_median > 0.1)):
-      logging.warning('Method on_batch_end() is slow compared '
-                      'to the batch update (%f). Check your callbacks.',
-                      delta_t_median)
+    self._call_batch_hook('train', 'end', batch, logs=logs)
 
   def on_train_begin(self, logs=None):
     """Called at the beginning of training.
@@ -329,6 +342,14 @@ class Callback(object):
   def on_batch_end(self, batch, logs=None):
     pass
 
+  def on_train_batch_begin(self, batch, logs=None):
+    # For backwards compatibility
+    self.on_batch_begin(batch, logs=logs)
+
+  def on_train_batch_end(self, batch, logs=None):
+    # For backwards compatibility
+    self.on_batch_end(batch, logs=logs)
+
   def on_train_begin(self, logs=None):
     pass
 
@@ -431,18 +452,20 @@ class ProgbarLogger(Callback):
     self.epochs = self.params['epochs']
 
   def on_epoch_begin(self, epoch, logs=None):
+    self.seen = 0
+    if self.use_steps:
+      self.target = self.params['steps']
+    else:
+      self.target = self.params['samples']
+
     if self.verbose:
-      print('Epoch %d/%d' % (epoch + 1, self.epochs))
-      if self.use_steps:
-        target = self.params['steps']
-      else:
-        target = self.params['samples']
-      self.target = target
+      if self.epochs > 1:
+        print('Epoch %d/%d' % (epoch + 1, self.epochs))
       self.progbar = Progbar(
           target=self.target,
           verbose=self.verbose,
-          stateful_metrics=self.stateful_metrics)
-    self.seen = 0
+          stateful_metrics=self.stateful_metrics,
+          unit_name='step' if self.use_steps else 'sample')
 
   def on_batch_begin(self, batch, logs=None):
     if self.seen < self.target:
@@ -606,24 +629,28 @@ class EarlyStopping(Callback):
   """Stop training when a monitored quantity has stopped improving.
 
   Arguments:
-      monitor: quantity to be monitored.
-      min_delta: minimum change in the monitored quantity
+      monitor: Quantity to be monitored.
+      min_delta: Minimum change in the monitored quantity
           to qualify as an improvement, i.e. an absolute
           change of less than min_delta, will count as no
           improvement.
-      patience: number of epochs with no improvement
+      patience: Number of epochs with no improvement
           after which training will be stopped.
       verbose: verbosity mode.
-      mode: one of {auto, min, max}. In `min` mode,
+      mode: One of `{"auto", "min", "max"}`. In `min` mode,
           training will stop when the quantity
           monitored has stopped decreasing; in `max`
           mode it will stop when the quantity
           monitored has stopped increasing; in `auto`
           mode, the direction is automatically inferred
           from the name of the monitored quantity.
-      baseline: baseline value for the monitored quantity.
+      baseline: Baseline value for the monitored quantity.
           Training will stop if the model doesn't show improvement over the
           baseline.
+      restore_best_weights: Whether to restore model weights from
+          the epoch with the best value of the monitored quantity.
+          If False, the model weights obtained at the last step of
+          training are used.
   """
 
   def __init__(self,
@@ -632,7 +659,8 @@ class EarlyStopping(Callback):
                patience=0,
                verbose=0,
                mode='auto',
-               baseline=None):
+               baseline=None,
+               restore_best_weights=False):
     super(EarlyStopping, self).__init__()
 
     self.monitor = monitor
@@ -642,6 +670,8 @@ class EarlyStopping(Callback):
     self.min_delta = abs(min_delta)
     self.wait = 0
     self.stopped_epoch = 0
+    self.restore_best_weights = restore_best_weights
+    self.best_weights = None
 
     if mode not in ['auto', 'min', 'max']:
       logging.warning('EarlyStopping mode %s is unknown, '
@@ -673,25 +703,37 @@ class EarlyStopping(Callback):
       self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
   def on_epoch_end(self, epoch, logs=None):
-    current = logs.get(self.monitor)
+    current = self.get_monitor_value(logs)
     if current is None:
-      logging.warning('Early stopping conditioned on metric `%s` '
-                      'which is not available. Available metrics are: %s',
-                      self.monitor, ','.join(list(logs.keys())))
       return
     if self.monitor_op(current - self.min_delta, self.best):
       self.best = current
       self.wait = 0
+      if self.restore_best_weights:
+        self.best_weights = self.model.get_weights()
     else:
       self.wait += 1
       if self.wait >= self.patience:
         self.stopped_epoch = epoch
         self.model.stop_training = True
+        if self.restore_best_weights:
+          if self.verbose > 0:
+            print('Restoring model weights from the end of the best epoch.')
+          self.model.set_weights(self.best_weights)
 
   def on_train_end(self, logs=None):
     if self.stopped_epoch > 0 and self.verbose > 0:
       print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))
 
+  def get_monitor_value(self, logs):
+    logs = logs or {}
+    monitor_value = logs.get(self.monitor)
+    if monitor_value is None:
+      logging.warning('Early stopping conditioned on metric `%s` '
+                      'which is not available. Available metrics are: %s',
+                      self.monitor, ','.join(list(logs.keys())))
+    return monitor_value
+
 
 @tf_export('keras.callbacks.RemoteMonitor')
 class RemoteMonitor(Callback):
@@ -781,6 +823,10 @@ class LearningRateScheduler(Callback):
       print('\nEpoch %05d: LearningRateScheduler reducing learning '
             'rate to %s.' % (epoch + 1, lr))
 
+  def on_epoch_end(self, epoch, logs=None):
+    logs = logs or {}
+    logs['lr'] = K.get_value(self.model.optimizer.lr)
+
 
 @tf_export('keras.callbacks.TensorBoard')
 class TensorBoard(Callback):
@@ -835,6 +881,12 @@ class TensorBoard(Callback):
           `embeddings_layer_names`. Numpy array (if the model has a single
           input) or list of Numpy arrays (if the model has multiple inputs).
           Learn [more about embeddings](https://www.tensorflow.org/programmers_guide/embedding)
+      update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
+          writes the losses and metrics to TensorBoard after each batch.
+          The same applies for `'epoch'`. If using an integer, let's say `1000`,
+          the callback will write the metrics and losses to TensorBoard every
+          1000 samples. Note that writing too frequently to TensorBoard
+          can slow down your training.
 
   Raises:
       ValueError: If histogram_freq is set and no validation data is provided.
@@ -858,7 +910,8 @@ class TensorBoard(Callback):
                embeddings_freq=0,
                embeddings_layer_names=None,
                embeddings_metadata=None,
-               embeddings_data=None):
+               embeddings_data=None,
+               update_freq='epoch'):
     super(TensorBoard, self).__init__()
     self.log_dir = log_dir
     self.histogram_freq = histogram_freq
@@ -874,10 +927,17 @@ class TensorBoard(Callback):
     self.batch_size = batch_size
     self._current_batch = 0
     self._total_batches_seen = 0
+    self._total_val_batches_seen = 0
     self.embeddings_freq = embeddings_freq
     self.embeddings_layer_names = embeddings_layer_names
     self.embeddings_metadata = embeddings_metadata
     self.embeddings_data = embeddings_data
+    if update_freq == 'batch':
+      self.update_freq = 1
+    else:
+      self.update_freq = update_freq
+    self._samples_seen = 0
+    self._samples_seen_at_last_write = 0
 
   def _init_writer(self):
     """Sets file writer."""
@@ -956,8 +1016,10 @@ class TensorBoard(Callback):
     # If both embedding_freq and embeddings_data are available, we will
     # visualize embeddings.
     if self.embeddings_freq and self.embeddings_data is not None:
-      self.embeddings_data = standardize_input_data(self.embeddings_data,
-                                                    model.input_names)
+      # Avoid circular dependency.
+      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
+      self.embeddings_data = training_utils.standardize_input_data(
+          self.embeddings_data, model.input_names)
 
       # If embedding_layer_names are not provided, get all of the embedding
       # layers from the model.
@@ -1022,10 +1084,8 @@ class TensorBoard(Callback):
       projector.visualize_embeddings(self.writer, config)
 
   def _fetch_callback(self, summary):
-    self.writer.add_summary(
-        summary,
-        self._epoch + self._current_val_batch / self._validation_batches)
-    self._current_val_batch += 1
+    self.writer.add_summary(summary, self._total_val_batches_seen)
+    self._total_val_batches_seen += 1
 
   def _write_custom_summaries(self, step, logs=None):
     """Writes metrics out as custom scalar summaries.
@@ -1041,55 +1101,48 @@ class TensorBoard(Callback):
       # use v2 summary ops
       with self.writer.as_default(), summary_ops_v2.always_record_summaries():
         for name, value in logs.items():
-          summary_ops_v2.scalar(name, value.item(), step=step)
+          if isinstance(value, np.ndarray):
+            value = value.item()
+          summary_ops_v2.scalar(name, value, step=step)
     else:
       # use FileWriter from v1 summary
       for name, value in logs.items():
+        if isinstance(value, np.ndarray):
+          value = value.item()
         summary = tf_summary.Summary()
         summary_value = summary.value.add()
-        summary_value.simple_value = value.item()
+        summary_value.simple_value = value
         summary_value.tag = name
         self.writer.add_summary(summary, step)
     self.writer.flush()
 
-  def on_train_begin(self, logs=None):
-    """Checks if histogram summaries can be run."""
-    # will never be set when in eager
-    if self.histogram_freq:
-      if self.params.get('validation_steps', None) is not None:
-        self._validation_batches = self.params['validation_steps']
-      elif self.validation_data:
-        self._validation_batches = math.ceil(
-            self.validation_data[0].shape[0] / self.batch_size)
-      else:
-        raise ValueError('If printing histograms, validation data must be '
-                         'provided.')
-      if self._validation_batches == 0:
-        raise ValueError(
-            'If printing histograms, validation data must have length > 0.')
-
   def on_batch_end(self, batch, logs=None):
     """Writes scalar summaries for metrics on every training batch."""
     # Don't output batch_size and batch number as Tensorboard summaries
     logs = logs or {}
-    batch_logs = {('batch_' + k): v
-                  for k, v in logs.items()
-                  if k not in ['batch', 'size', 'num_steps']}
-    self._write_custom_summaries(self._total_batches_seen, batch_logs)
+    self._samples_seen += logs.get('size', 1)
+    samples_seen_since = self._samples_seen - self._samples_seen_at_last_write
+    if self.update_freq != 'epoch' and samples_seen_since >= self.update_freq:
+      batch_logs = {('batch_' + k): v
+                    for k, v in logs.items()
+                    if k not in ['batch', 'size', 'num_steps']}
+      self._write_custom_summaries(self._total_batches_seen, batch_logs)
+      self._samples_seen_at_last_write = self._samples_seen
     self._total_batches_seen += 1
 
   def on_epoch_begin(self, epoch, logs=None):
-    """Add histogram op to Model test_function callbacks, reset batch count."""
+    """Add histogram op to Model eval_function callbacks, reset batch count."""
 
     # check if histogram summary should be run for this epoch
     if self.histogram_freq and epoch % self.histogram_freq == 0:
       self._epoch = epoch
-      self._current_val_batch = 0
+      # pylint: disable=protected-access
       # add the histogram summary op if it should run this epoch
-      if self.merged not in self.model.test_function.fetches:
-        self.model.test_function.fetches.append(self.merged)
-        self.model.test_function.fetch_callbacks[
+      if self.merged not in self.model._eval_function.fetches:
+        self.model._eval_function.fetches.append(self.merged)
+        self.model._eval_function.fetch_callbacks[
             self.merged] = self._fetch_callback
+      # pylint: enable=protected-access
 
   def on_epoch_end(self, epoch, logs=None):
     """Checks if summary ops should run next epoch, logs scalar summaries."""
@@ -1099,14 +1152,20 @@ class TensorBoard(Callback):
     logs = {('epoch_' + k): v
             for k, v in logs.items()
             if k not in ['batch', 'size', 'num_steps']}
-    self._write_custom_summaries(epoch, logs)
+    if self.update_freq == 'epoch':
+      step = epoch
+    else:
+      step = self._samples_seen
+    self._write_custom_summaries(step, logs)
 
     # pop the histogram summary op after each epoch
     if self.histogram_freq:
-      if self.merged in self.model.test_function.fetches:
-        self.model.test_function.fetches.remove(self.merged)
-      if self.merged in self.model.test_function.fetch_callbacks:
-        self.model.test_function.fetch_callbacks.pop(self.merged)
+      # pylint: disable=protected-access
+      if self.merged in self.model._eval_function.fetches:
+        self.model._eval_function.fetches.remove(self.merged)
+      if self.merged in self.model._eval_function.fetch_callbacks:
+        self.model._eval_function.fetch_callbacks.pop(self.merged)
+      # pylint: enable=protected-access
 
     if self.embeddings_data is None and self.embeddings_freq:
       raise ValueError('To visualize embeddings, embeddings_data must '
@@ -1138,7 +1197,7 @@ class TensorBoard(Callback):
 
           feed_dict.update({self.batch_id: i, self.step: step})
 
-          if self.model.uses_learning_phase:
+          if not isinstance(K.learning_phase(), int):
             feed_dict[K.learning_phase()] = False
 
           self.sess.run(self.assign_embeddings, feed_dict=feed_dict)
@@ -1305,7 +1364,12 @@ class CSVLogger(Callback):
     self.writer = None
     self.keys = None
     self.append_header = True
-    self.file_flags = 'b' if six.PY2 and os.name == 'nt' else ''
+    if six.PY2:
+      self.file_flags = 'b'
+      self._open_args = {}
+    else:
+      self.file_flags = ''
+      self._open_args = {'newline': '\n'}
     super(CSVLogger, self).__init__()
 
   def on_train_begin(self, logs=None):
@@ -1313,9 +1377,12 @@ class CSVLogger(Callback):
       if os.path.exists(self.filename):
         with open(self.filename, 'r' + self.file_flags) as f:
           self.append_header = not bool(len(f.readline()))
-      self.csv_file = open(self.filename, 'a' + self.file_flags)
+      mode = 'a'
     else:
-      self.csv_file = open(self.filename, 'w' + self.file_flags)
+      mode = 'w'
+    self.csv_file = io.open(self.filename,
+                            mode + self.file_flags,
+                            **self._open_args)
 
   def on_epoch_end(self, epoch, logs=None):
     logs = logs or {}
@@ -1324,7 +1391,7 @@ class CSVLogger(Callback):
       is_zero_dim_ndarray = isinstance(k, np.ndarray) and k.ndim == 0
       if isinstance(k, six.string_types):
         return k
-      elif isinstance(k, Iterable) and not is_zero_dim_ndarray:
+      elif isinstance(k, collections.Iterable) and not is_zero_dim_ndarray:
         return '"[%s]"' % (', '.join(map(str, k)))
       else:
         return k
@@ -1341,14 +1408,18 @@ class CSVLogger(Callback):
       class CustomDialect(csv.excel):
         delimiter = self.sep
 
+      fieldnames = ['epoch'] + self.keys
+      if six.PY2:
+        fieldnames = [unicode(x) for x in fieldnames]
+
       self.writer = csv.DictWriter(
           self.csv_file,
-          fieldnames=['epoch'] + self.keys,
+          fieldnames=fieldnames,
           dialect=CustomDialect)
       if self.append_header:
         self.writer.writeheader()
 
-    row_dict = OrderedDict({'epoch': epoch})
+    row_dict = collections.OrderedDict({'epoch': epoch})
     row_dict.update((key, handle_value(logs[key])) for key in self.keys)
     self.writer.writerow(row_dict)
     self.csv_file.flush()
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 467bc4cdc41fae82caca16c1ce54b8d48530b39b..4a65ade33c7f9c6159ab5cb8f50a06124507dbdd 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import adam
 
 try:
@@ -313,6 +312,42 @@ class KerasCallbacksTest(test.TestCase):
       hist = model.fit(data, labels, callbacks=[stopper], verbose=0, epochs=20)
       assert len(hist.epoch) >= patience
 
+  def test_EarlyStopping_final_weights_when_restoring_model_weights(self):
+
+    class DummyModel(object):
+
+      def __init__(self):
+        self.stop_training = False
+        self.weights = -1
+
+      def get_weights(self):
+        return self.weights
+
+      def set_weights(self, weights):
+        self.weights = weights
+
+      def set_weight_to_epoch(self, epoch):
+        self.weights = epoch
+
+    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss',
+                                               patience=2,
+                                               restore_best_weights=True)
+    early_stop.model = DummyModel()
+    losses = [0.2, 0.15, 0.1, 0.11, 0.12]
+    # The best configuration is in the epoch 2 (loss = 0.1000).
+    epochs_trained = 0
+    early_stop.on_train_begin()
+    for epoch in range(len(losses)):
+      epochs_trained += 1
+      early_stop.model.set_weight_to_epoch(epoch=epoch)
+      early_stop.on_epoch_end(epoch, logs={'val_loss': losses[epoch]})
+      if early_stop.model.stop_training:
+        break
+    # The best configuration is in epoch 2 (loss = 0.1000),
+    # and while patience = 2, we're restoring the best weights,
+    # so we end up at the epoch with the best weights, i.e. epoch 2
+    self.assertEqual(early_stop.model.get_weights(), 2)
+
   def test_RemoteMonitor(self):
     if requests is None:
       return
@@ -368,6 +403,7 @@ class KerasCallbacksTest(test.TestCase):
           float(keras.backend.get_value(
               model.optimizer.lr)) - 0.01 / 4) < keras.backend.epsilon()
 
+  @test_util.run_v1_only('b/120545219')
   def test_ReduceLROnPlateau(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -386,8 +422,7 @@ class KerasCallbacksTest(test.TestCase):
             num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
         model.compile(
             loss='categorical_crossentropy',
-            optimizer=keras.optimizers.SGD(lr=0.1),
-            metrics=['accuracy'])
+            optimizer=keras.optimizers.SGD(lr=0.1))
         return model
 
       model = make_model()
@@ -534,11 +569,15 @@ class KerasCallbacksTest(test.TestCase):
           batch_size=BATCH_SIZE,
           validation_data=(x_test, y_test),
           callbacks=cbks,
-          epochs=1,
+          epochs=2,
           verbose=0)
 
       with open(filepath) as csvfile:
-        output = ' '.join(csvfile.readlines())
+        list_lines = csvfile.readlines()
+        for line in list_lines:
+          assert line.count(sep) == 4
+        assert len(list_lines) == 5
+        output = ' '.join(list_lines)
         assert len(re.findall('epoch', output)) == 1
 
       os.remove(filepath)
@@ -633,9 +672,10 @@ class KerasCallbacksTest(test.TestCase):
           callbacks=cbks,
           epochs=20)
       loss = history.history['loss']
-      assert len(loss) == 1
-      assert loss[0] == np.inf
+      self.assertEqual(len(loss), 1)
+      self.assertEqual(loss[0], np.inf)
 
+  @test_util.run_v1_only('b/120545219')
   def test_TensorBoard(self):
     np.random.seed(1337)
 
@@ -739,78 +779,7 @@ class KerasCallbacksTest(test.TestCase):
           data_generator(True), len(x_train), epochs=2, callbacks=cbks)
       assert os.path.exists(temp_dir)
 
-  def test_TensorBoard_histogram_freq_must_have_validation_data(self):
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-
-    with self.cached_session():
-      filepath = os.path.join(tmpdir, 'logs')
-
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      def data_generator(train):
-        if train:
-          max_batch_index = len(x_train) // BATCH_SIZE
-        else:
-          max_batch_index = len(x_test) // BATCH_SIZE
-        i = 0
-        while 1:
-          if train:
-            # simulate multi-input/output models
-            yield (x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE],
-                   y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE])
-          else:
-            yield (x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE],
-                   y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE])
-          i += 1
-          i %= max_batch_index
-
-      inp = keras.Input((INPUT_DIM,))
-      hidden = keras.layers.Dense(2, activation='relu')(inp)
-      hidden = keras.layers.Dropout(0.1)(hidden)
-      output = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
-      model = keras.models.Model(inputs=inp, outputs=output)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer='sgd',
-                    metrics=['accuracy'])
-
-      # we must generate new callbacks for each test, as they aren't stateless
-      def callbacks_factory(histogram_freq):
-        return [keras.callbacks.TensorBoard(
-            log_dir=filepath,
-            histogram_freq=histogram_freq,
-            write_images=True, write_grads=True,
-            batch_size=5)]
-
-      # fit w/o validation data should raise ValueError if histogram_freq > 0
-      cbs = callbacks_factory(histogram_freq=1)
-      with self.assertRaises(ValueError):
-        model.fit(
-            x_train, y_train, batch_size=BATCH_SIZE, callbacks=cbs, epochs=3)
-
-      for cb in cbs:
-        cb.on_train_end()
-
-      # fit generator without validation data should raise ValueError if
-      # histogram_freq > 0
-      cbs = callbacks_factory(histogram_freq=1)
-      with self.assertRaises(ValueError):
-        model.fit_generator(
-            data_generator(True), len(x_train), epochs=2, callbacks=cbs)
-
-      for cb in cbs:
-        cb.on_train_end()
-
-      # Make sure file writer cache is clear to avoid failures during cleanup.
-      writer_cache.FileWriterCache.clear()
-
+  @test_util.run_v1_only('b/120545219')
   def test_TensorBoard_multi_input_output(self):
     np.random.seed(1337)
     tmpdir = self.get_temp_dir()
@@ -882,6 +851,7 @@ class KerasCallbacksTest(test.TestCase):
                           callbacks=callbacks_factory(histogram_freq=1))
       assert os.path.isdir(filepath)
 
+  @test_util.run_v1_only('b/120545219')
   def test_Tensorboard_histogram_summaries_in_test_function(self):
 
     class FileWriterStub(object):
@@ -957,8 +927,9 @@ class KerasCallbacksTest(test.TestCase):
           epochs=3,
           verbose=0)
 
-      self.assertAllEqual(tsb.writer.steps_seen, [0, 0.5, 1, 1.5, 2, 2.5])
+      self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5])
 
+  @test_util.run_v1_only('b/120545219')
   def test_Tensorboard_histogram_summaries_with_generator(self):
     np.random.seed(1337)
     tmpdir = self.get_temp_dir()
@@ -1090,6 +1061,7 @@ class KerasCallbacksTest(test.TestCase):
 
       assert os.path.exists(temp_dir)
 
+  @test_util.run_deprecated_v1
   def test_Tensorboard_batch_logging(self):
 
     class FileWriterStub(object):
@@ -1115,15 +1087,16 @@ class KerasCallbacksTest(test.TestCase):
     temp_dir = self.get_temp_dir()
     self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
 
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir)
+    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
     tb_cbk.writer = FileWriterStub(temp_dir)
 
     for batch in range(5):
-      tb_cbk.on_batch_end(batch, {'acc': np.float32(batch)})
+      tb_cbk.on_batch_end(batch, {'acc': batch})
     self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
     self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
     self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
 
+  @test_util.run_deprecated_v1
   def test_Tensorboard_epoch_and_batch_logging(self):
 
     class FileWriterStub(object):
@@ -1147,14 +1120,17 @@ class KerasCallbacksTest(test.TestCase):
     temp_dir = self.get_temp_dir()
     self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
 
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir)
+    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
     tb_cbk.writer = FileWriterStub(temp_dir)
 
-    tb_cbk.on_batch_end(0, {'acc': np.float32(5.0)})
-    tb_cbk.on_epoch_end(0, {'acc': np.float32(10.0)})
+    tb_cbk.on_batch_end(0, {'acc': 5.0})
     batch_step, batch_summary = tb_cbk.writer.batch_summary
     self.assertEqual(batch_step, 0)
     self.assertEqual(batch_summary.value[0].simple_value, 5.0)
+
+    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='epoch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+    tb_cbk.on_epoch_end(0, {'acc': 10.0})
     epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
     self.assertEqual(epoch_step, 0)
     self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
@@ -1192,6 +1168,67 @@ class KerasCallbacksTest(test.TestCase):
 
     self.assertTrue(os.path.exists(temp_dir))
 
+  @test_util.run_deprecated_v1
+  def test_TensorBoard_update_freq(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.batch_summaries = []
+        self.epoch_summaries = []
+
+      def add_summary(self, summary, step):
+        if 'batch_' in summary.value[0].tag:
+          self.batch_summaries.append((step, summary))
+        elif 'epoch_' in summary.value[0].tag:
+          self.epoch_summaries.append((step, summary))
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    # Epoch mode
+    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='epoch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(tb_cbk.writer.batch_summaries, [])
+    tb_cbk.on_epoch_end(0, {'acc': 10.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.epoch_summaries), 1)
+
+    # Batch mode
+    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    self.assertFalse(tb_cbk.writer.epoch_summaries)
+
+    # Integer mode
+    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq=20)
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertFalse(tb_cbk.writer.batch_summaries)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    tb_cbk.on_batch_end(0, {'acc': 10.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    self.assertFalse(tb_cbk.writer.epoch_summaries)
+
   def test_RemoteMonitorWithJsonPayload(self):
     if requests is None:
       self.skipTest('`requests` required to run this test')
@@ -1223,9 +1260,11 @@ class KerasCallbacksTest(test.TestCase):
             callbacks=cbks,
             epochs=1)
 
+  @test_util.run_deprecated_v1
   def test_fit_generator_with_callback(self):
 
     class TestCallback(keras.callbacks.Callback):
+
       def set_model(self, model):
         # Check the model operations for the optimizer operations that
         # the _make_train_function adds under a named scope for the
diff --git a/tensorflow/python/keras/engine/__init__.py b/tensorflow/python/keras/engine/__init__.py
index 26aed34766f9e1e2094db7a4c8b66ff057dacc4b..005f6462ffa4e6120c66373f7be9e31d5eac5449 100644
--- a/tensorflow/python/keras/engine/__init__.py
+++ b/tensorflow/python/keras/engine/__init__.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 # TODO(fchollet): Remove hourglass imports once external code is done importing
 # non-public APIs.
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
 
 del absolute_import
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index a75ce30d316eceb251d66a2b2c525bbe09a08c8a..858fa76472b3806f36b76f761043f011a260b66d 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections as collections_lib
-import enum  # pylint: disable=g-bad-import-order
 import functools
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
 
@@ -27,8 +25,8 @@ import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -36,13 +34,15 @@ from tensorflow.python.keras import backend
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import base_layer_utils
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 # A module that only depends on `keras.layers` import these from here.
 from tensorflow.python.keras.utils.generic_utils import to_snake_case  # pylint: disable=unused-import
 from tensorflow.python.keras.utils.tf_utils import is_tensor_or_tensor_list  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import function_utils
@@ -53,20 +53,6 @@ from tensorflow.python.util.tf_export import tf_export
 from tensorflow.tools.docs import doc_controls
 
 
-class CallConvention(enum.Enum):
-  """Calling conventions for passing `Layer` inputs to `Layer.call`."""
-  # The Layer takes inputs as its first argument, named "inputs" for
-  # compatibility with the signature of Layer.__call__. This is the mode assumed
-  # for Layers which are not subclassed Models.
-  EXPLICIT_INPUTS_ARGUMENT = 1
-  # The Layer takes a single positional argument, not named "inputs". It's
-  # treated like an "inputs" argument.
-  SINGLE_POSITIONAL_ARGUMENT = 2
-  # The Layer has multiple positional arguments to which its inputs should be
-  # bound.
-  POSITIONAL_ARGUMENTS_ARE_INPUTS = 3
-
-
 @tf_export('keras.layers.Layer')
 class Layer(checkpointable.CheckpointableBase):
   """Base layer class.
@@ -101,10 +87,6 @@ class Layer(checkpointable.CheckpointableBase):
     name: The name of the layer (string).
     dtype: Default dtype of the layer's weights (default of `None` means use the
       type of the first input).
-    trainable_variables: List of trainable variables.
-    non_trainable_variables: List of non-trainable variables.
-    variables: List of all variables of this layer, trainable and
-      non-trainable.
     updates: List of update ops of this layer.
     losses: List of losses added by this layer.
     trainable_weights: List of variables to be included in backprop.
@@ -149,44 +131,54 @@ class Layer(checkpointable.CheckpointableBase):
     self.built = False
     # Provides information about which inputs are compatible with the layer.
     self.input_spec = None
+    self.supports_masking = False
 
     self._init_set_name(name)
-
-    activity_regularizer = kwargs.pop('activity_regularizer', None)
-    if activity_regularizer and context.executing_eagerly():
-      raise ValueError(
-          ('Activity regularization is not supported when executing eagerly. '
-           'Got activity_regularizer=%s') % (activity_regularizer,))
-    self._activity_regularizer = activity_regularizer
+    self._activity_regularizer = kwargs.pop('activity_regularizer', None)
     self._trainable_weights = []
     self._non_trainable_weights = []
     self._updates = []
     # A list of zero-argument lambdas which return Tensors, used for variable
     # regularizers.
     self._callable_losses = []
-    # A list of Tensors containing activity regularizers and losses manually
-    # added through `add_loss`. Empty when executing eagerly.
+    # A list of symbolic Tensors containing activity regularizers and losses
+    # manually added through `add_loss` in graph-building mode.
     self._losses = []
-    self._in_call = False  # Flag for error checking in add_loss
+    # A list of loss values containing activity regularizers and losses
+    # manually added through `add_loss` during eager execution. It is cleared
+    # after every batch.
+    # Because we plan on eventually allowing a same model instance to be trained
+    # in eager mode or graph mode alternatively, we need to keep track of
+    # eager losses and symbolic losses via separate attributes.
+    self._eager_losses = []
+    # A list of metric instances corresponding to the symbolic metric tensors
+    # added using the `add_metric` API.
+    self._metrics = []
+    # TODO(psv): Remove this property.
+    # A dictionary that maps metric names to metric result tensors. The results
+    # are the running averages of metric values over an epoch.
+    self._metrics_tensors = {}
     self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
     self._call_fn_args = function_utils.fn_args(self.call)
     self._compute_previous_mask = ('mask' in self._call_fn_args or
                                    hasattr(self, 'compute_mask'))
-    self._call_convention = CallConvention.EXPLICIT_INPUTS_ARGUMENT
+    self._call_convention = (base_layer_utils
+                             .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
 
     # These lists will be filled via successive calls
     # to self._add_inbound_node().
     self._inbound_nodes = []
     self._outbound_nodes = []
 
-    self.supports_masking = False
-
     call_argspec = tf_inspect.getfullargspec(self.call)
     if 'training' in call_argspec.args:
       self._expects_training_arg = True
     else:
       self._expects_training_arg = False
 
+    # Whether the `call` method can be used to build a TF graph without issues.
+    self._call_is_graph_friendly = True
+
     # Manage input shape information if passed.
     if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
       # In this case we will later create an input layer
@@ -207,158 +199,447 @@ class Layer(checkpointable.CheckpointableBase):
     else:
       self._initial_weights = None
 
-  def _init_set_name(self, name, zero_based=True):
-    if not name:
-      self._name = unique_layer_name(
-          generic_utils.to_snake_case(self.__class__.__name__),
-          zero_based=zero_based)
-    else:
-      self._name = name
-
-  @property
-  def dtype(self):
-    return self._dtype
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def activity_regularizer(self):
-    """Optional regularizer function for the output of this layer."""
-    return self._activity_regularizer
-
-  @activity_regularizer.setter
-  def activity_regularizer(self, regularizer):
-    """Optional regularizer function for the output of this layer."""
-    self._activity_regularizer = self._no_dependency(regularizer)
+  def build(self, input_shape):
+    """Creates the variables of the layer (optional, for subclass implementers).
 
-  @property
-  def trainable_weights(self):
-    return self._trainable_weights if self.trainable else []
+    This is a method that implementers of subclasses of `Layer` or `Model`
+    can override if they need a state-creation step in-between
+    layer instantiation and layer call.
 
-  @property
-  def non_trainable_weights(self):
-    if self.trainable:
-      return self._non_trainable_weights
-    else:
-      return self._trainable_weights + self._non_trainable_weights
+    This is typically used to create the weights of `Layer` subclasses.
 
-  @property
-  def trainable_variables(self):
-    return self.trainable_weights
+    Arguments:
+      input_shape: Instance of `TensorShape`, or list of instances of
+        `TensorShape` if the layer expects a list of inputs
+        (one instance per input).
+    """
+    self.built = True
 
-  @property
-  def non_trainable_variables(self):
-    return self.non_trainable_weights
+  @doc_controls.for_subclass_implementers
+  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
+    """This is where the layer's logic lives.
 
-  @property
-  def weights(self):
-    """Returns the list of all layer variables/weights.
+    Arguments:
+        inputs: Input tensor, or list/tuple of input tensors.
+        **kwargs: Additional keyword arguments.
 
     Returns:
-      A list of variables.
+        A tensor or list/tuple of tensors.
     """
-    return self.trainable_weights + self.non_trainable_weights
+    return inputs
 
-  @property
-  def variables(self):
-    """Returns the list of all layer variables/weights.
+  @doc_controls.for_subclass_implementers
+  def add_weight(self,
+                 name,
+                 shape,
+                 dtype=None,
+                 initializer=None,
+                 regularizer=None,
+                 trainable=None,
+                 constraint=None,
+                 partitioner=None,
+                 use_resource=None,
+                 synchronization=tf_variables.VariableSynchronization.AUTO,
+                 aggregation=tf_variables.VariableAggregation.NONE,
+                 **kwargs):
+    """Adds a new variable to the layer, or gets an existing one; returns it.
 
-    Returns:
-      A list of variables.
-    """
-    return self.weights
+    Arguments:
+      name: variable name.
+      shape: variable shape.
+      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+      initializer: initializer instance (callable).
+      regularizer: regularizer instance (callable).
+      trainable: whether the variable should be part of the layer's
+        "trainable_variables" (e.g. variables, biases)
+        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+        Note, if the current variable scope is marked as non-trainable
+        then this parameter is ignored and any added variables are also
+        marked as non-trainable. `trainable` defaults to `True` unless
+        `synchronization` is set to `ON_READ`.
+      constraint: constraint instance (callable).
+      partitioner: Partitioner to be passed to the `Checkpointable` API.
+      use_resource: Whether to use `ResourceVariable`.
+      synchronization: Indicates when a distributed a variable will be
+        aggregated. Accepted values are constants defined in the class
+        `tf.VariableSynchronization`. By default the synchronization is set to
+        `AUTO` and the current `DistributionStrategy` chooses
+        when to synchronize. If `synchronization` is set to `ON_READ`,
+        `trainable` must not be set to `True`.
+      aggregation: Indicates how a distributed variable will be aggregated.
+        Accepted values are constants defined in the class
+        `tf.VariableAggregation`.
+      **kwargs: Additional keyword arguments. Accepted values are `getter` and
+        `collections`.
 
-  @property
-  def updates(self):
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.updates not supported in Eager mode.')
-    if not self.trainable and not self.stateful:
-      return []
-    return self._updates
+    Returns:
+      The created variable.  Usually either a `Variable` or `ResourceVariable`
+      instance.  If `partitioner` is not `None`, a `PartitionedVariable`
+      instance is returned.
 
-  @doc_controls.for_subclass_implementers
-  def add_update(self, updates, inputs=None):
-    """Add update op(s), potentially dependent on layer inputs.
+    Raises:
+      RuntimeError: If called with partioned variable regularization and
+        eager execution is enabled.
+      ValueError: When giving unsupported dtype and no initializer or when
+        trainable has been set to True with synchronization set as `ON_READ`.
+    """
+    # Validate optional keyword arguments.
+    for kwarg in kwargs:
+      if kwarg not in ['getter', 'collections']:
+        raise TypeError('Unknown keyword argument:', kwarg)
+    getter = kwargs.pop('getter', None)
+    collections = kwargs.pop('collections', None)
 
-    Weight updates (for instance, the updates of the moving mean and variance
-    in a BatchNormalization layer) may be dependent on the inputs passed
-    when calling a layer. Hence, when reusing the same layer on
-    different inputs `a` and `b`, some entries in `layer.updates` may be
-    dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
+    if dtype is None:
+      dtype = self.dtype or backend.floatx()
+    dtype = dtypes.as_dtype(dtype)
+    initializer = initializers.get(initializer)
+    regularizer = regularizers.get(regularizer)
+    constraint = constraints.get(constraint)
 
-    The `get_updates_for` method allows to retrieve the updates relevant to a
-    specific set of inputs.
+    if synchronization == tf_variables.VariableSynchronization.ON_READ:
+      if trainable:
+        raise ValueError(
+            'Synchronization value can be set to '
+            'VariableSynchronization.ON_READ only for non-trainable variables. '
+            'You have specified trainable=True and '
+            'synchronization=VariableSynchronization.ON_READ.')
+      else:
+        # Set trainable to be false when variable is to be synced on read.
+        trainable = False
+    elif trainable is None:
+      trainable = True
 
-    This call is ignored when eager execution is enabled (in that case, variable
-    updates are run on the fly and thus do not need to be tracked for later
-    execution).
+    # Initialize variable when no initializer provided
+    if initializer is None:
+      # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
+      if dtype.is_floating:
+        initializer = initializers.glorot_uniform()
+      # If dtype is DT_INT/DT_UINT, provide a default value `zero`
+      # If dtype is DT_BOOL, provide a default value `FALSE`
+      elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
+        initializer = initializers.zeros()
+      # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
+      else:
+        raise ValueError('An initializer for variable %s of type %s is required'
+                         ' for layer %s' % (name, dtype.base_dtype, self.name))
 
-    Arguments:
-      updates: Update op, or list/tuple of update ops.
-      inputs: If anything other than None is passed, it signals the updates
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for BatchNormalization updates, for instance.
-        If None, the updates will be taken into account unconditionally,
-        and you are responsible for making sure that any dependency they might
-        have is available at runtime.
-        A step counter might fall into this category.
-    """
-    if context.executing_eagerly():
-      return  # Updates already applied when in eager mode.
+    variable = self._add_variable_with_custom_getter(
+        name=name,
+        shape=shape,
+        # TODO(allenl): a `make_variable` equivalent should be added as a
+        # `Checkpointable` method.
+        getter=getter or base_layer_utils.make_variable,
+        # Manage errors in Layer rather than Checkpointable.
+        overwrite=True,
+        initializer=initializer,
+        dtype=dtype,
+        constraint=constraint,
+        trainable=trainable and self.trainable,
+        partitioner=partitioner,
+        use_resource=use_resource,
+        collections=collections,
+        synchronization=synchronization,
+        aggregation=aggregation)
+    backend.track_variable(variable)
 
-    def process_update(x):
-      if isinstance(x, ops.Operation):
-        return x
-      elif hasattr(x, 'op'):
-        return x.op
-      else:
-        return ops.convert_to_tensor(x)
+    if regularizer is not None:
+      # TODO(fchollet): in the future, this should be handled at the
+      # level of variable creation, and weight regularization losses
+      # should be variable attributes.
+      self._handle_weight_regularization(name, variable, regularizer)
 
-    updates = generic_utils.to_list(updates)
-    updates = [process_update(x) for x in updates]
-    self._updates += updates
-    if inputs is None:
-      for u in updates:
-        u._unconditional_update = True  # pylint: disable=protected-access
+    if trainable:
+      self._trainable_weights.append(variable)
     else:
-      for u in updates:
-        u._unconditional_update = False  # pylint: disable=protected-access
+      self._non_trainable_weights.append(variable)
+    return variable
 
-  def get_updates_for(self, inputs):
-    """Retrieves updates relevant to a specific set of inputs.
+  def get_config(self):
+    """Returns the config of the layer.
 
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
+    A layer config is a Python dictionary (serializable)
+    containing the configuration of a layer.
+    The same layer can be reinstantiated later
+    (without its trained weights) from this configuration.
 
-    Returns:
-      List of update ops of the layer that depend on `inputs`.
+    The config of a layer does not include connectivity
+    information, nor the layer class name. These are handled
+    by `Network` (one layer of abstraction above).
 
-    Raises:
-      RuntimeError: If called in Eager mode.
+    Returns:
+        Python dictionary.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
-
-    # Updates disabled if layer is not trainable and not explicitly stateful.
-    if not self.trainable and not self.stateful:
-      return []
+    config = {'name': self.name, 'trainable': self.trainable}
+    if hasattr(self, '_batch_input_shape'):
+      config['batch_input_shape'] = self._batch_input_shape
+    if hasattr(self, 'dtype'):
+      config['dtype'] = self.dtype
+    return config
 
-    if inputs is None:
-      # Requesting unconditional updates.
-      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
+  @classmethod
+  def from_config(cls, config):
+    """Creates a layer from its config.
 
-    # Requesting input-conditional updates.
-    inputs = nest.flatten(inputs)
-    reachable = tf_utils.get_reachable_from_inputs(inputs, self.updates)
-    updates = []
-    for update in self.updates:
-      if update in reachable:
-        updates.append(update)
-    return updates
+    This method is the reverse of `get_config`,
+    capable of instantiating the same layer from the config
+    dictionary. It does not handle layer connectivity
+    (handled by Network), nor weights (handled by `set_weights`).
+
+    Arguments:
+        config: A Python dictionary, typically the
+            output of get_config.
+
+    Returns:
+        A layer instance.
+    """
+    return cls(**config)
+
+  def compute_output_shape(self, input_shape):
+    """Computes the output shape of the layer.
+
+    Assumes that the layer will be built
+    to match that input shape provided.
+
+    Arguments:
+        input_shape: Shape tuple (tuple of integers)
+            or list of shape tuples (one per output tensor of the layer).
+            Shape tuples can include None for free dimensions,
+            instead of an integer.
+
+    Returns:
+        An input shape tuple.
+    """
+    if context.executing_eagerly():
+      # In this case we build the model first in order to do shape inference.
+      # This is acceptable because the framework only calls
+      # `compute_output_shape` on shape values that the layer would later be
+      # built for. It would however cause issues in case a user attempts to
+      # use `compute_output_shape` manually (these users will have to
+      # implement `compute_output_shape` themselves).
+      self.build(input_shape)
+      with context.graph_mode():
+        graph = func_graph.FuncGraph('graph')
+        with graph.as_default():
+          if isinstance(input_shape, list):
+            inputs = [base_layer_utils.generate_placeholders_from_shape(shape)
+                      for shape in input_shape]
+          else:
+            inputs = base_layer_utils.generate_placeholders_from_shape(
+                input_shape)
+
+          try:
+            if self._expects_training_arg:
+              outputs = self(inputs, training=False)
+            else:
+              outputs = self(inputs)
+          except TypeError:
+            raise NotImplementedError('We could not automatically infer '
+                                      'the static shape of the layer\'s output.'
+                                      ' Please implement the '
+                                      '`compute_output_shape` method on your '
+                                      'layer (%s).' % self.__class__.__name__)
+      if isinstance(outputs, list):
+        return [output.shape for output in outputs]
+      else:
+        return outputs.shape
+    raise NotImplementedError
+
+  def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
+    """Computes an output mask tensor.
+
+    Arguments:
+        inputs: Tensor or list of tensors.
+        mask: Tensor or list of tensors.
+
+    Returns:
+        None or a tensor (or list of tensors,
+            one per output tensor of the layer).
+    """
+    if not self.supports_masking:
+      if mask is not None:
+        if isinstance(mask, list):
+          if any(m is not None for m in mask):
+            raise TypeError('Layer ' + self.name + ' does not support masking, '
+                            'but was passed an input_mask: ' + str(mask))
+        else:
+          raise TypeError('Layer ' + self.name + ' does not support masking, '
+                          'but was passed an input_mask: ' + str(mask))
+      # masking not explicitly supported: return None as mask
+      return None
+    # if masking is explicitly supported, by default
+    # carry over the input mask
+    return mask
+
+  def __call__(self, inputs, *args, **kwargs):
+    """Wraps `call`, applying pre- and post-processing steps.
+
+    Arguments:
+      inputs: input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+
+    Note:
+      - The following optional keyword arguments are reserved for specific uses:
+        * `training`: Boolean scalar tensor of Python boolean indicating
+          whether the `call` is meant for training or inference.
+        * `mask`: Boolean input mask.
+      - If the layer's `call` method takes a `mask` argument (as some Keras
+        layers do), its default value will be set to the mask generated
+        for `inputs` by the previous layer (if `input` did come from
+        a layer that generated a corresponding mask, i.e. if it came from
+        a Keras layer with masking support.
+
+    Raises:
+      ValueError: if the layer's `call` method returns None (an invalid value).
+    """
+    input_list = nest.flatten(inputs)
+
+    if context.executing_eagerly():
+      # Accept NumPy inputs by converting to Tensors when executing eagerly.
+      if all(isinstance(x, (np.ndarray, float, int)) for x in input_list):
+        inputs = nest.map_structure(ops.convert_to_tensor, inputs)
+        input_list = nest.flatten(inputs)
+
+    # We will attempt to build a TF graph if & only if all inputs are symbolic.
+    # This is always the case in graph mode. It can also be the case in eager
+    # mode when all inputs can be traced back to `keras.Input()` (when building
+    # models using the functional API).
+    build_graph = tf_utils.are_all_symbolic_tensors(input_list)
+    executing_eagerly = context.executing_eagerly()
+
+    # Handle Keras mask propagation from previous layer to current layer.
+    previous_mask = None
+    if build_graph and (not hasattr(self, '_compute_previous_mask') or
+                        self._compute_previous_mask):
+      previous_mask = base_layer_utils.collect_previous_mask(inputs)
+      if not hasattr(self, '_call_fn_args'):
+        self._call_fn_args = self._no_dependency(
+            function_utils.fn_args(self.call))
+      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
+          not generic_utils.is_all_none(previous_mask)):
+        # The previous layer generated a mask, and mask was not explicitly pass
+        # to __call__, hence we set previous_mask as the default value.
+        kwargs['mask'] = previous_mask
+
+    input_shapes = None
+
+    with ops.name_scope(self._name_scope()):
+      if not self.built:
+        # Build layer if applicable (if the `build` method has been overridden).
+        self._maybe_build(inputs)
+        # We must set self.built since user defined build functions are not
+        # constrained to set self.built.
+        self.built = True
+
+      # Check input assumptions set after layer building, e.g. input shape.
+      if build_graph:
+        # Symbolic execution on symbolic tensors. We will attempt to build
+        # the corresponding TF subgraph inside `backend.get_graph()`
+        input_spec.assert_input_compatibility(
+            self.input_spec, inputs, self.name)
+        graph = backend.get_graph()
+        with graph.as_default():
+          if not executing_eagerly:
+            # In graph mode, failure to build the layer's graph
+            # implies a user-side bug. We don't catch exceptions.
+            outputs = self.call(inputs, *args, **kwargs)
+          else:
+            try:
+              outputs = self.call(inputs, *args, **kwargs)
+            except Exception:  # pylint: disable=broad-except
+              # Any issue during graph-building means we will later run the
+              # model in eager mode, whether the issue was related to
+              # graph mode or not. This provides a nice debugging experience.
+              self._call_is_graph_friendly = False
+              # We will use static shape inference to return symbolic tensors
+              # matching the specifications of the layer outputs.
+              # Since we have set `self._call_is_graph_friendly = False`,
+              # we will never attempt to run the underlying TF graph (which is
+              # disconnected).
+              # TODO(fchollet): consider py_func as an alternative, which
+              # would enable us to run the underlying graph if needed.
+              input_shapes = nest.map_structure(lambda x: x.shape, inputs)
+              output_shapes = self.compute_output_shape(input_shapes)
+              outputs = nest.map_structure(
+                  lambda shape: backend.placeholder(shape, dtype=self.dtype),
+                  output_shapes)
+
+          if outputs is None:
+            raise ValueError('A layer\'s `call` method should return a '
+                             'Tensor or a list of Tensors, not None '
+                             '(layer: ' + self.name + ').')
+          self._handle_activity_regularization(inputs, outputs)
+          self._set_mask_metadata(inputs, outputs, previous_mask)
+          if base_layer_utils.have_all_keras_metadata(inputs):
+            inputs, outputs = self._set_connectivity_metadata_(
+                inputs, outputs, args, kwargs)
+          if hasattr(self, '_set_inputs') and not self.inputs:
+            # Subclassed network: explicitly set metadata normally set by
+            # a call to self._set_inputs().
+            # This is not relevant in eager execution.
+            self._set_inputs(inputs, outputs)
+      else:
+        # Eager execution on data tensors.
+        outputs = self.call(inputs, *args, **kwargs)
+        self._handle_activity_regularization(inputs, outputs)
+        return outputs
+
+    if not context.executing_eagerly():
+      # Optionally load weight values specified at layer instantiation.
+      # TODO(fchollet): consider enabling this with eager execution too.
+      if (hasattr(self, '_initial_weights') and
+          self._initial_weights is not None):
+        self.set_weights(self._initial_weights)
+        del self._initial_weights
+    return outputs
+
+  @property
+  def dtype(self):
+    return self._dtype
+
+  @property
+  def name(self):
+    return self._name
+
+  @property
+  def activity_regularizer(self):
+    """Optional regularizer function for the output of this layer."""
+    return self._activity_regularizer
+
+  @activity_regularizer.setter
+  def activity_regularizer(self, regularizer):
+    """Optional regularizer function for the output of this layer."""
+    self._activity_regularizer = self._no_dependency(regularizer)
+
+  @property
+  def trainable_weights(self):
+    return self._trainable_weights if self.trainable else []
+
+  @property
+  def non_trainable_weights(self):
+    if self.trainable:
+      return self._non_trainable_weights
+    else:
+      return self._trainable_weights + self._non_trainable_weights
+
+  @property
+  def weights(self):
+    """Returns the list of all layer variables/weights.
+
+    Returns:
+      A list of variables.
+    """
+    return self.trainable_weights + self.non_trainable_weights
+
+  @property
+  def updates(self):
+    if not self.trainable and not self.stateful:
+      return []
+    return self._updates
 
   @property
   def losses(self):
@@ -372,7 +653,10 @@ class Layer(checkpointable.CheckpointableBase):
       A list of tensors.
     """
     collected_losses = []
-    collected_losses.extend(self._losses)
+    if context.executing_eagerly():
+      collected_losses.extend(self._eager_losses)
+    else:
+      collected_losses.extend(self._losses)
     for regularizer in self._callable_losses:
       loss_tensor = regularizer()
       if loss_tensor is not None:
@@ -399,34 +683,15 @@ class Layer(checkpointable.CheckpointableBase):
 
     Arguments:
       losses: Loss tensor, or list/tuple of tensors. Rather than tensors, losses
-        may also be zero-argument callables which create a loss tensor. Only
-        callable losses are supported when executing eagerly.
-      inputs: If anything other than None is passed, it signals the losses
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for activity regularization losses, for instance.
-        If `None` is passed, the losses are assumed
+        may also be zero-argument callables which create a loss tensor.
+      inputs: Ignored when executing eagerly. If anything other than None is
+        passed, it signals the losses are conditional on some of the layer's
+        inputs, and thus they should only be run where these inputs are
+        available. This is the case for activity regularization losses, for
+        instance. If `None` is passed, the losses are assumed
         to be unconditional, and will apply across all dataflows of the layer
         (e.g. weight regularization losses).
-
-    Raises:
-      RuntimeError: If called in Eager mode with a `Tensor` rather than a
-        callable, or if `inputs` is not None.
     """
-    executing_eagerly = context.executing_eagerly()
-    if executing_eagerly:
-      if inputs is not None:
-        raise RuntimeError(
-            'Activity regularization (via the "inputs" argument to '
-            'Layer.add_loss) is not supported when executing eagerly. Consider '
-            'returning activity regularization losses from a Model\'s call() '
-            'method.')
-      if getattr(self, '_in_call', False):
-        # TODO(psv): Support activity regularization and a way to reset losses.
-        raise RuntimeError(
-            'Adding losses inside a Layer\'s call() method is not currently '
-            'supported when executing eagerly. Please file a feature request '
-            'if you need this limitation lifted.')
     losses = generic_utils.to_list(losses)
 
     def _tag_unconditional(loss):
@@ -444,683 +709,187 @@ class Layer(checkpointable.CheckpointableBase):
         self._callable_losses.append(
             functools.partial(_tag_unconditional, loss))
       else:
-        if executing_eagerly:
-          raise RuntimeError(
-              'Layer.add_loss only supported for zero-argument lambdas when '
-              'executing eagerly.')
-        self._losses.append(_tag_unconditional(loss))
-
-  def get_losses_for(self, inputs):
-    """Retrieves losses relevant to a specific set of inputs.
+        if context.executing_eagerly():
+          self._eager_losses.append(_tag_unconditional(loss))
+        else:
+          self._losses.append(_tag_unconditional(loss))
 
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
+  @doc_controls.for_subclass_implementers
+  def add_metric(self, value, aggregation=None, name=None):
+    """Adds metric tensor to the layer.
 
-    Returns:
-      List of loss tensors of the layer that depend on `inputs`.
+    Args:
+      value: Metric tensor.
+      aggregation: Sample-wise metric reduction function. If `aggregation=None`,
+        it indicates that the metric tensor provided has been aggregated
+        already. eg, `model.add_metric(BinaryAccuracy(name='acc')(y_true,
+        y_pred))`. If aggregation='mean', the given metric tensor will be
+        sample-wise reduced using `mean` function. eg, `model.add_metric(
+        tf.reduce_mean(outputs), name='output_mean', aggregation='mean')`.
+      name: String metric name.
 
     Raises:
-      RuntimeError: If called in Eager mode.
+      ValueError: If `aggregation` is anything other than None or `mean`.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
-
-    if inputs is None:
-      # Requesting unconditional losses.
-      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
-
-    # Requesting input-conditional losses.
-    inputs = nest.flatten(inputs)
-    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
-    # The losses we want to return will be part of this set.
-    # To avoid unnecessary work, we stop the search in case all of
-    # `self.losses` have been retrieved.
-    reachable = tf_utils.get_reachable_from_inputs(inputs, self.losses)
-    losses = []
-    for loss in self.losses:
-      if loss in reachable:
-        losses.append(loss)
-    return losses
-
-  def _name_scope(self):
-    return self.name
+    if aggregation is not None and aggregation != 'mean':
+      raise ValueError(
+          'We currently support only `mean` sample-wise metric aggregation. '
+          'You provided aggregation=`%s`' % aggregation)
 
-  def build(self, input_shape):
-    """Creates the variables of the layer."""
-    self.built = True
+    if tf_utils.is_symbolic_tensor(value):
+      self._symbolic_add_metric(value, aggregation, name)
+    else:
+      self._eager_add_metric(value, aggregation, name)
 
   @doc_controls.for_subclass_implementers
-  def add_variable(self, *args, **kwargs):
-    """Alias for `add_weight`."""
-    return self.add_weight(*args, **kwargs)
-
-  @doc_controls.for_subclass_implementers
-  def add_weight(self,
-                 name,
-                 shape,
-                 dtype=None,
-                 initializer=None,
-                 regularizer=None,
-                 trainable=None,
-                 constraint=None,
-                 partitioner=None,
-                 use_resource=None,
-                 synchronization=tf_variables.VariableSynchronization.AUTO,
-                 aggregation=tf_variables.VariableAggregation.NONE,
-                 **kwargs):
-    """Adds a new variable to the layer, or gets an existing one; returns it.
-
-    Arguments:
-      name: variable name.
-      shape: variable shape.
-      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
-      initializer: initializer instance (callable).
-      regularizer: regularizer instance (callable).
-      trainable: whether the variable should be part of the layer's
-        "trainable_variables" (e.g. variables, biases)
-        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
-        Note, if the current variable scope is marked as non-trainable
-        then this parameter is ignored and any added variables are also
-        marked as non-trainable. `trainable` defaults to `True` unless
-        `synchronization` is set to `ON_READ`.
-      constraint: constraint instance (callable).
-      partitioner: Partitioner to be passed to the `Checkpointable` API.
-      use_resource: Whether to use `ResourceVariable`.
-      synchronization: Indicates when a distributed a variable will be
-        aggregated. Accepted values are constants defined in the class
-        `tf.VariableSynchronization`. By default the synchronization is set to
-        `AUTO` and the current `DistributionStrategy` chooses
-        when to synchronize. If `synchronization` is set to `ON_READ`,
-        `trainable` must not be set to `True`.
-      aggregation: Indicates how a distributed variable will be aggregated.
-        Accepted values are constants defined in the class
-        `tf.VariableAggregation`.
-      **kwargs: Additional keyword arguments. Accepted values are `getter` and
-        `collections`.
-
-    Returns:
-      The created variable.  Usually either a `Variable` or `ResourceVariable`
-      instance.  If `partitioner` is not `None`, a `PartitionedVariable`
-      instance is returned.
-
-    Raises:
-      RuntimeError: If called with partioned variable regularization and
-        eager execution is enabled.
-      ValueError: When giving unsupported dtype and no initializer or when
-        trainable has been set to True with synchronization set as `ON_READ`.
-    """
-    # Validate optional keyword arguments.
-    for kwarg in kwargs:
-      if kwarg not in ['getter', 'collections']:
-        raise TypeError('Unknown keyword argument:', kwarg)
-    getter = kwargs.pop('getter', None)
-    collections = kwargs.pop('collections', None)
-
-    if dtype is None:
-      dtype = self.dtype or backend.floatx()
-    dtype = dtypes.as_dtype(dtype)
-    initializer = initializers.get(initializer)
-    regularizer = regularizers.get(regularizer)
-    constraint = constraints.get(constraint)
-
-    if synchronization == tf_variables.VariableSynchronization.ON_READ:
-      if trainable:
-        raise ValueError(
-            'Synchronization value can be set to '
-            'VariableSynchronization.ON_READ only for non-trainable variables. '
-            'You have specified trainable=True and '
-            'synchronization=VariableSynchronization.ON_READ.')
-      else:
-        # Set trainable to be false when variable is to be synced on read.
-        trainable = False
-    elif trainable is None:
-      trainable = True
-
-    # Initialize variable when no initializer provided
-    if initializer is None:
-      # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
-      if dtype.is_floating:
-        initializer = initializers.glorot_uniform()
-      # If dtype is DT_INT/DT_UINT, provide a default value `zero`
-      # If dtype is DT_BOOL, provide a default value `FALSE`
-      elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
-        initializer = initializers.zeros()
-      # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
-      else:
-        raise ValueError('An initializer for variable %s of type %s is required'
-                         ' for layer %s' % (name, dtype.base_dtype, self.name))
-
-    variable = self._add_variable_with_custom_getter(
-        name=name,
-        shape=shape,
-        # TODO(allenl): a `make_variable` equivalent should be added as a
-        # `Checkpointable` method.
-        getter=getter or make_variable,
-        # Manage errors in Layer rather than Checkpointable.
-        overwrite=True,
-        initializer=initializer,
-        dtype=dtype,
-        constraint=constraint,
-        trainable=trainable and self.trainable,
-        partitioner=partitioner,
-        use_resource=use_resource,
-        collections=collections,
-        synchronization=synchronization,
-        aggregation=aggregation)
-    backend.track_variable(variable)
-
-    if regularizer is not None:
-      # TODO(fchollet): in the future, this should be handled at the
-      # level of variable creation, and weight regularization losses
-      # should be variable attributes.
-      self._handle_weight_regularization(name, variable, regularizer)
-
-    if trainable:
-      self._trainable_weights.append(variable)
-    else:
-      self._non_trainable_weights.append(variable)
-    return variable
-
-  def _handle_weight_regularization(self, name, variable, regularizer):
-    """Create lambdas which compute regularization losses."""
-
-    def _loss_for_variable(v):
-      """Creates a regularization loss `Tensor` for variable `v`."""
-      with ops.colocate_with(v):
-        with ops.name_scope(name + '/Regularizer'):
-          regularization = regularizer(v)
-      return regularization
-
-    if isinstance(variable, tf_variables.PartitionedVariable):
-      for v in variable:
-        self.add_loss(functools.partial(_loss_for_variable, v))
-    else:
-      self.add_loss(functools.partial(_loss_for_variable, variable))
-
-  def _handle_activity_regularization(self, inputs, outputs):
-    # Apply activity regularization.
-    # Note that it should be applied every time the layer creates a new
-    # output, since it is output-specific.
-    if self._activity_regularizer:
-      output_list = nest.flatten(outputs)
-      for output in output_list:
-        with ops.name_scope('ActivityRegularizer'):
-          activity_regularization = self._activity_regularizer(output)
-        self.add_loss(activity_regularization, inputs=inputs)
-
-  @doc_controls.for_subclass_implementers
-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """This is where the layer's logic lives.
+  def add_update(self, updates, inputs=None):
+    """Add update op(s), potentially dependent on layer inputs.
 
-    Arguments:
-        inputs: Input tensor, or list/tuple of input tensors.
-        **kwargs: Additional keyword arguments.
+    Weight updates (for instance, the updates of the moving mean and variance
+    in a BatchNormalization layer) may be dependent on the inputs passed
+    when calling a layer. Hence, when reusing the same layer on
+    different inputs `a` and `b`, some entries in `layer.updates` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
 
-    Returns:
-        A tensor or list/tuple of tensors.
-    """
-    return inputs
+    The `get_updates_for` method allows to retrieve the updates relevant to a
+    specific set of inputs.
 
-  def __call__(self, inputs, *args, **kwargs):
-    """Wraps `call`, applying pre- and post-processing steps.
+    This call is ignored when eager execution is enabled (in that case, variable
+    updates are run on the fly and thus do not need to be tracked for later
+    execution).
 
     Arguments:
-      inputs: input tensor(s).
-      *args: additional positional arguments to be passed to `self.call`.
-      **kwargs: additional keyword arguments to be passed to `self.call`.
-
-    Returns:
-      Output tensor(s).
-
-    Note:
-      - The following optional keyword arguments are reserved for specific uses:
-        * `training`: Boolean scalar tensor of Python boolean indicating
-          whether the `call` is meant for training or inference.
-        * `mask`: Boolean input mask.
-      - If the layer's `call` method takes a `mask` argument (as some Keras
-        layers do), its default value will be set to the mask generated
-        for `inputs` by the previous layer (if `input` did come from
-        a layer that generated a corresponding mask, i.e. if it came from
-        a Keras layer with masking support.
-
-    Raises:
-      ValueError: if the layer's `call` method returns None (an invalid value).
+      updates: Update op, or list/tuple of update ops.
+      inputs: If anything other than None is passed, it signals the updates
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for BatchNormalization updates, for instance.
+        If None, the updates will be taken into account unconditionally,
+        and you are responsible for making sure that any dependency they might
+        have is available at runtime.
+        A step counter might fall into this category.
     """
-    input_list = nest.flatten(inputs)
-
-    build_graph = not context.executing_eagerly()
-    # TODO(fchollet, allenl): Make deferred mode work with subclassed Models
-    # which don't use an "inputs" argument.
-    in_deferred_mode = isinstance(input_list[0], DeferredTensor)
-
-    # Handle Keras mask propagation from previous layer to current layer.
-    previous_mask = None
-    if build_graph and (not hasattr(self, '_compute_previous_mask') or
-                        self._compute_previous_mask):
-      previous_mask = collect_previous_mask(inputs)
-      if not hasattr(self, '_call_fn_args'):
-        self._call_fn_args = self._no_dependency(
-            function_utils.fn_args(self.call))
-      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
-          not generic_utils.is_all_none(previous_mask)):
-        # The previous layer generated a mask, and mask was not explicitly pass
-        # to __call__, hence we set previous_mask as the default value.
-        kwargs['mask'] = previous_mask
-
-    input_shapes = None
-
-    with ops.name_scope(self._name_scope()):
-      if not self.built:
-        if not build_graph:
-          # Activity regularization is currently unsupported in Eager mode.
-          if self._activity_regularizer:
-            raise ValueError(
-                'activity_regularizer currently unsupported with '
-                'eager execution enabled. Found an activity_regularizer in '
-                '%s(%s).' % (self.__class__.__name__, self))
-        if not build_graph and not in_deferred_mode:
-          for x in input_list:
-            if hasattr(x, '_keras_history'):
-              raise ValueError('_keras_history currently unsupported in '
-                               'Eager mode. Found _keras_history in %s while '
-                               'executing __call__ for %s(%s)' %
-                               (x, self.__class_.__name__, self))
-
-        # Check input assumptions set before layer building, e.g. input rank.
-        self._assert_input_compatibility(inputs)
-        if input_list and self._dtype is None:
-          try:
-            self._dtype = input_list[0].dtype.base_dtype.name
-          except AttributeError:
-            pass
-
-        if all(hasattr(x, 'shape') for x in input_list):
-          input_shapes = nest.map_structure(lambda x: x.shape, inputs)
-
-        if (not hasattr(self, '_is_graph_network') or
-            self.__class__.__name__ == 'Sequential' or
-            not hasattr(self.build, '_is_default')):
-          # Only if self is a layer, an instance of a sequential model, or
-          # the user has manually overwritten the build method do we need to
-          # build it.
-          self.build(input_shapes)
-        # We must set self.built since user defined build functions are not
-        # constrained to set self.built.
-        self.built = True
-
-      # Check input assumptions set after layer building, e.g. input shape.
-      if build_graph or in_deferred_mode:
-        self._assert_input_compatibility(inputs)
+    if context.executing_eagerly():
+      return  # Updates already applied when in eager mode.
 
-      if not in_deferred_mode:
-        self._in_call = True
-        outputs = self.call(inputs, *args, **kwargs)
-        self._in_call = False
-        if outputs is None:
-          raise ValueError('A layer\'s `call` method should return a Tensor '
-                           'or a list of Tensors, not None (layer: ' +
-                           self.name + ').')
+    def process_update(x):
+      if isinstance(x, ops.Operation):
+        return x
+      elif hasattr(x, 'op'):
+        return x.op
       else:
-        # Deferred mode behavior: use `compute_output_shape` to
-        # infer the number of outputs of the layer and their shapes.
-        if input_shapes is None:
-          input_shapes = nest.map_structure(lambda x: x.shape, inputs)
-
-        output_shapes = self.compute_output_shape(input_shapes)
-        output_shapes = nest.flatten(output_shapes)
-        outputs = [
-            # TODO(fchollet): name the deferred tensors?
-            DeferredTensor(shape=shape, dtype=self._dtype)
-            for shape in output_shapes
-        ]
-        if len(outputs) == 1:
-          outputs = outputs[0]
-
-      if build_graph:
-        self._handle_activity_regularization(inputs, outputs)
-        self._set_mask_metadata(inputs, outputs, previous_mask)
-
-      if in_deferred_mode or build_graph and have_all_keras_metadata(inputs):
-        inputs, outputs = self._set_connectivity_metadata_(
-            inputs, outputs, args, kwargs)
-      if context.executing_eagerly():
-        return outputs
-
-      if hasattr(self, '_symbolic_set_inputs') and not self.inputs:
-        # Subclassed network: explicitly set metadata normally set by a call to
-        # self._set_inputs(). This is not relevant in eager execution.
-        self._symbolic_set_inputs(inputs, outputs)
-
-      if in_deferred_mode or build_graph:
-        self._set_learning_phase_metadata(inputs, outputs)
-
-    # Optionally load weight values that were specified at layer instantiation.
-    # TODO(fchollet): consider enabling this with eager execution too.
-    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
-      self.set_weights(self._initial_weights)
-      del self._initial_weights
-    return outputs
-
-  def apply(self, inputs, *args, **kwargs):
-    """Apply the layer on a input.
-
-    This simply wraps `self.__call__`.
-
-    Arguments:
-      inputs: Input tensor(s).
-      *args: additional positional arguments to be passed to `self.call`.
-      **kwargs: additional keyword arguments to be passed to `self.call`.
-
-    Returns:
-      Output tensor(s).
-    """
-    return self.__call__(inputs, *args, **kwargs)
-
-  def _set_learning_phase_metadata(self, inputs, outputs):
-    # Update learning phase info. To work with subclassed models,
-    # this should be done even if Keras metadata is absent.
-    output_tensors = generic_utils.to_list(outputs)
-    uses_lp = any(
-        [getattr(x, '_uses_learning_phase', False)
-         for x in generic_utils.to_list(inputs)])
-    uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
-    for i in range(len(output_tensors)):
-      try:
-        output_tensors[i]._uses_learning_phase = getattr(
-            output_tensors[i], '_uses_learning_phase', False) or uses_lp
-      except AttributeError:
-        # An output element happens to be a C type (such as tuple or dict).
-        # We don't track learning phase info in such edge cases.
-        pass
-
-  def _set_mask_metadata(self, inputs, outputs, previous_mask):
-    # In some cases the mask of the outputs has already been computed by
-    # inner layers and does not need to be recomputed by this layer.
-    mask_already_computed = all(
-        hasattr(x, '_keras_mask') for x in generic_utils.to_list(outputs))
-    if hasattr(self, 'compute_mask') and not mask_already_computed:
-      output_mask = self.compute_mask(inputs, previous_mask)
-    else:
-      output_mask = None
-    if isinstance(outputs, (list, tuple)):
-      if output_mask is None:
-        output_mask = [None for _ in range(len(outputs))]
-      for x, m in zip(outputs, output_mask):
-        try:
-          x._keras_mask = m  # pylint: disable=protected-access
-        except AttributeError:
-          pass  # C type such as dict. Masking not supported in this case.
-    else:
-      try:
-        outputs._keras_mask = output_mask  # pylint: disable=protected-access
-      except AttributeError:
-        pass  # C type such as dict. Masking not supported in this case.
-
-  def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
-    call_convention = getattr(self, '_call_convention',
-                              CallConvention.EXPLICIT_INPUTS_ARGUMENT)
-    if args:
-      if call_convention == CallConvention.EXPLICIT_INPUTS_ARGUMENT:
-        raise TypeError(
-            'This Layer takes an `inputs` argument to call(), and only the '
-            '`inputs` argument may be specified as a positional argument. '
-            'Pass everything else as a keyword argument (those arguments will'
-            ' not be tracked as inputs to the Layer).')
-      elif call_convention == CallConvention.SINGLE_POSITIONAL_ARGUMENT:
-        raise TypeError(
-            'This Layer takes a single positional argument to call(), which is '
-            'by convention the inputs argument, and only this argument may be '
-            'specified as a positional argument. Pass everything else as a '
-            'keyword argument (those arguments will not be tracked as inputs '
-            'to the Layer).')
-
-    # If the layer returns tensors from its inputs, unmodified,
-    # we copy them to avoid loss of tensor metadata.
-    output_ls = nest.flatten(outputs)
-    output_ls_copy = []
-    for x in output_ls:
-      if x in nest.flatten(inputs):
-        with ops.name_scope(self.name):
-          x = array_ops.identity(x)
-      output_ls_copy.append(x)
-    if len(output_ls_copy) == 1:
-      outputs = output_ls_copy[0]
-    else:
-      outputs = output_ls_copy
-
-    inputs, kwargs = self._inputs_from_call_args(
-        call_args=(inputs,) + args, call_kwargs=kwargs)
-    # Add an inbound node to the layer, so it can keep track of this call.
-    # This updates the layer history of the output tensor(s).
-    kwargs.pop('mask', None)  # `mask` should not be serialized.
-    self._add_inbound_node(
-        input_tensors=inputs, output_tensors=outputs, arguments=kwargs)
-    return inputs, outputs
-
-  def _inputs_from_call_args(self, call_args, call_kwargs):
-    """Get Layer inputs from __call__ *args and **kwargs.
-
-    Args:
-      call_args: The positional arguments passed to __call__.
-      call_kwargs: The keyword argument dict passed to __call__.
+        return ops.convert_to_tensor(x)
 
-    Returns:
-      A tuple of (inputs, non_input_kwargs). These may be the same objects as
-      were passed in (call_args and call_kwargs).
-    """
-    call_convention = getattr(self, '_call_convention',
-                              CallConvention.EXPLICIT_INPUTS_ARGUMENT)
-    if (call_convention in (
-        CallConvention.EXPLICIT_INPUTS_ARGUMENT,
-        CallConvention.SINGLE_POSITIONAL_ARGUMENT)):
-      assert len(call_args) == 1  # TypeError raised earlier in __call__.
-      return call_args[0], call_kwargs
+    updates = generic_utils.to_list(updates)
+    updates = [process_update(x) for x in updates]
+    self._updates += updates
+    if inputs is None:
+      for u in updates:
+        u._unconditional_update = True  # pylint: disable=protected-access
     else:
-      call_arg_spec = tf_inspect.getfullargspec(self.call)
-      # There is no explicit "inputs" argument expected or provided to
-      # call(). Arguments which have default values are considered non-inputs,
-      # and arguments without are considered inputs.
-      if call_arg_spec.defaults:
-        if call_arg_spec.varargs is not None:
-          raise TypeError(
-              'Layer.call() may not accept both *args and arguments with '
-              'default values (unable to determine which are inputs to the '
-              'Layer).')
-        keyword_arg_names = set(
-            call_arg_spec.args[-len(call_arg_spec.defaults):])
-      else:
-        keyword_arg_names = set()
-        # Training is never an input argument name, to allow signatures like
-        # call(x, training).
-      keyword_arg_names.add('training')
-      _, unwrapped_call = tf_decorator.unwrap(self.call)
-      bound_args = inspect.getcallargs(
-          unwrapped_call, *call_args, **call_kwargs)
-      if call_arg_spec.varkw is not None:
-        var_kwargs = bound_args.pop(call_arg_spec.varkw)
-        bound_args.update(var_kwargs)
-        keyword_arg_names = keyword_arg_names.union(var_kwargs.keys())
-      all_args = call_arg_spec.args
-      if all_args and bound_args[all_args[0]] is self:
-        # Ignore the 'self' argument of methods
-        bound_args.pop(call_arg_spec.args[0])
-        all_args = all_args[1:]
-      non_input_arg_values = {}
-      input_arg_values = []
-      remaining_args_are_keyword = False
-      for argument_name in all_args:
-        if argument_name in keyword_arg_names:
-          remaining_args_are_keyword = True
-        else:
-          if remaining_args_are_keyword:
-            raise TypeError(
-                'Found a positional argument to call() after a non-input '
-                'argument. All arguments after "training" must be keyword '
-                'arguments, and are not tracked as inputs to the Layer.')
-        if remaining_args_are_keyword:
-          non_input_arg_values[argument_name] = bound_args[argument_name]
-        else:
-          input_arg_values.append(bound_args[argument_name])
-      if call_arg_spec.varargs is not None:
-        input_arg_values.extend(bound_args[call_arg_spec.varargs])
-      return input_arg_values, non_input_arg_values
-
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer.
+      for u in updates:
+        u._unconditional_update = False  # pylint: disable=protected-access
 
-    Assumes that the layer will be built
-    to match that input shape provided.
+  def set_weights(self, weights):
+    """Sets the weights of the layer, from Numpy arrays.
 
     Arguments:
-        input_shape: Shape tuple (tuple of integers)
-            or list of shape tuples (one per output tensor of the layer).
-            Shape tuples can include None for free dimensions,
-            instead of an integer.
+        weights: a list of Numpy arrays. The number
+            of arrays and their shape must match
+            number of the dimensions of the weights
+            of the layer (i.e. it should match the
+            output of `get_weights`).
 
-    Returns:
-        An input shape tuple.
+    Raises:
+        ValueError: If the provided weights list does not match the
+            layer's specifications.
     """
-    if context.executing_eagerly():
-      # In this case we build the model first in order to do shape inference.
-      # This is acceptable because the framework only calls
-      # `compute_output_shape` on shape values that the layer would later be
-      # built for. It would however cause issues in case a user attempts to
-      # use `compute_output_shape` manually (these users will have to
-      # implement `compute_output_shape` themselves).
-      self.build(input_shape)
-
-      with context.graph_mode():
-        graph = eager_function.FuncGraph('graph')
-        with graph.as_default():
-          if isinstance(input_shape, list):
-            inputs = [generate_placeholders_from_shape(shape)
-                      for shape in input_shape]
-          else:
-            inputs = generate_placeholders_from_shape(input_shape)
-
-          try:
-            if self._expects_training_arg:
-              outputs = self(inputs, training=False)
-            else:
-              outputs = self(inputs)
-          except TypeError:
-            raise NotImplementedError('We could not automatically infer '
-                                      'the static shape of the layer\'s output.'
-                                      ' Please implement the '
-                                      '`compute_output_shape` method on your '
-                                      'layer (%s).' % self.__class__.__name__)
-      if isinstance(outputs, list):
-        return [output.shape for output in outputs]
-      else:
-        return outputs.shape
-    raise NotImplementedError
-
-  def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
-    """Computes an output mask tensor.
+    params = self.weights
+    if len(params) != len(weights):
+      raise ValueError('You called `set_weights(weights)` on layer "' +
+                       self.name + '" with a  weight list of length ' +
+                       str(len(weights)) + ', but the layer was expecting ' +
+                       str(len(params)) + ' weights. Provided weights: ' +
+                       str(weights)[:50] + '...')
+    if not params:
+      return
+    weight_value_tuples = []
+    param_values = backend.batch_get_value(params)
+    for pv, p, w in zip(param_values, params, weights):
+      if pv.shape != w.shape:
+        raise ValueError('Layer weight shape ' + str(pv.shape) +
+                         ' not compatible with '
+                         'provided weight shape ' + str(w.shape))
+      weight_value_tuples.append((p, w))
+    backend.batch_set_value(weight_value_tuples)
 
-    Arguments:
-        inputs: Tensor or list of tensors.
-        mask: Tensor or list of tensors.
+  def get_weights(self):
+    """Returns the current weights of the layer.
 
     Returns:
-        None or a tensor (or list of tensors,
-            one per output tensor of the layer).
+        Weights values as a list of numpy arrays.
     """
-    if not self.supports_masking:
-      if mask is not None:
-        if isinstance(mask, list):
-          if any(m is not None for m in mask):
-            raise TypeError('Layer ' + self.name + ' does not support masking, '
-                            'but was passed an input_mask: ' + str(mask))
-        else:
-          raise TypeError('Layer ' + self.name + ' does not support masking, '
-                          'but was passed an input_mask: ' + str(mask))
-      # masking not explicitly supported: return None as mask
-      return None
-    # if masking is explicitly supported, by default
-    # carry over the input mask
-    return mask
+    params = self.weights
+    return backend.batch_get_value(params)
 
-  def _add_inbound_node(self,
-                        input_tensors,
-                        output_tensors,
-                        arguments=None):
-    """Internal method to create an inbound node for the layer.
+  def get_updates_for(self, inputs):
+    """Retrieves updates relevant to a specific set of inputs.
 
     Arguments:
-        input_tensors: list of input tensors.
-        output_tensors: list of output tensors.
-        arguments: dictionary of keyword arguments that were passed to the
-            `call` method of the layer at the call that created the node.
-    """
-    input_tensors = nest.flatten(input_tensors)
-    output_tensors = nest.flatten(output_tensors)
+      inputs: Input tensor or list/tuple of input tensors.
 
-    # Collect input tensor(s) coordinates.
-    inbound_layers = []
-    node_indices = []
-    tensor_indices = []
-    for x in input_tensors:
-      assert hasattr(x, '_keras_history')
-      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      inbound_layers.append(inbound_layer)
-      node_indices.append(node_index)
-      tensor_indices.append(tensor_index)
+    Returns:
+      List of update ops of the layer that depend on `inputs`.
 
-    # Create node, add it to inbound nodes.
-    Node(
-        self,
-        inbound_layers=inbound_layers,
-        node_indices=node_indices,
-        tensor_indices=tensor_indices,
-        input_tensors=input_tensors,
-        output_tensors=output_tensors,
-        arguments=arguments)
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    # Updates disabled if layer is not trainable and not explicitly stateful.
+    if not self.trainable and not self.stateful:
+      return []
 
-    # Update tensor history metadata.
-    for i in range(len(output_tensors)):
-      # The metadata attribute consists of 1) a layer instance
-      # 2) a node index for the layer, 3) a tensor index for the node.
-      # The allows layer reuse (multiple nodes per layer) and multi-output
-      # or multi-input layers (e.g. a layer can return multiple tensors,
-      # and each can be sent to a different layer).
-      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
+    if inputs is None:
+      # Requesting unconditional updates.
+      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
 
-  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
-    """Private utility to retrieves an attribute (e.g. inputs) from a node.
+    # Requesting input-conditional updates.
+    inputs = nest.flatten(inputs)
+    reachable = tf_utils.get_reachable_from_inputs(inputs, self.updates)
+    updates = []
+    for update in self.updates:
+      if update in reachable:
+        updates.append(update)
+    return updates
 
-    This is used to implement the methods:
-        - get_input_shape_at
-        - get_output_shape_at
-        - get_input_at
-        etc...
+  def get_losses_for(self, inputs):
+    """Retrieves losses relevant to a specific set of inputs.
 
     Arguments:
-        node_index: Integer index of the node from which
-            to retrieve the attribute.
-        attr: Exact node attribute name.
-        attr_name: Human-readable attribute name, for error messages.
+      inputs: Input tensor or list/tuple of input tensors.
 
     Returns:
-        The layer's attribute `attr` at the node of index `node_index`.
+      List of loss tensors of the layer that depend on `inputs`.
 
     Raises:
-        RuntimeError: If the layer has no inbound nodes, or if called in Eager
-        mode.
-        ValueError: If the index provided does not match any node.
+      RuntimeError: If called in Eager mode.
     """
-    if not self._inbound_nodes:
-      raise RuntimeError('The layer has never been called '
-                         'and thus has no defined ' + attr_name + '.')
-    if not len(self._inbound_nodes) > node_index:
-      raise ValueError('Asked to get ' + attr_name + ' at node ' +
-                       str(node_index) + ', but the layer has only ' +
-                       str(len(self._inbound_nodes)) + ' inbound nodes.')
-    values = getattr(self._inbound_nodes[node_index], attr)
-    if len(values) == 1:
-      return values[0]
-    else:
-      return values
+    if inputs is None:
+      # Requesting unconditional losses.
+      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+
+    # Requesting input-conditional losses.
+    inputs = nest.flatten(inputs)
+    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
+    # The losses we want to return will be part of this set.
+    # To avoid unnecessary work, we stop the search in case all of
+    # `self.losses` have been retrieved.
+    reachable = tf_utils.get_reachable_from_inputs(inputs, self.losses)
+    losses = []
+    for loss in self.losses:
+      if loss in reachable:
+        losses.append(loss)
+    return losses
 
   def get_input_mask_at(self, node_index):
     """Retrieves the input mask tensor(s) of a layer at a given node.
@@ -1375,8 +1144,7 @@ class Layer(checkpointable.CheckpointableBase):
                          ', but the layer isn\'t built. '
                          'You can build it manually via: `' + self.name +
                          '.build(batch_input_shape)`.')
-    weight_shapes = [w.shape.as_list() for w in self.weights]
-    return int(sum([np.prod(w) for w in weight_shapes]))
+    return int(sum(np.prod(w.shape.as_list()) for w in self.weights))
 
   @property
   def output_shape(self):
@@ -1428,230 +1196,400 @@ class Layer(checkpointable.CheckpointableBase):
     """Deprecated, do NOT use! Only for compatibility with external Keras."""
     return self._outbound_nodes
 
-  def _assert_input_compatibility(self, inputs):
-    """Checks compatibility between the layer and provided inputs.
+  ##############################################################################
+  # Methods & attributes below are public aliases of other methods.            #
+  ##############################################################################
+
+  def apply(self, inputs, *args, **kwargs):
+    """Apply the layer on a input.
+
+    This is an alias of `self.__call__`.
+
+    Arguments:
+      inputs: Input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+    """
+    return self.__call__(inputs, *args, **kwargs)
+
+  @doc_controls.for_subclass_implementers
+  def add_variable(self, *args, **kwargs):
+    """Alias for `add_weight`."""
+    return self.add_weight(*args, **kwargs)
+
+  @property
+  def variables(self):
+    """Returns the list of all layer variables/weights.
+
+    Alias of `self.weights`.
+
+    Returns:
+      A list of variables.
+    """
+    return self.weights
+
+  @property
+  def trainable_variables(self):
+    return self.trainable_weights
+
+  @property
+  def non_trainable_variables(self):
+    return self.non_trainable_weights
+
+  ##############################################################################
+  # Methods & attributes below are all private and only used by the framework. #
+  ##############################################################################
+
+  def _name_scope(self):
+    return self.name
+
+  def _init_set_name(self, name, zero_based=True):
+    if not name:
+      self._name = base_layer_utils.unique_layer_name(
+          generic_utils.to_snake_case(self.__class__.__name__),
+          zero_based=zero_based)
+    else:
+      self._name = name
+
+  def _get_existing_metric(self, name=None):
+    match = [m for m in self._metrics if m.name == name]
+    if not match:
+      return
+    if len(match) > 1:
+      raise ValueError(
+          'Please provide different names for the metrics you have added. '
+          'We found {} metrics with the name: "{}"'.format(len(match), name))
+    return match[0]
+
+  def _eager_add_metric(self, value, aggregation=None, name=None):
+    # If the given metric is available in `metrics` list we just update state
+    # on it, otherwise we create a new metric instance and
+    # add it to the `metrics` list.
+    match = self._get_existing_metric(name)
+    if match:
+      match(value)  # Update the metric state.
+      return
+    else:
+      if aggregation is None:
+        raise ValueError('We do not support adding an aggregated metric tensor '
+                         'in `call` in eager execution.')
+      metric_obj, _ = base_layer_utils.create_mean_metric(value, name)
+      self._metrics.append(metric_obj)
+
+  def _symbolic_add_metric(self, value, aggregation=None, name=None):
+    if aggregation is None:
+      # Iterate over the metrics and check if the given metric exists already.
+      # This can happen when a metric instance is created in subclassed model
+      # layer `__init__` and we have tracked that instance already in
+      # model.__setattr__.
+      match = self._get_existing_metric(name)
+      if match:
+        result_tensor = value
+        if match.name not in self._metrics_tensors:
+          self._metrics_tensors[match.name] = result_tensor
+          return
+        else:
+          raise ValueError(
+              'We currently do not support reusing a metric instance.')
+      else:
+        # We track the instance using the metadata on the result tensor.
+        result_tensor = value
+        metric_obj = result_tensor._metric_obj
+    else:
+      # If a non-aggregated tensor is given as input (ie. `aggregation` is
+      # explicitly set to `mean`), we wrap the tensor in `Mean` metric.
+      metric_obj, result_tensor = base_layer_utils.create_mean_metric(
+          value, name)
+    self._metrics.append(metric_obj)
+    self._metrics_tensors[metric_obj.name] = result_tensor
+
+  def _handle_weight_regularization(self, name, variable, regularizer):
+    """Create lambdas which compute regularization losses."""
+
+    def _loss_for_variable(v):
+      """Creates a regularization loss `Tensor` for variable `v`."""
+      with ops.name_scope(name + '/Regularizer'):
+        regularization = regularizer(v)
+      return regularization
+
+    if isinstance(variable, tf_variables.PartitionedVariable):
+      for v in variable:
+        self.add_loss(functools.partial(_loss_for_variable, v))
+    else:
+      self.add_loss(functools.partial(_loss_for_variable, variable))
+
+  def _handle_activity_regularization(self, inputs, outputs):
+    # Apply activity regularization.
+    # Note that it should be applied every time the layer creates a new
+    # output, since it is output-specific.
+    if self._activity_regularizer:
+      output_list = nest.flatten(outputs)
+      with ops.name_scope('ActivityRegularizer'):
+        for output in output_list:
+          activity_loss = self._activity_regularizer(output)
+          batch_size = math_ops.cast(
+              array_ops.shape(output)[0], activity_loss.dtype)
+          # Make activity regularization strength batch-agnostic.
+          mean_activity_loss = activity_loss / batch_size
+          self.add_loss(mean_activity_loss, inputs=inputs)
+
+  def _set_mask_metadata(self, inputs, outputs, previous_mask):
+    # In some cases the mask of the outputs has already been computed by
+    # inner layers and does not need to be recomputed by this layer.
+    mask_already_computed = all(
+        hasattr(x, '_keras_mask') for x in generic_utils.to_list(outputs))
+    if hasattr(self, 'compute_mask') and not mask_already_computed:
+      output_mask = self.compute_mask(inputs, previous_mask)
+    else:
+      output_mask = None
+    if isinstance(outputs, (list, tuple)):
+      if output_mask is None:
+        output_mask = [None for _ in range(len(outputs))]
+      for x, m in zip(outputs, output_mask):
+        try:
+          x._keras_mask = m  # pylint: disable=protected-access
+        except AttributeError:
+          pass  # C type such as dict. Masking not supported in this case.
+    else:
+      try:
+        outputs._keras_mask = output_mask  # pylint: disable=protected-access
+      except AttributeError:
+        pass  # C type such as dict. Masking not supported in this case.
+
+  def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
+    call_convention = getattr(
+        self, '_call_convention',
+        base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT)
+    if args:
+      if call_convention == (base_layer_utils
+                             .CallConvention.EXPLICIT_INPUTS_ARGUMENT):
+        raise TypeError(
+            'This layer ("{}") takes an `inputs` argument in `call()`, '
+            'and only the `inputs` argument may be specified as a positional '
+            'argument. Pass everything else as a keyword argument '
+            '(those arguments will not be tracked '
+            'as inputs to the layer).'.format(self.name))
+      elif call_convention == (base_layer_utils
+                               .CallConvention.SINGLE_POSITIONAL_ARGUMENT):
+        raise TypeError(
+            'This layer ("{}") takes a single positional argument in `call()`,'
+            ' which is by convention the `inputs` argument, '
+            'and only this argument may be specified as a positional argument. '
+            'Pass everything else as a keyword argument '
+            '(those arguments will not be tracked '
+            'as inputs to the layer).'.format(self.name))
+
+    # If the layer returns tensors from its inputs, unmodified,
+    # we copy them to avoid loss of tensor metadata.
+    output_ls = nest.flatten(outputs)
+    output_ls_copy = []
+    for x in output_ls:
+      if x in nest.flatten(inputs):
+        with ops.name_scope(self.name):
+          x = array_ops.identity(x)
+      output_ls_copy.append(x)
+    if len(output_ls_copy) == 1:
+      outputs = output_ls_copy[0]
+    else:
+      outputs = output_ls_copy
+
+    inputs, kwargs = self._inputs_from_call_args(
+        call_args=(inputs,) + args, call_kwargs=kwargs)
+    # Add an inbound node to the layer, so it can keep track of this call.
+    # This updates the layer history of the output tensor(s).
+    kwargs.pop('mask', None)  # `mask` should not be serialized.
+    self._add_inbound_node(
+        input_tensors=inputs, output_tensors=outputs, arguments=kwargs)
+    return inputs, outputs
 
-    This checks that the tensor(s) `inputs` verify the input assumptions
-    of the layer (if any). If not, a clear and actional exception gets raised.
+  def _inputs_from_call_args(self, call_args, call_kwargs):
+    """Get Layer inputs from __call__ *args and **kwargs.
 
-    Arguments:
-        inputs: input tensor or list of input tensors.
+    Args:
+      call_args: The positional arguments passed to __call__.
+      call_kwargs: The keyword argument dict passed to __call__.
 
-    Raises:
-        ValueError: in case of mismatch between
-            the provided inputs and the expectations of the layer.
+    Returns:
+      A tuple of (inputs, non_input_kwargs). These may be the same objects as
+      were passed in (call_args and call_kwargs).
     """
-    if not self.input_spec:
-      return
-    if not isinstance(self.input_spec, (list, tuple)):
-      input_spec = nest.flatten(self.input_spec)
+    call_convention = getattr(
+        self, '_call_convention',
+        base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT)
+    if (call_convention in (
+        base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT,
+        base_layer_utils.CallConvention.SINGLE_POSITIONAL_ARGUMENT)):
+      assert len(call_args) == 1  # TypeError raised earlier in __call__.
+      return call_args[0], call_kwargs
     else:
-      input_spec = self.input_spec
-    inputs = nest.flatten(inputs)
-    if len(inputs) != len(input_spec):
-      raise ValueError('Layer ' + self.name + ' expects ' +
-                       str(len(input_spec)) + ' inputs, '
-                       'but it received ' + str(len(inputs)) +
-                       ' input tensors. Inputs received: ' + str(inputs))
-    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
-      if spec is None:
-        continue
-
-      if (spec.ndim is not None or
-          spec.min_ndim is not None or
-          spec.max_ndim is not None):
-        if x.shape.ndims is None:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'its rank is undefined, but the layer requires a '
-                           'defined rank.')
-
-      # Check ndim.
-      if spec.ndim is not None:
-        ndim = x.shape.ndims
-        if ndim != spec.ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
-                           str(ndim) + '. Full shape received: ' +
-                           str(x.shape.as_list()))
-      if spec.max_ndim is not None:
-        ndim = x.shape.ndims
-        if ndim is not None and ndim > spec.max_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected max_ndim=' + str(spec.max_ndim) +
-                           ', found ndim=' + str(ndim))
-      if spec.min_ndim is not None:
-        ndim = x.shape.ndims
-        if ndim is not None and ndim < spec.min_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           ': expected min_ndim=' + str(spec.min_ndim) +
-                           ', found ndim=' + str(ndim) +
-                           '. Full shape received: ' +
-                           str(x.shape.as_list()))
-      # Check dtype.
-      if spec.dtype is not None:
-        if x.dtype != spec.dtype:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected dtype=' + str(spec.dtype) +
-                           ', found dtype=' + str(x.dtype))
-      # Check specific shape axes.
-      if spec.axes:
-        shape = x.shape.as_list()
-        if shape is not None:
-          for axis, value in spec.axes.items():
-            if hasattr(value, 'value'):
-              value = value.value
-            if value is not None and shape[int(axis)] not in {value, None}:
-              raise ValueError(
-                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
-                  ' incompatible with the layer: expected axis ' + str(axis) +
-                  ' of input shape to have value ' + str(value) +
-                  ' but received input with shape ' + str(shape))
-      # Check shape.
-      if spec.shape is not None:
-        shape = x.shape.as_list()
-        if shape is not None:
-          for spec_dim, dim in zip(spec.shape, shape):
-            if spec_dim is not None and dim is not None:
-              if spec_dim != dim:
-                raise ValueError('Input ' + str(input_index) +
-                                 ' is incompatible with layer ' + self.name +
-                                 ': expected shape=' + str(spec.shape) +
-                                 ', found shape=' + str(shape))
+      call_arg_spec = tf_inspect.getfullargspec(self.call)
+      # There is no explicit "inputs" argument expected or provided to
+      # call(). Arguments which have default values are considered non-inputs,
+      # and arguments without are considered inputs.
+      if call_arg_spec.defaults:
+        if call_arg_spec.varargs is not None:
+          raise TypeError(
+              'Layers may not accept both positional arguments and '
+              'arguments with default values (unable to determine which '
+              'are inputs to the layer). '
+              'Issue occurred with layer "%s"' % (self.name))
+        keyword_arg_names = set(
+            call_arg_spec.args[-len(call_arg_spec.defaults):])
+      else:
+        keyword_arg_names = set()
+        # Training is never an input argument name, to allow signatures like
+        # call(x, training).
+      keyword_arg_names.add('training')
+      _, unwrapped_call = tf_decorator.unwrap(self.call)
+      bound_args = inspect.getcallargs(
+          unwrapped_call, *call_args, **call_kwargs)
+      if call_arg_spec.varkw is not None:
+        var_kwargs = bound_args.pop(call_arg_spec.varkw)
+        bound_args.update(var_kwargs)
+        keyword_arg_names = keyword_arg_names.union(var_kwargs.keys())
+      all_args = call_arg_spec.args
+      if all_args and bound_args[all_args[0]] is self:
+        # Ignore the 'self' argument of methods
+        bound_args.pop(call_arg_spec.args[0])
+        all_args = all_args[1:]
+      non_input_arg_values = {}
+      input_arg_values = []
+      remaining_args_are_keyword = False
+      for argument_name in all_args:
+        if argument_name in keyword_arg_names:
+          remaining_args_are_keyword = True
+        else:
+          if remaining_args_are_keyword:
+            raise TypeError(
+                'Found a positional argument in a layer call after a non-input '
+                'argument. All arguments after "training" must be keyword '
+                'arguments, and are not tracked as inputs to the layer. '
+                'Issue occurred with layer "%s"' % (self.name))
+        if remaining_args_are_keyword:
+          non_input_arg_values[argument_name] = bound_args[argument_name]
+        else:
+          input_arg_values.append(bound_args[argument_name])
+      if call_arg_spec.varargs is not None:
+        input_arg_values.extend(bound_args[call_arg_spec.varargs])
+      return input_arg_values, non_input_arg_values
 
-  def set_weights(self, weights):
-    """Sets the weights of the layer, from Numpy arrays.
+  def _add_inbound_node(self,
+                        input_tensors,
+                        output_tensors,
+                        arguments=None):
+    """Internal method to create an inbound node for the layer.
 
     Arguments:
-        weights: a list of Numpy arrays. The number
-            of arrays and their shape must match
-            number of the dimensions of the weights
-            of the layer (i.e. it should match the
-            output of `get_weights`).
-
-    Raises:
-        ValueError: If the provided weights list does not match the
-            layer's specifications.
-    """
-    params = self.weights
-    if len(params) != len(weights):
-      raise ValueError('You called `set_weights(weights)` on layer "' +
-                       self.name + '" with a  weight list of length ' +
-                       str(len(weights)) + ', but the layer was expecting ' +
-                       str(len(params)) + ' weights. Provided weights: ' +
-                       str(weights)[:50] + '...')
-    if not params:
-      return
-    weight_value_tuples = []
-    param_values = backend.batch_get_value(params)
-    for pv, p, w in zip(param_values, params, weights):
-      if pv.shape != w.shape:
-        raise ValueError('Layer weight shape ' + str(pv.shape) +
-                         ' not compatible with '
-                         'provided weight shape ' + str(w.shape))
-      weight_value_tuples.append((p, w))
-    backend.batch_set_value(weight_value_tuples)
-
-  def get_weights(self):
-    """Returns the current weights of the layer.
-
-    Returns:
-        Weights values as a list of numpy arrays.
+        input_tensors: list of input tensors.
+        output_tensors: list of output tensors.
+        arguments: dictionary of keyword arguments that were passed to the
+            `call` method of the layer at the call that created the node.
     """
-    params = self.weights
-    return backend.batch_get_value(params)
-
-  def get_config(self):
-    """Returns the config of the layer.
+    input_tensors = nest.flatten(input_tensors)
+    output_tensors = nest.flatten(output_tensors)
 
-    A layer config is a Python dictionary (serializable)
-    containing the configuration of a layer.
-    The same layer can be reinstantiated later
-    (without its trained weights) from this configuration.
+    # Collect input tensor(s) coordinates.
+    inbound_layers = []
+    node_indices = []
+    tensor_indices = []
+    for x in input_tensors:
+      assert hasattr(x, '_keras_history')
+      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      inbound_layers.append(inbound_layer)
+      node_indices.append(node_index)
+      tensor_indices.append(tensor_index)
 
-    The config of a layer does not include connectivity
-    information, nor the layer class name. These are handled
-    by `Network` (one layer of abstraction above).
+    # Create node, add it to inbound nodes.
+    Node(
+        self,
+        inbound_layers=inbound_layers,
+        node_indices=node_indices,
+        tensor_indices=tensor_indices,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        arguments=arguments)
 
-    Returns:
-        Python dictionary.
-    """
-    config = {'name': self.name, 'trainable': self.trainable}
-    if hasattr(self, '_batch_input_shape'):
-      config['batch_input_shape'] = self._batch_input_shape
-    if hasattr(self, 'dtype'):
-      config['dtype'] = self.dtype
-    return config
+    # Update tensor history metadata.
+    for i in range(len(output_tensors)):
+      # The metadata attribute consists of 1) a layer instance
+      # 2) a node index for the layer, 3) a tensor index for the node.
+      # The allows layer reuse (multiple nodes per layer) and multi-output
+      # or multi-input layers (e.g. a layer can return multiple tensors,
+      # and each can be sent to a different layer).
+      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
 
-  @classmethod
-  def from_config(cls, config):
-    """Creates a layer from its config.
+  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
+    """Private utility to retrieves an attribute (e.g. inputs) from a node.
 
-    This method is the reverse of `get_config`,
-    capable of instantiating the same layer from the config
-    dictionary. It does not handle layer connectivity
-    (handled by Network), nor weights (handled by `set_weights`).
+    This is used to implement the methods:
+        - get_input_shape_at
+        - get_output_shape_at
+        - get_input_at
+        etc...
 
     Arguments:
-        config: A Python dictionary, typically the
-            output of get_config.
+        node_index: Integer index of the node from which
+            to retrieve the attribute.
+        attr: Exact node attribute name.
+        attr_name: Human-readable attribute name, for error messages.
 
     Returns:
-        A layer instance.
-    """
-    return cls(**config)
-
-
-@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
-class InputSpec(object):
-  """Specifies the ndim, dtype and shape of every input to a layer.
+        The layer's attribute `attr` at the node of index `node_index`.
 
-  Every layer should expose (if appropriate) an `input_spec` attribute:
-  a list of instances of InputSpec (one per input tensor).
+    Raises:
+        RuntimeError: If the layer has no inbound nodes, or if called in Eager
+        mode.
+        ValueError: If the index provided does not match any node.
+    """
+    if not self._inbound_nodes:
+      raise RuntimeError('The layer has never been called '
+                         'and thus has no defined ' + attr_name + '.')
+    if not len(self._inbound_nodes) > node_index:
+      raise ValueError('Asked to get ' + attr_name + ' at node ' +
+                       str(node_index) + ', but the layer has only ' +
+                       str(len(self._inbound_nodes)) + ' inbound nodes.')
+    values = getattr(self._inbound_nodes[node_index], attr)
+    if len(values) == 1:
+      return values[0]
+    else:
+      return values
 
-  A None entry in a shape is compatible with any dimension,
-  a None shape is compatible with any shape.
+  @property
+  def _static_graph_friendly(self):
+    """Whether the layer can be called to create a static graph.
 
-  Arguments:
-      dtype: Expected DataType of the input.
-      shape: Shape tuple, expected shape of the input
-          (may include None for unchecked axes).
-      ndim: Integer, expected rank of the input.
-      max_ndim: Integer, maximum rank of the input.
-      min_ndim: Integer, minimum rank of the input.
-      axes: Dictionary mapping integer axes to
-          a specific dimension value.
-  """
+    Because of nesting, there are two components to being "graph-friendly":
+      1) all inner layers are graph-friendly
+      2) the way they are composed is graph-friendly.
+    We denote the latter as "_call_is_graph_friendly", and define
+    "_static_graph_friendly" as being the combination of
+    "_call_is_graph_friendly" and "all inner layers are _static_graph_friendly".
+    For atomic layers (no inner layers), this is just "_call_is_graph_friendly".
 
-  def __init__(self,
-               dtype=None,
-               shape=None,
-               ndim=None,
-               max_ndim=None,
-               min_ndim=None,
-               axes=None):
-    self.dtype = dtype
-    self.shape = shape
-    if shape is not None:
-      self.ndim = len(shape)
-    else:
-      self.ndim = ndim
-    self.max_ndim = max_ndim
-    self.min_ndim = min_ndim
-    self.axes = axes or {}
+    Returns:
+      Boolean.
+    """
+    return self._call_is_graph_friendly
 
-  def __repr__(self):
-    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
-            ('shape=' + str(self.shape)) if self.shape else '',
-            ('ndim=' + str(self.ndim)) if self.ndim else '',
-            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
-            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
-            ('axes=' + str(self.axes)) if self.axes else '']
-    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+  def _maybe_build(self, inputs):
+    # Check input assumptions set before layer building, e.g. input rank.
+    input_spec.assert_input_compatibility(
+        self.input_spec, inputs, self.name)
+    input_list = nest.flatten(inputs)
+    if input_list and self._dtype is None:
+      try:
+        self._dtype = input_list[0].dtype.base_dtype.name
+      except AttributeError:
+        pass
+    input_shapes = None
+    if all(hasattr(x, 'shape') for x in input_list):
+      input_shapes = nest.map_structure(lambda x: x.shape, inputs)
+    # Only call `build` if the user has manually overridden the build method.
+    if not hasattr(self.build, '_is_default'):
+      self.build(input_shapes)
 
 
 class Node(object):
@@ -1766,223 +1704,12 @@ class Node(object):
     }
 
 
-class DeferredTensor(object):
-  """Tensor-like object used to build graphs of layers in Eager mode.
-
-  When calling a layer on a DeferredTensor, the layer will not perform any
-  computation and will simply perform shape inference to return new
-  DeferredTensors with appropriate shape information. Thus DeferredTensor
-  behaves like a graph-mode Tensor when manipulated by layers.
-  """
-
-  def __init__(self, shape, dtype, name=None):
-    self.shape = tensor_shape.TensorShape(shape)
-    if dtype is None:
-      self.dtype = dtypes.as_dtype(np.float32)
-    else:
-      self.dtype = dtypes.as_dtype(dtype)
-    self.name = name
-
-  def get_shape(self):
-    return self.shape
-
-  def __str__(self):
-    return "DeferredTensor('%s', shape=%s, dtype=%s)" % (self.name,
-                                                         self.shape,
-                                                         self.dtype.name)
-
-  def __repr__(self):
-    return "<DeferredTensor '%s' shape=%s dtype=%s>" % (self.name,
-                                                        self.shape,
-                                                        self.dtype.name)
-
-
-def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
-                      zero_based=False):
-  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
-
-  Arguments:
-    name: String name to make unique.
-    name_uid_map: An optional defaultdict(int) to use when creating unique
-      names. If None (default), uses a per-Graph dictionary.
-    avoid_names: An optional set or dict with names which should not be used. If
-      None (default) does not avoid any names.
-    namespace: Gets a name which is unique within the (graph, namespace). Layers
-      which are not Networks use a blank namespace and so get graph-global
-      names.
-    zero_based: If True, name sequences start with no suffix (e.g. "dense",
-      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
-
-  Returns:
-    Unique string name.
-
-  Example:
-
-  ```python
-  _unique_layer_name('dense')  # dense_1
-  _unique_layer_name('dense')  # dense_2
-  ```
-  """
-  if name_uid_map is None:
-    name_uid_map = get_default_graph_uid_map()
-  if avoid_names is None:
-    avoid_names = set()
-  proposed_name = None
-  while proposed_name is None or proposed_name in avoid_names:
-    name_key = (namespace, name)
-    if zero_based:
-      number = name_uid_map[name_key]
-      if number:
-        proposed_name = name + '_' + str(number)
-      else:
-        proposed_name = name
-      name_uid_map[name_key] += 1
-    else:
-      name_uid_map[name_key] += 1
-      proposed_name = name + '_' + str(name_uid_map[name_key])
-  return proposed_name
-
-
-def have_all_keras_metadata(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  return all([hasattr(x, '_keras_history') for x in iterable])
-
-
-def collect_previous_mask(input_tensors):
-  """Retrieves the output mask(s) of the previous node.
-
-  Arguments:
-      input_tensors: A tensor or list of tensors.
-
-  Returns:
-      A mask tensor or list of mask tensors.
-  """
-  input_tensors = nest.flatten(input_tensors)
-  masks = []
-  for x in input_tensors:
-    if hasattr(x, '_keras_mask'):
-      mask = x._keras_mask  # pylint: disable=protected-access
-      masks.append(mask)
-    else:
-      masks.append(None)
-  if len(masks) == 1:
-    return masks[0]
-  return masks
-
-
-def get_default_graph_uid_map():
-  # TODO(fchollet): refactor this into backend.
-  graph = ops.get_default_graph()
-  name_uid_map = backend.PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
-  if name_uid_map is None:
-    name_uid_map = collections_lib.defaultdict(int)
-    backend.PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
-  return name_uid_map
-
-
-def make_variable(name,
-                  shape=None,
-                  dtype=dtypes.float32,
-                  initializer=None,
-                  partition_info=None,
-                  trainable=None,
-                  caching_device=None,
-                  validate_shape=True,
-                  constraint=None,
-                  use_resource=None,
-                  collections=None,
-                  synchronization=tf_variables.VariableSynchronization.AUTO,
-                  aggregation=tf_variables.VariableAggregation.NONE,
-                  partitioner=None):  # pylint: disable=unused-argument
-  """Temporary util to create a variable (relies on `variable_scope.variable`).
-
-  Some reuse-related technicalities prevent us from using
-  `variable_scope.get_variable()` directly, so we use a subcomponent
-  that has fewer constraints (`variable_scope.variable()`).
-
-  In the longer term, it seems like a similar "default variable creator" method
-  should exist in `CheckpointableBase` instead. When this happens, we can get
-  rid of this temporary solution.
-
-  TODO(fchollet): remove this method when no longer needed.
-  TODO(fchollet): handle `partitioner` argument.
-
-  Arguments:
-    name: Variable name.
-    shape: Variable shape.
-    dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
-    initializer: Initializer instance (callable).
-    partition_info: Not handled at this time.
-    trainable: Whether the variable should be part of the layer's
-      "trainable_variables" (e.g. variables, biases)
-      or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
-      Note, if the current variable scope is marked as non-trainable
-      then this parameter is ignored and any added variables are also
-      marked as non-trainable. `trainable` defaults to `True` unless
-      `synchronization` is set to `ON_READ`.
-    caching_device: Passed to `tf.Variable`.
-    validate_shape: Passed to `tf.Variable`.
-    constraint: Constraint instance (callable).
-    use_resource: Whether to use a `ResourceVariable`.
-    collections: List of graph collections keys. The new variable is added to
-      these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
-    synchronization: Indicates when a distributed a variable will be
-      aggregated. Accepted values are constants defined in the class
-      `tf.VariableSynchronization`. By default the synchronization is set to
-      `AUTO` and the current `DistributionStrategy` chooses
-      when to synchronize. If `synchronization` is set to `ON_READ`,
-      `trainable` must not be set to `True`.
-    aggregation: Indicates how a distributed variable will be aggregated.
-      Accepted values are constants defined in the class
-      `tf.VariableAggregation`.
-    partitioner: Not handled at this time.
-
-  Returns:
-    Variable instance.
-  """
-  initializing_from_value = False
-  if initializer is not None and not callable(initializer):
-    initializing_from_value = True
-
-  with ops.init_scope():
-    if initializing_from_value:
-      init_val = initializer
-      variable_dtype = None
-    else:
-      # Instantiate initializer if provided initializer is a type object.
-      if isinstance(initializer, type(init_ops.Initializer)):
-        initializer = initializer(dtype=dtype)
-      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
-          shape, dtype=dtype, partition_info=partition_info)
-      variable_dtype = dtype.base_dtype
-  if use_resource is None:
-    use_resource = True
-
-  # TODO(apassos,rohanj) figure out how to remove collections from here so we
-  # can remove the V1.
-  v = tf_variables.VariableV1(
-      initial_value=init_val,
-      name=name,
-      trainable=trainable,
-      caching_device=caching_device,
-      dtype=variable_dtype,
-      validate_shape=validate_shape,
-      constraint=constraint,
-      use_resource=use_resource,
-      collections=collections,
-      synchronization=synchronization,
-      aggregation=aggregation)
-  return v
-
-
 def default(method):
   """Decorates a method to detect overrides in subclasses."""
   method._is_default = True
   return method
 
 
-def generate_placeholders_from_shape(shape):
-  return array_ops.placeholder(shape=shape, dtype=backend.floatx())
+# Avoid breaking users who directly import this symbol from this file.
+# TODO(fchollet): remove this.
+InputSpec = input_spec.InputSpec  # pylint:disable=invalid-name
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..798775b6a5b29aa72a2c766584811aa469db2471
--- /dev/null
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -0,0 +1,187 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TensorFlow 2.0 layer behavior."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+class DynamicLayer1(base_layer.Layer):
+
+  def call(self, inputs):
+    if math_ops.reduce_sum(inputs) > 0:
+      return math_ops.sqrt(inputs)
+    else:
+      return math_ops.square(inputs)
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
+
+class DynamicLayer2(base_layer.Layer):
+
+  def call(self, inputs):
+    samples = []
+    for sample in inputs:
+      samples.append(math_ops.square(sample))
+    return array_ops.stack(samples, axis=0)
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
+
+class InvalidLayer(base_layer.Layer):
+
+  def call(self, inputs):
+    raise ValueError('You did something wrong!')
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
+
+class BaseLayerTest(test.TestCase):
+
+  def test_dynamic_layer_in_functional_model_in_graph_mode(self):
+    with context.graph_mode():
+      inputs = keras.Input((3,))
+      with self.assertRaisesRegexp(
+          TypeError, 'Using a `tf.Tensor` as a Python `bool` is not allowed'):
+        _ = DynamicLayer1()(inputs)
+
+      inputs = keras.Input((3,))
+      with self.assertRaisesRegexp(
+          TypeError, 'Tensor objects are only iterable when eager'):
+        _ = DynamicLayer2()(inputs)
+
+  def test_dynamic_layer_in_functional_model_in_eager_mode(self):
+    inputs = keras.Input((3,))
+    outputs = DynamicLayer1()(inputs)
+    model = keras.Model(inputs, outputs)
+    self.assertEqual(model._static_graph_friendly, False)
+    model.compile(RMSPropOptimizer(0.001), loss='mse')
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+    inputs = keras.Input((3,))
+    outputs = DynamicLayer2()(inputs)
+    model = keras.Model(inputs, outputs)
+    self.assertEqual(model._static_graph_friendly, False)
+    model.compile(RMSPropOptimizer(0.001), loss='mse')
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+  def nested_dynamic_layers_in_eager_mode(self):
+    inputs = keras.Input((3,))
+    outputs = DynamicLayer1()(inputs)
+    inner_model = keras.Model(inputs, outputs)
+
+    inputs = keras.Input((3,))
+    x = DynamicLayer2()(inputs)
+    outputs = inner_model(x)
+
+    model = keras.Model(inputs, outputs)
+    self.assertEqual(model._static_graph_friendly, False)
+    model.compile(RMSPropOptimizer(0.001), loss='mse')
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+  def test_invalid_forward_pass_in_graph_mode(self):
+    with context.graph_mode():
+      inputs = keras.Input((3,))
+      with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
+        _ = InvalidLayer()(inputs)
+
+  def test_invalid_forward_pass_in_eager_mode(self):
+    inputs = keras.Input((3,))
+    outputs = InvalidLayer()(inputs)
+    model = keras.Model(inputs, outputs)
+    self.assertEqual(model._static_graph_friendly, False)
+    model.compile(RMSPropOptimizer(0.001), loss='mse')
+    with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
+      model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+  def test_using_symbolic_tensors_with_tf_ops(self):
+    # Single-input.
+    x = keras.Input((3,))
+    y = math_ops.square(x)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+
+    # Multi-inputs.
+    x1, x2 = keras.Input((3,)), keras.Input((3,))
+    y = array_ops.concat([x1, x2], axis=1)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+
+    # Mixing Keras symbolic tensors and graph tensors from the same graph works.
+    with keras.backend.get_graph().as_default():
+      x1 = keras.Input((3,))
+    x2 = keras.Input((3,))
+    y = math_ops.matmul(x1, x2)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+
+    # Creating same op type (matmul) multiple times in the Keras graph works.
+    x1 = keras.Input((3,))
+    x2 = keras.Input((3,))
+    y = math_ops.matmul(x1, x2)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+
+  def test_mixing_eager_and_graph_tensors(self):
+    with ops.Graph().as_default():
+      x1 = array_ops.ones((3, 3))
+    x2 = array_ops.ones((3, 3))
+    self.assertTrue(isinstance(x2, ops.EagerTensor))
+    with self.assertRaisesRegexp(TypeError,
+                                 'provided list of inputs contains '
+                                 'objects other than \'EagerTensor\''):
+      math_ops.matmul(x1, x2)
+
+  def test_mixing_numpy_arrays_and_graph_tensors(self):
+    with ops.Graph().as_default():
+      x1 = array_ops.ones((3, 3))
+    x2 = np.ones((3, 3), dtype='float32')
+    with self.assertRaisesRegexp(TypeError,
+                                 'provided list of inputs contains '
+                                 'objects other than \'EagerTensor\''):
+      math_ops.matmul(x1, x2)
+
+  def test_mixing_keras_symbolic_tensors_and_eager_tensors(self):
+    x1 = keras.Input((3,))
+    x2 = array_ops.ones((3, 3))
+    with self.assertRaisesRegexp(
+        TypeError,
+        'mix computation of symbolic Tensors'):
+      math_ops.matmul(x1, x2)
+
+  def test_mixing_keras_symbolic_tensors_and_numpy_arrays(self):
+    # For the time being we treat Numpy arrays as EagerTensors when mixing both.
+    x1 = keras.Input((3,))
+    x2 = np.ones((3, 3), dtype='float32')
+    with self.assertRaisesRegexp(
+        TypeError,
+        'mix computation of symbolic Tensors'):
+      math_ops.matmul(x1, x2)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2f947f17723fbb01280d7ef09f327dd64fc938e
--- /dev/null
+++ b/tensorflow/python/keras/engine/base_layer_utils.py
@@ -0,0 +1,236 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains private utilities used mainly by the base Layer class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as collections_lib
+import enum
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.util import nest
+
+
+class CallConvention(enum.Enum):
+  """Calling conventions for passing `Layer` inputs to `Layer.call`."""
+  # The Layer takes inputs as its first argument, named "inputs" for
+  # compatibility with the signature of Layer.__call__. This is the mode assumed
+  # for Layers which are not subclassed Models.
+  EXPLICIT_INPUTS_ARGUMENT = 1
+  # The Layer takes a single positional argument, not named "inputs". It's
+  # treated like an "inputs" argument.
+  SINGLE_POSITIONAL_ARGUMENT = 2
+  # The Layer has multiple positional arguments to which its inputs should be
+  # bound.
+  POSITIONAL_ARGUMENTS_ARE_INPUTS = 3
+
+
+def create_mean_metric(value, name=None):
+  # TODO(psv): Remove this import when b/110718070 is fixed.
+  from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
+  metric_obj = metrics_module.Mean(name=name)
+  result = metric_obj(value)
+  return metric_obj, result
+
+
+def make_variable(name,
+                  shape=None,
+                  dtype=dtypes.float32,
+                  initializer=None,
+                  partition_info=None,
+                  trainable=None,
+                  caching_device=None,
+                  validate_shape=True,
+                  constraint=None,
+                  use_resource=None,
+                  collections=None,
+                  synchronization=tf_variables.VariableSynchronization.AUTO,
+                  aggregation=tf_variables.VariableAggregation.NONE,
+                  partitioner=None):  # pylint: disable=unused-argument
+  """Temporary util to create a variable (relies on `variable_scope.variable`).
+
+  Some reuse-related technicalities prevent us from using
+  `variable_scope.get_variable()` directly, so we use a subcomponent
+  that has fewer constraints (`variable_scope.variable()`).
+
+  In the longer term, it seems like a similar "default variable creator" method
+  should exist in `CheckpointableBase` instead. When this happens, we can get
+  rid of this temporary solution.
+
+  TODO(fchollet): remove this method when no longer needed.
+  TODO(fchollet): handle `partitioner` argument.
+
+  Arguments:
+    name: Variable name.
+    shape: Variable shape.
+    dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+    initializer: Initializer instance (callable).
+    partition_info: Not handled at this time.
+    trainable: Whether the variable should be part of the layer's
+      "trainable_variables" (e.g. variables, biases)
+      or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+      Note, if the current variable scope is marked as non-trainable
+      then this parameter is ignored and any added variables are also
+      marked as non-trainable. `trainable` defaults to `True` unless
+      `synchronization` is set to `ON_READ`.
+    caching_device: Passed to `tf.Variable`.
+    validate_shape: Passed to `tf.Variable`.
+    constraint: Constraint instance (callable).
+    use_resource: Whether to use a `ResourceVariable`.
+    collections: List of graph collections keys. The new variable is added to
+      these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
+    synchronization: Indicates when a distributed a variable will be
+      aggregated. Accepted values are constants defined in the class
+      `tf.VariableSynchronization`. By default the synchronization is set to
+      `AUTO` and the current `DistributionStrategy` chooses
+      when to synchronize. If `synchronization` is set to `ON_READ`,
+      `trainable` must not be set to `True`.
+    aggregation: Indicates how a distributed variable will be aggregated.
+      Accepted values are constants defined in the class
+      `tf.VariableAggregation`.
+    partitioner: Not handled at this time.
+
+  Returns:
+    Variable instance.
+  """
+  initializing_from_value = False
+  if initializer is not None and not callable(initializer):
+    initializing_from_value = True
+
+  with ops.init_scope():
+    if initializing_from_value:
+      init_val = initializer
+      variable_dtype = None
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
+          shape, dtype=dtype, partition_info=partition_info)
+      variable_dtype = dtype.base_dtype
+  if use_resource is None:
+    use_resource = True
+
+  # TODO(apassos,rohanj) figure out how to remove collections from here so we
+  # can remove the V1.
+  v = tf_variables.VariableV1(
+      initial_value=init_val,
+      name=name,
+      trainable=trainable,
+      caching_device=caching_device,
+      dtype=variable_dtype,
+      validate_shape=validate_shape,
+      constraint=constraint,
+      use_resource=use_resource,
+      collections=collections,
+      synchronization=synchronization,
+      aggregation=aggregation)
+  return v
+
+
+def get_default_graph_uid_map():
+  # TODO(fchollet): refactor this into backend.
+  graph = ops.get_default_graph()
+  name_uid_map = backend.PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
+  if name_uid_map is None:
+    name_uid_map = collections_lib.defaultdict(int)
+    backend.PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
+  return name_uid_map
+
+
+def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
+                      zero_based=False):
+  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
+
+  Arguments:
+    name: String name to make unique.
+    name_uid_map: An optional defaultdict(int) to use when creating unique
+      names. If None (default), uses a per-Graph dictionary.
+    avoid_names: An optional set or dict with names which should not be used. If
+      None (default) does not avoid any names.
+    namespace: Gets a name which is unique within the (graph, namespace). Layers
+      which are not Networks use a blank namespace and so get graph-global
+      names.
+    zero_based: If True, name sequences start with no suffix (e.g. "dense",
+      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
+
+  Returns:
+    Unique string name.
+
+  Example:
+
+  ```python
+  _unique_layer_name('dense')  # dense_1
+  _unique_layer_name('dense')  # dense_2
+  ```
+  """
+  if name_uid_map is None:
+    name_uid_map = get_default_graph_uid_map()
+  if avoid_names is None:
+    avoid_names = set()
+  proposed_name = None
+  while proposed_name is None or proposed_name in avoid_names:
+    name_key = (namespace, name)
+    if zero_based:
+      number = name_uid_map[name_key]
+      if number:
+        proposed_name = name + '_' + str(number)
+      else:
+        proposed_name = name
+      name_uid_map[name_key] += 1
+    else:
+      name_uid_map[name_key] += 1
+      proposed_name = name + '_' + str(name_uid_map[name_key])
+  return proposed_name
+
+
+def collect_previous_mask(input_tensors):
+  """Retrieves the output mask(s) of the previous node.
+
+  Arguments:
+      input_tensors: A tensor or list of tensors.
+
+  Returns:
+      A mask tensor or list of mask tensors.
+  """
+  input_tensors = nest.flatten(input_tensors)
+  masks = []
+  for x in input_tensors:
+    if hasattr(x, '_keras_mask'):
+      mask = x._keras_mask  # pylint: disable=protected-access
+      masks.append(mask)
+    else:
+      masks.append(None)
+  if len(masks) == 1:
+    return masks[0]
+  return masks
+
+
+def have_all_keras_metadata(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = nest.flatten(iterable_or_element)
+  return all(hasattr(x, '_keras_history') for x in iterable)
+
+
+def generate_placeholders_from_shape(shape):
+  return array_ops.placeholder(shape=shape, dtype=backend.floatx())
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index 050602868a16d282d2ee9707678dbfaf00d684dc..32129afe64761048ed219a4e0caaae19292b9bc4 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -22,15 +22,17 @@ import numpy as np
 from tensorflow.python.client import session as session_module
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
 
@@ -52,14 +54,18 @@ def set_weights(distribution_strategy, dist_model, weights):
     num_param = len(layer.weights)
     layer_weights = weights[:num_param]
     for sw, w in zip(layer.weights, layer_weights):
-      assign_ops.append(distribution_strategy.unwrap(sw.assign(w)))
-
+      if ops.executing_eagerly_outside_functions():
+        sw.assign(w)
+      else:
+        assign_ops.append(distribution_strategy.unwrap(sw.assign(w)))
     weights = weights[num_param:]
-  K.get_session().run(assign_ops)
+
+  if not ops.executing_eagerly_outside_functions():
+    K.get_session().run(assign_ops)
 
 
 def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
-                  grouped_updates, grouped_session_args,
+                  grouped_updates=None, grouped_session_args=None,
                   with_loss_tensor=False):
   """Unwrap and return the list of values contained in the PerDevice parameters.
 
@@ -92,11 +98,8 @@ def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
                                         grouped_inputs)
   if with_loss_tensor:
     # reduce loss tensor before adding it to the list of fetches
-    loss = distribution_strategy.unwrap(
-        distribution_strategy.reduce(distribute_lib.get_loss_reduction(),
-                                     grouped_outputs[0],
-                                     destinations='/device:CPU:0'))[0]
-
+    loss = distribution_strategy.reduce(distribute_lib.get_loss_reduction(),
+                                        grouped_outputs[0])
     all_outputs = flatten_perdevice_values(distribution_strategy,
                                            grouped_outputs[1:])
     all_outputs = [loss] + all_outputs
@@ -104,20 +107,25 @@ def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
     all_outputs = flatten_perdevice_values(distribution_strategy,
                                            grouped_outputs)
 
-  all_updates = flatten_perdevice_values(distribution_strategy,
-                                         grouped_updates)
+  if grouped_updates:
+    all_updates = flatten_perdevice_values(distribution_strategy,
+                                           grouped_updates)
+  else:
+    all_updates = None
 
   all_session_args = {}
-  grouped_feed_dict = grouped_session_args.get('feed_dict')
-  if grouped_feed_dict:
-    all_session_args['feed_dict'] = flatten_perdevice_values(
-        distribution_strategy, grouped_feed_dict)
-
-  grouped_fetches = grouped_session_args.get('fetches')
-  if grouped_fetches:
-    all_session_args['fetches'] = flatten_perdevice_values(
-        distribution_strategy, grouped_fetches)
-
+  if grouped_session_args:
+    grouped_feed_dict = grouped_session_args.get('feed_dict')
+    if grouped_feed_dict:
+      all_session_args['feed_dict'] = flatten_perdevice_values(
+          distribution_strategy, grouped_feed_dict)
+
+    grouped_fetches = grouped_session_args.get('fetches')
+    if grouped_fetches:
+      all_session_args['fetches'] = flatten_perdevice_values(
+          distribution_strategy, grouped_fetches)
+
+  # TODO(priyag): Return only non empty/None values
   return all_inputs, all_outputs, all_updates, all_session_args
 
 
@@ -144,11 +152,14 @@ def flatten_perdevice_values(distribution_strategy, perdevice_values):
           for e in distribution_strategy.unwrap(flattened)]
 
 
-def validate_callbacks(input_callbacks):
+def validate_callbacks(input_callbacks, optimizer, current_strategy):
   """Validate whether given callbacks are supported by DistributionStrategy.
 
   Args:
     input_callbacks: List of callbacks passed by the user to fit.
+    optimizer: Optimizer instance used to train the model.
+    current_strategy: The DistributionStrategy used to distribute training
+      and validation.
 
   Raises:
     ValueError: If `LearningRateScheduler` or `ReduceLROnPlateau` is one of the
@@ -170,26 +181,41 @@ def validate_callbacks(input_callbacks):
                         'these attributes are not set. You can access each of '
                         'the individual distributed models using the '
                         '`_grouped_model` attribute of your original model.')
-      if isinstance(callback, callbacks.LearningRateScheduler):
-        raise ValueError('LearningRateScheduler callback is not supported with '
-                         'DistributionStrategy.')
-      if isinstance(callback, callbacks.ReduceLROnPlateau):
-        raise ValueError('ReduceLROnPlateau callback is not supported with '
-                         'DistributionStrategy.')
+      if isinstance(callback, (callbacks.LearningRateScheduler,
+                               callbacks.ReduceLROnPlateau)):
+        strategy_name = current_strategy.__class__.__name__
+        # TODO(anjalisridhar): We might need to add a condition for multi
+        # worker strategy when we support it in Keras.
+        if is_tpu_strategy(current_strategy):
+          raise ValueError('%s callback is not supported with %s.' %
+                           (callback, strategy_name))
+
+        if not isinstance(optimizer, optimizer_v2.OptimizerV2):
+          raise ValueError('You must specify a Keras Optimizer V2 when using '
+                           '%s callback with DistributionStrategy.' % callback)
 
       # If users want to use the TensorBoard callback they cannot use certain
       # features of the callback that involve accessing model attributes and
       # running ops.
       if isinstance(callback, callbacks.TensorBoard):
         if callback.__getattribute__('histogram_freq'):
-          raise ValueError('histogram_freq in the TensorBoard callback is not '
-                           'supported when using DistributionStrategy.')
+          logging.warning(
+              UserWarning(
+                  '`histogram_freq` in the TensorBoard callback is not '
+                  'supported when using DistributionStrategy. Setting '
+                  '`histogram_freq` to `0`.'))
+          callback.histogram_freq = 0
         if callback.__getattribute__('write_grads'):
-          raise ValueError('write_grads in the TensorBoard callback is not '
-                           'supported when using DistributionStrategy.')
+          logging.warning(
+              UserWarning(
+                  '`write_grads` in the TensorBoard callback is not supported '
+                  'when using DistributionStrategy. Setting `write_grads` '
+                  'to `False`.'))
+          callback.histogram_freq = False
 
 
-def validate_distributed_dataset_inputs(distribution_strategy, x, y):
+def validate_distributed_dataset_inputs(distribution_strategy, x, y,
+                                        sample_weights=None):
   """Validate all the components of a DistributedValue Dataset input.
 
   Args:
@@ -203,6 +229,9 @@ def validate_distributed_dataset_inputs(distribution_strategy, x, y):
         `MirroredStrategy` this is a PerDevice object with a tensor for each
         device set in the dict. y can also be a tuple or dict. The keys of the
         dict should match the names of the output layers of the model.
+    sample_weights: Sample weights Dataset DistributedValue object. For example,
+        when we use `MirroredStrategy` this is a PerDevice object with a tensor
+        for each device set in the dict.
 
   Returns:
     The unwrapped values list of the x and y DistributedValues inputs.
@@ -225,8 +254,14 @@ def validate_distributed_dataset_inputs(distribution_strategy, x, y):
   else:
     y_values_list = None
 
+  if sample_weights is not None:
+    sample_weights_list = validate_per_device_inputs(distribution_strategy,
+                                                     sample_weights)
+  else:
+    sample_weights_list = None
+
   # Return the unwrapped values to avoid calling `unwrap` a second time.
-  return x_values_list, y_values_list
+  return x_values_list, y_values_list, sample_weights_list
 
 
 def validate_per_device_inputs(distribution_strategy, x):
@@ -283,19 +318,64 @@ def validate_all_tensor_shapes(x, x_values):
                        ' inputs {}'.format(x))
 
 
+def _wait_for_variable_initialization(session):
+  """Utility to wait for variables to be initialized."""
+  all_variables = K._get_variables(K.get_graph())  # pylint: disable=protected-access
+  candidate_vars = []
+  for v in all_variables:
+    if not getattr(v, '_keras_initialized', False):
+      candidate_vars.append(v)
+
+  if not candidate_vars:
+    return
+
+  while True:
+    is_initialized = session.run(
+        [variables.is_variable_initialized(v) for v in candidate_vars])
+    uninitialized_vars = []
+    for flag, v in zip(is_initialized, candidate_vars):
+      if not flag:
+        uninitialized_vars.append(v)
+      v._keras_initialized = True  # pylint: disable=protected-access
+    if not uninitialized_vars:
+      break
+
+
+def init_restore_or_wait_for_variables():
+  """Initialize or restore variables or wait for variables to be initialized."""
+  session = K._get_session()  # pylint: disable=protected-access
+  worker_context = dc_context.get_current_worker_context()
+  if not worker_context or worker_context.experimental_should_init:
+    # TODO(yuefengz): if checkpoints exit, restore from checkpoint.
+    K._initialize_variables(session)  # pylint: disable=protected-access
+  else:
+    _wait_for_variable_initialization(session)
+
+
 def configure_and_create_session(distribution_strategy):
   """Configure session config and create a session with it."""
   # TODO(priyag): Throw error if a session already exists.
   session_config = K.get_default_session_config()
-  distribution_strategy.configure(session_config)
 
-  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
-    # TODO(priyag): Remove this workaround when Distributed Coordinator is
-    # integrated with keras and we can create a session from there.
-    master = distribution_strategy._tpu_cluster_resolver.master()  # pylint: disable=protected-access
+  if is_tpu_strategy(distribution_strategy):
+    # TODO(priyag, yuefengz): Remove this workaround when Distribute
+    # Coordinator is integrated with keras and we can create a session from
+    # there.
+    distribution_strategy.configure(session_config)
+    master = distribution_strategy.extended._tpu_cluster_resolver.master()  # pylint: disable=protected-access
     session = session_module.Session(config=session_config, target=master)
   else:
-    session = session_module.Session(config=session_config)
+    worker_context = dc_context.get_current_worker_context()
+    if worker_context:
+      dc_session_config = worker_context.session_config
+      # Merge the default session config to the one from distribute coordinator,
+      # which is fine for now since they don't have conflicting configurations.
+      dc_session_config.MergeFrom(session_config)
+      session = session_module.Session(
+          config=dc_session_config, target=worker_context.master_target)
+    else:
+      distribution_strategy.configure(session_config)
+      session = session_module.Session(config=session_config)
 
   K.set_session(session)
 
@@ -324,11 +404,15 @@ def validate_inputs(x, y, distribution_strategy):
                      'Iterator. You must pass a `tf.data.Dataset` object or a '
                      'numpy array as input.')
 
-  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
+  if is_tpu_strategy(distribution_strategy):
     for i in [x, y]:
-      if isinstance(i, dataset_ops.Dataset):
+      if isinstance(i, dataset_ops.DatasetV2):
         shapes = nest.flatten(i.output_shapes)
-        if any([not s.is_fully_defined() for s in shapes]):
+        try:
+          s = next(s for s in shapes if not s.is_fully_defined())
+        except StopIteration:
+          continue
+        else:
           raise ValueError(
               'Using TPUs currently requires fully defined shapes. Either use '
               'set_shape() on the input tensors or use '
@@ -336,40 +420,97 @@ def validate_inputs(x, y, distribution_strategy):
               'Found unknown shape {} in input {}.'.format(s, i))
 
 
-def get_input_batch_params(first_x_value, batch_size, distribution_strategy):
+# TODO(b/118776054): Currently we support global batch size for TPUStrategy and
+# core MirroredStrategy only. Remove this check when contrib MirroredStrategy is
+# no longer needed.
+def global_batch_size_supported(distribution_strategy):
+  return distribution_strategy.extended._global_batch_size  # pylint: disable=protected-access
+
+
+# TODO(sourabhbajaj): Remove this once we use the same API for all strategies.
+def is_tpu_strategy(strategy):
+  """We're executing TPU Strategy."""
+  return strategy is not None and strategy.__class__.__name__ == 'TPUStrategy'
+
+
+def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
+                     is_training=False):
   """Calculate the number of batches and steps/steps_per_epoch.
 
   Args:
+    distribution_strategy: The DistributionStrategy used to compile the model.
     first_x_value: This is the first input numpy array that is passed in as the
       model input.
-    batch_size: The specified batch_size or the default batch_size of 32.
-    distribution_strategy: The current DistributionStrategy used to compile the
-      model.
+    steps:  The specified number of steps.
+    batch_size: The specified batch_size.
+    is_training: Boolean to relax the constraints on consuming all the training
+      samples to keep compatibility till we support partial batches.
 
   Returns:
-    The steps or steps_per_epoch argument depending on if a user is
-    calling `fit`, `evaluate` or `predict`.
+    steps: The steps or steps_per_epoch argument depending on if a user is
+        calling `fit`, `evaluate` or `predict`. If the is_training flag is set
+        we don't require the number of samples to be used completely.
+    batch_size: The batch size to be used in model iterations.
 
   Raises:
     ValueError: If the number of batches or steps evaluates to 0.
 
   """
-  num_batches = first_x_value.shape[0] // batch_size
-  if not num_batches:
-    raise ValueError('Please specify a batch_size that is smaller than'
-                     'the number of input samples %d.' % first_x_value.shape[0])
-  # TODO(anjalisridhar): TPU currently supports using the num_towers property.
-  # We might want to look into implementing worker_devices. In multi worker
-  # strategy, perhaps num_towers works better?
-  steps = num_batches // distribution_strategy.num_towers
-  if not steps:
-    # TODO(anjalisridhar): Number of towers in the error message may not convey
-    # what we want to the user. Is there another terminology that we can use
-    # that is consistent across different strategies.
-    raise ValueError('The number of batches %d is smaller than the number '
-                     'of towers %d used for DistributionStrategy. ' %
-                     (num_batches, distribution_strategy.num_towers))
-  return steps
+  num_samples = first_x_value.shape[0]
+  # TODO(b/118776054): Use global batch size for Keras/DS support.
+  # Currently this is only supported in TPUStrategy and CoreMirroredStrategy.
+  use_per_replica_batch = not global_batch_size_supported(
+      distribution_strategy)
+
+  if steps is None:
+    if batch_size is None:
+      # If neither the batch size or number of steps are set. We choose the
+      # global batch size as the minimum of number of samples and 32. 32 is
+      # chosen to provide backward compatibility.
+      global_batch_size = min(num_samples, 32)
+    else:
+      # If the user provided the batch size we need to handle the case
+      # between different strategies that use the global/per-replica batch size
+      global_batch_size = batch_size
+      if use_per_replica_batch:
+        global_batch_size *= distribution_strategy.num_replicas_in_sync
+    if not is_training and num_samples % global_batch_size:
+      raise ValueError('The number of samples %s is not divisible by '
+                       'batch size %s.' % (num_samples, global_batch_size))
+    steps = num_samples // global_batch_size
+  else:
+    if batch_size is None:
+      # We calculate the batch size based on the number of steps specified
+      if num_samples % steps:
+        raise ValueError('The number of samples %s is not divisible by '
+                         'steps %s. Please change the number of steps to a '
+                         'value that can consume all the samples' % (
+                             num_samples, steps))
+      global_batch_size = num_samples // steps
+    else:
+      # If the user provided the batch size we need to handle the case
+      # between different strategies that use the global/per-replica batch size
+      global_batch_size = batch_size
+      if use_per_replica_batch:
+        global_batch_size *= distribution_strategy.num_replicas_in_sync
+
+      if num_samples < (global_batch_size * steps):
+        raise ValueError('Number of samples %s is less than samples required '
+                         'for specified batch_size %s and steps %s' % (
+                             num_samples, global_batch_size, steps))
+
+  # We need to return the per replica or global batch size based on the strategy
+  if use_per_replica_batch:
+    if global_batch_size % distribution_strategy.num_replicas_in_sync:
+      raise ValueError(
+          'The batch size (%s) could not be sharded evenly across the sync '
+          'replicas (%s) in the distribution strategy.' % (
+              global_batch_size, distribution_strategy.num_replicas_in_sync))
+    batch_size = global_batch_size // distribution_strategy.num_replicas_in_sync
+  else:
+    batch_size = global_batch_size
+
+  return steps, batch_size
 
 
 def get_batch_dimension(iterator):
@@ -395,12 +536,12 @@ def get_cpu_device(distribution_strategy):
     NotImplementedError: We currently don't support copying numpy data to
     multiple hosts in the case of Cloud TPU pods.
   """
-  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
-    if distribution_strategy.num_hosts > 1:
+  if is_tpu_strategy(distribution_strategy):
+    if distribution_strategy.extended.num_hosts > 1:
       raise NotImplementedError('TPUDistributionStrategy does not '
                                 'support numpy inputs when running on Cloud'
                                 'TPU pods.')
-    return distribution_strategy.get_host_cpu_device(0)
+    return distribution_strategy.extended.get_host_cpu_device(0)
   else:
     # For all strategies except TPUDistributionStrategy
     # TODO(anjalisridhar): We may need to modify this when we add support for
@@ -459,7 +600,7 @@ def _get_var_for_numpy(distribution_strategy, input_array):
                                 input_var.dtype.size
 
   # Calculate number of elements we want to copy per slice.
-  batch_size_per_slice = np.ceil((64 << 20) / byte_size_per_batch_element)
+  batch_size_per_slice = int(np.ceil((64 << 20) / byte_size_per_batch_element))
 
   # Copy slices of the above size starting at 0, except the last slice will be
   # smaller.
diff --git a/tensorflow/python/keras/engine/feature_columns_integration_test.py b/tensorflow/python/keras/engine/feature_columns_integration_test.py
index e0478ee357b7a5e93d73be2c939930172b5943f7..b3f8cfe72585188d631c072b690729054d5db775 100644
--- a/tensorflow/python/keras/engine/feature_columns_integration_test.py
+++ b/tensorflow/python/keras/engine/feature_columns_integration_test.py
@@ -22,9 +22,10 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.feature_column import feature_column_v2 as fc
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import rmsprop
 
@@ -33,7 +34,7 @@ class TestDNNModel(keras.models.Model):
 
   def __init__(self, feature_columns, units, name=None, **kwargs):
     super(TestDNNModel, self).__init__(name=name, **kwargs)
-    self._input_layer = fc.FeatureLayer(feature_columns, name='input_layer')
+    self._input_layer = fc.DenseFeatures(feature_columns, name='input_layer')
     self._dense_layer = keras.layers.Dense(units, name='dense_layer')
 
   def call(self, features):
@@ -42,23 +43,24 @@ class TestDNNModel(keras.models.Model):
     return net
 
 
-class FeatureColumnsIntegrationTest(test.TestCase):
+class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
 
   """
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_model(self):
     columns = [fc.numeric_column('a')]
     model = keras.models.Sequential([
-        fc.FeatureLayer(columns),
+        fc.DenseFeatures(columns),
         keras.layers.Dense(64, activation='relu'),
         keras.layers.Dense(20, activation='softmax')
     ])
     model.compile(
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = {'a': np.random.random((10, 1))}
     y = np.random.randint(20, size=(10, 1))
@@ -68,18 +70,19 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     model.evaluate(x, y, batch_size=5)
     model.predict(x, batch_size=5)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_model_with_ds_input(self):
     columns = [fc.numeric_column('a')]
     model = keras.models.Sequential([
-        fc.FeatureLayer(columns),
+        fc.DenseFeatures(columns),
         keras.layers.Dense(64, activation='relu'),
         keras.layers.Dense(20, activation='softmax')
     ])
     model.compile(
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     y = np.random.randint(20, size=(100, 1))
     y = keras.utils.to_categorical(y, num_classes=20)
@@ -92,7 +95,7 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     model.evaluate(ds, steps=1)
     model.predict(ds, steps=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_subclassed_model_with_feature_columns(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
@@ -102,7 +105,8 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.compile(
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = {'a': np.random.random((10, 1)), 'b': np.random.random((10, 1))}
     y = np.random.randint(20, size=(10, 1))
@@ -112,7 +116,7 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.evaluate(x=x, y=y, batch_size=5)
     dnn_model.predict(x=x, batch_size=5)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_subclassed_model_with_feature_columns_with_ds_input(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
@@ -122,7 +126,8 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.compile(
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     y = np.random.randint(20, size=(100, 1))
     y = keras.utils.to_categorical(y, num_classes=20)
@@ -135,15 +140,16 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.evaluate(ds, steps=1)
     dnn_model.predict(ds, steps=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  # TODO(kaftan) seems to throw an error when enabled.
+  @keras_parameterized.run_all_keras_modes
   def DISABLED_test_function_model_feature_layer_input(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
 
-    feature_layer = fc.FeatureLayer([col_a, col_b], name='fc')
+    feature_layer = fc.DenseFeatures([col_a, col_b], name='fc')
     dense = keras.layers.Dense(4)
 
-    # This seems problematic.... We probably need something for FeatureLayer
+    # This seems problematic.... We probably need something for DenseFeatures
     # the way Input is for InputLayer.
     output = dense(feature_layer)
 
@@ -161,17 +167,18 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     data = ({'a': np.arange(10), 'b': np.arange(10)}, np.arange(10, 20))
     print(model.fit(*data, epochs=1))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  # TODO(kaftan) seems to throw an error when enabled.
+  @keras_parameterized.run_all_keras_modes
   def DISABLED_test_function_model_multiple_feature_layer_inputs(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
     col_c = fc.numeric_column('c')
 
-    fc1 = fc.FeatureLayer([col_a, col_b], name='fc1')
-    fc2 = fc.FeatureLayer([col_b, col_c], name='fc2')
+    fc1 = fc.DenseFeatures([col_a, col_b], name='fc1')
+    fc2 = fc.DenseFeatures([col_b, col_c], name='fc2')
     dense = keras.layers.Dense(4)
 
-    # This seems problematic.... We probably need something for FeatureLayer
+    # This seems problematic.... We probably need something for DenseFeatures
     # the way Input is for InputLayer.
     output = dense(fc1) + dense(fc2)
 
diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
index 6a69d0ed901511627609f9e6f7e91165df84830b..9874efe2bccd5e2db370ed54089424063afe88b5 100644
--- a/tensorflow/python/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -19,11 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
-from tensorflow.python.ops import array_ops
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -71,13 +70,13 @@ class InputLayer(base_layer.Layer):
 
     if not name:
       prefix = 'input'
-      name = prefix + '_' + str(K.get_uid(prefix))
+      name = prefix + '_' + str(backend.get_uid(prefix))
 
     if not dtype:
       if input_tensor is None:
-        dtype = K.floatx()
+        dtype = backend.floatx()
       else:
-        dtype = K.dtype(input_tensor)
+        dtype = backend.dtype(input_tensor)
     super(InputLayer, self).__init__(dtype=dtype, name=name)
     self.built = True
     self.sparse = sparse
@@ -92,39 +91,31 @@ class InputLayer(base_layer.Layer):
         batch_input_shape = (batch_size,) + tuple(input_shape)
       else:
         batch_input_shape = None
-
-      if context.executing_eagerly():
-        # In eager mode, create a temporary placeholder to call the layer on.
-        input_tensor = base_layer.DeferredTensor(  # pylint: disable=protected-access
-            shape=batch_input_shape,
-            dtype=dtype,
-            name=self.name)
-      else:
+      graph = backend.get_graph()
+      with graph.as_default():
         # In graph mode, create a graph placeholder to call the layer on.
         if sparse:
-          input_tensor = array_ops.sparse_placeholder(
+          input_tensor = backend.placeholder(
               shape=batch_input_shape,
               dtype=dtype,
-              name=self.name)
+              name=self.name,
+              sparse=True)
         else:
-          input_tensor = array_ops.placeholder(
+          input_tensor = backend.placeholder(
               shape=batch_input_shape,
               dtype=dtype,
               name=self.name)
 
-      # For compatibility with Keras API.
       self.is_placeholder = True
       self._batch_input_shape = batch_input_shape
     else:
-      # For compatibility with Keras API.
-      self.is_placeholder = False
-      self._batch_input_shape = tuple(input_tensor.get_shape().as_list())
-
-      if context.executing_eagerly():
-        raise ValueError('You should not pass an input tensor when executing '
-                         'in eager mode. For example, instead of creating an '
+      if not tf_utils.is_symbolic_tensor(input_tensor):
+        raise ValueError('You should not pass an EagerTensor to `Input`. '
+                         'For example, instead of creating an '
                          'InputLayer, you should instantiate your model and '
                          'directly call it on your input.')
+      self.is_placeholder = False
+      self._batch_input_shape = tuple(input_tensor.get_shape().as_list())
 
     # Create an input node to add to self.outbound_node
     # and set output_tensors' _keras_history.
@@ -200,6 +191,16 @@ def Input(  # pylint: disable=invalid-name
       model = Model(x, y)
       ```
 
+      Note that even if eager execution is enabled,
+      `Input` produces a symbolic tensor (i.e. a placeholder).
+      This symbolic tensor can be used with other
+      TensorFlow ops, as such:
+
+      ```python
+      x = Input(shape=(32,))
+      y = tf.square(x)
+      ```
+
   Raises:
     ValueError: in case of invalid arguments.
   """
@@ -215,7 +216,7 @@ def Input(  # pylint: disable=invalid-name
     raise ValueError('Unrecognized keyword arguments:', kwargs.keys())
 
   if dtype is None:
-    dtype = K.floatx()
+    dtype = backend.floatx()
   if shape is None and tensor is None:
     raise ValueError('Please provide to Input either a `shape`'
                      ' or a `tensor` argument. Note that '
diff --git a/tensorflow/python/keras/engine/input_spec.py b/tensorflow/python/keras/engine/input_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..7277c16fe51197af3bf0e045814ccc29f7feaf7c
--- /dev/null
+++ b/tensorflow/python/keras/engine/input_spec.py
@@ -0,0 +1,170 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Contains the InputSpec class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('keras.layers.InputSpec',
+           v1=['keras.layers.InputSpec', 'layers.InputSpec'])
+class InputSpec(object):
+  """Specifies the ndim, dtype and shape of every input to a layer.
+
+  Every layer should expose (if appropriate) an `input_spec` attribute:
+  a list of instances of InputSpec (one per input tensor).
+
+  A None entry in a shape is compatible with any dimension,
+  a None shape is compatible with any shape.
+
+  Arguments:
+      dtype: Expected DataType of the input.
+      shape: Shape tuple, expected shape of the input
+          (may include None for unchecked axes).
+      ndim: Integer, expected rank of the input.
+      max_ndim: Integer, maximum rank of the input.
+      min_ndim: Integer, minimum rank of the input.
+      axes: Dictionary mapping integer axes to
+          a specific dimension value.
+  """
+
+  def __init__(self,
+               dtype=None,
+               shape=None,
+               ndim=None,
+               max_ndim=None,
+               min_ndim=None,
+               axes=None):
+    self.dtype = dtype
+    self.shape = shape
+    if shape is not None:
+      self.ndim = len(shape)
+    else:
+      self.ndim = ndim
+    self.max_ndim = max_ndim
+    self.min_ndim = min_ndim
+    self.axes = axes or {}
+
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
+
+def assert_input_compatibility(input_spec, inputs, layer_name):
+  """Checks compatibility between the layer and provided inputs.
+
+  This checks that the tensor(s) `inputs` verify the input assumptions
+  of a layer (if any). If not, a clear and actional exception gets raised.
+
+  Arguments:
+      input_spec: An InputSpec instance, or None.
+      inputs: Input tensor or list of input tensors.
+      layer_name: String, name of the layer (for error message formatting).
+
+  Raises:
+      ValueError: in case of mismatch between
+          the provided inputs and the expectations of the layer.
+  """
+  if not input_spec:
+    return
+  if not isinstance(input_spec, (list, tuple)):
+    input_spec = nest.flatten(input_spec)
+
+  inputs = nest.flatten(inputs)
+  if len(inputs) != len(input_spec):
+    raise ValueError('Layer ' + layer_name + ' expects ' +
+                     str(len(input_spec)) + ' inputs, '
+                     'but it received ' + str(len(inputs)) +
+                     ' input tensors. Inputs received: ' + str(inputs))
+  for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
+    if spec is None:
+      continue
+
+    if (spec.ndim is not None or
+        spec.min_ndim is not None or
+        spec.max_ndim is not None):
+      if x.shape.ndims is None:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'its rank is undefined, but the layer requires a '
+                         'defined rank.')
+
+    # Check ndim.
+    if spec.ndim is not None:
+      ndim = x.shape.ndims
+      if ndim != spec.ndim:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'expected ndim=' + str(spec.ndim) + ', found ndim=' +
+                         str(ndim) + '. Full shape received: ' +
+                         str(x.shape.as_list()))
+    if spec.max_ndim is not None:
+      ndim = x.shape.ndims
+      if ndim is not None and ndim > spec.max_ndim:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'expected max_ndim=' + str(spec.max_ndim) +
+                         ', found ndim=' + str(ndim))
+    if spec.min_ndim is not None:
+      ndim = x.shape.ndims
+      if ndim is not None and ndim < spec.min_ndim:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         ': expected min_ndim=' + str(spec.min_ndim) +
+                         ', found ndim=' + str(ndim) +
+                         '. Full shape received: ' +
+                         str(x.shape.as_list()))
+    # Check dtype.
+    if spec.dtype is not None:
+      if x.dtype != spec.dtype:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'expected dtype=' + str(spec.dtype) +
+                         ', found dtype=' + str(x.dtype))
+    # Check specific shape axes.
+    if spec.axes:
+      shape = x.shape.as_list()
+      if shape is not None:
+        for axis, value in spec.axes.items():
+          if hasattr(value, 'value'):
+            value = value.value
+          if value is not None and shape[int(axis)] not in {value, None}:
+            raise ValueError(
+                'Input ' + str(input_index) + ' of layer ' + layer_name + ' is'
+                ' incompatible with the layer: expected axis ' + str(axis) +
+                ' of input shape to have value ' + str(value) +
+                ' but received input with shape ' + str(shape))
+    # Check shape.
+    if spec.shape is not None:
+      shape = x.shape.as_list()
+      if shape is not None:
+        for spec_dim, dim in zip(spec.shape, shape):
+          if spec_dim is not None and dim is not None:
+            if spec_dim != dim:
+              raise ValueError('Input ' + str(input_index) +
+                               ' is incompatible with layer ' + layer_name +
+                               ': expected shape=' + str(spec.shape) +
+                               ', found shape=' + str(shape))
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 918488bd7a460c3318eee574b59a4e9e32254b5f..7e6cc7bfeef97f9ad567aed82757a0a18e8c06be 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -29,14 +29,16 @@ from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.engine import saving
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.keras.utils import tf_utils
@@ -112,11 +114,6 @@ class Network(base_layer.Layer):
     self.trainable = True
     self._is_compiled = False
     self._expects_training_arg = False
-    # A list of "extra" variables assigned to attributes of this class, included
-    # in self.weights and self.variables. Always empty for graph networks (but
-    # included in base_init to avoid excessive special casing when retrieving
-    # the value).
-    self._extra_variables = []
     # In many internal cases one needs to compute both the model's output
     # and its output mask without relying on `__call__` (which would do both and
     # set mask metadata), but for models, computing the mask requires to
@@ -134,10 +131,19 @@ class Network(base_layer.Layer):
       self.optimizer = None
 
     # Private attributes to implement compatibility with Layer.
+    self._trainable_weights = []
+    self._non_trainable_weights = []
     self._updates = []  # Used in symbolic mode only.
-    self._losses = []   # Used in symbolic mode only.
+    self._losses = []
+    self._eager_losses = []
+    # A list of metric instances corresponding to the symbolic metric tensors
+    # added using the `add_metric` API.
+    self._metrics = []
+    # A dictionary that maps metric names to metric result tensors.
+    self._metrics_tensors = {}
     self._scope = None  # Never used.
     self._reuse = None  # Never used.
+    self._call_is_graph_friendly = True
     if context.executing_eagerly():
       self._graph = None
     else:
@@ -158,7 +164,8 @@ class Network(base_layer.Layer):
 
   @checkpointable.no_automatic_dependency_tracking
   def _init_graph_network(self, inputs, outputs, name=None):
-    self._call_convention = base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT
+    self._call_convention = (base_layer_utils
+                             .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
     # Normalize and set self.inputs, self.outputs.
     if isinstance(inputs, (list, tuple)):
       self.inputs = list(inputs)  # Tensor or list of tensors.
@@ -168,60 +175,7 @@ class Network(base_layer.Layer):
       self.outputs = list(outputs)
     else:
       self.outputs = [outputs]
-
-    # User-provided argument validation.
-    if context.executing_eagerly():
-      # Check that all inputs/outputs are DeferredTensors.
-      for tensor in self.inputs:
-        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
-          raise TypeError('When eager execution is enabled, '
-                          'inputs must come from a call to '
-                          '`tf.keras.Input` (called after '
-                          'tf.enable_eager_execution()). '
-                          'Received invalid input: ' + str(tensor))
-      for tensor in self.outputs:
-        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
-          raise TypeError('When eager execution is enabled, '
-                          'outputs must come from a call to '
-                          'a layer (called after '
-                          'tf.enable_eager_execution()). '
-                          'Received invalid output: ' + str(tensor))
-    # Check for redundancy in inputs.
-    if len(set(self.inputs)) != len(self.inputs):
-      raise ValueError('The list of inputs passed to the model '
-                       'is redundant. '
-                       'All inputs should only appear once.'
-                       ' Found: ' + str(self.inputs))
-    for x in self.inputs:
-      # Check that x has appropriate `_keras_history` metadata.
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Input tensors to a ' + cls_name + ' ' +
-                         'must come from `tf.layers.Input`. '
-                         'Received: ' + str(x) +
-                         ' (missing previous layer metadata).')
-      # Check that x is an input tensor.
-      # pylint: disable=protected-access
-      layer, node_index, tensor_index = x._keras_history
-      if len(layer._inbound_nodes) > 1 or (
-          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
-        cls_name = self.__class__.__name__
-        logging.warning(cls_name + ' inputs must come from '
-                        '`tf.layers.Input` (thus holding past layer metadata), '
-                        'they cannot be the output of '
-                        'a previous non-Input layer. '
-                        'Here, a tensor specified as '
-                        'input to "' + self.name + '" was not an Input tensor, '
-                        'it was generated by layer ' + layer.name + '.\n'
-                        'Note that input tensors are '
-                        'instantiated via `tensor = tf.layers.Input(shape)`.\n'
-                        'The tensor that caused the issue was: ' + str(x.name))
-    for x in self.outputs:
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Output tensors to a ' + cls_name + ' must be '
-                         'the output of a TensorFlow `Layer` '
-                         '(thus holding past layer metadata). Found: ' + str(x))
+    self._validate_graph_inputs_and_outputs()
 
     self._base_init(name=name)
     self._compute_previous_mask = (
@@ -293,9 +247,7 @@ class Network(base_layer.Layer):
       if layer.is_placeholder:
         self._feed_input_names.append(layer.name)
         self._feed_input_shapes.append(backend.int_shape(self.inputs[i]))
-        # layer.input gives an error in eager mode
-        if not context.executing_eagerly():
-          self._feed_inputs.append(layer.input)
+        self._feed_inputs.append(layer.input)
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
@@ -313,8 +265,14 @@ class Network(base_layer.Layer):
     self.inputs = []
     self.built = False
 
+  @property
+  def _static_graph_friendly(self):
+    if self._is_graph_network:
+      return all(layer._static_graph_friendly for layer in self.layers)
+    return self._call_is_graph_friendly
+
   def _determine_call_convention(self, call_argspec):
-    """Decides how `self.call()` is invoked. See base_layer.CallConvention."""
+    """Decides how `self.call()` is invoked. See `CallConvention`."""
     if call_argspec.varargs:
       may_take_single_argument = False
     else:
@@ -346,11 +304,11 @@ class Network(base_layer.Layer):
               "Model.call() takes a single positional argument (to which "
               "inputs are passed by convention) and a separate 'inputs' "
               "argument. Unable to determine which arguments are inputs.")
-        return base_layer.CallConvention.SINGLE_POSITIONAL_ARGUMENT
+        return base_layer_utils.CallConvention.SINGLE_POSITIONAL_ARGUMENT
     if 'inputs' in call_argspec.args:
-      return base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT
+      return base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT
     else:
-      return base_layer.CallConvention.POSITIONAL_ARGUMENTS_ARE_INPUTS
+      return base_layer_utils.CallConvention.POSITIONAL_ARGUMENTS_ARE_INPUTS
 
   def _track_layers(self, layers):
     """Add Checkpointable dependencies on a list of Layers."""
@@ -419,34 +377,26 @@ class Network(base_layer.Layer):
             # simply by assigning them to attributes.
           not self._is_graph_network
           and isinstance(value, variables.Variable)):
-        self._extra_variables.append(value)
+        if value.trainable:
+          # Could already be added via `add_weight`.
+          if value not in self._trainable_weights:
+            self._trainable_weights.append(value)
+        else:
+          if value not in self._non_trainable_weights:
+            self._non_trainable_weights.append(value)
+
+    # Keeping track of metric instance created in subclassed model/layer.
+    # We do this so that we can maintain the correct order of metrics by adding
+    # the instance to the `metrics` list as soon as it is created.
+    from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
+    if isinstance(value, metrics_module.Metric):
+      self._metrics.append(value)
     super(Network, self).__setattr__(name, value)
 
-  def add_variable(self, name, shape, dtype=None, initializer=None,
-                   regularizer=None, trainable=True, constraint=None):
-    if self._is_graph_network:
-      raise NotImplementedError('`add_variable` is not supported on Networks.')
-    else:
-      raise NotImplementedError(
-          '`add_variable` is not supported on Networks. However, you may '
-          'assign variables to attributes and they will show up in the weights '
-          'and variables properties.')
-
-  def add_loss(self, *args, **kwargs):
-    if context.executing_eagerly():
-      raise NotImplementedError('`add_loss` is not supported on Networks '
-                                'when eager execution is enabled.')
-    super(Network, self).add_loss(*args, **kwargs)
-
-  @property
-  def uses_learning_phase(self):
-    return any(
-        [getattr(x, '_uses_learning_phase', False) for x in self.outputs])
-
   @property
   def stateful(self):
-    return any([(hasattr(layer, 'stateful') and layer.stateful)
-                for layer in self.layers])
+    return any((hasattr(layer, 'stateful') and layer.stateful)
+               for layer in self.layers)
 
   def reset_states(self):
     for layer in self.layers:
@@ -551,19 +501,22 @@ class Network(base_layer.Layer):
 
   @property
   def _unfiltered_updates(self):
-    if context.executing_eagerly():
-      return []
     updates = []
     for layer in self.layers:
       if isinstance(layer, Network):
         updates += layer._unfiltered_updates
       else:
         updates += layer.updates
+    updates += self._updates
     return updates
 
   @property
   def _unfiltered_losses(self):
     losses = []
+    if context.executing_eagerly():
+      losses.extend(self._eager_losses)
+    else:
+      losses.extend(self._losses)
     for layer in self.layers:
       if isinstance(layer, Network):
         losses += layer._unfiltered_losses
@@ -571,6 +524,16 @@ class Network(base_layer.Layer):
         losses += layer.losses
     return losses
 
+  @checkpointable.no_automatic_dependency_tracking
+  def _clear_losses(self):
+    """Used every step in eager to reset losses."""
+    self._eager_losses = []
+    for layer in self.layers:
+      if isinstance(layer, Network):
+        layer._clear_losses()
+      else:
+        layer._eager_losses = []
+
   @property
   def updates(self):
     """Retrieves the network's updates.
@@ -621,9 +584,6 @@ class Network(base_layer.Layer):
     Returns:
         A list of update ops.
     """
-    if context.executing_eagerly():
-      return []
-
     if not self.trainable and not self.stateful:
       return []
 
@@ -639,7 +599,7 @@ class Network(base_layer.Layer):
       else:
         relevant_inputs.append(inputs)
     if not relevant_inputs:
-      return updates
+      return list(set(updates))
 
     reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, updates)
     relevant_conditional_updates = [x for x in updates if x in reachable]
@@ -647,8 +607,7 @@ class Network(base_layer.Layer):
         x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
     # A layer could be used multiple times in a nested structure,
     # so the updates list must be de-duped.
-    return list(set(
-        relevant_conditional_updates + unconditional_updates + self._updates))
+    return list(set(relevant_conditional_updates + unconditional_updates))
 
   @property
   def losses(self):
@@ -665,9 +624,27 @@ class Network(base_layer.Layer):
         A list of loss tensors.
     """
     losses = self._unfiltered_losses
+
     if context.executing_eagerly():
       return losses
 
+    # TODO(kaftan/fchollet): Clean this up / make it obsolete.
+    # This is a super ugly, confusing check necessary to
+    # handle the case where we are executing in a function graph in eager mode
+    # but the model was constructed symbolically in a separate graph scope.
+    # We need to capture the losses created in the current graph function,
+    # and filter out the incorrect loss tensors created when symbolically
+    # building the graph.
+    # We have to use this check because the code after it that checks
+    # for reachable inputs only captures the part of the model that was
+    # built symbolically, and captures the wrong tensors from a different
+    # func graph (causing a crash later on when trying to execute the
+    # graph function)
+    with ops.init_scope():
+      if context.executing_eagerly():
+        return [loss for loss in losses
+                if loss.graph == ops.get_default_graph()]
+
     relevant_inputs = []
     for i in range(0, len(self._inbound_nodes)):
       inputs = self.get_input_at(i)
@@ -690,14 +667,38 @@ class Network(base_layer.Layer):
     return checkpointable_layer_utils.gather_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
-        extra_variables=self._extra_variables)
+        extra_variables=self._trainable_weights)
 
   @property
   def non_trainable_weights(self):
     return checkpointable_layer_utils.gather_non_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
-        extra_variables=self._extra_variables)
+        extra_variables=self._non_trainable_weights + self._trainable_weights)
+
+  @property
+  def metrics(self):
+    """Returns the network's symbolic metrics.
+
+    Model overrides this function to include the metrics from `compile` API.
+    """
+    metrics = []
+    for layer in self.layers:
+      metrics += layer._metrics  # pylint: disable=protected-access
+    return metrics + self._metrics
+
+  @property
+  def _all_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    # TODO(psv): Remove this property.
+    metrics_tensors = {}
+    for layer in self.layers:
+      if isinstance(layer, Network):
+        metrics_tensors.update(layer._all_metrics_tensors)
+      else:
+        metrics_tensors.update(layer._metrics_tensors)
+    metrics_tensors.update(self._metrics_tensors)
+    return metrics_tensors
 
   @property
   def input_spec(self):
@@ -733,6 +734,11 @@ class Network(base_layer.Layer):
     This is to be used for subclassed models, which do not know at instantiation
     time what their inputs look like.
 
+    This method only exists for users who want to call `model.build()` in a
+    standalone way (as a substitute for calling the model on real data to
+    build it). It will never be called by the framework (and thus it will
+    never throw unexpected errors in an unrelated workflow).
+
     Args:
      input_shape: Single tuple, TensorShape, or list of shapes, where shapes
          are tuples, integers, or TensorShapes.
@@ -769,48 +775,53 @@ class Network(base_layer.Layer):
       # in a Graph. Since tf.Variable is compatible with both eager execution
       # and graph building, the variables created after building the model in
       # a Graph are still valid when executing eagerly.
-      with context.graph_mode():
-        graph = eager_function.FuncGraph('graph')
-        with graph.as_default():
-          if isinstance(input_shape, list):
-            x = [base_layer.generate_placeholders_from_shape(shape)
-                 for shape in input_shape]
+      if context.executing_eagerly():
+        graph = func_graph.FuncGraph('build_graph')
+      else:
+        graph = backend.get_graph()
+      with graph.as_default():
+        if isinstance(input_shape, list):
+          x = [base_layer_utils.generate_placeholders_from_shape(shape)
+               for shape in input_shape]
+        else:
+          x = base_layer_utils.generate_placeholders_from_shape(input_shape)
+
+        kwargs = {}
+        call_signature = tf_inspect.getfullargspec(self.call)
+        call_args = call_signature.args
+        # Exclude `self`, `inputs`, and any argument with a default value.
+        if len(call_args) > 2:
+          if call_signature.defaults:
+            call_args = call_args[2:-len(call_signature.defaults)]
           else:
-            x = base_layer.generate_placeholders_from_shape(input_shape)
-
-          kwargs = {}
-          num_call_args = len(tf_inspect.getfullargspec(self.call).args)
-          if self._expects_training_arg and num_call_args == 3:
-            # Has call signature of call(self, input, training)
-            kwargs['training'] = False
-          elif num_call_args > 2:
-            # Has invalid call signature of call(self, input, *args, **kwargs)
-            raise ValueError('Currently, you cannot build your model if it has '
-                             'positional or keyword arguments that are not '
-                             'inputs to the model, but are required for its '
-                             '`call` method. Instead, in order to instantiate '
-                             'and build your model, `call` your model on real '
-                             'tensor data with all expected call arguments.')
-
-          try:
-            self.call(x, **kwargs)
-          except (errors.InvalidArgumentError, TypeError):
-            raise ValueError('You cannot build your model by calling `build` '
-                             'if your layers do not support float type inputs. '
-                             'Instead, in order to instantiate and build your '
-                             'model, `call` your model on real tensor data (of '
-                             'the correct dtype).')
-
+            call_args = call_args[2:]
+          for arg in call_args:
+            if arg == 'training':
+              # Case where `training` is a positional arg with no default.
+              kwargs['training'] = False
+            else:
+              # Has invalid call signature with unknown positional arguments.
+              raise ValueError(
+                  'Currently, you cannot build your model if it has '
+                  'positional or keyword arguments that are not '
+                  'inputs to the model, but are required for its '
+                  '`call` method. Instead, in order to instantiate '
+                  'and build your model, `call` your model on real '
+                  'tensor data with all expected call arguments.')
+        elif len(call_args) < 2:
+          # Signature without `inputs`.
+          raise ValueError('You can only call `build` on a model if its `call` '
+                           'method accepts an `inputs` argument.')
+        try:
+          self.call(x, **kwargs)
+        except (errors.InvalidArgumentError, TypeError):
+          raise ValueError('You cannot build your model by calling `build` '
+                           'if your layers do not support float type inputs. '
+                           'Instead, in order to instantiate and build your '
+                           'model, `call` your model on real tensor data (of '
+                           'the correct dtype).')
     if self._layers:
       self._track_layers(self._layers)
-    if self.layers:
-      for layer in self.layers:
-        if not layer.built:
-          raise ValueError('Layer: {} was not built in your model. Calling '
-                           '`build` manually on a subclassed model is only '
-                           'allowed for models with a static topology. '
-                           'In this case, you can build your model by '
-                           'calling it on real tensor data.'.format(layer))
     self.built = True
 
   def call(self, inputs, training=None, mask=None):
@@ -857,9 +868,7 @@ class Network(base_layer.Layer):
 
   def compute_output_shape(self, input_shape):
     if not self._is_graph_network:
-      if context.executing_eagerly():
-        return super(Network, self).compute_output_shape(input_shape)
-      raise NotImplementedError
+      return super(Network, self).compute_output_shape(input_shape)
 
     if isinstance(input_shape, list):
       input_shapes = []
@@ -1077,11 +1086,8 @@ class Network(base_layer.Layer):
                   pass
 
               # Apply activity regularizer if any.
-              if layer.activity_regularizer is not None:
-                regularization_losses = [
-                    layer.activity_regularizer(x) for x in output_tensors
-                ]
-                layer.add_loss(regularization_losses, computed_tensors)
+              layer._handle_activity_regularization(computed_tensors,
+                                                    output_tensors)
 
           # Update tensor_map.
           for x, y, mask in zip(reference_output_tensors, output_tensors,
@@ -1641,18 +1647,76 @@ class Network(base_layer.Layer):
         ValueError: if `summary()` is called before the model is built.
     """
     if not self.built:
-      raise ValueError('This model has never been called, thus its weights '
-                       'have not yet been created, so no summary can be '
-                       'displayed. Build the model first '
-                       '(e.g. by calling it on some data).')
+      raise ValueError('This model has not yet been built. '
+                       'Build the model first by calling `build()` or calling '
+                       '`fit()` with some data, or specify '
+                       'an `input_shape` argument in the first layer(s) for '
+                       'automatic build.')
     layer_utils.print_summary(self,
                               line_length=line_length,
                               positions=positions,
                               print_fn=print_fn)
 
+  def _validate_graph_inputs_and_outputs(self):
+    """Validates the inputs and outputs of a Graph Network."""
+    # Check for redundancy in inputs.
+    if len(set(self.inputs)) != len(self.inputs):
+      raise ValueError('The list of inputs passed to the model '
+                       'is redundant. '
+                       'All inputs should only appear once.'
+                       ' Found: ' + str(self.inputs))
+
+    for x in self.inputs:
+      # Check that x has appropriate `_keras_history` metadata.
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Input tensors to a ' + cls_name + ' ' +
+                         'must come from `tf.keras.Input`. '
+                         'Received: ' + str(x) +
+                         ' (missing previous layer metadata).')
+      # Check that x is an input tensor.
+      # pylint: disable=protected-access
+      layer, _, _ = x._keras_history
+      if len(layer._inbound_nodes) > 1 or (
+          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
+        cls_name = self.__class__.__name__
+        logging.warning(cls_name + ' inputs must come from '
+                        '`tf.keras.Input` (thus holding past layer metadata), '
+                        'they cannot be the output of '
+                        'a previous non-Input layer. '
+                        'Here, a tensor specified as '
+                        'input to "' + self.name + '" was not an Input tensor, '
+                        'it was generated by layer ' + layer.name + '.\n'
+                        'Note that input tensors are '
+                        'instantiated via `tensor = tf.keras.Input(shape)`.\n'
+                        'The tensor that caused the issue was: ' + str(x.name))
+
+    # Check compatibility of batch sizes of Input Layers.
+    input_batch_sizes = [
+        training_utils.get_static_batch_size(x._keras_history[0])
+        for x in self.inputs
+    ]
+    consistent_batch_size = None
+    for batch_size in input_batch_sizes:
+      if batch_size is not None:
+        if (consistent_batch_size is not None and
+            batch_size != consistent_batch_size):
+          raise ValueError('The specified batch sizes of the Input Layers'
+                           ' are incompatible. Found batch sizes: {}'.format(
+                               input_batch_sizes))
+        consistent_batch_size = batch_size
+
+    for x in self.outputs:
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Output tensors to a ' + cls_name + ' must be '
+                         'the output of a TensorFlow `Layer` '
+                         '(thus holding past layer metadata). Found: ' + str(x))
+
 
 def _is_hdf5_filepath(filepath):
-  return filepath.endswith('.h5') or filepath.endswith('.keras')
+  return (filepath.endswith('.h5') or filepath.endswith('.keras') or
+          filepath.endswith('.hdf5'))
 
 
 def _make_node_key(layer_name, node_index):
diff --git a/tensorflow/python/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
index a2f31fda8fa20f1be6e04693f3d8619db4a69707..54d9e32fb258343dfd9b75351015959952893c1a 100644
--- a/tensorflow/python/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -79,6 +79,10 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
 
   from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
 
+  # TODO(psv) Add warning when we save models that contain non-serializable
+  # entities like metrics added using `add_metric` and losses added using
+  # `add_loss.`
+
   if not isinstance(filepath, h5py.File):
     # If file exists and should not be overwritten.
     if not overwrite and os.path.isfile(filepath):
@@ -126,8 +130,8 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
                     'config': model.optimizer.get_config()
                 },
                 'loss': model.loss,
-                'metrics': model.metrics,
-                'weighted_metrics': model.weighted_metrics,
+                'metrics': model._compile_metrics,
+                'weighted_metrics': model._compile_weighted_metrics,
                 'sample_weight_mode': model.sample_weight_mode,
                 'loss_weights': model.loss_weights,
             },
@@ -264,22 +268,32 @@ def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=
       # Set optimizer weights.
       if 'optimizer_weights' in f:
         # Build train function (to get weight updates).
-        model._make_train_function()
-        optimizer_weights_group = f['optimizer_weights']
-        optimizer_weight_names = [
-            n.decode('utf8')
-            for n in optimizer_weights_group.attrs['weight_names']
-        ]
-        optimizer_weight_values = [
-            optimizer_weights_group[n] for n in optimizer_weight_names
-        ]
-        try:
-          model.optimizer.set_weights(optimizer_weight_values)
-        except ValueError:
-          logging.warning('Error in loading the saved optimizer '
-                          'state. As a result, your model is '
-                          'starting with a freshly initialized '
-                          'optimizer.')
+        # Models that aren't graph networks must wait until they are called
+        # with data to _make_train_function() and so can't load optimizer
+        # weights.
+        if model._is_graph_network:  # pylint: disable=protected-access
+          model._make_train_function()
+          optimizer_weights_group = f['optimizer_weights']
+          optimizer_weight_names = [
+              n.decode('utf8')
+              for n in optimizer_weights_group.attrs['weight_names']
+          ]
+          optimizer_weight_values = [
+              optimizer_weights_group[n] for n in optimizer_weight_names
+          ]
+          try:
+            model.optimizer.set_weights(optimizer_weight_values)
+          except ValueError:
+            logging.warning('Error in loading the saved optimizer '
+                            'state. As a result, your model is '
+                            'starting with a freshly initialized '
+                            'optimizer.')
+        else:
+          logging.warning('Sequential models without an `input_shape` '
+                          'passed to the first layer cannot reload their '
+                          'optimizer state. As a result, your model is'
+                          'starting with a freshly initialized optimizer.')
+
   finally:
     if opened_new_file:
       f.close()
@@ -903,7 +917,7 @@ def save_attributes_to_hdf5_group(group, name, data):
   chunked_data = np.array_split(data_npy, num_chunks)
 
   # This will never loop forever thanks to the test above.
-  while any([x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data]):
+  while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
     num_chunks += 1
     chunked_data = np.array_split(data_npy, num_chunks)
 
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index f5045be907327aacf849900b2d27473173afdafa..bc33a3ea7f3ef38e9f94854043fe7bdc7a9bfe46 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import training
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -288,6 +289,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                                  r'element\(s\)\.'):
       saving.load_weights_from_hdf5_group_by_name(f_model, model.layers)
 
+  @test_util.run_deprecated_v1
   def test_sequential_weight_loading_group_name_with_incorrect_shape(self):
     if h5py is None:
       return
@@ -330,6 +332,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
 
 class TestWholeModelSaving(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def test_sequential_model_saving(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -382,6 +385,43 @@ class TestWholeModelSaving(test.TestCase):
       out2 = new_model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
+  @test_util.run_deprecated_v1
+  def test_sequential_model_saving_without_input_shape(self):
+    if h5py is None:
+      self.skipTest('h5py required to run this test')
+
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2))
+      model.add(keras.layers.RepeatVector(3))
+      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+      model.compile(
+          loss=keras.losses.MSE,
+          optimizer=keras.optimizers.RMSprop(lr=0.0001),
+          metrics=[
+              keras.metrics.categorical_accuracy,
+              keras.metrics.CategoricalAccuracy()
+          ],
+          weighted_metrics=[
+              keras.metrics.categorical_accuracy,
+              keras.metrics.CategoricalAccuracy()
+          ],
+          sample_weight_mode='temporal')
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3, 3))
+      model.train_on_batch(x, y)
+
+      out = model.predict(x)
+      fd, fname = tempfile.mkstemp('.h5', dir=self.get_temp_dir())
+      model.save(fname)
+
+      new_model = keras.models.load_model(fname)
+      os.close(fd)
+      os.remove(fname)
+
+      out2 = new_model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
   def test_sequential_model_saving_without_compile(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -406,6 +446,7 @@ class TestWholeModelSaving(test.TestCase):
       out2 = new_model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_sequential_model_saving_2(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -442,6 +483,7 @@ class TestWholeModelSaving(test.TestCase):
       out2 = model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_functional_model_saving(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -593,6 +635,7 @@ class TestWholeModelSaving(test.TestCase):
       os.close(fd)
       os.remove(fname)
 
+  @test_util.run_v1_only('b/120545219')
   def test_saving_model_with_long_weights_names(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -638,6 +681,7 @@ class TestWholeModelSaving(test.TestCase):
       os.close(fd)
       os.remove(fname)
 
+  @test_util.run_deprecated_v1
   def test_model_saving_to_pre_created_h5py_file(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -679,6 +723,25 @@ class TestWholeModelSaving(test.TestCase):
       os.close(fd)
       os.remove(fname)
 
+  def test_saving_constant_initializer_with_numpy(self):
+    if h5py is None:
+      self.skipTest('h5py required to run this test')
+
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              2,
+              input_shape=(3,),
+              kernel_initializer=keras.initializers.Constant(np.ones((3, 2)))))
+      model.add(keras.layers.Dense(3))
+      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
+      model = keras.models.load_model(fname)
+      os.close(fd)
+      os.remove(fname)
+
 
 class SubclassedModel(training.Model):
 
@@ -693,6 +756,7 @@ class SubclassedModel(training.Model):
 
 class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def test_keras_optimizer_warning(self):
     graph = ops.Graph()
     with graph.as_default(), self.session(graph):
@@ -936,5 +1000,57 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
         AssertionError, 'Nothing except the root object matched'):
       m.load_weights(save_path)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_directory_passed(self):
+    m = keras.Model()
+    v = m.add_weight(name='v', shape=[])
+    self.evaluate(v.assign(42.))
+    prefix = os.path.join(self.get_temp_dir(), '{}'.format(ops.uid()), 'ckpt/')
+    m.save_weights(prefix)
+    self.evaluate(v.assign(2.))
+    m.load_weights(prefix)
+    self.assertEqual(42., self.evaluate(v))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_relative_path(self):
+    m = keras.Model()
+    v = m.add_weight(name='v', shape=[])
+    os.chdir(self.get_temp_dir())
+
+    prefix = 'ackpt'
+    self.evaluate(v.assign(42.))
+    m.save_weights(prefix)
+    self.assertTrue(file_io.file_exists('ackpt.index'))
+    self.evaluate(v.assign(1.))
+    m.load_weights(prefix)
+    self.assertEqual(42., self.evaluate(v))
+
+    prefix = 'subdir/ackpt'
+    self.evaluate(v.assign(43.))
+    m.save_weights(prefix)
+    self.assertTrue(file_io.file_exists('subdir/ackpt.index'))
+    self.evaluate(v.assign(2.))
+    m.load_weights(prefix)
+    self.assertEqual(43., self.evaluate(v))
+
+    prefix = 'ackpt/'
+    self.evaluate(v.assign(44.))
+    m.save_weights(prefix)
+    self.assertTrue(file_io.file_exists('ackpt/.index'))
+    self.evaluate(v.assign(3.))
+    m.load_weights(prefix)
+    self.assertEqual(44., self.evaluate(v))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_nonexistant_prefix_directory(self):
+    m = keras.Model()
+    v = m.add_weight(name='v', shape=[])
+    self.evaluate(v.assign(42.))
+    prefix = os.path.join(self.get_temp_dir(), '{}'.format(ops.uid()), 'bckpt')
+    m.save_weights(prefix)
+    self.evaluate(v.assign(2.))
+    m.load_weights(prefix)
+    self.assertEqual(42., self.evaluate(v))
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 9f4019e29cef4404f0976b374919584ec083c76b..3255613f6af07988e874339b96002355e39e6d14 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -25,6 +25,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import layers as layer_module
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras.engine.network import Network
@@ -119,6 +120,10 @@ class Sequential(Model):
       return layers[1:]
     return layers[:]
 
+  @property
+  def _static_graph_friendly(self):
+    return all(layer._static_graph_friendly for layer in self.layers)
+
   @checkpointable.no_automatic_dependency_tracking
   def add(self, layer):
     """Adds a layer instance on top of the layer stack.
@@ -146,7 +151,7 @@ class Sequential(Model):
         assert len(layer._inbound_nodes[-1].output_tensors) == 1
         set_inputs = True
       else:
-        batch_shape, dtype = get_input_shape_and_dtype(layer)
+        batch_shape, dtype = training_utils.get_input_shape_and_dtype(layer)
         if batch_shape:
           # Instantiate an input layer.
           x = Input(
@@ -208,20 +213,16 @@ class Sequential(Model):
       self._init_graph_network(self.inputs, self.outputs, name=self.name)
       self.built = True
 
+  @base_layer.default
   def build(self, input_shape=None):
     if self._is_graph_network:
       self._init_graph_network(self.inputs, self.outputs, name=self.name)
     else:
       if input_shape is None:
         raise ValueError('You must provide an `input_shape` argument.')
+      input_shape = tuple(input_shape)
       self._build_input_shape = input_shape
-      shape = input_shape
-      for layer in self.layers:
-        if not layer.built:
-          with ops.name_scope(layer._name_scope()):
-            layer.build(shape)
-          layer.built = True
-        shape = layer.compute_output_shape(shape)
+      super(Sequential, self).build(input_shape)
     self.built = True
 
   def call(self, inputs, training=None, mask=None):
@@ -233,8 +234,8 @@ class Sequential(Model):
     return outputs
 
   def _call_and_compute_mask(self, inputs, training=None, mask=None):
-    if not self.built:
-      self.build(inputs.shape)
+    if not self.built and self._is_graph_network:
+      self._init_graph_network(self.inputs, self.outputs, name=self.name)
 
     x = inputs
     for layer in self.layers:
@@ -247,6 +248,11 @@ class Sequential(Model):
       if isinstance(layer, Network) and layer._compute_output_and_mask_jointly:
         x, mask = layer._call_and_compute_mask(x, **kwargs)
       else:
+        if not layer.built:
+          # Build layer if applicable.
+          with ops.name_scope(layer._name_scope()):
+            layer._maybe_build(x)
+          layer.built = True
         x = layer.call(x, **kwargs)
         if layer.supports_masking:
           mask = layer.compute_mask(x, mask)
@@ -308,6 +314,10 @@ class Sequential(Model):
     else:
       return (proba > 0.5).astype('int32')
 
+  def save(self, filepath, overwrite=True, include_optimizer=True):
+    from tensorflow.python.keras.models import save_model  # pylint: disable=g-import-not-at-top
+    save_model(self, filepath, overwrite, include_optimizer)
+
   def get_config(self):
     layer_configs = []
     for layer in self.layers:
@@ -340,39 +350,13 @@ class Sequential(Model):
       model.add(layer)
     if not model.inputs and build_input_shape:
       model.build(build_input_shape)
+    if not model._is_graph_network:
+      # Still needs to be built when passed input data.
+      model.built = False
     return model
 
-
-def get_input_shape_and_dtype(layer):
-  """Retrieve input shape and input dtype of layer if applicable.
-
-  Args:
-    layer: Layer (or model) instance.
-
-  Returns:
-    Tuple (input_shape, input_dtype). Both could be None if the layer
-      does not have a defined input shape.
-
-  Raises:
-    ValueError: in case an empty Sequential or Graph Network is passed.
-  """
-  if ((isinstance(layer, Model) and layer._is_graph_network)
-      or isinstance(layer, Sequential)):
-    # We were passed a model as first layer.
-    # This requires a specific way to figure out the
-    # input shape and dtype.
-    if not layer.layers:
-      raise ValueError('Cannot add an empty model '
-                       'to a `Sequential` model.')
-    # In case of nested models: recover the first layer
-    # of the deepest model to infer input shape and dtype.
-    layer = layer.layers[0]
-    while ((isinstance(layer, Model) and layer._is_graph_network)
-           or isinstance(layer, Sequential)):
-      layer = layer.layers[0]
-
-  if hasattr(layer, '_batch_input_shape'):
-    batch_shape = layer._batch_input_shape
-    dtype = layer.dtype
-    return batch_shape, dtype
-  return None, None
+  @property
+  def input_spec(self):
+    if self.layers and hasattr(self.layers[0], 'input_spec'):
+      return self.layers[0].input_spec
+    return None
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 9d615c9b0cbd45d1e852af2d1d3366cb19fac281..10f69da061c336cd1727ce4d34f1637e21329f3a 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -23,19 +23,21 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import rmsprop
 
 
-class TestSequential(test.TestCase, parameterized.TestCase):
+class TestSequential(keras_parameterized.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
   """
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_basic_methods(self):
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(1, input_dim=2))
@@ -46,7 +48,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertEqual(len(model.weights), 2 * 2)
     self.assertEqual(model.get_layer(name='dp').name, 'dp')
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_pop(self):
     num_hidden = 5
     input_dim = 3
@@ -55,14 +57,16 @@ class TestSequential(test.TestCase, parameterized.TestCase):
 
     model = testing_utils.get_small_sequential_mlp(
         num_hidden, num_classes, input_dim)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((batch_size, input_dim))
     y = np.random.random((batch_size, num_classes))
     model.fit(x, y, epochs=1)
     model.pop()
     self.assertEqual(len(model.layers), 1)
     self.assertEqual(model.output_shape, (None, num_hidden))
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
     y = np.random.random((batch_size, num_hidden))
     model.fit(x, y, epochs=1)
 
@@ -78,7 +82,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     with self.assertRaises(TypeError):
       model.pop()
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_deferred_build_with_np_arrays(self):
     num_hidden = 5
     input_dim = 3
@@ -89,7 +93,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=[keras.metrics.CategoricalAccuracy()])
+        metrics=[keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
     self.assertEqual(len(model.weights), 0)
     self.assertFalse(model.built)
@@ -101,7 +106,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertFalse(model._is_graph_network)
     self.assertEqual(len(model.weights), 2 * 2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_deferred_build_with_dataset_iterators(self):
     num_hidden = 5
     input_dim = 3
@@ -113,7 +118,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=[keras.metrics.CategoricalAccuracy()])
+        metrics=[keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
     self.assertEqual(len(model.weights), 0)
     self.assertFalse(model.built)
@@ -123,14 +129,16 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     model.fit(iterator, epochs=1, steps_per_epoch=steps_per_epoch)
     self.assertTrue(model.built)
     self.assertEqual(len(model.weights), 2 * 2)
     self.assertFalse(model._is_graph_network)
 
+  # TODO(kaftan) This test fails w/ run_with_all_keras_modes. File ticket
   @parameterized.parameters((True,), (False,))
+  @tf_test_util.run_deprecated_v1
   def test_training_and_eval_methods_on_symbolic_tensors(self, deferred):
     with self.cached_session():
 
@@ -173,7 +181,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
           validation_data=(inputs, targets),
           validation_steps=2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_invalid_use_cases(self):
     # Added objects must be layer instances
     with self.assertRaises(TypeError):
@@ -197,7 +205,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
       model.add(keras.layers.Dense(1, input_dim=1))
       model.add(MyLayer())
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_nested_sequential_trainability(self):
     input_dim = 20
     num_units = 10
@@ -218,6 +226,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     inner_model.trainable = True
     self.assertEqual(len(model.trainable_weights), 4)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_sequential_update_disabling(self):
     val_a = np.random.random((10, 4))
     val_out = np.random.random((10, 4))
@@ -246,7 +255,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
       x2 = model.predict(val_a)
       assert np.abs(np.sum(x1 - x2)) > 1e-5
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_deferred_build_serialization(self):
     num_hidden = 5
     input_dim = 3
@@ -257,7 +266,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=[keras.metrics.CategoricalAccuracy()])
+        metrics=[keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
     self.assertFalse(model.built)
 
     x = np.random.random((batch_size, input_dim))
@@ -269,17 +279,16 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertIn('build_input_shape', config)
 
     new_model = keras.models.Sequential.from_config(config)
-    self.assertTrue(new_model.built)
-    self.assertEqual(len(model.layers), 2)
-    self.assertEqual(len(model.weights), 4)
+    self.assertEqual(len(new_model.layers), 2)
+    self.assertEqual(len(new_model.weights), 4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_shape_inference_deferred(self):
     model = testing_utils.get_small_sequential_mlp(4, 5)
     output_shape = model.compute_output_shape((None, 7))
     self.assertEqual(tuple(output_shape.as_list()), (None, 5))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_build_deferred(self):
     model = testing_utils.get_small_sequential_mlp(4, 5)
 
@@ -294,21 +303,21 @@ class TestSequential(test.TestCase, parameterized.TestCase):
 
     model.build((None, 10))
     self.assertTrue(model.built)
-    self.assertTrue(model.layers[-1].built)
     self.assertEqual(len(model.weights), 8)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_nesting(self):
     model = testing_utils.get_small_sequential_mlp(4, 3)
     inner_model = testing_utils.get_small_sequential_mlp(4, 5)
     model.add(inner_model)
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_variable_names(self):
     model = keras.models.Sequential([keras.layers.Dense(3)])
     model.add(keras.layers.Dense(2))
@@ -318,10 +327,19 @@ class TestSequential(test.TestCase, parameterized.TestCase):
          'sequential/dense_1/kernel:0', 'sequential/dense_1/bias:0'],
         [v.name for v in model.variables])
 
+  @keras_parameterized.run_all_keras_modes
+  def test_input_assumptions_propagation(self):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(1))
+    if context.executing_eagerly():
+      with self.assertRaisesRegexp(ValueError,
+                                   'expected min_ndim=2, found ndim=0'):
+        model(1.0)
+
 
-class TestSequentialEagerIntegration(test.TestCase):
+class TestSequentialEagerIntegration(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_defun_on_call(self):
     # Check that one can subclass Sequential and place the `call` in a `defun`.
 
@@ -335,17 +353,19 @@ class TestSequentialEagerIntegration(test.TestCase):
     model.add(keras.layers.Dense(4, activation='relu'))
     model.add(keras.layers.Dense(5, activation='softmax'))
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_build_before_fit(self):
     # Fix for b/112433577
     model = testing_utils.get_small_sequential_mlp(4, 5)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     model.build((None, 6))
 
@@ -353,6 +373,26 @@ class TestSequentialEagerIntegration(test.TestCase):
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
 
+  @keras_parameterized.run_all_keras_modes
+  def test_sequential_model_fails_with_dict_inputs(self):
+    num_classes = 5
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=num_classes)
+    model.compile(
+        rmsprop.RMSPropOptimizer(learning_rate=0.001),
+        metrics=['acc'],
+        weighted_metrics=['mae'],
+        loss='categorical_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    x = {'dense_input': np.random.random((10, 1))}
+    y = np.random.randint(num_classes, size=(10, 1))
+
+    with self.assertRaisesRegexp(
+        ValueError, 'Passing a dictionary input to a Sequential Model which '
+        'doesn\'t have FeatureLayer as the first layer is an error'):
+      model.fit(x, y, batch_size=5, epochs=1)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index b4488033cda764e3f2954f151eb4cb797f33c522..4071e2c091eede29af9418105e63c157ce2dc101 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -26,7 +26,6 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import input_layer as input_layer_lib
 from tensorflow.python.keras.engine import network as network_lib
 from tensorflow.python.ops import array_ops
@@ -43,6 +42,7 @@ except ImportError:
 
 class TopologyConstructionTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_get_updates(self):
 
     class MyLayer(keras.layers.Layer):
@@ -107,6 +107,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(network.updates), 5)
     self.assertEqual(len(network.get_updates_for(x4)), 2)
 
+  @test_util.run_v1_only('b/120545219')
   def test_get_updates_bn(self):
     x1 = input_layer_lib.Input(shape=(1,))
     layer = keras.layers.BatchNormalization()
@@ -116,6 +117,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(layer.get_updates_for(x1)), 2)
     self.assertEqual(len(layer.get_updates_for(None)), 0)
 
+  @test_util.run_deprecated_v1
   def test_get_losses(self):
 
     class MyLayer(keras.layers.Layer):
@@ -269,6 +271,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(test_layer.input_shape, [(None, 32), (None, 32)])
     self.assertEqual(test_layer.output_shape, (None, 32))
 
+  @test_util.run_deprecated_v1
   def testBasicNetwork(self):
     # minimum viable network
     x = input_layer_lib.Input(shape=(32,))
@@ -342,62 +345,25 @@ class TopologyConstructionTest(test.TestCase):
     self.assertListEqual(model.trainable_weights, [])
     self.assertListEqual(model.non_trainable_weights, weights)
 
-  def test_learning_phase(self):
-    with self.cached_session():
-      a = keras.layers.Input(shape=(32,), name='input_a')
-      b = keras.layers.Input(shape=(32,), name='input_b')
-
-      a_2 = keras.layers.Dense(16, name='dense_1')(a)
-      dp = keras.layers.Dropout(0.5, name='dropout')
-      b_2 = dp(b)
-
-      self.assertFalse(a_2._uses_learning_phase)
-      self.assertTrue(b_2._uses_learning_phase)
-
-      # test merge
-      m = keras.layers.concatenate([a_2, b_2])
-      self.assertTrue(m._uses_learning_phase)
-
-      # Test recursion
-      model = keras.models.Model([a, b], [a_2, b_2])
-      self.assertTrue(model.uses_learning_phase)
-
-      c = keras.layers.Input(shape=(32,), name='input_c')
-      d = keras.layers.Input(shape=(32,), name='input_d')
-
-      c_2, b_2 = model([c, d])
-      self.assertTrue(c_2._uses_learning_phase)
-      self.assertTrue(b_2._uses_learning_phase)
-
-      # try actually running graph
-      fn = keras.backend.function(
-          model.inputs + [keras.backend.learning_phase()], model.outputs)
-      input_a_np = np.random.random((10, 32))
-      input_b_np = np.random.random((10, 32))
-      fn_outputs_no_dp = fn([input_a_np, input_b_np, 0])
-      fn_outputs_dp = fn([input_a_np, input_b_np, 1])
-      # output a: nothing changes
-      self.assertEqual(fn_outputs_no_dp[0].sum(), fn_outputs_dp[0].sum())
-      # output b: dropout applied
-      self.assertNotEqual(fn_outputs_no_dp[1].sum(), fn_outputs_dp[1].sum())
-
+  @test_util.run_deprecated_v1
   def test_layer_call_arguments(self):
     # Test the ability to pass and serialize arguments to `call`.
     inp = keras.layers.Input(shape=(2,))
     x = keras.layers.Dense(3)(inp)
     x = keras.layers.Dropout(0.5)(x, training=True)
     model = keras.models.Model(inp, x)
-    self.assertFalse(model.uses_learning_phase)
+    # Would be `dropout/cond/Merge` by default
+    self.assertTrue(model.output.op.name.endswith('dropout/mul'))
 
     # Test that argument is kept when applying the model
     inp2 = keras.layers.Input(shape=(2,))
     out2 = model(inp2)
-    self.assertFalse(out2._uses_learning_phase)
+    self.assertTrue(out2.op.name.endswith('dropout/mul'))
 
     # Test that argument is kept after loading a model
     config = model.get_config()
     model = keras.models.Model.from_config(config)
-    self.assertFalse(model.uses_learning_phase)
+    self.assertTrue(model.output.op.name.endswith('dropout/mul'))
 
   def test_node_construction(self):
     # test basics
@@ -530,6 +496,7 @@ class TopologyConstructionTest(test.TestCase):
       fn_outputs = fn([input_a_np, input_b_np])
       self.assertListEqual([x.shape for x in fn_outputs], [(10, 64), (10, 5)])
 
+  @test_util.run_deprecated_v1
   def test_recursion(self):
     with self.cached_session():
       a = keras.layers.Input(shape=(32,), name='input_a')
@@ -714,6 +681,7 @@ class TopologyConstructionTest(test.TestCase):
     with self.assertRaises(Exception):
       keras.models.Model([j, k], [m, n, 0])
 
+  @test_util.run_deprecated_v1
   def test_raw_tf_compatibility(self):
     # test calling layers/models on TF tensors
     a = keras.layers.Input(shape=(32,), name='input_a')
@@ -758,6 +726,7 @@ class TopologyConstructionTest(test.TestCase):
     model = keras.models.Model(a, b)
     self.assertEqual(model.output_mask.get_shape().as_list(), [None, 10])
 
+  @test_util.run_deprecated_v1
   def testMaskingSingleInput(self):
 
     class MaskedLayer(keras.layers.Layer):
@@ -795,6 +764,7 @@ class TopologyConstructionTest(test.TestCase):
       y_2 = network(x_2)
       self.assertEqual(y_2.get_shape().as_list(), [None, 32])
 
+  @test_util.run_deprecated_v1
   def test_activity_regularization_with_model_composition(self):
 
     def reg(x):
@@ -864,6 +834,7 @@ class TopologyConstructionTest(test.TestCase):
       output_val_2 = m2.predict(x_val)
       self.assertAllClose(output_val, output_val_2, atol=1e-6)
 
+  @test_util.run_v1_only('b/120545219')
   def test_explicit_training_argument(self):
     with self.cached_session():
       a = keras.layers.Input(shape=(2,))
@@ -934,26 +905,15 @@ class TopologyConstructionTest(test.TestCase):
 
 class DeferredModeTest(test.TestCase):
 
-  def testDeferredTensorAttributes(self):
-    x = base_layer.DeferredTensor(shape=(None, 2),
-                                  dtype='float32',
-                                  name='x')
-    self.assertEqual(str(x),
-                     'DeferredTensor(\'x\', shape=(?, 2), dtype=float32)')
-    self.assertEqual(repr(x),
-                     '<DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
-
   @test_util.run_in_graph_and_eager_modes()
   def testSimpleNetworkBuilding(self):
     inputs = input_layer_lib.Input(shape=(32,))
     if context.executing_eagerly():
-      self.assertIsInstance(inputs, base_layer.DeferredTensor)
       self.assertEqual(inputs.dtype.name, 'float32')
       self.assertEqual(inputs.shape.as_list(), [None, 32])
 
     x = keras.layers.Dense(2)(inputs)
     if context.executing_eagerly():
-      self.assertIsInstance(x, base_layer.DeferredTensor)
       self.assertEqual(x.dtype.name, 'float32')
       self.assertEqual(x.shape.as_list(), [None, 2])
 
@@ -1067,27 +1027,6 @@ class DefaultShapeInferenceBehaviorTest(test.TestCase):
     model = keras.Model(inputs, outputs)
     self._testShapeInference(model, (2, 3), (2, 4))
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testUnsupportedSignature(self):
-
-    class LayerWithAdditionalArg(keras.layers.Layer):
-
-      def build(self, input_shape):
-        self.w = array_ops.ones(shape=(3, 4))
-
-      def call(self, inputs, some_arg):
-        return keras.backend.dot(inputs, self.w) + some_arg
-
-    inputs = input_layer_lib.Input(shape=(3,))
-    if context.executing_eagerly():
-      with self.assertRaises(NotImplementedError):
-        outputs = LayerWithAdditionalArg()(inputs, some_arg=0)
-    else:
-      # Works with graph mode because the graph of ops is built together with
-      # the graph of layers.
-      outputs = LayerWithAdditionalArg()(inputs, some_arg=0)
-      _ = keras.Model(inputs, outputs)
-
   @test_util.run_in_graph_and_eager_modes()
   def testNoneInShape(self):
 
@@ -1216,6 +1155,7 @@ class DefaultShapeInferenceBehaviorTest(test.TestCase):
 
 class GraphUtilsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGetReachableFromInputs(self):
 
     with self.cached_session():
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 2ebb4cf99f939dd9e039f7fad8b59c762491bb38..462694fda690fbaa2d1474b9b1ddba558a84e201 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import weakref
 import numpy as np
 
@@ -26,12 +27,12 @@ from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.engine import training_arrays
 from tensorflow.python.keras.engine import training_distributed
@@ -41,8 +42,8 @@ from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -125,6 +126,8 @@ class Model(Network):
     # predict on a model without compiling it.
     self._distribution_strategy = None
 
+    self.run_eagerly = None
+
   def _set_sample_weight_attributes(self, sample_weight_mode,
                                     skip_target_weighing_indices):
     """Sets sample weight related attributes on the model."""
@@ -175,19 +178,66 @@ class Model(Network):
       metric_name = '%s_%s' % (self.output_names[output_index], metric_name)
     j = 1
     base_metric_name = metric_name
-    while metric_name in self.metrics_names:
+    while metric_name in self._compile_metrics_names:
       metric_name = '%s_%d' % (base_metric_name, j)
       j += 1
 
     return metric_name
 
+  @property
+  def metrics(self):
+    """Returns the model's metrics added using `compile`, `add_metric` APIs."""
+    metrics = []
+    if self._is_compiled:
+      metrics += self._compile_stateful_metric_functions
+    return metrics + super(Model, self).metrics
+
+  @property
+  def metrics_names(self):
+    """Returns the model's display labels for all outputs."""
+    metrics_names = []
+    if self._is_compiled:
+      metrics_names += self._compile_metrics_names  # Includes names of losses.
+
+    # Add metric names from layers.
+    for layer in self.layers:
+      metrics_names += [m.name for m in layer._metrics]  # pylint: disable=protected-access
+    metrics_names += [m.name for m in self._metrics]
+    return metrics_names
+
+  @property
+  def _all_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
+
+  @property
+  def _all_stateful_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_stateful_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
+
   def _init_metric_attributes(self):
     """Initialized model metric attributes."""
-    self.metrics_names = ['loss']
-    self.metrics_tensors = []
-    self.metrics_updates = []
-    self.stateful_metric_names = []
-    self.stateful_metric_functions = []
+    # List of all metric names in the model.
+    self._compile_metrics_names = ['loss']
+    # List of stateful metric functions. Used for resetting metric state during
+    # training/eval.
+    # This includes loss functions when there are multiple outputs.
+    self._compile_stateful_metric_functions = []
+    # Dict of all aggregated metric result tensors. This includes aggregated
+    # loss result tensors when there are multiple outputs.
+    self._compile_stateful_metrics_tensors = {}
+    # Dict of all metric result tensors (aggregated or not - based on the
+    # values given in compile.). This includes aggregated loss result tensors
+    # when there are multiple outputs.
+    self._compile_metrics_tensors = {}
 
   def _set_per_output_metric_attributes(self, metrics_dict, output_index):
     """Sets the metric attributes on the model for the given output.
@@ -196,33 +246,47 @@ class Model(Network):
       metrics_dict: A dict with metric names as keys and metric fns as values.
       output_index: The index of the model output for which the metric
         attributes are added.
+
+    Returns:
+      Metrics dict updated with unique metric names as keys.
     """
-    for metric_name, metric_fn in metrics_dict.items():
+    updated_metrics_dict = collections.OrderedDict()
+    for metric_name, (metric_fn, stateful_metric_fn) in metrics_dict.items():
       metric_name = self._add_unique_metric_name(metric_name, output_index)
-      # Keep track of metric name.
-      self.metrics_names.append(metric_name)
-
-      # Keep track of stateful metric attributes (name and metric function).
-      if isinstance(metric_fn, base_layer.Layer) and metric_fn.stateful:
-        self.stateful_metric_names.append(metric_name)
-        self.stateful_metric_functions.append(metric_fn)
+      updated_metrics_dict[metric_name] = (metric_fn, stateful_metric_fn)
+      # Keep track of metric name, function and stateful function.
+      self._compile_metrics_names.append(metric_name)
+      self._compile_stateful_metric_functions.append(stateful_metric_fn)
+    return updated_metrics_dict
 
   def _set_metric_attributes(self, outputs, skip_target_indices=None):
     """Sets the metric attributes on the model for all the model outputs."""
     skip_target_indices = skip_target_indices or []
+    updated_per_output_metrics = []
+    updated_per_output_weighted_metrics = []
     for i in range(len(outputs)):
       if i in skip_target_indices:
+        updated_per_output_metrics.append(self._per_output_metrics[i])
+        updated_per_output_weighted_metrics.append(
+            self._per_output_weighted_metrics[i])
         continue
-      self._set_per_output_metric_attributes(self._per_output_metrics[i], i)
-      self._set_per_output_metric_attributes(
-          self._per_output_weighted_metrics[i], i)
+      updated_per_output_metrics.append(
+          self._set_per_output_metric_attributes(self._per_output_metrics[i],
+                                                 i))
+      updated_per_output_weighted_metrics.append(
+          self._set_per_output_metric_attributes(
+              self._per_output_weighted_metrics[i], i))
+
+    self._per_output_metrics = updated_per_output_metrics
+    self._per_output_weighted_metrics = updated_per_output_weighted_metrics
 
   def _handle_per_output_metrics(self,
                                  metrics_dict,
                                  y_true,
                                  y_pred,
                                  mask,
-                                 weights=None):
+                                 weights=None,
+                                 return_stateful_result=True):
     """Calls metric functions for a single output.
 
     Arguments:
@@ -231,52 +295,50 @@ class Model(Network):
       y_pred: Predicted output.
       mask: Computed mask value for the current output.
       weights: Weights to be applied on the current output.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
 
     Returns:
       A list of metric result tensors.
     """
     metric_results = []
-    for metric_name, metric_fn in metrics_dict.items():
+    for metric_name, (metric_fn, stateful_fn) in metrics_dict.items():
       with K.name_scope(metric_name):
+
+        def _call_stateful_fn(fn):
+          return training_utils.call_metric_function(
+              fn, y_true, y_pred, weights=weights, mask=mask)
+
+        def _call_stateless_fn(fn):
+          weighted_metric_fn = training_utils.weighted_masked_objective(fn)
+          return weighted_metric_fn(y_true, y_pred, weights=weights, mask=mask)
+
+        def _track_metric_tensors(name, stateless_result, stateful_result):
+          self._compile_metrics_tensors[name] = stateless_result
+          self._compile_stateful_metrics_tensors[name] = stateful_result
+
         if isinstance(metric_fn, metrics_module.Metric):
-          # Call the stateful metric function.
-          if mask is not None:
-            mask = math_ops.cast(mask, y_pred.dtype)
-            # Update weights with mask.
-            if weights is None:
-              weights = mask
-            else:
-              # Update shape of weights if possible before adding mask.
-              # Update dimensions of weights to match with mask if possible.
-              mask, _, weights = metrics_module.squeeze_or_expand_dimensions(
-                  mask, None, weights)
-              try:
-                # Broadcast weights if possible.
-                weights = weights_broadcast_ops.broadcast_weights(weights, mask)
-              except ValueError:
-                pass
-                # TODO(psv): Handle case when mask and weight shapes are not
-                # compatible.
-              weights *= mask
-
-          metric_result = metric_fn(y_true, y_pred, weights)
+          # If the given metric fn is stateful, call the fn and return result.
+          metric_result = _call_stateful_fn(metric_fn)
+          metric_results.append(metric_result)
+          if not self.run_eagerly:
+            _track_metric_tensors(metric_name, metric_result, metric_result)
+        elif self.run_eagerly:
+          # In eager mode, if the given metric fn is not stateful, we invoke the
+          # given fn or its stateful version based on the given flag.
+          if return_stateful_result:
+            metric_result = _call_stateful_fn(stateful_fn)
+          else:
+            metric_result = _call_stateless_fn(metric_fn)
+          metric_results.append(metric_result)
         else:
-          # Call the stateless metric function.
-          weighted_metric_fn = training_utils.weighted_masked_objective(
-              metric_fn)
-          metric_result = weighted_metric_fn(
-              y_true, y_pred, weights=weights, mask=mask)
-
-        if not context.executing_eagerly():
-          # Keep track of metric result tensor.
-          self.metrics_tensors.append(metric_result)
-
-      metric_results.append(metric_result)
-      is_stateful = isinstance(metric_fn,
-                               base_layer.Layer) and metric_fn.stateful
-      if is_stateful and not context.executing_eagerly():
-        # Keep track of updates created by stateful metrics.
-        self.metrics_updates += metric_fn.updates
+          # In graph mode, we build the sub-graph for both the stateful and the
+          # stateless fns.
+          stateful_metric_result = _call_stateful_fn(stateful_fn)
+          metric_result = _call_stateless_fn(metric_fn)
+          _track_metric_tensors(metric_name, metric_result,
+                                stateful_metric_result)
+
     return metric_results
 
   def _handle_metrics(self,
@@ -284,7 +346,8 @@ class Model(Network):
                       skip_target_indices=None,
                       targets=None,
                       sample_weights=None,
-                      masks=None):
+                      masks=None,
+                      return_stateful_result=True):
     """Handles calling metric functions.
 
     Arguments:
@@ -293,6 +356,8 @@ class Model(Network):
       targets: List of targets.
       sample_weights: Optional list of sample weight arrays.
       masks: List of computed output mask values.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
 
     Returns:
       A list of metric result tensors.
@@ -300,6 +365,7 @@ class Model(Network):
     skip_target_indices = skip_target_indices or []
     metric_results = []
     with K.name_scope('metrics'):
+      # Invoke all metrics added using `compile`.
       for i in range(len(outputs)):
         if i in skip_target_indices:
           continue
@@ -307,17 +373,62 @@ class Model(Network):
         target = targets[i] if targets else None
         output_mask = masks[i] if masks else None
         metric_results.extend(
-            self._handle_per_output_metrics(self._per_output_metrics[i], target,
-                                            output, output_mask))
+            self._handle_per_output_metrics(
+                self._per_output_metrics[i],
+                target,
+                output,
+                output_mask,
+                return_stateful_result=return_stateful_result))
         metric_results.extend(
             self._handle_per_output_metrics(
                 self._per_output_weighted_metrics[i],
                 target,
                 output,
                 output_mask,
-                weights=sample_weights[i]))
+                weights=sample_weights[i],
+                return_stateful_result=return_stateful_result))
+
+    # Add metric results from the `add_metric` metrics in eager mode.
+    if context.executing_eagerly():
+      for m in self.metrics:
+        if m not in self._compile_stateful_metric_functions:
+          metric_results.append(m.result())
     return metric_results
 
+  @property
+  def run_eagerly(self):
+    """Settable attribute indicating whether the model should run eagerly.
+
+    Running eagerly means that your model will be run step by step,
+    like Python code. Your model might run slower, but it should become easier
+    for you to debug it by stepping into individual layer calls.
+
+    By default, we will attempt to compile your model to a static graph to
+    deliver the best execution performance.
+
+    Returns:
+      Boolean, whether the model should run eagerly.
+    """
+    if self._run_eagerly is True and not context.executing_eagerly():
+      raise ValueError('You can only set `run_eagerly=True` if eager execution '
+                       'is enabled.')
+    if self._static_graph_friendly:
+      if self._run_eagerly is None:
+        return False
+      else:
+        return self._run_eagerly
+    else:
+      if self._run_eagerly is False:
+        # TODO(fchollet): consider using py_func to enable this.
+        raise ValueError('Your model contains layers that can only be '
+                         'successfully run in eager execution. '
+                         'You cannot set `run_eagerly=False`.')
+      return context.executing_eagerly()
+
+  @run_eagerly.setter
+  def run_eagerly(self, value):
+    self._run_eagerly = value
+
   @checkpointable.no_automatic_dependency_tracking
   def compile(self,
               optimizer,
@@ -379,6 +490,9 @@ class Model(Network):
         ValueError: In case of invalid arguments for
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
+    run_eagerly = kwargs.pop('run_eagerly', None)
+    self._run_eagerly = run_eagerly
+
     # Validate that arguments passed by the user to `compile` are supported by
     # DistributionStrategy.
     if distribute:
@@ -387,9 +501,6 @@ class Model(Network):
         raise NotImplementedError(
             'optimizer must be an instance of '
             'tf.train.Optimizer, not a %s' % type(optimizer))
-      if context.executing_eagerly():
-        raise NotImplementedError('DistributionStrategy is not supported '
-                                  'when eager execution is enabled.')
       if sample_weight_mode:
         raise NotImplementedError('sample_weight_mode is not supported with '
                                   'DistributionStrategy.')
@@ -401,11 +512,12 @@ class Model(Network):
                          'DistributionStrategy.')
 
     loss = loss or {}
-    if context.executing_eagerly() and not isinstance(
+    if self.run_eagerly and not isinstance(
         optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
       raise ValueError(
-          'optimizer must be an instance of tf.train.Optimizer, not '
-          'a %s' % type(optimizer))
+          'When running a model in eager execution, the optimizer must be an '
+          'instance of tf.train.Optimizer. Received: '
+          '%s' % optimizer)
 
     self.optimizer = optimizers.get(optimizer)
     # We've disabled automatic dependency tracking for this method, but do want
@@ -414,12 +526,14 @@ class Model(Network):
       self._track_checkpointable(
           self.optimizer, name='optimizer', overwrite=True)
     self.loss = loss
-    self.metrics = metrics or []
+    self._compile_metrics = metrics or []
     self.loss_weights = loss_weights
     self.sample_weight_mode = sample_weight_mode
-    self.weighted_metrics = weighted_metrics
-    if context.executing_eagerly() and target_tensors is not None:
-      raise ValueError('target_tensors is not supported in Eager mode.')
+    self._compile_weighted_metrics = weighted_metrics
+    if self.run_eagerly and target_tensors is not None:
+      raise ValueError(
+          'target_tensors argument is not supported when '
+          'running a model eagerly.')
     self.target_tensors = target_tensors
 
     # Set DistributionStrategy specific parameters.
@@ -429,6 +543,8 @@ class Model(Network):
     if self._distribution_strategy is not None:
       distributed_training_utils.configure_and_create_session(
           self._distribution_strategy)
+    # Initialize model metric attributes.
+    self._init_metric_attributes()
     if not self.built:
       # Model is not compilable because it does not know its number of inputs
       # and outputs, nor their shapes and names. We will compile after the first
@@ -452,34 +568,32 @@ class Model(Network):
               '" missing from loss dictionary. We assume '
               'this was done on purpose. The fit and evaluate APIs will not be '
               'expecting any data to be passed to "' + name + '".')
-        loss_functions.append(losses.get(loss.get(name)))
+        loss_functions.append(training_utils.get_loss_function(loss.get(name)))
     elif isinstance(loss, list):
       if len(loss) != len(self.outputs):
         raise ValueError('When passing a list as loss, '
                          'it should have one entry per model outputs. '
                          'The model has ' + str(len(self.outputs)) +
                          ' outputs, but you passed loss=' + str(loss))
-      loss_functions = [losses.get(l) for l in loss]
+      loss_functions = [training_utils.get_loss_function(l) for l in loss]
     else:
-      loss_function = losses.get(loss)
+      loss_function = training_utils.get_loss_function(loss)
       loss_functions = [loss_function for _ in range(len(self.outputs))]
     self.loss_functions = loss_functions
 
-    weighted_losses = [training_utils.weighted_masked_objective(fn)
-                       for fn in loss_functions]
     skip_target_indices = []
     skip_target_weighing_indices = []
     self._feed_outputs = []
     self._feed_output_names = []
     self._feed_output_shapes = []
     self._feed_loss_fns = []
-    for i in range(len(weighted_losses)):
-      if weighted_losses[i] is None:
+    for i in range(len(loss_functions)):
+      if loss_functions[i] is None:
         skip_target_indices.append(i)
         skip_target_weighing_indices.append(i)
 
     # Prepare output masks.
-    if not context.executing_eagerly():
+    if not self.run_eagerly:
       masks = [getattr(x, '_keras_mask', None) for x in self.outputs]
       if not isinstance(masks, list):
         masks = [masks]
@@ -510,11 +624,8 @@ class Model(Network):
                       str(loss_weights) + ' - expected a list of dicts.')
     self.loss_weights_list = loss_weights_list
 
-    # Initialize model metric attributes.
-    self._init_metric_attributes()
-
     # Initialization for Eager mode execution.
-    if context.executing_eagerly():
+    if self.run_eagerly:
       # Prepare sample weights.
       self._set_sample_weight_attributes(sample_weight_mode,
                                          skip_target_weighing_indices)
@@ -527,7 +638,7 @@ class Model(Network):
       self.total_loss = None
       for i in range(len(self.outputs)):
         if len(self.outputs) > 1:
-          self.metrics_names.append(self.output_names[i] + '_loss')
+          self._compile_metrics_names.append(self.output_names[i] + '_loss')
 
       # Set metric attributes on model.
       self._set_metric_attributes(
@@ -541,125 +652,167 @@ class Model(Network):
       self._collected_trainable_weights = self.trainable_weights
       return
 
-    # Prepare targets of model.
-    self.targets = []
-    self._feed_targets = []
-    if target_tensors not in (None, []):
-      if isinstance(target_tensors, list):
-        if len(target_tensors) != len(self.outputs):
-          raise ValueError(
-              'When passing a list as `target_tensors`, '
-              'it should have one entry per model output. '
-              'The model has ' + str(len(self.outputs)) +
-              ' outputs, but you passed target_tensors=' + str(target_tensors))
-      elif isinstance(target_tensors, dict):
-        for name in target_tensors:
-          if name not in self.output_names:
+    with K.get_graph().as_default():
+      # Prepare targets of model.
+      self.targets = []
+      self._feed_targets = []
+      if target_tensors not in (None, []):
+        if isinstance(target_tensors, list):
+          if len(target_tensors) != len(self.outputs):
             raise ValueError(
-                'Unknown entry in `target_tensors` '
-                'dictionary: "' + name + '". '
-                'Only expected the following keys: ' + str(self.output_names))
-        tmp_target_tensors = []
-        for name in self.output_names:
-          tmp_target_tensors.append(target_tensors.get(name, None))
-        target_tensors = tmp_target_tensors
-      else:
-        raise TypeError('Expected `target_tensors` to be '
-                        'a list or dict, but got:', target_tensors)
-
-    for i in range(len(self.outputs)):
-      if i in skip_target_indices:
-        self.targets.append(None)
-      else:
-        shape = K.int_shape(self.outputs[i])
-        name = self.output_names[i]
-        if target_tensors not in (None, []):
-          target = target_tensors[i]
+                'When passing a list as `target_tensors`, '
+                'it should have one entry per model output. '
+                'The model has %s outputs, but you passed target_tensors=%s' %
+                (len(self.outputs), target_tensors))
+        elif isinstance(target_tensors, dict):
+          for name in target_tensors:
+            if name not in self.output_names:
+              raise ValueError(
+                  'Unknown entry in `target_tensors` '
+                  'dictionary: "' + name + '". '
+                  'Only expected the following keys: ' + str(self.output_names))
+          tmp_target_tensors = []
+          for name in self.output_names:
+            tmp_target_tensors.append(target_tensors.get(name, None))
+          target_tensors = tmp_target_tensors
+        elif tensor_util.is_tensor(target_tensors):
+          target_tensors = [target_tensors]
         else:
-          target = None
-        if target is None or K.is_placeholder(target):
-          if target is None:
-            target = K.placeholder(
-                ndim=len(shape),
-                name=name + '_target',
-                sparse=K.is_sparse(self.outputs[i]),
-                dtype=K.dtype(self.outputs[i]))
-          self._feed_targets.append(target)
-          self._feed_outputs.append(self.outputs[i])
-          self._feed_output_names.append(name)
-          self._feed_output_shapes.append(shape)
-          self._feed_loss_fns.append(self.loss_functions[i])
-        else:
-          skip_target_weighing_indices.append(i)
-        self.targets.append(target)
-
-    # Prepare sample weights.
-    self._set_sample_weight_attributes(sample_weight_mode,
-                                       skip_target_weighing_indices)
-    # Save all metric attributes per output of the model.
-    self._cache_output_metric_attributes(metrics, weighted_metrics)
-
-    # Compute total loss.
-    total_loss = None
-    with K.name_scope('loss'):
+          raise TypeError('Expected `target_tensors` to be a list or tuple or '
+                          'dict or a single tensor, but got:', target_tensors)
+
       for i in range(len(self.outputs)):
         if i in skip_target_indices:
-          continue
-        y_true = self.targets[i]
-        y_pred = self.outputs[i]
-        weighted_loss = weighted_losses[i]
-        sample_weight = self.sample_weights[i]
-        mask = masks[i]
-        loss_weight = loss_weights_list[i]
-        with K.name_scope(self.output_names[i] + '_loss'):
-          output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
-        if len(self.outputs) > 1:
-          self.metrics_tensors.append(output_loss)
-          self.metrics_names.append(self.output_names[i] + '_loss')
-        if total_loss is None:
-          total_loss = loss_weight * output_loss
+          self.targets.append(None)
         else:
-          total_loss += loss_weight * output_loss
-      if total_loss is None:
-        if not self.losses:
-          raise ValueError('The model cannot be compiled '
-                           'because it has no loss to optimize.')
-        else:
-          total_loss = 0.
-
-      # Add regularization penalties
-      # and other layer-specific losses.
-      for loss_tensor in self.losses:
-        total_loss += loss_tensor
-
-    # Set metric attributes on model.
-    self._set_metric_attributes(
-        self.outputs,
-        skip_target_indices=skip_target_indices,
-    )
-    # Invoke metric functions for all the outputs.
-    self._handle_metrics(
-        self.outputs,
-        masks=masks,
-        targets=self.targets,
-        skip_target_indices=skip_target_indices,
-        sample_weights=self.sample_weights)
-
-    # Prepare gradient updates and state updates.
-    self.total_loss = total_loss
-
-    # Functions for train, test and predict will
-    # be compiled lazily when required.
-    # This saves time when the user is not using all functions.
-    self._function_kwargs = kwargs
-
-    self.train_function = None
-    self.test_function = None
-    self.predict_function = None
-
-    # Collected trainable weights, sorted in topological order.
-    trainable_weights = self.trainable_weights
-    self._collected_trainable_weights = trainable_weights
+          shape = K.int_shape(self.outputs[i])
+          name = self.output_names[i]
+          if target_tensors not in (None, []):
+            target = target_tensors[i]
+          else:
+            target = None
+          if target is None or K.is_placeholder(target):
+            if target is None:
+              target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get(
+                  self.loss_functions[i],
+                  K.dtype(self.outputs[i]))
+
+              target = K.placeholder(
+                  ndim=len(shape),
+                  name=name + '_target',
+                  sparse=K.is_sparse(self.outputs[i]),
+                  dtype=target_dtype)
+            self._feed_targets.append(target)
+            self._feed_outputs.append(self.outputs[i])
+            self._feed_output_names.append(name)
+            self._feed_output_shapes.append(shape)
+            self._feed_loss_fns.append(self.loss_functions[i])
+          else:
+            skip_target_weighing_indices.append(i)
+          self.targets.append(target)
+
+      # Prepare sample weights.
+      self._set_sample_weight_attributes(sample_weight_mode,
+                                         skip_target_weighing_indices)
+      # Save all metric attributes per output of the model.
+      self._cache_output_metric_attributes(metrics, weighted_metrics)
+
+      # Compute total loss.
+      total_loss = None
+      with K.name_scope('loss'):
+        for i in range(len(self.outputs)):
+          if i in skip_target_indices:
+            continue
+          y_true = self.targets[i]
+          y_pred = self.outputs[i]
+          loss_fn = loss_functions[i]
+          sample_weight = self.sample_weights[i]
+          mask = masks[i]
+          loss_weight = loss_weights_list[i]
+          with K.name_scope(self.output_names[i] + '_loss'):
+            if isinstance(loss_fn, losses.Loss):
+              if mask is not None:
+                mask = math_ops.cast(mask, y_pred.dtype)
+                # Update weights with mask.
+                if sample_weight is None:
+                  sample_weight = mask
+                else:
+                  # Update dimensions of weights to match with mask if possible.
+                  mask, _, sample_weight = squeeze_or_expand_dimensions(
+                      mask, None, sample_weight)
+                  sample_weight *= mask
+              output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
+            else:
+              weighted_loss = training_utils.weighted_masked_objective(loss_fn)
+              output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
+
+          if len(self.outputs) > 1:
+            # Keep track of the un-aggregated loss result tensor.
+            self._compile_metrics_tensors[self.output_names[i] +
+                                          '_loss'] = output_loss
+
+            # Keep track of stateful result tensor and function for the loss.
+            loss_name = loss_fn.name if isinstance(
+                loss_fn, losses.Loss) else loss_fn.__name__
+            mean_wrapped_loss = metrics_module.MeanMetricWrapper(
+                loss_fn, name=loss_name)
+            result_tensor = training_utils.call_metric_function(
+                mean_wrapped_loss,
+                y_true,
+                y_pred,
+                weights=sample_weight,
+                mask=mask)
+            self._compile_stateful_metrics_tensors[self.output_names[i] +
+                                                   '_loss'] = result_tensor
+            self._compile_stateful_metric_functions.append(mean_wrapped_loss)
+
+            self._compile_metrics_names.append(self.output_names[i] + '_loss')
+          if total_loss is None:
+            total_loss = loss_weight * output_loss
+          else:
+            total_loss += loss_weight * output_loss
+        if total_loss is None:
+          if not self.losses:
+            raise ValueError('The model cannot be compiled '
+                             'because it has no loss to optimize.')
+          else:
+            total_loss = 0.
+
+        # Add regularization penalties
+        # and other layer-specific losses.
+        for loss_tensor in self.losses:
+          total_loss += loss_tensor
+
+      # Set metric attributes on model.
+      self._set_metric_attributes(
+          self.outputs,
+          skip_target_indices=skip_target_indices,
+      )
+      # Invoke metric functions for all the outputs.
+      self._handle_metrics(
+          self.outputs,
+          masks=masks,
+          targets=self.targets,
+          skip_target_indices=skip_target_indices,
+          sample_weights=self.sample_weights)
+
+      # Prepare gradient updates and state updates.
+      self.total_loss = total_loss
+
+      # Functions for train, test and predict will
+      # be compiled lazily when required.
+      # This saves time when the user is not using all functions.
+      self._function_kwargs = kwargs
+
+      self._fit_function = None
+      self._eval_function = None
+      self.train_function = None
+      self.test_function = None
+      self.predict_function = None
+
+      # Collected trainable weights, sorted in topological order.
+      trainable_weights = self.trainable_weights
+      self._collected_trainable_weights = trainable_weights
 
   def _check_trainable_weights_consistency(self):
     """Check trainable weights count consistency.
@@ -674,75 +827,123 @@ class Model(Network):
       return
 
     if len(self.trainable_weights) != len(self._collected_trainable_weights):
-      logging.warning(
-          UserWarning(
-              'Discrepancy between trainable weights and collected trainable'
-              ' weights, did you set `model.trainable` without calling'
-              ' `model.compile` after ?'))
+      logging.log_first_n(
+          logging.WARN, 'Discrepancy between trainable weights and collected'
+          ' trainable weights, did you set `model.trainable`'
+          ' without calling `model.compile` after ?', 1)
 
-  def _make_train_function(self):
-    if not hasattr(self, 'train_function'):
+  def _make_train_function_helper(self, fn_name, outputs, metric_updates=None):
+    if not hasattr(self, fn_name):
       raise RuntimeError('You must compile your model before using it.')
     self._check_trainable_weights_consistency()
-    if self.train_function is None:
+    if getattr(self, fn_name) is None:
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        inputs += [K.learning_phase()]
+      if not isinstance(K.symbolic_learning_phase(), int):
+        inputs += [K.symbolic_learning_phase()]
+
+      with K.get_graph().as_default():
+        with K.name_scope('training'):
+          with K.name_scope(self.optimizer.__class__.__name__):
+            # Training updates
+            updates = self.optimizer.get_updates(
+                params=self._collected_trainable_weights, loss=self.total_loss)
+      # Unconditional updates
+      updates += self.get_updates_for(None)
+      # Conditional updates relevant to this model
+      updates += self.get_updates_for(self.inputs)
+      # Add stateful metrics updates.
+      if metric_updates is not None:
+        updates += metric_updates
 
       with K.name_scope('training'):
-        with K.name_scope(self.optimizer.__class__.__name__):
-          # Training updates
-          updates = self.optimizer.get_updates(
-              params=self._collected_trainable_weights, loss=self.total_loss)
-        # Unconditional updates
-        updates += self.get_updates_for(None)
-        # Conditional updates relevant to this model
-        updates += self.get_updates_for(self.inputs)
-        # Stateful metrics updates
-        updates += self.metrics_updates
         # Gets loss and metrics. Updates weights at each call.
-        self.train_function = K.function(
-            inputs, [self.total_loss] + self.metrics_tensors,
+        fn = K.function(
+            inputs,
+            outputs,
             updates=updates,
             name='train_function',
             **self._function_kwargs)
+        setattr(self, fn_name, fn)
 
-  def _make_test_function(self):
-    if not hasattr(self, 'test_function'):
+  def _make_train_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_train_function_helper('train_function',
+                                     [self.total_loss] + metrics_tensors)
+
+  def _make_fit_function(self):
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_train_function_helper(
+        '_fit_function', [self.total_loss] + metrics_tensors)
+
+  def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
+    if not hasattr(self, fn_name):
       raise RuntimeError('You must compile your model before using it.')
-    if self.test_function is None:
+    if getattr(self, fn_name) is None:
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        inputs += [K.learning_phase()]
-      # Return loss and metrics, no gradient updates.
-      # Does update the network states.
-      self.test_function = K.function(
-          inputs, [self.total_loss] + self.metrics_tensors,
-          updates=self.state_updates + self.metrics_updates,
-          name='test_function',
-          **self._function_kwargs)
+
+      with K.name_scope('evaluation'):
+        updates = self.state_updates
+        # Add stateful metrics updates.
+        if metric_updates is not None:
+          updates += metric_updates
+        # Return loss and metrics, no gradient updates.
+        # Does update the network states.
+        fn = K.function(
+            inputs,
+            outputs,
+            updates=updates,
+            name='test_function',
+            **self._function_kwargs)
+        setattr(self, fn_name, fn)
+
+  def _make_test_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_test_function_helper('test_function',
+                                    [self.total_loss] + metrics_tensors)
+
+  def _make_eval_function(self):
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_test_function_helper(
+        '_eval_function', [self.total_loss] + metrics_tensors)
 
   def _make_predict_function(self):
     if not hasattr(self, 'predict_function'):
       self.predict_function = None
     if self.predict_function is None:
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        inputs = self._feed_inputs + [K.learning_phase()]
-      else:
-        inputs = self._feed_inputs
+      inputs = self._feed_inputs
       # Gets network outputs. Does not update weights.
       # Does update the network states.
       kwargs = getattr(self, '_function_kwargs', {})
-      self.predict_function = K.function(
-          inputs,
-          self.outputs,
-          updates=self.state_updates,
-          name='predict_function',
-          **kwargs)
+      with K.name_scope('predict'):
+        self.predict_function = K.function(
+            inputs,
+            self.outputs,
+            updates=self.state_updates,
+            name='predict_function',
+            **kwargs)
+
+  def _make_execution_function(self, mode):
+    if mode == 'train':
+      self._make_fit_function()
+      return self._fit_function
+    if mode == 'test':
+      self._make_eval_function()
+      return self._eval_function
+    if mode == 'predict':
+      self._make_predict_function()
+      return self.predict_function
 
   def _get_iterator_get_next_tensors(self, iterator):
     get_next_op = self._iterator_get_next.get(iterator, None)
@@ -760,7 +961,8 @@ class Model(Network):
                                           check_steps=False,
                                           steps_name='steps',
                                           steps=None,
-                                          validation_split=0):
+                                          validation_split=0,
+                                          shuffle=False):
     """Runs validation checks on input and target data passed by the user.
 
     This is called when using DistributionStrategy to train, evaluate or serve
@@ -783,6 +985,7 @@ class Model(Network):
         execute.
       validation_split: Float between 0 and 1.
         Fraction of the training data to be used as validation data.
+      shuffle: Boolean whether to shuffle the training data before each epoch.
 
     Returns:
       Iterator for reading the dataset `x`.
@@ -791,29 +994,28 @@ class Model(Network):
       ValueError: In case of invalid user-provided data.
       RuntimeError: If the model was never compiled.
     """
-    if sample_weight is not None and sample_weight.all():
-      raise NotImplementedError('`sample_weight` is currently not supported '
-                                'when using DistributionStrategy.')
     if class_weight:
       raise NotImplementedError('`class_weight` is currently not supported '
                                 'when using DistributionStrategy.')
 
+    if (sample_weight is not None and sample_weight.all() and
+        distributed_training_utils.is_tpu_strategy(
+            self._distribution_strategy)):
+      raise NotImplementedError('`sample_weight` is currently not supported '
+                                'when using TPUStrategy.')
+
     # Validates `steps` argument right at the beginning since we use it to
     # construct the dataset object.
-    # TODO(anjalisridhar): This may not be a valid error since we now accept
-    # numpy array inputs. We still want to assert that we have a populated steps
-    # parameter.
-    if check_steps:
-      if steps is None:
-        raise ValueError('When using DistributionStrategy, '
-                         'you should specify the `{steps_name}` argument.'
-                         .format(steps_name=steps_name))
+    # TODO(anjalisridhar): Remove this check once we refactor the
+    # _standardize_user_data code path. This check is already present elsewhere
+    # in the codebase.
+    if check_steps and isinstance(x, dataset_ops.DatasetV2) and steps is None:
+      raise ValueError('When using Datasets as input, '
+                       'you should specify the `{steps_name}` argument.'
+                       .format(steps_name=steps_name))
 
     first_x_value = nest.flatten(x)[0]
     if isinstance(first_x_value, np.ndarray):
-      x_shape = first_x_value.shape
-      if batch_size is None:
-        batch_size = x_shape[0] // steps
       # We need to use the drop_remainder argument to allow for a static
       # input shape which is required for TPUs.
       drop_remainder = self._distribution_strategy.require_static_shapes
@@ -822,38 +1024,41 @@ class Model(Network):
             self._distribution_strategy, x)
         var_y = distributed_training_utils.get_var_for_numpy(
             self._distribution_strategy, y)
+        if sample_weight is not None:
+          var_sample_weights = distributed_training_utils.get_var_for_numpy(
+              self._distribution_strategy, sample_weight)
+
+          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y,
+                                                      var_sample_weights))
+        else:
+          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
 
         x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
-        # TODO(anjalisridhar): What should the buffer size be?
-        x = x.shuffle(10000)
+        if shuffle:
+          # 1024 is a good buffer size since it is much larger than the average
+          # batch size provided by the user and provides sufficient randomness.
+          # One thing to keep in mind is the memory usage based on the size of
+          # each sample.
+          x = x.shuffle(1024)
         x = x.repeat()
         x = x.batch(batch_size, drop_remainder=drop_remainder)
         y = None
+        sample_weight = None
       else:
         # This case is for the predict call where the dataset only contains
-        # inputs and no targets i.e it does not return a tuple.
-        # TODO(anjalisridhar): Raise an error if we are not able to process
-        # all the predict samples. This can happen if the number of batches is
-        # not evenly divisible by the number of worker devices.
+        # inputs and no targets, i.e. it does not return a tuple
         var_x = distributed_training_utils.get_var_for_numpy(
             self._distribution_strategy, x)
         x = dataset_ops.Dataset.from_tensor_slices(var_x)
-        x = x.repeat()
         x = x.batch(batch_size, drop_remainder=drop_remainder)
 
-    # TODO(anjalisridhar): Can we use the iterator and getnext op cache?
-    # We require users to pass Datasets since we distribute the dataset across
-    # multiple devices.
-    assert isinstance(x, dataset_ops.Dataset)
-
-    # TODO(anjalisridhar): We want distribute_dataset() to accept a Dataset or a
-    # function which returns a Dataset. Currently distribute_dataset() only
-    # accepts a function that returns a Dataset. Once we add support for being
-    # able to clone a Dataset on multiple workers we can remove this lambda.
-    result = self._distribution_strategy.distribute_dataset(lambda: x)
-    iterator = result.make_initializable_iterator()
+    assert isinstance(x, dataset_ops.DatasetV2)
+
     with self._distribution_strategy.scope():
-      K.get_session().run(iterator.initializer)
+      iterator = self._distribution_strategy.make_dataset_iterator(x)
+      init_op = iterator.initialize()
+      if not context.executing_eagerly():
+        K.get_session().run(init_op)
 
     training_utils.validate_iterator_input(x, y, sample_weight,
                                            validation_split)
@@ -868,7 +1073,8 @@ class Model(Network):
                              check_steps=False,
                              steps_name='steps',
                              steps=None,
-                             validation_split=0):
+                             validation_split=0,
+                             shuffle=False):
     """Runs validation checks on input and target data passed by the user.
 
     Also standardizes the data to lists of arrays, in order.
@@ -910,6 +1116,7 @@ class Model(Network):
         execute.
       validation_split: Float between 0 and 1.
         Fraction of the training data to be used as validation data.
+      shuffle: Boolean whether to shuffle the training data before each epoch.
 
     Returns:
       A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a dict
@@ -932,17 +1139,18 @@ class Model(Network):
           check_steps=check_steps,
           steps_name=steps_name,
           steps=steps,
-          validation_split=validation_split)
+          validation_split=validation_split,
+          shuffle=shuffle)
       return iterator, None, None
 
-    if isinstance(x, dataset_ops.Dataset):
+    if isinstance(x, dataset_ops.DatasetV2):
       if context.executing_eagerly():
-        x = x.make_one_shot_iterator()
+        x = iter(x)
       else:
         if x in self._dataset_iterator_cache:
           x = self._dataset_iterator_cache[x]
         else:
-          iterator = x.make_initializable_iterator()
+          iterator = dataset_ops.make_initializable_iterator(x)
           self._dataset_iterator_cache[x] = iterator
           x = iterator
         K.get_session().run(x.initializer)
@@ -962,7 +1170,7 @@ class Model(Network):
     # For eager iterators, when we have to process multiple batches of samples,
     # we will standardize the data when we actually loop over iterator and get
     # the batches. For now, we just return the iterator as is.
-    if is_x_eager_iterator and steps is not None:
+    if is_x_eager_iterator:
       return x, y, sample_weight
 
     # If input data is a dataset iterator in graph mode or if it is an eager
@@ -991,12 +1199,57 @@ class Model(Network):
           x, y, sample_weight = next_element
       else:
         x = next_element
-    x, y, sample_weights = self._standardize_weights(x, y, sample_weight,
-                                                     class_weight, batch_size)
+    x, y, sample_weights = self._standardize_weights(
+        x, y, sample_weight, class_weight, batch_size, is_x_iterator)
     return x, y, sample_weights
 
-  def _standardize_weights(self, x, y, sample_weight=None, class_weight=None,
-                           batch_size=None,):
+  def _standardize_weights(self,
+                           x,
+                           y,
+                           sample_weight=None,
+                           class_weight=None,
+                           batch_size=None,
+                           from_iterator=False):
+    """Standardize input data, target data, and weight values.
+
+    This method reformats all data passed to the model to an ordered list of
+    array/tensors, matching the order expected by the model. This also validates
+    the input and target data shapes.
+
+    Args:
+      x: Input data. It could be:
+        - A Numpy array (or array-like), or a list of arrays
+          (in case the model has multiple inputs).
+        - A TensorFlow tensor, or a list of tensors
+          (in case the model has multiple inputs).
+        - A dict mapping input names to the corresponding array/tensors,
+          if the model has named inputs.
+        x cannot not be an iterator.
+      y: Target data. Like the input data `x`,
+        it could be either Numpy array(s) or TensorFlow tensor(s).
+        It should be consistent with `x` (you cannot have Numpy inputs and
+        tensor targets, or inversely).
+      sample_weight: An optional sample-weight array passed by the user to
+        weight the importance of each sample in `x`.
+      class_weight: An optional class-weight array by the user to
+        weight the importance of samples in `x` based on the class they belong
+        to, as conveyed by `y`.
+      batch_size: Integer batch size. If provided, it is used to run additional
+        validation checks on stateful models.
+      from_iterator: Whether x and y were obtained from an iterator.
+
+    Returns:
+      Tuple of standardized data that will be fed to the model:
+        (input data, target data, sample weights)
+
+    Raises:
+      RuntimeError: If target data is provided, but the model has not yet been
+        compiled.
+      ValueError: If the input data, target data, and batch size have invalid
+        shapes or formats (e.g. the model expects input to be a list of three
+        tensors, but x is a list with two tensors). Error is also raised if the
+        input and target data are not both arrays or tensors.
+    """
     # TODO(sourabhbajaj): Split input validation from weight standardization.
     if sample_weight is not None and class_weight is not None:
       logging.warning(
@@ -1006,6 +1259,8 @@ class Model(Network):
     all_inputs = []
     is_build_called = False
     is_compile_called = False
+    # Whether this is a subclassed model that expects dictionary inputs
+    # rather than list inputs (e.g. FeatureColumn-based models).
     dict_inputs = False
     if not self.inputs:
       # We need to use `x` to set the model inputs.
@@ -1028,13 +1283,23 @@ class Model(Network):
         all_inputs.append(x)
 
       # Build the model using the retrieved inputs (value or symbolic).
-      # If values, then in symbolic-mode placeholders will be created
-      # to match the value shapes.
+      # If values or generated from a dataset, then in symbolic-mode
+      # placeholders will be created to match the value shapes.
       if not self.inputs:
         is_build_called = True
-        self._set_inputs(x)
+        if from_iterator:
+          cast_inputs = nest.map_structure(lambda v: v.shape, x)
+        elif training_utils.has_tensors(x):
+          cast_inputs = training_utils.cast_if_floating_dtype(x)
+        else:
+          cast_inputs = x
+        self._set_inputs(cast_inputs)
     else:
       dict_inputs = isinstance(self.inputs, dict)
+    if dict_inputs and context.executing_eagerly():
+      # No support for graph functions when the model expects dictionary inputs
+      # (i.e. FeatureColumn-based models).
+      self.run_eagerly = True
 
     if y is not None:
       if not self.optimizer:
@@ -1044,6 +1309,8 @@ class Model(Network):
       if not self._is_compiled:
         # On-the-fly compilation of the model.
         # We need to use `y` to set the model targets.
+        if training_utils.has_tensors(y):
+          y = training_utils.cast_if_floating_dtype(y)
         if isinstance(y, (list, tuple)):
           if not all(isinstance(v, np.ndarray) or
                      tensor_util.is_tensor(v) for v in y):
@@ -1068,19 +1335,22 @@ class Model(Network):
                              'TensorFlow tensors. '
                              'You passed: x=' + str(x) + '; y=' + str(y))
 
-        if context.executing_eagerly():
+        if self.run_eagerly or from_iterator:
           target_tensors = None
         else:
           # Handle target tensors if any passed.
           if not isinstance(y, (list, tuple)):
             y = [y]
-          target_tensors = [v for v in y if tensor_util.is_tensor(v)]
+          target_tensors = [v for v in y if _is_symbolic_tensor(v)]
         is_compile_called = True
-        self.compile(optimizer=self.optimizer,
-                     loss=self.loss,
-                     metrics=self.metrics,
-                     loss_weights=self.loss_weights,
-                     target_tensors=target_tensors)
+        self.compile(
+            optimizer=self.optimizer,
+            loss=self.loss,
+            metrics=self._compile_metrics,
+            weighted_metrics=self._compile_weighted_metrics,
+            loss_weights=self.loss_weights,
+            target_tensors=target_tensors,
+            run_eagerly=self.run_eagerly)
 
     # In graph mode, if we had just set inputs and targets as symbolic tensors
     # by invoking build and compile on the model respectively, we do not have to
@@ -1088,15 +1358,14 @@ class Model(Network):
     # part of the graph.
     # Note: in this case, `any` and `all` are equivalent since we disallow
     # mixed symbolic/value inputs.
-    if (not context.executing_eagerly() and is_build_called and
-        is_compile_called and
-        any(tensor_util.is_tensor(v) for v in all_inputs)):
+    if (not self.run_eagerly and is_build_called and is_compile_called and
+        not from_iterator and any(_is_symbolic_tensor(v) for v in all_inputs)):
       return [], [], []
 
     # What follows is input validation and standardization to list format,
     # in the case where all inputs are value arrays.
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       # In eager mode, do not do shape validation
       # since the network has no input nodes (placeholders) to be fed.
       feed_input_names = self.input_names
@@ -1152,7 +1421,9 @@ class Model(Network):
       y = training_utils.standardize_input_data(
           y,
           feed_output_names,
-          feed_output_shapes,
+          # Don't enforce target shapes to match output shapes.
+          # Precise checks will be run in `check_loss_and_target_compatibility`.
+          shapes=None,
           check_batch_axis=False,  # Don't enforce the batch size.
           exception_prefix='target')
 
@@ -1170,7 +1441,7 @@ class Model(Network):
       # Check that all arrays have the same length.
       if not self._distribution_strategy:
         training_utils.check_array_lengths(x, y, sample_weights)
-        if self._is_graph_network and not context.executing_eagerly():
+        if self._is_graph_network and not self.run_eagerly:
           # Additional checks to avoid users mistakenly using improper loss fns.
           training_utils.check_loss_and_target_compatibility(
               y, self._feed_loss_fns, feed_output_shapes)
@@ -1194,7 +1465,7 @@ class Model(Network):
     return x, y, sample_weights
 
   @checkpointable.no_automatic_dependency_tracking
-  def _set_inputs(self, inputs, training=None):
+  def _set_inputs(self, inputs, outputs=None, training=None):
     """Set model's input and output specs based on the input data received.
 
     This is to be used for Model subclasses, which do not know at instantiation
@@ -1202,14 +1473,17 @@ class Model(Network):
 
     Args:
       inputs: Single array, or list of arrays. The arrays could be placeholders,
-        Numpy arrays, or data tensors.
+        Numpy arrays, data tensors, or TensorShapes.
         - if placeholders: the model is built on top of these placeholders,
           and we expect Numpy data to be fed for them when calling `fit`/etc.
-        - if Numpy data: we create placeholders matching the shape of the Numpy
-          arrays. We expect Numpy data to be fed for these placeholders
-          when calling `fit`/etc.
+        - if Numpy data or TensorShapes: we create placeholders matching the
+          TensorShapes or shapes of the Numpy arrays. We expect Numpy data to be
+          fed for these placeholders when calling `fit`/etc.
         - if data tensors: the model is built on top of these tensors.
           We do not expect any Numpy data to be provided when calling `fit`/etc.
+      outputs: None, a data tensor, or a list of tensors. If None, the
+        outputs will be determined by invoking `self.call()`, otherwise the
+        provided value will be used.
       training: Boolean or None. Only relevant in symbolic mode. Specifies
         whether to build the model's graph in inference mode (False), training
         mode (True), or using the Keras learning phase (None).
@@ -1217,103 +1491,29 @@ class Model(Network):
       ValueError: If dict inputs are passed to a Sequential Model where the
         first layer isn't FeatureLayer.
     """
-    call_convention = getattr(
-        self,
-        '_call_convention',
-        base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT)
-    if call_convention not in (
-        base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT,
-        base_layer.CallConvention.SINGLE_POSITIONAL_ARGUMENT):
-      raise NotImplementedError(
-          'Subclassed Models without "inputs" (or single positional arguments) '
-          'in their call() signatures do not yet support shape inference. File '
-          'a feature request if this limitation bothers you.')
-    if self.__class__.__name__ == 'Sequential':
+    if self.inputs:
+      raise ValueError('Model inputs are already set.')
+
+    if self.__class__.__name__ == 'Sequential' and not self.built:
       if tensor_util.is_tensor(inputs):
-        input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
-        self.build(input_shape=input_shape)
+        input_shape = (None,) + tuple(inputs.shape.as_list()[1:])
+      elif isinstance(inputs, tensor_shape.TensorShape):
+        input_shape = (None,) + tuple(inputs.as_list()[1:])
       elif isinstance(inputs, dict):
         # We assert that the first layer is a FeatureLayer.
         if not training_utils.is_feature_layer(self.layers[0]):
           raise ValueError('Passing a dictionary input to a Sequential Model '
-                           'which doesnt have FeatureLayer as the first layer '
-                           'is an error')
+                           'which doesn\'t have FeatureLayer as the first layer'
+                           ' is an error.')
         input_shape = (None,)
-        self.build(input_shape=input_shape)
       else:
-        input_shape = (None,) + inputs.shape[1:]
-        self.build(input_shape=input_shape)
-    if context.executing_eagerly():
-      self._eager_set_inputs(inputs)
-    else:
-      self._symbolic_set_inputs(inputs, training=training)
-
-  @checkpointable.no_automatic_dependency_tracking
-  def _eager_set_inputs(self, inputs):
-    """Set model's input and output specs based on the input data received.
-
-    This is to be used for Model subclasses, which do not know at instantiation
-    time what their inputs look like.
-
-    We assume the number and ndim of outputs
-    does not change over different calls.
-
-    Args:
-      inputs: Argument `x` (input data) passed by the user upon first model use.
-
-    Raises:
-      ValueError: If the model's inputs are already set.
-    """
-    assert context.executing_eagerly()
-    if self.inputs:
-      raise ValueError('Model inputs are already set.')
-
-    # On-the-fly setting of model inputs/outputs as DeferredTensors,
-    # to keep track of number of inputs and outputs and their ndim.
-    model_inputs = training_utils.ModelInputs(inputs)
-    dummy_input_values = model_inputs.get_input_values()
-    dummy_output_values = self.call(dummy_input_values)
-
-    self.inputs = model_inputs.get_symbolic_inputs(return_single_as_list=True)
-    self.input_names = model_inputs.get_input_names()
-
-    dummy_output_values = nest.flatten(dummy_output_values)
-    self.outputs = [
-        base_layer.DeferredTensor(shape=(None
-                                         for _ in v.shape), dtype=v.dtype)
-        for v in dummy_output_values
-    ]
-    self.output_names = [
-        'output_%d' % (i + 1) for i in range(len(dummy_output_values))]
-    self.built = True
-
-  @checkpointable.no_automatic_dependency_tracking
-  def _symbolic_set_inputs(self, inputs, outputs=None, training=None):
-    """Set model's inputs and output specs based.
-
-    This is to be used for Model subclasses, which do not know at instantiation
-    time what their inputs look like.
-
-    Args:
-      inputs: Argument `x` (input data) passed by the user upon first model use.
-      outputs: None, a data tensor, or a list of data tensors. If None, the
-        outputs will be determined by invoking self.call(), otherwise the
-        provided value will be used.
-      training: Boolean or None. Only relevant in symbolic mode. Specifies
-        whether to build the model's graph in inference mode (False), training
-        mode (True), or using the Keras learning phase (None).
-
-    Raises:
-      ValueError: If the model's inputs are already set.
-    """
-    assert not context.executing_eagerly()
-    if self.inputs:
-      raise ValueError('Model inputs are already set.')
+        input_shape = (None,) + tuple(inputs.shape[1:])
+      self._build_input_shape = input_shape
 
     # On-the-fly setting of symbolic model inputs (either by using the tensor
     # provided, or by creating a placeholder if Numpy data was provided).
     model_inputs = training_utils.ModelInputs(inputs)
-    dummy_input_values = model_inputs.get_symbolic_inputs()
+    inputs = model_inputs.get_symbolic_inputs()
     self.inputs = model_inputs.get_symbolic_inputs(return_single_as_list=True)
     self.input_names = model_inputs.get_input_names()
 
@@ -1327,12 +1527,15 @@ class Model(Network):
         self._feed_input_names.append(k)
         self._feed_input_shapes.append(K.int_shape(v))
 
+    # TODO(fchollet): consider calling `_maybe_build` before calling the model.
+
     if outputs is None:
       # Obtain symbolic outputs by calling the model.
-      if self._expects_training_arg:
-        outputs = self.call(dummy_input_values, training=training)
-      else:
-        outputs = self.call(dummy_input_values)
+      with K.get_graph().as_default():
+        if self._expects_training_arg:
+          outputs = self.call(inputs, training=training)
+        else:
+          outputs = self.call(inputs)
 
     outputs = nest.flatten(outputs)
     self.outputs = outputs
@@ -1488,7 +1691,6 @@ class Model(Network):
     """
     # TODO(fchollet): this method may be creating reference cycles, which would
     # lead to accumulating garbage in memory when called in a loop. Investigate.
-
     if data_utils.is_generator_or_sequence(x):
       training_utils.check_generator_arguments(y, sample_weight)
       return self.fit_generator(
@@ -1506,9 +1708,6 @@ class Model(Network):
           shuffle=shuffle,
           initial_epoch=initial_epoch)
 
-    # Backwards compatibility
-    if batch_size is None and steps_per_epoch is None:
-      batch_size = 32
     # Legacy support
     if 'nb_epoch' in kwargs:
       logging.warning(
@@ -1520,15 +1719,21 @@ class Model(Network):
 
     # Validate and standardize user data.
     if self._distribution_strategy:
-      distributed_training_utils.validate_callbacks(callbacks)
+      distributed_training_utils.validate_callbacks(callbacks, self.optimizer,
+                                                    self._distribution_strategy)
 
       distributed_training_utils.validate_inputs(
           x, y, self._distribution_strategy)
 
       first_x_value = nest.flatten(x)[0]
-      if not steps_per_epoch and isinstance(first_x_value, np.ndarray):
-        steps_per_epoch = distributed_training_utils.get_input_batch_params(
-            first_x_value, batch_size, self._distribution_strategy)
+      if isinstance(first_x_value, np.ndarray):
+        steps_per_epoch, batch_size = (
+            distributed_training_utils.get_input_params(
+                self._distribution_strategy, first_x_value, steps_per_epoch,
+                batch_size, is_training=True))
+
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps_per_epoch,
+                                                    x)
 
     x, y, sample_weights = self._standardize_user_data(
         x,
@@ -1539,13 +1744,14 @@ class Model(Network):
         check_steps=True,
         steps_name='steps_per_epoch',
         steps=steps_per_epoch,
-        validation_split=validation_split)
+        validation_split=validation_split,
+        shuffle=shuffle)
 
     # Prepare validation data.
     if validation_data:
       if (isinstance(validation_data, iterator_ops.Iterator) or
           isinstance(validation_data, iterator_ops.EagerIterator) or
-          isinstance(validation_data, dataset_ops.Dataset)):
+          isinstance(validation_data, dataset_ops.DatasetV2)):
         val_x = validation_data
         val_y = None
         val_sample_weight = None
@@ -1568,9 +1774,10 @@ class Model(Network):
         distributed_training_utils.validate_inputs(
             val_x, val_y, self._distribution_strategy)
         first_valx_value = nest.flatten(val_x)[0]
-        if not validation_steps and isinstance(first_valx_value, np.ndarray):
-          validation_steps = distributed_training_utils.get_input_batch_params(
-              first_valx_value, batch_size, self._distribution_strategy)
+        if isinstance(first_valx_value, np.ndarray):
+          validation_steps, _ = distributed_training_utils.get_input_params(
+              self._distribution_strategy, first_valx_value, validation_steps,
+              batch_size)
 
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x,
@@ -1600,27 +1807,25 @@ class Model(Network):
       val_y = None
       val_sample_weights = None
 
-    if context.executing_eagerly():
-      return training_eager.fit_loop(
-          self,
-          inputs=x,
-          targets=y,
-          sample_weights=sample_weights,
-          class_weight=class_weight,
+    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
+                             not self._distribution_strategy)):
+      return training_generator.fit_generator(
+          self, (x, y, sample_weights),
+          steps_per_epoch=steps_per_epoch,
           batch_size=batch_size,
           epochs=epochs,
+          shuffle=shuffle,
           verbose=verbose,
           callbacks=callbacks,
-          val_inputs=val_x,
-          val_targets=val_y,
-          val_sample_weights=val_sample_weights,
-          shuffle=shuffle,
-          initial_epoch=initial_epoch,
-          steps_per_epoch=steps_per_epoch,
-          validation_steps=validation_steps)
-    elif self._distribution_strategy:
-      return training_distributed.fit_loop(
-          self, x,
+          validation_data=validation_data,
+          validation_steps=validation_steps,
+          workers=0,
+          initial_epoch=initial_epoch)
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_fit_loop(
+          self,
+          x,
           epochs=epochs,
           verbose=verbose,
           callbacks=callbacks,
@@ -1630,7 +1835,9 @@ class Model(Network):
           validation_steps=validation_steps)
     else:
       return training_arrays.fit_loop(
-          self, x, y,
+          self,
+          x,
+          y,
           sample_weights=sample_weights,
           batch_size=batch_size,
           epochs=epochs,
@@ -1733,19 +1940,16 @@ class Model(Network):
           max_queue_size=max_queue_size,
           workers=workers,
           use_multiprocessing=use_multiprocessing)
-
-    # Backwards compatibility.
-    if batch_size is None and steps is None:
-      batch_size = 32
-
     # Validate and standardize user data.
     if self._distribution_strategy:
       distributed_training_utils.validate_inputs(
           x, y, self._distribution_strategy)
       first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray) and not steps:
-        steps = distributed_training_utils.get_input_batch_params(
-            first_x_value, batch_size, self._distribution_strategy)
+      if isinstance(first_x_value, np.ndarray):
+        steps, batch_size = distributed_training_utils.get_input_params(
+            self._distribution_strategy, first_x_value, steps, batch_size)
+
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
 
     x, y, sample_weights = self._standardize_user_data(
         x,
@@ -1756,21 +1960,18 @@ class Model(Network):
         steps_name='steps',
         steps=steps)
 
-    if context.executing_eagerly():
-      return training_eager.test_loop(
-          self,
-          inputs=x,
-          targets=y,
-          sample_weights=sample_weights,
+    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
+                             not self._distribution_strategy)):
+      return training_generator.evaluate_generator(
+          self, (x, y, sample_weights),
+          steps=steps,
           batch_size=batch_size,
           verbose=verbose,
-          steps=steps)
-    elif self._distribution_strategy:
-      return training_distributed.test_loop(
-          self,
-          iterator=x,
-          verbose=verbose,
-          steps=steps)
+          workers=0)
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_test_loop(
+          self, iterator=x, verbose=verbose, steps=steps)
     else:
       return training_arrays.test_loop(
           self,
@@ -1844,77 +2045,89 @@ class Model(Network):
           max_queue_size=max_queue_size,
           workers=workers,
           use_multiprocessing=use_multiprocessing)
-
-    # Backwards compatibility.
-    if batch_size is None and steps is None:
-      batch_size = 32
-
     if self._distribution_strategy:
-      # Turn off prefetching since this is currently not deterministic. Once
-      # b/112498930 is fixed we can turn it back on.
-      # `_prefetch_on_device` is currently a property of only
-      # `MirroredStrategy`.
-      if hasattr(self._distribution_strategy, '_prefetch_on_device'):
-        self._distribution_strategy._prefetch_on_device = False  # pylint: disable=protected-access
       distributed_training_utils.validate_inputs(
           x, None, self._distribution_strategy)
       first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray) and not steps:
-        steps = distributed_training_utils.get_input_batch_params(
-            first_x_value, batch_size, self._distribution_strategy)
+      if isinstance(first_x_value, np.ndarray):
+        steps, batch_size = distributed_training_utils.get_input_params(
+            self._distribution_strategy, first_x_value, steps, batch_size)
 
-    # Validate and standardize user data.
-    # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
-    # means that we end up calculating it twice which we should avoid.
-    x, _, _ = self._standardize_user_data(
-        x, check_steps=True, steps_name='steps', steps=steps)
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
 
-    if context.executing_eagerly():
-      return training_eager.predict_loop(
-          self, x, batch_size=batch_size, verbose=verbose, steps=steps)
-    elif self._distribution_strategy:
-      results = training_distributed.predict_loop(
+    # Validate and standardize user data.
+    if self._distribution_strategy:
+      x, _, _ = self._standardize_user_data(
+          x, check_steps=True, steps_name='steps', steps=steps,
+          batch_size=batch_size)
+    else:
+      # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
+      # means we need to special case distribution strategy which needs the
+      # batch size.
+      x, _, _ = self._standardize_user_data(
+          x, check_steps=True, steps_name='steps', steps=steps)
+
+    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
+                             not self._distribution_strategy)):
+      return training_generator.predict_generator(
+          self,
+          x,
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0)
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_predict_loop(
           self, x, verbose=verbose, steps=steps)
-      # Turn prefetching back on since we turned it off previously.
-      if hasattr(self._distribution_strategy, '_prefetch_on_device'):
-        self._distribution_strategy._prefetch_on_device = True  # pylint: disable=protected-access
-      return results
     else:
       return training_arrays.predict_loop(
           self, x, batch_size=batch_size, verbose=verbose, steps=steps)
 
-  def train_on_batch(self, x, y=None, sample_weight=None, class_weight=None):
+  def reset_metrics(self):
+    """Resets the state of metrics."""
+    if hasattr(self, 'metrics'):
+      for m in self.metrics:
+        m.reset_states()
+      if self._distribution_strategy:
+        training_distributed._reset_metrics(self)  # pylint: disable=protected-access
+
+  def train_on_batch(self,
+                     x,
+                     y=None,
+                     sample_weight=None,
+                     class_weight=None,
+                     reset_metrics=True):
     """Runs a single gradient update on a single batch of data.
 
     Arguments:
         x: Input data. It could be:
           - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
+              (in case the model has multiple inputs).
           - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
+              (in case the model has multiple inputs).
           - A dict mapping input names to the corresponding array/tensors,
-            if the model has named inputs.
+              if the model has named inputs.
           - A `tf.data` dataset or a dataset iterator.
-        y: Target data. Like the input data `x`,
-          it could be either Numpy array(s) or TensorFlow tensor(s).
-          It should be consistent with `x` (you cannot have Numpy inputs and
-          tensor targets, or inversely). If `x` is a dataset or a
-          dataset iterator, `y` should not be specified
+        y: Target data. Like the input data `x`, it could be either Numpy
+          array(s) or TensorFlow tensor(s). It should be consistent with `x`
+          (you cannot have Numpy inputs and tensor targets, or inversely). If
+          `x` is a dataset or a dataset iterator, `y` should not be specified
           (since targets will be obtained from the iterator).
         sample_weight: Optional array of the same length as x, containing
-            weights to apply to the model's loss for each sample.
-            In the case of temporal data, you can pass a 2D array
-            with shape (samples, sequence_length),
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            sample_weight_mode="temporal" in compile(). This argument is not
-            supported when `x` is a dataset or a dataset iterator.
-        class_weight: Optional dictionary mapping
-            class indices (integers) to
-            a weight (float) to apply to the model's loss for the samples
-            from this class during training.
-            This can be useful to tell the model to "pay more attention" to
-            samples from an under-represented class.
+          weights to apply to the model's loss for each sample. In the case of
+          temporal data, you can pass a 2D array with shape (samples,
+          sequence_length), to apply a different weight to every timestep of
+          every sample. In this case you should make sure to specify
+          sample_weight_mode="temporal" in compile(). This argument is not
+          supported when `x` is a dataset or a dataset iterator.
+        class_weight: Optional dictionary mapping class indices (integers) to a
+          weight (float) to apply to the model's loss for the samples from this
+          class during training. This can be useful to tell the model to "pay
+          more attention" to samples from an under-represented class.
+        reset_metrics: If `True`, the metrics returned will be only for this
+          batch. If `False`, the metrics will be statefully accumulated across
+          batches.
 
     Returns:
         Scalar training loss
@@ -1933,23 +2146,30 @@ class Model(Network):
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight, class_weight=class_weight)
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       outputs = training_eager.train_on_batch(
           self, x, y, sample_weights=sample_weights)
     else:
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        ins = x + y + sample_weights + [1]
+      if not isinstance(K.symbolic_learning_phase(), int):
+        ins = x + y + sample_weights + [True]
       else:
         ins = x + y + sample_weights
 
-      self._make_train_function()
-      outputs = self.train_function(ins)
+      if reset_metrics:
+        self._make_train_function()
+        outputs = self.train_function(ins)  # pylint: disable=not-callable
+      else:
+        self._make_fit_function()
+        outputs = self._fit_function(ins)  # pylint: disable=not-callable
+
+    if reset_metrics:
+      self.reset_metrics()
 
     if len(outputs) == 1:
       return outputs[0]
     return outputs
 
-  def test_on_batch(self, x, y=None, sample_weight=None):
+  def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True):
     """Test the model on a single batch of samples.
 
     Arguments:
@@ -1975,6 +2195,9 @@ class Model(Network):
             In this case you should make sure to specify
             sample_weight_mode="temporal" in compile(). This argument is not
             supported when `x` is a dataset or a dataset iterator.
+        reset_metrics: If `True`, the metrics returned will be only for this
+          batch. If `False`, the metrics will be statefully accumulated across
+          batches.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -1992,16 +2215,20 @@ class Model(Network):
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight)
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       outputs = training_eager.test_on_batch(
           self, x, y, sample_weights=sample_weights)
     else:
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        ins = x + y + sample_weights + [0]
+      inputs = x + y + sample_weights
+      if reset_metrics:
+        self._make_test_function()
+        outputs = self.test_function(inputs)  # pylint: disable=not-callable
       else:
-        ins = x + y + sample_weights
-      self._make_test_function()
-      outputs = self.test_function(ins)
+        self._make_eval_function()
+        outputs = self._eval_function(inputs)  # pylint: disable=not-callable
+
+    if reset_metrics:
+      self.reset_metrics()
 
     if len(outputs) == 1:
       return outputs[0]
@@ -2030,28 +2257,21 @@ class Model(Network):
                                 'models compiled with DistributionStrategy.')
     # Validate and standardize user data.
     inputs, _, _ = self._standardize_user_data(x)
-    if context.executing_eagerly():
-      if (isinstance(x, iterator_ops.EagerIterator) or
-          (isinstance(x, dataset_ops.Dataset) and context.executing_eagerly())):
+    if self.run_eagerly:
+      if (isinstance(inputs, iterator_ops.EagerIterator) or
+          (isinstance(inputs, dataset_ops.DatasetV2))):
         inputs = training_utils.cast_if_floating_dtype(inputs)
-      else:
+      elif isinstance(inputs, collections.Sequence):
         inputs = [
-            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs
-        ]
+            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs]
       return self(inputs)  # pylint: disable=not-callable
 
-    if not context.executing_eagerly():
-      if self.uses_learning_phase and not isinstance(K.learning_phase(), int):
-        ins = inputs + [0]
-      else:
-        ins = inputs
-
-      self._make_predict_function()
-      outputs = self.predict_function(ins)
+    self._make_predict_function()
+    outputs = self.predict_function(inputs)
 
-      if len(outputs) == 1:
-        return outputs[0]
-      return outputs
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
 
   def fit_generator(self,
                     generator,
@@ -2161,11 +2381,6 @@ class Model(Network):
     if self._distribution_strategy:
       raise NotImplementedError('`fit_generator` is not supported for '
                                 'models compiled with DistributionStrategy.')
-
-    if not self.built and not self._is_graph_network:
-      raise NotImplementedError(
-          '`fit_generator` is not yet enabled for unbuilt Model subclasses')
-
     return training_generator.fit_generator(
         self,
         generator,
@@ -2232,12 +2447,6 @@ class Model(Network):
     if self._distribution_strategy:
       raise NotImplementedError('`evaluate_generator` is not supported for '
                                 'models compiled with DistributionStrategy.')
-
-    if not self.built and not self._is_graph_network:
-      raise NotImplementedError(
-          '`evaluate_generator` is not yet enabled for '
-          'unbuilt Model subclasses')
-
     return training_generator.evaluate_generator(
         self,
         generator,
@@ -2289,11 +2498,6 @@ class Model(Network):
     if self._distribution_strategy:
       raise NotImplementedError('`predict_generator` is not supported for '
                                 'models compiled with DistributionStrategy.')
-
-    if not self.built and not self._is_graph_network:
-      raise NotImplementedError(
-          '`predict_generator` is not yet enabled for unbuilt Model subclasses')
-
     return training_generator.predict_generator(
         self,
         generator,
@@ -2318,22 +2522,71 @@ class Model(Network):
       return self.callback_model
     return self
 
-  def _make_callback_model(self):
+  def _make_callback_model(self, grouped_model):
     first_replicated_model = self._distribution_strategy.unwrap(
-        self._grouped_model)[0]
+        grouped_model)[0]
     # We initialize the callback model with the first replicated model.
     self._replicated_model = DistributedCallbackModel(first_replicated_model)
     self._replicated_model.set_original_model(self)
 
+  def _validate_or_infer_batch_size(self, batch_size, steps, x):
+    """Validates that the `batch_size` provided is consistent with InputLayer.
+
+    It's possible that the user specified a static batch size in their
+    InputLayer. If so, this method checks the provided `batch_size` and `x`
+    arguments are consistent with this static batch size. Also, if
+    `batch_size` is `None`, this method will attempt to infer the batch size
+    from the static batch size of the InputLayer.
+
+    Arguments:
+      batch_size: The batch_size provided as an argument to
+        fit/evaluate/predict.
+      steps: The steps provided as an argument to fit/evaluate/predict.
+      x: The data passed as `x` to fit/evaluate/predict.
+
+    Returns:
+      The validated batch_size, auto-inferred from the first layer if not
+      provided.
+    """
+    layers = super(Model, self).layers  # Avoids the override in Sequential.
+    if layers:
+      first_layer = layers[0]
+      static_batch_size = training_utils.get_static_batch_size(first_layer)
+      if static_batch_size is not None:
+
+        # Check `batch_size` argument is consistent with InputLayer.
+        if batch_size is not None and batch_size != static_batch_size:
+          raise ValueError('The `batch_size` argument value {} is incompatible '
+                           'with the specified batch size of your Input Layer: '
+                           '{}'.format(batch_size, static_batch_size))
+
+        # Check Dataset/Iterator batch size is consistent with InputLayer.
+        if isinstance(x, (dataset_ops.DatasetV2, iterator_ops.Iterator,
+                          iterator_ops.EagerIterator)):
+          ds_batch_size = tensor_shape.as_dimension(
+              nest.flatten(x.output_shapes)[0][0]).value
+          if ds_batch_size is not None and ds_batch_size != static_batch_size:
+            raise ValueError('The batch output shape of your `Dataset` is {}, '
+                             'which is incompatible with the specified batch '
+                             'size of your Input Layer: {}'.format(
+                                 ds_batch_size, static_batch_size))
+
+        # Set inferred batch size from the InputLayer.
+        if steps is None:
+          batch_size = static_batch_size
+
+    if batch_size is None and steps is None:
+      # Backwards compatibility
+      batch_size = 32
+    return batch_size
+
 
 class DistributedCallbackModel(Model):
   """Model that is used for callbacks with DistributionStrategy."""
 
   def __init__(self, model):
     super(DistributedCallbackModel, self).__init__()
-    # TODO(anjalisridhar): Right now the only attributes set are the layer and
-    # weights. We may need to set additional attributes as needed since we have
-    # not called compile on this model.
+    self.optimizer = model.optimizer
 
   def set_original_model(self, orig_model):
     self._original_model = orig_model
@@ -2365,3 +2618,7 @@ class DistributedCallbackModel(Model):
       logging.warning('You are accessing attribute ' + item + ' of the '
                       'DistributedCallbackModel that may not have been set '
                       'correctly.')
+
+
+def _is_symbolic_tensor(x):
+  return tensor_util.is_tensor(x) and not isinstance(x, ops.EagerTensor)
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index 95b864bef028ecb8de3c85cd3df13bc41fd8dff3..196d48faec23acd42bca33414b4862a5084d18f5 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -19,14 +19,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras.engine import training_distributed
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import make_batches
-from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.platform import tf_logging as logging
 
@@ -36,22 +39,108 @@ except ImportError:
   issparse = None
 
 
-def fit_loop(model,
-             inputs,
-             targets,
-             sample_weights=None,
-             batch_size=None,
-             epochs=100,
-             verbose=1,
-             callbacks=None,
-             val_inputs=None,
-             val_targets=None,
-             val_sample_weights=None,
-             shuffle=True,
-             initial_epoch=0,
-             steps_per_epoch=None,
-             validation_steps=None):
-  """Abstract fit function for arrays of data.
+def _get_model_feed(model, mode):
+  if mode == 'predict':
+    feed = model._feed_inputs
+  else:
+    feed = (
+        model._feed_inputs + model._feed_targets + model._feed_sample_weights)
+  return feed
+
+
+def _validate_arguments(steps_per_epoch, validation_steps, kwargs):
+  for k in kwargs:
+    if k != 'steps':
+      raise ValueError('Invalid argument passed: {}'.format(k))
+
+  # Validate inputs when in training mode.
+  if validation_steps and steps_per_epoch is None:
+    raise ValueError('Can only use `validation_steps` '
+                     'when doing step-wise '
+                     'training, i.e. `steps_per_epoch` '
+                     'must be set.')
+
+
+def _print_train_info(inputs, val_inputs, steps_per_epoch, verbose):
+  if (val_inputs and steps_per_epoch is None and verbose and inputs and
+      hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
+    print('Train on %d samples, validate on %d samples' %
+          (inputs[0].shape[0], val_inputs[0].shape[0]))
+
+
+def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
+  """Returns total number of samples (when training in batch mode) or steps."""
+  if steps_per_epoch:
+    return steps_per_epoch
+  return training_utils.check_num_samples(ins, batch_size, steps_per_epoch,
+                                          'steps_per_epoch')
+
+
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
+
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of 'train'/'test'/'predict'.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  if model._distribution_strategy:
+    def get_distributed_inputs():
+      return training_distributed._prepare_feed_values(
+          model, inputs, targets, sample_weights, mode)
+
+    # In the eager case, we want to call the input method per step, so return
+    # a lambda from here that can be called. Note that this is applicable only
+    # in Distribution Strategy case as it follows the same code path for both
+    # eager and graph modes.
+    # TODO(priyag,omalleyt): Either we should move the training DS with
+    # EagerIterator to use training_generator code path, or figure out how to
+    # set a symbolic Iterator out of a Dataset when in eager mode.
+    if context.executing_eagerly():
+      return get_distributed_inputs
+    else:
+      return get_distributed_inputs()
+
+  inputs = training_utils.ModelInputs(inputs).as_list()
+  targets = targets or []
+  sample_weights = sample_weights or []
+  ins = inputs + targets + sample_weights
+  if mode == 'train' and not isinstance(K.symbolic_learning_phase(), int):
+    ins += [True]
+  return ins
+
+
+def _make_execution_function(model, mode):
+  """Makes function to run one step of model execution."""
+  if model._distribution_strategy:
+    return training_distributed._make_execution_function(model, mode)
+  return model._make_execution_function(mode)
+
+
+def model_iteration(model,
+                    inputs,
+                    targets=None,
+                    sample_weights=None,
+                    batch_size=None,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    val_inputs=None,
+                    val_targets=None,
+                    val_sample_weights=None,
+                    shuffle=True,
+                    initial_epoch=0,
+                    steps_per_epoch=None,
+                    validation_steps=None,
+                    mode='train',
+                    validation_in_fit=False,
+                    **kwargs):
+  """Loop function for arrays of data with modes 'train'/'test'/'predict'.
 
   Arguments:
       model: Keras Model instance.
@@ -66,137 +155,157 @@ def fit_loop(model,
       val_targets: List of target arrays.
       val_sample_weights: Optional list of sample weight arrays.
       shuffle: Whether to shuffle the data at the beginning of each epoch
-          concatenation of list the display names of the outputs of
-           `f` and the list of display names of the outputs of `f_val`.
-      initial_epoch: Epoch at which to start training
-          (useful for resuming a previous training run)
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch. Ignored with the default value of `None`.
-      validation_steps: Number of steps to run validation for
-          (only if doing validation from data tensors).
-          Ignored with the default value of `None`.
+        concatenation of list the display names of the outputs of `f` and the
+        list of display names of the outputs of `f_val`.
+      initial_epoch: Epoch at which to start training (useful for resuming a
+        previous training run)
+      steps_per_epoch: Total number of steps (batches of samples) before
+        declaring one epoch finished and starting the next epoch. Ignored with
+        the default value of `None`.
+      validation_steps: Number of steps to run validation for (only if doing
+        validation from data tensors). Ignored with the default value of `None`.
+      mode: One of 'train'/'test'/'predict'.
+      validation_in_fit: if true, then this method is invoked from within
+        training iteration (for validation). In this case, do not copy weights
+        when using a tf.distribute.Strategy.
+      **kwargs: Additional arguments for backwards compatibility.
 
   Returns:
-      `History` object.
+      - In 'train' mode: `History` object.
+      - In 'test' mode: Evaluation metrics.
+      - In 'predict' mode: Outputs of the Model called on inputs.
 
   Raises:
       ValueError: in case of invalid arguments.
   """
-  model._make_train_function()
-  f = model.train_function
-
-  sample_weights = sample_weights or []
-  val_sample_weights = val_sample_weights or []
-  inputs = training_utils.ModelInputs(inputs).as_list()
-  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-    ins = inputs + targets + sample_weights + [1]
-  else:
-    ins = inputs + targets + sample_weights
-
-  do_validation = False
-  if val_inputs:
-    do_validation = True
-    if (steps_per_epoch is None and verbose and inputs and
-        hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
-      print('Train on %d samples, validate on %d samples' %
-            (inputs[0].shape[0], val_inputs[0].shape[0]))
-  if validation_steps:
-    do_validation = True
-    if steps_per_epoch is None:
-      raise ValueError('Can only use `validation_steps` '
-                       'when doing step-wise '
-                       'training, i.e. `steps_per_epoch` '
-                       'must be set.')
-
-  num_train_samples = training_utils.check_num_samples(
-      ins, batch_size, steps_per_epoch, 'steps_per_epoch')
-  count_mode = 'steps' if steps_per_epoch else 'samples'
+  # Backwards compatibility.
+  if 'steps' in kwargs:
+    steps_per_epoch = kwargs['steps']
+
+  _validate_arguments(steps_per_epoch, validation_steps, kwargs)
+  if mode == 'train':
+    _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)
+
+  # Enter DistributionStrategy scope.
+  if model._distribution_strategy:
+    scope = model._distribution_strategy.scope()
+    scope.__enter__()
+
+  # Get step function and loop type.
+  f = _make_execution_function(model, mode)
+  use_steps = steps_per_epoch is not None
+  do_validation = val_inputs is not None
+
+  # Prepare input data.
+  ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
+  num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
+                                                   steps_per_epoch)
+
+  # Configure callbacks.
+  count_mode = 'steps' if use_steps else 'samples'
   callbacks = cbks.configure_callbacks(
       callbacks,
       model,
       do_validation=do_validation,
-      val_inputs=val_inputs,
-      val_targets=val_targets,
-      val_sample_weights=val_sample_weights,
       batch_size=batch_size,
       epochs=epochs,
       steps_per_epoch=steps_per_epoch,
-      samples=num_train_samples,
-      validation_steps=validation_steps,
-      verbose=verbose,
-      count_mode=count_mode)
-
-  if num_train_samples is not None:
-    index_array = np.arange(num_train_samples)
-
-  # To prevent a slowdown, we find beforehand the arrays that need conversion.
-  feed = model._feed_inputs + model._feed_targets + model._feed_sample_weights
-  indices_for_conversion_to_dense = []
-  for i in range(len(feed)):
-    if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
-      indices_for_conversion_to_dense.append(i)
-
-  callbacks.on_train_begin()
+      samples=num_samples_or_steps,
+      verbose=0,  # Handle ProgBarLogger separately in this loop.
+      mode=mode)
+  # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
+  progbar = training_utils.get_progbar(model, count_mode)
+  progbar.params = callbacks.params
+  progbar.params['verbose'] = verbose
+
+  # Find beforehand arrays that need sparse-to-dense conversion.
+  if issparse is not None and not use_steps:
+    indices_for_conversion_to_dense = []
+    feed = _get_model_feed(model, mode)
+    for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)):
+      if issparse(input_data) and not K.is_sparse(feed_tensor):
+        indices_for_conversion_to_dense.append(i)
+
+  # Select aggregation method.
+  if mode == 'predict':
+    aggregator = training_utils.OutputsAggregator(use_steps,
+                                                  num_samples_or_steps)
+  else:
+    aggregator = training_utils.MetricsAggregator(use_steps,
+                                                  num_samples_or_steps)
+
+  if model._distribution_strategy and not validation_in_fit:
+    training_distributed._copy_weights_to_distributed_model(
+        model, model._grouped_model)
+
+  callbacks.model.stop_training = False
+  callbacks._call_begin_hook(mode)
+  progbar.on_train_begin()
+
   for epoch in range(initial_epoch, epochs):
-    # Reset stateful metrics
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-    # Update callbacks
-    callbacks.on_epoch_begin(epoch)
+    if callbacks.model.stop_training:
+      break
+
+    # Setup work for each epoch
     epoch_logs = {}
-    if steps_per_epoch is not None:
-      # Step-wise fit loop.
-      for step_index in range(steps_per_epoch):
-        batch_logs = {'batch': step_index, 'size': 1}
-        callbacks.on_batch_begin(step_index, batch_logs)
+    model.reset_metrics()
+    callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
+    progbar.on_epoch_begin(epoch, epoch_logs)
+
+    if use_steps:
+      # Step-wise loop.
+      for step in range(steps_per_epoch):
+        batch_logs = {'batch': step, 'size': 1}
+        callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
+        progbar.on_batch_begin(step, batch_logs)
+
+        # Get outputs.
         try:
-          outs = f(ins)
+          # `ins` can be callable in DistributionStrategy + eager case.
+          actual_inputs = ins() if callable(ins) else ins
+          batch_outs = f(actual_inputs)
         except errors.OutOfRangeError:
           logging.warning('Your dataset iterator ran out of data; '
                           'interrupting training. Make sure that your dataset '
                           'can generate at least `steps_per_epoch * epochs` '
                           'batches (in this case, %d batches). You may need to'
                           'use the repeat() function when building your '
-                          'dataset.' %
-                          steps_per_epoch * epochs)
+                          'dataset.' % steps_per_epoch * epochs)
           break
+        if not isinstance(batch_outs, list):
+          batch_outs = [batch_outs]
 
-        if not isinstance(outs, list):
-          outs = [outs]
-        for l, o in zip(model.metrics_names, outs):
-          batch_logs[l] = o
+        if model._distribution_strategy:
+          batch_outs = training_distributed._per_device_aggregate_batch(
+              batch_outs, model, mode)
+
+        # Aggregate results.
+        if step == 0:
+          aggregator.create(batch_outs)
+        aggregator.aggregate(batch_outs)
+
+        # Callbacks batch end.
+        batch_logs.update(training_utils.make_logs(model, batch_outs, mode))
+        callbacks._call_batch_hook(mode, 'end', step, batch_logs)
+        progbar.on_batch_end(step, batch_logs)
 
-        callbacks.on_batch_end(step_index, batch_logs)
         if callbacks.model.stop_training:
           break
-
-      if do_validation:
-        val_outs = test_loop(
-            model,
-            val_inputs,
-            val_targets,
-            sample_weights=val_sample_weights,
-            steps=validation_steps,
-            verbose=0)
-        if not isinstance(val_outs, list):
-          val_outs = [val_outs]
-        # Same labels assumed.
-        for l, o in zip(model.metrics_names, val_outs):
-          epoch_logs['val_' + l] = o
     else:
-      # Sample-wise fit loop.
+      # Sample-wise loop.
+      index_array = np.arange(num_samples_or_steps)
       if shuffle == 'batch':
         index_array = training_utils.batch_shuffle(index_array, batch_size)
       elif shuffle:
         np.random.shuffle(index_array)
-
-      batches = make_batches(num_train_samples, batch_size)
+      batches = make_batches(num_samples_or_steps, batch_size)
 
       for batch_index, (batch_start, batch_end) in enumerate(batches):
         batch_ids = index_array[batch_start:batch_end]
+
+        # Slice into a batch.
         try:
-          if isinstance(ins[-1], int):
+          if ins and isinstance(ins[-1], int):
             # Do not slice the training phase flag.
             ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
           else:
@@ -205,256 +314,77 @@ def fit_loop(model,
           raise TypeError('TypeError while preparing batch. '
                           'If using HDF5 input data, '
                           'pass shuffle="batch".')
-        batch_logs = {}
-        batch_logs['batch'] = batch_index
-        batch_logs['size'] = len(batch_ids)
-        callbacks.on_batch_begin(batch_index, batch_logs)
-        for i in indices_for_conversion_to_dense:
-          ins_batch[i] = ins_batch[i].toarray()
-
-        outs = f(ins_batch)
-        if not isinstance(outs, list):
-          outs = [outs]
-        for l, o in zip(model.metrics_names, outs):
-          batch_logs[l] = o
-
-        callbacks.on_batch_end(batch_index, batch_logs)
-        if callbacks.model.stop_training:
-          break
-
-        if batch_index == len(batches) - 1:  # Last batch.
-          if do_validation:
-            val_outs = test_loop(
-                model,
-                val_inputs,
-                val_targets,
-                sample_weights=val_sample_weights,
-                batch_size=batch_size,
-                verbose=0)
-            if not isinstance(val_outs, list):
-              val_outs = [val_outs]
-            # Same labels assumed.
-            for l, o in zip(model.metrics_names, val_outs):
-              epoch_logs['val_' + l] = o
-    callbacks.on_epoch_end(epoch, epoch_logs)
-    if callbacks.model.stop_training:
-      break
-  callbacks.on_train_end()
-  return model.history
 
+        # Sparse to dense conversion.
+        if issparse is not None:
+          for i in indices_for_conversion_to_dense:
+            ins_batch[i] = ins_batch[i].toarray()
 
-def predict_loop(model, inputs, batch_size=32, verbose=0, steps=None):
-  """Abstract method to loop over some data in batches.
-
-  Arguments:
-      model: Keras Model instance.
-      inputs: list of tensors to be fed to `f`.
-      batch_size: integer batch size.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions
-      (if the model has multiple outputs).
-  """
-  model._make_predict_function()
-  f = model.predict_function
+        # Callbacks batch_begin.
+        batch_logs = {'batch': batch_index, 'size': len(batch_ids)}
+        callbacks._call_batch_hook(mode, 'begin', batch_index, batch_logs)
+        progbar.on_batch_begin(batch_index, batch_logs)
 
-  inputs = training_utils.ModelInputs(inputs).as_list()
-  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-    ins = inputs + [0]
-  else:
-    ins = inputs
+        # Get outputs.
+        batch_outs = f(ins_batch)
+        if not isinstance(batch_outs, list):
+          batch_outs = [batch_outs]
 
-  num_samples = training_utils.check_num_samples(
-      inputs, batch_size, steps, 'steps')
-  if verbose == 1:
-    if steps is not None:
-      progbar = Progbar(target=steps)
-    else:
-      progbar = Progbar(target=num_samples)
-
-  indices_for_conversion_to_dense = []
-  for i in range(len(model._feed_inputs)):
-    if (issparse is not None and issparse(inputs[i]) and
-        not K.is_sparse(model._feed_inputs[i])):
-      indices_for_conversion_to_dense.append(i)
-
-  if steps is not None:
-    # Step-based predictions.
-    # Since we do not know how many samples
-    # we will see, we cannot pre-allocate
-    # the returned Numpy arrays.
-    # Instead, we store one array per batch seen
-    # and concatenate them upon returning.
-    unconcatenated_outs = []
-    for step in range(steps):
-      batch_outs = f(ins)
-      if not isinstance(batch_outs, list):
-        batch_outs = [batch_outs]
-      if step == 0:
-        for batch_out in batch_outs:
-          unconcatenated_outs.append([])
-      for i, batch_out in enumerate(batch_outs):
-        unconcatenated_outs[i].append(batch_out)
-      if verbose == 1:
-        progbar.update(step + 1)
-    if len(unconcatenated_outs) == 1:
-      return np.concatenate(unconcatenated_outs[0], axis=0)
-    return [
-        np.concatenate(unconcatenated_outs[i], axis=0)
-        for i in range(len(unconcatenated_outs))
-    ]
-  else:
-    # Sample-based predictions.
-    outs = []
-    batches = make_batches(num_samples, batch_size)
-    index_array = np.arange(num_samples)
-    for batch_index, (batch_start, batch_end) in enumerate(batches):
-      batch_ids = index_array[batch_start:batch_end]
-      if ins and isinstance(ins[-1], int):
-        # Do not slice the training phase flag.
-        ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-      else:
-        ins_batch = slice_arrays(ins, batch_ids)
-      for i in indices_for_conversion_to_dense:
-        ins_batch[i] = ins_batch[i].toarray()
-
-      batch_outs = f(ins_batch)
-      if not isinstance(batch_outs, list):
-        batch_outs = [batch_outs]
-      if batch_index == 0:
-        # Pre-allocate the results arrays.
-        for batch_out in batch_outs:
-          shape = (num_samples,) + batch_out.shape[1:]
-          outs.append(np.zeros(shape, dtype=batch_out.dtype))
-      for i, batch_out in enumerate(batch_outs):
-        outs[i][batch_start:batch_end] = batch_out
-      if verbose == 1:
-        progbar.update(batch_end)
-    if len(outs) == 1:
-      return outs[0]
-    return outs
-
-
-def test_loop(model,
-              inputs,
-              targets,
-              sample_weights=None,
-              batch_size=None,
-              verbose=0,
-              steps=None):
-  """Abstract method to loop over some data in batches.
+        # Aggregate results.
+        if batch_index == 0:
+          aggregator.create(batch_outs)
+        aggregator.aggregate(batch_outs, batch_start, batch_end)
 
-  Arguments:
-      model: Keras Model instance.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      batch_size: integer batch size or `None`.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring predictions finished.
-          Ignored with the default value of `None`.
+        # Callbacks batch end.
+        batch_logs.update(training_utils.make_logs(model, batch_outs, mode))
+        callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs)
+        progbar.on_batch_end(batch_index, batch_logs)
 
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-  """
-  model._make_test_function()
-  f = model.test_function
+        if callbacks.model.stop_training:
+          break
 
-  sample_weights = sample_weights or []
-  inputs = training_utils.ModelInputs(inputs).as_list()
-  if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-    ins = inputs + targets + sample_weights + [0]
-  else:
-    ins = inputs + targets + sample_weights
-
-  if hasattr(model, 'metrics'):
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-    stateful_metric_indices = [
-        i for i, name in enumerate(model.metrics_names)
-        if str(name) in model.stateful_metric_names
-    ]
-  else:
-    stateful_metric_indices = []
-
-  num_samples = training_utils.check_num_samples(
-      ins, batch_size, steps, 'steps')
-  outs = []
-  if verbose == 1:
-    if steps is not None:
-      progbar = Progbar(target=steps)
-    else:
-      progbar = Progbar(target=num_samples)
-
-  # To prevent a slowdown, we find beforehand the arrays that need conversion.
-  feed = model._feed_inputs + model._feed_targets + model._feed_sample_weights
-  indices_for_conversion_to_dense = []
-  for i in range(len(feed)):
-    if issparse is not None and issparse(ins[i]) and not K.is_sparse(feed[i]):
-      indices_for_conversion_to_dense.append(i)
-
-  if steps is not None:
-    for step in range(steps):
-      batch_outs = f(ins)
-      if isinstance(batch_outs, list):
-        if step == 0:
-          for _ in enumerate(batch_outs):
-            outs.append(0.)
-        for i, batch_out in enumerate(batch_outs):
-          if i in stateful_metric_indices:
-            outs[i] = batch_out
-          else:
-            outs[i] += batch_out
-      else:
-        if step == 0:
-          outs.append(0.)
-        outs[0] += batch_outs
-      if verbose == 1:
-        progbar.update(step + 1)
-    for i in range(len(outs)):
-      if i not in stateful_metric_indices:
-        outs[i] /= steps
-  else:
-    batches = make_batches(num_samples, batch_size)
-    index_array = np.arange(num_samples)
-    for batch_index, (batch_start, batch_end) in enumerate(batches):
-      batch_ids = index_array[batch_start:batch_end]
-      if isinstance(ins[-1], int):
-        # Do not slice the training phase flag.
-        ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
-      else:
-        ins_batch = slice_arrays(ins, batch_ids)
-      for i in indices_for_conversion_to_dense:
-        ins_batch[i] = ins_batch[i].toarray()
-
-      batch_outs = f(ins_batch)
-
-      if isinstance(batch_outs, list):
-        if batch_index == 0:
-          outs.extend([0.] * len(batch_outs))
-        for i, batch_out in enumerate(batch_outs):
-          if i in stateful_metric_indices:
-            outs[i] = batch_out
-          else:
-            outs[i] += batch_out * len(batch_ids)
-      else:
-        if batch_index == 0:
-          outs.append(0.)
-        outs[0] += batch_outs * len(batch_ids)
-      if verbose == 1:
-        progbar.update(batch_end)
-    for i in range(len(outs)):
-      if i not in stateful_metric_indices:
-        outs[i] /= num_samples
-  if len(outs) == 1:
-    return outs[0]
-  return outs
+    aggregator.finalize()
+    results = aggregator.results
+    epoch_logs.update(training_utils.make_logs(model, results, mode))
+    if len(results) == 1:
+      results = results[0]
+
+    # Run the test loop every epoch during training.
+    if do_validation and not callbacks.model.stop_training:
+      val_results = model_iteration(
+          model,
+          val_inputs,
+          targets=val_targets,
+          sample_weights=val_sample_weights,
+          batch_size=batch_size,
+          steps_per_epoch=validation_steps,
+          callbacks=callbacks,
+          verbose=0,
+          mode='test',
+          validation_in_fit=True)
+      if not isinstance(val_results, list):
+        val_results = [val_results]
+      epoch_logs.update(
+          training_utils.make_logs(model, val_results, mode, prefix='val_'))
+
+    callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
+    progbar.on_epoch_end(epoch, epoch_logs)
+  callbacks._call_end_hook(mode)
+
+  if model._distribution_strategy:
+    # TODO(priyag, psv): Copy back metrics to the original model as well?
+    if not validation_in_fit:
+      training_distributed._copy_weights_to_original_model(
+          model, model._grouped_model, mode)
+
+    scope.__exit__(None, None, None)
+
+  if mode == 'train':
+    return model.history
+  return results
+
+
+# For backwards compatibility for internal users of these loops.
+fit_loop = functools.partial(model_iteration, mode='train')
+test_loop = functools.partial(model_iteration, mode='test', shuffle=False)
+predict_loop = functools.partial(model_iteration, mode='predict', shuffle=False)
diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6cc93d1ef77b14142851e6267158d61edcbc13b
--- /dev/null
+++ b/tensorflow/python/keras/engine/training_dataset_test.py
@@ -0,0 +1,351 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import logging
+
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_training_and_eval_methods_on_iterators_single_io(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(iterator, steps=2, verbose=1)
+    model.predict(iterator, steps=2)
+
+    # Test with validation data
+    model.fit(iterator,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=iterator, validation_steps=2)
+    # Test with validation split
+    with self.assertRaisesRegexp(
+        ValueError, '`validation_split` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(iterator,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_split=0.5, validation_steps=2)
+
+    # Test with sample weight.
+    sample_weight = np.random.random((10,))
+    with self.assertRaisesRegexp(
+        ValueError, '`sample_weight` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(
+          iterator,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          sample_weight=sample_weight)
+
+    # Test invalid usage
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should not specify a target'):
+      model.fit(iterator, iterator,
+                epochs=1, steps_per_epoch=2, verbose=0)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'you should specify the `steps_per_epoch` argument'):
+      model.fit(iterator, epochs=1, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.evaluate(iterator, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.predict(iterator, verbose=0)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_get_next_op_created_once(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+    # Finalize graph to make sure we are not appending another iterator
+    # get_next op in the graph.
+    ops.get_default_graph().finalize()
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_iterators_running_out_of_data(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(2)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
+      self.assertRegexpMatches(
+          str(mock_log.call_args),
+          'dataset iterator ran out of data')
+
+
+class TestTrainingWithDataset(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_calling_model_on_same_dataset(self):
+    if ((not testing_utils.should_run_eagerly())
+        and testing_utils.get_model_type() == 'subclass'
+        and context.executing_eagerly()):
+      self.skipTest('b/120673224')
+
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    # Call fit with validation data
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+    # Finalize the graph to make sure new ops aren't added when calling on the
+    # same dataset
+    ops.get_default_graph().finalize()
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_training_and_eval_methods_on_dataset(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset, steps=2, verbose=1)
+    model.predict(dataset, steps=2)
+
+    # Test with validation data
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+
+    # Test with validation split
+    with self.assertRaisesRegexp(
+        ValueError, '`validation_split` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(dataset,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_split=0.5, validation_steps=2)
+
+    # Test with sample weight.
+    sample_weight = np.random.random((10,))
+    with self.assertRaisesRegexp(
+        ValueError, '`sample_weight` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(
+          dataset,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          sample_weight=sample_weight)
+
+    # Test invalid usage
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should not specify a target'):
+      model.fit(dataset, dataset,
+                epochs=1, steps_per_epoch=2, verbose=0)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'you should specify the `steps_per_epoch` argument'):
+      model.fit(dataset, epochs=1, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.evaluate(dataset, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.predict(dataset, verbose=0)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_dataset_with_sample_weights(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    sample_weights = np.ones((10), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
+                                                      sample_weights))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset, steps=2, verbose=1)
+    model.predict(dataset, steps=2)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_dataset_with_sparse_labels(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    for loss in ['sparse_categorical_crossentropy',
+                 losses_impl.sparse_softmax_cross_entropy]:
+      model.compile(optimizer, loss,
+                    run_eagerly=testing_utils.should_run_eagerly())
+
+      inputs = np.zeros((10, 3), dtype=np.float32)
+      targets = np.random.randint(0, 4, size=10, dtype=np.int32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  @tf_test_util.run_deprecated_v1
+  def test_dataset_input_shape_validation(self):
+    with self.cached_session():
+      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+      model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+
+      # User forgets to batch the dataset
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+
+      with self.assertRaisesRegexp(
+          ValueError,
+          r'expected (.*?) to have shape \(3,\) but got array with shape \(1,\)'
+      ):
+        model.train_on_batch(dataset)
+
+      # Wrong input shape
+      inputs = np.zeros((10, 5))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      with self.assertRaisesRegexp(ValueError,
+                                   r'expected (.*?) to have shape \(3,\)'):
+        model.train_on_batch(dataset)
+
+
+class TestMetricsWithDatasetIterators(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_metrics_correctness_with_iterator(self):
+    layers = [
+        keras.layers.Dense(8, activation='relu', input_dim=4,
+                           kernel_initializer='ones'),
+        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='ones')
+    ]
+
+    model = testing_utils.get_model_from_layers(layers, (4,))
+
+    model.compile(
+        loss='binary_crossentropy',
+        metrics=['accuracy', metrics_module.BinaryAccuracy()],
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    np.random.seed(123)
+    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
+    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
+    self.assertEqual(np.around(outs[2], decimals=1), 0.5)
+
+    y = np.zeros((100, 1), dtype=np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(outs[1], 0.)
+    self.assertEqual(outs[2], 0.)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 04e8d079c0c817b65c5041da492821c5186cfbbf..d20d092d8e61499e4a005f7d6770a3c0a0ee60fc 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -18,218 +18,64 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+import enum  # pylint: disable=g-bad-import-order
 import numpy as np
+
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import distributed_training_utils
-from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
 
+# TODO(sourabhbajaj): Check if we can merge the test and prediction graphs
+class _Mode(enum.Enum):
+  TRAIN = 'train'
+  TEST = 'test'
+  PREDICT = 'predict'
 # TODO(priyag, sourabhbajaj): Refactor this file to address code duplication.
 
 
-def fit_loop(
-    model,
-    iterator,
-    epochs=100,
-    verbose=1,
-    callbacks=None,
-    val_iterator=None,
-    initial_epoch=0,
-    steps_per_epoch=None,
-    validation_steps=None):
-  """Fit loop for training with DistributionStrategy.
+def experimental_fit_loop(model,
+                          iterator,
+                          epochs=100,
+                          verbose=1,
+                          callbacks=None,
+                          initial_epoch=0,
+                          steps_per_epoch=None,
+                          val_iterator=None,
+                          validation_steps=None):
+  """Fit loop for training with TPU DistributionStrategy.
 
   Arguments:
       model: Keras Model instance.
-      iterator: Iterator for input data.
+      iterator: Iterator that returns inputs and targets
       epochs: Number of times to iterate over the data
       verbose: Integer, Verbosity mode, 0, 1 or 2
       callbacks: List of callbacks to be called during training
-      val_iterator: Iterator for validation data.
       initial_epoch: Epoch at which to start training
           (useful for resuming a previous training run)
       steps_per_epoch: Total number of steps (batches of samples)
           before declaring one epoch finished and starting the
           next epoch. Ignored with the default value of `None`.
+      val_iterator: Iterator for validation data.
       validation_steps: Number of steps to run validation for
           (only if doing validation from data tensors).
           Ignored with the default value of `None`.
 
-  Returns:
-      `History` object.
-
-  Raises:
-      ValueError: in case of invalid arguments.
-  """
-  current_strategy = model._distribution_strategy
-
-  # TODO(priyag, sourabhbajaj): Remove this when the codepaths are merged.
-  if current_strategy.__class__.__name__ == 'TPUStrategy':
-    return _experimental_fit_loop(
-        model, iterator, epochs, verbose, callbacks, initial_epoch,
-        steps_per_epoch)
-
-  if not model._grouped_model:
-    clone_model_on_towers(model, current_strategy, make_callback_model=True)
-
-  def _per_device_train_function(model):
-    model._make_train_function()
-    return (model.train_function.inputs,
-            model.train_function.outputs,
-            model.train_function.updates_op,
-            model.train_function.session_kwargs)
-
-  inputs, targets = _get_input_from_iterator(iterator, model)
-  with current_strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_device_train_function`.
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_tower(
-         _per_device_train_function, model._grouped_model)
-    # Unwrap all the per device values returned from `call_for_each_tower`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of update ops on
-    # all the devices over which the model is distributed.
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs,
-         grouped_updates, grouped_session_args, with_loss_tensor=True)
-
-    # Dataset inputs and targets are also per devices values that need to be
-    # unwrapped.
-    dataset_inputs = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, inputs)
-    dataset_targets = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, targets)
-
-    # Create a train function that is composed of all the parameters above.
-    distributed_train_function = K.Function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_train_function',
-        **all_session_args)
-
-    # We need to set sample_weights to None since there are sample weight
-    # placeholders that are created with default values.
-    sample_weights = [None for _ in range(len(model.outputs) *
-                                          current_strategy.num_towers)]
-    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = dataset_inputs + dataset_targets + sample_weights + [1]
-    else:
-      ins = dataset_inputs + dataset_targets
-
-    do_validation = False
-    if validation_steps:
-      do_validation = True
-
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
-    callbacks = cbks.configure_callbacks(
-        callbacks,
-        model,
-        do_validation=do_validation,
-        val_inputs=None,
-        val_targets=None,
-        epochs=epochs,
-        steps_per_epoch=steps_per_epoch,
-        verbose=verbose)
-    out_labels = model.metrics_names or []
-    callbacks.on_train_begin()
-
-    assert steps_per_epoch is not None
-
-    for epoch in range(initial_epoch, epochs):
-      # Reset stateful metrics
-      for m in model.stateful_metric_functions:
-        m.reset_states()
-      callbacks.on_epoch_begin(epoch)
-      epoch_logs = {}
-      for step_index in range(steps_per_epoch):
-        batch_logs = {'batch': step_index, 'size': 1}
-        callbacks.on_batch_begin(step_index, batch_logs)
-        try:
-          outs = distributed_train_function(ins)
-        except errors.OutOfRangeError:
-          logging.warning('Your dataset iterator ran out of data; '
-                          'interrupting training. Make sure that your dataset '
-                          'can generate at least `steps_per_epoch * epochs` '
-                          'batches (in this case, %d batches).' %
-                          steps_per_epoch * epochs)
-          break
-
-        if not isinstance(outs, list):
-          outs = [outs]
-
-        outs = _aggregate_metrics_across_towers(current_strategy.num_towers,
-                                                out_labels,
-                                                model.stateful_metric_names,
-                                                outs)
-        for l, o in zip(out_labels, outs):
-          batch_logs[l] = o
-        callbacks.on_batch_end(step_index, batch_logs)
-        if callbacks.model.stop_training:
-          break
-      if do_validation:
-        val_outs = test_loop(
-            model,
-            val_iterator,
-            steps=validation_steps,
-            verbose=0)
-        if not isinstance(val_outs, list):
-          val_outs = [val_outs]
-        # Same labels assumed.
-        for l, o in zip(out_labels, val_outs):
-          epoch_logs['val_' + l] = o
-
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      if callbacks.model.stop_training:
-        break
-    callbacks.on_train_end()
-
-    # Copy the weights back from the replicated model to the original model.
-    updated_weights = current_strategy.unwrap(
-        model._grouped_model)[0].get_weights()
-    model.set_weights(updated_weights)
-    return model.history
-
-
-def _experimental_fit_loop(
-    model,
-    iterator,
-    epochs=100,
-    verbose=1,
-    callbacks=None,
-    initial_epoch=0,
-    steps_per_epoch=None):
-  """Fit loop for training with TPU DistributionStrategy.
-
-  Arguments:
-      model: Keras Model instance.
-      iterator: Iterator that returns inputs and targets
-      epochs: Number of times to iterate over the data
-      verbose: Integer, Verbosity mode, 0, 1 or 2
-      callbacks: List of callbacks to be called during training
-      initial_epoch: Epoch at which to start training
-          (useful for resuming a previous training run)
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch. Ignored with the default value of `None`.
-
   Returns:
       Returns `None`.
 
@@ -240,50 +86,51 @@ def _experimental_fit_loop(
 
   K.get_session().run(current_strategy.initialize())
 
-  def _per_device_train_function(model):
-    model._make_train_function()
-    return (model.train_function.inputs,
-            model.train_function.outputs,
-            model.train_function.updates_op,
-            model.train_function.session_kwargs)
+  def _per_device_fit_function(model):
+    model._make_fit_function()
+    return (model._fit_function.inputs, model._fit_function.outputs,
+            model._fit_function.updates_op, model._fit_function.session_kwargs)
 
   # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
   K.set_learning_phase(1)
+  out_labels = model.metrics_names or []
 
-  def step_fn(ctx, inputs, targets):
-    """Clones the model and calls make_train_function."""
+  def step_fn(ctx, inputs):
+    """Clones the model and calls make_fit_function."""
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
     # fit/test/predict is called. We should look into caching this keyed on
     # input shapes.
-    clone_model_on_towers(
+    inputs, targets = inputs
+    clone_model_on_replicas(
         model,
         current_strategy,
         make_callback_model=True,
         inputs=inputs,
-        targets=targets)
+        targets=targets,
+        mode=_Mode.TRAIN)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_tower(
-         _per_device_train_function, model._grouped_model)
+     grouped_session_args) = current_strategy.extended.call_for_each_replica(
+         _per_device_fit_function, args=(model._grouped_model_train,))
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
          current_strategy, grouped_inputs, grouped_outputs,
          grouped_updates, grouped_session_args)
-    combined_fn = K.Function(
-        all_inputs, all_outputs,
+    combined_fn = K.function(
+        all_inputs,
+        all_outputs,
         updates=all_updates,
-        name='distributed_train_function',
+        name='distributed_fit_function',
         **all_session_args)
 
-    out_labels = model.metrics_names or []
     for label, output in zip(out_labels, combined_fn.outputs):
       if label == 'loss':
-        aggregation = distribute_lib.get_loss_reduction()
+        reduce_op = distribute_lib.get_loss_reduction()
       else:
-        # We aggregate all other metrics using mean for now. This is temporary
+        # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
-        aggregation = variable_scope.VariableAggregation.MEAN
-      ctx.set_last_step_output(label, output, aggregation)
+        reduce_op = ds_reduce_util.ReduceOp.MEAN
+      ctx.set_last_step_output(label, output, reduce_op)
 
     # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
     # feed_dict, session kwargs, run options, run_metadata for now. These should
@@ -293,51 +140,51 @@ def _experimental_fit_loop(
   # Add initial dummy values for loss and other metric tensors.
   initial_loop_values = {}
   initial_loop_values['loss'] = constant_op.constant(1e7)
-  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
+  for name in model.metrics_names[1:]:
+    tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
   if steps_per_epoch is None:
     raise ValueError('`steps_per_epoch` should be specified when calling '
                      '`fit` on the model.')
   steps_per_run = K.variable(
-      value=min(steps_per_epoch, current_strategy.steps_per_run),
+      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
       dtype='int32',
       name='steps_per_run')
 
   with current_strategy.scope():
-    ctx = current_strategy.run_steps_on_dataset(
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
         step_fn, iterator, iterations=steps_per_run,
         initial_loop_values=initial_loop_values)
 
   train_op = ctx.run_op
   output_tensors = ctx.last_step_outputs
 
+  do_validation = bool(validation_steps)
+
   # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
   with current_strategy.scope():
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
+    _copy_weights_to_distributed_model(model, model._grouped_model_train)
+
   callbacks = cbks.configure_callbacks(
       callbacks,
       model,
-      do_validation=False,
-      val_inputs=None,
-      val_targets=None,
+      do_validation=do_validation,
       epochs=epochs,
       steps_per_epoch=steps_per_epoch,
       verbose=verbose)
-  # TODO(priyag, sourabhbajaj): Add callbacks support for per step callback
-  # TODO(priyag, sourabhbajaj): Add validation.
 
   # Calculate the steps each time on the device.
-  steps_to_run = [current_strategy.steps_per_run] * (
-      steps_per_epoch // current_strategy.steps_per_run)
-  if steps_per_epoch % current_strategy.steps_per_run:
-    steps_to_run.append(steps_per_epoch % current_strategy.steps_per_run)
+  steps_to_run = [current_strategy.extended.steps_per_run] * (
+      steps_per_epoch // current_strategy.extended.steps_per_run)
+  if steps_per_epoch % current_strategy.extended.steps_per_run:
+    steps_to_run.append(
+        steps_per_epoch % current_strategy.extended.steps_per_run)
 
   callbacks.on_train_begin()
   for epoch in range(initial_epoch, epochs):
+    with current_strategy.scope():
+      _reset_metrics(model, model._grouped_model_train)
     callbacks.on_epoch_begin(epoch)
     epoch_logs = {}
     step_index = 0
@@ -364,6 +211,27 @@ def _experimental_fit_loop(
       if callbacks.model.stop_training:
         break
 
+    if do_validation:
+      logging.info('Running validation at fit epoch: %s', epoch)
+
+      # Since we create a new clone from the original model we need to copy
+      # the weights back to the original model before we can run validation.
+      with current_strategy.scope():
+        _copy_weights_to_original_model(model, model._grouped_model_train,
+                                        'train')
+
+      val_outs = experimental_test_loop(  # pylint: disable=undefined-variable
+          model,
+          val_iterator,
+          steps=validation_steps,
+          verbose=verbose,
+          initialize_finalize_strategy=False)
+      if not isinstance(val_outs, list):
+        val_outs = [val_outs]
+      # Same labels assumed.
+      for label, val_out in zip(out_labels, val_outs):
+        epoch_logs['val_' + label] = val_out
+
     callbacks.on_epoch_end(epoch, epoch_logs)
     if callbacks.model.stop_training:
       break
@@ -371,125 +239,17 @@ def _experimental_fit_loop(
 
   # Copy the weights back from the replicated model to the original model.
   with current_strategy.scope():
-    updated_weights = current_strategy.unwrap(
-        model._grouped_model)[0].get_weights()
-    model.set_weights(updated_weights)
+    _copy_weights_to_original_model(model, model._grouped_model_train, 'train')
 
   K.get_session().run(current_strategy.finalize())
   return model.history
 
 
-def test_loop(model, iterator, verbose=0, steps=None):
-  """Test loop for evaluating with DistributionStrategy.
-
-  Arguments:
-      model: Keras Model instance.
-      iterator: Iterator for input data.
-      verbose: Integer, Verbosity mode 0 or 1.
-      steps: Total number of steps (batches of samples)
-          before declaring predictions finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the outputs.
-  """
-  current_strategy = model._distribution_strategy
-
-  # TODO(priyag, sourabhbajaj): Remove this when the codepaths are merged.
-  if current_strategy.__class__.__name__ == 'TPUStrategy':
-    return _experimental_test_loop(model, iterator, verbose, steps)
-
-  if not model._grouped_model:
-    clone_model_on_towers(model, current_strategy)
-
-  def _per_device_test_function(model):
-    model._make_test_function()
-    return (model.test_function.inputs,
-            model.test_function.outputs,
-            model.test_function.updates_op,
-            model.test_function.session_kwargs)
-
-  inputs, targets = _get_input_from_iterator(iterator, model)
-  with current_strategy.scope():
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_tower(
-         _per_device_test_function, model._grouped_model)
-
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
-         grouped_session_args, with_loss_tensor=True)
-
-    dataset_inputs = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, inputs)
-    dataset_targets = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, targets)
-
-    distributed_test_function = K.Function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_test_function',
-        **all_session_args)
-
-    # We need to set sample_weights to None since there are sample weight
-    # placeholders that are created with default values.
-    sample_weights = [None for _ in range(len(model.outputs) *
-                                          current_strategy.num_towers)]
-    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = dataset_inputs + dataset_targets + sample_weights + [0]
-    else:
-      ins = dataset_inputs + dataset_targets
-
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-    stateful_metric_indices = [
-        i for i, name in enumerate(model.metrics_names)
-        if str(name) in model.stateful_metric_names
-    ]
-
-    outs = []
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
-    assert steps is not None
-    for step in range(steps):
-      batch_outs = distributed_test_function(ins)
-      batch_outs = _aggregate_metrics_across_towers(
-          current_strategy.num_towers, model.metrics_names,
-          model.stateful_metric_names, batch_outs)
-      if isinstance(batch_outs, list):
-        if step == 0:
-          outs = [0.] * len(batch_outs)
-        for i, batch_out in enumerate(batch_outs):
-          if i in stateful_metric_indices:
-            outs[i] = batch_out
-          else:
-            outs[i] += batch_out
-      else:
-        if step == 0:
-          outs.append(0.)
-        outs[0] += batch_outs
-      if verbose >= 1:
-        progbar.update(step + 1)
-    for i in range(len(outs)):
-      if i not in stateful_metric_indices:
-        outs[i] /= steps
-
-    if len(outs) == 1:
-      return outs[0]
-    return outs
-
-
-def _experimental_test_loop(model, iterator, verbose=0, steps=None):
+def experimental_test_loop(model,
+                           iterator,
+                           verbose=0,
+                           steps=None,
+                           initialize_finalize_strategy=True):
   """Test loop for evaluating with TPU DistributionStrategy.
 
   Arguments:
@@ -499,6 +259,8 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None):
       steps: Total number of steps (batches of samples)
           before declaring predictions finished.
           Ignored with the default value of `None`.
+      initialize_finalize_strategy: Should the strategy initialize and finalize
+          functions be called.
 
   Returns:
       Scalar loss (if the model has a single output and no metrics)
@@ -507,40 +269,42 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None):
       the display labels for the outputs.
   """
   current_strategy = model._distribution_strategy
-  K.get_session().run(current_strategy.initialize())
+  if initialize_finalize_strategy:
+    K.get_session().run(current_strategy.initialize())
 
-  def _per_device_test_function(model):
-    model._make_test_function()
-    return (model.test_function.inputs,
-            model.test_function.outputs,
-            model.test_function.updates_op,
-            model.test_function.session_kwargs)
+  def _per_device_eval_function(model):
+    model._make_eval_function()
+    return (model._eval_function.inputs, model._eval_function.outputs,
+            model._eval_function.updates_op,
+            model._eval_function.session_kwargs)
 
   # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
   K.set_learning_phase(0)
 
-  def step_fn(ctx, inputs, targets):
-    """Clones the model and calls make_test_function."""
+  def step_fn(ctx, inputs):
+    """Clones the model and calls make_eval_function."""
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
     # fit/test/predict is called. We should look into caching this keyed on
     # input shapes.
-    clone_model_on_towers(
+    inputs, targets = inputs
+    clone_model_on_replicas(
         model,
         current_strategy,
         make_callback_model=False,
         inputs=inputs,
-        targets=targets)
+        targets=targets,
+        mode=_Mode.TEST)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_tower(
-         _per_device_test_function, model._grouped_model)
+     grouped_session_args) = current_strategy.extended.call_for_each_replica(
+         _per_device_eval_function, args=(model._grouped_model_test,))
 
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
          current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
          grouped_session_args)
 
-    combined_fn = K.Function(
+    combined_fn = K.function(
         all_inputs, all_outputs,
         updates=all_updates,
         name='distributed_test_function',
@@ -548,25 +312,26 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None):
 
     for label, output in zip(model.metrics_names, combined_fn.outputs):
       if label == 'loss':
-        aggregation = distribute_lib.get_loss_reduction()
+        reduce_op = distribute_lib.get_loss_reduction()
       else:
-        # We aggregate all other metrics using mean for now. This is temporary
+        # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
-        aggregation = variable_scope.VariableAggregation.MEAN
-      ctx.set_last_step_output(label, output, aggregation)
+        reduce_op = ds_reduce_util.ReduceOp.MEAN
+      ctx.set_last_step_output(label, output, reduce_op)
 
     return combined_fn.updates_op
 
   # Add initial dummy values for loss and other metric tensors.
   initial_loop_values = {}
   initial_loop_values['loss'] = constant_op.constant(1e7)
-  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
+  for name in model.metrics_names[1:]:
+    tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
   with current_strategy.scope():
     # TODO(priyag): Use steps_per_run when we use new metrics as they will
     # allow handling metric computation at each step using variables.
-    ctx = current_strategy.run_steps_on_dataset(
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
         step_fn, iterator, iterations=1,
         initial_loop_values=initial_loop_values)
 
@@ -577,123 +342,36 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None):
     progbar = Progbar(target=steps)
 
   # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
   with current_strategy.scope():
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
+    _copy_weights_to_distributed_model(model, model._grouped_model_test)
+    _reset_metrics(model, model._grouped_model_test)
   assert steps is not None
   outs = [0.] * len(model.metrics_names)
   for step in range(steps):
     _, batch_outs = K.get_session().run([test_op, output_tensors])
     for i, label in enumerate(model.metrics_names):
-      outs[i] += batch_outs[label]
+      if i == 0:
+        # Loss is stateless metrics.
+        outs[i] += batch_outs[label]
+      else:
+        # For all stateful metrics, the aggregation is handled by mirrored vars.
+        outs[i] = batch_outs[label]
+
     if verbose >= 1:
       progbar.update(step + 1)
-  for i in range(len(outs)):
-    outs[i] /= (steps)
 
-  K.get_session().run(current_strategy.finalize())
+  if len(outs) >= 0:
+    outs[0] /= (steps)
+
+  if initialize_finalize_strategy:
+    K.get_session().run(current_strategy.finalize())
 
   if len(outs) == 1:
     return outs[0]
   return outs
 
 
-def predict_loop(model, iterator, verbose=0, steps=None):
-  """Predict loop for predicting with DistributionStrategy.
-
-  Arguments:
-      model: Keras Model instance.
-      iterator: Iterator for input data.
-      verbose: Integer, Verbosity mode 0 or 1.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions
-      (if the model has multiple outputs).
-  """
-  current_strategy = model._distribution_strategy
-
-  # TODO(priyag, sourabhbajaj): Remove this when the codepaths are merged.
-  if current_strategy.__class__.__name__ == 'TPUStrategy':
-    return _experimental_predict_loop(model, iterator, verbose, steps)
-
-  if not model._grouped_model:
-    clone_model_on_towers(model, current_strategy)
-
-  def _per_device_predict_function(model):
-    model._make_predict_function()
-    return (model.predict_function.inputs,
-            model.predict_function.outputs,
-            model.predict_function.updates_op,
-            model.predict_function.session_kwargs)
-
-  inputs, _ = _get_input_from_iterator(iterator, model)
-  with current_strategy.scope():
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_tower(
-         _per_device_predict_function, model._grouped_model)
-
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
-         grouped_session_args)
-
-    dataset_inputs = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, inputs)
-
-    distributed_predict_function = K.Function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_predict_function',
-        **all_session_args)
-
-    if model.uses_learning_phase and not isinstance(K.learning_phase(), int):
-      ins = dataset_inputs + [0]
-    else:
-      ins = dataset_inputs
-
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
-    if steps is not None:
-      # Since we do not know how many samples we will see, we cannot
-      # pre-allocate the returned Numpy arrays. Instead, we store one array per
-      # batch seen and concatenate them upon returning.
-      unconcatenated_outs = []
-      for step in range(steps):
-        batch_outs = distributed_predict_function(ins)
-        if not isinstance(batch_outs, list):
-          batch_outs = [batch_outs]
-        if step == 0:
-          for _ in batch_outs:
-            unconcatenated_outs.append([])
-        # TODO(anjalisridhar): Should combine the outputs from multiple towers
-        # correctly here.
-        for i, batch_out in enumerate(batch_outs):
-          unconcatenated_outs[i].append(batch_out)
-        if verbose >= 1:
-          progbar.update(step + 1)
-      if len(unconcatenated_outs) == 1:
-        return np.concatenate(unconcatenated_outs[0], axis=0)
-      return [
-          np.concatenate(unconcatenated_outs[i], axis=0)
-          for i in range(len(unconcatenated_outs))
-      ]
-
-
-def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
+def experimental_predict_loop(model, iterator, verbose=0, steps=None):
   """Predict loop for predicting with TPU DistributionStrategy.
 
   Arguments:
@@ -722,28 +400,29 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
             model.predict_function.updates_op,
             model.predict_function.session_kwargs)
 
-  def step_fn(ctx, *inputs):
+  def step_fn(ctx, inputs):
     """Clones the model and calls make_predict_function."""
 
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
     # fit/test/predict is called. We should look into caching this keyed on
     # input shapes.
-    clone_model_on_towers(
+    clone_model_on_replicas(
         model,
         current_strategy,
         make_callback_model=False,
-        inputs=inputs)
+        inputs=inputs,
+        mode=_Mode.PREDICT)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_tower(
-         _per_device_predict_function, model._grouped_model)
+     grouped_session_args) = current_strategy.extended.call_for_each_replica(
+         _per_device_predict_function, args=(model._grouped_model_predict,))
 
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
          current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
          grouped_session_args)
 
-    combined_fn = K.Function(
+    combined_fn = K.function(
         all_inputs, all_outputs,
         updates=all_updates,
         name='distributed_predict_function',
@@ -766,7 +445,7 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
 
   with current_strategy.scope():
     # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
-    ctx = current_strategy.run_steps_on_dataset(
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
         step_fn, iterator, iterations=1,
         initial_loop_values=initial_loop_values)
 
@@ -777,12 +456,9 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
     progbar = Progbar(target=steps)
 
   # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
   with current_strategy.scope():
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
+    _copy_weights_to_distributed_model(model, model._grouped_model_predict)
+    _reset_metrics(model, model._grouped_model_predict)
   assert steps is not None
   # Since we do not know how many samples we will see, we cannot pre-allocate
   # the returned Numpy arrays. Instead, we store one array per batch seen
@@ -806,7 +482,17 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
   ]
 
 
-def _clone_and_build_model(model, inputs=None, targets=None):
+def _custom_compile_for_predict(model):
+  """Custom compile for TPU predict mode."""
+  model.total_loss = None
+  model._fit_function = None
+  model._eval_function = None
+  model.train_function = None
+  model.test_function = None
+  model.predict_function = None
+
+
+def _clone_and_build_model(model, inputs=None, targets=None, mode=None):
   """Clone and build the given keras_model."""
   # We need to set the import here since we run into a circular dependency
   # error.
@@ -820,70 +506,50 @@ def _clone_and_build_model(model, inputs=None, targets=None):
     optimizer_config = model.optimizer.get_config()
     optimizer = model.optimizer.__class__.from_config(optimizer_config)
 
-  # TODO(priyag): Is there a cleaner way to do this? The API doc suggests a
-  # single tensor should be OK but it throws an error in that case.
-  if targets is not None and not isinstance(targets, (list, dict, tuple)):
-    targets = [targets]
+  # Recast all low precision outputs back to float32 since we only casted
+  # the inputs to bfloat16 and not targets. This is done so that we can preserve
+  # precision when calculating the loss value.
+  def _upcast_low_precision_outputs(output):
+    if output.dtype == dtypes.bfloat16:
+      return math_ops.cast(output, dtypes.float32)
+    else:
+      return output
+  cloned_model.outputs = [_upcast_low_precision_outputs(o)
+                          for o in cloned_model.outputs]
+
   if isinstance(targets, tuple):
     targets = nest.flatten(targets)
-  cloned_model.compile(
-      optimizer,
-      model.loss,
-      metrics=metrics_module.clone_metrics(model.metrics),
-      loss_weights=model.loss_weights,
-      sample_weight_mode=model.sample_weight_mode,
-      weighted_metrics=metrics_module.clone_metrics(model.weighted_metrics),
-      target_tensors=targets)
+  if mode == _Mode.PREDICT:
+    _custom_compile_for_predict(cloned_model)
+  else:
+    cloned_model.compile(
+        optimizer,
+        model.loss,
+        metrics=metrics_module.clone_metrics(model._compile_metrics),
+        loss_weights=model.loss_weights,
+        sample_weight_mode=model.sample_weight_mode,
+        weighted_metrics=metrics_module.clone_metrics(
+            model._compile_weighted_metrics),
+        target_tensors=targets)
   return cloned_model
 
 
-def clone_model_on_towers(
-    model, strategy, make_callback_model=False, inputs=None, targets=None):
-  """Create a cloned model on each tower."""
-  with strategy.scope():
-    model._grouped_model = strategy.call_for_each_tower(
-        _clone_and_build_model, model, inputs, targets)
-  if make_callback_model:
-    model._make_callback_model()
-
-
-def _aggregate_metrics_across_towers(num_devices, out_labels,
-                                     stateful_metric_names, outs):
-  """Aggregates stateless metrics values across towers.
-
-  When using `MirroredStrategy`, the number of towers is equal to the
-  number of devices over which training is distributed. This may not always be
-  the case.
-
-  Args:
-    num_devices: Number of devices over which the model is being distributed.
-    out_labels: The list of metric names passed to `compile`.
-    stateful_metric_names: List of stateful metric names on the model.
-    outs: The output from all the towers.
-
-  Returns:
-    The average value of each metric across the towers.
-  """
-  # TODO(anjalisridhar): Temporary workaround for aggregating metrics
-  # across towers. Replace with the new metrics module eventually.
-  merged_output = []
-  # The first output is the total loss.
-  merged_output.append(outs[0])
-  current_index = 1
-  # Each label in `out_labels` corresponds to one set of metrics. The
-  # number of metric values corresponds to the number of devices. We
-  # currently take the mean of the values.
-  for metric_name in out_labels[1:]:
-    if metric_name in stateful_metric_names:
-      # For stateful metrics, we get one aggregated result value.
-      merged_output.append(outs[current_index])
-      current_index += 1
+def clone_model_on_replicas(model, strategy, make_callback_model=False,
+                            inputs=None, targets=None, mode=None):
+  """Create a cloned model on each replica."""
+  with K.get_graph().as_default(), strategy.scope():
+    grouped_model = strategy.extended.call_for_each_replica(
+        _clone_and_build_model, args=(model, inputs, targets, mode))
+    if mode is _Mode.TRAIN:
+      model._grouped_model_train = grouped_model
+    elif mode is _Mode.TEST:
+      model._grouped_model_test = grouped_model
+    elif mode is _Mode.PREDICT:
+      model._grouped_model_predict = grouped_model
     else:
-      m = np.mean(outs[current_index:current_index + num_devices])
-      merged_output.append(m)
-      current_index += num_devices
-
-  return merged_output
+      model._grouped_model = grouped_model
+  if make_callback_model:
+    model._make_callback_model(grouped_model)
 
 
 def _get_input_from_iterator(iterator, model):
@@ -893,15 +559,175 @@ def _get_input_from_iterator(iterator, model):
   if len(nest.flatten(next_element)) == len(model.inputs):
     x = next_element
     y = None
-  else:
+    sample_weights = None
+  elif len(nest.flatten(next_element)) == (len(model.inputs) +
+                                           len(model.outputs)):
     x, y = next_element
+    sample_weights = None
+  else:
+    x, y, sample_weights = next_element
 
   # Validate that all the elements in x and y are of the same type and shape.
   # We can then pass the first element of x and y to `_standardize_weights`
   # below and be confident of the output.
-  x_values, y_values = distributed_training_utils.\
-    validate_distributed_dataset_inputs(model._distribution_strategy, x, y)
-  # TODO(sourabhbajaj): Add support for sample weights in distribution
-  # strategy.
-  model._standardize_weights(x_values, y_values)
-  return x, y
+  x_values, y_values, sample_weights_values = distributed_training_utils.\
+    validate_distributed_dataset_inputs(model._distribution_strategy, x, y,
+                                        sample_weights)
+  model._standardize_weights(x_values, y_values,
+                             sample_weight=sample_weights_values)
+  return x, y, sample_weights
+
+
+def _make_execution_function(model, mode):
+  """Makes function to run one step of distributed model execution."""
+  if context.executing_eagerly():
+    return _make_eager_execution_function(model, mode)
+
+  strategy = model._distribution_strategy
+  if not model._grouped_model:
+    clone_model_on_replicas(
+        model, strategy, make_callback_model=(mode == 'train'))
+
+  def _per_device_function(model):
+    f = model._make_execution_function(mode)
+    return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
+
+  with strategy.scope():
+    # Create train ops on each of the devices when we call
+    # `_per_device_fit_function`.
+    (grouped_inputs, grouped_outputs, grouped_updates,
+     grouped_session_args) = strategy.extended.call_for_each_replica(
+         _per_device_function, args=(model._grouped_model,))
+
+    if mode == 'train':
+      # Initialize the variables in the replicated model. This is necessary for
+      # multi-worker training because on some workers, initialization is not
+      # needed. This method does initialization or waiting for initialization
+      # according to the context object of distribute coordinator.
+      distributed_training_utils.init_restore_or_wait_for_variables()
+
+    # Unwrap all the per device values returned from `call_for_each_replica`.
+    # Unwrapping per device values gives you a list of values that can be
+    # used to construct a new train function that is composed of update ops on
+    # all the devices over which the model is distributed.
+    (all_inputs, all_outputs, all_updates,
+     all_session_args) = distributed_training_utils.unwrap_values(
+         strategy,
+         grouped_inputs,
+         grouped_outputs,
+         grouped_updates,
+         grouped_session_args,
+         with_loss_tensor=(mode != 'predict'))
+
+    return K.function(
+        all_inputs,
+        all_outputs,
+        updates=all_updates,
+        name='distributed_{}_function'.format(mode),
+        **all_session_args)
+
+
+def _make_eager_execution_function(model, mode):
+  """Makes function to run one step of distributed model eager execution."""
+  strategy = model._distribution_strategy
+  if not model._grouped_model:
+    clone_model_on_replicas(
+        model, strategy, make_callback_model=(mode == 'train'))
+
+  def _per_device_function(model):
+    f = model._make_execution_function(mode)
+    return (f.inputs, f.outputs)
+
+  # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
+  # the global one.
+  with K.get_graph().as_default(), strategy.scope():
+    # Create train ops on each of the devices when we call
+    # `_per_device_fit_function`.
+    (grouped_inputs, grouped_outputs) = strategy.call_for_each_replica(
+        _per_device_function, args=(model._grouped_model,))
+
+    # Unwrap all the per device values returned from `call_for_each_replica`.
+    # Unwrapping per device values gives you a list of values that can be
+    # used to construct a new train function that is composed of inptus/outputs
+    # on all the devices over which the model is distributed.
+    (all_inputs, all_outputs, _, _) = distributed_training_utils.unwrap_values(
+        strategy,
+        grouped_inputs,
+        grouped_outputs,
+        with_loss_tensor=(mode != 'predict'))
+
+    return K.function(
+        all_inputs,
+        all_outputs,
+        name='eager_distributed_{}_function'.format(mode))
+
+
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
+
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of 'train'/'test'/'predict'.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  strategy = model._distribution_strategy
+  inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
+  inputs = distributed_training_utils.flatten_perdevice_values(strategy, inputs)
+  targets = distributed_training_utils.flatten_perdevice_values(
+      strategy, targets)
+  if mode == 'predict':
+    sample_weights = []
+    targets = []
+  else:
+    sample_weights = [
+        None for _ in range(len(model.outputs) * strategy.num_replicas_in_sync)
+    ]
+  ins = inputs + targets + sample_weights
+  if mode == 'train' and not isinstance(K.symbolic_learning_phase(), int):
+    ins += [True]
+  return ins
+
+
+def _copy_weights_to_distributed_model(original_model, grouped_model):
+  """Copies weights from original model to distributed models."""
+  strategy = original_model._distribution_strategy
+  if strategy:
+    # Copy the weights from the original model to each of the replicated
+    # models.
+    orig_model_weights = original_model.get_weights()
+    distributed_model = strategy.unwrap(grouped_model)[0]
+    distributed_training_utils.set_weights(strategy, distributed_model,
+                                           orig_model_weights)
+
+
+def _copy_weights_to_original_model(model, grouped_model, mode):
+  """Copies weights from first distributed model back to original model."""
+  if model._distribution_strategy and mode == 'train':
+    updated_weights = model._distribution_strategy.unwrap(
+        grouped_model)[0].get_weights()
+    model.set_weights(updated_weights)
+
+
+def _per_device_aggregate_batch(batch_outs, model, mode):
+  """Aggregates the per-device batch-level outputs from a distributed step."""
+  if model._distribution_strategy is not None and mode == 'predict':
+    total_batch_outs = []
+    for i in range(len(model.outputs)):
+      num_replicas = model._distribution_strategy.num_replicas_in_sync
+      nested_outs = batch_outs[i * num_replicas:i * num_replicas + num_replicas]
+      total_batch_outs.append(np.concatenate(nest.flatten(nested_outs)))
+    return total_batch_outs
+  return batch_outs
+
+
+def _reset_metrics(model, distributed_model=None):
+  if model._distribution_strategy:
+    distributed_model = (
+        distributed_model or
+        model._distribution_strategy.unwrap(model._grouped_model)[0])
+    distributed_model.reset_metrics()
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 2a62edd698700249d797bdc453340955130adf81..895db5bc633669641b0493b8bfb918094f312513 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -19,19 +19,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
+import collections
 
-import numpy as np
-
-from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager.backprop import GradientTape
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
-from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras import losses as losses_module
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -41,7 +39,12 @@ def _eager_loss_fn(outputs, targets, loss_fn, output_name):
   return loss
 
 
-def _eager_metrics_fn(model, outputs, targets, sample_weights=None, masks=None):
+def _eager_metrics_fn(model,
+                      outputs,
+                      targets,
+                      sample_weights=None,
+                      masks=None,
+                      return_stateful_result=True):
   """Calculates the metrics for each output of the given model.
 
   Arguments:
@@ -50,6 +53,8 @@ def _eager_metrics_fn(model, outputs, targets, sample_weights=None, masks=None):
       targets: The predictions or targets of the given model.
       sample_weights: Optional list of sample weights for each output.
       masks: Optional list of masks for each output.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
 
   Returns:
       Returns the metric results for each output of the model.
@@ -58,11 +63,20 @@ def _eager_metrics_fn(model, outputs, targets, sample_weights=None, masks=None):
   targets = generic_utils.to_list(targets)
   # TODO(psv): Consider supporting skip target indices in eager mode?
   metric_results = model._handle_metrics(
-      outputs, targets=targets, sample_weights=sample_weights, masks=masks)
+      outputs,
+      targets=targets,
+      sample_weights=sample_weights,
+      masks=masks,
+      return_stateful_result=return_stateful_result)
   return [backend.mean(t) for t in metric_results]
 
 
-def _model_loss(model, inputs, targets, sample_weights=None, training=False):
+def _model_loss(model,
+                inputs,
+                targets,
+                output_loss_metrics=None,
+                sample_weights=None,
+                training=False):
   """Calculates the loss for a given model.
 
   Arguments:
@@ -70,6 +84,8 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       inputs: Either a dictionary of inputs to the model or a list of input
         arrays.
       targets: List of target arrays.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
       sample_weights: Optional list of sample weight arrays.
       training: Whether the model should be run in inference or training mode.
 
@@ -99,6 +115,7 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
   targets = generic_utils.to_list(targets)
 
   loss_metrics = []
+  aggregated_loss_metrics = []
   with backend.name_scope('loss'):
     for i, loss_fn in enumerate(model.loss_functions):
       if sample_weights:
@@ -106,11 +123,24 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       else:
         weights = None
       mask = masks[i]
-
-      weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
       with backend.name_scope(model.output_names[i] + '_loss'):
-        output_loss = weighted_masked_fn(
-            targets[i], outs[i], weights, mask=mask)
+        if isinstance(loss_fn, losses_module.Loss):
+          if mask is not None:
+            mask = math_ops.cast(mask, outs[i].dtype)
+            # Update weights with mask.
+            if weights is None:
+              weights = mask
+            else:
+              # Update dimensions of weights to match with mask if possible.
+              mask, _, weights = squeeze_or_expand_dimensions(
+                  mask, None, weights)
+              weights *= mask
+          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
+        else:
+          weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
+          output_loss = weighted_masked_fn(
+              targets[i], outs[i], weights, mask=mask)
+
       # If the number of outputs is 1 then we don't append the loss metric
       # associated with each model output. When there are multiple outputs
       # associated with a model, each output's loss is calculated and returned
@@ -118,6 +148,16 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
       if len(model.outputs) > 1:
         loss_metrics.append(backend.mean(output_loss))
 
+        if output_loss_metrics is not None:
+          # Keep track of the stateful loss result.
+          aggregated_loss_metrics.append(
+              training_utils.call_metric_function(
+                  output_loss_metrics[i],
+                  targets[i],
+                  outs[i],
+                  weights=weights,
+                  mask=mask))
+
       loss_weight = model.loss_weights_list[i]
       if total_loss is None:
         total_loss = loss_weight * output_loss
@@ -126,359 +166,18 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
 
     total_loss = backend.mean(total_loss)
     # Add regularization losses
-    custom_losses = []
-    for layer in model.layers:
-      if layer.losses:
-        custom_losses += layer.losses
-
+    custom_losses = model.losses
     if custom_losses:
-      total_loss += sum(custom_losses)
-
-  return outs, total_loss, loss_metrics, masks
-
-
-def iterator_fit_loop(model,
-                      inputs,
-                      class_weight,
-                      steps_per_epoch,
-                      epoch_logs,
-                      val_inputs=None,
-                      val_targets=None,
-                      val_sample_weights=None,
-                      epochs=1,
-                      verbose=1,
-                      callbacks=None,
-                      validation_steps=None,
-                      do_validation=False,
-                      batch_size=None):
-  """Fit function for eager execution when input is given as dataset iterator.
-
-  Updates the given epoch logs.
-
-  Arguments:
-      model: Instance of the `Model`.
-      inputs: Input dataset iterator.
-      class_weight: Optional class-weight array to weight the importance of
-          samples in `inputs` based on the class they belong to, as conveyed by
-          the targets from the `inputs` iterator.
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch.
-      epoch_logs: Dictionary of logs from every epoch.
-      val_inputs: Input data for validation.
-      val_targets: Target data for validation.
-      val_sample_weights: Sample weight data for validation.
-      epochs: Number of times to iterate over the data
-      verbose: Verbosity mode, 0, 1 or 2
-      callbacks: CallbackList instance. Controls callbacks during training.
-      validation_steps: Number of steps to run validation for (only if doing
-        validation from data tensors). Ignored with default value of `None`.
-      do_validation: Boolean value indicating whether we should do validation.
-      batch_size: int, val_inputs and val_targets will be evaled batch by
-        batch with size batch_size if they are array.
-
-  Raises:
-      ValueError: In case of mismatch between given number of inputs and
-        expectations of the model.
-  """
-  assert isinstance(inputs, iterator_ops.EagerIterator)
-
-  # make sure either x,y or x,y,sample_weights is provided
-  if (not isinstance(inputs.output_shapes, (list, tuple)) or
-      len(inputs.output_shapes) not in (2, 3)):
-    raise ValueError('Please provide either inputs and targets '
-                     'or inputs, targets, and sample_weights')
-
-  for step_index in range(steps_per_epoch):
-    batch_logs = {'batch': step_index, 'size': 1}
-    callbacks.on_batch_begin(step_index, batch_logs)
-
-    # Get data from the iterator.
-    try:
-      next_element = inputs.get_next()
-    except errors.OutOfRangeError:
-      logging.warning(
-          'Your dataset iterator ran out of data; interrupting training. Make '
-          'sure that your dataset can generate at least '
-          '`steps_per_epoch * epochs` batches (in this case, %d batches). You '
-          'may need to use the repeat() function when building your '
-          'dataset.' % steps_per_epoch * epochs)
-      break
-
-    if len(inputs.output_shapes) == 2:
-      x, y = next_element
-      sample_weights = None
-    else:
-      x, y, sample_weights = next_element
-
-    # Validate and standardize data.
-    x, y, sample_weights = model._standardize_user_data(
-        x, y, sample_weight=sample_weights, class_weight=class_weight)
-    x = training_utils.cast_if_floating_dtype(x)
-    y = training_utils.cast_if_floating_dtype(y)
-    if sample_weights:
-      sample_weights = [
-          training_utils.cast_if_floating_dtype(
-              ops.convert_to_tensor(val, dtype=backend.floatx()))
-          if val is not None else None for val in sample_weights
-      ]
-
-    # Set stateful_metrics in callbacks. We do not do this before the
-    # `steps_per_epoch` loop because model will be compiled only in the first
-    # iteration of this loop in the deferred build scenario.
-    if step_index == 0:
-      for cbk in callbacks:
-        if (isinstance(cbk, cbks.BaseLogger) or
-            isinstance(cbk, cbks.ProgbarLogger)):
-          cbk.stateful_metrics = model.stateful_metric_names
-
-    if step_index == 0 and not callbacks.params['metrics']:
-      callback_metrics = copy.copy(model.metrics_names)
-      if do_validation:
-        callback_metrics += ['val_' + n for n in model.metrics_names]
-      callbacks.set_params({
-          'batch_size': batch_size,
-          'epochs': epochs,
-          'steps': steps_per_epoch,
-          'verbose': verbose,
-          'do_validation': do_validation,
-          'metrics': callback_metrics or [],
-          'validation_steps': validation_steps
-      })
-
-    # Train model.
-    outs, loss, loss_metrics, masks = _process_single_batch(
-        model, x, y, sample_weights=sample_weights, training=True)
-    outs = generic_utils.to_list(outs)
-
-    # Calculate metrics.
-    for l, o in zip(model.metrics_names, outs):
-      batch_logs[l] = o
-    # Required for eager execution
-    metrics_results = _eager_metrics_fn(
-        model, outs, y, sample_weights=sample_weights, masks=masks)
-    batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
-
-    for k, v in zip(model.metrics_names,
-                    [backend.mean(loss)] + loss_metrics + metrics_results):
-      batch_logs[k] = tensor_util.constant_value(v)
-    callbacks.on_batch_end(step_index, batch_logs)
-    if callbacks.model.stop_training:
-      break
-
-    if step_index == steps_per_epoch - 1:
-      if do_validation:
-        val_outs = test_loop(
-            model,
-            val_inputs,
-            val_targets,
-            sample_weights=val_sample_weights,
-            steps=validation_steps,
-            verbose=0,
-            batch_size=batch_size)
-        if not isinstance(val_outs, list):
-          val_outs = [val_outs]
-        # Same labels assumed.
-        for l, o in zip(model.metrics_names, val_outs):
-          epoch_logs['val_' + l] = o
-
-
-def iterator_test_loop(model, inputs, steps, verbose=0):
-  """Test function for eager execution when input is given as dataset iterator.
-
-  Arguments:
-      model: Model instance that is being evaluated in Eager mode.
-      inputs: Input dataset iterator.
-      steps: Total number of steps (batches of samples) before declaring
-      predictions finished.
-      verbose: Verbosity mode.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-
-  Raises:
-      ValueError: In case of mismatch between given number of inputs and
-        expectations of the model.
-  """
-  assert isinstance(inputs, iterator_ops.EagerIterator)
-  # make sure either x,y or x,y,sample_weights is provided
-  if (not isinstance(inputs.output_shapes, (list, tuple)) or
-      len(inputs.output_shapes) < 2 or len(inputs.output_shapes) > 3):
-    raise ValueError('Please provide either inputs and targets'
-                     'or inputs, targets, and sample_weights')
-  outs = []
-  num_samples = 0
-  if verbose == 1:
-    progbar = generic_utils.Progbar(target=steps)
-  for step_index in range(steps):
-    # Get data from the iterator.
-    try:
-      next_element = inputs.get_next()
-    except errors.OutOfRangeError:
-      logging.warning(
-          'Your dataset iterator ran out of data interrupting testing. '
-          'Make sure that your dataset can generate at least `steps` batches '
-          '(in this case, %d batches). You may need to use the repeat() '
-          'function when building your dataset.', steps)
-      break
-
-    if len(inputs.output_shapes) == 2:
-      x, y = next_element
-      sample_weights = None
-    else:
-      x, y, sample_weights = next_element
-
-    # Validate and standardize data.
-    x, y, sample_weights = model._standardize_user_data(
-        x, y, sample_weight=sample_weights)
-    x = training_utils.cast_if_floating_dtype(x)
-    y = training_utils.cast_if_floating_dtype(y)
-    if sample_weights:
-      sample_weights = [
-          training_utils.cast_if_floating_dtype(
-              ops.convert_to_tensor(val, dtype=backend.floatx()))
-          if val is not None else None for val in sample_weights
-      ]
-
-    if step_index == 0:
-      # Get stateful metrics indices. We do not do this before the `steps` loop
-      # because model will be compiled only in the first iteration of this loop
-      # in the deferred build scenario.
-      if hasattr(model, 'metrics'):
-        for m in model.stateful_metric_functions:
-          m.reset_states()
-        stateful_metric_indices = [
-            i for i, name in enumerate(model.metrics_names)
-            if str(name) in model.stateful_metric_names
-        ]
-      else:
-        stateful_metric_indices = []
-
-    # Calculate model output, loss values.
-    loss_outs, loss, loss_metrics, masks = _model_loss(
-        model, x, y, sample_weights=sample_weights, training=False)
-    metrics_results = _eager_metrics_fn(
-        model, loss_outs, y, sample_weights=sample_weights, masks=masks)
-    batch_outs = []
-    for _, v in zip(model.metrics_names,
-                    [backend.mean(loss)] + loss_metrics + metrics_results):
-      batch_outs.append(tensor_util.constant_value(v))
-
-    # Get current step size.
-    if isinstance(x, list):
-      step_size = x[0].get_shape().as_list()[0]
-    elif isinstance(x, dict):
-      step_size = list(x.values())[0].get_shape().as_list()[0]
-    else:
-      step_size = x.get_shape().as_list()[0]
-
-    # Accumulate results in output array.
-    if not isinstance(batch_outs, list):
-      batch_outs = [batch_outs]
-    if step_index == 0:
-      for _ in enumerate(batch_outs):
-        outs.append(0.)
-    for i, batch_out in enumerate(batch_outs):
-      if i in stateful_metric_indices:
-        outs[i] = batch_out
-      else:
-        outs[i] += batch_out * step_size
-
-    # Calculate sample size.
-    num_samples += step_size
-    if verbose == 1:
-      progbar.update(step_index + 1)
-
-  for i in range(len(outs)):
-    if i not in stateful_metric_indices:
-      outs[i] /= num_samples
-  if len(outs) == 1:
-    return outs[0]
-  return outs
+      total_loss += math_ops.add_n(custom_losses)
+    model._clear_losses()
 
-
-def iterator_predict_loop(model, inputs, steps, verbose=0):
-  """Predict function for eager execution when input is dataset iterator.
-
-  Arguments:
-      model: Instance of `Model`.
-      inputs: Input dataset iterator.
-      steps: Total number of steps (batches of samples) before declaring
-          `_predict_loop` finished.
-      verbose: Verbosity mode.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions (if the model has multiple outputs).
-
-  Raises:
-      ValueError: In case of mismatch between given number of inputs and
-        expectations of the model.
-  """
-  assert isinstance(inputs, iterator_ops.EagerIterator)
-  if not isinstance(inputs.output_shapes,
-                    (list, tuple)) or len(inputs.output_shapes) > 3:
-    raise ValueError(
-        'Please provide data as a list or tuple of 1, 2, or 3 elements '
-        ' - `(input)`, or `(input, target)`, or `(input, target,'
-        'sample_weights)`. Received %s. We do not use the `target` or'
-        '`sample_weights` value here.' % inputs.output_shapes)
-  outs = []
-  if verbose == 1:
-    progbar = generic_utils.Progbar(target=steps)
-  for step_index in range(steps):
-    # Get data from the iterator.
-    try:
-      next_element = inputs.get_next()
-    except errors.OutOfRangeError:
-      logging.warning(
-          'Your dataset iterator ran out of data; interrupting prediction. '
-          'Make sure that your dataset can generate at least `steps` batches '
-          '(in this case, %d batches). You may need to use the repeat() '
-          'function when building your dataset.', steps)
-      break
-
-    # expects a tuple, where first element of tuple represents inputs
-    x = next_element[0]
-
-    # Validate and standardize data.
-    x, _, _ = model._standardize_user_data(x)
-    x = training_utils.cast_if_floating_dtype(x)
-
-    if isinstance(x, list) and len(x) == 1:
-      x = x[0]
-
-    if model._expects_training_arg:
-      batch_outs = model.call(x, training=False)
-    else:
-      batch_outs = model.call(x)
-    if not isinstance(batch_outs, list):
-      batch_outs = [batch_outs]
-
-    # We collect the results from every step and then concatenate them once
-    # in the end. This is an expensive process. We are doing this because we
-    # do not know the number of samples beforehand.
-    if step_index == 0:
-      for _ in batch_outs:
-        outs.append([])
-    for i, batch_out in enumerate(batch_outs):
-      outs[i].append(backend.get_value(batch_out))
-
-    if verbose == 1:
-      progbar.update(step_index + 1)
-  for i, out in enumerate(outs):
-    outs[i] = np.concatenate(tuple(out), axis=0)
-  if len(outs) == 1:
-    return outs[0]
-  return outs
+  return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
 
 
 def _process_single_batch(model,
                           inputs,
                           targets,
+                          output_loss_metrics=None,
                           sample_weights=None,
                           training=False):
   """Calculate the loss and gradient for one input batch.
@@ -489,6 +188,8 @@ def _process_single_batch(model,
       model: Model whose loss has to be calculated.
       inputs: List of input arrays.
       targets: List of target arrays.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
       sample_weights: Optional list of sample weight arrays.
       training: The boolean represents if the weights of the model are updated.
               'fit' methods will set this to True while 'evaluate' methods will
@@ -503,12 +204,14 @@ def _process_single_batch(model,
   """
   with backend.learning_phase_scope(1 if training else 0):
     with GradientTape() as tape:
-      outs, loss, loss_metrics, masks = _model_loss(
-          model,
-          inputs,
-          targets,
-          sample_weights=sample_weights,
-          training=training)
+      outs, loss, loss_metrics, aggregated_loss_metrics, masks\
+        = _model_loss(
+            model,
+            inputs,
+            targets,
+            output_loss_metrics=output_loss_metrics,
+            sample_weights=sample_weights,
+            training=training)
       if loss is None:
         raise ValueError('The model cannot be run '
                          'because it has no loss to optimize.')
@@ -521,7 +224,7 @@ def _process_single_batch(model,
         grads = tape.gradient(loss, model._collected_trainable_weights)
         model.optimizer.apply_gradients(zip(grads,
                                             model._collected_trainable_weights))
-    return outs, loss, loss_metrics, masks
+    return outs, loss, loss_metrics, aggregated_loss_metrics, masks
 
 
 def train_on_batch(model, inputs, targets, sample_weights=None):
@@ -536,28 +239,34 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
   Returns:
       total loss and the loss associated with each output.
   """
-  if len(inputs) and tensor_util.is_tensor(inputs[0]):
-    inputs = training_utils.cast_if_floating_dtype(inputs)
-    targets = training_utils.cast_if_floating_dtype(targets)
-  else:
-    inputs = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
-    ]
-    targets = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
-    ]
+  if isinstance(inputs, collections.Sequence):
+    if len(inputs) and tensor_util.is_tensor(inputs[0]):
+      inputs = training_utils.cast_if_floating_dtype(inputs)
+      targets = training_utils.cast_if_floating_dtype(targets)
+    else:
+      inputs = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in inputs
+      ])
+      targets = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in targets
+      ])
   if sample_weights:
     sample_weights = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
+        training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
 
-  outs, loss, loss_metrics, masks = _process_single_batch(
+  outs, loss, loss_metrics, _, masks = _process_single_batch(
       model, inputs, targets, sample_weights=sample_weights, training=True)
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
-      model, outs, targets, sample_weights=sample_weights, masks=masks)
+      model,
+      outs,
+      targets,
+      sample_weights=sample_weights,
+      masks=masks,
+      return_stateful_result=True)
   loss = generic_utils.to_list(loss)
 
   return [
@@ -578,191 +287,36 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
   Returns:
       total loss, loss and metrics associated with each output.
   """
-  if len(inputs) and tensor_util.is_tensor(inputs[0]):
-    inputs = training_utils.cast_if_floating_dtype(inputs)
-    targets = training_utils.cast_if_floating_dtype(targets)
-  else:
-    inputs = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
-    ]
-    targets = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
-    ]
+  if isinstance(inputs, collections.Sequence):
+    if len(inputs) and tensor_util.is_tensor(inputs[0]):
+      inputs = training_utils.cast_if_floating_dtype(inputs)
+      targets = training_utils.cast_if_floating_dtype(targets)
+    else:
+      inputs = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in inputs
+      ])
+      targets = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in targets
+      ])
   if sample_weights:
     sample_weights = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
+        training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
-  outs, loss, loss_metrics, masks = _model_loss(
+  outs, loss, loss_metrics, _, masks = _model_loss(
       model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
-      model, outs, targets, sample_weights=sample_weights, masks=masks)
+      model,
+      outs,
+      targets,
+      sample_weights=sample_weights,
+      masks=masks,
+      return_stateful_result=True)
   loss = generic_utils.to_list(loss)
 
   return [
       tensor_util.constant_value(v)
       for v in loss + loss_metrics + metrics_results
   ]
-
-
-def fit_loop(model,
-             inputs,
-             targets,
-             sample_weights=None,
-             class_weight=None,
-             val_inputs=None,
-             val_targets=None,
-             val_sample_weights=None,
-             batch_size=None,
-             epochs=1,
-             verbose=1,
-             callbacks=None,
-             shuffle=True,
-             initial_epoch=0,
-             steps_per_epoch=None,
-             validation_steps=None):
-  """Fit function for eager execution.
-
-  Arguments:
-      model: Instance of the model that is being executed in Eager mode.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      class_weight: Optional class-weight array to weight the importance of
-          samples in `inputs` based on the class they belong to, as conveyed by
-          `targets`.
-      val_inputs: Input data for validation.
-      val_targets: Target data for validation.
-      val_sample_weights: Sample weight data for validation.
-      batch_size: Integer batch size or None if unknown.
-      epochs: Number of times to iterate over the data
-      verbose: Verbosity mode, 0, 1 or 2
-      callbacks: List of callbacks to be called during training
-      shuffle: Whether to shuffle the data at the beginning of each epoch
-      initial_epoch: Epoch at which to start training
-          (useful for resuming a previous training run)
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch. Ignored with the default value of `None`.
-      validation_steps: Number of steps to run validation for (only if doing
-        validation from data tensors). Ignored with default value of `None`.
-
-  Returns:
-      `History` object.
-
-  Raises:
-    ValueError: In case of invalid argument values.
-  """
-  # Convert training inputs to an EagerIterator
-  inputs, steps_per_epoch = training_utils.convert_to_iterator(
-      x=inputs,
-      y=targets,
-      sample_weights=sample_weights,
-      batch_size=batch_size,
-      steps_per_epoch=steps_per_epoch,
-      epochs=epochs,
-      shuffle=shuffle)
-  # Required for eager execution
-  with backend.learning_phase_scope(1):
-    do_validation = val_inputs is not None
-    callbacks = cbks.configure_callbacks(
-        callbacks,
-        model,
-        do_validation=do_validation,
-        batch_size=batch_size,
-        epochs=epochs,
-        steps_per_epoch=steps_per_epoch,
-        val_inputs=val_inputs,
-        val_targets=val_targets,
-        val_sample_weights=val_sample_weights,
-        validation_steps=validation_steps,
-        verbose=verbose)
-
-    callbacks.on_train_begin()
-    for epoch in range(initial_epoch, epochs):
-      if model._is_compiled:  # Model may not be compiled the first time.
-        # Reset stateful metrics
-        for m in model.stateful_metric_functions:
-          m.reset_states()
-      callbacks.on_epoch_begin(epoch)
-      epoch_logs = {}
-      iterator_fit_loop(
-          model,
-          inputs,
-          class_weight,
-          steps_per_epoch=steps_per_epoch,
-          epoch_logs=epoch_logs,
-          val_inputs=val_inputs,
-          val_targets=val_targets,
-          val_sample_weights=val_sample_weights,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          validation_steps=validation_steps,
-          do_validation=do_validation,
-          batch_size=batch_size)
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      if callbacks.model.stop_training:
-        break
-  callbacks.on_train_end()
-  return model.history
-
-
-def test_loop(model, inputs, targets,
-              sample_weights=None,
-              batch_size=None,
-              verbose=0,
-              steps=None):
-  """Test function for eager execution.
-
-  Arguments:
-      model: Model instance that is being evaluated in Eager mode.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      batch_size: integer batch size or `None`.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring predictions finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-  """
-  inputs, steps = training_utils.convert_to_iterator(
-      x=inputs,
-      y=targets,
-      sample_weights=sample_weights,
-      batch_size=batch_size,
-      steps_per_epoch=steps,
-      is_validation=True)
-  with backend.learning_phase_scope(0):
-    return iterator_test_loop(model, inputs, steps, verbose=verbose)
-
-
-def predict_loop(model, inputs, batch_size=32, verbose=0, steps=None):
-  """Predict function for eager execution.
-
-  Arguments:
-      model: Instance of `Model`.
-      inputs: List of input arrays.
-      batch_size: integer batch size.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions
-      (if the model has multiple outputs).
-  """
-  with backend.learning_phase_scope(0):
-    inputs, steps = training_utils.convert_to_iterator(
-        x=inputs, batch_size=batch_size, steps_per_epoch=steps)
-    return iterator_predict_loop(model, inputs, steps, verbose=verbose)
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 943ede1be9aacec6758725ee8305ed16ac76fc31..3fabbb17edc05138c57bf61c16a94c6647813963 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
@@ -51,6 +51,7 @@ class TrainingTest(test.TestCase):
         loss,
         metrics=metrics,
         loss_weights=loss_weights,
+        run_eagerly=True,
         sample_weight_mode=None)
 
     input_a = keras.backend.zeros(shape=(10, 3))
@@ -111,7 +112,7 @@ class TrainingTest(test.TestCase):
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
+    model.compile(optimizer, loss, metrics=metrics, run_eagerly=True)
 
     inputs = keras.backend.zeros(shape=(10, 3))
     targets = keras.backend.zeros(shape=(10, 4))
@@ -129,29 +130,34 @@ class TrainingTest(test.TestCase):
     x = keras.layers.Input(shape=(3,), name='input')
     y = keras.layers.Dense(4, name='dense')(x)
     model = keras.Model(x, y)
-    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse',
+                  run_eagerly=True)
 
     x = keras.backend.zeros(shape=(10, 3))
     y = keras.backend.zeros(shape=(10, 4))
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     validation_dataset = dataset_ops.Dataset.from_tensor_slices(
         (x, y)).repeat(10).batch(5)
-    validation_iterator = validation_dataset.make_one_shot_iterator()
+    validation_iterator = dataset_ops.make_one_shot_iterator(validation_dataset)
 
     with self.assertRaisesRegexp(
         ValueError, r'specify .* `steps_per_epoch`'):
       model.fit(iterator, epochs=1, verbose=0)
-    with self.assertRaisesRegexp(
-        ValueError, r'provide either `batch_size` or `validation_steps`'):
-      model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
-                validation_data=(x, y))
-    with self.assertRaisesRegexp(
-        ValueError, r'provide either `batch_size` or `validation_steps`'):
+    if not context.executing_eagerly():
+      # In eager execution, `keras.backend.zeros` returns value tensors
+      # which can be used for validation without a `validation_steps` argument.
+      with self.assertRaisesRegexp(
+          ValueError, r'provide either `batch_size` or `validation_steps`'):
+        model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
+                  validation_data=(x, y))
+    with self.assertRaisesRegexp(ValueError,
+                                 'specify the `validation_steps` argument.'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                 validation_data=validation_dataset)
-    with self.assertRaisesRegexp(
-        ValueError, r'provide either `batch_size` or `validation_steps`'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'specify the `validation_steps` argument.'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                 validation_data=validation_iterator)
 
@@ -160,25 +166,31 @@ class TrainingTest(test.TestCase):
     model.add(keras.layers.Dense(4, input_shape=(3,)))
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     model.compile(
-        optimizer, 'mse', metrics=['mae',
-                                   metrics_module.CategoricalAccuracy()])
+        optimizer,
+        loss='mse',
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=True)
 
     x = np.random.random((10, 3))
     y = np.random.random((10, 4))
 
-    def iterator():
+    def numpy_iterator():
       while True:
         yield x, y
 
-    model.fit_generator(iterator(), steps_per_epoch=3, epochs=1)
-    model.evaluate_generator(iterator(), steps=3)
-    out = model.predict_generator(iterator(), steps=3)
+    model.fit_generator(numpy_iterator(), steps_per_epoch=3, epochs=1)
+    model.evaluate_generator(numpy_iterator(), steps=3)
+
+    def inference_numpy_iterator():
+      while True:
+        yield x
+
+    out = model.predict_generator(inference_numpy_iterator(), steps=3)
     self.assertEqual(out.shape, (30, 4))
 
 
 class CorrectnessTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_loss_correctness(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
@@ -191,15 +203,14 @@ class CorrectnessTest(test.TestCase):
                                  activation='softmax',
                                  kernel_initializer='ones'))
     model.compile(loss='sparse_categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  run_eagerly=False)
     x = np.ones((100, 4))
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
     history = model.fit(x, y, epochs=1, batch_size=10)
-    self.assertEqual(
-        np.around(history.history['loss'][-1], decimals=4), 0.6173)
+    self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_loss_correctness_with_iterator(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
@@ -211,18 +222,19 @@ class CorrectnessTest(test.TestCase):
         keras.layers.Dense(2, activation='softmax', kernel_initializer='ones'))
     model.compile(
         loss='sparse_categorical_crossentropy',
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=True)
     x = np.ones((100, 4), dtype=np.float32)
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     history = model.fit(iterator, epochs=1, steps_per_epoch=10)
-    self.assertEqual(np.around(history.history['loss'][-1], decimals=4), 0.6173)
+    self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
 
-  def test_no_loss_in_call(self):
+  def test_loss_in_call(self):
 
     class HasLoss(keras.layers.Layer):
 
@@ -231,11 +243,9 @@ class CorrectnessTest(test.TestCase):
         return x
 
     layer = HasLoss()
-    with self.assertRaises(RuntimeError):
-      layer(1.)
+    layer(1.)  # Plain-value inputs are only valid in eager mode.
+    self.assertEqual(1, len(layer.losses))
 
-    with ops.Graph().as_default():
-      layer(1.)
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index 2e074699da844e013fc181381bb3bb81360ea0f5..0abf0b8270915a37f1d59803cacd11bdf9abe132 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -19,423 +19,433 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import math
+
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
-from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
-from tensorflow.python.keras.utils.data_utils import Sequence
-from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.utils import data_utils
+from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+
+def model_iteration(model,
+                    data,
+                    steps_per_epoch=None,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    validation_data=None,
+                    validation_steps=None,
+                    class_weight=None,
+                    max_queue_size=10,
+                    workers=1,
+                    use_multiprocessing=False,
+                    shuffle=True,
+                    initial_epoch=0,
+                    mode='train',
+                    batch_size=None,
+                    **kwargs):
+  """Loop function for arrays of data with modes 'train'/'test'/'predict'.
+
+  Arguments:
+      model: Keras Model instance.
+      data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or
+        `(x, y, sample_weights)`) or a generator or
+        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
+      steps_per_epoch: Total number of steps (batches of samples) before
+        declaring one epoch finished and starting the next epoch. Ignored with
+        the default value of `None`.
+      epochs: Number of times to iterate over the data.
+      verbose: Verbosity mode, 0, 1 or 2.
+      callbacks: List of callbacks to be called during training.
+      validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or
+        `(x, y)` or `(x, y, sample_weights)`) or a generator or
+        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
+      validation_steps: Total number of steps (batches of samples) before
+        declaring validation finished.
+      class_weight: Dictionary mapping class indices to a weight for the class.
+      max_queue_size: Integer. Maximum size for the generator queue. If
+        unspecified, `max_queue_size` will default to 10.
+      workers: Integer. Maximum number of processes to spin up when using
+        process-based threading. If unspecified, `workers` will default to 1. If
+        0, will execute the generator on the main thread.
+      use_multiprocessing: Boolean. If `True`, use process-based threading. If
+        unspecified, `use_multiprocessing` will default to `False`. Note that
+        because this implementation relies on multiprocessing, you should not
+        pass non-picklable arguments to the generator as they can't be passed
+        easily to children processes.
+      shuffle: Boolean. Whether to shuffle the order of the batches at the
+        beginning of each epoch. Only used with instances of `Sequence`
+        (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not
+        `None`.
+      initial_epoch: Epoch at which to start training (useful for resuming a
+        previous training run).
+      mode: One of 'train'/'test'/'predict'.
+      batch_size: Integer batch size or None if unknown. Will only be used if
+        `data` is in NumPy/Tensor format.
+      **kwargs: Additional arguments for backwards compatibility. `steps` is
+        accepted as an alias for `steps_per_epoch`.
+
+  Returns:
+      - In 'train' mode: `History` object.
+      - In 'test' mode: Evaluation metrics.
+      - In 'predict' mode: Outputs of the Model called on inputs.
+
+  Raises:
+      ValueError: in case of invalid arguments.
+  """
+  if 'steps' in kwargs:
+    steps_per_epoch = kwargs['steps']
+
+  # Convert to a format that supports `next(generator)`.
+  generator, steps_per_epoch = convert_to_generator_like(
+      data,
+      steps_per_epoch=steps_per_epoch,
+      batch_size=batch_size,
+      epochs=epochs - initial_epoch,
+      shuffle=shuffle)
+
+  do_validation = validation_data is not None
+  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
+  is_sequence = isinstance(generator, data_utils.Sequence)
+  _validate_arguments(is_sequence, use_multiprocessing, workers,
+                      steps_per_epoch, validation_data, validation_steps, mode,
+                      kwargs)
+
+  batch_function = _make_execution_function(
+      model, mode, class_weight=class_weight)
+
+  # Create the queue for the generator.
+  output_generator, enqueuer = _make_enqueued_generator(
+      generator,
+      workers=workers,
+      use_multiprocessing=use_multiprocessing,
+      max_queue_size=max_queue_size,
+      shuffle=shuffle)
+
+  num_samples_or_steps, use_steps = _get_num_samples_or_steps(
+      data, steps_per_epoch)
+
+  count_mode = 'steps' if use_steps else 'samples'
+  callbacks = cbks.configure_callbacks(
+      callbacks,
+      model,
+      do_validation=do_validation,
+      epochs=epochs,
+      steps_per_epoch=steps_per_epoch,
+      batch_size=batch_size,
+      samples=num_samples_or_steps,
+      verbose=0,  # Handle ProgBar as part of Callbacks once hooks are ready.
+      mode=mode)
+  # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
+  progbar = training_utils.get_progbar(model, count_mode)
+  progbar.params = callbacks.params
+  progbar.params['verbose'] = verbose
+
+  if mode == 'predict':
+    aggregator = training_utils.OutputsAggregator(True, steps_per_epoch)
+  else:
+    aggregator = training_utils.MetricsAggregator(True, steps_per_epoch)
 
+  if should_set_learning_phase:
+    old_learning_phase = backend.learning_phase()
+    backend.set_learning_phase(1 if mode == 'train' else 0)
 
-def fit_generator(model,
-                  generator,
-                  steps_per_epoch=None,
-                  epochs=1,
-                  verbose=1,
-                  callbacks=None,
-                  validation_data=None,
-                  validation_steps=None,
-                  class_weight=None,
-                  max_queue_size=10,
-                  workers=1,
-                  use_multiprocessing=False,
-                  shuffle=True,
-                  initial_epoch=0):
-  """See docstring for `Model.fit_generator`."""
-  wait_time = 0.01  # in seconds
-  epoch = initial_epoch
-
-  do_validation = bool(validation_data)
-  if not context.executing_eagerly():
-    model._make_train_function()
-    if do_validation:
-      model._make_test_function()
-
-  is_sequence = isinstance(generator, Sequence)
-  if not is_sequence and use_multiprocessing and workers > 1:
-    logging.warning(
-        UserWarning('Using a generator with `use_multiprocessing=True`'
-                    ' and multiple workers may duplicate your data.'
-                    ' Please consider using the`keras.utils.Sequence'
-                    ' class.'))
-  if steps_per_epoch is None:
-    if is_sequence:
-      steps_per_epoch = len(generator)
-    else:
-      raise ValueError('`steps_per_epoch=None` is only valid for a'
-                       ' generator based on the `keras.utils.Sequence`'
-                       ' class. Please specify `steps_per_epoch` or use'
-                       ' the `keras.utils.Sequence` class.')
+  callbacks.model.stop_training = False
+  callbacks._call_begin_hook(mode)
+  progbar.on_train_begin()
+  for epoch in range(initial_epoch, epochs):
+    if callbacks.model.stop_training:
+      break
 
-  # python 2 has 'next', 3 has '__next__'
-  # avoid any explicit version checks
-  val_gen = (
-      hasattr(validation_data, 'next') or
-      hasattr(validation_data, '__next__') or
-      isinstance(validation_data, Sequence))
-  if (val_gen and not isinstance(validation_data, Sequence) and
-      not validation_steps):
-    raise ValueError('`validation_steps=None` is only valid for a'
-                     ' generator based on the `keras.utils.Sequence`'
-                     ' class. Please specify `validation_steps` or use'
-                     ' the `keras.utils.Sequence` class.')
+    # Setup work for each epoch.
+    model.reset_metrics()
+    epoch_logs = {}
+    callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
+    progbar.on_epoch_begin(epoch, epoch_logs)
 
-  enqueuer = None
-  val_enqueuer = None
+    for step in range(steps_per_epoch):
+      batch_data = _get_next_batch(output_generator, mode)
+      if batch_data is None:
+        callbacks.model.stop_training = True
+        break
 
-  try:
-    val_x, val_y, val_sample_weights = validation_data, None, None
-    if do_validation and not val_gen:
-      # Prepare data for validation
-      if len(validation_data) == 2:
-        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weights = None
-      elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weights = validation_data  # pylint: disable=unpacking-non-sequence
-      else:
-        raise ValueError(
-            '`validation_data` should be a tuple '
-            '`(val_x, val_y, val_sample_weight)` '
-            'or `(val_x, val_y)`. Found: ' + str(validation_data))
-      val_x, val_y, val_sample_weights = model._standardize_user_data(
-          val_x, val_y, val_sample_weights)
-
-    callbacks = cbks.configure_callbacks(
-        callbacks,
-        model,
-        do_validation=do_validation,
-        val_inputs=val_x,
-        val_targets=val_y,
-        val_sample_weights=val_sample_weights,
-        epochs=epochs,
-        validation_steps=validation_steps,
-        steps_per_epoch=steps_per_epoch,
-        verbose=verbose)
-
-    if workers > 0:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            shuffle=shuffle)
-      else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            wait_time=wait_time)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
-    else:
-      if is_sequence:
-        output_generator = iter(generator)
-      else:
-        output_generator = generator
+      # `batch_size` used for validation data if validation
+      # data is NumPy/EagerTensors.
+      batch_size = int(nest.flatten(batch_data)[0].shape[0])
 
-    callbacks.on_train_begin()
-    # Construct epoch logs.
-    epoch_logs = {}
-    while epoch < epochs:
-      for m in model.stateful_metric_functions:
-        m.reset_states()
-      callbacks.on_epoch_begin(epoch)
-      steps_done = 0
-      batch_index = 0
-      while steps_done < steps_per_epoch:
-        generator_output = next(output_generator)
-
-        if not hasattr(generator_output, '__len__'):
-          raise ValueError('Output of generator should be '
-                           'a tuple `(x, y, sample_weight)` '
-                           'or `(x, y)`. Found: ' + str(generator_output))
-
-        if len(generator_output) == 2:
-          x, y = generator_output
-          sample_weight = None
-        elif len(generator_output) == 3:
-          x, y, sample_weight = generator_output
-        else:
-          raise ValueError('Output of generator should be '
-                           'a tuple `(x, y, sample_weight)` '
-                           'or `(x, y)`. Found: ' + str(generator_output))
-        # build batch logs
-        batch_logs = {}
-        if isinstance(x, list):
-          batch_size = x[0].shape[0]
-        elif isinstance(x, dict):
-          batch_size = list(x.values())[0].shape[0]
-        else:
-          batch_size = x.shape[0]
-        batch_logs['batch'] = batch_index
-        batch_logs['size'] = batch_size
-        callbacks.on_batch_begin(batch_index, batch_logs)
-
-        outs = model.train_on_batch(
-            x, y, sample_weight=sample_weight, class_weight=class_weight)
-
-        if not isinstance(outs, list):
-          outs = [outs]
-        for l, o in zip(model.metrics_names, outs):
-          batch_logs[l] = o
-
-        callbacks.on_batch_end(batch_index, batch_logs)
-
-        batch_index += 1
-        steps_done += 1
-
-        # Epoch finished.
-        if steps_done >= steps_per_epoch and do_validation:
-          if val_gen:
-            val_outs = evaluate_generator(
-                model,
-                validation_data,
-                validation_steps,
-                workers=workers,
-                use_multiprocessing=use_multiprocessing,
-                max_queue_size=max_queue_size)
-          else:
-            # No need for try/except because
-            # data has already been validated.
-            val_outs = model.evaluate(
-                val_x,
-                val_y,
-                batch_size=batch_size,
-                sample_weight=val_sample_weights,
-                verbose=0)
-          if not isinstance(val_outs, list):
-            val_outs = [val_outs]
-          # Same labels assumed.
-          for l, o in zip(model.metrics_names, val_outs):
-            epoch_logs['val_' + l] = o
-
-        if callbacks.model.stop_training:
-          break
-
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      epoch += 1
-      if callbacks.model.stop_training:
-        break
+      # Callbacks batch begin.
+      batch_logs = {'batch': step, 'size': batch_size}
+      callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
+      progbar.on_batch_begin(step, batch_logs)
 
-  finally:
-    try:
-      if enqueuer is not None:
-        enqueuer.stop()
-    finally:
-      if val_enqueuer is not None:
-        val_enqueuer.stop()
-
-  callbacks.on_train_end()
-  return model.history
-
-
-def evaluate_generator(model,
-                       generator,
-                       steps=None,
-                       max_queue_size=10,
-                       workers=1,
-                       use_multiprocessing=False,
-                       verbose=0):
-  """See docstring for `Model.evaluate_generator`."""
-  if not context.executing_eagerly():
-    model._make_test_function()
-
-  if hasattr(model, 'metrics'):
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-    stateful_metric_indices = [
-        i for i, name in enumerate(model.metrics_names)
-        if str(name) in model.stateful_metric_names]
-  else:
-    stateful_metric_indices = []
+      batch_outs = batch_function(*batch_data)
+      if not isinstance(batch_outs, list):
+        batch_outs = [batch_outs]
 
-  steps_done = 0
-  wait_time = 0.01
-  all_outs = []
-  batch_sizes = []
-  is_sequence = isinstance(generator, Sequence)
-  if not is_sequence and use_multiprocessing and workers > 1:
-    logging.warning(
-        UserWarning('Using a generator with `use_multiprocessing=True`'
-                    ' and multiple workers may duplicate your data.'
-                    ' Please consider using the`keras.utils.Sequence'
-                    ' class.'))
-  if steps is None:
-    if is_sequence:
-      steps = len(generator)
-    else:
-      raise ValueError('`steps=None` is only valid for a generator'
-                       ' based on the `keras.utils.Sequence` class.'
-                       ' Please specify `steps` or use the'
-                       ' `keras.utils.Sequence` class.')
-  enqueuer = None
+      # Aggregate results.
+      if step == 0:
+        aggregator.create(batch_outs)
+      aggregator.aggregate(batch_outs)
+
+      # Callbacks batch end.
+      batch_logs.update(training_utils.make_logs(model, batch_outs, mode))
+      callbacks._call_batch_hook(mode, 'end', step, batch_logs)
+      progbar.on_batch_end(step, batch_logs)
+
+      if callbacks.model.stop_training:
+        break
 
+    aggregator.finalize()
+    results = aggregator.results
+    epoch_logs.update(training_utils.make_logs(model, results, mode))
+    if len(results) == 1:
+      results = results[0]
+
+    # Run the test loop every epoch during training.
+    if do_validation and not callbacks.model.stop_training:
+      val_results = model_iteration(
+          model,
+          validation_data,
+          steps_per_epoch=validation_steps,
+          batch_size=batch_size,
+          class_weight=class_weight,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing,
+          max_queue_size=max_queue_size,
+          mode='test')
+
+      if not isinstance(val_results, list):
+        val_results = [val_results]
+      epoch_logs.update(
+          training_utils.make_logs(model, val_results, mode, prefix='val_'))
+
+    callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
+    progbar.on_epoch_end(epoch, epoch_logs)
+  callbacks._call_end_hook(mode)
+
+  if enqueuer is not None:
+    enqueuer.stop()
+
+  if should_set_learning_phase:
+    backend.set_learning_phase(old_learning_phase)
+
+  if mode == 'train':
+    return model.history
+  return results
+
+
+# Maintain compatibility with the existing names.
+fit_generator = functools.partial(model_iteration, mode='train')
+evaluate_generator = functools.partial(model_iteration, mode='test')
+predict_generator = functools.partial(model_iteration, mode='predict')
+
+
+def _get_next_batch(output_generator, mode):
+  """Retrieves the next batch of input data."""
   try:
-    if workers > 0:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
-      else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            wait_time=wait_time)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+    generator_output = next(output_generator)
+  except (errors.OutOfRangeError, StopIteration):
+    # Returning `None` will trigger looping to stop.
+    logging.warning('Your dataset iterator ran out of data.')
+    return None
+  if not isinstance(generator_output, tuple):
+    if mode == 'predict':
+      # Always wrap in a tuple.
+      return (generator_output,)
     else:
-      if is_sequence:
-        output_generator = iter(generator)
-      else:
-        output_generator = generator
-
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    while steps_done < steps:
-      generator_output = next(output_generator)
-      if not hasattr(generator_output, '__len__'):
-        raise ValueError('Output of generator should be a tuple '
-                         '(x, y, sample_weight) '
-                         'or (x, y). Found: ' + str(generator_output))
-      if len(generator_output) == 2:
-        x, y = generator_output
-        sample_weight = None
-      elif len(generator_output) == 3:
-        x, y, sample_weight = generator_output
-      else:
-        raise ValueError('Output of generator should be a tuple '
-                         '(x, y, sample_weight) '
-                         'or (x, y). Found: ' + str(generator_output))
-      outs = model.test_on_batch(x, y, sample_weight=sample_weight)
-
-      if isinstance(x, list):
-        batch_size = x[0].shape[0]
-      elif isinstance(x, dict):
-        batch_size = list(x.values())[0].shape[0]
-      else:
-        batch_size = x.shape[0]
-      if batch_size == 0:
-        raise ValueError('Received an empty batch. '
-                         'Batches should at least contain one item.')
-      all_outs.append(outs)
-
-      steps_done += 1
-      batch_sizes.append(batch_size)
-      if verbose == 1:
-        progbar.update(steps_done)
-
-  finally:
-    if enqueuer is not None:
-      enqueuer.stop()
-
-  if not isinstance(outs, list):
-    return np.average(np.asarray(all_outs), weights=batch_sizes)
-  else:
-    averages = []
-    for i in range(len(outs)):
-      if i not in stateful_metric_indices:
-        averages.append(
-            np.average([out[i] for out in all_outs], weights=batch_sizes))
-      else:
-        averages.append(np.float64(all_outs[-1][i]))
-    return averages
-
-
-def predict_generator(model,
-                      generator,
-                      steps=None,
-                      max_queue_size=10,
-                      workers=1,
-                      use_multiprocessing=False,
-                      verbose=0):
-  """See docstring for `Model.predict_generator`."""
-  if not context.executing_eagerly():
-    model._make_test_function()
-
-  steps_done = 0
-  wait_time = 0.01
-  all_outs = []
-  is_sequence = isinstance(generator, Sequence)
+      raise ValueError('Output of generator should be '
+                       'a tuple `(x, y, sample_weight)` '
+                       'or `(x, y)`. Found: ' + str(generator_output))
+
+  if len(generator_output) < 1 or len(generator_output) > 3:
+    raise ValueError('Output of generator should be '
+                     'a tuple `(x, y, sample_weight)` '
+                     'or `(x, y)` or (x,). Found: ' + str(generator_output))
+  return generator_output
+
+
+def _validate_arguments(is_sequence, use_multiprocessing, workers,
+                        steps_per_epoch, validation_data, validation_steps,
+                        mode, kwargs):
+  """Raises errors if arguments are invalid.
+
+  Arguments:
+    is_sequence: Boolean, whether data is a `keras.utils.data_utils.Sequence`
+      instance.
+    use_multiprocessing: Boolean. If `True`, use process-based threading. If
+      unspecified, `use_multiprocessing` will default to `False`. Note that
+      because this implementation relies on multiprocessing, you should not pass
+      non-picklable arguments to the generator as they can't be passed easily to
+      children processes.
+    workers: Integer. Maximum number of processes to spin up when using
+      process-based threading. If unspecified, `workers` will default to 1. If
+      0, will execute the generator on the main thread.
+    steps_per_epoch: Total number of steps (batches of samples) before declaring
+      one epoch finished and starting the next epoch. Ignored with the default
+      value of `None`.
+    validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x,
+      y)` or `(x, y, sample_weights)`) or a generator or
+      `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
+    validation_steps: Total number of steps (batches of samples) before
+      declaring validation finished.
+    mode: One of 'train'/'test'/'predict'.
+    kwargs: Additional arguments for backwards compatibility.
+
+  Raises:
+    ValueError: If `steps_per_epoch` or `validation_steps` are not passed
+      for data types that require them, or if unrecognized keyword
+      arguments are passed.
+  """
   if not is_sequence and use_multiprocessing and workers > 1:
     logging.warning(
         UserWarning('Using a generator with `use_multiprocessing=True`'
                     ' and multiple workers may duplicate your data.'
-                    ' Please consider using the`keras.utils.Sequence'
+                    ' Please consider using the `keras.utils.Sequence`'
                     ' class.'))
-  if steps is None:
-    if is_sequence:
-      steps = len(generator)
-    else:
-      raise ValueError('`steps=None` is only valid for a generator'
-                       ' based on the `keras.utils.Sequence` class.'
-                       ' Please specify `steps` or use the'
-                       ' `keras.utils.Sequence` class.')
-  enqueuer = None
 
-  try:
-    if workers > 0:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
-      else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            wait_time=wait_time)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+  if steps_per_epoch is None:
+    arg_name = 'steps_per_epoch' if mode == 'train' else 'steps'
+    raise ValueError('Please specify the number of steps via the '
+                     '`{}` argument.'.format(arg_name))
+
+  val_gen = (
+      data_utils.is_generator_or_sequence(validation_data) or
+      isinstance(validation_data, iterator_ops.EagerIterator) or
+      isinstance(validation_data, dataset_ops.DatasetV2))
+  if (val_gen and not isinstance(validation_data, data_utils.Sequence) and
+      not validation_steps):
+    raise ValueError('Please specify the `validation_steps` argument.')
+
+  if any(k != 'steps' for k in kwargs):
+    raise ValueError('Invalid arguments passed: {}'.format(
+        [k for k in kwargs if k != 'steps']))
+
+
+def convert_to_generator_like(data,
+                              batch_size=None,
+                              steps_per_epoch=None,
+                              epochs=1,
+                              shuffle=False):
+  """Make a generator out of NumPy or EagerTensor inputs.
+
+  Arguments:
+    data: Either a generator or `keras.utils.data_utils.Sequence` object or
+      `Dataset` or `EagerIterator` or a {1,2,3}-tuple of NumPy arrays or
+      EagerTensors. If a tuple, the elements represent `(x, y, sample_weights)`
+      and may be `None` or `[None]`.
+    batch_size: Used when creating a generator out of tuples of NumPy arrays or
+      EagerTensors.
+    steps_per_epoch: Steps of the generator to run each epoch.
+    epochs: Total number of epochs to run.
+    shuffle: Whether the data should be shuffled.
+
+  Returns:
+    - Generator or `keras.utils.data_utils.Sequence` or EagerIterator.
+
+  Raises:
+    - ValueError: If `batch_size` is not provided for NumPy or EagerTensor
+      inputs.
+  """
+  if isinstance(data, tuple):
+    # Scrub `Nones` that might have been passed for `targets`, `sample_weights`.
+    data = tuple(
+        ele for ele in data if not all(e is None for e in nest.flatten(ele)))
+    if len(data) == 1:
+      data = data[0]
+
+  if data_utils.is_generator_or_sequence(data) or isinstance(
+      data, iterator_ops.EagerIterator):
+    if isinstance(data, data_utils.Sequence):
+      steps_per_epoch = len(data)
+    return data, steps_per_epoch
+  if isinstance(data, dataset_ops.DatasetV2):
+    return dataset_ops.make_one_shot_iterator(data), steps_per_epoch
+
+  # Create generator from NumPy or EagerTensor Input.
+  num_samples = int(nest.flatten(data)[0].shape[0])
+  if batch_size is None:
+    raise ValueError('You must specify `batch_size`')
+  steps_per_epoch = int(math.ceil(num_samples / batch_size))
+
+  def _gen(data):
+    """Makes a generator out of a structure of NumPy/EagerTensors."""
+    index_array = np.arange(num_samples)
+    for _ in range(epochs):
+      if shuffle:
+        np.random.shuffle(index_array)
+      batches = generic_utils.make_batches(num_samples, batch_size)
+      for (batch_start, batch_end) in batches:
+        batch_ids = index_array[batch_start:batch_end]
+        flat_batch_data = training_utils.slice_arrays(
+            nest.flatten(data), batch_ids, contiguous=(not shuffle))
+        yield nest.pack_sequence_as(data, flat_batch_data)
+
+  return _gen(data), steps_per_epoch
+
+
+def _make_enqueued_generator(generator,
+                             workers=1,
+                             use_multiprocessing=False,
+                             max_queue_size=10,
+                             shuffle=False):
+  """Create a buffered queue of next elements of the generator."""
+  is_sequence = isinstance(generator, data_utils.Sequence)
+  enqueuer = None
+  if workers > 0:
+    if is_sequence:
+      enqueuer = data_utils.OrderedEnqueuer(
+          generator, use_multiprocessing=use_multiprocessing, shuffle=shuffle)
     else:
-      if is_sequence:
-        output_generator = iter(generator)
-      else:
-        output_generator = generator
-
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    while steps_done < steps:
-      generator_output = next(output_generator)
-      if isinstance(generator_output, tuple):
-        # Compatibility with the generators
-        # used for training.
-        if len(generator_output) == 2:
-          x, _ = generator_output
-        elif len(generator_output) == 3:
-          x, _, _ = generator_output
-        else:
-          raise ValueError('Output of generator should be '
-                           'a tuple `(x, y, sample_weight)` '
-                           'or `(x, y)`. Found: ' + str(generator_output))
-      else:
-        # Assumes a generator that only
-        # yields inputs (not targets and sample weights).
-        x = generator_output
-
-      outs = model.predict_on_batch(x)
-      if not isinstance(outs, list):
-        outs = [outs]
-
-      if not all_outs:
-        for out in outs:
-          all_outs.append([])
-
-      for i, out in enumerate(outs):
-        all_outs[i].append(out)
-      steps_done += 1
-      if verbose == 1:
-        progbar.update(steps_done)
-
-  finally:
-    if enqueuer is not None:
-      enqueuer.stop()
-
-  if len(all_outs) == 1:
-    if steps_done == 1:
-      return all_outs[0][0]
+      enqueuer = data_utils.GeneratorEnqueuer(
+          generator, use_multiprocessing=use_multiprocessing)
+    enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+    output_generator = enqueuer.get()
+  else:
+    if is_sequence:
+      output_generator = data_utils.iter_sequence_infinite(generator)
     else:
-      return np.concatenate(all_outs[0])
-  if steps_done == 1:
-    return [out[0] for out in all_outs]
+      output_generator = generator
+  return output_generator, enqueuer
+
+
+def _make_execution_function(model, mode, class_weight=None):
+  """Makes function to run one step of model execution."""
+  if mode == 'train':
+    if not context.executing_eagerly():
+      model._make_fit_function()
+    f = functools.partial(model.train_on_batch, class_weight=class_weight)
+  elif mode == 'test':
+    if not context.executing_eagerly():
+      model._make_eval_function()
+    f = model.test_on_batch
   else:
-    return [np.concatenate(out) for out in all_outs]
+    # Match signature of other modes to allow
+    # 1, 2, or 3-tuples from generator
+    def predict_on_batch(x, y=None, sample_weights=None):  # pylint: disable=unused-argument
+      return model.predict_on_batch(x)
+
+    f = predict_on_batch
+
+  # Maintain stateful metrics across batch-level calls.
+  if mode != 'predict':
+    f = functools.partial(f, reset_metrics=False)
+
+  return f
+
+
+def _get_num_samples_or_steps(data, steps_per_epoch):
+  """Returns number of samples or steps, and whether to use steps count mode."""
+  flat_inputs = nest.flatten(data)
+  if hasattr(flat_inputs[0], 'shape'):
+    return int(flat_inputs[0].shape[0]), False
+  return steps_per_epoch, True
diff --git a/tensorflow/python/keras/engine/training_generator_test.py b/tensorflow/python/keras/engine/training_generator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8941428e43ac5d7b4b439d86795e93a70fd270f0
--- /dev/null
+++ b/tensorflow/python/keras/engine/training_generator_test.py
@@ -0,0 +1,389 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.engine import training_generator
+from tensorflow.python.platform import test
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.util import nest
+
+
+def custom_generator(mode=2):
+  batch_size = 10
+  num_samples = 50
+  arr_data = np.random.random((num_samples, 2))
+  arr_labels = np.random.random((num_samples, 4))
+  arr_weights = np.random.random((num_samples,))
+  i = 0
+  while True:
+    batch_index = i * batch_size % num_samples
+    i += 1
+    start = batch_index
+    end = start + batch_size
+    x = arr_data[start: end]
+    y = arr_labels[start: end]
+    w = arr_weights[start: end]
+    if mode == 1:
+      yield x
+    elif mode == 2:
+      yield x, y
+    else:
+      yield x, y, w
+
+
+@tf_test_util.run_all_in_graph_and_eager_modes
+class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
+
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
+  @parameterized.parameters('sequential', 'functional')
+  def test_fit_generator_method(self, model_type):
+    if model_type == 'sequential':
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    else:
+      model = testing_utils.get_small_functional_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(
+        loss='mse',
+        optimizer='sgd',
+        metrics=['mae', metrics_module.CategoricalAccuracy()])
+
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        workers=4,
+                        use_multiprocessing=True)
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        use_multiprocessing=False)
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        use_multiprocessing=False,
+                        validation_data=custom_generator(),
+                        validation_steps=10)
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        validation_data=custom_generator(),
+                        validation_steps=1,
+                        workers=0)
+
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
+  @parameterized.parameters('sequential', 'functional')
+  def test_evaluate_generator_method(self, model_type):
+    if model_type == 'sequential':
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    else:
+      model = testing_utils.get_small_functional_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(
+        loss='mse',
+        optimizer='sgd',
+        metrics=['mae', metrics_module.CategoricalAccuracy()])
+    model.summary()
+
+    model.evaluate_generator(custom_generator(),
+                             steps=5,
+                             max_queue_size=10,
+                             workers=2,
+                             verbose=1,
+                             use_multiprocessing=True)
+    model.evaluate_generator(custom_generator(),
+                             steps=5,
+                             max_queue_size=10,
+                             use_multiprocessing=False)
+    model.evaluate_generator(custom_generator(),
+                             steps=5,
+                             max_queue_size=10,
+                             use_multiprocessing=False,
+                             workers=0)
+
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
+  @parameterized.parameters('sequential', 'functional')
+  def test_predict_generator_method(self, model_type):
+    if model_type == 'sequential':
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    else:
+      model = testing_utils.get_small_functional_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(
+        loss='mse',
+        optimizer='sgd',
+        metrics=['mae', metrics_module.CategoricalAccuracy()])
+
+    model.predict_generator(custom_generator(),
+                            steps=5,
+                            max_queue_size=10,
+                            workers=2,
+                            use_multiprocessing=True)
+    model.predict_generator(custom_generator(),
+                            steps=5,
+                            max_queue_size=10,
+                            use_multiprocessing=False)
+    model.predict_generator(custom_generator(),
+                            steps=5,
+                            max_queue_size=10,
+                            workers=0)
+    # Test generator with just inputs (no targets)
+    model.predict_generator(custom_generator(mode=1),
+                            steps=5,
+                            max_queue_size=10,
+                            workers=2,
+                            use_multiprocessing=True)
+    model.predict_generator(custom_generator(mode=1),
+                            steps=5,
+                            max_queue_size=10,
+                            use_multiprocessing=False)
+    model.predict_generator(custom_generator(mode=1),
+                            steps=5,
+                            max_queue_size=10,
+                            workers=0)
+
+  def test_generator_methods_with_sample_weights(self):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(4, input_shape=(2,)))
+    model.compile(
+        loss='mse',
+        optimizer='sgd',
+        metrics=['mae', metrics_module.CategoricalAccuracy()])
+
+    model.fit_generator(custom_generator(mode=3),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        use_multiprocessing=False)
+    model.fit_generator(custom_generator(mode=3),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        use_multiprocessing=False,
+                        validation_data=custom_generator(mode=3),
+                        validation_steps=10)
+    model.predict_generator(custom_generator(mode=3),
+                            steps=5,
+                            max_queue_size=10,
+                            use_multiprocessing=False)
+    model.evaluate_generator(custom_generator(mode=3),
+                             steps=5,
+                             max_queue_size=10,
+                             use_multiprocessing=False)
+
+  def test_generator_methods_invalid_use_case(self):
+
+    def invalid_generator():
+      while 1:
+        yield 0
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(4, input_shape=(2,)))
+    model.compile(loss='mse', optimizer='sgd')
+
+    with self.assertRaises(ValueError):
+      model.fit_generator(invalid_generator(),
+                          steps_per_epoch=5,
+                          epochs=1,
+                          verbose=1,
+                          max_queue_size=10,
+                          use_multiprocessing=False)
+    with self.assertRaises(ValueError):
+      model.fit_generator(custom_generator(),
+                          steps_per_epoch=5,
+                          epochs=1,
+                          verbose=1,
+                          max_queue_size=10,
+                          use_multiprocessing=False,
+                          validation_data=invalid_generator(),
+                          validation_steps=10)
+    with self.assertRaises(AttributeError):
+      model.predict_generator(invalid_generator(),
+                              steps=5,
+                              max_queue_size=10,
+                              use_multiprocessing=False)
+    with self.assertRaises(ValueError):
+      model.evaluate_generator(invalid_generator(),
+                               steps=5,
+                               max_queue_size=10,
+                               use_multiprocessing=False)
+
+  def test_generator_input_to_fit_eval_predict(self):
+    val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
+
+    def ones_generator():
+      while True:
+        yield np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
+
+    inputs = keras.layers.Input(shape=(10,))
+    x = keras.layers.Dense(10, activation='relu')(inputs)
+    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
+    model = keras.Model(inputs, outputs)
+
+    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
+    model.fit(
+        ones_generator(),
+        steps_per_epoch=2,
+        validation_data=val_data,
+        epochs=2)
+    model.evaluate(ones_generator(), steps=2)
+    model.predict(ones_generator(), steps=2)
+
+
+@tf_test_util.run_all_in_graph_and_eager_modes
+class TestGeneratorMethodsWithSequences(test.TestCase):
+
+  def test_training_with_sequences(self):
+
+    class DummySequence(keras.utils.Sequence):
+
+      def __getitem__(self, idx):
+        return np.zeros([10, 2]), np.ones([10, 4])
+
+      def __len__(self):
+        return 10
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(4, input_shape=(2,)))
+    model.compile(loss='mse', optimizer='sgd')
+
+    model.fit_generator(DummySequence(),
+                        steps_per_epoch=10,
+                        validation_data=custom_generator(),
+                        validation_steps=1,
+                        max_queue_size=10,
+                        workers=0,
+                        use_multiprocessing=True)
+    model.fit_generator(DummySequence(),
+                        steps_per_epoch=10,
+                        validation_data=custom_generator(),
+                        validation_steps=1,
+                        max_queue_size=10,
+                        workers=0,
+                        use_multiprocessing=False)
+
+  def test_sequence_input_to_fit_eval_predict(self):
+    val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
+
+    class CustomSequence(keras.utils.Sequence):
+
+      def __getitem__(self, idx):
+        return np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
+
+      def __len__(self):
+        return 2
+
+    inputs = keras.layers.Input(shape=(10,))
+    x = keras.layers.Dense(10, activation='relu')(inputs)
+    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
+    model = keras.Model(inputs, outputs)
+
+    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
+    model.fit(CustomSequence(), validation_data=val_data, epochs=2)
+    model.evaluate(CustomSequence())
+    model.predict(CustomSequence())
+
+    with self.assertRaisesRegexp(ValueError, '`y` argument is not supported'):
+      model.fit(CustomSequence(), y=np.ones([10, 1]))
+
+    with self.assertRaisesRegexp(ValueError,
+                                 '`sample_weight` argument is not supported'):
+      model.fit(CustomSequence(), sample_weight=np.ones([10, 1]))
+
+
+@tf_test_util.run_all_in_graph_and_eager_modes
+class TestConvertToGeneratorLike(test.TestCase, parameterized.TestCase):
+  simple_inputs = (np.ones((10, 10)), np.ones((10, 1)))
+  nested_inputs = ((np.ones((10, 10)), np.ones((10, 20))), (np.ones((10, 1)),
+                                                            np.ones((10, 3))))
+
+  def _make_dataset(self, inputs, batches):
+    return dataset_ops.DatasetV2.from_tensors(inputs).repeat(batches)
+
+  def _make_iterator(self, inputs, batches):
+    return dataset_ops.make_one_shot_iterator(
+        self._make_dataset(inputs, batches))
+
+  def _make_generator(self, inputs, batches):
+
+    def _gen():
+      for _ in range(batches):
+        yield inputs
+
+    return _gen()
+
+  def _make_numpy(self, inputs, _):
+    return inputs
+
+  @parameterized.named_parameters(
+      ('simple_dataset', _make_dataset, simple_inputs),
+      ('simple_iterator', _make_iterator, simple_inputs),
+      ('simple_generator', _make_generator, simple_inputs),
+      ('simple_numpy', _make_numpy, simple_inputs),
+      ('nested_dataset', _make_dataset, nested_inputs),
+      ('nested_iterator', _make_iterator, nested_inputs),
+      ('nested_generator', _make_generator, nested_inputs),
+      ('nested_numpy', _make_numpy, nested_inputs))
+  def test_convert_to_generator_like(self, input_fn, inputs):
+    expected_batches = 5
+    data = input_fn(self, inputs, expected_batches)
+
+    # Dataset and Iterator not supported in Legacy Graph mode.
+    if (not context.executing_eagerly() and
+        isinstance(data, (dataset_ops.DatasetV2, iterator_ops.Iterator))):
+      return
+
+    generator, steps = training_generator.convert_to_generator_like(
+        data, batch_size=2, steps_per_epoch=expected_batches)
+    self.assertEqual(steps, expected_batches)
+
+    for _ in range(expected_batches):
+      outputs = next(generator)
+    nest.assert_same_structure(outputs, inputs)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/engine/training_gpu_test.py b/tensorflow/python/keras/engine/training_gpu_test.py
index 5825ce814fd84bf59637f6079e7402d752e2b77b..45dcfe43995b280072395b11a573e20d57bcadc7 100644
--- a/tensorflow/python/keras/engine/training_gpu_test.py
+++ b/tensorflow/python/keras/engine/training_gpu_test.py
@@ -69,7 +69,7 @@ class TrainingGPUTest(test.TestCase):
       return simple_model
 
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         losses_to_test = ['sparse_categorical_crossentropy',
                           'categorical_crossentropy', 'binary_crossentropy']
 
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 54ad74c08b906a60e1821b47e5cefac8ff8287ad..91a0c7cc2f2dc5cf3e76eafdaaa79cfe6bc10336 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -18,11 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import io
 import logging
-import os
-import unittest
+import sys
 
 import numpy as np
+import six
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
@@ -31,11 +32,13 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.callbacks import Callback
 from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
-from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
@@ -48,19 +51,20 @@ except ImportError:
   scipy_sparse = None
 
 
-class TrainingTest(test.TestCase):
+class TrainingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes
   def test_fit_on_arrays(self):
-    a = keras.layers.Input(shape=(3,), name='input_a')
-    b = keras.layers.Input(shape=(3,), name='input_b')
+    input_a = keras.layers.Input(shape=(3,), name='input_a')
+    input_b = keras.layers.Input(shape=(3,), name='input_b')
 
     dense = keras.layers.Dense(4, name='dense')
-    c = dense(a)
-    d = dense(b)
-    e = keras.layers.Dropout(0.5, name='dropout')(c)
+    dropout = keras.layers.Dropout(0.5, name='dropout')
+    branch_a = [input_a, dense]
+    branch_b = [input_b, dense, dropout]
 
-    model = keras.models.Model([a, b], [d, e])
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
 
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
@@ -69,7 +73,8 @@ class TrainingTest(test.TestCase):
         optimizer,
         loss,
         metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
-        loss_weights=loss_weights)
+        loss_weights=loss_weights,
+        run_eagerly=testing_utils.should_run_eagerly())
 
     input_a_np = np.random.random((10, 3))
     input_b_np = np.random.random((10, 3))
@@ -133,61 +138,63 @@ class TrainingTest(test.TestCase):
         verbose=0,
         validation_split=0.2)
 
-    # Test with dictionary inputs
-    model.fit(
-        {
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        },
-        epochs=1,
-        batch_size=5,
-        verbose=0)
-    model.fit(
-        {
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        },
-        epochs=1,
-        batch_size=5,
-        verbose=1)
-    model.fit(
-        {
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        },
-        validation_data=({
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        }),
-        epochs=1,
-        batch_size=5,
-        verbose=0)
-    model.train_on_batch({
-        'input_a': input_a_np,
-        'input_b': input_b_np
-    }, {
-        'dense': output_d_np,
-        'dropout': output_e_np
-    })
+    if testing_utils.get_model_type() == 'functional':
+      # Test with dictionary inputs
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
+          epochs=1,
+          batch_size=5,
+          verbose=1)
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
+          validation_data=({
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          }),
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.train_on_batch({
+          'input_a': input_a_np,
+          'input_b': input_b_np
+      }, {
+          'dense': output_d_np,
+          'dropout': output_e_np
+      })
 
     # Test with lists for loss, metrics
     loss = ['mae', 'mse']
     model.compile(
         optimizer,
         loss,
-        metrics=[metrics_module.CategoricalAccuracy(), 'mae'])
+        metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         epochs=1,
@@ -195,13 +202,15 @@ class TrainingTest(test.TestCase):
         verbose=0)
 
     # Test with dictionaries for loss, metrics, loss weights
-    loss = {'dense': 'mse', 'dropout': 'mae'}
-    loss_weights = {'dense': 1., 'dropout': 0.5}
-    metrics = {
-        'dense': 'mse',
-        'dropout': metrics_module.CategoricalAccuracy()
-    }
-    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    if testing_utils.get_model_type() == 'functional':
+      loss = {'dense': 'mse', 'dropout': 'mae'}
+      loss_weights = {'dense': 1., 'dropout': 0.5}
+      metrics = {
+          'dense': 'mse',
+          'dropout': metrics_module.CategoricalAccuracy()
+      }
+      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights,
+                    run_eagerly=testing_utils.should_run_eagerly())
     model.fit(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         epochs=1,
@@ -237,7 +246,8 @@ class TrainingTest(test.TestCase):
     x = keras.layers.Input(shape=(3,), name='input_a')
     y = keras.layers.Dense(4)(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer, loss='mse')
+    model.compile(optimizer, loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     # This will work
     model.fit([input_a_np], output_d_np, epochs=1)
     with self.assertRaises(ValueError):
@@ -253,7 +263,7 @@ class TrainingTest(test.TestCase):
               batch_size=5,
               verbose=2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_evaluate_predict_on_arrays(self):
     a = keras.layers.Input(shape=(3,), name='input_a')
     b = keras.layers.Input(shape=(3,), name='input_b')
@@ -273,7 +283,8 @@ class TrainingTest(test.TestCase):
         loss,
         metrics=['mae', metrics_module.CategoricalAccuracy()],
         loss_weights=loss_weights,
-        sample_weight_mode=None)
+        sample_weight_mode=None,
+        run_eagerly=testing_utils.should_run_eagerly())
 
     input_a_np = np.random.random((10, 3))
     input_b_np = np.random.random((10, 3))
@@ -334,74 +345,133 @@ class TrainingTest(test.TestCase):
     })
     self.assertEqual(len(out), 2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_invalid_loss(self):
-    num_classes = 5
-    train_samples = 1000
-    test_samples = 1000
-    input_dim = 5
+  @keras_parameterized.run_all_keras_modes
+  def test_activity_regularizer_fit(self):
+    loss = {}
+    for reg in [None, 'l2']:
+      inputs = keras.layers.Input(shape=(10,))
+      x = keras.layers.Dense(
+          10, activation='relu', activity_regularizer=reg,
+          kernel_initializer='ones', use_bias=False)(inputs)
+      outputs = keras.layers.Dense(1, activation='sigmoid',
+                                   kernel_initializer='ones', use_bias=False)(x)
+      model = keras.Model(inputs, outputs)
 
-    model = testing_utils.get_small_sequential_mlp(
-        num_hidden=10, num_classes=num_classes, input_dim=input_dim)
+      x = np.ones((10, 10), 'float32')
+      y = np.ones((10, 1), 'float32')
+
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      model.compile(optimizer, 'binary_crossentropy',
+                    run_eagerly=testing_utils.should_run_eagerly())
+      model.fit(x, y, batch_size=2, epochs=5)
+      loss[reg] = model.evaluate(x, y)
+    self.assertLess(loss[None], loss['l2'])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_activity_regularizer_loss_value(self):
+    inputs = keras.layers.Input(shape=(10,))
+    outputs = keras.layers.Dense(
+        1,
+        kernel_initializer=keras.initializers.zeros(),
+        bias_initializer=keras.initializers.ones(),
+        activity_regularizer='l2')(
+            inputs)
+    model = keras.Model(inputs, outputs)
+    x = np.ones((10, 10), 'float32')
+    y = np.ones((10, 1), 'float32')
     optimizer = RMSPropOptimizer(learning_rate=0.001)
-    model.compile(optimizer, loss='categorical_crossentropy')
-    np.random.seed(1337)
-    (x_train, y_train), (_, _) = testing_utils.get_test_data(
-        train_samples=train_samples,
-        test_samples=test_samples,
-        input_shape=(input_dim,),
-        num_classes=num_classes)
+    model.compile(optimizer, 'binary_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    loss = model.test_on_batch(x, y)
+    self.assertAlmostEqual(0.01, loss, places=4)
 
-    with self.assertRaises(ValueError):
-      model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
+  @keras_parameterized.run_all_keras_modes
+  def test_activity_regularizer_batch_independent(self):
+    inputs = keras.layers.Input(shape=(10,))
+    x = keras.layers.Dense(
+        10, activation='relu', activity_regularizer='l2')(
+            inputs)
+    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
+    model = keras.Model(inputs, outputs)
 
-    if not context.executing_eagerly():
-      # TODO(psv): Investigate these use cases in eager mode.
-      with self.assertRaises(ValueError):
-        model.fit(x_train, y_train)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model.compile(optimizer, 'binary_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
 
-      with self.assertRaises(ValueError):
-        model.compile(optimizer, loss=None)
+    x = np.ones((10, 10), 'float32')
+    y = np.ones((10, 1), 'float32')
+    loss_small_batch = model.test_on_batch(x, y)
+
+    x2 = np.ones((20, 10), 'float32')
+    y2 = np.ones((20, 1), 'float32')
+    loss_big_batch = model.test_on_batch(x2, y2)
+
+    self.assertAlmostEqual(loss_small_batch, loss_big_batch, places=4)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_activity_regularizer_in_model_call(self):
 
+    class MyModel(keras.Model):
+
+      def call(self, inputs):
+        self.add_loss(inputs)
+        return inputs
+
+    x = ops.convert_to_tensor(1.)
+    model = MyModel()
+    _ = model(x)
+    self.assertEqual(1, len(model.losses))
+
+  @keras_parameterized.run_all_keras_modes
   def test_training_on_sparse_data_with_dense_placeholders(self):
+    # TODO(kaftan) Test seems to not work, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      self.skipTest('Skipping running model eagerly.')
+
     if scipy_sparse is None:
       return
 
-    with self.cached_session():
-      test_inputs = [
-          scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)
-      ]
-      test_outputs = [
-          scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)
-      ]
-      in1 = keras.layers.Input(shape=(3,))
-      in2 = keras.layers.Input(shape=(3,))
-      out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
-      out2 = keras.layers.Dense(4, name='dense_1')(in2)
-      model = keras.Model([in1, in2], [out1, out2])
-      model.predict(test_inputs, batch_size=2)
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      model.compile(
-          optimizer,
-          'mse',
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
-      model.fit(test_inputs, test_outputs,
-                epochs=1, batch_size=2, validation_split=0.5)
-      model.evaluate(test_inputs, test_outputs, batch_size=2)
+    test_inputs = [
+        scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)
+    ]
+    test_outputs = [
+        scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)
+    ]
+    in1 = keras.layers.Input(shape=(3,))
+    in2 = keras.layers.Input(shape=(3,))
+    out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
+    out2 = keras.layers.Dense(4, name='dense_1')(in2)
+    model = keras.Model([in1, in2], [out1, out2])
+    model.predict(test_inputs, batch_size=2)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model.compile(
+        optimizer,
+        'mse',
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(test_inputs, test_outputs,
+              epochs=1, batch_size=2, validation_split=0.5)
+    model.evaluate(test_inputs, test_outputs, batch_size=2)
 
+  @keras_parameterized.run_all_keras_modes
   def test_compile_with_sparse_placeholders(self):
-    with self.cached_session():
-      input_layer = keras.layers.Input(shape=(10,), sparse=True)
-      weights = variables_lib.Variable(
-          np.ones((10, 1)).astype(np.float32), name='weights')
-      weights_mult = lambda x: sparse_ops.sparse_tensor_dense_matmul(x, weights)
-      output_layer = keras.layers.Lambda(weights_mult)(input_layer)
-      model = keras.Model([input_layer], output_layer)
-      model.compile(
-          loss='binary_crossentropy',
-          optimizer=keras.optimizers.Adam(lr=0.0001),
-          metrics=['accuracy'])
+    # TODO(kaftan) Test seems to not work, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      self.skipTest('Skipping running model eagerly.')
+
+    input_layer = keras.layers.Input(shape=(10,), sparse=True)
+    weights = variables_lib.Variable(
+        np.ones((10, 1)).astype(np.float32), name='weights')
+    weights_mult = lambda x: sparse_ops.sparse_tensor_dense_matmul(x, weights)
+    output_layer = keras.layers.Lambda(weights_mult)(input_layer)
+    model = keras.Model([input_layer], output_layer)
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=keras.optimizers.Adam(lr=0.0001),
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_that_trainable_disables_updates(self):
     val_a = np.random.random((10, 4))
     val_out = np.random.random((10, 4))
@@ -440,7 +510,261 @@ class TrainingTest(test.TestCase):
       x2 = model.predict(val_a)
       self.assertAllClose(x1, x2, atol=1e-7)
 
+  def test_logs_passed_to_callbacks(self):
+    with self.cached_session():
+      input_dim = 5
+      num_classes = 1
+
+      class TestCallback(Callback):
+
+        def __init__(self):
+          super(TestCallback, self).__init__()
+          self.epoch_end_logs = None
+          self.batch_end_logs = None
+          self.epoch_end_call_count = 0
+          self.batch_end_call_count = 0
+
+        def on_epoch_end(self, epoch, logs=None):
+          self.epoch_end_logs = logs
+          self.epoch_end_call_count += 1
+
+        def on_batch_end(self, batch, logs=None):
+          self.batch_end_logs = logs
+          self.batch_end_call_count += 1
+
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=10, num_classes=num_classes, input_dim=input_dim)
+      model.compile(
+          loss='binary_crossentropy',
+          metrics=['acc'],
+          weighted_metrics=['mae'],
+          optimizer=RMSPropOptimizer(learning_rate=0.01))
+
+      np.random.seed(1337)
+      (x_train, y_train), (_, _) = testing_utils.get_test_data(
+          train_samples=10,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+
+      test_callback = TestCallback()
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=2,
+          epochs=2,
+          verbose=0,
+          callbacks=[test_callback],
+          validation_data=(x_train, y_train))
+      self.assertEqual(test_callback.batch_end_call_count, 10)
+      self.assertEqual(test_callback.epoch_end_call_count, 2)
+      self.assertSetEqual(
+          set(test_callback.batch_end_logs.keys()),
+          set(['batch', 'size', 'acc', 'loss', 'weighted_mean_absolute_error']))
+      self.assertSetEqual(
+          set(test_callback.epoch_end_logs.keys()),
+          set([
+              'acc', 'loss', 'weighted_mean_absolute_error', 'val_acc',
+              'val_loss', 'val_weighted_mean_absolute_error'
+          ]))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_mismatched_output_shape_and_target_shape(self):
+    model = keras.Sequential([
+        keras.layers.Dense(2, input_shape=(3, 4)),
+        keras.layers.Dense(5),
+    ])
+    model.compile(RMSPropOptimizer(learning_rate=0.001),
+                  loss='sparse_categorical_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    # Test with Numpy data
+    x_train = np.random.random((10, 3, 4))
+    y_train = np.random.randint(0, 5, size=(10, 3))
+    model.fit(x_train, y_train, batch_size=5, epochs=1)
+
+    # Test with iterator
+    dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+    dataset = dataset.repeat(10)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    model.fit(iterator, epochs=1, steps_per_epoch=2)
+
+    if context.executing_eagerly():
+      # Test with eager execution
+      model.compile(RMSPropOptimizer(learning_rate=0.001),
+                    loss='sparse_categorical_crossentropy',
+                    run_eagerly=True)
+      model.fit(x_train, y_train, batch_size=5, epochs=1)
+
+      # Test with eager execution and iterator
+      model.fit(iterator, epochs=1, steps_per_epoch=2)
+
+  def test_losses_in_defun(self):
+    with context.eager_mode():
+      layer = keras.layers.Dense(1, kernel_regularizer='l1')
+      layer(array_ops.ones([1, 10]))
+
+      @function.defun
+      def get_losses():
+        return layer.losses
+
+      self.assertAllEqual(
+          self.evaluate(layer.losses), self.evaluate(get_losses()))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_logging(self):
+    mock_stdout = io.BytesIO() if six.PY2 else io.StringIO()
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, activation='relu'))
+    model.add(keras.layers.Dense(1, activation='sigmoid'))
+    model.compile(
+        RMSPropOptimizer(learning_rate=0.001), loss='binary_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
+    with test.mock.patch.object(sys, 'stdout', mock_stdout):
+      model.fit(
+          np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'), epochs=10)
+    self.assertTrue('Epoch 5/10' in mock_stdout.getvalue())
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_training_with_loss_instance(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+    loss_weights = [1., 0.5]
+    model.compile(
+        RMSPropOptimizer(learning_rate=0.001),
+        loss=keras.losses.MeanSquaredError(),
+        metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
+        loss_weights=loss_weights)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    model.fit([input_a_np, input_b_np], [output_d_np, output_e_np],
+              epochs=1,
+              batch_size=5)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_static_batch_in_input_layer(self):
+
+    class Counter(keras.callbacks.Callback):
+
+      def __init__(self):
+        self.batches = 0
+
+      def on_batch_end(self, batch, logs=None):
+        self.batches += 1
+
+    x, y = np.ones((64, 10), 'float32'), np.ones((64, 1), 'float32')
+
+    for batch_size, expected_batches in [(None, 2), (4, 16)]:
+      inputs = keras.Input(batch_size=batch_size, shape=(10,))
+      outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
+      model = keras.Model(inputs, outputs)
+
+      model.compile(keras.optimizer_v2.adam.Adam(0.001), 'binary_crossentropy')
+      counter = Counter()
+      model.fit(x, y, callbacks=[counter])
+      self.assertEqual(counter.batches, expected_batches)
+
+      model = keras.Sequential(
+          [keras.layers.Dense(1, batch_input_shape=(batch_size, 10))])
+      model.compile(keras.optimizer_v2.adam.Adam(0.001), 'binary_crossentropy')
+      counter = Counter()
+      model.fit(x, y, callbacks=[counter])
+      self.assertEqual(counter.batches, expected_batches)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_static_batch_in_input_layer_consistency_checks(self):
+    x, y = np.ones((64, 10), 'float32'), np.ones((64, 1), 'float32')
+
+    inputs = keras.Input(batch_size=2, shape=(10,))
+    outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile(keras.optimizer_v2.adam.Adam(0.001), 'binary_crossentropy')
+    with self.assertRaisesRegexp(ValueError,
+                                 'incompatible with the specified batch size'):
+      model.fit(x, y, batch_size=4)
+
+    data = dataset_ops.DatasetV2.from_tensor_slices((x, y))
+    data = data.batch(4, drop_remainder=True)
+    with self.assertRaisesRegexp(ValueError,
+                                 'incompatible with the specified batch size'):
+      model.fit(data, steps_per_epoch=16)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_compatible_batch_size_functional_model(self):
+
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return array_ops.concat(inputs, axis=0)
+
+    input1 = keras.Input(batch_size=2, shape=(10,))
+    input2 = keras.Input(batch_size=3, shape=(10,))
+    outputs = MyLayer()([input1, input2])
+    with self.assertRaisesRegexp(ValueError,
+                                 'specified batch sizes of the Input Layers'):
+      keras.Model([input1, input2], outputs)
+
   @tf_test_util.run_in_graph_and_eager_modes
+  def test_calling_subclass_model_on_different_datasets(self):
+
+    class SubclassedModel(keras.models.Model):
+
+      def call(self, inputs):
+        return inputs * 2
+
+    model = SubclassedModel()
+    dataset_one = dataset_ops.Dataset.range(2).batch(2)
+    dataset_two = dataset_ops.Dataset.range(3, 10).batch(2)
+    self.assertAllEqual([[0], [2]], model.predict(dataset_one, steps=1))
+    self.assertAllEqual([[6], [8], [10], [12]],
+                        model.predict(dataset_two, steps=2))
+
+
+class TestExceptionsAndWarnings(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  def test_invalid_loss(self):
+    num_classes = 5
+    train_samples = 1000
+    test_samples = 1000
+    input_dim = 5
+
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=num_classes, input_dim=input_dim)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model.compile(optimizer, loss='categorical_crossentropy')
+    np.random.seed(1337)
+    (x_train, y_train), (_, _) = testing_utils.get_test_data(
+        train_samples=train_samples,
+        test_samples=test_samples,
+        input_shape=(input_dim,),
+        num_classes=num_classes)
+
+    with self.assertRaises(ValueError):
+      model.fit(x_train, np.concatenate([y_train, y_train], axis=-1))
+
+    if not context.executing_eagerly():
+      # TODO(psv): Investigate these use cases in eager mode.
+      with self.assertRaises(ValueError):
+        model.fit(x_train, y_train)
+
+      with self.assertRaises(ValueError):
+        model.compile(optimizer, loss=None,
+                      run_eagerly=testing_utils.should_run_eagerly())
+
+  @keras_parameterized.run_all_keras_modes
   def test_compile_warning_for_loss_missing_output(self):
     with self.cached_session():
       inp = keras.layers.Input(shape=(16,), name='input_a')
@@ -458,17 +782,19 @@ class TrainingTest(test.TestCase):
             metrics={
                 'dense_2': 'categorical_accuracy',
                 'dense_1': metrics_module.CategoricalAccuracy(),
-            })
+            },
+            run_eagerly=testing_utils.should_run_eagerly())
         msg = ('Output "dense_1" missing from loss dictionary. We assume this '
                'was done on purpose. The fit and evaluate APIs will not be '
                'expecting any data to be passed to "dense_1".')
         self.assertRegexpMatches(str(mock_log.call_args), msg)
 
 
-class LossWeightingTest(test.TestCase):
+class LossWeightingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_class_weights(self):
+  @keras_parameterized.run_all_keras_modes
+  # TODO(b/120562577): Test failing with assertion error.
+  def DISABLED_test_class_weights(self):
     num_classes = 5
     batch_size = 5
     epochs = 5
@@ -484,7 +810,8 @@ class LossWeightingTest(test.TestCase):
         loss='categorical_crossentropy',
         metrics=['acc', metrics_module.CategoricalAccuracy()],
         weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=learning_rate))
+        optimizer=RMSPropOptimizer(learning_rate=learning_rate),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     np.random.seed(1337)
     (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
@@ -536,26 +863,8 @@ class LossWeightingTest(test.TestCase):
         x_test[test_ids, :], y_test[test_ids, :], verbose=0)
     self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_sequential_model_fails_with_dict_inputs(self):
-    num_classes = 5
-    model = testing_utils.get_small_sequential_mlp(
-        num_hidden=10, num_classes=num_classes)
-    model.compile(
-        RMSPropOptimizer(learning_rate=0.001),
-        metrics=['acc'],
-        weighted_metrics=['mae'],
-        loss='categorical_crossentropy')
-
-    x = {'dense_input': np.random.random((10, 1))}
-    y = np.random.randint(num_classes, size=(10, 1))
-
-    with self.assertRaisesRegexp(
-        ValueError, 'Passing a dictionary input to a Sequential Model which '
-        'doesnt have FeatureLayer as the first layer is an error'):
-      model.fit(x, y, batch_size=5, epochs=1)
-
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
+  @tf_test_util.run_v1_only('b/120545219')
   def test_sample_weights(self):
     num_classes = 5
     batch_size = 5
@@ -572,7 +881,8 @@ class LossWeightingTest(test.TestCase):
         RMSPropOptimizer(learning_rate=learning_rate),
         metrics=['acc', metrics_module.CategoricalAccuracy()],
         weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
-        loss='categorical_crossentropy')
+        loss='categorical_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     np.random.seed(43)
     (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
@@ -620,13 +930,15 @@ class LossWeightingTest(test.TestCase):
           x_test[test_ids, :], y_test[test_ids, :], verbose=0)
       self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_warning_for_concurrent_sample_and_class_weights(self):
+
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(10, input_shape=(3,)))
     model.compile(
         loss='mse',
-        optimizer=RMSPropOptimizer(learning_rate=0.01))
+        optimizer=RMSPropOptimizer(learning_rate=0.01),
+        run_eagerly=testing_utils.should_run_eagerly())
     x_train = np.random.random((10, 3))
     y_train = np.random.random((10, 10))
     sample_weight = np.ones((y_train.shape[0]))
@@ -640,11 +952,19 @@ class LossWeightingTest(test.TestCase):
           verbose=0,
           sample_weight=sample_weight,
           class_weight=class_weight)
-      msg = ('The `class_weight` argument will be ignored.')
-      self.assertRegexpMatches(str(mock_log.call_args), msg)
+      msg = 'The `class_weight` argument will be ignored.'
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_temporal_sample_weights(self):
+      msg_found = False
+      for call_args in mock_log.call_args_list:
+        if msg in str(call_args):
+          msg_found = True
+
+      self.assertTrue(msg_found)
+
+  @keras_parameterized.run_all_keras_modes
+  @tf_test_util.run_v1_only('b/120545219')
+  # TODO(b/120562577): Test failing with assertion error.
+  def DISABLED_test_temporal_sample_weights(self):
     num_classes = 5
     batch_size = 5
     epochs = 5
@@ -701,7 +1021,8 @@ class LossWeightingTest(test.TestCase):
           loss='binary_crossentropy',
           metrics=['acc', metrics_module.CategoricalAccuracy()],
           weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
-          sample_weight_mode='temporal')
+          sample_weight_mode='temporal',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       model.fit(
           temporal_x_train,
@@ -733,7 +1054,7 @@ class LossWeightingTest(test.TestCase):
             temporal_x_test[test_ids], temporal_y_test[test_ids], verbose=0)
         self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_class_weight_invalid_use_case(self):
     num_classes = 5
     train_samples = 1000
@@ -750,7 +1071,8 @@ class LossWeightingTest(test.TestCase):
               input_shape=(timesteps, input_dim)))
       model.add(keras.layers.Activation('softmax'))
       optimizer = RMSPropOptimizer(learning_rate=learning_rate)
-      model.compile(optimizer, loss='binary_crossentropy')
+      model.compile(optimizer, loss='binary_crossentropy',
+                    run_eagerly=testing_utils.should_run_eagerly())
 
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=train_samples,
@@ -768,14 +1090,16 @@ class LossWeightingTest(test.TestCase):
 
       with self.assertRaises(ValueError):
         model.compile(
-            optimizer, loss='binary_crossentropy', sample_weight_mode=[])
+            optimizer, loss='binary_crossentropy', sample_weight_mode=[],
+            run_eagerly=testing_utils.should_run_eagerly())
 
       # Build multi-output model
       x = keras.Input((3,))
       y1 = keras.layers.Dense(4, name='1')(x)
       y2 = keras.layers.Dense(4, name='2')(x)
       model = keras.models.Model(x, [y1, y2])
-      model.compile(optimizer, loss='mse')
+      model.compile(optimizer, loss='mse',
+                    run_eagerly=testing_utils.should_run_eagerly())
       x_np = np.random.random((10, 3))
       y_np = np.random.random((10, 4))
       w_np = np.random.random((10,))
@@ -802,7 +1126,7 @@ class LossWeightingTest(test.TestCase):
         model.fit(x_np, [y_np, y_np], epochs=1,
                   sample_weight={'1': bad_w_np})
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_default_sample_weight(self):
     """Verifies that fit works without having to set sample_weight."""
 
@@ -823,38 +1147,46 @@ class LossWeightingTest(test.TestCase):
       optimizer = RMSPropOptimizer(learning_rate=learning_rate)
 
       # sample_weight_mode is a list and mode value is None
-      model.compile(optimizer, loss='mse', sample_weight_mode=[None])
+      model.compile(optimizer, loss='mse', sample_weight_mode=[None],
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a list and mode value is `temporal`
-      model.compile(optimizer, loss='mse', sample_weight_mode=['temporal'])
+      model.compile(optimizer, loss='mse', sample_weight_mode=['temporal'],
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a dict and mode value is None
       model.compile(
-          optimizer, loss='mse', sample_weight_mode={'time_distributed': None})
+          optimizer, loss='mse', sample_weight_mode={'time_distributed': None},
+          run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a dict and mode value is `temporal`
       model.compile(
           optimizer,
           loss='mse',
-          sample_weight_mode={'time_distributed': 'temporal'})
+          sample_weight_mode={'time_distributed': 'temporal'},
+          run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a not a list/dict and mode value is None
-      model.compile(optimizer, loss='mse', sample_weight_mode=None)
+      model.compile(optimizer, loss='mse', sample_weight_mode=None,
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a not a list/dict and mode value is `temporal`
-      model.compile(optimizer, loss='mse', sample_weight_mode='temporal')
+      model.compile(optimizer, loss='mse', sample_weight_mode='temporal',
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
 
-class LossMaskingTest(test.TestCase):
+class LossMaskingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_masking_graph_sequential(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       model = keras.models.Sequential()
@@ -862,13 +1194,16 @@ class LossMaskingTest(test.TestCase):
       model.add(
           keras.layers.TimeDistributed(
               keras.layers.Dense(1, kernel_initializer='one')))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.array([[[1], [1]], [[1], [1]]])
       loss = model.train_on_batch(x, y)
       self.assertEqual(float(loss), 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_masking_deferred_sequential(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       model = keras.models.Sequential()
@@ -876,13 +1211,16 @@ class LossMaskingTest(test.TestCase):
       model.add(
           keras.layers.TimeDistributed(
               keras.layers.Dense(1, kernel_initializer='one')))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.array([[[1], [1]], [[1], [1]]])
       loss = model.train_on_batch(x, y)
       self.assertEqual(float(loss), 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_masking_functional(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       inputs = keras.layers.Input((2, 1))
@@ -890,12 +1228,13 @@ class LossMaskingTest(test.TestCase):
       outputs = keras.layers.TimeDistributed(
           keras.layers.Dense(1, kernel_initializer='one'))(outputs)
       model = keras.Model(inputs, outputs)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.array([[[1], [1]], [[1], [1]]])
       loss = model.train_on_batch(x, y)
       self.assertEqual(float(loss), 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_mask_argument_in_layer(self):
     # Test that the mask argument gets correctly passed to a layer in the
     # functional API.
@@ -920,7 +1259,8 @@ class LossMaskingTest(test.TestCase):
       outputs = CustomMaskedLayer()(masked)
 
       model = keras.Model(inputs, outputs)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.random.random((5, 3))
       model.train_on_batch(x, y)
 
@@ -943,24 +1283,9 @@ class LossMaskingTest(test.TestCase):
               keras.backend.variable(weights), keras.backend.variable(mask)))
 
 
-class LearningPhaseTest(test.TestCase):
-
-  def test_empty_model_no_learning_phase(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      self.assertFalse(model.uses_learning_phase)
-
-  def test_dropout_has_learning_phase(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_dim=3))
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(2))
-      self.assertTrue(model.uses_learning_phase)
-
-
-class TestDynamicTrainability(test.TestCase):
+class TestDynamicTrainability(keras_parameterized.TestCase):
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_trainable_warning(self):
     with self.cached_session():
       x = np.random.random((5, 3))
@@ -974,6 +1299,7 @@ class TestDynamicTrainability(test.TestCase):
       model.train_on_batch(x, y)
       self.assertRaises(Warning)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_trainable_argument(self):
     with self.cached_session():
       x = np.random.random((5, 3))
@@ -1102,450 +1428,154 @@ class TestDynamicTrainability(test.TestCase):
       self.assertListEqual(outer_model.trainable_weights, [])
 
 
-class TestGeneratorMethods(test.TestCase):
-
-  @unittest.skipIf(
-      os.name == 'nt',
-      'use_multiprocessing=True does not work on windows properly.')
-  def test_generator_methods(self):
-    arr_data = np.random.random((50, 2))
-    arr_labels = np.random.random((50,))
-
-    def custom_generator():
-      batch_size = 10
-      num_samples = 50
-      while True:
-        batch_index = np.random.randint(0, num_samples - batch_size)
-        start = batch_index
-        end = start + batch_size
-        x = arr_data[start: end]
-        y = arr_labels[start: end]
-        yield x, y
-
-    with self.cached_session():
-      x = keras.Input((2,))
-      y = keras.layers.Dense(1)(x)
-      fn_model = keras.models.Model(x, y)
-      fn_model.compile(
-          loss='mse',
-          optimizer='sgd',
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
-
-      seq_model = keras.models.Sequential()
-      seq_model.add(keras.layers.Dense(1, input_shape=(2,)))
-      seq_model.compile(loss='mse', optimizer='sgd')
-
-      for model in [fn_model, seq_model]:
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_queue_size=10,
-                            workers=4,
-                            use_multiprocessing=True)
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_queue_size=10,
-                            use_multiprocessing=False)
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_queue_size=10,
-                            use_multiprocessing=False,
-                            validation_data=custom_generator(),
-                            validation_steps=10)
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            validation_data=custom_generator(),
-                            validation_steps=1,
-                            workers=0)
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_queue_size=10,
-                                workers=2,
-                                use_multiprocessing=True)
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_queue_size=10,
-                                use_multiprocessing=False)
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_queue_size=10,
-                                workers=0)
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_queue_size=10,
-                                 workers=2,
-                                 verbose=1,
-                                 use_multiprocessing=True)
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_queue_size=10,
-                                 use_multiprocessing=False)
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_queue_size=10,
-                                 use_multiprocessing=False,
-                                 workers=0)
-
-  def test_generator_methods_with_sample_weights(self):
-    arr_data = np.random.random((50, 2))
-    arr_labels = np.random.random((50,))
-    arr_sample_weights = np.random.random((50,))
-
-    def custom_generator():
-      batch_size = 10
-      num_samples = 50
-      while True:
-        batch_index = np.random.randint(0, num_samples - batch_size)
-        start = batch_index
-        end = start + batch_size
-        x = arr_data[start: end]
-        y = arr_labels[start: end]
-        w = arr_sample_weights[start: end]
-        yield x, y, w
-
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(1, input_shape=(2,)))
-      model.compile(
-          loss='mse',
-          optimizer='sgd',
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
-
-      model.fit_generator(custom_generator(),
-                          steps_per_epoch=5,
-                          epochs=1,
-                          verbose=1,
-                          max_queue_size=10,
-                          use_multiprocessing=False)
-      model.fit_generator(custom_generator(),
-                          steps_per_epoch=5,
-                          epochs=1,
-                          verbose=1,
-                          max_queue_size=10,
-                          use_multiprocessing=False,
-                          validation_data=custom_generator(),
-                          validation_steps=10)
-      model.predict_generator(custom_generator(),
-                              steps=5,
-                              max_queue_size=10,
-                              use_multiprocessing=False)
-      model.evaluate_generator(custom_generator(),
-                               steps=5,
-                               max_queue_size=10,
-                               use_multiprocessing=False)
-
-  def test_generator_methods_invalid_use_case(self):
-
-    def custom_generator():
-      while 1:
-        yield 0
+class TestTrainingWithDataTensors(keras_parameterized.TestCase):
 
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(1, input_shape=(2,)))
-      model.compile(loss='mse', optimizer='sgd')
+  @keras_parameterized.run_all_keras_modes
+  def test_training_and_eval_methods_on_symbolic_tensors_single_io(self):
+    # TODO(kaftan) Test seems to not work, file ticket
+    if  context.executing_eagerly():
+      self.skipTest('Skipping eager execution.')
 
-      with self.assertRaises(ValueError):
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_queue_size=10,
-                            use_multiprocessing=False)
-      with self.assertRaises(ValueError):
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_queue_size=10,
-                            use_multiprocessing=False,
-                            validation_data=custom_generator(),
-                            validation_steps=10)
-      with self.assertRaises(AttributeError):
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_queue_size=10,
-                                use_multiprocessing=False)
-      with self.assertRaises(ValueError):
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_queue_size=10,
-                                 use_multiprocessing=False)
+    x = keras.layers.Input(shape=(3,), name='input')
+    y = keras.layers.Dense(4, name='dense')(x)
+    model = keras.Model(x, y)
 
-  def test_training_with_sequences(self):
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(
+        optimizer,
+        loss,
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-    class DummySequence(keras.utils.Sequence):
+    inputs = keras.backend.zeros(shape=(10, 3))
+    targets = keras.backend.zeros(shape=(10, 4))
 
-      def __getitem__(self, idx):
-        return np.zeros([10, 2]), np.ones([10])
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+    model.evaluate(inputs, targets, steps=2, verbose=0)
+    model.predict(inputs, steps=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+    model.fit(inputs, targets,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=(inputs, targets), validation_steps=2)
+
+    # Test with dynamic shape
+    inputs = array_ops.placeholder_with_default(
+        np.zeros((2, 3)), shape=tensor_shape.TensorShape([None, 3]))
+    targets = array_ops.placeholder_with_default(
+        np.zeros((2, 4)), shape=tensor_shape.TensorShape([None, 4]))
+    self.assertEqual(inputs.shape.dims[0].value, None)
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+    model.evaluate(inputs, targets, steps=2, verbose=0)
+    model.predict(inputs, steps=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+    model.fit(inputs, targets,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=(inputs, targets), validation_steps=2)
 
-      def __len__(self):
-        return 10
+  @keras_parameterized.run_all_keras_modes
+  def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
+    # TODO(kaftan) Test seems to not work, file ticket
+    if context.executing_eagerly():
+      self.skipTest('Skipping eager execution.')
 
-    arr_data = np.random.random((50, 2))
-    arr_labels = np.random.random((50,))
-    arr_sample_weights = np.random.random((50,))
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
 
-    def custom_generator():
-      batch_size = 10
-      num_samples = 50
-      while True:
-        batch_index = np.random.randint(0, num_samples - batch_size)
-        start = batch_index
-        end = start + batch_size
-        x = arr_data[start: end]
-        y = arr_labels[start: end]
-        w = arr_sample_weights[start: end]
-        yield x, y, w
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
 
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(1, input_shape=(2,)))
-      model.compile(loss='mse', optimizer='sgd')
-
-    model.fit_generator(DummySequence(),
-                        steps_per_epoch=10,
-                        validation_data=custom_generator(),
-                        validation_steps=1,
-                        max_queue_size=10,
-                        workers=0,
-                        use_multiprocessing=True)
-    model.fit_generator(DummySequence(),
-                        steps_per_epoch=10,
-                        validation_data=custom_generator(),
-                        validation_steps=1,
-                        max_queue_size=10,
-                        workers=0,
-                        use_multiprocessing=False)
+    model = keras.models.Model([a, b], [d, e])
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_generator_input_to_fit_eval_predict(self):
-    val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
+    optimizer = 'rmsprop'
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    model.compile(
+        optimizer,
+        loss,
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        loss_weights=loss_weights,
+        run_eagerly=testing_utils.should_run_eagerly())
 
-    def custom_generator():
-      while True:
-        yield np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
+    input_a_tf = keras.backend.zeros(shape=(10, 3))
+    input_b_tf = keras.backend.zeros(shape=(10, 3))
 
-    inputs = keras.layers.Input(shape=(10,))
-    x = keras.layers.Dense(10, activation='relu')(inputs)
-    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
-    model = keras.Model(inputs, outputs)
+    output_d_tf = keras.backend.zeros(shape=(10, 4))
+    output_e_tf = keras.backend.zeros(shape=(10, 4))
 
-    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
     model.fit(
-        custom_generator(),
+        [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+        epochs=1,
         steps_per_epoch=2,
-        validation_data=val_data,
-        epochs=2)
-    model.evaluate(custom_generator(), steps=2)
-    model.predict(custom_generator(), steps=2)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_sequence_input_to_fit_eval_predict(self):
-    val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
-
-    class CustomSequence(keras.utils.Sequence):
-
-      def __getitem__(self, idx):
-        return np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
-
-      def __len__(self):
-        return 2
-
-    inputs = keras.layers.Input(shape=(10,))
-    x = keras.layers.Dense(10, activation='relu')(inputs)
-    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
-    model = keras.Model(inputs, outputs)
-
-    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
-    model.fit(CustomSequence(), validation_data=val_data, epochs=2)
-    model.evaluate(CustomSequence())
-    model.predict(CustomSequence())
-
-    with self.assertRaisesRegexp(ValueError, '`y` argument is not supported'):
-      model.fit(CustomSequence(), y=np.ones([10, 1]))
-
+        verbose=0)
     with self.assertRaisesRegexp(ValueError,
-                                 '`sample_weight` argument is not supported'):
-      model.fit(CustomSequence(), sample_weight=np.ones([10, 1]))
-
-
-class TestTrainingUtils(test.TestCase):
-
-  def test_check_array_lengths(self):
-    keras.engine.training_utils.check_array_lengths(None, None, None)
-    a_np = np.random.random((4, 3, 3))
-    keras.engine.training_utils.check_array_lengths(a_np, a_np, a_np)
-    keras.engine.training_utils.check_array_lengths(
-        [a_np, a_np], [a_np, a_np], [a_np, a_np])
-    keras.engine.training_utils.check_array_lengths([None], [None], [None])
-
-    b_np = np.random.random((3, 4))
-    with self.assertRaises(ValueError):
-      keras.engine.training_utils.check_array_lengths([a_np], [b_np], None)
-
-  def test_slice_arrays(self):
-    input_a = np.random.random((10, 3))
-    slice_arrays(input_a, 0)
-    slice_arrays(None)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-    input_a = [None, [1, 1], None, [1, 1]]
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-    input_a = [None]
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-    input_a = None
-    slice_arrays(input_a, 0)
-    slice_arrays(input_a, 0, 1)
-    slice_arrays(input_a, stop=2)
-
-
-class TestTrainingWithDataTensors(test.TestCase):
-
-  def test_training_and_eval_methods_on_symbolic_tensors_single_io(self):
-    with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
-
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(
-          optimizer,
-          loss,
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
-
-      inputs = keras.backend.zeros(shape=(10, 3))
-      targets = keras.backend.zeros(shape=(10, 4))
-
-      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
-      model.evaluate(inputs, targets, steps=2, verbose=0)
-      model.predict(inputs, steps=2)
-      model.train_on_batch(inputs, targets)
-      model.test_on_batch(inputs, targets)
-      model.fit(inputs, targets,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=(inputs, targets), validation_steps=2)
-
-      # Test with dynamic shape
-      inputs = array_ops.placeholder_with_default(
-          np.zeros((2, 3)), shape=tensor_shape.TensorShape([None, 3]))
-      targets = array_ops.placeholder_with_default(
-          np.zeros((2, 4)), shape=tensor_shape.TensorShape([None, 4]))
-      self.assertEqual(inputs.shape[0].value, None)
-      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
-      model.evaluate(inputs, targets, steps=2, verbose=0)
-      model.predict(inputs, steps=2)
-      model.train_on_batch(inputs, targets)
-      model.test_on_batch(inputs, targets)
-      model.fit(inputs, targets,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=(inputs, targets), validation_steps=2)
-
-  def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
-    with self.cached_session():
-      a = keras.layers.Input(shape=(3,), name='input_a')
-      b = keras.layers.Input(shape=(3,), name='input_b')
-
-      dense = keras.layers.Dense(4, name='dense')
-      c = dense(a)
-      d = dense(b)
-      e = keras.layers.Dropout(0.5, name='dropout')(c)
-
-      model = keras.models.Model([a, b], [d, e])
-
-      optimizer = 'rmsprop'
-      loss = 'mse'
-      loss_weights = [1., 0.5]
-      model.compile(
-          optimizer,
-          loss,
-          metrics=['mae', metrics_module.CategoricalAccuracy()],
-          loss_weights=loss_weights)
-
-      input_a_tf = keras.backend.zeros(shape=(10, 3))
-      input_b_tf = keras.backend.zeros(shape=(10, 3))
-
-      output_d_tf = keras.backend.zeros(shape=(10, 4))
-      output_e_tf = keras.backend.zeros(shape=(10, 4))
-
+                                 'should specify the `steps_per_epoch`'):
       model.fit(
           [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
           epochs=1,
-          steps_per_epoch=2,
+          batch_size=5,
           verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'should specify the `steps_per_epoch`'):
-        model.fit(
-            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-            epochs=1,
-            batch_size=5,
-            verbose=0)
-      model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+    model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
 
-      # Test with dictionary inputs
-      model.fit(
-          {'input_a': input_a_tf,
-           'input_b': input_b_tf},
-          {'dense': output_d_tf,
-           'dropout': output_e_tf},
-          epochs=1,
-          steps_per_epoch=2,
-          verbose=0)
-      model.fit(
-          {'input_a': input_a_tf,
-           'input_b': input_b_tf},
-          {'dense': output_d_tf,
-           'dropout': output_e_tf},
-          validation_data=({'input_a': input_a_tf,
-                            'input_b': input_b_tf},
-                           {'dense': output_d_tf,
-                            'dropout': output_e_tf}),
-          epochs=1,
-          steps_per_epoch=2,
-          validation_steps=2,
-          verbose=0)
-      model.train_on_batch(
-          {'input_a': input_a_tf,
-           'input_b': input_b_tf},
-          {'dense': output_d_tf,
-           'dropout': output_e_tf})
+    # Test with dictionary inputs
+    model.fit(
+        {'input_a': input_a_tf,
+         'input_b': input_b_tf},
+        {'dense': output_d_tf,
+         'dropout': output_e_tf},
+        epochs=1,
+        steps_per_epoch=2,
+        verbose=0)
+    model.fit(
+        {'input_a': input_a_tf,
+         'input_b': input_b_tf},
+        {'dense': output_d_tf,
+         'dropout': output_e_tf},
+        validation_data=({'input_a': input_a_tf,
+                          'input_b': input_b_tf},
+                         {'dense': output_d_tf,
+                          'dropout': output_e_tf}),
+        epochs=1,
+        steps_per_epoch=2,
+        validation_steps=2,
+        verbose=0)
+    model.train_on_batch(
+        {'input_a': input_a_tf,
+         'input_b': input_b_tf},
+        {'dense': output_d_tf,
+         'dropout': output_e_tf})
 
-      # Test with validation data
+    # Test with validation data
+    model.fit(
+        [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+        validation_data=([input_a_tf, input_b_tf],
+                         [output_d_tf, output_e_tf]),
+        epochs=1,
+        steps_per_epoch=2,
+        validation_steps=2,
+        verbose=0)
+    # Test with validation split
+    with self.assertRaisesRegexp(ValueError,
+                                 'you cannot use `validation_split`'):
       model.fit(
           [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-          validation_data=([input_a_tf, input_b_tf],
-                           [output_d_tf, output_e_tf]),
-          epochs=1,
+          epochs=2,
           steps_per_epoch=2,
-          validation_steps=2,
-          verbose=0)
-      # Test with validation split
-      with self.assertRaisesRegexp(ValueError,
-                                   'you cannot use `validation_split`'):
-        model.fit(
-            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-            epochs=2,
-            steps_per_epoch=2,
-            verbose=0,
-            validation_split=0.2,
-            validation_steps=2)
-
-      # Test evaluation / prediction methods
-      model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-                     steps=2, verbose=0)
-      model.predict([input_a_tf, input_b_tf], steps=2)
-      model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+          verbose=0,
+          validation_split=0.2,
+          validation_steps=2)
 
+    # Test evaluation / prediction methods
+    model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+                   steps=2, verbose=0)
+    model.predict([input_a_tf, input_b_tf], steps=2)
+    model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
+  @tf_test_util.run_deprecated_v1
   def test_model_with_input_feed_tensor(self):
     """We test building a model with a TF variable as input.
 
@@ -1724,6 +1754,7 @@ class TestTrainingWithDataTensors(test.TestCase):
       # evaluate
       _ = model.evaluate(input_a_np, [output_a_np])
 
+  @tf_test_util.run_deprecated_v1
   def test_model_with_external_loss(self):
     with self.cached_session():
       # None loss, only regularization loss.
@@ -1865,6 +1896,10 @@ class TestTrainingWithDataTensors(test.TestCase):
       model.compile(optimizer='rmsprop', loss='mse', target_tensors=[target])
       model.train_on_batch(input_val, None)
 
+      # single-output, as single tensor
+      model.compile(optimizer='rmsprop', loss='mse', target_tensors=target)
+      model.train_on_batch(input_val, None)
+
       # single-output, as dict
       model.compile(optimizer='rmsprop', loss='mse',
                     target_tensors={'dense': target})
@@ -1915,6 +1950,7 @@ class TestTrainingWithDataTensors(test.TestCase):
       model.train_on_batch(input_val, None,
                            sample_weight={'dense_a': np.random.random((10,))})
 
+  @tf_test_util.run_deprecated_v1
   def test_model_custom_target_tensors(self):
     with self.cached_session():
       a = keras.Input(shape=(3,), name='input_a')
@@ -1976,268 +2012,10 @@ class TestTrainingWithDataTensors(test.TestCase):
                            [output_a_np, output_b_np])
 
 
-class TestTrainingWithDatasetIterators(test.TestCase):
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_training_and_eval_methods_on_iterators_single_io(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(iterator, steps=2, verbose=1)
-    model.predict(iterator, steps=2)
-    model.train_on_batch(iterator)
-    model.test_on_batch(iterator)
-    model.predict_on_batch(iterator)
-
-    # Test with validation data
-    model.fit(iterator,
-              epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=iterator, validation_steps=2)
-    # Test with validation split
-    with self.assertRaisesRegexp(
-        ValueError, '`validation_split` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(iterator,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_split=0.5, validation_steps=2)
-
-    # Test with sample weight.
-    sample_weight = np.random.random((10,))
-    with self.assertRaisesRegexp(
-        ValueError, '`sample_weight` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(
-          iterator,
-          epochs=1,
-          steps_per_epoch=2,
-          verbose=0,
-          sample_weight=sample_weight)
-
-    # Test invalid usage
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should not specify a target'):
-      model.fit(iterator, iterator,
-                epochs=1, steps_per_epoch=2, verbose=0)
-
-    with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
-      model.fit(iterator, epochs=1, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.evaluate(iterator, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.predict(iterator, verbose=0)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_get_next_op_created_once(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-    # Finalize graph to make sure we are not appending another iterator
-    # get_next op in the graph.
-    ops.get_default_graph().finalize()
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_iterators_running_out_of_data(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(2)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
-      self.assertRegexpMatches(
-          str(mock_log.call_args),
-          'dataset iterator ran out of data')
-
-
-class TestTrainingWithDataset(test.TestCase):
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_calling_model_on_same_dataset(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    # Call fit with validation data
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=dataset, validation_steps=2)
-    # Finalize the graph to make sure new ops aren't added when calling on the
-    # same dataset
-    ops.get_default_graph().finalize()
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=dataset, validation_steps=2)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_training_and_eval_methods_on_dataset(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(dataset, steps=2, verbose=1)
-    model.predict(dataset, steps=2)
-    model.train_on_batch(dataset)
-    model.predict_on_batch(dataset)
-
-    # Test with validation data
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=dataset, validation_steps=2)
-
-    # Test with validation split
-    with self.assertRaisesRegexp(
-        ValueError, '`validation_split` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(dataset,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_split=0.5, validation_steps=2)
-
-    # Test with sample weight.
-    sample_weight = np.random.random((10,))
-    with self.assertRaisesRegexp(
-        ValueError, '`sample_weight` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(
-          dataset,
-          epochs=1,
-          steps_per_epoch=2,
-          verbose=0,
-          sample_weight=sample_weight)
-
-    # Test invalid usage
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should not specify a target'):
-      model.fit(dataset, dataset,
-                epochs=1, steps_per_epoch=2, verbose=0)
-
-    with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
-      model.fit(dataset, epochs=1, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.evaluate(dataset, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.predict(dataset, verbose=0)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_dataset_with_sample_weights(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3), np.float32)
-    targets = np.zeros((10, 4), np.float32)
-    sample_weights = np.ones((10), np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
-                                                      sample_weights))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(dataset, steps=2, verbose=1)
-    model.predict(dataset, steps=2)
-    model.train_on_batch(dataset)
-    model.predict_on_batch(dataset)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_dataset_with_sparse_labels(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'sparse_categorical_crossentropy'
-    model.compile(optimizer, loss)
-
-    inputs = np.zeros((10, 3))
-    targets = np.random.randint(0, 4, size=10, dtype=np.int32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-
-  def test_dataset_input_shape_validation(self):
-    with self.cached_session():
-      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-      model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
-
-      # User forgets to batch the dataset
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-
-      with self.assertRaisesRegexp(
-          ValueError,
-          r'expected (.*?) to have shape \(3,\) but got array with shape \(1,\)'
-      ):
-        model.train_on_batch(dataset)
-
-      # Wrong input shape
-      inputs = np.zeros((10, 5))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-
-      with self.assertRaisesRegexp(ValueError,
-                                   r'expected (.*?) to have shape \(3,\)'):
-        model.train_on_batch(dataset)
-
-
-class TestTrainingWithMetrics(test.TestCase):
+class TestTrainingWithMetrics(keras_parameterized.TestCase):
   """Training tests related to metrics."""
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_names(self):
     a = keras.layers.Input(shape=(3,), name='input_a')
     b = keras.layers.Input(shape=(3,), name='input_b')
@@ -2251,18 +2029,14 @@ class TestTrainingWithMetrics(test.TestCase):
 
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     metrics = ['mse', metrics_module.BinaryAccuracy()]
-    model.compile(optimizer, loss='mae', metrics=metrics)
+    model.compile(optimizer, loss='mae', metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
     reference_metric_names = [
         'loss', 'dense_loss', 'dropout_loss', 'dense_mean_squared_error',
         'dense_binary_accuracy', 'dropout_mean_squared_error',
         'dropout_binary_accuracy'
     ]
-    reference_stateful_metric_names = [
-        'dense_binary_accuracy', 'dropout_binary_accuracy'
-    ]
     self.assertEqual(reference_metric_names, model.metrics_names)
-    self.assertEqual(reference_stateful_metric_names,
-                     model.stateful_metric_names)
 
     # Verify that model metric names are not altered during training.
     input_a_np = np.random.random((10, 3))
@@ -2275,10 +2049,8 @@ class TestTrainingWithMetrics(test.TestCase):
               epochs=1,
               batch_size=5)
     self.assertEqual(reference_metric_names, model.metrics_names)
-    self.assertEqual(reference_stateful_metric_names,
-                     model.stateful_metric_names)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_correctness(self):
     model = keras.Sequential()
     model.add(
@@ -2290,7 +2062,8 @@ class TestTrainingWithMetrics(test.TestCase):
     model.compile(
         loss='mae',
         metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     # verify correctness of stateful and stateless metrics.
     x = np.ones((100, 4))
@@ -2304,40 +2077,7 @@ class TestTrainingWithMetrics(test.TestCase):
     self.assertEqual(outs[1], 0.)
     self.assertEqual(outs[2], 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_metrics_correctness_with_iterator(self):
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            8, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(
-            1, activation='sigmoid', kernel_initializer='ones'))
-    model.compile(
-        loss='binary_crossentropy',
-        metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
-
-    np.random.seed(123)
-    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
-    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-    outs = model.evaluate(iterator, steps=10)
-    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
-    self.assertEqual(np.around(outs[2], decimals=1), 0.5)
-
-    y = np.zeros((100, 1), dtype=np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-    outs = model.evaluate(iterator, steps=10)
-    self.assertEqual(outs[1], 0.)
-    self.assertEqual(outs[2], 0.)
-
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_correctness_with_weighted_metrics(self):
     np.random.seed(1337)
     x = np.array([[[1.], [1.]], [[0.], [0.]]])
@@ -2350,8 +2090,8 @@ class TestTrainingWithMetrics(test.TestCase):
         RMSPropOptimizer(learning_rate=0.001),
         loss='mse',
         sample_weight_mode='temporal',
-        weighted_metrics=['accuracy',
-                          metrics_module.BinaryAccuracy()])
+        weighted_metrics=['accuracy', 'mse'],
+        run_eagerly=testing_utils.should_run_eagerly())
     y = np.array([[[1.], [1.]], [[1.], [1.]]])
 
     outs = model.evaluate(x, y)
@@ -2363,9 +2103,17 @@ class TestTrainingWithMetrics(test.TestCase):
 
     w = np.array([[3., 4.], [1., 2.]])
     outs = model.evaluate(x, y, sample_weight=w)
-    self.assertArrayNear(outs, [0.3, 0.7, 0.7], .001)
+    self.assertArrayNear(outs, [0.75, 0.7, 0.3], .001)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+    # Verify that metric value is same with arbitrary weights and batch size.
+    x = np.random.random((50, 2, 1))
+    y = np.random.random((50, 2, 1))
+    w = np.random.random((50, 2))
+    mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[2]
+    mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[2]
+    self.assertNear(mse1, mse2, err=1e-7)
+
+  @keras_parameterized.run_all_keras_modes
   def test_metric_state_reset_between_fit_and_evaluate(self):
     model = keras.Sequential()
     model.add(keras.layers.Dense(3, activation='relu', input_dim=4))
@@ -2374,7 +2122,8 @@ class TestTrainingWithMetrics(test.TestCase):
     model.compile(
         loss='mae',
         metrics=[acc_obj],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x_train = np.random.random((100, 4))
     y_train = np.random.random((100, 1))
@@ -2386,7 +2135,7 @@ class TestTrainingWithMetrics(test.TestCase):
     model.evaluate(x_test, y_test, batch_size=5)
     self.assertEqual(self.evaluate(acc_obj.count), 10)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_invalid_metrics(self):
     num_classes = 5
     input_dim = 5
@@ -2400,10 +2149,13 @@ class TestTrainingWithMetrics(test.TestCase):
       model.compile(
           RMSPropOptimizer(learning_rate=0.001),
           loss='categorical_crossentropy',
-          metrics=metrics_module.CategoricalAccuracy())
+          metrics=metrics_module.CategoricalAccuracy(),
+          run_eagerly=testing_utils.should_run_eagerly())
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_masking(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       np.random.seed(1337)
       model = keras.models.Sequential()
@@ -2414,31 +2166,286 @@ class TestTrainingWithMetrics(test.TestCase):
       model.compile(
           RMSPropOptimizer(learning_rate=0.001),
           loss='mse',
-          weighted_metrics=['accuracy',
-                            metrics_module.BinaryAccuracy()])
+          weighted_metrics=['accuracy'],
+          run_eagerly=testing_utils.should_run_eagerly())
 
-      # verify that masking is applied for stateless and stateful metrics.
+      # verify that masking is applied.
       x = np.array([[[1], [1]], [[1], [1]], [[0], [0]]])
       y = np.array([[[1], [1]], [[0], [1]], [[1], [1]]])
       scores = model.train_on_batch(x, y)
-      self.assertArrayNear(scores, [0.25, 0.75, 0.75], 0.1)
+      self.assertArrayNear(scores, [0.25, 0.75], 0.1)
 
       # verify that masking is combined with sample weights.
       w = np.array([3, 2, 4])
       scores = model.train_on_batch(x, y, sample_weight=w)
-      self.assertArrayNear(scores, [0.2, 0.8, 0.8], 0.1)
+      self.assertArrayNear(scores, [0.3328, 0.8], 0.001)
 
-  def test_losses_in_defun(self):
+  @tf_test_util.run_deprecated_v1
+  def test_add_metric_with_tensor_on_model_in_graph_mode(self):
+    with self.cached_session():
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+      model = keras.models.Model(x, y)
+      model.add_metric(
+          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+
+      # test with a metric which does not have the standard signature:
+      # (y_true, y_pred, sample_Weight)
+      model.add_metric(metrics_module.Mean(name='metric_2')(y))
+      model.compile('sgd', loss='mse')
+
+      inputs = np.ones(shape=(10, 1))
+      targets = np.ones(shape=(10, 1))
+      history = model.fit(
+          inputs,
+          targets,
+          epochs=2,
+          batch_size=5,
+          validation_data=(inputs, targets))
+      self.assertEqual(history.history['metric_1'][-1], 5)
+      self.assertEqual(history.history['metric_2'][-1], 1)
+      self.assertEqual(history.history['val_metric_1'][-1], 5)
+      self.assertEqual(history.history['val_metric_2'][-1], 1)
+
+      eval_results = model.evaluate(inputs, targets, batch_size=5)
+      self.assertEqual(eval_results[-1], 1)
+      self.assertEqual(eval_results[-2], 5)
+
+      model.predict(inputs, batch_size=5)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_in_model_call(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+        self.mean = metrics_module.Mean(name='metric_1')
+
+      def call(self, x):
+        self.add_metric(
+            math_ops.reduce_sum(x), name='metric_2', aggregation='mean')
+        # Provide same name as in the instance created in __init__
+        # for eager mode
+        self.add_metric(self.mean(x), name='metric_1')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['val_metric_1'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['metric_2'][-1], 5, 0)
+    self.assertAlmostEqual(history.history['val_metric_2'][-1], 5, 0)
+
+    eval_results = model.evaluate(x, y, batch_size=5)
+    self.assertAlmostEqual(eval_results[1], 1, 0)
+    self.assertAlmostEqual(eval_results[2], 5, 0)
+
+    model.predict(x, batch_size=5)
+    model.train_on_batch(x, y)
+    model.test_on_batch(x, y)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_in_layer_call(self):
+
+    class TestLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable(
+            'a', (1, 1), initializer='ones', trainable=False)
+        self.built = True
+
+      def call(self, inputs):
+        self.add_metric(
+            math_ops.reduce_sum(inputs), name='metric_1', aggregation='mean')
+        return inputs + 1
+
+    model = keras.Sequential()
+    model.add(TestLayer(input_shape=(1,)))
+    model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertEqual(history.history['metric_1'][-1], 5)
+    self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0)
+
+  @tf_test_util.run_deprecated_v1
+  def test_model_metrics_list(self):
+    with self.cached_session():
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+      model = keras.models.Model(x, y)
+      model.add_metric(
+          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+      model.add_metric(metrics_module.Mean(name='metric_2')(y))
+      model.compile('sgd', loss='mse', metrics=['acc'])
+
+      # Verify that the metrics added using `compile` and `add_metric` API are
+      # included
+      self.assertEqual(model._compile_metrics, ['acc'])
+      names = []
+      for m in model.metrics:
+        if isinstance(m, metrics_module.Metric):
+          names.append(m.name)
+        else:
+          names.append(m.__name__)
+      self.assertEqual(names, ['binary_accuracy', 'metric_1', 'metric_2'])
+
+  def test_model_eager_metrics_list(self):
     with context.eager_mode():
-      layer = keras.layers.Dense(1, kernel_regularizer='l1')
-      layer(array_ops.ones([1, 10]))
 
-      @function.defun
-      def get_losses():
-        return layer.losses
+      class TestModel(keras.Model):
+
+        def __init__(self):
+          super(TestModel, self).__init__(name='test_model')
+          self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+
+        def call(self, x):
+          self.add_metric(
+              math_ops.reduce_sum(x), name='metric_1', aggregation='mean')
+          return self.dense1(x)
+
+      model = TestModel()
+      model.compile(
+          loss='mse',
+          optimizer=RMSPropOptimizer(0.01),
+          metrics=['acc'],
+          run_eagerly=True)
+      x = np.ones(shape=(10, 1))
+      y = np.ones(shape=(10, 2))
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+      self.assertEqual(model._compile_metrics, ['acc'])
+      names = []
+      for m in model.metrics:
+        if isinstance(m, metrics_module.Metric):
+          names.append(m.name)
+        else:
+          names.append(m.__name__)
+      self.assertEqual(names, ['categorical_accuracy', 'metric_1'])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_multiple_add_metric_calls(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+        self.mean1 = metrics_module.Mean(name='metric_1')
+        self.mean2 = metrics_module.Mean(name='metric_2')
+
+      def call(self, x):
+        self.add_metric(self.mean2(x), name='metric_2')
+        self.add_metric(self.mean1(x), name='metric_1')
+        self.add_metric(
+            math_ops.reduce_sum(x), name='metric_3', aggregation='mean')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['metric_2'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['metric_3'][-1], 5, 0)
+
+    eval_results = model.evaluate(x, y, batch_size=5)
+    self.assertArrayNear(eval_results[1:4], [1, 1, 5], 0.1)
+
+    model.predict(x, batch_size=5)
+    model.train_on_batch(x, y)
+    model.test_on_batch(x, y)
+
+  def test_invalid_metric_tensor_in_call(self):
+    with context.eager_mode():
+
+      class TestLayer(keras.layers.Layer):
+
+        def call(self, inputs):
+          self.add_metric(metrics_module.Mean(name='metric_1')(inputs))
+          return inputs + 1
+
+      model = keras.Sequential()
+      model.add(TestLayer(input_shape=(1,)))
+      model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+      model.compile(
+          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
+
+      x = np.ones(shape=(10, 1))
+      y = np.ones(shape=(10, 2))
+      with self.assertRaisesRegexp(
+          ValueError,
+          'We do not support adding an aggregated metric tensor in `call` in '
+          'eager execution.'):
+        model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_duplicate_metric_name_in_add_metric(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+        self.mean = metrics_module.Mean(name='metric_1')
+        self.mean2 = metrics_module.Mean(name='metric_1')
+
+      def call(self, x):
+        self.add_metric(self.mean(x), name='metric_1')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Please provide different names for the metrics you have added. '
+        'We found 2 metrics with the name: "metric_1"'):
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_multiple_no_name_input_to_add_metric(self):
+    # TODO(kaftan) Test seems to not work, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      self.skipTest('Skipping running model eagerly.')
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+
+      def call(self, x):
+        self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
+        self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertEqual([m.name for m in model.metrics], ['mean', 'mean_1'])
 
-      self.assertAllEqual(self.evaluate(layer.losses),
-                          self.evaluate(get_losses()))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index dd2a7f16eca11c474e014860de0d554cb9eb8747..01a09eb031eef20538d587e3f17a31ecbb5e5f9a 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -18,163 +18,176 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 from collections import OrderedDict
 import copy
-import math
 
 import numpy as np
 import six
 
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.util import nest
 
 
-def _map_nested(data, func):
-  """Maps each nested element using func."""
-  if isinstance(data, list):
-    return [_map_nested(nested_data, func) for nested_data in data]
-  elif isinstance(data, tuple):
-    return tuple(_map_nested(nested_data, func) for nested_data in data)
-  elif isinstance(data, dict):
-    return {
-        k: _map_nested(nested_data, func) for k, nested_data in data.items()
-    }
-  else:
-    return func(data)
+@six.add_metaclass(abc.ABCMeta)
+class Aggregator(object):
+  """Abstract base class used to aggregate batch-level outputs of a loop.
 
+  Attributes:
+    use_steps: Whether the loop is using `step` or `batch_size`.
+    num_samples_or_steps: Either `batch_size*num_batches` or `steps`.
+    results: What to return at the end of the aggregation loop.
+  """
 
-def _nested_all(data, cond_func):
-  """Checks if all elements in a nested structure satisfy cond_func."""
-  if isinstance(data, (tuple, list)):
-    return all([_nested_all(nested_data, cond_func) for nested_data in data])
-  elif isinstance(data, dict):
-    return all(
-        [_nested_all(nested_data, cond_func) for nested_data in data.values()])
-  else:
-    return cond_func(data)
+  def __init__(self, use_steps, num_samples_or_steps):
+    self.use_steps = use_steps
+    self.num_samples_or_steps = num_samples_or_steps
+    self.results = []
 
+  @abc.abstractmethod
+  def create(self, batch_outs):
+    """Creates the initial results from the first batch outputs.
 
-def _nested_any(data, cond_func):
-  """Checks if any nested_elements in a nested structure satisfy cond_func."""
-  if isinstance(data, (tuple, list)):
-    return any([_nested_any(nested_data, cond_func) for nested_data in data])
-  elif isinstance(data, dict):
-    return any(
-        [_nested_any(nested_data, cond_func) for nested_data in data.values()])
-  else:
-    return cond_func(data)
-
-
-def _convert_lists_to_tuples(data):
-  """Converts all lists to tuples, since Datasets expect tuples."""
-  if isinstance(data, (tuple, list)):
-    return tuple(_convert_lists_to_tuples(nested_data) for nested_data in data)
-  elif isinstance(data, dict):
-    return {
-        k: _convert_lists_to_tuples(nested_data)
-        for k, nested_data in data.items()
-    }
-  else:
-    return data
+    Arguments:
+      batch_outs: A list of batch-level outputs.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
 
+  @abc.abstractmethod
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    """Aggregates batch-level results into total results.
 
-def _get_batch_axis_size(data):
-  """Returns batch axis shape for nested data."""
-  if isinstance(data, (tuple, list)):
-    return _get_batch_axis_size(data[0])
-  elif isinstance(data, dict):
-    return _get_batch_axis_size(list(data.values()))
-  else:
-    return int(data.shape[0])
+    Arguments:
+      batch_outs: A list of batch-level outputs.
+      batch_start: The start index of this batch. Always `None` if `use_steps`
+        is `True`.
+      batch_end: The end index of this batch. Always `None` if `use_steps` is
+        `True`.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
 
+  @abc.abstractmethod
+  def finalize(self):
+    """Prepares the total results to be returned."""
+    NotImplementedError('Must be implemented in subclasses.')
 
-def convert_to_iterator(x=None,
-                        y=None,
-                        sample_weights=None,
-                        batch_size=None,
-                        steps_per_epoch=None,
-                        epochs=1,
-                        shuffle=False,
-                        is_validation=False):
-  """Converts NumPy arrays or EagerTensors to an EagerIterator.
 
-  Combines all provided data into a single EagerIterator.
+class MetricsAggregator(Aggregator):
+  """Aggregator that calculates loss and metrics info."""
 
-  Arguments:
-      x: NumPy array or EagerTensor,  or list of Numpy arrays or EagerTensors
-        representing inputs to a model.
-      y: Optional. NumPy array or EagerTensor, or list of Numpy arrays or
-        EagerTensors representing targets of a model.
-      sample_weights: Optional NumPy array or EagerTensor representing sample
-        weights.
-      batch_size: Used to batch data and calculate how many steps EagerIterator
-        should take per epoch.
-      steps_per_epoch: If provided, how many steps EagerIterator should take per
-        epoch.
-      epochs: Epochs to repeat iterator for.
-      shuffle: Whether to shuffle data after each epoch.
-      is_validation: Whether this call is for validation during a training
-        (e.g., `fit()`) call. This info is used to construct error messages
-        (if any).
+  def create(self, batch_outs):
+    self.results = [0.] * len(batch_outs)
 
-  Raises:
-      ValueError: if steps_per_epoch cannot be calculated from the data
-      provided.
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    # Loss.
+    if self.use_steps:
+      self.results[0] += batch_outs[0]
+    else:
+      self.results[0] += batch_outs[0] * (batch_end - batch_start)
+    # Metrics (always stateful, just grab current values.)
+    self.results[1:] = batch_outs[1:]
 
-  Returns:
-      (Iterator, steps_per_epoch).
+  def finalize(self):
+    self.results[0] /= self.num_samples_or_steps
 
-  """
-  if isinstance(x, iterator_ops.EagerIterator):
-    return x, steps_per_epoch
 
-  if not _nested_any(sample_weights, lambda x: x is None):
-    data = (x, y, sample_weights)
-  elif not _nested_any(y, lambda x: x is None):
-    data = (x, y)
-  else:
-    # always wrap in a tuple, so we know y, sample_weights weren't set
-    # even when x has multiple elements
-    data = (x,)
-
-  data = _convert_lists_to_tuples(data)
-  if steps_per_epoch is None and batch_size is not None:
-    num_samples = _get_batch_axis_size(data)
-    steps_per_epoch = int(math.ceil(num_samples / batch_size))
-
-  if steps_per_epoch is None:
-    alternative_arg_name = (
-        'validation_steps' if is_validation else 'steps_per_epoch')
-    raise ValueError(
-        'Could not determine how to convert EagerTensors into EagerIterator. '
-        'Please provide either `batch_size` or '
-        '`%s`.' % alternative_arg_name)
+class OutputsAggregator(Aggregator):
+  """Aggregator that concatenates outputs."""
+
+  def create(self, batch_outs):
+    if self.use_steps:
+      # Cannot pre-allocate the returned NumPy arrays bc
+      # batch sizes are unknown. Concatenate batches at the end.
+      for _ in batch_outs:
+        self.results.append([])
+    else:
+      # Pre-allocate NumPy arrays.
+      for batch_out in batch_outs:
+        shape = (self.num_samples_or_steps,) + batch_out.shape[1:]
+        self.results.append(np.zeros(shape, dtype=batch_out.dtype))
+
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    if self.use_steps:
+      for i, batch_out in enumerate(batch_outs):
+        self.results[i].append(batch_out)
+    else:
+      for i, batch_out in enumerate(batch_outs):
+        self.results[i][batch_start:batch_end] = batch_out
+
+  def finalize(self):
+    if self.use_steps:
+      self.results = [np.concatenate(result, axis=0) for result in self.results]
 
-  # TODO(omalleyt) for NumPy arrays in graph mode
-  # placeholder ops should be used
-  # this is only ideal for eager mode
-  dataset = dataset_ops.Dataset.from_tensor_slices(data)
 
-  if batch_size is not None:
-    dataset = dataset.batch(batch_size)
-  if shuffle:
-    dataset = dataset.shuffle(buffer_size=10000)
-  dataset = dataset.repeat(epochs)
-  iterator = dataset.make_one_shot_iterator()
+def make_logs(model, outputs, mode, prefix=''):
+  """Computes logs for sending to `on_batch_end` methods."""
+  logs = {}
+  # TODO(omalleyt): handle outputs in prediction when Callback
+  # hooks are ready.
+  if mode in ['train', 'test']:
+    if hasattr(model, 'metrics_names'):
+      for label, output in zip(model.metrics_names, outputs):
+        logs[prefix + label] = output
+  return logs
 
-  return iterator, steps_per_epoch
+
+def get_progbar(model, count_mode):
+  """Get Progbar."""
+  stateful_metric_names = None
+  if hasattr(model, 'metrics_names'):
+    stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
+  return cbks.ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names)
+
+
+def slice_arrays(arrays, indices, contiguous=True):
+  """Slices batches out of provided arrays (workaround for eager tensors).
+
+  Unfortunately eager tensors don't have the same slicing behavior as
+  Numpy arrays (they follow the same slicing behavior as symbolic TF tensors),
+  hence we cannot use `generic_utils.slice_arrays` directly
+  and we have to implement this workaround based on `concat`. This has a
+  performance cost.
+
+  Arguments:
+    arrays: Single array or list of arrays.
+    indices: List of indices in the array that should be included in the output
+      batch.
+    contiguous: Boolean flag indicating whether the indices are contiguous.
+
+  Returns:
+    Slice of data (either single array or list of arrays).
+  """
+  converted_to_list = False
+  if not isinstance(arrays, list):
+    converted_to_list = True
+    arrays = [arrays]
+  if any(tensor_util.is_tensor(x) for x in arrays):
+    if not contiguous:
+      entries = [[x[i:i + 1] for i in indices] for x in arrays]
+      slices = [array_ops.concat(x, axis=0) for x in entries]
+    else:
+      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
+  else:
+    slices = generic_utils.slice_arrays(arrays, indices)
+
+  if converted_to_list:
+    slices = slices[0]
+  return slices
 
 
 def check_num_samples(ins,
@@ -219,14 +232,18 @@ def check_num_samples(ins,
   return None  # Edge case where ins == [static_learning_phase]
 
 
-def standardize_single_array(x):
+def standardize_single_array(x, expected_shape=None):
+  """Expand data of shape (x,) to (x, 1), unless len(expected_shape)==1."""
   if x is None:
     return None
-  if x.shape is not None and len(x.shape) == 1:
+
+  if (x.shape is not None
+      and len(x.shape) == 1
+      and (expected_shape is None or len(expected_shape) != 1)):
     if tensor_util.is_tensor(x):
-      return array_ops.expand_dims(x, axis=1)
+      x = array_ops.expand_dims(x, axis=1)
     else:
-      return np.expand_dims(x, 1)
+      x = np.expand_dims(x, 1)
   return x
 
 
@@ -288,7 +305,11 @@ def standardize_input_data(data,
   else:
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
     data = [data]
-  data = [standardize_single_array(x) for x in data]
+  if shapes is not None:
+    data = [standardize_single_array(x, shape)
+            for (x, shape) in zip(data, shapes)]
+  else:
+    data = [standardize_single_array(x) for x in data]
 
   if len(data) != len(names):
     if data and hasattr(data[0], 'shape'):
@@ -511,8 +532,15 @@ def collect_per_output_metric_info(metrics,
       For instance, if the model has 2 outputs, and for the first output
       we want to compute "binary_accuracy" and "binary_crossentropy",
       and just "binary_accuracy" for the second output,
-      the list would look like: `[[('acc', binary_accuracy()),
-      ('ce', binary_crossentropy())], [('acc', binary_accuracy())]]`
+      the list would look like: `[
+        {
+          'acc': (binary_accuracy(), mean_obj_1),
+          'ce': (binary_crossentropy(), mean_obj_2)
+        },
+        {
+          'acc': (binary_accuracy(), mean_obj_3)
+        }
+      ]`
 
   Raises:
       TypeError: if an incorrect type is passed for the `metrics` argument.
@@ -542,7 +570,19 @@ def collect_per_output_metric_info(metrics,
       metric_name = get_metric_name(metric, weighted)
       metric_fn = get_metric_function(
           metric, output_shape=output_shapes[i], loss_fn=loss_fns[i])
-      metrics_dict[metric_name] = metric_fn
+
+      # If the metric function is not stateful, we create a stateful version and
+      # return both the stateless and the stateful version together. For batch
+      # APIs like `train_on_batch` we will use the stateless version and for
+      # other APIs like `fit` we will use the stateful version.
+      is_stateful = isinstance(metric_fn,
+                               base_layer.Layer) and metric_fn.stateful
+      stateful_fn = metric_fn
+      if not is_stateful:
+        stateful_fn = metrics_module.MeanMetricWrapper(
+            metric_fn, name=metric_fn.__name__)
+
+      metrics_dict[metric_name] = (metric_fn, stateful_fn)
     per_output_metrics.append(metrics_dict)
 
   return per_output_metrics
@@ -609,25 +649,15 @@ def weighted_masked_objective(fn):
       if weights is None:
         weights = mask
       else:
-        # Update shape of weights if possible before adding mask.
         # Update dimensions of weights to match with mask if possible.
-        mask, _, weights = metrics_module.squeeze_or_expand_dimensions(
-            mask, None, weights)
-        try:
-          # Broadcast weights if possible.
-          weights = weights_broadcast_ops.broadcast_weights(weights, mask)
-          weights *= mask
-        except ValueError:
-          score_array *= mask
-          score_array /= K.mean(mask)
-          # TODO(psv): Handle case when mask and weight shapes are not
-          # compatible.
+        mask, _, weights = squeeze_or_expand_dimensions(mask, None, weights)
+        weights *= mask
 
     # Apply sample weighting.
     if weights is not None:
 
       # Update dimensions of weights to match with values if possible.
-      score_array, _, weights = metrics_module.squeeze_or_expand_dimensions(
+      score_array, _, weights = squeeze_or_expand_dimensions(
           score_array, None, weights)
       try:
         # Broadcast weights if possible.
@@ -641,7 +671,7 @@ def weighted_masked_objective(fn):
       score_array = math_ops.multiply(score_array, weights)
       score_array = math_ops.reduce_sum(score_array)
       weights = math_ops.reduce_sum(weights)
-      score_array = metrics_module.safe_div(score_array, weights)
+      score_array = math_ops.div_no_nan(score_array, weights)
     return K.mean(score_array)
 
   return weighted
@@ -814,6 +844,33 @@ def get_metric_function(metric, output_shape=None, loss_fn=None):
   return metrics_module.get(metric)
 
 
+def call_metric_function(metric_fn, y_true, y_pred, weights=None, mask=None):
+  """Invokes metric function and returns the metric result tensor."""
+  if mask is None:
+    return metric_fn(y_true, y_pred, sample_weight=weights)
+
+  mask = math_ops.cast(mask, y_pred.dtype)
+  if weights is None:
+    # Use mask as sample weight.
+    return metric_fn(y_true, y_pred, sample_weight=mask)
+
+  # Update dimensions of weights to match with mask.
+  mask, _, weights = squeeze_or_expand_dimensions(mask, None, weights)
+  weights *= mask
+  return metric_fn(y_true, y_pred, sample_weight=weights)
+
+
+def get_loss_function(loss):
+  """Returns the loss function corresponding to the given loss input."""
+  if loss is None or isinstance(loss, losses.Loss):
+    return loss
+
+  # TODO(psv): After we have added all V2 losses, update this function.
+  if loss in ['mse', 'MSE', 'mean_squared_error']:
+    return losses.MeanSquaredError()
+  return losses.get(loss)
+
+
 def validate_iterator_input(x, y, sample_weight, validation_split=None):
   """Validates user input arguments when a dataset iterator is passed.
 
@@ -1026,9 +1083,11 @@ class ModelInputs(object):
     self._inputs = inputs
     self._is_dict = isinstance(self._inputs, dict)
     self._is_single_input = not isinstance(self._inputs, (list, tuple, dict))
+
     self._flattened_inputs = []
     self._input_names = []
-    if isinstance(self._inputs, dict):
+
+    if self._is_dict:
       for k in sorted(self._inputs.keys()):
         self._flattened_inputs.append(self._inputs[k])
         self._input_names.append(k)
@@ -1037,7 +1096,6 @@ class ModelInputs(object):
       self._input_names = [
           'input_%d' % (i + 1) for i in range(len(self._flattened_inputs))
       ]
-    assert len(self._input_names) == len(self._flattened_inputs)
 
   def get_input_names(self):
     """Returns keys to name inputs by.
@@ -1047,57 +1105,32 @@ class ModelInputs(object):
     """
     return self._input_names
 
-  def _get(self, return_single_as_list=False):
-    """Returns provided inputs, potentially transformed.
-
-    Inputs are returned in the same format they were provided i.e. lists
-    are returned as lists, single entries as single entries (unless
-    `return_single_as_list` is true), dictionaries as dictionaries.
-
-    Args:
-      return_single_as_list: Returns a list of size 1 for single entry case.
-    """
-    if self._is_dict:
-      return dict(zip(self._input_names, self._flattened_inputs))
-    if self._is_single_input and not return_single_as_list:
-      return self._flattened_inputs[0]
-    return self._flattened_inputs
-
-  def get_input_values(self):
-    """Returns input values passed in."""
-    if context.executing_eagerly():
-      for i in range(len(self._flattened_inputs)):
-        v = self._flattened_inputs[i]
-        if tensor_util.is_tensor(v):
-          v = cast_single_tensor(v)
-        else:
-          v = ops.convert_to_tensor(v, dtype=K.floatx())
-        self._flattened_inputs[i] = v
-    return self._get(return_single_as_list=False)
-
   def get_symbolic_inputs(self, return_single_as_list=False):
     """Returns inputs to be set as self.inputs for a model."""
     for i in range(len(self._flattened_inputs)):
       k = self._input_names[i]
       v = self._flattened_inputs[i]
-      if context.executing_eagerly():
-        v = base_layer.DeferredTensor(
-            shape=(None for _ in v.shape), dtype=v.dtype)
-      else:
-        if isinstance(v, list):
-          v = np.asarray(v)
-          if v.ndim == 1:
-            v = np.expand_dims(v, 1)
-        if isinstance(v, (np.ndarray)):
-          # We fix the placeholder shape except the batch size.
-          # This is suboptimal, but it is the best we can do with the info
-          # we have. The user should call `model._set_inputs(placeholders)`
-          # to specify custom placeholders if the need arises.
-          shape = (None,) + v.shape[1:]
-          v = K.placeholder(shape=shape, name=k)
+      if isinstance(v, (list, float, int)):
+        v = np.asarray(v)
+        if v.ndim == 1:
+          v = np.expand_dims(v, 1)
+      if isinstance(v, (np.ndarray, ops.EagerTensor)):
+        # We fix the placeholder shape except the batch size.
+        # This is suboptimal, but it is the best we can do with the info
+        # we have. The user should call `model._set_inputs(placeholders)`
+        # to specify custom placeholders if the need arises.
+        shape = (None,) + tuple(v.shape[1:])
+        v = K.placeholder(shape=shape, name=k)
+      elif isinstance(v, tensor_shape.TensorShape):
+        shape = (None,) + tuple(v.as_list()[1:])
+        v = K.placeholder(shape=shape, name=k)
       self._flattened_inputs[i] = v
 
-    return self._get(return_single_as_list)
+    if self._is_dict:
+      return dict(zip(self._input_names, self._flattened_inputs))
+    if self._is_single_input and not return_single_as_list:
+      return self._flattened_inputs[0]
+    return self._flattened_inputs
 
   def as_dict(self):
     """An iterable over a dictionary version of inputs."""
@@ -1107,3 +1140,54 @@ class ModelInputs(object):
   def as_list(self):
     """Returning the inputs as a list."""
     return self._flattened_inputs
+
+
+# Allow use of methods not exposed to the user.
+# pylint: disable=protected-access
+def get_input_shape_and_dtype(layer):
+  """Retrieves input shape and input dtype of layer if applicable.
+
+  Args:
+    layer: Layer (or model) instance.
+
+  Returns:
+    Tuple (input_shape, input_dtype). Both could be None if the layer
+      does not have a defined input shape.
+
+  Raises:
+    ValueError: in case an empty Sequential or Graph Network is passed.
+  """
+
+  def _is_graph_model(layer):
+    return ((hasattr(layer, '_is_graph_network') and layer._is_graph_network) or
+            layer.__class__.__name__ == 'Sequential')
+
+  # In case of nested models: recover the first layer
+  # of the deepest model to infer input shape and dtype.
+  # Subclassed Models may not have been built so can't be checked.
+  while _is_graph_model(layer):
+    if not layer.layers:
+      raise ValueError('An empty Model cannot be used as a Layer.')
+    layer = layer.layers[0]
+
+  if hasattr(layer, '_batch_input_shape'):
+    return layer._batch_input_shape, layer.dtype
+  return None, None
+
+
+# pylint: enable=protected-access
+
+
+def get_static_batch_size(layer):
+  """Gets the static batch size of a Layer.
+
+  Arguments:
+    layer: a `Layer` instance.
+
+  Returns:
+    The static batch size of a Layer.
+  """
+  batch_input_shape, _ = get_input_shape_and_dtype(layer)
+  if batch_input_shape is not None:
+    return tensor_shape.as_dimension(batch_input_shape[0]).value
+  return None
diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py
index e777cb6db37716c14978fa2a6fc4cea20d799cb3..44ea23998fe6f3b614fb09b9667add179cf3fd85 100644
--- a/tensorflow/python/keras/engine/training_utils_test.py
+++ b/tensorflow/python/keras/engine/training_utils_test.py
@@ -21,173 +21,39 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.framework import test_util
-from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.platform import test
 
 
-class TrainingUtilTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_single_numpy(self):
-    batch_size = 2
-    a = np.ones([10, 10])
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_batch = a[:batch_size, :]
-    actual_batch, = iterator.get_next()
-    self.assertAllEqual(expected_batch, actual_batch)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_single_tensor(self):
-    batch_size = 2
-    a = ops.convert_to_tensor(np.ones([10, 10]))
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_batch = a[:batch_size, :]
-    actual_batch, = iterator.get_next()
-    self.assertAllEqual(expected_batch, actual_batch)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_y(self):
-    batch_size = 2
-    a = np.ones([10, 100])
-    b = np.ones([10, 10])
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, y=b, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_x = a[:batch_size, :]
-    expected_y = b[:batch_size, :]
-    actual_x, actual_y = iterator.get_next()
-    self.assertAllEqual(expected_x, actual_x)
-    self.assertAllEqual(expected_y, actual_y)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_sample_weights(self):
-    batch_size = 2
-    a = ops.convert_to_tensor(np.ones([10, 100]))
-    b = ops.convert_to_tensor(np.ones([10, 10]))
-    sw = ops.convert_to_tensor(np.ones([10]))
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, y=b, sample_weights=sw, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_x = a[:batch_size, :]
-    expected_y = b[:batch_size, :]
-    expected_sw = sw[:batch_size]
-    actual_x, actual_y, actual_sw = iterator.get_next()
-    self.assertAllEqual(expected_x, actual_x)
-    self.assertAllEqual(expected_y, actual_y)
-    self.assertAllEqual(expected_sw, actual_sw)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_nested(self):
-    batch_size = 2
-    x = {'1': np.ones([10, 100]), '2': [np.zeros([10, 10]), np.ones([10, 20])]}
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=x, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_x1 = x['1'][:batch_size, :]
-    expected_x2_0 = x['2'][0][:batch_size, :]
-    expected_x2_1 = x['2'][1][:batch_size, :]
-
-    actual_x, = iterator.get_next()
-    actual_x1 = actual_x['1'][:batch_size, :]
-    actual_x2_0 = actual_x['2'][0][:batch_size, :]
-    actual_x2_1 = actual_x['2'][1][:batch_size, :]
-
-    self.assertAllEqual(expected_x1, actual_x1)
-    self.assertAllEqual(expected_x2_0, actual_x2_0)
-    self.assertAllEqual(expected_x2_1, actual_x2_1)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_epochs(self):
-    batch_size = 2
-    a = np.ones([10, 10])
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, batch_size=batch_size, epochs=2)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_batch = a[:batch_size, :]
-    # loop through one whole epoch
-    for _ in range(6):
-      actual_batch, = iterator.get_next()
-    self.assertAllEqual(expected_batch, actual_batch)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_insufficient_info(self):
-    # with batch_size and steps_per_epoch not set
-    with self.assertRaises(ValueError):
-      a = np.ones([10, 10])
-      _ = training_utils.convert_to_iterator(x=a)
-
-  def test_nested_all(self):
-    nested_data = {'a': True, 'b': [True, True, (False, True)]}
-    all_true = training_utils._nested_all(nested_data, lambda x: x)
-    self.assertEquals(all_true, False)
-
-    nested_data = {'a': True, 'b': [True, True, (True, True)]}
-    all_true = training_utils._nested_all(nested_data, lambda x: x)
-    self.assertEquals(all_true, True)
-
-  def test_nested_any(self):
-    nested_data = [False, {'a': False, 'b': (False, True)}]
-    any_true = training_utils._nested_any(nested_data, lambda x: x)
-    self.assertEquals(any_true, True)
-
-    nested_data = [False, {'a': False, 'b': (False, False)}]
-    any_true = training_utils._nested_any(nested_data, lambda x: x)
-    self.assertEquals(any_true, False)
-
-
 class ModelInputsTest(test.TestCase):
 
   def test_single_thing(self):
     a = np.ones(10)
     model_inputs = training_utils.ModelInputs(a)
-    self.assertEquals(['input_1'], model_inputs.get_input_names())
-    vals = model_inputs.get_input_values()
-    self.assertAllEqual(np.ones(10), vals)
-    self.assertFalse(tensor_util.is_tensor(vals))
+    self.assertEqual(['input_1'], model_inputs.get_input_names())
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals))
     vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
-    self.assertEquals(1, len(vals))
+    self.assertEqual(1, len(vals))
     self.assertTrue(tensor_util.is_tensor(vals[0]))
 
   def test_single_thing_eager(self):
     with context.eager_mode():
       a = np.ones(10)
       model_inputs = training_utils.ModelInputs(a)
-      self.assertEquals(['input_1'], model_inputs.get_input_names())
-      vals = model_inputs.get_input_values()
-      self.assertAllEqual(np.ones(10), vals)
-      self.assertTrue(tensor_util.is_tensor(vals))
-      vals = model_inputs.get_symbolic_inputs()
-      self.assertTrue(isinstance(vals, base_layer.DeferredTensor))
+      self.assertEqual(['input_1'], model_inputs.get_input_names())
+      val = model_inputs.get_symbolic_inputs()
+      self.assertTrue(tf_utils.is_symbolic_tensor(val))
       vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
-      self.assertEquals(1, len(vals))
-      self.assertTrue(isinstance(vals[0], base_layer.DeferredTensor))
+      self.assertEqual(1, len(vals))
+      self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
 
   def test_list(self):
     a = [np.ones(10), np.ones(20)]
     model_inputs = training_utils.ModelInputs(a)
-    self.assertEquals(['input_1', 'input_2'], model_inputs.get_input_names())
-    vals = model_inputs.get_input_values()
-    self.assertEqual(2, len(vals))
-    self.assertAllEqual(np.ones(10), vals[0])
-    self.assertAllEqual(np.ones(20), vals[1])
-    self.assertFalse(tensor_util.is_tensor(vals[0]))
-    self.assertFalse(tensor_util.is_tensor(vals[1]))
+    self.assertEqual(['input_1', 'input_2'], model_inputs.get_input_names())
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals[0]))
     self.assertTrue(tensor_util.is_tensor(vals[1]))
@@ -196,26 +62,15 @@ class ModelInputsTest(test.TestCase):
     with context.eager_mode():
       a = [np.ones(10), np.ones(20)]
       model_inputs = training_utils.ModelInputs(a)
-      self.assertEquals(['input_1', 'input_2'], model_inputs.get_input_names())
-      vals = model_inputs.get_input_values()
-      self.assertEqual(2, len(vals))
-      self.assertAllEqual(np.ones(10), vals[0])
-      self.assertAllEqual(np.ones(20), vals[1])
-      self.assertTrue(tensor_util.is_tensor(vals[0]))
-      self.assertTrue(tensor_util.is_tensor(vals[1]))
+      self.assertEqual(['input_1', 'input_2'], model_inputs.get_input_names())
       vals = model_inputs.get_symbolic_inputs()
-      self.assertTrue(isinstance(vals[0], base_layer.DeferredTensor))
-      self.assertTrue(isinstance(vals[1], base_layer.DeferredTensor))
+      self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
+      self.assertTrue(tf_utils.is_symbolic_tensor(vals[1]))
 
   def test_dict(self):
     a = {'b': np.ones(10), 'a': np.ones(20)}
     model_inputs = training_utils.ModelInputs(a)
-    self.assertEquals(['a', 'b'], model_inputs.get_input_names())
-    vals = model_inputs.get_input_values()
-    self.assertAllEqual(np.ones(20), vals['a'])
-    self.assertAllEqual(np.ones(10), vals['b'])
-    self.assertFalse(tensor_util.is_tensor(vals['a']))
-    self.assertFalse(tensor_util.is_tensor(vals['b']))
+    self.assertEqual(['a', 'b'], model_inputs.get_input_names())
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals['a']))
     self.assertTrue(tensor_util.is_tensor(vals['b']))
@@ -224,15 +79,10 @@ class ModelInputsTest(test.TestCase):
     with context.eager_mode():
       a = {'b': np.ones(10), 'a': np.ones(20)}
       model_inputs = training_utils.ModelInputs(a)
-      self.assertEquals(['a', 'b'], model_inputs.get_input_names())
-      vals = model_inputs.get_input_values()
-      self.assertAllEqual(np.ones(20), vals['a'])
-      self.assertAllEqual(np.ones(10), vals['b'])
-      self.assertTrue(tensor_util.is_tensor(vals['a']))
-      self.assertTrue(tensor_util.is_tensor(vals['b']))
+      self.assertEqual(['a', 'b'], model_inputs.get_input_names())
       vals = model_inputs.get_symbolic_inputs()
-      self.assertTrue(isinstance(vals['a'], base_layer.DeferredTensor))
-      self.assertTrue(isinstance(vals['b'], base_layer.DeferredTensor))
+      self.assertTrue(tf_utils.is_symbolic_tensor(vals['a']))
+      self.assertTrue(tf_utils.is_symbolic_tensor(vals['b']))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/estimator/__init__.py b/tensorflow/python/keras/estimator/__init__.py
index b244beb5b58cf339a4687216b87418c88b953c17..dcd0600897005f1905b5f6b65cdc0f225172fa1b 100644
--- a/tensorflow/python/keras/estimator/__init__.py
+++ b/tensorflow/python/keras/estimator/__init__.py
@@ -24,23 +24,54 @@ from tensorflow.python.util.tf_export import tf_export
 # As long as you depend //third_party/py/tensorflow:tensorflow target
 # everything will work as normal.
 
-try:
-  from tensorflow.python.estimator import keras as keras_lib  # pylint: disable=g-import-not-at-top
-  model_to_estimator = tf_export('keras.estimator.model_to_estimator')(
-      keras_lib.model_to_estimator)
-except Exception:  # pylint: disable=broad-except
-
-  # pylint: disable=unused-argument
-  def stub_model_to_estimator(keras_model=None,
-                              keras_model_path=None,
-                              custom_objects=None,
-                              model_dir=None,
-                              config=None):
+
+# LINT.IfChange
+@tf_export('keras.estimator.model_to_estimator')
+def model_to_estimator(
+    keras_model=None,
+    keras_model_path=None,
+    custom_objects=None,
+    model_dir=None,
+    config=None):
+  """Constructs an `Estimator` instance from given keras model.
+
+  For usage example, please see:
+  [Creating estimators from Keras
+  Models](https://tensorflow.org/guide/estimators#model_to_estimator).
+
+  Args:
+    keras_model: A compiled Keras model object. This argument is mutually
+      exclusive with `keras_model_path`.
+    keras_model_path: Path to a compiled Keras model saved on disk, in HDF5
+      format, which can be generated with the `save()` method of a Keras model.
+      This argument is mutually exclusive with `keras_model`.
+    custom_objects: Dictionary for custom objects.
+    model_dir: Directory to save `Estimator` model parameters, graph, summary
+      files for TensorBoard, etc.
+    config: `RunConfig` to config `Estimator`.
+
+  Returns:
+    An Estimator from given keras model.
+
+  Raises:
+    ValueError: if neither keras_model nor keras_model_path was given.
+    ValueError: if both keras_model and keras_model_path was given.
+    ValueError: if the keras_model_path is a GCS URI.
+    ValueError: if keras_model has not been compiled.
+  """
+  try:
+    from tensorflow_estimator.python.estimator import keras as keras_lib  # pylint: disable=g-import-not-at-top
+  except ImportError:
     raise NotImplementedError(
         'tf.keras.estimator.model_to_estimator function not available in your '
         'installation.')
-  # pylint: enable=unused-argument
+  return keras_lib.model_to_estimator(
+      keras_model=keras_model,
+      keras_model_path=keras_model_path,
+      custom_objects=custom_objects,
+      model_dir=model_dir,
+      config=config)
+
+# LINT.ThenChange(//third_party/tensorflow_estimator/python/estimator/keras.py)
 
-  model_to_estimator = tf_export('keras.estimator.model_to_estimator')(
-      stub_model_to_estimator)
 
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index 2b758a98f30fee7cb9385db93a97e7a132c3b816..4f91bea1e331f0b52a4f34fc848b3d51509e1360 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.ops import init_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -38,6 +39,7 @@ class KerasInitializersTest(test.TestCase):
     output_2 = keras.backend.get_value(variable)
     self.assertAllClose(output, output_2, atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def test_uniform(self):
     tensor_shape = (9, 6, 7)
     with self.cached_session():
@@ -47,6 +49,7 @@ class KerasInitializersTest(test.TestCase):
                    tensor_shape,
                    target_mean=0., target_max=1, target_min=-1)
 
+  @test_util.run_deprecated_v1
   def test_normal(self):
     tensor_shape = (8, 12, 99)
     with self.cached_session():
@@ -54,6 +57,7 @@ class KerasInitializersTest(test.TestCase):
                    tensor_shape,
                    target_mean=0., target_std=1)
 
+  @test_util.run_deprecated_v1
   def test_truncated_normal(self):
     tensor_shape = (12, 99, 7)
     with self.cached_session():
@@ -69,6 +73,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.Constant(2), tensor_shape,
                    target_mean=2, target_max=2, target_min=2)
 
+  @test_util.run_deprecated_v1
   def test_lecun_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -77,6 +82,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_glorot_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -85,6 +91,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_he_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -93,6 +100,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.he_uniform(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_lecun_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -101,6 +109,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_glorot_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -109,6 +118,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_he_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -117,6 +127,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.he_normal(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_orthogonal(self):
     tensor_shape = (20, 20)
     with self.cached_session():
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index 3c0f73b1c3aab037164f612e0e9b3a2fc7b32385..c516514f63270a9507101209680c1be221ba3f99 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.layers import core as tf_core_layers
 from tensorflow.python.ops import nn
@@ -34,6 +35,7 @@ class KerasIntegrationTest(test.TestCase):
   def test_version(self):
     self.assertTrue(keras.__version__.endswith('-tf'))
 
+  @test_util.run_v1_only('b/120545219')
   def test_vector_classification_sequential(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -59,6 +61,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_deprecated_v1
   def test_vector_classification_functional(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -83,6 +86,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_deprecated_v1
   def test_temporal_classification_sequential(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -105,6 +109,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_deprecated_v1
   def test_temporal_classification_sequential_tf_rnn(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -129,6 +134,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_v1_only('b/120545219')
   def test_image_classification_sequential(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -163,6 +169,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_v1_only('b/120545219')
   def test_video_classification_functional(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -191,6 +198,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_v1_only('b/120545219')
   def test_vector_classification_shared_sequential(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
@@ -225,6 +233,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_v1_only('b/120545219')
   def test_vector_classification_shared_model(self):
     # Test that functional models that feature internal updates
     # and internal losses can be shared.
diff --git a/tensorflow/python/keras/keras_parameterized.py b/tensorflow/python/keras/keras_parameterized.py
new file mode 100644
index 0000000000000000000000000000000000000000..d76bbadeb3613a8e71b1a6fc313fb7e68630de93
--- /dev/null
+++ b/tensorflow/python/keras/keras_parameterized.py
@@ -0,0 +1,298 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for unit-testing Keras."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import functools
+import itertools
+import unittest
+
+from absl.testing import parameterized
+
+from tensorflow.python import keras
+from tensorflow.python import tf2
+from tensorflow.python.eager import context
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class TestCase(test.TestCase, parameterized.TestCase):
+
+  def tearDown(self):
+    keras.backend.clear_session()
+    super(TestCase, self).tearDown()
+
+
+# TODO(kaftan): Possibly enable 'subclass_custom_build' when tests begin to pass
+# it. Or perhaps make 'subclass' always use a custom build method.
+def run_with_all_model_types(
+    test_or_class=None,
+    exclude_models=None):
+  """Execute the decorated test with all Keras model types.
+
+  This decorator is intended to be applied either to individual test methods in
+  a `keras_parameterized.TestCase` class, or directly to a test class that
+  extends it. Doing so will cause the contents of the individual test
+  method (or all test methods in the class) to be executed multiple times - once
+  for each Keras model type.
+
+  The Keras model types are: ['functional', 'subclass', 'sequential']
+
+  Note: if stacking this decorator with absl.testing's parameterized decorators,
+  those should be at the bottom of the stack.
+
+  Various methods in `testing_utils` to get models will auto-generate a model
+  of the currently active Keras model type. This allows unittests to confirm
+  the equivalence between different Keras models.
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(testing_utils.KerasTestCase):
+
+    @testing_utils.run_with_all_model_types(
+      exclude_models = ['sequential'])
+    def test_foo(self):
+      model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test tries building a small mlp as both a functional model and as a
+  subclass model.
+
+  We can also annotate the whole class if we want this to apply to all tests in
+  the class:
+  ```python
+  @testing_utils.run_with_all_model_types(exclude_models = ['sequential'])
+  class MyTests(testing_utils.KerasTestCase):
+
+    def test_foo(self):
+      model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+
+  Args:
+    test_or_class: test method or class to be annotated. If None,
+      this method returns a decorator that can be applied to a test method or
+      test class. If it is not None this returns the decorator applied to the
+      test or class.
+    exclude_models: A collection of Keras model types to not run.
+      (May also be a single model type not wrapped in a collection).
+      Defaults to None.
+
+  Returns:
+    Returns a decorator that will run the decorated test method multiple times:
+    once for each desired Keras model type.
+
+  Raises:
+    ImportError: If abseil parameterized is not installed or not included as
+      a target dependency.
+  """
+  model_types = ['functional', 'subclass', 'sequential']
+  params = [('_%s' % model, model) for model in model_types
+            if model not in nest.flatten(exclude_models)]
+
+  def single_method_decorator(f):
+    """Decorator that constructs the test cases."""
+    # Use named_parameters so it can be individually run from the command line
+    @parameterized.named_parameters(*params)
+    @functools.wraps(f)
+    def decorated(self, model_type, *args, **kwargs):
+      """A run of a single test case w/ the specified model type."""
+      with testing_utils.model_type_scope(model_type):
+        f(self, *args, **kwargs)
+
+    return decorated
+
+  return _test_or_class_decorator(test_or_class, single_method_decorator)
+
+
+def run_all_keras_modes(
+    test_or_class=None,
+    config=None,
+    always_skip_v1=False):
+  """Execute the decorated test with all keras execution modes.
+
+  This decorator is intended to be applied either to individual test methods in
+  a `keras_parameterized.TestCase` class, or directly to a test class that
+  extends it. Doing so will cause the contents of the individual test
+  method (or all test methods in the class) to be executed multiple times -
+  once executing in legacy graph mode, once running eagerly and with
+  `should_run_eagerly` returning True, and once running eagerly with
+  `should_run_eagerly` returning False.
+
+  If Tensorflow v2 behavior is enabled, legacy graph mode will be skipped, and
+  the test will only run twice.
+
+  Note: if stacking this decorator with absl.testing's parameterized decorators,
+  those should be at the bottom of the stack.
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(testing_utils.KerasTestCase):
+
+    @testing_utils.run_all_keras_modes
+    def test_foo(self):
+      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics,
+                    run_eagerly=testing_utils.should_run_eagerly())
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test will try compiling & fitting the small functional mlp using all
+  three Keras execution modes.
+
+  Args:
+    test_or_class: test method or class to be annotated. If None,
+      this method returns a decorator that can be applied to a test method or
+      test class. If it is not None this returns the decorator applied to the
+      test or class.
+    config: An optional config_pb2.ConfigProto to use to configure the
+      session when executing graphs.
+    always_skip_v1: If True, does not try running the legacy graph mode even
+      when Tensorflow v2 behavior is not enabled.
+
+  Returns:
+    Returns a decorator that will run the decorated test method multiple times.
+
+  Raises:
+    ImportError: If abseil parameterized is not installed or not included as
+      a target dependency.
+  """
+  params = [('_v2_eager', 'v2_eager'),
+            ('_v2_function', 'v2_function')]
+  if not (always_skip_v1 or tf2.enabled()):
+    params.append(('_v1_graph', 'v1_graph'))
+
+  def single_method_decorator(f):
+    """Decorator that constructs the test cases."""
+
+    # Use named_parameters so it can be individually run from the command line
+    @parameterized.named_parameters(*params)
+    @functools.wraps(f)
+    def decorated(self, run_mode, *args, **kwargs):
+      """A run of a single test case w/ specified run mode."""
+      if run_mode == 'v1_graph':
+        with context.graph_mode(), testing_utils.run_eagerly_scope(False):
+          with self.test_session(use_gpu=True, config=config):
+            f(self, *args, **kwargs)
+      elif run_mode == 'v2_function':
+        with context.eager_mode():
+          with testing_utils.run_eagerly_scope(False):
+            f(self, *args, **kwargs)
+      elif run_mode == 'v2_eager':
+        with context.eager_mode():
+          with testing_utils.run_eagerly_scope(True):
+            f(self, *args, **kwargs)
+      else:
+        return ValueError('Unknown run mode %s' % run_mode)
+
+    return decorated
+
+  return _test_or_class_decorator(test_or_class, single_method_decorator)
+
+
+def _test_or_class_decorator(test_or_class, single_method_decorator):
+  """Decorate a test or class with a decorator intended for one method.
+
+  If the test_or_class is a class:
+    This will apply the decorator to all test methods in the class.
+
+  If the test_or_class is an iterable of already-parameterized test cases:
+    This will apply the decorator to all the cases, and then flatten the
+    resulting cross-product of test cases. This allows stacking the Keras
+    parameterized decorators w/ each other, and to apply them to test methods
+    that have already been marked with an absl parameterized decorator.
+
+  Otherwise, treat the obj as a single method and apply the decorator directly.
+
+  Args:
+    test_or_class: A test method (that may have already been decorated with a
+      parameterized decorator, or a test class that extends
+      keras_parameterized.TestCase
+    single_method_decorator:
+      A parameterized decorator intended for a single test method.
+  Returns:
+    The decorated result.
+  """
+  def _decorate_test_or_class(obj):
+    if isinstance(obj, collections.Iterable):
+      return itertools.chain.from_iterable(
+          single_method_decorator(method) for method in obj)
+    if isinstance(obj, type):
+      cls = obj
+      for name, value in cls.__dict__.copy().items():
+        if callable(value) and name.startswith(
+            unittest.TestLoader.testMethodPrefix):
+          setattr(cls, name, single_method_decorator(value))
+
+      cls = type(cls).__new__(type(cls), cls.__name__, cls.__bases__,
+                              cls.__dict__.copy())
+      return cls
+
+    return single_method_decorator(obj)
+
+  if test_or_class is not None:
+    return _decorate_test_or_class(test_or_class)
+
+  return _decorate_test_or_class
diff --git a/tensorflow/python/keras/keras_parameterized_test.py b/tensorflow/python/keras/keras_parameterized_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0ff40cfc7a17114fad20a51f29a6aed89b56015
--- /dev/null
+++ b/tensorflow/python/keras/keras_parameterized_test.py
@@ -0,0 +1,552 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras testing_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+from absl.testing import parameterized
+
+from tensorflow.python import keras
+from tensorflow.python import tf2
+from tensorflow.python.eager import context
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import googletest
+
+
+class KerasParameterizedTest(keras_parameterized.TestCase):
+
+  def test_run_with_all_model_types(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types
+      def testBody(self):
+        model_types.append(testing_utils.get_model_type())
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    e.testBody_functional()
+    e.testBody_subclass()
+    e.testBody_sequential()
+
+    self.assertLen(model_types, 3)
+    self.assertAllEqual(model_types, [
+        "functional",
+        "subclass",
+        "sequential"
+    ])
+
+    # Validate that the models are what they should be
+    self.assertTrue(models[0]._is_graph_network)
+    self.assertFalse(models[1]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+    self.assertNotIsInstance(models[1], keras.models.Sequential)
+    self.assertIsInstance(models[2], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 6)
+
+  def test_run_with_all_model_types_and_extra_params(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types
+      @parameterized.named_parameters(
+          [dict(testcase_name="_0", with_brackets=True),
+           dict(testcase_name="_1", with_brackets=False)])
+      def testBody(self, with_brackets):
+        with_brackets = "with_brackets" if with_brackets else "without_brackets"
+        model_types.append((with_brackets, testing_utils.get_model_type()))
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    e.testBody_0_functional()
+    e.testBody_0_subclass()
+    e.testBody_0_sequential()
+    e.testBody_1_functional()
+    e.testBody_1_subclass()
+    e.testBody_1_sequential()
+
+    self.assertLen(model_types, 6)
+    self.assertAllEqual(model_types, [
+        ("with_brackets", "functional"),
+        ("with_brackets", "subclass"),
+        ("with_brackets", "sequential"),
+        ("without_brackets", "functional"),
+        ("without_brackets", "subclass"),
+        ("without_brackets", "sequential"),
+    ])
+
+    # Validate that the models are what they should be
+    self.assertTrue(models[0]._is_graph_network)
+    self.assertFalse(models[1]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+    self.assertNotIsInstance(models[1], keras.models.Sequential)
+    self.assertIsInstance(models[2], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 12)
+
+  def test_run_with_all_model_types_exclude_one(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types(exclude_models="sequential")
+      def testBody(self):
+        model_types.append(testing_utils.get_model_type())
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    if hasattr(e, "testBody_functional"):
+      e.testBody_functional()
+    if hasattr(e, "testBody_subclass"):
+      e.testBody_subclass()
+    if hasattr(e, "testBody_sequential"):
+      e.testBody_sequential()
+
+    self.assertLen(model_types, 2)
+    self.assertAllEqual(model_types, [
+        "functional",
+        "subclass"
+    ])
+
+    # Validate that the models are what they should be
+    self.assertTrue(models[0]._is_graph_network)
+    self.assertFalse(models[1]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+    self.assertNotIsInstance(models[1], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 4)
+
+  def test_run_with_all_model_types_exclude_multiple(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types(
+          exclude_models=["sequential", "functional"])
+      def testBody(self):
+        model_types.append(testing_utils.get_model_type())
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    if hasattr(e, "testBody_functional"):
+      e.testBody_functional()
+    if hasattr(e, "testBody_subclass"):
+      e.testBody_subclass()
+    if hasattr(e, "testBody_sequential"):
+      e.testBody_sequential()
+
+    self.assertLen(model_types, 1)
+    self.assertAllEqual(model_types, [
+        "subclass"
+    ])
+
+    # Validate that the models are what they should be
+    self.assertFalse(models[0]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 2)
+
+  def test_run_all_keras_modes(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly))
+
+    e = ExampleTest()
+    if not tf2.enabled():
+      e.testBody_v1_graph()
+    e.testBody_v2_eager()
+    e.testBody_v2_function()
+
+    if not tf2.enabled():
+      self.assertLen(l, 3)
+      self.assertAllEqual(l, [
+          ("graph", False),
+          ("eager", True),
+          ("eager", False),
+      ])
+
+      ts = unittest.makeSuite(ExampleTest)
+      res = unittest.TestResult()
+      ts.run(res)
+      self.assertLen(l, 6)
+    else:
+      self.assertLen(l, 2)
+      self.assertAllEqual(l, [
+          ("eager", True),
+          ("eager", False),
+      ])
+
+      ts = unittest.makeSuite(ExampleTest)
+      res = unittest.TestResult()
+      ts.run(res)
+      self.assertLen(l, 4)
+
+  def test_run_all_keras_modes_extra_params(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      @parameterized.named_parameters(
+          [dict(testcase_name="_0", with_brackets=True),
+           dict(testcase_name="_1", with_brackets=False)])
+      def testBody(self, with_brackets):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        with_brackets = "with_brackets" if with_brackets else "without_brackets"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((with_brackets, mode, should_run_eagerly))
+
+    e = ExampleTest()
+    if not tf2.enabled():
+      e.testBody_0_v1_graph()
+      e.testBody_1_v1_graph()
+
+    e.testBody_0_v2_eager()
+    e.testBody_0_v2_function()
+    e.testBody_1_v2_eager()
+    e.testBody_1_v2_function()
+
+    expected_combinations = {
+        ("with_brackets", "eager", True),
+        ("with_brackets", "eager", False),
+        ("without_brackets", "eager", True),
+        ("without_brackets", "eager", False),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("with_brackets", "graph", False),
+          ("without_brackets", "graph", False),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_keras_modes_always_skip_v1(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly))
+
+    e = ExampleTest()
+    if hasattr(e, "testBody_v1_graph"):
+      e.testBody_v1_graph()
+    if hasattr(e, "testBody_v2_eager"):
+      e.testBody_v2_eager()
+    if hasattr(e, "testBody_v2_function"):
+      e.testBody_v2_function()
+
+    self.assertLen(l, 2)
+    self.assertEqual(set(l), {
+        ("eager", True),
+        ("eager", False),
+    })
+
+  def test_run_all_keras_modes_with_all_model_types(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types
+      @keras_parameterized.run_all_keras_modes
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_v2_eager_functional()
+    e.testBody_v2_function_functional()
+    e.testBody_v2_eager_sequential()
+    e.testBody_v2_function_sequential()
+    e.testBody_v2_eager_subclass()
+    e.testBody_v2_function_subclass()
+
+    if not tf2.enabled():
+      e.testBody_v1_graph_functional()
+      e.testBody_v1_graph_sequential()
+      e.testBody_v1_graph_subclass()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_model_types_with_all_keras_modes(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      @keras_parameterized.run_with_all_model_types
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_functional_v2_eager()
+    e.testBody_functional_v2_function()
+    e.testBody_sequential_v2_eager()
+    e.testBody_sequential_v2_function()
+    e.testBody_subclass_v2_eager()
+    e.testBody_subclass_v2_function()
+
+    if not tf2.enabled():
+      e.testBody_functional_v1_graph()
+      e.testBody_sequential_v1_graph()
+      e.testBody_subclass_v1_graph()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_keras_modes_with_all_model_types_annotate_class(self):
+    l = []
+
+    @keras_parameterized.run_with_all_model_types
+    @keras_parameterized.run_all_keras_modes
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @parameterized.named_parameters(dict(testcase_name="_arg",
+                                           arg=True))
+      def testBody(self, arg):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_arg_v2_eager_functional()
+    e.testBody_arg_v2_function_functional()
+    e.testBody_arg_v2_eager_sequential()
+    e.testBody_arg_v2_function_sequential()
+    e.testBody_arg_v2_eager_subclass()
+    e.testBody_arg_v2_function_subclass()
+
+    if not tf2.enabled():
+      e.testBody_arg_v1_graph_functional()
+      e.testBody_arg_v1_graph_sequential()
+      e.testBody_arg_v1_graph_subclass()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_keras_modes_with_all_model_types_annotate_class_2(self):
+    l = []
+
+    @keras_parameterized.run_with_all_model_types
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      @parameterized.named_parameters(dict(testcase_name="_arg",
+                                           arg=True))
+      def testBody(self, arg):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_arg_v2_eager_functional()
+    e.testBody_arg_v2_function_functional()
+    e.testBody_arg_v2_eager_sequential()
+    e.testBody_arg_v2_function_sequential()
+    e.testBody_arg_v2_eager_subclass()
+    e.testBody_arg_v2_function_subclass()
+
+    if not tf2.enabled():
+      e.testBody_arg_v1_graph_functional()
+      e.testBody_arg_v1_graph_sequential()
+      e.testBody_arg_v1_graph_subclass()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(dict(testcase_name="argument",
+                                       arg=True))
+  def test_run_all_keras_modes_extra_params_2(self, arg):
+    self.assertEqual(arg, True)
+
+  @keras_parameterized.run_with_all_model_types
+  @parameterized.named_parameters(dict(testcase_name="argument",
+                                       arg=True))
+  def test_run_with_all_model_types_extra_params_2(self, arg):
+    self.assertEqual(arg, True)
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index e3a686f45d92dde8ea90d496b3cb5099f6b84b58..df7571e5d5fc862c29016fc0e12d1d33059405ad 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 # pylint: disable=g-bad-import-order
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
-from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 
 # Advanced activations.
@@ -145,9 +145,11 @@ from tensorflow.python.keras.layers.recurrent import StackedRNNCells
 from tensorflow.python.keras.layers.recurrent import SimpleRNNCell
 from tensorflow.python.keras.layers.recurrent import GRUCell
 from tensorflow.python.keras.layers.recurrent import LSTMCell
+from tensorflow.python.keras.layers.recurrent import PeepholeLSTMCell
 from tensorflow.python.keras.layers.recurrent import SimpleRNN
 from tensorflow.python.keras.layers.recurrent import GRU
 from tensorflow.python.keras.layers.recurrent import LSTM
+from tensorflow.python.keras.layers.recurrent import UnifiedLSTM
 
 # Convolutional-recurrent layers.
 from tensorflow.python.keras.layers.convolutional_recurrent import ConvLSTM2D
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index a2385dfdbb253a1bbf42c58c31d35d8df05507f2..35ac7830b2e2f37ffc270227d44450d730a9149c 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -22,8 +22,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index c41087be0aa407eac7fe69f36101511789961b57..f32bb457c825d9769c6dccf625d9318c07843237 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -19,55 +19,52 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-class AdvancedActivationsTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class AdvancedActivationsTest(keras_parameterized.TestCase):
 
   def test_leaky_relu(self):
-    with self.cached_session():
-      for alpha in [0., .5, -1.]:
-        testing_utils.layer_test(keras.layers.LeakyReLU,
-                                 kwargs={'alpha': alpha},
-                                 input_shape=(2, 3, 4))
+    for alpha in [0., .5, -1.]:
+      testing_utils.layer_test(keras.layers.LeakyReLU,
+                               kwargs={'alpha': alpha},
+                               input_shape=(2, 3, 4))
 
   def test_prelu(self):
-    with self.cached_session():
-      testing_utils.layer_test(keras.layers.PReLU, kwargs={},
-                               input_shape=(2, 3, 4))
+    testing_utils.layer_test(keras.layers.PReLU, kwargs={},
+                             input_shape=(2, 3, 4))
 
   def test_prelu_share(self):
-    with self.cached_session():
-      testing_utils.layer_test(keras.layers.PReLU,
-                               kwargs={'shared_axes': 1},
-                               input_shape=(2, 3, 4))
+    testing_utils.layer_test(keras.layers.PReLU,
+                             kwargs={'shared_axes': 1},
+                             input_shape=(2, 3, 4))
 
   def test_elu(self):
-    with self.cached_session():
-      for alpha in [0., .5, -1.]:
-        testing_utils.layer_test(keras.layers.ELU,
-                                 kwargs={'alpha': alpha},
-                                 input_shape=(2, 3, 4))
+    for alpha in [0., .5, -1.]:
+      testing_utils.layer_test(keras.layers.ELU,
+                               kwargs={'alpha': alpha},
+                               input_shape=(2, 3, 4))
 
   def test_thresholded_relu(self):
-    with self.cached_session():
-      testing_utils.layer_test(keras.layers.ThresholdedReLU,
-                               kwargs={'theta': 0.5},
-                               input_shape=(2, 3, 4))
+    testing_utils.layer_test(keras.layers.ThresholdedReLU,
+                             kwargs={'theta': 0.5},
+                             input_shape=(2, 3, 4))
 
   def test_softmax(self):
-    with self.cached_session():
-      testing_utils.layer_test(keras.layers.Softmax,
-                               kwargs={'axis': 1},
-                               input_shape=(2, 3, 4))
+    testing_utils.layer_test(keras.layers.Softmax,
+                             kwargs={'axis': 1},
+                             input_shape=(2, 3, 4))
 
   def test_relu(self):
-    with self.cached_session():
-      testing_utils.layer_test(keras.layers.ReLU,
-                               kwargs={'max_value': 10},
-                               input_shape=(2, 3, 4))
-      x = keras.backend.ones((3, 4))
+    testing_utils.layer_test(keras.layers.ReLU,
+                             kwargs={'max_value': 10},
+                             input_shape=(2, 3, 4))
+    x = keras.backend.ones((3, 4))
+    if not context.executing_eagerly():
       # Test that we use `leaky_relu` when appropriate in graph mode.
       self.assertTrue(
           'LeakyRelu' in keras.layers.ReLU(negative_slope=0.2)(x).name)
@@ -79,10 +76,9 @@ class AdvancedActivationsTest(test.TestCase):
   def test_relu_with_invalid_arg(self):
     with self.assertRaisesRegexp(
         ValueError, 'max_value of Relu layer cannot be negative value: -10'):
-      with self.cached_session():
-        testing_utils.layer_test(keras.layers.ReLU,
-                                 kwargs={'max_value': -10},
-                                 input_shape=(2, 3, 4))
+      testing_utils.layer_test(keras.layers.ReLU,
+                               kwargs={'max_value': -10},
+                               input_shape=(2, 3, 4))
     with self.assertRaisesRegexp(
         ValueError,
         'negative_slope of Relu layer cannot be negative value: -2'):
diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index d00def07bbdc7ee3c77a0e2a336ccd6594fe2b51..6564d6e8fdba6d6f8b384b06125032d16f34e28a 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -26,8 +26,8 @@ from tensorflow.python.keras import backend
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 # imports for backwards namespace compatibility
 # pylint: disable=unused-import
 from tensorflow.python.keras.layers.pooling import AveragePooling1D
@@ -149,7 +149,7 @@ class Conv(Layer):
       channel_axis = 1
     else:
       channel_axis = -1
-    if input_shape[channel_axis].value is None:
+    if input_shape.dims[channel_axis].value is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
     input_dim = int(input_shape[channel_axis])
@@ -645,6 +645,14 @@ class Conv2DTranspose(Conv2D):
           Specifying any stride value != 1 is incompatible with specifying
           any `dilation_rate` value != 1.
       padding: one of `"valid"` or `"same"` (case-insensitive).
+      output_padding: An integer or tuple/list of 2 integers,
+          specifying the amount of padding along the height and width
+          of the output tensor.
+          Can be a single integer to specify the same value for all
+          spatial dimensions.
+          The amount of output padding along a given dimension must be
+          lower than the stride along that same dimension.
+          If set to `None` (default), the output shape is inferred.
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
@@ -700,7 +708,9 @@ class Conv2DTranspose(Conv2D):
                kernel_size,
                strides=(1, 1),
                padding='valid',
+               output_padding=None,
                data_format=None,
+               dilation_rate=(1, 1),
                activation=None,
                use_bias=True,
                kernel_initializer='glorot_uniform',
@@ -717,6 +727,7 @@ class Conv2DTranspose(Conv2D):
         strides=strides,
         padding=padding,
         data_format=data_format,
+        dilation_rate=dilation_rate,
         activation=activations.get(activation),
         use_bias=use_bias,
         kernel_initializer=initializers.get(kernel_initializer),
@@ -728,6 +739,16 @@ class Conv2DTranspose(Conv2D):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+    self.output_padding = output_padding
+    if self.output_padding is not None:
+      self.output_padding = conv_utils.normalize_tuple(
+          self.output_padding, 2, 'output_padding')
+      for stride, out_pad in zip(self.strides, self.output_padding):
+        if out_pad >= stride:
+          raise ValueError('Stride ' + str(self.strides) + ' must be '
+                           'greater than output padding ' +
+                           str(self.output_padding))
+
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if len(input_shape) != 4:
@@ -737,7 +758,7 @@ class Conv2DTranspose(Conv2D):
       channel_axis = 1
     else:
       channel_axis = -1
-    if input_shape[channel_axis].value is None:
+    if input_shape.dims[channel_axis].value is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
     input_dim = int(input_shape[channel_axis])
@@ -769,51 +790,50 @@ class Conv2DTranspose(Conv2D):
     inputs_shape = array_ops.shape(inputs)
     batch_size = inputs_shape[0]
     if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
+      h_axis, w_axis = 2, 3
     else:
-      c_axis, h_axis, w_axis = 3, 1, 2
+      h_axis, w_axis = 1, 2
 
     height, width = inputs_shape[h_axis], inputs_shape[w_axis]
     kernel_h, kernel_w = self.kernel_size
     stride_h, stride_w = self.strides
 
+    if self.output_padding is None:
+      out_pad_h = out_pad_w = None
+    else:
+      out_pad_h, out_pad_w = self.output_padding
+
     # Infer the dynamic output shape:
     out_height = conv_utils.deconv_output_length(height,
                                                  kernel_h,
-                                                 self.padding,
-                                                 stride_h)
+                                                 padding=self.padding,
+                                                 output_padding=out_pad_h,
+                                                 stride=stride_h,
+                                                 dilation=self.dilation_rate[0])
     out_width = conv_utils.deconv_output_length(width,
                                                 kernel_w,
-                                                self.padding,
-                                                stride_w)
+                                                padding=self.padding,
+                                                output_padding=out_pad_w,
+                                                stride=stride_w,
+                                                dilation=self.dilation_rate[1])
     if self.data_format == 'channels_first':
       output_shape = (batch_size, self.filters, out_height, out_width)
-      strides = (1, 1, stride_h, stride_w)
     else:
       output_shape = (batch_size, out_height, out_width, self.filters)
-      strides = (1, stride_h, stride_w, 1)
 
     output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv2d_transpose(
+    outputs = backend.conv2d_transpose(
         inputs,
         self.kernel,
         output_shape_tensor,
-        strides,
-        padding=self.padding.upper(),
-        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+        strides=self.strides,
+        padding=self.padding,
+        data_format=self.data_format,
+        dilation_rate=self.dilation_rate)
 
     if not context.executing_eagerly():
       # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
-                                                          kernel_h,
-                                                          self.padding,
-                                                          stride_h)
-      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
-                                                          kernel_w,
-                                                          self.padding,
-                                                          stride_w)
+      out_shape = self.compute_output_shape(inputs.shape)
       outputs.set_shape(out_shape)
 
     if self.use_bias:
@@ -837,13 +857,33 @@ class Conv2DTranspose(Conv2D):
     kernel_h, kernel_w = self.kernel_size
     stride_h, stride_w = self.strides
 
+    if self.output_padding is None:
+      out_pad_h = out_pad_w = None
+    else:
+      out_pad_h, out_pad_w = self.output_padding
+
     output_shape[c_axis] = self.filters
     output_shape[h_axis] = conv_utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
+        output_shape[h_axis],
+        kernel_h,
+        padding=self.padding,
+        output_padding=out_pad_h,
+        stride=stride_h,
+        dilation=self.dilation_rate[0])
     output_shape[w_axis] = conv_utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
+        output_shape[w_axis],
+        kernel_w,
+        padding=self.padding,
+        output_padding=out_pad_w,
+        stride=stride_w,
+        dilation=self.dilation_rate[1])
     return tensor_shape.TensorShape(output_shape)
 
+  def get_config(self):
+    config = super(Conv2DTranspose, self).get_config()
+    config['output_padding'] = self.output_padding
+    return config
+
 
 @tf_export('keras.layers.Conv3DTranspose',
            'keras.layers.Convolution3DTranspose')
@@ -878,6 +918,14 @@ class Conv3DTranspose(Conv3D):
           Specifying any stride value != 1 is incompatible with specifying
           any `dilation_rate` value != 1.
       padding: one of `"valid"` or `"same"` (case-insensitive).
+      output_padding: An integer or tuple/list of 3 integers,
+          specifying the amount of padding along the depth, height, and
+          width.
+          Can be a single integer to specify the same value for all
+          spatial dimensions.
+          The amount of output padding along a given dimension must be
+          lower than the stride along that same dimension.
+          If set to `None` (default), the output shape is inferred.
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
           The ordering of the dimensions in the inputs.
@@ -943,6 +991,7 @@ class Conv3DTranspose(Conv3D):
                kernel_size,
                strides=(1, 1, 1),
                padding='valid',
+               output_padding=None,
                data_format=None,
                activation=None,
                use_bias=True,
@@ -971,6 +1020,16 @@ class Conv3DTranspose(Conv3D):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+    self.output_padding = output_padding
+    if self.output_padding is not None:
+      self.output_padding = conv_utils.normalize_tuple(
+          self.output_padding, 3, 'output_padding')
+      for stride, out_pad in zip(self.strides, self.output_padding):
+        if out_pad >= stride:
+          raise ValueError('Stride ' + str(self.strides) + ' must be '
+                           'greater than output padding ' +
+                           str(self.output_padding))
+
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if len(input_shape) != 5:
@@ -980,7 +1039,7 @@ class Conv3DTranspose(Conv3D):
       channel_axis = 1
     else:
       channel_axis = -1
-    if input_shape[channel_axis].value is None:
+    if input_shape.dims[channel_axis].value is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined, found None: ' + str(input_shape))
     input_dim = int(input_shape[channel_axis])
@@ -1012,11 +1071,9 @@ class Conv3DTranspose(Conv3D):
     inputs_shape = array_ops.shape(inputs)
     batch_size = inputs_shape[0]
     if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+      d_axis, h_axis, w_axis = 2, 3, 4
     else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    self.input_spec = InputSpec(ndim=5, axes={c_axis: inputs_shape[c_axis]})
+      d_axis, h_axis, w_axis = 1, 2, 3
 
     depth = inputs_shape[d_axis]
     height = inputs_shape[h_axis]
@@ -1025,19 +1082,27 @@ class Conv3DTranspose(Conv3D):
     kernel_d, kernel_h, kernel_w = self.kernel_size
     stride_d, stride_h, stride_w = self.strides
 
+    if self.output_padding is None:
+      out_pad_d = out_pad_h = out_pad_w = None
+    else:
+      out_pad_d, out_pad_h, out_pad_w = self.output_padding
+
     # Infer the dynamic output shape:
     out_depth = conv_utils.deconv_output_length(depth,
                                                 kernel_d,
-                                                self.padding,
-                                                stride_d)
+                                                padding=self.padding,
+                                                output_padding=out_pad_d,
+                                                stride=stride_d)
     out_height = conv_utils.deconv_output_length(height,
                                                  kernel_h,
-                                                 self.padding,
-                                                 stride_h)
+                                                 padding=self.padding,
+                                                 output_padding=out_pad_h,
+                                                 stride=stride_h)
     out_width = conv_utils.deconv_output_length(width,
                                                 kernel_w,
-                                                self.padding,
-                                                stride_w)
+                                                padding=self.padding,
+                                                output_padding=out_pad_w,
+                                                stride=stride_w)
     if self.data_format == 'channels_first':
       output_shape = (batch_size, self.filters, out_depth, out_height,
                       out_width)
@@ -1058,20 +1123,7 @@ class Conv3DTranspose(Conv3D):
 
     if not context.executing_eagerly():
       # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[d_axis] = conv_utils.deconv_output_length(out_shape[d_axis],
-                                                          kernel_d,
-                                                          self.padding,
-                                                          stride_d)
-      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
-                                                          kernel_h,
-                                                          self.padding,
-                                                          stride_h)
-      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
-                                                          kernel_w,
-                                                          self.padding,
-                                                          stride_w)
+      out_shape = self.compute_output_shape(inputs.shape)
       outputs.set_shape(out_shape)
 
     if self.use_bias:
@@ -1109,15 +1161,38 @@ class Conv3DTranspose(Conv3D):
     kernel_d, kernel_h, kernel_w = self.kernel_size
     stride_d, stride_h, stride_w = self.strides
 
+    if self.output_padding is None:
+      out_pad_d = out_pad_h = out_pad_w = None
+    else:
+      out_pad_d, out_pad_h, out_pad_w = self.output_padding
+
     output_shape[c_axis] = self.filters
     output_shape[d_axis] = conv_utils.deconv_output_length(
-        output_shape[d_axis], kernel_d, self.padding, stride_d)
+        output_shape[d_axis],
+        kernel_d,
+        padding=self.padding,
+        output_padding=out_pad_d,
+        stride=stride_d)
     output_shape[h_axis] = conv_utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
+        output_shape[h_axis],
+        kernel_h,
+        padding=self.padding,
+        output_padding=out_pad_h,
+        stride=stride_h)
     output_shape[w_axis] = conv_utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
+        output_shape[w_axis],
+        kernel_w,
+        padding=self.padding,
+        output_padding=out_pad_w,
+        stride=stride_w)
     return tensor_shape.TensorShape(output_shape)
 
+  def get_config(self):
+    config = super(Conv3DTranspose, self).get_config()
+    config.pop('dilation_rate')
+    config['output_padding'] = self.output_padding
+    return config
+
 
 class SeparableConv(Conv):
   """Abstract base layer for separable nD convolution.
@@ -1238,7 +1313,7 @@ class SeparableConv(Conv):
       channel_axis = 1
     else:
       channel_axis = -1
-    if input_shape[channel_axis].value is None:
+    if input_shape.dims[channel_axis].value is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
     input_dim = int(input_shape[channel_axis])
@@ -1733,7 +1808,7 @@ class DepthwiseConv2D(Conv2D):
       channel_axis = 1
     else:
       channel_axis = 3
-    if input_shape[channel_axis] is None:
+    if input_shape.dims[channel_axis].value is None:
       raise ValueError('The channel dimension of the inputs to '
                        '`DepthwiseConv2D` '
                        'should be defined. Found `None`.')
@@ -1876,6 +1951,7 @@ class UpSampling2D(Layer):
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
+      interpolation: A string, one of `nearest` or `bilinear`.
 
   Input shape:
       4D tensor with shape:
@@ -1892,10 +1968,18 @@ class UpSampling2D(Layer):
           `(batch, channels, upsampled_rows, upsampled_cols)`
   """
 
-  def __init__(self, size=(2, 2), data_format=None, **kwargs):
+  def __init__(self,
+               size=(2, 2),
+               data_format=None,
+               interpolation='nearest',
+               **kwargs):
     super(UpSampling2D, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.size = conv_utils.normalize_tuple(size, 2, 'size')
+    if interpolation not in {'nearest', 'bilinear'}:
+      raise ValueError('`interpolation` argument should be one of `"nearest"` '
+                       'or `"bilinear"`.')
+    self.interpolation = interpolation
     self.input_spec = InputSpec(ndim=4)
 
   def compute_output_shape(self, input_shape):
@@ -1917,7 +2001,8 @@ class UpSampling2D(Layer):
 
   def call(self, inputs):
     return backend.resize_images(
-        inputs, self.size[0], self.size[1], self.data_format)
+        inputs, self.size[0], self.size[1], self.data_format,
+        interpolation=self.interpolation)
 
   def get_config(self):
     config = {'size': self.size, 'data_format': self.data_format}
diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py
index e61dd3043d96e69f76cb5bb041de304f5c1c2642..cf3861da21858d0ef0ab4e7567795edbf41635b8 100644
--- a/tensorflow/python/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent.py
@@ -26,8 +26,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.layers.recurrent import _generate_dropout_mask
 from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.layers.recurrent import RNN
@@ -391,10 +391,6 @@ class ConvRNN2D(RNN):
     else:
       output = last_output
 
-    # Properly set learning phase
-    if getattr(last_output, '_uses_learning_phase', False):
-      output._uses_learning_phase = True
-
     if self.return_state:
       if not isinstance(states, (list, tuple)):
         states = [states]
@@ -723,11 +719,6 @@ class ConvLSTM2DCell(Layer):
     c = f * c_tm1 + i * self.activation(x_c + h_c)
     o = self.recurrent_activation(x_o + h_o)
     h = o * self.activation(c)
-
-    if 0 < self.dropout + self.recurrent_dropout:
-      if training is None:
-        h._uses_learning_phase = True
-
     return h, [h, c]
 
   def input_conv(self, x, w, b=None, padding='valid'):
diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index cad5e4c8bd62c6cd5200cb2fd60a2eb2ad976f2d..d3339a8413095cae2b74e19d768fcda0e1b4e4fb 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class Convolution1DTest(test.TestCase):
 
   def _run_test(self, kwargs, arg, values):
@@ -39,13 +40,12 @@ class Convolution1DTest(test.TestCase):
     test_kwargs = copy.copy(kwargs)
     for value in values:
       test_kwargs[arg] = value
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.Conv1D,
             kwargs=test_kwargs,
             input_shape=(num_samples, length, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_conv1d(self):
     kwargs = {
         'filters': 2,
@@ -74,7 +74,7 @@ class Convolution1DTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv1D(**kwargs)
       layer.build((None, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -93,13 +93,14 @@ class Convolution1DTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv1D(**kwargs)
       layer.build((None, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class Conv2DTest(test.TestCase):
 
   def _run_test(self, kwargs, arg, values):
@@ -111,13 +112,12 @@ class Conv2DTest(test.TestCase):
     test_kwargs = copy.copy(kwargs)
     for value in values:
       test_kwargs[arg] = value
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.Conv2D,
             kwargs=test_kwargs,
             input_shape=(num_samples, num_row, num_col, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_conv2d(self):
     kwargs = {
         'filters': 2,
@@ -149,7 +149,7 @@ class Conv2DTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv2D(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -168,13 +168,14 @@ class Conv2DTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv2D(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class Conv2DTransposeTest(test.TestCase):
 
   def _run_test(self, kwargs, arg, values):
@@ -186,13 +187,12 @@ class Conv2DTransposeTest(test.TestCase):
     test_kwargs = copy.copy(kwargs)
     for value in values:
       test_kwargs[arg] = value
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.Conv2DTranspose,
             kwargs=test_kwargs,
             input_shape=(num_samples, num_row, num_col, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_conv2dtranspose(self):
     kwargs = {
         'filters': 2,
@@ -204,6 +204,9 @@ class Conv2DTransposeTest(test.TestCase):
     if test.is_gpu_available(cuda_only=True):
       self._run_test(kwargs, 'data_format', ['channels_first'])
 
+    kwargs['strides'] = (2, 2)
+    self._run_test(kwargs, 'output_padding', [(1, 1)])
+
   def test_conv2dtranspose_regularizers(self):
     kwargs = {
         'filters': 3,
@@ -214,7 +217,7 @@ class Conv2DTransposeTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv2DTranspose(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -233,13 +236,38 @@ class Conv2DTransposeTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv2DTranspose(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
-
+  def test_conv2d_transpose_dilation(self):
+    testing_utils.layer_test(keras.layers.Conv2DTranspose,
+                             kwargs={'filters': 2,
+                                     'kernel_size': 3,
+                                     'padding': 'same',
+                                     'data_format': 'channels_last',
+                                     'dilation_rate': (2, 2)},
+                             input_shape=(2, 5, 6, 3))
+
+    input_data = np.arange(48).reshape((1, 4, 4, 3)).astype(np.float32)
+    expected_output = np.float32([[192, 228, 192, 228],
+                                  [336, 372, 336, 372],
+                                  [192, 228, 192, 228],
+                                  [336, 372, 336, 372]]).reshape((1, 4, 4, 1))
+    testing_utils.layer_test(keras.layers.Conv2DTranspose,
+                             input_data=input_data,
+                             kwargs={'filters': 1,
+                                     'kernel_size': 3,
+                                     'padding': 'same',
+                                     'data_format': 'channels_last',
+                                     'dilation_rate': (2, 2),
+                                     'kernel_initializer': 'ones'},
+                             expected_output=expected_output)
+
+
+@tf_test_util.run_all_in_graph_and_eager_modes
 class Conv3DTransposeTest(test.TestCase):
 
   def _run_test(self, kwargs, arg, values):
@@ -252,13 +280,12 @@ class Conv3DTransposeTest(test.TestCase):
     test_kwargs = copy.copy(kwargs)
     for value in values:
       test_kwargs[arg] = value
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.Conv3DTranspose,
             kwargs=test_kwargs,
             input_shape=(num_samples, depth, num_row, num_col, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_conv3dtranspose(self):
     kwargs = {
         'filters': 2,
@@ -270,6 +297,9 @@ class Conv3DTransposeTest(test.TestCase):
     if test.is_gpu_available(cuda_only=True):
       self._run_test(kwargs, 'data_format', ['channels_first'])
 
+    kwargs['strides'] = (2, 2, 2)
+    self._run_test(kwargs, 'output_padding', [(1, 1, 1)])
+
   def test_conv3dtranspose_regularizers(self):
     kwargs = {
         'filters': 3,
@@ -280,7 +310,7 @@ class Conv3DTransposeTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv3DTranspose(**kwargs)
       layer.build((None, 5, 5, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -299,13 +329,14 @@ class Conv3DTransposeTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv3DTranspose(**kwargs)
       layer.build((None, 5, 5, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class SeparableConv1DTest(test.TestCase):
 
   def _run_test(self, kwargs, arg, values):
@@ -316,13 +347,12 @@ class SeparableConv1DTest(test.TestCase):
     test_kwargs = copy.copy(kwargs)
     for value in values:
       test_kwargs[arg] = value
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.SeparableConv1D,
             kwargs=test_kwargs,
             input_shape=(num_samples, length, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_separable_conv1d(self):
     kwargs = {
         'filters': 2,
@@ -352,7 +382,7 @@ class SeparableConv1DTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.SeparableConv1D(**kwargs)
       layer.build((None, 5, 2))
       self.assertEqual(len(layer.losses), 3)
@@ -373,7 +403,7 @@ class SeparableConv1DTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.SeparableConv1D(**kwargs)
       layer.build((None, 5, 2))
       self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
@@ -381,6 +411,7 @@ class SeparableConv1DTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class SeparableConv2DTest(test.TestCase):
 
   def _run_test(self, kwargs, arg, values):
@@ -392,13 +423,12 @@ class SeparableConv2DTest(test.TestCase):
     test_kwargs = copy.copy(kwargs)
     for value in values:
       test_kwargs[arg] = value
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.SeparableConv2D,
             kwargs=test_kwargs,
             input_shape=(num_samples, num_row, num_col, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_separable_conv2d(self):
     kwargs = {
         'filters': 2,
@@ -430,7 +460,7 @@ class SeparableConv2DTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.SeparableConv2D(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(len(layer.losses), 3)
@@ -451,7 +481,7 @@ class SeparableConv2DTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.SeparableConv2D(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
@@ -459,6 +489,7 @@ class SeparableConv2DTest(test.TestCase):
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class Conv3DTest(test.TestCase):
 
   def _run_test(self, kwargs, arg, values):
@@ -471,13 +502,12 @@ class Conv3DTest(test.TestCase):
     test_kwargs = copy.copy(kwargs)
     for value in values:
       test_kwargs[arg] = value
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.Conv3D,
             kwargs=test_kwargs,
             input_shape=(num_samples, depth, num_row, num_col, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_conv3d(self):
     kwargs = {
         'filters': 2,
@@ -500,7 +530,7 @@ class Conv3DTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv3D(**kwargs)
       layer.build((None, 5, 5, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -520,16 +550,16 @@ class Conv3DTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       layer = keras.layers.Conv3D(**kwargs)
       layer.build((None, 5, 5, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class ZeroPaddingTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_zero_padding_1d(self):
     num_samples = 2
     input_dim = 2
@@ -537,8 +567,8 @@ class ZeroPaddingTest(test.TestCase):
     shape = (num_samples, num_steps, input_dim)
     inputs = np.ones(shape)
 
-    # basic test
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
+      # basic test
       testing_utils.layer_test(
           keras.layers.ZeroPadding1D,
           kwargs={'padding': 2},
@@ -548,8 +578,7 @@ class ZeroPaddingTest(test.TestCase):
           kwargs={'padding': (1, 2)},
           input_shape=inputs.shape)
 
-    # correctness test
-    with self.test_session(use_gpu=True):
+      # correctness test
       layer = keras.layers.ZeroPadding1D(padding=2)
       layer.build(shape)
       output = layer(keras.backend.variable(inputs))
@@ -581,7 +610,6 @@ class ZeroPaddingTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.ZeroPadding1D(padding=None)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_zero_padding_2d(self):
     num_samples = 2
     stack_size = 2
@@ -592,7 +620,7 @@ class ZeroPaddingTest(test.TestCase):
       inputs = np.ones((num_samples, stack_size, input_num_row, input_num_col))
 
       # basic test
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.ZeroPadding2D,
             kwargs={'padding': (2, 2),
@@ -605,7 +633,7 @@ class ZeroPaddingTest(test.TestCase):
             input_shape=inputs.shape)
 
       # correctness test
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         layer = keras.layers.ZeroPadding2D(
             padding=(2, 2), data_format=data_format)
         layer.build(inputs.shape)
@@ -660,7 +688,6 @@ class ZeroPaddingTest(test.TestCase):
       with self.assertRaises(ValueError):
         keras.layers.ZeroPadding2D(padding=None)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_zero_padding_3d(self):
     num_samples = 2
     stack_size = 2
@@ -671,15 +698,14 @@ class ZeroPaddingTest(test.TestCase):
     inputs = np.ones((num_samples, input_len_dim1, input_len_dim2,
                       input_len_dim3, stack_size))
 
-    # basic test
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
+      # basic test
       testing_utils.layer_test(
           keras.layers.ZeroPadding3D,
           kwargs={'padding': (2, 2, 2)},
           input_shape=inputs.shape)
 
-    # correctness test
-    with self.test_session(use_gpu=True):
+      # correctness test
       layer = keras.layers.ZeroPadding3D(padding=(2, 2, 2))
       layer.build(inputs.shape)
       output = layer(keras.backend.variable(inputs))
@@ -700,15 +726,14 @@ class ZeroPaddingTest(test.TestCase):
       keras.layers.ZeroPadding3D(padding=None)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class UpSamplingTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_upsampling_1d(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       testing_utils.layer_test(
           keras.layers.UpSampling1D, kwargs={'size': 2}, input_shape=(3, 5, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_upsampling_2d(self):
     num_samples = 2
     stack_size = 2
@@ -724,7 +749,7 @@ class UpSamplingTest(test.TestCase):
                                 stack_size)
 
       # basic test
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.UpSampling2D,
             kwargs={'size': (2, 2),
@@ -758,7 +783,41 @@ class UpSamplingTest(test.TestCase):
 
             np.testing.assert_allclose(np_output, expected_out)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  def test_upsampling_2d_bilinear(self):
+    num_samples = 2
+    stack_size = 2
+    input_num_row = 11
+    input_num_col = 12
+    for data_format in ['channels_first', 'channels_last']:
+      if data_format == 'channels_first':
+        inputs = np.random.rand(num_samples, stack_size, input_num_row,
+                                input_num_col)
+      else:
+        inputs = np.random.rand(num_samples, input_num_row, input_num_col,
+                                stack_size)
+
+      testing_utils.layer_test(keras.layers.UpSampling2D,
+                               kwargs={'size': (2, 2),
+                                       'data_format': data_format,
+                                       'interpolation': 'bilinear'},
+                               input_shape=inputs.shape)
+
+      if not context.executing_eagerly():
+        for length_row in [2]:
+          for length_col in [2, 3]:
+            layer = keras.layers.UpSampling2D(
+                size=(length_row, length_col),
+                data_format=data_format)
+            layer.build(inputs.shape)
+            outputs = layer(keras.backend.variable(inputs))
+            np_output = keras.backend.eval(outputs)
+            if data_format == 'channels_first':
+              self.assertEqual(np_output.shape[2], length_row * input_num_row)
+              self.assertEqual(np_output.shape[3], length_col * input_num_col)
+            else:
+              self.assertEqual(np_output.shape[1], length_row * input_num_row)
+              self.assertEqual(np_output.shape[2], length_col * input_num_col)
+
   def test_upsampling_3d(self):
     num_samples = 2
     stack_size = 2
@@ -775,7 +834,7 @@ class UpSamplingTest(test.TestCase):
                                 input_len_dim3, stack_size)
 
       # basic test
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.UpSampling3D,
             kwargs={'size': (2, 2, 2),
@@ -816,16 +875,16 @@ class UpSamplingTest(test.TestCase):
               np.testing.assert_allclose(np_output, expected_out)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class CroppingTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_cropping_1d(self):
     num_samples = 2
     time_length = 4
     input_len_dim1 = 2
     inputs = np.random.rand(num_samples, time_length, input_len_dim1)
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       testing_utils.layer_test(
           keras.layers.Cropping1D,
           kwargs={'cropping': (2, 2)},
@@ -837,7 +896,6 @@ class CroppingTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.Cropping1D(cropping=None)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_cropping_2d(self):
     num_samples = 2
     stack_size = 2
@@ -852,15 +910,14 @@ class CroppingTest(test.TestCase):
       else:
         inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2,
                                 stack_size)
-      # basic test
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
+        # basic test
         testing_utils.layer_test(
             keras.layers.Cropping2D,
             kwargs={'cropping': cropping,
                     'data_format': data_format},
             input_shape=inputs.shape)
-      # correctness test
-      with self.test_session(use_gpu=True):
+        # correctness test
         layer = keras.layers.Cropping2D(
             cropping=cropping, data_format=data_format)
         layer.build(inputs.shape)
@@ -886,7 +943,7 @@ class CroppingTest(test.TestCase):
         inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2,
                                 stack_size)
       # another correctness test (no cropping)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         cropping = ((0, 0), (0, 0))
         layer = keras.layers.Cropping2D(
             cropping=cropping, data_format=data_format)
@@ -905,7 +962,6 @@ class CroppingTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.Cropping2D(cropping=None)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_cropping_3d(self):
     num_samples = 2
     stack_size = 2
@@ -923,7 +979,7 @@ class CroppingTest(test.TestCase):
           inputs = np.random.rand(num_samples, input_len_dim1, input_len_dim2,
                                   input_len_dim3, stack_size)
         # basic test
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           testing_utils.layer_test(
               keras.layers.Cropping3D,
               kwargs={'cropping': cropping,
@@ -932,7 +988,7 @@ class CroppingTest(test.TestCase):
 
         if len(croppings) == 3 and len(croppings[0]) == 2:
           # correctness test
-          with self.test_session(use_gpu=True):
+          with self.cached_session(use_gpu=True):
             layer = keras.layers.Cropping3D(
                 cropping=cropping, data_format=data_format)
             layer.build(inputs.shape)
@@ -961,6 +1017,7 @@ class CroppingTest(test.TestCase):
       keras.layers.Cropping3D(cropping=None)
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class DepthwiseConv2DTest(test.TestCase):
 
   def _run_test(self, kwargs, arg, values):
@@ -972,7 +1029,7 @@ class DepthwiseConv2DTest(test.TestCase):
     test_kwargs = copy.copy(kwargs)
     for value in values:
       test_kwargs[arg] = value
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         testing_utils.layer_test(
             keras.layers.DepthwiseConv2D,
             kwargs=test_kwargs,
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index efa21955e69bb6f8e52a64fdef2bd7fa70e8a4b0..854774c569e3c86d1665f39fcdec74960df2928b 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -34,8 +34,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
@@ -134,7 +134,6 @@ class Dropout(Layer):
     return nn_ops._get_noise_shape(inputs, self.noise_shape)  # pylint: disable=protected-access
 
   def call(self, inputs, training=None):
-    original_training_value = training
     if training is None:
       training = K.learning_phase()
 
@@ -145,9 +144,6 @@ class Dropout(Layer):
     output = tf_utils.smart_cond(training,
                                  dropped_inputs,
                                  lambda: array_ops.identity(inputs))
-    # EagerTensor object has no attribute _uses_learning_phase
-    if not context.executing_eagerly() and original_training_value is None:
-      output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
   def compute_output_shape(self, input_shape):
@@ -510,6 +506,9 @@ class Permute(Layer):
 class Flatten(Layer):
   """Flattens the input. Does not affect the batch size.
 
+  If inputs are shaped `(batch,)` without a channel dimension, then flattening
+  adds an extra channel dimension and output shapes are `(batch, 1)`.
+
   Arguments:
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
@@ -538,23 +537,27 @@ class Flatten(Layer):
   def __init__(self, data_format=None, **kwargs):
     super(Flatten, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
-    self.input_spec = InputSpec(min_ndim=2)
+    self.input_spec = InputSpec(min_ndim=1)
 
   def call(self, inputs):
-    if self.data_format == 'channels_first':
+    if (self.data_format == 'channels_first'
+        and K.ndim(inputs) is not None and K.ndim(inputs) > 1):
       permutation = [0]
       permutation.extend([i for i in
                           range(2, K.ndim(inputs))])
       permutation.append(1)
       inputs = array_ops.transpose(inputs, perm=permutation)
 
-    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
+    outputs = array_ops.reshape(
+        inputs, (inputs.shape[0].value or array_ops.shape(inputs)[0], -1))
     if not context.executing_eagerly():
       outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
     return outputs
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if not input_shape:
+      output_shape = tensor_shape.TensorShape([1])
     output_shape = [input_shape[0]]
     if all(input_shape[1:]):
       output_shape += [np.prod(input_shape[1:])]
@@ -929,14 +932,15 @@ class Dense(Layer):
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
-    if input_shape[-1].value is None:
+    if tensor_shape.dimension_value(input_shape[-1]) is None:
       raise ValueError('The last dimension of the inputs to `Dense` '
                        'should be defined. Found `None`.')
+    last_dim = tensor_shape.dimension_value(input_shape[-1])
     self.input_spec = InputSpec(min_ndim=2,
-                                axes={-1: input_shape[-1].value})
+                                axes={-1: last_dim})
     self.kernel = self.add_weight(
         'kernel',
-        shape=[input_shape[-1].value, self.units],
+        shape=[last_dim, self.units],
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
         constraint=self.kernel_constraint,
@@ -956,7 +960,7 @@ class Dense(Layer):
     self.built = True
 
   def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    inputs = ops.convert_to_tensor(inputs)
     rank = common_shapes.rank(inputs)
     if rank > 2:
       # Broadcasting is required for the inputs.
@@ -977,7 +981,7 @@ class Dense(Layer):
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     input_shape = input_shape.with_rank_at_least(2)
-    if input_shape[-1].value is None:
+    if tensor_shape.dimension_value(input_shape[-1]) is None:
       raise ValueError(
           'The innermost dimension of input_shape must be defined, but saw: %s'
           % input_shape)
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index f0fea1f65c091cfafc60f1ba2f4932b04bb877e5..f138adf76026b116b2a4d771e8ae90194e065bef 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import math_ops
@@ -148,6 +149,20 @@ class CoreLayersTest(test.TestCase):
         np.transpose(inputs, (0, 2, 3, 1)), (-1, 5 * 5 * 3))
     self.assertAllClose(outputs, target_outputs)
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_flatten_scalar_channels(self):
+    testing_utils.layer_test(
+        keras.layers.Flatten, kwargs={}, input_shape=(3,))
+
+    # Test channels_first
+    inputs = np.random.random((10,)).astype('float32')
+    outputs = testing_utils.layer_test(
+        keras.layers.Flatten,
+        kwargs={'data_format': 'channels_first'},
+        input_data=inputs)
+    target_outputs = np.expand_dims(inputs, -1)
+    self.assertAllClose(outputs, target_outputs)
+
   @tf_test_util.run_in_graph_and_eager_modes
   def test_repeat_vector(self):
     testing_utils.layer_test(
@@ -280,9 +295,7 @@ class CoreLayersTest(test.TestCase):
 
     l = keras.layers.Lambda(lambda_fn, output_shape=(None, 10))
     output_shape = l.compute_output_shape((5, 10, 20))
-    # Dimension(None) != Dimension(None), so check
-    # str representations for equality.
-    self.assertAllEqual(('5', '?', '10'), tuple([str(s) for s in output_shape]))
+    self.assertAllEqual([5, None, 10], output_shape.as_list())
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_lambda_output_shape_function_multiple_outputs(self):
@@ -310,5 +323,17 @@ class CoreLayersTest(test.TestCase):
 
       layer = keras.layers.Lambda.from_config(config)
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_numpy_inputs(self):
+    if context.executing_eagerly():
+      layer = keras.layers.RepeatVector(2)
+      x = np.ones((10, 10))
+      self.assertAllEqual(np.ones((10, 2, 10)), layer(x))
+
+      layer = keras.layers.Concatenate()
+      x, y = np.ones((10, 10)), np.ones((10, 10))
+      self.assertAllEqual(np.ones((10, 20)), layer([x, y]))
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
index 29a09a3d71239084e44295cc7102bed9520f6f15..16692753afbc83d55349f5b3843952f1b8c8d2bf 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -25,7 +25,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
+from tensorflow.python.keras.layers import recurrent
 from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
@@ -77,13 +78,9 @@ class _CuDNNRNN(RNN):
     self.constants_spec = None
     self._states = None
     self._num_constants = None
+    self._num_inputs = None
     self._vector_shape = constant_op.constant([-1])
 
-  def _canonical_to_params(self, weights, biases):
-    weights = [array_ops.reshape(x, self._vector_shape) for x in weights]
-    biases = [array_ops.reshape(x, self._vector_shape) for x in biases]
-    return array_ops.concat(weights + biases, axis=0)
-
   def call(self, inputs, mask=None, training=None, initial_state=None):
     if isinstance(mask, list):
       mask = mask[0]
@@ -278,7 +275,7 @@ class CuDNNGRU(_CuDNNRNN):
     input_h = initial_state[0]
     input_h = array_ops.expand_dims(input_h, axis=0)
 
-    params = self._canonical_to_params(
+    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, self.units:self.units * 2],
             self.kernel[:, :self.units],
@@ -295,7 +292,7 @@ class CuDNNGRU(_CuDNNRNN):
             self.bias[self.units * 3:self.units * 4],
             self.bias[self.units * 5:],
         ],
-    )
+        shape=self._vector_shape)
 
     outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
         inputs,
@@ -473,7 +470,7 @@ class CuDNNLSTM(_CuDNNRNN):
     input_h = array_ops.expand_dims(input_h, axis=0)
     input_c = array_ops.expand_dims(input_c, axis=0)
 
-    params = self._canonical_to_params(
+    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, :self.units],
             self.kernel[:, self.units:self.units * 2],
@@ -494,7 +491,7 @@ class CuDNNLSTM(_CuDNNRNN):
             self.bias[self.units * 6:self.units * 7],
             self.bias[self.units * 7:],
         ],
-    )
+        shape=self._vector_shape)
 
     outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
         inputs,
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
index 7becbfede1ac9c7ddcf812e845f6564a10c3d90b..cc93364aaec5dd0e09cb0e3f31a163f49c3f73c3 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
@@ -36,7 +36,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def test_cudnn_rnn_basics(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         input_size = 10
         timesteps = 6
         units = 2
@@ -64,7 +64,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def test_trainability(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         input_size = 10
         units = 2
         for layer_class in [keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM]:
@@ -88,7 +88,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
   )
   def test_regularizer(self, layer_class):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         input_size = 10
         timesteps = 6
         units = 2
@@ -120,7 +120,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
   )
   def test_return_state(self, layer_class):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         input_size = 10
         timesteps = 6
         units = 2
@@ -171,7 +171,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
   )
   def test_specify_initial_state_keras_tensor(self, layer_class):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         input_size = 10
         timesteps = 6
         units = 2
@@ -203,7 +203,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
   )
   def test_statefulness(self, layer_class):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         input_size = 10
         timesteps = 6
         units = 2
@@ -255,7 +255,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
                                              bidirectional, implementation,
                                              model_nest_level, model_type):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         input_size = 10
         timesteps = 6
         input_shape = (timesteps, input_size)
@@ -335,7 +335,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
     # Similar test as test_load_weights_between_noncudnn_rnn() but has different
     # rank of input due to usage of TimeDistributed. Issue: #10356.
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         input_size = 10
         steps = 6
         timesteps = 6
@@ -377,7 +377,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def test_cudnnrnn_bidirectional(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         rnn = keras.layers.CuDNNGRU
         samples = 2
         dim = 2
@@ -441,7 +441,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
     Should fail fast with an exception.
     """
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         input_shape = (3, 5)
 
         def gru(cudnn=False, **kwargs):
diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py
index 824a0b069edf7a4771f1a41ef1c4352f593f83b3..e8a8575705ab5c412ae4a793faaa89ef8918130c 100644
--- a/tensorflow/python/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/layers/embeddings.py
@@ -45,11 +45,11 @@ class Embedding(Layer):
     model = Sequential()
     model.add(Embedding(1000, 64, input_length=10))
     # the model will take as input an integer matrix of size (batch,
-    input_length).
+    # input_length).
     # the largest integer (i.e. word index) in the input should be no larger
-    than 999 (vocabulary size).
+    # than 999 (vocabulary size).
     # now model.output_shape == (None, 10, 64), where None is the batch
-    dimension.
+    # dimension.
 
     input_array = np.random.randint(1000, size=(32, 10))
 
@@ -82,10 +82,10 @@ class Embedding(Layer):
         (without it, the shape of the dense outputs cannot be computed).
 
   Input shape:
-      2D tensor with shape: `(batch_size, sequence_length)`.
+      2D tensor with shape: `(batch_size, input_length)`.
 
   Output shape:
-      3D tensor with shape: `(batch_size, sequence_length, output_dim)`.
+      3D tensor with shape: `(batch_size, input_length, output_dim)`.
 
   """
 
diff --git a/tensorflow/python/keras/layers/embeddings_test.py b/tensorflow/python/keras/layers/embeddings_test.py
index 2e42e403aa3815a8530b1755bb8b271a6fe3c96e..aaa17b7e96078dea9b84e0f0e62a4bdcbe071fa0 100644
--- a/tensorflow/python/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/layers/embeddings_test.py
@@ -69,16 +69,16 @@ class EmbeddingTest(test.TestCase):
         input_dtype='int32',
         expected_output_dtype='float32')
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_embedding_correctness(self):
-    with self.cached_session():
-      layer = keras.layers.Embedding(output_dim=2, input_dim=2)
-      layer.build((None, 2))
-      matrix = np.array([[1, 1], [2, 2]])
-      layer.set_weights([matrix])
+    layer = keras.layers.Embedding(output_dim=2, input_dim=2)
+    layer.build((None, 2))
+    matrix = np.array([[1, 1], [2, 2]])
+    layer.set_weights([matrix])
 
-      inputs = keras.backend.constant([[0, 1, 0]], dtype='int32')
-      outputs = keras.backend.eval(layer(inputs))
-      self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]])
+    inputs = keras.backend.constant([[0, 1, 0]], dtype='int32')
+    outputs = keras.backend.eval(layer(inputs))
+    self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]])
 
   @tf_test_util.run_in_graph_and_eager_modes()
   def test_eager_gpu_cpu(self):
diff --git a/tensorflow/python/keras/layers/local.py b/tensorflow/python/keras/layers/local.py
index 33d09a1660f662f00bbdb950e8071603a9849662..d2c4aaa125e7f1415c4e33224056c18418670769 100644
--- a/tensorflow/python/keras/layers/local.py
+++ b/tensorflow/python/keras/layers/local.py
@@ -23,8 +23,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
diff --git a/tensorflow/python/keras/layers/local_test.py b/tensorflow/python/keras/layers/local_test.py
index 8589b32b3c5bd942f0a78978e0ce3173c85950ac..e4f4d0a639a6bac4605b3f03e23c6f14a2fdaa88 100644
--- a/tensorflow/python/keras/layers/local_test.py
+++ b/tensorflow/python/keras/layers/local_test.py
@@ -27,40 +27,44 @@ from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-class LocallyConnectedLayersTest(test.TestCase):
+class LocallyConnected1DLayersTest(test.TestCase):
+  # TODO(fchollet): investigate why LocallyConnected1D
+  # fails inside a graph function in an eager context (fails with error
+  # "Incompatible shapes between op input and calculated input gradient").
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @tf_test_util.run_deprecated_v1
   def test_locallyconnected_1d(self):
-    num_samples = 2
-    num_steps = 8
-    input_dim = 5
-    filter_length = 3
-    filters = 4
-
-    for padding in ['valid', 'same']:
-      for strides in [1]:
-        if padding == 'same' and strides != 1:
-          continue
-        for data_format in ['channels_first', 'channels_last']:
-          for implementation in [1, 2]:
-            kwargs = {
-                'filters': filters,
-                'kernel_size': filter_length,
-                'padding': padding,
-                'strides': strides,
-                'data_format': data_format,
-                'implementation': implementation
-            }
+    with self.cached_session():
+      num_samples = 2
+      num_steps = 8
+      input_dim = 5
+      filter_length = 3
+      filters = 4
 
-            if padding == 'same' and implementation == 1:
-              self.assertRaises(ValueError,
-                                keras.layers.LocallyConnected1D,
-                                **kwargs)
-            else:
-              testing_utils.layer_test(
-                  keras.layers.LocallyConnected1D,
-                  kwargs=kwargs,
-                  input_shape=(num_samples, num_steps, input_dim))
+      for padding in ['valid', 'same']:
+        for strides in [1]:
+          if padding == 'same' and strides != 1:
+            continue
+          for data_format in ['channels_first', 'channels_last']:
+            for implementation in [1, 2]:
+              kwargs = {
+                  'filters': filters,
+                  'kernel_size': filter_length,
+                  'padding': padding,
+                  'strides': strides,
+                  'data_format': data_format,
+                  'implementation': implementation
+              }
+
+              if padding == 'same' and implementation == 1:
+                self.assertRaises(ValueError,
+                                  keras.layers.LocallyConnected1D,
+                                  **kwargs)
+              else:
+                testing_utils.layer_test(
+                    keras.layers.LocallyConnected1D,
+                    kwargs=kwargs,
+                    input_shape=(num_samples, num_steps, input_dim))
 
   def test_locallyconnected_1d_regularization(self):
     num_samples = 2
@@ -111,29 +115,65 @@ class LocallyConnectedLayersTest(test.TestCase):
               self.assertEqual(layer.kernel.constraint, k_constraint)
               self.assertEqual(layer.bias.constraint, b_constraint)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+
+class LocallyConnected2DLayersTest(test.TestCase):
+  # TODO(fchollet): investigate why LocallyConnected2D
+  # fails inside a graph function in an eager context (fails with error
+  # "Incompatible shapes between op input and calculated input gradient").
+
+  @tf_test_util.run_deprecated_v1
   def test_locallyconnected_2d(self):
-    num_samples = 8
-    filters = 3
-    stack_size = 4
-    num_row = 6
-    num_col = 10
+    with self.cached_session():
+      num_samples = 8
+      filters = 3
+      stack_size = 4
+      num_row = 6
+      num_col = 10
 
-    for padding in ['valid', 'same']:
-      for strides in [(1, 1), (2, 2)]:
-        for implementation in [1, 2]:
-          if padding == 'same' and strides != (1, 1):
-            continue
+      for padding in ['valid', 'same']:
+        for strides in [(1, 1), (2, 2)]:
+          for implementation in [1, 2]:
+            if padding == 'same' and strides != (1, 1):
+              continue
+
+            kwargs = {
+                'filters': filters,
+                'kernel_size': 3,
+                'padding': padding,
+                'kernel_regularizer': 'l2',
+                'bias_regularizer': 'l2',
+                'strides': strides,
+                'data_format': 'channels_last',
+                'implementation': implementation
+            }
+
+            if padding == 'same' and implementation == 1:
+              self.assertRaises(ValueError,
+                                keras.layers.LocallyConnected2D,
+                                **kwargs)
+            else:
+              testing_utils.layer_test(
+                  keras.layers.LocallyConnected2D,
+                  kwargs=kwargs,
+                  input_shape=(num_samples, num_row, num_col, stack_size))
 
+  @tf_test_util.run_deprecated_v1
+  def test_locallyconnected_2d_channels_first(self):
+    with self.cached_session():
+      num_samples = 8
+      filters = 3
+      stack_size = 4
+      num_row = 6
+      num_col = 10
+
+      for implementation in [1, 2]:
+        for padding in ['valid', 'same']:
           kwargs = {
               'filters': filters,
               'kernel_size': 3,
-              'padding': padding,
-              'kernel_regularizer': 'l2',
-              'bias_regularizer': 'l2',
-              'strides': strides,
-              'data_format': 'channels_last',
-              'implementation': implementation
+              'data_format': 'channels_first',
+              'implementation': implementation,
+              'padding': padding
           }
 
           if padding == 'same' and implementation == 1:
@@ -146,40 +186,12 @@ class LocallyConnectedLayersTest(test.TestCase):
                 kwargs=kwargs,
                 input_shape=(num_samples, num_row, num_col, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_locallyconnected_2d_channels_first(self):
-    num_samples = 8
-    filters = 3
-    stack_size = 4
-    num_row = 6
-    num_col = 10
-
-    for implementation in [1, 2]:
-      for padding in ['valid', 'same']:
-        kwargs = {
-            'filters': filters,
-            'kernel_size': 3,
-            'data_format': 'channels_first',
-            'implementation': implementation,
-            'padding': padding
-        }
-
-        if padding == 'same' and implementation == 1:
-          self.assertRaises(ValueError,
-                            keras.layers.LocallyConnected2D,
-                            **kwargs)
-        else:
-          testing_utils.layer_test(
-              keras.layers.LocallyConnected2D,
-              kwargs=kwargs,
-              input_shape=(num_samples, num_row, num_col, stack_size))
-
   def test_locallyconnected_2d_regularization(self):
-    num_samples = 8
+    num_samples = 2
     filters = 3
     stack_size = 4
     num_row = 6
-    num_col = 10
+    num_col = 7
     for implementation in [1, 2]:
       for padding in ['valid', 'same']:
         kwargs = {
@@ -220,63 +232,70 @@ class LocallyConnectedLayersTest(test.TestCase):
             self.assertEqual(layer.kernel.constraint, k_constraint)
             self.assertEqual(layer.bias.constraint, b_constraint)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_locallyconnected_implementation(self):
-    n_train = 4
-    n_classes = 3
-    n_epochs = 2
 
-    np.random.seed(1)
-    targets = np.random.randint(0, n_classes, (n_train,))
+class LocallyConnectedImplementationModeTest(test.TestCase):
 
-    for width in [1, 17]:
-      for height in [16]:
-        for filters in [2]:
-          for data_format in ['channels_first', 'channels_last']:
-            inputs = get_inputs(data_format, filters, height, n_train, width)
-
-            for kernel_x in [(3,)]:
-              for kernel_y in [()] if width == 1 else [(2,)]:
-                for stride_x in [(1,)]:
-                  for stride_y in [()] if width == 1 else [(3,)]:
-                    for layers in [2]:
-                      kwargs = {
-                          'layers': layers,
-                          'filters': filters,
-                          'kernel_size': kernel_x + kernel_y,
-                          'strides': stride_x + stride_y,
-                          'data_format': data_format,
-                          'n_classes': n_classes,
-                          'input_shape': inputs.shape
-                      }
-
-                      model_1 = get_model(implementation=1, **kwargs)
-                      model_2 = get_model(implementation=2, **kwargs)
-
-                      copy_model_weights(model_2, model_1)
-
-                      # Compare outputs at initialization.
-                      out_1 = model_1.call(inputs)
-                      out_2 = model_2.call(inputs)
-                      self.assertAllCloseAccordingToType(out_1, out_2,
-                                                         rtol=1e-5, atol=1e-5)
-
-                      # Train.
-                      model_1.fit(x=inputs,
-                                  y=targets,
-                                  epochs=n_epochs,
-                                  batch_size=n_train)
-
-                      model_2.fit(x=inputs,
-                                  y=targets,
-                                  epochs=n_epochs,
-                                  batch_size=n_train)
-
-                      # Compare outputs after a few training steps.
-                      out_1 = model_1.call(inputs)
-                      out_2 = model_2.call(inputs)
-                      self.assertAllCloseAccordingToType(out_1, out_2,
-                                                         rtol=1e-5, atol=1e-5)
+  @tf_test_util.run_deprecated_v1
+  def test_locallyconnected_implementation(self):
+    with self.cached_session():
+      num_samples = 4
+      num_classes = 3
+      num_epochs = 2
+
+      np.random.seed(1)
+      targets = np.random.randint(0, num_classes, (num_samples,))
+
+      for width in [1, 6]:
+        for height in [7]:
+          for filters in [2]:
+            for data_format in ['channels_first', 'channels_last']:
+              inputs = get_inputs(
+                  data_format, filters, height, num_samples, width)
+
+              for kernel_x in [(3,)]:
+                for kernel_y in [()] if width == 1 else [(2,)]:
+                  for stride_x in [(1,)]:
+                    for stride_y in [()] if width == 1 else [(3,)]:
+                      for layers in [2]:
+                        kwargs = {
+                            'layers': layers,
+                            'filters': filters,
+                            'kernel_size': kernel_x + kernel_y,
+                            'strides': stride_x + stride_y,
+                            'data_format': data_format,
+                            'num_classes': num_classes
+                        }
+                        model_1 = get_model(implementation=1, **kwargs)
+                        model_2 = get_model(implementation=2, **kwargs)
+
+                        # Build models.
+                        model_1.train_on_batch(inputs, targets)
+                        model_2.train_on_batch(inputs, targets)
+
+                        # Copy weights.
+                        copy_model_weights(model_2, model_1)
+
+                        # Compare outputs at initialization.
+                        out_1 = model_1.call(inputs)
+                        out_2 = model_2.call(inputs)
+                        self.assertAllCloseAccordingToType(out_1, out_2,
+                                                           rtol=1e-5, atol=1e-5)
+
+                        # Train.
+                        model_1.fit(x=inputs,
+                                    y=targets,
+                                    epochs=num_epochs,
+                                    batch_size=num_samples)
+                        model_2.fit(x=inputs,
+                                    y=targets,
+                                    epochs=num_epochs,
+                                    batch_size=num_samples)
+
+                        # Compare outputs after a few training steps.
+                        out_1 = model_1.call(inputs)
+                        out_2 = model_2.call(inputs)
+                        self.assertAllCloseAccordingToType(
+                            out_1, out_2, atol=2e-4)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_make_2d(self):
@@ -316,7 +335,7 @@ class LocallyConnectedLayersTest(test.TestCase):
       self.assertAllCloseAccordingToType(inputs_2d, inputs_2d_tf)
 
 
-def get_inputs(data_format, filters, height, n_train, width):
+def get_inputs(data_format, filters, height, num_samples, width):
   if data_format == 'channels_first':
     if width == 1:
       input_shape = (filters, height)
@@ -333,7 +352,7 @@ def get_inputs(data_format, filters, height, n_train, width):
     raise NotImplementedError(data_format)
 
   inputs = np.random.normal(0, 1,
-                            (n_train,) + input_shape).astype(np.float32)
+                            (num_samples,) + input_shape).astype(np.float32)
   return inputs
 
 
@@ -352,9 +371,8 @@ def get_model(implementation,
               kernel_size,
               strides,
               layers,
-              n_classes,
-              data_format,
-              input_shape):
+              num_classes,
+              data_format):
   model = keras.Sequential()
 
   if len(kernel_size) == 1:
@@ -377,13 +395,12 @@ def get_model(implementation,
         implementation=implementation))
 
   model.add(keras.layers.Flatten())
-  model.add(keras.layers.Dense(n_classes))
+  model.add(keras.layers.Dense(num_classes))
   model.compile(
       optimizer=RMSPropOptimizer(0.01),
       metrics=[keras.metrics.categorical_accuracy],
       loss=xent
   )
-  model.build(input_shape)
   return model
 
 
diff --git a/tensorflow/python/keras/layers/lstm_test.py b/tensorflow/python/keras/layers/lstm_test.py
index f5369153240780bcfbdd2a90207efccde856444b..aea426150260cf4c7b849b18319789eaf4f5da5a 100644
--- a/tensorflow/python/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_test.py
@@ -18,18 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+from tensorflow.python.training import gradient_descent
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-class LSTMLayerTest(test.TestCase):
+@tf_test_util.run_all_in_graph_and_eager_modes
+class LSTMLayerTest(test.TestCase, parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_return_sequences_LSTM(self):
     num_samples = 2
     timesteps = 3
@@ -54,9 +57,8 @@ class LSTMLayerTest(test.TestCase):
     layer = keras.layers.LSTM(units, return_sequences=True)
     model.add(layer)
     outputs = model.layers[-1].output
-    self.assertEquals(outputs.get_shape().as_list(), [None, timesteps, units])
+    self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_dynamic_behavior_LSTM(self):
     num_samples = 2
     timesteps = 3
@@ -70,7 +72,6 @@ class LSTMLayerTest(test.TestCase):
     y = np.random.random((num_samples, units))
     model.train_on_batch(x, y)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_dropout_LSTM(self):
     num_samples = 2
     timesteps = 3
@@ -83,133 +84,62 @@ class LSTMLayerTest(test.TestCase):
                 'recurrent_dropout': 0.1},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_implementation_mode_LSTM(self):
+  @parameterized.parameters([0, 1, 2])
+  def test_implementation_mode_LSTM(self, implementation_mode):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    for mode in [0, 1, 2]:
-      testing_utils.layer_test(
-          keras.layers.LSTM,
-          kwargs={'units': units,
-                  'implementation': mode},
-          input_shape=(num_samples, timesteps, embedding_dim))
-
-  def test_statefulness_LSTM(self):
-    num_samples = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 2
-    layer_class = keras.layers.LSTM
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Embedding(
-              4,
-              embedding_dim,
-              mask_zero=True,
-              input_length=timesteps,
-              batch_input_shape=(num_samples, timesteps)))
-      layer = layer_class(
-          units, return_sequences=False, stateful=True, weights=None)
-      model.add(layer)
-      model.compile(optimizer='sgd', loss='mse')
-      out1 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertEqual(out1.shape, (num_samples, units))
-
-      # train once so that the states change
-      model.train_on_batch(
-          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-      out2 = model.predict(np.ones((num_samples, timesteps)))
-
-      # if the state is not reset, output should be different
-      self.assertNotEqual(out1.max(), out2.max())
-
-      # check that output changes after states are reset
-      # (even though the model itself didn't change)
-      layer.reset_states()
-      out3 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out2.max(), out3.max())
-
-      # check that container-level reset_states() works
-      model.reset_states()
-      out4 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertAllClose(out3, out4, atol=1e-5)
-
-      # check that the call to `predict` updated the states
-      out5 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out4.max(), out5.max())
-
-      # Check masking
-      layer.reset_states()
-
-      left_padded_input = np.ones((num_samples, timesteps))
-      left_padded_input[0, :1] = 0
-      left_padded_input[1, :2] = 0
-      out6 = model.predict(left_padded_input)
-
-      layer.reset_states()
-
-      right_padded_input = np.ones((num_samples, timesteps))
-      right_padded_input[0, -1:] = 0
-      right_padded_input[1, -2:] = 0
-      out7 = model.predict(right_padded_input)
-
-      self.assertAllClose(out7, out6, atol=1e-5)
-
-  def test_regularizers_LSTM(self):
-    embedding_dim = 4
-    layer_class = keras.layers.LSTM
-    with self.cached_session():
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          recurrent_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l2',
-          activity_regularizer='l1')
-      layer.build((None, None, 2))
-      self.assertEqual(len(layer.losses), 3)
-      x = keras.backend.variable(np.ones((2, 3, 2)))
-      layer(x)
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
+    testing_utils.layer_test(
+        keras.layers.LSTM,
+        kwargs={'units': units,
+                'implementation': implementation_mode},
+        input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_constraints_LSTM(self):
     embedding_dim = 4
     layer_class = keras.layers.LSTM
-    with self.cached_session():
-      k_constraint = keras.constraints.max_norm(0.01)
-      r_constraint = keras.constraints.max_norm(0.01)
-      b_constraint = keras.constraints.max_norm(0.01)
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_constraint=k_constraint,
-          recurrent_constraint=r_constraint,
-          bias_constraint=b_constraint)
-      layer.build((None, None, embedding_dim))
-      self.assertEqual(layer.cell.kernel.constraint, k_constraint)
-      self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
-      self.assertEqual(layer.cell.bias.constraint, b_constraint)
-
-  @tf_test_util.run_in_graph_and_eager_modes
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
+
+  @tf_test_util.run_v1_only('b/120545219')
   def test_with_masking_layer_LSTM(self):
     layer_class = keras.layers.LSTM
-    with self.cached_session():
-      inputs = np.random.random((2, 3, 4))
-      targets = np.abs(np.random.random((2, 3, 5)))
-      targets /= targets.sum(axis=-1, keepdims=True)
-      model = keras.models.Sequential()
-      model.add(keras.layers.Masking(input_shape=(3, 4)))
-      model.add(layer_class(units=5, return_sequences=True, unroll=False))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=RMSPropOptimizer(0.01))
-      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  @tf_test_util.run_v1_only('b/120545219')
+  def test_masking_with_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    lstm_cells = [keras.layers.LSTMCell(10), keras.layers.LSTMCell(5)]
+    model.add(keras.layers.RNN(lstm_cells, return_sequences=True, unroll=False))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_LSTM(self):
     layer_class = keras.layers.LSTM
@@ -225,25 +155,25 @@ class LSTMLayerTest(test.TestCase):
     units = 3
     num_samples = 2
 
-    with self.cached_session():
-      # Test with Keras tensor
-      inputs = keras.Input((timesteps, embedding_dim))
-      initial_state = [keras.Input((units,)) for _ in range(num_states)]
-      layer = keras.layers.LSTM(units)
-      if len(initial_state) == 1:
-        output = layer(inputs, initial_state=initial_state[0])
-      else:
-        output = layer(inputs, initial_state=initial_state)
-      assert initial_state[0] in layer._inbound_nodes[0].input_tensors
-
-      model = keras.models.Model([inputs] + initial_state, output)
-      model.compile(loss='categorical_crossentropy', optimizer='adam')
-
-      inputs = np.random.random((num_samples, timesteps, embedding_dim))
-      initial_state = [np.random.random((num_samples, units))
-                       for _ in range(num_states)]
-      targets = np.random.random((num_samples, units))
-      model.train_on_batch([inputs] + initial_state, targets)
+    # Test with Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    layer = keras.layers.LSTM(units)
+    if len(initial_state) == 1:
+      output = layer(inputs, initial_state=initial_state[0])
+    else:
+      output = layer(inputs, initial_state=initial_state)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=adam.AdamOptimizer())
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [np.random.random((num_samples, units))
+                     for _ in range(num_states)]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
 
   def test_specify_initial_state_non_keras_tensor(self):
     num_states = 2
@@ -252,21 +182,21 @@ class LSTMLayerTest(test.TestCase):
     units = 3
     num_samples = 2
 
-    with self.cached_session():
-      # Test with non-Keras tensor
-      inputs = keras.Input((timesteps, embedding_dim))
-      initial_state = [keras.backend.random_normal_variable(
-          (num_samples, units), 0, 1)
-                       for _ in range(num_states)]
-      layer = keras.layers.LSTM(units)
-      output = layer(inputs, initial_state=initial_state)
+    # Test with non-Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.backend.random_normal_variable(
+        (num_samples, units), 0, 1)
+                     for _ in range(num_states)]
+    layer = keras.layers.LSTM(units)
+    output = layer(inputs, initial_state=initial_state)
 
-      model = keras.models.Model(inputs, output)
-      model.compile(loss='categorical_crossentropy', optimizer='adam')
+    model = keras.models.Model(inputs, output)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=adam.AdamOptimizer())
 
-      inputs = np.random.random((num_samples, timesteps, embedding_dim))
-      targets = np.random.random((num_samples, units))
-      model.train_on_batch(inputs, targets)
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch(inputs, targets)
 
   def test_reset_states_with_values(self):
     num_states = 2
@@ -275,29 +205,28 @@ class LSTMLayerTest(test.TestCase):
     units = 3
     num_samples = 2
 
-    with self.cached_session():
-      layer = keras.layers.LSTM(units, stateful=True)
-      layer.build((num_samples, timesteps, embedding_dim))
-      layer.reset_states()
-      assert len(layer.states) == num_states
-      assert layer.states[0] is not None
-      self.assertAllClose(
-          keras.backend.eval(layer.states[0]),
-          np.zeros(keras.backend.int_shape(layer.states[0])),
-          atol=1e-4)
-      state_shapes = [keras.backend.int_shape(state) for state in layer.states]
-      values = [np.ones(shape) for shape in state_shapes]
-      if len(values) == 1:
-        values = values[0]
-      layer.reset_states(values)
-      self.assertAllClose(
-          keras.backend.eval(layer.states[0]),
-          np.ones(keras.backend.int_shape(layer.states[0])),
-          atol=1e-4)
-
-      # Test with invalid data
-      with self.assertRaises(ValueError):
-        layer.reset_states([1] * (len(layer.states) + 1))
+    layer = keras.layers.LSTM(units, stateful=True)
+    layer.build((num_samples, timesteps, embedding_dim))
+    layer.reset_states()
+    assert len(layer.states) == num_states
+    assert layer.states[0] is not None
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.zeros(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+    state_shapes = [keras.backend.int_shape(state) for state in layer.states]
+    values = [np.ones(shape) for shape in state_shapes]
+    if len(values) == 1:
+      values = values[0]
+    layer.reset_states(values)
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.ones(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+
+    # Test with invalid data
+    with self.assertRaises(ValueError):
+      layer.reset_states([1] * (len(layer.states) + 1))
 
   def test_specify_state_with_masking(self):
     num_states = 2
@@ -306,21 +235,20 @@ class LSTMLayerTest(test.TestCase):
     units = 3
     num_samples = 2
 
-    with self.cached_session():
-      inputs = keras.Input((timesteps, embedding_dim))
-      _ = keras.layers.Masking()(inputs)
-      initial_state = [keras.Input((units,)) for _ in range(num_states)]
-      output = keras.layers.LSTM(units)(inputs, initial_state=initial_state)
+    inputs = keras.Input((timesteps, embedding_dim))
+    _ = keras.layers.Masking()(inputs)
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    output = keras.layers.LSTM(units)(inputs, initial_state=initial_state)
 
-      model = keras.models.Model([inputs] + initial_state, output)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=RMSPropOptimizer(0.01))
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(0.01))
 
-      inputs = np.random.random((num_samples, timesteps, embedding_dim))
-      initial_state = [np.random.random((num_samples, units))
-                       for _ in range(num_states)]
-      targets = np.random.random((num_samples, units))
-      model.train_on_batch([inputs] + initial_state, targets)
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [np.random.random((num_samples, units))
+                     for _ in range(num_states)]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
 
   def test_return_state(self):
     num_states = 2
@@ -329,17 +257,16 @@ class LSTMLayerTest(test.TestCase):
     units = 3
     num_samples = 2
 
-    with self.cached_session():
-      inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
-      layer = keras.layers.LSTM(units, return_state=True, stateful=True)
-      outputs = layer(inputs)
-      state = outputs[1:]
-      assert len(state) == num_states
-      model = keras.models.Model(inputs, state[0])
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = keras.layers.LSTM(units, return_state=True, stateful=True)
+    outputs = layer(inputs)
+    state = outputs[1:]
+    assert len(state) == num_states
+    model = keras.models.Model(inputs, state[0])
 
-      inputs = np.random.random((num_samples, timesteps, embedding_dim))
-      state = model.predict(inputs)
-      self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4)
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    state = model.predict(inputs)
+    self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4)
 
   def test_state_reuse(self):
     timesteps = 3
@@ -347,16 +274,15 @@ class LSTMLayerTest(test.TestCase):
     units = 3
     num_samples = 2
 
-    with self.cached_session():
-      inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
-      layer = keras.layers.LSTM(units, return_state=True, return_sequences=True)
-      outputs = layer(inputs)
-      output, state = outputs[0], outputs[1:]
-      output = keras.layers.LSTM(units)(output, initial_state=state)
-      model = keras.models.Model(inputs, output)
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = keras.layers.LSTM(units, return_state=True, return_sequences=True)
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.LSTM(units)(output, initial_state=state)
+    model = keras.models.Model(inputs, output)
 
-      inputs = np.random.random((num_samples, timesteps, embedding_dim))
-      outputs = model.predict(inputs)
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    outputs = model.predict(inputs)
 
   def test_initial_states_as_other_inputs(self):
     timesteps = 3
@@ -366,25 +292,111 @@ class LSTMLayerTest(test.TestCase):
     num_states = 2
     layer_class = keras.layers.LSTM
 
+    # Test with Keras tensor
+    main_inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    inputs = [main_inputs] + initial_state
+
+    layer = layer_class(units)
+    output = layer(inputs)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+
+    model = keras.models.Model(inputs, output)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=adam.AdamOptimizer())
+
+    main_inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [np.random.random((num_samples, units))
+                     for _ in range(num_states)]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([main_inputs] + initial_state, targets)
+
+
+class LSTMLayerGraphOnlyTest(test.TestCase):
+
+  @tf_test_util.run_v1_only('b/120545219')
+  def test_statefulness_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.LSTM
     with self.cached_session():
-      # Test with Keras tensor
-      main_inputs = keras.Input((timesteps, embedding_dim))
-      initial_state = [keras.Input((units,)) for _ in range(num_states)]
-      inputs = [main_inputs] + initial_state
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Embedding(
+              4,
+              embedding_dim,
+              mask_zero=True,
+              input_length=timesteps,
+              batch_input_shape=(num_samples, timesteps)))
+      layer = layer_class(
+          units, return_sequences=False, stateful=True, weights=None)
+      model.add(layer)
+      model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                    loss='mse')
+      out1 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertEqual(out1.shape, (num_samples, units))
+
+      # train once so that the states change
+      model.train_on_batch(
+          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+      out2 = model.predict(np.ones((num_samples, timesteps)))
+
+      # if the state is not reset, output should be different
+      self.assertNotEqual(out1.max(), out2.max())
+
+      # check that output changes after states are reset
+      # (even though the model itself didn't change)
+      layer.reset_states()
+      out3 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out2.max(), out3.max())
+
+      # check that container-level reset_states() works
+      model.reset_states()
+      out4 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertAllClose(out3, out4, atol=1e-5)
 
-      layer = layer_class(units)
-      output = layer(inputs)
-      assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+      # check that the call to `predict` updated the states
+      out5 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out4.max(), out5.max())
+
+      # Check masking
+      layer.reset_states()
+
+      left_padded_input = np.ones((num_samples, timesteps))
+      left_padded_input[0, :1] = 0
+      left_padded_input[1, :2] = 0
+      out6 = model.predict(left_padded_input)
+
+      layer.reset_states()
 
-      model = keras.models.Model(inputs, output)
-      model.compile(loss='categorical_crossentropy', optimizer='adam')
+      right_padded_input = np.ones((num_samples, timesteps))
+      right_padded_input[0, -1:] = 0
+      right_padded_input[1, -2:] = 0
+      out7 = model.predict(right_padded_input)
 
-      main_inputs = np.random.random((num_samples, timesteps, embedding_dim))
-      initial_state = [np.random.random((num_samples, units))
-                       for _ in range(num_states)]
-      targets = np.random.random((num_samples, units))
-      model.train_on_batch([main_inputs] + initial_state, targets)
+      self.assertAllClose(out7, out6, atol=1e-5)
 
+  @tf_test_util.run_deprecated_v1
+  def test_regularizers_LSTM(self):
+    embedding_dim = 4
+    layer_class = keras.layers.LSTM
+    with self.cached_session():
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_regularizer=keras.regularizers.l1(0.01),
+          recurrent_regularizer=keras.regularizers.l1(0.01),
+          bias_regularizer='l2',
+          activity_regularizer='l1')
+      layer.build((None, None, 2))
+      self.assertEqual(len(layer.losses), 3)
+      x = keras.backend.variable(np.ones((2, 3, 2)))
+      layer(x)
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py
index f295af3fe04d87d260e4f6a98762dcfb90883531..45e705c69606c4dd839429597aa9903a9442234a 100644
--- a/tensorflow/python/keras/layers/merge.py
+++ b/tensorflow/python/keras/layers/merge.py
@@ -212,7 +212,7 @@ class _Merge(Layer):
     if len(mask) != len(inputs):
       raise ValueError('The lists `inputs` and `mask` '
                        'should have the same length.')
-    if all([m is None for m in mask]):
+    if all(m is None for m in mask):
       return None
     masks = [array_ops.expand_dims(m, axis=0) for m in mask if m is not None]
     return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
@@ -378,7 +378,7 @@ class Concatenate(_Merge):
     if not isinstance(input_shape, list) or len(input_shape) < 2:
       raise ValueError('A `Concatenate` layer should be called '
                        'on a list of at least 2 inputs')
-    if all([shape is None for shape in input_shape]):
+    if all(shape is None for shape in input_shape):
       return
     reduced_inputs_shapes = [list(shape) for shape in input_shape]
     shape_set = set()
@@ -418,7 +418,7 @@ class Concatenate(_Merge):
     if len(mask) != len(inputs):
       raise ValueError('The lists `inputs` and `mask` '
                        'should have the same length.')
-    if all([m is None for m in mask]):
+    if all(m is None for m in mask):
       return None
     # Make a list of masks while making sure
     # the dimensionality of each mask
diff --git a/tensorflow/python/keras/layers/merge_test.py b/tensorflow/python/keras/layers/merge_test.py
index 7bcfcaeddb0b1d8cb6363da456f821dad5b8233a..fcb161ae20a4caeaa9514477529c2885d6e5bd41 100644
--- a/tensorflow/python/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/layers/merge_test.py
@@ -26,9 +26,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class MergeLayersTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_merge_add(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
@@ -45,25 +45,6 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 4, 5))
     self.assertAllClose(out, x1 + x2 + x3, atol=1e-4)
 
-  def test_merge_add_masking(self):
-    with self.cached_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      m1 = keras.layers.Masking()(i1)
-      layer = keras.layers.Add()
-      o = layer([m1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
-      mask = layer.output_mask
-      self.assertListEqual(mask.get_shape().as_list(), [None, 4])
-
-  def test_merge_add_dynamic_shape(self):
-    with self.cached_session():
-      i1 = array_ops.placeholder(shape=(4, None), dtype='float32')
-      i2 = array_ops.placeholder(shape=(4, 5), dtype='float32')
-      layer = keras.layers.Add()
-      o = layer([i1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [4, 5])
-
   def test_merge_elementwise_errors(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 6))
@@ -76,7 +57,6 @@ class MergeLayersTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.add([i1])
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_merge_multiply(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
@@ -92,7 +72,6 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 4, 5))
     self.assertAllClose(out, x1 * x2 * x3, atol=1e-4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_merge_average(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
@@ -106,7 +85,6 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 4, 5))
     self.assertAllClose(out, 0.5 * (x1 + x2), atol=1e-4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_merge_maximum(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
@@ -120,7 +98,6 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 4, 5))
     self.assertAllClose(out, np.maximum(x1, x2), atol=1e-4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_merge_minimum(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
@@ -134,7 +111,6 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 4, 5))
     self.assertAllClose(out, np.minimum(x1, x2), atol=1e-4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_merge_concatenate(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
@@ -148,17 +124,6 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 8, 5))
     self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4)
 
-  def test_merge_concatenate_masking(self):
-    with self.cached_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      m1 = keras.layers.Masking()(i1)
-      layer = keras.layers.Concatenate()
-      o = layer([m1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 10])
-      mask = layer.output_mask
-      self.assertListEqual(mask.get_shape().as_list(), [None, 4])
-
   def test_concatenate_errors(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(3, 5))
@@ -169,7 +134,6 @@ class MergeLayersTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'called on a list'):
       keras.layers.concatenate([i1], axis=-1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_merge_dot(self):
     i1 = keras.layers.Input(shape=(4,))
     i2 = keras.layers.Input(shape=(4,))
@@ -215,7 +179,6 @@ class MergeLayersTest(test.TestCase):
       dot = keras.layers.Dot(1)
       dot.compute_output_shape(1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_merge_subtract(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
@@ -231,5 +194,39 @@ class MergeLayersTest(test.TestCase):
       keras.layers.subtract([i1, i1, i1])
 
 
+class MergeLayersGraphOnlyTest(test.TestCase):
+
+  def test_merge_add_masking(self):
+    with self.cached_session():
+      i1 = keras.layers.Input(shape=(4, 5))
+      i2 = keras.layers.Input(shape=(4, 5))
+      m1 = keras.layers.Masking()(i1)
+      layer = keras.layers.Add()
+      o = layer([m1, i2])
+      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+      mask = layer.output_mask
+      self.assertListEqual(mask.get_shape().as_list(), [None, 4])
+
+  @tf_test_util.run_deprecated_v1
+  def test_merge_add_dynamic_shape(self):
+    with self.cached_session():
+      i1 = array_ops.placeholder(shape=(4, None), dtype='float32')
+      i2 = array_ops.placeholder(shape=(4, 5), dtype='float32')
+      layer = keras.layers.Add()
+      o = layer([i1, i2])
+      self.assertListEqual(o.get_shape().as_list(), [4, 5])
+
+  def test_merge_concatenate_masking(self):
+    with self.cached_session():
+      i1 = keras.layers.Input(shape=(4, 5))
+      i2 = keras.layers.Input(shape=(4, 5))
+      m1 = keras.layers.Masking()(i1)
+      layer = keras.layers.Concatenate()
+      o = layer([m1, i2])
+      self.assertListEqual(o.get_shape().as_list(), [None, 4, 10])
+      mask = layer.output_mask
+      self.assertListEqual(mask.get_shape().as_list(), [None, 4])
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/noise_test.py b/tensorflow/python/keras/layers/noise_test.py
index cea304680be885d76a849328df432ae66669bc48..325dd933b21bd4182fcd8c20493acba70834383f 100644
--- a/tensorflow/python/keras/layers/noise_test.py
+++ b/tensorflow/python/keras/layers/noise_test.py
@@ -24,23 +24,21 @@ from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class NoiseLayersTest(test.TestCase):
 
   def test_GaussianNoise(self):
-    with self.cached_session():
-      testing_utils.layer_test(
-          keras.layers.GaussianNoise,
-          kwargs={'stddev': 1.},
-          input_shape=(3, 2, 3))
+    testing_utils.layer_test(
+        keras.layers.GaussianNoise,
+        kwargs={'stddev': 1.},
+        input_shape=(3, 2, 3))
 
   def test_GaussianDropout(self):
-    with self.cached_session():
-      testing_utils.layer_test(
-          keras.layers.GaussianDropout,
-          kwargs={'rate': 0.5},
-          input_shape=(3, 2, 3))
+    testing_utils.layer_test(
+        keras.layers.GaussianDropout,
+        kwargs={'rate': 0.5},
+        input_shape=(3, 2, 3))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_AlphaDropout(self):
     testing_utils.layer_test(
         keras.layers.AlphaDropout,
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index 013d57208883b777a5006e5b2fb84673118f6dd3..75b10222edd19ea59361d1312ead727e02431cac 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -18,6 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
+
+from tensorflow.python import tf2
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -26,8 +30,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -36,12 +40,11 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.layers.BatchNormalization')
-class BatchNormalization(Layer):
+@tf_export('keras.layers.BatchNormalization', v1=[])
+class BatchNormalizationV2(Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
   Normalize the activations of the previous layer at each batch,
@@ -84,8 +87,10 @@ class BatchNormalization(Layer):
       and should be neither too small (which would add noise) nor too large
       (which would give stale estimates). Note that `momentum` is still applied
       to get the means and variances for inference.
-    fused: if `None` or `True`, use a faster, fused implementation if possible.
-      If `False`, use the system recommended implementation.
+    fused: if `True`, use a faster, fused implementation, or raise a ValueError
+      if the fused implementation cannot be used. If `None`, use the faster
+      implementation if possible. If False, do not used the fused
+      implementation.
     trainable: Boolean, if `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
@@ -120,6 +125,9 @@ class BatchNormalization(Layer):
         Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
   """
 
+  # The BatchNormalizationV1 subclass sets this to False to use the V1 behavior.
+  _USE_V2_BEHAVIOR = True
+
   def __init__(self,
                axis=-1,
                momentum=0.99,
@@ -143,12 +151,15 @@ class BatchNormalization(Layer):
                adjustment=None,
                name=None,
                **kwargs):
-    super(BatchNormalization, self).__init__(
+    super(BatchNormalizationV2, self).__init__(
         name=name, trainable=trainable, **kwargs)
     if isinstance(axis, list):
       self.axis = axis[:]
-    else:
+    elif isinstance(axis, int):
       self.axis = axis
+    else:
+      raise TypeError('axis must be int or list, type given: %s'
+                      % type(self.axis))
     self.momentum = momentum
     self.epsilon = epsilon
     self.center = center
@@ -165,7 +176,14 @@ class BatchNormalization(Layer):
     self.renorm = renorm
     self.virtual_batch_size = virtual_batch_size
     self.adjustment = adjustment
-    if fused is None:
+    if self._USE_V2_BEHAVIOR:
+      if fused:
+        self._raise_if_fused_cannot_be_used()
+      # We leave fused as None if self._fused_can_be_used()==True, since we
+      # still may set it to False in self.build() if the input rank is not 4.
+      elif fused is None and not self._fused_can_be_used():
+        fused = False
+    elif fused is None:
       fused = True
     self.supports_masking = True
 
@@ -181,6 +199,38 @@ class BatchNormalization(Layer):
       self.renorm_clipping = renorm_clipping
       self.renorm_momentum = renorm_momentum
 
+  def _raise_if_fused_cannot_be_used(self):
+    """Raises a ValueError if fused implementation cannot be used.
+
+    In addition to the checks done in this function, the input tensors rank must
+    be 4. The input rank check can only be done once the input shape is known.
+    """
+    # Currently fused batch norm doesn't support renorm. It also only supports a
+    # channel dimension on axis 1 or 3, when no virtual batch size or adjustment
+    # is used.
+    if self.renorm:
+      raise ValueError('Passing both fused=True and renorm=True is '
+                       'unsupported')
+    axis = [self.axis] if isinstance(self.axis, int) else self.axis
+    # Axis -3 is equivalent to 1, and axis -1 is equivalent to 3, because the
+    # input rank is required to be 4 (which is checked later).
+    if len(axis) > 1 or axis[0] not in (-3, -1, 1, 3):
+      raise ValueError('Passing fused=True is only supported when axis is 1 '
+                       'or 3')
+    if self.virtual_batch_size is not None:
+      raise ValueError('Passing fused=True is unsupported when '
+                       'virtual_batch_size is specified.')
+    if self.adjustment is not None:
+      raise ValueError('Passing fused=True is unsupported when '
+                       'adjustment is specified.')
+
+  def _fused_can_be_used(self):
+    try:
+      self._raise_if_fused_cannot_be_used()
+      return True
+    except ValueError:
+      return False
+
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if not input_shape.ndims:
@@ -191,10 +241,6 @@ class BatchNormalization(Layer):
     if isinstance(self.axis, int):
       self.axis = [self.axis]
 
-    if not isinstance(self.axis, list):
-      raise TypeError('axis must be int or list, type given: %s'
-                      % type(self.axis))
-
     for idx, x in enumerate(self.axis):
       if x < 0:
         self.axis[idx] = ndims + x
@@ -219,16 +265,18 @@ class BatchNormalization(Layer):
         raise ValueError('When using virtual_batch_size, adjustment cannot '
                          'be specified')
 
-    if self.fused:
-      # Currently fused batch norm doesn't support renorm. It also only supports
-      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
+    if self.fused in (None, True):
       # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
       # output back to its original shape accordingly.
-      self.fused = (not self.renorm and
-                    ndims == 4 and
-                    self.axis in [[1], [3]] and
-                    self.virtual_batch_size is None and
-                    self.adjustment is None)
+      if self._USE_V2_BEHAVIOR:
+        if self.fused is None:
+          self.fused = (ndims == 4)
+        elif self.fused and ndims != 4:
+          raise ValueError('Batch normalization layers with fused=True only '
+                           'support 4D input tensors.')
+      else:
+        assert self.fused is not None
+        self.fused = (ndims == 4 and self._fused_can_be_used())
       # TODO(chrisying): fused batch norm is currently not supported for
       # multi-axis batch norm and by extension virtual batches. In some cases,
       # it might be possible to use fused batch norm but would require reshaping
@@ -251,7 +299,7 @@ class BatchNormalization(Layer):
     else:
       param_dtype = self.dtype or dtypes.float32
 
-    axis_to_dim = {x: input_shape[x].value for x in self.axis}
+    axis_to_dim = {x: input_shape.dims[x].value for x in self.axis}
     for x in axis_to_dim:
       if axis_to_dim[x] is None:
         raise ValueError('Input has undefined `axis` dimension. Input shape: ',
@@ -366,11 +414,19 @@ class BatchNormalization(Layer):
   def _assign_moving_average(self, variable, value, momentum):
     with ops.name_scope(None, 'AssignMovingAvg',
                         [variable, value, momentum]) as scope:
-      with ops.colocate_with(variable):
+      # TODO(apassos,srbs,skyewm): the colocation constraints here are disabled
+      # because of a bug which leads cond_v2 to skip rewriting them creating
+      # conflicts.
+      if tf2.enabled():
+        cm = contextlib.contextmanager(lambda: (yield))
+      else:
+        cm = ops.colocate_with(variable)
+      with cm:
         decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
         if decay.dtype != variable.dtype.base_dtype:
           decay = math_ops.cast(decay, variable.dtype.base_dtype)
-        update_delta = (variable - math_ops.cast(value, variable.dtype)) * decay
+        update_delta = (
+            variable - math_ops.cast(value, variable.dtype)) * decay
         return state_ops.assign_sub(variable, update_delta, name=scope)
 
   def _fused_batch_norm(self, inputs, training):
@@ -491,8 +547,10 @@ class BatchNormalization(Layer):
 
     return (r, d, new_mean, new_variance)
 
+  def _moments(self, inputs, reduction_axes, keep_dims):
+    return nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+
   def call(self, inputs, training=None):
-    original_training_value = training
     if training is None:
       training = K.learning_phase()
 
@@ -516,8 +574,6 @@ class BatchNormalization(Layer):
         # Currently never reaches here since fused_batch_norm does not support
         # virtual batching
         outputs = undo_virtual_batching(outputs)
-      if not context.executing_eagerly() and original_training_value is None:
-        outputs._uses_learning_phase = True  # pylint: disable=protected-access
       return outputs
 
     # Compute the axes along which to reduce the mean / variance
@@ -530,7 +586,7 @@ class BatchNormalization(Layer):
     # Broadcasting only necessary for single-axis batch norm where the axis is
     # not the last dimension
     broadcast_shape = [1] * ndims
-    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
+    broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value
     def _broadcast(v):
       if (v is not None and
           len(v.get_shape()) != ndims and
@@ -565,7 +621,8 @@ class BatchNormalization(Layer):
       # Some of the computations here are not necessary when training==False
       # but not a constant. However, this makes the code simpler.
       keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
-      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+      mean, variance = self._moments(
+          inputs, reduction_axes, keep_dims=keep_dims)
 
       moving_mean = self.moving_mean
       moving_variance = self.moving_variance
@@ -634,8 +691,6 @@ class BatchNormalization(Layer):
 
     if self.virtual_batch_size is not None:
       outputs = undo_virtual_batching(outputs)
-    if not context.executing_eagerly() and original_training_value is None:
-      outputs._uses_learning_phase = True  # pylint: disable=protected-access
     return outputs
 
   def compute_output_shape(self, input_shape):
@@ -673,5 +728,36 @@ class BatchNormalization(Layer):
                       'layer cannot be serialized and has been omitted from '
                       'the layer config. It will not be included when '
                       're-creating the layer from the saved config.')
-    base_config = super(BatchNormalization, self).get_config()
+    base_config = super(BatchNormalizationV2, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+
+def _replace_in_v2_docstring(old, new):
+  string = BatchNormalizationV2.__doc__
+  if old not in string:
+    raise ValueError('Could not find following string in BatchNormalizationV2 '
+                     'docstring: "{}"'.format(old))
+  return string.replace(old, new)
+
+
+@tf_export(v1=['keras.layers.BatchNormalization'])  # pylint: disable=missing-docstring
+class BatchNormalizationV1(BatchNormalizationV2):
+
+  __doc__ = _replace_in_v2_docstring(
+      '''
+    fused: if `True`, use a faster, fused implementation, or raise a ValueError
+      if the fused implementation cannot be used. If `None`, use the faster
+      implementation if possible. If False, do not used the fused
+      implementation.''',
+
+      '''
+    fused: if `None` or `True`, use a faster, fused implementation if possible.
+      If `False`, use the system recommended implementation.''')
+
+  _USE_V2_BEHAVIOR = False
+
+
+if tf2.enabled():
+  BatchNormalization = BatchNormalizationV2
+else:
+  BatchNormalization = BatchNormalizationV1
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index 2844b84799f906b85a1edb70a661e097f7cd01d9..c1acc2eb3a3a463f4f71d5a010a3388029cb82f4 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -21,106 +21,112 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import normalization
 from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
+@tf_test_util.run_v1_only('b/120545219')
 class NormalizationLayersTest(test.TestCase):
 
   def test_basic_batchnorm(self):
-    with self.cached_session():
-      testing_utils.layer_test(
-          keras.layers.BatchNormalization,
-          kwargs={
-              'momentum': 0.9,
-              'epsilon': 0.1,
-              'gamma_regularizer': keras.regularizers.l2(0.01),
-              'beta_regularizer': keras.regularizers.l2(0.01)
-          },
-          input_shape=(3, 4, 2))
-      testing_utils.layer_test(
-          keras.layers.BatchNormalization,
-          kwargs={
-              'gamma_initializer': 'ones',
-              'beta_initializer': 'ones',
-              'moving_mean_initializer': 'zeros',
-              'moving_variance_initializer': 'ones'
-          },
-          input_shape=(3, 4, 2))
-      testing_utils.layer_test(
-          keras.layers.BatchNormalization,
-          kwargs={'scale': False,
-                  'center': False},
-          input_shape=(3, 3))
+    testing_utils.layer_test(
+        keras.layers.BatchNormalization,
+        kwargs={
+            'momentum': 0.9,
+            'epsilon': 0.1,
+            'gamma_regularizer': keras.regularizers.l2(0.01),
+            'beta_regularizer': keras.regularizers.l2(0.01)
+        },
+        input_shape=(3, 4, 2))
+    testing_utils.layer_test(
+        keras.layers.BatchNormalization,
+        kwargs={
+            'gamma_initializer': 'ones',
+            'beta_initializer': 'ones',
+            'moving_mean_initializer': 'zeros',
+            'moving_variance_initializer': 'ones'
+        },
+        input_shape=(3, 4, 2))
+    testing_utils.layer_test(
+        keras.layers.BatchNormalization,
+        kwargs={'scale': False,
+                'center': False},
+        input_shape=(3, 3))
+    testing_utils.layer_test(
+        normalization.BatchNormalizationV2,
+        kwargs={'fused': True},
+        input_shape=(3, 3, 3, 3))
+    testing_utils.layer_test(
+        normalization.BatchNormalizationV2,
+        kwargs={'fused': None},
+        input_shape=(3, 3, 3))
 
   def test_batchnorm_weights(self):
-    with self.cached_session():
-      layer = keras.layers.BatchNormalization(scale=False, center=False)
-      layer.build((None, 3, 4))
-      self.assertEqual(len(layer.trainable_weights), 0)
-      self.assertEqual(len(layer.weights), 2)
+    layer = keras.layers.BatchNormalization(scale=False, center=False)
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.trainable_weights), 0)
+    self.assertEqual(len(layer.weights), 2)
 
-      layer = keras.layers.BatchNormalization()
-      layer.build((None, 3, 4))
-      self.assertEqual(len(layer.trainable_weights), 2)
-      self.assertEqual(len(layer.weights), 4)
+    layer = keras.layers.BatchNormalization()
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.trainable_weights), 2)
+    self.assertEqual(len(layer.weights), 4)
 
   def test_batchnorm_regularization(self):
-    with self.cached_session():
-      layer = keras.layers.BatchNormalization(
-          gamma_regularizer='l1', beta_regularizer='l1')
-      layer.build((None, 3, 4))
-      self.assertEqual(len(layer.losses), 2)
-      max_norm = keras.constraints.max_norm
-      layer = keras.layers.BatchNormalization(
-          gamma_constraint=max_norm, beta_constraint=max_norm)
-      layer.build((None, 3, 4))
-      self.assertEqual(layer.gamma.constraint, max_norm)
-      self.assertEqual(layer.beta.constraint, max_norm)
+    layer = keras.layers.BatchNormalization(
+        gamma_regularizer='l1', beta_regularizer='l1')
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.losses), 2)
+    max_norm = keras.constraints.max_norm
+    layer = keras.layers.BatchNormalization(
+        gamma_constraint=max_norm, beta_constraint=max_norm)
+    layer.build((None, 3, 4))
+    self.assertEqual(layer.gamma.constraint, max_norm)
+    self.assertEqual(layer.beta.constraint, max_norm)
+
+  def _test_batchnorm_correctness(self, dtype, use_v2=True, fused=False):
+    model = keras.models.Sequential()
+    layer_ctor = (normalization.BatchNormalizationV2 if use_v2
+                  else normalization.BatchNormalizationV1)
+    norm = layer_ctor(input_shape=(2, 2, 2), momentum=0.8, fused=fused)
+    model.add(norm)
+    model.compile(loss='mse',
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    # centered on 5.0, variance 10.0
+    x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2))
+         .astype(dtype))
+    model.fit(x, x, epochs=4, verbose=0)
+    out = model.predict(x)
+    out -= keras.backend.eval(norm.beta)
+    out /= keras.backend.eval(norm.gamma)
+
+    np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+    np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
   def test_batchnorm_correctness(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
-      model.add(norm)
-      model.compile(loss='mse', optimizer='sgd')
-
-      # centered on 5.0, variance 10.0
-      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
-      model.fit(x, x, epochs=4, verbose=0)
-      out = model.predict(x)
-      out -= keras.backend.eval(norm.beta)
-      out /= keras.backend.eval(norm.gamma)
-
-      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
-      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+    self._test_batchnorm_correctness(np.float32)
+    self._test_batchnorm_correctness(np.float32, fused=True)
+    self._test_batchnorm_correctness(np.float32, use_v2=False)
 
   def test_batchnorm_mixed_precision(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
-      model.add(norm)
-      model.compile(loss='mse', optimizer='sgd')
-
-      # centered on 5.0, variance 10.0
-      x = np.random.normal(
-          loc=5.0, scale=10.0, size=(1000, 10)).astype(np.float16)
-      model.fit(x, x, epochs=4, verbose=0)
-      out = model.predict(x)
-      out -= keras.backend.eval(norm.beta)
-      out /= keras.backend.eval(norm.gamma)
-
-      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
-      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+    self._test_batchnorm_correctness(np.float16)
+    self._test_batchnorm_correctness(np.float16, fused=True)
+    self._test_batchnorm_correctness(np.float16, use_v2=False)
 
   def test_batchnorm_convnet(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         model = keras.models.Sequential()
         norm = keras.layers.BatchNormalization(
             axis=1, input_shape=(3, 4, 4), momentum=0.8)
         model.add(norm)
-        model.compile(loss='mse', optimizer='sgd')
+        model.compile(loss='mse',
+                      optimizer=gradient_descent.GradientDescentOptimizer(0.01))
 
         # centered on 5.0, variance 10.0
         x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
@@ -133,24 +139,97 @@ class NormalizationLayersTest(test.TestCase):
         np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
   def test_batchnorm_convnet_channel_last(self):
-    with self.cached_session():
-      # keras.backend.set_learning_phase(True)
-
-      model = keras.models.Sequential()
-      norm = keras.layers.BatchNormalization(
-          axis=-1, input_shape=(4, 4, 3), momentum=0.8)
-      model.add(norm)
-      model.compile(loss='mse', optimizer='sgd')
-
-      # centered on 5.0, variance 10.0
-      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
-      model.fit(x, x, epochs=4, verbose=0)
-      out = model.predict(x)
-      out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
-      out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))
-
-      np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
-      np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
+    model = keras.models.Sequential()
+    norm = keras.layers.BatchNormalization(
+        axis=-1, input_shape=(4, 4, 3), momentum=0.8)
+    model.add(norm)
+    model.compile(loss='mse',
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    # centered on 5.0, variance 10.0
+    x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
+    model.fit(x, x, epochs=4, verbose=0)
+    out = model.predict(x)
+    out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
+    out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))
+
+    np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
+    np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
+
+  def test_v1_fused_attribute(self):
+    norm = normalization.BatchNormalizationV1()
+    inp = keras.layers.Input((4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, True)
+
+    norm = normalization.BatchNormalizationV1(fused=False)
+    self.assertEqual(norm.fused, False)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+    norm = normalization.BatchNormalizationV1(virtual_batch_size=2)
+    self.assertEqual(norm.fused, True)
+    inp = keras.layers.Input(shape=(2, 2, 2))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+  def test_v2_fused_attribute(self):
+    norm = normalization.BatchNormalizationV2()
+    self.assertEqual(norm.fused, None)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, True)
+
+    norm = normalization.BatchNormalizationV2()
+    self.assertEqual(norm.fused, None)
+    inp = keras.layers.Input(shape=(4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+    norm = normalization.BatchNormalizationV2(virtual_batch_size=2)
+    self.assertEqual(norm.fused, False)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+    norm = normalization.BatchNormalizationV2(fused=False)
+    self.assertEqual(norm.fused, False)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+    norm = normalization.BatchNormalizationV2(fused=True, axis=[3])
+    self.assertEqual(norm.fused, True)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, True)
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*renorm'):
+      normalization.BatchNormalizationV2(fused=True, renorm=True)
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*when axis is 1 or 3'):
+      normalization.BatchNormalizationV2(fused=True, axis=2)
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*when axis is 1 or 3'):
+      normalization.BatchNormalizationV2(fused=True, axis=[1, 3])
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*virtual_batch_size'):
+      normalization.BatchNormalizationV2(fused=True, virtual_batch_size=2)
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*adjustment'):
+      normalization.BatchNormalizationV2(fused=True,
+                                         adjustment=lambda _: (1, 0))
+
+    norm = normalization.BatchNormalizationV2(fused=True)
+    self.assertEqual(norm.fused, True)
+    inp = keras.layers.Input(shape=(4, 4))
+    with self.assertRaisesRegexp(ValueError, '4D input tensors'):
+      norm(inp)
+
+
+@tf_test_util.run_v1_only('b/120545219')
+class NormalizationLayersGraphModeOnlyTest(test.TestCase):
 
   def test_shared_batchnorm(self):
     """Test that a BN layer can be shared across different data streams.
@@ -167,7 +246,7 @@ class NormalizationLayersTest(test.TestCase):
       x = np.random.normal(loc=5.0, scale=10.0, size=(2, 10))
       model = keras.models.Model(x2, y2)
 
-      model.compile('sgd', 'mse')
+      model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
       model.train_on_batch(x, x)
 
       self.assertEqual(len(bn.updates), 4)
@@ -183,7 +262,7 @@ class NormalizationLayersTest(test.TestCase):
       self.assertEqual(len(new_model.updates), 2)
       self.assertEqual(len(model.updates), 4)
       self.assertEqual(len(new_model.get_updates_for(x3)), 2)
-      new_model.compile('sgd', 'mse')
+      new_model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
       new_model.train_on_batch(x, x)
 
   def test_that_trainable_disables_updates(self):
@@ -199,7 +278,7 @@ class NormalizationLayersTest(test.TestCase):
       model.trainable = False
       assert not model.updates
 
-      model.compile('sgd', 'mse')
+      model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
       assert not model.updates
 
       x1 = model.predict(val_a)
@@ -208,7 +287,7 @@ class NormalizationLayersTest(test.TestCase):
       self.assertAllClose(x1, x2, atol=1e-7)
 
       model.trainable = True
-      model.compile('sgd', 'mse')
+      model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
       assert model.updates
 
       model.train_on_batch(val_a, val_out)
@@ -216,7 +295,7 @@ class NormalizationLayersTest(test.TestCase):
       assert np.abs(np.sum(x1 - x2)) > 1e-5
 
       layer.trainable = False
-      model.compile('sgd', 'mse')
+      model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
       assert not model.updates
 
       x1 = model.predict(val_a)
diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py
index 912e8bd619db8b35a54853c0752382479567fd04..a0744cddad682fdcae18f571413b668d7767cb2f 100644
--- a/tensorflow/python/keras/layers/pooling.py
+++ b/tensorflow/python/keras/layers/pooling.py
@@ -18,12 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
@@ -41,16 +44,18 @@ class Pooling1D(Layer):
       strides of the pooling operation.
     padding: A string. The padding method, either 'valid' or 'same'.
       Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
+    data_format: A string,
+      one of `channels_last` (default) or `channels_first`.
       The ordering of the dimensions in the inputs.
       `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
+      `(batch, steps, features)` while `channels_first`
+      corresponds to inputs with shape
+      `(batch, features, steps)`.
     name: A string, the name of the layer.
   """
 
   def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format=None,
+               padding='valid', data_format='channels_last',
                name=None, **kwargs):
     super(Pooling1D, self).__init__(name=name, **kwargs)
     if data_format is None:
@@ -65,45 +70,39 @@ class Pooling1D(Layer):
     self.input_spec = InputSpec(ndim=3)
 
   def call(self, inputs):
-    # There is no TF op for 1D pooling, hence we make the inputs 4D.
-    if self.data_format == 'channels_last':
-      # input is NWC, make it NHWC
-      inputs = array_ops.expand_dims(inputs, 1)
-      # pool on the W dim
-      pool_shape = (1, 1) + self.pool_size + (1,)
-      strides = (1, 1) + self.strides + (1,)
-      data_format = 'NHWC'
-    else:
-      # input is NCW, make it NCHW
-      inputs = array_ops.expand_dims(inputs, 2)
-      # pool on the W dim
-      pool_shape = (1, 1, 1) + self.pool_size
-      strides = (1, 1, 1) + self.strides
-      data_format = 'NCHW'
-
+    pad_axis = 2 if self.data_format == 'channels_last' else 3
+    inputs = array_ops.expand_dims(inputs, pad_axis)
     outputs = self.pool_function(
         inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=data_format)
-
-    if self.data_format == 'channels_last':
-      return array_ops.squeeze(outputs, 1)
-    else:
-      return array_ops.squeeze(outputs, 2)
+        self.pool_size + (1,),
+        strides=self.strides + (1,),
+        padding=self.padding,
+        data_format=self.data_format)
+    return array_ops.squeeze(outputs, pad_axis)
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    length = conv_utils.conv_output_length(input_shape[1], self.pool_size[0],
-                                           self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+    if self.data_format == 'channels_first':
+      steps = input_shape[2]
+      features = input_shape[1]
+    else:
+      steps = input_shape[1]
+      features = input_shape[2]
+    length = conv_utils.conv_output_length(steps,
+                                           self.pool_size[0],
+                                           self.padding,
+                                           self.strides[0])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape([input_shape[0], features, length])
+    else:
+      return tensor_shape.TensorShape([input_shape[0], length, features])
 
   def get_config(self):
     config = {
         'strides': self.strides,
         'pool_size': self.pool_size,
-        'padding': self.padding
+        'padding': self.padding,
+        'data_format': self.data_format,
     }
     base_config = super(Pooling1D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -119,19 +118,36 @@ class MaxPooling1D(Pooling1D):
           E.g. 2 will halve the input.
           If None, it will default to `pool_size`.
       padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, steps, features)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, features, steps)`.
 
   Input shape:
-      3D tensor with shape: `(batch_size, steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, steps)`
 
   Output shape:
-      3D tensor with shape: `(batch_size, downsampled_steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, downsampled_steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, downsampled_steps)`
   """
 
   def __init__(self, pool_size=2, strides=None,
-               padding='valid', data_format=None, **kwargs):
+               padding='valid', data_format='channels_last', **kwargs):
 
     super(MaxPooling1D, self).__init__(
-        nn.max_pool,
+        functools.partial(backend.pool2d, pool_mode='max'),
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -149,18 +165,35 @@ class AveragePooling1D(Pooling1D):
           E.g. 2 will halve the input.
           If None, it will default to `pool_size`.
       padding: One of `"valid"` or `"same"` (case-insensitive).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, steps, features)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, features, steps)`.
 
   Input shape:
-      3D tensor with shape: `(batch_size, steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, steps)`
 
   Output shape:
-      3D tensor with shape: `(batch_size, downsampled_steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, downsampled_steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, downsampled_steps)`
   """
 
   def __init__(self, pool_size=2, strides=None,
-               padding='valid', data_format=None, **kwargs):
+               padding='valid', data_format='channels_last', **kwargs):
     super(AveragePooling1D, self).__init__(
-        nn.avg_pool,
+        functools.partial(backend.pool2d, pool_mode='avg'),
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -561,41 +594,96 @@ class GlobalPooling1D(Layer):
   """Abstract class for different global pooling 1D layers.
   """
 
-  def __init__(self, **kwargs):
+  def __init__(self, data_format='channels_last', **kwargs):
     super(GlobalPooling1D, self).__init__(**kwargs)
     self.input_spec = InputSpec(ndim=3)
+    self.data_format = conv_utils.normalize_data_format(data_format)
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    return tensor_shape.TensorShape([input_shape[0], input_shape[2]])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape([input_shape[0], input_shape[1]])
+    else:
+      return tensor_shape.TensorShape([input_shape[0], input_shape[2]])
 
   def call(self, inputs):
     raise NotImplementedError
 
+  def get_config(self):
+    config = {'data_format': self.data_format}
+    base_config = super(GlobalPooling1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
 @tf_export('keras.layers.GlobalAveragePooling1D',
            'keras.layers.GlobalAvgPool1D')
 class GlobalAveragePooling1D(GlobalPooling1D):
   """Global average pooling operation for temporal data.
 
+  Arguments:
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, steps, features)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, features, steps)`.
+
   Input shape:
-      3D tensor with shape: `(batch_size, steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, steps)`
 
   Output shape:
       2D tensor with shape:
       `(batch_size, features)`
   """
 
-  def call(self, inputs):
-    return backend.mean(inputs, axis=1)
+  def __init__(self, data_format='channels_last', **kwargs):
+    super(GlobalAveragePooling1D, self).__init__(data_format=data_format,
+                                                 **kwargs)
+    self.supports_masking = True
+
+  def call(self, inputs, mask=None):
+    steps_axis = 1 if self.data_format == 'channels_last' else 2
+    if mask is not None:
+      mask = math_ops.cast(mask, backend.floatx())
+      input_shape = inputs.shape.as_list()
+      broadcast_shape = [-1, input_shape[steps_axis], 1]
+      mask = array_ops.reshape(mask, broadcast_shape)
+      inputs *= mask
+      return backend.sum(inputs, axis=steps_axis) / math_ops.reduce_sum(
+          mask, axis=steps_axis)
+    else:
+      return backend.mean(inputs, axis=steps_axis)
+
+  def compute_mask(self, inputs, mask=None):
+    return None
 
 
 @tf_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
 class GlobalMaxPooling1D(GlobalPooling1D):
   """Global max pooling operation for temporal data.
 
+  Arguments:
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, steps, features)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, features, steps)`.
+
   Input shape:
-      3D tensor with shape: `(batch_size, steps, features)`.
+      - If `data_format='channels_last'`:
+          3D tensor with shape:
+          `(batch_size, steps, features)`
+      - If `data_format='channels_first'`:
+          3D tensor with shape:
+          `(batch_size, features, steps)`
 
   Output shape:
       2D tensor with shape:
@@ -603,7 +691,8 @@ class GlobalMaxPooling1D(GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return backend.max(inputs, axis=1)
+    steps_axis = 1 if self.data_format == 'channels_last' else 2
+    return backend.max(inputs, axis=steps_axis)
 
 
 class GlobalPooling2D(Layer):
diff --git a/tensorflow/python/keras/layers/pooling_test.py b/tensorflow/python/keras/layers/pooling_test.py
index 2cd9939e66ff869dac5058d2dd00d8d495e40f55..936e73ecf9dab86cb12a9e45499bf0e7599a0dc4 100644
--- a/tensorflow/python/keras/layers/pooling_test.py
+++ b/tensorflow/python/keras/layers/pooling_test.py
@@ -18,11 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training import rmsprop
 
 
 class GlobalPoolingTest(test.TestCase):
@@ -31,8 +34,26 @@ class GlobalPoolingTest(test.TestCase):
   def test_globalpooling_1d(self):
     testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
                              input_shape=(3, 4, 5))
+    testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
+                             kwargs={'data_format': 'channels_first'},
+                             input_shape=(3, 4, 5))
     testing_utils.layer_test(
         keras.layers.pooling.GlobalAveragePooling1D, input_shape=(3, 4, 5))
+    testing_utils.layer_test(keras.layers.pooling.GlobalAveragePooling1D,
+                             kwargs={'data_format': 'channels_first'},
+                             input_shape=(3, 4, 5))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_globalpooling_1d_masking_support(self):
+    model = keras.Sequential()
+    model.add(keras.layers.Masking(mask_value=0., input_shape=(3, 4)))
+    model.add(keras.layers.GlobalAveragePooling1D())
+    model.compile(loss='mae', optimizer=rmsprop.RMSPropOptimizer(0.001))
+
+    model_input = np.random.random((2, 3, 4))
+    model_input[0, 1:, :] = 0
+    output = model.predict(model_input)
+    self.assertAllClose(output[0], model_input[0, 0, :])
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_globalpooling_2d(self):
@@ -172,6 +193,10 @@ class Pooling1DTest(test.TestCase):
             kwargs={'strides': stride,
                     'padding': padding},
             input_shape=(3, 5, 4))
+    testing_utils.layer_test(
+        keras.layers.MaxPooling1D,
+        kwargs={'data_format': 'channels_first'},
+        input_shape=(3, 2, 6))
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_averagepooling_1d(self):
@@ -183,6 +208,11 @@ class Pooling1DTest(test.TestCase):
                     'padding': padding},
             input_shape=(3, 5, 4))
 
+    testing_utils.layer_test(
+        keras.layers.AveragePooling1D,
+        kwargs={'data_format': 'channels_first'},
+        input_shape=(3, 2, 6))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index b07ec71178d74e7109aa48ab9078d106fd155941..86a69e45d900bfd037a9d39076c22d9bd2d11c43 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -19,20 +19,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import uuid
+
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -87,18 +94,8 @@ class StackedRNNCells(Layer):
 
   @property
   def state_size(self):
-    # States are a flat list of the individual cell state size.
-    # e.g. states of a 2-layer LSTM would be `[h1, c1, h2, c2]`.
-    # (assuming one LSTM has states [h, c])
-    # In the case of reverse_state_order=True, the state_size will be
-    # [h2, c2, h1, c1].
-    state_size = []
-    for cell in self.cells[::-1] if self.reverse_state_order else self.cells:
-      if _is_multiple_state(cell.state_size):
-        state_size += list(cell.state_size)
-      else:
-        state_size.append(cell.state_size)
-    return tuple(state_size)
+    return tuple(c.state_size for c in
+                 (self.cells[::-1] if self.reverse_state_order else self.cells))
 
   @property
   def output_size(self):
@@ -110,8 +107,6 @@ class StackedRNNCells(Layer):
       return self.cells[-1].state_size
 
   def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
-    # The init state is flattened into a list because state_size is a flattened
-    # list.
     initial_states = []
     for cell in self.cells[::-1] if self.reverse_state_order else self.cells:
       get_initial_state_fn = getattr(cell, 'get_initial_state', None)
@@ -122,39 +117,27 @@ class StackedRNNCells(Layer):
         initial_states.append(_generate_zero_filled_state_for_cell(
             cell, inputs, batch_size, dtype))
 
-    return nest.flatten(initial_states)
+    return tuple(initial_states)
 
   def call(self, inputs, states, constants=None, **kwargs):
     # Recover per-cell states.
-    nested_states = []
-    for cell in self.cells[::-1] if self.reverse_state_order else self.cells:
-      if _is_multiple_state(cell.state_size):
-        nested_states.append(states[:len(cell.state_size)])
-        states = states[len(cell.state_size):]
-      else:
-        nested_states.append([states[0]])
-        states = states[1:]
-    if self.reverse_state_order:
-      nested_states = nested_states[::-1]
+    state_size = (self.state_size[::-1]
+                  if self.reverse_state_order else self.state_size)
+    nested_states = nest.pack_sequence_as(state_size, nest.flatten(states))
 
     # Call the cells in order and store the returned states.
     new_nested_states = []
     for cell, states in zip(self.cells, nested_states):
+      states = states if nest.is_sequence(states) else [states]
       if generic_utils.has_arg(cell.call, 'constants'):
         inputs, states = cell.call(inputs, states, constants=constants,
                                    **kwargs)
       else:
         inputs, states = cell.call(inputs, states, **kwargs)
-
       new_nested_states.append(states)
 
-    # Format the new states as a flat list
-    new_states = []
-    if self.reverse_state_order:
-      new_nested_states = new_nested_states[::-1]
-    for cell_states in new_nested_states:
-      new_states += cell_states
-    return inputs, new_states
+    return inputs, nest.pack_sequence_as(state_size,
+                                         nest.flatten(new_nested_states))
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -470,6 +453,9 @@ class RNN(Layer):
                        'an attribute `state_size` '
                        '(tuple of integers, '
                        'one integer per RNN state).')
+    # If True, the output for masked timestep will be zeros, whereas in the
+    # False case, output from previous timestep is returned for masked timestep.
+    self.zero_output_for_mask = kwargs.pop('zero_output_for_mask', False)
     super(RNN, self).__init__(**kwargs)
     self.cell = cell
     if isinstance(cell, checkpointable.CheckpointableBase):
@@ -482,11 +468,14 @@ class RNN(Layer):
     self.time_major = time_major
 
     self.supports_masking = True
-    self.input_spec = [None]  # The input shape is unknown yet, at least rank 3.
+    # The input shape is unknown yet, it could have nested tensor inputs, and
+    # the input spec will be the list of specs for flattened inputs.
+    self.input_spec = None
     self.state_spec = None
     self._states = None
     self.constants_spec = None
     self._num_constants = None
+    self._num_inputs = None
 
   @property
   def states(self):
@@ -499,40 +488,54 @@ class RNN(Layer):
   def states(self, states):
     self._states = states
 
-  @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
+    # Check whether the input shape contains any nested shapes. It could be
+    # (tensor_shape(1, 2), tensor_shape(3, 4)) or (1, 2, 3) which is from numpy
+    # inputs.
+    try:
+      input_shape = tensor_shape.as_shape(input_shape)
+    except (ValueError, TypeError):
+      # A nested tensor input
+      input_shape = nest.flatten(input_shape)[0]
+
+    batch = input_shape[0]
+    time_step = input_shape[1]
+    if self.time_major:
+      batch, time_step = time_step, batch
 
     if _is_multiple_state(self.cell.state_size):
       state_size = self.cell.state_size
     else:
       state_size = [self.cell.state_size]
 
+    def _get_output_shape(flat_output_size):
+      output_dim = tensor_shape.as_shape(flat_output_size).as_list()
+      if self.return_sequences:
+        if self.time_major:
+          output_shape = tensor_shape.as_shape([time_step, batch] + output_dim)
+        else:
+          output_shape = tensor_shape.as_shape([batch, time_step] + output_dim)
+      else:
+        output_shape = tensor_shape.as_shape([batch] + output_dim)
+      return output_shape
+
     if getattr(self.cell, 'output_size', None) is not None:
-      output_dim = tensor_shape.as_shape(self.cell.output_size).as_list()
+      # cell.output_size could be nested structure.
+      output_shape = nest.flatten(nest.map_structure(
+          _get_output_shape, self.cell.output_size))
+      output_shape = output_shape[0] if len(output_shape) == 1 else output_shape
     else:
       # Note that state_size[0] could be a tensor_shape or int.
-      output_dim = tensor_shape.as_shape(state_size[0]).as_list()
-
-    batch = input_shape[0]
-    time_step = input_shape[1]
-    if self.time_major:
-      batch, time_step = time_step, batch
-    if self.return_sequences:
-      if self.time_major:
-        output_shape = tuple([time_step, batch] + output_dim)
-      else:
-        output_shape = tuple([batch, time_step] + output_dim)
-    else:
-      output_shape = tuple([batch] + output_dim)
+      output_shape = _get_output_shape(state_size[0])
 
     if self.return_state:
-      state_shape = [
-          tuple([batch] + tensor_shape.as_shape(dim).as_list())
-          for dim in state_size
-      ]
-      return [output_shape] + state_shape
+      def _get_state_shape(flat_state):
+        state_shape = [batch] + tensor_shape.as_shape(flat_state).as_list()
+        return tensor_shape.as_shape(state_shape)
+      state_shape = nest.map_structure(_get_state_shape, state_size)
+      return generic_utils.to_list(output_shape) + nest.flatten(state_shape)
     else:
       return output_shape
 
@@ -546,28 +549,66 @@ class RNN(Layer):
     else:
       return output_mask
 
-  @tf_utils.shape_type_conversion
   def build(self, input_shape):
     # Note input_shape will be list of shapes of initial states and
     # constants if these are passed in __call__.
     if self._num_constants is not None:
       constants_shape = input_shape[-self._num_constants:]  # pylint: disable=invalid-unary-operand-type
+      constants_shape = nest.map_structure(
+          lambda s: tuple(tensor_shape.TensorShape(s).as_list()),
+          constants_shape)
     else:
       constants_shape = None
 
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
+      # The input_shape here could be a nest structure.
 
-    input_spec_shape = list(input_shape)
-    batch_index, time_step_index = (1, 0) if self.time_major else (0, 1)
-    if not self.stateful:
-      input_spec_shape[batch_index] = None
-    input_spec_shape[time_step_index] = None
-    self.input_spec[0] = InputSpec(shape=tuple(input_spec_shape))
+    # do the tensor_shape to shapes here. The input could be single tensor, or a
+    # nested structure of tensors.
+    def get_input_spec(shape):
+      if isinstance(shape, tensor_shape.TensorShape):
+        input_spec_shape = shape.as_list()
+      else:
+        input_spec_shape = list(shape)
+      batch_index, time_step_index = (1, 0) if self.time_major else (0, 1)
+      if not self.stateful:
+        input_spec_shape[batch_index] = None
+      input_spec_shape[time_step_index] = None
+      return InputSpec(shape=tuple(input_spec_shape))
+
+    def get_step_input_shape(shape):
+      if isinstance(shape, tensor_shape.TensorShape):
+        shape = tuple(shape.as_list())
+      # remove the timestep from the input_shape
+      return shape[1:] if self.time_major else (shape[0],) + shape[2:]
+
+    # Check whether the input shape contains any nested shapes. It could be
+    # (tensor_shape(1, 2), tensor_shape(3, 4)) or (1, 2, 3) which is from numpy
+    # inputs.
+    try:
+      input_shape = tensor_shape.as_shape(input_shape)
+    except (ValueError, TypeError):
+      # A nested tensor input
+      pass
+
+    if not nest.is_sequence(input_shape):
+      # This indicates the there is only one input.
+      if self.input_spec is not None:
+        self.input_spec[0] = get_input_spec(input_shape)
+      else:
+        self.input_spec = [get_input_spec(input_shape)]
+      step_input_shape = get_step_input_shape(input_shape)
+    else:
+      flat_input_shapes = nest.flatten(input_shape)
+      flat_input_shapes = nest.map_structure(get_input_spec, flat_input_shapes)
+      assert len(flat_input_shapes) == self._num_inputs
+      if self.input_spec is not None:
+        self.input_spec[:self._num_inputs] = flat_input_shapes
+      else:
+        self.input_spec = flat_input_shapes
+      step_input_shape = nest.map_structure(get_step_input_shape, input_shape)
 
-    batch = input_shape[batch_index]
-    input_dim = input_shape[2:]
-    step_input_shape = (batch,) + input_dim
     # allow cell (if layer) to build before we set or validate state_spec
     if isinstance(self.cell, Layer):
       if constants_shape is not None:
@@ -623,6 +664,11 @@ class RNN(Layer):
   def get_initial_state(self, inputs):
     get_initial_state_fn = getattr(self.cell, 'get_initial_state', None)
 
+    if nest.is_sequence(inputs):
+      # The input are nested sequences. Use the first element in the seq to get
+      # batch size and dtype.
+      inputs = nest.flatten(inputs)[0]
+
     input_shape = array_ops.shape(inputs)
     batch_size = input_shape[1] if self.time_major else input_shape[0]
     dtype = inputs.dtype
@@ -642,7 +688,13 @@ class RNN(Layer):
     inputs, initial_state, constants = _standardize_args(inputs,
                                                          initial_state,
                                                          constants,
-                                                         self._num_constants)
+                                                         self._num_constants,
+                                                         self._num_inputs)
+    # in case the real inputs is a nested structure, set the size of flatten
+    # input so that we can distinguish between real inputs, initial_state and
+    # constants.
+    self._num_inputs = len(nest.flatten(inputs))
+
     if initial_state is None and constants is None:
       return super(RNN, self).__call__(inputs, **kwargs)
 
@@ -653,14 +705,12 @@ class RNN(Layer):
     additional_inputs = []
     additional_specs = []
     if initial_state is not None:
-      kwargs['initial_state'] = initial_state
       additional_inputs += initial_state
       self.state_spec = [
           InputSpec(shape=K.int_shape(state)) for state in initial_state
       ]
       additional_specs += self.state_spec
     if constants is not None:
-      kwargs['constants'] = constants
       additional_inputs += constants
       self.constants_spec = [
           InputSpec(shape=K.int_shape(constant)) for constant in constants
@@ -680,7 +730,10 @@ class RNN(Layer):
     if is_keras_tensor:
       # Compute the full input spec, including state and constants
       full_input = [inputs] + additional_inputs
-      full_input_spec = self.input_spec + additional_specs
+      # The original input_spec is None since there could be a nested tensor
+      # input. Update the input_spec to match the inputs.
+      full_input_spec = [None for _ in range(len(nest.flatten(inputs)))
+                        ] + additional_specs
       # Perform the call with temporarily replaced input_spec
       original_input_spec = self.input_spec
       self.input_spec = full_input_spec
@@ -688,6 +741,10 @@ class RNN(Layer):
       self.input_spec = original_input_spec
       return output
     else:
+      if initial_state is not None:
+        kwargs['initial_state'] = initial_state
+      if constants is not None:
+        kwargs['constants'] = constants
       return super(RNN, self).__call__(inputs, **kwargs)
 
   def call(self,
@@ -696,34 +753,17 @@ class RNN(Layer):
            training=None,
            initial_state=None,
            constants=None):
-    # input shape: `(samples, time (padded with zeros), input_dim)`
-    # note that the .build() method of subclasses MUST define
-    # self.input_spec and self.state_spec with complete input shapes.
-    if isinstance(inputs, list):
-      # get initial_state from full input spec
-      # as they could be copied to multiple GPU.
-      if self._num_constants is None:
-        initial_state = inputs[1:]
-      else:
-        initial_state = inputs[1:-self._num_constants]
-      if len(initial_state) == 0:
-        initial_state = None
-      inputs = inputs[0]
-    if initial_state is not None:
-      pass
-    elif self.stateful:
-      initial_state = self.states
-    else:
-      initial_state = self.get_initial_state(inputs)
+    inputs, initial_state, constants = self._process_inputs(
+        inputs, initial_state, constants)
 
     if isinstance(mask, list):
       mask = mask[0]
 
-    if len(initial_state) != len(self.states):
-      raise ValueError(
-          'Layer has ' + str(len(self.states)) + ' states but was passed ' +
-          str(len(initial_state)) + ' initial states.')
-    input_shape = K.int_shape(inputs)
+    if nest.is_sequence(inputs):
+      # In the case of nested input, use the first element for shape check.
+      input_shape = K.int_shape(nest.flatten(inputs)[0])
+    else:
+      input_shape = K.int_shape(inputs)
     timesteps = input_shape[0] if self.time_major else input_shape[1]
     if self.unroll and timesteps in [None, 1]:
       raise ValueError('Cannot unroll a RNN if the '
@@ -776,7 +816,8 @@ class RNN(Layer):
         mask=mask,
         unroll=self.unroll,
         input_length=timesteps,
-        time_major=self.time_major)
+        time_major=self.time_major,
+        zero_output_for_mask=self.zero_output_for_mask)
     if self.stateful:
       updates = []
       for i in range(len(states)):
@@ -788,21 +829,43 @@ class RNN(Layer):
     else:
       output = last_output
 
-    # Properly set learning phase
-    if getattr(last_output, '_uses_learning_phase', False):
-      output._uses_learning_phase = True
-      for state in states:
-        state._uses_learning_phase = True
-
     if self.return_state:
       if not isinstance(states, (list, tuple)):
         states = [states]
       else:
         states = list(states)
-      return [output] + states
+      return generic_utils.to_list(output) + states
     else:
       return output
 
+  def _process_inputs(self, inputs, initial_state, constants):
+    # input shape: `(samples, time (padded with zeros), input_dim)`
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if isinstance(inputs, list):
+      # get initial_state from full input spec
+      # as they could be copied to multiple GPU.
+      if self._num_constants is None:
+        initial_state = inputs[1:]
+      else:
+        initial_state = inputs[1:-self._num_constants]
+        constants = inputs[-self._num_constants:]
+      if len(initial_state) == 0:
+        initial_state = None
+      inputs = inputs[0]
+    if initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' + str(len(initial_state)) +
+                       ' initial states.')
+    return inputs, initial_state, constants
+
   def reset_states(self, states=None):
     if not self.stateful:
       raise AttributeError('Layer must be stateful.')
@@ -876,6 +939,8 @@ class RNN(Layer):
     }
     if self._num_constants is not None:
       config['num_constants'] = self._num_constants
+    if self.zero_output_for_mask:
+      config['zero_output_for_mask'] = self.zero_output_for_mask
 
     cell_config = self.cell.get_config()
     config['cell'] = {
@@ -1055,12 +1120,6 @@ class SimpleRNNCell(Layer):
     if self.activation is not None:
       output = self.activation(output)
 
-    # Properly set learning phase on output tensor.
-    if 0 < self.dropout + self.recurrent_dropout:
-      if training is None and not context.executing_eagerly():
-        # This would be harmless to set in eager mode, but eager tensors
-        # disallow setting arbitrary attributes.
-        output._uses_learning_phase = True
     return output, [output]
 
   def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
@@ -1205,6 +1264,7 @@ class SimpleRNN(RNN):
         unroll=unroll,
         **kwargs)
     self.activity_regularizer = regularizers.get(activity_regularizer)
+    self.input_spec = [InputSpec(ndim=3)]
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
     self.cell._dropout_mask = None
@@ -1562,12 +1622,6 @@ class GRUCell(Layer):
       hh = self.activation(x_h + recurrent_h)
     # previous and candidate state mixed by update gate
     h = z * h_tm1 + (1 - z) * hh
-    if 0 < self.dropout + self.recurrent_dropout:
-      if training is None and not context.executing_eagerly():
-        # This would be harmless to set in eager mode, but eager tensors
-        # disallow setting arbitrary attributes.
-        h._uses_learning_phase = True
-
     return h, [h]
 
   def get_config(self):
@@ -1734,6 +1788,7 @@ class GRU(RNN):
         unroll=unroll,
         **kwargs)
     self.activity_regularizer = regularizers.get(activity_regularizer)
+    self.input_spec = [InputSpec(ndim=3)]
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
     self.cell._dropout_mask = None
@@ -1873,7 +1928,7 @@ class LSTMCell(Layer):
           for the recurrent step.
           Default: hard sigmoid (`hard_sigmoid`).
           If you pass `None`, no activation is applied
-          (ie. "linear" activation: `a(x) = x`).x
+          (ie. "linear" activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
           used for the linear transformation of the inputs.
@@ -1951,7 +2006,7 @@ class LSTMCell(Layer):
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.implementation = implementation
-    self.state_size = (self.units, self.units)
+    self.state_size = [self.units, self.units]
     self.output_size = self.units
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
@@ -1993,6 +2048,29 @@ class LSTMCell(Layer):
       self.bias = None
     self.built = True
 
+  def _compute_carry_and_output(self, x, h_tm1, c_tm1):
+    """Computes carry and output using split kernels."""
+    x_i, x_f, x_c, x_o = x
+    h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o = h_tm1
+    i = self.recurrent_activation(
+        x_i + K.dot(h_tm1_i, self.recurrent_kernel[:, :self.units]))
+    f = self.recurrent_activation(x_f + K.dot(
+        h_tm1_f, self.recurrent_kernel[:, self.units:self.units * 2]))
+    c = f * c_tm1 + i * self.activation(x_c + K.dot(
+        h_tm1_c, self.recurrent_kernel[:, self.units * 2:self.units * 3]))
+    o = self.recurrent_activation(
+        x_o + K.dot(h_tm1_o, self.recurrent_kernel[:, self.units * 3:]))
+    return c, o
+
+  def _compute_carry_and_output_fused(self, z, c_tm1):
+    """Computes carry and output using fused kernels."""
+    z0, z1, z2, z3 = z
+    i = self.recurrent_activation(z0)
+    f = self.recurrent_activation(z1)
+    c = f * c_tm1 + i * self.activation(z2)
+    o = self.recurrent_activation(z3)
+    return c, o
+
   def call(self, inputs, states, training=None):
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
@@ -2047,16 +2125,9 @@ class LSTMCell(Layer):
         h_tm1_f = h_tm1
         h_tm1_c = h_tm1
         h_tm1_o = h_tm1
-      i = self.recurrent_activation(
-          x_i + K.dot(h_tm1_i, self.recurrent_kernel[:, :self.units]))
-      f = self.recurrent_activation(
-          x_f + K.dot(h_tm1_f,
-                      self.recurrent_kernel[:, self.units: self.units * 2]))
-      c = f * c_tm1 + i * self.activation(
-          x_c + K.dot(h_tm1_c,
-                      self.recurrent_kernel[:, self.units * 2: self.units * 3]))
-      o = self.recurrent_activation(
-          x_o + K.dot(h_tm1_o, self.recurrent_kernel[:, self.units * 3:]))
+      x = (x_i, x_f, x_c, x_o)
+      h_tm1 = (h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o)
+      c, o = self._compute_carry_and_output(x, h_tm1, c_tm1)
     else:
       if 0. < self.dropout < 1.:
         inputs *= dp_mask[0]
@@ -2072,17 +2143,10 @@ class LSTMCell(Layer):
       z2 = z[:, 2 * self.units:3 * self.units]
       z3 = z[:, 3 * self.units:]
 
-      i = self.recurrent_activation(z0)
-      f = self.recurrent_activation(z1)
-      c = f * c_tm1 + i * self.activation(z2)
-      o = self.recurrent_activation(z3)
+      z = (z0, z1, z2, z3)
+      c, o = self._compute_carry_and_output_fused(z, c_tm1)
 
     h = o * self.activation(c)
-    if 0 < self.dropout + self.recurrent_dropout:
-      if training is None and not context.executing_eagerly():
-        # This would be harmless to set in eager mode, but eager tensors
-        # disallow setting arbitrary attributes.
-        h._uses_learning_phase = True
     return h, [h, c]
 
   def get_config(self):
@@ -2126,7 +2190,88 @@ class LSTMCell(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
   def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
-    return _generate_zero_filled_state_for_cell(self, inputs, batch_size, dtype)
+    return list(_generate_zero_filled_state_for_cell(
+        self, inputs, batch_size, dtype))
+
+
+@tf_export('keras.experimental.PeepholeLSTMCell')
+class PeepholeLSTMCell(LSTMCell):
+  """Equivalent to LSTMCell class but adds peephole connections.
+
+  Peephole connections allow the gates to utilize the previous internal state as
+  well as the previous hidden state (which is what LSTMCell is limited to).
+  This allows PeepholeLSTMCell to better learn precise timings over LSTMCell.
+
+  From [Gers et al.](http://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf):
+
+    "We find that LSTM augmented by 'peephole connections' from its internal
+    cells to its multiplicative gates can learn the fine distinction between
+    sequences of spikes spaced either 50 or 49 time steps apart without the help
+    of any short training exemplars."
+
+  The peephole implementation is based on:
+
+    https://research.google.com/pubs/archive/43905.pdf
+
+  Hasim Sak, Andrew Senior, and Francoise Beaufays.
+  "Long short-term memory recurrent neural network architectures for
+   large scale acoustic modeling." INTERSPEECH, 2014.
+
+  Example:
+
+  ```python
+      # Create 2 PeepholeLSTMCells
+      peephole_lstm_cells = [PeepholeLSTMCell(size) for size in [128, 256]]
+      # Create a layer composed sequentially of the peephole LSTM cells.
+      layer = RNN(peephole_lstm_cells)
+      input = keras.Input((timesteps, input_dim))
+      output = layer(input)
+  ```
+  """
+
+  def build(self, input_shape):
+    super(PeepholeLSTMCell, self).build(input_shape)
+    # The following are the weight matrices for the peephole connections. These
+    # are multiplied with the previous internal state during the computation of
+    # carry and output.
+    self.input_gate_peephole_weights = self.add_weight(
+        shape=(self.units,),
+        name='input_gate_peephole_weights',
+        initializer=self.kernel_initializer)
+    self.forget_gate_peephole_weights = self.add_weight(
+        shape=(self.units,),
+        name='forget_gate_peephole_weights',
+        initializer=self.kernel_initializer)
+    self.output_gate_peephole_weights = self.add_weight(
+        shape=(self.units,),
+        name='output_gate_peephole_weights',
+        initializer=self.kernel_initializer)
+
+  def _compute_carry_and_output(self, x, h_tm1, c_tm1):
+    x_i, x_f, x_c, x_o = x
+    h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o = h_tm1
+    i = self.recurrent_activation(
+        x_i + K.dot(h_tm1_i, self.recurrent_kernel[:, :self.units]) +
+        self.input_gate_peephole_weights * c_tm1)
+    f = self.recurrent_activation(x_f + K.dot(
+        h_tm1_f, self.recurrent_kernel[:, self.units:self.units * 2]) +
+                                  self.forget_gate_peephole_weights * c_tm1)
+    c = f * c_tm1 + i * self.activation(x_c + K.dot(
+        h_tm1_c, self.recurrent_kernel[:, self.units * 2:self.units * 3]))
+    o = self.recurrent_activation(
+        x_o + K.dot(h_tm1_o, self.recurrent_kernel[:, self.units * 3:]) +
+        self.output_gate_peephole_weights * c)
+    return c, o
+
+  def _compute_carry_and_output_fused(self, z, c_tm1):
+    z0, z1, z2, z3 = z
+    i = self.recurrent_activation(z0 +
+                                  self.input_gate_peephole_weights * c_tm1)
+    f = self.recurrent_activation(z1 +
+                                  self.forget_gate_peephole_weights * c_tm1)
+    c = f * c_tm1 + i * self.activation(z2)
+    o = self.recurrent_activation(z3 + self.output_gate_peephole_weights * c)
+    return c, o
 
 
 @tf_export('keras.layers.LSTM')
@@ -2262,6 +2407,7 @@ class LSTM(RNN):
         unroll=unroll,
         **kwargs)
     self.activity_regularizer = regularizers.get(activity_regularizer)
+    self.input_spec = [InputSpec(ndim=3)]
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
     self.cell._dropout_mask = None
@@ -2387,6 +2533,427 @@ class LSTM(RNN):
     return cls(**config)
 
 
+class UnifiedLSTM(LSTM):
+  """Long Short-Term Memory layer - Hochreiter 1997.
+
+  `UnifiedLSTM` unifies the implementations between standard `LSTM` layer and
+  `CuDNNLSTM` layer. Based on available runtime hardware and constrains,
+  `UnifiedLSTM` will choose different implementations to maximize the
+  performance. For instance, if GPU is available and all the parameters meet the
+  requirement of CuDNN kernel, `UnifiedLSTM` will use CuDNN kernel for the
+  calculation.
+
+  Arguments:
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+        Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
+          is applied
+        (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use for the recurrent step.
+        Default: hard sigmoid (`hard_sigmoid`). If you pass `None`, no
+          activation is applied
+        (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix, used for
+      the linear transformation of the inputs..
+    recurrent_initializer: Initializer for the `recurrent_kernel` weights
+      matrix, used for the linear transformation of the recurrent state..
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at
+      initialization. Setting it to true will also force
+      `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
+          al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+    kernel_regularizer: Regularizer function applied to the `kernel` weights
+      matrix.
+    recurrent_regularizer: Regularizer function applied to the
+      `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to the output of the
+      layer (its "activation")..
+    kernel_constraint: Constraint function applied to the `kernel` weights
+      matrix.
+    recurrent_constraint: Constraint function applied to the `recurrent_kernel`
+      weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1. Fraction of the units to drop for the linear
+      transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2. Mode 1 will structure
+      its operations as a larger number of smaller dot products and additions,
+      whereas mode 2 will batch them into fewer, larger operations. These modes
+      will have different performance profiles on different hardware and for
+      different applications.
+    return_sequences: Boolean. Whether to return the last output. in the output
+      sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state in addition to the
+      output.
+    go_backwards: Boolean (default False). If True, process the input sequence
+      backwards and return the reversed sequence.
+    stateful: Boolean (default False). If True, the last state for each sample
+      at index i in a batch will be used as initial state for the sample of
+      index i in the following batch.
+    unroll: Boolean (default False). If True, the network will be unrolled, else
+      a symbolic loop will be used. Unrolling can speed-up a RNN, although it
+      tends to be more memory-intensive. Unrolling is only suitable for short
+      sequences.
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               time_major=False,
+               unroll=False,
+               **kwargs):
+    # return_runtime is a flag for testing, which shows the real backend
+    # implementation chosen by grappler in graph mode.
+    self.return_runtime = kwargs.pop('return_runtime', False)
+
+    super(UnifiedLSTM, self).__init__(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        unit_forget_bias=unit_forget_bias,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        time_major=time_major,
+        unroll=unroll,
+        **kwargs)
+
+    self.state_spec = [
+        InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
+    ]
+    self._num_constants = None
+    self._num_inputs = None
+    self._dropout_mask = None
+    self.could_use_cudnn = (
+        activation == 'tanh' and recurrent_dropout == 0 and
+        not unroll and use_bias and bias_regularizer is None)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # LSTM does not support constants. Ignore it during process.
+    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    input_shape = K.int_shape(inputs)
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
+
+    if mask is not None or not self.could_use_cudnn:
+      # CuDNN does not support masking, fall back to use the normal LSTM.
+      kwargs = {'training': training}
+
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
+      last_output, outputs, states = K.rnn(
+          step,
+          inputs,
+          initial_state,
+          constants=None,
+          go_backwards=self.go_backwards,
+          mask=mask,
+          unroll=self.unroll,
+          input_length=timesteps,
+          time_major=self.time_major,
+          zero_output_for_mask=self.zero_output_for_mask)
+      runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+    else:
+      # Use the new defun approach for backend implementation swap.
+      # Note that different implementations need to have same function
+      # signature, eg, the tensor parameters need to have same shape and dtypes.
+      # Since the CuDNN has an extra set of bias, those bias will be passed to
+      # both normal and CuDNN implementations.
+      if self.go_backwards:
+        # Reverse time axis.
+        inputs = K.reverse(inputs, 0 if self.time_major else 1)
+
+      if 0 < self.dropout < 1:
+        if self._dropout_mask is None:
+          self._dropout_mask = _generate_dropout_mask(
+              array_ops.ones_like(inputs),
+              self.dropout,
+              training=training,
+              count=4)
+
+        inputs *= self._dropout_mask[0]
+
+      # Each time a defun function is called, we will give a unique identifiable
+      # API name, so that the grappler won't get confused when it sees multiple
+      # LSTM layer added into same graph, and it will be able to pair up the
+      # different implementations across them.
+      experimental_api_name = 'lstm_' + str(uuid.uuid4())
+      standard_lstm_attributes = {
+          'experimental_api_implements': experimental_api_name,
+          'experimental_api_preferred_device': 'CPU',
+      }
+      cudnn_lstm_attributes = {
+          'experimental_api_implements': experimental_api_name,
+          'experimental_api_preferred_device': 'GPU',
+      }
+      defun_standard_lstm = function.defun_with_attributes(
+          standard_lstm, attributes=standard_lstm_attributes)
+      defun_cudnn_lstm = function.defun_with_attributes(
+          cudnn_lstm, attributes=cudnn_lstm_attributes)
+
+      if ops.executing_eagerly_outside_functions():
+        # Under eager context, the device placement is already known. Prefer the
+        # GPU implementation here.
+        if context.num_gpus() > 0:
+          last_output, outputs, new_h, new_c, runtime = defun_cudnn_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.time_major)
+        else:
+          last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.activation,
+              self.recurrent_activation, self.time_major)
+      else:
+        # Call the normal LSTM impl and register the CuDNN impl function. The
+        # grappler will kick in during session execution to optimize the graph.
+        last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+            inputs, initial_state[0], initial_state[1], self.cell.kernel,
+            self.cell.recurrent_kernel, self.cell.bias, self.activation,
+            self.recurrent_activation, self.time_major)
+
+        function.register(defun_cudnn_lstm, inputs, initial_state[0],
+                          initial_state[1], self.cell.kernel,
+                          self.cell.recurrent_kernel, self.cell.bias,
+                          self.time_major)
+      states = [new_h, new_c]
+
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(state_ops.assign(self.states[i], states[i]))
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
+
+    if self.return_state:
+      return [output] + states
+    elif self.return_runtime:
+      return output, runtime
+    else:
+      return output
+
+  @property
+  def trainable_weights(self):
+    if self.trainable:
+      weights = []
+      weights += self.cell.trainable_weights
+      return weights
+    return []
+
+  @property
+  def non_trainable_weights(self):
+    if not self.trainable:
+      weights = []
+      weights += self.cell.non_trainable_weights
+      return weights
+    return []
+
+  @property
+  def losses(self):
+    losses = []
+    losses += self.cell.losses
+    return losses + self._losses
+
+  @property
+  def updates(self):
+    updates = []
+    updates += self.cell.updates
+    return updates + self._updates
+
+  def get_weights(self):
+    weights = []
+    weights += self.cell.weights
+    return K.batch_get_value(weights)
+
+  def set_weights(self, weights):
+    tuples = []
+    cell_weights = weights[:len(self.cell.weights)]
+    if cell_weights:
+      tuples.append((self.cell.weights, cell_weights))
+    K.batch_set_value(tuples)
+
+
+def _canonical_to_params(weights, biases, shape, transpose_weights=False):
+  """Utility function convert variable to CuDNN compatible parameter.
+
+  Note that Keras weights for kernels are different from the CuDNN format. Eg.:
+
+  ```
+    Keras                 CuDNN
+    [[0, 1, 2],  <--->  [[0, 2, 4],
+     [3, 4, 5]]          [1, 3, 5]]
+  ```
+
+  If the input weights need to be in a unified format, then set
+  `transpose_weights=True` to convert the weights.
+
+  Args:
+    weights: list of weights for the individual kernels and recurrent kernels.
+    biases: list of biases for individual gate.
+    shape: the shape for the converted variables that will be feed to CuDNN.
+    transpose_weights: boolean, whether to transpose the weights.
+
+  Returns:
+    The converted weights that can be feed to CuDNN ops as param.
+  """
+  def convert(w):
+    return array_ops.transpose(w) if transpose_weights else w
+
+  weights = [array_ops.reshape(convert(x), shape) for x in weights]
+  biases = [array_ops.reshape(x, shape) for x in biases]
+  return array_ops.concat(weights + biases, axis=0)
+
+
+def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
+                  activation, recurrent_activation, time_major):
+  """LSTM with standard kernel implementation.
+
+  This implementation can be run on all types for hardware.
+
+  This implementation lifts out all the layer weights and make them function
+  parameters. It has same number of tensor input params as the CuDNN
+  counterpart. The RNN step logic has been simplified, eg dropout and mask is
+  removed since CuDNN implementation does not support that.
+
+  Note that the first half of the bias tensor should be ignored by this impl.
+  The CuDNN impl need an extra set of input gate bias. In order to make the both
+  function take same shape of parameter, that extra set of bias is also feed
+  here.
+
+  Args:
+    inputs: input tensor of LSTM layer.
+    init_h: initial state tensor for the cell output.
+    init_c: initial state tensor for the cell hidden state.
+    kernel: weights for cell kernel.
+    recurrent_kernel: weights for cell recurrent kernel.
+    bias: weights for cell kernel bias and recurrent bias. Only recurrent bias
+      is used in this case.
+    activation: Activation function to use for output.
+    recurrent_activation: Activation function to use for hidden recurrent state.
+    time_major: boolean, whether the inputs are in the format of
+      [time, batch, feature] or [batch, time, feature].
+
+  Returns:
+    last_output: output tensor for the last timestep, which has shape
+      [batch, units].
+    outputs: output tensor for all timesteps, which has shape
+      [batch, time, units].
+    state_0: the cell output, which has same shape as init_h.
+    state_1: the cell hidden state, which has same shape as init_c.
+    runtime: constant string tensor which indicate real runtime hardware. This
+      value is for testing purpose and should be used by user.
+  """
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[0] if time_major else input_shape[1]
+
+  def step(cell_inputs, cell_states):
+    """Step function that will be used by Keras RNN backend."""
+    h_tm1 = cell_states[0]  # previous memory state
+    c_tm1 = cell_states[1]  # previous carry state
+
+    z = K.dot(cell_inputs, kernel)
+    z += K.dot(h_tm1, recurrent_kernel)
+    z = K.bias_add(z, bias)
+
+    z0, z1, z2, z3 = array_ops.split(z, 4, axis=1)
+
+    i = recurrent_activation(z0)
+    f = recurrent_activation(z1)
+    c = f * c_tm1 + i * activation(z2)
+    o = recurrent_activation(z3)
+
+    h = o * activation(c)
+    return h, [h, c]
+
+  last_output, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h, init_c],
+      constants=None,
+      unroll=False,
+      time_major=time_major,
+      input_length=timesteps)
+  return last_output, outputs, new_states[0], new_states[
+      1], constant_op.constant('cpu', dtype=dtypes.string, name='runtime')
+
+
+def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
+               time_major):
+  """LSTM with CuDNN implementation which is only available for GPU."""
+  if not time_major:
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  input_h = array_ops.expand_dims(input_h, axis=0)
+  input_c = array_ops.expand_dims(input_c, axis=0)
+
+  weights = array_ops.split(kernel, 4, axis=1)
+  weights += array_ops.split(recurrent_kernel, 4, axis=1)
+  # CuDNN has an extra set of bias for inputs, we disable them (setting to 0),
+  # so that mathematically it is same as the canonical LSTM implementation.
+  full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0)
+
+  params = _canonical_to_params(
+      weights=weights,
+      biases=array_ops.split(full_bias, 8),
+      shape=constant_op.constant([-1]),
+      transpose_weights=True)
+
+  outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs, input_h=input_h, input_c=input_c, params=params, is_training=True)
+  last_output = outputs[-1]
+  if not time_major:
+    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  c = c[0]
+
+  return last_output, outputs, h, c, constant_op.constant(
+      'cudnn', dtype=dtypes.string, name='runtime')
+
+
 def _generate_dropout_mask(ones, rate, training=None, count=1):
   def dropped_inputs():
     return K.dropout(ones, rate)
@@ -2399,7 +2966,8 @@ def _generate_dropout_mask(ones, rate, training=None, count=1):
   return K.in_train_phase(dropped_inputs, ones, training=training)
 
 
-def _standardize_args(inputs, initial_state, constants, num_constants):
+def _standardize_args(
+    inputs, initial_state, constants, num_constants, num_inputs=1):
   """Standardizes `__call__` to a single list of tensor inputs.
 
   When running a model loaded from a file, the input tensors
@@ -2415,20 +2983,40 @@ def _standardize_args(inputs, initial_state, constants, num_constants):
       constants: Tensor or list of tensors or None, constant tensors.
       num_constants: Expected number of constants (if constants are passed as
         part of the `inputs` list.
+      num_inputs: Expected number of real input tensors (exclude initial_states
+        and constants).
 
   Returns:
-      inputs: Single tensor.
+      inputs: Single tensor or tuple of tensors.
       initial_state: List of tensors or None.
       constants: List of tensors or None.
   """
   if isinstance(inputs, list):
+    # There are several situations here:
+    # In the graph mode, __call__ will be only called once. The initial_state
+    # and constants could be in inputs (from file loading).
+    # In the eager mode, __call__ will be called twice, once during
+    # rnn_layer(inputs=input_t, constants=c_t, ...), and second time will be
+    # model.fit/train_on_batch/predict with real np data. In the second case,
+    # the inputs will contain initial_state and constants, and more importantly,
+    # the real inputs will be in a flat list, instead of nested tuple.
+    #
+    # For either case, we will use num_inputs to split the input list, and
+    # restructure the real input into tuple.
     assert initial_state is None and constants is None
     if num_constants is not None:
       constants = inputs[-num_constants:]
       inputs = inputs[:-num_constants]
+    if num_inputs is None:
+      num_inputs = 1
+    if len(inputs) > num_inputs:
+      initial_state = inputs[num_inputs:]
+      inputs = inputs[:num_inputs]
+
     if len(inputs) > 1:
-      initial_state = inputs[1:]
-    inputs = inputs[0]
+      inputs = tuple(inputs)
+    else:
+      inputs = inputs[0]
 
   def to_list_or_none(x):
     if x is None or isinstance(x, list):
@@ -2458,19 +3046,17 @@ def _generate_zero_filled_state_for_cell(cell, inputs, batch_size, dtype):
 
 def _generate_zero_filled_state(batch_size_tensor, state_size, dtype):
   """Generate a zero filled tensor with shape [batch_size, state_size]."""
-  if None in [batch_size_tensor, dtype]:
+  if batch_size_tensor is None or dtype is None:
     raise ValueError(
         'batch_size and dtype cannot be None while constructing initial state: '
         'batch_size={}, dtype={}'.format(batch_size_tensor, dtype))
-  if _is_multiple_state(state_size):
-    states = []
-    for dims in state_size:
-      flat_dims = tensor_shape.as_shape(dims).as_list()
-      init_state_size = [batch_size_tensor] + flat_dims
-      init_state = array_ops.zeros(init_state_size, dtype=dtype)
-      states.append(init_state)
-    return states
-  else:
-    flat_dims = tensor_shape.as_shape(state_size).as_list()
+
+  def create_zeros(unnested_state_size):
+    flat_dims = tensor_shape.as_shape(unnested_state_size).as_list()
     init_state_size = [batch_size_tensor] + flat_dims
     return array_ops.zeros(init_state_size, dtype=dtype)
+
+  if nest.is_sequence(state_size):
+    return nest.map_structure(create_zeros, state_size)
+  else:
+    return create_zeros(state_size)
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index d246be6b45a48abb536ff6479b05c2760502d416..b1449069e3279e27b08ecc383e72aed63525e521 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -21,18 +21,34 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
+from tensorflow.python.training import rmsprop
 from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.util import nest
+
+# Used for nested input/output/state RNN test.
+NestedInput = collections.namedtuple('NestedInput', ['t1', 't2'])
+NestedState = collections.namedtuple('NestedState', ['s1', 's2'])
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class RNNTest(test.TestCase):
 
   def test_minimal_rnn_cell_non_layer(self):
@@ -50,25 +66,26 @@ class RNNTest(test.TestCase):
         output = keras.backend.dot(inputs, self.kernel) + prev_output
         return output, [output]
 
-    with self.cached_session():
-      # Basic test case.
-      cell = MinimalRNNCell(32, 5)
-      x = keras.Input((None, 5))
-      layer = keras.layers.RNN(cell)
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
-      # Test stacking.
-      cells = [MinimalRNNCell(8, 5),
-               MinimalRNNCell(32, 8),
-               MinimalRNNCell(32, 32)]
-      layer = keras.layers.RNN(cells)
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+    # Basic test case.
+    cell = MinimalRNNCell(32, 5)
+    x = keras.Input((None, 5))
+    layer = keras.layers.RNN(cell)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+    # Test stacking.
+    cells = [MinimalRNNCell(8, 5),
+             MinimalRNNCell(32, 8),
+             MinimalRNNCell(32, 32)]
+    layer = keras.layers.RNN(cells)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
   def test_minimal_rnn_cell_non_layer_multiple_states(self):
 
@@ -88,27 +105,28 @@ class RNNTest(test.TestCase):
         output -= prev_output_2
         return output, [output * 2, output * 3]
 
-    with self.cached_session():
-      # Basic test case.
-      cell = MinimalRNNCell(32, 5)
-      x = keras.Input((None, 5))
-      layer = keras.layers.RNN(cell)
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
-      # Test stacking.
-      cells = [MinimalRNNCell(8, 5),
-               MinimalRNNCell(16, 8),
-               MinimalRNNCell(32, 16)]
-      layer = keras.layers.RNN(cells)
-      self.assertEqual(layer.cell.state_size, (8, 8, 16, 16, 32, 32))
-      self.assertEqual(layer.cell.output_size, 32)
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+    # Basic test case.
+    cell = MinimalRNNCell(32, 5)
+    x = keras.Input((None, 5))
+    layer = keras.layers.RNN(cell)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+    # Test stacking.
+    cells = [MinimalRNNCell(8, 5),
+             MinimalRNNCell(16, 8),
+             MinimalRNNCell(32, 16)]
+    layer = keras.layers.RNN(cells)
+    self.assertEqual(layer.cell.state_size, ((8, 8), (16, 16), (32, 32)))
+    self.assertEqual(layer.cell.output_size, 32)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
   def test_minimal_rnn_cell_layer(self):
 
@@ -140,51 +158,52 @@ class RNNTest(test.TestCase):
         base_config = super(MinimalRNNCell, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
-    with self.cached_session():
-      # Test basic case.
-      x = keras.Input((None, 5))
-      cell = MinimalRNNCell(32)
-      layer = keras.layers.RNN(cell)
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
-      # Test basic case serialization.
-      x_np = np.random.random((6, 5, 5))
-      y_np = model.predict(x_np)
-      weights = model.get_weights()
-      config = layer.get_config()
-      with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}):
-        layer = keras.layers.RNN.from_config(config)
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.set_weights(weights)
-      y_np_2 = model.predict(x_np)
-      self.assertAllClose(y_np, y_np_2, atol=1e-4)
-
-      # Test stacking.
-      cells = [MinimalRNNCell(8),
-               MinimalRNNCell(12),
-               MinimalRNNCell(32)]
-      layer = keras.layers.RNN(cells)
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
-
-      # Test stacked RNN serialization.
-      x_np = np.random.random((6, 5, 5))
-      y_np = model.predict(x_np)
-      weights = model.get_weights()
-      config = layer.get_config()
-      with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}):
-        layer = keras.layers.RNN.from_config(config)
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.set_weights(weights)
-      y_np_2 = model.predict(x_np)
-      self.assertAllClose(y_np, y_np_2, atol=1e-4)
+    # Test basic case.
+    x = keras.Input((None, 5))
+    cell = MinimalRNNCell(32)
+    layer = keras.layers.RNN(cell)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+    # Test basic case serialization.
+    x_np = np.random.random((6, 5, 5))
+    y_np = model.predict(x_np)
+    weights = model.get_weights()
+    config = layer.get_config()
+    with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}):
+      layer = keras.layers.RNN.from_config(config)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.set_weights(weights)
+    y_np_2 = model.predict(x_np)
+    self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+    # Test stacking.
+    cells = [MinimalRNNCell(8),
+             MinimalRNNCell(12),
+             MinimalRNNCell(32)]
+    layer = keras.layers.RNN(cells)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+    # Test stacked RNN serialization.
+    x_np = np.random.random((6, 5, 5))
+    y_np = model.predict(x_np)
+    weights = model.get_weights()
+    config = layer.get_config()
+    with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}):
+      layer = keras.layers.RNN.from_config(config)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.set_weights(weights)
+    y_np_2 = model.predict(x_np)
+    self.assertAllClose(y_np, y_np_2, atol=1e-4)
 
   def test_rnn_with_time_major(self):
     batch = 10
@@ -192,89 +211,89 @@ class RNNTest(test.TestCase):
     embedding_dim = 4
     units = 3
 
-    with self.cached_session():
-      # Test basic case.
-      x = keras.Input((time_step, embedding_dim))
-      time_major_x = keras.layers.Lambda(
-          lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
-      layer = keras.layers.SimpleRNN(
-          units, time_major=True, return_sequences=True)
-      self.assertEqual(
-          layer.compute_output_shape((time_step, None,
-                                      embedding_dim)).as_list(),
-          [time_step, None, units])
-      y = layer(time_major_x)
-      self.assertEqual(layer.output_shape, (time_step, None, units))
-
-      y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
-
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          np.zeros((batch, time_step, embedding_dim)),
-          np.zeros((batch, time_step, units)))
-
-    with self.cached_session():
-      # Test stacking.
-      x = keras.Input((time_step, embedding_dim))
-      time_major_x = keras.layers.Lambda(
-          lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
-      cell_units = [10, 8, 6]
-      cells = [keras.layers.SimpleRNNCell(cell_units[i]) for i in range(3)]
-      layer = keras.layers.RNN(cells, time_major=True, return_sequences=True)
-      y = layer(time_major_x)
-      self.assertEqual(layer.output_shape, (time_step, None, cell_units[-1]))
-
-      y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          np.zeros((batch, time_step, embedding_dim)),
-          np.zeros((batch, time_step, cell_units[-1])))
-
-    with self.cached_session():
-      # Test masking.
-      x = keras.Input((time_step, embedding_dim))
-      time_major = keras.layers.Lambda(
-          lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
-      mask = keras.layers.Masking()(time_major)
-      rnn = keras.layers.SimpleRNN(
-          units, time_major=True, return_sequences=True)(mask)
-      y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(rnn)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          np.zeros((batch, time_step, embedding_dim)),
-          np.zeros((batch, time_step, units)))
-
-    with self.cached_session():
-      # Test layer output
-      x = keras.Input((time_step, embedding_dim))
-      rnn_1 = keras.layers.SimpleRNN(units, return_sequences=True)
-      y = rnn_1(x)
-
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          np.zeros((batch, time_step, embedding_dim)),
-          np.zeros((batch, time_step, units)))
-
-      x_np = np.random.random((batch, time_step, embedding_dim))
-      y_np_1 = model.predict(x_np)
-
-      time_major = keras.layers.Lambda(
-          lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
-      rnn_2 = keras.layers.SimpleRNN(
-          units, time_major=True, return_sequences=True)
-      y_2 = rnn_2(time_major)
-      y_2 = keras.layers.Lambda(
-          lambda t: array_ops.transpose(t, [1, 0, 2]))(y_2)
-
-      model_2 = keras.models.Model(x, y_2)
-      rnn_2.set_weights(rnn_1.get_weights())
-
-      y_np_2 = model_2.predict(x_np)
-      self.assertAllClose(y_np_1, y_np_2, atol=1e-4)
+    # Test basic case.
+    x = keras.Input((time_step, embedding_dim))
+    time_major_x = keras.layers.Lambda(
+        lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
+    layer = keras.layers.SimpleRNN(
+        units, time_major=True, return_sequences=True)
+    self.assertEqual(
+        layer.compute_output_shape((time_step, None,
+                                    embedding_dim)).as_list(),
+        [time_step, None, units])
+    y = layer(time_major_x)
+    self.assertEqual(layer.output_shape, (time_step, None, units))
+
+    y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
+
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        np.zeros((batch, time_step, embedding_dim)),
+        np.zeros((batch, time_step, units)))
+
+    # Test stacking.
+    x = keras.Input((time_step, embedding_dim))
+    time_major_x = keras.layers.Lambda(
+        lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
+    cell_units = [10, 8, 6]
+    cells = [keras.layers.SimpleRNNCell(cell_units[i]) for i in range(3)]
+    layer = keras.layers.RNN(cells, time_major=True, return_sequences=True)
+    y = layer(time_major_x)
+    self.assertEqual(layer.output_shape, (time_step, None, cell_units[-1]))
+
+    y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        np.zeros((batch, time_step, embedding_dim)),
+        np.zeros((batch, time_step, cell_units[-1])))
+
+    # Test masking.
+    x = keras.Input((time_step, embedding_dim))
+    time_major = keras.layers.Lambda(
+        lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
+    mask = keras.layers.Masking()(time_major)
+    rnn = keras.layers.SimpleRNN(
+        units, time_major=True, return_sequences=True)(mask)
+    y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(rnn)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        np.zeros((batch, time_step, embedding_dim)),
+        np.zeros((batch, time_step, units)))
+
+    # Test layer output
+    x = keras.Input((time_step, embedding_dim))
+    rnn_1 = keras.layers.SimpleRNN(units, return_sequences=True)
+    y = rnn_1(x)
+
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        np.zeros((batch, time_step, embedding_dim)),
+        np.zeros((batch, time_step, units)))
+
+    x_np = np.random.random((batch, time_step, embedding_dim))
+    y_np_1 = model.predict(x_np)
+
+    time_major = keras.layers.Lambda(
+        lambda t: array_ops.transpose(t, [1, 0, 2]))(x)
+    rnn_2 = keras.layers.SimpleRNN(
+        units, time_major=True, return_sequences=True)
+    y_2 = rnn_2(time_major)
+    y_2 = keras.layers.Lambda(
+        lambda t: array_ops.transpose(t, [1, 0, 2]))(y_2)
+
+    model_2 = keras.models.Model(x, y_2)
+    rnn_2.set_weights(rnn_1.get_weights())
+
+    y_np_2 = model_2.predict(x_np)
+    self.assertAllClose(y_np_1, y_np_2, atol=1e-4)
 
   def test_rnn_cell_with_constants_layer(self):
 
@@ -319,89 +338,86 @@ class RNNTest(test.TestCase):
         base_config = super(RNNCellWithConstants, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
-    with self.cached_session():
-      # Test basic case.
-      x = keras.Input((None, 5))
-      c = keras.Input((3,))
-      cell = RNNCellWithConstants(32)
-      layer = keras.layers.RNN(cell)
-      y = layer(x, constants=c)
-
-      model = keras.models.Model([x, c], y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          [np.zeros((6, 5, 5)), np.zeros((6, 3))],
-          np.zeros((6, 32))
-      )
-
-    with self.cached_session():
-      # Test basic case serialization.
-      x_np = np.random.random((6, 5, 5))
-      c_np = np.random.random((6, 3))
-      y_np = model.predict([x_np, c_np])
-      weights = model.get_weights()
-      config = layer.get_config()
-      custom_objects = {'RNNCellWithConstants': RNNCellWithConstants}
-      with keras.utils.CustomObjectScope(custom_objects):
-        layer = keras.layers.RNN.from_config(config.copy())
-      y = layer(x, constants=c)
-      model = keras.models.Model([x, c], y)
-      model.set_weights(weights)
-      y_np_2 = model.predict([x_np, c_np])
-      self.assertAllClose(y_np, y_np_2, atol=1e-4)
-
-    with self.cached_session():
-      # test flat list inputs.
-      with keras.utils.CustomObjectScope(custom_objects):
-        layer = keras.layers.RNN.from_config(config.copy())
-      y = layer([x, c])
-      model = keras.models.Model([x, c], y)
-      model.set_weights(weights)
-      y_np_3 = model.predict([x_np, c_np])
-      self.assertAllClose(y_np, y_np_3, atol=1e-4)
-
-    with self.cached_session():
-      # Test stacking.
-      cells = [keras.layers.recurrent.GRUCell(8),
-               RNNCellWithConstants(12),
-               RNNCellWithConstants(32)]
-      layer = keras.layers.recurrent.RNN(cells)
-      y = layer(x, constants=c)
-      model = keras.models.Model([x, c], y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          [np.zeros((6, 5, 5)), np.zeros((6, 3))],
-          np.zeros((6, 32))
-      )
-
-    with self.cached_session():
-      # Test GRUCell reset_after property.
-      x = keras.Input((None, 5))
-      c = keras.Input((3,))
-      cells = [keras.layers.recurrent.GRUCell(32, reset_after=True)]
-      layer = keras.layers.recurrent.RNN(cells)
-      y = layer(x, constants=c)
-      model = keras.models.Model([x, c], y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          [np.zeros((6, 5, 5)), np.zeros((6, 3))],
-          np.zeros((6, 32))
-      )
-
-    with self.cached_session():
-      # Test stacked RNN serialization
-      x_np = np.random.random((6, 5, 5))
-      c_np = np.random.random((6, 3))
-      y_np = model.predict([x_np, c_np])
-      weights = model.get_weights()
-      config = layer.get_config()
-      with keras.utils.CustomObjectScope(custom_objects):
-        layer = keras.layers.recurrent.RNN.from_config(config.copy())
-      y = layer(x, constants=c)
-      model = keras.models.Model([x, c], y)
-      model.set_weights(weights)
-      y_np_2 = model.predict([x_np, c_np])
-      self.assertAllClose(y_np, y_np_2, atol=1e-4)
+    # Test basic case.
+    x = keras.Input((None, 5))
+    c = keras.Input((3,))
+    cell = RNNCellWithConstants(32)
+    layer = keras.layers.RNN(cell)
+    y = layer(x, constants=c)
+
+    model = keras.models.Model([x, c], y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((6, 5, 5)), np.zeros((6, 3))],
+        np.zeros((6, 32))
+    )
+
+    # Test basic case serialization.
+    x_np = np.random.random((6, 5, 5))
+    c_np = np.random.random((6, 3))
+    y_np = model.predict([x_np, c_np])
+    weights = model.get_weights()
+    config = layer.get_config()
+    custom_objects = {'RNNCellWithConstants': RNNCellWithConstants}
+    with keras.utils.CustomObjectScope(custom_objects):
+      layer = keras.layers.RNN.from_config(config.copy())
+    y = layer(x, constants=c)
+    model = keras.models.Model([x, c], y)
+    model.set_weights(weights)
+    y_np_2 = model.predict([x_np, c_np])
+    self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+    # test flat list inputs.
+    with keras.utils.CustomObjectScope(custom_objects):
+      layer = keras.layers.RNN.from_config(config.copy())
+    y = layer([x, c])
+    model = keras.models.Model([x, c], y)
+    model.set_weights(weights)
+    y_np_3 = model.predict([x_np, c_np])
+    self.assertAllClose(y_np, y_np_3, atol=1e-4)
+
+    # Test stacking.
+    cells = [keras.layers.recurrent.GRUCell(8),
+             RNNCellWithConstants(12),
+             RNNCellWithConstants(32)]
+    layer = keras.layers.recurrent.RNN(cells)
+    y = layer(x, constants=c)
+    model = keras.models.Model([x, c], y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((6, 5, 5)), np.zeros((6, 3))],
+        np.zeros((6, 32))
+    )
+
+    # Test GRUCell reset_after property.
+    x = keras.Input((None, 5))
+    c = keras.Input((3,))
+    cells = [keras.layers.recurrent.GRUCell(32, reset_after=True)]
+    layer = keras.layers.recurrent.RNN(cells)
+    y = layer(x, constants=c)
+    model = keras.models.Model([x, c], y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((6, 5, 5)), np.zeros((6, 3))],
+        np.zeros((6, 32))
+    )
+
+    # Test stacked RNN serialization
+    x_np = np.random.random((6, 5, 5))
+    c_np = np.random.random((6, 3))
+    y_np = model.predict([x_np, c_np])
+    weights = model.get_weights()
+    config = layer.get_config()
+    with keras.utils.CustomObjectScope(custom_objects):
+      layer = keras.layers.recurrent.RNN.from_config(config.copy())
+    y = layer(x, constants=c)
+    model = keras.models.Model([x, c], y)
+    model.set_weights(weights)
+    y_np_2 = model.predict([x_np, c_np])
+    self.assertAllClose(y_np, y_np_2, atol=1e-4)
 
   def test_rnn_cell_with_constants_layer_passing_initial_state(self):
 
@@ -446,54 +462,55 @@ class RNNTest(test.TestCase):
         base_config = super(RNNCellWithConstants, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
 
-    with self.cached_session():
-      # Test basic case.
-      x = keras.Input((None, 5))
-      c = keras.Input((3,))
-      s = keras.Input((32,))
-      cell = RNNCellWithConstants(32)
-      layer = keras.layers.RNN(cell)
-      y = layer(x, initial_state=s, constants=c)
-      model = keras.models.Model([x, s, c], y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))],
-          np.zeros((6, 32))
-      )
-
-    with self.cached_session():
-      # Test basic case serialization.
-      x_np = np.random.random((6, 5, 5))
-      s_np = np.random.random((6, 32))
-      c_np = np.random.random((6, 3))
-      y_np = model.predict([x_np, s_np, c_np])
-      weights = model.get_weights()
-      config = layer.get_config()
-      custom_objects = {'RNNCellWithConstants': RNNCellWithConstants}
-      with keras.utils.CustomObjectScope(custom_objects):
-        layer = keras.layers.RNN.from_config(config.copy())
-      y = layer(x, initial_state=s, constants=c)
-      model = keras.models.Model([x, s, c], y)
-      model.set_weights(weights)
-      y_np_2 = model.predict([x_np, s_np, c_np])
-      self.assertAllClose(y_np, y_np_2, atol=1e-4)
-
-      # verify that state is used
-      y_np_2_different_s = model.predict([x_np, s_np + 10., c_np])
-      with self.assertRaises(AssertionError):
-        self.assertAllClose(y_np, y_np_2_different_s, atol=1e-4)
-
-    with self.cached_session():
-      # test flat list inputs
-      with keras.utils.CustomObjectScope(custom_objects):
-        layer = keras.layers.RNN.from_config(config.copy())
-      y = layer([x, s, c])
-      model = keras.models.Model([x, s, c], y)
-      model.set_weights(weights)
-      y_np_3 = model.predict([x_np, s_np, c_np])
-      self.assertAllClose(y_np, y_np_3, atol=1e-4)
+    # Test basic case.
+    x = keras.Input((None, 5))
+    c = keras.Input((3,))
+    s = keras.Input((32,))
+    cell = RNNCellWithConstants(32)
+    layer = keras.layers.RNN(cell)
+    y = layer(x, initial_state=s, constants=c)
+    model = keras.models.Model([x, s, c], y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))],
+        np.zeros((6, 32))
+    )
+
+    # Test basic case serialization.
+    x_np = np.random.random((6, 5, 5))
+    s_np = np.random.random((6, 32))
+    c_np = np.random.random((6, 3))
+    y_np = model.predict([x_np, s_np, c_np])
+    weights = model.get_weights()
+    config = layer.get_config()
+    custom_objects = {'RNNCellWithConstants': RNNCellWithConstants}
+    with keras.utils.CustomObjectScope(custom_objects):
+      layer = keras.layers.RNN.from_config(config.copy())
+    y = layer(x, initial_state=s, constants=c)
+    model = keras.models.Model([x, s, c], y)
+    model.set_weights(weights)
+    y_np_2 = model.predict([x_np, s_np, c_np])
+    self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+    # verify that state is used
+    y_np_2_different_s = model.predict([x_np, s_np + 10., c_np])
+    with self.assertRaises(AssertionError):
+      self.assertAllClose(y_np, y_np_2_different_s, atol=1e-4)
+
+    # test flat list inputs
+    with keras.utils.CustomObjectScope(custom_objects):
+      layer = keras.layers.RNN.from_config(config.copy())
+    y = layer([x, s, c])
+    model = keras.models.Model([x, s, c], y)
+    model.set_weights(weights)
+    y_np_3 = model.predict([x_np, s_np, c_np])
+    self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
   def test_stacked_rnn_attributes(self):
+    if context.executing_eagerly():
+      self.skipTest('reduce_sum is not available in eager mode.')
+
     cells = [keras.layers.LSTMCell(1),
              keras.layers.LSTMCell(1)]
     layer = keras.layers.RNN(cells)
@@ -558,67 +575,67 @@ class RNNTest(test.TestCase):
     timesteps = 2
     num_samples = 2
 
-    with self.cached_session():
-      input1 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
-      layer = layer_class(units,
-                          return_state=True,
-                          return_sequences=True,
-                          dropout=0.2)
-      state = layer(input1)[1:]
+    input1 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = layer_class(units,
+                        return_state=True,
+                        return_sequences=True,
+                        dropout=0.2)
+    state = layer(input1)[1:]
 
-      input2 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
-      output = layer_class(units)(input2, initial_state=state)
-      model = keras.Model([input1, input2], output)
+    input2 = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    output = layer_class(units)(input2, initial_state=state)
+    model = keras.Model([input1, input2], output)
 
-      inputs = [np.random.random((num_samples, timesteps, embedding_dim)),
-                np.random.random((num_samples, timesteps, embedding_dim))]
-      model.predict(inputs)
+    inputs = [np.random.random((num_samples, timesteps, embedding_dim)),
+              np.random.random((num_samples, timesteps, embedding_dim))]
+    model.predict(inputs)
 
   def test_builtin_rnn_cell_serialization(self):
     for cell_class in [keras.layers.SimpleRNNCell,
                        keras.layers.GRUCell,
                        keras.layers.LSTMCell]:
-      with self.cached_session():
-        # Test basic case.
-        x = keras.Input((None, 5))
-        cell = cell_class(32)
-        layer = keras.layers.RNN(cell)
-        y = layer(x)
-        model = keras.models.Model(x, y)
-        model.compile(optimizer='rmsprop', loss='mse')
-
-        # Test basic case serialization.
-        x_np = np.random.random((6, 5, 5))
-        y_np = model.predict(x_np)
-        weights = model.get_weights()
-        config = layer.get_config()
-        layer = keras.layers.RNN.from_config(config)
-        y = layer(x)
-        model = keras.models.Model(x, y)
-        model.set_weights(weights)
-        y_np_2 = model.predict(x_np)
-        self.assertAllClose(y_np, y_np_2, atol=1e-4)
-
-        # Test stacking.
-        cells = [cell_class(8),
-                 cell_class(12),
-                 cell_class(32)]
-        layer = keras.layers.RNN(cells)
-        y = layer(x)
-        model = keras.models.Model(x, y)
-        model.compile(optimizer='rmsprop', loss='mse')
-
-        # Test stacked RNN serialization.
-        x_np = np.random.random((6, 5, 5))
-        y_np = model.predict(x_np)
-        weights = model.get_weights()
-        config = layer.get_config()
-        layer = keras.layers.RNN.from_config(config)
-        y = layer(x)
-        model = keras.models.Model(x, y)
-        model.set_weights(weights)
-        y_np_2 = model.predict(x_np)
-        self.assertAllClose(y_np, y_np_2, atol=1e-4)
+      # Test basic case.
+      x = keras.Input((None, 5))
+      cell = cell_class(32)
+      layer = keras.layers.RNN(cell)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                    loss='mse')
+
+      # Test basic case serialization.
+      x_np = np.random.random((6, 5, 5))
+      y_np = model.predict(x_np)
+      weights = model.get_weights()
+      config = layer.get_config()
+      layer = keras.layers.RNN.from_config(config)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.set_weights(weights)
+      y_np_2 = model.predict(x_np)
+      self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+      # Test stacking.
+      cells = [cell_class(8),
+               cell_class(12),
+               cell_class(32)]
+      layer = keras.layers.RNN(cells)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                    loss='mse')
+
+      # Test stacked RNN serialization.
+      x_np = np.random.random((6, 5, 5))
+      y_np = model.predict(x_np)
+      weights = model.get_weights()
+      config = layer.get_config()
+      layer = keras.layers.RNN.from_config(config)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.set_weights(weights)
+      y_np_2 = model.predict(x_np)
+      self.assertAllClose(y_np, y_np_2, atol=1e-4)
 
   def DISABLED_test_stacked_rnn_dropout(self):
     # Temporarily disabled test due an occasional Grappler segfault.
@@ -627,14 +644,13 @@ class RNNTest(test.TestCase):
              keras.layers.LSTMCell(3, dropout=0.1, recurrent_dropout=0.1)]
     layer = keras.layers.RNN(cells)
 
-    with self.cached_session():
-      x = keras.Input((None, 5))
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.compile('sgd', 'mse')
-      x_np = np.random.random((6, 5, 5))
-      y_np = np.random.random((6, 3))
-      model.train_on_batch(x_np, y_np)
+    x = keras.Input((None, 5))
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile('sgd', 'mse')
+    x_np = np.random.random((6, 5, 5))
+    y_np = np.random.random((6, 3))
+    model.train_on_batch(x_np, y_np)
 
   def test_stacked_rnn_compute_output_shape(self):
     cells = [keras.layers.LSTMCell(3),
@@ -669,62 +685,65 @@ class RNNTest(test.TestCase):
 
   def test_checkpointable_dependencies(self):
     rnn = keras.layers.SimpleRNN
-    with self.cached_session():
-      x = np.random.random((2, 2, 2))
-      y = np.random.random((2, 2))
-      model = keras.models.Sequential()
-      model.add(rnn(2))
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.fit(x, y, epochs=1, batch_size=1)
-
-      # check whether the model variables are present in the
-      # checkpointable list of objects
-      checkpointed_objects = set(checkpointable_util.list_objects(model))
-      for v in model.variables:
-        self.assertIn(v, checkpointed_objects)
+    x = np.random.random((2, 2, 2))
+    y = np.random.random((2, 2))
+    model = keras.models.Sequential()
+    model.add(rnn(2))
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.fit(x, y, epochs=1, batch_size=1)
+
+    # check whether the model variables are present in the
+    # checkpointable list of objects
+    checkpointed_objects = set(checkpointable_util.list_objects(model))
+    for v in model.variables:
+      self.assertIn(v, checkpointed_objects)
 
   def test_high_dimension_RNN(self):
-    with self.cached_session():
-      # Basic test case.
-      unit_a = 10
-      unit_b = 20
-      input_a = 5
-      input_b = 10
-      batch = 32
-      time_step = 4
-
-      cell = Minimal2DRNNCell(unit_a, unit_b)
-      x = keras.Input((None, input_a, input_b))
-      layer = keras.layers.RNN(cell)
-      y = layer(x)
+    # Basic test case.
+    unit_a = 10
+    unit_b = 20
+    input_a = 5
+    input_b = 10
+    batch = 32
+    time_step = 4
+
+    cell = Minimal2DRNNCell(unit_a, unit_b)
+    x = keras.Input((None, input_a, input_b))
+    layer = keras.layers.RNN(cell)
+    y = layer(x)
+
+    self.assertEqual(cell.state_size.as_list(), [unit_a, unit_b])
 
-      self.assertEqual(cell.state_size.as_list(), [unit_a, unit_b])
+    if not context.executing_eagerly():
       init_state = layer.get_initial_state(x)
       self.assertEqual(len(init_state), 1)
       self.assertEqual(init_state[0].get_shape().as_list(),
                        [None, unit_a, unit_b])
 
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          np.zeros((batch, time_step, input_a, input_b)),
-          np.zeros((batch, unit_a, unit_b)))
-      self.assertEqual(model.output_shape, (None, unit_a, unit_b))
-
-      # Test stacking.
-      cells = [
-          Minimal2DRNNCell(unit_a, unit_b),
-          Minimal2DRNNCell(unit_a * 2, unit_b * 2),
-          Minimal2DRNNCell(unit_a * 4, unit_b * 4)
-      ]
-      layer = keras.layers.RNN(cells)
-      y = layer(x)
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          np.zeros((batch, time_step, input_a, input_b)),
-          np.zeros((batch, unit_a * 4, unit_b * 4)))
-      self.assertEqual(model.output_shape, (None, unit_a * 4, unit_b * 4))
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        np.zeros((batch, time_step, input_a, input_b)),
+        np.zeros((batch, unit_a, unit_b)))
+    self.assertEqual(model.output_shape, (None, unit_a, unit_b))
+
+    # Test stacking.
+    cells = [
+        Minimal2DRNNCell(unit_a, unit_b),
+        Minimal2DRNNCell(unit_a * 2, unit_b * 2),
+        Minimal2DRNNCell(unit_a * 4, unit_b * 4)
+    ]
+    layer = keras.layers.RNN(cells)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        np.zeros((batch, time_step, input_a, input_b)),
+        np.zeros((batch, unit_a * 4, unit_b * 4)))
+    self.assertEqual(model.output_shape, (None, unit_a * 4, unit_b * 4))
 
   def test_high_dimension_RNN_with_init_state(self):
     unit_a = 10
@@ -734,57 +753,57 @@ class RNNTest(test.TestCase):
     batch = 32
     time_step = 4
 
-    with self.cached_session():
-      # Basic test case.
-      cell = Minimal2DRNNCell(unit_a, unit_b)
-      x = keras.Input((None, input_a, input_b))
-      s = keras.Input((unit_a, unit_b))
-      layer = keras.layers.RNN(cell)
-      y = layer(x, initial_state=s)
-
-      model = keras.models.Model([x, s], y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch([
-          np.zeros((batch, time_step, input_a, input_b)),
-          np.zeros((batch, unit_a, unit_b))
-      ], np.zeros((batch, unit_a, unit_b)))
-      self.assertEqual(model.output_shape, (None, unit_a, unit_b))
-
-    with self.cached_session():
-      # Bad init state shape.
-      bad_shape_a = unit_a * 2
-      bad_shape_b = unit_b * 2
-      cell = Minimal2DRNNCell(unit_a, unit_b)
-      x = keras.Input((None, input_a, input_b))
-      s = keras.Input((bad_shape_a, bad_shape_b))
-      layer = keras.layers.RNN(cell)
-      with self.assertRaisesWithPredicateMatch(ValueError,
-                                               'however `cell.state_size` is'):
-        layer(x, initial_state=s)
+    # Basic test case.
+    cell = Minimal2DRNNCell(unit_a, unit_b)
+    x = keras.Input((None, input_a, input_b))
+    s = keras.Input((unit_a, unit_b))
+    layer = keras.layers.RNN(cell)
+    y = layer(x, initial_state=s)
+
+    model = keras.models.Model([x, s], y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch([
+        np.zeros((batch, time_step, input_a, input_b)),
+        np.zeros((batch, unit_a, unit_b))
+    ], np.zeros((batch, unit_a, unit_b)))
+    self.assertEqual(model.output_shape, (None, unit_a, unit_b))
+
+    # Bad init state shape.
+    bad_shape_a = unit_a * 2
+    bad_shape_b = unit_b * 2
+    cell = Minimal2DRNNCell(unit_a, unit_b)
+    x = keras.Input((None, input_a, input_b))
+    s = keras.Input((bad_shape_a, bad_shape_b))
+    layer = keras.layers.RNN(cell)
+    with self.assertRaisesWithPredicateMatch(ValueError,
+                                             'however `cell.state_size` is'):
+      layer(x, initial_state=s)
 
   def test_inconsistent_output_state_size(self):
-    with self.cached_session():
-      batch = 32
-      time_step = 4
-      state_size = 5
-      input_size = 6
-      cell = PlusOneRNNCell(state_size)
-      x = keras.Input((None, input_size))
-      layer = keras.layers.RNN(cell)
-      y = layer(x)
-
-      self.assertEqual(cell.state_size, state_size)
+    batch = 32
+    time_step = 4
+    state_size = 5
+    input_size = 6
+    cell = PlusOneRNNCell(state_size)
+    x = keras.Input((None, input_size))
+    layer = keras.layers.RNN(cell)
+    y = layer(x)
+
+    self.assertEqual(cell.state_size, state_size)
+    if not context.executing_eagerly():
       init_state = layer.get_initial_state(x)
       self.assertEqual(len(init_state), 1)
       self.assertEqual(init_state[0].get_shape().as_list(),
                        [None, state_size])
 
-      model = keras.models.Model(x, y)
-      model.compile(optimizer='rmsprop', loss='mse')
-      model.train_on_batch(
-          np.zeros((batch, time_step, input_size)),
-          np.zeros((batch, input_size)))
-      self.assertEqual(model.output_shape, (None, input_size))
+    model = keras.models.Model(x, y)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        np.zeros((batch, time_step, input_size)),
+        np.zeros((batch, input_size)))
+    self.assertEqual(model.output_shape, (None, input_size))
 
   def test_get_initial_state(self):
     cell = keras.layers.SimpleRNNCell(5)
@@ -792,16 +811,299 @@ class RNNTest(test.TestCase):
                                  'batch_size and dtype cannot be None'):
       cell.get_initial_state(None, None, None)
 
-    inputs = keras.Input((None, 2, 10))
-    initial_state = cell.get_initial_state(inputs, None, None)
-    self.assertEqual(initial_state.shape.as_list(), [None, 5])
-    self.assertEqual(initial_state.dtype, inputs.dtype)
+    if not context.executing_eagerly():
+      inputs = keras.Input((None, 10))
+      initial_state = cell.get_initial_state(inputs, None, None)
+      self.assertEqual(initial_state.shape.as_list(), [None, 5])
+      self.assertEqual(initial_state.dtype, inputs.dtype)
+
+      batch = array_ops.shape(inputs)[0]
+      dtype = inputs.dtype
+      initial_state = cell.get_initial_state(None, batch, dtype)
+      self.assertEqual(initial_state.shape.as_list(), [None, 5])
+      self.assertEqual(initial_state.dtype, inputs.dtype)
+    else:
+      batch = 8
+      inputs = np.random.random((batch, 10))
+      initial_state = cell.get_initial_state(inputs, None, None)
+      self.assertEqual(initial_state.shape.as_list(), [8, 5])
+      self.assertEqual(initial_state.dtype, inputs.dtype)
+
+      dtype = inputs.dtype
+      initial_state = cell.get_initial_state(None, batch, dtype)
+      self.assertEqual(initial_state.shape.as_list(), [batch, 5])
+      self.assertEqual(initial_state.dtype, inputs.dtype)
+
+  def test_nested_input_output(self):
+    batch = 10
+    t = 5
+    i1, i2, i3 = 3, 4, 5
+    o1, o2, o3 = 2, 3, 4
+
+    cell = NestedCell(o1, o2, o3)
+    rnn = keras.layers.RNN(cell)
+
+    input_1 = keras.Input((t, i1))
+    input_2 = keras.Input((t, i2, i3))
+
+    outputs = rnn((input_1, input_2))
+
+    self.assertEqual(len(outputs), 2)
+    self.assertEqual(outputs[0].shape.as_list(), [None, o1])
+    self.assertEqual(outputs[1].shape.as_list(), [None, o2, o3])
+
+    model = keras.models.Model((input_1, input_2), outputs)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))],
+        [np.zeros((batch, o1)), np.zeros((batch, o2, o3))])
+    self.assertEqual(model.output_shape, [(None, o1), (None, o2, o3)])
 
-    batch = array_ops.shape(inputs)[0]
-    dtype = inputs.dtype
-    initial_state = cell.get_initial_state(None, batch, dtype)
-    self.assertEqual(initial_state.shape.as_list(), [None, 5])
-    self.assertEqual(initial_state.dtype, inputs.dtype)
+    cell = NestedCell(o1, o2, o3, use_tuple=True)
+
+    rnn = keras.layers.RNN(cell)
+
+    input_1 = keras.Input((t, i1))
+    input_2 = keras.Input((t, i2, i3))
+
+    outputs = rnn(NestedInput(t1=input_1, t2=input_2))
+
+    self.assertEqual(len(outputs), 2)
+    self.assertEqual(outputs[0].shape.as_list(), [None, o1])
+    self.assertEqual(outputs[1].shape.as_list(), [None, o2, o3])
+
+    model = keras.models.Model([input_1, input_2], outputs)
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((batch, t, i1)),
+         np.zeros((batch, t, i2, i3))],
+        [np.zeros((batch, o1)), np.zeros((batch, o2, o3))])
+    self.assertEqual(model.output_shape, [(None, o1), (None, o2, o3)])
+
+  def test_nested_input_output_with_state(self):
+    batch = 10
+    t = 5
+    i1, i2, i3 = 3, 4, 5
+    o1, o2, o3 = 2, 3, 4
+
+    cell = NestedCell(o1, o2, o3)
+    rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True)
+
+    input_1 = keras.Input((t, i1))
+    input_2 = keras.Input((t, i2, i3))
+
+    output1, output2, s1, s2 = rnn((input_1, input_2))
+
+    self.assertEqual(output1.shape.as_list(), [None, t, o1])
+    self.assertEqual(output2.shape.as_list(), [None, t, o2, o3])
+    self.assertEqual(s1.shape.as_list(), [None, o1])
+    self.assertEqual(s2.shape.as_list(), [None, o2, o3])
+
+    model = keras.models.Model([input_1, input_2], [output1, output2])
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((batch, t, i1)),
+         np.zeros((batch, t, i2, i3))],
+        [np.zeros((batch, t, o1)),
+         np.zeros((batch, t, o2, o3))])
+    self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)])
+
+    cell = NestedCell(o1, o2, o3, use_tuple=True)
+
+    rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True)
+
+    input_1 = keras.Input((t, i1))
+    input_2 = keras.Input((t, i2, i3))
+
+    output1, output2, s1, s2 = rnn(NestedInput(t1=input_1, t2=input_2))
+
+    self.assertEqual(output1.shape.as_list(), [None, t, o1])
+    self.assertEqual(output2.shape.as_list(), [None, t, o2, o3])
+    self.assertEqual(s1.shape.as_list(), [None, o1])
+    self.assertEqual(s2.shape.as_list(), [None, o2, o3])
+
+    model = keras.models.Model([input_1, input_2], [output1, output2])
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((batch, t, i1)),
+         np.zeros((batch, t, i2, i3))],
+        [np.zeros((batch, t, o1)),
+         np.zeros((batch, t, o2, o3))])
+    self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)])
+
+  def test_nest_input_output_with_init_state(self):
+    batch = 10
+    t = 5
+    i1, i2, i3 = 3, 4, 5
+    o1, o2, o3 = 2, 3, 4
+
+    cell = NestedCell(o1, o2, o3)
+    rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True)
+
+    input_1 = keras.Input((t, i1))
+    input_2 = keras.Input((t, i2, i3))
+    init_s1 = keras.Input((o1,))
+    init_s2 = keras.Input((o2, o3))
+
+    output1, output2, s1, s2 = rnn((input_1, input_2),
+                                   initial_state=(init_s1, init_s2))
+
+    self.assertEqual(output1.shape.as_list(), [None, t, o1])
+    self.assertEqual(output2.shape.as_list(), [None, t, o2, o3])
+    self.assertEqual(s1.shape.as_list(), [None, o1])
+    self.assertEqual(s2.shape.as_list(), [None, o2, o3])
+
+    model = keras.models.Model([input_1, input_2, init_s1, init_s2],
+                               [output1, output2])
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((batch, t, i1)),
+         np.zeros((batch, t, i2, i3)),
+         np.zeros((batch, o1)),
+         np.zeros((batch, o2, o3))],
+        [np.zeros((batch, t, o1)),
+         np.zeros((batch, t, o2, o3))])
+    self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)])
+
+    cell = NestedCell(o1, o2, o3, use_tuple=True)
+
+    rnn = keras.layers.RNN(cell, return_sequences=True, return_state=True)
+
+    input_1 = keras.Input((t, i1))
+    input_2 = keras.Input((t, i2, i3))
+    init_s1 = keras.Input((o1,))
+    init_s2 = keras.Input((o2, o3))
+    init_state = NestedState(s1=init_s1, s2=init_s2)
+
+    output1, output2, s1, s2 = rnn(NestedInput(t1=input_1, t2=input_2),
+                                   initial_state=init_state)
+
+    self.assertEqual(output1.shape.as_list(), [None, t, o1])
+    self.assertEqual(output2.shape.as_list(), [None, t, o2, o3])
+    self.assertEqual(s1.shape.as_list(), [None, o1])
+    self.assertEqual(s2.shape.as_list(), [None, o2, o3])
+
+    model = keras.models.Model([input_1, input_2, init_s1, init_s2],
+                               [output1, output2])
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+    model.train_on_batch(
+        [np.zeros((batch, t, i1)),
+         np.zeros((batch, t, i2, i3)),
+         np.zeros((batch, o1)),
+         np.zeros((batch, o2, o3))],
+        [np.zeros((batch, t, o1)),
+         np.zeros((batch, t, o2, o3))])
+    self.assertEqual(model.output_shape, [(None, t, o1), (None, t, o2, o3)])
+
+  def test_peephole_lstm_cell(self):
+
+    def _run_cell(cell_fn, **kwargs):
+      with self.cached_session() as sess:
+        inputs = array_ops.one_hot([1, 2, 3, 4], 4)
+        cell = cell_fn(5, **kwargs)
+        cell.build(inputs.shape)
+        initial_state = cell.get_initial_state(
+            inputs=inputs, batch_size=4, dtype=dtypes.float32)
+        inputs, _ = cell(inputs, initial_state)
+        output = inputs
+        if not context.executing_eagerly():
+          self.evaluate(variables_lib.global_variables_initializer())
+          output = self.evaluate(output)
+        return output
+
+    random_seed.set_random_seed(12345)
+    # `recurrent_activation` kwarg is set to sigmoid as that is hardcoded into
+    # rnn_cell.LSTMCell.
+    no_peephole_output = _run_cell(
+        keras.layers.LSTMCell,
+        kernel_initializer='ones',
+        recurrent_activation='sigmoid',
+        implementation=1)
+    first_implementation_output = _run_cell(
+        keras.layers.PeepholeLSTMCell,
+        kernel_initializer='ones',
+        recurrent_activation='sigmoid',
+        implementation=1)
+    second_implementation_output = _run_cell(
+        keras.layers.PeepholeLSTMCell,
+        kernel_initializer='ones',
+        recurrent_activation='sigmoid',
+        implementation=2)
+    tf_lstm_cell_output = _run_cell(
+        rnn_cell.LSTMCell,
+        use_peepholes=True,
+        initializer=init_ops.ones_initializer)
+    self.assertNotAllClose(first_implementation_output, no_peephole_output)
+    self.assertAllClose(first_implementation_output,
+                        second_implementation_output)
+    self.assertAllClose(first_implementation_output, tf_lstm_cell_output)
+
+  def test_masking_rnn_with_output_and_states(self):
+
+    class Cell(keras.layers.Layer):
+
+      def __init__(self):
+        self.state_size = None
+        self.output_size = None
+        super(Cell, self).__init__()
+
+      def build(self, input_shape):
+        self.state_size = input_shape[-1]
+        self.output_size = input_shape[-1]
+
+      def call(self, inputs, states):
+        return inputs, [s + 1 for s in states]
+
+    x = keras.Input((3, 1), name='x')
+    x_masked = keras.layers.Masking()(x)
+    s_0 = keras.Input((1,), name='s_0')
+    y, s = keras.layers.RNN(
+        Cell(), return_state=True)(x_masked, initial_state=s_0)
+    model = keras.models.Model([x, s_0], [y, s])
+    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse')
+
+    # last time step masked
+    x_np = np.array([[[1.], [2.], [0.]]])
+    s_0_np = np.array([[10.]])
+    y_np, s_np = model.predict([x_np, s_0_np])
+
+    # 1 is added to initial state two times
+    self.assertAllClose(s_np, s_0_np + 2)
+    # Expect last output to be the same as last output before masking
+    self.assertAllClose(y_np, x_np[:, 1, :])
+
+  def test_zero_output_for_masking(self):
+
+    for unroll in [True, False]:
+      cell = keras.layers.SimpleRNNCell(5)
+      x = keras.Input((5, 5))
+      mask = keras.layers.Masking()
+      layer = keras.layers.RNN(
+          cell, return_sequences=True, zero_output_for_mask=True, unroll=unroll)
+      masked_input = mask(x)
+      y = layer(masked_input)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                    loss='mse')
+
+      np_x = np.ones((6, 5, 5))
+      result_1 = model.predict(np_x)
+
+      # set the time 4 and 5 for last record to be zero (masked).
+      np_x[5, 3:] = 0
+      result_2 = model.predict(np_x)
+
+      # expect the result_2 has same output, except the time 4,5 for last
+      # record.
+      result_1[5, 3:] = 0
+      self.assertAllClose(result_1, result_2)
 
 
 class Minimal2DRNNCell(keras.layers.Layer):
@@ -858,5 +1160,55 @@ class PlusOneRNNCell(keras.layers.Layer):
     return inputs + 1, [states[0] + 1]
 
 
+class NestedCell(keras.layers.Layer):
+
+  def __init__(self, unit_1, unit_2, unit_3, use_tuple=False, **kwargs):
+    self.unit_1 = unit_1
+    self.unit_2 = unit_2
+    self.unit_3 = unit_3
+    self.use_tuple = use_tuple
+    super(NestedCell, self).__init__(**kwargs)
+    # A nested state.
+    if use_tuple:
+      self.state_size = NestedState(
+          s1=unit_1, s2=tensor_shape.TensorShape([unit_2, unit_3]))
+    else:
+      self.state_size = (unit_1, tensor_shape.TensorShape([unit_2, unit_3]))
+    self.output_size = (unit_1, tensor_shape.TensorShape([unit_2, unit_3]))
+
+  def build(self, inputs_shape):
+    # expect input_shape to contain 2 items, [(batch, i1), (batch, i2, i3)]
+    if self.use_tuple:
+      input_1 = inputs_shape.t1[1]
+      input_2, input_3 = inputs_shape.t2[1:]
+    else:
+      input_1 = inputs_shape[0][1]
+      input_2, input_3 = inputs_shape[1][1:]
+
+    self.kernel_1 = self.add_weight(
+        shape=(input_1, self.unit_1), initializer='uniform', name='kernel_1')
+    self.kernel_2_3 = self.add_weight(
+        shape=(input_2, input_3, self.unit_2, self.unit_3),
+        initializer='uniform',
+        name='kernel_2_3')
+
+  def call(self, inputs, states):
+    # inputs should be in [(batch, input_1), (batch, input_2, input_3)]
+    # state should be in shape [(batch, unit_1), (batch, unit_2, unit_3)]
+    flatten_inputs = nest.flatten(inputs)
+    s1, s2 = states
+
+    output_1 = math_ops.matmul(flatten_inputs[0], self.kernel_1)
+    output_2_3 = special_math_ops.einsum('bij,ijkl->bkl', flatten_inputs[1],
+                                         self.kernel_2_3)
+    state_1 = s1 + output_1
+    state_2_3 = s2 + output_2_3
+
+    output = [output_1, output_2_3]
+    new_states = NestedState(s1=state_1, s2=state_2_3)
+
+    return output, new_states
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py
index 5872185ef7c30aa50e8ca5aac32cc1804369017c..548c3ec1ac760a33d6eb998e7d601c843bd87779 100644
--- a/tensorflow/python/keras/layers/serialization_test.py
+++ b/tensorflow/python/keras/layers/serialization_test.py
@@ -19,9 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import keras
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.platform import test
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class LayerSerializationTest(test.TestCase):
 
   def test_serialize_deserialize(self):
diff --git a/tensorflow/python/keras/layers/simplernn_test.py b/tensorflow/python/keras/layers/simplernn_test.py
index 2f2295a793872bb996638570408c0e0c94805510..bb3fea26926959c15e76556b836a120c02905c6f 100644
--- a/tensorflow/python/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/layers/simplernn_test.py
@@ -24,12 +24,13 @@ from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
 class SimpleRNNLayerTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_return_sequences_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
@@ -41,7 +42,6 @@ class SimpleRNNLayerTest(test.TestCase):
                 'return_sequences': True},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_dynamic_behavior_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
@@ -55,7 +55,6 @@ class SimpleRNNLayerTest(test.TestCase):
     y = np.random.random((num_samples, units))
     model.train_on_batch(x, y)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_dropout_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
@@ -68,7 +67,6 @@ class SimpleRNNLayerTest(test.TestCase):
                 'recurrent_dropout': 0.1},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_implementation_mode_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
@@ -81,6 +79,49 @@ class SimpleRNNLayerTest(test.TestCase):
                   'implementation': mode},
           input_shape=(num_samples, timesteps, embedding_dim))
 
+  def test_constraints_SimpleRNN(self):
+    embedding_dim = 4
+    layer_class = keras.layers.SimpleRNN
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
+
+  @tf_test_util.run_v1_only('b/120545219')
+  def test_with_masking_layer_SimpleRNN(self):
+    layer_class = keras.layers.SimpleRNN
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSPropOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  def test_from_config_SimpleRNN(self):
+    layer_class = keras.layers.SimpleRNN
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+
+class SimpleRNNLayerGraphOnlyTest(test.TestCase):
+
+  @tf_test_util.run_v1_only('b/120545219')
   def test_statefulness_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
@@ -99,7 +140,8 @@ class SimpleRNNLayerTest(test.TestCase):
       layer = layer_class(
           units, return_sequences=False, stateful=True, weights=None)
       model.add(layer)
-      model.compile(optimizer='sgd', loss='mse')
+      model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                    loss='mse')
       out1 = model.predict(np.ones((num_samples, timesteps)))
       self.assertEqual(out1.shape, (num_samples, units))
 
@@ -143,6 +185,7 @@ class SimpleRNNLayerTest(test.TestCase):
 
       np.testing.assert_allclose(out7, out6, atol=1e-5)
 
+  @tf_test_util.run_deprecated_v1
   def test_regularizers_SimpleRNN(self):
     embedding_dim = 4
     layer_class = keras.layers.SimpleRNN
@@ -163,47 +206,5 @@ class SimpleRNNLayerTest(test.TestCase):
       layer(x)
       self.assertEqual(len(layer.get_losses_for(x)), 1)
 
-  def test_constraints_SimpleRNN(self):
-    embedding_dim = 4
-    layer_class = keras.layers.SimpleRNN
-    with self.cached_session():
-      k_constraint = keras.constraints.max_norm(0.01)
-      r_constraint = keras.constraints.max_norm(0.01)
-      b_constraint = keras.constraints.max_norm(0.01)
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_constraint=k_constraint,
-          recurrent_constraint=r_constraint,
-          bias_constraint=b_constraint)
-      layer.build((None, None, embedding_dim))
-      self.assertEqual(layer.cell.kernel.constraint, k_constraint)
-      self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
-      self.assertEqual(layer.cell.bias.constraint, b_constraint)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_with_masking_layer_SimpleRNN(self):
-    layer_class = keras.layers.SimpleRNN
-    with self.cached_session():
-      inputs = np.random.random((2, 3, 4))
-      targets = np.abs(np.random.random((2, 3, 5)))
-      targets /= targets.sum(axis=-1, keepdims=True)
-      model = keras.models.Sequential()
-      model.add(keras.layers.Masking(input_shape=(3, 4)))
-      model.add(layer_class(units=5, return_sequences=True, unroll=False))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=RMSPropOptimizer(0.01))
-      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
-
-  def test_from_config_SimpleRNN(self):
-    layer_class = keras.layers.SimpleRNN
-    for stateful in (False, True):
-      l1 = layer_class(units=1, stateful=stateful)
-      l2 = layer_class.from_config(l1.get_config())
-      assert l1.get_config() == l2.get_config()
-
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/unified_lstm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..932b2d331dcb60c6ff3a70ec418d47424d4b8575
--- /dev/null
+++ b/tensorflow/python/keras/layers/unified_lstm_test.py
@@ -0,0 +1,918 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for UnifiedLSTM layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import time
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python import keras
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers.cudnn_recurrent import CuDNNLSTM
+from tensorflow.python.keras.layers.recurrent import UnifiedLSTM
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import gradient_descent
+
+
+# Global config for grappler setting that is used for graph mode test.
+_rewrites = rewriter_config_pb2.RewriterConfig()
+_rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
+_customer_optimizer = _rewrites.custom_optimizers.add()
+_customer_optimizer.name = 'ExperimentalImplementationSelector'
+_rewrites.min_graph_nodes = -1
+_graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
+_config = config_pb2.ConfigProto(graph_options=_graph_options)
+
+
+@test_util.run_v1_only('b/120545219')
+class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
+
+  def test_unifiedLSTM(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = UnifiedLSTM(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      outputs, runtime = layer(inputs)
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  def test_unifiedLSTM_with_cond(self):
+    # This test is to demonstrate the graph rewrite of grappler plugin under
+    # the condition that the function returns different number of internal
+    # states.
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = UnifiedLSTM(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      zeros = array_ops.zeros([batch, output_shape])
+      dummy_runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+      a = constant_op.constant(0)
+      b = constant_op.constant(1)
+      # Will always run the lstm layer.
+      outputs, runtime = control_flow_ops.cond(
+          gen_math_ops.less(a, b),
+          lambda: layer(inputs),
+          lambda: (zeros, dummy_runtime))
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  @parameterized.named_parameters(
+      ('_non_tan_activation', 'relu', 0, False, True, None),
+      ('_use_recurrent_dropout', 'tanh', 0.1, False, True, None),
+      ('_unroll', 'tanh', 0, True, True, None),
+      ('_not_use_bias', 'tanh', 0, False, False, None),
+      ('_use_bias_regularizer', 'tanh', 0, False, True, 'l2')
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_could_use_defun_backend(self, activation, recurrent_dropout,
+                                   unroll, use_bias, bias_regularizer):
+    layer = UnifiedLSTM(1,
+                        activation=activation,
+                        recurrent_dropout=recurrent_dropout,
+                        unroll=unroll,
+                        use_bias=use_bias,
+                        bias_regularizer=bias_regularizer)
+    self.assertFalse(layer.could_use_cudnn)
+
+  def test_unified_lstm_feature_parity_with_canonical_lstm(self):
+    with context.eager_mode():
+      # Run this test under eager only due to b/120160788 for model.set_weights.
+      input_shape = 10
+      rnn_state_size = 8
+      timestep = 4
+      batch = 20
+
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=rnn_state_size)
+      y_train = keras.utils.to_categorical(y_train, rnn_state_size)
+
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      lstm_layer = keras.layers.LSTM(rnn_state_size,
+                                     recurrent_activation='sigmoid')
+      output = lstm_layer(inputs)
+      lstm_model = keras.models.Model(inputs, output)
+      weights = lstm_model.get_weights()
+      y_1 = lstm_model.predict(x_train)
+      lstm_model.compile('rmsprop', 'mse')
+      lstm_model.fit(x_train, y_train)
+      y_2 = lstm_model.predict(x_train)
+
+      with test_util.device(use_gpu=True):
+        cudnn_layer = keras.layers.UnifiedLSTM(rnn_state_size,
+                                               recurrent_activation='sigmoid')
+        cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
+      cudnn_model.set_weights(weights)
+      y_3 = cudnn_model.predict(x_train)
+      cudnn_model.compile('rmsprop', 'mse')
+      cudnn_model.fit(x_train, y_train)
+      y_4 = cudnn_model.predict(x_train)
+
+      self.assertAllClose(y_1, y_3)
+      self.assertAllClose(y_2, y_4)
+
+  @parameterized.named_parameters(
+      # test_name, use_bias, bias_initializer, activation
+      ('normal', True, 'zeros'),
+      ('no_bias', False, 'zeros'),
+      ('random_bias', True, 'random_uniform'),
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_unified_lstm_model_save_load(self, use_bias, bias_initializer):
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+    h5_path = os.path.join(temp_dir, 'test.h5')
+
+    batch = 10
+    timestep = 3
+    input_dim = 5
+    units = 2
+
+    x = np.random.random((batch, timestep, input_dim))
+
+    def build_model():
+      inputs = keras.layers.Input(
+          shape=[timestep, input_dim], dtype=dtypes.float32)
+      layer = keras.layers.UnifiedLSTM(
+          units,
+          use_bias=use_bias,
+          bias_initializer=bias_initializer)
+      output = layer(inputs)
+      return keras.models.Model(inputs, output), layer
+
+    model, layer = build_model()
+    y_ref = model.predict(x)
+    model.save_weights(h5_path)
+
+    cloned_model, new_layer = build_model()
+    cloned_model.load_weights(h5_path)
+    y = cloned_model.predict(x)
+
+    self.assertAllClose(y, y_ref)
+    self.assertAllClose(layer.get_weights(), new_layer.get_weights())
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_unified_lstm_output_on_multiple_kernel(self):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+    with test_util.device(use_gpu=False):
+      # Note that CuDNN use 'sigmoid' as activation. Force the CPU
+      # implementation to use 'sigmoid' so that it will generate same output as
+      # CuDNN implementation.
+      layer = UnifiedLSTM(rnn_state_size, recurrent_activation='sigmoid')
+      output = layer(inputs)
+      cpu_model = keras.models.Model(inputs, output)
+      weights = cpu_model.get_weights()
+      y_1 = cpu_model.predict(x_train)
+
+    with test_util.device(use_gpu=True):
+      layer = UnifiedLSTM(rnn_state_size, recurrent_activation='sigmoid')
+      output = layer(inputs)
+      gpu_model = keras.models.Model(inputs, output)
+      gpu_model.set_weights(weights)
+      y_2 = gpu_model.predict(x_train)
+
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.LSTM(rnn_state_size, recurrent_activation='sigmoid')
+      output = layer(inputs)
+      canonical_model = keras.models.Model(inputs, output)
+      # Remove the extra cudnn bias since canonical lstm will not use it.
+      canonical_model.set_weights(weights[:3])
+      y_3 = canonical_model.predict(x_train)
+
+    self.assertAllClose(y_1, y_2)
+    self.assertAllClose(y_2, y_3)
+
+  @parameterized.named_parameters(
+      # test_name, time_major, go_backwards
+      ('normal', False, False),
+      ('time_major', True, False),
+      ('go_backwards', False, True),
+      ('both', True, True),
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_time_major_and_go_backward(self, time_major, go_backwards):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    def build_model(layer_cls):
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      layer = layer_cls(rnn_state_size,
+                        recurrent_activation='sigmoid',
+                        time_major=time_major,
+                        return_sequences=True,
+                        go_backwards=go_backwards)
+      if time_major:
+        converted_input = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(inputs)
+        outputs = layer(converted_input)
+        outputs = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(outputs)
+      else:
+        outputs = layer(inputs)
+      return keras.models.Model(inputs, outputs)
+
+    lstm_model = build_model(keras.layers.LSTM)
+    y_ref = lstm_model.predict(x_train)
+    weights = lstm_model.get_weights()
+
+    unified_lstm_model = build_model(keras.layers.UnifiedLSTM)
+    unified_lstm_model.set_weights(weights)
+    y = unified_lstm_model.predict(x_train)
+
+    self.assertAllClose(y, y_ref)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_keras_model_with_lstm(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 10
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=batch,
+        test_samples=0,
+        input_shape=(timestep, input_shape),
+        num_classes=output_shape)
+    y_train = keras.utils.to_categorical(y_train, output_shape)
+
+    layer = UnifiedLSTM(rnn_state_size)
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('rmsprop', loss='mse')
+    model.fit(x_train, y_train, epochs=epoch)
+    model.evaluate(x_train, y_train)
+    model.predict(x_train)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_return_sequences_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'return_sequences': True
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_static_shape_inference_LSTM(self):
+    # Github issue: 15165
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+
+    model = keras.models.Sequential()
+    inputs = keras.layers.Dense(
+        embedding_dim, input_shape=(timesteps, embedding_dim))
+    model.add(inputs)
+    layer = UnifiedLSTM(units, return_sequences=True)
+    model.add(layer)
+    outputs = model.layers[-1].output
+    self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_dynamic_behavior_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer = UnifiedLSTM(units, input_shape=(None, embedding_dim))
+    model = keras.models.Sequential()
+    model.add(layer)
+    model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
+    x = np.random.random((num_samples, timesteps, embedding_dim))
+    y = np.random.random((num_samples, units))
+    model.train_on_batch(x, y)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_dropout_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'dropout': 0.1,
+            'recurrent_dropout': 0.1
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  @parameterized.parameters([0, 1, 2])
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_implementation_mode_LSTM(self, implementation_mode):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'implementation': implementation_mode
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_constraints_LSTM(self):
+    embedding_dim = 4
+    layer_class = UnifiedLSTM
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_with_masking_layer_LSTM(self):
+    layer_class = UnifiedLSTM
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(UnifiedLSTM(10, return_sequences=True, unroll=False))
+    model.add(UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_masking_with_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(UnifiedLSTM(10, return_sequences=True, unroll=False))
+    model.add(UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_from_config_LSTM(self):
+    layer_class = UnifiedLSTM
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_specify_initial_state_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    # Test with Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    layer = UnifiedLSTM(units)
+    if len(initial_state) == 1:
+      output = layer(inputs, initial_state=initial_state[0])
+    else:
+      output = layer(inputs, initial_state=initial_state)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def DISABLED_test_specify_initial_state_non_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    # Test with non-Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [
+        keras.backend.random_normal_variable((num_samples, units), 0, 1)
+        for _ in range(num_states)
+    ]
+    layer = UnifiedLSTM(units)
+    output = layer(inputs, initial_state=initial_state)
+
+    model = keras.models.Model(inputs, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch(inputs, targets)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_reset_states_with_values(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    layer = UnifiedLSTM(units, stateful=True)
+    layer.build((num_samples, timesteps, embedding_dim))
+    layer.reset_states()
+    assert len(layer.states) == num_states
+    assert layer.states[0] is not None
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.zeros(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+    state_shapes = [keras.backend.int_shape(state) for state in layer.states]
+    values = [np.ones(shape) for shape in state_shapes]
+    if len(values) == 1:
+      values = values[0]
+    layer.reset_states(values)
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.ones(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+
+    # Test with invalid data
+    with self.assertRaises(ValueError):
+      layer.reset_states([1] * (len(layer.states) + 1))
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_specify_state_with_masking(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input((timesteps, embedding_dim))
+    _ = keras.layers.Masking()(inputs)
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    output = UnifiedLSTM(units)(inputs, initial_state=initial_state)
+
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_return_state(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = UnifiedLSTM(units, return_state=True, stateful=True)
+    outputs = layer(inputs)
+    state = outputs[1:]
+    assert len(state) == num_states
+    model = keras.models.Model(inputs, state[0])
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    state = model.predict(inputs)
+    self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_state_reuse(self):
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = UnifiedLSTM(units, return_state=True, return_sequences=True)
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = UnifiedLSTM(units)(output, initial_state=state)
+    model = keras.models.Model(inputs, output)
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    model.predict(inputs)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_initial_states_as_other_inputs(self):
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+    num_states = 2
+    layer_class = UnifiedLSTM
+
+    # Test with Keras tensor
+    main_inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    inputs = [main_inputs] + initial_state
+
+    layer = layer_class(units)
+    output = layer(inputs)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+
+    model = keras.models.Model(inputs, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    main_inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([main_inputs] + initial_state, targets)
+
+
+@test_util.run_v1_only('b/120545219')
+class LSTMLayerGraphOnlyTest(test.TestCase):
+
+  def test_statefulness_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = UnifiedLSTM
+    with self.cached_session(config=_config):
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Embedding(
+              4,
+              embedding_dim,
+              mask_zero=True,
+              input_length=timesteps,
+              batch_input_shape=(num_samples, timesteps)))
+      layer = layer_class(
+          units, return_sequences=False, stateful=True, weights=None)
+      model.add(layer)
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(0.01), loss='mse')
+      out1 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertEqual(out1.shape, (num_samples, units))
+
+      # train once so that the states change
+      model.train_on_batch(
+          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+      out2 = model.predict(np.ones((num_samples, timesteps)))
+
+      # if the state is not reset, output should be different
+      self.assertNotEqual(out1.max(), out2.max())
+
+      # check that output changes after states are reset
+      # (even though the model itself didn't change)
+      layer.reset_states()
+      out3 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out2.max(), out3.max())
+
+      # check that container-level reset_states() works
+      model.reset_states()
+      out4 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertAllClose(out3, out4, atol=1e-5)
+
+      # check that the call to `predict` updated the states
+      out5 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out4.max(), out5.max())
+
+      # Check masking
+      layer.reset_states()
+
+      left_padded_input = np.ones((num_samples, timesteps))
+      left_padded_input[0, :1] = 0
+      left_padded_input[1, :2] = 0
+      out6 = model.predict(left_padded_input)
+
+      layer.reset_states()
+
+      right_padded_input = np.ones((num_samples, timesteps))
+      right_padded_input[0, -1:] = 0
+      right_padded_input[1, -2:] = 0
+      out7 = model.predict(right_padded_input)
+
+      self.assertAllClose(out7, out6, atol=1e-5)
+
+  def test_regularizers_LSTM(self):
+    embedding_dim = 4
+    layer_class = UnifiedLSTM
+    with self.cached_session(config=_config):
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_regularizer=keras.regularizers.l1(0.01),
+          recurrent_regularizer=keras.regularizers.l1(0.01),
+          bias_regularizer='l2',
+          activity_regularizer='l1')
+      layer.build((None, None, 2))
+      self.assertEqual(len(layer.losses), 3)
+      x = keras.backend.variable(np.ones((2, 3, 2)))
+      layer(x)
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+
+class UnifiedLSTMPerformanceTest(test.Benchmark):
+
+  def _measure_performance(self, test_config, model, x_train, y_train):
+    batch = test_config['batch']
+    epoch = test_config['epoch']
+    warmup_epoch = test_config['warmup_epoch']
+
+    # warm up the model
+    model.fit(x_train, y_train, batch_size=batch, epochs=warmup_epoch)
+    start_time = time.time()
+    model.fit(x_train, y_train, batch_size=batch, epochs=epoch - warmup_epoch)
+    end_time = time.time()
+    return (end_time - start_time) / (epoch - warmup_epoch)
+
+  def _time_performance_run_cudnn_lstm(self, test_config, x_train, y_train):
+    # Get the performance number for standard Cudnn LSTM
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+
+    cudnn_lstm_layer = CuDNNLSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = cudnn_lstm_layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    sec_per_epoch = self._measure_performance(
+        test_config, model, x_train, y_train)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'CuDNN LSTM', sec_per_epoch)
+    return sec_per_epoch
+
+  def _time_performance_run_unifed_lstm_gpu(
+      self, test_config, x_train, y_train):
+    # Get performance number for Unified_LSTM with grappler swap the impl
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+
+    layer = UnifiedLSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    sec_per_epoch = self._measure_performance(
+        test_config, model, x_train, y_train)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'Unified LSTM', sec_per_epoch)
+    return sec_per_epoch
+
+  def _time_performance_run_normal_lstm(
+      self, test_config, x_train, y_train):
+    # Get performance number for standard LSTM on GPU.
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+
+    layer = keras.layers.LSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    sec_per_epoch = self._measure_performance(
+        test_config, model, x_train, y_train)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'Normal LSTM', sec_per_epoch)
+    return sec_per_epoch
+
+  def _benchmark_performance_with_standard_cudnn_impl(self):
+    if not test.is_gpu_available():
+      self.skipTest('performance test will only run on GPU')
+
+    mode = 'eager' if context.executing_eagerly() else 'graph'
+    batch = 64
+    num_batch = 10
+    test_config = {
+        'input_shape': 128,
+        'rnn_state_size': 64,
+        'output_shape': 64,
+        'timestep': 50,
+        'batch': batch,
+        'epoch': 20,
+        # The performance for warmup epoch is ignored.
+        'warmup_epoch': 1,
+    }
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=(batch * num_batch),
+        test_samples=0,
+        input_shape=(test_config['timestep'], test_config['input_shape']),
+        num_classes=test_config['output_shape'])
+    y_train = keras.utils.to_categorical(y_train, test_config['output_shape'])
+
+    cudnn_sec_per_epoch = self._time_performance_run_cudnn_lstm(
+        test_config, x_train, y_train)
+    unified_lstm_sec_per_epoch = self._time_performance_run_unifed_lstm_gpu(
+        test_config, x_train, y_train)
+    normal_lstm_sec_per_epoch = self._time_performance_run_normal_lstm(
+        test_config, x_train, y_train)
+
+    cudnn_vs_unified = cudnn_sec_per_epoch / unified_lstm_sec_per_epoch
+    unified_vs_normal = normal_lstm_sec_per_epoch / unified_lstm_sec_per_epoch
+
+    self.report_benchmark(name='keras_cudnn_lstm_' + mode,
+                          wall_time=cudnn_sec_per_epoch,
+                          iters=test_config['epoch'],
+                          extras=test_config)
+    self.report_benchmark(name='keras_unified_lstm_' + mode,
+                          wall_time=unified_lstm_sec_per_epoch,
+                          iters=test_config['epoch'],
+                          extras=test_config)
+    self.report_benchmark(name='keras_canonical_lstm_' + mode,
+                          wall_time=normal_lstm_sec_per_epoch,
+                          iters=test_config['epoch'],
+                          extras=test_config)
+
+    logging.info('Expect the performance of Unified LSTM is within 80% of '
+                 'CuDNN LSTM, got {0:.2f}%'.format(cudnn_vs_unified * 100))
+    logging.info('Expect the performance of Unified LSTM is more than 5 times'
+                 ' of normal LSTM, got {0:.2f}'.format(unified_vs_normal))
+
+  def benchmark_performance_graph(self):
+    with context.graph_mode(), session_lib.Session(config=_config):
+      self._benchmark_performance_with_standard_cudnn_impl()
+
+  def benchmark_performance_eager(self):
+    with context.eager_mode():
+      self._benchmark_performance_with_standard_cudnn_impl()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index a1933c11b067ba25de30cc54a3904cc3b6de4bea..67b154141efc036b5fa7920c8179b35f5eb38cc1 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -23,12 +23,13 @@ import copy
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -229,17 +230,12 @@ class TimeDistributed(Wrapper):
     kwargs = {}
     if generic_utils.has_arg(self.layer.call, 'training'):
       kwargs['training'] = training
-    uses_learning_phase = False  # pylint: disable=redefined-outer-name
 
     input_shape = K.int_shape(inputs)
     if input_shape[0]:
       # batch size matters, use rnn-based implementation
       def step(x, _):
-        global uses_learning_phase  # pylint: disable=global-variable-undefined
         output = self.layer.call(x, **kwargs)
-        if hasattr(output, '_uses_learning_phase'):
-          uses_learning_phase = (output._uses_learning_phase or
-                                 uses_learning_phase)
         return output, []
 
       _, outputs, _ = K.rnn(
@@ -267,8 +263,6 @@ class TimeDistributed(Wrapper):
         inner_mask_shape = self._get_shape_tuple((-1,), mask, 2)
         kwargs['mask'] = K.reshape(mask, inner_mask_shape)
       y = self.layer.call(inputs, **kwargs)
-      if hasattr(y, '_uses_learning_phase'):
-        uses_learning_phase = y._uses_learning_phase
       # Shape: (num_samples, timesteps, ...)
       output_shape = self.compute_output_shape(input_shape).as_list()
       output_shape = self._get_shape_tuple(
@@ -280,9 +274,6 @@ class TimeDistributed(Wrapper):
         self.layer.activity_regularizer is not None):
       regularization_loss = self.layer.activity_regularizer(y)
       self.add_loss(regularization_loss, inputs)
-
-    if uses_learning_phase:
-      y._uses_learning_phase = True
     return y
 
   def compute_mask(self, inputs, mask=None):
@@ -398,6 +389,10 @@ class Bidirectional(Wrapper):
       raise ValueError('Invalid merge mode. '
                        'Merge mode should be one of '
                        '{"sum", "mul", "ave", "concat", None}')
+    if getattr(layer, 'zero_output_for_mask', None) is not None:
+      # Force the zero_output_for_mask to be True if it presents.
+      layer.zero_output_for_mask = True
+
     self.forward_layer = copy.copy(layer)
     config = layer.get_config()
     config['go_backwards'] = not config['go_backwards']
@@ -517,7 +512,10 @@ class Bidirectional(Wrapper):
     if is_keras_tensor:
       # Compute the full input spec, including state
       full_input = [inputs] + additional_inputs
-      full_input_spec = self.input_spec + additional_specs
+      # The original input_spec is None since there could be a nested tensor
+      # input. Update the input_spec to match the inputs.
+      full_input_spec = [None for _ in range(len(nest.flatten(inputs)))
+                        ] + additional_specs
 
       # Perform the call with temporarily replaced input_spec
       original_input_spec = self.input_spec
@@ -587,15 +585,9 @@ class Bidirectional(Wrapper):
       output = y * y_rev
     elif self.merge_mode is None:
       output = [y, y_rev]
-
-    # Properly set learning phase
-    if (getattr(y, '_uses_learning_phase', False) or
-        getattr(y_rev, '_uses_learning_phase', False)):
-      if self.merge_mode is None:
-        for out in output:
-          out._uses_learning_phase = True
-      else:
-        output._uses_learning_phase = True
+    else:
+      raise ValueError(
+          'Unrecognized value for `merge_mode`: %s' % (self.merge_mode))
 
     if self.return_state:
       if self.merge_mode is None:
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index 965960917cc6b54cc9c81c09cb3fe5c4fdeeccc0..727f33dadc8abf113e9af76ef63e3e016de319ce 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -165,6 +165,7 @@ class TimeDistributedTest(test.TestCase):
       y = model.predict(np.random.random((10, 3, 2)))
       self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_batchnorm(self):
     with self.cached_session():
       # test that wrapped BN updates still work.
@@ -187,13 +188,14 @@ class TimeDistributedTest(test.TestCase):
       # Verify input_map has one mapping from inputs to reshaped inputs.
       self.assertEqual(len(td._input_map.keys()), 1)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_trainable(self):
     # test layers that need learning_phase to be set
     x = keras.layers.Input(shape=(3, 2))
     layer = keras.layers.TimeDistributed(keras.layers.BatchNormalization())
     _ = layer(x)
-    self.assertEquals(len(layer.updates), 2)
-    self.assertEquals(len(layer.trainable_weights), 2)
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.trainable_weights), 2)
     layer.trainable = False
     assert not layer.updates
     assert not layer.trainable_weights
@@ -201,6 +203,7 @@ class TimeDistributedTest(test.TestCase):
     assert len(layer.updates) == 2
     assert len(layer.trainable_weights) == 2
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(self):
     with self.cached_session():
       # test with unspecified shape and Embeddings with mask_zero
@@ -233,6 +236,7 @@ class TimeDistributedTest(test.TestCase):
         self.assertAllEqual(mask_outputs_val[i], ref_mask_val[i])
       self.assertIs(mask_outputs[-1], None)  # final layer
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_with_masking_layer(self):
     with self.cached_session():
       # test with Masking layer
@@ -375,6 +379,7 @@ class BidirectionalTest(test.TestCase):
       model.compile(loss='mse', optimizer='sgd')
       model.fit(x, y, epochs=1, batch_size=1)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_merged_value(self):
     rnn = keras.layers.LSTM
     samples = 2
@@ -452,16 +457,13 @@ class BidirectionalTest(test.TestCase):
       wrapped = keras.layers.Bidirectional(
           rnn(units, dropout=0.2, recurrent_dropout=0.2), merge_mode=merge_mode)
       outputs = _to_list(wrapped(inputs, training=True))
-      assert all(not getattr(x, '_uses_learning_phase') for x in outputs)
 
       inputs = keras.Input((timesteps, dim))
       wrapped = keras.layers.Bidirectional(
           rnn(units, dropout=0.2, return_state=True), merge_mode=merge_mode)
       outputs = _to_list(wrapped(inputs))
-      assert all(x._uses_learning_phase for x in outputs)
 
       model = keras.Model(inputs, outputs)
-      assert model.uses_learning_phase
       y1 = _to_list(model.predict(x))
       y2 = _to_list(model.predict(x))
       for x1, x2 in zip(y1, y2):
@@ -508,6 +510,7 @@ class BidirectionalTest(test.TestCase):
       layer.trainable = True
       assert len(layer.trainable_weights) == 6
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_updates(self):
     with self.cached_session():
       x = keras.layers.Input(shape=(3, 2))
@@ -638,6 +641,34 @@ class BidirectionalTest(test.TestCase):
       y_np_3 = model.predict([x_np, s_fw_np, s_bk_np, c_np])
       self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
+  def test_Bidirectional_with_masking(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    merge_mode = 'concat'
+    x = np.random.rand(samples, timesteps, dim)
+    # clear the first record's timestep 2, and expect the output of timestep 2
+    # is also 0s.
+    x[0, 2] = 0
+
+    with self.cached_session():
+      inputs = keras.Input((timesteps, dim))
+      masked_inputs = keras.layers.Masking()(inputs)
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, return_sequences=True),
+          merge_mode=merge_mode)
+      outputs = _to_list(wrapped(masked_inputs, training=True))
+      self.assertEqual(len(outputs), 1)
+      self.assertEqual(outputs[0].get_shape().as_list(),
+                       [None, timesteps, units * 2])
+
+      model = keras.Model(inputs, outputs)
+      y = _to_list(model.predict(x))
+      self.assertEqual(len(y), 1)
+      self.assertAllClose(y[0][0, 2], np.zeros(units * 2))
+
 
 def _to_list(ls):
   if isinstance(ls, list):
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index 9f548bfe0408d5c053c25b9ae14810d582b83e1e..4c584d0ff059ba8eabd3de06ebb06b2703400a73 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -19,16 +19,382 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
+
 import six
 
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras.utils.losses_utils import compute_weighted_loss
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Loss(object):
+  """Loss base class.
+
+  To be implemented by subclasses:
+  * `call()`: Contains the logic for loss calculation using `y_true`, `y_pred`.
+
+  Example subclass implementation:
+  ```
+  class MeanSquaredError(Loss):
+    def call(self, y_true, y_pred):
+      y_pred = ops.convert_to_tensor(y_pred)
+      y_true = math_ops.cast(y_true, y_pred.dtype)
+      return K.mean(math_ops.square(y_pred - y_true), axis=-1)
+  ```
+
+  Args:
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    self.reduction = reduction
+    self.name = name
+
+  def __call__(self, y_true, y_pred, sample_weight=None):
+    """Invokes the `Loss` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional `Tensor` whose rank is either 0, or the same rank
+        as `y_true`, or is broadcastable to `y_true`. `sample_weight` acts as a
+        coefficient for the loss. If a scalar is provided, then the loss is
+        simply scaled by the given value. If `sample_weight` is a tensor of size
+        `[batch_size]`, then the total loss for each sample of the batch is
+        rescaled by the corresponding element in the `sample_weight` vector. If
+        the shape of `sample_weight` matches the shape of `y_pred`, then the
+        loss of each measurable element of `y_pred` is scaled by the
+        corresponding value of `sample_weight`.
+
+    Returns:
+      Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+        shape as `y_true`; otherwise, it is scalar.
+
+    Raises:
+      ValueError: If the shape of `sample_weight` is invalid.
+    """
+    with ops.name_scope(self.name, format(self.__class__.__name__),
+                        (y_pred, y_true, sample_weight)):
+      losses = self.call(y_true, y_pred)
+      return compute_weighted_loss(
+          losses, sample_weight, reduction=self.reduction)
+
+  @classmethod
+  def from_config(cls, config):
+    """Instantiates a `Loss` from its config (output of `get_config()`).
+
+    Args:
+        config: Output of `get_config()`.
+
+    Returns:
+        A `Loss` instance.
+    """
+    return cls(**config)
+
+  def get_config(self):
+    return {'reduction': self.reduction, 'name': self.name}
+
+  @abc.abstractmethod
+  def call(self, y_true, y_pred):
+    """Invokes the `Loss` instance.
+
+    Args:
+      y_true: Ground truth values, with the same shape as 'y_pred'.
+      y_pred: The predicted values.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
+
+
+@tf_export('keras.losses.MeanSquaredError')
+class MeanSquaredError(Loss):
+  """Computes the mean of squares of errors between labels and predictions.
+
+  For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
+  then the mean squared error value is 3/4 (0.75).
+
+  Usage:
+
+  ```python
+  mse = tf.keras.losses.MeanSquaredError()
+  loss = mse([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.MeanSquaredError())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Invokes the `MeanSquaredError` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Mean squared error losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return mean_squared_error(y_true, y_pred)
+
+
+@tf_export('keras.losses.MeanAbsoluteError')
+class MeanAbsoluteError(Loss):
+  """Computes the mean of absolute difference between labels and predictions.
+
+  For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
+  then the mean absolute error value is 3/4 (0.75).
+
+  Usage:
+
+  ```python
+  mae = tf.keras.losses.MeanAbsoluteError()
+  loss = mae([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.MeanAbsoluteError())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Invokes the `MeanAbsoluteError` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Mean absolute error losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return mean_absolute_error(y_true, y_pred)
+
+
+@tf_export('keras.losses.MeanAbsolutePercentageError')
+class MeanAbsolutePercentageError(Loss):
+  """Computes the mean absolute percentage error between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
+  then the mean absolute percentage error value is 5e+08.
+
+  Usage:
+
+  ```python
+  mape = tf.keras.losses.MeanAbsolutePercentageError()
+  loss = mape([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 5e+08
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.MeanAbsolutePercentageError())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Invokes the `MeanAbsolutePercentageError` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Mean absolute percentage error losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return mean_absolute_percentage_error(y_true, y_pred)
+
+
+@tf_export('keras.losses.MeanSquaredLogarithmicError')
+class MeanSquaredLogarithmicError(Loss):
+  """Computes the mean squared logarithmic error between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
+  then the mean squared logarithmic error value is 0.36034.
+
+  Usage:
+
+  ```python
+  msle = tf.keras.losses.MeanSquaredLogarithmicError()
+  loss = msle([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 0.36034
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.MeanSquaredLogarithmicError())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Invokes the `MeanSquaredLogarithmicError` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Mean squared logarithmic error losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return mean_squared_logarithmic_error(y_true, y_pred)
+
+
+@tf_export('keras.losses.BinaryCrossentropy')
+class BinaryCrossentropy(Loss):
+  """Computes the binary cross entropy loss between the labels and predictions.
+
+  Usage:
+
+  ```python
+  bce = tf.keras.losses.BinaryCrossentropy()
+  loss = bce([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 12.007
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.BinaryCrossentropy())
+  ````
+
+  Args:
+    from_logits: Whether `output` is expected to be a logits tensor. By default,
+      we consider that `output` encodes a probability distribution.
+    label_smoothing: If greater than `0` then smooth the labels.
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               from_logits=False,
+               label_smoothing=0,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(BinaryCrossentropy, self).__init__(reduction=reduction, name=name)
+    self.from_logits = from_logits
+    self.label_smoothing = label_smoothing
+
+  def call(self, y_true, y_pred):
+    """Invokes the `BinaryCrossentropy` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Binary cross entropy losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+
+    if self.label_smoothing > 0:
+      y_true = y_true * (1 - self.label_smoothing) + 0.5 * self.label_smoothing
+
+    return binary_crossentropy(y_true, y_pred, from_logits=self.from_logits)
+
+
+@tf_export('keras.losses.CategoricalCrossentropy')
+class CategoricalCrossentropy(Loss):
+  """Computes categorical cross entropy loss between the `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  cce = tf.keras.losses.CategoricalCrossentropy()
+  loss = cce(
+    [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+    [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]])
+  print('Loss: ', loss.numpy())  # Loss: 0.3239
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.CategoricalCrossentropy())
+  ````
+
+  Args:
+    from_logits: Whether `output` is expected to be a logits tensor. By default,
+      we consider that `output` encodes a probability distribution.
+    label_smoothing: If greater than `0` then smooth the labels. This option is
+      currently not supported when `y_pred` is a sparse input (not one-hot).
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               from_logits=False,
+               label_smoothing=0,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(CategoricalCrossentropy, self).__init__(
+        reduction=reduction, name=name)
+    self.from_logits = from_logits
+    self.label_smoothing = label_smoothing
+
+  def call(self, y_true, y_pred):
+    """Invokes the `CategoricalCrossentropy` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Categorical cross entropy losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = ops.convert_to_tensor(y_true)
+    is_sparse = y_pred.shape != y_true.shape
+
+    if is_sparse:
+      return sparse_categorical_crossentropy(
+          y_true, y_pred, from_logits=self.from_logits)
+    else:
+      y_true = math_ops.cast(y_true, y_pred.dtype)
+      if self.label_smoothing > 0:
+        num_classes = math_ops.cast(array_ops.shape(y_true)[1], y_pred.dtype)
+        smooth_positives = 1.0 - self.label_smoothing
+        smooth_negatives = self.label_smoothing / num_classes
+        y_true = y_true * smooth_positives + smooth_negatives
+
+      return categorical_crossentropy(
+          y_true, y_pred, from_logits=self.from_logits)
+
+
 @tf_export('keras.metrics.mean_squared_error',
            'keras.metrics.mse',
            'keras.metrics.MSE',
@@ -116,20 +482,22 @@ def logcosh(y_true, y_pred):
 
 @tf_export('keras.metrics.categorical_crossentropy',
            'keras.losses.categorical_crossentropy')
-def categorical_crossentropy(y_true, y_pred):
-  return K.categorical_crossentropy(y_true, y_pred)
+def categorical_crossentropy(y_true, y_pred, from_logits=False):
+  return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
 
 
 @tf_export('keras.metrics.sparse_categorical_crossentropy',
            'keras.losses.sparse_categorical_crossentropy')
-def sparse_categorical_crossentropy(y_true, y_pred):
-  return K.sparse_categorical_crossentropy(y_true, y_pred)
+def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False):
+  return K.sparse_categorical_crossentropy(
+      y_true, y_pred, from_logits=from_logits)
 
 
 @tf_export('keras.metrics.binary_crossentropy',
            'keras.losses.binary_crossentropy')
-def binary_crossentropy(y_true, y_pred):
-  return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
+def binary_crossentropy(y_true, y_pred, from_logits=False):
+  return K.mean(
+      K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
 
 
 @tf_export('keras.metrics.kullback_leibler_divergence',
@@ -159,6 +527,40 @@ def cosine_proximity(y_true, y_pred):
   return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
 
 
+class CosineProximity(Loss):
+  """Computes the cosine distance between `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  cosine_loss = tf.losses.CosineProximity()
+  loss = cosine_loss([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: -0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.CosineProximity())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Calculates the cosine proximity loss.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Cosine distance loss.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return cosine_proximity(y_true, y_pred)
+
+
 # Aliases.
 
 mse = MSE = mean_squared_error
@@ -197,3 +599,9 @@ def get(identifier):
   else:
     raise ValueError('Could not interpret '
                      'loss function identifier:', identifier)
+
+
+LABEL_DTYPES_FOR_LOSSES = {
+    losses_impl.sparse_softmax_cross_entropy: 'int32',
+    sparse_categorical_crossentropy: 'int32'
+}
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index c7015270accc9f8244f8650d7edd78d609a47f09..d2791cdcd3bdac799c92112174f9edf2dbdf87ee 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -24,6 +24,11 @@ import shutil
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import test
 
 try:
@@ -138,5 +143,633 @@ class KerasLossesTest(test.TestCase):
         loaded_model.predict(np.random.rand(128, 2))
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class MeanSquaredErrorTest(test.TestCase):
+
+  def test_config(self):
+    mse_obj = keras.losses.MeanSquaredError(
+        reduction=losses_impl.ReductionV2.SUM, name='mse_1')
+    self.assertEqual(mse_obj.name, 'mse_1')
+    self.assertEqual(mse_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([4, 8, 12, 8, 1, 3], shape=(2, 3))
+    loss = mse_obj(y_true, y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 49.5, 3)
+
+  def test_scalar_weighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 113.85, 3)
+
+  def test_sample_weighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = mse_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 767.8 / 6, 3)
+
+  def test_timestep_weighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = mse_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 587 / 6, 3)
+
+  def test_zero_weighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_invalid_sample_weight(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1))
+    sample_weight = constant_op.constant([3, 6, 5, 0], shape=(2, 2))
+    with self.assertRaisesRegexp(
+        ValueError, r'Shapes \(2, 2\) and \(2, 3\) are incompatible'):
+      mse_obj(y_true, y_pred, sample_weight=sample_weight)
+
+  def test_no_reduction(self):
+    mse_obj = keras.losses.MeanSquaredError(
+        reduction=losses_impl.ReductionV2.NONE)
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred, sample_weight=2.3)
+    loss = self.evaluate(loss)
+    self.assertArrayNear(loss, [84.3333, 143.3666], 1e-3)
+
+  def test_sum_reduction(self):
+    mse_obj = keras.losses.MeanSquaredError(
+        reduction=losses_impl.ReductionV2.SUM)
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 227.69998, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MeanAbsoluteErrorTest(test.TestCase):
+
+  def test_config(self):
+    mae_obj = keras.losses.MeanAbsoluteError(
+        reduction=losses_impl.ReductionV2.SUM, name='mae_1')
+    self.assertEqual(mae_obj.name, 'mae_1')
+    self.assertEqual(mae_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([4, 8, 12, 8, 1, 3], shape=(2, 3))
+    loss = mae_obj(y_true, y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 5.5, 3)
+
+  def test_scalar_weighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 12.65, 3)
+
+  def test_sample_weighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = mae_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 81.4 / 6, 3)
+
+  def test_timestep_weighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = mae_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 83 / 6, 3)
+
+  def test_zero_weighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_invalid_sample_weight(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1))
+    sample_weight = constant_op.constant([3, 6, 5, 0], shape=(2, 2))
+    with self.assertRaisesRegexp(
+        ValueError, r'Shapes \(2, 2\) and \(2, 3\) are incompatible'):
+      mae_obj(y_true, y_pred, sample_weight=sample_weight)
+
+  def test_no_reduction(self):
+    mae_obj = keras.losses.MeanAbsoluteError(
+        reduction=losses_impl.ReductionV2.NONE)
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred, sample_weight=2.3)
+    loss = self.evaluate(loss)
+    self.assertArrayNear(loss, [10.7333, 14.5666], 1e-3)
+
+  def test_sum_reduction(self):
+    mae_obj = keras.losses.MeanAbsoluteError(
+        reduction=losses_impl.ReductionV2.SUM)
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 25.29999, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MeanAbsolutePercentageErrorTest(test.TestCase):
+
+  def test_config(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError(
+        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+    self.assertEqual(mape_obj.name, 'mape_1')
+    self.assertEqual(mape_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mape_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 211.8518, 3)
+
+  def test_scalar_weighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mape_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 487.259, 3)
+
+  def test_sample_weighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = mape_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 422.8888, 3)
+
+  def test_timestep_weighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = mape_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 694.4445, 3)
+
+  def test_zero_weighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mape_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MeanSquaredLogarithmicErrorTest(test.TestCase):
+
+  def test_config(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError(
+        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+    self.assertEqual(msle_obj.name, 'mape_1')
+    self.assertEqual(msle_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = msle_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 1.4370, 3)
+
+  def test_scalar_weighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = msle_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 3.3051, 3)
+
+  def test_sample_weighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = msle_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 3.7856, 3)
+
+  def test_timestep_weighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = msle_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 2.6473, 3)
+
+  def test_zero_weighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = msle_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CosineProximityTest(test.TestCase):
+
+  def test_config(self):
+    cosine_obj = keras.losses.CosineProximity(
+        reduction=losses_impl.ReductionV2.SUM, name='cosine_loss')
+    self.assertEqual(cosine_obj.name, 'cosine_loss')
+    self.assertEqual(cosine_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cosine_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), -0.18722, 3)
+
+  def test_scalar_weighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cosine_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), -0.43060, 3)
+
+  def test_sample_weighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0.15599, 3)
+
+  def test_timestep_weighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), -2.0000, 3)
+
+  def test_zero_weighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cosine_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BinaryCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    bce_obj = keras.losses.BinaryCrossentropy(
+        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+    self.assertEqual(bce_obj.name, 'bce_1')
+    self.assertEqual(bce_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
+                                  dtype=dtypes.float32)
+    bce_obj = keras.losses.BinaryCrossentropy()
+    loss = bce_obj(y_true, y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[100.0, -100.0, -100.0],
+                                   [-100.0, 100.0, -100.0],
+                                   [-100.0, -100.0, 100.0]])
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    bce_obj = keras.losses.BinaryCrossentropy()
+    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
+    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = bce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 8.0004, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([10., 10., 10., -10., 10, -10],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 5., 3)
+
+  def test_scalar_weighted(self):
+    bce_obj = keras.losses.BinaryCrossentropy()
+    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
+    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = bce_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 18.4010, 3)
+
+    # Test with logits.
+    y_true = array_ops.ones((32, 1))
+    logits = array_ops.ones((32, 1), dtype=dtypes.float32)
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 0.7205, 3)
+
+  def test_sample_weighted(self):
+    bce_obj = keras.losses.BinaryCrossentropy()
+    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
+    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float64)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = bce_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 21.4907, 3)
+
+    # Test with logits.
+    y_true = constant_op.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0]])
+    logits = constant_op.constant(
+        [[100.0, -100.0, -100.0], [-100.0, 100.0, -100.0],
+         [-100.0, -100.0, 100.0]],
+        dtype=dtypes.float64)
+    weights = constant_op.constant([3, 2, 8])
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits, sample_weight=weights)
+    self.assertAlmostEqual(self.evaluate(loss), 288.8888, 3)
+
+  def test_no_reduction(self):
+    y_true = constant_op.constant(((1, 0, 1), (1, 1, 0), (0, 1, 1)))
+    logits = constant_op.constant(((100.0, -100.0, 100.0),
+                                   (100.0, -100.0, 100.0),
+                                   (100.0, 100.0, -100.0)))
+    bce_obj = keras.losses.BinaryCrossentropy(
+        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    loss = bce_obj(y_true, logits)
+    self.assertAllClose((0., 66.6666, 66.6666), self.evaluate(loss), 3)
+
+  def test_label_smoothing(self):
+    logits = constant_op.constant([[100.0, -100.0, -100.0]])
+    y_true = constant_op.constant([[1, 0, 1]])
+    label_smoothing = 0.1
+    # Loss: max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    # Label smoothing: z' = z * (1 - L) + 0.5L
+    #                  1  = 1 - 0.5L
+    #                  0  = 0.5L
+    # Applying the above two fns to the given input:
+    # (100 - 100 * (1 - 0.5 L)  + 0 +
+    #  0   + 100 * (0.5 L)      + 0 +
+    #  0   + 100 * (1 - 0.5 L)  + 0) * (1/3)
+    #  = (100 + 50L) * 1/3
+    bce_obj = keras.losses.BinaryCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    loss = bce_obj(y_true, logits)
+    expected_value = (100.0 + 50.0 * label_smoothing) / 3.0
+    self.assertAlmostEqual(self.evaluate(loss), expected_value, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CategoricalCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+    self.assertEqual(cce_obj.name, 'bce_1')
+    self.assertEqual(cce_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
+                                  dtype=dtypes.int64)
+    y_pred = constant_op.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+                                  dtype=dtypes.float32)
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), .3239, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), .0573, 3)
+
+  def test_scalar_weighted(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .7449, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .1317, 3)
+
+  def test_sample_weighted(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    sample_weight = constant_op.constant([[1.2], [3.4], [5.6]], shape=(3, 1))
+    loss = cce_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3)
+
+  def test_no_reduction(self):
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    loss = cce_obj(y_true, logits)
+    self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
+
+  def test_label_smoothing(self):
+    logits = constant_op.constant([[100.0, -100.0, -100.0]])
+    y_true = constant_op.constant([[1, 0, 0]])
+    label_smoothing = 0.1
+    # Softmax Cross Entropy Loss: -\sum_i p_i \log q_i
+    # where for a softmax activation
+    # \log q_i = x_i - \log \sum_j \exp x_j
+    #          = x_i - x_max - \log \sum_j \exp (x_j - x_max)
+    # For our activations, [100, -100, -100]
+    # \log ( exp(0) + exp(-200) + exp(-200) ) = 0
+    # so our log softmaxes become: [0, -200, -200]
+    # Label smoothing: z' = z * (1 - L) + L/n
+    #                  1  = 1 - L + L/n
+    #                  0  = L/n
+    # Applying the above two fns to the given input:
+    # -0 * (1 - L + L/n) + 200 * L/n + 200 * L/n = 400 L/n
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    loss = cce_obj(y_true, logits)
+    expected_value = 400.0 * label_smoothing / 3.0
+    self.assertAlmostEqual(self.evaluate(loss), expected_value, 3)
+
+  def test_all_correct_unweighted_sparse(self):
+    y_true = constant_op.constant([[0], [1], [2]], dtype=dtypes.int64)
+    y_pred = constant_op.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+                                  dtype=dtypes.float32)
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted_sparse(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([0, 1, 2])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), .3239, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), .0573, 3)
+
+  def test_scalar_weighted_sparse(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[0], [1], [2]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .7449, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .1317, 3)
+
+  def test_sample_weighted_sparse(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[0], [1], [2]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    sample_weight = constant_op.constant([[1.2], [3.4], [5.6]], shape=(3, 1))
+    loss = cce_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3)
+
+  def test_no_reduction_sparse(self):
+    y_true = constant_op.constant([[0], [1], [2]])
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    loss = cce_obj(y_true, logits)
+    self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index d217244e2f9b632277b9b7b732510a6056a41f0f..331a8636d1c93ce9c8ee03a8d6c0f486617bf6dd 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -19,15 +19,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from abc import ABCMeta
-from abc import abstractmethod
-
+import abc
 import functools
 import sys
 import types
 import weakref
+from enum import Enum
+import numpy as np
 import six
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -49,8 +50,10 @@ from tensorflow.python.keras.losses import sparse_categorical_crossentropy
 from tensorflow.python.keras.losses import squared_hinge
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import to_list
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import confusion_matrix
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -58,19 +61,11 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
 from tensorflow.tools.docs import doc_controls
 
 
-def check_is_tensor_or_operation(x, name):
-  """Raises type error if the given input is not a tensor or operation."""
-  if not (isinstance(x, ops.Tensor) or isinstance(x, ops.Operation)):
-    raise TypeError('{0} must be a Tensor or Operation, given: {1}'.format(
-        name, x))
-
-
 def clone_metric(metric):
   """Returns a clone of the metric if stateful, otherwise returns it as is."""
   if isinstance(metric, Metric):
@@ -103,8 +98,6 @@ def update_state_wrapper(update_state_fn):
     update_op = update_state_fn(*args, **kwargs)
     if update_op is not None:  # update_op will be None in eager execution.
       metric_obj.add_update(update_op, inputs=True)
-      check_is_tensor_or_operation(
-          update_op, 'Metric {0}\'s update'.format(metric_obj.name))
     return update_op
 
   return tf_decorator.make_decorator(update_state_fn, decorated)
@@ -116,7 +109,7 @@ def result_wrapper(result_fn):
   Result computation is an idempotent operation that simply calculates the
   metric value using the state variables.
 
-  If metric state variables are distributed across towers/devices and
+  If metric state variables are distributed across replicas/devices and
   `result()` is requested from the context of one device - This function wraps
   `result()` in a distribution strategy `merge_call()`. With this,
   the metric state variables will be aggregated across devices.
@@ -129,10 +122,10 @@ def result_wrapper(result_fn):
     `merge_call()`.
   """
 
-  def decorated(metric_obj, *args):
+  def decorated(_, *args):
     """Decorated function with merge_call."""
-    tower_context = distribution_strategy_context.get_tower_context()
-    if tower_context is None:  # if in cross tower context already
+    replica_context = distribution_strategy_context.get_replica_context()
+    if replica_context is None:  # if in cross replica context already
       result_t = result_fn(*args)
     else:
       # TODO(psv): Test distribution of metrics using different distribution
@@ -147,10 +140,9 @@ def result_wrapper(result_fn):
         return distribution.unwrap(merge_fn)[0](*args)
 
       # Wrapping result in merge_call. merge_call is used when we want to leave
-      # tower mode and compute a value in cross tower mode.
-      result_t = tower_context.merge_call(merge_fn_wrapper, result_fn, *args)
-    check_is_tensor_or_operation(result_t,
-                                 'Metric {0}\'s result'.format(metric_obj.name))
+      # replica mode and compute a value in cross replica mode.
+      result_t = replica_context.merge_call(
+          merge_fn_wrapper, args=(result_fn,) + args)
     return result_t
 
   return tf_decorator.make_decorator(result_fn, decorated)
@@ -171,98 +163,169 @@ def weakmethod(method):
   return inner
 
 
-def safe_div(numerator, denominator):
-  """Divides two tensors element-wise, returning 0 if the denominator is <= 0.
+class _ConfusionMatrix(Enum):
+  TRUE_POSITIVES = 'tp'
+  FALSE_POSITIVES = 'fp'
+  TRUE_NEGATIVES = 'tn'
+  FALSE_NEGATIVES = 'fn'
 
-  Args:
-    numerator: A `Tensor`.
-    denominator: A `Tensor`, with dtype matching `numerator`.
 
-  Returns:
-    0 if `denominator` <= 0, else `numerator` / `denominator`
-  """
-  t = math_ops.truediv(numerator, denominator)
-  zero = array_ops.zeros_like(t, dtype=denominator.dtype)
-  condition = math_ops.greater(denominator, zero)
-  zero = math_ops.cast(zero, t.dtype)
-  return array_ops.where(condition, t, zero)
+def _assert_thresholds_range(thresholds):
+  invalid_thresholds = [t for t in thresholds if t < 0 or t > 1]
+  if any(invalid_thresholds):
+    raise ValueError('Threshold values must be in [0, 1]. Invalid values: {}'
+                     .format(invalid_thresholds))
+
 
+def _update_confusion_matrix_variables(variables_to_update,
+                                       y_true,
+                                       y_pred,
+                                       thresholds,
+                                       sample_weight=None):
+  """Returns op to update the given confusion matrix variables.
 
-def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
-  """Squeeze or expand last dimension if needed.
+  For every pair of values in y_true and y_pred:
 
-  1. Squeezes last dim of `y_pred` or `y_true` if their rank differs by 1
-  (using `confusion_matrix.remove_squeezable_dimensions`).
-  2. Squeezes or expands last dim of `sample_weight` if its rank differs by 1
-  from the new rank of `y_pred`.
-  If `sample_weight` is scalar, it is kept scalar.
+  true_positive: y_true == True and y_pred > thresholds
+  false_negatives: y_true == True and y_pred <= thresholds
+  true_negatives: y_true == False and y_pred <= thresholds
+  false_positive: y_true == False and y_pred > thresholds
 
-  This will use static shape if available. Otherwise, it will add graph
-  operations, which could result in a performance hit.
+  The results will be weighted and added together. When multiple thresholds are
+  provided, we will repeat the same for every threshold.
+
+  For estimation of these metrics over a stream of data, the function creates an
+  `update_op` operation that updates the given variables.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use weights of 0 to mask values.
 
   Args:
-    y_pred: Predicted values, a `Tensor` of arbitrary dimensions.
-    y_true: Optional label `Tensor` whose dimensions match `y_pred`.
-    sample_weight: Optional weight scalar or `Tensor` whose dimensions match
-      `y_pred`.
+    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
+      and corresponding variables to update as values.
+    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
+    y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
+      the range `[0, 1]`.
+    thresholds: A float value or a python list or tuple of float thresholds in
+      `[0, 1]`.
+    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
+      `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `y_true` dimension).
 
   Returns:
-    Tuple of `y_pred`, `y_true` and `sample_weight`. Each of them possibly has
-    the last dimension squeezed,
-    `sample_weight` could be extended by one dimension.
+    Update op.
+
+  Raises:
+    ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
+      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
+      `variables_to_update` contains invalid keys.
   """
-  if y_true is not None:
-    # squeeze last dim of `y_pred` or `y_true` if their rank differs by 1
-    y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
-        y_true, y_pred)
-
-  if sample_weight is None:
-    return y_pred, y_true, None
-
-  sample_weight = ops.convert_to_tensor(sample_weight)
-  weights_shape = sample_weight.get_shape()
-  weights_rank = weights_shape.ndims
-  if weights_rank == 0:  # If weights is scalar, do nothing.
-    return y_pred, y_true, sample_weight
-
-  y_pred_shape = y_pred.get_shape()
-  y_pred_rank = y_pred_shape.ndims
-  if (y_pred_rank is not None) and (weights_rank is not None):
-    # Use static rank.
-    if weights_rank - y_pred_rank == 1:
-      sample_weight = array_ops.squeeze(sample_weight, [-1])
-    elif y_pred_rank - weights_rank == 1:
-      sample_weight = array_ops.expand_dims(sample_weight, [-1])
-    return y_pred, y_true, sample_weight
-
-  # Use dynamic rank.
-  weights_rank_tensor = array_ops.rank(sample_weight)
-  rank_diff = weights_rank_tensor - array_ops.rank(y_pred)
-  maybe_squeeze_weights = lambda: array_ops.squeeze(sample_weight, [-1])
-
-  def _maybe_expand_weights():
-    return control_flow_ops.cond(
-        math_ops.equal(rank_diff,
-                       -1), lambda: array_ops.expand_dims(sample_weight, [-1]),
-        lambda: sample_weight)
-
-  def _maybe_adjust_weights():
-    return control_flow_ops.cond(
-        math_ops.equal(rank_diff, 1), maybe_squeeze_weights,
-        _maybe_expand_weights)
-
-  # squeeze or expand last dim of `sample_weight` if its rank differs by 1
-  # from the new rank of `y_pred`.
-  sample_weight = control_flow_ops.cond(
-      math_ops.equal(weights_rank_tensor, 0), lambda: sample_weight,
-      _maybe_adjust_weights)
-  return y_pred, y_true, sample_weight
+  if variables_to_update is None:
+    return
+  y_true = ops.convert_to_tensor(y_true)
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_pred.shape.assert_is_compatible_with(y_true.shape)
+
+  if not any(
+      key for key in variables_to_update if key in list(_ConfusionMatrix)):
+    raise ValueError(
+        'Please provide at least one valid confusion matrix '
+        'variable to update. Valid variable key options are: "{}". '
+        'Received: "{}"'.format(
+            list(_ConfusionMatrix), variables_to_update.keys()))
+
+  invalid_keys = [
+      key for key in variables_to_update if key not in list(_ConfusionMatrix)
+  ]
+  if invalid_keys:
+    raise ValueError(
+        'Invalid keys: {}. Valid variable key options are: "{}"'.format(
+            invalid_keys, list(_ConfusionMatrix)))
+
+  with ops.control_dependencies([
+      check_ops.assert_greater_equal(
+          y_pred,
+          math_ops.cast(0.0, dtype=y_pred.dtype),
+          message='predictions must be >= 0'),
+      check_ops.assert_less_equal(
+          y_pred,
+          math_ops.cast(1.0, dtype=y_pred.dtype),
+          message='predictions must be <= 1')
+  ]):
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        math_ops.cast(y_pred, dtype=dtypes.float32),
+        math_ops.cast(y_true, dtype=dtypes.bool), sample_weight)
+
+  thresholds = to_list(thresholds)
+  num_thresholds = len(thresholds)
+  num_predictions = array_ops.size(y_pred)
 
+  # Reshape predictions and labels.
+  predictions_2d = array_ops.reshape(y_pred, [1, -1])
+  labels_2d = array_ops.reshape(
+      math_ops.cast(y_true, dtype=dtypes.bool), [1, -1])
 
+  # Tile the thresholds for every prediction.
+  thresh_tiled = array_ops.tile(
+      array_ops.expand_dims(array_ops.constant(thresholds), 1),
+      array_ops.stack([1, num_predictions]))
+
+  # Tile the predictions for every threshold.
+  preds_tiled = array_ops.tile(predictions_2d, [num_thresholds, 1])
+
+  # Compare predictions and threshold.
+  pred_is_pos = math_ops.greater(preds_tiled, thresh_tiled)
+
+  # Tile labels by number of thresholds
+  label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1])
+
+  if sample_weight is not None:
+    weights = weights_broadcast_ops.broadcast_weights(
+        math_ops.cast(sample_weight, dtype=dtypes.float32), y_pred)
+    weights_tiled = array_ops.tile(
+        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
+  else:
+    weights_tiled = None
+
+  update_ops = []
+
+  def weighted_assign_add(label, pred, weights, var):
+    label_and_pred = math_ops.cast(
+        math_ops.logical_and(label, pred), dtype=dtypes.float32)
+    if weights is not None:
+      label_and_pred *= weights
+    return state_ops.assign_add(var, math_ops.reduce_sum(label_and_pred, 1))
+
+  loop_vars = {
+      _ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
+  }
+  update_tn = _ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
+  update_fp = _ConfusionMatrix.FALSE_POSITIVES in variables_to_update
+  update_fn = _ConfusionMatrix.FALSE_NEGATIVES in variables_to_update
+
+  if update_fn or update_tn:
+    pred_is_neg = math_ops.logical_not(pred_is_pos)
+    loop_vars[_ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg)
+
+  if update_fp or update_tn:
+    label_is_neg = math_ops.logical_not(label_is_pos)
+    loop_vars[_ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos)
+    if update_tn:
+      loop_vars[_ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg, pred_is_neg)
+
+  for matrix_cond, (label, pred) in loop_vars.items():
+    if matrix_cond in variables_to_update:
+      update_ops.append(
+          weighted_assign_add(label, pred, weights_tiled,
+                              variables_to_update[matrix_cond]))
+  return control_flow_ops.group(update_ops)
+
+
+@six.add_metaclass(abc.ABCMeta)
 class Metric(Layer):
   """Encapsulates metric logic and state.
 
-  Usage with eager execution:
+  Usage:
 
   ```python
   m = SomeMetric(...)
@@ -271,19 +334,6 @@ class Metric(Layer):
   print('Final result: ', m.result().numpy())
   ```
 
-  Usage with graph execution:
-
-  ```python
-  m = SomeMetric(...)
-  init_op = tf.variables_initializer(m.variables)  # Initialize variables
-  with tf.Session() as sess:
-    sess.run(init_op)
-    for input in ...:
-      update_op = m.update_state(input)
-      sess.run(update_op)
-    print('Final result: ', sess.run(m.result()))
-  ```
-
   Usage with tf.keras API:
 
   ```python
@@ -341,7 +391,6 @@ class Metric(Layer):
       return array_ops.identity(self.true_positives)
   ```
   """
-  __metaclass__ = ABCMeta
 
   def __init__(self, name=None, dtype=None):
     super(Metric, self).__init__(name=name, dtype=dtype)
@@ -380,9 +429,20 @@ class Metric(Layer):
     Returns:
       The metric value tensor.
     """
-    update_op = self.update_state(*args, **kwargs)  # pylint: disable=not-callable
+    update_op = self.update_state(*args, **kwargs)
     with ops.control_dependencies([update_op]):
-      return self.result()  # pylint: disable=not-callable
+      result_t = self.result()
+
+      # We are adding the metric object as metadata on the result tensor.
+      # This is required when we want to use a metric with `add_metric` API on
+      # a Model/Layer in graph mode. This metric instance will later be used
+      # to reset variable state after each epoch of training.
+      # Example:
+      #   model = Model()
+      #   model.add_metric(Mean()(values), name='mean')
+      if not context.executing_eagerly():
+        result_t._metric_obj = self  # pylint: disable=protected-access
+      return result_t
 
   def reset_states(self):
     """Resets all of the metric state variables.
@@ -393,7 +453,7 @@ class Metric(Layer):
     for v in self.variables:
       K.set_value(v, 0)
 
-  @abstractmethod
+  @abc.abstractmethod
   def update_state(self, *args, **kwargs):
     """Accumulates statistics for the metric.
 
@@ -414,7 +474,7 @@ class Metric(Layer):
     """
     NotImplementedError('Must be implemented in subclasses.')
 
-  @abstractmethod
+  @abc.abstractmethod
   def result(self):
     """Computes and returns the metric value tensor.
 
@@ -451,15 +511,35 @@ class Metric(Layer):
   ### End: For use by subclasses ###
 
 
+@tf_export('keras.metrics.Mean')
 class Mean(Metric):
   """Computes the (weighted) mean of the given values.
 
+  For example, if values is [1, 3, 5, 7] then the mean is 4.
+  If the weights were specified as [1, 1, 0, 0] then the mean would be 2.
+
   This metric creates two variables, `total` and `count` that are used to
   compute the average of `values`. This average is ultimately returned as `mean`
   which is an idempotent operation that simply divides `total` by `count`.
 
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Mean()
+  m.update_state([1, 3, 5, 7])
+  print('Final result: ', m.result().numpy())  # Final result: 4.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.add_metric(tf.keras.metrics.Mean(name='mean_1')(outputs))
+  model.compile('sgd', loss='mse')
+  ```
   """
 
   def __init__(self, name='mean', dtype=None):
@@ -513,13 +593,14 @@ class Mean(Metric):
       values = math_ops.multiply(values, sample_weight)
     values = math_ops.reduce_sum(values)
 
-    # Update state variables
+    # Update state variables. Count should be updated only when total is
+    # updated.
     update_total_op = state_ops.assign_add(self.total, values)
-    update_count_op = state_ops.assign_add(self.count, num_values)
-    return control_flow_ops.group(update_total_op, update_count_op)
+    with ops.control_dependencies([update_total_op]):
+      return state_ops.assign_add(self.count, num_values)
 
   def result(self):
-    return safe_div(self.total, self.count)
+    return math_ops.div_no_nan(self.total, self.count)
 
 
 class MeanMetricWrapper(Mean):
@@ -564,14 +645,62 @@ class MeanMetricWrapper(Mean):
         matches, sample_weight=sample_weight)
 
   def get_config(self):
-    config = self._fn_kwargs
+    config = {'fn': self._fn}
+    config.update(self._fn_kwargs)
     base_config = super(MeanMetricWrapper, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.metrics.Accuracy')
+class Accuracy(MeanMetricWrapper):
+  """Calculates how often predictions matches labels.
+
+  For example, if `y_true` is [1, 2, 3, 4] and `y_pred` is [0, 2, 3, 4]
+  then the accuracy is 3/4 or .75.  If the weights were specified as
+  [1, 1, 0, 0] then the accuracy would be 1/2 or .5.
+
+  This metric creates two local variables, `total` and `count` that are used to
+  compute the frequency with which `y_pred` matches `y_true`. This frequency is
+  ultimately returned as `binary accuracy`: an idempotent operation that simply
+  divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Accuracy()
+  m.update_state([1, 2, 3, 4], [0, 2, 3, 4])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Accuracy()])
+  ```
+  """
+
+  def __init__(self, name='accuracy', dtype=None):
+    super(Accuracy, self).__init__(accuracy, name, dtype=dtype)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(Accuracy, cls).from_config(config)
+
+
+@tf_export('keras.metrics.BinaryAccuracy')
 class BinaryAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
+  For example, if `y_true` is [1, 1, 0, 0] and `y_pred` is [0.98, 1, 0, 0.6]
+  then the binary accuracy is 3/4 or .75.  If the weights were specified as
+  [1, 0, 0, 1] then the binary accuracy would be 1/2 or .5.
+
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
   ultimately returned as `binary accuracy`: an idempotent operation that simply
@@ -579,6 +708,21 @@ class BinaryAccuracy(MeanMetricWrapper):
 
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.BinaryAccuracy()
+  m.update_state([1, 1, 0, 0], [0.98, 1, 0, 0.6])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.BinaryAccuracy()])
+  ```
   """
 
   def __init__(self, name='binary_accuracy', dtype=None, threshold=0.5):
@@ -593,17 +737,50 @@ class BinaryAccuracy(MeanMetricWrapper):
     super(BinaryAccuracy, self).__init__(
         binary_accuracy, name, dtype=dtype, threshold=threshold)
 
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(BinaryAccuracy, cls).from_config(config)
+
 
+@tf_export('keras.metrics.CategoricalAccuracy')
 class CategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
+  For example, if `y_true` is [[0, 0, 1], [0, 1, 0]] and `y_pred` is
+  [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
+  If the weights were specified as [0.7, 0.3] then the categorical accuracy
+  would be .3.
+
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
   ultimately returned as `categorical accuracy`: an idempotent operation that
   simply divides `total` by `count`.
 
+  `y_pred` and `y_true` should be passed in as vectors of probabilities, rather
+  than as labels. If necessary, use `tf.one_hot` to expand `y_true` as a vector.
+
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.CategoricalAccuracy()
+  m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.CategoricalAccuracy()])
+  ```
   """
 
   def __init__(self, name='categorical_accuracy', dtype=None):
@@ -616,10 +793,22 @@ class CategoricalAccuracy(MeanMetricWrapper):
     super(CategoricalAccuracy, self).__init__(
         categorical_accuracy, name, dtype=dtype)
 
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(CategoricalAccuracy, cls).from_config(config)
+
 
+@tf_export('keras.metrics.SparseCategoricalAccuracy')
 class SparseCategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches integer labels.
 
+  For example, if `y_true` is [[2], [1]] and `y_pred` is
+  [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
+  If the weights were specified as [0.7, 0.3] then the categorical accuracy
+  would be .3.
+
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
   ultimately returned as `sparse categorical accuracy`: an idempotent operation
@@ -627,12 +816,712 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
 
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SparseCategoricalAccuracy()
+  m.update_state([[2], [1]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
+  ```
   """
 
   def __init__(self, name='sparse_categorical_accuracy', dtype=None):
     super(SparseCategoricalAccuracy, self).__init__(
         sparse_categorical_accuracy, name, dtype=dtype)
 
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(SparseCategoricalAccuracy, cls).from_config(config)
+
+
+class _ConfusionMatrixConditionCount(Metric):
+  """Calculates the number of the given confusion matrix condition."""
+
+  def __init__(self,
+               confusion_matrix_cond,
+               thresholds=None,
+               name=None,
+               dtype=None):
+    """Creates a `_ConfusionMatrixConditionCount` instance.
+
+    Args:
+      confusion_matrix_cond: One of `_ConfusionMatrix` conditions.
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(_ConfusionMatrixConditionCount, self).__init__(name=name, dtype=dtype)
+    self._confusion_matrix_cond = confusion_matrix_cond
+    self.thresholds = 0.5 if thresholds is None else thresholds
+    thresholds = to_list(thresholds)
+    _assert_thresholds_range(thresholds)
+    self.accumulator = self.add_weight(
+        'accumulator',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates the given confusion matrix condition statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        self._confusion_matrix_cond: self.accumulator
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def result(self):
+    if isinstance(self.thresholds, (list, tuple)):
+      result = self.accumulator
+    else:
+      result = self.accumulator[0]
+    return ops.convert_to_tensor(result)
+
+  def reset_states(self):
+    num_thresholds = len(to_list(self.thresholds))
+    for v in self.variables:
+      K.set_value(v, np.zeros((num_thresholds,)))
+
+
+@tf_export('keras.metrics.FalsePositives')
+class FalsePositives(_ConfusionMatrixConditionCount):
+  """Calculates the number of false positives.
+
+  For example, if `y_true` is [0, 1, 0, 0] and `y_pred` is [0, 0, 1, 1]
+  then the false positives value is 2.  If the weights were specified as
+  [0, 0, 1, 0] then the false positives value would be 1.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  false positives. This metric creates one local variable, `accumulator`
+  that is used to keep track of the number of false positives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.FalsePositives()
+  m.update_state([0, 1, 0, 0], [0, 0, 1, 1])
+  print('Final result: ', m.result().numpy())  # Final result: 2
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalsePositives()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `FalsePositives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(FalsePositives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.FALSE_POSITIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+@tf_export('keras.metrics.FalseNegatives')
+class FalseNegatives(_ConfusionMatrixConditionCount):
+  """Calculates the number of false negatives.
+
+  For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [0, 1, 0, 0]
+  then the false negatives value is 2.  If the weights were specified as
+  [0, 0, 1, 0] then the false negatives value would be 1.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  false negatives. This metric creates one local variable, `accumulator`
+  that is used to keep track of the number of false negatives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.FalseNegatives()
+  m.update_state([0, 1, 1, 1], [0, 1, 0, 0])
+  print('Final result: ', m.result().numpy())  # Final result: 2
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalseNegatives()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `FalseNegatives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(FalseNegatives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.FALSE_NEGATIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+@tf_export('keras.metrics.TrueNegatives')
+class TrueNegatives(_ConfusionMatrixConditionCount):
+  """Calculates the number of true negatives.
+
+  For example, if `y_true` is [0, 1, 0, 0] and `y_pred` is [1, 1, 0, 0]
+  then the true negatives value is 2.  If the weights were specified as
+  [0, 0, 1, 0] then the true negatives value would be 1.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  true negatives. This metric creates one local variable, `accumulator`
+  that is used to keep track of the number of true negatives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.TrueNegatives()
+  m.update_state([0, 1, 0, 0], [1, 1, 0, 0])
+  print('Final result: ', m.result().numpy())  # Final result: 2
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TrueNegatives()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `TrueNegatives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(TrueNegatives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.TRUE_NEGATIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+@tf_export('keras.metrics.TruePositives')
+class TruePositives(_ConfusionMatrixConditionCount):
+  """Calculates the number of true positives.
+
+  For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [1, 0, 1, 1]
+  then the true positives value is 2.  If the weights were specified as
+  [0, 0, 1, 0] then the true positives value would be 1.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  true positives. This metric creates one local variable, `true_positives`
+  that is used to keep track of the number of true positives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.TruePositives()
+  m.update_state([0, 1, 1, 1], [1, 0, 1, 1])
+  print('Final result: ', m.result().numpy())  # Final result: 2
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TruePositives()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `TruePositives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(TruePositives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.TRUE_POSITIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+@tf_export('keras.metrics.Precision')
+class Precision(Metric):
+  """Computes the precision of the predictions with respect to the labels.
+
+  For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [1, 0, 1, 1]
+  then the precision value is 2/(2+1) ie. 0.66. If the weights were specified as
+  [0, 0, 1, 0] then the precision value would be 1.
+
+  The metric creates two local variables, `true_positives` and `false_positives`
+  that are used to compute the precision. This value is ultimately returned as
+  `precision`, an idempotent operation that simply divides `true_positives`
+  by the sum of `true_positives` and `false_positives`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Precision()
+  m.update_state([0, 1, 1, 1], [1, 0, 1, 1])
+  print('Final result: ', m.result().numpy())  # Final result: 0.66
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Precision()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `Precision` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Precision, self).__init__(name=name, dtype=dtype)
+    self.thresholds = 0.5 if thresholds is None else thresholds
+    thresholds = to_list(thresholds)
+    _assert_thresholds_range(thresholds)
+    self.tp = self.add_weight(
+        'true_positives',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+    self.fp = self.add_weight(
+        'false_positives',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates true positive and false positive statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
+        _ConfusionMatrix.FALSE_POSITIVES: self.fp
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def result(self):
+    result = math_ops.div_no_nan(self.tp, self.tp + self.fp)
+    return result if isinstance(self.thresholds, (list, tuple)) else result[0]
+
+  def reset_states(self):
+    num_thresholds = len(to_list(self.thresholds))
+    for v in self.variables:
+      K.set_value(v, np.zeros((num_thresholds,)))
+
+
+@tf_export('keras.metrics.Recall')
+class Recall(Metric):
+  """Computes the recall of the predictions with respect to the labels.
+
+  For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [1, 0, 1, 1]
+  then the recall value is 2/(2+1) ie. 0.66. If the weights were specified as
+  [0, 0, 1, 0] then the recall value would be 1.
+
+  This metric creates two local variables, `true_positives` and
+  `false_negatives`, that are used to compute the recall. This value is
+  ultimately returned as `recall`, an idempotent operation that simply divides
+  `true_positives` by the sum of `true_positives` and `false_negatives`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Recall()
+  m.update_state([0, 1, 1, 1], [1, 0, 1, 1])
+  print('Final result: ', m.result().numpy())  # Final result: 0.66
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Recall()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `Recall` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Recall, self).__init__(name=name, dtype=dtype)
+    self.thresholds = 0.5 if thresholds is None else thresholds
+    thresholds = to_list(thresholds)
+    _assert_thresholds_range(thresholds)
+    self.tp = self.add_weight(
+        'true_positives',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+    self.fn = self.add_weight(
+        'false_negatives',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates true positive and false negative statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
+        _ConfusionMatrix.FALSE_NEGATIVES: self.fn
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def result(self):
+    result = math_ops.div_no_nan(self.tp, self.tp + self.fn)
+    return result if isinstance(self.thresholds, (list, tuple)) else result[0]
+
+  def reset_states(self):
+    num_thresholds = len(to_list(self.thresholds))
+    for v in self.variables:
+      K.set_value(v, np.zeros((num_thresholds,)))
+
+
+@six.add_metaclass(abc.ABCMeta)
+class SensitivitySpecificityBase(Metric):
+  """Abstract base class for computing sensitivity and specificity.
+
+  For additional information about specificity and sensitivity, see the
+  following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
+  """
+
+  def __init__(self, value, num_thresholds=200, name=None, dtype=None):
+    super(SensitivitySpecificityBase, self).__init__(name=name, dtype=dtype)
+    if num_thresholds <= 0:
+      raise ValueError('`num_thresholds` must be > 0.')
+    self.value = value
+    self.tp = self.add_weight(
+        'true_positives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.tn = self.add_weight(
+        'true_negatives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.fp = self.add_weight(
+        'false_positives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.fn = self.add_weight(
+        'false_negatives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+
+    # Compute `num_thresholds` thresholds in [0, 1]
+    if num_thresholds == 1:
+      self.thresholds = [0.5]
+    else:
+      thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
+                    for i in range(num_thresholds - 2)]
+      self.thresholds = [0.0] + thresholds + [1.0]
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates confusion matrix statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
+        _ConfusionMatrix.TRUE_NEGATIVES: self.tn,
+        _ConfusionMatrix.FALSE_POSITIVES: self.fp,
+        _ConfusionMatrix.FALSE_NEGATIVES: self.fn,
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def reset_states(self):
+    num_thresholds = len(self.thresholds)
+    for v in self.variables:
+      K.set_value(v, np.zeros((num_thresholds,)))
+
+
+@tf_export('keras.metrics.SensitivityAtSpecificity')
+class SensitivityAtSpecificity(SensitivitySpecificityBase):
+  """Computes the sensitivity at a given specificity.
+
+  `Sensitivity` measures the proportion of actual positives that are correctly
+  identified as such (tp / (tp + fn)).
+  `Specificity` measures the proportion of actual negatives that are correctly
+  identified as such (tn / (tn + fp)).
+
+  This metric creates four local variables, `true_positives`, `true_negatives`,
+  `false_positives` and `false_negatives` that are used to compute the
+  sensitivity at the given specificity. The threshold for the given specificity
+  value is computed and used to evaluate the corresponding sensitivity.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  For additional information about specificity and sensitivity, see the
+  following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SensitivityAtSpecificity(0.4, num_thresholds=1)
+  m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9])
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.keras.metrics.SensitivityAtSpecificity()])
+  ```
+  """
+
+  def __init__(self, specificity, num_thresholds=200, name=None, dtype=None):
+    """Creates a `SensitivityAtSpecificity` instance.
+
+    Args:
+      specificity: A scalar value in range `[0, 1]`.
+      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
+        use for matching the given specificity.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    if specificity < 0 or specificity > 1:
+      raise ValueError('`specificity` must be in the range [0, 1].')
+    super(SensitivityAtSpecificity, self).__init__(
+        specificity, num_thresholds=num_thresholds, name=name, dtype=dtype)
+
+  def result(self):
+    # Calculate specificities at all the thresholds.
+    specificities = math_ops.div_no_nan(self.tn, self.tn + self.fp)
+
+    # Find the index of the threshold where the specificity is closest to the
+    # given specificity.
+    min_index = math_ops.argmin(
+        math_ops.abs(specificities - self.value), axis=0)
+    min_index = math_ops.cast(min_index, dtypes.int32)
+
+    # Compute sensitivity at that index.
+    return math_ops.div_no_nan(self.tp[min_index],
+                               self.tp[min_index] + self.fn[min_index])
+
+
+@tf_export('keras.metrics.SpecificityAtSensitivity')
+class SpecificityAtSensitivity(SensitivitySpecificityBase):
+  """Computes the specificity at a given sensitivity.
+
+  `Sensitivity` measures the proportion of actual positives that are correctly
+  identified as such (tp / (tp + fn)).
+  `Specificity` measures the proportion of actual negatives that are correctly
+  identified as such (tn / (tn + fp)).
+
+  This metric creates four local variables, `true_positives`, `true_negatives`,
+  `false_positives` and `false_negatives` that are used to compute the
+  specificity at the given sensitivity. The threshold for the given sensitivity
+  value is computed and used to evaluate the corresponding specificity.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  For additional information about specificity and sensitivity, see the
+  following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SpecificityAtSensitivity(0.8, num_thresholds=1)
+  m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.keras.metrics.SpecificityAtSensitivity()])
+  ```
+  """
+
+  def __init__(self, sensitivity, num_thresholds=200, name=None, dtype=None):
+    """Creates a `SpecificityAtSensitivity` instance.
+
+    Args:
+      sensitivity: A scalar value in range `[0, 1]`.
+      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
+        use for matching the given specificity.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    if sensitivity < 0 or sensitivity > 1:
+      raise ValueError('`sensitivity` must be in the range [0, 1].')
+    super(SpecificityAtSensitivity, self).__init__(
+        sensitivity, num_thresholds=num_thresholds, name=name, dtype=dtype)
+
+  def result(self):
+    # Calculate sensitivities at all the thresholds.
+    sensitivities = math_ops.div_no_nan(self.tp, self.tp + self.fn)
+
+    # Find the index of the threshold where the sensitivity is closest to the
+    # given specificity.
+    min_index = math_ops.argmin(
+        math_ops.abs(sensitivities - self.value), axis=0)
+    min_index = math_ops.cast(min_index, dtypes.int32)
+
+    # Compute specificity at that index.
+    return math_ops.div_no_nan(self.tn[min_index],
+                               self.tn[min_index] + self.fp[min_index])
+
+
+class CosineProximity(MeanMetricWrapper):
+  """Computes the cosine distance between the labels and predictions.
+
+  For example, if `y_true` is [0, 1, 1], and `y_pred` is [1, 0, 1], the cosine
+  proximity is -0.5.
+
+  This metric keeps the average cosine distance between `predictions` and
+  `labels` over a stream of data.
+
+  Usage:
+  ```python
+  m = tf.metrics.CosineProximity()
+  m.update_state([0, 1, 1], [1, 0, 1])
+  print('Final result: ', m.result().numpy())  # Final result: -0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.metrics.CosineProximity()])
+  ```
+  """
+
+  def __init__(self, name='cosine_proximity', dtype=None):
+    super(CosineProximity, self).__init__(cosine, name, dtype=dtype)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(CosineProximity, cls).from_config(config)
+
+
+def accuracy(y_true, y_pred):
+  y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
+  if y_true.dtype != y_pred.dtype:
+    y_pred = math_ops.cast(y_pred, y_true.dtype)
+  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+
 
 @tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred, threshold=0.5):
@@ -656,10 +1545,10 @@ def sparse_categorical_accuracy(y_true, y_pred):
     y_true = array_ops.squeeze(y_true, [-1])
   y_pred = math_ops.argmax(y_pred, axis=-1)
 
-  # If the expected labels are float, we need to cast the int returned by
-  # argmax to compare.
-  if K.dtype(y_true) == K.floatx():
-    y_pred = math_ops.cast(y_pred, K.floatx())
+  # If the predicted output and actual output types don't match, force cast them
+  # to match.
+  if K.dtype(y_pred) != K.dtype(y_true):
+    y_pred = math_ops.cast(y_pred, K.dtype(y_true))
 
   return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
 
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 5f5565d4d5a547d640217cf799a20d0050584ed6..92398acd8e6dc683e37cf759c667c4665961b356 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -19,22 +19,25 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import layers
 from tensorflow.python.keras import metrics
-from tensorflow.python.keras.engine.training import Model
+from tensorflow.python.keras.models import Sequential
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class KerasMetricsTest(test.TestCase):
@@ -47,7 +50,7 @@ class KerasMetricsTest(test.TestCase):
         output = metric(y_a, y_b)
         self.assertEqual(K.eval(output).shape, (6,))
 
-  def test_sparse_categorical_accuracy(self):
+  def test_sparse_categorical_accuracy_int(self):
     with self.cached_session():
       metric = metrics.sparse_categorical_accuracy
       y_true = K.variable(np.random.randint(0, 7, (6,)))
@@ -128,116 +131,6 @@ class KerasMetricsTest(test.TestCase):
       result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=1))
       self.assertEqual(result, 0.)
 
-  def test_stateful_metrics(self):
-    with self.cached_session():
-      np.random.seed(1334)
-
-      class BinaryTruePositives(layers.Layer):
-        """Stateful Metric to count the total true positives over all batches.
-
-        Assumes predictions and targets of shape `(samples, 1)`.
-
-        Arguments:
-            threshold: Float, lower limit on prediction value that counts as a
-                positive class prediction.
-            name: String, name for the metric.
-        """
-
-        def __init__(self, name='true_positives', **kwargs):
-          super(BinaryTruePositives, self).__init__(name=name, **kwargs)
-          self.true_positives = K.variable(value=0, dtype='int32')
-          self.stateful = True
-
-        def reset_states(self):
-          K.set_value(self.true_positives, 0)
-
-        def __call__(self, y_true, y_pred):
-          """Computes the number of true positives in a batch.
-
-          Args:
-              y_true: Tensor, batch_wise labels
-              y_pred: Tensor, batch_wise predictions
-
-          Returns:
-              The total number of true positives seen this epoch at the
-                  completion of the batch.
-          """
-          y_true = math_ops.cast(y_true, 'int32')
-          y_pred = math_ops.cast(math_ops.round(y_pred), 'int32')
-          correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32')
-          true_pos = math_ops.cast(
-              math_ops.reduce_sum(correct_preds * y_true), 'int32')
-          current_true_pos = self.true_positives * 1
-          self.add_update(
-              state_ops.assign_add(self.true_positives, true_pos),
-              inputs=[y_true, y_pred])
-          return current_true_pos + true_pos
-
-      metric_fn = BinaryTruePositives()
-      config = metrics.serialize(metric_fn)
-      metric_fn = metrics.deserialize(
-          config, custom_objects={'BinaryTruePositives': BinaryTruePositives})
-
-      # Test on simple model
-      inputs = layers.Input(shape=(2,))
-      outputs = layers.Dense(1, activation='sigmoid')(inputs)
-      model = Model(inputs, outputs)
-      model.compile(optimizer='sgd',
-                    loss='binary_crossentropy',
-                    metrics=['acc', metric_fn])
-
-      # Test fit, evaluate
-      samples = 100
-      x = np.random.random((samples, 2))
-      y = np.random.randint(2, size=(samples, 1))
-      val_samples = 10
-      val_x = np.random.random((val_samples, 2))
-      val_y = np.random.randint(2, size=(val_samples, 1))
-
-      history = model.fit(x, y,
-                          epochs=1,
-                          batch_size=10,
-                          validation_data=(val_x, val_y))
-      outs = model.evaluate(x, y, batch_size=10)
-      preds = model.predict(x)
-
-      def ref_true_pos(y_true, y_pred):
-        return np.sum(np.logical_and(y_pred > 0.5, y_true == 1))
-
-      # Test correctness (e.g. updates should have been run)
-      self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
-
-      # Test correctness of the validation metric computation
-      val_preds = model.predict(val_x)
-      val_outs = model.evaluate(val_x, val_y, batch_size=10)
-      self.assertAllClose(
-          val_outs[2], ref_true_pos(val_y, val_preds), atol=1e-5)
-      self.assertAllClose(
-          val_outs[2], history.history['val_true_positives'][-1], atol=1e-5)
-
-      # Test with generators
-      gen = [(np.array([x0]), np.array([y0])) for x0, y0 in zip(x, y)]
-      val_gen = [(np.array([x0]), np.array([y0]))
-                 for x0, y0 in zip(val_x, val_y)]
-      history = model.fit_generator(iter(gen),
-                                    epochs=1,
-                                    steps_per_epoch=samples,
-                                    validation_data=iter(val_gen),
-                                    validation_steps=val_samples)
-      outs = model.evaluate_generator(iter(gen), steps=samples)
-      preds = model.predict_generator(iter(gen), steps=samples)
-
-      # Test correctness of the metric results
-      self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
-
-      # Test correctness of the validation metric computation
-      val_preds = model.predict_generator(iter(val_gen), steps=val_samples)
-      val_outs = model.evaluate_generator(iter(val_gen), steps=val_samples)
-      self.assertAllClose(
-          val_outs[2], ref_true_pos(val_y, val_preds), atol=1e-5)
-      self.assertAllClose(
-          val_outs[2], history.history['val_true_positives'][-1], atol=1e-5)
-
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def test_mean(self):
     m = metrics.Mean(name='my_mean')
@@ -319,19 +212,19 @@ class KerasMetricsTest(test.TestCase):
       m = metrics.Mean()
       v = array_ops.placeholder(dtypes.float32)
       w = array_ops.placeholder(dtypes.float32)
-      sess.run(variables.variables_initializer(m.variables))
+      self.evaluate(variables.variables_initializer(m.variables))
 
       # check __call__()
       result_t = m(v, sample_weight=w)
       result = sess.run(result_t, feed_dict=({v: 100, w: 0.5}))
-      self.assertEqual(sess.run(m.total), 50)
-      self.assertEqual(sess.run(m.count), 0.5)
+      self.assertEqual(self.evaluate(m.total), 50)
+      self.assertEqual(self.evaluate(m.count), 0.5)
       self.assertEqual(result, 50 / 0.5)
 
       # check update_state() and result()
       result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]}))
-      self.assertAlmostEqual(sess.run(m.total), 52, 2)  # 50 + 1 + 5 * 0.2
-      self.assertAlmostEqual(sess.run(m.count), 1.7, 2)  # 0.5 + 1.2
+      self.assertAlmostEqual(self.evaluate(m.total), 52, 2)  # 50 + 1 + 5 * 0.2
+      self.assertAlmostEqual(self.evaluate(m.count), 1.7, 2)  # 0.5 + 1.2
       self.assertAlmostEqual(result, 52 / 1.7, 2)
 
   @test_util.run_in_graph_and_eager_modes
@@ -365,6 +258,28 @@ class KerasMetricsTest(test.TestCase):
     self.assertEqual(200., self.evaluate(restore_mean.result()))
     self.assertEqual(3, self.evaluate(restore_mean.count))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_accuracy(self):
+    acc_obj = metrics.Accuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([[1], [2], [3], [4]], [[1], [2], [3], [4]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check with sample_weight
+    result_t = acc_obj([[2], [1]], [[2], [0]], sample_weight=[[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.96, 2)  # 4.5/4.7
+
   @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy(self):
     acc_obj = metrics.BinaryAccuracy(name='my acc')
@@ -398,11 +313,6 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.67, 2)  # 4.5/6.7
 
-    # check incompatible shapes
-    with self.assertRaisesRegexp(ValueError,
-                                 r'Shapes \(1,\) and \(2,\) are incompatible'):
-      acc_obj.update_state([1, 1], [1])
-
   @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy_threshold(self):
     acc_obj = metrics.BinaryAccuracy(threshold=0.7)
@@ -436,47 +346,830 @@ class KerasMetricsTest(test.TestCase):
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
   @test_util.run_in_graph_and_eager_modes
-  def test_invalid_result(self):
+  def test_sparse_categorical_accuracy(self):
+    acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
 
-    class InvalidResult(metrics.Metric):
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([[2], [1]],
+                                     [[0.1, 0.1, 0.8], [0.05, 0.95, 0]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
 
-      def __init__(self, name='invalid-result', dtype=dtypes.float64):
-        super(InvalidResult, self).__init__(name=name, dtype=dtype)
+    # check with sample_weight
+    result_t = acc_obj([[2], [1]], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+                       [[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
-      def update_state(self, *args, **kwargs):
-        pass
 
-      def result(self):
-        return 1
+def _get_simple_sequential_model(compile_metrics):
+  model = Sequential()
+  model.add(
+      layers.Dense(
+          3, activation='relu', input_dim=4, kernel_initializer='ones'))
+  model.add(layers.Dense(1, activation='sigmoid', kernel_initializer='ones'))
+  model.compile(
+      loss='mae',
+      metrics=compile_metrics,
+      optimizer=RMSPropOptimizer(learning_rate=0.001))
+  return model
 
-    invalid_result_obj = InvalidResult()
-    with self.assertRaisesRegexp(
-        TypeError,
-        'Metric invalid-result\'s result must be a Tensor or Operation, given:'
-    ):
-      invalid_result_obj.result()
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_invalid_update(self):
+@test_util.run_all_in_graph_and_eager_modes
+class FalsePositivesTest(test.TestCase):
 
-    class InvalidUpdate(metrics.Metric):
+  def test_config(self):
+    fp_obj = metrics.FalsePositives(name='my_fp', thresholds=[0.4, 0.9])
+    self.assertEqual(fp_obj.name, 'my_fp')
+    self.assertEqual(len(fp_obj.variables), 1)
+    self.assertEqual(fp_obj.thresholds, [0.4, 0.9])
 
-      def __init__(self, name='invalid-update', dtype=dtypes.float64):
-        super(InvalidUpdate, self).__init__(name=name, dtype=dtype)
+  def test_unweighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
 
-      def update_state(self, *args, **kwargs):
-        return [1]
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-      def result(self):
-        pass
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose(7., result)
+
+  def test_weighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(14., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose([7., 4., 2.], result)
 
-    invalid_update_obj = InvalidUpdate()
-    with self.assertRaisesRegexp(
-        TypeError,
-        'Metric invalid-update\'s update must be a Tensor or Operation, given:'
-    ):
-      invalid_update_obj.update_state()
+  def test_weighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0),
+                     (19.0, 23.0, 29.0, 31.0), (5.0, 15.0, 10.0, 0))
 
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([125., 42., 12.], self.evaluate(result))
+
+  def test_threshold_limit(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'Threshold values must be in \[0, 1\]. Invalid values: \[-1, 2\]'):
+      metrics.FalsePositives(thresholds=[-1, 0.5, 2])
+
+  def test_reset_states(self):
+    fp_obj = metrics.FalsePositives()
+    model = _get_simple_sequential_model([fp_obj])
+    x = np.ones((100, 4))
+    y = np.zeros((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FalseNegativesTest(test.TestCase):
+
+  def test_config(self):
+    fn_obj = metrics.FalseNegatives(name='my_fn', thresholds=[0.4, 0.9])
+    self.assertEqual(fn_obj.name, 'my_fn')
+    self.assertEqual(len(fn_obj.variables), 1)
+    self.assertEqual(fn_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose(3., result)
+
+  def test_weighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(5., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose([1., 4., 6.], result)
+
+  def test_weighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((3.0,), (5.0,), (7.0,), (4.0,))
+
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([4., 16., 23.], self.evaluate(result))
+
+  def test_reset_states(self):
+    fn_obj = metrics.FalseNegatives()
+    model = _get_simple_sequential_model([fn_obj])
+    x = np.zeros((100, 4))
+    y = np.ones((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TrueNegativesTest(test.TestCase):
+
+  def test_config(self):
+    tn_obj = metrics.TrueNegatives(name='my_tn', thresholds=[0.4, 0.9])
+    self.assertEqual(tn_obj.name, 'my_tn')
+    self.assertEqual(len(tn_obj.variables), 1)
+    self.assertEqual(tn_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose(3., result)
+
+  def test_weighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(4., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose([2., 5., 7.], result)
+
+  def test_weighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((0.0, 2.0, 3.0, 5.0),)
+
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([5., 15., 23.], self.evaluate(result))
+
+  def test_reset_states(self):
+    tn_obj = metrics.TrueNegatives()
+    model = _get_simple_sequential_model([tn_obj])
+    x = np.zeros((100, 4))
+    y = np.zeros((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TruePositivesTest(test.TestCase):
+
+  def test_config(self):
+    tp_obj = metrics.TruePositives(name='my_tp', thresholds=[0.4, 0.9])
+    self.assertEqual(tp_obj.name, 'my_tp')
+    self.assertEqual(len(tp_obj.variables), 1)
+    self.assertEqual(tp_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose(7., result)
+
+  def test_weighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(12., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose([6., 3., 1.], result)
+
+  def test_weighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    result = tp_obj(y_true, y_pred, sample_weight=37.)
+    self.assertAllClose([222., 111., 37.], self.evaluate(result))
+
+  def test_reset_states(self):
+    tp_obj = metrics.TruePositives()
+    model = _get_simple_sequential_model([tp_obj])
+    x = np.ones((100, 4))
+    y = np.ones((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PrecisionTest(test.TestCase):
+
+  def test_config(self):
+    p_obj = metrics.Precision(name='my_precision', thresholds=[0.4, 0.9])
+    self.assertEqual(p_obj.name, 'my_precision')
+    self.assertLen(p_obj.variables, 2)
+    self.assertEqual([v.name for v in p_obj.variables],
+                     ['true_positives:0', 'false_positives:0'])
+    self.assertEqual(p_obj.thresholds, [0.4, 0.9])
+
+  def test_value_is_idempotent(self):
+    p_obj = metrics.Precision(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = p_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_precision = self.evaluate(p_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_precision, self.evaluate(p_obj.result()),
+                           1e-3)
+
+  def test_unweighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    p_obj = metrics.Precision(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 4.0
+    weighted_positives = (1.0 + 3.0) + (4.0 + 2.0)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertAlmostEqual(expected_precision, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(result), 1e-3)
+
+  def test_multiple_updates(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    update_op = p_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(p_obj.result()),
+                         1e-3)
+
+  def test_reset_states(self):
+    p_obj = metrics.Precision()
+    model = _get_simple_sequential_model([p_obj])
+    x = np.concatenate((np.ones((50, 4)), np.ones((50, 4))))
+    y = np.concatenate((np.ones((50, 1)), np.zeros((50, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(p_obj.tp), 50.)
+    self.assertEqual(self.evaluate(p_obj.fp), 50.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(p_obj.tp), 50.)
+    self.assertEqual(self.evaluate(p_obj.fp), 50.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RecallTest(test.TestCase):
+
+  def test_config(self):
+    r_obj = metrics.Recall(name='my_recall', thresholds=[0.4, 0.9])
+    self.assertEqual(r_obj.name, 'my_recall')
+    self.assertLen(r_obj.variables, 2)
+    self.assertEqual([v.name for v in r_obj.variables],
+                     ['true_positives:0', 'false_negatives:0'])
+    self.assertEqual(r_obj.thresholds, [0.4, 0.9])
+
+  def test_value_is_idempotent(self):
+    r_obj = metrics.Recall(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = r_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_recall = self.evaluate(r_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_recall, self.evaluate(r_obj.result()), 1e-3)
+
+  def test_unweighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    r_obj = metrics.Recall(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 1.0
+    weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
+    expected_recall = weighted_tp / weighted_t
+    self.assertAlmostEqual(expected_recall, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(result), 1e-3)
+
+  def test_multiple_updates(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    update_op = r_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(r_obj.result()),
+                         1e-3)
+
+  def test_reset_states(self):
+    r_obj = metrics.Recall()
+    model = _get_simple_sequential_model([r_obj])
+    x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4))))
+    y = np.concatenate((np.ones((50, 1)), np.ones((50, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(r_obj.tp), 50.)
+    self.assertEqual(self.evaluate(r_obj.fn), 50.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(r_obj.tp), 50.)
+    self.assertEqual(self.evaluate(r_obj.fn), 50.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SensitivityAtSpecificityTest(test.TestCase, parameterized.TestCase):
+
+  def test_config(self):
+    s_obj = metrics.SensitivityAtSpecificity(
+        0.4, num_thresholds=100, name='sensitivity_at_specificity_1')
+    self.assertEqual(s_obj.name, 'sensitivity_at_specificity_1')
+    self.assertLen(s_obj.variables, 4)
+    self.assertEqual(s_obj.value, 0.4)
+    self.assertLen(s_obj.thresholds, 100)
+
+  def test_value_is_idempotent(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.7)
+    y_pred = random_ops.random_uniform((10, 3),
+                                       maxval=1,
+                                       dtype=dtypes.float32,
+                                       seed=1)
+    y_true = random_ops.random_uniform((10, 3),
+                                       maxval=2,
+                                       dtype=dtypes.int64,
+                                       seed=1)
+    update_op = s_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_sensitivity = self.evaluate(s_obj.result())
+    for _ in range(10):
+      self.assertAlmostEqual(initial_sensitivity, self.evaluate(s_obj.result()),
+                             1e-3)
+
+  def test_unweighted_all_correct(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.7)
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
+    y_true = constant_op.constant(inputs)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+
+  def test_unweighted_high_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.8)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.8, self.evaluate(result))
+
+  def test_unweighted_low_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.6, self.evaluate(result))
+
+  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
+  def test_weighted(self, label_dtype):
+    s_obj = metrics.SensitivityAtSpecificity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = math_ops.cast(label_values, dtype=label_dtype)
+    weights = constant_op.constant(weight_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred, sample_weight=weights)
+    self.assertAlmostEqual(0.675, self.evaluate(result))
+
+  def test_invalid_specificity(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`specificity` must be in the range \[0, 1\].'):
+      metrics.SensitivityAtSpecificity(-1)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
+      metrics.SensitivityAtSpecificity(0.4, num_thresholds=-1)
+
+  def test_reset_states(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1)
+    model = _get_simple_sequential_model([s_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
+
+  def test_config(self):
+    s_obj = metrics.SpecificityAtSensitivity(
+        0.4, num_thresholds=100, name='specificity_at_sensitivity_1')
+    self.assertEqual(s_obj.name, 'specificity_at_sensitivity_1')
+    self.assertLen(s_obj.variables, 4)
+    self.assertEqual(s_obj.value, 0.4)
+    self.assertLen(s_obj.thresholds, 100)
+
+  def test_value_is_idempotent(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.7)
+    y_pred = random_ops.random_uniform((10, 3),
+                                       maxval=1,
+                                       dtype=dtypes.float32,
+                                       seed=1)
+    y_true = random_ops.random_uniform((10, 3),
+                                       maxval=2,
+                                       dtype=dtypes.int64,
+                                       seed=1)
+    update_op = s_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_specificity = self.evaluate(s_obj.result())
+    for _ in range(10):
+      self.assertAlmostEqual(initial_specificity, self.evaluate(s_obj.result()),
+                             1e-3)
+
+  def test_unweighted_all_correct(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.7)
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
+    y_true = constant_op.constant(inputs)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+
+  def test_unweighted_high_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.8)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.4, self.evaluate(result))
+
+  def test_unweighted_low_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.6, self.evaluate(result))
+
+  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
+  def test_weighted(self, label_dtype):
+    s_obj = metrics.SpecificityAtSensitivity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = math_ops.cast(label_values, dtype=label_dtype)
+    weights = constant_op.constant(weight_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred, sample_weight=weights)
+    self.assertAlmostEqual(0.4, self.evaluate(result))
+
+  def test_invalid_sensitivity(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`sensitivity` must be in the range \[0, 1\].'):
+      metrics.SpecificityAtSensitivity(-1)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
+      metrics.SpecificityAtSensitivity(0.4, num_thresholds=-1)
+
+  def test_reset_states(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1)
+    model = _get_simple_sequential_model([s_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CosineProximityTest(test.TestCase):
+
+  def test_config(self):
+    cosine_obj = metrics.CosineProximity(name='my_cos', dtype=dtypes.int32)
+    self.assertEqual(cosine_obj.name, 'my_cos')
+    self.assertEqual(cosine_obj._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    cosine_obj = metrics.CosineProximity()
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = cosine_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = cosine_obj.result()
+    self.assertAllClose(-0.60723, result, atol=1e-5)
+
+  def test_weighted(self):
+    cosine_obj = metrics.CosineProximity()
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(-0.59916, self.evaluate(result), atol=1e-5)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 3a1b00041fb01b93258f284b9c871b1da19d7278..553c7fb00969fd8c1e042b84ffff37bc82981d02 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -21,11 +21,11 @@ from __future__ import print_function
 import os
 
 import numpy as np
-import six
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -186,9 +186,10 @@ def get_nested_model_3(input_dim, num_classes):
   return keras.Model(inputs, outputs, name='nested_model_3')
 
 
+@test_util.run_all_in_graph_and_eager_modes
+@test_util.run_v1_only('b/120545219')
 class ModelSubclassingTest(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes
   def test_custom_build(self):
     class DummyModel(keras.Model):
 
@@ -209,7 +210,6 @@ class ModelSubclassingTest(test.TestCase):
     self.assertTrue(test_model.uses_custom_build, 'Model should use user '
                                                   'defined build when called.')
 
-  @test_util.run_in_graph_and_eager_modes
   def test_invalid_input_shape_build(self):
     num_classes = 2
     input_dim = 50
@@ -225,7 +225,6 @@ class ModelSubclassingTest(test.TestCase):
         ValueError, 'input shape is not one of the valid types'):
       model.build(input_shape=tensor_shape.Dimension(input_dim))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_embed_dtype_with_subclass_build(self):
     class Embedding(keras.layers.Layer):
       """An Embedding layer."""
@@ -263,7 +262,6 @@ class ModelSubclassingTest(test.TestCase):
         ValueError, 'if your layers do not support float type inputs'):
       model.build(input_shape=(35, 20))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_single_time_step_rnn_build(self):
     dim = 4
     timesteps = 1
@@ -289,7 +287,6 @@ class ModelSubclassingTest(test.TestCase):
     self.assertTrue(model.built, 'Model should be built after calling `build`.')
     model(array_ops.ones((32, timesteps, dim)))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_single_io_subclass_build(self):
     num_classes = 2
     input_dim = 50
@@ -308,7 +305,6 @@ class ModelSubclassingTest(test.TestCase):
     self.assertTrue(model.built, 'Model should be built after calling `build`.')
     model(array_ops.ones((32, input_dim)))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_single_io_dimension_subclass_build(self):
     num_classes = 2
     input_dim = tensor_shape.Dimension(50)
@@ -327,7 +323,6 @@ class ModelSubclassingTest(test.TestCase):
     self.assertTrue(model.built, 'Model should be built after calling `build`.')
     model(array_ops.ones((32, input_dim)))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_multidim_io_subclass_build(self):
     num_classes = 10
     # Input size, e.g. image
@@ -346,7 +341,6 @@ class ModelSubclassingTest(test.TestCase):
 
     model(array_ops.ones(batch_input_shape))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_tensorshape_io_subclass_build(self):
     num_classes = 10
     # Input size, e.g. image
@@ -397,7 +391,6 @@ class ModelSubclassingTest(test.TestCase):
     model.load_weights(tf_format_name)
     self.assertAllClose(weights, model.get_weights())
 
-  @test_util.run_in_graph_and_eager_modes
   def test_multi_io_subclass_build(self):
     batch_size = None
     num_samples = 1000
@@ -416,7 +409,6 @@ class ModelSubclassingTest(test.TestCase):
     x2 = array_ops.ones((num_samples, input_dim))
     model([x1, x2])
 
-  @test_util.run_in_graph_and_eager_modes
   def test_single_io_workflow_with_np_arrays(self):
     num_classes = 2
     num_samples = 100
@@ -436,7 +428,6 @@ class ModelSubclassingTest(test.TestCase):
     model.fit(x, y, epochs=2, batch_size=32, verbose=0)
     _ = model.evaluate(x, y, verbose=0)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_multi_io_workflow_with_np_arrays(self):
     num_classes = (2, 3)
     num_samples = 1000
@@ -457,45 +448,6 @@ class ModelSubclassingTest(test.TestCase):
     model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
     _ = model.evaluate([x1, x2], [y1, y2], verbose=0)
 
-  def test_single_io_workflow_with_tensors(self):
-
-    num_classes = 2
-    num_samples = 10
-    input_dim = 50
-
-    with self.cached_session():
-      model = SimpleTestModel(num_classes=num_classes,
-                              use_dp=True,
-                              use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-
-      x = array_ops.ones((num_samples, input_dim))
-      y = array_ops.zeros((num_samples, num_classes))
-
-      model.fit(x, y, epochs=2, steps_per_epoch=10, verbose=0)
-      _ = model.evaluate(steps=10, verbose=0)
-
-  def test_multi_io_workflow_with_tensors(self):
-
-    num_classes = (2, 3)
-    num_samples = 10
-    input_dim = 50
-
-    with self.cached_session():
-      model = MultiIOTestModel(num_classes=num_classes,
-                               use_dp=True,
-                               use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-
-      x1 = array_ops.ones((num_samples, input_dim))
-      x2 = array_ops.ones((num_samples, input_dim))
-      y1 = array_ops.zeros((num_samples, num_classes[0]))
-      y2 = array_ops.zeros((num_samples, num_classes[1]))
-
-      model.fit([x1, x2], [y1, y2], epochs=2, steps_per_epoch=10, verbose=0)
-      _ = model.evaluate(steps=10, verbose=0)
-
-  @test_util.run_in_graph_and_eager_modes
   def test_single_io_workflow_with_dataset_iterators(self):
     num_classes = 2
     num_samples = 10
@@ -505,41 +457,16 @@ class ModelSubclassingTest(test.TestCase):
       model = SimpleTestModel(num_classes=num_classes, use_dp=True, use_bn=True)
       model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
 
-      x = np.ones((num_samples, input_dim))
-      y = np.zeros((num_samples, num_classes))
+      x = np.ones((num_samples, input_dim), dtype=np.float32)
+      y = np.zeros((num_samples, num_classes), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
       dataset = dataset.repeat(100)
       dataset = dataset.batch(10)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
 
       model.fit(iterator, epochs=2, steps_per_epoch=10, verbose=0)
       _ = model.evaluate(iterator, steps=10, verbose=0)
 
-  def test_multi_io_workflow_with_numpy_arrays_and_custom_placeholders(self):
-
-    num_classes = (2, 3)
-    num_samples = 1000
-    input_dim = 50
-
-    with self.cached_session():
-      model = MultiIOTestModel(num_classes=num_classes,
-                               use_dp=True,
-                               use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-
-      x1 = np.ones((num_samples, input_dim))
-      x2 = np.ones((num_samples, input_dim))
-      y1 = np.zeros((num_samples, num_classes[0]))
-      y2 = np.zeros((num_samples, num_classes[1]))
-
-      x2_placeholder = array_ops.placeholder(
-          dtype='float32', shape=(None, input_dim))
-      model._set_inputs([x1, x2_placeholder])
-
-      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
-      _ = model.evaluate([x1, x2], [y1, y2], verbose=0)
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def test_attributes(self):
     # layers, weights, trainable_weights, non_trainable_weights, inputs, outputs
 
@@ -569,7 +496,6 @@ class ModelSubclassingTest(test.TestCase):
     self.assertEqual(len(model.inputs), 2)
     self.assertEqual(len(model.outputs), 2)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_updates(self):
     # test that updates get run during training
     num_samples = 100
@@ -596,74 +522,6 @@ class ModelSubclassingTest(test.TestCase):
     y_new = model.predict(x)
     self.assertGreater(np.sum(np.abs(y_ref - y_new)), 0.1)
 
-  def test_updates_and_losses_for_nested_models_in_subclassed_model(self):
-
-    # Case 1: deferred-build sequential nested in subclass.
-    class TestModel1(keras.Model):
-
-      def __init__(self):
-        super(TestModel1, self).__init__()
-        self.fc = keras.layers.Dense(10, input_shape=(784,),
-                                     activity_regularizer='l1')
-        self.bn = keras.Sequential([keras.layers.BatchNormalization(axis=1)])
-
-      def call(self, x):
-        return self.bn(self.fc(x))
-
-    with self.cached_session():
-      model = TestModel1()
-
-      x = array_ops.ones(shape=[100, 784], dtype='float32')
-      model(x)
-      self.assertEqual(len(model.get_updates_for(x)), 2)
-      self.assertEqual(len(model.get_losses_for(x)), 1)
-
-    # Case 2: placeholder-sequential nested in subclass.
-    class TestModel2(keras.Model):
-
-      def __init__(self):
-        super(TestModel2, self).__init__()
-        self.fc = keras.layers.Dense(10, input_shape=(784,),
-                                     activity_regularizer='l1')
-        self.bn = keras.Sequential(
-            [keras.layers.BatchNormalization(axis=1, input_shape=(10,))])
-
-      def call(self, x):
-        return self.bn(self.fc(x))
-
-    with self.cached_session():
-      model = TestModel2()
-
-      x = array_ops.ones(shape=[100, 784], dtype='float32')
-      model(x)
-      self.assertEqual(len(model.get_updates_for(x)), 2)
-      self.assertEqual(len(model.get_losses_for(x)), 1)
-
-    # Case 3: functional-API model nested in subclass.
-    inputs = keras.Input((10,))
-    outputs = keras.layers.BatchNormalization(axis=1)(inputs)
-    bn = keras.Model(inputs, outputs)
-
-    class TestModel3(keras.Model):
-
-      def __init__(self):
-        super(TestModel3, self).__init__()
-        self.fc = keras.layers.Dense(10, input_shape=(784,),
-                                     activity_regularizer='l1')
-        self.bn = bn
-
-      def call(self, x):
-        return self.bn(self.fc(x))
-
-    with self.cached_session():
-      model = TestModel3()
-
-      x = array_ops.ones(shape=[100, 784], dtype='float32')
-      model(x)
-      self.assertEqual(len(model.get_updates_for(x)), 2)
-      self.assertEqual(len(model.get_losses_for(x)), 1)
-
-  @test_util.run_in_graph_and_eager_modes
   def test_training_and_inference_behavior(self):
     # test that dropout is applied in training and not inference
 
@@ -691,7 +549,6 @@ class ModelSubclassingTest(test.TestCase):
     loss = model.train_on_batch(x, y)
     self.assertGreater(loss, 0.1)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_training_methods(self):
     # test fit, train_on_batch
     # on different input types: list, dict
@@ -720,7 +577,6 @@ class ModelSubclassingTest(test.TestCase):
     model.train_on_batch({'input_1': x1, 'input_2': x2},
                          {'output_1': y1, 'output_2': y2})
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def test_inference_methods(self):
     # test predict, evaluate, test_on_batch, predict_on_batch
     # on different input types: list, dict
@@ -744,14 +600,6 @@ class ModelSubclassingTest(test.TestCase):
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
     model.predict_on_batch([x1, x2])
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_trainable_mutation(self):
-    # test that you can change `trainable` on a model or layer, and that
-    # it freezes the model state during training
-    # TODO(fchollet): add test after we unify BN behavior in eager and symbolic.
-    pass
-
-  @test_util.run_in_graph_and_eager_modes
   def test_saving(self):
 
     num_classes = (2, 3)
@@ -793,7 +641,6 @@ class ModelSubclassingTest(test.TestCase):
       self.assertAllClose(y_ref_1, y1, atol=1e-5)
       self.assertAllClose(y_ref_2, y2, atol=1e-5)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_summary(self):
 
     class ToString(object):
@@ -819,7 +666,6 @@ class ModelSubclassingTest(test.TestCase):
     model.summary(print_fn=print_fn)
     self.assertTrue('Trainable params: 587' in print_fn.contents)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_subclass_nested_in_subclass(self):
     num_classes = 2
     num_samples = 100
@@ -842,7 +688,6 @@ class ModelSubclassingTest(test.TestCase):
     self.assertEqual(len(model.trainable_weights),
                      6 + len(model.test_net.trainable_weights))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_graph_nested_in_subclass(self):
     num_classes = 2
     num_samples = 100
@@ -865,7 +710,6 @@ class ModelSubclassingTest(test.TestCase):
     self.assertEqual(len(model.trainable_weights),
                      6 + len(model.test_net.trainable_weights))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_subclass_nested_in_graph(self):
     num_classes = 2
     num_samples = 100
@@ -883,11 +727,41 @@ class ModelSubclassingTest(test.TestCase):
     _ = model.evaluate(x, y, verbose=0)
 
     self.assertEqual(len(model.weights), 16)
-    self.assertEqual(
-        len(model.non_trainable_weights), 4)
+    self.assertEqual(len(model.non_trainable_weights), 4)
     self.assertEqual(len(model.trainable_weights), 12)
 
-  @test_util.run_in_graph_and_eager_modes
+  def test_subclass_nested_in_sequential(self):
+    num_classes = 2
+    num_samples = 100
+    input_dim = 50
+
+    class Inner(keras.Model):
+
+      def __init__(self):
+        super(Inner, self).__init__()
+        self.dense1 = keras.layers.Dense(32, activation='relu')
+        self.dense2 = keras.layers.Dense(num_classes, activation='relu')
+        self.bn = keras.layers.BatchNormalization()
+
+      def call(self, inputs):
+        x = self.dense1(inputs)
+        x = self.dense2(x)
+        return self.bn(x)
+
+    model = keras.Sequential([Inner()])
+    model.compile(loss='mse',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  metrics=['acc'])
+
+    x = np.ones((num_samples, input_dim))
+    y = np.zeros((num_samples, num_classes))
+    model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+    _ = model.evaluate(x, y, verbose=0)
+
+    self.assertEqual(len(model.weights), 8)
+    self.assertEqual(len(model.non_trainable_weights), 2)
+    self.assertEqual(len(model.trainable_weights), 6)
+
   def test_support_for_manual_training_arg(self):
     # In most cases, the `training` argument is left unspecified, in which
     # case it defaults to value corresponding to the Model method being used
@@ -978,6 +852,203 @@ class ModelSubclassingTest(test.TestCase):
     self.assertEqual([m.dense.kernel, m.dense.bias, m.not_trainable_var],
                      m.non_trainable_variables)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_add_weight_in_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+    class MyModelCustomBuild(keras.Model):
+
+      def build(self, input_shape):
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModelCustomBuild()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+  def test_add_update_in_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,))
+
+      def call(self, inputs):
+        # Unconditional
+        self.add_update(self.b.assign(self.b * 2))
+        # Conditional
+        self.add_update(self.c.assign(inputs[1, :]), inputs)
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+
+    if context.executing_eagerly():
+      self.assertEqual(0, len(model.updates))
+    else:
+      self.assertEqual(2, len(model.updates))
+      self.assertEqual(1, len(model.get_updates_for(None)))
+      self.assertEqual(1, len(model.get_updates_for(x)))
+
+
+@test_util.run_v1_only('b/120545219')
+class GraphSpecificModelSubclassingTests(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def test_single_io_workflow_with_tensors(self):
+    num_classes = 2
+    num_samples = 10
+    input_dim = 50
+
+    with self.cached_session():
+      model = SimpleTestModel(num_classes=num_classes,
+                              use_dp=True,
+                              use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+      x = array_ops.ones((num_samples, input_dim))
+      y = array_ops.zeros((num_samples, num_classes))
+
+      model.fit(x, y, epochs=2, steps_per_epoch=10, verbose=0)
+      _ = model.evaluate(steps=10, verbose=0)
+
+  @test_util.run_deprecated_v1
+  def test_multi_io_workflow_with_tensors(self):
+    num_classes = (2, 3)
+    num_samples = 10
+    input_dim = 50
+
+    with self.cached_session():
+      model = MultiIOTestModel(num_classes=num_classes,
+                               use_dp=True,
+                               use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+      x1 = array_ops.ones((num_samples, input_dim))
+      x2 = array_ops.ones((num_samples, input_dim))
+      y1 = array_ops.zeros((num_samples, num_classes[0]))
+      y2 = array_ops.zeros((num_samples, num_classes[1]))
+
+      model.fit([x1, x2], [y1, y2], epochs=2, steps_per_epoch=10, verbose=0)
+      _ = model.evaluate(steps=10, verbose=0)
+
+  @test_util.run_deprecated_v1
+  def test_updates_and_losses_for_nested_models_in_subclassed_model(self):
+
+    # Case 1: deferred-build sequential nested in subclass.
+    class TestModel1(keras.Model):
+
+      def __init__(self):
+        super(TestModel1, self).__init__()
+        self.fc = keras.layers.Dense(10, input_shape=(784,),
+                                     activity_regularizer='l1')
+        self.bn = keras.Sequential([keras.layers.BatchNormalization(axis=1)])
+
+      def call(self, x):
+        return self.bn(self.fc(x))
+
+    with self.cached_session():
+      model = TestModel1()
+
+      x = array_ops.ones(shape=[100, 784], dtype='float32')
+      model(x)
+      self.assertEqual(len(model.get_updates_for(x)), 2)
+      self.assertEqual(len(model.get_losses_for(x)), 1)
+
+    # Case 2: placeholder-sequential nested in subclass.
+    class TestModel2(keras.Model):
+
+      def __init__(self):
+        super(TestModel2, self).__init__()
+        self.fc = keras.layers.Dense(10, input_shape=(784,),
+                                     activity_regularizer='l1')
+        self.bn = keras.Sequential(
+            [keras.layers.BatchNormalization(axis=1, input_shape=(10,))])
+
+      def call(self, x):
+        return self.bn(self.fc(x))
+
+    with self.cached_session():
+      model = TestModel2()
+
+      x = array_ops.ones(shape=[100, 784], dtype='float32')
+      model(x)
+      self.assertEqual(len(model.get_updates_for(x)), 2)
+      self.assertEqual(len(model.get_losses_for(x)), 1)
+
+    # Case 3: functional-API model nested in subclass.
+    inputs = keras.Input((10,))
+    outputs = keras.layers.BatchNormalization(axis=1)(inputs)
+    bn = keras.Model(inputs, outputs)
+
+    class TestModel3(keras.Model):
+
+      def __init__(self):
+        super(TestModel3, self).__init__()
+        self.fc = keras.layers.Dense(10, input_shape=(784,),
+                                     activity_regularizer='l1')
+        self.bn = bn
+
+      def call(self, x):
+        return self.bn(self.fc(x))
+
+    with self.cached_session():
+      model = TestModel3()
+
+      x = array_ops.ones(shape=[100, 784], dtype='float32')
+      model(x)
+      self.assertEqual(len(model.get_updates_for(x)), 2)
+      self.assertEqual(len(model.get_losses_for(x)), 1)
+
+  @test_util.run_deprecated_v1
+  def test_multi_io_workflow_with_numpy_arrays_and_custom_placeholders(self):
+    num_classes = (2, 3)
+    num_samples = 1000
+    input_dim = 50
+
+    with self.cached_session():
+      model = MultiIOTestModel(num_classes=num_classes,
+                               use_dp=True,
+                               use_bn=True)
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+
+      x1 = np.ones((num_samples, input_dim))
+      x2 = np.ones((num_samples, input_dim))
+      y1 = np.zeros((num_samples, num_classes[0]))
+      y2 = np.zeros((num_samples, num_classes[1]))
+
+      x2_placeholder = array_ops.placeholder(
+          dtype='float32', shape=(None, input_dim))
+      model._set_inputs([x1, x2_placeholder])
+
+      model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
+      _ = model.evaluate([x1, x2], [y1, y2], verbose=0)
+
 
 class CustomCallModel(keras.Model):
 
@@ -1004,6 +1075,16 @@ class TrainingNoDefaultModel(keras.Model):
     return self.dense1(x)
 
 
+class TrainingMaskingModel(keras.Model):
+
+  def __init__(self):
+    super(TrainingMaskingModel, self).__init__()
+    self.dense1 = keras.layers.Dense(1)
+
+  def call(self, x, training=False, mask=None):
+    return self.dense1(x)
+
+
 class CustomCallSignatureTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -1033,6 +1114,19 @@ class CustomCallSignatureTests(test.TestCase):
                                     'has been properly built.'))
     self.assertTrue(model.built, 'Model should be built after calling `build`.')
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_training_and_mask_args_call_build(self):
+    input_dim = 2
+
+    model = TrainingMaskingModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build((None, input_dim))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
   @test_util.run_in_graph_and_eager_modes
   def test_custom_call_kwargs_and_build(self):
     first_input_shape = (2, 3)
@@ -1068,14 +1162,14 @@ class CustomCallSignatureTests(test.TestCase):
 
     class HasKwargs(keras.Model):
 
-      def call(self, x, y=3, **key_words):
+      def call(self, x, y=3, **kwargs):
         return x
 
     model = HasKwargs()
     arg = array_ops.ones([])
     model(arg, a=3)
     if not context.executing_eagerly():
-      six.assertCountEqual(self, [arg], model.inputs)
+      self.assertEqual(len(model.inputs), 1)
 
   @test_util.run_in_graph_and_eager_modes
   def test_args_in_signature(self):
@@ -1091,8 +1185,7 @@ class CustomCallSignatureTests(test.TestCase):
     model = HasArgs()
     x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
     model(x1, x2, x3, a=3)
-    if not context.executing_eagerly():
-      six.assertCountEqual(self, [x1, x2, x3], model.inputs)
+    self.assertEqual(len(model.inputs), 3)
 
   def test_args_and_keywords_in_signature(self):
 
@@ -1104,7 +1197,8 @@ class CustomCallSignatureTests(test.TestCase):
     with context.graph_mode():
       model = HasArgs()
       x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
-      with self.assertRaisesRegexp(TypeError, 'args and arguments with'):
+      with self.assertRaisesRegexp(
+          TypeError, 'may not accept both positional arguments and '):
         model(x1, x2, x3, a=3)
 
   def test_training_no_default(self):
@@ -1113,7 +1207,7 @@ class CustomCallSignatureTests(test.TestCase):
       model = TrainingNoDefaultModel()
       arg = array_ops.ones([1, 1])
       model(arg, True)
-      six.assertCountEqual(self, [arg], model.inputs)
+      self.assertEqual(len(model.inputs), 1)
 
   def test_training_no_default_with_positional(self):
 
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 2883c9ad744b222a8ed83230dc24ee7af10d6820..2637191bb75b357341376a703b2620243bd925bf 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -100,17 +100,19 @@ def _clone_functional_model(model, input_tensors=None):
       input_tensors = list(input_tensors)
     input_tensors = generic_utils.to_list(input_tensors)
     input_tensors_ = []
-    for i, x in enumerate(input_tensors):
-      if not K.is_keras_tensor(x):
-        name = model._input_layers[i].name
-        input_tensor = Input(tensor=x, name='input_wrapper_for_' + name)
+    for i in range(len(input_tensors)):
+      input_tensor = input_tensors[i]
+      if not K.is_keras_tensor(input_tensor):
+        original_input_layer = model._input_layers[i]
+        name = original_input_layer.name
+        input_tensor = Input(tensor=input_tensor,
+                             name='input_wrapper_for_' + name)
         input_tensors_.append(input_tensor)
         # Cache newly created input layer.
-        original_input_layer = x._keras_history[0]
         newly_created_input_layer = input_tensor._keras_history[0]
         layer_map[original_input_layer] = newly_created_input_layer
       else:
-        input_tensors_.append(x)
+        input_tensors_.append(input_tensor)
     input_tensors = input_tensors_
 
   for x, y in zip(model.inputs, input_tensors):
@@ -206,10 +208,20 @@ def _clone_sequential_model(model, input_tensors=None):
   def clone(layer):
     return layer.__class__.from_config(layer.get_config())
 
-  layers = [clone(layer) for layer in model.layers]
+  # Use model._layers to ensure that all layers are cloned. The model's layers
+  # property will exclude the initial InputLayer (if it exists) in the model,
+  # resulting in a different Sequential model structure.
   if input_tensors is None:
+    layers = [clone(layer) for layer in model._layers]
     return Sequential(layers=layers, name=model.name)
   else:
+    # If input tensors are provided, the original model's InputLayer is
+    # overwritten with a different InputLayer.
+    layers = [
+        clone(layer)
+        for layer in model._layers
+        if not isinstance(layer, InputLayer)
+    ]
     if len(generic_utils.to_list(input_tensors)) != 1:
       raise ValueError('To clone a `Sequential` model, we expect '
                        ' at most one tensor '
@@ -243,7 +255,7 @@ def clone_model(model, input_tensors=None):
   Arguments:
       model: Instance of `Model`
           (could be a functional model or a Sequential model).
-      input_tensors: optional list of input tensors
+      input_tensors: optional list of input tensors or InputLayer objects
           to build the model upon. If not provided,
           placeholders will be created.
 
@@ -297,8 +309,9 @@ def _in_place_subclassed_model_reset(model):
       attributes_cache[name] = value
       assert value in model._layers
     elif isinstance(
-        value, (list, tuple)) and name not in ('layers', '_layers',
-                                               'stateful_metric_functions'):
+        value,
+        (list, tuple)) and name not in ('layers', '_layers', 'metrics',
+                                        '_compile_stateful_metric_functions'):
       # Handle case: list/tuple of layers (also tracked by the Network API).
       if value and all(isinstance(val, Layer) for val in value):
         raise ValueError('We do not support the use of list-of-layers '
@@ -338,14 +351,11 @@ def _in_place_subclassed_model_reset(model):
           'targets',
           '_feed_targets',
           'sample_weight_modes',
-          'weighted_metrics',
-          'metrics_names',
-          'metrics_tensors',
-          'metrics_updates',
-          'stateful_metric_names',
           'total_loss',
           'sample_weights',
           '_feed_sample_weights',
+          '_fit_function',
+          '_eval_function',
           'train_function',
           'test_function',
           'predict_function',
@@ -407,6 +417,9 @@ def clone_and_build_model(
   This function can be be run in the same graph or in a separate graph from the
   model. When using a separate graph, `in_place_reset` must be `False`.
 
+  Note that, currently, the clone produced from this function may not work with
+  TPU DistributionStrategy. Try at your own risk.
+
   Args:
     model: `tf.keras.Model` object. Can be Functional, Sequential, or
       sub-classed.
@@ -431,15 +444,30 @@ def clone_and_build_model(
     Clone of the model.
 
   Raises:
-    ValueError: if trying to clone a subclassed model, and `in_place_reset` is
-      set to False.
+    ValueError: Cloning fails in the following cases
+      - cloning a subclassed model with `in_place_reset` set to False.
+      - compiling the clone when the original model has not been compiled.
   """
-  if model._is_graph_network:
+  if compile_clone and not model.optimizer:
+    raise ValueError(
+        'Error when cloning model: compile_clone was set to True, but the '
+        'original model has not been compiled.')
+
+  if model._is_graph_network or isinstance(model, Sequential):
     if custom_objects:
       with CustomObjectScope(custom_objects):
         clone = clone_model(model, input_tensors=input_tensors)
     else:
       clone = clone_model(model, input_tensors=input_tensors)
+
+    if all([isinstance(clone, Sequential),
+            not clone._is_graph_network,
+            getattr(model, '_build_input_shape', None) is not None]):
+      # Set model inputs to build the model and add input/output properties.
+      # TODO(kathywu): Add multiple placeholders to handle edge case where
+      # sequential model has multiple inputs.
+      clone._set_inputs(
+          K.placeholder(model._build_input_shape, dtype=model.inputs[0].dtype))
   else:
     if not in_place_reset:
       raise ValueError(
@@ -456,11 +484,7 @@ def clone_and_build_model(
         input_tensors = input_tensors[0]
       clone._set_inputs(input_tensors)
 
-  # Compile/Build model
-  if not compile_clone:
-    if isinstance(clone, Sequential):
-      clone.build()
-  elif model.optimizer:
+  if compile_clone and model.optimizer:
     if isinstance(model.optimizer, optimizers.TFOptimizer):
       optimizer = optimizers.TFOptimizer(
           model.optimizer.optimizer, optimizer_iterations)
@@ -474,10 +498,11 @@ def clone_and_build_model(
     clone.compile(
         optimizer,
         model.loss,
-        metrics=metrics_module.clone_metrics(model.metrics),
+        metrics=metrics_module.clone_metrics(model._compile_metrics),
         loss_weights=model.loss_weights,
         sample_weight_mode=model.sample_weight_mode,
-        weighted_metrics=metrics_module.clone_metrics(model.weighted_metrics),
+        weighted_metrics=metrics_module.clone_metrics(
+            model._compile_weighted_metrics),
         target_tensors=target_tensors)
 
   return clone
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index c550caeb80a2471d9b35e3a28213e2068890c63c..c466d94fed8f34e0ca9e25425f88d6028c806131 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -26,10 +26,12 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import metrics
 from tensorflow.python.keras import models
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
@@ -50,18 +52,30 @@ class TestModel(keras.Model):
     return self.layer1(x)
 
 
+def sequential_model(add_input_layer, include_input_shape=True):
+  model = keras.models.Sequential()
+  if add_input_layer:
+    model.add(keras.layers.InputLayer(input_shape=(4,)))
+    model.add(keras.layers.Dense(4))
+  elif include_input_shape:
+    model.add(keras.layers.Dense(4, input_shape=(4,)))
+  else:
+    model.add(keras.layers.Dense(4))
+  model.add(keras.layers.BatchNormalization())
+  model.add(keras.layers.Dropout(0.5))
+  model.add(keras.layers.Dense(4))
+  return model
+
+
 class TestModelCloning(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def test_clone_sequential_model(self):
     with self.cached_session():
       val_a = np.random.random((10, 4))
       val_out = np.random.random((10, 4))
 
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(4, input_shape=(4,)))
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(4))
+      model = sequential_model(False)
 
     # Everything should work in a new session.
     keras.backend.clear_session()
@@ -70,26 +84,64 @@ class TestModelCloning(test.TestCase):
       # With placeholder creation
       new_model = keras.models.clone_model(model)
       # update ops from batch norm needs to be included
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(val_a, val_out)
 
       # On top of new tensor
       input_a = keras.Input(shape=(4,))
-      new_model = keras.models.clone_model(
-          model, input_tensors=input_a)
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      new_model = keras.models.clone_model(model, input_tensors=input_a)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(val_a, val_out)
 
       # On top of new, non-Keras tensor
       input_a = keras.backend.variable(val_a)
-      new_model = keras.models.clone_model(
-          model, input_tensors=input_a)
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      new_model = keras.models.clone_model(model, input_tensors=input_a)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(None, val_out)
 
+  @test_util.run_v1_only('b/120545219')
+  def test_clone_sequential_model_input_layer(self):
+
+    def test_input_layer(include_inputs):
+      with self.cached_session():
+        val_a = np.random.random((10, 4))
+        model = sequential_model(include_inputs, include_inputs)
+        # Sanity check
+        self.assertEqual(
+            isinstance(model._layers[0], keras.layers.InputLayer),
+            include_inputs)
+        self.assertEqual(model._is_graph_network, include_inputs)
+
+      keras.backend.clear_session()
+      with self.cached_session():
+        # With placeholder creation -- clone model should have an InputLayer
+        # if the original model has one.
+        new_model = keras.models.clone_model(model)
+        self.assertEqual(
+            isinstance(new_model._layers[0], keras.layers.InputLayer),
+            include_inputs)
+        self.assertEqual(new_model._is_graph_network, model._is_graph_network)
+
+        # On top of new tensor  -- clone model should always have an InputLayer.
+        input_a = keras.Input(shape=(4,))
+        new_model = keras.models.clone_model(model, input_tensors=input_a)
+        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+        self.assertTrue(new_model._is_graph_network)
+
+        # On top of new, non-Keras tensor  -- clone model should always have an
+        # InputLayer.
+        input_a = keras.backend.variable(val_a)
+        new_model = keras.models.clone_model(model, input_tensors=input_a)
+        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+        self.assertTrue(new_model._is_graph_network)
+
+    test_input_layer(True)
+    test_input_layer(False)
+
+  @test_util.run_v1_only('b/120545219')
   def test_clone_functional_model(self):
     with self.cached_session():
       val_a = np.random.random((10, 4))
@@ -115,7 +167,7 @@ class TestModelCloning(test.TestCase):
     with self.cached_session():
       # With placeholder creation
       new_model = keras.models.clone_model(model)
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch([val_a, val_b], val_out)
 
@@ -124,7 +176,7 @@ class TestModelCloning(test.TestCase):
       input_b = keras.Input(shape=(4,), name='b')
       new_model = keras.models.clone_model(
           model, input_tensors=[input_a, input_b])
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch([val_a, val_b], val_out)
 
@@ -133,7 +185,7 @@ class TestModelCloning(test.TestCase):
       input_b = keras.backend.variable(val_b)
       new_model = keras.models.clone_model(
           model, input_tensors=[input_a, input_b])
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(None, val_out)
 
@@ -173,6 +225,34 @@ class TestModelCloning(test.TestCase):
     with self.assertRaises(ValueError):
       keras.models._clone_sequential_model(seq_model, input_tensors=y)
 
+  def test_functional_cloning_does_not_create_unnecessary_placeholders(self):
+    with ops.Graph().as_default():
+      x = keras.Input((4,))
+      y = keras.layers.Dense(4)(x)
+      model = keras.models.Model(x, y)
+    graph = ops.Graph()
+    with graph.as_default():
+      x = array_ops.ones((10, 4))
+      _ = keras.models.clone_model(model, input_tensors=[x])
+      has_placeholder = _has_placeholder(graph)
+      self.assertFalse(has_placeholder)
+
+  def test_sequential_cloning_does_not_create_unnecessary_placeholders(self):
+    with ops.Graph().as_default():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(4, input_shape=(4,)))
+    graph = ops.Graph()
+    with graph.as_default():
+      x = array_ops.ones((10, 4))
+      _ = keras.models.clone_model(model, input_tensors=[x])
+      has_placeholder = _has_placeholder(graph)
+      self.assertFalse(has_placeholder)
+
+
+def _has_placeholder(graph):
+  ops_types = [op.type for op in graph.get_operations()]
+  return any('Placeholder' in s for s in ops_types)
+
 
 class CheckpointingTests(test.TestCase):
 
@@ -237,6 +317,7 @@ class TestModelDeepCopy(test.TestCase):
                       model_copy.get_weights()[0]))
 
 
+@test_util.run_v1_only('b/120545219')
 class TestCloneAndBuildModel(test.TestCase):
 
   def test_clone_and_build_non_compiled_model(self):
@@ -254,8 +335,11 @@ class TestCloneAndBuildModel(test.TestCase):
     keras.backend.clear_session()
 
     with self.cached_session():
+      with self.assertRaisesRegexp(ValueError, 'has not been compiled'):
+        models.clone_and_build_model(model, compile_clone=True)
+
       # With placeholder creation
-      new_model = models.clone_and_build_model(model, compile_clone=True)
+      new_model = models.clone_and_build_model(model, compile_clone=False)
       with self.assertRaisesRegexp(RuntimeError, 'must compile'):
         new_model.evaluate(inp, out)
       with self.assertRaisesRegexp(RuntimeError, 'must compile'):
@@ -268,7 +352,7 @@ class TestCloneAndBuildModel(test.TestCase):
       target_a = keras.Input(shape=(4,))
       new_model = models.clone_and_build_model(model, input_tensors=input_a,
                                                target_tensors=[target_a],
-                                               compile_clone=True)
+                                               compile_clone=False)
       with self.assertRaisesRegexp(RuntimeError, 'must compile'):
         new_model.evaluate(inp, out)
       with self.assertRaisesRegexp(RuntimeError, 'must compile'):
@@ -281,8 +365,11 @@ class TestCloneAndBuildModel(test.TestCase):
 
     self.assertEqual('mse', model.loss)
     self.assertTrue(
-        isinstance(model.optimizer, keras.optimizers.RMSprop))
-    self.assertEqual(['acc', metrics.categorical_accuracy], model.metrics)
+        isinstance(model.optimizer,
+                   (keras.optimizers.RMSprop,
+                    keras.optimizer_v2.rmsprop.RMSprop)))
+    self.assertEqual(['acc', metrics.categorical_accuracy],
+                     model._compile_metrics)
 
   def _clone_and_build_test_helper(self, model, is_subclassed=False):
     inp = np.random.random((10, 4))
@@ -396,6 +483,19 @@ class TestCloneAndBuildModel(test.TestCase):
   def test_replace_keras_optimizer_iterations_variable(self):
     self.assert_optimizer_iterations_increases('adam')
 
+  def test_clone_and_build_sequential_model_without_inputs_defined(self):
+    with self.cached_session():
+      model = sequential_model(False, False)
+      model.compile('rmsprop', 'mse',
+                    metrics=['acc', metrics.categorical_accuracy])
+    self._clone_and_build_test_helper(model, False)
+
+    with self.cached_session():
+      inp = np.random.random((10, 4))
+      out = np.random.random((10, 4))
+      model.train_on_batch(inp, out)
+    self._clone_and_build_test_helper(model, False)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b8f01249419c595a735442310c735bc10648cba6
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -0,0 +1,215 @@
+# Description:
+#   Contains the Keras OptimizerV2 API (internal TensorFlow version).
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+py_library(
+    name = "optimizer_v2",
+    srcs = [
+        "adadelta.py",
+        "adagrad.py",
+        "adam.py",
+        "adamax.py",
+        "ftrl.py",
+        "gradient_descent.py",
+        "nadam.py",
+        "optimizer_v2.py",
+        "rmsprop.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distribute",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:reduce_util",
+    ],
+)
+
+cuda_py_test(
+    name = "adagrad_test",
+    size = "medium",
+    srcs = ["adagrad_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "adam_test",
+    size = "medium",
+    srcs = ["adam_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "adamax_test",
+    size = "medium",
+    srcs = ["adamax_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "adadelta_test",
+    size = "medium",
+    srcs = ["adadelta_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "ftrl_test",
+    size = "medium",
+    srcs = ["ftrl_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "gradient_descent_test",
+    size = "medium",
+    srcs = ["gradient_descent_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "nadam_test",
+    size = "medium",
+    srcs = ["nadam_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+py_test(
+    name = "optimizer_v2_test",
+    size = "large",
+    srcs = ["optimizer_v2_test.py"],
+    shard_count = 4,
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/keras",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+cuda_py_test(
+    name = "rmsprop_test",
+    size = "medium",
+    srcs = ["rmsprop_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+    ],
+)
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..55b4eba1051287420b8ab1adeea1598eb4647c36
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -0,0 +1,148 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adadelta for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class Adadelta(optimizer_v2.OptimizerV2):
+  r"""Optimizer that implements the Adadelta algorithm.
+
+  Adadelta optimization is a stochastic gradient descent method that is based on
+  adaptive learning rate per dimension to address two drawbacks:
+    1) the continual decay of learning rates throughout training
+    2) the need for a manually selected global learning rate
+
+  Two accumulation steps are required:
+    1) the accumulation of gradients squared,
+    2) the accumulation of updates squared.
+
+  Initialization:
+
+  $$accum_g_0 := 0 \text{(Initialize gradient 2nd order moment vector)}$$
+  $$accum_x_0 := 0 \text{(Initialize variable update 2nd order moment vector)}$$
+
+  $$t := t + 1$$
+  $$accum_g_t := rho * accum_g_{t-1} + (1 - rho) * g * g$$
+  $$delta = -\sqrt{accum_x_{t-1}} / (\sqrt{accum_g_{t-1}} + \epsilon)$$
+  $$accum_x_t := rho * accum_x_{t-1} + (1 - rho) * delta * delta$$
+
+  References
+    See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
+      ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
+
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               rho=0.95,
+               epsilon=1e-7,
+               name='Adadelta',
+               **kwargs):
+    """Construct a new Adadelta optimizer.
+
+    Adadelta is a more robust extension of Adagrad that adapts learning rates
+    based on a moving window of gradient updates, instead of accumulating all
+    past gradients. This way, Adadelta continues learning even when many updates
+    have been done. Compared to Adagrad, in the original version of Adadelta you
+    don't have to set an initial learning rate. In this version, initial
+    learning rate can be set, as in most other Keras optimizers.
+
+    Args:
+      learning_rate: A `Tensor` or a floating point value. The learning rate.
+        To match the exact form in the original paper use 1.0.
+      rho: A `Tensor` or a floating point value. The decay rate.
+      epsilon: A `Tensor` or a floating point value.  A constant epsilon used
+               to better conditioning the grad update.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adadelta".
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
+    each be a callable that takes no arguments and returns the actual value to
+    use. This can be useful for changing these values across different
+    invocations of optimizer functions.
+    @end_compatibility
+    """
+    super(Adadelta, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('rho', rho)
+    self._set_hyper('epsilon', epsilon)
+
+  def _create_slots(self, var_list):
+    # Separate for-loops to respect the ordering of slot variables from v1.
+    for v in var_list:
+      self.add_slot(v, 'accum_grad')
+    for v in var_list:
+      self.add_slot(v, 'accum_var')
+
+  def set_weights(self, weights):
+    params = self.weights
+    # Override set_weights for backward compatibility of Keras V1 optimizer
+    # since it does not include iteration at head of the weight list. Set
+    # iteration to 0.
+    if len(params) == len(weights) + 1:
+      weights = [np.array(0)] + weights
+    super(Adadelta, self).set_weights(weights)
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    accum_grad = self.get_slot(var, 'accum_grad')
+    accum_var = self.get_slot(var, 'accum_var')
+    return training_ops.resource_apply_adadelta(
+        var.handle,
+        accum_grad.handle,
+        accum_var.handle,
+        lr_t,
+        self._get_hyper('rho', var_dtype),
+        self._get_hyper('epsilon', var_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    accum_grad = self.get_slot(var, 'accum_grad')
+    accum_var = self.get_slot(var, 'accum_var')
+    return training_ops.resource_sparse_apply_adadelta(
+        var.handle,
+        accum_grad.handle,
+        accum_var.handle,
+        lr_t,
+        self._get_hyper('rho', var_dtype),
+        self._get_hyper('epsilon', var_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
+
+  def get_config(self):
+    config = super(Adadelta, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'rho': self._serialize_hyperparameter('rho'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta_test.py b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fb67d0cd1675fa0d02db7b78f6d90d86b64888f
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
@@ -0,0 +1,170 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adadelta Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdadeltaOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    num_updates = 4  # number of ADADELTA steps to perform
+    for dtype in [dtypes.half, dtypes.float32]:
+      for grad in [0.2, 0.1, 0.01]:
+        for lr in [1.0, 0.5, 0.1]:
+          var0_init = [1.0, 2.0]
+          var1_init = [3.0, 4.0]
+          if use_resource:
+            var0 = resource_variable_ops.ResourceVariable(
+                var0_init, dtype=dtype)
+            var1 = resource_variable_ops.ResourceVariable(
+                var1_init, dtype=dtype)
+          else:
+            var0 = variables.Variable(var0_init, dtype=dtype)
+            var1 = variables.Variable(var1_init, dtype=dtype)
+
+          grads = constant_op.constant([grad, grad], dtype=dtype)
+
+          accum = 0.0
+          accum_update = 0.0
+
+          # ADADELTA gradient optimizer
+          rho = 0.95
+          epsilon = 1e-8
+          if use_callable_params:
+            adadelta_opt = adadelta.Adadelta(
+                learning_rate=lambda: lr,  # pylint: disable=cell-var-from-loop
+                rho=lambda: rho,  # pylint: disable=cell-var-from-loop
+                epsilon=lambda: epsilon)  # pylint: disable=cell-var-from-loop
+          else:
+            adadelta_opt = adadelta.Adadelta(
+                learning_rate=lr, rho=rho, epsilon=epsilon)
+          if not context.executing_eagerly():
+            adadelta_update = adadelta_opt.apply_gradients(
+                zip([grads, grads], [var0, var1]))
+            self.evaluate(variables.global_variables_initializer())
+
+            # Assign slots
+            slot = [None] * 2
+            slot_update = [None] * 2
+            slot[0] = adadelta_opt.get_slot(var0, "accum_grad")
+            self.assertEqual(slot[0].get_shape(), var0.get_shape())
+
+            slot_update[0] = adadelta_opt.get_slot(var0, "accum_var")
+            self.assertEqual(slot_update[0].get_shape(), var0.get_shape())
+
+            slot[1] = adadelta_opt.get_slot(var1, "accum_grad")
+            self.assertEqual(slot[1].get_shape(), var1.get_shape())
+
+            slot_update[1] = adadelta_opt.get_slot(var1, "accum_var")
+            self.assertEqual(slot_update[1].get_shape(), var1.get_shape())
+
+          # Fetch params to validate initial values
+          self.assertAllClose(var0_init, self.evaluate(var0))
+          self.assertAllClose(var1_init, self.evaluate(var1))
+
+          update = [None] * num_updates
+          tot_update = 0
+          for step in range(num_updates):
+            # Run adadelta update for comparison
+            if not context.executing_eagerly():
+              self.evaluate(adadelta_update)
+            else:
+              adadelta_opt.apply_gradients(zip([grads, grads], [var0, var1]))
+
+            # Perform initial update without previous accum values
+            accum = accum * rho + (grad**2) * (1 - rho)
+            update[step] = (
+                np.sqrt(accum_update + epsilon) *
+                (1. / np.sqrt(accum + epsilon)) * grad)
+            accum_update = (
+                accum_update * rho + (update[step]**2) * (1.0 - rho))
+            tot_update += update[step] * lr
+
+            if not context.executing_eagerly():
+              # Check that the accumulators have been updated
+              # TODO(lxuechen): This is hard to test in eager mode
+              for slot_idx in range(2):
+                self.assertAllCloseAccordingToType(
+                    np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
+                    self.evaluate(slot[slot_idx]),
+                    rtol=1e-5)
+
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [accum_update, accum_update],
+                        dtype=dtype.as_numpy_dtype()),
+                    self.evaluate(slot_update[slot_idx]),
+                    rtol=1e-5)
+
+              # Check that the parameters have been updated
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var0_init[0] - tot_update, var0_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  self.evaluate(var0),
+                  rtol=1e-5)
+
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var1_init[0] - tot_update, var1_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  self.evaluate(var1),
+                  rtol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adadelta.Adadelta(1.0, 1.0, 1.0).minimize(
+            loss, var_list=[var0])
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..670cad70e63354650aeb47ed2324e2c1756e12c1
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -0,0 +1,171 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adagrad for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+
+
+class Adagrad(optimizer_v2.OptimizerV2):
+  r"""Optimizer that implements the Adagrad algorithm.
+
+  Adagrad is an optimizer with parameter-specific learning rates,
+  which are adapted relative to how frequently a parameter gets
+  updated during training. The more updates a parameter receives,
+  the smaller the updates.
+
+  Initialization:
+
+  $$accum_g_0 := initial_accumulator_value$$
+
+  $$t := t + 1$$
+  $$accum_g_t := accum_g_{t-1} + g * g$$
+  $$theta_t := theta_{t-1} - lr * g / (\sqrt{accum_g_t} + \epsilon)$$
+
+  References
+    See [paper]
+      (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+    or this
+      [intro](https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               initial_accumulator_value=0.1,
+               epsilon=1e-7,
+               name='Adagrad',
+               **kwargs):
+    """Construct a new Adagrad optimizer.
+
+    Args:
+      learning_rate: A `Tensor` or a floating point value.  The learning rate.
+      initial_accumulator_value: A floating point value.
+        Starting value for the accumulators, must be positive.
+      epsilon: A floating point value.
+        Starting value for the accumulators, must be positive.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adagrad".
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+
+    Raises:
+      ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate` can be a callable that
+    takes no arguments and returns the actual value to use. This can be useful
+    for changing these values across different invocations of optimizer
+    functions.
+    @end_compatibility
+    """
+    if initial_accumulator_value < 0.0:
+      raise ValueError('initial_accumulator_value must be non-negative: %s' %
+                       initial_accumulator_value)
+    if epsilon < 1e-7:
+      raise ValueError('epsilon must be larger than 1e-7: %s' % epsilon)
+    super(Adagrad, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('decay', self._initial_decay)
+    self._initial_accumulator_value = initial_accumulator_value
+    self._set_hyper('epsilon', epsilon)
+
+  def _create_slots(self, var_list):
+    for var in var_list:
+      dtype = var.dtype.base_dtype
+      init = init_ops.constant_initializer(
+          self._initial_accumulator_value, dtype=dtype)
+      self.add_slot(var, 'accumulator', init)
+
+  def set_weights(self, weights):
+    params = self.weights
+    # Override set_weights for backward compatibility of Keras V1 optimizer
+    # since it does not include iteration at head of the weight list. Set
+    # iteration to 0.
+    if len(params) == len(weights) + 1:
+      weights = [np.array(0)] + weights
+    super(Adagrad, self).set_weights(weights)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Creates an optimizer from its config.
+
+    This method is the reverse of `get_config`,
+    capable of instantiating the same optimizer from the config
+    dictionary.
+
+    Arguments:
+        config: A Python dictionary, typically the output of get_config.
+        custom_objects: A Python dictionary mapping names to additional Python
+          objects used to create this optimizer, such as a function used for a
+          hyperparameter.
+
+    Returns:
+        An optimizer instance.
+    """
+    if 'initial_accumulator_value' not in config:
+      config['initial_accumulator_value'] = 0.
+    if 'lr' in config:
+      config['learning_rate'] = config.pop('lr')
+    return cls(**config)
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    epsilon = self._get_hyper('epsilon', var_dtype)
+    acc = self.get_slot(var, 'accumulator')
+
+    acc_t = state_ops.assign_add(
+        acc, math_ops.square(grad), use_locking=self._use_locking)
+    var_update = state_ops.assign_sub(
+        var, lr_t * grad / (math_ops.sqrt(acc_t) + epsilon))
+    return var_update
+
+  def _resource_apply_sparse(self, grad, var, indices):
+
+    def _resource_scatter_add(x, i, v):
+      with ops.control_dependencies(
+          [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+        return x.value()
+
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    epsilon = self._get_hyper('epsilon', var_dtype)
+    acc = self.get_slot(var, 'accumulator')
+
+    acc_t = _resource_scatter_add(acc, indices, math_ops.square(grad))
+    acc_t_slice = array_ops.gather(acc_t, indices)
+    var_update = _resource_scatter_add(
+        var, indices, -lr_t * grad / (math_ops.sqrt(acc_t_slice) + epsilon))
+    return var_update
+
+  def get_config(self):
+    config = super(Adagrad, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'initial_accumulator_value': self._initial_accumulator_value,
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad_test.py b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2c290178fe8a62d1c7240df1d6c04f7b62456e1
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
@@ -0,0 +1,400 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for aggregate operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adagrad
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adagrad_update_numpy(param, accum, g_t, lr=0.001, epsilon=1e-7):
+  accum_t = accum + g_t * g_t
+  param_t = param - lr * g_t / (np.sqrt(accum_t) + epsilon)
+  return param_t, accum_t
+
+
+def sparse_adagrad_update_numpy(param,
+                                accum,
+                                gindexs,
+                                gvalues,
+                                lr=0.001,
+                                epsilon=1e-7):
+  accum_t = copy.deepcopy(accum)
+  param_t = copy.deepcopy(param)
+  # first loop accumulates repeated indices if necessary.
+  for i in range(len(gindexs)):
+    gindex = gindexs[i]
+    gvalue = gvalues[i]
+    accum_t[gindex] = accum_t[gindex] + gvalue * gvalue
+  for i in range(len(gindexs)):
+    gindex = gindexs[i]
+    gvalue = gvalues[i]
+    param_t[gindex] = param_t[gindex] - lr * gvalue / (
+        np.sqrt(accum_t[gindex]) + epsilon)
+  return param_t, accum_t
+
+
+class AdagradOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_callable_params=False):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 3.0
+        if not use_callable_params:
+          learning_rate = learning_rate()
+
+        ada_opt = adagrad.Adagrad(learning_rate)
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        if not context.executing_eagerly():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+          self.evaluate(variables.global_variables_initializer())
+
+        # Fetch params to validate initial values
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllClose([1.0, 2.0], v0_val)
+        self.assertAllClose([3.0, 4.0], v1_val)
+
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(ada_update)
+          else:
+            ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, 3.0)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, 3.0)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasic(self):
+    self.doTestBasic()
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_callable_params=True)
+
+  def testBasicWithLearningRateDecay(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 3.0
+        decay = 0.5
+
+        ada_opt = adagrad.Adagrad(learning_rate, decay=decay)
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        if not context.executing_eagerly():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+          self.evaluate(variables.global_variables_initializer())
+
+        # Fetch params to validate initial values
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllClose([1.0, 2.0], v0_val)
+        self.assertAllClose([3.0, 4.0], v1_val)
+
+        # Run 3 steps of adagrad
+        for t in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(ada_update)
+          else:
+            ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+          lr_np = learning_rate / (1 + decay * t)
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, lr_np)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, lr_np)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable(
+            [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adagrad.Adagrad(1.0).minimize(loss, var_list=[var0])
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType(
+            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+
+  @test_util.run_deprecated_v1
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = constant_op.constant(3.0)
+        ada_opt = adagrad.Adagrad(learning_rate)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, learning_rate)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, learning_rate)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        learning_rate = 3.0
+        ada_opt = adagrad.Adagrad(learning_rate)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 3.0, 4.0], var1.eval())
+
+        accum0_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        # Run 3 step of sgd
+        for _ in range(3):
+          ada_update.run()
+
+          var0_np, accum0_np = sparse_adagrad_update_numpy(
+              var0_np, accum0_np, grads0_np_indices,
+              grads0_np[grads0_np_indices], learning_rate)
+          var1_np, accum1_np = sparse_adagrad_update_numpy(
+              var1_np, accum1_np, grads1_np_indices,
+              grads1_np[grads1_np_indices], learning_rate)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)
+
+        repeated_index_update_var = resource_variable_ops.ResourceVariable(
+            var_np, dtype=dtype)
+        aggregated_update_var = resource_variable_ops.ResourceVariable(
+            var_np, dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adagrad.Adagrad(3.0).apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adagrad.Adagrad(3.0).apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  @test_util.run_deprecated_v1
+  def testSparseRepeatedIndicesByEmbeddingLookUp(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var_repeated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_repeated = math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        var_aggregated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_aggregated = 2 * math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        update_op_repeated = adagrad.Adagrad(2.0).minimize(
+            loss_repeated, var_list=[var_repeated])
+        update_op_aggregated = adagrad.Adagrad(2.0).minimize(
+            loss_aggregated, var_list=[var_aggregated])
+        variables.global_variables_initializer().run()
+        self.assertAllCloseAccordingToType(
+            var_repeated.eval(), var_aggregated.eval())
+        for _ in range(3):
+          update_op_repeated.run()
+          update_op_aggregated.run()
+          self.assertAllCloseAccordingToType(
+              var_repeated.eval(), var_aggregated.eval())
+
+  @test_util.run_deprecated_v1
+  def testSparseStability(self):
+    for dtype in [dtypes.half]:
+      with self.cached_session():
+        shape = [1, 6]
+        var0_np = np.array([[
+            0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257, -0.0105945
+        ]],
+                           dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        grads0_np = np.array([[
+            -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05, -8.4877e-05,
+            -9.48906e-05
+        ]],
+                             dtype=dtype.as_numpy_dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np), constant_op.constant([0]),
+            constant_op.constant(shape))
+        ada_opt = adagrad.Adagrad(1.0)
+        ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        init = variables.global_variables_initializer()
+        for _ in range(100):
+          init.run()
+          ada_update.run()
+          self.assertAllCloseAccordingToType(
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+          self.assertAllCloseAccordingToType(
+              np.array([[
+                  0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
+                  -0.01029443
+              ]]), var0.eval())
+
+  @test_util.run_deprecated_v1
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 3.0
+        ada_opt = adagrad.Adagrad(learning_rate)
+        # Apply the optimizer twice.  Both applications will use
+        # the same accums.
+        ada_update1 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        ada_update2 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        ada_update1.run()
+        ada_update2.run()
+        ada_update1.run()
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        for _ in range(3):
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, learning_rate)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, learning_rate)
+        self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+        self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef3d783f8910e791cf8591e0604935102c2b52cf
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -0,0 +1,256 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Adam for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+
+
+class Adam(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adam algorithm.
+
+  Adam optimization is a stochastic gradient descent method that is based on
+  adaptive estimation of first-order and second-order moments. According to the
+  reference, the method is 'computationally efficient, has little memory
+  requirement, invariant to diagonal rescaling of gradients, and is well suited
+  for problems that are large in terms of data/parameters'.
+
+  Note, amsgrad is currently not supported and the argument can only be False.
+
+  # References
+      See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+        ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+      For AMSGrad see [Reddi et al., 2-18]
+        (https://openreview.net/pdf?id=ryQu7f-RZ)
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-7,
+               amsgrad=False,
+               name='Adam',
+               **kwargs):
+    r"""Construct a new Adam optimizer.
+
+    If amsgrad = False:
+      Initialization:
+
+      $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+      $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$t := 0 \text{(Initialize timestep)}$$
+
+      The update rule for `variable` with gradient `g` uses an optimization
+      described at the end of section2 of the paper:
+
+      $$t := t + 1$$
+      $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+
+      $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+      $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+      $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+
+    If amsgrad = True:
+      Initialization:
+
+      $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+      $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$v_hat_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$t := 0 \text{(Initialize timestep)}$$
+
+      The update rule for `variable` with gradient `g` uses an optimization
+      described at the end of section2 of the paper:
+
+      $$t := t + 1$$
+      $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+
+      $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+      $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+      $$v_hat_t := max(v_hat_{t-1}, v_t)
+      $$variable := variable - lr_t * m_t / (\sqrt{v_hat_t} + \epsilon)$$
+
+    The default value of 1e-7 for epsilon might not be a good default in
+    general. For example, when training an Inception network on ImageNet a
+    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+    formulation just before Section 2.1 of the Kingma and Ba paper rather than
+    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+    hat" in the paper.
+
+    The sparse implementation of this algorithm (used when the gradient is an
+    IndexedSlices object, typically because of `tf.gather` or an embedding
+    lookup in the forward pass) does apply momentum to variable slices even if
+    they were not used in the forward pass (meaning they have a gradient equal
+    to zero). Momentum decay (beta1) is also applied to the entire momentum
+    accumulator. This means that the sparse behavior is equivalent to the dense
+    behavior (in contrast to some momentum implementations which ignore momentum
+    unless a variable slice was actually used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta_1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta_2: A float value or a constant float tensor. The exponential decay
+        rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability. This epsilon is
+        "epsilon hat" in the Kingma and Ba paper (in the formula just before
+        Section 2.1), not the epsilon in Algorithm 1 of the paper.
+      amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
+        the paper "On the Convergence of Adam and beyond".
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adam".  @compatibility(eager) When eager execution is
+        enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
+        a callable that takes no arguments and returns the actual value to use.
+        This can be useful for changing these values across different
+        invocations of optimizer functions. @end_compatibility
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+    """
+
+    super(Adam, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('beta_1', beta_1)
+    self._set_hyper('beta_2', beta_2)
+    self._set_hyper('epsilon', epsilon)
+    self._amsgrad = amsgrad
+
+  def _create_slots(self, var_list):
+    # Create slots for the first and second moments.
+    # Separate for-loops to respect the ordering of slot variables from v1.
+    for var in var_list:
+      self.add_slot(var, 'm')
+    for var in var_list:
+      self.add_slot(var, 'v')
+    if self._amsgrad:
+      for var in var_list:
+        self.add_slot(var, 'vhat')
+
+  def set_weights(self, weights):
+    params = self.weights
+    # If the weights are generated by Keras V1 optimizer, it includes vhats
+    # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2
+    # optimizer has 2x + 1 variables. Filter vhats out for compatibility.
+    num_vars = int((len(params) - 1) / 2)
+    if len(weights) == 3 * num_vars + 1:
+      weights = weights[:len(params)]
+    super(Adam, self).set_weights(weights)
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    epsilon = self._get_hyper('epsilon', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    if not self._amsgrad:
+      return training_ops.resource_apply_adam(
+          var.handle,
+          m.handle,
+          v.handle,
+          beta_1_power,
+          beta_2_power,
+          lr_t,
+          beta_1_t,
+          beta_2_t,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+    else:
+      vhat = self.get_slot(var, 'vhat')
+      return training_ops.resource_apply_adam_with_amsgrad(
+          var.handle,
+          m.handle,
+          v.handle,
+          vhat.handle,
+          beta_1_power,
+          beta_2_power,
+          lr_t,
+          beta_1_t,
+          beta_2_t,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_scaled_g_values = grad * (1 - beta_1_t)
+    m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
+
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, 'v')
+    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
+    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
+
+    if not self._amsgrad:
+      v_sqrt = math_ops.sqrt(v_t)
+      var_update = state_ops.assign_sub(
+          var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
+      return control_flow_ops.group(*[var_update, m_t, v_t])
+    else:
+      v_hat = self.get_slot(var, 'vhat')
+      v_hat_t = math_ops.maximum(v_hat, v_t)
+      with ops.control_dependencies([v_hat_t]):
+        v_hat_t = state_ops.assign(
+            v_hat, v_hat_t, use_locking=self._use_locking)
+      v_hat_sqrt = math_ops.sqrt(v_hat_t)
+      var_update = state_ops.assign_sub(
+          var,
+          lr * m_t / (v_hat_sqrt + epsilon_t),
+          use_locking=self._use_locking)
+      return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+      return x.value()
+
+  def get_config(self):
+    config = super(Adam, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'beta_1': self._serialize_hyperparameter('beta_1'),
+        'beta_2': self._serialize_hyperparameter('beta_2'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+        'amsgrad': self._amsgrad,
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bbafe12f8e27df9bcc158ae6b50cba2fb086914
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adam_test.py
@@ -0,0 +1,508 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      lr=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-7):
+  lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - lr_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+def adam_update_numpy_amsgrad(param,
+                              g_t,
+                              t,
+                              m,
+                              v,
+                              vhat,
+                              lr=0.001,
+                              beta1=0.9,
+                              beta2=0.999,
+                              epsilon=1e-7):
+  lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+  vhat_t = np.maximum(vhat, v_t)
+
+  param_t = param - lr_t * m_t / (np.sqrt(vhat_t) + epsilon)
+  return param_t, m_t, v_t, vhat_t
+
+
+def adam_sparse_update_numpy_amsgrad(param,
+                                     indices,
+                                     g_t,
+                                     t,
+                                     m,
+                                     v,
+                                     vhat,
+                                     lr=0.001,
+                                     beta1=0.9,
+                                     beta2=0.999,
+                                     epsilon=1e-7):
+  m_t, v_t, vhat_t, param_t = (np.copy(m), np.copy(v), np.copy(vhat),
+                               np.copy(param))
+  lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = beta2 * v[indices] + (1 - beta2) * g_t * g_t
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  v_hat_t = np.maximum(vhat_t, v_t)
+  v_hat_t_slice = v_hat_t[indices]
+  param_t_slice = param[indices] - (
+      lr_t * (m_t_slice / (np.sqrt(v_hat_t_slice) + epsilon)))
+  param_t[indices] = param_t_slice
+  return param_t, m_t, v_t, vhat_t
+
+
+def get_beta_accumulators(opt, dtype):
+  local_step = math_ops.cast(opt.iterations + 1, dtype)
+  beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
+  beta_1_power = math_ops.pow(beta_1_t, local_step)
+  beta_2_t = math_ops.cast(opt._get_hyper("beta_2"), dtype)
+  beta_2_power = math_ops.pow(beta_2_t, local_step)
+  return (beta_1_power, beta_2_power)
+
+
+class AdamOptimizerTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testSparse(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.0, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = adam.Adam()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1))
+
+        beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+        # Run 3 steps of Adam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adam.Adam(3.0)
+        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  @test_util.run_deprecated_v1
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adam.Adam().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adam.Adam().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            self.evaluate(repeated_index_update_var))
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              self.evaluate(repeated_index_update_var))
+
+  def doTestBasic(self, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 0.001
+        beta1 = lambda: 0.9
+        beta2 = lambda: 0.999
+        epsilon = lambda: 1e-8
+        if not use_callable_params:
+          learning_rate = learning_rate()
+          beta1 = beta1()
+          beta2 = beta2()
+          epsilon = epsilon()
+
+        opt = adam.Adam(learning_rate=learning_rate)
+        if not context.executing_eagerly():
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(3):
+          beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          else:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic()
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_callable_params=True)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasicWithAmsgrad(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, v0hat, m1, v1, v1hat = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adam.Adam(amsgrad=True)
+        if not context.executing_eagerly():
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(3):
+          beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          else:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          var0_np, m0, v0, v0hat = adam_update_numpy_amsgrad(
+              var0_np, grads0_np, t, m0, v0, v0hat)
+          var1_np, m1, v1, v1hat = adam_update_numpy_amsgrad(
+              var1_np, grads1_np, t, m1, v1, v1hat)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSparseWithAmsgrad(self):
+    # dtypes.half does not work on gpu + eager.
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        m0 = np.array([[0.0], [0.0]])
+        v0 = np.array([[0.0], [0.0]])
+        v0hat = np.array([[0.0], [0.0]])
+        indices_np = np.array([1])
+        indices = constant_op.constant(indices_np, dtype=dtypes.int32)
+        var0_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)
+        repeated_index_update_var = variables.Variable(var0_np, dtype=dtype)
+        aggregated_update_var = variables.Variable(var0_np, dtype=dtype)
+        grads0_np = np.array([[0.2]], dtype=dtype.as_numpy_dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]), constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(grads0_np, indices,
+                                            constant_op.constant([2, 1]))
+        opt_repeated = adam.Adam(amsgrad=True)
+        opt_aggregated = adam.Adam(amsgrad=True)
+        if not context.executing_eagerly():
+          repeated_update = opt_repeated.apply_gradients(
+              [(grad_repeated_index, repeated_index_update_var)])
+          aggregated_update = opt_aggregated.apply_gradients(
+              [(grad_aggregated, aggregated_update_var)])
+        self.evaluate(variables.global_variables_initializer())
+        self.assertAllClose(
+            self.evaluate(aggregated_update_var),
+            self.evaluate(repeated_index_update_var))
+        for t in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(repeated_update)
+            self.evaluate(aggregated_update)
+          else:
+            opt_repeated.apply_gradients(
+                [(grad_repeated_index, repeated_index_update_var)])
+            opt_aggregated.apply_gradients(
+                [(grad_aggregated, aggregated_update_var)])
+
+          var0_np, m0, v0, v0hat = adam_sparse_update_numpy_amsgrad(
+              var0_np, indices_np, grads0_np, t, m0, v0, v0hat)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(
+              var0_np, self.evaluate(aggregated_update_var))
+          self.assertAllCloseAccordingToType(
+              self.evaluate(aggregated_update_var),
+              self.evaluate(repeated_index_update_var))
+
+  @test_util.run_deprecated_v1
+  def testBasicWithLearningRateDecay(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 0.001
+        beta_1 = 0.9
+        beta_2 = 0.999
+        epsilon = 1e-7
+        decay = 0.5
+
+        opt = adam.Adam(
+            learning_rate=learning_rate,
+            beta_1=beta_1,
+            beta_2=beta_2,
+            epsilon=epsilon,
+            decay=decay)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(3):
+          self.evaluate(update)
+          lr_np = learning_rate / (1 + decay * t)
+
+          var0_np, m0, v0 = adam_update_numpy(
+              var0_np, grads0_np, t, m0, v0, lr=lr_np)
+          var1_np, m1, v1 = adam_update_numpy(
+              var1_np, grads1_np, t, m1, v1, lr=lr_np)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.Adam(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+        # Run 3 steps of Adam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.Adam()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam.Adam(1.)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and two unique slot variables for v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+  def testSetWeightsFromV1AdamWithoutMinimize(self):
+    keras_v1_adam = optimizers.Adam()
+    keras_v2_adam = adam.Adam()
+    keras_v2_adam.set_weights(keras_v1_adam.get_weights())
+    keras_v1_iteration = keras_v1_adam.iterations
+    keras_v2_iteration = keras_v2_adam.iterations
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(
+        self.evaluate(keras_v1_iteration), self.evaluate(keras_v2_iteration))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddd78584f852f24f9da6277888d1883bb44db327
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adamax.py
@@ -0,0 +1,159 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adamax for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import training_ops
+
+
+class Adamax(adam.Adam):
+  """Optimizer that implements the Adamax algorithm.
+
+  It is a variant of Adam based on the infinity norm.
+  Default parameters follow those provided in the paper.
+  Adamax is sometimes superior to adam, specially in models with embeddings.
+
+  References
+    see Section 7 of [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+      ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-7,
+               name='Adamax',
+               **kwargs):
+    """Construct a new Adamax optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize the exponentially weighted infinity norm)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section 7.1 of the paper:
+
+    ```
+    t <- t + 1
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- max(beta2 * v_{t-1}, abs(g))
+    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+    ```
+
+    Similar to AdamOptimizer, the epsilon is added for numerical stability
+    (especially to get rid of division by zero when v_t = 0).
+
+    Contrast to AdamOptimizer, the sparse implementation of this algorithm
+    (used when the gradient is an IndexedSlices object, typically because of
+    `tf.gather` or an embedding lookup in the forward pass) only updates
+    variable slices and corresponding `m_t`, `v_t` terms when that part of
+    the variable was used in the forward pass. This means that the sparse
+    behavior is contrast to the dense behavior (similar to some momentum
+    implementations which ignore momentum unless a variable slice was actually
+    used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta_1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta_2: A float value or a constant float tensor. The exponential decay
+        rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adamax".
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+    """
+    # pylint: disable=useless-super-delegation
+    super(Adamax, self).__init__(
+        learning_rate=learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon,
+        amsgrad=False,
+        name=name,
+        **kwargs)
+    # pylint: enable=useless-super-delegation
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    return training_ops.resource_apply_ada_max(
+        var.handle,
+        m.handle,
+        v.handle,
+        beta_1_power,
+        lr_t,
+        beta_1_t,
+        beta_2_t,
+        self._get_hyper('epsilon', var_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
+
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_slice = array_ops.gather(m, indices)
+    m_t_slice = m_slice * beta_1_t + grad * (1 - beta_1_t)
+    with ops.control_dependencies([m_t_slice]):
+      m_t = self._resource_scatter_update(m, indices, m_t_slice)
+
+    # u_t = max(beta2 * u, abs(g_t))
+    v = self.get_slot(var, 'v')
+    v_slice = array_ops.gather(v, indices)
+    v_t_slice = math_ops.maximum(v_slice * beta_2_t, math_ops.abs(grad))
+    with ops.control_dependencies([v_t_slice]):
+      v_t = self._resource_scatter_update(v, indices, v_t_slice)
+    # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t
+    var_slice = -lr_t / (1 - beta_1_power) * (
+        m_t_slice / (v_t_slice + epsilon_t))
+    with ops.control_dependencies([var_slice]):
+      var_update = self._resource_scatter_add(var, indices, var_slice)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(
+            x.handle, i, v)]):
+      return x.value()
diff --git a/tensorflow/python/keras/optimizer_v2/adamax_test.py b/tensorflow/python/keras/optimizer_v2/adamax_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf131fbb0ce5bd4ab6c7d9b8c49e0519290dcef
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adamax_test.py
@@ -0,0 +1,367 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adamax."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adamax
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adamax_update_numpy(param,
+                        g_t,
+                        t,
+                        m,
+                        v,
+                        alpha=0.001,
+                        beta1=0.9,
+                        beta2=0.999,
+                        epsilon=1e-8):
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = np.maximum(beta2 * v, np.abs(g_t))
+  param_t = param - (alpha / (1 - beta1**(t + 1))) * (m_t / (v_t + epsilon))
+  return param_t, m_t, v_t
+
+
+def adamax_sparse_update_numpy(param,
+                               indices,
+                               g_t,
+                               t,
+                               m,
+                               v,
+                               alpha=0.001,
+                               beta1=0.9,
+                               beta2=0.999,
+                               epsilon=1e-8):
+  m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param)
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t))
+  param_t_slice = param[indices] - (
+      (alpha / (1 - beta1**(t + 1))) * (m_t_slice / (v_t_slice + epsilon)))
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  param_t[indices] = param_t_slice
+  return param_t, m_t, v_t
+
+
+def get_beta_accumulators(opt, dtype):
+  local_step = math_ops.cast(opt.iterations + 1, dtype)
+  beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
+  beta_1_power = math_ops.pow(beta_1_t, local_step)
+  return beta_1_power
+
+
+class AdamaxOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)  # pylint: disable=cell-var-from-loop
+        m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots()
+        var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([2, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = adamax.Adamax()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0, 3.0], var0.eval())
+        self.assertAllClose([4.0, 5.0, 6.0], var1.eval())
+
+        beta1_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Adamax
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_sparse_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_sparse_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @test_util.run_deprecated_v1
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  @test_util.run_deprecated_v1
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adamax.Adamax(3.0)
+        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  @test_util.run_deprecated_v1
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adamax.Adamax().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adamax.Adamax().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasic(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adamax.Adamax()
+        if not context.executing_eagerly():
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of Adamax
+        for t in range(3):
+          beta_1_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          else:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(
+              var0_np, self.evaluate(var0), rtol=1e-2)
+          self.assertAllCloseAccordingToType(
+              var1_np, self.evaluate(var1), rtol=1e-2)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasicWithLearningRateDecay(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 0.001
+        decay = 0.002
+        opt = adamax.Adamax(learning_rate=learning_rate, decay=decay)
+        if not context.executing_eagerly():
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of Adamax
+        for t in range(3):
+          beta_1_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          else:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          lr = learning_rate / (1 + decay * t)
+
+          var0_np, m0, v0 = adamax_update_numpy(
+              var0_np, grads0_np, t, m0, v0, alpha=lr)
+          var1_np, m1, v1 = adamax_update_numpy(
+              var1_np, grads1_np, t, m1, v1, alpha=lr)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0),
+                                             rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1),
+                                             rtol=1e-2)
+
+  @test_util.run_deprecated_v1
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.Adamax(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Adamax
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @test_util.run_deprecated_v1
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.Adamax()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power = get_beta_accumulators(opt, dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adamax1 and Adamax2.
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adamax.Adamax(1.)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and two unique slot variables for v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..e278e352f551a12718f6b400b16f9d7e05d0c02e
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/ftrl.py
@@ -0,0 +1,210 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ftrl-proximal for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class Ftrl(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the FTRL algorithm.
+
+  See this [paper](
+  https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf).
+  This version has support for both online L2 (the L2 penalty given in the paper
+  above) and shrinkage-type L2 (which is the addition of an L2 penalty to the
+  loss function).
+  """
+
+  def __init__(self,
+               learning_rate,
+               learning_rate_power=-0.5,
+               initial_accumulator_value=0.1,
+               l1_regularization_strength=0.0,
+               l2_regularization_strength=0.0,
+               name='Ftrl',
+               l2_shrinkage_regularization_strength=0.0,
+               **kwargs):
+    r"""Construct a new FTRL optimizer.
+
+    Args:
+      learning_rate: A float value or a constant float `Tensor`.
+      learning_rate_power: A float value, must be less or equal to zero.
+        Controls how the learning rate decreases during training. Use zero for
+        a fixed learning rate.
+      initial_accumulator_value: The starting value for accumulators.
+        Only zero or positive values are allowed.
+      l1_regularization_strength: A float value, must be greater than or
+        equal to zero.
+      l2_regularization_strength: A float value, must be greater than or
+        equal to zero.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Ftrl".
+      l2_shrinkage_regularization_strength: A float value, must be greater than
+        or equal to zero. This differs from L2 above in that the L2 above is a
+        stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
+        The FTRL formulation can be written as:
+        w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where
+        \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss
+        function w.r.t. the weights w.
+        Specifically, in the absence of L1 regularization, it is equivalent to
+        the following update rule:
+        w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t -
+                  2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
+        where lr_t is the learning rate at t.
+        When input is sparse shrinkage will only happen on the active weights.\
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+
+    References
+      See [paper]
+        (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+    """
+    super(Ftrl, self).__init__(name, **kwargs)
+
+    if initial_accumulator_value < 0.0:
+      raise ValueError(
+          'initial_accumulator_value %f needs to be positive or zero' %
+          initial_accumulator_value)
+    if learning_rate_power > 0.0:
+      raise ValueError('learning_rate_power %f needs to be negative or zero' %
+                       learning_rate_power)
+    if l1_regularization_strength < 0.0:
+      raise ValueError(
+          'l1_regularization_strength %f needs to be positive or zero' %
+          l1_regularization_strength)
+    if l2_regularization_strength < 0.0:
+      raise ValueError(
+          'l2_regularization_strength %f needs to be positive or zero' %
+          l2_regularization_strength)
+    if l2_shrinkage_regularization_strength < 0.0:
+      raise ValueError(
+          'l2_shrinkage_regularization_strength %f needs to be positive'
+          ' or zero' % l2_shrinkage_regularization_strength)
+
+    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('learning_rate_power', learning_rate_power)
+    self._set_hyper('l1_regularization_strength', l1_regularization_strength)
+    self._set_hyper('l2_regularization_strength', l2_regularization_strength)
+    self._initial_accumulator_value = initial_accumulator_value
+    self._l2_shrinkage_regularization_strength = (
+        l2_shrinkage_regularization_strength)
+
+  def _create_slots(self, var_list):
+    # Create the "accum" and "linear" slots.
+    for var in var_list:
+      dtype = var.dtype.base_dtype
+      init = init_ops.constant_initializer(
+          self._initial_accumulator_value, dtype=dtype)
+      self.add_slot(var, 'accumulator', init)
+      self.add_slot(var, 'linear')
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    learning_rate_power = self._get_hyper('learning_rate_power', var_dtype)
+    l1_regularization_strength = self._get_hyper('l1_regularization_strength',
+                                                 var_dtype)
+    l2_regularization_strength = self._get_hyper('l2_regularization_strength',
+                                                 var_dtype)
+    accum = self.get_slot(var, 'accumulator')
+    linear = self.get_slot(var, 'linear')
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.resource_apply_ftrl(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          lr_t,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          learning_rate_power,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_ftrl_v2(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          lr_t,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          math_ops.cast(self._l2_shrinkage_regularization_strength, var_dtype),
+          learning_rate_power,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    learning_rate_power = self._get_hyper('learning_rate_power', var_dtype)
+    l1_regularization_strength = self._get_hyper('l1_regularization_strength',
+                                                 var_dtype)
+    l2_regularization_strength = self._get_hyper('l2_regularization_strength',
+                                                 var_dtype)
+    accum = self.get_slot(var, 'accumulator')
+    linear = self.get_slot(var, 'linear')
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.resource_sparse_apply_ftrl(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          indices,
+          lr_t,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          learning_rate_power,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_ftrl_v2(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          indices,
+          lr_t,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          math_ops.cast(self._l2_shrinkage_regularization_strength, var_dtype),
+          learning_rate_power,
+          use_locking=self._use_locking)
+
+  def get_config(self):
+    config = super(Ftrl, self).get_config()
+    config.update({
+        'learning_rate':
+            self._serialize_hyperparameter('learning_rate'),
+        'decay':
+            self._serialize_hyperparameter('decay'),
+        'initial_accumulator_value':
+            self._initial_accumulator_value,
+        'learning_rate_power':
+            self._serialize_hyperparameter('learning_rate_power'),
+        'l1_regularization_strength':
+            self._serializer_hyperparameter('l1_regularization_strength'),
+        'l2_regularization_strength':
+            self._serializer_hyperparameter('l2_regularization_strength'),
+        'l2_shrinkage_regularization_strength':
+            self._l2_shrinkage_regularization_strength,
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl_test.py b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bec400e8cbba2654decaf520a24800095e4d16f5
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
@@ -0,0 +1,440 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for Ftrl operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import ftrl
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import adagrad
+from tensorflow.python.training import gradient_descent
+
+
+class FtrlOptimizerTest(test.TestCase):
+
+  def doTestFtrlwithoutRegularization(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype)
+          var1 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype)
+        else:
+          var0 = variables.Variable([0.0, 0.0], dtype=dtype)
+          var1 = variables.Variable([0.0, 0.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.0,
+            l2_regularization_strength=0.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllClose([0.0, 0.0], v0_val)
+        self.assertAllClose([0.0, 0.0], v1_val)
+
+        # Run 3 steps FTRL
+        for _ in range(3):
+          update.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.60260963, -4.29698515]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.28432083, -0.56694895]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithoutRegularization(self):
+    self.doTestFtrlwithoutRegularization(use_resource=False)
+
+  @test_util.run_deprecated_v1
+  def testResourceFtrlWithoutRegularization(self):
+    self.doTestFtrlwithoutRegularization(use_resource=True)
+
+  @test_util.run_deprecated_v1
+  def testFtrlwithoutRegularization2(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.0,
+            l2_regularization_strength=0.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 3 steps FTRL
+        for _ in range(3):
+          update.run()
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.55607247, -3.98729396]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.28232238, -0.56096673]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = ftrl.Ftrl(1.0).minimize(loss, var_list=[var0])
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[0, 1]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL1(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=0.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-7.66718769, -10.91273689]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.93460727, -1.86147261]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL1_L2(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-0.24059935, -0.46829352]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02406147, -0.04830509]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL1_L2_L2Shrinkage(self):
+    """Test the new FTRL op with support for l2 shrinkage.
+
+    The addition of this parameter which places a constant pressure on weights
+    towards the origin causes the gradient descent trajectory to differ. The
+    weights will tend to have smaller magnitudes with this parameter set.
+    """
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-0.22578995, -0.44345796]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.14378493, -0.13229476]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL1_L2_L2ShrinkageSparse(self):
+    """Tests the new FTRL op with support for l2 shrinkage on sparse grads."""
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[4.0], [3.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.02], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], v0_val)
+        self.assertAllCloseAccordingToType([[4.0], [3.0]], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([[-0.22578995], [2.]], v0_val)
+        self.assertAllCloseAccordingToType([[4.], [-0.13229476]], v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self):
+    """Verifies that l2 shrinkage in FTRL does not change lr schedule."""
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([1.0, 2.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.1, 0.2], dtype=dtype)
+
+        opt0 = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        opt1 = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0)
+        update0 = opt0.apply_gradients([(grads0, var0)])
+        update1 = opt1.apply_gradients([(grads1, var1)])
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([1.0, 2.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update0.run()
+          update1.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        # var0 is experiencing L2 shrinkage so it should be smaller than var1
+        # in magnitude.
+        self.assertTrue((v0_val**2 < v1_val**2).all())
+        accum0 = sess.run(opt0.get_slot(var0, "accumulator"))
+        accum1 = sess.run(opt1.get_slot(var1, "accumulator"))
+        # L2 shrinkage should not change how we update grad accumulator.
+        self.assertAllCloseAccordingToType(accum0, accum1)
+
+  def applyOptimizer(self, opt, dtype, steps=5, is_sparse=False):
+    if is_sparse:
+      var0 = variables.Variable([[0.0], [0.0]], dtype=dtype)
+      var1 = variables.Variable([[0.0], [0.0]], dtype=dtype)
+      grads0 = ops.IndexedSlices(
+          constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+          constant_op.constant([0]), constant_op.constant([2, 1]))
+      grads1 = ops.IndexedSlices(
+          constant_op.constant([0.02], shape=[1, 1], dtype=dtype),
+          constant_op.constant([1]), constant_op.constant([2, 1]))
+    else:
+      var0 = variables.Variable([0.0, 0.0], dtype=dtype)
+      var1 = variables.Variable([0.0, 0.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    variables.global_variables_initializer().run()
+
+    sess = ops.get_default_session()
+    v0_val, v1_val = self.evaluate([var0, var1])
+    if is_sparse:
+      self.assertAllCloseAccordingToType([[0.0], [0.0]], v0_val)
+      self.assertAllCloseAccordingToType([[0.0], [0.0]], v1_val)
+    else:
+      self.assertAllCloseAccordingToType([0.0, 0.0], v0_val)
+      self.assertAllCloseAccordingToType([0.0, 0.0], v1_val)
+
+    # Run Ftrl for a few steps
+    for _ in range(steps):
+      update.run()
+
+    v0_val, v1_val = self.evaluate([var0, var1])
+    return v0_val, v1_val
+
+  # When variables are initialized with Zero, FTRL-Proximal has two properties:
+  # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
+  # with GradientDescent.
+  # 2. Without L1&L2 but with adaptive learning rate, FTRL-Proximal is identical
+  # with Adagrad.
+  # So, basing on these two properties, we test if our implementation of
+  # FTRL-Proximal performs same updates as Adagrad or GradientDescent.
+  @test_util.run_deprecated_v1
+  def testEquivAdagradwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Adagrad learning rate
+                learning_rate_power=-0.5,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1), dtype)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+  @test_util.run_deprecated_v1
+  def testEquivSparseAdagradwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Adagrad learning rate
+                learning_rate_power=-0.5,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype,
+            is_sparse=True)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1),
+            dtype,
+            is_sparse=True)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+  @test_util.run_deprecated_v1
+  def testEquivSparseGradientDescentwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Fixed learning rate
+                learning_rate_power=-0.0,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype,
+            is_sparse=True)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            gradient_descent.GradientDescentOptimizer(3.0),
+            dtype,
+            is_sparse=True)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+  @test_util.run_deprecated_v1
+  def testEquivGradientDescentwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Fixed learning rate
+                learning_rate_power=-0.0,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            gradient_descent.GradientDescentOptimizer(3.0), dtype)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent.py b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b82b5e78dedce5ff68b860d143b1ecadd18e0bd
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
@@ -0,0 +1,145 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Momentum for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import training_ops
+
+
+class SGD(optimizer_v2.OptimizerV2):
+  """Stochastic gradient descent and momentum optimizer.
+
+  Computes:
+  ```
+  theta(t+1) = theta(t) - learning_rate * gradient
+  gradient is evaluated at theta(t).
+  ```
+
+  or Computes (if `use_nesterov = False`):
+  ```
+  v(t+1) = momentum * v(t) - learning_rate * gradient
+  theta(t+1) = theta(t) + v(t+1)
+  if `nesterov` is False, gradient is evaluated at theta(t).
+  if `nesterov` is True, gradient is evaluated at theta(t) + momentum * v(t),
+    and the variables always store theta + m v instead of theta
+  ```
+
+  Some of the args below are hyperparameters, where a hyperparameter is
+  defined as a scalar Tensor, a regular Python value, or a callable (which
+  will be evaluated when `apply_gradients` is called) returning a scalar
+  Tensor or a Python value.
+
+  @compatibility(eager)
+  When eager execution is enabled, learning_rate can be a callable that takes
+  no arguments and returns the actual value to use. This can be useful for
+  changing these values across different invocations of optimizer functions.
+  @end_compatibility
+
+  # References
+      nesterov = True, See [Sutskever et al., 2013](
+        http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               momentum=0.0,
+               nesterov=False,
+               name="SGD",
+               **kwargs):
+    """Construct a new Stochastic Gradient Descent or Momentum optimizer.
+
+    Arguments:
+      learning_rate: float hyperparameter >= 0. Learning rate.
+      momentum: float hyperparameter >= 0 that accelerates SGD in the relevant
+        direction and dampens oscillations.
+      nesterov: boolean. Whether to apply Nesterov momentum.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to 'SGD'.
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+    """
+    super(SGD, self).__init__(name, **kwargs)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("decay", self._initial_decay)
+
+    self._momentum = False
+    if isinstance(momentum, ops.Tensor) or callable(momentum) or momentum > 0:
+      self._momentum = True
+    if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1):
+      raise ValueError("`momentum` must be between [0, 1].")
+    self._set_hyper("momentum", momentum)
+
+    self._nesterov = nesterov
+
+  def _create_slots(self, var_list):
+    if self._momentum:
+      for var in var_list:
+        self.add_slot(var, "momentum")
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    if self._momentum:
+      momentum_var = self.get_slot(var, "momentum")
+      return training_ops.resource_apply_keras_momentum(
+          var.handle,
+          momentum_var.handle,
+          lr_t,
+          grad,
+          self._get_hyper("momentum", var_dtype),
+          use_locking=self._use_locking,
+          use_nesterov=self._nesterov)
+    else:
+      return training_ops.resource_apply_gradient_descent(
+          var.handle, lr_t, grad, use_locking=self._use_locking)
+
+  def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
+    if self._momentum:
+      return super(SGD, self)._resource_apply_sparse_duplicate_indices(
+          grad, var, indices)
+    else:
+      var_dtype = var.dtype.base_dtype
+      lr_t = self._decayed_lr(var_dtype)
+      return resource_variable_ops.resource_scatter_add(var.handle, indices,
+                                                        -grad * lr_t)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    # This method is only needed for momentum optimization.
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    momentum_var = self.get_slot(var, "momentum")
+    return training_ops.resource_sparse_apply_keras_momentum(
+        var.handle,
+        momentum_var.handle,
+        lr_t,
+        grad,
+        indices,
+        self._get_hyper("momentum", var_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._nesterov)
+
+  def get_config(self):
+    config = super(SGD, self).get_config()
+    config.update({
+        "learning_rate": self._serialize_hyperparameter("learning_rate"),
+        "decay": self._serialize_hyperparameter("decay"),
+        "momentum": self._serialize_hyperparameter("momentum"),
+        "nesterov": self._nesterov,
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c64202da81c36e4140be7ca7719e9d426c549cc
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
@@ -0,0 +1,688 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for GradientDescent."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class GradientDescentOptimizerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        sgd = gradient_descent.SGD(3.0)
+        sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasicWithLearningRateDecay(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        learning_rate = 3.0
+        decay = 0.5
+        sgd = gradient_descent.SGD(learning_rate=learning_rate, decay=decay)
+        if not context.executing_eagerly():
+          sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+        # Run 2 steps of sgd
+        if not context.executing_eagerly():
+          self.evaluate(sgd_op)
+        else:
+          sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           self.evaluate(var1))
+
+        if not context.executing_eagerly():
+          self.evaluate(sgd_op)
+        else:
+          sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [1.0 - 3.0 * 0.1 - 2.0 * 0.1, 2.0 - 3.0 * 0.1 - 2.0 * 0.1],
+            self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * 0.01 - 2.0 * 0.01, 4.0 - 3.0 * 0.01 - 2.0 * 0.01],
+            self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasicCallableParams(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lr = lambda: 3.0
+        sgd = gradient_descent.SGD(lr)
+        sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMinimizeResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        loss = lambda: math_ops.matmul(var0, x) + var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+        sgd = gradient_descent.SGD(1.0)
+        sgd_op = sgd.minimize(loss, [var0, var1])
+        self.evaluate(variables.global_variables_initializer())
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[1.0 - 4.0, 2.0 - 5.0]],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - 1.0], self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        pred += var1
+        loss = pred * pred
+        sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1])
+        self.evaluate(variables.global_variables_initializer())
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+        np_grad = 2 * np_pred
+        self.assertAllCloseAccordingToType(
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lrate = constant_op.constant(3.0)
+        sgd_op = gradient_descent.SGD(lrate).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testGradWrtRef(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        opt = gradient_descent.SGD(3.0)
+        values = [1.0, 3.0]
+        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
+        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
+        self.evaluate(variables.global_variables_initializer())
+        for grad, _ in grads_and_vars:
+          self.assertAllCloseAccordingToType([1.0], self.evaluate(grad))
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        sgd_op = gradient_descent.SGD(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
+                                           self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSparseBasicWithLearningRateDecay(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        sgd_op = gradient_descent.SGD(
+            3.0, decay=0.5).apply_gradients(
+                zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+        # Run 2 steps of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
+                                           self.evaluate(var1))
+
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]], self.evaluate(var1))
+
+  def testCapturingInDefunWhileExecutingEagerly(self):
+    with context.eager_mode():
+      optimizer = gradient_descent.SGD(1.0)
+
+      def step():
+        self.v = resource_variable_ops.ResourceVariable(1.0)
+        with backprop.GradientTape() as tape:
+          loss = self.v**2
+        grad = tape.gradient(loss, self.v)
+        optimizer.apply_gradients([(grad, self.v)])
+        return self.v.read_value()
+
+      compiled_step = function.defun(step)
+
+      self.assertEqual(float(step()), -1.0)
+      self.assertEqual(float(compiled_step()), -1.0)
+      # This shouldn't fail; in particular, the learning rate tensor should
+      # be an EagerTensor once again, not a graph Tensor.
+      self.assertEqual(float(step()), -1.0)
+
+
+class MomentumOptimizerTest(test.TestCase):
+
+  def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
+    accum = accum * momentum - g * lr
+    var += (accum * momentum - g * lr)
+    return var, accum
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasic(self):
+    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                      dtype=dtype,
+                                                      name="var0")
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
+                                                      dtype=dtype,
+                                                      name="var1")
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        learning_rate = 2.0
+        momentum = 0.9
+        mom_opt = gradient_descent.SGD(
+            learning_rate=learning_rate, momentum=momentum)
+        # self.assertFalse(mom_opt._initial_decay)
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+
+        # Check we have slots
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
+
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(mom_update)
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([-0.2, -0.2]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02, -0.02]), self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
+        # Step 2: the momentum accumulators contain the previous update.
+        self.evaluate(mom_update)
+        if context.executing_eagerly():
+          mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+            self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                      dtype=dtype,
+                                                      name="var0")
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0],
+                                                      dtype=dtype,
+                                                      name="var1")
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        loss = 5 * var0 * var0 + 3 * var1
+        mom_op = gradient_descent.SGD(
+            learning_rate=2.0, momentum=0.9, nesterov=True)
+        opt_op = mom_op.minimize(loss, [var0, var1])
+        variables.global_variables_initializer().run()
+        for _ in range(1, 5):
+          opt_op.run()
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+              var1_np, accum1_np, 3, 2.0, 0.9)
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSparseNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        grads = []
+        for t in range(1, 5):
+          grads.append(var0_np * 10)
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+              var1_np, accum1_np, 3, 2.0, 0.9)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, dtype=dtype, name="var0")
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, dtype=dtype, name="var1")
+        mom_op = gradient_descent.SGD(
+            learning_rate=2.0, momentum=0.9, nesterov=True)
+        x_feed = array_ops.placeholder(dtype)
+        y_feed = ops.IndexedSlices(x_feed, constant_op.constant([0, 1]),
+                                   constant_op.constant([2]))
+        grads_and_vars = [(y_feed, var0),
+                          (constant_op.constant([3.0, 3.0], dtype=dtype), var1)]
+        opt_update = mom_op.apply_gradients(grads_and_vars)
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_update.run(feed_dict={x_feed: grads[t - 1]})
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(
+              var1_np, accum1_np, 3, 2.0, 0.9)
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        return pred * pred
+
+      # pylint: enable=cell-var-from-loop
+
+      opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.0)
+      sgd_op = opt.minimize(loss, [var0])
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeWith2DIndicesForEmbeddingLookup(self):
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+
+    def loss():
+      return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
+
+    opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.0)
+    sgd_op = opt.minimize(loss, [var0])
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(sgd_op)
+    self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
+
+  @test_util.run_deprecated_v1
+  def testTensorLearningRateAndMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = gradient_descent.SGD(
+            learning_rate=constant_op.constant(2.0),
+            momentum=constant_op.constant(0.9))
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Check we have slots
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([-0.2, -0.2]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02, -0.02]), self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+            self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSparse(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
+        var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([[.1, .1]], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([4, 2]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([[.01, .01], [.01, .01]], dtype=dtype),
+            constant_op.constant([2, 3]), constant_op.constant([4, 2]))
+        mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Check we have slots
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([0, 0], self.evaluate(var0)[0])
+        self.assertAllClose([0, 0], self.evaluate(var0)[1])
+        self.assertAllClose([1, 1], self.evaluate(var1)[2])
+
+        # Step 1: the momentum accumulators are 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([0, 0]),
+            self.evaluate(slot0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.0 * .1, -2.0 * .1]),
+            self.evaluate(slot0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.0 * .01, -2.0 * .01]),
+            self.evaluate(slot1)[2])
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([0, 0]),
+            self.evaluate(var0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]),
+            self.evaluate(var0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]),
+            self.evaluate(var1)[2])
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+            self.evaluate(slot0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]),
+            self.evaluate(slot1)[2])
+        # Check that the parameters have been updated.
+        self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]),
+            self.evaluate(var0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]),
+            self.evaluate(var1)[2])
+
+  @test_util.run_deprecated_v1
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = gradient_descent.SGD(learning_rate=2.0, momentum=0.9)
+        mom_update1 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        mom_update2 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update1.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([-0.2, -0.2]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02, -0.02]), self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
+        # Step 2: the second momentum accumulators contain the previous update.
+        mom_update2.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+            self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConfig(self):
+    with self.cached_session():
+      opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9, nesterov=True)
+      config = opt.get_config()
+      opt2 = gradient_descent.SGD.from_config(config)
+      # assert both are equal float values.
+      self.assertEqual(
+          opt._get_hyper("learning_rate"), opt2._get_hyper("learning_rate"))
+      self.assertEqual(opt._get_hyper("momentum"), opt2._get_hyper("momentum"))
+      # self.assertEqual(opt._get_hyper("decay"), opt2._get_hyper("decay"))
+      var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
+      loss = lambda: 3 * var0
+      # learning rate variable created when calling minimize.
+      opt.minimize(loss, [var0])
+      self.evaluate(variables.global_variables_initializer())
+      config = opt.get_config()
+      opt3 = gradient_descent.SGD.from_config(config)
+      self.assertEqual(
+          self.evaluate(opt._get_hyper("learning_rate")),
+          opt3._get_hyper("learning_rate"))
+      self.assertEqual(
+          self.evaluate(opt._get_hyper("momentum")),
+          opt3._get_hyper("momentum"))
+      # self.assertEqual(
+      #     self.evaluate(opt._get_hyper("decay")), opt3._get_hyper("decay"))
+      self.assertTrue(opt3._nesterov)
+
+  def testNesterovWithoutMomentum(self):
+    with self.assertRaisesRegexp(ValueError, "must be between"):
+      gradient_descent.SGD(learning_rate=1.0, momentum=2.0)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/nadam.py b/tensorflow/python/keras/optimizer_v2/nadam.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b095e0dc950c7e68414c1657847b891652a5ba
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/nadam.py
@@ -0,0 +1,143 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Nadam for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+
+
+class Nadam(adam.Adam):
+  r"""Optimizer that implements the NAdam algorithm.
+
+  Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
+  Nesterov momentum.
+
+  Initialization:
+
+  $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+  $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+  $$t := 0 \text{(Initialize timestep)}$$
+
+  Computes:
+  $$t := t + 1$$
+  $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+  $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+  $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+  $$m_bar_t := beta_1 * v_t + (1 - beta_1) * g$$
+  $$theta_t := theta_{t-1} - lr_t * m_bar_t / (\sqrt{v_t} + \epsilon)$$
+
+  gradient is evaluated at theta(t) + momentum * v(t), and the variables always
+  store theta + beta_1 * m / sqrt(v) instead of theta.
+
+  References
+    See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-7,
+               name='Nadam',
+               **kwargs):
+    """Construct a new Nadam optimizer.
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta_1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta_2: A float value or a constant float tensor. The exponential decay
+        rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adamax".
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+    """
+
+    # pylint: disable=useless-super-delegation
+    super(Nadam, self).__init__(
+        learning_rate=learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon,
+        amsgrad=False,
+        name=name,
+        **kwargs)
+    # pylint: enable=useless-super-delegation
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    return training_ops.resource_apply_adam(
+        var.handle,
+        m.handle,
+        v.handle,
+        beta_1_power,
+        beta_2_power,
+        lr_t,
+        beta_1_t,
+        beta_2_t,
+        self._get_hyper('epsilon', var_dtype),
+        grad,
+        use_locking=self._use_locking,
+        use_nesterov=True)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_scaled_g_values = grad * (1 - beta_1_t)
+    m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
+      # m_bar = (1 - beta1) * g_t + beta1 * m_t
+      m_bar = m_scaled_g_values + beta_1_t * array_ops.gather(m_t, indices)
+
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, 'v')
+    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
+    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
+
+    v_t_slice = array_ops.gather(v_t, indices)
+    v_sqrt = math_ops.sqrt(v_t_slice)
+    var_update = self._resource_scatter_add(var, indices,
+                                            -lr * m_bar / (v_sqrt + epsilon_t))
+    return control_flow_ops.group(*[var_update, m_bar, v_t])
diff --git a/tensorflow/python/keras/optimizer_v2/nadam_test.py b/tensorflow/python/keras/optimizer_v2/nadam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d991e3117cad4530ffb1f3a4315b49dc46d26bfc
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/nadam_test.py
@@ -0,0 +1,213 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Nadam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import nadam
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def get_beta_accumulators(opt, dtype):
+  local_step = math_ops.cast(opt.iterations + 1, dtype)
+  beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
+  beta_1_power = math_ops.pow(beta_1_t, local_step)
+  beta_2_t = math_ops.cast(opt._get_hyper("beta_2"), dtype)
+  beta_2_power = math_ops.pow(beta_2_t, local_step)
+  return (beta_1_power, beta_2_power)
+
+
+def nadam_update_numpy(param,
+                       g_t,
+                       t,
+                       m,
+                       v,
+                       alpha=0.001,
+                       beta1=0.9,
+                       beta2=0.999,
+                       epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  m_bar = (1 - beta1) * g_t + beta1 * m_t
+
+  param_t = param - alpha_t * m_bar / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class NadamOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    sparse_epsilon = 1e-7
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = nadam.Nadam(epsilon=sparse_epsilon)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Nadam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(
+              var0_np, grads0_np, t, m0, v0, epsilon=sparse_epsilon)
+          var1_np, m1, v1 = nadam_update_numpy(
+              var1_np, grads1_np, t, m1, v1, epsilon=sparse_epsilon)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @test_util.run_deprecated_v1
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  @test_util.run_deprecated_v1
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def doTestBasic(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = nadam.Nadam()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Nadam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @test_util.run_deprecated_v1
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  @test_util.run_deprecated_v1
+  def testBasicWithLearningRateDecay(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        learning_rate = 0.001
+        decay = 0.5
+        opt = nadam.Nadam(learning_rate=learning_rate, decay=decay)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Nadam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
+          update.run()
+
+          lr = learning_rate / (1 + decay * t)
+          var0_np, m0, v0 = nadam_update_numpy(
+              var0_np, grads0_np, t, m0, v0, alpha=lr)
+          var1_np, m1, v1 = nadam_update_numpy(
+              var1_np, grads1_np, t, m1, v1, alpha=lr)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..15f3009a4af4270f2f845f6c5bf945f330efe545
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -0,0 +1,656 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Version 2 of class Optimizer."""
+# pylint: disable=g-bad-name
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.engine import base_layer_utils
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.util import nest
+
+
+@six.add_metaclass(abc.ABCMeta)
+class OptimizerV2(optimizer_v1.Optimizer):
+  """Updated base class for optimizers.
+
+  This class defines the API to add Ops to train a model.  You never use this
+  class directly, but instead instantiate one of its subclasses such as
+  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.
+
+  ### Usage
+
+  ```python
+  # Create an optimizer with the desired parameters.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+  # Add Ops to the graph to minimize a cost by updating a list of variables.
+  # "cost" is a Tensor, and the list of variables contains tf.Variable
+  # objects.
+  opt_op = opt.minimize(cost, var_list=<list of variables>)
+  ```
+
+  In the training program you will just have to run the returned Op.
+
+  ```python
+  # Execute opt_op to do one step of training:
+  opt_op.run()
+  ```
+
+  ### Processing gradients before applying them.
+
+  Calling `minimize()` takes care of both computing the gradients and
+  applying them to the variables.  If you want to process the gradients
+  before applying them you can instead use the optimizer in three steps:
+
+  1.  Compute the gradients with `compute_gradients()`.
+  2.  Process the gradients as you wish.
+  3.  Apply the processed gradients with `apply_gradients()`.
+
+  Example:
+
+  ```python
+  # Create an optimizer.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+
+  # Compute the gradients for a list of variables.
+  grads_and_vars = opt.compute_gradients(loss, <list of variables>)
+
+  # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
+  # need to the 'gradient' part, for example cap them, etc.
+  capped_grads_and_vars = [(MyCapper(gv[0]), gv[1]) for gv in grads_and_vars]
+
+  # Ask the optimizer to apply the capped gradients.
+  opt.apply_gradients(capped_grads_and_vars)
+  ```
+
+  ### Slots
+
+  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
+  allocate and manage additional variables associated with the variables to
+  train.  These are called <i>Slots</i>.  Slots have names and you can ask the
+  optimizer for the names of the slots that it uses.  Once you have a slot name
+  you can ask the optimizer for the variable it created to hold the slot value.
+
+  This can be useful if you want to log debug a training algorithm, report stats
+  about the slots, etc.
+
+  ### Hyper parameters
+
+  These are arguments passed to the optimizer subclass constructor
+  (the `__init__` method), and then passed to `self._set_hyper()`.
+  They can be either regular Python values (like 1.0), tensors, or
+  callables. If they are callable, the callable will be called during
+  `apply_gradients()` to get the value for the hyper parameter.
+
+  """
+
+  def __init__(self, name, **kwargs):
+    """Create a new Optimizer.
+
+    This must be called by the constructors of subclasses.
+    Note that Optimizer instances should not bind to a single graph,
+    and so shouldn't keep Tensors as member variables. Generally
+    you should be able to use the _set_hyper()/state.get_hyper()
+    facility instead.
+
+    This class in stateful and thread-compatible.
+
+    Args:
+      name: A non-empty string.  The name to use for accumulators created
+        for the optimizer.
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+
+    Raises:
+      ValueError: If name is malformed.
+      RuntimeError: If _create_slots has been overridden instead of
+          _create_vars.
+    """
+    self._use_locking = True
+    super(OptimizerV2, self).__init__(self._use_locking, name)
+    self._hyper = {}
+    # dict: {variable name : {slot name : variable}}
+    self._slots = {}
+    self._weights = []
+
+    decay = kwargs.pop("decay", 0.0)
+    if decay < 0.:
+      raise ValueError("decay cannot be less than 0: {}".format(decay))
+    self._initial_decay = decay
+
+    self._prepared = False
+
+  def minimize(self,
+               loss,
+               var_list,
+               aggregation_method=None,
+               colocate_gradients_with_ops=False,
+               name=None,
+               grad_loss=None):
+    """Add operations to minimize `loss` by updating `var_list`.
+
+    This method simply combines calls `compute_gradients()` and
+    `apply_gradients()`. If you want to process the gradient before applying
+    them call `compute_gradients()` and `apply_gradients()` explicitly instead
+    of using this function.
+
+    Args:
+      loss: A `Tensor` containing the value to minimize.
+      var_list: list or tuple of `Variable` objects to update to minimize
+        `loss`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with the
+        corresponding op.
+      name: Optional name for the returned operation.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+
+    Returns:
+      An Operation that updates the variables in `var_list`.  If `global_step`
+      was not `None`, that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If some of the variables are not `Variable` objects.
+
+    @compatibility(eager)
+    When eager execution is enabled, `loss` should be a Python function that
+    takes no arguments and computes the value to be minimized. Minimization (and
+    gradient computation) is done with respect to the elements of `var_list` if
+    not None, else with respect to any trainable variables created during the
+    execution of the `loss` function. `gate_gradients`, `aggregation_method`,
+    `colocate_gradients_with_ops` and `grad_loss` are ignored when eager
+    execution is enabled.
+    @end_compatibility
+    """
+    grads_and_vars = self.compute_gradients(
+        loss,
+        var_list=var_list,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        grad_loss=grad_loss)
+
+    return self.apply_gradients(grads_and_vars, name=name)
+
+  def compute_gradients(self,
+                        loss,
+                        var_list,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None,
+                        stop_gradients=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+
+    This is the first part of `minimize()`.  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a `Tensor`, an
+    `IndexedSlices`, or `None` if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize or a callable taking no
+        arguments which returns the value to minimize. When eager execution is
+        enabled it must be a callable.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph under
+        the key `GraphKeys.TRAINABLE_VARIABLES`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with the
+        corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+
+    Raises:
+      TypeError: If `var_list` contains anything else than `Variable` objects.
+      ValueError: If some arguments are invalid, or var_list is None.
+      RuntimeError: If called with eager execution enabled and `loss` is
+        not callable.
+
+    @compatibility(eager)
+    When eager execution is enabled, `aggregation_method`, and
+    `colocate_gradients_with_ops` are ignored.
+    @end_compatibility
+    """
+    var_list = nest.flatten(var_list)
+    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
+    if callable(loss):
+      with backprop.GradientTape() as tape:
+        tape.watch(var_list)
+        loss_value = loss()
+        loss_value = self._scale_loss(loss_value)
+      grads = tape.gradient(loss_value, var_list, grad_loss)
+    else:
+      if context.executing_eagerly():
+        raise RuntimeError("`loss` passed to Optimizer.compute_gradients "
+                           "should be a function when eager execution is "
+                           "enabled.")
+      loss = self._scale_loss(loss)
+      self._assert_valid_dtypes([loss])
+      if grad_loss is not None:
+        self._assert_valid_dtypes([grad_loss])
+      grads = gradients.gradients(
+          loss,
+          var_list,
+          grad_ys=grad_loss,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          stop_gradients=stop_gradients)
+
+    grads_and_vars = list(zip(grads, var_list))
+    self._assert_valid_dtypes([
+        v for g, v in grads_and_vars
+        if g is not None and v.dtype != dtypes.resource
+    ])
+
+    return grads_and_vars
+
+  @staticmethod
+  def _scale_loss(loss_value):
+    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
+      num_replicas = \
+        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      if num_replicas > 1:
+        loss_value *= (1. / num_replicas)
+    return loss_value
+
+  def apply_gradients(self, grads_and_vars, name=None):
+    """Apply gradients to variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      name: Optional name for the returned operation.  Default to the name
+        passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    grads_and_vars = _filter_grads(grads_and_vars)
+    var_list = [v for (_, v) in grads_and_vars]
+    if distribute_ctx.has_distribution_strategy():
+      reduced_grads = merge_grads(grads_and_vars)
+      grads_and_vars = zip(reduced_grads, var_list)
+
+    with ops.init_scope():
+      self._prepare()
+      self._create_slots(var_list)
+    update_ops = []
+
+    def update_grad_to_var(grad, var):
+      """Apply gradient to variable."""
+      if isinstance(var, ops.Tensor):
+        raise NotImplementedError("Trying to update a Tensor ", var)
+      if isinstance(grad, ops.IndexedSlices):
+        if var.constraint is not None:
+          raise RuntimeError(
+              "Cannot use a constraint function on a sparse variable.")
+        return self._resource_apply_sparse_duplicate_indices(
+            grad.values, var, grad.indices)
+      update_op = self._resource_apply_dense(grad, var)
+      if var.constraint is not None:
+        with ops.control_dependencies([update_op]):
+          return var.assign(var.constraint(var))
+      else:
+        return update_op
+
+    with ops.name_scope(name, self._name) as name:
+      for grad, var in grads_and_vars:
+        scope_name = ("" if ops.executing_eagerly_outside_functions() else
+                      "_" + var.op.name)
+        with ops.name_scope("update" + scope_name):
+          update_ops.append(update_grad_to_var(grad, var))
+      # control dependencies does not work in per replica mode, please change
+      # this once b/118841692 is fixed.
+      # with ops.control_dependencies(update_ops):
+      #   apply_updates = self._iterations.assign_add(1).op
+      apply_updates = merge_update_step(update_ops, self.iterations)
+      return apply_updates
+
+  def get_updates(self, loss, params):
+    return [self.minimize(loss, params)]
+
+  def _set_hyper(self, name, value):
+    """set hyper `name` to value. value can be callable, tensor, numeric."""
+    if name not in self._hyper:
+      self._hyper[name] = value
+    else:
+      prev_value = self._hyper[name]
+      if callable(prev_value) or isinstance(prev_value,
+                                            (ops.Tensor, int, float)):
+        self._hyper[name] = value
+      else:
+        backend.set_value(self._hyper[name], value)
+
+  def _get_hyper(self, name, dtype=None):
+    value = self._hyper[name]
+    if callable(value):
+      value = value()
+    if dtype:
+      return math_ops.cast(value, dtype)
+    else:
+      return value
+
+  def __getattribute__(self, name):
+    """Overridden to support hyperparameter access."""
+    try:
+      return super(OptimizerV2, self).__getattribute__(name)
+    except AttributeError as e:
+      # Needed to avoid infinite recursion with __setattr__.
+      if name == "_hyper":
+        raise e
+      # Backwards compatibility with Keras optimizers.
+      if name == "lr":
+        name = "learning_rate"
+      if name in self._hyper:
+        return self._hyper[name]
+      raise e
+
+  def __setattr__(self, name, value):
+    """Override setattr to support dynamic hyperparameter setting."""
+    # Backwards compatibility with Keras optimizers.
+    if name == "lr":
+      name = "learning_rate"
+    if hasattr(self, "_hyper") and name in self._hyper:
+      self._set_hyper(name, value)
+    else:
+      super(OptimizerV2, self).__setattr__(name, value)
+
+  def add_slot(self, var, slot_name, initializer="zeros"):
+    var_key = _var_key(var)
+    slot_dict = self._slots.setdefault(var_key, {})
+    if slot_name not in slot_dict:
+      slot_key = _get_slot_key_from_var(var, slot_name)
+      weight = self.add_weight(
+          name=slot_key,
+          shape=var.shape,
+          dtype=var.dtype,
+          initializer=initializer)
+      slot_dict[slot_name] = weight
+      self._weights.append(weight)
+
+  def get_slot(self, var, slot_name):
+    var_key = _var_key(var)
+    slot_dict = self._slots[var_key]
+    return slot_dict[slot_name]
+
+  def _prepare(self):
+    if self._prepared:
+      return
+    with ops.device("cpu:0"):
+      self._iterations = self.add_weight(
+          "iter",
+          shape=[],
+          dtype=dtypes.int64,
+          trainable=False,
+          aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
+      self._weights.append(self._iterations)
+    for name, value in self._hyper.items():
+      if isinstance(value, ops.Tensor) or callable(value):
+        pass
+      else:
+        self._hyper[name] = self.add_weight(
+            name,
+            shape=[],
+            trainable=False,
+            initializer=value,
+            aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
+    self._prepared = True
+
+  @property
+  def iterations(self):
+    if not self._prepared:
+      self._prepare()
+    return self._iterations
+
+  def _decayed_lr(self, var_dtype):
+    """Get decayed learning rate as a Tensor with dtype=var_dtype."""
+    lr_t = self._get_hyper("learning_rate", var_dtype)
+    if self._initial_decay > 0.:
+      local_step = math_ops.cast(self.iterations, var_dtype)
+      decay_t = self._get_hyper("decay", var_dtype)
+      lr_t = lr_t / (1. + decay_t * local_step)
+    return lr_t
+
+  @abc.abstractmethod
+  def get_config(self):
+    """Returns the config of the optimimizer.
+
+    An optimizer config is a Python dictionary (serializable)
+    containing the configuration of an optimizer.
+    The same optimizer can be reinstantiated later
+    (without any saved state) from this configuration.
+
+    Returns:
+        Python dictionary.
+    """
+    return {"name": self._name}
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Creates an optimizer from its config.
+
+    This method is the reverse of `get_config`,
+    capable of instantiating the same optimizer from the config
+    dictionary.
+
+    Arguments:
+        config: A Python dictionary, typically the output of get_config.
+        custom_objects: A Python dictionary mapping names to additional Python
+          objects used to create this optimizer, such as a function used for a
+          hyperparameter.
+
+    Returns:
+        An optimizer instance.
+    """
+    if "lr" in config:
+      config["learning_rate"] = config.pop("lr")
+    return cls(**config)
+
+  def _serialize_hyperparameter(self, hyperparameter_name):
+    """Serialize a hyperparameter that can be a float, callable, or Tensor."""
+    value = self._get_hyper(hyperparameter_name)
+    if callable(value):
+      return value()
+    if isinstance(value, (ops.Tensor, tf_variables.Variable)):
+      return backend.get_value(value)
+    return value
+
+  def variables(self):
+    """Returns variables of this Optimizer based on the order created."""
+    return self._weights
+
+  @property
+  def weights(self):
+    """Returns variables of this Optimizer based on the order created."""
+    return self._weights
+
+  def get_weights(self):
+    params = self.weights
+    return backend.batch_get_value(params)
+
+  # TODO(tanzheny): Maybe share this logic with base_layer.
+  def set_weights(self, weights):
+    params = self.weights
+    if len(params) != len(weights):
+      raise ValueError(
+          "You called `set_weights(weights)` on optimizer " + self._name +
+          " with a  weight list of length " + str(len(weights)) +
+          ", but the optimizer was expecting " + str(len(params)) +
+          " weights. Provided weights: " + str(weights)[:50] + "...")
+    if not params:
+      return
+    weight_value_tuples = []
+    param_values = backend.batch_get_value(params)
+    for pv, p, w in zip(param_values, params, weights):
+      if pv.shape != w.shape:
+        raise ValueError("Optimizer weight shape " + str(pv.shape) +
+                         " not compatible with "
+                         "provided weight shape " + str(w.shape))
+      weight_value_tuples.append((p, w))
+    backend.batch_set_value(weight_value_tuples)
+
+  def add_weight(self,
+                 name,
+                 shape,
+                 dtype=None,
+                 initializer="zeros",
+                 trainable=None,
+                 synchronization=tf_variables.VariableSynchronization.AUTO,
+                 aggregation=tf_variables.VariableAggregation.NONE):
+
+    if dtype is None:
+      dtype = dtypes.float32
+    if isinstance(initializer, six.string_types) or callable(initializer):
+      initializer = initializers.get(initializer)
+
+    if synchronization == tf_variables.VariableSynchronization.ON_READ:
+      if trainable:
+        raise ValueError(
+            "Synchronization value can be set to "
+            "VariableSynchronization.ON_READ only for non-trainable variables. "
+            "You have specified trainable=True and "
+            "synchronization=VariableSynchronization.ON_READ.")
+      else:
+        # Set trainable to be false when variable is to be synced on read.
+        trainable = False
+    elif trainable is None:
+      trainable = True
+
+    variable = self._add_variable_with_custom_getter(
+        name=name,
+        shape=shape,
+        getter=base_layer_utils.make_variable,
+        overwrite=True,
+        initializer=initializer,
+        dtype=dtype,
+        trainable=trainable,
+        use_resource=True,
+        synchronization=synchronization,
+        aggregation=aggregation)
+    backend.track_variable(variable)
+
+    return variable
+
+
+def _filter_grads(grads_and_vars):
+  """Filter out iterable with grad equal to None."""
+  grads_and_vars = tuple(grads_and_vars)
+  if not grads_and_vars:
+    raise ValueError("No variables provided.")
+  filtered = []
+  vars_with_empty_grads = []
+  for grad, var in grads_and_vars:
+    if grad is None:
+      vars_with_empty_grads.append(var)
+    else:
+      filtered.append((grad, var))
+  filtered = tuple(filtered)
+  if not filtered:
+    raise ValueError("No gradients provided for any variable: %s." %
+                     ([v.name for _, v in grads_and_vars],))
+  if vars_with_empty_grads:
+    logging.warning(
+        ("Gradients does not exist for variables %s when minimizing the loss."),
+        ([v.name for v in vars_with_empty_grads]))
+  return filtered
+
+
+def merge_update_step(update_ops, local_step):
+  """Merge local step counter update from different replicas."""
+
+  def merge_update_step_fn(strategy, update_ops, local_step):
+    merged_ops = []
+    for update_op in update_ops:
+      merged_ops.append(strategy.group(update_op))
+    with ops.control_dependencies(merged_ops):
+      incre_op = local_step.assign_add(1).op
+    return incre_op
+
+  return distribute_ctx.get_replica_context().merge_call(
+      merge_update_step_fn, args=(update_ops, local_step))
+
+
+def merge_grads(grads_and_vars):
+  """Merge gradients from different replicas."""
+
+  def merge_grad_fn(strategy, grads_and_vars):
+    reduced_grads = strategy.batch_reduce(ds_reduce_util.ReduceOp.SUM,
+                                          grads_and_vars)
+    return reduced_grads
+
+  return distribute_ctx.get_replica_context().merge_call(
+      merge_grad_fn, args=(grads_and_vars,))
+
+
+def _var_key(var):
+  """Key for representing a primary variable, for looking up slots.
+
+  In graph mode the name is derived from the var shared name.
+  In eager mode the name is derived from the var unique id.
+  If distribution strategy exists, get the primary variable first.
+
+  Args:
+    var: the variable.
+
+  Returns:
+    the unique name of the variable.
+  """
+
+  # pylint: disable=protected-access
+  if distribute_ctx.has_distribution_strategy() and hasattr(
+      var, "_primary_var"):
+    var = var._primary_var
+  if hasattr(var, "op"):
+    return var._shared_name
+  return var._unique_id
+
+
+def _get_slot_key_from_var(var, slot_name):
+  """Get the slot key for the variable: var_name/slot_name."""
+
+  name = _var_key(var)
+  return name + "/" + slot_name
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..158577fe64afefaff28ee644caf084cb40d429ea
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -0,0 +1,657 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for OptimizerV2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.engine import input_layer
+from tensorflow.python.keras.engine import saving
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.training import momentum
+
+
+class OptimizerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasic(self):
+    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+        sgd = gradient_descent.SGD(3.0)
+
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Run 1 step of sgd through optimizer
+        opt_op = sgd.minimize(loss, var_list=[var0, var1])
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(opt_op)
+        # Validate updated params
+        self.assertAllClose([-14., -13.], self.evaluate(var0))
+        self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAdaptiveLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+
+      sgd = gradient_descent.SGD(1.0)
+
+      self.evaluate(variables.global_variables_initializer())
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+      # Run 1 step of sgd through optimizer
+      opt_op = sgd.minimize(loss, [var0, var1])
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(opt_op)
+      # Validate updated params
+      # var0 = [1., 2.] - 1.0 * [5, 5]
+      self.assertAllClose([-4., -3.], self.evaluate(var0))
+      # var1 = [3., 4.] - 1.0 * [3, 3]
+      self.assertAllClose([0., 1.], self.evaluate(var1))
+
+      sgd.learning_rate = 0.5
+      if context.executing_eagerly():
+        sgd.minimize(loss, [var0, var1])
+      else:
+        self.evaluate(opt_op)
+      # Validate updated params
+      # var0 = [-4., -3.] - 0.5 * [5, 5]
+      self.assertAllClose([-6.5, -5.5], self.evaluate(var0))
+      # var1 = [0., 1.] - 0.5 * [3, 3]
+      self.assertAllClose([-1.5, -0.5], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAggregationMethod(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+        sgd = gradient_descent.SGD(3.0)
+
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Run 1 step of sgd through optimizer
+        opt_op = sgd.minimize(
+            loss,
+            var_list=[var0, var1],
+            aggregation_method=gradients_impl.AggregationMethod
+            .EXPERIMENTAL_ACCUMULATE_N)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(opt_op)
+        # Validate updated params
+        self.assertAllClose([-14., -13.], self.evaluate(var0))
+        self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testPrecomputedGradient(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+        grad_loss = constant_op.constant([42, -42], dtype=dtype)
+        sgd = gradient_descent.SGD(3.0)
+
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Run 1 step of sgd through optimizer
+        opt_op = sgd.minimize(loss, var_list=[var0, var1], grad_loss=grad_loss)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(opt_op)
+        # Validate updated params
+        self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)],
+                            self.evaluate(var0))
+        self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)],
+                            self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNoGradients(self):
+    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        loss = lambda: 5 * var0  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+        sgd_op = gradient_descent.SGD(3.0)
+        with self.assertRaisesRegexp(ValueError, 'No gradients'):
+          # var1 has no gradient
+          sgd_op.minimize(loss, var_list=[var1])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNoGradientsForAnyVariables_Minimize(self):
+    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        loss = lambda: constant_op.constant(5.0)
+        if not context.executing_eagerly():
+          loss = loss()
+
+        sgd_op = gradient_descent.SGD(3.0)
+        with self.assertRaisesRegexp(ValueError,
+                                     'No gradients provided for any variable'):
+          sgd_op.minimize(loss, var_list=[var0, var1])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNoGradientsForAnyVariables_ApplyGradients(self):
+    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        sgd_op = gradient_descent.SGD(3.0)
+        with self.assertRaisesRegexp(ValueError,
+                                     'No gradients provided for any variable'):
+          sgd_op.apply_gradients([(None, var0), (None, var1)])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGradientsAsVariables(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+
+        sgd = gradient_descent.SGD(3.0)
+        grads_and_vars = sgd.compute_gradients(loss, [var0, var1])
+        # Convert gradients to tf.Variables
+        converted_grads = [
+            resource_variable_ops.ResourceVariable(
+                array_ops.zeros([2], dtype), name='c_%d_%d' % (i, j))
+            for j, gv in enumerate(grads_and_vars)
+        ]
+        convert_ops = [
+            state_ops.assign(converted_grads[j], gv[0])
+            for j, gv in enumerate(grads_and_vars)
+        ]
+
+        # Run convert_ops to achieve the gradients converting
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(convert_ops)
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 1 step of sgd through optimizer
+        converted_grads_and_vars = list(zip(converted_grads, [var0, var1]))
+        opt_op = sgd.apply_gradients(converted_grads_and_vars)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(convert_ops)
+        self.evaluate(opt_op)
+
+        # Validate updated params
+        self.assertAllClose([-14., -13.], self.evaluate(var0))
+        self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testComputeGradientsWithTensors(self):
+    with self.cached_session():
+      x = ops.convert_to_tensor(1.0)
+
+      def f():
+        return x * x
+
+      sgd = gradient_descent.SGD(3.0)
+      grads_and_vars = sgd.compute_gradients(f, [x])
+      self.assertEqual(1, len(grads_and_vars))
+      grad, x_as_var = grads_and_vars[0]
+      self.assertIs(x, x_as_var)
+      self.assertEqual(2.0, self.evaluate(grad))
+
+      with self.assertRaises(NotImplementedError):
+        sgd.apply_gradients(grads_and_vars)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstraint(self):
+    constraint_01 = lambda x: clip_ops.clip_by_value(x, -0.1, 0.)
+    constraint_0 = lambda x: clip_ops.clip_by_value(x, 0., 1.)
+    with self.cached_session():
+      var0 = variables.Variable([1.0, 2.0],
+                                constraint=constraint_01)
+      var1 = variables.Variable([3.0, 4.0],
+                                constraint=constraint_0)
+      loss = lambda: 5 * var0 + 3 * var1
+      if not context.executing_eagerly():  # pylint: disable=cell-var-from-loop
+        loss = loss()
+      sgd = gradient_descent.SGD(3.0)
+
+      self.evaluate(variables.global_variables_initializer())
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+      # Run 1 step of sgd through optimizer
+      opt_op = sgd.minimize(loss, var_list=[var0, var1])
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(opt_op)
+      # Validate updated params
+      self.assertAllClose([-0.1, -0.1], self.evaluate(var0))
+      self.assertAllClose([0., 0.], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testIterationWithoutMinimize(self):
+    with self.cached_session():
+      sgd = gradient_descent.SGD(3.0)
+      self.evaluate(sgd.iterations.initializer)
+      self.assertEqual(0, self.evaluate(sgd.iterations))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSerializationWithinDefun(self):
+    with self.cached_session():
+      sgd = gradient_descent.SGD(3.0)
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                    dtype=dtypes.float32)
+      loss = lambda: 3 * var0
+      sgd.minimize(loss, [var0])
+
+      def serialize():
+        config = sgd.get_config()
+        gradient_descent.SGD.from_config(config)
+
+      compiled_serialize = function.defun(serialize)
+      with self.assertRaisesRegexp(RuntimeError, 'inside Tensorflow graph'):
+        compiled_serialize()
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConfig(self):
+    with self.cached_session():
+      opt = gradient_descent.SGD(learning_rate=1.0)
+      config = opt.get_config()
+      opt2 = gradient_descent.SGD.from_config(config)
+      # assert both are equal float values.
+      self.assertEqual(
+          opt._get_hyper('learning_rate'), opt2._get_hyper('learning_rate'))
+      var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
+      loss = lambda: 3 * var0
+      # learning rate variable created when calling minimize.
+      opt.minimize(loss, [var0])
+      self.evaluate(variables.global_variables_initializer())
+      config = opt.get_config()
+      opt3 = gradient_descent.SGD.from_config(config)
+      self.assertEqual(
+          self.evaluate(opt._get_hyper('learning_rate')),
+          opt3._get_hyper('learning_rate'))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testWeights(self):
+    with self.cached_session():
+      opt1 = adam.Adam(learning_rate=1.0)
+      var1 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                    dtype=dtypes.float32)
+      loss1 = lambda: 3 * var1
+      opt_op_1 = opt1.minimize(loss1, [var1])
+      self.evaluate(variables.global_variables_initializer())
+      config = opt1.get_config()
+      opt2 = adam.Adam.from_config(config)
+      var2 = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                    dtype=dtypes.float32)
+      loss2 = lambda: 3 * var2
+      opt_op_2 = opt2.minimize(loss2, [var2])
+      weights = opt1.get_weights()
+
+      # Assert set_weights and both variables get updated to same value.
+      self.evaluate(variables.global_variables_initializer())
+      opt2.set_weights(weights)
+      self.evaluate([opt_op_1, opt_op_2])
+      self.assertAllClose(self.evaluate(var1), self.evaluate(var2))
+      self.assertEqual(1, self.evaluate(opt1.iterations))
+      self.assertEqual(1, self.evaluate(opt2.iterations))
+
+      var3 = resource_variable_ops.ResourceVariable([1.0, 2.0, 3.0],
+                                                    dtype=dtypes.float32)
+      var4 = resource_variable_ops.ResourceVariable([4.0, 5.0, 6.0],
+                                                    dtype=dtypes.float32)
+      loss3 = lambda: 3 * var3 + 5 * var4
+      opt_op_3 = opt1.minimize(loss3, [var3, var4])
+
+      # Assert set_weights with ValueError since weight list does not match.
+      self.evaluate(variables.global_variables_initializer())
+      weights = opt1.get_weights()
+      with self.assertRaisesRegexp(ValueError, 'but the optimizer was'):
+        opt2.set_weights(weights)
+
+      # Assert set_weights and variables get updated to same value.
+      var5 = resource_variable_ops.ResourceVariable([1.0, 2.0, 3.0],
+                                                    dtype=dtypes.float32)
+      var6 = resource_variable_ops.ResourceVariable([4.0, 5.0, 6.0],
+                                                    dtype=dtypes.float32)
+      loss4 = lambda: 3 * var5 + 5 * var6
+      opt_op_4 = opt2.minimize(loss4, [var5, var6])
+      self.evaluate(variables.global_variables_initializer())
+      opt2.set_weights(weights)
+      self.evaluate([opt_op_3, opt_op_4])
+      self.assertAllClose(
+          self.evaluate([var3, var4]), self.evaluate([var5, var6]))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGettingHyperParameters(self):
+    opt = adam.Adam(learning_rate=1.0)
+    var = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                 dtype=dtypes.float32)
+    loss = lambda: 3 * var
+    opt_op = opt.minimize(loss, [var])
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(opt_op)
+
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(1.0, lr)
+
+    opt.lr = 2.0
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(2.0, lr)
+
+    self.evaluate(opt.lr.assign(3.0))
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(3.0, lr)
+
+    with self.assertRaises(AttributeError):
+      opt.not_an_attr += 3
+
+  @test_util.run_in_graph_and_eager_modes
+  def testOptimizerWithKerasModel(self):
+    a = input_layer.Input(shape=(3,), name='input_a')
+    b = input_layer.Input(shape=(3,), name='input_b')
+
+    dense = core.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = core.Dropout(0.5, name='dropout')(c)
+
+    model = training.Model([a, b], [d, e])
+
+    optimizer = gradient_descent.SGD(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(optimizer, loss, metrics=['mae'])
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    model.fit([input_a_np, input_b_np], [output_d_np, output_e_np],
+              epochs=1,
+              batch_size=5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testOptimizerWithCallbacks(self):
+    input_np = np.random.random((10, 3))
+    output_np = np.random.random((10, 4))
+    a = input_layer.Input(shape=(3,), name='input_a')
+    model = sequential.Sequential()
+    model.add(core.Dense(4, name='dense'))
+    model.add(core.Dropout(0.5, name='dropout'))
+    model(a)
+    optimizer = gradient_descent.SGD(learning_rate=0.1)
+    model.compile(optimizer, loss='mse', metrics=['mae'])
+    # This does not reduce the LR after the first epoch (due to low delta).
+    cbks = [
+        callbacks.ReduceLROnPlateau(
+            monitor='val_loss', factor=0.1, min_delta=0, patience=1, cooldown=5)
+    ]
+    model.fit(
+        input_np,
+        output_np,
+        batch_size=10,
+        validation_data=(input_np, output_np),
+        callbacks=cbks,
+        epochs=5,
+        verbose=0)
+    self.assertAllClose(
+        float(backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
+
+    # This should reduce the LR after the first epoch (due to high delta).
+    cbks = [
+        callbacks.ReduceLROnPlateau(
+            monitor='val_loss',
+            factor=0.1,
+            min_delta=10,
+            patience=1,
+            cooldown=5)
+    ]
+    model.fit(
+        input_np,
+        output_np,
+        batch_size=10,
+        validation_data=(input_np, output_np),
+        callbacks=cbks,
+        epochs=5,
+        verbose=2)
+    self.assertAllClose(
+        float(backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4)
+
+
+class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
+
+  # TODO(tanzheny): remove test_numeric after algorithm for Momentum, Adam and
+  # NAdam has been unified: currently these three algorithms behave differently.
+  @parameterized.named_parameters(
+      ('adadelta', 'adadelta', True, True), ('adagrad', 'adagrad', True, True),
+      ('adam', 'adam', True, True), ('adamax', 'adamax', True, True),
+      ('nadam', 'nadam', True, False), ('momentum', 'momentum', True, True),
+      ('sgd', 'sgd', False, True))
+  def testOptimizersCompatibility(self, opt_str, test_weights, test_numeric):
+    np.random.seed(1331)
+    with self.cached_session():
+      train_samples = 20
+      input_dim = 3
+      num_classes = 2
+      (x, y), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      y = keras.utils.to_categorical(y)
+
+      num_hidden = 5
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+
+      old_mode = os.environ.get('TF2_BEHAVIOR', None)
+      # Disable tf2 to create V1 optimizer.
+      disable_tf2()
+      if opt_str == 'momentum':
+        opt_v1 = optimizers.SGD(momentum=0.9)
+      else:
+        opt_v1 = optimizers.get(opt_str)
+
+      # Test compile and fit with v1 optimizer.
+      model.compile(opt_v1, loss='categorical_crossentropy', metrics=[])
+      model.fit(x, y, batch_size=5, epochs=1)
+      model_dir = tempfile.mkdtemp()
+      gfile.MakeDirs(model_dir)
+      file_name = os.path.join(model_dir, 'model.h5')
+      model.save(file_name)
+
+      enable_tf2()
+      # Test load and fit with v2 optimizer.
+      model_2 = saving.load_model(file_name)
+      opt_v2 = model_2.optimizer
+      self.assertIsInstance(opt_v2, optimizer_v2.OptimizerV2)
+      # set_weights is called inside load_model but exception is swallowed,
+      # this call checks the weights can be set correctly.
+      if test_weights:
+        opt_v2.set_weights(opt_v1.get_weights())
+      if test_numeric:
+        hist_1 = model.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+        hist_2 = model_2.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+        self.assertAllClose(model.get_weights(), model_2.get_weights())
+        self.assertAllClose(model.get_weights(), model_2.get_weights())
+        self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'])
+
+      if old_mode is not None:
+        os.environ['TF2_BEHAVIOR'] = old_mode
+
+  def testNumericEquivalenceForNesterovMomentum(self):
+    np.random.seed(1331)
+    with self.cached_session():
+      train_samples = 20
+      input_dim = 3
+      num_classes = 2
+      (x, y), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      y = keras.utils.to_categorical(y)
+
+      num_hidden = 5
+      model_k_v1 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2.set_weights(model_k_v1.get_weights())
+      model_tf = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_tf.set_weights(model_k_v2.get_weights())
+
+      opt_k_v1 = optimizers.SGD(lr=0.001, momentum=0.9, nesterov=True)
+      opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True)
+      opt_tf = momentum.MomentumOptimizer(
+          learning_rate=0.001, momentum=0.9, use_nesterov=True)
+
+      model_k_v1.compile(opt_k_v1, loss='categorical_crossentropy', metrics=[])
+      model_k_v2.compile(opt_k_v2, loss='categorical_crossentropy', metrics=[])
+      model_tf.compile(opt_tf, loss='categorical_crossentropy', metrics=[])
+
+      hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+      hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+      hist_tf = model_tf.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+
+      self.assertAllClose(model_k_v1.get_weights(), model_tf.get_weights())
+      self.assertAllClose(model_k_v1.get_weights(), model_k_v2.get_weights())
+      self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights())
+      self.assertAllClose(hist_k_v1.history['loss'], hist_tf.history['loss'])
+      self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
+
+  def testNumericEquivalenceForAmsgrad(self):
+    np.random.seed(1331)
+    with self.cached_session():
+      train_samples = 20
+      input_dim = 3
+      num_classes = 2
+      (x, y), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      y = keras.utils.to_categorical(y)
+
+      num_hidden = 5
+      model_k_v1 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2.set_weights(model_k_v1.get_weights())
+
+      opt_k_v1 = optimizers.Adam(amsgrad=True)
+      opt_k_v2 = adam.Adam(amsgrad=True)
+
+      model_k_v1.compile(opt_k_v1, loss='categorical_crossentropy', metrics=[])
+      model_k_v2.compile(opt_k_v2, loss='categorical_crossentropy', metrics=[])
+
+      hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+      hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+
+      self.assertAllClose(model_k_v1.get_weights(), model_k_v2.get_weights())
+      self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights())
+      self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
+
+
+def disable_tf2():
+  if 'TF2_BEHAVIOR' in os.environ:
+    del os.environ['TF2_BEHAVIOR']
+
+
+def enable_tf2():
+  os.environ['TF2_BEHAVIOR'] = 'enabled'
+
+
+# Note: These tests are kept in a separate class to avoid bugs in some
+# distributions of Python that break AutoGraph which is used by tf.function.
+class OptimizerWithFunctionTest(test.TestCase):
+
+  def testBasic(self):
+    with context.eager_mode():
+      var = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                   dtype=dtypes.float32)
+      loss = lambda: 3 * var
+      opt = adam.Adam(learning_rate=1.0)
+
+      @def_function.function
+      def fn():
+        opt.minimize(loss, [var])
+        return var
+
+      self.assertAllClose([0., 1.], fn(), atol=1e-4)
+      self.assertAllClose([-1, 0.], fn(), atol=1e-4)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a5b334fc46f6ae76f48cce29bc119cdc8f0eaf2
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py
@@ -0,0 +1,196 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RMSprop for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class RMSprop(optimizer_v2.OptimizerV2):
+  r"""Optimizer that implements the RMSprop algorithm.
+
+  A detailed description of rmsprop.
+
+    - maintain a moving (discounted) average of the square of gradients
+    - divide gradient by the root of this average
+
+  $$mean_square_t = rho * mean_square{t-1} + (1-rho) * gradient ** 2$$
+  $$mom_t = momentum * mom_{t-1} + learning_rate * gradient / \sqrt{ /
+      mean_square_t + \epsilon}$$
+  $$variable_t := variable_{t-1} - mom_t
+
+  This implementation of RMSprop uses plain momentum, not Nesterov momentum.
+
+  The centered version additionally maintains a moving average of the
+  gradients, and uses that average to estimate the variance:
+
+  $$mean_grad_t = rho * mean_grad_{t-1} + (1-rho) * gradient$$
+  $$mean_square_t = rho * mean_square_{t-1} + (1-rho) * gradient ** 2$$
+  $$mom_t = momentum * mom_{t-1} + learning_rate * gradient /
+      sqrt(mean_square_t - mean_grad_t**2 + epsilon)$$
+  $$variable_t := variable_{t-1} - mom_t
+
+  References
+    See ([pdf]
+      http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               rho=0.9,
+               momentum=0.0,
+               epsilon=1e-7,
+               centered=False,
+               name="RMSprop",
+               **kwargs):
+    """Construct a new RMSprop optimizer.
+
+    Note that in the dense implementation of this algorithm, variables and their
+    corresponding accumulators (momentum, gradient moving average, square
+    gradient moving average) will be updated even if the gradient is zero
+    (i.e. accumulators will decay, momentum will be applied). The sparse
+    implementation (used when the gradient is an `IndexedSlices` object,
+    typically because of `tf.gather` or an embedding lookup in the forward pass)
+    will not update variable slices or their accumulators unless those slices
+    were used in the forward pass (nor is there an "eventual" correction to
+    account for these omitted updates). This leads to more efficient updates for
+    large embedding lookup tables (where most of the slices are not accessed in
+    a particular graph execution), but differs from the published algorithm.
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      rho: Discounting factor for the history/coming gradient
+      momentum: A scalar tensor.
+      epsilon: Small value to avoid zero denominator.
+      centered: If True, gradients are normalized by the estimated variance of
+        the gradient; if False, by the uncentered second moment. Setting this to
+        True may help with training, but is slightly more expensive in terms of
+        computation and memory. Defaults to False.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "RMSprop".  @compatibility(eager) When eager
+        execution is enabled, `learning_rate`, `decay`, `momentum`, and
+        `epsilon` can each be a callable that takes no arguments and returns the
+        actual value to use. This can be useful for changing these values across
+        different invocations of optimizer functions. @end_compatibility
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+    """
+    super(RMSprop, self).__init__(name, **kwargs)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("decay", self._initial_decay)
+    self._set_hyper("rho", rho)
+
+    self._momentum = False
+    if isinstance(momentum, ops.Tensor) or callable(momentum) or momentum > 0:
+      self._momentum = True
+    if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1):
+      raise ValueError("`momentum` must be between [0, 1].")
+    self._set_hyper("momentum", momentum)
+
+    self._set_hyper("epsilon", epsilon)
+    self._centered = centered
+
+  def _create_slots(self, var_list):
+    for var in var_list:
+      self.add_slot(var, "rms")
+      self.add_slot(var, "momentum")
+      if self._centered:
+        self.add_slot(var, "mg")
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    rms = self.get_slot(var, "rms")
+    mom = self.get_slot(var, "momentum")
+    rho = self._get_hyper("rho", var_dtype)
+    momentum = self._get_hyper("momentum", var_dtype)
+    epsilon = self._get_hyper("epsilon", var_dtype)
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          lr_t,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          lr_t,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    rms = self.get_slot(var, "rms")
+    mom = self.get_slot(var, "momentum")
+    rho = self._get_hyper("rho", var_dtype)
+    momentum = self._get_hyper("momentum", var_dtype)
+    epsilon = self._get_hyper("epsilon", var_dtype)
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_sparse_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          lr_t,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          lr_t,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          indices,
+          use_locking=self._use_locking)
+
+  def get_config(self):
+    config = super(RMSprop, self).get_config()
+    config.update({
+        "learning_rate": self._serialize_hyperparameter("learning_rate"),
+        "decay": self._serialize_hyperparameter("decay"),
+        "rho": self._serialize_hyperparameter("rho"),
+        "momentum": self._serialize_hyperparameter("momentum"),
+        "epsilon": self._serialize_hyperparameter("epsilon"),
+        "centered": self._centered,
+    })
+    return config
+
+
+RMSProp = RMSprop
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8658a8550760a04c6031e26721038b88fad0ebd
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
@@ -0,0 +1,410 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rmsprop."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import itertools
+import math
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+_DATA_TYPES = [dtypes.half, dtypes.float32]
+
+_TEST_PARAM_VALUES = [
+    # learning_rate, rho, momentum, epsilon, centered
+    [0.05, 0.9, 0.0, 1e-3, True],
+    [0.05, 0.9, 0.0, 1e-3, False],
+    [0.1, 0.9, 0.0, 1e-3, True],
+    [0.01, 0.9, 0.0, 1e-5, True],
+    [0.01, 0.9, 0.9, 1e-5, True],
+]
+
+_TESTPARAMS = [
+    [data_type] + values
+    for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES)
+]
+
+
+class RMSpropOptimizerTest(test.TestCase):
+
+  def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, rho, momentum,
+                            epsilon, centered):
+    rms_t = rms * rho + (1 - rho) * g * g
+    denom_t = rms_t + epsilon
+    if centered:
+      mg_t = mg * rho + (1 - rho) * g
+      denom_t -= mg_t * mg_t
+    else:
+      mg_t = mg
+    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
+    var_t = var - mom_t
+    return var_t, mg_t, rms_t, mom_t
+
+  def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
+                                   lr, rho, momentum, epsilon, centered):
+    mg_t = copy.deepcopy(mg)
+    rms_t = copy.deepcopy(rms)
+    mom_t = copy.deepcopy(mom)
+    var_t = copy.deepcopy(var)
+    for i in range(len(gindexs)):
+      gindex = gindexs[i]
+      gvalue = gvalues[i]
+      rms_t[gindex] = rms[gindex] * rho + (1 - rho) * gvalue * gvalue
+      denom_t = rms_t[gindex] + epsilon
+      if centered:
+        mg_t[gindex] = mg_t[gindex] * rho + (1 - rho) * gvalue
+        denom_t -= mg_t[gindex] * mg_t[gindex]
+      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
+      var_t[gindex] = var[gindex] - mom_t[gindex]
+    return var_t, mg_t, rms_t, mom_t
+
+  @test_util.run_deprecated_v1
+  def testDense(self):
+    for (dtype, learning_rate, rho, momentum, epsilon, centered) in _TESTPARAMS:
+      with test_util.use_gpu():
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np, dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable(var1_np, dtype=dtype)
+        grads0 = constant_op.constant(grads0_np, dtype=dtype)
+        grads1 = constant_op.constant(grads1_np, dtype=dtype)
+        opt = rmsprop.RMSprop(
+            learning_rate=learning_rate,
+            rho=rho,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+
+        if centered:
+          mg0 = opt.get_slot(var0, "mg")
+          mg1 = opt.get_slot(var1, "mg")
+        else:
+          mg0 = None
+          mg1 = None
+
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 4 steps of RMSprop
+        for _ in range(1, 5):
+          self.evaluate(update)
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+              var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate, rho,
+              momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+              var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate, rho,
+              momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testDenseWithLearningRateDecay(self):
+    var0_np = np.array([1.0, 2.0])
+    grads0_np = np.array([0.1, 0.2])
+    var1_np = np.array([3.0, 4.0])
+    grads1_np = np.array([0.01, 0.2])
+
+    var0 = resource_variable_ops.ResourceVariable(var0_np)
+    var1 = resource_variable_ops.ResourceVariable(var1_np)
+    grads0 = constant_op.constant(grads0_np)
+    grads1 = constant_op.constant(grads1_np)
+    learning_rate = 0.01
+    rho = 0.9
+    momentum = 0.0
+    epsilon = 1e-7
+    centered = False
+    decay = 0.5
+    opt = rmsprop.RMSprop(
+        learning_rate=learning_rate,
+        rho=rho,
+        momentum=momentum,
+        epsilon=epsilon,
+        centered=centered,
+        decay=decay)
+
+    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    self.evaluate(variables.global_variables_initializer())
+
+    rms0 = opt.get_slot(var0, "rms")
+    self.assertTrue(rms0 is not None)
+    rms1 = opt.get_slot(var1, "rms")
+    self.assertTrue(rms1 is not None)
+    mom0 = opt.get_slot(var0, "momentum")
+    self.assertTrue(mom0 is not None)
+    mom1 = opt.get_slot(var1, "momentum")
+    self.assertTrue(mom1 is not None)
+
+    mg0_np = np.array([0.0, 0.0])
+    mg1_np = np.array([0.0, 0.0])
+    rms0_np = np.array([0.0, 0.0])
+    rms1_np = np.array([0.0, 0.0])
+    mom0_np = np.array([0.0, 0.0])
+    mom1_np = np.array([0.0, 0.0])
+
+    # Fetch params to validate initial values
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+    # Run 4 steps of RMSprop
+    for t in range(2):
+      self.evaluate(update)
+
+      lr = learning_rate / (1 + decay * t)
+      var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+          var0_np, grads0_np, mg0_np, rms0_np, mom0_np, lr, rho, momentum,
+          epsilon, centered)
+      var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+          var1_np, grads1_np, mg1_np, rms1_np, mom1_np, lr, rho, momentum,
+          epsilon, centered)
+
+      # Validate updated params
+      self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+      self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+      self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+      self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+      self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+      self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSprop(
+            learning_rate=1.0,
+            rho=0.0,
+            momentum=0.0,
+            epsilon=0.0,
+            centered=False).minimize(
+                loss, var_list=[var0])
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[0., 1.]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariableCentered(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSprop(
+            learning_rate=1.0,
+            rho=0.0,
+            momentum=0.0,
+            epsilon=1.0,
+            centered=True).minimize(
+                loss, var_list=[var0])
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[-111, -138]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
+
+  @test_util.run_deprecated_v1
+  def testSparse(self):
+    for (dtype, learning_rate, rho, momentum, epsilon, centered) in _TESTPARAMS:
+      with test_util.use_gpu():
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([1]))
+        grads1_np_indices = np.array([1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([1]))
+        opt = rmsprop.RMSprop(
+            learning_rate=learning_rate,
+            rho=rho,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+
+        if centered:
+          mg0 = opt.get_slot(var0, "mg")
+          self.assertEqual(mg0 is not None, centered)
+          mg1 = opt.get_slot(var1, "mg")
+          self.assertEqual(mg1 is not None, centered)
+        else:
+          mg0 = None
+          mg1 = None
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 4 steps of RMSprop
+        for _ in range(1, 5):
+          self.evaluate(update)
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
+              learning_rate, rho, momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
+              learning_rate, rho, momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testCallableParams(self):
+    with context.eager_mode():
+      for dtype in [dtypes.half, dtypes.float32]:
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+        learning_rate = lambda: 2.0
+        rho = lambda: 0.9
+        momentum = lambda: 0.0
+        epsilon = lambda: 1.0
+        opt = rmsprop.RMSprop(learning_rate, rho, momentum, epsilon)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Step 1: the rms accumulators where 1. So we should see a normal
+        # update: v -= grad * learning_rate
+        opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0))
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0))
+            ]), self.evaluate(var1))
+        # Step 2: the root mean square accumulators contain the previous update.
+        opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0))
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0))
+            ]), self.evaluate(var1))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index ab13e5c6328ef1cf706e46e4667ff4e17c2ea9e6..ee6dbba5ad62ee4b35101d1496a77ae91412fd64 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -22,13 +22,23 @@ from __future__ import print_function
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python import tf2
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
+from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
+from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
+from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -285,14 +295,21 @@ class RMSprop(Optimizer):
 class Adagrad(Optimizer):
   """Adagrad optimizer.
 
+  Adagrad is an optimizer with parameter-specific learning rates,
+  which are adapted relative to how frequently a parameter gets
+  updated during training. The more updates a parameter receives,
+  the smaller the updates.
+
   It is recommended to leave the parameters of this optimizer
   at their default values.
 
-  Arguments:
-      lr: float >= 0. Learning rate.
+  # Arguments
+      lr: float >= 0. Initial learning rate.
       epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
       decay: float >= 0. Learning rate decay over each update.
 
+  # References
+      - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
   """
 
   def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
@@ -345,16 +362,27 @@ class Adagrad(Optimizer):
 class Adadelta(Optimizer):
   """Adadelta optimizer.
 
+  Adadelta is a more robust extension of Adagrad
+  that adapts learning rates based on a moving window of gradient updates,
+  instead of accumulating all past gradients. This way, Adadelta continues
+  learning even when many updates have been done. Compared to Adagrad, in the
+  original version of Adadelta you don't have to set an initial learning
+  rate. In this version, initial learning rate and decay factor can
+  be set, as in most other Keras optimizers.
+
   It is recommended to leave the parameters of this optimizer
   at their default values.
 
-  Arguments:
-      lr: float >= 0. Learning rate.
+  # Arguments
+      lr: float >= 0. Initial learning rate, defaults to 1.
           It is recommended to leave it at the default value.
-      rho: float >= 0.
+      rho: float >= 0. Adadelta decay factor, corresponding to fraction of
+          gradient to keep at each time step.
       epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
-      decay: float >= 0. Learning rate decay over each update.
+      decay: float >= 0. Initial learning rate decay.
 
+  # References
+      - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
   """
 
   def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
@@ -455,7 +483,7 @@ class Adam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [state_ops.assign_add(self.iterations, 1)]
+    self.updates = []
 
     lr = self.lr
     if self.initial_decay > 0:
@@ -463,7 +491,8 @@ class Adam(Optimizer):
           1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                 K.dtype(self.decay))))
 
-    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
+      t = math_ops.cast(self.iterations, K.floatx())
     lr_t = lr * (
         K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
         (1. - math_ops.pow(self.beta_1, t)))
@@ -777,16 +806,27 @@ def deserialize(config, custom_objects=None):
   Returns:
       A Keras Optimizer instance.
   """
-  all_classes = {
-      'sgd': SGD,
-      'rmsprop': RMSprop,
-      'adagrad': Adagrad,
-      'adadelta': Adadelta,
-      'adam': Adam,
-      'adamax': Adamax,
-      'nadam': Nadam,
-      'tfoptimizer': TFOptimizer,
-  }
+  if tf2.enabled():
+    all_classes = {
+        'adadelta': adadelta_v2.Adadelta,
+        'adagrad': adagrad_v2.Adagrad,
+        'adam': adam_v2.Adam,
+        'adamax': adamax_v2.Adamax,
+        'nadam': nadam_v2.Nadam,
+        'rmsprop': rmsprop_v2.RMSprop,
+        'sgd': gradient_descent_v2.SGD
+    }
+  else:
+    all_classes = {
+        'adadelta': Adadelta,
+        'adagrad': Adagrad,
+        'adam': Adam,
+        'adamax': Adamax,
+        'nadam': Nadam,
+        'rmsprop': RMSprop,
+        'sgd': SGD,
+        'tfoptimizer': TFOptimizer
+    }
   # Make deserialization case-insensitive for built-in optimizers.
   if config['class_name'].lower() in all_classes:
     config['class_name'] = config['class_name'].lower()
@@ -815,17 +855,17 @@ def get(identifier):
   Raises:
       ValueError: If `identifier` cannot be interpreted.
   """
+  if isinstance(identifier, (Optimizer, optimizer_v2.OptimizerV2)):
+    return identifier
   # Wrap TF optimizer instances
-  if isinstance(identifier, tf_optimizer_module.Optimizer):
+  elif isinstance(identifier, tf_optimizer_module.Optimizer):
     opt = TFOptimizer(identifier)
     K.track_tf_optimizer(opt)
     return opt
-  if isinstance(identifier, dict):
+  elif isinstance(identifier, dict):
     return deserialize(identifier)
   elif isinstance(identifier, six.string_types):
     config = {'class_name': str(identifier), 'config': {}}
     return deserialize(config)
-  if isinstance(identifier, Optimizer):
-    return identifier
   else:
     raise ValueError('Could not interpret optimizer identifier:', identifier)
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 9664f09fff0ad872c40b58e3ff2347a2a595d429..77104a5d4d526792dde209b3c7cce2262a138dce 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import gc
+import os
 import weakref
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -88,22 +91,26 @@ def _test_optimizer(optimizer, target=0.75):
 
 class KerasOptimizersTest(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def test_sgd(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.SGD(lr=0.01,
                                            momentum=0.9,
                                            nesterov=True))
 
+  @test_util.run_v1_only('b/120545219')
   def test_rmsprop(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.RMSprop())
       _test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
 
+  @test_util.run_v1_only('b/120545219')
   def test_adagrad(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adagrad())
       _test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
 
+  @test_util.run_v1_only('b/120545219')
   def test_adadelta(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adadelta(), target=0.6)
@@ -112,27 +119,32 @@ class KerasOptimizersTest(test.TestCase):
       # the accuracy.
       _test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
 
+  @test_util.run_v1_only('b/120545219')
   def test_adam(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adam())
       _test_optimizer(keras.optimizers.Adam(decay=1e-3))
       _test_optimizer(keras.optimizers.Adam(amsgrad=True))
 
+  @test_util.run_v1_only('b/120545219')
   def test_adamax(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adamax())
       _test_optimizer(keras.optimizers.Adamax(decay=1e-3))
 
+  @test_util.run_v1_only('b/120545219')
   def test_nadam(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Nadam())
 
+  @test_util.run_v1_only('b/120545219')
   def test_clipnorm(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.SGD(lr=0.01,
                                            momentum=0.9,
                                            clipnorm=0.5))
 
+  @test_util.run_v1_only('b/120545219')
   def test_clipvalue(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.SGD(lr=0.01,
@@ -208,5 +220,40 @@ class KerasOptimizersTest(test.TestCase):
       _ = keras.optimizers.Adam(clipnorm=-2.0)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class KerasV2OptimizersTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('adadelta_tf2', 'adadelta', True), ('adadelta_tf1', 'adadelta', False),
+      ('adagrad_tf2', 'adagrad', True), ('adagrad_tf1', 'adagrad', False),
+      ('adam_tf2', 'adam', True), ('adam_tf1', 'adam', False),
+      ('adamax_tf2', 'adamax', True), ('adamax_tf1', 'adamax', False),
+      ('sgd_tf2', 'sgd', True), ('sgd_tf1', 'sgd', False),
+      ('nadam_tf2', 'nadam', True), ('nadam_tf1', 'nadam', False),
+      ('rmsprop_tf2', 'rmsprop', True), ('rmsprop_tf1', 'rmsprop', False))
+  def test_load_from_string(self, optimizer_string, tf2mode):
+    old_mode = os.environ.get('TF2_BEHAVIOR', None)
+    if tf2mode:
+      os.environ['TF2_BEHAVIOR'] = 'enabled'
+    else:
+      if 'TF2_BEHAVIOR' in os.environ:
+        del os.environ['TF2_BEHAVIOR']
+
+    # Sanity check.
+    self.assertEqual(tf2.enabled(), tf2mode)
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(1, input_shape=(10,)))
+    model.compile(optimizer_string, 'binary_crossentropy')
+
+    self.assertEqual(optimizer_string,
+                     model.optimizer.__class__.__name__.lower())
+
+    model.fit(np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'))
+
+    if old_mode is not None:
+      os.environ['TF2_BEHAVIOR'] = old_mode
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/regularizers_test.py b/tensorflow/python/keras/regularizers_test.py
index bba4ebb287b2bd3e8509abd215dc5be4cbcdd929..3d6b259d87de8b6533d008a839f0df2226d71ed4 100644
--- a/tensorflow/python/keras/regularizers_test.py
+++ b/tensorflow/python/keras/regularizers_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python import keras
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -61,6 +62,7 @@ class KerasRegularizersTest(test.TestCase):
         model.fit(x_train, y_train, batch_size=10,
                   epochs=1, verbose=0)
 
+  @test_util.run_deprecated_v1
   def test_activity_regularization(self):
     with self.cached_session():
       (x_train, y_train), _ = get_data()
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 501b50ba5f8c7606e5fc3dc3b096385737a598a7..fd062b0ab337aa6fa62a7603a36749cde315c3da 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -18,11 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import threading
+
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 
 
@@ -73,9 +77,13 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   Returns:
     The output data (Numpy array) returned by the layer, for additional
     checks to be done by the calling code.
+
+  Raises:
+    ValueError: if `input_shape is None`.
   """
   if input_data is None:
-    assert input_shape
+    if input_shape is None:
+      raise ValueError('input_shape is None')
     if not input_dtype:
       input_dtype = 'float32'
     input_data_shape = list(input_shape)
@@ -149,7 +157,15 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     np.testing.assert_allclose(output, actual_output, rtol=1e-3)
 
   # test training mode (e.g. useful for dropout tests)
-  model.compile(RMSPropOptimizer(0.01), 'mse')
+  # Rebuild the model to avoid the graph being reused between predict() and
+  # train(). This was causing some error for layer with Defun as it body.
+  # See b/120160788 for more details. This should be mitigated after 2.0.
+  model = keras.models.Model(x, layer(x))
+  if _thread_local_data.run_eagerly is not None:
+    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'],
+                  run_eagerly=should_run_eagerly())
+  else:
+    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'])
   model.train_on_batch(input_data, actual_output)
 
   # test as first layer in Sequential API
@@ -166,8 +182,9 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     if expected_dim is not None:
       if expected_dim != actual_dim:
         raise AssertionError(
-            'When testing layer %s, for input %s, found output_shape='
-            '%s but expected to find %s.\nFull kwargs: %s' %
+            'When testing layer %s **after deserialization**, '
+            'for input %s, found output_shape='
+            '%s but expected to find inferred shape %s.\nFull kwargs: %s' %
             (layer_cls.__name__,
              x,
              actual_output_shape,
@@ -189,6 +206,74 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   return actual_output
 
 
+_thread_local_data = threading.local()
+_thread_local_data.model_type = None
+_thread_local_data.run_eagerly = None
+
+
+@tf_contextlib.contextmanager
+def model_type_scope(value):
+  """Provides a scope within which the model type to test is equal to `value`.
+
+  The model type gets restored to its original value upon exiting the scope.
+
+  Arguments:
+     value: model type value
+
+  Yields:
+    The provided value.
+  """
+  previous_value = _thread_local_data.model_type
+  try:
+    _thread_local_data.model_type = value
+    yield value
+  finally:
+    # Restore model type to initial value.
+    _thread_local_data.model_type = previous_value
+
+
+@tf_contextlib.contextmanager
+def run_eagerly_scope(value):
+  """Provides a scope within which we compile models to run eagerly or not.
+
+  The boolean gets restored to its original value upon exiting the scope.
+
+  Arguments:
+     value: Bool specifying if we should run models eagerly in the active test.
+     Should be True or False.
+
+  Yields:
+    The provided value.
+  """
+  previous_value = _thread_local_data.run_eagerly
+  try:
+    _thread_local_data.run_eagerly = value
+    yield value
+  finally:
+    # Restore model type to initial value.
+    _thread_local_data.run_eagerly = previous_value
+
+
+def should_run_eagerly():
+  """Returns whether the models we are testing should be run eagerly."""
+  if _thread_local_data.run_eagerly is None:
+    raise ValueError('Cannot call `should_run_eagerly()` outside of a '
+                     '`run_eagerly_scope()` or `run_all_keras_modes` '
+                     'decorator.')
+
+  return _thread_local_data.run_eagerly and context.executing_eagerly()
+
+
+def get_model_type():
+  """Gets the model type that should be tested."""
+  if _thread_local_data.model_type is None:
+    raise ValueError('Cannot call `get_model_type()` outside of a '
+                     '`model_type_scope()` or `run_with_all_model_types` '
+                     'decorator.')
+
+  return _thread_local_data.model_type
+
+
 def get_small_sequential_mlp(num_hidden, num_classes, input_dim=None):
   model = keras.models.Sequential()
   if input_dim:
@@ -207,3 +292,337 @@ def get_small_functional_mlp(num_hidden, num_classes, input_dim):
   activation = 'sigmoid' if num_classes == 1 else 'softmax'
   outputs = keras.layers.Dense(num_classes, activation=activation)(outputs)
   return keras.Model(inputs, outputs)
+
+
+class _SmallSubclassMLP(keras.Model):
+  """A subclass model based small MLP."""
+
+  def __init__(self, num_hidden, num_classes):
+    super(_SmallSubclassMLP, self).__init__()
+    self.layer_a = keras.layers.Dense(num_hidden, activation='relu')
+    activation = 'sigmoid' if num_classes == 1 else 'softmax'
+    self.layer_b = keras.layers.Dense(num_classes, activation=activation)
+
+  def call(self, inputs, **kwargs):
+    x = self.layer_a(inputs)
+    return self.layer_b(x)
+
+
+class _SmallSubclassMLPCustomBuild(keras.Model):
+  """A subclass model small MLP that uses a custom build method."""
+
+  def __init__(self, num_hidden, num_classes):
+    super(_SmallSubclassMLPCustomBuild, self).__init__()
+    self.layer_a = None
+    self.layer_b = None
+    self.num_hidden = num_hidden
+    self.num_classes = num_classes
+
+  def build(self, input_shape):
+    self.layer_a = keras.layers.Dense(self.num_hidden, activation='relu')
+    activation = 'sigmoid' if self.num_classes == 1 else 'softmax'
+    self.layer_b = keras.layers.Dense(self.num_classes, activation=activation)
+
+  def call(self, inputs, **kwargs):
+    x = self.layer_a(inputs)
+    return self.layer_b(x)
+
+
+def get_small_subclass_mlp(num_hidden, num_classes):
+  return _SmallSubclassMLP(num_hidden, num_classes)
+
+
+def get_small_subclass_mlp_with_custom_build(num_hidden, num_classes):
+  return _SmallSubclassMLPCustomBuild(num_hidden, num_classes)
+
+
+def get_small_mlp(num_hidden, num_classes, input_dim):
+  """Get a small mlp of the model type specified by `get_model_type`."""
+  model_type = get_model_type()
+  if model_type == 'subclass':
+    return get_small_subclass_mlp(num_hidden, num_classes)
+  if model_type == 'subclass_custom_build':
+    return get_small_subclass_mlp_with_custom_build(num_hidden, num_classes)
+  if model_type == 'sequential':
+    return get_small_sequential_mlp(num_hidden, num_classes, input_dim)
+  if model_type == 'functional':
+    return get_small_functional_mlp(num_hidden, num_classes, input_dim)
+  raise ValueError('Unknown model type {}'.format(model_type))
+
+
+class _SubclassModel(keras.Model):
+  """A Keras subclass model."""
+
+  def __init__(self, layers):
+    super(_SubclassModel, self).__init__()
+    self.all_layers = layers
+
+  def call(self, inputs, **kwargs):
+    x = inputs
+    for layer in self.all_layers:
+      x = layer(x)
+    return x
+
+
+class _SubclassModelCustomBuild(keras.Model):
+  """A Keras subclass model that uses a custom build method."""
+
+  def __init__(self, layer_generating_func):
+    super(_SubclassModelCustomBuild, self).__init__()
+    self.all_layers = None
+    self._layer_generating_func = layer_generating_func
+
+  def build(self, input_shape):
+    layers = []
+    for layer in self._layer_generating_func():
+      layers.append(layer)
+    self.all_layers = layers
+
+  def call(self, inputs, **kwargs):
+    x = inputs
+    for layer in self.all_layers:
+      x = layer(x)
+    return x
+
+
+def get_model_from_layers(layers, input_shape=None):
+  """Builds a model from a sequence of layers."""
+  model_type = get_model_type()
+  if model_type == 'subclass':
+    return _SubclassModel(layers)
+
+  if model_type == 'subclass_custom_build':
+    layer_generating_func = lambda: layers
+    return _SubclassModelCustomBuild(layer_generating_func)
+
+  if model_type == 'sequential':
+    model = keras.models.Sequential()
+    if input_shape:
+      model.add(keras.layers.InputLayer(input_shape=input_shape))
+    for layer in layers:
+      model.add(layer)
+    return model
+
+  if model_type == 'functional':
+    if not input_shape:
+      raise ValueError('Cannot create a functional model from layers with no '
+                       'input shape.')
+    inputs = keras.Input(shape=input_shape)
+    outputs = inputs
+    for layer in layers:
+      outputs = layer(outputs)
+    return keras.Model(inputs, outputs)
+
+  raise ValueError('Unknown model type {}'.format(model_type))
+
+
+class _MultiIOSubclassModel(keras.Model):
+  """Multi IO Keras subclass model."""
+
+  def __init__(self, branch_a, branch_b, shared_input_branch=None,
+               shared_output_branch=None):
+    super(_MultiIOSubclassModel, self).__init__()
+    self._shared_input_branch = shared_input_branch
+    self._branch_a = branch_a
+    self._branch_b = branch_b
+    self._shared_output_branch = shared_output_branch
+
+  def call(self, inputs, **kwargs):
+    if self._shared_input_branch:
+      for layer in self._shared_input_branch:
+        inputs = layer(inputs)
+      a = inputs
+      b = inputs
+    else:
+      a, b = inputs
+
+    for layer in self._branch_a:
+      a = layer(a)
+    for layer in self._branch_b:
+      b = layer(b)
+    outs = [a, b]
+
+    if self._shared_output_branch:
+      for layer in self._shared_output_branch:
+        outs = layer(outs)
+
+    return outs
+
+
+class _MultiIOSubclassModelCustomBuild(keras.Model):
+  """Multi IO Keras subclass model that uses a custom build method."""
+
+  def __init__(self, branch_a_func, branch_b_func,
+               shared_input_branch_func=None,
+               shared_output_branch_func=None):
+    super(_MultiIOSubclassModelCustomBuild, self).__init__()
+    self._shared_input_branch_func = shared_input_branch_func
+    self._branch_a_func = branch_a_func
+    self._branch_b_func = branch_b_func
+    self._shared_output_branch_func = shared_output_branch_func
+
+    self._shared_input_branch = None
+    self._branch_a = None
+    self._branch_b = None
+    self._shared_output_branch = None
+
+  def build(self, input_shape):
+    if self._shared_input_branch_func():
+      self._shared_input_branch = self._shared_input_branch_func()
+    self._branch_a = self._branch_a_func()
+    self._branch_b = self._branch_b_func()
+
+    if self._shared_output_branch_func():
+      self._shared_output_branch = self._shared_output_branch_func()
+
+  def call(self, inputs, **kwargs):
+    if self._shared_input_branch:
+      for layer in self._shared_input_branch:
+        inputs = layer(inputs)
+      a = inputs
+      b = inputs
+    else:
+      a, b = inputs
+
+    for layer in self._branch_a:
+      a = layer(a)
+    for layer in self._branch_b:
+      b = layer(b)
+    outs = a, b
+
+    if self._shared_output_branch:
+      for layer in self._shared_output_branch:
+        outs = layer(outs)
+
+    return outs
+
+
+def get_multi_io_model(
+    branch_a,
+    branch_b,
+    shared_input_branch=None,
+    shared_output_branch=None):
+  """Builds a multi-io model that contains two branches.
+
+  The produced model will be of the type specified by `get_model_type`.
+
+  To build a two-input, two-output model:
+    Specify a list of layers for branch a and branch b, but do not specify any
+    shared input branch or shared output branch. The resulting model will apply
+    each branch to a different input, to produce two outputs.
+
+    The first value in branch_a must be the Keras 'Input' layer for branch a,
+    and the first value in branch_b must be the Keras 'Input' layer for
+    branch b.
+
+    example usage:
+    ```
+    branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()]
+    branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()]
+
+    model = get_multi_io_model(branch_a, branch_b)
+    ```
+
+  To build a two-input, one-output model:
+    Specify a list of layers for branch a and branch b, and specify a
+    shared output branch. The resulting model will apply
+    each branch to a different input. It will then apply the shared output
+    branch to a tuple containing the intermediate outputs of each branch,
+    to produce a single output. The first layer in the shared_output_branch
+    must be able to merge a tuple of two tensors.
+
+    The first value in branch_a must be the Keras 'Input' layer for branch a,
+    and the first value in branch_b must be the Keras 'Input' layer for
+    branch b.
+
+    example usage:
+    ```
+    input_branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()]
+    input_branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()]
+    shared_output_branch = [Concatenate(), Dense(), Dense()]
+
+    model = get_multi_io_model(input_branch_a, input_branch_b,
+                               shared_output_branch=shared_output_branch)
+    ```
+  To build a one-input, two-output model:
+    Specify a list of layers for branch a and branch b, and specify a
+    shared input branch. The resulting model will take one input, and apply
+    the shared input branch to it. It will then respectively apply each branch
+    to that intermediate result in parallel, to produce two outputs.
+
+    The first value in the shared_input_branch must be the Keras 'Input' layer
+    for the whole model. Branch a and branch b should not contain any Input
+    layers.
+
+    example usage:
+    ```
+    shared_input_branch = [Input(shape=(2,), name='in'), Dense(), Dense()]
+    output_branch_a = [Dense(), Dense()]
+    output_branch_b = [Dense(), Dense()]
+
+
+    model = get_multi_io_model(output__branch_a, output_branch_b,
+                               shared_input_branch=shared_input_branch)
+    ```
+
+  Args:
+    branch_a: A sequence of layers for branch a of the model.
+    branch_b: A sequence of layers for branch b of the model.
+    shared_input_branch: An optional sequence of layers to apply to a single
+      input, before applying both branches to that intermediate result. If set,
+      the model will take only one input instead of two. Defaults to None.
+    shared_output_branch: An optional sequence of layers to merge the
+      intermediate results produced by branch a and branch b. If set,
+      the model will produce only one output instead of two. Defaults to None.
+
+  Returns:
+    A multi-io model of the type specified by `get_model_type`, specified
+    by the different branches.
+  """
+  # Extract the functional inputs from the layer lists
+  if shared_input_branch:
+    inputs = shared_input_branch[0]
+    shared_input_branch = shared_input_branch[1:]
+  else:
+    inputs = branch_a[0], branch_b[0]
+    branch_a = branch_a[1:]
+    branch_b = branch_b[1:]
+
+  model_type = get_model_type()
+  if model_type == 'subclass':
+    return _MultiIOSubclassModel(branch_a, branch_b, shared_input_branch,
+                                 shared_output_branch)
+
+  if model_type == 'subclass_custom_build':
+    return _MultiIOSubclassModelCustomBuild((lambda: branch_a),
+                                            (lambda: branch_b),
+                                            (lambda: shared_input_branch),
+                                            (lambda: shared_output_branch))
+
+  if model_type == 'sequential':
+    raise ValueError('Cannot use `get_multi_io_model` to construct '
+                     'sequential models')
+
+  if model_type == 'functional':
+    if shared_input_branch:
+      a_and_b = inputs
+      for layer in shared_input_branch:
+        a_and_b = layer(a_and_b)
+      a = a_and_b
+      b = a_and_b
+    else:
+      a, b = inputs
+
+    for layer in branch_a:
+      a = layer(a)
+    for layer in branch_b:
+      b = layer(b)
+    outputs = a, b
+
+    if shared_output_branch:
+      for layer in shared_output_branch:
+        outputs = layer(outputs)
+
+    return keras.Model(inputs, outputs)
+
+  raise ValueError('Unknown model type {}'.format(model_type))
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index c442b31116091955335423d2e60eaacf464c568e..61940ad789c4009fca5462079014482fb8bfec1b 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -23,15 +23,18 @@ from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
 from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.data_utils import SequenceEnqueuer
+from tensorflow.python.keras.utils.generic_utils import class_and_config_for_serialized_keras_object
 from tensorflow.python.keras.utils.generic_utils import custom_object_scope
 from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import get_custom_objects
 from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.generic_utils import serialize_keras_class_and_config
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras.utils.np_utils import normalize
 from tensorflow.python.keras.utils.np_utils import to_categorical
diff --git a/tensorflow/python/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py
index 8ebca1418dd226d1b8d9e4036807de27786df199..f486e631e50e5beb8da606879f23cd67131389f5 100644
--- a/tensorflow/python/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/utils/conv_utils.py
@@ -137,26 +137,49 @@ def conv_input_length(output_length, filter_size, padding, stride):
   return (output_length - 1) * stride - 2 * pad + filter_size
 
 
-def deconv_output_length(input_length, filter_size, padding, stride):
+def deconv_output_length(input_length, filter_size, padding,
+                         output_padding=None, stride=0, dilation=1):
   """Determines output length of a transposed convolution given input length.
 
   Arguments:
-      input_length: integer.
-      filter_size: integer.
-      padding: one of "same", "valid", "full".
-      stride: integer.
+      input_length: Integer.
+      filter_size: Integer.
+      padding: one of `"same"`, `"valid"`, `"full"`.
+      output_padding: Integer, amount of padding along the output dimension.
+          Can be set to `None` in which case the output length is inferred.
+      stride: Integer.
+      dilation: Integer.
 
   Returns:
       The output length (integer).
   """
+  assert padding in {'same', 'valid', 'full'}
   if input_length is None:
     return None
-  input_length *= stride
-  if padding == 'valid':
-    input_length += max(filter_size - stride, 0)
-  elif padding == 'full':
-    input_length -= (stride + filter_size - 2)
-  return input_length
+
+  # Get the dilated kernel size
+  filter_size = filter_size + (filter_size - 1) * (dilation - 1)
+
+  # Infer length if output padding is None, else compute the exact length
+  if output_padding is None:
+    if padding == 'valid':
+      length = input_length * stride + max(filter_size - stride, 0)
+    elif padding == 'full':
+      length = input_length * stride - (stride + filter_size - 2)
+    elif padding == 'same':
+      length = input_length * stride
+
+  else:
+    if padding == 'same':
+      pad = filter_size // 2
+    elif padding == 'valid':
+      pad = 0
+    elif padding == 'full':
+      pad = filter_size - 1
+
+    length = ((input_length - 1) * stride + filter_size - 2 * pad +
+              output_padding)
+  return length
 
 
 def normalize_data_format(value):
diff --git a/tensorflow/python/keras/utils/data_utils.py b/tensorflow/python/keras/utils/data_utils.py
index b736daa46de3b74953e0bbab7126d1314ec0d3cb..d133e3fa8aeb0ee420bfa131b98401f617f1daae 100644
--- a/tensorflow/python/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/utils/data_utils.py
@@ -30,7 +30,6 @@ import sys
 import tarfile
 import threading
 import time
-import traceback
 import zipfile
 
 import numpy as np
@@ -117,16 +116,16 @@ def _extract_archive(file_path, path='.', archive_format='auto'):
   """
   if archive_format is None:
     return False
-  if archive_format is 'auto':
+  if archive_format == 'auto':
     archive_format = ['tar', 'zip']
   if isinstance(archive_format, six.string_types):
     archive_format = [archive_format]
 
   for archive_type in archive_format:
-    if archive_type is 'tar':
+    if archive_type == 'tar':
       open_fn = tarfile.open
       is_match_fn = tarfile.is_tarfile
-    if archive_type is 'zip':
+    if archive_type == 'zip':
       open_fn = zipfile.ZipFile
       is_match_fn = zipfile.is_zipfile
 
@@ -237,7 +236,7 @@ def get_file(fname,
 
     def dl_progress(count, block_size, total_size):
       if ProgressTracker.progbar is None:
-        if total_size is -1:
+        if total_size == -1:
           total_size = None
         ProgressTracker.progbar = Progbar(total_size)
       else:
@@ -288,7 +287,7 @@ def _hash_file(fpath, algorithm='sha256', chunk_size=65535):
   Returns:
       The file hash
   """
-  if (algorithm is 'sha256') or (algorithm is 'auto' and len(hash) is 64):
+  if (algorithm == 'sha256') or (algorithm == 'auto' and len(hash) == 64):
     hasher = hashlib.sha256()
   else:
     hasher = hashlib.md5()
@@ -314,8 +313,7 @@ def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
   Returns:
       Whether the file is valid
   """
-  if ((algorithm is 'sha256') or
-      (algorithm is 'auto' and len(file_hash) is 64)):
+  if (algorithm == 'sha256') or (algorithm == 'auto' and len(file_hash) == 64):
     hasher = 'sha256'
   else:
     hasher = 'md5'
@@ -400,14 +398,23 @@ class Sequence(object):
     pass
 
   def __iter__(self):
-    """Creates an infinite generator that iterate over the Sequence.
+    """Create a generator that iterate over the Sequence."""
+    for item in (self[i] for i in range(len(self))):
+      yield item
 
-    Yields:
-      Sequence items.
-    """
-    while True:
-      for item in (self[i] for i in range(len(self))):
-        yield item
+
+def iter_sequence_infinite(seq):
+  """Iterates indefinitely over a Sequence.
+
+  Arguments:
+    seq: Sequence instance.
+
+  Yields:
+    Batches of data from the Sequence.
+  """
+  while True:
+    for item in seq:
+      yield item
 
 
 # Global variables to be shared across processes
@@ -445,7 +452,7 @@ class SequenceEnqueuer(object):
   The task of an Enqueuer is to use parallelism to speed up preprocessing.
   This is done with processes or threads.
 
-  Examples:
+  Example:
 
   ```python
       enqueuer = SequenceEnqueuer(...)
@@ -458,61 +465,10 @@ class SequenceEnqueuer(object):
   ```
 
   The `enqueuer.get()` should be an infinite stream of datas.
-
-  """
-
-  @abstractmethod
-  def is_running(self):
-    raise NotImplementedError
-
-  @abstractmethod
-  def start(self, workers=1, max_queue_size=10):
-    """Starts the handler's workers.
-
-    Arguments:
-        workers: number of worker threads
-        max_queue_size: queue size
-            (when full, threads could block on `put()`).
-    """
-    raise NotImplementedError
-
-  @abstractmethod
-  def stop(self, timeout=None):
-    """Stop running threads and wait for them to exit, if necessary.
-
-    Should be called by the same thread which called start().
-
-    Arguments:
-        timeout: maximum time to wait on thread.join()
-    """
-    raise NotImplementedError
-
-  @abstractmethod
-  def get(self):
-    """Creates a generator to extract data from the queue.
-
-    Skip the data if it is `None`.
-
-    Returns:
-        Generator yielding tuples `(inputs, targets)`
-            or `(inputs, targets, sample_weights)`.
-    """
-    raise NotImplementedError
-
-
-@tf_export('keras.utils.OrderedEnqueuer')
-class OrderedEnqueuer(SequenceEnqueuer):
-  """Builds a Enqueuer from a Sequence.
-
-  Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
-
-  Arguments:
-      sequence: A `keras.utils.data_utils.Sequence` object.
-      use_multiprocessing: use multiprocessing if True, otherwise threading
-      shuffle: whether to shuffle the data at the beginning of each epoch
   """
 
-  def __init__(self, sequence, use_multiprocessing=False, shuffle=False):
+  def __init__(self, sequence,
+               use_multiprocessing=False):
     self.sequence = sequence
     self.use_multiprocessing = use_multiprocessing
 
@@ -535,7 +491,6 @@ class OrderedEnqueuer(SequenceEnqueuer):
         self.uid = _SEQUENCE_COUNTER.value
         _SEQUENCE_COUNTER.value += 1
 
-    self.shuffle = shuffle
     self.workers = 0
     self.executor_fn = None
     self.queue = None
@@ -546,16 +501,15 @@ class OrderedEnqueuer(SequenceEnqueuer):
     return self.stop_signal is not None and not self.stop_signal.is_set()
 
   def start(self, workers=1, max_queue_size=10):
-    """Start the handler's workers.
+    """Starts the handler's workers.
 
     Arguments:
-        workers: number of worker threads
+        workers: Number of workers.
         max_queue_size: queue size
             (when full, workers could block on `put()`)
     """
     if self.use_multiprocessing:
-      self.executor_fn = lambda seqs: multiprocessing.Pool(  # pylint: disable=g-long-lambda
-          workers, initializer=init_pool, initargs=(seqs,))
+      self.executor_fn = self._get_executor_init(workers)
     else:
       # We do not need the init since it's threads.
       self.executor_fn = lambda _: ThreadPool(workers)
@@ -566,6 +520,87 @@ class OrderedEnqueuer(SequenceEnqueuer):
     self.run_thread.daemon = True
     self.run_thread.start()
 
+  def _send_sequence(self):
+    """Sends current Iterable to all workers."""
+    # For new processes that may spawn
+    _SHARED_SEQUENCES[self.uid] = self.sequence
+
+  def stop(self, timeout=None):
+    """Stops running threads and wait for them to exit, if necessary.
+
+    Should be called by the same thread which called `start()`.
+
+    Arguments:
+        timeout: maximum time to wait on `thread.join()`
+    """
+    self.stop_signal.set()
+    with self.queue.mutex:
+      self.queue.queue.clear()
+      self.queue.unfinished_tasks = 0
+      self.queue.not_full.notify()
+    self.run_thread.join(timeout)
+    _SHARED_SEQUENCES[self.uid] = None
+
+  @abstractmethod
+  def _run(self):
+    """Submits request to the executor and queue the `Future` objects."""
+    raise NotImplementedError
+
+  @abstractmethod
+  def _get_executor_init(self, workers):
+    """Gets the Pool initializer for multiprocessing.
+
+    Arguments:
+        workers: Number of workers.
+
+    Returns:
+        Function, a Function to initialize the pool
+    """
+    raise NotImplementedError
+
+  @abstractmethod
+  def get(self):
+    """Creates a generator to extract data from the queue.
+
+    Skip the data if it is `None`.
+    # Returns
+        Generator yielding tuples `(inputs, targets)`
+            or `(inputs, targets, sample_weights)`.
+    """
+    raise NotImplementedError
+
+
+@tf_export('keras.utils.OrderedEnqueuer')
+class OrderedEnqueuer(SequenceEnqueuer):
+  """Builds a Enqueuer from a Sequence.
+
+  Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
+
+  Arguments:
+      sequence: A `tf.keras.utils.data_utils.Sequence` object.
+      use_multiprocessing: use multiprocessing if True, otherwise threading
+      shuffle: whether to shuffle the data at the beginning of each epoch
+  """
+
+  def __init__(self, sequence, use_multiprocessing=False, shuffle=False):
+    super(OrderedEnqueuer, self).__init__(sequence, use_multiprocessing)
+    self.shuffle = shuffle
+
+  def _get_executor_init(self, workers):
+    """Gets the Pool initializer for multiprocessing.
+
+    Arguments:
+        workers: Number of workers.
+
+    Returns:
+        Function, a Function to initialize the pool
+    """
+    def pool_fn(seqs):
+      return multiprocessing.Pool(
+          workers, initializer=init_pool_generator, initargs=(seqs, None))
+
+    return pool_fn
+
   def _wait_queue(self):
     """Wait for the queue to be empty."""
     while True:
@@ -615,30 +650,34 @@ class OrderedEnqueuer(SequenceEnqueuer):
         self.queue.task_done()
         if inputs is not None:
           yield inputs
-    except Exception as e:  # pylint: disable=broad-except
+    except Exception:  # pylint: disable=broad-except
       self.stop()
-      six.raise_from(StopIteration(e), e)
+      six.reraise(*sys.exc_info())
 
-  def _send_sequence(self):
-    """Send current Sequence to all workers."""
-    # For new processes that may spawn
-    _SHARED_SEQUENCES[self.uid] = self.sequence
 
-  def stop(self, timeout=None):
-    """Stops running threads and wait for them to exit, if necessary.
+def init_pool_generator(gens, random_seed=None):
+  global _SHARED_SEQUENCES
+  _SHARED_SEQUENCES = gens
 
-    Should be called by the same thread which called `start()`.
+  if random_seed is not None:
+    ident = multiprocessing.current_process().ident
+    np.random.seed(random_seed + ident)
 
-    Arguments:
-        timeout: maximum time to wait on `thread.join()`
-    """
-    self.stop_signal.set()
-    with self.queue.mutex:
-      self.queue.queue.clear()
-      self.queue.unfinished_tasks = 0
-      self.queue.not_full.notify()
-    self.run_thread.join(timeout)
-    _SHARED_SEQUENCES[self.uid] = None
+
+def next_sample(uid):
+  """Gets the next value from the generator `uid`.
+
+  To allow multiple generators to be used at the same time, we use `uid` to
+  get a specific one. A single generator would cause the validation to
+  overwrite the training generator.
+
+  Arguments:
+      uid: int, generator identifier
+
+  Returns:
+      The next value of generator `uid`.
+  """
+  return six.next(_SHARED_SEQUENCES[uid])
 
 
 @tf_export('keras.utils.GeneratorEnqueuer')
@@ -658,145 +697,36 @@ class GeneratorEnqueuer(SequenceEnqueuer):
           will be incremented by one for each worker.
   """
 
-  def __init__(self,
-               generator,
+  def __init__(self, sequence,
                use_multiprocessing=False,
-               wait_time=0.05,
-               seed=None):
-    self.wait_time = wait_time
-    self._generator = generator
-    if os.name is 'nt' and use_multiprocessing is True:
-      # On Windows, avoid **SYSTEMATIC** error in `multiprocessing`:
-      # `TypeError: can't pickle generator objects`
-      # => Suggest multithreading instead of multiprocessing on Windows
-      raise ValueError('Using a generator with `use_multiprocessing=True`'
-                       ' is not supported on Windows (no marshalling of'
-                       ' generators across process boundaries). Instead,'
-                       ' use single thread/process or multithreading.')
-    else:
-      self._use_multiprocessing = use_multiprocessing
-    self._threads = []
-    self._stop_event = None
-    self._manager = None
-    self.queue = None
-    self.seed = seed
-
-  def _data_generator_task(self):
-    if self._use_multiprocessing is False:
-      while not self._stop_event.is_set():
-        with self.genlock:
-          try:
-            if (self.queue is not None and
-                self.queue.qsize() < self.max_queue_size):
-              # On all OSes, avoid **SYSTEMATIC** error
-              # in multithreading mode:
-              # `ValueError: generator already executing`
-              # => Serialize calls to
-              # infinite iterator/generator's next() function
-              generator_output = next(self._generator)
-              self.queue.put((True, generator_output))
-            else:
-              time.sleep(self.wait_time)
-          except StopIteration:
-            break
-          except Exception as e:  # pylint: disable=broad-except
-            # Can't pickle tracebacks.
-            # As a compromise, print the traceback and pickle None instead.
-            if not hasattr(e, '__traceback__'):
-              setattr(e, '__traceback__', sys.exc_info()[2])
-            self.queue.put((False, e))
-            self._stop_event.set()
-            break
-    else:
-      while not self._stop_event.is_set():
-        try:
-          if (self.queue is not None and
-              self.queue.qsize() < self.max_queue_size):
-            generator_output = next(self._generator)
-            self.queue.put((True, generator_output))
-          else:
-            time.sleep(self.wait_time)
-        except StopIteration:
-          break
-        except Exception as e:  # pylint: disable=broad-except
-          # Can't pickle tracebacks.
-          # As a compromise, print the traceback and pickle None instead.
-          traceback.print_exc()
-          setattr(e, '__traceback__', None)
-          self.queue.put((False, e))
-          self._stop_event.set()
-          break
+               random_seed=None):
+    super(GeneratorEnqueuer, self).__init__(sequence, use_multiprocessing)
+    self.random_seed = random_seed
 
-  def start(self, workers=1, max_queue_size=10):
-    """Kicks off threads which add data from the generator into the queue.
+  def _get_executor_init(self, workers):
+    """Gets the Pool initializer for multiprocessing.
 
     Arguments:
-        workers: number of worker threads
-        max_queue_size: queue size
-            (when full, threads could block on `put()`)
-    """
-    try:
-      self.max_queue_size = max_queue_size
-      if self._use_multiprocessing:
-        self._manager = multiprocessing.Manager()
-        self.queue = self._manager.Queue(maxsize=max_queue_size)
-        self._stop_event = multiprocessing.Event()
-      else:
-        # On all OSes, avoid **SYSTEMATIC** error in multithreading mode:
-        # `ValueError: generator already executing`
-        # => Serialize calls to infinite iterator/generator's next() function
-        self.genlock = threading.Lock()
-        self.queue = queue.Queue(maxsize=max_queue_size)
-        self._stop_event = threading.Event()
-
-      for _ in range(workers):
-        if self._use_multiprocessing:
-          # Reset random seed else all children processes
-          # share the same seed
-          np.random.seed(self.seed)
-          thread = multiprocessing.Process(target=self._data_generator_task)
-          thread.daemon = True
-          if self.seed is not None:
-            self.seed += 1
-        else:
-          thread = threading.Thread(target=self._data_generator_task)
-        self._threads.append(thread)
-        thread.start()
-    except:
-      self.stop()
-      raise
-
-  def is_running(self):
-    return self._stop_event is not None and not self._stop_event.is_set()
-
-  def stop(self, timeout=None):
-    """Stops running threads and wait for them to exit, if necessary.
-
-    Should be called by the same thread which called `start()`.
+      workers: Number of works.
 
-    Arguments:
-        timeout: maximum time to wait on `thread.join()`.
+    Returns:
+        A Function to initialize the pool
     """
-    if self.is_running():
-      self._stop_event.set()
-
-    for thread in self._threads:
-      if self._use_multiprocessing:
-        if thread.is_alive():
-          thread.terminate()
-      else:
-        # The thread.is_alive() test is subject to a race condition:
-        # the thread could terminate right after the test and before the
-        # join, rendering this test meaningless -> Call thread.join()
-        # always, which is ok no matter what the status of the thread.
-        thread.join(timeout)
-
-    if self._manager:
-      self._manager.shutdown()
+    def pool_fn(seqs):
+      return multiprocessing.Pool(workers,
+                                  initializer=init_pool_generator,
+                                  initargs=(seqs, self.random_seed))
+    return pool_fn
 
-    self._threads = []
-    self._stop_event = None
-    self.queue = None
+  def _run(self):
+    """Submits request to the executor and queue the `Future` objects."""
+    self._send_sequence()  # Share the initial generator
+    with closing(self.executor_fn(_SHARED_SEQUENCES)) as executor:
+      while True:
+        if self.stop_signal.is_set():
+          return
+        self.queue.put(
+            executor.apply_async(next_sample, (self.uid,)), block=True)
 
   def get(self):
     """Creates a generator to extract data from the queue.
@@ -808,24 +738,30 @@ class GeneratorEnqueuer(SequenceEnqueuer):
         `(inputs, targets)` or
         `(inputs, targets, sample_weights)`.
     """
-    while self.is_running():
-      if not self.queue.empty():
-        success, value = self.queue.get()
-        # Rethrow any exceptions found in the queue
-        if not success:
-          six.reraise(value.__class__, value, value.__traceback__)
-        # Yield regular values
-        if value is not None:
-          yield value
-      else:
-        all_finished = all([not thread.is_alive() for thread in self._threads])
-        if all_finished and self.queue.empty():
-          raise StopIteration()
-        else:
-          time.sleep(self.wait_time)
-
-    # Make sure to rethrow the first exception in the queue, if any
-    while not self.queue.empty():
-      success, value = self.queue.get()
-      if not success:
-        six.reraise(value.__class__, value, value.__traceback__)
+    try:
+      while self.is_running():
+        inputs = self.queue.get(block=True).get()
+        self.queue.task_done()
+        if inputs is not None:
+          yield inputs
+    except StopIteration:
+      # Special case for finite generators
+      last_ones = []
+      while self.queue.qsize() > 0:
+        last_ones.append(self.queue.get(block=True))
+      # Wait for them to complete
+      for f in last_ones:
+        f.wait()
+      # Keep the good ones
+      last_ones = [future.get() for future in last_ones if future.successful()]
+      for inputs in last_ones:
+        if inputs is not None:
+          yield inputs
+    except Exception as e:  # pylint: disable=broad-except
+      self.stop()
+      if 'generator already executing' in str(e):
+        raise RuntimeError(
+            'Your generator is NOT thread-safe. '
+            'Keras requires a thread-safe generator when '
+            '`use_multiprocessing=False, workers > 1`. ')
+      six.reraise(*sys.exc_info())
diff --git a/tensorflow/python/keras/utils/data_utils_test.py b/tensorflow/python/keras/utils/data_utils_test.py
index 395df7e0e786d510e785c3ed099905a91e09a149..cc95803d6d6b7f8674d2fc944496b0e74d63f765 100644
--- a/tensorflow/python/keras/utils/data_utils_test.py
+++ b/tensorflow/python/keras/utils/data_utils_test.py
@@ -228,7 +228,7 @@ class TestEnqueuers(test.TestCase):
         FaultSequence(), use_multiprocessing=False)
     enqueuer.start(3, 10)
     gen_output = enqueuer.get()
-    with self.assertRaises(StopIteration):
+    with self.assertRaises(IndexError):
       next(gen_output)
 
   def test_ordered_enqueuer_fail_processes(self):
@@ -236,7 +236,7 @@ class TestEnqueuers(test.TestCase):
         FaultSequence(), use_multiprocessing=True)
     enqueuer.start(3, 10)
     gen_output = enqueuer.get()
-    with self.assertRaises(StopIteration):
+    with self.assertRaises(IndexError):
       next(gen_output)
 
   def test_on_epoch_end_processes(self):
diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py
index 2e56fa2dc5474678ba3ef765bc148f09c4665ec0..c331ce430bd761ca4beb2d6f8ab2e314e2e3178c 100644
--- a/tensorflow/python/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/utils/generic_utils.py
@@ -125,61 +125,80 @@ def get_custom_objects():
   return _GLOBAL_CUSTOM_OBJECTS
 
 
+def serialize_keras_class_and_config(cls_name, cls_config):
+  """Returns the serialization of the class with the given config."""
+  return {'class_name': cls_name, 'config': cls_config}
+
+
 @tf_export('keras.utils.serialize_keras_object')
 def serialize_keras_object(instance):
   _, instance = tf_decorator.unwrap(instance)
   if instance is None:
     return None
   if hasattr(instance, 'get_config'):
-    return {
-        'class_name': instance.__class__.__name__,
-        'config': instance.get_config()
-    }
+    return serialize_keras_class_and_config(instance.__class__.__name__,
+                                            instance.get_config())
   if hasattr(instance, '__name__'):
     return instance.__name__
   else:
     raise ValueError('Cannot serialize', instance)
 
 
+def class_and_config_for_serialized_keras_object(
+    config,
+    module_objects=None,
+    custom_objects=None,
+    printable_module_name='object'):
+  """Returns the class name and config for a serialized keras object."""
+  if (not isinstance(config, dict) or 'class_name' not in config or
+      'config' not in config):
+    raise ValueError('Improper config format: ' + str(config))
+
+  class_name = config['class_name']
+  if custom_objects and class_name in custom_objects:
+    cls = custom_objects[class_name]
+  elif class_name in _GLOBAL_CUSTOM_OBJECTS:
+    cls = _GLOBAL_CUSTOM_OBJECTS[class_name]
+  else:
+    module_objects = module_objects or {}
+    cls = module_objects.get(class_name)
+    if cls is None:
+      raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
+  return (cls, config['config'])
+
+
 @tf_export('keras.utils.deserialize_keras_object')
 def deserialize_keras_object(identifier,
                              module_objects=None,
                              custom_objects=None,
                              printable_module_name='object'):
+  if identifier is None:
+    return None
   if isinstance(identifier, dict):
     # In this case we are dealing with a Keras config dictionary.
     config = identifier
-    if 'class_name' not in config or 'config' not in config:
-      raise ValueError('Improper config format: ' + str(config))
-    class_name = config['class_name']
-    if custom_objects and class_name in custom_objects:
-      cls = custom_objects[class_name]
-    elif class_name in _GLOBAL_CUSTOM_OBJECTS:
-      cls = _GLOBAL_CUSTOM_OBJECTS[class_name]
-    else:
-      module_objects = module_objects or {}
-      cls = module_objects.get(class_name)
-      if cls is None:
-        raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
+    (cls, cls_config) = class_and_config_for_serialized_keras_object(
+        config, module_objects, custom_objects, printable_module_name)
+
     if hasattr(cls, 'from_config'):
       arg_spec = tf_inspect.getfullargspec(cls.from_config)
       custom_objects = custom_objects or {}
 
       if 'custom_objects' in arg_spec.args:
         return cls.from_config(
-            config['config'],
+            cls_config,
             custom_objects=dict(
                 list(_GLOBAL_CUSTOM_OBJECTS.items()) +
                 list(custom_objects.items())))
       with CustomObjectScope(custom_objects):
-        return cls.from_config(config['config'])
+        return cls.from_config(cls_config)
     else:
       # Then `cls` may be a function returning a class.
       # in this case by convention `config` holds
       # the kwargs of the function.
       custom_objects = custom_objects or {}
       with CustomObjectScope(custom_objects):
-        return cls(**config['config'])
+        return cls(**cls_config)
   elif isinstance(identifier, six.string_types):
     function_name = identifier
     if custom_objects and function_name in custom_objects:
@@ -300,14 +319,16 @@ class Progbar(object):
           will be displayed as-is. All others will be averaged
           by the progbar before display.
       interval: Minimum visual progress update interval (in seconds).
+      unit_name: Display name for step counts (usually "step" or "sample").
   """
 
   def __init__(self, target, width=30, verbose=1, interval=0.05,
-               stateful_metrics=None):
+               stateful_metrics=None, unit_name='step'):
     self.target = target
     self.width = width
     self.verbose = verbose
     self.interval = interval
+    self.unit_name = unit_name
     if stateful_metrics:
       self.stateful_metrics = set(stateful_metrics)
     else:
@@ -406,12 +427,12 @@ class Progbar(object):
 
         info = ' - ETA: %s' % eta_format
       else:
-        if time_per_unit >= 1:
-          info += ' %.0fs/step' % time_per_unit
+        if time_per_unit >= 1 or time_per_unit == 0:
+          info += ' %.0fs/%s' % (time_per_unit, self.unit_name)
         elif time_per_unit >= 1e-3:
-          info += ' %.0fms/step' % (time_per_unit * 1e3)
+          info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name)
         else:
-          info += ' %.0fus/step' % (time_per_unit * 1e6)
+          info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name)
 
       for k in self._values_order:
         info += ' - %s:' % k
diff --git a/tensorflow/python/keras/utils/generic_utils_test.py b/tensorflow/python/keras/utils/generic_utils_test.py
index 87bc19eb37d15d35bb8ad0f5d086404f9c4f55ca..ead4beee1cbeb7e285ad37622092953827375bdd 100644
--- a/tensorflow/python/keras/utils/generic_utils_test.py
+++ b/tensorflow/python/keras/utils/generic_utils_test.py
@@ -71,5 +71,15 @@ class TestCustomObjectScope(test.TestCase):
       self.assertEqual(cl.__class__, CustomClass)
 
 
+class SerializeKerasObjectTest(test.TestCase):
+
+  def test_serialize_none(self):
+    serialized = keras.utils.generic_utils.serialize_keras_object(None)
+    self.assertEqual(serialized, None)
+    deserialized = keras.utils.generic_utils.deserialize_keras_object(
+        serialized)
+    self.assertEqual(deserialized, None)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py
index 158a9a5e76d214eef1f853f964aafe00b030b112..60677be73512c921f9fbbc96911655f28de29638 100644
--- a/tensorflow/python/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/utils/layer_utils.py
@@ -77,7 +77,7 @@ def count_params(weights):
   Returns:
       The total number of scalars composing the weights
   """
-  return int(np.sum([np.prod(p.get_shape().as_list()) for p in set(weights)]))
+  return int(sum(np.prod(p.get_shape().as_list()) for p in set(weights)))
 
 
 def print_summary(model, line_length=None, positions=None, print_fn=None):
diff --git a/tensorflow/python/keras/utils/losses_utils.py b/tensorflow/python/keras/utils/losses_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc4b4ac7dfd0966af5f4c21d4b78ba8ecd6bf46a
--- /dev/null
+++ b/tensorflow/python/keras/utils/losses_utils.py
@@ -0,0 +1,189 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utilities related to loss functions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend as K
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import confusion_matrix
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.ops.losses import losses_impl
+
+
+def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
+  """Squeeze or expand last dimension if needed.
+
+  1. Squeezes last dim of `y_pred` or `y_true` if their rank differs by 1
+  (using `confusion_matrix.remove_squeezable_dimensions`).
+  2. Squeezes or expands last dim of `sample_weight` if its rank differs by 1
+  from the new rank of `y_pred`.
+  If `sample_weight` is scalar, it is kept scalar.
+
+  This will use static shape if available. Otherwise, it will add graph
+  operations, which could result in a performance hit.
+
+  Args:
+    y_pred: Predicted values, a `Tensor` of arbitrary dimensions.
+    y_true: Optional label `Tensor` whose dimensions match `y_pred`.
+    sample_weight: Optional weight scalar or `Tensor` whose dimensions match
+      `y_pred`.
+
+  Returns:
+    Tuple of `y_pred`, `y_true` and `sample_weight`. Each of them possibly has
+    the last dimension squeezed,
+    `sample_weight` could be extended by one dimension.
+  """
+  if y_true is not None:
+    # squeeze last dim of `y_pred` or `y_true` if their rank differs by 1
+    y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
+        y_true, y_pred)
+
+  if sample_weight is None:
+    return y_pred, y_true, None
+
+  sample_weight = ops.convert_to_tensor(sample_weight)
+  weights_shape = sample_weight.get_shape()
+  weights_rank = weights_shape.ndims
+  if weights_rank == 0:  # If weights is scalar, do nothing.
+    return y_pred, y_true, sample_weight
+
+  y_pred_shape = y_pred.get_shape()
+  y_pred_rank = y_pred_shape.ndims
+  if (y_pred_rank is not None) and (weights_rank is not None):
+    # Use static rank.
+    if weights_rank - y_pred_rank == 1:
+      sample_weight = array_ops.squeeze(sample_weight, [-1])
+    elif y_pred_rank - weights_rank == 1:
+      sample_weight = array_ops.expand_dims(sample_weight, [-1])
+    return y_pred, y_true, sample_weight
+
+  # Use dynamic rank.
+  weights_rank_tensor = array_ops.rank(sample_weight)
+  rank_diff = weights_rank_tensor - array_ops.rank(y_pred)
+  maybe_squeeze_weights = lambda: array_ops.squeeze(sample_weight, [-1])
+
+  def _maybe_expand_weights():
+    return control_flow_ops.cond(
+        math_ops.equal(rank_diff,
+                       -1), lambda: array_ops.expand_dims(sample_weight, [-1]),
+        lambda: sample_weight)
+
+  def _maybe_adjust_weights():
+    return control_flow_ops.cond(
+        math_ops.equal(rank_diff, 1), maybe_squeeze_weights,
+        _maybe_expand_weights)
+
+  # squeeze or expand last dim of `sample_weight` if its rank differs by 1
+  # from the new rank of `y_pred`.
+  sample_weight = control_flow_ops.cond(
+      math_ops.equal(weights_rank_tensor, 0), lambda: sample_weight,
+      _maybe_adjust_weights)
+  return y_pred, y_true, sample_weight
+
+
+def _safe_mean(losses, num_present):
+  """Computes a safe mean of the losses.
+
+  Args:
+    losses: `Tensor` whose elements contain individual loss measurements.
+    num_present: The number of measurable elements in `losses`.
+
+  Returns:
+    A scalar representing the mean of `losses`. If `num_present` is zero,
+      then zero is returned.
+  """
+  total_loss = math_ops.reduce_sum(losses)
+  return math_ops.div_no_nan(total_loss, num_present, name='value')
+
+
+def _num_elements(losses):
+  """Computes the number of elements in `losses` tensor."""
+  with ops.name_scope(None, 'num_elements', values=[losses]) as scope:
+    return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
+
+
+def _reduce_weighted_loss(
+    weighted_losses, reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE):
+  """Reduces the individual weighted loss measurements."""
+  if reduction == losses_impl.ReductionV2.NONE:
+    loss = weighted_losses
+  else:
+    loss = math_ops.reduce_sum(weighted_losses)
+    if reduction == losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE:
+      loss = _safe_mean(loss, _num_elements(weighted_losses))
+  return loss
+
+
+def compute_weighted_loss(losses,
+                          sample_weight=None,
+                          reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+                          name=None):
+  """Computes the weighted loss.
+
+  Args:
+    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
+    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
+      `losses`, or be broadcastable to `losses`.
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+
+  Raises:
+    ValueError: If the shape of `sample_weight` is not compatible with `losses`.
+
+  Returns:
+    Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
+    `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
+  """
+  losses_impl.ReductionV2.validate(reduction)
+  if sample_weight is None:
+    sample_weight = 1.0
+  with ops.name_scope(name, 'weighted_loss', (losses, sample_weight)):
+    # Save the `reduction` argument for loss normalization when distributing
+    # to multiple replicas.
+    # TODO(josh11b): Associate it with the returned op for more precision.
+    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
+
+    # Update dimensions of `sample_weight` to match with `losses` if possible.
+    losses, _, sample_weight = squeeze_or_expand_dimensions(
+        losses, None, sample_weight)
+    losses = ops.convert_to_tensor(losses)
+    input_dtype = losses.dtype
+    losses = math_ops.to_float(losses)
+    sample_weight = math_ops.to_float(sample_weight)
+
+    try:
+      # Broadcast weights if possible.
+      sample_weight = weights_broadcast_ops.broadcast_weights(
+          sample_weight, losses)
+    except ValueError:
+      # Reduce values to same ndim as weight array.
+      ndim = K.ndim(losses)
+      weight_ndim = K.ndim(sample_weight)
+      losses = K.mean(losses, axis=list(range(weight_ndim, ndim)))
+
+    sample_weight.get_shape().assert_is_compatible_with(losses.get_shape())
+    weighted_losses = math_ops.multiply(losses, sample_weight)
+    # Apply reduction function to the individual weighted losses.
+    loss = _reduce_weighted_loss(weighted_losses, reduction)
+    # Convert the result back to the input type.
+    loss = math_ops.cast(loss, input_dtype)
+    return loss
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils.py b/tensorflow/python/keras/utils/multi_gpu_utils.py
index e1c49bc85221aa94241ed746c2063aadf881f3cd..04b2ea8fe314afaf935bc81bfa62e0c0f1424aa7 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils.py
@@ -244,9 +244,24 @@ def multi_gpu_model(model, gpus, cpu_merge=True, cpu_relocation=False):
         for o in range(len(outputs)):
           all_outputs[o].append(outputs[o])
 
+  # Deduplicate output names to handle Siamese networks.
+  occurrences = {}
+  for n in model.output_names:
+    if n not in occurrences:
+      occurrences[n] = 1
+    else:
+      occurrences[n] += 1
+  conflict_counter = {n: 0 for n, count in occurrences.items() if count > 1}
+  output_names = []
+  for n in model.output_names:
+    if n in conflict_counter:
+      conflict_counter[n] += 1
+      n += '_%d' % conflict_counter[n]
+    output_names.append(n)
+
   # Merge outputs under expected scope.
   with ops.device('/cpu:0' if cpu_merge else '/gpu:%d' % target_gpu_ids[0]):
     merged = []
-    for name, outputs in zip(model.output_names, all_outputs):
+    for name, outputs in zip(output_names, all_outputs):
       merged.append(concatenate(outputs, axis=0, name=name))
     return Model(model.inputs, merged)
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 3d0351a11f44d4c047a38aafb4267497c75e2425..8c1abd632484273a01fd99cbd72ee73b66e46f27 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -158,7 +158,7 @@ class TestMultiGPUModel(test.TestCase):
       dataset = data.Dataset.from_tensor_slices((x_train, y_train))
       dataset = dataset.repeat()
       dataset = dataset.batch(4)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = data.make_one_shot_iterator(dataset)
 
       inputs, targets = iterator.get_next()
 
@@ -198,5 +198,31 @@ class TestMultiGPUModel(test.TestCase):
       parallel_model.compile(loss='mean_squared_error', optimizer='adam')
       parallel_model.train_on_batch(x, y)
 
+  def test_multi_gpu_with_siamese_network(self):
+    gpus = 2
+
+    if not check_if_compatible_devices(gpus=gpus):
+      return
+
+    with self.cached_session():
+      input_shape = (3,)
+      nested_model = keras.models.Sequential([
+          keras.layers.Dense(32, input_shape=input_shape),
+          keras.layers.Dense(1)
+      ], name='nested')
+
+      input1 = keras.Input(input_shape)
+      input2 = keras.Input(input_shape)
+      score1 = nested_model(input1)
+      score2 = nested_model(input2)
+      score_sum = keras.layers.Add(name='add')([score1, score2])
+
+      siamese = keras.models.Model(inputs=[input1, input2],
+                                   outputs=[score_sum, score1, score2],
+                                   name='siamese')
+      parallel_siamese = keras.utils.multi_gpu_model(siamese, gpus)
+      self.assertEqual(parallel_siamese.output_names,
+                       ['add', 'nested_1', 'nested_2'])
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/utils/np_utils.py b/tensorflow/python/keras/utils/np_utils.py
index c24e87308bee20e4ed978514699d4beb2ee4fbb9..3763999bff4f6c920e1fadeb98e964fe62f8412c 100644
--- a/tensorflow/python/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/utils/np_utils.py
@@ -22,7 +22,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.utils.to_categorical')
-def to_categorical(y, num_classes=None):
+def to_categorical(y, num_classes=None, dtype='float32'):
   """Converts a class vector (integers) to binary class matrix.
 
   E.g. for use with categorical_crossentropy.
@@ -31,6 +31,7 @@ def to_categorical(y, num_classes=None):
       y: class vector to be converted into a matrix
           (integers from 0 to num_classes).
       num_classes: total number of classes.
+      dtype: The data type expected by the input. Default: `'float32'`.
 
   Returns:
       A binary matrix representation of the input. The classes axis is placed
@@ -44,7 +45,7 @@ def to_categorical(y, num_classes=None):
   if not num_classes:
     num_classes = np.max(y) + 1
   n = y.shape[0]
-  categorical = np.zeros((n, num_classes), dtype=np.float32)
+  categorical = np.zeros((n, num_classes), dtype=dtype)
   categorical[np.arange(n), y] = 1
   output_shape = input_shape + (num_classes,)
   categorical = np.reshape(categorical, output_shape)
diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py
index cfdb3de2aa7d9f5d39eb61cb21ec2505365fc6f7..7b4c9e7239e2f097e0351b160bd7520ee587a8b3 100644
--- a/tensorflow/python/keras/utils/tf_utils.py
+++ b/tensorflow/python/keras/utils/tf_utils.py
@@ -17,8 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond as smart_module
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
@@ -153,3 +155,64 @@ def shape_type_conversion(fn):
       return tensor_shape.TensorShape(output_shape)
 
   return wrapper
+
+
+def are_all_symbolic_tensors(tensors):
+  return all(is_symbolic_tensor(tensor) for tensor in tensors)
+
+
+_user_convertible_tensor_types = set()
+
+
+def is_symbolic_tensor(tensor):
+  """Returns whether a tensor is symbolic (from a TF graph) or an eager tensor.
+
+  A Variable can be seen as either: it is considered symbolic
+  when we are in a graph scope, and eager when we are in an eager scope.
+
+  Arguments:
+    tensor: A tensor instance to test.
+
+  Returns:
+    True for symbolic tensors, False for eager tensors.
+  """
+  if isinstance(tensor, variables.Variable):
+    return not context.executing_eagerly()
+  if isinstance(tensor, (ops.Tensor, sparse_tensor.SparseTensor)):
+    return hasattr(tensor, 'graph')
+  if isinstance(tensor, tuple(_user_convertible_tensor_types)):
+    return hasattr(ops.convert_to_tensor(tensor), 'graph')
+  return False
+
+
+def register_symbolic_tensor_type(cls):
+  """Allows users to specify types regarded as symbolic `Tensor`s.
+
+  Used in conjunction with `tf.register_tensor_conversion_function`, calling
+  `tf.keras.utils.register_symbolic_tensor_type(cls)` allows non-`Tensor`
+  objects to be plumbed through Keras layers.
+
+  Example:
+
+  ```python
+  # One-time setup.
+  class Foo(object):
+    def __init__(self, input_):
+      self._input = input_
+    def value(self):
+      return tf.constant(42.)
+
+  tf.register_tensor_conversion_function(
+      Foo, lambda x, *args, **kwargs: x.value())
+
+  tf.keras.utils.register_symbolic_tensor_type(Foo)
+
+  # User-land.
+  layer = tf.keras.layers.Lambda(lambda input_: Foo(input_))
+  ```
+
+  Arguments:
+    cls: A `class` type which shall be regarded as a symbolic `Tensor`.
+  """
+  global _user_convertible_tensor_types
+  _user_convertible_tensor_types.add(cls)
diff --git a/tensorflow/python/keras/utils/tf_utils_test.py b/tensorflow/python/keras/utils/tf_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9833a492993feb3a989d09160919fbf85c3a21e7
--- /dev/null
+++ b/tensorflow/python/keras/utils/tf_utils_test.py
@@ -0,0 +1,134 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras TF utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TestIsSymbolicTensor(test.TestCase):
+
+  def test_default_behavior(self):
+    if context.executing_eagerly():
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          variables.Variable(name='blah', initial_value=0.)))
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          ops.convert_to_tensor(0.)))
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
+    else:
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          variables.Variable(name='blah', initial_value=0.)))
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          ops.convert_to_tensor(0.)))
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
+
+  def test_works_with_registered(self):
+
+    class CustomClass(object):
+
+      def value(self):
+        return ops.convert_to_tensor(42.)
+
+    ops.register_tensor_conversion_function(
+        CustomClass, lambda value, **_: value.value())
+
+    tf_utils.register_symbolic_tensor_type(CustomClass)
+
+    if context.executing_eagerly():
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          variables.Variable(name='blah', initial_value=0.)))
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          ops.convert_to_tensor(0.)))
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
+      self.assertFalse(tf_utils.is_symbolic_tensor(CustomClass()))
+    else:
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          variables.Variable(name='blah', initial_value=0.)))
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          ops.convert_to_tensor(0.)))
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
+      self.assertTrue(tf_utils.is_symbolic_tensor(CustomClass()))
+
+  def test_enables_nontensor_plumbing(self):
+    # Setup.
+
+    class Foo(object):
+
+      def __init__(self, input_):
+        self._input = input_
+        self.value = ops.convert_to_tensor(42.)
+
+    ops.register_tensor_conversion_function(
+        Foo, lambda x, *args, **kwargs: x.value)
+    tf_utils.register_symbolic_tensor_type(Foo)
+
+    class PlumbingLayer(keras.layers.Lambda):
+
+      def __init__(self, fn, **kwargs):
+        def _fn(*fargs, **fkwargs):
+          d = fn(*fargs, **fkwargs)
+          x = ops.convert_to_tensor(d)
+          d.shape = x.shape
+          d.get_shape = x.get_shape
+          return d, x
+        super(PlumbingLayer, self).__init__(_fn, **kwargs)
+        self._enter_dunder_call = False
+
+      def __call__(self, inputs, *args, **kwargs):
+        self._enter_dunder_call = True
+        d, _ = super(PlumbingLayer, self).__call__(inputs, *args, **kwargs)
+        self._enter_dunder_call = False
+        return d
+
+      def call(self, inputs, *args, **kwargs):
+        d, v = super(PlumbingLayer, self).call(inputs, *args, **kwargs)
+        if self._enter_dunder_call:
+          return d, v
+        return d
+
+    # User-land.
+    model = keras.Sequential([
+        keras.layers.InputLayer([]),
+        PlumbingLayer(Foo),  # Makes a `Foo` object.
+    ])
+    # Let's ensure Keras graph history is preserved by composing the models.
+    model = keras.Model(model.inputs, model(model.outputs))
+    # Now we instantiate the model and verify we have a `Foo` object, not a
+    # `Tensor`.
+    y = model(ops.convert_to_tensor(7.))
+    self.assertIsInstance(y, Foo)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index e055ef1c1b780b04bf8031e3db58ca6d6e7b0a03..df8c14970a0af7e2b1bd19162b344ff4329d385f 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -118,10 +118,13 @@ cuda_py_test(
     size = "small",
     srcs = ["list_ops_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
+        "//tensorflow/python:gradients_impl",
         "//tensorflow/python:list_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/eager:context",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
@@ -267,9 +270,9 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "ctc_loss_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["ctc_loss_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -607,7 +610,7 @@ tf_py_test(
 
 tf_py_test(
     name = "matrix_exponential_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["matrix_exponential_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -641,7 +644,7 @@ cuda_py_test(
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
     ],
-    tags = ["notap"],
+    tags = ["optonly"],
 )
 
 cuda_py_test(
@@ -658,6 +661,18 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "matrix_square_root_op_test",
+    size = "medium",
+    srcs = ["matrix_square_root_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+    ],
+)
+
 cuda_py_test(
     name = "matrix_solve_op_test",
     size = "medium",
@@ -687,6 +702,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["parameterized_truncated_normal_op_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
@@ -815,6 +831,7 @@ tf_py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:io_ops_gen",
     ],
@@ -1052,9 +1069,9 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "summary_ops_test",
+    name = "summary_v1_ops_test",
     size = "small",
-    srcs = ["summary_ops_test.py"],
+    srcs = ["summary_v1_ops_test.py"],
     additional_deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
@@ -1065,9 +1082,9 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "summary_tensor_op_test",
+    name = "summary_v1_tensor_op_test",
     size = "small",
-    srcs = ["summary_tensor_op_test.py"],
+    srcs = ["summary_v1_tensor_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
         "@six_archive//:six",
@@ -1076,7 +1093,7 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:summary_ops",
+        "//tensorflow/python:summary",
     ],
 )
 
@@ -1140,6 +1157,45 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "unicode_encode_op_test",
+    size = "small",
+    srcs = ["unicode_encode_op_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/ops/ragged:ragged_factory_ops",
+        "//tensorflow/python/ops/ragged:ragged_string_ops",
+    ],
+)
+
+tf_py_test(
+    name = "unicode_transcode_op_test",
+    size = "small",
+    srcs = ["unicode_transcode_op_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
+tf_py_test(
+    name = "unicode_decode_op_test",
+    size = "small",
+    srcs = ["unicode_decode_op_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 tf_py_test(
     name = "unique_op_test",
     size = "small",
@@ -1172,6 +1228,7 @@ tf_py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:wrap_function",
     ],
     tags = ["no_windows"],
 )
@@ -1323,6 +1380,7 @@ cuda_py_test(
         "//tensorflow/python:test_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
     ],
     shard_count = 10,
     tags = [
@@ -1434,6 +1492,7 @@ cuda_py_test(
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:math_ops",
@@ -1479,9 +1538,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "control_flow_ops_py_test",
-    # TODO(b/70473603): change this back to "small" once the C API is
-    # permanently enabled
-    size = "large",
+    size = "small",
     srcs = ["control_flow_ops_py_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1515,6 +1572,8 @@ cuda_py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python:while_v2",
     ],
+    shard_count = 16,
+    tags = ["no_gpu"],  # TODO(b/117928656)
 )
 
 tf_py_test(
@@ -1530,6 +1589,21 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "control_flow_util_v2_test",
+    size = "small",
+    srcs = ["control_flow_util_v2_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:cond_v2",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_util_v2",
+        "//tensorflow/python:while_v2",
+    ],
+    tags = ["no_gpu"],  # TODO(b/117796385): runs out of memory
+)
+
 cuda_py_test(
     name = "conv1d_test",
     size = "small",
@@ -1706,9 +1780,11 @@ cuda_py_test(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python:while_v2",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
     grpc_enabled = True,
+    shard_count = 2,
     tags = ["no_windows"],
 )
 
@@ -1785,6 +1861,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["linalg_ops_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -1811,6 +1888,22 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "lu_op_test",
+    size = "small",
+    srcs = ["lu_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/ops/linalg",
+    ],
+)
+
 cuda_py_test(
     name = "manip_ops_test",
     size = "small",
@@ -1996,12 +2089,13 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python:tf2",
     ],
 )
 
@@ -2291,9 +2385,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "summary_audio_op_test",
+    name = "summary_v1_audio_op_test",
     size = "small",
-    srcs = ["summary_audio_op_test.py"],
+    srcs = ["summary_v1_audio_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
@@ -2304,9 +2398,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "summary_image_op_test",
+    name = "summary_v1_image_op_test",
     size = "small",
-    srcs = ["summary_image_op_test.py"],
+    srcs = ["summary_v1_image_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
@@ -2340,6 +2434,8 @@ cuda_py_test(
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python:cond_v2",
+        "//tensorflow/python:while_v2",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
     ],
@@ -2532,6 +2628,8 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = [
+        # TODO(b/118887316): Re-enable this test in Kokoro.
+        "no_oss",
         "optonly",  # times out
     ],
 )
@@ -2549,6 +2647,8 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    # TODO(b/118842098): Re-enable this test in Kokoro.
+    tags = ["no_oss"],
 )
 
 tf_py_test(
@@ -2579,34 +2679,6 @@ cuda_py_test(
     tags = ["manual"],
 )
 
-cuda_py_test(
-    name = "dct_ops_test",
-    srcs = ["dct_ops_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:spectral_ops",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-)
-
-cuda_py_test(
-    name = "fft_ops_test",
-    size = "medium",
-    srcs = ["fft_ops_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:spectral_ops",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-    shard_count = 4,
-    tags = ["optonly"],
-)
-
 cuda_py_test(
     name = "pooling_ops_3d_test",
     size = "medium",
@@ -2696,6 +2768,22 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "huge_slice_op_test",
+    size = "medium",
+    srcs = ["huge_slice_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+    tags = [
+        "no_oss",  # Requires 4GB+ RAM
+    ],
+)
+
 cuda_py_test(
     name = "sparse_matmul_op_test",
     size = "medium",
@@ -2837,7 +2925,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
-    shard_count = 20,
+    shard_count = 30,
 )
 
 cuda_py_test(
@@ -3214,7 +3302,10 @@ tf_py_test(
         "//tensorflow/python:platform",
     ],
     data = [":ackermann_op.so"],
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "notap",
+    ],
 )
 
 tf_custom_op_library(
@@ -3233,7 +3324,10 @@ tf_py_test(
         "//tensorflow/python:platform",
     ],
     data = [":duplicate_op.so"],
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "notap",
+    ],
 )
 
 tf_custom_op_library(
@@ -3252,15 +3346,19 @@ tf_py_test(
         "//tensorflow/python:platform",
     ],
     data = [":invalid_op.so"],
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "notap",
+    ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "cond_v2_test",
     size = "medium",
     srcs = ["cond_v2_test.py"],
     additional_deps = [
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:test_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:cond_v2",
         "//tensorflow/python:constant_op",
@@ -3269,10 +3367,11 @@ tf_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
+        "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:while_v2",
     ],
     grpc_enabled = True,
-    tags = ["no_gpu"],  # TODO(b/111656070)
 )
 
 cuda_py_test(
@@ -3289,8 +3388,11 @@ cuda_py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:functional_ops",
         "//tensorflow/python:gradients_impl",
         "//tensorflow/python:list_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:tf_optimizer",
         "//tensorflow/python:while_v2",
     ],
diff --git a/tensorflow/python/kernel_tests/accumulate_n_test.py b/tensorflow/python/kernel_tests/accumulate_n_test.py
index 0bc5268f381193d5de7e4f5a1c3366f1047f3556..5eece9c94137c190331b4c39aea72dc96551d0bb 100644
--- a/tensorflow/python/kernel_tests/accumulate_n_test.py
+++ b/tensorflow/python/kernel_tests/accumulate_n_test.py
@@ -32,40 +32,44 @@ from tensorflow.python.platform import googletest
 class AccumulateNV2Test(test_util.TensorFlowTestCase):
   """Tests of the new, differentiable version of accumulate_n."""
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     np.random.seed(12345)
     x = [np.random.random((1, 2, 3, 4, 5)) - 0.5 for _ in range(5)]
     tf_x = ops.convert_n_to_tensor(x)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllClose(sum(x), math_ops.accumulate_n(tf_x).eval())
       self.assertAllClose(x[0] * 5,
                           math_ops.accumulate_n([tf_x[0]] * 5).eval())
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     np.random.seed(54321)
     x = [np.random.randint(-128, 128, (5, 4, 3, 2, 1)) for _ in range(6)]
     tf_x = ops.convert_n_to_tensor(x)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(sum(x), math_ops.accumulate_n(tf_x).eval())
       self.assertAllEqual(x[0] * 6,
                           math_ops.accumulate_n([tf_x[0]] * 6).eval())
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x0 = array_ops.placeholder(dtype=dtypes_lib.int32, shape=[None])
       acc = math_ops.accumulate_n([x0, x0], shape=[None])
       self.assertAllEqual([2, 4], acc.eval(feed_dict={x0: [1, 2]}))
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     np.random.seed(42)
     for num_inputs in range(1, 10):
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         input_vars = [
             variables.Variable(10.0 * np.random.random())
             for _ in range(0, num_inputs)
         ]
         accum_n = math_ops.accumulate_n(input_vars)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         accum_n_grad = gradients.gradients(accum_n, input_vars)
         self.assertAllEqual(
             np.repeat(1.0, num_inputs),  # d/dx (x + y + ...) = 1
@@ -88,13 +92,13 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       np_val = random_arrays[0]
       for random_array in random_arrays[1:]:
         np_val += random_array
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testZeroArgs(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
         tf_val = math_ops.accumulate_n([])
-        tf_val.eval()
+        self.evaluate(tf_val)
 
   def testWrongShape(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/ackermann_test.py b/tensorflow/python/kernel_tests/ackermann_test.py
index d267e4975272b9df9e189c92695eea2a9500f27b..6c20b19be9e6353e40dedd84db9edda9de8cc827 100644
--- a/tensorflow/python/kernel_tests/ackermann_test.py
+++ b/tensorflow/python/kernel_tests/ackermann_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.framework import load_library
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
 
 
 class AckermannTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     library_filename = os.path.join(resource_loader.get_data_files_path(),
                                     'ackermann_op.so')
diff --git a/tensorflow/python/kernel_tests/aggregate_ops_test.py b/tensorflow/python/kernel_tests/aggregate_ops_test.py
index 0a08c01dad38f9b31b775b25f01db2e4361df552..d9787cc3bf6b6bdbdc917c9d40b8ebdfed9eb3bb 100644
--- a/tensorflow/python/kernel_tests/aggregate_ops_test.py
+++ b/tensorflow/python/kernel_tests/aggregate_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.framework import tensor_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
@@ -39,8 +40,10 @@ class AddNTest(test.TestCase):
 
   def _supported_types(self):
     if test.is_gpu_available():
-      return [dtypes.float16, dtypes.float32, dtypes.float64, dtypes.complex64,
-              dtypes.complex128]
+      return [
+          dtypes.float16, dtypes.float32, dtypes.float64, dtypes.complex64,
+          dtypes.complex128, dtypes.int64
+      ]
     return [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
             dtypes.float16, dtypes.float32, dtypes.float64, dtypes.complex64,
             dtypes.complex128]
@@ -55,19 +58,20 @@ class AddNTest(test.TestCase):
 
   def testAddN(self):
     np.random.seed(12345)
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for dtype in self._supported_types():
         for count in range(1, self._MAX_N + 1):
           data = [self._buildData((2, 2), dtype) for _ in range(count)]
-          actual = sess.run(math_ops.add_n(data))
+          actual = self.evaluate(math_ops.add_n(data))
           expected = np.sum(np.vstack(
               [np.expand_dims(d, 0) for d in data]), axis=0)
           tol = 5e-3 if dtype == dtypes.float16 else 5e-7
           self.assertAllClose(expected, actual, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testUnknownShapes(self):
     np.random.seed(12345)
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for dtype in self._supported_types():
         data = self._buildData((2, 2), dtype)
         for count in range(1, self._MAX_N + 1):
@@ -78,6 +82,7 @@ class AddNTest(test.TestCase):
           tol = 5e-3 if dtype == dtypes.float16 else 5e-7
           self.assertAllClose(expected, actual, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testVariant(self):
 
     def create_constant_variant(value):
@@ -94,7 +99,7 @@ class AddNTest(test.TestCase):
 
     # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
     # copying between CPU and GPU is supported.
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       variant_const_3 = create_constant_variant(3)
       variant_const_4 = create_constant_variant(4)
       variant_const_5 = create_constant_variant(5)
diff --git a/tensorflow/python/kernel_tests/argmax_op_test.py b/tensorflow/python/kernel_tests/argmax_op_test.py
index 127d14c25088462c13352e70295f56581629ffb8..06ec0948c25006c06039bfde9ef9e3e6da760889 100644
--- a/tensorflow/python/kernel_tests/argmax_op_test.py
+++ b/tensorflow/python/kernel_tests/argmax_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -34,17 +35,17 @@ class ArgMaxTest(test.TestCase):
                expected_values,
                use_gpu=False,
                expected_err_re=None):
-    with self.test_session(use_gpu=use_gpu):
+    with self.session(use_gpu=use_gpu):
       ans = method(x, axis=axis)
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         # Defaults to int64 output.
         self.assertEqual(np.int64, tf_ans.dtype)
         self.assertAllEqual(tf_ans, expected_values)
         self.assertShapeEqual(expected_values, ans)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def _testBothArg(self,
                    method,
@@ -77,17 +78,17 @@ class ArgMaxTest(test.TestCase):
   def testFloatInt32Output(self):
     x = np.asarray(100 * np.random.randn(200), dtype=np.float32)
     expected_values = x.argmax()
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ans = math_ops.argmax(x, axis=0, output_type=dtypes.int32)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
       self.assertEqual(np.int32, tf_ans.dtype)
       # The values are equal when comparing int32 to int64 because
       # the values don't have a range that exceeds 32-bit integers.
       self.assertAllEqual(tf_ans, expected_values)
     expected_values = x.argmin()
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ans = math_ops.argmin(x, axis=0, output_type=dtypes.int32)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
       self.assertEqual(np.int32, tf_ans.dtype)
       self.assertAllEqual(tf_ans, expected_values)
 
@@ -110,12 +111,14 @@ class ArgMaxTest(test.TestCase):
             r"Reduction axis 0 is empty in shape \[0\]"):
           op([], 0).eval()
 
+  @test_util.run_deprecated_v1
   def testDefaultAxis(self):
     with self.cached_session():
       for op in math_ops.argmin, math_ops.argmax:
         ans = op([1]).eval()
         self.assertAllEqual(ans, 0)
 
+  @test_util.run_deprecated_v1
   def testOutputEmpty(self):
     with self.cached_session():
       for op in math_ops.argmin, math_ops.argmax:
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index dcc594789ebc669f7906c7e356448a67f0b2c7db..f4c442b7b1932c3ddab0d255f57c3fac5a23954a 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
 import time
 import unittest
 
@@ -24,6 +25,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -31,6 +33,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -45,24 +48,23 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test as test_lib
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
 
   def testNonBatchMatrix(self):
     matrix = [[1, 2, 3], [4, 5, 6]]  # Shape (2, 3)
     expected_transposed = [[1, 4], [2, 5], [3, 6]]  # Shape (3, 2)
-    with self.cached_session():
-      transposed = array_ops.matrix_transpose(matrix)
-      self.assertEqual((3, 2), transposed.get_shape())
-      self.assertAllEqual(expected_transposed, transposed.eval())
+    transposed = array_ops.matrix_transpose(matrix)
+    self.assertEqual((3, 2), transposed.get_shape())
+    self.assertAllEqual(expected_transposed, transposed)
 
   def testConjugate(self):
     m = [[1 + 1j, 2 + 2j, 3 + 3j], [4 + 4j, 5 + 5j, 6 + 6j]]
     expected_transposed = [[1 - 1j, 4 - 4j], [2 - 2j, 5 - 5j], [3 - 3j, 6 - 6j]]
-    with self.cached_session():
-      matrix = ops.convert_to_tensor(m)
-      transposed = array_ops.matrix_transpose(matrix, conjugate=True)
-      self.assertEqual((3, 2), transposed.get_shape())
-      self.assertAllEqual(expected_transposed, transposed.eval())
+    matrix = ops.convert_to_tensor(m)
+    transposed = array_ops.matrix_transpose(matrix, conjugate=True)
+    self.assertEqual((3, 2), transposed.get_shape())
+    self.assertAllEqual(expected_transposed, transposed)
 
   def testBatchMatrix(self):
     matrix_0 = [[1, 2, 3], [4, 5, 6]]
@@ -71,43 +73,44 @@ class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
     matrix_1_t = [[11, 44], [22, 55], [33, 66]]
     batch_matrix = [matrix_0, matrix_1]  # Shape (2, 2, 3)
     expected_transposed = [matrix_0_t, matrix_1_t]  # Shape (2, 3, 2)
-    with self.cached_session():
-      transposed = array_ops.matrix_transpose(batch_matrix)
-      self.assertEqual((2, 3, 2), transposed.get_shape())
-      self.assertAllEqual(expected_transposed, transposed.eval())
+    transposed = array_ops.matrix_transpose(batch_matrix)
+    self.assertEqual((2, 3, 2), transposed.get_shape())
+    self.assertAllEqual(expected_transposed, transposed)
 
   def testNonBatchMatrixDynamicallyDefined(self):
-    matrix = [[1, 2, 3], [4, 5, 6]]  # Shape (2, 3)
+    # needs explicit `constant` because lists are not automatically
+    # converted to sensors when applying `transpose` below
+    matrix = constant_op.constant([[1, 2, 3], [4, 5, 6]])  # Shape (2, 3)
     expected_transposed = [[1, 4], [2, 5], [3, 6]]  # Shape (3, 2)
-    with self.cached_session():
-      matrix_ph = array_ops.placeholder(dtypes.int32)
-      transposed = array_ops.matrix_transpose(matrix_ph)
-      self.assertAllEqual(
-          expected_transposed, transposed.eval(feed_dict={
-              matrix_ph: matrix
-          }))
+    @def_function.function(input_signature=
+                           [tensor_spec.TensorSpec
+                            (shape=None, dtype=dtypes.int32)])
+    def transpose(matrix):
+      self.assertIs(matrix.shape.ndims, None)
+      return array_ops.matrix_transpose(matrix)
+    self.assertAllEqual(expected_transposed, transpose(matrix))
 
   def testBatchMatrixDynamicallyDefined(self):
     matrix_0 = [[1, 2, 3], [4, 5, 6]]
     matrix_0_t = [[1, 4], [2, 5], [3, 6]]
     matrix_1 = [[11, 22, 33], [44, 55, 66]]
     matrix_1_t = [[11, 44], [22, 55], [33, 66]]
-    batch_matrix = [matrix_0, matrix_1]  # Shape (2, 2, 3)
+    # needs explicit `constant` because lists are not automatically
+    # converted to sensors when applying `transpose` below
+    batch_matrix = constant_op.constant([matrix_0, matrix_1])  # Shape (2, 2, 3)
     expected_transposed = [matrix_0_t, matrix_1_t]  # Shape (2, 3, 2)
-    with self.cached_session():
-      batch_matrix_ph = array_ops.placeholder(dtypes.int32)
-      transposed = array_ops.matrix_transpose(batch_matrix_ph)
-      self.assertAllEqual(
-          expected_transposed,
-          transposed.eval(feed_dict={
-              batch_matrix_ph: batch_matrix
-          }))
+    @def_function.function(input_signature=
+                           [tensor_spec.TensorSpec
+                            (shape=None, dtype=dtypes.int32)])
+    def transpose(matrix):
+      self.assertIs(matrix.shape.ndims, None)
+      return array_ops.matrix_transpose(matrix)
+    self.assertAllEqual(expected_transposed, transpose(batch_matrix))
 
   def testTensorWithStaticRankLessThanTwoRaisesBecauseNotAMatrix(self):
     vector = [1, 2, 3]
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, "should be a "):
-        array_ops.matrix_transpose(vector)
+    with self.assertRaisesRegexp(ValueError, "should be a "):
+      array_ops.matrix_transpose(vector)
 
 
 class BooleanMaskTest(test_util.TensorFlowTestCase):
@@ -140,36 +143,43 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
 
       self.assertAllClose(masked_arr, masked_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def testMaskDim1ArrDim2Axis1(self):
     ndims_mask = 1
     for arr_shape in [(1, 1), (2, 2), (2, 5)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape, axis=1)
 
+  @test_util.run_deprecated_v1
   def testMaskDim2ArrDim2Axis1(self):
     ndims_mask = 2
     for arr_shape in [(1, 1), (2, 2), (2, 5)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape, axis=1)
 
+  @test_util.run_deprecated_v1
   def testMaskDim1ArrDim1(self):
     ndims_mask = 1
     for arr_shape in [(1,), (2,), (3,), (10,)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape)
 
+  @test_util.run_deprecated_v1
   def testMaskDim1ArrDim2(self):
     ndims_mask = 1
     for arr_shape in [(1, 1), (2, 2), (2, 5)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape)
 
+  @test_util.run_deprecated_v1
   def testMaskDim2ArrDim2(self):
     ndims_mask = 2
     for arr_shape in [(1, 1), (2, 2), (2, 5)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape)
 
+  @test_util.run_deprecated_v1
   def testMaskDim2ArrDim3(self):
     ndims_mask = 2
     for arr_shape in [(1, 1, 1), (1, 2, 2), (2, 2, 1)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape)
 
+  @test_util.run_deprecated_v1
   def testEmptyInput2D(self):
     mask = np.array([True, False])
     arr = np.array([[], []]).astype(np.float32)
@@ -188,6 +198,7 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
     with self.cached_session():
       self.assertAllClose(numpy_result, tf_result.eval())
 
+  @test_util.run_deprecated_v1
   def testEmptyOutput(self):
     make_mask = lambda shape: np.zeros(shape, dtype=bool)
     for ndims_mask in range(1, 4):
@@ -196,6 +207,7 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
           arr_shape = np.random.randint(1, 5, size=ndims_arr)
           self.CheckVersusNumpy(ndims_mask, arr_shape, make_mask=make_mask)
 
+  @test_util.run_deprecated_v1
   def testWorksWithDimensionsEqualToNoneDuringGraphBuild(self):
     # The rank of the mask tensor must be specified. This is explained
     # in the docstring as well.
@@ -214,6 +226,7 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
           })
       np.testing.assert_allclose(masked_tensor, arr[mask])
 
+  @test_util.run_deprecated_v1
   def testMaskDimensionsSetToNoneRaises(self):
     # The rank of the mask tensor must be specified. This is explained
     # in the docstring as well.
@@ -280,10 +293,11 @@ class OperatorShapeTest(test_util.TensorFlowTestCase):
 
 class ReverseV2Test(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testReverse0DimAuto(self):
     x_np = 4
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         x_tf = array_ops.reverse_v2(x_np, []).eval()
         self.assertAllEqual(x_tf, x_np)
 
@@ -292,7 +306,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
 
     for use_gpu in [False, True]:
       for axis_dtype in [dtypes.int32, dtypes.int64]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           x_tf = array_ops.reverse_v2(x_np,
                                       constant_op.constant(
                                           [0], dtype=axis_dtype)).eval()
@@ -304,7 +318,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
     for reverse_f in [array_ops.reverse_v2, array_ops.reverse]:
       for use_gpu in [False, True]:
         for axis_dtype in [dtypes.int32, dtypes.int64]:
-          with self.test_session(use_gpu=use_gpu):
+          with self.cached_session(use_gpu=use_gpu):
             x_tf_1 = reverse_f(x_np, constant_op.constant(
                 [0], dtype=axis_dtype)).eval()
             x_tf_2 = reverse_f(x_np, constant_op.constant(
@@ -324,6 +338,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
 
   # This test covers the axis validation in the shape function
   # (no eval())
+  @test_util.run_deprecated_v1
   def testInvalidAxis(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
     with self.assertRaisesRegexp(ValueError,
@@ -342,6 +357,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
   #
   # Note: this test passes placeholder as constant axis is validated
   # in shape function (see testInvalidAxis)
+  @test_util.run_deprecated_v1
   def testInvalid(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
     axis = array_ops.placeholder(dtypes.int32)
@@ -356,6 +372,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
                                    "axis 0 specified more than once"):
         array_ops.reverse_v2(x_np, axis).eval(feed_dict={axis: [0, -2]})
 
+  @test_util.run_deprecated_v1
   def testReverse1DimAuto(self):
     for dtype in [
         np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
@@ -364,6 +381,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
     ]:
       self._reverse1DimAuto(dtype)
 
+  @test_util.run_deprecated_v1
   def testReverse2DimAuto(self):
     for dtype in [
         np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
@@ -372,6 +390,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
     ]:
       self._reverse2DimAuto(dtype)
 
+  @test_util.run_deprecated_v1
   def testUnknownDims(self):
     reverse_v2 = array_ops.reverse_v2
     data_t = array_ops.placeholder(dtypes.float32)
@@ -389,9 +408,10 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
     reverse_2d_t = reverse_v2(data_2d_t, axis_2d_t)
     self.assertEqual(2, reverse_2d_t.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testReverseRowsOf3Channels(self):
     """Tests optimized code for reversing rows with last dim size = 3."""
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for reverse_f in [array_ops.reverse_v2, array_ops.reverse]:
         for outer_size in (1, 2):
           for middle_size in list(range(50)) + [100000]:
@@ -402,8 +422,9 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
             np_answer = x_np[:, ::-1, :]
             self.assertAllEqual(x_tf, np_answer)
 
+  @test_util.run_deprecated_v1
   def testReverseRowsOf4Channels(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for reverse_f in [array_ops.reverse_v2, array_ops.reverse]:
         for outer_size in (1, 2):
           for middle_size in list(range(50)) + [100000]:
@@ -415,7 +436,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
             self.assertAllEqual(x_tf, np_answer)
 
   def testReverseColumnsOf3Channels(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for reverse_f in [array_ops.reverse_v2, array_ops.reverse]:
         for outer_size in list(range(50)) + [100000]:
           for middle_size in (1, 2):
@@ -433,7 +454,7 @@ class MeshgridTest(test_util.TensorFlowTestCase):
     for index in ("ij", "xy"):
       numpy_out = np.meshgrid(x, y, indexing=index)
       tf_out = array_ops.meshgrid(x, y, indexing=index)
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         for xx, yy in zip(numpy_out, tf_out):
           self.assertAllEqual(xx, yy.eval())
 
@@ -446,11 +467,12 @@ class MeshgridTest(test_util.TensorFlowTestCase):
           x += 1j
         inputs.append(x)
       numpy_out = np.meshgrid(*inputs, indexing=index)
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         tf_out = array_ops.meshgrid(*inputs, indexing=index)
         for x_np, x_tf in zip(numpy_out, tf_out):
           self.assertAllEqual(x_np, x_tf.eval())
 
+  @test_util.run_deprecated_v1
   def testCompare(self):
     for t in (np.float16, np.float32, np.float64, np.int32, np.int64,
               np.complex64, np.complex128):
@@ -474,6 +496,8 @@ class StridedSliceChecker(object):
 
   def __init__(self, test, x, tensor_type=dtypes.int32, check_type_infer=True):
     self.x_np = np.array(x).astype(tensor_type.as_numpy_dtype)
+    if tensor_type.is_bool:
+      self.x_np = np.array(x % 3).astype(np.bool)
     # Give the value a non-zero imaginary component for complex types.
     if tensor_type.is_complex:
       self.x_np -= 1j * self.x_np
@@ -514,16 +538,17 @@ class StridedSliceChecker(object):
 
 STRIDED_SLICE_TYPES = [
     dtypes.int32, dtypes.int64, dtypes.int16, dtypes.int8, dtypes.float32,
-    dtypes.float64, dtypes.complex64, dtypes.complex128
+    dtypes.float64, dtypes.complex64, dtypes.complex128, dtypes.bool
 ]
 
 
 class StridedSliceTest(test_util.TensorFlowTestCase):
   """Test the strided slice operation with variants of slices."""
 
+  @test_util.run_deprecated_v1
   def test_basic_slice(self):
     for tensor_type in STRIDED_SLICE_TYPES:
-      with self.test_session(use_gpu=not tensor_type.is_integer):
+      with self.cached_session(use_gpu=True):
         checker = StridedSliceChecker(
             self, StridedSliceChecker.REF_TENSOR, tensor_type=tensor_type)
         _ = checker[:, :, :]
@@ -551,7 +576,8 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
   def testInt64GPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
-    with self.test_session(use_gpu=True, force_gpu=True):
+
+    with test_util.force_gpu():
       x = constant_op.constant([1., 2., 3.])
       begin = constant_op.constant([2], dtype=dtypes.int64)
       end = constant_op.constant([3], dtype=dtypes.int64)
@@ -575,8 +601,9 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       v = variables.Variable([1., 2.])
       v[0]  # pylint: disable=pointless-statement
 
+  @test_util.run_deprecated_v1
   def testDegenerateSlices(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR)
       # degenerate by offering a forward interval with a negative stride
       _ = checker[0:-1:-1, :, :]
@@ -585,8 +612,9 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       # empty interval in every dimension
       _ = checker[-1:0, 2:2, 2:3:-1]
 
+  @test_util.run_deprecated_v1
   def testEllipsis(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       raw = [[[[[1, 2], [3, 4], [5, 6]]], [[[7, 8], [9, 10], [11, 12]]]]]
       checker = StridedSliceChecker(self, raw)
 
@@ -605,8 +633,9 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, "Multiple ellipses"):
         _ = checker[..., :, ...].eval()
 
+  @test_util.run_deprecated_v1
   def testShrink(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
               [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]]]
       checker = StridedSliceChecker(self, raw)
@@ -615,16 +644,18 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       _ = checker[:, 0]
       _ = checker[:, :, 0]
 
+  @test_util.run_deprecated_v1
   def testBothNewAxisAndShrink(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ones = array_ops.placeholder(shape=[2, 2], dtype=dtypes.int16)
       self.assertAllEqual(
           ones[array_ops.newaxis, :, 0].eval(
               feed_dict={ones: [[1, 1], [1, 1]]}),
           [[1, 1]])
 
+  @test_util.run_deprecated_v1
   def testTensorIndexing(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
               [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]]]
       checker = StridedSliceChecker(self, raw, check_type_infer=False)
@@ -632,15 +663,25 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       bar2 = constant_op.constant(3)
       _ = checker[..., bar:bar2]
       _ = checker[..., bar]
-      with self.assertRaisesRegexp(
-          TypeError,
-          "Value passed to parameter 'begin' has DataType float32 not in "
-          "list of allowed values"):
-        _ = checker[..., 3.0]
       _ = checker[..., 3]
+      _ = checker[..., 2 ** 64 // 2**63]  # Test longs in Python 2
 
+  def testTensorIndexingTypeError(self):
+    with self.session(use_gpu=True):
+      checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR)
+      expected = re.escape(array_ops._SLICE_TYPE_ERROR)
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker["foo"]
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker[constant_op.constant("foo")]
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker[0.0]
+      with self.assertRaisesRegexp(TypeError, expected):
+        _ = checker[constant_op.constant(0.0)]
+
+  @test_util.run_deprecated_v1
   def testExpand(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
               [[[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]]]]
       checker = StridedSliceChecker(self, raw)
@@ -656,16 +697,18 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       # Ellipsis in middle of two newaxis
       _ = checker[np.newaxis, ..., np.newaxis]
 
+  @test_util.run_deprecated_v1
   def testExpandVariable(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = variables.Variable(7, dtype=dtypes.int32)
       x.initializer.run()
       y = x[None].eval()
       self.assertEqual(y.shape, (1,))
       self.assertAllEqual(y, (7,))
 
+  @test_util.run_deprecated_v1
   def testOptimizedCases(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       checker = StridedSliceChecker(self,
                                     StridedSliceChecker.REF_TENSOR_ALIGNED)
       # Identity
@@ -693,8 +736,9 @@ class StridedSliceShapeChecker(object):
 class StridedSliceShapeTest(test_util.TensorFlowTestCase):
   """Test the shape inference of StridedSliceShapes."""
 
+  @test_util.run_deprecated_v1
   def testUnknown(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       uncertain_tensor = array_ops.placeholder(dtypes.float32)
       a = StridedSliceShapeChecker(uncertain_tensor)
       a_slice_shape = a[...]
@@ -704,8 +748,9 @@ class StridedSliceShapeTest(test_util.TensorFlowTestCase):
     self.assertTrue(x is not None and y is not None or x is None and y is None)
     self.assertEqual(x.as_list(), y.as_list())
 
+  @test_util.run_deprecated_v1
   def testTensorShapeUncertain(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       uncertain_tensor = array_ops.placeholder(
           dtypes.float32, shape=(5, None, 7))
       a = StridedSliceShapeChecker(uncertain_tensor)
@@ -727,8 +772,9 @@ class StridedSliceShapeTest(test_util.TensorFlowTestCase):
       self.tensorShapeEqual(a[::-1, :, array_ops.newaxis, ::-2],
                             tensor_shape.TensorShape([5, None, 1, 4]))
 
+  @test_util.run_deprecated_v1
   def testTensorValuedIndexShape(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       defined_shape_tensor = array_ops.placeholder(
           dtypes.float32, shape=(5, 3, 7))
       index_value = array_ops.placeholder(dtypes.int32, shape=())
@@ -783,8 +829,9 @@ class GradSliceChecker(object):
 class StridedSliceGradTest(test_util.TensorFlowTestCase):
   """Test that strided slice's custom gradient produces correct gradients."""
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       var = variables.Variable(
           array_ops.reshape(
               math_ops.range(1, 97, 1, dtype=dtypes.float32), shape=(6, 4, 4)))
@@ -805,46 +852,49 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
         _ = grad[:, 200, :]
 
   def testGradientZero(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       var = variables.Variable(8.)
       init = variables.global_variables_initializer()
       sess.run(init)
       grad = GradSliceChecker(self, sess, var, np.array(8))
       _ = grad[tuple()]
 
+  @test_util.run_deprecated_v1
   def testInt64Indices(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       a = math_ops.range(3, dtype=dtypes.float32)
       index = constant_op.constant(1, dtype=dtypes.int64)
       b = 2. * a[index]
       grad, = gradients_impl.gradients(b, a)
-      self.assertAllEqual(sess.run(grad), [0., 2., 0.])
+      self.assertAllEqual(self.evaluate(grad), [0., 2., 0.])
 
 
 class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
   """Test varied index types and host located memory."""
 
+  @test_util.run_deprecated_v1
   def testHostVsDevice(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       var2 = variables.Variable(
           array_ops.reshape(
               math_ops.cast(math_ops.range(1, 5, 1), dtypes.float32),
               shape=(4, 1, 1)))
       varshape = variables.Variable([6, 4, 4], dtype=dtypes.int32)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       begin = constant_op.constant([0, 0, 0])
       end = constant_op.constant([4, 1, 1])
       strides = constant_op.constant([1, 1, 1])
       foo = array_ops.strided_slice_grad(varshape, begin, end, strides, var2)
       sess.run(foo)
 
+  @test_util.run_deprecated_v1
   def testInt64Shape(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       original_dy = array_ops.reshape(
           math_ops.cast(math_ops.range(1, 5, 1), dtypes.float32),
           shape=(4, 1, 1))
       original_shape = constant_op.constant([6, 4, 4], dtype=dtypes.int64)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       begin = constant_op.constant([0, 0, 0], dtype=dtypes.int64)
       end = constant_op.constant([4, 1, 1], dtype=dtypes.int64)
       strides = constant_op.constant([1, 1, 1], dtype=dtypes.int64)
@@ -852,13 +902,14 @@ class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
                                         original_dy)
       sess.run(dx)
 
+  @test_util.run_deprecated_v1
   def testMixedIndexTypes(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       original_dy = array_ops.reshape(
           math_ops.cast(math_ops.range(1, 5, 1), dtypes.float32),
           shape=(4, 1, 1))
       original_shape = constant_op.constant([6, 4, 4], dtype=dtypes.int64)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       begin = constant_op.constant([0, 0, 0], dtype=dtypes.int32)
       end = constant_op.constant([4, 1, 1], dtype=dtypes.int64)
       strides = constant_op.constant([1, 1, 1], dtype=dtypes.int64)
@@ -942,8 +993,7 @@ class StridedSliceAssignChecker(object):
     if self.tensor_type.is_complex:
       value -= 1j * value
 
-    with self.test.test_session(
-        use_gpu=not self.tensor_type.is_integer) as sess:
+    with self.test.test_session(use_gpu=True) as sess:
       if self._use_resource:
         var = resource_variable_ops.ResourceVariable(self.x)
       else:
@@ -961,6 +1011,7 @@ class StridedSliceAssignChecker(object):
 
 class SliceAssignTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testInvalidSlice(self):
     with self.cached_session() as sess:
       foo = constant_op.constant([1, 2, 3])
@@ -998,12 +1049,15 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     checker2[...] = 6  # ellipsis
     checker2[None] = [6]  # new axis
 
+  @test_util.run_deprecated_v1
   def testSliceAssign(self):
     self.doTestSliceAssign(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testSliceAssignResource(self):
     self.doTestSliceAssign(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testUninitialized(self):
     with self.assertRaisesRegexp(
         errors.FailedPreconditionError,
@@ -1022,13 +1076,14 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     with self.assertRaises(TypeError):
       v[:].assign(too_large_val)
 
+  @test_util.run_deprecated_v1
   def testTypeErrorResource(self):
     init_val = constant_op.constant([1, 2], dtype=dtypes.int32)
     too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8)
     too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
     v = resource_variable_ops.ResourceVariable(init_val)
     with self.cached_session() as sess:
-      sess.run(v.initializer)
+      self.evaluate(v.initializer)
       with self.assertRaises(ValueError):
         sess.run(v[:].assign(too_large_val))
       with self.assertRaises(ValueError):
@@ -1078,6 +1133,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, "maxlen must be scalar"):
         array_ops.sequence_mask([10, 20], [10, 20])
 
+  @test_util.run_deprecated_v1
   def testOneDimensionalWithMaxlen(self):
     with self.cached_session():
       res = array_ops.sequence_mask(constant_op.constant([1, 3, 2]), 5)
@@ -1087,7 +1143,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           [[True, False, False, False, False], [True, True, True, False, False],
            [True, True, False, False, False]])
 
-  @test_util.enable_c_shapes
+  @test_util.run_deprecated_v1
   def testOneDimensionalDtypeWithoutMaxlen(self):
     with self.cached_session():
       # test dtype and default maxlen:
@@ -1098,7 +1154,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           res.eval(),
           [[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
 
-  @test_util.enable_c_shapes
+  @test_util.run_deprecated_v1
   def testOneDimensionalWithoutMaxlen(self):
     with self.cached_session():
       res = array_ops.sequence_mask(
@@ -1110,7 +1166,6 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
            [True, False, False, False],
            [True, True, True, True]])
 
-  @test_util.enable_c_shapes
   def testTwoDimensional(self):
     with self.cached_session():
       res = array_ops.sequence_mask(constant_op.constant([[1, 3, 2]]), 5)
@@ -1128,11 +1183,13 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           [[[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]],
            [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0]]])
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     lengths = array_ops.placeholder(dtype=dtypes.int32)
     res = array_ops.sequence_mask(lengths)
     self.assertEqual(res.shape, None)
 
+  @test_util.run_deprecated_v1
   def testDtypes(self):
 
     def check_dtypes(lengths_dtype, maxlen_dtype):
@@ -1155,6 +1212,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
 class ConcatSliceResourceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testConcatSlice(self):
     r1 = test_ops.stub_resource_handle_op(container="a", shared_name="b")
     r2 = test_ops.stub_resource_handle_op(container="a", shared_name="c")
@@ -1177,18 +1235,18 @@ class IdentityTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(x.numpy(), y.numpy())
         self.assertTrue(device in y.device.lower())
 
-      with ops.device("gpu:0"):
+      with test_util.force_gpu():
         a = constant_op.constant([[2], [3]], dtype=dtypes.float32)
-      with ops.device("gpu:0"):
+      with test_util.force_gpu():
         b = array_ops.identity(a)
         _test(a, b, "gpu")
-      with ops.device("cpu:0"):
+      with test_util.force_cpu():
         c = array_ops.identity(b)
         _test(b, c, "cpu")
-      with ops.device("cpu:0"):
+      with test_util.force_cpu():
         d = array_ops.identity(c)
         _test(c, d, "cpu")
-      with ops.device("gpu:0"):
+      with test_util.force_gpu():
         e = array_ops.identity(d)
         _test(d, e, "gpu")
 
@@ -1210,9 +1268,10 @@ class PadTest(test_util.TensorFlowTestCase):
 
 class InvertPermutationTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testInvertPermutation(self):
     for dtype in [dtypes.int32, dtypes.int64]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = constant_op.constant([3, 4, 0, 2, 1], dtype=dtype)
         y = array_ops.invert_permutation(x)
         self.assertAllEqual(y.get_shape(), [5])
@@ -1244,12 +1303,14 @@ class UnravelIndexTest(test_util.TensorFlowTestCase):
 
 class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with self.cached_session():
       a = array_ops.constant(10)
       guarantee_a = array_ops.guarantee_const(a)
       self.assertEqual(10, guarantee_a.eval())
 
+  @test_util.run_deprecated_v1
   def testVariables(self):
     with self.cached_session() as sess:
       for use_resource in [False, True]:
@@ -1258,9 +1319,10 @@ class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
             initializer=init_ops.constant_initializer(10.0),
             use_resource=use_resource)
         guarantee_a = array_ops.guarantee_const(a)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertEqual(10.0, guarantee_a.eval())
 
+  @test_util.run_deprecated_v1
   def testResourceRejection(self):
     with self.cached_session() as sess:
       a = variable_scope.get_variable(
@@ -1268,7 +1330,7 @@ class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
           initializer=init_ops.constant_initializer(10.0),
           use_resource=True)
       guarantee_a = array_ops.guarantee_const(a.handle)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
                                                "cannot be a resource variable"):
         guarantee_a.eval()
@@ -1276,9 +1338,10 @@ class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
 
 class SnapshotOpTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testInvertPermutation(self):
     for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = constant_op.constant([0, 1, 2, 3], dtype=dtype)
         y = gen_array_ops.snapshot(x)
         self.assertAllEqual(y.eval(), [0, 1, 2, 3])
diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py
index dd4a90e5f65bc66f23bf4d1fb469afb4916fb815..287701a73e43464a5cee4334d0a011de2d3746ba 100644
--- a/tensorflow/python/kernel_tests/as_string_op_test.py
+++ b/tensorflow/python/kernel_tests/as_string_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -27,6 +28,7 @@ from tensorflow.python.platform import test
 
 class AsStringOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     float_inputs_ = [
         0, 1, -1, 0.5, 0.25, 0.125, float("INF"), float("NAN"), float("-INF")
@@ -78,6 +80,7 @@ class AsStringOpTest(test.TestCase):
         output = string_ops.as_string(input_, fill="ab")
         output.eval(feed_dict={input_: float_inputs_})
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     # Cannot use values outside -128..127 for test, because we're also
     # testing int8
@@ -112,6 +115,7 @@ class AsStringOpTest(test.TestCase):
         output = string_ops.as_string(input_, precision=0)
         output.eval(feed_dict={input_: int_inputs_})
 
+  @test_util.run_deprecated_v1
   def testLargeInt(self):
     # Cannot use values outside -128..127 for test, because we're also
     # testing int8
@@ -130,6 +134,7 @@ class AsStringOpTest(test.TestCase):
       result = output.eval(feed_dict={input_: int_inputs_})
       self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_])
 
+  @test_util.run_deprecated_v1
   def testHalfInt(self):
     s = lambda strs: [x.decode("ascii") for x in strs]
 
@@ -140,6 +145,7 @@ class AsStringOpTest(test.TestCase):
       result = output.eval(feed_dict={input_: int_inputs_})
       self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_])
 
+  @test_util.run_deprecated_v1
   def testBool(self):
     bool_inputs_ = [False, True]
     s = lambda strs: [x.decode("ascii") for x in strs]
@@ -152,6 +158,7 @@ class AsStringOpTest(test.TestCase):
         result = output.eval(feed_dict={input_: bool_inputs_})
         self.assertAllEqual(s(result), ["false", "true"])
 
+  @test_util.run_deprecated_v1
   def testComplex(self):
     float_inputs_ = [
         0, 1, -1, 0.5, 0.25, 0.125, complex("INF"), complex("NAN"),
diff --git a/tensorflow/python/kernel_tests/atrous_conv2d_test.py b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
index ab1d698f6e24888c2bffb3c966f6ef118be15143..a13e325835cfd343eda61037b8392e83bed0f1c2 100644
--- a/tensorflow/python/kernel_tests/atrous_conv2d_test.py
+++ b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_impl
@@ -58,8 +59,9 @@ def _upsample_filters(filters, rate):
 
 class AtrousConv2DTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAtrousConv2DForward(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # Input: [batch, height, width, input_depth]
       height = 9
       for width in [9, 10]:  # Test both odd and even width.
@@ -79,8 +81,10 @@ class AtrousConv2DTest(test.TestCase):
                 y1 = nn_ops.atrous_conv2d(x, f, rate, padding=padding)
                 y2 = nn_ops.conv2d(
                     x, f_up, strides=[1, 1, 1, 1], padding=padding)
-                self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3)
+                self.assertAllClose(
+                    y1.eval(), self.evaluate(y2), rtol=1e-3, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testAtrousSequence(self):
     """Tests optimization of sequence of atrous convolutions.
 
@@ -105,7 +109,7 @@ class AtrousConv2DTest(test.TestCase):
     padding = "SAME"  # The padding needs to be "SAME"
     np.random.seed(1)  # Make it reproducible.
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # Input: [batch, height, width, input_depth]
       for height in range(15, 17):
         for width in range(15, 17):
@@ -131,10 +135,12 @@ class AtrousConv2DTest(test.TestCase):
               y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
               y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
               y2 = array_ops.batch_to_space(y2, crops=pad, block_size=rate)
-              self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-2, atol=1e-2)
+              self.assertAllClose(
+                  y1.eval(), self.evaluate(y2), rtol=1e-2, atol=1e-2)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # Input: [batch, height, width, input_depth]
       x_shape = [2, 5, 6, 2]
       # Filter: [kernel_height, kernel_width, input_depth, output_depth]
@@ -160,8 +166,9 @@ class AtrousConv2DTest(test.TestCase):
 
 class AtrousConv2DTransposeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAtrousConv2DTransposeForward(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # Input: [batch, height, width, input_depth]
       height = 9
       for width in [9, 10]:  # Test both odd and even width.
@@ -193,14 +200,16 @@ class AtrousConv2DTransposeTest(test.TestCase):
                                                     padding)
                 y2 = nn_ops.conv2d_transpose(
                     x, f_up, y_shape, strides=[1, 1, 1, 1], padding=padding)
-                self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3)
+                self.assertAllClose(
+                    y1.eval(), self.evaluate(y2), rtol=1e-3, atol=1e-3)
 
 
 class AtrousDepthwiseConv2DTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAtrousDepthwiseConv2DForward(self):
     strides = [1, 1, 1, 1]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # Input: [batch, height, width, input_depth]
       height = 9
       for width in [9, 10]:  # Test both odd and even width.
@@ -220,7 +229,8 @@ class AtrousDepthwiseConv2DTest(test.TestCase):
                 y1 = nn_impl.depthwise_conv2d(
                     x, f, strides, padding, rate=[rate, rate])
                 y2 = nn_impl.depthwise_conv2d(x, f_up, strides, padding)
-                self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3)
+                self.assertAllClose(
+                    y1.eval(), self.evaluate(y2), rtol=1e-3, atol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py
index 6b16fca29d0277e0e5f1f52f6c4a48343a441f67..2fb8a37e2b94bd81409970eb3c485362a17634b6 100644
--- a/tensorflow/python/kernel_tests/atrous_convolution_test.py
+++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py
@@ -110,6 +110,7 @@ class AtrousConvolutionTest(test.TestCase):
 
     add_check(check, y1, y2)
 
+  @test_util.run_v1_only("b/120545219")
   def test_unknown_spatial_dims_for_channel_last_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, None, None, 10])
     w = array_ops.zeros([3, 3, 10, 20])
@@ -117,6 +118,7 @@ class AtrousConvolutionTest(test.TestCase):
         x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC")
     self.assertEqual(y.shape.as_list(), [1, None, None, 20])
 
+  @test_util.run_v1_only("b/120545219")
   def test_unknown_spatial_dims_for_channel_first_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, 10, None, None])
     w = array_ops.zeros([3, 3, 10, 20])
@@ -262,6 +264,7 @@ class AtrousConvolutionTest(test.TestCase):
     err_tolerance = 1e-3
     self.assertLess(err, err_tolerance)
 
+  @test_util.run_v1_only("b/120545219")
   def testGradient(self):
     with self.cached_session():
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py
index 1e09ba5b65cee3b74d350e0d2433c6a459517e5e..00dba9996dd909786301d56da41fa037328ba3e5 100644
--- a/tensorflow/python/kernel_tests/attention_ops_test.py
+++ b/tensorflow/python/kernel_tests/attention_ops_test.py
@@ -85,7 +85,7 @@ class ExtractGlimpseTest(test.TestCase):
 
     # Evaluate the TensorFlow Graph.
     with self.cached_session() as sess:
-      value_rows, value_cols = sess.run([glimpse_rows, glimpse_cols])
+      value_rows, value_cols = self.evaluate([glimpse_rows, glimpse_cols])
 
     # Check dimensions of returned glimpse.
     self.assertEqual(value_rows.shape[1], glimpse_sizes[0])
@@ -121,8 +121,7 @@ class ExtractGlimpseTest(test.TestCase):
     with self.cached_session():
       result = image_ops.extract_glimpse(empty_image, [1, 1], offsets)
       self.assertAllEqual(
-          np.zeros(
-              (0, 1, 1, 0), dtype=np.float32), result.eval())
+          np.zeros((0, 1, 1, 0), dtype=np.float32), self.evaluate(result))
 
   def testLargeCenterGlimpse(self):
     self._VerifyValues(
diff --git a/tensorflow/python/kernel_tests/barrier_ops_test.py b/tensorflow/python/kernel_tests/barrier_ops_test.py
index 4d36b3a4658121729bcde440b1c25b3849a5a818..60fe6f0eecdd597ea78c006b3b5552e118a0eacb 100644
--- a/tensorflow/python/kernel_tests/barrier_ops_test.py
+++ b/tensorflow/python/kernel_tests/barrier_ops_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
 
@@ -66,6 +67,7 @@ class BarrierTest(test.TestCase):
       attr { key: 'shared_name' value: { s: 'B' } }
       """, b.barrier_ref.op.node_def)
 
+  @test_util.run_deprecated_v1
   def testInsertMany(self):
     with self.cached_session():
       b = data_flow_ops.Barrier(
@@ -90,6 +92,7 @@ class BarrierTest(test.TestCase):
         data_flow_ops.Barrier(
             (dtypes.float32, dtypes.float32), shapes=((1,), (0,)), name="B")
 
+  @test_util.run_deprecated_v1
   def testInsertManyEmptyTensorUnknown(self):
     with self.cached_session():
       b = data_flow_ops.Barrier((dtypes.float32, dtypes.float32), name="B")
@@ -102,6 +105,7 @@ class BarrierTest(test.TestCase):
           ".*Tensors with no elements are not supported.*"):
         insert_0_op.run()
 
+  @test_util.run_deprecated_v1
   def testTakeMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -127,6 +131,7 @@ class BarrierTest(test.TestCase):
       self.assertEqual(values_0_val[idx], v0)
       self.assertEqual(values_1_val[idx], v1)
 
+  @test_util.run_deprecated_v1
   def testTakeManySmallBatch(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -191,6 +196,7 @@ class BarrierTest(test.TestCase):
       with self.assertRaisesOpError("is closed"):
         insert_1_3_op.run()
 
+  @test_util.run_deprecated_v1
   def testUseBarrierWithShape(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -220,6 +226,7 @@ class BarrierTest(test.TestCase):
       self.assertAllEqual(values_0_val[idx], v0)
       self.assertAllEqual(values_1_val[idx], v1)
 
+  @test_util.run_deprecated_v1
   def testParallelInsertMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(dtypes.float32, shapes=())
@@ -229,7 +236,7 @@ class BarrierTest(test.TestCase):
       insert_ops = [b.insert_many(0, [k], [v]) for k, v in zip(keys, values)]
       take_t = b.take_many(10)
 
-      sess.run(insert_ops)
+      self.evaluate(insert_ops)
       self.assertEquals(size_t.eval(), [10])
 
       indices_val, keys_val, values_val = sess.run(
@@ -240,6 +247,7 @@ class BarrierTest(test.TestCase):
       idx = keys_val.tolist().index(k)
       self.assertEqual(values_val[idx], v)
 
+  @test_util.run_deprecated_v1
   def testParallelTakeMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(dtypes.float32, shapes=())
@@ -274,6 +282,7 @@ class BarrierTest(test.TestCase):
     self.assertItemsEqual(
         zip(keys, values), [(k[0], v[0]) for k, v in zip(key_vals, value_vals)])
 
+  @test_util.run_deprecated_v1
   def testBlockingTakeMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(dtypes.float32, shapes=())
@@ -296,6 +305,7 @@ class BarrierTest(test.TestCase):
         insert_op.run()
       t.join()
 
+  @test_util.run_deprecated_v1
   def testParallelInsertManyTakeMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -375,6 +385,7 @@ class BarrierTest(test.TestCase):
              2 + outer_indices_from_keys + inner_indices_from_keys)).T
         self.assertAllEqual(taken_i["values_1"], expected_values_1)
 
+  @test_util.run_deprecated_v1
   def testClose(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -433,6 +444,7 @@ class BarrierTest(test.TestCase):
       with self.assertRaisesOpError("is closed and has insufficient elements"):
         sess.run(take_t[0])
 
+  @test_util.run_deprecated_v1
   def testCancel(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -491,10 +503,11 @@ class BarrierTest(test.TestCase):
       b = data_flow_ops.Barrier(
           (dtypes.float32, dtypes.float32), shapes=((), ()), name="B")
       take_t = b.take_many(1, allow_small_batch=True)
-      sess.run(b.close(cancel))
+      self.evaluate(b.close(cancel))
       with self.assertRaisesOpError("is closed and has insufficient elements"):
-        sess.run(take_t)
+        self.evaluate(take_t)
 
+  @test_util.run_deprecated_v1
   def testClosedEmptyBarrierTakeManyAllowSmallBatchRaises(self):
     self._testClosedEmptyBarrierTakeManyAllowSmallBatchRaises(cancel=False)
     self._testClosedEmptyBarrierTakeManyAllowSmallBatchRaises(cancel=True)
@@ -569,9 +582,11 @@ class BarrierTest(test.TestCase):
           sorted(taken),
           [0] * (num_iterations // 2) + [10] * (num_iterations // 2))
 
+  @test_util.run_deprecated_v1
   def testParallelInsertManyTakeManyCloseHalfwayThrough(self):
     self._testParallelInsertManyTakeManyCloseHalfwayThrough(cancel=False)
 
+  @test_util.run_deprecated_v1
   def testParallelInsertManyTakeManyCancelHalfwayThrough(self):
     self._testParallelInsertManyTakeManyCloseHalfwayThrough(cancel=True)
 
@@ -669,12 +684,15 @@ class BarrierTest(test.TestCase):
       else:
         self.assertEqual(taken, [10] * num_iterations)
 
+  @test_util.run_deprecated_v1
   def testParallelPartialInsertManyTakeManyCloseHalfwayThrough(self):
     self._testParallelPartialInsertManyTakeManyCloseHalfwayThrough(cancel=False)
 
+  @test_util.run_deprecated_v1
   def testParallelPartialInsertManyTakeManyCancelHalfwayThrough(self):
     self._testParallelPartialInsertManyTakeManyCloseHalfwayThrough(cancel=True)
 
+  @test_util.run_deprecated_v1
   def testIncompatibleSharedBarrierErrors(self):
     with self.cached_session():
       # Do component types and shapes.
diff --git a/tensorflow/python/kernel_tests/base64_ops_test.py b/tensorflow/python/kernel_tests/base64_ops_test.py
index 1b399942efbcef227f24de9737f2fc0f6a427c7f..381f190b8df6d65afaa80654e3d98377a69b9ae3 100644
--- a/tensorflow/python/kernel_tests/base64_ops_test.py
+++ b/tensorflow/python/kernel_tests/base64_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class Base64OpsTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -93,7 +94,7 @@ class Base64OpsTest(test_util.TensorFlowTestCase):
         decoded = string_ops.decode_base64(encoded)
 
         with self.cached_session() as sess:
-          encoded_value, decoded_value = sess.run([encoded, decoded])
+          encoded_value, decoded_value = self.evaluate([encoded, decoded])
 
         self.assertEqual(encoded_value.shape, msg.shape)
         self.assertEqual(decoded_value.shape, msg.shape)
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 67e861819810444bd81032815d7953e5d0fd7f52..1a8513d022d43e3bd206bc0ab607012d05aef6a9 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -40,17 +40,17 @@ from tensorflow.python.platform import test
 class GPUBinaryOpsTest(test.TestCase):
 
   def _compareGPU(self, x, y, np_func, tf_func):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_gpu = sess.run(out)
+      tf_gpu = self.evaluate(out)
 
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_cpu = sess.run(out)
+      tf_cpu = self.evaluate(out)
 
     self.assertAllClose(tf_cpu, tf_gpu)
 
@@ -93,10 +93,10 @@ class MathBuiltinUnaryTest(test.TestCase):
 
   def _compare(self, x, np_func, tf_func, use_gpu):
     np_out = np_func(x)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       inx = ops.convert_to_tensor(x)
       ofunc = tf_func(inx)
-      tf_out = sess.run(ofunc)
+      tf_out = self.evaluate(ofunc)
     self.assertAllClose(np_out, tf_out)
 
   def _inv(self, x):
@@ -143,12 +143,12 @@ class MathBuiltinUnaryTest(test.TestCase):
 
     np_out = np.floor_divide(x, y + 0.1)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y + 0.1)
       ofunc = inx / iny
       out_func2 = math_ops.floor(ofunc)
-      tf_out = sess.run(out_func2)
+      tf_out = self.evaluate(out_func2)
 
     self.assertAllClose(np_out, tf_out)
 
@@ -156,9 +156,10 @@ class MathBuiltinUnaryTest(test.TestCase):
 class BroadcastSimpleTest(test.TestCase):
 
   def _GetGradientArgs(self, xs, ys):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       return sess.run(broadcast_gradient_args(xs, ys))
 
+  @test_util.run_deprecated_v1
   def testBroadcast(self):
     r0, r1 = self._GetGradientArgs([2, 3, 5], [1])
     self.assertAllEqual(r0, [])
@@ -210,15 +211,16 @@ class BroadcastSimpleTest(test.TestCase):
 
   def _compareGpu(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_gpu = out.eval()
+      tf_gpu = self.evaluate(out)
     self.assertAllClose(np_ans, tf_gpu)
     self.assertShapeEqual(np_ans, out)
     # TODO(zhifengc/ke): make gradient checker work on GPU.
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape(
         [1, 3, 2])
@@ -255,6 +257,7 @@ class GpuMultiSessionMemoryTest(test_util.TensorFlowTestCase):
           if len(results) != 1:
             break
 
+  @test_util.run_deprecated_v1
   def testConcurrentSessions(self):
     n_threads = 4
     threads = []
diff --git a/tensorflow/python/kernel_tests/batch_gather_op_test.py b/tensorflow/python/kernel_tests/batch_gather_op_test.py
index 84e93b8136f332f525266dd64ef663b2eb531341..7e0b3e1b5eadc7fe5541612fc607aeb9a135ceb4 100644
--- a/tensorflow/python/kernel_tests/batch_gather_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_gather_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -44,7 +45,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
   def testSimpleGather(self, indices_dtype):
     data = np.array([0, 1, 2, 3, 7, 5, 8, 9, 10, 11, 15, 13])
     indices = [3, 4]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in _TEST_TYPES:
         params_np = self._buildParams(data, dtype)
         params = constant_op.constant(params_np)
@@ -52,7 +53,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         gather_t = array_ops.batch_gather(params, indices_tf)
         expected_result = np.array([3, 7])
         np_val = self._buildParams(expected_result, dtype)
-        gather_val = gather_t.eval()
+        gather_val = self.evaluate(gather_t)
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
@@ -60,7 +61,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
   def test2DArray(self, indices_dtype):
     data = np.array([[0, 1, 2, 3, 7, 5], [8, 9, 10, 11, 15, 13]])
     indices = [[3], [4]]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in _TEST_TYPES:
         params_np = self._buildParams(data, dtype)
         params = constant_op.constant(params_np)
@@ -68,25 +69,26 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         gather_t = array_ops.batch_gather(params, indices_tf)
         expected_result = np.array([[3], [15]])
         np_val = self._buildParams(expected_result, dtype)
-        gather_val = gather_t.eval()
+        gather_val = self.evaluate(gather_t)
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
   def testHigherRank(self):
     data = np.array([[[0, 1, 2], [3, 7, 5]], [[8, 9, 10], [11, 15, 13]]])
     indices = [[[2, 0], [1, 2]], [[2, 0], [0, 1]]]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in _TEST_TYPES:
         params_np = self._buildParams(data, dtype)
         params = constant_op.constant(params_np)
         indices_tf = constant_op.constant(indices)
         gather_t = array_ops.batch_gather(params, indices_tf)
-        gather_val = gather_t.eval()
+        gather_val = self.evaluate(gather_t)
         expected_result = np.array([[[2, 0], [7, 5]], [[10, 8], [11, 15]]])
         np_val = self._buildParams(expected_result, dtype)
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
+  @test_util.run_deprecated_v1
   def testString(self):
     params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]])
     with self.cached_session():
@@ -94,6 +96,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
       self.assertAllEqual([[b"qwer", b"uiop"]],
                           array_ops.batch_gather(params, indices_tf).eval())
 
+  @test_util.run_deprecated_v1
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = array_ops.placeholder(dtypes.int32, shape=[None, None])
@@ -101,13 +104,14 @@ class GatherTest(test.TestCase, parameterized.TestCase):
     self.assertEqual([1, None], gather_t.get_shape().as_list())
 
   def testBadIndicesCPU(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       params = [[0, 1, 2], [3, 4, 5]]
       with self.assertRaisesOpError(r"indices\[0\] = 7 is not in \[0, 2\)"):
         array_ops.batch_gather(params, [7]).eval()
 
+  @test_util.run_deprecated_v1
   def testEmptySlices(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in _TEST_TYPES:
         for itype in np.int32, np.int64:
           params = np.zeros((7, 0, 0), dtype=dtype.as_numpy_dtype)
diff --git a/tensorflow/python/kernel_tests/batch_matmul_op_test.py b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
index 34089e8dbea32714ee6c560c6df41ed8feb62f08..c32a6c7e41759ac9abade06bb83be19a7392f2da 100644
--- a/tensorflow/python/kernel_tests/batch_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
@@ -20,9 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import constant_op
+from tensorflow.python import tf2
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -83,10 +84,10 @@ class BatchMatmulOpTest(test.TestCase):
     y = y_in if not adjoint_b else y_in.reshape(y_t_shape)
     is_floating = x.dtype != np.int32
     tol = 100 * np.finfo(x.dtype).eps if is_floating else 0
-    with self.test_session(use_gpu=is_floating) as sess:
+    with self.cached_session(use_gpu=is_floating) as sess:
       if static_shape:
         z0 = math_ops.matmul(x, y, adjoint_a=adjoint_a, adjoint_b=adjoint_b)
-        z0_val = z0.eval()
+        z0_val = self.evaluate(z0)
       else:
         x_ph = array_ops.placeholder(x.dtype)
         y_ph = array_ops.placeholder(y.dtype)
@@ -105,36 +106,37 @@ class BatchMatmulOpTest(test.TestCase):
 
   def _testNonEmpty(self, dtype, adjoint_a, adjoint_b, use_static_shape):
 
-    def compareNonEmpty(self, a_shape, b_shape):
+    def CompareNonEmpty(self, a_shape, b_shape):
       self._compare(
           self._rand(a_shape, dtype),
           self._rand(b_shape, dtype), adjoint_a, adjoint_b, use_static_shape)
 
-    compareNonEmpty(self, [1, 2, 3], [1, 3, 5])
-    compareNonEmpty(self, [1, 2, 3], [1, 3, 1])
-    compareNonEmpty(self, [1, 1, 3], [1, 3, 5])
-    compareNonEmpty(self, [1, 2, 3], [1, 3, 5])
-    compareNonEmpty(self, [7, 1, 3], [7, 3, 5])
-    compareNonEmpty(self, [7, 2, 3], [7, 3, 1])
-    compareNonEmpty(self, [7, 2, 3], [7, 3, 5])
-    compareNonEmpty(self, [10, 64, 75], [10, 75, 30])
-    compareNonEmpty(self, [5, 7, 2, 3], [5, 7, 3, 5])
+    CompareNonEmpty(self, [1, 2, 3], [1, 3, 5])
+    CompareNonEmpty(self, [1, 2, 3], [1, 3, 1])
+    CompareNonEmpty(self, [1, 1, 3], [1, 3, 5])
+    CompareNonEmpty(self, [1, 2, 3], [1, 3, 5])
+    CompareNonEmpty(self, [7, 1, 3], [7, 3, 5])
+    CompareNonEmpty(self, [7, 2, 3], [7, 3, 1])
+    CompareNonEmpty(self, [7, 2, 3], [7, 3, 5])
+    CompareNonEmpty(self, [10, 64, 75], [10, 75, 30])
+    CompareNonEmpty(self, [5, 7, 2, 3], [5, 7, 3, 5])
 
   def _testEmpty(self, dtype, adjoint_a, adjoint_b, use_static_shape):
 
-    def compareEmpty(self, a_shape, b_shape):
+    def CompareEmpty(self, a_shape, b_shape):
       self._compare(
           np.zeros(a_shape).astype(dtype),
           np.zeros(b_shape).astype(dtype), adjoint_a, adjoint_b,
           use_static_shape)
 
-    compareEmpty(self, [0, 3, 2], [0, 2, 4])
-    compareEmpty(self, [3, 0, 2], [3, 2, 5])
-    compareEmpty(self, [3, 3, 2], [3, 2, 0])
+    CompareEmpty(self, [0, 3, 2], [0, 2, 4])
+    CompareEmpty(self, [3, 0, 2], [3, 2, 5])
+    CompareEmpty(self, [3, 3, 2], [3, 2, 0])
 
 
 def _GetBatchMatmulOpTest(dtype, adjoint_a, adjoint_b, use_static_shape):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(42)
     self._testNonEmpty(dtype, adjoint_a, adjoint_b, use_static_shape)
@@ -154,17 +156,13 @@ class BatchMatmulGradientTest(test.TestCase):
     y = y_in if not adjoint_b else y_in.reshape(y_t_shape)
     epsilon = np.finfo(x.dtype).eps
     delta = epsilon**(1.0 / 3.0)
-    with self.test_session(use_gpu=True):
-      inx = constant_op.constant(x)
-      iny = constant_op.constant(y)
-      z = math_ops.matmul(inx, iny, adjoint_a, adjoint_b)
-      loss = math_ops.reduce_sum(z)
-      ((x_jacob_t, x_jacob_n),
-       (y_jacob_t, y_jacob_n)) = gradient_checker.compute_gradient(
-           [inx, iny], [x.shape, y.shape],
-           loss, [1],
-           x_init_value=[x, y],
-           delta=delta)
+    def Loss(x, y):
+      z = math_ops.matmul(x, y, adjoint_a, adjoint_b)
+      return math_ops.reduce_sum(z)
+    with self.cached_session(use_gpu=True):
+      ((x_jacob_t, y_jacob_t),
+       (x_jacob_n, y_jacob_n)) = gradient_checker_v2.compute_gradient(
+           Loss, [x, y], delta=delta)
       tol = 20 * delta
       self.assertAllClose(x_jacob_t, x_jacob_n, rtol=tol, atol=tol)
       self.assertAllClose(y_jacob_t, y_jacob_n, rtol=tol, atol=tol)
@@ -188,6 +186,7 @@ class BatchMatmulGradientTest(test.TestCase):
 
 def _GetBatchMatmulGradientTest(dtype, adjoint_a, adjoint_b):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     self._compare(1, 2, 3, 5, dtype, adjoint_a, adjoint_b)
     self._compare(3, 4, 7, 10, dtype, adjoint_a, adjoint_b)
@@ -202,11 +201,12 @@ if __name__ == "__main__":
     for adjoint_a_ in False, True:
       for adjoint_b_ in False, True:
         name = "%s_%s_%s" % (dtype_.__name__, adjoint_a_, adjoint_b_)
-        for use_static_shape in True, False:
+        # TF2 does not support placeholders under eager so we skip it
+        for use_static_shape_ in set([True, tf2.enabled()]):
           setattr(BatchMatmulOpTest,
-                  "testBatchMatmulOp_" + name + ("_%s" % use_static_shape),
+                  "testBatchMatmulOp_" + name + ("_%s" % use_static_shape_),
                   _GetBatchMatmulOpTest(dtype_, adjoint_a_, adjoint_b_,
-                                        use_static_shape))
+                                        use_static_shape_))
         if dtype_ is not np.int32:
           setattr(BatchMatmulGradientTest, "testBatchMatmulGradient_" + name,
                   _GetBatchMatmulGradientTest(dtype_, adjoint_a_, adjoint_b_))
diff --git a/tensorflow/python/kernel_tests/batch_scatter_ops_test.py b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
index 0d41a7e3b3dbc6e9ee9d1e3f273acd836a913327..f70fb93da9d51c1f9838f67977dbbd4aef65562e 100644
--- a/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
@@ -22,6 +22,8 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -49,9 +51,10 @@ class ScatterTest(test.TestCase):
                         vtype,
                         itype,
                         repeat_indices=False,
-                        updates_are_scalar=False):
+                        updates_are_scalar=False,
+                        method=False):
     np.random.seed(8)
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       for indices_shape in (2,), (3, 7), (3, 4, 7):
         for extra_shape in (), (5,), (5, 9):
           # Generate random indices with no duplicates for easy numpy comparison
@@ -70,19 +73,13 @@ class ScatterTest(test.TestCase):
           # Scatter via tensorflow
           ref = variables.Variable(old)
           ref.initializer.run()
-          tf_scatter(ref, indices, updates).eval()
+          if method:
+            ref.batch_scatter_update(ops.IndexedSlices(indices, updates))
+          else:
+            tf_scatter(ref, indices, updates).eval()
           self.assertAllClose(ref.eval(), new)
 
-  def _VariableRankTests(self,
-                         tf_scatter):
-    vtypes = [np.float32, np.float64]
-    if tf_scatter != state_ops.scatter_div:
-      vtypes.append(np.int32)
-
-    for vtype in vtypes:
-      for itype in (np.int32, np.int64):
-        self._VariableRankTest(tf_scatter, vtype, itype)
-
+  @test_util.run_deprecated_v1
   def testVariableRankUpdate(self):
     vtypes = [np.float32, np.float64]
     for vtype in vtypes:
@@ -90,8 +87,9 @@ class ScatterTest(test.TestCase):
         self._VariableRankTest(
             state_ops.batch_scatter_update, vtype, itype)
 
+  @test_util.run_deprecated_v1
   def testBooleanScatterUpdate(self):
-    with self.test_session(use_gpu=False) as session:
+    with self.session(use_gpu=False) as session:
       var = variables.Variable([True, False])
       update0 = state_ops.batch_scatter_update(var, [1], [True])
       update1 = state_ops.batch_scatter_update(
@@ -101,12 +99,13 @@ class ScatterTest(test.TestCase):
 
       session.run([update0, update1])
 
-      self.assertAllEqual([False, True], var.eval())
+      self.assertAllEqual([False, True], self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testScatterOutOfRange(self):
     params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
     updates = np.array([-3, -4, -5]).astype(np.float32)
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       ref = variables.Variable(params)
       ref.initializer.run()
 
diff --git a/tensorflow/python/kernel_tests/batchtospace_op_test.py b/tensorflow/python/kernel_tests/batchtospace_op_test.py
index 03f3f64353d8367afad18a3fd07750b575b1fafa..c422df8806f595f2926bc603ffa1f40064664df0 100644
--- a/tensorflow/python/kernel_tests/batchtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/batchtospace_op_test.py
@@ -27,6 +27,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gradient_checker
@@ -50,6 +51,7 @@ class CppOpImpl(object):
 class BatchToSpaceDepthToSpace(test.TestCase, PythonOpImpl):
 
   # Verifies that: batch_to_space(x) = transpose(depth_to_space(transpose(x)))
+  @test_util.run_deprecated_v1
   def testDepthToSpaceTranspose(self):
     x = np.arange(20 * 5 * 8 * 7, dtype=np.float32).reshape([20, 5, 8, 7])
     block_size = 2
@@ -70,6 +72,7 @@ class BatchToSpaceDepthToSpaceCpp(BatchToSpaceDepthToSpace, CppOpImpl):
 
 class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingBatch(self):
     # The input is missing the first dimension ("batch")
     x_np = [[[1], [2]], [[3], [4]]]
@@ -78,6 +81,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.batch_to_space(x_np, crops, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -87,6 +91,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.batch_to_space(x_np, crops, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeOne(self):
     # The block size is 1. The block size needs to be > 1.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -96,6 +101,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.batch_to_space(x_np, crops, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeLarger(self):
     # The block size is too large for this input.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -105,6 +111,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.batch_to_space(x_np, crops, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeSquaredNotDivisibleBatch(self):
     # The block size squared does not divide the batch.
     x_np = [[[[1], [2], [3]], [[3], [4], [7]]]]
@@ -113,6 +120,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.batch_to_space(x_np, crops, block_size)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     t = self.batch_to_space(
         array_ops.placeholder(dtypes.float32),
@@ -160,28 +168,35 @@ class BatchToSpaceNDErrorHandlingTest(test.TestCase):
     self._testStaticShape(input_shape, block_shape, paddings, error)
     self._testDynamicShape(input_shape, block_shape, paddings)
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingBatch(self):
     self._testShape([2, 2], [2, 2], [[0, 0], [0, 0]], ValueError)
     self._testShape([2, 2, 3], [2, 2, 3], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     self._testShape([1, 2, 2, 1], [0, 1], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNegative(self):
     self._testShape([1, 2, 2, 1], [-1, 1], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testNegativePadding(self):
     self._testShape([1, 2, 2], [1, 1], [[0, -1], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testCropTooLarge(self):
     # The amount to crop exceeds the padded size.
     self._testShape([1 * 2 * 2, 2, 3, 1], [2, 2], [[3, 2], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeSquaredNotDivisibleBatch(self):
     # The batch dimension is not divisible by the product of the block_shape.
     self._testShape([3, 1, 1, 1], [2, 3], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     # Verify that input shape and paddings shape can be unknown.
     _ = array_ops.batch_to_space_nd(
@@ -263,18 +278,21 @@ class BatchToSpaceGradientTest(test.TestCase, PythonOpImpl):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     block_size = 2
     crop_beg = 0
     crop_end = 0
     self._compare(1, 2, 3, 5, block_size, crop_beg, crop_end)
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     block_size = 2
     crop_beg = 0
     crop_end = 0
     self._compare(2, 4, 3, 2, block_size, crop_beg, crop_end)
 
+  @test_util.run_deprecated_v1
   def testSmallCrop1x1(self):
     block_size = 2
     crop_beg = 1
@@ -316,14 +334,17 @@ class BatchToSpaceNDGradientTest(test.TestCase):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     for dtype in [dtypes.int64, dtypes.int32]:
       self._compare([1, 2, 3, 5], [2, 2], [[0, 0], [0, 0]], dtype)
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     for dtype in [dtypes.int64, dtypes.int32]:
       self._compare([2, 4, 3, 2], [2, 2], [[0, 0], [0, 0]], dtype)
 
+  @test_util.run_deprecated_v1
   def testSmallCrop1x1(self):
     for dtype in [dtypes.int64, dtypes.int32]:
       self._compare([1, 2, 3, 5], [2, 2], [[1, 1], [1, 1]], dtype)
diff --git a/tensorflow/python/kernel_tests/bcast_ops_test.py b/tensorflow/python/kernel_tests/bcast_ops_test.py
index 3ec820aeadadf361e9291c5431e21cd7b2ba52be..ae00955ac29001c5748705d8b94c9f560ac60c26 100644
--- a/tensorflow/python/kernel_tests/bcast_ops_test.py
+++ b/tensorflow/python/kernel_tests/bcast_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops.gen_array_ops import broadcast_args
 from tensorflow.python.ops.gen_array_ops import broadcast_gradient_args
 from tensorflow.python.platform import test
@@ -35,6 +36,7 @@ class BcastOpsTest(test.TestCase):
     with self.cached_session() as sess:
       return sess.run(broadcast_gradient_args(xs, ys))
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     r = self._GetBroadcastShape([2, 3, 5], [1])
     self.assertAllEqual(r, [2, 3, 5])
@@ -66,6 +68,7 @@ class BcastOpsTest(test.TestCase):
     r = self._GetBroadcastShape([3, 1], [2, 1, 5])
     self.assertAllEqual(r, [2, 3, 5])
 
+  @test_util.run_deprecated_v1
   def testBasicGradient(self):
     r0, r1 = self._GetGradientArgs([2, 3, 5], [1])
     self.assertAllEqual(r0, [])
@@ -107,6 +110,7 @@ class BcastOpsTest(test.TestCase):
     self.assertAllEqual(r0, [0, 2])
     self.assertAllEqual(r1, [1])
 
+  @test_util.run_deprecated_v1
   def testZeroDims(self):
     r = self._GetBroadcastShape([2, 0, 3, 0, 5], [3, 0, 5])
     self.assertAllEqual(r, [2, 0, 3, 0, 5])
@@ -120,6 +124,7 @@ class BcastOpsTest(test.TestCase):
     r = self._GetBroadcastShape([3, 1, 5], [2, 0, 3, 0, 5])
     self.assertAllEqual(r, [2, 0, 3, 0, 5])
 
+  @test_util.run_deprecated_v1
   def testZeroDimsGradient(self):
     r0, r1 = self._GetGradientArgs([2, 0, 3, 0, 5], [3, 0, 5])
     self.assertAllEqual(r0, [])
@@ -137,6 +142,7 @@ class BcastOpsTest(test.TestCase):
     self.assertAllEqual(r0, [0, 1, 3])
     self.assertAllEqual(r1, [])
 
+  @test_util.run_deprecated_v1
   def testDataTypes(self):
     for dtype in [dtypes.int32, dtypes.int64]:
       r = self._GetBroadcastShape(
diff --git a/tensorflow/python/kernel_tests/benchmark_test.py b/tensorflow/python/kernel_tests/benchmark_test.py
index 78b6e38d949b2887052952ae8e34bd0b4721ce72..bffa5e6e8f4d9125f5021eb531319f67fd6e77bb 100644
--- a/tensorflow/python/kernel_tests/benchmark_test.py
+++ b/tensorflow/python/kernel_tests/benchmark_test.py
@@ -21,9 +21,12 @@ import json
 import os
 import random
 
+import numpy as np
+
 from tensorflow.core.util import test_log_pb2
 from tensorflow.python.client import session
-from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -64,11 +67,17 @@ class TestReportingBenchmark(test.Benchmark):
                 "other_key": "string"})
 
   def benchmark_times_an_op(self):
-    with session.Session() as sess:
-      a = constant_op.constant(0.0)
+    input_size = 5
+    with session.Session(config=benchmark.benchmark_config()) as sess:
+      a = array_ops.placeholder(dtype=dtypes.float32, shape=(input_size))
       a_plus_a = a + a
       return self.run_op_benchmark(
-          sess, a_plus_a, min_iters=1000, store_trace=True, name="op_benchmark")
+          sess,
+          a_plus_a,
+          feed_dict={a: np.arange(input_size)},
+          min_iters=1000,
+          store_trace=True,
+          name="op_benchmark")
 
 
 class BenchmarkTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py
index 92d21462d52f40c22aa60dac1a0c3d6b74ab2f3f..9dc34a606282e03cd5729c91bb9c4cffb10afc1c 100644
--- a/tensorflow/python/kernel_tests/betainc_op_test.py
+++ b/tensorflow/python/kernel_tests/betainc_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -48,7 +49,7 @@ class BetaincTest(test.TestCase):
       tf_x_s = constant_op.constant(x_s, dtype=dtype)
       tf_out_t = math_ops.betainc(tf_a_s, tf_b_s, tf_x_s)
       with self.cached_session():
-        tf_out = tf_out_t.eval()
+        tf_out = self.evaluate(tf_out_t)
       scipy_out = special.betainc(a_s, b_s, x_s).astype(np_dt)
 
       # the scipy version of betainc uses a double-only implementation.
@@ -109,36 +110,42 @@ class BetaincTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot test special functions: %s" % str(e))
 
+  @test_util.run_deprecated_v1
   def testBetaIncFloat(self):
     a_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testBetaIncDouble(self):
     a_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float64)
 
+  @test_util.run_deprecated_v1
   def testBetaIncDoubleVeryLargeValues(self):
     a_s = np.abs(np.random.randn(10, 10) * 1e15)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 1e15)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float64)
 
+  @test_util.run_deprecated_v1
   def testBetaIncDoubleVerySmallValues(self):
     a_s = np.abs(np.random.randn(10, 10) * 1e-16)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 1e-16)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float64)
 
+  @test_util.run_deprecated_v1
   def testBetaIncFloatVerySmallValues(self):
     a_s = np.abs(np.random.randn(10, 10) * 1e-8)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 1e-8)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testBetaIncFpropAndBpropAreNeverNAN(self):
     with self.cached_session() as sess:
       space = np.logspace(-8, 5).tolist()
@@ -159,6 +166,7 @@ class BetaincTest(test.TestCase):
       self.assertAllEqual(np.zeros_like(grads_x).astype(np.bool),
                           np.isnan(grads_x))
 
+  @test_util.run_deprecated_v1
   def testBetaIncGrads(self):
     err_tolerance = 1e-3
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/bias_op_test.py b/tensorflow/python/kernel_tests/bias_op_test.py
index 42ba6657253900c0bbac064f96cb24960c2db8ba..66f442dbddb5f609e7525ba0db9809dc3943ee25 100644
--- a/tensorflow/python/kernel_tests/bias_op_test.py
+++ b/tensorflow/python/kernel_tests/bias_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -30,19 +31,6 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
-def GetTestConfigs():
-  """Get all the valid tests configs to run.
-
-  Returns:
-    all the valid test configs as tuples of data_format and use_gpu.
-  """
-  test_configs = [("NHWC", False), ("NHWC", True)]
-  if test.is_gpu_available(cuda_only=True):
-    # "NCHW" format is currently only supported on CUDA.
-    test_configs += [("NCHW", True)]
-  return test_configs
-
-
 class BiasAddTest(test.TestCase):
 
   def _npBias(self, inputs, bias):
@@ -61,7 +49,7 @@ class BiasAddTest(test.TestCase):
 
   def _testBias(self, np_inputs, np_bias, use_gpu=False):
     np_val = self._npBias(np_inputs, np_bias)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_val = nn_ops.bias_add(np_inputs, np_bias).eval()
     self.assertAllCloseAccordingToType(np_val, tf_val)
 
@@ -74,37 +62,40 @@ class BiasAddTest(test.TestCase):
   def _NHWCToNCHW(self, np_value):
     # fill the input value to at least 3-dimension
     np_value = self._AtLeast3d(np_value)
-    # move the last dimension to third-to-last
+    # move the last dimension to second
     np_dim = list(range(np_value.ndim))
-    np_dim_new = list(np_dim[0:-3]) + list(np_dim[-1:]) + list(np_dim[-3:-1])
+    np_dim_new = list(np_dim[0:1]) + list(np_dim[-1:]) + list(np_dim[1:-1])
     return np.transpose(np_value, np_dim_new)
 
   def _NCHWToNHWC(self, np_value):
     assert len(np_value.shape) >= 3
     np_dim = list(range(np_value.ndim))
-    # move the third-to-last dimension to the last
-    np_dim_new = list(np_dim[0:-3]) + list(np_dim[-2:]) + list(np_dim[-3:-2])
+    # move the second dimension to the last
+    np_dim_new = list(np_dim[0:1]) + list(np_dim[2:]) + list(np_dim[1:2])
     return np.transpose(np_value, np_dim_new)
 
   def _testBiasNCHW(self, np_inputs, np_bias, use_gpu):
     np_val = self._npBias(np_inputs, np_bias)
     np_inputs = self._NHWCToNCHW(np_inputs)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_val = nn_ops.bias_add(np_inputs, np_bias, data_format="NCHW").eval()
     tf_val = self._NCHWToNHWC(tf_val)
     self.assertAllCloseAccordingToType(self._AtLeast3d(np_val), tf_val)
 
   def _testAll(self, np_inputs, np_bias):
     self._testBias(np_inputs, np_bias, use_gpu=False)
+    self._testBiasNCHW(np_inputs, np_bias, use_gpu=False)
     if np_inputs.dtype in [np.float16, np.float32, np.float64]:
       self._testBias(np_inputs, np_bias, use_gpu=True)
-      if test.is_gpu_available(cuda_only=True):
-        self._testBiasNCHW(np_inputs, np_bias, use_gpu=True)
+      self._testBiasNCHW(np_inputs, np_bias, use_gpu=True)
 
+
+  @test_util.run_deprecated_v1
   def testInputDims(self):
     with self.assertRaises(ValueError):
       nn_ops.bias_add([1, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBiasVec(self):
     with self.assertRaises(ValueError):
       nn_ops.bias_add(
@@ -113,6 +104,7 @@ class BiasAddTest(test.TestCase):
           array_ops.reshape(
               [1, 2], shape=[1, 2]))
 
+  @test_util.run_deprecated_v1
   def testBiasInputsMatch(self):
     with self.assertRaises(ValueError):
       nn_ops.bias_add(
@@ -121,19 +113,35 @@ class BiasAddTest(test.TestCase):
           array_ops.reshape(
               [1], shape=[1]))
 
+  @test_util.run_deprecated_v1
   def testIntTypes(self):
     for t in [np.int8, np.int16, np.int32, np.int64]:
       self._testAll(
           np.array([[10, 20, 30], [40, 50, 60]]).astype(t),
           np.array([1, 2, 3]).astype(t))
 
+  @test_util.run_deprecated_v1
   def testFloatTypes(self):
     for t in [np.float16, np.float32, np.float64]:
       self._testAll(
           np.random.rand(4, 3, 3).astype(t), np.random.rand(3).astype(t))
 
+  @test_util.run_deprecated_v1
+  def test4DFloatTypes(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testAll(
+          np.random.rand(4, 3, 2, 3).astype(t),
+          np.random.rand(3).astype(t))
+
+  @test_util.run_deprecated_v1
+  def test5DFloatTypes(self):
+    for t in [np.float16, np.float32, np.float64]:
+      self._testAll(
+          np.random.rand(4, 3, 2, 3, 4).astype(t),
+          np.random.rand(4).astype(t))
+
   def _testGradient(self, np_input, bias, dtype, data_format, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       if data_format == "NCHW":
         np_input = self._NHWCToNCHW(np_input)
       input_tensor = constant_op.constant(
@@ -187,8 +195,11 @@ class BiasAddTest(test.TestCase):
       self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
       self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
 
+  @test_util.run_deprecated_v1
   def testGradientTensor(self):
-    for (data_format, use_gpu) in GetTestConfigs():
+    # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
+    # all dimensions are supported.
+    for (data_format, use_gpu) in ("NHWC", False), ("NHWC", True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
         np_input = np.array(
             [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
@@ -196,8 +207,11 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
   def testGradientTensor4D(self):
-    for (data_format, use_gpu) in GetTestConfigs():
+    # BiasAddGrad with NCHW support 4D so all are enabled.
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
         np_input = np.arange(
             1.0, 49.0, dtype=dtype.as_numpy_dtype).reshape(
@@ -205,13 +219,17 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     np.random.seed(7)
     for shape in (0, 0), (2, 0), (0, 2), (4, 3, 0), (4, 0, 3), (0, 4, 3):
       self._testAll(np.random.randn(*shape), np.random.randn(shape[-1]))
 
+  @test_util.run_deprecated_v1
   def testEmptyGradient(self):
-    for data_format, use_gpu in GetTestConfigs():
+    # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
+    # all dimensions are supported.
+    for (data_format, use_gpu) in ("NHWC", False), ("NHWC", True):
       for shape in (0, 0), (2, 0), (0, 2), (4, 3, 0), (4, 0, 3), (0, 4, 3):
         self._testGradient(
             np.random.randn(*shape),
diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py
index 8a58b3f97ecb071daa916527cad60997205fd5d6..d064d736cf253ddf6ebf3ef0f416f449fcf7f565 100644
--- a/tensorflow/python/kernel_tests/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/bincount_op_test.py
@@ -22,14 +22,17 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
 
 class BincountTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def test_empty(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(
           math_ops.bincount([], minlength=5).eval(), [0, 0, 0, 0, 0])
       self.assertAllEqual(math_ops.bincount([], minlength=1).eval(), [0])
@@ -41,8 +44,9 @@ class BincountTest(test_util.TensorFlowTestCase):
           math_ops.bincount([], minlength=3, dtype=np.float64).eval().dtype,
           np.float64)
 
+  @test_util.run_deprecated_v1
   def test_values(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(
           math_ops.bincount([1, 1, 1, 2, 2, 3]).eval(), [0, 3, 2, 1])
       arr = [1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
@@ -56,15 +60,17 @@ class BincountTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(
           math_ops.bincount(np.arange(10000)).eval(), np.ones(10000))
 
+  @test_util.run_deprecated_v1
   def test_maxlength(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(math_ops.bincount([5], maxlength=3).eval(), [0, 0, 0])
       self.assertAllEqual(math_ops.bincount([1], maxlength=3).eval(), [0, 1])
       self.assertAllEqual(math_ops.bincount([], maxlength=3).eval(), [])
 
+  @test_util.run_deprecated_v1
   def test_random_with_weights(self):
     num_samples = 10000
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       np.random.seed(42)
       for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
         arr = np.random.randint(0, 1000, num_samples)
@@ -75,9 +81,10 @@ class BincountTest(test_util.TensorFlowTestCase):
         self.assertAllClose(
             math_ops.bincount(arr, weights).eval(), np.bincount(arr, weights))
 
+  @test_util.run_deprecated_v1
   def test_random_without_weights(self):
     num_samples = 10000
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       np.random.seed(42)
       for dtype in [np.int32, np.float32]:
         arr = np.random.randint(0, 1000, num_samples)
@@ -85,8 +92,9 @@ class BincountTest(test_util.TensorFlowTestCase):
         self.assertAllClose(
             math_ops.bincount(arr, None).eval(), np.bincount(arr, weights))
 
+  @test_util.run_deprecated_v1
   def test_zero_weights(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(
           math_ops.bincount(np.arange(1000), np.zeros(1000)).eval(),
           np.zeros(1000))
@@ -97,6 +105,23 @@ class BincountTest(test_util.TensorFlowTestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         math_ops.bincount([1, 2, 3, -1, 6, 8]).eval()
 
+  @test_util.run_deprecated_v1
+  def test_shape_function(self):
+    # size must be scalar.
+    with self.assertRaisesRegexp(
+        ValueError, "Shape must be rank 0 but is rank 1 for 'Bincount'"):
+      gen_math_ops.bincount([1, 2, 3, -1, 6, 8], [1], [])
+    # size must be positive.
+    with self.assertRaisesRegexp(ValueError, "must be non-negative"):
+      gen_math_ops.bincount([1, 2, 3, -1, 6, 8], -5, [])
+    # if size is a constant then the shape is known.
+    v1 = gen_math_ops.bincount([1, 2, 3, -1, 6, 8], 5, [])
+    self.assertAllEqual(v1.get_shape().as_list(), [5])
+    # if size is a placeholder then the shape is unknown.
+    s = array_ops.placeholder(dtype=dtypes.int32)
+    v2 = gen_math_ops.bincount([1, 2, 3, -1, 6, 8], s, [])
+    self.assertAllEqual(v2.get_shape().as_list(), [None])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/kernel_tests/bitcast_op_test.py b/tensorflow/python/kernel_tests/bitcast_op_test.py
index a2c6b54273f7f617ee78253e6184befd8f81e4ac..b4f9a21a899c9207811e3c72a58180e4370c140a 100644
--- a/tensorflow/python/kernel_tests/bitcast_op_test.py
+++ b/tensorflow/python/kernel_tests/bitcast_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -28,9 +29,9 @@ from tensorflow.python.platform import test
 class BitcastTest(test.TestCase):
 
   def _testBitcast(self, x, datatype, shape):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       tf_ans = array_ops.bitcast(x, datatype)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       buff_after = memoryview(out).tobytes()
       buff_before = memoryview(x).tobytes()
       self.assertEqual(buff_before, buff_after)
@@ -59,6 +60,7 @@ class BitcastTest(test.TestCase):
     shape = [3, 4]
     self._testBitcast(x, dtypes.int64, shape)
 
+  @test_util.run_deprecated_v1
   def testErrors(self):
     x = np.zeros([1, 1], np.int8)
     datatype = dtypes.int32
@@ -71,6 +73,7 @@ class BitcastTest(test.TestCase):
     shape = [4]
     self._testBitcast(x, datatype, shape)
 
+  @test_util.run_deprecated_v1
   def testUnknown(self):
     x = array_ops.placeholder(dtypes.float32)
     datatype = dtypes.int8
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
index 7cdc67f83f0af65ff76b7109f088023220ab2b15..6b04e8abf40dc6fc396581e82b59bc6c4dec2a41 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.platform import googletest
 class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
   """Tests prediction ops for training."""
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionOnEmptyEnsemble(self):
     """Tests that prediction on a dummy ensemble does not fail."""
     with self.cached_session() as session:
@@ -61,6 +62,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(cached_node_ids, new_node_ids)
       self.assertAllClose([[0], [0]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testNoCachedPredictionButTreeExists(self):
     """Tests that predictions are updated once trees are added."""
     with self.cached_session() as session:
@@ -127,6 +129,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose([2, 1], new_node_ids)
       self.assertAllClose([[0.1 * 8.79], [0.1 * 1.14]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionIsCurrent(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -199,6 +202,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(cached_node_ids, new_node_ids)
       self.assertAllClose([[0], [0]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionFromTheSameTree(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -313,6 +317,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # 1.65 and -3.875, and then multiply them by 0.1 (lr)
       self.assertAllClose([[0.1 * 1.65], [0.1 * -3.875]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionFromPreviousTree(self):
     """Tests the predictions work when we have cache from previous trees."""
     with self.cached_session() as session:
@@ -445,6 +450,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       #            change= 0.1(1.14+7.0-7.0)
       self.assertAllClose([[1], [0.114]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCategoricalSplits(self):
     """Tests the training prediction work for categorical splits."""
     with self.cached_session() as session:
@@ -517,6 +523,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose([3, 4, 2], new_node_ids)
       self.assertAllClose([[5.], [6.], [7.]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionFromTheSameTreeWithPostPrunedNodes(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -647,6 +654,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose([[0.01], [0.01], [0.0553], [0.0783], [0.01], [0.01]],
                           logits_updates + cached_values)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionFromThePreviousTreeWithPostPrunedNodes(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -792,6 +800,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
                            [root + 0.0783], [root + 0.01], [root + 0.01]],
                           logits_updates + cached_values)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionTheWholeTreeWasPruned(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -864,6 +873,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
 class PredictionOpsTest(test_util.TensorFlowTestCase):
   """Tests prediction ops for inference."""
 
+  @test_util.run_deprecated_v1
   def testPredictionOnEmptyEnsemble(self):
     """Tests that prediction on a empty ensemble does not fail."""
     with self.cached_session() as session:
@@ -886,6 +896,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
   def testPredictionMultipleTree(self):
     """Tests the predictions work when we have multiple trees."""
     with self.cached_session() as session:
@@ -996,6 +1007,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
   def testCategoricalSplits(self):
     """Tests the predictions work for categorical splits."""
     with self.cached_session() as session:
@@ -1062,6 +1074,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 class FeatureContribsOpsTest(test_util.TensorFlowTestCase):
   """Tests feature contribs ops for model understanding."""
 
+  @test_util.run_deprecated_v1
   def testContribsForOnlyABiasNode(self):
     """Tests case when, after training, only left with a bias node.
 
@@ -1122,6 +1135,7 @@ class FeatureContribsOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(feature_ids, expected_feature_ids)
       self.assertAllClose(logits_paths, expected_logits_paths)
 
+  @test_util.run_deprecated_v1
   def testContribsMultipleTreeWhenFirstTreeIsABiasNode(self):
     """Tests case when, after training, first tree contains only a bias node."""
     with self.cached_session() as session:
@@ -1219,6 +1233,7 @@ class FeatureContribsOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(feature_ids, expected_feature_ids)
       self.assertAllClose(logits_paths, expected_logits_paths)
 
+  @test_util.run_deprecated_v1
   def testContribsMultipleTree(self):
     """Tests that the contribs work when we have multiple trees."""
     with self.cached_session() as session:
diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
index e0d46bae83a2c731d98f199c1af74196f5956201..2b9863fb89bac80f6a2f012a3f25c23f993d03ad 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
@@ -18,14 +18,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import tempfile
+
+import numpy as np
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import boosted_trees_ops
 from tensorflow.python.ops import resources
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_handle_op as resource_handle_op
 from tensorflow.python.ops.gen_boosted_trees_ops import is_boosted_trees_quantile_stream_resource_initialized as resource_initialized
 from tensorflow.python.platform import googletest
+from tensorflow.python.training import saver
 
 
 class QuantileOpsTest(test_util.TensorFlowTestCase):
@@ -57,18 +64,16 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     | 5        |     1            |   2.2     |   0.8
     """
 
-    self._feature_0 = constant_op.constant(
-        [[1.2], [12.1], [0.3], [0.5], [0.6], [2.2]], dtype=dtypes.float32)
-    self._feature_1 = constant_op.constant(
-        [[2.3], [1.2], [1.1], [2.6], [3.2], [0.8]], dtype=dtypes.float32)
-    self._feature_0_boundaries = constant_op.constant(
-        [0.3, 0.6, 1.2, 12.1], dtype=dtypes.float32)
-    self._feature_1_boundaries = constant_op.constant(
-        [0.8, 1.2, 2.3, 3.2], dtype=dtypes.float32)
-    self._feature_0_quantiles = constant_op.constant(
-        [[2], [3], [0], [1], [1], [3]], dtype=dtypes.int32)
-    self._feature_1_quantiles = constant_op.constant(
-        [[2], [1], [1], [3], [3], [0]], dtype=dtypes.int32)
+    self._feature_0 = constant_op.constant([1.2, 12.1, 0.3, 0.5, 0.6, 2.2],
+                                           dtype=dtypes.float32)
+    self._feature_1 = constant_op.constant([2.3, 1.2, 1.1, 2.6, 3.2, 0.8],
+                                           dtype=dtypes.float32)
+    self._feature_0_boundaries = np.array([0.3, 0.6, 1.2, 12.1])
+    self._feature_1_boundaries = np.array([0.8, 1.2, 2.3, 3.2])
+    self._feature_0_quantiles = constant_op.constant([2, 3, 0, 1, 1, 3],
+                                                     dtype=dtypes.int32)
+    self._feature_1_quantiles = constant_op.constant([2, 1, 1, 3, 3, 0],
+                                                     dtype=dtypes.int32)
 
     self._example_weights = constant_op.constant(
         [10, 1, 1, 1, 1, 1], dtype=dtypes.float32)
@@ -77,6 +82,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     self.max_elements = 1 << 16
     self.num_quantiles = constant_op.constant(3, dtype=dtypes.int64)
 
+  @test_util.run_v1_only("b/120545219")
   def testBasicQuantileBucketsSingleResource(self):
     with self.cached_session() as sess:
       quantile_accumulator_handle = self.create_resource("floats", self.eps,
@@ -93,14 +99,15 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
           quantile_accumulator_handle, num_features=2)
       quantiles = boosted_trees_ops.boosted_trees_bucketize(
           [self._feature_0, self._feature_1], buckets)
-      sess.run(summary_op)
-      sess.run(flush_op)
+      self.evaluate(summary_op)
+      self.evaluate(flush_op)
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
       self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
       self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testBasicQuantileBucketsMultipleResources(self):
     with self.cached_session() as sess:
       quantile_accumulator_handle_0 = self.create_resource("float_0", self.eps,
@@ -127,14 +134,79 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
           quantile_accumulator_handle_1, num_features=1)
       quantiles = boosted_trees_ops.boosted_trees_bucketize(
           [self._feature_0, self._feature_1], bucket_0 + bucket_1)
-      sess.run([summary_op_0, summary_op_1])
-      sess.run([flush_op_0, flush_op_1])
+      self.evaluate([summary_op_0, summary_op_1])
+      self.evaluate([flush_op_0, flush_op_1])
       self.assertAllClose(self._feature_0_boundaries, bucket_0[0].eval())
       self.assertAllClose(self._feature_1_boundaries, bucket_1[0].eval())
 
       self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
       self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
+  def testSaveRestoreAfterFlush(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.test_session() as sess:
+      accumulator = boosted_trees_ops.QuantileAccumulator(
+          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
+
+      save = saver.Saver()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      buckets = accumulator.get_bucket_boundaries()
+      self.assertAllClose([], buckets[0].eval())
+      self.assertAllClose([], buckets[1].eval())
+      summaries = accumulator.add_summaries([self._feature_0, self._feature_1],
+                                            self._example_weights)
+      with ops.control_dependencies([summaries]):
+        flush = accumulator.flush()
+      self.evaluate(flush)
+      self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
+      self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
+      save.save(sess, save_path)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      accumulator = boosted_trees_ops.QuantileAccumulator(
+          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
+      save = saver.Saver()
+      save.restore(sess, save_path)
+      buckets = accumulator.get_bucket_boundaries()
+      self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
+      self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
+
+  @test_util.run_v1_only("b/120545219")
+  def testSaveRestoreBeforeFlush(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.test_session() as sess:
+      accumulator = boosted_trees_ops.QuantileAccumulator(
+          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
+
+      save = saver.Saver()
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      summaries = accumulator.add_summaries([self._feature_0, self._feature_1],
+                                            self._example_weights)
+      self.evaluate(summaries)
+      buckets = accumulator.get_bucket_boundaries()
+      self.assertAllClose([], buckets[0].eval())
+      self.assertAllClose([], buckets[1].eval())
+      save.save(sess, save_path)
+      self.evaluate(accumulator.flush())
+      self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
+      self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      accumulator = boosted_trees_ops.QuantileAccumulator(
+          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
+      save = saver.Saver()
+      save.restore(sess, save_path)
+      buckets = accumulator.get_bucket_boundaries()
+      self.assertAllClose([], buckets[0].eval())
+      self.assertAllClose([], buckets[1].eval())
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
index 65bb9ab55f00c0ad9506122bf357484c7a4acd5f..0a34277bbdb43ca449923550000970e63ca14905 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -30,19 +30,21 @@ from tensorflow.python.platform import googletest
 class ResourceOpsTest(test_util.TensorFlowTestCase):
   """Tests resource_ops."""
 
+  @test_util.run_deprecated_v1
   def testCreate(self):
     with self.cached_session():
       ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
       resources.initialize_resources(resources.shared_resources()).run()
       stamp_token = ensemble.get_stamp_token()
-      self.assertEqual(0, stamp_token.eval())
+      self.assertEqual(0, self.evaluate(stamp_token))
       (_, num_trees, num_finalized_trees, num_attempted_layers,
        nodes_range) = ensemble.get_states()
-      self.assertEqual(0, num_trees.eval())
-      self.assertEqual(0, num_finalized_trees.eval())
-      self.assertEqual(0, num_attempted_layers.eval())
-      self.assertAllEqual([0, 1], nodes_range.eval())
+      self.assertEqual(0, self.evaluate(num_trees))
+      self.assertEqual(0, self.evaluate(num_finalized_trees))
+      self.assertEqual(0, self.evaluate(num_attempted_layers))
+      self.assertAllEqual([0, 1], self.evaluate(nodes_range))
 
+  @test_util.run_deprecated_v1
   def testCreateWithProto(self):
     with self.cached_session():
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
@@ -154,12 +156,13 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
        nodes_range) = ensemble.get_states()
-      self.assertEqual(7, stamp_token.eval())
-      self.assertEqual(2, num_trees.eval())
-      self.assertEqual(1, num_finalized_trees.eval())
-      self.assertEqual(6, num_attempted_layers.eval())
-      self.assertAllEqual([16, 19], nodes_range.eval())
+      self.assertEqual(7, self.evaluate(stamp_token))
+      self.assertEqual(2, self.evaluate(num_trees))
+      self.assertEqual(1, self.evaluate(num_finalized_trees))
+      self.assertEqual(6, self.evaluate(num_attempted_layers))
+      self.assertAllEqual([16, 19], self.evaluate(nodes_range))
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserialize(self):
     with self.cached_session():
       # Initialize.
@@ -167,11 +170,11 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
        nodes_range) = ensemble.get_states()
-      self.assertEqual(5, stamp_token.eval())
-      self.assertEqual(0, num_trees.eval())
-      self.assertEqual(0, num_finalized_trees.eval())
-      self.assertEqual(0, num_attempted_layers.eval())
-      self.assertAllEqual([0, 1], nodes_range.eval())
+      self.assertEqual(5, self.evaluate(stamp_token))
+      self.assertEqual(0, self.evaluate(num_trees))
+      self.assertEqual(0, self.evaluate(num_finalized_trees))
+      self.assertEqual(0, self.evaluate(num_attempted_layers))
+      self.assertAllEqual([0, 1], self.evaluate(nodes_range))
 
       # Deserialize.
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
@@ -219,18 +222,18 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       ]):
         (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
          nodes_range) = ensemble.get_states()
-      self.assertEqual(3, stamp_token.eval())
-      self.assertEqual(1, num_trees.eval())
+      self.assertEqual(3, self.evaluate(stamp_token))
+      self.assertEqual(1, self.evaluate(num_trees))
       # This reads from metadata, not really counting the layers.
-      self.assertEqual(5, num_attempted_layers.eval())
-      self.assertEqual(0, num_finalized_trees.eval())
-      self.assertAllEqual([3, 7], nodes_range.eval())
+      self.assertEqual(5, self.evaluate(num_attempted_layers))
+      self.assertEqual(0, self.evaluate(num_finalized_trees))
+      self.assertAllEqual([3, 7], self.evaluate(nodes_range))
 
 
       # Serialize.
       new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
       new_stamp_token, new_serialized = ensemble.serialize()
-      self.assertEqual(3, new_stamp_token.eval())
+      self.assertEqual(3, self.evaluate(new_stamp_token))
       new_ensemble_proto.ParseFromString(new_serialized.eval())
       self.assertProtoEquals(ensemble_proto, new_ensemble_proto)
 
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index 09e9cfa3affb9750938f2292e6e2dc3edddecedb..e2e23486b5a9fb93e11971147b0784a62e636a7b 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -65,16 +65,16 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
       self.assertAllClose([[0.004775, 0.41184], [0.02823, 0.41184]],
-                          sess.run(gains_list))
-      self.assertAllEqual([[1, 1], [1, 1]], sess.run(thresholds_list))
+                          self.evaluate(gains_list))
+      self.assertAllEqual([[1, 1], [1, 1]], self.evaluate(thresholds_list))
       # The left node contrib will be later added to the previous node value to
       # make the left node value, and the same for right node contrib.
       self.assertAllClose([[[-.416667], [.568966]], [[-.6], [-.75]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
       self.assertAllClose([[[-.592593], [-.75]], [[-.076923], [.568966]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
   def testCalculateBestGainsWithL2(self):
     """Testing Gain calculation with L2."""
@@ -113,16 +113,16 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
       self.assertAllClose([[0., 0.33931375], [0.01879096, 0.33931375]],
-                          sess.run(gains_list))
-      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+                          self.evaluate(gains_list))
+      self.assertAllEqual([[0, 1], [1, 1]], self.evaluate(thresholds_list))
       # The left node contrib will be later added to the previous node value to
       # make the left node value, and the same for right node contrib.
       self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
       self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
   def testCalculateBestGainsWithL1(self):
     """Testing Gain calculation with L1."""
@@ -162,18 +162,18 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      self.assertAllEqual([[0, 1], [1, 1]], self.evaluate(thresholds_list))
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
       self.assertAllClose([[[0.0], [0.3965517]], [[-0.4], [-0.5]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
 
       self.assertAllClose([[[-0.3333333], [-0.5]], [[0.0], [0.396552]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
       # Gain should also include an adjustment of the gradient by l1.
       self.assertAllClose([[0.0, 0.191207], [0.01, 0.191207]],
-                          sess.run(gains_list))
+                          self.evaluate(gains_list))
 
   def testCalculateBestGainsWithTreeComplexity(self):
     """Testing Gain calculation with L2."""
@@ -214,18 +214,18 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
 
       self.assertAllClose([[-3., -2.66068625], [-2.98120904, -2.66068625]],
-                          sess.run(gains_list))
+                          self.evaluate(gains_list))
 
-      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      self.assertAllEqual([[0, 1], [1, 1]], self.evaluate(thresholds_list))
       # The left node contrib will be later added to the previous node value to
       # make the left node value, and the same for right node contrib.
       self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
       self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
   def testCalculateBestGainsWithMinNodeWeight(self):
     """Testing Gain calculation without any regularization."""
@@ -266,13 +266,13 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
 
       # We can't split node 1 on feature 1 and node 2 on feature 2 because of
       # the min node weight.
-      self.assertAllEqual([[2], [1]], sess.run(node_ids_list))
-      self.assertAllClose([[0.384314], [0.098013]], sess.run(gains_list))
-      self.assertAllEqual([[1], [1]], sess.run(thresholds_list))
+      self.assertAllEqual([[2], [1]], self.evaluate(node_ids_list))
+      self.assertAllClose([[0.384314], [0.098013]], self.evaluate(gains_list))
+      self.assertAllEqual([[1], [1]], self.evaluate(thresholds_list))
       self.assertAllClose([[[0.4852941]], [[-.6]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
       self.assertAllClose([[[-0.75]], [[-0.014925]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
   def testCalculateBestGainsWithMinNodeWeightNoSplitOnFeturePossible(self):
     """Testing Gain calculation without any regularization."""
@@ -311,9 +311,9 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
            max_splits=max_splits)
 
       # We can't split either of the nodes on the first feature
-      self.assertEqual(2, len(sess.run(node_ids_list)))
-      self.assertAllEqual([], sess.run(node_ids_list)[0])
-      self.assertAllEqual([1], sess.run(node_ids_list)[1])
+      self.assertEqual(2, len(self.evaluate(node_ids_list)))
+      self.assertAllEqual([], self.evaluate(node_ids_list)[0])
+      self.assertAllEqual([1], self.evaluate(node_ids_list)[1])
 
       # Now check when we can't split on any feature
       (node_ids_list, _, _, _,
@@ -325,8 +325,9 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
            tree_complexity=0.0,
            min_node_weight=10,
            max_splits=max_splits)
-      self.assertAllEqual([[], []], sess.run(node_ids_list))
+      self.assertAllEqual([[], []], self.evaluate(node_ids_list))
 
+  @test_util.run_deprecated_v1
   def testMakeStatsSummarySimple(self):
     """Simple test for MakeStatsSummary."""
     with self.cached_session():
@@ -359,7 +360,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
               [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
               [[-.33, .58], [0., 0.], [.3, .4], [0., 0.]],  # node 2
           ]],
-          result.eval())
+          self.evaluate(result))
 
   def testMakeStatsSummaryMultipleFeatures(self):
     """Tests that MakeStatsSummary works for multiple features."""
@@ -389,7 +390,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
                   [[.3, .4], [0., 0.], [-.4, .5], [.07, .08]],  # node 2
               ],  # feature 1
           ],
-          result.eval())
+          self.evaluate(result))
 
   def _verify_precision(self, length):
     with self.cached_session():
@@ -408,7 +409,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           node_ids, gradients, hessians, [bucketized_features], max_splits,
           num_buckets)  # shape=[max_splits, num_buckets, num_features, 2]
 
-      self.assertAllClose([[[[2., 0.2]]]], result.eval())
+      self.assertAllClose([[[[2., 0.2]]]], self.evaluate(result))
 
   def testMakeStatsSummaryNumericalPrecisionSmallBatch(self):
     """Tests numeric precision."""
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
index ea022820e44e5461585c35a5bd4b9e256d923d13..afc0564fc5a7939d6a7ec7b3f4c3f2108c00ac92 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.platform import googletest
 class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
   """Tests for growing tree ensemble from split candidates."""
 
+  @test_util.run_deprecated_v1
   def testGrowWithEmptyEnsemble(self):
     """Test growing an empty ensemble."""
     with self.cached_session() as session:
@@ -139,6 +140,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testBiasCenteringOnEmptyEnsemble(self):
     """Test growing with bias centering on an empty ensemble."""
     with self.cached_session() as session:
@@ -182,6 +184,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testGrowExistingEnsembleTreeNotFinalized(self):
     """Test growing an existing ensemble with the last tree not finalized."""
     with self.cached_session() as session:
@@ -366,6 +369,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testGrowExistingEnsembleTreeFinalized(self):
     """Test growing an existing ensemble with the last tree finalized."""
     with self.cached_session() as session:
@@ -515,6 +519,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testPrePruning(self):
     """Test growing an existing ensemble with pre-pruning."""
     with self.cached_session() as session:
@@ -671,6 +676,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testMetadataWhenCantSplitDueToEmptySplits(self):
     """Test that the metadata is updated even though we can't split."""
     with self.cached_session() as session:
@@ -782,6 +788,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testMetadataWhenCantSplitDuePrePruning(self):
     """Test metadata is updated correctly when no split due to prepruning."""
     with self.cached_session() as session:
@@ -917,6 +924,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testPostPruningOfSomeNodes(self):
     """Test growing an ensemble with post-pruning."""
     with self.cached_session() as session:
@@ -1251,6 +1259,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 3)
       self.assertProtoEquals(expected_result, res_ensemble)
 
+  @test_util.run_deprecated_v1
   def testPostPruningOfAllNodes(self):
     """Test growing an ensemble with post-pruning, with all nodes are pruned."""
     with self.cached_session() as session:
@@ -1434,6 +1443,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       }
       """, res_ensemble)
 
+  @test_util.run_deprecated_v1
   def testPostPruningChangesNothing(self):
     """Test growing an ensemble with post-pruning with all gains >0."""
     with self.cached_session() as session:
diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
index 09c325f2bc954bd08c7d3342efacace6df575be1..b9eb2391b490f659bd20e26a2c5b290ab4bfea1b 100644
--- a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
+++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
@@ -21,7 +21,6 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
@@ -30,32 +29,36 @@ from tensorflow.python.platform import test as test_lib
 
 class BroadcastToTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBroadcastToBasic(self):
     for dtype in [np.uint8, np.uint16, np.int8, np.int16, np.int32, np.int64]:
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         x = np.array([1, 2, 3], dtype=dtype)
         v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
         v_np = np.broadcast_to(x, [3, 3])
         self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastToString(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = np.array([b"1", b"2", b"3"])
       v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
       v_np = np.broadcast_to(x, [3, 3])
       self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastToBool(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = np.array([True, False, True], dtype=np.bool)
       v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
       v_np = np.broadcast_to(x, [3, 3])
       self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastToShape(self):
     for input_dim in range(1, 6):
       for output_dim in range(input_dim, 6):
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           input_shape = [2] * input_dim
           output_shape = [2] * output_dim
           x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
@@ -63,16 +66,26 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
           v_np = np.broadcast_to(x, output_shape)
           self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastToScalar(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = np.array(1, dtype=np.int32)
       v_tf = array_ops.broadcast_to(constant_op.constant(x), [3, 3])
       v_np = np.broadcast_to(x, [3, 3])
       self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
+  def testBroadcastScalarToNonScalar(self):
+    with self.session(use_gpu=True):
+      x = np.array(1.0, dtype=np.float)
+      v_tf = array_ops.broadcast_to(constant_op.constant(1.0), [2, 3, 4])
+      v_np = np.broadcast_to(x, [2, 3, 4])
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  @test_util.run_deprecated_v1
   def testBroadcastToShapeTypeAndInference(self):
     for dtype in [dtypes.int32, dtypes.int64]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = np.array([1, 2, 3])
         v_tf = array_ops.broadcast_to(
             constant_op.constant(x),
@@ -83,18 +96,17 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
         # check shape inference when shape input is constant
         self.assertAllEqual(shape, v_np.shape)
 
+  @test_util.run_deprecated_v1
   def testGradientForScalar(self):
-    # TODO(alextp): There is a bug with broadcast_to on GPU from scalars,
-    # hence we make this test cpu-only.
-    with ops.device("cpu:0"):
-      x = constant_op.constant(1, dtype=dtypes.float32)
-      v = array_ops.broadcast_to(x, [2, 4, 3])
-      out = 2 * v
-      with self.cached_session():
-        err = gradient_checker.compute_gradient_error(x, x.get_shape(),
-                                                      out, out.get_shape())
+    x = constant_op.constant(1, dtype=dtypes.float32)
+    v = array_ops.broadcast_to(x, [2, 4, 3])
+    out = 2 * v
+    with self.cached_session():
+      err = gradient_checker.compute_gradient_error(x, x.get_shape(), out,
+                                                    out.get_shape())
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithSameRank(self):
     x = constant_op.constant(np.reshape(np.arange(6), (2, 1, 3)),
                              dtype=dtypes.float32)
@@ -105,6 +117,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
                                                     out, out.get_shape())
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithIncreasingRank(self):
     x = constant_op.constant([[1], [2]],
                              dtype=dtypes.float32)
@@ -115,6 +128,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
                                                     out, out.get_shape())
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithBroadcastAllDimensions(self):
     x = constant_op.constant([[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32)
     v = array_ops.broadcast_to(x, [5, 4, 6])
diff --git a/tensorflow/python/kernel_tests/bucketize_op_test.py b/tensorflow/python/kernel_tests/bucketize_op_test.py
index e612b1c1349b95899cc4809155732474e50d4b84..95df6943705d3bfcc1e6674782526d3d68fc577a 100644
--- a/tensorflow/python/kernel_tests/bucketize_op_test.py
+++ b/tensorflow/python/kernel_tests/bucketize_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -31,32 +32,33 @@ class BucketizationOpTest(test.TestCase):
         constant_op.constant([-5, 0, 2, 3, 5, 8, 10, 11, 12]),
         boundaries=[0, 3, 8, 11])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
-    with self.test_session(use_gpu=True) as sess:
-      self.assertAllEqual(expected_out, sess.run(op))
+    with self.session(use_gpu=True) as sess:
+      self.assertAllEqual(expected_out, self.evaluate(op))
 
   def testFloat(self):
     op = math_ops._bucketize(
         constant_op.constant([-5., 0., 2., 3., 5., 8., 10., 11., 12.]),
         boundaries=[0., 3., 8., 11.])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
-    with self.test_session(use_gpu=True) as sess:
-      self.assertAllEqual(expected_out, sess.run(op))
+    with self.session(use_gpu=True) as sess:
+      self.assertAllEqual(expected_out, self.evaluate(op))
 
   def test2DInput(self):
     op = math_ops._bucketize(
         constant_op.constant([[-5, 0, 2, 3, 5], [8, 10, 11, 12, 0]]),
         boundaries=[0, 3, 8, 11])
     expected_out = [[0, 1, 1, 2, 2], [3, 3, 4, 4, 1]]
-    with self.test_session(use_gpu=True) as sess:
-      self.assertAllEqual(expected_out, sess.run(op))
+    with self.session(use_gpu=True) as sess:
+      self.assertAllEqual(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def testInvalidBoundariesOrder(self):
     op = math_ops._bucketize(
         constant_op.constant([-5, 0]), boundaries=[0, 8, 3, 11])
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError, "Expected sorted boundaries"):
-        sess.run(op)
+        self.evaluate(op)
 
   def testBoundariesNotList(self):
     with self.assertRaisesRegexp(
diff --git a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
index b19077db560363a22ab3c4c5400541edb9ab4600..fa6eb5c968965f0bd1f4e38ae8eec1d8f632d086 100644
--- a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
+++ b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import candidate_sampling_ops
 from tensorflow.python.ops import math_ops
@@ -37,6 +38,7 @@ class RangeSamplerOpsTest(test.TestCase):
 
   TRUE_LABELS = [[1, 2], [0, 4], [3, 3]]
 
+  @test_util.run_deprecated_v1
   def testTrueCandidates(self):
     with self.cached_session() as sess:
       indices = constant_op.constant([0, 0, 1, 1, 2, 2])
@@ -55,7 +57,7 @@ class RangeSamplerOpsTest(test.TestCase):
           [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64)
       sampled_candidates, _, _ = candidate_sampling_ops.all_candidate_sampler(
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
-      result = sampled_candidates.eval()
+      result = self.evaluate(sampled_candidates)
 
     expected_ids = [0, 1, 2, 3, 4]
     self.assertAllEqual(result, expected_ids)
@@ -68,7 +70,7 @@ class RangeSamplerOpsTest(test.TestCase):
       _, true_expected_count, _ = candidate_sampling_ops.all_candidate_sampler(
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       true_log_expected_count = math_ops.log(true_expected_count)
-      result = true_log_expected_count.eval()
+      result = self.evaluate(true_log_expected_count)
 
     self.assertAllEqual(result, [[0.0] * self.NUM_TRUE] * self.BATCH_SIZE)
     self.assertEqual(true_expected_count.get_shape(),
@@ -83,7 +85,7 @@ class RangeSamplerOpsTest(test.TestCase):
       _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler(  # pylint: disable=line-too-long
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       sampled_log_expected_count = math_ops.log(sampled_expected_count)
-      result = sampled_log_expected_count.eval()
+      result = self.evaluate(sampled_log_expected_count)
 
     self.assertAllEqual(result, [0.0] * self.NUM_SAMPLED)
     self.assertEqual(sampled_expected_count.get_shape(), [self.NUM_SAMPLED])
@@ -97,7 +99,7 @@ class RangeSamplerOpsTest(test.TestCase):
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       accidental_hits = candidate_sampling_ops.compute_accidental_hits(
           true_classes, sampled_candidates, self.NUM_TRUE)
-      indices, ids, weights = sess.run(accidental_hits)
+      indices, ids, weights = self.evaluate(accidental_hits)
 
     self.assertEqual(1, accidental_hits[0].get_shape().ndims)
     self.assertEqual(1, accidental_hits[1].get_shape().ndims)
@@ -106,6 +108,7 @@ class RangeSamplerOpsTest(test.TestCase):
       self.assertTrue(id_ in self.TRUE_LABELS[index])
       self.assertLess(weight, -1.0e37)
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
 
     def draw(seed):
@@ -114,7 +117,7 @@ class RangeSamplerOpsTest(test.TestCase):
             [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64)
         sampled, _, _ = candidate_sampling_ops.log_uniform_candidate_sampler(
             true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True, 5, seed=seed)
-        return sampled.eval()
+        return self.evaluate(sampled)
 
     # Non-zero seed. Repeatable.
     for seed in [1, 12, 123, 1234]:
diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py
index c90520e46d4c612ec9d9bc921db921d8582439d6..b3187e1637193a8b34f7f3668220d94d783b6170 100644
--- a/tensorflow/python/kernel_tests/cast_op_test.py
+++ b/tensorflow/python/kernel_tests/cast_op_test.py
@@ -25,6 +25,7 @@ import platform
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
@@ -54,7 +55,7 @@ class CastOpTest(test.TestCase):
       return None
 
   def _cast(self, x, dtype, use_gpu=False):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       val = constant_op.constant(x, self._toDataType(np.array([x]).dtype))
       return math_ops.cast(val, self._toDataType(dtype), name="cast").eval()
 
@@ -90,10 +91,12 @@ class CastOpTest(test.TestCase):
     if x.dtype == np.float32 or x.dtype == np.float64:
       self._testTypes(x, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     self._testAll(np.arange(-10, 10).reshape(2, 10))
     self._testAll(np.linspace(-10, 10, 17))
 
+  @test_util.run_deprecated_v1
   def testSmallValues(self):
     f4 = np.finfo(np.float32)
     f8 = np.finfo(np.float64)
@@ -105,13 +108,14 @@ class CastOpTest(test.TestCase):
 
   def testBfloat16(self):
     a = np.random.uniform(-100, 100, 100).astype(np.float32)
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       b = math_ops.cast(math_ops.cast(a, dtypes.bfloat16), dtypes.float32)
-      self.assertAllClose(a, b.eval(), rtol=1 / 128.)
-    with self.test_session(use_gpu=True):
+      self.assertAllClose(a, self.evaluate(b), rtol=1 / 128.)
+    with self.cached_session(use_gpu=True):
       b = math_ops.cast(math_ops.cast(a, dtypes.bfloat16), dtypes.float32)
-      self.assertAllClose(a, b.eval(), rtol=1 / 128.)
+      self.assertAllClose(a, self.evaluate(b), rtol=1 / 128.)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     self._testAll(np.random.normal(0, 10, 210).reshape([2, 3, 5, 7]))
     self._testAll(np.random.normal(0, 1e6, 210).reshape([2, 3, 5, 7]))
@@ -124,6 +128,7 @@ class CastOpTest(test.TestCase):
         self._cast(
             x, dst_dtype, use_gpu=use_gpu), dst_dtype(expected))
 
+  @test_util.run_deprecated_v1
   def testIntToFloatBoundary(self):
     i4 = np.iinfo(np.int32)
     i8 = np.iinfo(np.int64)
@@ -138,6 +143,7 @@ class CastOpTest(test.TestCase):
     self._compare(i8.max, np.float64, i8.max, False)
     # NOTE: GPU does not support int32/int64 for casting.
 
+  @test_util.run_deprecated_v1
   def testInfNan(self):
     i4 = np.iinfo(np.int32)
     i8 = np.iinfo(np.int64)
@@ -181,14 +187,16 @@ class CastOpTest(test.TestCase):
   def testNotImplemented(self):
     self._OpError(np.arange(0, 10), dtypes.string, "Cast.*int64.*string.*")
 
+  @test_util.run_deprecated_v1
   def testCastToTypeOfVariable(self):
     with self.cached_session() as sess:
       x = variables.Variable(5, dtype=dtypes.float32)
       y = variables.Variable(True, dtype=dtypes.bool)
       cast = math_ops.cast(y, x.dtype)
       variables.global_variables_initializer().run()
-      self.assertEqual(1.0, sess.run(cast))
+      self.assertEqual(1.0, self.evaluate(cast))
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     t = [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
     for src_t in t:
@@ -203,6 +211,7 @@ class CastOpTest(test.TestCase):
 
 class SparseTensorCastTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCast(self):
     indices = constant_op.constant([[0], [1], [2]], dtypes.int64)
     values = constant_op.constant(np.array([1, 2, 3], np.int64))
@@ -229,7 +238,7 @@ class SaturateCastTest(test.TestCase):
               [lo, lo + 1, lo // 2, hi // 2, hi - 1, hi], dtype=in_type)
           y = math_ops.saturate_cast(x, dtype=out_type)
           self.assertEqual(y.dtype, out_type)
-          x, y = sess.run([x, y])
+          x, y = self.evaluate([x, y])
           correct = np.maximum(out_type.min, np.minimum(out_type.max, x))
           self.assertAllEqual(correct, y)
 
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index bd4011d58e6c50f49c4dd95f02b396ed684e76a2..95bac85027bd1709420dcfc7f96f92195f8f2472 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -39,6 +40,69 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
+class AssertV2Asserts(test.TestCase):
+
+  def test_passes_when_it_should(self):
+    # This is a v2 test and need to run eagerly
+    with context.eager_mode():
+      c1 = constant_op.constant(-1, name="minus_one", dtype=dtypes.int32)
+      c2 = constant_op.constant(2, name="two", dtype=dtypes.int32)
+      c3 = constant_op.constant([3., 3.], name="three", dtype=dtypes.float32)
+      c4 = constant_op.constant([3., 3.5], name="three_and_a_half",
+                                dtype=dtypes.float32)
+      scalar = c1
+      non_scalar = c3
+      integer = c1
+      non_integer = c3
+      positive = c2
+      negative = c1
+      cases = [
+          (check_ops.assert_equal_v2, (c1, c1), (c1, c2)),
+          (check_ops.assert_less_v2, (c1, c2), (c1, c1)),
+          (check_ops.assert_near_v2, (c3, c3), (c3, c4)),
+          (check_ops.assert_greater_v2, (c2, c1), (c1, c1)),
+          (check_ops.assert_negative_v2, (negative,), (positive,)),
+          (check_ops.assert_positive_v2, (positive,), (negative,)),
+          (check_ops.assert_less_equal_v2, (c1, c1), (c2, c1)),
+          (check_ops.assert_none_equal_v2, (c1, c2), (c3, c4)),
+          (check_ops.assert_non_negative_v2, (positive,), (negative,)),
+          (check_ops.assert_non_positive_v2, (negative,), (positive,)),
+          (check_ops.assert_greater_equal_v2, (c1, c1), (c1, c2)),
+          (check_ops.assert_type_v2, (c1, dtypes.int32), (c1, dtypes.float32),
+           TypeError),
+          (check_ops.assert_integer_v2, (integer,), (non_integer,),
+           TypeError),
+          (check_ops.assert_scalar_v2, (scalar,), (non_scalar,),
+           ValueError),
+          (check_ops.assert_rank_v2, (c1, 0), (c3, 2), ValueError),
+          (check_ops.assert_rank_in_v2, (c1, [0, 1]), (c1, [1, 2]),
+           ValueError),
+          (check_ops.assert_rank_at_least_v2, (non_scalar, 1), (scalar, 1),
+           ValueError),
+      ]
+
+      for case in cases:
+        fn = case[0]
+        passing_args = case[1]
+        failing_args = case[2]
+        error = errors.InvalidArgumentError if len(case) < 4 else case[3]
+
+        print("Testing %s passing properly." % fn)
+
+        fn(*passing_args)
+
+        print("Testing %s failing properly." % fn)
+
+        @def_function.function
+        def failing_fn():
+          fn(*failing_args, message="fail")  # pylint: disable=cell-var-from-loop
+
+        with self.assertRaisesRegexp(error, "fail"):
+          failing_fn()
+
+        del failing_fn
+
+
 class AssertProperIterableTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -109,6 +173,7 @@ class AssertEqualTest(test.TestCase):
       assert x is None
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_greater(self):
     # Static check
     static_small = constant_op.constant([1, 2], name="small")
@@ -116,6 +181,7 @@ class AssertEqualTest(test.TestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
       check_ops.assert_equal(static_big, static_small, message="fail")
 
+  @test_util.run_deprecated_v1
   def test_raises_when_greater_dynamic(self):
     with self.cached_session():
       small = array_ops.placeholder(dtypes.int32, name="small")
@@ -187,6 +253,7 @@ First 2 elements of y:
                                summarize=2)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_less(self):
     # Static check
     static_small = constant_op.constant([3, 1], name="small")
@@ -194,6 +261,7 @@ First 2 elements of y:
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
       check_ops.assert_equal(static_big, static_small, message="fail")
 
+  @test_util.run_deprecated_v1
   def test_raises_when_less_dynamic(self):
     with self.cached_session():
       small = array_ops.placeholder(dtypes.int32, name="small")
@@ -253,6 +321,7 @@ class AssertNoneEqualTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_equal(self):
     small = constant_op.constant([3, 1], name="small")
     with self.assertRaisesOpError("x != y did not hold"):
@@ -302,6 +371,30 @@ class AssertNoneEqualTest(test.TestCase):
       x = check_ops.assert_none_equal(t1, t2)
       assert x is None
 
+  def test_error_message_eager(self):
+    # Note that the following three strings are regexes
+    expected_error_msg_full = r"""0.0, 1.0, 2.0, 3.0, 4.0, 5.0"""
+    expected_error_msg_default = r"""0.0, 1.0, 2.0, \.\.\."""
+    expected_error_msg_short = r"""0.0, 1.0, \.\.\."""
+    with context.eager_mode():
+      t = constant_op.constant(
+          np.array(range(6)), shape=[2, 3], dtype=np.float32)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   expected_error_msg_full):
+        check_ops.assert_none_equal(
+            t, t, message="This is the error message.", summarize=10)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   expected_error_msg_full):
+        check_ops.assert_none_equal(
+            t, t, message="This is the error message.", summarize=-1)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   expected_error_msg_default):
+        check_ops.assert_none_equal(t, t, message="This is the error message.")
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   expected_error_msg_short):
+        check_ops.assert_none_equal(
+            t, t, message="This is the error message.", summarize=2)
+
 
 class AssertAllCloseTest(test.TestCase):
 
@@ -418,6 +511,7 @@ class AssertAllCloseTest(test.TestCase):
 class AssertLessTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_equal(self):
     small = constant_op.constant([1, 2], name="small")
     with self.assertRaisesOpError("failure message.*\n*.* x < y did not hold"):
@@ -428,6 +522,7 @@ class AssertLessTest(test.TestCase):
       self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_greater(self):
     small = constant_op.constant([1, 2], name="small")
     big = constant_op.constant([3, 4], name="big")
@@ -494,6 +589,7 @@ class AssertLessEqualTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_greater(self):
     small = constant_op.constant([1, 2], name="small")
     big = constant_op.constant([3, 4], name="big")
@@ -549,6 +645,7 @@ class AssertLessEqualTest(test.TestCase):
 class AssertGreaterTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_equal(self):
     small = constant_op.constant([1, 2], name="small")
     with self.assertRaisesOpError("fail"):
@@ -559,6 +656,7 @@ class AssertGreaterTest(test.TestCase):
       self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_less(self):
     small = constant_op.constant([1, 2], name="small")
     big = constant_op.constant([3, 4], name="big")
@@ -618,6 +716,7 @@ class AssertGreaterEqualTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_less(self):
     small = constant_op.constant([1, 2], name="small")
     big = constant_op.constant([3, 4], name="big")
@@ -682,6 +781,7 @@ class AssertNegativeTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_positive(self):
     doug = constant_op.constant([1, 2], name="doug")
     with self.assertRaisesOpError("fail"):
@@ -692,6 +792,7 @@ class AssertNegativeTest(test.TestCase):
       self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_zero(self):
     claire = constant_op.constant([0], name="claire")
     with self.assertRaisesOpError("x < 0 did not hold"):
@@ -714,6 +815,7 @@ class AssertNegativeTest(test.TestCase):
 class AssertPositiveTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_negative(self):
     freddie = constant_op.constant([-1, -2], name="freddie")
     with self.assertRaisesOpError("fail"):
@@ -731,6 +833,7 @@ class AssertPositiveTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_zero(self):
     meechum = constant_op.constant([0], name="meechum")
     with self.assertRaisesOpError("x > 0 did not hold"):
@@ -753,26 +856,31 @@ class AssertPositiveTest(test.TestCase):
 class EnsureShapeTest(test.TestCase):
 
   # Static shape inference
+  @test_util.run_deprecated_v1
   def testStaticShape(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     ensure_shape_op = check_ops.ensure_shape(placeholder, (3, 3, 3))
     self.assertEqual(ensure_shape_op.get_shape(), (3, 3, 3))
 
+  @test_util.run_deprecated_v1
   def testStaticShape_MergesShapes(self):
     placeholder = array_ops.placeholder(dtypes.int32, shape=(None, None, 3))
     ensure_shape_op = check_ops.ensure_shape(placeholder, (5, 4, None))
     self.assertEqual(ensure_shape_op.get_shape(), (5, 4, 3))
 
+  @test_util.run_deprecated_v1
   def testStaticShape_RaisesErrorWhenRankIncompatible(self):
     placeholder = array_ops.placeholder(dtypes.int32, shape=(None, None, 3))
     with self.assertRaises(ValueError):
       check_ops.ensure_shape(placeholder, (2, 3))
 
+  @test_util.run_deprecated_v1
   def testStaticShape_RaisesErrorWhenDimIncompatible(self):
     placeholder = array_ops.placeholder(dtypes.int32, shape=(None, None, 3))
     with self.assertRaises(ValueError):
       check_ops.ensure_shape(placeholder, (2, 2, 4))
 
+  @test_util.run_deprecated_v1
   def testStaticShape_CanSetUnknownShape(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
@@ -780,6 +888,7 @@ class EnsureShapeTest(test.TestCase):
     self.assertEqual(ensure_shape_op.get_shape(), None)
 
   # Dynamic shape check
+  @test_util.run_deprecated_v1
   def testEnsuresDynamicShape_RaisesError(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = math_ops.divide(placeholder, 3, name="MyDivide")
@@ -792,6 +901,7 @@ class EnsureShapeTest(test.TestCase):
           r"expected shape \[3,3,3\]."):
         sess.run(derived, feed_dict={placeholder: feed_val})
 
+  @test_util.run_deprecated_v1
   def testEnsuresDynamicShape_RaisesErrorDimUnknown(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
@@ -804,6 +914,7 @@ class EnsureShapeTest(test.TestCase):
           r"expected shape \[\?,\?,3\]."):
         sess.run(derived, feed_dict={placeholder: feed_val})
 
+  @test_util.run_deprecated_v1
   def testEnsuresDynamicShape(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
@@ -812,6 +923,7 @@ class EnsureShapeTest(test.TestCase):
     with self.cached_session() as sess:
       sess.run(derived, feed_dict={placeholder: feed_val})
 
+  @test_util.run_deprecated_v1
   def testEnsuresDynamicShape_WithUnknownDims(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
@@ -820,6 +932,7 @@ class EnsureShapeTest(test.TestCase):
     with self.cached_session() as sess:
       sess.run(derived, feed_dict={placeholder: feed_val})
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     placeholder = array_ops.placeholder(dtypes.float32)
     derived = check_ops.ensure_shape(placeholder, (None, None))
@@ -915,6 +1028,7 @@ class AssertRankTest(test.TestCase):
               tensor, desired_rank, message="fail")]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -933,6 +1047,7 @@ class AssertRankTest(test.TestCase):
         [check_ops.assert_rank(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -950,6 +1065,7 @@ class AssertRankTest(test.TestCase):
           [check_ops.assert_rank(tensor, desired_rank)]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_raises_if_rank_too_large_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -967,6 +1083,7 @@ class AssertRankTest(test.TestCase):
         [check_ops.assert_rank(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -984,6 +1101,7 @@ class AssertRankTest(test.TestCase):
           [check_ops.assert_rank(tensor, desired_rank)]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -999,6 +1117,7 @@ class AssertRankTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
       check_ops.assert_rank(tensor, np.array([], dtype=np.int32))
 
+  @test_util.run_deprecated_v1
   def test_raises_if_rank_is_not_scalar_dynamic(self):
     with self.cached_session():
       tensor = constant_op.constant(
@@ -1016,6 +1135,7 @@ class AssertRankTest(test.TestCase):
                                  "must be of type <dtype: 'int32'>"):
       check_ops.assert_rank(tensor, .5)
 
+  @test_util.run_deprecated_v1
   def test_raises_if_rank_is_not_integer_dynamic(self):
     with self.cached_session():
       tensor = constant_op.constant(
@@ -1039,6 +1159,7 @@ class AssertRankInTest(test.TestCase):
           check_ops.assert_rank_in(tensor_rank0, (1, 2), message="fail")]):
         self.evaluate(array_ops.identity(tensor_rank0))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_raises_if_rank_mismatch_dynamic_rank(self):
     with self.cached_session():
       tensor_rank0 = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1055,6 +1176,7 @@ class AssertRankInTest(test.TestCase):
           check_ops.assert_rank_in(tensor_rank0, desired_ranks)]):
         self.evaluate(array_ops.identity(tensor_rank0))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_doesnt_raise_if_rank_matches_dynamic_rank(self):
     with self.cached_session():
       tensor_rank0 = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1071,6 +1193,7 @@ class AssertRankInTest(test.TestCase):
           check_ops.assert_rank_in(tensor_rank1, desired_ranks)]):
         self.evaluate(array_ops.identity(tensor_rank1))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_doesnt_raise_if_rank_matches_dynamic_rank(self):
     with self.cached_session():
       tensor_rank1 = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1089,6 +1212,7 @@ class AssertRankInTest(test.TestCase):
           check_ops.assert_rank_in(tensor_rank1, (0, 2))]):
         self.evaluate(array_ops.identity(tensor_rank1))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_raises_if_rank_mismatches_dynamic_rank(self):
     with self.cached_session():
       tensor_rank1 = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1108,6 +1232,7 @@ class AssertRankInTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
       check_ops.assert_rank_in(tensor, desired_ranks)
 
+  @test_util.run_deprecated_v1
   def test_raises_if_rank_is_not_scalar_dynamic(self):
     with self.cached_session():
       tensor = constant_op.constant(
@@ -1130,6 +1255,7 @@ class AssertRankInTest(test.TestCase):
                                  "must be of type <dtype: 'int32'>"):
       check_ops.assert_rank_in(tensor, (1, .5,))
 
+  @test_util.run_deprecated_v1
   def test_raises_if_rank_is_not_integer_dynamic(self):
     with self.cached_session():
       tensor = constant_op.constant(
@@ -1153,6 +1279,7 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1170,6 +1297,7 @@ class AssertRankAtLeastTest(test.TestCase):
         [check_ops.assert_rank_at_least(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1186,6 +1314,7 @@ class AssertRankAtLeastTest(test.TestCase):
         [check_ops.assert_rank_at_least(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_ten_doesnt_raise_if_rank_too_large_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1202,6 +1331,7 @@ class AssertRankAtLeastTest(test.TestCase):
         [check_ops.assert_rank_at_least(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1219,6 +1349,7 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1232,6 +1363,7 @@ class AssertRankAtLeastTest(test.TestCase):
 class AssertNonNegativeTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_negative(self):
     zoe = constant_op.constant([-1, -2], name="zoe")
     with self.assertRaisesOpError("x >= 0 did not hold"):
@@ -1268,6 +1400,7 @@ class AssertNonPositiveTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_positive(self):
     rachel = constant_op.constant([0, 2], name="rachel")
     with self.assertRaisesOpError("x <= 0 did not hold"):
diff --git a/tensorflow/python/kernel_tests/checkpoint_ops_test.py b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
index 51611b75afb051b2f69abb1749c18b3cbf1f66a0..6e289bf9b780ae2ba16f400cc001ddce59f547b3 100644
--- a/tensorflow/python/kernel_tests/checkpoint_ops_test.py
+++ b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_checkpoint_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
@@ -48,6 +49,7 @@ class GenerateVocabRemappingTest(test.TestCase):
     with open(self.old_vocab_file, 'w') as f:
       f.write('\n'.join(['knitting', 'eminem', 'MISSING']) + '\n')
 
+  @test_util.run_deprecated_v1
   def test_generate_remapping_with_no_vocab_changes(self):
     """Tests where vocab does not change at all."""
     remapping, num_present = gen_checkpoint_ops.generate_vocab_remapping(
@@ -58,8 +60,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = range(0, 3)
     expected_num_present = 3
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
   def test_generate_remapping_with_shifted_vocab(self):
     """Tests where vocab is the same, but shifted / ordered differently."""
@@ -71,8 +73,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = [2, 0, 1]
     expected_num_present = 3
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
   def test_generate_remapping_with_offset(self):
     """Tests offset and num_new_vocab logic."""
@@ -84,8 +86,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = [0]
     expected_num_present = 1
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
   def test_generate_remapping_with_old_vocab_size(self):
     """Tests where old_vocab_size is specified."""
@@ -99,10 +101,11 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = [-1, 0, 1]
     expected_num_present = 2
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
 
+@test_util.run_v1_only('b/120545219')
 class LoadAndRemapMatrixTest(test.TestCase):
   """Tests for the load_and_remap_matrix() op."""
 
@@ -142,7 +145,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_cols=self.old_num_cols)
     with self.cached_session():
       self.assertAllClose(self.matrix_value[row_remapping],
-                          remapped_matrix.eval())
+                          self.evaluate(remapped_matrix))
 
     # No row remapping, new weight matrix has third col, then first col.
     row_remapping = list(range(self.old_num_rows))
@@ -157,7 +160,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_cols=len(col_remapping))
     with self.cached_session():
       self.assertAllClose(self.matrix_value[row_remapping][:, col_remapping],
-                          remapped_matrix.eval())
+                          self.evaluate(remapped_matrix))
 
     # Both row and column remappings.
     row_remapping = [1, 0, 4]
@@ -172,7 +175,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_cols=len(col_remapping))
     with self.cached_session():
       self.assertAllClose(self.matrix_value[row_remapping][:, col_remapping],
-                          remapped_matrix.eval())
+                          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_with_init(self):
     """Tests the op's load and remap where there are missing entries."""
@@ -190,7 +193,8 @@ class LoadAndRemapMatrixTest(test.TestCase):
         [33, init_val, init_val, init_val, 1, init_val], [3, 2])
 
     with self.cached_session():
-      self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval())
+      self.assertAllClose(expected_remapped_matrix,
+                          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_all_missing_rows(self):
     """Tests when all the rows are missing and need to be initialized."""
@@ -207,7 +211,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     with self.cached_session():
       self.assertAllClose(
           np.reshape(initializing_values, (num_rows, self.old_num_cols)),
-          remapped_matrix.eval())
+          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_all_missing_rows_and_cols(self):
     """Tests when all the rows & cols are missing and need to be initialized."""
@@ -225,7 +229,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     with self.cached_session():
       self.assertAllClose(
           np.reshape(initializing_values, (num_rows, num_cols)),
-          remapped_matrix.eval())
+          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_invalid_remapping(self):
     """Tests that errors are raised when an ID maps to multiple new IDs.
@@ -244,7 +248,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=len(invalid_remapping),
         num_cols=self.old_num_cols)
     with self.cached_session(), self.assertRaises(errors.UnimplementedError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
     # Invalid column remapping.
     remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
@@ -256,7 +260,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=self.old_num_rows,
         num_cols=len(invalid_remapping))
     with self.cached_session(), self.assertRaises(errors.UnimplementedError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
   def test_load_and_remap_incorrect_initializing_values(self):
     """Tests that errors are raised with incorrect number of init values."""
@@ -273,7 +277,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=3,
         num_cols=2)
     with self.cached_session(), self.assertRaises(errors.InvalidArgumentError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
     remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
@@ -285,7 +289,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=3,
         num_cols=2)
     with self.cached_session(), self.assertRaises(errors.InvalidArgumentError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
 
 class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
@@ -324,7 +328,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
           num_rows=num_rows,
           num_cols=num_cols,
           max_rows_in_memory=max_rows_in_memory)
-      self.assertAllClose(np_value[::-1], remapped_matrix.eval())
+      self.assertAllClose(np_value[::-1], self.evaluate(remapped_matrix))
 
       # Tests loading the tensor (except for the first and last rows), with
       # uninitialized values. Requires num_rows to be at least 3 since we're
@@ -348,7 +352,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
           np.vstack([
               np.tile(42, [prefix_rows, num_cols]), np_value[1:-1],
               np.tile(42, [suffix_rows, num_cols])
-          ]), remapped_matrix.eval())
+          ]), self.evaluate(remapped_matrix))
 
       # Tests when everything is taken from initializing_values.
       new_rows = 7
@@ -365,8 +369,9 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
           max_rows_in_memory=max_rows_in_memory)
       self.assertAllClose(
           np.reshape(initializing_values, (new_rows, num_cols)),
-          remapped_matrix.eval())
+          self.evaluate(remapped_matrix))
 
+  @test_util.run_deprecated_v1
   def test_loading_rows_divisible_by_max_rows(self):
     """Tests loading normal var when rows are evenly divisible by max_rows."""
     self._test_loading_variable_with_max_rows(
@@ -375,6 +380,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # 9 is evenly divisible by 3.
         max_rows_in_memory=3)
 
+  @test_util.run_deprecated_v1
   def test_loading_rows_not_divisible_by_max_rows(self):
     """Tests loading normal var when rows aren't divisible by max_rows."""
     self._test_loading_variable_with_max_rows(
@@ -383,6 +389,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # 9 is not evenly divisible by 4.
         max_rows_in_memory=4)
 
+  @test_util.run_deprecated_v1
   def test_loading_rows_less_than_max_rows(self):
     """Tests loading normal var as a single slice.
 
@@ -394,6 +401,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # 10 > 9.
         max_rows_in_memory=10)
 
+  @test_util.run_deprecated_v1
   def test_loading_no_max_rows(self):
     """Tests loading normal var as a single slice with no valid max_rows."""
     self._test_loading_variable_with_max_rows(
@@ -401,6 +409,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         partitioner=None,
         max_rows_in_memory=-1)
 
+  @test_util.run_deprecated_v1
   def test_loading_partitions_equals_max_rows(self):
     """Tests loading partitioned var sliced on partition boundary."""
     self._test_loading_variable_with_max_rows(
@@ -410,6 +419,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # exactly 3 rows.
         max_rows_in_memory=3)
 
+  @test_util.run_deprecated_v1
   def test_loading_partitions_greater_than_max_rows(self):
     """Tests loading partitioned var with more slices than partitions."""
     self._test_loading_variable_with_max_rows(
@@ -419,6 +429,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # row at a time.
         max_rows_in_memory=1)
 
+  @test_util.run_deprecated_v1
   def test_loading_partitions_less_than_max_rows(self):
     """Tests loading partitioned var as a single slice.
 
@@ -429,6 +440,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(3),
         max_rows_in_memory=10)
 
+  @test_util.run_deprecated_v1
   def test_loading_partitions_no_max_rows(self):
     """Tests loading partitioned var as single slice with no valid max_rows."""
     self._test_loading_variable_with_max_rows(
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index 782e6b5068bb82397c39f76783392f3e48e6d3c3..a08cfe960d005451ab5a02aff02e90a0fbcb92a0 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
@@ -36,6 +37,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.linalg import linalg
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
@@ -96,7 +98,7 @@ def TriAngInvCompositeGrad(l, grad):
 class CholeskyOpTest(test.TestCase):
 
   def _verifyCholeskyBase(self, sess, x, chol, verification):
-    chol_np, verification_np = sess.run([chol, verification])
+    chol_np, verification_np = self.evaluate([chol, verification])
     self.assertAllClose(x, verification_np)
     self.assertShapeEqual(x, chol)
     # Check that the cholesky is lower triangular, and has positive diagonal
@@ -110,7 +112,7 @@ class CholeskyOpTest(test.TestCase):
 
   def _verifyCholesky(self, x):
     # Verify that LL^T == x.
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       chol = linalg_ops.cholesky(x)
       verification = math_ops.matmul(chol, chol, adjoint_b=True)
       self._verifyCholeskyBase(sess, x, chol, verification)
@@ -144,6 +146,7 @@ class CholeskyOpTest(test.TestCase):
       matrices[i] = np.dot(matrices[i].T.conj(), matrices[i])
     self._verifyCholesky(matrices)
 
+  @test_util.run_deprecated_v1
   def testNonSquareMatrix(self):
     with self.assertRaises(ValueError):
       linalg_ops.cholesky(np.array([[1., 2., 3.], [3., 4., 5.]]))
@@ -152,6 +155,7 @@ class CholeskyOpTest(test.TestCase):
           np.array([[[1., 2., 3.], [3., 4., 5.]], [[1., 2., 3.], [3., 4., 5.]]
                    ]))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     tensor3 = constant_op.constant([1., 2.])
     with self.assertRaises(ValueError):
@@ -161,7 +165,7 @@ class CholeskyOpTest(test.TestCase):
 
   def testNotInvertibleCPU(self):
     # The input should be invertible.
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError,
           "Cholesky decomposition was not successful. The"
@@ -174,16 +178,17 @@ class CholeskyOpTest(test.TestCase):
     self._verifyCholesky(np.empty([0, 2, 2]))
     self._verifyCholesky(np.empty([2, 0, 0]))
 
+  @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       matrix1 = random_ops.random_normal([5, 5], seed=42)
       matrix2 = random_ops.random_normal([5, 5], seed=42)
       matrix1 = math_ops.matmul(matrix1, matrix1, adjoint_a=True)
       matrix2 = math_ops.matmul(matrix2, matrix2, adjoint_a=True)
       c1 = linalg_ops.cholesky(matrix1)
       c2 = linalg_ops.cholesky(matrix2)
-      c1_val, c2_val = sess.run([c1, c2])
-      self.assertAllEqual(c1_val, c2_val)
+      c1_val, c2_val = self.evaluate([c1, c2])
+      self.assertAllClose(c1_val, c2_val)
 
 
 class CholeskyGradTest(test.TestCase):
@@ -192,18 +197,21 @@ class CholeskyGradTest(test.TestCase):
   def getShapes(self, shapeList):
     return ((elem, int(np.floor(1.2 * elem))) for elem in shapeList)
 
+  @test_util.run_deprecated_v1
   def testSmallMatrices(self):
     np.random.seed(0)
     shapes = self.getShapes([1, 2, 10])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.float32, dtypes_lib.float64))
 
+  @test_util.run_deprecated_v1
   def testSmallMatricesComplex(self):
     np.random.seed(0)
     shapes = self.getShapes([1, 2, 10])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.complex64, dtypes_lib.complex128))
 
+  @test_util.run_deprecated_v1
   def testOneBlockMatrices(self):
     np.random.seed(0)
     shapes = self.getShapes([self._backprop_block_size + 1])
@@ -212,24 +220,28 @@ class CholeskyGradTest(test.TestCase):
         dtypes=(dtypes_lib.float32, dtypes_lib.float64),
         scalarTest=True)
 
+  @test_util.run_deprecated_v1
   def testTwoBlockMatrixFloat(self):
     np.random.seed(0)
     shapes = self.getShapes([2 * self._backprop_block_size + 1])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.float32,), scalarTest=True)
 
+  @test_util.run_deprecated_v1
   def testTwoBlockMatrixDouble(self):
     np.random.seed(0)
     shapes = self.getShapes([2 * self._backprop_block_size + 1])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.float64,), scalarTest=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testTwoBlockMatrixComplexFloat(self):
     np.random.seed(0)
     shapes = self.getShapes([2 * self._backprop_block_size + 1])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.complex64,), scalarTest=True)
 
+  @test_util.run_deprecated_v1
   def testTwoBlockMatrixComplexDouble(self):
     np.random.seed(0)
     shapes = self.getShapes([2 * self._backprop_block_size + 1])
@@ -242,7 +254,7 @@ class CholeskyGradTest(test.TestCase):
     data = np.matmul(data, data.T)
     grad_data = np.random.randn(*data.shape).astype(np.float32)
 
-    with ops.Graph().as_default(), self.test_session(use_gpu=False) as s:
+    with ops.Graph().as_default(), self.session(use_gpu=False) as s:
       x = constant_op.constant(data, dtypes_lib.float32)
       chol = linalg_ops.cholesky(x)
       composite_grad = gradients_impl.gradients(chol, x, grad_data)[0]
@@ -255,7 +267,7 @@ class CholeskyGradTest(test.TestCase):
                            dtypes=(dtypes_lib.float32, dtypes_lib.float64,
                                    dtypes_lib.complex64, dtypes_lib.complex128),
                            scalarTest=False):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for shape in shapes:
         for batch in False, True:
           for dtype in dtypes:
@@ -327,7 +339,7 @@ class CholeskyBenchmark(test.Benchmark):
   def benchmarkCholeskyOp(self):
     for shape in self.shapes:
       with ops.Graph().as_default(), \
-          session.Session() as sess, \
+          session.Session(config=benchmark.benchmark_config()) as sess, \
           ops.device("/cpu:0"):
         matrix = variables.Variable(self._GenerateMatrix(shape))
         l = linalg_ops.cholesky(matrix)
@@ -341,7 +353,7 @@ class CholeskyBenchmark(test.Benchmark):
 
       if test.is_gpu_available(True):
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/device:GPU:0"):
           matrix = variables.Variable(self._GenerateMatrix(shape))
           l = linalg_ops.cholesky(matrix)
@@ -359,7 +371,7 @@ class CholeskyBenchmark(test.Benchmark):
       for shape in self.shapes:
         matrix = self._GenerateMatrix(shape)
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device(device):
           l = variables.Variable(np.linalg.cholesky(matrix))
           grad_matrix = variables.Variable(
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index bb7b645da266d048bac038e7625b16dc05939c11..45f1e6152a2a335a83dec1f385354df123a192bf 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -24,10 +24,12 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -50,12 +52,12 @@ class ClipTest(test.TestCase):
 
   # ClipByValue test
   def testClipByValue(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
       np_ans = [[-4.4, 2.0, 3.0], [4.0, 4.4, 4.4]]
       clip_value = 4.4
       ans = clip_ops.clip_by_value(x, -clip_value, clip_value)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -65,13 +67,13 @@ class ClipTest(test.TestCase):
         dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
         dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
     ]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
         np_ans = [[2, 2, 3], [4, 4, 4]]
         clip_value_min = 2
         clip_value_max = 4
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -81,14 +83,14 @@ class ClipTest(test.TestCase):
         dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
         dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
     ]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
         np_ans = [[2, 2, 3], [4, 4, 4]]
         clip_value_min = constant_op.constant(
             [2, 2, 2, 3, 3, 3], shape=[2, 3], dtype=dtype)
         clip_value_max = 4
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -98,14 +100,14 @@ class ClipTest(test.TestCase):
         dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
         dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
     ]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
         np_ans = [[4, 4, 4], [4, 5, 6]]
         clip_value_min = 4
         clip_value_max = constant_op.constant(
             [6, 6, 6, 6, 6, 6], shape=[2, 3], dtype=dtype)
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -115,7 +117,7 @@ class ClipTest(test.TestCase):
         dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int8,
         dtypes.int16, dtypes.int32, dtypes.int64, dtypes.uint8, dtypes.uint16
     ]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
         np_ans = [[2, 2, 3], [5, 5, 6]]
         clip_value_min = constant_op.constant(
@@ -123,12 +125,12 @@ class ClipTest(test.TestCase):
         clip_value_max = constant_op.constant(
             [5, 5, 5, 7, 7, 7], shape=[2, 3], dtype=dtype)
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
   def testClipByValueBadShape(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -144,36 +146,37 @@ class ClipTest(test.TestCase):
       np_ans = [float('NaN'), 4.0, -4.0]
       clip_value = 4.0
       ans = clip_ops.clip_by_value(x, -clip_value, clip_value)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   # ClipByNorm tests
   def testClipByNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
       clip_norm = 4.0
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans_tensor = ans.eval()
+      tf_ans_tensor = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
     self.assertAllClose(np_ans, tf_ans_tensor)
 
+  @test_util.run_deprecated_v1
   def testClipByNormGradientZeros(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = array_ops.zeros([3])
       b = clip_ops.clip_by_norm(x, 1.)
       grad, = gradients_impl.gradients(b, x)
       self.assertAllEqual(grad.eval(), [1., 1., 1.])
 
   def testClipByNormBadShape(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -182,68 +185,69 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
       clip_norm = 6.0
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   def testClipByNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
       clip_norm = 6.0
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   def testClipByNormClippedWithDim0(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[:, 0] = sqrt(3^2 + 4^2) = 5, x[:, 2] = 3
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
       clip_norm = 4.0
       ans = clip_ops.clip_by_norm(x, clip_norm, [0])
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   def testClipByNormClippedWithDim1(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
       clip_norm = 4.0
       ans = clip_ops.clip_by_norm(x, clip_norm, [1])
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   def testClipByNormNotClippedWithAxes(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
       clip_norm = 6.0
       ans = clip_ops.clip_by_norm(x, clip_norm, [1])
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   # ClipByGlobalNorm tests
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -256,15 +260,16 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm((x0, x1), clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormClippedTensor(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -277,15 +282,16 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm((x0, x1), clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormSupportsNone(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -300,15 +306,16 @@ class ClipTest(test.TestCase):
       self.assertTrue(ans[3] is None)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[2].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormWithIndexedSlicesClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = ops.IndexedSlices(
           constant_op.constant([1.0, -2.0]), constant_op.constant([3, 4]))
@@ -322,7 +329,7 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].values.eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
@@ -339,9 +346,10 @@ class ClipTest(test.TestCase):
     self.assertEqual(dense_shape, slices.dense_shape)
     self.assertEqual(dense_shape, modified_slices.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -352,15 +360,16 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x0 = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([0.0, 0.0])
       # Norm = 0, no changes
@@ -371,14 +380,15 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 0.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormInf(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, np.inf, 4.0, 0.0, 0.0],
                                 shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
@@ -386,7 +396,7 @@ class ClipTest(test.TestCase):
 
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
-        norm.eval()
+        self.evaluate(norm)
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
         ans[0].eval()
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
@@ -394,52 +404,68 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClipped(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
       clip_norm = 0.8
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   def testClipByAverageNormClippedTensor(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
       clip_norm = constant_op.constant(0.8)
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   def testClipByAverageNormNotClipped(self):
     # No norm clipping when average clip_norm >= 0.83333333
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
       clip_norm = 0.9
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   def testClipByAverageNormZero(self):
     # No norm clipping when average clip_norm = 0
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Average norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
       clip_norm = 0.9
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
+  def testClipByAverageNormReplacedWithClipByNorm(self):
+    # Check clip_by_average_norm(t) is the same as
+    # clip_by_norm(t, clip_norm * tf.to_float(tf.size(t)))
+    with self.session(use_gpu=True):
+      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
+      # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
+      # expected answer [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
+      clip_norm = constant_op.constant(0.8)
+      with_norm = clip_ops.clip_by_average_norm(x, clip_norm)
+      without_norm = clip_ops.clip_by_norm(
+          x, clip_norm * math_ops.to_float(array_ops.size(x)))
+      clip_by_average_norm_ans = self.evaluate(with_norm)
+      clip_by_norm_ans = self.evaluate(without_norm)
+      self.assertAllClose(clip_by_average_norm_ans, clip_by_norm_ans)
+
+  @test_util.run_deprecated_v1
   def testClipByValueEmptyTensor(self):
     # Test case for GitHub issue 19337
     zero = array_ops.placeholder(dtype=dtypes.float32, shape=None)
@@ -447,7 +473,7 @@ class ClipTest(test.TestCase):
     y = clip_ops.clip_by_value(zero, 1.0, 1.0)
     z = clip_ops.clip_by_value(zero, zero, 1.0)
     w = clip_ops.clip_by_value(zero, 1.0, zero)
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       sess.run([x, y, z, w], feed_dict={zero: np.zeros((7, 0))})
 
 
diff --git a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
index 56ddd6e42826e4055ee163e154489bfa7a92dbfa..215ea97f36d5fc72581f1ad96e7e68166e12e08c 100644
--- a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
+++ b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -30,15 +31,15 @@ class CompareAndBitpackTest(test.TestCase):
                              x, threshold,
                              truth,
                              expected_err_re=None):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       ans = math_ops.compare_and_bitpack(x, threshold)
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         self.assertShapeEqual(truth, ans)
         self.assertAllEqual(tf_ans, truth)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def _testBasic(self, dtype):
     rows = 371
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index 0e59ce697268abf7bf9fd0e18ce8c59f3ee839a3..474760a93ff84be698388a7784f66445c21cd8ca 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gradient_checker
@@ -34,8 +35,9 @@ from tensorflow.python.platform import test
 
 class ConcatOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testHStack(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       p2 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       c = array_ops.concat([p1, p2], 0)
@@ -49,8 +51,9 @@ class ConcatOpTest(test.TestCase):
     self.assertAllEqual(result[:4, :], params[p1])
     self.assertAllEqual(result[4:, :], params[p2])
 
+  @test_util.run_deprecated_v1
   def testVStack(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       p2 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
       c = array_ops.concat([p1, p2], 1)
@@ -65,25 +68,25 @@ class ConcatOpTest(test.TestCase):
     self.assertAllEqual(result[:, 4:], params[p2])
 
   def testInt32GPU(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       p1 = np.random.rand(2, 3).astype("i")
       p2 = np.random.rand(2, 3).astype("i")
       x1 = constant_op.constant(p1)
       x2 = constant_op.constant(p2)
       c = array_ops.concat([x1, x2], 0)
-      result = c.eval()
+      result = self.evaluate(c)
     self.assertAllEqual(result[:2, :], p1)
     self.assertAllEqual(result[2:, :], p2)
 
   def testRefType(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       p1 = np.random.rand(4, 4).astype("f")
       p2 = np.random.rand(4, 4).astype("f")
       v1 = variables.Variable(p1)
       v2 = variables.Variable(p2)
       c = array_ops.concat([v1, v2], 0)
-      variables.global_variables_initializer().run()
-      result = c.eval()
+      self.evaluate(variables.global_variables_initializer())
+      result = self.evaluate(c)
 
     self.assertEqual(result.shape, c.get_shape())
     self.assertAllEqual(result[:4, :], p1)
@@ -101,7 +104,7 @@ class ConcatOpTest(test.TestCase):
       dtype_feed = dtypes.float32
     else:
       dtype_feed = dtype
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       p = []
       for i in np.arange(num_tensors):
         input_shape = shape
@@ -137,6 +140,7 @@ class ConcatOpTest(test.TestCase):
       else:
         self.assertAllClose(result[ind], params[p[i]], 0.01)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     self._testRandom(dtypes.bool)
     self._testRandom(dtypes.float32)
@@ -147,6 +151,7 @@ class ConcatOpTest(test.TestCase):
     self._testRandom(dtypes.complex64)
     self._testRandom(dtypes.complex128)
 
+  @test_util.run_deprecated_v1
   def testInvalidConcatDimTypeAndShape(self):
     a = variables.Variable(constant_op.constant(1.0, shape=[1]))
     b = variables.Variable(constant_op.constant(2.0, shape=[1]))
@@ -172,7 +177,7 @@ class ConcatOpTest(test.TestCase):
     # Test both positive and negative concat axis.
     # -2 and 1 correspond to the same axis for 3-dimensional tensors.
     for axis in [-2, 1]:
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         inp = []
         inp_tensors = []
         for x in [1, 2, 6]:
@@ -195,15 +200,17 @@ class ConcatOpTest(test.TestCase):
             grad_inp.flatten(), shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, axis)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
     self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testGradientsSimple(self):
     self._testGradientsSimple(dtypes.float32)
     self._testGradientsSimple(dtypes.complex64)
 
+  @test_util.run_deprecated_v1
   def testGradientsFirstDim(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       inp = []
       inp_tensors = []
       for x in [1, 2, 6]:
@@ -222,15 +229,16 @@ class ConcatOpTest(test.TestCase):
           grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.concat(grad, 0)
-      result = concated_grad.eval()
+      result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testGradientsLastDim(self):
     # Test both positive and negative concat axis.
     # -1 and 2 correspond to the same axis for 3-dimensional tensors.
     for axis in [-1, 2]:
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         inp = []
         inp_tensors = []
         for x in [1, 2, 6]:
@@ -249,7 +257,7 @@ class ConcatOpTest(test.TestCase):
             grad_inp.flatten(), shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, axis)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -261,7 +269,7 @@ class ConcatOpTest(test.TestCase):
     # Random dim to concat on
     concat_dim = np.random.randint(5)
     concat_dim_sizes = np.random.randint(1, 5, size=num_tensors)
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       inp = []
       inp_tensors = []
       for x in concat_dim_sizes:
@@ -279,16 +287,18 @@ class ConcatOpTest(test.TestCase):
       grad_tensor = constant_op.constant(grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.concat(grad, concat_dim)
-      result = concated_grad.eval()
+      result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testGradientsRandom(self):
     for _ in range(5):
       self._RunAndVerifyGradientsRandom()
 
+  @test_util.run_deprecated_v1
   def testGradientWithUnknownInputDim(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.float32)
       c = array_ops.concat([x, y], 2)
@@ -308,6 +318,7 @@ class ConcatOpTest(test.TestCase):
 
       self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testShapeError(self):
     # Rank doesn't match.
     with self.assertRaises(ValueError):
@@ -337,6 +348,7 @@ class ConcatOpTest(test.TestCase):
            constant_op.constant(20.0, shape=[4, 4, 4])
           ], -4)
 
+  @test_util.run_deprecated_v1
   def testShapeWithUnknownConcatDim(self):
     p1 = array_ops.placeholder(dtypes.float32)
     c1 = constant_op.constant(10.0, shape=[4, 4, 4, 4])
@@ -355,10 +367,11 @@ class ConcatOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       array_ops.concat([p1, c1, p2, c3], dim)
 
+  @test_util.run_deprecated_v1
   def testZeroSize(self):
     # Verify that concat doesn't crash and burn for zero size inputs
     np.random.seed(7)
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       for shape0 in (), (2,):
         axis = len(shape0)
         for shape1 in (), (3,):
@@ -370,12 +383,13 @@ class ConcatOpTest(test.TestCase):
               # TODO(irving): Make tf.concat handle map, then drop list().
               xs = list(map(constant_op.constant, [x0, x1]))
               c = array_ops.concat(xs, axis)
-              self.assertAllEqual(c.eval(), correct)
+              self.assertAllEqual(self.evaluate(c), correct)
               # Check gradients
               dc = np.random.randn(*c.get_shape().as_list())
-              dxs = sess.run(gradients_impl.gradients(c, xs, dc))
+              dxs = self.evaluate(gradients_impl.gradients(c, xs, dc))
               self.assertAllEqual(dc, np.concatenate(dxs, axis=axis))
 
+  @test_util.run_deprecated_v1
   def testTensorConcatDim0Grad(self):
     x_shapes = [[20, 7, 3], [10, 7, 3], [14, 7, 3]]
     output_shape = [44, 7, 3]
@@ -390,6 +404,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testTensorConcatDim1Grad(self):
     x_shapes = [[20, 7, 3], [20, 3, 3], [20, 1, 3]]
     output_shape = [20, 11, 3]
@@ -404,6 +419,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesConcatDim0Grad(self):
     x_shapes = [[20, 7, 3], [10, 7, 3], [14, 7, 3]]
     output_shape = [4, 7, 3]
@@ -419,6 +435,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesConcatDim1Grad(self):
     x_shapes = [[20, 7, 3], [20, 3, 3], [20, 1, 3]]
     output_shape = [4, 11, 3]
@@ -434,6 +451,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesConcatDim2Grad(self):
     x_shapes = [[20, 7, 3], [20, 7, 1], [20, 7, 2]]
     output_shape = [4, 7, 6]
@@ -449,6 +467,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesConcatDim1Grad_UnknownInputDim(self):
     x_shapes = [[20, 7, 3], [20, 3, 3], [20, 1, 3]]
     output_shape = [4, 11, 3]
@@ -473,23 +492,24 @@ class ConcatOpTest(test.TestCase):
   def testConcatTuple(self):
     c1 = np.random.rand(4, 4)
     c2 = np.random.rand(4, 4)
-    with self.cached_session():
-      concat_list_t = array_ops.concat([c1, c2], 0)
-      concat_tuple_t = array_ops.concat((c1, c2), 0)
-      self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval())
+    concat_list_t = array_ops.concat([c1, c2], 0)
+    concat_tuple_t = array_ops.concat((c1, c2), 0)
+    self.assertAllEqual(
+        self.evaluate(concat_list_t), self.evaluate(concat_tuple_t))
 
+  @test_util.run_deprecated_v1
   def testConcatNoScalars(self):
-    with self.cached_session():
-      scalar = constant_op.constant(7)
-      dim = array_ops.placeholder(dtypes.int32)
-      with self.assertRaisesRegexp(
-          ValueError, r"Can't concatenate scalars \(use tf\.stack instead\)"):
-        array_ops.concat([scalar, scalar, scalar], dim)
+    scalar = constant_op.constant(7)
+    dim = array_ops.placeholder(dtypes.int32)
+    with self.assertRaisesRegexp(
+        ValueError, r"Can't concatenate scalars \(use tf\.stack instead\)"):
+      array_ops.concat([scalar, scalar, scalar], dim)
 
   # important as gpu implementation could fail if
   # shared memory is not large for all the inputs
+  @test_util.run_deprecated_v1
   def testConcatLargeNumberOfTensors(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for concat_dim in range(2):
         params = {}
         p = []
@@ -523,33 +543,34 @@ class ConcatOpTest(test.TestCase):
           self.assertAllEqual(result[index], params[p[i]])
 
   def testConcatEmpty(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       t1 = []
       t2 = []
-      output = gen_array_ops.concat_v2([t1, t2], 0).eval()
-      self.assertFalse(output)  # Checks that output is empty
+      output = gen_array_ops.concat_v2([t1, t2], 0)
+      self.assertFalse(self.evaluate(output))  # Checks that output is empty
 
+  @test_util.run_deprecated_v1
   def testConcatInvalidAxis(self):
     with self.assertRaises(ValueError):
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         t1 = [1]
         t2 = [2]
         gen_array_ops.concat_v2([t1, t2], 1).eval()
 
   def testConcatNegativeAxis(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       t1 = [[1, 2, 3], [4, 5, 6]]
       t2 = [[7, 8, 9], [10, 11, 12]]
 
       c = gen_array_ops.concat_v2([t1, t2], -2)
       self.assertEqual([4, 3], c.get_shape().as_list())
-      output = c.eval()
+      output = self.evaluate(c)
       self.assertAllEqual([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
                           output)
 
       c = gen_array_ops.concat_v2([t1, t2], -1)
       self.assertEqual([2, 6], c.get_shape().as_list())
-      output = c.eval()
+      output = self.evaluate(c)
       self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
   def _testGradientsForAxis(
@@ -578,6 +599,7 @@ class ConcatOpTest(test.TestCase):
       result = concated_grad.eval(feed_dict=feed_dict)
       self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testGradientsNegativeAxis(self):
     x1 = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
     x2 = [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]
@@ -608,78 +630,78 @@ class ConcatOpTest(test.TestCase):
 
   def testConcatAxisType(self):
     for dtype in [dtypes.int32, dtypes.int64]:
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         t1 = [[1, 2, 3], [4, 5, 6]]
         t2 = [[7, 8, 9], [10, 11, 12]]
 
         c = gen_array_ops.concat_v2([t1, t2],
                                     constant_op.constant(1, dtype=dtype))
         self.assertEqual([2, 6], c.get_shape().as_list())
-        output = c.eval()
+        output = self.evaluate(c)
         self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
 class ConcatOffsetTest(test.TestCase):
 
   def testBasic(self):
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       cdim = constant_op.constant(1, dtypes.int32)
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([2, 7, 5], dtypes.int32)
       s2 = constant_op.constant([2, 20, 5], dtypes.int32)
       off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-      ans = sess.run(off)
+      ans = self.evaluate(off)
       self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
+  @test_util.run_deprecated_v1
   def testNotVector(self):
-    with self.cached_session() as sess:
-      cdim = constant_op.constant(1, dtypes.int32)
-      s0 = constant_op.constant([[2, 3, 5]], dtypes.int32)
-      s1 = constant_op.constant([[2, 7, 5]], dtypes.int32)
-      off = gen_array_ops.concat_offset(cdim, [s0, s1])
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   r"should be a vector"):
-        sess.run(off)
-
+    cdim = constant_op.constant(1, dtypes.int32)
+    s0 = constant_op.constant([[2, 3, 5]], dtypes.int32)
+    s1 = constant_op.constant([[2, 7, 5]], dtypes.int32)
+    off = gen_array_ops.concat_offset(cdim, [s0, s1])
+    with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                 r"should be a vector"):
+      self.evaluate(off)
+
+  @test_util.run_deprecated_v1
   def testConcatDimOutOfRange(self):
-    with self.cached_session() as sess:
-      cdim = constant_op.constant(4, dtypes.int32)
-      s0 = constant_op.constant([2, 3, 5], dtypes.int32)
-      s1 = constant_op.constant([2, 7, 5], dtypes.int32)
-      off = gen_array_ops.concat_offset(cdim, [s0, s1])
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   r"Concat dim is out of range: 4 vs. 3"):
-        sess.run(off)
-
+    cdim = constant_op.constant(4, dtypes.int32)
+    s0 = constant_op.constant([2, 3, 5], dtypes.int32)
+    s1 = constant_op.constant([2, 7, 5], dtypes.int32)
+    off = gen_array_ops.concat_offset(cdim, [s0, s1])
+    with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                 r"Concat dim is out of range: 4 vs. 3"):
+      self.evaluate(off)
+
+  @test_util.run_deprecated_v1
   def testDimMismatch(self):
-    with self.cached_session() as sess:
-      cdim = constant_op.constant(1, dtypes.int32)
-      s0 = constant_op.constant([2, 3, 5], dtypes.int32)
-      s1 = constant_op.constant([2, 7, 5, 10], dtypes.int32)
-      off = gen_array_ops.concat_offset(cdim, [s0, s1])
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   r"should contain 3 elem"):
-        sess.run(off)
-
+    cdim = constant_op.constant(1, dtypes.int32)
+    s0 = constant_op.constant([2, 3, 5], dtypes.int32)
+    s1 = constant_op.constant([2, 7, 5, 10], dtypes.int32)
+    off = gen_array_ops.concat_offset(cdim, [s0, s1])
+    with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                 r"should contain 3 elem"):
+      self.evaluate(off)
+
+  @test_util.run_deprecated_v1
   def testSizeMismatch(self):
-    with self.cached_session() as sess:
-      cdim = constant_op.constant(1, dtypes.int32)
-      s0 = constant_op.constant([2, 3, 5], dtypes.int32)
-      s1 = constant_op.constant([2, 7, 10], dtypes.int32)
-      off = gen_array_ops.concat_offset(cdim, [s0, s1])
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          r"All dimensions except 1 must match. Input 1 has shape \[2 7 10\] "
-          r"and doesn't match input 0 with shape \[2 3 5\]."):
-        sess.run(off)
+    cdim = constant_op.constant(1, dtypes.int32)
+    s0 = constant_op.constant([2, 3, 5], dtypes.int32)
+    s1 = constant_op.constant([2, 7, 10], dtypes.int32)
+    off = gen_array_ops.concat_offset(cdim, [s0, s1])
+    with self.assertRaisesRegexp(
+        errors_impl.InvalidArgumentError,
+        r"All dimensions except 1 must match. Input 1 has shape \[2 7 10\] "
+        r"and doesn't match input 0 with shape \[2 3 5\]."):
+      self.evaluate(off)
 
   def testNegativeDim(self):
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       cdim = constant_op.constant(-2, dtypes.int32)
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([2, 7, 5], dtypes.int32)
       s2 = constant_op.constant([2, 20, 5], dtypes.int32)
       off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-      ans = sess.run(off)
+      ans = self.evaluate(off)
       self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
       cdim = constant_op.constant(-3, dtypes.int32)
@@ -687,7 +709,7 @@ class ConcatOffsetTest(test.TestCase):
       s1 = constant_op.constant([1, 3, 5], dtypes.int32)
       s2 = constant_op.constant([3, 3, 5], dtypes.int32)
       off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-      ans = sess.run(off)
+      ans = self.evaluate(off)
       self.assertAllEqual(ans, [[0, 0, 0], [2, 0, 0], [3, 0, 0]])
 
 
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 377c0416755726b6d9de659484e30309328daa30..8fe3ba41e27aa101fd4f2e3b41b0a0b226471047 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -20,16 +20,20 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import cond_v2
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
@@ -64,6 +68,7 @@ class CondV2Test(test.TestCase):
       self.assertEqual(expected_val, actual_val)
       self.assertEqual(expected_grad_val, actual_grad_val)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
@@ -78,6 +83,7 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testMultipleOutputs(self):
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(3.0, name="y")
@@ -92,6 +98,7 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testBasic2(self):
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
@@ -106,6 +113,7 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testNoInputs(self):
     with self.cached_session() as sess:
       pred = array_ops.placeholder(dtypes.bool, name="pred")
@@ -122,6 +130,7 @@ class CondV2Test(test.TestCase):
       self.assertEqual(sess.run(out, {pred: False}), (2.0,))
 
   def _createCond(self, name):
+    """Creates a cond_v2 call and returns the output tensor and the cond op."""
     pred = constant_op.constant(True, name="pred")
     x = constant_op.constant(1.0, name="x")
 
@@ -131,27 +140,37 @@ class CondV2Test(test.TestCase):
     def false_fn():
       return x + 1
 
-    return cond_v2.cond_v2(pred, true_fn, false_fn, name=name).op
+    output = cond_v2.cond_v2(pred, true_fn, false_fn, name=name)
+    cond_op = output.op.inputs[0].op
+    self.assertEqual(cond_op.type, "If")
+    return output, cond_op
 
   def testDefaultName(self):
     with ops.Graph().as_default():
-      cond = self._createCond(None)
-      self.assertEqual(cond.name, "cond")
-      self.assertIn("cond_true", ops.get_default_graph()._functions)
-      self.assertIn("cond_false", ops.get_default_graph()._functions)
+      _, cond_op = self._createCond(None)
+      self.assertEqual(cond_op.name, "cond")
+      self.assertRegexpMatches(
+          cond_op.get_attr("then_branch").name, r"cond_true_\d*")
+      self.assertRegexpMatches(
+          cond_op.get_attr("else_branch").name, r"cond_false_\d*")
 
     with ops.Graph().as_default():
       with ops.name_scope("foo"):
-        cond = self._createCond("")
-        self.assertEqual(cond.name, "foo/cond")
-        self.assertIn("foo_cond_true", ops.get_default_graph()._functions)
-        self.assertIn("foo_cond_false", ops.get_default_graph()._functions)
-
-        cond2 = self._createCond(None)
-        self.assertEqual(cond2.name, "foo/cond_1")
-        self.assertIn("foo_cond_1_true", ops.get_default_graph()._functions)
-        self.assertIn("foo_cond_1_false", ops.get_default_graph()._functions)
-
+        _, cond1_op = self._createCond("")
+        self.assertEqual(cond1_op.name, "foo/cond")
+        self.assertRegexpMatches(
+            cond1_op.get_attr("then_branch").name, r"foo_cond_true_\d*")
+        self.assertRegexpMatches(
+            cond1_op.get_attr("else_branch").name, r"foo_cond_false_\d*")
+
+        _, cond2_op = self._createCond(None)
+        self.assertEqual(cond2_op.name, "foo/cond_1")
+        self.assertRegexpMatches(
+            cond2_op.get_attr("then_branch").name, r"foo_cond_1_true_\d*")
+        self.assertRegexpMatches(
+            cond2_op.get_attr("else_branch").name, r"foo_cond_1_false_\d*")
+
+  @test_util.run_v1_only("b/120545219")
   def testDefunInCond(self):
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
@@ -171,9 +190,8 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testNestedDefunInCond(self):
-    self.skipTest("b/110550782")
-
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
@@ -197,9 +215,8 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testDoubleNestedDefunInCond(self):
-    self.skipTest("b/110550782")
-
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
@@ -468,7 +485,6 @@ class CondV2Test(test.TestCase):
             }), [5., 0.])
 
   def testBuildCondAndGradientInsideDefun(self):
-    self.skipTest("b/110550782")
 
     def build_graph():
       pred_outer = array_ops.placeholder(dtypes.bool, name="pred_outer")
@@ -502,30 +518,31 @@ class CondV2Test(test.TestCase):
 
       return grads, pred_outer, pred_inner
 
-    with ops.Graph().as_default():
+    with ops.Graph().as_default(), self.session(
+        graph=ops.get_default_graph()) as sess:
       grads, pred_outer, pred_inner = build_graph()
-      with self.session(graph=ops.get_default_graph()) as sess:
-        self.assertSequenceEqual(
-            sess.run(grads, {
-                pred_outer: True,
-                pred_inner: True
-            }), [0., 0.])
-        self.assertSequenceEqual(
-            sess.run(grads, {
-                pred_outer: True,
-                pred_inner: False
-            }), [0., 0.])
-        self.assertSequenceEqual(
-            sess.run(grads, {
-                pred_outer: False,
-                pred_inner: True
-            }), [4., 2.])
-        self.assertSequenceEqual(
-            sess.run(grads, {
-                pred_outer: False,
-                pred_inner: False
-            }), [5., 0.])
-
+      self.assertSequenceEqual(
+          sess.run(grads, {
+              pred_outer: True,
+              pred_inner: True
+          }), [0., 0.])
+      self.assertSequenceEqual(
+          sess.run(grads, {
+              pred_outer: True,
+              pred_inner: False
+          }), [0., 0.])
+      self.assertSequenceEqual(
+          sess.run(grads, {
+              pred_outer: False,
+              pred_inner: True
+          }), [4., 2.])
+      self.assertSequenceEqual(
+          sess.run(grads, {
+              pred_outer: False,
+              pred_inner: False
+          }), [5., 0.])
+
+  @test_util.run_deprecated_v1
   def testSecondDerivative(self):
     with self.cached_session() as sess:
       pred = array_ops.placeholder(dtypes.bool, name="pred")
@@ -598,11 +615,11 @@ class CondV2Test(test.TestCase):
   def testLowering(self):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
-        out_cond = self._createCond("cond")
+        cond_output, _ = self._createCond("cond")
 
         run_options = config_pb2.RunOptions(output_partition_graphs=True)
         run_metadata = config_pb2.RunMetadata()
-        sess.run(out_cond, options=run_options, run_metadata=run_metadata)
+        sess.run(cond_output, options=run_options, run_metadata=run_metadata)
 
         # If lowering was enabled, there should be a `Switch` node
         switch_found = any(
@@ -622,17 +639,18 @@ class CondV2Test(test.TestCase):
         self.assertFalse(if_found,
                          "An `If` op was found, but it should be lowered.")
 
+  @test_util.run_deprecated_v1
   def testLoweringDisabledInXLA(self):
     with self.session(graph=ops.Graph()) as sess:
       # Build the cond_v2 in an XLA context
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
-      out_cond = self._createCond("cond")
+      cond_output, _ = self._createCond("cond")
       xla_context.Exit()
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       run_metadata = config_pb2.RunMetadata()
-      sess.run(out_cond, options=run_options, run_metadata=run_metadata)
+      sess.run(cond_output, options=run_options, run_metadata=run_metadata)
 
       # Lowering disabled in XLA, there should be no `Switch` node
       switch_found = any(
@@ -654,6 +672,130 @@ class CondV2Test(test.TestCase):
           if_found,
           "An `If` op was not found, but the graph should not be lowered.")
 
+  @test_util.run_deprecated_v1
+  def testLoweringDisabledWithSingleThreadedExecutorContext(self):
+    with self.session(graph=ops.Graph()) as sess:
+      @function.defun
+      def _add_cond(x):
+        return cond_v2.cond_v2(
+            constant_op.constant(True, name="pred"),
+            lambda: x,
+            lambda: x + 1)
+
+      x = array_ops.placeholder(shape=None, dtype=dtypes.float32)
+      with context.function_executor_type("SINGLE_THREADED_EXECUTOR"):
+        out_cond = _add_cond(x)
+
+      # The fact that sess.run() succeeds means lowering is disabled, because
+      # the single threaded executor does not support cond v1 ops.
+      sess.run(out_cond, feed_dict={x: 1.0})
+
+  @test_util.enable_control_flow_v2
+  def testStructuredOutputs(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(3.0, name="y")
+
+    def true_fn():
+      return ((x * y,), y)
+
+    def false_fn():
+      return ((x,), y * 3.0)
+
+    output = control_flow_ops.cond(
+        constant_op.constant(False), true_fn, false_fn)
+    self.assertEqual(self.evaluate(output[0][0]), 1.)
+    self.assertEqual(self.evaluate(output[1]), 9.)
+
+  @test_util.enable_control_flow_v2
+  @test_util.run_deprecated_v1
+  def testRaisesOutputStructuresMismatch(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(3.0, name="y")
+
+    def true_fn():
+      return x * y, y
+
+    def false_fn():
+      return ((x,), y * 3.0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Outputs of true_fn and false_fn must"
+        " have the same structure"):
+      control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
+
+  @test_util.enable_control_flow_v2
+  def testCondAndTensorArray(self):
+    x = math_ops.range(-5, 5)
+    output = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=x.shape[0])
+
+    def loop_body(i, output):
+
+      def if_true():
+        return output.write(i, x[i]**2)
+
+      def if_false():
+        return output.write(i, x[i])
+
+      output = control_flow_ops.cond(x[i] > 0, if_true, if_false)
+      return i + 1, output
+
+    _, output = control_flow_ops.while_loop(
+        lambda i, arr: i < x.shape[0],
+        loop_body,
+        loop_vars=(constant_op.constant(0), output))
+    output_t = output.stack()
+    self.assertAllEqual(
+        self.evaluate(output_t), [-5, -4, -3, -2, -1, 0, 1, 4, 9, 16])
+
+  @test_util.enable_control_flow_v2
+  def testCondAndTensorArrayInDefun(self):
+
+    @function.defun
+    def f():
+      x = math_ops.range(-5, 5)
+      output = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=x.shape[0])
+
+      def loop_body(i, output):
+
+        def if_true():
+          return output.write(i, x[i]**2)
+
+        def if_false():
+          return output.write(i, x[i])
+
+        output = control_flow_ops.cond(x[i] > 0, if_true, if_false)
+        return i + 1, output
+
+      _, output = control_flow_ops.while_loop(
+          lambda i, arr: i < x.shape[0],
+          loop_body,
+          loop_vars=(constant_op.constant(0), output))
+      return output.stack()
+
+    output_t = f()
+    self.assertAllEqual(
+        self.evaluate(output_t), [-5, -4, -3, -2, -1, 0, 1, 4, 9, 16])
+
+  @test_util.run_deprecated_v1
+  def testForwardPassRewrite(self):
+    x = constant_op.constant(1.0, name="x")
+    output = cond_v2.cond_v2(constant_op.constant(True),
+                             lambda: x * 2.0,
+                             lambda: x)
+    if_op = output.op.inputs[0].op
+    self.assertEqual(if_op.type, "If")
+    # pylint: disable=g-deprecated-assert
+    self.assertEqual(len(if_op.outputs), 1)
+
+    gradients_impl.gradients(output, x)
+    # if_op should have been rewritten to output 2.0 intermediate.
+    self.assertEqual(len(if_op.outputs), 2)
+
+    gradients_impl.gradients(output, x)
+    # Computing the gradient again shouldn't rewrite if_op again.
+    self.assertEqual(len(if_op.outputs), 2)
+    # pylint: enable=g-deprecated-assert
+
 
 class CondV2CollectionTest(test.TestCase):
 
@@ -670,7 +812,7 @@ class CondV2CollectionTest(test.TestCase):
           y_const = constant_op.constant(ops.get_collection("y")[0])
           return math_ops.add(x_const, y_const)
 
-        cnd = cond_v2.cond_v2(True, fn, fn)
+        cnd = cond_v2.cond_v2(constant_op.constant(True), fn, fn)
         self.assertEquals(cnd.eval(), 7)
 
   def testCollectionTensorValueAccessInCond(self):
@@ -705,9 +847,7 @@ class CondV2CollectionTest(test.TestCase):
           z = math_ops.add(x, y)
           return math_ops.mul(x, z)
 
-        cnd = cond_v2.cond_v2(
-            True, true_fn,
-            false_fn)
+        cnd = cond_v2.cond_v2(constant_op.constant(True), true_fn, false_fn)
         self.assertEquals(cnd.eval(), 14)
 
         read_z_collection = ops.get_collection("z")
@@ -780,10 +920,12 @@ class CondV2ContainerTest(test.TestCase):
           return constant_op.constant(6.0)
 
         with ops.container("l1"):
-          cnd_true = cond_v2.cond_v2(True, true_fn, false_fn)
+          cnd_true = cond_v2.cond_v2(
+              constant_op.constant(True), true_fn, false_fn)
           self.assertEquals(cnd_true.eval(), 2)
 
-          cnd_false = cond_v2.cond_v2(False, true_fn, false_fn)
+          cnd_false = cond_v2.cond_v2(
+              constant_op.constant(False), true_fn, false_fn)
           self.assertEquals(cnd_false.eval(), 6)
 
           v4 = variables.Variable([3])
@@ -812,7 +954,8 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
           return c
 
         with ops.colocate_with(a.op):
-          self.assertEquals(cond_v2.cond_v2(True, fn, fn).eval(), 3)
+          self.assertEquals(
+              cond_v2.cond_v2(constant_op.constant(True), fn, fn).eval(), 3)
 
         def fn2():
           c = constant_op.constant(3.0)
@@ -821,7 +964,8 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
 
         with ops.colocate_with(a.op):
           with ops.colocate_with(b.op):
-            self.assertEquals(cond_v2.cond_v2(True, fn2, fn2).eval(), 3)
+            self.assertEquals(
+                cond_v2.cond_v2(constant_op.constant(True), fn2, fn2).eval(), 3)
 
   def testColocateWithInAndOutOfCond(self):
     with ops.Graph().as_default() as g:
@@ -837,14 +981,15 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
             return c
 
         with ops.colocate_with(a.op):
-          self.assertEquals(cond_v2.cond_v2(True, fn2, fn2).eval(), 3)
+          self.assertEquals(
+              cond_v2.cond_v2(constant_op.constant(True), fn2, fn2).eval(), 3)
 
           d = constant_op.constant([2.0], name="d")
           self.assertEqual([b"loc:@a"], d.op.colocation_groups())
 
   def testColocateWithInCondGraphPartitioning(self):
     with ops.Graph().as_default() as g:
-      with self.test_session(
+      with self.session(
           graph=g,
           config=config_pb2.ConfigProto(device_count={"CPU": 2})
       ) as sess:
@@ -858,7 +1003,7 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
           with ops.colocate_with(b.op):
             c = math_ops.add(a, a, name="c")
           return c
-        out_cond_2 = cond_v2.cond_v2(True, fn, fn)
+        out_cond_2 = cond_v2.cond_v2(constant_op.constant(True), fn, fn)
 
         run_options = config_pb2.RunOptions(output_partition_graphs=True)
         run_metadata = config_pb2.RunMetadata()
@@ -875,24 +1020,31 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
       with self.session(graph=g):
 
         def fn():
-          c = constant_op.constant(3.0)
-          self.assertEqual("/device:CPU:0", c.op.device)
-          return c
+          self.assertEqual("", constant_op.constant(3.0).op.device)
+          return test_ops.device_placement_op()
 
         with ops.device("/device:CPU:0"):
-          self.assertEquals(cond_v2.cond_v2(True, fn, fn).eval(), 3)
+          self.assertIn(
+              compat.as_bytes("CPU:0"),
+              self.evaluate(cond_v2.cond_v2(constant_op.constant(True),
+                                            fn, fn)))
 
         def fn2():
-          c = constant_op.constant(3.0)
-          self.assertEqual("/device:GPU:0", c.op.device)
-          return c
-
-        with ops.device("/device:GPU:0"):
-          self.assertEquals(cond_v2.cond_v2(True, fn2, fn2).eval(), 3)
+          self.assertEqual("", constant_op.constant(3.0).op.device)
+          return test_ops.device_placement_op()
+
+        if test_util.is_gpu_available():
+          with ops.device("/device:GPU:0"):
+            self.assertIn(
+                compat.as_bytes("GPU:0"),
+                self.evaluate(cond_v2.cond_v2(constant_op.constant(True),
+                                              fn2, fn2)))
+        else:
+          self.skipTest("Test requrires a GPU to check GPU device placement.")
 
   def testDeviceInAndOutOfCond(self):
     with ops.Graph().as_default() as g:
-      with self.test_session(
+      with self.session(
           graph=g, config=config_pb2.ConfigProto(device_count={"CPU": 2})):
 
         def fn2():
@@ -902,14 +1054,15 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
             return c
 
         with ops.device("/device:CPU:0"):
-          self.assertEquals(cond_v2.cond_v2(True, fn2, fn2).eval(), 3)
+          self.assertEquals(
+              cond_v2.cond_v2(constant_op.constant(True), fn2, fn2).eval(), 3)
 
           d = constant_op.constant(4.0)
           self.assertEqual("/device:CPU:0", d.op.device)
 
   def testDeviceInCondGraphPartitioning(self):
     with ops.Graph().as_default() as g:
-      with self.test_session(
+      with self.session(
           graph=g,
           config=config_pb2.ConfigProto(device_count={"CPU": 2})
       ) as sess:
@@ -921,7 +1074,7 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
 
         with ops.device("/device:CPU:0"):
           a = constant_op.constant([2.0], name="a")
-          out_cond_2 = cond_v2.cond_v2(True, fn, fn)
+          out_cond_2 = cond_v2.cond_v2(constant_op.constant(True), fn, fn)
 
         run_options = config_pb2.RunOptions(output_partition_graphs=True)
         run_metadata = config_pb2.RunMetadata()
diff --git a/tensorflow/python/kernel_tests/conditional_accumulator_test.py b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
index 97ab23fe49b6eea388b61876b99495486e17d9f9..ce34201706492ca488afbec95cddf436f38c820d 100644
--- a/tensorflow/python/kernel_tests/conditional_accumulator_test.py
+++ b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -79,11 +80,13 @@ class ConditionalAccumulatorTest(test.TestCase):
       attr { key: 'reduction_type' value {s: 'MEAN'} }
       """, q.accumulator_ref.op.node_def)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSizeEmpty(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(dtypes_lib.float32, name="Q")
       self.assertEqual(q.num_accumulated().eval(), 0)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSetGlobalStep(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -91,6 +94,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       set_global_step_op = q.set_global_step(1)
       set_global_step_op.run()
 
+  @test_util.run_deprecated_v1
   def testAccumulatorApplyGradFloat32(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -98,6 +102,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       accum_op = q.apply_grad((10.0,))
       accum_op.run()
 
+  @test_util.run_deprecated_v1
   def testDtypes(self):
     with self.cached_session() as sess:
       dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64]
@@ -111,10 +116,11 @@ class ConditionalAccumulatorTest(test.TestCase):
         for e in elems:
           q.apply_grad((e,)).run()
 
-        result = sess.run(q.take_grad(1))
+        result = self.evaluate(q.take_grad(1))
 
         self.assertEqual(sum(elems) / len(elems), result)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorMultipleAccumulators(self):
     with self.cached_session():
       q_f32_0 = data_flow_ops.ConditionalAccumulator(
@@ -134,6 +140,7 @@ class ConditionalAccumulatorTest(test.TestCase):
         result = accums[i].take_grad(1).eval()
         self.assertEqual(result, i + 10.0)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorApplyAndTakeGradWithShape(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -149,12 +156,13 @@ class ConditionalAccumulatorTest(test.TestCase):
         accum_op.run()
 
       is_all_equal = True
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       for i in range(len(val)):
         for j in range(len(val[i])):
           is_all_equal &= (val[i][j] == elems_ave[i][j])
       self.assertTrue(is_all_equal)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorApplyGradWithWrongShape(self):
     q = data_flow_ops.ConditionalAccumulator(
         dtypes_lib.float32, name="Q", shape=(3, 2))
@@ -165,6 +173,7 @@ class ConditionalAccumulatorTest(test.TestCase):
     with self.assertRaises(ValueError):
       q.apply_grad([[1.0], [2.0], [3.0]])
 
+  @test_util.run_deprecated_v1
   def testAccumulatorDynamicShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -184,12 +193,13 @@ class ConditionalAccumulatorTest(test.TestCase):
         sess.run(accum_op, feed_dict={x: elem})
 
       is_all_equal = True
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       for i in range(len(val)):
         for j in range(len(val[i])):
           is_all_equal &= (val[i][j] == elems_ave[i][j])
       self.assertTrue(is_all_equal)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorWrongDynamicShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -208,6 +218,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       with self.assertRaises(errors_impl.InvalidArgumentError):
         sess.run(accum_op, feed_dict={x: [[1.0], [2.0], [3.0]]})
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSizeAfterApplyGrad(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -219,6 +230,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
       self.assertEqual(q.num_accumulated().eval(), 2)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSizeAfterApplyGradAndTakeGrad(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -247,6 +259,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       extract_t.op.run()
       self.assertEqual(q.num_accumulated().eval(), 0)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradMean(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -259,7 +272,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(15.0, val)
 
       accum_ops = [q.apply_grad((x,), local_step=1) for x in elems]
@@ -268,9 +281,10 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(15.0, val)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradSum(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -286,7 +300,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(30.0, val)
 
       accum_ops = [q.apply_grad((x,), local_step=1) for x in elems]
@@ -295,9 +309,10 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(30.0, val)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradInvalidReductionType(self):
     with self.assertRaises(ValueError):
       data_flow_ops.ConditionalAccumulator(
@@ -306,6 +321,7 @@ class ConditionalAccumulatorTest(test.TestCase):
           shape=tensor_shape.TensorShape([1]),
           reduction_type="Invalid")
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorInvalidTakeGrad(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -319,8 +335,9 @@ class ConditionalAccumulatorTest(test.TestCase):
         accum_op.run()
 
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        takeg_t.eval()
+        self.evaluate(takeg_t)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorRepeatedTakeGradMean(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -334,7 +351,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_ave, val)
 
       elems = [20.0, 30.0]
@@ -345,9 +362,10 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_ave + 0.0, val)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorRepeatedTakeGradSum(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -364,7 +382,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_sum, val)
 
       elems = [20.0, 30.0]
@@ -375,9 +393,10 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_sum, val)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorIncrementGlobalStep(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -392,8 +411,9 @@ class ConditionalAccumulatorTest(test.TestCase):
       variables.global_variables_initializer().run()
       for _ in range(3):
         set_global_step_op.run()
-        inc_global_step.eval()
+        self.evaluate(inc_global_step)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSetGlobalStepPreventsAccumulation(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -410,11 +430,12 @@ class ConditionalAccumulatorTest(test.TestCase):
           accum_op.run()
         takeg_t = q.take_grad(1)
 
-        val = takeg_t.eval()
+        val = self.evaluate(takeg_t)
         self.assertEqual(0.0 + sum(x for x in local_steps
                                    if x >= ls) / sum(1 for x in local_steps
                                                      if x >= ls), val)
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelApplyGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -424,7 +445,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       takeg_t = q.take_grad(1)
 
       def apply_grad(accum_op):
-        sess.run(accum_op)
+        self.evaluate(accum_op)
 
       threads = [
           self.checkedThread(
@@ -436,10 +457,11 @@ class ConditionalAccumulatorTest(test.TestCase):
       for thread in threads:
         thread.join()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
 
       self.assertEqual(val, sum(elems) / len(elems))
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelTakeGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -451,14 +473,14 @@ class ConditionalAccumulatorTest(test.TestCase):
       def apply_grad():
         for accum_op in accum_ops:
           time.sleep(1.0)
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       apply_grad_thread = self.checkedThread(target=apply_grad)
 
       results = []
 
       def take_grad():
-        results.append(sess.run(takeg_t))
+        results.append(self.evaluate(takeg_t))
 
       threads = [self.checkedThread(target=take_grad) for _ in range(10)]
 
@@ -472,6 +494,7 @@ class ConditionalAccumulatorTest(test.TestCase):
 
       self.assertItemsEqual(elems, results)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorApplyAndBlockingTake(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -485,12 +508,12 @@ class ConditionalAccumulatorTest(test.TestCase):
       def apply_grad():
         time.sleep(1.0)
         for accum_op in accum_ops:
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       return_array = []
 
       def take_grad():
-        return_array.append(sess.run(takeg_t))
+        return_array.append(self.evaluate(takeg_t))
 
       accum_thread = self.checkedThread(target=apply_grad)
       takeg_thread = self.checkedThread(target=take_grad)
@@ -503,8 +526,9 @@ class ConditionalAccumulatorTest(test.TestCase):
 
   def _blocking_takeg(self, sess, takeg_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(takeg_op)
+      self.evaluate(takeg_op)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorCancel(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index bc24345261e5bb7beaa0aa2273ec277b53ea01fb..ae13c8e32e5ed5c8f3e6b670835db66d1e7dad0f 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -71,9 +71,11 @@ class ConfusionMatrixTest(test.TestCase):
 
     self._testConfMatrix(labels=labels, predictions=predictions, truth=truth)
 
+  @test_util.run_deprecated_v1
   def testInt32Basic(self):
     self._testBasic(dtype=np.int32)
 
+  @test_util.run_deprecated_v1
   def testInt64Basic(self):
     self._testBasic(dtype=np.int64)
 
@@ -111,9 +113,11 @@ class ConfusionMatrixTest(test.TestCase):
       self.assertEqual(cm_out.dtype, np_dtype)
       self.assertAllClose(cm_out, truth, atol=1e-10)
 
+  @test_util.run_deprecated_v1
   def testOnTensors_int32(self):
     self._testConfMatrixOnTensors(dtypes.int32, np.int32)
 
+  @test_util.run_deprecated_v1
   def testOnTensors_int64(self):
     self._testConfMatrixOnTensors(dtypes.int64, np.int64)
 
@@ -133,9 +137,11 @@ class ConfusionMatrixTest(test.TestCase):
 
     self._testConfMatrix(labels=labels, predictions=predictions, truth=truth)
 
+  @test_util.run_deprecated_v1
   def testInt32DifferentLabels(self, dtype=np.int32):
     self._testDifferentLabelsInPredictionAndTarget(dtype)
 
+  @test_util.run_deprecated_v1
   def testInt64DifferentLabels(self, dtype=np.int64):
     self._testDifferentLabelsInPredictionAndTarget(dtype)
 
@@ -155,12 +161,15 @@ class ConfusionMatrixTest(test.TestCase):
 
     self._testConfMatrix(labels=labels, predictions=predictions, truth=truth)
 
+  @test_util.run_deprecated_v1
   def testInt32MultipleLabels(self, dtype=np.int32):
     self._testMultipleLabels(dtype)
 
+  @test_util.run_deprecated_v1
   def testInt64MultipleLabels(self, dtype=np.int64):
     self._testMultipleLabels(dtype)
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = np.arange(5, dtype=np.int32)
     predictions = np.arange(5, dtype=np.int32)
@@ -177,6 +186,7 @@ class ConfusionMatrixTest(test.TestCase):
     self._testConfMatrix(
         labels=labels, predictions=predictions, weights=weights, truth=truth)
 
+  @test_util.run_deprecated_v1
   def testLabelsTooLarge(self):
     labels = np.asarray([1, 1, 0, 3, 5], dtype=np.int32)
     predictions = np.asarray([2, 1, 0, 2, 2], dtype=np.int32)
@@ -191,6 +201,7 @@ class ConfusionMatrixTest(test.TestCase):
       self._testConfMatrix(
           labels=labels, predictions=predictions, num_classes=3, truth=None)
 
+  @test_util.run_deprecated_v1
   def testPredictionsTooLarge(self):
     labels = np.asarray([1, 1, 0, 2, 2], dtype=np.int32)
     predictions = np.asarray([2, 1, 0, 3, 5], dtype=np.int32)
@@ -205,6 +216,7 @@ class ConfusionMatrixTest(test.TestCase):
       self._testConfMatrix(
           labels=labels, predictions=predictions, num_classes=3, truth=None)
 
+  @test_util.run_deprecated_v1
   def testInvalidRank_predictionsTooBig(self):
     labels = np.asarray([1, 2, 3])
     predictions = np.asarray([[1, 2, 3]])
@@ -212,6 +224,7 @@ class ConfusionMatrixTest(test.TestCase):
                             confusion_matrix.confusion_matrix, predictions,
                             labels)
 
+  @test_util.run_deprecated_v1
   def testInvalidRank_predictionsTooSmall(self):
     labels = np.asarray([[1, 2, 3]])
     predictions = np.asarray([1, 2, 3])
@@ -219,6 +232,7 @@ class ConfusionMatrixTest(test.TestCase):
                             confusion_matrix.confusion_matrix, predictions,
                             labels)
 
+  @test_util.run_deprecated_v1
   def testInputDifferentSize(self):
     labels = np.asarray([1, 2])
     predictions = np.asarray([1, 2, 3])
@@ -232,7 +246,7 @@ class ConfusionMatrixTest(test.TestCase):
     with self.cached_session():
       cm = confusion_matrix.confusion_matrix(
           labels, predictions, dtype=dtypes.int32)
-      tf_cm = cm.eval()
+      tf_cm = self.evaluate(cm)
     self.assertEqual(tf_cm.dtype, np.int32)
 
   def testOutputIsInt64(self):
@@ -241,12 +255,13 @@ class ConfusionMatrixTest(test.TestCase):
     with self.cached_session():
       cm = confusion_matrix.confusion_matrix(
           labels, predictions, dtype=dtypes.int64)
-      tf_cm = cm.eval()
+      tf_cm = self.evaluate(cm)
     self.assertEqual(tf_cm.dtype, np.int64)
 
 
 class RemoveSqueezableDimensionsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBothScalarShape(self):
     label_values = 1.0
     prediction_values = 0.0
@@ -261,8 +276,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
             labels_placeholder, predictions_placeholder))
 
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -272,6 +287,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSameShape(self):
     label_values = np.ones(shape=(2, 3, 1))
     prediction_values = np.zeros_like(label_values)
@@ -286,8 +302,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
             labels_placeholder, predictions_placeholder))
 
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -297,6 +313,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSameShapeExpectedRankDiff0(self):
     label_values = np.ones(shape=(2, 3, 1))
     prediction_values = np.zeros_like(label_values)
@@ -311,8 +328,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
             labels_placeholder, predictions_placeholder, expected_rank_diff=0))
 
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -322,6 +339,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSqueezableLabels(self):
     label_values = np.ones(shape=(2, 3, 1))
     prediction_values = np.zeros(shape=(2, 3))
@@ -337,8 +355,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_label_values = np.reshape(label_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(expected_label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(expected_label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -348,6 +366,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSqueezableLabelsExpectedRankDiffPlus1(self):
     label_values = np.ones(shape=(2, 3, 1))
     prediction_values = np.zeros(shape=(2, 3, 5))
@@ -363,8 +382,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_label_values = np.reshape(label_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(expected_label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(expected_label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -374,6 +393,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSqueezablePredictions(self):
     label_values = np.ones(shape=(2, 3))
     prediction_values = np.zeros(shape=(2, 3, 1))
@@ -389,8 +409,9 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(expected_prediction_values,
+                          self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -401,6 +422,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
           expected_prediction_values,
           dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSqueezablePredictionsExpectedRankDiffMinus1(self):
     label_values = np.ones(shape=(2, 3, 5))
     prediction_values = np.zeros(shape=(2, 3, 1))
@@ -416,8 +438,9 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(expected_prediction_values,
+                          self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -428,6 +451,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
           expected_prediction_values,
           dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testUnsqueezableLabels(self):
     label_values = np.ones(shape=(2, 3, 2))
     prediction_values = np.zeros(shape=(2, 3))
@@ -453,6 +477,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testUnsqueezablePredictions(self):
     label_values = np.ones(shape=(2, 3))
     prediction_values = np.zeros(shape=(2, 3, 2))
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index d1e4e5477fc6bfbe3f273216b83939a59e03189b..583082c2aa283e326a933d2beaf88f711b7a280f 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -43,7 +43,7 @@ class ConstantTest(test.TestCase):
 
   def _testCpu(self, x):
     np_ans = np.array(x)
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       tf_ans = ops.convert_to_tensor(x).eval()
     dtype = dtypes_lib.as_dtype(np_ans.dtype)
     if dtype.is_floating or dtype.is_complex:
@@ -53,7 +53,7 @@ class ConstantTest(test.TestCase):
 
   def _testGpu(self, x):
     np_ans = np.array(x)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_ans = ops.convert_to_tensor(x).eval()
     dtype = dtypes_lib.as_dtype(np_ans.dtype)
     if dtype.is_floating or dtype.is_complex:
@@ -70,6 +70,7 @@ class ConstantTest(test.TestCase):
     with self.assertRaises(TypeError):
       constant_op.constant(dtypes_lib.string, "[,]")
 
+  @test_util.run_deprecated_v1
   def testBFloat16(self):
     bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(bfloat16))
@@ -77,36 +78,42 @@ class ConstantTest(test.TestCase):
         np.random.normal(size=30).reshape([2, 3, 5]).astype(bfloat16))
     self._testAll(np.empty((2, 0, 5)).astype(bfloat16))
 
+  @test_util.run_deprecated_v1
   def testHalf(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float16))
     self._testAll(
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.float16))
     self._testAll(np.empty((2, 0, 5)).astype(np.float16))
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float32))
     self._testAll(
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.float32))
     self._testAll(np.empty((2, 0, 5)).astype(np.float32))
 
+  @test_util.run_deprecated_v1
   def testDouble(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float64))
     self._testAll(
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.float64))
     self._testAll(np.empty((2, 0, 5)).astype(np.float64))
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.int32))
     self._testAll((100 * np.random.normal(size=30)).reshape([2, 3, 5]).astype(
         np.int32))
     self._testAll(np.empty((2, 0, 5)).astype(np.int32))
 
+  @test_util.run_deprecated_v1
   def testInt64(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.int64))
     self._testAll((100 * np.random.normal(size=30)).reshape([2, 3, 5]).astype(
         np.int64))
     self._testAll(np.empty((2, 0, 5)).astype(np.int64))
 
+  @test_util.run_deprecated_v1
   def testComplex64(self):
     self._testAll(
         np.complex(1, 2) *
@@ -116,6 +123,7 @@ class ConstantTest(test.TestCase):
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.complex64))
     self._testAll(np.empty((2, 0, 5)).astype(np.complex64))
 
+  @test_util.run_deprecated_v1
   def testComplex128(self):
     self._testAll(
         np.complex(1, 2) *
@@ -125,16 +133,18 @@ class ConstantTest(test.TestCase):
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.complex128))
     self._testAll(np.empty((2, 0, 5)).astype(np.complex128))
 
+  @test_util.run_deprecated_v1
   def testString(self):
     self._testCpu(
         np.array([compat.as_bytes(str(x)) for x in np.arange(-15, 15)]).reshape(
             [2, 3, 5]))
     self._testCpu(np.empty((2, 0, 5)).astype(np.str_))
 
+  @test_util.run_deprecated_v1
   def testVariant(self):
     # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
     # copying between CPU and GPU is supported.
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       variant_tensor = tensor_pb2.TensorProto(
           dtype=dtypes_lib.variant.as_datatype_enum,
           tensor_shape=tensor_shape.TensorShape([]).as_proto(),
@@ -161,6 +171,7 @@ class ConstantTest(test.TestCase):
           message="Variant storing an int, decoded const value:").op
       logging_const_op.run()
 
+  @test_util.run_deprecated_v1
   def testStringWithNulls(self):
     with self.cached_session():
       val = ops.convert_to_tensor(b"\0\0\0\0").eval()
@@ -219,16 +230,28 @@ class ConstantTest(test.TestCase):
 
   def testShapeInconsistent(self):
     with ops.Graph().as_default():
-      c = constant_op.constant([1, 2, 3, 4, 5, 6, 7], shape=[10])
+      c = constant_op.constant_v1([1, 2, 3, 4, 5, 6, 7], shape=[10])
+    self.assertEqual(c.get_shape(), [10])
+
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          TypeError, "Expected Tensor's shape"):
+        c = constant_op.constant([1, 2, 3, 4, 5, 6, 7], shape=[10])
+
+  def testPromotionShapes(self):
+    with ops.Graph().as_default():
+      c = constant_op.constant([7], shape=[10])
+    self.assertEqual(c.get_shape(), [10])
+    with ops.Graph().as_default():
+      c = constant_op.constant(3, shape=[10])
     self.assertEqual(c.get_shape(), [10])
 
   # pylint: disable=g-long-lambda
   def testShapeWrong(self):
     with ops.Graph().as_default():
-      with self.assertRaisesWithPredicateMatch(
-          ValueError,
-          lambda e: ("Too many elements provided. Needed at most 5, "
-                     "but received 7" == str(e))):
+      with self.assertRaisesRegexp(ValueError, "Too many elements provided."):
+        constant_op.constant_v1([1, 2, 3, 4, 5, 6, 7], shape=[5])
+      with self.assertRaisesRegexp(TypeError, "Expected Tensor's shape"):
         constant_op.constant([1, 2, 3, 4, 5, 6, 7], shape=[5])
 
   # pylint: enable=g-long-lambda
@@ -253,6 +276,7 @@ class ConstantTest(test.TestCase):
                                    "GraphDef cannot be larger than 2GB."):
         g.as_graph_def()
 
+  @test_util.run_deprecated_v1
   def testSparseValuesRaiseErrors(self):
     with self.assertRaisesRegexp(ValueError,
                                  "setting an array element with a sequence"):
@@ -282,29 +306,29 @@ class AsTensorTest(test.TestCase):
     with self.cached_session():
       x = ops.convert_to_tensor(tensor_shape.TensorShape([]))
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([], x.eval())
+      self.assertAllEqual([], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([1, 2, 3]))
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([1, 2, 3], x.eval())
+      self.assertAllEqual([1, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31-1, 2, 3]))
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([2**31-1, 2, 3], x.eval())
+      self.assertAllEqual([2**31 - 1, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31-1, 2, 3]),
                                 dtype=dtypes_lib.int32)
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([2**31-1, 2, 3], x.eval())
+      self.assertAllEqual([2**31 - 1, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31, 2, 3]))
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual([2**31, 2, 3], x.eval())
+      self.assertAllEqual([2**31, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31, 2, 3]),
                                 dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual([2**31, 2, 3], x.eval())
+      self.assertAllEqual([2**31, 2, 3], self.evaluate(x))
 
       with self.assertRaisesRegexp(
           ValueError, "a dimension is too large .2147483648."):
@@ -314,11 +338,11 @@ class AsTensorTest(test.TestCase):
       x = ops.convert_to_tensor(
           tensor_shape.TensorShape([1, 2, 3]), dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual([1, 2, 3], x.eval())
+      self.assertAllEqual([1, 2, 3], self.evaluate(x))
 
       x = array_ops.reshape(
           array_ops.zeros([6]), tensor_shape.TensorShape([2, 3]))
-      self.assertAllEqual([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], x.eval())
+      self.assertAllEqual([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], self.evaluate(x))
 
     with self.assertRaisesRegexp(ValueError, "partially known"):
       ops.convert_to_tensor(tensor_shape.TensorShape(None))
@@ -330,26 +354,29 @@ class AsTensorTest(test.TestCase):
       ops.convert_to_tensor(
           tensor_shape.TensorShape([1, 2, 3]), dtype=dtypes_lib.float32)
 
+  @test_util.run_deprecated_v1
   def testAsTensorForDimensionInput(self):
     with self.cached_session():
       x = ops.convert_to_tensor(tensor_shape.TensorShape([1, 2, 3])[1])
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual(2, x.eval())
+      self.assertAllEqual(2, self.evaluate(x))
 
       x = ops.convert_to_tensor(
           tensor_shape.TensorShape([1, 2, 3])[1], dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual(2, x.eval())
-
-    with self.assertRaisesRegexp(ValueError, "unknown Dimension"):
-      ops.convert_to_tensor(tensor_shape.TensorShape(None)[1])
-
-    with self.assertRaisesRegexp(ValueError, "unknown Dimension"):
-      ops.convert_to_tensor(tensor_shape.TensorShape([1, None, 64])[1])
-
-    with self.assertRaises(TypeError):
-      ops.convert_to_tensor(
-          tensor_shape.TensorShape([1, 2, 3])[1], dtype=dtypes_lib.float32)
+      self.assertAllEqual(2, self.evaluate(x))
+
+    shape = tensor_shape.TensorShape(None)
+    if shape._v2_behavior:
+      with self.assertRaisesRegexp(ValueError, "None values not supported"):
+        ops.convert_to_tensor(shape[1])
+      with self.assertRaisesRegexp(ValueError, "None values not supported"):
+        ops.convert_to_tensor(tensor_shape.TensorShape([1, None, 64])[1])
+    else:
+      with self.assertRaisesRegexp(ValueError, "unknown Dimension"):
+        ops.convert_to_tensor(shape[1])
+      with self.assertRaisesRegexp(ValueError, "unknown Dimension"):
+        ops.convert_to_tensor(tensor_shape.TensorShape([1, None, 64])[1])
 
 
 class IdentityOpTest(test.TestCase):
@@ -370,7 +397,7 @@ class ZerosTest(test.TestCase):
     with self.cached_session():
       ret = array_ops.zeros(shape)
       self.assertEqual(shape, ret.get_shape())
-      return ret.eval()
+      return self.evaluate(ret)
 
   def testConst(self):
     self.assertTrue(
@@ -381,7 +408,7 @@ class ZerosTest(test.TestCase):
     self.assertEqual(0, self._Zeros(()))
     with self.cached_session():
       scalar = array_ops.zeros(constant_op.constant([], dtype=dtypes_lib.int32))
-      self.assertEqual(0, scalar.eval())
+      self.assertEqual(0, self.evaluate(scalar))
 
   def testDynamicSizes(self):
     np_ans = np.array([[0] * 3] * 2)
@@ -390,11 +417,12 @@ class ZerosTest(test.TestCase):
       d = array_ops.fill([2, 3], 12., name="fill")
       # Constructs a tensor of zeros of the same dimensions as "d".
       z = array_ops.zeros(array_ops.shape(d))
-      out = z.eval()
+      out = self.evaluate(z)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, d)
     self.assertShapeEqual(np_ans, z)
 
+  @test_util.run_deprecated_v1
   def testDtype(self):
     with self.cached_session():
       d = array_ops.fill([2, 3], 12., name="fill")
@@ -418,13 +446,13 @@ class ZerosTest(test.TestCase):
         z = array_ops.zeros([2, 3], dtype=dtype)
         self.assertEqual(z.dtype, dtype)
         self.assertEqual([2, 3], z.get_shape())
-        z_value = z.eval()
+        z_value = self.evaluate(z)
         self.assertFalse(np.any(z_value))
         self.assertEqual((2, 3), z_value.shape)
         z = array_ops.zeros(array_ops.shape(d), dtype=dtype)
         self.assertEqual(z.dtype, dtype)
         self.assertEqual([2, 3], z.get_shape())
-        z_value = z.eval()
+        z_value = self.evaluate(z)
         self.assertFalse(np.any(z_value))
         self.assertEqual((2, 3), z_value.shape)
 
@@ -432,7 +460,7 @@ class ZerosTest(test.TestCase):
 class ZerosLikeTest(test.TestCase):
 
   def _compareZeros(self, dtype, fully_defined_shape, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       # Creates a tensor of non-zero values with shape 2 x 3.
       # NOTE(kearnes): The default numpy dtype associated with tf.string is
       # np.object (and can't be changed without breaking a lot things), which
@@ -463,6 +491,7 @@ class ZerosLikeTest(test.TestCase):
       self.assertFalse(np.any(z_value))
       self.assertEqual((2, 3), z_value.shape)
 
+  @test_util.run_deprecated_v1
   def testZerosLikeCPU(self):
     for dtype in [
         dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
@@ -473,6 +502,7 @@ class ZerosLikeTest(test.TestCase):
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=False)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=False)
 
+  @test_util.run_deprecated_v1
   def testZerosLikeGPU(self):
     for dtype in [
         dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
@@ -482,11 +512,13 @@ class ZerosLikeTest(test.TestCase):
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=True)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testZerosLikePartialShape(self):
     d = array_ops.placeholder(dtypes_lib.float32, shape=[None, 4, None])
     z = array_ops.zeros_like(d)
     self.assertEqual(d.get_shape().as_list(), z.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testZerosLikeDtype(self):
     # Make sure zeros_like works even for dtypes that cannot be cast between
     with self.cached_session():
@@ -500,12 +532,13 @@ class ZerosLikeTest(test.TestCase):
           self.assertEqual(y.shape, shape)
           self.assertAllEqual(y, np.zeros(shape, dtype=out_type))
 
+  @test_util.run_deprecated_v1
   def testZerosLikeVariant(self):
     # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
     # copying between CPU and GPU is supported AND we register a
     # ZerosLike callback for GPU for Variant storing primitive types
     # in variant_op_registry.cc.
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       variant_tensor = tensor_pb2.TensorProto(
           dtype=dtypes_lib.variant.as_datatype_enum,
           tensor_shape=tensor_shape.TensorShape([]).as_proto(),
@@ -536,7 +569,7 @@ class OnesTest(test.TestCase):
     with self.cached_session():
       ret = array_ops.ones(shape)
       self.assertEqual(shape, ret.get_shape())
-      return ret.eval()
+      return self.evaluate(ret)
 
   def testConst(self):
     self.assertTrue(np.array_equal(self._Ones([2, 3]), np.array([[1] * 3] * 2)))
@@ -546,7 +579,7 @@ class OnesTest(test.TestCase):
     self.assertEqual(1, self._Ones(()))
     with self.cached_session():
       scalar = array_ops.ones(constant_op.constant([], dtype=dtypes_lib.int32))
-      self.assertEqual(1, scalar.eval())
+      self.assertEqual(1, self.evaluate(scalar))
 
   def testDynamicSizes(self):
     np_ans = np.array([[1] * 3] * 2)
@@ -555,11 +588,12 @@ class OnesTest(test.TestCase):
       d = array_ops.fill([2, 3], 12., name="fill")
       # Constructs a tensor of ones of the same dimensions as "d".
       z = array_ops.ones(array_ops.shape(d))
-      out = z.eval()
+      out = self.evaluate(z)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, d)
     self.assertShapeEqual(np_ans, z)
 
+  @test_util.run_deprecated_v1
   def testAutoPack(self):
     with self.cached_session():
       h = array_ops.placeholder(dtypes_lib.int32, shape=[])
@@ -568,6 +602,7 @@ class OnesTest(test.TestCase):
       out = z.eval(feed_dict={h: 4, w: 16})
     self.assertAllEqual(out, np.array([[1] * 16] * 4))
 
+  @test_util.run_deprecated_v1
   def testDtype(self):
     with self.cached_session():
       d = array_ops.fill([2, 3], 12., name="fill")
@@ -615,12 +650,13 @@ class OnesLikeTest(test.TestCase):
         z_var = array_ops.ones_like(d)
         # Test that the type is correct
         self.assertEqual(z_var.dtype, dtype)
-        z_value = z_var.eval()
+        z_value = self.evaluate(z_var)
 
       # Test that the value is correct
       self.assertTrue(np.array_equal(z_value, np.array([[1] * 3] * 2)))
       self.assertEqual([2, 3], z_var.get_shape())
 
+  @test_util.run_deprecated_v1
   def testOnesLikePartialShape(self):
     d = array_ops.placeholder(dtypes_lib.float32, shape=[None, 4, None])
     z = array_ops.ones_like(d)
@@ -630,9 +666,9 @@ class OnesLikeTest(test.TestCase):
 class FillTest(test.TestCase):
 
   def _compare(self, dims, val, np_ans, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.fill(dims, val, name="fill")
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllClose(np_ans, out)
     # Fill does not set the shape.
     # self.assertShapeEqual(np_ans, tf_ans)
@@ -665,12 +701,14 @@ class FillTest(test.TestCase):
     np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex128)
     self._compareAll([2, 3], np_ans[0][0], np_ans)
 
+  @test_util.run_deprecated_v1
   def testFillString(self):
     np_ans = np.array([[b"yolo"] * 3] * 2)
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       tf_ans = array_ops.fill([2, 3], np_ans[0][0], name="fill").eval()
     self.assertAllEqual(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testFillNegative(self):
     with self.cached_session():
       for shape in (-1,), (2, -1), (-1, 2), (-2), (-3):
@@ -684,6 +722,7 @@ class FillTest(test.TestCase):
         with self.assertRaises(errors_impl.InvalidArgumentError):
           fill_t.eval({dims: shape})
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # Non-vector dimensions.
     with self.assertRaises(ValueError):
@@ -702,6 +741,7 @@ class FillTest(test.TestCase):
             dtypes_lib.int32, shape=()), 17], 1.0)
     self.assertEqual([None, 17], f.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       in_v = constant_op.constant(5.0)
@@ -714,6 +754,7 @@ class FillTest(test.TestCase):
 
 class PlaceholderTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testDtype(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=(10, 10), name="p")
@@ -724,8 +765,9 @@ class PlaceholderTest(test.TestCase):
 
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float"):
-        p_identity.eval()
+        self.evaluate(p_identity)
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=(10, 10), name="p")
@@ -737,12 +779,13 @@ class PlaceholderTest(test.TestCase):
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float and "
           r"shape \[10,10\]"):
-        p_identity.eval()
+        self.evaluate(p_identity)
 
       with self.assertRaisesWithPredicateMatch(
           ValueError, lambda e: "Cannot feed value of shape" in str(e)):
         p_identity.eval(feed_dict={p: feed_array[:5, :5]})
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=None, name="p")
@@ -755,12 +798,14 @@ class PlaceholderTest(test.TestCase):
       self.assertAllClose(
           p_identity.eval(feed_dict={p: feed_array}), feed_array)
 
+  @test_util.run_deprecated_v1
   def testScalarShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=[], name="p")
       p_identity = array_ops.identity(p)
       self.assertAllClose(p_identity.eval(feed_dict={p: 5}), 5)
 
+  @test_util.run_deprecated_v1
   def testPartialShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=[None, 3], name="p")
@@ -773,6 +818,7 @@ class PlaceholderTest(test.TestCase):
           ValueError, lambda e: "Cannot feed value of shape" in str(e)):
         p_identity.eval(feed_dict={p: feed_array[:5, :2]})
 
+  @test_util.run_deprecated_v1
   def testPartialShapeWhenNotFed(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=[None, 3], name="p")
@@ -781,8 +827,9 @@ class PlaceholderTest(test.TestCase):
       # Should trigger an operator error, not a shape error.
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float"):
-        p_identity.eval()
+        self.evaluate(p_identity)
 
+  @test_util.run_deprecated_v1
   def testControlDependency(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.int32, shape=[], name="p")
@@ -792,10 +839,12 @@ class PlaceholderTest(test.TestCase):
       val = np.array(2).astype(np.int)
       self.assertEqual(10, d.eval(feed_dict={p: val}))
 
+  @test_util.run_deprecated_v1
   def testBadShape(self):
     with self.assertRaises(ValueError):
       array_ops.placeholder(dtypes_lib.float32, shape=(-1, 10))
 
+  @test_util.run_deprecated_v1
   def testTensorStr(self):
     a = array_ops.placeholder(dtypes_lib.float32, shape=None, name="a")
     self.assertEqual("<tf.Tensor 'a:0' shape=<unknown> dtype=float32>", repr(a))
@@ -804,8 +853,14 @@ class PlaceholderTest(test.TestCase):
     self.assertEqual("<tf.Tensor 'b:0' shape=(32, 40) dtype=int32>", repr(b))
 
     c = array_ops.placeholder(dtypes_lib.qint32, shape=(32, None, 2), name="c")
-    self.assertEqual("<tf.Tensor 'c:0' shape=(32, ?, 2) dtype=qint32>", repr(c))
+    if c.shape._v2_behavior:
+      self.assertEqual(
+          "<tf.Tensor 'c:0' shape=(32, None, 2) dtype=qint32>", repr(c))
+    else:
+      self.assertEqual(
+          "<tf.Tensor 'c:0' shape=(32, ?, 2) dtype=qint32>", repr(c))
 
+  @test_util.run_deprecated_v1
   def testOldGraph(self):
     # Load graph generated from earlier version of TF where
     # placeholder shape was not set.
@@ -885,38 +940,42 @@ versions {
 
 class PlaceholderWithDefaultTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testFullShape(self):
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with self.session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([[2, 2], [2, 2]], shape=[2, 2])
       a = array_ops.identity(p)
-      self.assertAllEqual([[2, 2], [2, 2]], a.eval())
+      self.assertAllEqual([[2, 2], [2, 2]], self.evaluate(a))
       self.assertAllEqual(
           [[3, 3], [3, 3]], a.eval(feed_dict={p: [[3, 3], [3, 3]]}))
 
       with self.assertRaises(ValueError):
         a.eval(feed_dict={p: [[6, 6, 6], [6, 6, 6]]})
 
+  @test_util.run_deprecated_v1
   def testPartialShape(self):
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with self.session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([1, 2, 3], shape=[None])
       a = array_ops.identity(p)
-      self.assertAllEqual([1, 2, 3], a.eval())
+      self.assertAllEqual([1, 2, 3], self.evaluate(a))
       self.assertAllEqual([3, 37], a.eval(feed_dict={p: [3, 37]}))
 
       with self.assertRaises(ValueError):
         a.eval(feed_dict={p: [[2, 2], [2, 2]]})
 
+  @test_util.run_deprecated_v1
   def testNoShape(self):
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with self.session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([17], shape=None)
       a = array_ops.identity(p)
-      self.assertAllEqual([17], a.eval())
+      self.assertAllEqual([17], self.evaluate(a))
       self.assertAllEqual([3, 37], a.eval(feed_dict={p: [3, 37]}))
       self.assertAllEqual(
           [[3, 3], [3, 3]], a.eval(feed_dict={p: [[3, 3], [3, 3]]}))
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with self.session(force_gpu=test_util.is_gpu_available()):
       x = array_ops.placeholder(dtypes_lib.float32, [5, 7])
       y = array_ops.placeholder_with_default(x, None)
       err = gradient_checker.compute_gradient_error(x, [5, 7], y, [5, 7])
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index c7e89dd5f92ed13b5c4c06605113b6198408884f..0fd293ebba3044097453c18fb625fc0dee19b19f 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -22,8 +22,8 @@ from __future__ import print_function
 
 import collections
 import math
+import sys
 import time
-import unittest
 
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
@@ -42,7 +42,6 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import cond_v2  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -130,6 +129,7 @@ def isum(s, maximum_iterations=None):
 @test_util.with_control_flow_v2
 class ControlFlowTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRefIdentity(self):
     with self.cached_session():
       v = variables.VariableV1(7)
@@ -140,8 +140,9 @@ class ControlFlowTest(test.TestCase):
 
       self.assertTrue(isinstance(v2, ops.Tensor))
       variables.global_variables_initializer().run()
-      self.assertEqual(9, v2.eval())
+      self.assertEqual(9, self.evaluate(v2))
 
+  @test_util.run_v1_only("b/120545219")
   def testRefEnter(self):
     with self.cached_session():
       v = variables.VariableV1(7)
@@ -153,8 +154,9 @@ class ControlFlowTest(test.TestCase):
       v2 = control_flow_ops.with_dependencies([op], enter_v)
       v3 = control_flow_ops.exit(v2)
       variables.global_variables_initializer().run()
-      self.assertEqual(9, v3.eval())
+      self.assertEqual(9, self.evaluate(v3))
 
+  @test_util.run_v1_only("b/120545219")
   def testRefSwitch(self):
     with self.cached_session():
       v = variables.VariableV1(7)
@@ -163,7 +165,7 @@ class ControlFlowTest(test.TestCase):
       v1 = control_flow_ops._SwitchRefOrTensor(v._ref(), p)  # pylint: disable=protected-access
       v2 = state_ops.assign(v1[1], 9)
       variables.global_variables_initializer().run()
-      self.assertEqual(9, v2.eval())
+      self.assertEqual(9, self.evaluate(v2))
 
   def testEnterMulExit(self):
     with self.cached_session():
@@ -174,9 +176,10 @@ class ControlFlowTest(test.TestCase):
       mul_op = math_ops.multiply(enter_data, enter_five)
       exit_op = control_flow_ops.exit(mul_op)
 
-      result = exit_op.eval()
+      result = self.evaluate(exit_op)
     self.assertAllEqual(np.array([x * 5 for x in [1, 2, 3, 4, 5, 6]]), result)
 
+  @test_util.run_deprecated_v1
   def testEnterShapePropagation(self):
     with self.cached_session():
       v = variables.Variable([0.0, 0.0], dtype=dtypes.float32)
@@ -191,6 +194,7 @@ class ControlFlowTest(test.TestCase):
           v, "frame2", is_constant=False)
       self.assertEqual(enter_v_non_constant.shape, None)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([1, 2, 3, 4, 5, 6])
@@ -205,6 +209,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(np.arange(1, 7), val)
     self.assertAllEqual(np.arange(0, 12, 2), ind)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchDeadBranch(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -215,8 +220,9 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
           lambda e: "Retval[0] does not have value" in str(e)):
-        dead_branch.eval()
+        self.evaluate(dead_branch)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeLess(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -226,9 +232,10 @@ class ControlFlowTest(test.TestCase):
       switch_op = control_flow_ops.switch(data, less_op)
       merge_op = control_flow_ops.merge(switch_op)[0]
 
-      result = merge_op.eval()
+      result = self.evaluate(merge_op)
     self.assertAllEqual(np.arange(1, 7), result)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeAddIdentity(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -239,9 +246,10 @@ class ControlFlowTest(test.TestCase):
       id_op = array_ops.identity(switch_op[1])
       merge_op = control_flow_ops.merge([add_op, id_op])[0]
 
-      result = merge_op.eval()
+      result = self.evaluate(merge_op)
     self.assertAllEqual(np.array([x + 1 for x in [1, 2, 3, 4, 5, 6]]), result)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeAddMul(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -253,9 +261,10 @@ class ControlFlowTest(test.TestCase):
       mul_op = math_ops.multiply(switch_op[1], five)
       merge_op = control_flow_ops.merge([add_op, mul_op])[0]
 
-      result = merge_op.eval()
+      result = self.evaluate(merge_op)
     self.assertAllEqual(np.array([x * 5 for x in [1, 2, 3, 4, 5, 6]]), result)
 
+  @test_util.run_v1_only("b/120545219")
   def testLoop_false(self):
     with self.cached_session():
       false = ops.convert_to_tensor(False)
@@ -270,9 +279,10 @@ class ControlFlowTest(test.TestCase):
       next_n = control_flow_ops.next_iteration(switch_n[0])
       merge_n.op._update_input(1, next_n)
 
-      result = exit_n.eval()
+      result = self.evaluate(exit_n)
     self.assertAllEqual(10, result)
 
+  @test_util.run_deprecated_v1
   def testLoop_1(self):
     with self.cached_session():
       zero = constant_op.constant(0)
@@ -296,9 +306,10 @@ class ControlFlowTest(test.TestCase):
       merge_i.op._update_input(1, next_i)
 
       exit_i = control_flow_ops.exit(switch_i[0])
-      result = exit_i.eval()
+      result = self.evaluate(exit_i)
     self.assertAllEqual(10, result)
 
+  @test_util.run_v1_only("b/120545219")
   def testLoop_2(self):
     with self.cached_session():
       zero = constant_op.constant(0)
@@ -322,9 +333,10 @@ class ControlFlowTest(test.TestCase):
       merge_i.op._update_input(1, next_i)
 
       exit_i = control_flow_ops.exit(switch_i[0])
-      result = exit_i.eval()
+      result = self.evaluate(exit_i)
     self.assertAllEqual(10, result)
 
+  @test_util.run_v1_only("b/120545219")
   def testDifferentFrame(self):
     with self.cached_session():
       data = array_ops.placeholder(dtypes.float32, shape=[])
@@ -334,7 +346,7 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaisesOpError("has inputs from different frames"):
         res.eval(feed_dict={data: 1.0})
 
-  @test_util.disable_control_flow_v2("b/113294340")
+  @test_util.run_deprecated_v1
   def testCondBool(self):
     values = constant_op.constant(10)
     fn1 = lambda: math_ops.add(values, 1)
@@ -342,6 +354,7 @@ class ControlFlowTest(test.TestCase):
     with self.assertRaisesRegexp(TypeError, "must not be a Python bool"):
       _ = control_flow_ops.cond(False, fn1, fn2)
 
+  @test_util.run_deprecated_v1
   def testCondInt(self):
     p = array_ops.placeholder(dtypes.bool, shape=[])
     v = constant_op.constant(10)
@@ -358,6 +371,7 @@ class ControlFlowTest(test.TestCase):
         lambda: math_ops.subtract(x, 1.))
     self.assertEqual(b.shape, tensor_shape.scalar())
 
+  @test_util.run_v1_only("b/120545219")
   def testFetchable(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
@@ -374,6 +388,7 @@ class ControlFlowTest(test.TestCase):
               sess.run(t, feed_dict={x: 3})
 
   @test_util.disable_control_flow_v2("Not relevant")
+  @test_util.run_v1_only("b/120545219")
   def testFeedable(self):
     with self.cached_session() as sess:
       c = constant_op.constant(2)
@@ -391,7 +406,7 @@ class ControlFlowTest(test.TestCase):
             with self.assertRaisesRegexp(ValueError, "may not be fed"):
               sess.run(r, feed_dict={t: 3})
 
-  @test_util.disable_control_flow_v2("b/113296180 (IndexedSlices)")
+  @test_util.run_v1_only("b/120545219")
   def testCondIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant(10)
@@ -407,7 +422,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
-  @test_util.disable_control_flow_v2("b/113296161 (SparseTensors)")
+  @test_util.run_v1_only("b/120545219")
   def testCondSparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -425,6 +440,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual([[1], [4]], r.indices.eval())
       self.assertAllEqual(r.values.get_shape(), (2,))
 
+  @test_util.run_v1_only("b/120545219")
   def testCondResource(self):
 
     with self.cached_session():
@@ -439,7 +455,22 @@ class ControlFlowTest(test.TestCase):
 
       self.assertEqual(1.0, control_flow_ops.cond(rv, case, lambda: t).eval())
 
+  @test_util.run_v1_only("b/120545219")
+  def testCondWithTensorArrayGrad(self):
+    with self.cached_session() as sess:
+      with ops.device(test.gpu_device_name()):
+        pred = array_ops.placeholder(dtypes.bool, [])
+        x = constant_op.constant([1.0, 2.0, 3.0])
+        y = control_flow_ops.cond(
+            pred, lambda: functional_ops.map_fn(lambda z: z * 2.0, x),
+            lambda: constant_op.constant([1.0, 1.0, 1.0]))
+        g = gradients_impl.gradients(y, x)[0]
+
+      self.assertAllEqual(sess.run(g, {pred: True}), [2.0, 2.0, 2.0])
+      self.assertAllEqual(sess.run(g, {pred: False}), [0.0, 0.0, 0.0])
+
   @test_util.disable_control_flow_v2("b/113293074")
+  @test_util.run_v1_only("b/120545219")
   def testCondIndexedSlicesDifferentTypes(self):
     with self.cached_session():
       values = constant_op.constant(10)
@@ -457,8 +488,9 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(0, ind)
     self.assertTrue(ind.dtype == np.int64)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondColocation(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with ops.device("/cpu:0"):
         v = variables.Variable(7.0)
 
@@ -473,14 +505,14 @@ class ControlFlowTest(test.TestCase):
           self.assertDeviceEqual(op.device, "/cpu:0")
 
   def _testCond_1(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       x = constant_op.constant(10)
       pred = math_ops.less(1, 2)
       fn1 = lambda: math_ops.add(x, 1)
       fn2 = lambda: math_ops.subtract(x, 1)
       r = control_flow_ops.cond(pred, fn1, fn2)
 
-      result = r.eval()
+      result = self.evaluate(r)
     self.assertAllEqual(11, result)
 
   def testCond_1(self):
@@ -496,7 +528,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           math_ops.less(1, 0), lambda: math_ops.add(x, 1),
           lambda: math_ops.subtract(x, 1))
-      result = r.eval()
+      result = self.evaluate(r)
     self.assertAllEqual(9, result)
 
   def testCond_3(self):
@@ -509,30 +541,43 @@ class ControlFlowTest(test.TestCase):
       fn3 = lambda: math_ops.add(control_flow_ops.cond(pred, fn1, fn2), 1)
       r = control_flow_ops.cond(pred, fn3, fn2)
 
-      result = r.eval()
+      result = self.evaluate(r)
     self.assertAllEqual(12, result)
 
-  @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
-  def testCond_4(self):
-    with self.cached_session():
-      v1 = variables.Variable(7)
-      v2 = variables.Variable(7)
-      v3 = variables.Variable(7)
+  @test_util.run_in_graph_and_eager_modes
+  def testCondPruning(self):
+    v1 = variables.Variable(7)
+    v2 = variables.Variable(7)
+    v3 = variables.Variable(7)
 
+    def f():
       age = constant_op.constant(3)
       max_age = constant_op.constant(2)
       pred = math_ops.greater(age, max_age)
       fn1 = lambda: [state_ops.assign(v1, 1).op, state_ops.assign(v2, 2).op]
       fn2 = lambda: [state_ops.assign(v3, 3).op, constant_op.constant(10).op]
       r = control_flow_ops.cond(pred, fn1, fn2)
-
-      variables.global_variables_initializer().run()
       self.assertEqual(len(r), 2)
-      result = r[1].eval()
-      self.assertAllEqual(True, result)
-      self.assertAllEqual(7, v1.eval())
-      self.assertAllEqual(2, v2.eval())
-      self.assertAllEqual(7, v3.eval())
+      return r[1]
+
+    f_defun = eager_function.defun(f)
+
+    if not context.executing_eagerly():
+      with self.cached_session():
+        variables.global_variables_initializer().run()
+        result = f().eval()
+        self.assertEqual(True, result)
+        # Only second cond result was fetched, so v1 assign shouldn't run.
+        self.assertEqual(7, self.evaluate(v1))
+        self.assertEqual(2, self.evaluate(v2))
+        self.assertEqual(7, self.evaluate(v3))
+
+    result = f_defun()
+    self.assertEqual(True, self.evaluate(result))
+    # Both v1 and v2 branch assignments should be run in defun.
+    self.assertEqual(1, self.evaluate(v1))
+    self.assertEqual(2, self.evaluate(v2))
+    self.assertEqual(7, self.evaluate(v3))
 
   def testCond_5(self):
     with self.cached_session():
@@ -546,10 +591,10 @@ class ControlFlowTest(test.TestCase):
 
       for i in range(10):
         alive, count = body(i)
-      self.assertAllEqual(4, count.eval())
+      self.assertAllEqual(4, self.evaluate(count))
 
+  @test_util.run_v1_only("b/120545219")
   def testCond_6(self):
-
     with self.cached_session():
       v1 = variables.Variable([7])
 
@@ -560,7 +605,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       variables.global_variables_initializer().run()
-      result = r.eval()
+      result = self.evaluate(r)
       self.assertAllEqual(np.array([7]), result)
 
   def testCond_7(self):
@@ -571,8 +616,95 @@ class ControlFlowTest(test.TestCase):
       fn1 = lambda: [math_ops.add(x, 1), math_ops.add(x, 2)]
       fn2 = lambda: [y, y]
       r = control_flow_ops.cond(pred, fn1, fn2)
-      self.assertAllEqual([11, 12], sess.run(r))
+      self.assertAllEqual([11, 12], self.evaluate(r))
+
+  def testCondListOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: [math_ops.add(x, y), math_ops.add(x, y)]
+      fn2 = lambda: [y, y]
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertListEqual([210, 210], test_result)
+
+  def testTupleOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: (math_ops.add(x, y), math_ops.add(x, y))
+      fn2 = lambda: (y, y)
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertTupleEqual((210, 210), test_result)
+
+  def testDictOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: {"a": math_ops.add(x, y), "b": math_ops.add(x, y)}
+      fn2 = lambda: {"a": y, "b": y}
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertDictEqual({"a": 210, "b": 210}, test_result)
+
+  @test_util.run_deprecated_v1
+  def testEmbeddedListOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: [[math_ops.add(x, y), math_ops.add(x, y)]]
+      fn2 = lambda: [[y, y]]
+      # Pass strict=True flag as cond_v2 allows for tensors to be
+      # in nested output structures as singletons
+      r = control_flow_ops.cond(pred, fn1, fn2, strict=True)
+      test_result = self.evaluate(r)
+      self.assertListEqual([[210, 210]], test_result)
+
+  def testEmbeddedTupleOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: ((math_ops.add(x, y), math_ops.add(x, y)))
+      fn2 = lambda: ((y, y))
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertTupleEqual(((210, 210)), test_result)
+
+  def testEmbeddedDictOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: {"a": {"c": math_ops.add(x, y)},
+                     "b": {"d": math_ops.add(x, y)}}
+      fn2 = lambda: {"a": {"c": y},
+                     "b": {"d": y}}
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertDictEqual({"a": {"c": 210}, "b": {"d": 210}}, test_result)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCheckNestedOutputStruct(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: {"a": math_ops.add(x, y), "b": math_ops.add(x, y)}
+      fn2 = lambda: {"c": y, "d": y}
+      v1_msg = "The two structures don't have the same nested structure"
+      v2_msg = "Outputs of true_fn and false_fn must have the same structure"
+      with self.assertRaisesRegexp(
+          ValueError, v2_msg if control_flow_ops.ENABLE_COND_V2 else v1_msg):
+        r = control_flow_ops.cond(pred, fn1, fn2)
+        self.evaluate(r)
 
+  @test_util.run_deprecated_v1
   def testCondRef(self):
 
     with self.cached_session():
@@ -585,9 +717,10 @@ class ControlFlowTest(test.TestCase):
       true_fn = lambda: x
       false_fn = lambda: constant_op.constant([2.0])
       r = control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
-      self.assertAllEqual([2.0], r.eval())
+      self.assertAllEqual([2.0], self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
+  @test_util.run_v1_only("b/120545219")
   def testCondWithControl(self):
     with self.cached_session():
       control_holder = array_ops.placeholder(dtypes.float32, shape=())
@@ -601,8 +734,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           constant_op.constant(True), true_branch,
           lambda: constant_op.constant(1))
-      self.assertEqual(5, r.eval())
+      self.assertEqual(5, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testUninitializedRefIdentity(self):
     with self.cached_session() as sess:
       v = gen_state_ops.variable(
@@ -625,10 +759,8 @@ class ControlFlowTest(test.TestCase):
       with ops.control_dependencies([v_t_op]):
         orig_v = array_ops.identity(v)
       merged_op = control_flow_ops.merge([assign_v, orig_v])
-      self.assertAllEqual([1.0], sess.run(merged_op.output))
+      self.assertAllEqual([1.0], self.evaluate(merged_op.output))
 
-  @test_util.disable_control_flow_v2(
-      "b/112477618 (Operation returned from cond)")
   def testCondSwitchIdentity(self):
     # Make sure the recv identity is not removed by optimization.
     with session.Session(config=opt_cfg()) as sess:
@@ -641,10 +773,8 @@ class ControlFlowTest(test.TestCase):
         return control_flow_ops.Assert(False, ["Wrong branch!!!"])
 
       r = control_flow_ops.cond(pred, fn1, fn2)
-      sess.run(r)
+      self.evaluate(r)
 
-  @test_util.disable_control_flow_v2(
-      "b/112477618 (Operation returned from cond)")
   def testCondRecvIdentity(self):
     # Make sure the switch identity is not removed by optimization.
     with session.Session(config=opt_cfg()) as sess:
@@ -659,12 +789,11 @@ class ControlFlowTest(test.TestCase):
           return control_flow_ops.Assert(False, ["Wrong branch!!!"])
 
       r = control_flow_ops.cond(pred, fn1, fn2)
-      sess.run(r)
+      self.evaluate(r)
 
-  @test_util.disable_control_flow_v2("b/113346829 (gpu failure)")
+  @test_util.run_v1_only("b/120545219")
   def testCondGrad_1(self):
-    graph = ops.Graph()
-    with graph.as_default():
+    with self.cached_session():
       x = constant_op.constant(10.0, name="x")
       pred = math_ops.less(1, 2)
       fn1 = lambda: array_ops.identity(x)
@@ -672,9 +801,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       grad = gradients_impl.gradients(r, [x])[0]
-      with self.cached_session():
-        self.assertAllEqual(1.0, grad.eval())
+      self.assertAllEqual(1.0, self.evaluate(grad))
 
+  @test_util.run_deprecated_v1
   def testCondGrad_2(self):
     with self.cached_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
@@ -690,6 +819,7 @@ class ControlFlowTest(test.TestCase):
 
   @test_util.disable_control_flow_v2(
       "b/110550782 (gradient w.r.t external variable)")
+  @test_util.run_deprecated_v1
   def testCondGrad_3(self):
     with self.cached_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
@@ -707,6 +837,36 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(980.0, r.eval(feed_dict={c: 1}))
       self.assertAllEqual(30.0, r.eval(feed_dict={c: 3}))
 
+  @test_util.run_deprecated_v1
+  def testCondGradMultiDevice(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 2},
+                                    allow_soft_placement=True)
+    with self.cached_session(use_gpu=True, config=config) as sess:
+      pred = array_ops.placeholder(dtypes.bool, [])
+      x = array_ops.placeholder(dtypes.float32)
+      y = array_ops.placeholder(dtypes.float32)
+
+      with ops.device("/cpu:0"):
+        z = control_flow_ops.cond(pred, lambda: x * y * 2.0, lambda: 2.0)
+
+      with ops.device("/cpu:1"):
+        grad = gradients_impl.gradients(z, x)[0]
+
+      with ops.device("/cpu:0"):
+        grad_grad = gradients_impl.gradients(grad, x)[0]
+
+      self.assertEqual(sess.run(grad, {pred: True, x: 1.0, y: 2.0}), 4.0)
+      self.assertEqual(sess.run(grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
+
+      # v1 control flow gets None second derivative for some reason.
+      if not control_flow_ops.ENABLE_COND_V2:
+        self.assertIsNone(grad_grad)
+        return
+
+      self.assertEqual(sess.run(grad_grad, {pred: True, x: 1.0, y: 2.0}), 0.0)
+      self.assertEqual(sess.run(grad_grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
+
+  @test_util.run_v1_only("b/120545219")
   def testNestedCond_Simple(self):
     with self.cached_session():
       x = constant_op.constant(0., name="X")
@@ -714,15 +874,16 @@ class ControlFlowTest(test.TestCase):
           constant_op.constant(True), lambda: x,
           lambda: control_flow_ops.cond(x < 1., lambda: x, lambda: x))
       result = gradients_impl.gradients(y, x)[0]
-      self.assertEqual(1.0, result.eval())
+      self.assertEqual(1.0, self.evaluate(result))
 
       z = control_flow_ops.cond(
           constant_op.constant(False), lambda: x,
           lambda: control_flow_ops.cond(x < 1., lambda: x, lambda: x))
       result = gradients_impl.gradients(z, x)[0]
-      self.assertEqual(1.0, result.eval())
+      self.assertEqual(1.0, self.evaluate(result))
 
   @test_util.disable_control_flow_v2("b/113327884")
+  @test_util.run_v1_only("b/120545219")
   def testCondGrad_Gather(self):
     with self.cached_session() as sess:
       v1 = variables.Variable([1.0, 42.0])
@@ -736,16 +897,132 @@ class ControlFlowTest(test.TestCase):
       # Should just be [1, 1], but possibly a sparse representation
       gv, gi = sess.run([grad.values, grad.indices], feed_dict={c: 1})
       dense_gv = [
-          sum([y for (x, y) in zip(gi, gv) if x == i]) for i in range(2)
+          sum(y for (x, y) in zip(gi, gv) if x == i) for i in range(2)
       ]
       self.assertAllEqual(dense_gv, [1.0, 1.0])
       # Should be [0, 2], as the else forwards v1[1] twice
       gv, gi = sess.run([grad.values, grad.indices], feed_dict={c: 3})
       dense_gv = [
-          sum([y for (x, y) in zip(gi, gv) if x == i]) for i in range(2)
+          sum(y for (x, y) in zip(gi, gv) if x == i) for i in range(2)
       ]
       self.assertAllEqual(dense_gv, [0.0, 2.0])
 
+  @test_util.run_v1_only("b/120545219")
+  def testCondPredicateTensor(self):
+    """Regression test for lowering predicate from non-first output of an op."""
+
+    @eager_function.defun
+    def foo():
+      return constant_op.constant("foo"), constant_op.constant(True)
+
+    r = control_flow_ops.cond(foo()[1], lambda: 1.0, lambda: 2.0)
+    self.assertEqual(self.evaluate(r), 1.0)
+
+  # TODO(b/117945658): reenable
+  @test_util.run_in_graph_and_eager_modes
+  def DISABLED_testCondAutoControlDeps(self):
+
+    def branch_fn():
+      logging_ops.print_v2("A")
+      logging_ops.print_v2("B")
+      with ops.control_dependencies([logging_ops.print_v2("C")]):
+        return constant_op.constant(10)
+
+    def build_cond():
+      return control_flow_ops.cond(
+          constant_op.constant(True), branch_fn, lambda: 0)
+
+    def build_nested_cond():
+      return control_flow_ops.cond(
+          constant_op.constant(True), build_cond, lambda: 0)
+
+    # In v1 graph mode, pruning should make only "C" print.
+    if not context.executing_eagerly():
+      with self.cached_session():
+        with self.captureWritesToStream(sys.stderr) as printed:
+          self.assertEqual(build_cond().eval(), 10)
+        self.assertEqual(printed.contents(), "C\n")
+
+        with self.captureWritesToStream(sys.stderr) as printed:
+          self.assertEqual(build_nested_cond().eval(), 10)
+        self.assertEqual(printed.contents(), "C\n")
+
+    # In defuns, all prints should execute in program order.
+    # This doesn't work with legacy control flow.
+    if control_flow_ops.ENABLE_COND_V2:
+
+      @eager_function.defun
+      def cond():
+        return build_cond()
+
+      with self.captureWritesToStream(sys.stderr) as printed:
+        self.assertEqual(self.evaluate(cond()), 10)
+      self.assertEqual(printed.contents(), "A\nB\nC\n")
+
+      @eager_function.defun
+      def nested_cond():
+        return build_nested_cond()
+
+      with self.captureWritesToStream(sys.stderr) as printed:
+        self.assertEqual(self.evaluate(nested_cond()), 10)
+      self.assertEqual(printed.contents(), "A\nB\nC\n")
+
+  # TODO(b/117945658): reenable
+  @test_util.run_in_graph_and_eager_modes
+  def DISABLED_testWhileAutoControlDeps(self):
+
+    def cond(i, unused_x):
+      logging_ops.print_v2("A")
+      return i < 2
+
+    def body(i, x):
+      logging_ops.print_v2("B")
+      with ops.control_dependencies([logging_ops.print_v2("C")]):
+        x = array_ops.identity(x)
+      with ops.control_dependencies([logging_ops.print_v2("D")]):
+        return i + 1, x
+
+    def build_while():
+      return control_flow_ops.while_loop(
+          cond, body, [constant_op.constant(0), constant_op.constant(0)])
+
+    def build_nested_while():
+      return control_flow_ops.cond(
+          constant_op.constant(True), build_while, lambda: (0, 0))
+
+    # In v1 graph mode, pruning should make only "D" print.
+    if not context.executing_eagerly():
+      with self.cached_session():
+        with self.captureWritesToStream(sys.stderr) as printed:
+          self.assertEqual(build_while()[0].eval(), 2)
+        self.assertEqual(printed.contents(), "D\nD\n")
+
+        with self.captureWritesToStream(sys.stderr) as printed:
+          self.assertEqual(build_nested_while()[0].eval(), 2)
+        self.assertEqual(printed.contents(), "D\nD\n")
+
+    # In defuns, all prints should execute in program order.
+    # This doesn't work with legacy control flow.
+    if control_flow_ops.ENABLE_WHILE_V2:
+
+      @eager_function.defun
+      def while_loop():
+        return build_while()[0]
+
+      with self.captureWritesToStream(sys.stderr) as printed:
+        self.assertEqual(self.evaluate(while_loop()), 2)
+      self.assertEqual(printed.contents(), "A\nB\nC\nD\nA\nB\nC\nD\nA\n")
+
+      @eager_function.defun
+      def nested_while_loop():
+        return build_nested_while()[0]
+
+      # TODO(b/117840611): calling nested_while_loop fails in eager
+      if not context.executing_eagerly():
+        with self.captureWritesToStream(sys.stderr) as printed:
+          self.assertEqual(self.evaluate(nested_while_loop()), 2)
+        self.assertEqual(printed.contents(), "A\nB\nC\nD\nA\nB\nC\nD\nA\n")
+
   # Microbenchmark: 256,000 iterations/s.
   @test_util.disable_control_flow_v2("b/116630618 (Times out)")
   def testWhile_1(self):
@@ -754,9 +1031,10 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 10000)
       b = lambda x: math_ops.add(x, 1)
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
-      self.assertEqual(10000, r.eval())
+      self.assertEqual(10000, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileExternalControlDependencies(self):
     with self.cached_session():
       v = variables.Variable(0.0)
@@ -773,6 +1051,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(v.eval(), 1.0)
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileExternalControlDependenciesNoInput(self):
     with self.cached_session():
       v = variables.Variable(0.0)
@@ -785,10 +1064,11 @@ class ControlFlowTest(test.TestCase):
 
       result = control_flow_ops.while_loop(cond=lambda i: i < 5,
                                            body=body_fn, loop_vars=[0])
-      result.eval()
+      self.evaluate(result)
       self.assertAllEqual(v.eval(), 1.0)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithRefs_1(self):
     with self.cached_session() as sess:
       x = variables.VariableV1(0)._ref()  # pylint: disable=protected-access
@@ -808,7 +1088,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(r[0].dtype, dtypes.int32)
       self.assertEqual(r[1].dtype, dtypes.int32_ref)
 
-      value_i, value_x = sess.run(r)
+      value_i, value_x = self.evaluate(r)
 
     self.assertEqual(100, value_i)
     self.assertEqual(0, value_x)
@@ -817,24 +1097,23 @@ class ControlFlowTest(test.TestCase):
     with self.cached_session():
       s = constant_op.constant(0)
       r = isum(s)
-      self.assertAllEqual(45, r.eval())
+      self.assertAllEqual(45, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
   def testWhileWithMaximumIterations(self):
     with self.cached_session():
       s = constant_op.constant([1, 2, 3, 4, 5])
       r = isum(s, maximum_iterations=3)
-      self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], r.eval())
+      self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116339888 (non-tensor loop var)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with self.cached_session():
       r = control_flow_ops.while_loop(
           lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
-      self.assertEqual(1, r.eval())
+      self.assertEqual(1, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2(
-      "b/116248044 (nested), b/115920078 (gradients)")
+  @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
+  @test_util.run_v1_only("b/120545219")
   def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -860,7 +1139,7 @@ class ControlFlowTest(test.TestCase):
     # Should execute without issue.
     self.assertEqual(3, self.evaluate(loop_execute))
 
-  @test_util.disable_control_flow_v2("b/116248044 (nested while_loop)")
+  @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -882,29 +1161,46 @@ class ControlFlowTest(test.TestCase):
     gs = gradients_impl.gradients(loop_no_xla, v)
     self.evaluate(gs)  # This should execute without error.
 
-    xla_context = control_flow_ops.XLAControlFlowContext()
-    xla_context.Enter()
-    loop_no_maxiter = create_while_loop()
-    loop_with_maxiter = create_while_loop(maximum_iterations=2)
-    xla_context.Exit()
-
-    with self.assertRaisesRegexp(
-        ValueError,
-        r"Cannot create a gradient accumulator for tensor '.+' inside "
-        r"XLA while_loop because maximum_iterations was not passed to "
-        r"the tf.while_loop call \('.+'\)."):
-      _ = gradients_impl.gradients(loop_no_maxiter, v)
+    if control_flow_ops.ENABLE_WHILE_V2:
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"maximum_iterations is None. It is required and must be statically "
+          r"known \(e.g. a constant value or known shape dimension\) when "
+          r"building while_loop in XLA context."):
+        loop_no_maxiter = create_while_loop()
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"maximum_iterations must be statically "
+          r"known \(e.g. a constant value or known shape dimension\) when "
+          r"building while_loop in XLA context."):
+        loop_with_maxiter = create_while_loop(maximum_iterations=2)
+      xla_context.Exit()
+    else:
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      loop_no_maxiter = create_while_loop()
+      loop_with_maxiter = create_while_loop(maximum_iterations=2)
+      xla_context.Exit()
 
-    with self.assertRaisesRegexp(
-        ValueError,
-        r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
-        r"while_loop. maximum_iterations tensor '.+' for while_loop context "
-        r"'.+' must be statically known \(e.g. a constant value or known "
-        r"shape dimension\), or be defined at or outside the while loop "
-        r"context '.*' \(currently defined in '.*'\)"):
-      _ = gradients_impl.gradients(loop_with_maxiter, v)
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Cannot create a gradient accumulator for tensor '.+' inside "
+          r"XLA while_loop because maximum_iterations was not passed to "
+          r"the tf.while_loop call \('.+'\)."):
+        _ = gradients_impl.gradients(loop_no_maxiter, v)
 
-  @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
+          r"while_loop. maximum_iterations tensor '.+' for while_loop context "
+          r"'.+' must be statically known \(e.g. a constant value or known "
+          r"shape dimension\), or be defined at or outside the while loop "
+          r"context '.*' \(currently defined in '.*'\)"):
+        _ = gradients_impl.gradients(loop_with_maxiter, v)
+
+  @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -923,22 +1219,32 @@ class ControlFlowTest(test.TestCase):
           lambda i, x: (i + 1, v * x), (0, 1.0),
           maximum_iterations=max_iter_holder[0])
 
-    xla_context = control_flow_ops.XLAControlFlowContext()
-    xla_context.Enter()
-    loop = create_while_loop()
-    xla_context.Exit()
-
-    with self.assertRaisesRegexp(
-        ValueError,
-        r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
-        r"while_loop. maximum_iterations tensor '.*Placeholder:0' for "
-        r"while_loop context '.+' must be statically known \(e.g. a constant "
-        r"value or known shape dimension\), or be defined at or outside the "
-        r"while loop context '' \(currently defined in 'cond/.+'\)"):
-      _ = gradients_impl.gradients(loop, v)
-
-  @test_util.disable_control_flow_v2(
-      "b/116248044 (nesting), b/115776323 (max_iters)")
+    if control_flow_ops.ENABLE_WHILE_V2:
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"maximum_iterations must be statically known \(e.g. a constant value"
+          r" or known shape dimension\) when building while_loop in XLA "
+          r"context."):
+        loop = create_while_loop()
+      xla_context.Exit()
+    else:
+      xla_context = control_flow_ops.XLAControlFlowContext()
+      xla_context.Enter()
+      loop = create_while_loop()
+      xla_context.Exit()
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
+          r"while_loop. maximum_iterations tensor '.*Placeholder:0' for "
+          r"while_loop context '.+' must be statically known \(e.g. a constant "
+          r"value or known shape dimension\), or be defined at or outside the "
+          r"while loop context '' \(currently defined in 'cond/.+'\)"):
+        _ = gradients_impl.gradients(loop, v)
+
+  @test_util.disable_control_flow_v2("b/118457764")
+  @test_util.run_v1_only("b/120545219")
   def testNestedWhileLoopWithMaxItersFromOuterContextInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -980,7 +1286,7 @@ class ControlFlowTest(test.TestCase):
 
     final_without_xla_context = create_while_loop()
 
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
       run_metadata = config_pb2.RunMetadata()
 
@@ -1007,6 +1313,7 @@ class ControlFlowTest(test.TestCase):
 
   # Have more than 10 parallel iterations and hence exercise k-bound
   # most of the time.
+  @test_util.run_deprecated_v1
   def testWhile_3(self):
     with self.cached_session():
 
@@ -1027,6 +1334,7 @@ class ControlFlowTest(test.TestCase):
       result = r[3].eval()
     self.assertAllEqual(10100, result)
 
+  @test_util.run_deprecated_v1
   def testWhile_4(self):
     with self.cached_session():
 
@@ -1048,6 +1356,7 @@ class ControlFlowTest(test.TestCase):
       result = r[3].eval()
     self.assertAllEqual(42, result)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhile_5(self):
     with self.cached_session():
 
@@ -1073,6 +1382,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
   @test_util.disable_control_flow_v2("b/116338794 (buffer_reuse)")
+  @test_util.run_v1_only("b/120545219")
   def testBufferForwarding(self):
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1096,19 +1406,19 @@ class ControlFlowTest(test.TestCase):
       self.assertLess(len(unique_allocs), 756)
 
   def _testWhile_Gpu_1(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       n = constant_op.constant(1.0)
       c = lambda x: math_ops.less(x, 10.0)
       b = lambda x: math_ops.add(x, 1.0)
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllClose(10.0, r.eval())
+      self.assertAllClose(10.0, self.evaluate(r))
 
   def testWhile_Gpu_1(self):
     self._testWhile_Gpu_1(use_gpu=False)
     self._testWhile_Gpu_1(use_gpu=True)
 
   def _testWhile_Gpu_2(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       n = constant_op.constant(1.0)
       c = lambda x: math_ops.less(x, 10.0)
 
@@ -1117,11 +1427,11 @@ class ControlFlowTest(test.TestCase):
           return math_ops.add(x, 1.0)
 
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllClose(10.0, r.eval())
+      self.assertAllClose(10.0, self.evaluate(r))
 
   def testWhile_Gpu_2(self):
-    self._testWhile_Gpu_1(use_gpu=False)
-    self._testWhile_Gpu_1(use_gpu=True)
+    self._testWhile_Gpu_2(use_gpu=False)
+    self._testWhile_Gpu_2(use_gpu=True)
 
   def testWhileShape(self):
     with self.cached_session():
@@ -1138,26 +1448,26 @@ class ControlFlowTest(test.TestCase):
           c, _b, [i, m],
           [i.get_shape(), tensor_shape.unknown_shape()])
       r = r[1] * array_ops.ones([8, 8])
-      self.assertAllEqual(np.ones((8, 8)), r.eval())
+      self.assertAllEqual(np.ones((8, 8)), self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116339888 (non-tensor loop var)")
+  @test_util.run_deprecated_v1
   def testWhileWithNonTensorInput_Scalar(self):
     with self.cached_session():
       n = 0
       c = lambda x: x < 10000
       b = lambda x: x + 1
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
-      self.assertEqual(10000, r.eval())
+      self.assertEqual(10000, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116339888 (non-tensor loop var)")
   def testWhileWithNonTensorInput_Vector(self):
     with self.cached_session():
       n = np.array([0])  # Note, [0] would not work here; that is a list
       c = lambda x: x[0] < 10000
       b = lambda x: array_ops.stack([x[0] + 1])
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
-      self.assertEqual([10000], r.eval())
+      self.assertEqual([10000], self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInference(self):
     with self.cached_session():
       i = constant_op.constant(0)
@@ -1172,8 +1482,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           c, b, [i, m],
           [i.get_shape(), tensor_shape.TensorShape([None, 2])])
-      self.assertIsNone(r[1].get_shape()[0].value)
-      self.assertEqual(r[1].get_shape()[1], tensor_shape.Dimension(2))
+      self.assertIsNone(r[1].shape.dims[0].value)
+      self.assertEqual(r[1].shape.dims[1], tensor_shape.Dimension(2))
 
       with self.assertRaisesRegexp(
           ValueError,
@@ -1184,6 +1494,7 @@ class ControlFlowTest(test.TestCase):
         r = control_flow_ops.while_loop(c, b, [i, m])
 
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceSparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -1216,6 +1527,7 @@ class ControlFlowTest(test.TestCase):
             [i.get_shape(), tensor_shape.TensorShape([5])])
 
   @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([[2.0, 4.0], [3.0, 5.0]], name="values")
@@ -1250,7 +1562,7 @@ class ControlFlowTest(test.TestCase):
             [i.get_shape(), tensor_shape.TensorShape([None, 5])])
 
   def _testNestedWhile_1(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       n = constant_op.constant(0)
 
       def cpu_sum(s):
@@ -1268,16 +1580,15 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 200)
       b = lambda x: math_ops.add(x, cpu_sum(n))
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertEqual(225, r.eval())
+      self.assertEqual(225, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
   def testNestedWhile_1(self):
     self._testNestedWhile_1(use_gpu=False)
     self._testNestedWhile_1(use_gpu=True)
 
   def _testNestedWhile_2(self, use_gpu):
     # Test the cases that A -> Enter and Exit -> A are partitioned.
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       s0 = constant_op.constant(2.0)
 
       def inner_loop(s):
@@ -1301,13 +1612,13 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           outer_c, outer_b, [s0], parallel_iterations=1)
-      self.assertEqual(1048576.0, r.eval())
+      self.assertEqual(1048576.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
   def testNestedWhile_2(self):
     self._testNestedWhile_2(use_gpu=False)
     self._testNestedWhile_2(use_gpu=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_1(self):
     with self.cached_session():
       n = constant_op.constant(0)
@@ -1324,6 +1635,7 @@ class ControlFlowTest(test.TestCase):
           condition, body, [n, r], parallel_iterations=1)
       self.assertAllEqual(12, res[1].eval())
 
+  @test_util.run_deprecated_v1
   def testWhileWithControl_2(self):
     with self.cached_session():
       r = constant_op.constant(0)
@@ -1336,8 +1648,9 @@ class ControlFlowTest(test.TestCase):
 
       res = control_flow_ops.while_loop(
           condition, body, [r], parallel_iterations=1)
-      self.assertAllEqual(12, res.eval())
+      self.assertAllEqual(12, self.evaluate(res))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_3(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1347,6 +1660,7 @@ class ControlFlowTest(test.TestCase):
         r = control_flow_ops.while_loop(lambda x: x < 10, lambda x: x + c, [x0])
       self.assertEqual(10, sess.run(r, {b: True}))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_4(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1358,6 +1672,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(10, sess.run(r, {b: True}))
 
   @test_util.disable_control_flow_v2("b/79881896 (control_deps)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_5(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1371,7 +1686,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(lambda x: x < 10, body, [x0])
       self.assertEqual(10, sess.run(r, {b: True}))
 
-  @test_util.disable_control_flow_v2("b/116134862 (cond output shape)")
   def testWhileCondWithControl(self):
     # Ensure that no control edges by an outer control dependency context are
     # added to nodes inside cond/while contexts.
@@ -1384,9 +1698,10 @@ class ControlFlowTest(test.TestCase):
       with ops.control_dependencies([control_flow_ops.no_op()]):
         loop = control_flow_ops.while_loop(cond, body,
                                            (constant_op.constant(5),))
-      self.assertEqual(0, sess.run(loop))
+      self.assertEqual(0, self.evaluate(loop))
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileCondWithControl_1(self):
     with self.cached_session():
       v = variable_scope.get_variable(
@@ -1406,10 +1721,11 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,))
       variables.global_variables_initializer().run()
-      self.assertEqual(4, r.eval())
-      self.assertAllClose(65536.0, v.eval())
+      self.assertEqual(4, self.evaluate(r))
+      self.assertAllClose(65536.0, self.evaluate(v))
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileCondExitControl(self):
 
     with self.cached_session():
@@ -1431,8 +1747,8 @@ class ControlFlowTest(test.TestCase):
           constant_op.constant(False), lambda: constant_op.constant(1.0),
           false_branch)
       variables.global_variables_initializer().run()
-      self.assertEqual(6.0, r.eval())
-      self.assertEqual(99, v.eval())
+      self.assertEqual(6.0, self.evaluate(r))
+      self.assertEqual(99, self.evaluate(v))
 
   def testCondWhile_1(self):
 
@@ -1443,7 +1759,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           math_ops.less(0, 1), lambda: control_flow_ops.while_loop(c, b, [n]),
           lambda: n)
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def testCondWhile_2(self):
 
@@ -1454,10 +1770,10 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           math_ops.less(1, 0), lambda: math_ops.add(n, 1),
           lambda: control_flow_ops.while_loop(c, b, [n]))
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def _testCondWhile_3(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       p = array_ops.placeholder(dtypes.bool)
       n = constant_op.constant(0.0)
 
@@ -1479,11 +1795,11 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual([2.0], sess.run(r1, {p: False}))
 
   @test_util.disable_control_flow_v2("b/116743589")
+  @test_util.run_deprecated_v1
   def testCondWhile_3(self):
     self._testCondWhile_3(use_gpu=False)
     self._testCondWhile_3(use_gpu=True)
 
-  @test_util.disable_control_flow_v2("b/116134862 (cond output shape)")
   def testWhileCond_1(self):
 
     with self.cached_session():
@@ -1498,9 +1814,8 @@ class ControlFlowTest(test.TestCase):
           lambda: math_ops.add(x, one), lambda: math_ops.subtract(x, one))
       # pylint: enable=undefined-variable
       r = control_flow_ops.while_loop(c, b, [i])
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116134862 (cond output shape)")
   def testWhileCond_2(self):
 
     with self.cached_session():
@@ -1508,9 +1823,8 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 10)
       b = lambda x: control_flow_ops.cond(constant_op.constant(True), lambda: math_ops.add(x, 1), lambda: n)
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116134862 (cond output shape)")
   def testWhileCond_3(self):
 
     with self.cached_session():
@@ -1523,10 +1837,41 @@ class ControlFlowTest(test.TestCase):
                                           lambda: math_ops.subtract(x, 1))
       # pylint: enable=undefined-variable
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
+
+  @test_util.run_deprecated_v1
+  def testWhileCondGradMultiDevice(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 2},
+                                    allow_soft_placement=True)
+    with self.cached_session(use_gpu=True, config=config) as sess:
+      pred = array_ops.placeholder(dtypes.bool, [])
+      x_init = constant_op.constant(1.0)
+
+      with ops.device("/cpu:0"):
+        z = control_flow_ops.while_loop(
+            lambda i, _: i < 3,
+            lambda i, x: (i + 1, control_flow_ops.cond(
+                pred, lambda: x * 2.0, lambda: 10.0)),
+            [0, x_init])
+
+      with ops.device("/cpu:1"):
+        grad = gradients_impl.gradients(z, x_init)[0]
+
+      with ops.device("/cpu:0"):
+        grad_grad = gradients_impl.gradients(grad, x_init)[0]
+
+      self.assertEqual(sess.run(grad, {pred: True}), 8.0)
+      self.assertEqual(sess.run(grad, {pred: False}), 0.0)
+
+      if not control_flow_ops.ENABLE_WHILE_V2:
+        return
+
+      self.assertEqual(sess.run(grad_grad, {pred: True}), 0.0)
+      self.assertEqual(sess.run(grad_grad, {pred: False}), 0.0)
 
   # NOTE: It is ok to have parallel_iterations > 1
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_deprecated_v1
   def testWhileUpdateVariable_1(self):
     with self.cached_session():
       select = variables.Variable([3.0, 4.0, 5.0])
@@ -1545,11 +1890,12 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
       variables.global_variables_initializer().run()
-      self.assertEqual(3, r.eval())
-      result = select.eval()
+      self.assertEqual(3, self.evaluate(r))
+      result = self.evaluate(select)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_2(self):
     with self.cached_session():
       select1 = variables.Variable([3.0, 4.0, 5.0])
@@ -1570,13 +1916,14 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
       variables.global_variables_initializer().run()
-      self.assertEqual(3, r.eval())
-      result1 = select1.eval()
+      self.assertEqual(3, self.evaluate(r))
+      result1 = self.evaluate(select1)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result1)
-      result2 = select2.eval()
+      result2 = self.evaluate(select2)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result2)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_deprecated_v1
   def testWhileUpdateVariable_3(self):
     with self.cached_session():
       select = variables.Variable([3.0, 4.0, 5.0])
@@ -1599,6 +1946,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_4(self):
     with self.cached_session():
       var_a = variables.Variable(0, name="a")
@@ -1622,11 +1970,12 @@ class ControlFlowTest(test.TestCase):
       lpa = control_flow_ops.while_loop(
           pred, loop_body, [c], parallel_iterations=1)
 
-      self.assertEqual(0, var_b.eval())
-      lpa.eval()  # Run the loop
-      self.assertEqual(10, var_b.eval())
+      self.assertEqual(0, self.evaluate(var_b))
+      self.evaluate(lpa)  # Run the loop
+      self.assertEqual(10, self.evaluate(var_b))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_5(self):
     with self.cached_session():
       # Create some variables.
@@ -1651,12 +2000,13 @@ class ControlFlowTest(test.TestCase):
       lpa = control_flow_ops.while_loop(
           pred, loop_body, [var_b], parallel_iterations=1, name="loop")
 
-      self.assertEqual(0, var_b.eval())
-      lpa.eval()  # Run the loop
-      self.assertEqual(10, var_a.eval())
-      self.assertEqual(10, var_b.eval())
+      self.assertEqual(0, self.evaluate(var_b))
+      self.evaluate(lpa)  # Run the loop
+      self.assertEqual(10, self.evaluate(var_a))
+      self.assertEqual(10, self.evaluate(var_b))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_6(self):
     with self.cached_session():
       # Create some variables.
@@ -1681,12 +2031,12 @@ class ControlFlowTest(test.TestCase):
       lpa = control_flow_ops.while_loop(
           pred, loop_body, [c], parallel_iterations=1, name="loop")
 
-      self.assertEqual(0, var_b.eval())
-      lpa.eval()  # Run the loop
-      self.assertEqual(55, var_b.eval())
-      self.assertEqual(10, var_a.eval())
+      self.assertEqual(0, self.evaluate(var_b))
+      self.evaluate(lpa)  # Run the loop
+      self.assertEqual(55, self.evaluate(var_b))
+      self.assertEqual(10, self.evaluate(var_a))
 
-  @test_util.disable_control_flow_v2("b/116742472 (resource accumulator)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileQueue_1(self):
     with self.cached_session():
       q = data_flow_ops.FIFOQueue(-1, dtypes.int32)
@@ -1701,11 +2051,23 @@ class ControlFlowTest(test.TestCase):
         return ni
 
       r = control_flow_ops.while_loop(c, b, [i], parallel_iterations=1)
-      self.assertEqual([10], r.eval())
+      self.assertEqual([10], self.evaluate(r))
       for i in xrange(10):
         self.assertEqual([i], q.dequeue().eval())
 
+  @test_util.run_v1_only("b/120545219")
+  def testWhileTimeOut(self):
+    run_options = config_pb2.RunOptions(timeout_in_ms=1)
+    with self.cached_session() as sess:
+      n = constant_op.constant(0)
+      c = lambda x: True
+      b = lambda x: math_ops.add(x, 1)
+      r = control_flow_ops.while_loop(c, b, [n])
+      with self.assertRaises(errors_impl.DeadlineExceededError):
+        sess.run(r, options=run_options)
+
   @test_util.disable_control_flow_v2("b/117119329 (stack)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileStack_1(self):
     with self.cached_session():
       s = gen_data_flow_ops.stack_v2(-1, dtypes.int32, stack_name="foo")
@@ -1737,7 +2099,7 @@ class ControlFlowTest(test.TestCase):
           b1, [r, x],
           [r.get_shape(), tensor_shape.unknown_shape()],
           parallel_iterations=1)
-      self.assertEqual(45, rx.eval())
+      self.assertEqual(45, self.evaluate(rx))
 
   def _testWhileGrad_ColocateGradients(self, colocate):
     gpu_dev_name = test.gpu_device_name() if test.is_gpu_available(
@@ -1772,13 +2134,15 @@ class ControlFlowTest(test.TestCase):
         self.assertFalse(gpu_dev_name in dev)
 
     with self.session(graph=graph) as sess:
-      self.assertAllClose(1024.0, sess.run(r))
+      self.assertAllClose(1024.0, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/116351701 (colocation)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ColocateGradients(self):
     self._testWhileGrad_ColocateGradients(colocate=False)
     self._testWhileGrad_ColocateGradients(colocate=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_Square(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -1788,8 +2152,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(math_ops.less(1, 2), lambda: r, lambda: v)
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(1024.0, r.eval())
+      self.assertAllClose(1024.0, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_Shape(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=[None])
@@ -1807,6 +2172,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual([None], r.get_shape().as_list())
       self.assertAllClose([810.0, 2560.0], r.eval(feed_dict={x: [3.0, 4.0]}))
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_BaseShape(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32, [None])
@@ -1819,6 +2185,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([r, y], x)[0]
       self.assertAllClose([2.0, 4.0], sess.run(r, feed_dict={x: [1.0, 2.0]}))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_MultipleUses(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -1828,8 +2195,9 @@ class ControlFlowTest(test.TestCase):
       r = math_ops.multiply(r, r)
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertEqual(524288.0, r.eval())
+      self.assertEqual(524288.0, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_LoopAdd(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -1839,10 +2207,10 @@ class ControlFlowTest(test.TestCase):
       r = math_ops.add(r, r)
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(2048.0, r.eval())
+      self.assertAllClose(2048.0, self.evaluate(r))
 
   def _testWhileGrad_Mul(self, use_gpu, p_iters):
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       a = constant_op.constant(3.0, name="a")
       v = constant_op.constant(2.0, name="v")
       c = lambda v: math_ops.less(v, 100.0)
@@ -1850,10 +2218,12 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v], parallel_iterations=p_iters)
 
       grad_a, grad_v = gradients_impl.gradients(r, [a, v])
-      grad_a_val, grad_v_val = sess.run([grad_a, grad_v])
+      grad_a_val, grad_v_val = self.evaluate([grad_a, grad_v])
       self.assertAllClose(216.0, grad_a_val)
       self.assertAllClose(81.0, grad_v_val)
 
+  @test_util.disable_control_flow_v2("b/116630618 (parallel_iters: times out)")
+  @test_util.run_deprecated_v1
   def testWhileGrad_Mul(self):
     self._testWhileGrad_Mul(use_gpu=False, p_iters=1)
     self._testWhileGrad_Mul(use_gpu=False, p_iters=10)
@@ -1862,7 +2232,7 @@ class ControlFlowTest(test.TestCase):
 
   def _testNestedWhileCondWhileGrad(self, use_gpu):
 
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       v = constant_op.constant(1.0)
 
       def inner_loop(s):
@@ -1881,14 +2251,17 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(c, b, [v])
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(512.0, r.eval())
+      self.assertAllClose(512.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
+  @test_util.run_deprecated_v1
   def testNestedWhileCondWhileGrad(self):
     self._testNestedWhileCondWhileGrad(use_gpu=False)
+
+  @test_util.run_deprecated_v1
+  def testNestedWhileCondWhileGradGpu(self):
     self._testNestedWhileCondWhileGrad(use_gpu=True)
 
-  @test_util.disable_control_flow_v2("b/116823782")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_Variable(self):
     with self.cached_session():
       a = variables.Variable(3.0)
@@ -1901,6 +2274,20 @@ class ControlFlowTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertAllClose(216.0, r[0].eval())
 
+  @test_util.run_deprecated_v1
+  def testWhileGrad_ResourceVariable(self):
+    with self.cached_session():
+      a = resource_variable_ops.ResourceVariable(3.0)
+      v = constant_op.constant(2.0, name="v")
+      c = lambda v: math_ops.less(v, 100.0)
+      b = lambda v: math_ops.multiply(v, a)
+      r = control_flow_ops.while_loop(c, b, [v], parallel_iterations=1)
+
+      g = gradients_impl.gradients(r, a)
+      variables.global_variables_initializer().run()
+      self.assertAllClose(216.0, g[0].eval())
+
+  @test_util.run_v1_only("b/120545219")
   def testWhileGradInCond(self):
 
     with self.cached_session():
@@ -1912,12 +2299,13 @@ class ControlFlowTest(test.TestCase):
       def fn1():
         r = control_flow_ops.while_loop(c, b, [n],
                                         [tensor_shape.unknown_shape()])
-        return gradients_impl.gradients(r, x)
+        return gradients_impl.gradients(r, x)[0]
 
       r = control_flow_ops.cond(math_ops.less(1, 2), fn1, lambda: x)
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
   @test_util.disable_control_flow_v2("b/116340060")
+  @test_util.run_v1_only("b/120545219")
   def testGradInWhileWrtInitialLoopVal(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=(), name="x")
@@ -1935,7 +2323,7 @@ class ControlFlowTest(test.TestCase):
           "loop invariants or wrt the input parameters to the loop body."):
         control_flow_ops.while_loop(lambda i, x: i < 3, body, [0, y])
 
-  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGradInWhile(self):
     with self.cached_session():
       n = ops.convert_to_tensor(1.0, name="n")
@@ -1952,7 +2340,7 @@ class ControlFlowTest(test.TestCase):
                                       [tensor_shape.unknown_shape()])
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
-  @test_util.disable_control_flow_v2("b/116248044 (nested while)")
+  @test_util.run_v1_only("b/120545219")
   def testCondGradInNestedWhiles(self):
 
     def outer_body(i, x):
@@ -1967,11 +2355,53 @@ class ControlFlowTest(test.TestCase):
     i, x = control_flow_ops.while_loop(lambda i, x: i < 3, outer_body, [0, 0.0])
 
     with self.cached_session() as sess:
-      i_val, x_val = sess.run([i, x])
+      i_val, x_val = self.evaluate([i, x])
       self.assertEqual(i_val, 3)
       self.assertAllClose(x_val, 1.0)
 
-  @test_util.disable_control_flow_v2("b/116255781 (flat_args)")
+  def testNestedResourceAccess(self):
+    var = resource_variable_ops.ResourceVariable(constant_op.constant(3.0))
+
+    @eager_function.defun
+    def test_fn():
+      x = constant_op.constant(0.0)
+      r = control_flow_ops.while_loop(
+          # Outer loop condition
+          lambda i, y: i < 2,
+          # Outer loop body
+          lambda i, y: (i + 1, y + control_flow_ops.cond(
+              constant_op.constant(True),
+              # True branch
+              lambda: control_flow_ops.while_loop(
+                  # Inner loop condition
+                  lambda j, z: j < 3,
+                  # Inner loop body
+                  lambda j, z: (j + 1, z + math_ops.square(var)),
+                  # Inner initial loop value
+                  [0, y])[1],
+              # False branch
+              lambda: (0.0))),
+          # Outer initial loop value
+          [0, x])[1]
+
+      grad = gradients_impl.gradients(r, x)[0]
+      return r, grad
+
+    self.evaluate(variables.global_variables_initializer())
+    r, grad = self.evaluate(test_fn())
+    # 2 * 3 * 3^2
+    self.assertEqual(r, 81.0)
+    # v1 control flow gets the wrong answer!!!
+    # Gradient computation:
+    #   f(x) = x + 3^2
+    #   inner_loop(x) = f(f(f(x))) = x + 3*3^2 = x + 27
+    #   g(x) = x + inner_loop(x) = 2x + 27
+    #   outer_loop(x) = g(g(x)) = 4x + 81
+    #   outer_loop'(x) = 4
+    # Note that v1 control flow gets 4.0 as well if the cond is removed.
+    if control_flow_ops.ENABLE_WHILE_V2 and control_flow_ops.ENABLE_COND_V2:
+      self.assertEqual(grad, 4.0)
+
   def testWhile_NestedInput(self):
     with self.cached_session() as sess:
       named = collections.namedtuple("named", ("a", "b"))
@@ -1997,9 +2427,9 @@ class ControlFlowTest(test.TestCase):
 
       r_flattened = nest.flatten(r)
       self.assertEqual([100.0, 1.0, 102.0, 3.0, 4.0 + 100 * 2.0],
-                       sess.run(r_flattened))
+                       self.evaluate(r_flattened))
 
-  @test_util.disable_control_flow_v2("b/116255781(flat_args)")
+  @test_util.run_v1_only("b/120545219")
   def testWhile_NestedBadArityFails(self):
     with self.cached_session():
       named = collections.namedtuple("named", ("a", "b"))
@@ -2016,6 +2446,7 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, "the same number of elements"):
         control_flow_ops.while_loop(c, b, loop_vars)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ys_xs(self):
     with self.cached_session():
       x = constant_op.constant(3.0, name="x")
@@ -2039,6 +2470,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([rx], y)
       self.assertAllClose(120.0, r[0].eval())
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_Dependency(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2059,6 +2491,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(1024.0, r[0].eval())
 
   @test_util.disable_control_flow_v2("b/116355153 (back_prop flag)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_NoGradient(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -2070,6 +2503,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(1.0, r[0].eval())
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_NoDependency(self):
     with self.cached_session() as sess:
       variable = variables.Variable(array_ops.ones([2, 3]))
@@ -2090,6 +2524,7 @@ class ControlFlowTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertAllClose(np.ones([2, 3]), sess.run(grad[0]))
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_Const(self):
     with self.cached_session() as sess:
       c0 = constant_op.constant(0.0, name="c0")
@@ -2109,6 +2544,7 @@ class ControlFlowTest(test.TestCase):
       grad = gradients_impl.gradients(cost, [c0])
       self.assertAllClose(0.0, sess.run(grad[0]))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_SerialTwoLoops(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2127,6 +2563,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([rx], x)
       self.assertAllClose(1024.0, r[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ParallelTwoLoops(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2146,6 +2583,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([rx], x)
       self.assertAllClose(64.0, r[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_OneOutputWithControlDependencyOnSecond(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2162,13 +2600,14 @@ class ControlFlowTest(test.TestCase):
       with ops.control_dependencies([x_f]):
         y_f_d = array_ops.identity(y_f, name="y_f_d")
 
-      self.assertAllClose(2.0, y_f_d.eval())  # y_f_d = 1.0 + 1.0
+      self.assertAllClose(2.0, self.evaluate(y_f_d))  # y_f_d = 1.0 + 1.0
       g = gradients_impl.gradients([y_f_d], [x])[0]
       self.assertTrue(g is not None)
-      self.assertAllClose(1.0, g.eval())  # y_f_d = x + 1.0, dy_f_d/dx = 1.0
+      self.assertAllClose(1.0,
+                          self.evaluate(g))  # y_f_d = x + 1.0, dy_f_d/dx = 1.0
 
   def _testNestedWhileGrad_Simple(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       v = constant_op.constant(1.0)
 
       def inner_loop(s):
@@ -2181,14 +2620,14 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v])
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(8.0, r.eval())
+      self.assertAllClose(8.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116248044 (nested)")
+  @test_util.run_deprecated_v1
   def testNestedWhileGrad_Simple(self):
     self._testNestedWhileGrad_Simple(use_gpu=False)
     self._testNestedWhileGrad_Simple(use_gpu=True)
 
-  @test_util.disable_control_flow_v2("b/116248044 (nested)")
+  @test_util.run_v1_only("b/120545219")
   def testNestedWhileGrad_SerialInner(self):
     with self.cached_session():
       v = constant_op.constant(1.0)
@@ -2210,9 +2649,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v])
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(256.0, r.eval())
+      self.assertAllClose(256.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116248044 (nested)")
+  @test_util.run_deprecated_v1
   def testNestedWhileGrad_ParallelInner(self):
     with self.cached_session():
       v = constant_op.constant(1.0)
@@ -2234,10 +2673,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v])
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(512.0, r.eval())
+      self.assertAllClose(512.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2(
-      "Nested loops and TensorArrays not supported")
+  @test_util.run_v1_only("b/120545219")
   def testNestedWhileGrad_ParallelIterations(self):
     # Make sure the stack pushes and pops of an inner loop are executed in
     # the sequential order of the iterations of its outer loop.
@@ -2256,12 +2694,12 @@ class ControlFlowTest(test.TestCase):
       res = outer_loop(inp)
       optimizer = adam.AdamOptimizer(learning_rate=0.001)
       train_op = optimizer.minimize(math_ops.reduce_mean(math_ops.square(res)))
-      sess.run(variables.global_variables_initializer())
-      sess.run(train_op)
-      self.assertAllClose(2.999, var.eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(train_op)
+      self.assertAllClose(2.999, self.evaluate(var))
 
   def _testWhileCondGrad_Simple(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       v = ops.convert_to_tensor(2.0, name="v")
       n = ops.convert_to_tensor(100.0, name="n")
       one = ops.convert_to_tensor(1.0, name="one")
@@ -2274,14 +2712,16 @@ class ControlFlowTest(test.TestCase):
       # pylint: enable=undefined-variable
       r = control_flow_ops.while_loop(c, b, [v])
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(1024.0, r.eval())
+      self.assertAllClose(1024.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/116272044 (cond_in_while)")
+  @test_util.disable_control_flow_v2("b/117519152")
+  @test_util.run_deprecated_v1
   def testWhileCondGrad_Simple(self):
     self._testWhileCondGrad_Simple(use_gpu=False)
     self._testWhileCondGrad_Simple(use_gpu=True)
 
-  @test_util.disable_control_flow_v2("b/116272044 (cond_in_while)")
+  @test_util.disable_control_flow_v2("b/117276490")
+  @test_util.run_deprecated_v1
   def testWhileCondGrad_UnknownShape(self):
     with self.cached_session() as sess:
       v = array_ops.placeholder(dtypes.float32)
@@ -2299,6 +2739,7 @@ class ControlFlowTest(test.TestCase):
       r = sess.run(r, feed_dict={v: 2.0})
       self.assertAllClose(1024.0, r)
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_Concat(self):
     with self.cached_session() as sess:
       x = variable_scope.get_variable("x", initializer=[[1., 2.]])
@@ -2316,13 +2757,15 @@ class ControlFlowTest(test.TestCase):
           [i0.get_shape(), tensor_shape.TensorShape([None, 2])])
       s = math_ops.reduce_sum(h)
 
-      sess.run(variables.global_variables_initializer())
       optimizer = gradient_descent.GradientDescentOptimizer(0.01)
       op = optimizer.minimize(s)
-      sess.run(op)
-      self.assertAllClose([[0.98000002, 1.98000002]], sess.run(x))
+
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(op)
+      self.assertAllClose([[0.98000002, 1.98000002]], self.evaluate(x))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithRefsWithGradients_1(self):
     with self.cached_session() as sess:
       x = variables.VariableV1(0.)._ref()  # pylint: disable=protected-access
@@ -2352,6 +2795,7 @@ class ControlFlowTest(test.TestCase):
     self.assertEqual(73, value_x_grad)
 
   @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_IndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -2371,9 +2815,10 @@ class ControlFlowTest(test.TestCase):
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
       r = gradients_impl.gradients(r.values, values)[0]
-      self.assertAllClose(np.array([1024.0, 1024.0]), r.eval())
+      self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_SparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -2394,9 +2839,10 @@ class ControlFlowTest(test.TestCase):
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
       r = gradients_impl.gradients(r.values, values)[0]
-      self.assertAllClose(np.array([1024.0, 1024.0]), r.eval())
+      self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/115920078 (gradients)")
+  @test_util.run_v1_only("b/120545219")
   def testCallGradInLoop(self):
     with self.cached_session() as sess:
       i0 = constant_op.constant(0)
@@ -2414,10 +2860,9 @@ class ControlFlowTest(test.TestCase):
 
       output_grad = control_flow_ops.while_loop(
           c, b, [i0, constant_op.constant(0.0)])
-      self.assertAllClose(600.0, sess.run(output_grad)[1])
+      self.assertAllClose(600.0, self.evaluate(output_grad)[1])
 
-  @test_util.disable_control_flow_v2(
-      "b/116255781 (flat_args), b/115660901 (TensorArray)")
+  @test_util.run_deprecated_v1
   def testWhileAndTensorArray(self):
     with self.cached_session() as sess:
       param = constant_op.constant(2.0)
@@ -2435,8 +2880,9 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(c, b, [n0, y0], parallel_iterations=1)
       r = gradients_impl.gradients(r, param)[0]
-      self.assertAllClose(107520.0, sess.run(r))
+      self.assertAllClose(107520.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_StopGrad(self):
     with self.cached_session():
       x = constant_op.constant(3.0, name="x")
@@ -2452,9 +2898,9 @@ class ControlFlowTest(test.TestCase):
       rx, ry = control_flow_ops.while_loop(c, b, [x, y])
 
       r = gradients_impl.gradients(rx, y)[0]
-      self.assertEqual(136.0, r.eval())
+      self.assertEqual(136.0, self.evaluate(r))
       r = gradients_impl.gradients(ry, y)[0]
-      self.assertEqual(32.0, r.eval())
+      self.assertEqual(32.0, self.evaluate(r))
 
       r = gradients_impl.gradients(array_ops.stop_gradient(rx), y)[0]
       self.assertEqual(r, None)
@@ -2472,14 +2918,16 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(r, None)
 
       r = gradients_impl.gradients(math_ops.add(rx, ry), y)[0]
-      self.assertEqual(168.0, r.eval())
+      self.assertEqual(168.0, self.evaluate(r))
       r = gradients_impl.gradients(
           math_ops.add(rx, array_ops.stop_gradient(ry)), y)[0]
-      self.assertEqual(136.0, r.eval())
+      self.assertEqual(136.0, self.evaluate(r))
       r = gradients_impl.gradients(
           math_ops.add(array_ops.stop_gradient(rx), ry), y)[0]
-      self.assertEqual(32.0, r.eval())
+      self.assertEqual(32.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
+  @test_util.disable_control_flow_v2("b/118712257")
   def testWhileGrad_StopGradInside(self):
     with self.cached_session():
       x = constant_op.constant(3.0, name="x")
@@ -2495,10 +2943,12 @@ class ControlFlowTest(test.TestCase):
       rx, _ = control_flow_ops.while_loop(c, b, [x, y])
 
       r = gradients_impl.gradients(rx, y)[0]
-      self.assertAllClose(0.0, r.eval())
+      self.assertAllClose(0.0, self.evaluate(r))
       r = gradients_impl.gradients(rx, x)[0]
-      self.assertAllClose(156.0, r.eval())
+      self.assertAllClose(156.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
+  @test_util.disable_control_flow_v2("b/118712257")
   def testWhileGrad_StopGradInsideNoShape(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
@@ -2520,9 +2970,10 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose([156.0, 400.0], sess.run(r, feed_dict=feed_dict))
       name = "gradients/while/stopped_grad"
       all_ops = x.graph.get_operations()
-      self.assertFalse(any([name in op.name for op in all_ops]))
+      self.assertFalse(any(name in op.name for op in all_ops))
 
-  @test_util.disable_control_flow_v2("b/116255781 (flat args)")
+  @test_util.disable_control_flow_v2("b/117954949")
+  @test_util.run_deprecated_v1
   def testWhileGradGradFail(self):
     theta = variables.Variable(initial_value=1.)
 
@@ -2531,11 +2982,13 @@ class ControlFlowTest(test.TestCase):
 
     result = functional_ops.scan(fn, np.array([1., 2., 3.], dtype=np.float32))
     grad_theta = gradients_impl.gradients(result, theta)
-    with self.assertRaisesRegexp(TypeError, "Second-order gradient"):
-      gradients_impl.gradients(grad_theta, theta)
+    if not control_flow_ops.ENABLE_WHILE_V2:
+      with self.assertRaisesRegexp(TypeError, "Second-order gradient"):
+        gradients_impl.gradients(grad_theta, theta)
     grad_theta_stopped = array_ops.stop_gradient(grad_theta)
     gradients_impl.gradients(grad_theta_stopped, theta)
 
+  @test_util.run_deprecated_v1
   def testStopGradOnWhileGrad(self):
     with self.cached_session():
       x = constant_op.constant(2.0, name="x")
@@ -2550,9 +3003,10 @@ class ControlFlowTest(test.TestCase):
       r = math_ops.add(math_ops.square(y), rx)
       r = math_ops.add(r, rg)
       r = gradients_impl.gradients(r, y)[0]
-      self.assertEqual(388.0, r.eval())
+      self.assertEqual(388.0, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_deprecated_v1
   def testWhileGradientWithNontrainablePath1(self):
     q = variables.Variable([7., 8.])
 
@@ -2567,10 +3021,11 @@ class ControlFlowTest(test.TestCase):
     dy_dq, = gradients_impl.gradients(y, q)
     self.assertIsNotNone(dy_dq)
     with self.cached_session() as sess:
-      sess.run(q.initializer)
-      self.assertAllClose([0., 0.], sess.run(dy_dq))
+      self.evaluate(q.initializer)
+      self.assertAllClose([0., 0.], self.evaluate(dy_dq))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGradientWithNontrainablePath2(self):
     q = variables.Variable([7., 8.])
 
@@ -2585,10 +3040,11 @@ class ControlFlowTest(test.TestCase):
     dy_dq, = gradients_impl.gradients(y, q)
     self.assertIsNotNone(dy_dq)
     with self.cached_session() as sess:
-      sess.run(q.initializer)
-      self.assertAllClose([1., 1.], sess.run(dy_dq))
+      self.evaluate(q.initializer)
+      self.assertAllClose([1., 1.], self.evaluate(dy_dq))
 
   @test_util.disable_control_flow_v2("b/115920078 (gradients)")
+  @test_util.run_v1_only("b/120545219")
   def testIssue16504(self):
     c = constant_op.constant(np.arange(100), dtype=dtypes.float32)
     w = variables.Variable(
@@ -2612,7 +3068,7 @@ class ControlFlowTest(test.TestCase):
     grad, = gradients_impl.gradients(w, c)
     self.assertIsNotNone(grad)
 
-  @test_util.disable_control_flow_v2("b/116270461 (resource)")
+  @test_util.run_v1_only("b/120545219")
   def testStopGradMultiFlows(self):
     with self.cached_session():
 
@@ -2637,8 +3093,9 @@ class ControlFlowTest(test.TestCase):
       z = math_ops.add(r, array_ops.stop_gradient(math_ops.reduce_sum(grads)))
       result = gradients_impl.gradients(z, vars_)[0]
       variables.global_variables_initializer().run()
-      self.assertEqual(5.0, result.eval())
+      self.assertEqual(5.0, self.evaluate(result))
 
+  @test_util.run_v1_only("b/120545219")
   def testOneValueCond(self):
 
     with self.cached_session():
@@ -2655,6 +3112,7 @@ class ControlFlowTest(test.TestCase):
       # False case: c = 0 is not >= 1
       self.assertEqual([2], i.eval(feed_dict={c: 0}))
 
+  @test_util.run_deprecated_v1
   def testExampleCond(self):
 
     with self.cached_session():
@@ -2671,8 +3129,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(4.0, i.eval(feed_dict={d: 1}))
       self.assertAllClose(2.0 * math.sqrt(2), i.eval(feed_dict={d: 2}))
 
-  @test_util.disable_control_flow_v2(
-      "b/112477618 (Operation returned from cond)")
+  @test_util.run_v1_only("b/120545219")
   def testCase(self):
     with self.cached_session():
       x = constant_op.constant(1)
@@ -2700,7 +3157,7 @@ class ControlFlowTest(test.TestCase):
       r4 = control_flow_ops.case(
           [(x < y, f1), (x < y, f2)], default=f3, exclusive=True)
       with self.assertRaisesOpError("Input error:"):
-        r4.eval()
+        self.evaluate(r4)
 
       # Check that the default is called if none of the others are
       r5 = control_flow_ops.case({x > y: f1}, default=f3)
@@ -2725,8 +3182,7 @@ class ControlFlowTest(test.TestCase):
 
       self.assertAllEqual(r6.eval(), 0)
 
-  @test_util.disable_control_flow_v2(
-      "b/112477618 (Operation returned from cond)")
+  @test_util.run_v1_only("b/120545219")
   def testCaseSideEffects(self):
     with self.cached_session() as sess:
       v0 = variables.Variable(-1)
@@ -2748,21 +3204,22 @@ class ControlFlowTest(test.TestCase):
           ((x > y, a), (x > y, b)), default=c, exclusive=True)
 
       variables.global_variables_initializer().run()
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1] * 3)
-      self.assertEqual(2, r2.eval())
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1, -1, 2])
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
+      self.assertEqual(2, self.evaluate(r2))
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1, -1, 2])
 
       variables.global_variables_initializer().run()
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1] * 3)
-      self.assertEqual(1, r1.eval())
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1, 1, -1])
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
+      self.assertEqual(1, self.evaluate(r1))
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1, 1, -1])
 
       variables.global_variables_initializer().run()
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1] * 3)
-      self.assertEqual(0, r0.eval())
-      self.assertAllEqual(sess.run([v0, v1, v2]), [0, -1, -1])
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
+      self.assertEqual(0, self.evaluate(r0))
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [0, -1, -1])
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
+  @test_util.run_v1_only("b/120545219")
   def testOneOpCond(self):
     with self.cached_session():
       v = variables.Variable(0)
@@ -2781,16 +3238,17 @@ class ControlFlowTest(test.TestCase):
       self.assertTrue(isinstance(i, ops.Tensor))
       variables.global_variables_initializer().run()
 
-      self.assertEqual(0, v.eval())
+      self.assertEqual(0, self.evaluate(v))
 
       # True case: c = 2 is >= 1, v is set to 1.
       self.assertEqual(1, i.eval(feed_dict={c.name: 2}))
-      self.assertEqual(1, v.eval())
+      self.assertEqual(1, self.evaluate(v))
 
       # False case: c = 0 is not >= 1, v is set to 2.
       self.assertEqual(2, i.eval(feed_dict={c.name: 0}))
-      self.assertEqual(2, v.eval())
+      self.assertEqual(2, self.evaluate(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testWithOpsDependencies(self):
     with self.cached_session() as sess:
       v = variables.VariableV1(0.0)
@@ -2798,7 +3256,7 @@ class ControlFlowTest(test.TestCase):
 
       # Fetching v directly will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        sess.run([c, v])
+        self.evaluate([c, v])
 
       # Use a control dependency to ensure init_variable is run
       # while asking for c
@@ -2806,7 +3264,7 @@ class ControlFlowTest(test.TestCase):
           name="real_tensor",
           output_tensor=v._ref(),  # pylint: disable=protected-access
           dependencies=[v.initializer])
-      c_val, real_v_val = sess.run([c, real_v])
+      c_val, real_v_val = self.evaluate([c, real_v])
 
     # Ensure the result of 'real_c' is the same as 'c'
     self.assertAllEqual(10, c_val)
@@ -2814,6 +3272,7 @@ class ControlFlowTest(test.TestCase):
     # Ensure that 'v' is initialized
     self.assertAllClose(0.0, real_v_val)
 
+  @test_util.run_v1_only("b/120545219")
   def testWithTensorDependencies(self):
     with self.cached_session():
       v = variables.VariableV1(0.0)
@@ -2831,15 +3290,16 @@ class ControlFlowTest(test.TestCase):
 
       # Fetching v directly will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        v.eval()
+        self.evaluate(v)
 
       # Get the value of 'c2_with_c1_dep', which should cause 'v'
       # to be initialized.
-      self.assertAllEqual(20, c2_with_c1_dep.eval())
+      self.assertAllEqual(20, self.evaluate(c2_with_c1_dep))
 
       # Ensure that 'v' is initialized
-      self.assertAllClose(0.0, v.eval())
+      self.assertAllClose(0.0, self.evaluate(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testWithIndexedSlicesDependencies(self):
     with self.cached_session():
       v = variables.VariableV1(
@@ -2853,13 +3313,15 @@ class ControlFlowTest(test.TestCase):
 
       # Fetching gather_v_at_1 will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        gather_v_at_1.eval()
+        self.evaluate(gather_v_at_1)
 
       # Getting gather_v_at_1_after_init will work, and initialize v.
-      self.assertAllEqual([[10.0, 11.0]], gather_v_at_1_after_init.eval())
+      self.assertAllEqual([[10.0, 11.0]],
+                          self.evaluate(gather_v_at_1_after_init))
 
       # Double check that 'v' is initialized
-      self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]], v.eval())
+      self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]],
+                          self.evaluate(v))
 
   def testDependenciesDevice(self):
     with ops.Graph().as_default():
@@ -2884,6 +3346,7 @@ class ControlFlowTest(test.TestCase):
         self.assertDeviceEqual("", with_vdef_dep.device)
         self.assertEqual([b"loc:@vdef"], with_vdef_dep.op.colocation_groups())
 
+  @test_util.run_v1_only("b/120545219")
   def testGroup(self):
     with self.cached_session() as sess:
       v1 = variables.VariableV1([0.0])
@@ -2893,21 +3356,23 @@ class ControlFlowTest(test.TestCase):
       init = control_flow_ops.group(v1.initializer, v2.initializer)
       # Fetching v1 directly will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        v1.eval()
+        self.evaluate(v1)
 
       # Runs "init" before fetching v1 and v2.
       init.run()
-      v1_val, v2_val = sess.run([v1, v2])
+      v1_val, v2_val = self.evaluate([v1, v2])
 
     # Ensure that v1 and v2 are initialized
     self.assertAllClose([0.0], v1_val)
     self.assertAllClose([1.0], v2_val)
 
+  @test_util.run_v1_only("b/120545219")
   def testGroupEmpty(self):
     op = control_flow_ops.group()
     self.assertEqual(op.type, "NoOp")
     self.assertEqual(op.control_inputs, [])
 
+  @test_util.run_deprecated_v1
   def testMergeShapes(self):
     # All inputs unknown.
     p1 = array_ops.placeholder(dtypes.float32)
@@ -2962,6 +3427,7 @@ class ControlFlowTest(test.TestCase):
     self.assertEqual([None, None], m.get_shape().as_list())
     self.assertEqual([], index.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testRefSelect(self):
     index = array_ops.placeholder(dtypes.int32)
 
@@ -2995,6 +3461,7 @@ class ControlFlowTest(test.TestCase):
     s = control_flow_ops.ref_select(index, [v1, v2])
     self.assertEqual(None, s.get_shape())
 
+  @test_util.run_deprecated_v1
   def testRunLoopTensor(self):
     with self.cached_session() as sess:
       tensor_list = []
@@ -3008,13 +3475,14 @@ class ControlFlowTest(test.TestCase):
 
       result = control_flow_ops.while_loop(condition, body,
                                            [constant_op.constant(4)])
-      self.assertEqual(10, sess.run(result))
+      self.assertEqual(10, self.evaluate(result))
 
       # Ensure that we cannot run a tensor that escapes the loop body
       # accidentally.
       with self.assertRaises(ValueError):
         sess.run(tensor_list[0])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhilePyFuncBasic(self):
 
     def func(x):
@@ -3028,6 +3496,7 @@ class ControlFlowTest(test.TestCase):
           [tensor_shape.unknown_shape(), tensor_shape.unknown_shape()])
       self.assertEqual(r[1].eval(), 65536.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileFuncBasic(self):
 
     @function.Defun(dtypes.float32)
@@ -3041,10 +3510,9 @@ class ControlFlowTest(test.TestCase):
           [constant_op.constant(0), x],
           [tensor_shape.unknown_shape(),
            tensor_shape.unknown_shape()])
+      grad = gradients_impl.gradients(r, x)[0]
       self.assertEqual(r[1].eval(), 65536.0)
-
-      r = gradients_impl.gradients(r, x)[0]
-      self.assertEqual(r.eval(), 524288.0)
+      self.assertEqual(grad.eval(), 524288.0)
       # while_v2 does not have stacks.
       if not control_flow_ops.ENABLE_WHILE_V2:
         self.assertEqual(
@@ -3052,6 +3520,51 @@ class ControlFlowTest(test.TestCase):
                 ]), 1)
 
 
+  @test_util.run_v1_only("b/120545219")
+  def testQIntSwitchMerge(self):
+    with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
+      constant_qint = constant_op.constant(np.array([42]), dtypes.qint8)
+      cond = constant_op.constant(True, dtypes.bool)
+      v_f, v_t = control_flow_ops.switch(constant_qint, cond)
+      result = control_flow_ops.merge([v_f, v_t])
+      self.evaluate(result)
+
+  @test_util.run_v1_only("b/120545219")
+  def testQIntRefSwitchMerge(self):
+    with self.cached_session(use_gpu=test.is_gpu_available()) as sess:
+      var_qint = gen_state_ops.variable(
+          shape=[1], dtype=dtypes.qint8, name="v", container="", shared_name="")
+      assign_op = state_ops.assign(
+          var_qint, constant_op.constant(np.array([42]), dtypes.qint8))
+      self.evaluate(assign_op)
+
+      cond = constant_op.constant(True, dtypes.bool)
+      v_f, v_t = control_flow_ops.ref_switch(var_qint, cond)
+      result = control_flow_ops.ref_merge([v_f, v_t])
+      self.evaluate(result)
+
+  @test_util.run_v1_only("b/120545219")
+  def testUInt64SwitchMerge(self):
+    with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
+      constant_uint64 = constant_op.constant(np.array([42]), dtypes.uint64)
+      cond = constant_op.constant(True, dtypes.bool)
+      v_f, v_t = control_flow_ops.switch(constant_uint64, cond)
+      result = control_flow_ops.merge([v_f, v_t])
+      self.evaluate(result)
+
+  @test_util.run_deprecated_v1
+  def testQIntArgAndRet(self):
+
+    @function.Defun(dtypes.qint8)
+    def func(x):
+      return x
+
+    with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
+      qint = constant_op.constant(np.array([42]), dtypes.qint8)
+      result = func(qint)
+      self.evaluate(result)
+
+
 class ControlFlowContextCheckTest(test.TestCase):
 
   def _getWhileTensor(self):
@@ -3078,6 +3591,7 @@ class ControlFlowContextCheckTest(test.TestCase):
         math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
     return cond_tensor[0]
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidContext(self):
     # Accessing a while loop tensor outside of control flow is illegal.
     while_tensor = self._getWhileTensor()
@@ -3087,6 +3601,7 @@ class ControlFlowContextCheckTest(test.TestCase):
         "is in a while loop. See info log for more details."):
       math_ops.add(1, while_tensor)
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidContextInCond(self):
     # Accessing a while loop tensor in cond is illegal.
     while_tensor = self._getWhileTensor()
@@ -3099,6 +3614,7 @@ class ControlFlowContextCheckTest(test.TestCase):
           math_ops.less(1, 2), lambda: math_ops.add(1, while_tensor),
           lambda: constant_op.constant(0))
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidContextInWhile(self):
     # Accessing a while loop tensor in a different while loop is illegal.
     while_tensor = self._getWhileTensor()
@@ -3133,6 +3649,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     control_flow_ops.cond(math_ops.less(1, 2), branch_fn, branch_fn)
 
+  @test_util.run_v1_only("b/120545219")
   def testValidWhileContext(self):
     # Accessing a tensor in a nested while is OK.
     def body(_):
@@ -3141,6 +3658,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     control_flow_ops.while_loop(lambda i: i < 5, body, [0])
 
+  @test_util.run_v1_only("b/120545219")
   def testValidNestedContexts(self):
     # Accessing a tensor from a cond context in a while context, all inside an
     # outer while context, is OK.
@@ -3155,6 +3673,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     control_flow_ops.while_loop(lambda i: i < 5, body, [0])
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidNestedContexts(self):
     # Accessing a tensor from a while context in a different while context, all
     # inside a cond context, is illegal.
@@ -3173,6 +3692,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
 class TupleTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testTensors(self):
     for v1_first in [True, False]:
       with self.cached_session():
@@ -3188,21 +3708,22 @@ class TupleTest(test.TestCase):
 
         # v1 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v1.eval()
+          self.evaluate(v1)
 
         # v2 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v2.eval()
+          self.evaluate(v2)
 
         if v1_first:
           # Getting t1 initializes v2.
-          self.assertAllClose([3.0], t1.eval())
-          self.assertAllClose([10.0], v2.eval())
+          self.assertAllClose([3.0], self.evaluate(t1))
+          self.assertAllClose([10.0], self.evaluate(v2))
         else:
           # Getting t2 initializes v1.
-          self.assertAllClose([30.0], t2.eval())
-          self.assertAllClose([1.0], v1.eval())
+          self.assertAllClose([30.0], self.evaluate(t2))
+          self.assertAllClose([1.0], self.evaluate(v1))
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlices(self):
     for v1_first in [True, False]:
       with self.cached_session():
@@ -3226,22 +3747,22 @@ class TupleTest(test.TestCase):
 
         # v1 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v1.eval()
+          self.evaluate(v1)
 
         # v2 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v2.eval()
+          self.evaluate(v2)
 
         if v1_first:
           # Getting g1 initializes v2.
-          self.assertAllClose([[10.0, 11.0]], g1.eval())
+          self.assertAllClose([[10.0, 11.0]], self.evaluate(g1))
           self.assertAllClose([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]],
-                              v2.eval())
+                              self.evaluate(v2))
         else:
           # Getting g2 initializes v1.
-          self.assertAllClose([[10.1, 11.1]], g2.eval())
+          self.assertAllClose([[10.1, 11.1]], self.evaluate(g2))
           self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]],
-                              v1.eval())
+                              self.evaluate(v1))
 
   def testAcceptTensorsAsControlInputs(self):
     with self.cached_session():
@@ -3251,15 +3772,16 @@ class TupleTest(test.TestCase):
           [constant_op.constant(0)], control_inputs=[assign])
 
       # Should trigger the assign.
-      t.eval()
+      self.evaluate(t)
 
-      self.assertEquals(1, var.eval())
+      self.assertEquals(1, self.evaluate(var))
 
 
 class AssertTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGuardedAssertDoesNotCopyWhenTrue(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with ops.device(test.gpu_device_name()):
         value = constant_op.constant(1.0)
       with ops.device("/cpu:0"):
@@ -3354,7 +3876,7 @@ class WhileOpBenchmark(test.Benchmark):
     with session.Session() as sess, ops.device(default_device):
       # Get the initial id i, input x, and kernel.
       i, x, kernel = self._getInitVariables()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       if static_unroll:
         for _ in xrange(steps):
@@ -3373,11 +3895,11 @@ class WhileOpBenchmark(test.Benchmark):
 
       for _ in xrange(3):
         # exclude warm up time
-        sess.run(r)
+        self.evaluate(r)
 
       start_time = time.time()
       for _ in xrange(num_iters):
-        sess.run(r)
+        self.evaluate(r)
       return (time.time() - start_time) / num_iters
 
   def benchmarkWhileOpCrossDevicePlacement(self):
@@ -3424,9 +3946,6 @@ class EagerTest(test.TestCase):
 
   # TODO(b/117279927): Re-enable once msan failure is fixed.
   def DISABLED_testCondInDefun(self):
-    if "GPU" in [d.device_type for d in device_lib.list_local_devices()]:
-      return unittest.skip("b/113346829 (gpu failure)")
-
     with context.eager_mode():
 
       @eager_function.defun
@@ -3456,6 +3975,7 @@ class EagerTest(test.TestCase):
           isum(tensor, maximum_iterations=3).numpy(),
           [1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with context.eager_mode():
       tensor = constant_op.constant(0)
@@ -3478,6 +3998,7 @@ class EagerTest(test.TestCase):
       self.assertAllEqual(t1.numpy(), tup1.numpy())
       self.assertAllEqual(t2.numpy(), tup2.numpy())
 
+  @test_util.run_v1_only("b/120545219")
   def testCase(self):
     with context.eager_mode():
       x = constant_op.constant(1)
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
index 762c445da05008a78fec1ec9e1cc7186e1539134..573f4b0d250ba5ff75118ed5738c3de2a8711a2f 100644
--- a/tensorflow/python/kernel_tests/control_flow_util_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
@@ -32,6 +33,7 @@ from tensorflow.python.platform import test
 
 class ControlFlowUtilTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testIsSwitch(self):
     switch_false, _ = control_flow_ops.switch(1, True)
     switch = switch_false.op
@@ -44,6 +46,7 @@ class ControlFlowUtilTest(test.TestCase):
 
     self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op))
 
+  @test_util.run_v1_only("b/120545219")
   def testIsLoopEnter(self):
     enter = gen_control_flow_ops.enter(1, frame_name="name").op
     self.assertTrue(control_flow_util.IsLoopEnter(enter))
@@ -61,6 +64,7 @@ class ControlFlowUtilTest(test.TestCase):
 
     self.assertFalse(control_flow_util.IsLoopEnter(test_ops.int_output().op))
 
+  @test_util.run_v1_only("b/120545219")
   def testIsLoopExit(self):
     exit_op = control_flow_ops.exit(1).op
     self.assertTrue(control_flow_util.IsLoopExit(exit_op))
diff --git a/tensorflow/python/kernel_tests/control_flow_util_v2_test.py b/tensorflow/python/kernel_tests/control_flow_util_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0374a77005db4597ddbce76c1d2a3b9ac0e792d
--- /dev/null
+++ b/tensorflow/python/kernel_tests/control_flow_util_v2_test.py
@@ -0,0 +1,66 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for tensorflow.python.ops.control_flow_util_v2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util_v2
+from tensorflow.python.platform import test
+
+
+class ControlFlowUtilV2Test(test.TestCase):
+
+  def setUp(self):
+    self._enable_cond_v2_old = control_flow_ops.ENABLE_COND_V2
+    self._enable_while_v2_old = control_flow_ops.ENABLE_WHILE_V2
+    control_flow_ops.ENABLE_COND_V2 = True
+    control_flow_ops.ENABLE_WHILE_V2 = True
+
+  def tearDown(self):
+    control_flow_ops.ENABLE_COND_V2 = self._enable_cond_v2_old
+    control_flow_ops.ENABLE_WHILE_V2 = self._enable_while_v2_old
+
+  def _create_control_flow(self, expect_in_defun):
+    """Helper method for testInDefun."""
+    def body(i):
+      def branch():
+        self.assertEqual(control_flow_util_v2.in_defun(), expect_in_defun)
+        return i + 1
+      return control_flow_ops.cond(constant_op.constant(True),
+                                   branch, lambda: 0)
+    return control_flow_ops.while_loop(lambda i: i < 4, body,
+                                       [constant_op.constant(0)])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInDefun(self):
+    self._create_control_flow(False)
+
+    @function.defun
+    def defun():
+      self._create_control_flow(True)
+
+    defun()
+    self.assertFalse(control_flow_util_v2.in_defun())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index 2d6d8a80510da9a746c5bc9dd66b1375f5c7ba80..e8463323df90bd37d927f88bd41b09bef45de541 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -40,10 +40,10 @@ class Conv1DTest(test.TestCase):
       filters = array_ops.expand_dims(filters, 2)  # out_channels
       # Filters is 2x1x1
       for stride in [1, 2]:
-        with self.test_session(use_gpu=test.is_gpu_available()):
+        with self.cached_session(use_gpu=test.is_gpu_available()):
           c = nn_ops.conv1d(x, filters, stride, padding="VALID")
           reduced = array_ops.squeeze(c)
-          output = reduced.eval()
+          output = self.evaluate(reduced)
           if stride == 1:
             self.assertEqual(len(output), 3)
             self.assertAllClose(output,
@@ -69,7 +69,7 @@ class Conv1DTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv1d_transpose(
           x, f, y_shape, stride=stride, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
diff --git a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
index 644a151710e7b537ca3a91e79e0aab888b0b812a..7b3b560b24005e4fdbac78245ac425865d98dd0b 100644
--- a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -31,6 +32,7 @@ from tensorflow.python.platform import test
 
 class Conv2DBackpropFilterGradTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       for padding in ["SAME", "VALID"]:
@@ -66,7 +68,7 @@ class Conv2DBackpropFilterGradTest(test.TestCase):
 
   def testGradientDilatedConv(self):
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         for padding in ["SAME", "VALID"]:
           for stride in [1, 2]:
             np.random.seed(1)
diff --git a/tensorflow/python/kernel_tests/conv2d_transpose_test.py b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
index cbdd2c5991a8c3e11f08ada3588af6e5872044b2..c603c08630661083a65c4c1f6f399925efa537a6 100644
--- a/tensorflow/python/kernel_tests/conv2d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
@@ -53,7 +53,7 @@ class Conv2DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv2d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       # We count the number of cells being added at the locations in the output.
       # At the center, #cells=kernel_height * kernel_width
@@ -91,7 +91,7 @@ class Conv2DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv2d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       for n in xrange(x_shape[0]):
         for k in xrange(f_shape[2]):
@@ -124,7 +124,7 @@ class Conv2DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv2d_transpose(
           x, f, y_shape, strides=strides, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
@@ -155,6 +155,7 @@ class Conv2DTransposeTest(test.TestCase):
 
     self.assertAllClose(cache_values, value)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [2, 6, 4, 3]
     f_shape = [3, 3, 2, 3]
@@ -177,7 +178,7 @@ class Conv2DTransposeTest(test.TestCase):
   def testConv2DTransposeSingleStrideNCHW(self):
     # `NCHW` data format is only supported for CUDA device.
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         strides = [1, 1, 1, 1]
 
         # Input, output: [batch, depth, height, width, depth]
@@ -195,7 +196,7 @@ class Conv2DTransposeTest(test.TestCase):
         output = nn_ops.conv2d_transpose(
             x, f, y_shape, strides=strides, padding="SAME", data_format="NCHW")
 
-        value = output.eval()
+        value = self.evaluate(output)
         for n in xrange(x_shape[0]):
           for k in xrange(f_shape[2]):
             for w in xrange(y_shape[3]):
@@ -212,7 +213,7 @@ class Conv2DTransposeTest(test.TestCase):
   def testConv2DTransposeSameNCHW(self):
     # `NCHW` data format is only supported for CUDA device.
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         strides = [1, 1, 2, 2]
 
         # Input, output: [batch, depth, height, width]
@@ -230,7 +231,7 @@ class Conv2DTransposeTest(test.TestCase):
         output = nn_ops.conv2d_transpose(
             x, f, y_shape, strides=strides, padding="SAME", data_format="NCHW")
 
-        value = output.eval()
+        value = self.evaluate(output)
         for n in xrange(x_shape[0]):
           for k in xrange(f_shape[2]):
             for w in xrange(y_shape[3]):
@@ -248,7 +249,7 @@ class Conv2DTransposeTest(test.TestCase):
   def testConv2DTransposeValidNCHW(self):
     # `NCHW` data format is only supported for CUDA device.
     if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         strides = [1, 1, 2, 2]
 
         # Input, output: [batch, depth, height, width]
@@ -265,7 +266,7 @@ class Conv2DTransposeTest(test.TestCase):
         output = nn_ops.conv2d_transpose(
             x, f, y_shape, strides=strides, padding="VALID", data_format="NCHW")
 
-        value = output.eval()
+        value = self.evaluate(output)
         cache_values = np.zeros(y_shape, dtype=np.float32)
         # The amount of padding added
         pad = 1
@@ -293,7 +294,6 @@ class Conv2DTransposeTest(test.TestCase):
 
         self.assertAllClose(cache_values, value)
 
-  @test_util.enable_c_shapes
   def testConv2DTransposeShapeInference(self):
     # Test case for 8972
     initializer = random_ops.truncated_normal(
diff --git a/tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py b/tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py
index 89b64068ace5803ed7d92cfb6425940b494159cc..7e913febed3dc8f4f698a0ede6ed8670e0b69a50 100644
--- a/tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -31,6 +32,7 @@ from tensorflow.python.platform import test
 
 class Conv3DBackpropFilterV2GradTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/conv3d_transpose_test.py b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
index 2527b837692b5e31126499db85224d2a8d3b5321..22ba5b90375c61ae7e1c426d88f0c19a546b2bbc 100644
--- a/tensorflow/python/kernel_tests/conv3d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -48,7 +49,7 @@ class Conv3DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       # We count the number of cells being added at the locations in the output.
       # At the center, #cells = kernel_depth * kernel_height * kernel_width
@@ -98,7 +99,7 @@ class Conv3DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       for n in xrange(x_shape[0]):
         for k in xrange(f_shape[3]):
@@ -119,6 +120,7 @@ class Conv3DTransposeTest(test.TestCase):
                   target = 3.0
                 self.assertAllClose(target, value[n, d, h, w, k])
 
+  @test_util.run_deprecated_v1
   def testConv3DTransposeShapeMismatch(self):
     # Test case for GitHub issue 18460
     x_shape = [2, 2, 3, 4, 3]
@@ -146,7 +148,7 @@ class Conv3DTransposeTest(test.TestCase):
         output = nn_ops.conv3d_transpose(
             x_value, f_value, constant_op.constant(y_shape, dtype=dtype),
             strides=strides, padding="SAME")
-        output.eval()
+        self.evaluate(output)
 
   def testConv3DTransposeValid(self):
     with self.cached_session():
@@ -165,7 +167,7 @@ class Conv3DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
@@ -201,6 +203,7 @@ class Conv3DTransposeTest(test.TestCase):
 
     self.assertAllClose(cache_values, value)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [2, 3, 4, 3, 2]
     f_shape = [3, 3, 3, 2, 2]
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index 6794464e3afcd1b62a298975aa2d7b1a13be03a3..4a689b3fdfa5f43c8b6a4c67b7ebb31104d83db7 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -52,11 +52,11 @@ class Conv3DTest(test.TestCase):
   def _DtypesToTest(self, use_gpu):
     if use_gpu:
       if not test_util.CudaSupportsHalfMatMulAndConv():
-        return [dtypes.float32]
+        return [dtypes.float64, dtypes.float32]
       else:
         # It is important that float32 comes before float16 here,
         # as we will be using its gradients as reference for fp16 gradients.
-        return [dtypes.float32, dtypes.float16]
+        return [dtypes.float64, dtypes.float32, dtypes.float16]
     else:
       return [dtypes.float64, dtypes.float32, dtypes.float16]
 
@@ -74,7 +74,7 @@ class Conv3DTest(test.TestCase):
     # during the conv3d.
     x1 = [f * 1.0 / total_size_tensor for f in range(1, total_size_tensor + 1)]
     x2 = [f * 1.0 / total_size_filter for f in range(1, total_size_filter + 1)]
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
 
@@ -109,7 +109,7 @@ class Conv3DTest(test.TestCase):
         results.append(result)
 
       with self.cached_session() as sess:
-        values = sess.run(results)
+        values = self.evaluate(results)
         for value in values:
           print("expected = ", expected)
           print("actual = ", value)
@@ -133,7 +133,7 @@ class Conv3DTest(test.TestCase):
     # numbers from 1.
     x1 = [f * 1.0 for f in range(1, total_size_tensor + 1)]
     x2 = [f * 1.0 for f in range(1, total_size_filter + 1)]
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes)
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
       if isinstance(stride, collections.Iterable):
@@ -184,8 +184,8 @@ class Conv3DTest(test.TestCase):
         computed_results.append(computed)
         tolerance = 1e-2 if use_gpu else 1e-5
         with self.cached_session() as sess:
-          expected_values = sess.run(expected_results)
-          computed_values = sess.run(computed_results)
+          expected_values = self.evaluate(expected_results)
+          computed_values = self.evaluate(computed_results)
           for e_value, c_value in zip(expected_values, computed_values):
             print("expected = ", e_value)
             print("actual = ", c_value)
@@ -413,7 +413,7 @@ class Conv3DTest(test.TestCase):
       elif data_type == dtypes.float16:
         tolerance = 1e-3
 
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         orig_input_tensor = constant_op.constant(
             input_data, shape=input_shape, dtype=data_type, name="input")
         filter_tensor = constant_op.constant(
@@ -462,6 +462,7 @@ class Conv3DTest(test.TestCase):
       self._ConstructAndTestGradientForConfig(data_format=data_format,
                                               use_gpu=use_gpu, **kwargs)
 
+  @test_util.run_deprecated_v1
   def testInputGradientValidPaddingStrideOne(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -473,6 +474,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientValidPaddingStrideOne(self):
     self.ConstructAndTestGradient(
         batch=4,
@@ -484,6 +486,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientValidPaddingStrideTwo(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -495,6 +498,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientValidPaddingStrideTwo(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -506,6 +510,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientValidPaddingStrideThree(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -517,6 +522,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientValidPaddingStrideThree(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -528,6 +534,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientSamePaddingStrideOne(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -539,6 +546,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientSamePaddingStrideOne(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -550,6 +558,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientSamePaddingStrideTwo(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -561,6 +570,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientSamePaddingStrideTwo(self):
     self.ConstructAndTestGradient(
         batch=4,
@@ -572,6 +582,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientSamePaddingStrideThree(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -583,6 +594,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientSamePaddingStrideThree(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -594,6 +606,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientSamePaddingDifferentStrides(self):
     self.ConstructAndTestGradient(
         batch=1,
@@ -605,6 +618,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientKernelSizeMatchesInputSize(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -616,6 +630,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientKernelSizeMatchesInputSize(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -638,6 +653,32 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  # Test the fast path in gemm_pack_rhs/mkldnn_gemm_pack, when channel
+  # dimension is a multiple of packet size.
+  @test_util.run_deprecated_v1
+  def testInputGradientValidPaddingStrideOneFastPath(self):
+    self.ConstructAndTestGradient(
+        batch=2,
+        input_shape=(3, 5, 4),
+        filter_shape=(2, 2, 2),
+        in_depth=8,
+        out_depth=2,
+        stride=1,
+        padding="VALID",
+        test_input=True)
+
+  @test_util.run_deprecated_v1
+  def testFilterGradientValidPaddingStrideOneFastPath(self):
+    self.ConstructAndTestGradient(
+        batch=2,
+        input_shape=(4, 6, 5),
+        filter_shape=(2, 2, 2),
+        in_depth=8,
+        out_depth=2,
+        stride=1,
+        padding="VALID",
+        test_input=False)
+
   # Testing for backprops
   def _RunAndVerifyBackprop(self, input_sizes, filter_sizes, output_sizes,
                             strides, dilations, padding, data_format, use_gpu,
@@ -659,7 +700,7 @@ class Conv3DTest(test.TestCase):
     # because we currently do not have a CPU implementation for arbitrary
     # dilation rates.
     if default_dilations or use_gpu:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         if data_format == "NCDHW":
           input_sizes = test_util.NHWCToNCHW(input_sizes)
         t1 = constant_op.constant(x1, shape=input_sizes)
@@ -691,8 +732,8 @@ class Conv3DTest(test.TestCase):
         expected_grad = gradients_impl.gradients(expected, t1
                                                  if mode == "input" else t2)[0]
         # "values" consists of two tensors for two backprops
-        actual_value = sess.run(actual_grad)
-        expected_value = sess.run(expected_grad)
+        actual_value = self.evaluate(actual_grad)
+        expected_value = self.evaluate(expected_grad)
         self.assertShapeEqual(actual_value, actual_grad)
         self.assertShapeEqual(expected_value, expected_grad)
       print("expected = ", expected_value)
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index ea611497d984730a42e0ccaa6ea255d58ba2ff54..2f6f3bb383b381de1dac78cc72882fe5fe4291c9 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -878,7 +878,7 @@ class Conv2DTest(test.TestCase):
     x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
     default_dilations = (dilations[0] == 1 and dilations[1] == 1)
     if default_dilations or use_gpu:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         if data_format == "NCHW":
           input_sizes = test_util.NHWCToNCHW(input_sizes)
         t1 = constant_op.constant(x1, shape=input_sizes)
@@ -908,8 +908,8 @@ class Conv2DTest(test.TestCase):
         conv = gradients_impl.gradients(conv_forward, t1)[0]
         conv_2 = gradients_impl.gradients(conv_forward_2, t1)[0]
         # "values" consists of two tensors for two backprops
-        value = sess.run(conv)
-        value_2 = sess.run(conv_2)
+        value = self.evaluate(conv)
+        value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
       tf_logging.info("expected = ", value_2)
@@ -932,7 +932,7 @@ class Conv2DTest(test.TestCase):
     x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
     default_dilations = (dilations[0] == 1 and dilations[1] == 1)
     if default_dilations or use_gpu:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         if data_format == "NCHW":
           input_sizes = test_util.NHWCToNCHW(input_sizes)
         t1 = constant_op.constant(x1, shape=input_sizes)
@@ -961,8 +961,8 @@ class Conv2DTest(test.TestCase):
           conv_forward_2 = test_util.NCHWToNHWC(conv_forward_2)
         conv = gradients_impl.gradients(conv_forward, t2)[0]
         conv_2 = gradients_impl.gradients(conv_forward, t2)[0]
-        value = sess.run(conv)
-        value_2 = sess.run(conv_2)
+        value = self.evaluate(conv)
+        value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
       tf_logging.info("expected = ", value_2)
@@ -1139,7 +1139,7 @@ class Conv2DTest(test.TestCase):
     # So we disable the DOUBLE path.  We should re-enable this
     # when double support returns for CPU and/or GPU.
     for dtype in self._DtypesToTest(use_gpu=use_gpu):
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         input_tensor = constant_op.constant(
             input_data, shape=input_shape, dtype=dtype, name="input")
         filter_tensor = constant_op.constant(
@@ -1545,7 +1545,7 @@ class DepthwiseConv2DTest(test.TestCase):
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
       conv = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     tf_logging.info("value = ", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
@@ -1644,7 +1644,7 @@ class SeparableConv2DTest(test.TestCase):
       expected: An array containing the expected operation outputs.
       data_format: string data format for input tensor.
     """
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       t1 = self._InitValues(tensor_in_sizes)
       f1 = self._InitValues(depthwise_filter_in_sizes)
       f1.set_shape(depthwise_filter_in_sizes)
@@ -1667,9 +1667,9 @@ class SeparableConv2DTest(test.TestCase):
       if data_format == "NCHW":
         conv = array_ops.transpose(conv, [0, 2, 3, 1])
 
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     tf_logging.info("value = ", value)
-    self.assertArrayNear(expected, np.ravel(value), 1e-5)
+    self.assertArrayNear(expected, np.ravel(value), 1e-3)
     self.assertShapeEqual(value, conv)
 
   def _testSeparableConv2D(self, data_format):
@@ -1766,7 +1766,7 @@ class DeepConv2DTest(test.TestCase):
     x1 = np.random.rand(*tensor_in_sizes).astype(np.float32)
     x2 = np.random.rand(*filter_in_sizes).astype(np.float32)
 
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       t1 = constant_op.constant(x1, shape=tensor_in_sizes)
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
       strides = [1] + conv_strides + [1]
@@ -1774,10 +1774,10 @@ class DeepConv2DTest(test.TestCase):
       conv = nn_ops.conv2d(t1, t2, strides=strides, padding=padding)
 
       os.environ["TF_USE_DEEP_CONV2D"] = "0"
-      values_expect = sess.run([conv])
+      values_expect = self.evaluate([conv])
 
       os.environ["TF_USE_DEEP_CONV2D"] = "1"
-      values_test = sess.run([conv])
+      values_test = self.evaluate([conv])
 
       self.assertAllClose(values_expect, values_test, rtol=1e-5, atol=1e-5)
 
diff --git a/tensorflow/python/kernel_tests/cross_grad_test.py b/tensorflow/python/kernel_tests/cross_grad_test.py
index 0bd4006d6ac1e922ed2935ad70d7aa60e87dedf3..b397133fd7328efa137910f4ea503849e23c6abe 100644
--- a/tensorflow/python/kernel_tests/cross_grad_test.py
+++ b/tensorflow/python/kernel_tests/cross_grad_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
@@ -26,6 +27,7 @@ from tensorflow.python.platform import test
 
 class CrossOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGradientRandomValues(self):
     with self.cached_session():
       us = [2, 3]
diff --git a/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py b/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py
index 41ae0b456f66c4934f90de63044468e2dfb033e9..0d86d13c7159bf577c1cca882964fe62b0586e2a 100644
--- a/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py
+++ b/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py
@@ -25,6 +25,7 @@ from six.moves import zip_longest
 
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ctc_ops
 from tensorflow.python.platform import test
@@ -57,7 +58,7 @@ class CTCGreedyDecoderTest(test.TestCase):
     # from a len time python list of [batch_size x depth] tensors
     inputs_t = array_ops.stack(inputs_t)
 
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       decoded_list, log_probability = decoder(
           inputs_t, sequence_length=seq_lens, **decoder_args)
       decoded_unwrapped = list(
@@ -94,6 +95,7 @@ class CTCGreedyDecoderTest(test.TestCase):
         with self.assertRaisesOpError(expected_err_re):
           sess.run(decoded_unwrapped + [log_probability])
 
+  @test_util.run_deprecated_v1
   def testCTCGreedyDecoder(self):
     """Test two batch entries - best path decoder."""
     max_time_steps = 6
@@ -170,6 +172,7 @@ class CTCGreedyDecoderTest(test.TestCase):
     self._testCTCDecoder(ctc_ops.ctc_greedy_decoder, inputs, seq_lens,
                          log_prob_truth, decode_truth)
 
+  @test_util.run_deprecated_v1
   def testCTCDecoderBeamSearch(self):
     """Test one batch, two beams - hibernating beam search."""
     # max_time_steps == 8
diff --git a/tensorflow/python/kernel_tests/ctc_loss_op_test.py b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
index 18e92162b93f6392874bbe497cb0110f72cd8dae..e24f304c1b80787f43885055cad1de8cf43bb4db 100644
--- a/tensorflow/python/kernel_tests/ctc_loss_op_test.py
+++ b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
@@ -23,9 +23,16 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ctc_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
@@ -52,6 +59,24 @@ def SimpleSparseTensorFrom(x):
   return sparse_tensor.SparseTensor(x_ix, x_val, x_shape)
 
 
+def _ctc_loss_v2(labels, inputs, sequence_length,
+                 preprocess_collapse_repeated=False,
+                 ctc_merge_repeated=True,
+                 ignore_longer_outputs_than_inputs=False,
+                 time_major=True):
+  """Call ctc_loss_v2 with v1 args."""
+  assert not preprocess_collapse_repeated
+  assert ctc_merge_repeated
+  assert not ignore_longer_outputs_than_inputs
+  return ctc_ops.ctc_loss_v2(
+      labels=labels,
+      logits=inputs,
+      logit_length=sequence_length,
+      label_length=None,
+      blank_index=-1,
+      logits_time_major=time_major)
+
+
 class CTCLossTest(test.TestCase):
 
   def _testCTCLoss(self,
@@ -65,8 +90,8 @@ class CTCLossTest(test.TestCase):
 
     inputs_t = constant_op.constant(inputs)
 
-    with self.test_session(use_gpu=False) as sess:
-      loss = ctc_ops.ctc_loss(
+    with self.cached_session(use_gpu=False) as sess:
+      loss = _ctc_loss_v2(
           inputs=inputs_t, labels=labels, sequence_length=seq_lens)
       grad = gradients_impl.gradients(loss, [inputs_t])[0]
 
@@ -74,13 +99,14 @@ class CTCLossTest(test.TestCase):
       self.assertShapeEqual(grad_truth, grad)
 
       if expected_err_re is None:
-        (tf_loss, tf_grad) = sess.run([loss, grad])
+        (tf_loss, tf_grad) = self.evaluate([loss, grad])
         self.assertAllClose(tf_loss, loss_truth, atol=1e-6)
         self.assertAllClose(tf_grad, grad_truth, atol=1e-6)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          sess.run([loss, grad])
+          self.evaluate([loss, grad])
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     """Test two batch entries."""
     # Input and ground truth from Alex Graves' implementation.
@@ -216,6 +242,7 @@ class CTCLossTest(test.TestCase):
 
     self._testCTCLoss(inputs, seq_lens, labels, loss_truth, grad_truth)
 
+  @test_util.run_v1_only("b/120545219")
   def test_time_major(self):
     """Testing time_major param.
 
@@ -233,18 +260,19 @@ class CTCLossTest(test.TestCase):
     # Transposing tensor to [batch_size x max_time x depth tensor]
     inputs_t_transposed = constant_op.constant(inputs.transpose(1, 0, 2))
 
-    with self.test_session(use_gpu=False) as sess:
-      loss = ctc_ops.ctc_loss(
+    with self.session(use_gpu=False) as sess:
+      loss = _ctc_loss_v2(
           inputs=inputs_t, labels=labels, sequence_length=seq_lens)
-      loss_transposed = ctc_ops.ctc_loss(
+      loss_transposed = _ctc_loss_v2(
           inputs=inputs_t_transposed,
           labels=labels,
           sequence_length=seq_lens,
           time_major=False)
 
-      (tf_loss, tf_loss_transposed) = sess.run([loss, loss_transposed])
+      (tf_loss, tf_loss_transposed) = self.evaluate([loss, loss_transposed])
       self.assertAllEqual(tf_loss, tf_loss_transposed)
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidSecondGradient(self):
     inputs = np.random.randn(2, 2, 3).astype(np.float32)
     inputs_t = constant_op.constant(inputs)
@@ -252,8 +280,8 @@ class CTCLossTest(test.TestCase):
     seq_lens = np.array([2, 2], dtype=np.int32)
     v = [1.0]
 
-    with self.test_session(use_gpu=False):
-      loss = ctc_ops.ctc_loss(
+    with self.session(use_gpu=False):
+      loss = _ctc_loss_v2(
           inputs=inputs_t, labels=labels, sequence_length=seq_lens)
       # Taking ths second gradient should fail, since it is not
       # yet supported.
@@ -261,6 +289,7 @@ class CTCLossTest(test.TestCase):
                                    "explicitly disabled"):
         _ = gradients_impl._hessian_vector_product(loss, [inputs_t], v)
 
+  @test_util.run_v1_only("b/120545219")
   def testEmptyBatch(self):
     inputs = constant_op.constant([], dtype=dtypes.float32, shape=(1, 0, 2))
     sequence_lengths = constant_op.constant([], dtype=dtypes.int32)
@@ -269,10 +298,549 @@ class CTCLossTest(test.TestCase):
         values=constant_op.constant([], shape=(0,), dtype=dtypes.int32),
         dense_shape=[5, 5])
 
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "batch_size must not be 0"):
-        sess.run(ctc_ops.ctc_loss(labels, inputs, sequence_lengths))
+        sess.run(_ctc_loss_v2(labels, inputs, sequence_lengths))
+
+
+class CTCLossTestV2(test.TestCase):
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossV2(self):
+    random_seed.set_random_seed(5)
+
+    batch_size = 8
+    num_labels = 6
+    max_label_length = 5
+    num_frames = 12
+
+    labels = random_ops.random_uniform(
+        [batch_size, max_label_length], minval=1, maxval=num_labels,
+        dtype=dtypes.int64)
+    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+
+    label_length = random_ops.random_uniform(
+        [batch_size], minval=2, maxval=max_label_length, dtype=dtypes.int64)
+    label_mask = array_ops.sequence_mask(
+        label_length, maxlen=max_label_length, dtype=label_length.dtype)
+    labels *= label_mask
+    logit_length = [num_frames] * batch_size
+
+    ref_loss = ctc_ops.ctc_loss_v2(
+        labels=labels,
+        logits=logits,
+        label_length=label_length,
+        logit_length=logit_length)
+    ref_grad = gradients_impl.gradients(ref_loss, [logits])
+
+    sparse_labels = ctc_ops.dense_labels_to_sparse(labels, label_length)
+
+    def assert_same_loss_and_grads(loss):
+      with self.cached_session() as sess:
+        self.assertAllClose(*self.evaluate([loss, ref_loss]))
+        grad = gradients_impl.gradients(loss, [logits])
+        self.assertAllClose(
+            *self.evaluate([grad, ref_grad]), rtol=2e-06, atol=2e-06)
+
+    assert_same_loss_and_grads(
+        ctc_ops.ctc_loss_v2(
+            labels=sparse_labels,
+            logits=logits,
+            label_length=label_length,
+            logit_length=logit_length,
+            blank_index=0))
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossDenseIsSameAsCtcLoss(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      random_seed.set_random_seed(5)
+
+      batch_size = 8
+      num_labels = 6
+      label_length = 5
+      num_frames = 12
+      logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+      labels = random_ops.random_uniform(
+          [batch_size, label_length], minval=1, maxval=num_labels,
+          dtype=dtypes.int64)
+
+      label_lengths = random_ops.random_uniform(
+          [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+      label_mask = array_ops.sequence_mask(
+          label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+      labels *= label_mask
+
+      logit_lengths = [num_frames] * batch_size
+
+      ctc_loss = ctc_ops.ctc_loss_dense(
+          labels=labels,
+          logits=logits,
+          label_length=label_lengths,
+          logit_length=logit_lengths)
+      ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+      # Shift labels down by one (move blank from 0 to num_labels -1)
+      tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
+      tf_nn_ctc_logits = array_ops.concat([
+          logits[:, :, 1:],
+          logits[:, :, 0:1],
+      ], axis=2)
+
+      tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+          tf_ctc_loss_labels, label_lengths)
+
+      tf_nn_ctc_loss = ctc_ops.ctc_loss(
+          labels=tf_ctc_loss_labels,
+          inputs=tf_nn_ctc_logits,
+          sequence_length=logit_lengths,
+          time_major=True)
+      tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+      with self.cached_session() as sess:
+        for _ in range(32):
+          self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
+          self.assertAllClose(
+              *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
+              rtol=2e-06,
+              atol=2e-06)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossDenseUniqueFastPathIsSameAsCtcLoss(self):
+    random_seed.set_random_seed(5)
+
+    batch_size = 8
+    num_labels = 6
+    label_length = 5
+    num_frames = 12
+    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+    labels = random_ops.random_uniform(
+        [batch_size, label_length], minval=1, maxval=num_labels,
+        dtype=dtypes.int64)
+
+    label_lengths = random_ops.random_uniform(
+        [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+    label_mask = array_ops.sequence_mask(
+        label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+    labels *= label_mask
+
+    logit_lengths = [num_frames] * batch_size
+
+    ctc_loss = ctc_ops.ctc_loss_dense(
+        labels=labels,
+        logits=logits,
+        label_length=label_lengths,
+        logit_length=logit_lengths,
+        unique=ctc_ops.ctc_unique_labels(labels))
+    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+    # Shift labels down by one (move blank from 0 to num_labels -1)
+    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
+    tf_nn_ctc_logits = array_ops.concat([
+        logits[:, :, 1:],
+        logits[:, :, 0:1],
+    ], axis=2)
+
+    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+        tf_ctc_loss_labels, label_lengths)
+
+    tf_nn_ctc_loss = ctc_ops.ctc_loss(
+        labels=tf_ctc_loss_labels,
+        inputs=tf_nn_ctc_logits,
+        sequence_length=logit_lengths,
+        time_major=True)
+    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+    with self.cached_session() as sess:
+      for _ in range(32):
+        self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
+        self.assertAllClose(
+            *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
+            rtol=2e-06,
+            atol=2e-06)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossDenseWithBlankIndexIsSameAsCtcLoss(self):
+    random_seed.set_random_seed(5)
+
+    batch_size = 8
+    num_labels = 6
+    label_length = 5
+    num_frames = 12
+    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+    labels = random_ops.random_uniform(
+        [batch_size, label_length], minval=0, maxval=num_labels-1,
+        dtype=dtypes.int64)
+
+    label_lengths = random_ops.random_uniform(
+        [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+    label_mask = array_ops.sequence_mask(
+        label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+    labels *= label_mask
+
+    logit_lengths = [num_frames] * batch_size
+
+    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
+    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+        tf_ctc_loss_labels, label_lengths)
+
+    tf_nn_ctc_loss = ctc_ops.ctc_loss(
+        labels=tf_ctc_loss_labels,
+        inputs=logits,
+        sequence_length=logit_lengths,
+        time_major=True)
+    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+    # Shift the blank logits/labels to be somewhere in the middle.
+    blank_index = 2
+    shifted_logits = array_ops.concat([
+        logits[:, :, :blank_index],
+        logits[:, :, -1:],
+        logits[:, :, blank_index:-1],
+    ], axis=2)
+    shifted_labels = array_ops.where(labels < blank_index, labels, labels + 1)
+
+    ctc_loss = ctc_ops.ctc_loss_dense(
+        labels=shifted_labels,
+        logits=shifted_logits,
+        label_length=label_lengths,
+        logit_length=logit_lengths,
+        blank_index=blank_index)
+    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+    with self.cached_session() as sess:
+      for _ in range(32):
+        self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
+        self.assertAllClose(
+            *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
+            rtol=2e-06,
+            atol=2e-06)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossDenseWithNegativeBlankIndexIsSameAsCtcLoss(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      random_seed.set_random_seed(5)
+
+      batch_size = 8
+      num_labels = 6
+      label_length = 5
+      num_frames = 12
+      logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+      labels = random_ops.random_uniform(
+          [batch_size, label_length], minval=0, maxval=num_labels-1,
+          dtype=dtypes.int64)
+
+      label_lengths = random_ops.random_uniform(
+          [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+      label_mask = array_ops.sequence_mask(
+          label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+      labels *= label_mask
+
+      logit_lengths = [num_frames] * batch_size
+
+      ctc_loss = ctc_ops.ctc_loss_dense(
+          labels=labels,
+          logits=logits,
+          label_length=label_lengths,
+          logit_length=logit_lengths,
+          blank_index=-1)
+      ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+      tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
+      tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+          tf_ctc_loss_labels, label_lengths)
+
+      tf_nn_ctc_loss = ctc_ops.ctc_loss(
+          labels=tf_ctc_loss_labels,
+          inputs=logits,
+          sequence_length=logit_lengths,
+          time_major=True)
+      tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+      with self.cached_session() as sess:
+        for _ in range(32):
+          self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
+          self.assertAllClose(
+              *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
+              rtol=2e-06,
+              atol=2e-06)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeated(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 3, 3, 3, 0],
+                [1, 4, 4, 4, 0],
+                [4, 2, 2, 9, 4]],
+        seq_length=[4, 5, 5])
+    self.assertAllEqual(new_seq_lengths, [2, 3, 4])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 3, 0, 0],
+         [1, 4, 0, 0],
+         [4, 2, 9, 4]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeatedPreservesDtypes(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=constant_op.constant(
+            [[1, 3, 3, 3, 0],
+             [1, 4, 4, 4, 0],
+             [4, 2, 2, 9, 4]],
+            dtype=dtypes.int64),
+        seq_length=constant_op.constant([4, 5, 5], dtype=dtypes.int64))
+    self.assertEqual(new_seq_lengths.dtype, dtypes.int64)
+    self.assertEqual(collapsed.dtype, dtypes.int64)
+    self.assertAllEqual(new_seq_lengths, [2, 3, 4])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 3, 0, 0],
+         [1, 4, 0, 0],
+         [4, 2, 9, 4]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeatedExtraPadding(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 3, 3, 3, 0, 0, 0],
+                [1, 4, 4, 4, 0, 1, 2],
+                [4, 2, 2, 9, 4, 0, 0]],
+        seq_length=[4, 5, 5])
+    self.assertAllEqual(new_seq_lengths, [2, 3, 4])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 3, 0, 0],
+         [1, 4, 0, 0],
+         [4, 2, 9, 4]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeatedFrontRepeats(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 1, 1, 2, 2],
+                [1, 1, 1, 2, 2],
+                [1, 1, 1, 2, 2]],
+        seq_length=[5, 4, 3])
+    self.assertAllEqual(new_seq_lengths, [2, 2, 1])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 2],
+         [1, 2],
+         [1, 0]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeatedAllLabelsTheSame(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 1, 1, 1, 1],
+                [1, 1, 1, 1, 1],
+                [1, 1, 1, 1, 1]],
+        seq_length=[4, 5, 1])
+    self.assertAllEqual(new_seq_lengths, [1, 1, 1])
+    self.assertAllEqual(
+        collapsed,
+        [[1],
+         [1],
+         [1]])
+
+  def testDenseSequencesToSparse(self):
+    labels = [[1, 3, 3, 3, 0],
+              [1, 4, 4, 4, 0],
+              [4, 2, 2, 9, 4]]
+    length = [4, 5, 5]
+    sparse = ctc_ops.dense_labels_to_sparse(labels, length)
+    new_dense = sparse_ops.sparse_tensor_to_dense(sparse)
+
+    self.assertAllEqual(labels, new_dense)
+
+    padded_labels = [[1, 3, 3, 3, 0, 0, 0, 0],
+                     [1, 4, 4, 4, 0, 0, 0, 0],
+                     [4, 2, 2, 9, 4, 0, 0, 0]]
+    length = [4, 5, 5]
+    sparse = ctc_ops.dense_labels_to_sparse(padded_labels, length)
+    padded_dense = sparse_ops.sparse_tensor_to_dense(sparse)
+
+    self.assertAllEqual(padded_dense, new_dense)
+
+  @test_util.run_v1_only("b/120545219")
+  def testUnique(self):
+    labels = [
+        [3, 4, 4, 3],
+        [1, 1, 1, 0],
+    ]
+    unique, idx = ctc_ops.ctc_unique_labels(labels)
+    self.assertAllEqual([
+        [3, 4, 0, 0],
+        [1, 0, 0, 0],
+    ], unique)
+    self.assertAllEqual([
+        [0, 1, 1, 0],
+        [0, 0, 0, 1],
+    ], idx)
+
+  @test_util.run_v1_only("b/120545219")
+  def testSumStates(self):
+    idx = [
+        [0, 1, 0, 1],
+        [0, 0, 0, 1],
+    ]
+    states = math_ops.log([
+        [[1.0, 2.0, 3.0, 4.0],
+         [5.0, 6.0, 7.0, 8.0]],
+        [[0.1, 0.2, 0.3, 0.4],
+         [0.5, 0.6, 0.7, 0.8]],
+    ])
+    sum_of_states = math_ops.exp(ctc_ops._sum_states(idx, states))
+    self.assertAllClose([
+        [[4.0, 6.0, 0.0, 0.0],
+         [18.0, 8.0, 0.0, 0.0]],
+        [[0.4, 0.6, 0.0, 0.0],
+         [1.8, 0.8, 0.0, 0.0]]
+    ], sum_of_states)
+
+  @test_util.run_v1_only("b/120545219")
+  def testStateToOlabel(self):
+    labels = [
+        [3, 4, 3, 4],
+        [1, 1, 1, 0],
+    ]
+    num_labels = 8
+
+    # 3 frames, 2 batch, 10 states (5 label, 5 blank).
+    states = [
+        [[0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20],
+         [0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30]],
+        [[1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
+         [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0]],
+        [[11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0],
+         [21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0]],
+    ]
+    labels = ops.convert_to_tensor(labels)
+    states = math_ops.log(states)
+    olabel = ctc_ops._state_to_olabel(labels, num_labels, states)
+    olabel = math_ops.exp(olabel)
+    blank = olabel[:, :, 0]
+    self.assertAllClose(blank, [
+        [0.16 + 0.17 + 0.18 + 0.19 + 0.20,
+         0.26 + 0.27 + 0.28 + 0.29 + 0.30],
+        [1.6 + 1.7 + 1.8 + 1.9 + 2.0,
+         2.6 + 2.7 + 2.8 + 2.9 + 3.0],
+        [16.0 + 17.0 + 18.0 + 19.0 + 20.0,
+         26.0 + 27.0 + 28.0 + 29.0 + 30.0]
+    ])
+    self.assertAllClose(olabel[:, :, 1:], [
+        [[0.0, 0.0, 0.12 + 0.14, 0.13 + 0.15, 0.0, 0.0, 0.0],
+         [0.22 + 0.23 + 0.24, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 1.2 + 1.4, 1.3 + 1.5, 0.0, 0.0, 0.0],
+         [2.2 + 2.3 + 2.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 12.0 + 14.0, 13.0 + 15.0, 0.0, 0.0, 0.0],
+         [22.0 + 23.0 + 24.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+    ])
+
+  @test_util.run_v1_only("b/120545219")
+  def testStateToOlabelUnique(self):
+    labels = [
+        [3, 4, 3, 4],
+        [1, 1, 1, 0],
+    ]
+    num_labels = 8
+
+    # 3 frames, 2 batch, 10 states (5 label, 5 blank).
+    states = [
+        [[0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20],
+         [0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30]],
+        [[1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
+         [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0]],
+        [[11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0],
+         [21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0]],
+    ]
+    labels = ops.convert_to_tensor(labels)
+    states = math_ops.log(states)
+    olabel = ctc_ops._state_to_olabel_unique(
+        labels, num_labels, states, ctc_ops.ctc_unique_labels(labels))
+    olabel = math_ops.exp(olabel)
+    blank = olabel[:, :, 0]
+    self.assertAllClose(blank, [
+        [0.16 + 0.17 + 0.18 + 0.19 + 0.20,
+         0.26 + 0.27 + 0.28 + 0.29 + 0.30],
+        [1.6 + 1.7 + 1.8 + 1.9 + 2.0,
+         2.6 + 2.7 + 2.8 + 2.9 + 3.0],
+        [16.0 + 17.0 + 18.0 + 19.0 + 20.0,
+         26.0 + 27.0 + 28.0 + 29.0 + 30.0]])
+    self.assertAllClose(olabel[:, :, 1:], [
+        [[0.0, 0.0, 0.12 + 0.14, 0.13 + 0.15, 0.0, 0.0, 0.0],
+         [0.22 + 0.23 + 0.24, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 1.2 + 1.4, 1.3 + 1.5, 0.0, 0.0, 0.0],
+         [2.2 + 2.3 + 2.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 12.0 + 14.0, 13.0 + 15.0, 0.0, 0.0, 0.0],
+         [22.0 + 23.0 + 24.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+    ])
+
+  @test_util.run_deprecated_v1
+  def testScan(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      out = ctc_ops._scan(
+          lambda accum, elem: accum + elem,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0)
+      self.assertAllEqual([24.0, 26.0, 29.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0,
+          inclusive=True)
+      self.assertAllEqual([23.0, 24.0, 26.0, 29.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0,
+          reverse=True)
+      self.assertAllEqual([29.0, 28.0, 26.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0,
+          reverse=True,
+          inclusive=True)
+      self.assertAllEqual([29.0, 28.0, 26.0, 23.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]),
+          constant_op.constant([23.0, 24.0]))
+      self.assertAllEqual([[23.0, 25.0], [25.0, 28.0], [29.0, 33.0]], out)
+
+  @test_util.run_deprecated_v1
+  def testScanCapturesVariables(self):
+    with self.cached_session() as sess:
+      x = random_ops.random_uniform([])
+      fn = lambda accum, elem: accum + x * elem
+      out = ctc_ops._scan(fn, constant_op.constant([0.0, 1.0, 2.0]), 23.0)
+      self.assertAllEqual(*sess.run([
+          [23.0 + x * 0.0, 23.0 + x * 1.0, 23.0 + x * 3.0], out
+      ]))
+
+  @test_util.run_deprecated_v1
+  def testScanMultipleAccumulators(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      def fn(accum, elem):
+        accum_a, accum_b = accum
+        return accum_a + elem, accum_b * elem
+      out = ctc_ops._scan(
+          fn, constant_op.constant([1.0, 2.0, 3.0]),
+          (23.0, constant_op.constant([1.0, 2.0])))
+      a, b = out
+      self.assertAllEqual([24.0, 26.0, 29.0], a)
+      self.assertAllEqual([[1.0, 2.0], [2.0, 4.0], [6.0, 12.0]], b)
+
+  @test_util.run_deprecated_v1
+  def testScanMultipleElements(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      def fn(accum, elem):
+        elem_a, elem_b = elem
+        return accum + (elem_a * elem_b)
+      elems_a = constant_op.constant([1.0, 2.0, 3.0])
+      elems_b = constant_op.constant([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0]])
+      out = ctc_ops._scan(
+          fn, (elems_a, elems_b),
+          initial=constant_op.constant([0.0, 0.0]))
+      self.assertAllEqual(
+          [[1.0, 2.0], [5.0, 8.0], [14.0, 20.0]], out)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/cwise_ops_binary_test.py b/tensorflow/python/kernel_tests/cwise_ops_binary_test.py
index 8028f93a8c561c4e5d416240469c5da1724dd1ab..49dbbb125a162bd5e1abaa4e8e2dc0907ca920ae 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_binary_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_binary_test.py
@@ -77,23 +77,23 @@ class BinaryOpTest(test.TestCase):
 
   def _compareCpu(self, x, y, np_func, tf_func, also_compare_variables=False):
     np_ans = np_func(x, y)
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_cpu = out.eval()
+      tf_cpu = self.evaluate(out)
       # Test that the op takes precedence over numpy operators.
-      np_left = tf_func(x, iny).eval()
-      np_right = tf_func(inx, y).eval()
+      np_left = self.evaluate(tf_func(x, iny))
+      np_right = self.evaluate(tf_func(inx, y))
 
       if also_compare_variables:
         var_x = variables.Variable(x)
         var_y = variables.Variable(y)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         print(type(x), type(y), type(var_x), type(var_y))
         print(type(tf_func(x, var_y)), type(tf_func(var_x, y)))
-        np_var_left = tf_func(x, var_y).eval()
-        np_var_right = tf_func(var_x, y).eval()
+        np_var_left = self.evaluate(tf_func(x, var_y))
+        np_var_right = self.evaluate(tf_func(var_x, y))
 
     if np_ans.dtype != np.object:
       self.assertAllClose(np_ans, tf_cpu)
@@ -174,11 +174,11 @@ class BinaryOpTest(test.TestCase):
 
   def _compareGpu(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_gpu = out.eval()
+      tf_gpu = self.evaluate(out)
     self.assertAllClose(np_ans, tf_gpu)
     self.assertShapeEqual(np_ans, out)
     # TODO(zhifengc/ke): make gradient checker work on GPU.
@@ -196,6 +196,7 @@ class BinaryOpTest(test.TestCase):
         self._compareGradientY(x, y, np_func, tf_func)
       self._compareGpu(x, y, np_func, tf_func)
 
+  @test_util.run_deprecated_v1
   def testFloatBasic(self):
     x = np.linspace(-5, 20, 15).reshape(1, 3, 5).astype(np.float32)
     y = np.linspace(20, -5, 15).reshape(1, 3, 5).astype(np.float32)
@@ -233,6 +234,7 @@ class BinaryOpTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot test special functions: %s" % str(e))
 
+  @test_util.run_deprecated_v1
   def testFloatDifferentShapes(self):
     x = np.array([1, 2, 3, 4]).reshape(2, 2).astype(np.float32)
     y = np.array([1, 2]).reshape(2, 1).astype(np.float32)
@@ -252,14 +254,17 @@ class BinaryOpTest(test.TestCase):
     y = np.array([1, 2]).reshape(2, 1).astype(np.int32)
     var_x = variables.Variable(x)
     var_y = variables.Variable(y)
+
     with self.cached_session() as sess:
-      sess.run([var_x.initializer, var_y.initializer])
-      left_result = (var_x * y).eval()
-      right_result = (x * var_y).eval()
+      self.evaluate([var_x.initializer, var_y.initializer])
+      left_result = self.evaluate(var_x * y)
+      right_result = self.evaluate(x * var_y)
+
     np_result = x * y
     self.assertAllEqual(np_result, left_result)
     self.assertAllEqual(np_result, right_result)
 
+  @test_util.run_deprecated_v1
   def testDoubleBasic(self):
     x = np.linspace(-5, 20, 15).reshape(1, 3, 5).astype(np.float64)
     y = np.linspace(20, -5, 15).reshape(1, 3, 5).astype(np.float64)
@@ -351,6 +356,7 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y, np.floor_divide, _FLOORDIV)
     self._compareBoth(x, y, np.mod, _MOD)
 
+  @test_util.run_deprecated_v1
   def testComplex64Basic(self):
     x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
         np.complex64)
@@ -365,6 +371,7 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y, np.multiply, _MUL)
     self._compareBoth(x, y + 0.1, np.true_divide, _TRUEDIV)
 
+  @test_util.run_deprecated_v1
   def testComplex128Basic(self):
     x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
         np.complex128)
@@ -382,10 +389,10 @@ class BinaryOpTest(test.TestCase):
   def testStringComparison(self):
     x = np.array([["abc", "bh"], ["c", ""]])
     y = np.array([["abc", "bh"], ["def", "hi"]])
-    with self.test_session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       cmp_eq = math_ops.equal(x, y)
       cmp_not_eq = math_ops.not_equal(x, y)
-      values = sess.run([cmp_eq, cmp_not_eq])
+      values = self.evaluate([cmp_eq, cmp_not_eq])
       self.assertAllEqual([[True, True], [False, False]], values[0])
       self.assertAllEqual([[False, False], [True, True]], values[1])
 
@@ -478,198 +485,263 @@ class BinaryOpTest(test.TestCase):
     ]
     self._testBCastByFunc(funcs, xs, ys)
 
+  @test_util.run_deprecated_v1
   def testBCast_0A(self):
     self._testBCastA([1, 3, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_0B(self):
     self._testBCastB([1, 3, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_0C(self):
     self._testBCastC([1, 3, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_0D(self):
     self._testBCastD([1, 3, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_1A(self):
     self._testBCastA([1, 3, 2], [2])
 
+  @test_util.run_deprecated_v1
   def testBCast_1B(self):
     self._testBCastB([1, 3, 2], [2])
 
+  @test_util.run_deprecated_v1
   def testBCast_1C(self):
     self._testBCastC([1, 3, 2], [2])
 
+  @test_util.run_deprecated_v1
   def testBCast_1D(self):
     self._testBCastD([1, 3, 2], [2])
 
+  @test_util.run_deprecated_v1
   def testBCast_2A(self):
     self._testBCastA([1, 3, 2], [3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_2B(self):
     self._testBCastB([1, 3, 2], [3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_2C(self):
     self._testBCastC([1, 3, 2], [3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_2D(self):
     self._testBCastD([1, 3, 2], [3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_3A(self):
     self._testBCastA([1, 3, 2], [3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_3B(self):
     self._testBCastB([1, 3, 2], [3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_3C(self):
     self._testBCastC([1, 3, 2], [3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_3D(self):
     self._testBCastD([1, 3, 2], [3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_4A(self):
     self._testBCastA([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_4B(self):
     self._testBCastB([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_4C(self):
     self._testBCastC([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_4D(self):
     self._testBCastD([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_5A(self):
     self._testBCastA([1, 3, 2], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_5B(self):
     self._testBCastB([1, 3, 2], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_5C(self):
     self._testBCastC([1, 3, 2], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_5D(self):
     self._testBCastD([1, 3, 2], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_6A(self):
     self._testBCastA([1, 3, 2], [2, 1, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_6B(self):
     self._testBCastB([1, 3, 2], [2, 1, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_6C(self):
     self._testBCastC([1, 3, 2], [2, 1, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_6D(self):
     self._testBCastD([1, 3, 2], [2, 1, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_7A(self):
     self._testBCastA([1, 3, 2], [1, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_7B(self):
     self._testBCastB([1, 3, 2], [1, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_7C(self):
     self._testBCastC([1, 3, 2], [1, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_7D(self):
     self._testBCastD([1, 3, 2], [1, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_8A(self):
     self._testBCastA([2, 1, 5], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_8B(self):
     self._testBCastB([2, 1, 5], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_8C(self):
     self._testBCastC([2, 1, 5], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_8D(self):
     self._testBCastD([2, 1, 5], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_9A(self):
     self._testBCastA([2, 0, 5], [2, 0, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_9B(self):
     self._testBCastB([2, 0, 5], [2, 0, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_9C(self):
     self._testBCastC([2, 0, 5], [2, 0, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_9D(self):
     self._testBCastD([2, 0, 5], [2, 0, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_10A(self):
     self._testBCastA([2, 3, 0], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_10B(self):
     self._testBCastB([2, 3, 0], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_10C(self):
     self._testBCastC([2, 3, 0], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_10D(self):
     self._testBCastD([2, 3, 0], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_11A(self):
     self._testBCastA([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_11B(self):
     self._testBCastB([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_11C(self):
     self._testBCastC([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_11D(self):
     self._testBCastD([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_12A(self):
     self._testBCastA([1, 1, 1, 1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_12B(self):
     self._testBCastB([1, 1, 1, 1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_12C(self):
     self._testBCastC([1, 1, 1, 1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_12D(self):
     self._testBCastD([1, 1, 1, 1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_13A(self):
     self._testBCastA([1, 3, 2, 1, 1], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_13B(self):
     self._testBCastB([1, 3, 2, 1, 1], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_13C(self):
     self._testBCastC([1, 3, 2, 1, 1], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_13D(self):
     self._testBCastD([1, 3, 2, 1, 1], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_14A(self):
     self._testBCastA([2, 3, 1, 1, 5], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_14B(self):
     self._testBCastB([2, 3, 1, 1, 5], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_14C(self):
     self._testBCastC([2, 3, 1, 1, 5], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_14D(self):
     self._testBCastD([2, 3, 1, 1, 5], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_15A(self):
     self._testBCastA([10, 3, 1, 2], [3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_15B(self):
     self._testBCastB([10, 3, 1, 2], [3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_15C(self):
     self._testBCastC([10, 3, 1, 2], [3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_15D(self):
     self._testBCastD([10, 3, 1, 2], [3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testMismatchedDimensions(self):
     for func in [
         math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div, _ADD,
@@ -681,6 +753,7 @@ class BinaryOpTest(test.TestCase):
             ops.convert_to_tensor([10.0, 20.0, 30.0]),
             ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0]]))
 
+  @test_util.run_deprecated_v1
   def testZeroPowGrad(self):
     with self.cached_session():
       for dtype in (np.float16, np.float32, np.float64, np.complex64,
@@ -691,6 +764,7 @@ class BinaryOpTest(test.TestCase):
         error = gradient_checker.compute_gradient_error(y, [], z, [])
         self.assertEqual(error, 0)
 
+  @test_util.run_deprecated_v1
   def testComplexPowGrad(self):
     with self.cached_session():
       for dtype in np.complex64, np.complex128:
@@ -716,39 +790,39 @@ class BinaryOpTest(test.TestCase):
 
   def testPowNegativeExponent(self):
     for dtype in [np.int32, np.int64]:
-      with self.test_session(use_gpu=False) as sess:
+      with test_util.force_cpu():
         with self.assertRaisesRegexp(
             errors_impl.InvalidArgumentError,
             "Integers to negative integer powers are not allowed"):
           x = np.array([5, 2]).astype(dtype)
           y = np.array([-2, 3]).astype(dtype)
-          sess.run(math_ops.pow(x, y))
+          self.evaluate(math_ops.pow(x, y))
 
-      with self.test_session(use_gpu=False) as sess:
+      with test_util.force_cpu():
         with self.assertRaisesRegexp(
             errors_impl.InvalidArgumentError,
             "Integers to negative integer powers are not allowed"):
           x = np.array([5, 2]).astype(dtype)
           y = np.array([2, -3]).astype(dtype)
-          sess.run(math_ops.pow(x, y))
+          self.evaluate(math_ops.pow(x, y))
 
-      with self.test_session(use_gpu=False) as sess:
+      with test_util.force_cpu():
         with self.assertRaisesRegexp(
             errors_impl.InvalidArgumentError,
             "Integers to negative integer powers are not allowed"):
           x = np.array([5, 2]).astype(dtype)
           y = -3
-          sess.run(math_ops.pow(x, y))
+          self.evaluate(math_ops.pow(x, y))
 
 
 class ComparisonOpTest(test.TestCase):
 
   def _compareScalar(self, func, x, y, dtype):
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       out = func(
           ops.convert_to_tensor(np.array([x]).astype(dtype)),
           ops.convert_to_tensor(np.array([y]).astype(dtype)))
-      ret = out.eval()
+      ret = self.evaluate(out)
     return ret[0]
 
   def testScalarCompareScalar(self):
@@ -777,9 +851,9 @@ class ComparisonOpTest(test.TestCase):
 
   def _compare(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       out = tf_func(ops.convert_to_tensor(x), ops.convert_to_tensor(y))
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
 
   def testTensorCompareTensor(self):
@@ -859,6 +933,7 @@ class ComparisonOpTest(test.TestCase):
     self._testBCastByFunc(
         np.not_equal, math_ops.not_equal, include_complex=True)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64]
     funcs = [
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index c5311ad834a700bf3341b5c25fb8a22f837eae62..9bb7d8b8b12baafe15fe9150e58c4e03749e7261 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -84,11 +84,11 @@ def _default_tolerance(dtype):
 class ComparisonOpTest(test.TestCase):
 
   def _compareScalar(self, func, x, y, dtype):
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       out = func(
           ops.convert_to_tensor(np.array([x]).astype(dtype)),
           ops.convert_to_tensor(np.array([y]).astype(dtype)))
-      ret = out.eval()
+      ret = self.evaluate(out)
     return ret[0]
 
   def testScalarCompareScalar(self):
@@ -117,9 +117,9 @@ class ComparisonOpTest(test.TestCase):
 
   def _compare(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       out = tf_func(ops.convert_to_tensor(x), ops.convert_to_tensor(y))
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
 
   def testTensorCompareTensor(self):
@@ -199,6 +199,7 @@ class ComparisonOpTest(test.TestCase):
     self._testBCastByFunc(
         np.not_equal, math_ops.not_equal, include_complex=True)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64]
     funcs = [
@@ -218,22 +219,20 @@ class LogicalOpTest(test.TestCase):
 
   def _compareBinary(self, x, y, np_func, tf_func, use_gpu=False):
     np_ans = np_func(x, y)
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_val = out.eval()
+      tf_val = self.evaluate(out)
     self.assertEqual(out.dtype, dtypes_lib.bool)
     self.assertAllEqual(np_ans, tf_val)
     self.assertShapeEqual(np_ans, out)
 
   def _not(self, x, use_gpu=False):
     np_ans = np.logical_not(x)
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       out = math_ops.logical_not(ops.convert_to_tensor(x))
-      tf_val = out.eval()
+      tf_val = self.evaluate(out)
     self.assertEqual(out.dtype, dtypes_lib.bool)
     self.assertAllEqual(np_ans, tf_val)
     self.assertShapeEqual(np_ans, out)
@@ -282,6 +281,7 @@ class LogicalOpTest(test.TestCase):
         self._compareBinary(x, y, np.logical_or, math_ops.logical_or, use_gpu)
         self._compareBinary(x, y, np.logical_xor, math_ops.logical_xor, use_gpu)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     x = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2)
     y = np.random.randint(0, 2, 6).astype(np.bool).reshape(3, 2, 1)
@@ -290,6 +290,7 @@ class LogicalOpTest(test.TestCase):
           ValueError, lambda e: "Dimensions must" in str(e)):
         f(x, y)
 
+  @test_util.run_deprecated_v1
   def testUsingAsPythonValueFails(self):
     # Ensure that we raise an error when the user attempts to treat a
     # `Tensor` as a Python `bool`.
@@ -316,10 +317,9 @@ class SelectOpTest(test.TestCase):
 
   def _compare(self, c, x, y, use_gpu):
     np_ans = np.where(c, x, y)
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       out = array_ops.where(c, x, y)
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
     self.assertShapeEqual(np_ans, out)
 
@@ -399,6 +399,7 @@ class SelectOpTest(test.TestCase):
       if t in [np.float16, np.float32, np.float64]:
         self._compare(c, xt, yt, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2)
     x = np.random.rand(1, 3, 2) * 100
@@ -418,6 +419,7 @@ class SelectOpTest(test.TestCase):
         self._compareGradientX(c, xt, yt)
         self._compareGradientY(c, xt, yt)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2)
     x = np.random.rand(1, 3, 2) * 100
@@ -431,6 +433,7 @@ class SelectOpTest(test.TestCase):
       with self.assertRaises(ValueError):
         array_ops.where(c, xt, yt)
 
+  @test_util.run_deprecated_v1
   def testEmptyTensor(self):
     c = np.random.randint(0, 3, 0).astype(np.bool).reshape(1, 3, 0)
     x = np.random.rand(1, 3, 0) * 100
@@ -442,6 +445,7 @@ class SelectOpTest(test.TestCase):
       z = array_ops.where(c, xt, yt).eval()
       self.assertAllEqual(z_expected, z)
 
+  @test_util.run_deprecated_v1
   def testNan(self):
     """Verify that nans don't propagate where they shouldn't."""
     with self.cached_session():
@@ -460,10 +464,9 @@ class BatchSelectOpTest(test.TestCase):
     np_ans = np.dstack(
         [x_i if c_i else y_i for c_i, x_i, y_i in zip(c, x, y)]).transpose(
             [2, 0, 1])
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       out = array_ops.where(c, x, y)
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
     self.assertShapeEqual(np_ans, out)
 
@@ -529,6 +532,7 @@ class BatchSelectOpTest(test.TestCase):
       if t in [np.float16, np.float32, np.float64]:
         self._compare(c, xt, yt, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     c = np.random.randint(0, 2, 16).astype(np.bool)
     x = np.random.rand(16, 2, 8) * 100
@@ -548,6 +552,7 @@ class BatchSelectOpTest(test.TestCase):
         self._compareGradientX(c, xt, yt)
         self._compareGradientY(c, xt, yt)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     c = np.random.randint(0, 2, 8).astype(np.bool)
     x = np.random.rand(16, 3, 2) * 100
@@ -566,13 +571,11 @@ class MinMaxOpTest(test.TestCase):
 
   def _compare(self, x, y, use_gpu):
     np_min, np_max = np.minimum(x, y), np.maximum(x, y)
-    with self.test_session(
-        use_gpu=use_gpu,
-        force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       omin, omax = math_ops.minimum(inx, iny), math_ops.maximum(inx, iny)
-      tf_min, tf_max = sess.run([omin, omax])
+      tf_min, tf_max = self.evaluate([omin, omax])
     self.assertAllEqual(np_min, tf_min)
     self.assertAllEqual(np_max, tf_max)
 
@@ -628,6 +631,7 @@ class MinMaxOpTest(test.TestCase):
     elif x.dtype == np.float64:
       self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     x = np.random.rand(1, 3, 2) * 100.
     # ensure x != y
@@ -641,16 +645,16 @@ class MinMaxOpTest(test.TestCase):
 class MathOpsOverloadTest(test.TestCase):
 
   def _computeTensorAndLiteral(self, x, y, dtype, func):
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       inx = ops.convert_to_tensor(x, dtype=dtype)
       z = func(inx, y)  # Should use __add__, __sub__, etc.
-      return z.eval()
+      return self.evaluate(z)
 
   def _computeLiteralAndTensor(self, x, y, dtype, func):
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       iny = ops.convert_to_tensor(y, dtype=dtype)
       z = func(x, iny)  # Should use __radd__, __rsub__, etc.
-      return z.eval()
+      return self.evaluate(z)
 
   def _compareBinary(self, x, y, dtype, np_func, tf_func):
     np_ans = np_func(x, y).astype(dtype.as_numpy_dtype)
@@ -661,9 +665,9 @@ class MathOpsOverloadTest(test.TestCase):
 
   def _compareUnary(self, x, dtype, np_func, tf_func):
     np_ans = np_func(x).astype(dtype.as_numpy_dtype)
-    with self.test_session(use_gpu=False):
-      self.assertAllClose(np_ans,
-                          tf_func(ops.convert_to_tensor(x, dtype=dtype)).eval())
+    with test_util.force_cpu():
+      self.assertAllClose(
+          np_ans, self.evaluate(tf_func(ops.convert_to_tensor(x, dtype=dtype))))
 
   def testOverload(self):
     dtypes = [
@@ -730,13 +734,11 @@ class IsFiniteInfNanTest(test.TestCase):
 
   def _compare(self, x, use_gpu):
     np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
-    with self.test_session(
-        use_gpu=use_gpu,
-        force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(x)
       ofinite, oinf, onan = math_ops.is_finite(inx), math_ops.is_inf(
           inx), math_ops.is_nan(inx)
-      tf_finite, tf_inf, tf_nan = sess.run([ofinite, oinf, onan])
+      tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])
     self.assertAllEqual(np_inf, tf_inf)
     self.assertAllEqual(np_nan, tf_nan)
     self.assertAllEqual(np_finite, tf_finite)
@@ -773,31 +775,33 @@ class IsFiniteInfNanTest(test.TestCase):
           x = np.full((size,), value, dtype=dtype)
           np_y = np.sqrt(x)
           np_nan = np.isnan(np_y)
-          with self.test_session(force_gpu=test_util.is_gpu_available()):
+          with test_util.use_gpu():
             tf_y = math_ops.sqrt(x)
             tf_nan = math_ops.is_nan(tf_y)
             if value < 0:
-              self.assertAllEqual(np_nan, tf_nan.eval())
+              self.assertAllEqual(np_nan, self.evaluate(tf_nan))
             else:
-              self.assertAllCloseAccordingToType(np_y, tf_y.eval())
+              self.assertAllCloseAccordingToType(np_y, self.evaluate(tf_y))
 
 
 class RoundingTest(test.TestCase):
 
   def _compare_values(self, x, y=None):
     y = np.rint(x) if y is None else np.asarray(y)
-    with self.cached_session() as sess:
-      tf_rint = math_ops.rint(x)
-      np_rint = sess.run(tf_rint)
+
+    tf_rint = math_ops.rint(x)
+    np_rint = self.evaluate(tf_rint)
+
     self.assertAllEqual(y, np_rint)
     self.assertShapeEqual(y, tf_rint)
 
   def _compare(self, x):
     np_floor, np_ceil = np.floor(x), np.ceil(x)
-    with self.cached_session() as sess:
-      inx = ops.convert_to_tensor(x)
-      ofloor, oceil = math_ops.floor(inx), math_ops.ceil(inx)
-      tf_floor, tf_ceil = sess.run([ofloor, oceil])
+
+    inx = ops.convert_to_tensor(x)
+    ofloor, oceil = math_ops.floor(inx), math_ops.ceil(inx)
+    tf_floor, tf_ceil = self.evaluate([ofloor, oceil])
+
     self.assertAllEqual(np_floor, tf_floor)
     self.assertAllEqual(np_ceil, tf_ceil)
     self.assertShapeEqual(np_floor, ofloor)
@@ -828,12 +832,13 @@ class ComplexMakeRealImagTest(test.TestCase):
 
   def _compareMake(self, real, imag, use_gpu):
     np_ans = real + (1j) * imag
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+
+    with test_util.device(use_gpu=use_gpu):
       real = ops.convert_to_tensor(real)
       imag = ops.convert_to_tensor(imag)
       tf_ans = math_ops.complex(real, imag)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
+
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -848,17 +853,17 @@ class ComplexMakeRealImagTest(test.TestCase):
   def _compareRealImag(self, cplx, use_gpu):
     np_real, np_imag = np.real(cplx), np.imag(cplx)
     np_zeros = np_real * 0
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(cplx)
       tf_real = math_ops.real(inx)
       tf_imag = math_ops.imag(inx)
       tf_real_real = math_ops.real(tf_real)
       tf_imag_real = math_ops.imag(tf_real)
-      self.assertAllEqual(np_real, tf_real.eval())
-      self.assertAllEqual(np_imag, tf_imag.eval())
-      self.assertAllEqual(np_real, tf_real_real.eval())
-      self.assertAllEqual(np_zeros, tf_imag_real.eval())
+      self.assertAllEqual(np_real, self.evaluate(tf_real))
+      self.assertAllEqual(np_imag, self.evaluate(tf_imag))
+      self.assertAllEqual(np_real, self.evaluate(tf_real_real))
+      self.assertAllEqual(np_zeros, self.evaluate(tf_imag_real))
 
   def testRealImag64(self):
     real = (np.arange(-3, 3) / 4.).reshape([1, 3, 2]).astype(np.float32)
@@ -876,12 +881,12 @@ class ComplexMakeRealImagTest(test.TestCase):
 
   def _compareAngle(self, cplx, use_gpu):
     np_angle = np.angle(cplx)
-    with self.test_session(
-        use_gpu=use_gpu,
-        force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
+
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(cplx)
       tf_angle = math_ops.angle(inx)
-      tf_angle_val = sess.run(tf_angle)
+      tf_angle_val = self.evaluate(tf_angle)
+
     self.assertAllEqual(np_angle, tf_angle_val)
     self.assertShapeEqual(np_angle, tf_angle)
 
@@ -903,6 +908,7 @@ class ComplexMakeRealImagTest(test.TestCase):
     # build failures on GPU (See #10643 for context).
     # self._compareAngle(cplx, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testRealReal(self):
     for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32,
                   dtypes_lib.float64):
@@ -912,11 +918,10 @@ class ComplexMakeRealImagTest(test.TestCase):
 
   def _compareConj(self, cplx, use_gpu):
     np_ans = np.conj(cplx)
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(cplx)
       tf_conj = math_ops.conj(inx)
-      tf_ans = tf_conj.eval()
+      tf_ans = self.evaluate(tf_conj)
     self.assertAllEqual(np_ans, tf_ans)
     self.assertShapeEqual(np_ans, tf_conj)
 
@@ -934,6 +939,7 @@ class ComplexMakeRealImagTest(test.TestCase):
     self._compareConj(cplx, use_gpu=False)
     self._compareConj(cplx, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testConjReal(self):
     for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16,
                   dtypes_lib.float32, dtypes_lib.float64):
@@ -941,6 +947,7 @@ class ComplexMakeRealImagTest(test.TestCase):
       y = math_ops.conj(x)
       self.assertEqual(x, y)
 
+  @test_util.run_deprecated_v1
   def testConjString(self):
     x = array_ops.placeholder(dtypes_lib.string)
     with self.assertRaisesRegexp(TypeError,
@@ -977,6 +984,7 @@ class ComplexMakeRealImagTest(test.TestCase):
             x_, list(x.shape), z, [1], x_init_value=x, delta=epsilon)
         self.assertAllClose(jacob_t, jacob_n, rtol=epsilon, atol=epsilon)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     # complex64
     data = np.arange(1, 2, 0.10).reshape([5, 2]).astype(np.float32)
@@ -1012,6 +1020,7 @@ class ComplexMakeRealImagTest(test.TestCase):
           inp, list(data.shape), loss, [1], x_init_value=data, delta=epsilon)
     self.assertAllClose(jacob_t, jacob_n, rtol=epsilon, atol=epsilon)
 
+  @test_util.run_deprecated_v1
   def testMulGradient(self):
     data = np.arange(1, 2, 0.125).reshape([2, 4]).astype(np.float32)
     self._compareMulGradient(data)
@@ -1032,13 +1041,13 @@ class AccumulateTest(test.TestCase):
       np_val = random_arrays[0]
       for random_array in random_arrays[1:]:
         np_val += random_array
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testZeroArgs(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
         tf_val = math_ops.accumulate_n([])
-        tf_val.eval()
+        self.evaluate(tf_val)
 
   def testWrongShape(self):
     with self.cached_session():
@@ -1070,7 +1079,7 @@ class PolyvalTest(test.TestCase):
     np_val = np.polyval(coeffs, x)
     with self.cached_session():
       tf_val = math_ops.polyval(coeffs, x)
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testSimple(self):
     for dtype in [
@@ -1093,7 +1102,7 @@ class PolyvalTest(test.TestCase):
         np_val = np.polyval(coeffs, x)
         with self.cached_session():
           tf_val = math_ops.polyval(coeffs, x)
-          self.assertAllClose(np_val, tf_val.eval())
+          self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testEmpty(self):
     x = np.random.rand(2, 2).astype(np.float32)
@@ -1101,7 +1110,7 @@ class PolyvalTest(test.TestCase):
     np_val = np.polyval(coeffs, x)
     with self.cached_session():
       tf_val = math_ops.polyval(coeffs, x)
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
index 77f182784ebb0a149762e291c4e0bdd937bf8dfa..709a20f3d0da0ea73924589699d5ecb24f963bf2 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
@@ -76,7 +76,7 @@ class UnaryOpTest(test.TestCase):
     if grad_atol is None:
       grad_atol = _default_tolerance(x.dtype)
     np_ans = np_func(x)
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       inx = ops.convert_to_tensor(x)
       if x.dtype in (np.float32, np.float64,
                      dtypes_lib.bfloat16.as_numpy_dtype):
@@ -84,7 +84,7 @@ class UnaryOpTest(test.TestCase):
         np_ans *= 1.1
       else:
         y = tf_func(inx)
-      tf_cpu = y.eval()
+      tf_cpu = self.evaluate(y)
       self.assertShapeEqual(np_ans, y)
       if x.dtype == np.float16:
         self.assertAllClose(np_ans, tf_cpu, rtol=1e-3, atol=1e-3)
@@ -121,26 +121,24 @@ class UnaryOpTest(test.TestCase):
   def _check(self, result_tensor, result_np, input_sp_t, tol):
     self.assertTrue(isinstance(result_tensor, sparse_tensor.SparseTensor))
     self.assertTrue(isinstance(input_sp_t, sparse_tensor.SparseTensor))
-    self.assertAllEqual(input_sp_t.indices.eval(), result_tensor.indices.eval())
-    self.assertAllEqual(input_sp_t.dense_shape.eval(),
-                        result_tensor.dense_shape.eval())
+    self.assertAllEqual(input_sp_t.indices, result_tensor.indices)
+    self.assertAllEqual(input_sp_t.dense_shape, result_tensor.dense_shape)
     if tol is None:
-      self.assertAllClose(result_np, result_tensor.values.eval())
+      self.assertAllClose(result_np, result_tensor.values)
     else:
-      self.assertAllClose(
-          result_np, result_tensor.values.eval(), rtol=tol, atol=tol)
+      self.assertAllClose(result_np, result_tensor.values, rtol=tol, atol=tol)
 
   def _compareSparseCpu(self, x, np_func, tf_func, tol):
     x_sp, x_sp_vals = _sparsify(x)
     res_np = np_func(x_sp_vals)
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       self._check(tf_func(x_sp), res_np, x_sp, tol)
 
   def _compareGpu(self, x, np_func, tf_func):
     np_ans = np_func(x)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       result = tf_func(ops.convert_to_tensor(x))
-      tf_gpu = result.eval()
+      tf_gpu = self.evaluate(result)
     if x.dtype == np.float16:
       self.assertAllClose(np_ans, tf_gpu, rtol=1e-3, atol=1e-3)
     else:
@@ -150,7 +148,7 @@ class UnaryOpTest(test.TestCase):
   def _compareSparseGpu(self, x, np_func, tf_func, tol):
     x_sp, x_sp_vals = _sparsify(x)
     res_np = np_func(x_sp_vals)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       self._check(tf_func(x_sp), res_np, x_sp, tol)
 
   def _compareBoth(self, x, np_func, tf_func):
@@ -186,6 +184,7 @@ class UnaryOpTest(test.TestCase):
 
     return func
 
+  @test_util.run_deprecated_v1
   def testFloatBasic(self):
     x = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float32)
     w = x - x.min() + 1.02  # all greater than 1
@@ -240,12 +239,14 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(y, np.sign, math_ops.sign)
     self._compareBothSparse(x, np.vectorize(math.erf), math_ops.erf)
 
+  @test_util.run_deprecated_v1
   def testFloatTanhEdge(self):
     x = np.arange(40, 40 + 6).reshape(6).astype(np.float32)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     x = np.arange(-40, -40 + 6).reshape(6).astype(np.float32)
     self._compareBoth(x, np.tanh, math_ops.tanh)
 
+  @test_util.run_deprecated_v1
   def testFloatEmpty(self):
     x = np.empty((2, 0, 5), dtype=np.float32)
     self._compareBoth(x, np.abs, math_ops.abs)
@@ -291,6 +292,7 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(x, np.sign, math_ops.sign)
     self._compareBothSparse(x, np.sign, math_ops.erf)
 
+  @test_util.run_deprecated_v1
   def testDoubleBasic(self):
     x = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float64)
     w = x - x.min() + 1.02  # all greater than 1
@@ -344,6 +346,7 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(y, np.sign, math_ops.sign)
     self._compareBothSparse(x, np.vectorize(math.erf), math_ops.erf)
 
+  @test_util.run_deprecated_v1
   def testHalfBasic(self):
     x = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float16)
     y = (x + .5).astype(np.float16)  # no zero
@@ -416,6 +419,7 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.square, math_ops.square)
     self._compareBothSparse(x, np.square, math_ops.square)
 
+  @test_util.run_deprecated_v1
   def testComplex64Basic(self):
     x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
         np.complex64)
@@ -460,6 +464,7 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, complex_sign, math_ops.sign)
     self._compareBothSparse(y, complex_sign, math_ops.sign)
 
+  @test_util.run_deprecated_v1
   def testComplex128Basic(self):
     x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
         np.complex128)
@@ -499,6 +504,7 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, complex_sign, math_ops.sign)
     self._compareBothSparse(y, complex_sign, math_ops.sign)
 
+  @test_util.run_deprecated_v1
   def testGradGrad(self):
     np.random.seed(7)
     shape = (5,)
diff --git a/tensorflow/python/kernel_tests/dct_ops_test.py b/tensorflow/python/kernel_tests/dct_ops_test.py
deleted file mode 100644
index 97d7e2d8f90a620b693e2c81adc616d399e13bd6..0000000000000000000000000000000000000000
--- a/tensorflow/python/kernel_tests/dct_ops_test.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for DCT operations."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import importlib
-
-import numpy as np
-
-from tensorflow.python.ops import spectral_ops
-from tensorflow.python.ops import spectral_ops_test_util
-from tensorflow.python.platform import test
-from tensorflow.python.platform import tf_logging
-
-
-def try_import(name):  # pylint: disable=invalid-name
-  module = None
-  try:
-    module = importlib.import_module(name)
-  except ImportError as e:
-    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
-  return module
-
-
-fftpack = try_import("scipy.fftpack")
-
-
-def _np_dct2(signals, norm=None):
-  """Computes the DCT-II manually with NumPy."""
-  # X_k = sum_{n=0}^{N-1} x_n * cos(\frac{pi}{N} * (n + 0.5) * k)  k=0,...,N-1
-  dct_size = signals.shape[-1]
-  dct = np.zeros_like(signals)
-  for k in range(dct_size):
-    phi = np.cos(np.pi * (np.arange(dct_size) + 0.5) * k / dct_size)
-    dct[..., k] = np.sum(signals * phi, axis=-1)
-  # SciPy's `dct` has a scaling factor of 2.0 which we follow.
-  # https://github.com/scipy/scipy/blob/v0.15.1/scipy/fftpack/src/dct.c.src
-  if norm == "ortho":
-    # The orthonormal scaling includes a factor of 0.5 which we combine with
-    # the overall scaling of 2.0 to cancel.
-    dct[..., 0] *= np.sqrt(1.0 / dct_size)
-    dct[..., 1:] *= np.sqrt(2.0 / dct_size)
-  else:
-    dct *= 2.0
-  return dct
-
-
-def _np_dct3(signals, norm=None):
-  """Computes the DCT-III manually with NumPy."""
-  # SciPy's `dct` has a scaling factor of 2.0 which we follow.
-  # https://github.com/scipy/scipy/blob/v0.15.1/scipy/fftpack/src/dct.c.src
-  dct_size = signals.shape[-1]
-  signals = np.array(signals)  # make a copy so we can modify
-  if norm == "ortho":
-    signals[..., 0] *= np.sqrt(4.0 / dct_size)
-    signals[..., 1:] *= np.sqrt(2.0 / dct_size)
-  else:
-    signals *= 2.0
-  dct = np.zeros_like(signals)
-  # X_k = 0.5 * x_0 +
-  #       sum_{n=1}^{N-1} x_n * cos(\frac{pi}{N} * n * (k + 0.5))  k=0,...,N-1
-  half_x0 = 0.5 * signals[..., 0]
-  for k in range(dct_size):
-    phi = np.cos(np.pi * np.arange(1, dct_size) * (k + 0.5) / dct_size)
-    dct[..., k] = half_x0 + np.sum(signals[..., 1:] * phi, axis=-1)
-  return dct
-
-
-NP_DCT = {2: _np_dct2, 3: _np_dct3}
-NP_IDCT = {2: _np_dct3, 3: _np_dct2}
-
-
-class DCTOpsTest(test.TestCase):
-
-  def _compare(self, signals, norm, dct_type, atol=5e-4, rtol=5e-4):
-    """Compares (I)DCT to SciPy (if available) and a NumPy implementation."""
-    np_dct = NP_DCT[dct_type](signals, norm)
-    tf_dct = spectral_ops.dct(signals, type=dct_type, norm=norm).eval()
-    self.assertAllClose(np_dct, tf_dct, atol=atol, rtol=rtol)
-    np_idct = NP_IDCT[dct_type](signals, norm)
-    tf_idct = spectral_ops.idct(signals, type=dct_type, norm=norm).eval()
-    self.assertAllClose(np_idct, tf_idct, atol=atol, rtol=rtol)
-    if fftpack:
-      scipy_dct = fftpack.dct(signals, type=dct_type, norm=norm)
-      self.assertAllClose(scipy_dct, tf_dct, atol=atol, rtol=rtol)
-      scipy_idct = fftpack.idct(signals, type=dct_type, norm=norm)
-      self.assertAllClose(scipy_idct, tf_idct, atol=atol, rtol=rtol)
-    # Verify inverse(forward(s)) == s, up to a normalization factor.
-    tf_idct_dct = spectral_ops.idct(
-        tf_dct, type=dct_type, norm=norm).eval()
-    tf_dct_idct = spectral_ops.dct(
-        tf_idct, type=dct_type, norm=norm).eval()
-    if norm is None:
-      tf_idct_dct *= 0.5 / signals.shape[-1]
-      tf_dct_idct *= 0.5 / signals.shape[-1]
-    self.assertAllClose(signals, tf_idct_dct, atol=atol, rtol=rtol)
-    self.assertAllClose(signals, tf_dct_idct, atol=atol, rtol=rtol)
-
-  def test_random(self):
-    """Test randomly generated batches of data."""
-    with spectral_ops_test_util.fft_kernel_label_map():
-      with self.test_session(use_gpu=True):
-        for shape in ([1], [2], [3], [10], [2, 20], [2, 3, 25]):
-          signals = np.random.rand(*shape).astype(np.float32)
-          for norm in (None, "ortho"):
-            self._compare(signals, norm, 2)
-            self._compare(signals, norm, 3)
-
-  def test_error(self):
-    signals = np.random.rand(10)
-    # Unsupported type.
-    with self.assertRaises(ValueError):
-      spectral_ops.dct(signals, type=1)
-    # Unknown normalization.
-    with self.assertRaises(ValueError):
-      spectral_ops.dct(signals, norm="bad")
-    with self.assertRaises(NotImplementedError):
-      spectral_ops.dct(signals, n=10)
-    with self.assertRaises(NotImplementedError):
-      spectral_ops.dct(signals, axis=0)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/kernel_tests/decode_bmp_op_test.py b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
index eebaffbe13ab1afbc9c6e36c2e5710dcf56e4b15..5e7991382ed14ed401edd38c6ab28af6630e1099 100644
--- a/tensorflow/python/kernel_tests/decode_bmp_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
@@ -61,7 +61,7 @@ class DecodeBmpOpTest(test.TestCase):
     decode = array_ops.squeeze(image_ops.decode_bmp(img_in))
 
     with self.cached_session():
-      decoded = decode.eval()
+      decoded = self.evaluate(decode)
       self.assertAllEqual(decoded, img_bytes)
 
   def testGrayscale(self):
@@ -136,7 +136,7 @@ class DecodeBmpOpTest(test.TestCase):
     decode = image_ops.decode_bmp(img_in)
 
     with self.cached_session():
-      decoded = decode.eval()
+      decoded = self.evaluate(decode)
       self.assertAllEqual(decoded, img_bytes)
 
 
diff --git a/tensorflow/python/kernel_tests/decode_compressed_op_test.py b/tensorflow/python/kernel_tests/decode_compressed_op_test.py
index 1cc1c7da30a9c73935aa11ac9226c15aa2cf7954..fd871c0090699f36df41d1e7f7423bf273c4bba7 100644
--- a/tensorflow/python/kernel_tests/decode_compressed_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_compressed_op_test.py
@@ -24,6 +24,7 @@ import zlib
 from six import BytesIO
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
@@ -42,6 +43,7 @@ class DecodeCompressedOpTest(test.TestCase):
         f.write(bytes_in)
       return out.getvalue()
 
+  @test_util.run_deprecated_v1
   def testDecompress(self):
     for compression_type in ["ZLIB", "GZIP", ""]:
       with self.cached_session():
@@ -55,6 +57,7 @@ class DecodeCompressedOpTest(test.TestCase):
                                   self._compress(b"bBbb", compression_type)]})
         self.assertAllEqual([b"AaAA", b"bBbb"], result)
 
+  @test_util.run_deprecated_v1
   def testDecompressWithRaw(self):
     for compression_type in ["ZLIB", "GZIP", ""]:
       with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/decode_image_op_test.py b/tensorflow/python/kernel_tests/decode_image_op_test.py
index 7f73fbaa846b4b5ace2242566a9e33c9a1416ea0..ba5770001ad30eb9b2b0c084faa483dbcb4728b9 100644
--- a/tensorflow/python/kernel_tests/decode_image_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_image_op_test.py
@@ -23,6 +23,7 @@ import os.path
 import numpy as np
 
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import io_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -36,14 +37,15 @@ class DecodeImageOpTest(test.TestCase):
   def testBmp(self):
     # Read a real bmp and verify shape
     path = os.path.join(prefix_path, "bmp", "testdata", "lena.bmp")
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       bmp0 = io_ops.read_file(path)
       image0 = image_ops.decode_image(bmp0)
       image1 = image_ops.decode_bmp(bmp0)
-      bmp0, image0, image1 = sess.run([bmp0, image0, image1])
+      bmp0, image0, image1 = self.evaluate([bmp0, image0, image1])
       self.assertEqual(len(bmp0), 4194)
       self.assertAllEqual(image0, image1)
 
+  @test_util.run_deprecated_v1
   def testGif(self):
     # Read some real GIFs
     path = os.path.join(prefix_path, "gif", "testdata", "scan.gif")
@@ -52,11 +54,11 @@ class DecodeImageOpTest(test.TestCase):
     stride = 5
     shape = (12, height, width, 3)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       gif0 = io_ops.read_file(path)
       image0 = image_ops.decode_image(gif0)
       image1 = image_ops.decode_gif(gif0)
-      gif0, image0, image1 = sess.run([gif0, image0, image1])
+      gif0, image0, image1 = self.evaluate([gif0, image0, image1])
 
       self.assertEqual(image0.shape, shape)
       self.assertAllEqual(image0, image1)
@@ -76,44 +78,46 @@ class DecodeImageOpTest(test.TestCase):
 
         bad_channels = image_ops.decode_image(gif0, channels=1)
         with self.assertRaises(errors_impl.InvalidArgumentError):
-          bad_channels.eval()
+          self.evaluate(bad_channels)
 
+  @test_util.run_deprecated_v1
   def testJpeg(self):
     # Read a real jpeg and verify shape
     path = os.path.join(prefix_path, "jpeg", "testdata", "jpeg_merge_test1.jpg")
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       jpeg0 = io_ops.read_file(path)
       image0 = image_ops.decode_image(jpeg0)
       image1 = image_ops.decode_jpeg(jpeg0)
-      jpeg0, image0, image1 = sess.run([jpeg0, image0, image1])
+      jpeg0, image0, image1 = self.evaluate([jpeg0, image0, image1])
       self.assertEqual(len(jpeg0), 3771)
       self.assertEqual(image0.shape, (256, 128, 3))
       self.assertAllEqual(image0, image1)
 
       bad_channels = image_ops.decode_image(jpeg0, channels=4)
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        bad_channels.eval()
+        self.evaluate(bad_channels)
 
   def testPng(self):
     # Read some real PNGs, converting to different channel numbers
     inputs = [(1, "lena_gray.png")]
     for channels_in, filename in inputs:
       for channels in 0, 1, 3, 4:
-        with self.test_session(use_gpu=True) as sess:
+        with self.cached_session(use_gpu=True) as sess:
           path = os.path.join(prefix_path, "png", "testdata", filename)
           png0 = io_ops.read_file(path)
           image0 = image_ops.decode_image(png0, channels=channels)
           image1 = image_ops.decode_png(png0, channels=channels)
-          png0, image0, image1 = sess.run([png0, image0, image1])
+          png0, image0, image1 = self.evaluate([png0, image0, image1])
           self.assertEqual(image0.shape, (26, 51, channels or channels_in))
           self.assertAllEqual(image0, image1)
 
+  @test_util.run_deprecated_v1
   def testInvalidBytes(self):
     image_bytes = b"ThisIsNotAnImage!"
     decode = image_ops.decode_image(image_bytes)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        decode.eval()
+        self.evaluate(decode)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
index 66b3e0f22fd2ab07311895da5df5448ee4e6e6f0..f8fc28062f4d9cd846a5b124611b56c35f652442 100644
--- a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
@@ -80,7 +80,7 @@ class DecodeJpegBenchmark(test.Benchmark):
           initializer=image_ops.encode_jpeg(tiled_image))
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       images = []
       for _ in xrange(parallelism):
         if crop_window is None:
@@ -105,11 +105,11 @@ class DecodeJpegBenchmark(test.Benchmark):
 
       for _ in xrange(3):
         # Skip warm up time.
-        sess.run(r)
+        self.evaluate(r)
 
       start_time = time.time()
       for _ in xrange(num_iters):
-        sess.run(r)
+        self.evaluate(r)
       end_time = time.time()
     return end_time - start_time
 
diff --git a/tensorflow/python/kernel_tests/decode_png_op_test.py b/tensorflow/python/kernel_tests/decode_png_op_test.py
index 8f36343667f72b410f14a1934c93a61debaff59e..5a0b742a6a46aa994eb555f09ab3fb75c8a03b15 100644
--- a/tensorflow/python/kernel_tests/decode_png_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_png_op_test.py
@@ -47,7 +47,7 @@ class DecodePngOpTest(test.TestCase):
             img_in, dtype=dtypes.uint16))
 
     with self.cached_session():
-      decoded = decode.eval()
+      decoded = self.evaluate(decode)
       self.assertAllEqual(decoded, img_bytes)
 
 
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index dcc984811cbcfef206befde7a94b3c948a07c15d..008e59ba3e64915d8642243d335701e8adea19c0 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
@@ -28,6 +29,7 @@ from tensorflow.python.platform import test
 
 class DecodeRawOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testToUint8(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[2])
@@ -46,6 +48,7 @@ class DecodeRawOpTest(test.TestCase):
           "element 1 has size 5 != 6"):
         decode.eval(feed_dict={in_bytes: ["short", "longer"]})
 
+  @test_util.run_deprecated_v1
   def testToInt16(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
@@ -61,6 +64,7 @@ class DecodeRawOpTest(test.TestCase):
           "size of int16"):
         decode.eval(feed_dict={in_bytes: ["123", "456"]})
 
+  @test_util.run_deprecated_v1
   def testEndianness(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
@@ -73,6 +77,7 @@ class DecodeRawOpTest(test.TestCase):
       result = decode_be.eval(feed_dict={in_bytes: ["\x01\x02\x03\x04"]})
       self.assertAllEqual([[0x01020304]], result)
 
+  @test_util.run_deprecated_v1
   def testToFloat16(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
@@ -84,6 +89,7 @@ class DecodeRawOpTest(test.TestCase):
 
       self.assertAllEqual(expected_result, result)
 
+  @test_util.run_deprecated_v1
   def testEmptyStringInput(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
@@ -93,6 +99,7 @@ class DecodeRawOpTest(test.TestCase):
         result = decode.eval(feed_dict={in_bytes: [""] * num_inputs})
         self.assertEqual((num_inputs, 0), result.shape)
 
+  @test_util.run_deprecated_v1
   def testToUInt16(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py
index 95fc40f88339d6a9fb4f88496aafb8aa15f49055..d824e95f213acf5480be9bf2c431a4c4b89d106a 100644
--- a/tensorflow/python/kernel_tests/denormal_test.py
+++ b/tensorflow/python/kernel_tests/denormal_test.py
@@ -22,6 +22,7 @@ import numpy as np
 import platform
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -35,11 +36,12 @@ class DenormalTest(test.TestCase):
       self.assertEqual(tiny, tiny / 16 * 16)
 
   def _flushDenormalsTest(self, use_gpu, dtypes):
-    if platform.machine() == "ppc64le" or platform.machine() == "s390x":
-      # Disabled denormal_test on power/s390x platform
+    if platform.machine() == "ppc64le" or platform.machine(
+    ) == "s390x" or platform.machine() == "aarch64":
+      # Disabled denormal_test on power/s390x/aarch64 platform
       # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902
       return
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       array_ops.identity(7).eval()
       for dtype in dtypes:
         tiny = np.finfo(dtype).tiny
@@ -50,10 +52,12 @@ class DenormalTest(test.TestCase):
           # Make sure the flags don't leak out
           self.testPythonHasDenormals()
 
+  @test_util.run_deprecated_v1
   def testFlushDenormalsCPU(self):
     # On CPUs, the processor flags flush for both single and double precision.
     self._flushDenormalsTest(use_gpu=False, dtypes=(np.float32, np.float64))
 
+  @test_util.run_deprecated_v1
   def testFlushDenormalsGPU(self):
     # On GPUs, only single precision can flush to zero.
     self._flushDenormalsTest(use_gpu=True, dtypes=(np.float32,))
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
index affbaf159d82e15d6c15a83ae509851ae1219c7f..4e3da068b8927c324bf9b17fb8e19e1038470777 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -32,6 +33,7 @@ class AssignOpTest(test.TestCase):
   # NOTE(mrry): We exclude thess tests from the TSAN TAP target, because they
   #   contain benign and deliberate data races when multiple threads update
   #   the same parameters without a lock.
+  @test_util.run_v1_only("b/120545219")
   def testParallelUpdateWithoutLocking(self):
     with self.cached_session() as sess:
       ones_t = array_ops.fill([1024, 1024], 1.0)
@@ -43,7 +45,7 @@ class AssignOpTest(test.TestCase):
       variables.global_variables_initializer().run()
 
       def run_add(add_op):
-        sess.run(add_op)
+        self.evaluate(add_op)
 
       threads = [
           self.checkedThread(
@@ -54,11 +56,12 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
       ones = np.ones((1024, 1024)).astype(np.float32)
       self.assertTrue((vals >= ones).all())
       self.assertTrue((vals <= ones * 20).all())
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelAssignWithoutLocking(self):
     with self.cached_session() as sess:
       ones_t = array_ops.fill([1024, 1024], float(1))
@@ -70,7 +73,7 @@ class AssignOpTest(test.TestCase):
       variables.global_variables_initializer().run()
 
       def run_assign(assign_op):
-        sess.run(assign_op)
+        self.evaluate(assign_op)
 
       threads = [
           self.checkedThread(
@@ -81,7 +84,7 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
 
       # Assert every element is taken from one of the assignments.
       self.assertTrue((vals > 0).all())
@@ -91,6 +94,7 @@ class AssignOpTest(test.TestCase):
   # contain non-benign but known data races between the variable assignment and
   # returning the output tensors. This issue will be resolved with the new
   # resource variables.
+  @test_util.run_v1_only("b/120545219")
   def testParallelUpdateWithLocking(self):
     with self.cached_session() as sess:
       zeros_t = array_ops.fill([1024, 1024], 0.0)
@@ -103,7 +107,7 @@ class AssignOpTest(test.TestCase):
       p.initializer.run()
 
       def run_add(add_op):
-        sess.run(add_op)
+        self.evaluate(add_op)
 
       threads = [
           self.checkedThread(
@@ -114,10 +118,11 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
       ones = np.ones((1024, 1024)).astype(np.float32)
       self.assertAllEqual(vals, ones * 20)
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelAssignWithLocking(self):
     with self.cached_session() as sess:
       zeros_t = array_ops.fill([1024, 1024], 0.0)
@@ -131,7 +136,7 @@ class AssignOpTest(test.TestCase):
       p.initializer.run()
 
       def run_assign(assign_op):
-        sess.run(assign_op)
+        self.evaluate(assign_op)
 
       threads = [
           self.checkedThread(
@@ -142,7 +147,7 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
 
       # Assert every element is the same, and taken from one of the assignments.
       self.assertTrue(vals[0, 0] > 0)
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_test.py b/tensorflow/python/kernel_tests/dense_update_ops_test.py
index 120e10314f66f95a574cceeb4335c34066c096e8..545de87ca10deb6c01ab889f331aa61dc815e19e 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -32,30 +33,30 @@ class AssignOpTest(test.TestCase):
   def _initAssignFetch(self, x, y, use_gpu=False):
     """Initialize a param to init and update it with y."""
     super(AssignOpTest, self).setUp()
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       p = variables.Variable(x)
       assign = state_ops.assign(p, y)
       p.initializer.run()
-      new_value = assign.eval()
-      return p.eval(), new_value
+      new_value = self.evaluate(assign)
+      return self.evaluate(p), new_value
 
   def _initAssignAddFetch(self, x, y, use_gpu=False):
     """Initialize a param to init, and compute param += y."""
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       p = variables.Variable(x)
       add = state_ops.assign_add(p, y)
       p.initializer.run()
-      new_value = add.eval()
-      return p.eval(), new_value
+      new_value = self.evaluate(add)
+      return self.evaluate(p), new_value
 
   def _initAssignSubFetch(self, x, y, use_gpu=False):
     """Initialize a param to init, and compute param -= y."""
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       p = variables.Variable(x)
       sub = state_ops.assign_sub(p, y)
       p.initializer.run()
-      new_value = sub.eval()
-      return p.eval(), new_value
+      new_value = self.evaluate(sub)
+      return self.evaluate(p), new_value
 
   def _testTypes(self, vals):
     for dtype in [np.float32, np.float64, np.int32, np.int64]:
@@ -81,23 +82,26 @@ class AssignOpTest(test.TestCase):
         self.assertAllEqual(x - y, var_value)
         self.assertAllEqual(x - y, op_value)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     self._testTypes(np.arange(0, 20).reshape([4, 5]))
 
+  @test_util.run_v1_only("b/120545219")
   def testAssignNonStrictShapeChecking(self):
     with self.cached_session():
       data = array_ops.fill([1024, 1024], 0)
       p = variables.VariableV1([1])
       a = state_ops.assign(p, data, validate_shape=False)
       a.op.run()
-      self.assertAllEqual(p.eval(), data.eval())
+      self.assertAllEqual(p.eval(), self.evaluate(data))
 
       # Assign to yet another shape
       data2 = array_ops.fill([10, 10], 1)
       a2 = state_ops.assign(p, data2, validate_shape=False)
       a2.op.run()
-      self.assertAllEqual(p.eval(), data2.eval())
+      self.assertAllEqual(p.eval(), self.evaluate(data2))
 
+  @test_util.run_v1_only("b/120545219")
   def testInitRequiredAssignAdd(self):
     with self.cached_session():
       p = variables.VariableV1(array_ops.fill([1024, 1024], 1), dtypes.int32)
@@ -105,6 +109,7 @@ class AssignOpTest(test.TestCase):
       with self.assertRaisesOpError("use uninitialized"):
         a.op.run()
 
+  @test_util.run_v1_only("b/120545219")
   def testInitRequiredAssignSub(self):
     with self.cached_session():
       p = variables.VariableV1(array_ops.fill([1024, 1024], 1), dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index f0beabb4e20e4ec0a2fc7a487bf2541d19568927..96c9b5258e2a4a103a3d981a3340f67a01bbec94 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -37,12 +37,12 @@ class DepthToSpaceTest(test.TestCase):
 
   def _testOne(self, inputs, block_size, outputs, dtype=dtypes.float32):
     input_nhwc = math_ops.cast(inputs, dtype)
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       # test NHWC (default) on CPU
       x_tf = array_ops.depth_to_space(input_nhwc, block_size)
       self.assertAllEqual(x_tf.eval(), outputs)
     if test.is_gpu_available():
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         # test NHWC (default) on GPU
         x_tf = array_ops.depth_to_space(input_nhwc, block_size)
         self.assertAllEqual(x_tf.eval(), outputs)
@@ -53,12 +53,14 @@ class DepthToSpaceTest(test.TestCase):
         output_nhwc = test_util.NCHWToNHWC(output_nchw)
         self.assertAllEqual(output_nhwc.eval(), outputs)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     x_np = [[[[1, 2, 3, 4]]]]
     block_size = 2
     x_out = [[[[1], [2]], [[3], [4]]]]
     self._testOne(x_np, block_size, x_out)
 
+  @test_util.run_deprecated_v1
   def testBasicFloat16(self):
     x_np = [[[[1, 2, 3, 4]]]]
     block_size = 2
@@ -67,6 +69,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input dimensions. To make sure elements are
   # correctly ordered spatially.
+  @test_util.run_deprecated_v1
   def testBlockSize2(self):
     x_np = [[[[1, 2, 3, 4],
               [5, 6, 7, 8]],
@@ -79,6 +82,7 @@ class DepthToSpaceTest(test.TestCase):
               [[11], [12], [15], [16]]]]
     self._testOne(x_np, block_size, x_out)
 
+  @test_util.run_deprecated_v1
   def testBlockSize2Batch10(self):
     block_size = 2
     def batch_input_elt(i):
@@ -102,19 +106,20 @@ class DepthToSpaceTest(test.TestCase):
     input_nhwc = array_ops.ones([batch_size, 2, 3, 12])
     x_out = array_ops.ones([batch_size, 4, 6, 3])
 
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       # test NHWC (default) on CPU
       x_tf = array_ops.depth_to_space(input_nhwc, block_size)
       self.assertAllEqual(x_tf.shape, x_out.shape)
-      x_tf.eval()
+      self.evaluate(x_tf)
     if test.is_gpu_available():
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         # test NHWC (default) on GPU
         x_tf = array_ops.depth_to_space(input_nhwc, block_size)
         self.assertAllEqual(x_tf.shape, x_out.shape)
-        x_tf.eval()
+        self.evaluate(x_tf)
 
   # Tests for different width and height.
+  @test_util.run_deprecated_v1
   def testNonSquare(self):
     x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40]],
              [[5, 50, 6, 60, 7, 70, 8, 80]],
@@ -130,6 +135,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input dimensions. To make sure elements are
   # correctly ordered spatially.
+  @test_util.run_deprecated_v1
   def testBlockSize4FlatInput(self):
     x_np = [[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]]
     block_size = 4
@@ -141,6 +147,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input depths.
   # To make sure elements are properly interleaved in depth.
+  @test_util.run_deprecated_v1
   def testDepthInterleaved(self):
     x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40]]]]
     block_size = 2
@@ -150,6 +157,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input depths. Here an odd depth.
   # To make sure elements are properly interleaved in depth.
+  @test_util.run_deprecated_v1
   def testDepthInterleavedDepth3(self):
     x_np = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
     block_size = 2
@@ -159,6 +167,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input depths.
   # To make sure elements are properly interleaved in depth.
+  @test_util.run_deprecated_v1
   def testDepthInterleavedLarger(self):
     x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40],
               [5, 50, 6, 60, 7, 70, 8, 80]],
@@ -175,6 +184,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for a block larger for the depth. In this case should raise an
   # exception.
+  @test_util.run_deprecated_v1
   def testBlockSizeTooLarge(self):
     x_np = [[[[1, 2, 3, 4],
               [5, 6, 7, 8]],
@@ -185,18 +195,20 @@ class DepthToSpaceTest(test.TestCase):
     # divisible by 16.
     with self.assertRaises(ValueError):
       out_tf = array_ops.depth_to_space(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   # Test when the block size is 0.
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     x_np = [[[[1], [2]],
              [[3], [4]]]]
     block_size = 0
     with self.assertRaises(ValueError):
       out_tf = array_ops.depth_to_space(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   # Test when the block size is 1. The block size should be > 1.
+  @test_util.run_deprecated_v1
   def testBlockSizeOne(self):
     x_np = [[[[1, 1, 1, 1],
               [2, 2, 2, 2]],
@@ -205,8 +217,9 @@ class DepthToSpaceTest(test.TestCase):
     block_size = 1
     with self.assertRaises(ValueError):
       out_tf = array_ops.depth_to_space(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeLargerThanInput(self):
     # The block size is too large for this input.
     x_np = [[[[1], [2]],
@@ -214,8 +227,9 @@ class DepthToSpaceTest(test.TestCase):
     block_size = 10
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleDepth(self):
     # The depth is not divisible by the square of the block size.
     x_np = [[[[1, 1, 1, 1],
@@ -226,6 +240,7 @@ class DepthToSpaceTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     t = array_ops.depth_to_space(
         array_ops.placeholder(dtypes.float32), block_size=4)
@@ -276,8 +291,8 @@ class DepthToSpaceTest(test.TestCase):
       expected = self.depthToSpaceUsingTranspose(t, block_size, data_format)
       actual = array_ops.depth_to_space(t, block_size, data_format=data_format)
 
-    with self.test_session(use_gpu=use_gpu) as sess:
-      actual_vals, expected_vals = sess.run([actual, expected])
+    with self.session(use_gpu=use_gpu) as sess:
+      actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
   def testAgainstTranspose(self):
@@ -314,7 +329,7 @@ class DepthToSpaceGradientTest(test.TestCase):
       return
 
     assert 4 == x.ndim
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_x = ops.convert_to_tensor(x)
       tf_y = array_ops.depth_to_space(tf_x, block_size, data_format=data_format)
 
@@ -343,11 +358,13 @@ class DepthToSpaceGradientTest(test.TestCase):
 
   # Don't use very large numbers as dimensions here, as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     block_size = 2
     self._compare(3, 2, 5, 3, block_size, "NHWC")
     self._compare(3, 2, 5, 3, block_size, "NCHW")
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     block_size = 3
     self._compare(1, 2, 3, 2, block_size, "NHWC")
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 737a73f97a54983da35901592ca75fdfc7d3bd96..5b1a47fb03563f3c104e0d0ca158a0918dcb39b6 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_impl
@@ -162,7 +163,7 @@ class DepthwiseConv2DTest(test.TestCase):
         conv_native = array_ops.transpose(conv_native, [0, 2, 3, 1])
 
       try:
-        native_result = sess.run(conv_native)
+        native_result = self.evaluate(conv_native)
       except errors.InvalidArgumentError as e:
         # Grouped convolution kernel is only registered for cuDNN 7. Silently
         # return when we are running on an earlier version or without GPU.
@@ -174,7 +175,7 @@ class DepthwiseConv2DTest(test.TestCase):
 
       conv_interface = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
-      interface_result = sess.run(conv_interface)
+      interface_result = self.evaluate(conv_interface)
 
     tf_logging.info(
         "data_type: %r, use_gpu: %r, grouped_conv: %r, max diff = %f",
@@ -185,6 +186,7 @@ class DepthwiseConv2DTest(test.TestCase):
     self.assertShapeEqual(native_result, conv_native)
     self.assertShapeEqual(native_result, conv_interface)
 
+  @test_util.run_v1_only("b/120545219")
   def testDepthwiseConv2D(self):
     for index, (input_size, filter_size, _, stride,
                 padding) in enumerate(ConfigsToTest()):
@@ -209,7 +211,7 @@ class DepthwiseConv2DTest(test.TestCase):
     # GitHub issue 22110.
     if not test.is_gpu_available():
       return
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = array_ops.placeholder(dtypes.float32)
       f = np.ones([1, 1, 1, 1], np.float32)
       v = nn_impl.depthwise_conv2d(
@@ -263,13 +265,13 @@ class DepthwiseConv2DTest(test.TestCase):
     # numbers from 1.
     x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
     x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       t1 = constant_op.constant(x1, shape=tensor_in_sizes)
       t1.set_shape(tensor_in_sizes)
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
       conv = nn_ops.depthwise_conv2d_native(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     tf_logging.info("value = %r", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
@@ -428,6 +430,7 @@ class DepthwiseConv2DTest(test.TestCase):
           use_gpu, grouped_conv, err)
       self.assertLess(err, tolerance)
 
+  @test_util.run_v1_only("b/120545219")
   def testDepthwiseConv2DInputGrad(self):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(CheckGradConfigsToTest()):
@@ -477,6 +480,7 @@ class DepthwiseConv2DTest(test.TestCase):
             use_gpu=True,
             data_format="NCHW")
 
+  @test_util.run_v1_only("b/120545219")
   def testDepthwiseConv2DFilterGrad(self):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(CheckGradConfigsToTest()):
@@ -522,13 +526,13 @@ class DepthwiseConv2DTest(test.TestCase):
     x2 = np.random.rand(*output_sizes).astype(np.float32)
 
     def _GetVal(use_gpu):
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         t0 = constant_op.constant(input_sizes, shape=[len(input_sizes)])
         t1 = constant_op.constant(x1, shape=filter_sizes)
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_input(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
@@ -542,13 +546,13 @@ class DepthwiseConv2DTest(test.TestCase):
     x2 = np.random.rand(*output_sizes).astype(np.float64)
 
     def _GetVal(use_gpu):
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         t0 = constant_op.constant(input_sizes, shape=[len(input_sizes)])
         t1 = constant_op.constant(x1, shape=filter_sizes)
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_input(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
@@ -574,13 +578,13 @@ class DepthwiseConv2DTest(test.TestCase):
     x2 = np.random.rand(*output_sizes).astype(np.float32)
 
     def _GetVal(use_gpu):
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         t0 = constant_op.constant(x0, shape=input_sizes)
         t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)])
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
@@ -594,13 +598,13 @@ class DepthwiseConv2DTest(test.TestCase):
     x2 = np.random.rand(*output_sizes).astype(np.float64)
 
     def _GetVal(use_gpu):
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         t0 = constant_op.constant(x0, shape=input_sizes)
         t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)])
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
diff --git a/tensorflow/python/kernel_tests/determinant_op_test.py b/tensorflow/python/kernel_tests/determinant_op_test.py
index a52b2c0dc32c26ecd5ef08aa3f8678f0006cd4fe..dbfda385ed221cda8c42843326bccb08a10e0689 100644
--- a/tensorflow/python/kernel_tests/determinant_op_test.py
+++ b/tensorflow/python/kernel_tests/determinant_op_test.py
@@ -23,18 +23,20 @@ import numpy as np
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
 class DeterminantOpTest(test.TestCase):
 
   def _compareDeterminantBase(self, matrix_x, tf_ans):
-    out = tf_ans.eval()
+    out = self.evaluate(tf_ans)
     shape = matrix_x.shape
     if shape[-1] == 0 and shape[-2] == 0:
       np_ans = np.ones(shape[:-2]).astype(matrix_x.dtype)
@@ -53,15 +55,15 @@ class DeterminantOpTest(test.TestCase):
       np_ans = np_ans.astype(matrix_x.dtype)
 
     self.assertShapeEqual(np_ans, abs_log_det_tf)
-    sign_tf_val = sign_tf.eval()
-    abs_log_det_tf_val = abs_log_det_tf.eval()
+    sign_tf_val = self.evaluate(sign_tf)
+    abs_log_det_tf_val = self.evaluate(abs_log_det_tf)
     self.assertAllClose(
         sign_tf_val * np.exp(abs_log_det_tf_val),
         np_sign * np.exp(np_ans),
         atol=5e-5)
 
   def _compareDeterminant(self, matrix_x):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       self._compareDeterminantBase(matrix_x,
                                    linalg_ops.matrix_determinant(matrix_x))
       self._compareLogDeterminantBase(
@@ -131,6 +133,7 @@ class DeterminantOpTest(test.TestCase):
     huge_matrix = np.array([[max_double, 0.0], [0.0, max_double]])
     self._compareDeterminant(huge_matrix)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonSquareMatrix(self):
     # When the determinant of a non-square matrix is attempted we should return
     # an error
@@ -138,6 +141,7 @@ class DeterminantOpTest(test.TestCase):
       linalg_ops.matrix_determinant(
           np.array([[1., 2., 3.], [3., 5., 4.]]).astype(np.float32))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to the determinant should be a 2-dimensional tensor.
     tensor1 = constant_op.constant([1., 2.])
@@ -148,13 +152,14 @@ class DeterminantOpTest(test.TestCase):
     self._compareDeterminant(np.empty([0, 2, 2]))
     self._compareDeterminant(np.empty([2, 0, 0]))
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       matrix1 = random_ops.random_normal([5, 5], seed=42)
       matrix2 = random_ops.random_normal([5, 5], seed=42)
       det1 = linalg_ops.matrix_determinant(matrix1)
       det2 = linalg_ops.matrix_determinant(matrix2)
-      det1_val, det2_val = sess.run([det1, det2])
+      det1_val, det2_val = self.evaluate([det1, det2])
       self.assertEqual(det1_val, det2_val)
 
 
@@ -185,8 +190,8 @@ class MatrixDeterminantBenchmark(test.Benchmark):
 
   def benchmarkMatrixDeterminantOp(self):
     for shape in self.shapes:
-      with ops.Graph().as_default(), session.Session() as sess, ops.device(
-          "/cpu:0"):
+      with ops.Graph().as_default(), session.Session(
+          config=benchmark.benchmark_config()) as sess, ops.device("/cpu:0"):
         matrix = self._GenerateMatrix(shape)
         d = linalg_ops.matrix_determinant(matrix)
         variables.global_variables_initializer().run()
@@ -198,8 +203,8 @@ class MatrixDeterminantBenchmark(test.Benchmark):
             name="matrix_determinant_cpu_{shape}".format(shape=shape))
 
       if test.is_gpu_available(True):
-        with ops.Graph().as_default(), session.Session() as sess, ops.device(
-            "/gpu:0"):
+        with ops.Graph().as_default(), session.Session(
+            config=benchmark.benchmark_config()) as sess, ops.device("/gpu:0"):
           matrix = self._GenerateMatrix(shape)
           d = linalg_ops.matrix_determinant(matrix)
           variables.global_variables_initializer().run()
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index 0825d8fc6bea008532fd7428236dfb569f2a471e..ed2a9e8e47e961549dbaa99a78624e22af146937 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -31,8 +32,9 @@ from tensorflow.python.platform import tf_logging
 
 class MatrixDiagTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testVector(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v = np.array([1.0, 2.0, 3.0])
       mat = np.diag(v)
       v_diag = array_ops.matrix_diag(v)
@@ -40,7 +42,7 @@ class MatrixDiagTest(test.TestCase):
       self.assertAllEqual(v_diag.eval(), mat)
 
   def _testBatchVector(self, dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       v_batch = np.array([[1.0, 0.0, 3.0], [4.0, 5.0, 6.0]]).astype(dtype)
       mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 3.0]],
                             [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0],
@@ -49,6 +51,7 @@ class MatrixDiagTest(test.TestCase):
       self.assertEqual((2, 3, 3), v_batch_diag.get_shape())
       self.assertAllEqual(v_batch_diag.eval(), mat_batch)
 
+  @test_util.run_deprecated_v1
   def testBatchVector(self):
     self._testBatchVector(np.float32)
     self._testBatchVector(np.float64)
@@ -56,19 +59,22 @@ class MatrixDiagTest(test.TestCase):
     self._testBatchVector(np.int64)
     self._testBatchVector(np.bool)
 
+  @test_util.run_deprecated_v1
   def testInvalidShape(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 1"):
       array_ops.matrix_diag(0)
 
+  @test_util.run_deprecated_v1
   def testInvalidShapeAtEval(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
       with self.assertRaisesOpError("input must be at least 1-dim"):
         array_ops.matrix_diag(v).eval(feed_dict={v: 0.0})
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     shapes = ((3,), (7, 4))
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for shape in shapes:
         x = constant_op.constant(np.random.rand(*shape), np.float32)
         y = array_ops.matrix_diag(x)
@@ -81,34 +87,36 @@ class MatrixDiagTest(test.TestCase):
 
 class MatrixSetDiagTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSquare(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v = np.array([1.0, 2.0, 3.0])
       mat = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]])
       mat_set_diag = np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0],
                                [1.0, 1.0, 3.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((3, 3), output.get_shape())
-      self.assertAllEqual(mat_set_diag, output.eval())
+      self.assertAllEqual(mat_set_diag, self.evaluate(output))
 
+  @test_util.run_deprecated_v1
   def testRectangular(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v = np.array([3.0, 4.0])
       mat = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]])
       expected = np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((2, 3), output.get_shape())
-      self.assertAllEqual(expected, output.eval())
+      self.assertAllEqual(expected, self.evaluate(output))
 
       v = np.array([3.0, 4.0])
       mat = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
       expected = np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((3, 2), output.get_shape())
-      self.assertAllEqual(expected, output.eval())
+      self.assertAllEqual(expected, self.evaluate(output))
 
   def _testSquareBatch(self, dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       v_batch = np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]]).astype(dtype)
       mat_batch = np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]],
                             [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0],
@@ -121,8 +129,9 @@ class MatrixSetDiagTest(test.TestCase):
 
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 3, 3), output.get_shape())
-      self.assertAllEqual(mat_set_diag_batch, output.eval())
+      self.assertAllEqual(mat_set_diag_batch, self.evaluate(output))
 
+  @test_util.run_deprecated_v1
   def testSquareBatch(self):
     self._testSquareBatch(np.float32)
     self._testSquareBatch(np.float64)
@@ -130,8 +139,9 @@ class MatrixSetDiagTest(test.TestCase):
     self._testSquareBatch(np.int64)
     self._testSquareBatch(np.bool)
 
+  @test_util.run_deprecated_v1
   def testRectangularBatch(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v_batch = np.array([[-1.0, -2.0], [-4.0, -5.0]])
       mat_batch = np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]],
                             [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]])
@@ -140,16 +150,18 @@ class MatrixSetDiagTest(test.TestCase):
                                      [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]])
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 2, 3), output.get_shape())
-      self.assertAllEqual(mat_set_diag_batch, output.eval())
+      self.assertAllEqual(mat_set_diag_batch, self.evaluate(output))
 
+  @test_util.run_deprecated_v1
   def testInvalidShape(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 2"):
       array_ops.matrix_set_diag(0, [0])
     with self.assertRaisesRegexp(ValueError, "must be at least rank 1"):
       array_ops.matrix_set_diag([[0]], 0)
 
+  @test_util.run_deprecated_v1
   def testInvalidShapeAtEval(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
       with self.assertRaisesOpError("input must be at least 2-dim"):
         array_ops.matrix_set_diag(v, [v]).eval(feed_dict={v: 0.0})
@@ -157,9 +169,10 @@ class MatrixSetDiagTest(test.TestCase):
           r"but received input shape: \[1,1\] and diagonal shape: \[\]"):
         array_ops.matrix_set_diag([[v]], v).eval(feed_dict={v: 0.0})
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     shapes = ((3, 4, 4), (3, 3, 4), (3, 4, 3), (7, 4, 8, 8))
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for shape in shapes:
         x = constant_op.constant(
             np.random.rand(*shape), dtype=dtypes_lib.float32)
@@ -178,8 +191,9 @@ class MatrixSetDiagTest(test.TestCase):
             y.get_shape().as_list())
         self.assertLess(error_x_diag, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradWithNoShapeInformation(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
       mat = array_ops.placeholder(dtype=dtypes_lib.float32)
       grad_input = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -200,16 +214,18 @@ class MatrixSetDiagTest(test.TestCase):
 
 class MatrixDiagPartTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSquare(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v = np.array([1.0, 2.0, 3.0])
       mat = np.diag(v)
       mat_diag = array_ops.matrix_diag_part(mat)
       self.assertEqual((3,), mat_diag.get_shape())
       self.assertAllEqual(mat_diag.eval(), v)
 
+  @test_util.run_deprecated_v1
   def testRectangular(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       mat = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
       mat_diag = array_ops.matrix_diag_part(mat)
       self.assertAllEqual(mat_diag.eval(), np.array([1.0, 5.0]))
@@ -218,7 +234,7 @@ class MatrixDiagPartTest(test.TestCase):
       self.assertAllEqual(mat_diag.eval(), np.array([1.0, 4.0]))
 
   def _testSquareBatch(self, dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       v_batch = np.array([[1.0, 0.0, 3.0], [4.0, 5.0, 6.0]]).astype(dtype)
       mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 3.0]],
                             [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0],
@@ -228,6 +244,7 @@ class MatrixDiagPartTest(test.TestCase):
       self.assertEqual((2, 3), mat_batch_diag.get_shape())
       self.assertAllEqual(mat_batch_diag.eval(), v_batch)
 
+  @test_util.run_deprecated_v1
   def testSquareBatch(self):
     self._testSquareBatch(np.float32)
     self._testSquareBatch(np.float64)
@@ -235,8 +252,9 @@ class MatrixDiagPartTest(test.TestCase):
     self._testSquareBatch(np.int64)
     self._testSquareBatch(np.bool)
 
+  @test_util.run_deprecated_v1
   def testRectangularBatch(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v_batch = np.array([[1.0, 2.0], [4.0, 5.0]])
       mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 2.0, 0.0]],
                             [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0]]])
@@ -245,19 +263,22 @@ class MatrixDiagPartTest(test.TestCase):
       self.assertEqual((2, 2), mat_batch_diag.get_shape())
       self.assertAllEqual(mat_batch_diag.eval(), v_batch)
 
+  @test_util.run_deprecated_v1
   def testInvalidShape(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 2"):
       array_ops.matrix_diag_part(0)
 
+  @test_util.run_deprecated_v1
   def testInvalidShapeAtEval(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
       with self.assertRaisesOpError("input must be at least 2-dim"):
         array_ops.matrix_diag_part(v).eval(feed_dict={v: 0.0})
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     shapes = ((3, 3), (2, 3), (3, 2), (5, 3, 3))
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for shape in shapes:
         x = constant_op.constant(np.random.rand(*shape), dtype=np.float32)
         y = array_ops.matrix_diag_part(x)
@@ -271,11 +292,11 @@ class MatrixDiagPartTest(test.TestCase):
 class DiagTest(test.TestCase):
 
   def _diagOp(self, diag, dtype, expected_ans, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.diag(ops.convert_to_tensor(diag.astype(dtype)))
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       tf_ans_inv = array_ops.diag_part(expected_ans)
-      inv_out = tf_ans_inv.eval()
+      inv_out = self.evaluate(tf_ans_inv)
     self.assertAllClose(out, expected_ans)
     self.assertAllClose(inv_out, diag)
     self.assertShapeEqual(expected_ans, tf_ans)
@@ -407,6 +428,7 @@ class DiagTest(test.TestCase):
           dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
+  @test_util.run_deprecated_v1
   def testInvalidRank(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 1"):
       array_ops.diag(0.0)
@@ -418,10 +440,10 @@ class DiagPartOpTest(test.TestCase):
     np.random.seed(0)
 
   def _diagPartOp(self, tensor, dtype, expected_ans, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tensor = ops.convert_to_tensor(tensor.astype(dtype))
       tf_ans_inv = array_ops.diag_part(tensor)
-      inv_out = tf_ans_inv.eval()
+      inv_out = self.evaluate(tf_ans_inv)
     self.assertAllClose(inv_out, expected_ans)
     self.assertShapeEqual(expected_ans, tf_ans_inv)
 
@@ -441,11 +463,11 @@ class DiagPartOpTest(test.TestCase):
     i = np.arange(3)
     expected_ans = x[i, i]
     for shape in None, (None, 3), (3, None):
-      with self.test_session(use_gpu=False):
+      with self.cached_session(use_gpu=False):
         t = ops.convert_to_tensor(x.astype(np.float32))
         t.set_shape(shape)
         tf_ans = array_ops.diag_part(t)
-        out = tf_ans.eval()
+        out = self.evaluate(tf_ans)
       self.assertAllClose(out, expected_ans)
       self.assertShapeEqual(expected_ans, tf_ans)
 
@@ -476,6 +498,7 @@ class DiagPartOpTest(test.TestCase):
     self.diagPartOp(x, np.complex64, expected_ans)
     self.diagPartOp(x, np.complex128, expected_ans)
 
+  @test_util.run_deprecated_v1
   def testOddRank(self):
     w = np.random.rand(2)
     x = np.random.rand(2, 2, 2)
@@ -484,6 +507,7 @@ class DiagPartOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       array_ops.diag_part(0.0)
 
+  @test_util.run_deprecated_v1
   def testUnevenDimensions(self):
     w = np.random.rand(2, 5)
     x = np.random.rand(2, 1, 2, 3)
@@ -493,11 +517,12 @@ class DiagPartOpTest(test.TestCase):
 
 class DiagGradOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testDiagGrad(self):
     np.random.seed(0)
     shapes = ((3,), (3, 3), (3, 3, 3))
     dtypes = (dtypes_lib.float32, dtypes_lib.float64)
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       errors = []
       for shape in shapes:
         for dtype in dtypes:
@@ -513,11 +538,12 @@ class DiagGradOpTest(test.TestCase):
 
 class DiagGradPartOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testDiagPartGrad(self):
     np.random.seed(0)
     shapes = ((3, 3), (3, 3, 3, 3))
     dtypes = (dtypes_lib.float32, dtypes_lib.float64)
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       errors = []
       for shape in shapes:
         for dtype in dtypes:
diff --git a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
index 37b35ba51a884b9f8568be4a3c93e2144271730d..e6d560b4bc4c79885a4529427f5b427b39a166e6 100644
--- a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
@@ -151,6 +151,7 @@ class BernoulliTest(test.TestCase):
       self.assertAllClose(self.evaluate(dist.prob(x)), expected_pmf)
       self.assertAllClose(self.evaluate(dist.log_prob(x)), np.log(expected_pmf))
 
+  @test_util.run_deprecated_v1
   def testPmfCorrectBroadcastDynamicShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtype=dtypes.float32)
@@ -167,6 +168,7 @@ class BernoulliTest(test.TestCase):
           }), [[0.2, 0.7, 0.4]])
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testPmfInvalid(self):
     p = [0.1, 0.2, 0.7]
     dist = bernoulli.Bernoulli(probs=p, validate_args=True)
@@ -193,6 +195,7 @@ class BernoulliTest(test.TestCase):
         self.evaluate(
             bernoulli.Bernoulli(probs=p, validate_args=False).log_prob(samps)))
 
+  @test_util.run_deprecated_v1
   def testBroadcasting(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes.float32)
@@ -207,6 +210,7 @@ class BernoulliTest(test.TestCase):
               p: [0.5, 0.5, 0.5]
           }))
 
+  @test_util.run_deprecated_v1
   def testPmfShapes(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes.float32, shape=[None, 1])
@@ -276,6 +280,7 @@ class BernoulliTest(test.TestCase):
     grad_p = tape.gradient(samples, p)
     self.assertIsNone(grad_p)
 
+  @test_util.run_deprecated_v1
   def testSampleActsLikeSampleN(self):
     with self.cached_session() as sess:
       p = [0.2, 0.6]
diff --git a/tensorflow/python/kernel_tests/distributions/bijector_test.py b/tensorflow/python/kernel_tests/distributions/bijector_test.py
index e20f59f48ac7aacaf650195ade7a50228b80e75c..a0e0a36fecc33b155c309dd9ac0dfda65ef698b8 100644
--- a/tensorflow/python/kernel_tests/distributions/bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bijector_test.py
@@ -132,6 +132,7 @@ class BijectorTestEventNdims(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Expected scalar"):
       bij.inverse_log_det_jacobian(1., event_ndims=(1, 2))
 
+  @test_util.run_deprecated_v1
   def testBijectorDynamicEventNdims(self):
     bij = BrokenBijector(validate_args=True)
     event_ndims = array_ops.placeholder(dtype=np.int32, shape=None)
@@ -301,6 +302,7 @@ class BijectorReduceEventDimsTest(test.TestCase):
         8.,
         self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=2)))
 
+  @test_util.run_deprecated_v1
   def testHandlesNonStaticEventNdims(self):
     x_ = [[[1., 2.], [3., 4.]]]
     x = array_ops.placeholder_with_default(x_, shape=None)
diff --git a/tensorflow/python/kernel_tests/distributions/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py
index c6bb06eab3090a103f4a7da92a7f1f5354d9020a..ec1d4ed20703e151876c9e315343b10baa76f760 100644
--- a/tensorflow/python/kernel_tests/distributions/categorical_test.py
+++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -44,6 +45,7 @@ def make_categorical(batch_shape, num_classes, dtype=dtypes.int32):
 
 class CategoricalTest(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testP(self):
     p = [0.2, 0.8]
     dist = categorical.Categorical(probs=p)
@@ -51,6 +53,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(p, dist.probs.eval())
       self.assertAllEqual([2], dist.logits.get_shape())
 
+  @test_util.run_deprecated_v1
   def testLogits(self):
     p = np.array([0.2, 0.8], dtype=np.float32)
     logits = np.log(p) - 50.
@@ -61,6 +64,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(dist.probs.eval(), p)
       self.assertAllClose(dist.logits.eval(), logits)
 
+  @test_util.run_deprecated_v1
   def testShapes(self):
     with self.cached_session():
       for batch_shape in ([], [1], [2, 3, 4]):
@@ -107,6 +111,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertEqual(dist.dtype, dtype)
       self.assertEqual(dist.dtype, dist.sample(5).dtype)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     with self.cached_session():
       logits = array_ops.placeholder(dtype=dtypes.float32)
@@ -121,18 +126,21 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
           feed_dict={logits: [[-1000.0, 1000.0], [1000.0, -1000.0]]})
       self.assertAllEqual([1, 0], sample_value_batch)
 
+  @test_util.run_deprecated_v1
   def testPMFWithBatch(self):
     histograms = [[0.2, 0.8], [0.6, 0.4]]
     dist = categorical.Categorical(math_ops.log(histograms) - 50.)
     with self.cached_session():
       self.assertAllClose(dist.prob([0, 1]).eval(), [0.2, 0.4])
 
+  @test_util.run_deprecated_v1
   def testPMFNoBatch(self):
     histograms = [0.2, 0.8]
     dist = categorical.Categorical(math_ops.log(histograms) - 50.)
     with self.cached_session():
       self.assertAllClose(dist.prob(0).eval(), 0.2)
 
+  @test_util.run_deprecated_v1
   def testCDFWithDynamicEventShapeKnownNdims(self):
     """Test that dynamically-sized events with unknown shape work."""
     batch_size = 2
@@ -184,6 +192,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     actual_cdf = self.evaluate(cdf_op)
     self.assertAllClose(actual_cdf, expected_cdf)
 
+  @test_util.run_deprecated_v1
   def testCDFWithBatch(self):
     histograms = [[0.1, 0.2, 0.3, 0.25, 0.15],
                   [0.0, 0.75, 0.2, 0.05, 0.0]]
@@ -195,6 +204,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     with self.cached_session():
       self.assertAllClose(cdf_op.eval(), expected_cdf)
 
+  @test_util.run_deprecated_v1
   def testCDFNoBatch(self):
     histogram = [0.1, 0.2, 0.3, 0.4]
     event = 2
@@ -205,6 +215,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     with self.cached_session():
       self.assertAlmostEqual(cdf_op.eval(), expected_cdf)
 
+  @test_util.run_deprecated_v1
   def testCDFBroadcasting(self):
     # shape: [batch=2, n_bins=3]
     histograms = [[0.2, 0.1, 0.7],
@@ -287,7 +298,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     }
 
     with self.cached_session() as sess:
-      run_result = sess.run(to_run)
+      run_result = self.evaluate(to_run)
 
     self.assertAllEqual(run_result["cat_prob"].shape,
                         run_result["norm_prob"].shape)
@@ -298,6 +309,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(run_result["cat_log_cdf"].shape,
                         run_result["norm_log_cdf"].shape)
 
+  @test_util.run_deprecated_v1
   def testLogPMF(self):
     logits = np.log([[0.2, 0.8], [0.6, 0.4]]) - 50.
     dist = categorical.Categorical(logits)
@@ -305,6 +317,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(dist.log_prob([0, 1]).eval(), np.log([0.2, 0.4]))
       self.assertAllClose(dist.log_prob([0.0, 1.0]).eval(), np.log([0.2, 0.4]))
 
+  @test_util.run_deprecated_v1
   def testEntropyNoBatch(self):
     logits = np.log([0.2, 0.8]) - 50.
     dist = categorical.Categorical(logits)
@@ -312,6 +325,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(dist.entropy().eval(),
                           -(0.2 * np.log(0.2) + 0.8 * np.log(0.8)))
 
+  @test_util.run_deprecated_v1
   def testEntropyWithBatch(self):
     logits = np.log([[0.2, 0.8], [0.6, 0.4]]) - 50.
     dist = categorical.Categorical(logits)
@@ -321,6 +335,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
           -(0.6 * np.log(0.6) + 0.4 * np.log(0.4))
       ])
 
+  @test_util.run_deprecated_v1
   def testEntropyGradient(self):
     with self.cached_session() as sess:
       logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])
@@ -355,7 +370,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       samples = dist.sample(n, seed=123)
       samples.set_shape([n, 1, 2])
       self.assertEqual(samples.dtype, dtypes.int32)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertFalse(np.any(sample_values < 0))
       self.assertFalse(np.any(sample_values > 1))
       self.assertAllClose(
@@ -371,7 +386,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
       samples = dist.sample((100, 100), seed=123)
       prob = dist.prob(samples)
-      prob_val = prob.eval()
+      prob_val = self.evaluate(prob)
       self.assertAllClose(
           [0.2**2 + 0.8**2], [prob_val[:, :, :, 0].mean()], atol=1e-2)
       self.assertAllClose(
@@ -393,26 +408,26 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
 
       prob = dist.prob(1)
-      self.assertAllClose([[0.8, 0.6]], prob.eval())
+      self.assertAllClose([[0.8, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([1])
-      self.assertAllClose([[0.8, 0.6]], prob.eval())
+      self.assertAllClose([[0.8, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([0, 1])
-      self.assertAllClose([[0.2, 0.6]], prob.eval())
+      self.assertAllClose([[0.2, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([[0, 1]])
-      self.assertAllClose([[0.2, 0.6]], prob.eval())
+      self.assertAllClose([[0.2, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([[[0, 1]]])
-      self.assertAllClose([[[0.2, 0.6]]], prob.eval())
+      self.assertAllClose([[[0.2, 0.6]]], self.evaluate(prob))
 
       prob = dist.prob([[1, 0], [0, 1]])
-      self.assertAllClose([[0.8, 0.4], [0.2, 0.6]], prob.eval())
+      self.assertAllClose([[0.8, 0.4], [0.2, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([[[1, 1], [1, 0]], [[1, 0], [0, 1]]])
       self.assertAllClose([[[0.8, 0.6], [0.8, 0.4]], [[0.8, 0.4], [0.2, 0.6]]],
-                          prob.eval())
+                          self.evaluate(prob))
 
   def testLogPMFShape(self):
     with self.cached_session():
@@ -440,12 +455,14 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(3, log_prob.get_shape().ndims)
     self.assertAllEqual([2, 2, 2], log_prob.get_shape())
 
+  @test_util.run_deprecated_v1
   def testMode(self):
     with self.cached_session():
       histograms = [[[0.2, 0.8], [0.6, 0.4]]]
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
       self.assertAllEqual(dist.mode().eval(), [[1, 0]])
 
+  @test_util.run_deprecated_v1
   def testCategoricalCategoricalKL(self):
 
     def np_softmax(logits):
@@ -462,7 +479,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
           b = categorical.Categorical(logits=b_logits)
 
           kl = kullback_leibler.kl_divergence(a, b)
-          kl_val = sess.run(kl)
+          kl_val = self.evaluate(kl)
           # Make sure KL(a||a) is 0
           kl_same = sess.run(kullback_leibler.kl_divergence(a, a))
 
diff --git a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
index d558ca09cc64b1337d2e5f47fc742282eaf7307f..c530037e1edc0437231cd5e968e48028cc4828ff 100644
--- a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import dirichlet_multinomial
@@ -36,6 +37,7 @@ class DirichletMultinomialTest(test.TestCase):
   def setUp(self):
     self._rng = np.random.RandomState(42)
 
+  @test_util.run_deprecated_v1
   def testSimpleShapes(self):
     with self.cached_session():
       alpha = np.random.rand(3)
@@ -45,6 +47,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([3]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([]), dist.batch_shape)
 
+  @test_util.run_deprecated_v1
   def testComplexShapes(self):
     with self.cached_session():
       alpha = np.random.rand(3, 2, 2)
@@ -55,6 +58,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([2]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([3, 2]), dist.batch_shape)
 
+  @test_util.run_deprecated_v1
   def testNproperty(self):
     alpha = [[1., 2, 3]]
     n = [[5.]]
@@ -63,6 +67,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertEqual([1, 1], dist.total_count.get_shape())
       self.assertAllClose(n, dist.total_count.eval())
 
+  @test_util.run_deprecated_v1
   def testAlphaProperty(self):
     alpha = [[1., 2, 3]]
     with self.cached_session():
@@ -70,6 +75,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertEqual([1, 3], dist.concentration.get_shape())
       self.assertAllClose(alpha, dist.concentration.eval())
 
+  @test_util.run_deprecated_v1
   def testPmfNandCountsAgree(self):
     alpha = [[1., 2, 3]]
     n = [[5.]]
@@ -83,6 +89,7 @@ class DirichletMultinomialTest(test.TestCase):
           "last-dimension must sum to `self.total_count`"):
         dist.prob([3., 3, 0]).eval()
 
+  @test_util.run_deprecated_v1
   def testPmfNonIntegerCounts(self):
     alpha = [[1., 2, 3]]
     n = [[5.]]
@@ -110,7 +117,7 @@ class DirichletMultinomialTest(test.TestCase):
       counts = [1., 0]
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(1 / 3., pmf.eval())
+      self.assertAllClose(1 / 3., self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesNontrivialN(self):
@@ -122,7 +129,7 @@ class DirichletMultinomialTest(test.TestCase):
       counts = [3., 2]
       dist = ds.DirichletMultinomial(5., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(1 / 7., pmf.eval())
+      self.assertAllClose(1 / 7., self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesMultidimensionalN(self):
@@ -134,7 +141,7 @@ class DirichletMultinomialTest(test.TestCase):
       n = np.full([4, 3], 5., dtype=np.float32)
       dist = ds.DirichletMultinomial(n, alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose([[1 / 7., 1 / 7., 1 / 7.]] * 4, pmf.eval())
+      self.assertAllClose([[1 / 7., 1 / 7., 1 / 7.]] * 4, self.evaluate(pmf))
       self.assertEqual((4, 3), pmf.get_shape())
 
   def testPmfAlphaStretchedInBroadcastWhenSameRank(self):
@@ -145,7 +152,7 @@ class DirichletMultinomialTest(test.TestCase):
       counts = [[1., 0], [0., 1]]
       dist = ds.DirichletMultinomial([1.], alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose([1 / 3., 2 / 3.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 3.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfAlphaStretchedInBroadcastWhenLowerRank(self):
@@ -155,7 +162,7 @@ class DirichletMultinomialTest(test.TestCase):
       alpha = [1., 2]
       counts = [[1., 0], [0., 1]]
       pmf = ds.DirichletMultinomial(1., alpha).prob(counts)
-      self.assertAllClose([1 / 3., 2 / 3.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 3.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfCountsStretchedInBroadcastWhenSameRank(self):
@@ -165,7 +172,7 @@ class DirichletMultinomialTest(test.TestCase):
       alpha = [[1., 2], [2., 3]]
       counts = [[1., 0]]
       pmf = ds.DirichletMultinomial([1., 1.], alpha).prob(counts)
-      self.assertAllClose([1 / 3., 2 / 5.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 5.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfCountsStretchedInBroadcastWhenLowerRank(self):
@@ -175,9 +182,10 @@ class DirichletMultinomialTest(test.TestCase):
       alpha = [[1., 2], [2., 3]]
       counts = [1., 0]
       pmf = ds.DirichletMultinomial(1., alpha).prob(counts)
-      self.assertAllClose([1 / 3., 2 / 5.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 5.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
+  @test_util.run_deprecated_v1
   def testPmfForOneVoteIsTheMeanWithOneRecordInput(self):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
@@ -194,6 +202,7 @@ class DirichletMultinomialTest(test.TestCase):
         self.assertAllEqual([3], mean.shape)
         self.assertAllEqual([], pmf.shape)
 
+  @test_util.run_deprecated_v1
   def testMeanDoubleTwoVotes(self):
     # The probabilities of two votes falling into class k for
     # DirichletMultinomial(2, alpha) is twice as much as the probability of one
@@ -215,6 +224,7 @@ class DirichletMultinomialTest(test.TestCase):
         self.assertAllClose(mean2[class_num], 2 * mean1[class_num])
         self.assertAllEqual([3], mean1.shape)
 
+  @test_util.run_deprecated_v1
   def testCovarianceFromSampling(self):
     # We will test mean, cov, var, stddev on a DirichletMultinomial constructed
     # via broadcast between alpha, n.
@@ -289,7 +299,7 @@ class DirichletMultinomialTest(test.TestCase):
         expected_covariance = n * (n + alpha_0) / (1 + alpha_0) * shared_matrix
 
         self.assertEqual([2, 2], covariance.get_shape())
-        self.assertAllClose(expected_covariance, covariance.eval())
+        self.assertAllClose(expected_covariance, self.evaluate(covariance))
 
   def testCovarianceNAlphaBroadcast(self):
     alpha_v = [1., 2, 3]
@@ -327,7 +337,7 @@ class DirichletMultinomialTest(test.TestCase):
           ns * (ns + alpha_0) / (1 + alpha_0))[..., array_ops.newaxis]
 
       self.assertEqual([4, 3, 3], covariance.get_shape())
-      self.assertAllClose(expected_covariance, covariance.eval())
+      self.assertAllClose(expected_covariance, self.evaluate(covariance))
 
   def testCovarianceMultidimensional(self):
     alpha = np.random.rand(3, 5, 4).astype(np.float32)
@@ -353,7 +363,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(0., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(1.0, pmf.eval())
+      self.assertAllClose(1.0, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testLargeTauGivesPreciseProbabilities(self):
@@ -368,7 +378,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(0.8, pmf.eval(), atol=1e-4)
+      self.assertAllClose(0.8, self.evaluate(pmf), atol=1e-4)
       self.assertEqual((), pmf.get_shape())
 
     # Two (three sided) coin flips.  Prob[coin 3] = 0.8.
@@ -376,7 +386,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(2., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(0.8**2, pmf.eval(), atol=1e-2)
+      self.assertAllClose(0.8**2, self.evaluate(pmf), atol=1e-2)
       self.assertEqual((), pmf.get_shape())
 
     # Three (three sided) coin flips.
@@ -384,7 +394,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(3., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(3 * 0.1 * 0.8 * 0.8, pmf.eval(), atol=1e-2)
+      self.assertAllClose(3 * 0.1 * 0.8 * 0.8, self.evaluate(pmf), atol=1e-2)
       self.assertEqual((), pmf.get_shape())
 
   def testSmallTauPrefersCorrelatedResults(self):
@@ -399,7 +409,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(0.5, pmf.eval())
+      self.assertAllClose(0.5, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
     # If there are two draws, it is much more likely that they are the same.
@@ -409,9 +419,10 @@ class DirichletMultinomialTest(test.TestCase):
       dist = ds.DirichletMultinomial(2., alpha)
       pmf_same = dist.prob(counts_same)
       pmf_different = dist.prob(counts_different)
-      self.assertLess(5 * pmf_different.eval(), pmf_same.eval())
+      self.assertLess(5 * self.evaluate(pmf_different), self.evaluate(pmf_same))
       self.assertEqual((), pmf_same.get_shape())
 
+  @test_util.run_deprecated_v1
   def testNonStrictTurnsOffAllChecks(self):
     # Make totally invalid input.
     with self.cached_session():
@@ -421,6 +432,7 @@ class DirichletMultinomialTest(test.TestCase):
       dist = ds.DirichletMultinomial(n, alpha, validate_args=False)
       dist.prob(counts).eval()  # Should not raise.
 
+  @test_util.run_deprecated_v1
   def testSampleUnbiasedNonScalarBatch(self):
     with self.cached_session() as sess:
       dist = ds.DirichletMultinomial(
@@ -450,6 +462,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertAllClose(
           actual_covariance_, sample_covariance_, atol=0., rtol=0.20)
 
+  @test_util.run_deprecated_v1
   def testSampleUnbiasedScalarBatch(self):
     with self.cached_session() as sess:
       dist = ds.DirichletMultinomial(
diff --git a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
index e35a8e1cdd7087dbf0ce7520412b4f773468c9e5..62b562387d0ebfbb895f4602e24c8af823f0bb4f 100644
--- a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops.distributions import bijector_test_util
 from tensorflow.python.ops.distributions import identity_bijector
 from tensorflow.python.platform import test
@@ -41,6 +42,7 @@ class IdentityBijectorTest(test.TestCase):
         self.evaluate(
             bijector.forward_log_det_jacobian(x, event_ndims=3)))
 
+  @test_util.run_deprecated_v1
   def testScalarCongruency(self):
     with self.cached_session():
       bijector = identity_bijector.Identity()
diff --git a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
index e77e1117d493511748dea2dc1aff46ea8e7658e6..1e967de570f2fa012c84be50d8ecdf9a49a89dc3 100644
--- a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
+++ b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import normal
@@ -45,6 +46,7 @@ class KLTest(test.TestCase):
     a = MyDist(loc=0.0, scale=1.0)
     self.assertEqual("OK", kullback_leibler.kl_divergence(a, a, name="OK"))
 
+  @test_util.run_deprecated_v1
   def testDomainErrorExceptions(self):
 
     class MyDistException(normal.Normal):
@@ -63,17 +65,17 @@ class KLTest(test.TestCase):
       kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
-        kl.eval()
+        self.evaluate(kl)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
         a.kl_divergence(a).eval()
       a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=True)
       kl_ok = kullback_leibler.kl_divergence(a, a)
-      self.assertAllEqual([float("nan")], kl_ok.eval())
+      self.assertAllEqual([float("nan")], self.evaluate(kl_ok))
       self_kl_ok = a.kl_divergence(a)
-      self.assertAllEqual([float("nan")], self_kl_ok.eval())
+      self.assertAllEqual([float("nan")], self.evaluate(self_kl_ok))
       cross_ok = a.cross_entropy(a)
-      self.assertAllEqual([float("nan")], cross_ok.eval())
+      self.assertAllEqual([float("nan")], self.evaluate(cross_ok))
 
   def testRegistrationFailures(self):
 
diff --git a/tensorflow/python/kernel_tests/distributions/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
index 3840d7331cacf588218e3c7dfea85662d545a13a..187ddd4cf417a54acbdd7bcd5fc60459336f11c9 100644
--- a/tensorflow/python/kernel_tests/distributions/multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import multinomial
@@ -33,6 +34,7 @@ class MultinomialTest(test.TestCase):
   def setUp(self):
     self._rng = np.random.RandomState(42)
 
+  @test_util.run_v1_only("b/120545219")
   def testSimpleShapes(self):
     with self.cached_session():
       p = [.1, .3, .6]
@@ -42,6 +44,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([3]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([]), dist.batch_shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testComplexShapes(self):
     with self.cached_session():
       p = 0.5 * np.ones([3, 2, 2], dtype=np.float32)
@@ -52,6 +55,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([2]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([3, 2]), dist.batch_shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testN(self):
     p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]]
     n = [[3.], [4]]
@@ -60,6 +64,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((2, 1), dist.total_count.get_shape())
       self.assertAllClose(n, dist.total_count.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testP(self):
     p = [[0.1, 0.2, 0.7]]
     with self.cached_session():
@@ -68,6 +73,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((1, 3), dist.logits.get_shape())
       self.assertAllClose(p, dist.probs.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testLogits(self):
     p = np.array([[0.1, 0.2, 0.7]], dtype=np.float32)
     logits = np.log(p) - 50.
@@ -78,6 +84,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(p, multinom.probs.eval())
       self.assertAllClose(logits, multinom.logits.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfUnderflow(self):
     logits = np.array([[-200, 0]], dtype=np.float32)
     with self.cached_session():
@@ -85,6 +92,7 @@ class MultinomialTest(test.TestCase):
       lp = dist.log_prob([1., 0.]).eval()[0]
       self.assertAllClose(-200, lp, atol=0, rtol=1e-6)
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfandCountsAgree(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
@@ -97,6 +105,7 @@ class MultinomialTest(test.TestCase):
       with self.assertRaisesOpError("counts must sum to `self.total_count`"):
         dist.prob([3., 3, 0]).eval()
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfNonIntegerCounts(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
@@ -127,7 +136,7 @@ class MultinomialTest(test.TestCase):
       p = [0.5, 0.5]
       counts = [1., 0]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
-      self.assertAllClose(0.5, pmf.eval())
+      self.assertAllClose(0.5, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesNontrivialN(self):
@@ -138,7 +147,7 @@ class MultinomialTest(test.TestCase):
       dist = multinomial.Multinomial(total_count=5., probs=p)
       pmf = dist.prob(counts)
       # 5 choose 3 = 5 choose 2 = 10. 10 * (.9)^2 * (.1)^3 = 81/10000.
-      self.assertAllClose(81. / 10000, pmf.eval())
+      self.assertAllClose(81. / 10000, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfPStretchedInBroadcastWhenSameRank(self):
@@ -146,7 +155,7 @@ class MultinomialTest(test.TestCase):
       p = [[0.1, 0.9]]
       counts = [[1., 0], [0, 1]]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
-      self.assertAllClose([0.1, 0.9], pmf.eval())
+      self.assertAllClose([0.1, 0.9], self.evaluate(pmf))
       self.assertEqual((2), pmf.get_shape())
 
   def testPmfPStretchedInBroadcastWhenLowerRank(self):
@@ -154,9 +163,10 @@ class MultinomialTest(test.TestCase):
       p = [0.1, 0.9]
       counts = [[1., 0], [0, 1]]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
-      self.assertAllClose([0.1, 0.9], pmf.eval())
+      self.assertAllClose([0.1, 0.9], self.evaluate(pmf))
       self.assertEqual((2), pmf.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfCountsStretchedInBroadcastWhenSameRank(self):
     with self.cached_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
@@ -165,6 +175,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(pmf.eval(), [0.1, 0.7])
       self.assertEqual((2), pmf.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfCountsStretchedInBroadcastWhenLowerRank(self):
     with self.cached_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
@@ -182,7 +193,7 @@ class MultinomialTest(test.TestCase):
       # [2]
       counts = [2., 1]
       pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
-      pmf.eval()
+      self.evaluate(pmf)
       self.assertEqual(pmf.get_shape(), (2, 2))
 
   def testPmfShapeCountsPStretchedN(self):
@@ -191,9 +202,10 @@ class MultinomialTest(test.TestCase):
       counts = [3., 2]
       n = np.full([4, 3], 5., dtype=np.float32)
       pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
-      pmf.eval()
+      self.evaluate(pmf)
       self.assertEqual((4, 3), pmf.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultinomialMean(self):
     with self.cached_session():
       n = 5.
@@ -203,6 +215,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((3,), dist.mean().get_shape())
       self.assertAllClose(expected_means, dist.mean().eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultinomialCovariance(self):
     with self.cached_session():
       n = 5.
@@ -214,6 +227,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((3, 3), dist.covariance().get_shape())
       self.assertAllClose(expected_covariances, dist.covariance().eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultinomialCovarianceBatch(self):
     with self.cached_session():
       # Shape [2]
@@ -246,6 +260,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((3, 5, 4, 4), covariance.get_shape())
       self.assertEqual((6, 3, 3, 3), covariance2.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testCovarianceFromSampling(self):
     # We will test mean, cov, var, stddev on a DirichletMultinomial constructed
     # via broadcast between alpha, n.
@@ -288,6 +303,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(sample_var_, analytic_var, atol=0.01, rtol=0.01)
       self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.01, rtol=0.01)
 
+  @test_util.run_v1_only("b/120545219")
   def testSampleUnbiasedNonScalarBatch(self):
     with self.cached_session() as sess:
       dist = multinomial.Multinomial(
@@ -317,6 +333,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(
           actual_covariance_, sample_covariance_, atol=0., rtol=0.20)
 
+  @test_util.run_v1_only("b/120545219")
   def testSampleUnbiasedScalarBatch(self):
     with self.cached_session() as sess:
       dist = multinomial.Multinomial(
diff --git a/tensorflow/python/kernel_tests/distributions/normal_test.py b/tensorflow/python/kernel_tests/distributions/normal_test.py
index 6625a88843f1ca489799bd19172db437d965a182..f2a193e69bd4393bda3817a45f7a27db70c73115 100644
--- a/tensorflow/python/kernel_tests/distributions/normal_test.py
+++ b/tensorflow/python/kernel_tests/distributions/normal_test.py
@@ -511,6 +511,7 @@ class NormalTest(test.TestCase):
     self.assertAllEqual(self.evaluate(normal.event_shape_tensor()), [])
     self.assertEqual(normal.event_shape, tensor_shape.TensorShape([]))
 
+  @test_util.run_deprecated_v1
   def testNormalShapeWithPlaceholders(self):
     mu = array_ops.placeholder(dtype=dtypes.float32)
     sigma = array_ops.placeholder(dtype=dtypes.float32)
diff --git a/tensorflow/python/kernel_tests/distributions/special_math_test.py b/tensorflow/python/kernel_tests/distributions/special_math_test.py
index cc43e12168697c4f5a0cda48896b3d7d3c108ae4..d97fcfa655f2728b04ee0a2eb7ed71ef07ea1fbf 100644
--- a/tensorflow/python/kernel_tests/distributions/special_math_test.py
+++ b/tensorflow/python/kernel_tests/distributions/special_math_test.py
@@ -104,6 +104,7 @@ class NdtriTest(test.TestCase):
     x = special_math.ndtri(p)
     self.assertAllClose(expected_x, self.evaluate(x), atol=0.)
 
+  @test_util.run_deprecated_v1
   def testNdtriDynamicShape(self):
     """Verifies that ndtri computation is correct."""
     with self.cached_session() as sess:
@@ -213,9 +214,11 @@ class NdtrTest(test.TestCase):
         rtol=error_spec.rtol,
         atol=error_spec.atol)
 
+  @test_util.run_deprecated_v1
   def test_float32(self):
     self._test_grid(np.float32, self._grid32, self._error32)
 
+  @test_util.run_deprecated_v1
   def test_float64(self):
     self._test_grid(np.float64, self._grid64, self._error64)
 
@@ -338,10 +341,12 @@ class NdtrGradientTest(test.TestCase):
           rtol=error_spec.rtol,
           atol=error_spec.atol)
 
+  @test_util.run_deprecated_v1
   def test_float32(self):
     self._test_grad_accuracy(np.float32, self._grid, self._error32)
     self._test_grad_finite(np.float32)
 
+  @test_util.run_deprecated_v1
   def test_float64(self):
     self._test_grad_accuracy(np.float64, self._grid, self._error64)
     self._test_grad_finite(np.float64)
@@ -362,7 +367,7 @@ class ErfInvTest(test.TestCase):
 
       expected_x = special.erfinv(x)
       x = special_math.erfinv(x)
-      self.assertAllClose(expected_x, x.eval(), atol=0.)
+      self.assertAllClose(expected_x, self.evaluate(x), atol=0.)
 
   def testErfInvIntegerInput(self):
     with self.cached_session():
@@ -418,6 +423,7 @@ class LogCDFLaplaceTest(test.TestCase):
           rtol=error_spec.rtol,
           atol=error_spec.atol)
 
+  @test_util.run_deprecated_v1
   def test_float32_lower_and_mid_segment_scipy_float32_ok(self):
     # Choose values mild enough that we can use scipy in float32, which will
     # allow for a high accuracy match to scipy (since we both use float32).
@@ -427,6 +433,7 @@ class LogCDFLaplaceTest(test.TestCase):
         GridSpec(min=-10, max=self.CUTOFF_FLOAT32_UPPER - 5, shape=[100]),
         ErrorSpec(rtol=5e-4, atol=0))
 
+  @test_util.run_deprecated_v1
   def test_float32_all_segments_with_scipy_float64_ok(self):
     # Choose values outside the range where scipy float32 works.
     # Let scipy use float64.  This means we
@@ -437,6 +444,7 @@ class LogCDFLaplaceTest(test.TestCase):
         GridSpec(min=-50, max=self.CUTOFF_FLOAT32_UPPER + 5, shape=[100]),
         ErrorSpec(rtol=0.05, atol=0))
 
+  @test_util.run_deprecated_v1
   def test_float32_extreme_values_result_and_gradient_finite_and_nonzero(self):
     with self.cached_session() as sess:
       # On the lower branch, log_cdf_laplace(x) = x, so we know this will be
@@ -448,7 +456,7 @@ class LogCDFLaplaceTest(test.TestCase):
       actual = sm.log_cdf_laplace(grid)
       grad = gradients_impl.gradients(actual, grid)[0]
 
-      actual_, grad_ = sess.run([actual, grad])
+      actual_, grad_ = self.evaluate([actual, grad])
 
       # isfinite checks for NaN and Inf.
       self.assertAllTrue(np.isfinite(actual_))
@@ -456,6 +464,7 @@ class LogCDFLaplaceTest(test.TestCase):
       self.assertFalse(np.any(actual_ == 0))
       self.assertFalse(np.any(grad_ == 0))
 
+  @test_util.run_deprecated_v1
   def test_float64_extreme_values_result_and_gradient_finite_and_nonzero(self):
     with self.cached_session() as sess:
       # On the lower branch, log_cdf_laplace(x) = x, so we know this will be
@@ -467,7 +476,7 @@ class LogCDFLaplaceTest(test.TestCase):
       actual = sm.log_cdf_laplace(grid)
       grad = gradients_impl.gradients(actual, grid)[0]
 
-      actual_, grad_ = sess.run([actual, grad])
+      actual_, grad_ = self.evaluate([actual, grad])
 
       # isfinite checks for NaN and Inf.
       self.assertAllTrue(np.isfinite(actual_))
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index 27d652c2c6251f8179139f89a41ec5d1c7f46810..030ad601bf4754ebda7b896b14051440adc170d2 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -59,6 +59,7 @@ def _logit(x):
 
 class AssertCloseTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAssertIntegerForm(self):
     # This should only be detected as an integer.
     x = array_ops.placeholder(dtypes.float32)
@@ -112,6 +113,7 @@ class MaybeGetStaticTest(test.TestCase):
     self.assertAllClose(
         np.array(2.), du.maybe_get_static_value(x, dtype=np.float64))
 
+  @test_util.run_deprecated_v1
   def testGetStaticPlaceholder(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
     self.assertEqual(None, du.maybe_get_static_value(x))
@@ -235,6 +237,7 @@ class GetLogitsAndProbsTest(test.TestCase):
         probs=p4, multidimensional=True, validate_args=False)
     self.evaluate(prob)
 
+  @test_util.run_deprecated_v1
   def testProbsMultidimShape(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -249,6 +252,7 @@ class GetLogitsAndProbsTest(test.TestCase):
             probs=p, multidimensional=True, validate_args=True)
         prob.eval(feed_dict={p: np.ones([int(2**11+1)])})
 
+  @test_util.run_deprecated_v1
   def testLogitsMultidimShape(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -266,6 +270,7 @@ class GetLogitsAndProbsTest(test.TestCase):
 
 class EmbedCheckCategoricalEventShapeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testTooSmall(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -280,6 +285,7 @@ class EmbedCheckCategoricalEventShapeTest(test.TestCase):
             param)
         checked_param.eval(feed_dict={param: np.ones([1])})
 
+  @test_util.run_deprecated_v1
   def testTooLarge(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -305,6 +311,7 @@ class EmbedCheckCategoricalEventShapeTest(test.TestCase):
 
 class EmbedCheckIntegerCastingClosedTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCorrectlyAssertsNonnegative(self):
     with self.cached_session():
       with self.assertRaisesOpError("Elements must be non-negative"):
@@ -313,6 +320,7 @@ class EmbedCheckIntegerCastingClosedTest(test.TestCase):
             x, target_dtype=dtypes.int16)
         x_checked.eval(feed_dict={x: np.array([1, -1], dtype=np.float16)})
 
+  @test_util.run_deprecated_v1
   def testCorrectlyAssersIntegerForm(self):
     with self.cached_session():
       with self.assertRaisesOpError("Elements must be int16-equivalent."):
@@ -321,6 +329,7 @@ class EmbedCheckIntegerCastingClosedTest(test.TestCase):
             x, target_dtype=dtypes.int16)
         x_checked.eval(feed_dict={x: np.array([1, 1.5], dtype=np.float16)})
 
+  @test_util.run_deprecated_v1
   def testCorrectlyAssertsLargestPossibleInteger(self):
     with self.cached_session():
       with self.assertRaisesOpError("Elements cannot exceed 32767."):
@@ -329,6 +338,7 @@ class EmbedCheckIntegerCastingClosedTest(test.TestCase):
             x, target_dtype=dtypes.int16)
         x_checked.eval(feed_dict={x: np.array([1, 2**15], dtype=np.int32)})
 
+  @test_util.run_deprecated_v1
   def testCorrectlyAssertsSmallestPossibleInteger(self):
     with self.cached_session():
       with self.assertRaisesOpError("Elements cannot be smaller than 0."):
@@ -369,6 +379,7 @@ class LogCombinationsTest(test.TestCase):
 
 class DynamicShapeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSameDynamicShape(self):
     with self.cached_session():
       scalar = constant_op.constant(2.0)
@@ -493,6 +504,7 @@ class RotateTransposeTest(test.TestCase):
             self._np_rotate_transpose(x, shift), self.evaluate(y))
         self.assertAllEqual(np.roll(x.shape, shift), y.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testRollDynamic(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
@@ -511,6 +523,7 @@ class RotateTransposeTest(test.TestCase):
 
 class PickVectorTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCorrectlyPicksVector(self):
     with self.cached_session():
       x = np.arange(10, 12)
@@ -529,36 +542,42 @@ class PickVectorTest(test.TestCase):
 
 class PreferStaticRankTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNonEmptyConstantTensor(self):
     x = array_ops.zeros((2, 3, 4))
     rank = du.prefer_static_rank(x)
     self.assertIsInstance(rank, np.ndarray)
     self.assertEqual(3, rank)
 
+  @test_util.run_deprecated_v1
   def testEmptyConstantTensor(self):
     x = constant_op.constant([])
     rank = du.prefer_static_rank(x)
     self.assertIsInstance(rank, np.ndarray)
     self.assertEqual(1, rank)
 
+  @test_util.run_deprecated_v1
   def testScalarTensor(self):
     x = constant_op.constant(1.)
     rank = du.prefer_static_rank(x)
     self.assertIsInstance(rank, np.ndarray)
     self.assertEqual(0, rank)
 
+  @test_util.run_deprecated_v1
   def testDynamicRankEndsUpBeingNonEmpty(self):
     x = array_ops.placeholder(np.float64, shape=None)
     rank = du.prefer_static_rank(x)
     with self.cached_session():
       self.assertAllEqual(2, rank.eval(feed_dict={x: np.zeros((2, 3))}))
 
+  @test_util.run_deprecated_v1
   def testDynamicRankEndsUpBeingEmpty(self):
     x = array_ops.placeholder(np.int32, shape=None)
     rank = du.prefer_static_rank(x)
     with self.cached_session():
       self.assertAllEqual(1, rank.eval(feed_dict={x: []}))
 
+  @test_util.run_deprecated_v1
   def testDynamicRankEndsUpBeingScalar(self):
     x = array_ops.placeholder(np.int32, shape=None)
     rank = du.prefer_static_rank(x)
@@ -568,36 +587,42 @@ class PreferStaticRankTest(test.TestCase):
 
 class PreferStaticShapeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNonEmptyConstantTensor(self):
     x = array_ops.zeros((2, 3, 4))
     shape = du.prefer_static_shape(x)
     self.assertIsInstance(shape, np.ndarray)
     self.assertAllEqual(np.array([2, 3, 4]), shape)
 
+  @test_util.run_deprecated_v1
   def testEmptyConstantTensor(self):
     x = constant_op.constant([])
     shape = du.prefer_static_shape(x)
     self.assertIsInstance(shape, np.ndarray)
     self.assertAllEqual(np.array([0]), shape)
 
+  @test_util.run_deprecated_v1
   def testScalarTensor(self):
     x = constant_op.constant(1.)
     shape = du.prefer_static_shape(x)
     self.assertIsInstance(shape, np.ndarray)
     self.assertAllEqual(np.array([]), shape)
 
+  @test_util.run_deprecated_v1
   def testDynamicShapeEndsUpBeingNonEmpty(self):
     x = array_ops.placeholder(np.float64, shape=None)
     shape = du.prefer_static_shape(x)
     with self.cached_session():
       self.assertAllEqual((2, 3), shape.eval(feed_dict={x: np.zeros((2, 3))}))
 
+  @test_util.run_deprecated_v1
   def testDynamicShapeEndsUpBeingEmpty(self):
     x = array_ops.placeholder(np.int32, shape=None)
     shape = du.prefer_static_shape(x)
     with self.cached_session():
       self.assertAllEqual(np.array([0]), shape.eval(feed_dict={x: []}))
 
+  @test_util.run_deprecated_v1
   def testDynamicShapeEndsUpBeingScalar(self):
     x = array_ops.placeholder(np.int32, shape=None)
     shape = du.prefer_static_shape(x)
@@ -607,24 +632,28 @@ class PreferStaticShapeTest(test.TestCase):
 
 class PreferStaticValueTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNonEmptyConstantTensor(self):
     x = array_ops.zeros((2, 3, 4))
     value = du.prefer_static_value(x)
     self.assertIsInstance(value, np.ndarray)
     self.assertAllEqual(np.zeros((2, 3, 4)), value)
 
+  @test_util.run_deprecated_v1
   def testEmptyConstantTensor(self):
     x = constant_op.constant([])
     value = du.prefer_static_value(x)
     self.assertIsInstance(value, np.ndarray)
     self.assertAllEqual(np.array([]), value)
 
+  @test_util.run_deprecated_v1
   def testScalarTensor(self):
     x = constant_op.constant(1.)
     value = du.prefer_static_value(x)
     self.assertIsInstance(value, np.ndarray)
     self.assertAllEqual(np.array(1.), value)
 
+  @test_util.run_deprecated_v1
   def testDynamicValueEndsUpBeingNonEmpty(self):
     x = array_ops.placeholder(np.float64, shape=None)
     value = du.prefer_static_value(x)
@@ -632,12 +661,14 @@ class PreferStaticValueTest(test.TestCase):
       self.assertAllEqual(np.zeros((2, 3)),
                           value.eval(feed_dict={x: np.zeros((2, 3))}))
 
+  @test_util.run_deprecated_v1
   def testDynamicValueEndsUpBeingEmpty(self):
     x = array_ops.placeholder(np.int32, shape=None)
     value = du.prefer_static_value(x)
     with self.cached_session():
       self.assertAllEqual(np.array([]), value.eval(feed_dict={x: []}))
 
+  @test_util.run_deprecated_v1
   def testDynamicValueEndsUpBeingScalar(self):
     x = array_ops.placeholder(np.int32, shape=None)
     value = du.prefer_static_value(x)
@@ -698,43 +729,55 @@ class FillTriangularTest(test.TestCase):
     self.assertAllClose(expected, actual_, rtol=1e-8, atol=1e-9)
     self.assertAllClose(x_, grad_actual_, rtol=1e-8, atol=1e-9)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakes1x1TriLower(self):
     self._run_test(self._rng.randn(3, int(1*2/2)))
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesNoBatchTriLower(self):
     self._run_test(self._rng.randn(int(4*5/2)))
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatchTriLower(self):
     self._run_test(self._rng.randn(2, 3, int(3*4/2)))
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatchTriLowerUnknownShape(self):
     self._run_test(self._rng.randn(2, 3, int(3*4/2)), use_deferred_shape=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatch7x7TriLowerUnknownShape(self):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)), use_deferred_shape=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatch7x7TriLower(self):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)))
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakes1x1TriUpper(self):
     self._run_test(self._rng.randn(3, int(1*2/2)), upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesNoBatchTriUpper(self):
     self._run_test(self._rng.randn(int(4*5/2)), upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatchTriUpper(self):
     self._run_test(self._rng.randn(2, 2, int(3*4/2)), upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatchTriUpperUnknownShape(self):
     self._run_test(self._rng.randn(2, 2, int(3*4/2)),
                    use_deferred_shape=True,
                    upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatch7x7TriUpperUnknownShape(self):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)),
                    use_deferred_shape=True,
                    upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatch7x7TriUpper(self):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)), upper=True)
 
@@ -773,6 +816,7 @@ class ReduceWeightedLogSumExp(test.TestCase):
       m = np.squeeze(m, axis=axis)
     return m + np.log(sgn * sum_), sgn
 
+  @test_util.run_deprecated_v1
   def testNoWeights(self):
     logx_ = np.array([[0., -1, 1000.],
                       [0, 1, -1000.],
@@ -805,7 +849,7 @@ class ReduceWeightedLogSumExp(test.TestCase):
       w = constant_op.constant(w_)
       actual, actual_sgn = du.reduce_weighted_logsumexp(
           logx, w, axis=-1, return_sign=True)
-      [actual_, actual_sgn_] = sess.run([actual, actual_sgn])
+      [actual_, actual_sgn_] = self.evaluate([actual, actual_sgn])
     self.assertAllEqual(expected, actual_)
     self.assertAllEqual([-1., -1, 1], actual_sgn_)
 
@@ -823,7 +867,7 @@ class ReduceWeightedLogSumExp(test.TestCase):
       w = constant_op.constant(w_)
       actual, actual_sgn = du.reduce_weighted_logsumexp(
           logx, w, axis=-1, return_sign=True, keep_dims=True)
-      [actual_, actual_sgn_] = sess.run([actual, actual_sgn])
+      [actual_, actual_sgn_] = self.evaluate([actual, actual_sgn])
     self.assertAllEqual(expected, actual_)
     self.assertAllEqual([[-1.], [-1], [1]], actual_sgn_)
 
@@ -879,7 +923,7 @@ class SoftplusTest(test.TestCase):
   def _testSoftplus(self, np_features, use_gpu=False):
     np_features = np.asarray(np_features)
     np_softplus = self._npSoftplus(np_features)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.session(use_gpu=use_gpu) as sess:
       softplus = nn_ops.softplus(np_features)
       softplus_inverse = du.softplus_inverse(softplus)
       [tf_softplus, tf_softplus_inverse] = sess.run([
@@ -903,6 +947,7 @@ class SoftplusTest(test.TestCase):
     self.assertAllEqual(np.ones_like(tf_softplus_inverse).astype(np.bool),
                         np.isfinite(tf_softplus_inverse))
 
+  @test_util.run_deprecated_v1
   def testNumbers(self):
     for t in [np.float16, np.float32, np.float64]:
       lower = {np.float16: -15, np.float32: -50, np.float64: -50}.get(t, -100)
@@ -933,6 +978,7 @@ class SoftplusTest(test.TestCase):
           ],
           use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -949,6 +995,7 @@ class SoftplusTest(test.TestCase):
     tf_logging.vlog(2, "softplus (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testInverseSoftplusGradientNeverNan(self):
     with self.cached_session():
       # Note that this range contains both zero and inf.
@@ -958,6 +1005,7 @@ class SoftplusTest(test.TestCase):
       # Equivalent to `assertAllFalse` (if it existed).
       self.assertAllEqual(np.zeros_like(grads).astype(np.bool), np.isnan(grads))
 
+  @test_util.run_deprecated_v1
   def testInverseSoftplusGradientFinite(self):
     with self.cached_session():
       # This range of x is all finite, and so is 1 / x.  So the
diff --git a/tensorflow/python/kernel_tests/division_future_test.py b/tensorflow/python/kernel_tests/division_future_test.py
index e477bdc73b90eb104011f476fcfa9b4cf39a628a..85c85809d3f96a22b7994bedef34b10b700a2815 100644
--- a/tensorflow/python/kernel_tests/division_future_test.py
+++ b/tensorflow/python/kernel_tests/division_future_test.py
@@ -65,7 +65,7 @@ class DivisionTestCase(test.TestCase):
                 tf_floordiv = tf_x // tf_y
                 check(floordiv, tf_floordiv)
       # Do only one sess.run for speed
-      for f, (x, y) in zip(checks, sess.run(tensors)):
+      for f, (x, y) in zip(checks, self.evaluate(tensors)):
         f(x, y)
 
 
diff --git a/tensorflow/python/kernel_tests/division_past_test.py b/tensorflow/python/kernel_tests/division_past_test.py
index 63951b5b382947ff17029bc7b7062cf5808f220e..38bb18631ab7be4e191ceca801e8d68b0c3bdd61 100644
--- a/tensorflow/python/kernel_tests/division_past_test.py
+++ b/tensorflow/python/kernel_tests/division_past_test.py
@@ -64,7 +64,7 @@ class DivisionTestCase(test.TestCase):
                 tf_floordiv = tf_x // tf_y
                 check(floordiv, tf_floordiv)
       # Do only one sess.run for speed
-      for f, (x, y) in zip(checks, sess.run(tensors)):
+      for f, (x, y) in zip(checks, self.evaluate(tensors)):
         f(x, y)
 
 
diff --git a/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py b/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py
index 4f5b854e6f6825659048f46c922e24134bc73613..6aa757e293ef69040266d194aef85370b86e5b2b 100644
--- a/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py
+++ b/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py
@@ -86,8 +86,8 @@ class DrawBoundingBoxOpTest(test.TestCase):
       image = image_ops_impl.convert_image_dtype(image, dtypes.float32)
       image = array_ops.expand_dims(image, 0)
       image = image_ops.draw_bounding_boxes(image, bboxes)
-      with self.test_session(use_gpu=False) as sess:
-        op_drawn_image = np.squeeze(sess.run(image), 0)
+      with self.cached_session(use_gpu=False) as sess:
+        op_drawn_image = np.squeeze(self.evaluate(image), 0)
         self.assertAllEqual(test_drawn_image, op_drawn_image)
 
   def testDrawBoundingBoxRGBColorCycling(self):
diff --git a/tensorflow/python/kernel_tests/duplicate_op_test.py b/tensorflow/python/kernel_tests/duplicate_op_test.py
index 654267a58252060db890891cc4a0d7f8d0b2afdd..fef3127d4a84e5be59bb5a8e50dd60944fe57606 100644
--- a/tensorflow/python/kernel_tests/duplicate_op_test.py
+++ b/tensorflow/python/kernel_tests/duplicate_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.framework import load_library
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
@@ -27,6 +28,7 @@ from tensorflow.python.platform import test
 
 class DuplicateOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     library_filename = os.path.join(resource_loader.get_data_files_path(),
                                     'duplicate_op.so')
diff --git a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
index 9557e3099332a4fb73930758f5d9f817b6b9c11f..8c448194076ba72cc5f8efb66dbfd3d75bd7c7ef 100644
--- a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
@@ -25,6 +25,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -34,13 +35,14 @@ from tensorflow.python.platform import test
 
 class DynamicPartitionTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimpleOneDimensional(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant([0, 13, 2, 39, 4, 17], dtype=dtypes.float32)
       indices = constant_op.constant([0, 0, 2, 3, 2, 1])
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([0, 13], partition_vals[0])
@@ -54,15 +56,16 @@ class DynamicPartitionTest(test.TestCase):
     self.assertEqual([None], partitions[2].get_shape().as_list())
     self.assertEqual([None], partitions[3].get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testSimpleTwoDimensional(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
                                    [12, 13, 14], [15, 16, 17]],
                                   dtype=dtypes.float32)
       indices = constant_op.constant([0, 0, 2, 3, 2, 1])
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([[0, 1, 2], [3, 4, 5]], partition_vals[0])
@@ -82,12 +85,12 @@ class DynamicPartitionTest(test.TestCase):
     indices_list = [x % 2 for x in range(num)]
     part1 = [x for x in range(num) if x % 2 == 0]
     part2 = [x for x in range(num) if x % 2 == 1]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.float32)
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual(part1, partition_vals[0])
@@ -104,12 +107,12 @@ class DynamicPartitionTest(test.TestCase):
     parts = [[] for _ in range(num_partitions)]
     for i in range(rows):
       parts[(i ** 2) % num_partitions].append(data_list[i])
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.float32)
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=num_partitions)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(num_partitions, len(partition_vals))
     for i in range(num_partitions):
@@ -120,12 +123,12 @@ class DynamicPartitionTest(test.TestCase):
   def testSimpleComplex(self):
     data_list = [1 + 2j, 3 + 4j, 5 + 6j, 7 + 8j]
     indices_list = [1, 0, 1, 0]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.complex64)
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([3 + 4j, 7 + 8j], partition_vals[0])
@@ -133,12 +136,12 @@ class DynamicPartitionTest(test.TestCase):
 
   def testScalarPartitions(self):
     data_list = [10, 13, 12, 11]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.float64)
       indices = 3
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual(np.array([], dtype=np.float64).reshape(-1, 4),
@@ -151,9 +154,10 @@ class DynamicPartitionTest(test.TestCase):
                                  dtype=np.float64).reshape(-1, 4),
                         partition_vals[3])
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
     np.random.seed(7)
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for n in 2, 3:
         for shape in (4,), (4, 5), (4, 5, 2):
           partitions = np.random.randint(n, size=np.prod(shape)).reshape(shape)
@@ -164,7 +168,7 @@ class DynamicPartitionTest(test.TestCase):
             outputs = data_flow_ops.dynamic_partition(
                 data_t, partitions_t, num_partitions=n)
             self.assertEqual(n, len(outputs))
-            outputs_val = sess.run(outputs)
+            outputs_val = self.evaluate(outputs)
             for i, output in enumerate(outputs_val):
               self.assertAllEqual(output, data[partitions == i])
 
@@ -178,12 +182,12 @@ class DynamicPartitionTest(test.TestCase):
   def testEmptyParts(self):
     data_list = [1, 2, 3, 4]
     indices_list = [1, 3, 1, 3]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.float32)
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([], partition_vals[0])
@@ -194,12 +198,12 @@ class DynamicPartitionTest(test.TestCase):
   def testEmptyDataTwoDimensional(self):
     data_list = [[], []]
     indices_list = [0, 1]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.float32)
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=3)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(3, len(partition_vals))
     self.assertAllEqual([[]], partition_vals[0])
@@ -210,12 +214,12 @@ class DynamicPartitionTest(test.TestCase):
   def testEmptyPartitions(self):
     data_list = []
     indices_list = []
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.float32)
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([], partition_vals[0])
@@ -231,12 +235,12 @@ class DynamicPartitionTest(test.TestCase):
 
     data_list = [1, 2, 3, 4, 5, 6]
     indices_list = [6, 5, 4, 3, 1, 0]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.float32)
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([6], partition_vals[0])
@@ -252,12 +256,12 @@ class DynamicPartitionTest(test.TestCase):
 
     data_list = [1, 2, 3, 4, 5, 6]
     indices_list = [10, 11, 2, 12, 0, 1000]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.float32)
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=5)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(5, len(partition_vals))
     self.assertAllEqual([5], partition_vals[0])
@@ -276,17 +280,18 @@ class DynamicPartitionTest(test.TestCase):
 
     data_list = [1.1, 2.1, 3.1, 4.1, 5.1, 6.1]
     indices_list = [90, 70, 60, 100, 110, 40]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       data = constant_op.constant(data_list, dtype=dtypes.float32)
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=40)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(40, len(partition_vals))
     for i in range(40):
       self.assertAllEqual([], partition_vals[i])
 
+  @test_util.run_deprecated_v1
   def testErrorIndexOutOfRange(self):
     with self.cached_session() as sess:
       data = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
@@ -295,16 +300,18 @@ class DynamicPartitionTest(test.TestCase):
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
       with self.assertRaisesOpError(r"partitions\[2\] = 99 is not in \[0, 4\)"):
-        sess.run(partitions)
+        self.evaluate(partitions)
 
+  @test_util.run_deprecated_v1
   def testScalarIndexOutOfRange(self):
     with self.cached_session() as sess:
       bad = 17
       data = np.zeros(5)
       partitions = data_flow_ops.dynamic_partition(data, bad, num_partitions=7)
       with self.assertRaisesOpError(r"partitions = 17 is not in \[0, 7\)"):
-        sess.run(partitions)
+        self.evaluate(partitions)
 
+  @test_util.run_deprecated_v1
   def testHigherRankIndexOutOfRange(self):
     with self.cached_session() as sess:
       shape = (2, 3)
@@ -320,6 +327,7 @@ class DynamicPartitionTest(test.TestCase):
               r"partitions\[%d,%d\] = 17 is not in \[0, 7\)" % (i, j)):
             sess.run(partitions, feed_dict={indices: bad})
 
+  @test_util.run_deprecated_v1
   def testErrorWrongDimsIndices(self):
     data = constant_op.constant([[0], [1], [2]])
     indices = constant_op.constant([[0], [0]])
@@ -335,7 +343,7 @@ class DynamicPartitionTest(test.TestCase):
     self.assertEqual(len(inds), x.shape[0])
     partitioned = data_flow_ops.dynamic_partition(x, inds, 16)
     with self.cached_session() as sess:
-      res = sess.run(partitioned)
+      res = self.evaluate(partitioned)
     self.assertEqual(res[-1].shape[0], 192)
 
 
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index 3a1036e52a461b7f7448ef99876bbe0117038032..4f338880aa3564c4bf37102c7d01c8768ef07d58 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -22,8 +22,10 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import gradients_impl
 import tensorflow.python.ops.data_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -35,18 +37,19 @@ class DynamicStitchTestBase(object):
     self.stitch_op = stitch_op
 
   def testScalar(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [constant_op.constant(0), constant_op.constant(1)]
       data = [constant_op.constant(40), constant_op.constant(60)]
       for step in -1, 1:
         stitched_t = self.stitch_op(indices[::step], data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([40, 60][::step], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([2], stitched_t.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testShapeInferenceForScalarWithNonConstantIndices(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [
           array_ops.placeholder(dtype=dtypes.int32),
           constant_op.constant(1)
@@ -60,33 +63,40 @@ class DynamicStitchTestBase(object):
         self.assertEqual([None], stitched_t.get_shape().as_list())
 
   def testSimpleOneDimensional(self):
-    with self.test_session(use_gpu=True):
-      indices = [
-          constant_op.constant([0, 4, 7]),
-          constant_op.constant([1, 6, 2, 3, 5])
+    with test_util.use_gpu():
+      # Test various datatypes in the simple case to ensure that the op was
+      # registered under those types.
+      dtypes_to_test = [
+          dtypes.float32, dtypes.qint8, dtypes.quint8, dtypes.qint32
       ]
-      data = [
-          constant_op.constant([0, 40, 70]),
-          constant_op.constant([10, 60, 20, 30, 50])
-      ]
-      stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
-      self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
-      # Dimension 0 is max(flatten(indices))+1.
-      self.assertEqual([8], stitched_t.get_shape().as_list())
+      for dtype in dtypes_to_test:
+        indices = [
+            constant_op.constant([0, 4, 7]),
+            constant_op.constant([1, 6, 2, 3, 5])
+        ]
+        data = [
+            math_ops.cast(constant_op.constant([0, 40, 70]), dtype=dtype),
+            math_ops.cast(
+                constant_op.constant([10, 60, 20, 30, 50]), dtype=dtype)
+        ]
+        stitched_t = self.stitch_op(indices, data)
+        stitched_val = self.evaluate(stitched_t)
+        self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
+        # Dimension 0 is max(flatten(indices))+1.
+        self.assertEqual([8], stitched_t.get_shape().as_list())
 
   def testOneListOneDimensional(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [constant_op.constant([1, 6, 2, 3, 5, 0, 4, 7])]
       data = [constant_op.constant([10, 60, 20, 30, 50, 0, 40, 70])]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
       self.assertEqual([8], stitched_t.get_shape().as_list())
 
   def testSimpleTwoDimensional(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [
           constant_op.constant([0, 4, 7]),
           constant_op.constant([1, 6]),
@@ -98,14 +108,14 @@ class DynamicStitchTestBase(object):
           constant_op.constant([[20, 21], [30, 31], [50, 51]])
       ]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
                            [50, 51], [60, 61], [70, 71]], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
       self.assertEqual([8, 2], stitched_t.get_shape().as_list())
 
   def testZeroSizeTensor(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [
           constant_op.constant([0, 4, 7]),
           constant_op.constant([1, 6]),
@@ -119,14 +129,15 @@ class DynamicStitchTestBase(object):
           array_ops.zeros([0, 2], dtype=dtypes.int32)
       ]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
                            [50, 51], [60, 61], [70, 71]], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
       self.assertEqual([8, 2], stitched_t.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       indices = [
           constant_op.constant(6),
           constant_op.constant([4, 1]),
@@ -139,7 +150,7 @@ class DynamicStitchTestBase(object):
                                 [[1., 2.], [31., 32.]]])
       ]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       correct = 10. * np.arange(7)[:, None] + [1., 2.]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
@@ -149,8 +160,9 @@ class DynamicStitchTestBase(object):
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7. * datum.eval(), grad)
+        self.assertAllEqual(7. * self.evaluate(datum), grad)
 
+  @test_util.run_deprecated_v1
   def testErrorIndicesMultiDimensional(self):
     indices = [
         constant_op.constant([0, 4, 7]),
@@ -163,6 +175,7 @@ class DynamicStitchTestBase(object):
     with self.assertRaises(ValueError):
       self.stitch_op(indices, data)
 
+  @test_util.run_deprecated_v1
   def testErrorDataNumDimsMismatch(self):
     indices = [
         constant_op.constant([0, 4, 7]),
@@ -175,6 +188,7 @@ class DynamicStitchTestBase(object):
     with self.assertRaises(ValueError):
       self.stitch_op(indices, data)
 
+  @test_util.run_deprecated_v1
   def testErrorDataDimSizeMismatch(self):
     indices = [
         constant_op.constant([0, 4, 5]),
@@ -187,6 +201,7 @@ class DynamicStitchTestBase(object):
     with self.assertRaises(ValueError):
       self.stitch_op(indices, data)
 
+  @test_util.run_deprecated_v1
   def testErrorDataAndIndicesSizeMismatch(self):
     indices = [
         constant_op.constant([0, 4, 7]),
@@ -214,18 +229,19 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
     DynamicStitchTestBase.__init__(self, data_flow_ops.parallel_dynamic_stitch)
 
   def testScalar(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [constant_op.constant(0), constant_op.constant(1)]
       data = [constant_op.constant(40.0), constant_op.constant(60.0)]
       for step in -1, 1:
         stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([40.0, 60.0][::step], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([2], stitched_t.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       indices = [
           constant_op.constant(6),
           constant_op.constant([4, 1]),
@@ -238,7 +254,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
               [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
       ]
       stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
@@ -248,7 +264,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * datum.eval(), grad)
+        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
   # GPU version unit tests
   def testScalarGPU(self):
@@ -257,11 +273,12 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
       data = [constant_op.constant(40.0), constant_op.constant(60.0)]
       for step in -1, 1:
         stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([40.0, 60.0][::step], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([2], stitched_t.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testHigherRankGPU(self):
     with self.cached_session() as sess:
       indices = [
@@ -276,7 +293,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
               [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
       ]
       stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
@@ -286,7 +303,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * datum.eval(), grad)
+        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/edit_distance_op_test.py b/tensorflow/python/kernel_tests/edit_distance_op_test.py
index 12f85af7a53918bcf7228ae09645f735f4477c99..4a06ab770aaa072c8858e0f527f21dcbc10bbbdd 100644
--- a/tensorflow/python/kernel_tests/edit_distance_op_test.py
+++ b/tensorflow/python/kernel_tests/edit_distance_op_test.py
@@ -49,11 +49,11 @@ class EditDistanceTest(test.TestCase):
 
     if expected_err_re is None:
       self.assertEqual(edit_distance.get_shape(), expected_shape)
-      output = edit_distance.eval()
+      output = self.evaluate(edit_distance)
       self.assertAllClose(output, expected_output)
     else:
       with self.assertRaisesOpError(expected_err_re):
-        edit_distance.eval()
+        self.evaluate(edit_distance)
 
   def _testEditDistance(self,
                         hypothesis,
@@ -68,7 +68,7 @@ class EditDistanceTest(test.TestCase):
     ]
 
     # SparseTensorValue inputs.
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       # hypothesis and truth are (index, value, shape) tuples
       self._testEditDistanceST(
           hypothesis_st=sparse_tensor.SparseTensorValue(
@@ -81,7 +81,7 @@ class EditDistanceTest(test.TestCase):
           expected_err_re=expected_err_re)
 
     # SparseTensor inputs.
-    with ops.Graph().as_default() as g, self.test_session(g):
+    with ops.Graph().as_default() as g, self.session(g):
       # hypothesis and truth are (index, value, shape) tuples
       self._testEditDistanceST(
           hypothesis_st=sparse_tensor.SparseTensor(
diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py
index 40b8548cea628118b9483e6994a361cc4eeba165..3ea2071e13a24fb804924081add2f2b41f314716 100644
--- a/tensorflow/python/kernel_tests/embedding_ops_test.py
+++ b/tensorflow/python/kernel_tests/embedding_ops_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import embedding_ops
@@ -61,7 +62,7 @@ class ScatterAddSubTest(test.TestCase):
       scatter_op: ScatterAdd or ScatterSub.
     """
     super(ScatterAddSubTest, self).setUp()
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       # Create a random parameter array of given shape
       p_init = np.random.rand(*shape).astype("f")
       # Create the shape of the update array. All dimensions except the last
@@ -76,7 +77,7 @@ class ScatterAddSubTest(test.TestCase):
       # p = init
       variables.global_variables_initializer().run()
       # p += vals
-      result = p2.eval()
+      result = self.evaluate(p2)
     # Compute the expected 'p' using numpy operations.
     for i, ind in enumerate(indices):
       if scatter_op == state_ops.scatter_add:
@@ -87,16 +88,19 @@ class ScatterAddSubTest(test.TestCase):
             vals_shape[0], -1)[i, :])
     self.assertTrue(all((p_init == result).ravel()))
 
+  @test_util.run_deprecated_v1
   def testNoRepetitions(self):
     self._TestCase([2, 2], [1])
     self._TestCase([4, 4, 4], [2, 0])
     self._TestCase([43, 20, 10, 10], [42, 5, 6, 1, 3, 5, 7, 9])
 
+  @test_util.run_deprecated_v1
   def testWithRepetitions(self):
     self._TestCase([2, 2], [1, 1])
     self._TestCase([5, 3, 9, 5], [2, 0, 4, 1, 3, 1, 4, 0, 4, 3])
     self._TestCase([32, 4, 4], [31] * 8)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     # Random shapes of rank 4, random indices
     for _ in range(5):
@@ -104,6 +108,7 @@ class ScatterAddSubTest(test.TestCase):
       indices = np.random.randint(shape[0], size=2 * shape[0])
       self._TestCase(_AsLong(list(shape)), list(indices))
 
+  @test_util.run_deprecated_v1
   def testSubRandom(self):
     # Random shapes of rank 4, random indices
     for _ in range(5):
@@ -111,6 +116,7 @@ class ScatterAddSubTest(test.TestCase):
       indices = np.random.randint(shape[0], size=2 * shape[0])
       self._TestCase(_AsLong(list(shape)), list(indices), state_ops.scatter_sub)
 
+  @test_util.run_deprecated_v1
   def testWrongShape(self):
     # Indices and values mismatch.
     var = variables.Variable(
@@ -241,6 +247,7 @@ class EmbeddingLookupTest(test.TestCase):
   # both the ids are in the first shard, one of the resulting lookup
   # vector is going to be empty. The subsequent DivOp fails because of that.
   # TODO(keveman): Disabling the test until the underlying problem is fixed.
+  @test_util.run_deprecated_v1
   def testSimpleSharded(self):
     with self.cached_session():
       num_shards = 2
@@ -257,6 +264,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testMaxNorm(self):
     with self.cached_session():
       embeddings = constant_op.constant([[2.0]])
@@ -267,6 +275,7 @@ class EmbeddingLookupTest(test.TestCase):
 
       self.assertAllEqual(embedding.eval(), [[1.0]])
 
+  @test_util.run_deprecated_v1
   def testMaxNormNontrivial(self):
     with self.cached_session():
       embeddings = constant_op.constant([[2.0, 4.0], [3.0, 1.0]])
@@ -278,8 +287,9 @@ class EmbeddingLookupTest(test.TestCase):
       norms = math_ops.sqrt(
           math_ops.reduce_sum(embeddings * embeddings, axis=1))
       normalized = embeddings / array_ops.stack([norms, norms], axis=1)
-      self.assertAllEqual(embedding.eval(), 2 * normalized.eval())
+      self.assertAllEqual(embedding.eval(), 2 * self.evaluate(normalized))
 
+  @test_util.run_deprecated_v1
   def testSimpleShardedPartitionedVariable(self):
     with self.cached_session() as sess:
       num_shards = 2
@@ -294,7 +304,7 @@ class EmbeddingLookupTest(test.TestCase):
       variables.global_variables_initializer().run()
       params_values = [params[p_i.name] for p_i in p]
       # Test that the PartitionedVariable components equal the list in p
-      p_var_val = sess.run(list(p_variable))
+      p_var_val = self.evaluate(list(p_variable))
       # Actual test
       tf_result = embedding.eval(feed_dict=feed_dict)
     np_result, _, _ = _EmbeddingResult(params, id_vals, num_shards, vocab_size)
@@ -302,6 +312,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testSimpleShardedPartitionedResourceVariable(self):
     with self.cached_session() as sess:
       num_shards = 2
@@ -316,15 +327,16 @@ class EmbeddingLookupTest(test.TestCase):
       variables.global_variables_initializer().run()
       params_values = [params[p_i.name] for p_i in p]
       # Test that the PartitionedVariable components equal the list in p
-      p_var_val = sess.run(list(p_variable))
+      p_var_val = self.evaluate(list(p_variable))
       # Actual test
       print(ops.get_default_graph().as_graph_def())
-      tf_result = embedding.eval()
+      tf_result = self.evaluate(embedding)
     np_result, _, _ = _EmbeddingResult(params, id_vals, num_shards, vocab_size)
     self.assertAllEqual(params_values, p_var_val)
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedModPartitioningInt32Ids(self):
     with self.cached_session():
       num_shards = 5
@@ -347,6 +359,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedModPartitioningInt64Ids(self):
     with self.cached_session():
       num_shards = 5
@@ -369,6 +382,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedDivPartitioningInt32Ids(self):
     with self.cached_session():
       num_shards = 5
@@ -393,6 +407,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedDivPartitioningInt32IdsPartitionedVariable(self):
     with self.cached_session():
       num_shards = 5
@@ -418,6 +433,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedDivPartitioningInt64Ids(self):
     with self.cached_session():
       num_shards = 5
@@ -442,6 +458,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedDivPartitioningUnknownParamShape(self):
     with self.cached_session():
       num_shards = 5
@@ -468,6 +485,7 @@ class EmbeddingLookupTest(test.TestCase):
         params, id_vals, num_shards, vocab_size, partition_strategy="div")
     self.assertAllEqual(np_result, tf_result)
 
+  @test_util.run_deprecated_v1
   def testGradientsEmbeddingLookup(self):
     vocab_size = 9
     num_ids = 10
@@ -488,6 +506,7 @@ class EmbeddingLookupTest(test.TestCase):
               x, x_shape, y, y_shape, x_init_value=x_init_value)
         self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientsEmbeddingLookupWithComputedParams(self):
     vocab_size = 9
     num_ids = 5
@@ -526,6 +545,7 @@ class EmbeddingLookupTest(test.TestCase):
         ids = constant_op.constant([0, 1, 1, 17], dtype=dtypes.int32)
       embedding_ops.embedding_lookup(p, ids)
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
     np.random.seed(8)
     with self.cached_session():
@@ -546,6 +566,7 @@ class EmbeddingLookupTest(test.TestCase):
             sharded = embedding_ops.embedding_lookup(split_params, ids).eval()
             self.assertAllEqual(simple, sharded)
 
+  @test_util.run_deprecated_v1
   def testHigherRankMaxNorm(self):
     np.random.seed(8)
     with self.cached_session():
@@ -574,6 +595,7 @@ class EmbeddingLookupTest(test.TestCase):
                 split_params, ids, max_norm=1.0).eval()
             self.assertAllEqual(simple, sharded)
 
+  @test_util.run_deprecated_v1
   def testTransform(self):
     # This tests all combinations of:
     #   - ids rank 0, 1, >1
@@ -648,6 +670,7 @@ class EmbeddingLookupSparseTest(test.TestCase):
       index += num_val
     return grouped_vals
 
+  @test_util.run_deprecated_v1
   def testEmbeddingLookupSparse(self):
     vocab_size = 13
     batch_size = 10
@@ -706,6 +729,7 @@ class EmbeddingLookupSparseTest(test.TestCase):
         atol = rtol
         self.assertAllClose(np_embedding_sum, tf_embedding_sum, rtol, atol)
 
+  @test_util.run_deprecated_v1
   def testGradientsEmbeddingLookupSparse(self):
     vocab_size = 12
     batch_size = 4
@@ -733,6 +757,7 @@ class EmbeddingLookupSparseTest(test.TestCase):
             x, x_shape, y, y_shape, x_init_value=x_init_value)
       self.assertLess(err, 1e-5 if dtype == dtypes.float64 else 2e-3)
 
+  @test_util.run_deprecated_v1
   def testIncompatibleShapes(self):
     with self.cached_session():
       x, _, _ = _EmbeddingParams(1, 10, dtype=dtypes.float32)
@@ -758,11 +783,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
     assert num_shards > 0
     assert num_shards <= vocab_size
 
-    embedding_weights = partitioned_variables.create_partitioned_variables(
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32)
+    embedding_weights = list(variable_scope.get_variable(
+        name="embedding_weights",
         shape=[vocab_size, embed_dim],
-        slicing=[num_shards, 1],
-        initializer=init_ops.truncated_normal_initializer(
-            mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32))
+        partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
+        initializer=initializer))
     for w in embedding_weights:
       w.initializer.run()
     embedding_weights = [w.eval() for w in embedding_weights]
@@ -818,26 +845,31 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
 
     return sparse_ids, sparse_weights
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_return_zero_vector(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
           [(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) /
            3.0, [0] * 4, [0] * 4, embedding_weights[0][2], [0] * 4])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_return_special_vector(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights, default_id=3).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights,
+              default_id=3).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -845,13 +877,15 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
            3.0, embedding_weights[0][3], embedding_weights[0][3],
            embedding_weights[0][2], embedding_weights[0][3]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_no_weights(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, _ = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -859,13 +893,15 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
            [0] * 4, embedding_weights[0][2], (
                embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_partitioned(self):
     with self.cached_session():
       embedding_weights = self._random_weights(num_shards=3)
       sparse_ids, _ = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       embedding_weights = list(itertools.chain(*embedding_weights))
       self.assertAllClose(embedding_lookup_result,
@@ -873,6 +909,7 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
                            [0] * 4, [0] * 4, embedding_weights[2],
                            (embedding_weights[0] + embedding_weights[1]) / 2.0])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_partitioned_inconsistent_weights(self):
     with self.cached_session():
       embedding_weights = self._random_weights(num_shards=3)
@@ -888,26 +925,31 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
                         embedding_weights, sparse_ids, sparse_weights)
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_return_zero_vector(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights).eval())
 
       self.assertAllClose(embedding_lookup_result, [[
           (1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0,
           [0] * 4, [0] * 4
       ], [embedding_weights[0][2], [0] * 4, [0] * 4]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_return_special_vector(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights, default_id=3).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights,
+              default_id=3).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -917,13 +959,15 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
                 embedding_weights[0][3]
             ]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_no_weights(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, _ = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       self.assertAllClose(embedding_lookup_result, [[(
           embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4, [
@@ -933,13 +977,15 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
               (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4
           ]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_partitioned(self):
     with self.cached_session():
       embedding_weights = self._random_weights(num_shards=3)
       sparse_ids, _ = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       embedding_weights = list(itertools.chain(*embedding_weights))
       self.assertAllClose(embedding_lookup_result, [[
@@ -949,6 +995,7 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
           (embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4
       ]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_partitioned_inconsistent_weights(
       self):
     with self.cached_session():
@@ -968,8 +1015,9 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
 
 class DynamicStitchOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCint32Cpu(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       indices = [
           ops.convert_to_tensor([0, 1, 2]),
           ops.convert_to_tensor([2, 3])
@@ -981,8 +1029,9 @@ class DynamicStitchOpTest(test.TestCase):
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testCint32Gpu(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       indices = [
           ops.convert_to_tensor([0, 1, 2]),
           ops.convert_to_tensor([2, 3])
@@ -994,8 +1043,9 @@ class DynamicStitchOpTest(test.TestCase):
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testInt32Cpu(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       indices = [
           ops.convert_to_tensor([0, 1, 2]),
           ops.convert_to_tensor([2, 3])
@@ -1007,8 +1057,9 @@ class DynamicStitchOpTest(test.TestCase):
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testInt32Gpu(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       indices = [
           ops.convert_to_tensor([0, 1, 2]),
           ops.convert_to_tensor([2, 3])
@@ -1020,8 +1071,9 @@ class DynamicStitchOpTest(test.TestCase):
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testSumGradArgs(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       indices = [
           ops.convert_to_tensor([0, 1, 2, 3]),
           ops.convert_to_tensor([2, 3])
@@ -1034,6 +1086,7 @@ class DynamicStitchOpTest(test.TestCase):
           data_flow_ops.dynamic_stitch(indices, values).eval(), [2, 3, 1, 1])
 
   # We expect that the values are merged in order.
+  @test_util.run_deprecated_v1
   def testStitchOrder(self):
     with self.cached_session():
       indices = []
@@ -1049,8 +1102,9 @@ class DynamicStitchOpTest(test.TestCase):
 
 class ParallelDynamicStitchOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCint32Cpu(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       indices = [
           ops.convert_to_tensor([0, 1, 4, 6]),
           ops.convert_to_tensor([2, 3, 5])
@@ -1063,8 +1117,9 @@ class ParallelDynamicStitchOpTest(test.TestCase):
           data_flow_ops.parallel_dynamic_stitch(indices, values).eval(),
           [12, 23, 1, 2, 34, 3, 45])
 
+  @test_util.run_deprecated_v1
   def testInt32Cpu(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       indices = [
           ops.convert_to_tensor([0, 1, 5, 6, 7]),
           ops.convert_to_tensor([2, 4, 3])
@@ -1077,8 +1132,9 @@ class ParallelDynamicStitchOpTest(test.TestCase):
           data_flow_ops.parallel_dynamic_stitch(indices, values).eval(),
           [12, 23, 1, 2, 3, 34, 45, 56])
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       indices = [ops.convert_to_tensor([0, 1]), ops.convert_to_tensor([2, 3])]
       values = [ops.convert_to_tensor([2, 3]), ops.convert_to_tensor([1, 1])]
       self.assertAllEqual(
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
index 7d9d4e517527e457c0da73d4f4b2a8763359a693..7ba2dc6c20951d00994978790a26c17c59233d0a 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed as random_seed_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -78,6 +79,7 @@ class ExtractImagePatchesGradTest(test.TestCase):
       },
   ]
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     # Set graph seed for determinism.
     random_seed = 42
@@ -102,6 +104,7 @@ class ExtractImagePatchesGradTest(test.TestCase):
           print('extract_image_patches gradient err: %.4e' % err)
           self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testConstructGradientWithLargeImages(self):
     batch_size = 4
     height = 1024
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
index 6ea9f1badc3b8fac06fe6328f95714b93de97c0e..bb3c0ae80694035dd362f5024ecdddeb0e364bb0 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -43,7 +44,7 @@ class ExtractImagePatches(test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       out_tensor = array_ops.extract_image_patches(
           constant_op.constant(image),
           ksizes=ksizes,
@@ -51,7 +52,7 @@ class ExtractImagePatches(test.TestCase):
           rates=rates,
           padding=padding,
           name="im2col")
-      self.assertAllClose(patches, out_tensor.eval())
+      self.assertAllClose(patches, self.evaluate(out_tensor))
 
   def testKsize1x1Stride1x1Rate1x1(self):
     """Verifies that for 1x1 kernel the output equals the input."""
diff --git a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
index 64757a3e07713429a36aa600a265beda86b4bd80..88f7df8fbb64512c9ca362ec7c310a5805c9c728 100644
--- a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -45,14 +46,14 @@ class ExtractVolumePatches(test.TestCase):
     ksizes = [1] + ksizes + [1]
     strides = [1] + strides + [1]
 
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       out_tensor = array_ops.extract_volume_patches(
           constant_op.constant(image),
           ksizes=ksizes,
           strides=strides,
           padding=padding,
           name="im2col_3d")
-      self.assertAllClose(patches, out_tensor.eval())
+      self.assertAllClose(patches, self.evaluate(out_tensor))
 
   # pylint: disable=bad-whitespace
   def testKsize1x1x1Stride1x1x1(self):
diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py
index a5f8f64e0c23fe36eca6ddcdd4e184292720126f..0579dddb70264199a53c140ab60ad2ddf9b00bb9 100644
--- a/tensorflow/python/kernel_tests/fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/fifo_queue_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
+@test_util.run_v1_only("b/120545219")
 class FIFOQueueTest(test.TestCase):
 
   def testConstructor(self):
@@ -159,7 +160,7 @@ class FIFOQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [
           self.checkedThread(
@@ -191,7 +192,7 @@ class FIFOQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.append(sess.run(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in enqueue_ops]
       for thread in threads:
@@ -211,7 +212,7 @@ class FIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testDequeueHalf(self):
@@ -225,7 +226,7 @@ class FIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
@@ -240,13 +241,13 @@ class FIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         for enqueue_op in enqueue_ops:
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       results = []
 
       def dequeue():
         for _ in xrange(len(elems)):
-          results.append(sess.run(dequeued_t))
+          results.append(self.evaluate(dequeued_t))
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -269,7 +270,7 @@ class FIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        x_val, y_val = sess.run(dequeued_t)
+        x_val, y_val = self.evaluate(dequeued_t)
         x, y = elems[i]
         self.assertEqual([x], x_val)
         self.assertEqual([y], y_val)
@@ -288,9 +289,9 @@ class FIFOQueueTest(test.TestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual(1, size.eval())
+      self.assertEqual(1, self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual(0, size.eval())
+      self.assertEqual(0, self.evaluate(size))
 
   def testEnqueueMany(self):
     with self.cached_session():
@@ -302,7 +303,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i % 4]], vals)
 
   def testEmptyEnqueueMany(self):
@@ -313,9 +314,9 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue_many((empty_t,))
       size_t = q.size()
 
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
       enqueue_op.run()
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
 
   def testEmptyDequeueMany(self):
     with self.cached_session():
@@ -323,9 +324,9 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueUpTo(self):
     with self.cached_session():
@@ -333,9 +334,9 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_up_to(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueManyWithNoShape(self):
     with self.cached_session():
@@ -356,7 +357,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         self.assertEqual(float_elems[i % 4], float_val)
         self.assertAllEqual(int_elems[i % 4], int_val)
 
@@ -369,8 +370,8 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testDequeueUpToNoBlocking(self):
     with self.cached_session():
@@ -381,8 +382,8 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testMultiDequeueMany(self):
     with self.cached_session() as sess:
@@ -399,17 +400,17 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertEqual(float_val.shape, dequeued_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_t[1].get_shape())
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual(float_elems[8], float_val)
       self.assertAllEqual(int_elems[8], int_val)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
@@ -429,13 +430,13 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertEqual([None], dequeued_t[0].get_shape().as_list())
       self.assertEqual([None, 2], dequeued_t[1].get_shape().as_list())
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
@@ -518,7 +519,7 @@ class FIFOQueueTest(test.TestCase):
                                    r"Expected \[2,3,3\], got \[2,3,4\]"):
         sess.run([enqueue_op],
                  feed_dict={elems_bad: np.array([1] * 24).reshape((2, 3, 4))})
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testParallelEnqueueMany(self):
     with self.cached_session() as sess:
@@ -529,7 +530,7 @@ class FIFOQueueTest(test.TestCase):
 
       # Enqueue 100 items in parallel on 10 threads.
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       for thread in threads:
@@ -552,7 +553,7 @@ class FIFOQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -576,7 +577,7 @@ class FIFOQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -596,11 +597,11 @@ class FIFOQueueTest(test.TestCase):
 
       def enqueue():
         for _ in xrange(100):
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       def dequeue():
         for _ in xrange(100):
-          self.assertTrue(sess.run(dequeued_t) in (10.0, 20.0))
+          self.assertTrue(self.evaluate(dequeued_t) in (10.0, 20.0))
 
       enqueue_threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       dequeue_threads = [self.checkedThread(target=dequeue) for _ in range(10)]
@@ -632,7 +633,7 @@ class FIFOQueueTest(test.TestCase):
 
       def dequeue():
         for i in xrange(250):
-          self.assertEqual(i, sess.run(dequeued_t))
+          self.assertEqual(i, self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -663,7 +664,7 @@ class FIFOQueueTest(test.TestCase):
       dequeuemany_t = q.dequeue_many(count_placeholder)
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -672,7 +673,7 @@ class FIFOQueueTest(test.TestCase):
       while elements_dequeued < 250:
         # With equal probability, run Dequeue or dequeue_many.
         if random.random() > 0.5:
-          self.assertEqual(elements_dequeued, dequeued_t.eval())
+          self.assertEqual(elements_dequeued, self.evaluate(dequeued_t))
           elements_dequeued += 1
         else:
           count = random.randint(0, min(20, 250 - elements_dequeued))
@@ -701,10 +702,10 @@ class FIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -728,10 +729,10 @@ class FIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -778,12 +779,12 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
       close_op.run()
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
 
       # Expect the operation to fail due to the queue being closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                    "is closed and has insufficient"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testBlockingDequeueFromClosedQueue(self):
     with self.cached_session() as sess:
@@ -797,11 +798,11 @@ class FIFOQueueTest(test.TestCase):
 
       def dequeue():
         for elem in elems:
-          self.assertEqual([elem], sess.run(dequeued_t))
+          self.assertEqual([elem], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -821,7 +822,7 @@ class FIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -842,11 +843,11 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems, sess.run(dequeued_t))
+        self.assertAllEqual(elems, self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -867,11 +868,11 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -892,8 +893,8 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
-        self.assertAllEqual(elems[3:], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
+        self.assertAllEqual(elems[3:], self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -913,16 +914,16 @@ class FIFOQueueTest(test.TestCase):
       cleanup_dequeue_t = q.dequeue()
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        self.assertAllEqual(elems[0:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[0:3], self.evaluate(dequeued_t))
         with self.assertRaises(errors_impl.OutOfRangeError):
-          sess.run(dequeued_t)
-        self.assertEqual(elems[3], sess.run(cleanup_dequeue_t))
+          self.evaluate(dequeued_t)
+        self.assertEqual(elems[3], self.evaluate(cleanup_dequeue_t))
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -955,7 +956,7 @@ class FIFOQueueTest(test.TestCase):
 
       def dequeue():
         with self.assertRaises(errors_impl.OutOfRangeError):
-          sess.run([dequeued_a_t, dequeued_b_t])
+          self.evaluate([dequeued_a_t, dequeued_b_t])
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -968,7 +969,7 @@ class FIFOQueueTest(test.TestCase):
       # Test that the elements in the partially-dequeued batch are
       # restored in the correct order.
       for elem_a, elem_b in zip(elems_a, elems_b):
-        val_a, val_b = sess.run([cleanup_dequeue_a_t, cleanup_dequeue_b_t])
+        val_a, val_b = self.evaluate([cleanup_dequeue_a_t, cleanup_dequeue_b_t])
         self.assertEqual(elem_a, val_a)
         self.assertEqual(elem_b, val_b)
       self.assertEqual(0, q.size().eval())
@@ -983,7 +984,7 @@ class FIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1003,7 +1004,7 @@ class FIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1051,7 +1052,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1059,8 +1060,8 @@ class FIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
-      self.assertEqual([50.0], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
       thread.join()
 
   def testBlockingEnqueueManyToFullQueue(self):
@@ -1074,7 +1075,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1082,10 +1083,10 @@ class FIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
         time.sleep(0.01)
-      self.assertEqual([50.0], dequeued_t.eval())
-      self.assertEqual([60.0], dequeued_t.eval())
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
+      self.assertEqual([60.0], self.evaluate(dequeued_t))
 
       # Make sure the thread finishes before exiting.
       thread.join()
@@ -1103,7 +1104,7 @@ class FIFOQueueTest(test.TestCase):
 
       def blocking_enqueue():
         # Expect the operation to succeed once the dequeue op runs.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1113,18 +1114,18 @@ class FIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
 
       for elem in [20.0, 30.0, 40.0, 50.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
       self.assertEqual(0, q.size().eval())
 
   def testBlockingEnqueueManyBeforeClose(self):
@@ -1138,7 +1139,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1148,17 +1149,17 @@ class FIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
       for elem in [20.0, 30.0, 50.0, 60.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
 
   def testDoesNotLoseValue(self):
     with self.cached_session():
@@ -1266,19 +1267,19 @@ class FIFOQueueTest(test.TestCase):
 
   def _blockingDequeue(self, sess, dequeue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_op)
+      self.evaluate(dequeue_op)
 
   def _blockingDequeueMany(self, sess, dequeue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_many_op)
+      self.evaluate(dequeue_many_op)
 
   def _blockingEnqueue(self, sess, enqueue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
 
   def _blockingEnqueueMany(self, sess, enqueue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_many_op)
+      self.evaluate(enqueue_many_op)
 
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
@@ -1321,7 +1322,7 @@ class FIFOQueueTest(test.TestCase):
       def blocking_enqueue():
         enq_done.append(False)
         # This will fill the queue and then block until enough dequeues happen.
-        sess.run(enq)
+        self.evaluate(enq)
         enq_done.append(True)
 
       thread = self.checkedThread(target=blocking_enqueue)
@@ -1331,14 +1332,14 @@ class FIFOQueueTest(test.TestCase):
       results = []
       results.append(deq.eval())  # Will only complete after the enqueue starts.
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       for _ in range(3):
         results.append(deq.eval())
 
       time.sleep(0.1)
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       # This dequeue will unblock the thread.
       results.append(deq.eval())
@@ -1364,7 +1365,7 @@ class FIFOQueueTest(test.TestCase):
 
       def blocking_dequeue():
         # Will only complete after 4 enqueues complete.
-        results.extend(sess.run(deq))
+        results.extend(self.evaluate(deq))
 
       thread = self.checkedThread(target=blocking_dequeue)
       thread.start()
@@ -1373,7 +1374,7 @@ class FIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         self.assertEqual(len(results), 0)
-        sess.run(enq)
+        self.evaluate(enq)
 
       # Enough enqueued to unblock the dequeue
       thread.join()
@@ -1405,7 +1406,7 @@ class FIFOQueueTest(test.TestCase):
       q.enqueue_many(input_tuple).run()
 
       output_tuple_t = q.dequeue_many(32)
-      output_tuple = sess.run(output_tuple_t)
+      output_tuple = self.evaluate(output_tuple_t)
 
       for (input_elem, output_elem) in zip(input_tuple, output_tuple):
         self.assertAllEqual(input_elem, output_elem)
@@ -1423,6 +1424,7 @@ class FIFOQueueTest(test.TestCase):
         session.run([a, c])
 
 
+@test_util.run_v1_only("b/120545219")
 class FIFOQueueDictTest(test.TestCase):
 
   def testConstructor(self):
@@ -1507,10 +1509,10 @@ class FIFOQueueDictTest(test.TestCase):
       enqueue_op4 = q.enqueue_many({"f": [40.0, 50.0]})
       dequeue = q.dequeue()
       dequeue_2 = q.dequeue_many(2)
-      sess.run(enqueue_op)
-      sess.run(enqueue_op2)
-      sess.run(enqueue_op3)
-      sess.run(enqueue_op4)
+      self.evaluate(enqueue_op)
+      self.evaluate(enqueue_op2)
+      self.evaluate(enqueue_op3)
+      self.evaluate(enqueue_op4)
       f = sess.run(dequeue["f"])
       self.assertEqual(10.0, f)
       f = sess.run(dequeue_2["f"])
@@ -1565,10 +1567,10 @@ class FIFOQueueDictTest(test.TestCase):
       })
       dequeue = q.dequeue()
       dequeue_2 = q.dequeue_many(2)
-      sess.run(enqueue_op)
-      sess.run(enqueue_op2)
-      sess.run(enqueue_op3)
-      sess.run(enqueue_op4)
+      self.evaluate(enqueue_op)
+      self.evaluate(enqueue_op2)
+      self.evaluate(enqueue_op3)
+      self.evaluate(enqueue_op4)
       i, f, s = sess.run([dequeue["i"], dequeue["f"], dequeue["s"]])
       self.assertEqual(123, i)
       self.assertEqual(10.0, f)
@@ -1583,10 +1585,11 @@ class FIFOQueueDictTest(test.TestCase):
       self.assertTrue([compat.as_bytes("dd"), compat.as_bytes("ee")], list(s))
 
 
+@test_util.run_v1_only("b/120545219")
 class FIFOQueueWithTimeoutTest(test.TestCase):
 
   def testDequeueWithTimeout(self):
-    with self.test_session(
+    with self.session(
         config=config_pb2.ConfigProto(operation_timeout_in_ms=20)) as sess:
       q = data_flow_ops.FIFOQueue(10, dtypes_lib.float32)
       self.assertEqual(
@@ -1597,7 +1600,7 @@ class FIFOQueueWithTimeoutTest(test.TestCase):
       # until operation_timeout_in_ms.
       with self.assertRaisesRegexp(errors_impl.DeadlineExceededError,
                                    "Timed out waiting for notification"):
-        sess.run(dequeued_t)
+        self.evaluate(dequeued_t)
 
   def testReusableAfterTimeout(self):
     with self.cached_session() as sess:
@@ -1613,10 +1616,11 @@ class FIFOQueueWithTimeoutTest(test.TestCase):
                                    "Timed out waiting for notification"):
         sess.run(dequeued_t, options=config_pb2.RunOptions(timeout_in_ms=10))
 
-      sess.run(enqueue_op)
-      self.assertEqual(37, sess.run(dequeued_t))
+      self.evaluate(enqueue_op)
+      self.assertEqual(37, self.evaluate(dequeued_t))
 
 
+@test_util.run_v1_only("b/120545219")
 class QueueContainerTest(test.TestCase):
 
   def testContainer(self):
diff --git a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
index f89d2062f1e736068a50344234b05aad423a17e7..0d5928aefacf5a395c0f1c61ef997914aca000e8 100644
--- a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -37,7 +38,6 @@ class FractionalAvgTest(test.TestCase):
   # Random number generate with seed.
   _PRNG = np.random.RandomState(341261000)
   _SEED = 341261001
-  _SEED2 = 341261002
 
   def _AvgPoolAlongRows(self, input_matrix, row_seq, overlapping):
     """Perform average pool along row of a 2-D matrix based on row_seq.
@@ -128,15 +128,13 @@ class FractionalAvgTest(test.TestCase):
       None
     """
     with self.cached_session() as sess:
-      p, r, c = nn_ops.fractional_avg_pool(
+      p, r, c = nn_ops.fractional_avg_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
-      actual, row_seq, col_seq = sess.run([p, r, c])
+          seed=self._SEED)
+      actual, row_seq, col_seq = self.evaluate([p, r, c])
       expected = self._GetExpectedFractionalAvgPoolResult(input_tensor, row_seq,
                                                           col_seq, overlapping)
       self.assertShapeEqual(expected, p)
@@ -161,15 +159,13 @@ class FractionalAvgTest(test.TestCase):
       rand_mat = self._PRNG.randint(10, size=tensor_shape)
       pooling_ratio = [1, math.sqrt(2), math.sqrt(2), 1]
       with self.cached_session() as sess:
-        p, r, c = nn_ops.fractional_avg_pool(
+        p, r, c = nn_ops.fractional_avg_pool_v2(
             rand_mat.astype(np.float32),
             pooling_ratio,
             pseudo_random,
             overlapping,
-            deterministic=True,
-            seed=self._SEED,
-            seed2=self._SEED2)
-        tensor_output, row_seq, col_seq = sess.run([p, r, c])
+            seed=self._SEED)
+        tensor_output, row_seq, col_seq = self.evaluate([p, r, c])
         expected_result = self._GetExpectedFractionalAvgPoolResult(
             rand_mat.astype(np.float32), row_seq, col_seq, overlapping)
         print("row sequence:")
@@ -214,12 +210,6 @@ class FractionalAvgTest(test.TestCase):
 
   def testIntegerTensorInput(self):
     """Test FractionalAvgPool works fine when input tensor is integer type.
-
-    I would have used _ValidateFractionalAvgPoolResult function to automate this
-    process, however, there's rounding issue. It is caused by numpy.mean cast
-    integer input to numpy.float64 for intermediate use. While for
-    fractional_avg_pool, the mean operation is integer division (trucated).  So,
-    for this test case, I will hard code a simple matrix.
     """
     pseudo_random = True
     overlapping = True
@@ -234,29 +224,9 @@ class FractionalAvgTest(test.TestCase):
         [4, 4, 5, 9, 7, 2]
     ])
     # pyformat: enable
-    with self.cached_session() as sess:
-      # Since deterministic = True, seed and seed2 are fixed. Therefore r, and c
-      # are the same each time. We can have an expected result precomputed.
-      # r = [0, 2, 4, 6]
-      # c = [0, 1, 3, 4, 6]
-
-      # pyformat: disable
-      expected = np.array([
-          [6, 5, 3, 5],
-          [5, 5, 4, 5],
-          [5, 4, 7, 5]
-      ]).reshape((1, 3, 4, 1))
-      # pyformat: enable
-      p, unused_r, unused_c = nn_ops.fractional_avg_pool(
-          mat.reshape(tensor_shape), [1, math.sqrt(3), math.sqrt(2), 1],
-          pseudo_random,
-          overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
-      actual = sess.run(p)
-      self.assertShapeEqual(expected, p)
-      self.assertAllClose(expected, actual)
+    self._ValidateFractionalAvgPoolResult(mat.reshape(tensor_shape),
+                                          [1, math.sqrt(3), math.sqrt(2), 1],
+                                          pseudo_random, overlapping)
 
   def testDifferentTensorShapes(self):
     """Test different shapes of input tensor.
@@ -312,6 +282,7 @@ class FractionalAvgTest(test.TestCase):
     self._ValidateFractionalAvgPoolResult(rand_mat, [1, 2, 2, 1], pseudo_random,
                                           overlapping)
 
+  @test_util.run_deprecated_v1
   def testDifferentInputTensorShape(self):
     """Runs the operation in one session with different input tensor shapes."""
     with self.cached_session() as sess:
@@ -320,14 +291,12 @@ class FractionalAvgTest(test.TestCase):
       pooling_ratio = [1, 1.5, 1.5, 1]
       pseudo_random = False
       overlapping = False
-      p, r, c = nn_ops.fractional_avg_pool(
+      p, r, c = nn_ops.fractional_avg_pool_v2(
           input_holder,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # First run.
       input_a = np.zeros([3, 32, 32, 3])
       actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_a})
@@ -372,7 +341,6 @@ class FractionalAvgPoolGradTest(test.TestCase):
   """
   _PRNG = np.random.RandomState(341261004)
   _SEED = 341261005
-  _SEED2 = 341261006
 
   def _GenerateRandomInputTensor(self, shape):
     num_elements = 1
@@ -398,7 +366,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.avg_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               num_elements = 1
               for dim_size in output_data.shape:
                 num_elements *= dim_size
@@ -407,7 +375,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               input_backprop_tensor = gen_nn_ops.avg_pool_grad(
                   input_tensor.get_shape(), output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows + 1, row_window_size))
               col_seq = list(range(0, num_cols + 1, col_window_size))
               fap_input_backprop_tensor = gen_nn_ops.fractional_avg_pool_grad(
@@ -416,7 +384,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=False)
-              fap_input_backprop = fap_input_backprop_tensor.eval()
+              fap_input_backprop = self.evaluate(fap_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fap_input_backprop_tensor)
               self.assertAllClose(input_backprop, fap_input_backprop)
 
@@ -437,7 +405,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.avg_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               num_elements = 1
               for dim_size in output_data.shape:
                 num_elements *= dim_size
@@ -446,7 +414,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               input_backprop_tensor = gen_nn_ops.avg_pool_grad(
                   input_tensor.get_shape(), output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows, row_window_size - 1))
               col_seq = list(range(0, num_cols, col_window_size - 1))
               row_seq[-1] += 1
@@ -457,10 +425,11 @@ class FractionalAvgPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=True)
-              fap_input_backprop = fap_input_backprop_tensor.eval()
+              fap_input_backprop = self.evaluate(fap_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fap_input_backprop_tensor)
               self.assertAllClose(input_backprop, fap_input_backprop)
 
+  @test_util.run_deprecated_v1
   def testAllInputOptionsThroughGradientError(self):
     input_shape = (1, 7, 13, 1)
     input_data = self._GenerateRandomInputTensor(input_shape)
@@ -470,15 +439,13 @@ class FractionalAvgPoolGradTest(test.TestCase):
       for overlapping in True, False:
         with self.cached_session() as _:
           input_tensor = constant_op.constant(input_data, shape=input_shape)
-          output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool(
+          output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool_v2(
               input_tensor,
               pooling_ratio,
               pseudo_random=pseudo_random,
               overlapping=overlapping,
-              deterministic=True,
-              seed=self._SEED,
-              seed2=self._SEED2)
-          output_data = output_tensor.eval()
+              seed=self._SEED)
+          output_data = self.evaluate(output_tensor)
           output_shape = output_data.shape
           # error_margin and delta setting is similar to avg_pool_grad.
           error_margin = 1e-4
@@ -491,6 +458,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               delta=1e-2)
           self.assertLess(gradient_error, error_margin)
 
+  @test_util.run_deprecated_v1
   def testDifferentTensorShapesThroughGradientError(self):
     pseudo_random = True
     overlapping = True
@@ -503,15 +471,13 @@ class FractionalAvgPoolGradTest(test.TestCase):
             input_data = self._GenerateRandomInputTensor(input_shape)
             with self.cached_session() as _:
               input_tensor = constant_op.constant(input_data, shape=input_shape)
-              output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool(
+              output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool_v2(
                   input_tensor,
                   pooling_ratio,
                   pseudo_random=pseudo_random,
                   overlapping=overlapping,
-                  deterministic=True,
-                  seed=self._SEED,
-                  seed2=self._SEED2)
-              output_data = output_tensor.eval()
+                  seed=self._SEED)
+              output_data = self.evaluate(output_tensor)
               output_shape = output_data.shape
               # error_margin and delta setting is similar to avg_pool_grad.
               error_margin = 1e-4
@@ -524,6 +490,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
                   delta=1e-2)
               self.assertLess(gradient_error, error_margin)
 
+  @test_util.run_deprecated_v1
   def testLargePoolingRatioThroughGradientError(self):
     input_shape = (1, 17, 23, 1)
     input_data = self._GenerateRandomInputTensor(input_shape)
@@ -534,14 +501,12 @@ class FractionalAvgPoolGradTest(test.TestCase):
 
     with self.cached_session() as _:
       input_tensor = constant_op.constant(input_data, shape=input_shape)
-      output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool(
+      output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random=pseudo_random,
           overlapping=overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # error_margin and delta setting is similar to avg_pool_grad.
       error_margin = 1e-4
       gradient_error = gradient_checker.compute_gradient_error(
diff --git a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
index 9b94ca85547590600306bf8aef2caa0f3c3eac8e..fa886cc215a7d814bbc13cb3be0c8712100f81d7 100644
--- a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -37,7 +38,6 @@ class FractionalMaxPoolTest(test.TestCase):
   # Random number generate with seed.
   _PRNG = np.random.RandomState(341261)
   _SEED = 123456
-  _SEED2 = 654321
 
   def _MaxPoolAlongRows(self, input_matrix, row_seq, overlapping):
     """Perform max pool along row of a 2-D matrix based on row_seq.
@@ -128,15 +128,13 @@ class FractionalMaxPoolTest(test.TestCase):
       None
     """
     with self.cached_session() as sess:
-      p, r, c = nn_ops.fractional_max_pool(
+      p, r, c = nn_ops.fractional_max_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
-      actual, row_seq, col_seq = sess.run([p, r, c])
+          seed=self._SEED)
+      actual, row_seq, col_seq = self.evaluate([p, r, c])
       expected = self._GetExpectedFractionalMaxPoolResult(input_tensor, row_seq,
                                                           col_seq, overlapping)
       self.assertShapeEqual(expected, p)
@@ -161,15 +159,13 @@ class FractionalMaxPoolTest(test.TestCase):
       rand_mat = self._PRNG.randint(10, size=tensor_shape)
       pooling_ratio = [1, math.sqrt(2), math.sqrt(2), 1]
       with self.cached_session() as sess:
-        p, r, c = nn_ops.fractional_max_pool(
+        p, r, c = nn_ops.fractional_max_pool_v2(
             rand_mat,
             pooling_ratio,
             pseudo_random,
             overlapping,
-            deterministic=True,
-            seed=self._SEED,
-            seed2=self._SEED2)
-        tensor_output, row_seq, col_seq = sess.run([p, r, c])
+            seed=self._SEED)
+        tensor_output, row_seq, col_seq = self.evaluate([p, r, c])
         expected_result = self._GetExpectedFractionalMaxPoolResult(rand_mat,
                                                                    row_seq,
                                                                    col_seq,
@@ -283,6 +279,7 @@ class FractionalMaxPoolTest(test.TestCase):
     self._ValidateFractionalMaxPoolResult(rand_mat, [1, 2, 2, 1], pseudo_random,
                                           overlapping)
 
+  @test_util.run_deprecated_v1
   def testDifferentInputTensorShape(self):
     """Runs the operation in one session with different input tensor shapes."""
     with self.cached_session() as sess:
@@ -291,14 +288,12 @@ class FractionalMaxPoolTest(test.TestCase):
       pooling_ratio = [1, 1.5, 1.5, 1]
       pseudo_random = False
       overlapping = False
-      p, r, c = nn_ops.fractional_max_pool(
+      p, r, c = nn_ops.fractional_max_pool_v2(
           input_holder,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # First run.
       input_a = np.zeros([3, 32, 32, 3])
       actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_a})
@@ -344,7 +339,6 @@ class FractionalMaxPoolGradTest(test.TestCase):
 
   _PRNG = np.random.RandomState(341261)
   _SEED = 123456
-  _SEED2 = 654321
 
   def _GenerateUniqueRandomInputTensor(self, shape):
     """Generate 'unqiue' random input tensor.
@@ -382,12 +376,12 @@ class FractionalMaxPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.max_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               output_backprop = self._PRNG.randint(100, size=output_data.shape)
               input_backprop_tensor = gen_nn_ops.max_pool_grad(
                   input_tensor, output_tensor, output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows + 1, row_window_size))
               col_seq = list(range(0, num_cols + 1, col_window_size))
               fmp_input_backprop_tensor = gen_nn_ops.fractional_max_pool_grad(
@@ -397,7 +391,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=False)
-              fmp_input_backprop = fmp_input_backprop_tensor.eval()
+              fmp_input_backprop = self.evaluate(fmp_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fmp_input_backprop_tensor)
               self.assertAllClose(input_backprop, fmp_input_backprop)
 
@@ -417,12 +411,12 @@ class FractionalMaxPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.max_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               output_backprop = self._PRNG.randint(100, size=output_data.shape)
               input_backprop_tensor = gen_nn_ops.max_pool_grad(
                   input_tensor, output_tensor, output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows, row_window_size - 1))
               col_seq = list(range(0, num_cols, col_window_size - 1))
               row_seq[-1] += 1
@@ -434,10 +428,11 @@ class FractionalMaxPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=True)
-              fmp_input_backprop = fmp_input_backprop_tensor.eval()
+              fmp_input_backprop = self.evaluate(fmp_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fmp_input_backprop_tensor)
               self.assertAllClose(input_backprop, fmp_input_backprop)
 
+  @test_util.run_deprecated_v1
   def testAllInputOptionsThroughGradientError(self):
     input_shape = (1, 7, 13, 1)
     input_data = self._GenerateUniqueRandomInputTensor(input_shape)
@@ -449,15 +444,13 @@ class FractionalMaxPoolGradTest(test.TestCase):
       for overlapping in True, False:
         with self.cached_session() as _:
           input_tensor = constant_op.constant(input_data, shape=input_shape)
-          output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool(
+          output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool_v2(
               input_tensor,
               pooling_ratio,
               pseudo_random=pseudo_random,
               overlapping=overlapping,
-              deterministic=True,
-              seed=self._SEED,
-              seed2=self._SEED2)
-          output_data = output_tensor.eval()
+              seed=self._SEED)
+          output_data = self.evaluate(output_tensor)
           output_shape = output_data.shape
           # error_margin and delta setting is similar to max_pool_grad.
           error_margin = 1e-3
@@ -470,6 +463,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
               delta=1e-2)
           self.assertLess(gradient_error, error_margin)
 
+  @test_util.run_deprecated_v1
   def testDifferentTensorShapesThroughGradientError(self):
     pseudo_random = True
     overlapping = True
@@ -484,15 +478,13 @@ class FractionalMaxPoolGradTest(test.TestCase):
             input_data += self._PRNG.random_sample(input_shape)
             with self.cached_session() as _:
               input_tensor = constant_op.constant(input_data, shape=input_shape)
-              output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool(
+              output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool_v2(
                   input_tensor,
                   pooling_ratio,
                   pseudo_random=pseudo_random,
                   overlapping=overlapping,
-                  deterministic=True,
-                  seed=self._SEED,
-                  seed2=self._SEED2)
-              output_data = output_tensor.eval()
+                  seed=self._SEED)
+              output_data = self.evaluate(output_tensor)
               output_shape = output_data.shape
               # error_margin and delta setting is similar to max_pool_grad.
               error_margin = 1e-3
@@ -505,6 +497,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
                   delta=1e-2)
               self.assertLess(gradient_error, error_margin)
 
+  @test_util.run_deprecated_v1
   def testLargePoolingRatioThroughGradientError(self):
     input_shape = (1, 17, 23, 1)
     input_data = self._GenerateUniqueRandomInputTensor(input_shape)
@@ -517,14 +510,12 @@ class FractionalMaxPoolGradTest(test.TestCase):
 
     with self.cached_session() as _:
       input_tensor = constant_op.constant(input_data, shape=input_shape)
-      output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool(
+      output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random=pseudo_random,
           overlapping=overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # error_margin and delta setting is similar to max_pool_grad.
       error_margin = 1e-3
       gradient_error = gradient_checker.compute_gradient_error(
@@ -592,7 +583,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
           row_seq,
           col_seq,
           overlapping=False)
-      input_backprop_not_overlapping = r.eval()
+      input_backprop_not_overlapping = self.evaluate(r)
       self.assertShapeEqual(
           np.reshape(expected_input_backprop_not_overlapping, input_size), r)
       self.assertAllClose(expected_input_backprop_not_overlapping,
@@ -602,7 +593,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
           output_data_overlapping, shape=output_size)
       r = gen_nn_ops.fractional_max_pool_grad(
           input_tensor, output_tensor, grad, row_seq, col_seq, overlapping=True)
-      input_backprop_overlapping = r.eval()
+      input_backprop_overlapping = self.evaluate(r)
       self.assertShapeEqual(
           np.reshape(expected_input_backprop_overlapping, input_size), r)
       self.assertAllClose(expected_input_backprop_overlapping,
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 30d11852c7d8c82f1853e43b64ff078c7485f8ef..95ee454614e6edb633b981e9173b2035550259c3 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -56,6 +56,7 @@ def simple_scoped_fn(a, x):
     return math_ops.multiply(math_ops.add(a, x), two)
 
 
+@test_util.with_control_flow_v2
 class FunctionalOpsTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -100,6 +101,7 @@ class FunctionalOpsTest(test.TestCase):
                              (elems, other_elems), initializer)
     self.assertAllEqual([1.0, 2.0, 3.0], self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testFoldl_Scoped(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -152,6 +154,7 @@ class FunctionalOpsTest(test.TestCase):
                              initializer)
     self.assertAllEqual(1, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testFoldr_Scoped(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -172,6 +175,7 @@ class FunctionalOpsTest(test.TestCase):
         self.assertAllEqual(1282, self.evaluate(r))
 
   # pylint: disable=unnecessary-lambda
+  @test_util.run_deprecated_v1
   def testFold_Grad(self):
     with self.cached_session():
       elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
@@ -213,6 +217,7 @@ class FunctionalOpsTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "not a scalar"):
       functional_ops.map_fn(lambda x: x, 1)
 
+  @test_util.run_deprecated_v1
   def testMap_Scoped(self):
     with self.cached_session() as sess:
 
@@ -244,6 +249,7 @@ class FunctionalOpsTest(test.TestCase):
         self.assertEqual(len(variables.trainable_variables()), 1)
         self.assertAllEqual(doubles, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testMap_Grad(self):
     with self.cached_session():
       param = constant_op.constant(2.0)
@@ -380,6 +386,7 @@ class FunctionalOpsTest(test.TestCase):
         ValueError, "two structures don't have the same nested structure"):
       functional_ops.scan(lambda a, x: (a, -a), elems, initializer)
 
+  @test_util.run_deprecated_v1
   def testScan_Scoped(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -424,6 +431,7 @@ class FunctionalOpsTest(test.TestCase):
     #   t_1 == 1, b == 4.5,       y == 0.5, returns b * y * x = 9
     self.assertAllClose([1., 1., 2.25, 9.], self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testScan_Control(self):
     with self.cached_session() as sess:
       s = array_ops.placeholder(dtypes.float32, shape=[None])
@@ -435,6 +443,7 @@ class FunctionalOpsTest(test.TestCase):
           np.array([1.0, 3.0, 9.0]), sess.run(c, {s: [1, 3, 3],
                                                   b: True}))
 
+  @test_util.run_deprecated_v1
   def testScan_Grad(self):
     with self.cached_session():
       elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
@@ -447,6 +456,7 @@ class FunctionalOpsTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllEqual(873.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testScanGradientWithPartStopGradient(self):
     a = variables.Variable(0.0, name="a")
     b = variables.Variable(0.0, name="b")
@@ -457,7 +467,7 @@ class FunctionalOpsTest(test.TestCase):
     grad = gradients_impl.gradients(ys=[loss], xs=[a, b])
     with self.test_session(use_gpu=True) as sess:
       variables.global_variables_initializer().run()
-      sess.run(grad)
+      self.evaluate(grad)
 
   @test_util.run_in_graph_and_eager_modes
   def testFoldShape(self):
@@ -476,12 +486,15 @@ class FunctionalOpsTest(test.TestCase):
     y = functional_ops.map_fn(lambda e: e, x)
     self.assertAllEqual(y.get_shape(), self.evaluate(y).shape)
 
+  @test_util.run_deprecated_v1
   def testMapUnknownShape(self):
     x = array_ops.placeholder(dtypes.float32)
     y = functional_ops.map_fn(lambda e: e, x)
     self.assertIs(None, y.get_shape().dims)
 
+  @test_util.disable_control_flow_v2("b/119323354")
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testMapEmptyScalar(self):
     map_return = functional_ops.map_fn(lambda x: 1, constant_op.constant([]))
     self.assertAllEqual([0], map_return.get_shape().dims)
@@ -489,6 +502,8 @@ class FunctionalOpsTest(test.TestCase):
 
   # TODO(akshayka): this test fails in eager: the iterable is of length 0 so
   # so the body of the while loop never executes
+  @test_util.disable_control_flow_v2("b/119323354")
+  @test_util.run_v1_only("b/120545219")
   def testMapEmptyTensor(self):
     with self.cached_session():
       map_return = functional_ops.map_fn(lambda x: array_ops.zeros([3, 2]),
@@ -509,6 +524,7 @@ class FunctionalOpsTest(test.TestCase):
 
   # TODO(akshayka): this test fails in eager: the iterable is of length 0 so
   # so the body of the while loop never executes
+  @test_util.run_deprecated_v1
   def testScanEmptyTensor(self):
     with self.cached_session():
       x = functional_ops.scan(
@@ -516,6 +532,7 @@ class FunctionalOpsTest(test.TestCase):
       self.assertAllEqual([0, 2, 4], x.get_shape())
       self.assertAllEqual(x.get_shape(), self.evaluate(x).shape)
 
+  @test_util.run_deprecated_v1
   def testScanUnknownShape(self):
     x = array_ops.placeholder(dtypes.float32)
     initializer = array_ops.placeholder(dtypes.float32)
@@ -526,6 +543,7 @@ class FunctionalOpsTest(test.TestCase):
     y = functional_ops.scan(fn, x, initializer=initializer)
     self.assertIs(None, y.get_shape().dims)
 
+  @test_util.run_deprecated_v1
   def testScanVaryingShape(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtype=dtypes.float32, shape=[None, 2])
@@ -542,6 +560,7 @@ class FunctionalOpsTest(test.TestCase):
       sess.run([result, result_t, result_grad, result_t_grad],
                feed_dict={x: [[1.0, 2.0]]})
 
+  @test_util.run_deprecated_v1
   def testRemoteFunction(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 2
@@ -564,10 +583,11 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:worker/replica:0/task:0/cpu:1")
 
     with session.Session(worker[0].target) as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
+  @test_util.run_deprecated_v1
   def testRemoteFunctionDirectSession(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 2
@@ -588,10 +608,11 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:localhost/replica:0/task:0/cpu:1")
 
     with self.test_session(config=worker_config) as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
+  @test_util.run_deprecated_v1
   def testRemoteFunctionSameDeviceDirectSession(self):
 
     @function.Defun(dtypes.int32, dtypes.int32)
@@ -607,8 +628,8 @@ class FunctionalOpsTest(test.TestCase):
           args=[a, b], Tout=[dtypes.int32], f=_remote_fn, target="/cpu:0")
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
   def testRemoteFunctionCPUGPU(self):
@@ -631,8 +652,8 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:localhost/replica:0/task:0/device:GPU:0")[0] + 3.0
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9.0)
 
   def testRemoteFunctionGPUCPU(self):
@@ -655,8 +676,8 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:localhost/replica:0/task:0/cpu:0")[0] + 3.0
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9.0)
 
   def testRemoteFunctionGPUCPUStrings(self):
@@ -674,9 +695,10 @@ class FunctionalOpsTest(test.TestCase):
           args=[a], Tout=[dtypes.string], f=_remote_fn, target="/cpu:0")
 
     with self.cached_session() as sess:
-      ret = sess.run(remote_op)
+      ret = self.evaluate(remote_op)
       self.assertAllEqual(ret, [b"a"])
 
+  @test_util.run_deprecated_v1
   def testRemoteFunctionCrossProcess(self):
     workers, _ = test_util.create_local_cluster(2, 1)
 
@@ -696,10 +718,11 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:worker/replica:0/task:1/cpu:0")[0] + 3.0
 
     with session.Session(workers[0].target) as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9)
 
+  @test_util.run_deprecated_v1
   def testIf(self):
 
     @function.Defun(dtypes.float32)
@@ -739,6 +762,7 @@ class FunctionalOpsTest(test.TestCase):
           self.assertAllEqual(Run(sess, 20.), 210.)
           self.assertAllEqual(Run(sess, 100.), 5050.)
 
+  @test_util.run_deprecated_v1
   def testWhileLowering(self):
 
     def Run(n, fetch_by_name):
@@ -766,13 +790,14 @@ class FunctionalOpsTest(test.TestCase):
           else:
             fetch = "my_while:1"
         with self.session(graph=g, use_gpu=use_gpu) as sess:
-          return sess.run(fetch)
+          return self.evaluate(fetch)
 
     self.assertAllEqual(Run(20., False), 210.)
     self.assertAllEqual(Run(20., True), 210.)
     self.assertAllEqual(Run(100., False), 5050.)
     self.assertAllEqual(Run(100., True), 5050.)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileError(self):
     for use_gpu in (True, False):
       with ops.Graph().as_default() as g:
@@ -829,6 +854,49 @@ class FunctionalOpsTest(test.TestCase):
           self.assertAllEqual(5050.,
                               sess.run([result, c], feed_dict={n: 100.})[0])
 
+  # pylint: disable=cell-var-from-loop
+  def testWhileCapturedInputs(self):
+    for use_gpu in (True, False):
+      with ops.Graph().as_default() as g:
+        v = variables.Variable(1.0)
+
+        def TestCond(n, *args):
+          del args
+          return n < 10
+
+        @function.Defun(*[dtypes.float32] * 2)
+        def TestUnary(n, x):
+          return math_ops.add(n, 1), x + n + v
+
+        @function.Defun(*[dtypes.float32] * 3)
+        def TestBinary(n, x, x2):
+          return math_ops.add(n, 1), x + n + v, x2 + v
+
+        with self.session(graph=g, use_gpu=use_gpu) as sess:
+          result_unary = functional_ops.While(
+              [1.0, 0.],
+              function.Defun(*[dtypes.float32] * 2)(TestCond), TestUnary)
+          result_binary = functional_ops.While(
+              [1.0, 0., 0.],
+              function.Defun(*[dtypes.float32] * 3)(TestCond), TestBinary)
+          self.evaluate(variables.global_variables_initializer())
+          assert len(result_unary) == 2
+          self.assertEqual([10.0, 54.0], self.evaluate(result_unary))
+          assert len(result_binary) == 3
+          self.assertEqual([10.0, 54.0, 9.0], self.evaluate(result_binary))
+
+          def TestCondCapture(n, *args):
+            del args
+            return math_ops.to_float(n) + v < 10
+
+          with self.assertRaises(ValueError):
+            _ = functional_ops.While(
+                [1],
+                function.Defun(dtypes.int32)(TestCondCapture),
+                function.Defun(dtypes.int32, dtypes.float32)(TestUnary))
+
+  # pylint: enable=cell-var-from-loop
+
   def _tfSum(self, use_gpu, rewrite_with_while):
     with ops.Graph().as_default() as g:
       with self.session(graph=g, use_gpu=use_gpu) as sess:
@@ -846,7 +914,7 @@ class FunctionalOpsTest(test.TestCase):
                 100, 0, -1, [0.], Body, rewrite_with_while=rewrite_with_while)
             [0],
         ]
-        xvals = sess.run(xs)
+        xvals = self.evaluate(xs)
       self.assertAllEqual(210, xvals[0])
       self.assertAllEqual(5050, xvals[1])
 
@@ -876,6 +944,7 @@ class FunctionalOpsTest(test.TestCase):
     self.assertTrue("TestBody_Cond" in names)
     self.assertTrue("TestBody_Body" in names)
 
+  @test_util.run_deprecated_v1
   def testForCapturedInputs(self):
     v = variables.Variable(1.0)
 
@@ -903,16 +972,16 @@ class FunctionalOpsTest(test.TestCase):
         result_binary = functional_ops.For(
             1, 10, 1, [0., 0.], TestBinary,
             rewrite_with_while=rewrite_with_while)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         assert not result_nullary
         # The nullary variant doesn't return anything so we can't easily run it.
         # As a total hack, fetch the operation by name and run it.
         sess.run(ops.get_default_graph().get_operation_by_name(
             "While" if rewrite_with_while else "For"))
         assert len(result_unary) == 1
-        self.assertEqual([54.0], sess.run(result_unary))
+        self.assertEqual([54.0], self.evaluate(result_unary))
         assert len(result_binary) == 2
-        self.assertEqual([54.0, 9.0], sess.run(result_binary))
+        self.assertEqual([54.0, 9.0], self.evaluate(result_binary))
 
   def _tfMLP(self, xval, wsval, bsval, rewrite_with_while):
     # On GPU, don't rewrite using a while loop.
@@ -931,7 +1000,7 @@ class FunctionalOpsTest(test.TestCase):
           MLP,
           rewrite_with_while=rewrite_with_while)[0]
 
-      return ret.eval()
+      return self.evaluate(ret)
 
   def _npMLP(self, xval, wsval, bsval):
     for i in range(wsval.shape[0]):
@@ -950,12 +1019,15 @@ class FunctionalOpsTest(test.TestCase):
     tf_for_ans = self._tfMLP(xval, wsval, bsval, rewrite_with_while)
     self.assertAllClose(np_ans, tf_for_ans)
 
+  @test_util.run_deprecated_v1
   def testForMLP(self):
     self._testForMLP(False)
 
+  @test_util.run_deprecated_v1
   def testForMLPWhile(self):
     self._testForMLP(True)
 
+  @test_util.run_v1_only("b/120545219")
   def testForError(self):
 
     @function.Defun(dtypes.int32, dtypes.float32)
@@ -978,6 +1050,7 @@ class FunctionalOpsTest(test.TestCase):
           "For loop body returned 2 arguments. Expected: 1"):
         functional_ops.For(0, 10, 1, [0.0], ReturnsTooManyArgs)[0].eval()
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
 
     @function.Defun(dtypes.float32)
@@ -995,14 +1068,15 @@ class FunctionalOpsTest(test.TestCase):
       avals = [Poly(a), Grad(a)]
       b = constant_op.constant(1.)
       bvals = [Poly(b), Grad(b)]
-      self.assertAllEqual(sess.run(avals), [8., 4.])
-      self.assertAllEqual(sess.run(bvals), [17., 16.])
+      self.assertAllEqual(self.evaluate(avals), [8., 4.])
+      self.assertAllEqual(self.evaluate(bvals), [17., 16.])
 
 
 # TODO(akshayka): Replace `function.Defun` with tf.contrib.eager.defun` in the
 # below test cases.
 class PartitionedCallTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicSingleDevice(self):
 
     @function.Defun(*[dtypes.float32] * 2)
@@ -1018,6 +1092,7 @@ class PartitionedCallTest(test.TestCase):
                   constant_op.constant(2.)], f=Body))
     self.assertEqual(output, 6.)
 
+  @test_util.run_deprecated_v1
   def testBasicMultiDevice(self):
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
 
@@ -1061,6 +1136,7 @@ class PartitionedCallTest(test.TestCase):
                   constant_op.constant(2.)], f=Body))
     self.assertEqual(output, 6.)
 
+  @test_util.run_deprecated_v1
   def testBasicNoDeviceAnnotations(self):
 
     @function.Defun(*[dtypes.float32] * 2)
@@ -1075,6 +1151,7 @@ class PartitionedCallTest(test.TestCase):
                   constant_op.constant(2.)], f=Body))
     self.assertEqual(output, 6.)
 
+  @test_util.run_deprecated_v1
   def testShardsRunOnRequestedDevices(self):
     config = config_pb2.ConfigProto(device_count={"CPU": 4})
 
@@ -1104,6 +1181,7 @@ class PartitionedCallTest(test.TestCase):
     self.assertIn(compat.as_bytes("CPU:1"), outputs[1])
     self.assertIn(compat.as_bytes("CPU:2"), outputs[2])
 
+  @test_util.run_deprecated_v1
   def testAssignAddResourceVariable(self):
 
     v = resource_variable_ops.ResourceVariable(1.0)
@@ -1147,13 +1225,28 @@ class PartitionedCallTest(test.TestCase):
             allow_soft_placement=False,
             log_device_placement=True,
             device_count={"CPU": 2})) as sess:
-      sess.run(variables.global_variables_initializer())
-      expected = sess.run(sum_gather())
+      self.evaluate(variables.global_variables_initializer())
+      expected = self.evaluate(sum_gather())
       result = sess.run(
           functional_ops.partitioned_call(
               args=defined.captured_inputs, f=defined))
       self.assertAllEqual(expected, result)
 
+  # Use an invalid executor name to test the plumbing of the executor_type attr.
+  @test_util.run_v1_only("b/120545219")
+  def testExecutorTypeAttrExecutorNotFound(self):
+    @function.Defun(dtypes.int32)
+    def AddFive(x):
+      return x + 5
+
+    op = functional_ops.partitioned_call(
+        args=[constant_op.constant([1, 2, 3], dtype=dtypes.int32)],
+        f=AddFive,
+        executor_type="NON_EXISTENT_EXECUTOR")
+    with self.assertRaisesRegexp(errors.NotFoundError,
+                                 "NON_EXISTENT_EXECUTOR"):
+      self.evaluate(op)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index c0b419e1d13405d04c34fb642cec15760ddcf50f..320ffc9674bd2e0ce601084ab8fc375c4cbdc3e2 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -26,6 +26,8 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import variables
@@ -35,11 +37,11 @@ from tensorflow.python.platform import test
 class GatherNdTest(test.TestCase):
 
   def _testSimpleDtype(self, dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       params = constant_op.constant(np.array([8, 1, 2, 3, 7, 5], dtype=dtype))
       indices = constant_op.constant([[4], [4], [0]])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertAllEqual(np.array([7, 7, 8], dtype=dtype), gather_nd_val)
     self.assertEqual([3], gather_nd_t.get_shape())
@@ -53,26 +55,27 @@ class GatherNdTest(test.TestCase):
     self._testSimpleDtype(np.complex128)
     self._testSimpleDtype("|S")  # byte strings in python2 + 3
 
+  @test_util.run_deprecated_v1
   def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = np.ones((3, 3), dtype=np.float32)
 
       indices_empty = np.empty((0, 2), dtype=np.int32)
       gather_nd_ok_t = array_ops.gather_nd(params, indices_empty)
-      gather_nd_ok_val = gather_nd_ok_t.eval()
+      gather_nd_ok_val = self.evaluate(gather_nd_ok_t)
       self.assertEqual([0], gather_nd_ok_t.get_shape())
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
 
       indices_empty = np.empty((0, 1), dtype=np.int32)
       gather_nd_ok_t = array_ops.gather_nd(params, indices_empty)
-      gather_nd_ok_val = gather_nd_ok_t.eval()
+      gather_nd_ok_val = self.evaluate(gather_nd_ok_t)
       self.assertEqual([0, 3], gather_nd_ok_t.get_shape())
       self.assertAllClose(np.empty((0, 3), dtype=np.float32), gather_nd_ok_val)
 
       params_empty = np.empty((0, 3), dtype=np.float32)
       indices_empty = np.empty((0, 2), dtype=np.int32)
       gather_nd_ok_t = array_ops.gather_nd(params_empty, indices_empty)
-      gather_nd_ok_val = gather_nd_ok_t.eval()
+      gather_nd_ok_val = self.evaluate(gather_nd_ok_t)
       self.assertEqual([0], gather_nd_ok_t.get_shape())
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
 
@@ -81,42 +84,42 @@ class GatherNdTest(test.TestCase):
       gather_nd_break_t = array_ops.gather_nd(params_empty, indices_nonempty)
       with self.assertRaisesOpError(
           r"Requested more than 0 entries, but params is empty."):
-        gather_nd_break_t.eval()
+        self.evaluate(gather_nd_break_t)
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
 
   def testIndexScalar(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = np.array(
           [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
       indices = constant_op.constant([4, 1])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
       self.assertEqual([], gather_nd_t.get_shape())
       self.assertAllEqual(np.array(7), gather_nd_val)
 
   def testParamsRankLargerThanIndexIndexScalarSlices(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = np.array(
           [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
       indices = constant_op.constant([4])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
       self.assertEqual([2], gather_nd_t.get_shape())
       self.assertAllEqual(np.array([-7, 7]), gather_nd_val)
 
   def testParamsRankLargerThanIndexSlices(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = np.array(
           [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
       indices = constant_op.constant([[4], [4], [0]])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([3, 2], gather_nd_t.get_shape())
     self.assertAllEqual(np.array([[-7, 7], [-7, 7], [-8, 8]]), gather_nd_val)
 
   def testHigherRankParamsLargerThanIndexSlices(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = np.array(
           [[[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]],
            [[-80, -10, -20, -30, -70, -50], [80, 10, 20, 30, 70, 50]]],
@@ -124,13 +127,13 @@ class GatherNdTest(test.TestCase):
       params_t = constant_op.constant(params)
       indices = constant_op.constant([[4], [4], [0]])
       gather_nd_t = array_ops.gather_nd(params_t, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([3, 2, 2], gather_nd_t.get_shape())
     self.assertAllEqual(params[[4, 4, 0]], gather_nd_val)
 
   def testEmptyIndicesLastRankMeansCopyEntireTensor(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = np.array(
           [[[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]],
            [[-80, -10, -20, -30, -70, -50], [80, 10, 20, 30, 70, 50]]],
@@ -139,7 +142,7 @@ class GatherNdTest(test.TestCase):
       indices = constant_op.constant(
           [[], []], dtype=dtypes.int32)  # Size (2, 0)
       gather_nd_t = array_ops.gather_nd(params_t, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([2, 6, 2, 2], gather_nd_t.get_shape())
     self.assertAllEqual(
@@ -147,7 +150,7 @@ class GatherNdTest(test.TestCase):
         gather_nd_val)
 
   def testHigherRankParamsAndIndicesLargerThanIndexSlices(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = np.array(
           [[[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]],
            [[-80, -10, -20, -30, -70, -50], [80, 10, 20, 30, 70, 50]]],
@@ -155,32 +158,32 @@ class GatherNdTest(test.TestCase):
       params_t = constant_op.constant(params)
       indices = constant_op.constant([[[3], [2], [1]], [[4], [4], [0]]])
       gather_nd_t = array_ops.gather_nd(params_t, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([2, 3, 2, 2], gather_nd_t.get_shape())
     self.assertAllEqual(params[[3, 2, 1, 4, 4, 0]].reshape(2, 3, 2, 2),
                         gather_nd_val)
 
   def testHigherRankParams(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = (10, 20, 5, 1, 17)
       params = np.random.rand(*shape)
       indices = np.vstack([np.random.randint(0, s, size=2000) for s in shape]).T
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     expected = params[tuple(indices.T)]
     self.assertAllEqual(expected, gather_nd_val)
     self.assertEqual([2000], gather_nd_t.get_shape())
 
   def testHigherRankParamsAndIndices(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = (10, 20, 5, 1, 17)
       params = np.random.rand(*shape)
       indices = np.vstack([np.random.randint(0, s, size=2000) for s in shape]).T
       indices_reshaped = indices.reshape([10, 10, 20, 5])
       gather_nd_t = array_ops.gather_nd(params, indices_reshaped)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     expected = params[tuple(indices.T)]
     self.assertAllEqual(expected.reshape([10, 10, 20]), gather_nd_val)
@@ -189,58 +192,62 @@ class GatherNdTest(test.TestCase):
   def assertIndexedSlices(self, t):
     self.assertIsInstance(t, ops.IndexedSlices)
 
+  @test_util.run_deprecated_v1
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = array_ops.placeholder(dtypes.int32)
     gather_nd_t = array_ops.gather_nd(params, indices)
     shape = gather_nd_t.get_shape()
     self.assertEqual(None, shape.ndims)
-    self.assertEqual(None, shape[0].value)
+    self.assertEqual(None, tensor_shape.dimension_value(shape[0]))
 
+  @test_util.run_deprecated_v1
   def testBadIndicesCPU(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       params = [0, 1, 2]
       indices = [[[0], [7]]]  # Make this one higher rank
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
   def _disabledTestBadIndicesGPU(self):
     # TODO disabled due to different behavior on GPU and CPU
     # On GPU the bad indices do not raise error but fetch 0 values
     if not test.is_gpu_available():
       return
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = [0, 1, 2]
       indices = [[[0], [7]]]  # Make this one higher rank
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
+  @test_util.run_deprecated_v1
   def testBadIndicesWithSlicesCPU(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       params = [[0, 1, 2]]
       indices = [[[0], [0], [1]]]  # Make this one higher rank
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
   def _disabledTestBadIndicesWithSlicesGPU(self):
     # TODO disabled due to different behavior on GPU and CPU
     # On GPU the bad indices do not raise error but fetch 0 values
     if not test.is_gpu_available():
       return
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = [[0, 1, 2]]
       indices = [[[0], [0], [1]]]  # Make this one higher rank
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
+  @test_util.run_deprecated_v1
   def testGradientsRank2Elements(self):
     indices = constant_op.constant([[0, 0], [1, 1]], dtype=dtypes.int32)
     inputs = constant_op.constant([[1, 2], [3, 4]], dtype=dtypes.float64)
@@ -249,9 +256,10 @@ class GatherNdTest(test.TestCase):
     grad_vals = constant_op.constant([1, 2], dtype=dtypes.float64)
     grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
     expected_grads = np.array([[1, 0], [0, 2]], dtype=np.float64)
-    with self.test_session(use_gpu=True):
-      assert np.array_equal(expected_grads, grads.eval())
+    with self.session(use_gpu=True):
+      assert np.array_equal(expected_grads, self.evaluate(grads))
 
+  @test_util.run_deprecated_v1
   def testGradientsRank2Slices(self):
     indices = constant_op.constant([[1], [0]], dtype=dtypes.int32)
     inputs = constant_op.constant([[1, 2], [3, 4]], dtype=dtypes.float64)
@@ -260,10 +268,11 @@ class GatherNdTest(test.TestCase):
     grad_vals = constant_op.constant([[1, 2], [3, 4]], dtype=dtypes.float64)
     grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
     expected_grads = np.array([[3, 4], [1, 2]], dtype=np.float64)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertIndexedSlices(grads)
       self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
 
+  @test_util.run_deprecated_v1
   def testGradientsRank3Elements(self):
     indices = constant_op.constant(
         [[[0, 1], [1, 0]], [[0, 0], [1, 1]]], dtype=dtypes.int32)
@@ -276,9 +285,10 @@ class GatherNdTest(test.TestCase):
     grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
     expected_grads = np.array(
         [[[5, 6], [1, 2]], [[3, 4], [7, 8]]], dtype=np.float64)
-    with self.test_session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+    with self.session(use_gpu=True):
+      self.assertAllEqual(expected_grads, self.evaluate(grads))
 
+  @test_util.run_deprecated_v1
   def testGradientsRank7Elements(self):
     # Shape [1,1,2,1,1,2,2]
     indices = constant_op.constant(
@@ -305,9 +315,10 @@ class GatherNdTest(test.TestCase):
             [[[[5, 6], [1, 2]]]],
             [[[[3, 4], [7, 8]]]]
         ]]], dtype=np.float64)
-    with self.test_session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+    with self.session(use_gpu=True):
+      self.assertAllEqual(expected_grads, self.evaluate(grads))
 
+  @test_util.run_deprecated_v1
   def testGradientsInt64Indices(self):
     indices = constant_op.constant(
         [[[0, 1], [1, 0]], [[0, 0], [1, 1]]], dtype=dtypes.int64)
@@ -320,9 +331,10 @@ class GatherNdTest(test.TestCase):
     grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
     expected_grads = np.array(
         [[[5, 6], [1, 2]], [[3, 4], [7, 8]]], dtype=np.float64)
-    with self.test_session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+    with self.session(use_gpu=True):
+      self.assertAllEqual(expected_grads, self.evaluate(grads))
 
+  @test_util.run_deprecated_v1
   def testGradientsRank2SlicesWithEmptySpace(self):
     indices = constant_op.constant([[2], [0], [5]], dtype=dtypes.int32)
     inputs = constant_op.constant(
@@ -341,7 +353,7 @@ class GatherNdTest(test.TestCase):
          [1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0, 0, 0, 0, 0], [3, 3, 3, 3, 3, 3, 3, 3, 3]],
         dtype=np.float64)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertIndexedSlices(grads)
       self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
 
@@ -360,10 +372,10 @@ class GatherNdOpBenchmark(test.Benchmark):
       gather_op = array_ops.gather_nd(t_params, t_indices)
       variables.global_variables_initializer().run()
       for _ in range(10):
-        gather_op.eval()
+        self.evaluate(gather_op)
       t1 = time.time()
       for _ in range(1000):
-        gather_op.eval()
+        self.evaluate(gather_op)
       t2 = time.time()
       self.report_benchmark(iters=1000, wall_time=(t2 - t1) / 1000.0)
 
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index 85bf969068ef41c13e386572862c014bba83e28b..fc86068c3fc08d1ad01ba8dfa9bb4c5bc6c429f2 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.platform import test
@@ -42,7 +43,7 @@ class GatherTest(test.TestCase):
     return data
 
   def testScalar1D(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       data = np.array([0, 1, 2, 3, 7, 5])
       for dtype in _TEST_TYPES:
         for indices in 4, [1, 2, 2, 4, 5]:
@@ -50,13 +51,13 @@ class GatherTest(test.TestCase):
           params = constant_op.constant(params_np)
           indices_tf = constant_op.constant(indices)
           gather_t = array_ops.gather(params, indices_tf)
-          gather_val = gather_t.eval()
+          gather_val = self.evaluate(gather_t)
           np_val = params_np[indices]
           self.assertAllEqual(np_val, gather_val)
           self.assertEqual(np_val.shape, gather_t.get_shape())
 
   def testScalar2D(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8],
                        [9, 10, 11], [12, 13, 14]])
       for dtype in _TEST_TYPES:
@@ -65,13 +66,13 @@ class GatherTest(test.TestCase):
           params = constant_op.constant(params_np)
           indices = constant_op.constant(2)
           gather_t = array_ops.gather(params, indices, axis=axis)
-          gather_val = gather_t.eval()
+          gather_val = self.evaluate(gather_t)
           self.assertAllEqual(np.take(params_np, 2, axis=axis), gather_val)
           expected_shape = data.shape[:axis] + data.shape[axis + 1:]
           self.assertEqual(expected_shape, gather_t.get_shape())
 
   def testSimpleTwoD32(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       data = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8],
                        [9, 10, 11], [12, 13, 14]])
       for dtype in _TEST_TYPES:
@@ -81,12 +82,13 @@ class GatherTest(test.TestCase):
           # The indices must be in bounds for any axis.
           indices = constant_op.constant([0, 1, 0, 2])
           gather_t = array_ops.gather(params, indices, axis=axis)
-          gather_val = gather_t.eval()
+          gather_val = self.evaluate(gather_t)
           self.assertAllEqual(np.take(params_np, [0, 1, 0, 2], axis=axis),
                               gather_val)
           expected_shape = data.shape[:axis] + (4,) + data.shape[axis + 1:]
           self.assertEqual(expected_shape, gather_t.get_shape())
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
     # We check that scalar and empty indices shapes work as well
     shape = (2, 1, 3, 2)
@@ -95,7 +97,7 @@ class GatherTest(test.TestCase):
         for axis in range(len(shape)):
           params = self._buildParams(np.random.randn(*shape), dtype)
           indices = np.random.randint(shape[axis], size=indices_shape)
-          with self.test_session(use_gpu=True) as sess:
+          with self.cached_session(use_gpu=True) as sess:
             tf_params = constant_op.constant(params)
             tf_indices = constant_op.constant(indices)
             # Check that both positive and negative indices for axis work.
@@ -142,9 +144,13 @@ class GatherTest(test.TestCase):
               source_slice = ((slice(None),) * outer_dims + (source_index,) +
                               (slice(None),) * inner_dims)
               correct_params_grad[dest_slice] += gather_grad[source_slice]
-            self.assertAllClose(correct_params_grad, params_grad.eval(),
-                                atol=2e-6, rtol=2e-6)
+            self.assertAllClose(
+                correct_params_grad,
+                self.evaluate(params_grad),
+                atol=2e-6,
+                rtol=2e-6)
 
+  @test_util.run_deprecated_v1
   def testString(self):
     params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]])
     with self.cached_session():
@@ -153,6 +159,7 @@ class GatherTest(test.TestCase):
       self.assertAllEqual([b"asdf", b"qwer"],
                           array_ops.gather(params, 0, axis=1).eval())
 
+  @test_util.run_deprecated_v1
   def testUInt32AndUInt64(self):
     for unsigned_type in (dtypes.uint32, dtypes.uint64):
       params = self._buildParams(
@@ -162,12 +169,14 @@ class GatherTest(test.TestCase):
                             array_ops.gather(params, 1, axis=0).eval())
         self.assertAllEqual([1, 7], array_ops.gather(params, 0, axis=1).eval())
 
+  @test_util.run_deprecated_v1
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = array_ops.placeholder(dtypes.int32)
     gather_t = array_ops.gather(params, indices)
     self.assertEqual(None, gather_t.get_shape())
 
+  @test_util.run_deprecated_v1
   def testUnknownAxis(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = constant_op.constant([[0, 0], [0, 0]])
@@ -182,7 +191,7 @@ class GatherTest(test.TestCase):
     self.assertEqual(None, gather_t.shape)
 
   def testBadIndicesCPU(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       params = [[0, 1, 2], [3, 4, 5]]
       with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"):
         array_ops.gather(params, [[7]], axis=0).eval()
@@ -194,15 +203,16 @@ class GatherTest(test.TestCase):
     # On GPU the bad indices do not raise error but fetch 0 values
     if not test.is_gpu_available():
       return
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = [[0, 1, 2], [3, 4, 5]]
       with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 2\)"):
         array_ops.gather(params, [[7]], axis=0).eval()
       with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"):
         array_ops.gather(params, [[7]], axis=1).eval()
 
+  @test_util.run_deprecated_v1
   def testBadAxis(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       params = [0, 1, 2]
       params_ph = array_ops.placeholder(dtypes.int32)
       indices = 0
@@ -217,8 +227,9 @@ class GatherTest(test.TestCase):
           array_ops.gather(params_ph, indices, axis=bad_axis).eval(
               feed_dict={params_ph: params})
 
+  @test_util.run_deprecated_v1
   def testEmptySlices(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in _TEST_TYPES:
         for itype in np.int32, np.int64:
           # Leading axis gather.
diff --git a/tensorflow/python/kernel_tests/gradient_correctness_test.py b/tensorflow/python/kernel_tests/gradient_correctness_test.py
index 291a69ebac6625ea9b50a54d2e0e28083b463d85..0148de5047afe6144433d69beb03e066ae395865 100644
--- a/tensorflow/python/kernel_tests/gradient_correctness_test.py
+++ b/tensorflow/python/kernel_tests/gradient_correctness_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -29,37 +30,42 @@ from tensorflow.python.platform import test
 
 class GradientCorrectnessTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testMultipleOutputChainedGradients(self):
     with self.cached_session() as sess:
       x = constant_op.constant(1.0, dtype=dtypes.float32)
       yexp = math_ops.exp(x)
       yexplog = math_ops.log(yexp)
       grads = gradients_impl.gradients([yexp, yexplog], [x])
-      grad_vals = sess.run(grads)
+      grad_vals = self.evaluate(grads)
       exp1_plus_one = (1.0 + np.exp(1.0)).astype(np.float32)
       # [dexp(x)/dx + d(log(exp(x)))/dx] @ x=1 == exp(1) + 1
       self.assertAllClose(grad_vals[0], exp1_plus_one)
 
+  @test_util.run_deprecated_v1
   def testIdentityGradient(self):
     x = constant_op.constant(3.)
     dx_dx, = gradients_impl.gradients(x, x)
     with self.cached_session() as sess:
-      self.assertAllClose(1., sess.run(dx_dx))
+      self.assertAllClose(1., self.evaluate(dx_dx))
 
+  @test_util.run_deprecated_v1
   def testIntegerIdentityGradient(self):
     x = constant_op.constant(3)
     dx_dx, = gradients_impl.gradients(x, x)
     with self.cached_session() as sess:
-      self.assertAllClose(1, sess.run(dx_dx))
+      self.assertAllClose(1, self.evaluate(dx_dx))
 
+  @test_util.run_deprecated_v1
   def testGradientWithIntegerPath(self):
     x = constant_op.constant([3.9, 4.1])
     k = math_ops.to_float(math_ops.to_int32(x))
     y = x * k
     dy_dx, = gradients_impl.gradients(y, x)
     with self.cached_session() as sess:
-      self.assertAllClose([3., 4.], sess.run(dy_dx))
+      self.assertAllClose([3., 4.], self.evaluate(dy_dx))
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient1(self):
     x = constant_op.constant([3.9, 4.1])
     k = math_ops.to_float(math_ops.to_int32(x))
@@ -67,6 +73,7 @@ class GradientCorrectnessTest(test.TestCase):
     dy_dx, = gradients_impl.gradients(y, x)
     self.assertIsNone(dy_dx)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient2(self):
     k = constant_op.constant([3, 4])
     x = math_ops.to_float(k)
@@ -74,18 +81,21 @@ class GradientCorrectnessTest(test.TestCase):
     dy_dk, = gradients_impl.gradients(y, k)
     self.assertIsNone(dy_dk)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient3(self):
     k = constant_op.constant([3, 4])
     m = k * k
     dm_dk, = gradients_impl.gradients(m, k)
     self.assertIsNone(dm_dk)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient4(self):
     k = constant_op.constant([3, 4])
     m = k * k * k
     dm_dk, = gradients_impl.gradients(m, k)
     self.assertIsNone(dm_dk)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient5(self):
     k = constant_op.constant([3, 4])
     m = k * k
@@ -93,6 +103,7 @@ class GradientCorrectnessTest(test.TestCase):
     dn_dk, = gradients_impl.gradients(n, k)
     self.assertIsNone(dn_dk)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient6(self):
     k = constant_op.constant(3)
     x = math_ops.to_float(k)
diff --git a/tensorflow/python/kernel_tests/huge_slice_op_test.py b/tensorflow/python/kernel_tests/huge_slice_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4074946350aa5ce753a39fb173346d1d4f7fe3c7
--- /dev/null
+++ b/tensorflow/python/kernel_tests/huge_slice_op_test.py
@@ -0,0 +1,43 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for slice op that consume a lot of GPU memory."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class SliceTest(test.TestCase):
+
+  def testInt64Slicing(self):
+    with self.cached_session(force_gpu=test.is_gpu_available()):
+      a_large = array_ops.tile(
+          constant_op.constant(np.array([False, True] * 4)), [2**29 + 3])
+      slice_t = array_ops.slice(a_large, np.asarray([3]).astype(np.int64), [3])
+      slice_val = self.evaluate(slice_t)
+      self.assertAllEqual([True, False, True], slice_val)
+
+      slice_t = array_ops.slice(
+          a_large, constant_op.constant([long(2)**32 + 3], dtype=dtypes.int64),
+          [3])
+      slice_val = self.evaluate(slice_t)
+      self.assertAllEqual([True, False, True], slice_val)
diff --git a/tensorflow/python/kernel_tests/identity_n_op_py_test.py b/tensorflow/python/kernel_tests/identity_n_op_py_test.py
index 518733cd8e9064cc5d4637225295571c072a0fc6..a1110d640f01dd5cdfe51fa26c85760ada705b8d 100644
--- a/tensorflow/python/kernel_tests/identity_n_op_py_test.py
+++ b/tensorflow/python/kernel_tests/identity_n_op_py_test.py
@@ -21,12 +21,14 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class IdentityNOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInt32String_6(self):
     with self.cached_session() as sess:
       [value0, value1] = sess.run(
@@ -36,6 +38,7 @@ class IdentityNOpTest(test.TestCase):
     self.assertAllEqual(
         np.array([b"a", b"b", b"C", b"d", b"E", b"f", b"g"]), value1)
 
+  @test_util.run_deprecated_v1
   def testInt32_shapes(self):
     with self.cached_session() as sess:
       inp0 = constant_op.constant([10, 20, 30, 40, 50, 60], shape=[2, 3])
@@ -50,6 +53,7 @@ class IdentityNOpTest(test.TestCase):
         np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]),
         value2)
 
+  @test_util.run_deprecated_v1
   def testString(self):
     source = [b"A", b"b", b"C", b"d", b"E", b"f"]
     with self.cached_session() as sess:
diff --git a/tensorflow/python/kernel_tests/identity_op_py_test.py b/tensorflow/python/kernel_tests/identity_op_py_test.py
index 88ea10c22a30742f62b56b490f00e9fe387cbfa0..40ec9db4226a89305732683118f7f906db1ba965 100644
--- a/tensorflow/python/kernel_tests/identity_op_py_test.py
+++ b/tensorflow/python/kernel_tests/identity_op_py_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import variables
@@ -30,17 +31,20 @@ from tensorflow.python.platform import test
 
 class IdentityOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInt32_6(self):
     with self.cached_session():
       value = array_ops.identity([1, 2, 3, 4, 5, 6]).eval()
     self.assertAllEqual(np.array([1, 2, 3, 4, 5, 6]), value)
 
+  @test_util.run_deprecated_v1
   def testInt32_2_3(self):
     with self.cached_session():
       inp = constant_op.constant([10, 20, 30, 40, 50, 60], shape=[2, 3])
       value = array_ops.identity(inp).eval()
     self.assertAllEqual(np.array([[10, 20, 30], [40, 50, 60]]), value)
 
+  @test_util.run_deprecated_v1
   def testString(self):
     source = [b"A", b"b", b"C", b"d", b"E", b"f"]
     with self.cached_session():
@@ -58,6 +62,7 @@ class IdentityOpTest(test.TestCase):
       self.assertEquals(shape,
                         array_ops.identity(np.array(array_2x3)).get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testRefIdentityShape(self):
     with self.cached_session():
       shape = [2, 3]
diff --git a/tensorflow/python/kernel_tests/in_topk_op_test.py b/tensorflow/python/kernel_tests/in_topk_op_test.py
index 6fdb497bc6f8d15d54b9d35ed8c15ed9caceb1db..507822b3142a77a3782be52a3d19bb9bd664b684 100644
--- a/tensorflow/python/kernel_tests/in_topk_op_test.py
+++ b/tensorflow/python/kernel_tests/in_topk_op_test.py
@@ -32,7 +32,7 @@ class InTopKTest(test.TestCase):
     np_ans = np.array(expected)
     with self.cached_session():
       precision = nn_ops.in_top_k(predictions, target, k)
-      out = precision.eval()
+      out = self.evaluate(precision)
       self.assertAllClose(np_ans, out)
       self.assertShapeEqual(np_ans, precision)
 
@@ -77,7 +77,7 @@ class InTopKTest(test.TestCase):
     np_ans = np.array([False, True])
     with self.cached_session():
       precision = nn_ops.in_top_k(predictions, target, k)
-      out = precision.eval()
+      out = self.evaluate(precision)
       self.assertAllClose(np_ans, out)
       self.assertShapeEqual(np_ans, precision)
 
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 292679e4b996841009f8a2bfdf3c267f83c9e39b..09b9944baa1d92bfbcd484f5dba45cea28e6eafe 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -106,40 +107,45 @@ def _init_sampler(tc, init, num):
 
 class ConstantInitializersTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testZerosInitializer(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = [2, 3]
       x = variable_scope.get_variable(
           "x", shape=shape, initializer=init_ops.zeros_initializer())
       x.initializer.run()
       self.assertAllEqual(x.eval(), np.zeros(shape))
 
+  @test_util.run_deprecated_v1
   def testOnesInitializer(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = [2, 3]
       x = variable_scope.get_variable(
           "x", shape=shape, initializer=init_ops.ones_initializer())
       x.initializer.run()
       self.assertAllEqual(x.eval(), np.ones(shape))
 
+  @test_util.run_deprecated_v1
   def testConstantZeroInitializer(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = [2, 3]
       x = variable_scope.get_variable(
           "x", shape=shape, initializer=init_ops.constant_initializer(0.0))
       x.initializer.run()
       self.assertAllEqual(x.eval(), np.zeros(shape))
 
+  @test_util.run_deprecated_v1
   def testConstantOneInitializer(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = [2, 3]
       x = variable_scope.get_variable(
           "x", shape=shape, initializer=init_ops.constant_initializer(1.0))
       x.initializer.run()
       self.assertAllEqual(x.eval(), np.ones(shape))
 
+  @test_util.run_deprecated_v1
   def testConstantIntInitializer(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = [2, 3]
       x = variable_scope.get_variable(
           "x",
@@ -150,8 +156,9 @@ class ConstantInitializersTest(test.TestCase):
       self.assertEqual(x.dtype.base_dtype, dtypes.int32)
       self.assertAllEqual(x.eval(), 7 * np.ones(shape, dtype=np.int32))
 
+  @test_util.run_deprecated_v1
   def testConstantTupleInitializer(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = [3]
       x = variable_scope.get_variable(
           "x",
@@ -163,7 +170,7 @@ class ConstantInitializersTest(test.TestCase):
       self.assertAllEqual(x.eval(), [10, 20, 30])
 
   def _testNDimConstantInitializer(self, name, value, shape, expected):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       init = init_ops.constant_initializer(value, dtype=dtypes.int32)
       x = variable_scope.get_variable(name, shape=shape, initializer=init)
       x.initializer.run()
@@ -173,6 +180,7 @@ class ConstantInitializersTest(test.TestCase):
       for a, e in zip(actual, expected):
         self.assertEqual(a, e)
 
+  @test_util.run_deprecated_v1
   def testNDimConstantInitializer(self):
     value = [0, 1, 2, 3, 4, 5]
     shape = [2, 3]
@@ -187,7 +195,7 @@ class ConstantInitializersTest(test.TestCase):
 
   def _testNDimConstantInitializerLessValues(self, name, value, shape,
                                              expected):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       init = init_ops.constant_initializer(value, dtype=dtypes.int32)
       x = variable_scope.get_variable(name, shape=shape, initializer=init)
       x.initializer.run()
@@ -199,6 +207,7 @@ class ConstantInitializersTest(test.TestCase):
         e = expected[i] if i < len(expected) else expected[-1]
         self.assertEqual(a, e)
 
+  @test_util.run_deprecated_v1
   def testNDimConstantInitializerLessValues(self):
     value = [0, 1, 2, 3, 4, 5]
     shape = [2, 4]
@@ -213,7 +222,7 @@ class ConstantInitializersTest(test.TestCase):
 
   def _testNDimConstantInitializerMoreValues(self, value, shape):
     ops.reset_default_graph()
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       init = init_ops.constant_initializer(value, dtype=dtypes.int32)
       self.assertRaises(
           ValueError,
@@ -222,6 +231,7 @@ class ConstantInitializersTest(test.TestCase):
           shape=shape,
           initializer=init)
 
+  @test_util.run_deprecated_v1
   def testNDimConstantInitializerMoreValues(self):
     value = [0, 1, 2, 3, 4, 5, 6, 7]
     shape = [2, 3]
@@ -243,18 +253,21 @@ class ConstantInitializersTest(test.TestCase):
 
 class RandomNormalInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.random_normal_initializer(0.0, 1.0, seed=1, dtype=dtype)
       init2 = init_ops.random_normal_initializer(0.0, 1.0, seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.random_normal_initializer(0.0, 1.0, seed=1, dtype=dtype)
       init2 = init_ops.random_normal_initializer(0.0, 1.0, seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.random_normal_initializer(0.0, 1.0)
     self.assertFalse(duplicated_initializer(self, init, 1))
@@ -270,6 +283,7 @@ class RandomNormalInitializationTest(test.TestCase):
 
 class TruncatedNormalInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.truncated_normal_initializer(
@@ -278,6 +292,7 @@ class TruncatedNormalInitializationTest(test.TestCase):
           0.0, 1.0, seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.truncated_normal_initializer(
@@ -286,6 +301,7 @@ class TruncatedNormalInitializationTest(test.TestCase):
           0.0, 1.0, seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.truncated_normal_initializer(0.0, 1.0)
     self.assertFalse(duplicated_initializer(self, init, 1))
@@ -301,18 +317,21 @@ class TruncatedNormalInitializationTest(test.TestCase):
 
 class RandomUniformInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64, dtypes.int64]:
       init1 = init_ops.random_uniform_initializer(0, 7, seed=1, dtype=dtype)
       init2 = init_ops.random_uniform_initializer(0, 7, seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64]:
       init1 = init_ops.random_uniform_initializer(0, 7, seed=1, dtype=dtype)
       init2 = init_ops.random_uniform_initializer(0, 7, seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.random_uniform_initializer(0.0, 1.0)
     self.assertFalse(duplicated_initializer(self, init, 1))
@@ -320,6 +339,7 @@ class RandomUniformInitializationTest(test.TestCase):
 
 class UniformUnitScalingInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.uniform_unit_scaling_initializer(seed=1, dtype=dtype)
@@ -331,6 +351,7 @@ class UniformUnitScalingInitializationTest(test.TestCase):
           1.5, seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init3, init4))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.uniform_unit_scaling_initializer(seed=1, dtype=dtype)
@@ -341,6 +362,7 @@ class UniformUnitScalingInitializationTest(test.TestCase):
       self.assertFalse(identicaltest(self, init1, init3))
       self.assertFalse(identicaltest(self, init2, init3))
 
+  @test_util.run_deprecated_v1
   def testZeroSize(self):
     shape = [0, 2]
     with self.cached_session():
@@ -349,8 +371,9 @@ class UniformUnitScalingInitializationTest(test.TestCase):
           shape=shape,
           initializer=init_ops.uniform_unit_scaling_initializer())
       variables.global_variables_initializer().run()
-      self.assertAllEqual(shape, x.eval().shape)
+      self.assertAllEqual(shape, self.evaluate(x).shape)
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.uniform_unit_scaling_initializer()
     self.assertFalse(duplicated_initializer(self, init, 1))
@@ -364,6 +387,7 @@ class UniformUnitScalingInitializationTest(test.TestCase):
 
 class VarianceScalingInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testTruncatedNormalDistribution(self):
     shape = [100, 100]
     expect_mean = 0.
@@ -371,7 +395,7 @@ class VarianceScalingInitializationTest(test.TestCase):
     init = init_ops.variance_scaling_initializer(
         distribution='truncated_normal')
 
-    with self.test_session(use_gpu=True), \
+    with self.session(use_gpu=True), \
       test.mock.patch.object(
           random_ops, 'truncated_normal', wraps=random_ops.truncated_normal) \
           as mock_truncated_normal:
@@ -381,13 +405,14 @@ class VarianceScalingInitializationTest(test.TestCase):
     self.assertNear(np.mean(x), expect_mean, err=1e-2)
     self.assertNear(np.var(x), expect_var, err=1e-2)
 
+  @test_util.run_deprecated_v1
   def testNormalDistribution(self):
     shape = [100, 100]
     expect_mean = 0.
     expect_var = 1. / shape[0]
     init = init_ops.variance_scaling_initializer(distribution='normal')
 
-    with self.test_session(use_gpu=True), \
+    with self.session(use_gpu=True), \
       test.mock.patch.object(
           random_ops, 'truncated_normal', wraps=random_ops.truncated_normal) \
           as mock_truncated_normal:
@@ -397,6 +422,7 @@ class VarianceScalingInitializationTest(test.TestCase):
     self.assertNear(np.mean(x), expect_mean, err=1e-2)
     self.assertNear(np.var(x), expect_var, err=1e-2)
 
+  @test_util.run_deprecated_v1
   def testUntruncatedNormalDistribution(self):
     shape = [100, 100]
     expect_mean = 0.
@@ -404,7 +430,7 @@ class VarianceScalingInitializationTest(test.TestCase):
     init = init_ops.variance_scaling_initializer(
         distribution='untruncated_normal')
 
-    with self.test_session(use_gpu=True), \
+    with self.session(use_gpu=True), \
       test.mock.patch.object(
           random_ops, 'random_normal', wraps=random_ops.random_normal) \
           as mock_random_normal:
@@ -414,13 +440,14 @@ class VarianceScalingInitializationTest(test.TestCase):
     self.assertNear(np.mean(x), expect_mean, err=1e-2)
     self.assertNear(np.var(x), expect_var, err=1e-2)
 
+  @test_util.run_deprecated_v1
   def testUniformDistribution(self):
     shape = [100, 100]
     expect_mean = 0.
     expect_var = 1. / shape[0]
     init = init_ops.variance_scaling_initializer(distribution='uniform')
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = init(shape).eval()
 
     self.assertNear(np.mean(x), expect_mean, err=1e-2)
@@ -431,11 +458,11 @@ class VarianceScalingInitializationTest(test.TestCase):
 class RangeTest(test.TestCase):
 
   def _Range(self, start, limit, delta):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_ans = math_ops.range(start, limit, delta, name="range")
       self.assertEqual([len(np.arange(start, limit, delta))],
                        tf_ans.get_shape())
-      return tf_ans.eval()
+      return self.evaluate(tf_ans)
 
   def testBasic(self):
     self.assertTrue(
@@ -449,8 +476,9 @@ class RangeTest(test.TestCase):
             self._Range(100, 500, 100), np.array([100, 200, 300, 400])))
     self.assertEqual(math_ops.range(0, 5, 1).dtype, dtypes.int32)
 
+  @test_util.run_deprecated_v1
   def testLimitOnly(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(np.arange(5), math_ops.range(5).eval())
 
   def testEmpty(self):
@@ -520,12 +548,11 @@ class LinSpaceTest(test.TestCase):
       return [False]
 
   def _LinSpace(self, start, stop, num):
-    # NOTE(touts): Needs to pass a graph to get a new session each time.
     with ops.Graph().as_default() as graph:
       with self.session(graph=graph, force_gpu=self.force_gpu):
         tf_ans = math_ops.linspace(start, stop, num, name="linspace")
         self.assertEqual([num], tf_ans.get_shape())
-        return tf_ans.eval()
+        return self.evaluate(tf_ans)
 
   def testPositive(self):
     for self.force_gpu in self._gpu_modes():
@@ -584,18 +611,21 @@ class DeviceTest(test.TestCase):
 
 class OrthogonalInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.orthogonal_initializer(seed=1, dtype=dtype)
       init2 = init_ops.orthogonal_initializer(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.orthogonal_initializer(seed=1, dtype=dtype)
       init2 = init_ops.orthogonal_initializer(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.orthogonal_initializer()
     self.assertFalse(duplicated_initializer(self, init, 1, (10, 10)))
@@ -609,6 +639,7 @@ class OrthogonalInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -617,8 +648,9 @@ class OrthogonalInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       for shape in [(10, 10), (10, 9, 8), (100, 5, 5), (50, 40), (40, 50)]:
@@ -640,18 +672,21 @@ class OrthogonalInitializerTest(test.TestCase):
 
 class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_delta_orthogonal(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.convolutional_delta_orthogonal()
     self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 10, 10)))
@@ -666,6 +701,7 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[3, 3, 6, 5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (3, 3, 10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -675,8 +711,9 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     gain = 3.14
     for dtype in [dtypes.float32]:
@@ -704,28 +741,28 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
         outputs_2norm = linalg_ops.norm(outputs)
         ratio = outputs_2norm / inputs_2norm
         my_ops = variables.global_variables_initializer()
-        with self.test_session(use_gpu=True) as sess:
-          sess.run(my_ops)
+        with self.session(use_gpu=True) as sess:
+          self.evaluate(my_ops)
           # Check the shape of the outputs
-          t = outputs.eval()
+          t = self.evaluate(outputs)
           self.assertAllEqual(t.shape, outputs_shape)
           # Check isometry of the delta-orthogonal kernel.
-          self.assertAllClose(sess.run(ratio), np.sqrt(gain),
-                              rtol=tol, atol=tol)
+          self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testNonuniformity(self):
     value = 0
     abs_value = 0
     shape = [3, 3, 10, 10]
     count = 70
     tol = 1e-5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for i in range(count):
         x = variable_scope.get_variable("{}".format(i), shape=shape,
                                         initializer=
                                         init_ops.convolutional_delta_orthogonal)
         x.initializer.run()
-        y = x.eval()[1, 1, :, :]
+        y = self.evaluate(x)[1, 1, :, :]
         determinant = np.linalg.det(y)
         value += determinant
         abs_value += np.abs(determinant)
@@ -740,18 +777,21 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
 
 class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_1d(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.convolutional_orthogonal_1d()
     self.assertFalse(duplicated_initializer(self, init, 1, (3, 10, 10)))
@@ -766,6 +806,7 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[3, 6, 5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (3, 10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -775,15 +816,16 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testNonuniformity(self):
     value = 0
     abs_value = 0
     shape = [3, 10, 10]
     count = 70
     tol = 1e-5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for i in range(count):
         x = variable_scope.get_variable("{}".format(i), shape=shape,
                                         initializer=
@@ -801,6 +843,7 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
       # Compute the sum of the absolute values of 'count' determinants
       self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     def circular_pad(input_, width, kernel_size):
       """Pad input_ for computing (circular) convolution.
@@ -843,29 +886,32 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
       outputs_2norm = linalg_ops.norm(outputs)
       ratio = outputs_2norm / inputs_2norm
       my_ops = variables.global_variables_initializer()
-      with self.test_session(use_gpu=True) as sess:
-        sess.run(my_ops)
+      with self.session(use_gpu=True) as sess:
+        self.evaluate(my_ops)
         # Check the shape of the outputs
-        t = outputs.eval()
+        t = self.evaluate(outputs)
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+        self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
 
 class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_2d(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.convolutional_orthogonal_2d()
     self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 10, 10)))
@@ -880,6 +926,7 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[3, 3, 6, 5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (3, 3, 10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -889,8 +936,9 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     def circular_pad(input_, width, kernel_size):
       """Pad input_ for computing (circular) convolution.
@@ -938,29 +986,32 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
       outputs_2norm = linalg_ops.norm(outputs)
       ratio = outputs_2norm / inputs_2norm
       my_ops = variables.global_variables_initializer()
-      with self.test_session(use_gpu=True) as sess:
-        sess.run(my_ops)
+      with self.session(use_gpu=True) as sess:
+        self.evaluate(my_ops)
         # Check the shape of the outputs
-        t = outputs.eval()
+        t = self.evaluate(outputs)
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+        self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
 
 class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (3, 3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_3d(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (3, 3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.convolutional_orthogonal_3d()
     self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 3, 10, 10)))
@@ -975,6 +1026,7 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[3, 3, 3, 6, 5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (3, 3, 3, 10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -984,15 +1036,16 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testNonuniformity(self):
     value = 0
     abs_value = 0
     shape = [3, 3, 3, 5, 5]
     count = 20
     tol = 1e-5
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for i in range(count):
         x = variable_scope.get_variable("{}".format(i), shape=shape,
                                         initializer=
@@ -1010,6 +1063,7 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
       # Compute the sum of the absolute values of 'count' determinants
       self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     def circular_pad(input_, width, kernel_size):
       """Padding input_ for computing circular convolution.
@@ -1063,13 +1117,13 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
       outputs_2norm = linalg_ops.norm(outputs)
       ratio = outputs_2norm / inputs_2norm
       my_ops = variables.global_variables_initializer()
-      with self.test_session(use_gpu=True) as sess:
-        sess.run(my_ops)
+      with self.cached_session(use_gpu=True) as sess:
+        self.evaluate(my_ops)
         # Check the shape of the outputs
-        t = outputs.eval()
+        t = self.evaluate(outputs)
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+        self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
 
 class IdentityInitializerTest(test.TestCase):
@@ -1085,12 +1139,14 @@ class IdentityInitializerTest(test.TestCase):
       self.assertRaises(ValueError, init, shape=[5])
       self.assertRaises(ValueError, init, shape=[])
 
+  @test_util.run_deprecated_v1
   def testNonSquare(self):
     init = init_ops.identity_initializer()
     shape = (10, 5)
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertAllClose(init(shape).eval(), np.eye(*shape))
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -1101,6 +1157,7 @@ class IdentityInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         self.assertAllClose(init_custom(shape).eval(), np.eye(*shape) * 0.9)
 
+  @test_util.run_deprecated_v1
   def testPartitions(self):
     shape = (10, 10)
     init = init_ops.identity_initializer()
diff --git a/tensorflow/python/kernel_tests/inplace_ops_test.py b/tensorflow/python/kernel_tests/inplace_ops_test.py
index 90759c23ae3924a59eb5ff2b5755ede9631f7b52..9eaaac7a24849600a54a755b80f4418ec905a0bf 100644
--- a/tensorflow/python/kernel_tests/inplace_ops_test.py
+++ b/tensorflow/python/kernel_tests/inplace_ops_test.py
@@ -31,9 +31,10 @@ from tensorflow.python.platform import test as test_lib
 
 class InplaceOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicUpdate(self):
     for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         x = array_ops.ones([7, 3], dtype)
         y = np.ones([7, 3], dtype.as_numpy_dtype)
         self.assertAllClose(x.eval(), y)
@@ -48,8 +49,9 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
         y[5, :] = 7
         self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testBasicUpdateBool(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = array_ops.ones([7, 3], dtypes.bool)
       y = np.ones([7, 3], dtypes.bool.as_numpy_dtype)
       self.assertAllClose(x.eval(), y)
@@ -65,9 +67,10 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
       y[5, :] = False
       self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testBasicAdd(self):
     for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = array_ops.ones([7, 3], dtype)
         y = np.ones([7, 3], dtype.as_numpy_dtype)
         self.assertAllClose(x.eval(), y)
@@ -84,9 +87,10 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
         y[:, :] += 99
         self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testBasicSub(self):
     for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = array_ops.ones([7, 3], dtype)
         y = np.ones([7, 3], dtype.as_numpy_dtype)
         self.assertAllClose(x.eval(), y)
@@ -103,8 +107,9 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
         y[:, :] -= 99
         self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       d0, d1, d2 = 100, 3, 5
       x = array_ops.zeros([d0, d1, d2])
       y = np.zeros([d0, d1, d2])
@@ -123,8 +128,9 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
           y[idx, :] -= val
         self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testRandom1D(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       d0 = 100
       x = array_ops.zeros([d0])
       y = np.zeros([d0])
@@ -144,12 +150,12 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
         self.assertAllClose(x.eval(), y)
 
   def testAlias(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       x = array_ops.ones([2, 3])
       y = inplace_ops.alias_inplace_add(x, [0], [[1, 2, 3]])
       with ops.control_dependencies([y]):
         z = array_ops.identity(x)
-        _, vy, vz = sess.run([x, y, z])
+        _, vy, vz = self.evaluate([x, y, z])
       self.assertAllClose(vy, vz)
 
   def testError(self):
@@ -164,12 +170,13 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
                                    "i and x shape doesn't match"):
         _ = inplace_ops.inplace_update([[1.]], [0, 1], [[10]]).eval()
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     for dtype in [
         dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64, dtypes.bool,
         dtypes.uint8
     ]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         test_shapes = [(), (1,), (2, 3), (0, 2), (2, 3, 5), (2, 0, 5)]
         for shape in test_shapes:
           val = inplace_ops.empty(shape, dtype).eval()
@@ -188,7 +195,7 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
           self.assertEqual(val.dtype, dtype.as_numpy_dtype)
           self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       val = inplace_ops.empty((1, 2), dtypes.string, init=True).eval()
       self.assertEqual(val.tolist(), [[b"", b""]])
 
diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py
index afa24195cb3fe08d7ed474242d90276861b87f85..c5df5231bf6fd945c41ac1c99fe6a613ca05fca6 100644
--- a/tensorflow/python/kernel_tests/io_ops_test.py
+++ b/tensorflow/python/kernel_tests/io_ops_test.py
@@ -23,6 +23,7 @@ import os
 import shutil
 import tempfile
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
@@ -30,6 +31,7 @@ from tensorflow.python.util import compat
 
 class IoOpsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testReadFile(self):
     cases = ['', 'Some contents', 'Неки садржаји на српском']
     for contents in cases:
@@ -53,7 +55,7 @@ class IoOpsTest(test.TestCase):
         pass
       with self.cached_session() as sess:
         w = io_ops.write_file(temp.name, contents)
-        sess.run(w)
+        self.evaluate(w)
         with open(temp.name, 'rb') as f:
           file_contents = f.read()
         self.assertEqual(file_contents, contents)
@@ -67,7 +69,7 @@ class IoOpsTest(test.TestCase):
       filepath = os.path.join(subdir, 'subdir2', 'filename')
       with self.cached_session() as sess:
         w = io_ops.write_file(filepath, contents)
-        sess.run(w)
+        self.evaluate(w)
         with open(filepath, 'rb') as f:
           file_contents = f.read()
         self.assertEqual(file_contents, contents)
@@ -78,6 +80,7 @@ class IoOpsTest(test.TestCase):
         compat.as_bytes(files[i].name) for i in range(len(files))
         if i in indices)
 
+  @test_util.run_deprecated_v1
   def testMatchingFiles(self):
     cases = [
         'ABcDEF.GH', 'ABzDEF.GH', 'ABasdfjklDEF.GH', 'AB3DEF.GH', 'AB4DEF.GH',
diff --git a/tensorflow/python/kernel_tests/large_concat_op_test.py b/tensorflow/python/kernel_tests/large_concat_op_test.py
index 66afb6ec014991ca32efd5b0895ff695d3d1015f..bf6fa9ea71f391287a7c21d042ae67ed57c9fc2b 100644
--- a/tensorflow/python/kernel_tests/large_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/large_concat_op_test.py
@@ -32,10 +32,10 @@ class LargeConcatOpTest(test.TestCase):
       a = array_ops.ones([2**31 + 6], dtype=dtypes.int8)
       b = array_ops.zeros([1024], dtype=dtypes.int8)
       onezeros = array_ops.concat([a, b], 0)
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       # TODO(dga):  Add more depth to this test to validate correctness,
       # not just non-crashingness, once other large tensor fixes have gone in.
-      _ = onezeros.eval()
+      _ = self.evaluate(onezeros)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index be2e31cb5adec71f7c55633441f7eca23f3ec2b5..ba9e64979a48ccce82a283e74a1a024c4bcceda8 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -40,6 +40,44 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "linear_operator_adjoint_test",
+    size = "medium",
+    srcs = ["linear_operator_adjoint_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 5,
+    tags = [
+        "noasan",  # times out, b/63678675
+        "optonly",  # times out
+    ],
+)
+
+cuda_py_test(
+    name = "linear_operator_algebra_test",
+    size = "small",
+    srcs = ["linear_operator_algebra_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "linear_operator_block_diag_test",
     size = "medium",
@@ -89,7 +127,6 @@ cuda_py_test(
     size = "medium",
     srcs = ["linear_operator_circulant_test.py"],
     additional_deps = [
-        "//tensorflow/python/ops/linalg",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:spectral_ops_test_util",
@@ -99,6 +136,8 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/signal",
     ],
     shard_count = 5,
     tags = [
@@ -150,6 +189,28 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "linear_operator_inversion_test",
+    size = "medium",
+    srcs = ["linear_operator_inversion_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 5,
+    tags = [
+        "noasan",  # times out, b/63678675
+        "optonly",  # times out
+    ],
+)
+
 cuda_py_test(
     name = "linear_operator_full_matrix_test",
     size = "medium",
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_addition_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_addition_test.py
index cf56168d637d5f252af99269a588f8ba68d41c6a..627349c69b315297d6832576200b28c5b5e2d12f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_addition_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_addition_test.py
@@ -19,14 +19,13 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_addition
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
-random_seed.set_random_seed(23)
 rng = np.random.RandomState(0)
 
 add_operators = linear_operator_addition.add_operators
@@ -71,6 +70,7 @@ class LinearOperatorAdditionCorrectnessTest(test.TestCase):
     with self.assertRaisesRegexp(TypeError, "contain only LinearOperator"):
       add_operators([1, 2])
 
+  @test_util.run_deprecated_v1
   def test_two_diag_operators(self):
     op_a = linalg.LinearOperatorDiag(
         [1., 1.], is_positive_definite=True, name="A")
@@ -91,6 +91,7 @@ class LinearOperatorAdditionCorrectnessTest(test.TestCase):
       # Enforce particular name for this simple case
       self.assertEqual("Add/B__A/", op.name)
 
+  @test_util.run_deprecated_v1
   def test_three_diag_operators(self):
     op1 = linalg.LinearOperatorDiag(
         [1., 1.], is_positive_definite=True, name="op1")
@@ -111,6 +112,7 @@ class LinearOperatorAdditionCorrectnessTest(test.TestCase):
       # Positive definite ==> non-singular
       self.assertTrue(op.is_non_singular)
 
+  @test_util.run_deprecated_v1
   def test_diag_tril_diag(self):
     op1 = linalg.LinearOperatorDiag(
         [1., 1.], is_non_singular=True, name="diag_a")
@@ -136,6 +138,7 @@ class LinearOperatorAdditionCorrectnessTest(test.TestCase):
       # Since no custom hint was provided, we default to None (unknown).
       self.assertEqual(None, op.is_non_singular)
 
+  @test_util.run_deprecated_v1
   def test_matrix_diag_tril_diag_uses_custom_name(self):
     op0 = linalg.LinearOperatorFullMatrix(
         [[-1., -1.], [-1., -1.]], name="matrix")
@@ -219,6 +222,7 @@ class LinearOperatorOrderOfAdditionTest(test.TestCase):
     self.assertEqual(1, len(op_sum))
     self.assertIsInstance(op_sum[0], linalg.LinearOperatorLowerTriangular)
 
+  @test_util.run_deprecated_v1
   def test_cannot_add_everything_so_return_more_than_one_operator(self):
     diag1 = linalg.LinearOperatorDiag([1.])
     diag2 = linalg.LinearOperatorDiag([2.])
@@ -263,6 +267,7 @@ class AddAndReturnScaledIdentityTest(test.TestCase):
   def setUp(self):
     self._adder = linear_operator_addition._AddAndReturnScaledIdentity()
 
+  @test_util.run_deprecated_v1
   def test_identity_plus_identity(self):
     id1 = linalg.LinearOperatorIdentity(num_rows=2)
     id2 = linalg.LinearOperatorIdentity(num_rows=2, batch_shape=[3])
@@ -281,6 +286,7 @@ class AddAndReturnScaledIdentityTest(test.TestCase):
     self.assertTrue(operator.is_non_singular)
     self.assertEqual("my_operator", operator.name)
 
+  @test_util.run_deprecated_v1
   def test_identity_plus_scaled_identity(self):
     id1 = linalg.LinearOperatorIdentity(num_rows=2, batch_shape=[3])
     id2 = linalg.LinearOperatorScaledIdentity(num_rows=2, multiplier=2.2)
@@ -299,6 +305,7 @@ class AddAndReturnScaledIdentityTest(test.TestCase):
     self.assertTrue(operator.is_non_singular)
     self.assertEqual("my_operator", operator.name)
 
+  @test_util.run_deprecated_v1
   def test_scaled_identity_plus_scaled_identity(self):
     id1 = linalg.LinearOperatorScaledIdentity(
         num_rows=2, multiplier=[2.2, 2.2, 2.2])
@@ -324,6 +331,7 @@ class AddAndReturnDiagTest(test.TestCase):
   def setUp(self):
     self._adder = linear_operator_addition._AddAndReturnDiag()
 
+  @test_util.run_deprecated_v1
   def test_identity_plus_identity_returns_diag(self):
     id1 = linalg.LinearOperatorIdentity(num_rows=2)
     id2 = linalg.LinearOperatorIdentity(num_rows=2, batch_shape=[3])
@@ -342,6 +350,7 @@ class AddAndReturnDiagTest(test.TestCase):
     self.assertTrue(operator.is_non_singular)
     self.assertEqual("my_operator", operator.name)
 
+  @test_util.run_deprecated_v1
   def test_diag_plus_diag(self):
     diag1 = rng.rand(2, 3, 4)
     diag2 = rng.rand(4)
@@ -368,6 +377,7 @@ class AddAndReturnTriLTest(test.TestCase):
   def setUp(self):
     self._adder = linear_operator_addition._AddAndReturnTriL()
 
+  @test_util.run_deprecated_v1
   def test_diag_plus_tril(self):
     diag = linalg.LinearOperatorDiag([1., 2.])
     tril = linalg.LinearOperatorLowerTriangular([[10., 0.], [30., 0.]])
@@ -391,6 +401,7 @@ class AddAndReturnMatrixTest(test.TestCase):
   def setUp(self):
     self._adder = linear_operator_addition._AddAndReturnMatrix()
 
+  @test_util.run_deprecated_v1
   def test_diag_plus_diag(self):
     diag1 = linalg.LinearOperatorDiag([1., 2.])
     diag2 = linalg.LinearOperatorDiag([-1., 3.])
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bed4b5268e8d27a25ab735f7e3e1a6c9e4d5d95
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
@@ -0,0 +1,118 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_adjoint
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+
+LinearOperatorAdjoint = linear_operator_adjoint.LinearOperatorAdjoint  # pylint: disable=invalid-name
+
+
+class LinearOperatorAdjointTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  def setUp(self):
+    self._atol[dtypes.complex64] = 1e-5
+    self._rtol[dtypes.complex64] = 1e-5
+
+  def _operator_and_matrix(self,
+                           build_info,
+                           dtype,
+                           use_placeholder,
+                           ensure_self_adjoint_and_pd=False):
+    shape = list(build_info.shape)
+
+    if ensure_self_adjoint_and_pd:
+      matrix = linear_operator_test_util.random_positive_definite_matrix(
+          shape, dtype, force_well_conditioned=True)
+    else:
+      matrix = linear_operator_test_util.random_tril_matrix(
+          shape, dtype, force_well_conditioned=True, remove_upper=True)
+
+    lin_op_matrix = matrix
+
+    if use_placeholder:
+      lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
+
+    if ensure_self_adjoint_and_pd:
+      operator = LinearOperatorAdjoint(
+          linalg.LinearOperatorFullMatrix(
+              lin_op_matrix, is_positive_definite=True, is_self_adjoint=True))
+    else:
+      operator = LinearOperatorAdjoint(
+          linalg.LinearOperatorLowerTriangular(lin_op_matrix))
+
+    return operator, linalg.adjoint(matrix)
+
+  def test_base_operator_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    operator_adjoint = LinearOperatorAdjoint(operator)
+    self.assertTrue(operator_adjoint.is_positive_definite)
+    self.assertTrue(operator_adjoint.is_non_singular)
+    self.assertFalse(operator_adjoint.is_self_adjoint)
+
+  def test_supplied_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    operator_adjoint = LinearOperatorAdjoint(
+        operator,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    self.assertTrue(operator_adjoint.is_positive_definite)
+    self.assertTrue(operator_adjoint.is_non_singular)
+    self.assertFalse(operator_adjoint.is_self_adjoint)
+
+  def test_contradicting_hints_raise(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_positive_definite=False)
+    with self.assertRaisesRegexp(ValueError, "positive-definite"):
+      LinearOperatorAdjoint(operator, is_positive_definite=True)
+
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=False)
+    with self.assertRaisesRegexp(ValueError, "self-adjoint"):
+      LinearOperatorAdjoint(operator, is_self_adjoint=True)
+
+  def test_name(self):
+    matrix = [[11., 0.], [1., 8.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, name="my_operator", is_non_singular=True)
+
+    operator = LinearOperatorAdjoint(operator)
+
+    self.assertEqual("my_operator_adjoint", operator.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e296c026c09b36afd39b891befb767a222f5f19
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
@@ -0,0 +1,133 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for registration mechanisms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.linalg import cholesky_registrations  # pylint: disable=unused-import
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import matmul_registrations  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+# pylint: disable=protected-access
+_CHOLESKY_DECOMPS = linear_operator_algebra._CHOLESKY_DECOMPS
+_MATMUL = linear_operator_algebra._MATMUL
+_registered_cholesky = linear_operator_algebra._registered_cholesky
+_registered_matmul = linear_operator_algebra._registered_matmul
+# pylint: enable=protected-access
+
+
+class CholeskyTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Cholesky to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterCholesky(CustomLinOp)
+    def _cholesky(a):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    with self.assertRaisesRegexp(ValueError, "positive definite"):
+      CustomLinOp(dtype=None, is_self_adjoint=True).cholesky()
+
+    with self.assertRaisesRegexp(ValueError, "self adjoint"):
+      CustomLinOp(dtype=None, is_positive_definite=True).cholesky()
+
+    custom_linop = CustomLinOp(
+        dtype=None, is_self_adjoint=True, is_positive_definite=True)
+    self.assertEqual("OK", custom_linop.cholesky())
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterCholesky(CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterCholesky(CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterCholesky(CustomLinOp)(lambda a: None)
+
+  def testExactCholeskyRegistrationsAllMatch(self):
+    for (k, v) in _CHOLESKY_DECOMPS.items():
+      self.assertEqual(v, _registered_cholesky(k[0]))
+
+
+class MatmulTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Matmul to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterMatmul(CustomLinOp, CustomLinOp)
+    def _matmul(a, b):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    custom_linop = CustomLinOp(
+        dtype=None, is_self_adjoint=True, is_positive_definite=True)
+    self.assertEqual("OK", custom_linop.matmul(custom_linop))
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterMatmul(CustomLinOp, CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterMatmul(
+        CustomLinOp, CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterMatmul(
+          CustomLinOp, CustomLinOp)(lambda a: None)
+
+  def testExactMatmulRegistrationsAllMatch(self):
+    for (k, v) in _MATMUL.items():
+      self.assertEqual(v, _registered_matmul(k[0], k[1]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
index 3ede2aceaa51c2795029ba13b763fed3e2ddc441..f0cc5d709f9bfec2e3dcfadecc8f949bb6ce6e6d 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
@@ -20,16 +20,15 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_block_diag as block_diag
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular as lower_triangular
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
-random_seed.set_random_seed(23)
 rng = np.random.RandomState(0)
 
 
@@ -80,7 +79,9 @@ class SquareLinearOperatorBlockDiagTest(
         build_info((2, 1, 5, 5), blocks=[(2, 1, 2, 2), (1, 3, 3)]),
     ]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
     expected_blocks = (
         build_info.__dict__["blocks"] if "blocks" in build_info.__dict__
@@ -100,7 +101,11 @@ class SquareLinearOperatorBlockDiagTest(
 
     operator = block_diag.LinearOperatorBlockDiag(
         [linalg.LinearOperatorFullMatrix(
-            l, is_square=True) for l in lin_op_matrices])
+            l,
+            is_square=True,
+            is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+            is_positive_definite=True if ensure_self_adjoint_and_pd else None)
+         for l in lin_op_matrices])
 
     # Should be auto-set.
     self.assertTrue(operator.is_square)
@@ -131,6 +136,40 @@ class SquareLinearOperatorBlockDiagTest(
     self.assertTrue(operator.is_non_singular)
     self.assertFalse(operator.is_self_adjoint)
 
+  def test_block_diag_cholesky_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+        ],
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    cholesky_factor = operator.cholesky()
+    self.assertTrue(isinstance(
+        cholesky_factor,
+        block_diag.LinearOperatorBlockDiag))
+    self.assertEqual(2, len(cholesky_factor.operators))
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[0],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[1],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+
   def test_is_non_singular_auto_set(self):
     # Matrix with two positive eigenvalues, 11 and 8.
     # The matrix values do not effect auto-setting of the flags.
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
index f1e151ebd862ffdbb0a266060dfc6ae7d5a24ef2..6366083ac5b1601c0e71a13a310c6761015bcc45 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
@@ -21,12 +21,14 @@ import contextlib
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.ops.linalg import linalg
 from tensorflow.python.ops.linalg import linear_operator_circulant
 from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.platform import test
 
 rng = np.random.RandomState(0)
@@ -75,8 +77,8 @@ class LinearOperatorCirculantBaseTest(object):
       x = np.zeros([domain_dimension])
       # x is a basis vector.
       x[m] = 1.0
-      fft_x = math_ops.fft(x.astype(np.complex64))
-      h_convolve_x = math_ops.ifft(spectrum * fft_x)
+      fft_x = fft_ops.fft(x.astype(np.complex64))
+      h_convolve_x = fft_ops.ifft(spectrum * fft_x)
       matrix_rows.append(h_convolve_x)
     matrix = array_ops.stack(matrix_rows, axis=-1)
     return math_ops.cast(matrix, dtype)
@@ -97,7 +99,9 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
     # real, the matrix will not be real.
     return [dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = build_info.shape
     # For this test class, we are creating real spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
@@ -105,6 +109,8 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
     # spectrum is bounded away from zero.
     spectrum = linear_operator_test_util.random_sign_uniform(
         shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    if ensure_self_adjoint_and_pd:
+      spectrum = math_ops.abs(spectrum)
     # If dtype is complex, cast spectrum to complex.  The imaginary part will be
     # zero, so the operator will still be self-adjoint.
     spectrum = math_ops.cast(spectrum, dtype)
@@ -115,12 +121,16 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
       lin_op_spectrum = array_ops.placeholder_with_default(spectrum, shape=None)
 
     operator = linalg.LinearOperatorCirculant(
-        lin_op_spectrum, is_self_adjoint=True, input_output_dtype=dtype)
+        lin_op_spectrum,
+        is_self_adjoint=True,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None,
+        input_output_dtype=dtype)
 
     mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
     with self.cached_session():
       spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
@@ -129,7 +139,8 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
-      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+      np.testing.assert_allclose(
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3)
 
 
 class LinearOperatorCirculantTestHermitianSpectrum(
@@ -146,7 +157,9 @@ class LinearOperatorCirculantTestHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = build_info.shape
     # For this test class, we are creating Hermitian spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
@@ -160,14 +173,14 @@ class LinearOperatorCirculantTestHermitianSpectrum(
     #  = IFFT[EvenPartOf[pre_spectrum]]
     # is the IFFT of something that is also bounded away from zero.
     # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
-    pre_h = math_ops.ifft(pre_spectrum_c)
+    pre_h = fft_ops.ifft(pre_spectrum_c)
 
     # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
     # So we will make spectrum = FFT[h], for real valued h.
     h = math_ops.real(pre_h)
     h_c = _to_complex(h)
 
-    spectrum = math_ops.fft(h_c)
+    spectrum = fft_ops.fft(h_c)
 
     lin_op_spectrum = spectrum
 
@@ -175,12 +188,17 @@ class LinearOperatorCirculantTestHermitianSpectrum(
       lin_op_spectrum = array_ops.placeholder_with_default(spectrum, shape=None)
 
     operator = linalg.LinearOperatorCirculant(
-        lin_op_spectrum, input_output_dtype=dtype)
+        lin_op_spectrum,
+        input_output_dtype=dtype,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+    )
 
     mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
     with self.cached_session():
       spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
@@ -189,7 +207,8 @@ class LinearOperatorCirculantTestHermitianSpectrum(
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
-      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+      np.testing.assert_allclose(
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3)
 
 
 class LinearOperatorCirculantTestNonHermitianSpectrum(
@@ -205,7 +224,16 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  # Skip Cholesky since we are explicitly testing non-hermitian
+  # spectra.
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    del ensure_self_adjoint_and_pd
     shape = build_info.shape
     # Will be well conditioned enough to get accurate solves.
     spectrum = linear_operator_test_util.random_sign_uniform(
@@ -226,6 +254,7 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
     with self.cached_session():
       spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
@@ -234,8 +263,10 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
-      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+      np.testing.assert_allclose(
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3)
 
+  @test_util.run_deprecated_v1
   def test_simple_positive_real_spectrum_gives_self_adjoint_pos_def_oper(self):
     with self.cached_session() as sess:
       spectrum = math_ops.cast([6., 4, 2], dtypes.complex64)
@@ -248,10 +279,11 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       operator.assert_positive_definite().run()  # Should not fail
       operator.assert_self_adjoint().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_defining_operator_using_real_convolution_kernel(self):
     with self.cached_session():
       convolution_kernel = [1., 2., 1.]
-      spectrum = math_ops.fft(
+      spectrum = fft_ops.fft(
           math_ops.cast(convolution_kernel, dtypes.complex64))
 
       # spectrum is shape [3] ==> operator is shape [3, 3]
@@ -269,15 +301,16 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       # Make spectrum the FFT of a real convolution kernel h.  This ensures that
       # spectrum is Hermitian.
       h = linear_operator_test_util.random_normal(shape=(3, 4))
-      spectrum = math_ops.fft(math_ops.cast(h, dtypes.complex64))
+      spectrum = fft_ops.fft(math_ops.cast(h, dtypes.complex64))
       operator = linalg.LinearOperatorCirculant(
           spectrum, input_output_dtype=dtypes.complex64)
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
       np.testing.assert_allclose(
-          0, imag_matrix.eval(), rtol=0, atol=eps * 3 * 4)
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3 * 4)
 
+  @test_util.run_deprecated_v1
   def test_convolution_kernel_same_as_first_row_of_to_dense(self):
     spectrum = [[3., 2., 1.], [2., 1.5, 1.]]
     with self.cached_session():
@@ -287,8 +320,9 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
 
       self.assertAllEqual((2, 3), h.get_shape())
       self.assertAllEqual((2, 3, 3), c.get_shape())
-      self.assertAllClose(h.eval(), c.eval()[:, :, 0])
+      self.assertAllClose(h.eval(), self.evaluate(c)[:, :, 0])
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_fails_for_singular_operator(self):
     spectrum = math_ops.cast([0, 4, 2j + 2], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant(spectrum)
@@ -296,12 +330,14 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       with self.assertRaisesOpError("Singular operator"):
         operator.assert_non_singular().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_does_not_fail_for_non_singular_operator(self):
     spectrum = math_ops.cast([-3j, 4, 2j + 2], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant(spectrum)
     with self.cached_session():
       operator.assert_non_singular().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_fails_for_non_positive_definite(self):
     spectrum = math_ops.cast([6., 4, 2j], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant(spectrum)
@@ -309,6 +345,7 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       with self.assertRaisesOpError("Not positive definite"):
         operator.assert_positive_definite().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_does_not_fail_when_pos_def(self):
     spectrum = math_ops.cast([6., 4, 2j + 2], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant(spectrum)
@@ -397,8 +434,8 @@ class LinearOperatorCirculant2DBaseTest(object):
         x = np.zeros(block_shape)
         # x is a basis vector.
         x[n0, n1] = 1.0
-        fft_x = math_ops.fft2d(x.astype(np.complex64))
-        h_convolve_x = math_ops.ifft2d(spectrum * fft_x)
+        fft_x = fft_ops.fft2d(x.astype(np.complex64))
+        h_convolve_x = fft_ops.ifft2d(spectrum * fft_x)
         # We want the flat version of the action of the operator on a basis
         # vector, not the block version.
         h_convolve_x = array_ops.reshape(h_convolve_x, shape[:-1])
@@ -421,7 +458,9 @@ class LinearOperatorCirculant2DTestHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = build_info.shape
     # For this test class, we are creating Hermitian spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
@@ -435,14 +474,14 @@ class LinearOperatorCirculant2DTestHermitianSpectrum(
     #  = IFFT[EvenPartOf[pre_spectrum]]
     # is the IFFT of something that is also bounded away from zero.
     # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
-    pre_h = math_ops.ifft2d(pre_spectrum_c)
+    pre_h = fft_ops.ifft2d(pre_spectrum_c)
 
     # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
     # So we will make spectrum = FFT[h], for real valued h.
     h = math_ops.real(pre_h)
     h_c = _to_complex(h)
 
-    spectrum = math_ops.fft2d(h_c)
+    spectrum = fft_ops.fft2d(h_c)
 
     lin_op_spectrum = spectrum
 
@@ -450,7 +489,10 @@ class LinearOperatorCirculant2DTestHermitianSpectrum(
       lin_op_spectrum = array_ops.placeholder_with_default(spectrum, shape=None)
 
     operator = linalg.LinearOperatorCirculant2D(
-        lin_op_spectrum, input_output_dtype=dtype)
+        lin_op_spectrum,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        input_output_dtype=dtype)
 
     mat = self._spectrum_to_circulant_2d(spectrum, shape, dtype=dtype)
 
@@ -470,7 +512,14 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    del ensure_self_adjoint_and_pd
     shape = build_info.shape
     # Will be well conditioned enough to get accurate solves.
     spectrum = linear_operator_test_util.random_sign_uniform(
@@ -491,6 +540,7 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_real_hermitian_spectrum_gives_real_symmetric_operator(self):
     with self.cached_session() as sess:
       # This is a real and hermitian spectrum.
@@ -508,6 +558,7 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
       np.testing.assert_allclose(0, imag_matrix, atol=1e-6)
       self.assertAllClose(matrix, matrix_transpose, atol=0)
 
+  @test_util.run_deprecated_v1
   def test_real_spectrum_gives_self_adjoint_operator(self):
     with self.cached_session() as sess:
       # This is a real and hermitian spectrum.
@@ -519,9 +570,10 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
       self.assertEqual(matrix_tensor.dtype,
                        linear_operator_circulant._DTYPE_COMPLEX)
       matrix_h = linalg.adjoint(matrix_tensor)
-      matrix, matrix_h = sess.run([matrix_tensor, matrix_h])
+      matrix, matrix_h = self.evaluate([matrix_tensor, matrix_h])
       self.assertAllClose(matrix, matrix_h, atol=0)
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_fails_for_singular_operator(self):
     spectrum = math_ops.cast([[0, 4], [2j + 2, 3.]], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant2D(spectrum)
@@ -529,12 +581,14 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
       with self.assertRaisesOpError("Singular operator"):
         operator.assert_non_singular().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_does_not_fail_for_non_singular_operator(self):
     spectrum = math_ops.cast([[-3j, 4], [2j + 2, 3.]], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant2D(spectrum)
     with self.cached_session():
       operator.assert_non_singular().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_fails_for_non_positive_definite(self):
     spectrum = math_ops.cast([[6., 4], [2j, 3.]], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant2D(spectrum)
@@ -542,6 +596,7 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
       with self.assertRaisesOpError("Not positive definite"):
         operator.assert_positive_definite().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_does_not_fail_when_pos_def(self):
     spectrum = math_ops.cast([[6., 4], [2j + 2, 3.]], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant2D(spectrum)
@@ -580,6 +635,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       with spectral_ops_test_util.fft_kernel_label_map():
         yield sess
 
+  @test_util.run_deprecated_v1
   def test_real_spectrum_gives_self_adjoint_operator(self):
     with self.cached_session() as sess:
       # This is a real and hermitian spectrum.
@@ -593,16 +649,17 @@ class LinearOperatorCirculant3DTest(test.TestCase):
                        linear_operator_circulant._DTYPE_COMPLEX)
       matrix_h = linalg.adjoint(matrix_tensor)
 
-      matrix, matrix_h = sess.run([matrix_tensor, matrix_h])
+      matrix, matrix_h = self.evaluate([matrix_tensor, matrix_h])
       self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape)
       self.assertAllClose(matrix, matrix_h)
 
+  @test_util.run_deprecated_v1
   def test_defining_operator_using_real_convolution_kernel(self):
     with self.cached_session():
       convolution_kernel = linear_operator_test_util.random_normal(
           shape=(2, 2, 3, 5), dtype=dtypes.float32)
       # Convolution kernel is real ==> spectrum is Hermitian.
-      spectrum = math_ops.fft3d(
+      spectrum = fft_ops.fft3d(
           math_ops.cast(convolution_kernel, dtypes.complex64))
 
       # spectrum is Hermitian ==> operator is real.
@@ -615,6 +672,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape)
       np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6)
 
+  @test_util.run_deprecated_v1
   def test_defining_spd_operator_by_taking_real_part(self):
     with self.cached_session() as sess:
       # S is real and positive.
@@ -634,7 +692,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       #         =      H1  +      H2
       # where H1 is real since it is Hermitian,
       # and H2 is imaginary since it is anti-Hermitian.
-      ifft_s = math_ops.ifft3d(math_ops.cast(s, dtypes.complex64))
+      ifft_s = fft_ops.ifft3d(math_ops.cast(s, dtypes.complex64))
 
       # Throw away H2, keep H1.
       real_ifft_s = math_ops.real(ifft_s)
@@ -642,7 +700,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       # This is the perfect spectrum!
       # spectrum = DFT[H1]
       #          = S1,
-      fft_real_ifft_s = math_ops.fft3d(
+      fft_real_ifft_s = fft_ops.fft3d(
           math_ops.cast(real_ifft_s, dtypes.complex64))
 
       # S1 is Hermitian ==> operator is real.
@@ -665,7 +723,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       # S2 is anti-Hermitian ==> operator is imaginary.
       # S2 is real ==> operator is self-adjoint.
       imag_ifft_s = math_ops.imag(ifft_s)
-      fft_imag_ifft_s = math_ops.fft3d(
+      fft_imag_ifft_s = fft_ops.fft3d(
           1j * math_ops.cast(imag_ifft_s, dtypes.complex64))
       operator_imag = linalg.LinearOperatorCirculant3D(fft_imag_ifft_s)
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
index 99497914f2a6a86272165c591b087380a8072b1b..214b73aa2f34d436e3430e4e7489c90adb6d52f9 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
@@ -21,7 +21,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
@@ -29,7 +29,6 @@ from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
-random_seed.set_random_seed(23)
 rng = np.random.RandomState(0)
 
 
@@ -44,8 +43,12 @@ class SquareLinearOperatorCompositionTest(
     self._rtol[dtypes.float32] = 1e-4
     self._rtol[dtypes.complex64] = 1e-4
 
+  @property
+  def _tests_to_skip(self):
+    # Cholesky not implemented.
+    return ["cholesky"]
+
   def _operator_and_matrix(self, build_info, dtype, use_placeholder):
-    sess = ops.get_default_session()
     shape = list(build_info.shape)
 
     # Either 1 or 2 matrices, depending.
@@ -177,6 +180,7 @@ class NonSquareLinearOperatorCompositionTest(
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_static_shapes(self):
     operators = [
         linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 4)),
@@ -185,6 +189,7 @@ class NonSquareLinearOperatorCompositionTest(
     operator = linalg.LinearOperatorComposition(operators)
     self.assertAllEqual((2, 3, 5), operator.shape)
 
+  @test_util.run_deprecated_v1
   def test_shape_tensors_when_statically_available(self):
     operators = [
         linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 4)),
@@ -194,6 +199,7 @@ class NonSquareLinearOperatorCompositionTest(
     with self.cached_session():
       self.assertAllEqual((2, 3, 5), operator.shape_tensor().eval())
 
+  @test_util.run_deprecated_v1
   def test_shape_tensors_when_only_dynamically_available(self):
     mat_1 = rng.rand(1, 2, 3, 4)
     mat_2 = rng.rand(1, 2, 4, 5)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 52861ae84a88ca08ef384868d77d05541f66bf43..dcbc0dd7c97184df150fc7094a28441fcfaa1257 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -27,24 +27,32 @@ from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
-random_seed.set_random_seed(23)
 
 
 class LinearOperatorDiagTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
     diag = linear_operator_test_util.random_sign_uniform(
         shape[:-1], minval=1., maxval=2., dtype=dtype)
 
+    if ensure_self_adjoint_and_pd:
+      # Abs on complex64 will result in a float32, so we cast back up.
+      diag = math_ops.cast(math_ops.abs(diag), dtype=dtype)
+
     lin_op_diag = diag
 
     if use_placeholder:
       lin_op_diag = array_ops.placeholder_with_default(diag, shape=None)
 
-    operator = linalg.LinearOperatorDiag(lin_op_diag)
+    operator = linalg.LinearOperatorDiag(
+        lin_op_diag,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
 
     matrix = array_ops.matrix_diag(diag)
 
@@ -73,6 +81,7 @@ class LinearOperatorDiagTest(
       with self.assertRaisesOpError("non-positive real.*not positive definite"):
         operator.assert_positive_definite().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_does_not_raise_if_pd_and_complex(self):
     with self.cached_session():
       x = [1., 2.]
@@ -89,6 +98,7 @@ class LinearOperatorDiagTest(
       with self.assertRaisesOpError("Singular operator"):
         operator.assert_non_singular().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_does_not_raise_for_complex_nonsingular(self):
     with self.cached_session():
       x = [1., 0.]
@@ -106,6 +116,7 @@ class LinearOperatorDiagTest(
       with self.assertRaisesOpError("imaginary.*not self-adjoint"):
         operator.assert_self_adjoint().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_self_adjoint_does_not_raise_for_diag_with_zero_imag(self):
     with self.cached_session():
       x = [1., 0.]
@@ -140,12 +151,52 @@ class LinearOperatorDiagTest(
       operator_matmul = operator.matmul(x)
       mat_matmul = math_ops.matmul(mat, x)
       self.assertAllEqual(operator_matmul.get_shape(), mat_matmul.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, mat_matmul]))
+      self.assertAllClose(*self.evaluate([operator_matmul, mat_matmul]))
 
       operator_solve = operator.solve(x)
       mat_solve = linalg_ops.matrix_solve(mat, x)
       self.assertAllEqual(operator_solve.get_shape(), mat_solve.get_shape())
-      self.assertAllClose(*sess.run([operator_solve, mat_solve]))
+      self.assertAllClose(*self.evaluate([operator_solve, mat_solve]))
+
+  def test_diag_matmul(self):
+    operator1 = linalg_lib.LinearOperatorDiag([2., 3.])
+    operator2 = linalg_lib.LinearOperatorDiag([1., 2.])
+    operator3 = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2, multiplier=3.)
+    operator_matmul = operator1.matmul(operator2)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([2., 6.], self.evaluate(operator_matmul.diag))
+
+    operator_matmul = operator2.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([2., 6.], self.evaluate(operator_matmul.diag))
+
+    operator_matmul = operator1.matmul(operator3)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
+
+    operator_matmul = operator3.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
+
+  def test_diag_cholesky_type(self):
+    diag = [1., 3., 5., 8.]
+    operator = linalg.LinearOperatorDiag(
+        diag,
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    self.assertTrue(isinstance(
+        operator.cholesky(),
+        linalg.LinearOperatorDiag))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
index 8373b5263f324df770a600222d3cbd7c8d081fc9..aff0b1ae14ce5bfb62ba9984f60cf30f9b553ea7 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
@@ -28,14 +28,15 @@ from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
-random_seed.set_random_seed(23)
 
 
 class SquareLinearOperatorFullMatrixTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
 
     matrix = linear_operator_test_util.random_positive_definite_matrix(
@@ -46,7 +47,12 @@ class SquareLinearOperatorFullMatrixTest(
     if use_placeholder:
       lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
 
-    operator = linalg.LinearOperatorFullMatrix(lin_op_matrix, is_square=True)
+    # Set the hints to none to test non-symmetric PD code paths.
+    operator = linalg.LinearOperatorFullMatrix(
+        lin_op_matrix,
+        is_square=True,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
 
     return operator, matrix
 
@@ -64,6 +70,7 @@ class SquareLinearOperatorFullMatrixTest(
     # Auto-detected.
     self.assertTrue(operator.is_square)
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_raises_if_cond_too_big_but_finite(self):
     with self.cached_session():
       tril = linear_operator_test_util.random_tril_matrix(
@@ -125,7 +132,13 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.float64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+
+    # Matrix is always symmetric and positive definite in this class.
+    del ensure_self_adjoint_and_pd
+
     shape = list(build_info.shape)
 
     matrix = linear_operator_test_util.random_positive_definite_matrix(
@@ -136,7 +149,11 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
     if use_placeholder:
       lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
 
-    operator = linalg.LinearOperatorFullMatrix(lin_op_matrix, is_square=True)
+    operator = linalg.LinearOperatorFullMatrix(
+        lin_op_matrix,
+        is_square=True,
+        is_self_adjoint=True,
+        is_positive_definite=True)
 
     return operator, matrix
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index 0c3c6b390fa628759fdc6aaa9ab8b97b8856087c..2da5e712d77b88ca6bb20a5f0920335f00c7b594 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -20,16 +20,16 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.platform import test
 
 
-random_seed.set_random_seed(23)
 rng = np.random.RandomState(2016)
 
 
@@ -43,7 +43,12 @@ class LinearOperatorIdentityTest(
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    # Identity matrix is already Hermitian Positive Definite.
+    del ensure_self_adjoint_and_pd
+
     shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
 
@@ -56,16 +61,19 @@ class LinearOperatorIdentityTest(
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
       operator.assert_positive_definite().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
       operator.assert_non_singular().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_assert_self_adjoint(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
@@ -79,7 +87,7 @@ class LinearOperatorIdentityTest(
           num_rows=2, dtype=dtypes.float16)
       x = rng.randn(2, 3).astype(np.float16)
       y = operator.matmul(x)
-      self.assertAllClose(x, y.eval())
+      self.assertAllClose(x, self.evaluate(y))
 
   def test_non_scalar_num_rows_raises_static(self):
     with self.assertRaisesRegexp(ValueError, "must be a 0-D Tensor"):
@@ -105,6 +113,7 @@ class LinearOperatorIdentityTest(
     with self.assertRaisesRegexp(ValueError, "must be non-negative"):
       linalg_lib.LinearOperatorIdentity(num_rows=2, batch_shape=[-2])
 
+  @test_util.run_deprecated_v1
   def test_non_scalar_num_rows_raises_dynamic(self):
     with self.cached_session():
       num_rows = array_ops.placeholder(dtypes.int32)
@@ -113,6 +122,7 @@ class LinearOperatorIdentityTest(
       with self.assertRaisesOpError("must be a 0-D Tensor"):
         operator.to_dense().eval(feed_dict={num_rows: [2]})
 
+  @test_util.run_deprecated_v1
   def test_negative_num_rows_raises_dynamic(self):
     with self.cached_session():
       num_rows = array_ops.placeholder(dtypes.int32)
@@ -121,6 +131,7 @@ class LinearOperatorIdentityTest(
       with self.assertRaisesOpError("must be non-negative"):
         operator.to_dense().eval(feed_dict={num_rows: -2})
 
+  @test_util.run_deprecated_v1
   def test_non_1d_batch_shape_raises_dynamic(self):
     with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
@@ -129,6 +140,7 @@ class LinearOperatorIdentityTest(
       with self.assertRaisesOpError("must be a 1-D"):
         operator.to_dense().eval(feed_dict={batch_shape: 2})
 
+  @test_util.run_deprecated_v1
   def test_negative_batch_shape_raises_dynamic(self):
     with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
@@ -143,6 +155,7 @@ class LinearOperatorIdentityTest(
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
       operator.matmul(x)
 
+  @test_util.run_deprecated_v1
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
     x = array_ops.placeholder(dtypes.float32)
@@ -166,8 +179,9 @@ class LinearOperatorIdentityTest(
       expected = x
 
       self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, expected]))
+      self.assertAllClose(*self.evaluate([operator_matmul, expected]))
 
+  @test_util.run_deprecated_v1
   def test_default_batch_shape_broadcasts_with_everything_dynamic(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
@@ -203,8 +217,9 @@ class LinearOperatorIdentityTest(
 
       operator_matmul = operator.matmul(x)
       self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, expected]))
+      self.assertAllClose(*self.evaluate([operator_matmul, expected]))
 
+  @test_util.run_deprecated_v1
   def test_broadcast_matmul_dynamic_shapes(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
@@ -244,6 +259,16 @@ class LinearOperatorIdentityTest(
           is_non_singular=None,
       )
 
+  def test_identity_cholesky_type(self):
+    operator = linalg_lib.LinearOperatorIdentity(
+        num_rows=2,
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    self.assertTrue(isinstance(
+        operator.cholesky(),
+        linalg_lib.LinearOperatorIdentity))
+
 
 class LinearOperatorScaledIdentityTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
@@ -255,7 +280,10 @@ class LinearOperatorScaledIdentityTest(
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+
     shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
 
@@ -268,6 +296,9 @@ class LinearOperatorScaledIdentityTest(
     multiplier = linear_operator_test_util.random_sign_uniform(
         shape=batch_shape, minval=1., maxval=2., dtype=dtype)
 
+    if ensure_self_adjoint_and_pd:
+      # Abs on complex64 will result in a float32, so we cast back up.
+      multiplier = math_ops.cast(math_ops.abs(multiplier), dtype=dtype)
 
     # Nothing to feed since LinearOperatorScaledIdentity takes no Tensor args.
     lin_op_multiplier = multiplier
@@ -277,7 +308,10 @@ class LinearOperatorScaledIdentityTest(
           multiplier, shape=None)
 
     operator = linalg_lib.LinearOperatorScaledIdentity(
-        num_rows, lin_op_multiplier)
+        num_rows,
+        lin_op_multiplier,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
 
     multiplier_matrix = array_ops.expand_dims(
         array_ops.expand_dims(multiplier, -1), -1)
@@ -286,6 +320,7 @@ class LinearOperatorScaledIdentityTest(
 
     return operator, matrix
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_does_not_raise_when_positive(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
@@ -299,6 +334,7 @@ class LinearOperatorScaledIdentityTest(
       with self.assertRaisesOpError("not positive definite"):
         operator.assert_positive_definite().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_does_not_raise_when_non_singular(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
@@ -312,6 +348,7 @@ class LinearOperatorScaledIdentityTest(
       with self.assertRaisesOpError("was singular"):
         operator.assert_non_singular().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_self_adjoint_does_not_raise_when_self_adjoint(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
@@ -334,7 +371,7 @@ class LinearOperatorScaledIdentityTest(
           num_rows=2, multiplier=multiplier)
       x = rng.randn(2, 3).astype(np.float16)
       y = operator.matmul(x)
-      self.assertAllClose(multiplier[..., None, None] * x, y.eval())
+      self.assertAllClose(multiplier[..., None, None] * x, self.evaluate(y))
 
   def test_non_scalar_num_rows_raises_static(self):
     # Many "test_...num_rows" tests are performed in LinearOperatorIdentity.
@@ -349,6 +386,7 @@ class LinearOperatorScaledIdentityTest(
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
       operator.matmul(x)
 
+  @test_util.run_deprecated_v1
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
     x = array_ops.placeholder(dtypes.float32)
@@ -380,13 +418,13 @@ class LinearOperatorScaledIdentityTest(
       expected = x * 2.2 + zeros
       operator_matmul = operator.matmul(x)
       self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, expected]))
+      self.assertAllClose(*self.evaluate([operator_matmul, expected]))
 
       # Test solve
       expected = x / 2.2 + zeros
       operator_solve = operator.solve(x)
       self.assertAllEqual(operator_solve.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_solve, expected]))
+      self.assertAllClose(*self.evaluate([operator_solve, expected]))
 
   def test_broadcast_matmul_and_solve_scalar_scale_multiplier(self):
     # These cannot be done in the automated (base test class) tests since they
@@ -406,13 +444,13 @@ class LinearOperatorScaledIdentityTest(
       expected = x * 2.2
       operator_matmul = operator.matmul(x)
       self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, expected]))
+      self.assertAllClose(*self.evaluate([operator_matmul, expected]))
 
       # Test solve
       expected = x / 2.2
       operator_solve = operator.solve(x)
       self.assertAllEqual(operator_solve.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_solve, expected]))
+      self.assertAllClose(*self.evaluate([operator_solve, expected]))
 
   def test_is_x_flags(self):
     operator = linalg_lib.LinearOperatorScaledIdentity(
@@ -422,6 +460,41 @@ class LinearOperatorScaledIdentityTest(
     self.assertTrue(operator.is_non_singular)
     self.assertTrue(operator.is_self_adjoint is None)
 
+  def test_identity_matmul(self):
+    operator1 = linalg_lib.LinearOperatorIdentity(num_rows=2)
+    operator2 = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2, multiplier=3.)
+    self.assertTrue(isinstance(
+        operator1.matmul(operator1),
+        linalg_lib.LinearOperatorIdentity))
+
+    self.assertTrue(isinstance(
+        operator1.matmul(operator1),
+        linalg_lib.LinearOperatorIdentity))
+
+    operator_matmul = operator1.matmul(operator2)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorScaledIdentity))
+    self.assertAllClose(3., self.evaluate(operator_matmul.multiplier))
+
+    operator_matmul = operator2.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorScaledIdentity))
+    self.assertAllClose(3., self.evaluate(operator_matmul.multiplier))
+
+  def test_scaled_identity_cholesky_type(self):
+    operator = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2,
+        multiplier=3.,
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    self.assertTrue(isinstance(
+        operator.cholesky(),
+        linalg_lib.LinearOperatorScaledIdentity))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9344c526ee8ce3bd68de6876626a86a9ad6ab0d8
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py
@@ -0,0 +1,130 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_inversion
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+
+LinearOperatorInversion = linear_operator_inversion.LinearOperatorInversion  # pylint: disable=invalid-name
+
+
+class LinearOperatorInversionTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  def setUp(self):
+    self._atol[dtypes.complex64] = 1e-5
+    self._rtol[dtypes.complex64] = 1e-5
+
+  def _operator_and_matrix(self,
+                           build_info,
+                           dtype,
+                           use_placeholder,
+                           ensure_self_adjoint_and_pd=False):
+    shape = list(build_info.shape)
+
+    if ensure_self_adjoint_and_pd:
+      matrix = linear_operator_test_util.random_positive_definite_matrix(
+          shape, dtype, force_well_conditioned=True)
+    else:
+      matrix = linear_operator_test_util.random_tril_matrix(
+          shape, dtype, force_well_conditioned=True, remove_upper=True)
+
+    lin_op_matrix = matrix
+
+    if use_placeholder:
+      lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
+
+    if ensure_self_adjoint_and_pd:
+      operator = LinearOperatorInversion(
+          linalg.LinearOperatorFullMatrix(
+              lin_op_matrix, is_positive_definite=True, is_self_adjoint=True))
+    else:
+      operator = LinearOperatorInversion(
+          linalg.LinearOperatorLowerTriangular(lin_op_matrix))
+
+    return operator, linalg.inv(matrix)
+
+  def test_base_operator_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    operator_inv = LinearOperatorInversion(operator)
+    self.assertTrue(operator_inv.is_positive_definite)
+    self.assertTrue(operator_inv.is_non_singular)
+    self.assertFalse(operator_inv.is_self_adjoint)
+
+  def test_supplied_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    operator_inv = LinearOperatorInversion(
+        operator,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    self.assertTrue(operator_inv.is_positive_definite)
+    self.assertTrue(operator_inv.is_non_singular)
+    self.assertFalse(operator_inv.is_self_adjoint)
+
+  def test_contradicting_hints_raise(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_positive_definite=False)
+    with self.assertRaisesRegexp(ValueError, "positive-definite"):
+      LinearOperatorInversion(operator, is_positive_definite=True)
+
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=False)
+    with self.assertRaisesRegexp(ValueError, "self-adjoint"):
+      LinearOperatorInversion(operator, is_self_adjoint=True)
+
+  def test_singular_raises(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 1.], [1., 1.]]
+
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_non_singular=False)
+    with self.assertRaisesRegexp(ValueError, "is_non_singular"):
+      LinearOperatorInversion(operator)
+
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    with self.assertRaisesRegexp(ValueError, "is_non_singular"):
+      LinearOperatorInversion(operator, is_non_singular=False)
+
+  def test_name(self):
+    matrix = [[11., 0.], [1., 8.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, name="my_operator", is_non_singular=True)
+
+    operator = LinearOperatorInversion(operator)
+
+    self.assertEqual("my_operator_inv", operator.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index 7e81c9c6c4f4a1db475a97294eb51a96478dfdf0..513b246803233f1117b48f1a3d413be42f15238a 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -21,16 +21,16 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_kronecker as kronecker
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular as lower_triangular
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
-random_seed.set_random_seed(23)
 rng = np.random.RandomState(0)
 
 
@@ -54,6 +54,7 @@ def _kronecker_dense(factors):
 
 class KroneckerDenseTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testKroneckerDenseMatrix(self):
     x = ops.convert_to_tensor([[2., 3.], [1., 2.]], dtype=dtypes.float32)
     y = ops.convert_to_tensor([[1., 2.], [5., -1.]], dtype=dtypes.float32)
@@ -71,8 +72,8 @@ class KroneckerDenseTest(test.TestCase):
         [5., 10., -1., -2.]], dtype=dtypes.float32)
 
     with self.cached_session():
-      self.assertAllClose(_kronecker_dense([x, y]).eval(), z.eval())
-      self.assertAllClose(_kronecker_dense([y, x]).eval(), w.eval())
+      self.assertAllClose(_kronecker_dense([x, y]).eval(), self.evaluate(z))
+      self.assertAllClose(_kronecker_dense([y, x]).eval(), self.evaluate(w))
 
 
 class SquareLinearOperatorKroneckerTest(
@@ -101,7 +102,12 @@ class SquareLinearOperatorKroneckerTest(
   def _tests_to_skip(self):
     return ["det", "solve", "solve_with_broadcast"]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    # Kronecker products constructed below will be from symmetric
+    # positive-definite matrices.
+    del ensure_self_adjoint_and_pd
     shape = list(build_info.shape)
     expected_factors = build_info.__dict__["factors"]
     matrices = [
@@ -118,7 +124,11 @@ class SquareLinearOperatorKroneckerTest(
 
     operator = kronecker.LinearOperatorKronecker(
         [linalg.LinearOperatorFullMatrix(
-            l, is_square=True) for l in lin_op_matrices])
+            l,
+            is_square=True,
+            is_self_adjoint=True,
+            is_positive_definite=True)
+         for l in lin_op_matrices])
 
     matrices = linear_operator_util.broadcast_matrix_batch_dims(matrices)
 
@@ -182,6 +192,40 @@ class SquareLinearOperatorKroneckerTest(
     with self.assertRaisesRegexp(ValueError, ">=1 operators"):
       kronecker.LinearOperatorKronecker([])
 
+  def test_kronecker_cholesky_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+        ],
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    cholesky_factor = operator.cholesky()
+    self.assertTrue(isinstance(
+        cholesky_factor,
+        kronecker.LinearOperatorKronecker))
+    self.assertEqual(2, len(cholesky_factor.operators))
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[0],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[1],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
index 61268607a415e68fb52d3e53fca0139701071ace..2920f3ae7ebc549ae960215445fc933bb30913dd 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
@@ -28,7 +27,6 @@ from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
-random_seed.set_random_seed(23)
 rng = np.random.RandomState(0)
 
 
@@ -71,7 +69,8 @@ class BaseLinearOperatorLowRankUpdatetest(object):
     return linear_operator_test_util.random_uniform(
         diag_shape, minval=1e-4, maxval=1., dtype=dtype)
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(self, build_info, dtype, use_placeholder,
+                           ensure_self_adjoint_and_pd=False):
     # Recall A = L + UDV^H
     shape = list(build_info.shape)
     diag_shape = shape[:-1]
@@ -95,7 +94,7 @@ class BaseLinearOperatorLowRankUpdatetest(object):
     lin_op_v = v
 
     # D
-    if self._is_diag_update_positive:
+    if self._is_diag_update_positive or ensure_self_adjoint_and_pd:
       diag_update = self._gen_positive_diag(dtype, diag_update_shape)
     else:
       diag_update = linear_operator_test_util.random_normal(
@@ -180,6 +179,10 @@ class LinearOperatorLowRankUpdatetestWithDiagCannotUseCholesky(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """A = L + UDU^H, D !> 0, L > 0 ==> A !> 0 and we cannot use a Cholesky."""
 
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
   _use_diag_update = True
   _is_diag_update_positive = False
   _use_v = False
@@ -219,6 +222,10 @@ class LinearOperatorLowRankUpdatetestNoDiagCannotUseCholesky(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """A = L + UV^H, L > 0 ==> A is not symmetric and we cannot use a Cholesky."""
 
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
   _use_diag_update = False
   _is_diag_update_positive = None
   _use_v = True
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
index eb4bff915ba0b7be0af3bed9cf0d39ed24ccc131..bd41f9ed9d335f6f7e77cb7a19c5db1e59482d48 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
@@ -17,20 +17,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.platform import test
 
 linalg = linalg_lib
-random_seed.set_random_seed(23)
 
 
 class LinearOperatorLowerTriangularTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
+  @property
+  def _tests_to_skip(self):
+    # Cholesky does not make sense for triangular matrices.
+    return ["cholesky"]
+
   def _operator_and_matrix(self, build_info, dtype, use_placeholder):
     shape = list(build_info.shape)
     # Upper triangle will be nonzero, but ignored.
@@ -73,6 +77,30 @@ class LinearOperatorLowerTriangularTest(
     with self.assertRaisesRegexp(ValueError, "at least 2 dimensions"):
       linalg.LinearOperatorLowerTriangular([1.])
 
+  def test_triangular_diag_matmul(self):
+    operator1 = linalg_lib.LinearOperatorLowerTriangular(
+        [[1., 0., 0.], [2., 1., 0.], [2., 3., 3.]])
+    operator2 = linalg_lib.LinearOperatorDiag([2., 2., 3.])
+    operator_matmul = operator1.matmul(operator2)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorLowerTriangular))
+    self.assertAllClose(
+        math_ops.matmul(
+            operator1.to_dense(),
+            operator2.to_dense()),
+        self.evaluate(operator_matmul.to_dense()))
+
+    operator_matmul = operator2.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorLowerTriangular))
+    self.assertAllClose(
+        math_ops.matmul(
+            operator2.to_dense(),
+            operator1.to_dense()),
+        self.evaluate(operator_matmul.to_dense()))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
index 819347343b1d22257e9f3579caced56128596723..18e13a76a097f72887cacc5d3de40b8d6babcb52 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -107,6 +108,7 @@ class LinearOperatorTest(test.TestCase):
     self.assertAllEqual(4, operator.domain_dimension)
     self.assertAllEqual(3, operator.range_dimension)
 
+  @test_util.run_deprecated_v1
   def test_all_shape_methods_defined_by_the_one_method_shape(self):
     with self.cached_session():
       shape = (1, 2, 3, 4)
@@ -134,8 +136,9 @@ class LinearOperatorTest(test.TestCase):
     with self.cached_session():
       operator_dense = operator.to_dense()
       self.assertAllEqual((2, 3, 4), operator_dense.get_shape())
-      self.assertAllClose(matrix, operator_dense.eval())
+      self.assertAllClose(matrix, self.evaluate(operator_dense))
 
+  @test_util.run_deprecated_v1
   def test_generic_to_dense_method_non_square_matrix_tensor(self):
     matrix = rng.randn(2, 3, 4)
     matrix_ph = array_ops.placeholder(dtypes.float64)
@@ -152,7 +155,7 @@ class LinearOperatorTest(test.TestCase):
     with self.cached_session():
       y = operator.matvec(x)
       self.assertAllEqual((2,), y.get_shape())
-      self.assertAllClose([1., 2.], y.eval())
+      self.assertAllClose([1., 2.], self.evaluate(y))
 
   def test_solvevec(self):
     matrix = [[1., 0], [0., 2.]]
@@ -161,7 +164,7 @@ class LinearOperatorTest(test.TestCase):
     with self.cached_session():
       x = operator.solvevec(y)
       self.assertAllEqual((2,), x.get_shape())
-      self.assertAllClose([1., 1 / 2.], x.eval())
+      self.assertAllClose([1., 1 / 2.], self.evaluate(x))
 
   def test_is_square_set_to_true_for_square_static_shapes(self):
     operator = LinearOperatorShape(shape=(2, 4, 4))
@@ -175,6 +178,7 @@ class LinearOperatorTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "but.*was square"):
       _ = LinearOperatorShape(shape=(2, 4, 4), is_square=False).is_square
 
+  @test_util.run_deprecated_v1
   def test_is_square_set_inconsistent_with_other_hints_raises(self):
     with self.assertRaisesRegexp(ValueError, "is always square"):
       matrix = array_ops.placeholder(dtypes.float32)
@@ -185,6 +189,7 @@ class LinearOperatorTest(test.TestCase):
       LinearOperatorMatmulSolve(
           matrix, is_positive_definite=True, is_square=False)
 
+  @test_util.run_deprecated_v1
   def test_non_square_operators_raise_on_determinant_and_solve(self):
     operator = LinearOperatorShape((2, 3))
     with self.assertRaisesRegexp(NotImplementedError, "not be square"):
@@ -199,6 +204,7 @@ class LinearOperatorTest(test.TestCase):
       LinearOperatorMatmulSolve(
           matrix, is_positive_definite=True, is_square=False)
 
+  @test_util.run_deprecated_v1
   def test_is_square_manual_set_works(self):
     matrix = array_ops.placeholder(dtypes.float32)
     # Default is None.
@@ -208,6 +214,80 @@ class LinearOperatorTest(test.TestCase):
     operator = LinearOperatorMatmulSolve(matrix, is_square=True)
     self.assertTrue(operator.is_square)
 
+  @test_util.run_v1_only("b/120545219")
+  def test_linear_operator_matmul_hints_closed(self):
+    matrix = array_ops.placeholder(dtypes.float32)
+    operator1 = LinearOperatorMatmulSolve(matrix)
+
+    operator_matmul = operator1.matmul(operator1)
+
+    self.assertEqual(None, operator_matmul.is_square)
+    self.assertEqual(None, operator_matmul.is_non_singular)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+    operator2 = LinearOperatorMatmulSolve(
+        matrix,
+        is_non_singular=True,
+        is_self_adjoint=True,
+        is_positive_definite=True,
+        is_square=True,
+    )
+
+    operator_matmul = operator2.matmul(operator2)
+
+    self.assertTrue(operator_matmul.is_square)
+    self.assertTrue(operator_matmul.is_non_singular)
+    self.assertTrue(operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+  @test_util.run_v1_only("b/120545219")
+  def test_linear_operator_matmul_hints_false(self):
+    matrix = array_ops.placeholder(dtypes.float32)
+    operator1 = LinearOperatorMatmulSolve(
+        matrix,
+        is_non_singular=False,
+        is_self_adjoint=False,
+        is_positive_definite=False,
+        is_square=True,
+    )
+
+    operator_matmul = operator1.matmul(operator1)
+
+    self.assertTrue(operator_matmul.is_square)
+    self.assertFalse(operator_matmul.is_non_singular)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+    operator2 = LinearOperatorMatmulSolve(
+        matrix,
+        is_non_singular=False,
+        is_self_adjoint=False,
+        is_positive_definite=False,
+        is_square=False,
+    )
+
+    operator_matmul = operator2.matmul(operator2)
+
+    self.assertEqual(None, operator_matmul.is_square)
+    self.assertEqual(None, operator_matmul.is_non_singular)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+  @test_util.run_v1_only("b/120545219")
+  def test_linear_operator_matmul_hint_infer_square(self):
+    matrix1 = array_ops.placeholder(shape=[2, 3], dtype=dtypes.float32)
+    matrix2 = array_ops.placeholder(shape=[3, 2], dtype=dtypes.float32)
+    matrix3 = array_ops.placeholder(shape=[3, 4], dtype=dtypes.float32)
+
+    operator1 = LinearOperatorMatmulSolve(matrix1, is_square=False)
+    operator2 = LinearOperatorMatmulSolve(matrix2, is_square=False)
+    operator3 = LinearOperatorMatmulSolve(matrix3, is_square=False)
+
+    self.assertTrue(operator1.matmul(operator2).is_square)
+    self.assertTrue(operator2.matmul(operator1).is_square)
+    self.assertFalse(operator1.matmul(operator3).is_square)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
index 86847d38c2a711422af20950b44ac666c7b26262..d1e6c37e35af8664454c20f60e712ed6ff7c6fe6 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
@@ -21,25 +21,26 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
 
-random_seed.set_random_seed(23)
 rng = np.random.RandomState(0)
 
 
 class AssertZeroImagPartTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_real_tensor_doesnt_raise(self):
     x = ops.convert_to_tensor([0., 2, 3])
     with self.cached_session():
       # Should not raise.
       linear_operator_util.assert_zero_imag_part(x, message="ABC123").run()
 
+  @test_util.run_deprecated_v1
   def test_complex_tensor_with_imag_zero_doesnt_raise(self):
     x = ops.convert_to_tensor([1., 0, 3])
     y = ops.convert_to_tensor([0., 0, 0])
@@ -59,6 +60,7 @@ class AssertZeroImagPartTest(test.TestCase):
 
 class AssertNoEntriesWithModulusZeroTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_nonzero_real_tensor_doesnt_raise(self):
     x = ops.convert_to_tensor([1., 2, 3])
     with self.cached_session():
@@ -66,6 +68,7 @@ class AssertNoEntriesWithModulusZeroTest(test.TestCase):
       linear_operator_util.assert_no_entries_with_modulus_zero(
           x, message="ABC123").run()
 
+  @test_util.run_deprecated_v1
   def test_nonzero_complex_tensor_doesnt_raise(self):
     x = ops.convert_to_tensor([1., 0, 3])
     y = ops.convert_to_tensor([1., 2, 0])
@@ -104,8 +107,9 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
     self.assertTrue(isinstance(tensor, ops.Tensor))
 
     with self.cached_session():
-      self.assertAllClose(arr, tensor.eval())
+      self.assertAllClose(arr, self.evaluate(tensor))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast(self):
     # x.batch_shape = [3, 1, 2]
     # y.batch_shape = [4, 1]
@@ -121,7 +125,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
     with self.cached_session() as sess:
       self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape())
       self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape())
-      x_bc_, y_bc_ = sess.run([x_bc, y_bc])
+      x_bc_, y_bc_ = self.evaluate([x_bc, y_bc])
       self.assertAllClose(x_bc_expected, x_bc_)
       self.assertAllClose(y_bc_expected, y_bc_)
 
@@ -140,10 +144,11 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
     with self.cached_session() as sess:
       self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape())
       self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape())
-      x_bc_, y_bc_ = sess.run([x_bc, y_bc])
+      x_bc_, y_bc_ = self.evaluate([x_bc, y_bc])
       self.assertAllClose(x_bc_expected, x_bc_)
       self.assertAllClose(y_bc_expected, y_bc_)
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_32bit(self):
     # x.batch_shape = [3, 1, 2]
     # y.batch_shape = [4, 1]
@@ -164,6 +169,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
       self.assertAllClose(x_bc_expected, x_bc_)
       self.assertAllClose(y_bc_expected, y_bc_)
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_32bit_second_arg_higher_rank(self):
     # x.batch_shape =    [1, 2]
     # y.batch_shape = [3, 4, 1]
@@ -197,6 +203,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
 
 class CholeskySolveWithBroadcastTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast(self):
     # batch_shape = [2]
     chol = rng.rand(3, 3)
@@ -207,8 +214,9 @@ class CholeskySolveWithBroadcastTest(test.TestCase):
       result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.cholesky_solve(chol_broadcast, rhs)
-      self.assertAllEqual(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2, 2]
     chol = rng.rand(2, 3, 3)
@@ -230,12 +238,13 @@ class CholeskySolveWithBroadcastTest(test.TestCase):
               chol_ph: chol,
               rhs_ph: rhs,
           })
-      self.assertAllEqual(expected, result)
+      self.assertAllClose(expected, result)
 
 
 class MatmulWithBroadcastTest(test.TestCase):
 
-  def test_static_dims_broadcast(self):
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_x_has_extra_dims(self):
     # batch_shape = [2]
     # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
     x = rng.rand(2, 1, 3)
@@ -246,8 +255,74 @@ class MatmulWithBroadcastTest(test.TestCase):
       result = linear_operator_util.matmul_with_broadcast(x, y)
       self.assertAllEqual((2, 1, 7), result.get_shape())
       expected = math_ops.matmul(x, y_broadcast)
-      self.assertAllEqual(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
+
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_y_has_extra_dims(self):
+    # Since the second arg has extra dims, and the domain dim of the first arg
+    # is larger than the number of linear equations, code will "flip" the extra
+    # dims of the first arg to the far right, making extra linear equations
+    # (then call the matrix function, then flip back).
+    # We have verified that this optimization indeed happens.  How? We stepped
+    # through with a debugger.
+    x = rng.rand(5, 7)
+    y = rng.rand(2, 3, 7, 5)
+    x_broadcast = x + np.zeros((2, 3, 5, 7))
+
+    with self.cached_session():
+      result = linear_operator_util.matmul_with_broadcast(x, y)
+      self.assertAllEqual((2, 3, 5, 5), result.get_shape())
+      expected = math_ops.matmul(x_broadcast, y)
+      self.assertAllClose(expected.eval(), self.evaluate(result))
+
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_y_has_extra_dims_transpose_a_and_b(self):
+    # Since the second arg has extra dims, and the domain dim of the first arg
+    # is larger than the number of linear equations, code will "flip" the extra
+    # dims of the first arg to the far right, making extra linear equations
+    # (then call the matrix function, then flip back).
+    # We have verified that this optimization indeed happens.  How? We stepped
+    # through with a debugger.
+    x = rng.rand(1, 7, 5)
+    y = rng.rand(2, 3, 1, 7)
+    x_broadcast = x + np.zeros((2, 3, 1, 1))
+
+    with self.cached_session():
+      result = linear_operator_util.matmul_with_broadcast(
+          x, y, transpose_a=True, transpose_b=True)
+      self.assertAllEqual((2, 3, 5, 1), result.get_shape())
+      expected = math_ops.matmul(
+          x_broadcast, y, transpose_a=True, transpose_b=True)
+      self.assertAllClose(expected.eval(), self.evaluate(result))
+
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_y_has_extra_dims_transpose_dynamic(self):
+    # Since the second arg has extra dims, and the domain dim of the first arg
+    # is larger than the number of linear equations, code will "flip" the extra
+    # dims of the first arg to the far right, making extra linear equations
+    # (then call the matrix function, then flip back).
+    # We have verified that this optimization indeed happens.  How? We stepped
+    # through with a debugger.
+    x = rng.rand(1, 7, 5)
+    y = rng.rand(2, 3, 1, 7)
+    x_broadcast = x + np.zeros((2, 3, 1, 1))
+
+    x_ph = array_ops.placeholder(dtypes.float64, [None, None, None])
+    y_ph = array_ops.placeholder(dtypes.float64, [None, None, None, None])
 
+    with self.cached_session():
+      result = linear_operator_util.matmul_with_broadcast(
+          x_ph, y_ph, transpose_a=True, transpose_b=True)
+      self.assertAllEqual(4, result.shape.ndims)
+      expected = math_ops.matmul(
+          x_broadcast, y, transpose_a=True, transpose_b=True)
+      self.assertAllClose(expected.eval(),
+                          result.eval(feed_dict={
+                              x_ph: x,
+                              y_ph: y
+                          }))
+
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2]
     # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
@@ -268,23 +343,93 @@ class MatmulWithBroadcastTest(test.TestCase):
               x_ph: x,
               y_ph: y
           })
-      self.assertAllEqual(expected, result)
+      self.assertAllClose(expected, result)
 
 
 class MatrixSolveWithBroadcastTest(test.TestCase):
 
-  def test_static_dims_broadcast(self):
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_matrix_has_extra_dims(self):
+    # batch_shape = [2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(3, 7)
+    rhs_broadcast = rhs + np.zeros((2, 1, 1))
+
+    with self.cached_session():
+      result = linear_operator_util.matrix_solve_with_broadcast(
+          matrix, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.matrix_solve(matrix, rhs_broadcast)
+      self.assertAllClose(expected.eval(), self.evaluate(result))
+
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_rhs_has_extra_dims(self):
+    # Since the second arg has extra dims, and the domain dim of the first arg
+    # is larger than the number of linear equations, code will "flip" the extra
+    # dims of the first arg to the far right, making extra linear equations
+    # (then call the matrix function, then flip back).
+    # We have verified that this optimization indeed happens.  How? We stepped
+    # through with a debugger.
     # batch_shape = [2]
     matrix = rng.rand(3, 3)
-    rhs = rng.rand(2, 3, 7)
+    rhs = rng.rand(2, 3, 2)
     matrix_broadcast = matrix + np.zeros((2, 1, 1))
 
     with self.cached_session():
       result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs)
-      self.assertAllEqual((2, 3, 7), result.get_shape())
+      self.assertAllEqual((2, 3, 2), result.get_shape())
       expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
-      self.assertAllEqual(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
+
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_rhs_has_extra_dims_dynamic(self):
+    # Since the second arg has extra dims, and the domain dim of the first arg
+    # is larger than the number of linear equations, code will "flip" the extra
+    # dims of the first arg to the far right, making extra linear equations
+    # (then call the matrix function, then flip back).
+    # We have verified that this optimization indeed happens.  How? We stepped
+    # through with a debugger.
+    # batch_shape = [2]
+    matrix = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 2)
+    matrix_broadcast = matrix + np.zeros((2, 1, 1))
 
+    matrix_ph = array_ops.placeholder(dtypes.float64, shape=[None, None])
+    rhs_ph = array_ops.placeholder(dtypes.float64, shape=[None, None, None])
+
+    with self.cached_session():
+      result = linear_operator_util.matrix_solve_with_broadcast(matrix_ph,
+                                                                rhs_ph)
+      self.assertAllEqual(3, result.shape.ndims)
+      expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
+      self.assertAllClose(
+          self.evaluate(expected),
+          result.eval(feed_dict={
+              matrix_ph: matrix,
+              rhs_ph: rhs
+          }))
+
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_rhs_has_extra_dims_and_adjoint(self):
+    # Since the second arg has extra dims, and the domain dim of the first arg
+    # is larger than the number of linear equations, code will "flip" the extra
+    # dims of the first arg to the far right, making extra linear equations
+    # (then call the matrix function, then flip back).
+    # We have verified that this optimization indeed happens.  How? We stepped
+    # through with a debugger.
+    # batch_shape = [2]
+    matrix = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 2)
+    matrix_broadcast = matrix + np.zeros((2, 1, 1))
+
+    with self.cached_session():
+      result = linear_operator_util.matrix_solve_with_broadcast(
+          matrix, rhs, adjoint=True)
+      self.assertAllEqual((2, 3, 2), result.get_shape())
+      expected = linalg_ops.matrix_solve(matrix_broadcast, rhs, adjoint=True)
+      self.assertAllClose(expected.eval(), self.evaluate(result))
+
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2, 2]
     matrix = rng.rand(2, 3, 3)
@@ -306,12 +451,13 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
               matrix_ph: matrix,
               rhs_ph: rhs,
           })
-      self.assertAllEqual(expected, result)
+      self.assertAllClose(expected, result)
 
 
 class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
 
-  def test_static_dims_broadcast(self):
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_matrix_has_extra_dims(self):
     # batch_shape = [2]
     matrix = rng.rand(2, 3, 3)
     rhs = rng.rand(3, 7)
@@ -322,8 +468,50 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
           matrix, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
-      self.assertAllEqual(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
+
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_rhs_has_extra_dims(self):
+    # Since the second arg has extra dims, and the domain dim of the first arg
+    # is larger than the number of linear equations, code will "flip" the extra
+    # dims of the first arg to the far right, making extra linear equations
+    # (then call the matrix function, then flip back).
+    # We have verified that this optimization indeed happens.  How? We stepped
+    # through with a debugger.
+    # batch_shape = [2]
+    matrix = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 2)
+    matrix_broadcast = matrix + np.zeros((2, 1, 1))
+
+    with self.cached_session():
+      result = linear_operator_util.matrix_triangular_solve_with_broadcast(
+          matrix, rhs)
+      self.assertAllEqual((2, 3, 2), result.get_shape())
+      expected = linalg_ops.matrix_triangular_solve(matrix_broadcast, rhs)
+      self.assertAllClose(expected.eval(), self.evaluate(result))
+
+  @test_util.run_deprecated_v1
+  def test_static_dims_broadcast_rhs_has_extra_dims_and_adjoint(self):
+    # Since the second arg has extra dims, and the domain dim of the first arg
+    # is larger than the number of linear equations, code will "flip" the extra
+    # dims of the first arg to the far right, making extra linear equations
+    # (then call the matrix function, then flip back).
+    # We have verified that this optimization indeed happens.  How? We stepped
+    # through with a debugger.
+    # batch_shape = [2]
+    matrix = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 2)
+    matrix_broadcast = matrix + np.zeros((2, 1, 1))
+
+    with self.cached_session():
+      result = linear_operator_util.matrix_triangular_solve_with_broadcast(
+          matrix, rhs, adjoint=True)
+      self.assertAllEqual((2, 3, 2), result.get_shape())
+      expected = linalg_ops.matrix_triangular_solve(
+          matrix_broadcast, rhs, adjoint=True)
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2]
     matrix = rng.rand(2, 3, 3)
@@ -344,7 +532,7 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
               matrix_ph: matrix,
               rhs_ph: rhs,
           })
-      self.assertAllEqual(expected, result)
+      self.assertAllClose(expected, result)
 
 
 class DomainDimensionStubOperator(object):
@@ -358,6 +546,7 @@ class DomainDimensionStubOperator(object):
 
 class AssertCompatibleMatrixDimensionsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_compatible_dimensions_do_not_raise(self):
     with self.cached_session():
       x = ops.convert_to_tensor(rng.rand(2, 3, 4))
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
index f0556304adc68c8ef849ced755d63700e0940c2a..eb0b8ef127749e9e5709861d14b143877790bffd 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -20,14 +20,13 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.platform import test
 
 
-random_seed.set_random_seed(23)
 rng = np.random.RandomState(2016)
 
 
@@ -37,7 +36,7 @@ class LinearOperatorZerosTest(
 
   @property
   def _tests_to_skip(self):
-    return ["log_abs_det", "solve", "solve_with_broadcast"]
+    return ["cholesky", "log_abs_det", "solve", "solve_with_broadcast"]
 
   @property
   def _operator_build_infos(self):
@@ -48,7 +47,10 @@ class LinearOperatorZerosTest(
         build_info((3, 4, 4)),
         build_info((2, 1, 4, 4))]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    del ensure_self_adjoint_and_pd
     del use_placeholder
     shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
@@ -72,6 +74,7 @@ class LinearOperatorZerosTest(
       operator = linalg_lib.LinearOperatorZeros(num_rows=2)
       operator.assert_non_singular()
 
+  @test_util.run_deprecated_v1
   def test_assert_self_adjoint(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorZeros(num_rows=2)
@@ -107,6 +110,7 @@ class LinearOperatorZerosTest(
     with self.assertRaisesRegexp(ValueError, "must be non-negative"):
       linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=[-2])
 
+  @test_util.run_deprecated_v1
   def test_non_scalar_num_rows_raises_dynamic(self):
     with self.cached_session():
       num_rows = array_ops.placeholder(dtypes.int32)
@@ -115,6 +119,7 @@ class LinearOperatorZerosTest(
       with self.assertRaisesOpError("must be a 0-D Tensor"):
         operator.to_dense().eval(feed_dict={num_rows: [2]})
 
+  @test_util.run_deprecated_v1
   def test_negative_num_rows_raises_dynamic(self):
     with self.cached_session():
       n = array_ops.placeholder(dtypes.int32)
@@ -128,6 +133,7 @@ class LinearOperatorZerosTest(
       with self.assertRaisesOpError("must be non-negative"):
         operator.to_dense().eval(feed_dict={n: -2})
 
+  @test_util.run_deprecated_v1
   def test_non_1d_batch_shape_raises_dynamic(self):
     with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
@@ -136,6 +142,7 @@ class LinearOperatorZerosTest(
       with self.assertRaisesOpError("must be a 1-D"):
         operator.to_dense().eval(feed_dict={batch_shape: 2})
 
+  @test_util.run_deprecated_v1
   def test_negative_batch_shape_raises_dynamic(self):
     with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
@@ -150,6 +157,7 @@ class LinearOperatorZerosTest(
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
       operator.matmul(x)
 
+  @test_util.run_deprecated_v1
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
     x = array_ops.placeholder(dtypes.float32)
@@ -168,6 +176,17 @@ class LinearOperatorZerosTest(
     self.assertFalse(operator.is_non_singular)
     self.assertTrue(operator.is_self_adjoint)
 
+  def test_zeros_matmul(self):
+    operator1 = linalg_lib.LinearOperatorIdentity(num_rows=2)
+    operator2 = linalg_lib.LinearOperatorZeros(num_rows=2)
+    self.assertTrue(isinstance(
+        operator1.matmul(operator2),
+        linalg_lib.LinearOperatorZeros))
+
+    self.assertTrue(isinstance(
+        operator2.matmul(operator1),
+        linalg_lib.LinearOperatorZeros))
+
 
 class LinearOperatorZerosNotSquareTest(
     linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
diff --git a/tensorflow/python/kernel_tests/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg_grad_test.py
index e52f303fe0958746c5e7caf5e357e13d9b7c74a1..ff84221611813cf37537b843087faa70ae1d3e8e 100644
--- a/tensorflow/python/kernel_tests/linalg_grad_test.py
+++ b/tensorflow/python/kernel_tests/linalg_grad_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -39,6 +40,7 @@ def _AddTest(test, op_name, testcase_name, fn):
 
 class ShapeTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBatchGradientUnknownSize(self):
     with self.cached_session():
       batch_size = constant_op.constant(3)
@@ -50,7 +52,7 @@ class ShapeTest(test_lib.TestCase):
       determinants = linalg_ops.matrix_determinant(batch_identity)
       reduced = math_ops.reduce_sum(determinants)
       sum_grad = gradients_impl.gradients(reduced, batch_identity)[0]
-      self.assertAllClose(batch_identity.eval(), sum_grad.eval())
+      self.assertAllClose(batch_identity.eval(), self.evaluate(sum_grad))
 
 
 class MatrixUnaryFunctorGradientTest(test_lib.TestCase):
@@ -59,13 +61,18 @@ class MatrixUnaryFunctorGradientTest(test_lib.TestCase):
 
 def _GetMatrixUnaryFunctorGradientTest(functor_, dtype_, shape_, **kwargs_):
 
+  @test_util.run_v1_only('b/120545219')
   def Test(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       np.random.seed(1)
       a_np = np.random.uniform(
           low=-1.0, high=1.0,
           size=np.prod(shape_)).reshape(shape_).astype(dtype_)
       a = constant_op.constant(a_np)
+      if functor_.__name__ == 'matrix_square_root':
+        # Square the input matrix to ensure that its matrix square root exists
+        a = math_ops.matmul(a, a)
+        a_np = self.evaluate(a)
       b = functor_(a, **kwargs_)
 
       # Optimal stepsize for central difference is O(epsilon^{1/3}).
@@ -97,12 +104,13 @@ def _GetMatrixBinaryFunctorGradientTest(functor_,
                                         float32_tol_fudge=1.0,
                                         **kwargs_):
 
+  @test_util.run_v1_only('b/120545219')
   def Test(self):
     # TODO(rmlarsen): Debug illegal address bug on CUDA and re-enable
     # GPU test for matrix_solve.
     use_gpu = False if functor_ == linalg_ops.matrix_solve else True
 
-    with self.test_session(use_gpu=use_gpu):
+    with self.session(use_gpu=use_gpu):
       np.random.seed(1)
       a_np = np.random.uniform(
           low=-1.0, high=1.0,
@@ -189,6 +197,17 @@ if __name__ == '__main__':
                 lambda x: linalg_ops.log_matrix_determinant(x)[1],
                 dtype, shape))
 
+        # The numerical Jacobian is consistently invalid for these four shapes
+        # because the matrix square root of the perturbed input doesn't exist
+        if shape in {(2, 5, 5), (3, 5, 5), (3, 10, 10), (3, 2, 5, 5)}:
+          # Alternative shape that consistently produces a valid numerical Jacobian
+          shape = extra + (size + 1, size + 1)
+          name = '%s_%s' % (dtype.__name__, '_'.join(map(str, shape)))
+        _AddTest(
+            MatrixUnaryFunctorGradientTest, 'MatrixSquareRootGradient', name,
+            _GetMatrixUnaryFunctorGradientTest(linalg_ops.matrix_square_root,
+                                               dtype, shape))
+
   # Tests for gradients of matrix_solve_ls
   for dtype in np.float32, np.float64:
     for rows in 2, 5, 10:
diff --git a/tensorflow/python/kernel_tests/linalg_ops_test.py b/tensorflow/python/kernel_tests/linalg_ops_test.py
index aa17f727d0905fef638a0a5f40511b416b802738..028167a78603b7f2c00ae19ca76f721d38e200c9 100644
--- a/tensorflow/python/kernel_tests/linalg_ops_test.py
+++ b/tensorflow/python/kernel_tests/linalg_ops_test.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import itertools
+
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -49,10 +53,11 @@ class CholeskySolveTest(test.TestCase):
   def setUp(self):
     self.rng = np.random.RandomState(0)
 
+  @test_util.run_deprecated_v1
   def test_works_with_five_different_random_pos_def_matrices(self):
     for n in range(1, 6):
       for np_type, atol in [(np.float32, 0.05), (np.float64, 1e-5)]:
-        with self.test_session(use_gpu=True):
+        with self.session(use_gpu=True):
           # Create 2 x n x n matrix
           array = np.array(
               [_RandomPDMatrix(n, self.rng),
@@ -70,28 +75,29 @@ class LogdetTest(test.TestCase):
   def setUp(self):
     self.rng = np.random.RandomState(42)
 
+  @test_util.run_deprecated_v1
   def test_works_with_five_different_random_pos_def_matrices(self):
     for n in range(1, 6):
       for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
                              (np.complex64, 0.05), (np.complex128, 1e-5)]:
         matrix = _RandomPDMatrix(n, self.rng, np_dtype)
         _, logdet_np = np.linalg.slogdet(matrix)
-        with self.test_session(use_gpu=True):
+        with self.session(use_gpu=True):
           # Create 2 x n x n matrix
           # matrix = np.array(
           #     [_RandomPDMatrix(n, self.rng, np_dtype),
           #      _RandomPDMatrix(n, self.rng, np_dtype)]).astype(np_dtype)
           logdet_tf = linalg.logdet(matrix)
-          self.assertAllClose(logdet_np, logdet_tf.eval(), atol=atol)
+          self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol)
 
   def test_works_with_underflow_case(self):
     for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
                            (np.complex64, 0.05), (np.complex128, 1e-5)]:
       matrix = (np.eye(20) * 1e-6).astype(np_dtype)
       _, logdet_np = np.linalg.slogdet(matrix)
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         logdet_tf = linalg.logdet(matrix)
-        self.assertAllClose(logdet_np, logdet_tf.eval(), atol=atol)
+        self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol)
 
 
 class SlogdetTest(test.TestCase):
@@ -99,26 +105,29 @@ class SlogdetTest(test.TestCase):
   def setUp(self):
     self.rng = np.random.RandomState(42)
 
+  @test_util.run_deprecated_v1
   def test_works_with_five_different_random_pos_def_matrices(self):
     for n in range(1, 6):
       for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
                              (np.complex64, 0.05), (np.complex128, 1e-5)]:
         matrix = _RandomPDMatrix(n, self.rng, np_dtype)
         sign_np, log_abs_det_np = np.linalg.slogdet(matrix)
-        with self.test_session(use_gpu=True):
+        with self.session(use_gpu=True):
           sign_tf, log_abs_det_tf = linalg.slogdet(matrix)
-          self.assertAllClose(log_abs_det_np, log_abs_det_tf.eval(), atol=atol)
-          self.assertAllClose(sign_np, sign_tf.eval(), atol=atol)
+          self.assertAllClose(
+              log_abs_det_np, self.evaluate(log_abs_det_tf), atol=atol)
+          self.assertAllClose(sign_np, self.evaluate(sign_tf), atol=atol)
 
   def test_works_with_underflow_case(self):
     for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
                            (np.complex64, 0.05), (np.complex128, 1e-5)]:
       matrix = (np.eye(20) * 1e-6).astype(np_dtype)
       sign_np, log_abs_det_np = np.linalg.slogdet(matrix)
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         sign_tf, log_abs_det_tf = linalg.slogdet(matrix)
-        self.assertAllClose(log_abs_det_np, log_abs_det_tf.eval(), atol=atol)
-        self.assertAllClose(sign_np, sign_tf.eval(), atol=atol)
+        self.assertAllClose(
+            log_abs_det_np, self.evaluate(log_abs_det_tf), atol=atol)
+        self.assertAllClose(sign_np, self.evaluate(sign_tf), atol=atol)
 
 
 class AdjointTest(test.TestCase):
@@ -128,66 +137,131 @@ class AdjointTest(test.TestCase):
       matrix_np = np.array([[1 + 1j, 2 + 2j, 3 + 3j], [4 + 4j, 5 + 5j,
                                                        6 + 6j]]).astype(dtype)
       expected_transposed = np.conj(matrix_np.T)
-      with self.cached_session():
+      with self.session():
         matrix = ops.convert_to_tensor(matrix_np)
         transposed = linalg.adjoint(matrix)
         self.assertEqual((3, 2), transposed.get_shape())
-        self.assertAllEqual(expected_transposed, transposed.eval())
-
-
-class EyeTest(test.TestCase):
-  pass  # Will be filled in below
-
-
-def _GetEyeTest(num_rows, num_columns, batch_shape, dtype):
-
-  def Test(self):
+        self.assertAllEqual(expected_transposed, self.evaluate(transposed))
+
+
+class EyeTest(parameterized.TestCase, test.TestCase):
+
+  def testShapeInferenceNoBatch(self):
+    self.assertEqual((2, 2), linalg_ops.eye(num_rows=2).shape)
+    self.assertEqual((2, 3), linalg_ops.eye(num_rows=2, num_columns=3).shape)
+
+  def testShapeInferenceStaticBatch(self):
+    batch_shape = (2, 3)
+    self.assertEqual(
+        (2, 3, 2, 2),
+        linalg_ops.eye(num_rows=2, batch_shape=batch_shape).shape)
+    self.assertEqual(
+        (2, 3, 2, 3),
+        linalg_ops.eye(
+            num_rows=2, num_columns=3, batch_shape=batch_shape).shape)
+
+  @parameterized.named_parameters(
+      ("DynamicRow",
+       lambda: array_ops.placeholder_with_default(2, shape=None),
+       lambda: None),
+      ("DynamicRowStaticColumn",
+       lambda: array_ops.placeholder_with_default(2, shape=None),
+       lambda: 3),
+      ("StaticRowDynamicColumn",
+       lambda: 2,
+       lambda: array_ops.placeholder_with_default(3, shape=None)),
+      ("DynamicRowDynamicColumn",
+       lambda: array_ops.placeholder_with_default(2, shape=None),
+       lambda: array_ops.placeholder_with_default(3, shape=None)))
+  def testShapeInferenceStaticBatchWith(self, num_rows_fn, num_columns_fn):
+    num_rows = num_rows_fn()
+    num_columns = num_columns_fn()
+    batch_shape = (2, 3)
+    identity_matrix = linalg_ops.eye(
+        num_rows=num_rows,
+        num_columns=num_columns,
+        batch_shape=batch_shape)
+    self.assertEqual(4, identity_matrix.shape.ndims)
+    self.assertEqual((2, 3), identity_matrix.shape[:2])
+    if num_rows is not None and not isinstance(num_rows, ops.Tensor):
+      self.assertEqual(2, identity_matrix.shape[-2])
+
+    if num_columns is not None and not isinstance(num_columns, ops.Tensor):
+      self.assertEqual(3, identity_matrix.shape[-1])
+
+  @parameterized.parameters(
+      itertools.product(
+          # num_rows
+          [0, 1, 2, 5],
+          # num_columns
+          [None, 0, 1, 2, 5],
+          # batch_shape
+          [None, [], [2], [2, 3]],
+          # dtype
+          [
+              dtypes.int32,
+              dtypes.int64,
+              dtypes.float32,
+              dtypes.float64,
+              dtypes.complex64,
+              dtypes.complex128
+          ])
+      )
+  def test_eye_no_placeholder(self, num_rows, num_columns, batch_shape, dtype):
     eye_np = np.eye(num_rows, M=num_columns, dtype=dtype.as_numpy_dtype)
     if batch_shape is not None:
       eye_np = np.tile(eye_np, batch_shape + [1, 1])
-    for use_placeholder in False, True:
-      if use_placeholder and (num_columns is None or batch_shape is None):
-        return
-      with self.test_session(use_gpu=True) as sess:
-        if use_placeholder:
-          num_rows_placeholder = array_ops.placeholder(
-              dtypes.int32, name="num_rows")
-          num_columns_placeholder = array_ops.placeholder(
-              dtypes.int32, name="num_columns")
-          batch_shape_placeholder = array_ops.placeholder(
-              dtypes.int32, name="batch_shape")
-          eye = linalg_ops.eye(
-              num_rows_placeholder,
-              num_columns=num_columns_placeholder,
-              batch_shape=batch_shape_placeholder,
-              dtype=dtype)
-          eye_tf = sess.run(
-              eye,
-              feed_dict={
-                  num_rows_placeholder: num_rows,
-                  num_columns_placeholder: num_columns,
-                  batch_shape_placeholder: batch_shape
-              })
-        else:
-          eye_tf = linalg_ops.eye(
-              num_rows,
-              num_columns=num_columns,
-              batch_shape=batch_shape,
-              dtype=dtype).eval()
-        self.assertAllEqual(eye_np, eye_tf)
-
-  return Test
+    eye_tf = self.evaluate(linalg_ops.eye(
+        num_rows,
+        num_columns=num_columns,
+        batch_shape=batch_shape,
+        dtype=dtype))
+    self.assertAllEqual(eye_np, eye_tf)
+
+  @parameterized.parameters(
+      itertools.product(
+          # num_rows
+          [0, 1, 2, 5],
+          # num_columns
+          [0, 1, 2, 5],
+          # batch_shape
+          [[], [2], [2, 3]],
+          # dtype
+          [
+              dtypes.int32,
+              dtypes.int64,
+              dtypes.float32,
+              dtypes.float64,
+              dtypes.complex64,
+              dtypes.complex128
+          ])
+      )
+  @test_util.run_deprecated_v1
+  def test_eye_with_placeholder(
+      self, num_rows, num_columns, batch_shape, dtype):
+    eye_np = np.eye(num_rows, M=num_columns, dtype=dtype.as_numpy_dtype)
+    eye_np = np.tile(eye_np, batch_shape + [1, 1])
+    num_rows_placeholder = array_ops.placeholder(
+        dtypes.int32, name="num_rows")
+    num_columns_placeholder = array_ops.placeholder(
+        dtypes.int32, name="num_columns")
+    batch_shape_placeholder = array_ops.placeholder(
+        dtypes.int32, name="batch_shape")
+    eye = linalg_ops.eye(
+        num_rows_placeholder,
+        num_columns=num_columns_placeholder,
+        batch_shape=batch_shape_placeholder,
+        dtype=dtype)
+    with self.session(use_gpu=True) as sess:
+      eye_tf = sess.run(
+          eye,
+          feed_dict={
+              num_rows_placeholder: num_rows,
+              num_columns_placeholder: num_columns,
+              batch_shape_placeholder: batch_shape
+          })
+    self.assertAllEqual(eye_np, eye_tf)
 
 
 if __name__ == "__main__":
-  for _num_rows in 0, 1, 2, 5:
-    for _num_columns in None, 0, 1, 2, 5:
-      for _batch_shape in None, [], [2], [2, 3]:
-        for _dtype in (dtypes.int32, dtypes.int64, dtypes.float32,
-                       dtypes.float64, dtypes.complex64, dtypes.complex128):
-          name = "dtype_%s_num_rows_%s_num_column_%s_batch_shape_%s_" % (
-              _dtype.name, _num_rows, _num_columns, _batch_shape)
-          _AddTest(EyeTest, "EyeTest", name,
-                   _GetEyeTest(_num_rows, _num_columns, _batch_shape, _dtype))
-
   test.main()
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index ae413edaecb98fa49a22270c4dfcf2727f818e40..489f6c9b00471e6c10a8a04830613e9c5b99661a 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np  # pylint: disable=unused-import
 
 from tensorflow.python.client import session
@@ -28,53 +29,174 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
 
 
-def scalar_shape():
-  return ops.convert_to_tensor([], dtype=dtypes.int32)
+@test_util.run_all_in_graph_and_eager_modes
+class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
-
-@test_util.with_c_shapes
-class ListOpsTest(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testPushPop(self):
-    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                   element_shape=scalar_shape())
+  def _testPushPop(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[],
+        max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e), 1.0)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testPushPopGPU(self):
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testPushPop(self, max_num_elements):
+    self._testPushPop(max_num_elements)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testPushPopGPU(self, max_num_elements):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testPushPop(max_num_elements)
+
+  @test_util.run_deprecated_v1
+  def testPushInFullListFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[], max_num_elements=1)
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Tried to push item into a full list"):
+      l = list_ops.tensor_list_push_back(l, 2.)
+      self.evaluate(l)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  @test_util.run_deprecated_v1
+  def testPopFromEmptyTensorListFails(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[],
+        max_num_elements=max_num_elements)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Trying to pop from an empty list"):
+      l = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.evaluate(l)
+
+  def _testStack(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[],
+        max_num_elements=max_num_elements)
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    if not context.executing_eagerly():
+      self.assertAllEqual(t.shape.as_list(), [None])
+    self.assertAllEqual(self.evaluate(t), [1.0, 2.0])
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testStack(self, max_num_elements):
+    self._testStack(max_num_elements)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testStackGPU(self, max_num_elements):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
-      self.testPushPop()
+      self._testStack(max_num_elements)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testStack(self):
-    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                   element_shape=scalar_shape())
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
+  def testStackWithUnknownElementShape(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=None,
+        max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
+
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(t), [1.0, 2.0])
 
-  @test_util.run_in_graph_and_eager_modes
-  def testGatherGrad(self):
+    # Should raise an error when the element tensors do not all have the same
+    # shape.
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+      l = list_ops.tensor_list_push_back(l, constant_op.constant([3.0, 4.0]))
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
+  def testStackWithPartiallyDefinedElementShape(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[None],
+        max_num_elements=max_num_elements)
+    l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0]))
+    l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0]))
+
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[1.0], [2.0]])
+
+    # Should raise an error when the element tensors do not all have the same
+    # shape.
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+      l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0, 3.0]))
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  @test_util.run_deprecated_v1
+  def testStackEmptyList(self, max_num_elements):
+    # Should be able to stack empty lists with fully defined element_shape.
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[1, 2],
+        max_num_elements=max_num_elements)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t).shape, (0, 1, 2))
+
+    # Should not be able to stack empty lists with partially defined
+    # element_shape.
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "non-fully-defined"):
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32,
+          element_shape=[None, 2],
+          max_num_elements=max_num_elements)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+    # Should not be able to stack empty lists with undefined element_shape.
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "non-fully-defined"):
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32,
+          element_shape=None,
+          max_num_elements=max_num_elements)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 2))
+  def testGatherGrad(self, max_num_elements):
     with backprop.GradientTape() as tape:
-      l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                     element_shape=scalar_shape())
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32,
+          element_shape=[],
+          max_num_elements=max_num_elements)
       c0 = constant_op.constant(1.0)
       tape.watch(c0)
       l = list_ops.tensor_list_push_back(l, c0)
@@ -85,7 +207,89 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     dt = tape.gradient(s, c0)
     self.assertAllEqual(self.evaluate(dt), 6.0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
+  def testGatherWithUnknownElementShape(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=None,
+        max_num_elements=max_num_elements)
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+    l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
+    l = list_ops.tensor_list_push_back(l, constant_op.constant([3.0, 4.0]))
+
+    t = list_ops.tensor_list_gather(l, [1, 0], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [2.0, 1.0])
+
+    t = list_ops.tensor_list_gather(l, [2], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[3.0, 4.0]])
+
+    # Should raise an error when the requested tensors do not all have the same
+    # shape.
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+      t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
+  def testGatherWithPartiallyDefinedElementShape(self, max_num_elements):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[None],
+        max_num_elements=max_num_elements)
+    l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0]))
+    l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0, 3.0]))
+    l = list_ops.tensor_list_push_back(l, constant_op.constant([4.0, 5.0]))
+
+    t = list_ops.tensor_list_gather(l, [0], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[1.0]])
+
+    t = list_ops.tensor_list_gather(l, [1, 2], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[2.0, 3.0], [4.0, 5.0]])
+
+    # Should raise an error when the requested tensors do not all have the same
+    # shape.
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+      t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  @parameterized.named_parameters(("NoMaxNumElements", None),
+                                  ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
+  def testGatherEmptyList(self, max_num_elements):
+    # Should be able to gather from empty lists with fully defined
+    # element_shape.
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32,
+        element_shape=[1, 2],
+        max_num_elements=max_num_elements)
+    t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
+    self.assertAllEqual((0, 1, 2), self.evaluate(t).shape)
+
+    # Should not be able to gather from empty lists with partially defined
+    # element_shape.
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "non-fully-defined"):
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32,
+          element_shape=[None, 2],
+          max_num_elements=max_num_elements)
+      t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+    # Should not be able to gather from empty lists with undefined
+    # element_shape.
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "non-fully-defined"):
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32,
+          element_shape=None,
+          max_num_elements=max_num_elements)
+      t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
+      self.evaluate(t)
+
   def testScatterGrad(self):
     with backprop.GradientTape() as tape:
       c0 = constant_op.constant([1.0, 2.0])
@@ -100,51 +304,60 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     dt = tape.gradient(loss, c0)
     self.assertAllEqual(self.evaluate(dt), [2., 4.])
 
-  @test_util.run_in_graph_and_eager_modes
-  def testStackGPU(self):
-    if not context.num_gpus():
-      return
-    with context.device("gpu:0"):
-      self.testStack()
-
-  @test_util.run_in_graph_and_eager_modes
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e), 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e), 1.0)
     self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testFromTensorGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testTensorListFromTensor()
 
-  @test_util.run_in_graph_and_eager_modes
   def testGetSetItem(self):
     t = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e0), 1.0)
     l = list_ops.tensor_list_set_item(l, 0, 3.0)
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(t), [3.0, 2.0])
 
-  @test_util.run_in_graph_and_eager_modes
   def testGetSetGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testGetSetItem()
 
-  @test_util.run_in_graph_and_eager_modes
+  def testSetGetGrad(self):
+    with backprop.GradientTape() as tape:
+      t = constant_op.constant(5.)
+      tape.watch(t)
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+      l = list_ops.tensor_list_set_item(l, 1, 2. * t)
+      e = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(e), 10.0)
+    self.assertAllEqual(self.evaluate(tape.gradient(e, t)), 2.0)
+
+  @test_util.run_deprecated_v1
+  def testSetOnEmptyListWithMaxNumElementsFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[], max_num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Trying to modify element 0 in a list with 0 elements."):
+      l = list_ops.tensor_list_set_item(l, 0, 1.)
+      self.evaluate(l)
+
   def testUnknownShape(self):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=-1)
+        element_dtype=dtypes.float32, element_shape=None)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0, 2.0]))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
@@ -152,12 +365,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e), 1.0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testCPUGPUCopy(self):
     if not context.num_gpus():
       return
     t = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     with context.device("gpu:0"):
       l_gpu = array_ops.identity(l)
       self.assertAllEqual(
@@ -170,12 +382,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
             list_ops.tensor_list_pop_back(
                 l_cpu, element_dtype=dtypes.float32)[1]), 2.0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testCPUGPUCopyNested(self):
     if not context.num_gpus():
       return
     t = constant_op.constant([1.0, 2.0])
-    child_l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    child_l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     l = list_ops.empty_tensor_list(
         element_shape=constant_op.constant([], dtype=dtypes.int32),
         element_dtype=dtypes.variant)
@@ -207,7 +418,7 @@ class ListOpsTest(test_util.TensorFlowTestCase):
               list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32)),
           [[1]])
 
-  def testGraphStackInLoop(self):
+  def testSkipEagerStackInLoop(self):
     with self.cached_session():
       t1 = list_ops.empty_tensor_list(
           element_shape=constant_op.constant([], dtype=dtypes.int32),
@@ -224,7 +435,7 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32)
       self.assertAllEqual(self.evaluate(s1), [0, 1, 2, 3])
 
-  def testGraphStackSwitchDtype(self):
+  def testSkipEagerStackSwitchDtype(self):
     with self.cached_session():
       list_ = list_ops.empty_tensor_list(
           element_shape=constant_op.constant([], dtype=dtypes.int32),
@@ -245,7 +456,7 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       np_s1 = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)
       self.assertAllEqual(self.evaluate(s1), np_s1)
 
-  def testGraphStackInLoopSwitchDtype(self):
+  def testSkipEagerStackInLoopSwitchDtype(self):
     with self.cached_session():
       t1 = list_ops.empty_tensor_list(
           element_shape=constant_op.constant([], dtype=dtypes.int32),
@@ -268,43 +479,75 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       np_s1 = np.vstack([np.arange(1, 4) * i for i in range(4)])
       self.assertAllEqual(self.evaluate(s1), np_s1)
 
-  @test_util.run_in_graph_and_eager_modes
   def testSerialize(self):
-    # pylint: disable=g-import-not-at-top
-    try:
-      import portpicker
-    except ImportError:
+    worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
+    with ops.Graph().as_default(), session.Session(target=worker.target):
+      with ops.device("/job:worker"):
+        t = constant_op.constant([[1.0], [2.0]])
+        l = list_ops.tensor_list_from_tensor(t, element_shape=[1])
+      with ops.device("/job:ps"):
+        l_ps = array_ops.identity(l)
+        l_ps, e = list_ops.tensor_list_pop_back(
+            l_ps, element_dtype=dtypes.float32)
+      with ops.device("/job:worker"):
+        worker_e = array_ops.identity(e)
+      self.assertAllEqual(self.evaluate(worker_e), [2.0])
+
+  def testSerializeListWithInvalidTensors(self):
+    worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
+    with ops.Graph().as_default(), session.Session(target=worker.target):
+      with ops.device("/job:worker"):
+        l = list_ops.tensor_list_reserve(
+            element_dtype=dtypes.float32, element_shape=[], num_elements=2)
+        l = list_ops.tensor_list_set_item(l, 0, 1.)
+      with ops.device("/job:ps"):
+        l_ps = array_ops.identity(l)
+        l_ps = list_ops.tensor_list_set_item(l_ps, 1, 2.)
+        t = list_ops.tensor_list_stack(l_ps, element_dtype=dtypes.float32)
+      with ops.device("/job:worker"):
+        worker_t = array_ops.identity(t)
+      self.assertAllEqual(self.evaluate(worker_t), [1.0, 2.0])
+
+  def testSerializeListWithUnknownRank(self):
+    worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
+    with ops.Graph().as_default(), session.Session(target=worker.target):
+      with ops.device("/job:worker"):
+        t = constant_op.constant([[1.0], [2.0]])
+        l = list_ops.tensor_list_from_tensor(t, element_shape=None)
+      with ops.device("/job:ps"):
+        l_ps = array_ops.identity(l)
+        element_shape = list_ops.tensor_list_element_shape(
+            l_ps, shape_type=dtypes.int32)
+      with ops.device("/job:worker"):
+        element_shape = array_ops.identity(element_shape)
+      self.assertEqual(self.evaluate(element_shape), -1)
+
+  def testSerializeListWithMaxNumElements(self):
+    if context.num_gpus():
+      # TODO(b/119151861): Enable on GPU.
       return
-    with context.graph_mode():
-      worker_port = portpicker.pick_unused_port()
-      ps_port = portpicker.pick_unused_port()
-      cluster_dict = {
-          "worker": ["localhost:%s" % worker_port],
-          "ps": ["localhost:%s" % ps_port]
-      }
-      cs = server_lib.ClusterSpec(cluster_dict)
-
-      worker = server_lib.Server(
-          cs, job_name="worker", protocol="grpc", task_index=0, start=True)
-      unused_ps = server_lib.Server(
-          cs, job_name="ps", protocol="grpc", task_index=0, start=True)
-      with ops.Graph().as_default(), session.Session(target=worker.target):
-        with ops.device("/job:worker"):
-          t = constant_op.constant([[1.0], [2.0]])
-          l = list_ops.tensor_list_from_tensor(t, element_shape=[1])
-        with ops.device("/job:ps"):
-          l_ps = array_ops.identity(l)
-          l_ps, e = list_ops.tensor_list_pop_back(
-              l_ps, element_dtype=dtypes.float32)
+    worker = test_util.create_local_cluster(num_workers=1, num_ps=1)[0][0]
+    with ops.Graph().as_default(), session.Session(target=worker.target):
+      with ops.device("/job:worker"):
+        l = list_ops.empty_tensor_list(
+            element_shape=None,
+            element_dtype=dtypes.float32,
+            max_num_elements=2)
+        l = list_ops.tensor_list_push_back(l, 1.)
+      with ops.device("/job:ps"):
+        l_ps = array_ops.identity(l)
+        l_ps = list_ops.tensor_list_push_back(l_ps, 2.)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Tried to push item into a full list"):
         with ops.device("/job:worker"):
-          worker_e = array_ops.identity(e)
-        self.assertAllEqual(self.evaluate(worker_e), [2.0])
+          l_worker = array_ops.identity(l_ps)
+          l_worker = list_ops.tensor_list_push_back(l_worker, 3.0)
+          self.evaluate(l_worker)
 
-  @test_util.run_in_graph_and_eager_modes
   def testPushPopGradients(self):
     with backprop.GradientTape() as tape:
-      l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                     element_shape=scalar_shape())
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=[])
       c = constant_op.constant(1.0)
       tape.watch(c)
       l = list_ops.tensor_list_push_back(l, c)
@@ -312,24 +555,22 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       e = 2 * e
     self.assertAllEqual(self.evaluate(tape.gradient(e, [c])[0]), 2.0)
 
-  @test_util.run_in_graph_and_eager_modes
   def testStackFromTensorGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
-      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      l = list_ops.tensor_list_from_tensor(c, element_shape=[])
       c2 = list_ops.tensor_list_stack(
           l, element_dtype=dtypes.float32, num_elements=2)
       result = c2 * 2.0
     grad = tape.gradient(result, [c])[0]
     self.assertAllEqual(self.evaluate(grad), [2.0, 2.0])
 
-  @test_util.run_in_graph_and_eager_modes
   def testGetSetGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
-      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      l = list_ops.tensor_list_from_tensor(c, element_shape=[])
       c2 = constant_op.constant(3.0)
       tape.watch(c2)
       l = list_ops.tensor_list_set_item(l, 0, c2)
@@ -340,17 +581,30 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(self.evaluate(grad_c), [0.0, 4.0])
     self.assertAllEqual(self.evaluate(grad_c2), 6.0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testSetOutOfBounds(self):
     c = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(c, element_shape=[])
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(list_ops.tensor_list_set_item(l, 20, 3.0))
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
+  def testSkipEagerSetItemWithMismatchedShapeFails(self):
+    with self.cached_session() as sess:
+      ph = array_ops.placeholder(dtypes.float32)
+      c = constant_op.constant([1.0, 2.0])
+      l = list_ops.tensor_list_from_tensor(c, element_shape=[])
+      # Set a placeholder with unknown shape to satisfy the shape inference
+      # at graph building time.
+      l = list_ops.tensor_list_set_item(l, 0, ph)
+      l_0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "incompatible shape"):
+        sess.run(l_0, {ph: [3.0]})
+
   def testResourceVariableScatterGather(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
-    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(c, element_shape=[])
     v = vs.get_variable("var", initializer=[l] * 10, use_resource=True)
     v_r_0_stacked = list_ops.tensor_list_stack(v[0], dtypes.float32)
     self.evaluate(v.initializer)
@@ -358,10 +612,8 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     v_r_sparse_stacked = list_ops.tensor_list_stack(
         v.sparse_read(0), dtypes.float32)
     self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_sparse_stacked))
-    l_new_0 = list_ops.tensor_list_from_tensor(
-        [3.0, 4.0], element_shape=scalar_shape())
-    l_new_1 = list_ops.tensor_list_from_tensor(
-        [5.0, 6.0], element_shape=scalar_shape())
+    l_new_0 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
+    l_new_1 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
     updated_v = state_ops.scatter_update(v, [3, 5], [l_new_0, l_new_1])
     updated_v_elems = array_ops.unstack(updated_v)
     updated_v_stacked = [
@@ -371,11 +623,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
                 [[1.0, 2.0]] * 4)
     self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testConcat(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
-    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
-    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=[])
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=[])
     l_batch_0 = array_ops.stack([l0, l1])
     l_batch_1 = array_ops.stack([l1, l0])
 
@@ -411,7 +663,7 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       self.evaluate(
           list_ops.tensor_list_concat_lists(
               l_batch_0,
-              list_ops.empty_tensor_list(scalar_shape(), dtypes.float32),
+              list_ops.empty_tensor_list([], dtypes.float32),
               element_dtype=dtypes.float32))
 
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
@@ -425,17 +677,16 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"input_b\[0\].dtype != element_dtype."):
       l_batch_of_int_tls = array_ops.stack(
-          [list_ops.tensor_list_from_tensor([1], element_shape=scalar_shape())]
-          * 2)
+          [list_ops.tensor_list_from_tensor([1], element_shape=[])] * 2)
       self.evaluate(
           list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_int_tls,
                                             element_dtype=dtypes.float32))
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testPushBackBatch(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
-    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
-    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=[])
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=[])
     l_batch = array_ops.stack([l0, l1])
     l_push = list_ops.tensor_list_push_back_batch(l_batch, [3.0, 4.0])
     l_unstack = array_ops.unstack(l_push)
@@ -473,14 +724,13 @@ class ListOpsTest(test_util.TensorFlowTestCase):
                                  "Invalid data type at index 0"):
       self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
 
-  @test_util.run_in_graph_and_eager_modes
   def testZerosLike(self):
     for dtype in (dtypes.uint8, dtypes.uint16, dtypes.int8, dtypes.int16,
                   dtypes.int32, dtypes.int64, dtypes.float16, dtypes.float32,
                   dtypes.float64, dtypes.complex64, dtypes.complex128,
                   dtypes.bool):
       l_empty = list_ops.empty_tensor_list(
-          element_dtype=dtype, element_shape=scalar_shape())
+          element_dtype=dtype, element_shape=[])
       l_empty_zeros = array_ops.zeros_like(l_empty)
       t_empty_zeros = list_ops.tensor_list_stack(
           l_empty_zeros, element_dtype=dtype)
@@ -498,17 +748,15 @@ class ListOpsTest(test_util.TensorFlowTestCase):
           self.evaluate(t_full_zeros), np.zeros(
               (2,), dtype=dtype.as_numpy_dtype))
 
-  @test_util.run_in_graph_and_eager_modes
-  def testZerosLikeVariant(self):
+  def testZerosLikeNested(self):
     for dtype in (dtypes.uint8, dtypes.uint16, dtypes.int8, dtypes.int16,
                   dtypes.int32, dtypes.int64, dtypes.float16, dtypes.float32,
                   dtypes.float64, dtypes.complex64, dtypes.complex128,
                   dtypes.bool):
       l = list_ops.empty_tensor_list(
-          element_dtype=dtypes.variant, element_shape=scalar_shape())
+          element_dtype=dtypes.variant, element_shape=[])
 
-      sub_l = list_ops.empty_tensor_list(
-          element_dtype=dtype, element_shape=scalar_shape())
+      sub_l = list_ops.empty_tensor_list(element_dtype=dtype, element_shape=[])
       l = list_ops.tensor_list_push_back(l, sub_l)
       sub_l = list_ops.tensor_list_push_back(sub_l, math_ops.cast(
           1, dtype=dtype))
@@ -539,6 +787,315 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(
           self.evaluate(outputs[0]), np.zeros((2,), dtype=dtype.as_numpy_dtype))
 
+  def testElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    shape = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int32)
+    self.assertEqual(self.evaluate(shape), -1)
+
+  def testZerosLikeUninitialized(self):
+    l0 = list_ops.tensor_list_reserve([], 3, element_dtype=dtypes.float32)
+    l1 = list_ops.tensor_list_set_item(l0, 0, 1.)  # [1., _, _]
+    zeros_1 = array_ops.zeros_like(l1)  # [0., _, _]
+    l2 = list_ops.tensor_list_set_item(l1, 2, 2.)  # [1., _, 2.]
+    zeros_2 = array_ops.zeros_like(l2)  # [0., _, 0.]
+
+    # Gather indices with zeros in `zeros_1`.
+    res_1 = list_ops.tensor_list_gather(
+        zeros_1, [0], element_dtype=dtypes.float32)
+    # Gather indices with zeros in `zeros_2`.
+    res_2 = list_ops.tensor_list_gather(
+        zeros_2, [0, 2], element_dtype=dtypes.float32)
+
+    self.assertAllEqual(self.evaluate(res_1), [0.])
+    self.assertAllEqual(self.evaluate(res_2), [0., 0.])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorListGetItemGradAggregation(self):
+    l = list_ops.tensor_list_reserve(
+        element_shape=[], num_elements=1, element_dtype=dtypes.float32)
+    x = constant_op.constant(1.0)
+    l = list_ops.tensor_list_set_item(l, 0, x)
+    l_read1 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    l_read2 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    grad = gradients_impl.gradients([l_read1, l_read2], [x])
+    with self.cached_session() as sess:
+      self.assertSequenceEqual(self.evaluate(grad), [2.])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerBuildElementShape(self):
+    fn = list_ops._build_element_shape
+    # Unknown shape -> -1.
+    self.assertEqual(fn(None), -1)
+    self.assertEqual(fn(tensor_shape.unknown_shape()), -1)
+    # Scalar shape -> [] with type int32.
+    self.assertEqual(fn([]).dtype, dtypes.int32)
+    self.assertEqual(fn(tensor_shape.scalar()).dtype, dtypes.int32)
+    self.assertAllEqual(self.evaluate(fn([])), np.array([], np.int32))
+    self.assertAllEqual(
+        self.evaluate(fn(tensor_shape.scalar())), np.array([], np.int32))
+    # Tensor -> Tensor
+    shape = constant_op.constant(1)
+    self.assertIs(fn(shape), shape)
+    # Shape with unknown dims -> shape list with -1's.
+    shape = [None, 5]
+    self.assertAllEqual(fn(shape), [-1, 5])
+    self.assertAllEqual(fn(tensor_shape.TensorShape(shape)), [-1, 5])
+    # Shape with unknown dims and tensor dims -> shape list with -1's and tensor
+    # dims.
+    t = array_ops.placeholder(dtypes.int32)
+    shape = [None, 5, t]
+    result = fn(shape)
+    self.assertAllEqual(result[:2], [-1, 5])
+    self.assertIs(result[2], t)
+
+  def testAddN(self):
+    l1 = list_ops.tensor_list_from_tensor([1.0, 2.0], element_shape=[])
+    l2 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
+    l3 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
+    result = math_ops.add_n((l1, l2, l3))
+    result_t = list_ops.tensor_list_stack(result, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(result_t), [9., 12.])
+
+  def testAddNNestedList(self):
+    l1 = list_ops.tensor_list_from_tensor([1.0, 2.0], element_shape=[])
+    l2 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
+    l3 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
+    l4 = list_ops.tensor_list_from_tensor([7.0, 8.0], element_shape=[])
+    a = list_ops.empty_tensor_list(
+        element_dtype=dtypes.variant, element_shape=[])
+    a = list_ops.tensor_list_push_back(a, l1)
+    a = list_ops.tensor_list_push_back(a, l2)
+    b = list_ops.empty_tensor_list(
+        element_dtype=dtypes.variant, element_shape=[])
+    b = list_ops.tensor_list_push_back(b, l3)
+    b = list_ops.tensor_list_push_back(b, l4)
+    result = math_ops.add_n((a, b))
+    result_0 = list_ops.tensor_list_stack(
+        list_ops.tensor_list_get_item(result, 0, element_dtype=dtypes.variant),
+        element_dtype=dtypes.float32)
+    result_1 = list_ops.tensor_list_stack(
+        list_ops.tensor_list_get_item(result, 1, element_dtype=dtypes.variant),
+        element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(result_0), [6., 8.])
+    self.assertAllEqual(self.evaluate(result_1), [10., 12.])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerConcatShapeInference(self):
+
+    def BuildTensor(element_shape):
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=element_shape)
+      return list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+
+    self.assertIsNone(BuildTensor(None).shape.rank)
+    self.assertAllEqual(BuildTensor([None, 2, 3]).shape.as_list(), [None, 2, 3])
+    self.assertAllEqual(
+        BuildTensor([None, 2, None]).shape.as_list(), [None, 2, None])
+    self.assertAllEqual(BuildTensor([1, 2, 3]).shape.as_list(), [None, 2, 3])
+
+  def testConcatWithFullyDefinedElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[2, 2])
+    l = list_ops.tensor_list_push_back(l, [[0., 1.], [2., 3.]])
+    l = list_ops.tensor_list_push_back(l, [[4., 5.], [6., 7.]])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(
+        self.evaluate(t), [[0., 1.], [2., 3.], [4., 5.], [6., 7.]])
+
+  def testConcatWithNonFullyDefinedElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[None, 2])
+    l = list_ops.tensor_list_push_back(l, [[0., 1.]])
+    l = list_ops.tensor_list_push_back(l, [[2., 3.], [4., 5.]])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[0., 1.], [2., 3.], [4., 5.]])
+
+  def testConcatWithMismatchingTensorShapesFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    l = list_ops.tensor_list_push_back(l, [[0., 1.]])
+    l = list_ops.tensor_list_push_back(l, [[2.], [4.]])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Tried to concat tensors with unequal shapes: "
+        r"\[2\] vs \[1\]"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatEmptyListWithFullyDefinedElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[5, 2])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t).shape, (0, 2))
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[None, 2])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t).shape, (0, 2))
+
+  def testConcatEmptyListWithUnknownElementShapeFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "All except the first dimension must be fully"
+        " defined when concating an empty tensor list"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatEmptyListWithPartiallyDefinedElementShapeFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[2, None])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "All except the first dimension must be fully"
+        " defined when concating an empty tensor list"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatListWithScalarElementShapeFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=tensor_shape.scalar())
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Concat requires elements to be at least vectors, "
+        "found scalars instead"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatListWithScalarElementsFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    l1 = list_ops.tensor_list_push_back(l, 1.)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError, "Concat saw a scalar shape at index 0"
+        " but requires at least vectors"):
+      t = list_ops.tensor_list_concat(l1, element_dtype=dtypes.float32)
+      self.evaluate(t)
+    l1 = list_ops.tensor_list_push_back(l, [1.])
+    l1 = list_ops.tensor_list_push_back(l1, 2.)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError, "Concat saw a scalar shape at index 1"
+        " but requires at least vectors"):
+      t = list_ops.tensor_list_concat(l1, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testEvenSplit(self):
+
+    def RunTest(input_tensor, lengths, expected_stacked_output):
+      l = list_ops.tensor_list_split(
+          input_tensor, element_shape=None, lengths=lengths)
+      self.assertAllEqual(
+          list_ops.tensor_list_stack(l, element_dtype=dtypes.float32),
+          expected_stacked_output)
+
+    RunTest([1., 2., 3.], [1, 1, 1], [[1.], [2.], [3.]])
+    RunTest([1., 2., 3., 4.], [2, 2], [[1., 2.], [3., 4.]])
+    RunTest([[1., 2.], [3., 4.]], [1, 1], [[[1., 2.]], [[3., 4.]]])
+
+  def testUnevenSplit(self):
+    l = list_ops.tensor_list_split([1., 2., 3., 4., 5],
+                                   element_shape=None,
+                                   lengths=[3, 2])
+    self.assertAllEqual(list_ops.tensor_list_length(l), 2)
+    self.assertAllEqual(
+        list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32),
+        [1., 2., 3.])
+    self.assertAllEqual(
+        list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32),
+        [4., 5.])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithInvalidTensorShapeFails(self):
+    with self.cached_session():
+      tensor = array_ops.placeholder(dtype=dtypes.float32)
+      l = list_ops.tensor_list_split(tensor, element_shape=None, lengths=[1])
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Tensor must be at least a vector, but saw shape: \[\]"):
+        l.eval({tensor: 1})
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithInvalidLengthsShapeFails(self):
+    with self.cached_session():
+      lengths = array_ops.placeholder(dtype=dtypes.int64)
+      l = list_ops.tensor_list_split([1., 2.],
+                                     element_shape=None,
+                                     lengths=lengths)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Expected lengths to be a vector, received shape: \[\]"):
+        l.eval({lengths: 1})
+
+  def testSplitWithInvalidLengthsFails(self):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r"Invalid value in lengths: -1"):
+      l = list_ops.tensor_list_split([1., 2.],
+                                     element_shape=None,
+                                     lengths=[1, -1])
+      self.evaluate(l)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Attempting to slice \[0, 3\] from tensor with length 2"):
+      l = list_ops.tensor_list_split([1., 2.], element_shape=None, lengths=[3])
+      self.evaluate(l)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Unused values in tensor. Length of tensor: 2 Values used: 1"):
+      l = list_ops.tensor_list_split([1., 2.], element_shape=None, lengths=[1])
+      self.evaluate(l)
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithScalarElementShapeFails(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"Shapes must be equal rank, but are 1 and 0"):
+      l = list_ops.tensor_list_split([1., 2.], element_shape=[], lengths=[1, 1])
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"TensorListSplit requires element_shape to be at least of rank 1, "
+          r"but saw: \[\]"):
+        element_shape = array_ops.placeholder(dtype=dtypes.int32)
+        l = list_ops.tensor_list_split([1., 2.],
+                                       element_shape=element_shape,
+                                       lengths=[1, 1])
+        l.eval({element_shape: []})
+
+  def testEagerOnlySplitWithScalarElementShapeFails(self):
+    if context.executing_eagerly():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"TensorListSplit requires element_shape to be at least of rank 1, "
+          r"but saw: \[\]"):
+        list_ops.tensor_list_split([1., 2.], element_shape=[], lengths=[1, 1])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithIncompatibleTensorShapeAndElementShapeFails(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"Shapes must be equal rank, but are 2 and 1"):
+      l = list_ops.tensor_list_split([[1.], [2.]],
+                                     element_shape=[1],
+                                     lengths=[1, 1])
+
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"tensor shape \[2,1\] is not compatible with element_shape \[1\]"):
+        element_shape = array_ops.placeholder(dtype=dtypes.int32)
+        l = list_ops.tensor_list_split([[1.], [2.]],
+                                       element_shape=element_shape,
+                                       lengths=[1, 1])
+        l.eval({element_shape: [1]})
+
+  def testEagerOnlySplitWithIncompatibleTensorShapeAndElementShapeFails(self):
+    if context.executing_eagerly():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"tensor shape \[2,1\] is not compatible with element_shape \[1\]"):
+        list_ops.tensor_list_split([[1.], [2.]],
+                                   element_shape=[1],
+                                   lengths=[1, 1])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/listdiff_op_test.py b/tensorflow/python/kernel_tests/listdiff_op_test.py
index baeb40dd63584bb47db7914b789d6ce869f09b25..28657107980e2c1ea3356da89b97df624477260d 100644
--- a/tensorflow/python/kernel_tests/listdiff_op_test.py
+++ b/tensorflow/python/kernel_tests/listdiff_op_test.py
@@ -47,7 +47,7 @@ class ListDiffTest(test.TestCase):
             y_tensor = ops.convert_to_tensor(y, dtype=dtype)
             out_tensor, idx_tensor = diff_func(x_tensor, y_tensor,
                                                index_dtype=index_dtype)
-            tf_out, tf_idx = sess.run([out_tensor, idx_tensor])
+            tf_out, tf_idx = self.evaluate([out_tensor, idx_tensor])
           self.assertAllEqual(tf_out, out)
           self.assertAllEqual(tf_idx, idx)
           self.assertEqual(1, out_tensor.get_shape().ndims)
diff --git a/tensorflow/python/kernel_tests/logging_ops_test.py b/tensorflow/python/kernel_tests/logging_ops_test.py
index 4beddd00bb2dcb3a9428b7c2f90306ea9cdbdc19..85035e5f7d308c323786bc9fd9017fda89dbec13 100644
--- a/tensorflow/python/kernel_tests/logging_ops_test.py
+++ b/tensorflow/python/kernel_tests/logging_ops_test.py
@@ -18,7 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import sys
+import tempfile
 
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
@@ -35,9 +37,9 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
-
 class LoggingOpsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAssertDivideByZero(self):
     with self.cached_session() as sess:
       epsilon = ops.convert_to_tensor(1e-20)
@@ -51,7 +53,7 @@ class LoggingOpsTest(test.TestCase):
               math_ops.less(epsilon, y), ["Divide-by-zero"])
       ]):
         out = math_ops.div(z, y)
-      self.assertAllEqual(2.0, out.eval())
+      self.assertAllEqual(2.0, self.evaluate(out))
       # assert(epsilon < x)
       # z / x
       #
@@ -62,7 +64,7 @@ class LoggingOpsTest(test.TestCase):
       ]):
         out = math_ops.div(z, x)
       with self.assertRaisesOpError("less than x"):
-        out.eval()
+        self.evaluate(out)
 
 
 class PrintV2Test(test.TestCase):
@@ -271,6 +273,30 @@ class PrintV2Test(test.TestCase):
       expected = "[0 1 2 ... 7 8 9]"
       self.assertTrue((expected + "\n") in printed.contents())
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testPrintTensorsToFile(self):
+    tmpfile_name = tempfile.mktemp(".printv2_test")
+    tensor_0 = math_ops.range(0, 10)
+    print_op_0 = logging_ops.print_v2(tensor_0,
+                                      output_stream="file://"+tmpfile_name)
+    self.evaluate(print_op_0)
+    tensor_1 = math_ops.range(11, 20)
+    print_op_1 = logging_ops.print_v2(tensor_1,
+                                      output_stream="file://"+tmpfile_name)
+    self.evaluate(print_op_1)
+    try:
+      f = open(tmpfile_name, "r")
+      line_0 = f.readline()
+      expected_0 = "[0 1 2 ... 7 8 9]"
+      self.assertTrue(expected_0 in line_0)
+      line_1 = f.readline()
+      expected_1 = "[11 12 13 ... 17 18 19]"
+      self.assertTrue(expected_1 in line_1)
+      f.close()
+      os.remove(tmpfile_name)
+    except IOError as e:
+      self.fail(e)
+
   @test_util.run_in_graph_and_eager_modes()
   def testInvalidOutputStreamRaisesError(self):
     with self.cached_session():
@@ -280,12 +306,14 @@ class PrintV2Test(test.TestCase):
             tensor, output_stream="unknown")
         self.evaluate(print_op)
 
+  @test_util.run_deprecated_v1
   def testPrintOpName(self):
     with self.cached_session():
       tensor = math_ops.range(10)
       print_op = logging_ops.print_v2(tensor, name="print_name")
       self.assertEqual(print_op.name, "print_name")
 
+  @test_util.run_deprecated_v1
   def testNoDuplicateFormatOpGraphModeAfterExplicitFormat(self):
     with self.cached_session():
       tensor = math_ops.range(10)
@@ -306,6 +334,19 @@ class PrintV2Test(test.TestCase):
           logging_ops.print_v2(tensor)
         self.assertTrue((expected + "\n") in printed.contents())
 
+  def testPrintsOrderedInDefun(self):
+    with context.eager_mode():
+
+      @function.defun
+      def prints():
+        logging_ops.print_v2("A")
+        logging_ops.print_v2("B")
+        logging_ops.print_v2("C")
+
+      with self.captureWritesToStream(sys.stderr) as printed:
+        prints()
+      self.assertTrue(("A\nB\nC\n") in printed.contents())
+
   @test_util.run_in_graph_and_eager_modes()
   def testPrintInDefunWithoutExplicitEvalOfPrint(self):
     @function.defun
@@ -341,6 +382,7 @@ class PrintGradientTest(test.TestCase):
     inp_printed = logging_ops.Print(inp, ["hello"])
     self.assertEqual(inp.get_shape(), inp_printed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testPrintGradient(self):
     with self.cached_session():
       inp = constant_op.constant(2.0, shape=[100, 32], name="in")
@@ -349,8 +391,8 @@ class PrintGradientTest(test.TestCase):
       wx_print = logging_ops.Print(wx, [w, w, w])
       wx_grad = gradients_impl.gradients(wx, w)[0]
       wx_print_grad = gradients_impl.gradients(wx_print, w)[0]
-      wxg = wx_grad.eval()
-      wxpg = wx_print_grad.eval()
+      wxg = self.evaluate(wx_grad)
+      wxpg = self.evaluate(wx_print_grad)
     self.assertAllEqual(wxg, wxpg)
 
 
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index 6791a03e2e1acce1f283e5fb1468dbf99bd4ae42..ad81e0be649f17fe97691b1c5739dbe0bf4a63d2 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.training import server_lib
 
 class HashTableOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testHashTable(self):
     with self.cached_session():
       default_val = -1
@@ -44,7 +45,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       self.assertAllEqual(3, table.size().eval())
 
@@ -52,15 +53,16 @@ class HashTableOpTest(test.TestCase):
       output = table.lookup(input_string)
       self.assertAllEqual([3], output.get_shape())
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
       exported_keys_tensor, exported_values_tensor = table.export()
 
       self.assertItemsEqual([b"brain", b"salad", b"surgery"],
-                            exported_keys_tensor.eval())
-      self.assertItemsEqual([0, 1, 2], exported_values_tensor.eval())
+                            self.evaluate(exported_keys_tensor))
+      self.assertItemsEqual([0, 1, 2], self.evaluate(exported_values_tensor))
 
+  @test_util.run_deprecated_v1
   def testHashTableFindHighRank(self):
     with self.cached_session():
       default_val = -1
@@ -68,7 +70,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       self.assertAllEqual(3, table.size().eval())
 
@@ -76,9 +78,10 @@ class HashTableOpTest(test.TestCase):
           [["brain", "salad"], ["tank", "tarkus"]])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([[0, 1], [-1, -1]], result)
 
+  @test_util.run_deprecated_v1
   def testHashTableInitWithPythonArrays(self):
     with self.cached_session():
       default_val = -1
@@ -87,16 +90,17 @@ class HashTableOpTest(test.TestCase):
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(
               keys, values, value_dtype=dtypes.int64), default_val)
-      table.init.run()
+      table.initializer.run()
 
       self.assertAllEqual(3, table.size().eval())
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testHashTableInitWithNumPyArrays(self):
     with self.cached_session():
       default_val = -1
@@ -104,16 +108,17 @@ class HashTableOpTest(test.TestCase):
       values = np.array([0, 1, 2], dtype=np.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       self.assertAllEqual(3, table.size().eval())
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testMultipleHashTables(self):
     with self.cached_session() as sess:
       default_val = -1
@@ -137,11 +142,12 @@ class HashTableOpTest(test.TestCase):
       output2 = table2.lookup(input_string)
       output3 = table3.lookup(input_string)
 
-      out1, out2, out3 = sess.run([output1, output2, output3])
+      out1, out2, out3 = self.evaluate([output1, output2, output3])
       self.assertAllEqual([0, 1, -1], out1)
       self.assertAllEqual([0, 1, -1], out2)
       self.assertAllEqual([0, 1, -1], out3)
 
+  @test_util.run_deprecated_v1
   def testHashTableWithTensorDefault(self):
     with self.cached_session():
       default_val = constant_op.constant(-1, dtypes.int64)
@@ -149,14 +155,15 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testHashTableWithSparseTensorInput(self):
     with self.cached_session() as sess:
       default_val = constant_op.constant(-1, dtypes.int64)
@@ -164,7 +171,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       sp_indices = [[0, 0], [0, 1], [1, 0]]
       sp_shape = [2, 2]
@@ -174,12 +181,13 @@ class HashTableOpTest(test.TestCase):
           constant_op.constant(sp_shape, dtypes.int64))
       output = table.lookup(input_tensor)
 
-      out_indices, out_values, out_shape = sess.run(output)
+      out_indices, out_values, out_shape = self.evaluate(output)
 
       self.assertAllEqual([0, 1, -1], out_values)
       self.assertAllEqual(sp_indices, out_indices)
       self.assertAllEqual(sp_shape, out_shape)
 
+  @test_util.run_deprecated_v1
   def testSignatureMismatch(self):
     with self.cached_session():
       default_val = -1
@@ -187,7 +195,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       # Ref types do not produce a lookup signature mismatch.
       input_string_ref = variables.Variable("brain")
@@ -210,6 +218,7 @@ class HashTableOpTest(test.TestCase):
             lookup_ops.KeyValueTensorInitializer(["a"], [1], [dtypes.string],
                                                  dtypes.int64), default_val)
 
+  @test_util.run_deprecated_v1
   def testNotInitialized(self):
     with self.cached_session():
       default_val = -1
@@ -221,8 +230,9 @@ class HashTableOpTest(test.TestCase):
       output = table.lookup(input_string)
 
       with self.assertRaisesOpError("Table not initialized"):
-        output.eval()
+        self.evaluate(output)
 
+  @test_util.run_deprecated_v1
   def testInitializeTwice(self):
     with self.cached_session():
       default_val = -1
@@ -230,11 +240,12 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       with self.assertRaisesOpError("Table already initialized"):
-        table.init.run()
+        table.initializer.run()
 
+  @test_util.run_deprecated_v1
   def testInitializationWithInvalidDimensions(self):
     with self.cached_session():
       default_val = -1
@@ -245,6 +256,7 @@ class HashTableOpTest(test.TestCase):
         lookup_ops.HashTable(
             lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
 
+  @test_util.run_deprecated_v1
   def testMultipleSessions(self):
     # Start a server
     server = server_lib.Server(
@@ -265,15 +277,16 @@ class HashTableOpTest(test.TestCase):
 
     # Init the table in the first session.
     with session1:
-      table.init.run()
+      table.initializer.run()
       self.assertAllEqual(3, table.size().eval())
 
     # Init the table in the second session and verify that we do not get a
     # "Table already initialized" error.
     with session2:
-      table.init.run()
+      table.initializer.run()
       self.assertAllEqual(3, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testHashTableInt32String(self):
     with self.cached_session():
       default_val = "n/a"
@@ -281,12 +294,12 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant(["brain", "salad", "surgery"])
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.init.run()
+      table.initializer.run()
 
       input_tensor = constant_op.constant([0, 1, -1])
       output = table.lookup(input_tensor)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
 
 
@@ -298,6 +311,7 @@ class IndexTableFromFile(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_file(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -305,10 +319,12 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
@@ -320,10 +336,12 @@ class IndexTableFromFile(test.TestCase):
           value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
@@ -336,10 +354,12 @@ class IndexTableFromFile(test.TestCase):
           delimiter=" ")
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_file_tensor_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -348,12 +368,14 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
       self.assertEqual(1,
                        len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_file_placeholder_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -362,14 +384,16 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_placeholder, num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
 
       feed_dict = {vocabulary_placeholder.name: vocabulary_file}
       lookup_ops.tables_initializer().run(feed_dict=feed_dict)
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
       self.assertEqual(0,
                        len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
+  @test_util.run_deprecated_v1
   def test_int32_index_table_from_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab2.txt", values=("42", "1", "-1000"))
@@ -381,10 +405,12 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_int64_index_table_from_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab3.txt", values=("42", "1", "-1000"))
@@ -396,10 +422,12 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_default_value(self):
     default_value = -42
     vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
@@ -408,10 +436,12 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, default_value=default_value)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, default_value), ids.eval())
+      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_oov_buckets(self):
     vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
     with self.cached_session():
@@ -420,7 +450,8 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(
           constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual(
           (
@@ -428,7 +459,7 @@ class IndexTableFromFile(test.TestCase):
               2,  # From vocabulary file.
               867,  # 3 + fingerprint("tarkus") mod 300.
               860),  # 3 + fingerprint("toccata") mod 300.
-          ids.eval())
+          self.evaluate(ids))
 
   def test_index_table_from_file_fails_with_empty_vocabulary_file_name(self):
     self.assertRaises(
@@ -459,6 +490,7 @@ class IndexTableFromFile(test.TestCase):
         vocabulary_file=vocabulary_file,
         vocab_size=0)
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size_too_small(self):
     vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
     with self.cached_session():
@@ -466,19 +498,22 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=2)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, -1, -1), ids.eval())
+      self.assertAllEqual((1, -1, -1), self.evaluate(ids))
       self.assertEqual(2, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size_too_large(self):
     vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
     with self.cached_session():
       table = lookup_ops.index_table_from_file(
           vocabulary_file=vocabulary_file, vocab_size=4)
       self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                              "Invalid vocab_size", table.init.run)
+                              "Invalid vocab_size", table.initializer.run)
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size(self):
     vocabulary_file = self._createVocabFile("f2i_vocab8.txt")
 
@@ -493,9 +528,10 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=3)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, -1), ids.eval())
+      self.assertAllEqual((1, 2, -1), self.evaluate(ids))
       self.assertEqual(3, table.size().eval())
 
   def test_index_table_from_file_with_invalid_hashers(self):
@@ -522,14 +558,14 @@ class IndexTableFromFile(test.TestCase):
     with self.cached_session():
       table = lookup_ops.index_table_from_file(
           vocabulary_file=vocabulary_file, num_oov_buckets=1)
-      self.assertIsNotNone(table.table_ref)
+      self.assertIsNotNone(table.resource_handle)
 
   def test_index_table_from_file_table_ref_without_oov_buckets(self):
     vocabulary_file = self._createVocabFile("f2i_vocab10.txt")
     with self.cached_session():
       table = lookup_ops.index_table_from_file(
           vocabulary_file=vocabulary_file, num_oov_buckets=0)
-      self.assertIsNotNone(table.table_ref)
+      self.assertIsNotNone(table.resource_handle)
 
 
 class KeyValueTensorInitializerTest(test.TestCase):
@@ -539,15 +575,34 @@ class KeyValueTensorInitializerTest(test.TestCase):
       init = lookup_ops.KeyValueTensorInitializer(
           ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
       table = lookup_ops.HashTable(init, default_value=-1)
-      table.init.run()
+      table.initializer.run()
+
+  def test_multiple_tables(self):
+    with ops.Graph().as_default(), self.cached_session():
+      with ops.name_scope("table_scope"):
+        init1 = lookup_ops.KeyValueTensorInitializer(
+            ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
+            dtypes.int64)
+        table1 = lookup_ops.HashTable(init1, default_value=-1)
+        self.assertEquals("hash_table", table1.name)
+        self.assertEquals("table_scope/hash_table",
+                          table1.resource_handle.op.name)
+        init2 = lookup_ops.KeyValueTensorInitializer(
+            ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
+            dtypes.int64)
+        table2 = lookup_ops.HashTable(init2, default_value=-1)
+        self.assertEquals("hash_table_1", table2.name)
+        self.assertEquals("table_scope/hash_table_1",
+                          table2.resource_handle.op.name)
 
   def test_int64(self):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
                                                   dtypes.int64, dtypes.int64)
       table = lookup_ops.HashTable(init, default_value=-1)
-      table.init.run()
+      table.initializer.run()
 
+  @test_util.run_deprecated_v1
   def test_int32(self):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
@@ -555,12 +610,13 @@ class KeyValueTensorInitializerTest(test.TestCase):
       table = lookup_ops.HashTable(init, default_value=-1)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "No OpKernel was registered"):
-        table.init.run()
+        table.initializer.run()
 
 
 class IndexTableFromTensor(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_with_tensor_init(self):
     table = lookup_ops.index_table_from_tensor(
         vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
@@ -577,6 +633,7 @@ class IndexTableFromTensor(test.TestCase):
     ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
     self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_int32_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
@@ -584,10 +641,12 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.FailedPreconditionError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_int64_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
@@ -595,10 +654,12 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.FailedPreconditionError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_with_default_value(self):
     default_value = -42
     with self.cached_session():
@@ -607,9 +668,10 @@ class IndexTableFromTensor(test.TestCase):
           default_value=default_value)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.FailedPreconditionError):
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, default_value), ids.eval())
+      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
   def test_index_table_from_tensor_missing_vocabulary_list(self):
     with self.cached_session():
@@ -618,12 +680,14 @@ class IndexTableFromTensor(test.TestCase):
         lookup_ops.index_table_from_tensor(
             vocabulary_list=None, num_oov_buckets=1)
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_empty_vocabulary_list(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
           vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "brain"]))
-      self.assertRaises(errors_impl.OpError, ids.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(ids)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "keys and values cannot be empty"):
         lookup_ops.tables_initializer().run()
@@ -653,6 +717,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table(self):
     vocabulary_path = self._createVocabFile("i2f_vocab1.txt")
     # vocabulary_file supports string and tensor
@@ -664,11 +729,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
             vocabulary_file=vocabulary_file)
         features = table.lookup(
             constant_op.constant([0, 1, 2, 3], dtypes.int64))
-        self.assertRaises(errors_impl.OpError, features.eval)
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
         lookup_ops.tables_initializer().run()
         self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                            features.eval())
+                            self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
@@ -678,11 +745,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
           key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
           value_column_index=0)
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      self.assertRaises(errors_impl.OpError, features.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
@@ -693,11 +762,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
           value_column_index=0,
           delimiter=" ")
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      self.assertRaises(errors_impl.OpError, features.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_default_value(self):
     default_value = b"NONE"
     vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
@@ -705,11 +776,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
       table = lookup_ops.index_to_string_table_from_file(
           vocabulary_file=vocabulary_file, default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-      self.assertRaises(errors_impl.OpError, features.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size_too_small(self):
     default_value = b"NONE"
     vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
@@ -719,11 +792,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocab_size=2,
           default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-      self.assertRaises(errors_impl.OpError, features.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", default_value, default_value),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size_too_large(self):
     vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
     with self.cached_session():
@@ -731,11 +806,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=4)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
-      self.assertRaises(errors_impl.OpError, features.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(features)
       init = lookup_ops.tables_initializer()
       self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                               "Invalid vocab_size", init.run)
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size(self):
     vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
     with self.cached_session():
@@ -743,13 +820,16 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=3)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
-      self.assertRaises(errors_impl.OpError, features.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"salad", b"surgery", b"UNK"), features.eval())
+      self.assertAllEqual((b"salad", b"surgery", b"UNK"),
+                          self.evaluate(features))
 
 
 class IndexToStringTableFromTensorTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_tensor(self):
     with self.cached_session():
       vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
@@ -758,12 +838,14 @@ class IndexToStringTableFromTensorTest(test.TestCase):
 
       indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       features = table.lookup(indices)
-      self.assertRaises(errors_impl.OpError, features.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
 
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_duplicate_entries(self):
     with self.cached_session():
       vocabulary_list = constant_op.constant(["hello", "hello"])
@@ -772,8 +854,9 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       indices = constant_op.constant([0, 1, 4], dtypes.int64)
       features = table.lookup(indices)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"hello", b"hello", b"UNK"), features.eval())
+      self.assertAllEqual((b"hello", b"hello", b"UNK"), self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_with_default_value(self):
     default_value = b"NONE"
     with self.cached_session():
@@ -782,11 +865,12 @@ class IndexToStringTableFromTensorTest(test.TestCase):
           vocabulary_list=vocabulary_list, default_value=default_value)
       indices = constant_op.constant([1, 2, 4], dtypes.int64)
       features = table.lookup(indices)
-      self.assertRaises(errors_impl.OpError, features.eval)
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(features)
 
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
-                          features.eval())
+                          self.evaluate(features))
 
 
 class InitializeTableFromFileOpTest(test.TestCase):
@@ -805,13 +889,14 @@ class InitializeTableFromFileOpTest(test.TestCase):
         lookup_ops.TextFileInitializer(
             vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
             dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
-    self.evaluate(table.init)
+    self.evaluate(table.initializer)
 
     output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
 
     result = self.evaluate(output)
     self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testInitializeInt64Table(self):
     vocabulary_file = self._createVocabFile(
         "one_column_int64.txt", values=("42", "1", "-1000"))
@@ -823,14 +908,15 @@ class InitializeTableFromFileOpTest(test.TestCase):
               vocabulary_file, dtypes.int64,
               lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
               lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
-      table.init.run()
+      table.initializer.run()
 
       output = table.lookup(
           constant_op.constant((42, 1, 11), dtype=dtypes.int64))
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testInitializeIndexTable(self):
     vocabulary_file = self._createVocabFile("one_column_2.txt")
 
@@ -842,14 +928,15 @@ class InitializeTableFromFileOpTest(test.TestCase):
           lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
                                          key_index, dtypes.string, value_index),
           default_value)
-      table.init.run()
+      table.initializer.run()
 
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       output = table.lookup(input_values)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)
 
+  @test_util.run_deprecated_v1
   def testMultiColumn(self):
     vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
     with open(vocabulary_file, "w") as f:
@@ -864,14 +951,15 @@ class InitializeTableFromFileOpTest(test.TestCase):
           lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
                                          key_index, dtypes.int64, value_index),
           default_value)
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "surgery"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([1, 5, 6], result)
 
+  @test_util.run_deprecated_v1
   def testInvalidDataTypeInMultiColumn(self):
     vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
     with open(vocabulary_file, "w") as f:
@@ -886,7 +974,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                                          key_index, dtypes.int64, value_index),
           default_value)
       with self.assertRaisesOpError("is not a valid"):
-        table.init.run()
+        table.initializer.run()
 
   def testInvalidDataType(self):
     vocabulary_file = self._createVocabFile("one_column_3.txt")
@@ -902,6 +990,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                                            key_index, dtypes.string,
                                            value_index), default_value)
 
+  @test_util.run_deprecated_v1
   def testInvalidIndex(self):
     vocabulary_file = self._createVocabFile("one_column_4.txt")
     with self.cached_session():
@@ -914,8 +1003,9 @@ class InitializeTableFromFileOpTest(test.TestCase):
           default_value)
 
       with self.assertRaisesOpError("Invalid number of columns"):
-        table.init.run()
+        table.initializer.run()
 
+  @test_util.run_deprecated_v1
   def testInitializeSameTableWithMultipleNodes(self):
     vocabulary_file = self._createVocabFile("one_column_5.txt")
 
@@ -952,7 +1042,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
       output2 = table2.lookup(input_string)
       output3 = table3.lookup(input_string)
 
-      out1, out2, out3 = sess.run([output1, output2, output3])
+      out1, out2, out3 = self.evaluate([output1, output2, output3])
       self.assertAllEqual([0, 1, -1], out1)
       self.assertAllEqual([0, 1, -1], out2)
       self.assertAllEqual([0, 1, -1], out3)
@@ -967,6 +1057,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                 dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
             default_value)
 
+  @test_util.run_deprecated_v1
   def testInitializeWithVocabSize(self):
     with self.cached_session():
       default_value = -1
@@ -982,7 +1073,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
               vocab_size=vocab_size), default_value)
 
       # Initialize from file.
-      table1.init.run()
+      table1.initializer.run()
       self.assertEquals(vocab_size, table1.size().eval())
 
       vocabulary_file2 = self._createVocabFile("one_column7.txt")
@@ -996,7 +1087,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
               lookup_ops.TextFileIndex.LINE_NUMBER,
               vocab_size=vocab_size), default_value)
       with self.assertRaisesOpError("Invalid vocab_size"):
-        table2.init.run()
+        table2.initializer.run()
 
       vocab_size = 1
       vocabulary_file3 = self._createVocabFile("one_column3.txt")
@@ -1010,9 +1101,10 @@ class InitializeTableFromFileOpTest(test.TestCase):
               vocab_size=vocab_size), default_value)
 
       # Smaller vocab size reads only vocab_size records.
-      table3.init.run()
+      table3.initializer.run()
       self.assertEquals(vocab_size, table3.size().eval())
 
+  @test_util.run_deprecated_v1
   def testFeedVocabularyName(self):
     vocabulary_file = self._createVocabFile("feed_vocabulary.txt")
 
@@ -1027,18 +1119,19 @@ class InitializeTableFromFileOpTest(test.TestCase):
       # Initialize with non existing file (old_file.txt) should fail.
       # TODO(yleon): Update message, which might change per FileSystem.
       with self.assertRaisesOpError("old_file.txt"):
-        table.init.run()
+        table.initializer.run()
 
       # Initialize the model feeding the vocabulary file.
       filenames = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
-      table.init.run(feed_dict={filenames[0]: vocabulary_file})
+      table.initializer.run(feed_dict={filenames[0]: vocabulary_file})
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testInvalidFilenames(self):
     vocabulary_file = self._createVocabFile("filename_shape.txt")
 
@@ -1063,6 +1156,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                 dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
             default_value)
 
+  @test_util.run_deprecated_v1
   def testIdToStringTable(self):
     vocab_file = self._createVocabFile("feat_to_id_1.txt")
     with self.cached_session():
@@ -1072,14 +1166,16 @@ class InitializeTableFromFileOpTest(test.TestCase):
           lookup_ops.TextFileStringTableInitializer(
               vocab_file, vocab_size=vocab_size), default_value)
 
-      table.init.run()
+      table.initializer.run()
 
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
 
       out = table.lookup(input_values)
-      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], out.eval())
+      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"],
+                          self.evaluate(out))
       self.assertEquals(vocab_size, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testStringToIdTable(self):
     vocab_file = self._createVocabFile("feat_to_id_2.txt")
     with self.cached_session():
@@ -1088,14 +1184,15 @@ class InitializeTableFromFileOpTest(test.TestCase):
       table = lookup_ops.HashTable(
           lookup_ops.TextFileIdTableInitializer(
               vocab_file, vocab_size=vocab_size), default_value)
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
       out = table.lookup(input_string)
-      self.assertAllEqual([0, 1, 2, -1], out.eval())
+      self.assertAllEqual([0, 1, 2, -1], self.evaluate(out))
       self.assertEquals(vocab_size, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testInt64ToIdTable(self):
     vocab_file = self._createVocabFile(
         "feat_to_id_3.txt", values=("42", "1", "-1000"))
@@ -1106,11 +1203,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
           lookup_ops.TextFileIdTableInitializer(
               vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
           default_value)
-      table.init.run()
+      table.initializer.run()
 
       out = table.lookup(
           constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
-      self.assertAllEqual((0, 1, 2, -1), out.eval())
+      self.assertAllEqual((0, 1, 2, -1), self.evaluate(out))
       self.assertEquals(vocab_size, table.size().eval())
 
 
@@ -1122,6 +1219,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
+  @test_util.run_deprecated_v1
   def testStringIdTableWithHashBuckets(self):
     vocab_file = self._createVocabFile("feat_to_id_1.txt")
     with self.cached_session():
@@ -1134,14 +1232,15 @@ class IdTableWithHashBucketsTest(test.TestCase):
                   vocab_file, vocab_size=vocab_size), default_value),
           oov_buckets)
 
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
       out = table.lookup(input_string)
-      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
       self.assertEquals(vocab_size + oov_buckets, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testInt32IdTableWithHashBuckets(self):
     vocab_file = self._createVocabFile("feat_to_id_2.txt", ("42", "1", "-1000"))
     with self.cached_session():
@@ -1156,14 +1255,15 @@ class IdTableWithHashBucketsTest(test.TestCase):
           oov_buckets,
           key_dtype=dtypes.int32)
 
-      table.init.run()
+      table.initializer.run()
 
       values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int32)
 
       out = table.lookup(values)
-      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
       self.assertEquals(vocab_size + oov_buckets, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testInt64IdTableWithHashBuckets(self):
     vocab_file = self._createVocabFile("feat_to_id_3.txt", ("42", "1", "-1000"))
     with self.cached_session():
@@ -1176,14 +1276,15 @@ class IdTableWithHashBucketsTest(test.TestCase):
                   vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
               default_value), oov_buckets)
 
-      table.init.run()
+      table.initializer.run()
 
       values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64)
 
       out = table.lookup(values)
-      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
       self.assertEquals(vocab_size + oov_buckets, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testStringIdTableWithOnlyHashBucket(self):
     with self.cached_session():
       oov_buckets = 5
@@ -1191,7 +1292,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       # Set a table that only uses hash buckets, for each input value returns
       # an id calculated by fingerprint("input") mod oov_buckets.
       table = lookup_ops.IdTableWithHashBuckets(None, oov_buckets)
-      table.init.run()
+      table.initializer.run()
 
       values = constant_op.constant(("brain", "salad", "surgery"))
 
@@ -1202,9 +1303,10 @@ class IdTableWithHashBucketsTest(test.TestCase):
               1,  # fingerprint("salad") mod 5.
               4  # fingerprint("surgery") mod 5
           ],
-          out.eval())
+          self.evaluate(out))
       self.assertEquals(oov_buckets, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testInt32IdTableWithOnlyHashBucket(self):
     with self.cached_session():
       oov_buckets = 5
@@ -1213,7 +1315,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       # an id calculated by fingerprint("input") mod oov_buckets.
       table = lookup_ops.IdTableWithHashBuckets(
           None, oov_buckets, key_dtype=dtypes.int32)
-      table.init.run()
+      table.initializer.run()
 
       input_string = constant_op.constant([42, 1, -1000], dtype=dtypes.int32)
 
@@ -1224,7 +1326,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               4,  # fingerprint("1") mod 5.
               2  # fingerprint("-1000") mod 5
           ],
-          out.eval())
+          self.evaluate(out))
       self.assertEquals(oov_buckets, table.size().eval())
 
   def testFloat64IdTableWithOnlyHashBucket(self):
@@ -1239,6 +1341,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
         lookup_ops.IdTableWithHashBuckets(
             None, num_oov_buckets=5, key_dtype=dtypes.bool)
 
+  @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsWithMultipleInitializers(self):
     vocab_file = self._createVocabFile("feat_to_id_4.txt")
     with self.cached_session() as sess:
@@ -1269,7 +1372,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1 = table1.lookup(input_string)
       out2 = table2.lookup(input_string)
 
-      out1, out2 = sess.run([out1, out2])
+      out1, out2 = self.evaluate([out1, out2])
       self.assertAllEqual([5, 0, 1, 2, 5], out1)
       self.assertAllEqual([5, 0, 1, 2, 3], out2)
       self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
@@ -1279,6 +1382,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
           "table2_Lookup/hash_bucket": "StringToHashBucketStrong",
       }, sess.graph)
 
+  @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsInitializationAcrossSessions(self):
     vocab_file = self._createVocabFile("feat_to_id_5.txt")
     shared_name = "across-sessions"
@@ -1293,14 +1397,14 @@ class IdTableWithHashBucketsTest(test.TestCase):
               default_value,
               shared_name=shared_name), oov_buckets)
 
-      table1.init.run()
+      table1.initializer.run()
 
       input_string_1 = constant_op.constant(
           ["brain", "salad", "surgery", "UNK"])
 
       out1 = table1.lookup(input_string_1)
 
-      self.assertAllEqual([0, 1, 2, 3], out1.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out1))
       self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
 
     with self.cached_session():
@@ -1309,7 +1413,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       oov_buckets = 1
 
       # Underlying lookup table already initialized in previous session.
-      # No need to call table2.init.run()
+      # No need to call table2.initializer.run()
       table2 = lookup_ops.IdTableWithHashBuckets(
           lookup_ops.HashTable(
               lookup_ops.TextFileIdTableInitializer(
@@ -1321,9 +1425,10 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out2 = table2.lookup(input_string_2)
 
-      self.assertAllEqual([3, 1, 3], out2.eval())
+      self.assertAllEqual([3, 1, 3], self.evaluate(out2))
       self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
 
+  @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsWithMultipleInitializersDifferentDefault(self):
     vocab_file = self._createVocabFile("feat_to_id_6.txt")
     with self.cached_session() as sess:
@@ -1352,12 +1457,13 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1 = table1.lookup(input_string_1)
       out2 = table2.lookup(input_string_2)
 
-      out1, out2 = sess.run([out1, out2])
+      out1, out2 = self.evaluate([out1, out2])
       self.assertAllEqual([0, 1, 2, -1], out1)
       self.assertAllEqual([-2, 1, -2], out2)
       self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
       self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
 
+  @test_util.run_deprecated_v1
   def testSparseTensor(self):
     vocab_file = self._createVocabFile("feat_to_id_7.txt")
     input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
@@ -1373,7 +1479,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
           lookup_ops.HashTable(
               lookup_ops.TextFileIdTableInitializer(vocab_file, vocab_size=3),
               -1), 1)
-      table.init.run()
+      table.initializer.run()
 
       sp_ids = table.lookup(sp_features)
 
@@ -1386,6 +1492,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
       self.assertAllEqual(input_shape, sp_ids_shape)
 
+  @test_util.run_deprecated_v1
   def testInt32SparseTensor(self):
     input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
     input_shape = [4, 4]
@@ -1401,7 +1508,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
                   (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64), -1),
           1,
           key_dtype=dtypes.int32)
-      table.init.run()
+      table.initializer.run()
 
       sp_ids = table.lookup(sp_features)
 
@@ -1414,6 +1521,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
       self.assertAllEqual(input_shape, sp_ids_shape)
 
+  @test_util.run_deprecated_v1
   def testInt64SparseTensor(self):
     input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
     input_shape = [4, 4]
@@ -1429,7 +1537,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
                   (42, 1, -1000), (0, 1, 2), dtypes.int64, dtypes.int64), -1),
           1,
           key_dtype=dtypes.int64)
-      table.init.run()
+      table.initializer.run()
 
       sp_ids = table.lookup(sp_features)
 
@@ -1487,7 +1595,8 @@ class IdTableWithHashBucketsTest(test.TestCase):
   def testIdTableWithHashBucketsNoInnerTable(self):
     with self.cached_session():
       table = lookup_ops.IdTableWithHashBuckets(None, num_oov_buckets=1)
-      self.assertIsNone(table.table_ref)
+      self.assertIsNone(table.resource_handle)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index 3ce0b74263137c01f3d3d3574137ecedabf87594..4584a27e6227bf53e4de5f74730cc9b737214cd5 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -34,25 +33,11 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
-from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.ops.losses import util
 from tensorflow.python.platform import test
 from tensorflow.python.training import momentum as momentum_lib
 
 
-safe_div = losses_impl._safe_div  # pylint: disable=protected-access
-
-
-class SafeDivTest(test.TestCase):
-
-  def testEager(self):
-    with context.eager_mode():
-      self.assertAllEqual(safe_div(constant_op.constant(1.0),
-                                   constant_op.constant(0.0)), 0.0)
-      self.assertAllEqual(safe_div(constant_op.constant(1.0),
-                                   0.0), 0.0)
-
-
 class AbsoluteDifferenceLossTest(test.TestCase):
 
   def setUp(self):
@@ -66,58 +51,62 @@ class AbsoluteDifferenceLossTest(test.TestCase):
         losses.absolute_difference(
             self._predictions, self._predictions, weights=None)
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.absolute_difference(self._predictions, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.absolute_difference(self._labels, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(5.5, loss.eval(), 3)
+      self.assertAlmostEqual(5.5, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(5.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(5.5 * weights, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions,
                                       constant_op.constant(weights))
     with self.cached_session():
-      self.assertAlmostEqual(5.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(5.5 * weights, self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
     weights = constant_op.constant((1.2, 0.0), shape=(2, 1))
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(5.6, loss.eval(), 3)
+      self.assertAlmostEqual(5.6, self.evaluate(loss), 3)
 
   def testNonZeroLossWithTwoDimBatchSpecificWeights(self):
     weights = constant_op.constant([1.2, 0.0], shape=[2, 1])
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(5.6, loss.eval(), 3)
+      self.assertAlmostEqual(5.6, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeights(self):
     weights = constant_op.constant([3, 6, 5, 0, 4, 2], shape=[2, 3])
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(16.6, loss.eval(), 3)
+      self.assertAlmostEqual(16.6, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeightsMostZero(self):
     weights = constant_op.constant([0, 0, 0, 0, 0, 2], shape=[2, 3])
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(6.0, loss.eval(), 3)
+      self.assertAlmostEqual(6.0, self.evaluate(loss), 3)
 
   def testLossWithSampleSpecificWeightsAllZero(self):
     weights = array_ops.zeros((2, 3))
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def testEagerNoMemoryLeaked(self):
@@ -138,6 +127,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.softmax_cross_entropy(labels, logits, weights=None)
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     with self.cached_session():
       logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -147,6 +137,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals('softmax_cross_entropy_loss/value', loss.op.name)
       self.assertAlmostEqual(loss.eval(), 0.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrong(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -157,6 +148,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -164,8 +156,9 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = 2.3
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -174,7 +167,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits,
                                           constant_op.constant(weights))
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -183,7 +176,8 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant((1.2, 3.4, 5.6))
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0,
+                             self.evaluate(loss), 3)
 
   def testAllWrongAllWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -192,7 +186,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([0, 0, 0], shape=[3])
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testSomeWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -201,7 +195,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([1.2, 0, 0], shape=[3])
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(12.0, loss.eval(), 3)
+      self.assertAlmostEqual(12.0, self.evaluate(loss), 3)
 
   def testSoftmaxWithMeasurementSpecificWeightsRaisesException(self):
     with self.cached_session():
@@ -214,6 +208,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.softmax_cross_entropy(labels, logits, weights=weights).eval()
 
+  @test_util.run_deprecated_v1
   def testSoftmaxLabelSmoothing(self):
     with self.cached_session():
       # Softmax Cross Entropy Loss is:
@@ -246,6 +241,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.sparse_softmax_cross_entropy(labels, logits, weights=None)
 
+  @test_util.run_deprecated_v1
   def testAllCorrectInt32Labels(self):
     with self.cached_session():
       logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -262,6 +258,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     labels = constant_op.constant([[0], [1], [2]], dtype=dtypes.int32)
     losses.sparse_softmax_cross_entropy(labels, logits)
 
+  @test_util.run_deprecated_v1
   def testAllCorrectInt64Labels(self):
     with self.cached_session():
       logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -271,6 +268,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 0.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllCorrectNonColumnLabels(self):
     with self.cached_session():
       logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -280,6 +278,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 0.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongInt32Labels(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -290,6 +289,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongInt64Labels(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -300,6 +300,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongNonColumnLabels(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -310,6 +311,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -317,8 +319,9 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = 2.3
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -327,7 +330,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits,
                                                  constant_op.constant(weights))
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWith1DTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -337,8 +340,9 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(
           labels, logits, constant_op.constant((weights,)))
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithPlaceholderForWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0],
                                    [0.0, 10.0, 0.0],
@@ -351,6 +355,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
                           feed_dict={weights: ((1.2,), (3.4,), (5.6,))})
       self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss_val, 3)
 
+  @test_util.run_deprecated_v1
   def testUnknownShapePlaceholderForLogitsLabelsButScalarWeights(self):
     logits = array_ops.placeholder(dtypes.float32)
     labels = array_ops.placeholder(dtypes.int32)
@@ -366,6 +371,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
                           })
       self.assertAlmostEqual((1.0 + 1.0 + 1.0) * 10.0 / 3.0, loss_val, 3)
 
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithPlaceholderForLogitsLabelsAndWeights(self):
     logits = array_ops.placeholder(dtypes.float32, shape=(None, 3))
     labels = array_ops.placeholder(dtypes.int32, shape=(None, 1))
@@ -389,7 +395,8 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([1.2, 3.4, 5.6], shape=(3, 1))
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithColumnWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -398,7 +405,8 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([[1.2], [3.4], [5.6]])
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0,
+                             self.evaluate(loss), 3)
 
   def testAllWrongAllWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -407,7 +415,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([0, 0, 0], shape=(3, 1))
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testSomeWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -416,8 +424,9 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([1.2, 0, 0], shape=(3, 1))
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(12.0, loss.eval(), 3)
+      self.assertAlmostEqual(12.0, self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testMeasurementSpecificWeightsRaisesException(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0],
@@ -456,6 +465,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
         losses.sparse_softmax_cross_entropy(
             labels, logits, weights=weights).eval()
 
+  @test_util.run_deprecated_v1
   def testInconsistentWeightShapeRaisesException(self):
     """The weight tensor has incorrect shape."""
     with self.cached_session():
@@ -470,6 +480,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
         losses.sparse_softmax_cross_entropy(
             labels, logits, weights=weights).eval()
 
+  @test_util.run_deprecated_v1
   def testInconsistentLabelShapeRaisesException(self):
     """The label tensor has incorrect shape."""
     with self.cached_session():
@@ -487,6 +498,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
 
 class SigmoidCrossEntropyLossTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAllCorrectSigmoid(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0],
@@ -496,8 +508,9 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       loss = losses.sigmoid_cross_entropy(labels, logits)
       self.assertEquals(logits.dtype, loss.dtype)
       self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testLossWithSingleDimPlaceholderForLogitsAndWeights1(self):
     logits = array_ops.placeholder(dtypes.float32, shape=(None, 1))
     labels = array_ops.placeholder(dtypes.float32, shape=(None, 1))
@@ -514,6 +527,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
                       })
       self.assertAlmostEqual(0.313, loss, 3)
 
+  @test_util.run_deprecated_v1
   def testLossWithSingleDimPlaceholderForLogitsAndWeights2(self):
     logits = array_ops.placeholder(dtypes.float32, shape=(None, 2))
     labels = array_ops.placeholder(dtypes.float32, shape=(None, 2))
@@ -530,6 +544,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
                       })
       self.assertAlmostEqual(0.313, loss, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongSigmoid(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0],
@@ -541,6 +556,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
       self.assertAlmostEqual(loss.eval(), 600.0 / 9.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongSigmoidWithMeasurementSpecificWeights(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0],
@@ -551,8 +567,9 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       loss = losses.sigmoid_cross_entropy(labels, logits, weights)
       self.assertEquals(logits.dtype, loss.dtype)
       self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
-      self.assertAlmostEqual(1700.0 / 7.0, loss.eval(), 3)
+      self.assertAlmostEqual(1700.0 / 7.0, self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testMultiCorrectSigmoid(self):
     logits = constant_op.constant([[100.0, -100.0, 100.0],
                                    [100.0, 100.0, -100.0],
@@ -563,7 +580,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
 
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testSigmoidFloat64(self):
     logits = constant_op.constant((
@@ -578,7 +595,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     self.assertEquals(logits.dtype, loss.dtype)
 
     with self.cached_session():
-      self.assertAlmostEqual(44.444, loss.eval(), 3)
+      self.assertAlmostEqual(44.444, self.evaluate(loss), 3)
 
   def testSigmoidNoReduction(self):
     logits = constant_op.constant((
@@ -591,12 +608,10 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     self.assertEquals(logits.dtype, loss.dtype)
 
     with self.cached_session():
-      self.assertAllClose((
-          (0., 0., 0.),
-          (0., 100., 100.),
-          (100., 0., 100.)
-      ), loss.eval(), 3)
+      self.assertAllClose(((0., 0., 0.), (0., 100., 100.), (100., 0., 100.)),
+                          self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testSigmoidLabelSmoothingCorrect(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0]])
@@ -620,6 +635,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       expected_value = (100.0 + 50.0 * label_smoothing) / 3.0
       self.assertAlmostEqual(loss.eval(), expected_value, 3)
 
+  @test_util.run_deprecated_v1
   def testSigmoidLabelSmoothingEqualsSoftmaxTwoLabel(self):
     with self.cached_session():
       label_smoothing = 0.1
@@ -634,7 +650,8 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       softmax_labels = constant_op.constant([[0, 1], [1, 0], [0, 1]])
       softmax_loss = losses.softmax_cross_entropy(
           softmax_labels, softmax_logits, label_smoothing=label_smoothing)
-      self.assertAlmostEqual(sigmoid_loss.eval(), softmax_loss.eval(), 3)
+      self.assertAlmostEqual(sigmoid_loss.eval(), self.evaluate(softmax_loss),
+                             3)
 
 
 class LogLossTest(test.TestCase):
@@ -660,11 +677,13 @@ class LogLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.log_loss(self._labels, self._labels, weights=None)
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.log_loss(self._labels, self._labels)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeightWithPlaceholder(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._np_labels.shape)
@@ -673,27 +692,31 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(
           0.0, loss.eval(feed_dict={tf_predictions: self._np_labels}), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.log_loss(self._labels, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(-np.sum(self._expected_losses) / 6.0,
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.log_loss(self._labels, self._predictions,
                            constant_op.constant(weights))
     with self.cached_session():
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeightAndPlaceholder(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._np_predictions.shape)
@@ -705,6 +728,7 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
                              loss, 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeightAndPlaceholderWithRankOnly(self):
     tf_predictions = array_ops.placeholder(dtypes.float32, shape=[None, None])
     weights = 2.3
@@ -722,7 +746,8 @@ class LogLossTest(test.TestCase):
         np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 6.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 6.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeightsSomeZero(self):
     weights = constant_op.constant((1.2, 0), shape=(2, 1))
@@ -731,7 +756,8 @@ class LogLossTest(test.TestCase):
                                       (2, 3)))
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithTwoDimBatchSpecificWeightsSomeZero(self):
     weights = constant_op.constant([1.2, 0], shape=[2, 1])
@@ -740,7 +766,8 @@ class LogLossTest(test.TestCase):
                                       (2, 3)))
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0,
+                             self.evaluate(loss), 3)
 
   def testWeightsWithSameNumDimsButWrongShapeThrowsException(self):
     weights = constant_op.constant(np.random.normal(size=(2, 4)), shape=[2, 4])
@@ -758,8 +785,10 @@ class LogLossTest(test.TestCase):
         constant_op.constant(
             weights, shape=(2, 3)))
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 5.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 5.0,
+                             self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithMeasurementSpecificWeightsWithPlaceholder(self):
     weights = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3))
     expected_losses = np.multiply(self._expected_losses, weights)
@@ -785,8 +814,9 @@ class LogLossTest(test.TestCase):
         constant_op.constant(
             weights, shape=(2, 3)))
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses), loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses), self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithSampleSpecificWeightsMostZeroWithPlaceholder(self):
     weights = np.array([0, 0, 0, 0, 0, 2]).reshape((2, 3))
     expected_losses = np.multiply(self._expected_losses, weights)
@@ -803,7 +833,7 @@ class LogLossTest(test.TestCase):
     tf_weights = array_ops.zeros(shape=(2, 3))
     loss = losses.log_loss(self._labels, self._predictions, tf_weights)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
 
 class HingeLossTest(test.TestCase):
@@ -815,6 +845,7 @@ class HingeLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         _ = losses.hinge_loss(labels, logits).eval()
 
+  @test_util.run_deprecated_v1
   def testAllOutsideMargin(self):
     with self.cached_session():
       logits = constant_op.constant([1.2, -1.4, -1.0, 2.1])
@@ -822,6 +853,7 @@ class HingeLossTest(test.TestCase):
       loss = losses.hinge_loss(labels, logits)
       self.assertAllClose(loss.eval(), 0.0, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testSomeInsideMargin(self):
     with self.cached_session():
       logits = constant_op.constant([[-0.7], [-1.4], [1.4], [0.6]])
@@ -831,6 +863,7 @@ class HingeLossTest(test.TestCase):
       # the margin so they incur some (small) loss.
       self.assertAllClose(loss.eval(), 0.175, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testSomeMisclassified(self):
     with self.cached_session():
       logits = constant_op.constant([[[1.2], [0.4], [-1.0], [-1.1]]])
@@ -850,6 +883,7 @@ class HuberLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         _ = losses.huber_loss(labels, predictions).eval()
 
+  @test_util.run_deprecated_v1
   def testAllQuadratic(self):
     with self.cached_session():
       predictions = constant_op.constant([1.5, -1.4, -1.0, 0.0])
@@ -858,6 +892,7 @@ class HuberLossTest(test.TestCase):
       self.assertAllClose(loss.eval(),
                           0.5 * (0.25 + 0.16 + 1.0 + 0.25) / 4., atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testAllLinear(self):
     with self.cached_session():
       predictions = constant_op.constant([1.5, -1.4, -1.0, 0.0])
@@ -866,6 +901,7 @@ class HuberLossTest(test.TestCase):
       self.assertAllClose(loss.eval(),
                           (1.5 + 2.4 + 1.0 + 1.5) / 4. - 0.5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testMixedQuadraticLinear(self):
     with self.cached_session():
       predictions = constant_op.constant([[1.5, -1.4, -1.0, 0.0],
@@ -885,7 +921,7 @@ class HuberLossTest(test.TestCase):
       labels = constant_op.constant([1.0, -1.0, 0.0, 0.5])
       expected = 0.5 * np.array([0.5**2, 0.4**2, 0.5**2, 0.5**2]).mean()
       loss = losses.huber_loss(labels, predictions, delta=delta)
-      self.assertAllClose(expected, loss.eval(), atol=1e-5)
+      self.assertAllClose(expected, self.evaluate(loss), atol=1e-5)
 
   def testAllLinearDelta(self):
     delta = 0.5
@@ -895,7 +931,7 @@ class HuberLossTest(test.TestCase):
     expected -= 0.5 * delta**2
     loss = losses.huber_loss(labels, predictions, delta=delta)
     with self.cached_session():
-      self.assertAllClose(expected, loss.eval(), atol=1e-5)
+      self.assertAllClose(expected, self.evaluate(loss), atol=1e-5)
 
 
 class MeanSquaredErrorTest(test.TestCase):
@@ -911,6 +947,7 @@ class MeanSquaredErrorTest(test.TestCase):
         losses.mean_squared_error(
             self._predictions, self._predictions, weights=None)
 
+  @test_util.run_deprecated_v1
   def testScalar(self):
     with self.cached_session():
       self.assertEqual(
@@ -918,58 +955,62 @@ class MeanSquaredErrorTest(test.TestCase):
           losses.mean_squared_error(predictions=constant_op.constant(0),
                                     labels=constant_op.constant(0)).eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.mean_squared_error(self._predictions, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.mean_squared_error(self._labels, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(49.5, loss.eval(), 3)
+      self.assertAlmostEqual(49.5, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(49.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(49.5 * weights, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions,
                                      constant_op.constant(weights))
     with self.cached_session():
-      self.assertAlmostEqual(49.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(49.5 * weights, self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
     weights = constant_op.constant([1.2, 3.4], shape=(2, 1))
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(767.8 / 6.0, loss.eval(), 3)
+      self.assertAlmostEqual(767.8 / 6.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithTwoDimBatchSpecificWeights(self):
     weights = constant_op.constant([1.2, 3.4], shape=[2, 1])
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(767.8 / 6.0, loss.eval(), 3)
+      self.assertAlmostEqual(767.8 / 6.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeights(self):
     weights = constant_op.constant([3, 6, 5, 0, 4, 2], shape=[2, 3])
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(587 / 5.0, loss.eval(), 3)
+      self.assertAlmostEqual(587 / 5.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeightsMostZero(self):
     weights = constant_op.constant([0, 0, 0, 0, 0, 2], shape=[2, 3])
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(18.0, loss.eval(), 3)
+      self.assertAlmostEqual(18.0, self.evaluate(loss), 3)
 
   def testLossWithSampleSpecificWeightsAllZero(self):
     weights = array_ops.zeros((2, 3))
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
 
 class MeanPairwiseSquaredErrorTest(test.TestCase):
@@ -1006,7 +1047,8 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     with self.cached_session():
       static_inputs_op = losses.mean_pairwise_squared_error(
           predictions=predictions, labels=labels, weights=weights)
-      self.assertAlmostEqual(expected_loss, static_inputs_op.eval(), places=3)
+      self.assertAlmostEqual(
+          expected_loss, self.evaluate(static_inputs_op), places=3)
 
       predictions_placeholder = array_ops.placeholder(
           dtypes.float32, shape=np.asarray(predictions.shape))
@@ -1026,10 +1068,12 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
       self.assertAlmostEqual(
           expected_loss, dynamic_inputs_op.eval(feed_dict=feed_dict), places=3)
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     self._test_valid_weights(
         self._labels, self._labels, expected_loss=0.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     self._test_valid_weights(
         self._labels, self._predictions,
@@ -1055,11 +1099,12 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
       init_op = variables.global_variables_initializer()
 
       with self.cached_session() as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         for grad, _ in gradients_to_variables:
-          np_grad = sess.run(grad)
+          np_grad = self.evaluate(grad)
           self.assertFalse(np.isnan(np_grad).any())
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weight = 2.3
     self._test_valid_weights(
@@ -1067,6 +1112,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         expected_loss=weight * np.sum(self._expected_losses),
         weights=weight)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.mean_pairwise_squared_error(
@@ -1075,12 +1121,14 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         weights=constant_op.constant(weights))
     with self.cached_session():
       self.assertAlmostEqual(weights * np.sum(self._expected_losses),
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithScalarZeroWeight(self):
     self._test_valid_weights(
         self._labels, self._predictions, expected_loss=0.0, weights=0.0)
 
+  @test_util.run_deprecated_v1
   def test3d(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1092,6 +1140,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     self._test_valid_weights(labels, predictions, expected_loss=137.5)
 
+  @test_util.run_deprecated_v1
   def test3dWeightedScalar(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1130,6 +1179,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
             weights_placeholder: weights,
         })
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalid3dWeighted2x0(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1142,6 +1192,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     self._test_invalid_weights(
         labels, predictions, weights=np.asarray((1.2, 3.4)))
 
+  @test_util.run_deprecated_v1
   def test3dWeighted2x3x3(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1158,6 +1209,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         expected_loss=9 * 137.5,
         weights=np.ones((2, 3, 3)))
 
+  @test_util.run_deprecated_v1
   def testLossWithAllZeroBatchSpecificWeights(self):
     self._test_valid_weights(
         self._labels, self._predictions, expected_loss=0.0,
@@ -1230,7 +1282,7 @@ class CosineDistanceLossTest(test.TestCase):
         labels=constant_op.constant(self._labels),
         dim=2)
     with self.cached_session():
-      self.assertAlmostEqual(0, loss.eval(), 5)
+      self.assertAlmostEqual(0, self.evaluate(loss), 5)
 
   def testPartiallyCorrectWithIntegerValues(self):
     loss = losses.cosine_distance(
@@ -1238,7 +1290,7 @@ class CosineDistanceLossTest(test.TestCase):
         labels=constant_op.constant(self._labels),
         dim=2)
     with self.cached_session():
-      self.assertAlmostEqual(1, loss.eval(), 5)
+      self.assertAlmostEqual(1, self.evaluate(loss), 5)
 
   def testPartiallyCorrectFloatingPointValues(self):
     predictions = np.matrix(
@@ -1256,7 +1308,7 @@ class CosineDistanceLossTest(test.TestCase):
     loss = losses.cosine_distance(tf_labels, tf_preds, dim=2)
 
     with self.cached_session():
-      self.assertAlmostEqual(1.0, loss.eval(), 5)
+      self.assertAlmostEqual(1.0, self.evaluate(loss), 5)
 
   def testSampleSpecificWeights(self):
     loss = losses.cosine_distance(
@@ -1265,7 +1317,7 @@ class CosineDistanceLossTest(test.TestCase):
         dim=2,
         weights=np.asarray((1, 0, 0)).reshape((3, 1, 1)))
     with self.cached_session():
-      self.assertEqual(1.0, loss.eval())
+      self.assertEqual(1.0, self.evaluate(loss))
 
   def testMeasurementSpecificWeights(self):
     loss = losses.cosine_distance(
@@ -1275,8 +1327,9 @@ class CosineDistanceLossTest(test.TestCase):
         weights=constant_op.constant(
             [1, 0, 0, 1, 1, 1], shape=(3, 2, 1)))
     with self.cached_session():
-      self.assertEqual(3.0 / 4.0, loss.eval())
+      self.assertEqual(3.0 / 4.0, self.evaluate(loss))
 
+  @test_util.run_deprecated_v1
   def testMeasurementSpecificWeightsWithPlaceholderWithShape(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._labels.shape)
@@ -1297,7 +1350,7 @@ class CosineDistanceLossTest(test.TestCase):
         dim=2,
         weights=array_ops.zeros((3, 1, 1)))
     with self.cached_session():
-      self.assertEqual(0, loss.eval())
+      self.assertEqual(0, self.evaluate(loss))
 
   def testZeroLossWhenAllMeasurementSpecificWeightsAreZero(self):
     loss = losses.cosine_distance(
@@ -1306,7 +1359,7 @@ class CosineDistanceLossTest(test.TestCase):
         dim=2,
         weights=array_ops.zeros((3, 2, 1)))
     with self.cached_session():
-      self.assertEqual(0, loss.eval())
+      self.assertEqual(0, self.evaluate(loss))
 
 
 class AddLossTest(test.TestCase):
@@ -1363,18 +1416,19 @@ class ComputeWeightedLossTest(test.TestCase):
                 raw_losses, weights=np.ones(self._shape), reduction=reduction)
         )
         self.assertEqual(9, len(util.get_losses()))
-        with self.test_session(g):
+        with self.session(g):
           for unweighted_loss in unweighted_losses:
             if reduction == losses.Reduction.NONE:
-              self.assertAllClose(self._raw_losses, unweighted_loss.eval())
+              self.assertAllClose(self._raw_losses,
+                                  self.evaluate(unweighted_loss))
             elif reduction == losses.Reduction.SUM:
               self.assertAllClose(
-                  np.sum(self._raw_losses), unweighted_loss.eval())
+                  np.sum(self._raw_losses), self.evaluate(unweighted_loss))
             else:
               # reduction one of MEAN, SUM_OVER_NONZERO_WEIGHTS,
               # SUM_BY_NONZERO_WEIGHTS or SUM_OVER_BATCH_SIZE.
               self.assertAllClose(
-                  np.mean(self._raw_losses), unweighted_loss.eval())
+                  np.mean(self._raw_losses), self.evaluate(unweighted_loss))
 
   def testUnweightedFromPlaceholder(self):
     for reduction in losses.Reduction.all():
@@ -1390,7 +1444,7 @@ class ComputeWeightedLossTest(test.TestCase):
                 raw_losses, weights=np.ones((1, 1, 4)), reduction=reduction),
         )
         self.assertEqual(3, len(util.get_losses()))
-        with self.test_session(g):
+        with self.session(g):
           for unweighted_loss in unweighted_losses:
             if reduction == losses.Reduction.NONE:
               self.assertAllClose(
@@ -1413,7 +1467,7 @@ class ComputeWeightedLossTest(test.TestCase):
       self.assertEqual(1, len(util.get_losses()))
       with self.cached_session():
         self.assertAllClose(
-            np.mean(weight * self._raw_losses), weighted_loss.eval())
+            np.mean(weight * self._raw_losses), self.evaluate(weighted_loss))
 
   def _test_invalid_weights(self, weights):
     with ops.Graph().as_default():
@@ -1481,28 +1535,26 @@ class ComputeWeightedLossTest(test.TestCase):
         weighted_loss = losses.compute_weighted_loss(
             self._raw_losses, weights=weights, reduction=reduction)
         self.assertEqual(1, len(util.get_losses()))
-        with self.test_session(g):
+        with self.session(g):
           weighted_losses = weights * self._raw_losses
           weighted_sum = np.sum(weighted_losses)
           if reduction == losses.Reduction.NONE:
-            self.assertAllClose(weighted_losses, weighted_loss.eval())
+            self.assertAllClose(weighted_losses, self.evaluate(weighted_loss))
           elif reduction == losses.Reduction.SUM:
-            self.assertAllClose(weighted_sum, weighted_loss.eval())
+            self.assertAllClose(weighted_sum, self.evaluate(weighted_loss))
           else:
             broadcast_weights = weights * np.ones_like(self._raw_losses)
             if reduction == losses.Reduction.MEAN:
-              self.assertAllClose(
-                  weighted_sum / np.sum(broadcast_weights),
-                  weighted_loss.eval())
+              self.assertAllClose(weighted_sum / np.sum(broadcast_weights),
+                                  self.evaluate(weighted_loss))
             elif (reduction == losses.Reduction.SUM_OVER_NONZERO_WEIGHTS or
                   reduction == losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
               self.assertAllClose(
                   weighted_sum / np.count_nonzero(broadcast_weights),
-                  weighted_loss.eval())
+                  self.evaluate(weighted_loss))
             elif reduction == losses.Reduction.SUM_OVER_BATCH_SIZE:
-              self.assertAllClose(
-                  weighted_sum / self._raw_losses.size,
-                  weighted_loss.eval())
+              self.assertAllClose(weighted_sum / self._raw_losses.size,
+                                  self.evaluate(weighted_loss))
 
   def test1x1x1Weight(self):
     self._test_valid_weights((((17.0,),),))
diff --git a/tensorflow/python/kernel_tests/lrn_op_test.py b/tensorflow/python/kernel_tests/lrn_op_test.py
index 9eba059549302c61bd8c5b0b83c811da449c7ba8..fbe628c3944f80b10012cb10f6c43336a5380019 100644
--- a/tensorflow/python/kernel_tests/lrn_op_test.py
+++ b/tensorflow/python/kernel_tests/lrn_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -54,7 +55,7 @@ class LRNOpTest(test.TestCase):
     return output
 
   def _RunAndVerify(self, dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # random shape
       shape = np.random.randint(1, 16, size=4)
       # Make depth at least 2 to make it meaningful
@@ -92,6 +93,7 @@ class LRNOpTest(test.TestCase):
       self.assertTrue(err < 1e-2)
     self.assertShapeEqual(expected, lrn_t)
 
+  @test_util.run_deprecated_v1
   def testCompute(self):
     for _ in range(2):
       self._RunAndVerify(dtypes.float32)
@@ -99,8 +101,9 @@ class LRNOpTest(test.TestCase):
       if not test.is_gpu_available():
         self._RunAndVerify(dtypes.float16)
 
+  @test_util.run_deprecated_v1
   def testGradientsZeroInput(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = [4, 4, 4, 4]
       p = array_ops.placeholder(dtypes.float32, shape=shape)
       inp_array = np.zeros(shape).astype("f")
@@ -113,7 +116,7 @@ class LRNOpTest(test.TestCase):
     self.assertShapeEqual(expected, grad)
 
   def _RunAndVerifyGradients(self, dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # random shape
       shape = np.random.randint(1, 5, size=4)
       # Make depth at least 2 to make it meaningful
@@ -147,6 +150,7 @@ class LRNOpTest(test.TestCase):
     else:
       self.assertLess(err, 1.0)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     for _ in range(2):
       self._RunAndVerifyGradients(dtypes.float32)
diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..06deb0e1c82175c33b028e017a5f54cc2549253b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/lu_op_test.py
@@ -0,0 +1,288 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.Lu."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
+from tensorflow.python.platform import test
+
+
+class LuOpTest(test.TestCase):
+
+  @property
+  def float_types(self):
+    return set((np.float64, np.float32, np.complex64, np.complex128))
+
+  def _verifyLuBase(self, x, lower, upper, perm, verification,
+                    output_idx_type):
+    lower_np, upper_np, perm_np, verification_np = self.evaluate(
+        [lower, upper, perm, verification])
+
+    self.assertAllClose(x, verification_np)
+    self.assertShapeEqual(x, lower)
+    self.assertShapeEqual(x, upper)
+
+    self.assertAllEqual(x.shape[:-1], perm.shape.as_list())
+
+    # Check dtypes are as expected.
+    self.assertEqual(x.dtype, lower_np.dtype)
+    self.assertEqual(x.dtype, upper_np.dtype)
+    self.assertEqual(output_idx_type.as_numpy_dtype, perm_np.dtype)
+
+    # Check that the permutation is valid.
+    if perm_np.shape[-1] > 0:
+      perm_reshaped = np.reshape(perm_np, (-1, perm_np.shape[-1]))
+      for perm_vector in perm_reshaped:
+        self.assertAllClose(np.arange(len(perm_vector)), np.sort(perm_vector))
+
+  def _verifyLu(self, x, output_idx_type=dtypes.int64):
+    # Verify that Px = LU.
+    with test_util.use_gpu():
+
+      lu, perm = linalg_ops.lu(x, output_idx_type=output_idx_type)
+
+      # Prepare the lower factor of shape num_rows x num_rows
+      lu_shape = np.array(lu.shape.as_list())
+      batch_shape = lu_shape[:-2]
+      num_rows = lu_shape[-2]
+      num_cols = lu_shape[-1]
+
+      lower = array_ops.matrix_band_part(lu, -1, 0)
+
+      if num_rows > num_cols:
+        eye = linalg_ops.eye(
+            num_rows, batch_shape=batch_shape, dtype=lower.dtype)
+        lower = array_ops.concat([lower, eye[..., num_cols:]], axis=-1)
+      elif num_rows < num_cols:
+        lower = lower[..., :num_rows]
+
+      # Fill the diagonal with ones.
+      ones_diag = array_ops.ones(
+          np.append(batch_shape, num_rows), dtype=lower.dtype)
+      lower = array_ops.matrix_set_diag(lower, ones_diag)
+
+      # Prepare the upper factor.
+      upper = array_ops.matrix_band_part(lu, 0, -1)
+
+      verification = math_ops.matmul(lower, upper)
+
+      # Permute the rows of product of the Cholesky factors.
+      if num_rows > 0:
+        # Reshape the product of the triangular factors and permutation indices
+        # to a single batch dimension. This makes it easy to apply
+        # invert_permutation and gather_nd ops.
+        perm_reshaped = array_ops.reshape(perm, [-1, num_rows])
+        verification_reshaped = array_ops.reshape(verification,
+                                                  [-1, num_rows, num_cols])
+        # Invert the permutation in each batch.
+        inv_perm_reshaped = functional_ops.map_fn(array_ops.invert_permutation,
+                                                  perm_reshaped)
+        batch_size = perm_reshaped.shape.as_list()[0]
+        # Prepare the batch indices with the same shape as the permutation.
+        # The corresponding batch index is paired with each of the `num_rows`
+        # permutation indices.
+        batch_indices = math_ops.cast(
+            array_ops.broadcast_to(
+                math_ops.range(batch_size)[:, None], perm_reshaped.shape),
+            dtype=output_idx_type)
+        permuted_verification_reshaped = array_ops.gather_nd(
+            verification_reshaped,
+            array_ops.stack([batch_indices, inv_perm_reshaped], axis=-1))
+
+        # Reshape the verification matrix back to the original shape.
+        verification = array_ops.reshape(permuted_verification_reshaped,
+                                         lu_shape)
+
+      self._verifyLuBase(x, lower, upper, perm, verification,
+                         output_idx_type)
+
+  def testBasic(self):
+    data = np.array([[4., -1., 2.], [-1., 6., 0], [10., 0., 5.]])
+
+    for dtype in (np.float32, np.float64):
+      for output_idx_type in (dtypes.int32, dtypes.int64):
+        self._verifyLu(data.astype(dtype), output_idx_type=output_idx_type)
+
+    for dtype in (np.complex64, np.complex128):
+      for output_idx_type in (dtypes.int32, dtypes.int64):
+        complex_data = np.tril(1j * data, -1).astype(dtype)
+        complex_data += np.triu(-1j * data, 1).astype(dtype)
+        complex_data += data
+        self._verifyLu(complex_data, output_idx_type=output_idx_type)
+
+  def testPivoting(self):
+    with test_util.use_gpu():
+      # This matrix triggers partial pivoting because the first diagonal entry
+      # is small.
+      data = np.array([[1e-9, 1., 0.], [1., 0., 0], [0., 1., 5]])
+      self._verifyLu(data.astype(np.float32))
+
+      for dtype in (np.float32, np.float64):
+        self._verifyLu(data.astype(dtype))
+        _, p = linalg_ops.lu(data)
+        p_val = self.evaluate([p])
+        # Make sure p_val is not the identity permutation.
+        self.assertNotAllClose(np.arange(3), p_val)
+
+      for dtype in (np.complex64, np.complex128):
+        complex_data = np.tril(1j * data, -1).astype(dtype)
+        complex_data += np.triu(-1j * data, 1).astype(dtype)
+        complex_data += data
+        self._verifyLu(complex_data)
+        _, p = linalg_ops.lu(data)
+        p_val = self.evaluate([p])
+        # Make sure p_val is not the identity permutation.
+        self.assertNotAllClose(np.arange(3), p_val)
+
+  def testInvalidMatrix(self):
+    # LU factorization gives an error when the input is singular.
+    # Note: A singular matrix may return without error but it won't be a valid
+    # factorization.
+    with test_util.use_gpu():
+      for dtype in self.float_types:
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(
+              linalg_ops.lu(
+                  np.array([[1., 2., 3.], [2., 4., 6.], [2., 3., 4.]],
+                           dtype=dtype)))
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(
+              linalg_ops.lu(
+                  np.array([[[1., 2., 3.], [2., 4., 6.], [1., 2., 3.]],
+                            [[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]]],
+                           dtype=dtype)))
+
+  def testBatch(self):
+    simple_array = np.array([[[1., -1.], [2., 5.]]])  # shape (1, 2, 2)
+    self._verifyLu(simple_array)
+    self._verifyLu(np.vstack((simple_array, simple_array)))
+    odd_sized_array = np.array([[[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]])
+    self._verifyLu(np.vstack((odd_sized_array, odd_sized_array)))
+
+    batch_size = 200
+
+    # Generate random matrices.
+    np.random.seed(42)
+    matrices = np.random.rand(batch_size, 5, 5)
+    self._verifyLu(matrices)
+
+    # Generate random complex valued matrices.
+    np.random.seed(52)
+    matrices = np.random.rand(batch_size, 5,
+                              5) + 1j * np.random.rand(batch_size, 5, 5)
+    self._verifyLu(matrices)
+
+  def testLargeMatrix(self):
+    # Generate random matrices.
+    n = 500
+    np.random.seed(64)
+    data = np.random.rand(n, n)
+    self._verifyLu(data)
+
+    # Generate random complex valued matrices.
+    np.random.seed(129)
+    data = np.random.rand(n, n) + 1j * np.random.rand(n, n)
+    self._verifyLu(data)
+
+  @test_util.run_v1_only("b/120545219")
+  def testEmpty(self):
+    self._verifyLu(np.empty([0, 2, 2]))
+    self._verifyLu(np.empty([2, 0, 0]))
+
+  @test_util.run_deprecated_v1
+  def testConcurrentExecutesWithoutError(self):
+    with test_util.use_gpu():
+      matrix1 = random_ops.random_normal([5, 5], seed=42)
+      matrix2 = random_ops.random_normal([5, 5], seed=42)
+      lu1, p1 = linalg_ops.lu(matrix1)
+      lu2, p2 = linalg_ops.lu(matrix2)
+      lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2])
+      self.assertAllEqual(lu1_val, lu2_val)
+      self.assertAllEqual(p1_val, p2_val)
+
+
+class LuBenchmark(test.Benchmark):
+  shapes = [
+      (4, 4),
+      (10, 10),
+      (16, 16),
+      (101, 101),
+      (256, 256),
+      (1000, 1000),
+      (1024, 1024),
+      (2048, 2048),
+      (4096, 4096),
+      (513, 2, 2),
+      (513, 8, 8),
+      (513, 256, 256),
+      (4, 513, 2, 2),
+  ]
+
+  def _GenerateMatrix(self, shape):
+    batch_shape = shape[:-2]
+    shape = shape[-2:]
+    assert shape[0] == shape[1]
+    n = shape[0]
+    matrix = np.ones(shape).astype(np.float32) / (2.0 * n) + np.diag(
+        np.ones(n).astype(np.float32))
+    return np.tile(matrix, batch_shape + (1, 1))
+
+  def benchmarkLuOp(self):
+    for shape in self.shapes:
+      with ops.Graph().as_default(), \
+          session.Session(config=benchmark.benchmark_config()) as sess, \
+          ops.device("/cpu:0"):
+        matrix = variables.Variable(self._GenerateMatrix(shape))
+        lu, p = linalg_ops.lu(matrix)
+        variables.global_variables_initializer().run()
+        self.run_op_benchmark(
+            sess,
+            control_flow_ops.group(lu, p),
+            min_iters=25,
+            name="lu_cpu_{shape}".format(shape=shape))
+
+      if test.is_gpu_available(True):
+        with ops.Graph().as_default(), \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
+            ops.device("/device:GPU:0"):
+          matrix = variables.Variable(self._GenerateMatrix(shape))
+          lu, p = linalg_ops.lu(matrix)
+          variables.global_variables_initializer().run()
+          self.run_op_benchmark(
+              sess,
+              control_flow_ops.group(lu, p),
+              min_iters=25,
+              name="lu_gpu_{shape}".format(shape=shape))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index f71857a3cbaddb52fc4da082f504fcbc5c405bd9..5700db4b950995c5bc59adb84a8e0f81655850cc 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -62,6 +62,7 @@ class RollTest(test_util.TensorFlowTestCase):
     if np_input.dtype == np.float32:
       self._testGradient(np_input, shift, axis)
 
+  @test_util.run_deprecated_v1
   def testIntTypes(self):
     for t in [np.int32, np.int64]:
       self._testAll(np.random.randint(-100, 100, (5)).astype(t), 3, 0)
@@ -73,6 +74,7 @@ class RollTest(test_util.TensorFlowTestCase):
             np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t), [0, 1, -2],
             [1, 2, 3])
 
+  @test_util.run_deprecated_v1
   def testFloatTypes(self):
     for t in [np.float32, np.float64]:
       self._testAll(np.random.rand(5).astype(t), 2, 0)
@@ -80,6 +82,7 @@ class RollTest(test_util.TensorFlowTestCase):
         self._testAll(np.random.rand(3, 4).astype(t), [1, 2], [1, 0])
         self._testAll(np.random.rand(1, 3, 4).astype(t), [1, 0, -3], [0, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testComplexTypes(self):
     for t in [np.complex64, np.complex128]:
       x = np.random.rand(4, 4).astype(t)
@@ -90,6 +93,7 @@ class RollTest(test_util.TensorFlowTestCase):
         x = np.random.rand(3, 2, 1, 1).astype(t)
         self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testNegativeAxis(self):
     self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
     self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
@@ -100,12 +104,14 @@ class RollTest(test_util.TensorFlowTestCase):
         manip_ops.roll(np.random.randint(-100, 100, (4, 4)).astype(np.int32),
                        3, -10).eval()
 
+  @test_util.run_deprecated_v1
   def testInvalidInputShape(self):
     # The input should be 1-D or higher, checked in shape function.
     with self.assertRaisesRegexp(
         ValueError, "Shape must be at least rank 1 but is rank 0"):
       manip_ops.roll(7, 1, 0)
 
+  @test_util.run_deprecated_v1
   def testRollInputMustVectorHigherRaises(self):
     # The input should be 1-D or higher, checked in kernel.
     tensor = array_ops.placeholder(dtype=dtypes.int32)
@@ -116,12 +122,14 @@ class RollTest(test_util.TensorFlowTestCase):
                                    "input must be 1-D or higher"):
         manip_ops.roll(tensor, shift, axis).eval(feed_dict={tensor: 7})
 
+  @test_util.run_deprecated_v1
   def testInvalidAxisShape(self):
     # The axis should be a scalar or 1-D, checked in shape function.
     with self.assertRaisesRegexp(
         ValueError, "Shape must be at most rank 1 but is rank 2"):
       manip_ops.roll([[1, 2], [3, 4]], 1, [[0, 1]])
 
+  @test_util.run_deprecated_v1
   def testRollAxisMustBeScalarOrVectorRaises(self):
     # The axis should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
@@ -132,12 +140,14 @@ class RollTest(test_util.TensorFlowTestCase):
                                    "axis must be a scalar or a 1-D vector"):
         manip_ops.roll(tensor, shift, axis).eval(feed_dict={axis: [[0, 1]]})
 
+  @test_util.run_deprecated_v1
   def testInvalidShiftShape(self):
     # The shift should be a scalar or 1-D, checked in shape function.
     with self.assertRaisesRegexp(
         ValueError, "Shape must be at most rank 1 but is rank 2"):
       manip_ops.roll([[1, 2], [3, 4]], [[0, 1]], 1)
 
+  @test_util.run_deprecated_v1
   def testRollShiftMustBeScalarOrVectorRaises(self):
     # The shift should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
@@ -148,11 +158,13 @@ class RollTest(test_util.TensorFlowTestCase):
                                    "shift must be a scalar or a 1-D vector"):
         manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [[0, 1]]})
 
+  @test_util.run_deprecated_v1
   def testInvalidShiftAndAxisNotEqualShape(self):
     # The shift and axis must be same size, checked in shape function.
     with self.assertRaisesRegexp(ValueError, "both shapes must be equal"):
       manip_ops.roll([[1, 2], [3, 4]], [1], [0, 1])
 
+  @test_util.run_deprecated_v1
   def testRollShiftAndAxisMustBeSameSizeRaises(self):
     # The shift and axis must be same size, checked in kernel.
     tensor = [[1, 2], [3, 4]]
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
index acfafde9e0f74d4e3ad6f2ee8ada9da3df94f5b9..dd16fad690470e0ca77c31102b8ef2000f0a15d5 100644
--- a/tensorflow/python/kernel_tests/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -29,6 +30,7 @@ TIMEOUT = 1
 
 class MapStageTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -44,12 +46,13 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 0})
       for i in range(10):
         _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
         self.assertAllClose(4 * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testMultiple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -65,13 +68,14 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 0})
       for i in range(10):
         _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
         self.assertAllClose(
             4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testDictionary(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -92,7 +96,7 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 0})
       for i in range(10):
         _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
@@ -121,6 +125,7 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
+  @test_util.run_deprecated_v1
   def testPeek(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -141,7 +146,7 @@ class MapStageTest(test.TestCase):
 
     n = 10
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       for i in range(n):
         sess.run(stage, feed_dict={x: i, pi: i})
 
@@ -150,6 +155,7 @@ class MapStageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 10)
 
+  @test_util.run_deprecated_v1
   def testSizeAndClear(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -168,7 +174,7 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 3})
       self.assertEqual(sess.run(size), 1)
       sess.run(stage, feed_dict={x: -1, pi: 1})
@@ -176,6 +182,7 @@ class MapStageTest(test.TestCase):
       sess.run(clear)
       self.assertEqual(sess.run(size), 0)
 
+  @test_util.run_deprecated_v1
   def testCapacity(self):
     capacity = 3
 
@@ -202,7 +209,7 @@ class MapStageTest(test.TestCase):
     queue = Queue.Queue()
     n = 8
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
       # when it hits the staging area's capacity and thus
       # not fill the queue with n tokens
@@ -239,6 +246,7 @@ class MapStageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 0)
 
+  @test_util.run_deprecated_v1
   def testMemoryLimit(self):
     memory_limit = 512 * 1024  # 512K
     chunk = 200 * 1024  # 256K
@@ -265,7 +273,7 @@ class MapStageTest(test.TestCase):
     queue = Queue.Queue()
     n = 8
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
       # when it hits the staging area's capacity and thus
       # not fill the queue with n tokens
@@ -303,6 +311,7 @@ class MapStageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 0)
 
+  @test_util.run_deprecated_v1
   def testOrdering(self):
     import six
     import random
@@ -325,7 +334,7 @@ class MapStageTest(test.TestCase):
 
     n = 10
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       # Keys n-1..0
       keys = list(reversed(six.moves.range(n)))
 
@@ -341,6 +350,7 @@ class MapStageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 0)
 
+  @test_util.run_deprecated_v1
   def testPartialDictInsert(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -362,7 +372,7 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       # 0 complete and incomplete entries
       self.assertTrue(sess.run([size, isize]) == [0, 0])
       # Stage key 0, x and f tuple entries
@@ -400,6 +410,7 @@ class MapStageTest(test.TestCase):
               'v': 3
           }])
 
+  @test_util.run_deprecated_v1
   def testPartialIndexInsert(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -419,7 +430,7 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       # 0 complete and incomplete entries
       self.assertTrue(sess.run([size, isize]) == [0, 0])
       # Stage key 0, x and f tuple entries
@@ -443,6 +454,7 @@ class MapStageTest(test.TestCase):
       # We can now obtain tuple associated with key 1
       self.assertTrue(sess.run([key, ret], feed_dict={gi: 1}) == [1, [1, 3, 2]])
 
+  @test_util.run_deprecated_v1
   def testPartialDictGetsAndPeeks(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -470,7 +482,7 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       # 0 complete and incomplete entries
       self.assertTrue(sess.run([size, isize]) == [0, 0])
       # Stage key 0, x and f tuple entries
@@ -540,6 +552,7 @@ class MapStageTest(test.TestCase):
       # Nothing is left
       self.assertTrue(sess.run([size, isize]) == [0, 0])
 
+  @test_util.run_deprecated_v1
   def testPartialIndexGets(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -561,7 +574,7 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       # Stage complete tuple
       sess.run(stage_xvf, feed_dict={pi: 0, x: 1, f: 2, v: 3})
 
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index 309da8f184be4fc9d7cebe779127eac80807037d..d31ecbcd3f1d57386fa629cd533f5f698176ca76 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -21,11 +21,12 @@ from __future__ import print_function
 import operator
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -35,6 +36,17 @@ from tensorflow.python.platform import test as test_lib
 # os.environ["TF_MATMUL_AUTOTUNE_ENABLE"] = "1" to enable it.
 
 
+class MatVecTest(test_lib.TestCase):
+  """Simple test for matvec, which is sugar on top of matmul."""
+
+  def testTwoByTwoCase(self):
+    a = np.array([[1, 2], [3, 4]])
+    b = np.array([5, 6])
+    c = math_ops.matvec(a, b)
+    self.assertAllEqual((2,), c.shape)
+    self.assertAllEqual([5 + 2 * 6, 3 * 5 + 4 * 6], c)
+
+
 def _AddTest(test, op_name, testcase_name, fn):
   test_name = "_".join(["test", op_name, testcase_name])
   if hasattr(test, test_name):
@@ -72,12 +84,12 @@ def _GetMatMulTest(a_np_, b_np_, use_static_shape_, **kwargs_):
     # np.matrix(a_np_) * np.matrix(b_np_)
     effective_a_np = _GetTransposedMatrices(a_np_, "a", kwargs_)
     effective_b_np = _GetTransposedMatrices(b_np_, "b", kwargs_)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session() as sess, test_util.device(use_gpu):
       if use_static_shape_:
         a = constant_op.constant(effective_a_np)
         b = constant_op.constant(effective_b_np)
         res = math_ops.matmul(a, b, **kwargs_)
-        tf_val = res.eval()
+        tf_val = self.evaluate(res)
       else:
         a = array_ops.placeholder(a_np_.dtype)
         b = array_ops.placeholder(b_np_.dtype)
@@ -102,7 +114,7 @@ class MatMulGradientTest(test_lib.TestCase):
 def _GetMatMulGradientTest(a_np_, b_np_, use_static_shape_, **kwargs_):
 
   def Test(self):
-    if not use_static_shape_ or a_np_.dtype in (np.int32, np.float16):
+    if not use_static_shape_ or a_np_.dtype in (np.int32, np.int64, np.float16):
       self.skipTest("Skipping infeasible gradient test.")
 
     # Transpose and possibly conjugate a_np_ and b_np_ according to the
@@ -115,45 +127,45 @@ def _GetMatMulGradientTest(a_np_, b_np_, use_static_shape_, **kwargs_):
     epsilon = np.finfo(a_np_.dtype).eps
     delta = epsilon**(1.0 / 3.0)
     tol = 20 * delta
-    with self.test_session(use_gpu=True):
-      a = constant_op.constant(effective_a_np)
-      b = constant_op.constant(effective_b_np)
-      res = math_ops.matmul(a, b, **kwargs_)
-      for x, x_init in [a, effective_a_np], [b, effective_b_np]:
-        theoretical, numerical = gradient_checker.compute_gradient(
-            x,
-            x_init.shape,
-            res, [a_np_.shape[0], b_np_.shape[1]],
-            x_init_value=x_init,
-            delta=delta)
-        self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
+    with self.session(), test_util.use_gpu():
+      theoretical, numerical = gradient_checker_v2.compute_gradient(
+          lambda x: math_ops.matmul(x, effective_b_np, **kwargs_),
+          [effective_a_np],
+          delta=delta)
+      self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
+
+      theoretical, numerical = gradient_checker_v2.compute_gradient(
+          lambda x: math_ops.matmul(effective_a_np, x, **kwargs_),
+          [effective_b_np],
+          delta=delta)
+      self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
 
   return Test
 
 
 class MatMulStatsTest(test_lib.TestCase):
 
+  @test_util.run_v1_only("Test requires a Graph and NodeDef inspection")
   def testSimpleStatistics(self):
-    g = ops.Graph()
-    with g.as_default():
-      a = variables.Variable(random_ops.random_normal([25, 16]))
-      b = variables.Variable(random_ops.random_normal([16, 9]))
-      math_ops.matmul(a, b)
-      for op in g.get_operations():
-        flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
-        if op.name == "MatMul":
-          self.assertEqual(7200, flops)
-
+    a = variables.Variable(random_ops.random_normal([25, 16]))
+    b = variables.Variable(random_ops.random_normal([16, 9]))
+    math_ops.matmul(a, b)
+    g = ops.get_default_graph()
+    for op in g.get_operations():
+      flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
+      if op.name == "MatMul":
+        self.assertEqual(7200, flops)
+
+  @test_util.run_v1_only("Test requires a Graph and NodeDef inspection")
   def testTransposedStatistics(self):
-    g = ops.Graph()
-    with g.as_default():
-      a = variables.Variable(random_ops.random_normal([16, 25]))
-      b = variables.Variable(random_ops.random_normal([16, 9]))
-      math_ops.matmul(a, b, transpose_a=True)
-      for op in g.get_operations():
-        flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
-        if op.name == "MatMul":
-          self.assertEqual(7200, flops)
+    a = variables.Variable(random_ops.random_normal([16, 25]))
+    b = variables.Variable(random_ops.random_normal([16, 9]))
+    math_ops.matmul(a, b, transpose_a=True)
+    g = ops.get_default_graph()
+    for op in g.get_operations():
+      flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
+      if op.name == "MatMul":
+        self.assertEqual(7200, flops)
 
 
 try:
@@ -182,19 +194,20 @@ except AttributeError:
 class MatMulInfixOperatorTest(test_lib.TestCase):
 
   def testMismatchedShape(self):
-    with self.assertRaisesWithPredicateMatch(ValueError,
-                                             lambda e: "Shape must" in str(e)):
+    with self.assertRaisesRegexp(
+        Exception, "(Shape must be rank 2 but is rank 1|is not a matrix)"):
       infix_matmul(
           ops.convert_to_tensor([10.0, 20.0, 30.0]),
           ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0]]))
 
   def testMismatchedDimensions(self):
-    with self.assertRaisesWithPredicateMatch(
-        ValueError, lambda e: "Dimensions must" in str(e)):
+    with self.assertRaisesRegexp(
+        Exception, "(Dimensions must be equal|Matrix size-incompatible)"):
       infix_matmul(
           ops.convert_to_tensor([[10.0, 20.0, 30.0]]),
           ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0]]))
 
+  @test_util.run_v1_only("Tensor.op is generally not applicable in TF 2")
   def testInfixMatmulIsTfMatmul(self):
     a = ops.convert_to_tensor([[10.0, 20.0, 30.0]])
     b = ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0], [80.0, 90.0]])
@@ -206,17 +219,17 @@ class MatMulInfixOperatorTest(test_lib.TestCase):
     b = ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0], [80.0, 90.0]])
     c = infix_matmul(a, b)
     d = math_ops.matmul(a, b)
-    with self.cached_session():
-      self.assertAllEqual(c.eval(), d.eval())
+    self.assertAllEqual(c, d)
 
 
 if __name__ == "__main__":
   sizes = [1, 3, 5]
   trans_options = [[False, False], [True, False], [False, True]]
-  for use_static_shape in [False, True]:
-    for dtype in (np.int32, np.float16, np.float32, np.float64, np.complex64,
-                  np.complex128):
-      if not use_static_shape and dtype == np.int32:
+  # TF2 does not support placeholders under eager so we skip it
+  for use_static_shape in set([True, tf2.enabled()]):
+    for dtype in (np.int32, np.int64, np.float16, np.float32, np.float64,
+                  np.complex64, np.complex128):
+      if not use_static_shape and (dtype == np.int32 or dtype == np.int64):
         # TODO(rmlarsen): Re-enable this test when we have fixed the underlying
         # bug in Windows (b/35935459).
         continue
diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
index 68d626de2c5cdd91ee332247c05ddce2a558a35e..fdb7e4a1a4e54883afd66e6a856a977b61ff8aaf 100644
--- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
@@ -23,10 +23,12 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test as test_lib
 
 
@@ -43,6 +45,7 @@ class MatrixBandPartTest(test_lib.TestCase):
 
 def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     mat = np.ones(shape_).astype(dtype_)
     batch_mat = np.tile(mat, batch_shape_ + (1, 1))
@@ -56,12 +59,12 @@ def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):
         if batch_shape_ is not ():
           band_np = np.tile(band_np, batch_shape_ + (1, 1))
         for index_dtype in [dtypes_lib.int32, dtypes_lib.int64]:
-          with self.test_session(use_gpu=False):
+          with self.cached_session(use_gpu=False):
             band = array_ops.matrix_band_part(
                 batch_mat,
                 constant_op.constant(lower, index_dtype),
                 constant_op.constant(upper, index_dtype))
-            self.assertAllEqual(band_np, band.eval())
+            self.assertAllEqual(band_np, self.evaluate(band))
 
   return Test
 
@@ -72,10 +75,11 @@ class MatrixBandPartGradTest(test_lib.TestCase):
 
 def _GetMatrixBandPartGradTest(dtype_, batch_shape_, shape_):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     shape = batch_shape_ + shape_
     x = constant_op.constant(np.random.rand(*shape), dtype=dtype_)
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for lower in -1, 0, 1, shape_[-2] - 1:
         for upper in -1, 0, 1, shape_[-1] - 1:
           y = array_ops.matrix_band_part(x, lower, upper)
@@ -109,7 +113,7 @@ class MatrixBandPartBenchmark(test_lib.Benchmark):
     for shape_ in self.shapes:
       for limits in (-1, -1), (-1, 0), (0, -1), (2, 2):
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/cpu:0"):
           matrix = variables.Variable(array_ops.ones(shape_))
           band = array_ops.matrix_band_part(matrix, limits[0], limits[1])
@@ -123,7 +127,7 @@ class MatrixBandPartBenchmark(test_lib.Benchmark):
 
         if test_lib.is_gpu_available(True):
           with ops.Graph().as_default(), \
-              session.Session() as sess, \
+              session.Session(config=benchmark.benchmark_config()) as sess, \
               ops.device("/gpu:0"):
             matrix = variables.Variable(array_ops.ones(shape_))
             band = array_ops.matrix_band_part(matrix, limits[0], limits[1])
@@ -137,12 +141,13 @@ class MatrixBandPartBenchmark(test_lib.Benchmark):
 
 
 if __name__ == "__main__":
-  dtypes = (np.bool, np.int32, np.int64, np.float32, np.float64, np.complex64,
-            np.complex128)
+  dtypes = (np.bool, np.int32, np.int64, np.float16,
+            dtypes_lib.bfloat16.as_numpy_dtype, np.float32, np.float64,
+            np.complex64, np.complex128)
   for dtype in dtypes:
     for batch_shape in ((), (2,), (1, 3, 2)):
-      for rows in 1, 2, 7:
-        for cols in 1, 2, 7:
+      for rows in 1, 2, 7, 23:
+        for cols in 1, 2, 7, 23:
           shape = (rows, cols)
           name = "%s_%s" % (dtype.__name__,
                             "_".join(map(str, batch_shape + shape)))
diff --git a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
index 0386e91276eb3cd8515c41396e8687a90e27fbca..372b6dc17f4d080f3a59705611e05f0f0865c50d 100644
--- a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
@@ -25,11 +25,13 @@ import numpy as np
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.linalg import linalg_impl
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -49,7 +51,7 @@ class ExponentialOpTest(test.TestCase):
 
   def _verifyExponential(self, x, np_type):
     inp = x.astype(np_type)
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       tf_ans = linalg_impl.matrix_exponential(inp)
       if x.size == 0:
         np_ans = np.empty(x.shape, dtype=np_type)
@@ -60,7 +62,7 @@ class ExponentialOpTest(test.TestCase):
             np_ans[i] = np_expm(inp[i])
         else:
           np_ans = np_expm(inp)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-3)
 
   def _verifyExponentialReal(self, x):
@@ -120,12 +122,14 @@ class ExponentialOpTest(test.TestCase):
     # Complex batch
     self._verifyExponentialComplex(self._makeBatch(matrix1, matrix2))
 
+  @test_util.run_deprecated_v1
   def testNonSquareMatrix(self):
     # When the exponential of a non-square matrix is attempted we should return
     # an error
     with self.assertRaises(ValueError):
       linalg_impl.matrix_exponential(np.array([[1., 2., 3.], [3., 4., 5.]]))
 
+  @test_util.run_deprecated_v1
   def testWrongDimensions(self):
     # The input to the exponential should be at least a 2-dimensional tensor.
     tensor3 = constant_op.constant([1., 2.])
@@ -136,20 +140,22 @@ class ExponentialOpTest(test.TestCase):
     self._verifyExponentialReal(np.empty([0, 2, 2]))
     self._verifyExponentialReal(np.empty([2, 0, 0]))
 
+  @test_util.run_deprecated_v1
   def testDynamic(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       inp = array_ops.placeholder(ops.dtypes.float32)
       expm = linalg_impl.matrix_exponential(inp)
       matrix = np.array([[1., 2.], [3., 4.]])
       sess.run(expm, feed_dict={inp: matrix})
 
+  @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       matrix1 = random_ops.random_normal([5, 5], seed=42)
       matrix2 = random_ops.random_normal([5, 5], seed=42)
       expm1 = linalg_impl.matrix_exponential(matrix1)
       expm2 = linalg_impl.matrix_exponential(matrix2)
-      expm = sess.run([expm1, expm2])
+      expm = self.evaluate([expm1, expm2])
       self.assertAllEqual(expm[0], expm[1])
 
 
@@ -181,7 +187,7 @@ class MatrixExponentialBenchmark(test.Benchmark):
   def benchmarkMatrixExponentialOp(self):
     for shape in self.shapes:
       with ops.Graph().as_default(), \
-          session.Session() as sess, \
+          session.Session(config=benchmark.benchmark_config()) as sess, \
           ops.device("/cpu:0"):
         matrix = self._GenerateMatrix(shape)
         expm = linalg_impl.matrix_exponential(matrix)
@@ -195,7 +201,7 @@ class MatrixExponentialBenchmark(test.Benchmark):
 
       if test.is_gpu_available(True):
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/gpu:0"):
           matrix = self._GenerateMatrix(shape)
           expm = linalg_impl.matrix_exponential(matrix)
diff --git a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
index 720ba806e90489b28b456032ea5203f7380a68c5..5cef4b79a32b85e3366ce018d1d8634867c20a75 100644
--- a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -36,7 +37,7 @@ class InverseOpTest(test.TestCase):
   def _verifyInverse(self, x, np_type):
     for adjoint in False, True:
       y = x.astype(np_type)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         # Verify that x^{-1} * x == Identity matrix.
         inv = linalg_ops.matrix_inverse(y, adjoint=adjoint)
         tf_ans = math_ops.matmul(inv, y, adjoint_b=adjoint)
@@ -45,7 +46,7 @@ class InverseOpTest(test.TestCase):
           tiling = list(y.shape)
           tiling[-2:] = [1, 1]
           np_ans = np.tile(np_ans, tiling)
-        out = tf_ans.eval()
+        out = self.evaluate(tf_ans)
         self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-3)
         self.assertShapeEqual(y, tf_ans)
 
@@ -137,7 +138,7 @@ class InverseOpTest(test.TestCase):
           self._verifyInverseReal(matrix)
 
   def testConcurrentExecutesWithoutError(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       all_ops = []
       for adjoint_ in True, False:
         matrix1 = random_ops.random_normal([5, 5], seed=42)
@@ -145,7 +146,7 @@ class InverseOpTest(test.TestCase):
         inv1 = linalg_ops.matrix_inverse(matrix1, adjoint=adjoint_)
         inv2 = linalg_ops.matrix_inverse(matrix2, adjoint=adjoint_)
         all_ops += [inv1, inv2]
-      inv = sess.run(all_ops)
+      inv = self.evaluate(all_ops)
       self.assertAllEqual(inv[0], inv[1])
       self.assertAllEqual(inv[2], inv[3])
 
@@ -179,7 +180,7 @@ class MatrixInverseBenchmark(test.Benchmark):
     for adjoint in False, True:
       for shape in self.shapes:
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/cpu:0"):
           matrix = self._GenerateMatrix(shape)
           inv = linalg_ops.matrix_inverse(matrix, adjoint=adjoint)
@@ -193,7 +194,7 @@ class MatrixInverseBenchmark(test.Benchmark):
 
         if test.is_gpu_available(True):
           with ops.Graph().as_default(), \
-              session.Session() as sess, \
+              session.Session(config=benchmark.benchmark_config()) as sess, \
               ops.device("/gpu:0"):
             matrix = self._GenerateMatrix(shape)
             inv = linalg_ops.matrix_inverse(matrix, adjoint=adjoint)
diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
index 723a15fbd1c1e416913f82c082735ead41e102bc..682ac12adc6acef378ccbb256066cbd2b099e1b9 100644
--- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
@@ -25,12 +25,14 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.linalg import linalg_impl
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -38,11 +40,11 @@ class LogarithmOpTest(test.TestCase):
 
   def _verifyLogarithm(self, x, np_type):
     inp = x.astype(np_type)
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       # Verify that expm(logm(A)) == A.
       tf_ans = linalg_impl.matrix_exponential(
           gen_linalg_ops.matrix_logarithm(inp))
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertAllClose(inp, out, rtol=1e-4, atol=1e-3)
 
   def _verifyLogarithmComplex(self, x):
@@ -82,6 +84,7 @@ class LogarithmOpTest(test.TestCase):
     # Complex batch
     self._verifyLogarithmComplex(self._makeBatch(matrix1, matrix2))
 
+  @test_util.run_v1_only("b/120545219")
   def testNonSquareMatrix(self):
     # When the logarithm of a non-square matrix is attempted we should return
     # an error
@@ -89,6 +92,7 @@ class LogarithmOpTest(test.TestCase):
       gen_linalg_ops.matrix_logarithm(
           np.array([[1., 2., 3.], [3., 4., 5.]], dtype=np.complex64))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to the logarithm should be at least a 2-dimensional tensor.
     tensor3 = constant_op.constant([1., 2.], dtype=dtypes.complex64)
@@ -119,15 +123,16 @@ class LogarithmOpTest(test.TestCase):
             size=np.prod(shape)).reshape(shape).astype(np.complex128)
         self._verifyLogarithmComplex(matrix)
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       matrix1 = math_ops.cast(
           random_ops.random_normal([5, 5], seed=42), dtypes.complex64)
       matrix2 = math_ops.cast(
           random_ops.random_normal([5, 5], seed=42), dtypes.complex64)
       logm1 = gen_linalg_ops.matrix_logarithm(matrix1)
       logm2 = gen_linalg_ops.matrix_logarithm(matrix2)
-      logm = sess.run([logm1, logm2])
+      logm = self.evaluate([logm1, logm2])
       self.assertAllEqual(logm[0], logm[1])
 
 
@@ -159,7 +164,7 @@ class MatrixLogarithmBenchmark(test.Benchmark):
   def benchmarkMatrixLogarithmOp(self):
     for shape in self.shapes:
       with ops.Graph().as_default(), \
-          session.Session() as sess, \
+          session.Session(config=benchmark.benchmark_config()) as sess, \
           ops.device("/cpu:0"):
         matrix = self._GenerateMatrix(shape)
         logm = gen_linalg_ops.matrix_logarithm(matrix)
diff --git a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
index de495968a710276caef5214eb12fa965edbfd64c..463477a6a2cb5cf174b461c1fbffd2024f7ce21e 100644
--- a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
@@ -20,15 +20,18 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test as test_lib
 
 
@@ -106,7 +109,7 @@ class MatrixSolveLsOpTest(test_lib.TestCase):
         b = np.tile(b, batch_shape + (1, 1))
         np_ans = np.tile(np_ans, batch_shape + (1, 1))
         np_r_norm = np.tile(np_r_norm, batch_shape)
-      with self.test_session(use_gpu=fast) as sess:
+      with self.cached_session(use_gpu=fast) as sess:
         if use_placeholder:
           a_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
           b_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
@@ -132,9 +135,10 @@ class MatrixSolveLsOpTest(test_lib.TestCase):
       self.assertEqual(np_ans.shape, tf_ans_val.shape)
       self.assertAllClose(np_ans, tf_ans_val, atol=2 * tol, rtol=2 * tol)
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The matrix and right-hand sides should have the same number of rows.
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       matrix = constant_op.constant([[1., 0.], [0., 1.]])
       rhs = constant_op.constant([[1., 0.]])
       with self.assertRaises(ValueError):
@@ -145,16 +149,21 @@ class MatrixSolveLsOpTest(test_lib.TestCase):
     empty0 = np.empty([3, 0])
     empty1 = np.empty([0, 2])
     for fast in [True, False]:
-      with self.test_session(use_gpu=True):
-        tf_ans = linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast).eval()
+      with self.cached_session(use_gpu=True):
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast))
         self.assertEqual(tf_ans.shape, (0, 0))
-        tf_ans = linalg_ops.matrix_solve_ls(empty0, full, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(empty0, full, fast=fast))
         self.assertEqual(tf_ans.shape, (0, 2))
-        tf_ans = linalg_ops.matrix_solve_ls(full, empty0, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(full, empty0, fast=fast))
         self.assertEqual(tf_ans.shape, (2, 0))
-        tf_ans = linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast))
         self.assertEqual(tf_ans.shape, (2, 2))
 
+  @test_util.run_v1_only("b/120545219")
   def testBatchResultSize(self):
     # 3x3x3 matrices, 3x3x1 right-hand sides.
     matrix = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9.] * 3).reshape(3, 3, 3)
@@ -313,7 +322,7 @@ class MatrixSolveLsBenchmark(test_lib.Benchmark):
       for num_rhs in 1, 2, matrix_shape[-1]:
 
         with ops.Graph().as_default(), \
-            session.Session() as sess, \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
             ops.device("/cpu:0"):
           matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
           x = linalg_ops.matrix_solve_ls(matrix, rhs, regularizer)
@@ -328,7 +337,7 @@ class MatrixSolveLsBenchmark(test_lib.Benchmark):
 
         if run_gpu_test and (len(matrix_shape) < 3 or matrix_shape[0] < 513):
           with ops.Graph().as_default(), \
-                session.Session() as sess, \
+                session.Session(config=benchmark.benchmark_config()) as sess, \
                 ops.device("/gpu:0"):
             matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
             x = linalg_ops.matrix_solve_ls(matrix, rhs, regularizer)
@@ -345,7 +354,8 @@ class MatrixSolveLsBenchmark(test_lib.Benchmark):
 
 if __name__ == "__main__":
   for dtype_ in [np.float32, np.float64, np.complex64, np.complex128]:
-    for use_placeholder_ in [True, False]:
+    # TF2 does not support placeholders under eager so we skip it
+    for use_placeholder_ in set([False, not tf2.enabled()]):
       for fast_ in [True, False]:
         l2_regularizers = [0] if dtype_ == np.complex128 else [0, 0.1]
         for l2_regularizer_ in l2_regularizers:
diff --git a/tensorflow/python/kernel_tests/matrix_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_op_test.py
index b8f2736b7b0fc7f2e3381b5a58625ff3f25d38a3..db7c4802f69227627f00565c7398b12af87e3651 100644
--- a/tensorflow/python/kernel_tests/matrix_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_solve_op_test.py
@@ -24,11 +24,13 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
@@ -54,7 +56,7 @@ class MatrixSolveOpTest(test.TestCase):
           b = np.tile(b, batch_dims + [1, 1])
         np_ans = np.linalg.solve(a_np, b)
         for use_placeholder in False, True:
-          with self.test_session(use_gpu=True) as sess:
+          with self.cached_session(use_gpu=True) as sess:
             if use_placeholder:
               a_ph = array_ops.placeholder(dtypes.as_dtype(np_type))
               b_ph = array_ops.placeholder(dtypes.as_dtype(np_type))
@@ -62,7 +64,7 @@ class MatrixSolveOpTest(test.TestCase):
               out = sess.run(tf_ans, {a_ph: a, b_ph: b})
             else:
               tf_ans = linalg_ops.matrix_solve(a, b, adjoint=adjoint)
-              out = tf_ans.eval()
+              out = self.evaluate(tf_ans)
               self.assertEqual(tf_ans.get_shape(), out.shape)
             self.assertEqual(np_ans.shape, out.shape)
             self.assertAllClose(np_ans, out, atol=tol, rtol=tol)
@@ -74,6 +76,7 @@ class MatrixSolveOpTest(test.TestCase):
         [m, n]))
     return matrix
 
+  @test_util.run_deprecated_v1
   def testSolve(self):
     for n in 1, 2, 4, 9:
       matrix = self._generateMatrix(n, n)
@@ -81,6 +84,7 @@ class MatrixSolveOpTest(test.TestCase):
         rhs = self._generateMatrix(n, nrhs)
         self._verifySolve(matrix, rhs)
 
+  @test_util.run_deprecated_v1
   def testSolveBatch(self):
     for n in 2, 5:
       matrix = self._generateMatrix(n, n)
@@ -89,17 +93,19 @@ class MatrixSolveOpTest(test.TestCase):
         for batch_dims in [[2], [2, 2], [7, 4]]:
           self._verifySolve(matrix, rhs, batch_dims=batch_dims)
 
+  @test_util.run_deprecated_v1
   def testNonSquareMatrix(self):
     # When the solve of a non-square matrix is attempted we should return
     # an error
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
         matrix = constant_op.constant([[1., 2., 3.], [3., 4., 5.]])
         linalg_ops.matrix_solve(matrix, matrix)
 
+  @test_util.run_deprecated_v1
   def testWrongDimensions(self):
     # The matrix and right-hand sides should have the same number of rows.
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       matrix = constant_op.constant([[1., 0.], [0., 1.]])
       rhs = constant_op.constant([[1., 0.]])
       with self.assertRaises(ValueError):
@@ -107,15 +113,16 @@ class MatrixSolveOpTest(test.TestCase):
 
   def testNotInvertible(self):
     # The input should be invertible.
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaisesOpError("Input matrix is not invertible."):
         # All rows of the matrix below add to zero
         matrix = constant_op.constant([[1., 0., -1.], [-1., 1., 0.],
                                        [0., -1., 1.]])
         linalg_ops.matrix_solve(matrix, matrix).eval()
 
+  @test_util.run_deprecated_v1
   def testConcurrent(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       all_ops = []
       for adjoint_ in False, True:
         lhs1 = random_ops.random_normal([3, 3], seed=42)
@@ -125,7 +132,7 @@ class MatrixSolveOpTest(test.TestCase):
         s1 = linalg_ops.matrix_solve(lhs1, rhs1, adjoint=adjoint_)
         s2 = linalg_ops.matrix_solve(lhs2, rhs2, adjoint=adjoint_)
         all_ops += [s1, s2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       self.assertAllEqual(val[0], val[1])
       self.assertAllEqual(val[2], val[3])
 
@@ -167,7 +174,7 @@ class MatrixSolveBenchmark(test.Benchmark):
         for num_rhs in 1, 2, matrix_shape[-1]:
 
           with ops.Graph().as_default(), \
-              session.Session() as sess, \
+              session.Session(config=benchmark.benchmark_config()) as sess, \
               ops.device("/cpu:0"):
             matrix, rhs = self._GenerateTestData(matrix_shape, num_rhs)
             x = linalg_ops.matrix_solve(matrix, rhs, adjoint=adjoint)
@@ -185,7 +192,7 @@ class MatrixSolveBenchmark(test.Benchmark):
 
           if run_gpu_test:
             with ops.Graph().as_default(), \
-                session.Session() as sess, \
+                session.Session(config=benchmark.benchmark_config()) as sess, \
                 ops.device("/gpu:0"):
               matrix, rhs = self._GenerateTestData(matrix_shape, num_rhs)
               x = linalg_ops.matrix_solve(matrix, rhs, adjoint=adjoint)
diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3edb390c724b6c71cd8849efc2b22a579e87247f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
@@ -0,0 +1,121 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.math_ops.matrix_square_root."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+class SquareRootOpTest(test.TestCase):
+
+  def _verifySquareRoot(self, matrix, np_type):
+    matrix = matrix.astype(np_type)
+    with test_util.use_gpu():
+      # Verify that matmul(sqrtm(A), sqrtm(A)) = A
+      sqrt = gen_linalg_ops.matrix_square_root(matrix)
+      square = math_ops.matmul(sqrt, sqrt)
+      self.assertShapeEqual(matrix, square)
+      self.assertAllClose(matrix, square, rtol=1e-4, atol=1e-3)
+
+  def _verifySquareRootReal(self, x):
+    for np_type in [np.float32, np.float64]:
+      self._verifySquareRoot(x, np_type)
+
+  def _verifySquareRootComplex(self, x):
+    for np_type in [np.complex64, np.complex128]:
+      self._verifySquareRoot(x, np_type)
+
+  def _makeBatch(self, matrix1, matrix2):
+    matrix_batch = np.concatenate(
+        [np.expand_dims(matrix1, 0),
+         np.expand_dims(matrix2, 0)])
+    matrix_batch = np.tile(matrix_batch, [2, 3, 1, 1])
+    return matrix_batch
+
+  def _testMatrices(self, matrix1, matrix2):
+    # Real
+    self._verifySquareRootReal(matrix1)
+    self._verifySquareRootReal(matrix2)
+    self._verifySquareRootReal(self._makeBatch(matrix1, matrix2))
+    # Complex
+    matrix1 = matrix1.astype(np.complex64)
+    matrix2 = matrix2.astype(np.complex64)
+    matrix1 += 1j * matrix1
+    matrix2 += 1j * matrix2
+    self._verifySquareRootComplex(matrix1)
+    self._verifySquareRootComplex(matrix2)
+    self._verifySquareRootComplex(self._makeBatch(matrix1, matrix2))
+
+  def testSymmetricPositiveDefinite(self):
+    matrix1 = np.array([[2., 1.], [1., 2.]])
+    matrix2 = np.array([[3., -1.], [-1., 3.]])
+    self._testMatrices(matrix1, matrix2)
+
+  def testAsymmetric(self):
+    matrix1 = np.array([[0., 4.], [-1., 5.]])
+    matrix2 = np.array([[33., 24.], [48., 57.]])
+    self._testMatrices(matrix1, matrix2)
+
+  def testIdentityMatrix(self):
+    # 2x2
+    identity = np.array([[1., 0], [0, 1.]])
+    self._verifySquareRootReal(identity)
+    # 3x3
+    identity = np.array([[1., 0, 0], [0, 1., 0], [0, 0, 1.]])
+    self._verifySquareRootReal(identity)
+
+  def testEmpty(self):
+    self._verifySquareRootReal(np.empty([0, 2, 2]))
+    self._verifySquareRootReal(np.empty([2, 0, 0]))
+
+  @test_util.run_v1_only("b/120545219")
+  def testWrongDimensions(self):
+    # The input to the square root should be at least a 2-dimensional tensor.
+    tensor = constant_op.constant([1., 2.])
+    with self.assertRaises(ValueError):
+      gen_linalg_ops.matrix_square_root(tensor)
+
+  @test_util.run_v1_only("b/120545219")
+  def testNotSquare(self):
+    with self.assertRaises(ValueError):
+      tensor = constant_op.constant([[1., 0., -1.], [-1., 1., 0.]])
+      self.evaluate(gen_linalg_ops.matrix_square_root(tensor))
+
+  @test_util.run_v1_only("b/120545219")
+  def testConcurrentExecutesWithoutError(self):
+    with test_util.use_gpu():
+      matrix1 = random_ops.random_normal([5, 5], seed=42)
+      matrix2 = random_ops.random_normal([5, 5], seed=42)
+      square1 = math_ops.matmul(matrix1, matrix1)
+      square2 = math_ops.matmul(matrix2, matrix2)
+      sqrt1 = gen_linalg_ops.matrix_square_root(square1)
+      sqrt2 = gen_linalg_ops.matrix_square_root(square2)
+      all_ops = [sqrt1, sqrt2]
+      sqrt = self.evaluate(all_ops)
+      self.assertAllEqual(sqrt[0], sqrt[1])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
index dd01ba11af37ea24e11713b35fd4b364c9ef4447..dde83f12f3cee1882d921be292f6a33b8c7f1b48 100644
--- a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.platform import test
@@ -74,7 +75,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
         a_np = np.tile(a_np, batch_dims + [1, 1])
         b = np.tile(b, batch_dims + [1, 1])
 
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         if use_placeholder:
           a_tf = array_ops.placeholder(a.dtype)
           b_tf = array_ops.placeholder(b.dtype)
@@ -87,12 +88,13 @@ class MatrixTriangularSolveOpTest(test.TestCase):
           b_tf = constant_op.constant(b)
           tf_ans = linalg_ops.matrix_triangular_solve(
               a_tf, b_tf, lower=lower, adjoint=adjoint)
-          tf_val = tf_ans.eval()
+          tf_val = self.evaluate(tf_ans)
           np_ans = np.linalg.solve(a_np, b)
           self.assertEqual(np_ans.shape, tf_ans.get_shape())
         self.assertEqual(np_ans.shape, tf_val.shape)
         self.assertAllClose(np_ans, tf_val)
 
+  @test_util.run_deprecated_v1
   def testSolve(self):
     # 1x1 matrix, single rhs.
     matrix = np.array([[0.1]])
@@ -106,6 +108,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
     rhs1 = np.array([[1., 0., 1.], [0., 1., 1.]])
     self._verifySolveAllWaysReal(matrix, rhs1)
 
+  @test_util.run_deprecated_v1
   def testSolveComplex(self):
     # 1x1 matrix, single rhs.
     matrix = np.array([[0.1 + 1j * 0.1]])
@@ -122,6 +125,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
     rhs1 += 1j * rhs1
     self._verifySolveAllWaysComplex(matrix, rhs1)
 
+  @test_util.run_deprecated_v1
   def testSolveBatch(self):
     matrix = np.array([[1., 2.], [3., 4.]])
     rhs = np.array([[1., 0., 1.], [0., 1., 1.]])
@@ -130,6 +134,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
     # Batch of 3x2x2x2 matrices, 3x2x2x3 right-hand sides.
     self._verifySolveAllWaysReal(matrix, rhs, batch_dims=[3, 2])
 
+  @test_util.run_deprecated_v1
   def testSolveBatchComplex(self):
     matrix = np.array([[1., 2.], [3., 4.]]).astype(np.complex64)
     matrix += 1j * matrix
@@ -140,6 +145,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
     # Batch of 3x2x2x2 matrices, 3x2x2x3 right-hand sides.
     self._verifySolveAllWaysComplex(matrix, rhs, batch_dims=[3, 2])
 
+  @test_util.run_deprecated_v1
   def testNonSquareMatrix(self):
     # A non-square matrix should cause an error.
     matrix = np.array([[1., 2., 3.], [3., 4., 5.]])
@@ -149,6 +155,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
       with self.assertRaises(ValueError):
         self._verifySolve(matrix, matrix, batch_dims=[2, 3])
 
+  @test_util.run_deprecated_v1
   def testWrongDimensions(self):
     # The matrix should have the same number of rows as the
     # right-hand sides.
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index 5dcdb9e4205e209091bb54474aa5c672f29cd081..64dd5914552d276e91ccaa4eed63e93b0eac37c1 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -175,22 +176,26 @@ class MeanTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean(array_ops.ones([4, 3]))
     _assert_metric_variables(self, ('mean/count:0', 'mean/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean(
         array_ops.ones([4, 3]), metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean(
         array_ops.ones([4, 3]), updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -203,11 +208,12 @@ class MeanTest(test.TestCase):
 
       mean, update_op = metrics.mean(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAlmostEqual(1.65, sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAlmostEqual(1.65, self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testUpdateOpsReturnsCurrentValue(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -220,15 +226,16 @@ class MeanTest(test.TestCase):
 
       mean, update_op = metrics.mean(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(0.5, sess.run(update_op), 5)
-      self.assertAlmostEqual(1.475, sess.run(update_op), 5)
-      self.assertAlmostEqual(12.4 / 6.0, sess.run(update_op), 5)
-      self.assertAlmostEqual(1.65, sess.run(update_op), 5)
+      self.assertAlmostEqual(0.5, self.evaluate(update_op), 5)
+      self.assertAlmostEqual(1.475, self.evaluate(update_op), 5)
+      self.assertAlmostEqual(12.4 / 6.0, self.evaluate(update_op), 5)
+      self.assertAlmostEqual(1.65, self.evaluate(update_op), 5)
 
-      self.assertAlmostEqual(1.65, sess.run(mean), 5)
+      self.assertAlmostEqual(1.65, self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     values = _test_values((3, 2, 4, 1))
     mean_results = (
@@ -271,37 +278,44 @@ class MeanTest(test.TestCase):
       self.assertAlmostEqual(expected, update_op.eval(), places=5)
       self.assertAlmostEqual(expected, mean.eval(), places=5)
 
+  @test_util.run_deprecated_v1
   def test1x1x1Weighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5,)).reshape((1, 1, 1)))
 
+  @test_util.run_deprecated_v1
   def test1x1xNWeighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5, 7, 11, 3)).reshape((1, 1, 4)))
 
+  @test_util.run_deprecated_v1
   def test1xNx1Weighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5, 11)).reshape((1, 2, 1)))
 
+  @test_util.run_deprecated_v1
   def test1xNxNWeighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5, 7, 11, 3, 2, 13, 7, 5)).reshape((1, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNx1x1Weighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5, 7, 11)).reshape((3, 1, 1)))
 
+  @test_util.run_deprecated_v1
   def testNx1xNWeighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((
             5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3)).reshape((3, 1, 4)))
 
+  @test_util.run_deprecated_v1
   def testNxNxNWeighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
@@ -309,6 +323,7 @@ class MeanTest(test.TestCase):
             5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3,
             2, 17, 11, 3, 5, 7, 11, 3, 2, 12, 7, 5)).reshape((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidWeights(self):
     values_placeholder = array_ops.placeholder(dtype=dtypes_lib.float32)
     values = _test_values((3, 2, 4, 1))
@@ -341,23 +356,27 @@ class MeanTensorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_tensor(array_ops.ones([4, 3]))
     _assert_metric_variables(self,
                              ('mean/total_tensor:0', 'mean/count_tensor:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_tensor(
         array_ops.ones([4, 3]), metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_tensor(
         array_ops.ones([4, 3]), updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -370,11 +389,12 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(mean))
+        self.evaluate(update_op)
+      self.assertAllClose([[-0.9 / 4., 3.525]], self.evaluate(mean))
 
+  @test_util.run_deprecated_v1
   def testMultiDimensional(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -391,11 +411,13 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(2):
-        sess.run(update_op)
-      self.assertAllClose([[[1, 2], [1, 2]], [[2, 3], [5, 6]]], sess.run(mean))
+        self.evaluate(update_op)
+      self.assertAllClose([[[1, 2], [1, 2]], [[2, 3], [5, 6]]],
+                          self.evaluate(mean))
 
+  @test_util.run_deprecated_v1
   def testUpdateOpsReturnsCurrentValue(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -408,15 +430,16 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAllClose([[0, 1]], sess.run(update_op), 5)
-      self.assertAllClose([[-2.1, 5.05]], sess.run(update_op), 5)
-      self.assertAllClose([[2.3 / 3., 10.1 / 3.]], sess.run(update_op), 5)
-      self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(update_op), 5)
+      self.assertAllClose([[0, 1]], self.evaluate(update_op), 5)
+      self.assertAllClose([[-2.1, 5.05]], self.evaluate(update_op), 5)
+      self.assertAllClose([[2.3 / 3., 10.1 / 3.]], self.evaluate(update_op), 5)
+      self.assertAllClose([[-0.9 / 4., 3.525]], self.evaluate(update_op), 5)
 
-      self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(mean), 5)
+      self.assertAllClose([[-0.9 / 4., 3.525]], self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testBinaryWeighted1d(self):
     with self.cached_session() as sess:
       # Create the queue that populates the values.
@@ -439,11 +462,12 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[3.25, 0.5]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[3.25, 0.5]], self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testWeighted1d(self):
     with self.cached_session() as sess:
       # Create the queue that populates the values.
@@ -466,11 +490,12 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[0.8, 3.52]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[0.8, 3.52]], self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testWeighted2d_1(self):
     with self.cached_session() as sess:
       # Create the queue that populates the values.
@@ -493,11 +518,12 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[-2.1, 0.5]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[-2.1, 0.5]], self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testWeighted2d_2(self):
     with self.cached_session() as sess:
       # Create the queue that populates the values.
@@ -520,10 +546,10 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[0, 0.5]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[0, 0.5]], self.evaluate(mean), 5)
 
 
 class AccuracyTest(test.TestCase):
@@ -531,6 +557,7 @@ class AccuracyTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.accuracy(
         predictions=array_ops.ones((10, 1)),
@@ -539,6 +566,7 @@ class AccuracyTest(test.TestCase):
     _assert_metric_variables(self,
                              ('my_accuracy/count:0', 'my_accuracy/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.accuracy(
@@ -547,6 +575,7 @@ class AccuracyTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.accuracy(
@@ -555,12 +584,14 @@ class AccuracyTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones((10, 3))
     labels = array_ops.ones((10, 4))
     with self.assertRaises(ValueError):
       metrics.accuracy(labels, predictions)
 
+  @test_util.run_deprecated_v1
   def testPredictionsAndWeightsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones((10, 3))
     labels = array_ops.ones((10, 3))
@@ -568,6 +599,7 @@ class AccuracyTest(test.TestCase):
     with self.assertRaises(ValueError):
       metrics.accuracy(labels, predictions, weights)
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=3, dtype=dtypes_lib.int64, seed=1)
@@ -576,17 +608,18 @@ class AccuracyTest(test.TestCase):
     accuracy, update_op = metrics.accuracy(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_accuracy = accuracy.eval()
       for _ in range(10):
         self.assertEqual(initial_accuracy, accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdates(self):
     with self.cached_session() as sess:
       # Create the queue that populates the predictions.
@@ -609,32 +642,35 @@ class AccuracyTest(test.TestCase):
 
       accuracy, update_op = metrics.accuracy(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in xrange(3):
-        sess.run(update_op)
-      self.assertEqual(0.5, sess.run(update_op))
+        self.evaluate(update_op)
+      self.assertEqual(0.5, self.evaluate(update_op))
       self.assertEqual(0.5, accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testEffectivelyEquivalentSizes(self):
     predictions = array_ops.ones((40, 1))
     labels = array_ops.ones((40,))
     with self.cached_session() as sess:
       accuracy, update_op = metrics.accuracy(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.0, update_op.eval())
       self.assertEqual(1.0, accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testEffectivelyEquivalentSizesWithScalarWeight(self):
     predictions = array_ops.ones((40, 1))
     labels = array_ops.ones((40,))
     with self.cached_session() as sess:
       accuracy, update_op = metrics.accuracy(labels, predictions, weights=2.0)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.0, update_op.eval())
       self.assertEqual(1.0, accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testEffectivelyEquivalentSizesWithStaticShapedWeight(self):
     predictions = ops.convert_to_tensor([1, 1, 1])  # shape 3,
     labels = array_ops.expand_dims(ops.convert_to_tensor([1, 0, 0]),
@@ -645,13 +681,14 @@ class AccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       accuracy, update_op = metrics.accuracy(labels, predictions, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # if streaming_accuracy does not flatten the weight, accuracy would be
       # 0.33333334 due to an intended broadcast of weight. Due to flattening,
       # it will be higher than .95
       self.assertGreater(update_op.eval(), .95)
       self.assertGreater(accuracy.eval(), .95)
 
+  @test_util.run_deprecated_v1
   def testEffectivelyEquivalentSizesWithDynamicallyShapedWeight(self):
     predictions = ops.convert_to_tensor([1, 1, 1])  # shape 3,
     labels = array_ops.expand_dims(ops.convert_to_tensor([1, 0, 0]),
@@ -666,13 +703,14 @@ class AccuracyTest(test.TestCase):
       accuracy, update_op = metrics.accuracy(labels, predictions,
                                              weights_placeholder)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # if streaming_accuracy does not flatten the weight, accuracy would be
       # 0.33333334 due to an intended broadcast of weight. Due to flattening,
       # it will be higher than .95
       self.assertGreater(update_op.eval(feed_dict=feed_dict), .95)
       self.assertGreater(accuracy.eval(feed_dict=feed_dict), .95)
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithWeightedValues(self):
     with self.cached_session() as sess:
       # Create the queue that populates the predictions.
@@ -704,10 +742,10 @@ class AccuracyTest(test.TestCase):
 
       accuracy, update_op = metrics.accuracy(labels, predictions, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in xrange(3):
-        sess.run(update_op)
-      self.assertEqual(1.0, sess.run(update_op))
+        self.evaluate(update_op)
+      self.assertEqual(1.0, self.evaluate(update_op))
       self.assertEqual(1.0, accuracy.eval())
 
 
@@ -717,12 +755,14 @@ class PrecisionTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.precision(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
     _assert_metric_variables(self, ('precision/false_positives/count:0',
                                     'precision/true_positives/count:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.precision(
@@ -731,6 +771,7 @@ class PrecisionTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.precision(
@@ -739,6 +780,7 @@ class PrecisionTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
@@ -747,17 +789,18 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_precision = precision.eval()
       for _ in range(10):
         self.assertEqual(initial_precision, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -766,10 +809,11 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op))
       self.assertAlmostEqual(1, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrect_multipleInputDtypes(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
       predictions = math_ops.cast(
@@ -779,10 +823,11 @@ class PrecisionTest(test.TestCase):
       precision, update_op = metrics.precision(labels, predictions)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         self.assertAlmostEqual(0.5, update_op.eval())
         self.assertAlmostEqual(0.5, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted1d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
     labels = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
@@ -797,6 +842,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(expected_precision, update_op.eval())
       self.assertAlmostEqual(expected_precision, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testWeightedScalar_placeholders(self):
     predictions = array_ops.placeholder(dtype=dtypes_lib.float32)
     labels = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -816,6 +862,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(
           expected_precision, precision.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testWeighted1d_placeholders(self):
     predictions = array_ops.placeholder(dtype=dtypes_lib.float32)
     labels = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -836,6 +883,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(
           expected_precision, precision.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testWeighted2d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
     labels = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
@@ -852,6 +900,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(expected_precision, update_op.eval())
       self.assertAlmostEqual(expected_precision, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted2d_placeholders(self):
     predictions = array_ops.placeholder(dtype=dtypes_lib.float32)
     labels = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -874,6 +923,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(
           expected_precision, precision.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -882,18 +932,19 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertAlmostEqual(0, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testZeroTrueAndFalsePositivesGivesZeroPrecision(self):
     predictions = constant_op.constant([0, 0, 0, 0])
     labels = constant_op.constant([0, 0, 0, 0])
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(0.0, precision.eval())
 
 
@@ -903,6 +954,7 @@ class RecallTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.recall(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
@@ -910,6 +962,7 @@ class RecallTest(test.TestCase):
         self,
         ('recall/false_negatives/count:0', 'recall/true_positives/count:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.recall(
@@ -918,6 +971,7 @@ class RecallTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.recall(
@@ -926,6 +980,7 @@ class RecallTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
@@ -934,17 +989,18 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_recall = recall.eval()
       for _ in range(10):
         self.assertEqual(initial_recall, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     np_inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -953,10 +1009,11 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(1, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrect_multipleInputDtypes(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
       predictions = math_ops.cast(
@@ -966,10 +1023,11 @@ class RecallTest(test.TestCase):
       recall, update_op = metrics.recall(labels, predictions)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         self.assertAlmostEqual(0.5, update_op.eval())
         self.assertAlmostEqual(0.5, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted1d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
     labels = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
@@ -977,13 +1035,14 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       weighted_tp = 2.0 + 5.0
       weighted_t = (2.0 + 2.0) + (5.0 + 5.0)
       expected_precision = weighted_tp / weighted_t
       self.assertAlmostEqual(expected_precision, update_op.eval())
       self.assertAlmostEqual(expected_precision, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted2d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
     labels = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
@@ -991,13 +1050,14 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       weighted_tp = 3.0 + 1.0
       weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
       expected_precision = weighted_tp / weighted_t
       self.assertAlmostEqual(expected_precision, update_op.eval())
       self.assertAlmostEqual(expected_precision, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testAllIncorrect(self):
     np_inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1006,18 +1066,19 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(0, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testZeroTruePositivesAndFalseNegativesGivesZeroRecall(self):
     predictions = array_ops.zeros((1, 4))
     labels = array_ops.zeros((1, 4))
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(0, recall.eval())
 
 
@@ -1027,6 +1088,7 @@ class AUCTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.auc(predictions=array_ops.ones((10, 1)),
                 labels=array_ops.ones((10, 1)))
@@ -1034,6 +1096,7 @@ class AUCTest(test.TestCase):
                              ('auc/true_positives:0', 'auc/false_negatives:0',
                               'auc/false_positives:0', 'auc/true_negatives:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.auc(predictions=array_ops.ones((10, 1)),
@@ -1041,6 +1104,7 @@ class AUCTest(test.TestCase):
                           metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.auc(predictions=array_ops.ones((10, 1)),
@@ -1048,6 +1112,7 @@ class AUCTest(test.TestCase):
                                updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
@@ -1056,17 +1121,18 @@ class AUCTest(test.TestCase):
     auc, update_op = metrics.auc(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_auc = auc.eval()
       for _ in range(10):
         self.assertAlmostEqual(initial_auc, auc.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     self.allCorrectAsExpected('ROC')
 
@@ -1078,11 +1144,12 @@ class AUCTest(test.TestCase):
       labels = constant_op.constant(inputs)
       auc, update_op = metrics.auc(labels, predictions, curve=curve)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(1, self.evaluate(update_op))
 
       self.assertEqual(1, auc.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrect_multipleLabelDtypes(self):
     with self.cached_session() as sess:
       for label_dtype in (
@@ -1093,11 +1160,12 @@ class AUCTest(test.TestCase):
             constant_op.constant([0, 1, 1, 0], shape=(1, 4)), dtype=label_dtype)
         auc, update_op = metrics.auc(labels, predictions)
 
-        sess.run(variables.local_variables_initializer())
-        self.assertAlmostEqual(0.5, sess.run(update_op))
+        self.evaluate(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.5, self.evaluate(update_op))
 
         self.assertAlmostEqual(0.5, auc.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted1d(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1106,11 +1174,12 @@ class AUCTest(test.TestCase):
       weights = constant_op.constant([2], shape=(1, 1))
       auc, update_op = metrics.auc(labels, predictions, weights=weights)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.5, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.5, self.evaluate(update_op), 5)
 
       self.assertAlmostEqual(0.5, auc.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testWeighted2d(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1119,13 +1188,14 @@ class AUCTest(test.TestCase):
       weights = constant_op.constant([1, 2, 3, 4], shape=(1, 4))
       auc, update_op = metrics.auc(labels, predictions, weights=weights)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.7, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.7, self.evaluate(update_op), 5)
 
       self.assertAlmostEqual(0.7, auc.eval(), 5)
 
   # Regarding the AUC-PR tests: note that the preferred method when
   # calculating AUC-PR is summation_method='careful_interpolation'.
+  @test_util.run_deprecated_v1
   def testCorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1134,12 +1204,13 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='careful_interpolation')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # expected ~= 0.79726744594
       expected = 1 - math.log(1.5) / 2
-      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, self.evaluate(update_op), delta=1e-3)
       self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testCorrectAnotherAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1150,12 +1221,13 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='careful_interpolation')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # expected ~= 0.61350593198
       expected = (2.5 - 2 * math.log(4./3) - 0.25 * math.log(7./5)) / 3
-      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, self.evaluate(update_op), delta=1e-3)
       self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testThirdCorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1166,12 +1238,13 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='careful_interpolation')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # expected ~= 0.90410597584
       expected = 1 - math.log(4./3) / 3
-      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, self.evaluate(update_op), delta=1e-3)
       self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testIncorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1180,11 +1253,12 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='trapezoidal')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.79166, self.evaluate(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testAnotherIncorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1195,11 +1269,12 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='trapezoidal')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.610317, self.evaluate(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testThirdIncorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1210,11 +1285,12 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='trapezoidal')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.90277, self.evaluate(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.90277, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1223,30 +1299,32 @@ class AUCTest(test.TestCase):
       labels = constant_op.constant(1 - inputs, dtype=dtypes_lib.float32)
       auc, update_op = metrics.auc(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0, self.evaluate(update_op))
 
       self.assertAlmostEqual(0, auc.eval())
 
+  @test_util.run_deprecated_v1
   def testZeroTruePositivesAndFalseNegativesGivesOneAUC(self):
     with self.cached_session() as sess:
       predictions = array_ops.zeros([4], dtype=dtypes_lib.float32)
       labels = array_ops.zeros([4])
       auc, update_op = metrics.auc(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op), 6)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op), 6)
 
       self.assertAlmostEqual(1, auc.eval(), 6)
 
+  @test_util.run_deprecated_v1
   def testRecallOneAndPrecisionOneGivesOnePRAUC(self):
     with self.cached_session() as sess:
       predictions = array_ops.ones([4], dtype=dtypes_lib.float32)
       labels = array_ops.ones([4])
       auc, update_op = metrics.auc(labels, predictions, curve='PR')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op), 6)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op), 6)
 
       self.assertAlmostEqual(1, auc.eval(), 6)
 
@@ -1277,6 +1355,7 @@ class AUCTest(test.TestCase):
     tp = np.cumsum(sorted_weights * is_positive) / num_positives
     return np.sum((sorted_weights * tp)[~is_positive]) / num_negatives
 
+  @test_util.run_deprecated_v1
   def testWithMultipleUpdates(self):
     num_samples = 1000
     batch_size = 10
@@ -1317,9 +1396,9 @@ class AUCTest(test.TestCase):
                                      num_thresholds=500,
                                      weights=tf_weights)
 
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         for i in range(num_batches):
-          sess.run(update_op)
+          self.evaluate(update_op)
 
         # Since this is only approximate, we can't expect a 6 digits match.
         # Although with higher number of samples/thresholds we should see the
@@ -1333,6 +1412,7 @@ class SpecificityAtSensitivityTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.specificity_at_sensitivity(
         predictions=array_ops.ones((10, 1)),
@@ -1344,6 +1424,7 @@ class SpecificityAtSensitivityTest(test.TestCase):
                               'specificity_at_sensitivity/false_positives:0',
                               'specificity_at_sensitivity/true_negatives:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.specificity_at_sensitivity(
@@ -1353,6 +1434,7 @@ class SpecificityAtSensitivityTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.specificity_at_sensitivity(
@@ -1362,6 +1444,7 @@ class SpecificityAtSensitivityTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
@@ -1371,17 +1454,18 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_specificity = specificity.eval()
       for _ in range(10):
         self.assertAlmostEqual(initial_specificity, specificity.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1391,10 +1475,11 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(1, self.evaluate(update_op))
       self.assertEqual(1, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrectHighSensitivity(self):
     predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.45, 0.5, 0.8, 0.9]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1406,10 +1491,11 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.8)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1.0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1.0, self.evaluate(update_op))
       self.assertAlmostEqual(1.0, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrectLowSensitivity(self):
     predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.2, 0.2, 0.26, 0.26]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1421,11 +1507,12 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.4)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(0.6, sess.run(update_op))
+      self.assertAlmostEqual(0.6, self.evaluate(update_op))
       self.assertAlmostEqual(0.6, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted1d_multipleLabelDtypes(self):
     for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
       predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.2, 0.2, 0.26, 0.26]
@@ -1440,11 +1527,12 @@ class SpecificityAtSensitivityTest(test.TestCase):
           labels, predictions, weights=weights, sensitivity=0.4)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
 
-        self.assertAlmostEqual(0.6, sess.run(update_op))
+        self.assertAlmostEqual(0.6, self.evaluate(update_op))
         self.assertAlmostEqual(0.6, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted2d(self):
     predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.2, 0.2, 0.26, 0.26]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1458,9 +1546,9 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, weights=weights, sensitivity=0.4)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(8.0 / 15.0, sess.run(update_op))
+      self.assertAlmostEqual(8.0 / 15.0, self.evaluate(update_op))
       self.assertAlmostEqual(8.0 / 15.0, specificity.eval())
 
 
@@ -1470,6 +1558,7 @@ class SensitivityAtSpecificityTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.sensitivity_at_specificity(
         predictions=array_ops.ones((10, 1)),
@@ -1481,6 +1570,7 @@ class SensitivityAtSpecificityTest(test.TestCase):
                               'sensitivity_at_specificity/false_positives:0',
                               'sensitivity_at_specificity/true_negatives:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.sensitivity_at_specificity(
@@ -1490,6 +1580,7 @@ class SensitivityAtSpecificityTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.sensitivity_at_specificity(
@@ -1499,6 +1590,7 @@ class SensitivityAtSpecificityTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
@@ -1508,17 +1600,18 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_sensitivity = sensitivity.eval()
       for _ in range(10):
         self.assertAlmostEqual(initial_sensitivity, sensitivity.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1528,10 +1621,11 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(1, self.evaluate(update_op))
       self.assertEqual(1, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrectHighSpecificity(self):
     predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1543,10 +1637,11 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.8)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.8, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.8, self.evaluate(update_op))
       self.assertAlmostEqual(0.8, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrectLowSpecificity(self):
     predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1558,10 +1653,11 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.4)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.6, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.6, self.evaluate(update_op))
       self.assertAlmostEqual(0.6, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted_multipleLabelDtypes(self):
     for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
       predictions_values = [
@@ -1577,8 +1673,8 @@ class SensitivityAtSpecificityTest(test.TestCase):
           labels, predictions, weights=weights, specificity=0.4)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
-        self.assertAlmostEqual(0.675, sess.run(update_op))
+        self.evaluate(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.675, self.evaluate(update_op))
         self.assertAlmostEqual(0.675, specificity.eval())
 
 
@@ -1589,6 +1685,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.precision_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -1599,6 +1696,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
         'precision_at_thresholds/false_positives:0',
     ))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     prec, _ = metrics.precision_at_thresholds(
@@ -1613,6 +1711,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [prec, rec])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, precision_op = metrics.precision_at_thresholds(
@@ -1628,6 +1727,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
     self.assertListEqual(
         ops.get_collection(my_collection_name), [precision_op, recall_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
@@ -1639,18 +1739,19 @@ class PrecisionRecallThresholdsTest(test.TestCase):
     rec, rec_op = metrics.recall_at_thresholds(labels, predictions, thresholds)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates, then verify idempotency.
-      sess.run([prec_op, rec_op])
+      self.evaluate([prec_op, rec_op])
       initial_prec = prec.eval()
       initial_rec = rec.eval()
       for _ in range(10):
-        sess.run([prec_op, rec_op])
+        self.evaluate([prec_op, rec_op])
         self.assertAllClose(initial_prec, prec.eval())
         self.assertAllClose(initial_rec, rec.eval())
 
   # TODO(nsilberman): fix tests (passing but incorrect).
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1663,12 +1764,13 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertEqual(1, prec.eval())
       self.assertEqual(1, rec.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrect_multipleLabelDtypes(self):
     with self.cached_session() as sess:
       for label_dtype in (
@@ -1683,12 +1785,13 @@ class PrecisionRecallThresholdsTest(test.TestCase):
         rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                    thresholds)
 
-        sess.run(variables.local_variables_initializer())
-        sess.run([prec_op, rec_op])
+        self.evaluate(variables.local_variables_initializer())
+        self.evaluate([prec_op, rec_op])
 
         self.assertAlmostEqual(0.5, prec.eval())
         self.assertAlmostEqual(0.5, rec.eval())
 
+  @test_util.run_deprecated_v1
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1701,12 +1804,13 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(0, prec.eval())
       self.assertAlmostEqual(0, rec.eval())
 
+  @test_util.run_deprecated_v1
   def testWeights1d(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1729,14 +1833,15 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec_low = array_ops.reshape(rec_low, shape=())
       rec_high = array_ops.reshape(rec_high, shape=())
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(1.0, prec_low.eval(), places=5)
       self.assertAlmostEqual(0.0, prec_high.eval(), places=5)
       self.assertAlmostEqual(1.0, rec_low.eval(), places=5)
       self.assertAlmostEqual(0.0, rec_high.eval(), places=5)
 
+  @test_util.run_deprecated_v1
   def testWeights2d(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1759,14 +1864,15 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec_low = array_ops.reshape(rec_low, shape=())
       rec_high = array_ops.reshape(rec_high, shape=())
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(1.0, prec_low.eval(), places=5)
       self.assertAlmostEqual(0.0, prec_high.eval(), places=5)
       self.assertAlmostEqual(1.0, rec_low.eval(), places=5)
       self.assertAlmostEqual(0.0, rec_high.eval(), places=5)
 
+  @test_util.run_deprecated_v1
   def testExtremeThresholds(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1783,14 +1889,15 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       [rec_low, rec_high] = array_ops.split(
           value=rec, num_or_size_splits=2, axis=0)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(0.75, prec_low.eval())
       self.assertAlmostEqual(0.0, prec_high.eval())
       self.assertAlmostEqual(1.0, rec_low.eval())
       self.assertAlmostEqual(0.0, rec_high.eval())
 
+  @test_util.run_deprecated_v1
   def testZeroLabelsPredictions(self):
     with self.cached_session() as sess:
       predictions = array_ops.zeros([4], dtype=dtypes_lib.float32)
@@ -1801,12 +1908,13 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(0, prec.eval(), 6)
       self.assertAlmostEqual(0, rec.eval(), 6)
 
+  @test_util.run_deprecated_v1
   def testWithMultipleUpdates(self):
     num_samples = 1000
     batch_size = 10
@@ -1869,9 +1977,9 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(tf_labels, tf_predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(int(num_samples / batch_size)):
-        sess.run([prec_op, rec_op])
+        self.evaluate([prec_op, rec_op])
       # Since this is only approximate, we can't expect a 6 digits match.
       # Although with higher number of samples/thresholds we should see the
       # accuracy improving
@@ -1989,6 +2097,7 @@ class SingleLabelPrecisionAtKTest(test.TestCase):
     self._test_average_precision_at_k = functools.partial(
         _test_average_precision_at_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_at_k1_nan(self):
     for labels in self._labels:
       # Classes 0,1,2 have 0 predictions, classes -1 and 4 are out of range.
@@ -1998,6 +2107,7 @@ class SingleLabelPrecisionAtKTest(test.TestCase):
         self._test_precision_at_top_k(
             self._predictions_idx, labels, k=1, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_at_k1(self):
     for labels in self._labels:
       # Class 3: 1 label, 2 predictions, 1 correct.
@@ -2025,6 +2135,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
     self._test_average_precision_at_k = functools.partial(
         _test_average_precision_at_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_average_precision(self):
     # Example 1.
     # Matches example here:
@@ -2100,6 +2211,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
           expected=streaming_average_precision[i],
           weights=weights)
 
+  @test_util.run_deprecated_v1
   def test_average_precision_some_labels_out_of_range(self):
     """Tests that labels outside the [0, n_classes) range are ignored."""
     labels_ex1 = (-1, 0, 1, 2, 3, 4, 7)
@@ -2119,6 +2231,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
       self._test_average_precision_at_k(
           predictions, labels, k, expected=avg_precision_ex1[i])
 
+  @test_util.run_deprecated_v1
   def test_three_labels_at_k5_no_predictions(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
@@ -2135,6 +2248,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
         self._test_precision_at_top_k(
             predictions_idx, labels, k=5, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_three_labels_at_k5_no_labels(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
@@ -2151,6 +2265,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
         self._test_precision_at_top_k(
             predictions_idx, labels, k=5, expected=0.0, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_three_labels_at_k5(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
@@ -2184,6 +2299,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=3.0 / 10)
 
+  @test_util.run_deprecated_v1
   def test_three_labels_at_k5_some_out_of_range(self):
     """Tests that labels outside the [0, n_classes) range are ignored."""
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
@@ -2220,6 +2336,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
     self._test_precision_at_top_k(
         predictions_idx, sp_labels, k=5, expected=3.0 / 10)
 
+  @test_util.run_deprecated_v1
   def test_3d_nan(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
@@ -2238,6 +2355,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_3d_no_labels(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
@@ -2256,6 +2374,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=0.0, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_3d(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
@@ -2291,6 +2410,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
     self._test_precision_at_top_k(
         predictions_idx, labels, k=5, expected=7.0 / 20)
 
+  @test_util.run_deprecated_v1
   def test_3d_ignore_some(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
@@ -2432,6 +2552,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
     self._test_recall_at_top_k = functools.partial(
         _test_recall_at_top_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_at_k1_nan(self):
     # Classes 0,1 have 0 labels, 0 predictions, classes -1 and 4 are out of
     # range.
@@ -2442,6 +2563,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
         self._test_recall_at_top_k(
             self._predictions_idx, labels, k=1, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_at_k1_no_predictions(self):
     for labels in self._labels:
       # Class 2: 0 predictions.
@@ -2450,6 +2572,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
       self._test_recall_at_top_k(
           self._predictions_idx, labels, k=1, expected=0.0, class_id=2)
 
+  @test_util.run_deprecated_v1
   def test_one_label_at_k1(self):
     for labels in self._labels:
       # Class 3: 1 label, 2 predictions, 1 correct.
@@ -2463,6 +2586,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
       self._test_recall_at_top_k(
           self._predictions_idx, labels, k=1, expected=1.0 / 2)
 
+  @test_util.run_deprecated_v1
   def test_one_label_at_k1_weighted_class_id3(self):
     predictions = self._predictions
     predictions_idx = self._predictions_idx
@@ -2504,6 +2628,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
           predictions_idx, labels, k=1, expected=2.0 / 2, class_id=3,
           weights=(2.0, 3.0))
 
+  @test_util.run_deprecated_v1
   def test_one_label_at_k1_weighted(self):
     predictions = self._predictions
     predictions_idx = self._predictions_idx
@@ -2553,6 +2678,7 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
     self._test_recall_at_top_k = functools.partial(
         _test_recall_at_top_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_at_k5_nan(self):
     for labels in self._labels:
       # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range.
@@ -2562,6 +2688,7 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
         self._test_recall_at_top_k(
             self._predictions_idx, labels, k=5, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_at_k5_no_predictions(self):
     for labels in self._labels:
       # Class 8: 1 label, no predictions.
@@ -2570,6 +2697,7 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
       self._test_recall_at_top_k(
           self._predictions_idx, labels, k=5, expected=0.0 / 1, class_id=8)
 
+  @test_util.run_deprecated_v1
   def test_at_k5(self):
     for labels in self._labels:
       # Class 2: 2 labels, both correct.
@@ -2595,6 +2723,7 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
       self._test_recall_at_top_k(
           self._predictions_idx, labels, k=5, expected=3.0 / 6)
 
+  @test_util.run_deprecated_v1
   def test_at_k5_some_out_of_range(self):
     """Tests that labels outside the [0, n_classes) count in denominator."""
     labels = sparse_tensor.SparseTensorValue(
@@ -2647,6 +2776,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
     self._test_recall_at_top_k = functools.partial(
         _test_recall_at_top_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_3d_nan(self):
     # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range.
     for class_id in (0, 3, 4, 6, 9, 10):
@@ -2656,6 +2786,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
           self._predictions_idx, self._labels, k=5, expected=NAN,
           class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_3d_no_predictions(self):
     # Classes 1,8 have 0 predictions, >=1 label.
     for class_id in (1, 8):
@@ -2665,6 +2796,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
           self._predictions_idx, self._labels, k=5, expected=0.0,
           class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_3d(self):
     # Class 2: 4 labels, all correct.
     self._test_recall_at_k(
@@ -2693,6 +2825,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
     self._test_recall_at_top_k(
         self._predictions_idx, self._labels, k=5, expected=7.0 / 12)
 
+  @test_util.run_deprecated_v1
   def test_3d_ignore_all(self):
     for class_id in xrange(10):
       self._test_recall_at_k(
@@ -2719,6 +2852,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
         self._predictions_idx, self._labels, k=5, expected=NAN,
         weights=[[0, 0], [0, 0]])
 
+  @test_util.run_deprecated_v1
   def test_3d_ignore_some(self):
     # Class 2: 2 labels, both correct.
     self._test_recall_at_k(
@@ -2774,12 +2908,14 @@ class MeanAbsoluteErrorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_absolute_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
     _assert_metric_variables(
         self, ('mean_absolute_error/count:0', 'mean_absolute_error/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_absolute_error(
@@ -2788,6 +2924,7 @@ class MeanAbsoluteErrorTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_absolute_error(
@@ -2796,23 +2933,25 @@ class MeanAbsoluteErrorTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     error, update_op = metrics.mean_absolute_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights(self):
     predictions = constant_op.constant(
         [2, 4, 6, 8], shape=(1, 4), dtype=dtypes_lib.float32)
@@ -2823,8 +2962,8 @@ class MeanAbsoluteErrorTest(test.TestCase):
     error, update_op = metrics.mean_absolute_error(labels, predictions, weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(3, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(3, self.evaluate(update_op))
       self.assertEqual(3, error.eval())
 
 
@@ -2833,6 +2972,7 @@ class MeanRelativeErrorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_relative_error(
         predictions=array_ops.ones((10, 1)),
@@ -2841,6 +2981,7 @@ class MeanRelativeErrorTest(test.TestCase):
     _assert_metric_variables(
         self, ('mean_relative_error/count:0', 'mean_relative_error/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_relative_error(
@@ -2850,6 +2991,7 @@ class MeanRelativeErrorTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_relative_error(
@@ -2859,6 +3001,7 @@ class MeanRelativeErrorTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
@@ -2867,17 +3010,18 @@ class MeanRelativeErrorTest(test.TestCase):
                                                    normalizer)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateNormalizedByLabels(self):
     np_predictions = np.asarray([2, 4, 6, 8], dtype=np.float32)
     np_labels = np.asarray([1, 3, 2, 3], dtype=np.float32)
@@ -2892,10 +3036,11 @@ class MeanRelativeErrorTest(test.TestCase):
         labels, predictions, normalizer=labels)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(expected_error, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(expected_error, self.evaluate(update_op))
       self.assertEqual(expected_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateNormalizedByZeros(self):
     np_predictions = np.asarray([2, 4, 6, 8], dtype=np.float32)
 
@@ -2908,8 +3053,8 @@ class MeanRelativeErrorTest(test.TestCase):
         labels, predictions, normalizer=array_ops.zeros_like(labels))
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0.0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0.0, self.evaluate(update_op))
       self.assertEqual(0.0, error.eval())
 
 
@@ -2918,12 +3063,14 @@ class MeanSquaredErrorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_squared_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
     _assert_metric_variables(
         self, ('mean_squared_error/count:0', 'mean_squared_error/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_squared_error(
@@ -2932,6 +3079,7 @@ class MeanSquaredErrorTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_squared_error(
@@ -2940,23 +3088,25 @@ class MeanSquaredErrorTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     error, update_op = metrics.mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateZeroError(self):
     predictions = array_ops.zeros((1, 3), dtype=dtypes_lib.float32)
     labels = array_ops.zeros((1, 3), dtype=dtypes_lib.float32)
@@ -2964,10 +3114,11 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
       self.assertEqual(0, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithError(self):
     predictions = constant_op.constant(
         [2, 4, 6], shape=(1, 3), dtype=dtypes_lib.float32)
@@ -2977,10 +3128,11 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(6, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(6, self.evaluate(update_op))
       self.assertEqual(6, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights(self):
     predictions = constant_op.constant(
         [2, 4, 6, 8], shape=(1, 4), dtype=dtypes_lib.float32)
@@ -2991,10 +3143,11 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions, weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(13, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(13, self.evaluate(update_op))
       self.assertEqual(13, error.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleBatchesOfSizeOne(self):
     with self.cached_session() as sess:
       # Create the queue that populates the predictions.
@@ -3013,12 +3166,13 @@ class MeanSquaredErrorTest(test.TestCase):
 
       error, update_op = metrics.mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
-      self.assertAlmostEqual(208.0 / 6, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
+      self.assertAlmostEqual(208.0 / 6, self.evaluate(update_op), 5)
 
       self.assertAlmostEqual(208.0 / 6, error.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testMetricsComputedConcurrently(self):
     with self.cached_session() as sess:
       # Create the queue that populates one set of predictions.
@@ -3054,14 +3208,15 @@ class MeanSquaredErrorTest(test.TestCase):
       mse1, update_op1 = metrics.mean_squared_error(
           labels1, predictions1, name='msd1')
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([update_op0, update_op1])
-      sess.run([update_op0, update_op1])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([update_op0, update_op1])
+      self.evaluate([update_op0, update_op1])
 
-      mse0, mse1 = sess.run([mse0, mse1])
+      mse0, mse1 = self.evaluate([mse0, mse1])
       self.assertAlmostEqual(208.0 / 6, mse0, 5)
       self.assertAlmostEqual(79.0 / 6, mse1, 5)
 
+  @test_util.run_deprecated_v1
   def testMultipleMetricsOnMultipleBatchesOfSizeOne(self):
     with self.cached_session() as sess:
       # Create the queue that populates the predictions.
@@ -3081,9 +3236,9 @@ class MeanSquaredErrorTest(test.TestCase):
       mae, ma_update_op = metrics.mean_absolute_error(labels, predictions)
       mse, ms_update_op = metrics.mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([ma_update_op, ms_update_op])
-      sess.run([ma_update_op, ms_update_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([ma_update_op, ms_update_op])
+      self.evaluate([ma_update_op, ms_update_op])
 
       self.assertAlmostEqual(32.0 / 6, mae.eval(), 5)
       self.assertAlmostEqual(208.0 / 6, mse.eval(), 5)
@@ -3094,6 +3249,7 @@ class RootMeanSquaredErrorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.root_mean_squared_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
@@ -3101,6 +3257,7 @@ class RootMeanSquaredErrorTest(test.TestCase):
         self,
         ('root_mean_squared_error/count:0', 'root_mean_squared_error/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.root_mean_squared_error(
@@ -3109,6 +3266,7 @@ class RootMeanSquaredErrorTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.root_mean_squared_error(
@@ -3117,23 +3275,25 @@ class RootMeanSquaredErrorTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     error, update_op = metrics.root_mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateZeroError(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -3142,11 +3302,12 @@ class RootMeanSquaredErrorTest(test.TestCase):
 
       rmse, update_op = metrics.root_mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
 
       self.assertEqual(0, rmse.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithError(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -3156,10 +3317,11 @@ class RootMeanSquaredErrorTest(test.TestCase):
 
       rmse, update_op = metrics.root_mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAlmostEqual(math.sqrt(6), update_op.eval(), 5)
       self.assertAlmostEqual(math.sqrt(6), rmse.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -3171,8 +3333,8 @@ class RootMeanSquaredErrorTest(test.TestCase):
       rmse, update_op = metrics.root_mean_squared_error(labels, predictions,
                                                         weights)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(math.sqrt(13), sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(math.sqrt(13), self.evaluate(update_op))
 
       self.assertAlmostEqual(math.sqrt(13), rmse.eval(), 5)
 
@@ -3187,6 +3349,7 @@ class MeanCosineDistanceTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_cosine_distance(
         predictions=array_ops.ones((10, 3)),
@@ -3197,6 +3360,7 @@ class MeanCosineDistanceTest(test.TestCase):
         'mean_cosine_distance/total:0',
     ))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_cosine_distance(
@@ -3206,6 +3370,7 @@ class MeanCosineDistanceTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_cosine_distance(
@@ -3215,23 +3380,25 @@ class MeanCosineDistanceTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=1)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateZeroError(self):
     np_labels = np.matrix(('1 0 0;' '0 0 1;' '0 1 0'))
 
@@ -3243,10 +3410,11 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=2)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
       self.assertEqual(0, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithError1(self):
     np_labels = np.matrix(('1 0 0;' '0 0 1;' '0 1 0'))
     np_predictions = np.matrix(('1 0 0;' '0 0 -1;' '1 0 0'))
@@ -3259,10 +3427,11 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=2)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op), 5)
       self.assertAlmostEqual(1, error.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithError2(self):
     np_predictions = np.matrix(
         ('0.819031913261206 0.567041924552012 0.087465312324590;'
@@ -3280,10 +3449,11 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=2)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1.0, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1.0, self.evaluate(update_op), 5)
       self.assertAlmostEqual(1.0, error.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights1(self):
     np_predictions = np.matrix(('1 0 0;' '0 0 -1;' '1 0 0'))
     np_labels = np.matrix(('1 0 0;' '0 0 1;' '0 1 0'))
@@ -3299,10 +3469,11 @@ class MeanCosineDistanceTest(test.TestCase):
         labels, predictions, dim=2, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
       self.assertEqual(0, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights2(self):
     np_predictions = np.matrix(('1 0 0;' '0 0 -1;' '1 0 0'))
     np_labels = np.matrix(('1 0 0;' '0 0 1;' '0 1 0'))
@@ -3318,7 +3489,7 @@ class MeanCosineDistanceTest(test.TestCase):
         labels, predictions, dim=2, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.5, update_op.eval())
       self.assertEqual(1.5, error.eval())
 
@@ -3328,6 +3499,7 @@ class PcntBelowThreshTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.percentage_below(values=array_ops.ones((10,)), threshold=2)
     _assert_metric_variables(self, (
@@ -3335,6 +3507,7 @@ class PcntBelowThreshTest(test.TestCase):
         'percentage_below_threshold/total:0',
     ))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.percentage_below(
@@ -3343,6 +3516,7 @@ class PcntBelowThreshTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.percentage_below(
@@ -3351,6 +3525,7 @@ class PcntBelowThreshTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testOneUpdate(self):
     with self.cached_session() as sess:
       values = constant_op.constant(
@@ -3360,14 +3535,15 @@ class PcntBelowThreshTest(test.TestCase):
       pcnt1, update_op1 = metrics.percentage_below(values, 7, name='medium')
       pcnt2, update_op2 = metrics.percentage_below(values, 1, name='low')
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([update_op0, update_op1, update_op2])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([update_op0, update_op1, update_op2])
 
-      pcnt0, pcnt1, pcnt2 = sess.run([pcnt0, pcnt1, pcnt2])
+      pcnt0, pcnt1, pcnt2 = self.evaluate([pcnt0, pcnt1, pcnt2])
       self.assertAlmostEqual(1.0, pcnt0, 5)
       self.assertAlmostEqual(0.75, pcnt1, 5)
       self.assertAlmostEqual(0.0, pcnt2, 5)
 
+  @test_util.run_deprecated_v1
   def testSomePresentOneUpdate(self):
     with self.cached_session() as sess:
       values = constant_op.constant(
@@ -3382,11 +3558,11 @@ class PcntBelowThreshTest(test.TestCase):
       pcnt2, update_op2 = metrics.percentage_below(
           values, 1, weights=weights, name='low')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertListEqual([1.0, 0.5, 0.0],
-                           sess.run([update_op0, update_op1, update_op2]))
+                           self.evaluate([update_op0, update_op1, update_op2]))
 
-      pcnt0, pcnt1, pcnt2 = sess.run([pcnt0, pcnt1, pcnt2])
+      pcnt0, pcnt1, pcnt2 = self.evaluate([pcnt0, pcnt1, pcnt2])
       self.assertAlmostEqual(1.0, pcnt0, 5)
       self.assertAlmostEqual(0.5, pcnt1, 5)
       self.assertAlmostEqual(0.0, pcnt2, 5)
@@ -3398,6 +3574,7 @@ class MeanIOUTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_iou(
         predictions=array_ops.ones([10, 1]),
@@ -3405,6 +3582,7 @@ class MeanIOUTest(test.TestCase):
         num_classes=2)
     _assert_metric_variables(self, ('mean_iou/total_confusion_matrix:0',))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollections(self):
     my_collection_name = '__metrics__'
     mean_iou, _ = metrics.mean_iou(
@@ -3414,6 +3592,7 @@ class MeanIOUTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean_iou])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_iou(
@@ -3423,12 +3602,14 @@ class MeanIOUTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones([10, 3])
     labels = array_ops.ones([10, 4])
     with self.assertRaises(ValueError):
       metrics.mean_iou(labels, predictions, num_classes=2)
 
+  @test_util.run_deprecated_v1
   def testLabelsAndWeightsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones([10])
     labels = array_ops.ones([10])
@@ -3436,6 +3617,7 @@ class MeanIOUTest(test.TestCase):
     with self.assertRaises(ValueError):
       metrics.mean_iou(labels, predictions, num_classes=2, weights=weights)
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     num_classes = 3
     predictions = random_ops.random_uniform(
@@ -3446,17 +3628,18 @@ class MeanIOUTest(test.TestCase):
         labels, predictions, num_classes=num_classes)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_mean_iou = mean_iou.eval()
       for _ in range(10):
         self.assertEqual(initial_mean_iou, mean_iou.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdates(self):
     num_classes = 3
     with self.cached_session() as sess:
@@ -3482,12 +3665,13 @@ class MeanIOUTest(test.TestCase):
 
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0 / 2.0, 1.0 / 4.0, 0.])
       self.assertEqual(desired_output, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithWeights(self):
     num_classes = 2
     with self.cached_session() as sess:
@@ -3529,10 +3713,11 @@ class MeanIOUTest(test.TestCase):
 
       variables.local_variables_initializer().run()
       for _ in range(6):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([2.0 / 3.0, 1.0 / 2.0])
       self.assertAlmostEqual(desired_output, mean_iou.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithMissingClass(self):
     # Test the case where there are no predicions and labels for
     # one class, and thus there is one row and one column with
@@ -3563,12 +3748,13 @@ class MeanIOUTest(test.TestCase):
 
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0 / 3.0, 2.0 / 4.0])
       self.assertAlmostEqual(desired_output, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testUpdateOpEvalIsAccumulatedConfusionMatrix(self):
     predictions = array_ops.concat(
         [
@@ -3587,32 +3773,35 @@ class MeanIOUTest(test.TestCase):
     num_classes = 2
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       confusion_matrix = update_op.eval()
       self.assertAllEqual([[3, 0], [2, 5]], confusion_matrix)
       desired_miou = np.mean([3. / 5., 5. / 7.])
       self.assertAlmostEqual(desired_miou, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.zeros([40])
     num_classes = 1
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(40, update_op.eval()[0])
       self.assertEqual(1.0, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testAllWrong(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.ones([40])
     num_classes = 2
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[0, 0], [40, 0]], update_op.eval())
       self.assertEqual(0., miou.eval())
 
+  @test_util.run_deprecated_v1
   def testResultsWithSomeMissing(self):
     predictions = array_ops.concat(
         [
@@ -3640,11 +3829,12 @@ class MeanIOUTest(test.TestCase):
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(
           labels, predictions, num_classes, weights=weights)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[2, 0], [2, 4]], update_op.eval())
       desired_miou = np.mean([2. / 4., 4. / 6.])
       self.assertAlmostEqual(desired_miou, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testMissingClassInLabels(self):
     labels = constant_op.constant([
         [[0, 0, 1, 1, 0, 0],
@@ -3659,22 +3849,24 @@ class MeanIOUTest(test.TestCase):
     num_classes = 3
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[7, 4, 3], [3, 5, 2], [0, 0, 0]], update_op.eval())
       self.assertAlmostEqual(
           1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 / (0 + 5 + 0)),
           miou.eval())
 
+  @test_util.run_deprecated_v1
   def testMissingClassOverallSmall(self):
     labels = constant_op.constant([0])
     predictions = constant_op.constant([0])
     num_classes = 2
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[1, 0], [0, 0]], update_op.eval())
       self.assertAlmostEqual(1, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testMissingClassOverallLarge(self):
     labels = constant_op.constant([
         [[0, 0, 1, 1, 0, 0],
@@ -3689,7 +3881,7 @@ class MeanIOUTest(test.TestCase):
     num_classes = 3
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[9, 5, 0], [3, 7, 0], [0, 0, 0]], update_op.eval())
       self.assertAlmostEqual(
           1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)), miou.eval())
@@ -3701,6 +3893,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_per_class_accuracy(
         predictions=array_ops.ones([10, 1]),
@@ -3709,6 +3902,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     _assert_metric_variables(self, ('mean_accuracy/count:0',
                                     'mean_accuracy/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollections(self):
     my_collection_name = '__metrics__'
     mean_accuracy, _ = metrics.mean_per_class_accuracy(
@@ -3719,6 +3913,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     self.assertListEqual(
         ops.get_collection(my_collection_name), [mean_accuracy])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_per_class_accuracy(
@@ -3728,12 +3923,14 @@ class MeanPerClassAccuracyTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones([10, 3])
     labels = array_ops.ones([10, 4])
     with self.assertRaises(ValueError):
       metrics.mean_per_class_accuracy(labels, predictions, num_classes=2)
 
+  @test_util.run_deprecated_v1
   def testLabelsAndWeightsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones([10])
     labels = array_ops.ones([10])
@@ -3742,6 +3939,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
       metrics.mean_per_class_accuracy(
           labels, predictions, num_classes=2, weights=weights)
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     num_classes = 3
     predictions = random_ops.random_uniform(
@@ -3752,11 +3950,11 @@ class MeanPerClassAccuracyTest(test.TestCase):
         labels, predictions, num_classes=num_classes)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_mean_accuracy = mean_accuracy.eval()
@@ -3788,12 +3986,13 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0, 1.0 / 3.0, 0.0])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithWeights(self):
     num_classes = 2
     with self.cached_session() as sess:
@@ -3835,10 +4034,11 @@ class MeanPerClassAccuracyTest(test.TestCase):
 
       variables.local_variables_initializer().run()
       for _ in range(6):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([2.0 / 2.0, 0.5 / 1.5])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithMissingClass(self):
     # Test the case where there are no predicions and labels for
     # one class, and thus there is one row and one column with
@@ -3870,12 +4070,13 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0 / 2.0, 2.0 / 3.0, 0.])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.zeros([40])
@@ -3883,10 +4084,11 @@ class MeanPerClassAccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.0, update_op.eval()[0])
       self.assertEqual(1.0, mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testAllWrong(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.ones([40])
@@ -3894,10 +4096,11 @@ class MeanPerClassAccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([0.0, 0.0], update_op.eval())
       self.assertEqual(0., mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testResultsWithSomeMissing(self):
     predictions = array_ops.concat([
         constant_op.constant(0, shape=[5]), constant_op.constant(1, shape=[5])
@@ -3913,7 +4116,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes, weights=weights)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       desired_accuracy = np.array([2. / 2., 4. / 6.], dtype=np.float32)
       self.assertAllEqual(desired_accuracy, update_op.eval())
       desired_mean_accuracy = np.mean(desired_accuracy)
@@ -3926,12 +4129,14 @@ class FalseNegativesTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.false_negatives(
         labels=(0, 1, 0, 1),
         predictions=(0, 0, 1, 1))
     _assert_metric_variables(self, ('false_negatives/count:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -3945,11 +4150,12 @@ class FalseNegativesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(3., tn_update_op.eval())
       self.assertAllClose(3., tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -3964,7 +4170,7 @@ class FalseNegativesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(5., tn_update_op.eval())
       self.assertAllClose(5., tn.eval())
@@ -3976,6 +4182,7 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.false_negatives_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -3983,6 +4190,7 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
     _assert_metric_variables(self, ('false_negatives/false_negatives:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -3994,11 +4202,12 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), fn.eval())
       self.assertAllEqual((0, 2, 3), fn_update_op.eval())
       self.assertAllEqual((0, 2, 3), fn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4013,7 +4222,7 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), fn.eval())
       self.assertAllEqual((0.0, 8.0, 11.0), fn_update_op.eval())
       self.assertAllEqual((0.0, 8.0, 11.0), fn.eval())
@@ -4025,12 +4234,14 @@ class FalsePositivesTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.false_positives(
         labels=(0, 1, 0, 1),
         predictions=(0, 0, 1, 1))
     _assert_metric_variables(self, ('false_positives/count:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4044,11 +4255,12 @@ class FalsePositivesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(7., tn_update_op.eval())
       self.assertAllClose(7., tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4063,7 +4275,7 @@ class FalsePositivesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(14., tn_update_op.eval())
       self.assertAllClose(14., tn.eval())
@@ -4075,6 +4287,7 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.false_positives_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -4082,6 +4295,7 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
     _assert_metric_variables(self, ('false_positives/false_positives:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4093,11 +4307,12 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), fp.eval())
       self.assertAllEqual((7, 4, 2), fp_update_op.eval())
       self.assertAllEqual((7, 4, 2), fp.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4114,7 +4329,7 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), fp.eval())
       self.assertAllEqual((125.0, 42.0, 12.0), fp_update_op.eval())
       self.assertAllEqual((125.0, 42.0, 12.0), fp.eval())
@@ -4126,12 +4341,14 @@ class TrueNegativesTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.true_negatives(
         labels=(0, 1, 0, 1),
         predictions=(0, 0, 1, 1))
     _assert_metric_variables(self, ('true_negatives/count:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4145,11 +4362,12 @@ class TrueNegativesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(3., tn_update_op.eval())
       self.assertAllClose(3., tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4164,7 +4382,7 @@ class TrueNegativesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(4., tn_update_op.eval())
       self.assertAllClose(4., tn.eval())
@@ -4176,6 +4394,7 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.true_negatives_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -4183,6 +4402,7 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
     _assert_metric_variables(self, ('true_negatives/true_negatives:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4194,11 +4414,12 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), tn.eval())
       self.assertAllEqual((2, 5, 7), tn_update_op.eval())
       self.assertAllEqual((2, 5, 7), tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4213,7 +4434,7 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), tn.eval())
       self.assertAllEqual((5.0, 15.0, 23.0), tn_update_op.eval())
       self.assertAllEqual((5.0, 15.0, 23.0), tn.eval())
@@ -4225,12 +4446,14 @@ class TruePositivesTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.true_positives(
         labels=(0, 1, 0, 1),
         predictions=(0, 0, 1, 1))
     _assert_metric_variables(self, ('true_positives/count:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4244,11 +4467,12 @@ class TruePositivesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(7., tn_update_op.eval())
       self.assertAllClose(7., tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4263,7 +4487,7 @@ class TruePositivesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(12., tn_update_op.eval())
       self.assertAllClose(12., tn.eval())
@@ -4275,6 +4499,7 @@ class TruePositivesAtThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.true_positives_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -4282,6 +4507,7 @@ class TruePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
     _assert_metric_variables(self, ('true_positives/true_positives:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4293,11 +4519,12 @@ class TruePositivesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), tp.eval())
       self.assertAllEqual((3, 1, 0), tp_update_op.eval())
       self.assertAllEqual((3, 1, 0), tp.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4310,7 +4537,7 @@ class TruePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), tp.eval())
       self.assertAllEqual((111.0, 37.0, 0.0), tp_update_op.eval())
       self.assertAllEqual((111.0, 37.0, 0.0), tp.eval())
diff --git a/tensorflow/python/kernel_tests/morphological_ops_test.py b/tensorflow/python/kernel_tests/morphological_ops_test.py
index ce4d8acfbd4f4a806b57431cd77781a55a42bec9..f54aaf30d0a928f2ff5f86ec1ec07658f272f8f7 100644
--- a/tensorflow/python/kernel_tests/morphological_ops_test.py
+++ b/tensorflow/python/kernel_tests/morphological_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -44,7 +45,7 @@ class DilationTest(test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       out_tensor = nn_ops.dilation2d(
           constant_op.constant(image),
           constant_op.constant(kernel),
@@ -52,7 +53,7 @@ class DilationTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="dilation2d")
-      self.assertAllClose(out, out_tensor.eval())
+      self.assertAllClose(out, self.evaluate(out_tensor))
 
   def _testDilationValidPadding(self, use_gpu):
     # [1, 2, 2, 1]
@@ -204,7 +205,7 @@ class DilationTest(test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       image_tensor = constant_op.constant(
           image, shape=image_shape, name="input")
       kernel_tensor = constant_op.constant(
@@ -216,7 +217,7 @@ class DilationTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="dilation2d")
-      out_shape = out_tensor.eval().shape
+      out_shape = self.evaluate(out_tensor).shape
 
       # Small delta is necessary for argmax to remain the same.
       err = gradient_checker.compute_gradient_error(
@@ -291,6 +292,7 @@ class DilationTest(test.TestCase):
         padding="SAME",
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testDilationGrad(self):
     for use_gpu in True, False:
       self._testDilationGradValidPadding_1x1x1(use_gpu)
@@ -319,7 +321,7 @@ class ErosionTest(test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       out_tensor = nn_ops.erosion2d(
           constant_op.constant(image),
           constant_op.constant(kernel),
@@ -327,7 +329,7 @@ class ErosionTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="erosion2d")
-      self.assertAllClose(out, out_tensor.eval())
+      self.assertAllClose(out, self.evaluate(out_tensor))
 
   def _testErosionValidPadding(self, use_gpu):
     # [1, 2, 2, 1]
@@ -479,7 +481,7 @@ class ErosionTest(test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       image_tensor = constant_op.constant(
           image, shape=image_shape, name="input")
       kernel_tensor = constant_op.constant(
@@ -491,7 +493,7 @@ class ErosionTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="erosion2d")
-      out_shape = out_tensor.eval().shape
+      out_shape = self.evaluate(out_tensor).shape
 
       # Small delta is necessary for argmax to remain the same.
       err = gradient_checker.compute_gradient_error(
@@ -566,6 +568,7 @@ class ErosionTest(test.TestCase):
         padding="SAME",
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testErosionGrad(self):
     for use_gpu in True, False:
       self._testErosionGradValidPadding_1x1x1(use_gpu)
diff --git a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
index 3cbbd48c8cb26d5cdb457c9599bfc9131000d174..380d2860da4771faf1c22fe870e38b8c13edd896 100644
--- a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
@@ -114,7 +115,7 @@ class DepthwiseConv2DTest(test.TestCase):
     # Initializes the input and filter tensor with numbers incrementing from 1.
     x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
     x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       with sess.graph._kernel_label_map({"DepthwiseConv2dNative": "neon"}):
         t1 = constant_op.constant(x1, shape=tensor_in_sizes)
         t1.set_shape(tensor_in_sizes)
@@ -142,8 +143,8 @@ class DepthwiseConv2DTest(test.TestCase):
       conv_interface = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
 
-      native_result = sess.run(conv_native)
-      interface_result = sess.run(conv_interface)
+      native_result = self.evaluate(conv_native)
+      interface_result = self.evaluate(conv_interface)
 
     print("depthwise conv_2d: ", tensor_in_sizes, "*", filter_in_sizes,
           ", stride:", stride, ", padding: ", padding, ", max diff: ",
@@ -153,6 +154,7 @@ class DepthwiseConv2DTest(test.TestCase):
     self.assertShapeEqual(native_result, conv_native)
     self.assertShapeEqual(native_result, conv_interface)
 
+  @test_util.run_deprecated_v1
   def testDepthwiseConv2D(self):
     for index, (input_size, filter_size, _, stride,
                 padding) in enumerate(ConfigsToTest()):
@@ -204,18 +206,19 @@ class DepthwiseConv2DTest(test.TestCase):
     # numbers from 1.
     x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
     x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       with sess.graph._kernel_label_map({"DepthwiseConv2dNative": "neon"}):
         t1 = constant_op.constant(x1, shape=tensor_in_sizes)
         t1.set_shape(tensor_in_sizes)
         t2 = constant_op.constant(x2, shape=filter_in_sizes)
         conv = nn_ops.depthwise_conv2d_native(
             t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        value = sess.run(conv)
+        value = self.evaluate(conv)
     print("value = ", value)
     self.assertAllClose(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
+  @test_util.run_deprecated_v1
   def testConv2D2x2Filter(self):
     # The inputs look like this (it's a 3 x 2 matrix, each of depth 2):
     #
diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index 3f71b326a2fcb8accfd3182ce5d42f30aa2c74b4..20b9ad95c8be7aa59a2a1b70d59341e2f3ec8fa4 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.platform import test as test_lib
@@ -35,6 +36,7 @@ def _AddTest(test, test_name, fn):
 
 class NormOpTest(test_lib.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBadOrder(self):
     matrix = [[0., 1.], [2., 3.]]
     for ord_ in "fro", -7, -1.1, 0:
@@ -52,6 +54,7 @@ class NormOpTest(test_lib.TestCase):
                                    "'ord' must be a supported matrix norm"):
         linalg_ops.norm(matrix, ord=ord_, axis=[-2, -1])
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidAxis(self):
     matrix = [[0., 1.], [2., 3.]]
     for axis_ in [], [1, 2, 3], [[1]], [[1], [2]], [3.1415], [1, 1]:
@@ -65,12 +68,12 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
 
   def _CompareNorm(self, matrix):
     np_norm = np.linalg.norm(matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       if use_static_shape_:
         tf_matrix = constant_op.constant(matrix)
         tf_norm = linalg_ops.norm(
             tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
-        tf_norm_val = sess.run(tf_norm)
+        tf_norm_val = self.evaluate(tf_norm)
       else:
         tf_matrix = array_ops.placeholder(dtype_)
         tf_norm = linalg_ops.norm(
@@ -78,6 +81,7 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
         tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix})
     self.assertAllClose(np_norm, tf_norm_val, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     is_matrix_norm = (isinstance(axis_, tuple) or
                       isinstance(axis_, list)) and len(axis_) == 2
diff --git a/tensorflow/python/kernel_tests/nth_element_op_test.py b/tensorflow/python/kernel_tests/nth_element_op_test.py
index 1b8f02140fb5d531c7c1ab2ea6a5fc0b00e5d259..4be78b2d5ca57d96e691215cf0f17f3a48fce130 100644
--- a/tensorflow/python/kernel_tests/nth_element_op_test.py
+++ b/tensorflow/python/kernel_tests/nth_element_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
@@ -32,10 +33,10 @@ class NthElementTest(test.TestCase):
 
   def _validateNthElement(self, inputs, dtype, n, reverse, expected_values):
     np_expected_values = np.array(expected_values)
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       inputs_op = ops.convert_to_tensor(inputs, dtype=dtype)
       values_op = nn_ops.nth_element(inputs_op, n, reverse=reverse)
-      values = sess.run(values_op)
+      values = self.evaluate(values_op)
 
       self.assertShapeEqual(np_expected_values, values_op)
       self.assertAllClose(np_expected_values, values)
@@ -111,17 +112,20 @@ class NthElementTest(test.TestCase):
     self._testEnumerateN([10, 10, 10])
     self._testEnumerateN([10, 10, 10, 10])
 
+  @test_util.run_deprecated_v1
   def testInvalidInput(self):
     with self.assertRaisesRegexp(ValueError,
                                  "at least rank 1 but is rank 0"):
       nn_ops.nth_element(5, 0)
 
+  @test_util.run_deprecated_v1
   def testInvalidInputAtEval(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       v = array_ops.placeholder(dtype=dtypes.float32)
       with self.assertRaisesOpError("Input must be >= 1-D"):
         nn_ops.nth_element(v, 0).eval(feed_dict={v: 5.0})
 
+  @test_util.run_deprecated_v1
   def testInvalidN(self):
     with self.assertRaisesRegexp(ValueError,
                                  "non-negative but is -1"):
@@ -130,30 +134,34 @@ class NthElementTest(test.TestCase):
                                  "scalar but has rank 1"):
       nn_ops.nth_element([5, 6, 3], [1])
 
+  @test_util.run_deprecated_v1
   def testInvalidNAtEval(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       n = array_ops.placeholder(dtypes.int32)
       values = nn_ops.nth_element(inputs, n)
       with self.assertRaisesOpError("Need n >= 0, got -7"):
         values.eval(feed_dict={n: -7})
 
+  @test_util.run_deprecated_v1
   def testNTooLarge(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
     with self.assertRaisesRegexp(ValueError,
                                  "must have last dimension > n = 2"):
       nn_ops.nth_element(inputs, 2)
 
+  @test_util.run_deprecated_v1
   def testNTooLargeAtEval(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       n = array_ops.placeholder(dtypes.int32)
       values = nn_ops.nth_element(inputs, n)
       with self.assertRaisesOpError(r"Input must have at least n\+1 columns"):
         values.eval(feed_dict={n: 2})
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       inputs = array_ops.placeholder(dtypes.float32, shape=[3, 5])
       values = nn_ops.nth_element(inputs, 3)
       grad = sess.run(
diff --git a/tensorflow/python/kernel_tests/numerics_test.py b/tensorflow/python/kernel_tests/numerics_test.py
index 6cc70f7c8930ffe0a55ad97ec13eefe00ffd21b4..f13f9d68062e7874222b5bc67d6fcc8378af0714 100644
--- a/tensorflow/python/kernel_tests/numerics_test.py
+++ b/tensorflow/python/kernel_tests/numerics_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -35,11 +36,11 @@ class VerifyTensorAllFiniteTest(test.TestCase):
   def testVerifyTensorAllFiniteSucceeds(self):
     x_shape = [5, 4]
     x = np.random.random_sample(x_shape).astype(np.float32)
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
       t_verified = numerics.verify_tensor_all_finite(t,
                                                      "Input is not a number.")
-      self.assertAllClose(x, t_verified.eval())
+      self.assertAllClose(x, self.evaluate(t_verified))
 
   def testVerifyTensorAllFiniteFails(self):
     x_shape = [5, 4]
@@ -48,21 +49,22 @@ class VerifyTensorAllFiniteTest(test.TestCase):
 
     # Test NaN.
     x[0] = np.nan
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       with self.assertRaisesOpError(my_msg):
         t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
         t_verified = numerics.verify_tensor_all_finite(t, my_msg)
-        t_verified.eval()
+        self.evaluate(t_verified)
 
     # Test Inf.
     x[0] = np.inf
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       with self.assertRaisesOpError(my_msg):
         t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
         t_verified = numerics.verify_tensor_all_finite(t, my_msg)
-        t_verified.eval()
+        self.evaluate(t_verified)
 
 
+@test_util.run_v1_only("b/120545219")
 class NumericsTest(test.TestCase):
 
   def testInf(self):
@@ -73,7 +75,7 @@ class NumericsTest(test.TestCase):
       check = numerics.add_check_numerics_ops()
       a = control_flow_ops.with_dependencies([check], a)
       with self.assertRaisesOpError("Inf"):
-        a.eval()
+        self.evaluate(a)
 
   def testNaN(self):
     with self.session(graph=ops.Graph()):
@@ -83,7 +85,7 @@ class NumericsTest(test.TestCase):
       check = numerics.add_check_numerics_ops()
       a = control_flow_ops.with_dependencies([check], a)
       with self.assertRaisesOpError("NaN"):
-        a.eval()
+        self.evaluate(a)
 
   def testBoth(self):
     with self.session(graph=ops.Graph()):
@@ -93,13 +95,13 @@ class NumericsTest(test.TestCase):
       check = numerics.add_check_numerics_ops()
       a = control_flow_ops.with_dependencies([check], a)
       with self.assertRaisesOpError("Inf and NaN"):
-        a.eval()
+        self.evaluate(a)
 
   def testPassThrough(self):
     with self.session(graph=ops.Graph()):
       t1 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
       checked = array_ops.check_numerics(t1, message="pass through test")
-      value = checked.eval()
+      value = self.evaluate(checked)
       self.assertAllEqual(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), value)
       self.assertEqual([2, 3], checked.get_shape())
 
diff --git a/tensorflow/python/kernel_tests/one_hot_op_test.py b/tensorflow/python/kernel_tests/one_hot_op_test.py
index b449a195a7a5eaad2e06c1a13f2856af9de012f0..856ba7bb7f3c5fb340a80c88b7c4ff2c33277568 100644
--- a/tensorflow/python/kernel_tests/one_hot_op_test.py
+++ b/tensorflow/python/kernel_tests/one_hot_op_test.py
@@ -34,19 +34,19 @@ class OneHotTest(test.TestCase):
                   expected_err_re=None,
                   raises=None,
                   **inputs):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       if raises is not None:
         with self.assertRaises(raises):
           array_ops.one_hot(**inputs)
       else:
         ans = array_ops.one_hot(**inputs)
         if expected_err_re is None:
-          tf_ans = ans.eval()
+          tf_ans = self.evaluate(ans)
           self.assertAllEqual(tf_ans, truth)
           self.assertEqual(tf_ans.shape, ans.get_shape())
         else:
           with self.assertRaisesOpError(expected_err_re):
-            ans.eval()
+            self.evaluate(ans)
 
   def _testBothOneHot(self, truth, expected_err_re=None, raises=None, **inputs):
     self._testOneHot(truth, True, expected_err_re, raises, **inputs)
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index e415d7879e8791edb9afa63cd442a9a3b8fa0931..7b1b054ae0656ef8ae988c1a3220a2a643afbcab 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -85,15 +86,15 @@ class PadOpTest(test.TestCase):
   def _testPad(self, np_inputs, paddings, mode, constant_values):
     np_val = self._npPad(np_inputs, paddings, mode=mode,
                          constant_values=constant_values)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_val = array_ops.pad(np_inputs, paddings, mode=mode,
                              constant_values=constant_values)
-      out = tf_val.eval()
+      out = self.evaluate(tf_val)
     self.assertAllEqual(np_val, out)
     self.assertShapeEqual(np_val, tf_val)
 
   def _testGradient(self, x, a, mode, constant_values):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       inx = ops.convert_to_tensor(x)
       xs = list(x.shape)
       ina = ops.convert_to_tensor(a)
@@ -116,56 +117,63 @@ class PadOpTest(test.TestCase):
           self._testGradient(np_inputs, paddings, mode=mode,
                              constant_values=constant_values)
 
+  @test_util.run_deprecated_v1
   def testInputDims(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
         array_ops.pad(array_ops.reshape(
             [1, 2], shape=[1, 2, 1, 1, 1, 1]),
                       array_ops.reshape(
                           [1, 2], shape=[1, 2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsDim(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
         array_ops.pad(array_ops.reshape(
             [1, 2], shape=[1, 2]),
                       array_ops.reshape(
                           [1, 2], shape=[2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsDim2(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
         array_ops.pad(array_ops.reshape(
             [1, 2], shape=[1, 2]),
                       array_ops.reshape(
                           [1, 2], shape=[2, 1]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsDim3(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
         array_ops.pad(array_ops.reshape(
             [1, 2], shape=[1, 2]),
                       array_ops.reshape(
                           [1, 2], shape=[1, 2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsDim4(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
         array_ops.pad(array_ops.reshape(
             [1, 2], shape=[1, 2]),
                       array_ops.reshape(
                           [1, 2, 3, 4, 5, 6], shape=[3, 2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsNonNegative(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaisesRegexp(ValueError, "must be non-negative"):
         array_ops.pad(constant_op.constant(
             [1], shape=[1]),
                       constant_op.constant(
                           [-1, 0], shape=[1, 2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsNonNegative2(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaisesRegexp(ValueError, "must be non-negative"):
         array_ops.pad(constant_op.constant(
             [1], shape=[1]),
@@ -173,7 +181,7 @@ class PadOpTest(test.TestCase):
                           [-1, 0], shape=[1, 2]))
 
   def testPaddingsMaximum(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaises(Exception):
         array_ops.pad(constant_op.constant(
             [1], shape=[2]),
@@ -203,12 +211,12 @@ class PadOpTest(test.TestCase):
                              paddings,
                              mode=mode,
                              constant_values=0)
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           tf_val = array_ops.pad(inputs,
                                  constant_op.constant(paddings, padding_dtype),
                                  mode=mode,
                                  constant_values=0)
-          out = tf_val.eval()
+          out = self.evaluate(tf_val)
         self.assertAllEqual(np_val, out)
         self.assertShapeEqual(np_val, tf_val)
 
@@ -223,6 +231,7 @@ class PadOpTest(test.TestCase):
           np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t),
           [[0, 0], [0, 0], [0, 0], [0, 0]], -123)
 
+  @test_util.run_deprecated_v1
   def testFloatTypes(self):
     for t in [np.float32, np.float64]:
       self._testAll(np.random.rand(2, 5).astype(t), [[1, 0], [2, 0]], 0.0)
@@ -249,18 +258,19 @@ class PadOpTest(test.TestCase):
                             constant_values="PAD")
     symmetric = array_ops.pad(x, [[1, 0], [0, 1]], mode="SYMMETRIC",
                               constant_values="PAD")
-    with self.test_session(use_gpu=True):
-      self.assertAllEqual([[b"PAD", b"PAD", b"PAD"],
-                           [b"Hello", b"World", b"PAD"],
-                           [b"Goodnight", b"Moon", b"PAD"]], constant.eval())
+    with self.session(use_gpu=True):
+      self.assertAllEqual(
+          [[b"PAD", b"PAD", b"PAD"], [b"Hello", b"World", b"PAD"],
+           [b"Goodnight", b"Moon", b"PAD"]], self.evaluate(constant))
       self.assertAllEqual([[b"Goodnight", b"Moon", b"Goodnight"],
                            [b"Hello", b"World", b"Hello"],
                            [b"Goodnight", b"Moon", b"Goodnight"]],
-                          reflect.eval())
-      self.assertAllEqual([[b"Hello", b"World", b"World"],
-                           [b"Hello", b"World", b"World"],
-                           [b"Goodnight", b"Moon", b"Moon"]], symmetric.eval())
+                          self.evaluate(reflect))
+      self.assertAllEqual(
+          [[b"Hello", b"World", b"World"], [b"Hello", b"World", b"World"],
+           [b"Goodnight", b"Moon", b"Moon"]], self.evaluate(symmetric))
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # Unknown paddings shape.
     inp = constant_op.constant(0.0, shape=[4, 4, 4, 4])
@@ -277,6 +287,7 @@ class PadOpTest(test.TestCase):
     padded = array_ops.pad(inp, array_ops.placeholder(dtypes.int32))
     self.assertAllEqual(None, padded.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testPartialShapeInformation(self):
     unknown = array_ops.placeholder(dtypes.int32)
 
@@ -325,9 +336,9 @@ class PadOpTest(test.TestCase):
   def testScalars(self):
     paddings = np.zeros((0, 2), dtype=np.int32)
     inp = np.asarray(7)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       tf_val = array_ops.pad(inp, paddings)
-      out = tf_val.eval()
+      out = self.evaluate(tf_val)
     self.assertAllEqual(inp, out)
     self.assertShapeEqual(inp, tf_val)
 
@@ -335,12 +346,13 @@ class PadOpTest(test.TestCase):
     for dtype in [dtypes.int32, dtypes.int64]:
       paddings = np.zeros((0, 2))
       inp = np.asarray(7)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         tf_val = array_ops.pad(inp, constant_op.constant(paddings, dtype=dtype))
-        out = tf_val.eval()
+        out = self.evaluate(tf_val)
       self.assertAllEqual(inp, out)
       self.assertShapeEqual(inp, tf_val)
 
+  @test_util.run_deprecated_v1
   def testCollapseAdjacentNonPaddedDimensions(self):
     # pyformat: disable
     paddings_values = [[[0, 0], [0, 0], [0, 0], [0, 1]],
@@ -360,12 +372,13 @@ class PadOpTest(test.TestCase):
             padded,
             [paddings_value[i][0] + inp.shape.dims[i].value for i in range(4)],
             [-1, -1, -1, -1])
-        with self.test_session(use_gpu=True):
-          self.assertAllEqual(inp.eval(), middle.eval())
+        with self.cached_session(use_gpu=True):
+          self.assertAllEqual(inp.eval(), self.evaluate(middle))
           self.assertAllEqual(
-              np.zeros([row[0] for row in paddings_value]), left.eval())
+              np.zeros([row[0] for row in paddings_value]), self.evaluate(left))
           self.assertAllEqual(
-              np.zeros([row[1] for row in paddings_value]), right.eval())
+              np.zeros([row[1] for row in paddings_value]),
+              self.evaluate(right))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
index 95f3dcceeaa14909b706b1f1c0676c5df28b8427..e3999695d0605f49d1440c3305f020e4871940a3 100644
--- a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
@@ -29,11 +29,13 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class PaddingFIFOQueueTest(test.TestCase):
 
   def testConstructor(self):
@@ -126,7 +128,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [
           self.checkedThread(
@@ -158,7 +160,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.append(sess.run(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in enqueue_ops]
       for thread in threads:
@@ -178,7 +180,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
@@ -193,13 +195,13 @@ class PaddingFIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         for enqueue_op in enqueue_ops:
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       results = []
 
       def dequeue():
         for _ in xrange(len(elems)):
-          results.append(sess.run(dequeued_t))
+          results.append(self.evaluate(dequeued_t))
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -224,7 +226,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        x_val, y_val = sess.run(dequeued_t)
+        x_val, y_val = self.evaluate(dequeued_t)
         x, y = elems[i]
         self.assertEqual([x], x_val)
         self.assertEqual([y], y_val)
@@ -243,9 +245,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual(1, size.eval())
+      self.assertEqual(1, self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual(0, size.eval())
+      self.assertEqual(0, self.evaluate(size))
 
   def testEnqueueMany(self):
     with self.cached_session():
@@ -257,7 +259,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i % 4]], vals)
 
   def testEmptyEnqueueMany(self):
@@ -269,9 +271,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue_many((empty_t,))
       size_t = q.size()
 
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
       enqueue_op.run()
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
 
   def testEmptyDequeueMany(self):
     with self.cached_session():
@@ -279,9 +281,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueManyWithDynamicShape(self):
     with self.cached_session():
@@ -290,9 +292,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue(([10.0],))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueUpToWithDynamicShape(self):
     with self.cached_session():
@@ -301,9 +303,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue(([10.0],))
       dequeued_t = q.dequeue_up_to(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testConstructPaddingFIFOQueueWithNoShape(self):
     with self.cached_session():
@@ -327,7 +329,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         self.assertEqual(float_elems[i % 4], float_val)
         self.assertAllEqual(int_elems[i % 4], int_val)
 
@@ -344,7 +346,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         self.assertEqual(float_elems[i % 4], float_val)
         self.assertAllEqual(int_elems[i % 4], int_val)
 
@@ -357,8 +359,8 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testDequeueUpToNoBlocking(self):
     with self.cached_session():
@@ -369,8 +371,8 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testMultiDequeueMany(self):
     with self.cached_session() as sess:
@@ -387,17 +389,17 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertEqual(float_val.shape, dequeued_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_t[1].get_shape())
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual(float_elems[8], float_val)
       self.assertAllEqual(int_elems[8], int_val)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
@@ -418,7 +420,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertTrue(
@@ -428,11 +430,11 @@ class PaddingFIFOQueueTest(test.TestCase):
           tensor_shape.TensorShape(int_val.shape).is_compatible_with(dequeued_t[
               1].get_shape()))
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual(float_elems[8], float_val)
       self.assertAllEqual(int_elems[8], int_val)
       self.assertTrue(
@@ -459,7 +461,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       for enqueue_op in enqueue_ops:
         enqueue_op.run()
-      string_val, int_val = sess.run(dequeued_t)
+      string_val, int_val = self.evaluate(dequeued_t)
 
       self.assertAllEqual([[b"a", b"", b""], [b"ab", b"", b""],
                            [b"abc", b"", b""], [b"abc", b"d", b""],
@@ -473,7 +475,7 @@ class PaddingFIFOQueueTest(test.TestCase):
           tensor_shape.TensorShape(int_val.shape).is_compatible_with(dequeued_t[
               1].get_shape()))
 
-      string_val, int_val = sess.run(dequeued_single_t)
+      string_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual([b"abc", b"d", b"e", b"f"], string_val)
       self.assertAllEqual([[1, 2, 3, 4]], int_val)
       self.assertTrue(
@@ -500,7 +502,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       for enqueue_op in enqueue_ops:
         enqueue_op.run()
-      string_val, int_val = sess.run(dequeued_t)
+      string_val, int_val = self.evaluate(dequeued_t)
 
       self.assertAllEqual([[b"a", b"", b""], [b"ab", b"", b""],
                            [b"abc", b"", b""], [b"abc", b"d", b""],
@@ -514,7 +516,7 @@ class PaddingFIFOQueueTest(test.TestCase):
           tensor_shape.TensorShape(int_val.shape).is_compatible_with(dequeued_t[
               1].get_shape()))
 
-      string_val, int_val = sess.run(dequeued_single_t)
+      string_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual([b"abc", b"d", b"e", b"f"], string_val)
       self.assertAllEqual([[1, 2, 3, 4]], int_val)
       self.assertTrue(
@@ -622,7 +624,7 @@ class PaddingFIFOQueueTest(test.TestCase):
                                    r"Expected \[2,\?,3\], got \[2,3,4\]"):
         sess.run([enqueue_op],
                  feed_dict={elems_bad: np.array([1] * 24).reshape((2, 3, 4))})
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testParallelEnqueueMany(self):
     with self.cached_session() as sess:
@@ -633,7 +635,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       # Enqueue 100 items in parallel on 10 threads.
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       for thread in threads:
@@ -656,7 +658,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -680,7 +682,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -700,11 +702,11 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def enqueue():
         for _ in xrange(100):
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       def dequeue():
         for _ in xrange(100):
-          self.assertTrue(sess.run(dequeued_t) in (10.0, 20.0))
+          self.assertTrue(self.evaluate(dequeued_t) in (10.0, 20.0))
 
       enqueue_threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       dequeue_threads = [self.checkedThread(target=dequeue) for _ in range(10)]
@@ -736,7 +738,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def dequeue():
         for i in xrange(250):
-          self.assertEqual(i, sess.run(dequeued_t))
+          self.assertEqual(i, self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -767,7 +769,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       dequeuemany_t = q.dequeue_many(count_placeholder)
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -776,7 +778,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       while elements_dequeued < 250:
         # With equal probability, run Dequeue or dequeue_many.
         if random.random() > 0.5:
-          self.assertEqual(elements_dequeued, dequeued_t.eval())
+          self.assertEqual(elements_dequeued, self.evaluate(dequeued_t))
           elements_dequeued += 1
         else:
           count = random.randint(0, min(20, 250 - elements_dequeued))
@@ -805,10 +807,10 @@ class PaddingFIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -832,10 +834,10 @@ class PaddingFIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -882,12 +884,12 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
       close_op.run()
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
 
       # Expect the operation to fail due to the queue being closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                    "is closed and has insufficient"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testBlockingDequeueFromClosedQueue(self):
     with self.cached_session() as sess:
@@ -901,11 +903,11 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def dequeue():
         for elem in elems:
-          self.assertEqual([elem], sess.run(dequeued_t))
+          self.assertEqual([elem], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -926,8 +928,8 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
-        self.assertAllEqual(elems[3:], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
+        self.assertAllEqual(elems[3:], self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -947,7 +949,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -968,11 +970,11 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems, sess.run(dequeued_t))
+        self.assertAllEqual(elems, self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -993,11 +995,11 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1017,16 +1019,16 @@ class PaddingFIFOQueueTest(test.TestCase):
       cleanup_dequeue_t = q.dequeue()
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        self.assertAllEqual(elems[0:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[0:3], self.evaluate(dequeued_t))
         with self.assertRaises(errors_impl.OutOfRangeError):
-          sess.run(dequeued_t)
-        self.assertEqual(elems[3], sess.run(cleanup_dequeue_t))
+          self.evaluate(dequeued_t)
+        self.assertEqual(elems[3], self.evaluate(cleanup_dequeue_t))
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -1059,7 +1061,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def dequeue():
         with self.assertRaises(errors_impl.OutOfRangeError):
-          sess.run([dequeued_a_t, dequeued_b_t])
+          self.evaluate([dequeued_a_t, dequeued_b_t])
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1072,7 +1074,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       # Test that the elements in the partially-dequeued batch are
       # restored in the correct order.
       for elem_a, elem_b in zip(elems_a, elems_b):
-        val_a, val_b = sess.run([cleanup_dequeue_a_t, cleanup_dequeue_b_t])
+        val_a, val_b = self.evaluate([cleanup_dequeue_a_t, cleanup_dequeue_b_t])
         self.assertEqual(elem_a, val_a)
         self.assertEqual(elem_b, val_b)
       self.assertEqual(0, q.size().eval())
@@ -1087,7 +1089,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1107,7 +1109,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1155,7 +1157,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1163,8 +1165,8 @@ class PaddingFIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
-      self.assertEqual([50.0], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
       thread.join()
 
   def testBlockingEnqueueManyToFullQueue(self):
@@ -1178,7 +1180,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1186,10 +1188,10 @@ class PaddingFIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
         time.sleep(0.01)
-      self.assertEqual([50.0], dequeued_t.eval())
-      self.assertEqual([60.0], dequeued_t.eval())
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
+      self.assertEqual([60.0], self.evaluate(dequeued_t))
 
       # Make sure the thread finishes before exiting.
       thread.join()
@@ -1207,7 +1209,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def blocking_enqueue():
         # Expect the operation to succeed once the dequeue op runs.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1217,18 +1219,18 @@ class PaddingFIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
 
       for elem in [20.0, 30.0, 40.0, 50.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
       self.assertEqual(0, q.size().eval())
 
   def testBlockingEnqueueManyBeforeClose(self):
@@ -1242,7 +1244,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1252,17 +1254,17 @@ class PaddingFIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
       for elem in [20.0, 30.0, 50.0, 60.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
 
   def testDoesNotLoseValue(self):
     with self.cached_session():
@@ -1379,20 +1381,21 @@ class PaddingFIFOQueueTest(test.TestCase):
 
   def _blockingDequeue(self, sess, dequeue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_op)
+      self.evaluate(dequeue_op)
 
   def _blockingDequeueMany(self, sess, dequeue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_many_op)
+      self.evaluate(dequeue_many_op)
 
   def _blockingEnqueue(self, sess, enqueue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
 
   def _blockingEnqueueMany(self, sess, enqueue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_many_op)
+      self.evaluate(enqueue_many_op)
 
+  @test_util.run_deprecated_v1
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
       q_empty = data_flow_ops.PaddingFIFOQueue(5, dtypes_lib.float32, ((),))
@@ -1434,7 +1437,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       def blocking_enqueue():
         enq_done.append(False)
         # This will fill the queue and then block until enough dequeues happen.
-        sess.run(enq)
+        self.evaluate(enq)
         enq_done.append(True)
 
       thread = self.checkedThread(target=blocking_enqueue)
@@ -1444,14 +1447,14 @@ class PaddingFIFOQueueTest(test.TestCase):
       results = []
       results.append(deq.eval())  # Will only complete after the enqueue starts.
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       for _ in range(3):
         results.append(deq.eval())
 
       time.sleep(0.1)
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       # This dequeue will unblock the thread.
       results.append(deq.eval())
@@ -1477,7 +1480,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def blocking_dequeue():
         # Will only complete after 4 enqueues complete.
-        results.extend(sess.run(deq))
+        results.extend(self.evaluate(deq))
 
       thread = self.checkedThread(target=blocking_dequeue)
       thread.start()
@@ -1486,7 +1489,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         self.assertEqual(len(results), 0)
-        sess.run(enq)
+        self.evaluate(enq)
 
       # Enough enqueued to unblock the dequeue
       thread.join()
@@ -1517,7 +1520,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       q.enqueue_many(input_tuple).run()
 
       output_tuple_t = q.dequeue_many(32)
-      output_tuple = sess.run(output_tuple_t)
+      output_tuple = self.evaluate(output_tuple_t)
 
       for (input_elem, output_elem) in zip(input_tuple, output_tuple):
         self.assertAllEqual(input_elem, output_elem)
diff --git a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py b/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
index e14894cf56ba4373a7d4fb9a2af7758f77238e57..f87f5170539eab9b3599b271fef9c7cce7cd150f 100644
--- a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
+++ b/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
@@ -29,12 +29,21 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
+def _get_stddev_inside_bounds_before_using_randn(gpu):
+  # The boundary where the randn sampler is used varies between CPU and GPU.
+  if gpu:
+    return 1.3
+  else:
+    return 1.7
+
+
 class TruncatedNormalMoments(object):
   memoized_moments = None
   mean = None
@@ -115,7 +124,7 @@ class ParameterizedTruncatedNormalTest(test.TestCase):
       # Give up early if we are unable to import it.
       import scipy.stats  # pylint: disable=g-import-not-at-top,unused-variable
       random_seed.set_random_seed(seed)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         samples = random_ops.parameterized_truncated_normal(shape, mean, stddev,
                                                             minval,
                                                             maxval).eval()
@@ -139,7 +148,7 @@ class ParameterizedTruncatedNormalTest(test.TestCase):
     try:
       import scipy.stats  # pylint: disable=g-import-not-at-top
       random_seed.set_random_seed(seed)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         samples = random_ops.parameterized_truncated_normal(shape, mean, stddev,
                                                             minval,
                                                             maxval).eval()
@@ -158,42 +167,76 @@ class ParameterizedTruncatedNormalTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot test truncated normal op: %s" % str(e))
 
+  @test_util.run_deprecated_v1
   def testDefaults(self):
     self.validateMoments([10**5], 0.0, 1.0, -2.0, 2.0)
 
+  @test_util.run_deprecated_v1
   def testShifted(self):
     self.validateMoments([10**5], -1.0, 1.0, -2.0, 2.0)
 
+  @test_util.run_deprecated_v1
   def testRightTail(self):
     self.validateMoments([10**5], 0.0, 1.0, 4.0, np.infty)
 
+  @test_util.run_deprecated_v1
   def testLeftTail(self):
     self.validateMoments([10**5], 0.0, 1.0, -np.infty, -4.0)
 
+  @test_util.run_deprecated_v1
   def testLeftTailTwoSidedBounds(self):
     self.validateMoments([10**5], 0.0, 1.0, -6.0, -3.0)
 
+  @test_util.run_deprecated_v1
   def testTwoSidedLeftTailShifted(self):
     self.validateKolmogorovSmirnov([10**5], 6.0, 1.0, -1.0, 1.0)
 
+  @test_util.run_deprecated_v1
   def testRightTailShifted(self):
     self.validateMoments([10**5], -5.0, 1.0, 2.0, np.infty)
 
+  @test_util.run_deprecated_v1
   def testSmallStddev(self):
     self.validateKolmogorovSmirnov([10**5], 0.0, 0.1, 0.05, 0.10)
 
+  @test_util.run_deprecated_v1
   def testSamplingWithSmallStdDevFarFromBound(self):
     sample_op = random_ops.parameterized_truncated_normal(
         shape=(int(1e5),), means=0.8, stddevs=0.05, minvals=-1., maxvals=1.)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       samples = sess.run(sample_op)
       # 0. is more than 16 standard deviations from the mean, and
       # should have a likelihood < 1e-57.
-      # TODO(jjhunt)  Sampler is still numerically unstable in this case,
-      # numbers less than 0 should never observed.
+      assert (~np.isnan(samples)).all()
       no_neg_samples = np.sum(samples < 0.)
-      self.assertLess(no_neg_samples, 2.)
+      self.assertEqual(no_neg_samples, 0.)
+
+  @test_util.run_deprecated_v1
+  def testSamplingAtRandnSwitchover(self):
+    # The randn sampler is used as the bounds are moved farther from the mean,
+    # and the probability of accepting a sample increases the farther the
+    # bounds are from the mean.
+    # This test asserts that at the point of switchover, both samplers are
+    # working (not raising an error or returning nan) and returning the
+    # expected moments.
+    use_gpu = test.is_gpu_available()
+    stddev_inside_bounds_before_using_randn = (
+        _get_stddev_inside_bounds_before_using_randn(use_gpu))
+
+    epsilon = 0.001
+    self.validateMoments(
+        shape=[10**6],
+        mean=0.,
+        stddev=1.0,
+        minval=-epsilon,
+        maxval=stddev_inside_bounds_before_using_randn - epsilon)
+    self.validateMoments(
+        shape=[10**6],
+        mean=0.,
+        stddev=1.0,
+        minval=-epsilon,
+        maxval=stddev_inside_bounds_before_using_randn + epsilon)
 
 
 # Benchmarking code
@@ -222,6 +265,58 @@ def parameterized_vs_naive(shape, num_iters, use_gpu=False):
     return param_dt, naive_dt
 
 
+def randn_sampler_switchover(shape, num_iters, use_gpu=False):
+  # Benchmark by constructing samplers on the threshold of using the randn
+  # rejection sampling and check that this threshold is set correctly by
+  # benchmarking with bounds just above and below this threshold.
+  # The uniform and randn samplers should have about the same performance
+  # at this point.
+
+  stddev_inside_bounds_before_using_randn = (
+      _get_stddev_inside_bounds_before_using_randn(use_gpu))
+
+  epsilon = 0.001
+
+  np.random.seed(1618)  # Make it reproducible.
+
+  # No CSE/CF.
+  optimizer_options = config_pb2.OptimizerOptions(
+      opt_level=config_pb2.OptimizerOptions.L0)
+  config = config_pb2.ConfigProto(
+      graph_options=config_pb2.GraphOptions(
+          optimizer_options=optimizer_options))
+
+  with session.Session(config=config) as sess:
+    with ops.device("/cpu:0" if not use_gpu else "/gpu:0"):
+      uniform_sampler_op = control_flow_ops.group(
+          random_ops.parameterized_truncated_normal(
+              shape,
+              means=0.,
+              stddevs=1.0,
+              minvals=-stddev_inside_bounds_before_using_randn + epsilon,
+              maxvals=0.01))
+      randn_sampler_op = control_flow_ops.group(
+          random_ops.parameterized_truncated_normal(
+              shape,
+              means=0.,
+              stddevs=1.0,
+              minvals=-stddev_inside_bounds_before_using_randn - epsilon,
+              maxvals=0.01))
+
+    # Burn-in to avoid session setup costs in the timing.
+    sess.run(uniform_sampler_op)
+    sess.run(uniform_sampler_op)
+    uniform_dt = timeit.timeit(
+        lambda: sess.run(uniform_sampler_op), number=num_iters)
+
+    sess.run(randn_sampler_op)
+    sess.run(randn_sampler_op)
+    randn_dt = timeit.timeit(
+        lambda: sess.run(randn_sampler_op), number=num_iters)
+
+    return randn_dt, uniform_dt
+
+
 class TruncatedNormalBenchmark(test.Benchmark):
 
   def benchmarkParameterizedOpVsNaiveOpCpu(self):
@@ -249,6 +344,26 @@ class TruncatedNormalBenchmark(test.Benchmark):
       self.report_benchmark(
           name="naive_shape" + shape_str, iters=num_iters, wall_time=n_dt)
 
+  def benchmarkRandnSamplerCPU(self):
+    self._benchmarkRandnSampler(False)
+
+  def benchmarkRandnSamplerGPU(self):
+    self._benchmarkRandnSampler(True)
+
+  def _benchmarkRandnSampler(self, use_gpu):
+    num_iters = 100
+    shape = [int(1e6)]
+    randn_dt, uniform_dt = randn_sampler_switchover(shape, num_iters, use_gpu)
+
+    print(("Randn Sampler vs uniform samplers [%d iters]\t%.4f\t%.4f") %
+          (num_iters, randn_dt, uniform_dt))
+
+    gpu_str = "_gpu" if use_gpu else "_cpu"
+    self.report_benchmark(
+        name="randn_sampler" + gpu_str, iters=num_iters, wall_time=randn_dt)
+    self.report_benchmark(
+        name="uniform_sampler" + gpu_str, iters=num_iters, wall_time=uniform_dt)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/parse_single_example_op_test.py b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
index a84895a287eeb0d67cce563254e2383e390c9e2c..43c8fa4ab5c5c9d71a4ac67fd0e90c34b36b45b4 100644
--- a/tensorflow/python/kernel_tests/parse_single_example_op_test.py
+++ b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
@@ -107,7 +108,7 @@ class ParseExampleTest(test.TestCase):
         for result_dict in [out, out_with_example_name]:
           result = flatten_values_tensors_or_sparse(result_dict.values())
           # Check values.
-          tf_result = sess.run(result)
+          tf_result = self.evaluate(result)
           _compare_output_to_expected(self, result_dict, expected_values,
                                       tf_result)
 
@@ -121,6 +122,7 @@ class ParseExampleTest(test.TestCase):
           self.assertEqual(
               tuple(out[k].dense_shape.get_shape().as_list()), (1,))
 
+  @test_util.run_deprecated_v1
   def testEmptySerializedWithAllDefaults(self):
     sparse_name = "st_a"
     a_name = "a"
@@ -229,6 +231,7 @@ class ParseExampleTest(test.TestCase):
         },
         expected_err=(ValueError, "Missing shape for feature a"))
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparse(self):
     original = [
         example(features=features({
@@ -552,6 +555,7 @@ class ParseExampleTest(test.TestCase):
           }
       }, expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
     original = [
         example(features=features({
@@ -618,6 +622,7 @@ class ParseExampleTest(test.TestCase):
           },
           expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
     original = [
         example(features=features({
@@ -658,6 +663,7 @@ class ParseExampleTest(test.TestCase):
           }
       }, expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingVarLenDense(self):
     aname = "a"
     bname = "b"
@@ -869,6 +875,7 @@ class ParseSingleExampleTest(test.TestCase):
           self.assertEqual(
               tuple(out[k].dense_shape.get_shape().as_list()), (1,))
 
+  @test_util.run_deprecated_v1
   def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
     original = example(features=features({
         "c": float_feature([3, 4]),
diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py
index 71d8b60d3ccf9fafaa16fa705c3261e008d8409c..af76e09f3931004063a5faa2070058ee2e4a0fc5 100644
--- a/tensorflow/python/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/python/kernel_tests/parsing_ops_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
@@ -101,15 +102,15 @@ class ParseExampleTest(test.TestCase):
         out = parsing_ops.parse_example(**kwargs)
         result = flatten_values_tensors_or_sparse(out.values())
         # Check values.
-        tf_result = sess.run(result)
+        tf_result = self.evaluate(result)
         _compare_output_to_expected(self, out, expected_values, tf_result)
 
       # Check shapes; if serialized is a Tensor we need its size to
       # properly check.
       serialized = kwargs["serialized"]
       batch_size = (
-          serialized.eval().size if isinstance(serialized, ops.Tensor) else
-          np.asarray(serialized).size)
+          self.evaluate(serialized).size if isinstance(serialized, ops.Tensor)
+          else np.asarray(serialized).size)
       for k, f in kwargs["features"].items():
         if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
           self.assertEqual(
@@ -121,6 +122,7 @@ class ParseExampleTest(test.TestCase):
           self.assertEqual(
               tuple(out[k].dense_shape.get_shape().as_list()), (2,))
 
+  @test_util.run_deprecated_v1
   def testEmptySerializedWithAllDefaults(self):
     sparse_name = "st_a"
     a_name = "a"
@@ -243,6 +245,7 @@ class ParseExampleTest(test.TestCase):
         },
         expected_err=(ValueError, "Missing shape for feature a"))
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparse(self):
     original = [
         example(features=features({
@@ -571,6 +574,7 @@ class ParseExampleTest(test.TestCase):
         }
     }, expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
     expected_st_a = (  # indices, values, shape
         np.empty((0, 2), dtype=np.int64),  # indices
@@ -631,6 +635,7 @@ class ParseExampleTest(test.TestCase):
         },
         expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
     expected_idx = (  # indices, values, shape
         np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64),
@@ -740,6 +745,7 @@ class ParseExampleTest(test.TestCase):
     for batch_size in (1, 10, 20, 100, 256):
       self._testSerializedContainingVarLenDenseLargerBatch(batch_size)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingVarLenDense(self):
     aname = "a"
     bname = "b"
@@ -962,6 +968,7 @@ class ParseSingleExampleTest(test.TestCase):
           self.assertEqual(
               tuple(out[k].dense_shape.get_shape().as_list()), (1,))
 
+  @test_util.run_deprecated_v1
   def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
     original = example(
         features=features({
@@ -1180,6 +1187,7 @@ class ParseSequenceExampleTest(test.TestCase):
         expected_err=expected_err,
         batch=True)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithSparseAndDenseContext(self):
     original = sequence_example(
         context=features({
@@ -1223,6 +1231,7 @@ class ParseSequenceExampleTest(test.TestCase):
         },
         expected_context_values=expected_context_output)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithMultipleSizeFeatureLists(self):
     original = sequence_example(
         feature_lists=feature_lists({
@@ -1286,6 +1295,7 @@ class ParseSequenceExampleTest(test.TestCase):
         },
         expected_feat_list_values=expected_feature_list_output)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithoutDebugName(self):
     original = sequence_example(
         feature_lists=feature_lists({
@@ -1343,6 +1353,7 @@ class ParseSequenceExampleTest(test.TestCase):
         },
         expected_feat_list_values=expected_feature_list_output)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithSparseAndDenseFeatureLists(self):
     original = sequence_example(
         feature_lists=feature_lists({
@@ -1401,6 +1412,7 @@ class ParseSequenceExampleTest(test.TestCase):
         },
         expected_feat_list_values=expected_feature_list_output)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithEmptyFeatureInFeatureLists(self):
     original = sequence_example(
         feature_lists=feature_lists({
@@ -1541,6 +1553,7 @@ class ParseSequenceExampleTest(test.TestCase):
             " feature_list_dense_missing_assumed_empty or"
             " feature_list_dense_defaults?"))
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleBatch(self):
     first = sequence_example(
         feature_lists=feature_lists({
@@ -1614,7 +1627,7 @@ class DecodeJSONExampleTest(test.TestCase):
           shape=examples.shape,
           dtype=dtypes.string)
       binary_tensor = parsing_ops.decode_json_example(json_tensor)
-      binary_val = sess.run(binary_tensor)
+      binary_val = self.evaluate(binary_tensor)
 
       if examples.shape:
         self.assertShapeEqual(binary_val, json_tensor)
@@ -1695,16 +1708,18 @@ class DecodeJSONExampleTest(test.TestCase):
             })),
     ])
 
+  @test_util.run_deprecated_v1
   def testInvalidSyntax(self):
     with self.cached_session() as sess:
       json_tensor = constant_op.constant(["{]"])
       binary_tensor = parsing_ops.decode_json_example(json_tensor)
       with self.assertRaisesOpError("Error while parsing JSON"):
-        sess.run(binary_tensor)
+        self.evaluate(binary_tensor)
 
 
 class ParseTensorOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testToFloat32(self):
     with self.cached_session():
       expected = np.random.rand(3, 4, 5).astype(np.float32)
@@ -1718,6 +1733,7 @@ class ParseTensorOpTest(test.TestCase):
 
       self.assertAllEqual(expected, result)
 
+  @test_util.run_deprecated_v1
   def testToUint8(self):
     with self.cached_session():
       expected = np.random.rand(3, 4, 5).astype(np.uint8)
@@ -1731,6 +1747,7 @@ class ParseTensorOpTest(test.TestCase):
 
       self.assertAllEqual(expected, result)
 
+  @test_util.run_deprecated_v1
   def testTypeMismatch(self):
     with self.cached_session():
       expected = np.random.rand(3, 4, 5).astype(np.uint8)
@@ -1744,6 +1761,7 @@ class ParseTensorOpTest(test.TestCase):
           r"\(uint16\)"):
         tensor.eval(feed_dict={serialized: tensor_proto.SerializeToString()})
 
+  @test_util.run_deprecated_v1
   def testInvalidInput(self):
     with self.cached_session():
       serialized = array_ops.placeholder(dtypes.string)
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index b34d30f5c0493359b622e8a846c752f495f84212..da79b4ecfc0a3972f610c1ed39cdd0201716bee4 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -26,6 +26,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import partitioned_variables
@@ -322,17 +323,19 @@ class PartitionedVariablesTestCase(test.TestCase):
     for i in xrange(len(expected_specs)):
       self.assertEquals(expected_specs[i], slices[i]._save_slice_info.spec)
 
+  @test_util.run_deprecated_v1
   def testVecConstantInit(self):
     with self.cached_session():
       rnd_par = constant_op.constant([1, 2, 3, 4])
       vs = partitioned_variables.create_partitioned_variables([4], [4], rnd_par)
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 0).eval()
-      rnd = rnd_par.eval()
+      rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 4, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, ["4 0,1", "4 1,1", "4 2,1", "4 3,1"])
 
+  @test_util.run_deprecated_v1
   def testConstantInit(self):
     with self.cached_session():
       rnd_par = constant_op.constant([[1, 2, 3, 4], [5, 6, 7, 8]])
@@ -340,7 +343,7 @@ class PartitionedVariablesTestCase(test.TestCase):
                                                               rnd_par)
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 1).eval()
-      rnd = rnd_par.eval()
+      rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 2, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, ["2 4 0,2:0,2", "2 4 0,2:2,2"])
@@ -401,12 +404,15 @@ class PartitionedVariablesTestCase(test.TestCase):
       self.assertEqual(var2_name + "/part_0:0", vs2[0].name)
       self.assertEqual(var2_name + "/part_1:0", vs2[1].name)
 
+  @test_util.run_deprecated_v1
   def testName(self):
     self._testNameHelper(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceName(self):
     self._testNameHelper(use_resource=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testRandomInitValue(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([200, 40]))
@@ -414,7 +420,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           rnd.get_shape(), [1, 10], rnd.initialized_value())
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 1).eval()
-      rnd = rnd.eval()
+      rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.float32] * 10, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, [
@@ -424,6 +430,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           "200 40 0,200:36,4"
       ])
 
+  @test_util.run_v1_only("b/120545219")
   def testRandomInitUnevenPartitions(self):
     with self.cached_session():
       rnd = variables.Variable(
@@ -434,7 +441,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           for i in xrange(1, 10)
       ]
       variables.global_variables_initializer().run()
-      rnd_val = rnd.eval()
+      rnd_val = self.evaluate(rnd)
       # Only check the slice save specs for the first 5 tf.
       save_specs = [
           # One slice
@@ -462,6 +469,7 @@ class PartitionedVariablesTestCase(test.TestCase):
         if i < len(save_specs):
           self._TestSaveSpec(vs, save_specs[i])
 
+  @test_util.run_v1_only("b/120545219")
   def testDegenerate(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([10, 43]))
@@ -469,10 +477,11 @@ class PartitionedVariablesTestCase(test.TestCase):
           rnd.get_shape(), [1, 1], rnd.initialized_value())
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 0).eval()
-      rnd = rnd.eval()
+      rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, ["10 43 0,10:0,43"])
 
+  @test_util.run_v1_only("b/120545219")
   def testSliceSizeOne(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([10, 43]))
@@ -480,7 +489,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           rnd.get_shape(), [10, 1], rnd.initialized_value())
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 0).eval()
-      rnd = rnd.eval()
+      rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, [
           "10 43 0,1:0,43", "10 43 1,1:0,43", "10 43 2,1:0,43",
@@ -488,6 +497,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           "10 43 6,1:0,43", "10 43 7,1:0,43", "10 43 8,1:0,43", "10 43 9,1:0,43"
       ])
 
+  @test_util.run_deprecated_v1
   def testIotaInitializer(self):
     self.assertAllClose([0., 1., 2., 3.], _IotaInitializer([4]))
     self.assertAllClose([[0., 1.], [0., 10.], [0., 100.], [0., 1000.]],
@@ -503,6 +513,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       self.assertAllClose(slice0 + slice1 + slice2, val)
       self._TestSaveSpec(vs, ["13 5 0,5:0,5", "13 5 5,4:0,5", "13 5 9,4:0,5"])
 
+  @test_util.run_deprecated_v1
   def testRandomInitializer(self):
     # Sanity check that the slices uses a different seed when using a random
     # initializer function.
@@ -510,7 +521,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer())
       variables.global_variables_initializer().run()
-      val0, val1 = var0.eval().flatten(), var1.eval().flatten()
+      val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertTrue(np.linalg.norm(val0 - val1) > 1e-6)
     # Negative test that proves that slices have the same values if
     # the random initializer uses a seed.
@@ -518,7 +529,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer(seed=201))
       variables.global_variables_initializer().run()
-      val0, val1 = var0.eval().flatten(), var1.eval().flatten()
+      val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertAllClose(val0, val1)
 
   def testSomeErrors(self):
@@ -546,6 +557,7 @@ class PartitionedVariablesTestCase(test.TestCase):
         partitioned_variables.create_partitioned_variables(
             [10, 43], [1, 50], rnd.initialized_value())
 
+  @test_util.run_deprecated_v1
   def testControlDepsNone(self):
     with self.cached_session() as session:
       c = constant_op.constant(1.0)
@@ -572,6 +584,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       for op in reading_ops:
         self.assertEqual([], op.control_inputs)
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     with self.cached_session() as session:
       var_x = variable_scope.get_variable(
@@ -600,7 +613,7 @@ class PartitionedVariablesTestCase(test.TestCase):
   def testMetaGraphSaveLoad(self):
     save_prefix = os.path.join(self.get_temp_dir(), "ckpt")
     save_graph = ops.Graph()
-    with save_graph.as_default(), self.test_session(
+    with save_graph.as_default(), self.session(
         graph=save_graph) as session:
       partitioner = partitioned_variables.fixed_size_partitioner(5, axis=0)
       with variable_scope.variable_scope("root", partitioner=partitioner):
@@ -620,7 +633,7 @@ class PartitionedVariablesTestCase(test.TestCase):
             save_graph.get_tensor_by_name(v0.name + ":0"))
 
     restore_graph = ops.Graph()
-    with restore_graph.as_default(), self.test_session(
+    with restore_graph.as_default(), self.session(
         graph=restore_graph) as session:
       saver = saver_lib.import_meta_graph(save_path + ".meta")
       saver.restore(sess=session, save_path=save_path)
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 6ede654aadc7d0d78bc18f13c2d4b3d47fef0402..78e786f01ca9c167b5b175fcd833a83281c078de 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -151,10 +152,10 @@ class PoolingTest(test.TestCase):
         np.prod(input_shape), dtype=np.float32).reshape(input_shape) - 1
     y1 = pool_direct(input=x, **kwargs)
     y2 = nn_ops.pool(input=x, **kwargs)
-    self.assertAllClose(y1, y2.eval(), rtol=1e-2, atol=1e-2)
+    self.assertAllClose(y1, self.evaluate(y2), rtol=1e-2, atol=1e-2)
 
   def testPoolSimple(self):
-    with self.test_session(use_gpu=test.is_gpu_available()):
+    with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           self._test(
@@ -166,7 +167,7 @@ class PoolingTest(test.TestCase):
               strides=[1, 2])
 
   def testPool1D(self):
-    with self.test_session(use_gpu=test.is_gpu_available()):
+    with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           for input_shape in [[2, 9, 2], [2, 10, 2]]:
@@ -192,7 +193,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testPool2D(self):
-    with self.test_session(use_gpu=test.is_gpu_available()):
+    with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           for input_shape in [[2, 9, 10, 2], [2, 10, 9, 2]]:
@@ -218,7 +219,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testPool3D(self):
-    with self.test_session(use_gpu=test.is_gpu_available()):
+    with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           for input_shape in [[2, 9, 10, 11, 2], [2, 10, 9, 11, 2]]:
@@ -247,7 +248,7 @@ class PoolingTest(test.TestCase):
   def testPoolNC(self):
     if test.is_gpu_available(cuda_only=True):
       # "NC*" format is currently only supported on CUDA.
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         for padding in ["SAME", "VALID"]:
           self._test(
               input_shape=[2, 2, 9],
@@ -301,8 +302,9 @@ class PoolingTest(test.TestCase):
     err_tolerance = 1e-2
     self.assertLess(err, err_tolerance)
 
+  @test_util.run_deprecated_v1
   def testGradient1D(self):
-    with self.test_session(use_gpu=test.is_gpu_available()):
+    with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["AVG", "MAX"]:
           for input_shape in [[2, 5, 2], [1, 4, 1]]:
@@ -327,8 +329,9 @@ class PoolingTest(test.TestCase):
                     dilation_rate=[1],
                     strides=strides)
 
+  @test_util.run_deprecated_v1
   def testGradient2D(self):
-    with self.test_session(use_gpu=test.is_gpu_available()):
+    with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["AVG", "MAX"]:
           for input_shape in [[2, 4, 5, 2], [1, 5, 4, 1]]:
@@ -353,8 +356,9 @@ class PoolingTest(test.TestCase):
                     dilation_rate=[1, 1],
                     strides=strides)
 
+  @test_util.run_deprecated_v1
   def testGradient3D(self):
-    with self.test_session(use_gpu=test.is_gpu_available()):
+    with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["AVG", "MAX"]:
           for input_shape in [[1, 3, 5, 4, 1], [1, 5, 4, 3, 1]]:
diff --git a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
index b01fc129538b8f54adcdf4b38ac8cc095e3901f4..347e092dee3b964b3abba5fae2a46c80d80f79bf 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
@@ -65,7 +65,7 @@ class PoolingTest(test.TestCase):
     # Initializes the input tensor with array containing incrementing
     # numbers from 1.
     x = [f * 1.0 for f in range(1, total_size + 1)]
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       t = constant_op.constant(x, shape=input_sizes)
       window = [1] + list(window) + [1]
       strides = [1] + list(strides) + [1]
@@ -81,7 +81,7 @@ class PoolingTest(test.TestCase):
           data_format=data_format)
       if data_format == "NCDHW":
         t = test_util.NCHWToNHWC(t)
-      vals = sess.run(t)
+      vals = self.evaluate(t)
     # Verifies values.
     actual = vals.flatten()
     self.assertAllClose(expected, actual)
@@ -233,7 +233,7 @@ class PoolingTest(test.TestCase):
     # Initializes the input tensor with array containing incrementing
     # numbers from 1.
     x = np.arange(1, total_size + 1, dtype=np.float32)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       input_tensor = constant_op.constant(x, shape=input_sizes, name="input")
       err_g_margin = 1e-3
       err_gg_margin = 1.5e-2
@@ -253,6 +253,7 @@ class PoolingTest(test.TestCase):
         ksize = test_util.NHWCToNCHW(ksize)
         strides = test_util.NHWCToNCHW(strides)
         t = test_util.NHWCToNCHW(t)
+        output_sizes = test_util.NHWCToNCHW(output_sizes)
 
       t = pool_func(
           t,
@@ -294,6 +295,7 @@ class PoolingTest(test.TestCase):
                                               use_gpu=use_gpu,
                                               **kwargs)
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding1_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -303,6 +305,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding2_1_6_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -312,6 +315,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding2_1_7_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -321,6 +325,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding1_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -330,6 +335,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding2_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -339,6 +345,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding1_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -348,6 +355,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding1_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -357,6 +365,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding2_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -366,6 +375,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding2_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -375,6 +385,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding3_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -384,6 +395,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradValidPadding1_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -393,6 +405,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradValidPadding1_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -402,6 +415,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradValidPadding2_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -411,6 +425,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradValidPadding2_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -420,6 +435,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding1_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -429,6 +445,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding1_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -438,6 +455,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding2_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -447,6 +465,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding2_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -456,6 +475,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding3_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index e95c72971521452a239b78ff4ab9c25c3089f1da..c33b59bb99b716b7164c82f6e640a8a3f4680351 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -129,7 +129,7 @@ class PoolingTest(test.TestCase):
     # Initializes the input tensor with array containing incrementing
     # numbers from 1, wrapping round to -127 after 127 to support int8.
     x = [((f + 128) % 255) - 127 for f in range(total_size)]
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       t = constant_op.constant(x, shape=input_sizes, dtype=data_type)
       if data_format in ("NCHW", "NCHW_VECT_C"):
         if data_format == "NCHW_VECT_C":
@@ -166,7 +166,7 @@ class PoolingTest(test.TestCase):
             strides_placeholder: strides
         })
       else:
-        actual = t.eval()
+        actual = self.evaluate(t)
         self.assertShapeEqual(actual, t)
       self.assertAllCloseAccordingToType(expected, actual.flatten())
 
@@ -384,6 +384,7 @@ class PoolingTest(test.TestCase):
         expected=[],
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testAvgPooling(self):
     for use_gpu in True, False:
       self._testAvgPoolValidPadding(use_gpu)
@@ -577,6 +578,7 @@ class PoolingTest(test.TestCase):
         expected=[],
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testMaxPooling(self):
     for use_gpu in True, False:
       self._testMaxPoolValidPadding(use_gpu)
@@ -588,6 +590,7 @@ class PoolingTest(test.TestCase):
       self._testMaxPoolEmptyInput(use_gpu)
 
   # Tests for DepthwiseMaxPooling on CPU only.
+  @test_util.run_deprecated_v1
   def testDepthwiseMaxPool1x1DepthWindow1(self):
     # input is:
     # [1.0, ..., 10.0] along depth,
@@ -613,6 +616,7 @@ class PoolingTest(test.TestCase):
           use_gpu=False,
           v2=v2)
 
+  @test_util.run_deprecated_v1
   def testDepthwiseMaxPool2x2DepthWindow3(self):
     # input is:
     #
@@ -639,6 +643,7 @@ class PoolingTest(test.TestCase):
           use_gpu=False,
           v2=v2)
 
+  @test_util.run_deprecated_v1
   def testKernelSmallerThanStrideValid(self):
     for use_gpu in [True, False]:
       self._VerifyValues(
@@ -670,6 +675,7 @@ class PoolingTest(test.TestCase):
           expected=[5, 8, 26, 29],
           use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testKernelSmallerThanStrideSame(self):
     for use_gpu in [True, False]:
       for pool_func in [nn_ops.max_pool, nn_ops.avg_pool]:
@@ -718,7 +724,7 @@ class PoolingTest(test.TestCase):
                                          strides,
                                          error_msg,
                                          use_gpu=False):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       t = constant_op.constant(1.0, shape=in_size)
       with self.assertRaisesRegexp(errors_impl.UnimplementedError, error_msg):
         t = nn_ops.max_pool(
@@ -734,7 +740,7 @@ class PoolingTest(test.TestCase):
     self._testDepthwiseMaxPoolInvalidConfig([1, 2, 2, 4], [1, 1, 1, 3],
                                             [1, 1, 1, 3], "evenly divide")
     if test.is_gpu_available():
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         t = variables.Variable(np.ones([1, 2, 2, 4]))
         variables.global_variables_initializer().run()
         with self.assertRaisesOpError("for CPU devices"):
@@ -747,14 +753,14 @@ class PoolingTest(test.TestCase):
   def _CompareMaxPoolingFwd(self, input_shape, ksize, strides, padding):
     for dtype in np.float64, np.float32, np.float16:
       tensor_input = np.random.rand(*input_shape).astype(dtype)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op, _ = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
-        gpu_val = out_op.eval()
-      with self.test_session(use_gpu=False):
+        gpu_val = self.evaluate(out_op)
+      with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
-        cpu_val = out_op.eval()
+        cpu_val = self.evaluate(out_op)
       self.assertAllCloseAccordingToType(cpu_val, gpu_val)
 
   def _CompareMaxPoolingBk(self, input_shape, output_shape, ksize, strides,
@@ -764,23 +770,23 @@ class PoolingTest(test.TestCase):
       # in the input.
       tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype)
       tensor_output = np.random.rand(*output_shape).astype(dtype)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
-        argmax = argmax_op.eval()
+        argmax = self.evaluate(argmax_op)
         grad_in = constant_op.constant(tensor_output, shape=output_shape)
         out_op = gen_nn_ops.max_pool_grad_with_argmax(t, grad_in, argmax, ksize,
                                                       strides, padding)
-        gpu_val = out_op.eval()
+        gpu_val = self.evaluate(out_op)
         self.assertShapeEqual(gpu_val, out_op)
-      with self.test_session(use_gpu=False):
+      with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
-        orig_out = out_op.eval()
+        orig_out = self.evaluate(out_op)
         grad_in = constant_op.constant(tensor_output, shape=output_shape)
         out_op = gen_nn_ops.max_pool_grad(t, orig_out, grad_in, ksize, strides,
                                           padding)
-        cpu_val = out_op.eval()
+        cpu_val = self.evaluate(out_op)
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
       # accurate than the GPU version that does the accumulation on fp32
@@ -793,23 +799,23 @@ class PoolingTest(test.TestCase):
       # Generate numbers in a narrow range, so that there are many duplicates
       # in the input.
       tensor_input = np.random.random_integers(0, 3, input_shape).astype(dtype)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
-        argmax = argmax_op.eval()
+        argmax = self.evaluate(argmax_op)
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
         out_op = gen_nn_ops.max_pool_grad_grad_with_argmax(
             t, grad_in, argmax, ksize, strides, padding)
-        gpu_val = out_op.eval()
+        gpu_val = self.evaluate(out_op)
         self.assertShapeEqual(gpu_val, out_op)
-      with self.test_session(use_gpu=False):
+      with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
-        orig_out = out_op.eval()
+        orig_out = self.evaluate(out_op)
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
         out_op = gen_nn_ops.max_pool_grad_grad(t, orig_out, grad_in, ksize,
                                                strides, padding)
-        cpu_val = out_op.eval()
+        cpu_val = self.evaluate(out_op)
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
       # accurate than the GPU version that does the accumulation on fp32
@@ -818,7 +824,7 @@ class PoolingTest(test.TestCase):
 
   def testMaxPoolingWithArgmax(self):
     tensor_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
       out_op, argmax_op = nn_ops.max_pool_with_argmax(
           t,
@@ -826,7 +832,7 @@ class PoolingTest(test.TestCase):
           strides=[1, 1, 1, 1],
           Targmax=dtypes.int64,
           padding="VALID")
-      out, argmax = sess.run([out_op, argmax_op])
+      out, argmax = self.evaluate([out_op, argmax_op])
       self.assertShapeEqual(out, out_op)
       self.assertShapeEqual(argmax, argmax_op)
       self.assertAllClose(out.ravel(), [1.0, 1.0, 1.0, 1.0])
@@ -836,7 +842,7 @@ class PoolingTest(test.TestCase):
     orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     tensor_input = [11.0, 12.0, 13.0, 14.0]
     tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       orig_in = constant_op.constant(orig_input, shape=[1, 3, 3, 1])
       t = constant_op.constant(tensor_input, shape=[1, 2, 2, 1])
       argmax = constant_op.constant(
@@ -848,7 +854,7 @@ class PoolingTest(test.TestCase):
           ksize=[1, 2, 2, 1],
           strides=[1, 1, 1, 1],
           padding="VALID")
-      out = out_op.eval().flatten()
+      out = self.evaluate(out_op).flatten()
       self.assertAllClose(out,
                           [11.0, 12.0, 0.0, 13.0, 0.0, 14.0, 0.0, 0.0, 0.0])
 
@@ -859,7 +865,7 @@ class PoolingTest(test.TestCase):
     orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     tensor_input = [11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0]
     tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       orig_in = constant_op.constant(orig_input, shape=[1, 3, 3, 1])
       t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
       argmax = constant_op.constant(
@@ -871,7 +877,7 @@ class PoolingTest(test.TestCase):
           ksize=[1, 2, 2, 1],
           strides=[1, 1, 1, 1],
           padding="VALID")
-      out = out_op.eval().flatten()
+      out = self.evaluate(out_op).flatten()
       self.assertAllClose(out, [11.0, 12.0, 14.0, 16.0])
 
   def _ConstructAndTestGradient(self,
@@ -910,7 +916,7 @@ class PoolingTest(test.TestCase):
     # Initializes the input tensor with array containing incrementing
     # numbers from 1.
     x = [f * 1.0 for f in range(1, total_size + 1)]
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       input_tensor = constant_op.constant(x, shape=input_sizes, name="input")
       if pool_func == nn_ops.avg_pool:
         func_name = "avg_pool"
@@ -986,7 +992,7 @@ class PoolingTest(test.TestCase):
     # Initializes the input tensor with array containing incrementing
     # numbers from 1.
     x = [f * 1.0 for f in range(1, total_size + 1)]
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       input_tensor = constant_op.constant(x, shape=input_sizes, name="input")
       if pool_func == nn_ops.avg_pool:
         func_name = "avg_pool"
@@ -1167,6 +1173,7 @@ class PoolingTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testMaxPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1208,7 +1215,7 @@ class PoolingTest(test.TestCase):
                              window_rows, window_cols, row_stride, col_stride,
                              padding, use_gpu, v2):
     pool_func = gen_nn_ops.max_pool_v2 if v2 else nn_ops.max_pool
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       input_tensor = variables.Variable(
           np.array(input_data, dtype=np.float32).reshape(input_sizes))
       variables.global_variables_initializer().run()
@@ -1221,12 +1228,12 @@ class PoolingTest(test.TestCase):
           input_tensor, output_tensor, output_backprop_tensor, window_rows,
           window_cols, row_stride, col_stride, padding, v2)
 
-      actual_input_backprop = input_backprop_tensor.eval()
+      actual_input_backprop = self.evaluate(input_backprop_tensor)
       self.assertShapeEqual(actual_input_backprop, input_backprop_tensor)
       actual_input_backprop = actual_input_backprop.flatten()
       actual_input_backprop = self._GetNdArray(actual_input_backprop)
 
-      actual_output = output_tensor.eval().flatten()
+      actual_output = self.evaluate(output_tensor).flatten()
       actual_output = self._GetNdArray(actual_output)
 
       self.assertAllClose(
@@ -1497,6 +1504,7 @@ class PoolingTest(test.TestCase):
     else:
       del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradDirect(self):
     self._testMaxPoolGradDirect1_1()
     self._testMaxPoolGradDirect1_2()
@@ -1616,6 +1624,7 @@ class PoolingTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testMaxPoolGradGradValidPadding1_1(data_format, use_gpu)
@@ -1649,6 +1658,7 @@ class PoolingTest(test.TestCase):
         orig_input, orig_output, grad, [1, window_rows, window_cols, 1],
         [1, row_stride, col_stride, 1], padding)
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testAvgPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1778,6 +1788,7 @@ class PoolingTest(test.TestCase):
         data_format=data_format,
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # All shapes unknown.
     for pool_func in [nn_ops.max_pool, nn_ops.avg_pool]:
@@ -1806,8 +1817,9 @@ class PoolingTest(test.TestCase):
             strides=[1, 1, 1, 1],
             padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testOpEdgeCases(self):
-    with self.test_session(use_gpu=test.is_gpu_available()) as sess:
+    with self.session(use_gpu=test.is_gpu_available()) as sess:
       pool_funcs = [nn_ops.max_pool, nn_ops.avg_pool]
       if test.is_gpu_available():
         pool_funcs.append(nn_ops.max_pool_with_argmax)
diff --git a/tensorflow/python/kernel_tests/priority_queue_test.py b/tensorflow/python/kernel_tests/priority_queue_test.py
index 73a9c81638259486f28a37755db86e4fe055f738..49ec7ee4836d40719971822aff9e063b7235dc8b 100644
--- a/tensorflow/python/kernel_tests/priority_queue_test.py
+++ b/tensorflow/python/kernel_tests/priority_queue_test.py
@@ -27,6 +27,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -35,6 +36,7 @@ from tensorflow.python.platform import test
 
 class PriorityQueueTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertReadOnceSorts(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(2000, (dtypes.string, dtypes.string), (
@@ -50,7 +52,7 @@ class PriorityQueueTest(test.TestCase):
         enq.run()
 
       deq = q.dequeue_many(100)
-      deq_elem, deq_value_0, deq_value_1 = sess.run(deq)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(deq)
 
       allowed = {}
       missed = set()
@@ -81,7 +83,7 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       dequeue_op = q.dequeue_many(100)
 
@@ -93,7 +95,7 @@ class PriorityQueueTest(test.TestCase):
       for t in enqueue_threads:
         t.start()
 
-      deq_elem, deq_value_0, deq_value_1 = sess.run(dequeue_op)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(dequeue_op)
 
       for t in enqueue_threads:
         t.join()
@@ -112,6 +114,7 @@ class PriorityQueueTest(test.TestCase):
         missed.remove((dv0, dv1))
       self.assertEqual(missed, set())
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripFillsCapacityMultiThreadedEnqueueAndDequeue(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(10, (dtypes.int64), (()))
@@ -132,12 +135,12 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       dequeued = []
 
       def dequeue(dequeue_op):
-        (dequeue_indices, dequeue_values) = sess.run(dequeue_op)
+        (dequeue_indices, dequeue_values) = self.evaluate(dequeue_op)
         self.assertAllEqual(dequeue_indices, dequeue_values)
         dequeued.extend(dequeue_indices)
 
@@ -184,10 +187,10 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue(dequeue_op, dequeued):
-        (dequeue_indices, dequeue_values) = sess.run(dequeue_op)
+        (dequeue_indices, dequeue_values) = self.evaluate(dequeue_op)
         self.assertAllEqual(dequeue_indices, dequeue_values)
         dequeue_wait.acquire()
         dequeued.extend(dequeue_indices)
@@ -215,7 +218,7 @@ class PriorityQueueTest(test.TestCase):
 
       # We can't guarantee full sorting because we can't guarantee
       # that the dequeued.extend() call runs immediately after the
-      # sess.run() call.  Here we're just happy everything came out.
+      # self.evaluate() call.  Here we're just happy everything came out.
       self.assertAllEqual(set(dequeued), set(all_enqueued_values))
 
   def testRoundTripInsertManyMultiThreadedReadOnceSorts(self):
@@ -236,7 +239,7 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       dequeue_op = q.dequeue_many(100)
 
@@ -248,7 +251,7 @@ class PriorityQueueTest(test.TestCase):
       for t in enqueue_threads:
         t.start()
 
-      deq_elem, deq_value_0, deq_value_1 = sess.run(dequeue_op)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(dequeue_op)
 
       for t in enqueue_threads:
         t.join()
@@ -267,6 +270,7 @@ class PriorityQueueTest(test.TestCase):
         missed.remove((dv0, dv1))
       self.assertEqual(missed, set())
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadOnceSorts(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(2000, (dtypes.string, dtypes.string), (
@@ -276,7 +280,7 @@ class PriorityQueueTest(test.TestCase):
       side_value_1 = np.random.rand(1000).astype(bytes)
       q.enqueue_many((elem, side_value_0, side_value_1)).run()
       deq = q.dequeue_many(1000)
-      deq_elem, deq_value_0, deq_value_1 = sess.run(deq)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(deq)
 
       allowed = {}
       for e, v0, v1 in zip(elem, side_value_0, side_value_1):
@@ -288,6 +292,7 @@ class PriorityQueueTest(test.TestCase):
       for e, dv0, dv1 in zip(deq_elem, deq_value_0, deq_value_1):
         self.assertTrue((dv0, dv1) in allowed[e])
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadManySorts(self):
     with self.cached_session():
       q = data_flow_ops.PriorityQueue(2000, (dtypes.int64), (()))
@@ -296,6 +301,7 @@ class PriorityQueueTest(test.TestCase):
       deq_values = np.hstack((q.dequeue_many(100)[0].eval() for _ in range(10)))
       self.assertAllEqual(deq_values, sorted(elem))
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadOnceLotsSorts(self):
     with self.cached_session():
       q = data_flow_ops.PriorityQueue(2000, (dtypes.int64), (()))
@@ -311,6 +317,7 @@ class PriorityQueueTest(test.TestCase):
       with self.assertRaises(TypeError):
         q.enqueue_many((["a", "b", "c"], ["a", "b", "c"])).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testInsertingNonScalarFails(self):
     with self.cached_session() as sess:
       input_priority = array_ops.placeholder(dtypes.int64)
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 5f5e24bd631fbd66265b953be9bafd769e282892..482633d539dfb0d1b0737846ba44ff3e0826ad43 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -102,6 +102,7 @@ class PyFuncTest(test.TestCase):
           script_ops.eager_py_func(np_func, [x, y], [dtypes.float32]))
       self.assertEqual(z[0], np_func(1.0, 2.0).astype(np.float32))
 
+  @test_util.run_v1_only("b/120545219")
   def testArray(self):
     with self.cached_session():
       x = constant_op.constant([1.0, 2.0], dtypes.float64)
@@ -168,6 +169,7 @@ class PyFuncTest(test.TestCase):
                              (dtypes.float64, dtypes.float64)))
       self.assertAllClose(y, [0.0, 1.0])
 
+  @test_util.run_v1_only("b/120545219")
   def testStrings(self):
 
     def read_fixed_length_numpy_strings():
@@ -185,6 +187,7 @@ class PyFuncTest(test.TestCase):
           script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
       self.assertAllEqual(z, [b"hello there", b"hi there"])
 
+  @test_util.run_v1_only("b/120545219")
   def testStringsAreConvertedToBytes(self):
 
     def read_fixed_length_numpy_strings():
@@ -202,6 +205,7 @@ class PyFuncTest(test.TestCase):
           script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
       self.assertAllEqual(z, [b"hello there", b"hi there"])
 
+  @test_util.run_v1_only("b/120545219")
   def testObjectArraysAreConvertedToBytes(self):
 
     def read_object_array():
@@ -217,12 +221,14 @@ class PyFuncTest(test.TestCase):
       z, = script_ops.py_func(read_and_return_strings, [x, y], [dtypes.string])
       self.assertListEqual(list(z.eval()), [b"hello there", b"hi ya"])
 
+  @test_util.run_v1_only("b/120545219")
   def testStringPadding(self):
     correct = [b"this", b"is", b"a", b"test"]
     with self.cached_session():
       s, = script_ops.py_func(lambda: [correct], [], [dtypes.string])
       self.assertAllEqual(s.eval(), correct)
 
+  @test_util.run_v1_only("b/120545219")
   def testStringPaddingAreConvertedToBytes(self):
     inp = ["this", "is", "a", "test"]
     correct = [b"this", b"is", b"a", b"test"]
@@ -230,6 +236,7 @@ class PyFuncTest(test.TestCase):
       s, = script_ops.py_func(lambda: [inp], [], [dtypes.string])
       self.assertAllEqual(s.eval(), correct)
 
+  @test_util.run_v1_only("b/120545219")
   def testLarge(self):
     with self.cached_session() as sess:
       x = array_ops.zeros([1000000], dtype=np.float32)
@@ -243,6 +250,7 @@ class PyFuncTest(test.TestCase):
       x = self.evaluate(script_ops.py_func(lambda: 42.0, [], dtypes.float64))
       self.assertAllClose(x, 42.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testAlias(self):
     with self.cached_session():
       np_array = np.array([1.0, 2.0], dtype=np.float32)
@@ -251,6 +259,7 @@ class PyFuncTest(test.TestCase):
       value.op.run()
       self.assertAllEqual(np_array, [1.0, 2.0])
 
+  @test_util.run_v1_only("b/120545219")
   def testReturnUnicodeString(self):
     with self.cached_session():
       correct = u"你好 世界"
@@ -261,6 +270,7 @@ class PyFuncTest(test.TestCase):
       z, = script_ops.py_func(unicode_string, [], [dtypes.string])
       self.assertEqual(z.eval(), correct.encode("utf8"))
 
+  @test_util.run_v1_only("b/120545219")
   def testBadNumpyReturnType(self):
     with self.cached_session():
 
@@ -272,8 +282,9 @@ class PyFuncTest(test.TestCase):
 
       with self.assertRaisesRegexp(errors.UnimplementedError,
                                    "Unsupported numpy type"):
-        y.eval()
+        self.evaluate(y)
 
+  @test_util.run_v1_only("b/120545219")
   def testBadReturnType(self):
     with self.cached_session():
 
@@ -285,8 +296,9 @@ class PyFuncTest(test.TestCase):
 
       with self.assertRaisesRegexp(errors.UnimplementedError,
                                    "Unsupported object type"):
-        z.eval()
+        self.evaluate(z)
 
+  @test_util.run_v1_only("b/120545219")
   def testReturnInput(self):
     with self.cached_session():
 
@@ -307,9 +319,9 @@ class PyFuncTest(test.TestCase):
     with session_lib.Session() as sess:
       producer = iter(range(3))
       x, = script_ops.py_func(lambda: next(producer), [], [dtypes.int64])
-      self.assertEqual(sess.run(x), 0)
-      self.assertEqual(sess.run(x), 1)
-      self.assertEqual(sess.run(x), 2)
+      self.assertEqual(self.evaluate(x), 0)
+      self.assertEqual(self.evaluate(x), 1)
+      self.assertEqual(self.evaluate(x), 2)
 
   def testStateless(self):
     # Not using self.cached_session(), which disables optimization.
@@ -317,10 +329,11 @@ class PyFuncTest(test.TestCase):
       producer = iter(range(3))
       x, = script_ops.py_func(
           lambda: next(producer), [], [dtypes.int64], stateful=False)
-      self.assertEqual(sess.run(x), 0)
-      self.assertEqual(sess.run(x), 0)
-      self.assertEqual(sess.run(x), 0)
+      self.assertEqual(self.evaluate(x), 0)
+      self.assertEqual(self.evaluate(x), 0)
+      self.assertEqual(self.evaluate(x), 0)
 
+  @test_util.run_v1_only("b/120545219")
   def testGradientFunction(self):
     # Input to tf.py_func is necessary, otherwise get_gradient_function()
     # returns None per default.
@@ -330,13 +343,15 @@ class PyFuncTest(test.TestCase):
     self.assertEqual(None, ops.get_gradient_function(x.op))
     self.assertEqual(None, ops.get_gradient_function(y.op))
 
+  @test_util.run_v1_only("b/120545219")
   def testCOrder(self):
     with self.cached_session():
       val = [[1, 2], [3, 4]]
       x, = script_ops.py_func(lambda: np.array(val, order="F"), [],
                               [dtypes.int64])
-      self.assertAllEqual(val, x.eval())
+      self.assertAllEqual(val, self.evaluate(x))
 
+  @test_util.run_v1_only("b/120545219")
   def testParallel(self):
     # Tests that tf.py_func's can run in parallel if they release the GIL.
     with self.cached_session() as session:
@@ -382,6 +397,7 @@ class PyFuncTest(test.TestCase):
       self.assertIsNone(ret)
       self.assertAllEqual([3], s.value)
 
+  @test_util.run_v1_only("b/120545219")
   def testNoReturnValueStateless(self):
 
     def do_nothing(unused_x):
@@ -390,7 +406,7 @@ class PyFuncTest(test.TestCase):
     f = script_ops.py_func(
         do_nothing, [constant_op.constant(3, dtypes.int64)], [], stateful=False)
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(f), [])
+      self.assertEqual(self.evaluate(f), [])
 
   def _testExceptionHandling(self, py_exp, tf_exp, eager=False):
 
@@ -420,6 +436,7 @@ class PyFuncTest(test.TestCase):
     with self.assertRaisesWithPredicateMatch(tf_exp, expected_error_check):
       self.evaluate(f)
 
+  @test_util.run_v1_only("b/120545219")
   def testExceptionHandling(self):
     with self.cached_session():
       self._testExceptionHandling(ValueError, errors.InvalidArgumentError)
@@ -514,6 +531,7 @@ class PyFuncTest(test.TestCase):
       self.assertAllEqual(ret, [[3.0], [3.0], [3.0]])
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testEagerExceptionHandling(self):
     with test_util.device(use_gpu=True):
       self._testExceptionHandling(
@@ -533,6 +551,7 @@ class PyFuncTest(test.TestCase):
       self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testEagerReturningVariableRaisesError(self):
     def return_variable():
       return resource_variable_ops.ResourceVariable(0.0)
@@ -556,6 +575,7 @@ class PyFuncTest(test.TestCase):
     dy_dx = tape.gradient(y, x)
     self.assertEqual(self.evaluate(dy_dx), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraph(self):
 
     def f(x):
@@ -566,6 +586,7 @@ class PyFuncTest(test.TestCase):
     dy_dx = gradients_impl.gradients(y, x)[0]
     self.assertEqual(self.evaluate(dy_dx), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraphTwoOutputs(self):
 
     def f(x, y):
@@ -595,6 +616,7 @@ class PyFuncTest(test.TestCase):
     self.assertEqual(self.evaluate(dz_dx), 6.0)
     self.assertEqual(self.evaluate(dz_dy), 8.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraphMultipleArgs(self):
 
     def f(x, y):
@@ -608,6 +630,7 @@ class PyFuncTest(test.TestCase):
     self.assertEqual(self.evaluate(dz_dx), 6.0)
     self.assertEqual(self.evaluate(dz_dy), 8.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraphLogHuber(self):
 
     def log_huber(x, m):
@@ -629,6 +652,7 @@ class PyFuncTest(test.TestCase):
       self.assertEqual(y, 1.0)
       self.assertEqual(dy_dx, 2.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerRespectsDevicePlacmentOfOp(self):
 
     def f(x):
@@ -644,7 +668,7 @@ class PyFuncTest(test.TestCase):
       y = script_ops.eager_py_func(func=f, inp=[x], Tout=dtypes.float32)
       z = script_ops.eager_py_func(func=g, inp=[y], Tout=dtypes.float32)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       output = sess.run(z, feed_dict={x: 3.0})
       self.assertEqual(output, 18.0)
 
diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py
index 8848c15e765236c2ae2817cce1510c4c1ab04740..5adb95c7d60e88e43f6f171f6594c8542ef53143 100644
--- a/tensorflow/python/kernel_tests/qr_op_test.py
+++ b/tensorflow/python/kernel_tests/qr_op_test.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import linalg_ops
@@ -38,6 +40,7 @@ def _AddTest(test_class, op_name, testcase_name, fn):
 
 class QrOpTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to qr should be a tensor of at least rank 2.
     scalar = constant_op.constant(1.)
@@ -49,8 +52,9 @@ class QrOpTest(test.TestCase):
                                  "Shape must be at least rank 2 but is rank 1"):
       linalg_ops.qr(vector)
 
+  @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       all_ops = []
       for full_matrices_ in True, False:
         for rows_ in 4, 5:
@@ -60,7 +64,7 @@ class QrOpTest(test.TestCase):
             q1, r1 = linalg_ops.qr(matrix1, full_matrices=full_matrices_)
             q2, r2 = linalg_ops.qr(matrix2, full_matrices=full_matrices_)
             all_ops += [q1, r1, q2, r2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       for i in range(8):
         q = 4 * i
         self.assertAllEqual(val[q], val[q + 2])  # q1 == q2
@@ -100,7 +104,7 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
       tol = 1e-14
     # Tests that a ~= q*r.
     a_recon = math_ops.matmul(q, r)
-    self.assertAllClose(a_recon.eval(), a, rtol=tol, atol=tol)
+    self.assertAllClose(a_recon, a, rtol=tol, atol=tol)
 
   def CheckUnitary(self, x):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
@@ -110,8 +114,9 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
       tol = 1e-5
     else:
       tol = 1e-14
-    self.assertAllClose(identity.eval(), xx.eval(), atol=tol)
+    self.assertAllClose(identity, xx, atol=tol)
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(1)
     x_np = np.random.uniform(
@@ -121,7 +126,7 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
           low=-1.0, high=1.0,
           size=np.prod(shape_)).reshape(shape_).astype(dtype_)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       if use_static_shape_:
         x_tf = constant_op.constant(x_np)
       else:
@@ -129,7 +134,7 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
       q_tf, r_tf = linalg_ops.qr(x_tf, full_matrices=full_matrices_)
 
       if use_static_shape_:
-        q_tf_val, r_tf_val = sess.run([q_tf, r_tf])
+        q_tf_val, r_tf_val = self.evaluate([q_tf, r_tf])
       else:
         q_tf_val, r_tf_val = sess.run([q_tf, r_tf], feed_dict={x_tf: x_np})
 
@@ -160,6 +165,7 @@ class QrGradOpTest(test.TestCase):
 
 def _GetQrGradOpTest(dtype_, shape_, full_matrices_):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(42)
     a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
@@ -173,7 +179,7 @@ def _GetQrGradOpTest(dtype_, shape_, full_matrices_):
       tol = 3e-2
     else:
       tol = 1e-6
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       tf_a = constant_op.constant(a)
       tf_b = linalg_ops.qr(tf_a, full_matrices=full_matrices_)
       for b in tf_b:
@@ -200,7 +206,8 @@ if __name__ == "__main__":
       for cols in 1, 2, 5, 10, 32, 100:
         for full_matrices in False, True:
           for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10):
-            for use_static_shape in True, False:
+            # TF2 does not support placeholders under eager so we skip it
+            for use_static_shape in set([True, tf2.enabled()]):
               shape = batch_dims + (rows, cols)
               name = "%s_%s_full_%s_static_%s" % (dtype.__name__,
                                                   "_".join(map(str, shape)),
diff --git a/tensorflow/python/kernel_tests/random/BUILD b/tensorflow/python/kernel_tests/random/BUILD
index 3b3a28fc9a24104cc9032ab23dfc51e690d3ec94..dd81306db05aafac0d041320a193c7d92437a5fd 100644
--- a/tensorflow/python/kernel_tests/random/BUILD
+++ b/tensorflow/python/kernel_tests/random/BUILD
@@ -90,6 +90,21 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "stateless_random_ops_test",
+    size = "medium",
+    srcs = ["stateless_random_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:stateless_random_ops",
+    ],
+)
+
 cuda_py_test(
     name = "random_gamma_test",
     size = "medium",
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py
index 0023506b77aeb561da2f65183ce7efb60402ba4c..576720528e20d5b4595f106ed7203462e57b2ac7 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
@@ -39,7 +40,7 @@ class MultinomialTest(test.TestCase):
           num_samples=1000000,
           seed=15)
       for _ in range(100):
-        x = sess.run(samples)
+        x = self.evaluate(samples)
         indices, counts = np.unique(x, return_counts=True)
         for index, count in zip(indices, counts):
           if index in counts_by_indices.keys():
@@ -57,7 +58,7 @@ class MultinomialTest(test.TestCase):
           num_samples=1000000,
           seed=15)
       for _ in range(100):
-        x = sess.run(samples)
+        x = self.evaluate(samples)
         indices, counts = np.unique(x, return_counts=True)
         for index, count in zip(indices, counts):
           if index in counts_by_indices.keys():
@@ -66,6 +67,7 @@ class MultinomialTest(test.TestCase):
             counts_by_indices[index] = count
     self.assertEqual(counts_by_indices[0], 100000000)
 
+  @test_util.run_deprecated_v1
   def testLargeDynamicRange3(self):
     random_seed.set_random_seed(10)
     counts_by_indices = {}
@@ -79,7 +81,7 @@ class MultinomialTest(test.TestCase):
       # we'll run out of memory if we try to draw 1e9 samples directly
       # really should fit in 12GB of memory...
       for _ in range(100):
-        x = sess.run(samples)
+        x = self.evaluate(samples)
         indices, counts = np.unique(x, return_counts=True)
         for index, count in zip(indices, counts):
           if index in counts_by_indices.keys():
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index bd64d61af8e793e71a319b6ac1af95bd7dd16a3d..5d123307a8e62c072949d31d3c6b52a8fc39666b 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -66,12 +66,13 @@ class MultinomialTest(test.TestCase):
             logits, num_samples, output_dtype=output_dtype))
         self.assertAllEqual([[1] * num_samples, [2] * num_samples], samples)
 
+  @test_util.run_deprecated_v1
   def testOneOpMultipleStepsIndependent(self):
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       sample_op1, _ = self._make_ops(10)
       # Consecutive runs shouldn't yield identical output.
-      sample1a = sess.run(sample_op1)
-      sample1b = sess.run(sample_op1)
+      sample1a = self.evaluate(sample_op1)
+      sample1b = self.evaluate(sample_op1)
       self.assertFalse(np.equal(sample1a, sample1b).all())
 
   def testEagerOneOpMultipleStepsIndependent(self):
@@ -81,26 +82,27 @@ class MultinomialTest(test.TestCase):
       self.assertFalse(np.equal(sample1.numpy(), sample2.numpy()).all())
 
   def testTwoOpsIndependent(self):
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       sample_op1, sample_op2 = self._make_ops(32)
-      sample1, sample2 = sess.run([sample_op1, sample_op2])
+      sample1, sample2 = self.evaluate([sample_op1, sample_op2])
       # We expect sample1 and sample2 to be independent.
       # 1 in 2^32 chance of this assertion failing.
       self.assertFalse(np.equal(sample1, sample2).all())
 
+  @test_util.run_deprecated_v1
   def testTwoOpsSameSeedDrawSameSequences(self):
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       sample_op1, sample_op2 = self._make_ops(1000, seed=1)
-      sample1, sample2 = sess.run([sample_op1, sample_op2])
+      sample1, sample2 = self.evaluate([sample_op1, sample_op2])
       self.assertAllEqual(sample1, sample2)
 
   def testLargeLogits(self):
     for neg in [True, False]:
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         logits = np.array([[1000.] * 5])
         if neg:
           logits *= -1
-        samples = random_ops.multinomial(logits, 10).eval()
+        samples = self.evaluate(random_ops.multinomial(logits, 10))
       # Sampled classes should be in-range.
       self.assertTrue((samples >= 0).all())
       self.assertTrue((samples < 5).all())
@@ -157,10 +159,10 @@ class MultinomialTest(test.TestCase):
     Returns:
       Frequencies from sampled classes; shape [batch_size, num_classes].
     """
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       random_seed.set_random_seed(1618)
       op = sampler(constant_op.constant(logits), num_samples)
-      d = sess.run(op)
+      d = self.evaluate(op)
 
     batch_size, num_classes = logits.shape
     freqs_mat = []
@@ -186,25 +188,27 @@ class MultinomialTest(test.TestCase):
 
   def testEmpty(self):
     classes = 5
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       for batch in 0, 3:
         for samples in 0, 7:
-          x = random_ops.multinomial(
-              array_ops.zeros([batch, classes]), samples).eval()
+          x = self.evaluate(
+              random_ops.multinomial(
+                  array_ops.zeros([batch, classes]), samples))
           self.assertEqual(x.shape, (batch, samples))
 
+  @test_util.run_deprecated_v1
   def testEmptyClasses(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       x = random_ops.multinomial(array_ops.zeros([5, 0]), 7)
       with self.assertRaisesOpError("num_classes should be positive"):
-        x.eval()
+        self.evaluate(x)
 
   def testNegativeMinLogits(self):
     random_seed.set_random_seed(78844)
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       logits = constant_op.constant([[np.finfo(np.float32).min] * 1023 + [0]])
       num_samples = 1000
-      samples = random_ops.multinomial(logits, num_samples).eval()
+      samples = self.evaluate(random_ops.multinomial(logits, num_samples))
       self.assertAllEqual([[1023] * num_samples], samples)
 
 
diff --git a/tensorflow/python/kernel_tests/random/random_crop_test.py b/tensorflow/python/kernel_tests/random/random_crop_test.py
index 8ded522320b730955e08b43cbf6da537f437b095..724bee07157181fd40b3b0c9ca4a9afac0688e7d 100644
--- a/tensorflow/python/kernel_tests/random/random_crop_test.py
+++ b/tensorflow/python/kernel_tests/random/random_crop_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
 class RandomCropTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     # No random cropping is performed since the size is value.shape.
     for shape in (2, 1, 1), (2, 1, 3), (4, 5, 3):
@@ -44,10 +46,11 @@ class RandomCropTest(test.TestCase):
           for i in range(2) for j in range(3) for k in range(4))
       crop = random_ops.random_crop(value, size=target)
       for _ in range(20):
-        y = crop.eval()
+        y = self.evaluate(crop)
         self.assertAllEqual(y.shape, target)
         self.assertTrue(tuple(y.ravel()) in value_set)
 
+  @test_util.run_deprecated_v1
   def testRandomization(self):
     # Run 1x1 crop num_samples times in an image and ensure that one finds each
     # pixel 1/size of the time.
@@ -61,7 +64,7 @@ class RandomCropTest(test.TestCase):
       crop = random_ops.random_crop(value, single, seed=7)
       counts = np.zeros(size, dtype=np.int32)
       for _ in range(num_samples):
-        y = crop.eval()
+        y = self.evaluate(crop)
         self.assertAllEqual(y.shape, single)
         counts[y] += 1
 
diff --git a/tensorflow/python/kernel_tests/random/random_gamma_test.py b/tensorflow/python/kernel_tests/random/random_gamma_test.py
index d9699444937f91b18d73cddc78444e756aff7c07..a5952a21968c79c8bfbcbfef2b09852f24f29923 100644
--- a/tensorflow/python/kernel_tests/random/random_gamma_test.py
+++ b/tensorflow/python/kernel_tests/random/random_gamma_test.py
@@ -26,6 +26,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -43,19 +44,21 @@ class RandomGammaTest(test.TestCase):
   def _Sampler(self, num, alpha, beta, dtype, use_gpu, seed=None):
 
     def func():
-      with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+      with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
         rng = random_ops.random_gamma(
             [num], alpha, beta=beta, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
 
+  @test_util.run_deprecated_v1
   def testMomentsFloat32(self):
     self._testMoments(dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testMomentsFloat64(self):
     self._testMoments(dtypes.float64)
 
@@ -208,6 +211,7 @@ class RandomGammaTest(test.TestCase):
         sy = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=use_gpu, seed=345)
         self.assertAllEqual(sx(), sy())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
     """CSE = constant subexpression eliminator.
 
@@ -216,12 +220,13 @@ class RandomGammaTest(test.TestCase):
     """
     for dtype in dtypes.float16, dtypes.float32, dtypes.float64:
       for use_gpu in [False, True]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           rnd1 = random_ops.random_gamma([24], 2.0, dtype=dtype)
           rnd2 = random_ops.random_gamma([24], 2.0, dtype=dtype)
           diff = rnd2 - rnd1
           self.assertGreater(np.linalg.norm(diff.eval()), 0.1)
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     # Fully known shape.
     rnd = random_ops.random_gamma([150], 2.0)
@@ -253,6 +258,7 @@ class RandomGammaTest(test.TestCase):
     rnd = random_ops.random_gamma([50], array_ops.placeholder(dtypes.float32))
     self.assertIs(None, rnd.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testPositive(self):
     n = int(10e3)
     for dt in [dtypes.float16, dtypes.float32, dtypes.float64]:
diff --git a/tensorflow/python/kernel_tests/random/random_grad_test.py b/tensorflow/python/kernel_tests/random/random_grad_test.py
index d89056c485a3d68a0ea5527391196b41d5fc0090..aac6eeac06abca3148947901b92b43058fe76e3c 100644
--- a/tensorflow/python/kernel_tests/random/random_grad_test.py
+++ b/tensorflow/python/kernel_tests/random/random_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -45,6 +46,7 @@ class AddLeadingUnitDimensionsTest(test.TestCase):
     ret = random_grad.add_leading_unit_dimensions(1.0, 2)
     self.assertAllEqual(ret.shape, [1, 1])
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     x = array_ops.placeholder(dtypes.float32)
     num_dimensions = array_ops.placeholder(dtypes.int32)
@@ -72,6 +74,7 @@ class RandomGammaGradTest(test.TestCase):
   some statistical properties of the derivative.
   """
 
+  @test_util.run_deprecated_v1
   def testGradientsShape(self):
     shape = [2, 3]
     alpha = array_ops.ones([2, 2])
@@ -81,6 +84,7 @@ class RandomGammaGradTest(test.TestCase):
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
 
+  @test_util.run_deprecated_v1
   def testGradientsShapeWithOneSamplePerParameter(self):
     shape = []
     alpha = array_ops.ones([2, 2])
@@ -90,6 +94,7 @@ class RandomGammaGradTest(test.TestCase):
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
 
+  @test_util.run_deprecated_v1
   def testGradientsUnknownShape(self):
     shape = array_ops.placeholder(dtypes.int32)
     alpha = array_ops.placeholder(dtypes.float32)
@@ -138,9 +143,11 @@ class RandomGammaGradTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot use special functions in a test: %s" % str(e))
 
+  @test_util.run_deprecated_v1
   def testCompareToExplicitDerivativeFloat(self):
     self._testCompareToExplicitDerivative(dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testCompareToExplicitDerivativeDouble(self):
     self._testCompareToExplicitDerivative(dtypes.float64)
 
@@ -182,12 +189,15 @@ class RandomGammaGradTest(test.TestCase):
 
     self.assertAllClose(actual_val, expected_val, rtol=1e-3, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testCompareToImplicitDerivativeFloat(self):
     self._testCompareToImplicitDerivative(dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testCompareToImplicitDerivativeDouble(self):
     self._testCompareToImplicitDerivative(dtypes.float64)
 
+  @test_util.run_deprecated_v1
   def testAverageAlphaGradient(self):
     """Statistical test for the gradient.
 
@@ -207,6 +217,7 @@ class RandomGammaGradTest(test.TestCase):
     dsample_dalpha_val = self.evaluate(dsample_dalpha)
     self.assertAllClose(dsample_dalpha_val, [1.0] * 3, atol=1e-1, rtol=1e-1)
 
+  @test_util.run_deprecated_v1
   def testQuadraticLoss(self):
     """Statistical test for the gradient.
 
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index d199a9d9dd54bbaf697c5c0f1c35f346b0eea368..1384c3f446f97a76792a27cfc7f679e80402cbf0 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -44,14 +45,14 @@ class RandomOpTestCommon(test.TestCase):
                                     use_gpu,
                                     op_seed=None,
                                     graph_seed=None):
-    with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+    with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
       if graph_seed is not None:
         random_seed.set_random_seed(graph_seed)
       x = rng_func([num], min_or_mean, max_or_stddev, dtype=dtype, seed=op_seed)
 
-      y = sess.run(x)
-      z = sess.run(x)
-      w = sess.run(x)
+      y = self.evaluate(x)
+      z = self.evaluate(x)
+      w = self.evaluate(x)
 
       # We use exact equality here. If the random-number generator is producing
       # the same output, all three outputs will be bitwise identical.
@@ -64,12 +65,12 @@ class RandomNormalTest(RandomOpTestCommon):
   def _Sampler(self, num, mu, sigma, dtype, use_gpu, seed=None):
 
     def func():
-      with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+      with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
         rng = random_ops.random_normal(
             [num], mean=mu, stddev=sigma, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -92,6 +93,7 @@ class RandomNormalTest(RandomOpTestCommon):
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
+  @test_util.run_deprecated_v1
   def testCPUGPUMatch(self):
     for dt in dtypes.float16, dtypes.float32, dtypes.float64:
       results = {}
@@ -104,27 +106,31 @@ class RandomNormalTest(RandomOpTestCommon):
       else:
         self.assertAllClose(results[False], results[True], rtol=1e-6, atol=1e-6)
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
     for dt in dtypes.float16, dtypes.float32, dtypes.float64:
       sx = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=True, seed=345)
       sy = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=True, seed=345)
       self.assertAllEqual(sx(), sy())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.session(use_gpu=use_gpu):
         shape = [2, 3, 4]
         rnd1 = random_ops.random_normal(shape, 0.0, 1.0, dtypes.float32)
         rnd2 = random_ops.random_normal(shape, 0.0, 1.0, dtypes.float32)
         diff = rnd2 - rnd1
         self.assertTrue(np.linalg.norm(diff.eval()) > 0.1)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionNotConstant(self):
     for use_gpu in [False, True]:
       for dt in dtypes.float16, dtypes.float32, dtypes.float64:
         self._testSingleSessionNotConstant(
             random_ops.random_normal, 100, dt, 0.0, 1.0, use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionOpSeedNotConstant(self):
     for use_gpu in [False, True]:
       for dt in dtypes.float16, dtypes.float32, dtypes.float64:
@@ -137,6 +143,7 @@ class RandomNormalTest(RandomOpTestCommon):
             use_gpu=use_gpu,
             op_seed=1345)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionGraphSeedNotConstant(self):
     for use_gpu in [False, True]:
       for dt in dtypes.float16, dtypes.float32, dtypes.float64:
@@ -155,12 +162,12 @@ class TruncatedNormalTest(test.TestCase):
   def _Sampler(self, num, mu, sigma, dtype, use_gpu, seed=None):
 
     def func():
-      with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+      with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
         rng = random_ops.truncated_normal(
             [num], mean=mu, stddev=sigma, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -185,6 +192,7 @@ class TruncatedNormalTest(test.TestCase):
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
+  @test_util.run_deprecated_v1
   def testCPUGPUMatch(self):
     # Skip the test if there is no GPU.
     if not test.is_gpu_available():
@@ -203,6 +211,7 @@ class TruncatedNormalTest(test.TestCase):
       else:
         self.assertAllClose(results[False], results[True], rtol=1e-6, atol=1e-6)
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
     for dt in dtypes.float16, dtypes.float32, dtypes.float64:
       sx = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=True, seed=345)
@@ -219,15 +228,17 @@ class TruncatedNormalTest(test.TestCase):
       print("std(x)", np.std(x), abs(np.std(x) / stddev - 0.85))
       self.assertTrue(abs(np.std(x) / stddev - 0.85) < 0.04)
 
+  @test_util.run_deprecated_v1
   def testLargeShape(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v = variables.Variable(
           array_ops.zeros(dtype=dtypes.float32, shape=[2**33, 1]))
       n = random_ops.truncated_normal(v.shape)
       self.assertEqual([8589934592, 1], n.shape.as_list())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       shape = [2, 3, 4]
       rnd1 = random_ops.truncated_normal(shape, 0.0, 1.0, dtypes.float32)
       rnd2 = random_ops.truncated_normal(shape, 0.0, 1.0, dtypes.float32)
@@ -251,12 +262,12 @@ class RandomUniformTest(RandomOpTestCommon):
   def _Sampler(self, num, minv, maxv, dtype, use_gpu, seed=None):
 
     def func():
-      with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+      with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
         rng = random_ops.random_uniform(
             [num], minval=minv, maxval=maxv, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -287,6 +298,7 @@ class RandomUniformTest(RandomOpTestCommon):
         print("count = ", count)
       self.assertTrue(count < count_limit)
 
+  @test_util.run_deprecated_v1
   def testUniformIntsWithInvalidShape(self):
     for dtype in dtypes.int32, dtypes.int64:
       with self.assertRaisesRegexp(
@@ -299,6 +311,7 @@ class RandomUniformTest(RandomOpTestCommon):
             [1000], minval=1, maxval=[2, 3], dtype=dtype)
 
   # Check that uniform ints actually follow a uniform distribution.
+  @test_util.run_deprecated_v1
   def testUniformInts(self):
     minv = -2
     maxv = 15
@@ -331,6 +344,7 @@ class RandomUniformTest(RandomOpTestCommon):
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
+  @test_util.run_deprecated_v1
   def testCPUGPUMatch(self):
     for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
                dtypes.int64):
@@ -342,6 +356,7 @@ class RandomUniformTest(RandomOpTestCommon):
         results[use_gpu] = sampler()
       self.assertAllEqual(results[False], results[True])
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
     for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
                dtypes.int64):
@@ -350,15 +365,17 @@ class RandomUniformTest(RandomOpTestCommon):
         sy = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed)
         self.assertAllEqual(sx(), sy())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
     shape = [2, 3, 4]
     for dtype in dtypes.float16, dtypes.float32, dtypes.int32:
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         rnd1 = random_ops.random_uniform(shape, 0, 17, dtype=dtype)
         rnd2 = random_ops.random_uniform(shape, 0, 17, dtype=dtype)
         diff = (rnd2 - rnd1).eval()
         self.assertTrue(np.linalg.norm(diff) > 0.1)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionNotConstant(self):
     for use_gpu in [False, True]:
       for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
@@ -366,6 +383,7 @@ class RandomUniformTest(RandomOpTestCommon):
         self._testSingleSessionNotConstant(
             random_ops.random_uniform, 100, dt, 0, 17, use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionOpSeedNotConstant(self):
     for use_gpu in [False, True]:
       for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
@@ -379,6 +397,7 @@ class RandomUniformTest(RandomOpTestCommon):
             use_gpu=use_gpu,
             op_seed=1345)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionGraphSeedNotConstant(self):
     for use_gpu in [False, True]:
       for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
@@ -395,6 +414,7 @@ class RandomUniformTest(RandomOpTestCommon):
 
 class RandomShapeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testTruncatedNormal(self):
     # Fully known shape.
     rnd1 = random_ops.truncated_normal([1, 2, 3])
@@ -407,6 +427,7 @@ class RandomShapeTest(test.TestCase):
     rnd3 = random_ops.truncated_normal(array_ops.placeholder(dtypes.int32))
     self.assertIs(None, rnd3.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testRandomNormal(self):
     # Fully known shape.
     rnd1 = random_ops.random_normal([1, 2, 3])
@@ -419,6 +440,7 @@ class RandomShapeTest(test.TestCase):
     rnd3 = random_ops.random_normal(array_ops.placeholder(dtypes.int32))
     self.assertIs(None, rnd3.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testRandomUniform(self):
     # Fully known shape.
     rnd1 = random_ops.random_uniform([1, 2, 3])
diff --git a/tensorflow/python/kernel_tests/random/random_poisson_test.py b/tensorflow/python/kernel_tests/random/random_poisson_test.py
index 15ab95cdb7727b86be579c6136de0483ccdc889e..0a6b004d682e5d810a5a3e09ca6dce867e5f41f1 100644
--- a/tensorflow/python/kernel_tests/random/random_poisson_test.py
+++ b/tensorflow/python/kernel_tests/random/random_poisson_test.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -39,11 +40,11 @@ class RandomPoissonTest(test.TestCase):
   def _Sampler(self, num, lam, dtype, use_gpu, seed=None):
 
     def func():
-      with self.test_session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
+      with self.session(use_gpu=use_gpu, graph=ops.Graph()) as sess:
         rng = random_ops.random_poisson(lam, [num], dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -104,6 +105,7 @@ class RandomPoissonTest(test.TestCase):
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
+  @test_util.run_deprecated_v1
   def testCPUGPUMatch(self):
     for dt in _SUPPORTED_DTYPES:
       results = {}
@@ -115,12 +117,14 @@ class RandomPoissonTest(test.TestCase):
       else:
         self.assertAllClose(results[False], results[True], rtol=1e-6, atol=1e-6)
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
     for dt in dtypes.float16, dtypes.float32, dtypes.float64:
       sx = self._Sampler(1000, 1.0, dt, use_gpu=True, seed=345)
       sy = self._Sampler(1000, 1.0, dt, use_gpu=True, seed=345)
       self.assertAllEqual(sx(), sy())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
     """CSE = constant subexpression eliminator.
 
@@ -128,7 +132,7 @@ class RandomPoissonTest(test.TestCase):
     merged.
     """
     for dtype in dtypes.float16, dtypes.float32, dtypes.float64:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         rnd1 = random_ops.random_poisson(2.0, [24], dtype=dtype)
         rnd2 = random_ops.random_poisson(2.0, [24], dtype=dtype)
         diff = rnd2 - rnd1
@@ -140,8 +144,9 @@ class RandomPoissonTest(test.TestCase):
     with self.cached_session():
       rnd = random_ops.random_poisson([], [], seed=12345)
       self.assertEqual([0], rnd.get_shape().as_list())
-      self.assertAllClose(np.array([], dtype=np.float32), rnd.eval())
+      self.assertAllClose(np.array([], dtype=np.float32), self.evaluate(rnd))
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     # Fully known shape
     rnd = random_ops.random_poisson(2.0, [150], seed=12345)
@@ -184,6 +189,7 @@ class RandomPoissonTest(test.TestCase):
         seed=12345)
     self.assertIs(None, rnd.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testDTypeCombinationsV2(self):
     """Tests random_poisson_v2() for all supported dtype combinations."""
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
index 0d85a072d4a2ff168f5e1c3233c7f7faf5c69a32..dd814a22b4e59261b33e1a57fd9014147792858b 100644
--- a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
+++ b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
@@ -29,11 +29,13 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
+@test_util.run_v1_only("b/120545219")
 class RandomShuffleQueueTest(test.TestCase):
 
   def setUp(self):
@@ -84,9 +86,9 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeue_t = q.dequeue()
       results = []
       for _ in range(2):
-        a, b = sess.run(dequeue_t)
+        a, b = self.evaluate(dequeue_t)
         results.append((a, b))
-      a, b = sess.run(q.dequeue_many(3))
+      a, b = self.evaluate(q.dequeue_many(3))
       for i in range(3):
         results.append((a[i], b[i]))
       self.assertItemsEqual([(1, [5]), (2, [6]), (3, [7]), (4, [8]), (9, [10])],
@@ -101,7 +103,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [
           self.checkedThread(
@@ -133,7 +135,7 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.append(sess.run(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in enqueue_ops]
       for thread in threads:
@@ -167,13 +169,13 @@ class RandomShuffleQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         for enqueue_op in enqueue_ops:
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       results = []
 
       def dequeue():
         for _ in xrange(len(elems)):
-          results.append(sess.run(dequeued_t))
+          results.append(self.evaluate(dequeued_t))
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -197,7 +199,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       results = []
       for _ in xrange(len(elems)):
-        x, y = sess.run(dequeued_t)
+        x, y = self.evaluate(dequeued_t)
         results.append((x, y))
       self.assertItemsEqual(elems, results)
 
@@ -215,9 +217,9 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual([1], size.eval())
+      self.assertEqual([1], self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual([0], size.eval())
+      self.assertEqual([0], self.evaluate(size))
 
   def testEnqueueMany(self):
     with self.cached_session():
@@ -241,9 +243,9 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op = q.enqueue_many((empty_t,))
       size_t = q.size()
 
-      self.assertEqual(0, size_t.eval())
+      self.assertEqual(0, self.evaluate(size_t))
       enqueue_op.run()
-      self.assertEqual(0, size_t.eval())
+      self.assertEqual(0, self.evaluate(size_t))
 
   def testEmptyDequeueMany(self):
     with self.cached_session():
@@ -251,9 +253,9 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueUpTo(self):
     with self.cached_session():
@@ -261,9 +263,9 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_up_to(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueManyWithNoShape(self):
     with self.cached_session():
@@ -275,7 +277,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # Expect the operation to fail due to the shape not being constrained.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
       enqueue_op.run()
 
@@ -284,7 +286,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # elements enqueued.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testEmptyDequeueUpToWithNoShape(self):
     with self.cached_session():
@@ -296,7 +298,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # Expect the operation to fail due to the shape not being constrained.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
       enqueue_op.run()
 
@@ -305,7 +307,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # elements enqueued.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testMultiEnqueueMany(self):
     with self.cached_session() as sess:
@@ -321,7 +323,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       results = []
       for _ in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         results.append((float_val, [int_val[0], int_val[1]]))
       expected = list(zip(float_elems, int_elems)) * 2
       self.assertItemsEqual(expected, results)
@@ -335,7 +337,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      results = dequeued_t.eval().tolist()
+      results = self.evaluate(dequeued_t).tolist()
       results.extend(dequeued_t.eval())
       self.assertItemsEqual(elems, results)
 
@@ -348,7 +350,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      results = dequeued_t.eval().tolist()
+      results = self.evaluate(dequeued_t).tolist()
       results.extend(dequeued_t.eval())
       self.assertItemsEqual(elems, results)
 
@@ -368,20 +370,20 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       results = []
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertEqual(float_val.shape, dequeued_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_t[1].get_shape())
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_single_t[1].get_shape())
       results.append((float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       results.append((float_val, int_val.tolist()))
 
       self.assertItemsEqual(zip(float_elems, int_elems), results)
@@ -402,21 +404,21 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       results = []
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       # dequeue_up_to has undefined shape.
       self.assertEqual([None], dequeued_t[0].get_shape().as_list())
       self.assertEqual([None, 2], dequeued_t[1].get_shape().as_list())
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_single_t[1].get_shape())
       results.append((float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       results.append((float_val, int_val.tolist()))
 
       self.assertItemsEqual(zip(float_elems, int_elems), results)
@@ -442,7 +444,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       # Enqueue 100 items in parallel on 10 threads.
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       for thread in threads:
@@ -466,7 +468,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -489,7 +491,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -515,7 +517,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue(dequeue_op):
-        dequeued_elems.extend(sess.run(dequeue_op))
+        dequeued_elems.extend(self.evaluate(dequeue_op))
 
       threads = []
       for dequeue_op in dequeue_ops:
@@ -539,10 +541,10 @@ class RandomShuffleQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -566,10 +568,10 @@ class RandomShuffleQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -649,7 +651,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # Expect the operation to fail due to the queue being closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                    "is closed and has insufficient"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testBlockingDequeueFromClosedQueue(self):
     with self.cached_session() as sess:
@@ -665,18 +667,18 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       # Manually dequeue until we hit min_size.
-      results.append(sess.run(dequeued_t))
-      results.append(sess.run(dequeued_t))
+      results.append(self.evaluate(dequeued_t))
+      results.append(self.evaluate(dequeued_t))
 
       def blocking_dequeue():
-        results.append(sess.run(dequeued_t))
-        results.append(sess.run(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
 
         self.assertItemsEqual(elems, results)
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=blocking_dequeue)
       dequeue_thread.start()
@@ -701,7 +703,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
         finished.append(True)
 
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -727,12 +729,12 @@ class RandomShuffleQueueTest(test.TestCase):
       progress = []  # Must be mutable
 
       def dequeue():
-        self.assertItemsEqual(elems, sess.run(dequeued_t))
+        self.assertItemsEqual(elems, self.evaluate(dequeued_t))
         progress.append(1)
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
         progress.append(2)
 
       self.assertEqual(len(progress), 0)
@@ -763,9 +765,9 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEquals(3, len(results))
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEquals(4, len(results))
 
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -794,11 +796,11 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEquals(3, len(results))
         # min_after_dequeue is 2, we ask for 3 elements, and we end up only
         # getting the remaining 1.
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEquals(4, len(results))
 
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -824,16 +826,16 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEqual(len(results), 3)
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
         # While the last dequeue failed, we want to insure that it returns
         # any elements that it potentially reserved to dequeue. Thus the
         # next cleanup should return a single element.
-        results.extend(sess.run(cleanup_dequeue_t))
+        results.extend(self.evaluate(cleanup_dequeue_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -854,7 +856,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -874,7 +876,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -922,7 +924,7 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -950,7 +952,7 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -987,11 +989,11 @@ class RandomShuffleQueueTest(test.TestCase):
       def blocking_enqueue():
         # Expect the operation to succeed since it will complete
         # before the queue is closed.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.CancelledError, "closed"):
-          sess.run(blocking_enqueue_op)
+          self.evaluate(blocking_enqueue_op)
 
       thread1 = self.checkedThread(target=blocking_enqueue)
       thread1.start()
@@ -1001,7 +1003,7 @@ class RandomShuffleQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def blocking_close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       thread2 = self.checkedThread(target=blocking_close)
       thread2.start()
@@ -1032,7 +1034,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       def blocking_enqueue():
         # This will block until the dequeue after the close.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread1 = self.checkedThread(target=blocking_enqueue)
       thread1.start()
@@ -1040,7 +1042,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # First blocking_enqueue_op of blocking_enqueue has enqueued 1 of 2
       # elements, and is blocked waiting for one more element to be dequeue.
       for i in range(50):
-        queue_size = size_t.eval()
+        queue_size = self.evaluate(size_t)
         if queue_size == 4:
           break
         elif i == 49:
@@ -1050,7 +1052,7 @@ class RandomShuffleQueueTest(test.TestCase):
         time.sleep(0.1)
 
       def blocking_close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       thread2 = self.checkedThread(target=blocking_close)
       thread2.start()
@@ -1064,7 +1066,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # At this point the close operation will complete, so the next enqueue
       # will fail.
       with self.assertRaisesRegexp(errors_impl.CancelledError, "closed"):
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
   def testSharedQueueSameSession(self):
     with self.cached_session():
@@ -1216,23 +1218,23 @@ class RandomShuffleQueueTest(test.TestCase):
 
   def _blockingDequeue(self, sess, dequeue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_op)
+      self.evaluate(dequeue_op)
 
   def _blockingDequeueMany(self, sess, dequeue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_many_op)
+      self.evaluate(dequeue_many_op)
 
   def _blockingDequeueUpTo(self, sess, dequeue_up_to_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_up_to_op)
+      self.evaluate(dequeue_up_to_op)
 
   def _blockingEnqueue(self, sess, enqueue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
 
   def _blockingEnqueueMany(self, sess, enqueue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_many_op)
+      self.evaluate(enqueue_many_op)
 
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
@@ -1383,7 +1385,7 @@ class RandomShuffleQueueTest(test.TestCase):
       def blocking_enqueue():
         enq_done.append(False)
         # This will fill the queue and then block until enough dequeues happen.
-        sess.run(enq)
+        self.evaluate(enq)
         enq_done.append(True)
 
       thread = self.checkedThread(target=blocking_enqueue)
@@ -1393,14 +1395,14 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
       results.append(deq.eval())  # Will only complete after the enqueue starts.
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       for _ in range(3):
         results.append(deq.eval())
 
       time.sleep(0.1)
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       # This dequeue will unblock the thread.
       results.append(deq.eval())
@@ -1415,6 +1417,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       self.assertItemsEqual(elem, results)
 
+  @test_util.run_v1_only("b/120545219")
   def testBigDequeueMany(self):
     with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(2, 0, dtypes_lib.int32, ((),))
@@ -1426,7 +1429,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       def blocking_dequeue():
         # Will only complete after 4 enqueues complete.
-        results.extend(sess.run(deq))
+        results.extend(self.evaluate(deq))
 
       thread = self.checkedThread(target=blocking_dequeue)
       thread.start()
@@ -1435,7 +1438,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         self.assertEqual(len(results), 0)
-        sess.run(enq)
+        self.evaluate(enq)
 
       # Enough enqueued to unblock the dequeue
       thread.join()
diff --git a/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py b/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..898f38444b7fcab52129c4e53761cdb78c2fd825
--- /dev/null
+++ b/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
@@ -0,0 +1,160 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateless random ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import numpy as np
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import stateless_random_ops as stateless
+from tensorflow.python.platform import test
+
+
+def invert_philox(key, value):
+  """Invert the Philox bijection."""
+  key = np.array(key, dtype=np.uint32)
+  value = np.array(value, dtype=np.uint32)
+  step = np.array([0x9E3779B9, 0xBB67AE85], dtype=np.uint32)
+  for n in range(10)[::-1]:
+    key0, key1 = key + n * step
+    v0 = value[3] * 0x991a7cdb & 0xffffffff
+    v2 = value[1] * 0x6d7cae67 & 0xffffffff
+    hi0 = v0 * 0xD2511F53 >> 32
+    hi1 = v2 * 0xCD9E8D57 >> 32
+    v1 = hi1 ^ value[0] ^ key0
+    v3 = hi0 ^ value[2] ^ key1
+    value = v0, v1, v2, v3
+  return np.array(value)
+
+
+class StatelessOpsTest(test.TestCase):
+
+  def _test_match(self, cases):
+    # Stateless ops should be the same as stateful ops on the first call
+    # after seed scrambling.
+    cases = tuple(cases)
+    key = 0x3ec8f720, 0x02461e29
+    for seed in (7, 17), (11, 5), (2, 3):
+      preseed = invert_philox(key, (seed[0], 0, seed[1], 0)).astype(np.uint64)
+      preseed = preseed[::2] | preseed[1::2] << 32
+      random_seed.set_random_seed(seed[0])
+      with test_util.use_gpu():
+        for stateless_op, stateful_op in cases:
+          stateful = stateful_op(seed=seed[1])
+          pure = stateless_op(seed=preseed)
+          self.assertAllEqual(self.evaluate(stateful), self.evaluate(pure))
+
+  def _test_determinism(self, cases):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    cases = tuple(cases)
+    with self.test_session(use_gpu=True):
+      for seed_type in [dtypes.int32, dtypes.int64]:
+        seed_t = array_ops.placeholder(seed_type, shape=[2])
+        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+        for stateless_op, _ in cases:
+          pure = stateless_op(seed=seed_t)
+          values = [
+              (seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds
+          ]
+          for s0, v0 in values:
+            for s1, v1 in values:
+              self.assertEqual(s0 == s1, np.all(v0 == v1))
+
+  def _float_cases(self, shape_dtypes=(None,)):
+    float_cases = (
+        # Uniform distribution, with and without range
+        (stateless.stateless_random_uniform, random_ops.random_uniform, {}),
+        (stateless.stateless_random_uniform, random_ops.random_uniform,
+         dict(minval=2.2, maxval=7.1)),
+        # Normal distribution, with and without mean+stddev
+        (stateless.stateless_random_normal, random_ops.random_normal, {}),
+        (stateless.stateless_random_normal, random_ops.random_normal,
+         dict(mean=2, stddev=3)),
+        # Truncated normal distribution, with and without mean+stddev
+        (stateless.stateless_truncated_normal, random_ops.truncated_normal, {}),
+        (stateless.stateless_truncated_normal, random_ops.truncated_normal,
+         dict(mean=3, stddev=4)),
+    )
+    for dtype in dtypes.float16, dtypes.float32, dtypes.float64:
+      for shape_dtype in shape_dtypes:
+        for shape in (), (3,), (2, 5):
+          if shape_dtype is not None:
+            shape = constant_op.constant(shape, dtype=shape_dtype)
+          for stateless_op, stateful_op, kwds in float_cases:
+            kwds = dict(shape=shape, dtype=dtype, **kwds)
+            yield (functools.partial(stateless_op, **kwds),
+                   functools.partial(stateful_op, **kwds))
+
+  def _int_cases(self, shape_dtypes=(None,)):
+    for shape_dtype in shape_dtypes:
+      for shape in (), (3,), (2, 5):
+        if shape_dtype is not None:
+          shape = constant_op.constant(shape, dtype=shape_dtype)
+        for dtype in dtypes.int32, dtypes.int64:
+          kwds = dict(minval=2, maxval=11111, dtype=dtype, shape=shape)
+          yield (functools.partial(stateless.stateless_random_uniform, **kwds),
+                 functools.partial(random_ops.random_uniform, **kwds))
+
+  def _multinomial_cases(self):
+    num_samples = 10
+    for logits_dtype in np.float16, np.float32, np.float64:
+      for output_dtype in dtypes.int32, dtypes.int64:
+        for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                  [0.25, 0.75]]):
+          kwds = dict(
+              logits=constant_op.constant(logits, dtype=logits_dtype),
+              num_samples=num_samples,
+              output_dtype=output_dtype)
+          yield (functools.partial(stateless.stateless_multinomial, **kwds),
+                 functools.partial(random_ops.multinomial, **kwds))
+
+  @test_util.run_deprecated_v1
+  def testMatchFloat(self):
+    self._test_match(self._float_cases())
+
+  @test_util.run_deprecated_v1
+  def testMatchInt(self):
+    self._test_match(self._int_cases())
+
+  @test_util.run_deprecated_v1
+  def testMatchMultinomial(self):
+    self._test_match(self._multinomial_cases())
+
+  @test_util.run_deprecated_v1
+  def testDeterminismFloat(self):
+    self._test_determinism(
+        self._float_cases(shape_dtypes=(dtypes.int32, dtypes.int64)))
+
+  @test_util.run_deprecated_v1
+  def testDeterminismInt(self):
+    self._test_determinism(
+        self._int_cases(shape_dtypes=(dtypes.int32, dtypes.int64)))
+
+  @test_util.run_deprecated_v1
+  def testDeterminismMultinomial(self):
+    self._test_determinism(self._multinomial_cases())
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 8c84b2a49f506891ed05f2cd488861fa82480e89..43d15817e97e37a372dee940ef2c6baa35d8be24 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -28,6 +28,7 @@ import zlib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import tf_record
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import io_ops
@@ -140,147 +141,147 @@ class TFCompressionTestCase(test.TestCase):
 
 class IdentityReaderTest(test.TestCase):
 
-  def _ExpectRead(self, sess, key, value, expected):
-    k, v = sess.run([key, value])
+  def _ExpectRead(self, key, value, expected):
+    k, v = self.evaluate([key, value])
     self.assertAllEqual(expected, k)
     self.assertAllEqual(expected, v)
 
+  @test_util.run_deprecated_v1
   def testOneEpoch(self):
-    with self.cached_session() as sess:
-      reader = io_ops.IdentityReader("test_reader")
-      work_completed = reader.num_work_units_completed()
-      produced = reader.num_records_produced()
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      queued_length = queue.size()
-      key, value = reader.read(queue)
+    reader = io_ops.IdentityReader("test_reader")
+    work_completed = reader.num_work_units_completed()
+    produced = reader.num_records_produced()
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    queued_length = queue.size()
+    key, value = reader.read(queue)
 
-      self.assertAllEqual(0, work_completed.eval())
-      self.assertAllEqual(0, produced.eval())
-      self.assertAllEqual(0, queued_length.eval())
+    self.assertAllEqual(0, self.evaluate(work_completed))
+    self.assertAllEqual(0, self.evaluate(produced))
+    self.assertAllEqual(0, self.evaluate(queued_length))
 
-      queue.enqueue_many([["A", "B", "C"]]).run()
-      queue.close().run()
-      self.assertAllEqual(3, queued_length.eval())
+    self.evaluate(queue.enqueue_many([["A", "B", "C"]]))
+    self.evaluate(queue.close())
+    self.assertAllEqual(3, self.evaluate(queued_length))
 
-      self._ExpectRead(sess, key, value, b"A")
-      self.assertAllEqual(1, produced.eval())
+    self._ExpectRead(key, value, b"A")
+    self.assertAllEqual(1, self.evaluate(produced))
 
-      self._ExpectRead(sess, key, value, b"B")
+    self._ExpectRead(key, value, b"B")
 
-      self._ExpectRead(sess, key, value, b"C")
-      self.assertAllEqual(3, produced.eval())
-      self.assertAllEqual(0, queued_length.eval())
+    self._ExpectRead(key, value, b"C")
+    self.assertAllEqual(3, self.evaluate(produced))
+    self.assertAllEqual(0, self.evaluate(queued_length))
 
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        sess.run([key, value])
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      self.evaluate([key, value])
 
-      self.assertAllEqual(3, work_completed.eval())
-      self.assertAllEqual(3, produced.eval())
-      self.assertAllEqual(0, queued_length.eval())
+    self.assertAllEqual(3, self.evaluate(work_completed))
+    self.assertAllEqual(3, self.evaluate(produced))
+    self.assertAllEqual(0, self.evaluate(queued_length))
 
+  @test_util.run_deprecated_v1
   def testMultipleEpochs(self):
-    with self.cached_session() as sess:
-      reader = io_ops.IdentityReader("test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      enqueue = queue.enqueue_many([["DD", "EE"]])
-      key, value = reader.read(queue)
-
-      enqueue.run()
-      self._ExpectRead(sess, key, value, b"DD")
-      self._ExpectRead(sess, key, value, b"EE")
-      enqueue.run()
-      self._ExpectRead(sess, key, value, b"DD")
-      self._ExpectRead(sess, key, value, b"EE")
-      enqueue.run()
-      self._ExpectRead(sess, key, value, b"DD")
-      self._ExpectRead(sess, key, value, b"EE")
-      queue.close().run()
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        sess.run([key, value])
-
+    reader = io_ops.IdentityReader("test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    enqueue = queue.enqueue_many([["DD", "EE"]])
+    key, value = reader.read(queue)
+
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, b"DD")
+    self._ExpectRead(key, value, b"EE")
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, b"DD")
+    self._ExpectRead(key, value, b"EE")
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, b"DD")
+    self._ExpectRead(key, value, b"EE")
+    self.evaluate(queue.close())
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      self.evaluate([key, value])
+
+  @test_util.run_deprecated_v1
   def testSerializeRestore(self):
-    with self.cached_session() as sess:
-      reader = io_ops.IdentityReader("test_reader")
-      produced = reader.num_records_produced()
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      queue.enqueue_many([["X", "Y", "Z"]]).run()
-      key, value = reader.read(queue)
-
-      self._ExpectRead(sess, key, value, b"X")
-      self.assertAllEqual(1, produced.eval())
-      state = reader.serialize_state().eval()
-
-      self._ExpectRead(sess, key, value, b"Y")
-      self._ExpectRead(sess, key, value, b"Z")
-      self.assertAllEqual(3, produced.eval())
-
-      queue.enqueue_many([["Y", "Z"]]).run()
-      queue.close().run()
-      reader.restore_state(state).run()
-      self.assertAllEqual(1, produced.eval())
-      self._ExpectRead(sess, key, value, b"Y")
-      self._ExpectRead(sess, key, value, b"Z")
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        sess.run([key, value])
-      self.assertAllEqual(3, produced.eval())
-
-      self.assertEqual(bytes, type(state))
-
-      with self.assertRaises(ValueError):
-        reader.restore_state([])
-
-      with self.assertRaises(ValueError):
-        reader.restore_state([state, state])
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(state[1:]).run()
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(state[:-1]).run()
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(state + b"ExtraJunk").run()
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(b"PREFIX" + state).run()
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(b"BOGUS" + state[5:]).run()
-
+    reader = io_ops.IdentityReader("test_reader")
+    produced = reader.num_records_produced()
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    self.evaluate(queue.enqueue_many([["X", "Y", "Z"]]))
+    key, value = reader.read(queue)
+
+    self._ExpectRead(key, value, b"X")
+    self.assertAllEqual(1, self.evaluate(produced))
+    state = self.evaluate(reader.serialize_state())
+
+    self._ExpectRead(key, value, b"Y")
+    self._ExpectRead(key, value, b"Z")
+    self.assertAllEqual(3, self.evaluate(produced))
+
+    self.evaluate(queue.enqueue_many([["Y", "Z"]]))
+    self.evaluate(queue.close())
+    self.evaluate(reader.restore_state(state))
+    self.assertAllEqual(1, self.evaluate(produced))
+    self._ExpectRead(key, value, b"Y")
+    self._ExpectRead(key, value, b"Z")
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      self.evaluate([key, value])
+    self.assertAllEqual(3, self.evaluate(produced))
+
+    self.assertEqual(bytes, type(state))
+
+    with self.assertRaises(ValueError):
+      reader.restore_state([])
+
+    with self.assertRaises(ValueError):
+      reader.restore_state([state, state])
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(state[1:]))
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(state[:-1]))
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(state + b"ExtraJunk"))
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(b"PREFIX" + state))
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(b"BOGUS" + state[5:]))
+
+  @test_util.run_deprecated_v1
   def testReset(self):
-    with self.cached_session() as sess:
-      reader = io_ops.IdentityReader("test_reader")
-      work_completed = reader.num_work_units_completed()
-      produced = reader.num_records_produced()
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      queued_length = queue.size()
-      key, value = reader.read(queue)
+    reader = io_ops.IdentityReader("test_reader")
+    work_completed = reader.num_work_units_completed()
+    produced = reader.num_records_produced()
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    queued_length = queue.size()
+    key, value = reader.read(queue)
 
-      queue.enqueue_many([["X", "Y", "Z"]]).run()
-      self._ExpectRead(sess, key, value, b"X")
-      self.assertLess(0, queued_length.eval())
-      self.assertAllEqual(1, produced.eval())
+    self.evaluate(queue.enqueue_many([["X", "Y", "Z"]]))
+    self._ExpectRead(key, value, b"X")
+    self.assertLess(0, self.evaluate(queued_length))
+    self.assertAllEqual(1, self.evaluate(produced))
 
-      self._ExpectRead(sess, key, value, b"Y")
-      self.assertLess(0, work_completed.eval())
-      self.assertAllEqual(2, produced.eval())
+    self._ExpectRead(key, value, b"Y")
+    self.assertLess(0, self.evaluate(work_completed))
+    self.assertAllEqual(2, self.evaluate(produced))
 
-      reader.reset().run()
-      self.assertAllEqual(0, work_completed.eval())
-      self.assertAllEqual(0, produced.eval())
-      self.assertAllEqual(1, queued_length.eval())
-      self._ExpectRead(sess, key, value, b"Z")
+    self.evaluate(reader.reset())
+    self.assertAllEqual(0, self.evaluate(work_completed))
+    self.assertAllEqual(0, self.evaluate(produced))
+    self.assertAllEqual(1, self.evaluate(queued_length))
+    self._ExpectRead(key, value, b"Z")
 
-      queue.enqueue_many([["K", "L"]]).run()
-      self._ExpectRead(sess, key, value, b"K")
+    self.evaluate(queue.enqueue_many([["K", "L"]]))
+    self._ExpectRead(key, value, b"K")
 
 
 class WholeFileReaderTest(test.TestCase):
@@ -301,44 +302,44 @@ class WholeFileReaderTest(test.TestCase):
       os.remove(fn)
     super(WholeFileReaderTest, self).tearDown()
 
-  def _ExpectRead(self, sess, key, value, index):
-    k, v = sess.run([key, value])
+  def _ExpectRead(self, key, value, index):
+    k, v = self.evaluate([key, value])
     self.assertAllEqual(compat.as_bytes(self._filenames[index]), k)
     self.assertAllEqual(self._content[index], v)
 
+  @test_util.run_deprecated_v1
   def testOneEpoch(self):
-    with self.cached_session() as sess:
-      reader = io_ops.WholeFileReader("test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      queue.enqueue_many([self._filenames]).run()
-      queue.close().run()
-      key, value = reader.read(queue)
+    reader = io_ops.WholeFileReader("test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    self.evaluate(queue.enqueue_many([self._filenames]))
+    self.evaluate(queue.close())
+    key, value = reader.read(queue)
 
-      self._ExpectRead(sess, key, value, 0)
-      self._ExpectRead(sess, key, value, 1)
-      self._ExpectRead(sess, key, value, 2)
+    self._ExpectRead(key, value, 0)
+    self._ExpectRead(key, value, 1)
+    self._ExpectRead(key, value, 2)
 
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        sess.run([key, value])
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      self.evaluate([key, value])
 
+  @test_util.run_deprecated_v1
   def testInfiniteEpochs(self):
-    with self.cached_session() as sess:
-      reader = io_ops.WholeFileReader("test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      enqueue = queue.enqueue_many([self._filenames])
-      key, value = reader.read(queue)
-
-      enqueue.run()
-      self._ExpectRead(sess, key, value, 0)
-      self._ExpectRead(sess, key, value, 1)
-      enqueue.run()
-      self._ExpectRead(sess, key, value, 2)
-      self._ExpectRead(sess, key, value, 0)
-      self._ExpectRead(sess, key, value, 1)
-      enqueue.run()
-      self._ExpectRead(sess, key, value, 2)
-      self._ExpectRead(sess, key, value, 0)
+    reader = io_ops.WholeFileReader("test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    enqueue = queue.enqueue_many([self._filenames])
+    key, value = reader.read(queue)
+
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, 0)
+    self._ExpectRead(key, value, 1)
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, 2)
+    self._ExpectRead(key, value, 0)
+    self._ExpectRead(key, value, 1)
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, 2)
+    self._ExpectRead(key, value, 0)
 
 
 class TextLineReaderTest(test.TestCase):
@@ -366,47 +367,48 @@ class TextLineReaderTest(test.TestCase):
     return filenames
 
   def _testOneEpoch(self, files):
-    with self.cached_session() as sess:
-      reader = io_ops.TextLineReader(name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
+    reader = io_ops.TextLineReader(name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
 
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_lines):
-          k, v = sess.run([key, value])
-          self.assertAllEqual("%s:%d" % (files[i], j + 1), compat.as_text(k))
-          self.assertAllEqual(self._LineText(i, j), v)
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_lines):
+        k, v = self.evaluate([key, value])
+        self.assertAllEqual("%s:%d" % (files[i], j + 1), compat.as_text(k))
+        self.assertAllEqual(self._LineText(i, j), v)
 
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
 
+  @test_util.run_deprecated_v1
   def testOneEpochLF(self):
     self._testOneEpoch(self._CreateFiles(crlf=False))
 
+  @test_util.run_deprecated_v1
   def testOneEpochCRLF(self):
     self._testOneEpoch(self._CreateFiles(crlf=True))
 
+  @test_util.run_deprecated_v1
   def testSkipHeaderLines(self):
     files = self._CreateFiles()
-    with self.cached_session() as sess:
-      reader = io_ops.TextLineReader(skip_header_lines=1, name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
+    reader = io_ops.TextLineReader(skip_header_lines=1, name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
 
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_lines - 1):
-          k, v = sess.run([key, value])
-          self.assertAllEqual("%s:%d" % (files[i], j + 2), compat.as_text(k))
-          self.assertAllEqual(self._LineText(i, j + 1), v)
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_lines - 1):
+        k, v = self.evaluate([key, value])
+        self.assertAllEqual("%s:%d" % (files[i], j + 2), compat.as_text(k))
+        self.assertAllEqual(self._LineText(i, j + 1), v)
 
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
 
 
 class FixedLengthRecordReaderTest(TFCompressionTestCase):
@@ -522,56 +524,55 @@ class FixedLengthRecordReaderTest(TFCompressionTestCase):
   # gap_bytes=hop_bytes-record_bytes
   def _TestOneEpoch(self, files, num_records, gap_bytes, encoding=None):
     hop_bytes = 0 if gap_bytes == 0 else self._record_bytes + gap_bytes
-    with self.cached_session() as sess:
-      reader = io_ops.FixedLengthRecordReader(
-          header_bytes=self._header_bytes,
-          record_bytes=self._record_bytes,
-          footer_bytes=self._footer_bytes,
-          hop_bytes=hop_bytes,
-          encoding=encoding,
-          name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(num_records):
-          k, v = sess.run([key, value])
-          self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
-          self.assertAllEqual(self._Record(i, j), v)
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    reader = io_ops.FixedLengthRecordReader(
+        header_bytes=self._header_bytes,
+        record_bytes=self._record_bytes,
+        footer_bytes=self._footer_bytes,
+        hop_bytes=hop_bytes,
+        encoding=encoding,
+        name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(num_records):
+        k, v = self.evaluate([key, value])
+        self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
+        self.assertAllEqual(self._Record(i, j), v)
+
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
 
   def _TestOneEpochWithHopBytes(self,
                                 files,
                                 num_overlapped_records,
                                 encoding=None):
-    with self.cached_session() as sess:
-      reader = io_ops.FixedLengthRecordReader(
-          header_bytes=self._header_bytes,
-          record_bytes=self._record_bytes,
-          footer_bytes=self._footer_bytes,
-          hop_bytes=self._hop_bytes,
-          encoding=encoding,
-          name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(num_overlapped_records):
-          k, v = sess.run([key, value])
-          self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
-          self.assertAllEqual(self._OverlappedRecord(i, j), v)
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    reader = io_ops.FixedLengthRecordReader(
+        header_bytes=self._header_bytes,
+        record_bytes=self._record_bytes,
+        footer_bytes=self._footer_bytes,
+        hop_bytes=self._hop_bytes,
+        encoding=encoding,
+        name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(num_overlapped_records):
+        k, v = self.evaluate([key, value])
+        self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
+        self.assertAllEqual(self._OverlappedRecord(i, j), v)
+
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
 
+  @test_util.run_deprecated_v1
   def testOneEpoch(self):
     for num_records in [0, 7]:
       # gap_bytes=0: hop_bytes=0
@@ -580,6 +581,7 @@ class FixedLengthRecordReaderTest(TFCompressionTestCase):
         files = self._CreateFiles(num_records, gap_bytes)
         self._TestOneEpoch(files, num_records, gap_bytes)
 
+  @test_util.run_deprecated_v1
   def testGzipOneEpoch(self):
     for num_records in [0, 7]:
       # gap_bytes=0: hop_bytes=0
@@ -588,6 +590,7 @@ class FixedLengthRecordReaderTest(TFCompressionTestCase):
         files = self._CreateGzipFiles(num_records, gap_bytes)
         self._TestOneEpoch(files, num_records, gap_bytes, encoding="GZIP")
 
+  @test_util.run_deprecated_v1
   def testZlibOneEpoch(self):
     for num_records in [0, 7]:
       # gap_bytes=0: hop_bytes=0
@@ -596,17 +599,20 @@ class FixedLengthRecordReaderTest(TFCompressionTestCase):
         files = self._CreateZlibFiles(num_records, gap_bytes)
         self._TestOneEpoch(files, num_records, gap_bytes, encoding="ZLIB")
 
+  @test_util.run_deprecated_v1
   def testOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
       files = self._CreateOverlappedRecordFiles(num_overlapped_records)
       self._TestOneEpochWithHopBytes(files, num_overlapped_records)
 
+  @test_util.run_deprecated_v1
   def testGzipOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
       files = self._CreateGzipOverlappedRecordFiles(num_overlapped_records,)
       self._TestOneEpochWithHopBytes(
           files, num_overlapped_records, encoding="GZIP")
 
+  @test_util.run_deprecated_v1
   def testZlibOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
       files = self._CreateZlibOverlappedRecordFiles(num_overlapped_records)
@@ -619,95 +625,96 @@ class TFRecordReaderTest(TFCompressionTestCase):
   def setUp(self):
     super(TFRecordReaderTest, self).setUp()
 
+  @test_util.run_deprecated_v1
   def testOneEpoch(self):
     files = self._CreateFiles()
-    with self.cached_session() as sess:
-      reader = io_ops.TFRecordReader(name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_records):
-          k, v = sess.run([key, value])
-          self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
-          self.assertAllEqual(self._Record(i, j), v)
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    reader = io_ops.TFRecordReader(name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_records):
+        k, v = self.evaluate([key, value])
+        self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
+        self.assertAllEqual(self._Record(i, j), v)
 
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
+
+  @test_util.run_deprecated_v1
   def testReadUpTo(self):
     files = self._CreateFiles()
-    with self.cached_session() as sess:
-      reader = io_ops.TFRecordReader(name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      batch_size = 3
-      key, value = reader.read_up_to(queue, batch_size)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      num_k = 0
-      num_v = 0
-
-      while True:
-        try:
-          k, v = sess.run([key, value])
-          # Test reading *up to* batch_size records
-          self.assertLessEqual(len(k), batch_size)
-          self.assertLessEqual(len(v), batch_size)
-          num_k += len(k)
-          num_v += len(v)
-        except errors_impl.OutOfRangeError:
-          break
-
-      # Test that we have read everything
-      self.assertEqual(self._num_files * self._num_records, num_k)
-      self.assertEqual(self._num_files * self._num_records, num_v)
-
+    reader = io_ops.TFRecordReader(name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    batch_size = 3
+    key, value = reader.read_up_to(queue, batch_size)
+
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    num_k = 0
+    num_v = 0
+
+    while True:
+      try:
+        k, v = self.evaluate([key, value])
+        # Test reading *up to* batch_size records
+        self.assertLessEqual(len(k), batch_size)
+        self.assertLessEqual(len(v), batch_size)
+        num_k += len(k)
+        num_v += len(v)
+      except errors_impl.OutOfRangeError:
+        break
+
+    # Test that we have read everything
+    self.assertEqual(self._num_files * self._num_records, num_k)
+    self.assertEqual(self._num_files * self._num_records, num_v)
+
+  @test_util.run_deprecated_v1
   def testReadZlibFiles(self):
     options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
     files = self._CreateFiles(options)
 
-    with self.cached_session() as sess:
-      reader = io_ops.TFRecordReader(name="test_reader", options=options)
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_records):
-          k, v = sess.run([key, value])
-          self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
-          self.assertAllEqual(self._Record(i, j), v)
+    reader = io_ops.TFRecordReader(name="test_reader", options=options)
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
 
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_records):
+        k, v = self.evaluate([key, value])
+        self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
+        self.assertAllEqual(self._Record(i, j), v)
+
+  @test_util.run_deprecated_v1
   def testReadGzipFiles(self):
     options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
     files = self._CreateFiles(options)
 
-    with self.cached_session() as sess:
-      reader = io_ops.TFRecordReader(name="test_reader", options=options)
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
+    reader = io_ops.TFRecordReader(name="test_reader", options=options)
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
 
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_records):
-          k, v = sess.run([key, value])
-          self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
-          self.assertAllEqual(self._Record(i, j), v)
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_records):
+        k, v = self.evaluate([key, value])
+        self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
+        self.assertAllEqual(self._Record(i, j), v)
 
 
 class AsyncReaderTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoDeadlockFromQueue(self):
     """Tests that reading does not block main execution threads."""
     config = config_pb2.ConfigProto(
         inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
-    with self.test_session(config=config) as sess:
+    with self.session(config=config) as sess:
       thread_data_t = collections.namedtuple("thread_data_t",
                                              ["thread", "queue", "output"])
       thread_data = []
@@ -724,7 +731,7 @@ class AsyncReaderTest(test.TestCase):
         thread_data.append(thread_data_t(t, queue, output))
 
       # Start all readers. They are all blocked waiting for queue entries.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for d in thread_data:
         d.thread.start()
 
@@ -733,7 +740,7 @@ class AsyncReaderTest(test.TestCase):
         fname = os.path.join(self.get_temp_dir(), "deadlock.%s.txt" % i)
         with open(fname, "wb") as f:
           f.write(("file-%s" % i).encode())
-        d.queue.enqueue_many([[fname]]).run()
+        self.evaluate(d.queue.enqueue_many([[fname]]))
         d.thread.join()
         self.assertEqual([[("file-%s" % i).encode()]], d.output)
 
@@ -751,24 +758,25 @@ class LMDBReaderTest(test.TestCase):
     self.db_path = os.path.join(self.get_temp_dir(), "data.mdb")
     shutil.copy(path, self.db_path)
 
+  @test_util.run_deprecated_v1
   def testReadFromFile(self):
-    with self.cached_session() as sess:
-      reader = io_ops.LMDBReader(name="test_read_from_file")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue([self.db_path]).run()
-      queue.close().run()
-      for i in range(10):
-        k, v = sess.run([key, value])
-        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
-        self.assertAllEqual(
-            compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
-
+    reader = io_ops.LMDBReader(name="test_read_from_file")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue([self.db_path]))
+    self.evaluate(queue.close())
+    for i in range(10):
+      k, v = self.evaluate([key, value])
+      self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+      self.assertAllEqual(
+          compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
+
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
+
+  @test_util.run_deprecated_v1
   def testReadFromSameFile(self):
     with self.cached_session() as sess:
       reader1 = io_ops.LMDBReader(name="test_read_from_same_file1")
@@ -782,30 +790,31 @@ class LMDBReaderTest(test.TestCase):
       threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
       for _ in range(3):
         for _ in range(10):
-          k1, v1, k2, v2 = sess.run([key1, value1, key2, value2])
+          k1, v1, k2, v2 = self.evaluate([key1, value1, key2, value2])
           self.assertAllEqual(compat.as_bytes(k1), compat.as_bytes(k2))
           self.assertAllEqual(compat.as_bytes(v1), compat.as_bytes(v2))
       coord.request_stop()
       coord.join(threads)
 
+  @test_util.run_deprecated_v1
   def testReadFromFolder(self):
-    with self.cached_session() as sess:
-      reader = io_ops.LMDBReader(name="test_read_from_folder")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue([self.db_path]).run()
-      queue.close().run()
-      for i in range(10):
-        k, v = sess.run([key, value])
-        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
-        self.assertAllEqual(
-            compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
-
+    reader = io_ops.LMDBReader(name="test_read_from_folder")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue([self.db_path]))
+    self.evaluate(queue.close())
+    for i in range(10):
+      k, v = self.evaluate([key, value])
+      self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+      self.assertAllEqual(
+          compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
+
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
+
+  @test_util.run_deprecated_v1
   def testReadFromFileRepeatedly(self):
     with self.cached_session() as sess:
       reader = io_ops.LMDBReader(name="test_read_from_file_repeated")
@@ -819,7 +828,7 @@ class LMDBReaderTest(test.TestCase):
       for _ in range(3):
         # Go over all 10 records each time.
         for j in range(10):
-          k, v = sess.run([key, value])
+          k, v = self.evaluate([key, value])
           self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(j)))
           self.assertAllEqual(
               compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + j))))
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
index ebb9872f226f35c4642f99c8aa161845657e4a73..ad8188b372fc5e4ac627098cbbfd8fac73359272 100644
--- a/tensorflow/python/kernel_tests/record_input_test.py
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import os
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.errors_impl import NotFoundError
 from tensorflow.python.lib.io import tf_record
 from tensorflow.python.ops import data_flow_ops
@@ -54,7 +55,7 @@ class RecordInputOpTest(test.TestCase):
           batch_size=1,
           name="record_input").get_yield_op()
 
-      self.assertEqual(sess.run(yield_op), b"0000000000")
+      self.assertEqual(self.evaluate(yield_op), b"0000000000")
 
   def testRecordInputSimpleGzip(self):
     with self.cached_session() as sess:
@@ -73,7 +74,7 @@ class RecordInputOpTest(test.TestCase):
           compression_type=tf_record.TFRecordCompressionType.GZIP).get_yield_op(
           )
 
-      self.assertEqual(sess.run(yield_op), b"0000000000")
+      self.assertEqual(self.evaluate(yield_op), b"0000000000")
 
   def testRecordInputSimpleZlib(self):
     with self.cached_session() as sess:
@@ -92,8 +93,9 @@ class RecordInputOpTest(test.TestCase):
           compression_type=tf_record.TFRecordCompressionType.ZLIB).get_yield_op(
           )
 
-      self.assertEqual(sess.run(yield_op), b"0000000000")
+      self.assertEqual(self.evaluate(yield_op), b"0000000000")
 
+  @test_util.run_deprecated_v1
   def testRecordInputEpochs(self):
     files = 100
     records_per_file = 100
@@ -117,7 +119,7 @@ class RecordInputOpTest(test.TestCase):
       for _ in range(3):
         epoch_set = set()
         for _ in range(int(files * records_per_file / batches)):
-          op_list = sess.run(yield_op)
+          op_list = self.evaluate(yield_op)
           self.assertTrue(len(op_list) is batches)
           for r in op_list:
             self.assertTrue(r[0] not in epoch_set)
@@ -138,16 +140,18 @@ class RecordInputOpTest(test.TestCase):
 
         yield_op = records.get_yield_op()
         for _ in range(50):
-          sess.run(yield_op)
+          self.evaluate(yield_op)
 
+  @test_util.run_deprecated_v1
   def testEmptyGlob(self):
     with self.cached_session() as sess:
       record_input = data_flow_ops.RecordInput(file_pattern="foo")
       yield_op = record_input.get_yield_op()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       with self.assertRaises(NotFoundError):
-        sess.run(yield_op)
+        self.evaluate(yield_op)
 
+  @test_util.run_deprecated_v1
   def testBufferTooSmall(self):
     files = 10
     records_per_file = 10
@@ -171,7 +175,7 @@ class RecordInputOpTest(test.TestCase):
       for _ in range(3):
         epoch_set = set()
         for _ in range(int(files * records_per_file / batches)):
-          op_list = sess.run(yield_op)
+          op_list = self.evaluate(yield_op)
           self.assertTrue(len(op_list) is batches)
           for r in op_list:
             self.assertTrue(r[0] not in epoch_set)
diff --git a/tensorflow/python/kernel_tests/reduce_benchmark_test.py b/tensorflow/python/kernel_tests/reduce_benchmark_test.py
index 3a2fb81157d923f39daf77ab23f0f20162e592e7..ef9c4c350fdeafd3ea872fc648f13e1fb246a513 100644
--- a/tensorflow/python/kernel_tests/reduce_benchmark_test.py
+++ b/tensorflow/python/kernel_tests/reduce_benchmark_test.py
@@ -81,7 +81,7 @@ class ReduceBenchmarks(test.Benchmark):
       grad, = gradients_impl.gradients(reduction, tensor)
 
       def fn():
-        sess.run(grad.op)
+        self.evaluate(grad.op)
 
       self._run(fn, 10000)
 
@@ -98,7 +98,7 @@ class ReduceBenchmarks(test.Benchmark):
         grad, = gradients_impl.gradients(reduction, tensor)
 
       def fn():
-        sess.run(grad.op)
+        self.evaluate(grad.op)
 
       self._run(fn, 10000)
 
diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py
index 3bb4986313db74ba439991566ab2947722ab890d..49b6620779e13caa635af493914f13a65a6e0257 100644
--- a/tensorflow/python/kernel_tests/reduce_join_op_test.py
+++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py
@@ -25,6 +25,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -119,7 +120,7 @@ class ReduceJoinTest(UnicodeTestCase):
           axis=axis,
           keep_dims=keep_dims,
           separator=separator)
-      output_array = output.eval()
+      output_array = self.evaluate(output)
 
     self.assertAllEqualUnicode(truth, output_array)
     self.assertAllEqual(truth_shape, output.get_shape())
@@ -149,10 +150,10 @@ class ReduceJoinTest(UnicodeTestCase):
       if not axis:
         truth = constant_op.constant(truth)
       truth_squeezed = array_ops.squeeze(truth, axis=axis)
-      output_array = output.eval()
-      output_keep_dims_array = output_keep_dims.eval()
-      truth_array = truth.eval()
-      truth_squeezed_array = truth_squeezed.eval()
+      output_array = self.evaluate(output)
+      output_keep_dims_array = self.evaluate(output_keep_dims)
+      truth_array = self.evaluate(truth)
+      truth_squeezed_array = self.evaluate(truth_squeezed)
     self.assertAllEqualUnicode(truth_array, output_keep_dims_array)
     self.assertAllEqualUnicode(truth_squeezed_array, output_array)
     self.assertAllEqual(truth.get_shape(), output_keep_dims.get_shape())
@@ -230,6 +231,7 @@ class ReduceJoinTest(UnicodeTestCase):
         axis=1,
         separator="  ")
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     input_array = [["a"], ["b"]]
     truth = ["ab"]
@@ -241,6 +243,7 @@ class ReduceJoinTest(UnicodeTestCase):
       self.assertAllEqualUnicode(truth, output_array)
       self.assertAllEqual(truth_shape, reduced.get_shape())
 
+  @test_util.run_deprecated_v1
   def testUnknownIndices(self):
     input_array = [["this", "is", "a", "test"],
                    ["please", "do", "not", "panic"]]
@@ -297,6 +300,7 @@ class ReduceJoinTest(UnicodeTestCase):
       for permutation in itertools.permutations(xrange(num_dims), i):
         self._testMultipleReduceJoin(input_array, axis=permutation)
 
+  @test_util.run_deprecated_v1
   def testInvalidReductionIndices(self):
     with self.cached_session():
       with self.assertRaisesRegexp(ValueError, "Invalid reduction dim"):
@@ -318,13 +322,14 @@ class ReduceJoinTest(UnicodeTestCase):
 
       # Reduction that drops the dim of size 0.
       output = string_ops.reduce_join(inputs=inputs, axis=0)
-      self.assertAllEqualUnicode([""], output.eval())
+      self.assertAllEqualUnicode([""], self.evaluate(output))
 
       # Reduction that keeps the dim of size 0.
       output = string_ops.reduce_join(inputs=inputs, axis=1)
-      output_shape = output.eval().shape
+      output_shape = self.evaluate(output).shape
       self.assertAllEqual([0], output_shape)
 
+  @test_util.run_deprecated_v1
   def testInvalidArgsUnknownShape(self):
     with self.cached_session():
       placeholder = array_ops.placeholder(dtypes.string, name="placeholder")
@@ -335,6 +340,7 @@ class ReduceJoinTest(UnicodeTestCase):
       with self.assertRaisesOpError("Duplicate reduction dimension 1"):
         duplicate_index.eval(feed_dict={placeholder.name: [[""]]})
 
+  @test_util.run_deprecated_v1
   def testInvalidArgsUnknownIndices(self):
     with self.cached_session():
       placeholder = array_ops.placeholder(dtypes.int32, name="placeholder")
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 248036a82ac7b54550937c8580c4c58370005ae1..67a89461f3a885056f47c62af40bf6cfccd60583 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
@@ -60,6 +61,7 @@ class ReducedShapeTest(test.TestCase):
     output = math_ops.reduced_shape(shape, axes=axes)
     self.assertAllEqual(output.eval(), result)
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with self.cached_session():
       self._check([3], [], [3])
@@ -69,6 +71,7 @@ class ReducedShapeTest(test.TestCase):
       self._check([5, 3], [1], [5, 1])
       self._check([5, 3], [0, 1], [1, 1])
 
+  @test_util.run_deprecated_v1
   def testZeros(self):
     """Check that reduced_shape does the right thing with zero dimensions."""
     with self.cached_session():
@@ -83,6 +86,7 @@ class ReducedShapeTest(test.TestCase):
       self._check([3, 0], [1], [3, 1])
       self._check([3, 0], [0, 1], [1, 1])
 
+  @test_util.run_deprecated_v1
   def testNegAxes(self):
     with self.cached_session():
       self._check([10, 10, 10], [-1], [10, 10, 1])
@@ -94,6 +98,7 @@ class ReducedShapeTest(test.TestCase):
 
 class ReductionUnknownShape(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.cached_session():
       for dtype, reductions in [(dtypes.float32,
@@ -131,7 +136,7 @@ class BaseReductionTest(test.TestCase):
 
   def _compare(self, x, reduction_axes, keepdims, feed_dict=None):
     np_ans = self._np_reduce(x, reduction_axes, keepdims)
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       tf_ans = self._tf_reduce(x, reduction_axes, keepdims)
       out = sess.run(tf_ans, feed_dict)
     self.assertAllClose(np_ans, out)
@@ -153,7 +158,7 @@ class BaseReductionTest(test.TestCase):
     if reduction_axes is not None and np.shape(reduction_axes) == (1,):
       # Test scalar reduction_axes argument
       self._compareGradient(x, reduction_axes[0], rtol=rtol, atol=atol)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       t = ops.convert_to_tensor(x)
       su = self._tf_reduce(t, reduction_axes, False)
       jacob_t, jacob_n = gradient_checker.compute_gradient(
@@ -183,11 +188,12 @@ class SumReductionTest(BaseReductionTest):
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_sum([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -195,11 +201,13 @@ class SumReductionTest(BaseReductionTest):
           np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
           self._compareAll(np_arr, None)
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.int32)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat16(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float16)
@@ -216,9 +224,10 @@ class SumReductionTest(BaseReductionTest):
       tf_arr = variables.Variable(arr)
       variables.global_variables_initializer().run()
       tf_mean = math_ops.reduce_mean(tf_arr, 0, False)
-      tf_out_mean = sess.run(tf_mean)
+      tf_out_mean = self.evaluate(tf_mean)
     self.assertAllClose(tf_out_mean, 1.)
 
+  @test_util.run_deprecated_v1
   def testFloat32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
@@ -238,7 +247,7 @@ class SumReductionTest(BaseReductionTest):
       with self.session(graph=ops.Graph(), use_gpu=True) as sess:
         tf_row_sum = self._tf_reduce(arr, 1, False)
         tf_col_sum = self._tf_reduce(arr, 0, False)
-        tf_out_row, tf_out_col = sess.run([tf_row_sum, tf_col_sum])
+        tf_out_row, tf_out_col = self.evaluate([tf_row_sum, tf_col_sum])
       self.assertAllClose(col_sum, tf_out_col)
       self.assertAllClose(row_sum, tf_out_row)
 
@@ -252,25 +261,29 @@ class SumReductionTest(BaseReductionTest):
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
             tf_sum_xz = self._tf_reduce(arr, [0, 2], False)
             tf_sum_y = self._tf_reduce(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            tf_out_sum_xz, tf_out_sum_y = self.evaluate([tf_sum_xz, tf_sum_y])
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
+  @test_util.run_deprecated_v1
   def testFloat64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex128(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testInvalidIndex(self):
     np_arr = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
     input_tensor = ops.convert_to_tensor(np_arr)
@@ -284,6 +297,7 @@ class SumReductionTest(BaseReductionTest):
         ValueError, lambda e: "Invalid reduction dimension" in str(e)):
       math_ops.reduce_sum(input_tensor, [0, 2])
 
+  @test_util.run_deprecated_v1
   def testPartialShapes(self):
     np.random.seed(1618)
 
@@ -298,10 +312,10 @@ class SumReductionTest(BaseReductionTest):
 
     # Input shape only has known rank.
     c_known_rank = array_ops.placeholder(dtypes.float32)
-    c_known_rank.set_shape(tensor_shape.unknown_shape(ndims=3))
+    c_known_rank.set_shape(tensor_shape.unknown_shape(rank=3))
     s_known_rank = math_ops.reduce_sum(
         c_known_rank, reduction_axes, keepdims=True)
-    self.assertEqual(3, s_known_rank.get_shape().ndims)
+    self.assertEqual(3, s_known_rank.get_shape().rank)
 
     np_input = np.random.randn(3, 3, 3)
     self._compareAll(np_input, reduction_axes, {c_known_rank: np_input})
@@ -315,8 +329,9 @@ class SumReductionTest(BaseReductionTest):
                      s_unknown_indices.get_shape())
     s_unknown_indices_keep = math_ops.reduce_sum(
         c_unknown_indices, unknown_indices, keepdims=True)
-    self.assertEqual(2, s_unknown_indices_keep.get_shape().ndims)
+    self.assertEqual(2, s_unknown_indices_keep.get_shape().rank)
 
+  @test_util.run_deprecated_v1
   def testWrongShapeForReductionIndices(self):
     reduction_axes = [[1], [2]]
     c_unknown = array_ops.placeholder(dtypes.float32)
@@ -326,6 +341,7 @@ class SumReductionTest(BaseReductionTest):
 
   # Int64??
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     for dtype in [
         dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128
@@ -333,6 +349,7 @@ class SumReductionTest(BaseReductionTest):
       x = self._makeIncremental([2, 3, 4, 2], dtype)
       self._compareGradientAxes(x)
 
+  @test_util.run_deprecated_v1
   def testHighRank(self):
     # Do a bunch of random high dimensional reductions
     np.random.seed(42)
@@ -350,20 +367,23 @@ class SumReductionTest(BaseReductionTest):
                    np.arange(1, rank, 2)):
         self._compareAll(data, axes)
 
+  @test_util.run_deprecated_v1
   def testExpand(self):
     # Reduce an empty tensor to a nonempty tensor
     x = np.zeros((5, 0))
     self._compareAll(x, [1])
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = array_ops.zeros([0, 3])
       y = math_ops.reduce_sum(x, [1])
       error = gradient_checker.compute_gradient_error(x, [0, 3], y, [0])
       self.assertEqual(error, 0)
 
+  @test_util.run_deprecated_v1
   def testDegenerate(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
                     dtypes.complex64, dtypes.complex128):
         # A large number is needed to get Eigen to die
@@ -398,11 +418,12 @@ class MeanReductionTest(BaseReductionTest):
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_mean([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -410,46 +431,54 @@ class MeanReductionTest(BaseReductionTest):
           np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
           self._compareAll(np_arr, None)
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.int32)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex128(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     s = [2, 3, 4, 2]
     for dtype in [dtypes.float32, dtypes.float64]:
       x = self._makeIncremental(s, dtype)
       self._compareGradientAxes(x, rtol=1e-3, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = array_ops.zeros([0, 3])
       y = math_ops.reduce_mean(x, [1])
       error = gradient_checker.compute_gradient_error(x, [0, 3], y, [0])
       self.assertEqual(error, 0)
 
+  @test_util.run_deprecated_v1
   def testDegenerate(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
         # A large number is needed to get Eigen to die
         x = array_ops.zeros((0, 9938), dtype=dtype)
@@ -471,11 +500,12 @@ class ProdReductionTest(BaseReductionTest):
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_prod([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -483,6 +513,7 @@ class ProdReductionTest(BaseReductionTest):
           np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
           self._compareAll(np_arr, None)
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
     # Numpy automatically upgrades the type of np.prod from int32 to int64, so
     # Numpy does not overflow an int32 np.prod while TensorFlow does. To avoid
@@ -491,26 +522,31 @@ class ProdReductionTest(BaseReductionTest):
       np_arr = self._makeIncremental((2,) * rank, dtypes.int32) / 2
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex128(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testGradientWithZeros(self):
     s = [2, 3, 4, 2]
     x = self._makeIncremental(s, dtypes.float32) / 20.
@@ -533,15 +569,17 @@ class ProdReductionTest(BaseReductionTest):
     x4[:, :, :, :] = 0
     self._compareGradientAxes(x4, rtol=1e-3, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = array_ops.zeros([0, 3])
       y = math_ops.reduce_prod(x, [1])
       error = gradient_checker.compute_gradient_error(x, [0, 3], y, [0])
       self.assertEqual(error, 0)
 
+  @test_util.run_deprecated_v1
   def testDegenerate(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
         # A large number is needed to get Eigen to die
         x = array_ops.zeros((0, 9938), dtype=dtype)
@@ -558,11 +596,11 @@ class MinReductionTest(test.TestCase):
     else:
       for ra in reduction_axes[::-1]:
         np_ans = np.amin(np_ans, axis=ra, keepdims=keepdims)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_min(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -574,11 +612,12 @@ class MinReductionTest(test.TestCase):
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_min([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -614,6 +653,7 @@ class MinReductionTest(test.TestCase):
     self._compareAll(np_arr, [0, 2])
     self._compareAll(np_arr, [0, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     s = [2, 3, 4, 2]
     x = np.arange(1.0, 49.0).reshape(s).astype(np.float64)
@@ -624,6 +664,7 @@ class MinReductionTest(test.TestCase):
           t, s, su, [2, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient2(self):
     s = [2, 3, 4, 2]
     x = np.arange(1.0, 49.0).reshape(s).astype(np.float64)
@@ -634,6 +675,7 @@ class MinReductionTest(test.TestCase):
           t, s, su, [2, 4, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient3(self):
     s = [2, 3, 4, 2]
     x = np.arange(1.0, 49.0).reshape(s).astype(np.float64)
@@ -644,6 +686,7 @@ class MinReductionTest(test.TestCase):
           t, s, su, [2, 3, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient4(self):
     s = [2, 3, 4, 2]
     x = np.arange(1.0, 49.0).reshape(s).astype(np.float64)
@@ -654,6 +697,7 @@ class MinReductionTest(test.TestCase):
           t, s, su, [1], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
     with self.cached_session():
       x = array_ops.zeros([0, 3])
@@ -671,11 +715,11 @@ class MaxReductionTest(test.TestCase):
     else:
       for ra in reduction_axes[::-1]:
         np_ans = np.amax(np_ans, axis=ra, keepdims=keepdims)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_max(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -687,11 +731,12 @@ class MaxReductionTest(test.TestCase):
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_max([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -741,6 +786,7 @@ class MaxReductionTest(test.TestCase):
     self._compareAll(np_arr, [0, 2])
     self._compareAll(np_arr, [0, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     s = [2, 3, 4, 2]
     x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64)
@@ -751,6 +797,7 @@ class MaxReductionTest(test.TestCase):
           t, s, su, [2, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient2(self):
     s = [2, 3, 4, 2]
     x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64)
@@ -761,6 +808,7 @@ class MaxReductionTest(test.TestCase):
           t, s, su, [2, 4, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient3(self):
     s = [2, 3, 4, 2]
     x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64)
@@ -771,6 +819,7 @@ class MaxReductionTest(test.TestCase):
           t, s, su, [2, 3, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient4(self):
     s = [2, 3, 4, 2]
     x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64)
@@ -781,6 +830,7 @@ class MaxReductionTest(test.TestCase):
           t, s, su, [1], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
     with self.cached_session():
       x = array_ops.zeros([0, 3])
@@ -798,11 +848,11 @@ class AllReductionTest(test.TestCase):
     else:
       for ra in reduction_axes[::-1]:
         np_ans = np.all(np_ans, axis=ra, keepdims=keepdims)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_all(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -814,10 +864,10 @@ class AllReductionTest(test.TestCase):
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         v = math_ops.reduce_all([True, True],
                                 constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, True)
 
   def testAll3D(self):
@@ -847,11 +897,11 @@ class AnyReductionTest(test.TestCase):
     else:
       for ra in reduction_axes[::-1]:
         np_ans = np.any(np_ans, axis=ra, keepdims=keepdims)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_any(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -863,10 +913,10 @@ class AnyReductionTest(test.TestCase):
 
   def testAxesType(self):
     for dtype in [dtypes.int64, dtypes.int32]:
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         v = math_ops.reduce_any([True, True],
                                 constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, True)
 
   def testAll3D(self):
@@ -898,7 +948,7 @@ class CountNonzeroReductionTest(test.TestCase):
       reduction_axes = np.array(reduction_axes).astype(np.int32)
       for ra in reduction_axes.ravel()[::-1]:
         np_ans = np.sum(np_ans, axis=ra, keepdims=keepdims)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       tf_ans = math_ops.count_nonzero(x, reduction_axes, keepdims)
       out = sess.run(tf_ans, feed_dict)
     self.assertAllClose(np_ans, out)
@@ -913,6 +963,7 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compare(x, reduction_axes, True, use_gpu=True, feed_dict=feed_dict)
     self._compare(x, reduction_axes, True, use_gpu=False, feed_dict=feed_dict)
 
+  @test_util.run_deprecated_v1
   def testBoolReduce1D(self):
     # Create a 1D array of floats
     np_arr = np.asarray([False, False, True, False, False, True])
@@ -920,11 +971,13 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compareAll(np_arr, [])
     self._compareAll(np_arr, [0])
 
+  @test_util.run_deprecated_v1
   def testFloatReduce1D(self):
     # Create a 1D array of floats
     np_arr = np.asarray([0.0, 1.0, -1.0, 0.0, 0.0, 3.0]).astype(np.float32)
     self._compareAll(np_arr, [0])
 
+  @test_util.run_deprecated_v1
   def testFloatReduce4D(self):
     # Create a 4D array of floats and reduce across some
     # dimensions
@@ -944,14 +997,16 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compareAll(np_arr, [1, 2, 3])
     self._compareAll(np_arr, [0, 1, 2, 3])
 
+  @test_util.run_deprecated_v1
   def testExpand(self):
     # Reduce an empty tensor to a nonempty tensor
     x = np.zeros((5, 0))
     self._compareAll(x, [1])
 
+  @test_util.run_deprecated_v1
   def testDegenerate(self):
     for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         for dtype in (dtypes.bool,):
           # A large number is needed to get Eigen to die
           x = array_ops.zeros((0, 9938), dtype=dtype)
@@ -962,8 +1017,9 @@ class CountNonzeroReductionTest(test.TestCase):
     # Test case for GitHub issue 18712
     with self.cached_session() as sess:
       v = math_ops.count_nonzero(constant_op.constant(["test"]))
-      self.assertAllClose(sess.run(v), 1)
+      self.assertAllClose(self.evaluate(v), 1)
 
+  @test_util.run_deprecated_v1
   def testStringReduce1D(self):
     # Create a 1D array of strings
     x = np.asarray(["", "", "a", "", "", "b"])
@@ -974,6 +1030,7 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compare(x, [], keepdims=True, zero=np.str(""))
     self._compare(x, [0], keepdims=True, zero=np.str(""))
 
+  @test_util.run_deprecated_v1
   def testStringReduce2D(self):
     # Create a 2D array of strings
     x = np.asarray([["", "", "a", "", "", "b"],
diff --git a/tensorflow/python/kernel_tests/regex_full_match_op_test.py b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
index 98746e7d9b19e5ba52a73b7ca3d9967cc813c133..488ec85ab2cae79d23c0434b075edaaee6869da6 100644
--- a/tensorflow/python/kernel_tests/regex_full_match_op_test.py
+++ b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
@@ -23,6 +23,7 @@ from absl.testing import parameterized
 from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -33,6 +34,7 @@ from tensorflow.python.platform import test
     (gen_string_ops.static_regex_full_match))
 class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRegexFullMatch(self, op):
     values = ["abaaba", "abcdabcde"]
     with self.cached_session():
@@ -40,6 +42,7 @@ class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
       matched = op(input_tensor, "a.*a").eval()
       self.assertAllEqual([True, False], matched)
 
+  @test_util.run_deprecated_v1
   def testRegexFullMatchTwoDims(self, op):
     values = [["abaaba", "abcdabcde"], ["acdcba", "ebcda"]]
     with self.cached_session():
@@ -47,6 +50,7 @@ class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
       matched = op(input_tensor, "a.*a").eval()
       self.assertAllEqual([[True, False], [True, False]], matched)
 
+  @test_util.run_deprecated_v1
   def testEmptyMatch(self, op):
     values = ["abc", "1"]
     with self.cached_session():
@@ -54,6 +58,7 @@ class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
       matched = op(input_tensor, "").eval()
       self.assertAllEqual([False, False], matched)
 
+  @test_util.run_deprecated_v1
   def testInvalidPattern(self, op):
     values = ["abc", "1"]
     with self.cached_session():
@@ -61,11 +66,12 @@ class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
       invalid_pattern = "A["
       matched = op(input_tensor, invalid_pattern)
       with self.assertRaisesOpError("Invalid pattern"):
-        matched.eval()
+        self.evaluate(matched)
 
 
 class RegexFullMatchOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRegexFullMatchDelegation(self):
     with compat.forward_compatibility_horizon(2018, 11, 1):
       with self.cached_session():
@@ -78,6 +84,7 @@ class RegexFullMatchOpTest(test.TestCase):
         op_tensor = string_ops.regex_full_match(input_tensor, pattern_tensor)
         self.assertTrue(op_tensor.name.startswith("RegexFullMatch"), op.name)
 
+  @test_util.run_deprecated_v1
   def testStaticRegexFullMatchDelegation(self):
     with compat.forward_compatibility_horizon(2018, 11, 20):
       with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/regex_replace_op_test.py b/tensorflow/python/kernel_tests/regex_replace_op_test.py
index d9b7ed28d21652e964977c1938cd5d2cefb17825..6c7dfee7b401ee317d77367538a5fb41bc62d540 100644
--- a/tensorflow/python/kernel_tests/regex_replace_op_test.py
+++ b/tensorflow/python/kernel_tests/regex_replace_op_test.py
@@ -22,6 +22,7 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -32,6 +33,7 @@ from tensorflow.python.platform import test
     (gen_string_ops.static_regex_replace))
 class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testForwarding(self, op):
     with self.cached_session():
       # Generate an input that is uniquely consumed by the regex op.
@@ -45,6 +47,7 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       stripped = op(inp, "\\p{Ll}", ".").eval()
       self.assertAllEqual([b"A.C.E", b"H.J.L"], stripped)
 
+  @test_util.run_deprecated_v1
   def testRemovePrefix(self, op):
     values = ["a:foo", "a:bar", "a:foo", "b:baz", "b:qux", "ca:b"]
     with self.cached_session():
@@ -53,6 +56,7 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       self.assertAllEqual([b"foo", b"bar", b"foo", b"baz", b"qux", b"ca:b"],
                           stripped)
 
+  @test_util.run_deprecated_v1
   def testRegexReplace(self, op):
     values = ["aba\naba", "abcdabcde"]
     with self.cached_session():
@@ -60,6 +64,7 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       stripped = op(input_vector, "a.*a", "(\\0)").eval()
       self.assertAllEqual([b"(aba)\n(aba)", b"(abcda)bcde"], stripped)
 
+  @test_util.run_deprecated_v1
   def testEmptyMatch(self, op):
     values = ["abc", "1"]
     with self.cached_session():
@@ -67,6 +72,7 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       stripped = op(input_vector, "", "x").eval()
       self.assertAllEqual([b"xaxbxcx", b"x1x"], stripped)
 
+  @test_util.run_deprecated_v1
   def testInvalidPattern(self, op):
     values = ["abc", "1"]
     with self.cached_session():
@@ -74,8 +80,9 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       invalid_pattern = "A["
       replace = op(input_vector, invalid_pattern, "x")
       with self.assertRaisesOpError("Invalid pattern"):
-        replace.eval()
+        self.evaluate(replace)
 
+  @test_util.run_deprecated_v1
   def testGlobal(self, op):
     values = ["ababababab", "abcabcabc", ""]
     with self.cached_session():
@@ -98,6 +105,7 @@ class RegexReplaceTest(test.TestCase, parameterized.TestCase):
       (as_string, as_tensor),
       (as_tensor, as_string),
       (as_tensor, as_tensor))
+  @test_util.run_deprecated_v1
   def testRegexReplaceDelegation(self, pattern_fn, rewrite_fn):
     with self.cached_session():
       input_vector = constant_op.constant("foo", dtypes.string)
@@ -106,6 +114,7 @@ class RegexReplaceTest(test.TestCase, parameterized.TestCase):
       op = string_ops.regex_replace(input_vector, pattern, replace)
       self.assertTrue(op.name.startswith("RegexReplace"))
 
+  @test_util.run_deprecated_v1
   def testStaticRegexReplaceDelegation(self):
     with self.cached_session():
       input_vector = constant_op.constant("foo", dtypes.string)
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index a45a325b4757112e46bf44f59727b21582bdfd21..d4ba1ad77d5547ccb9fe4e2154d145751cf63514 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -21,12 +21,15 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python import tf2
+from tensorflow.python.compat import compat
+from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
@@ -54,162 +57,163 @@ class ReluTest(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
                                                      0.9]])))
 
-  def _testRelu(self, np_features, use_gpu=False):
+  def _testRelu(self, np_features):
     np_relu = self._npRelu(np_features)
-    with self.test_session(use_gpu=use_gpu):
-      relu = nn_ops.relu(np_features)
-      tf_relu = relu.eval()
+    tf_relu = nn_ops.relu(np_features)
     self.assertAllClose(np_relu, tf_relu)
-    self.assertShapeEqual(np_relu, relu)
+    self.assertShapeEqual(np_relu, tf_relu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testRelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      if t in [np.float16, np.float32, np.float64]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testRelu(
-            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            use_gpu=True)
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
-  def _testReluInt8x4(self, np_inputs):
-    if not test.is_gpu_available(cuda_only=True):
-      return
-    np_relu = self._npRelu(np_inputs)
-    with self.test_session(use_gpu=True):
-      relu = nn_ops.relu(constant_op.constant(np_inputs, dtypes.qint8))
-      if np_inputs.size % 4 == 0:
-        tf_relu = relu.eval()
-        self.assertAllClose(np_relu, tf_relu)
-        self.assertShapeEqual(np_relu, relu)
-      else:
-        with self.assertRaisesRegexp(
-            errors.InvalidArgumentError,
-            "Tensor size must be a multiple of 4 for Relu<qint8>. Got %d" %
-            np_inputs.size):
-          tf_relu = relu.eval()
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testRelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testReluInt8x4GoodShape(self):
-    self._testReluInt8x4(np.array([[-50, 7, 23, 0], [-1, -5, 6, 11]]))
+    if not test.is_gpu_available(cuda_only=True):
+      self.skipTest("No GPU available")
+    inputs = np.array([[-50, 7, 23, 0], [-1, -5, 6, 11]])
+    np_relu = self._npRelu(inputs)
+    tf_relu = nn_ops.relu(constant_op.constant(inputs, dtypes.qint8))
+    self.assertAllClose(np_relu, tf_relu)
+    self.assertShapeEqual(np_relu, tf_relu)
 
   def testReluInt8x4BadShape(self):
-    np_inputs = np.array([[-50, 7, 23], [0, 1, -5], [6, -2, 11]])
-    self.assertEqual(np_inputs.size, 9)
-    self._testReluInt8x4(np_inputs)
-    np_inputs = np.array(
-        [1, -2, 3, -4, 5, -6, 7, -8, 9, -8, 7, -6, 5, -4, 3, -2, 1])
-    self.assertEqual(np_inputs.size, 17)
-    self._testReluInt8x4(np_inputs)
+    if not test.is_gpu_available(cuda_only=True):
+      self.skipTest("No GPU available")
+    inputs = constant_op.constant(
+        np.array([[-50, 7, 23], [0, 1, -5], [6, -2, 11]]), dtypes.qint8)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Tensor size must be a multiple of 4 for Relu<qint8>. Got 9"):
+      self.evaluate(nn_ops.relu(inputs))
+
+    inputs = constant_op.constant(
+        np.array([1, -2, 3, -4, 5, -6, 7, -8, 9, -8, 7, -6, 5, -4, 3, -2, 1]),
+        dtypes.qint8)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Tensor size must be a multiple of 4 for Relu<qint8>. Got 17"):
+      self.evaluate(nn_ops.relu(inputs))
 
   # The gradient test for ReLU is a bit tricky as the derivative is not well
   # defined at around zero and we want to avoid that in terms of input values.
   def testGradientFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu, [x]))
     print("relu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   # The gradient for fp16 is inaccurate due to the low-precision.
-  # Instead of relying on compute_gradient_error, we compare the fp16 analytical
-  # gradient against their fp32 counterpart.
+  # We compare the fp16 analytical gradient against their fp32 counterpart.
   def testGradientFloat16(self):
-    with self.test_session(use_gpu=True) as sess:
-      # Randomly construct a 1D shape from [1, 40)
-      shape = random_ops.random_uniform(
-          [1], minval=1, maxval=40, dtype=dtypes.int32)
-
-      # Construct the fp32 graph and its gradient.
-      x = random_ops.random_uniform(shape, minval=-1, maxval=1, name="x")
-      y1 = nn_ops.relu(x, name="relu_fp32")
-      l1 = nn_ops.l2_loss(y1)
-      dx_f32 = gradients_impl.gradients(l1, x)
-
-      # Construct the fp16 graph and its gradient.
-      # It starts with the same x, in fp32. But before it reaches Relu, it is
-      # cast into fp16. So during backprop, the gradient computation is in fp16.
-      x2 = math_ops.cast(x, dtype=dtypes.float16, name="cast")
-      y2 = nn_ops.relu(x2, name="relu_fp16")
-      l2 = nn_ops.l2_loss(y2)
-      dx_f16 = gradients_impl.gradients(l2, x)
-
-      # Repeat the experiment for 100 times. All tensor shapes and its tensor
-      # values are randomly generated for each run.
-      for _ in xrange(100):
-        dx_f32_v, dx_f16_v = sess.run([dx_f32, dx_f16])
-        self.assertAllClose(dx_f32_v, dx_f16_v, atol=3e-4)
+
+    def grad(x):
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = nn_ops.l2_loss(nn_ops.relu(x))
+      return tape.gradient(y, x)
+
+    def f():
+      with test_util.use_gpu():
+        # Randomly construct a 1D shape from [1, 40)
+        shape = random_ops.random_uniform([1],
+                                          minval=1,
+                                          maxval=40,
+                                          dtype=dtypes.int32)
+        x32 = random_ops.random_uniform(shape, minval=-1, maxval=1)
+        x16 = math_ops.cast(x32, dtype=dtypes.float16)
+        return grad(x32), grad(x16)
+
+    # We're going to ensure that the fp16 and fp32 gradients
+    # are "close" to each other for ~100 random values.
+    #
+    # In TensorFlow 1.x, invoking f() (without eager execution enabled)
+    # would construct a graph. Instead of construct a graph with O(100) nodes,
+    # we construct a single graph to be executed ~100 times in a Session.
+    if not tf2.enabled():
+      d32_tensor, d16_tensor = f()
+      with self.cached_session() as sess:
+        f = lambda: sess.run([d32_tensor, d16_tensor])
+
+    # Repeat the experiment for 100 times. All tensor shapes and its tensor
+    # values are randomly generated for each run.
+    for _ in xrange(100):
+      d32, d16 = f()
+      self.assertAllClose(d32, d16, atol=3e-4)
 
   def testGradientFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu, [x]))
     print("relu (float64) gradient err = ", err)
     self.assertLess(err, 1e-10)
 
   def testGradGradFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float32
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.relu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("relu (float32) gradient of gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float64
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.relu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("relu (float64) gradient of gradient err = ", err)
     self.assertLess(err, 1e-10)
 
   def testGradientScalar(self):
-    with self.cached_session() as sess:
-      x = variables.Variable(100.)
-      y = nn_ops.relu(x)
-      loss = y**2
-      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.25)
-      train_op = optimizer.minimize(loss)
-      sess.run(variables.global_variables_initializer())
-      sess.run(train_op)
-      self.assertAllClose(x.eval(), 50.0)
+    x = variables.Variable(100.)
+
+    def loss():
+      return nn_ops.relu(x)**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.25)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(optimizer.minimize(loss))
+    self.assertAllClose(x.read_value(), 50.0)
 
 
 class Relu6Test(test.TestCase):
@@ -227,61 +231,165 @@ class Relu6Test(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, 6.0], [0.1, -0.3, 6.5, -0.7,
                                                     0.9]])))
 
-  def _testRelu6(self, np_features, use_gpu=False):
+  def _testRelu6(self, np_features):
     np_relu6 = self._npRelu6(np_features)
-    with self.test_session(use_gpu=use_gpu):
-      relu6 = nn_ops.relu6(np_features)
-      tf_relu6 = relu6.eval()
+    tf_relu6 = nn_ops.relu6(np_features)
     self.assertAllClose(np_relu6, tf_relu6)
-    self.assertShapeEqual(np_relu6, relu6)
+    self.assertShapeEqual(np_relu6, tf_relu6)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testRelu6(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      if t in [np.float16, np.float, np.double]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testRelu6(
-            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            use_gpu=True)
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float, np.double]:
+      self._testRelu6(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   # The gradient test for ReLU6 is a bit tricky as the derivative is
   # not well defined at around zero and six and we want to avoid that
   # in terms of input values.
   def testGradientFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 6.1, 6.3, 6.5, 6.7, 6.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.relu6(x, name="relu6")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [6.1, 6.3, 6.5, 6.7, 6.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu6, [x]))
     print("relu6 (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradientFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 6.1, 6.3, 6.5, 6.7, 6.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.relu6(x, name="relu6")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [6.1, 6.3, 6.5, 6.7, 6.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu6, [x]))
     print("relu6 (float64) gradient err = ", err)
     self.assertLess(err, 1e-10)
 
 
+class LeakyReluTest(test.TestCase):
+
+  def _npLeakyRelu(self, np_features, alpha=0.1):
+    return np.maximum(np_features, alpha * np_features)
+
+  def testNpLeakyRelu(self):
+    self.assertAllClose(
+        np.array([[-0.09, 0.7, -0.05, 0.3, -0.01],
+                  [0.1, -0.03, 0.5, -0.07, 0.9]]),
+        self._npLeakyRelu(
+            np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
+                                                     0.9]]),
+            alpha=0.1))
+
+  def _testLeakyRelu(self, np_features, alpha):
+    np_leaky_relu = self._npLeakyRelu(np_features, alpha)
+    tf_leaky_relu = nn_ops.leaky_relu(np_features, alpha)
+    self.assertAllClose(np_leaky_relu, tf_leaky_relu)
+    self.assertShapeEqual(np_leaky_relu, tf_leaky_relu)
+
+  def testNumbersCPU(self):
+    for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
+        self._testLeakyRelu(
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
+            alpha=0.2)
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testLeakyRelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
+          alpha=0.1)
+
+  # The gradient test for Leaky ReLU is a bit tricky as the derivative is not
+  # well defined at around zero and we want to avoid that in terms of input
+  # values.
+  def testGradientFloat32(self):
+    with self.cached_session():
+      x = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float32,
+          order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.leaky_relu, [x]))
+    print("leaky_relu (float32) gradient err = ", err)
+    self.assertLess(err, 1e-4)
+
+  def testGradientFloat64(self):
+    with self.cached_session():
+      x = np.asarray(
+          [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+          dtype=np.float64,
+          order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.leaky_relu, [x]))
+    print("leaky_relu (float64) gradient err = ", err)
+    self.assertLess(err, 1e-10)
+
+  def testGradGradFloat32(self):
+    with compat.forward_compatibility_horizon(2018, 11, 2):
+      with self.cached_session():
+
+        def f(x):
+          assert x.dtype == dtypes.float32
+          with backprop.GradientTape() as tape:
+            tape.watch(x)
+            y = nn_ops.leaky_relu(x)
+          return tape.gradient(y, x)
+
+        x = np.asarray(
+            [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+            dtype=np.float32,
+            order="F")
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(f, [x]))
+      print("leaky_relu (float32) gradient of gradient err = ", err)
+      self.assertLess(err, 1e-4)
+
+  def testGradGradFloat64(self):
+    with compat.forward_compatibility_horizon(2018, 11, 2):
+      with self.cached_session():
+
+        def f(x):
+          assert x.dtype == dtypes.float64
+          with backprop.GradientTape() as tape:
+            tape.watch(x)
+            y = nn_ops.leaky_relu(x)
+          return tape.gradient(y, x)
+
+        x = np.asarray(
+            [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
+            dtype=np.float64,
+            order="F")
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(f, [x]))
+      print("leaky_relu (float64) gradient of gradient err = ", err)
+      self.assertLess(err, 1e-10)
+
+  def testGradientScalar(self):
+    x = variables.Variable(-100.)
+
+    def loss():
+      return nn_ops.leaky_relu(x, 0.05)**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.2)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(optimizer.minimize(loss))
+    self.assertAllClose(x.read_value(), -99.9)
+
+
 class EluTest(test.TestCase):
 
   def _npElu(self, np_features):
@@ -295,88 +403,94 @@ class EluTest(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
                                                      0.9]])))
 
-  def _testElu(self, np_features, use_gpu=False):
+  def _testElu(self, np_features):
     np_elu = self._npElu(np_features)
-    with self.test_session(use_gpu=use_gpu):
-      elu = nn_ops.elu(np_features)
-      tf_elu = elu.eval()
+    tf_elu = nn_ops.elu(np_features)
     self.assertAllClose(np_elu, tf_elu)
-    self.assertShapeEqual(np_elu, elu)
+    self.assertShapeEqual(np_elu, tf_elu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.float16, np.float32, np.float64]:
-      self._testElu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      self._testElu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=True)
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
+        self._testElu(
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testElu(np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testGradientFloat32(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, name="x")
-      y = nn_ops.elu(x, name="elu")
-      x_init = np.asarray(x_val, dtype=np.float32, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float32, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.elu, [x]))
     print("elu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradientFloat64(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, dtype=dtypes.float64, name="x")
-      y = nn_ops.elu(x, name="elu")
-      x_init = np.asarray(x_val, dtype=np.float64, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float64, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.elu, [x]))
     print("elu (float64) gradient err = ", err)
     self.assertLess(err, 1e-6)
 
   def testGradGrad(self):
     with self.cached_session():
-      x = array_ops.placeholder(dtype=dtypes.float32)
-      elu = nn_ops.elu(x)
-      g, = gradients_impl.gradients(elu, x)
-      gg, = gradients_impl.gradients(g, x)
 
-      for x_val in [-1, -0.5, 0.5, 1]:
-        err = np.abs(gg.eval(feed_dict={x: x_val}) - _elu_grad_grad(x_val))
+      def f(x):
+        with backprop.GradientTape(persistent=True) as tape:
+          tape.watch(x)
+          y = nn_ops.elu(x)
+          dy = tape.gradient(y, x)
+        return tape.gradient(dy, x)
+
+      for x in [-1., -0.5, 0.5, 1.]:
+        got = self.evaluate(f(constant_op.constant(x)))
+        want = _elu_grad_grad(x)
+        err = np.abs(got - want)
         self.assertLess(err, 1e-4)
 
   def testGradGradFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.elu(x, name="elu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float32
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.elu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("elu (float32) gradient of gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.elu(x, name="elu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float64
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.elu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("elu (float64) gradient of gradient err = ", err)
     self.assertLess(err, 1e-6)
 
@@ -397,77 +511,74 @@ class SeluTest(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
                                                      0.9]])))
 
-  def _testSelu(self, np_features, use_gpu=False):
+  def _testSelu(self, np_features):
     np_selu = self._npSelu(np_features)
-    with self.test_session(use_gpu=use_gpu):
-      selu = nn_ops.selu(np_features)
-      tf_selu = selu.eval()
+    tf_selu = nn_ops.selu(np_features)
     self.assertAllClose(np_selu, tf_selu)
-    self.assertShapeEqual(np_selu, selu)
+    self.assertShapeEqual(np_selu, tf_selu)
 
   def testNumbers(self):
     for t in [np.float16, np.float32, np.float64]:
       self._testSelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      self._testSelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=True)
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+      # Force executed on CPU in case GPU kernels are avaiable.
+      with ops.device("/device:CPU:0"):
+        self._testSelu(
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testGradientFloat32(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, name="x")
-      y = nn_ops.selu(x, name="selu")
-      x_init = np.asarray(x_val, dtype=np.float32, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float32, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.selu, [x]))
     print("selu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradientFloat64(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, dtype=dtypes.float64, name="x")
-      y = nn_ops.selu(x, name="selu")
-      x_init = np.asarray(x_val, dtype=np.float64, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float64, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.selu, [x]))
     print("selu (float64) gradient err = ", err)
     self.assertLess(err, 1e-6)
 
   def testGradGradFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.selu(x, name="selu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float32
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.selu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("selu (float32) gradient of gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.selu(x, name="selu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float64
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.selu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("selu (float64) gradient of gradient err = ", err)
     self.assertLess(err, 1e-6)
 
@@ -479,46 +590,44 @@ class CreluTest(test.TestCase):
     t = nn_ops.crelu(f)
     self.assertEqual([50, 5, 7, 20], t.get_shape())
 
-  def _testCrelu(self, np_features, use_gpu=False):
+  def _testCrelu(self, np_features):
     np_relu = np.maximum(np_features, np.zeros_like(np_features))
     np_neg_relu = np.maximum(-np_features, np.zeros_like(np_features))
     np_crelu = np.concatenate((np_relu, np_neg_relu),
                               len(np_features.shape) - 1)
 
-    with self.test_session(use_gpu=use_gpu):
-      crelu = nn_ops.crelu(np_features)
-      tf_relu = crelu.eval()
+    tf_crelu = nn_ops.crelu(np_features)
 
-    self.assertAllClose(np_crelu, tf_relu)
-    self.assertShapeEqual(np_crelu, crelu)
+    self.assertAllClose(np_crelu, tf_crelu)
+    self.assertShapeEqual(np_crelu, tf_crelu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testCrelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      if t in [np.float16, np.float32, np.float64]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testCrelu(
-            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            use_gpu=True)
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testCrelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testNumbersWithAxis0(self):
-    with self.cached_session():
-      crelu = nn_ops.crelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=0)
-      tf_relu = crelu.eval()
-      np_crelu = np.array([[0, 7, 0, 3, 0], [1, 0, 5, 0, 9], [9, 0, 5, 0, 1],
-                           [0, 3, 0, 7, 0]])
-      self.assertAllEqual(np_crelu, tf_relu)
+    tf_crelu = nn_ops.crelu(
+        np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=0)
+    np_crelu = np.array([[0, 7, 0, 3, 0], [1, 0, 5, 0, 9], [9, 0, 5, 0, 1],
+                         [0, 3, 0, 7, 0]])
+    self.assertAllEqual(np_crelu, tf_crelu)
 
   def testNumbersWithAxis1(self):
-    with self.cached_session():
-      crelu = nn_ops.crelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=1)
-      tf_relu = crelu.eval()
-      np_crelu = np.array([[0, 7, 0, 3, 0, 9, 0, 5, 0, 1],
-                           [1, 0, 5, 0, 9, 0, 3, 0, 7, 0]])
-      self.assertAllEqual(np_crelu, tf_relu)
+    tf_crelu = nn_ops.crelu(
+        np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=1)
+    np_crelu = np.array([[0, 7, 0, 3, 0, 9, 0, 5, 0, 1],
+                         [1, 0, 5, 0, 9, 0, 3, 0, 7, 0]])
+    self.assertAllEqual(np_crelu, tf_crelu)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/reshape_op_test.py b/tensorflow/python/kernel_tests/reshape_op_test.py
index ca3ff1d1df45dacbb0dfd52d8f3f0e36735e3068..db3e88a104f44fbea4df757a10203eea7ebcb278 100644
--- a/tensorflow/python/kernel_tests/reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/reshape_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -30,17 +31,17 @@ from tensorflow.python.platform import test
 class ReshapeTest(test.TestCase):
 
   def _testReshape(self, x, y, use_gpu=False):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       np_ans = x.reshape(y)
       tf_ans = array_ops.reshape(x, y)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertEqual(tf_ans.get_shape(), out.shape)
       self.assertShapeEqual(np_ans, tf_ans)
 
       # Repeat with an int64 shape tensor.
       y64 = constant_op.constant(y, dtype=dtypes.int64)
       tf_ans = array_ops.reshape(x, y64)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertEqual(tf_ans.get_shape(), out.shape)
       self.assertShapeEqual(np_ans, tf_ans)
 
@@ -91,6 +92,7 @@ class ReshapeTest(test.TestCase):
   # TODO(vrv): Add tests for failure conditions once python test_util
   # reports errors.
 
+  @test_util.run_deprecated_v1
   def testFloatReshapeGradThreeDimensions(self):
     x = np.arange(1., 25.).reshape([2, 3, 4]).astype(np.float32)
     s = list(np.shape(x))
@@ -111,6 +113,7 @@ class ReshapeTest(test.TestCase):
     self._testBothReshape(x, [0, 0, 0])
     self._testBothReshape(x, [1, -1, 5])
 
+  @test_util.run_deprecated_v1
   def testErrors(self):
     y = constant_op.constant(0.0, shape=[23, 29, 31])
     with self.assertRaisesRegexp(ValueError, "must be evenly divisible by 17"):
@@ -121,6 +124,7 @@ class ReshapeTest(test.TestCase):
                                  "Cannot reshape a tensor with 4096 elements"):
       array_ops.reshape(z, [4095])
 
+  @test_util.run_deprecated_v1
   def testPartialShapes(self):
     x = array_ops.placeholder(dtypes.float32)
 
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index a9fd93e9f8760f41e890cf07f57751deb1cad793..433957fd1d38890c0952c443097e4955e1eb99cb 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -29,9 +29,12 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -53,6 +56,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     # involving objects with __del__ defined.
     self.assertEqual(0, len(gc.garbage))
 
+  @test_util.run_deprecated_v1
   def testHandleDtypeShapeMatch(self):
     with self.cached_session():
       handle = resource_variable_ops.var_handle_op(dtype=dtypes.int32, shape=[])
@@ -122,6 +126,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       # values.
       self.assertFalse(np.allclose(variable.numpy(), copied_variable.numpy()))
 
+  @test_util.run_deprecated_v1
   def testGraphDeepCopy(self):
     with self.cached_session():
       init_value = np.ones((4, 4, 4))
@@ -137,6 +142,15 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.evaluate(v[0].assign(2.0))
     self.assertAllEqual(self.evaluate(v), [2.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes
+  def testVariableShape(self):
+    v = resource_variable_ops.ResourceVariable([1., 1.])
+    self.assertAllEqual(
+        tensor_util.constant_value(
+            resource_variable_ops.variable_shape(v.handle)),
+        [2])
+
+  @test_util.run_deprecated_v1
   def testDifferentAssignGraph(self):
     with ops.Graph().as_default():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -144,16 +158,18 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     v.assign(2.0)  # Note: this fails if we run convert_to_tensor on not the
     # variable graph.
 
+  @test_util.run_deprecated_v1
   def testFetchHandle(self):
     with self.cached_session():
       handle = resource_variable_ops.var_handle_op(
           dtype=dtypes.int32, shape=[1], name="foo")
       self.assertGreater(len(handle.eval()), 0)
 
+  @test_util.run_deprecated_v1
   def testCachedValueReadBeforeWrite(self):
     with self.cached_session() as sess:
       v = resource_variable_ops.ResourceVariable(0.0, caching_device="cpu:0")
-      sess.run(v.initializer)
+      self.evaluate(v.initializer)
       value, _ = sess.run([v, v.assign_add(1.0)])
       self.assertAllEqual(value, 0.0)
 
@@ -426,6 +442,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
     self.assertEqual(self.evaluate(read), [[6]])
 
+  @test_util.run_deprecated_v1
   def testScatterUpdateString(self):
     handle = resource_variable_ops.var_handle_op(
         dtype=dtypes.string, shape=[1, 1])
@@ -437,6 +454,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(compat.as_bytes(self.evaluate(read)[0][0]),
                      compat.as_bytes("b"))
 
+  @test_util.run_deprecated_v1
   def testScatterUpdateStringScalar(self):
     handle = resource_variable_ops.var_handle_op(
         dtype=dtypes.string, shape=[1, 1])
@@ -456,7 +474,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
 
   # TODO(alive): get this to work in Eager mode.
   def testGPU(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       abc = variable_scope.get_variable(
           "abc",
           shape=[1],
@@ -491,6 +509,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           initial_value=lambda: 1, constraint=constraint, name="var1")
 
   # TODO(alive): how should this work in Eager mode?
+  @test_util.run_deprecated_v1
   def testInitFn(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(
@@ -568,6 +587,48 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     v.load(2.0)
     self.assertEqual(2.0, self.evaluate(v.value()))
 
+  def testShapePassedToGradient(self):
+    with ops.Graph().as_default():
+      @custom_gradient.custom_gradient
+      def differentiable_scatter_update(handle, indices, values):
+        with ops.control_dependencies([
+            resource_variable_ops.resource_scatter_update(
+                handle, indices, values)]):
+          new_handle = array_ops.identity(handle)
+
+        def grad(dresult):
+          self.assertIsNotNone(
+              tensor_util.constant_value(dresult.dense_shape))
+          return [dresult, None, None]
+
+        return new_handle, grad
+
+      var = variable_scope.get_variable(
+          "foo", shape=[20], initializer=init_ops.zeros_initializer,
+          dtype=dtypes.float64, use_resource=True)
+
+      indices = math_ops.range(10)
+      updates = math_ops.range(9, -1, -1, dtype=dtypes.float64)
+      new_handle = differentiable_scatter_update(var.handle, indices, updates)
+      gathered = resource_variable_ops.resource_gather(
+          new_handle, indices, dtype=var.dtype)
+      gradients_impl.gradients([gathered], [updates])
+
+  def testToFromProtoCachedValue(self):
+    with ops.Graph().as_default():
+      v_def = resource_variable_ops.ResourceVariable(
+          initial_value=constant_op.constant(3.0)).to_proto()
+      v_prime = resource_variable_ops.ResourceVariable(variable_def=v_def)
+      self.assertTrue(getattr(v_prime, "_cached_value", None) is None)
+
+      other_v_def = resource_variable_ops.ResourceVariable(
+          caching_device="cpu:0",
+          initial_value=constant_op.constant(3.0)).to_proto()
+      other_v_prime = resource_variable_ops.ResourceVariable(
+          variable_def=other_v_def)
+      self.assertTrue(other_v_prime._cached_value is not None)
+
+  @test_util.run_v1_only("b/120545219")
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = resource_variable_ops.ResourceVariable(
@@ -576,11 +637,11 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default(), self.cached_session() as sess:
       # v describes a VariableDef-based variable without an initial value.
       v = resource_variable_ops.ResourceVariable(variable_def=v_def)
-      self.assertEqual(3.0, sess.run(v.initialized_value()))
+      self.assertEqual(3.0, self.evaluate(v.initialized_value()))
 
       # initialized_value should not rerun the initializer_op if the variable
       # has already been initialized elsewhere.
-      sess.run(v.assign(1.0))
+      self.evaluate(v.assign(1.0))
       self.assertEqual(1.0, v.initialized_value().eval())
 
     v_def.ClearField("initial_value_name")
@@ -592,7 +653,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       self.assertProtoEquals(v_def, v.to_proto())
       # But attempts to use initialized_value will result in errors.
       with self.assertRaises(ValueError):
-        sess.run(v.initialized_value())
+        self.evaluate(v.initialized_value())
 
   def testTrainableInProto(self):
     with ops.Graph().as_default():
@@ -623,6 +684,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     value = self.evaluate(v.sparse_read([0, 3, 1, 2]))
     self.assertAllEqual(init_value[[0, 3, 1, 2], ...], value)
 
+  @test_util.run_deprecated_v1
   def testToFromProto(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -671,6 +733,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(0.0, self.evaluate(v.value()))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testDestroyResource(self):
     v = resource_variable_ops.ResourceVariable(3.0, name="var0")
     self.evaluate(variables.global_variables_initializer())
@@ -684,6 +747,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.evaluate(resource_variable_ops.destroy_resource_op(
         handle, ignore_lookup_error=True))
 
+  @test_util.run_deprecated_v1
   def testAssignDifferentShapes(self):
     with self.cached_session() as sess, variable_scope.variable_scope(
         "foo", use_resource=True):
@@ -704,6 +768,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           assign = var.assign(np.zeros(shape=[2, 2]))
           self.evaluate(assign)
 
+  @test_util.run_deprecated_v1
   def testDtypeAfterFromProto(self):
     v = resource_variable_ops.ResourceVariable(2.0)
     w = resource_variable_ops.ResourceVariable.from_proto(v.to_proto())
@@ -711,6 +776,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(v.dtype, w.dtype)
 
   # TODO(alive): get caching to work in eager mode.
+  @test_util.run_deprecated_v1
   def testCachingDevice(self):
     with ops.device("/job:server/task:1"):
       v = resource_variable_ops.ResourceVariable(
@@ -726,6 +792,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         _ = w.value().op.get_attr("_class")
 
+  @test_util.run_v1_only("b/120545219")
   def testSharedName(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(300.0, name="var4")
@@ -736,7 +803,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           # Needed in Eager since we get a unique container name by default.
           container=ops.get_default_graph()._container)
       w_read = resource_variable_ops.read_variable_op(w, v.dtype.base_dtype)
-      self.assertEqual(300.0, w_read.eval())
+      self.assertEqual(300.0, self.evaluate(w_read))
 
       x = resource_variable_ops.var_handle_op(
           dtype=v.dtype.base_dtype, shape=v.get_shape(), shared_name="var5",
@@ -744,6 +811,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("Resource .*/var5/.* does not exist"):
         resource_variable_ops.read_variable_op(x, v.dtype.base_dtype).eval()
 
+  @test_util.run_deprecated_v1
   def testSharedNameWithNamescope(self):
     with self.cached_session():
       with ops.name_scope("foo"):
@@ -772,6 +840,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           "<unknown>",
           str(v.sparse_read(array_ops.placeholder(dtypes.int32)).shape))
 
+  @test_util.run_deprecated_v1
   def testSetInitialValue(self):
     with self.cached_session():
       # Initialize variable with a value different from the initial value passed
@@ -780,6 +849,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v.initializer.run(feed_dict={v.initial_value: 3.0})
       self.assertEqual(3.0, v.value().eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testControlFlowInitialization(self):
     """Expects an error if an initializer is in a control-flow scope."""
 
@@ -915,6 +985,18 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(Exception, r"hapes must be equal"):
       self.assertAllEqual(self.evaluate(v.assign_add(1)), [1, 2, 3, 4])
 
+  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
+  def testCopyToGraphUninitialized(self):
+    v = resource_variable_ops.ResourceVariable([0, 1, 2, 3])
+    copy_to_graph = ops.Graph()
+    with copy_to_graph.as_default():  # Intentionally testing v1 behavior
+      copied = resource_variable_ops.copy_to_graph_uninitialized(v)
+      self.assertEqual(v.name, copied.name)
+      with self.session(copy_to_graph) as session:
+        with self.assertRaises(errors.InvalidArgumentError):
+          session.run(copied.initializer)
+
 
 class _MixedPrecisionVariableTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py
index 8fc71e0c573685f89410cb4df474a00dde24bcf2..05307c9834ad2ab05bb5a2b557466255e92c6d1e 100644
--- a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py
+++ b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -38,16 +39,16 @@ class ReverseSequenceTest(test.TestCase):
                            truth,
                            use_gpu=False,
                            expected_err_re=None):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       ans = array_ops.reverse_sequence(
           x, batch_axis=batch_axis, seq_axis=seq_axis, seq_lengths=seq_lengths)
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         self.assertAllClose(tf_ans, truth, atol=1e-10)
         self.assertShapeEqual(truth, ans)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def _testBothReverseSequence(self,
                                x,
@@ -107,6 +108,7 @@ class ReverseSequenceTest(test.TestCase):
   def testComplex128Basic(self):
     self._testBasic(np.complex128)
 
+  @test_util.run_deprecated_v1
   def testFloatReverseSequenceGrad(self):
     x = np.asarray(
         [[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], [13, 14, 15, 16]],
@@ -133,6 +135,7 @@ class ReverseSequenceTest(test.TestCase):
     print("ReverseSequence gradient error = %g" % err)
     self.assertLess(err, 1e-8)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     t = array_ops.reverse_sequence(
         array_ops.placeholder(
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 2f6963f6b87f00f6ba2473ac50b449ef3441d68d..a49496e4ef15bc2772fe7abdac4d801b77787079 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -36,6 +36,8 @@ from tensorflow.python.framework import ops as ops_lib
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.engine import network as keras_network
+from tensorflow.python.layers import base as base_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -52,6 +54,7 @@ import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
 from tensorflow.python.training import training
+from tensorflow.python.util import nest
 
 
 class Plus1RNNCell(rnn_cell_impl.RNNCell):
@@ -127,6 +130,28 @@ class TensorArrayStateRNNCell(rnn_cell_impl.RNNCell):
     return (input_, (state[0] + 1, new_array))
 
 
+class KerasNetworkTFRNNs(keras_network.Network):
+
+  def __init__(self, name=None):
+    super(KerasNetworkTFRNNs, self).__init__(name=name)
+    self._cell = rnn_cell_impl.MultiRNNCell(
+        [rnn_cell_impl.LSTMCell(1) for _ in range(2)])
+
+  def call(self, inputs):
+    return self._cell(inputs, self._cell.get_initial_state(inputs))
+
+
+class KerasNetworkKerasRNNs(keras_network.Network):
+
+  def __init__(self, name=None):
+    super(KerasNetworkKerasRNNs, self).__init__(name=name)
+    self._cell = keras.layers.StackedRNNCells(
+        [keras.layers.LSTMCell(1) for _ in range(2)])
+
+  def call(self, inputs):
+    return self._cell(inputs, self._cell.get_initial_state(inputs))
+
+
 class RNNTest(test.TestCase):
 
   def setUp(self):
@@ -177,15 +202,15 @@ class RNNTest(test.TestCase):
       inputs = array_ops.placeholder(dtypes.float32, shape=(None, 4, 5))
       # - Without initial_state
       outputs, state = rnn.dynamic_rnn(cell, inputs, dtype=dtypes.float32)
-      self.assertEqual(None, outputs.shape[0].value)
-      self.assertEqual(None, state.shape[0].value)
+      self.assertEqual(None, outputs.shape.dims[0].value)
+      self.assertEqual(None, state.shape.dims[0].value)
       # - With initial_state
       outputs, state = rnn.dynamic_rnn(
           cell,
           inputs,
           initial_state=array_ops.placeholder(dtypes.float32, shape=(None, 5)))
-      self.assertEqual(None, outputs.shape[0].value)
-      self.assertEqual(None, state.shape[0].value)
+      self.assertEqual(None, outputs.shape.dims[0].value)
+      self.assertEqual(None, state.shape.dims[0].value)
 
   @test_util.run_in_graph_and_eager_modes
   def testScalarStateIsAccepted(self):
@@ -237,6 +262,7 @@ class RNNTest(test.TestCase):
       rnn.dynamic_rnn(cell, inputs, dtype=dtypes.float32, sequence_length=[4])
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArrayStateIsAccepted(self):
     cell = TensorArrayStateRNNCell()
     in_eager_mode = context.executing_eagerly()
@@ -260,6 +286,7 @@ class RNNTest(test.TestCase):
     self.assertAllEqual(4, state[0])
     self.assertAllEqual([[[1]], [[2]], [[3]], [[4]]], state[1])
 
+  @test_util.run_deprecated_v1
   def testCellGetInitialState(self):
     cell = rnn_cell_impl.BasicRNNCell(5)
     with self.assertRaisesRegexp(
@@ -320,6 +347,7 @@ class RNNTest(test.TestCase):
     self._assert_cell_builds(contrib_rnn.IndyLSTMCell, f32, 5, 7, 3)
     self._assert_cell_builds(contrib_rnn.IndyLSTMCell, f64, 5, 7, 3)
 
+  @test_util.run_deprecated_v1
   def testRNNWithKerasSimpleRNNCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -353,6 +381,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(len(outputs), batch)
       self.assertEqual(len(state), batch)
 
+  @test_util.run_deprecated_v1
   def testRNNWithKerasGRUCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -386,6 +415,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(len(outputs), batch)
       self.assertEqual(len(state), batch)
 
+  @test_util.run_deprecated_v1
   def testRNNWithKerasLSTMCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -423,6 +453,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(len(state[0]), batch)
       self.assertEqual(len(state[1]), batch)
 
+  @test_util.run_deprecated_v1
   def testRNNWithStackKerasCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -447,6 +478,8 @@ class RNNTest(test.TestCase):
       outputs, state = rnn.dynamic_rnn(
           cell, inputs, dtype=dtypes.float32)
       self.assertEqual(outputs.shape.as_list(), [None, timestep, output_shape])
+      self.assertEqual(len(state), 2)
+      state = nest.flatten(state)
       self.assertEqual(len(state), 4)
       self.assertEqual(state[0].shape.as_list(), [None, 2 * output_shape])
       self.assertEqual(state[1].shape.as_list(), [None, 2 * output_shape])
@@ -464,6 +497,7 @@ class RNNTest(test.TestCase):
       for s in state:
         self.assertEqual(len(s), batch)
 
+  @test_util.run_deprecated_v1
   def testStaticRNNWithKerasSimpleRNNCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -502,6 +536,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(len(outputs[0]), batch)
       self.assertEqual(len(state), batch)
 
+  @test_util.run_deprecated_v1
   def testKerasAndTFRNNLayerOutputComparison(self):
     input_shape = 10
     output_shape = 5
@@ -535,6 +570,7 @@ class RNNTest(test.TestCase):
     self.assertAllClose(tf_out, k_out)
     self.assertAllClose(tf_state, k_state)
 
+  @test_util.run_deprecated_v1
   def testSimpleRNNCellAndBasicRNNCellComparison(self):
     input_shape = 10
     output_shape = 5
@@ -554,7 +590,7 @@ class RNNTest(test.TestCase):
     kernel, recurrent_kernel, bias = keras_weights
     tf_weights = [np.concatenate((kernel, recurrent_kernel)), bias]
 
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape))
       cell = keras.layers.SimpleRNNCell(output_shape)
@@ -562,7 +598,7 @@ class RNNTest(test.TestCase):
           cell, inputs, dtype=dtypes.float32)
       cell.set_weights(keras_weights)
       [k_out, k_state] = sess.run([k_out, k_state], {inputs: x_train})
-    with self.test_session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops_lib.Graph()) as sess:
       inputs = array_ops.placeholder(
           dtypes.float32, shape=(None, timestep, input_shape))
       cell = rnn_cell_impl.BasicRNNCell(output_shape)
@@ -571,9 +607,10 @@ class RNNTest(test.TestCase):
       cell.set_weights(tf_weights)
       [tf_out, tf_state] = sess.run([tf_out, tf_state], {inputs: x_train})
 
-    self.assertAllClose(tf_out, k_out)
-    self.assertAllClose(tf_state, k_state)
+    self.assertAllClose(tf_out, k_out, atol=1e-5)
+    self.assertAllClose(tf_state, k_state, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testBasicLSTMCellInterchangeWithLSTMCell(self):
     with self.session(graph=ops_lib.Graph()) as sess:
       basic_cell = rnn_cell_impl.BasicLSTMCell(1)
@@ -635,6 +672,32 @@ class RNNTest(test.TestCase):
         y_np_2 = model.predict(x_np)
         self.assertAllClose(y_np, y_np_2, atol=1e-4)
 
+  def testRNNCellActsLikeKerasRNNCellInProperScope(self):
+    with base_layers.keras_style_scope():
+      kn1 = KerasNetworkTFRNNs(name="kn1")
+      kn2 = KerasNetworkKerasRNNs(name="kn2")
+
+    z = array_ops.zeros((2, 3))
+
+    kn1(z)
+    kn2(z)
+
+    # pylint: disable=protected-access
+    self.assertTrue(all("kn1" in v.name for v in kn1._cell.variables))
+    self.assertTrue(all("kn2" in v.name for v in kn2._cell.variables))
+
+    with base_layers.keras_style_scope():
+      kn1_new = KerasNetworkTFRNNs(name="kn1_new")
+      kn2_new = KerasNetworkKerasRNNs(name="kn2_new")
+
+    kn2_new(z)
+    # Most importantly, this doesn't fail due to variable scope reuse issues.
+    kn1_new(z)
+
+    self.assertTrue(all("kn1_new" in v.name for v in kn1_new._cell.variables))
+    self.assertTrue(all("kn2_new" in v.name for v in kn2_new._cell.variables))
+
+
 ######### Benchmarking RNN code
 
 
diff --git a/tensorflow/python/kernel_tests/save_restore_ops_test.py b/tensorflow/python/kernel_tests/save_restore_ops_test.py
index cb9aa1e34d6eb82efa94e60e7b56c26b181cef04..fecc9a3800fd85958d204144613a3f239ea43404 100644
--- a/tensorflow/python/kernel_tests/save_restore_ops_test.py
+++ b/tensorflow/python/kernel_tests/save_restore_ops_test.py
@@ -17,14 +17,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import test
 
 
+class SaveTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRelativePath(self):
+    os.chdir(self.get_temp_dir())
+    self.evaluate(io_ops.save_v2(
+        "ckpt", ["x"], [""], [constant_op.constant(100.)]))
+    self.assertAllEqual([100.],
+                        self.evaluate(io_ops.restore_v2(
+                            "ckpt", ["x"], [""], [dtypes.float32])))
+
+
 class ShardedFileOpsTest(test.TestCase):
 
   def testShardedFileName(self):
@@ -39,6 +55,7 @@ class ShardedFileOpsTest(test.TestCase):
 
 class ShapeInferenceTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRestoreV2WithSliceInput(self):
     op = io_ops.restore_v2("model", ["var1", "var2"], ["", "3 4 0,1:-"],
                            [dtypes.float32, dtypes.float32])
@@ -46,11 +63,13 @@ class ShapeInferenceTest(test.TestCase):
     self.assertFalse(op[0].get_shape().is_fully_defined())
     self.assertEqual([1, 4], op[1].get_shape())
 
+  @test_util.run_deprecated_v1
   def testRestoreV2NumSlicesNotMatch(self):
     with self.assertRaises(ValueError):
       io_ops.restore_v2("model", ["var1", "var2", "var3"], ["", "3 4 0,1:-"],
                         [dtypes.float32, dtypes.float32])
 
+  @test_util.run_deprecated_v1
   def testRestoreSlice(self):
     op = gen_io_ops.restore_slice("model", "var", "3 4 0,1:-", dtypes.float32)
     self.assertEqual([1, 4], op.get_shape())
diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py
index 08b4a2aaae2469a7fedb13d47493c02cf8306a9b..33e491fee1dadbcce225dfa70310d47a21b6893c 100644
--- a/tensorflow/python/kernel_tests/scan_ops_test.py
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -78,7 +79,7 @@ class CumsumTest(test.TestCase):
 
   def _compare(self, x, axis, exclusive, reverse):
     np_out = handle_options(np.cumsum, x, axis, exclusive, reverse)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_out = math_ops.cumsum(x, axis, exclusive, reverse).eval()
 
     self.assertAllClose(np_out, tf_out)
@@ -88,48 +89,60 @@ class CumsumTest(test.TestCase):
       for reverse in [True, False]:
         self._compare(x, axis, exclusive, reverse)
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     for dtype in self.valid_dtypes:
       x = np.zeros([0]).astype(dtype)
       for axis in (-1, 0):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def testAxisType(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
       for axis_dtype in [dtypes.int64, dtypes.int32]:
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           axis = constant_op.constant(0, axis_dtype)
           tf_out = math_ops.cumsum(x, axis).eval()
 
+  @test_util.run_deprecated_v1
   def test1D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
       for axis in (-1, 0):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test2D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(0, 10).reshape([2, 5]).astype(dtype)
       for axis in (-2, -1, 0, 1):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test3D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(0, 20).reshape([2, 2, 5]).astype(dtype)
       for axis in (-3, -2, -1, 0, 1, 2):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test6D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
       for axis in range(-6, 6, 3):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
+  def testLarge(self):
+    for dtype in self.valid_dtypes:
+      x = np.ones([1000000], dtype=dtype) / 1024
+      self._compareAll(x, 0)
+
   def testInvalidAxis(self):
     x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
     input_tensor = ops.convert_to_tensor(x)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
           lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
@@ -145,29 +158,34 @@ class CumsumTest(test.TestCase):
 
   def _compareGradient(self, shape, axis, exclusive, reverse):
     x = np.arange(0, 50).reshape(shape).astype(np.float64)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       t = ops.convert_to_tensor(x)
       result = math_ops.cumsum(t, axis, exclusive, reverse)
       jacob_t, jacob_n = gradient_checker.compute_gradient(
           t, shape, result, shape, x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     for axis in (-1, 0):
       self._compareGradient([50], axis, False, False)
 
+  @test_util.run_deprecated_v1
   def testGradientReverse(self):
     for axis in (-1, 0):
       self._compareGradient([50], axis, False, True)
 
+  @test_util.run_deprecated_v1
   def testGradientExclusive(self):
     for axis in (-1, 0):
       self._compareGradient([50], axis, True, False)
 
+  @test_util.run_deprecated_v1
   def testGradientExclusiveReverse(self):
     for axis in (-1, 0):
       self._compareGradient([50], axis, True, True)
 
+  @test_util.run_deprecated_v1
   def testGradient2D(self):
     for axis in (-1, 0, 1):
       for exclusive in [True, False]:
@@ -184,7 +202,7 @@ class CumprodTest(test.TestCase):
 
   def _compare(self, x, axis, exclusive, reverse):
     np_out = handle_options(np.cumprod, x, axis, exclusive, reverse)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_out = math_ops.cumprod(x, axis, exclusive, reverse).eval()
 
     self.assertAllClose(np_out, tf_out)
@@ -194,38 +212,44 @@ class CumprodTest(test.TestCase):
       for reverse in [True, False]:
         self._compare(x, axis, exclusive, reverse)
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     for dtype in self.valid_dtypes:
       x = np.zeros([0]).astype(dtype)
       for axis in (-1, 0):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def testAxisType(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
       for axis_dtype in [dtypes.int64, dtypes.int32]:
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           axis = constant_op.constant(0, axis_dtype)
           tf_out = math_ops.cumprod(x, axis).eval()
 
+  @test_util.run_deprecated_v1
   def test1D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
       for axis in (-1, 0):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test2D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 11).reshape([2, 5]).astype(dtype)
       for axis in (-2, -1, 0, 1):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test3D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 21).reshape([2, 2, 5]).astype(dtype)
       for axis in (-3, -2, -1, 0, 1, 2):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test6D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
@@ -235,7 +259,7 @@ class CumprodTest(test.TestCase):
   def testInvalidAxis(self):
     x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
     input_tensor = ops.convert_to_tensor(x)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
           lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
@@ -251,29 +275,34 @@ class CumprodTest(test.TestCase):
 
   def _compareGradient(self, shape, axis, exclusive, reverse):
     x = np.arange(1, 9).reshape(shape).astype(np.float64)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       t = ops.convert_to_tensor(x)
       result = math_ops.cumprod(t, axis, exclusive, reverse)
       jacob_t, jacob_n = gradient_checker.compute_gradient(
           t, shape, result, shape, x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     for axis in (-1, 0):
       self._compareGradient([8], axis, False, False)
 
+  @test_util.run_deprecated_v1
   def testGradientReverse(self):
     for axis in (-1, 0):
       self._compareGradient([8], axis, False, True)
 
+  @test_util.run_deprecated_v1
   def testGradientExclusive(self):
     for axis in (-1, 0):
       self._compareGradient([8], axis, True, False)
 
+  @test_util.run_deprecated_v1
   def testGradientExclusiveReverse(self):
     for axis in (-1, 0):
       self._compareGradient([8], axis, True, True)
 
+  @test_util.run_deprecated_v1
   def testGradient2D(self):
     for axis in (-2, -1, 0, 1):
       for exclusive in [True, False]:
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 4b92309e4df6c21f202fc4ed7ae80b12392e6206..8510a08f0c96dd9ae08a2ca3e782cc7d28e86264 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
@@ -36,6 +37,9 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+GRADIENT_TESTS_DTYPES = (dtypes.float16, dtypes.float32, dtypes.float64)
+
+
 def _AsType(v, vtype):
   return v.astype(vtype) if isinstance(v, np.ndarray) else vtype(v)
 
@@ -102,7 +106,7 @@ class StatefulScatterNdTest(test.TestCase):
     np.random.seed(8)
     ref_shapes = [(3, 6), (3, 6), (3, 6, 9), (3, 6, 9), (3, 6, 9), (3, 6, 9)]
     indices_shapes = [(2,), (2, 2), (2,), (2, 2), (2, 3), (2, 3, 3)]
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       for ref_shape, indices_shape in zip(ref_shapes, indices_shapes):
         num_updates = indices_shape[0]
         ixdim = indices_shape[-1]
@@ -141,12 +145,11 @@ class StatefulScatterNdTest(test.TestCase):
         tf_scatter(ref_var, indices, updates).eval()
 
         # Compare
-        self.assertAllClose(new, ref_var.eval())
+        self.assertAllClose(new, self.evaluate(ref_var))
 
   def _VariableRankTests(self, np_scatter, tf_scatter):
-    for vtype in (np.int32,
-                  np.float32, np.float64,
-                  np.complex64, np.complex128):
+    for vtype in (np.int32, np.float16, np.float32, np.float64, np.complex64,
+                  np.complex128):
       for itype in (np.int32, np.int64):
         self._VariableRankTest(np_scatter, tf_scatter, vtype, itype)
 
@@ -158,11 +161,12 @@ class StatefulScatterNdTest(test.TestCase):
     scatter = state_ops.scatter_nd_update(ref, indices, updates)
     init = variables.global_variables_initializer()
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(init)
-      result = sess.run(scatter)
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(init)
+      result = self.evaluate(scatter)
       self.assertAllClose(result, expected)
 
+  @test_util.run_deprecated_v1
   def testSimpleResource(self):
     indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
     updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
@@ -172,9 +176,9 @@ class StatefulScatterNdTest(test.TestCase):
     scatter = state_ops.scatter_nd_update(ref, indices, updates)
     init = variables.global_variables_initializer()
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(init)
-      sess.run(scatter)
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(init)
+      self.evaluate(scatter)
       self.assertAllClose(ref.eval(), expected)
 
   def testSimple2(self):
@@ -186,9 +190,9 @@ class StatefulScatterNdTest(test.TestCase):
     scatter = state_ops.scatter_nd_update(ref, indices, updates)
     init = variables.global_variables_initializer()
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(init)
-      result = sess.run(scatter)
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(init)
+      result = self.evaluate(scatter)
       self.assertAllClose(result, expected)
 
   def testSimple3(self):
@@ -200,17 +204,20 @@ class StatefulScatterNdTest(test.TestCase):
     scatter = state_ops.scatter_nd_update(ref, indices, updates)
     init = variables.global_variables_initializer()
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(init)
-      result = sess.run(scatter)
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(init)
+      result = self.evaluate(scatter)
       self.assertAllClose(result, expected)
 
+  @test_util.run_deprecated_v1
   def testVariableRankUpdate(self):
     self._VariableRankTests(_NumpyUpdate, state_ops.scatter_nd_update)
 
+  @test_util.run_deprecated_v1
   def testVariableRankAdd(self):
     self._VariableRankTests(_NumpyAdd, state_ops.scatter_nd_add)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableRankSub(self):
     self._VariableRankTests(_NumpySub, state_ops.scatter_nd_sub)
 
@@ -223,11 +230,12 @@ class StatefulScatterNdTest(test.TestCase):
   #   self._VariableRankTests(_NumpyDiv, state_ops.scatter_nd_div)
 
   def _ScatterRepeatIndicesTest(self, np_scatter, tf_scatter):
-    for vtype in (np.int32, np.float32, np.float64):
+    for vtype in (np.int32, np.float16, np.float32, np.float64):
       for itype in (np.int32, np.int64):
         self._VariableRankTest(
             np_scatter, tf_scatter, vtype, itype, repeat_indices=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testScatterRepeatIndices(self):
     """This tests scatter_add using indices that repeat."""
     self._ScatterRepeatIndicesTest(_NumpyAdd, state_ops.scatter_nd_add)
@@ -239,7 +247,7 @@ class StatefulScatterNdTest(test.TestCase):
   # TODO(simister): Re-enable once binary size increase due to
   # extra templating is back under control and this op is re-enabled
   # def testBooleanScatterUpdate(self):
-  #   with self.test_session(use_gpu=False) as session:
+  #   with self.session(use_gpu=False) as session:
   #     var = tf.Variable([True, False])
   #     update0 = tf.scatter_nd_update(var, [[1]], [True])
   #     update1 = tf.scatter_nd_update(
@@ -247,8 +255,9 @@ class StatefulScatterNdTest(test.TestCase):
   #             [[0]], dtype=tf.int64), [False])
   #     var.initializer.run()
   #     session.run([update0, update1])
-  #     self.assertAllEqual([False, True], var.eval())
+  #     self.assertAllEqual([False, True], self.evaluate(var))
 
+  @test_util.run_v1_only("b/120545219")
   def testScatterOutOfRangeCpu(self):
     # TODO(simister): Re-enable once binary size increase due to
     # scatter_nd ops is under control.
@@ -257,7 +266,7 @@ class StatefulScatterNdTest(test.TestCase):
                state_ops.scatter_nd_update):
       params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
       updates = np.array([-3, -4, -5]).astype(np.float32)
-      with self.test_session(use_gpu=False):
+      with self.cached_session(use_gpu=False):
         ref = variables.VariableV1(params)
         ref.initializer.run()
 
@@ -285,6 +294,17 @@ class StatefulScatterNdTest(test.TestCase):
         state_ops.scatter_nd_update(ref, indices,
                                     updates).get_shape().as_list(), shape)
 
+  @test_util.run_v1_only("b/120545219")
+  def testResVarInvalidOutputShape(self):
+    res = variables.Variable(
+        initial_value=lambda: array_ops.zeros(shape=[], dtype=dtypes.float32),
+        dtype=dtypes.float32)
+    with self.cached_session():
+      res.initializer.run()
+      with self.assertRaisesOpError("Output must be at least 1-D"):
+        state_ops.scatter_nd_update(res, [[0]], [0.22]).eval()
+
+  @test_util.run_deprecated_v1
   def testExtraIndicesDimensions(self):
     indices = array_ops.zeros([1, 1, 2], dtypes.int32)
     updates = array_ops.zeros([1, 1], dtypes.int32)
@@ -296,8 +316,9 @@ class StatefulScatterNdTest(test.TestCase):
     expected_result = np.zeros([2, 2], dtype=np.int32)
     with self.cached_session():
       ref.initializer.run()
-      self.assertAllEqual(expected_result, scatter_update.eval())
+      self.assertAllEqual(expected_result, self.evaluate(scatter_update))
 
+  @test_util.run_deprecated_v1
   def testRank3InvalidShape1(self):
     indices = array_ops.zeros([3, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
@@ -307,6 +328,7 @@ class StatefulScatterNdTest(test.TestCase):
         ValueError, "The outer \\d+ dimensions of indices\\.shape="):
       state_ops.scatter_nd_update(ref, indices, updates)
 
+  @test_util.run_deprecated_v1
   def testRank3InvalidShape2(self):
     indices = array_ops.zeros([2, 2, 1], dtypes.int32)
     updates = array_ops.zeros([2, 2], dtypes.int32)
@@ -316,6 +338,7 @@ class StatefulScatterNdTest(test.TestCase):
         ValueError, "The inner \\d+ dimensions of input\\.shape="):
       state_ops.scatter_nd_update(ref, indices, updates)
 
+  @test_util.run_deprecated_v1
   def testConcurrentUpdates(self):
     num_updates = 10000
     update_values = np.random.rand(num_updates)
@@ -330,8 +353,8 @@ class StatefulScatterNdTest(test.TestCase):
     init = variables.global_variables_initializer()
 
     with session.Session() as sess:
-      sess.run(init)
-      result = sess.run(scatter)
+      self.evaluate(init)
+      result = self.evaluate(scatter)
       assert np.allclose(result, expected_result)
 
   # TODO(fpmc): Re-enable this test when gpu_pip test actually runs on a GPU.
@@ -347,7 +370,7 @@ class StatefulScatterNdTest(test.TestCase):
       updates = np.array([-3, -4, -5]).astype(np.float32)
       # With GPU, the code ignores indices that are out of range.
       # We don't test the implementation; just test there's no failures.
-      with self.test_session(force_gpu=True):
+      with self.cached_session(force_gpu=True):
         ref = variables.Variable(params)
         ref.initializer.run()
 
@@ -410,7 +433,7 @@ class ScatterNdTest(test.TestCase):
                          b"", b"", b"seven"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.cached_session() as sess:
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by same value.
@@ -421,7 +444,7 @@ class ScatterNdTest(test.TestCase):
     expected = np.array([b"", b"", b"", b"bb", b"a", b"", b"", b"c"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.cached_session() as sess:
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by different value.
@@ -433,7 +456,7 @@ class ScatterNdTest(test.TestCase):
                 np.array([b"", b"", b"", b"cb", b"a", b"", b"", b"d"])]
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.cached_session() as sess:
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertTrue(np.array_equal(result, expected[0]) or
                       np.array_equal(result, expected[1]))
 
@@ -444,6 +467,7 @@ class ScatterNdTest(test.TestCase):
     self.assertAllEqual(
         self.scatter_nd(indices, updates, shape).get_shape().as_list(), shape)
 
+  @test_util.run_deprecated_v1
   def testExtraIndicesDimensions(self):
     indices = array_ops.zeros([1, 1, 2], dtypes.int32)
     updates = array_ops.zeros([1, 1], dtypes.int32)
@@ -452,26 +476,30 @@ class ScatterNdTest(test.TestCase):
     self.assertAllEqual(scatter.get_shape().as_list(), shape)
     expected_result = np.zeros([2, 2], dtype=np.int32)
     with self.cached_session():
-      self.assertAllEqual(expected_result, scatter.eval())
+      self.assertAllEqual(expected_result, self.evaluate(scatter))
 
+  @test_util.run_deprecated_v1
   def testUndefinedIndicesShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=None)
     updates = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     shape = constant_op.constant([2, 2, 2], dtypes.int32)
     self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testUndefinedUpdatesShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     updates = array_ops.placeholder(dtypes.int32, shape=None)
     shape = constant_op.constant([2, 2, 2], dtypes.int32)
     self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testUndefinedOutputShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     updates = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     shape = array_ops.placeholder(dtypes.int32, shape=[None])
     self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testEmptyOutputShape1(self):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
@@ -481,6 +509,7 @@ class ScatterNdTest(test.TestCase):
         ValueError, "Indices and updates specified for empty output shape"):
       self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testEmptyOutputShape2(self):
     indices = array_ops.placeholder(dtypes.int32, shape=None)
     updates = array_ops.placeholder(dtypes.int32, shape=None)
@@ -494,6 +523,7 @@ class ScatterNdTest(test.TestCase):
             updates: np.zeros([2, 2, 2], dtype=np.int32)
         })
 
+  @test_util.run_deprecated_v1
   def testEmptyOutputShape3(self):
     indices = array_ops.zeros([0], dtypes.int32)
     updates = array_ops.zeros([0], dtypes.int32)
@@ -503,6 +533,7 @@ class ScatterNdTest(test.TestCase):
     with self.cached_session():
       self.assertEqual(scatter.eval().size, 0)
 
+  @test_util.run_deprecated_v1
   def testRank3InvalidShape1(self):
     indices = array_ops.zeros([3, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
@@ -511,6 +542,7 @@ class ScatterNdTest(test.TestCase):
         ValueError, "The outer \\d+ dimensions of indices\\.shape="):
       self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testRank3InvalidShape2(self):
     indices = array_ops.zeros([2, 2, 1], dtypes.int32)
     updates = array_ops.zeros([2, 2], dtypes.int32)
@@ -519,99 +551,100 @@ class ScatterNdTest(test.TestCase):
         ValueError, "The inner \\d+ dimensions of (input|output)\\.shape="):
       self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testGradientsRank2ElementUpdate(self):
-    indices = constant_op.constant([[0, 0], [1, 1]], dtype=dtypes.int32)
-    updates = constant_op.constant([1, 4], dtype=dtypes.float64)
-    shape = constant_op.constant([2, 2], dtype=dtypes.int32)
-    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
-    outputs = self.scatter_nd(indices, updates, shape, input_)
-
-    grad_vals = constant_op.constant([[1, 2], [3, 4]], dtype=dtypes.float64)
-    updates_grad, input_grad = gradients_impl.gradients(
-        [outputs], [updates, input_], [grad_vals])
-    expected_updates_grad = np.array([1, 4], dtype=np.float64)
-    expected_input_grad = np.array([[1, 2], [3, 4]], dtype=np.float64)
-    with self.cached_session():
-      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
-      if self.non_aliasing_add_test:
-        self.assertAllEqual(expected_input_grad, input_grad.eval())
-
+    for dtype in GRADIENT_TESTS_DTYPES:
+      indices = constant_op.constant([[0, 0], [1, 1]], dtype=dtypes.int32)
+      updates = constant_op.constant([1, 4], dtype=dtype)
+      shape = constant_op.constant([2, 2], dtype=dtypes.int32)
+      input_ = array_ops.zeros(shape, dtype=dtype)
+      outputs = self.scatter_nd(indices, updates, shape, input_)
+
+      grad_vals = constant_op.constant([[1, 2], [3, 4]], dtype=dtype)
+      updates_grad, input_grad = gradients_impl.gradients(
+          [outputs], [updates, input_], [grad_vals])
+      expected_updates_grad = np.array([1, 4], dtype=dtype.as_numpy_dtype())
+      expected_input_grad = np.array([[1, 2], [3, 4]],
+                                     dtype=dtype.as_numpy_dtype())
+      with self.cached_session():
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
+        if self.non_aliasing_add_test:
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
+
+  @test_util.run_deprecated_v1
   def testGradientsRank2SliceUpdate(self):
-    indices = constant_op.constant([[1], [0]], dtype=dtypes.int32)
-    updates = constant_op.constant([[3, 4], [1, 2]], dtype=dtypes.float64)
-    shape = constant_op.constant([2, 2], dtype=dtypes.int32)
-    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
-    outputs = self.scatter_nd(indices, updates, shape, input_)
-
-    grad_vals = constant_op.constant([[3, 4], [1, 2]], dtype=dtypes.float64)
-    updates_grad, input_grad = gradients_impl.gradients(
-        [outputs], [updates, input_], [grad_vals])
-    expected_updates_grad = np.array([[1, 2], [3, 4]], dtype=np.float64)
-    expected_input_grad = np.array([[3, 4], [1, 2]], dtype=np.float64)
-    with self.cached_session():
-      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
-      if self.non_aliasing_add_test:
-        self.assertAllEqual(expected_input_grad, input_grad.eval())
-
+    for dtype in GRADIENT_TESTS_DTYPES:
+      indices = constant_op.constant([[1], [0]], dtype=dtypes.int32)
+      updates = constant_op.constant([[3, 4], [1, 2]], dtype=dtype)
+      shape = constant_op.constant([2, 2], dtype=dtypes.int32)
+      input_ = array_ops.zeros(shape, dtype=dtype)
+      outputs = self.scatter_nd(indices, updates, shape, input_)
+
+      grad_vals = constant_op.constant([[3, 4], [1, 2]], dtype=dtype)
+      updates_grad, input_grad = gradients_impl.gradients(
+          [outputs], [updates, input_], [grad_vals])
+      expected_updates_grad = np.array([[1, 2], [3, 4]],
+                                       dtype=dtype.as_numpy_dtype())
+      expected_input_grad = np.array([[3, 4], [1, 2]],
+                                     dtype=dtype.as_numpy_dtype())
+      with self.cached_session():
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
+        if self.non_aliasing_add_test:
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
+
+  @test_util.run_deprecated_v1
   def testGradientsRank3SliceUpdate(self):
-    indices = constant_op.constant(
-        [[[0, 1], [1, 0]], [[0, 0], [1, 1]]], dtype=dtypes.int32)
-    updates = constant_op.constant(
-        [[[5, 7], [2, 4]], [[1, 3], [6, 8]]], dtype=dtypes.float64)
-    shape = constant_op.constant([2, 2, 2], dtype=dtypes.int32)
-    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
-    outputs = self.scatter_nd(indices, updates, shape, input_)
-
-    grad_vals = constant_op.constant(
-        [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], dtype=dtypes.float64)
-    updates_grad, input_grad = gradients_impl.gradients(
-        [outputs], [updates, input_], [grad_vals])
-    expected_updates_grad = np.array(
-        [[[3, 4], [5, 6]], [[1, 2], [7, 8]]], dtype=np.float64)
-    expected_input_grad = np.array(
-        [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], dtype=np.float64)
-    with self.cached_session():
-      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
-      if self.non_aliasing_add_test:
-        self.assertAllEqual(expected_input_grad, input_grad.eval())
-
+    for dtype in GRADIENT_TESTS_DTYPES:
+      indices = constant_op.constant([[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                                     dtype=dtypes.int32)
+      updates = constant_op.constant([[[5, 7], [2, 4]], [[1, 3], [6, 8]]],
+                                     dtype=dtype)
+      shape = constant_op.constant([2, 2, 2], dtype=dtypes.int32)
+      input_ = array_ops.zeros(shape, dtype=dtype)
+      outputs = self.scatter_nd(indices, updates, shape, input_)
+
+      grad_vals = constant_op.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                                       dtype=dtype)
+      updates_grad, input_grad = gradients_impl.gradients(
+          [outputs], [updates, input_], [grad_vals])
+      expected_updates_grad = np.array([[[3, 4], [5, 6]], [[1, 2], [7, 8]]],
+                                       dtype=dtype.as_numpy_dtype())
+      expected_input_grad = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                                     dtype=dtype.as_numpy_dtype())
+      with self.cached_session():
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
+        if self.non_aliasing_add_test:
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
+
+  @test_util.run_deprecated_v1
   def testGradientsRank7SliceUpdate(self):
-    indices = constant_op.constant(
-        [[[
-            [[[[0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]]],
-            [[[[0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1]]]]
-        ]]], dtype=dtypes.int32)
-    updates = constant_op.constant(
-        [[[
-            [[[[5, 6], [2, 4]]]],
-            [[[[1, 3], [6, 8]]]]
-        ]]], dtype=dtypes.float64)
-    shape = constant_op.constant([1, 1, 2, 1, 1, 2, 2], dtype=dtypes.int32)
-    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
-    outputs = self.scatter_nd(indices, updates, shape, input_)
-
-    grad_vals = constant_op.constant(
-        [[[
-            [[[[1, 2], [3, 4]]]],
-            [[[[5, 6], [7, 8]]]]
-        ]]], dtype=dtypes.float64)
-    updates_grad, input_grad = gradients_impl.gradients(
-        [outputs], [updates, input_], [grad_vals])
-    expected_updates_grad = np.array(
-        [[[
-            [[[[3, 4], [5, 6]]]],
-            [[[[1, 2], [7, 8]]]]
-        ]]], dtype=np.float64)
-    expected_input_grad = np.array(
-        [[[
-            [[[[1, 2], [3, 4]]]],
-            [[[[5, 6], [7, 8]]]]
-        ]]], dtype=np.float64)
-    with self.cached_session():
-      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
-      if self.non_aliasing_add_test:
-        self.assertAllEqual(expected_input_grad, input_grad.eval())
-
+    for dtype in GRADIENT_TESTS_DTYPES:
+      indices = constant_op.constant(
+          [[[[[[[0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]]],
+             [[[[0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1]]]]]]],
+          dtype=dtypes.int32)
+      updates = constant_op.constant(
+          [[[[[[[5, 6], [2, 4]]]], [[[[1, 3], [6, 8]]]]]]], dtype=dtype)
+      shape = constant_op.constant([1, 1, 2, 1, 1, 2, 2], dtype=dtypes.int32)
+      input_ = array_ops.zeros(shape, dtype=dtype)
+      outputs = self.scatter_nd(indices, updates, shape, input_)
+
+      grad_vals = constant_op.constant(
+          [[[[[[[1, 2], [3, 4]]]], [[[[5, 6], [7, 8]]]]]]], dtype=dtype)
+      updates_grad, input_grad = gradients_impl.gradients(
+          [outputs], [updates, input_], [grad_vals])
+      expected_updates_grad = np.array(
+          [[[[[[[3, 4], [5, 6]]]], [[[[1, 2], [7, 8]]]]]]],
+          dtype=dtype.as_numpy_dtype())
+      expected_input_grad = np.array(
+          [[[[[[[1, 2], [3, 4]]]], [[[[5, 6], [7, 8]]]]]]],
+          dtype=dtype.as_numpy_dtype())
+      with self.cached_session():
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
+        if self.non_aliasing_add_test:
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
+
+  @test_util.run_deprecated_v1
   def testScatterNdRepatedIndicesAdd(self):
     indices = array_ops.zeros([100000, 1], dtypes.int32)
     values = np.random.randn(100000)
@@ -620,6 +653,7 @@ class ScatterNdTest(test.TestCase):
       val = self.scatter_nd(indices, values, shape).eval()
     self.assertAllClose([np.sum(values)], val)
 
+  @test_util.run_deprecated_v1
   def testSmokeScatterNdBatch2DSliceDim2(self):
     with self.cached_session():
       indices = array_ops.zeros([3, 5, 2], dtype=dtypes.int32)
@@ -627,6 +661,7 @@ class ScatterNdTest(test.TestCase):
       shape = [4, 6, 7]
       self.scatter_nd(indices, values, shape).eval()
 
+  @test_util.run_deprecated_v1
   def testSmokeScatterNdBatch1DSliceDim2(self):
     with self.cached_session():
       indices = array_ops.zeros([0, 2], dtype=dtypes.int32)
@@ -634,6 +669,7 @@ class ScatterNdTest(test.TestCase):
       shape = [4, 6, 7]
       self.scatter_nd(indices, values, shape).eval()
 
+  @test_util.run_deprecated_v1
   def testSmokeScatterNdBatch1DSliceDim3ShapeRank7(self):
     with self.cached_session():
       indices = array_ops.zeros([1, 3], dtype=dtypes.int32)
@@ -641,6 +677,7 @@ class ScatterNdTest(test.TestCase):
       shape = [3, 4, 5, 6, 7, 8, 9]
       self.scatter_nd(indices, values, shape).eval()
 
+  @test_util.run_deprecated_v1
   def testSmokeScatterNdBatch2DSliceDim3ShapeRank7(self):
     with self.cached_session():
       indices = array_ops.zeros([1, 2, 3], dtype=dtypes.int32)
@@ -662,5 +699,56 @@ class ScatterNdNonAliasingAddTest(ScatterNdTest):
     pass
 
 
+class ScatterNdTensorTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testUpdateAddSub(self):
+    indices = constant_op.constant([[4], [3], [1], [7]])
+    updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+    t = array_ops.ones([8], dtype=dtypes.float32)
+    assigned = array_ops.tensor_scatter_update(t, indices, updates)
+    added = array_ops.tensor_scatter_add(t, indices, updates)
+    subbed = array_ops.tensor_scatter_sub(t, indices, updates)
+
+    self.assertAllEqual(assigned,
+                        constant_op.constant([1, 11, 1, 10, 9, 1, 1, 12]))
+    self.assertAllEqual(added,
+                        constant_op.constant([1, 12, 1, 11, 10, 1, 1, 13]))
+    self.assertAllEqual(subbed,
+                        constant_op.constant([1, -10, 1, -9, -8, 1, 1, -11]))
+
+  @test_util.run_v1_only("b/120545219")
+  def testUpdateAddSubGradients(self):
+
+    with self.cached_session():
+      indices = constant_op.constant([[3], [1]])
+      updates = constant_op.constant([9, 10], dtype=dtypes.float32)
+      x = array_ops.ones([4], dtype=dtypes.float32)
+
+      assigned = array_ops.tensor_scatter_update(x, indices, updates)
+      added = array_ops.tensor_scatter_add(x, indices, updates)
+      subbed = array_ops.tensor_scatter_sub(x, indices, updates)
+
+      err_assigned = gradient_checker.compute_gradient_error(
+          x, [4], assigned, [4])
+      err_added = gradient_checker.compute_gradient_error(x, [4], added, [4])
+      err_subbed = gradient_checker.compute_gradient_error(x, [4], subbed, [4])
+
+      self.assertLess(err_assigned, 2e-4)
+      self.assertLess(err_added, 2e-4)
+      self.assertLess(err_subbed, 2e-4)
+
+      err_assigned_wrt_updates = gradient_checker.compute_gradient_error(
+          updates, [2], assigned, [4])
+      err_added_wrt_updates = gradient_checker.compute_gradient_error(
+          updates, [2], added, [4])
+      err_subbed_wrt_updates = gradient_checker.compute_gradient_error(
+          updates, [2], subbed, [4])
+
+      self.assertLess(err_assigned_wrt_updates, 2e-4)
+      self.assertLess(err_added_wrt_updates, 2e-4)
+      self.assertLess(err_subbed_wrt_updates, 2e-4)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index 527b7daf105adc5367efb5e57a8681b8e3672bc4..623c17d373cc7231d7191b715a77b6a3cf8701fc 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -133,7 +134,7 @@ class ScatterTest(test.TestCase):
                         repeat_indices=False,
                         updates_are_scalar=False):
     np.random.seed(8)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       for indices_shape in (), (2,), (3, 7), (3, 4, 7):
         for extra_shape in (), (5,), (5, 9):
           # Generate random indices with no duplicates for easy numpy comparison
@@ -196,87 +197,114 @@ class ScatterTest(test.TestCase):
         self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices,
                                updates_are_scalar)
 
+  @test_util.run_deprecated_v1
   def testVariableRankUpdate(self):
     self._VariableRankTests(state_ops.scatter_update, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankAdd(self):
     self._VariableRankTests(state_ops.scatter_add, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankSub(self):
     self._VariableRankTests(state_ops.scatter_sub, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMul(self):
     self._VariableRankTests(state_ops.scatter_mul, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankDiv(self):
     self._VariableRankTests(state_ops.scatter_div, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMin(self):
     self._VariableRankTests(state_ops.scatter_min, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMax(self):
     self._VariableRankTests(state_ops.scatter_max, False)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesAdd(self):
     self._VariableRankTests(state_ops.scatter_add, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesSub(self):
     self._VariableRankTests(state_ops.scatter_sub, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMul(self):
     self._VariableRankTests(state_ops.scatter_mul, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesDiv(self):
     self._VariableRankTests(state_ops.scatter_div, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMin(self):
     self._VariableRankTests(state_ops.scatter_min, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMax(self):
     self._VariableRankTests(state_ops.scatter_max, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankUpdateScalar(self):
     self._VariableRankTests(state_ops.scatter_update, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankAddScalar(self):
     self._VariableRankTests(state_ops.scatter_add, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankSubScalar(self):
     self._VariableRankTests(state_ops.scatter_sub, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMulScalar(self):
     self._VariableRankTests(state_ops.scatter_mul, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankDivScalar(self):
     self._VariableRankTests(state_ops.scatter_div, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMinScalar(self):
     self._VariableRankTests(state_ops.scatter_min, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMaxScalar(self):
     self._VariableRankTests(state_ops.scatter_max, False, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesAddScalar(self):
     self._VariableRankTests(state_ops.scatter_add, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesSubScalar(self):
     self._VariableRankTests(state_ops.scatter_sub, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMulScalar(self):
     self._VariableRankTests(state_ops.scatter_mul, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesDivScalar(self):
     self._VariableRankTests(state_ops.scatter_div, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMinScalar(self):
     self._VariableRankTests(state_ops.scatter_min, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMaxScalar(self):
     self._VariableRankTests(state_ops.scatter_max, True, True)
 
+  @test_util.run_deprecated_v1
   def testBooleanScatterUpdate(self):
     if not test.is_gpu_available():
-      with self.test_session(use_gpu=False) as session:
+      with self.session(use_gpu=False) as session:
         var = variables.Variable([True, False])
         update0 = state_ops.scatter_update(var, 1, True)
         update1 = state_ops.scatter_update(
@@ -286,14 +314,15 @@ class ScatterTest(test.TestCase):
 
         session.run([update0, update1])
 
-        self.assertAllEqual([False, True], var.eval())
+        self.assertAllEqual([False, True], self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testScatterOutOfRangeCpu(self):
     for op, _ in _TF_OPS_TO_NUMPY.items():
       params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
       updates = np.array([-3, -4, -5]).astype(np.float32)
       if not test.is_gpu_available():
-        with self.test_session(use_gpu=False):
+        with self.session(use_gpu=False):
           ref = variables.VariableV1(params)
           ref.initializer.run()
 
@@ -320,19 +349,19 @@ class ScatterTest(test.TestCase):
       updates = np.array([-3, -4, -5]).astype(np.float32)
       # With GPU, the code ignores indices that are out of range.
       # We don't test the implementation; just test there's no failures.
-      with self.test_session(force_gpu=True):
+      with test_util.force_gpu():
         ref = variables.Variable(params)
         ref.initializer.run()
 
         # Indices all in range, no problem.
         indices = np.array([2, 0, 5])
-        op(ref, indices, updates).eval()
+        self.evaluate(op(ref, indices, updates))
 
         # Indicies out of range should not fail.
         indices = np.array([-1, 0, 5])
-        op(ref, indices, updates).eval()
+        self.evaluate(op(ref, indices, updates))
         indices = np.array([2, 0, 6])
-        op(ref, indices, updates).eval()
+        self.evaluate(op(ref, indices, updates))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 2931877c111b9264dc59d037e8fa40ba48400eca..8af1b47e83c94ba117d4f4f9168da7b91b606dbf 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
@@ -113,12 +114,12 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       else:
         curr_ops_list = ops_list
       for use_gpu in [True, False]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           tf_x, np_x = self._input(shape, dtype=dtype)
           for np_op1, np_op2, tf_op in curr_ops_list:
             np_ans = self._segmentReduce(indices, np_x, np_op1, np_op2)
             s = tf_op(data=tf_x, segment_ids=indices)
-            tf_ans = s.eval()
+            tf_ans = self.evaluate(s)
             self.assertAllClose(np_ans, tf_ans)
             # NOTE(mrry): The static shape inference that computes
             # `tf_ans.shape` can only infer that sizes from dimension 1
@@ -126,6 +127,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
             # and may therefore vary dynamically.
             self.assertAllEqual(np_ans.shape[1:], tf_ans.shape[1:])
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsShape(self):
     shape = [4, 4]
     tf_x, _ = self._input(shape)
@@ -133,21 +135,23 @@ class SegmentReductionOpTest(SegmentReductionHelper):
     with self.assertRaises(ValueError):
       math_ops.segment_sum(data=tf_x, segment_ids=indices)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsSize(self):
     shape = [4, 4]
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         tf_x, _ = self._input(shape)
         indices = [0, 1]
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
         with self.assertRaisesOpError("segment_ids should be the same size"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsValid(self):
     # This is a baseline for the following SegmentIdsInvalid* tests.
     shape = [4, 4]
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         tf_x, _ = self._input(shape, dtype=dtypes_lib.float32)
         indices = [0, 0, 0, 1]
         result = math_ops.segment_sum(data=tf_x, segment_ids=indices).eval()
@@ -156,25 +160,26 @@ class SegmentReductionOpTest(SegmentReductionHelper):
   def testSegmentIdsGreaterThanZero(self):
     shape = [4, 4]
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         tf_x, np_x = self._input(shape, dtype=dtypes_lib.float32)
         indices = [1, 1, 2, 2]
         np_ans = self._segmentReduce(indices, np_x, np.add)
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testSegmentIdsHole(self):
     shape = [4, 4]
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         tf_x, np_x = self._input(shape, dtype=dtypes_lib.float32)
         indices = [0, 0, 3, 3]
         np_ans = self._segmentReduce(indices, np_x, np.add)
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid1(self):
     shape = [4, 4]
     with self.cached_session():
@@ -184,8 +189,9 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       with self.assertRaisesOpError(
           r"Segment id -1 out of range \[0, 1\), possibly because "
           "'segment_ids' input is not sorted."):
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid2(self):
     shape = [4, 4]
     with self.cached_session():
@@ -193,8 +199,9 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       indices = [0, 1, 0, 1]
       s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
       with self.assertRaisesOpError("segment ids are not increasing"):
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid3(self):
     shape = [4, 4]
     with self.cached_session():
@@ -204,28 +211,31 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       with self.assertRaisesOpError(
           r"Segment id 1 out of range \[0, 1\), possibly "
           "because 'segment_ids' input is not sorted."):
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid4(self):
     shape = [4, 4]
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         tf_x, _ = self._input(shape, dtype=dtypes_lib.float32)
         indices = [0, 0, 0, -1]
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid5(self):
     shape = [4, 4]
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         tf_x, _ = self._input(shape, dtype=dtypes_lib.float32)
         indices = [0, 0, 0, -2]
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     shape = [4, 4]
     indices = [0, 1, 2, 2]
@@ -284,7 +294,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
         ops_list = self.complex_ops_list if dtype.is_complex else self.ops_list
         tf_x, np_x = self._input(shape, dtype=dtype)
         for use_gpu in [True, False]:
-          with self.test_session(use_gpu=True):
+          with self.cached_session(use_gpu=True):
             for np_op1, np_op2, tf_op, init_op in ops_list:
               # sqrt_n doesn't support integers
               if (np_op2 == self._sqrt_n_reduce_op and dtype.is_integer):
@@ -297,7 +307,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
                   indices, np_x, np_op1, np_op2, num_segments=num_segments,
                   initial_value=init_op(dtype))
               s = tf_op(tf_x, segment_ids=indices, num_segments=num_segments)
-              tf_ans = s.eval()
+              tf_ans = self.evaluate(s)
               if dtype is dtypes_lib.bfloat16:
                 tf_ans = tf_ans.astype(np.float32)
               self.assertAllCloseAccordingToType(np_ans, tf_ans)
@@ -310,7 +320,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
     for indices in indices_flat, indices_flat.reshape(5, 2):
       shape = indices.shape + (2,)
       for dtype in dtypes:
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           tf_x, np_x = self._input(shape)
           num_segments_constant = constant_op.constant(
               num_segments, dtype=dtype)
@@ -320,10 +330,11 @@ class UnsortedSegmentTest(SegmentReductionHelper):
               data=tf_x,
               segment_ids=indices,
               num_segments=num_segments_constant)
-          tf_ans = s.eval()
+          tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     num_cols = 2
     indices_flat = np.array([0, 4, 0, -1, 3, -1, 4, 7, 7, 3])
@@ -334,7 +345,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
         shape = indices.shape + (num_cols,)
         # test CPU and GPU as tf.gather behaves differently on each device
         for use_gpu in [False, True]:
-          with self.test_session(use_gpu=use_gpu):
+          with self.cached_session(use_gpu=use_gpu):
             for _, _, tf_op, _ in ops_list:
               tf_x, np_x = self._input(shape, dtype=dtype)
               s = tf_op(tf_x, indices, num_segments)
@@ -346,6 +357,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
                   delta=1)
             self.assertAllClose(jacob_t, jacob_n)
 
+  @test_util.run_deprecated_v1
   def testProdGrad(self):
     # additional test for the prod gradient to ensure correct handling of zeros
     values = np.array([0, 0, 1, 0, 2, 2, 3, 3, 3], dtype=np.float32)
@@ -360,7 +372,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
     gradients_indices[range(9), indices] = [0, 0, 0, 4, 0, 0, 9, 9, 9]
     gradients_indices_neg[range(9), indices_neg] = [0, 1, 0, 0, 2, 2, 0, 3, 3]
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         for ind, grad_gt in [(indices, gradients_indices),
                              (indices_neg, gradients_indices_neg)]:
           s = math_ops.unsorted_segment_prod(values_tf,
@@ -370,6 +382,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
           self.assertAllClose(jacob_t, jacob_n)
           self.assertAllClose(jacob_t, grad_gt)
 
+  @test_util.run_deprecated_v1
   def testGradientMatchesSegmentSum(self):
     # Strategy: compute the gradient for UnsortedSegmentSum and SegmentSum
     # and compare the outputs, which should be identical.
@@ -382,7 +395,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
     shape = [n, num_cols]
     num_segments = max(indices) + 1
     for dtype in self.differentiable_dtypes:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         tf_x, np_x = self._input(shape, dtype=dtype)
         # Results from UnsortedSegmentSum
         unsorted_s = math_ops.unsorted_segment_sum(
@@ -403,21 +416,23 @@ class UnsortedSegmentTest(SegmentReductionHelper):
       self.assertAllClose(unsorted_jacob_t, sorted_jacob_t)
       self.assertAllClose(unsorted_jacob_n, sorted_jacob_n)
 
+  @test_util.run_deprecated_v1
   def testBadIndices(self):
     # Note: GPU kernel does not return the out-of-range error needed for this
     # test, so this test is marked as cpu-only.
     # Note: With PR #13055 a negative index will be ignored silently.
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for bad in [[2]], [[7]]:
         unsorted = math_ops.unsorted_segment_sum([[17]], bad, num_segments=2)
         with self.assertRaisesOpError(
             r"segment_ids\[0,0\] = %d is out of range \[0, 2\)" % bad[0][0]):
-          unsorted.eval()
+          self.evaluate(unsorted)
 
+  @test_util.run_deprecated_v1
   def testEmptySecondDimension(self):
     dtypes = [np.float16, np.float32, np.float64, np.int64, np.int32,
               np.complex64, np.complex128]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for dtype in dtypes:
         for itype in (np.int32, np.int64):
           data = np.zeros((2, 0), dtype=dtype)
@@ -433,7 +448,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
     for indices in indices_flat, indices_flat.reshape(5, 2):
       shape = indices.shape + (2,)
       for dtype in self.all_dtypes:
-        with self.test_session(use_gpu=True):
+        with self.session(use_gpu=True):
           tf_x, np_x = self._input(shape, dtype=dtype)
           np_ans = self._segmentReduce(
               indices, np_x, np.add, op2=None, num_segments=num_segments)
@@ -443,7 +458,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
           np.place(indices, indices == 8, [-1])
           s = math_ops.unsorted_segment_sum(
               data=tf_x, segment_ids=indices, num_segments=num_segments)
-          tf_ans = s.eval()
+          tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
@@ -490,7 +505,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         segment_indices.append(i)
     num_indices = len(segment_indices)
     for dtype in dtypes:
-      with self.test_session(use_gpu=False):
+      with self.cached_session(use_gpu=False):
         tf_indices, np_indices, tf_x, np_x = self._sparse_input(
             shape, num_indices, dtype=dtype)
         for np_op1, np_op2, tf_op in ops_list:
@@ -499,7 +514,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
           np_ans = self._sparseSegmentReduce(np_x, np_indices, segment_indices,
                                              np_op1, np_op2)
           s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-          tf_ans = s.eval()
+          tf_ans = self.evaluate(s)
           self.assertAllClose(np_ans, tf_ans)
           # NOTE(mrry): The static shape inference that computes
           # `tf_ans.shape` can only infer that sizes from dimension 1
@@ -513,12 +528,12 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         self._mean_cum_op, self._mean_reduce_op, math_ops.sparse_segment_mean)]
     segment_indices = [0, 2, 2, 2]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for np_op1, np_op2, tf_op in ops_list:
         np_ans = self._sparseSegmentReduce(np_x, tf_indices, segment_indices,
                                            np_op1, np_op2)
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testWithNumSegments(self):
@@ -529,7 +544,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     segment_indices = [0, 2, 2, 2]
     tf_indices = [8, 3, 0, 9]
     num_segments = 5
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for np_op1, np_op2, tf_op in ops_list:
         np_ans = self._sparseSegmentReduce(
             np_x,
@@ -543,7 +558,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             indices=tf_indices,
             segment_ids=segment_indices,
             num_segments=num_segments)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testWithEmptySegments(self):
@@ -555,14 +570,14 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     segment_indices = []
     tf_indices = []
     num_segments = 5
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(
             data=tf_x,
             indices=tf_indices,
             segment_ids=segment_indices,
             num_segments=num_segments)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np.zeros([5, 4]), tf_ans)
 
   def testSegmentIdsGreaterThanZero(self):
@@ -571,12 +586,12 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         self._mean_cum_op, self._mean_reduce_op, math_ops.sparse_segment_mean)]
     segment_indices = [1, 2, 2, 2]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for np_op1, np_op2, tf_op in ops_list:
         np_ans = self._sparseSegmentReduce(np_x, tf_indices, segment_indices,
                                            np_op1, np_op2)
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testValid(self):
@@ -585,93 +600,100 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
     segment_indices = [0, 1, 2, 2]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testIndicesInvalid1(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
     segment_indices = [0, 1, 2, 2]
     tf_indices = [8, -1, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError(
             r"indices\[1\] == -1 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testIndicesInvalid2(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
     segment_indices = [0, 1, 2, 2]
     tf_indices = [8, 3, 0, 10]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError(
             r"indices\[3\] == 10 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid2(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
     segment_indices = [0, 1, 0, 1]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError("segment ids are not increasing"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid3(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
     segment_indices = [0, 1, 2, 0]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError(
             r"Segment id 1 out of range \[0, 1\), possibly because "
             "'segment_ids' input is not sorted"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid4(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
     segment_indices = [-1, 0, 1, 1]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError(
             r"Segment id -1 out of range \[0, 2\), possibly because "
             "'segment_ids' input is not sorted"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid6(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
     segment_indices = [0, 0, 0, -1]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid7(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
     segment_indices = [0, 0, 0, -2]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentWithNumSegmentsValid(self):
     # Baseline for the test*WithNumSegmentsInvalid* methods below.
@@ -683,15 +705,16 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     num_segments = 5
     segment_indices = [0, 1, 3, 3]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(
             data=tf_x,
             indices=tf_indices,
             segment_ids=segment_indices,
             num_segments=num_segments)
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentWithNumSegmentsInvalid1(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -701,7 +724,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     num_segments = 5
     segment_indices = [0, 1, 3, 5]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(
             data=tf_x,
@@ -709,8 +732,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             segment_ids=segment_indices,
             num_segments=num_segments)
         with self.assertRaisesOpError("segment ids must be < num_segments"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentWithNumSegmentsInvalid2(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -720,7 +744,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     num_segments = -2
     segment_indices = [0, 1, 3, 3]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         with self.assertRaisesRegexp(
             ValueError, "Cannot specify a negative value for num_segments"):
@@ -730,6 +754,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
               segment_ids=segment_indices,
               num_segments=num_segments)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     shape = [10, 4]
 
@@ -748,6 +773,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             delta=1)
       self.assertAllClose(jacob_t, jacob_n)
 
+  @test_util.run_deprecated_v1
   def testGradientWithEmptySegmentsAtEnd(self):
     shape = [10, 4]
 
@@ -782,11 +808,12 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ]
     segment_indices = [0, 1, 2, 2]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientIndicesInvalid1(self):
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -794,12 +821,13 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ]
     segment_indices = [0, 1, 2, 2]
     tf_indices = [8, 3, 0, 10]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Index 10 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientIndicesInvalid2(self):
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -807,12 +835,13 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ]
     segment_indices = [0, 1, 2, 2]
     tf_indices = [8, 3, -1, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Index -1 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientSegmentsInvalid1(self):
     tf_x, _ = self._input(
         [3, 4], dtype=dtypes_lib.float32)  # expecting 3 segments
@@ -821,12 +850,13 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ]
     segment_indices = [0, 1, 1, 4]  # 5 segments
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError("Invalid number of segments"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientSegmentsInvalid2(self):
     tf_x, _ = self._input([1, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -834,12 +864,13 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ]
     segment_indices = [0, 1, 2, 0]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Segment id 1 out of range \[0, 1\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientSegmentsInvalid3(self):
     tf_x, _ = self._input([2, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -847,12 +878,13 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ]
     segment_indices = [-1, 0, 1, 1]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Segment id -1 out of range \[0, 2\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientSegmentsInvalid4(self):
     tf_x, _ = self._input([0, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -860,11 +892,12 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ]
     segment_indices = [0, 1, 2, -1]
     tf_indices = [8, 3, 0, 9]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Segment id 0 out of range \[0, 0\)"):
-          s.eval()
+          self.evaluate(s)
+
 
 class SegmentReductionOpBenchmark(test.Benchmark):
   outer_dim_options = [2**x for x in range(9, 14, 2)]
diff --git a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
index d2647088c5c2afda032482fb5cfd983cedb49a8f..47b22ec29673f31c3216d4b4a39687a40bc95a95 100644
--- a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
+++ b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
@@ -22,8 +22,9 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -39,6 +40,7 @@ def _AddTest(test_class, op_name, testcase_name, fn):
 
 class SelfAdjointEigTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testWrongDimensions(self):
     # The input to self_adjoint_eig should be a tensor of
     # at least rank 2.
@@ -49,9 +51,10 @@ class SelfAdjointEigTest(test.TestCase):
     with self.assertRaises(ValueError):
       linalg_ops.self_adjoint_eig(vector)
 
+  @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
     all_ops = []
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for compute_v_ in True, False:
         matrix1 = random_ops.random_normal([5, 5], seed=42)
         matrix2 = random_ops.random_normal([5, 5], seed=42)
@@ -63,7 +66,7 @@ class SelfAdjointEigTest(test.TestCase):
           e1 = linalg_ops.self_adjoint_eigvals(matrix1)
           e2 = linalg_ops.self_adjoint_eigvals(matrix2)
           all_ops += [e1, e2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       self.assertAllEqual(val[0], val[2])
       # The algorithm is slightly different for compute_v being True and False,
       # so require approximate equality only here.
@@ -80,8 +83,8 @@ class SelfAdjointEigTest(test.TestCase):
             "self_adjoint_eig_fail_if_denorms_flushed.txt")).astype(np.float32)
     self.assertEqual(matrix.shape, (32, 32))
     matrix_tensor = constant_op.constant(matrix)
-    with self.test_session(use_gpu=True) as sess:
-      (e, v) = sess.run(linalg_ops.self_adjoint_eig(matrix_tensor))
+    with self.session(use_gpu=True) as sess:
+      (e, v) = self.evaluate(linalg_ops.self_adjoint_eig(matrix_tensor))
       self.assertEqual(e.size, 32)
       self.assertAllClose(
           np.matmul(v, v.transpose()), np.eye(32, dtype=np.float32), atol=2e-3)
@@ -152,7 +155,7 @@ def _GetSelfAdjointEigTest(dtype_, shape_, compute_v_):
     else:
       atol = 1e-12
     np_e, np_v = np.linalg.eigh(a)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       if compute_v_:
         tf_e, tf_v = linalg_ops.self_adjoint_eig(constant_op.constant(a))
 
@@ -161,15 +164,15 @@ def _GetSelfAdjointEigTest(dtype_, shape_, compute_v_):
             math_ops.matmul(tf_v, array_ops.matrix_diag(tf_e)),
             tf_v,
             adjoint_b=True)
-        self.assertAllClose(a_ev.eval(), a, atol=atol)
+        self.assertAllClose(self.evaluate(a_ev), a, atol=atol)
 
         # Compare to numpy.linalg.eigh.
-        CompareEigenDecompositions(self, np_e, np_v,
-                                   tf_e.eval(), tf_v.eval(), atol)
+        CompareEigenDecompositions(self, np_e, np_v, self.evaluate(tf_e),
+                                   self.evaluate(tf_v), atol)
       else:
         tf_e = linalg_ops.self_adjoint_eigvals(constant_op.constant(a))
         self.assertAllClose(
-            np.sort(np_e, -1), np.sort(tf_e.eval(), -1), atol=atol)
+            np.sort(np_e, -1), np.sort(self.evaluate(tf_e), -1), atol=atol)
 
   return Test
 
@@ -185,53 +188,51 @@ def _GetSelfAdjointEigGradTest(dtype_, shape_, compute_v_):
     n = shape_[-1]
     batch_shape = shape_[:-2]
     np_dtype = dtype_.as_numpy_dtype
-    a = np.random.uniform(
-        low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-    if dtype_.is_complex:
-      a += 1j * np.random.uniform(
+
+    def RandomInput():
+      a = np.random.uniform(
           low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-    a += np.conj(a.T)
-    a = np.tile(a, batch_shape + (1, 1))
+      if dtype_.is_complex:
+        a += 1j * np.random.uniform(
+            low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
+      a += np.conj(a.T)
+      a = np.tile(a, batch_shape + (1, 1))
+      return a
+
     # Optimal stepsize for central difference is O(epsilon^{1/3}).
     epsilon = np.finfo(np_dtype).eps
     delta = 0.1 * epsilon**(1.0 / 3.0)
     # tolerance obtained by looking at actual differences using
     # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build
+    # after discarding one random input sample
+    _ = RandomInput()
     if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64):
       tol = 1e-2
     else:
       tol = 1e-7
-    with self.test_session(use_gpu=True):
-      tf_a = constant_op.constant(a)
-      if compute_v_:
-        tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
+    with self.session(use_gpu=True):
+      def Compute(x):
+        e, v = linalg_ops.self_adjoint_eig(x)
         # (complex) Eigenvectors are only unique up to an arbitrary phase
         # We normalize the vectors such that the first component has phase 0.
-        top_rows = tf_v[..., 0:1, :]
-        if tf_a.dtype.is_complex:
+        top_rows = v[..., 0:1, :]
+        if dtype_.is_complex:
           angle = -math_ops.angle(top_rows)
           phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
         else:
           phase = math_ops.sign(top_rows)
-        tf_v *= phase
-        outputs = [tf_e, tf_v]
+        v *= phase
+        return e, v
+
+      if compute_v_:
+        funcs = [lambda x: Compute(x)[0], lambda x: Compute(x)[1]]
       else:
-        tf_e = linalg_ops.self_adjoint_eigvals(tf_a)
-        outputs = [tf_e]
-      for b in outputs:
-        x_init = np.random.uniform(
-            low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-        if dtype_.is_complex:
-          x_init += 1j * np.random.uniform(
-              low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-        x_init += np.conj(x_init.T)
-        x_init = np.tile(x_init, batch_shape + (1, 1))
-        theoretical, numerical = gradient_checker.compute_gradient(
-            tf_a,
-            tf_a.get_shape().as_list(),
-            b,
-            b.get_shape().as_list(),
-            x_init_value=x_init,
+        funcs = [linalg_ops.self_adjoint_eigvals]
+
+      for f in funcs:
+        theoretical, numerical = gradient_checker_v2.compute_gradient(
+            f,
+            [RandomInput()],
             delta=delta)
         self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
 
@@ -245,7 +246,7 @@ if __name__ == "__main__":
       for size in 1, 2, 5, 10:
         for batch_dims in [(), (3,)] + [(3, 2)] * (max(size, size) < 10):
           shape = batch_dims + (size, size)
-          name = "%s_%s_%s" % (dtype, "_".join(map(str, shape)), compute_v)
+          name = "%s_%s_%s" % (dtype.name, "_".join(map(str, shape)), compute_v)
           _AddTest(SelfAdjointEigTest, "SelfAdjointEig", name,
                    _GetSelfAdjointEigTest(dtype, shape, compute_v))
           _AddTest(SelfAdjointEigGradTest, "SelfAdjointEigGrad", name,
diff --git a/tensorflow/python/kernel_tests/session_ops_test.py b/tensorflow/python/kernel_tests/session_ops_test.py
index 03e1ae852fc5b4ce4297b70b37964310f02306e5..bc5d8e81511494ea82bbf703544ec36448b5e982 100644
--- a/tensorflow/python/kernel_tests/session_ops_test.py
+++ b/tensorflow/python/kernel_tests/session_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import session_ops
@@ -28,6 +29,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionOpsTest(test.TestCase):
 
   def testHandleBasic(self):
@@ -37,7 +39,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Feed a tensor handle.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -51,7 +53,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Get the tensor from its handle.
       self.assertEqual(50, h.eval())
@@ -64,7 +66,7 @@ class SessionOpsTest(test.TestCase):
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
       v = math_ops.multiply(a, c)
-      h, v = sess.run([h, v])
+      h, v = self.evaluate([h, v])
 
       self.assertEqual(50, h.eval())
       self.assertEqual(500, v)
@@ -77,7 +79,7 @@ class SessionOpsTest(test.TestCase):
       p = math_ops.less(a, b)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      p, h = sess.run([p, h])
+      p, h = self.evaluate([p, h])
 
       # Run by feeding a tensor handle.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -94,7 +96,7 @@ class SessionOpsTest(test.TestCase):
       # Initialize a handle.
       a = constant_op.constant(0)
       h = session_ops.get_session_handle(a)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Do some computation.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -111,7 +113,7 @@ class SessionOpsTest(test.TestCase):
       # Initialize a handle.
       a = constant_op.constant(0)
       h = session_ops.get_session_handle(a)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Do some computation.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -133,7 +135,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Feed a tensor handle.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -144,7 +146,7 @@ class SessionOpsTest(test.TestCase):
       with ops.device(test.gpu_device_name()):
         a = constant_op.constant(10)
         h = session_ops.get_session_handle(a)
-        h = sess.run(h)
+        h = self.evaluate(h)
         self.assertEqual(100, sess.run(y, feed_dict={f: h.handle}))
 
   def testHandleDelete(self):
@@ -154,7 +156,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      sess.run(h).delete()
+      self.evaluate(h).delete()
 
   def testHandleDeleteRaw(self):
     with self.cached_session() as sess:
@@ -163,7 +165,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Delete using a raw tensor handle.
       raw_h = h.get_raw_handle()
@@ -174,10 +176,10 @@ class SessionOpsTest(test.TestCase):
     with self.cached_session() as sess:
       with ops.device(test.gpu_device_name()):
         a = constant_op.constant(1.0)
-        a_handle = sess.run(session_ops.get_session_handle(a))
+        a_handle = self.evaluate(session_ops.get_session_handle(a))
       with ops.device("/cpu:0"):
         b = constant_op.constant(2.0)
-        b_handle = sess.run(session_ops.get_session_handle(b))
+        b_handle = self.evaluate(session_ops.get_session_handle(b))
 
       a_p, a_t = session_ops.get_session_tensor(a_handle.handle, dtypes.float32)
       b_p, b_t = session_ops.get_session_tensor(b_handle.handle, dtypes.float32)
@@ -193,8 +195,8 @@ class SessionOpsTest(test.TestCase):
       # initial values live on CPU
       with ops.device("/cpu:0"):
         one = constant_op.constant(1, dtype=dtypes.float32)
-        one_handle = sess.run(session_ops.get_session_handle(one))
-        x_handle = sess.run(session_ops.get_session_handle(one))
+        one_handle = self.evaluate(session_ops.get_session_handle(one))
+        x_handle = self.evaluate(session_ops.get_session_handle(one))
 
       # addition lives on GPU
       with ops.device(test.gpu_device_name()):
@@ -219,8 +221,8 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(2.0)
       b_handle_op = session_ops.get_session_handle(b)
 
-      a_handle = sess.run(a_handle_op)
-      b_handle = sess.run(b_handle_op)
+      a_handle = self.evaluate(a_handle_op)
+      b_handle = self.evaluate(b_handle_op)
 
       a_p, a_t = session_ops.get_session_tensor(a_handle.handle, dtypes.float32)
       b_p, b_t = session_ops.get_session_tensor(b_handle.handle, dtypes.float32)
@@ -232,6 +234,7 @@ class SessionOpsTest(test.TestCase):
                      b_p: b_handle.handle})
       self.assertEqual(3.0, c_handle.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testFeedOneHandleDirectly(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
@@ -239,16 +242,17 @@ class SessionOpsTest(test.TestCase):
       c = math_ops.multiply(a, b)
       d = math_ops.multiply(c, c)
 
-      h_c = sess.run(session_ops.get_session_handle(c))
+      h_c = self.evaluate(session_ops.get_session_handle(c))
 
       self.assertAllClose(2500.0, sess.run(d, feed_dict={c: h_c}))
 
+  @test_util.run_v1_only("b/120545219")
   def testDirectHandleFeedOverlappingWithFetches(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
       b = constant_op.constant(5.0)
       c = math_ops.multiply(a, b)
-      h_c = sess.run(session_ops.get_session_handle(c))
+      h_c = self.evaluate(session_ops.get_session_handle(c))
       d = array_ops.identity(c)
 
       c_val = sess.run(c, feed_dict={c: h_c})
@@ -277,24 +281,25 @@ class SessionOpsTest(test.TestCase):
       d = math_ops.div(a, b)
       e = math_ops.subtract(c, d)
 
-      h_c = sess.run(session_ops.get_session_handle(c))
-      h_d = sess.run(session_ops.get_session_handle(d))
+      h_c = self.evaluate(session_ops.get_session_handle(c))
+      h_d = self.evaluate(session_ops.get_session_handle(d))
 
       self.assertAllClose(48.0, sess.run(e, feed_dict={c: h_c, d: h_d}))
       self.assertAllClose(-48.0, sess.run(e, feed_dict={c: h_d, d: h_c}))
 
+  @test_util.run_v1_only("b/120545219")
   def testFeedHandleToVariableDirectly(self):
     with self.cached_session() as sess:
       a = variables.Variable(12.0)
       inc_a = state_ops.assign_add(a, 2.0)
       b = math_ops.add(a, 5.0)
-      sess.run(a.initializer)
+      self.evaluate(a.initializer)
 
       h_a_read = sess.run(session_ops.get_session_handle(a.read_value()))
-      self.assertAllClose(12.0, sess.run(a))
+      self.assertAllClose(12.0, self.evaluate(a))
 
       self.assertAllClose(17.0, sess.run(b, feed_dict={a: h_a_read}))
-      sess.run(inc_a)
+      self.evaluate(inc_a)
       self.assertAllClose(19.0, sess.run(b, feed_dict={a: h_a_read}))
 
 
diff --git a/tensorflow/python/kernel_tests/sets_test.py b/tensorflow/python/kernel_tests/sets_test.py
index 8335e9c139a581a22e06bd2fbfc5c027956d1714..b4f232293482b08b31fefa0f3b2a61ba115d1c47 100644
--- a/tensorflow/python/kernel_tests/sets_test.py
+++ b/tensorflow/python/kernel_tests/sets_test.py
@@ -70,6 +70,7 @@ def _dense_to_sparse(dense, dtype):
 
 class SetOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def test_set_size_2d(self):
     for dtype in _DTYPES:
       self._test_set_size_2d(dtype)
@@ -83,6 +84,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(
         [0, 3], self._set_size(_dense_to_sparse([[], [1, 9, 2]], dtype)))
 
+  @test_util.run_deprecated_v1
   def test_set_size_duplicates_2d(self):
     for dtype in _DTYPES:
       self._test_set_size_duplicates_2d(dtype)
@@ -96,6 +98,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
                                 6, 7, 8, 8, 6, 7, 5, 3, 3, 0, 6, 6, 9, 0, 0, 0
                             ], [999, 1, -1000], [], [-1]], dtype)))
 
+  @test_util.run_deprecated_v1
   def test_set_size_3d(self):
     for dtype in _DTYPES:
       self._test_set_size_3d(dtype)
@@ -159,10 +162,11 @@ class SetOpsTest(test_util.TensorFlowTestCase):
       self.assertEqual(None, op.get_shape().dims)
       self.assertEqual(dtypes.int32, op.dtype)
     with self.cached_session() as sess:
-      results = sess.run(ops)
+      results = self.evaluate(ops)
     self.assertAllEqual(results[0], results[1])
     return results[0]
 
+  @test_util.run_deprecated_v1
   def test_set_intersection_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_set_intersection_multirow_2d(dtype)
@@ -199,6 +203,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_intersection_count(sp_a, sp_b))
 
+  @test_util.run_deprecated_v1
   def test_dense_set_intersection_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_dense_set_intersection_multirow_2d(dtype)
@@ -223,6 +228,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
         dtype=dtype)
     self.assertAllEqual(expected_counts, self._set_intersection_count(a, b))
 
+  @test_util.run_deprecated_v1
   def test_set_intersection_duplicates_2d(self):
     for dtype in _DTYPES:
       self._test_set_intersection_duplicates_2d(dtype)
@@ -270,6 +276,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_intersection_count(sp_a, sp_b))
 
+  @test_util.run_deprecated_v1
   def test_set_intersection_3d(self):
     for dtype in _DTYPES:
       self._test_set_intersection_3d(dtype=dtype)
@@ -534,8 +541,9 @@ class SetOpsTest(test_util.TensorFlowTestCase):
   def _set_intersection_count(self, a, b):
     op = sets.set_size(sets.set_intersection(a, b))
     with self.cached_session() as sess:
-      return sess.run(op)
+      return self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def test_set_difference_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_set_difference_multirow_2d(dtype)
@@ -604,6 +612,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_difference_count(sp_a, sp_b, False))
 
+  @test_util.run_deprecated_v1
   def test_dense_set_difference_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_dense_set_difference_multirow_2d(dtype)
@@ -647,6 +656,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_difference_count(a, b, False))
 
+  @test_util.run_deprecated_v1
   def test_sparse_set_difference_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_sparse_set_difference_multirow_2d(dtype)
@@ -688,6 +698,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_difference_count(sp_a, sp_b, False))
 
+  @test_util.run_deprecated_v1
   def test_set_difference_duplicates_2d(self):
     for dtype in _DTYPES:
       self._test_set_difference_duplicates_2d(dtype)
@@ -755,6 +766,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_difference_count(a, sp_b, False))
 
+  @test_util.run_deprecated_v1
   def test_sparse_set_difference_3d(self):
     for dtype in _DTYPES:
       self._test_sparse_set_difference_3d(dtype)
@@ -972,8 +984,9 @@ class SetOpsTest(test_util.TensorFlowTestCase):
   def _set_difference_count(self, a, b, aminusb=True):
     op = sets.set_size(sets.set_difference(a, b, aminusb))
     with self.cached_session() as sess:
-      return sess.run(op)
+      return self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def test_set_union_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_set_union_multirow_2d(dtype)
@@ -1001,6 +1014,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
         expected_indices, expected_values, expected_shape, union, dtype=dtype)
     self.assertAllEqual(expected_counts, self._set_union_count(sp_a, sp_b))
 
+  @test_util.run_deprecated_v1
   def test_dense_set_union_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_dense_set_union_multirow_2d(dtype)
@@ -1021,6 +1035,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
         expected_indices, expected_values, expected_shape, union, dtype=dtype)
     self.assertAllEqual(expected_counts, self._set_union_count(a, b))
 
+  @test_util.run_deprecated_v1
   def test_set_union_duplicates_2d(self):
     for dtype in _DTYPES:
       self._test_set_union_duplicates_2d(dtype)
@@ -1047,6 +1062,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
         expected_indices, expected_values, expected_shape, union, dtype=dtype)
     self.assertAllEqual([2], self._set_union_count(sp_a, sp_b))
 
+  @test_util.run_deprecated_v1
   def test_sparse_set_union_3d(self):
     for dtype in _DTYPES:
       self._test_sparse_set_union_3d(dtype)
@@ -1221,7 +1237,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
   def _set_union_count(self, a, b):
     op = sets.set_size(sets.set_union(a, b))
     with self.cached_session() as sess:
-      return sess.run(op)
+      return self.evaluate(op)
 
   def _assert_set_operation(self, expected_indices, expected_values,
                             expected_shape, sparse_tensor_value, dtype):
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index 0304dc3875c66fcab0d35d0a38fade46044137e4..c8e7c143ade2ca740833ea5f9bd18ab5c7b4a2e6 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -50,11 +51,11 @@ class ShapeOpsTest(test.TestCase):
 
   def _compareShape(self, x, use_gpu=False):
     np_ans = np.array(np.shape(x))
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.shape(x)
       tf_ans_64 = array_ops.shape(x, out_type=dtypes.int64)
-      result = tf_ans.eval()
-      result_64 = tf_ans_64.eval()
+      result = self.evaluate(tf_ans)
+      result_64 = self.evaluate(tf_ans_64)
     self.assertAllEqual(np_ans, result)
     self.assertAllEqual(np_ans, result_64)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -62,19 +63,19 @@ class ShapeOpsTest(test.TestCase):
   def _compareShapeSparse(self, x_np, use_gpu=False):
     np_ans = np.array(np.shape(x_np))
     x_tf, unused_nnz = _sparsify(x_np)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.shape(x_tf)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
   def _compareShapeN(self, x, use_gpu=False):
     np_ans = np.array(np.shape(x))
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       tf_ans = array_ops.shape_n([x, x, x])
       tf_ans_64 = array_ops.shape_n([x, x, x], out_type=dtypes.int64)
-      result = sess.run(tf_ans)
-      result_64 = sess.run(tf_ans_64)
+      result = self.evaluate(tf_ans)
+      result_64 = self.evaluate(tf_ans_64)
     for i in range(3):
       self.assertAllEqual(np_ans, result[i])
       self.assertAllEqual(np_ans, result_64[i])
@@ -82,28 +83,28 @@ class ShapeOpsTest(test.TestCase):
 
   def _compareRank(self, x, use_gpu=False):
     np_ans = np.asarray(np.ndim(x))
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.rank(x)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
   def _compareRankSparse(self, x_np, use_gpu=False):
     np_ans = np.asarray(np.ndim(x_np))
     x_tf, unused_nnz = _sparsify(x_np)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.rank(x_tf)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
   def _compareSize(self, x, use_gpu=False):
     np_ans = np.asarray(np.size(x))
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.size(x)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
       tf_ans_64 = array_ops.size(x, out_type=dtypes.int64)
-      result_64 = tf_ans_64.eval()
+      result_64 = self.evaluate(tf_ans_64)
     self.assertAllEqual(np_ans, result)
     self.assertAllEqual(np_ans, result_64)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -111,9 +112,9 @@ class ShapeOpsTest(test.TestCase):
   def _compareSizeSparse(self, x_np, use_gpu=False):
     np_ans = np.asarray(np.size(x_np))
     x_tf, unused_nnz = _sparsify(x_np)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.size(x_tf)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -162,7 +163,7 @@ class ShapeOpsTest(test.TestCase):
       inp = array_ops.zeros([2**31])
       num_elements = array_ops.size_internal(
           inp, optimize=False, out_type=dtypes.int64)
-      self.assertEqual(2**31, num_elements.eval())
+      self.assertEqual(2**31, self.evaluate(num_elements))
 
     # Too large for tf.int32 output.
     with self.assertRaises(errors_impl.InvalidArgumentError):
@@ -170,13 +171,13 @@ class ShapeOpsTest(test.TestCase):
         inp = array_ops.zeros([2**31])
         num_elements = array_ops.size_internal(
             inp, optimize=False, out_type=dtypes.int32)
-        self.assertEqual(2**31, num_elements.eval())
+        self.assertEqual(2**31, self.evaluate(num_elements))
 
   def _compareExpandDims(self, x, dim, use_gpu):
     np_ans = np.expand_dims(x, axis=dim)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tensor = array_ops.expand_dims(x, dim)
-      tf_ans = tensor.eval()
+      tf_ans = self.evaluate(tensor)
     self.assertShapeEqual(np_ans, tensor)
     self.assertAllEqual(np_ans, tf_ans)
 
@@ -227,6 +228,7 @@ class ShapeOpsTest(test.TestCase):
     self._compareExpandDimsAll(choice([2, 3, 5]), -3)
     self._compareExpandDimsAll(choice([2, 3, 5]), -4)
 
+  @test_util.run_deprecated_v1
   def testExpandDimsErrors(self):
     with self.cached_session():
       self.assertRaises(ValueError, array_ops.expand_dims,
@@ -238,6 +240,7 @@ class ShapeOpsTest(test.TestCase):
       self.assertRaises(ValueError, array_ops.expand_dims,
                         [False, True, True], 4)
 
+  @test_util.run_deprecated_v1
   def testExpandDimsGradient(self):
     with self.cached_session():
       inp = constant_op.constant(
@@ -248,6 +251,7 @@ class ShapeOpsTest(test.TestCase):
                                                     [4, 1, 2])
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testExpandDimsScalar(self):
     with self.cached_session():
       inp = constant_op.constant(7)
@@ -262,22 +266,22 @@ class ShapeOpsTest(test.TestCase):
     for dtype in [dtypes.int32, dtypes.int64]:
       x = np.zeros([2])
       np_ans = np.expand_dims(x, axis=0)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         tensor = array_ops.expand_dims(x, constant_op.constant(0, dtype))
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
       self.assertShapeEqual(np_ans, tensor)
       self.assertAllEqual(np_ans, tf_ans)
 
   def _compareSqueeze(self, x, squeeze_dims, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       if squeeze_dims:
         np_ans = np.squeeze(x, axis=tuple(squeeze_dims))
         tensor = array_ops.squeeze(x, squeeze_dims)
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
       else:
         np_ans = np.squeeze(x)
         tensor = array_ops.squeeze(x)
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
     self.assertShapeEqual(np_ans, tensor)
     self.assertAllEqual(np_ans, tf_ans)
 
@@ -337,34 +341,36 @@ class ShapeOpsTest(test.TestCase):
     # Numpy squeezes a 1 element tensor into a zero dimensional tensor.
     # Verify that we do the same.
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         tensor = array_ops.squeeze(np.zeros([1, 1, 1]), [])
         self.assertEqual(np.shape(1), tensor.get_shape())
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
         self.assertEqual(np.shape(1), tf_ans.shape)
 
   def testSqueezeAllOnesBool(self):
     # Numpy squeezes a 1 element tensor into a zero dimensional tensor.
     # Verify that we do the same.
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         tensor = array_ops.squeeze([[[False]]], [])
         self.assertEqual(np.shape(1), tensor.get_shape())
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
         self.assertEqual(np.shape(1), tf_ans.shape)
 
+  @test_util.run_deprecated_v1
   def testSqueezeOnlyOnes(self):
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         input_1x1x3 = np.zeros([1, 1, 3])
         self._compareSqueezeAll(input_1x1x3)
         self._compareSqueezeAll(input_1x1x3, [0])
         self._compareSqueezeAll(input_1x1x3, [1])
         self.assertRaises(ValueError, array_ops.squeeze, input_1x1x3, [2])
 
+  @test_util.run_deprecated_v1
   def testSqueezeErrors(self):
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         self.assertRaises(ValueError, array_ops.squeeze,
                           np.zeros([1, 2, 1]), [-4])
         self.assertRaises(ValueError, array_ops.squeeze,
@@ -374,6 +380,7 @@ class ShapeOpsTest(test.TestCase):
         self.assertRaises(ValueError, array_ops.squeeze,
                           np.zeros([1, 2, 1]), [2, 3])
 
+  @test_util.run_deprecated_v1
   def testSqueezeGradient(self):
     with self.cached_session():
       inp = np.random.rand(4, 2).astype("f")
@@ -384,6 +391,7 @@ class ShapeOpsTest(test.TestCase):
                                                     [4, 2])
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testSqueezeGradientWithSqueezeDims(self):
     with self.cached_session():
       inp = np.random.rand(4, 2).astype("f")
@@ -394,6 +402,7 @@ class ShapeOpsTest(test.TestCase):
                                                     [4, 2, 1])
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testSqueezeWithUnknownShape(self):
     with self.cached_session():
       a = array_ops.placeholder(dtypes.float32, shape=[2, None])
@@ -412,10 +421,10 @@ class TileTest(test.TestCase):
 
   def testScalar(self):
     for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         a = constant_op.constant(7, shape=[], dtype=dtypes.float32)
         tiled = array_ops.tile(a, [])
-        result = tiled.eval()
+        result = self.evaluate(tiled)
       self.assertEqual(result.shape, ())
       self.assertEqual([], tiled.get_shape())
       self.assertEqual(7, result)
@@ -423,11 +432,11 @@ class TileTest(test.TestCase):
   def testSimple(self):
     # multiples could be int32 or int64
     for dtype in [dtypes.int32, dtypes.int64]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inp = np.random.rand(4, 1).astype(np.float32)
         a = constant_op.constant(inp)
         tiled = array_ops.tile(a, constant_op.constant([1, 4], dtype=dtype))
-        result = tiled.eval()
+        result = self.evaluate(tiled)
       self.assertEqual(result.shape, (4, 4))
       self.assertEqual([4, 4], tiled.get_shape())
       self.assertTrue((result == np.tile(inp, (1, 4))).all())
@@ -437,7 +446,7 @@ class TileTest(test.TestCase):
       inp = np.random.rand(4, 1).astype(np.float32)
       a = constant_op.constant(inp)
       tiled = array_ops.tile(a, [1, 1])
-      result = tiled.eval()
+      result = self.evaluate(tiled)
     self.assertEqual(result.shape, (4, 1))
     self.assertEqual([4, 1], tiled.get_shape())
     self.assertTrue((result == np.tile(inp, (1, 1))).all())
@@ -447,10 +456,11 @@ class TileTest(test.TestCase):
       inp = np.random.rand(2, 3).astype(np.float32)
       a = constant_op.constant(inp)
       tiled = array_ops.tile(a, [5, 0])
-      result = tiled.eval()
+      result = self.evaluate(tiled)
     self.assertEqual(result.shape, (10, 0))
     self.assertEqual([10, 0], tiled.get_shape())
 
+  @test_util.run_deprecated_v1
   def testUnknownInputShape(self):
     """Importing can call _TileShape without shape of <multiples> known."""
     with self.cached_session():
@@ -490,18 +500,19 @@ class TileTest(test.TestCase):
         bytes: (dtypes.string, bytes)
     }
     for dtype_np, (dtype_tf, cast) in types_to_test.items():
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inp = np.random.rand(4, 1).astype(dtype_np)
         a = constant_op.constant(
             [cast(x) for x in inp.ravel(order="C")],
             shape=[4, 1],
             dtype=dtype_tf)
         tiled = array_ops.tile(a, [1, 4])
-        result = tiled.eval()
+        result = self.evaluate(tiled)
       self.assertEqual(result.shape, (4, 4))
       self.assertEqual([4, 4], tiled.get_shape())
       self.assertAllEqual(result, np.tile(inp, (1, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidDim(self):
     with self.cached_session():
       inp = np.random.rand(4, 1).astype("f")
@@ -517,7 +528,7 @@ class TileTest(test.TestCase):
         array_ops.tile(a, [[2, 3], [3, 4]]).eval()
 
   def _RunAndVerifyResult(self, rank, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       # Random dims of given rank
       input_shape = np.random.randint(1, 4, size=rank)
       inp = np.random.rand(*input_shape).astype("f")
@@ -527,7 +538,7 @@ class TileTest(test.TestCase):
           dtype=dtypes.float32)
       multiples = np.random.randint(1, 4, size=rank).astype(np.int32)
       tiled = array_ops.tile(a, multiples)
-      result = tiled.eval()
+      result = self.evaluate(tiled)
     self.assertTrue((np.array(multiples) * np.array(inp.shape) == np.array(
         result.shape)).all())
     self.assertAllEqual(result, np.tile(inp, tuple(multiples)))
@@ -545,6 +556,7 @@ class TileTest(test.TestCase):
     for _ in range(5):
       self._RunAndVerifyResult(10, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradientSimpleReduction(self):
     with self.cached_session():
       inp = np.random.rand(4, 1).astype("f")
@@ -557,9 +569,10 @@ class TileTest(test.TestCase):
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
       self.assertShapeEqual(inp, grad)
-      result = grad.eval()
+      result = self.evaluate(grad)
     self.assertAllClose(np.sum(grad_inp, axis=1).reshape(4, 1), result, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradientStridedReduction(self):
     with self.cached_session():
       inp = np.random.rand(4, 2).astype("f")
@@ -572,15 +585,16 @@ class TileTest(test.TestCase):
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
       self.assertShapeEqual(inp, grad)
-      result = grad.eval()
+      result = self.evaluate(grad)
     expected_shape = [4, 2]
     expected = np.zeros(expected_shape)
     expected[:, 0] = grad_inp[:, 0] + grad_inp[:, 2]
     expected[:, 1] = grad_inp[:, 1] + grad_inp[:, 3]
     self.assertTrue((np.abs(expected - result) < 1e-3).all())
 
+  @test_util.run_deprecated_v1
   def testGradientSimpleReductionOnGPU(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       inp = np.random.rand(4, 1).astype("f")
       a = constant_op.constant(
           [float(x) for x in inp.flatten()], shape=[4, 1], dtype=dtypes.float32)
@@ -590,11 +604,12 @@ class TileTest(test.TestCase):
       grad_tensor = constant_op.constant(
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
     self.assertAllClose(np.sum(grad_inp, axis=1).reshape(4, 1), result, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradientStridedReductionOnGPU(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       inp = np.random.rand(4, 2).astype("f")
       a = constant_op.constant(
           [float(x) for x in inp.flatten()], shape=[4, 2], dtype=dtypes.float32)
@@ -604,7 +619,7 @@ class TileTest(test.TestCase):
       grad_tensor = constant_op.constant(
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
     expected_shape = [4, 2]
     expected = np.zeros(expected_shape)
     expected[:, 0] = grad_inp[:, 0] + grad_inp[:, 2]
@@ -613,7 +628,7 @@ class TileTest(test.TestCase):
 
   def _RunAndVerifyGradientResult(self, input_shape, multiples):
     for use_gpu in False, True:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         # Random values
         inp = np.asarray(np.random.rand(*input_shape))
         a = constant_op.constant(inp, dtype=dtypes.float64)
@@ -624,15 +639,18 @@ class TileTest(test.TestCase):
       print("tile(float) error = ", err)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradientRandomScalar(self):
     self._RunAndVerifyGradientResult([], [])
 
+  @test_util.run_deprecated_v1
   def testGradientRandom(self):
     self._RunAndVerifyGradientResult([2, 2, 1, 1, 3], [1, 1, 1, 1, 1])
     self._RunAndVerifyGradientResult([2, 2, 1, 1, 3], [1, 2, 1, 3, 1])
     self._RunAndVerifyGradientResult([2, 3, 1, 1, 3], [3, 1, 1, 2, 2])
     self._RunAndVerifyGradientResult([2, 1, 3, 3, 2], [1, 3, 3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testGradientStridedReductionGC(self):
     with self.cached_session():
       inp = np.random.rand(4, 2).astype("f")
@@ -642,6 +660,7 @@ class TileTest(test.TestCase):
       err = gradient_checker.compute_gradient_error(a, [4, 2], tiled, [4, 4])
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradientWithSparseGradWithRank1(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0],
                                   dtype=dtypes.float32)
@@ -653,6 +672,7 @@ class TileTest(test.TestCase):
           outputs, outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithSparseGradWithRank3(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0],
                                   dtype=dtypes.float32)
@@ -665,6 +685,7 @@ class TileTest(test.TestCase):
           outputs, outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # Unknown multiples shape.
     inp = constant_op.constant(0.0, shape=[4, 4, 4, 4])
diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..8f4e31abe3c90af01029be719ee83c7c7dc42f0c
--- /dev/null
+++ b/tensorflow/python/kernel_tests/signal/BUILD
@@ -0,0 +1,143 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+load("//tensorflow:tensorflow.bzl", "py_test")  # @unused
+
+py_library(
+    name = "test_util",
+    srcs = ["test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:tf_optimizer",
+        "//tensorflow/python:training",
+    ],
+)
+
+cuda_py_tests(
+    name = "dct_ops_test",
+    srcs = ["dct_ops_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python/ops/signal",
+    ],
+)
+
+cuda_py_tests(
+    name = "fft_ops_test",
+    size = "medium",
+    srcs = ["fft_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python/ops/signal",
+    ],
+    shard_count = 4,
+    tags = ["optonly"],
+)
+
+cuda_py_tests(
+    name = "mel_ops_test",
+    srcs = ["mel_ops_test.py"],
+    additional_deps = [
+        ":test_util",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/ops/signal",
+    ],
+)
+
+cuda_py_tests(
+    name = "mfcc_ops_test",
+    srcs = ["mfcc_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/ops/signal",
+        "//tensorflow/python:spectral_ops_test_util",
+    ],
+)
+
+cuda_py_tests(
+    name = "reconstruction_ops_test",
+    srcs = ["reconstruction_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/ops/signal",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_tests(
+    name = "shape_ops_test",
+    srcs = ["shape_ops_test.py"],
+    additional_deps = [
+        ":test_util",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/ops/signal",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+cuda_py_tests(
+    name = "spectral_ops_test",
+    size = "large",
+    srcs = ["spectral_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python/ops/signal",
+    ],
+    tags = ["nomac"],
+)
+
+cuda_py_tests(
+    name = "window_ops_test",
+    srcs = ["window_ops_test.py"],
+    additional_deps = [
+        ":test_util",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/ops/signal",
+        "//tensorflow/python:platform_test",
+    ],
+)
diff --git a/tensorflow/python/kernel_tests/signal/dct_ops_test.py b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3ac15bab8a7b8223bd1ea085386b965b7fdd62e
--- /dev/null
+++ b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
@@ -0,0 +1,169 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for DCT operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import dct_ops
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+fftpack = try_import("scipy.fftpack")
+
+
+def _np_dct1(signals, norm=None):
+  """Computes the DCT-I manually with NumPy."""
+  # X_k = (x_0 + (-1)**k * x_{N-1} +
+  #       2 * sum_{n=0}^{N-2} x_n * cos(\frac{pi}{N-1} * n * k)  k=0,...,N-1
+  del norm
+  dct_size = signals.shape[-1]
+  dct = np.zeros_like(signals)
+  for k in range(dct_size):
+    phi = np.cos(np.pi * np.arange(1, dct_size - 1) * k / (dct_size - 1))
+    dct[..., k] = 2 * np.sum(signals[..., 1:-1] * phi, axis=-1) + (
+        signals[..., 0] + (-1) ** k * signals[..., -1])
+  return dct
+
+
+def _np_dct2(signals, norm=None):
+  """Computes the DCT-II manually with NumPy."""
+  # X_k = sum_{n=0}^{N-1} x_n * cos(\frac{pi}{N} * (n + 0.5) * k)  k=0,...,N-1
+  dct_size = signals.shape[-1]
+  dct = np.zeros_like(signals)
+  for k in range(dct_size):
+    phi = np.cos(np.pi * (np.arange(dct_size) + 0.5) * k / dct_size)
+    dct[..., k] = np.sum(signals * phi, axis=-1)
+  # SciPy's `dct` has a scaling factor of 2.0 which we follow.
+  # https://github.com/scipy/scipy/blob/v0.15.1/scipy/fftpack/src/dct.c.src
+  if norm == "ortho":
+    # The orthonormal scaling includes a factor of 0.5 which we combine with
+    # the overall scaling of 2.0 to cancel.
+    dct[..., 0] *= np.sqrt(1.0 / dct_size)
+    dct[..., 1:] *= np.sqrt(2.0 / dct_size)
+  else:
+    dct *= 2.0
+  return dct
+
+
+def _np_dct3(signals, norm=None):
+  """Computes the DCT-III manually with NumPy."""
+  # SciPy's `dct` has a scaling factor of 2.0 which we follow.
+  # https://github.com/scipy/scipy/blob/v0.15.1/scipy/fftpack/src/dct.c.src
+  dct_size = signals.shape[-1]
+  signals = np.array(signals)  # make a copy so we can modify
+  if norm == "ortho":
+    signals[..., 0] *= np.sqrt(4.0 / dct_size)
+    signals[..., 1:] *= np.sqrt(2.0 / dct_size)
+  else:
+    signals *= 2.0
+  dct = np.zeros_like(signals)
+  # X_k = 0.5 * x_0 +
+  #       sum_{n=1}^{N-1} x_n * cos(\frac{pi}{N} * n * (k + 0.5))  k=0,...,N-1
+  half_x0 = 0.5 * signals[..., 0]
+  for k in range(dct_size):
+    phi = np.cos(np.pi * np.arange(1, dct_size) * (k + 0.5) / dct_size)
+    dct[..., k] = half_x0 + np.sum(signals[..., 1:] * phi, axis=-1)
+  return dct
+
+
+NP_DCT = {1: _np_dct1, 2: _np_dct2, 3: _np_dct3}
+NP_IDCT = {1: _np_dct1, 2: _np_dct3, 3: _np_dct2}
+
+
+class DCTOpsTest(parameterized.TestCase, test.TestCase):
+
+  def _compare(self, signals, norm, dct_type, atol=5e-4, rtol=5e-4):
+    """Compares (I)DCT to SciPy (if available) and a NumPy implementation."""
+    np_dct = NP_DCT[dct_type](signals, norm)
+    tf_dct = dct_ops.dct(signals, type=dct_type, norm=norm).eval()
+    self.assertAllClose(np_dct, tf_dct, atol=atol, rtol=rtol)
+    np_idct = NP_IDCT[dct_type](signals, norm)
+    tf_idct = dct_ops.idct(signals, type=dct_type, norm=norm).eval()
+    self.assertAllClose(np_idct, tf_idct, atol=atol, rtol=rtol)
+    if fftpack:
+      scipy_dct = fftpack.dct(signals, type=dct_type, norm=norm)
+      self.assertAllClose(scipy_dct, tf_dct, atol=atol, rtol=rtol)
+      scipy_idct = fftpack.idct(signals, type=dct_type, norm=norm)
+      self.assertAllClose(scipy_idct, tf_idct, atol=atol, rtol=rtol)
+    # Verify inverse(forward(s)) == s, up to a normalization factor.
+    tf_idct_dct = dct_ops.idct(
+        tf_dct, type=dct_type, norm=norm).eval()
+    tf_dct_idct = dct_ops.dct(
+        tf_idct, type=dct_type, norm=norm).eval()
+    if norm is None:
+      if dct_type == 1:
+        tf_idct_dct *= 0.5 / (signals.shape[-1] - 1)
+        tf_dct_idct *= 0.5 / (signals.shape[-1] - 1)
+      else:
+        tf_idct_dct *= 0.5 / signals.shape[-1]
+        tf_dct_idct *= 0.5 / signals.shape[-1]
+    self.assertAllClose(signals, tf_idct_dct, atol=atol, rtol=rtol)
+    self.assertAllClose(signals, tf_dct_idct, atol=atol, rtol=rtol)
+
+  @parameterized.parameters([
+      [[2]], [[3]], [[10]], [[2, 20]], [[2, 3, 25]]])
+  @test_util.run_deprecated_v1
+  def test_random(self, shape):
+    """Test randomly generated batches of data."""
+    with spectral_ops_test_util.fft_kernel_label_map():
+      with self.session(use_gpu=True):
+        signals = np.random.rand(*shape).astype(np.float32)
+        # Normalization not implemented for orthonormal.
+        self._compare(signals, norm=None, dct_type=1)
+        for norm in (None, "ortho"):
+          self._compare(signals, norm, 2)
+          self._compare(signals, norm, 3)
+
+  def test_error(self):
+    signals = np.random.rand(10)
+    # Unsupported type.
+    with self.assertRaises(ValueError):
+      dct_ops.dct(signals, type=5)
+    # DCT-I normalization not implemented.
+    with self.assertRaises(ValueError):
+      dct_ops.dct(signals, type=1, norm="ortho")
+    # DCT-I requires at least two inputs.
+    with self.assertRaises(ValueError):
+      dct_ops.dct(np.random.rand(1), type=1)
+    # Unknown normalization.
+    with self.assertRaises(ValueError):
+      dct_ops.dct(signals, norm="bad")
+    with self.assertRaises(NotImplementedError):
+      dct_ops.dct(signals, n=10)
+    with self.assertRaises(NotImplementedError):
+      dct_ops.dct(signals, axis=0)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/fft_ops_test.py b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
similarity index 94%
rename from tensorflow/python/kernel_tests/fft_ops_test.py
rename to tensorflow/python/kernel_tests/signal/fft_ops_test.py
index f117934e4b598ab9b8fa925df5e1359f88aee6e9..5b1053428c0096c15fce7c4fa7b46d5999602057 100644
--- a/tensorflow/python/kernel_tests/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
@@ -25,12 +25,13 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_spectral_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
 from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.platform import test
 
 VALID_FFT_RANKS = (1, 2, 3)
@@ -68,12 +69,12 @@ class BaseFFTOpsTest(test.TestCase):
   def _checkMemoryFail(self, x, rank):
     config = config_pb2.ConfigProto()
     config.gpu_options.per_process_gpu_memory_fraction = 1e-2
-    with self.test_session(config=config, force_gpu=True):
+    with self.cached_session(config=config, force_gpu=True):
       self._tfFFT(x, rank, fft_length=None)
 
   def _checkGradComplex(self, func, x, y, result_is_complex=True,
                         rtol=1e-2, atol=1e-2):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       # func is a forward or inverse, real or complex, batched or unbatched FFT
@@ -93,7 +94,7 @@ class BaseFFTOpsTest(test.TestCase):
     self.assertAllClose(y_jacob_t, y_jacob_n, rtol=rtol, atol=atol)
 
   def _checkGradReal(self, func, x, rtol=1e-2, atol=1e-2):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       inx = ops.convert_to_tensor(x)
       # func is a forward RFFT function (batched or unbatched).
       z = func(inx)
@@ -109,12 +110,12 @@ class FFTOpsTest(BaseFFTOpsTest):
 
   def _tfFFT(self, x, rank, fft_length=None, feed_dict=None):
     # fft_length unused for complex FFTs.
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return self._tfFFTForRank(rank)(x).eval(feed_dict=feed_dict)
 
   def _tfIFFT(self, x, rank, fft_length=None, feed_dict=None):
     # fft_length unused for complex FFTs.
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return self._tfIFFTForRank(rank)(x).eval(feed_dict=feed_dict)
 
   def _npFFT(self, x, rank, fft_length=None):
@@ -139,24 +140,25 @@ class FFTOpsTest(BaseFFTOpsTest):
 
   def _tfFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.fft
+      return fft_ops.fft
     elif rank == 2:
-      return spectral_ops.fft2d
+      return fft_ops.fft2d
     elif rank == 3:
-      return spectral_ops.fft3d
+      return fft_ops.fft3d
     else:
       raise ValueError("invalid rank")
 
   def _tfIFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.ifft
+      return fft_ops.ifft
     elif rank == 2:
-      return spectral_ops.ifft2d
+      return fft_ops.ifft2d
     elif rank == 3:
-      return spectral_ops.ifft3d
+      return fft_ops.ifft3d
     else:
       raise ValueError("invalid rank")
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type in (np.complex64, np.complex128):
@@ -166,6 +168,7 @@ class FFTOpsTest(BaseFFTOpsTest):
             self.assertEqual(x.shape, self._tfFFT(x, rank).shape)
             self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 1e-8)):
@@ -194,6 +197,7 @@ class FFTOpsTest(BaseFFTOpsTest):
   #           np.mod(np.arange(np.power(128, dims)), 64).reshape(
   #               (128,) * dims).astype(np.complex64), rank)
 
+  @test_util.run_deprecated_v1
   def testBasicPlaceholder(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 1e-8)):
@@ -204,6 +208,7 @@ class FFTOpsTest(BaseFFTOpsTest):
                     (4,) * dims).astype(np_type),
                 rank, use_placeholder=True, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 5e-6)):
@@ -218,6 +223,7 @@ class FFTOpsTest(BaseFFTOpsTest):
             self._compare(gen((4,) * dims).astype(np_type), rank,
                           rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testRandom1D(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type in (np.complex64, np.complex128):
@@ -240,6 +246,7 @@ class FFTOpsTest(BaseFFTOpsTest):
         for dim in (127, 255, 511, 1023):
           self._compare(gen((dim,)).astype(np_type), 1, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testError(self):
     for rank in VALID_FFT_RANKS:
       for dims in xrange(0, rank):
@@ -251,6 +258,7 @@ class FFTOpsTest(BaseFFTOpsTest):
             ValueError, "Shape must be .*rank {}.*".format(rank)):
           self._tfIFFT(x, rank)
 
+  @test_util.run_deprecated_v1
   def testGrad_Simple(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.float32, 1e-4), (np.float64, 1e-10)):
@@ -263,6 +271,7 @@ class FFTOpsTest(BaseFFTOpsTest):
             self._checkGradComplex(self._tfIFFTForRank(rank), re, im,
                                    rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testGrad_Random(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.float32, 1e-2), (np.float64, 1e-10)):
@@ -283,11 +292,11 @@ class RFFTOpsTest(BaseFFTOpsTest):
                                               use_placeholder)
 
   def _tfFFT(self, x, rank, fft_length=None, feed_dict=None):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return self._tfFFTForRank(rank)(x, fft_length).eval(feed_dict=feed_dict)
 
   def _tfIFFT(self, x, rank, fft_length=None, feed_dict=None):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return self._tfIFFTForRank(rank)(x, fft_length).eval(feed_dict=feed_dict)
 
   def _npFFT(self, x, rank, fft_length=None):
@@ -312,24 +321,25 @@ class RFFTOpsTest(BaseFFTOpsTest):
 
   def _tfFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.rfft
+      return fft_ops.rfft
     elif rank == 2:
-      return spectral_ops.rfft2d
+      return fft_ops.rfft2d
     elif rank == 3:
-      return spectral_ops.rfft3d
+      return fft_ops.rfft3d
     else:
       raise ValueError("invalid rank")
 
   def _tfIFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.irfft
+      return fft_ops.irfft
     elif rank == 2:
-      return spectral_ops.irfft2d
+      return fft_ops.irfft2d
     elif rank == 3:
-      return spectral_ops.irfft3d
+      return fft_ops.irfft3d
     else:
       raise ValueError("invalid rank")
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -339,6 +349,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
           x = np.zeros((0,) * dims).astype(np.complex64)
           self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -366,6 +377,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                        10).reshape((size,) * (dims - 1) + (inner_dim,))
           self._compareBackward(c2r.astype(np.complex64), rank, (size,) * rank)
 
+  @test_util.run_deprecated_v1
   def testBasicPlaceholder(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -427,6 +439,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                   fft_length,
                   use_placeholder=True)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       def gen_real(shape):
@@ -451,6 +464,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
             self._compareBackward(
                 gen_complex(complex_dims), rank, (size,) * rank)
 
+  @test_util.run_deprecated_v1
   def testError(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -507,6 +521,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
           with self.cached_session():
             irfft_fn(x, fft_length).eval()
 
+  @test_util.run_deprecated_v1
   def testGrad_Simple(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -521,6 +536,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
             self._checkGradComplex(
                 self._tfIFFTForRank(rank), re, im, result_is_complex=False)
 
+  @test_util.run_deprecated_v1
   def testGrad_Random(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/python/kernel_tests/signal/mel_ops_test.py
similarity index 95%
rename from tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
rename to tensorflow/python/kernel_tests/signal/mel_ops_test.py
index f4348e80eac54933d67cdf7bd281d6a9c6c10381..3134503daec4d3ebeeb014f7ea99123cb4a0f694 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/mel_ops_test.py
@@ -20,11 +20,12 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.signal.python.kernel_tests import test_util
-from tensorflow.contrib.signal.python.ops import mel_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.kernel_tests.signal import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.signal import mel_ops
 from tensorflow.python.platform import test
 
 # mel spectrum constants and functions.
@@ -137,18 +138,20 @@ class LinearToMelTest(test.TestCase):
         # Settings used by Tacotron (https://arxiv.org/abs/1703.10135).
         (80, 1025, 24000.0, 80.0, 12000.0, dtypes.float64)
     ]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for config in configs:
         mel_matrix_np = spectrogram_to_mel_matrix(*config)
         mel_matrix = mel_ops.linear_to_mel_weight_matrix(*config)
-        self.assertAllClose(mel_matrix_np, mel_matrix.eval(), atol=3e-6)
+        self.assertAllClose(mel_matrix_np, self.evaluate(mel_matrix), atol=3e-6)
 
+  @tf_test_util.run_deprecated_v1
   def test_dtypes(self):
     # LinSpace is not supported for tf.float16.
     for dtype in (dtypes.bfloat16, dtypes.float32, dtypes.float64):
       self.assertEqual(dtype,
                        mel_ops.linear_to_mel_weight_matrix(dtype=dtype).dtype)
 
+  @tf_test_util.run_deprecated_v1
   def test_error(self):
     with self.assertRaises(ValueError):
       mel_ops.linear_to_mel_weight_matrix(num_mel_bins=0)
@@ -177,8 +180,9 @@ class LinearToMelTest(test.TestCase):
         rewritten_graph = test_util.grappler_optimize(g, [mel_matrix])
         self.assertEqual(1, len(rewritten_graph.node))
 
+  @tf_test_util.run_deprecated_v1
   def test_num_spectrogram_bins_dynamic(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       num_spectrogram_bins = array_ops.placeholder(shape=(),
                                                    dtype=dtypes.int32)
       mel_matrix_np = spectrogram_to_mel_matrix(
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py b/tensorflow/python/kernel_tests/signal/mfcc_ops_test.py
similarity index 89%
rename from tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
rename to tensorflow/python/kernel_tests/signal/mfcc_ops_test.py
index e7743bdcba180929007d17bdf3b143c64643aacc..935922657cd4dd088c30dae7c74997339b3cb7f1 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mfcc_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/mfcc_ops_test.py
@@ -18,12 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.signal.python.ops import mfcc_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import mfcc_ops
 from tensorflow.python.platform import test
 
 
@@ -32,6 +33,7 @@ from tensorflow.python.platform import test
 # HTK conventions.
 class MFCCTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_error(self):
     # num_mel_bins must be positive.
     with self.assertRaises(ValueError):
@@ -43,17 +45,19 @@ class MFCCTest(test.TestCase):
       signal = array_ops.zeros((2, 3, 5), dtype=dtypes.float64)
       mfcc_ops.mfccs_from_log_mel_spectrograms(signal)
 
+  @test_util.run_deprecated_v1
   def test_basic(self):
     """A basic test that the op runs on random input."""
     with spectral_ops_test_util.fft_kernel_label_map():
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         signal = random_ops.random_normal((2, 3, 5))
         mfcc_ops.mfccs_from_log_mel_spectrograms(signal).eval()
 
+  @test_util.run_deprecated_v1
   def test_unknown_shape(self):
     """A test that the op runs when shape and rank are unknown."""
     with spectral_ops_test_util.fft_kernel_label_map():
-      with self.test_session(use_gpu=True):
+      with self.session(use_gpu=True):
         signal = array_ops.placeholder_with_default(
             random_ops.random_normal((2, 3, 5)), tensor_shape.TensorShape(None))
         self.assertIsNone(signal.shape.ndims)
diff --git a/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0ce06418a457eee9a45b172f9cc5887d1167153
--- /dev/null
+++ b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
@@ -0,0 +1,259 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for reconstruction_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import reconstruction_ops
+from tensorflow.python.platform import test
+
+
+class ReconstructionOpsTest(test.TestCase):
+
+  def __init__(self, *args, **kwargs):
+    super(ReconstructionOpsTest, self).__init__(*args, **kwargs)
+    self.batch_size = 3
+    self.frames = 3
+    self.samples = 5
+
+    self.bases = np.array(range(2, 5))
+    exponents = np.array(range(self.frames * self.samples))
+    powers = np.power(self.bases[:, np.newaxis], exponents[np.newaxis, :])
+
+    self.powers = np.reshape(powers, [self.batch_size, self.frames,
+                                      self.samples])
+    self.frame_hop = 2
+
+    # Hand computed example using powers of unique numbers: this is easily
+    # verified.
+    self.expected_string = ["1", "10", "100100", "1001000", "10010010000",
+                            "100100000000", "1001000000000", "10000000000000",
+                            "100000000000000"]
+
+  def test_all_ones(self):
+    signal = array_ops.ones([3, 5])
+    reconstruction = reconstruction_ops.overlap_and_add(signal, 2)
+
+    self.assertEqual(reconstruction.shape.as_list(), [9])
+
+    with self.session(use_gpu=True):
+      output = self.evaluate(reconstruction)
+
+      expected_output = np.array([1, 1, 2, 2, 3, 2, 2, 1, 1])
+
+      self.assertAllClose(output, expected_output)
+
+  @test_util.run_deprecated_v1
+  def test_unknown_shapes(self):
+    # This test uses placeholders and does not work in eager mode.
+    if context.executing_eagerly():
+      return
+
+    signal = array_ops.placeholder(dtype=dtypes.int32, shape=[None, None, None])
+    frame_step = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    reconstruction = reconstruction_ops.overlap_and_add(signal, frame_step)
+
+    self.assertEqual(reconstruction.shape.as_list(), [None, None])
+
+    with self.session(use_gpu=True) as sess:
+      output = sess.run(reconstruction,
+                        feed_dict={signal: np.ones([4, 3, 5]), frame_step: 2})
+
+      expected_output = np.array([[1, 1, 2, 2, 3, 2, 2, 1, 1]] * 4)
+
+      self.assertAllClose(output, expected_output)
+
+  @test_util.run_deprecated_v1
+  def test_unknown_rank(self):
+    # This test uses placeholders and does not work in eager mode.
+    if context.executing_eagerly():
+      return
+
+    signal = array_ops.placeholder(dtype=dtypes.int32, shape=None)
+    frame_step = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    reconstruction = reconstruction_ops.overlap_and_add(signal, frame_step)
+
+    self.assertEqual(reconstruction.shape, None)
+
+    with self.session(use_gpu=True) as sess:
+      output = sess.run(reconstruction,
+                        feed_dict={signal: np.ones([4, 3, 5]), frame_step: 2})
+
+      expected_output = np.array([[1, 1, 2, 2, 3, 2, 2, 1, 1]] * 4)
+
+      self.assertAllClose(output, expected_output)
+
+  @test_util.run_deprecated_v1
+  def test_fast_path(self):
+    # This test uses tensor names and does not work in eager mode.
+    if context.executing_eagerly():
+      return
+
+    signal = array_ops.ones([3, 5])
+    frame_step = 5
+    reconstruction = reconstruction_ops.overlap_and_add(signal, frame_step)
+
+    self.assertEqual(reconstruction.name, "overlap_and_add/fast_path:0")
+
+    with self.session(use_gpu=True) as sess:
+      output = self.evaluate(reconstruction)
+
+      expected_output = np.ones([15])
+
+      self.assertAllClose(output, expected_output)
+
+  @test_util.run_deprecated_v1
+  def test_simple(self):
+    def make_input(frame_length, num_frames=3):
+      """Generate a tensor of num_frames frames of frame_length."""
+      return np.reshape(np.arange(1, num_frames * frame_length + 1),
+                        (-1, frame_length))
+
+    # List of (signal, expected_result, frame_hop).
+    configurations = [
+        # All hop lengths on a frame length of 2.
+        (make_input(2), [1, 5, 9, 6], 1),
+        (make_input(2), [1, 2, 3, 4, 5, 6], 2),
+
+        # All hop lengths on a frame length of 3.
+        (make_input(3), [1, 6, 15, 14, 9], 1),
+        (make_input(3), [1, 2, 7, 5, 13, 8, 9], 2),
+        (make_input(3), [1, 2, 3, 4, 5, 6, 7, 8, 9], 3),
+
+        # All hop lengths on a frame length of 4.
+        (make_input(4), [1, 7, 18, 21, 19, 12], 1),
+        (make_input(4), [1, 2, 8, 10, 16, 18, 11, 12], 2),
+        (make_input(4), [1, 2, 3, 9, 6, 7, 17, 10, 11, 12], 3),
+        (make_input(4), [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 4),
+    ]
+
+    with self.session(use_gpu=True):
+      for signal, expected, frame_hop in configurations:
+        reconstruction = reconstruction_ops.overlap_and_add(
+            np.array(signal), frame_hop).eval()
+        expected_output = np.array(expected)
+        self.assertAllClose(reconstruction, expected_output)
+
+  def test_powers(self):
+    signal = constant_op.constant(np.squeeze(self.powers[0, :, :]),
+                                  dtype=dtypes.int64)
+    reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
+
+    with self.session(use_gpu=True):
+      output = self.evaluate(reconstruction)
+      string_output = [np.base_repr(x, self.bases[0]) for x in output]
+
+      self.assertEqual(string_output, self.expected_string)
+
+  def test_batch(self):
+    signal = constant_op.constant(self.powers, dtype=dtypes.int64)
+    reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
+
+    with self.session(use_gpu=True):
+      output = self.evaluate(reconstruction)
+
+      accumulator = True
+      for i in range(self.batch_size):
+        string_output = [np.base_repr(x, self.bases[i]) for x in output[i, :]]
+        accumulator = accumulator and (string_output == self.expected_string)
+
+      self.assertTrue(accumulator)
+
+  def test_one_element_batch(self):
+    input_matrix = np.squeeze(self.powers[0, :, :])
+    input_matrix = input_matrix[np.newaxis, :, :].astype(float)
+    signal = constant_op.constant(input_matrix, dtype=dtypes.float32)
+    reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
+
+    with self.session(use_gpu=True):
+      output = self.evaluate(reconstruction)
+
+      string_output = [np.base_repr(int(x), self.bases[0]) for x in
+                       np.squeeze(output)]
+
+      self.assertEqual(output.shape, (1, 9))
+      self.assertEqual(string_output, self.expected_string)
+
+  @test_util.run_deprecated_v1
+  def test_gradient(self):
+    configurations = [
+        ((1, 128), 1),
+        ((5, 35), 17),
+        ((10, 128), 128),
+        ((2, 10, 128), 127),
+        ((2, 2, 10, 128), 126),
+        ((2, 2, 2, 10, 128), 125),
+    ]
+
+    with self.session(use_gpu=True) as sess:
+      for shape, frame_hop in configurations:
+        signal = array_ops.zeros(shape)
+        reconstruction = reconstruction_ops.overlap_and_add(signal, frame_hop)
+        loss = math_ops.reduce_sum(reconstruction)
+        # Increasing any sample in the input frames by one will increase the sum
+        # of all the samples in the reconstruction by 1, so the gradient should
+        # be all ones, no matter the shape or hop.
+        gradient = sess.run(gradients_impl.gradients([loss], [signal])[0])
+        self.assertTrue((gradient == 1.0).all())
+
+  @test_util.run_deprecated_v1
+  def test_gradient_batch(self):
+    with self.session(use_gpu=True) as sess:
+      signal = array_ops.zeros((2, 10, 10))
+      frame_hop = 10
+      reconstruction = reconstruction_ops.overlap_and_add(signal, frame_hop)
+
+      # Multiply the first batch-item's reconstruction by zeros. This will block
+      # gradient from flowing into the first batch item from the loss. Multiply
+      # the second batch item by the integers from 0 to 99. Since there is zero
+      # overlap, the gradient for this batch item will be 0-99 shaped as (10,
+      # 10).
+      reconstruction *= array_ops.stack(
+          [array_ops.zeros((100,)), math_ops.to_float(math_ops.range(100))])
+      loss = math_ops.reduce_sum(reconstruction)
+
+      # Verify that only the second batch item receives gradient.
+      gradient = sess.run(gradients_impl.gradients([loss], [signal])[0])
+      expected_gradient = np.stack([
+          np.zeros((10, 10)),
+          np.reshape(np.arange(100).astype(np.float32), (10, 10))])
+      self.assertAllEqual(expected_gradient, gradient)
+
+  @test_util.run_deprecated_v1
+  def test_gradient_numerical(self):
+    with self.session(use_gpu=True):
+      shape = (2, 10, 10)
+      framed_signal = array_ops.zeros(shape)
+      frame_hop = 10
+      reconstruction = reconstruction_ops.overlap_and_add(
+          framed_signal, frame_hop)
+      error = test.compute_gradient_error(
+          framed_signal, shape, reconstruction, [2, 100])
+      self.assertLess(error, 2e-5)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/signal/shape_ops_test.py
similarity index 91%
rename from tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
rename to tensorflow/python/kernel_tests/signal/shape_ops_test.py
index f1320501535f87fd73121e42a3d8e291e320ed3b..32ac76e80d00660e0784ee44cda7e325862c7816 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/shape_ops_test.py
@@ -20,20 +20,22 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.signal.python.kernel_tests import test_util
-from tensorflow.contrib.signal.python.ops import shape_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.kernel_tests.signal import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import shape_ops
 from tensorflow.python.platform import test
 
 
 class FrameTest(test.TestCase):
 
+  @tf_test_util.run_deprecated_v1
   def test_mapping_of_indices_without_padding(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       tensor = constant_op.constant(np.arange(9152), dtypes.int32)
       tensor = array_ops.expand_dims(tensor, 0)
 
@@ -47,8 +49,9 @@ class FrameTest(test.TestCase):
 
       self.assertAllEqual(expected, result)
 
+  @tf_test_util.run_deprecated_v1
   def test_mapping_of_indices_with_padding(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       tensor = constant_op.constant(np.arange(10000), dtypes.int32)
       tensor = array_ops.expand_dims(tensor, 0)
 
@@ -64,6 +67,7 @@ class FrameTest(test.TestCase):
 
       self.assertAllEqual(expected, result)
 
+  @tf_test_util.run_deprecated_v1
   def test_invalid_inputs(self):
     # Rank 0 input signal.
     with self.assertRaises(ValueError):
@@ -84,12 +88,13 @@ class FrameTest(test.TestCase):
     with self.assertRaises(ValueError):
       shape_ops.frame([1], 1, 1, pad_end=True, pad_value=[1])
 
+  @tf_test_util.run_deprecated_v1
   def test_length_zero(self):
     signal = constant_op.constant([], dtype=dtypes.float32)
     frame_length = 2
     frame_step = 1
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       result = shape_ops.frame(signal, frame_length, frame_step,
                                pad_end=True, pad_value=99).eval()
       self.assertEqual((0, 2), result.shape)
@@ -98,6 +103,7 @@ class FrameTest(test.TestCase):
                                pad_end=False).eval()
       self.assertEqual((0, 2), result.shape)
 
+  @tf_test_util.run_deprecated_v1
   def test_shape_inference(self):
     signal = array_ops.placeholder(dtypes.int32, shape=[1, 1])
     frame_length = 2
@@ -149,16 +155,17 @@ class FrameTest(test.TestCase):
         for pad_end in [False, True]:
           op = shape_ops.frame(signal, frame_length, frame_step,
                                pad_end=pad_end, pad_value=99)
-          with self.test_session(use_gpu=True):
-            result = op.eval()
+          with self.cached_session(use_gpu=True):
+            result = self.evaluate(op)
           self.assertEqual(op.shape.as_list(), list(result.shape))
 
+  @tf_test_util.run_deprecated_v1
   def test_basic_mono(self):
     signal = np.arange(6)
     frame_length = 3
     frame_step = 2
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for rank in range(5):
         nd_signal = np.reshape(signal, (1,) * rank + signal.shape)
 
@@ -178,13 +185,14 @@ class FrameTest(test.TestCase):
                                  pad_end=False).eval()
         self.assertAllEqual(expected, result)
 
+  @tf_test_util.run_deprecated_v1
   def test_basic_stereo(self):
     signal = np.vstack([np.arange(6),
                         np.arange(6) + 10])
     frame_length = 3
     frame_step = 2
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for rank in range(5):
         nd_signal = np.reshape(signal, (1,) * rank + signal.shape)
 
@@ -207,6 +215,7 @@ class FrameTest(test.TestCase):
                                  pad_end=False).eval()
         self.assertAllEqual(expected, result)
 
+  @tf_test_util.run_deprecated_v1
   def test_complex_shape(self):
     signal = np.vstack([np.arange(6),
                         np.arange(6) + 10,
@@ -218,7 +227,7 @@ class FrameTest(test.TestCase):
     frame_length = 3
     frame_step = 2
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # With padding, we pad the last frame with pad_value.
       result = shape_ops.frame(signal, frame_length, frame_step,
                                pad_end=True, pad_value=99).eval()
@@ -244,11 +253,11 @@ class FrameTest(test.TestCase):
 
   def test_axis(self):
     signal = np.reshape(np.arange(16), (2, 4, 2))
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       result = shape_ops.frame(signal, frame_length=2, frame_step=2,
                                pad_end=True, axis=1)
       expected = np.reshape(np.arange(16), (2, 2, 2, 2))
-      self.assertAllEqual(expected, result.eval())
+      self.assertAllEqual(expected, self.evaluate(result))
 
       result = shape_ops.frame(signal, frame_length=2, frame_step=1,
                                pad_end=True, axis=1)
@@ -260,7 +269,7 @@ class FrameTest(test.TestCase):
                    [[10, 11], [12, 13]],
                    [[12, 13], [14, 15]],
                    [[14, 15], [0, 0]]]]
-      self.assertAllEqual(expected, result.eval())
+      self.assertAllEqual(expected, self.evaluate(result))
 
       result = shape_ops.frame(signal, frame_length=3, frame_step=1,
                                pad_end=True, axis=1)
@@ -272,14 +281,15 @@ class FrameTest(test.TestCase):
                    [[10, 11], [12, 13], [14, 15]],
                    [[12, 13], [14, 15], [0, 0]],
                    [[14, 15], [0, 0], [0, 0]]]]
-      self.assertAllEqual(expected, result.eval())
+      self.assertAllEqual(expected, self.evaluate(result))
 
+  @tf_test_util.run_deprecated_v1
   def test_window_larger_than_signal(self):
     signal = constant_op.constant([[1, 2], [11, 12]], dtype=dtypes.float32)
     frame_length = 4
     frame_step = 1
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       result = shape_ops.frame(signal, frame_length, frame_step,
                                pad_end=True, pad_value=99).eval()
       self.assertAllClose([[[1, 2, 99, 99], [2, 99, 99, 99]],
@@ -303,10 +313,11 @@ class FrameTest(test.TestCase):
     frame_length = 2
     frame_step = 3
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       result = shape_ops.frame(signal, frame_length, frame_step)
       self.assertEqual(result.dtype, signal.dtype)
 
+  @tf_test_util.run_deprecated_v1
   def test_dynamic_tensor(self):
     # Show that frame works even when the dimensions of its input are
     # not known at graph creation time.
@@ -315,7 +326,7 @@ class FrameTest(test.TestCase):
     frame_length = 2
     frame_step = 2
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       signal_placeholder = array_ops.placeholder(shape=(None, None),
                                                  dtype=dtypes.float32)
       result = sess.run(shape_ops.frame(
@@ -325,8 +336,9 @@ class FrameTest(test.TestCase):
                            [[10, 11], [12, 13]],
                            [[20, 21], [22, 23]]], result)
 
+  @tf_test_util.run_deprecated_v1
   def test_gradient_numerical(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       signal_shape = (2, 128)
       signal = array_ops.ones(signal_shape)
       frame_length = 33
diff --git a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
similarity index 92%
rename from tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py
rename to tensorflow/python/kernel_tests/signal/spectral_ops_test.py
index f10d78259a3be3a3a6f7f78c196ab107f18a53aa..7b9748c7f260b60d7322a6de68e35970513ac969 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
@@ -20,14 +20,14 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.signal.python.ops import spectral_ops
-from tensorflow.contrib.signal.python.ops import window_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import spectral_ops
+from tensorflow.python.ops.signal import window_ops
 from tensorflow.python.platform import test
 
 
@@ -81,7 +81,7 @@ class SpectralOpsTest(test.TestCase):
 
   def _compare(self, signal, frame_length, frame_step, fft_length):
     with spectral_ops_test_util.fft_kernel_label_map(), (
-        self.test_session(use_gpu=True)) as sess:
+        self.cached_session(use_gpu=True)) as sess:
       actual_stft = spectral_ops.stft(
           signal, frame_length, frame_step, fft_length, pad_end=False)
       signal_ph = array_ops.placeholder(dtype=dtypes.as_dtype(signal.dtype))
@@ -117,7 +117,7 @@ class SpectralOpsTest(test.TestCase):
 
   def test_shapes(self):
     with spectral_ops_test_util.fft_kernel_label_map(), (
-        self.test_session(use_gpu=True)):
+        self.session(use_gpu=True)):
       signal = np.zeros((512,)).astype(np.float32)
 
       # If fft_length is not provided, the smallest enclosing power of 2 of
@@ -125,22 +125,22 @@ class SpectralOpsTest(test.TestCase):
       stft = spectral_ops.stft(signal, frame_length=7, frame_step=8,
                                pad_end=True)
       self.assertAllEqual([64, 5], stft.shape.as_list())
-      self.assertAllEqual([64, 5], stft.eval().shape)
+      self.assertAllEqual([64, 5], self.evaluate(stft).shape)
 
       stft = spectral_ops.stft(signal, frame_length=8, frame_step=8,
                                pad_end=True)
       self.assertAllEqual([64, 5], stft.shape.as_list())
-      self.assertAllEqual([64, 5], stft.eval().shape)
+      self.assertAllEqual([64, 5], self.evaluate(stft).shape)
 
       stft = spectral_ops.stft(signal, frame_length=8, frame_step=8,
                                fft_length=16, pad_end=True)
       self.assertAllEqual([64, 9], stft.shape.as_list())
-      self.assertAllEqual([64, 9], stft.eval().shape)
+      self.assertAllEqual([64, 9], self.evaluate(stft).shape)
 
       stft = spectral_ops.stft(signal, frame_length=16, frame_step=8,
                                fft_length=8, pad_end=True)
       self.assertAllEqual([64, 5], stft.shape.as_list())
-      self.assertAllEqual([64, 5], stft.eval().shape)
+      self.assertAllEqual([64, 5], self.evaluate(stft).shape)
 
       stft = np.zeros((32, 9)).astype(np.complex64)
 
@@ -148,7 +148,7 @@ class SpectralOpsTest(test.TestCase):
                                                fft_length=16, frame_step=8)
       expected_length = (stft.shape[0] - 1) * 8 + 8
       self.assertAllEqual([256], inverse_stft.shape.as_list())
-      self.assertAllEqual([expected_length], inverse_stft.eval().shape)
+      self.assertAllEqual([expected_length], self.evaluate(inverse_stft).shape)
 
   def test_stft_and_inverse_stft(self):
     """Test that spectral_ops.stft/inverse_stft match a NumPy implementation."""
@@ -188,7 +188,7 @@ class SpectralOpsTest(test.TestCase):
       signal = random_ops.random_normal([signal_length])
 
       with spectral_ops_test_util.fft_kernel_label_map(), (
-          self.test_session(use_gpu=True)) as sess:
+          self.cached_session(use_gpu=True)) as sess:
         stft = spectral_ops.stft(signal, frame_length, frame_step, fft_length,
                                  pad_end=False)
         inverse_stft = spectral_ops.inverse_stft(stft, frame_length, frame_step,
@@ -234,8 +234,9 @@ class SpectralOpsTest(test.TestCase):
       inverse_window_fn = spectral_ops.inverse_stft_window_fn(frame_step)
       inverse_window = inverse_window_fn(frame_length, dtype=dtypes.float32)
 
-      with self.test_session(use_gpu=True) as sess:
-        hann_window, inverse_window = sess.run([hann_window, inverse_window])
+      with self.cached_session(use_gpu=True) as sess:
+        hann_window, inverse_window = self.evaluate(
+            [hann_window, inverse_window])
 
       # Expect unit gain at each phase of the window.
       product_window = hann_window * inverse_window
@@ -262,8 +263,9 @@ class SpectralOpsTest(test.TestCase):
       inverse_window_fn = spectral_ops.inverse_stft_window_fn(frame_step)
       inverse_window = inverse_window_fn(frame_length, dtype=dtypes.float32)
 
-      with self.test_session(use_gpu=True) as sess:
-        hann_window, inverse_window = sess.run([hann_window, inverse_window])
+      with self.cached_session(use_gpu=True) as sess:
+        hann_window, inverse_window = self.evaluate(
+            [hann_window, inverse_window])
 
       self.assertAllClose(hann_window, inverse_window * 1.5)
 
@@ -279,7 +281,7 @@ class SpectralOpsTest(test.TestCase):
   def test_gradients(self):
     """Test that spectral_ops.stft has a working gradient."""
     with spectral_ops_test_util.fft_kernel_label_map(), (
-        self.test_session(use_gpu=True)) as sess:
+        self.session(use_gpu=True)) as sess:
       signal_length = 512
 
       # An all-zero signal has all zero gradients with respect to the sum of the
@@ -293,12 +295,12 @@ class SpectralOpsTest(test.TestCase):
       # the sum of the magnitude STFT.
       sinusoid = math_ops.sin(
           2 * np.pi * math_ops.linspace(0.0, 1.0, signal_length))
-      sinusoid_gradient = sess.run(self._compute_stft_gradient(sinusoid))
+      sinusoid_gradient = self.evaluate(self._compute_stft_gradient(sinusoid))
       self.assertFalse((sinusoid_gradient == 0.0).all())
 
   def test_gradients_numerical(self):
     with spectral_ops_test_util.fft_kernel_label_map(), (
-        self.test_session(use_gpu=True)):
+        self.session(use_gpu=True)):
       # Tuples of (signal_length, frame_length, frame_step, fft_length,
       # stft_bound, inverse_stft_bound).
       # TODO(rjryan): Investigate why STFT gradient error is so high.
diff --git a/tensorflow/contrib/signal/python/kernel_tests/test_util.py b/tensorflow/python/kernel_tests/signal/test_util.py
similarity index 77%
rename from tensorflow/contrib/signal/python/kernel_tests/test_util.py
rename to tensorflow/python/kernel_tests/signal/test_util.py
index b4422a49887378187a2be46275d4dabf1fbd40a1..0a8a621c3eeee1b943a55aced138a6abad233059 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/test_util.py
+++ b/tensorflow/python/kernel_tests/signal/test_util.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Test utilities for tf.contrib.signal."""
+"""Test utilities for tf.signal."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.training import saver
 
 
-def grappler_optimize(graph, fetches=None, rewriter_config=None):
+def grappler_optimize(graph, fetches=None, config_proto=None):
   """Tries to optimize the provided graph using grappler.
 
   Args:
@@ -31,17 +31,17 @@ def grappler_optimize(graph, fetches=None, rewriter_config=None):
     fetches: An optional list of `Tensor`s to fetch (i.e. not optimize away).
       Grappler uses the 'train_op' collection to look for fetches, so if not
       provided this collection should be non-empty.
-    rewriter_config: An optional `tf.RewriterConfig` to use when rewriting the
+    config_proto: An optional `tf.ConfigProto` to use when rewriting the
       graph.
 
   Returns:
     A `tf.GraphDef` containing the rewritten graph.
   """
-  if rewriter_config is None:
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
-    rewriter_config.min_graph_nodes = -1
+  if config_proto is None:
+    config_proto = config_pb2.ConfigProto()
+    config_proto.graph_options.rewrite_options.min_graph_nodes = -1
   if fetches is not None:
     for fetch in fetches:
       graph.add_to_collection('train_op', fetch)
   metagraph = saver.export_meta_graph(graph_def=graph.as_graph_def())
-  return tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
+  return tf_optimizer.OptimizeGraph(config_proto, metagraph)
diff --git a/tensorflow/contrib/signal/python/kernel_tests/window_ops_test.py b/tensorflow/python/kernel_tests/signal/window_ops_test.py
similarity index 93%
rename from tensorflow/contrib/signal/python/kernel_tests/window_ops_test.py
rename to tensorflow/python/kernel_tests/signal/window_ops_test.py
index 5a464699dac5a737e0c6e0122a4a6699e945f695..a72cdb288bb93d96237fa84261a7bc1e9dcdf118 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/window_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/window_ops_test.py
@@ -22,10 +22,11 @@ import functools
 
 import numpy as np
 
-from tensorflow.contrib.signal.python.kernel_tests import test_util
-from tensorflow.contrib.signal.python.ops import window_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.kernel_tests.signal import test_util
+from tensorflow.python.ops.signal import window_ops
 from tensorflow.python.platform import test
 
 
@@ -64,7 +65,7 @@ class WindowOpsTest(test.TestCase):
                     (dtypes.float64, 1e-9)]
 
   def _compare_window_fns(self, np_window_fn, tf_window_fn):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for window_length in self._window_lengths:
         for periodic in [False, True]:
           for tf_dtype, tol in self._dtypes:
@@ -75,6 +76,7 @@ class WindowOpsTest(test.TestCase):
                                   dtype=tf_dtype).eval()
             self.assertAllClose(expected, actual, tol, tol)
 
+  @tf_test_util.run_deprecated_v1
   def test_hann_window(self):
     """Check that hann_window matches scipy.signal.hann behavior."""
     # The Hann window is a raised cosine window with parameters alpha=0.5 and
@@ -84,6 +86,7 @@ class WindowOpsTest(test.TestCase):
         functools.partial(_scipy_raised_cosine, a=0.5, b=0.5),
         window_ops.hann_window)
 
+  @tf_test_util.run_deprecated_v1
   def test_hamming_window(self):
     """Check that hamming_window matches scipy.signal.hamming's behavior."""
     # The Hamming window is a raised cosine window with parameters alpha=0.54
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index c08d3222b347694d516cf9ff1bb743b73b62005c..8f7245214a20d88caf426558b9699fec9f9c908f 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -24,6 +24,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -35,66 +36,106 @@ class SliceTest(test.TestCase):
   def testEmpty(self):
     inp = np.random.rand(4, 4).astype("f")
     for k in xrange(4):
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         a = constant_op.constant(inp, shape=[4, 4], dtype=dtypes.float32)
         slice_t = a[2, k:k]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
       self.assertAllEqual(slice_val, inp[2, k:k])
 
   def testInt32(self):
     inp = np.random.rand(4, 4).astype("i")
     for k in xrange(4):
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         a = constant_op.constant(inp, shape=[4, 4], dtype=dtypes.int32)
         slice_t = a[2, k:k]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
       self.assertAllEqual(slice_val, inp[2, k:k])
 
-  def testInt64Slicing(self):
-    with self.test_session(use_gpu=True):
-      a = constant_op.constant([0, 1, 2], dtype=dtypes.int64)
+  def testSlicingWithInt64Index(self):
+    with self.cached_session(force_gpu=test.is_gpu_available()):
+      a = constant_op.constant([0, 1, 2], dtype=dtypes.int32)
 
       # Slice using int64 Tensor.
       i = constant_op.constant(1, dtype=dtypes.int64)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i+1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
       # Slice using int64 integer.
       i = np.asarray(1).astype(np.int64)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i+1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
+      self.assertAllEqual([1], slice_val)
+
+      a_int32 = constant_op.constant([0, 1, 2], dtype=dtypes.int32)
+      slice_t = array_ops.slice(a_int32,
+                                np.asarray([1]).astype(np.int64),
+                                np.asarray([2]).astype(np.int64))
+      slice_val = self.evaluate(slice_t)
+      self.assertAllEqual([1, 2], slice_val)
+
+      a_float32 = constant_op.constant([0, 1, 2], dtype=dtypes.float32)
+      slice_t = array_ops.slice(a_float32,
+                                np.asarray([1]).astype(np.int64),
+                                np.asarray([2]).astype(np.int64))
+      slice_val = self.evaluate(slice_t)
+      self.assertAllEqual([1, 2], slice_val)
+
+  def testSlicingInt64Tensor(self):
+    with self.cached_session(force_gpu=test.is_gpu_available()):
+      a = constant_op.constant([0, 1, 2], dtype=dtypes.int64)
+
+      # Slice using int32 Tensor.
+      i = constant_op.constant(1, dtype=dtypes.int32)
+      slice_t = a[i]
+      slice_val = self.evaluate(slice_t)
+      self.assertAllEqual(1, slice_val)
+      slice_t = a[i:i + 1]
+      slice_val = self.evaluate(slice_t)
+      self.assertAllEqual([1], slice_val)
+
+      # Slice using int32 integer.
+      i = np.asarray(1).astype(np.int32)
+      slice_t = a[i]
+      slice_val = self.evaluate(slice_t)
+      self.assertAllEqual(1, slice_val)
+      slice_t = a[i:i + 1]
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
+      slice_t = array_ops.slice(a, [1], [2])
+      slice_val = self.evaluate(slice_t)
+      self.assertAllEqual([1, 2], slice_val)
+
   def testSelectAll(self):
     for _ in range(10):
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inp = np.random.rand(4, 4, 4, 4).astype("f")
         a = constant_op.constant(inp, shape=[4, 4, 4, 4], dtype=dtypes.float32)
 
         slice_explicit_t = array_ops.slice(a, [0, 0, 0, 0], [-1, -1, -1, -1])
         slice_implicit_t = a[:, :, :, :]
 
-        self.assertAllEqual(inp, slice_explicit_t.eval())
-        self.assertAllEqual(inp, slice_implicit_t.eval())
+        self.assertAllEqual(inp, self.evaluate(slice_explicit_t))
+        self.assertAllEqual(inp, self.evaluate(slice_implicit_t))
         self.assertEqual(inp.shape, slice_explicit_t.get_shape())
         self.assertEqual(inp.shape, slice_implicit_t.get_shape())
 
   def testSingleDimension(self):
     for _ in range(10):
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inp = np.random.rand(10).astype("f")
         a = constant_op.constant(inp, shape=[10], dtype=dtypes.float32)
 
         hi = np.random.randint(0, 9)
         scalar_t = a[hi]
-        scalar_val = scalar_t.eval()
+        scalar_val = self.evaluate(scalar_t)
         self.assertAllEqual(scalar_val, inp[hi])
 
         if hi > 0:
@@ -102,9 +143,10 @@ class SliceTest(test.TestCase):
         else:
           lo = 0
         slice_t = a[lo:hi]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
         self.assertAllEqual(slice_val, inp[lo:hi])
 
+  @test_util.run_deprecated_v1
   def testScalarInput(self):
     input_val = 0
     with self.cached_session() as sess:
@@ -119,6 +161,7 @@ class SliceTest(test.TestCase):
                                                "out of range"):
         sess.run([slice_t], feed_dict={input_t: input_val})
 
+  @test_util.run_deprecated_v1
   def testInvalidIndex(self):
     input_val = [1, 2]
     with self.cached_session() as sess:
@@ -134,11 +177,12 @@ class SliceTest(test.TestCase):
         sess.run([slice_t], feed_dict={input_t: input_val})
 
   def _testSliceMatrixDim0(self, x, begin, size):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_ans = array_ops.slice(x, [begin, 0], [size, x.shape[1]]).eval()
     np_ans = x[begin:begin + size, :]
     self.assertAllEqual(tf_ans, np_ans)
 
+  @test_util.run_deprecated_v1
   def testSliceMatrixDim0(self):
     x = np.random.rand(8, 4).astype("f")
     self._testSliceMatrixDim0(x, 1, 2)
@@ -149,17 +193,17 @@ class SliceTest(test.TestCase):
 
   def testSingleElementAll(self):
     for _ in range(10):
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inp = np.random.rand(4, 4).astype("f")
         a = constant_op.constant(inp, shape=[4, 4], dtype=dtypes.float32)
 
         x, y = np.random.randint(0, 3, size=2).tolist()
         slice_t = a[x, 0:y]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
       self.assertAllEqual(slice_val, inp[x, 0:y])
 
   def testSimple(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       inp = np.random.rand(4, 4).astype("f")
       a = constant_op.constant(
           [float(x) for x in inp.ravel(order="C")],
@@ -167,14 +211,15 @@ class SliceTest(test.TestCase):
           dtype=dtypes.float32)
       slice_t = array_ops.slice(a, [0, 0], [2, 2])
       slice2_t = a[:2, :2]
-      slice_val, slice2_val = sess.run([slice_t, slice2_t])
+      slice_val, slice2_val = self.evaluate([slice_t, slice2_t])
     self.assertAllEqual(slice_val, inp[:2, :2])
     self.assertAllEqual(slice2_val, inp[:2, :2])
     self.assertEqual(slice_val.shape, slice_t.get_shape())
     self.assertEqual(slice2_val.shape, slice2_t.get_shape())
 
+  @test_util.run_deprecated_v1
   def testComplex(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       inp = np.random.rand(4, 10, 10, 4).astype("f")
       a = constant_op.constant(inp, dtype=dtypes.float32)
 
@@ -191,7 +236,7 @@ class SliceTest(test.TestCase):
     # Random dims of rank 6
     input_shape = np.random.randint(0, 20, size=6)
     inp = np.random.rand(*input_shape).astype("f")
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       a = constant_op.constant(
           [float(x) for x in inp.ravel(order="C")],
           shape=input_shape,
@@ -207,7 +252,7 @@ class SliceTest(test.TestCase):
                    + sizes[3], indices[4]:indices[4] + sizes[4], indices[5]:
                    indices[5] + sizes[5]]
 
-      slice_val, slice2_val = sess.run([slice_t, slice2_t])
+      slice_val, slice2_val = self.evaluate([slice_t, slice2_t])
 
     expected_val = inp[indices[0]:indices[0] + sizes[0], indices[1]:indices[
         1] + sizes[1], indices[2]:indices[2] + sizes[2], indices[3]:indices[
@@ -230,7 +275,7 @@ class SliceTest(test.TestCase):
 
 
   def _testGradientSlice(self, input_shape, slice_begin, slice_size):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       num_inputs = np.prod(input_shape)
       num_grads = np.prod(slice_size)
       inp = np.random.rand(num_inputs).astype("f").reshape(input_shape)
@@ -242,7 +287,7 @@ class SliceTest(test.TestCase):
       grads = np.random.rand(num_grads).astype("f").reshape(slice_size)
       grad_tensor = constant_op.constant(grads)
       grad = gradients_impl.gradients(slice_t, [a], grad_tensor)[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
 
     # Create a zero tensor of the input shape ane place
     # the grads into the right location to compare against TensorFlow.
@@ -255,7 +300,7 @@ class SliceTest(test.TestCase):
     self.assertAllClose(np_ans, result)
 
   def _testGradientVariableSize(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       inp = constant_op.constant([1.0, 2.0, 3.0], name="in")
       out = array_ops.slice(inp, [1], [-1])
       grad_actual = gradients_impl.gradients(out, inp)[0].eval()
@@ -265,7 +310,7 @@ class SliceTest(test.TestCase):
     # Regression test for bug in slice. A low-level bug in Eigen was causing
     # incorrect results for negative indices in multi-dimensional tensors.
     # See b/114318298.
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       x = constant_op.constant([[1., 2., 3.], [4., 5., 6.], [7., 8., 7]])
       loss1 = math_ops.reduce_sum(x[:-1, :-1] * 1.0)
       loss2 = math_ops.reduce_sum(x[:-1][:, :-1])
@@ -273,9 +318,10 @@ class SliceTest(test.TestCase):
       g1 = gradients_impl.gradients(loss1, x)[0]
       g2 = gradients_impl.gradients(loss2, x)[0]
 
-      g1_val, g2_val = sess.run([g1, g2])
+      g1_val, g2_val = self.evaluate([g1, g2])
     self.assertAllEqual(g1_val, g2_val)
 
+  @test_util.run_deprecated_v1
   def testGradientsAll(self):
     # Slice the middle square out of a 4x4 input
     self._testGradientSlice([4, 4], [1, 1], [2, 2])
@@ -295,6 +341,7 @@ class SliceTest(test.TestCase):
     # Use -1 as a slice dimension on a 2D tensor.
     self._testGradientVariableSize2D()
 
+  @test_util.run_deprecated_v1
   def testNotIterable(self):
     # NOTE(mrry): If we register __getitem__ as an overloaded
     # operator, Python will valiantly attempt to iterate over the
@@ -306,6 +353,7 @@ class SliceTest(test.TestCase):
       for _ in c:
         pass
 
+  @test_util.run_deprecated_v1
   def testComputedShape(self):
     # NOTE(mrry): We cannot currently handle partially-known values,
     # because `tf.slice()` uses -1 to specify a wildcard size, and
@@ -322,13 +370,13 @@ class SliceTest(test.TestCase):
     self.assertEqual([None, 2], c.get_shape().as_list())
 
   def testSliceOfSlice(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       a = constant_op.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
       b = a[1:, :]
       c = b[:-1, :]
       d = c[1, :]
       res = 2 * d - c[1, :] + a[2, :] - 2 * b[-2, :]
-      self.assertAllEqual([0, 0, 0], res.eval())
+      self.assertAllEqual([0, 0, 0], self.evaluate(res))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index 89f4697e5cbc443abd6e18dbf7d7681e4c42269e..707b8a429f2be1fcce39516d368e2b7a05570652 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
@@ -59,12 +60,12 @@ class SoftmaxTest(test.TestCase):
     # this bug in future.
     name = "arbitrary"
     np_softmax = self._npSoftmax(np_features, dim=dim, log=log)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       if log:
         tf_softmax = nn_ops.log_softmax(np_features, axis=dim, name=name)
       else:
         tf_softmax = nn_ops.softmax(np_features, axis=dim, name=name)
-      out = tf_softmax.eval()
+      out = self.evaluate(tf_softmax)
     self.assertAllCloseAccordingToType(np_softmax, out)
     self.assertShapeEqual(np_softmax, tf_softmax)
     if not log:
@@ -111,9 +112,9 @@ class SoftmaxTest(test.TestCase):
       type = np.float64  # pylint: disable=redefined-builtin
     max = np.finfo(type).max  # pylint: disable=redefined-builtin
     features = np.array([[1., 1., 1., 1.], [max, 1., 2., 3.]]).astype(type)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       tf_log_softmax = nn_ops.log_softmax(features)
-      out = tf_log_softmax.eval()
+      out = self.evaluate(tf_log_softmax)
     self.assertAllClose(
         np.array([[-1.386294, -1.386294, -1.386294, -1.386294],
                   [0, -max, -max, -max]]),
@@ -206,6 +207,7 @@ class SoftmaxTest(test.TestCase):
                          [[5., 4., 3., 2.], [1., 2., 3., 4.]]])
     self.assertEqual([3, 2, 4], op.get_shape())
 
+  @test_util.run_deprecated_v1
   def testEmptyInput(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=[0, 3])
@@ -222,6 +224,14 @@ class SoftmaxTest(test.TestCase):
       with self.assertRaises(errors_impl.InvalidArgumentError):
         nn_ops.softmax([1., 2., 3., 4.], axis=dim).eval()
 
+  def testInvalidAxis(self):
+    # Test case for GitHub issue 22793.
+    with self.cached_session():
+      ones = array_ops.ones(shape=[2, 3])
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        nn_ops.softmax(ones, axis=2).eval()
+
+  @test_util.run_deprecated_v1
   def testLargeDims(self):
     # Make sure that we properly handle large inputs. See
     # https://github.com/tensorflow/tensorflow/issues/4425 for details
@@ -230,7 +240,7 @@ class SoftmaxTest(test.TestCase):
       np_softmax = self._npSoftmax(ones)
 
       for use_gpu in [True, False]:
-        with self.test_session(use_gpu=use_gpu) as sess:
+        with self.cached_session(use_gpu=use_gpu) as sess:
           x = array_ops.placeholder(dtypes.float32)
           y = nn_ops.softmax(x)
           tf_softmax = sess.run(y, feed_dict={x: ones})
diff --git a/tensorflow/python/kernel_tests/softplus_op_test.py b/tensorflow/python/kernel_tests/softplus_op_test.py
index 636ed4747e1702d5bd6374e44e5114f76fe0a00d..5273dd7ffc7cec6807bdcdf2ad2a4e9c18a573d1 100644
--- a/tensorflow/python/kernel_tests/softplus_op_test.py
+++ b/tensorflow/python/kernel_tests/softplus_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
@@ -37,9 +38,9 @@ class SoftplusTest(test.TestCase):
 
   def _testSoftplus(self, np_features, use_gpu=False):
     np_softplus = self._npSoftplus(np_features)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       softplus = nn_ops.softplus(np_features)
-      tf_softplus = softplus.eval()
+      tf_softplus = self.evaluate(softplus)
     self.assertAllCloseAccordingToType(np_softplus, tf_softplus)
     self.assertTrue(np.all(tf_softplus > 0))
     self.assertShapeEqual(np_softplus, softplus)
@@ -70,6 +71,7 @@ class SoftplusTest(test.TestCase):
           ],
           use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -86,6 +88,7 @@ class SoftplusTest(test.TestCase):
     print("softplus (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradGrad(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -103,6 +106,7 @@ class SoftplusTest(test.TestCase):
     print("softplus (float) gradient of gradient err = ", err)
     self.assertLess(err, 5e-5)
 
+  @test_util.run_deprecated_v1
   def testGradGradGrad(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -121,6 +125,7 @@ class SoftplusTest(test.TestCase):
     print("softplus (float) third-order gradient err = ", err)
     self.assertLess(err, 5e-5)
 
+  @test_util.run_deprecated_v1
   def testNoInts(self):
     with self.cached_session():
       with self.assertRaisesRegexp(
diff --git a/tensorflow/python/kernel_tests/softsign_op_test.py b/tensorflow/python/kernel_tests/softsign_op_test.py
index 1b4db9fa46f0923aae5fa667daf2d9073ef21abe..5554240c82621e5bceb89fab17f6d824f61252f3 100644
--- a/tensorflow/python/kernel_tests/softsign_op_test.py
+++ b/tensorflow/python/kernel_tests/softsign_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -34,9 +35,9 @@ class SoftsignTest(test.TestCase):
 
   def _testSoftsign(self, np_features, use_gpu=False):
     np_softsign = self._npSoftsign(np_features)
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       softsign = nn_ops.softsign(np_features)
-      tf_softsign = softsign.eval()
+      tf_softsign = self.evaluate(softsign)
     self.assertAllClose(np_softsign, tf_softsign)
     self.assertShapeEqual(np_softsign, softsign)
 
@@ -49,6 +50,7 @@ class SoftsignTest(test.TestCase):
           np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
           use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -65,6 +67,7 @@ class SoftsignTest(test.TestCase):
     print("softsign (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testNoInts(self):
     with self.cached_session():
       with self.assertRaisesRegexp(
diff --git a/tensorflow/python/kernel_tests/spacetobatch_op_test.py b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
index e267c0591557b960341febd0d92c5ef06fbc7c4a..8641156604c98e2737f8854db3a218905cfd9281 100644
--- a/tensorflow/python/kernel_tests/spacetobatch_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gradient_checker
@@ -100,7 +101,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
   """
 
   def _testPad(self, inputs, paddings, block_size, outputs):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # outputs = space_to_batch(inputs)
       x_tf = self.space_to_batch(
           math_ops.to_float(inputs), paddings, block_size=block_size)
@@ -115,6 +116,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
     self._testPad(inputs, paddings, block_size, outputs)
 
   # [1, 2, 2, 1] <-> [4, 1, 1, 1]
+  @test_util.run_deprecated_v1
   def testSmallInput2x2(self):
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 2
@@ -122,6 +124,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
     self._testOne(x_np, block_size, x_out)
 
   # [1, 2, 2, 1] <-> [1, 3, 3, 1] (padding) <-> [9, 1, 1, 1]
+  @test_util.run_deprecated_v1
   def testSmallInput2x2Pad1x0(self):
     x_np = [[[[1], [2]], [[3], [4]]]]
     paddings = np.array([[1, 0], [1, 0]], dtype=np.int32)
@@ -132,6 +135,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
 
   # Test with depth larger than 1.
   # [1, 2, 2, 3] <-> [4, 1, 1, 3]
+  @test_util.run_deprecated_v1
   def testDepthInput2x2(self):
     x_np = [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]]
     block_size = 2
@@ -140,6 +144,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
 
   # Test for larger input dimensions.
   # [1, 4, 4, 1] <-> [4, 2, 2, 1]
+  @test_util.run_deprecated_v1
   def testLargerInput2x2(self):
     x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]],
              [[9], [10], [11], [12]], [[13], [14], [15], [16]]]]
@@ -150,6 +155,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
 
   # Test with batch larger than 1.
   # [2, 2, 4, 1] <-> [8, 1, 2, 1]
+  @test_util.run_deprecated_v1
   def testBatchInput2x2(self):
     x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]]],
             [[[9], [10], [11], [12]], [[13], [14], [15], [16]]]]
@@ -162,6 +168,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
   # that elements are correctly laid out spatially and properly interleaved
   # along the batch dimension.
   # [2, 4, 4, 1] <-> [8, 2, 2, 1]
+  @test_util.run_deprecated_v1
   def testLargerInputBatch2x2(self):
     x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]],
              [[9], [10], [11], [12]], [[13], [14], [15], [16]]],
@@ -190,7 +197,7 @@ class SpaceToBatchNDTest(test.TestCase):
     block_shape = np.array(block_shape)
     paddings = np.array(paddings).reshape((len(block_shape), 2))
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         # outputs = space_to_batch(inputs)
         x_tf = array_ops.space_to_batch_nd(
             math_ops.to_float(inputs), block_shape, paddings)
@@ -206,6 +213,7 @@ class SpaceToBatchNDTest(test.TestCase):
     self._testPad(inputs, block_shape, paddings,
                   space_to_batch_direct(inputs, block_shape, paddings))
 
+  @test_util.run_deprecated_v1
   def testZeroBlockDimsZeroRemainingDims(self):
     self._testPad(
         inputs=[1, 2],
@@ -213,6 +221,7 @@ class SpaceToBatchNDTest(test.TestCase):
         paddings=[],
         outputs=[1, 2],)
 
+  @test_util.run_deprecated_v1
   def testZeroBlockDimsOneRemainingDim(self):
     self._testPad(
         inputs=[[1, 2], [3, 4]],
@@ -227,6 +236,7 @@ class SpaceToBatchNDTest(test.TestCase):
         paddings=[[0, 0]],
         outputs=[[1, 2], [3, 4]])
 
+  @test_util.run_deprecated_v1
   def testZeroBlockDimsTwoRemainingDims(self):
     self._testPad(
         inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
@@ -248,6 +258,7 @@ class SpaceToBatchNDTest(test.TestCase):
         paddings=[[0, 0], [0, 0]],
         outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
 
+  @test_util.run_deprecated_v1
   def testOneBlockDimZeroRemainingDims(self):
     self._testPad(
         inputs=[[1, 2, 3], [4, 5, 6]],
@@ -255,6 +266,7 @@ class SpaceToBatchNDTest(test.TestCase):
         paddings=[1, 0],
         outputs=[[0, 2], [0, 5], [1, 3], [4, 6]])
 
+  @test_util.run_deprecated_v1
   def testOneBlockDimOneRemainingDim(self):
     self._testPad(
         inputs=[[[1, 11], [2, 21], [3, 31]], [[4, 41], [5, 51], [6, 61]]],
@@ -263,6 +275,7 @@ class SpaceToBatchNDTest(test.TestCase):
         outputs=[[[0, 0], [2, 21]], [[0, 0], [5, 51]], [[1, 11], [3, 31]],
                  [[4, 41], [6, 61]]])
 
+  @test_util.run_deprecated_v1
   def testDirect(self):
     # Test with zero-size remaining dimension.
     self._testDirect(
@@ -300,6 +313,7 @@ class SpaceToBatchNDTest(test.TestCase):
 class SpaceToBatchSpaceToDepth(test.TestCase, PythonOpImpl):
 
   # Verifies that: space_to_batch(x) = transpose(space_to_depth(transpose(x)))
+  @test_util.run_deprecated_v1
   def testSpaceToDepthTranspose(self):
     x = np.arange(5 * 10 * 16 * 7, dtype=np.float32).reshape([5, 10, 16, 7])
     block_size = 2
@@ -309,7 +323,7 @@ class SpaceToBatchSpaceToDepth(test.TestCase, PythonOpImpl):
         array_ops.space_to_depth(
             array_ops.transpose(x, [3, 1, 2, 0]), block_size=block_size),
         [3, 1, 2, 0])
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(y1.eval(), y2.eval())
 
 
@@ -319,6 +333,7 @@ class SpaceToBatchSpaceToDepthCpp(SpaceToBatchSpaceToDepth, CppOpImpl):
 
 class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingBatch(self):
     # The input is missing the first dimension ("batch")
     x_np = [[[1], [2]], [[3], [4]]]
@@ -327,6 +342,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.space_to_batch(x_np, paddings, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -336,6 +352,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.space_to_batch(x_np, paddings, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeOne(self):
     # The block size is 1. The block size needs to be > 1.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -345,6 +362,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.space_to_batch(x_np, paddings, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeLarger(self):
     # The block size is too large for this input.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -354,6 +372,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.space_to_batch(x_np, paddings, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleWidth(self):
     # The block size divides width but not height.
     x_np = [[[[1], [2], [3]], [[3], [4], [7]]]]
@@ -362,6 +381,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.space_to_batch(x_np, paddings, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleHeight(self):
     # The block size divides height but not width.
     x_np = [[[[1], [2]], [[3], [4]], [[5], [6]]]]
@@ -370,6 +390,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.space_to_batch(x_np, paddings, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleBoth(self):
     # The block size does not divide neither width or height.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -378,6 +399,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.space_to_batch(x_np, paddings, block_size)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     t = self.space_to_batch(
         array_ops.placeholder(dtypes.float32),
@@ -424,25 +446,31 @@ class SpaceToBatchNDErrorHandlingTest(test.TestCase):
     self._testStaticShape(input_shape, block_shape, paddings, error)
     self._testDynamicShape(input_shape, block_shape, paddings)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     self._testShape([1, 2, 2], [0, 2], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNegative(self):
     self._testShape([1, 2, 2], [-1, 2], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testNegativePadding(self):
     # The padding is negative.
     self._testShape([1, 2, 2], [1, 1], [[0, -1], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisible(self):
     # The padded size is not divisible by the block size.
     self._testShape([1, 2, 3, 1], [3, 3], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockDimsMismatch(self):
     # Shape of block_shape does not match shape of paddings.
     self._testStaticShape([1, 3, 3, 1], [3, 3], [[0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testUnknown(self):
     # Verify that input shape and paddings shape can be unknown.
     _ = array_ops.space_to_batch_nd(
@@ -494,7 +522,7 @@ class SpaceToBatchGradientTest(test.TestCase, PythonOpImpl):
   # Check the gradients.
   def _checkGrad(self, x, paddings, block_size):
     assert 4 == x.ndim
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_x = ops.convert_to_tensor(x)
       tf_y = self.space_to_batch(tf_x, paddings, block_size)
       epsilon = 1e-5
@@ -522,18 +550,21 @@ class SpaceToBatchGradientTest(test.TestCase, PythonOpImpl):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     block_size = 2
     pad_beg = 0
     pad_end = 0
     self._compare(1, 2, 3, 5, block_size, pad_beg, pad_end)
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     block_size = 2
     pad_beg = 0
     pad_end = 0
     self._compare(2, 4, 3, 2, block_size, pad_beg, pad_end)
 
+  @test_util.run_deprecated_v1
   def testSmallPad1x1(self):
     block_size = 2
     pad_beg = 1
@@ -572,15 +603,19 @@ class SpaceToBatchNDGradientTest(test.TestCase):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     self._compare([1, 4, 6, 5], [2, 2], [[0, 0], [0, 0]])
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     self._compare([2, 8, 6, 2], [2, 2], [[0, 0], [0, 0]])
 
+  @test_util.run_deprecated_v1
   def testSmallPad1(self):
     self._compare([2, 4, 6, 2], [2, 2], [[1, 1], [1, 1]])
 
+  @test_util.run_deprecated_v1
   def testSmallPadThreeBlockDims(self):
     self._compare([2, 2, 4, 3, 2], [2, 2, 2], [[1, 1], [1, 1], [1, 0]])
 
@@ -644,6 +679,7 @@ class RequiredSpaceToBatchPaddingsTest(test.TestCase):
     self.assertAllEqual(paddings_result, paddings_const)
     self.assertAllEqual(crops_result, crops_const)
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     self._test(
         input_shape=np.zeros((0,), np.int32),
diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
index cd90d16aacb4325ed426b0466266d9616b574401..e96bc09f3652aaa4d41bddac6ad06daaff8bfbd6 100644
--- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
@@ -36,33 +36,30 @@ class SpaceToDepthTest(test.TestCase):
 
   def _testOne(self, inputs, block_size, outputs, dtype=dtypes.float32):
     input_nhwc = math_ops.cast(inputs, dtype)
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       # test NHWC (default) on CPU
       x_tf = array_ops.space_to_depth(input_nhwc, block_size)
-      self.assertAllEqual(x_tf.eval(), outputs)
-    if test.is_gpu_available():
-      with self.test_session(use_gpu=True):
+      self.assertAllEqual(self.evaluate(x_tf), outputs)
+
+    if test_util.is_gpu_available():
+      with test_util.force_gpu():
         # test NHWC (default) on GPU
         x_tf = array_ops.space_to_depth(input_nhwc, block_size)
-        self.assertAllEqual(x_tf.eval(), outputs)
+        self.assertAllEqual(self.evaluate(x_tf), outputs)
         # test NCHW on GPU
         input_nchw = test_util.NHWCToNCHW(input_nhwc)
         output_nchw = array_ops.space_to_depth(
             input_nchw, block_size, data_format="NCHW")
         output_nhwc = test_util.NCHWToNHWC(output_nchw)
-        self.assertAllEqual(output_nhwc.eval(), outputs)
+        self.assertAllEqual(self.evaluate(output_nhwc), outputs)
 
   def testBasic(self):
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 2
     x_out = [[[[1, 2, 3, 4]]]]
-    self._testOne(x_np, block_size, x_out)
+    for dtype in [dtypes.float32, dtypes.float16, dtypes.uint8]:
+      self._testOne(x_np, block_size, x_out, dtype=dtype)
 
-  def testBasicFloat16(self):
-    x_np = [[[[1], [2]], [[3], [4]]]]
-    block_size = 2
-    x_out = [[[[1, 2, 3, 4]]]]
-    self._testOne(x_np, block_size, x_out, dtype=dtypes.float16)
 
   # Tests for larger input dimensions. To make sure elements are
   # correctly ordered spatially.
@@ -138,17 +135,18 @@ class SpaceToDepthTest(test.TestCase):
     input_nhwc = array_ops.ones([batch_size, 4, 6, 3])
     x_out = array_ops.ones([batch_size, 2, 3, 12])
 
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       # test NHWC (default) on CPU
       x_tf = array_ops.space_to_depth(input_nhwc, block_size)
       self.assertAllEqual(x_tf.shape, x_out.shape)
-      x_tf.eval()
+      self.evaluate(x_tf)
+
     if test.is_gpu_available():
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         # test NHWC (default) on GPU
         x_tf = array_ops.space_to_depth(input_nhwc, block_size)
         self.assertAllEqual(x_tf.shape, x_out.shape)
-        x_tf.eval()
+        self.evaluate(x_tf)
 
   # Tests for different width and height.
   def testNonSquare(self):
@@ -161,14 +159,16 @@ class SpaceToDepthTest(test.TestCase):
 
   # Error handling:
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingDepth(self):
     # The input is missing the last dimension ("depth")
     x_np = [[[1, 2], [3, 4]]]
     block_size = 2
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingBatch(self):
     # The input is missing the first dimension ("batch")
     x_np = [[[1], [2]], [[3], [4]]]
@@ -176,30 +176,34 @@ class SpaceToDepthTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 0
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeOne(self):
     # The block size is 1. The block size needs to be > 1.
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 1
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeLarger(self):
     # The block size is too large for this input.
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 10
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleWidth(self):
     # The block size divides width but not height.
     x_np = [[[[1], [2], [3]], [[3], [4], [7]]]]
@@ -207,6 +211,7 @@ class SpaceToDepthTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleHeight(self):
     # The block size divides height but not width.
     x_np = [[[[1], [2]], [[3], [4]], [[5], [6]]]]
@@ -214,6 +219,7 @@ class SpaceToDepthTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleBoth(self):
     # The block size does not divide neither width or height.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -221,6 +227,7 @@ class SpaceToDepthTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     t = array_ops.space_to_depth(
         array_ops.placeholder(dtypes.float32), block_size=4)
@@ -274,8 +281,8 @@ class SpaceToDepthTest(test.TestCase):
       expected = self.spaceToDepthUsingTranspose(t, block_size, data_format)
       actual = array_ops.space_to_depth(t, block_size, data_format=data_format)
 
-    with self.test_session(use_gpu=use_gpu) as sess:
-      actual_vals, expected_vals = sess.run([actual, expected])
+    with self.cached_session(use_gpu=use_gpu) as sess:
+      actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
   def testAgainstTranspose(self):
@@ -307,7 +314,7 @@ class SpaceToDepthGradientTest(test.TestCase):
       return
 
     assert 4 == x.ndim
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_x = ops.convert_to_tensor(x)
       tf_y = array_ops.space_to_depth(tf_x, block_size, data_format=data_format)
       epsilon = 1e-2
@@ -336,11 +343,13 @@ class SpaceToDepthGradientTest(test.TestCase):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     block_size = 2
     self._compare(1, 2, 3, 5, block_size, "NHWC")
     self._compare(1, 2, 3, 5, block_size, "NCHW")
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     block_size = 2
     self._compare(2, 4, 3, 2, block_size, "NHWC")
diff --git a/tensorflow/python/kernel_tests/sparse_add_op_test.py b/tensorflow/python/kernel_tests/sparse_add_op_test.py
index 7371ebe389345efb6d0604501c75437e7645a060..00eff54077caa4c60c8d910439a73704159a6ee6 100644
--- a/tensorflow/python/kernel_tests/sparse_add_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_add_op_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
@@ -85,13 +86,13 @@ class SparseAddTest(test.TestCase):
         constant_op.constant(shape, dtypes.int64))
 
   def testAddSelf(self):
-    with self.test_session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for sp_a in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
         for sp_b in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
           sp_sum = sparse_ops.sparse_add(sp_a, sp_b)
           self.assertAllEqual((3, 3), sp_sum.get_shape())
 
-          sum_out = sess.run(sp_sum)
+          sum_out = self.evaluate(sp_sum)
 
           self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
           self.assertAllEqual(sum_out.indices, [[0, 1], [1, 0], [2, 0], [2, 1]])
@@ -99,12 +100,12 @@ class SparseAddTest(test.TestCase):
           self.assertAllEqual(sum_out.dense_shape, [3, 3])
 
   def testAddSelfAndNegation(self):
-    with self.test_session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_a = self._SparseTensor_3x3()
       sp_b = self._SparseTensor_3x3(negate=True)
 
       sp_sum = sparse_ops.sparse_add(sp_a, sp_b, 0.1)
-      sum_out = sess.run(sp_sum)
+      sum_out = self.evaluate(sp_sum)
 
       self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
       self.assertAllEqual(sum_out.indices, np.empty([0, 2]))
@@ -112,7 +113,7 @@ class SparseAddTest(test.TestCase):
       self.assertAllEqual(sum_out.dense_shape, [3, 3])
 
   def testSmallValuesShouldVanish(self):
-    with self.test_session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_a = self._SparseTensor_3x3()
       sp_b = self._SparseTensor_3x3_v2()
 
@@ -123,7 +124,7 @@ class SparseAddTest(test.TestCase):
 
       # two values should vanish: |.1| < .21, and |-.2| < .21
       sp_sum = sparse_ops.sparse_add(sp_a, sp_b, thresh=0.21)
-      sum_out = sess.run(sp_sum)
+      sum_out = self.evaluate(sp_sum)
 
       self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
       self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0]])
@@ -132,22 +133,23 @@ class SparseAddTest(test.TestCase):
 
       # only .1 vanishes
       sp_sum = sparse_ops.sparse_add(sp_a, sp_b, thresh=0.11)
-      sum_out = sess.run(sp_sum)
+      sum_out = self.evaluate(sp_sum)
 
       self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
       self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0], [2, 1]])
       self.assertAllClose(sum_out.values, [2, 6, -.2])
       self.assertAllEqual(sum_out.dense_shape, [3, 3])
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     np.random.seed(1618)  # Make it reproducible.
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for n in [10, 31]:
         for m in [4, 17]:
           sp_a, nnz_a = self._randomTensor([n, m], np.float32)
           sp_b, nnz_b = self._randomTensor([n, m], np.float32)
           sp_sum = sparse_ops.sparse_add(sp_a, sp_b)
-          nnz_sum = len(sp_sum.values.eval())
+          nnz_sum = len(self.evaluate(sp_sum.values))
 
           err = gradient_checker.compute_gradient_error(
               [sp_a.values, sp_b.values], [(nnz_a,), (nnz_b,)], sp_sum.values,
@@ -162,26 +164,27 @@ class SparseAddTest(test.TestCase):
         rand_vals_np = np.random.randn(n, m).astype(dtype)
         dense_np = np.random.randn(n, m).astype(dtype)
 
-        with self.test_session(use_gpu=False):
+        with test_util.force_cpu():
           sparse, unused_nnz = _sparsify(rand_vals_np, index_dtype=index_dtype)
-          s = sparse_ops.sparse_add(sparse,
-                                    constant_op.constant(dense_np)).eval()
+          s = self.evaluate(
+              sparse_ops.sparse_add(sparse, constant_op.constant(dense_np)))
           self.assertAllEqual(dense_np + rand_vals_np, s)
           self.assertTrue(s.dtype == dtype)
 
           # check commutativity
-          s = sparse_ops.sparse_add(constant_op.constant(dense_np),
-                                    sparse).eval()
+          s = self.evaluate(
+              sparse_ops.sparse_add(constant_op.constant(dense_np), sparse))
           self.assertAllEqual(dense_np + rand_vals_np, s)
           self.assertTrue(s.dtype == dtype)
 
+  @test_util.run_deprecated_v1
   def testSparseTensorDenseAddGradients(self):
     np.random.seed(1618)  # Make it reproducible.
     n, m = np.random.randint(30, size=2)
     rand_vals_np = np.random.randn(n, m).astype(np.float32)
     dense_np = np.random.randn(n, m).astype(np.float32)
 
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sparse, nnz = _sparsify(rand_vals_np)
       dense = constant_op.constant(dense_np, dtype=dtypes.float32)
       s = sparse_ops.sparse_add(sparse, dense)
@@ -190,8 +193,9 @@ class SparseAddTest(test.TestCase):
                                                     [(nnz,), (n, m)], s, (n, m))
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testInvalidSparseTensor(self):
-    with self.test_session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       shape = [2, 2]
       val = [0]
       dense = constant_op.constant(np.zeros(shape, dtype=np.int32))
@@ -205,7 +209,7 @@ class SparseAddTest(test.TestCase):
 
         with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                      "invalid index"):
-          sess.run(s)
+          self.evaluate(s)
 
 ######################## Benchmarking code
 
diff --git a/tensorflow/python/kernel_tests/sparse_concat_op_test.py b/tensorflow/python/kernel_tests/sparse_concat_op_test.py
index d3c798312866848f6c6bfee3d9aed2c1a48bfe59..04b6b9b8d20fe683add967e2877ae3766caf1c4f 100644
--- a/tensorflow/python/kernel_tests/sparse_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_concat_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
@@ -132,7 +133,7 @@ class SparseConcatTest(test.TestCase):
         constant_op.constant(shape, dtypes.int64))
 
   def testConcat1(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       # concat(A):
       # [    1]
       # [2    ]
@@ -147,7 +148,7 @@ class SparseConcatTest(test.TestCase):
           self.assertEqual(sp_concat.values.get_shape(), [4])
           self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-          concat_out = sess.run(sp_concat)
+          concat_out = self.evaluate(sp_concat)
 
           self.assertAllEqual(concat_out.indices,
                               [[0, 2], [1, 0], [2, 0], [2, 2]])
@@ -155,7 +156,7 @@ class SparseConcatTest(test.TestCase):
           self.assertAllEqual(concat_out.dense_shape, [3, 3])
 
   def testConcat2(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       # concat(A, B):
       # [    1          ]
       # [2       1      ]
@@ -169,7 +170,7 @@ class SparseConcatTest(test.TestCase):
             self.assertEqual(sp_concat.values.get_shape(), [8])
             self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-            concat_out = sess.run(sp_concat)
+            concat_out = self.evaluate(sp_concat)
 
             self.assertAllEqual(concat_out.indices, [[0, 2], [1, 0], [1, 4],
                                                      [2, 0], [2, 2], [2, 3],
@@ -178,7 +179,7 @@ class SparseConcatTest(test.TestCase):
             self.assertAllEqual(concat_out.dense_shape, [3, 8])
 
   def testConcatDim0(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       # concat(A, D):
       # [    1]
       # [2    ]
@@ -195,7 +196,7 @@ class SparseConcatTest(test.TestCase):
         self.assertEqual(sp_concat.values.get_shape(), [7])
         self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+        concat_out = self.evaluate(sp_concat)
 
         self.assertAllEqual(
             concat_out.indices,
@@ -204,7 +205,7 @@ class SparseConcatTest(test.TestCase):
         self.assertAllEqual(concat_out.dense_shape, np.array([5, 3]))
 
   def testConcat3(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       # concat(A, B, C):
       # [    1              ]
       # [2       1       1  ]
@@ -220,7 +221,7 @@ class SparseConcatTest(test.TestCase):
         self.assertEqual(sp_concat.values.get_shape(), [10])
         self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+        concat_out = self.evaluate(sp_concat)
 
         self.assertAllEqual(concat_out.indices, [[0, 2], [1, 0], [1, 4], [1, 8],
                                                  [2, 0], [2, 2], [2, 3], [2, 6],
@@ -229,7 +230,7 @@ class SparseConcatTest(test.TestCase):
         self.assertAllEqual(concat_out.dense_shape, [3, 10])
 
   def testConcatNonNumeric(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       # concat(A, B):
       # [    a          ]
       # [b       e      ]
@@ -244,7 +245,7 @@ class SparseConcatTest(test.TestCase):
         self.assertEqual(sp_concat.values.get_shape(), [8])
         self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+        concat_out = self.evaluate(sp_concat)
 
         self.assertAllEqual(
             concat_out.indices,
@@ -253,8 +254,9 @@ class SparseConcatTest(test.TestCase):
                             [b"a", b"b", b"e", b"c", b"d", b"f", b"g", b"h"])
         self.assertAllEqual(concat_out.dense_shape, [3, 8])
 
+  @test_util.run_deprecated_v1
   def testMismatchedRank(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_a = self._SparseTensor_3x3()
       sp_e = self._SparseTensor_2x3x4()
 
@@ -263,8 +265,9 @@ class SparseConcatTest(test.TestCase):
         with self.assertRaises(ValueError):
           sparse_ops.sparse_concat(concat_dim, [sp_a, sp_e])
 
+  @test_util.run_deprecated_v1
   def testMismatchedRankExpandNonconcatDim(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_a = self._SparseTensor_3x3()
       sp_e = self._SparseTensor_2x3x4()
 
@@ -275,8 +278,9 @@ class SparseConcatTest(test.TestCase):
           sparse_ops.sparse_concat(
               concat_dim, [sp_a, sp_e], expand_nonconcat_dim=True)
 
+  @test_util.run_deprecated_v1
   def testMismatchedShapes(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_a = self._SparseTensor_3x3()
       sp_b = self._SparseTensor_3x5()
       sp_c = self._SparseTensor_3x2()
@@ -287,10 +291,10 @@ class SparseConcatTest(test.TestCase):
 
         # Shape mismatches can only be caught when the op is run
         with self.assertRaisesOpError("Input shapes must match"):
-          sess.run(sp_concat)
+          self.evaluate(sp_concat)
 
   def testMismatchedShapesExpandNonconcatDim(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_a = self._SparseTensor_3x3()
       sp_b = self._SparseTensor_3x5()
       sp_c = self._SparseTensor_3x2()
@@ -302,8 +306,8 @@ class SparseConcatTest(test.TestCase):
           sp_concat_dim1 = sparse_ops.sparse_concat(
               concat_dim1, [sp_a, sp_b, sp_c, sp_d], expand_nonconcat_dim=True)
 
-          sp_concat_dim0_out = sess.run(sp_concat_dim0)
-          sp_concat_dim1_out = sess.run(sp_concat_dim1)
+          sp_concat_dim0_out = self.evaluate(sp_concat_dim0)
+          sp_concat_dim1_out = self.evaluate(sp_concat_dim1)
 
           self.assertAllEqual(sp_concat_dim0_out.indices,
                               [[0, 2], [1, 0], [2, 0], [2, 2], [4, 1], [5, 0],
@@ -321,8 +325,9 @@ class SparseConcatTest(test.TestCase):
                               [1, 1, 2, 1, 1, 1, 2, 3, 4, 2, 1, 0, 2])
           self.assertAllEqual(sp_concat_dim1_out.dense_shape, [3, 13])
 
+  @test_util.run_deprecated_v1
   def testShapeInferenceUnknownShapes(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_inputs = [
           self._SparseTensor_UnknownShape(),
           self._SparseTensor_UnknownShape(val_shape=[3]),
diff --git a/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py b/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
index a824d5c826305a04bdc8c96d67837a39ae2dd5de..4a967b656285a1094b8eef17fb0b7f41f83cd8e7 100644
--- a/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
+++ b/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
@@ -98,12 +99,14 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       attr { key: 'reduction_type' value {s: 'MEAN'} }
       """, q.accumulator_ref.op.node_def)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSizeEmpty(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
           dtypes_lib.float32, name="Q")
       self.assertEqual(q.num_accumulated().eval(), 0)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSetGlobalStep(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -111,6 +114,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       set_global_step_op = q.set_global_step(1)
       set_global_step_op.run()
 
+  @test_util.run_deprecated_v1
   def testAccumulatorApplyGradFloat32(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -122,6 +126,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
       self.assertEqual(q.num_accumulated().eval(), 1)
 
+  @test_util.run_deprecated_v1
   def testDtypes(self):
     with self.cached_session() as sess:
       dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64]
@@ -140,10 +145,11 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
           t = _indexedslice(mat_to_add)
           q.apply_indexed_slices_grad(t).run()
 
-        result = sess.run(q.take_indexed_slices_grad(1))
+        result = self.evaluate(q.take_indexed_slices_grad(1))
 
         self._assertEqual_nparray(sum_elems / len(elems), result, sess)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorMultipleAccumulators(self):
     with self.cached_session() as sess:
       q_f32_0 = data_flow_ops.SparseConditionalAccumulator(
@@ -174,6 +180,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
         result = sess.run(accums[i].take_indexed_slices_grad(1))
         self._assertEqual_indexedslices(expected_tensors[i], result)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradMean(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -189,11 +196,12 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual([0, 1, 2], val.indices)
       self.assertAllEqual([[0.5, 0.5], [0, 2], [3, 0]], val.values)
       self.assertAllEqual([-1, 2], val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradSum(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -209,16 +217,18 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual([0, 1, 2], val.indices)
       self.assertAllEqual([[1, 1], [0, 2], [3, 0]], val.values)
       self.assertAllEqual([-1, 2], val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradInvalidReductionType(self):
     with self.assertRaises(ValueError):
       data_flow_ops.SparseConditionalAccumulator(
           dtypes_lib.float32, name="Q", shape=(), reduction_type="Invalid")
 
+  @test_util.run_deprecated_v1
   def testAccumulatorRepeatedTakeGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -235,7 +245,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual(val.indices, [0, 1, 2])
       self.assertAllEqual(val.values, [[0.5, 0.5], [0, 2], [3, 0]])
       self.assertAllEqual(val.dense_shape, [-1, 2])
@@ -252,11 +262,12 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual(val.indices, [0, 1, 2])
       self.assertAllEqual(val.values, [[5, 5], [0, 20], [30, 0]])
       self.assertAllEqual(val.dense_shape, [-1, 2])
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelApplyGradMean(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -269,7 +280,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       takeg_t = q.take_indexed_slices_grad(1)
 
       def apply_indexed_slices_grad(accum_op):
-        sess.run(accum_op)
+        self.evaluate(accum_op)
 
       threads = [
           self.checkedThread(
@@ -281,13 +292,14 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       for thread in threads:
         thread.join()
 
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
 
       expected_val = sum(elems) / len(elems)
       self._assertEqual_nparray(
           np.array([[expected_val, 0], [0, expected_val]]).astype(np.float32),
           val, sess)
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelApplyGradSum(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -303,7 +315,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       takeg_t = q.take_indexed_slices_grad(1)
 
       def apply_indexed_slices_grad(accum_op):
-        sess.run(accum_op)
+        self.evaluate(accum_op)
 
       threads = [
           self.checkedThread(target=apply_indexed_slices_grad, args=(o,))
@@ -315,13 +327,14 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       for thread in threads:
         thread.join()
 
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
 
       expected_val = 550.0
       self._assertEqual_nparray(
           np.array([[expected_val, 0], [0, expected_val]]).astype(np.float32),
           val, sess)
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelTakeGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -338,13 +351,13 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       def apply_indexed_slices_grad():
         for accum_op in accum_ops:
           time.sleep(1.0)
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       apply_indexed_slices_grad_thread = self.checkedThread(
           target=apply_indexed_slices_grad)
 
       def take_grad():
-        t = sess.run(takeg_t)
+        t = self.evaluate(takeg_t)
         results.append(t)
 
       threads = [self.checkedThread(target=take_grad) for _ in range(10)]
@@ -361,6 +374,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
         self._assertEqual_nparray(
             np.array([[0, 0], [elems[i], 0]]), results[i], sess)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorApplyAndBlockingTake(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -378,10 +392,10 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
 
       def apply_indexed_slices_grad():
         for accum_op in accum_ops:
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       def take_grad():
-        results.append(sess.run(takeg_t))
+        results.append(self.evaluate(takeg_t))
 
       accum_thread = self.checkedThread(target=apply_indexed_slices_grad)
       takeg_thread = self.checkedThread(target=take_grad)
@@ -394,8 +408,9 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
 
   def _blocking_takeg(self, sess, takeg_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(takeg_op)
+      self.evaluate(takeg_op)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorCancel(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -415,6 +430,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
 
       takeg_thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testNonVectorIndices(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -427,6 +443,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
             grad_indices=[[0, 1], [1, 0]],
             grad_values=np.array([1, 2]).astype(np.float32)).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testZeroDimensionValues(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -437,6 +454,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
         q.apply_grad(
             grad_indices=[0], grad_values=np.array(1).astype(np.float32)).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongNonEmptyInputValues(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -448,6 +466,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
             grad_indices=[0, 1],
             grad_values=np.array([[0, 1, 1]]).astype(np.float32)).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testDynamicNonVectorIndices(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -467,6 +486,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                      x_values: np.array([1, 2]).astype(np.float32)
                  })
 
+  @test_util.run_v1_only("b/120545219")
   def testDynamicWrongNonEmptyInputValues(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -485,6 +505,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                      x_values: np.array([[0, 1, 1]]).astype(np.float32)
                  })
 
+  @test_util.run_v1_only("b/120545219")
   def testEmptyShapeApply(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -510,6 +531,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       q.apply_grad(grad_indices=[0], grad_values=[1.0], grad_shape=[]).run()
       q.apply_grad(grad_indices=[0], grad_values=[1.0]).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testValidateShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -585,7 +607,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                     np.float32)).run()
 
       # After take grad, constraints on accumulated gradient are removed
-      sess.run(q.take_grad(1))
+      self.evaluate(q.take_grad(1))
 
       # First successful gradient imposes new constraints.
       # Hereafter, shape will additionally constrained to [None,2,2,3]
@@ -605,6 +627,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                 [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]]]).astype(np.float32),
             local_step=1).run()
 
+  @test_util.run_deprecated_v1
   def testReturnShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -615,7 +638,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
           grad_values=np.array(
               [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]]]).astype(np.float32)).run()
 
-      val = sess.run(q.take_indexed_slices_grad(1))
+      val = self.evaluate(q.take_indexed_slices_grad(1))
       self.assertAllEqual(val.dense_shape, [2, 2, 2, 2])
 
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -627,9 +650,10 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
               [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]]).astype(
                   np.float32)).run()
 
-      val = sess.run(q.take_indexed_slices_grad(1))
+      val = self.evaluate(q.take_indexed_slices_grad(1))
       self.assertAllEqual(val.dense_shape, [-1, 2, 2, 3])
 
+  @test_util.run_deprecated_v1
   def testApplyGradtInt32IndicesAndShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -653,7 +677,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
       self.assertEqual(q.num_accumulated().eval(), 2)
 
-      val = sess.run(q.take_indexed_slices_grad(1))
+      val = self.evaluate(q.take_indexed_slices_grad(1))
       self.assertAllEqual(val.indices, [0, 2])
       self.assertAllEqual(val.values, [[0, 0, 1], [3, 0, 4]])
       self.assertAllEqual(val.dense_shape, [3, 3])
diff --git a/tensorflow/python/kernel_tests/sparse_cross_op_test.py b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
index 6e0714da702a09735ca10f7bb8658ecb25cbe8fb..566bbb56f007eacfd11bce3f1f2d78a8b22755a1 100644
--- a/tensorflow/python/kernel_tests/sparse_cross_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
@@ -24,12 +24,14 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
 class SparseCrossOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_simple(self):
     """Tests a simple scenario."""
     op = sparse_ops.sparse_cross([
@@ -43,8 +45,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_dense(self):
     """Tests only dense inputs."""
     op = sparse_ops.sparse_cross([
@@ -63,8 +66,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_integer_mixed_string_sparse(self):
     """Tests mixed type."""
     op = sparse_ops.sparse_cross([
@@ -77,8 +81,9 @@ class SparseCrossOpTest(test.TestCase):
         '55555_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_integer_mixed_string_dense(self):
     """Tests mixed dense inputs."""
     op = sparse_ops.sparse_cross([
@@ -95,8 +100,9 @@ class SparseCrossOpTest(test.TestCase):
         '999999_X_batch2-FC2-F1', '999999_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_sparse_cross_dense(self):
     """Tests sparse and dense inputs."""
     op = sparse_ops.sparse_cross([
@@ -112,8 +118,9 @@ class SparseCrossOpTest(test.TestCase):
             'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
         ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_integer_sparse_input(self):
     """Tests mixed type sparse and dense inputs."""
     op = sparse_ops.sparse_cross([
@@ -128,8 +135,9 @@ class SparseCrossOpTest(test.TestCase):
             '5555_X_batch2-FC2-F1', '5555_X_batch2-FC2-F2'
         ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_permutation_3x3x3(self):
     """Tests 3x3x3 permutation."""
     op = sparse_ops.sparse_cross([
@@ -170,8 +178,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch1-FC1-F3_X_batch1-FC2-F3_X_batch1-FC3-F3'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_permutation_3x1x2(self):
     """Tests 3x1x2 permutation."""
     op = sparse_ops.sparse_cross([
@@ -189,8 +198,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_large_batch(self):
     """Tests with large batch size to force multithreading."""
     batch_size = 5000
@@ -222,8 +232,9 @@ class SparseCrossOpTest(test.TestCase):
 
     expected_out = self._sparse_tensor(col_out)
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_one_column_empty(self):
     """Tests when one column is empty.
 
@@ -235,8 +246,9 @@ class SparseCrossOpTest(test.TestCase):
         self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
     ])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_empty(sess.run(op))
+      self._assert_sparse_tensor_empty(self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_some_columns_empty(self):
     """Tests when more than one columns are empty.
 
@@ -254,8 +266,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2'
     ]], 2)
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_all_columns_empty(self):
     """Tests when all columns are empty.
 
@@ -267,8 +280,9 @@ class SparseCrossOpTest(test.TestCase):
         self._sparse_tensor([])
     ])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_empty(sess.run(op))
+      self._assert_sparse_tensor_empty(self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_hashed_zero_bucket_no_hash_key(self):
     op = sparse_ops.sparse_cross_hashed([
         self._sparse_tensor([['batch1-FC1-F1']]),
@@ -278,8 +292,9 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[1971693436396284976]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_hashed_zero_bucket(self):
     op = sparse_ops.sparse_cross_hashed(
         [
@@ -291,9 +306,10 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[4847552627144134031]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   # TODO(sibyl-Aix6ihai): Add benchmark to compare Hashed vs Non-hashed.
+  @test_util.run_deprecated_v1
   def test_hashed_no_hash_key(self):
     op = sparse_ops.sparse_cross_hashed(
         [
@@ -305,8 +321,9 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[83]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_hashed_output(self):
     op = sparse_ops.sparse_cross_hashed(
         [
@@ -319,8 +336,9 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[31]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_hashed__has_no_collision(self):
     """Tests that fingerprint concatenation has no collisions."""
     # Although the last 10 bits of 359 and 1024+359 are identical.
@@ -331,7 +349,7 @@ class SparseCrossOpTest(test.TestCase):
         [t2, t1], num_buckets=1024, hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
     cross_dense = sparse_ops.sparse_tensor_to_dense(cross)
     with session.Session():
-      values = cross_dense.eval()
+      values = self.evaluate(cross_dense)
       self.assertTrue(numpy.not_equal(values[0], values[1]).all())
 
   def test_hashed_3x1x2(self):
@@ -345,7 +363,7 @@ class SparseCrossOpTest(test.TestCase):
         ],
         num_buckets=1000)
     with self.cached_session() as sess:
-      out = sess.run(op)
+      out = self.evaluate(op)
       self.assertEqual(6, len(out.values))
       self.assertAllEqual([[0, i] for i in range(6)], out.indices)
       self.assertTrue(all(x < 1000 and x >= 0 for x in out.values))
diff --git a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
index 90009fc33e62f886ac95c073454fa4027b3642e4..2e17a9c608fcf64ad7e8eb48476bbfa0215ce178 100644
--- a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -48,7 +49,7 @@ class SparseMatMulTest(test.TestCase):
                      sp_b=False,
                      x_dtype=dtypes.float32,
                      y_dtype=dtypes.float32):
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       tf_x = math_ops.cast(x, x_dtype)
       tf_y = math_ops.cast(y, y_dtype)
       tf_ans = math_ops.matmul(
@@ -58,7 +59,7 @@ class SparseMatMulTest(test.TestCase):
           transpose_b=tr_b,
           a_is_sparse=sp_a,
           b_is_sparse=sp_b)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       np_x = math_ops.cast(tf_x, dtypes.float32).eval()
       np_y = math_ops.cast(tf_y, dtypes.float32).eval()
 
@@ -71,6 +72,7 @@ class SparseMatMulTest(test.TestCase):
     self.assertShapeEqual(np_ans, tf_ans)
     self.assertAllCloseAccordingToType(np_ans, out, rtol=1e-4, atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     x = np.arange(0., 4.).reshape([4, 1]).astype(np.float32)
     y = np.arange(-1., 1.).reshape([1, 2]).astype(np.float32)
@@ -78,6 +80,7 @@ class SparseMatMulTest(test.TestCase):
       for y_dtype in (dtypes.float32, dtypes.bfloat16):
         self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype)
 
+  @test_util.run_deprecated_v1
   def testZeroDim(self):
     x = np.ones((4, 0)).astype(np.float32)
     y = np.ones((0, 3)).astype(np.float32)
@@ -85,6 +88,7 @@ class SparseMatMulTest(test.TestCase):
       for y_dtype in (dtypes.float32, dtypes.bfloat16):
         self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype)
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     x = np.ones((0, 0)).astype(np.float32)
     y = np.ones((0, 0)).astype(np.float32)
@@ -93,6 +97,7 @@ class SparseMatMulTest(test.TestCase):
         self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype)
 
   # Tests setting one dimension to be a high value.
+  @test_util.run_deprecated_v1
   def testLarge(self):
     r1 = np.random.randint(6000, 20000)
     r2 = np.random.randint(1, 10)
@@ -105,6 +110,7 @@ class SparseMatMulTest(test.TestCase):
           self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype)
 
   # Tests random sized matrices.
+  @test_util.run_deprecated_v1
   def testRandom(self):
     for tr_a in [True, False]:
       for tr_b in [True, False]:
@@ -159,6 +165,7 @@ class MatMulGradientTest(test.TestCase):
               delta=delta))
     self.assertLessEqual(err, delta / 2.)
 
+  @test_util.run_deprecated_v1
   def testGradientInput(self):
     for tr_a in [True, False]:
       for tr_b in [True, False]:
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 79efee3f5b87c6aa1e4b3adf24862bd496027e33..7598991489ce6019352e19cb6c50819d91085b0d 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -71,8 +72,9 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtype),
         constant_op.constant(shape, dtypes.int64))
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_5x6(dtypes.int32)
       output = sparse_ops.sparse_to_indicator(sp_input, 50).eval()
 
@@ -83,8 +85,9 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
+  @test_util.run_deprecated_v1
   def testInt64(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_5x6(dtypes.int64)
       output = sparse_ops.sparse_to_indicator(sp_input, 50).eval()
 
@@ -95,8 +98,9 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_2x3x4(dtypes.int64)
       output = sparse_ops.sparse_to_indicator(sp_input, 200).eval()
 
@@ -147,63 +151,63 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
   def testInt32AndFloat32(self):
     vocab_size = 50
     indices_v, values_v = self._SparseTensorValue_3x50(np.int32, np.float32)
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       for indices in (indices_v,
                       sparse_tensor.SparseTensor.from_value(indices_v)):
         for values in (values_v,
                        sparse_tensor.SparseTensor.from_value(values_v)):
           sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-          output = sess.run(sp_output)
+          output = self.evaluate(sp_output)
           self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat32(self):
     vocab_size = 50
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64(self):
     vocab_size = 50
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt32AndFloat32NonCanonicalOrder(self):
     vocab_size = 50
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices, values = self._SparseTensor_3x50(np.int32, np.float32)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsNotSorted(output, vocab_size)
 
   def testInt64AndFloat32NonCanonicalOrder(self):
     vocab_size = 50
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsNotSorted(output, vocab_size)
 
   def testInt64AndFloat64NonCanonicalOrder(self):
     vocab_size = 50
     vocab_size_tensor = constant_op.constant(vocab_size, dtypes.int64)
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size_tensor, already_sorted=True)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsNotSorted(output, vocab_size)
 
   def testShouldSetLastDimensionInDynamicShape(self):
@@ -257,29 +261,29 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32(self):
     vocab_size = [50, 31]
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64(self):
     vocab_size = [50, 31]
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64Shape(self):
     vocab_size = [50, 30]
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
 
@@ -296,32 +300,33 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
   def _SparseTensor_5x6(self):
     return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_5x6())
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
         to_retain = np.array([1, 0, 0, 1, 1, 0], dtype=np.bool)
         sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
 
-        output = sess.run(sp_output)
+        output = self.evaluate(sp_output)
 
         self.assertAllEqual(output.indices, [[0, 0], [1, 4], [3, 2]])
         self.assertAllEqual(output.values, [0, 14, 32])
         self.assertAllEqual(output.dense_shape, [5, 6])
 
   def testRetainNone(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_5x6()
       to_retain = np.zeros((6,), dtype=np.bool)
       sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, np.array([]).reshape((0, 2)))
       self.assertAllEqual(output.values, [])
       self.assertAllEqual(output.dense_shape, [5, 6])
 
   def testMismatchedRetainShape(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_5x6()
       to_retain = np.array([1, 0, 0, 1, 0], dtype=np.bool)
       with self.assertRaises(ValueError):
@@ -353,40 +358,44 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
     return sparse_tensor.SparseTensorValue(self._IND_2_5_6, self._VAL_2_5_6,
                                            self._SHP_2_5_6)
 
+  @test_util.run_deprecated_v1
   def testStaticShapeInfoPreservedWhenNewShapeIsProvidedAndStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 6, 7], dtype=np.int64)
     sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
     self.assertAllEqual([3, 6, 7], sp_output.get_shape())
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, [[0, 0, 0], [0, 1, 0], [0, 1, 3],
                                            [1, 1, 4], [1, 3, 2], [1, 3, 3]])
       self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33])
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
+  @test_util.run_deprecated_v1
   def testInputUnavailableInGraphConstructionOk(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorValue_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, [[0, 0, 0], [0, 1, 0], [0, 1, 3],
                                            [1, 1, 4], [1, 3, 2], [1, 3, 3]])
       self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33])
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
+  @test_util.run_deprecated_v1
   def testFeedInputUnavailableInGraphConstructionOk(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = array_ops.sparse_placeholder(dtype=dtypes.int32)
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
@@ -400,11 +409,11 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
   def testTightBoundingBox(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, [[0, 0, 0], [0, 1, 0], [0, 1, 3],
                                            [1, 1, 4], [1, 3, 2], [1, 3, 3]])
@@ -412,26 +421,28 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [2, 4, 5])
 
   def testTightBoundingBoxEmpty(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6_Empty()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices.shape, [0, 3])
       self.assertAllEqual(output.values.shape, [0])
       self.assertAllEqual(output.dense_shape, [0, 0, 0])
 
+  @test_util.run_deprecated_v1
   def testInvalidRank(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_2x5x6()
       new_shape = np.array([3, 7], dtype=np.int64)
 
       with self.assertRaises(ValueError):
         sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
+  @test_util.run_deprecated_v1
   def testInvalidRankNewShapeUnavailableInGraphConstruction(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       new_shape = array_ops.placeholder(dtype=dtypes.int64)
       sp_input = self._SparseTensor_2x5x6()
       out = sparse_ops.sparse_reset_shape(sp_input, new_shape)
@@ -439,6 +450,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("x == y did not hold element-wise"):
         sess.run(out, feed_dict={new_shape: np.array([3, 7], dtype=np.int64)})
 
+  @test_util.run_deprecated_v1
   def testInvalidDimensionSizeStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 7, 5], dtype=np.int64)
@@ -446,8 +458,9 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "should have dimension sizes"):
       sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
+  @test_util.run_deprecated_v1
   def testInvalidDimensionSizeDynamic(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6()
       new_shape = array_ops.placeholder(dtype=dtypes.int32)
       out = sparse_ops.sparse_reset_shape(sp_input, new_shape)
@@ -455,9 +468,10 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("x <= y did not hold element-wise"):
         sess.run(out, feed_dict={new_shape: [3, 7, 5]})
 
+  @test_util.run_deprecated_v1
   def testInvalidDimensionSizeInputUnavailableInGraphConstruction(self):
     sp_input = array_ops.sparse_placeholder(dtype=dtypes.int32)
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       new_shape = np.array([3, 7, 5], dtype=np.int64)
       out = sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
@@ -496,8 +510,9 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtypes.int32),
         constant_op.constant(shape, dtypes.int64))
 
+  @test_util.run_deprecated_v1
   def testFillNumber(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
         sp_output, empty_row_indicator = (
             sparse_ops.sparse_fill_empty_rows(sp_input, -1))
@@ -513,8 +528,9 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(empty_row_indicator_out,
                             np.array([0, 0, 1, 0, 1]).astype(np.bool))
 
+  @test_util.run_deprecated_v1
   def testFillFloat(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       values = constant_op.constant(
           [0.0, 10.0, 13.0, 14.0, 32.0, 33.0], dtype=dtypes.float64)
       default_value = constant_op.constant(-1.0, dtype=dtypes.float64)
@@ -547,8 +563,9 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertGreater(default_value_grad_err, 0)
       self.assertLess(default_value_grad_err, 1e-8)
 
+  @test_util.run_deprecated_v1
   def testFillString(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_String5x6()
       sp_output, empty_row_indicator = (
           sparse_ops.sparse_fill_empty_rows(sp_input, ""))
@@ -565,8 +582,9 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(empty_row_indicator_out,
                           np.array([0, 0, 1, 0, 1]).astype(np.bool))
 
+  @test_util.run_deprecated_v1
   def testNoEmptyRows(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x6()
       sp_output, empty_row_indicator = (
           sparse_ops.sparse_fill_empty_rows(sp_input, -1))
@@ -582,6 +600,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
 
 class SparseAddTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testValuesInVariable(self):
     indices = constant_op.constant([[1]], dtype=dtypes.int64)
     values = variables.Variable([1], trainable=False, dtype=dtypes.float32)
@@ -590,9 +609,9 @@ class SparseAddTest(test_util.TensorFlowTestCase):
     sp_input = sparse_tensor.SparseTensor(indices, values, shape)
     sp_output = sparse_ops.sparse_add(sp_input, sp_input)
 
-    with self.test_session(use_gpu=False) as sess:
-      sess.run(variables.global_variables_initializer())
-      output = sess.run(sp_output)
+    with self.session(use_gpu=False) as sess:
+      self.evaluate(variables.global_variables_initializer())
+      output = self.evaluate(sp_output)
       self.assertAllEqual(output.values, [2])
 
 
@@ -635,7 +654,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
       else:
         tf_dense_ans = sparse_ops.sparse_reduce_max(sp_t, reduction_axes,
                                                     keep_dims)
-      out_dense = tf_dense_ans.eval()
+      out_dense = self.evaluate(tf_dense_ans)
 
       if do_sum:
         tf_sparse_ans = sparse_ops.sparse_reduce_sum_sparse(sp_t,
@@ -657,13 +676,14 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
     self._compare(sp_t, reduction_axes, ndims, True, False)
     self._compare(sp_t, reduction_axes, ndims, True, True)
 
+  @test_util.run_deprecated_v1
   def testSimpleAndRandomInputs(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
 
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
 
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       self._compare_all(sp_t, None, ndims=2)
       self._compare_all(sp_t, 0, ndims=2)
       self._compare_all(sp_t, [1], ndims=2)
@@ -674,7 +694,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
     np.random.seed(1618)
     test_dims = [(1618, 1, 11, 7, 1), (1,), (1, 1, 1)]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for dims in test_dims:
         sp_t, unused_nnz = _sparsify(np.random.randn(*dims))
         # reduce all using None
@@ -686,7 +706,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
   def testInvalidAxes(self):
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       with self.assertRaisesOpError("Invalid reduction dimension -3"):
         sparse_ops.sparse_reduce_sum(sp_t, -3).eval()
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
@@ -696,13 +716,14 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
         sparse_ops.sparse_reduce_max(sp_t, 2).eval()
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
 
     np.random.seed(8161)
     test_dims = [(11, 1, 5, 7, 1), (2, 2)]
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for dims in test_dims:
         sp_t, nnz = _sparsify(np.random.randn(*dims))
         # reduce random axes from 1D to N-D
@@ -710,18 +731,59 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
           axes = np.random.choice(len(dims), size=d, replace=False).tolist()
           reduced = sparse_ops.sparse_reduce_sum(sp_t, axes)
 
-          err = gradient_checker.compute_gradient_error(sp_t.values, (nnz,),
-                                                        reduced,
-                                                        reduced.eval().shape)
+          err = gradient_checker.compute_gradient_error(
+              sp_t.values, (nnz,), reduced,
+              self.evaluate(reduced).shape)
           self.assertLess(err, 1e-3)
 
         # Tests for negative axes.
         reduced = sparse_ops.sparse_reduce_sum(sp_t, -1)
-        err = gradient_checker.compute_gradient_error(sp_t.values, (nnz,),
-                                                      reduced,
-                                                      reduced.eval().shape)
+        err = gradient_checker.compute_gradient_error(
+            sp_t.values, (nnz,), reduced,
+            self.evaluate(reduced).shape)
         self.assertLess(err, 1e-3)
 
+  def _testSparseReduceShape(self, sp_t, reduction_axes, ndims, keep_dims,
+                             do_sum):
+    densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+
+    np_op = np.sum
+    tf_op = sparse_ops.sparse_reduce_sum
+    if not do_sum:
+      np_op = np.max
+      tf_op = sparse_ops.sparse_reduce_max
+
+    np_ans = densified
+    if reduction_axes is None:
+      np_ans = np_op(np_ans, keepdims=keep_dims)
+    else:
+      if not isinstance(reduction_axes, list):  # Single scalar.
+        reduction_axes = [reduction_axes]
+      reduction_axes = np.array(reduction_axes).astype(np.int32)
+      # Handles negative axes.
+      reduction_axes = (reduction_axes + ndims) % ndims
+      # Loop below depends on sorted.
+      reduction_axes.sort()
+      for ra in reduction_axes.ravel()[::-1]:
+        np_ans = np_op(np_ans, axis=ra, keepdims=keep_dims)
+
+    tf_ans = tf_op(sp_t, reduction_axes, keep_dims)
+    self.assertAllEqual(np_ans.shape, tf_ans.get_shape().as_list())
+
+  def testSparseReduceSumOrMaxShape(self):
+    sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
+
+    with self.session(use_gpu=False):
+      for do_sum in [True, False]:
+        for keep_dims in [True, False]:
+          self._testSparseReduceShape(sp_t, None, 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, 0, 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [1], 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [0, 1], 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [1, 0], 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [-1], 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [1, -2], 2, keep_dims, do_sum)
+
 
 class SparseMathOpsTest(test_util.TensorFlowTestCase):
 
@@ -737,12 +799,26 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
                                                result_tensor.values).eval()
     self.assertAllEqual(result_np, res_densified)
 
+  @test_util.run_deprecated_v1
+  def testCwiseShapeValidation(self):
+    # Test case for GitHub 24072.
+    with self.session(use_gpu=False):
+      a = array_ops.ones([3, 4, 1], dtype=dtypes.int32)
+      b = sparse_tensor.SparseTensor([[0, 0, 1, 0], [0, 0, 3, 0]], [10, 20],
+                                     [1, 1, 4, 2])
+      c = a * b
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "broadcasts dense to sparse only; got incompatible shapes"):
+        c.eval()
+
+  @test_util.run_deprecated_v1
   def testCwiseDivAndMul(self):
     np.random.seed(1618)
     sp_shapes = [(10, 10, 10), (5, 5), (1618,), (3, 3, 7)]
     dense_shapes = [(10, 10, 1), (5, 5), (1,), (1, 7)]
 
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for dtype in [np.float32, np.float64, np.int32, np.int64]:
         for sp_shape, dense_shape in zip(sp_shapes, dense_shapes):
           sp_vals_np = np.random.rand(*sp_shape).astype(dtype) + 1
@@ -760,8 +836,9 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
             res = sp_t / dense_t  # should invoke "__truediv__"
             self.assertEqual(res.values.eval().dtype, np.float64)
 
+  @test_util.run_deprecated_v1
   def testCwiseAdd(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       # Identity(2) + AllOnes(2,2).  Should be equal to 2 * Identity(2).
       indices = [[0, 0], [1, 1]]
       vals = [1, 1]
@@ -779,12 +856,13 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
           sparse_ops.sparse_dense_cwise_add(sp_t, dense_t),
           np.identity(2) * 2, sp_t)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     np.random.seed(1618)
     sp_shapes = [(10, 10, 10), (5, 5), (1618,), (3, 3, 7)]
     dense_shapes = [(10, 10, 1), (5, 5), (1,), (1, 7)]
 
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for dtype in [np.float32, np.float64]:
         for sp_shape, dense_shape in zip(sp_shapes, dense_shapes):
           sp_vals_np = np.random.rand(*sp_shape).astype(dtype) + 1
@@ -812,6 +890,7 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
 
 class SparseSoftmaxTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testEquivalentToDensified(self):
     np.random.seed(1618)
     n, m = np.random.choice(20, size=2)
@@ -822,7 +901,7 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       batched_sp_t, unused_nnz1 = _sparsify(
           sp_vals_np.reshape((1, n, m)), thresh=0.)  # No masking.
 
-      with self.test_session(use_gpu=False):
+      with self.cached_session(use_gpu=False):
         densified = constant_op.constant(sp_vals_np)
 
         sp_result = sparse_ops.sparse_softmax(batched_sp_t).eval(
@@ -831,6 +910,7 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
 
         self.assertAllClose(dense_result.eval(), sp_result)
 
+  @test_util.run_deprecated_v1
   def testHigherRanks(self):
     # For the first shape:
     # First batch:
@@ -853,16 +933,17 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       sp_t, unused_nnz = _sparsify(values, thresh=1e-2)
       expected_values = [1., 1., 1., .5, .5]
 
-      with self.test_session(use_gpu=False):
+      with self.cached_session(use_gpu=False):
         result = sparse_ops.sparse_softmax(sp_t).eval()
 
         self.assertAllEqual(expected_values, result.values)
         self.assertAllEqual(sp_t.indices.eval(), result.indices)
         self.assertAllEqual(shape, result.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [2, 5, 10]
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       for dtype in [np.float32, np.float64]:
         x_np = np.random.randn(*x_shape).astype(dtype)
         x_tf, nnz = _sparsify(x_np)
@@ -879,8 +960,9 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       # 1-D, values at index 0.
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [7])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [7])
@@ -898,6 +980,7 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
       self._assertSparseTensorValueEqual(expected.eval(), max_tf)
       self._assertSparseTensorValueEqual(expected.eval(), min_tf)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     np.random.seed(1618)
     shapes = [(13,), (6, 8), (1, 7, 1)]
@@ -908,7 +991,7 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
         sp_a, unused_a_nnz = _sparsify(a_np, thresh=-.5)
         sp_b, unused_b_nnz = _sparsify(b_np, thresh=-.5)
 
-        with self.test_session(use_gpu=False):
+        with self.cached_session(use_gpu=False):
           maximum_tf = sparse_ops.sparse_maximum(sp_a, sp_b)
           maximum_tf_densified = sparse_ops.sparse_tensor_to_dense(
               maximum_tf).eval()
@@ -925,7 +1008,7 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
             np.minimum(a_densified, b_densified), minimum_tf_densified)
 
   def testMismatchedShapes(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_zero = sparse_tensor.SparseTensor([[0, 0]], [0], [1, 1])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [2])
       with self.assertRaisesOpError("Operands do not have the same ranks"):
@@ -939,11 +1022,12 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
 
 class SparseTransposeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testTranspose(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
 
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       np.random.seed(1618)
       shapes = [np.random.randint(1, 10, size=rank) for rank in range(1, 6)]
       for shape in shapes:
@@ -961,16 +1045,19 @@ class SparseTransposeTest(test.TestCase):
 
 class SparsePlaceholderTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testPlaceholder(self):
     foo = array_ops.sparse_placeholder(dtypes.float32, shape=(10, 47))
     self.assertAllEqual([10, 47], foo.get_shape())
     self.assertAllEqual([None, 2], foo.indices.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testPartialShapePlaceholder(self):
     foo = array_ops.sparse_placeholder(dtypes.float32, shape=(None, 47))
     self.assertAllEqual([None, None], foo.get_shape().as_list())
     self.assertAllEqual([None, 2], foo.indices.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testNoShapePlaceholder(self):
     foo = array_ops.sparse_placeholder(dtypes.float32, shape=None)
     self.assertAllEqual(None, foo.get_shape())
diff --git a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
index 18335d665af833fb7d9fef0b517b2c4efc4a005e..93fcc6a18e615d43d8145633e5720ce785ad017c 100644
--- a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import sparse_ops
@@ -56,17 +57,18 @@ class SparseReorderTest(test.TestCase):
     self.assertAllEqual((5, 6), sp_output.get_shape())
 
   def testAlreadyInOrder(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6(np.arange(6))
       sp_output = sparse_ops.sparse_reorder(input_val)
 
-      output_val = sess.run(sp_output)
+      output_val = self.evaluate(sp_output)
       self.assertAllEqual(output_val.indices, input_val.indices)
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedAlreadyInOrder(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6(np.arange(6))
       sp_output = sparse_ops.sparse_reorder(sp_input)
@@ -78,20 +80,21 @@ class SparseReorderTest(test.TestCase):
 
   def testOutOfOrder(self):
     expected_output_val = self._SparseTensorValue_5x6(np.arange(6))
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       for _ in range(5):  # To test various random permutations
         input_val = self._SparseTensorValue_5x6(np.random.permutation(6))
         sp_output = sparse_ops.sparse_reorder(input_val)
 
-        output_val = sess.run(sp_output)
+        output_val = self.evaluate(sp_output)
         self.assertAllEqual(output_val.indices, expected_output_val.indices)
         self.assertAllEqual(output_val.values, expected_output_val.values)
         self.assertAllEqual(output_val.dense_shape,
                             expected_output_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedOutOfOrder(self):
     expected_output_val = self._SparseTensorValue_5x6(np.arange(6))
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       for _ in range(5):  # To test various random permutations
         sp_input = self._SparseTensorPlaceholder()
         input_val = self._SparseTensorValue_5x6(np.random.permutation(6))
@@ -103,8 +106,9 @@ class SparseReorderTest(test.TestCase):
         self.assertAllEqual(output_val.dense_shape,
                             expected_output_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for _ in range(5):  # To test various random permutations
         input_val = self._SparseTensorValue_5x6(np.random.permutation(6))
         sp_input = sparse_tensor.SparseTensor(input_val.indices,
diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
index 89a54c8ab6fb19c79404222365124b72dd3b6f3f..9341228d57e6cea8651b13c70f53ebd229c65b7e 100644
--- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
@@ -64,12 +65,14 @@ class SparseReshapeTest(test.TestCase):
     sp_output = sparse_ops.sparse_reshape(sp_input, shape=(2, -1))
     self.assertAllEqual((2, 3 * 4), sp_output.get_shape())
 
+  @test_util.run_deprecated_v1
   def testRaisesIfMoreThanOneInferredDim(self):
     sp_input = sparse_tensor.SparseTensor.from_value(
         self._SparseTensorValue_2x3x4())
     with self.assertRaisesRegexp(ValueError, "At most one dimension can"):
       sparse_ops.sparse_reshape(sp_input, shape=(-1, 2, -1))
 
+  @test_util.run_deprecated_v1
   def testRaisesIfInferredShapeNotPossible(self):
     sp_input = sparse_tensor.SparseTensor.from_value(
         self._SparseTensorValue_2x3x4())
@@ -77,17 +80,18 @@ class SparseReshapeTest(test.TestCase):
       sparse_ops.sparse_reshape(sp_input, shape=(-1, 7))
 
   def testSameShape(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(input_val, [5, 6])
 
-      output_val = sess.run(sp_output)
+      output_val = self.evaluate(sp_output)
       self.assertAllEqual(output_val.indices, input_val.indices)
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedSameShape(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [5, 6])
@@ -97,8 +101,9 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testWorksWellWithTfShape(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       shape = array_ops.shape(sp_input)  # tf.shape generates int32 output
@@ -109,8 +114,9 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedSameShapeWithInferredDim(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [-1, 6])
@@ -120,8 +126,9 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedNewShapeSameRank(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [3, 10])
@@ -133,8 +140,9 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [3, 10])
 
+  @test_util.run_deprecated_v1
   def testFeedNewShapeSameRankWithInferredDim(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [3, -1])
@@ -147,19 +155,20 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.dense_shape, [3, 10])
 
   def testUpRank(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(input_val, [2, 3, 5])
 
-      output_val = sess.run(sp_output)
+      output_val = self.evaluate(sp_output)
       self.assertAllEqual(output_val.indices,
                           np.array([[0, 0, 0], [0, 1, 1], [0, 1, 4], [0, 2, 0],
                                     [1, 1, 0], [1, 1, 1]]))
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [2, 3, 5])
 
+  @test_util.run_deprecated_v1
   def testFeedUpRank(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [2, 3, 5])
@@ -171,8 +180,9 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [2, 3, 5])
 
+  @test_util.run_deprecated_v1
   def testFeedUpRankWithInferredDim(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [2, -1, 5])
@@ -184,8 +194,9 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [2, 3, 5])
 
+  @test_util.run_deprecated_v1
   def testFeedDownRank(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_2x3x4()
       sp_output = sparse_ops.sparse_reshape(sp_input, [6, 4])
@@ -197,8 +208,9 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [6, 4])
 
+  @test_util.run_deprecated_v1
   def testFeedDownRankWithInferredDim(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_2x3x4()
       sp_output = sparse_ops.sparse_reshape(sp_input, [6, -1])
@@ -210,22 +222,25 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [6, 4])
 
+  @test_util.run_deprecated_v1
   def testFeedMultipleInferredDims(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [4, -1, -1])
       with self.assertRaisesOpError("only one output dimension may be -1"):
         sess.run(sp_output, {sp_input: input_val})
 
+  @test_util.run_deprecated_v1
   def testProvideStaticallyMismatchedSizes(self):
     input_val = self._SparseTensorValue_5x6()
     sp_input = sparse_tensor.SparseTensor.from_value(input_val)
     with self.assertRaisesRegexp(ValueError, "Cannot reshape"):
       sparse_ops.sparse_reshape(sp_input, [4, 7])
 
+  @test_util.run_deprecated_v1
   def testFeedMismatchedSizes(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [4, 7])
@@ -233,16 +248,18 @@ class SparseReshapeTest(test.TestCase):
           "Input to reshape is a tensor with 30 dense values"):
         sess.run(sp_output, {sp_input: input_val})
 
+  @test_util.run_deprecated_v1
   def testFeedMismatchedSizesWithInferredDim(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [4, -1])
       with self.assertRaisesOpError("requested shape requires a multiple"):
         sess.run(sp_output, {sp_input: input_val})
 
+  @test_util.run_deprecated_v1
   def testFeedPartialShapes(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       # Incorporate new rank into shape information if known
       sp_input = self._SparseTensorPlaceholder()
       sp_output = sparse_ops.sparse_reshape(sp_input, [2, 3, 5])
@@ -266,8 +283,9 @@ class SparseReshapeTest(test.TestCase):
       self.assertListEqual(sp_output.indices.get_shape().as_list(), [5, None])
       self.assertListEqual(sp_output.dense_shape.get_shape().as_list(), [None])
 
+  @test_util.run_deprecated_v1
   def testFeedDenseReshapeSemantics(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       # Compute a random rank-5 initial shape and new shape, randomly sparsify
       # it, and check that the output of SparseReshape has the same semantics
       # as a dense reshape.
diff --git a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
index 3847cebc7dcabd66c26a4e4551e5856c6a927a33..5a48eb825dbfa8231062be2d2db33fc0756a690f 100644
--- a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
@@ -68,12 +69,12 @@ class SerializeSparseTest(test.TestCase):
                                       serialize_fn,
                                       deserialize_fn,
                                       out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorValue_5x6(np.arange(6))
       serialized = serialize_fn(sp_input, out_type=out_type)
       sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
 
-      indices, values, shape = sess.run(sp_deserialized)
+      indices, values, shape = self.evaluate(sp_deserialized)
 
       self.assertAllEqual(indices, sp_input[0])
       self.assertAllEqual(values, sp_input[1])
@@ -92,7 +93,7 @@ class SerializeSparseTest(test.TestCase):
                                            serialize_fn,
                                            deserialize_fn,
                                            out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorValue_5x6(np.arange(6))
       serialized = serialize_fn(sp_input, out_type=out_type)
       serialized = array_ops.stack([serialized, serialized])
@@ -110,14 +111,17 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserializeBatch(self):
     self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
                                               sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserializeManyBatch(self):
     self._testSerializeDeserializeBatchHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeBatch(self):
     self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
                                               sparse_ops.deserialize_sparse,
@@ -125,7 +129,7 @@ class SerializeSparseTest(test.TestCase):
 
   def _testSerializeDeserializeBatchInconsistentShapeHelper(
       self, serialize_fn, deserialize_fn, out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorValue_5x6(np.arange(6))
       sp_input1 = self._SparseTensorValue_3x4(np.arange(6))
       serialized0 = serialize_fn(sp_input0, out_type=out_type)
@@ -145,10 +149,12 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input1[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserializeBatchInconsistentShape(self):
     self._testSerializeDeserializeBatchInconsistentShapeHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeBatchInconsistentShape(self):
     self._testSerializeDeserializeBatchInconsistentShapeHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
@@ -158,7 +164,7 @@ class SerializeSparseTest(test.TestCase):
                                                  serialize_fn,
                                                  deserialize_fn,
                                                  out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       sp_input = self._SparseTensorValue_5x6(np.arange(6))
       serialized = serialize_fn(sp_input, out_type=out_type)
       serialized = array_ops.stack([serialized, serialized])
@@ -188,10 +194,12 @@ class SerializeSparseTest(test.TestCase):
 
       self.assertAllEqual(combined_shape, [2, 2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserializeNestedBatch(self):
     self._testSerializeDeserializeNestedBatchHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeNestedBatch(self):
     self._testSerializeDeserializeNestedBatchHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
@@ -201,7 +209,7 @@ class SerializeSparseTest(test.TestCase):
                                                serialize_fn,
                                                deserialize_fn,
                                                out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
@@ -224,14 +232,17 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], input1_val[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testFeedSerializeDeserializeBatch(self):
     self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
                                                   sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testFeedSerializeDeserializeManyBatch(self):
     self._testFeedSerializeDeserializeBatchHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testFeedVariantSerializeDeserializeBatch(self):
     self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
                                                   sparse_ops.deserialize_sparse,
@@ -240,7 +251,7 @@ class SerializeSparseTest(test.TestCase):
   def _testSerializeManyShapeHelper(self,
                                     serialize_many_fn,
                                     out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       # N == 4 because shape_value == [4, 5]
       indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
       values_value = np.array([b"a", b"b", b"c"])
@@ -256,6 +267,7 @@ class SerializeSparseTest(test.TestCase):
           })
       self.assertEqual(serialized_value.shape, (4, 3))
 
+  @test_util.run_deprecated_v1
   def testSerializeManyShape(self):
     self._testSerializeManyShapeHelper(sparse_ops.serialize_many_sparse)
 
@@ -268,7 +280,7 @@ class SerializeSparseTest(test.TestCase):
                                                serialize_many_fn,
                                                deserialize_fn,
                                                out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       # N == 4 because shape_value == [4, 5]
       indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
       values_value = np.array([b"a", b"b", b"c"])
@@ -287,21 +299,25 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(deserialized_value.values, values_value)
       self.assertAllEqual(deserialized_value.dense_shape, shape_value)
 
+  @test_util.run_deprecated_v1
   def testSerializeManyDeserializeBatch(self):
     self._testSerializeManyDeserializeBatchHelper(
         sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testSerializeManyDeserializeManyBatch(self):
     self._testSerializeManyDeserializeBatchHelper(
         sparse_ops.serialize_many_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeManyDeserializeBatch(self):
     self._testSerializeManyDeserializeBatchHelper(
         sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse,
         dtypes.variant)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeScalar(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices_value = np.array([[]], dtype=np.int64)
       values_value = np.array([37], dtype=np.int32)
       shape_value = np.array([], dtype=np.int64)
@@ -321,8 +337,9 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(deserialized_value.values, values_value)
       self.assertAllEqual(deserialized_value.dense_shape, shape_value)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeScalarBatch(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       indices_value = np.array([[]], dtype=np.int64)
       values_value = np.array([37], dtype=np.int32)
       shape_value = np.array([], dtype=np.int64)
@@ -349,7 +366,7 @@ class SerializeSparseTest(test.TestCase):
                                            serialize_fn,
                                            deserialize_fn,
                                            out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
@@ -367,14 +384,17 @@ class SerializeSparseTest(test.TestCase):
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
+  @test_util.run_deprecated_v1
   def testDeserializeFailsWrongType(self):
     self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
                                               sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testDeserializeManyFailsWrongType(self):
     self._testDeserializeFailsWrongTypeHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantDeserializeFailsWrongType(self):
     self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
                                               sparse_ops.deserialize_sparse,
@@ -384,7 +404,7 @@ class SerializeSparseTest(test.TestCase):
                                                   serialize_fn,
                                                   deserialize_fn,
                                                   out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
@@ -402,14 +422,17 @@ class SerializeSparseTest(test.TestCase):
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
+  @test_util.run_deprecated_v1
   def testDeserializeFailsInconsistentRank(self):
     self._testDeserializeFailsInconsistentRankHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testDeserializeManyFailsInconsistentRank(self):
     self._testDeserializeFailsInconsistentRankHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantDeserializeFailsInconsistentRank(self):
     self._testDeserializeFailsInconsistentRankHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
@@ -419,7 +442,7 @@ class SerializeSparseTest(test.TestCase):
                                               serialize_fn,
                                               deserialize_fn,
                                               out_type=dtypes.string):
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       serialized0 = serialize_fn(sp_input0, out_type=out_type)
@@ -431,10 +454,12 @@ class SerializeSparseTest(test.TestCase):
       with self.assertRaisesOpError(r"Could not parse serialized proto"):
         sess.run(sp_deserialized, {sp_input0: input0_val})
 
+  @test_util.run_deprecated_v1
   def testDeserializeFailsInvalidProto(self):
     self._testDeserializeFailsInvalidProtoHelper(sparse_ops.serialize_sparse,
                                                  sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testDeserializeManyFailsInvalidProto(self):
     self._testDeserializeFailsInvalidProtoHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
diff --git a/tensorflow/python/kernel_tests/sparse_slice_op_test.py b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
index 97f30daf4a9c9615e1b42a1ba94e693e166bbc1c..7f8c91bde6748369211f66b50ed253cdcd513a2a 100644
--- a/tensorflow/python/kernel_tests/sparse_slice_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import sparse_ops
 import tensorflow.python.ops.sparse_grad  # pylint: disable=unused-import
@@ -79,8 +80,9 @@ class SparseSliceOpTest(test.TestCase):
     return sparse_tensor.SparseTensor.from_value(
         self._SparseTensorValue_3x4x2())
 
+  @test_util.run_deprecated_v1
   def testSliceMatrixRows(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_4x6()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [2, 6])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [2, 0], [3, 7])
@@ -96,8 +98,9 @@ class SparseSliceOpTest(test.TestCase):
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 6])
 
+  @test_util.run_deprecated_v1
   def testSliceMatrixUnevenCols(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_5x7()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [5, 3])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 3], [5, 2])
@@ -137,8 +140,9 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor3.values.eval(), [16, 46])
       self.assertAllEqual(sp_tensor3.dense_shape.eval(), [5, 1])
 
+  @test_util.run_deprecated_v1
   def testSliceMatrixUnevenRows(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_5x7()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [3, 7])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [3, 0], [3, 7])
@@ -173,8 +177,9 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 7])
     return
 
+  @test_util.run_deprecated_v1
   def testSliceAllRows(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_4x6()
       sp_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [1, 6])
       sp_tensor1 = sparse_ops.sparse_slice(sp_input, [1, 0], [1, 6])
@@ -195,8 +200,9 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor3.values.eval(), [30, 32, 33, 35])
       self.assertAllEqual(sp_tensor3.dense_shape.eval(), [1, 6])
 
+  @test_util.run_deprecated_v1
   def testSliceColumns(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_4x6()
       sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 2])
       sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 2], [5, 2])
@@ -215,8 +221,9 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensor2.values.eval(), [4, 5, 14, 25, 35])
       self.assertAllEqual(sparse_tensor2.dense_shape.eval(), [4, 2])
 
+  @test_util.run_deprecated_v1
   def testSliceAllColumns(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_input = self._SparseTensor_4x6()
       sparse_tensor0 = sparse_ops.sparse_slice(sp_input, [0, 0], [4, 1])
       sparse_tensor1 = sparse_ops.sparse_slice(sp_input, [0, 1], [4, 1])
@@ -246,13 +253,14 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensor5.values.eval(), [5, 25, 35])
       self.assertAllEqual(sparse_tensor5.dense_shape.eval(), [4, 1])
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     sp_input = self._SparseTensor_4x6(val_dtype=np.float32)
     start_and_size = [([0, 0], [4, 2]),
                       ([0, 2], [5, 2]),
                       ([0, 4], [5, 3])]
 
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for start, size in start_and_size:
         sp_output = sparse_ops.sparse_slice(sp_input, start, size)
         nnz_in = len(sp_input.values.eval())
diff --git a/tensorflow/python/kernel_tests/sparse_split_op_test.py b/tensorflow/python/kernel_tests/sparse_split_op_test.py
index 23c6c390b2ef1a816b7dac809f578496cf7c46b4..f4bb7498b02f91abb2f93fb16a7e77b65e27257f 100644
--- a/tensorflow/python/kernel_tests/sparse_split_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_split_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
@@ -75,8 +76,9 @@ class SparseSplitOpTest(test.TestCase):
     return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_3x4x2(
     ))
 
+  @test_util.run_deprecated_v1
   def testSplitMatrixRows(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_tensors = sparse_ops.sparse_split(
           sp_input=self._SparseTensor_4x6(), num_split=2, axis=0)
       self.assertAllEqual(len(sp_tensors), 2)
@@ -92,8 +94,9 @@ class SparseSplitOpTest(test.TestCase):
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensors[1].dense_shape.eval(), [2, 6])
 
+  @test_util.run_deprecated_v1
   def testSplitMatrixUnevenCols(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_tensors_3 = sparse_ops.sparse_split(
           sp_input=self._SparseTensor_5x7(), num_split=3, axis=1)
       self.assertAllEqual(len(sp_tensors_3), 3)
@@ -131,8 +134,9 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sp_tensors_4[3].values.eval(), [16, 46])
       self.assertAllEqual(sp_tensors_4[3].dense_shape.eval(), [5, 1])
 
+  @test_util.run_deprecated_v1
   def testSplitMatrixUnevenRows(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_tensors_2 = sparse_ops.sparse_split(
           sp_input=self._SparseTensor_5x7(), num_split=2, axis=0)
       self.assertAllEqual(sp_tensors_2[0].indices.eval(),
@@ -167,8 +171,9 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sp_tensors_3[2].dense_shape.eval(), [1, 7])
     return
 
+  @test_util.run_deprecated_v1
   def testSplitAllRows(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sp_tensors = sparse_ops.sparse_split(
           sp_input=self._SparseTensor_4x6(), num_split=4, axis=0)
       self.assertAllEqual(len(sp_tensors), 4)
@@ -189,8 +194,9 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sp_tensors[3].values.eval(), [30, 32, 33, 35])
       self.assertAllEqual(sp_tensors[3].dense_shape.eval(), [1, 6])
 
+  @test_util.run_deprecated_v1
   def testSplitColumns(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sparse_tensors = sparse_ops.sparse_split(
           sp_input=self._SparseTensor_4x6(), num_split=3, axis=1)
       self.assertAllEqual(len(sparse_tensors), 3)
@@ -207,8 +213,9 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensors[2].values.eval(), [4, 5, 14, 25, 35])
       self.assertAllEqual(sparse_tensors[2].dense_shape.eval(), [4, 2])
 
+  @test_util.run_deprecated_v1
   def testSplitAllColumns(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       sparse_tensors = sparse_ops.sparse_split(
           sp_input=self._SparseTensor_4x6(), num_split=6, axis=1)
       self.assertAllEqual(len(sparse_tensors), 6)
@@ -234,10 +241,11 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensors[5].values.eval(), [5, 25, 35])
       self.assertAllEqual(sparse_tensors[5].dense_shape.eval(), [4, 1])
 
+  @test_util.run_deprecated_v1
   def testSliceConcat(self):
     for sp_input in (self._SparseTensorValue_3x4x2(),
                      self._SparseTensor_3x4x2()):
-      with self.test_session(use_gpu=False):
+      with self.cached_session(use_gpu=False):
         sparse_tensors = sparse_ops.sparse_split(
             sp_input=sp_input, num_split=2, axis=1)
         concat_tensor = sparse_ops.sparse_concat(1, sparse_tensors)
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
index e8b94294b1b85849760356ca102df44603ae5a3f..fa2bab1fca68000ec54c93bc9cb2ab1cf5b98a4f 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import sparse_ops
 import tensorflow.python.ops.sparse_grad  # pylint: disable=unused-import
@@ -72,7 +73,7 @@ class SparseTensorDenseMatMulGradientTest(test.TestCase):
     matmul = sparse_ops.sparse_tensor_dense_matmul(
         sp_t, dense_t, adjoint_a=adjoint_a, adjoint_b=adjoint_b, name=name)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       dense_t_shape = [m, k] if adjoint_b else [k, m]
       sp_t_val_shape = [nnz]
       err = gradient_checker.compute_gradient_error(
@@ -89,6 +90,7 @@ class SparseTensorDenseMatMulGradientTest(test.TestCase):
         self._testGradients(adjoint_a, adjoint_b, name, values_dtype,
                             indices_dtype)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     np.random.seed(5)  # Fix seed to avoid flakiness
     self._testGradientsType(np.float32, np.int64)
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
index e20c6992525636f5c1683ec71cb219d1723ff705..637cfaec9907a59f7559053792e513739aad293f 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -65,7 +66,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
     x_values = x[np.where(x)]
     x_shape = x.shape
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       sp_x_value = sparse_tensor.SparseTensorValue(
           indices=x_indices, values=x_values, dense_shape=x_shape)
       tf_value_ans = sparse_ops.sparse_tensor_dense_matmul(
@@ -80,7 +81,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
       self.assertEqual(tf_value_ans.get_shape()[1], np_ans.shape[1])
       self.assertEqual(tf_tensor_ans.get_shape()[1], np_ans.shape[1])
 
-      for out in (tf_value_ans.eval(), tf_tensor_ans.eval()):
+      for out in (tf_value_ans.eval(), self.evaluate(tf_tensor_ans)):
         if x.dtype == np.float32:
           self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-4)
         elif x.dtype == np.float64:
@@ -96,6 +97,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
 
     self._testMatmul(x, y, indices_dtype=indices_dtype)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     np.random.seed(127)  # Repeatable results
     self._testBasic(np.int32)
@@ -106,6 +108,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
     self._testBasic(np.int32, indices_dtype=np.int32)
     self._testBasic(np.float32, indices_dtype=np.int32)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     x = np.random.rand(10, 10)
     x[np.abs(x) < 0.5] = 0  # Make it sparse
@@ -133,7 +136,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
 
   def testInvalidIndicesForSparseTensorDenseMatmul(self):
     # Note: use_gpu=False because nice errors are only returned from CPU kernel.
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       indices = np.matrix([[1, 10]]).astype(np.int64)
       values = np.array([10]).astype(np.float32)
       shape = [3, 2]
@@ -166,7 +169,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
     # Note: use_gpu=False because nice errors are only returned from CPU kerne
     if not test.is_gpu_available():
       return
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       indices = np.array([[1, 10]]).astype(np.int64)
       values = np.array([10]).astype(np.float32)
       shape = [3, 2]
@@ -229,6 +232,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
     self._testLarge(np.complex128)
 
   # Tests random sized matrices.
+  @test_util.run_deprecated_v1
   def testFloatRandom(self):
     np.random.seed(127)  # Repeatable results
     for _ in range(8):
diff --git a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
index 31e84341ae6e630bd9ff049b129eae00f3783f44..6039ff1afa74ed3d56dd3974bd10312c4c9870ca 100644
--- a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
@@ -24,9 +24,11 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 # pylint: disable=protected-access
@@ -75,6 +77,7 @@ class SparseTensorsMapTest(test.TestCase):
     shape = np.array([3, 4, 5]).astype(np.int64)
     return sparse_tensor_lib.SparseTensorValue(ind, val, shape)
 
+  @test_util.run_deprecated_v1
   def testAddTakeMany(self):
     with self.session(graph=ops.Graph(), use_gpu=False) as sess:
       sp_input0 = self._SparseTensorValue_5x6(np.arange(6))
@@ -87,7 +90,7 @@ class SparseTensorsMapTest(test.TestCase):
       sp_out = take_many_sparse_from_tensors_map(
           sparse_map_op=handle0.op, sparse_handles=handles_concat)
 
-      combined_indices, combined_values, combined_shape = sess.run(sp_out)
+      combined_indices, combined_values, combined_shape = self.evaluate(sp_out)
 
       self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
       self.assertAllEqual(combined_indices[:6, 1:], sp_input0[0])
@@ -97,8 +100,9 @@ class SparseTensorsMapTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input1[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testFeedAddTakeMany(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_3x4(np.arange(6))
@@ -113,7 +117,8 @@ class SparseTensorsMapTest(test.TestCase):
       sp_roundtrip = take_many_sparse_from_tensors_map(
           sparse_map_op=handle.op, sparse_handles=sparse_handles)
 
-      combined_indices, combined_values, combined_shape = sess.run(sp_roundtrip)
+      combined_indices, combined_values, combined_shape = self.evaluate(
+          sp_roundtrip)
 
       self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
       self.assertAllEqual(combined_indices[:6, 1:], input0_val[0])
@@ -123,8 +128,9 @@ class SparseTensorsMapTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], input1_val[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testAddManyTakeManyRoundTrip(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       # N == 4 because shape_value == [4, 5]
       indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
       values_value = np.array([b"a", b"b", b"c"])
@@ -145,8 +151,9 @@ class SparseTensorsMapTest(test.TestCase):
       self.assertAllEqual(roundtrip_value.values, values_value)
       self.assertAllEqual(roundtrip_value.dense_shape, shape_value)
 
+  @test_util.run_deprecated_v1
   def testDeserializeFailsInconsistentRank(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_1x1x1()
@@ -164,19 +171,20 @@ class SparseTensorsMapTest(test.TestCase):
       with self.assertRaisesOpError(
           r"Inconsistent rank across SparseTensors: rank prior to "
           r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
-        sess.run(sp_roundtrip)
+        self.evaluate(sp_roundtrip)
 
+  @test_util.run_deprecated_v1
   def testTakeManyFailsWrongInputOp(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6(np.arange(6))
       handle = add_sparse_to_tensors_map(input_val)
-      handle_value = sess.run(handle)
+      handle_value = self.evaluate(handle)
       bad_handle = handle_value + 10
       sp_roundtrip = take_many_sparse_from_tensors_map(
           sparse_map_op=handle.op, sparse_handles=[handle_value, bad_handle])
 
       with self.assertRaisesOpError(r"Unable to find SparseTensor: 10"):
-        sess.run(sp_roundtrip)
+        self.evaluate(sp_roundtrip)
 
 
 class BenchmarkSparseTensorsMapVsSerialization(test.Benchmark):
@@ -192,7 +200,7 @@ class BenchmarkSparseTensorsMapVsSerialization(test.Benchmark):
         sorted(zip(indices_batch, indices_value)), dtype=np.int64)
     values = ["feature_value_for_embedding_lookup"] * num_elements
     shape = np.asarray([batch_size, num_elements], dtype=np.int64)
-    with session.Session() as sess:
+    with session.Session(config=benchmark.benchmark_config()) as sess:
       with ops.device("/cpu:0"):
         indices = variables.Variable(indices)
         values = variables.Variable(values)
@@ -211,8 +219,8 @@ class BenchmarkSparseTensorsMapVsSerialization(test.Benchmark):
 
         variables.global_variables_initializer().run()
 
-        st_roundtrip_values = sess.run(st_roundtrip)
-        st_deserialized_values = sess.run(st_deserialized)
+        st_roundtrip_values = self.evaluate(st_roundtrip)
+        st_deserialized_values = self.evaluate(st_deserialized)
         np.testing.assert_equal(st_roundtrip_values.values,
                                 st_deserialized_values.values)
         np.testing.assert_equal(st_roundtrip_values.indices,
diff --git a/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py b/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
index c71746cc99ff31e79b387afd387adb66841efdf0..c6c45db4f9ac50d6986516fc18860d162b70b29c 100644
--- a/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
+++ b/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
@@ -41,84 +42,97 @@ def _SparseToDense(sparse_indices,
 
 class SparseToDenseTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInt(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], 1, 0).eval()
     np_ans = np.array([0, 1, 0, 1, 0]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], 1.0, 0.0).eval()
     np_ans = np.array([0, 1, 0, 1, 0]).astype(np.float32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testString(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], "a", "b").eval()
     np_ans = np.array(["b", "a", "b", "a", "b"]).astype(np.string_)
     self.assertAllEqual(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testSetValue(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], [1, 2], -1).eval()
     np_ans = np.array([-1, 1, -1, 2, -1]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testSetSingleValue(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], 1, -1).eval()
     np_ans = np.array([-1, 1, -1, 1, -1]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def test2d(self):
     # pylint: disable=bad-whitespace
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       tf_ans = _SparseToDense([[1, 3], [2, 0]], [3, 4], 1, -1).eval()
     np_ans = np.array([[-1, -1, -1, -1],
                        [-1, -1, -1,  1],
                        [ 1, -1, -1, -1]]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testZeroDefault(self):
     with self.cached_session():
       x = sparse_ops.sparse_to_dense(2, [4], 7).eval()
       self.assertAllEqual(x, [0, 0, 7, 0])
 
+  @test_util.run_deprecated_v1
   def test3d(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       tf_ans = _SparseToDense([[1, 3, 0], [2, 0, 1]], [3, 4, 2], 1, -1).eval()
     np_ans = np.ones((3, 4, 2), dtype=np.int32) * -1
     np_ans[1, 3, 0] = 1
     np_ans[2, 0, 1] = 1
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testBadShape(self):
     with self.cached_session():
       with self.assertRaisesWithPredicateMatch(ValueError, "must be rank 1"):
         _SparseToDense([1, 3], [[5], [3]], 1, -1)
 
+  @test_util.run_deprecated_v1
   def testBadValue(self):
     with self.cached_session():
       dense = _SparseToDense([1, 3], [5], [[5], [3]], -1)
       with self.assertRaisesOpError(
           r"sparse_values has incorrect shape \[2,1\], "
           r"should be \[\] or \[2\]"):
-        dense.eval()
+        self.evaluate(dense)
 
+  @test_util.run_deprecated_v1
   def testBadNumValues(self):
     with self.cached_session():
       dense = _SparseToDense([1, 3], [5], [1, 2, 3], -1)
       with self.assertRaisesOpError(
           r"sparse_values has incorrect shape \[3\], should be \[\] or \[2\]"):
-        dense.eval()
+        self.evaluate(dense)
 
+  @test_util.run_deprecated_v1
   def testBadDefault(self):
     with self.cached_session():
       dense = _SparseToDense([1, 3], [5], [1, 2], [0])
       with self.assertRaisesOpError("default_value should be a scalar"):
-        dense.eval()
+        self.evaluate(dense)
 
+  @test_util.run_deprecated_v1
   def testOutOfBoundsIndicesWithWithoutValidation(self):
     with self.cached_session():
       dense = _SparseToDense(
@@ -128,7 +142,7 @@ class SparseToDenseTest(test.TestCase):
           default_value=0.0)
       with self.assertRaisesOpError(
           r"indices\[1\] = \[10\] is out of bounds: need 0 <= index < \[5\]"):
-        dense.eval()
+        self.evaluate(dense)
       # Disable checks, the allocation should still fail.
       with self.assertRaisesOpError("out of bounds"):
         dense_without_validation = _SparseToDense(
@@ -137,8 +151,9 @@ class SparseToDenseTest(test.TestCase):
             sparse_values=[-1.0, 1.0],
             default_value=0.0,
             validate_indices=False)
-        dense_without_validation.eval()
+        self.evaluate(dense_without_validation)
 
+  @test_util.run_deprecated_v1
   def testRepeatingIndicesWithWithoutValidation(self):
     with self.cached_session():
       dense = _SparseToDense(
@@ -147,7 +162,7 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0)
       with self.assertRaisesOpError(r"indices\[1\] = \[1\] is repeated"):
-        dense.eval()
+        self.evaluate(dense)
       # Disable checks
       dense_without_validation = _SparseToDense(
           sparse_indices=[[1], [1]],
@@ -155,8 +170,9 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0,
           validate_indices=False)
-      dense_without_validation.eval()
+      self.evaluate(dense_without_validation)
 
+  @test_util.run_deprecated_v1
   def testUnsortedIndicesWithWithoutValidation(self):
     with self.cached_session():
       dense = _SparseToDense(
@@ -165,7 +181,7 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0)
       with self.assertRaisesOpError(r"indices\[1\] = \[1\] is out of order"):
-        dense.eval()
+        self.evaluate(dense)
       # Disable checks
       dense_without_validation = _SparseToDense(
           sparse_indices=[[2], [1]],
@@ -173,10 +189,11 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0,
           validate_indices=False)
-      dense_without_validation.eval()
+      self.evaluate(dense_without_validation)
 
+  @test_util.run_deprecated_v1
   def testShapeInferenceKnownShape(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       indices = array_ops.placeholder(dtypes.int64)
 
       shape = [4, 5, 6]
@@ -187,8 +204,9 @@ class SparseToDenseTest(test.TestCase):
       output = sparse_ops.sparse_to_dense(indices, shape, 1, 0)
       self.assertEqual(output.get_shape().as_list(), [None, None, None])
 
+  @test_util.run_deprecated_v1
   def testShapeInferenceUnknownShape(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       indices = array_ops.placeholder(dtypes.int64)
       shape = array_ops.placeholder(dtypes.int64)
       output = sparse_ops.sparse_to_dense(indices, shape, 1, 0)
diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
index a841fe83a7f585a69ef33c437570359797484a4a..8f0842f7f50c61ea23361fa255ae45deae2ebfc1 100644
--- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -63,34 +64,35 @@ class SparseXentTest(test.TestCase):
 
   def _testXent(self, np_features, np_labels):
     np_loss, np_backprop = self._npXent(np_features, np_labels)
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       loss, backprop = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
           np_features, np_labels)
-      tf_loss, tf_backprop = sess.run([loss, backprop])
+      tf_loss, tf_backprop = self.evaluate([loss, backprop])
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
     self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
 
   def testSingleClass(self):
     for label_dtype in np.int32, np.int64:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         loss, backprop = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
             np.array([[1.], [-1.], [0.]]).astype(np.float32),
             np.array([0, 0, 0]).astype(label_dtype))
-        tf_loss, tf_backprop = sess.run([loss, backprop])
+        tf_loss, tf_backprop = self.evaluate([loss, backprop])
       self.assertAllClose([0.0, 0.0, 0.0], tf_loss)
       self.assertAllClose([[0.0], [0.0], [0.0]], tf_backprop)
 
+  @test_util.run_deprecated_v1
   def testInvalidLabel(self):
     features = [[1., 1., 1., 1.], [1., 1., 1., 1.], [1., 2., 3., 4.],
                 [1., 2., 3., 4.]]
     labels = [4, 3, 0, -1]
 
     if test.is_built_with_cuda() and test.is_gpu_available():
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         loss, backprop = (
             gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
                 features, labels))
-        tf_loss, tf_backprop = sess.run([loss, backprop])
+        tf_loss, tf_backprop = self.evaluate([loss, backprop])
         self.assertAllClose(
             [[np.nan] * 4, [0.25, 0.25, 0.25, -0.75],
              [-0.968, 0.087, 0.237, 0.6439], [np.nan] * 4],
@@ -100,11 +102,11 @@ class SparseXentTest(test.TestCase):
         self.assertAllClose(
             [np.nan, 1.3862, 3.4420, np.nan], tf_loss, rtol=1e-3, atol=1e-3)
 
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       loss, backprop = (
           gen_nn_ops.sparse_softmax_cross_entropy_with_logits(features, labels))
       with self.assertRaisesOpError("Received a label value of"):
-        sess.run([loss, backprop])
+        self.evaluate([loss, backprop])
 
   def testNpXent(self):
     # We create 2 batches of logits for testing.
@@ -141,19 +143,20 @@ class SparseXentTest(test.TestCase):
         np.array([1.3862, 3.4420]), np_loss, rtol=1.e-3, atol=1.e-3)
 
   def testShapeMismatch(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaisesRegexp(ValueError, ".*Rank mismatch:*"):
         nn_ops.sparse_softmax_cross_entropy_with_logits(
             labels=[[0, 2]], logits=[[0., 1.], [2., 3.], [2., 3.]])
 
   def testScalar(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaisesRegexp(ValueError, ".*Logits cannot be scalars*"):
         nn_ops.sparse_softmax_cross_entropy_with_logits(
             labels=constant_op.constant(0), logits=constant_op.constant(1.0))
 
+  @test_util.run_deprecated_v1
   def testLabelsPlaceholderScalar(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       labels = array_ops.placeholder(np.int32)
       y = nn_ops.sparse_softmax_cross_entropy_with_logits(
           labels=labels, logits=[[7.]])
@@ -161,10 +164,10 @@ class SparseXentTest(test.TestCase):
         y.eval(feed_dict={labels: 0})
 
   def testVector(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
           labels=constant_op.constant(0), logits=constant_op.constant([1.0]))
-      self.assertAllClose(0.0, loss.eval())
+      self.assertAllClose(0.0, self.evaluate(loss))
 
   def testFloat(self):
     for label_dtype in np.int32, np.int64:
@@ -187,8 +190,9 @@ class SparseXentTest(test.TestCase):
   def testEmpty(self):
     self._testXent(np.zeros((0, 3)), np.zeros((0,), dtype=np.int32))
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       l = constant_op.constant([3, 0, 1], name="l")
       f = constant_op.constant(
           [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
@@ -201,6 +205,7 @@ class SparseXentTest(test.TestCase):
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
 
+  @test_util.run_deprecated_v1
   def testSecondGradient(self):
     images_placeholder = array_ops.placeholder(dtypes.float32, shape=(3, 2))
     labels_placeholder = array_ops.placeholder(dtypes.int32, shape=(3))
@@ -222,27 +227,30 @@ class SparseXentTest(test.TestCase):
     np_loss, np_backprop = self._npXent(np.array(features), np.array(labels))
     # manually reshape loss
     np_loss = np.reshape(np_loss, np.array(labels).shape)
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
           labels=labels, logits=features)
       backprop = loss.op.inputs[0].op.outputs[1]
-      tf_loss, tf_backprop = sess.run([loss, backprop])
+      tf_loss, tf_backprop = self.evaluate([loss, backprop])
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
     self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
 
+  @test_util.run_deprecated_v1
   def testHighDim(self):
     features = [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]
     labels = [[3], [0]]
     self._testHighDim(features, labels)
 
+  @test_util.run_deprecated_v1
   def testHighDim2(self):
     features = [[[1., 1., 1., 1.], [2., 2., 2., 2.]],
                 [[1., 2., 3., 4.], [5., 6., 7., 8.]]]
     labels = [[3, 2], [0, 3]]
     self._testHighDim(features, labels)
 
+  @test_util.run_deprecated_v1
   def testScalarHandling(self):
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    ".*labels must be 1-D.*"):
         labels = array_ops.placeholder(dtypes.int32, shape=[None, 1])
@@ -318,7 +326,7 @@ def sparse_vs_dense_xent_benchmark(batch_size, num_entries, use_gpu):
   # Using sparse_softmax_cross_entropy_with_logits
   with session.Session(config=config) as sess:
     if not use_gpu:
-      with ops_lib.device("/cpu:0"):
+      with test_util.device("/cpu:0"):
         ops = _sparse_vs_dense_xent_benchmark_sparse(labels, logits)
     else:
       ops = _sparse_vs_dense_xent_benchmark_sparse(labels, logits)
diff --git a/tensorflow/python/kernel_tests/sparsemask_op_test.py b/tensorflow/python/kernel_tests/sparsemask_op_test.py
index 6f5dd45b616c13133a70d82823d4e5030d4e41ea..b1cd0227bc0a71db05c120cff7f70afc7ef1f10e 100644
--- a/tensorflow/python/kernel_tests/sparsemask_op_test.py
+++ b/tensorflow/python/kernel_tests/sparsemask_op_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class SparseMaskTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     values = np.random.rand(4, 4).astype(np.single)
     indices = np.array([0, 2, 3, 4], dtype=np.int32)
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 3f9b029a6ac777fc97c65ecf3d70ac879bb5d116..517db3450f3c43ea0989b59db5ccc7c089e9cec3 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -42,6 +42,7 @@ class SplitOpTest(test.TestCase):
       data -= 1j * data
     return data
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     model_input = array_ops.placeholder(dtypes.float32, shape=(1, 10))
 
@@ -54,13 +55,13 @@ class SplitOpTest(test.TestCase):
     model_input = array_ops.placeholder(dtypes.float32)
     inp = np.zeros((1, 10))
     # check that we still fail at runtime if the shapes were unknown
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       with self.assertRaises(errors_impl.InvalidArgumentError):
         sess.run(array_ops.split(model_input, [4]), {model_input: inp})
 
     # test that we can pass a scalar Tensor as num_splits
     for axis in [0, -2]:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         result = sess.run(
             array_ops.split(
                 array_ops.ones([4, 4]),
@@ -82,15 +83,16 @@ class SplitOpTest(test.TestCase):
     model_input2 = array_ops.placeholder(dtypes.float32, shape=[None, 2])
     result = array_ops.split(model_input2, [2, 2], axis=0)[0]
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       sess.run(result, feed_dict={model_input2: np.ones([4, 2])})
 
+  @test_util.run_deprecated_v1
   def testFailWithoutExplicitNum(self):
     size_splits = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
 
     value = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with self.assertRaises(ValueError) as context:
         sess.run(array_ops.split(value, size_splits), {size_splits: [2, 2, 6]})
       self.assertTrue("Cannot infer num from shape" in str(context.exception))
@@ -209,9 +211,10 @@ class SplitOpTest(test.TestCase):
     self.assertAllEqual(result[:, 0:1], inp_grads[0])
     self.assertAllEqual(result[:, 1:4], inp_grads[1])
 
+  @test_util.run_deprecated_v1
   def testOutputShape(self):
     for axis in [1, -1]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         tensor = array_ops.placeholder(dtypes.float32, shape=[None, 12])
         size_splits = [3, 7, 2]
         outputs = array_ops.split(tensor, size_splits, axis)
@@ -312,21 +315,23 @@ class SplitOpTest(test.TestCase):
 
   def _testGradientsSimple(self, dtype):
     inp = self._makeData((4, 4), dtype)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       inp_tensor = ops.convert_to_tensor(inp)
       s = array_ops.split(value=inp_tensor, num_or_size_splits=4, axis=1)
       inp_grads = [self._makeData((4, 1), dtype)for _ in range(4)]
       grad_tensors = [constant_op.constant(x) for x in inp_grads]
       grad = gradients_impl.gradients(s, [inp_tensor], grad_tensors)[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
     for i in range(4):
       self.assertAllEqual(result[:, i:i + 1], inp_grads[i])
 
+  @test_util.run_deprecated_v1
   def testGradientsAll(self):
     for dtype in _TEST_DTYPES:
       self._testGradientsSimple(dtype)
       self._testGradientsSimpleVariable(dtype)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # split_dim greater than rank of input.
     with self.assertRaises(ValueError):
@@ -356,6 +361,7 @@ class SplitOpTest(test.TestCase):
     for s in splits:
       self.assertEqual(None, s.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testVariableShapeFunction(self):
     # size_splits too big
     with self.assertRaises(ValueError):
@@ -366,6 +372,7 @@ class SplitOpTest(test.TestCase):
     assert s0.shape.as_list() == [2]
     assert s1.shape.as_list() == [1]
 
+  @test_util.run_deprecated_v1
   def testNonexistentDimTensor(self):
     x = array_ops.placeholder(dtypes.int32)
     values = np.zeros([5, 30])
@@ -375,7 +382,7 @@ class SplitOpTest(test.TestCase):
 
     splits = array_ops.placeholder(dtypes.int32, [3])
     y = array_ops.split(values, splits, axis=x)
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "must have exactly one element"):
         sess.run(y, {x: np.array([], dtype=np.int32), splits: [4, 11, 15]})
diff --git a/tensorflow/python/kernel_tests/stack_op_test.py b/tensorflow/python/kernel_tests/stack_op_test.py
index 2a33c594a44a641f8687fb80efbd5aeebe210089..ca3357a0ed8f87cfcccd08a62c5b8526a898b664 100644
--- a/tensorflow/python/kernel_tests/stack_op_test.py
+++ b/tensorflow/python/kernel_tests/stack_op_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import variables
@@ -41,9 +42,10 @@ def np_split_squeeze(array, axis):
 
 class StackOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     np.random.seed(7)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         for dtype in [np.bool, np.float32, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
@@ -54,27 +56,30 @@ class StackOpTest(test.TestCase):
           c = array_ops.stack(xs)
           self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testSimpleParallelCPU(self):
     np.random.seed(7)
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         data = np.random.randn(*shape).astype(np.float32)
         xs = list(map(constant_op.constant, data))
         c = array_ops.parallel_stack(xs)
         self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testSimpleParallelGPU(self):
     np.random.seed(7)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         data = np.random.randn(*shape).astype(np.float32)
         xs = list(map(constant_op.constant, data))
         c = array_ops.parallel_stack(xs)
         self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testConst(self):
     np.random.seed(7)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         for dtype in [np.bool, np.float32, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
@@ -96,9 +101,10 @@ class StackOpTest(test.TestCase):
         b = array_ops.reshape(a, array_ops.stack([2, 3]))
         self.assertAllEqual(b.get_shape(), [2, 3])
 
+  @test_util.run_deprecated_v1
   def testConstParallelCPU(self):
     np.random.seed(7)
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         data = np.random.randn(*shape).astype(np.float32)
         if len(shape) == 1:
@@ -110,9 +116,10 @@ class StackOpTest(test.TestCase):
         c = array_ops.parallel_stack(data)
         self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testConstParallelGPU(self):
     np.random.seed(7)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         data = np.random.randn(*shape).astype(np.float32)
         if len(shape) == 1:
@@ -124,18 +131,20 @@ class StackOpTest(test.TestCase):
         c = array_ops.parallel_stack(data)
         self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testGradientsAxis0(self):
     np.random.seed(7)
     for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
       data = np.random.randn(*shape)
       shapes = [shape[1:]] * shape[0]
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         # TODO(irving): Remove list() once we handle maps correctly
         xs = list(map(constant_op.constant, data))
         c = array_ops.stack(xs)
         err = gradient_checker.compute_gradient_error(xs, shapes, c, shape)
         self.assertLess(err, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testGradientsAxis1(self):
     np.random.seed(7)
     for shape in (2, 3), (3, 2), (4, 3, 2):
@@ -143,16 +152,17 @@ class StackOpTest(test.TestCase):
       shapes = [shape[1:]] * shape[0]
       out_shape = list(shape[1:])
       out_shape.insert(1, shape[0])
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         # TODO(irving): Remove list() once we handle maps correctly
         xs = list(map(constant_op.constant, data))
         c = array_ops.stack(xs, axis=1)
         err = gradient_checker.compute_gradient_error(xs, shapes, c, out_shape)
         self.assertLess(err, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testZeroSizeCPU(self):
     # Verify that stack doesn't crash for zero size inputs
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       for shape in (0,), (3, 0), (0, 3):
         x = np.zeros((2,) + shape).astype(np.int32)
         p = array_ops.stack(list(x)).eval()
@@ -161,9 +171,10 @@ class StackOpTest(test.TestCase):
         p = array_ops.parallel_stack(list(x)).eval()
         self.assertAllEqual(p, x)
 
+  @test_util.run_deprecated_v1
   def testZeroSizeGPU(self):
     # Verify that stack doesn't crash for zero size inputs
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       for shape in (0,), (3, 0), (0, 3):
         x = np.zeros((2,) + shape).astype(np.int32)
         p = array_ops.stack(list(x)).eval()
@@ -172,8 +183,9 @@ class StackOpTest(test.TestCase):
         p = array_ops.parallel_stack(list(x)).eval()
         self.assertAllEqual(p, x)
 
+  @test_util.run_deprecated_v1
   def testAxis0DefaultCPU(self):
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       t = [constant_op.constant([1, 2, 3]), constant_op.constant([4, 5, 6])]
       stacked = array_ops.stack(t).eval()
       parallel_stacked = array_ops.parallel_stack(t).eval()
@@ -182,8 +194,9 @@ class StackOpTest(test.TestCase):
     self.assertAllEqual(stacked, expected)
     self.assertAllEqual(parallel_stacked, expected)
 
+  @test_util.run_deprecated_v1
   def testAxis0DefaultGPU(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       t = [constant_op.constant([1, 2, 3]), constant_op.constant([4, 5, 6])]
       stacked = array_ops.stack(t).eval()
       parallel_stacked = array_ops.parallel_stack(t).eval()
@@ -201,14 +214,14 @@ class StackOpTest(test.TestCase):
       for j in range(-i, i):
         test_arrays = np_split_squeeze(expected, j)
 
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           actual_pack = array_ops.stack(test_arrays, axis=j)
           self.assertEqual(expected.shape, actual_pack.get_shape())
-          actual_pack = actual_pack.eval()
+          actual_pack = self.evaluate(actual_pack)
 
           actual_stack = array_ops.stack(test_arrays, axis=j)
           self.assertEqual(expected.shape, actual_stack.get_shape())
-          actual_stack = actual_stack.eval()
+          actual_stack = self.evaluate(actual_stack)
 
         self.assertNDArrayNear(expected, actual_stack, 1e-6)
 
@@ -225,8 +238,9 @@ class StackOpTest(test.TestCase):
 
 class AutomaticStackingTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(
           [1, 0, 2],
           ops.convert_to_tensor([1, constant_op.constant(0), 2]).eval())
@@ -246,24 +260,27 @@ class AutomaticStackingTest(test.TestCase):
                           ]).eval())
 
   def testWithNDArray(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       result = ops.convert_to_tensor([[[0., 0.],
                                        constant_op.constant([1., 1.])],
                                       np.array(
                                           [[2., 2.], [3., 3.]],
                                           dtype=np.float32)])
       self.assertAllEqual([[[0., 0.], [1., 1.]], [[2., 2.], [3., 3.]]],
-                          result.eval())
+                          self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def testVariable(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       v = variables.Variable(17)
       result = ops.convert_to_tensor([[0, 0, 0], [0, v, 0], [0, 0, 0]])
       v.initializer.run()
-      self.assertAllEqual([[0, 0, 0], [0, 17, 0], [0, 0, 0]], result.eval())
+      self.assertAllEqual([[0, 0, 0], [0, 17, 0], [0, 0, 0]],
+                          self.evaluate(result))
 
       v.assign(38).op.run()
-      self.assertAllEqual([[0, 0, 0], [0, 38, 0], [0, 0, 0]], result.eval())
+      self.assertAllEqual([[0, 0, 0], [0, 38, 0], [0, 0, 0]],
+                          self.evaluate(result))
 
   def testDtype(self):
     t_0 = ops.convert_to_tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]])
@@ -306,8 +323,9 @@ class AutomaticStackingTest(test.TestCase):
     t_2 = ops.convert_to_tensor([t_0, t_0, t_1], dtype=dtypes.float64)
     self.assertEqual(dtypes.float64, t_2.dtype)
 
+  @test_util.run_deprecated_v1
   def testPlaceholder(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # Test using placeholder with a defined shape.
       ph_0 = array_ops.placeholder(dtypes.int32, shape=[])
       result_0 = ops.convert_to_tensor([[0, 0, 0], [0, ph_0, 0], [0, 0, 0]])
@@ -324,6 +342,7 @@ class AutomaticStackingTest(test.TestCase):
       self.assertAllEqual(
           [[0, 0, 0], [0, 2, 0], [0, 0, 0]], result_1.eval(feed_dict={ph_1: 2}))
 
+  @test_util.run_deprecated_v1
   def testShapeErrors(self):
     # Static shape error.
     ph_0 = array_ops.placeholder(dtypes.int32, shape=[1])
@@ -333,7 +352,7 @@ class AutomaticStackingTest(test.TestCase):
     # Dynamic shape error.
     ph_1 = array_ops.placeholder(dtypes.int32)
     result_1 = ops.convert_to_tensor([[0, 0, 0], [0, ph_1, 0], [0, 0, 0]])
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaises(errors_impl.InvalidArgumentError):
         result_1.eval(feed_dict={ph_1: [1]})
 
diff --git a/tensorflow/python/kernel_tests/stack_ops_test.py b/tensorflow/python/kernel_tests/stack_ops_test.py
index afd2eaffab992bca4b3ae7b4f65e0370f325b548..1930d2484fdc986ba8c5ab50df55769aa4fdc45a 100644
--- a/tensorflow/python/kernel_tests/stack_ops_test.py
+++ b/tensorflow/python/kernel_tests/stack_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -33,20 +34,21 @@ from tensorflow.python.platform import test
 class StackOpTest(test.TestCase):
 
   def _testStackPushPop(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
       c = gen_data_flow_ops.stack_push_v2(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
-      self.assertAllClose([[4.0, 5.0]], c1.eval())
+      self.assertAllClose([[4.0, 5.0]], self.evaluate(c1))
 
+  @test_util.run_deprecated_v1
   def testStackPushPop(self):
     self._testStackPushPop(use_gpu=False)
     self._testStackPushPop(use_gpu=True)
 
   def _testStackPushPopSwap(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       a = np.arange(2000)
       x = constant_op.constant(a, dtype=dtypes.float32)
       h = gen_data_flow_ops.stack_v2(
@@ -54,14 +56,15 @@ class StackOpTest(test.TestCase):
       c = gen_data_flow_ops.stack_push_v2(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
-      self.assertAllClose(a, c1.eval())
+      self.assertAllClose(a, self.evaluate(c1))
 
+  @test_util.run_deprecated_v1
   def testStackPushPopSwap(self):
     self._testStackPushPopSwap(use_gpu=False)
     self._testStackPushPopSwap(use_gpu=True)
 
   def _testStackWhileSwap(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       n = constant_op.constant(0)
       h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
@@ -91,14 +94,15 @@ class StackOpTest(test.TestCase):
 
       _, ry = control_flow_ops.while_loop(
           c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
-      self.assertAllClose(np.ones(2000) * 10.0, ry.eval())
+      self.assertAllClose(np.ones(2000) * 10.0, self.evaluate(ry))
 
+  @test_util.run_v1_only("b/120545219")
   def testStackWhileSwap(self):
     self._testStackWhileSwap(use_gpu=False)
     self._testStackWhileSwap(use_gpu=True)
 
   def _testMultiStack(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       h1 = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
       c1 = gen_data_flow_ops.stack_push_v2(h1, 4.0)
@@ -110,15 +114,16 @@ class StackOpTest(test.TestCase):
       with ops.control_dependencies([c2]):
         c2 = gen_data_flow_ops.stack_pop_v2(h2, dtypes.float32)
       r = c1 + c2
-      self.assertAllClose(9.0, r.eval())
+      self.assertAllClose(9.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testMultiStack(self):
     self._testMultiStack(use_gpu=False)
     self._testMultiStack(use_gpu=True)
 
   def _testSameNameStacks(self, use_gpu):
     """Different stacks with the same name do not interfere."""
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       h1 = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
       h2 = gen_data_flow_ops.stack_v2(
@@ -131,34 +136,37 @@ class StackOpTest(test.TestCase):
         pop1 = gen_data_flow_ops.stack_pop_v2(h1, dtypes.float32)
         pop2 = gen_data_flow_ops.stack_pop_v2(h2, dtypes.float32)
 
-      out1, out2 = sess.run([pop1, pop2])
+      out1, out2 = self.evaluate([pop1, pop2])
       self.assertAllClose(out1, 4.0)
       self.assertAllClose(out2, 5.0)
 
+  @test_util.run_deprecated_v1
   def testSameNameStacks(self):
     self._testSameNameStacks(use_gpu=False)
     self._testSameNameStacks(use_gpu=True)
 
   def _testCloseStack(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
       c1 = gen_data_flow_ops.stack_close_v2(h)
-      sess.run(c1)
+      self.evaluate(c1)
 
+  @test_util.run_deprecated_v1
   def testCloseStack(self):
     self._testCloseStack(use_gpu=False)
     self._testCloseStack(use_gpu=True)
 
   def _testPushCloseStack(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
       c = gen_data_flow_ops.stack_push_v2(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_close_v2(h)
-      sess.run(c1)
+      self.evaluate(c1)
 
+  @test_util.run_deprecated_v1
   def testPushCloseStack(self):
     self._testPushCloseStack(use_gpu=False)
     self._testPushCloseStack(use_gpu=True)
@@ -168,33 +176,35 @@ class StackOpRefTest(test.TestCase):
   """Tests for deprecated non-resource variant of stack ops."""
 
   def _testStackPushPop(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c = gen_data_flow_ops.stack_push(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop(h, dtypes.float32)
-      self.assertAllClose([[4.0, 5.0]], c1.eval())
+      self.assertAllClose([[4.0, 5.0]], self.evaluate(c1))
 
+  @test_util.run_deprecated_v1
   def testStackPushPop(self):
     self._testStackPushPop(use_gpu=False)
     self._testStackPushPop(use_gpu=True)
 
   def _testStackPushPopSwap(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       a = np.arange(2000)
       x = constant_op.constant(a, dtype=dtypes.float32)
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c = gen_data_flow_ops.stack_push(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop(h, dtypes.float32)
-      self.assertAllClose(a, c1.eval())
+      self.assertAllClose(a, self.evaluate(c1))
 
+  @test_util.run_deprecated_v1
   def testStackPushPopSwap(self):
     self._testStackPushPopSwap(use_gpu=False)
     self._testStackPushPopSwap(use_gpu=True)
 
   def _testMultiStack(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       h1 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c1 = gen_data_flow_ops.stack_push(h1, 4.0)
       with ops.control_dependencies([c1]):
@@ -204,10 +214,10 @@ class StackOpRefTest(test.TestCase):
       with ops.control_dependencies([c2]):
         c2 = gen_data_flow_ops.stack_pop(h2, dtypes.float32)
       r = c1 + c2
-      self.assertAllClose(9.0, r.eval())
+      self.assertAllClose(9.0, self.evaluate(r))
 
   def _testStackWhileSwap(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       n = constant_op.constant(0)
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
 
@@ -236,47 +246,52 @@ class StackOpRefTest(test.TestCase):
 
       _, ry = control_flow_ops.while_loop(
           c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
-      self.assertAllClose(np.ones(2000) * 10.0, ry.eval())
+      self.assertAllClose(np.ones(2000) * 10.0, self.evaluate(ry))
 
+  @test_util.run_v1_only("b/120545219")
   def testStackWhileSwap(self):
     self._testStackWhileSwap(use_gpu=False)
     self._testStackWhileSwap(use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testMultiStack(self):
     self._testMultiStack(use_gpu=False)
     self._testMultiStack(use_gpu=True)
 
   def _testSameNameStacks(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       h1 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c1 = gen_data_flow_ops.stack_push(h1, 4.0)
       h2 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c2 = gen_data_flow_ops.stack_push(h2, 5.0)
       _ = c1 + c2
-      self.assertNotEqual(h1.eval()[1], h2.eval()[1])
+      self.assertNotEqual(h1.eval()[1], self.evaluate(h2)[1])
 
+  @test_util.run_deprecated_v1
   def testSameNameStacks(self):
     self._testSameNameStacks(use_gpu=False)
     self._testSameNameStacks(use_gpu=True)
 
   def _testCloseStack(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c1 = gen_data_flow_ops.stack_close(h)
-      sess.run(c1)
+      self.evaluate(c1)
 
+  @test_util.run_deprecated_v1
   def testCloseStack(self):
     self._testCloseStack(use_gpu=False)
     self._testCloseStack(use_gpu=True)
 
   def _testPushCloseStack(self, use_gpu):
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c = gen_data_flow_ops.stack_push(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_close(h)
-      sess.run(c1)
+      self.evaluate(c1)
 
+  @test_util.run_deprecated_v1
   def testPushCloseStack(self):
     self._testPushCloseStack(use_gpu=False)
     self._testPushCloseStack(use_gpu=True)
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index dd06d303912813733886b9cf20590513760e67f1..83e06ba48bdbbe3189eafde7d0f42c2e4ced68ab 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -18,6 +18,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -28,6 +29,7 @@ TIMEOUT = 1
 
 class StageTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -41,12 +43,13 @@ class StageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1})
       for i in range(10):
         _, yval = sess.run([stage, y], feed_dict={x: i})
         self.assertAllClose(4 * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testMultiple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -60,13 +63,14 @@ class StageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1})
       for i in range(10):
         _, yval = sess.run([stage, y], feed_dict={x: i})
         self.assertAllClose(
             4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testDictionary(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -85,7 +89,7 @@ class StageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1})
       for i in range(10):
         _, yval = sess.run([stage, y], feed_dict={x: i})
@@ -110,6 +114,7 @@ class StageTest(test.TestCase):
 
     G.finalize()
 
+  @test_util.run_deprecated_v1
   def testPeek(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -126,13 +131,14 @@ class StageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       for i in range(10):
         sess.run(stage, feed_dict={x: i})
 
       for i in range(10):
         self.assertTrue(sess.run(peek, feed_dict={p: i}) == [i])
 
+  @test_util.run_deprecated_v1
   def testSizeAndClear(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -150,7 +156,7 @@ class StageTest(test.TestCase):
 
     G.finalize()
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       sess.run(stage, feed_dict={x: -1})
       self.assertEqual(sess.run(size), 1)
       sess.run(stage, feed_dict={x: -1})
@@ -158,6 +164,7 @@ class StageTest(test.TestCase):
       sess.run(clear)
       self.assertEqual(sess.run(size), 0)
 
+  @test_util.run_deprecated_v1
   def testCapacity(self):
     capacity = 3
 
@@ -181,7 +188,7 @@ class StageTest(test.TestCase):
     queue = Queue.Queue()
     n = 8
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
       # when it hits the staging area's capacity and thus
       # not fill the queue with n tokens
@@ -219,6 +226,7 @@ class StageTest(test.TestCase):
       # It should now be empty
       self.assertTrue(sess.run(size) == 0)
 
+  @test_util.run_deprecated_v1
   def testMemoryLimit(self):
     memory_limit = 512 * 1024  # 512K
     chunk = 200 * 1024  # 256K
@@ -245,7 +253,7 @@ class StageTest(test.TestCase):
     queue = Queue.Queue()
     n = 8
 
-    with self.test_session(use_gpu=True, graph=G) as sess:
+    with self.session(use_gpu=True, graph=G) as sess:
       # Stage data in a separate thread which will block
       # when it hits the staging area's capacity and thus
       # not fill the queue with n tokens
diff --git a/tensorflow/python/kernel_tests/string_join_op_test.py b/tensorflow/python/kernel_tests/string_join_op_test.py
index e4371ab5b933a9bd2cf891f24a254bd14e584e3d..2548e8695fe5861644dbac6481bb01ef18515b3e 100644
--- a/tensorflow/python/kernel_tests/string_join_op_test.py
+++ b/tensorflow/python/kernel_tests/string_join_op_test.py
@@ -17,12 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
 class StringJoinOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testStringJoin(self):
     input0 = ["a", "b"]
     input1 = "a"
diff --git a/tensorflow/python/kernel_tests/string_length_op_test.py b/tensorflow/python/kernel_tests/string_length_op_test.py
index 4afe3ad3f4edbbeb49244c7bbea23e95b1b04620..bfa6ac2454a3fba97abdd4ed8376661a0bc6fd70 100644
--- a/tensorflow/python/kernel_tests/string_length_op_test.py
+++ b/tensorflow/python/kernel_tests/string_length_op_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -29,34 +30,36 @@ class StringLengthOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       lengths = string_ops.string_length(strings)
-      values = sess.run(lengths)
+      values = self.evaluate(lengths)
       self.assertAllEqual(values, [[[1, 2], [3, 4], [5, 6]]])
 
+  @test_util.run_deprecated_v1
   def testUnit(self):
     unicode_strings = [u"H\xc3llo", u"\U0001f604"]
     utf8_strings = [s.encode("utf-8") for s in unicode_strings]
     expected_utf8_byte_lengths = [6, 4]
     expected_utf8_char_lengths = [5, 1]
 
-    with self.test_session() as sess:
+    with self.session() as sess:
       utf8_byte_lengths = string_ops.string_length(utf8_strings, unit="BYTE")
       utf8_char_lengths = string_ops.string_length(
           utf8_strings, unit="UTF8_CHAR")
       self.assertAllEqual(
-          sess.run(utf8_byte_lengths), expected_utf8_byte_lengths)
+          self.evaluate(utf8_byte_lengths), expected_utf8_byte_lengths)
       self.assertAllEqual(
-          sess.run(utf8_char_lengths), expected_utf8_char_lengths)
+          self.evaluate(utf8_char_lengths), expected_utf8_char_lengths)
       with self.assertRaisesRegexp(
           ValueError, "Attr 'unit' of 'StringLength' Op passed string 'XYZ' "
           'not in: "BYTE", "UTF8_CHAR"'):
         string_ops.string_length(utf8_strings, unit="XYZ")
 
+  @test_util.run_deprecated_v1
   def testLegacyPositionalName(self):
     # Code that predates the 'unit' parameter may have used a positional
     # argument for the 'name' parameter.  Check that we don't break such code.
     strings = [[["1", "12"], ["123", "1234"], ["12345", "123456"]]]
     lengths = string_ops.string_length(strings, "some_name")
-    with self.test_session():
+    with self.session():
       self.assertAllEqual(lengths.eval(), [[[1, 2], [3, 4], [5, 6]]])
 
 
diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py
index b968e885edafcbdebd3b32e11c6bdf35e65e7616..0c91deb5220bf268366bbc65dbd001617439fa12 100644
--- a/tensorflow/python/kernel_tests/string_split_op_test.py
+++ b/tensorflow/python/kernel_tests/string_split_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -34,17 +35,18 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
       self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
       self.assertAllEqual(shape, [2, 4])
 
+  @test_util.run_deprecated_v1
   def testStringSplitEmptyDelimiter(self):
     strings = ["hello", "hola", b"\xF0\x9F\x98\x8E"]  # Last string is U+1F60E
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, delimiter="")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                                     [1, 0], [1, 1], [1, 2], [1, 3], [2, 0],
                                     [2, 1], [2, 2], [2, 3]])
@@ -62,7 +64,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
           indices,
           [[1, 0], [2, 0], [3, 0], [5, 0], [6, 0], [7, 0], [8, 0]])
@@ -74,13 +76,14 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, delimiter=" .")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
           indices,
           [[1, 0], [2, 0], [3, 0], [5, 0], [6, 0], [7, 0], [8, 0]])
       self.assertAllEqual(values, [b"a", b"b", b"c", b"d", b"e", b"f", b"g"])
       self.assertAllEqual(shape, [10, 1])
 
+  @test_util.run_deprecated_v1
   def testStringSplitWithDelimiter(self):
     strings = ["hello|world", "hello world"]
 
@@ -92,17 +95,18 @@ class StringSplitOpTest(test.TestCase):
           ValueError, string_ops.string_split, strings, delimiter=["a"])
 
       tokens = string_ops.string_split(strings, delimiter="|")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]])
       self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
       self.assertAllEqual(shape, [2, 2])
 
       tokens = string_ops.string_split(strings, delimiter="| ")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
       self.assertAllEqual(values, [b"hello", b"world", b"hello", b"world"])
       self.assertAllEqual(shape, [2, 2])
 
+  @test_util.run_deprecated_v1
   def testStringSplitWithDelimiterTensor(self):
     strings = ["hello|world", "hello world"]
 
@@ -121,6 +125,7 @@ class StringSplitOpTest(test.TestCase):
       self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
       self.assertAllEqual(shape, [2, 2])
 
+  @test_util.run_deprecated_v1
   def testStringSplitWithDelimitersTensor(self):
     strings = ["hello.cruel,world", "hello cruel world"]
 
@@ -145,7 +150,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, "#", skip_empty=False)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
                                     [1, 0], [1, 1],
                                     [2, 0], [2, 1], [2, 2]])
@@ -154,7 +159,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, "#")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(values, [b"a", b"b", b"c"])
       self.assertAllEqual(indices, [[0, 0], [1, 0], [2, 0]])
       self.assertAllEqual(shape, [3, 1])
@@ -167,7 +172,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
       self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
       self.assertAllEqual(shape, [2, 4])
@@ -182,7 +187,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep="<>")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
           indices, [[0, 0], [0, 1], [0, 2],
                     [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
@@ -200,7 +205,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep=',')
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
                                     [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]])
       self.assertAllEqual(values, [b"1", b"2", b"3",
@@ -217,7 +222,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
                                     [1, 0], [1, 1], [1, 2]])
       self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"])
@@ -233,7 +238,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
                                     [1, 0], [1, 1]])
       self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"])
@@ -249,7 +254,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, maxsplit=1)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
                                     [1, 0], [1, 1]])
       self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5    6  "])
diff --git a/tensorflow/python/kernel_tests/string_strip_op_test.py b/tensorflow/python/kernel_tests/string_strip_op_test.py
index a96b71490e460ba7e9f28f03b1fca7a0c9984571..edff3862ff6984393c497f76943dc460d6f2541c 100644
--- a/tensorflow/python/kernel_tests/string_strip_op_test.py
+++ b/tensorflow/python/kernel_tests/string_strip_op_test.py
@@ -23,14 +23,14 @@ from tensorflow.python.platform import test
 
 
 class StringStripOpTest(test.TestCase):
-  """ Test cases for tf.string_strip."""
+  """ Test cases for tf.strings.strip."""
 
   def test_string_strip(self):
     strings = ["pigs on the wing", "animals"]
 
     with self.cached_session() as sess:
       output = string_ops.string_strip(strings)
-      output = sess.run(output)
+      output = self.evaluate(output)
       self.assertAllEqual(output, [b"pigs on the wing", b"animals"])
 
   def test_string_strip_2d(self):
@@ -39,7 +39,7 @@ class StringStripOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       output = string_ops.string_strip(strings)
-      output = sess.run(output)
+      output = self.evaluate(output)
       self.assertAllEqual(output, [[b"pigs on the wing", b"animals"],
                                    [b"hello", b"world"]])
 
@@ -48,7 +48,7 @@ class StringStripOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       output = string_ops.string_strip(strings)
-      output = sess.run(output)
+      output = self.evaluate(output)
       self.assertAllEqual(output, [b"hello", b"", b"world", b""])
 
 
diff --git a/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py b/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py
index 9cb0c9d18f32803aff5b5c7d1d5643d0742fee05..25f573fc144a6252ce8de3b88adf3874ab7f9bab 100644
--- a/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py
+++ b/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -26,6 +27,7 @@ from tensorflow.python.platform import test
 
 class StringToHashBucketOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testStringToOneHashBucketFast(self):
     with self.cached_session():
       input_string = array_ops.placeholder(dtypes.string)
@@ -34,6 +36,7 @@ class StringToHashBucketOpTest(test.TestCase):
 
       self.assertAllEqual([0, 0, 0], result)
 
+  @test_util.run_deprecated_v1
   def testStringToHashBucketsFast(self):
     with self.cached_session():
       input_string = array_ops.placeholder(dtypes.string)
@@ -46,6 +49,7 @@ class StringToHashBucketOpTest(test.TestCase):
       # Fingerprint64('d') -> 4470636696479570465 -> mod 10 -> 5
       self.assertAllEqual([9, 2, 2, 5], result)
 
+  @test_util.run_deprecated_v1
   def testStringToOneHashBucketLegacyHash(self):
     with self.cached_session():
       input_string = array_ops.placeholder(dtypes.string)
@@ -54,6 +58,7 @@ class StringToHashBucketOpTest(test.TestCase):
 
       self.assertAllEqual([0, 0, 0], result)
 
+  @test_util.run_deprecated_v1
   def testStringToHashBucketsLegacyHash(self):
     with self.cached_session():
       input_string = array_ops.placeholder(dtypes.string)
@@ -70,7 +75,7 @@ class StringToHashBucketOpTest(test.TestCase):
       input_string = constant_op.constant(['a', 'b', 'c'])
       output = string_ops.string_to_hash_bucket_strong(
           input_string, 1, key=[123, 345])
-      self.assertAllEqual([0, 0, 0], output.eval())
+      self.assertAllEqual([0, 0, 0], self.evaluate(output))
 
   def testStringToHashBucketsStrong(self):
     with self.cached_session():
@@ -81,7 +86,7 @@ class StringToHashBucketOpTest(test.TestCase):
       # StrongKeyedHash(key, 'a') -> 7157389809176466784 -> mod 10 -> 4
       # StrongKeyedHash(key, 'b') -> 15805638358933211562 -> mod 10 -> 2
       # StrongKeyedHash(key, 'c') -> 18100027895074076528 -> mod 10 -> 8
-      self.assertAllEqual([4, 2, 8], output.eval())
+      self.assertAllEqual([4, 2, 8], self.evaluate(output))
 
   def testStringToHashBucketsStrongInvalidKey(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/string_to_number_op_test.py b/tensorflow/python/kernel_tests/string_to_number_op_test.py
index 99ee25e1253740653f9c92d3722ecf2f682ca003..49ccfd1028fa5b6dd290a949a841ea7653517431 100644
--- a/tensorflow/python/kernel_tests/string_to_number_op_test.py
+++ b/tensorflow/python/kernel_tests/string_to_number_op_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
@@ -45,6 +46,7 @@ class StringToNumberOpTest(test.TestCase):
         with self.assertRaisesOpError(outstr):
           output.eval(feed_dict={input_string: [instr]})
 
+  @test_util.run_deprecated_v1
   def testToFloat(self):
     self._test(dtypes.float32,
                [("0", 0), ("3", 3), ("-1", -1),
@@ -58,6 +60,7 @@ class StringToNumberOpTest(test.TestCase):
                 ("INF", float("INF"))],
                [("10foobar", _ERROR_MESSAGE + "10foobar")])
 
+  @test_util.run_deprecated_v1
   def testToDouble(self):
     self._test(dtypes.float64,
                [("0", 0), ("3", 3), ("-1", -1),
@@ -71,6 +74,7 @@ class StringToNumberOpTest(test.TestCase):
                 ("INF", float("INF"))],
                [("10foobar", _ERROR_MESSAGE + "10foobar")])
 
+  @test_util.run_deprecated_v1
   def testToInt32(self):
     self._test(dtypes.int32,
                [("0", 0), ("3", 3), ("-1", -1),
@@ -84,6 +88,7 @@ class StringToNumberOpTest(test.TestCase):
                    ("2.9", _ERROR_MESSAGE + "2.9"),
                    ("10foobar", _ERROR_MESSAGE + "10foobar")])
 
+  @test_util.run_deprecated_v1
   def testToInt64(self):
     self._test(dtypes.int64,
                [("0", 0), ("3", 3), ("-1", -1),
diff --git a/tensorflow/python/kernel_tests/substr_op_test.py b/tensorflow/python/kernel_tests/substr_op_test.py
index 37aa624b07e86c68a48d3859bf88d8ef0ce93253..9302152e82bfa9c807a644f73ef1e705594b45f8 100644
--- a/tensorflow/python/kernel_tests/substr_op_test.py
+++ b/tensorflow/python/kernel_tests/substr_op_test.py
@@ -22,6 +22,7 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -51,7 +52,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(3, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -71,7 +72,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(3, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     # Full string
@@ -83,7 +84,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(5, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, test_string)
 
     # Full string (Negative)
@@ -95,7 +96,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(5, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, test_string)
 
     # Length is larger in magnitude than a negative position
@@ -111,7 +112,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(5, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_string)
 
   @parameterized.parameters(
@@ -138,7 +139,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(3, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -173,7 +174,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     position = np.array(-3, dtype)
@@ -188,7 +189,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -229,7 +230,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -271,7 +272,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     # Broadcast input string onto pos/len
@@ -294,7 +295,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     # Test 1D broadcast
@@ -310,7 +311,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -319,6 +320,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, "UTF8_CHAR"),
       (np.int64, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testBadBroadcast(self, dtype, unit):
     test_string = [[b"ten", b"eleven", b"twelve"],
                    [b"thirteen", b"fourteen", b"fifteen"],
@@ -338,6 +340,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, -6, "UTF8_CHAR"),
       (np.int64, -6, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testOutOfRangeError_Scalar(self, dtype, pos, unit):
     # Scalar/Scalar
     test_string = {
@@ -349,7 +352,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, 4, "BYTE"),
@@ -361,6 +364,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, -4, "UTF8_CHAR"),
       (np.int64, -4, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testOutOfRangeError_VectorScalar(self, dtype, pos, unit):
     # Vector/Scalar
     test_string = {
@@ -373,7 +377,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, "BYTE"),
@@ -381,6 +385,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, "UTF8_CHAR"),
       (np.int64, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testOutOfRangeError_MatrixMatrix(self, dtype, unit):
     # Matrix/Matrix
     test_string = {
@@ -398,7 +403,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
     # Matrix/Matrix (with negative)
     position = np.array([[1, 2, -3], [1, 2, -4], [1, 2, -3]], dtype)
@@ -406,7 +411,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, "BYTE"),
@@ -414,6 +419,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, "UTF8_CHAR"),
       (np.int64, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testOutOfRangeError_Broadcast(self, dtype, unit):
     # Broadcast
     test_string = {
@@ -428,7 +434,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
     # Broadcast (with negative)
     position = np.array([-1, -2, -4], dtype)
@@ -436,7 +442,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, "BYTE"),
@@ -444,6 +450,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, "UTF8_CHAR"),
       (np.int64, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testMismatchPosLenShapes(self, dtype, unit):
     test_string = {
         "BYTE": [[b"ten", b"eleven", b"twelve"],
@@ -471,6 +478,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     with self.assertRaises(ValueError):
       string_ops.substr(test_string, position, length)
 
+  @test_util.run_deprecated_v1
   def testWrongDtype(self):
     with self.cached_session():
       with self.assertRaises(TypeError):
@@ -478,6 +486,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(TypeError):
         string_ops.substr(b"test", 3, 1.0)
 
+  @test_util.run_deprecated_v1
   def testInvalidUnit(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
diff --git a/tensorflow/python/kernel_tests/summary_audio_op_test.py b/tensorflow/python/kernel_tests/summary_audio_op_test.py
deleted file mode 100644
index e59a2ceef7e4c8e8099da0b7aa4d8f3bd8b0b124..0000000000000000000000000000000000000000
--- a/tensorflow/python/kernel_tests/summary_audio_op_test.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for summary sound op."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import test
-from tensorflow.python.summary import summary
-
-
-class SummaryAudioOpTest(test.TestCase):
-
-  def _AsSummary(self, s):
-    summ = summary_pb2.Summary()
-    summ.ParseFromString(s)
-    return summ
-
-  def _CheckProto(self, audio_summ, sample_rate, num_channels, length_frames):
-    """Verify that the non-audio parts of the audio_summ proto match shape."""
-    # Only the first 3 sounds are returned.
-    for v in audio_summ.value:
-      v.audio.ClearField("encoded_audio_string")
-    expected = "\n".join("""
-        value {
-          tag: "snd/audio/%d"
-          audio { content_type: "audio/wav" sample_rate: %d
-                  num_channels: %d length_frames: %d }
-        }""" % (i, sample_rate, num_channels, length_frames) for i in xrange(3))
-    self.assertProtoEquals(expected, audio_summ)
-
-  def testAudioSummary(self):
-    np.random.seed(7)
-    for channels in (1, 2, 5, 8):
-      with self.session(graph=ops.Graph()) as sess:
-        num_frames = 7
-        shape = (4, num_frames, channels)
-        # Generate random audio in the range [-1.0, 1.0).
-        const = 2.0 * np.random.random(shape) - 1.0
-
-        # Summarize
-        sample_rate = 8000
-        summ = summary.audio(
-            "snd", const, max_outputs=3, sample_rate=sample_rate)
-        value = sess.run(summ)
-        self.assertEqual([], summ.get_shape())
-        audio_summ = self._AsSummary(value)
-
-        # Check the rest of the proto
-        self._CheckProto(audio_summ, sample_rate, channels, num_frames)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/kernel_tests/summary_image_op_test.py b/tensorflow/python/kernel_tests/summary_image_op_test.py
deleted file mode 100644
index b650e1040424818e06181c0019139127414b41d7..0000000000000000000000000000000000000000
--- a/tensorflow/python/kernel_tests/summary_image_op_test.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for summary image op."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import image_ops
-import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
-from tensorflow.python.platform import test
-from tensorflow.python.summary import summary
-
-
-class SummaryImageOpTest(test.TestCase):
-
-  def _AsSummary(self, s):
-    summ = summary_pb2.Summary()
-    summ.ParseFromString(s)
-    return summ
-
-  def _CheckProto(self, image_summ, shape):
-    """Verify that the non-image parts of the image_summ proto match shape."""
-    # Only the first 3 images are returned.
-    for v in image_summ.value:
-      v.image.ClearField("encoded_image_string")
-    expected = "\n".join("""
-        value {
-          tag: "img/image/%d"
-          image { height: %d width: %d colorspace: %d }
-        }""" % ((i,) + shape[1:]) for i in xrange(3))
-    self.assertProtoEquals(expected, image_summ)
-
-  def testImageSummary(self):
-    for depth in (1, 3, 4):
-      for positive in False, True:
-        with self.session(graph=ops.Graph()) as sess:
-          shape = (4, 5, 7) + (depth,)
-          bad_color = [255, 0, 0, 255][:depth]
-          # Build a mostly random image with one nan
-          const = np.random.randn(*shape).astype(np.float32)
-          const[0, 1, 2] = 0  # Make the nan entry not the max
-          if positive:
-            const = 1 + np.maximum(const, 0)
-            scale = 255 / const.reshape(4, -1).max(axis=1)
-            offset = 0
-          else:
-            scale = 127 / np.abs(const.reshape(4, -1)).max(axis=1)
-            offset = 128
-          adjusted = np.floor(scale[:, None, None, None] * const + offset)
-          const[0, 1, 2, depth // 2] = np.nan
-
-          # Summarize
-          summ = summary.image("img", const)
-          value = sess.run(summ)
-          self.assertEqual([], summ.get_shape())
-          image_summ = self._AsSummary(value)
-
-          # Decode the first image and check consistency
-          image = image_ops.decode_png(image_summ.value[0]
-                                       .image.encoded_image_string).eval()
-          self.assertAllEqual(image[1, 2], bad_color)
-          image[1, 2] = adjusted[0, 1, 2]
-          self.assertAllClose(image, adjusted[0], rtol=2e-5, atol=2e-5)
-
-          # Check the rest of the proto
-          self._CheckProto(image_summ, shape)
-
-  def testImageSummaryUint8(self):
-    np.random.seed(7)
-    for depth in (1, 3, 4):
-      with self.session(graph=ops.Graph()) as sess:
-        shape = (4, 5, 7) + (depth,)
-
-        # Build a random uint8 image
-        images = np.random.randint(256, size=shape).astype(np.uint8)
-        tf_images = ops.convert_to_tensor(images)
-        self.assertEqual(tf_images.dtype, dtypes.uint8)
-
-        # Summarize
-        summ = summary.image("img", tf_images)
-        value = sess.run(summ)
-        self.assertEqual([], summ.get_shape())
-        image_summ = self._AsSummary(value)
-
-        # Decode the first image and check consistency.
-        # Since we're uint8, everything should be exact.
-        image = image_ops.decode_png(image_summ.value[0]
-                                     .image.encoded_image_string).eval()
-        self.assertAllEqual(image, images[0])
-
-        # Check the rest of the proto
-        self._CheckProto(image_summ, shape)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/kernel_tests/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_ops_test.py
deleted file mode 100644
index 0c500120b0b81907e1c6d2a4a70405b4c7b42687..0000000000000000000000000000000000000000
--- a/tensorflow/python/kernel_tests/summary_ops_test.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for summary ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import logging_ops
-from tensorflow.python.platform import test
-from tensorflow.python.summary import summary
-
-
-class SummaryOpsTest(test.TestCase):
-
-  def _AsSummary(self, s):
-    summ = summary_pb2.Summary()
-    summ.ParseFromString(s)
-    return summ
-
-  def testScalarSummary(self):
-    with self.cached_session() as sess:
-      const = constant_op.constant([10.0, 20.0])
-      summ = logging_ops.scalar_summary(["c1", "c2"], const, name="mysumm")
-      value = sess.run(summ)
-    self.assertEqual([], summ.get_shape())
-    self.assertProtoEquals("""
-      value { tag: "c1" simple_value: 10.0 }
-      value { tag: "c2" simple_value: 20.0 }
-      """, self._AsSummary(value))
-
-  def testScalarSummaryDefaultName(self):
-    with self.cached_session() as sess:
-      const = constant_op.constant([10.0, 20.0])
-      summ = logging_ops.scalar_summary(["c1", "c2"], const)
-      value = sess.run(summ)
-    self.assertEqual([], summ.get_shape())
-    self.assertProtoEquals("""
-      value { tag: "c1" simple_value: 10.0 }
-      value { tag: "c2" simple_value: 20.0 }
-      """, self._AsSummary(value))
-
-  def testMergeSummary(self):
-    with self.cached_session() as sess:
-      const = constant_op.constant(10.0)
-      summ1 = summary.histogram("h", const)
-      summ2 = logging_ops.scalar_summary("c", const)
-      merge = summary.merge([summ1, summ2])
-      value = sess.run(merge)
-    self.assertEqual([], merge.get_shape())
-    self.assertProtoEquals("""
-      value {
-        tag: "h"
-        histo {
-          min: 10.0
-          max: 10.0
-          num: 1.0
-          sum: 10.0
-          sum_squares: 100.0
-          bucket_limit: 9.93809490288
-          bucket_limit: 10.9319043932
-          bucket_limit: 1.7976931348623157e+308
-          bucket: 0.0
-          bucket: 1.0
-          bucket: 0.0
-        }
-      }
-      value { tag: "c" simple_value: 10.0 }
-    """, self._AsSummary(value))
-
-  def testMergeAllSummaries(self):
-    with ops.Graph().as_default():
-      const = constant_op.constant(10.0)
-      summ1 = summary.histogram("h", const)
-      summ2 = summary.scalar("o", const, collections=["foo_key"])
-      summ3 = summary.scalar("c", const)
-      merge = summary.merge_all()
-      self.assertEqual("MergeSummary", merge.op.type)
-      self.assertEqual(2, len(merge.op.inputs))
-      self.assertEqual(summ1, merge.op.inputs[0])
-      self.assertEqual(summ3, merge.op.inputs[1])
-      merge = summary.merge_all("foo_key")
-      self.assertEqual("MergeSummary", merge.op.type)
-      self.assertEqual(1, len(merge.op.inputs))
-      self.assertEqual(summ2, merge.op.inputs[0])
-      self.assertTrue(summary.merge_all("bar_key") is None)
-
-  def testHistogramSummaryTypes(self):
-    with ops.Graph().as_default():
-      for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.int32,
-                    dtypes.float32, dtypes.float64):
-        const = constant_op.constant(10, dtype=dtype)
-        summary.histogram("h", const)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/kernel_tests/summary_tensor_op_test.py b/tensorflow/python/kernel_tests/summary_tensor_op_test.py
deleted file mode 100644
index 0f4643393a12a2b1d72faaf22683698be3ee6f3b..0000000000000000000000000000000000000000
--- a/tensorflow/python/kernel_tests/summary_tensor_op_test.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BAvSIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for summary ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import summary_ops
-from tensorflow.python.platform import test
-
-
-class SummaryOpsTest(test.TestCase):
-
-  def _SummarySingleValue(self, s):
-    summ = summary_pb2.Summary()
-    summ.ParseFromString(s)
-    self.assertEqual(len(summ.value), 1)
-    return summ.value[0]
-
-  def _AssertNumpyEq(self, actual, expected):
-    self.assertTrue(np.array_equal(actual, expected))
-
-  def testTags(self):
-    with self.cached_session() as sess:
-      c = constant_op.constant(1)
-      s1 = summary_ops.tensor_summary("s1", c)
-      with ops.name_scope("foo"):
-        s2 = summary_ops.tensor_summary("s2", c)
-        with ops.name_scope("zod"):
-          s3 = summary_ops.tensor_summary("s3", c)
-          s4 = summary_ops.tensor_summary("TensorSummary", c)
-      summ1, summ2, summ3, summ4 = sess.run([s1, s2, s3, s4])
-
-    v1 = self._SummarySingleValue(summ1)
-    self.assertEqual(v1.tag, "s1")
-
-    v2 = self._SummarySingleValue(summ2)
-    self.assertEqual(v2.tag, "foo/s2")
-
-    v3 = self._SummarySingleValue(summ3)
-    self.assertEqual(v3.tag, "foo/zod/s3")
-
-    v4 = self._SummarySingleValue(summ4)
-    self.assertEqual(v4.tag, "foo/zod/TensorSummary")
-
-  def testScalarSummary(self):
-    with self.cached_session() as sess:
-      const = constant_op.constant(10.0)
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
-
-    value = self._SummarySingleValue(result)
-    n = tensor_util.MakeNdarray(value.tensor)
-    self._AssertNumpyEq(n, 10)
-
-  def testStringSummary(self):
-    s = six.b("foobar")
-    with self.cached_session() as sess:
-      const = constant_op.constant(s)
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
-
-    value = self._SummarySingleValue(result)
-    n = tensor_util.MakeNdarray(value.tensor)
-    self._AssertNumpyEq(n, s)
-
-  def testManyScalarSummary(self):
-    with self.cached_session() as sess:
-      const = array_ops.ones([5, 5, 5])
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
-    value = self._SummarySingleValue(result)
-    n = tensor_util.MakeNdarray(value.tensor)
-    self._AssertNumpyEq(n, np.ones([5, 5, 5]))
-
-  def testManyStringSummary(self):
-    strings = [[six.b("foo bar"), six.b("baz")], [six.b("zoink"), six.b("zod")]]
-    with self.cached_session() as sess:
-      const = constant_op.constant(strings)
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
-    value = self._SummarySingleValue(result)
-    n = tensor_util.MakeNdarray(value.tensor)
-    self._AssertNumpyEq(n, strings)
-
-  def testManyBools(self):
-    bools = [True, True, True, False, False, False]
-    with self.cached_session() as sess:
-      const = constant_op.constant(bools)
-      summ = summary_ops.tensor_summary("foo", const)
-      result = sess.run(summ)
-
-    value = self._SummarySingleValue(result)
-    n = tensor_util.MakeNdarray(value.tensor)
-    self._AssertNumpyEq(n, bools)
-
-  def testSummaryDescriptionAndDisplayName(self):
-    with self.cached_session() as sess:
-
-      def get_description(summary_op):
-        summ_str = sess.run(summary_op)
-        summ = summary_pb2.Summary()
-        summ.ParseFromString(summ_str)
-        return summ.value[0].metadata
-
-      const = constant_op.constant(1)
-      # Default case; no description or display name
-      simple_summary = summary_ops.tensor_summary("simple", const)
-
-      descr = get_description(simple_summary)
-      self.assertEqual(descr.display_name, "")
-      self.assertEqual(descr.summary_description, "")
-
-      # Values are provided via function args
-      with_values = summary_ops.tensor_summary(
-          "simple",
-          const,
-          display_name="my name",
-          summary_description="my description")
-
-      descr = get_description(with_values)
-      self.assertEqual(descr.display_name, "my name")
-      self.assertEqual(descr.summary_description, "my description")
-
-      # Values are provided via the SummaryMetadata arg
-      metadata = summary_pb2.SummaryMetadata()
-      metadata.display_name = "my name"
-      metadata.summary_description = "my description"
-
-      with_metadata = summary_ops.tensor_summary(
-          "simple", const, summary_metadata=metadata)
-      descr = get_description(with_metadata)
-      self.assertEqual(descr.display_name, "my name")
-      self.assertEqual(descr.summary_description, "my description")
-
-      # If both SummaryMetadata and explicit args are provided, the args win
-      overwrite = summary_ops.tensor_summary(
-          "simple",
-          const,
-          summary_metadata=metadata,
-          display_name="overwritten",
-          summary_description="overwritten")
-      descr = get_description(overwrite)
-      self.assertEqual(descr.display_name, "overwritten")
-      self.assertEqual(descr.summary_description, "overwritten")
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py b/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1547c55f8b0b112325c6049f2052091228c171bf
--- /dev/null
+++ b/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py
@@ -0,0 +1,72 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for summary V1 audio op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+from tensorflow.python.summary import summary
+
+
+class SummaryV1AudioOpTest(test.TestCase):
+
+  def _AsSummary(self, s):
+    summ = summary_pb2.Summary()
+    summ.ParseFromString(s)
+    return summ
+
+  def _CheckProto(self, audio_summ, sample_rate, num_channels, length_frames):
+    """Verify that the non-audio parts of the audio_summ proto match shape."""
+    # Only the first 3 sounds are returned.
+    for v in audio_summ.value:
+      v.audio.ClearField("encoded_audio_string")
+    expected = "\n".join("""
+        value {
+          tag: "snd/audio/%d"
+          audio { content_type: "audio/wav" sample_rate: %d
+                  num_channels: %d length_frames: %d }
+        }""" % (i, sample_rate, num_channels, length_frames) for i in xrange(3))
+    self.assertProtoEquals(expected, audio_summ)
+
+  def testAudioSummary(self):
+    np.random.seed(7)
+    for channels in (1, 2, 5, 8):
+      with self.session(graph=ops.Graph()) as sess:
+        num_frames = 7
+        shape = (4, num_frames, channels)
+        # Generate random audio in the range [-1.0, 1.0).
+        const = 2.0 * np.random.random(shape) - 1.0
+
+        # Summarize
+        sample_rate = 8000
+        summ = summary.audio(
+            "snd", const, max_outputs=3, sample_rate=sample_rate)
+        value = self.evaluate(summ)
+        self.assertEqual([], summ.get_shape())
+        audio_summ = self._AsSummary(value)
+
+        # Check the rest of the proto
+        self._CheckProto(audio_summ, sample_rate, channels, num_frames)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/summary_v1_image_op_test.py b/tensorflow/python/kernel_tests/summary_v1_image_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..56de2e933db6498d274d9463e89d01ac3c06b2bc
--- /dev/null
+++ b/tensorflow/python/kernel_tests/summary_v1_image_op_test.py
@@ -0,0 +1,118 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for summary V1 image op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import image_ops
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+from tensorflow.python.summary import summary
+
+
+class SummaryV1ImageOpTest(test.TestCase):
+
+  def _AsSummary(self, s):
+    summ = summary_pb2.Summary()
+    summ.ParseFromString(s)
+    return summ
+
+  def _CheckProto(self, image_summ, shape):
+    """Verify that the non-image parts of the image_summ proto match shape."""
+    # Only the first 3 images are returned.
+    for v in image_summ.value:
+      v.image.ClearField("encoded_image_string")
+    expected = "\n".join("""
+        value {
+          tag: "img/image/%d"
+          image { height: %d width: %d colorspace: %d }
+        }""" % ((i,) + shape[1:]) for i in xrange(3))
+    self.assertProtoEquals(expected, image_summ)
+
+  @test_util.run_deprecated_v1
+  def testImageSummary(self):
+    for depth in (1, 3, 4):
+      for positive in False, True:
+        with self.session(graph=ops.Graph()) as sess:
+          shape = (4, 5, 7) + (depth,)
+          bad_color = [255, 0, 0, 255][:depth]
+          # Build a mostly random image with one nan
+          const = np.random.randn(*shape).astype(np.float32)
+          const[0, 1, 2] = 0  # Make the nan entry not the max
+          if positive:
+            const = 1 + np.maximum(const, 0)
+            scale = 255 / const.reshape(4, -1).max(axis=1)
+            offset = 0
+          else:
+            scale = 127 / np.abs(const.reshape(4, -1)).max(axis=1)
+            offset = 128
+          adjusted = np.floor(scale[:, None, None, None] * const + offset)
+          const[0, 1, 2, depth // 2] = np.nan
+
+          # Summarize
+          summ = summary.image("img", const)
+          value = self.evaluate(summ)
+          self.assertEqual([], summ.get_shape())
+          image_summ = self._AsSummary(value)
+
+          # Decode the first image and check consistency
+          image = image_ops.decode_png(image_summ.value[0]
+                                       .image.encoded_image_string).eval()
+          self.assertAllEqual(image[1, 2], bad_color)
+          image[1, 2] = adjusted[0, 1, 2]
+          self.assertAllClose(image, adjusted[0], rtol=2e-5, atol=2e-5)
+
+          # Check the rest of the proto
+          self._CheckProto(image_summ, shape)
+
+  @test_util.run_deprecated_v1
+  def testImageSummaryUint8(self):
+    np.random.seed(7)
+    for depth in (1, 3, 4):
+      with self.session(graph=ops.Graph()) as sess:
+        shape = (4, 5, 7) + (depth,)
+
+        # Build a random uint8 image
+        images = np.random.randint(256, size=shape).astype(np.uint8)
+        tf_images = ops.convert_to_tensor(images)
+        self.assertEqual(tf_images.dtype, dtypes.uint8)
+
+        # Summarize
+        summ = summary.image("img", tf_images)
+        value = self.evaluate(summ)
+        self.assertEqual([], summ.get_shape())
+        image_summ = self._AsSummary(value)
+
+        # Decode the first image and check consistency.
+        # Since we're uint8, everything should be exact.
+        image = image_ops.decode_png(image_summ.value[0]
+                                     .image.encoded_image_string).eval()
+        self.assertAllEqual(image, images[0])
+
+        # Check the rest of the proto
+        self._CheckProto(image_summ, shape)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/summary_v1_ops_test.py b/tensorflow/python/kernel_tests/summary_v1_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e070f5bf6f5d892aab2df630a3f1e1b96ee2dfce
--- /dev/null
+++ b/tensorflow/python/kernel_tests/summary_v1_ops_test.py
@@ -0,0 +1,112 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the actual serialized proto output of the V1 tf.summary ops.
+
+The tensor, audio, and image ops have dedicated tests in adjacent files. The
+overall tf.summary API surface also has its own tests in summary_test.py that
+check calling the API methods but not the exact serialized proto output.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.platform import test
+from tensorflow.python.summary import summary
+
+
+class SummaryV1OpsTest(test.TestCase):
+
+  def _AsSummary(self, s):
+    summ = summary_pb2.Summary()
+    summ.ParseFromString(s)
+    return summ
+
+  def testScalarSummary(self):
+    with self.cached_session() as sess:
+      const = constant_op.constant([10.0, 20.0])
+      summ = logging_ops.scalar_summary(["c1", "c2"], const, name="mysumm")
+      value = self.evaluate(summ)
+    self.assertEqual([], summ.get_shape())
+    self.assertProtoEquals("""
+      value { tag: "c1" simple_value: 10.0 }
+      value { tag: "c2" simple_value: 20.0 }
+      """, self._AsSummary(value))
+
+  def testScalarSummaryDefaultName(self):
+    with self.cached_session() as sess:
+      const = constant_op.constant([10.0, 20.0])
+      summ = logging_ops.scalar_summary(["c1", "c2"], const)
+      value = self.evaluate(summ)
+    self.assertEqual([], summ.get_shape())
+    self.assertProtoEquals("""
+      value { tag: "c1" simple_value: 10.0 }
+      value { tag: "c2" simple_value: 20.0 }
+      """, self._AsSummary(value))
+
+  @test_util.run_deprecated_v1
+  def testMergeSummary(self):
+    with self.cached_session() as sess:
+      const = constant_op.constant(10.0)
+      summ1 = summary.histogram("h", const)
+      summ2 = logging_ops.scalar_summary("c", const)
+      merge = summary.merge([summ1, summ2])
+      value = self.evaluate(merge)
+    self.assertEqual([], merge.get_shape())
+    self.assertProtoEquals("""
+      value {
+        tag: "h"
+        histo {
+          min: 10.0
+          max: 10.0
+          num: 1.0
+          sum: 10.0
+          sum_squares: 100.0
+          bucket_limit: 9.93809490288
+          bucket_limit: 10.9319043932
+          bucket_limit: 1.7976931348623157e+308
+          bucket: 0.0
+          bucket: 1.0
+          bucket: 0.0
+        }
+      }
+      value { tag: "c" simple_value: 10.0 }
+    """, self._AsSummary(value))
+
+  def testMergeAllSummaries(self):
+    with ops.Graph().as_default():
+      const = constant_op.constant(10.0)
+      summ1 = summary.histogram("h", const)
+      summ2 = summary.scalar("o", const, collections=["foo_key"])
+      summ3 = summary.scalar("c", const)
+      merge = summary.merge_all()
+      self.assertEqual("MergeSummary", merge.op.type)
+      self.assertEqual(2, len(merge.op.inputs))
+      self.assertEqual(summ1, merge.op.inputs[0])
+      self.assertEqual(summ3, merge.op.inputs[1])
+      merge = summary.merge_all("foo_key")
+      self.assertEqual("MergeSummary", merge.op.type)
+      self.assertEqual(1, len(merge.op.inputs))
+      self.assertEqual(summ2, merge.op.inputs[0])
+      self.assertTrue(summary.merge_all("bar_key") is None)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py b/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8e5b5b882a3090620ecdb14292ae8d73f2c8bcd
--- /dev/null
+++ b/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py
@@ -0,0 +1,170 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BAvSIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for summary V1 tensor op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.summary import summary as summary_lib
+
+
+class SummaryV1TensorOpTest(test.TestCase):
+
+  def _SummarySingleValue(self, s):
+    summ = summary_pb2.Summary()
+    summ.ParseFromString(s)
+    self.assertEqual(len(summ.value), 1)
+    return summ.value[0]
+
+  def _AssertNumpyEq(self, actual, expected):
+    self.assertTrue(np.array_equal(actual, expected))
+
+  def testTags(self):
+    with self.cached_session() as sess:
+      c = constant_op.constant(1)
+      s1 = summary_lib.tensor_summary("s1", c)
+      with ops.name_scope("foo"):
+        s2 = summary_lib.tensor_summary("s2", c)
+        with ops.name_scope("zod"):
+          s3 = summary_lib.tensor_summary("s3", c)
+          s4 = summary_lib.tensor_summary("TensorSummary", c)
+      summ1, summ2, summ3, summ4 = self.evaluate([s1, s2, s3, s4])
+
+    v1 = self._SummarySingleValue(summ1)
+    self.assertEqual(v1.tag, "s1")
+
+    v2 = self._SummarySingleValue(summ2)
+    self.assertEqual(v2.tag, "foo/s2")
+
+    v3 = self._SummarySingleValue(summ3)
+    self.assertEqual(v3.tag, "foo/zod/s3")
+
+    v4 = self._SummarySingleValue(summ4)
+    self.assertEqual(v4.tag, "foo/zod/TensorSummary")
+
+  def testScalarSummary(self):
+    with self.cached_session() as sess:
+      const = constant_op.constant(10.0)
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
+
+    value = self._SummarySingleValue(result)
+    n = tensor_util.MakeNdarray(value.tensor)
+    self._AssertNumpyEq(n, 10)
+
+  def testStringSummary(self):
+    s = six.b("foobar")
+    with self.cached_session() as sess:
+      const = constant_op.constant(s)
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
+
+    value = self._SummarySingleValue(result)
+    n = tensor_util.MakeNdarray(value.tensor)
+    self._AssertNumpyEq(n, s)
+
+  def testManyScalarSummary(self):
+    with self.cached_session() as sess:
+      const = array_ops.ones([5, 5, 5])
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
+    value = self._SummarySingleValue(result)
+    n = tensor_util.MakeNdarray(value.tensor)
+    self._AssertNumpyEq(n, np.ones([5, 5, 5]))
+
+  def testManyStringSummary(self):
+    strings = [[six.b("foo bar"), six.b("baz")], [six.b("zoink"), six.b("zod")]]
+    with self.cached_session() as sess:
+      const = constant_op.constant(strings)
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
+    value = self._SummarySingleValue(result)
+    n = tensor_util.MakeNdarray(value.tensor)
+    self._AssertNumpyEq(n, strings)
+
+  def testManyBools(self):
+    bools = [True, True, True, False, False, False]
+    with self.cached_session() as sess:
+      const = constant_op.constant(bools)
+      summ = summary_lib.tensor_summary("foo", const)
+      result = self.evaluate(summ)
+
+    value = self._SummarySingleValue(result)
+    n = tensor_util.MakeNdarray(value.tensor)
+    self._AssertNumpyEq(n, bools)
+
+  def testSummaryDescriptionAndDisplayName(self):
+    with self.cached_session() as sess:
+
+      def get_description(summary_op):
+        summ_str = self.evaluate(summary_op)
+        summ = summary_pb2.Summary()
+        summ.ParseFromString(summ_str)
+        return summ.value[0].metadata
+
+      const = constant_op.constant(1)
+      # Default case; no description or display name
+      simple_summary = summary_lib.tensor_summary("simple", const)
+
+      descr = get_description(simple_summary)
+      self.assertEqual(descr.display_name, "")
+      self.assertEqual(descr.summary_description, "")
+
+      # Values are provided via function args
+      with_values = summary_lib.tensor_summary(
+          "simple",
+          const,
+          display_name="my name",
+          summary_description="my description")
+
+      descr = get_description(with_values)
+      self.assertEqual(descr.display_name, "my name")
+      self.assertEqual(descr.summary_description, "my description")
+
+      # Values are provided via the SummaryMetadata arg
+      metadata = summary_pb2.SummaryMetadata()
+      metadata.display_name = "my name"
+      metadata.summary_description = "my description"
+
+      with_metadata = summary_lib.tensor_summary(
+          "simple", const, summary_metadata=metadata)
+      descr = get_description(with_metadata)
+      self.assertEqual(descr.display_name, "my name")
+      self.assertEqual(descr.summary_description, "my description")
+
+      # If both SummaryMetadata and explicit args are provided, the args win
+      overwrite = summary_lib.tensor_summary(
+          "simple",
+          const,
+          summary_metadata=metadata,
+          display_name="overwritten",
+          summary_description="overwritten")
+      descr = get_description(overwrite)
+      self.assertEqual(descr.display_name, "overwritten")
+      self.assertEqual(descr.summary_description, "overwritten")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py
index d20567bf0ecf587e6bb12bfd4a2a57658d2f5914..cfa9f122d1fcee1748cd30bdc4212d81a5709ae6 100644
--- a/tensorflow/python/kernel_tests/svd_op_test.py
+++ b/tensorflow/python/kernel_tests/svd_op_test.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import linalg_ops
@@ -38,6 +40,7 @@ def _AddTest(test_class, op_name, testcase_name, fn):
 
 class SvdOpTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to svd should be a tensor of at least rank 2.
     scalar = constant_op.constant(1.)
@@ -49,8 +52,9 @@ class SvdOpTest(test.TestCase):
                                  "Shape must be at least rank 2 but is rank 1"):
       linalg_ops.svd(vector)
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       all_ops = []
       for compute_uv_ in True, False:
         for full_matrices_ in True, False:
@@ -68,7 +72,7 @@ class SvdOpTest(test.TestCase):
             s2 = linalg_ops.svd(
                 matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_)
             all_ops += [s1, s2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       for i in range(2):
         s = 6 * i
         self.assertAllEqual(val[s], val[s + 3])  # s1 == s2
@@ -117,14 +121,15 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
         diag_s = array_ops.concat([diag_s, zeros], a.ndim - 1)
     a_recon = math_ops.matmul(u, diag_s)
     a_recon = math_ops.matmul(a_recon, v, adjoint_b=True)
-    self.assertAllClose(a_recon.eval(), a, rtol=tol, atol=tol)
+    self.assertAllClose(a_recon, a, rtol=tol, atol=tol)
 
   def CheckUnitary(self, x, tol):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
     xx = math_ops.matmul(x, x, adjoint_a=True)
     identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0)
-    self.assertAllClose(identity.eval(), xx.eval(), atol=tol)
+    self.assertAllClose(identity, xx, atol=tol)
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     is_complex = dtype_ in (np.complex64, np.complex128)
     is_single = dtype_ in (np.float32, np.complex64)
@@ -140,7 +145,7 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
           low=-1.0, high=1.0,
           size=np.prod(shape_)).reshape(shape_).astype(dtype_)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       if use_static_shape_:
         x_tf = constant_op.constant(x_np)
       else:
@@ -150,7 +155,7 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
         s_tf, u_tf, v_tf = linalg_ops.svd(
             x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_)
         if use_static_shape_:
-          s_tf_val, u_tf_val, v_tf_val = sess.run([s_tf, u_tf, v_tf])
+          s_tf_val, u_tf_val, v_tf_val = self.evaluate([s_tf, u_tf, v_tf])
         else:
           s_tf_val, u_tf_val, v_tf_val = sess.run(
               [s_tf, u_tf, v_tf], feed_dict={x_tf: x_np})
@@ -158,7 +163,7 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
         s_tf = linalg_ops.svd(
             x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_)
         if use_static_shape_:
-          s_tf_val = sess.run(s_tf)
+          s_tf_val = self.evaluate(s_tf)
         else:
           s_tf_val = sess.run(s_tf, feed_dict={x_tf: x_np})
 
@@ -213,6 +218,7 @@ def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
     tf_v *= phase[..., :n]
     return tf_s, tf_u, tf_v
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(42)
     a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
@@ -229,7 +235,7 @@ def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
       tol = 3e-2
     else:
       tol = 1e-6
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       tf_a = constant_op.constant(a)
       if compute_uv_:
         tf_s, tf_u, tf_v = _NormalizingSvd(tf_a)
@@ -263,7 +269,8 @@ if __name__ == "__main__":
           for cols in 1, 2, 5, 10, 32, 100:
             for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10):
               shape = batch_dims + (rows, cols)
-              for use_static_shape in True, False:
+              # TF2 does not support placeholders under eager so we skip it
+              for use_static_shape in set([True, tf2.enabled()]):
                 name = "%s_%s_static_shape_%s__compute_uv_%s_full_%s" % (
                     dtype.__name__, "_".join(map(str, shape)), use_static_shape,
                     compute_uv, full_matrices)
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 9dcdaa61ed2c0c12940817ccb311e27d1a19fa0c..3b2a56bd1ff6ef81ae17773fd5a23bc96778ce63 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -72,6 +72,7 @@ def variable_scoped_function_with_local_variable():
 
 class TemplateTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_end_to_end(self):
     """This test shows a very simple line model with test_loss.
 
@@ -104,10 +105,10 @@ class TemplateTest(test.TestCase):
     train_op = optimizer.minimize(train_loss)
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
-      initial_test_loss = sess.run(test_loss)
-      sess.run(train_op)
-      final_test_loss = sess.run(test_loss)
+      self.evaluate(variables.global_variables_initializer())
+      initial_test_loss = self.evaluate(test_loss)
+      self.evaluate(train_op)
+      final_test_loss = self.evaluate(test_loss)
 
     # Parameters are tied, so the loss should have gone down when we trained it.
     self.assertLess(final_test_loss, initial_test_loss)
@@ -172,6 +173,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual("s1/dummy:0", v1.name)
     self.assertEqual("s1_1/dummy:0", v3.name)
 
+  @test_util.run_deprecated_v1
   def test_same_unique_name_raise_error(self):
     tmpl1 = template.make_template(
         "_", variable_scoped_function, unique_name_="s1")
@@ -190,6 +192,7 @@ class TemplateTest(test.TestCase):
         template.make_template(
             "_", variable_scoped_function, unique_name_="s1")
 
+  @test_util.run_deprecated_v1
   def test_unique_name_and_reuse(self):
     tmpl1 = template.make_template(
         "_", variable_scoped_function, unique_name_="s1")
@@ -260,6 +263,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual("s1/test/dummy:0", v1.name)
     self.assertEqual("s1_1/test/dummy:0", v3.name)
 
+  @test_util.run_deprecated_v1
   def test_enforces_no_extra_trainable_variables(self):
     tmpl = template.make_template("s", function_with_create, trainable=True)
 
@@ -675,6 +679,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual(1, len(tb.variables))
 
   # TODO(apassos) handle local variables in Eager
+  @test_util.run_deprecated_v1
   def test_local_variables(self):
     # Make sure trainable_variables are created.
     with variable_scope.variable_scope("foo3"):
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 0ad2063558f46b50d78d1d423849e2e8130003da..88625841bcc982bf477b619f3da0b70498f0542f 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -63,6 +63,8 @@ def _make_ta(size, name, dtype=dtypes.float32, infer_shape=False):
       dtype=dtype, tensor_array_name=name, size=size, infer_shape=infer_shape)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+@test_util.with_control_flow_v2
 class TensorArrayTest(test.TestCase):
 
   @classmethod
@@ -77,7 +79,7 @@ class TensorArrayTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testTensorArrayWriteRead(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -98,7 +100,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(-3.0, d2)
 
   def _testTensorArrayWritePack(self, tf_dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3)
 
@@ -123,13 +125,11 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayWritePack(dtypes.complex128)
     self._testTensorArrayWritePack(dtypes.string)
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayWritePack(self):
     self._testTensorArrayWritePackMaybeLegacy()
 
-  @test_util.run_in_graph_and_eager_modes
   def testEmptyTensorArrayPack(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
 
@@ -144,7 +144,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([3, 0, 1], c0.shape)
 
   def _testTensorArrayWriteConcat(self, tf_dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=tf_dtype, tensor_array_name="foo", size=3, infer_shape=False)
 
@@ -161,7 +161,7 @@ class TensorArrayTest(test.TestCase):
           convert([[4.0, 5.0], [104.0, 105.0], [204.0, 205.0], [6.0, 7.0],
                    [106.0, 107.0], [8.0, 9.0]]), c0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testTensorArrayWriteConcat(self):
     self._testTensorArrayWriteConcat(dtypes.float32)
     self._testTensorArrayWriteConcat(dtypes.float64)
@@ -172,7 +172,7 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayWriteConcat(dtypes.string)
 
   def _testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -184,7 +184,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                           self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118890905")
+  @test_util.run_v1_only("b/118890905")
   def testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros()
 
@@ -200,12 +201,13 @@ class TensorArrayTest(test.TestCase):
     self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                         self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118890905")
+  @test_util.run_v1_only("b/118890905")
   def testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros()
 
   def _testTensorArrayUnpackRead(self, tf_dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       convert = _make_converter(tf_dtype)
 
       ta = _make_ta(3, "foo", dtype=tf_dtype)
@@ -251,12 +253,11 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayUnpackRead(dtypes.complex128)
     self._testTensorArrayUnpackRead(dtypes.string)
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayUnpackRead(self):
     self._testTensorArrayUnpackReadMaybeLegacy()
 
   def _testTensorArraySplitRead(self, tf_dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       convert = _make_converter(tf_dtype)
 
       # Split an empty vector
@@ -297,7 +298,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(convert([]).reshape(0, 2), d1)
       self.assertAllEqual(convert([[3.0, 301.0]]), d2)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testTensorArraySplitRead(self):
     self._testTensorArraySplitRead(dtypes.float32)
     self._testTensorArraySplitRead(dtypes.float64)
@@ -307,8 +308,10 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArraySplitRead(dtypes.complex128)
     self._testTensorArraySplitRead(dtypes.string)
 
-  def testTensorGradArrayWriteRead(self):
-    with self.test_session(use_gpu=True) as session:
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorGradArrayWriteRead(self):
+    with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -340,8 +343,30 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[2.0]], g_d1)
       self.assertAllEqual(-2.0, g_d2)
 
-  def testTensorGradArrayDynamicWriteRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradGrad(self):
+    if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2:
+      self.skipTest("Legacy TensorArray does not support double derivatives.")
     with self.test_session(use_gpu=True) as session:
+      x = constant_op.constant(4.0)
+
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=1,
+          infer_shape=False)
+      w0 = ta.write(0, x)
+      r0 = w0.read(0)
+      y = r0 * r0
+
+      g1 = gradients_impl.gradients(ys=[y], xs=[x])
+      g2 = gradients_impl.gradients(ys=[g1], xs=[x])
+      self.assertAllEqual([2.0], session.run(g2))
+
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorGradArrayDynamicWriteRead(self):
+    with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -381,8 +406,10 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(3, vs)
       self.assertAllEqual(3, g_vs)
 
-  def testTensorGradAccessTwiceReceiveSameObject(self):
-    with self.test_session(use_gpu=True) as session:
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorGradAccessTwiceReceiveSameObject(self):
+    with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       g_ta_0 = ta.grad("grad")
@@ -397,51 +424,78 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(t_g_ta_0, t_g_ta_1)
       self.assertAllEqual([[4.0, 5.0]], d_r1_0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArrayWriteWrongIndexOrDataTypeFails(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
       # Test writing the wrong datatype
-      with self.assertRaisesOpError(
-          "TensorArray dtype is (float|float32) but Op is trying to write "
-          "dtype string"):
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = ("Invalid data types; op elements string but list elements "
+                     "float")
+      else:
+        error_msg = (
+            "TensorArray dtype is (float|float32) but Op is trying to write "
+            "dtype string")
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(0, "wrong_type_scalar").flow)
 
-      with self.assertRaisesOpError("index -1"):
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to modify element -1 in a list with 3 elements."
+      else:
+        error_msg = "index -1"
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(-1, 3.0).flow)
 
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to modify element 3 in a list with 3 elements"
+      else:
+        error_msg = ("Tried to write to index 3 but array is not "
+                     "resizeable and size is: 3")
       # Test reading from too large an index
-      with self.assertRaisesOpError(
-          "Tried to write to index 3 but array is not "
-          "resizeable and size is: 3"):
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(3, 3.0).flow)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArrayReadWrongIndexOrDataTypeFails(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
 
       w0 = ta.write(0, [[4.0, 5.0]])
 
       # Test reading wrong datatype (only possible when constructing graphs).
-      if not context.executing_eagerly():
+      if (not context.executing_eagerly() and
+          not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2):
         r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtypes.float64, flow_in=w0.flow)
         with self.assertRaisesOpError(
             "TensorArray dtype is float but Op requested dtype double."):
-          r0_bad.eval()
+          self.evaluate(r0_bad)
 
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to access element -1 in a list with 3 elements."
+      else:
+        error_msg = "index -1"
       # Test reading from a negative index, which is not allowed
-      with self.assertRaisesOpError("index -1"):
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.read(-1))
 
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to access element 3 in a list with 3 elements."
+      else:
+        error_msg = "Tried to read from index 3 but array size is: 3"
       # Test reading from too large an index
-      with self.assertRaisesOpError(
-          "Tried to read from index 3 but array size is: 3"):
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.read(3))
 
-  def testTensorArrayWriteMultipleFails(self):
-    with self.test_session(use_gpu=True):
+  @test_util.disable_control_flow_v2("v2 allows multiple writes.")
+  @test_util.run_v1_only("v2 allows multiple writes.")
+  def testSkipEagerTensorArrayWriteMultipleFails(self):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
 
@@ -450,9 +504,9 @@ class TensorArrayTest(test.TestCase):
           "it has already been written to."):
         self.evaluate(ta.write(2, 3.0).write(2, 3.0).flow)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArrayConcatIncompatibleShapesFails(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -482,9 +536,9 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError("shape"):
         self.evaluate(w3.concat())
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArraySplitIncompatibleShapesFails(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       in_eager_mode = context.executing_eagerly()
       ta = _make_ta(3, "foo")
       with self.assertRaisesOpError(
@@ -495,25 +549,35 @@ class TensorArrayTest(test.TestCase):
           lengths = array_ops.placeholder(dtypes.int64)
           ta.split([1.0, 2.0, 3.0], lengths).flow.eval(feed_dict={lengths: 1})
 
-      with self.assertRaisesOpError(
-          r"Expected sum of lengths to be equal to values.shape\[0\], "
-          r"but sum of lengths is 1 and value's shape is: \[3\]"):
+      error_msg = ("Unused values in tensor. Length of tensor: 3 Values used: 1"
+                   if tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+                   not in_eager_mode else
+                   r"Expected sum of lengths to be equal to values.shape\[0\], "
+                   r"but sum of lengths is 1 and value's shape is: \[3\]")
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.split([1.0, 2.0, 3.0], [1]).flow)
 
       ta = _make_ta(1, "baz")
-      with self.assertRaisesOpError(
-          r"Expected value to be at least a vector, but received shape: \[\]"):
-        self.evaluate(ta.split(1.0, [1]).flow)
+      if tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and not in_eager_mode:
+        with self.assertRaisesRegexp(
+            ValueError, "Shape must be at least rank 1 but is rank 0"):
+          self.evaluate(ta.split(1.0, [1]).flow)
+      else:
+        with self.assertRaisesOpError(
+            r"Expected value to be at least a vector, but received shape: \[\]"
+        ):
+          self.evaluate(ta.split(1.0, [1]).flow)
 
-      ta = _make_ta(2, "buz")
-      with self.assertRaisesOpError(
-          r"TensorArray's size is not equal to the size of lengths "
-          r"\(2 vs. 1\), and the TensorArray is not marked as "
-          r"dynamically resizeable"):
-        self.evaluate(ta.split([1.0], [1]).flow)
+      if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 or in_eager_mode:
+        ta = _make_ta(2, "buz")
+        with self.assertRaisesOpError(
+            r"TensorArray's size is not equal to the size of lengths "
+            r"\(2 vs. 1\), and the TensorArray is not marked as "
+            r"dynamically resizeable"):
+          self.evaluate(ta.split([1.0], [1]).flow)
 
   def _testTensorArrayWriteGradientAddMultipleAdds(self, dtype):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtype, tensor_array_name="foo", size=3, infer_shape=False)
       ta_grad = ta.grad("grad")
@@ -546,13 +610,17 @@ class TensorArrayTest(test.TestCase):
           r"existing shape is \[\] but the new input shape is \[1\]"):
         wb1_grad.flow.eval()
 
-  def testTensorArrayWriteGradientAddMultipleAdds(self):
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorArrayWriteGradientAddMultipleAdds(self):
     for dtype in (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64,
                   dtypes.complex64, dtypes.complex128):
       self._testTensorArrayWriteGradientAddMultipleAdds(dtype)
 
-  def testTensorArrayGradWithShapeKnownElementShape(self):
-    with self.test_session(use_gpu=True) as sess:
+  @test_util.disable_control_flow_v2("Low level legacy TA op test.")
+  @test_util.run_v1_only("Low level legacy TA op test.")
+  def testSkipEagerTensorArrayGradWithShapeKnownElementShape(self):
+    with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           size=3,
           dtype=dtypes.float32,
@@ -580,8 +648,10 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(fed_value,
                           sess.run(read_value, feed_dict={value: fed_value}))
 
-  def testTensorArrayGradWithShapeUnknownElementShape(self):
-    with self.test_session(use_gpu=True) as sess:
+  @test_util.disable_control_flow_v2("Low level legacy TA op test.")
+  @test_util.run_v1_only("Low level legacy TA op test.")
+  def testSkipEagerTensorArrayGradWithShapeUnknownElementShape(self):
+    with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           size=3, dtype=dtypes.float32,
           element_shape=None)  # Note that element_shape is unknown
@@ -603,9 +673,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(fed_value,
                           sess.run(read_value, feed_dict={value: fed_value}))
 
-  @test_util.run_in_graph_and_eager_modes
   def testMultiTensorArray(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       h1 = tensor_array_ops.TensorArray(
           size=1, dtype=dtypes.float32, tensor_array_name="foo")
       w1 = h1.write(0, 4.0)
@@ -621,7 +690,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(9.0, val)
 
   def _testTensorArrayGradientWriteReadType(self, dtype):
-    with self.test_session(use_gpu=True) as session:
+    with self.cached_session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.as_dtype(dtype),
           tensor_array_name="foo",
@@ -667,12 +736,13 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(c([[3.0, 2.0]]), grad_vals[0])
       self.assertAllEqual(c(-2.0), grad_vals[1])
 
-  def testTensorArrayGradientWriteRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientWriteRead(self):
     for dtype in (np.float32, np.float64, np.complex64, np.complex128):
       self._testTensorArrayGradientWriteReadType(dtype)
 
   def _testTensorArrayGradientWritePackConcatAndRead(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -698,17 +768,19 @@ class TensorArrayTest(test.TestCase):
                 [-0.5, 1.5],  # read(0) gradient
                 [20.0, 30.0, 40.0, 50.0]
             ])  # concat gradient
-      grad_vals = sess.run(grad_r)  # 2 + 2 entries
+      grad_vals = self.evaluate(grad_r)  # 2 + 2 entries
 
       self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0])
       self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1])
 
-  def testTensorArrayGradientWritePackConcatAndRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientWritePackConcatAndRead(self):
     self._testTensorArrayGradientWritePackConcatAndRead()
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("v2 does not support clear_after_read.")
+  @test_util.run_v1_only("v2 does not support clear_after_read.")
   def testTensorArrayReadTwice(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
 
       ta_readonce = tensor_array_ops.TensorArray(
@@ -736,7 +808,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([1.0, -1.0], self.evaluate(r1_readtwice))
 
   def _testTensorArrayGradientUnpackRead(self):
-    with self.test_session(use_gpu=True) as session:
+    with self.cached_session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -760,11 +832,13 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(len(grad_vals), 1)
       self.assertAllEqual([[2.0 - 1.5, 3.0 + 1.5], [4.0, 5.0]], grad_vals[0])
 
-  def testTensorArrayGradientUnpackRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientUnpackRead(self):
     self._testTensorArrayGradientUnpackRead()
 
-  def testTensorArrayGradientSplitConcat(self):
-    with self.test_session(use_gpu=True) as session:
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientSplitConcat(self):
+    with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=2,
           infer_shape=False)
@@ -787,7 +861,7 @@ class TensorArrayTest(test.TestCase):
                           grad_vals[0])
 
   def _testTensorArrayGradientDynamicUnpackRead(self):
-    with self.test_session(use_gpu=True) as session:
+    with self.cached_session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -808,27 +882,25 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(len(grad_vals), 1)
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  def testTensorArrayGradientDynamicUnpackRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientDynamicUnpackRead(self):
     self._testTensorArrayGradientDynamicUnpackRead()
 
-  @test_util.run_in_graph_and_eager_modes
   def testCloseTensorArray(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       self.evaluate(ta.close())
 
-  @test_util.run_in_graph_and_eager_modes
   def testSizeTensorArray(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       s = ta.size()
       self.assertAllEqual(3, self.evaluate(s))
 
-  @test_util.run_in_graph_and_eager_modes
   def testWriteCloseTensorArray(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -840,7 +912,7 @@ class TensorArrayTest(test.TestCase):
 
   def _testWhileLoopWritePackGradients(self, dynamic_size, dtype):
     np_dtype = dtype.as_numpy_dtype
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       def func(v0, state0, var):
         ta = tensor_array_ops.TensorArray(
             dtype=dtype,
@@ -924,7 +996,6 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(grad_val.sum(axis=0), var_grad_t)
       self.assertAllClose(grad_val.sum(axis=0), state0_grad_t)
 
-  @test_util.run_in_graph_and_eager_modes
   def testWhileLoopWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=False, dtype=dtypes.float32)
@@ -932,13 +1003,30 @@ class TensorArrayTest(test.TestCase):
     # self._testWhileLoopWritePackGradients(
     #     dynamic_size=False, dtype=tf.int64)
 
-  def testWhileLoopDynamicWritePackGradients(self):
+  @test_util.disable_control_flow_v2("Testing v1 while_loop with v2 TA")
+  @test_util.enable_tensor_array_v2
+  def testWhileLoopV1WithTensorArrayV2(self):
+    size = 3
+    ta = tensor_array_ops.TensorArray(
+        dtype=dtypes.int32, size=size, element_shape=tensor_shape.scalar())
+
+    def Body(counter, ta):
+      return counter + 1, ta.write(counter, counter)
+
+    _, ta = control_flow_ops.while_loop(lambda i, _: i < size, Body, [0, ta])
+
+    for i in range(size):
+      self.assertEqual(self.evaluate(ta.read(i)), i)
+
+  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerWhileLoopDynamicWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=True, dtype=dtypes.float32)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/119323158")
   def testGradSerialTwoLoops(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       def loop(x):
         num_steps = 100
         acc = tensor_array_ops.TensorArray(
@@ -976,8 +1064,9 @@ class TensorArrayTest(test.TestCase):
         grad = gradients_impl.gradients(loop(x), [x])[0]
       self.assertAllClose(31.0, self.evaluate(grad))
 
-  def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
-    with self.test_session(use_gpu=True) as session:
+  @test_util.run_deprecated_v1
+  def testSkipEagerSumOfTwoReadVariablesWithoutRepeatGrad(self):
+    with self.session(use_gpu=True) as session:
       a = array_ops.identity(
           np.arange(
               3 * 5, dtype=np.float32).reshape(3, 5) + 1)
@@ -1011,7 +1100,8 @@ class TensorArrayTest(test.TestCase):
   def _grad_source_for_name(self, name):
     return tensor_array_grad._GetGradSource(constant_op.constant(0, name=name))
 
-  def testGetGradSource_Invalid(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGetGradSource_Invalid(self):
     with self.assertRaises(ValueError):
       self._grad_source_for_name("")
     with self.assertRaises(ValueError):
@@ -1019,7 +1109,8 @@ class TensorArrayTest(test.TestCase):
     with self.assertRaises(ValueError):
       self._grad_source_for_name("foo/bar")
 
-  def testGetGradSource_NoEnclosingScope(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGetGradSource_NoEnclosingScope(self):
     self.assertEqual("gradients:0", self._grad_source_for_name("gradients"))
     self.assertEqual("gradients_0:0", self._grad_source_for_name("gradients_0"))
     self.assertEqual("gradients", self._grad_source_for_name("gradients/foo"))
@@ -1030,7 +1121,8 @@ class TensorArrayTest(test.TestCase):
     self.assertEqual("gradients_0",
                      self._grad_source_for_name("gradients_0/foo/bar"))
 
-  def testGetGradSource_EnclosingScope(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGetGradSource_EnclosingScope(self):
     self.assertEqual("foo/gradients:0",
                      self._grad_source_for_name("foo/gradients"))
     self.assertEqual("foo/gradients_0:0",
@@ -1044,13 +1136,15 @@ class TensorArrayTest(test.TestCase):
     self.assertEqual("foo/bar/gradients_0",
                      self._grad_source_for_name("foo/bar/gradients_0/baz"))
 
-  def testGetGradSource_NestedUsesInnermost(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGetGradSource_NestedUsesInnermost(self):
     self.assertEqual(
         "foo/gradients/bar/gradients_0",
         self._grad_source_for_name("foo/gradients/bar/gradients_0/baz"))
 
-  def testWriteShape(self):
-    with self.test_session(use_gpu=True):
+  @test_util.run_deprecated_v1
+  def testSkipEagerWriteShape(self):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       c0 = constant_op.constant([4.0, 5.0])
@@ -1073,8 +1167,9 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w0.write(0, c2)
 
-  def testPartlyUnknownShape(self):
-    with self.test_session(use_gpu=True):
+  @test_util.run_deprecated_v1
+  def testSkipEagerPartlyUnknownShape(self):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=6)
 
@@ -1113,9 +1208,8 @@ class TensorArrayTest(test.TestCase):
       r5 = w5.read(0)
       self.assertAllEqual([5, 4, 2, 3], r5.get_shape().as_list())
 
-  @test_util.run_in_graph_and_eager_modes
   def _testUnpackShape(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -1144,12 +1238,14 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w1.write(4, c2)
 
+  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
+  @test_util.run_v1_only("b/117943489")
   def testUnpackShape(self):
     self._testUnpackShape()
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testSplitShape(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -1174,12 +1270,14 @@ class TensorArrayTest(test.TestCase):
         self.assertEqual((2, 2), w0.read(1).get_shape())
       else:
         self.assertEqual(r0.get_shape().ndims, None)
-        self.assertEqual(
-            tensor_shape.TensorShape(
-                ta1.handle.op.get_attr("element_shape")).ndims, None)
-
-  def testWriteUnknownShape(self):
-    with self.test_session(use_gpu=True):
+        if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2:
+          self.assertEqual(
+              tensor_shape.TensorShape(
+                  ta1.handle.op.get_attr("element_shape")).ndims, None)
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerWriteUnknownShape(self):
+    with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -1191,7 +1289,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(r0.get_shape(), tensor_shape.unknown_shape())
 
   def _testGradientWhenNotAllComponentsRead(self):
-    with self.test_session(use_gpu=True) as session:
+    with self.cached_session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
       x = constant_op.constant([2.0, 3.0])
       w = ta.unstack(x)
@@ -1201,53 +1299,66 @@ class TensorArrayTest(test.TestCase):
       grad_r0_vals = session.run(grad_r0)[0]
       self.assertAllEqual(grad_r0_vals, [1.0, 0.0])
 
-  def testGradientWhenNotAllComponentsRead(self):
+  # TODO(srbs): Figure out how to enable this. This is probably failing
+  # because we are trying to stack a TensorList with invalid tensors.
+  # That is because we do not receive gradients for all list indices.
+  # Figure out how TensorArray handles this.
+  def disabletestGradientWhenNotAllComponentsRead(self):
     self._testGradientWhenNotAllComponentsRead()
 
   def _testTensorArrayUnpackDynamic(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=3, dynamic_size=True)
       x = constant_op.constant([1.0, 2.0, 3.0])
       w0 = ta.unstack(x)
       w1 = w0.write(3, 4.0)
       r = w1.stack()
-      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), r.eval())
+      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), self.evaluate(r))
       grad = gradients_impl.gradients(ys=[r], xs=[x])
-      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), sess.run(grad)[0])
+      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
-  def testTensorArrayUnpackDynamic(self):
+  @test_util.disable_control_flow_v2("b/117943489")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArrayUnpackDynamic(self):
     self._testTensorArrayUnpackDynamic()
 
-  def testTensorArraySplitDynamic(self):
-    with self.test_session(use_gpu=True) as sess:
+  @test_util.disable_control_flow_v2("b/117943489")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArraySplitDynamic(self):
+    with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=3, dynamic_size=True)
       x = constant_op.constant([1.0, 2.0, 3.0])
       w0 = ta.split(x, [1, 1, 1])
       w1 = w0.write(3, [4.0])
       r = w1.concat()
-      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), r.eval())
+      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), self.evaluate(r))
       grad = gradients_impl.gradients(ys=[r], xs=[x])
-      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), sess.run(grad)[0])
+      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
   def _testTensorArrayEvalEmpty(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=False)
-      with self.assertRaisesOpError(
-          "TensorArray has size zero, but element shape <unknown> is not fully "
-          "defined. Currently only static shapes are supported when packing "
-          "zero-size TensorArrays."):
+      v2_msg = ("Tried to stack elements of a empty list with "
+                "non-fully-defined shape")
+      v1_msg = (
+          "TensorArray has size zero, but element shape <unknown> is not "
+          "fully defined. Currently only static shapes are supported when "
+          "packing zero-size TensorArrays.")
+      with self.assertRaisesOpError(v2_msg if tensor_array_ops
+                                    .ENABLE_TENSOR_ARRAY_V2 else v1_msg):
         ta.stack().eval()
 
-  def testTensorArrayEvalEmpty(self):
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerTensorArrayEvalEmpty(self):
     self._testTensorArrayEvalEmpty()
 
   # this test is ill-defined for Eager mode --- unpacking an empty tensor
   # gives an empty list / there is not equivalent of "mark_used" in Eager
   def _testTensorArrayEvalEmptyWithDefault(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=True)
       self.assertEqual(0, ta.size().eval())
@@ -1255,16 +1366,20 @@ class TensorArrayTest(test.TestCase):
       ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
       packed = ta.stack()
       concatenated = ta.concat()
-      self.assertAllEqual([0, 3, 5], packed.eval().shape)
+      self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
       # Concatenating zero tensors along their first dimension gives a
       # first dimension of zero
-      self.assertAllEqual([0, 5], concatenated.eval().shape)
+      self.assertAllEqual([0, 5], self.evaluate(concatenated).shape)
 
-  def testTensorArrayEvalEmptyWithDefault(self):
+  @test_util.disable_control_flow_v2("b/117943489")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArrayEvalEmptyWithDefault(self):
     self._testTensorArrayEvalEmptyWithDefault()
 
-  def testTensorArrayScatterReadAndGradients(self):
-    with self.test_session(use_gpu=True) as session:
+  @test_util.disable_control_flow_v2("b/117943489")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArrayScatterReadAndGradients(self):
+    with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -1289,9 +1404,10 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([10.0, -10.0], read_vals[1])
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/117943286")
+  @test_util.run_v1_only("b/117943286")
   def testTensorArrayWriteGatherAndGradients(self):
-    with self.test_session(use_gpu=True) as session:
+    with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
           tensor_array_name="foo",
@@ -1326,7 +1442,9 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[1.0, -1.0], [8.0, -8.0]], g_vals[0])
       self.assertAllEqual(expected_grad, grad_vals[0])
 
-  def testTensorArrayGetsDeviceFromFirstWrite(self):
+  @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerTensorArrayGetsDeviceFromFirstWrite(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       # this initial device will be ignored.
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
@@ -1374,7 +1492,9 @@ class TensorArrayTest(test.TestCase):
         self.assertFalse(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
-  def testTensorArrayGetsDeviceFromFirstWriteInWhileLoop(self):
+  @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerTensorArrayGetsDeviceFromFirstWriteInWhileLoop(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
 
@@ -1398,12 +1518,14 @@ class TensorArrayTest(test.TestCase):
     for d in dev_stats:
       if "/task:1/" in d:
         self.assertTrue(
-            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+            [s for s in dev_stats[d] if "TensorArray" == s.node_name])
       else:
         self.assertFalse(
-            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+            [s for s in dev_stats[d] if "TensorArray" == s.node_name])
 
-  def testTensorArrayDisabledColocateWithFirstWriteCall(self):
+  @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerTensorArrayDisabledColocateWithFirstWriteCall(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=2, colocate_with_first_write_call=False)
@@ -1428,14 +1550,13 @@ class TensorArrayTest(test.TestCase):
     for d in dev_stats:
       if "/task:0/" in d and "CPU" in d:  # Skip any GPU node stats
         self.assertTrue(
-            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+            [s for s in dev_stats[d] if "TensorArray" == s.node_name])
       else:
         self.assertFalse(
-            [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
+            [s for s in dev_stats[d] if "TensorArray" == s.node_name])
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayIdentity(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2,
                                          infer_shape=False)
       ta1 = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=4,
@@ -1486,7 +1607,8 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(size0_v, 2)
       self.assertEqual(size1_v, 4)
 
-  def testTensorArrayGradYsInCorrectScope(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradYsInCorrectScope(self):
     n_time = 1
     n_dim = 1
     x = constant_op.constant([[1.42]])
@@ -1500,14 +1622,14 @@ class TensorArrayTest(test.TestCase):
       # dy is outside of the gradients name scope; tf.gradients must
       # wrap it in the correct name scope.
       dx, = gradients_impl.gradients(ys=[y], xs=[x], grad_ys=[dy])
-      with self.test_session(use_gpu=True) as sess:
-        vdx, vdy = sess.run([dx, dy])
+      with self.cached_session(use_gpu=True) as sess:
+        vdx, vdy = self.evaluate([dx, dy])
       self.assertAllClose(vdx, vdy)
 
-  def testTensorArrayInt64GPU(self):
+  def testSkipEagerTensorArrayInt64GPU(self):
     if not test.is_gpu_available():
       return
-    with self.test_session(use_gpu=True, force_gpu=True) as sess:
+    with self.session(use_gpu=True, force_gpu=True) as sess:
       value = array_ops.placeholder(dtypes.int64)
       ta = tensor_array_ops.TensorArray(dtype=dtypes.int64, size=2)
       ta = ta.scatter([0, 1], value)
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index d8d76440f13cc5f815cabe7fb43246ff78c23656..febfe23b16d0a5b56102dd1c4c21d5cf16a0e1dc 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -20,9 +20,11 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test as test_lib
@@ -39,6 +41,7 @@ def _add_test(test, test_name, fn):
 
 class TensordotTest(test_lib.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def test_invalid_shape(self):
     a = [[1, 2], [3, 4]]
     b = [[1, 2], [3, 4], [5, 6]]
@@ -62,6 +65,7 @@ class TensordotTest(test_lib.TestCase):
                 axes_ph: (a_axes, b_axes)
             })
 
+  @test_util.run_v1_only("b/120545219")
   def test_invalid_axes(self):
     a = [[1, 2], [3, 4]]
     b = [[1, 2], [3, 4]]
@@ -99,11 +103,12 @@ class TensordotTest(test_lib.TestCase):
 
         tf_a = array_ops.ones((3, 3), dtype=dtypes.float32)
         tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None]
-        tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value).eval()
+        tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value)
 
         self.assertAllEqual(tf_ans.shape, np_ans.shape)
         self.assertAllEqual(tf_ans, np_ans)
 
+  @test_util.run_v1_only("b/120545219")
   def test_partial_shape_inference(self):
     for axes in ([1], [0]), 1:
       a = array_ops.placeholder(dtypes.float32)
@@ -165,7 +170,7 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
     for _ in range(num_trials):
       a_np, b_np, a_dims_np, b_dims_np = _generate_random_tensors_and_dims()
       np_ans = np.tensordot(a_np, b_np, axes=(a_dims_np, b_dims_np))
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         if dynamic_shape_:
           a = array_ops.placeholder(dtype_)
           b = array_ops.placeholder(dtype_)
@@ -178,7 +183,7 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
                   axes: (a_dims_np, b_dims_np)
               })
         else:
-          tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np)).eval()
+          tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np))
       self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol)
       self.assertAllEqual(tf_ans.shape, np_ans.shape)
 
@@ -201,14 +206,14 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
       all_axes.append(a_np.ndim - 1)
     for axes in all_axes:
       np_ans = np.tensordot(a_np, b_np, axes=axes)
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         if dynamic_shape_:
           a = array_ops.placeholder(dtype_)
           b = array_ops.placeholder(dtype_)
           c = math_ops.tensordot(a, b, axes=axes)
           tf_ans = sess.run(c, feed_dict={a: a_np, b: b_np})
         else:
-          tf_ans = math_ops.tensordot(a_np, b_np, axes=axes).eval()
+          tf_ans = math_ops.tensordot(a_np, b_np, axes=axes)
       self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol)
       self.assertAllEqual(tf_ans.shape, np_ans.shape)
 
@@ -220,7 +225,8 @@ if __name__ == "__main__":
     for rank_a in 1, 2, 4, 5:
       for rank_b in 1, 2, 4, 5:
         for num_dims in range(0, min(rank_a, rank_b) + 1):
-          for dynamic_shape in False, True:
+          # TF2 does not support placeholders under eager so we skip it
+          for dynamic_shape in set([False, not tf2.enabled()]):
             for testcase in _get_tensordot_tests(dtype, rank_a, rank_b,
                                                  num_dims, dynamic_shape):
               name = "%s_%s_%s_%s_%s_%s" % (testcase.__name__, dtype.__name__,
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index d5f07261062f88aaf8596faff9f29c21ed58dda9..5d46176bce87a94ac6f2c2ce51739c0289b38b80 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
@@ -46,9 +47,9 @@ class TopKTest(test.TestCase):
                     sorted=True):  # pylint: disable=redefined-builtin
     np_expected_values = np.array(expected_values)
     np_expected_indices = np.array(expected_indices)
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       values_op, indices_op = nn_ops.top_k(inputs, k, sorted=sorted)
-      values, indices = sess.run([values_op, indices_op])
+      values, indices = self.evaluate([values_op, indices_op])
 
       self.assertShapeEqual(np_expected_values, values_op)
       self.assertShapeEqual(np_expected_indices, indices_op)
@@ -181,22 +182,25 @@ class TopKTest(test.TestCase):
     k = constant_op.constant(3)
     self._validateTopK(inputs, k, [19, 18, 17], [11, 3, 7])
 
+  @test_util.run_deprecated_v1
   def testKNegative(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       k = array_ops.placeholder(dtypes.int32)
       values, _ = nn_ops.top_k(inputs, k)
       with self.assertRaisesOpError("Need k >= 0, got -7"):
         values.eval(feed_dict={k: -7})
 
+  @test_util.run_deprecated_v1
   def testKTooLarge(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
     with self.assertRaisesRegexp(ValueError,
                                  r"must have last dimension >= k = 4"):
       nn_ops.top_k(inputs, 4)
 
+  @test_util.run_deprecated_v1
   def testTopKGradients(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       inputs = array_ops.placeholder(dtypes.float32, shape=[2, 5])
       values, _ = nn_ops.top_k(inputs, 3)
       grad = sess.run(
diff --git a/tensorflow/python/kernel_tests/trace_op_test.py b/tensorflow/python/kernel_tests/trace_op_test.py
index a5d5bcc149546e89093ace022aea187a47a9b1c1..52640c02c22770ba516a61488de7166b6d45ddf6 100644
--- a/tensorflow/python/kernel_tests/trace_op_test.py
+++ b/tensorflow/python/kernel_tests/trace_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -30,10 +31,11 @@ class TraceTest(test.TestCase):
 
   def compare(self, x):
     np_ans = np.trace(x, axis1=-2, axis2=-1)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_ans = math_ops.trace(x).eval()
     self.assertAllClose(tf_ans, np_ans)
 
+  @test_util.run_deprecated_v1
   def testTrace(self):
     for dtype in [np.int32, np.float32, np.float64]:
       for shape in [[2, 2], [2, 3], [3, 2], [2, 3, 2], [2, 2, 2, 3]]:
diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py
index a825052dd2a15b6e98d1454f00b2dd0f046de575..76e1002ee1b97cea9fa29763b39f39a486a0ec16 100644
--- a/tensorflow/python/kernel_tests/transpose_op_test.py
+++ b/tensorflow/python/kernel_tests/transpose_op_test.py
@@ -47,10 +47,10 @@ class TransposeTest(test.TestCase):
     np_ans = self._np_transpose(x, perm)
     if conjugate:
       np_ans = np.conj(np_ans)
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       inx = ops.convert_to_tensor(x)
       y = array_ops.transpose(inx, p, conjugate=conjugate)
-      tf_ans = y.eval()
+      tf_ans = self.evaluate(y)
       self.assertShapeEqual(np_ans, y)
       self.assertAllEqual(np_ans, tf_ans)
 
@@ -78,10 +78,10 @@ class TransposeTest(test.TestCase):
     np_ans = self._np_transpose(x, perm)
     if conjugate:
       np_ans = np.conj(np_ans)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       inx = ops.convert_to_tensor(x)
       y = array_ops.transpose(inx, p, conjugate=conjugate)
-      tf_ans = y.eval()
+      tf_ans = self.evaluate(y)
 
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
@@ -165,10 +165,10 @@ class TransposeTest(test.TestCase):
         total_size = np.prod(input_shape)
         inp = np.arange(1, total_size + 1, dtype=datatype).reshape(input_shape)
         np_ans = self._np_transpose(inp, perm)
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           inx = ops.convert_to_tensor(inp)
           y = array_ops.transpose(inx, perm)
-          tf_ans = y.eval()
+          tf_ans = self.evaluate(y)
         self.assertAllEqual(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, y)
 
@@ -186,10 +186,10 @@ class TransposeTest(test.TestCase):
       total_size = np.prod(input_shape)
       inp = np.arange(1, total_size + 1, dtype=np.float32).reshape(input_shape)
       np_ans = self._np_transpose(inp, perm)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
@@ -221,10 +221,10 @@ class TransposeTest(test.TestCase):
       total_size = np.prod(input_shape)
       inp = np.arange(1, total_size + 1, dtype=np.float32).reshape(input_shape)
       np_ans = self._np_transpose(inp, perm)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
@@ -243,10 +243,10 @@ class TransposeTest(test.TestCase):
         total_size = np.prod(input_shape)
         inp = np.arange(1, total_size + 1, dtype=datatype).reshape(input_shape)
         np_ans = self._np_transpose(inp, perm)
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           inx = ops.convert_to_tensor(inp)
           y = array_ops.transpose(inx, perm)
-          tf_ans = y.eval()
+          tf_ans = self.evaluate(y)
         self.assertAllEqual(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, y)
 
@@ -264,10 +264,10 @@ class TransposeTest(test.TestCase):
       total_size = np.prod(input_shape)
       inp = np.arange(1, total_size + 1, dtype=np.float32).reshape(input_shape)
       np_ans = self._np_transpose(inp, perm)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
@@ -316,10 +316,10 @@ class TransposeTest(test.TestCase):
       # generate input data with random ints from 0 to 9.
       inp = np.random.randint(10, size=input_shape)
       np_ans = self._np_transpose(inp, perm)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
       self._ClearCachedSession()
@@ -337,11 +337,11 @@ class TransposeTest(test.TestCase):
       x = np.arange(0, 8).reshape([2, 4]).astype(np.float32)
       p = np.array([1, 0]).astype(perm_dtype)
       np_ans = np.copy(x).transpose(p)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(x)
         inp = constant_op.constant(p)
         y = array_ops.transpose(inx, inp)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
         self.assertShapeEqual(np_ans, y)
         self.assertAllEqual(np_ans, tf_ans)
 
@@ -414,7 +414,7 @@ class TransposeTest(test.TestCase):
   def testTranspose2DAuto(self):
     x_np = [[1, 2, 3], [4, 5, 6]]
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         x_tf = array_ops.transpose(x_np).eval()
         self.assertAllEqual(x_tf, [[1, 4], [2, 5], [3, 6]])
 
diff --git a/tensorflow/python/kernel_tests/unicode_decode_op_test.py b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c165021eea3eba54fbc77aa328acebaccd844a74
--- /dev/null
+++ b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for unicode_decode and unicode_decode_with_splits."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl as errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_string_ops
+from tensorflow.python.platform import test
+
+
+# Account for python2 and python3 execution of the test.
+def codepoint(s):
+  if isinstance(s, bytes):
+    return ord(s.decode("utf-8"))
+  elif isinstance(s, str):
+    return ord(s)
+
+
+class UnicodeDecodeTest(test.TestCase):
+
+  def testBatchDecode(self):
+    text = constant_op.constant(
+        ["仅今年前", "分享介面終於迎來更新"])
+    row_splits, utf8_text, offsets = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8")
+
+    with self.test_session():
+      self.assertAllEqual([
+          codepoint("仅"),
+          codepoint("今"),
+          codepoint("年"),
+          codepoint("前"),
+          codepoint("分"),
+          codepoint("享"),
+          codepoint("介"),
+          codepoint("面"),
+          codepoint("終"),
+          codepoint("於"),
+          codepoint("迎"),
+          codepoint("來"),
+          codepoint("更"),
+          codepoint("新")
+      ],
+                          self.evaluate(utf8_text).tolist())
+      self.assertAllEqual([0, 4, 14], self.evaluate(row_splits).tolist())
+      self.assertAllEqual([0, 3, 6, 9, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27],
+                          self.evaluate(offsets).tolist())
+
+  def testBasicDecodeWithOffset(self):
+    text = constant_op.constant(["仅今年前"])
+    row_splits, utf8_text, starts = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8")
+
+    with self.test_session():
+      self.assertAllEqual([
+          codepoint("仅"),
+          codepoint("今"),
+          codepoint("年"),
+          codepoint("前"),
+      ],
+                          self.evaluate(utf8_text).tolist())
+      self.assertAllEqual(self.evaluate(row_splits).tolist(), [0, 4])
+      self.assertAllEqual(self.evaluate(starts).tolist(), [0, 3, 6, 9])
+
+  @test_util.run_deprecated_v1
+  def testStrictError(self):
+    text = constant_op.constant([b"\xFEED"])
+    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="strict")
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      with self.test_session():
+        self.evaluate(error)
+
+  def testReplaceOnError(self):
+    text = constant_op.constant([b"\xFE"])
+
+    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="replace")
+
+    with self.test_session():
+      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [65533])
+
+  @test_util.run_deprecated_v1
+  def testBadReplacementChar(self):
+    text = constant_op.constant([b"\xFE"])
+    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="replace", replacement_char=11141111)
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      with self.test_session():
+        self.evaluate(error)
+
+  def testIgnoreOnError(self):
+    text = constant_op.constant([b"\xFEhello"])
+
+    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="ignore")
+
+    with self.test_session():
+      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [
+          codepoint("h"),
+          codepoint("e"),
+          codepoint("l"),
+          codepoint("l"),
+          codepoint("o")
+      ])
+
+  @test_util.run_deprecated_v1
+  def testBadErrorPolicy(self):
+    text = constant_op.constant(["hippopotamus"])
+
+    with self.assertRaises(ValueError):
+      _, _, _ = gen_string_ops.unicode_decode_with_offsets(
+          text, "utf-8", errors="oranguatan")
+
+  def testReplaceControlChars(self):
+    text = constant_op.constant(["\x02仅今年前"])
+    row_splits, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", replace_control_characters=True)
+
+    with self.test_session():
+      self.assertAllEqual([
+          65533,
+          codepoint("仅"),
+          codepoint("今"),
+          codepoint("年"),
+          codepoint("前"),
+      ],
+                          self.evaluate(utf8_text).tolist())
+      self.assertAllEqual([0, 5], self.evaluate(row_splits).tolist())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/unicode_encode_op_test.py b/tensorflow/python/kernel_tests/unicode_encode_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f3cd8a6577e06fc4b3de81585d8b48231ae7076
--- /dev/null
+++ b/tensorflow/python/kernel_tests/unicode_encode_op_test.py
@@ -0,0 +1,271 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for UnicodeEncode op from ragged_string_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl as errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_string_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.platform import test
+
+
+class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
+
+  def assertRaggedEqual(self, rt, expected):
+    with self.cached_session() as sess:
+      value = sess.run(rt)
+      if isinstance(value, np.ndarray):
+        value = value.tolist()
+      elif isinstance(value, ragged_tensor_value.RaggedTensorValue):
+        value = value.to_list()
+      self.assertEqual(value, expected)
+
+  def testScalar(self):
+    with self.cached_session():
+      with self.assertRaises(ValueError):
+        ragged_string_ops.unicode_encode(72, "UTF-8")
+    with self.cached_session():
+      with self.assertRaises(ValueError):
+        ragged_string_ops.unicode_encode(constant_op.constant(72), "UTF-8")
+
+  def testRequireParams(self):
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        ragged_string_ops.unicode_encode()
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        ragged_string_ops.unicode_encode(72)
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        ragged_string_ops.unicode_encode(encoding="UTF-8")
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  def testStrictErrors(self, encoding):
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    with self.cached_session() as session:
+      with self.assertRaises(errors.InvalidArgumentError):
+        session.run(
+            ragged_string_ops.unicode_encode(test_value, encoding, "strict"))
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testIgnoreErrors(self, encoding):
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    expected_value = u"Heo".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
+                                                         "ignore")
+    with self.cached_session() as session:
+      result = session.run(unicode_encode_op)
+      self.assertIsInstance(result, bytes)
+      self.assertAllEqual(result, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testReplaceErrors(self, encoding):
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    expected_value = u"He\U0000fffd\U0000fffdo".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
+                                                         "replace")
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    # Test custom replacement character
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    expected_value = u"Heooo".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
+                                                         "replace", 111)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    # Verify "replace" is default
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    expected_value = u"He\U0000fffd\U0000fffdo".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    # Replacement_char must be within range
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
+                                                         "replace", 1114112)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(unicode_encode_op)
+
+  # -- regular Tensor tests -- #
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testVector(self, encoding):
+    test_value = np.array([72, 101, 108, 108, 111], np.int32)
+    expected_value = u"Hello".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    test_value = np.array([72, 101, 195, 195, 128516], np.int32)
+    expected_value = u"He\xc3\xc3\U0001f604".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    # Single character string
+    test_value = np.array([72], np.int32)
+    expected_value = u"H".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    test_value = np.array([128516], np.int32)
+    expected_value = u"\U0001f604".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testMatrix(self, encoding):
+    test_value = np.array(
+        [[72, 128516, 108, 108, 111], [87, 128516, 114, 108, 100]], np.int32)
+    expected_value = [
+        u"H\U0001f604llo".encode(encoding), u"W\U0001f604rld".encode(encoding)
+    ]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertAllEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test3DimMatrix(self, encoding):
+    test_value = constant_op.constant(
+        [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
+         [[102, 105, 120, 101, 100], [119, 111, 114, 100, 115]],
+         [[72, 121, 112, 101, 114], [99, 117, 98, 101, 46]]], np.int32)
+    expected_value = [[u"Hello".encode(encoding), u"World".encode(encoding)],
+                      [u"fixed".encode(encoding), u"words".encode(encoding)],
+                      [u"Hyper".encode(encoding), u"cube.".encode(encoding)]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test4DimMatrix(self, encoding):
+    test_value = constant_op.constant(
+        [[[[72, 101, 108, 108, 111]], [[87, 111, 114, 108, 100]]],
+         [[[102, 105, 120, 101, 100]], [[119, 111, 114, 100, 115]]],
+         [[[72, 121, 112, 101, 114]], [[99, 117, 98, 101, 46]]]], np.int32)
+    expected_value = [[[u"Hello".encode(encoding)],
+                       [u"World".encode(encoding)]],
+                      [[u"fixed".encode(encoding)],
+                       [u"words".encode(encoding)]],
+                      [[u"Hyper".encode(encoding)],
+                       [u"cube.".encode(encoding)]]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  # -- Ragged Tensor tests -- #
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testRaggedMatrix(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[72, 195, 108, 108, 111], [87, 128516, 114, 108, 100, 46]], np.int32)
+    expected_value = [
+        u"H\xc3llo".encode(encoding), u"W\U0001f604rld.".encode(encoding)
+    ]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test3DimMatrixWithRagged2ndDim(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
+         [[102, 105, 120, 101, 100]],
+         [[72, 121, 112, 101, 114], [119, 111, 114, 100, 115],
+          [99, 117, 98, 101, 46]]], np.int32)
+    expected_value = [[u"Hello".encode(encoding), u"World".encode(encoding)],
+                      [u"fixed".encode(encoding)],
+                      [
+                          u"Hyper".encode(encoding), u"words".encode(encoding),
+                          u"cube.".encode(encoding)
+                      ]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test3DimMatrixWithRagged3rdDim(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]],
+         [[68, 111, 110, 39, 116], [119, 195, 114, 114, 121, 44, 32, 98, 101]],
+         [[128516], []]], np.int32)
+    expected_value = [[u"Hello".encode(encoding), u"World.".encode(encoding)],
+                      [
+                          u"Don't".encode(encoding),
+                          u"w\xc3rry, be".encode(encoding)
+                      ], [u"\U0001f604".encode(encoding), u"".encode(encoding)]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test3DimMatrixWithRagged2ndAnd3rdDim(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]], [],
+         [[128516]]], np.int32)
+    expected_value = [[u"Hello".encode(encoding), u"World.".encode(encoding)],
+                      [], [u"\U0001f604".encode(encoding)]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test4DimRaggedMatrix(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]]],
+         [[[]], [[72, 121, 112, 101]]]], np.int32)
+    expected_value = [[[u"Hello".encode(encoding), u"World".encode(encoding)]],
+                      [[u"".encode(encoding)], [u"Hype".encode(encoding)]]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testRaggedMatrixWithMultiDimensionInnerValues(self, encoding):
+    test_flat_values = constant_op.constant([[[72, 101, 108, 108, 111],
+                                              [87, 111, 114, 108, 100]],
+                                             [[102, 105, 120, 101, 100],
+                                              [119, 111, 114, 100, 115]],
+                                             [[72, 121, 112, 101, 114],
+                                              [99, 117, 98, 101, 46]]])
+    test_row_splits = [
+        constant_op.constant([0, 2, 3], dtype=np.int64),
+        constant_op.constant([0, 1, 1, 3], dtype=np.int64)
+    ]
+    test_value = ragged_tensor.RaggedTensor.from_nested_row_splits(
+        test_flat_values, test_row_splits)
+    expected_value = [[[[u"Hello".encode(encoding), u"World".encode(encoding)]],
+                       []],
+                      [[[u"fixed".encode(encoding), u"words".encode(encoding)],
+                        [u"Hyper".encode(encoding),
+                         u"cube.".encode(encoding)]]]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/unicode_script_op_test.py b/tensorflow/python/kernel_tests/unicode_script_op_test.py
index 927e5459ed2cf56c6adc59323ef4e3a33eeb5dc7..83cfeb20216455a5fc11177991ef8aa7c5c44703 100644
--- a/tensorflow/python/kernel_tests/unicode_script_op_test.py
+++ b/tensorflow/python/kernel_tests/unicode_script_op_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
 class UnicodeScriptOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testValidScripts(self):
     inputs = [
         ord("a"),
@@ -45,6 +47,7 @@ class UnicodeScriptOpTest(test.TestCase):
               0  # USCRIPT_COMMON (ZYYY)
           ])
 
+  @test_util.run_deprecated_v1
   def testInvalidScript(self):
     inputs = [-100, 0xffffff]
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/unicode_transcode_op_test.py b/tensorflow/python/kernel_tests/unicode_transcode_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3b4fd03474010b06009c52ad3afabf3e31ca024
--- /dev/null
+++ b/tensorflow/python/kernel_tests/unicode_transcode_op_test.py
@@ -0,0 +1,444 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for unicode_transcode op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+
+
+# Note: for now only tests for algorithmic converters since no file-based
+# converters can be loaded. TODO(gbillock): add ability to include at least
+# the ucmcore converters from the conversion data sets.
+class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
+
+  def test_transcode_utf8_simple(self):
+    strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]
+
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          strings,
+          input_encoding="UTF-8",
+          output_encoding="UTF-8",
+          errors="replace",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, strings)
+
+      outputs = string_ops.unicode_transcode(
+          strings,
+          input_encoding="ISO-8859-1",
+          output_encoding="UTF-8",
+          errors="replace",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, strings)
+
+      outputs = string_ops.unicode_transcode(
+          strings,
+          input_encoding="US-ASCII",
+          output_encoding="UTF-8",
+          errors="replace",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, strings)
+
+  def test_transcode_utf16_to_utf8(self):
+    strings = [b"\x00a\x00b\x20\xAC", b"\xD8\x01\xDC\x37"]  # U+10437
+    expected = [s.decode("UTF-16-BE").encode("UTF-8") for s in strings]
+
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          strings,
+          input_encoding="UTF-16",
+          output_encoding="UTF-8",
+          errors="replace",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, expected)
+
+  def test_transcode_bad_utf8(self):
+    bad_string = b"\x00\xff"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bad_string,
+          input_encoding="UTF-8",
+          output_encoding="UTF-8",
+          errors="replace",
+          replacement_char=ord(" "),
+          replace_control_characters=True)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"  ")
+
+      outputs = string_ops.unicode_transcode(
+          bad_string,
+          input_encoding="UTF-8",
+          output_encoding="UTF-8",
+          errors="replace",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"\x00 ")
+
+  def test_transcode_bad_utf8_with_some_good(self):
+    bad_string = b"abc\xffabcdefg"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bad_string,
+          input_encoding="UTF-8",
+          output_encoding="UTF-8",
+          errors="replace",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"abc abcdefg")
+
+  def test_transcode_bad_utf8_with_defaults(self):
+    bad_string = b"\x00\xff"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bad_string, input_encoding="UTF-8", output_encoding="UTF-8")
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"\x00\xef\xbf\xbd")
+
+  def test_transcode_bad_utf8_with_space_replacement(self):
+    bad_string = b"\x00\xff"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bad_string, input_encoding="UTF-8", output_encoding="UTF-8",
+          replacement_char=ord(" "))
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"\x00 ")
+
+  @test_util.run_deprecated_v1
+  def test_transcode_bad_utf8_with_strict_errors(self):
+    bad_string = b"\x00\xff"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bad_string,
+          input_encoding="UTF-8",
+          output_encoding="UTF-8",
+          errors="strict")
+      with self.assertRaisesOpError(
+          "Invalid formatting on input string"):
+        self.evaluate(outputs)
+
+  @test_util.run_deprecated_v1
+  def test_transcode_bad_utf8_start_with_strict_errors(self):
+    bad_string = b"\xffabcd"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bad_string,
+          input_encoding="UTF-8",
+          output_encoding="UTF-8",
+          errors="strict")
+      with self.assertRaisesOpError(
+          "Invalid formatting on input string"):
+        self.evaluate(outputs)
+
+  def test_transcode_bad_utf8_with_elision_of_malformatting(self):
+    bad_string = b"\x00\xff"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bad_string,
+          input_encoding="UTF-8",
+          output_encoding="UTF-8",
+          errors="ignore")
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"\x00")
+
+  def test_transcode_bad_utf8_with_elision_including_control_chars(self):
+    bad_string = b"\x00\xff"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bad_string,
+          input_encoding="UTF-8",
+          output_encoding="UTF-8",
+          errors="ignore",
+          replace_control_characters=True)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"")
+
+  def test_transcode_bad_utf8_termination_with_defaults(self):
+    bad_string = b"a\xf0"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bad_string, input_encoding="UTF-8", output_encoding="UTF-8")
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"a\xef\xbf\xbd")   # 0xFFFD
+
+  def test_transcode_utf8_with_replacement_char(self):
+    strings = [b"a\xef\xbf\xbd"]
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          strings, input_encoding="UTF-8", output_encoding="UTF-8",
+          errors="strict")
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, [b"a\xef\xbf\xbd"])
+
+      outputs = string_ops.unicode_transcode(
+          strings, input_encoding="UTF-8", output_encoding="UTF-8",
+          errors="replace", replacement_char=ord("?"))
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, [b"a\xef\xbf\xbd"])
+
+  def test_transcode_utf8_to_utf16(self):
+    strings = [b"ab\xe2\x82\xac", b"\xf0\x90\x90\xb7"]  # U+10437
+    expected = [s.decode("UTF-8").encode("UTF-16-BE") for s in strings]
+
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          strings,
+          input_encoding="UTF-8",
+          output_encoding="UTF-16-BE",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      print("values=", values)
+      self.assertAllEqual(values, expected)
+
+  def test_transcode_utf32_to_utf8(self):
+    strings = [
+        b"\x00\x00\x00a\x00\x00\x00b\x00\x00\x20\xAC", b"\x00\x01\x04\x37"
+    ]  # U+10437
+    expected = [s.decode("UTF-32-BE").encode("UTF-8") for s in strings]
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          strings,
+          input_encoding="UTF-32",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, expected)
+
+  def test_transcode_utf8_to_utf32(self):
+    strings = [b"ab\xe2\x82\xac", b"\xf0\x90\x90\xb7"]
+    expected = [s.decode("UTF-8").encode("UTF-32-BE") for s in strings]
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          strings,
+          input_encoding="UTF-8",
+          output_encoding="UTF-32-BE",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, expected)
+
+  # Documentation in ICU suggests that getNextUChar may produce a different
+  # error code if the input sequence contains particular non-coding sequences.
+  # This test checks that condition.
+  def test_transcode_ascii_with_shift_chars(self):
+    strings = [b"\x0e\x0e", b"\x0f\x0f"]
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          strings,
+          input_encoding="US-ASCII",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, strings)
+
+  def test_transcode_utf8_with_bom(self):
+    bom_string = b"\xef\xbb\xbfabcdefg"
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bom_string, input_encoding="UTF-8", output_encoding="UTF-8")
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"\xef\xbb\xbfabcdefg")  # BOM preserved
+
+      outputs = string_ops.unicode_transcode(
+          bom_string, input_encoding="UTF-8", output_encoding="UTF-16-BE")
+      values = self.evaluate(outputs)
+      utf16expected = bom_string.decode("UTF-8").encode("UTF-16-BE")
+      self.assertAllEqual(values, utf16expected)
+
+  def test_transcode_utf16_le_be_with_bom(self):
+    bom_string = b"\xfe\xff\x00\x61"  # Big-endian BOM with 'a' encoded
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          bom_string, input_encoding="UTF-16-BE", output_encoding="UTF-8")
+      values = self.evaluate(outputs)
+      # BOM is preserved in output
+      self.assertAllEqual(values, b"\xef\xbb\xbfa")
+
+      outputs = string_ops.unicode_transcode(
+          bom_string, input_encoding="UTF-16-LE", output_encoding="UTF-8")
+      values = self.evaluate(outputs)
+      # mangled BOM and value from (incorrect) LE encoding
+      self.assertAllEqual(values, b"\xef\xbf\xbe\xe6\x84\x80")
+
+      bom_string = b"\xff\xfe\x61\x00"  # Little-endian BOM with 'a' encoded
+      outputs = string_ops.unicode_transcode(
+          bom_string, input_encoding="UTF-16-LE", output_encoding="UTF-8")
+      values = self.evaluate(outputs)
+      self.assertAllEqual(values, b"\xef\xbb\xbfa")
+
+  @parameterized.parameters(
+      # BOM is stripped if it is used to decide the byte order of the input.
+      (b"\xfe\xff\x00*", "UTF-16", b"*"),
+      (b"\xff\xfe*\x00", "UTF-16", b"*"),
+      # BOM is *not* stripped if it is not used to decide the byte order of
+      # the input.
+      (b"\xef\xbb\xbf*", "UTF-8", b"\xef\xbb\xbf*"),
+      (b"\xfe\xff\x00*", "UTF-16-BE", b"\xef\xbb\xbf*"),
+      (b"\xff\xfe*\x00", "UTF-16-LE", b"\xef\xbb\xbf*"),
+      # If the encoding is UTF-16, and no BOM is present, then UTF-16-BE
+      # is assumed.
+      (b"\x00*", "UTF-16", b"*"),
+      # BOM is never stripped from any position other than the beginning of
+      # the string, for any encoding.
+      (b"<\xef\xbb\xbf>", "UTF-8", b"<\xef\xbb\xbf>"),
+      (b"\x00<\xfe\xff\x00>", "UTF-16", b"<\xef\xbb\xbf>"),
+      (b"\x00<\xfe\xff\x00>", "UTF-16-BE", b"<\xef\xbb\xbf>"),
+      (b"<\x00\xff\xfe>\x00", "UTF-16-LE", b"<\xef\xbb\xbf>"),
+      (b"\xfe\xff\x00<\xfe\xff\x00>", "UTF-16", b"<\xef\xbb\xbf>"),
+      (b"\xff\xfe<\x00\xff\xfe>\x00", "UTF-16", b"<\xef\xbb\xbf>"),
+  )
+  @test_util.run_deprecated_v1
+  def test_bom_handling(self, string, input_encoding, expected):
+    with self.test_session():
+      output = string_ops.unicode_transcode(
+          string, input_encoding=input_encoding, output_encoding="UTF-8")
+      self.assertAllEqual(output.eval(), expected)
+
+  @test_util.run_deprecated_v1
+  def test_invalid_encoding_causes_errors(self):
+    strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]
+
+    with self.cached_session() as sess:
+      outputs = string_ops.unicode_transcode(
+          strings,
+          input_encoding="invalid",
+          output_encoding="UTF-8",
+          errors="replace",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+      with self.assertRaisesOpError(
+          "Could not create converter for input encoding: invalid"):
+        self.evaluate(outputs)
+
+    with self.assertRaisesRegexp(ValueError, "Op passed string 'invalid'"):
+      with self.cached_session() as sess:
+        outputs = string_ops.unicode_transcode(
+            strings,
+            input_encoding="UTF-8",
+            output_encoding="invalid",
+            errors="replace",
+            replacement_char=ord(" "),
+            replace_control_characters=False)
+        self.evaluate(outputs)
+
+  @test_util.run_deprecated_v1
+  def test_invalid_error_policy_causes_errors(self):
+    strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]
+
+    with self.assertRaisesRegexp(
+        ValueError, "'invalid' not in: \"strict\", \"replace\", \"ignore\"."):
+      with self.cached_session() as sess:
+        outputs = string_ops.unicode_transcode(
+            strings,
+            input_encoding="UTF-8",
+            output_encoding="UTF-8",
+            errors="invalid",
+            replacement_char=ord(" "),
+            replace_control_characters=False)
+        self.evaluate(outputs)
+
+  def test_forwarding(self):
+    with self.cached_session():
+      # Generate an input that is uniquely consumed by the transcode op.
+      # This exercises code paths which are optimized for this case
+      # (e.g., using forwarding).
+      inp = string_ops.substr(
+          constant_op.constant([b"AbCdEfG", b"HiJkLmN"], dtypes.string),
+          pos=0,
+          len=5)
+      transcoded = string_ops.unicode_transcode(
+          inp, input_encoding="UTF-8", output_encoding="UTF-8")
+
+      self.assertAllEqual([b"AbCdE", b"HiJkL"], transcoded)
+
+  @test_util.run_deprecated_v1
+  def test_cjk_encodings(self):
+    strings_ja = [
+        b"\x5c\x5c",  # Yen sign
+        b"\x8f\x70",  # kanji character "waza"
+        b"\x83\x4f"
+    ]  # katakana character "gu"
+    strings_zh_cn = [b"\xca\xf5"]  # simplified "shu4"
+    strings_zh_tw = [b"\xb3\x4e"]  # traditional "shu4"
+    strings_ko = [b"\xc7\xd1\xb9\xce"]  # hangul "hanmin"
+
+    expected_ja = [s.decode("shift_jis").encode("UTF-8") for s in strings_ja]
+    expected_zh_cn = [
+        s.decode("gb18030").encode("UTF-8") for s in strings_zh_cn
+    ]
+    expected_zh_tw = [s.decode("big5").encode("UTF-8") for s in strings_zh_tw]
+    expected_ko = [s.decode("euc_kr").encode("UTF-8") for s in strings_ko]
+
+    with self.cached_session() as sess:
+      outputs_ja = string_ops.unicode_transcode(
+          strings_ja,
+          input_encoding="shift_jis",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      outputs_zh_cn = string_ops.unicode_transcode(
+          strings_zh_cn,
+          input_encoding="gb18030",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      outputs_zh_tw = string_ops.unicode_transcode(
+          strings_zh_tw,
+          input_encoding="big5",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      outputs_ko = string_ops.unicode_transcode(
+          strings_ko,
+          input_encoding="euc_kr",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      result_ja, result_zh_cn, result_zh_tw, result_ko = sess.run(
+          [outputs_ja, outputs_zh_cn, outputs_zh_tw, outputs_ko])
+
+      self.assertAllEqual(result_ja, expected_ja)
+      self.assertAllEqual(result_zh_cn, expected_zh_cn)
+      self.assertAllEqual(result_zh_tw, expected_zh_tw)
+      self.assertAllEqual(result_ko, expected_ko)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index 316570e13e263a1a0b7bcef9c64448d58bb747af..f203263e0c567bb43ce1cb997bd343774d083d43 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -32,7 +32,7 @@ class UniqueTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx = array_ops.unique(x)
-      tf_y, tf_idx = sess.run([y, idx])
+      tf_y, tf_idx = self.evaluate([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -43,7 +43,7 @@ class UniqueTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx = array_ops.unique(x, out_idx=dtypes.int64)
-      tf_y, tf_idx = sess.run([y, idx])
+      tf_y, tf_idx = self.evaluate([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -55,7 +55,7 @@ class UniqueTest(test.TestCase):
     x = [chr(i) for i in indx]
     with self.cached_session() as sess:
       y, idx = array_ops.unique(x)
-      tf_y, tf_idx = sess.run([y, idx])
+      tf_y, tf_idx = self.evaluate([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -67,9 +67,9 @@ class UniqueTest(test.TestCase):
       x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
       with self.cached_session() as sess:
         y0, idx0 = gen_array_ops.unique_v2(x, axis=np.array([0], dtype))
-        tf_y0, tf_idx0 = sess.run([y0, idx0])
+        tf_y0, tf_idx0 = self.evaluate([y0, idx0])
         y1, idx1 = gen_array_ops.unique_v2(x, axis=np.array([1], dtype))
-        tf_y1, tf_idx1 = sess.run([y1, idx1])
+        tf_y1, tf_idx1 = self.evaluate([y1, idx1])
       self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
       self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
       self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]]))
@@ -81,7 +81,7 @@ class UniqueTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx = gen_array_ops.unique_v2(x, axis=np.array([], np.int32))
-      tf_y, tf_idx = sess.run([y, idx])
+      tf_y, tf_idx = self.evaluate([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -95,7 +95,7 @@ class UniqueWithCountsTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx, count = array_ops.unique_with_counts(x)
-      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+      tf_y, tf_idx, tf_count = self.evaluate([y, idx, count])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -108,7 +108,7 @@ class UniqueWithCountsTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx, count = array_ops.unique_with_counts(x, out_idx=dtypes.int64)
-      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+      tf_y, tf_idx, tf_count = self.evaluate([y, idx, count])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -123,7 +123,7 @@ class UniqueWithCountsTest(test.TestCase):
 
     with self.cached_session() as sess:
       y, idx, count = array_ops.unique_with_counts(x)
-      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+      tf_y, tf_idx, tf_count = self.evaluate([y, idx, count])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -139,10 +139,10 @@ class UniqueWithCountsTest(test.TestCase):
       with self.cached_session() as sess:
         y0, idx0, count0 = gen_array_ops.unique_with_counts_v2(
             x, axis=np.array([0], dtype))
-        tf_y0, tf_idx0, tf_count0 = sess.run([y0, idx0, count0])
+        tf_y0, tf_idx0, tf_count0 = self.evaluate([y0, idx0, count0])
         y1, idx1, count1 = gen_array_ops.unique_with_counts_v2(
             x, axis=np.array([1], dtype))
-        tf_y1, tf_idx1, tf_count1 = sess.run([y1, idx1, count1])
+        tf_y1, tf_idx1, tf_count1 = self.evaluate([y1, idx1, count1])
       self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
       self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
       self.assertAllEqual(tf_count0, np.array([2, 1]))
@@ -157,7 +157,7 @@ class UniqueWithCountsTest(test.TestCase):
     with self.cached_session() as sess:
       y, idx, count = gen_array_ops.unique_with_counts_v2(
           x, axis=np.array([], np.int32))
-      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+      tf_y, tf_idx, tf_count = self.evaluate([y, idx, count])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
diff --git a/tensorflow/python/kernel_tests/unstack_op_test.py b/tensorflow/python/kernel_tests/unstack_op_test.py
index b373c419b648bc6f154787d8f4ceab33ae13cf55..f5ba475e7adabc9bb5b057504ad854f550395440 100644
--- a/tensorflow/python/kernel_tests/unstack_op_test.py
+++ b/tensorflow/python/kernel_tests/unstack_op_test.py
@@ -41,7 +41,7 @@ class UnstackOpTest(test.TestCase):
 
   def testSimple(self):
     np.random.seed(7)
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         for dtype in [
             np.bool, np.float16, np.float32, np.float64, np.int32, np.int64
@@ -53,14 +53,15 @@ class UnstackOpTest(test.TestCase):
           cs = array_ops.unstack(x, num=shape[0])
           self.assertEqual(type(cs), list)
           self.assertEqual(len(cs), shape[0])
-          cs = [c.eval() for c in cs]
+          cs = [self.evaluate(c) for c in cs]
           self.assertAllEqual(cs, data)
 
   def testSimpleGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest('No GPU available')
+
     np.random.seed(7)
-    with self.test_session(use_gpu=True, force_gpu=True):
+    with test_util.force_gpu():
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
@@ -70,34 +71,37 @@ class UnstackOpTest(test.TestCase):
           cs = array_ops.unstack(x, num=shape[0])
           self.assertEqual(type(cs), list)
           self.assertEqual(len(cs), shape[0])
-          cs = [c.eval() for c in cs]
+          cs = [self.evaluate(c) for c in cs]
           self.assertAllEqual(cs, data)
 
+  @test_util.run_deprecated_v1
   def testGradientsAxis0(self):
     for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
       data = np.random.randn(*shape)
       shapes = [shape[1:]] * shape[0]
       for i in xrange(shape[0]):
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           x = constant_op.constant(data)
           cs = array_ops.unstack(x, num=shape[0])
           err = gradient_checker.compute_gradient_error(x, shape, cs[i],
                                                         shapes[i])
           self.assertLess(err, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testGradientsAxis1(self):
     for shape in (2, 3), (3, 2), (4, 3, 2):
       data = np.random.randn(*shape)
       out_shape = list(shape)
       del out_shape[1]
       for i in xrange(shape[1]):
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           x = constant_op.constant(data)
           cs = array_ops.unstack(x, num=shape[1], axis=1)
           err = gradient_checker.compute_gradient_error(x, shape, cs[i],
                                                         out_shape)
           self.assertLess(err, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testInferNum(self):
     with self.cached_session():
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
@@ -106,20 +110,23 @@ class UnstackOpTest(test.TestCase):
         self.assertEqual(type(cs), list)
         self.assertEqual(len(cs), shape[0])
 
+  @test_util.run_deprecated_v1
   def testCannotInferNumFromUnknownShape(self):
     x = array_ops.placeholder(np.float32)
     with self.assertRaisesRegexp(ValueError,
                                  r'Cannot infer num from shape <unknown>'):
       array_ops.unstack(x)
 
+  @test_util.run_deprecated_v1
   def testUnknownShapeOkWithNum(self):
     x = array_ops.placeholder(np.float32)
     array_ops.unstack(x, num=2)
 
+  @test_util.run_deprecated_v1
   def testCannotInferNumFromNoneShape(self):
     x = array_ops.placeholder(np.float32, shape=(None,))
     with self.assertRaisesRegexp(ValueError,
-                                 r'Cannot infer num from shape \(\?,\)'):
+                                 r'Cannot infer num from shape \((\?|None),\)'):
       array_ops.unstack(x)
 
   def testAgainstNumpy(self):
@@ -131,15 +138,13 @@ class UnstackOpTest(test.TestCase):
       for j in range(-i, i):
         expected = np_split_squeeze(a, j)
 
-        with self.cached_session() as sess:
-          actual_unstack = sess.run(array_ops.unstack(a, axis=j))
+        actual_unstack = self.evaluate(array_ops.unstack(a, axis=j))
 
         self.assertAllEqual(expected, actual_unstack)
 
   def testAxis0Default(self):
-    with self.cached_session() as sess:
-      a = constant_op.constant([[1, 2, 3], [4, 5, 6]], name='a')
-      unstacked = sess.run(array_ops.unstack(a))
+    a = constant_op.constant([[1, 2, 3], [4, 5, 6]], name='a')
+    unstacked = self.evaluate(array_ops.unstack(a))
 
     self.assertEqual(len(unstacked), 2)
     self.assertAllEqual(unstacked[0], [1, 2, 3])
@@ -156,10 +161,9 @@ class UnstackOpTest(test.TestCase):
       array_ops.unstack(a, axis=-3)
 
   def testZeroLengthDim(self):
-    with self.cached_session():
-      x = array_ops.zeros(shape=(0, 1, 2))
-      y = array_ops.unstack(x, axis=1)[0].eval()
-      self.assertEqual(y.shape, (0, 2))
+    x = array_ops.zeros(shape=(0, 1, 2))
+    y = self.evaluate(array_ops.unstack(x, axis=1)[0])
+    self.assertEqual(y.shape, (0, 2))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/variable_ops_test.py b/tensorflow/python/kernel_tests/variable_ops_test.py
index 3d2f8b61555f277cd67d65b27c43b81c2a45538e..0f3e261992537f6d57a2a6d7234ab255fe55e79c 100644
--- a/tensorflow/python/kernel_tests/variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/variable_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
@@ -46,7 +47,7 @@ class VariableOpTest(test.TestCase):
       p = state_ops.variable_op(x.shape, tftype)
       op = state_ops.assign(p, x)
       op.op.run()
-      return p.eval()
+      return self.evaluate(p)
 
   def _testTypes(self, vals):
     for dtype in [np.float32, np.float64, np.int32, np.int64]:
@@ -59,15 +60,18 @@ class VariableOpTest(test.TestCase):
       # that Variable and Assign have GPU implementations for matching tf.
       self.assertAllEqual(x, self._initFetch(x, tftype, use_gpu=True))
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     self._testTypes(np.arange(0, 20).reshape([4, 5]))
 
+  @test_util.run_deprecated_v1
   def testset_shape(self):
     p = state_ops.variable_op([1, 2], dtypes.float32)
     self.assertEqual([1, 2], p.get_shape())
     p = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
     self.assertEqual(tensor_shape.unknown_shape(), p.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssign(self):
     value = np.array([[42.0, 43.0]])
     var = state_ops.variable_op(value.shape, dtypes.float32)
@@ -75,6 +79,7 @@ class VariableOpTest(test.TestCase):
     assigned = state_ops.assign(var, value)
     self.assertShapeEqual(value, assigned)
 
+  @test_util.run_deprecated_v1
   def testAssignNoValidateShape(self):
     value = np.array([[42.0, 43.0]])
     var = state_ops.variable_op(value.shape, dtypes.float32)
@@ -82,6 +87,7 @@ class VariableOpTest(test.TestCase):
     assigned = state_ops.assign(var, value, validate_shape=False)
     self.assertShapeEqual(value, assigned)
 
+  @test_util.run_deprecated_v1
   def testAssignNoVarShape(self):
     value = np.array([[42.0, 43.0]])
     var = state_ops.variable_op(value.shape, dtypes.float32, set_shape=False)
@@ -89,6 +95,7 @@ class VariableOpTest(test.TestCase):
     assigned = state_ops.assign(var, value)
     self.assertShapeEqual(value, assigned)
 
+  @test_util.run_deprecated_v1
   def testAssignNoVarShapeNoValidateShape(self):
     value = np.array([[42.0, 43.0]])
     var = state_ops.variable_op(value.shape, dtypes.float32, set_shape=False)
@@ -101,6 +108,7 @@ class VariableOpTest(test.TestCase):
     self.assertEqual(tensor_shape.unknown_shape(), tensor.get_shape())
     return tensor
 
+  @test_util.run_deprecated_v1
   def testAssignNoValueShape(self):
     value = self._NewShapelessTensor()
     shape = [1, 2]
@@ -109,6 +117,7 @@ class VariableOpTest(test.TestCase):
     self.assertEqual(shape, var.get_shape())
     self.assertEqual(shape, assigned.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignNoValueShapeNoValidateShape(self):
     value = self._NewShapelessTensor()
     shape = [1, 2]
@@ -117,6 +126,7 @@ class VariableOpTest(test.TestCase):
     assigned = state_ops.assign(var, value, validate_shape=False)
     self.assertEqual(tensor_shape.unknown_shape(), assigned.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignNoShape(self):
     with self.cached_session():
       value = self._NewShapelessTensor()
@@ -125,6 +135,7 @@ class VariableOpTest(test.TestCase):
       self.assertEqual(tensor_shape.unknown_shape(),
                        state_ops.assign(var, value).get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignNoShapeNoValidateShape(self):
     with self.cached_session():
       value = self._NewShapelessTensor()
@@ -135,6 +146,7 @@ class VariableOpTest(test.TestCase):
           state_ops.assign(
               var, value, validate_shape=False).get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignUpdate(self):
     var = state_ops.variable_op([1, 2], dtypes.float32)
     added = state_ops.assign_add(var, [[2.0, 3.0]])
@@ -142,6 +154,7 @@ class VariableOpTest(test.TestCase):
     subbed = state_ops.assign_sub(var, [[12.0, 13.0]])
     self.assertEqual([1, 2], subbed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignUpdateNoVarShape(self):
     var = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
     added = state_ops.assign_add(var, [[2.0, 3.0]])
@@ -149,6 +162,7 @@ class VariableOpTest(test.TestCase):
     subbed = state_ops.assign_sub(var, [[12.0, 13.0]])
     self.assertEqual([1, 2], subbed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignUpdateNoValueShape(self):
     var = state_ops.variable_op([1, 2], dtypes.float32)
     added = state_ops.assign_add(var, self._NewShapelessTensor())
@@ -156,6 +170,7 @@ class VariableOpTest(test.TestCase):
     subbed = state_ops.assign_sub(var, self._NewShapelessTensor())
     self.assertEqual([1, 2], subbed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignUpdateNoShape(self):
     var = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
     added = state_ops.assign_add(var, self._NewShapelessTensor())
@@ -163,24 +178,27 @@ class VariableOpTest(test.TestCase):
     subbed = state_ops.assign_sub(var, self._NewShapelessTensor())
     self.assertEqual(tensor_shape.unknown_shape(), subbed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testTemporaryVariable(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="foo")
       var = state_ops.assign(var, [[4.0, 5.0]])
       var = state_ops.assign_add(var, [[6.0, 7.0]])
       final = gen_state_ops.destroy_temporary_variable(var, var_name="foo")
-      self.assertAllClose([[10.0, 12.0]], final.eval())
+      self.assertAllClose([[10.0, 12.0]], self.evaluate(final))
 
+  @test_util.run_deprecated_v1
   def testDestroyNonexistentTemporaryVariable(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var = gen_state_ops.temporary_variable([1, 2], dtypes.float32)
       final = gen_state_ops.destroy_temporary_variable(var, var_name="bad")
       with self.assertRaises(errors.NotFoundError):
-        final.eval()
+        self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testDuplicateTemporaryVariable(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var1 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="dup")
       var1 = state_ops.assign(var1, [[1.0, 2.0]])
@@ -189,48 +207,53 @@ class VariableOpTest(test.TestCase):
       var2 = state_ops.assign(var2, [[3.0, 4.0]])
       final = var1 + var2
       with self.assertRaises(errors.AlreadyExistsError):
-        final.eval()
+        self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testDestroyTemporaryVariableTwice(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var = gen_state_ops.temporary_variable([1, 2], dtypes.float32)
       val1 = gen_state_ops.destroy_temporary_variable(var, var_name="dup")
       val2 = gen_state_ops.destroy_temporary_variable(var, var_name="dup")
       final = val1 + val2
       with self.assertRaises(errors.NotFoundError):
-        final.eval()
+        self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testTemporaryVariableNoLeak(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="bar")
       final = array_ops.identity(var)
-      final.eval()
+      self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testTwoTemporaryVariablesNoLeaks(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var1 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="var1")
       var2 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="var2")
       final = var1 + var2
-      final.eval()
+      self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testAssignDependencyAcrossDevices(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       # The variable and an op to increment it are on the GPU.
       var = state_ops.variable_op([1], dtypes.float32)
-      state_ops.assign(var, [1.0]).eval()
+      self.evaluate(state_ops.assign(var, [1.0]))
       increment = state_ops.assign_add(var, [1.0])
       with ops.control_dependencies([increment]):
-        with ops.device("/cpu:0"):
+        with test_util.force_cpu():
           # This mul op is pinned to the CPU, but reads the variable from the
           # GPU. The test ensures that the dependency on 'increment' is still
           # honored, i.e., the Send and Recv from GPU to CPU should take place
           # only after the increment.
           result = math_ops.multiply(var, var)
-      self.assertAllClose([4.0], result.eval())
+      self.assertAllClose([4.0], self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def testIsVariableInitialized(self):
     for use_gpu in [True, False]:
       with self.test_session(use_gpu=use_gpu):
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 33f464fb90c4bfd03e1890d12453411530456888..451eb3853062203a190def09f432f9d9e12f2edd 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -25,6 +25,7 @@ import numpy
 
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
+from tensorflow.python.eager import wrap_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -44,6 +45,30 @@ from tensorflow.python.util import compat
 from tensorflow.python.util import tf_inspect
 
 
+def run_inside_wrap_function_in_eager_mode(graph_function):
+  """Decorator to execute the same graph code in eager and graph modes.
+
+  In graph mode, we just execute the graph_function passed as argument. In eager
+  mode, we wrap the function using wrap_function and then execute the wrapped
+  result.
+
+  Args:
+    graph_function: python function containing graph code to be wrapped
+
+  Returns:
+    decorated function
+  """
+  def wrap_and_execute(self):
+    if context.executing_eagerly():
+      wrapped = wrap_function.wrap_function(graph_function, [self])
+      # use the wrapped graph function
+      wrapped()
+    else:
+      # use the original function
+      graph_function(self)
+  return wrap_and_execute
+
+
 class VariableScopeTest(test.TestCase):
 
   def tearDown(self):
@@ -52,6 +77,8 @@ class VariableScopeTest(test.TestCase):
     # involving objects with __del__ defined.
     self.assertEqual(0, len(gc.garbage))
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testGetVar(self):
     vs = variable_scope._get_default_variable_store()
     v = vs.get_variable("v", [1])
@@ -59,11 +86,14 @@ class VariableScopeTest(test.TestCase):
     self.assertEqual(v, v1)
 
   @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testResource(self):
     vs = variable_scope._get_default_variable_store()
     v1 = vs.get_variable("v", [1], use_resource=True)
     self.assertTrue(isinstance(v1, resource_variable_ops.ResourceVariable))
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testNameExists(self):
     vs = variable_scope._get_default_variable_store()
     # No check by default, so we can both create and get existing names.
@@ -80,6 +110,8 @@ class VariableScopeTest(test.TestCase):
     with self.assertRaises(ValueError):
       vs.get_variable("u", [1], reuse=True)  # That fails.
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testNamelessStore(self):
     vs = variable_scope._get_default_variable_store()
     vs.get_variable("v1", [2])
@@ -88,6 +120,9 @@ class VariableScopeTest(test.TestCase):
     self.assertEqual(
         set(expected_names), set([v.name for v in vs._vars.values()]))
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # TypeError: Expected tf.group() expected Tensor arguments not 'None' with
+  # type '<type 'NoneType'>'
   @test_util.run_in_graph_and_eager_modes
   def testVarScopeInitializer(self):
     init = init_ops.constant_initializer(0.3)
@@ -102,6 +137,7 @@ class VariableScopeTest(test.TestCase):
         self.assertAllClose(self.evaluate(w.value()), 0.3)
 
   @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScopeConstraint(self):
     constraint = lambda x: 0. * x
     with variable_scope.variable_scope("tower1") as tower:
@@ -112,13 +148,19 @@ class VariableScopeTest(test.TestCase):
         w = variable_scope.get_variable("w", [])
         self.assertEqual(w.constraint, constraint)
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # TypeError: Fetch argument <tf.Variable 'string:0' shape=() dtype=string>
+  # has invalid type <class '...ResourceVariable'>, must be a string or Tensor.
+  # (Can not convert a ResourceVariable into a Tensor or Operation.)
+  @test_util.run_deprecated_v1
   def testStringDefaultInitializer(self):
     with self.cached_session():
       v = variable_scope.get_variable("string", shape=[], dtype=dtypes.string)
       variables_lib.global_variables_initializer().run()
-      self.assertAllEqual(compat.as_bytes(v.eval()), b"")
+      self.assertAllEqual(compat.as_bytes(self.evaluate(v)), b"")
 
   @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScopeDType(self):
     with variable_scope.variable_scope("tower2") as tower:
       with variable_scope.variable_scope("foo", dtype=dtypes.float16):
@@ -195,9 +237,12 @@ class VariableScopeTest(test.TestCase):
         _ = d2(x)
         self.assertEqual(len(d2.variables), 2)
         v3, v4 = d2.variables
-        self.assertAllEqual([v1, v2], [v3, v4])
+        self.assertEqual(v1, v3)
+        self.assertEqual(v2, v4)
       f()
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # obtaining different results in the eager case compared to the graph one
   @test_util.run_in_graph_and_eager_modes
   def testEagerVariablesStoreAddsToCollections(self):
     store = variable_scope.EagerVariableStore()
@@ -215,15 +260,16 @@ class VariableScopeTest(test.TestCase):
       self.assertEqual(
           ops.get_collection(ops.GraphKeys.CONCATENATED_VARIABLES), [concat])
 
-  @test_util.run_in_graph_and_eager_modes
   def testEagerVariablesOutsideStoreNotAddedToCollections(self):
-    if not context.executing_eagerly():
-      return
-    variable_scope.get_variable("v1", [], trainable=True)
-    variable_scope.get_variable("v2", [], trainable=False)
-    self.assertFalse(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
-    self.assertFalse(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-
+    with context.eager_mode():
+      variable_scope.get_variable("v1", [], trainable=True)
+      variable_scope.get_variable("v2", [], trainable=False)
+      self.assertFalse(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+      self.assertFalse(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
+
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # TypeError: Expected tf.group() expected Tensor arguments not 'None' with
+  # type '<type 'NoneType'>'.
   @test_util.run_in_graph_and_eager_modes
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
@@ -240,6 +286,9 @@ class VariableScopeTest(test.TestCase):
     with self.assertRaises(error):
       variable_scope.get_variable("x4", initializer={})
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # InvalidArgumentError=: You must feed a value for placeholder tensor
+  # 'ReadVariableOp/resource' with dtype resource
   @test_util.run_in_graph_and_eager_modes
   def testInitFromNonInitializer(self):
     # Test various dtypes with zeros initializer as following:
@@ -261,7 +310,9 @@ class VariableScopeTest(test.TestCase):
       self.evaluate(variables_lib.global_variables_initializer())
       self.assertAllEqual(self.evaluate(x.value()), self.evaluate(y.value()))
 
-  # TODO(alive): support variable partitioning/caching in eager mode.
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # InvalidArgumentError: /job:moo/replica:0/task:0/device:CPU:0 unknown device.
+  @test_util.run_deprecated_v1
   def testVarScopeCachingDevice(self):
     with self.cached_session():
       caching_device = "/job:moo"
@@ -295,6 +346,8 @@ class VariableScopeTest(test.TestCase):
         v_tower = variable_scope.get_variable("v", [])
         self.assertFalse(v_tower.value().device.startswith(caching_device))
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # AttributeError: Tensor.name is meaningless when eager execution is enabled.
   @test_util.run_in_graph_and_eager_modes
   def testVarScopeRegularizer(self):
     init = init_ops.constant_initializer(0.3)
@@ -340,6 +393,9 @@ class VariableScopeTest(test.TestCase):
           losses = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
           self.assertEqual(3, len(losses))  # No new loss added.
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # ValueError: Tensor-typed variable initializers must either be wrapped in an
+  # init_scope or callable...
   @test_util.run_in_graph_and_eager_modes
   def testInitializeFromValue(self):
     init = constant_op.constant(0.1)
@@ -366,6 +422,12 @@ class VariableScopeTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "don't match"):
       variable_scope.get_variable("s", initializer=init, dtype=dtypes.float64)
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # TypeError: Fetch argument <tf.Variable 'v0:0' shape=(1,) dtype=float32> has
+  # invalid type <class '...ops.resource_variable_ops.ResourceVariable'>, must
+  # be a string or Tensor. (Can not convert a ResourceVariable into a Tensor or
+  # Operation.)
+  @test_util.run_deprecated_v1
   def testControlDeps(self):
     with self.cached_session() as sess:
       v0 = variable_scope.get_variable(
@@ -376,20 +438,23 @@ class VariableScopeTest(test.TestCase):
         add = v1 + v0
       # v0 should be uninitialized.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(v0)
+        self.evaluate(v0)
       # We should be able to initialize and run v1 without initializing
       # v0, even if the variable was created with a control dep on v0.
-      sess.run(v1.initializer)
-      self.assertEqual(1, sess.run(v1))
+      self.evaluate(v1.initializer)
+      self.assertEqual(1, self.evaluate(v1))
       # v0 should still be uninitialized.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(v0)
+        self.evaluate(v0)
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(add)
+        self.evaluate(add)
       # If we initialize v0 we should be able to run 'add'.
-      sess.run(v0.initializer)
-      sess.run(add)
+      self.evaluate(v0.initializer)
+      self.evaluate(add)
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # AssertionError: True is not false (last assertFalse)
+  @test_util.run_deprecated_v1
   def testEnableResourceVariables(self):
     old = variable_scope._DEFAULT_USE_RESOURCE
     try:
@@ -402,6 +467,9 @@ class VariableScopeTest(test.TestCase):
     finally:
       variable_scope._DEFAULT_USE_RESOURCE = old
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # TypeError: Fetch argument None has invalid type <type 'NoneType'>
+  @test_util.run_deprecated_v1
   def testControlFlow(self):
     with self.cached_session() as sess:
       v0 = variable_scope.get_variable(
@@ -427,20 +495,23 @@ class VariableScopeTest(test.TestCase):
       v2 = var_dict["v2"]
       # We should be able to initialize and run v1 and v2 without initializing
       # v0, even if the variable was created with a control dep on v0.
-      sess.run(v1.initializer)
-      self.assertEqual([1], sess.run(v1))
-      sess.run(v2.initializer)
-      self.assertEqual([2], sess.run(v2))
+      self.evaluate(v1.initializer)
+      self.assertEqual([1], self.evaluate(v1))
+      self.evaluate(v2.initializer)
+      self.assertEqual([2], self.evaluate(v2))
       # v0 should still be uninitialized.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(v0)
+        self.evaluate(v0)
       # We should not be able to run 'add' yet.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(add)
+        self.evaluate(add)
       # If we initialize v0 we should be able to run 'add'.
-      sess.run(v0.initializer)
-      sess.run(add)
+      self.evaluate(v0.initializer)
+      self.evaluate(add)
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # TypeError: Expected tf.group() expected Tensor arguments not 'None' with
+  # type '<type 'NoneType'>'.
   @test_util.run_in_graph_and_eager_modes
   def testGetVariableScope(self):
     # Test the get_variable_scope() function and setting properties of result.
@@ -463,6 +534,7 @@ class VariableScopeTest(test.TestCase):
     self.assertEqual(new_init, None)
 
   @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScope(self):
     with variable_scope.variable_scope("tower4") as tower:
       self.assertEqual(tower.name, "tower4")
@@ -482,6 +554,7 @@ class VariableScopeTest(test.TestCase):
           self.assertEqual(sc, "tower6/tower4/scope/")
 
   @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScopeNameScope(self):
     with ops.name_scope("testVarScopeNameScope1"):
       with variable_scope.variable_scope("tower") as tower:
@@ -512,6 +585,8 @@ class VariableScopeTest(test.TestCase):
         with ops.name_scope("scope2") as sc2:
           self.assertEqual(sc2, "testVarScopeNameScope3/scope2/")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScopeOriginalNameScope(self):
     with self.cached_session():
       with ops.name_scope("scope1"):
@@ -535,6 +610,8 @@ class VariableScopeTest(test.TestCase):
             with ops.name_scope("bar") as sc3:
               self.assertEqual(sc3, "scope1/tower/bar_1/")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScopeObjectReuse(self):
     with self.cached_session():
       vs = None
@@ -562,6 +639,8 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope(vs, reuse=False) as jump_no_reuse:
         self.assertFalse(jump_no_reuse.reuse)
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScopeGetOrCreateReuse(self):
     with self.cached_session():
 
@@ -575,12 +654,14 @@ class VariableScopeTest(test.TestCase):
             "testVarScopeGetOrCreateReuse_bar",
             reuse=variable_scope.AUTO_REUSE):
           _ = variable_scope.get_variable("var", [])
-        self.assertEqual(value, x.eval())
+        self.assertEqual(value, self.evaluate(x))
 
       test_value(42.)  # Variable is created.
       test_value(13.)  # Variable is reused hereafter.
       test_value(17.)
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarOpScope(self):
     with self.cached_session():
       with ops.name_scope("testVarOpScope1"):
@@ -607,6 +688,8 @@ class VariableScopeTest(test.TestCase):
           with ops.name_scope("testVarOpScope2") as sc2:
             self.assertEqual(sc2, "testVarOpScope2/default_1/testVarOpScope2/")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarOpScopeUniqueNamesInterleavedSubstringScopes(self):
     with self.cached_session():
       with variable_scope.variable_scope(None, "defaultScope1"):
@@ -630,6 +713,8 @@ class VariableScopeTest(test.TestCase):
               variable_scope.get_variable("w", []).name,
               "defaultScope1_2/layer/w:0")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarOpScopeUniqueNamesWithJump(self):
     with self.cached_session():
       with variable_scope.variable_scope("default") as default:
@@ -638,14 +723,18 @@ class VariableScopeTest(test.TestCase):
               variable_scope.get_variable("w", []).name, "default/layer/w:0")
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name, "default/layer_1/w:0")
+              variable_scope.get_variable("w", []).name,
+              "default/layer_1/w:0")
         with variable_scope.variable_scope(default):
           pass
         # No matter the jump in the middle, unique numbering continues.
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name, "default/layer_2/w:0")
+              variable_scope.get_variable("w", []).name,
+              "default/layer_2/w:0")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarOpScopeReuse(self):
     with self.cached_session():
       with variable_scope.variable_scope("outer") as outer:
@@ -672,6 +761,8 @@ class VariableScopeTest(test.TestCase):
           with ops.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "outer_1/default/scope2/")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScopeGetVar(self):
     with self.cached_session():
       with variable_scope.variable_scope("root"):
@@ -718,12 +809,15 @@ class VariableScopeTest(test.TestCase):
             variable_scope.get_variable("v", [1], dtype=dtypes.int32)
         self.assertEqual("dtype" in str(exc.exception), True)
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScopeOuterScope(self):
     with self.cached_session():
       with variable_scope.variable_scope("outer") as outer:
         pass
       with variable_scope.variable_scope(outer):
-        self.assertEqual(variable_scope.get_variable("w", []).name, "outer/w:0")
+        self.assertEqual(
+            variable_scope.get_variable("w", []).name, "outer/w:0")
         with ops.name_scope("scope2") as sc2:
           self.assertEqual(sc2, "outer_1/scope2/")
         with variable_scope.variable_scope("default"):
@@ -733,7 +827,8 @@ class VariableScopeTest(test.TestCase):
             self.assertEqual(sc2, "outer_1/default/scope2/")
 
       with variable_scope.variable_scope(outer, reuse=True):
-        self.assertEqual(variable_scope.get_variable("w", []).name, "outer/w:0")
+        self.assertEqual(
+            variable_scope.get_variable("w", []).name, "outer/w:0")
         with ops.name_scope("scope2") as sc2:
           self.assertEqual(sc2, "outer_2/scope2/")
         with variable_scope.variable_scope("default", reuse=True):
@@ -742,6 +837,8 @@ class VariableScopeTest(test.TestCase):
           with ops.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "outer_2/default/scope2/")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarScopeNestedOuterScope(self):
     with self.cached_session():
       with variable_scope.variable_scope("outer") as outer:
@@ -767,6 +864,8 @@ class VariableScopeTest(test.TestCase):
           with ops.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "outer/default_1/scope2/")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarOpScopeReuseParam(self):
     with self.cached_session():
       with variable_scope.variable_scope("outer") as outer:
@@ -794,6 +893,8 @@ class VariableScopeTest(test.TestCase):
           with ops.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "outer_1/default/scope2/")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarOpScopeReuseError(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -801,12 +902,15 @@ class VariableScopeTest(test.TestCase):
           self.assertEqual(
               variable_scope.get_variable("w", []).name, "outer/tower/w:0")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarOpScopeOuterScope(self):
     with self.cached_session():
       with variable_scope.variable_scope("outer") as outer:
         pass
       with variable_scope.variable_scope(outer, "default", []):
-        self.assertEqual(variable_scope.get_variable("w", []).name, "outer/w:0")
+        self.assertEqual(
+            variable_scope.get_variable("w", []).name, "outer/w:0")
         with ops.name_scope("scope2") as sc2:
           self.assertEqual(sc2, "outer_1/scope2/")
         with variable_scope.variable_scope(None, "default", []):
@@ -816,7 +920,8 @@ class VariableScopeTest(test.TestCase):
             self.assertEqual(sc2, "outer_1/default/scope2/")
 
       with variable_scope.variable_scope(outer, "default", reuse=True):
-        self.assertEqual(variable_scope.get_variable("w", []).name, "outer/w:0")
+        self.assertEqual(
+            variable_scope.get_variable("w", []).name, "outer/w:0")
         with ops.name_scope("scope2") as sc2:
           self.assertEqual(sc2, "outer_2/scope2/")
         outer.reuse_variables()
@@ -826,6 +931,8 @@ class VariableScopeTest(test.TestCase):
           with ops.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "outer_2/default/scope2/")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVarOpScopeNestedOuterScope(self):
     with self.cached_session():
       with variable_scope.variable_scope("outer") as outer:
@@ -841,7 +948,8 @@ class VariableScopeTest(test.TestCase):
             self.assertEqual(sc2, "outer/default/scope2/")
 
       with variable_scope.variable_scope(outer, "default", reuse=True):
-        self.assertEqual(variable_scope.get_variable("w", []).name, "outer/w:0")
+        self.assertEqual(
+            variable_scope.get_variable("w", []).name, "outer/w:0")
         with ops.name_scope("scope2") as sc2:
           self.assertEqual(sc2, "outer_1/scope2/")
         with variable_scope.variable_scope(None, "default", []):
@@ -850,12 +958,15 @@ class VariableScopeTest(test.TestCase):
           with ops.name_scope("scope2") as sc2:
             self.assertEqual(sc2, "outer_1/default/scope2/")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testBasicWhenAuxiliaryNameScopeIsFalse(self):
     with self.cached_session():
       with variable_scope.variable_scope(
           "scope", auxiliary_name_scope=False) as scope:
         self.assertEqual(scope.original_name_scope, "")
-        self.assertEqual(variable_scope.get_variable("w", []).name, "scope/w:0")
+        self.assertEqual(
+            variable_scope.get_variable("w", []).name, "scope/w:0")
         self.assertEqual(constant_op.constant([], name="c").name, "c:0")
       with variable_scope.variable_scope(scope, auxiliary_name_scope=False):
         self.assertEqual(scope.original_name_scope, "")
@@ -872,7 +983,8 @@ class VariableScopeTest(test.TestCase):
           self.assertEqual(inner.original_name_scope, "outer/")
           self.assertEqual(
               variable_scope.get_variable("w", []).name, "outer/inner/w:0")
-          self.assertEqual(constant_op.constant([], name="c").name, "outer/c:0")
+          self.assertEqual(
+              constant_op.constant([], name="c").name, "outer/c:0")
         with variable_scope.variable_scope(
             inner, auxiliary_name_scope=False) as inner1:
           self.assertEqual(inner1.original_name_scope, "outer/")
@@ -885,6 +997,8 @@ class VariableScopeTest(test.TestCase):
           self.assertEqual(
               constant_op.constant([], name="c").name, "outer/inner/c:0")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testCreatedByDefaultNameWhenAuxiliaryNameScopeIsFalse(self):
     with self.cached_session():
       with variable_scope.variable_scope(
@@ -895,20 +1009,25 @@ class VariableScopeTest(test.TestCase):
         self.assertEqual(constant_op.constant([], name="c").name, "c:0")
       # Recheck: new name scope is NOT created before
       with ops.name_scope("default"):
-        self.assertEqual(constant_op.constant([], name="c").name, "default/c:0")
+        self.assertEqual(
+            constant_op.constant([], name="c").name, "default/c:0")
 
       with variable_scope.variable_scope("outer"):
         with variable_scope.variable_scope(
-            None, default_name="default", auxiliary_name_scope=False) as inner:
+            None, default_name="default",
+            auxiliary_name_scope=False) as inner:
           self.assertEqual(inner.original_name_scope, "outer/")
           self.assertEqual(
               variable_scope.get_variable("w", []).name, "outer/default/w:0")
-          self.assertEqual(constant_op.constant([], name="c").name, "outer/c:0")
+          self.assertEqual(
+              constant_op.constant([], name="c").name, "outer/c:0")
         # Recheck: new name scope is NOT created before
         with ops.name_scope("default"):
           self.assertEqual(
               constant_op.constant([], name="c").name, "outer/default/c:0")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testReenterRootScopeWhenAuxiliaryNameScopeIsFalse(self):
     with self.cached_session():
       root_scope = variable_scope.get_variable_scope()
@@ -926,6 +1045,8 @@ class VariableScopeTest(test.TestCase):
           self.assertEqual(
               constant_op.constant([], name="c1").name, "outer/c1:0")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testAuxiliaryNameScopeIsInvalid(self):
     with self.cached_session():
       with self.assertRaisesRegexp(TypeError, "auxiliary_name_scope"):
@@ -945,6 +1066,8 @@ class VariableScopeTest(test.TestCase):
             scope, auxiliary_name_scope="invalid"):
           pass
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testReuseScopeWithoutNameScopeCollision(self):
     # Github issue: #13429
     with self.cached_session():
@@ -960,20 +1083,25 @@ class VariableScopeTest(test.TestCase):
           self.assertEqual(
               constant_op.constant([], name="c").name, "outer/inner/c:0")
         with ops.name_scope("inner"):
-          self.assertEqual(constant_op.constant([], name="c").name, "inner/c:0")
+          self.assertEqual(
+              constant_op.constant([], name="c").name, "inner/c:0")
 
       with variable_scope.variable_scope("another"):
         with variable_scope.variable_scope(
             inner, auxiliary_name_scope=False) as scope1:
           with ops.name_scope(scope1.original_name_scope):
             self.assertEqual(
-                variable_scope.get_variable("w1", []).name, "outer/inner/w1:0")
+                variable_scope.get_variable("w1", []).name,
+                "outer/inner/w1:0")
             self.assertEqual(
                 constant_op.constant([], name="c1").name, "outer/inner/c1:0")
           with ops.name_scope("inner"):
             self.assertEqual(
                 constant_op.constant([], name="c").name, "another/inner/c:0")
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # obtaining different results in the eager case compared to the graph one
+  # (different assertions failing after wrapping, in both execution modes)
   @test_util.run_in_graph_and_eager_modes
   def testGetLocalVar(self):
     # Check that local variable respects naming.
@@ -996,6 +1124,8 @@ class VariableScopeTest(test.TestCase):
         self.assertEqual(
             variable_scope.get_local_variable("w", []).name, "outer/w:0")
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testSignatureGetVarVsGetLocalVar(self):
     """get_{local,}variable() must take the same list of args."""
     arg_names = tf_inspect.getargspec(variable_scope.get_variable)[0]
@@ -1003,6 +1133,8 @@ class VariableScopeTest(test.TestCase):
         variable_scope.get_local_variable)[0]
     self.assertEqual(arg_names, local_arg_names)
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testGetVarWithDevice(self):
     g = ops.Graph()
     varname_type = []
@@ -1020,6 +1152,9 @@ class VariableScopeTest(test.TestCase):
     self.assertEqual(varname_type[0], ("x", dtypes.float32))
     self.assertEqual(varname_type[1], ("y", dtypes.int64))
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetCollection(self):
     with self.cached_session():
       _ = variable_scope.get_variable("testGetCollection_a", [])
@@ -1074,6 +1209,9 @@ class VariableScopeTest(test.TestCase):
           "testGetCollection_foo/testGetCollection_a:0"
       ])
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetTrainableVariablesWithGetVariable(self):
     with self.cached_session():
       _ = variable_scope.get_variable("testGetTrainableVariables_a", [])
@@ -1110,6 +1248,9 @@ class VariableScopeTest(test.TestCase):
             synchronization=variable_scope.VariableSynchronization.ON_READ,
             trainable=True)
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetTrainableVariablesWithVariable(self):
     with self.cached_session():
       _ = variable_scope.variable(1.0, name="testGetTrainableVariables_a")
@@ -1149,6 +1290,9 @@ class VariableScopeTest(test.TestCase):
             synchronization=variable_scope.VariableSynchronization.ON_READ,
             trainable=True)
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetGlobalVariables(self):
     with self.cached_session():
       _ = variable_scope.get_variable("testGetGlobalVariables_a", [])
@@ -1159,6 +1303,9 @@ class VariableScopeTest(test.TestCase):
             ["testGetGlobalVariables_foo/"
              "testGetGlobalVariables_b:0"])
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetLocalVariables(self):
     with self.cached_session():
       _ = variable_scope.get_variable(
@@ -1169,11 +1316,37 @@ class VariableScopeTest(test.TestCase):
         _ = variable_scope.get_variable("c", [])
         self.assertEqual([v.name for v in scope.local_variables()], ["foo/b:0"])
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testGetVariableWithRefDtype(self):
     v = variable_scope.get_variable("v", shape=[3, 4], dtype=dtypes.float32)
     # Ensure it is possible to do get_variable with a _ref dtype passed in.
     _ = variable_scope.get_variable("w", shape=[5, 6], dtype=v.dtype)
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
+  def testGetVariableWithInitializerWhichTakesNoArgs(self):
+    v = variable_scope.get_variable("foo", initializer=lambda: [2])
+    self.assertEqual(v.name, "foo:0")
+
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
+  def testGetVariableWithInitializerWhichTakesOptionalArgs(self):
+    v = variable_scope.get_variable("foo", initializer=lambda x=True: [2])
+    self.assertEqual(v.name, "foo:0")
+
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
+  def testGetVariableWithInitializerWhichTakesUnprovidedArgsAndNoShape(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        "The initializer passed is not valid. It should be a callable with no "
+        "arguments and the shape should not be provided or an instance of "
+        "`tf.keras.initializers.*' and `shape` should be fully defined."):
+      variable_scope.get_variable("foo", initializer=lambda x: [2])
+
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testTwoGraphs(self):
 
     def f():
@@ -1206,6 +1379,9 @@ def axis0_into3_partitioner(shape=None, **unused_kwargs):
 
 class VariableScopeWithPartitioningTest(test.TestCase):
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testResultNameMatchesRequested(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
@@ -1218,6 +1394,8 @@ class VariableScopeWithPartitioningTest(test.TestCase):
       self.assertIn("scope0/name0/part_1:0", [x.name for x in variables])
       self.assertNotIn("scope0/name0/part_2:0", [x.name for x in variables])
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testBreaksIfPartitioningChanges(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
@@ -1227,18 +1405,20 @@ class VariableScopeWithPartitioningTest(test.TestCase):
         "scope0", partitioner=axis0_into3_partitioner, reuse=True):
       with self.assertRaisesRegexp(
           ValueError,
-          "Trying to reuse partitioned variable .* but specified partitions .* "
-          "and found partitions .*"):
+          "Trying to reuse partitioned variable .* but specified partitions "
+          ".* and found partitions .*"):
         variable_scope.get_variable("name0", shape=(3, 1, 1))
 
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into1_partitioner, reuse=True):
       with self.assertRaisesRegexp(
           ValueError,
-          "Trying to reuse partitioned variable .* but specified partitions .* "
-          "and found partitions .*"):
+          "Trying to reuse partitioned variable .* but specified partitions "
+          ".* and found partitions .*"):
         variable_scope.get_variable("name0", shape=(3, 1, 1))
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testReturnsExistingConcatenatedValueIfReuse(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
@@ -1247,6 +1427,8 @@ class VariableScopeWithPartitioningTest(test.TestCase):
       v_concat_2 = variable_scope.get_variable("name0", shape=(3, 1, 1))
       self.assertEqual(v_concat, v_concat_2)
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testAllowsReuseWithoutPartitioner(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
@@ -1255,6 +1437,16 @@ class VariableScopeWithPartitioningTest(test.TestCase):
       v_reused = variable_scope.get_variable("name0")
     self.assertEqual(v, v_reused)
 
+  def testNoReuseInEagerByDefault(self):
+    with context.eager_mode():
+      with variable_scope.variable_scope(
+          "scope0", partitioner=axis0_into2_partitioner):
+        v1 = variable_scope.get_variable("name0", shape=(3, 1, 1))
+        v2 = variable_scope.get_variable("name0", shape=(3, 1, 1))
+        self.assertIsNot(v1, v2)
+
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testPropagatePartitionerOnReopening(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner) as vs:
@@ -1262,6 +1454,9 @@ class VariableScopeWithPartitioningTest(test.TestCase):
       with variable_scope.variable_scope(vs) as vs1:
         self.assertEqual(axis0_into2_partitioner, vs1.partitioner)
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testScalarIgnoresPartitioner(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
@@ -1271,7 +1466,6 @@ class VariableScopeWithPartitioningTest(test.TestCase):
       self.assertIn("scope0/name0:0", [x.name for x in variables])
 
   def _testPartitionConcatenatesAlongCorrectAxis(self, use_resource):
-
     def _part_axis_0(**unused_kwargs):
       return (2, 1, 1)
 
@@ -1297,22 +1491,36 @@ class VariableScopeWithPartitioningTest(test.TestCase):
     self.assertEqual(n1_0.get_shape(), (2, 1, 2))
     self.assertEqual(n1_1.get_shape(), (2, 1, 2))
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testPartitionConcatenatesAlongCorrectAxis(self):
     self._testPartitionConcatenatesAlongCorrectAxis(use_resource=False)
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testPartitionConcatenatesAlongCorrectAxisResource(self):
     self._testPartitionConcatenatesAlongCorrectAxis(use_resource=True)
 
+  def testPartitionConcatenatesAlongCorrectAxisResourceInEager(self):
+    with context.eager_mode():
+      self._testPartitionConcatenatesAlongCorrectAxis(use_resource=True)
+
 
 class VariableScopeWithCustomGetterTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testNonCallableGetterFails(self):
-    with self.assertRaisesRegexp(ValueError, r"custom_getter .* not callable:"):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"custom_getter .* not callable:"):
       with variable_scope.variable_scope("scope0", custom_getter=3):
         variable_scope.get_variable("name0")
-    with self.assertRaisesRegexp(ValueError, r"custom_getter .* not callable:"):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"custom_getter .* not callable:"):
       variable_scope.get_variable("name0", custom_getter=3)
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testNoSideEffectsWithIdentityCustomGetter(self):
     called = [0]
 
@@ -1335,6 +1543,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
     self.assertEqual(v3, v4)
     self.assertEqual(3, called[0])  # skipped one in the first new_scope
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testSynchronizationAndAggregationWithCustomGetter(self):
     called = [0]
     synchronization = variable_scope.VariableSynchronization.AUTO
@@ -1360,6 +1570,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
 
     self.assertEqual(2, called[0])
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testCustomGetterWithReuse(self):
     # Custom getter can choose to behave differently on reused variables.
     def custom_getter(getter, *args, **kwargs):
@@ -1379,6 +1591,12 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
     self.assertEqual(v.name, "not_reused:0")
     self.assertEqual(v2.name, "reused:0")
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # ValueError: Fetch argument <tf.Tensor 'custom_getter/add:0' shape=(1, 2, 3)
+  # dtype=float32> cannot be interpreted as a Tensor. (Tensor
+  # Tensor("custom_getter/add:0", shape=(1, 2, 3), dtype=float32) is not an
+  # element of this graph.)
+  @test_util.run_deprecated_v1
   def testGetterThatCreatesTwoVariablesAndSumsThem(self):
 
     def custom_getter(getter, name, *args, **kwargs):
@@ -1398,9 +1616,15 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
     self.assertEqual("custom_getter/add:0", v.name)
     with self.cached_session() as sess:
       variables_lib.global_variables_initializer().run()
-      np_vars, np_v = sess.run([true_vars, v])
+      np_vars, np_v = self.evaluate([true_vars, v])
       self.assertAllClose(np_v, sum(np_vars))
 
+  # TODO(mihaimaruseac): Not converted to use wrap_function because of
+  # ValueError: Fetch argument <tf.Tensor 'sum_getter_2/add:0' shape=(1, 2, 3)
+  # dtype=float32> cannot be interpreted as a Tensor. (Tensor
+  # Tensor("sum_getter_2/add:0", shape=(1, 2, 3), dtype=float32) is not an
+  # element of this graph.)
+  @test_util.run_deprecated_v1
   def testNestedCustomGetters(self):
 
     def sum_getter(getter, name, *args, **kwargs):
@@ -1438,14 +1662,15 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
 
     with self.cached_session() as sess:
       variables_lib.global_variables_initializer().run()
-      np_vars, np_v = sess.run([true_vars, v])
+      np_vars, np_v = self.evaluate([true_vars, v])
       # take products of sums of products
       self.assertAllClose(
           np_v, (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3])) + (
               (np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testVariableCreator(self):
-
     variable_names = []
 
     def creator_a(next_creator, **kwargs):
@@ -1460,7 +1685,7 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       with variable_scope.variable_creator_scope(creator_b):
         variable_scope.variable(1.0, name="one_name")
 
-    self.assertAllEqual(variable_names, ["forced_name"])
+    self.assertEqual(variable_names[0], "forced_name")
 
     called = [False]
 
@@ -1482,6 +1707,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
 
 class PartitionInfoTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testConstructorChecks(self):
     # Invalid arg types.
     with self.assertRaises(TypeError):
@@ -1500,6 +1727,8 @@ class PartitionInfoTest(test.TestCase):
     with self.assertRaises(ValueError):
       variable_scope._PartitionInfo(full_shape=[1, 1], var_offset=[0, 1])
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testSingleOffset(self):
     partition_info = variable_scope._PartitionInfo(
         full_shape=[9, 3], var_offset=[4, 0])
@@ -1510,6 +1739,8 @@ class PartitionInfoTest(test.TestCase):
         full_shape=[9, 3], var_offset=[0, 0])
     self.assertEqual(0, partition_info.single_offset([9, 3]))
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testSingleSliceDim(self):
     partition_info = variable_scope._PartitionInfo(
         full_shape=[9, 3], var_offset=[4, 0])
@@ -1539,6 +1770,8 @@ class PartitionInfoTest(test.TestCase):
 
 class VariableScopeMultithreadedTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testTwoThreadsDisjointScopeEntry(self):
 
     def thread_fn(i, graph):
@@ -1567,6 +1800,8 @@ class VariableScopeMultithreadedTest(test.TestCase):
     threads[1].start()
     threads[1].join()
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testTwoThreadsNestedScopeEntry(self):
 
     def thread_fn(i, graph, run_event, pause_event):
@@ -1604,6 +1839,8 @@ class VariableScopeMultithreadedTest(test.TestCase):
     threads[0].join()
     threads[1].join()
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
   def testReenterMainScope(self):
 
     def thread_fn(graph, main_thread_scope):
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index c2b86089f46d7d9cf2d8aae10db6d9a171411f61..336e9b0bca2339554339b655e2226ea35558bb00 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import operator
 
 import numpy as np
@@ -27,6 +28,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_state_ops
@@ -41,6 +43,7 @@ from tensorflow.python.util import compat
 
 class VariablesTestCase(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testInitialization(self):
     with self.cached_session():
       var0 = variables.VariableV1(0.0)
@@ -58,16 +61,17 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([], var1.shape)
 
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        var0.eval()
+        self.evaluate(var0)
 
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        var1.eval()
+        self.evaluate(var1)
 
       variables.global_variables_initializer().run()
 
-      self.assertAllClose(0.0, var0.eval())
-      self.assertAllClose(1.1, var1.eval())
+      self.assertAllClose(0.0, self.evaluate(var0))
+      self.assertAllClose(1.1, self.evaluate(var1))
 
+  @test_util.run_v1_only("b/120545219")
   def testInitializationOrder(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([3, 6]), name="rnd")
@@ -94,8 +98,9 @@ class VariablesTestCase(test.TestCase):
 
       variables.global_variables_initializer().run()
 
-      self.assertAllClose(rnd.eval(), dep.eval())
-      self.assertAllClose(rnd.eval() + dep.eval() + 2.0, depdep.eval())
+      self.assertAllClose(rnd.eval(), self.evaluate(dep))
+      self.assertAllClose(rnd.eval() + self.evaluate(dep) + 2.0,
+                          self.evaluate(depdep))
 
   def testIterable(self):
     with self.assertRaisesRegexp(TypeError, "not iterable"):
@@ -105,6 +110,7 @@ class VariablesTestCase(test.TestCase):
       for _ in variables.Variable([0.0, 1.0]):
         pass
 
+  @test_util.run_deprecated_v1
   def testAssignments(self):
     with self.cached_session():
       var = variables.Variable(0.0)
@@ -112,34 +118,35 @@ class VariablesTestCase(test.TestCase):
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
       variables.global_variables_initializer().run()
-      self.assertAllClose(0.0, var.eval())
+      self.assertAllClose(0.0, self.evaluate(var))
 
-      self.assertAllClose(1.0, plus_one.eval())
-      self.assertAllClose(1.0, var.eval())
+      self.assertAllClose(1.0, self.evaluate(plus_one))
+      self.assertAllClose(1.0, self.evaluate(var))
 
-      self.assertAllClose(-1.0, minus_one.eval())
-      self.assertAllClose(-1.0, var.eval())
+      self.assertAllClose(-1.0, self.evaluate(minus_one))
+      self.assertAllClose(-1.0, self.evaluate(var))
 
-      self.assertAllClose(4.0, four.eval())
-      self.assertAllClose(4.0, var.eval())
+      self.assertAllClose(4.0, self.evaluate(four))
+      self.assertAllClose(4.0, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testResourceAssignments(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       var = resource_variable_ops.ResourceVariable(0.0)
       plus_one = var.assign_add(1.0)
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
       variables.global_variables_initializer().run()
-      self.assertAllClose(0.0, var.eval())
+      self.assertAllClose(0.0, self.evaluate(var))
 
-      plus_one.eval()
-      self.assertAllClose(1.0, var.eval())
+      self.evaluate(plus_one)
+      self.assertAllClose(1.0, self.evaluate(var))
 
-      minus_one.eval()
-      self.assertAllClose(-1.0, var.eval())
+      self.evaluate(minus_one)
+      self.assertAllClose(-1.0, self.evaluate(var))
 
-      four.eval()
-      self.assertAllClose(4.0, var.eval())
+      self.evaluate(four)
+      self.assertAllClose(4.0, self.evaluate(var))
 
   def testZeroSizeStringAssign(self):
     with self.cached_session() as sess:
@@ -148,10 +155,10 @@ class VariablesTestCase(test.TestCase):
           name="foo",
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES])
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       old_value = array.value()
       copy_op = array.assign(old_value)
-      self.assertEqual([], list(sess.run(copy_op)))
+      self.assertEqual([], list(self.evaluate(copy_op)))
 
   def _countUpToTest(self, dtype):
     with self.cached_session():
@@ -160,31 +167,34 @@ class VariablesTestCase(test.TestCase):
       count_up_to = var.count_up_to(3)
 
       variables.global_variables_initializer().run()
-      self.assertEqual(0, var.eval())
+      self.assertEqual(0, self.evaluate(var))
 
-      self.assertEqual(0, count_up_to.eval())
-      self.assertEqual(1, var.eval())
+      self.assertEqual(0, self.evaluate(count_up_to))
+      self.assertEqual(1, self.evaluate(var))
 
-      self.assertEqual(1, count_up_to.eval())
-      self.assertEqual(2, var.eval())
+      self.assertEqual(1, self.evaluate(count_up_to))
+      self.assertEqual(2, self.evaluate(var))
 
-      self.assertEqual(2, count_up_to.eval())
-      self.assertEqual(3, var.eval())
+      self.assertEqual(2, self.evaluate(count_up_to))
+      self.assertEqual(3, self.evaluate(var))
 
       with self.assertRaisesOpError("Reached limit of 3"):
-        count_up_to.eval()
-      self.assertEqual(3, var.eval())
+        self.evaluate(count_up_to)
+      self.assertEqual(3, self.evaluate(var))
 
       with self.assertRaisesOpError("Reached limit of 3"):
-        count_up_to.eval()
-      self.assertEqual(3, var.eval())
+        self.evaluate(count_up_to)
+      self.assertEqual(3, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testCountUpToInt32(self):
     self._countUpToTest(dtypes.int32)
 
+  @test_util.run_deprecated_v1
   def testCountUpToInt64(self):
     self._countUpToTest(dtypes.int64)
 
+  @test_util.run_v1_only("b/120545219")
   def testControlDepsNone(self):
     with self.cached_session():
       c = constant_op.constant(1.0)
@@ -198,6 +208,7 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([], var_x.value().op.control_inputs)
       self.assertEqual([], var_x._ref().op.control_inputs)  # pylint: disable=protected-access
 
+  @test_util.run_v1_only("b/120545219")
   def testControlFlow(self):
     with self.cached_session() as sess:
       v0 = variables.Variable(0, name="v0")
@@ -220,20 +231,21 @@ class VariablesTestCase(test.TestCase):
       v2 = var_dict["v2"]
       # We should be able to initialize and run v1 and v2 without initializing
       # v0, even if the variable was created with a control dep on v0.
-      sess.run(v1.initializer)
-      self.assertEqual([1], sess.run(v1))
-      sess.run(v2.initializer)
-      self.assertEqual([2], sess.run(v2))
+      self.evaluate(v1.initializer)
+      self.assertEqual([1], self.evaluate(v1))
+      self.evaluate(v2.initializer)
+      self.assertEqual([2], self.evaluate(v2))
       # v0 should still be uninitialized.
       with self.assertRaisesRegexp(errors_impl.OpError, "uninitialized"):
-        sess.run(v0)
+        self.evaluate(v0)
       # We should not be able to run 'add' yet.
       with self.assertRaisesRegexp(errors_impl.OpError, "uninitialized"):
-        sess.run(add)
+        self.evaluate(add)
       # If we initialize v0 we should be able to run 'add'.
-      sess.run(v0.initializer)
-      sess.run(add)
+      self.evaluate(v0.initializer)
+      self.evaluate(add)
 
+  @test_util.run_v1_only("b/120545219")
   def testControlFlowInitialization(self):
     """Expects an error if an initializer is in a control-flow scope."""
     def cond(i, _):
@@ -247,15 +259,17 @@ class VariablesTestCase(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "inside a control-flow"):
       control_flow_ops.while_loop(cond, body, [0, 0])
 
+  @test_util.run_deprecated_v1
   def testUseVariableAsTensor(self):
     with self.cached_session():
       var_x = variables.Variable(2.0)
       var_y = variables.Variable(3.0)
       variables.global_variables_initializer().run()
-      self.assertAllClose(2.0, var_x.eval())
-      self.assertAllClose(3.0, var_y.eval())
+      self.assertAllClose(2.0, self.evaluate(var_x))
+      self.assertAllClose(3.0, self.evaluate(var_y))
       self.assertAllClose(5.0, math_ops.add(var_x, var_y).eval())
 
+  @test_util.run_deprecated_v1
   def testZeroSizeVarSameAsConst(self):
     with self.cached_session():
       zero_size_var = variables.Variable(array_ops.zeros([0, 2]))
@@ -264,10 +278,11 @@ class VariablesTestCase(test.TestCase):
       const_mul = math_ops.matmul(
           zero_size_const, zero_size_const, transpose_b=True)
       variables.global_variables_initializer().run()
-      variable_output = variable_mul.eval()
+      variable_output = self.evaluate(variable_mul)
       self.assertAllClose(const_mul.eval(), variable_output)
       self.assertAllClose([[0., 0.], [0., 0.]], variable_output)
 
+  @test_util.run_deprecated_v1
   def testCachingDevice(self):
     with self.cached_session():
       var = variables.Variable(2.0)
@@ -278,6 +293,7 @@ class VariablesTestCase(test.TestCase):
       self.assertFalse(var_cached.device.startswith("/job:foo"))
       self.assertTrue(var_cached.value().device.startswith("/job:foo"))
 
+  @test_util.run_deprecated_v1
   def testCollections(self):
     with self.cached_session():
       var_x = variables.VariableV1(2.0)
@@ -293,6 +309,7 @@ class VariablesTestCase(test.TestCase):
                        variables.global_variables())
       self.assertEqual([var_x, var_z, var_t], variables.trainable_variables())
 
+  @test_util.run_deprecated_v1
   def testCollectionsWithScope(self):
     with self.cached_session():
       with ops.name_scope("scope_1"):
@@ -308,6 +325,13 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([var_x], variables.trainable_variables("scope_1"))
       self.assertEqual([var_y], variables.trainable_variables("scope_2"))
 
+  def testOperatorWrapping(self):
+    for attr in functools.WRAPPER_ASSIGNMENTS:
+      self.assertEqual(
+          getattr(variables.Variable.__add__, attr),
+          getattr(ops.Tensor.__add__, attr))
+
+  @test_util.run_deprecated_v1
   def testOperators(self):
     with self.cached_session():
       var_f = variables.Variable([2.0])
@@ -349,54 +373,46 @@ class VariablesTestCase(test.TestCase):
       rmatmul = var_m.__rmatmul__([[10.0], [20.0]])
 
       variables.global_variables_initializer().run()
-      self.assertAllClose([2.0], add.eval())
-      self.assertAllClose([3.0], radd.eval())
-      self.assertAllClose([1.0], sub.eval())
-      self.assertAllClose([-1.0], rsub.eval())
-      self.assertAllClose([20.0], mul.eval())
-      self.assertAllClose([20.0], rmul.eval())
-      self.assertAllClose([0.2], div.eval())
-      self.assertAllClose([5.0], rdiv.eval())
-      self.assertAllClose([-2.0], neg.eval())
-      self.assertAllClose([2.0], abs_v.eval())
-      self.assertAllClose([True], lt.eval())
-      self.assertAllClose([False], rlt.eval())
-      self.assertAllClose([True], le.eval())
-      self.assertAllClose([True], rle.eval())
-      self.assertAllClose([False], gt.eval())
-      self.assertAllClose([True], rgt.eval())
-      self.assertAllClose([True], ge.eval())
-      self.assertAllClose([True], rge.eval())
-
-      self.assertAllClose([6], mod.eval())
-      self.assertAllClose([3], rmod.eval())
-
-      self.assertAllClose([True, False], and_v.eval())
-      self.assertAllClose([True, True], or_v.eval())
-      self.assertAllClose([True, False], xor_v.eval())
-      self.assertAllClose([False, True], invert_v.eval())
-
-      self.assertAllClose(rnd[2, 0:0], slice_v.eval())
-
-      self.assertAllClose([[80.0]], matmul.eval())
-      self.assertAllClose([[20.0, 30.0], [40.0, 60.0]], rmatmul.eval())
-
+      self.assertAllClose([2.0], self.evaluate(add))
+      self.assertAllClose([3.0], self.evaluate(radd))
+      self.assertAllClose([1.0], self.evaluate(sub))
+      self.assertAllClose([-1.0], self.evaluate(rsub))
+      self.assertAllClose([20.0], self.evaluate(mul))
+      self.assertAllClose([20.0], self.evaluate(rmul))
+      self.assertAllClose([0.2], self.evaluate(div))
+      self.assertAllClose([5.0], self.evaluate(rdiv))
+      self.assertAllClose([-2.0], self.evaluate(neg))
+      self.assertAllClose([2.0], self.evaluate(abs_v))
+      self.assertAllClose([True], self.evaluate(lt))
+      self.assertAllClose([False], self.evaluate(rlt))
+      self.assertAllClose([True], self.evaluate(le))
+      self.assertAllClose([True], self.evaluate(rle))
+      self.assertAllClose([False], self.evaluate(gt))
+      self.assertAllClose([True], self.evaluate(rgt))
+      self.assertAllClose([True], self.evaluate(ge))
+      self.assertAllClose([True], self.evaluate(rge))
+
+      self.assertAllClose([6], self.evaluate(mod))
+      self.assertAllClose([3], self.evaluate(rmod))
+
+      self.assertAllClose([True, False], self.evaluate(and_v))
+      self.assertAllClose([True, True], self.evaluate(or_v))
+      self.assertAllClose([True, False], self.evaluate(xor_v))
+      self.assertAllClose([False, True], self.evaluate(invert_v))
+
+      self.assertAllClose(rnd[2, 0:0], self.evaluate(slice_v))
+
+      self.assertAllClose([[80.0]], self.evaluate(matmul))
+      self.assertAllClose([[20.0, 30.0], [40.0, 60.0]], self.evaluate(rmatmul))
+
+  @test_util.run_deprecated_v1
   def testSession(self):
     with self.cached_session() as sess:
       var = variables.Variable([1, 12])
       variables.global_variables_initializer().run()
-      self.assertAllClose([1, 12], sess.run(var))
-
-  def testDevicePlacement(self):
-    with self.cached_session() as sess:
-      with ops.device("/cpu:0"):
-        var = variables.Variable([1, 12])
-      init_value = var.initialized_value()
-      init_op = variables.global_variables_initializer()
-      self.assertEqual(var.op.device, init_value.device)
-      self.assertEqual(var.op.device, init_op.device)
-      sess.run(init_op)
+      self.assertAllClose([1, 12], self.evaluate(var))
 
+  @test_util.run_v1_only("b/120545219")
   def testColocation(self):
     with ops.device("/job:ps"):
       var = variables.VariableV1(0, name="v")
@@ -405,6 +421,7 @@ class VariablesTestCase(test.TestCase):
     self.assertDeviceEqual("/job:ps", assign_op.device)
     self.assertEqual([b"loc:@v"], assign_op.op.colocation_groups())
 
+  @test_util.run_v1_only("b/120545219")
   def testInitializerFunction(self):
     value = [[-42], [133.7]]
     shape = [2, 1]
@@ -416,7 +433,7 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual(shape, v1.shape)
       self.assertAllClose(value, v1.initial_value.eval())
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        v1.eval()
+        self.evaluate(v1)
 
       v2 = variables.Variable(
           math_ops.negative(v1.initialized_value()), dtype=dtypes.float32)
@@ -425,9 +442,9 @@ class VariablesTestCase(test.TestCase):
       self.assertAllClose(np.negative(value), v2.initial_value.eval())
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        v2.eval()
+        self.evaluate(v2)
       variables.global_variables_initializer().run()
-      self.assertAllClose(np.negative(value), v2.eval())
+      self.assertAllClose(np.negative(value), self.evaluate(v2))
 
   def testConstraintArg(self):
     constraint = lambda x: x
@@ -442,6 +459,7 @@ class VariablesTestCase(test.TestCase):
           lambda: constant_op.constant(1.),
           constraint=constraint)
 
+  @test_util.run_v1_only("b/120545219")
   def testNoRefDataRace(self):
     with self.cached_session():
       a = variables.Variable([1, 2, 3], dtype=dtypes.float32)
@@ -452,6 +470,7 @@ class VariablesTestCase(test.TestCase):
       self.assertAllEqual(b.eval(), [3, 4, 5])
       self.assertAllEqual(c.eval(), [5, 6, 7])
 
+  @test_util.run_deprecated_v1
   def testInitializerFunctionDevicePlacement(self):
     with self.cached_session():
       initializer = lambda: constant_op.constant(42.0)
@@ -470,6 +489,7 @@ class VariablesTestCase(test.TestCase):
       for i in v2.initializer.inputs:
         self.assertEqual(expected_group_v2, i.op.colocation_groups())
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = variables.Variable(
@@ -478,11 +498,11 @@ class VariablesTestCase(test.TestCase):
     with ops.Graph().as_default(), self.cached_session() as sess:
       # v describes a VariableDef-based variable without an initial value.
       v = variables.Variable(variable_def=v_def)
-      self.assertEqual(3.0, sess.run(v.initialized_value()))
+      self.assertEqual(3.0, self.evaluate(v.initialized_value()))
 
       # initialized_value should not rerun the initializer_op if the variable
       # has already been initialized elsewhere.
-      sess.run(v.assign(1.0))
+      self.evaluate(v.assign(1.0))
       self.assertEqual(1.0, v.initialized_value().eval())
 
     v_def.ClearField("initial_value_name")
@@ -494,7 +514,7 @@ class VariablesTestCase(test.TestCase):
       self.assertProtoEquals(v_def, v.to_proto())
       # But attempts to use initialized_value will result in errors.
       with self.assertRaises(ValueError):
-        sess.run(v.initialized_value())
+        self.evaluate(v.initialized_value())
 
   def testTrainableInProto(self):
     with ops.Graph().as_default():
@@ -513,14 +533,16 @@ class VariablesTestCase(test.TestCase):
           variables.Variable(variable_def=trainable_variable.to_proto())
           .trainable)
 
+  @test_util.run_deprecated_v1
   def testLoad(self):
     with self.cached_session():
       var = variables.Variable(np.zeros((5, 5), np.float32))
       variables.global_variables_initializer().run()
       var.load(np.ones((5, 5), np.float32))
 
-      self.assertAllClose(np.ones((5, 5), np.float32), var.eval())
+      self.assertAllClose(np.ones((5, 5), np.float32), self.evaluate(var))
 
+  @test_util.run_v1_only("b/120545219")
   def testRepr(self):
     var = variables.VariableV1(np.zeros((5, 5), np.float32), name="noop")
     self.assertEqual(
@@ -542,7 +564,7 @@ class IsInitializedTest(test.TestCase):
   def testNoVars(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       uninited = variables.report_uninitialized_variables()
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testAssertVariablesInitialized(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
@@ -550,27 +572,28 @@ class IsInitializedTest(test.TestCase):
       w = variables.Variable([3, 4], name="w")
       _ = v, w
       uninited = variables.report_uninitialized_variables()
-      self.assertAllEqual(np.array([b"v", b"w"]), sess.run(uninited))
+      self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
       variables.global_variables_initializer().run()
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2], name="v")
       w = variables.VariableV1([3, 4], name="w")
       uninited = variables.report_uninitialized_variables()
-      self.assertAllEqual(np.array([b"v", b"w"]), sess.run(uninited))
-      sess.run(w.initializer)
-      self.assertAllEqual(np.array([b"v"]), sess.run(uninited))
+      self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
+      self.evaluate(w.initializer)
+      self.assertAllEqual(np.array([b"v"]), self.evaluate(uninited))
       v.initializer.run()
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testZeroSizeVarInitialized(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.Variable(array_ops.zeros([0, 2]), name="v")
       uninited = variables.report_uninitialized_variables()
       v.initializer.run()  # not strictly necessary
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testTrainingWithZeroSizeVar(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
@@ -581,8 +604,8 @@ class IsInitializedTest(test.TestCase):
       variables.global_variables_initializer().run()
       do_opt = gradient_descent.GradientDescentOptimizer(0.1).minimize(
           objective)
-      sess.run([do_opt])
-      self.assertAllClose([[0.9, 0.9], [0.9, 0.9]], b.eval())
+      self.evaluate([do_opt])
+      self.assertAllClose([[0.9, 0.9], [0.9, 0.9]], self.evaluate(b))
 
 
 class ObsoleteIsInitializedTest(test.TestCase):
@@ -591,6 +614,7 @@ class ObsoleteIsInitializedTest(test.TestCase):
     with ops.Graph().as_default():
       self.assertEqual(None, variables.assert_variables_initialized())
 
+  @test_util.run_v1_only("b/120545219")
   def testVariables(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2])
@@ -598,10 +622,11 @@ class ObsoleteIsInitializedTest(test.TestCase):
       _ = v, w
       inited = variables.assert_variables_initialized()
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        sess.run(inited)
+        self.evaluate(inited)
       variables.global_variables_initializer().run()
-      sess.run(inited)
+      self.evaluate(inited)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2])
@@ -609,7 +634,7 @@ class ObsoleteIsInitializedTest(test.TestCase):
       inited = variables.assert_variables_initialized([v])
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         inited.op.run()
-      sess.run(w.initializer)
+      self.evaluate(w.initializer)
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         inited.op.run()
       v.initializer.run()
@@ -697,46 +722,81 @@ class PartitionedVariableTest(test.TestCase):
             partitions=partitions)
 
   def testPartitionedVariableAssignments(self):
-    with ops.Graph().as_default(), self.cached_session() as sess:
+    with ops.Graph().as_default(), self.cached_session():
       v0 = variables.Variable(initial_value=[0.0])
       v1 = variables.Variable(initial_value=[1.0])
+      v2 = variables.Variable(initial_value=[20.0])
+      v3 = variables.Variable(initial_value=[30.0])
       v0._set_save_slice_info(
           variables.Variable.SaveSliceInfo(v0.name, [2], [0], [1]))
       v1._set_save_slice_info(
-          variables.Variable.SaveSliceInfo(v0.name, [2], [1], [1]))
+          variables.Variable.SaveSliceInfo(v1.name, [2], [1], [1]))
+      v2._set_save_slice_info(
+          variables.Variable.SaveSliceInfo(v2.name, [2], [0], [1]))
+      v3._set_save_slice_info(
+          variables.Variable.SaveSliceInfo(v3.name, [2], [1], [1]))
+
       partitions = [2]
 
       # Pass variable_list as [v1, v0] to ensure they are properly
       # re-sorted to [v0, v1] based on their slice info offsets.
-      partitioned_variable = variables.PartitionedVariable(
+      pv_0 = variables.PartitionedVariable(
           name="two_vars",
           shape=[2],
           dtype=v0.dtype,
           variable_list=[v0, v1],
           partitions=partitions)
 
+      pv_1 = variables.PartitionedVariable(
+          name="two_vars",
+          shape=[2],
+          dtype=v0.dtype,
+          variable_list=[v2, v3],
+          partitions=partitions)
+
       deltas_a = constant_op.constant([1.0, 2.0])
       deltas_b = constant_op.constant([3.0, 4.0])
       ones = array_ops.ones([2])
-      plus_delta = partitioned_variable.assign_add(deltas_a)
-      minus_delta = partitioned_variable.assign_sub(deltas_b)
-      assign_ones = partitioned_variable.assign(ones)
+      plus_delta = pv_0.assign_add(deltas_a)
+      minus_delta = pv_0.assign_sub(deltas_b)
+      assign_ones = pv_0.assign(ones)
+
+      c_0 = constant_op.constant([2.0])
+      c_1 = constant_op.constant([3.0])
+      assign_list = pv_1.assign([c_0, c_1])
+      assign_part_value = pv_1.assign_add(assign_ones)
+      assign_part_var = pv_1.assign_sub(pv_0)
       variables.global_variables_initializer().run()
 
       self.assertEqual([1.0], plus_delta[0].eval())
-      self.assertEqual([1.0], v0.eval())
+      self.assertEqual([1.0], self.evaluate(v0))
       self.assertEqual([3.0], plus_delta[1].eval())
-      self.assertEqual([3.0], v1.eval())
+      self.assertEqual([3.0], self.evaluate(v1))
 
       self.assertEqual([-2.0], minus_delta[0].eval())
-      self.assertEqual([-2.0], v0.eval())
+      self.assertEqual([-2.0], self.evaluate(v0))
       self.assertEqual([-1.0], minus_delta[1].eval())
-      self.assertEqual([-1.0], v1.eval())
+      self.assertEqual([-1.0], self.evaluate(v1))
 
       self.assertEqual([1.0], assign_ones[0].eval())
-      self.assertEqual([1.0], v0.eval())
+      self.assertEqual([1.0], self.evaluate(v0))
       self.assertEqual([1.0], assign_ones[1].eval())
-      self.assertEqual([1.0], v1.eval())
+      self.assertEqual([1.0], self.evaluate(v1))
+
+      self.assertEqual([2.0], assign_list[0].eval())
+      self.assertEqual([2.0], self.evaluate(v2))
+      self.assertEqual([3.0], assign_list[1].eval())
+      self.assertEqual([3.0], self.evaluate(v3))
+
+      self.assertEqual([3.0], assign_part_value[0].eval())
+      self.assertEqual([3.0], self.evaluate(v2))
+      self.assertEqual([4.0], assign_part_value[1].eval())
+      self.assertEqual([4.0], self.evaluate(v3))
+
+      self.assertEqual([2.0], assign_part_var[0].eval())
+      self.assertEqual([2.0], self.evaluate(v2))
+      self.assertEqual([3.0], assign_part_var[1].eval())
+      self.assertEqual([3.0], self.evaluate(v3))
 
 
 class VariableContainerTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/weights_broadcast_test.py b/tensorflow/python/kernel_tests/weights_broadcast_test.py
index 85f9abc69f78b048c78d4d0ab908371e7a8650d3..677d8f2f22f0e2877553d3698ef02a6902986727 100644
--- a/tensorflow/python/kernel_tests/weights_broadcast_test.py
+++ b/tensorflow/python/kernel_tests/weights_broadcast_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import test
@@ -51,40 +52,48 @@ class AssertBroadcastableTest(test.TestCase):
           values_placeholder: values,
       })
 
+  @test_util.run_deprecated_v1
   def testScalar(self):
     self._test_valid(weights=5, values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1x1x1(self):
     self._test_valid(
         weights=np.asarray((5,)).reshape((1, 1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1x1xN(self):
     self._test_valid(
         weights=np.asarray((5, 7, 11, 3)).reshape((1, 1, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1xNx1(self):
     self._test_valid(
         weights=np.asarray((5, 11)).reshape((1, 2, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1xNxN(self):
     self._test_valid(
         weights=np.asarray((5, 7, 11, 3, 2, 13, 7, 5)).reshape((1, 2, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNx1x1(self):
     self._test_valid(
         weights=np.asarray((5, 7, 11)).reshape((3, 1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNx1xN(self):
     self._test_valid(
         weights=np.asarray((
             5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3)).reshape((3, 1, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNxNxN(self):
     self._test_valid(
         weights=np.asarray((
@@ -107,29 +116,35 @@ class AssertBroadcastableTest(test.TestCase):
             values_placeholder: values,
         })
 
+  @test_util.run_deprecated_v1
   def testInvalid1(self):
     self._test_invalid(weights=np.asarray((5,)), values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalid1x1(self):
     self._test_invalid(
         weights=np.asarray((5,)).reshape((1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidPrefixMatch(self):
     self._test_invalid(
         weights=np.asarray((5, 7, 11, 3, 2, 12)).reshape((3, 2)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidSuffixMatch(self):
     self._test_invalid(
         weights=np.asarray((5, 7, 11, 3, 2, 12, 7, 5)).reshape((2, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidOnesExtraDim(self):
     self._test_invalid(
         weights=np.asarray((5,)).reshape((1, 1, 1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidPrefixMatchExtraDim(self):
     self._test_invalid(
         weights=np.asarray((
@@ -137,6 +152,7 @@ class AssertBroadcastableTest(test.TestCase):
             2, 17, 11, 3, 5, 7, 11, 3, 2, 12, 7, 5)).reshape((3, 2, 4, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidSuffixMatchExtraDim(self):
     self._test_invalid(
         weights=np.asarray((
@@ -158,24 +174,27 @@ class BroadcastWeightsTest(test.TestCase):
     dynamic_op = weights_broadcast_ops.broadcast_weights(
         weights=weights_placeholder, values=values_placeholder)
     with self.cached_session():
-      self.assertAllEqual(expected, static_op.eval())
+      self.assertAllEqual(expected, self.evaluate(static_op))
       self.assertAllEqual(expected, dynamic_op.eval(feed_dict={
           weights_placeholder: weights,
           values_placeholder: values,
       }))
 
+  @test_util.run_deprecated_v1
   def testScalar(self):
     self._test_valid(
         weights=5,
         values=_test_values((3, 2, 4)),
         expected=5 * np.ones((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1x1x1(self):
     self._test_valid(
         weights=np.asarray((5,)).reshape((1, 1, 1)),
         values=_test_values((3, 2, 4)),
         expected=5 * np.ones((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1x1xN(self):
     weights = np.asarray((5, 7, 11, 3)).reshape((1, 1, 4))
     self._test_valid(
@@ -183,6 +202,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(3, 2, 1)))
 
+  @test_util.run_deprecated_v1
   def test1xNx1(self):
     weights = np.asarray((5, 11)).reshape((1, 2, 1))
     self._test_valid(
@@ -190,6 +210,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(3, 1, 4)))
 
+  @test_util.run_deprecated_v1
   def test1xNxN(self):
     weights = np.asarray((5, 7, 11, 3, 2, 13, 7, 5)).reshape((1, 2, 4))
     self._test_valid(
@@ -197,6 +218,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(3, 1, 1)))
 
+  @test_util.run_deprecated_v1
   def testNx1x1(self):
     weights = np.asarray((5, 7, 11)).reshape((3, 1, 1))
     self._test_valid(
@@ -204,6 +226,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(1, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNx1xN(self):
     weights = np.asarray((
         5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3)).reshape((3, 1, 4))
@@ -212,6 +235,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(1, 2, 1)))
 
+  @test_util.run_deprecated_v1
   def testNxNxN(self):
     weights = np.asarray((
         5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3,
@@ -234,29 +258,35 @@ class BroadcastWeightsTest(test.TestCase):
             values_placeholder: values,
         })
 
+  @test_util.run_deprecated_v1
   def testInvalid1(self):
     self._test_invalid(weights=np.asarray((5,)), values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalid1x1(self):
     self._test_invalid(
         weights=np.asarray((5,)).reshape((1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidPrefixMatch(self):
     self._test_invalid(
         weights=np.asarray((5, 7, 11, 3, 2, 12)).reshape((3, 2)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidSuffixMatch(self):
     self._test_invalid(
         weights=np.asarray((5, 7, 11, 3, 2, 12, 7, 5)).reshape((2, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidOnesExtraDim(self):
     self._test_invalid(
         weights=np.asarray((5,)).reshape((1, 1, 1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidPrefixMatchExtraDim(self):
     self._test_invalid(
         weights=np.asarray((
@@ -264,6 +294,7 @@ class BroadcastWeightsTest(test.TestCase):
             2, 17, 11, 3, 5, 7, 11, 3, 2, 12, 7, 5)).reshape((3, 2, 4, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidSuffixMatchExtraDim(self):
     self._test_invalid(
         weights=np.asarray((
diff --git a/tensorflow/python/kernel_tests/where_op_test.py b/tensorflow/python/kernel_tests/where_op_test.py
index 29fb002ef445bcfa483dfc747428c7365860fe37..56c1390411324acf1cf4ff36c30f1c473e1df95c 100644
--- a/tensorflow/python/kernel_tests/where_op_test.py
+++ b/tensorflow/python/kernel_tests/where_op_test.py
@@ -27,32 +27,35 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import test
 
 
 class WhereOpTest(test.TestCase):
 
   def _testWhere(self, x, truth, expected_err_re=None):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       ans = array_ops.where(x)
       self.assertEqual([None, x.ndim], ans.get_shape().as_list())
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         self.assertAllClose(tf_ans, truth, atol=1e-10)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def testWrongNumbers(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
         array_ops.where([False, True], [1, 2], None)
       with self.assertRaises(ValueError):
         array_ops.where([False, True], None, [1, 2])
 
+  @test_util.run_deprecated_v1
   def testBasicVec(self):
     x = np.asarray([True, False])
     truth = np.asarray([[0]], dtype=np.int64)
@@ -66,11 +69,13 @@ class WhereOpTest(test.TestCase):
     truth = np.asarray([[2], [4]], dtype=np.int64)
     self._testWhere(x, truth)
 
+  @test_util.run_deprecated_v1
   def testRandomVec(self):
     x = np.random.rand(1000000) > 0.5
     truth = np.vstack([np.where(x)[0].astype(np.int64)]).T
     self._testWhere(x, truth)
 
+  @test_util.run_deprecated_v1
   def testBasicMat(self):
     x = np.asarray([[True, False], [True, False]])
 
@@ -79,6 +84,7 @@ class WhereOpTest(test.TestCase):
 
     self._testWhere(x, truth)
 
+  @test_util.run_deprecated_v1
   def testBasic3Tensor(self):
     x = np.asarray([[[True, False], [True, False]],
                     [[False, True], [False, True]],
@@ -98,49 +104,61 @@ class WhereOpTest(test.TestCase):
     truth = np.vstack(truth).T  # Convert to [num_true, indices].
     self._testWhere(x, truth, expected_err_re)
 
+  @test_util.run_deprecated_v1
   def testRandomBool(self):
     self._testRandom(np.bool)
 
+  @test_util.run_deprecated_v1
   def testRandomInt32(self):
     self._testRandom(np.int32)
 
+  @test_util.run_deprecated_v1
   def testRandomInt64(self):
     self._testRandom(np.int64)
 
+  @test_util.run_deprecated_v1
   def testRandomFloat(self):
     self._testRandom(np.float32)
 
+  @test_util.run_deprecated_v1
   def testRandomDouble(self):
     self._testRandom(np.float64)
 
+  @test_util.run_deprecated_v1
   def testRandomComplex64(self):
     self._testRandom(np.complex64)
 
+  @test_util.run_deprecated_v1
   def testRandomComplex128(self):
     self._testRandom(np.complex128)
 
+  @test_util.run_deprecated_v1
   def testRandomUint8(self):
     self._testRandom(np.uint8)
 
+  @test_util.run_deprecated_v1
   def testRandomInt8(self):
     self._testRandom(np.int8)
 
+  @test_util.run_deprecated_v1
   def testRandomInt16(self):
     self._testRandom(np.int16)
 
+  @test_util.run_deprecated_v1
   def testThreeArgument(self):
     x = np.array([[-2, 3, -1], [1, -3, -3]])
     np_val = np.where(x > 0, x * x, -x)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       tf_val = array_ops.where(constant_op.constant(x) > 0, x * x, -x).eval()
     self.assertAllEqual(tf_val, np_val)
 
+  @test_util.run_deprecated_v1
   def testBatchSelect(self):
     x = np.array([[-2, 3, -1] * 64, [1, -3, -3] * 64] * 8192)  # [16384, 192]
     c_mat = np.array([[False] * 192, [True] * 192] * 8192)  # [16384, 192]
     c_vec = np.array([False, True] * 8192)  # [16384]
     np_val = np.where(c_mat, x * x, -x)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       tf_val = array_ops.where(c_vec, x * x, -x).eval()
     self.assertAllEqual(tf_val, np_val)
 
@@ -160,7 +178,7 @@ class WhereBenchmark(test.Benchmark):
           x = random_ops.random_uniform((m, n), dtype=dtypes.float32) <= p
           v = resource_variable_ops.ResourceVariable(x)
           op = array_ops.where(v)
-        with session.Session() as sess:
+        with session.Session(config=benchmark.benchmark_config()) as sess:
           v.initializer.run()
           r = self.run_op_benchmark(sess, op, min_iters=100, name=name)
           gb_processed_input = m * n / 1.0e9
@@ -186,7 +204,7 @@ class WhereBenchmark(test.Benchmark):
           y = resource_variable_ops.ResourceVariable(y_gen)
           c = resource_variable_ops.ResourceVariable(c_gen)
           op = array_ops.where(c, x, y)
-        with session.Session() as sess:
+        with session.Session(config=benchmark.benchmark_config()) as sess:
           x.initializer.run()
           y.initializer.run()
           c.initializer.run()
diff --git a/tensorflow/python/kernel_tests/while_v2_test.py b/tensorflow/python/kernel_tests/while_v2_test.py
index 3a070544e8371e552fba5b9729f690c5ea977198..cae459a34e934cc804a56f5738202377a1227274 100644
--- a/tensorflow/python/kernel_tests/while_v2_test.py
+++ b/tensorflow/python/kernel_tests/while_v2_test.py
@@ -20,15 +20,21 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import while_v2
 from tensorflow.python.ops.control_flow_ops import while_loop as while_loop_v1
 from tensorflow.python.ops.while_v2 import while_loop as while_loop_v2
@@ -37,14 +43,30 @@ from tensorflow.python.platform import test
 
 class WhileV2Test(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSingleLoopVar(self):
     x = constant_op.constant(2.)
-    ret = while_loop_v2(lambda v: v < 8., lambda v: v * v, [x])
+    ret = while_loop_v2(
+        lambda v: v < 8., lambda v: v * v, [x], return_same_structure=False)
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 16.)
+      self.assertEqual(self.evaluate(ret), 16.)
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
+
+  @test_util.run_v1_only("b/120545219")
+  def testReturnSameStructureTrue(self):
+    x = constant_op.constant(2.)
+    ret = while_loop_v2(
+        lambda v: v < 8., lambda v: v * v, [x], return_same_structure=True)
+    grad = gradients_impl.gradients(ret, [x])
+    with self.cached_session() as sess:
+      eval_result = sess.run(ret)
+      self.assertIsInstance(eval_result, list)
+      self.assertLen(eval_result, 1)
+      self.assertEqual(16., eval_result[0])
       self.assertSequenceEqual(sess.run(grad), [32.])
 
+  @test_util.run_deprecated_v1
   def testMultipleLoopVarsBasic(self):
     x = constant_op.constant(5.)
     y = constant_op.constant(3.)
@@ -53,15 +75,19 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     # y = 3.
     # while x < 45.:
     #   x = x * y
-    ret = while_loop_v2(lambda v, _: v < 45., lambda v, w: (v * w, w), [x, y])
+    ret = while_loop_v2(
+        lambda v, _: v < 45.,
+        lambda v, w: (v * w, w), [x, y],
+        return_same_structure=False)
     # ret = [x*y^2, y]
 
     # Note: This is simply d_ret[0]/d_x since d_ret[1]/d_x is 0.
     grad = gradients_impl.gradients(ret, [x])  # [2*x*y]
     with self.cached_session() as sess:
-      self.assertSequenceEqual(sess.run(ret), [45., 3.])
-      self.assertSequenceEqual(sess.run(grad), [9.])
+      self.assertSequenceEqual(self.evaluate(ret), [45., 3.])
+      self.assertSequenceEqual(self.evaluate(grad), [9.])
 
+  @test_util.run_deprecated_v1
   def testMultipleLoopVars(self):
     x = constant_op.constant(5.)
     y = constant_op.constant(3.)
@@ -71,8 +97,10 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     # while x < 45.:
     #   x = x * y
     #   y = x + y
-    ret = while_loop_v2(lambda v, _: v < 45., lambda v, w: (v * w, v + w),
-                        [x, y])
+    ret = while_loop_v2(
+        lambda v, _: v < 45.,
+        lambda v, w: (v * w, v + w), [x, y],
+        return_same_structure=False)
     # ret = [y*x**2 + x*y**2, x*y + x + y]
 
     gradx_0 = gradients_impl.gradients(ret[0], [x])  # [2*x*y + y**2]
@@ -82,34 +110,43 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     grady_1 = gradients_impl.gradients(ret[1], [y])  # [x + 1]
     grady_2 = gradients_impl.gradients(ret, [y])  # [2*x*y + x**2 + x + 1]
     with self.cached_session() as sess:
-      self.assertSequenceEqual(sess.run(ret), [120., 23.])
-      self.assertSequenceEqual(sess.run(gradx_0), [39.])
-      self.assertSequenceEqual(sess.run(gradx_1), [4.])
-      self.assertSequenceEqual(sess.run(gradx_2), [43.])
-      self.assertSequenceEqual(sess.run(grady_0), [55.])
-      self.assertSequenceEqual(sess.run(grady_1), [6.])
-      self.assertSequenceEqual(sess.run(grady_2), [61.])
-
+      self.assertSequenceEqual(self.evaluate(ret), [120., 23.])
+      self.assertSequenceEqual(self.evaluate(gradx_0), [39.])
+      self.assertSequenceEqual(self.evaluate(gradx_1), [4.])
+      self.assertSequenceEqual(self.evaluate(gradx_2), [43.])
+      self.assertSequenceEqual(self.evaluate(grady_0), [55.])
+      self.assertSequenceEqual(self.evaluate(grady_1), [6.])
+      self.assertSequenceEqual(self.evaluate(grady_2), [61.])
+
+  @test_util.run_deprecated_v1
   def testMultipleWhileLoops(self):
     x = constant_op.constant(2.)
-    ret1 = while_loop_v2(lambda v: v < 4., lambda v: v * v, [x])  # x**2
-    ret2 = while_loop_v2(lambda v: v < 16., lambda v: v * v, ret1)  # x**4
+    ret1 = while_loop_v2(
+        lambda v: v < 4., lambda v: v * v, [x],
+        return_same_structure=False)  # x**2
+    ret2 = while_loop_v2(
+        lambda v: v < 16., lambda v: v * v, [ret1],
+        return_same_structure=False)  # x**4
     grad = gradients_impl.gradients(ret2, [x])  # 4x**3
     grad_grad = gradients_impl.gradients(grad, [x])  # 12x**2
     with self.cached_session() as sess:
-      self.assertSequenceEqual(sess.run(grad), [32.])
-      self.assertSequenceEqual(sess.run(grad_grad), [48.])
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad_grad), [48.])
 
+  @test_util.run_deprecated_v1
   def testDoubleDerivative(self):
     x = constant_op.constant(2.)
-    ret = while_loop_v2(lambda v: v < 8., lambda v: v**2, [x])  # x**4
+    ret = while_loop_v2(
+        lambda v: v < 8., lambda v: v**2, [x],
+        return_same_structure=False)  # x**4
     grad = gradients_impl.gradients(ret, [x])  # 4x**3
     grad_grad = gradients_impl.gradients(grad, [x])  # 12x**2
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
-      self.assertSequenceEqual(sess.run(grad_grad), [48.])
+      self.assertEqual(self.evaluate(ret), 16.)
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad_grad), [48.])
 
+  @test_util.run_v1_only("b/120545219")
   def testPruning(self):
     x = constant_op.constant(1)
 
@@ -130,10 +167,12 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
 
     def GetOptimizedGraph():
       mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
-      rewriter_config = rewriter_config_pb2.RewriterConfig(
-          constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-          memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
-      return tf_optimizer.OptimizeGraph(rewriter_config, mg)
+      config = config_pb2.ConfigProto()
+      config.graph_options.rewrite_options.CopyFrom(
+          rewriter_config_pb2.RewriterConfig(
+              constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+              memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL))
+      return tf_optimizer.OptimizeGraph(config, mg)
 
     g = GetOptimizedGraph()
     self.assertEqual(len([n for n in g.node if n.op == "Enter"]), 1)
@@ -143,24 +182,31 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     g = GetOptimizedGraph()
     self.assertEqual(len([n for n in g.node if n.op == "Enter"]), 2)
 
+  @test_util.run_deprecated_v1
   def testCaptureExternalTensorInCond(self):
     x = constant_op.constant(2.)
     y = constant_op.constant(1.)
-    ret = while_loop_v2(lambda v: v + y < 9., lambda v: v * 3., [x])
+    ret = while_loop_v2(
+        lambda v: v + y < 9.,
+        lambda v: v * 3., [x],
+        return_same_structure=False)
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 18.)
-      self.assertSequenceEqual(sess.run(grad), [9.])
+      self.assertEqual(self.evaluate(ret), 18.)
+      self.assertSequenceEqual(self.evaluate(grad), [9.])
 
+  @test_util.run_deprecated_v1
   def testCaptureExternalTensorInBody(self):
     x = constant_op.constant(2.)
     y = constant_op.constant(3.)
-    ret = while_loop_v2(lambda v: v < 8., lambda v: v * y, [x])
+    ret = while_loop_v2(
+        lambda v: v < 8., lambda v: v * y, [x], return_same_structure=False)
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 18.)
-      self.assertSequenceEqual(sess.run(grad), [9.])
+      self.assertEqual(self.evaluate(ret), 18.)
+      self.assertSequenceEqual(self.evaluate(grad), [9.])
 
+  @test_util.run_deprecated_v1
   def testLoopWithTensorListPushBack(self):
     x = constant_op.constant(2.)
 
@@ -176,12 +222,14 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       tl = list_ops.tensor_list_push_back(tl, constant_op.constant(100.))
       return x**2., tl
 
-    ret = while_loop_v2(Cond, Body, [x, tensor_list])
+    ret = while_loop_v2(
+        Cond, Body, [x, tensor_list], return_same_structure=False)
     grad = gradients_impl.gradients(ret[0], x)
     with self.cached_session() as sess:
       self.assertEqual(sess.run(ret[0]), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
 
+  @test_util.run_deprecated_v1
   def testDuplicateAccumulator(self):
     x = constant_op.constant(2.)
 
@@ -198,13 +246,14 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       tl = list_ops.tensor_list_push_back(tl, x)
       return x**2., tl
 
-    ret = while_loop_v2(Cond, Body, [x, tensor_list])
+    ret = while_loop_v2(
+        Cond, Body, [x, tensor_list], return_same_structure=False)
 
     for op in ops.get_default_graph().get_operations():
       if op.type == "While":
         while_op = op
 
-    body_graph = while_v2._get_body_graph(while_op)
+    body_graph = while_v2._get_graph(while_op, "body")
     # body_graph.inputs: [counter_arg, x_arg, tl_arg, *accumulators]
     x_input_t = body_graph.inputs[1]
     accumulator_count = len(
@@ -214,14 +263,15 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     grad = gradients_impl.gradients(ret[0], x)
     with self.cached_session() as sess:
       self.assertEqual(sess.run(ret[0]), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
 
   @parameterized.named_parameters(
       ("UnknownShape", None),
       ("PartiallyDefinedShape", [None, 2]),
       ("FullyDefinedShape", [1, 2]),
   )
-  def testTensorListOutputElementShape(self, shape):
+  @test_util.run_deprecated_v1
+  def testAccumulatorElementShape(self, shape):
 
     def MatchShape(actual_tensor_shape):
       # Compare the shapes, treating None dimensions as equal. We do not
@@ -234,19 +284,26 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
         self.assertListEqual(actual_tensor_shape.as_list(), shape)
 
     def GetAccumulatorForInputAtIndex(while_op, idx):
-      body_graph = while_v2._get_body_graph(while_op)
+      body_graph = while_v2._get_graph(while_op, "body")
       y_input_t = body_graph.inputs[idx]
       push_back_node = [c for c in y_input_t.consumers()
                         if c.type == "TensorListPushBack"][0]
       output_idx = body_graph.outputs.index(push_back_node.outputs[0])
       return while_op.outputs[output_idx]
 
-    x = constant_op.constant(2.)
+    x = array_ops.placeholder(dtype=dtypes.float32, shape=shape)
     y = array_ops.placeholder(dtype=dtypes.float32, shape=shape)
 
     # Forward pass.
-    ret = while_loop_v2(lambda v, u: v < 8., lambda v, u: (v * v, u), [x, y])
-    while_op = ret[0].op
+    ret = while_loop_v2(lambda v, u: v < 8.,
+                        lambda v, u: (math_ops.pow(v, u), u),
+                        [x, y],
+                        return_same_structure=True)
+    while_op = ret[0].op.inputs[0].op
+    # Gradient pass.
+    grad = gradients_impl.gradients(ret[0], x)
+    grad_while_op = grad[0].op.inputs[0].op
+
     # Get the TensorList output of While op containing the accumulated values
     # of y.
     # while_op.inputs: [counter_arg, x_arg, y_arg, *accumulators]
@@ -255,18 +312,156 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
                                            element_dtype=dtypes.float32)
     MatchShape(val.shape)
 
-    # Gradient pass.
-    grad = gradients_impl.gradients(ret[1], y)
-    grad_while_op = grad[0].op
+    # Take second derivative to generate intermediate grad_while_op outputs
+    gradients_impl.gradients(grad, x)
+
     # Get the TensorList output of gradient While op containing the accumulated
-    # values of grad_y.
+    # values of grad_x (note that grad_x is needed by the second derivative).
     # grad_while_op.inputs:
     # [counter_arg, total_iters_arg, grad_x_arg, grad_y_arg, *other_args]
-    grad_output = GetAccumulatorForInputAtIndex(grad_while_op, 4)
+    grad_output = GetAccumulatorForInputAtIndex(grad_while_op, 2)
     _, val = list_ops.tensor_list_pop_back(grad_output,
                                            element_dtype=dtypes.float32)
     MatchShape(val.shape)
 
+  def _createWhile(self, name):
+    """Helper function testDefaultName."""
+    output = while_v2.while_loop(
+        lambda i: i < 3,
+        lambda i: i + 1, [constant_op.constant(0)],
+        return_same_structure=False)
+    while_op = output.op.inputs[0].op
+    self.assertEqual(while_op.type, "While")
+    return while_op
+
+  def testDefaultName(self):
+    with ops.Graph().as_default():
+      while_op = self._createWhile(None)
+      self.assertEqual(while_op.name, "while")
+      self.assertRegexpMatches(
+          while_op.get_attr("cond").name, r"while_cond_\d*")
+      self.assertRegexpMatches(
+          while_op.get_attr("body").name, r"while_body_\d*")
+
+    with ops.Graph().as_default():
+      with ops.name_scope("foo"):
+        while1_op = self._createWhile("")
+        self.assertEqual(while1_op.name, "foo/while")
+        self.assertRegexpMatches(
+            while1_op.get_attr("cond").name, r"foo_while_cond_\d*")
+        self.assertRegexpMatches(
+            while1_op.get_attr("body").name, r"foo_while_body_\d*")
+
+        while2_op = self._createWhile(None)
+        self.assertEqual(while2_op.name, "foo/while_1")
+        self.assertRegexpMatches(
+            while2_op.get_attr("cond").name, r"foo_while_1_cond_\d*")
+        self.assertRegexpMatches(
+            while2_op.get_attr("body").name, r"foo_while_1_body_\d*")
+
+  @test_util.enable_control_flow_v2
+  @test_util.run_deprecated_v1
+  def testWhileAndTensorArray(self):
+    param = constant_op.constant(2.0)
+    y0 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
+    # map_fn uses TensorArray internally.
+    r = functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y0)
+    grad = gradients_impl.gradients(r, param)[0]
+    self.assertAllClose([2.0, 4.0, 6.0, 8.0, 10.0, 12.0], self.evaluate(r))
+    self.assertAllClose(21.0, self.evaluate(grad))
+
+  @test_util.run_deprecated_v1
+  def testNestedWhile(self):
+    # Compute sum of geometric progression: n^0 + n^1 + ... + n^m
+    # We compute the pow using a while loop.
+    n = constant_op.constant(3.)
+    m = constant_op.constant(5.)
+    sum_of_powers = constant_op.constant(0.)
+
+    def Body(i, previous_sum):
+      prod = constant_op.constant(1.)
+      return i - 1., previous_sum + while_loop_v2(
+          lambda c, _: c > 0,
+          lambda c, v: (c - 1., v * n), [i, prod],
+          return_same_structure=False)[1]
+
+    result = while_loop_v2(
+        lambda i, _: i >= 0,
+        Body, [m, sum_of_powers],
+        return_same_structure=False)[1]
+    grad = gradients_impl.gradients(result, [n])
+    with self.cached_session() as sess:
+      self.assertEqual(self.evaluate(result), 364.)
+      self.assertSequenceEqual(self.evaluate(grad), [547.])
+
+  @test_util.run_deprecated_v1
+  def testIdentityNodeInBody(self):
+
+    def Body(v):
+      v = array_ops.identity(v)
+      v = array_ops.identity(v)
+      return v * v
+
+    x = constant_op.constant(2.)
+    ret = while_loop_v2(
+        lambda v: v < 8., Body, [x], return_same_structure=False)
+    grad = gradients_impl.gradients(ret, [x])
+    with self.cached_session() as sess:
+      self.assertEqual(self.evaluate(ret), 16.)
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
+
+  @test_util.run_deprecated_v1
+  def testNestedWhileAndTensorArray(self):
+    n = constant_op.constant(3.0)
+
+    def Body(row, ta, n):
+
+      def InnerBody(row, col, ta, n):
+        # Note: row and col are 1-based.
+        ta = ta.write(
+            math_ops.cast(n * (row - 1.) + col - 1., dtypes.int32), row * col)
+        return row, col + 1., ta, n
+
+      # TODO(b/118457764): Remove n from loop_vars from both loops once fixed.
+      ta = while_loop_v2(
+          lambda _, col, _1, n: col <= n,
+          InnerBody, [row, constant_op.constant(1.), ta, n],
+          return_same_structure=False)[2]
+      return row + 1., ta, n
+
+    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=9)
+    ta = while_loop_v2(
+        lambda row, _, _1: row <= n,
+        Body, [constant_op.constant(1.), ta, n],
+        return_same_structure=False)[1]
+
+    output = array_ops.reshape(ta.stack(), [3, 3])
+    self.assertAllEqual(
+        self.evaluate(output), [[1., 2., 3.], [2., 4., 6.], [3., 6., 9.]])
+    # TODO(b/117675481): This does not work with current TA. Enable with new TA.
+    # grad = gradients_impl.gradients(output, [n])
+    # self.assertEqual(self.evaluate(grad), 3.5)
+
+  @test_util.run_deprecated_v1
+  def testForwardPassRewrite(self):
+    x = constant_op.constant(1.0, name="x")
+    output = while_v2.while_loop(lambda x: x < 10.0,
+                                 lambda x: x * 2.0,
+                                 [x])[0]
+    while_op = output.op.inputs[0].op
+    self.assertEqual(while_op.type, "While")
+    # outputs = [loop_counter, x]
+    self.assertLen(while_op.outputs, 2)
+
+    gradients_impl.gradients(output, x)
+    # while_op should have been rewritten to output 2.0 intermediate.
+    # outputs = [loop_counter, x, 2.0_accumulator, x_accumulator]
+    self.assertLen(while_op.outputs, 4)
+
+    gradients_impl.gradients(output, x)
+    # Computing the gradient again shouldn't rewrite while_op again.
+    self.assertLen(while_op.outputs, 4)
+
 
 def ScalarShape():
   return ops.convert_to_tensor([], dtype=dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index 729885169ef2b3114371fd3de65cc15f68b3a676..f5d03c2370186e39cad2ba9aa29d03c454de9168 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -53,19 +54,19 @@ class XentTest(test.TestCase):
 
   def _testXent(self, np_features, np_labels, use_gpu=False):
     np_loss, np_backprop = self._npXent(np_features, np_labels)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
           np_features, np_labels)
-      tf_loss, tf_backprop = sess.run([loss, backprop])
+      tf_loss, tf_backprop = self.evaluate([loss, backprop])
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
     self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
 
   def _testXentWrapper(self, np_features, np_labels, dim=-1, use_gpu=False):
     np_loss, _ = self._npXent(np_features, np_labels, dim=dim)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       loss = nn_ops.softmax_cross_entropy_with_logits(
           labels=np_labels, logits=np_features, dim=dim)
-      tf_loss = sess.run(loss)
+      tf_loss = self.evaluate(loss)
     print("np_loss:", np_loss)
     print("tf_loss:", tf_loss)
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
@@ -76,11 +77,11 @@ class XentTest(test.TestCase):
 
   def _testSingleClass(self, use_gpu=False):
     for dtype in np.float16, np.float32:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
             np.array([[1.], [-1.], [0.]]).astype(dtype),
             np.array([[-1.], [0.], [1.]]).astype(dtype))
-        tf_loss, tf_backprop = sess.run([loss, backprop])
+        tf_loss, tf_backprop = self.evaluate([loss, backprop])
       self.assertAllClose([0.0, 0.0, 0.0], tf_loss)
       self.assertAllClose([[2.0], [1.0], [0.0]], tf_backprop)
 
@@ -88,6 +89,7 @@ class XentTest(test.TestCase):
     self._testSingleClass(True)
     self._testSingleClass(False)
 
+  @test_util.run_deprecated_v1
   def testRankTooLarge(self):
     for dtype in np.float16, np.float32:
       np_features = np.array([[[1., 1., 1., 1.]], [[1., 2., 3.,
@@ -145,19 +147,21 @@ class XentTest(test.TestCase):
     tf_l = constant_op.constant(
         np.array([[0., 0., 0., 1.], [0., .5, .5, 0.]]).astype(np.float32))
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
             tf_f, tf_l)
-        tf_loss, tf_backprop = sess.run([loss, backprop])
+        tf_loss, tf_backprop = self.evaluate([loss, backprop])
       self.assertAllCloseAccordingToType(np_loss, tf_loss)
       self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
         gen_nn_ops.softmax_cross_entropy_with_logits(
             [[0., 1.], [2., 3.]], [[0., 1., 0.], [1., 0., 0.]])
 
+  @test_util.run_deprecated_v1
   def testNotMatrix(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -179,6 +183,7 @@ class XentTest(test.TestCase):
         np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float64),
         np.array([[0., 0., 0., 1.], [0., .5, .5, 0.]]).astype(np.float64))
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session() as sess:
       l = constant_op.constant(
@@ -206,6 +211,7 @@ class XentTest(test.TestCase):
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
 
+  @test_util.run_deprecated_v1
   def testGradientLabelWithV2(self):
     with self.cached_session():
       l = constant_op.constant(
@@ -224,6 +230,7 @@ class XentTest(test.TestCase):
 
     self.assertLess(err, 5e-8)
 
+  @test_util.run_deprecated_v1
   def testSecondGradient(self):
     with self.cached_session() as sess:
       l = constant_op.constant(
@@ -277,10 +284,10 @@ class XentTest(test.TestCase):
     features = np.zeros([0, 2, 4]).astype(np.float32)
     labels = np.zeros([0, 2, 4]).astype(np.float32)
     np_loss, _ = self._npXent(features, labels)
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       loss = nn_ops.softmax_cross_entropy_with_logits(
           labels=labels, logits=features)
-      tf_loss = sess.run(loss)
+      tf_loss = self.evaluate(loss)
     self.assertAllEqual(np_loss, tf_loss)
 
 
diff --git a/tensorflow/python/kernel_tests/zero_division_test.py b/tensorflow/python/kernel_tests/zero_division_test.py
index dd0214e0f1337c75557e180d88c284ca5b2eb617..3dd9ec4ba9459b95f690a2146c7f94ad75043d6d 100644
--- a/tensorflow/python/kernel_tests/zero_division_test.py
+++ b/tensorflow/python/kernel_tests/zero_division_test.py
@@ -21,13 +21,15 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class ZeroDivisionTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testZeros(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       for dtype in dtypes.uint8, dtypes.int16, dtypes.int32, dtypes.int64:
         zero = constant_op.constant(0, dtype=dtype)
         one = constant_op.constant(1, dtype=dtype)
@@ -36,7 +38,7 @@ class ZeroDivisionTest(test.TestCase):
           bads.append(one % zero)
         for bad in bads:
           try:
-            result = bad.eval()
+            result = self.evaluate(bad)
           except errors_impl.OpError as e:
             # Ideally, we'd get a nice exception.  In theory, this should only
             # happen on CPU, but 32 bit integer GPU division is actually on
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index e399ece2326f482921fd0e9e45021d7fdde33650..bfe591f875556c9dbcf3001bec4fe836bca3593f 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -23,17 +23,134 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-
+# Avoid breaking users who directly import this symbol from this file.
+# TODO(fchollet): remove this.
 InputSpec = base_layer.InputSpec  # pylint: disable=invalid-name
 
+_KERAS_STYLE_SCOPE = False
+
+
+@tf_export(v1=['layers.experimental.keras_style_scope'])
+@tf_contextlib.contextmanager
+def keras_style_scope():
+  """Use Keras-style variable management.
+
+  All tf.layers and tf RNN cells created in this scope use Keras-style
+  variable management.  Creating such layers with a scope= argument is
+  disallowed, and reuse=True is disallowed.
+
+  The purpose of this scope is to allow users of existing layers to
+  slowly transition to a Keras layers API without breaking existing
+  functionality.
+
+  One example of this is when using TensorFlow's RNN classes with Keras
+  Models or Networks.  Because Keras models do not properly set variable
+  scopes, users of RNNs may either accidentally share scopes between two
+  different models, or get errors about variables that already exist.
+
+  Example:
+
+  ```python
+  class RNNModel(tf.keras.Model):
+
+    def __init__(self, name):
+      super(RNNModel, self.).__init__(name=name)
+      self.rnn = tf.nn.rnn_cell.MultiRNNCell(
+        [tf.nn.rnn_cell.LSTMCell(64) for _ in range(2)])
+
+    def call(self, input, state):
+      return self.rnn(input, state)
+
+  model_1 = RNNModel("model_1")
+  model_2 = RNNModel("model_2")
+
+  # OK
+  output_1, next_state_1 = model_1(input, state)
+  # Raises an error about trying to create an already existing variable.
+  output_2, next_state_2 = model_2(input, state)
+  ```
+
+  The solution is to wrap the model construction and execution in a keras-style
+  scope:
+
+  ```python
+  with keras_style_scope():
+    model_1 = RNNModel("model_1")
+    model_2 = RNNModel("model_2")
+
+    # model_1 and model_2 are guaranteed to create their own variables.
+    output_1, next_state_1 = model_1(input, state)
+    output_2, next_state_2 = model_2(input, state)
+
+    assert len(model_1.weights) > 0
+    assert len(model_2.weights) > 0
+    assert(model_1.weights != model_2.weights)
+  ```
+
+  Yields:
+    A keras layer style scope.
+  """
+  global _KERAS_STYLE_SCOPE
+  stack = _KERAS_STYLE_SCOPE
+  _KERAS_STYLE_SCOPE = True
+  try:
+    yield
+  finally:
+    _KERAS_STYLE_SCOPE = stack
+
+
+@tf_export(v1=['layers.experimental.set_keras_style'])
+def set_keras_style():
+  """Use Keras-style variable management.
+
+  All tf.layers and tf RNN cells created after keras style ha been enabled
+  use Keras-style variable management.  Creating such layers with a
+  scope= argument is disallowed, and reuse=True is disallowed.
 
-@tf_export('layers.Layer')
+  The purpose of this function is to allow users of existing layers to
+  slowly transition to Keras layers API without breaking existing
+  functionality.
+
+  For more details, see the documentation for `keras_style_scope`.
+
+  Note, once keras style has been set, it is set globally for the entire
+  program and cannot be unset.
+
+  Example:
+
+  ```python
+  set_keras_style()
+
+  model_1 = RNNModel(name="model_1")
+  model_2 = RNNModel(name="model_2")
+
+  # model_1 and model_2 are guaranteed to create their own variables.
+  output_1, next_state_1 = model_1(input, state)
+  output_2, next_state_2 = model_2(input, state)
+
+  assert len(model_1.weights) > 0
+  assert len(model_2.weights) > 0
+  assert(model_1.weights != model_2.weights)
+  ```
+  """
+  global _KERAS_STYLE_SCOPE
+  _KERAS_STYLE_SCOPE = True
+
+
+def _is_in_keras_style_scope():
+  global _KERAS_STYLE_SCOPE
+  return _KERAS_STYLE_SCOPE
+
+
+@tf_export(v1=['layers.Layer'])
 class Layer(base_layer.Layer):
   """Base layer class.
 
@@ -83,6 +200,19 @@ class Layer(base_layer.Layer):
     super(Layer, self).__init__(trainable=trainable, name=name, dtype=dtype,
                                 **kwargs)
 
+    if _is_in_keras_style_scope():
+      if scope is not None:
+        raise ValueError(
+            'scope argument not allowed when keras style layers are enabled, '
+            'but saw: {}'.format(scope))
+      if self._reuse is not None:
+        raise ValueError(
+            'reuse argument not allowed when keras style layers are enabled, '
+            'but saw: {}'.format(self._reuse))
+      self._keras_style = True
+    else:
+      self._keras_style = False
+
     self._graph = None
     self._call_has_scope_arg = 'scope' in self._call_fn_args
     if scope:
@@ -102,6 +232,7 @@ class Layer(base_layer.Layer):
     # Determine layer name (non-unique).
     if isinstance(name, vs.VariableScope):
       base_name = name.name
+      self._name, _ = self._make_unique_name()
     else:
       base_name = name
       self._name = name
@@ -112,11 +243,11 @@ class Layer(base_layer.Layer):
   def _make_unique_name(self, name_uid_map=None, avoid_names=None,
                         namespace='', zero_based=False):
     base_name = base_layer.to_snake_case(self.__class__.__name__)
-    name = base_layer.unique_layer_name(base_name,
-                                        name_uid_map=name_uid_map,
-                                        avoid_names=avoid_names,
-                                        namespace=namespace,
-                                        zero_based=zero_based)
+    name = base_layer_utils.unique_layer_name(base_name,
+                                              name_uid_map=name_uid_map,
+                                              avoid_names=avoid_names,
+                                              namespace=namespace,
+                                              zero_based=zero_based)
     return (name, base_name)
 
   @property
@@ -148,6 +279,8 @@ class Layer(base_layer.Layer):
 
   def _name_scope(self):
     """Determines op naming for the Layer."""
+    if self._keras_style:
+      return super(Layer, self)._name_scope()
     return self._current_scope.original_name_scope
 
   def _set_scope(self, scope=None):
@@ -220,6 +353,20 @@ class Layer(base_layer.Layer):
       ValueError: When trainable has been set to True with synchronization
         set as `ON_READ`.
     """
+    if self._keras_style:
+      return super(Layer, self).add_weight(
+          name=name,
+          shape=shape,
+          dtype=dtype,
+          initializer=initializer,
+          regularizer=regularizer,
+          trainable=trainable,
+          constraint=constraint,
+          use_resource=use_resource,
+          synchronization=vs.VariableSynchronization.AUTO,
+          aggregation=vs.VariableAggregation.NONE,
+          partitioner=partitioner)
+
     if synchronization == vs.VariableSynchronization.ON_READ:
       if trainable:
         raise ValueError(
@@ -332,7 +479,16 @@ class Layer(base_layer.Layer):
     Raises:
       ValueError: if the layer's `call` method returns None (an invalid value).
     """
-    self._set_scope(kwargs.pop('scope', None))
+    scope = kwargs.pop('scope', None)
+
+    if self._keras_style:
+      if scope is not None:
+        raise ValueError(
+            'scope argument not allowed when keras style layers are enabled, '
+            'but saw: {}'.format(scope))
+      return super(Layer, self).__call__(inputs, *args, **kwargs)
+
+    self._set_scope(scope)
 
     if not context.executing_eagerly():
       try:
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index d2443db6651cdab2aaf5fb2b9d678080b48bb254..d0ec4f4425f2ea92ba5699cf4ae2d81a86ea27dd 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -25,6 +25,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import base_layer as keras_base_layer
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base as base_layers
 from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
@@ -59,6 +61,29 @@ class BaseLayerTest(test.TestCase):
     layer.add_variable('my_var', [2, 2])
     self.assertEqual(layer.name, 'my_layer')
 
+  @test_util.run_in_graph_and_eager_modes
+  def testKerasStyleAddWeight(self):
+    keras_layer = keras_base_layer.Layer(name='keras_layer')
+    with ops.name_scope('foo'):
+      keras_variable = keras_layer.add_variable(
+          'my_var', [2, 2], initializer=init_ops.zeros_initializer())
+    self.assertEqual(keras_variable.name, 'foo/my_var:0')
+
+    with ops.name_scope('baz'):
+      old_style_layer = base_layers.Layer(name='my_layer')
+      # Test basic variable creation.
+      variable = old_style_layer.add_variable(
+          'my_var', [2, 2], initializer=init_ops.zeros_initializer())
+    self.assertEqual(variable.name, 'my_layer/my_var:0')
+
+    with base_layers.keras_style_scope():
+      layer = base_layers.Layer(name='my_layer')
+    # Test basic variable creation.
+    with ops.name_scope('bar'):
+      variable = layer.add_variable(
+          'my_var', [2, 2], initializer=init_ops.zeros_initializer())
+    self.assertEqual(variable.name, 'bar/my_var:0')
+
   @test_util.run_in_graph_and_eager_modes
   def testAddWeight(self):
     layer = base_layers.Layer(name='my_layer')
@@ -118,6 +143,7 @@ class BaseLayerTest(test.TestCase):
           synchronization=variable_scope.VariableSynchronization.ON_READ,
           trainable=True)
 
+  @test_util.run_deprecated_v1
   def testReusePartitionedVaraiblesAndRegularizers(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     partitioner = partitioned_variables.fixed_size_partitioner(3)
@@ -133,11 +159,6 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 3)
 
-  def testNoEagerActivityRegularizer(self):
-    with context.eager_mode():
-      with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
-        core_layers.Dense(1, activity_regularizer=lambda *args, **kwargs: 0.)
-
   @test_util.run_in_graph_and_eager_modes
   def testCall(self):
 
@@ -232,7 +253,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(ndim=2)
+        self.input_spec = input_spec.InputSpec(ndim=2)
 
       def call(self, inputs):
         return inputs
@@ -259,7 +280,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(min_ndim=2)
+        self.input_spec = input_spec.InputSpec(min_ndim=2)
 
       def call(self, inputs):
         return inputs
@@ -287,7 +308,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(max_ndim=2)
+        self.input_spec = input_spec.InputSpec(max_ndim=2)
 
       def call(self, inputs):
         return inputs
@@ -315,7 +336,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(dtype='float32')
+        self.input_spec = input_spec.InputSpec(dtype='float32')
 
       def call(self, inputs):
         return inputs
@@ -335,7 +356,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(axes={-1: 2})
+        self.input_spec = input_spec.InputSpec(axes={-1: 2})
 
       def call(self, inputs):
         return inputs
@@ -357,7 +378,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(shape=(None, 3))
+        self.input_spec = input_spec.InputSpec(shape=(None, 3))
 
       def call(self, inputs):
         return inputs
@@ -425,6 +446,7 @@ class BaseLayerTest(test.TestCase):
       self.assertTrue(isinstance(result, dict))
       self.assertEqual(set(['label', 'logits']), set(result.keys()))
 
+  @test_util.run_deprecated_v1
   def testActivityRegularizer(self):
     regularizer = math_ops.reduce_sum
     layer = base_layers.Layer(activity_regularizer=regularizer)
@@ -513,6 +535,7 @@ class BaseLayerTest(test.TestCase):
         self.assertEqual(len(layer.trainable_variables), 1)
         self.assertEqual(layer.variables[0].graph, outer_graph)
 
+  @test_util.run_deprecated_v1
   def testGetUpdateFor(self):
 
     class MyLayer(base_layers.Layer):
@@ -557,6 +580,7 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(len(layer.get_updates_for([intermediate_inputs])), 1)
     self.assertEqual(len(layer.get_updates_for([outputs])), 0)
 
+  @test_util.run_deprecated_v1
   def testGetLossesFor(self):
 
     class MyLayer(base_layers.Layer):
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index d40743b0cea29553430a0fc247684f7b182a94ee..5d4805e245e17376e8719466868326b34d7cf12d 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -22,10 +22,11 @@ from __future__ import print_function
 from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.ops import init_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('layers.Conv1D')
+@tf_export(v1=['layers.Conv1D'])
 class Conv1D(keras_layers.Conv1D, base.Layer):
   """1D convolution layer (e.g. temporal convolution).
 
@@ -114,7 +115,10 @@ class Conv1D(keras_layers.Conv1D, base.Layer):
         name=name, **kwargs)
 
 
-@tf_export('layers.conv1d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv1d instead.')
+@tf_export(v1=['layers.conv1d'])
 def conv1d(inputs,
            filters,
            kernel_size,
@@ -214,7 +218,7 @@ def conv1d(inputs,
   return layer.apply(inputs)
 
 
-@tf_export('layers.Conv2D')
+@tf_export(v1=['layers.Conv2D'])
 class Conv2D(keras_layers.Conv2D, base.Layer):
   """2D convolution layer (e.g. spatial convolution over images).
 
@@ -310,7 +314,10 @@ class Conv2D(keras_layers.Conv2D, base.Layer):
         name=name, **kwargs)
 
 
-@tf_export('layers.conv2d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv2d instead.')
+@tf_export(v1=['layers.conv2d'])
 def conv2d(inputs,
            filters,
            kernel_size,
@@ -417,7 +424,7 @@ def conv2d(inputs,
   return layer.apply(inputs)
 
 
-@tf_export('layers.Conv3D')
+@tf_export(v1=['layers.Conv3D'])
 class Conv3D(keras_layers.Conv3D, base.Layer):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
@@ -514,7 +521,10 @@ class Conv3D(keras_layers.Conv3D, base.Layer):
         name=name, **kwargs)
 
 
-@tf_export('layers.conv3d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv3d instead.')
+@tf_export(v1=['layers.conv3d'])
 def conv3d(inputs,
            filters,
            kernel_size,
@@ -622,7 +632,7 @@ def conv3d(inputs,
   return layer.apply(inputs)
 
 
-@tf_export('layers.SeparableConv1D')
+@tf_export(v1=['layers.SeparableConv1D'])
 class SeparableConv1D(keras_layers.SeparableConv1D, base.Layer):
   """Depthwise separable 1D convolution.
 
@@ -729,7 +739,7 @@ class SeparableConv1D(keras_layers.SeparableConv1D, base.Layer):
         **kwargs)
 
 
-@tf_export('layers.SeparableConv2D')
+@tf_export(v1=['layers.SeparableConv2D'])
 class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
   """Depthwise separable 2D convolution.
 
@@ -841,7 +851,10 @@ class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
         **kwargs)
 
 
-@tf_export('layers.separable_conv1d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.separable_conv1d instead.')
+@tf_export(v1=['layers.separable_conv1d'])
 def separable_conv1d(inputs,
                      filters,
                      kernel_size,
@@ -958,7 +971,10 @@ def separable_conv1d(inputs,
   return layer.apply(inputs)
 
 
-@tf_export('layers.separable_conv2d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.separable_conv2d instead.')
+@tf_export(v1=['layers.separable_conv2d'])
 def separable_conv2d(inputs,
                      filters,
                      kernel_size,
@@ -1080,7 +1096,7 @@ def separable_conv2d(inputs,
   return layer.apply(inputs)
 
 
-@tf_export('layers.Conv2DTranspose')
+@tf_export(v1=['layers.Conv2DTranspose'])
 class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
   """Transposed 2D convolution layer (sometimes called 2D Deconvolution).
 
@@ -1165,7 +1181,10 @@ class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
         **kwargs)
 
 
-@tf_export('layers.conv2d_transpose')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv2d_transpose instead.')
+@tf_export(v1=['layers.conv2d_transpose'])
 def conv2d_transpose(inputs,
                      filters,
                      kernel_size,
@@ -1260,7 +1279,7 @@ def conv2d_transpose(inputs,
   return layer.apply(inputs)
 
 
-@tf_export('layers.Conv3DTranspose')
+@tf_export(v1=['layers.Conv3DTranspose'])
 class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
   """Transposed 3D convolution layer (sometimes called 3D Deconvolution).
 
@@ -1342,7 +1361,10 @@ class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
         **kwargs)
 
 
-@tf_export('layers.conv3d_transpose')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.conv3d_transpose instead.')
+@tf_export(v1=['layers.conv3d_transpose'])
 def conv3d_transpose(inputs,
                      filters,
                      kernel_size,
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index 257fa27156749713bd35f22f82b7cc6c81c23a70..a3e493edfeadfe6f68446616df8b81177e013921 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -59,6 +60,7 @@ class ConvTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.conv2d(images, 32, None)
 
+  @test_util.run_deprecated_v1
   def testCreateConv2D(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -87,6 +89,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testCreateConv2DChannelsFirst(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, 4, height, width))
@@ -97,6 +100,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testUnknownInputChannels(self):
     images = array_ops.placeholder(dtypes.float32, (5, 7, 9, None))
     layer = conv_layers.Conv2D(32, [3, 3], activation=nn_ops.relu)
@@ -140,6 +144,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, height / 2, width, 32])
 
+  @test_util.run_deprecated_v1
   def testCreateConv1D(self):
     width = 7
     data = random_ops.random_uniform((5, width, 4))
@@ -156,6 +161,7 @@ class ConvTest(test.TestCase):
     output = conv_layers.conv1d(data, 32, 3, activation=nn_ops.relu)
     self.assertListEqual(output.get_shape().as_list(), [5, width - 2, 32])
 
+  @test_util.run_deprecated_v1
   def testCreateConv1DChannelsFirst(self):
     width = 7
     data = random_ops.random_uniform((5, 4, width))
@@ -165,6 +171,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testUnknownInputChannelsConv1D(self):
     data = array_ops.placeholder(dtypes.float32, (5, 4, None))
     layer = conv_layers.Conv1D(32, 3, activation=nn_ops.relu)
@@ -180,6 +187,7 @@ class ConvTest(test.TestCase):
                                  'should be defined. Found `None`.'):
       _ = layer.apply(data)
 
+  @test_util.run_deprecated_v1
   def testCreateConv3D(self):
     depth, height, width = 6, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 4))
@@ -191,6 +199,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testUnknownInputChannelsConv3D(self):
     volumes = array_ops.placeholder(dtypes.float32, (5, 6, 7, 9, None))
     layer = conv_layers.Conv3D(32, [3, 3, 3], activation=nn_ops.relu)
@@ -199,6 +208,7 @@ class ConvTest(test.TestCase):
                                  'should be defined. Found `None`.'):
       _ = layer.apply(volumes)
 
+  @test_util.run_deprecated_v1
   def testConv2DKernelRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -210,6 +220,7 @@ class ConvTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv2DBiasRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -221,6 +232,7 @@ class ConvTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv2DNoBias(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -247,6 +259,7 @@ class ConvTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, height - 2, 3, 32])
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -255,6 +268,7 @@ class ConvTest(test.TestCase):
     conv_layers.conv2d(images, 32, [3, 3], name='conv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       height, width = 7, 9
@@ -265,6 +279,7 @@ class ConvTest(test.TestCase):
       conv_layers.conv2d(images, 32, [3, 3], name='conv1')
       self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DInitializerFromScope(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
@@ -276,13 +291,14 @@ class ConvTest(test.TestCase):
         # Check the names of weights in order.
         self.assertTrue('kernel' in weights[0].name)
         self.assertTrue('bias' in weights[1].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 3, 32)))
         # Check that the bias still got initialized to zeros.
         self.assertAllClose(weights[1], np.zeros((32)))
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DNoReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -325,6 +341,7 @@ class ConvTest(test.TestCase):
     self.assertEqual(conv3d.kernel_constraint, k_constraint)
     self.assertEqual(conv3d.bias_constraint, b_constraint)
 
+  @test_util.run_deprecated_v1
   def testConv3DChannelsFirst(self):
     # Test case for GitHub issue 15655
     images = array_ops.placeholder(
@@ -358,6 +375,7 @@ class SeparableConv1DTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.separable_conv1d(data, 32, None)
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv1D(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -379,6 +397,7 @@ class SeparableConv1DTest(test.TestCase):
     self.assertEqual(layer.pointwise_kernel.get_shape().as_list(), [1, 8, 32])
     self.assertEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv1DChannelsFirst(self):
     length = 9
     data = random_ops.random_uniform((5, 4, length))
@@ -404,6 +423,7 @@ class SeparableConv1DTest(test.TestCase):
     output = layer.apply(data)
     self.assertEqual(output.get_shape().as_list(), [5, length // 2, 32])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv1DWithStridesChannelsFirst(self):
     data_format = 'channels_first'
     length = 10
@@ -413,6 +433,7 @@ class SeparableConv1DTest(test.TestCase):
     output = layer.apply(data)
     self.assertEqual(output.get_shape().as_list(), [5, 32, length // 2])
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv1DReuse(self):
     length = 10
     data = random_ops.random_uniform((5, length, 3), seed=1)
@@ -421,6 +442,7 @@ class SeparableConv1DTest(test.TestCase):
     conv_layers.separable_conv1d(data, 32, 3, name='sepconv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 3)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv1DReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       length = 10
@@ -431,6 +453,7 @@ class SeparableConv1DTest(test.TestCase):
       conv_layers.separable_conv1d(data, 32, 3, name='sepconv1')
       self.assertEqual(len(variables.trainable_variables()), 3)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv1DNoReuse(self):
     length = 10
     data = random_ops.random_uniform((5, length, 3), seed=1)
@@ -439,6 +462,7 @@ class SeparableConv1DTest(test.TestCase):
     conv_layers.separable_conv1d(data, 32, 3)
     self.assertEqual(len(variables.trainable_variables()), 6)
 
+  @test_util.run_deprecated_v1
   def testSeparableConv1DDepthwiseRegularizer(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -450,6 +474,7 @@ class SeparableConv1DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv1DPointwiseRegularizer(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -461,6 +486,7 @@ class SeparableConv1DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv1DBiasRegularizer(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -472,6 +498,7 @@ class SeparableConv1DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv1DNoBias(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -522,6 +549,7 @@ class SeparableConv2DTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.separable_conv2d(images, 32, None)
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv2D(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -562,6 +590,7 @@ class SeparableConv2DTest(test.TestCase):
                          [1, 1, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv2DChannelsFirst(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, 4, height, width))
@@ -584,6 +613,7 @@ class SeparableConv2DTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, height, width, 64])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConvWithStrides(self):
     height, width = 6, 8
     # Test strides tuple
@@ -607,6 +637,7 @@ class SeparableConv2DTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, height / 2, width, 32])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConvWithStridesChannelsFirst(self):
     data_format = 'channels_first'
     height, width = 6, 8
@@ -632,6 +663,7 @@ class SeparableConv2DTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, 32, height / 2, width])
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -641,6 +673,7 @@ class SeparableConv2DTest(test.TestCase):
         images, 32, [3, 3], name='sepconv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 3)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       height, width = 7, 9
@@ -651,6 +684,7 @@ class SeparableConv2DTest(test.TestCase):
       conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1')
       self.assertEqual(len(variables.trainable_variables()), 3)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DInitializerFromScope(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
@@ -663,14 +697,15 @@ class SeparableConv2DTest(test.TestCase):
         self.assertTrue('depthwise_kernel' in weights[0].name)
         self.assertTrue('pointwise_kernel' in weights[1].name)
         self.assertTrue('bias' in weights[2].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 3, 1)))
         self.assertAllClose(weights[1], np.ones((1, 1, 3, 32)))
         # Check that the bias still got initialized to zeros.
         self.assertAllClose(weights[2], np.zeros((32)))
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DNoReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -679,6 +714,7 @@ class SeparableConv2DTest(test.TestCase):
     conv_layers.separable_conv2d(images, 32, [3, 3])
     self.assertEqual(len(variables.trainable_variables()), 6)
 
+  @test_util.run_deprecated_v1
   def testSeparableConv2DDepthwiseRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -690,6 +726,7 @@ class SeparableConv2DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv2DPointwiseRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -701,6 +738,7 @@ class SeparableConv2DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv2DBiasRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -712,6 +750,7 @@ class SeparableConv2DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv2DNoBias(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -768,6 +807,7 @@ class Conv2DTransposeTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.conv2d_transpose(images, 32, None)
 
+  @test_util.run_deprecated_v1
   def testCreateConv2DTranspose(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -839,6 +879,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, height * 2, width, 32])
 
+  @test_util.run_deprecated_v1
   def testConv2DTransposeKernelRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -850,6 +891,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv2DTransposeBiasRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -861,6 +903,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv2DTransposeNoBias(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -873,6 +916,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4])
     self.assertEqual(layer.bias, None)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DTransposeReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -881,6 +925,7 @@ class Conv2DTransposeTest(test.TestCase):
     conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DTransposeReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       height, width = 7, 9
@@ -891,6 +936,7 @@ class Conv2DTransposeTest(test.TestCase):
       conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1')
       self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DTransposeInitializerFromScope(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
@@ -902,13 +948,14 @@ class Conv2DTransposeTest(test.TestCase):
         # Check the names of weights in order.
         self.assertTrue('kernel' in weights[0].name)
         self.assertTrue('bias' in weights[1].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 32, 3)))
         # Check that the bias still got initialized to zeros.
         self.assertAllClose(weights[1], np.zeros((32)))
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DTransposeNoReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -955,6 +1002,7 @@ class Conv3DTransposeTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.conv3d_transpose(volumes, 4, None)
 
+  @test_util.run_deprecated_v1
   def testCreateConv3DTranspose(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32))
@@ -976,6 +1024,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [4])
 
+  @test_util.run_deprecated_v1
   def testCreateConv3DTransposeChannelsFirst(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, 32, depth, height, width))
@@ -1019,6 +1068,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, depth * 2, height, width, 4])
 
+  @test_util.run_deprecated_v1
   def testConv3DTransposeKernelRegularizer(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32))
@@ -1030,6 +1080,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv3DTransposeBiasRegularizer(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32))
@@ -1041,6 +1092,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv3DTransposeNoBias(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32))
@@ -1053,6 +1105,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
     self.assertEqual(layer.bias, None)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv3DTransposeReuse(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
@@ -1062,6 +1115,7 @@ class Conv3DTransposeTest(test.TestCase):
         volumes, 4, [3, 3, 3], name='deconv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv3DTransposeReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       depth, height, width = 5, 7, 9
@@ -1072,6 +1126,7 @@ class Conv3DTransposeTest(test.TestCase):
       conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1')
       self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv3DTransposeInitializerFromScope(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
@@ -1084,13 +1139,14 @@ class Conv3DTransposeTest(test.TestCase):
         # Check the names of weights in order.
         self.assertTrue('kernel' in weights[0].name)
         self.assertTrue('bias' in weights[1].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 3, 4, 32)))
         # Check that the bias still got initialized to zeros.
         self.assertAllClose(weights[1], np.zeros((4)))
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv3DTransposeNoReuse(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index e06e9aba4a8c3df3fdf3d2465682ecefafbbd2bd..b2d54a98272be53b69872e900901d9552177a172 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -25,10 +25,11 @@ from __future__ import print_function
 from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.ops import init_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('layers.Dense')
+@tf_export(v1=['layers.Dense'])
 class Dense(keras_layers.Dense, base.Layer):
   """Densely-connected layer class.
 
@@ -109,7 +110,10 @@ class Dense(keras_layers.Dense, base.Layer):
                                 **kwargs)
 
 
-@tf_export('layers.dense')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.dense instead.')
+@tf_export(v1=['layers.dense'])
 def dense(
     inputs, units,
     activation=None,
@@ -184,7 +188,7 @@ def dense(
   return layer.apply(inputs)
 
 
-@tf_export('layers.Dropout')
+@tf_export(v1=['layers.Dropout'])
 class Dropout(keras_layers.Dropout, base.Layer):
   """Applies Dropout to the input.
 
@@ -223,7 +227,10 @@ class Dropout(keras_layers.Dropout, base.Layer):
     return super(Dropout, self).call(inputs, training=training)
 
 
-@tf_export('layers.dropout')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.dropout instead.')
+@tf_export(v1=['layers.dropout'])
 def dropout(inputs,
             rate=0.5,
             noise_shape=None,
@@ -265,7 +272,7 @@ def dropout(inputs,
   return layer.apply(inputs, training=training)
 
 
-@tf_export('layers.Flatten')
+@tf_export(v1=['layers.Flatten'])
 class Flatten(keras_layers.Flatten, base.Layer):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
@@ -291,7 +298,10 @@ class Flatten(keras_layers.Flatten, base.Layer):
   pass
 
 
-@tf_export('layers.flatten')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.flatten instead.')
+@tf_export(v1=['layers.flatten'])
 def flatten(inputs, name=None, data_format='channels_last'):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 0343bfa8bd2d0fdfd80bd49709fa734d8df8f7ec..b40a2682381ad50da67fe7499b75f4f862e00b3d 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -59,6 +59,7 @@ class DenseTest(test.TestCase):
     dense.apply(random_ops.random_uniform((5, 2)))
     self.assertEqual(dense.name, 'dense_2')
 
+  @test_util.run_deprecated_v1
   def testVariableInput(self):
     with self.cached_session():
       v = variable_scope.get_variable(
@@ -140,6 +141,7 @@ class DenseTest(test.TestCase):
     outputs = dense.apply(inputs)
     self.assertEqual(outputs.get_shape().as_list(), [1, 2, 4, 7])
 
+  @test_util.run_deprecated_v1
   def testCallOnPlaceHolder(self):
     inputs = array_ops.placeholder(dtype=dtypes.float32)
     dense = core_layers.Dense(4, name='my_dense')
@@ -179,6 +181,7 @@ class DenseTest(test.TestCase):
     if not context.executing_eagerly():
       self.assertEqual(outputs.op.name, 'dense2/BiasAdd')
 
+  @test_util.run_deprecated_v1
   def testActivityRegularizer(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     dense = core_layers.Dense(
@@ -189,6 +192,7 @@ class DenseTest(test.TestCase):
     self.assertEqual(len(loss_keys), 1)
     self.assertListEqual(dense.losses, loss_keys)
 
+  @test_util.run_deprecated_v1
   def testKernelRegularizer(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     dense = core_layers.Dense(
@@ -200,6 +204,7 @@ class DenseTest(test.TestCase):
     self.evaluate([v.initializer for v in dense.variables])
     self.assertAllEqual(self.evaluate(dense.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testKernelRegularizerWithReuse(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     inputs = random_ops.random_uniform((5, 3), seed=1)
@@ -212,6 +217,7 @@ class DenseTest(test.TestCase):
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
 
+  @test_util.run_deprecated_v1
   def testBiasRegularizer(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     dense = core_layers.Dense(2, name='my_dense', bias_regularizer=regularizer)
@@ -222,6 +228,7 @@ class DenseTest(test.TestCase):
     self.evaluate([v.initializer for v in dense.variables])
     self.assertAllEqual(self.evaluate(dense.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testFunctionalDense(self):
     with self.cached_session():
       inputs = random_ops.random_uniform((5, 3), seed=1)
@@ -231,6 +238,7 @@ class DenseTest(test.TestCase):
           len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
       self.assertEqual(outputs.op.name, 'my_dense/Relu')
 
+  @test_util.run_deprecated_v1
   def testFunctionalDenseTwice(self):
     inputs = random_ops.random_uniform((5, 3), seed=1)
     core_layers.dense(inputs, 2)
@@ -262,6 +270,7 @@ class DenseTest(test.TestCase):
         vars2 = variables.trainable_variables()
       self.assertEqual(vars1, vars2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalDenseInitializerFromScope(self):
     with variable_scope.variable_scope(
         'scope',
@@ -307,6 +316,7 @@ class DenseTest(test.TestCase):
       core_layers.dense(inputs, 2)
     self.assertEqual(called[0], 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalDenseInScope(self):
     with self.cached_session():
       with variable_scope.variable_scope('test'):
@@ -393,6 +403,7 @@ class DropoutTest(test.TestCase):
     np_output = self.evaluate(dropped)
     self.assertAllClose(np.ones((5, 3)), np_output)
 
+  @test_util.run_deprecated_v1
   def testDynamicLearningPhase(self):
     with self.cached_session() as sess:
       dp = core_layers.Dropout(0.5, seed=1)
@@ -426,6 +437,7 @@ class DropoutTest(test.TestCase):
     self.assertAlmostEqual(0., np_output.min())
     self.assertAllClose(np_output[:, 0, :], np_output[:, 1, :])
 
+  @test_util.run_deprecated_v1
   def testFunctionalDropout(self):
     with self.cached_session():
       inputs = array_ops.ones((5, 5))
@@ -437,13 +449,14 @@ class DropoutTest(test.TestCase):
       np_output = self.evaluate(dropped)
       self.assertAllClose(np.ones((5, 5)), np_output)
 
+  @test_util.run_deprecated_v1
   def testDynamicRate(self):
     with self.cached_session() as sess:
       rate = array_ops.placeholder(dtype='float32', name='rate')
       dp = core_layers.Dropout(rate, name='dropout')
       inputs = array_ops.ones((5, 5))
       dropped = dp.apply(inputs, training=True)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_output = sess.run(dropped, feed_dict={rate: 0.5})
       self.assertAlmostEqual(0., np_output.min())
       np_output = sess.run(dropped, feed_dict={rate: 0.0})
@@ -452,6 +465,7 @@ class DropoutTest(test.TestCase):
 
 class FlattenTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCreateFlatten(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(shape=(None, 2, 3), dtype='float32')
@@ -476,6 +490,7 @@ class FlattenTest(test.TestCase):
     shape = core_layers.Flatten().compute_output_shape((None, 3, None))
     self.assertEqual(shape.as_list(), [None, None])
 
+  @test_util.run_deprecated_v1
   def testDataFormat5d(self):
     np_input_channels_last = np.arange(
         120, dtype='float32').reshape([1, 5, 4, 3, 2])
@@ -493,6 +508,7 @@ class FlattenTest(test.TestCase):
 
       self.assertAllEqual(np_output_cl, np_output_cf)
 
+  @test_util.run_deprecated_v1
   def testDataFormat4d(self):
     np_input_channels_last = np.arange(
         24, dtype='float32').reshape([1, 4, 3, 2])
@@ -510,16 +526,22 @@ class FlattenTest(test.TestCase):
 
       self.assertAllEqual(np_output_cl, np_output_cf)
 
+  @test_util.run_deprecated_v1
   def testFunctionalFlatten(self):
     x = array_ops.placeholder(shape=(None, 2, 3), dtype='float32')
     y = core_layers.flatten(x, name='flatten')
     self.assertEqual(y.get_shape().as_list(), [None, 6])
 
-  def testFlattenValueError(self):
+  @test_util.run_deprecated_v1
+  def testFlatten0D(self):
     x = array_ops.placeholder(shape=(None,), dtype='float32')
-    with self.assertRaises(ValueError):
-      core_layers.Flatten()(x)
+    y = core_layers.Flatten()(x)
+    with self.cached_session() as sess:
+      np_output = sess.run(y, feed_dict={x: np.zeros((5,))})
+    self.assertEqual(list(np_output.shape), [5, 1])
+    self.assertEqual(y.shape.as_list(), [None, 1])
 
+  @test_util.run_deprecated_v1
   def testFlattenUnknownAxes(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(shape=(5, None, None), dtype='float32')
diff --git a/tensorflow/python/layers/layers.py b/tensorflow/python/layers/layers.py
index 11a2ebc040f0177e38d5b0f38cf609071f91fd07..93eec38a08c476a746fa5ee1604076ce1e4e904f 100644
--- a/tensorflow/python/layers/layers.py
+++ b/tensorflow/python/layers/layers.py
@@ -24,7 +24,7 @@ from __future__ import print_function
 
 # Base objects.
 from tensorflow.python.layers.base import Layer
-from tensorflow.python.layers.base import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 
 # Core layers.
 from tensorflow.python.layers.core import Dense
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 691dac69865b6e0ee582071d01c2cf626f7f639a..7eefb294cd6f1f8c7194d68f5a76bfba220e0493 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -23,10 +23,11 @@ from __future__ import print_function
 from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.ops import init_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('layers.BatchNormalization')
+@tf_export(v1=['layers.BatchNormalization'])
 class BatchNormalization(keras_layers.BatchNormalization, base.Layer):
   """Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
@@ -154,7 +155,10 @@ class BatchNormalization(keras_layers.BatchNormalization, base.Layer):
     return super(BatchNormalization, self).call(inputs, training=training)
 
 
-@tf_export('layers.batch_normalization')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.batch_normalization instead.')
+@tf_export(v1=['layers.batch_normalization'])
 def batch_normalization(inputs,
                         axis=-1,
                         momentum=0.99,
@@ -190,10 +194,10 @@ def batch_normalization(inputs,
 
   Note: when training, the moving_mean and moving_variance need to be updated.
   By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
-  need to be added as a dependency to the `train_op`. Also, be sure to add
-  any batch_normalization ops before getting the update_ops collection.
-  Otherwise, update_ops will be empty, and training/inference will not work
-  properly. For example:
+  need to be executed alongside the `train_op`. Also, be sure to add any
+  batch_normalization ops before getting the update_ops collection. Otherwise,
+  update_ops will be empty, and training/inference will not work properly. For
+  example:
 
   ```python
     x_norm = tf.layers.batch_normalization(x, training=training)
@@ -201,8 +205,8 @@ def batch_normalization(inputs,
     # ...
 
     update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-    with tf.control_dependencies(update_ops):
-      train_op = optimizer.minimize(loss)
+    train_op = optimizer.minimize(loss)
+    train_op = tf.group([train_op, update_ops])
   ```
 
   Arguments:
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index a72d147a0b0b45f4a5ee5804f58291c3625a0c32..6535f74129ae166d41675aad494be09bdd0f5cd3 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -24,6 +24,7 @@ import numpy as np
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.layers import normalization as normalization_layers
 from tensorflow.python.ops import array_ops
@@ -37,6 +38,7 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import saver as saver_lib
 
 
+@test_util.run_v1_only('b/120545219')
 class BNTest(test.TestCase):
 
   def _simple_model(self, image, fused, freeze_mode):
@@ -78,7 +80,7 @@ class BNTest(test.TestCase):
       if restore:
         saver.restore(sess, checkpoint_path)
       else:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
       np.random.seed(0)
       for _ in range(2):
         image_val = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
@@ -321,9 +323,9 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 4, 1))
       np_beta = np.reshape(np_beta, (1, 4, 1))
 
@@ -336,8 +338,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 2))
       std = np.std(np_inputs, axis=(0, 2))
       variance = np.square(std)
@@ -363,8 +366,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 3))
       np_beta = np.reshape(np_beta, (1, 1, 3))
       for _ in range(100):
@@ -376,8 +379,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1))
       std = np.std(np_inputs, axis=(0, 1))
       variance = np.square(std)
@@ -402,10 +406,10 @@ class BNTest(test.TestCase):
       training = array_ops.placeholder(dtype='bool')
       outputs = bn.apply(inputs, training=training)
 
-      with self.test_session(use_gpu=True) as sess:
+      with self.session(use_gpu=True) as sess:
         # Test training with placeholder learning phase.
-        sess.run(variables.global_variables_initializer())
-        np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+        self.evaluate(variables.global_variables_initializer())
+        np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
         np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
         np_beta = np.reshape(np_beta, (1, 4, 1, 1))
         for _ in range(100):
@@ -417,8 +421,9 @@ class BNTest(test.TestCase):
           self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
         # Verify that the statistics are updated during training.
-        moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-        np_inputs = sess.run(inputs)
+        moving_mean, moving_var = self.evaluate(
+            [bn.moving_mean, bn.moving_variance])
+        np_inputs = self.evaluate(inputs)
         mean = np.mean(np_inputs, axis=(0, 2, 3))
         std = np.std(np_inputs, axis=(0, 2, 3))
         variance = np.square(std)
@@ -444,8 +449,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 3, 1))
       np_beta = np.reshape(np_beta, (1, 1, 3, 1))
       for _ in range(100):
@@ -457,8 +462,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 3))
       std = np.std(np_inputs, axis=(0, 1, 3))
       variance = np.square(std)
@@ -484,8 +490,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -497,8 +503,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -524,8 +531,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -537,8 +544,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -565,8 +573,8 @@ class BNTest(test.TestCase):
 
       with self.cached_session() as sess:
         # Test training with placeholder learning phase.
-        sess.run(variables.global_variables_initializer())
-        np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+        self.evaluate(variables.global_variables_initializer())
+        np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
         np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
         np_beta = np.reshape(np_beta, (1, 4, 1, 1))
         for _ in range(100):
@@ -578,8 +586,9 @@ class BNTest(test.TestCase):
           self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
         # Verify that the statistics are updated during training.
-        moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-        np_inputs = sess.run(inputs)
+        moving_mean, moving_var = self.evaluate(
+            [bn.moving_mean, bn.moving_variance])
+        np_inputs = self.evaluate(inputs)
         mean = np.mean(np_inputs, axis=(0, 2, 3))
         std = np.std(np_inputs, axis=(0, 2, 3))
         variance = np.square(std)
@@ -605,8 +614,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -619,8 +628,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -646,8 +656,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -658,8 +668,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -667,7 +678,7 @@ class BNTest(test.TestCase):
       self.assertAllClose(variance, moving_var, atol=1e-2)
 
       # Test inference with placeholder learning phase.
-      np_output = sess.run(outputs_infer)
+      np_output = self.evaluate(outputs_infer)
 
       # Verify that the axis is normalized during inference.
       normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
@@ -696,8 +707,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([gamma, beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([gamma, beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -709,8 +720,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      np_moving_mean, np_moving_var = sess.run([moving_mean, moving_variance])
-      np_inputs = sess.run(inputs)
+      np_moving_mean, np_moving_var = self.evaluate(
+          [moving_mean, moving_variance])
+      np_inputs = self.evaluate(inputs)
       np_mean = np.mean(np_inputs, axis=(0, 1, 2))
       np_std = np.std(np_inputs, axis=(0, 1, 2))
       np_variance = np.square(np_std)
@@ -758,14 +770,15 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(100):
         np_output, _, _ = sess.run([outputs2] + updates,
                                    feed_dict={training: True})
 
       # Verify that the statistics are updated during training.
-      np_moving_mean, np_moving_var = sess.run([moving_mean, moving_variance])
-      np_inputs = sess.run(inputs2)
+      np_moving_mean, np_moving_var = self.evaluate(
+          [moving_mean, moving_variance])
+      np_inputs = self.evaluate(inputs2)
       np_mean = np.mean(np_inputs, axis=(0, 1, 2))
       np_std = np.std(np_inputs, axis=(0, 1, 2))
       np_variance = np.square(np_std)
@@ -773,7 +786,7 @@ class BNTest(test.TestCase):
       self.assertAllClose(np_variance, np_moving_var, atol=1e-2)
 
       # Verify that the axis is normalized during training.
-      np_gamma, np_beta = sess.run([gamma, beta])
+      np_gamma, np_beta = self.evaluate([gamma, beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
@@ -884,8 +897,8 @@ class BNTest(test.TestCase):
     moving_variance = 1.
     renorm_mean = renorm_stddev = 0.
     renorm_weight = 0.
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -936,8 +949,8 @@ class BNTest(test.TestCase):
 
     moving_mean = 0.
     moving_variance = 1.
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
         yt_val_train, adj_scale_val, adj_bias_val = sess.run(
@@ -989,8 +1002,8 @@ class BNTest(test.TestCase):
     moving_variance = 1.
     renorm_mean = renorm_stddev = 0.
     renorm_weight = 0.
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
         yt_val_train, adj_scale_val, adj_bias_val = sess.run(
@@ -1039,8 +1052,8 @@ class BNTest(test.TestCase):
     self.assertListEqual(
         out1.shape.as_list(), out2.shape.as_list())
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(variables.global_variables_initializer())
 
       x = np.random.random(shape)
       y1, y2 = sess.run([out1, out2], feed_dict={inp: x})
@@ -1061,8 +1074,8 @@ class BNTest(test.TestCase):
     out = normalization_layers.batch_normalization(
         inp, virtual_batch_size=2)
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(variables.global_variables_initializer())
 
       x = np.random.random(np_shape)
       y = sess.run(out, feed_dict={inp: x})
@@ -1092,8 +1105,8 @@ class BNTest(test.TestCase):
                     shape[0] // virtual_batch_size,
                     shape[1]])
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -1145,8 +1158,8 @@ class BNTest(test.TestCase):
     ghost_shape = ([virtual_batch_size, shape[0] // virtual_batch_size] +
                    shape[1:])
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -1199,8 +1212,8 @@ class BNTest(test.TestCase):
     ghost_shape = ([virtual_batch_size, shape[0] // virtual_batch_size] +
                    shape[1:])
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -1256,9 +1269,9 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
 
       for _ in range(100):
         np_output, _, _ = sess.run([outputs] + bn.updates,
@@ -1269,8 +1282,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=0, keepdims=True)
       std = np.std(np_inputs, axis=0, keepdims=True)
       variance = np.square(std)
@@ -1296,9 +1310,9 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
 
       for _ in range(100):
         np_output, _, _ = sess.run([outputs] + bn.updates,
@@ -1309,8 +1323,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 4), keepdims=True)
       std = np.std(np_inputs, axis=(0, 4), keepdims=True)
       variance = np.square(std)
@@ -1349,8 +1364,8 @@ class BNTest(test.TestCase):
     ghost_shape = ([virtual_batch_size, shape[0] // virtual_batch_size] +
                    shape[1:])
 
-    with self.test_session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.session(use_gpu=True) as sess:
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index c53cca3d312470c6fc22b4cca0bb9c76ed0865af..d123afc6231fb7d49ac4d610c5ca30c324a55de3 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -21,10 +21,11 @@ from __future__ import print_function
 
 from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.layers import base
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('layers.AveragePooling1D')
+@tf_export(v1=['layers.AveragePooling1D'])
 class AveragePooling1D(keras_layers.AveragePooling1D, base.Layer):
   """Average Pooling layer for 1D inputs.
 
@@ -57,7 +58,10 @@ class AveragePooling1D(keras_layers.AveragePooling1D, base.Layer):
         **kwargs)
 
 
-@tf_export('layers.average_pooling1d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.average_pooling1d instead.')
+@tf_export(v1=['layers.average_pooling1d'])
 def average_pooling1d(inputs, pool_size, strides,
                       padding='valid', data_format='channels_last',
                       name=None):
@@ -92,7 +96,7 @@ def average_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
-@tf_export('layers.MaxPooling1D')
+@tf_export(v1=['layers.MaxPooling1D'])
 class MaxPooling1D(keras_layers.MaxPooling1D, base.Layer):
   """Max Pooling layer for 1D inputs.
 
@@ -125,7 +129,10 @@ class MaxPooling1D(keras_layers.MaxPooling1D, base.Layer):
         **kwargs)
 
 
-@tf_export('layers.max_pooling1d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.max_pooling1d instead.')
+@tf_export(v1=['layers.max_pooling1d'])
 def max_pooling1d(inputs, pool_size, strides,
                   padding='valid', data_format='channels_last',
                   name=None):
@@ -160,7 +167,7 @@ def max_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
-@tf_export('layers.AveragePooling2D')
+@tf_export(v1=['layers.AveragePooling2D'])
 class AveragePooling2D(keras_layers.AveragePooling2D, base.Layer):
   """Average pooling layer for 2D inputs (e.g. images).
 
@@ -193,7 +200,10 @@ class AveragePooling2D(keras_layers.AveragePooling2D, base.Layer):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
-@tf_export('layers.average_pooling2d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.average_pooling2d instead.')
+@tf_export(v1=['layers.average_pooling2d'])
 def average_pooling2d(inputs,
                       pool_size, strides,
                       padding='valid', data_format='channels_last',
@@ -231,7 +241,7 @@ def average_pooling2d(inputs,
   return layer.apply(inputs)
 
 
-@tf_export('layers.MaxPooling2D')
+@tf_export(v1=['layers.MaxPooling2D'])
 class MaxPooling2D(keras_layers.MaxPooling2D, base.Layer):
   """Max pooling layer for 2D inputs (e.g. images).
 
@@ -264,7 +274,10 @@ class MaxPooling2D(keras_layers.MaxPooling2D, base.Layer):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
-@tf_export('layers.max_pooling2d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.max_pooling2d instead.')
+@tf_export(v1=['layers.max_pooling2d'])
 def max_pooling2d(inputs,
                   pool_size, strides,
                   padding='valid', data_format='channels_last',
@@ -302,7 +315,7 @@ def max_pooling2d(inputs,
   return layer.apply(inputs)
 
 
-@tf_export('layers.AveragePooling3D')
+@tf_export(v1=['layers.AveragePooling3D'])
 class AveragePooling3D(keras_layers.AveragePooling3D, base.Layer):
   """Average pooling layer for 3D inputs (e.g. volumes).
 
@@ -337,7 +350,10 @@ class AveragePooling3D(keras_layers.AveragePooling3D, base.Layer):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
-@tf_export('layers.average_pooling3d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.average_pooling3d instead.')
+@tf_export(v1=['layers.average_pooling3d'])
 def average_pooling3d(inputs,
                       pool_size, strides,
                       padding='valid', data_format='channels_last',
@@ -377,7 +393,7 @@ def average_pooling3d(inputs,
   return layer.apply(inputs)
 
 
-@tf_export('layers.MaxPooling3D')
+@tf_export(v1=['layers.MaxPooling3D'])
 class MaxPooling3D(keras_layers.MaxPooling3D, base.Layer):
   """Max pooling layer for 3D inputs (e.g. volumes).
 
@@ -412,7 +428,10 @@ class MaxPooling3D(keras_layers.MaxPooling3D, base.Layer):
         padding=padding, data_format=data_format, name=name, **kwargs)
 
 
-@tf_export('layers.max_pooling3d')
+@deprecation.deprecated(
+    date=None,
+    instructions='Use keras.layers.max_pooling3d instead.')
+@tf_export(v1=['layers.max_pooling3d'])
 def max_pooling3d(inputs,
                   pool_size, strides,
                   padding='valid', data_format='channels_last',
diff --git a/tensorflow/python/layers/pooling_test.py b/tensorflow/python/layers/pooling_test.py
index 7533674e5a0cf60f91551cd6333c8d802612e03d..cf1fa1e6915695cc3d4c130ef501b466a73a1953 100644
--- a/tensorflow/python/layers/pooling_test.py
+++ b/tensorflow/python/layers/pooling_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import pooling as pooling_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
@@ -64,6 +65,7 @@ class PoolingTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, 3, 4, 4])
 
+  @test_util.run_deprecated_v1
   def testCreateMaxPooling2DChannelsFirst(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, 2, height, width))
@@ -73,6 +75,7 @@ class PoolingTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, 2, 6, 8])
 
+  @test_util.run_deprecated_v1
   def testCreateAveragePooling2DChannelsFirst(self):
     height, width = 5, 6
     images = random_ops.random_uniform((3, 4, height, width))
@@ -83,6 +86,7 @@ class PoolingTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [3, 4, 4, 5])
 
+  @test_util.run_deprecated_v1
   def testCreateAveragePooling2DChannelsFirstWithNoneBatch(self):
     height, width = 5, 6
     images = array_ops.placeholder(dtype='float32',
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 6189503d8f5416e45a022abfa4f8bcad2da64c66..97bebe86177ee264ef00bc9b969b293389aa2122 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -177,8 +177,7 @@ tensorflow::Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
                                                 const Device* expected_device,
                                                 const Tensor** output_tensor) {
   auto handle = EagerTensor_Handle(eager_tensor)->handle;
-  Device* actual_device = nullptr;
-  TF_RETURN_IF_ERROR(handle->Device(&actual_device));
+  Device* actual_device = handle->device();
   TF_RETURN_IF_ERROR(handle->Tensor(output_tensor));
   // actual_device may be nullptr, which implies local CPU.
   if (expected_device == actual_device) return Status::OK();
@@ -303,15 +302,14 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
 class NumpyTensorBuffer : public TensorBuffer {
  public:
   NumpyTensorBuffer(PyArrayObject* array, size_t len, void* data)
-      : array_(array), len_(len), data_(data) {}
+      : TensorBuffer(data), array_(array), len_(len) {}
 
   ~NumpyTensorBuffer() override {
     // Note: The session::run wrapper is responsible for freeing this while
     // holding the GIL.
-    DelayedNumpyDecref(data_, len_, array_);
+    DelayedNumpyDecref(data(), len_, array_);
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return len_; }
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
@@ -330,7 +328,6 @@ class NumpyTensorBuffer : public TensorBuffer {
  private:
   PyArrayObject* array_;
   size_t len_;
-  void* data_;
 };
 
 Status PyObjectToString(PyObject* obj, string* str) {
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 269142a7c294f938ea806ec614f6897f5021cb78..f681cff6cff35bfd8ed0e3a880d26936a54fabee 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -220,6 +220,7 @@ const char ErrorFoundFloat[] =
       /* Iterate over outer dim, and recursively convert each element. */ \
       const int64 s = shape.dim_size(0);                                  \
       Safe_PyObjectPtr seq = make_safe(PySequence_Fast(obj, ""));         \
+      if (TF_PREDICT_FALSE(seq == nullptr)) return ErrorRectangular;      \
       if (TF_PREDICT_FALSE(s != PySequence_Fast_GET_SIZE(seq.get()))) {   \
         return ErrorRectangular;                                          \
       }                                                                   \
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index f22fb253e4d59813226f0e9741cabcfbf0cdcd1a..ee55d89bffcbaca2a68cbb028ae9ca5157e6f6df 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -241,7 +241,7 @@ class FileIO(object):
     self._writable_file = None
 
 
-@tf_export("gfile.Exists")
+@tf_export(v1=["gfile.Exists"])
 def file_exists(filename):
   """Determines whether a path exists or not.
 
@@ -252,18 +252,35 @@ def file_exists(filename):
     True if the path exists, whether its a file or a directory.
     False if the path does not exist and there are no filesystem errors.
 
+  Raises:
+    errors.OpError: Propagates any errors reported by the FileSystem API.
+  """
+  return file_exists_v2(filename)
+
+
+@tf_export("io.gfile.exists")
+def file_exists_v2(path):
+  """Determines whether a path exists or not.
+
+  Args:
+    path: string, a path
+
+  Returns:
+    True if the path exists, whether its a file or a directory.
+    False if the path does not exist and there are no filesystem errors.
+
   Raises:
     errors.OpError: Propagates any errors reported by the FileSystem API.
   """
   try:
     with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.FileExists(compat.as_bytes(filename), status)
+      pywrap_tensorflow.FileExists(compat.as_bytes(path), status)
   except errors.NotFoundError:
     return False
   return True
 
 
-@tf_export("gfile.Remove")
+@tf_export(v1=["gfile.Remove"])
 def delete_file(filename):
   """Deletes the file located at 'filename'.
 
@@ -274,8 +291,22 @@ def delete_file(filename):
     errors.OpError: Propagates any errors reported by the FileSystem API.  E.g.,
     NotFoundError if the file does not exist.
   """
+  delete_file_v2(filename)
+
+
+@tf_export("io.gfile.remove")
+def delete_file_v2(path):
+  """Deletes the path located at 'path'.
+
+  Args:
+    path: string, a path
+
+  Raises:
+    errors.OpError: Propagates any errors reported by the FileSystem API.  E.g.,
+    NotFoundError if the path does not exist.
+  """
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.DeleteFile(compat.as_bytes(filename), status)
+    pywrap_tensorflow.DeleteFile(compat.as_bytes(path), status)
 
 
 def read_file_to_string(filename, binary_mode=False):
@@ -314,7 +345,7 @@ def write_string_to_file(filename, file_content):
     f.write(file_content)
 
 
-@tf_export("gfile.Glob")
+@tf_export(v1=["gfile.Glob"])
 def get_matching_files(filename):
   """Returns a list of files that match the given pattern(s).
 
@@ -324,28 +355,44 @@ def get_matching_files(filename):
   Returns:
     A list of strings containing filenames that match the given pattern(s).
 
+  Raises:
+    errors.OpError: If there are filesystem / directory listing errors.
+  """
+  return get_matching_files_v2(filename)
+
+
+@tf_export("io.gfile.glob")
+def get_matching_files_v2(pattern):
+  """Returns a list of files that match the given pattern(s).
+
+  Args:
+    pattern: string or iterable of strings. The glob pattern(s).
+
+  Returns:
+    A list of strings containing filenames that match the given pattern(s).
+
   Raises:
     errors.OpError: If there are filesystem / directory listing errors.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    if isinstance(filename, six.string_types):
+    if isinstance(pattern, six.string_types):
       return [
           # Convert the filenames to string from bytes.
           compat.as_str_any(matching_filename)
           for matching_filename in pywrap_tensorflow.GetMatchingFiles(
-              compat.as_bytes(filename), status)
+              compat.as_bytes(pattern), status)
       ]
     else:
       return [
           # Convert the filenames to string from bytes.
           compat.as_str_any(matching_filename)
-          for single_filename in filename
+          for single_filename in pattern
           for matching_filename in pywrap_tensorflow.GetMatchingFiles(
               compat.as_bytes(single_filename), status)
       ]
 
 
-@tf_export("gfile.MkDir")
+@tf_export(v1=["gfile.MkDir"])
 def create_dir(dirname):
   """Creates a directory with the name 'dirname'.
 
@@ -356,14 +403,31 @@ def create_dir(dirname):
     The parent directories need to exist. Use recursive_create_dir instead if
     there is the possibility that the parent dirs don't exist.
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  create_dir_v2(dirname)
+
+
+@tf_export("io.gfile.mkdir")
+def create_dir_v2(path):
+  """Creates a directory with the name given by 'path'.
+
+  Args:
+    path: string, name of the directory to be created
+
+  Notes:
+    The parent directories need to exist. Use recursive_create_dir instead if
+    there is the possibility that the parent dirs don't exist.
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.CreateDir(compat.as_bytes(dirname), status)
+    pywrap_tensorflow.CreateDir(compat.as_bytes(path), status)
 
 
-@tf_export("gfile.MakeDirs")
+@tf_export(v1=["gfile.MakeDirs"])
 def recursive_create_dir(dirname):
   """Creates a directory and all parent/intermediate directories.
 
@@ -372,14 +436,29 @@ def recursive_create_dir(dirname):
   Args:
     dirname: string, name of the directory to be created
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  recursive_create_dir_v2(dirname)
+
+
+@tf_export("io.gfile.makedirs")
+def recursive_create_dir_v2(path):
+  """Creates a directory and all parent/intermediate directories.
+
+  It succeeds if path already exists and is writable.
+
+  Args:
+    path: string, name of the directory to be created
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(dirname), status)
+    pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(path), status)
 
 
-@tf_export("gfile.Copy")
+@tf_export(v1=["gfile.Copy"])
 def copy(oldpath, newpath, overwrite=False):
   """Copies data from oldpath to newpath.
 
@@ -389,15 +468,31 @@ def copy(oldpath, newpath, overwrite=False):
     overwrite: boolean, if false its an error for newpath to be occupied by an
         existing file.
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  copy_v2(oldpath, newpath, overwrite)
+
+
+@tf_export("io.gfile.copy")
+def copy_v2(src, dst, overwrite=False):
+  """Copies data from src to dst.
+
+  Args:
+    src: string, name of the file whose contents need to be copied
+    dst: string, name of the file to which to copy to
+    overwrite: boolean, if false its an error for newpath to be occupied by an
+        existing file.
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
     pywrap_tensorflow.CopyFile(
-        compat.as_bytes(oldpath), compat.as_bytes(newpath), overwrite, status)
+        compat.as_bytes(src), compat.as_bytes(dst), overwrite, status)
 
 
-@tf_export("gfile.Rename")
+@tf_export(v1=["gfile.Rename"])
 def rename(oldname, newname, overwrite=False):
   """Rename or move a file / directory.
 
@@ -407,12 +502,28 @@ def rename(oldname, newname, overwrite=False):
     overwrite: boolean, if false it's an error for `newname` to be occupied by
         an existing file.
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  rename_v2(oldname, newname, overwrite)
+
+
+@tf_export("io.gfile.rename")
+def rename_v2(src, dst, overwrite=False):
+  """Rename or move a file / directory.
+
+  Args:
+    src: string, pathname for a file
+    dst: string, pathname to which the file needs to be moved
+    overwrite: boolean, if false it's an error for `dst` to be occupied by
+        an existing file.
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
     pywrap_tensorflow.RenameFile(
-        compat.as_bytes(oldname), compat.as_bytes(newname), overwrite, status)
+        compat.as_bytes(src), compat.as_bytes(dst), overwrite, status)
 
 
 def atomic_write_string_to_file(filename, contents, overwrite=True):
@@ -439,35 +550,61 @@ def atomic_write_string_to_file(filename, contents, overwrite=True):
     raise
 
 
-@tf_export("gfile.DeleteRecursively")
+@tf_export(v1=["gfile.DeleteRecursively"])
 def delete_recursively(dirname):
   """Deletes everything under dirname recursively.
 
   Args:
     dirname: string, a path to a directory
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  delete_recursively_v2(dirname)
+
+
+@tf_export("io.gfile.rmtree")
+def delete_recursively_v2(path):
+  """Deletes everything under path recursively.
+
+  Args:
+    path: string, a path
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.DeleteRecursively(compat.as_bytes(dirname), status)
+    pywrap_tensorflow.DeleteRecursively(compat.as_bytes(path), status)
 
 
-@tf_export("gfile.IsDirectory")
+@tf_export(v1=["gfile.IsDirectory"])
 def is_directory(dirname):
   """Returns whether the path is a directory or not.
 
   Args:
     dirname: string, path to a potential directory
 
+  Returns:
+    True, if the path is a directory; False otherwise
+  """
+  return is_directory_v2(dirname)
+
+
+@tf_export("io.gfile.isdir")
+def is_directory_v2(path):
+  """Returns whether the path is a directory or not.
+
+  Args:
+    path: string, path to a potential directory
+
   Returns:
     True, if the path is a directory; False otherwise
   """
   status = c_api_util.ScopedTFStatus()
-  return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
+  return pywrap_tensorflow.IsDirectory(compat.as_bytes(path), status)
 
 
-@tf_export("gfile.ListDirectory")
+@tf_export(v1=["gfile.ListDirectory"])
 def list_directory(dirname):
   """Returns a list of entries contained within a directory.
 
@@ -483,7 +620,26 @@ def list_directory(dirname):
   Raises:
     errors.NotFoundError if directory doesn't exist
   """
-  if not is_directory(dirname):
+  return list_directory_v2(dirname)
+
+
+@tf_export("io.gfile.listdir")
+def list_directory_v2(path):
+  """Returns a list of entries contained within a directory.
+
+  The list is in arbitrary order. It does not contain the special entries "."
+  and "..".
+
+  Args:
+    path: string, path to a directory
+
+  Returns:
+    [filename1, filename2, ... filenameN] as strings
+
+  Raises:
+    errors.NotFoundError if directory doesn't exist
+  """
+  if not is_directory(path):
     raise errors.NotFoundError(None, None, "Could not find directory")
   with errors.raise_exception_on_not_ok_status() as status:
     # Convert each element to string, since the return values of the
@@ -491,11 +647,11 @@ def list_directory(dirname):
     return [
         compat.as_str_any(filename)
         for filename in pywrap_tensorflow.GetChildren(
-            compat.as_bytes(dirname), status)
+            compat.as_bytes(path), status)
     ]
 
 
-@tf_export("gfile.Walk")
+@tf_export(v1=["gfile.Walk"])
 def walk(top, in_order=True):
   """Recursive directory tree generator for directories.
 
@@ -505,6 +661,27 @@ def walk(top, in_order=True):
 
   Errors that happen while listing directories are ignored.
 
+  Yields:
+    Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
+    all its subdirectories and leaf files.
+    (dirname, [subdirname, subdirname, ...], [filename, filename, ...])
+    as strings
+  """
+  return walk_v2(top, in_order)
+
+
+@tf_export("io.gfile.walk")
+def walk_v2(top, topdown=True, onerror=None):
+  """Recursive directory tree generator for directories.
+
+  Args:
+    top: string, a Directory name
+    topdown: bool, Traverse pre order if True, post order if False.
+    onerror: optional handler for errors. Should be a function, it will be
+      called with the error as argument. Rethrowing the error aborts the walk.
+
+  Errors that happen while listing directories are ignored.
+
   Yields:
     Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
     all its subdirectories and leaf files.
@@ -514,8 +691,11 @@ def walk(top, in_order=True):
   top = compat.as_str_any(top)
   try:
     listing = list_directory(top)
-  except errors.NotFoundError:
-    return
+  except errors.NotFoundError as err:
+    if onerror:
+      onerror(err)
+    else:
+      return
 
   files = []
   subdirs = []
@@ -528,18 +708,18 @@ def walk(top, in_order=True):
 
   here = (top, subdirs, files)
 
-  if in_order:
+  if topdown:
     yield here
 
   for subdir in subdirs:
-    for subitem in walk(os.path.join(top, subdir), in_order):
+    for subitem in walk_v2(os.path.join(top, subdir), topdown, onerror=onerror):
       yield subitem
 
-  if not in_order:
+  if not topdown:
     yield here
 
 
-@tf_export("gfile.Stat")
+@tf_export(v1=["gfile.Stat"])
 def stat(filename):
   """Returns file statistics for a given path.
 
@@ -549,12 +729,28 @@ def stat(filename):
   Returns:
     FileStatistics struct that contains information about the path
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  return stat_v2(filename)
+
+
+@tf_export("io.gfile.stat")
+def stat_v2(path):
+  """Returns file statistics for a given path.
+
+  Args:
+    path: string, path to a file
+
+  Returns:
+    FileStatistics struct that contains information about the path
+
   Raises:
     errors.OpError: If the operation fails.
   """
   file_statistics = pywrap_tensorflow.FileStatistics()
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.Stat(compat.as_bytes(filename), file_statistics, status)
+    pywrap_tensorflow.Stat(compat.as_bytes(path), file_statistics, status)
     return file_statistics
 
 
diff --git a/tensorflow/python/lib/io/file_io_test.py b/tensorflow/python/lib/io/file_io_test.py
index c21eb931037f1728149456d62b1534f59527cfdb..0ece84c08dcc82d96b62f40dd79bc9ed24d5e122 100644
--- a/tensorflow/python/lib/io/file_io_test.py
+++ b/tensorflow/python/lib/io/file_io_test.py
@@ -582,5 +582,30 @@ class FileIoTest(test.TestCase):
     self.assertTrue(crc1 != crc2)
     self.assertEqual(crc2, crc3)
 
+  def testMatchingFilesPermission(self):
+    # Create top level directory test_dir.
+    dir_path = os.path.join(self._base_dir, "test_dir")
+    file_io.create_dir(dir_path)
+    # Create second level directories `noread` and `any`.
+    noread_path = os.path.join(dir_path, "noread")
+    file_io.create_dir(noread_path)
+    any_path = os.path.join(dir_path, "any")
+    file_io.create_dir(any_path)
+    files = ["file1.txt", "file2.txt", "file3.txt"]
+    for name in files:
+      file_path = os.path.join(any_path, name)
+      file_io.FileIO(file_path, mode="w").write("testing")
+    file_path = os.path.join(noread_path, "file4.txt")
+    file_io.FileIO(file_path, mode="w").write("testing")
+    # Change noread to noread access.
+    os.chmod(noread_path, 0)
+    expected_match = [os.path.join(any_path, name) for name in files]
+    self.assertItemsEqual(
+        file_io.get_matching_files(os.path.join(dir_path, "*", "file*.txt")),
+        expected_match)
+    # Change noread back so that it could be cleaned during tearDown.
+    os.chmod(noread_path, 0o777)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/lib/io/python_io.py b/tensorflow/python/lib/io/python_io.py
index 404423ce07b3bbee89266a7154405c72da067a02..8223d3092fc0853d02ebea5f3a117d34472077c1 100644
--- a/tensorflow/python/lib/io/python_io.py
+++ b/tensorflow/python/lib/io/python_io.py
@@ -13,10 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Python functions for directly manipulating TFRecord-formatted files.
-
-See the [Python IO](https://tensorflow.org/api_guides/python/python_io) guide.
-"""
+"""Python functions for directly manipulating TFRecord-formatted files."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 9ab683d96a397280e53185f37d183e4ddd0407a2..43086ab18d7774f54be2b393deccec6be180801f 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -26,7 +26,9 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("io.TFRecordCompressionType", "python_io.TFRecordCompressionType")
+@tf_export(
+    "io.TFRecordCompressionType",
+    v1=["io.TFRecordCompressionType", "python_io.TFRecordCompressionType"])
 @deprecation.deprecated_endpoints("python_io.TFRecordCompressionType")
 class TFRecordCompressionType(object):
   """The type of compression for the record."""
@@ -35,7 +37,9 @@ class TFRecordCompressionType(object):
   GZIP = 2
 
 
-@tf_export("io.TFRecordOptions", "python_io.TFRecordOptions")
+@tf_export(
+    "io.TFRecordOptions",
+    v1=["io.TFRecordOptions", "python_io.TFRecordOptions"])
 @deprecation.deprecated_endpoints("python_io.TFRecordOptions")
 class TFRecordOptions(object):
   """Options used for manipulating TFRecord files."""
@@ -146,8 +150,11 @@ class TFRecordOptions(object):
     return options
 
 
-@tf_export("io.tf_record_iterator", "python_io.tf_record_iterator")
-@deprecation.deprecated_endpoints("python_io.tf_record_iterator")
+@tf_export(v1=["io.tf_record_iterator", "python_io.tf_record_iterator"])
+@deprecation.deprecated(
+    date=None,
+    instructions=("Use eager execution and: \n"
+                  "`tf.data.TFRecordDataset(path)`"))
 def tf_record_iterator(path, options=None):
   """An iterator that read the records from a TFRecords file.
 
@@ -179,7 +186,8 @@ def tf_record_iterator(path, options=None):
     reader.Close()
 
 
-@tf_export("io.TFRecordWriter", "python_io.TFRecordWriter")
+@tf_export(
+    "io.TFRecordWriter", v1=["io.TFRecordWriter", "python_io.TFRecordWriter"])
 @deprecation.deprecated_endpoints("python_io.TFRecordWriter")
 class TFRecordWriter(object):
   """A class to write records to a TFRecords file.
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index ade86e85bff9860947bfccc3479b6b3d072c6177..45e741ef222b1dcde21b66ab6cdc3db9576a85ce 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -480,7 +480,7 @@ def _GatherNdGrad(op, grad):
   ref = op.inputs[0]
   indices = op.inputs[1]
   ref_shape = array_ops.shape(ref, out_type=indices.dtype)
-  if indices.shape.ndims == 2 and indices.shape[-1].value == 1:
+  if indices.shape.ndims == 2 and indices.shape.dims[-1].value == 1:
     ref_grad = ops.IndexedSlices(grad, array_ops.squeeze(indices, axis=-1),
                                  ref_shape)
   else:
@@ -489,10 +489,12 @@ def _GatherNdGrad(op, grad):
 
 
 @ops.RegisterGradient("CheckNumerics")
-def _CheckNumericsGrad(_, grad):
+def _CheckNumericsGrad(op, grad):
   """Gradient for check_numerics op."""
   return array_ops.check_numerics(
-      grad, "Not a number (NaN) or infinity (Inf) values detected in gradient.")
+      grad,
+      "Not a number (NaN) or infinity (Inf) values detected in gradient. %s" %
+      op.get_attr("message"))
 
 
 @ops.RegisterGradient("PlaceholderWithDefault")
@@ -733,7 +735,7 @@ def _QuantizeAndDequantizeV3Grad(_, grad):
 @ops.RegisterGradient("ExtractImagePatches")
 def _ExtractImagePatchesGrad(op, grad):
   batch_size, rows_in, cols_in, channels = [
-      dim.value for dim in op.inputs[0].get_shape()
+      dim.value for dim in op.inputs[0].shape.dims
   ]
   input_bhwc = array_ops.shape(op.inputs[0])
   batch_size = input_bhwc[0]
@@ -754,7 +756,7 @@ def _ExtractImagePatchesGrad(op, grad):
       op.get_attr("padding"))
 
   # Create indices matrix for output tensor.
-  _, rows_out, cols_out, _ = [dim.value for dim in op.outputs[0].get_shape()]
+  _, rows_out, cols_out, _ = [dim.value for dim in op.outputs[0].shape.dims]
   _, ksize_r, ksize_c, _ = op.get_attr("ksizes")
   # Indices for output start from 0.
   output_indices_num = rows_out * cols_out * ksize_r * ksize_c
@@ -800,6 +802,32 @@ def _ScatterNdGrad(op, grad):
   return [None, updates_grad, None]
 
 
+@ops.RegisterGradient("TensorScatterUpdate")
+def _TensorScatterUpdateGrad(op, grad):
+  indices = op.inputs[1]
+  updates_grad = array_ops.gather_nd(grad, indices)
+  tensor_grad = array_ops.tensor_scatter_update(
+      array_ops.identity(grad), indices,
+      array_ops.zeros_like(op.inputs[2], dtype=grad.dtype))
+  return [tensor_grad, None, updates_grad]
+
+
+@ops.RegisterGradient("TensorScatterAdd")
+def _TensorScatterAddGrad(op, grad):
+  indices = op.inputs[1]
+  updates_grad = array_ops.gather_nd(grad, indices)
+  tensor_grad = array_ops.identity(grad)
+  return [tensor_grad, None, updates_grad]
+
+
+@ops.RegisterGradient("TensorScatterSub")
+def _TensorScatterSubGrad(op, grad):
+  indices = op.inputs[1]
+  updates_grad = array_ops.gather_nd(grad, indices)
+  tensor_grad = array_ops.identity(grad)
+  return [tensor_grad, None, -updates_grad]
+
+
 @ops.RegisterGradient("ScatterNdNonAliasingAdd")
 def _ScatterNdNonAliasingAddGrad(op, grad):
   indices = op.inputs[1]
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 4be9c532f48762dfd2b1cce944214333a3360594..9dabbffb138093db6d3bd0dcf983d2f6cfdc5081 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -13,10 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # Tests for this file live in python/kernel_tests/array_ops_test.py
-"""Support for manipulating tensors.
-
-See the [Array Ops](https://tensorflow.org/api_guides/python/array_ops) guide.
-"""
+"""Support for manipulating tensors."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -25,6 +22,7 @@ from __future__ import print_function
 import sys
 
 import numpy as np
+import six
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import common_shapes
@@ -43,6 +41,7 @@ from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops.gen_array_ops import *
 from tensorflow.python.ops.gen_array_ops import reverse_v2 as reverse  # pylint: disable=unused-import
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
@@ -57,6 +56,7 @@ _BaseSlice = slice
 
 
 @tf_export("identity")
+@dispatch.add_dispatch_support
 def identity(input, name=None):  # pylint: disable=redefined-builtin
   r"""Return a tensor with the same shape and contents as input.
 
@@ -78,11 +78,15 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
       return input._copy()  # pylint: disable=protected-access
     return input
   else:
-    return gen_array_ops.identity(input, name=name)
+    ret = gen_array_ops.identity(input, name=name)
+    # Propagate handle data for happier shape inference for resource variables.
+    if hasattr(input, "_handle_data"):
+      ret._handle_data = input._handle_data  # pylint: disable=protected-access
+    return ret
 
 
 # pylint: disable=redefined-builtin,protected-access
-@tf_export("expand_dims")
+@tf_export(v1=["expand_dims"])
 @deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
@@ -123,7 +127,7 @@ def expand_dims(input, axis=None, name=None, dim=None):
     axis: 0-D (scalar). Specifies the dimension index at which to
       expand the shape of `input`. Must be in the range
       `[-rank(input) - 1, rank(input)]`.
-    name: The name of the output `Tensor`.
+    name: The name of the output `Tensor` (optional).
     dim: 0-D (scalar). Equivalent to `axis`, to be deprecated.
 
   Returns:
@@ -131,9 +135,61 @@ def expand_dims(input, axis=None, name=None, dim=None):
     dimension of size 1 added.
 
   Raises:
-    ValueError: if both `dim` and `axis` are specified.
+    ValueError: if either both or neither of `dim` and `axis` are specified.
   """
   axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
+  if axis is None:
+    raise ValueError("Must specify an axis argument to tf.expand_dims()")
+  return expand_dims_v2(input, axis, name)
+
+
+@tf_export("expand_dims", v1=[])
+@dispatch.add_dispatch_support
+def expand_dims_v2(input, axis, name=None):
+  """Inserts a dimension of 1 into a tensor's shape.
+
+  Given a tensor `input`, this operation inserts a dimension of 1 at the
+  dimension index `axis` of `input`'s shape. The dimension index `axis` starts
+  at zero; if you specify a negative number for `axis` it is counted backward
+  from the end.
+
+  This operation is useful if you want to add a batch dimension to a single
+  element. For example, if you have a single image of shape `[height, width,
+  channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+  which will make the shape `[1, height, width, channels]`.
+
+  Other examples:
+
+  ```python
+  # 't' is a tensor of shape [2]
+  tf.shape(tf.expand_dims(t, 0))  # [1, 2]
+  tf.shape(tf.expand_dims(t, 1))  # [2, 1]
+  tf.shape(tf.expand_dims(t, -1))  # [2, 1]
+
+  # 't2' is a tensor of shape [2, 3, 5]
+  tf.shape(tf.expand_dims(t2, 0))  # [1, 2, 3, 5]
+  tf.shape(tf.expand_dims(t2, 2))  # [2, 3, 1, 5]
+  tf.shape(tf.expand_dims(t2, 3))  # [2, 3, 5, 1]
+  ```
+
+  This operation requires that:
+
+  `-1-input.dims() <= dim <= input.dims()`
+
+  This operation is related to `squeeze()`, which removes dimensions of
+  size 1.
+
+  Args:
+    input: A `Tensor`.
+    axis: 0-D (scalar). Specifies the dimension index at which to
+      expand the shape of `input`. Must be in the range
+      `[-rank(input) - 1, rank(input)]`.
+    name: The name of the output `Tensor` (optional).
+
+  Returns:
+    A `Tensor` with the same data as `input`, but its shape has an additional
+    dimension of size 1 added.
+  """
   return gen_array_ops.expand_dims(input, axis, name)
 
 
@@ -156,7 +212,11 @@ listdiff.__doc__ = gen_array_ops.list_diff.__doc__ + "\n" + listdiff.__doc__
 
 
 # pylint: disable=undefined-variable
-@tf_export("setdiff1d")
+@deprecation.deprecated(
+    "2018-11-30",
+    "This op will be removed after the deprecation date. "
+    "Please switch to tf.sets.difference().")
+@tf_export(v1=["setdiff1d"])
 def setdiff1d(x, y, index_dtype=dtypes.int32, name=None):
   return gen_array_ops.list_diff(x, y, index_dtype, name)
 
@@ -166,7 +226,18 @@ setdiff1d.__doc__ = gen_array_ops.list_diff.__doc__
 
 @tf_export("broadcast_dynamic_shape")
 def broadcast_dynamic_shape(shape_x, shape_y):
-  """Returns the broadcasted dynamic shape between `shape_x` and `shape_y`.
+  """Computes the shape of a broadcast given symbolic shapes.
+
+  When shape_x and shape_y are Tensors representing shapes (i.e. the result of
+  calling tf.shape on another Tensor) this computes a Tensor which is the shape
+  of the result of a broadcasting op applied in tensors of shapes shape_x and
+  shape_y.
+
+  For example, if shape_x is [1, 2, 3] and shape_y is [5, 1, 3], the result is a
+  Tensor whose value is [5, 2, 3].
+
+  This is useful when validating the result of a broadcasting operation when the
+  tensors do not have statically known shapes.
 
   Args:
     shape_x: A rank 1 integer `Tensor`, representing the shape of x.
@@ -180,7 +251,17 @@ def broadcast_dynamic_shape(shape_x, shape_y):
 
 @tf_export("broadcast_static_shape")
 def broadcast_static_shape(shape_x, shape_y):
-  """Returns the broadcasted static shape between `shape_x` and `shape_y`.
+  """Computes the shape of a broadcast given known shapes.
+
+  When shape_x and shape_y are fully known TensorShapes this computes a
+  TensorShape which is the shape of the result of a broadcasting op applied in
+  tensors of shapes shape_x and shape_y.
+
+  For example, if shape_x is [1, 2, 3] and shape_y is [5, 1, 3], the result is a
+  TensorShape whose value is [5, 2, 3].
+
+  This is useful when validating the result of a broadcasting operation when the
+  tensors have statically known shapes.
 
   Args:
     shape_x: A `TensorShape`
@@ -195,7 +276,13 @@ def broadcast_static_shape(shape_x, shape_y):
   return common_shapes.broadcast_shape(shape_x, shape_y)
 
 
-@tf_export("shape")
+@tf_export("shape", v1=[])
+def shape_v2(input, out_type=dtypes.int32, name=None):
+  # pylint: disable=redefined-builtin
+  return shape(input, name, out_type)
+
+
+@tf_export(v1=["shape"])
 def shape(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the shape of a tensor.
@@ -268,7 +355,13 @@ def shape_n(input, out_type=dtypes.int32, name=None):
   return gen_array_ops.shape_n(input, out_type=out_type, name=name)
 
 
-@tf_export("size")
+@tf_export("size", v1=[])
+def size_v2(input, out_type=dtypes.int32, name=None):
+  # pylint: disable=redefined-builtin
+  return size(input, name, out_type)
+
+
+@tf_export(v1=["size"])
 def size(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the size of a tensor.
@@ -317,7 +410,7 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
       input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
     input = ops.convert_to_tensor(input)
     np_out_type = out_type.as_numpy_dtype
-    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-acces:
+    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-access
     return ops.convert_to_tensor(num_elements, dtype=out_type)
   with ops.name_scope(name, "Size", [input]) as name:
     if isinstance(input, (sparse_tensor.SparseTensor,
@@ -392,6 +485,36 @@ def rank_internal(input, name=None, optimize=True):
       return gen_array_ops.rank(input, name=name)
 
 
+_SLICE_TYPE_ERROR = (
+    "Only integers, slices (`:`), ellipsis (`...`), "
+    "tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid "
+    "indices")
+
+_SUPPORTED_SLICE_DTYPES = (
+    dtypes.int32,
+    dtypes.int32_ref,
+    dtypes.int64,
+    dtypes.int64_ref
+)
+
+
+def _check_index(idx):
+  """Check if a given value is a valid index into a tensor."""
+  if isinstance(idx, (six.integer_types, tensor_shape.Dimension)):
+    return
+
+  # Optimistic check. Assumptions:
+  # * any object with a dtype is supported
+  # * any object with a dtype has a sizeable shape attribute.
+  dtype = getattr(idx, "dtype", None)
+  if (dtype is None or
+      dtypes.as_dtype(dtype) not in _SUPPORTED_SLICE_DTYPES or
+      idx.shape and len(idx.shape) == 1):
+    # TODO(slebedev): IndexError seems more appropriate here, but it
+    # will break `_slice_helper` contract.
+    raise TypeError(_SLICE_TYPE_ERROR + ", got {!r}".format(idx))
+
+
 def _slice_helper(tensor, slice_spec, var=None):
   """Overload for Tensor.__getitem__.
 
@@ -445,7 +568,8 @@ def _slice_helper(tensor, slice_spec, var=None):
 
   Raises:
     ValueError: If a slice range is negative size.
-    TypeError: If the slice indices aren't int, slice, or Ellipsis.
+    TypeError: If the slice indices aren't int, slice, ellipsis,
+      tf.newaxis or scalar int32/int64 tensors.
   """
 
   if not isinstance(slice_spec, (list, tuple)):
@@ -463,16 +587,19 @@ def _slice_helper(tensor, slice_spec, var=None):
       # for example a[:] gives slice(None,sys.maxsize,None)
       # whereas a[::1] gives slice(None,None,None)
       if s.start is not None and s.start is not sys.maxsize:
+        _check_index(s.start)
         begin.append(s.start)
       else:
         begin.append(0)
         begin_mask |= (1 << index)
       if s.stop is not None and s.stop != sys.maxsize:
+        _check_index(s.stop)
         end.append(s.stop)
       else:
         end.append(0)
         end_mask |= (1 << index)
       if s.step is not None:
+        _check_index(s.step)
         strides.append(s.step)
       else:
         strides.append(1)
@@ -487,6 +614,7 @@ def _slice_helper(tensor, slice_spec, var=None):
       strides.append(1)
       new_axis_mask |= (1 << index)
     else:
+      _check_index(s)
       begin.append(s)
       end.append(s + 1)
       strides.append(1)
@@ -756,7 +884,8 @@ def _SliceHelperVar(var, slice_spec):
 
   Raises:
     ValueError: If a slice range is negative size.
-    TypeError: If the slice indices aren't int, slice, or Ellipsis.
+    TypeError: TypeError: If the slice indices aren't int, slice,
+      ellipsis, tf.newaxis or int32/int64 tensors.
 
   """
 
@@ -818,6 +947,7 @@ def parallel_stack(values, name="parallel_stack"):
 
 
 @tf_export("stack")
+@dispatch.add_dispatch_support
 def stack(values, axis=0, name="stack"):
   """Stacks a list of rank-`R` tensors into one rank-`(R+1)` tensor.
 
@@ -1021,13 +1151,14 @@ def unstack(value, num=None, axis=0, name="unstack"):
       if axis < -value_shape.ndims or axis >= value_shape.ndims:
         raise ValueError("axis = %d not in [%d, %d)" %
                          (axis, -value_shape.ndims, value_shape.ndims))
-      num = value_shape[axis].value
+      num = value_shape.dims[axis].value
   if num is None:
     raise ValueError("Cannot infer num from shape %s" % value_shape)
   return gen_array_ops.unpack(value, num=num, axis=axis, name=name)
 
 
 @tf_export("concat")
+@dispatch.add_dispatch_support
 def concat(values, axis, name="concat"):
   """Concatenates tensors along one dimension.
 
@@ -1124,7 +1255,7 @@ def concat(values, axis, name="concat"):
   return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
 
 
-@tf_export("boolean_mask")
+@tf_export(v1=["boolean_mask"])
 def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
   """Apply boolean mask to tensor.  Numpy equivalent is `tensor[mask]`.
 
@@ -1204,7 +1335,56 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
     return _apply_mask_1d(tensor, mask, axis)
 
 
-@tf_export("sparse.mask", "sparse_mask")
+@tf_export("boolean_mask", v1=[])
+@dispatch.add_dispatch_support
+def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"):
+  """Apply boolean mask to tensor.
+
+  Numpy equivalent is `tensor[mask]`.
+
+  ```python
+  # 1-D example
+  tensor = [0, 1, 2, 3]
+  mask = np.array([True, False, True, False])
+  boolean_mask(tensor, mask)  # [0, 2]
+  ```
+
+  In general, `0 < dim(mask) = K <= dim(tensor)`, and `mask`'s shape must match
+  the first K dimensions of `tensor`'s shape.  We then have:
+    `boolean_mask(tensor, mask)[i, j1,...,jd] = tensor[i1,...,iK,j1,...,jd]`
+  where `(i1,...,iK)` is the ith `True` entry of `mask` (row-major order).
+  The `axis` could be used with `mask` to indicate the axis to mask from.
+  In that case, `axis + dim(mask) <= dim(tensor)` and `mask`'s shape must match
+  the first `axis + dim(mask)` dimensions of `tensor`'s shape.
+
+  Args:
+    tensor:  N-D tensor.
+    mask:  K-D boolean tensor, K <= N and K must be known statically.
+    axis:  A 0-D int Tensor representing the axis in `tensor` to mask from. By
+      default, axis is 0 which will mask from the first dimension. Otherwise K +
+      axis <= N.
+    name:  A name for this operation (optional).
+
+  Returns:
+    (N-K+1)-dimensional tensor populated by entries in `tensor` corresponding
+    to `True` values in `mask`.
+
+  Raises:
+    ValueError:  If shapes do not conform.
+
+  Examples:
+
+  ```python
+  # 2-D example
+  tensor = [[1, 2], [3, 4], [5, 6]]
+  mask = np.array([True, False, True])
+  boolean_mask(tensor, mask)  # [[1, 2], [5, 6]]
+  ```
+  """
+  return boolean_mask(tensor, mask, name, axis)
+
+
+@tf_export("sparse.mask", v1=["sparse.mask", "sparse_mask"])
 @deprecation.deprecated_endpoints("sparse_mask")
 def sparse_mask(a, mask_indices, name=None):
   """Masks elements of `IndexedSlices`.
@@ -1337,7 +1517,75 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
       value=value, size_splits=size_splits, axis=axis, num_split=num, name=name)
 
 
-@tf_export("transpose")
+@tf_export("transpose", v1=[])
+def transpose_v2(a, perm=None, conjugate=False, name="transpose"):
+  """Transposes `a`. Permutes the dimensions according to `perm`.
+
+  The returned tensor's dimension i will correspond to the input dimension
+  `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is
+  the rank of the input tensor. Hence by default, this operation performs a
+  regular matrix transpose on 2-D input Tensors. If conjugate is True and
+  `a.dtype` is either `complex64` or `complex128` then the values of `a`
+  are conjugated and transposed.
+
+  @compatibility(numpy)
+  In `numpy` transposes are memory-efficient constant time operations as they
+  simply return a new view of the same data with adjusted `strides`.
+
+  TensorFlow does not support strides, so `transpose` returns a new tensor with
+  the items permuted.
+  @end_compatibility
+
+  For example:
+
+  ```python
+  x = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.transpose(x)  # [[1, 4]
+                   #  [2, 5]
+                   #  [3, 6]]
+
+  # Equivalently
+  tf.transpose(x, perm=[1, 0])  # [[1, 4]
+                                #  [2, 5]
+                                #  [3, 6]]
+
+  # If x is complex, setting conjugate=True gives the conjugate transpose
+  x = tf.constant([[1 + 1j, 2 + 2j, 3 + 3j],
+                   [4 + 4j, 5 + 5j, 6 + 6j]])
+  tf.transpose(x, conjugate=True)  # [[1 - 1j, 4 - 4j],
+                                   #  [2 - 2j, 5 - 5j],
+                                   #  [3 - 3j, 6 - 6j]]
+
+  # 'perm' is more useful for n-dimensional tensors, for n > 2
+  x = tf.constant([[[ 1,  2,  3],
+                    [ 4,  5,  6]],
+                   [[ 7,  8,  9],
+                    [10, 11, 12]]])
+
+  # Take the transpose of the matrices in dimension-0
+  # (this common operation has a shorthand `linalg.transpose`)
+  tf.transpose(x, perm=[0, 2, 1])  # [[[1,  4],
+                                   #   [2,  5],
+                                   #   [3,  6]],
+                                   #  [[7, 10],
+                                   #   [8, 11],
+                                   #   [9, 12]]]
+  ```
+
+  Args:
+    a: A `Tensor`.
+    perm: A permutation of the dimensions of `a`.
+    conjugate: Optional bool. Setting it to `True` is mathematically equivalent
+      to tf.conj(tf.transpose(input)).
+    name: A name for the operation (optional).
+
+  Returns:
+    A transposed `Tensor`.
+  """
+  return transpose(a=a, perm=perm, name=name, conjugate=conjugate)
+
+
+@tf_export(v1=["transpose"])
 def transpose(a, perm=None, name="transpose", conjugate=False):
   """Transposes `a`. Permutes the dimensions according to `perm`.
 
@@ -1407,8 +1655,13 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
         gen_array_ops.conjugate_transpose
         if (conjugate and a.dtype.is_complex) else gen_array_ops.transpose)
     if perm is None:
-      rank = gen_array_ops.rank(a)
-      perm = (rank - 1) - gen_math_ops._range(0, rank, 1)
+      a = ops.convert_to_tensor(a, name="a")
+      if not a.get_shape().ndims:
+        rank = gen_array_ops.rank(a)
+        perm = (rank - 1) - gen_math_ops._range(0, rank, 1)
+      else:
+        rank = a.get_shape().ndims
+        perm = (rank - 1) - np.arange(rank)
       ret = transpose_fn(a, perm, name=name)
       # NOTE(mrry): Setting the shape explicitly because
       #   reverse is not handled by the shape function.
@@ -1422,7 +1675,7 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
 
 
 # pylint: disable=invalid-name
-@tf_export("linalg.transpose", "matrix_transpose")
+@tf_export("linalg.transpose", v1=["linalg.transpose", "matrix_transpose"])
 @deprecation.deprecated_endpoints("matrix_transpose")
 def matrix_transpose(a, name="matrix_transpose", conjugate=False):
   """Transposes last two dimensions of tensor `a`.
@@ -1565,7 +1818,8 @@ def zeros(shape, dtype=dtypes.float32, name=None):
   return output
 
 
-@tf_export("zeros_like")
+@tf_export(v1=["zeros_like"])
+@dispatch.add_dispatch_support
 def zeros_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to zero.
 
@@ -1592,6 +1846,43 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
   Returns:
     A `Tensor` with all elements set to zero.
   """
+  return zeros_like_impl(tensor, dtype, name, optimize)
+
+
+@tf_export("zeros_like", v1=[])
+@dispatch.add_dispatch_support
+def zeros_like_v2(
+    input,  # pylint: disable=redefined-builtin
+    dtype=None,
+    name=None):
+  """Creates a tensor with all elements set to zero.
+
+  Given a single tensor (`tensor`), this operation returns a tensor of the
+  same type and shape as `tensor` with all elements set to zero. Optionally,
+  you can use `dtype` to specify a new type for the returned tensor.
+
+  For example:
+
+  ```python
+  tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.zeros_like(tensor)  # [[0, 0, 0], [0, 0, 0]]
+  ```
+
+  Args:
+    input: A `Tensor`.
+    dtype: A type for the returned `Tensor`. Must be `float16`, `float32`,
+      `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+      `complex64`, `complex128`, `bool` or `string`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` with all elements set to zero.
+  """
+  return zeros_like_impl(input, dtype, name, optimize=True)
+
+
+def zeros_like_impl(tensor, dtype, name, optimize=True):
+  """Internal implementation for the v1/v2 zeros_like API calls."""
   with ops.name_scope(name, "zeros_like", [tensor]) as name:
     tensor = ops.convert_to_tensor(tensor, name="tensor")
 
@@ -1618,7 +1909,8 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
       return gen_array_ops.zeros_like(tensor, name=name)
 
 
-@tf_export("ones_like")
+@tf_export(v1=["ones_like"])
+@dispatch.add_dispatch_support
 def ones_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to 1.
 
@@ -1645,6 +1937,43 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
   Returns:
     A `Tensor` with all elements set to 1.
   """
+  return ones_like_impl(tensor, dtype, name, optimize)
+
+
+@tf_export("ones_like", v1=[])
+@dispatch.add_dispatch_support
+def ones_like_v2(
+    input,  # pylint: disable=redefined-builtin
+    dtype=None,
+    name=None):
+  """Creates a tensor with all elements set to zero.
+
+  Given a single tensor (`tensor`), this operation returns a tensor of the
+  same type and shape as `tensor` with all elements set to zero. Optionally,
+  you can use `dtype` to specify a new type for the returned tensor.
+
+  For example:
+
+  ```python
+  tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.ones_like(tensor)  # [[1, 1, 1], [1, 1, 1]]
+  ```
+
+  Args:
+    input: A `Tensor`.
+    dtype: A type for the returned `Tensor`. Must be `float16`, `float32`,
+      `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+      `complex64`, `complex128`, `bool` or `string`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` with all elements set to zero.
+  """
+  return ones_like_impl(input, dtype, name, optimize=True)
+
+
+def ones_like_impl(tensor, dtype, name, optimize=True):
+  """Internal implementation for the v1/v2 ones_like API calls."""
   with ops.name_scope(name, "ones_like", [tensor]) as name:
     tensor = ops.convert_to_tensor(tensor, name="tensor")
     ones_shape = shape_internal(tensor, optimize=optimize)
@@ -1702,7 +2031,7 @@ def ones(shape, dtype=dtypes.float32, name=None):
   return output
 
 
-@tf_export("placeholder")
+@tf_export(v1=["placeholder"])
 def placeholder(dtype, shape=None, name=None):
   """Inserts a placeholder for a tensor that will be always fed.
 
@@ -1747,6 +2076,22 @@ def placeholder(dtype, shape=None, name=None):
   return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
 
 
+@tf_export(v1=["placeholder_with_default"])
+def placeholder_with_default(input, shape, name=None):  # pylint: disable=redefined-builtin
+  """A placeholder op that passes through `input` when its output is not fed.
+
+  Args:
+    input: A `Tensor`. The default value to produce when output is not fed.
+    shape: A `tf.TensorShape` or list of `int`s. The (possibly partial) shape
+      of the tensor.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  return gen_array_ops.placeholder_with_default(input, shape, name)
+
+
 # pylint: disable=redefined-outer-name
 def _normalize_sparse_shape(shape, name):
   """Returns a tuple of (Tensor or None, rank or None)."""
@@ -1758,7 +2103,7 @@ def _normalize_sparse_shape(shape, name):
   return (ops.convert_to_tensor(shape, dtype=dtypes.int64, name=name), rank)
 
 
-@tf_export("sparse.placeholder", "sparse_placeholder")
+@tf_export(v1=["sparse.placeholder", "sparse_placeholder"])
 @deprecation.deprecated_endpoints("sparse_placeholder")
 def sparse_placeholder(dtype, shape=None, name=None):
   """Inserts a placeholder for a sparse tensor that will be always fed.
@@ -1826,7 +2171,65 @@ def sparse_placeholder(dtype, shape=None, name=None):
 # pylint: enable=redefined-outer-name
 
 
-@tf_export("pad")
+@tf_export("pad", v1=[])
+def pad_v2(tensor, paddings, mode="CONSTANT", constant_values=0, name=None):
+  """Pads a tensor.
+
+  This operation pads a `tensor` according to the `paddings` you specify.
+  `paddings` is an integer tensor with shape `[n, 2]`, where n is the rank of
+  `tensor`. For each dimension D of `input`, `paddings[D, 0]` indicates how
+  many values to add before the contents of `tensor` in that dimension, and
+  `paddings[D, 1]` indicates how many values to add after the contents of
+  `tensor` in that dimension. If `mode` is "REFLECT" then both `paddings[D, 0]`
+  and `paddings[D, 1]` must be no greater than `tensor.dim_size(D) - 1`. If
+  `mode` is "SYMMETRIC" then both `paddings[D, 0]` and `paddings[D, 1]` must be
+  no greater than `tensor.dim_size(D)`.
+
+  The padded size of each dimension D of the output is:
+
+  `paddings[D, 0] + tensor.dim_size(D) + paddings[D, 1]`
+
+  For example:
+
+  ```python
+  t = tf.constant([[1, 2, 3], [4, 5, 6]])
+  paddings = tf.constant([[1, 1,], [2, 2]])
+  # 'constant_values' is 0.
+  # rank of 't' is 2.
+  tf.pad(t, paddings, "CONSTANT")  # [[0, 0, 0, 0, 0, 0, 0],
+                                   #  [0, 0, 1, 2, 3, 0, 0],
+                                   #  [0, 0, 4, 5, 6, 0, 0],
+                                   #  [0, 0, 0, 0, 0, 0, 0]]
+
+  tf.pad(t, paddings, "REFLECT")  # [[6, 5, 4, 5, 6, 5, 4],
+                                  #  [3, 2, 1, 2, 3, 2, 1],
+                                  #  [6, 5, 4, 5, 6, 5, 4],
+                                  #  [3, 2, 1, 2, 3, 2, 1]]
+
+  tf.pad(t, paddings, "SYMMETRIC")  # [[2, 1, 1, 2, 3, 3, 2],
+                                    #  [2, 1, 1, 2, 3, 3, 2],
+                                    #  [5, 4, 4, 5, 6, 6, 5],
+                                    #  [5, 4, 4, 5, 6, 6, 5]]
+  ```
+
+  Args:
+    tensor: A `Tensor`.
+    paddings: A `Tensor` of type `int32`.
+    mode: One of "CONSTANT", "REFLECT", or "SYMMETRIC" (case-insensitive)
+    constant_values: In "CONSTANT" mode, the scalar pad value to use. Must be
+      same type as `tensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `tensor`.
+
+  Raises:
+    ValueError: When mode is not one of "CONSTANT", "REFLECT", or "SYMMETRIC".
+  """
+  return pad(tensor, paddings, mode, name, constant_values)
+
+
+@tf_export(v1=["pad"])
 def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):  # pylint: disable=invalid-name
   """Pads a tensor.
 
@@ -2216,7 +2619,7 @@ def required_space_to_batch_paddings(input_shape,
 
     block_shape.get_shape().assert_is_fully_defined()
     block_shape.get_shape().assert_has_rank(1)
-    num_block_dims = block_shape.get_shape()[0].value
+    num_block_dims = block_shape.get_shape().dims[0].value
     if num_block_dims == 0:
       return zeros([0, 2], dtypes.int32), zeros([0, 2], dtypes.int32)
 
@@ -2253,7 +2656,7 @@ def required_space_to_batch_paddings(input_shape,
     return result_paddings, result_crops
 
 
-@tf_export("nn.space_to_batch", "space_to_batch")
+@tf_export(v1=["nn.space_to_batch", "space_to_batch"])
 @deprecation.deprecated_endpoints("space_to_batch")
 def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=redefined-builtin
   result = space_to_batch_nd(
@@ -2268,7 +2671,15 @@ def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=r
 space_to_batch.__doc__ = gen_array_ops.space_to_batch.__doc__
 
 
-@tf_export("nn.space_to_depth", "space_to_depth")
+@tf_export("space_to_batch", "nn.space_to_batch", v1=[])
+def space_to_batch_v2(input, block_shape, paddings, name=None):  # pylint: disable=redefined-builtin
+  return space_to_batch_nd(input, block_shape, paddings, name)
+
+
+space_to_batch_v2.__doc__ = gen_array_ops.space_to_batch_nd.__doc__
+
+
+@tf_export(v1=["nn.space_to_depth", "space_to_depth"])
 @deprecation.deprecated_endpoints("space_to_depth")
 def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.space_to_depth(input, block_size, data_format, name=name)
@@ -2277,7 +2688,15 @@ def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint:
 space_to_depth.__doc__ = gen_array_ops.space_to_depth.__doc__
 
 
-@tf_export("nn.depth_to_space", "depth_to_space")
+@tf_export("nn.space_to_depth", v1=[])
+def space_to_depth_v2(input, block_size, data_format="NHWC", name=None):  # pylint: disable=redefined-builtin
+  return gen_array_ops.space_to_depth(input, block_size, data_format, name=name)
+
+
+space_to_depth_v2.__doc__ = gen_array_ops.space_to_depth.__doc__
+
+
+@tf_export(v1=["nn.depth_to_space", "depth_to_space"])
 @deprecation.deprecated_endpoints("depth_to_space")
 def depth_to_space(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.depth_to_space(input, block_size, data_format, name=name)
@@ -2286,7 +2705,15 @@ def depth_to_space(input, block_size, name=None, data_format="NHWC"):  # pylint:
 depth_to_space.__doc__ = gen_array_ops.depth_to_space.__doc__
 
 
-@tf_export("batch_to_space")
+@tf_export("nn.depth_to_space", v1=[])
+def depth_to_space_v2(input, block_size, data_format="NHWC", name=None):  # pylint: disable=redefined-builtin
+  return gen_array_ops.depth_to_space(input, block_size, data_format, name=name)
+
+
+depth_to_space_v2.__doc__ = gen_array_ops.depth_to_space.__doc__
+
+
+@tf_export(v1=["batch_to_space"])
 def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=redefined-builtin
   result = batch_to_space_nd(
       input,
@@ -2300,6 +2727,151 @@ def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=rede
 batch_to_space.__doc__ = gen_array_ops.batch_to_space.__doc__
 
 
+@tf_export("batch_to_space", v1=[])
+def batch_to_space_v2(input, block_shape, crops, name=None):  # pylint: disable=redefined-builtin
+  """BatchToSpace for N-D tensors of type T.
+
+  This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of
+  shape `block_shape + [batch]`, interleaves these blocks back into the grid
+  defined by the spatial dimensions `[1, ..., M]`, to obtain a result with the
+  same rank as the input.  The spatial dimensions of this intermediate result
+  are then optionally cropped according to `crops` to produce the output.  This
+  is the reverse of SpaceToBatch.  See below for a precise description.
+
+  Args:
+    input: A `Tensor`.
+      N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+      where spatial_shape has M dimensions.
+    block_shape: A `Tensor`. Must be one of the following types:
+      `int32`, `int64`. 1-D with shape `[M]`, all values must be >= 1.
+      For backwards compatibility with TF 1.0, this parameter may be an int, in
+      which case it is converted to
+      `numpy.array([block_shape, block_shape], dtype=numpy.int64)`.
+    crops: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      2-D with shape `[M, 2]`, all values must be >= 0.
+        `crops[i] = [crop_start, crop_end]` specifies the amount to crop from
+        input dimension `i + 1`, which corresponds to spatial dimension `i`.  It
+        is required that
+        `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+
+      This operation is equivalent to the following steps:
+
+      1. Reshape `input` to `reshaped` of shape:
+           [block_shape[0], ..., block_shape[M-1],
+            batch / prod(block_shape),
+            input_shape[1], ..., input_shape[N-1]]
+
+      2. Permute dimensions of `reshaped` to produce `permuted` of shape
+           [batch / prod(block_shape),
+
+            input_shape[1], block_shape[0],
+            ...,
+            input_shape[M], block_shape[M-1],
+
+            input_shape[M+1], ..., input_shape[N-1]]
+
+      3. Reshape `permuted` to produce `reshaped_permuted` of shape
+           [batch / prod(block_shape),
+
+            input_shape[1] * block_shape[0],
+            ...,
+            input_shape[M] * block_shape[M-1],
+
+            input_shape[M+1],
+            ...,
+            input_shape[N-1]]
+
+      4. Crop the start and end of dimensions `[1, ..., M]` of
+         `reshaped_permuted` according to `crops` to produce the
+         output of shape:
+           [batch / prod(block_shape),
+
+            input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+            ...,
+            input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+
+            input_shape[M+1], ..., input_shape[N-1]]
+
+      Some examples:
+
+      (1) For the following input of shape `[4, 1, 1, 1]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:
+
+      ```
+      [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+      ```
+
+      The output tensor has shape `[1, 2, 2, 1]` and value:
+
+      ```
+      x = [[[[1], [2]], [[3], [4]]]]
+      ```
+
+      (2) For the following input of shape `[4, 1, 1, 3]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:
+
+      ```
+      [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+      ```
+
+      The output tensor has shape `[1, 2, 2, 3]` and value:
+
+      ```
+      x = [[[[1, 2, 3], [4, 5, 6]],
+            [[7, 8, 9], [10, 11, 12]]]]
+      ```
+
+      (3) For the following input of shape `[4, 2, 2, 1]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:
+
+      ```
+      x = [[[[1], [3]], [[9], [11]]],
+           [[[2], [4]], [[10], [12]]],
+           [[[5], [7]], [[13], [15]]],
+           [[[6], [8]], [[14], [16]]]]
+      ```
+
+      The output tensor has shape `[1, 4, 4, 1]` and value:
+
+      ```
+      x = [[[1],   [2],  [3],  [4]],
+           [[5],   [6],  [7],  [8]],
+           [[9],  [10], [11],  [12]],
+           [[13], [14], [15],  [16]]]
+      ```
+
+      (4) For the following input of shape `[8, 1, 3, 1]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [2, 0]]`:
+
+      ```
+      x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+           [[[0], [2], [4]]], [[[0], [10], [12]]],
+           [[[0], [5], [7]]], [[[0], [13], [15]]],
+           [[[0], [6], [8]]], [[[0], [14], [16]]]]
+      ```
+
+      The output tensor has shape `[2, 2, 4, 1]` and value:
+
+      ```
+      x = [[[[1],   [2],  [3],  [4]],
+            [[5],   [6],  [7],  [8]]],
+           [[[9],  [10], [11],  [12]],
+            [[13], [14], [15],  [16]]]]
+      ```
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  if isinstance(block_shape, int):
+    block_shape = np.array([block_shape, block_shape], dtype=np.int64)
+
+  return batch_to_space_nd(input=input,
+                           block_shape=block_shape,
+                           crops=crops,
+                           name=name)
+
+
 @tf_export("one_hot")
 def one_hot(indices,
             depth,
@@ -2453,7 +3025,7 @@ def _all_dimensions(x):
         np.arange(x.get_shape().ndims), dtype=dtypes.int32)
   if (isinstance(x, sparse_tensor.SparseTensor) and
       x.dense_shape.get_shape().is_fully_defined()):
-    r = x.dense_shape.get_shape()[0].value  # sparse.dense_shape is 1-D.
+    r = x.dense_shape.get_shape().dims[0].value  # sparse.dense_shape is 1-D.
     return constant_op.constant(np.arange(r), dtype=dtypes.int32)
 
   # Otherwise, we rely on `range` and `rank` to do the right thing at runtime.
@@ -2523,7 +3095,7 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
       return gen_math_ops.cast(result, dtype)
 
 
-@tf_export("squeeze")
+@tf_export(v1=["squeeze"])
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "squeeze_dims")
 def squeeze(input, axis=None, name=None, squeeze_dims=None):
@@ -2573,7 +3145,14 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None):
   return gen_array_ops.squeeze(input, axis, name)
 
 
+@tf_export("squeeze", v1=[])
+def squeeze_v2(input, axis=None, name=None):
+  # pylint: disable=redefined-builtin
+  return squeeze(input, axis, name)
+
+
 @tf_export("where")
+@dispatch.add_dispatch_support
 def where(condition, x=None, y=None, name=None):
   """Return the elements, either from `x` or `y`, depending on the `condition`.
 
@@ -2627,7 +3206,7 @@ def where(condition, x=None, y=None, name=None):
 
 
 # pylint: disable=redefined-builtin
-@tf_export("reverse_sequence")
+@tf_export(v1=["reverse_sequence"])
 @deprecation.deprecated_args(
     None, "seq_dim is deprecated, use seq_axis instead", "seq_dim")
 @deprecation.deprecated_args(
@@ -2651,15 +3230,32 @@ def reverse_sequence(input,
       name=name)
 
 
-# pylint: enable=redefined-builtin
-
 reverse_sequence.__doc__ = deprecation.rewrite_argument_docstring(
     deprecation.rewrite_argument_docstring(
         gen_array_ops.reverse_sequence.__doc__, "batch_dim", "batch_axis"),
     "seq_dim", "seq_axis")
 
 
-@tf_export("gather")
+@tf_export("reverse_sequence", v1=[])
+def reverse_sequence_v2(
+    input, seq_lengths, seq_axis=None, batch_axis=None, name=None):
+  return gen_array_ops.reverse_sequence(
+      input=input,
+      seq_lengths=seq_lengths,
+      seq_dim=seq_axis,
+      batch_dim=batch_axis,
+      name=name)
+
+
+reverse_sequence_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        gen_array_ops.reverse_sequence.__doc__, "batch_dim", "batch_axis"),
+    "seq_dim", "seq_axis")
+
+# pylint: enable=redefined-builtin
+
+
+@tf_export(v1=["gather"])
 def gather(params, indices, validate_indices=None, name=None, axis=0):
   del validate_indices
   if axis != 0:
@@ -2675,10 +3271,19 @@ def gather(params, indices, validate_indices=None, name=None, axis=0):
     return gen_array_ops.gather_v2(params, indices, axis, name=name)
 
 
-gather.__doc__ = gen_array_ops.gather_v2.__doc__
+@tf_export("gather", v1=[])
+@dispatch.add_dispatch_support
+def gather_v2(params, indices, validate_indices=None, axis=0, name=None):
+  return gather(params, indices, validate_indices=validate_indices, name=name,
+                axis=axis)
+
+
+gather.__doc__ = gather_v2.__doc__ = gen_array_ops.gather_v2.__doc__
+
 
 
 @tf_export("batch_gather")
+@dispatch.add_dispatch_support
 def batch_gather(params, indices, name=None):
   """Gather slices from `params` according to `indices` with leading batch dims.
 
@@ -2748,7 +3353,7 @@ def batch_gather(params, indices, name=None):
     result = reshape(flat_result, concat([indices_shape, outer_shape], axis=0))
     final_shape = indices.get_shape()[:ndims-1].merge_with(
         params.get_shape()[:ndims -1])
-    final_shape = final_shape.concatenate(indices.get_shape()[ndims-1])
+    final_shape = final_shape.concatenate(indices.get_shape().dims[ndims-1])
     final_shape = final_shape.concatenate(params.get_shape()[ndims:])
     result.set_shape(final_shape)
     return result
@@ -2756,7 +3361,7 @@ def batch_gather(params, indices, name=None):
 
 # Define quantize_v2 here in order to make name the second-to-last attribute,
 # because round_mode was added later.
-@tf_export("quantize_v2")
+@tf_export(v1=["quantize_v2"])
 @deprecation.deprecated(
     "2017-10-25",
     "`tf.quantize_v2` is deprecated, please use `tf.quantization.quantize` "
@@ -2777,12 +3382,12 @@ def quantize_v2(input,  # pylint: disable=redefined-builtin
                                    round_mode=round_mode)
 
 
-quantize_v2.__doc__ = """Please use `tf.quantize` instead."""
+quantize_v2.__doc__ = """Please use `tf.quantization.quantize` instead."""
 
 
 # We want to expose tf.quantize instead of tf.quantize_v2; we can deprecate
 # tf.quantize_v2 in next version of TensorFlow.
-@tf_export("quantization.quantize", "quantize")
+@tf_export("quantization.quantize", v1=["quantization.quantize", "quantize"])
 @deprecation.deprecated_endpoints("quantize")
 def quantize(input,  # pylint: disable=redefined-builtin
              min_range,
@@ -2863,3 +3468,48 @@ def searchsorted(sorted_sequence,
 
 
 quantize.__doc__ = gen_array_ops.quantize_v2.__doc__
+
+
+@tf_export("image.extract_image_patches", v1=[])
+def extract_image_patches_v2(
+    images,
+    sizes,
+    strides,
+    rates,
+    padding,
+    name=None):
+  # pylint: disable=line-too-long
+  r"""Extract `patches` from `images` and put them in the \"depth\" output dimension.
+
+  Args:
+    images: A 4-D Tensor with shape `[batch, in_rows, in_cols, depth]
+    sizes: The size of the sliding window for each dimension of `images`.
+    strides: A 1-D Tensor of length 4. How far the centers of two consecutive
+      patches are in the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+    rates: A 1-D Tensor of length 4. Must be: `[1, rate_rows, rate_cols, 1]`.
+      This is the input stride, specifying how far two consecutive patch samples
+      are in the input. Equivalent to extracting patches with `patch_sizes_eff =
+      patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by subsampling
+      them spatially by a factor of `rates`. This is equivalent to `rate` in
+      dilated (a.k.a. Atrous) convolutions.
+    padding: The type of padding algorithm to use.
+      We specify the size-related attributes as: ```python ksizes = [1,
+        ksize_rows, ksize_cols, 1] strides = [1, strides_rows, strides_cols, 1]
+        rates = [1, rates_rows, rates_cols, 1]
+    name: A name for the operation (optional).
+
+  Returns:
+    A 4-D Tensor. Has the same type as `images`, and with shape `[batch,
+    out_rows, out_cols, ksize_rows * ksize_cols * depth]` containing image
+    patches with size `ksize_rows x ksize_cols x depth` vectorized in the
+    \"depth\" dimension. Note `out_rows` and `out_cols` are the dimensions of
+    the output patches.
+  """
+  # pylint: enable=line-too-long
+  return gen_array_ops.extract_image_patches(
+      images, sizes, strides, rates, padding, name)
+
+extract_image_patches_deprecation = deprecation.deprecated_args(
+    None, "ksizes is deprecated, use sizes instead", "ksizes")
+tf_export(v1=["image.extract_image_patches", "extract_image_patches"])(
+    extract_image_patches_deprecation(gen_array_ops.extract_image_patches))
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index c4cfc0da197edcfd143cfee79fd3c3f9b7a2858b..d154b6759bfbc50ad2e5ea34e4f04b945ef2d397 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -34,11 +34,12 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
   def __init__(self, method_name="runTest"):
     super(BitwiseOpTest, self).__init__(method_name)
 
+  @test_util.run_deprecated_v1
   def testBinaryOps(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for dtype in dtype_list:
         lhs = constant_op.constant([0, 5, 3, 14], dtype=dtype)
         rhs = constant_op.constant([5, 0, 7, 11], dtype=dtype)
@@ -59,21 +60,23 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
                   2**31 - 1, 2**31, 2**32 - 1, 2**32, -2**32 + 1, -2**32,
                   -2**63 + 1, 2**63 - 1]
     def count_bits(x):
-      return sum([bin(z).count("1") for z in six.iterbytes(x.tobytes())])
+      return sum(bin(z).count("1") for z in six.iterbytes(x.tobytes()))
     for dtype in dtype_list:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         print("PopulationCount test: ", dtype)
         inputs = np.array(raw_inputs, dtype=dtype.as_numpy_dtype)
         truth = [count_bits(x) for x in inputs]
         input_tensor = constant_op.constant(inputs, dtype=dtype)
-        popcnt_result = sess.run(gen_bitwise_ops.population_count(input_tensor))
+        popcnt_result = self.evaluate(
+            gen_bitwise_ops.population_count(input_tensor))
         self.assertAllEqual(truth, popcnt_result)
 
+  @test_util.run_deprecated_v1
   def testInvertOp(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
     inputs = [0, 5, 3, 14]
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for dtype in dtype_list:
         # Because of issues with negative numbers, let's test this indirectly.
         # 1. invert(a) and a = 0
@@ -89,15 +92,16 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(not_a_or_a, [not_0] * 4)
         # For unsigned dtypes let's also check the result directly.
         if dtype.is_unsigned:
-          inverted = sess.run(bitwise_ops.invert(input_tensor))
+          inverted = self.evaluate(bitwise_ops.invert(input_tensor))
           expected = [dtype.max - x for x in inputs]
           self.assertAllEqual(inverted, expected)
 
+  @test_util.run_deprecated_v1
   def testShiftsWithPositiveLHS(self):
     dtype_list = [np.int8, np.int16, np.int32, np.int64,
                   np.uint8, np.uint16, np.uint32, np.uint64]
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for dtype in dtype_list:
         lhs = np.array([0, 5, 3, 14], dtype=dtype)
         rhs = np.array([5, 0, 7, 3], dtype=dtype)
@@ -107,10 +111,11 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(left_shift_result, np.left_shift(lhs, rhs))
         self.assertAllEqual(right_shift_result, np.right_shift(lhs, rhs))
 
+  @test_util.run_deprecated_v1
   def testShiftsWithNegativeLHS(self):
     dtype_list = [np.int8, np.int16, np.int32, np.int64]
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for dtype in dtype_list:
         lhs = np.array([-1, -5, -3, -14], dtype=dtype)
         rhs = np.array([5, 0, 7, 11], dtype=dtype)
@@ -120,10 +125,11 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(left_shift_result, np.left_shift(lhs, rhs))
         self.assertAllEqual(right_shift_result, np.right_shift(lhs, rhs))
 
+  @test_util.run_deprecated_v1
   def testImplementationDefinedShiftsDoNotCrash(self):
     dtype_list = [np.int8, np.int16, np.int32, np.int64]
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for dtype in dtype_list:
         lhs = np.array([-1, -5, -3, -14], dtype=dtype)
         rhs = np.array([-2, 64, 101, 32], dtype=dtype)
@@ -135,11 +141,12 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
                   bitwise_ops.right_shift(lhs, rhs)])
 
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16]
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       for dtype in dtype_list:
         lhs = constant_op.constant([[0], [3], [5]], dtype=dtype)
         rhs = constant_op.constant([[1, 2, 4]], dtype=dtype)
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 720f9f4d41e4cc627752be751a0c5b377b404523..37d649acf00c6905ae7330169321e5a5f8f487be 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -33,13 +33,17 @@ from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_make_quant
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_make_stats_summary as make_stats_summary
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_predict as predict
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_add_summaries as quantile_add_summaries
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_deserialize as quantile_resource_deserialize
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_flush as quantile_flush
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_get_bucket_boundaries as get_bucket_boundaries
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_handle_op as quantile_resource_handle_op
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_training_predict as training_predict
 from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_update_ensemble as update_ensemble
+from tensorflow.python.ops.gen_boosted_trees_ops import is_boosted_trees_quantile_stream_resource_initialized as is_quantile_resource_initialized
 # pylint: enable=unused-import
 
 from tensorflow.python.training import saver
+from tensorflow.python.training.checkpointable import tracking
 
 
 class PruningMode(object):
@@ -57,6 +61,69 @@ class PruningMode(object):
           sorted(cls._map))))
 
 
+class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for QuantileAccumulator.
+
+     The bucket boundaries are serialized and deserialized from checkpointing.
+  """
+
+  def __init__(self,
+               epsilon,
+               num_streams,
+               num_quantiles,
+               name=None,
+               max_elements=None):
+    with ops.name_scope(name, 'QuantileAccumulator') as name:
+      self._eps = epsilon
+      self._num_streams = num_streams
+      self._num_quantiles = num_quantiles
+      self._resource_handle = quantile_resource_handle_op(
+          container='', shared_name=name, name=name)
+      self._create_op = create_quantile_stream_resource(self._resource_handle,
+                                                        epsilon, num_streams)
+      is_initialized_op = is_quantile_resource_initialized(
+          self._resource_handle)
+      resources.register_resource(self._resource_handle, self._create_op,
+                                  is_initialized_op)
+      self._make_saveable(name)
+
+  def _make_saveable(self, name):
+    bucket_boundaries = get_bucket_boundaries(self._resource_handle,
+                                              self._num_streams)
+    slice_spec = ''
+    specs = []
+    for i in range(self._num_streams):
+      specs.append(
+          saver.BaseSaverBuilder.SaveSpec(
+              bucket_boundaries[i], slice_spec,
+              name + '_bucket_boundaries_' + str(i)))
+    super(QuantileAccumulator, self).__init__(self._resource_handle, specs,
+                                              name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+
+  def restore(self, restored_tensors, unused_tensor_shapes):
+    bucket_boundaries = restored_tensors
+    with ops.control_dependencies([self._create_op]):
+      return quantile_resource_deserialize(
+          self._resource_handle, bucket_boundaries=bucket_boundaries)
+
+  def add_summaries(self, float_columns, example_weights):
+    summaries = make_quantile_summaries(float_columns, example_weights,
+                                        self._eps)
+    summary_op = quantile_add_summaries(self._resource_handle, summaries)
+    return summary_op
+
+  def flush(self):
+    return quantile_flush(self._resource_handle, self._num_quantiles)
+
+  def get_bucket_boundaries(self):
+    return get_bucket_boundaries(self._resource_handle, self._num_streams)
+
+  @property
+  def resource(self):
+    return self._resource_handle
+
+
 class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
   """SaveableObject implementation for TreeEnsemble."""
 
@@ -102,35 +169,52 @@ class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
           tree_ensemble_serialized=restored_tensors[1])
 
 
-class TreeEnsemble(object):
+class TreeEnsemble(tracking.TrackableResource):
   """Creates TreeEnsemble resource."""
 
   def __init__(self, name, stamp_token=0, is_local=False, serialized_proto=''):
+    self._stamp_token = stamp_token
+    self._serialized_proto = serialized_proto
+    self._is_local = is_local
     with ops.name_scope(name, 'TreeEnsemble') as name:
-      self._resource_handle = (
-          gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op(
-              container='', shared_name=name, name=name))
-      create_op = gen_boosted_trees_ops.boosted_trees_create_ensemble(
-          self.resource_handle,
-          stamp_token,
-          tree_ensemble_serialized=serialized_proto)
-      is_initialized_op = (
-          gen_boosted_trees_ops.is_boosted_trees_ensemble_initialized(
-              self._resource_handle))
+      self._name = name
+      self._resource_handle = self.create_resource()
+      self._init_op = self.initialize()
+      is_initialized_op = self.is_initialized()
       # Adds the variable to the savable list.
       if not is_local:
-        saveable = _TreeEnsembleSavable(self.resource_handle, create_op,
-                                        self.resource_handle.name)
-        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+        self._saveable = _TreeEnsembleSavable(
+            self.resource_handle, self.initializer, self.resource_handle.name)
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
       resources.register_resource(
           self.resource_handle,
-          create_op,
+          self.initializer,
           is_initialized_op,
           is_shared=not is_local)
 
+  def create_resource(self):
+    return gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op(
+        container='', shared_name=self._name, name=self._name)
+
+  def initialize(self):
+    return gen_boosted_trees_ops.boosted_trees_create_ensemble(
+        self.resource_handle,
+        self._stamp_token,
+        tree_ensemble_serialized=self._serialized_proto)
+
   @property
-  def resource_handle(self):
-    return self._resource_handle
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return gen_boosted_trees_ops.is_boosted_trees_ensemble_initialized(
+        self.resource_handle)
+
+  def _gather_saveables_for_checkpoint(self):
+    if not self._is_local:
+      return {'tree_ensemble': self._saveable}
 
   def get_stamp_token(self):
     """Returns the current stamp token of the resource."""
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index 98dde995c9b898710d5d215354893d13672b002e..56f76a49d51bec99d35593041f3e72c2fcb580a4 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -27,7 +27,9 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('random.uniform_candidate_sampler', 'nn.uniform_candidate_sampler')
+@tf_export(
+    'random.uniform_candidate_sampler',
+    v1=['random.uniform_candidate_sampler', 'nn.uniform_candidate_sampler'])
 @deprecation.deprecated_endpoints('nn.uniform_candidate_sampler')
 def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
                               range_max, seed=None, name=None):
@@ -84,8 +86,12 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
       seed2=seed2, name=name)
 
 
-@tf_export('random.log_uniform_candidate_sampler',
-           'nn.log_uniform_candidate_sampler')
+@tf_export(
+    'random.log_uniform_candidate_sampler',
+    v1=[
+        'random.log_uniform_candidate_sampler',
+        'nn.log_uniform_candidate_sampler'
+    ])
 @deprecation.deprecated_endpoints('nn.log_uniform_candidate_sampler')
 def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
                                   range_max, seed=None, name=None):
@@ -145,7 +151,10 @@ def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
       seed2=seed2, name=name)
 
 
-@tf_export('nn.learned_unigram_candidate_sampler')
+@tf_export(
+    'random.learned_unigram_candidate_sampler',
+    'nn.learned_unigram_candidate_sampler')
+@deprecation.deprecated_endpoints(['nn.learned_unigram_candidate_sampler'])
 def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
                                       unique, range_max, seed=None, name=None):
   """Samples a set of classes from a distribution learned during training.
@@ -202,7 +211,8 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
       seed2=seed2, name=name)
 
 
-@tf_export('nn.fixed_unigram_candidate_sampler')
+@tf_export('random.fixed_unigram_candidate_sampler',
+           'nn.fixed_unigram_candidate_sampler')
 def fixed_unigram_candidate_sampler(true_classes,
                                     num_true,
                                     num_sampled,
@@ -294,7 +304,7 @@ def fixed_unigram_candidate_sampler(true_classes,
       unigrams=unigrams, seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export('nn.all_candidate_sampler')
+@tf_export('random.all_candidate_sampler', 'nn.all_candidate_sampler')
 def all_candidate_sampler(true_classes, num_true, num_sampled, unique,
                           seed=None, name=None):
   """Generate the set of all classes.
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index d607f1d9fb7f30dc299c192c006c2dd9c393143e..f1f36269cf2bd9bcd3d25638a82d776850bc6bb8 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -13,11 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=g-short-docstring-punctuation
-"""Asserts and Boolean Checks.
-
-See the [Asserts and
-checks](https://tensorflow.org/api_guides/python/check_ops) guide.
-"""
+"""Asserts and Boolean Checks."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -92,7 +88,9 @@ def _shape_and_dtype_str(tensor):
   return 'shape=%s dtype=%s' % (tensor.shape, tensor.dtype.name)
 
 
-@tf_export('debugging.assert_proper_iterable', 'assert_proper_iterable')
+@tf_export(
+    'debugging.assert_proper_iterable',
+    v1=['debugging.assert_proper_iterable', 'assert_proper_iterable'])
 @deprecation.deprecated_endpoints('assert_proper_iterable')
 def assert_proper_iterable(values):
   """Static assert that values is a "proper" iterable.
@@ -121,7 +119,31 @@ def assert_proper_iterable(values):
         'Expected argument "values" to be iterable.  Found: %s' % type(values))
 
 
-@tf_export('debugging.assert_negative', 'assert_negative')
+@tf_export('debugging.assert_negative', v1=[])
+def assert_negative_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x < 0` holds element-wise.
+
+  This Op checks that `x[i] < 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not negative everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_negative".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] < 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_negative(x=x, message=message, summarize=summarize, name=name)
+
+
+@tf_export(v1=['debugging.assert_negative', 'assert_negative'])
 @deprecation.deprecated_endpoints('assert_negative')
 def assert_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < 0` holds element-wise.
@@ -163,7 +185,31 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less(x, zero, data=data, summarize=summarize)
 
 
-@tf_export('debugging.assert_positive', 'assert_positive')
+@tf_export('debugging.assert_positive', v1=[])
+def assert_positive_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x > 0` holds element-wise.
+
+  This Op checks that `x[i] > 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not positive everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional). Defaults to "assert_positive".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] > 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_positive(x=x, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_positive', 'assert_positive'])
 @deprecation.deprecated_endpoints('assert_positive')
 def assert_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > 0` holds element-wise.
@@ -204,7 +250,32 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less(zero, x, data=data, summarize=summarize)
 
 
-@tf_export('debugging.assert_non_negative', 'assert_non_negative')
+@tf_export('debugging.assert_non_negative', v1=[])
+def assert_non_negative_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x >= 0` holds element-wise.
+
+  This Op checks that `x[i] >= 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not >= 0 everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to
+      "assert_non_negative".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] >= 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_non_negative(x=x, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_non_negative', 'assert_non_negative'])
 @deprecation.deprecated_endpoints('assert_non_negative')
 def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x >= 0` holds element-wise.
@@ -247,7 +318,32 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(zero, x, data=data, summarize=summarize)
 
 
-@tf_export('debugging.assert_non_positive', 'assert_non_positive')
+@tf_export('debugging.assert_non_positive', v1=[])
+def assert_non_positive_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x <= 0` holds element-wise.
+
+  This Op checks that `x[i] <= 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not <= 0 everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to
+      "assert_non_positive".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] <= 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_non_positive(x=x, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_non_positive', 'assert_non_positive'])
 @deprecation.deprecated_endpoints('assert_non_positive')
 def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= 0` holds element-wise.
@@ -290,7 +386,33 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(x, zero, data=data, summarize=summarize)
 
 
-@tf_export('debugging.assert_equal', 'assert_equal')
+@tf_export('debugging.assert_equal', 'assert_equal', v1=[])
+def assert_equal_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x == y` holds element-wise.
+
+  This Op checks that `x[i] == y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` and `y` are not equal, `message`, as well as the first `summarize`
+  entries of `x` and `y` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x == y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_equal(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_equal', 'assert_equal'])
 def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x == y` holds element-wise.
 
@@ -390,7 +512,36 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_none_equal', 'assert_none_equal')
+@tf_export('debugging.assert_none_equal', v1=[])
+def assert_none_equal_v2(x, y, summarize=None, message=None, name=None):
+  """Assert the condition `x != y` holds for all elements.
+
+  This Op checks that `x[i] != y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If any elements of `x` and `y` are equal, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError`
+  is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).  Defaults to
+    "assert_none_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x != y` is False for any pair of elements in `x` and `y`. The check can
+      be performed immediately during eager execution or if `x` and `y` are
+      statically known.
+  """
+  assert_none_equal(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_none_equal', 'assert_none_equal'])
 @deprecation.deprecated_endpoints('assert_none_equal')
 def assert_none_equal(
     x, y, data=None, summarize=None, message=None, name=None):
@@ -442,7 +593,52 @@ def assert_none_equal(
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_near', 'assert_near')
+@tf_export('debugging.assert_near', v1=[])
+def assert_near_v2(x, y, rtol=None, atol=None, message=None, summarize=None,
+                   name=None):
+  """Assert the condition `x` and `y` are close element-wise.
+
+  This Op checks that `x[i] - y[i] < atol + rtol * tf.abs(y[i])` holds for every
+  pair of (possibly broadcast) elements of `x` and `y`. If both `x` and `y` are
+  empty, this is trivially satisfied.
+
+  If any elements of `x` and `y` are not close, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError`
+  is raised.
+
+  The default `atol` and `rtol` is `10 * eps`, where `eps` is the smallest
+  representable positive number such that `1 + eps != 1`.  This is about
+  `1.2e-6` in `32bit`, `2.22e-15` in `64bit`, and `0.00977` in `16bit`.
+  See `numpy.finfo`.
+
+  Args:
+    x: Float or complex `Tensor`.
+    y: Float or complex `Tensor`, same dtype as and broadcastable to `x`.
+    rtol:  `Tensor`.  Same `dtype` as, and broadcastable to, `x`.
+      The relative tolerance.  Default is `10 * eps`.
+    atol:  `Tensor`.  Same `dtype` as, and broadcastable to, `x`.
+      The absolute tolerance.  Default is `10 * eps`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_near".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x != y` is False for any pair of elements in `x` and `y`. The check can
+      be performed immediately during eager execution or if `x` and `y` are
+      statically known.
+
+  @compatibility(numpy)
+  Similar to `numpy.assert_allclose`, except tolerance depends on data type.
+  This is due to the fact that `TensorFlow` is often used with `32bit`, `64bit`,
+  and even `16bit` data.
+  @end_compatibility
+  """
+  assert_near(x=x, y=y, rtol=rtol, atol=atol, summarize=summarize,
+              message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_near', 'assert_near'])
 @deprecation.deprecated_endpoints('assert_near')
 def assert_near(
     x, y, rtol=None, atol=None, data=None, summarize=None, message=None,
@@ -464,7 +660,7 @@ def assert_near(
   If both `x` and `y` are empty, this is trivially satisfied.
 
   The default `atol` and `rtol` is `10 * eps`, where `eps` is the smallest
-  representable positive number such that `1 + eps != eps`.  This is about
+  representable positive number such that `1 + eps != 1`.  This is about
   `1.2e-6` in `32bit`, `2.22e-15` in `64bit`, and `0.00977` in `16bit`.
   See `numpy.finfo`.
 
@@ -521,7 +717,34 @@ def assert_near(
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_less', 'assert_less')
+@tf_export('debugging.assert_less', 'assert_less', v1=[])
+def assert_less_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x < y` holds element-wise.
+
+  This Op checks that `x[i] < y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not less than `y` element-wise, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError` is
+  raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_less".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x < y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_less(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_less', 'assert_less'])
 def assert_less(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < y` holds element-wise.
 
@@ -569,7 +792,34 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_less_equal', 'assert_less_equal')
+@tf_export('debugging.assert_less_equal', v1=[])
+def assert_less_equal_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x <= y` holds element-wise.
+
+  This Op checks that `x[i] <= y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not less or equal than `y` element-wise, `message`, as well as the
+  first `summarize` entries of `x` and `y` are printed, and
+  `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional). Defaults to "assert_less_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x <= y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_less_equal(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_less_equal', 'assert_less_equal'])
 @deprecation.deprecated_endpoints('assert_less_equal')
 def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= y` holds element-wise.
@@ -618,7 +868,34 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_greater', 'assert_greater')
+@tf_export('debugging.assert_greater', 'assert_greater', v1=[])
+def assert_greater_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x > y` holds element-wise.
+
+  This Op checks that `x[i] > y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not greater than `y` element-wise, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError` is
+  raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_greater".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x > y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_greater(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_greater', 'assert_greater'])
 def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > y` holds element-wise.
 
@@ -666,7 +943,36 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_greater_equal', 'assert_greater_equal')
+@tf_export('debugging.assert_greater_equal', v1=[])
+def assert_greater_equal_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x >= y` holds element-wise.
+
+  This Op checks that `x[i] >= y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not greater or equal to `y` element-wise, `message`, as well as the
+  first `summarize` entries of `x` and `y` are printed, and
+  `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to
+    "assert_greater_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x >= y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_greater_equal(x=x, y=y, summarize=summarize, message=message,
+                       name=name)
+
+
+@tf_export(v1=['debugging.assert_greater_equal', 'assert_greater_equal'])
 @deprecation.deprecated_endpoints('assert_greater_equal')
 def assert_greater_equal(x, y, data=None, summarize=None, message=None,
                          name=None):
@@ -765,7 +1071,31 @@ def _assert_rank_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_rank', 'assert_rank')
+@tf_export('debugging.assert_rank', 'assert_rank', v1=[])
+def assert_rank_v2(x, rank, message=None, name=None):
+  """Assert that `x` has rank equal to `rank`.
+
+  This Op checks that the rank of `x` is equal to `rank`.
+
+  If `x` has a different rank, `message`, as well as the shape of `x` are
+  printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: `Tensor`.
+    rank: Scalar integer `Tensor`.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional). Defaults to
+      "assert_rank".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x` does not have rank `rank`. The check can be performed immediately
+      during eager execution or if the shape of `x` is statically known.
+  """
+  assert_rank(x=x, rank=rank, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_rank', 'assert_rank'])
 def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   """Assert `x` has rank equal to `rank`.
 
@@ -780,7 +1110,7 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
     x:  Numeric `Tensor`.
     rank:  Scalar integer `Tensor`.
     data:  The tensors to print out if the condition is False.  Defaults to
-      error message and first few entries of `x`.
+      error message and the shape of `x`.
     summarize: Print this many entries of each tensor.
     message: A string to prefix to the default message.
     name: A name for this operation (optional).  Defaults to "assert_rank".
@@ -827,7 +1157,31 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   return assert_op
 
 
-@tf_export('debugging.assert_rank_at_least', 'assert_rank_at_least')
+@tf_export('debugging.assert_rank_at_least', v1=[])
+def assert_rank_at_least_v2(x, rank, message=None, name=None):
+  """Assert that `x` has rank of at least `rank`.
+
+  This Op checks that the rank of `x` is greater or equal to `rank`.
+
+  If `x` has a rank lower than `rank`, `message`, as well as the shape of `x`
+  are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: `Tensor`.
+    rank: Scalar integer `Tensor`.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).  Defaults to
+      "assert_rank_at_least".
+
+  Raises:
+    InvalidArgumentError: `x` does not have rank at least `rank`, but the rank
+      cannot be statically determined.
+    ValueError: If static checks determine `x` has mismatched rank.
+  """
+  assert_rank_at_least(x=x, rank=rank, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_rank_at_least', 'assert_rank_at_least'])
 @deprecation.deprecated_endpoints('assert_rank_at_least')
 def assert_rank_at_least(
     x, rank, data=None, summarize=None, message=None, name=None):
@@ -959,7 +1313,30 @@ def _assert_ranks_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_rank_in', 'assert_rank_in')
+@tf_export('debugging.assert_rank_in', v1=[])
+def assert_rank_in_v2(x, ranks, message=None, name=None):
+  """Assert that `x` has a rank in `ranks`.
+
+  This Op checks that the rank of `x` is in `ranks`.
+
+  If `x` has a different rank, `message`, as well as the shape of `x` are
+  printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: `Tensor`.
+    ranks: `Iterable` of scalar `Tensor` objects.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional). Defaults to "assert_rank_in".
+
+  Raises:
+    InvalidArgumentError: `x` does not have rank in `ranks`, but the rank cannot
+      be statically determined.
+    ValueError: If static checks determine `x` has mismatched rank.
+  """
+  assert_rank_in(x=x, ranks=ranks, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_rank_in', 'assert_rank_in'])
 @deprecation.deprecated_endpoints('assert_rank_in')
 def assert_rank_in(
     x, ranks, data=None, summarize=None, message=None, name=None):
@@ -1022,7 +1399,25 @@ def assert_rank_in(
   return assert_op
 
 
-@tf_export('debugging.assert_integer', 'assert_integer')
+@tf_export('debugging.assert_integer', v1=[])
+def assert_integer_v2(x, message=None, name=None):
+  """Assert that `x` is of integer dtype.
+
+  If `x` has a non-integer type, `message`, as well as the dtype of `x` are
+  printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: A `Tensor`.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional). Defaults to "assert_integer".
+
+  Raises:
+    TypeError:  If `x.dtype` is not a non-quantized integer type.
+  """
+  assert_integer(x=x, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_integer', 'assert_integer'])
 @deprecation.deprecated_endpoints('assert_integer')
 def assert_integer(x, message=None, name=None):
   """Assert that `x` is of integer dtype.
@@ -1061,13 +1456,30 @@ def assert_integer(x, message=None, name=None):
     return control_flow_ops.no_op('statically_determined_was_integer')
 
 
-@tf_export('debugging.assert_type', 'assert_type')
+@tf_export('debugging.assert_type', v1=[])
+def assert_type_v2(tensor, tf_type, message=None, name=None):
+  """Asserts that the given `Tensor` is of the specified type.
+
+  Args:
+    tensor: A `Tensor`.
+    tf_type: A tensorflow type (`dtypes.float32`, `tf.int64`, `dtypes.bool`,
+      etc).
+    message: A string to prefix to the default message.
+    name:  A name for this operation. Defaults to "assert_type"
+
+  Raises:
+    TypeError: If the tensor's data type doesn't match `tf_type`.
+  """
+  assert_type(tensor=tensor, tf_type=tf_type, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_type', 'assert_type'])
 @deprecation.deprecated_endpoints('assert_type')
 def assert_type(tensor, tf_type, message=None, name=None):
   """Statically asserts that the given `Tensor` is of the specified type.
 
   Args:
-    tensor: A tensorflow `Tensor`.
+    tensor: A `Tensor`.
     tf_type: A tensorflow type (`dtypes.float32`, `tf.int64`, `dtypes.bool`,
       etc).
     message: A string to prefix to the default message.
@@ -1109,14 +1521,22 @@ def _get_diff_for_monotonic_comparison(x):
   return control_flow_ops.cond(is_shorter_than_two, short_result, diff)
 
 
-@tf_export('debugging.is_numeric_tensor', 'is_numeric_tensor')
+@tf_export(
+    'debugging.is_numeric_tensor',
+    v1=['debugging.is_numeric_tensor', 'is_numeric_tensor'])
 @deprecation.deprecated_endpoints('is_numeric_tensor')
 def is_numeric_tensor(tensor):
   return isinstance(tensor, ops.Tensor) and tensor.dtype in NUMERIC_TYPES
 
 
-@tf_export('debugging.is_non_decreasing', 'is_non_decreasing')
-@deprecation.deprecated_endpoints('is_non_decreasing')
+@tf_export(
+    'math.is_non_decreasing',
+    v1=[
+        'math.is_non_decreasing', 'debugging.is_non_decreasing',
+        'is_non_decreasing'
+    ])
+@deprecation.deprecated_endpoints('debugging.is_non_decreasing',
+                                  'is_non_decreasing')
 def is_non_decreasing(x, name=None):
   """Returns `True` if `x` is non-decreasing.
 
@@ -1143,8 +1563,14 @@ def is_non_decreasing(x, name=None):
     return math_ops.reduce_all(math_ops.less_equal(zero, diff))
 
 
-@tf_export('debugging.is_strictly_increasing', 'is_strictly_increasing')
-@deprecation.deprecated_endpoints('is_strictly_increasing')
+@tf_export(
+    'math.is_strictly_increasing',
+    v1=[
+        'math.is_strictly_increasing', 'debugging.is_strictly_increasing',
+        'is_strictly_increasing'
+    ])
+@deprecation.deprecated_endpoints('debugging.is_strictly_increasing',
+                                  'is_strictly_increasing')
 def is_strictly_increasing(x, name=None):
   """Returns `True` if `x` is strictly increasing.
 
@@ -1219,7 +1645,9 @@ def _assert_same_base_type(items, expected_type=None):
     return expected_type
 
 
-@tf_export('debugging.assert_same_float_dtype', 'assert_same_float_dtype')
+@tf_export(
+    'debugging.assert_same_float_dtype',
+    v1=['debugging.assert_same_float_dtype', 'assert_same_float_dtype'])
 @deprecation.deprecated_endpoints('assert_same_float_dtype')
 def assert_same_float_dtype(tensors=None, dtype=None):
   """Validate and return float type based on `tensors` and `dtype`.
@@ -1234,8 +1662,10 @@ def assert_same_float_dtype(tensors=None, dtype=None):
     tensors: Tensors of input values. Can include `None` elements, which will be
         ignored.
     dtype: Expected type.
+
   Returns:
     Validated type.
+
   Raises:
     ValueError: if neither `tensors` nor `dtype` is supplied, or result is not
         float, or the common type of the inputs is not a floating point type.
@@ -1249,19 +1679,57 @@ def assert_same_float_dtype(tensors=None, dtype=None):
   return dtype
 
 
-@tf_export('debugging.assert_scalar', 'assert_scalar')
+@tf_export('debugging.assert_scalar', v1=[])
+def assert_scalar_v2(tensor, message=None, name=None):
+  """Asserts that the given `tensor` is a scalar.
+
+  This function raises `ValueError` unless it can be certain that the given
+  `tensor` is a scalar. `ValueError` is also raised if the shape of `tensor` is
+  unknown.
+
+  Args:
+    tensor: A `Tensor`.
+    message: A string to prefix to the default message.
+    name:  A name for this operation. Defaults to "assert_scalar"
+
+  Raises:
+    ValueError: If the tensor is not scalar (rank 0), or if its shape is
+      unknown.
+  """
+  assert_scalar(tensor=tensor, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_scalar', 'assert_scalar'])
 @deprecation.deprecated_endpoints('assert_scalar')
-def assert_scalar(tensor, name=None):
+def assert_scalar(tensor, name=None, message=None):
+  """Asserts that the given `tensor` is a scalar.
+
+  This function raises `ValueError` unless it can be certain that the given
+  `tensor` is a scalar. `ValueError` is also raised if the shape of `tensor` is
+  unknown.
+
+  Args:
+    tensor: A `Tensor`.
+    name:  A name for this operation. Defaults to "assert_scalar"
+    message: A string to prefix to the default message.
+
+  Returns:
+    The input tensor (potentially converted to a `Tensor`).
+
+  Raises:
+    ValueError: If the tensor is not scalar (rank 0), or if its shape is
+      unknown.
+  """
   with ops.name_scope(name, 'assert_scalar', [tensor]) as name_scope:
     tensor = ops.convert_to_tensor(tensor, name=name_scope)
     shape = tensor.get_shape()
     if shape.ndims != 0:
       if context.executing_eagerly():
-        raise ValueError('Expected scalar shape, saw shape: %s.'
-                         % (shape,))
+        raise ValueError('%sExpected scalar shape, saw shape: %s.'
+                         % (message or '', shape,))
       else:
-        raise ValueError('Expected scalar shape for %s, saw shape: %s.'
-                         % (tensor.name, shape))
+        raise ValueError('%sExpected scalar shape for %s, saw shape: %s.'
+                         % (message or '', tensor.name, shape))
     return tensor
 
 
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 45516068f4deb9ca010a78a954467f0c71381d63..a237cfff826bf0fb4cacd0c25fe5d361e3d7b26e 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -31,10 +31,12 @@ from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import numerics
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export("clip_by_value")
+@dispatch.add_dispatch_support
 def clip_by_value(t, clip_value_min, clip_value_max,
                   name=None):
   """Clips tensor values to a specified min and max.
@@ -131,7 +133,7 @@ def clip_by_norm(t, clip_norm, axes=None, name=None):
   an optimizer.
 
   Args:
-    t: A `Tensor`.
+    t: A `Tensor` or `IndexedSlices`.
     clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
     axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
       to use for computing the L2-norm. If `None` (the default), uses all
@@ -139,28 +141,32 @@ def clip_by_norm(t, clip_norm, axes=None, name=None):
     name: A name for the operation (optional).
 
   Returns:
-    A clipped `Tensor`.
+    A clipped `Tensor` or `IndexedSlices`.
   """
   with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
-    t = ops.convert_to_tensor(t, name="t")
+    values = ops.convert_to_tensor(
+        t.values if isinstance(t, ops.IndexedSlices) else t, name="t")
 
     # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
-    l2sum = math_ops.reduce_sum(t * t, axes, keepdims=True)
+    l2sum = math_ops.reduce_sum(values * values, axes, keepdims=True)
     pred = l2sum > 0
     # Two-tap tf.where trick to bypass NaN gradients
     l2sum_safe = array_ops.where(pred, l2sum, array_ops.ones_like(l2sum))
     l2norm = array_ops.where(pred, math_ops.sqrt(l2sum_safe), l2sum)
-    intermediate = t * clip_norm
+    intermediate = values * clip_norm
     # Assert that the shape is compatible with the initial shape,
     # to prevent unintentional broadcasting.
-    _ = t.shape.merge_with(intermediate.shape)
-    tclip = array_ops.identity(
+    _ = values.shape.merge_with(intermediate.shape)
+    values_clip = array_ops.identity(
         intermediate / math_ops.maximum(l2norm, clip_norm), name=name)
 
-  return tclip
+    if isinstance(t, ops.IndexedSlices):
+      return ops.IndexedSlices(values_clip, t.indices, t.dense_shape)
+
+    return values_clip
 
 
-@tf_export("linalg.global_norm", "global_norm")
+@tf_export("linalg.global_norm", v1=["linalg.global_norm", "global_norm"])
 @deprecation.deprecated_endpoints("global_norm")
 def global_norm(t_list, name=None):
   """Computes the global norm of multiple tensors.
@@ -296,7 +302,12 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
   return list_clipped, use_norm
 
 
-@tf_export("clip_by_average_norm")
+@deprecation.deprecated(
+    date=None,
+    instructions=
+    "clip_by_average_norm is deprecated in TensorFlow 2.0. Please use "
+    "clip_by_norm(t, clip_norm * tf.to_float(tf.size(t), name)) instead.")
+@tf_export(v1=["clip_by_average_norm"])
 def clip_by_average_norm(t, clip_norm, name=None):
   """Clips tensor values to a maximum average L2-norm.
 
diff --git a/tensorflow/python/ops/clip_ops_test.py b/tensorflow/python/ops/clip_ops_test.py
index 444cd0f62c43354c37a2bbec194656ee39989a88..a59a0c22d409e68a821c6b069827b15cd9ecca52 100644
--- a/tensorflow/python/ops/clip_ops_test.py
+++ b/tensorflow/python/ops/clip_ops_test.py
@@ -19,6 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import numerics
 from tensorflow.python.platform import test
@@ -29,21 +31,70 @@ class ClipOpsTest(test.TestCase):
   def __init__(self, method_name="runTest"):
     super(ClipOpsTest, self).__init__(method_name)
 
-  def _testClipByNorm(self, inputs, max_norm, expected):
+  def _testClipTensorByNorm(self, inputs, max_norm, expected):
     with self.cached_session() as sess:
       input_op = constant_op.constant(inputs)
       clipped = clip_ops.clip_by_norm(input_op, max_norm)
       check_op = numerics.add_check_numerics_ops()
-      result, _ = sess.run([clipped, check_op])
+      result, _ = self.evaluate([clipped, check_op])
     self.assertAllClose(result, expected)
 
-  def testClipByNorm(self):
+  def _testClipIndexedSlicesByNorm(self, values, indices, shape, max_norm,
+                                   axes):
+    with self.cached_session() as sess:
+      values = constant_op.constant(values)
+      indices = constant_op.constant(indices)
+      shape = constant_op.constant(shape)
+      # IndexedSlices mode
+      indixed_slices = ops.IndexedSlices(values, indices, shape)
+      clipped = clip_ops.clip_by_norm(indixed_slices, max_norm, axes)
+      # clipped should be IndexedSlices
+      self.assertIsInstance(clipped, ops.IndexedSlices)
+      clipped = ops.convert_to_tensor(clipped)
+
+      # Tensor mode
+      dense_tensor = ops.convert_to_tensor(indixed_slices)
+      dense_clipped = clip_ops.clip_by_norm(dense_tensor, max_norm, axes)
+      result, expected = self.evaluate([clipped, dense_clipped])
+    self.assertAllClose(result, expected)
+
+  @test_util.run_deprecated_v1
+  def testClipTensorByNorm(self):
     # Simple example
-    self._testClipByNorm([[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]], 4.0,
-                         [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]])
+    self._testClipTensorByNorm([[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]], 4.0,
+                               [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]])
     # Zero norm
-    self._testClipByNorm([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], 4.0,
-                         [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
+    self._testClipTensorByNorm([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], 4.0,
+                               [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
+
+  def testClipIndexedSlicesByNorm(self):
+    values = [[[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]],
+              [[0.0, 2.0, 0.0], [0.0, 0.0, -1.0]]]
+    indices = [2, 6]
+    shape = [10, 2, 3]
+    # Axes == None
+    self._testClipIndexedSlicesByNorm(values, indices, shape, 4.0, None)
+
+    # Axes == 0
+    self._testClipIndexedSlicesByNorm(values, indices, shape, 4.0, 0)
+
+    # Axes == 1
+    self._testClipIndexedSlicesByNorm(values, indices, shape, 4.0, 1)
+
+    # Axes == 2
+    self._testClipIndexedSlicesByNorm(values, indices, shape, 4.0, 1)
+
+    # Axes == [0, 1]
+    self._testClipIndexedSlicesByNorm(values, indices, shape, 4.0, [0, 1])
+
+    # Axes == [0, 1]
+    self._testClipIndexedSlicesByNorm(values, indices, shape, 4.0, [0, 2])
+
+    # Axes == [0, 1]
+    self._testClipIndexedSlicesByNorm(values, indices, shape, 4.0, [1, 2])
+
+    # Axes == [0, 1]
+    self._testClipIndexedSlicesByNorm(values, indices, shape, 4.0, [0, 1, 2])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py
index 78c4b4bfe02876657014d662b1a1fcd96c4096d3..0fd9368d2194e875aa5c4ddfb716f0898d6a9c49 100644
--- a/tensorflow/python/ops/collective_ops_test.py
+++ b/tensorflow/python/ops/collective_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import collective_ops
 from tensorflow.python.platform import test
 
@@ -32,7 +33,7 @@ class CollectiveOpTest(test.TestCase):
   def _testCollectiveReduce(self, t0, t1, expected, set_graph_key):
     group_key = 1
     instance_key = 1
-    with self.test_session(
+    with self.session(
         config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
       with ops.device('/CPU:0'):
         in0 = constant_op.constant(t0)
@@ -49,23 +50,26 @@ class CollectiveOpTest(test.TestCase):
     self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
     self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testCollectiveReduce(self):
     self._testCollectiveReduce([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
                                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
                                [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2], True)
 
+  @test_util.run_deprecated_v1
   def testCollectiveAutoGraphKey(self):
     self._testCollectiveReduce([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
                                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
                                [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2], False)
 
+  @test_util.run_deprecated_v1
   def testCollectiveReduceScalar(self):
     self._testCollectiveReduce(0.1, 0.3, 0.2, True)
 
   def _testCollectiveBroadcast(self, t0):
     group_key = 1
     instance_key = 1
-    with self.test_session(
+    with self.session(
         config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
       with ops.device('/CPU:0'):
         in0 = constant_op.constant(t0)
@@ -81,6 +85,7 @@ class CollectiveOpTest(test.TestCase):
     self.assertAllClose(results[0], t0, rtol=1e-5, atol=1e-5)
     self.assertAllClose(results[1], t0, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testCollectiveBroadcast(self):
     self._testCollectiveBroadcast([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1])
 
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index 75a1a53eb7a3865d408648347ad48edcbf3abba2..abc99c1205159bd4eb87e3a378fe95693ac84aa7 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -12,21 +12,614 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-"""cond_v2 wrapper module.
+"""cond_v2 and gradient.
 
-This imports the cond_v2 method and all necessary dependencies (this is to avoid
-circular dependencies in the cond_v2 implementation). See cond_v2_impl for more
-information.
+This is a version of cond that emits a single If op, as well as the gradient
+function for If ops produced by cond_v2. This will eventually replace the
+current tf.cond implementation once it reaches feature and performance parity.
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=unused-import
-from tensorflow.python.eager import function
+import collections
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import function_def_to_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import control_flow_util_v2 as util
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.util import nest
+
+
+# NOTE(skyewm): TensorFlow uses protected class methods and fields to signify
+# that they aren't part of the official public API. These protected members
+# often need to be used by implementation code however. Rather than litter the
+# code with pylint comments, we ignore protected access violations for
+# readability.
+# pylint: disable=protected-access
+
+
+def cond_v2(pred, true_fn, false_fn, name="cond"):
+  """Like tf.cond, except emits a single If op."""
+  if isinstance(pred, bool):
+    raise TypeError("pred must not be a Python bool", pred)
+
+  if not name:
+    name = "cond"
+
+  with ops.name_scope(name) as scope:
+    true_name = util.unique_fn_name(scope, "true")
+    false_name = util.unique_fn_name(scope, "false")
+
+    # Automatic control dependencies are added in defuns, but not in v1
+    # graphs. Propagate that behavior here.
+    add_control_dependencies = util.in_defun()
+    pred = ops.convert_to_tensor(pred)
+
+    true_graph = func_graph_module.func_graph_from_py_func(
+        true_name,
+        true_fn, [], {},
+        func_graph=util.CondBranchFuncGraph(
+            true_name, read_only_collections=False),
+        add_control_dependencies=add_control_dependencies,
+        op_return_value=pred)
+    false_graph = func_graph_module.func_graph_from_py_func(
+        false_name,
+        false_fn, [], {},
+        func_graph=util.CondBranchFuncGraph(
+            false_name, read_only_collections=False),
+        add_control_dependencies=add_control_dependencies,
+        op_return_value=pred)
+
+    outputs = _build_cond(pred, true_graph, false_graph,
+                          true_graph.external_captures,
+                          false_graph.external_captures,
+                          name=scope)
+
+    return func_graph_module.pack_sequence_as(true_graph.structured_outputs,
+                                              outputs)
+
+
+@ops.RegisterGradient("If")
+def _IfGrad(op, *grads):  # pylint: disable=invalid-name
+  """The gradient of an If op produced by cond_v2."""
+  true_graph, false_graph = _get_func_graphs(op)
+  # Note: op.graph != ops.get_default_graph() when we are computing the gradient
+  # of a nested cond.
+  assert true_graph.outer_graph == op.graph
+  assert false_graph.outer_graph == op.graph
+
+  # Create grad functions that compute the gradient of the true/false forward
+  # graphs. These functions will capture tensors from the forward pass
+  # functions.
+  true_grad_graph = _create_grad_func(
+      true_graph, grads, util.unique_grad_fn_name(true_graph.name))
+  false_grad_graph = _create_grad_func(
+      false_graph, grads, util.unique_grad_fn_name(false_graph.name))
+
+  assert ([t.dtype for t in true_grad_graph.outputs] ==
+          [t.dtype for t in false_grad_graph.outputs])
+
+  if (true_grad_graph.if_op_needs_rewrite or
+      false_grad_graph.if_op_needs_rewrite):
+    # Modify 'op' to output the intermediates needed by the grad functions. Note
+    # that all needed intermediates are wrapped in optionals. Each optional
+    # intermediate output will have a value iff its corresponding branch is
+    # taken.
+    # NOTE(skyewm): if there are any active sessions, this modification to `op`
+    # may make them unrunnable!
+
+    if control_flow_util.InXlaContext(ops.get_default_graph()):
+      # XLA does not yet support optionals, so output intermediates directly and
+      # make them match via FakeParams, which can be converted to zeros in XLA.
+      # TODO(skyewm,jpienaar): can XLA support optionals?
+      true_intermediates = true_grad_graph.xla_intermediates
+      false_intermediates = false_grad_graph.xla_intermediates
+      extra_true_outputs, extra_false_outputs = _make_intermediates_match_xla(
+          true_graph, false_graph, true_intermediates, false_intermediates)
+    else:
+      true_intermediates = true_grad_graph.wrapped_intermediates
+      false_intermediates = false_grad_graph.wrapped_intermediates
+      # Make outputs match by adding none optionals.
+      extra_true_outputs, extra_false_outputs = _make_intermediates_match(
+          true_graph, false_graph, true_intermediates, false_intermediates)
+
+    true_graph.outputs.extend(extra_true_outputs)
+    false_graph.outputs.extend(extra_false_outputs)
+    # TODO(skyewm): indicate it's an internal bug if this fails.
+    _check_same_outputs(true_graph, false_graph)
+
+    true_graph.name += "_rewritten"
+    false_graph.name += "_rewritten"
+
+    op._set_func_attr("then_branch", util.create_new_tf_function(true_graph))
+    op._set_func_attr("else_branch", util.create_new_tf_function(false_graph))
+    op._set_type_list_attr("Tout", true_graph.output_types)
+    op._set_shape_list_attr("output_shapes", true_graph.output_shapes)
+    op._add_outputs(
+        [t.dtype for t in extra_true_outputs],
+        [t.shape for t in extra_true_outputs])
+
+  # Resolve references to forward graph tensors in grad graphs and ensure
+  # they are in-scope, i.e., belong to one of outer graphs of the grad graph.
+  true_grad_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
+  false_grad_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
+
+  outputs = _build_cond(op.inputs[0], true_grad_graph, false_grad_graph,
+                        true_grad_inputs, false_grad_inputs)
+
+  # The predicate has no gradient.
+  return [None] + outputs
+
+
+def _build_cond(pred, true_graph, false_graph, true_inputs, false_inputs,
+                name=None):
+  """Creates an If op from the specified predicate, branch functions and inputs.
+
+  Note that this modifies true_graph and false_graph to make the inputs match,
+  and to output all intermediates values so they're available for the gradient
+  computation.
+
+  true_graph and false_graph need not have the same input types, but they must
+  have the same outpute types.
+
+  Args:
+    pred: boolean Tensor
+    true_graph: FuncGraph
+    false_graph: FuncGraph
+    true_inputs: a list of Tensors to be passed to true_graph as input.
+    false_inputs: a list of Tensors to be passed to false_graph as input.
+    name: the name for the If op.
+
+  Returns:
+    A list of Tensors which are the outputs of the If op. Does not include added
+    intermediate outputs.
+  """
+  _check_same_outputs(true_graph, false_graph)
+
+  # Add inputs to true_graph and false_graph to make them match. Note that
+  # this modifies true_graph and false_graph.
+  cond_inputs = _make_inputs_match(true_graph, false_graph,
+                                   true_inputs, false_inputs)
+
+  # Create the If op.
+  tensors = gen_functional_ops._if(  # pylint: disable=protected-access
+      pred,
+      cond_inputs, [t.dtype for t in true_graph.outputs],
+      util.create_new_tf_function(true_graph),
+      util.create_new_tf_function(false_graph),
+      output_shapes=_get_output_shapes(true_graph.outputs,
+                                       false_graph.outputs),
+      name=name)
+
+  # TODO(b/110167197) this approach requires cond_v2 to have at least 1 output
+  if_op = tensors[0].op
+  util.maybe_set_lowering_attr(if_op)
+
+  # Return identities for each output of the If op, rather than the output of
+  # the If op directly. This makes pruning work if the output of cond() is
+  # fetched: the lowering pass converts the If outputs into IdentityN outputs,
+  # which if fetched will cause all ops in the taken branch to be run (since
+  # it takes all merge ops as input). After lowering, each output identity op
+  # will end up with only the appropriate merge op as input.
+  # TODO(b/79984175): this doesn't have to be a tuple once we covert to the
+  # correct output structure
+  tensors = [array_ops.identity(t) for t in tensors]
+
+  # Prevent fetching since the variant outputs can't be fetched directly.
+  if_op.graph.prevent_fetching(if_op)
+  return tensors
+
+
+def _get_func_graphs(if_op):
+  """Returns `FuncGraph`s for the input op branches.
+
+  Args:
+    if_op: The _If Operation.
+
+  Returns:
+    A 2-tuple of the `FuncGraph`s of the then_branch and else_branch.
+  """
+  def _get_func_graph_for_branch(branch_name):
+    """Generates and returns a FuncGraph for the given branch."""
+    inputs = if_op.inputs[1:]  # First input is pred.
+    input_shapes = [t.shape for t in inputs]
+    func_name = if_op.get_attr(branch_name).name
+    fdef = if_op.graph._get_function(func_name).definition
+    # `if_op.graph` may not be the same as `ops.get_default_graph()` e.g.
+    # in the case of nested if ops or when the gradient is being computed
+    # from inside a Defun. We build the `func_graph` with `if_op.graph` as its
+    # `outer_graph`. This resembles how the `FuncGraph` was built in the
+    # forward pass. We need this so that we can resolve references to tensors
+    # in `func_graph` from its gradient graph in `_resolve_grad_inputs`.
+    with if_op.graph.as_default():
+      func_graph = function_def_to_graph.function_def_to_graph(
+          fdef, input_shapes)
+    func_graph.captures = collections.OrderedDict(zip(inputs,
+                                                      func_graph.inputs))
+    # Set the if op so that the gradient code can use it.
+    func_graph._if = if_op
+    return func_graph
+
+  return (_get_func_graph_for_branch("then_branch"),
+          _get_func_graph_for_branch("else_branch"))
+
+
+def _grad_fn(func_graph, grads):
+  """The gradient function for each conditional branch.
+
+  This function builds the gradient graph of the corresponding forward-pass
+  conditional branch in `func_graph`. This is done by differentiating
+  func_graph's outputs w.r.t. its inputs.
+
+  Args:
+    func_graph: FuncGraph. The corresponding forward-pass function.
+    grads: The list of input gradient Tensors.
+
+  Returns:
+    The output gradient Tensors.
+  """
+  # Filter out untrainable function outputs.
+  # NOTE(skyewm): If we don't do this, the untrainable tensors can sometimes
+  # cause _GradientsHelper to raise an exception (e.g. the implementation
+  # doesn't expect 'ys' to contain boolean tensors).
+  assert len(func_graph.outputs) == len(grads)
+  ys = []
+  grad_ys = []
+  for y, grad_y in zip(func_graph.outputs, grads):
+    if not gradients_impl.IsTrainable(y):
+      continue
+    ys.append(y)
+    grad_ys.append(grad_y)
+
+  # Build the gradient graph. Note that this builds the gradient computation of
+  # func_graph in the current graph, which requires capturing tensors from
+  # func_graph. The captured func_graph tensors are resolved to external tensors
+  # in _resolve_grad_inputs.
+  result = gradients_impl._GradientsHelper(
+      ys, func_graph.inputs, grad_ys=grad_ys,
+      src_graph=func_graph)
+
+  # Functions can't return None; replace Nones with zero tensors.
+  # TODO(b/80444525): don't return anything here and make _IfGrad return None if
+  # both branches have zero gradient.
+  for i in range(len(result)):
+    if result[i] is None:
+      if func_graph.inputs[i].dtype == dtypes.resource:
+        result[i] = array_ops.zeros(
+            gen_resource_variable_ops.variable_shape(func_graph.inputs[i]))
+      else:
+        result[i] = array_ops.zeros_like(func_graph.inputs[i])
+
+  return result
+
+
+def _create_grad_func(func_graph, grads, name):
+  """Returns the FuncGraph representation of _grad_fn."""
+  return func_graph_module.func_graph_from_py_func(
+      name,
+      lambda: _grad_fn(func_graph, grads), [], {},
+      func_graph=_CondGradFuncGraph(name, func_graph))
+
+
+def _resolve_grad_inputs(cond_graph, grad_graph):
+  """Returns the tensors to pass as inputs to `grad_graph`.
+
+  The `grad_graph` may have external references to
+  1. Its outer graph containing the input gradients. These references are kept
+     as is.
+  2. Tensors in the forward pass graph. These tensors may not be "live"
+     when the gradient is being computed. We replace such references by their
+     corresponding tensor in `cond_graph.outer_graph`. In the case of nested
+     control flow or functions, the gradient logic handling
+     `grad_graph.outer_graph` will make sure the tensor from
+     `cond_graph.outer_graph` is also correctly captured.
+
+  Args:
+    cond_graph: FuncGraph. The forward-pass function.
+    grad_graph: FuncGraph. The gradients function.
+
+  Returns:
+    A list of inputs tensors to be passed to grad_graph.
+  """
+  new_inputs = []
+
+  for t in grad_graph.external_captures:
+    # `t` must either be in `grad_graph.outer_graph` or in the forward
+    # `cond_graph`.
+    if t.graph != grad_graph.outer_graph:
+      assert t.graph == cond_graph
+      # `internal_captures` are not treated as intermediates and hence not added
+      # to If op outputs. So we get the outer tensor corresponding to those
+      # from the list of `external_captures`.
+      try:
+        t = t.graph._if.outputs[t.graph.outputs.index(t)]
+      except ValueError:
+        index = t.graph.internal_captures.index(t)
+        t = t.graph.external_captures[index]
+
+      # Note: We rely on the capturing logic of the gradient If op graph to
+      # correctly capture the tensors in `cond_graph.outer_graph`. Both cond_v2
+      # and while_v2 handle this while building their gradient functions.
+      assert t.graph == cond_graph.outer_graph
+    new_inputs.append(t)
+
+  return new_inputs
+
+
+def _get_intermediates(func_graph):
+  """Returns all tensors in `func_graph` that aren't inputs or outputs."""
+  intermediates = []
+  for op in func_graph.get_operations():
+    for t in op.outputs:
+      if t in func_graph.inputs: continue
+      if t in func_graph.outputs: continue
+      intermediates.append(t)
+  return intermediates
+
+
+def _separate_unique_inputs(true_inputs, false_inputs):
+  """Separates tensors appearing only in true_inputs or false_inputs, or both.
+
+  Args:
+    true_inputs: list of Tensors
+    false_inputs: list of Tensors
+
+  Returns:
+    Three lists of Tensors:
+      1. The tensors that appear in both true_inputs and false_inputs
+      2. The tensors that only appear in true_inputs
+      3. The tensors that only appear in false_inputs
+  """
+  true_inputs = set(true_inputs)
+  false_inputs = set(false_inputs)
+
+  shared_inputs = true_inputs.intersection(false_inputs)
+  true_only_inputs = true_inputs - false_inputs
+  false_only_inputs = false_inputs - true_inputs
+
+  return list(shared_inputs), list(true_only_inputs), list(false_only_inputs)
+
+
+def _make_intermediates_match(true_graph, false_graph,
+                              true_optionals, false_optionals):
+  """Returns new optionals lists that have matching signatures.
+
+  This is done by mirroring each list in the other using none optionals.
+  There is no merging of like optionals.
+
+  Args:
+    true_graph: FuncGraph
+    false_graph: FuncGraph
+    true_optionals: a list of optional Tensors from true_graph
+    false_optionals: a list of optional Tensors from false_graph
+
+  Returns:
+    A new list of Tensors in true_graph and a new list of Tensors in
+    false_graph. The two lists have the same number of Tensors, all of which
+    will be optionals of the same shape/type.
+  """
+  new_true_optionals = (true_optionals +
+                        _create_none_optionals(true_graph, false_optionals))
+  new_false_optionals = (_create_none_optionals(false_graph, true_optionals)
+                         + false_optionals)
+  return new_true_optionals, new_false_optionals
+
+
+def _make_intermediates_match_xla(true_graph, false_graph, true_intermediates,
+                                  false_intermediates):
+  """Like _make_intermediates_match but for the XLA case."""
+  new_true_intermediates = (true_intermediates +
+                            _create_fakeparams(true_graph, false_intermediates))
+  new_false_intermediates = (_create_fakeparams(false_graph, true_intermediates)
+                             + false_intermediates)
+  return new_true_intermediates, new_false_intermediates
+
+
+def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
+  """Modifies true_graph and false_graph so they have the same input signature.
+
+  This method reorders and/or adds parameters to true_graph and false_graph so
+  they have the same input signature, and updates the 'inputs' and 'captured'
+  fields of both graphs accordingly. It uses the input tensors from the outer
+  graph to avoid duplicating shared arguments.
+
+  Args:
+    true_graph: FuncGraph
+    false_graph: FuncGraph
+    true_inputs: a list of Tensors in the outer graph. The inputs for
+      true_graph.
+    false_inputs: a list of Tensors in the outer graph. The inputs for
+      false_graph.
+
+  Returns:
+    A new list of Tensors from the outer graph that are the new inputs for both
+    true_graph and false_graph. This is a deduped version of true_inputs +
+    false_inputs.
+  """
+  shared_inputs, true_only_inputs, false_only_inputs = _separate_unique_inputs(
+      true_inputs, false_inputs)
+
+  new_inputs = shared_inputs + true_only_inputs + false_only_inputs
+
+  true_input_to_param = dict(zip(true_inputs, true_graph.inputs))
+  false_input_to_param = dict(zip(false_inputs, false_graph.inputs))
+
+  true_graph.inputs = (
+      [true_input_to_param[t] for t in shared_inputs] +
+      [true_input_to_param[t] for t in true_only_inputs] +
+      _create_dummy_inputs(true_graph, false_only_inputs))
+
+  false_graph.inputs = (
+      [false_input_to_param[t] for t in shared_inputs] +
+      _create_dummy_inputs(false_graph, true_only_inputs) +
+      [false_input_to_param[t] for t in false_only_inputs])
+
+  # Rewrite the FuncGraphs' state to reflect the new inputs.
+  true_graph.captures = collections.OrderedDict(zip(new_inputs,
+                                                    true_graph.inputs))
+  false_graph.captures = collections.OrderedDict(zip(new_inputs,
+                                                     false_graph.inputs))
+
+  return new_inputs
+
+
+def _wrap_intermediates(func_graph, intermediates):
+  with func_graph.as_default():
+    return [gen_dataset_ops.optional_from_value([t]) for t in intermediates]
+
+
+def _create_dummy_inputs(func_graph, template_tensors):
+  """Creates tensors in func_graph to represent template_tensors.
+
+  Args:
+    func_graph: FuncGraph.
+    template_tensors: a list of tensors in the outer graph.
+
+  Returns:
+    A list of tensors in func_graph.
+  """
+  with func_graph.as_default():
+    return [array_ops.placeholder(t.dtype, shape=t.shape)
+            for t in template_tensors]
+
+
+def _create_none_optionals(func_graph, template_tensors):
+  """Creates none optionals in func_graph to represent template_tensors.
+
+  Args:
+    func_graph: FuncGraph.
+    template_tensors: a list of tensors in func_graph.
+
+  Returns:
+    A list of tensors in func_graph.
+  """
+  with func_graph.as_default():
+    return [gen_dataset_ops.optional_none() for _ in template_tensors]
+
+
+def _create_fakeparams(func_graph, template_tensors):
+  """Create FakeParams for the XLA case."""
+  with func_graph.as_default():
+    return [gen_functional_ops.fake_param(dtype=t.dtype, shape=t.shape)
+            for t in template_tensors]
+
+
+def _check_same_outputs(true_graph, false_graph):
+  """Raises an error if true_graph and false_graph have different outputs."""
+  true_output_types = [t.dtype for t in true_graph.outputs]
+  false_output_types = [t.dtype for t in false_graph.outputs]
+  if (len(true_graph.outputs) != len(false_graph.outputs) or
+      true_output_types != false_output_types):
+    raise TypeError(
+        "true_fn() and false_fn() must return the same number and type of "
+        "arguments, got:\n"
+        "  true_fn: %s\n"
+        "  false_fn: %s" % (true_output_types, false_output_types))
+
+  # Make sure `structured_outputs` for both graphs have the same structure.
+  try:
+    nest.assert_same_structure(true_graph.structured_outputs,
+                               false_graph.structured_outputs)
+  except (ValueError, TypeError) as e:
+    raise ValueError("Outputs of true_fn and false_fn must have the same "
+                     "structure: %s" % str(e))
+
+
+def _get_output_shapes(true_graph_outputs, false_graph_outputs):
+  output_shapes = [
+      t_out.shape.most_specific_compatible_shape(f_out.shape)
+      for t_out, f_out in zip(true_graph_outputs, false_graph_outputs)
+  ]
+  return output_shapes
+
+
+class _CondGradFuncGraph(util.CondBranchFuncGraph):
+  """FuncGraph for the gradient function of the branch of an If op.
+
+  Handles wrapping and unwrapping intermediate values that are captured by the
+  gradient computation in optionals.
+
+  Attributes:
+    if_op_needs_rewrite: True if any intermediates were captured, meaning the
+      forward If op needs to be written to output the wrapped intermediates.
+  """
+
+  def __init__(self, name, forward_graph):
+    super(_CondGradFuncGraph, self).__init__(name, read_only_collections=False)
+    self.if_op_needs_rewrite = False
+    self._forward_graph = forward_graph
+    # Maps from forward intermediate tensor -> the unwrapped captured
+    # intermediate.
+    self._indirect_captures = {}
+    # Maps unwrapped intermediate -> optional-wrapped intermediate in the
+    # forward graph.
+    self._wrapped_intermediates = collections.OrderedDict()
+    # Raw intermediates captured from the forward graph. Populated iff we're in
+    # an XLA context.
+    self._xla_intermediates = []
+
+  @property
+  def wrapped_intermediates(self):
+    """The optional-wrapped intermediates captured from the forward graph."""
+    return list(self._wrapped_intermediates.values())
+
+  @property
+  def xla_intermediates(self):
+    """Raw intermediates captured from the forward graph if XLA is enabled."""
+    return self._xla_intermediates
+
+  def _capture_helper(self, tensor, name):
+    if (tensor.graph is not self._forward_graph or
+        tensor in self._forward_graph.inputs or
+        tensor in self._forward_graph.outputs):
+      return super(_CondGradFuncGraph, self)._capture_helper(tensor, name)
+
+    if control_flow_util.InXlaContext(ops.get_default_graph()):
+      # XLA does not yet support optionals, so capture intermediates directly.
+      # TODO(skyewm,jpienaar): can XLA support optionals?
+      if tensor not in self.captures:
+        self.xla_intermediates.append(tensor)
+        self.if_op_needs_rewrite = True
+      return super(_CondGradFuncGraph, self)._capture_helper(tensor, name)
+
+    captured_tensor = self._indirect_captures.get(tensor)
+    if captured_tensor is not None:
+      return captured_tensor
+
+    # 'tensor' is an uncaptured intermediate in the forward graph. We wrap it in
+    # an optional in the forward graph and capture the optional normally. We
+    # then unwrap the captured optional value in the gradient graph to get the
+    # raw intermediate value.
+
+    if tensor not in self._wrapped_intermediates:
+      # If the gradient has already been computed for this If op, 'tensor' may
+      # already be wrapped.
+      for consumer in tensor.consumers():
+        if (consumer.type == "OptionalFromValue"
+            and consumer.outputs[0] in self._forward_graph.outputs):
+          optional = consumer.outputs[0]
+          break
+      else:
+        # 'tensor' hasn't been wrapped, do it now.
+        with self._forward_graph.as_default():
+          optional = gen_dataset_ops.optional_from_value([tensor])
+        self.if_op_needs_rewrite = True
+
+      self._wrapped_intermediates[tensor] = optional
 
-from tensorflow.python.ops.cond_v2_impl import cond_v2
-# pylint: enable=unused-import
+    optional = self._wrapped_intermediates[tensor]
+    captured_optional = super(_CondGradFuncGraph, self)._capture_helper(
+        optional, name)
+    captured_tensor = gen_dataset_ops.optional_get_value(
+        captured_optional, [tensor.dtype], [tensor.shape])[0]
+    self._indirect_captures[tensor] = captured_tensor
+    return captured_tensor
diff --git a/tensorflow/python/ops/cond_v2_impl.py b/tensorflow/python/ops/cond_v2_impl.py
deleted file mode 100644
index 195ad11c71d0ff6a9a71cf4a8526045078384d43..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/cond_v2_impl.py
+++ /dev/null
@@ -1,502 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-"""cond_v2 and gradient.
-
-This is a version of cond that emits a single If op, as well as the gradient
-function for If ops produced by cond_v2. This will eventually replace the
-current tf.cond implementation once it reaches feature and performance parity.
-
-NOTE: most users of cond_v2 should import cond_v2, not this module! This module
-does not contain all the necessary imports to prevent circular dependencies,
-while cond_v2 does.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_util
-from tensorflow.python.ops import gen_functional_ops
-
-
-# The following modules cannot be imported directly because they cause circular
-# dependencies. These are set in each corresponding module.
-_function = None
-_function_def_to_graph = None
-_gradients_impl = None
-
-# NOTE(skyewm): TensorFlow uses protected class methods and fields to signify
-# that they aren't part of the official public API. These protected members
-# often need to be used by implementation code however. Rather than litter the
-# code with pylint comments, we ignore protected access violations for
-# readability.
-# pylint: disable=protected-access
-
-
-def cond_v2(pred, true_fn, false_fn, name="cond"):
-  """Like tf.cond, except emits a single If op."""
-  if not name:
-    name = "cond"
-
-  with ops.name_scope(name) as scope:
-    with ops.name_scope(None):
-      # Find the outer most graph for uniquing function names.
-      # TODO(jpienaar): Make this work in eager mode.
-      graph = ops.get_default_graph()
-      while isinstance(graph, _function.FuncGraph):
-        graph = graph.outer_graph
-
-      true_name = graph.unique_name(("%strue" % scope).replace("/", "_"))
-      false_name = graph.unique_name(("%sfalse" % scope).replace("/", "_"))
-
-    true_graph = _function.func_graph_from_py_func(
-        true_name, true_fn, [], {})
-    false_graph = _function.func_graph_from_py_func(
-        false_name, false_fn, [], {})
-    _check_same_outputs(true_graph, false_graph)
-
-    # Add inputs to true_graph and false_graph to make them match. Note that
-    # this modifies true_graph and false_graph.
-    cond_inputs = _make_inputs_match(true_graph, false_graph,
-                                     true_graph.external_captures,
-                                     false_graph.external_captures)
-
-    # Add all intermediate tensors as function outputs so they're available for
-    # the gradient computation.
-
-    true_intermediates = _get_intermediates(true_graph)
-    false_intermediates = _get_intermediates(false_graph)
-
-    # Save the original number of outputs to return to the caller.
-    num_cond_outputs = len(true_graph.outputs)
-
-    # Make the number/type of new intermediate outputs match.
-    extra_true_outputs, extra_false_outputs = _pad_params(
-        true_graph, false_graph, true_intermediates, false_intermediates)
-
-    true_graph.outputs.extend(extra_true_outputs)
-    false_graph.outputs.extend(extra_false_outputs)
-
-    # Create the If op.
-    tensors = gen_functional_ops._if(  # pylint: disable=protected-access
-        pred,
-        cond_inputs, [t.dtype for t in true_graph.outputs],
-        _create_new_tf_function(true_graph),
-        _create_new_tf_function(false_graph),
-        output_shapes=_get_output_shapes(true_graph.outputs,
-                                         false_graph.outputs),
-        name=scope)
-
-    # Set the flag to enable lowering on the `if` op if necessary
-    # Lowering allows cond_v2 to avoid some of the limitations of Functions,
-    # allowing users to specify devices & colocation inside of cond_v2 branches,
-    # and enabling non-strict evaluation & partial pruning of cond_v2 branches.
-    # This brings cond_v2 closer to feature parity with tf.cond.
-    #
-    # However, we do not lower `If` in the XLA context because it is easier for
-    # XLA to apply its own optimizations when dealing with un-lowered `If`
-    # operators than with lowered switch/merge control flow.
-    #
-    # TODO(b/110167197) this approach requires cond_v2 to have at least 1 output
-    if_op = tensors[0].op
-    if not control_flow_util.IsInXLAContext(if_op):
-      # pylint: disable=protected-access
-      if_op._set_attr("_lower_using_switch_merge",
-                      attr_value_pb2.AttrValue(b=True))
-      # pylint: enable=protected-access
-
-    result = tuple(tensors[:num_cond_outputs])
-    if len(result) == 1:
-      return result[0]
-    else:
-      return result
-
-
-@ops.RegisterGradient("If")
-def _IfGrad(op, *grads):  # pylint: disable=invalid-name
-  """The gradient of an If op produced by cond_v2."""
-  true_graph, false_graph = _get_func_graphs(op)
-  # Note: op.graph != ops.get_default_graph() when we are computing the gradient
-  # of a nested cond.
-  assert true_graph.outer_graph == op.graph
-  assert false_graph.outer_graph == op.graph
-
-  # Create grad functions that compute the gradient of the true/false forward
-  # graphs. These functions will capture tensors from the forward pass
-  # functions.
-  true_grad_graph = _create_grad_func(
-      true_graph, grads, _get_grad_fn_name(true_graph))
-  false_grad_graph = _create_grad_func(
-      false_graph, grads, _get_grad_fn_name(false_graph))
-
-  assert ([t.dtype for t in true_grad_graph.outputs] ==
-          [t.dtype for t in false_grad_graph.outputs])
-
-  # Resolve references to forward graph tensors in grad graphs and ensure
-  # they are in-scope, i.e., belong to one of outer graphs of the grad graph.
-  true_grad_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
-  false_grad_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
-
-  # Make the inputs to true_grad_graph and false_grad_graph match. Note that
-  # this modifies true_grad_graph and false_grad_graph.
-  grad_inputs = _make_inputs_match(true_grad_graph, false_grad_graph,
-                                   true_grad_inputs, false_grad_inputs)
-
-  # Add all intermediate tensors as function outputs so they're available for
-  # higher-order gradient computations.
-
-  true_grad_intermediates = _get_intermediates(true_grad_graph)
-  false_grad_intermediates = _get_intermediates(false_grad_graph)
-
-  # Save the original number of gradient outputs to return.
-  num_grad_outputs = len(true_grad_graph.outputs)
-
-  # Make the number/type of new intermediate outputs match.
-  extra_true_grad_outputs, extra_false_grad_outputs = _pad_params(
-      true_grad_graph, false_grad_graph,
-      true_grad_intermediates, false_grad_intermediates)
-
-  true_grad_graph.outputs.extend(extra_true_grad_outputs)
-  false_grad_graph.outputs.extend(extra_false_grad_outputs)
-
-  # Create the gradient If op.
-  tensors = gen_functional_ops._if(
-      op.inputs[0],
-      grad_inputs, [t.dtype for t in true_grad_graph.outputs],
-      _create_new_tf_function(true_grad_graph),
-      _create_new_tf_function(false_grad_graph),
-      output_shapes=_get_output_shapes(true_grad_graph.outputs,
-                                       false_grad_graph.outputs))
-
-  # The predicate has no gradient.
-  return [None] + tensors[:num_grad_outputs]
-
-
-def _get_func_graphs(if_op):
-  """Returns `FuncGraph`s for the input op branches.
-
-  Args:
-    if_op: The _If Operation.
-
-  Returns:
-    A 2-tuple of the `FuncGraph`s of the then_branch and else_branch.
-  """
-  def _get_func_graph_for_branch(branch_name):
-    """Generates and returns a FuncGraph for the given branch."""
-    inputs = if_op.inputs[1:]  # First input is pred.
-    input_shapes = [t.shape for t in inputs]
-    func_name = if_op.get_attr(branch_name).name
-    fdef = if_op.graph._get_function(func_name).definition
-    # `if_op.graph` may not be the same as `ops.get_default_graph()` e.g.
-    # in the case of nested if ops or when the gradient is being computed
-    # from inside a Defun. We build the `func_graph` with `if_op.graph` as its
-    # `outer_graph`. This resembles how the `FuncGraph` was built in the
-    # forward pass. We need this so that we can resolve references to tensors
-    # in `func_graph` from its gradient graph in `_resolve_grad_inputs`.
-    with if_op.graph.as_default():
-      func_graph = _function_def_to_graph.function_def_to_graph(
-          fdef, input_shapes)
-    func_graph.captures = collections.OrderedDict(zip(inputs,
-                                                      func_graph.inputs))
-    # Set the if op so that the gradient code can use it.
-    func_graph._if = if_op
-    return func_graph
-
-  return (_get_func_graph_for_branch("then_branch"),
-          _get_func_graph_for_branch("else_branch"))
-
-
-def _grad_fn(func_graph, grads):
-  """The gradient function for each conditional branch.
-
-  This function builds the gradient graph of the corresponding forward-pass
-  conditional branch in `func_graph`. This is done by differentiating
-  func_graph's outputs w.r.t. its inputs.
-
-  Args:
-    func_graph: function.FuncGraph. The corresponding forward-pass function.
-    grads: The list of input gradient Tensors.
-
-  Returns:
-    The output gradient Tensors.
-  """
-  # Filter out untrainable function outputs.
-  # NOTE(skyewm): If we don't do this, the untrainable tensors can sometimes
-  # cause _GradientsHelper to raise an exception (e.g. the implementation
-  # doesn't expect 'ys' to contain boolean tensors).
-  assert len(func_graph.outputs) == len(grads)
-  ys = []
-  grad_ys = []
-  for y, grad_y in zip(func_graph.outputs, grads):
-    if not _gradients_impl._IsTrainable(y):
-      continue
-    ys.append(y)
-    grad_ys.append(grad_y)
-
-  # Build the gradient graph. Note that this builds the gradient computation of
-  # func_graph in the current graph, which requires capturing tensors from
-  # func_graph. The captured func_graph tensors are resolved to external tensors
-  # in _resolve_grad_inputs.
-  result = _gradients_impl._GradientsHelper(
-      ys, func_graph.inputs, grad_ys=grad_ys,
-      src_graph=func_graph)
-
-  # Functions can't return None; replace Nones with zero tensors.
-  # TODO(b/80444525): don't return anything here and make _IfGrad return None if
-  # both branches have zero gradient.
-  for i in range(len(result)):
-    if result[i] is None:
-      result[i] = array_ops.zeros_like(func_graph.inputs[i])
-
-  return result
-
-
-def _create_grad_func(func_graph, grads, name):
-  """Returns the FuncGraph representation of _grad_fn."""
-  return _function.func_graph_from_py_func(
-      name, lambda: _grad_fn(func_graph, grads), [], {})
-
-
-def _resolve_grad_inputs(cond_graph, grad_graph):
-  """Returns the tensors to pass as inputs to `grad_graph`.
-
-  The `grad_graph` may have external references to
-  1. Its outer graph containing the input gradients. These references are kept
-     as is.
-  2. Tensors in the forward pass graph. These tensors may not be "live"
-     when the gradient is being computed. We replace such references by their
-     corresponding tensor in the least common ancestor graph of `grad_graph` and
-     `cond_graph`. Since we export intermediate tensors for all branch
-     functions, this is always possible.
-
-  Args:
-    cond_graph: function.FuncGraph. The forward-pass function.
-    grad_graph: function.FuncGraph. The gradients function.
-
-  Returns:
-    A list of inputs tensors to be passed to grad_graph.
-  """
-  new_inputs = []
-
-  for t in grad_graph.external_captures:
-    if t.graph != grad_graph.outer_graph:
-      # `t` is a tensor in `cond_graph` or one of its ancestors. We bubble this
-      # tensor to the least common ancestor of the `cond_graph` and
-      # `grad_graph` so that it is "in-scope" for `grad_graph`.
-      # TODO(srbs): `_is_ancestor` calls may be expensive. Compute the least
-      # common ancestor once and re-use.
-      assert _is_ancestor(cond_graph, t.graph)
-      while not _is_ancestor(grad_graph, t.graph):
-        assert isinstance(t.graph, _function.FuncGraph)
-        if t in t.graph.internal_captures:
-          # TODO(srbs): Consider building a map of internal_captures ->
-          # external_captures instead of searching for `t` twice.
-          t = t.graph.external_captures[t.graph.internal_captures.index(t)]
-        else:
-          # Note: All intermediate tensors are output by the If op.
-          # TODO(srbs): .index() calls may be expensive. Optimize.
-          t = t.graph._if.outputs[t.graph.outputs.index(t)]
-      assert _is_ancestor(grad_graph, t.graph)
-    new_inputs.append(t)
-
-  return new_inputs
-
-
-def _create_new_tf_function(func_graph):
-  """Converts func_graph to a TF_Function and adds it to the current graph.
-
-  Args:
-    func_graph: function.FuncGraph
-
-  Returns:
-    The name of the new TF_Function.
-  """
-  func = _function._EagerDefinedFunction(
-      func_graph.name, func_graph, func_graph.inputs, func_graph.outputs, {})
-  func.add_to_graph(func_graph.outer_graph)
-  return func_graph.name
-
-
-def _get_intermediates(func_graph):
-  """Returns all tensors in `func_graph` that aren't inputs or outputs."""
-  intermediates = []
-  for op in func_graph.get_operations():
-    for t in op.outputs:
-      if t in func_graph.inputs: continue
-      if t in func_graph.outputs: continue
-      intermediates.append(t)
-  return intermediates
-
-
-def _separate_unique_inputs(true_inputs, false_inputs):
-  """Separates tensors appearing only in true_inputs or false_inputs, or both.
-
-  Args:
-    true_inputs: list of Tensors
-    false_inputs: list of Tensors
-
-  Returns:
-    Three lists of Tensors:
-      1. The tensors that appear in both true_inputs and false_inputs
-      2. The tensors that only appear in true_inputs
-      3. The tensors that only appear in false_inputs
-  """
-  true_inputs = set(true_inputs)
-  false_inputs = set(false_inputs)
-
-  shared_inputs = true_inputs.intersection(false_inputs)
-  true_only_inputs = true_inputs - false_inputs
-  false_only_inputs = false_inputs - true_inputs
-
-  return list(shared_inputs), list(true_only_inputs), list(false_only_inputs)
-
-
-def _pad_params(true_graph, false_graph, true_params, false_params):
-  """Returns new param lists that have matching signatures.
-
-  This is done by mirroring each param list in the other using dummy params.
-  There is no merging of params.
-
-  Args:
-    true_graph: function.FuncGraph
-    false_graph: function.FuncGraph
-    true_params: a list of Tensors from true_graph
-    false_params: a list of Tensors from false_graph
-
-  Returns:
-    A new list of Tensors in true_graph and a new list of Tensors in
-    false_graph. The two lists have the same number of Tensors, with matching
-    types and shapes across the lists.
-  """
-  new_true_params = (true_params +
-                     _create_dummy_params(true_graph, false_params))
-  new_false_inputs = (_create_dummy_params(false_graph, true_params)
-                      + false_params)
-  return new_true_params, new_false_inputs
-
-
-def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
-  """Modifies true_graph and false_graph so they have the same input signature.
-
-  This method reorders and/or adds parameters to true_graph and false_graph so
-  they have the same input signature, and updates the 'inputs' and 'captured'
-  fields of both graphs accordingly. It uses the input tensors from the outer
-  graph to avoid duplicating shared arguments.
-
-  Args:
-    true_graph: function.FuncGraph
-    false_graph: function.FuncGraph
-    true_inputs: a list of Tensors in the outer graph. The inputs for
-      true_graph.
-    false_inputs: a list of Tensors in the outer graph. The inputs for
-      false_graph.
-
-  Returns:
-    A new list of Tensors from the outer graph that are the new inputs for both
-    true_graph and false_graph. This is a deduped version of true_inputs +
-    false_inputs.
-  """
-  shared_inputs, true_only_inputs, false_only_inputs = _separate_unique_inputs(
-      true_inputs, false_inputs)
-
-  new_inputs = shared_inputs + true_only_inputs + false_only_inputs
-
-  true_input_to_param = dict(zip(true_inputs, true_graph.inputs))
-  false_input_to_param = dict(zip(false_inputs, false_graph.inputs))
-
-  true_graph.inputs = (
-      [true_input_to_param[t] for t in shared_inputs] +
-      [true_input_to_param[t] for t in true_only_inputs] +
-      _create_dummy_params(true_graph, false_only_inputs))
-
-  false_graph.inputs = (
-      [false_input_to_param[t] for t in shared_inputs] +
-      _create_dummy_params(false_graph, true_only_inputs) +
-      [false_input_to_param[t] for t in false_only_inputs])
-
-  # Rewrite the FuncGraphs' state to reflect the new inputs.
-  true_graph.captures = collections.OrderedDict(zip(new_inputs,
-                                                    true_graph.inputs))
-  false_graph.captures = collections.OrderedDict(zip(new_inputs,
-                                                     false_graph.inputs))
-
-  return new_inputs
-
-
-def _create_dummy_params(func_graph, template_tensors):
-  """Creates tensors in func_graph to represent template_tensors.
-
-  Args:
-    func_graph: function.FuncGraph.
-    template_tensors: a list of tensors in the outer graph.
-
-  Returns:
-    A list of tensors in func_graph.
-  """
-  with func_graph.as_default():
-    return [gen_functional_ops.fake_param(dtype=t.dtype, shape=t.shape)
-            for t in template_tensors]
-
-
-def _get_grad_fn_name(func_graph):
-  """Returns a unique name to use for the grad function of `func_graph`.
-
-  Ensures this name is unique in the entire hierarchy.
-
-  Args:
-    func_graph: The FuncGraph.
-
-  Returns:
-    A string, the name to use for the gradient function.
-  """
-  name = "%s_grad" % func_graph.name
-  outer_most_graph = func_graph
-  while isinstance(outer_most_graph, _function.FuncGraph):
-    outer_most_graph = outer_most_graph.outer_graph
-  return outer_most_graph.unique_name(name)
-
-
-def _check_same_outputs(true_graph, false_graph):
-  """Raises an error if true_graph and false_graph have different outputs."""
-  true_output_types = [t.dtype for t in true_graph.outputs]
-  false_output_types = [t.dtype for t in false_graph.outputs]
-  if (len(true_graph.outputs) != len(false_graph.outputs) or
-      true_output_types != false_output_types):
-    raise ValueError(
-        "true_fn() and false_fn() must return the same number and type of "
-        "arguments, got:\n"
-        "  true_fn: %s\n"
-        "  false_fn: %s" % (true_output_types, false_output_types))
-
-
-def _get_output_shapes(true_graph_outputs, false_graph_outputs):
-  output_shapes = [
-      t_out.shape.most_specific_compatible_shape(f_out.shape)
-      for t_out, f_out in zip(true_graph_outputs, false_graph_outputs)
-  ]
-  return output_shapes
-
-
-def _is_ancestor(graph, maybe_ancestor):
-  if maybe_ancestor == graph:
-    return True
-  if isinstance(graph, _function.FuncGraph):
-    return _is_ancestor(graph.outer_graph, maybe_ancestor)
-  return False
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index 825914245658b264d1cd8cde4368dbd5b21f6597..ccfe3b65c2d90b37836e2e48c3235f399f77df49 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -90,10 +90,13 @@ def remove_squeezable_dimensions(
     return labels, predictions
 
 
-@tf_export('train.confusion_matrix', 'confusion_matrix')
-@deprecation.deprecated_endpoints('confusion_matrix')
-def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
-                     name=None, weights=None):
+@tf_export('math.confusion_matrix', v1=[])
+def confusion_matrix(labels,
+                     predictions,
+                     num_classes=None,
+                     weights=None,
+                     dtype=dtypes.int32,
+                     name=None):
   """Computes the confusion matrix from predictions and labels.
 
   The matrix columns represent the prediction labels and the rows represent the
@@ -130,9 +133,9 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
     num_classes: The possible number of labels the classification task can
                  have. If this value is not provided, it will be calculated
                  using both predictions and labels array.
+    weights: An optional `Tensor` whose shape matches `predictions`.
     dtype: Data type of the confusion matrix.
     name: Scope name.
-    weights: An optional `Tensor` whose shape matches `predictions`.
 
   Returns:
     A `Tensor` of type `dtype` with shape `[n, n]` representing the confusion
@@ -191,3 +194,65 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
     zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype)
 
     return sparse_ops.sparse_add(zero_matrix, cm_sparse)
+
+
+@tf_export(v1=['math.confusion_matrix', 'confusion_matrix'])
+@deprecation.deprecated_endpoints('confusion_matrix', 'train.confusion_matrix')
+def confusion_matrix_v1(labels,
+                        predictions,
+                        num_classes=None,
+                        dtype=dtypes.int32,
+                        name=None,
+                        weights=None):
+  """Computes the confusion matrix from predictions and labels.
+
+  The matrix columns represent the prediction labels and the rows represent the
+  real labels. The confusion matrix is always a 2-D array of shape `[n, n]`,
+  where `n` is the number of valid labels for a given classification task. Both
+  prediction and labels must be 1-D arrays of the same shape in order for this
+  function to work.
+
+  If `num_classes` is `None`, then `num_classes` will be set to one plus the
+  maximum value in either predictions or labels. Class labels are expected to
+  start at 0. For example, if `num_classes` is 3, then the possible labels
+  would be `[0, 1, 2]`.
+
+  If `weights` is not `None`, then each prediction contributes its
+  corresponding weight to the total value of the confusion matrix cell.
+
+  For example:
+
+  ```python
+    tf.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
+        [[0 0 0 0 0]
+         [0 0 1 0 0]
+         [0 0 1 0 0]
+         [0 0 0 0 0]
+         [0 0 0 0 1]]
+  ```
+
+  Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`,
+  resulting in a 5x5 confusion matrix.
+
+  Args:
+    labels: 1-D `Tensor` of real labels for the classification task.
+    predictions: 1-D `Tensor` of predictions for a given classification.
+    num_classes: The possible number of labels the classification task can have.
+      If this value is not provided, it will be calculated using both
+      predictions and labels array.
+    dtype: Data type of the confusion matrix.
+    name: Scope name.
+    weights: An optional `Tensor` whose shape matches `predictions`.
+
+  Returns:
+    A `Tensor` of type `dtype` with shape `[n, n]` representing the confusion
+    matrix, where `n` is the number of possible labels in the classification
+    task.
+
+  Raises:
+    ValueError: If both predictions and labels are not 1-D vectors and have
+      mismatched shapes, or if `weights` is not `None` and its shape doesn't
+      match `predictions`.
+  """
+  return confusion_matrix(labels, predictions, num_classes, weights, dtype,
+                          name)
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 5bc217d355b60a06e39a9c7e3a3354262fea9beb..b7e50c1dae5ac1dc0968a3badb8f017e6b0384e1 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -14,8 +14,7 @@
 # ==============================================================================
 """Control Flow Operations.
 
-See the [Control
-Flow](https://tensorflow.org/api_guides/python/control_flow_ops) guide.
+See the [autograph](https://www.tensorflow.org/guide/autographs) guide.
 """
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -31,6 +30,7 @@ import six
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import control_flow_pb2
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -40,7 +40,6 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import cond_v2_impl
 from tensorflow.python.ops import control_flow_util as util
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_control_flow_ops
@@ -58,20 +57,22 @@ from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_should_use
+from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export
 
-# The while_v2 module.
-_while_v2 = None
+# This is to avoid a circular dependency:
+# cond_v2 -> gradients_impl -> control_flow_ops
+cond_v2 = LazyLoader("cond_v2", globals(),
+                     "tensorflow.python.ops.cond_v2")
 
-ENABLE_COND_V2 = os.getenv("TF_ENABLE_COND_V2", "0") != "0"
-# Note: Setting this to True is not sufficient to switch to the v2 while_loop.
-# Users must also import the while_v2 module to set the _while_v2 module
-# variable above. We do this to avoid a circular dependency:
-# control_flow_ops -> while_v2 -> gradients_impl -> control_flow_ops
-# A ValueError is raised in tf.while_loop if this is set to True and the
-# `_while_v2` module is not set.
-ENABLE_WHILE_V2 = os.getenv("TF_ENABLE_WHILE_V2", "0") != "0"
+# This is to avoid circular dependencies:
+# while_v2 -> control_flow_ops
+# while_v2 -> gradients_impl -> control_flow_ops
+while_v2 = LazyLoader("while_v2", globals(),
+                      "tensorflow.python.ops.while_v2")
 
+ENABLE_COND_V2 = tf2.enabled() or os.getenv("TF_ENABLE_COND_V2", "0") != "0"
+ENABLE_WHILE_V2 = tf2.enabled() or os.getenv("TF_ENABLE_WHILE_V2", "0") != "0"
 
 # We override the 'tuple' for a control flow op, so we keep python's
 # existing 'tuple' for later use in this module.
@@ -85,6 +86,11 @@ def _summarize_eager(tensor, summarize=None):
     tensor: EagerTensor to summarize
     summarize: Include these many first elements of `array`
   """
+  # Emulate the behavior of Tensor::SummarizeValue()
+  if summarize is None:
+    summarize = 3
+  elif summarize < 0:
+    summarize = array_ops.size(tensor)
   # reshape((-1,)) is the fastest way to get a flat array view
   if tensor._rank():  # pylint: disable=protected-access
     flat = tensor.numpy().reshape((-1,))
@@ -152,7 +158,7 @@ def Assert(condition, data, summarize=None, name=None):
 
   with ops.name_scope(name, "Assert", [condition, data]) as name:
     xs = ops.convert_n_to_tensor(data)
-    if all([x.dtype in {dtypes.string, dtypes.int32} for x in xs]):
+    if all(x.dtype in {dtypes.string, dtypes.int32} for x in xs):
       # As a simple heuristic, we assume that string and int32 are
       # on host to avoid the need to use cond. If it is not case,
       # we will pay the price copying the tensor to host memory.
@@ -345,8 +351,8 @@ def switch(data, pred, dtype=None, name=None):
   Args:
     data: The tensor to be forwarded to the appropriate output.
     pred: A scalar that specifies which output port will receive data.
-    dtype: Optional element type for the returned tensor. If missing,
-           the type is inferred from the type of `value`.
+    dtype: Optional element type for the returned tensor. If missing, the type
+      is inferred from the type of `value`.
     name: A name for this operation (optional).
 
   Returns:
@@ -451,19 +457,19 @@ def merge(inputs, name=None):
     ValueError: If any of the inputs is None, or inputs are IndexedSlices and
       some but not all have a dense_shape property.
   """
-  if any([inp is None for inp in inputs]):
+  if any(inp is None for inp in inputs):
     raise ValueError("At least one of the merge inputs is None: %s" % inputs)
   with ops.name_scope(name, "Merge", inputs) as name:
     inputs = [
         ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
         for inp in inputs
     ]
-    if all([isinstance(v, ops.Tensor) for v in inputs]):
-      if all([v.dtype._is_ref_dtype for v in inputs]):  # pylint: disable=protected-access
+    if all(isinstance(v, ops.Tensor) for v in inputs):
+      if all(v.dtype._is_ref_dtype for v in inputs):  # pylint: disable=protected-access
         return gen_control_flow_ops.ref_merge(inputs, name)
       else:
         return gen_control_flow_ops.merge(inputs, name)
-    elif all([isinstance(v, sparse_tensor.SparseTensor) for v in inputs]):
+    elif all(isinstance(v, sparse_tensor.SparseTensor) for v in inputs):
       # Only handle the case when all inputs are SparseTensor.
       values, _ = merge([inp.values for inp in inputs], name=name)
       indices, chosen_index = gen_control_flow_ops.merge(
@@ -519,8 +525,8 @@ def _convert_flows_to_tensorarrays(tensors_or_tensorarrays, tensors_or_flows):
         "Lengths of original Tensor list and new list do not match: %d vs. %d" %
         (len(tensors_or_tensorarrays), len(tensors_or_flows)))
   return [
-      _make_tensor_array(ta, t_or_flow)
-      if isinstance(ta, tensor_array_ops.TensorArray) else t_or_flow
+      _make_tensor_array(ta, t_or_flow) if isinstance(
+          ta, tensor_array_ops.TensorArray) else t_or_flow
       for (ta, t_or_flow) in zip(tensors_or_tensorarrays, tensors_or_flows)
   ]
 
@@ -551,7 +557,7 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
   if shapes is None:
     return
   flat_shapes = nest.flatten(shapes)
-  if not all([isinstance(s, tensor_shape.TensorShape) for s in flat_shapes]):
+  if not all(isinstance(s, tensor_shape.TensorShape) for s in flat_shapes):
     raise ValueError("`shapes` must be a (possibly nested) list of shapes.")
   # Check that the shapes of the inputs are less than the shape invariants,
   # and set the shapes of `enter_vars` to the shape invariants.
@@ -596,10 +602,10 @@ def _EnforceShapeInvariant(merge_var, next_var):
   """Check if the shapes of the loops variables are invariants.
 
   Args:
-    merge_var: The list of tensors representing the initial values of the
-      loop variables.
-    next_var: The list of tensors representing the values of the loop
-      variables after one loop iteration.
+    merge_var: The list of tensors representing the initial values of the loop
+      variables.
+    next_var: The list of tensors representing the values of the loop variables
+      after one loop iteration.
 
   Raises:
     ValueError: If any tensor in `merge_var` has a more specific shape than
@@ -616,8 +622,7 @@ def _EnforceShapeInvariant(merge_var, next_var):
           "Input tensor '%s' enters the loop with shape %s, but has shape %s "
           "after one iteration. To allow the shape to vary across iterations, "
           "use the `shape_invariants` argument of tf.while_loop to specify a "
-          "less-specific shape." %
-          (input_t.name, input_t.shape, n_shape))
+          "less-specific shape." % (input_t.name, input_t.shape, n_shape))
   else:
     if not isinstance(merge_var,
                       (ops.IndexedSlices, sparse_tensor.SparseTensor)):
@@ -707,9 +712,9 @@ def GetMaxSizeFromNestedMaximumIterations(value, while_ctxt):
   Args:
     value: The value inside the while_loop forward context.  Used for printing
       error messages.
-    while_ctxt: The forward context inside which value resides.  This does
-      not always match the value's immediate context, as `value` may be
-      inside e.g. a cond context inside the while_loop.
+    while_ctxt: The forward context inside which value resides.  This does not
+      always match the value's immediate context, as `value` may be inside e.g.
+      a cond context inside the while_loop.
 
   Returns:
     A tensor containing the `max_size` to feed to a Stack initializer.
@@ -1479,6 +1484,7 @@ def ZerosLikeOutsideLoop(op, index):
       return array_ops.zeros_like(val, optimize=False)
 
 
+@six.add_metaclass(abc.ABCMeta)
 class ControlFlowContext(object):
   """The base class for control flow context.
 
@@ -1644,8 +1650,8 @@ class ControlFlowContext(object):
           internal_control_inputs.append(x)
     external_control_inputs = []
     if len(internal_control_inputs) != len(op.control_inputs):
-      external_control_inputs = list(set(op.control_inputs)
-                                     - set(internal_control_inputs))
+      external_control_inputs = list(
+          set(op.control_inputs) - set(internal_control_inputs))
       op._remove_all_control_inputs()
       op._add_control_inputs(internal_control_inputs)
     return internal_control_inputs, external_control_inputs
@@ -1730,8 +1736,8 @@ class CondContext(ControlFlowContext):
     self._pivot = g.as_graph_element(
         ops.prepend_name_scope(context_def.pivot_name, import_scope))
     self._branch = context_def.branch
-    super(CondContext, self).__init__(values_def=context_def.values_def,
-                                      import_scope=import_scope)
+    super(CondContext, self).__init__(
+        values_def=context_def.values_def, import_scope=import_scope)
 
   @property
   def pred(self):
@@ -1777,8 +1783,8 @@ class CondContext(ControlFlowContext):
       context_def.pivot_name = ops.strip_name_scope(self._pivot.name,
                                                     export_scope)
       context_def.branch = self._branch
-      context_def.values_def.MergeFrom(super(CondContext, self)._to_values_def(
-          export_scope))
+      context_def.values_def.MergeFrom(
+          super(CondContext, self)._to_values_def(export_scope))
       for nested in self._nested_contexts:
         nested_def = context_def.nested_contexts.add()
         nested.to_control_flow_context_def(nested_def)
@@ -1790,8 +1796,7 @@ class CondContext(ControlFlowContext):
   @staticmethod
   def from_proto(context_def, import_scope=None):
     """Returns a `CondContext` object created from `context_def`."""
-    ret = CondContext(context_def=context_def,
-                      import_scope=import_scope)
+    ret = CondContext(context_def=context_def, import_scope=import_scope)
 
     ret.Enter()
     for nested_def in context_def.nested_contexts:
@@ -1826,7 +1831,15 @@ class CondContext(ControlFlowContext):
       result.op._set_control_flow_context(self)
       # pylint: enable=protected-access
 
-      self._values.add(result.name)
+      # Mark Switch output as seen by this context and any outer contexts,
+      # just like what we do for normal op outputs in _AddOpInternal() below.
+      ctxt = self
+      while ctxt is not None:
+        # pylint: disable=protected-access
+        ctxt._values.add(result.name)
+        ctxt = ctxt._outer_context
+        # pylint: enable=protected-access
+
       self._external_values[val.name] = result
     return result
 
@@ -1840,8 +1853,8 @@ class CondContext(ControlFlowContext):
       # loop.
       self._RemoveExternalControlEdges(op)
 
-      if not any(util.OpInContext(input_op, self)
-                 for input_op in op.control_inputs):
+      if not any(
+          util.OpInContext(input_op, self) for input_op in op.control_inputs):
         # pylint: disable=protected-access
         op._add_control_input(self._pivot.op)
         # pylint: enable=protected-access
@@ -1963,7 +1976,7 @@ def _UnpackIfSingleton(res):
 
 # pylint: disable=redefined-outer-name
 # pylint: disable=g-doc-args
-@tf_export("cond")
+@tf_export(v1=["cond"])
 @deprecation.deprecated_args(
     None, "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
     "fn1", "fn2")
@@ -2040,7 +2053,7 @@ def cond(pred,
 
   """
   if ENABLE_COND_V2 and not context.executing_eagerly():
-    return cond_v2_impl.cond_v2(pred, true_fn, false_fn, name)
+    return cond_v2.cond_v2(pred, true_fn, false_fn, name)
 
   # We needed to make true_fn/false_fn keyword arguments for
   # backwards-compatibility. This check exists so that we can convert back to
@@ -2160,6 +2173,77 @@ def cond(pred,
 # pylint: enable=redefined-outer-name
 
 
+@tf_export("cond", v1=[])
+def cond_for_tf_v2(pred,
+                   true_fn=None,
+                   false_fn=None,
+                   name=None):
+  """Return `true_fn()` if the predicate `pred` is true else `false_fn()`.
+
+  `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
+  `false_fn` must have the same non-zero number and type of outputs.
+
+  **WARNING**: Any Tensors or Operations created outside of `true_fn` and
+  `false_fn` will be executed regardless of which branch is selected at runtime.
+
+  Although this behavior is consistent with the dataflow model of TensorFlow,
+  it has frequently surprised users who expected a lazier semantics.
+  Consider the following simple program:
+
+  ```python
+  z = tf.multiply(a, b)
+  result = tf.cond(x < y, lambda: tf.add(x, z), lambda: tf.square(y))
+  ```
+
+  If `x < y`, the `tf.add` operation will be executed and `tf.square`
+  operation will not be executed. Since `z` is needed for at least one
+  branch of the `cond`, the `tf.multiply` operation is always executed,
+  unconditionally.
+
+  Note that `cond` calls `true_fn` and `false_fn` *exactly once* (inside the
+  call to `cond`, and not at all during `Session.run()`). `cond`
+  stitches together the graph fragments created during the `true_fn` and
+  `false_fn` calls with some additional graph nodes to ensure that the right
+  branch gets executed depending on the value of `pred`.
+
+  `tf.cond` supports nested structures as implemented in
+  `tensorflow.python.util.nest`. Both `true_fn` and `false_fn` must return the
+  same (possibly nested) value structure of lists, tuples, and/or named tuples.
+  Singleton lists and tuples form the only exceptions to this: when returned by
+  `true_fn` and/or `false_fn`, they are implicitly unpacked to single values.
+
+  Args:
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
+    name: Optional name prefix for the returned tensors.
+
+  Returns:
+    Tensors returned by the call to either `true_fn` or `false_fn`. If the
+    callables return a singleton list, the element is extracted from the list.
+
+  Raises:
+    TypeError: if `true_fn` or `false_fn` is not callable.
+    ValueError: if `true_fn` and `false_fn` do not return the same number of
+      tensors, or return tensors of different types.
+
+  Example:
+
+  ```python
+  x = tf.constant(2)
+  y = tf.constant(5)
+  def f1(): return tf.multiply(x, 17)
+  def f2(): return tf.add(y, 23)
+  r = tf.cond(tf.less(x, y), f1, f2)
+  # r is set to f1().
+  # Operations in f2 (e.g., tf.add) are not executed.
+  ```
+
+  """
+  return cond(pred, true_fn=true_fn, false_fn=false_fn, strict=True, name=name)
+
+
 def _resource_safe_shape(t):
   """Returns the shape of t or the variable it points to."""
   if t.dtype == dtypes.resource:
@@ -2193,8 +2277,8 @@ class WhileContext(ControlFlowContext):
       swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
       name: Optional name prefix for the returned tensors.
       grad_state: The gradient loop state.
-      context_def: Optional `WhileContextDef` protocol buffer to initialize
-        the `Whilecontext` python object from.
+      context_def: Optional `WhileContextDef` protocol buffer to initialize the
+        `Whilecontext` python object from.
       import_scope: Optional `string`. Name scope to add. Only used when
         initialing from protocol buffer.
     """
@@ -2367,8 +2451,7 @@ class WhileContext(ControlFlowContext):
           ops.strip_name_scope(l.name, export_scope) for l in self._loop_enters
       ])
       context_def.values_def.MergeFrom(
-          super(WhileContext, self)._to_values_def(
-              export_scope=export_scope))
+          super(WhileContext, self)._to_values_def(export_scope=export_scope))
       for nested in self._nested_contexts:
         nested_def = context_def.nested_contexts.add()
         nested.to_control_flow_context_def(nested_def)
@@ -2391,8 +2474,7 @@ class WhileContext(ControlFlowContext):
     Returns:
       A `WhileContext` Python object.
     """
-    ret = WhileContext(context_def=context_def,
-                       import_scope=import_scope)
+    ret = WhileContext(context_def=context_def, import_scope=import_scope)
     ret.Enter()
     for nested_def in context_def.nested_contexts:
       from_control_flow_context_def(nested_def, import_scope=import_scope)
@@ -2517,8 +2599,11 @@ class WhileContext(ControlFlowContext):
       # ignore ops which don't have outputs. TODO(apassos): fix that
       with ops.control_dependencies(None):
         self.Enter()
-        external_inputs = [array_ops.identity(x.outputs[0]).op
-                           for x in external_inputs if x.outputs]
+        external_inputs = [
+            array_ops.identity(x.outputs[0]).op
+            for x in external_inputs
+            if x.outputs
+        ]
         self.Exit()
       op._add_control_inputs(external_inputs)  # pylint: disable=protected-access
     if self._outer_context or not util.IsLoopExit(op):
@@ -2768,8 +2853,8 @@ class WhileContext(ControlFlowContext):
     if self.outer_context:
       self.outer_context.Enter()
     if values.get_shape().is_fully_defined():
-      values_shape = tensor_shape.TensorShape(
-          [tensor_shape.Dimension(1)] + values.get_shape().dims[1:])
+      values_shape = tensor_shape.TensorShape([tensor_shape.Dimension(1)] +
+                                              values.get_shape().dims[1:])
       if self.outer_context:
         self.outer_context.Enter()
       values_acc = constant_op.constant(
@@ -2792,8 +2877,8 @@ class WhileContext(ControlFlowContext):
           self.outer_context.Exit()
       else:
         shape_acc = array_ops.zeros_like(
-            array_ops.shape_internal(op.inputs[0], optimize=False,
-                                     out_type=dense_shape.dtype),
+            array_ops.shape_internal(
+                op.inputs[0], optimize=False, out_type=dense_shape.dtype),
             optimize=False)
 
     if self.outer_context:
@@ -3051,7 +3136,186 @@ class WhileContext(ControlFlowContext):
 
 
 # pylint: disable=redefined-outer-name
-@tf_export("while_loop")
+@tf_export("while_loop", v1=[])
+def while_loop_v2(cond,
+                  body,
+                  loop_vars,
+                  shape_invariants=None,
+                  parallel_iterations=10,
+                  back_prop=True,
+                  swap_memory=False,
+                  maximum_iterations=None,
+                  name=None):
+  """Repeat `body` while the condition `cond` is true.
+
+  `cond` is a callable returning a boolean scalar tensor. `body` is a callable
+  returning a (possibly nested) tuple, namedtuple or list of tensors of the same
+  arity (length and structure) and types as `loop_vars`. `loop_vars` is a
+  (possibly nested) tuple, namedtuple or list of tensors that is passed to both
+  `cond` and `body`. `cond` and `body` both take as many arguments as there are
+  `loop_vars`.
+
+  In addition to regular Tensors or IndexedSlices, the body may accept and
+  return TensorArray objects.  The flows of the TensorArray objects will
+  be appropriately forwarded between loops and during gradient calculations.
+
+  Note that `while_loop` calls `cond` and `body` *exactly once* (inside the
+  call to `while_loop`, and not at all during `Session.run()`). `while_loop`
+  stitches together the graph fragments created during the `cond` and `body`
+  calls with some additional graph nodes to create the graph flow that
+  repeats `body` until `cond` returns false.
+
+  For correctness, `tf.while_loop()` strictly enforces shape invariants for
+  the loop variables. A shape invariant is a (possibly partial) shape that
+  is unchanged across the iterations of the loop. An error will be raised
+  if the shape of a loop variable after an iteration is determined to be more
+  general than or incompatible with its shape invariant. For example, a shape
+  of [11, None] is more general than a shape of [11, 17], and [11, 21] is not
+  compatible with [11, 17]. By default (if the argument `shape_invariants` is
+  not specified), it is assumed that the initial shape of each tensor in
+  `loop_vars` is the same in every iteration. The `shape_invariants` argument
+  allows the caller to specify a less specific shape invariant for each loop
+  variable, which is needed if the shape varies between iterations. The
+  `tf.Tensor.set_shape`
+  function may also be used in the `body` function to indicate that
+  the output loop variable has a particular shape. The shape invariant for
+  SparseTensor and IndexedSlices are treated specially as follows:
+
+  a) If a loop variable is a SparseTensor, the shape invariant must be
+  TensorShape([r]) where r is the rank of the dense tensor represented
+  by the sparse tensor. It means the shapes of the three tensors of the
+  SparseTensor are ([None], [None, r], [r]). NOTE: The shape invariant here
+  is the shape of the SparseTensor.dense_shape property. It must be the shape of
+  a vector.
+
+  b) If a loop variable is an IndexedSlices, the shape invariant must be
+  a shape invariant of the values tensor of the IndexedSlices. It means
+  the shapes of the three tensors of the IndexedSlices are (shape, [shape[0]],
+  [shape.ndims]).
+
+  `while_loop` implements non-strict semantics, enabling multiple iterations
+  to run in parallel. The maximum number of parallel iterations can be
+  controlled by `parallel_iterations`, which gives users some control over
+  memory consumption and execution order. For correct programs, `while_loop`
+  should return the same result for any parallel_iterations > 0.
+
+  For training, TensorFlow stores the tensors that are produced in the
+  forward inference and are needed in back propagation. These tensors are a
+  main source of memory consumption and often cause OOM errors when training
+  on GPUs. When the flag swap_memory is true, we swap out these tensors from
+  GPU to CPU. This for example allows us to train RNN models with very long
+  sequences and large batches.
+
+  Args:
+    cond: A callable that represents the termination condition of the loop.
+    body: A callable that represents the loop body.
+    loop_vars: A (possibly nested) tuple, namedtuple or list of numpy array,
+      `Tensor`, and `TensorArray` objects.
+    shape_invariants: The shape invariants for the loop variables.
+    parallel_iterations: The number of iterations allowed to run in parallel. It
+      must be a positive integer.
+    back_prop: Whether backprop is enabled for this while loop.
+    swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
+    maximum_iterations: Optional maximum number of iterations of the while loop
+      to run.  If provided, the `cond` output is AND-ed with an additional
+      condition ensuring the number of iterations executed is no greater than
+      `maximum_iterations`.
+    name: Optional name prefix for the returned tensors.
+
+  Returns:
+    The output tensors for the loop variables after the loop. The return value
+      has the same structure as `loop_vars`.
+
+  Raises:
+    TypeError: if `cond` or `body` is not callable.
+    ValueError: if `loop_vars` is empty.
+
+  Example:
+
+  ```python
+  i = tf.constant(0)
+  c = lambda i: tf.less(i, 10)
+  b = lambda i: tf.add(i, 1)
+  r = tf.while_loop(c, b, [i])
+  ```
+
+  Example with nesting and a namedtuple:
+
+  ```python
+  import collections
+  Pair = collections.namedtuple('Pair', 'j, k')
+  ijk_0 = (tf.constant(0), Pair(tf.constant(1), tf.constant(2)))
+  c = lambda i, p: i < 10
+  b = lambda i, p: (i + 1, Pair((p.j + p.k), (p.j - p.k)))
+  ijk_final = tf.while_loop(c, b, ijk_0)
+  ```
+
+  Example using shape_invariants:
+
+  ```python
+  i0 = tf.constant(0)
+  m0 = tf.ones([2, 2])
+  c = lambda i, m: i < 10
+  b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
+  tf.while_loop(
+      c, b, loop_vars=[i0, m0],
+      shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])
+  ```
+
+  Example which demonstrates non-strict semantics: In the following
+  example, the final value of the counter `i` does not depend on `x`. So
+  the `while_loop` can increment the counter parallel to updates of `x`.
+  However, because the loop counter at one loop iteration depends
+  on the value at the previous iteration, the loop counter itself cannot
+  be incremented in parallel. Hence if we just want the final value of the
+  counter (which we print on the line `print(sess.run(i))`), then
+  `x` will never be incremented, but the counter will be updated on a
+  single thread. Conversely, if we want the value of the output (which we
+  print on the line `print(sess.run(out).shape)`), then the counter may be
+  incremented on its own thread, while `x` can be incremented in
+  parallel on a separate thread. In the extreme case, it is conceivable
+  that the thread incrementing the counter runs until completion before
+  `x` is incremented even a single time. The only thing that can never
+  happen is that the thread updating `x` can never get ahead of the
+  counter thread because the thread incrementing `x` depends on the value
+  of the counter.
+
+  ```python
+  import tensorflow as tf
+
+  n = 10000
+  x = tf.constant(list(range(n)))
+  c = lambda i, x: i < n
+  b = lambda i, x: (tf.Print(i + 1, [i]), tf.Print(x + 1, [i], "x:"))
+  i, out = tf.while_loop(c, b, (0, x))
+  with tf.Session() as sess:
+      print(sess.run(i))  # prints [0] ... [9999]
+
+      # The following line may increment the counter and x in parallel.
+      # The counter thread may get ahead of the other thread, but not the
+      # other way around. So you may see things like
+      # [9996] x:[9987]
+      # meaning that the counter thread is on iteration 9996,
+      # while the other thread is on iteration 9987
+      print(sess.run(out).shape)
+  ```
+
+  """
+  return while_loop(
+      cond=cond,
+      body=body,
+      loop_vars=loop_vars,
+      shape_invariants=shape_invariants,
+      parallel_iterations=parallel_iterations,
+      back_prop=back_prop,
+      swap_memory=swap_memory,
+      name=name,
+      maximum_iterations=maximum_iterations,
+      return_same_structure=True)
+
+
+# pylint: disable=redefined-outer-name
+@tf_export(v1=["while_loop"])
 def while_loop(cond,
                body,
                loop_vars,
@@ -3128,8 +3392,8 @@ def while_loop(cond,
     loop_vars: A (possibly nested) tuple, namedtuple or list of numpy array,
       `Tensor`, and `TensorArray` objects.
     shape_invariants: The shape invariants for the loop variables.
-    parallel_iterations: The number of iterations allowed to run in parallel.
-      It must be a positive integer.
+    parallel_iterations: The number of iterations allowed to run in parallel. It
+      must be a positive integer.
     back_prop: Whether backprop is enabled for this while loop.
     swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
     name: Optional name prefix for the returned tensors.
@@ -3224,12 +3488,14 @@ def while_loop(cond,
 
   """
   if ENABLE_WHILE_V2 and not context.executing_eagerly():
-    if not _while_v2:
-      raise ValueError("The while_v2 module is not set. Did you forget to "
-                       "import tensorflow.python.ops."
-                       "while_v2?")
-    return _while_v2.while_loop(
-        cond, body, loop_vars, shape_invariants=shape_invariants, name=name)
+    return while_v2.while_loop(
+        cond,
+        body,
+        loop_vars,
+        shape_invariants=shape_invariants,
+        maximum_iterations=maximum_iterations,
+        name=name,
+        return_same_structure=return_same_structure)
 
   with ops.name_scope(name, "while", loop_vars):
     if not loop_vars:
@@ -3450,7 +3716,43 @@ def group(*inputs, **kwargs):
       return no_op(name=name)
 
 
-@tf_export("tuple")
+@tf_export("tuple", v1=[])
+def tuple_v2(tensors, control_inputs=None, name=None):
+  """Group tensors together.
+
+  This creates a tuple of tensors with the same values as the `tensors`
+  argument, except that the value of each tensor is only returned after the
+  values of all tensors have been computed.
+
+  `control_inputs` contains additional ops that have to finish before this op
+  finishes, but whose outputs are not returned.
+
+  This can be used as a "join" mechanism for parallel computations: all the
+  argument tensors can be computed in parallel, but the values of any tensor
+  returned by `tuple` are only available after all the parallel computations
+  are done.
+
+  See also `tf.group` and
+  `tf.control_dependencies`.
+
+  Args:
+    tensors: A list of `Tensor`s or `IndexedSlices`, some entries can be `None`.
+    control_inputs: List of additional ops to finish before returning.
+    name: (optional) A name to use as a `name_scope` for the operation.
+
+  Returns:
+    Same as `tensors`.
+
+  Raises:
+    ValueError: If `tensors` does not contain any `Tensor` or `IndexedSlices`.
+    TypeError: If `control_inputs` is not a list of `Operation` or `Tensor`
+      objects.
+
+  """
+  return tuple(tensors=tensors, name=name, control_inputs=control_inputs)  # pylint: disable=redefined-builtin
+
+
+@tf_export(v1=["tuple"])
 def tuple(tensors, name=None, control_inputs=None):  # pylint: disable=redefined-builtin
   """Group tensors together.
 
@@ -3486,12 +3788,15 @@ def tuple(tensors, name=None, control_inputs=None):  # pylint: disable=redefined
   if context.executing_eagerly():
     return tensors
   with ops.name_scope(name, "tuple", tensors) as name:
-    tensors = [t if (isinstance(t, ops.Operation)
-                     or tensor_util.is_tensor(t)
-                     or t is None)
-               else ops.convert_to_tensor(t) for t in tensors]
-    gating_ops = [t if isinstance(t, ops.Operation) else t.op for t in tensors
-                  if t is not None]
+    tensors = [
+        t if (isinstance(t, ops.Operation) or tensor_util.is_tensor(t) or
+              t is None) else ops.convert_to_tensor(t) for t in tensors
+    ]
+    gating_ops = [
+        t if isinstance(t, ops.Operation) else t.op
+        for t in tensors
+        if t is not None
+    ]
     if control_inputs:
       for c in control_inputs:
         if isinstance(c, ops.Tensor):
@@ -3576,12 +3881,13 @@ def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name,
   """Verifies input arguments for the case function.
 
   Args:
-    pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor,
-                   and a callable which returns a list of tensors.
+    pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor, and a
+      callable which returns a list of tensors.
     exclusive: True iff at most one predicate is allowed to evaluate to `True`.
     name: A name for the case operation.
     allow_python_preds: if true, pred_fn_pairs may contain Python bools in
-                        addition to boolean Tensors
+      addition to boolean Tensors
+
   Raises:
     TypeError: If `pred_fn_pairs` is not a list/dictionary.
     TypeError: If `pred_fn_pairs` is a list but does not contain 2-tuples.
@@ -3597,11 +3903,22 @@ def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name,
   if isinstance(pred_fn_pairs, collections.OrderedDict):
     pred_fn_pairs = pred_fn_pairs.items()
   elif isinstance(pred_fn_pairs, dict):
-    pred_fn_pairs = sorted(pred_fn_pairs.items(), key=lambda item: item[0].name)
-    if not exclusive:
-      logging.warn("%s: An unordered dictionary of predicate/fn pairs was "
-                   "provided, but exclusive=False. The order of conditional "
-                   "tests is deterministic but not guaranteed.", name)
+    if context.executing_eagerly():
+      # No name to sort on in eager mode. Use dictionary traversal order,
+      # which is nondeterministic in versions of Python < 3.6
+      if not exclusive:
+        raise ValueError("Unordered dictionaries are not supported for the "
+                         "`pred_fn_pairs` argument when `exclusive=False` and "
+                         "eager mode is enabled.")
+      pred_fn_pairs = list(pred_fn_pairs.items())
+    else:
+      pred_fn_pairs = sorted(
+          pred_fn_pairs.items(), key=lambda item: item[0].name)
+      if not exclusive:
+        logging.warn(
+            "%s: An unordered dictionary of predicate/fn pairs was "
+            "provided, but exclusive=False. The order of conditional "
+            "tests is deterministic but not guaranteed.", name)
   for pred_fn_pair in pred_fn_pairs:
     if not isinstance(pred_fn_pair, _basetuple) or len(pred_fn_pair) != 2:
       raise TypeError("Each entry in pred_fn_pairs must be a 2-tuple")
@@ -3622,19 +3939,24 @@ def _case_verify_and_canonicalize_args(pred_fn_pairs, exclusive, name,
   return predicates, actions
 
 
-def _case_helper(cond_fn, pred_fn_pairs, default,
-                 exclusive, name, allow_python_preds=False, **cond_kwargs):
+def _case_helper(cond_fn,
+                 pred_fn_pairs,
+                 default,
+                 exclusive,
+                 name,
+                 allow_python_preds=False,
+                 **cond_kwargs):
   """Implementation of case that allows for different cond functions.
 
   Args:
     cond_fn: method that has signature and semantics of `cond` above.
     pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor, and a
-                   callable which returns a list of tensors.
+      callable which returns a list of tensors.
     default: Optional callable that returns a list of tensors.
     exclusive: True iff at most one predicate is allowed to evaluate to `True`.
     name: A name for this operation (optional).
     allow_python_preds: if true, pred_fn_pairs may contain Python bools in
-                        addition to boolean Tensors
+      addition to boolean Tensors
     **cond_kwargs: keyword arguments that will be passed to `cond_fn`.
 
   Returns:
@@ -3692,7 +4014,7 @@ def case(pred_fn_pairs,
   operation returns the tensors generated by `default`.
 
   `tf.case` supports nested structures as implemented in
-  `tensorflow.python.util.nest`. All of the callables must return the same
+  `tf.contrib.framework.nest`. All of the callables must return the same
   (possibly nested) value structure of lists, tuples, and/or named tuples.
   Singleton lists and tuples form the only exceptions to this: when returned by
   a callable, they are implicitly unpacked to single values. This
@@ -3703,6 +4025,12 @@ def case(pred_fn_pairs,
   deterministic, so that variables created in conditional branches are created
   in fixed order across runs.
 
+  @compatibility{eager}
+  Unordered dictionaries are not supported in eager mode when `exclusive=False`.
+  Use a list of tuples instead.
+  @end_compatibility
+
+
   **Example 1:**
 
   Pseudocode:
@@ -3717,7 +4045,7 @@ def case(pred_fn_pairs,
   ```python
   f1 = lambda: tf.constant(17)
   f2 = lambda: tf.constant(23)
-  r = case([(tf.less(x, y), f1)], default=f2)
+  r = tf.case([(tf.less(x, y), f1)], default=f2)
   ```
 
   **Example 2:**
@@ -3725,7 +4053,7 @@ def case(pred_fn_pairs,
   Pseudocode:
 
   ```
-  if (x < y && x > z) raise OpError("Only one predicate may evaluate true");
+  if (x < y && x > z) raise OpError("Only one predicate may evaluate to True");
   if (x < y) return 17;
   else if (x > z) return 23;
   else return -1;
@@ -3737,13 +4065,13 @@ def case(pred_fn_pairs,
   def f1(): return tf.constant(17)
   def f2(): return tf.constant(23)
   def f3(): return tf.constant(-1)
-  r = case({tf.less(x, y): f1, tf.greater(x, z): f2},
+  r = tf.case({tf.less(x, y): f1, tf.greater(x, z): f2},
            default=f3, exclusive=True)
   ```
 
   Args:
     pred_fn_pairs: Dict or list of pairs of a boolean scalar tensor and a
-                   callable which returns a list of tensors.
+      callable which returns a list of tensors.
     default: Optional callable that returns a list of tensors.
     exclusive: True iff at most one predicate is allowed to evaluate to `True`.
     strict: A boolean that enables/disables 'strict' mode; see above.
@@ -3759,8 +4087,14 @@ def case(pred_fn_pairs,
     TypeError: If `fns[i]` is not callable for any i, or `default` is not
                callable.
   """
-  return _case_helper(cond, pred_fn_pairs, default, exclusive, name,
-                      allow_python_preds=False, strict=strict)
+  return _case_helper(
+      cond,
+      pred_fn_pairs,
+      default,
+      exclusive,
+      name,
+      allow_python_preds=False,
+      strict=strict)
 
 
 class XLAControlFlowContext(ControlFlowContext):
@@ -3770,6 +4104,12 @@ class XLAControlFlowContext(ControlFlowContext):
     super(XLAControlFlowContext, self).__init__()
     self._name = "XLAControlFlowContext"
 
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    # pylint: disable=useless-super-delegation
+    # NOTE(slebedev): the method is required by `ControlFlowContext`.
+    super(XLAControlFlowContext, self).to_control_flow_context_def(
+        context_def, export_scope)
+
   def IsXLAContext(self):
     return True
 
@@ -3791,13 +4131,13 @@ def from_control_flow_context_def(context_def, import_scope=None):
     A ControlFlowContext subclass
   """
   if context_def.HasField("cond_ctxt"):
-    return CondContext.from_proto(context_def.cond_ctxt,
-                                  import_scope=import_scope)
+    return CondContext.from_proto(
+        context_def.cond_ctxt, import_scope=import_scope)
   if context_def.HasField("while_ctxt"):
-    return WhileContext.from_proto(context_def.while_ctxt,
-                                   import_scope=import_scope)
-  raise NotImplementedError("Unknown ControlFlowContextDef field: %s"
-                            % context_def.WhichOneof("ctxt"))
+    return WhileContext.from_proto(
+        context_def.while_ctxt, import_scope=import_scope)
+  raise NotImplementedError("Unknown ControlFlowContextDef field: %s" %
+                            context_def.WhichOneof("ctxt"))
 
 
 ops.register_proto_function(
diff --git a/tensorflow/python/ops/control_flow_ops_benchmark.py b/tensorflow/python/ops/control_flow_ops_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ba5ff2c0f8af44e8536b49a3c0e7ef6bfae4d28
--- /dev/null
+++ b/tensorflow/python/ops/control_flow_ops_benchmark.py
@@ -0,0 +1,122 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmark for control flow ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+class CondWithManyIntermediatesBenchmark(test.Benchmark):
+  """Checks the runtime performance of outputting all intermediates."""
+
+  NUM_INTERMEDIATES = 1000
+  NUM_ITERS = 500
+  NUM_WARM_UP_ITERS = 50
+
+  def _create_cond(self, x):
+
+    def branch_fn():
+      # Use a random value so the adds can't be constant folded.
+      return x + sum(random_ops.random_normal([])
+                     for _ in range(self.NUM_INTERMEDIATES))
+
+    # Use a dynamic predicate to make sure the cond isn't constant folded.
+    return control_flow_ops.cond(math_ops.not_equal(x, -1),
+                                 branch_fn, lambda: 0.0)
+
+  def _benchmark_defun(self):
+    """Benchmarks cond in a defun."""
+
+    @function.defun
+    def cond_fn(x):
+      return self._create_cond(x)
+
+    # Warm up
+    for _ in range(self.NUM_WARM_UP_ITERS):
+      cond_fn(0.0)
+
+    start_time = time.time()
+
+    for _ in range(self.NUM_ITERS):
+      cond_fn(0.0)
+
+    self.report_benchmark(
+        wall_time=time.time() - start_time,
+        iters=self.NUM_ITERS)
+
+  def _benchmark_graph(self):
+    """Benchmarks cond in legacy graph mode."""
+    with context.graph_mode():
+      with ops.Graph().as_default():
+        x = array_ops.placeholder(dtypes.float32)
+        cond_val = self._create_cond(x)
+
+        with session.Session() as sess:
+          cond_fn = sess.make_callable(cond_val, [x])
+
+          # Warm up
+          for _ in range(self.NUM_WARM_UP_ITERS):
+            cond_fn(0.0)
+
+          start_time = time.time()
+
+          for _ in range(self.NUM_ITERS):
+            cond_fn(0.0)
+
+          self.report_benchmark(
+              wall_time=time.time() - start_time,
+              iters=self.NUM_ITERS)
+
+  def benchmark_cond_v1_defun(self):
+    old_val = control_flow_ops.ENABLE_COND_V2
+    control_flow_ops.ENABLE_COND_V2 = False
+    self._benchmark_defun()
+    control_flow_ops.ENABLE_COND_V2 = old_val
+
+  def benchmark_cond_v2_defun(self):
+    old_val = control_flow_ops.ENABLE_COND_V2
+    control_flow_ops.ENABLE_COND_V2 = True
+    self._benchmark_defun()
+    control_flow_ops.ENABLE_COND_V2 = old_val
+
+  def benchmark_cond_v1_graph(self):
+    old_val = control_flow_ops.ENABLE_COND_V2
+    control_flow_ops.ENABLE_COND_V2 = False
+    self._benchmark_graph()
+    control_flow_ops.ENABLE_COND_V2 = old_val
+
+  def benchmark_cond_v2_graph(self):
+    old_val = control_flow_ops.ENABLE_COND_V2
+    control_flow_ops.ENABLE_COND_V2 = True
+    self._benchmark_graph()
+    control_flow_ops.ENABLE_COND_V2 = old_val
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 2c421761584f3c83072d12a0ac37f565bda31e79..0c18b7208f5c4049722012504a26563f55aeca3c 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -21,9 +21,10 @@ from __future__ import print_function
 import collections
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
-from tensorflow.python.client import session
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -126,56 +127,56 @@ class GroupTestCase(test_util.TensorFlowTestCase):
       node { name: "root" op: "NoOp" input: "^a" input: "^b" }
     """, self._StripGraph(gd))
 
+  @test_util.run_deprecated_v1
   def testPassingNonTensors(self):
-    with ops.Graph().as_default():
-      with self.assertRaises(TypeError):
-        control_flow_ops.group(1, 2)
+    with self.assertRaises(TypeError):
+      control_flow_ops.group(1, 2)
 
 
 class ShapeTestCase(test_util.TensorFlowTestCase):
 
   def testShape(self):
-    with ops.Graph().as_default():
-      tensor = constant_op.constant([1.0, 2.0])
-      self.assertEquals([2], tensor.get_shape())
-      self.assertEquals([2],
-                        control_flow_ops.with_dependencies(
-                            [constant_op.constant(1.0)], tensor).get_shape())
+    tensor = constant_op.constant([1.0, 2.0])
+    self.assertEquals([2], tensor.get_shape())
+    self.assertEquals([2],
+                      control_flow_ops.with_dependencies(
+                          [constant_op.constant(1.0)], tensor).get_shape())
 
 
 class WithDependenciesTestCase(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testTupleDependencies(self):
-    with ops.Graph().as_default():
-      counter = variable_scope.get_variable(
-          "my_counter", shape=[], initializer=init_ops.zeros_initializer())
-      increment_counter = state_ops.assign_add(counter, 1)
-      const_with_dep = control_flow_ops.with_dependencies(
-          (increment_counter, constant_op.constant(42)),
-          constant_op.constant(7))
-      with self.cached_session():
-        variables.global_variables_initializer().run()
-        self.assertEquals(0, counter.eval())
-        self.assertEquals(7, const_with_dep.eval())
-        self.assertEquals(1, counter.eval())
-
+    counter = variable_scope.get_variable(
+        "my_counter", shape=[], initializer=init_ops.zeros_initializer())
+    increment_counter = state_ops.assign_add(counter, 1)
+    const_with_dep = control_flow_ops.with_dependencies(
+        (increment_counter, constant_op.constant(42)),
+        constant_op.constant(7))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEquals(0, self.evaluate(counter))
+    self.assertEquals(7, self.evaluate(const_with_dep))
+    self.assertEquals(1, self.evaluate(counter))
+
+  @test_util.run_deprecated_v1
   def testListDependencies(self):
-    with ops.Graph().as_default():
-      counter = variable_scope.get_variable(
-          "my_counter", shape=[], initializer=init_ops.zeros_initializer())
-      increment_counter = state_ops.assign_add(counter, 1)
-      const_with_dep = control_flow_ops.with_dependencies(
-          [increment_counter, constant_op.constant(42)],
-          constant_op.constant(7))
-      with self.cached_session():
-        variables.global_variables_initializer().run()
-        self.assertEquals(0, counter.eval())
-        self.assertEquals(7, const_with_dep.eval())
-        self.assertEquals(1, counter.eval())
+    counter = variable_scope.get_variable(
+        "my_counter", shape=[], initializer=init_ops.zeros_initializer())
+    increment_counter = state_ops.assign_add(counter, 1)
+    const_with_dep = control_flow_ops.with_dependencies(
+        [increment_counter, constant_op.constant(42)],
+        constant_op.constant(7))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEquals(0, self.evaluate(counter))
+    self.assertEquals(7, self.evaluate(const_with_dep))
+    self.assertEquals(1, self.evaluate(counter))
 
 
 class SwitchTestCase(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesWithDenseShape(self):
     with self.cached_session():
       data = ops.IndexedSlices(
@@ -189,68 +190,64 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       self.assertAllEqual([1, 2, 3], switch_true.values.eval())
       self.assertAllEqual([0, 1], switch_true.indices.eval())
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesGradient(self):
-    with ops.Graph().as_default():
-      embedding_matrix = variable_scope.get_variable(
-          "embedding_matrix", [5, 5],
-          initializer=init_ops.random_normal_initializer())
-
-      def cond(it, _):
-        return it < 5
-
-      def body(it, cost):
-        embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
-        cost += math_ops.reduce_sum(embedding)
-        return it + 1, cost
-
-      _, cost = control_flow_ops.while_loop(
-          cond, body, [constant_op.constant(0),
-                       constant_op.constant(0.0)])
-      optimizer = momentum.MomentumOptimizer(0.1, 0.9)
-      train_op = optimizer.minimize(cost)
-      with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        for _ in range(10):
-          sess.run([train_op])
+    embedding_matrix = variable_scope.get_variable(
+        "embedding_matrix", [5, 5],
+        initializer=init_ops.random_normal_initializer())
+
+    def cond(it, _):
+      return it < 5
+
+    def body(it, cost):
+      embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
+      cost += math_ops.reduce_sum(embedding)
+      return it + 1, cost
+
+    _, cost = control_flow_ops.while_loop(
+        cond, body, [constant_op.constant(0),
+                     constant_op.constant(0.0)])
+    optimizer = momentum.MomentumOptimizer(0.1, 0.9)
+    train_op = optimizer.minimize(cost)
+    with self.cached_session():
+      self.evaluate(variables.global_variables_initializer())
+      for _ in range(10):
+        self.evaluate([train_op])
 
   def testResourceReadInLoop(self):
-    with ops.Graph().as_default():
-      embedding_matrix = variable_scope.get_variable(
-          "embedding_matrix",
-          initializer=[[2.0], [3.0]],
-          use_resource=True)
+    embedding_matrix = variable_scope.get_variable(
+        "embedding_matrix", initializer=[[2.0], [3.0]], use_resource=True)
 
-      def cond(it, _):
-        return it < 5
+    def cond(it, _):
+      return it < 5
 
-      def body(it, cost):
-        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
-        cost += math_ops.reduce_sum(embedding)
-        return it + 1, cost
+    def body(it, cost):
+      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
+      cost += math_ops.reduce_sum(embedding)
+      return it + 1, cost
 
-      _, cost = control_flow_ops.while_loop(
-          cond, body, [constant_op.constant(0),
-                       constant_op.constant(0.0)])
-      with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        self.assertAllEqual(10.0, cost.eval())
+    _, cost = control_flow_ops.while_loop(
+        cond, body, [constant_op.constant(0),
+                     constant_op.constant(0.0)])
+    with self.cached_session():
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(10.0, self.evaluate(cost))
 
   def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
-    with ops.Graph().as_default():
-      embedding_matrix = variable_scope.get_variable(
-          "embedding_matrix", [5, 5],
-          initializer=init_ops.random_normal_initializer(),
-          use_resource=use_resource)
-
-      def cond(it, _):
-        return it < 5
-
-      def body(it, cost):
-        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
-        cost = control_flow_ops.cond(
-            math_ops.equal(it, 3), lambda: math_ops.square(cost),
-            lambda: cost + math_ops.reduce_sum(embedding))
-        return it + 1, cost
+    embedding_matrix = variable_scope.get_variable(
+        "embedding_matrix", [5, 5],
+        initializer=init_ops.random_normal_initializer(),
+        use_resource=use_resource)
+
+    def cond(it, _):
+      return it < 5
+
+    def body(it, cost):
+      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
+      cost = control_flow_ops.cond(
+          math_ops.equal(it, 3), lambda: math_ops.square(cost),
+          (lambda: cost + math_ops.reduce_sum(embedding)))
+      return it + 1, cost
 
       _, cost = control_flow_ops.while_loop(
           cond, body, [constant_op.constant(0),
@@ -268,9 +265,9 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       static_grads = math_ops.segment_sum(static_grads.values,
                                           static_grads.indices)
 
-      with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        self.assertAllEqual(*sess.run([static_grads, dynamic_grads]))
+      with self.cached_session():
+        self.evaluate(variables.global_variables_initializer())
+        self.assertAllEqual(*self.evaluate([static_grads, dynamic_grads]))
 
   def testIndexedSlicesGradientInCondInWhileLoop(self):
     self.doTestIndexedSlicesGradientInCondInWhileLoop(use_resource=False)
@@ -278,6 +275,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
   def testIndexedSlicesGradientInCondInWhileLoopResource(self):
     self.doTestIndexedSlicesGradientInCondInWhileLoop(use_resource=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesWithShapeGradientInWhileLoop(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session() as sess:
@@ -307,6 +305,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(o, 20)
         self.assertAllEqual(grad, [1] * num_steps)
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesWithDynamicShapeGradientInWhileLoop(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session() as sess:
@@ -334,105 +333,94 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(o, 6)
         self.assertAllEqual(grad, [1] * 3)
 
+  @test_util.run_deprecated_v1
   def testGradientThroughSingleBranchOutsideOfContext(self):
-    with self.cached_session():
-      x = constant_op.constant(2.)
-      s = constant_op.constant(True)
-      x_false, x_true = control_flow_ops.switch(x, s)
-      grad_x_true = gradients_impl.gradients(x_true, x)[0]
-      grad_x_false = gradients_impl.gradients(x_false, x)[0]
-      self.assertEquals(grad_x_true.eval(), 1.)
-      self.assertEquals(grad_x_false.eval(), 0.)
+    x = constant_op.constant(2.)
+    s = constant_op.constant(True)
+    x_false, x_true = control_flow_ops.switch(x, s)
+    grad_x_true = gradients_impl.gradients(x_true, x)[0]
+    grad_x_false = gradients_impl.gradients(x_false, x)[0]
+    self.assertEquals(self.evaluate(grad_x_true), 1.)
+    self.assertEquals(self.evaluate(grad_x_false), 0.)
 
 
 class CondTest(test_util.TensorFlowTestCase):
 
   def testCondTrue(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(5)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
-            lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 34)
+    x = constant_op.constant(2)
+    y = constant_op.constant(5)
+    z = control_flow_ops.cond(
+        math_ops.less(
+            x,
+            y), lambda: math_ops.multiply(x, 17), lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 34)
 
   def testCondFalse(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(1)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
-            lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 24)
+    x = constant_op.constant(2)
+    y = constant_op.constant(1)
+    z = control_flow_ops.cond(
+        math_ops.less(
+            x,
+            y), lambda: math_ops.multiply(x, 17), lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 24)
 
   def testCondTrueLegacy(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(5)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
-            fn2=lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 34)
+    x = constant_op.constant(2)
+    y = constant_op.constant(5)
+    z = control_flow_ops.cond(
+        math_ops.less(x, y),
+        fn1=lambda: math_ops.multiply(x, 17),
+        fn2=lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 34)
 
   def testCondFalseLegacy(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(1)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
-            fn2=lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 24)
-
+    x = constant_op.constant(2)
+    y = constant_op.constant(1)
+    z = control_flow_ops.cond(
+        math_ops.less(x, y),
+        fn1=lambda: math_ops.multiply(x, 17),
+        fn2=lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 24)
+
+  @test_util.run_deprecated_v1
   def testCondModifyBoolPred(self):
     # This test in particular used to fail only when running in GPU, hence
     # use_gpu=True.
-    with ops.Graph().as_default():
-      with session.Session() as sess:
-        bool_var = variable_scope.get_variable("bool_var", dtype=dtypes.bool,
-                                               initializer=True)
-        cond_on_bool_var = control_flow_ops.cond(
-            pred=bool_var,
-            true_fn=lambda: state_ops.assign(bool_var, False),
-            false_fn=lambda: True)
-        sess.run(bool_var.initializer)
-        self.assertEquals(sess.run(cond_on_bool_var), False)
-        self.assertEquals(sess.run(cond_on_bool_var), True)
+    with test_util.use_gpu():
+      bool_var = variable_scope.get_variable(
+          "bool_var", dtype=dtypes.bool, initializer=True)
+      cond_on_bool_var = control_flow_ops.cond(
+          pred=bool_var,
+          true_fn=lambda: state_ops.assign(bool_var, False),
+          false_fn=lambda: True)
+      self.evaluate(bool_var.initializer)
+      self.assertEquals(self.evaluate(cond_on_bool_var), False)
+      self.assertEquals(self.evaluate(cond_on_bool_var), True)
 
   def testCondMissingArg1(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, false_fn=lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, false_fn=lambda: x)
 
   def testCondMissingArg2(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, lambda: x)
 
   def testCondDuplicateArg1(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, lambda: x, lambda: x, fn1=lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, lambda: x, lambda: x, fn1=lambda: x)
 
   def testCondDuplicateArg2(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
 
 
 class ContextTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testCondContext(self):
     with self.cached_session() as sess:
       x = constant_op.constant(2)
@@ -462,24 +450,38 @@ class ContextTest(test_util.TensorFlowTestCase):
               control_flow_ops.WhileContext.from_proto(
                   control_flow_context.to_proto()).to_proto())
 
+  @test_util.run_deprecated_v1
   def testWhileContext(self):
     self._testWhileContextHelper()
 
+  @test_util.run_deprecated_v1
   def testWhileContextWithMaximumIterations(self):
     self._testWhileContextHelper(maximum_iterations=10)
 
+  @test_util.run_deprecated_v1
   def testControlContextImportScope(self):
+    class NoABCControlFlowContext(control_flow_ops.ControlFlowContext):
+      """A noop wrapper around `ControlFlowContext`.
+
+      `ControlFlowContext` is an ABC and therefore cannot be instantiated.
+      """
+      # pylint: disable=useless-super-delegation
+
+      def to_control_flow_context_def(self, context_def, export_scope=None):
+        super(NoABCControlFlowContext, self).to_control_flow_context_def(
+            context_def, export_scope)
+
     with self.cached_session():
       constant_op.constant(0, name="a")
       constant_op.constant(2, name="test_scope/a")
       b1 = constant_op.constant(1, name="b")
       b2 = constant_op.constant(3, name="test_scope/b")
 
-      c = control_flow_ops.ControlFlowContext()
+      c = NoABCControlFlowContext()
       c._values = ["a", "b"]
       c._external_values = {"a": b1}
 
-      c_with_scope = control_flow_ops.ControlFlowContext(
+      c_with_scope = NoABCControlFlowContext(
           values_def=c._to_values_def(), import_scope="test_scope")
 
       # _values and _external_values should be have scope prepended.
@@ -519,7 +521,7 @@ def _raw_nested_shape(nested_shape):
 
   def _raw_shape(shape):
     if isinstance(shape, tensor_shape.TensorShape) and shape.ndims is not None:
-      return [x.value for x in shape]
+      return [x.value for x in shape.dims]
     else:
       return None
 
@@ -579,6 +581,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
       if check_cond:
         self.assertAllEqualNested(result_case, expected_value_false)
 
+  @test_util.run_deprecated_v1
   def test_int(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: 1
@@ -588,6 +591,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape, strict=True)
     self._testReturnValues(fn_true, fn_false, 1, 2, strict=True)
 
+  @test_util.run_deprecated_v1
   def test_float(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: 1.0
@@ -595,12 +599,14 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, 1.0, 2.0)
 
+  @test_util.run_deprecated_v1
   def test_noop(self):
     shape = tensor_shape.TensorShape(None)
     self._testShape(control_flow_ops.no_op, control_flow_ops.no_op, shape)
     self._testReturnValues(control_flow_ops.no_op, control_flow_ops.no_op,
                            True, False, check_cond=False)
 
+  @test_util.run_deprecated_v1
   def test_string(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: "abc"
@@ -608,6 +614,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, b"abc", b"xyz")
 
+  @test_util.run_deprecated_v1
   def test_variable(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: variables.Variable(3.0)
@@ -615,6 +622,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, 3.0, 4.0)
 
+  @test_util.run_v1_only("b/120553181")
   def test_none(self):
     fn_none = lambda: None
     fn_tensor = lambda: constant_op.constant(1)
@@ -625,6 +633,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       control_flow_ops.cond(constant_op.constant(True), fn_tensor, fn_none)
 
+  @test_util.run_deprecated_v1
   def test_tensors(self):
 
     def _build_true_branch(dtype):
@@ -653,6 +662,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                              (np.zeros([2, 2]), np.ones([3, 3])),
                              (np.ones([2, 2]), np.zeros([3, 3])))
 
+  @test_util.run_deprecated_v1
   def test_tensors_unknown_shape(self):
 
     def _build_true_branch(dtype):
@@ -681,6 +691,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                              feed_dict={true_tensor: np.zeros([2, 2]),
                                         false_tensor: np.ones([2, 2])})
 
+  @test_util.run_deprecated_v1
   def test_sparse_tensors(self):
     shape = tensor_shape.TensorShape([None, None])
 
@@ -696,11 +707,14 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                              values=[1, 2], dense_shape=[3, 4])
     value2 = sparse_tensor.SparseTensorValue(indices=[[0, 0], [2, 1]],
                                              values=[3, 4], dense_shape=[3, 4])
-    self._testShape(true_fn, false_fn, shape)
-    self._testReturnValues(true_fn, false_fn, value1, value2)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(true_fn, false_fn, shape)
+      self._testReturnValues(true_fn, false_fn, value1, value2)
     self._testShape(true_fn, false_fn, [shape], strict=True)
     self._testReturnValues(true_fn, false_fn, [value1], [value2], strict=True)
 
+  @test_util.run_deprecated_v1
   def test_tensors_with_partially_specified_shapes(self):
 
     def _build_branch(dtype, shape):
@@ -730,6 +744,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                         true_tensors[2]: np.ones([3, 3]),
                                         false_tensors[2]: np.ones([3, 3])})
 
+  @test_util.run_deprecated_v1
   def test_tensor_arrays(self):
     element_shape = tensor_shape.TensorShape([2])
     ta1 = _create_tensor_array(4, element_shape)
@@ -739,6 +754,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     fn_false = lambda: ta2
     self._testShape(fn_true, fn_false, shape)
 
+  @test_util.run_deprecated_v1
   def test_tensor_array_reads(self):
     shape = tensor_shape.TensorShape([2])
     ta = _create_tensor_array(4, shape)
@@ -746,6 +762,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     fn_false = lambda: ta.read(1)
     self._testShape(fn_true, fn_false, shape)
 
+  @test_util.run_deprecated_v1
   def test_list(self):
     shape = [tensor_shape.TensorShape([]), tensor_shape.TensorShape([]),
              tensor_shape.TensorShape([])]
@@ -754,6 +771,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, [1, 2, 3.0], [3, 4, 5.0])
 
+  @test_util.run_v1_only("Non-strict cond is only available in v1")
   def test_non_strict(self):
     shape = tensor_shape.TensorShape([])
     fn_tensor = lambda: constant_op.constant(1)
@@ -766,6 +784,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testReturnValues(fn_tensor, fn_tuple, 1, 3)
     self._testReturnValues(fn_list, fn_tuple, 2, 3)
 
+  @test_util.run_v1_only("b/120553181")
   def test_singleton_strict(self):
     fn_tensor = lambda: constant_op.constant(1)
     fn_list = lambda: [constant_op.constant(2)]
@@ -787,36 +806,46 @@ class DataTypesTest(test_util.TensorFlowTestCase):
       control_flow_ops.case([(constant_op.constant(True), fn_list)], fn_tuple,
                             strict=True)
 
+  @test_util.run_deprecated_v1
   def test_singleton_list(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: [constant_op.constant(1)]
     fn_false = lambda: [constant_op.constant(3)]
-    self._testShape(fn_true, fn_false, shape)
-    self._testReturnValues(fn_true, fn_false, 1, 3)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false, 1, 3)
     self._testShape(fn_true, fn_false, [shape], strict=True)
     self._testReturnValues(fn_true, fn_false, [1], [3], strict=True)
 
+  @test_util.run_deprecated_v1
   def test_singleton_tuple(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: (constant_op.constant(1),)
     fn_false = lambda: (constant_op.constant(3),)
-    self._testShape(fn_true, fn_false, shape)
-    self._testReturnValues(fn_true, fn_false, 1, 3)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false, 1, 3)
     self._testShape(fn_true, fn_false, (shape,), strict=True)
     self._testReturnValues(fn_true, fn_false, (1,), (3,),
                            strict=True)
 
+  @test_util.run_deprecated_v1
   def test_singleton_namedtuple(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: SingletonTestTuple(constant_op.constant(1))
     fn_false = lambda: SingletonTestTuple(constant_op.constant(3))
-    self._testShape(fn_true, fn_false, shape)
-    self._testReturnValues(fn_true, fn_false, 1, 3)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false, 1, 3)
     self._testShape(fn_true, fn_false, SingletonTestTuple(shape),
                     strict=True)
     self._testReturnValues(fn_true, fn_false, SingletonTestTuple(1),
                            SingletonTestTuple(3), strict=True)
 
+  @test_util.run_deprecated_v1
   def test_tuple(self):
     shape = (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
     fn_true = lambda: (constant_op.constant(1), 2)
@@ -824,6 +853,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, (1, 2), (3, 4))
 
+  @test_util.run_deprecated_v1
   def test_namedtuple(self):
     shape = TestTuple(tensor_shape.TensorShape([]),
                       tensor_shape.TensorShape([]))
@@ -832,6 +862,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, TestTuple(1, 2), TestTuple(3, 4))
 
+  @test_util.run_deprecated_v1
   def test_nested(self):
     shape = [tensor_shape.TensorShape([]),
              TestTuple(tensor_shape.TensorShape([]),
@@ -857,6 +888,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
         [11, TestTuple(12, [13, 14]),
          np.ones([5, 5]), 16])
 
+  @test_util.run_deprecated_v1
   def test_cond_inside_while_loop(self):
 
     def body(i, matrix):
@@ -878,6 +910,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
 class CaseTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testCase_withDefault(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -889,6 +922,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 6)
 
+  @test_util.run_deprecated_v1
   def testCase_multiple_matches_exclusive(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -902,6 +936,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 2})
 
+  @test_util.run_deprecated_v1
   def testCase_multiple_matches_non_exclusive(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -914,6 +949,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 8)
 
+  @test_util.run_deprecated_v1
   def testCase_withoutDefault(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -927,6 +963,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
+  @test_util.run_deprecated_v1
   def testCase_withoutDefault_oneCondition(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2))]
@@ -936,6 +973,16 @@ class CaseTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
+  @test_util.run_in_graph_and_eager_modes
+  def testCase_dict(self):
+    x = constant_op.constant(2)
+    conditions = {
+        math_ops.equal(x, 1): lambda: constant_op.constant(2),
+        math_ops.equal(x, 2): lambda: constant_op.constant(4)
+    }
+    output = control_flow_ops.case(conditions, exclusive=True)
+    self.assertEqual(4, self.evaluate(output))
+
 
 class WhileLoopTestCase(test_util.TensorFlowTestCase):
 
@@ -958,6 +1005,7 @@ class WhileLoopTestCase(test_util.TensorFlowTestCase):
     # Expect a tuple since that is what the body returns.
     self.assertEqual(self.evaluate(r), (10,))
 
+  @test_util.run_deprecated_v1
   def testWhileLoopSameReturnShape_False(self):
     i = constant_op.constant(0)
     c = lambda i, _: math_ops.less(i, 10)
@@ -981,5 +1029,31 @@ class WhileLoopTestCase(test_util.TensorFlowTestCase):
     self.assertEqual(self.evaluate(r), [10, []])
 
 
+class AssertTest(test_util.TensorFlowTestCase):
+
+  @test_util.run_deprecated_v1
+  def testAssert(self):
+    i = constant_op.constant(0)
+    c = control_flow_ops.Assert(i < 10, [i, [10], [i + 1]])
+    self.evaluate(c)
+
+    i = constant_op.constant(10)
+    c = control_flow_ops.Assert(i < 10, [i, [10], [i + 1]])
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(c)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAssertInFunction(self):
+
+    @def_function.function
+    def whiny(value):
+      control_flow_ops.Assert(value, ["Raised false"])
+      return constant_op.constant(5)
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(whiny(False))
+
+    self.assertAllEqual(whiny(True), 5)
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index 72c074ed1af208da274edd52572961ecaa613b34..cb628f4aa6441ec9cb03dfe873a79d06a66e37a1 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -38,6 +38,11 @@ def IsInXLAContext(op):
   return GetContainingXLAContext(ctxt) is not None
 
 
+def InXlaContext(graph):
+  ctxt = graph._get_control_flow_context()  # pylint: disable=protected-access
+  return GetContainingXLAContext(ctxt) is not None
+
+
 def IsInWhileLoop(op):
   ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
   return GetContainingWhileContext(ctxt) is not None
diff --git a/tensorflow/python/ops/control_flow_util_v2.py b/tensorflow/python/ops/control_flow_util_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f56850884a5e9e424c77515406ef8c9b513e972
--- /dev/null
+++ b/tensorflow/python/ops/control_flow_util_v2.py
@@ -0,0 +1,122 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utilties for V2 control flow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework.func_graph import FuncGraph
+from tensorflow.python.ops import control_flow_util
+
+
+class CondBranchFuncGraph(FuncGraph):
+  """FuncGraph for branches of tf.cond().
+
+  This is used to distinguish cond branches from other functions.
+  """
+  pass
+
+
+class WhileCondFuncGraph(FuncGraph):
+  """FuncGraph for the condition of tf.while_loop().
+
+  This is used to distinguish while conditions from other functions.
+  """
+  pass
+
+
+class WhileBodyFuncGraph(FuncGraph):
+  """FuncGraph for the body of tf.while_loop().
+
+  This is used to distinguish while bodies from other functions.
+  """
+  pass
+
+
+def in_defun():
+  """Returns if the current graph is, or is nested in, a defun."""
+  if context.executing_eagerly(): return False
+
+  graph = ops.get_default_graph()
+  while (isinstance(graph, CondBranchFuncGraph) or
+         isinstance(graph, WhileBodyFuncGraph)):
+    graph = graph.outer_graph
+  return isinstance(graph, FuncGraph)
+
+
+def create_new_tf_function(func_graph):
+  """Converts func_graph to a TF_Function and adds it to the current graph.
+
+  Args:
+    func_graph: FuncGraph
+
+  Returns:
+    The name of the new TF_Function.
+  """
+  func = function._EagerDefinedFunction(  # pylint: disable=protected-access
+      func_graph.name, func_graph, func_graph.inputs, func_graph.outputs, {})
+  func.add_to_graph(func_graph.outer_graph)
+  return func_graph.name
+
+
+def unique_fn_name(scope, name):
+  """Returns a unique name to use for a control flow function.
+
+  Args:
+    scope: A name scope string.
+    name: An identifier for this function (e.g. "true", "body").
+
+  Returns:
+    A string, the name to use for the function.
+  """
+  return ("%s%s_%s" % (scope, name, ops.uid())).replace("/", "_")
+
+
+def unique_grad_fn_name(forward_name):
+  return "%s_grad_%s" % (forward_name, ops.uid())
+
+
+def maybe_set_lowering_attr(op):
+  """Sets the flag to enable lowering on `op` if necessary.
+
+  Lowering allows cond_v2 and while_v2 to avoid some of the limitations of
+  Functions, allowing users to specify devices & colocation inside of cond_v2
+  and while_v2 input functions, and enabling non-strict evaluation & partial
+  pruning. This brings v2 control flow closer to feature parity with v1 control
+  flow.
+
+  However, we do not lower in the following cases:
+    - When the `If` or `While` ops are in the XLA context. Because it is easier
+      for XLA to apply its own optimizations when dealing with un-lowered
+      control flow operators than with low-level control flow primitives.
+    - When the eager execution context specifies the executor of functions to
+      be the single threaded executor (see context.function_executor_type()).
+      Because the single threaded executor does not support v1 control flow ops.
+
+  Args:
+    op: An `If` or `While` Operation.
+  """
+  if (not control_flow_util.IsInXLAContext(op) and
+      context.context().get_function_call_options().executor_type
+      != "SINGLE_THREADED_EXECUTOR"):
+    # pylint: disable=protected-access
+    op._set_attr("_lower_using_switch_merge", attr_value_pb2.AttrValue(b=True))
+    # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 32d455bdad55eede6370204ae74bbc7631597df3..3a7eb9355a66a213d3d60f103b818ef22fd839bd 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -19,17 +19,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_ctc_ops
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.nn_grad import _BroadcastMul
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access, invalid-name
-@tf_export("nn.ctc_loss")
+@tf_export(v1=["nn.ctc_loss"])
 def ctc_loss(labels, inputs, sequence_length,
              preprocess_collapse_repeated=False,
              ctc_merge_repeated=True,
@@ -212,14 +222,19 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
 
   Returns:
     A tuple `(decoded, neg_sum_logits)` where
+
     decoded: A single-element list. `decoded[0]`
       is an `SparseTensor` containing the decoded outputs s.t.:
+
       `decoded.indices`: Indices matrix `(total_decoded_outputs, 2)`.
         The rows store: `[batch, time]`.
+
       `decoded.values`: Values vector, size `(total_decoded_outputs)`.
         The vector stores the decoded classes.
+
       `decoded.dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length]`
+
     neg_sum_logits: A `float` matrix `(batch_size x 1)` containing, for the
         sequence found, the negative of the sum of the greatest logit at each
         timeframe.
@@ -231,7 +246,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
           log_probabilities)
 
 
-@tf_export("nn.ctc_beam_search_decoder")
+@tf_export(v1=["nn.ctc_beam_search_decoder"])
 def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
                             top_paths=1, merge_repeated=True):
   """Performs beam search decoding on the logits given in input.
@@ -259,14 +274,19 @@ def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
 
   Returns:
     A tuple `(decoded, log_probabilities)` where
+
     decoded: A list of length top_paths, where `decoded[j]`
       is a `SparseTensor` containing the decoded outputs:
+
       `decoded[j].indices`: Indices matrix `(total_decoded_outputs[j] x 2)`
         The rows store: [batch, time].
+
       `decoded[j].values`: Values vector, size `(total_decoded_outputs[j])`.
         The vector stores the decoded classes for beam j.
+
       `decoded[j].dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length[j]]`.
+
     log_probability: A `float` matrix `(batch_size x top_paths)` containing
         sequence log-probabilities.
   """
@@ -282,7 +302,829 @@ def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
       log_probabilities)
 
 
-ops.NotDifferentiable("CTCGreedyDecoder")
+@tf_export("nn.ctc_beam_search_decoder", v1=["nn.ctc_beam_search_decoder_v2"])
+def ctc_beam_search_decoder_v2(inputs, sequence_length, beam_width=100,
+                               top_paths=1):
+  """Performs beam search decoding on the logits given in input.
 
+  **Note** The `ctc_greedy_decoder` is a special case of the
+  `ctc_beam_search_decoder` with `top_paths=1` and `beam_width=1` (but
+  that decoder is faster for this special case).
+
+  Args:
+    inputs: 3-D `float` `Tensor`, size
+      `[max_time, batch_size, num_classes]`.  The logits.
+    sequence_length: 1-D `int32` vector containing sequence lengths,
+      having size `[batch_size]`.
+    beam_width: An int scalar >= 0 (beam search beam width).
+    top_paths: An int scalar >= 0, <= beam_width (controls output size).
+
+  Returns:
+    A tuple `(decoded, log_probabilities)` where
+
+    decoded: A list of length top_paths, where `decoded[j]`
+      is a `SparseTensor` containing the decoded outputs:
 
+      `decoded[j].indices`: Indices matrix `[total_decoded_outputs[j], 2]`;
+        The rows store: `[batch, time]`.
+
+      `decoded[j].values`: Values vector, size `[total_decoded_outputs[j]]`.
+        The vector stores the decoded classes for beam `j`.
+
+      `decoded[j].dense_shape`: Shape vector, size `(2)`.
+        The shape values are: `[batch_size, max_decoded_length[j]]`.
+
+    log_probability: A `float` matrix `[batch_size, top_paths]` containing
+        sequence log-probabilities.
+  """
+
+  # Note, merge_repeated is an invalid optimization that is removed from the
+  # public API: it returns low probability paths.
+  return ctc_beam_search_decoder(inputs, sequence_length=sequence_length,
+                                 beam_width=beam_width, top_paths=top_paths,
+                                 merge_repeated=False)
+
+
+ops.NotDifferentiable("CTCGreedyDecoder")
 ops.NotDifferentiable("CTCBeamSearchDecoder")
+
+
+def _ctc_state_trans(label_seq):
+  """Compute CTC alignment model transition matrix.
+
+  Args:
+    label_seq: tensor of shape [batch_size, max_seq_length]
+
+  Returns:
+    tensor of shape [batch_size, states, states] with a state transition matrix
+    computed for each sequence of the batch.
+  """
+
+  with ops.name_scope("ctc_state_trans"):
+    label_seq = ops.convert_to_tensor(label_seq, name="label_seq")
+    batch_size = _get_dim(label_seq, 0)
+    num_labels = _get_dim(label_seq, 1)
+
+    num_label_states = num_labels + 1
+    num_states = 2 * num_label_states
+
+    label_states = math_ops.range(num_label_states)
+    blank_states = label_states + num_label_states
+
+    # Start state to first label.
+    start_to_label = [[1, 0]]
+
+    # Blank to label transitions.
+    blank_to_label = array_ops.stack([label_states[1:], blank_states[:-1]], 1)
+
+    # Label to blank transitions.
+    label_to_blank = array_ops.stack([blank_states, label_states], 1)
+
+    # Scatter transitions that don't depend on sequence.
+    indices = array_ops.concat(
+        [start_to_label, blank_to_label, label_to_blank], 0)
+    values = array_ops.ones([_get_dim(indices, 0)])
+    trans = array_ops.scatter_nd(
+        indices, values, shape=[num_states, num_states])
+    trans += linalg_ops.eye(num_states)  # Self-loops.
+
+    # Label to label transitions. Disallow transitions between repeated labels
+    # with no blank state in between.
+    batch_idx = array_ops.zeros_like(label_states[2:])
+    indices = array_ops.stack(
+        [batch_idx, label_states[2:], label_states[1:-1]], 1)
+    indices = array_ops.tile(
+        array_ops.expand_dims(indices, 0), [batch_size, 1, 1])
+    batch_idx = array_ops.expand_dims(math_ops.range(batch_size), 1) * [1, 0, 0]
+    indices += array_ops.expand_dims(batch_idx, 1)
+    repeats = math_ops.equal(label_seq[:, :-1], label_seq[:, 1:])
+    values = 1.0 - math_ops.cast(repeats, dtypes.float32)
+    batched_shape = [batch_size, num_states, num_states]
+    label_to_label = array_ops.scatter_nd(indices, values, batched_shape)
+
+    return array_ops.expand_dims(trans, 0) + label_to_label
+
+
+def ctc_state_log_probs(seq_lengths, max_seq_length):
+  """Computes CTC alignment initial and final state log probabilities.
+
+  Create the initial/final state values directly as log values to avoid
+  having to take a float64 log on tpu (which does not exist).
+
+  Args:
+    seq_lengths: int tensor of shape [batch_size], seq lengths in the batch.
+    max_seq_length: int, max sequence length possible.
+
+  Returns:
+    initial_state_log_probs, final_state_log_probs
+  """
+
+  batch_size = _get_dim(seq_lengths, 0)
+  num_label_states = max_seq_length + 1
+  num_duration_states = 2
+  num_states = num_duration_states * num_label_states
+  log_0 = math_ops.cast(
+      math_ops.log(math_ops.cast(0, dtypes.float64) + 1e-307),
+      dtypes.float32)
+
+  initial_state_log_probs = array_ops.one_hot(
+      indices=array_ops.zeros([batch_size], dtype=dtypes.int32),
+      depth=num_states,
+      on_value=0.0,
+      off_value=log_0, axis=1)
+
+  label_final_state_mask = array_ops.one_hot(
+      seq_lengths, depth=num_label_states, axis=0)
+  duration_final_state_mask = array_ops.ones(
+      [num_duration_states, 1, batch_size])
+  final_state_mask = duration_final_state_mask * label_final_state_mask
+  final_state_log_probs = (1.0 - final_state_mask) * log_0
+  final_state_log_probs = array_ops.reshape(
+      final_state_log_probs, [num_states, batch_size])
+
+  return initial_state_log_probs, array_ops.transpose(final_state_log_probs)
+
+
+def _ilabel_to_state(labels, num_labels, ilabel_log_probs):
+  """Project ilabel log probs to state log probs."""
+
+  num_label_states = _get_dim(labels, 1)
+  blank = ilabel_log_probs[:, :, :1]
+  blank = array_ops.tile(blank, [1, 1, num_label_states + 1])
+  one_hot = array_ops.one_hot(labels, depth=num_labels)
+  one_hot = array_ops.expand_dims(one_hot, axis=0)
+  ilabel_log_probs = array_ops.expand_dims(ilabel_log_probs, axis=2)
+  state_log_probs = math_ops.reduce_sum(ilabel_log_probs * one_hot, axis=3)
+  state_log_probs = array_ops.concat([state_log_probs, blank], axis=2)
+  return array_ops.pad(
+      state_log_probs, [[0, 0], [0, 0], [1, 0]],
+      constant_values=math_ops.log(0.0))
+
+
+def _state_to_olabel(labels, num_labels, states):
+  """Sum state log probs to ilabel log probs."""
+
+  num_label_states = _get_dim(labels, 1) + 1
+  label_states = states[:, :, 1:num_label_states]
+  blank_states = states[:, :, num_label_states:]
+  one_hot = array_ops.one_hot(
+      labels - 1, depth=(num_labels - 1),
+      on_value=0.0, off_value=math_ops.log(0.0))
+  one_hot = array_ops.expand_dims(one_hot, axis=0)
+  label_states = array_ops.expand_dims(label_states, axis=3)
+  label_olabels = math_ops.reduce_logsumexp(label_states + one_hot, axis=2)
+  blank_olabels = math_ops.reduce_logsumexp(
+      blank_states, axis=2, keepdims=True)
+  return array_ops.concat([blank_olabels, label_olabels], axis=-1)
+
+
+# pylint: disable=redefined-outer-name
+def _state_to_olabel_unique(labels, num_labels, states, unique):
+  """Sum state log probs to ilabel log probs using unique label indices."""
+
+  num_label_states = _get_dim(labels, 1) + 1
+  label_states = states[:, :, 1:num_label_states]
+  blank_states = states[:, :, num_label_states:]
+
+  unique_y, unique_idx = unique
+  mul_reduce = _sum_states(unique_idx, label_states)
+
+  num_frames = states.shape[0]
+  batch_size = states.shape[1]
+  num_states = num_label_states - 1
+  batch_state_major = array_ops.transpose(mul_reduce, perm=[1, 2, 0])
+  batch_state_major = array_ops.reshape(
+      batch_state_major, [batch_size * num_states, num_frames])
+  batch_offset = math_ops.range(batch_size, dtype=unique_y.dtype) * num_labels
+  indices = unique_y + array_ops.expand_dims(batch_offset, axis=-1)
+  indices = array_ops.reshape(indices, [-1, 1])
+  scatter = array_ops.scatter_nd(
+      indices=indices,
+      updates=batch_state_major,
+      shape=[batch_size * num_labels, num_frames])
+  scatter = array_ops.reshape(scatter, [batch_size, num_labels, num_frames])
+  scatter = array_ops.where(
+      math_ops.equal(scatter, 0.0),
+      array_ops.fill(array_ops.shape(scatter), math_ops.log(0.0)),
+      scatter)
+  label_olabels = array_ops.transpose(scatter, [2, 0, 1])
+  label_olabels = label_olabels[:, :, 1:]
+
+  blank_olabels = math_ops.reduce_logsumexp(
+      blank_states, axis=2, keepdims=True)
+
+  return array_ops.concat([blank_olabels, label_olabels], axis=-1)
+
+
+def ctc_loss_and_grad(logits, labels, label_length, logit_length, unique=None):
+  """Computes the CTC loss and gradients.
+
+  Most users will want fwd_bwd.ctc_loss
+
+  This function returns the computed gradient, it does not have a gradient
+  of its own defined.
+
+  Args:
+    logits: tensor of shape [frames, batch_size, num_labels]
+    labels: tensor of shape [batch_size, max_label_seq_length]
+    label_length: tensor of shape [batch_size]
+      Length of reference label sequence in labels.
+    logit_length: tensor of shape [batch_size]
+      Length of input sequence in logits.
+    unique: (optional) unique label indices as computed by unique(labels)
+      If supplied, enables an implementation that is faster and more memory
+      efficient on TPU.
+
+  Returns:
+    loss: tensor of shape [batch_size]
+    gradient: tensor of shape [frames, batch_size, num_labels]
+  """
+
+  num_labels = _get_dim(logits, 2)
+  max_label_seq_length = _get_dim(labels, 1)
+
+  ilabel_log_probs = nn_ops.log_softmax(logits)
+  state_log_probs = _ilabel_to_state(labels, num_labels, ilabel_log_probs)
+  state_trans_probs = _ctc_state_trans(labels)
+  initial_state_log_probs, final_state_log_probs = ctc_state_log_probs(
+      label_length, max_label_seq_length)
+  fwd_bwd_log_probs, log_likelihood = _forward_backward_log(
+      state_trans_log_probs=math_ops.log(state_trans_probs),
+      initial_state_log_probs=initial_state_log_probs,
+      final_state_log_probs=final_state_log_probs,
+      observed_log_probs=state_log_probs,
+      sequence_length=logit_length)
+
+  if unique:
+    olabel_log_probs = _state_to_olabel_unique(
+        labels, num_labels, fwd_bwd_log_probs, unique)
+  else:
+    olabel_log_probs = _state_to_olabel(labels, num_labels, fwd_bwd_log_probs)
+
+  grad = math_ops.exp(ilabel_log_probs) - math_ops.exp(olabel_log_probs)
+  loss = -log_likelihood
+  return loss, grad
+
+
+def _ctc_loss_grad(op, grad_loss, _):
+  grad = op.outputs[1]
+  grad = [array_ops.reshape(grad_loss, [1, -1, 1]) * grad]
+  grad += [None] * (len(op.inputs) - len(grad))
+  return grad
+
+
+def _ctc_loss_shape(op):
+  return [op.inputs[2].get_shape(), op.inputs[0].get_shape()]
+
+
+@tf_export("nn.ctc_loss", v1=["nn.ctc_loss_v2"])
+def ctc_loss_v2(labels, logits, label_length, logit_length,
+                logits_time_major=True, unique=None,
+                blank_index=None, name=None):
+  """Computes CTC (Connectionist Temporal Classification) loss.
+
+  This op implements the CTC loss as presented in the article:
+
+  [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
+  Connectionist Temporal Classification: Labeling Unsegmented Sequence Data
+  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA,
+  pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
+
+  Notes:
+      - Same as the "Classic CTC" in TensorFlow 1.x's tf.nn.ctc_loss setting of
+        preprocess_collapse_repeated=False, ctc_merge_repeated=True
+      - Labels may be supplied as either a dense, zero-padded tensor with a
+        vector of label sequence lengths OR as a SparseTensor.
+      - On TPU and GPU:
+          - Only dense padded labels are supported.
+      - On CPU:
+          - Caller may use SparseTensor or dense padded labels but calling with
+            a SparseTensor will be significantly faster.
+      - Default blank label is 0 rather num_classes - 1, unless overridden by
+        blank_index.
+
+  Args:
+    labels: tensor of shape [batch_size, max_label_seq_length] or SparseTensor
+    logits: tensor of shape [frames, batch_size, num_labels],
+      if logits_time_major == False, shape is [batch_size, frames, num_labels].
+    label_length: tensor of shape [batch_size], None if labels is SparseTensor
+      Length of reference label sequence in labels.
+    logit_length: tensor of shape [batch_size]
+      Length of input sequence in logits.
+    logits_time_major: (optional) If True (default), logits is shaped
+      [time, batch, logits]. If False, shape is [batch, time, logits]
+    unique: (optional) Unique label indices as computed by
+      ctc_unique_labels(labels).  If supplied, enable a faster, memory
+      efficient implementation on TPU.
+    blank_index: (optional) Set the class index to use for the blank label.
+      Negative values will start from num_classes, ie, -1 will reproduce the
+      ctc_loss behavior of using num_classes - 1 for the blank symbol.
+      There is some memory/performance overhead to switching from the default
+      of 0 as an additional shifted copy of the logits may be created.
+    name: A name for this `Op`. Defaults to "ctc_loss_dense".
+
+  Returns:
+    loss: tensor of shape [batch_size], negative log probabilities.
+  """
+  if isinstance(labels, sparse_tensor.SparseTensor):
+    if blank_index is None:
+      raise ValueError(
+          "blank_index must be given when using SparseTensor labels.")
+
+    if blank_index < 0:
+      blank_index += _get_dim(logits, 2)
+
+    if blank_index != _get_dim(logits, 2) - 1:
+      logits = array_ops.concat([
+          logits[:, :, :blank_index],
+          logits[:, :, blank_index+1:],
+          logits[:, :, blank_index:blank_index+1],
+      ], axis=2)
+      labels = sparse_tensor.SparseTensor(
+          labels.indices,
+          array_ops.where(labels.values < blank_index,
+                          labels.values,
+                          labels.values - 1),
+          labels.dense_shape)
+
+    return ctc_loss(labels=labels,
+                    inputs=logits,
+                    sequence_length=logit_length,
+                    time_major=logits_time_major)
+
+  if blank_index is None:
+    blank_index = 0
+
+  return ctc_loss_dense(labels=labels,
+                        logits=logits,
+                        label_length=label_length,
+                        logit_length=logit_length,
+                        logits_time_major=logits_time_major,
+                        unique=unique,
+                        blank_index=blank_index,
+                        name=name)
+
+
+def ctc_loss_dense(labels, logits, label_length, logit_length,
+                   logits_time_major=True, unique=None,
+                   blank_index=0, name=None):
+  """Computes CTC (Connectionist Temporal Classification) loss.
+
+  This op implements the CTC loss as presented in the article:
+
+  [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
+  Connectionist Temporal Classification: Labeling Unsegmented Sequence Data
+  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA,
+  pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
+
+  Using the batched forward backward algorithm described in:
+
+  [Sim, K. C., Narayanan, A., Bagby, T., Sainath, T. N., & Bacchiani, M.
+  Improving the efficiency of forward-backward algorithm using batched
+    computation in TensorFlow.
+  Automatic Speech Recognition and Understanding Workshop (ASRU),
+    2017 IEEE (pp. 258-264).
+  ](https://ieeexplore.ieee.org/iel7/8260578/8268903/08268944.pdf)
+
+  Notes:
+    Significant differences from tf.nn.ctc_loss:
+      Supports GPU and TPU (tf.nn.ctc_loss supports CPU only):
+        For batched operations, GPU and TPU are significantly faster than using
+        ctc_loss on CPU.
+        This implementation runs on CPU, but significantly slower than ctc_loss.
+      Blank label is 0 rather num_classes - 1, unless overridden by blank_index.
+      Logits and labels are dense arrays with padding rather than SparseTensor.
+      The only mode supported is the same as:
+        preprocess_collapse_repeated=False, ctc_merge_repeated=True
+        To collapse labels, the caller can preprocess label sequence first.
+
+    The dense implementation supports both CPU, GPU and TPU. A fast path is
+    provided that significantly improves memory use for large vocabulary if the
+    caller preprocesses label sequences to get unique label indices on the CPU
+    (eg. in the data input pipeline) using ctc_ops.unique and simplies this in
+    the optional "unique" kwarg. This is especially useful for TPU and GPU but
+    also works with if used on CPU.
+
+  Args:
+    labels: tensor of shape [batch_size, max_label_seq_length]
+    logits: tensor of shape [frames, batch_size, num_labels],
+      if logits_time_major == False, shape is [batch_size, frames, num_labels].
+    label_length: tensor of shape [batch_size]
+      Length of reference label sequence in labels.
+    logit_length: tensor of shape [batch_size]
+      Length of input sequence in logits.
+    logits_time_major: (optional) If True (default), logits is shaped
+      [time, batch, logits]. If False, shape is [batch, time, logits]
+    unique: (optional) Unique label indices as computed by unique(labels).
+      If supplied, enable a faster, memory efficient implementation on TPU.
+    blank_index: (optional) Set the class index to use for the blank label.
+      Negative values will start from num_classes, ie, -1 will reproduce the
+      ctc_loss behavior of using num_classes - 1 for the blank symbol.
+      There is some memory/performance overhead to switching from the default
+      of 0 as an additional shifted copy of the logits may be created.
+    name: A name for this `Op`. Defaults to "ctc_loss_dense".
+
+  Returns:
+    loss: tensor of shape [batch_size], negative log probabilities.
+  """
+
+  with ops.name_scope(name, "ctc_loss_dense",
+                      [logits, labels, label_length, logit_length]):
+    logits = ops.convert_to_tensor(logits, name="logits")
+    labels = ops.convert_to_tensor(labels, name="labels")
+    label_length = ops.convert_to_tensor(label_length, name="label_length")
+    logit_length = ops.convert_to_tensor(logit_length, name="logit_length")
+
+    if not logits_time_major:
+      logits = array_ops.transpose(logits, perm=[1, 0, 2])
+
+    if blank_index != 0:
+      if blank_index < 0:
+        blank_index += _get_dim(logits, 2)
+      logits = array_ops.concat([
+          logits[:, :, blank_index:blank_index+1],
+          logits[:, :, :blank_index],
+          logits[:, :, blank_index+1:],
+      ], axis=2)
+      labels = array_ops.where(labels < blank_index, labels + 1, labels)
+
+    args = [logits, labels, label_length, logit_length]
+
+    if unique:
+      unique_y, unique_idx = unique
+      args.extend([unique_y, unique_idx])
+
+    # TODO(tombagby): Update to tfe.defun
+    @function.Defun(*[x.dtype for x in args],
+                    python_grad_func=_ctc_loss_grad,
+                    shape_func=_ctc_loss_shape)
+    def compute_ctc_loss(logits_t, labels_t, label_length_t, logit_length_t,
+                         *unique_t):
+      """Compute CTC loss."""
+      logits_t.set_shape(logits.shape)
+      labels_t.set_shape(labels.shape)
+      label_length_t.set_shape(label_length.shape)
+      logit_length_t.set_shape(logit_length.shape)
+      kwargs = dict(
+          logits=logits_t,
+          labels=labels_t,
+          label_length=label_length_t,
+          logit_length=logit_length_t)
+      if unique_t:
+        kwargs["unique"] = unique_t
+      return ctc_loss_and_grad(**kwargs)
+
+    return compute_ctc_loss(*args)[0]
+
+
+@tf_export("nn.collapse_repeated")
+def collapse_repeated(labels, seq_length, name=None):
+  """Merge repeated labels into single labels.
+
+  Args:
+    labels: Tensor of shape (batch, max value in seq_length)
+    seq_length: Tensor of shape (batch), sequence length of each batch element.
+    name: A name for this `Op`. Defaults to "collapse_repeated_labels".
+
+  Returns:
+    tuple of Tensor of shape (batch, max_seq_length) with repeated labels
+    collapsed and padded to max_seq_length, eg:
+        [[A, A, B, B, A],
+         [A, B, C, D, E]] => [[A, B, A, 0, 0],
+                              [A, B, C, D, E]]
+    and int tensor of shape [batch] with new sequence lengths.
+  """
+
+  with ops.name_scope(name, "collapse_repeated_labels",
+                      [labels, seq_length]):
+    labels = ops.convert_to_tensor(labels, name="labels")
+    seq_length = ops.convert_to_tensor(seq_length, name="seq_length")
+
+    # Mask labels that don't equal previous label.
+    label_mask = array_ops.concat(
+        [array_ops.ones_like(labels[:, :1], dtypes.bool),
+         math_ops.not_equal(labels[:, 1:], labels[:, :-1])],
+        axis=1)
+
+    # Filter labels that aren't in the original sequence.
+    maxlen = _get_dim(labels, 1)
+    seq_mask = array_ops.sequence_mask(seq_length, maxlen=maxlen)
+    label_mask = math_ops.logical_and(label_mask, seq_mask)
+
+    # Count masks for new sequence lengths.
+    new_seq_len = math_ops.reduce_sum(
+        math_ops.cast(label_mask, dtypes.int32), axis=1)
+
+    # Mask indexes based on sequence length mask.
+    new_maxlen = math_ops.reduce_max(new_seq_len)
+    idx_mask = array_ops.sequence_mask(new_seq_len, maxlen=new_maxlen)
+
+    # Flatten everything and mask out labels to keep and sparse indices.
+    flat_labels = array_ops.reshape(labels, [-1])
+    flat_label_mask = array_ops.reshape(label_mask, [-1])
+    flat_idx_mask = array_ops.reshape(idx_mask, [-1])
+    idx = math_ops.range(_get_dim(flat_idx_mask, 0))
+
+    # Scatter to flat shape.
+    flat = array_ops.scatter_nd(
+        indices=array_ops.expand_dims(
+            array_ops.boolean_mask(idx, flat_idx_mask), axis=1),
+        updates=array_ops.boolean_mask(flat_labels, flat_label_mask),
+        shape=array_ops.shape(flat_idx_mask))
+
+    # Reshape back to square batch.
+    batch_size = _get_dim(labels, 0)
+    new_shape = [batch_size, new_maxlen]
+    return (array_ops.reshape(flat, new_shape),
+            math_ops.cast(new_seq_len, seq_length.dtype))
+
+
+def dense_labels_to_sparse(dense, length):
+  """Convert dense labels with sequence lengths to sparse tensor.
+
+  Args:
+    dense: tensor of shape [batch, max_length]
+    length: int tensor of shape [batch]
+      The length of each sequence in dense.
+
+  Returns:
+    tf.SparseTensor with values only for the valid elements of sequences.
+  """
+
+  flat_values = array_ops.reshape(dense, [-1])
+  flat_indices = math_ops.range(
+      array_ops.shape(flat_values, out_type=dtypes.int64)[0])
+  mask = array_ops.sequence_mask(length, maxlen=array_ops.shape(dense)[1])
+  flat_mask = array_ops.reshape(mask, [-1])
+  indices = array_ops.expand_dims(
+      array_ops.boolean_mask(flat_indices, flat_mask), 1)
+  values = array_ops.boolean_mask(flat_values, flat_mask)
+  sparse = sparse_tensor.SparseTensor(
+      indices=indices, values=math_ops.cast(values, dtypes.int32),
+      dense_shape=array_ops.shape(flat_values, out_type=dtypes.int64))
+  reshaped = sparse_ops.sparse_reshape(sparse, array_ops.shape(dense))
+  max_length = math_ops.reduce_max(length)
+  return sparse_tensor.SparseTensor(
+      indices=reshaped.indices,
+      values=reshaped.values,
+      dense_shape=[
+          math_ops.cast(reshaped.dense_shape[0], dtypes.int64),
+          math_ops.cast(max_length, dtypes.int64)])
+
+
+@tf_export("nn.ctc_unique_labels")
+def ctc_unique_labels(labels, name=None):
+  """Get unique labels and indices for batched labels for tf.nn.ctc_loss.
+
+  For use with tf.nn.ctc_loss_v2 optional argument `unique`: This op can be
+  used to preprocess labels in input pipeline to for better speed/memory use
+  computing the ctc loss on TPU.
+
+  Example:
+    ctc_unique_labels([[3, 4, 4, 3]]) ->
+      unique labels padded with 0: [[3, 4, 0, 0]]
+      indices of original labels in unique: [0, 1, 1, 0]
+
+  Args:
+    labels: tensor of shape [batch_size, max_label_length] padded with 0.
+    name: A name for this `Op`. Defaults to "ctc_unique_labels".
+
+  Returns:
+    tuple of
+      - unique labels, tensor of shape `[batch_size, max_label_length]`
+      - indices into unique labels, shape `[batch_size, max_label_length]`
+  """
+
+  with ops.name_scope(name, "ctc_unique_labels", [labels]):
+    labels = ops.convert_to_tensor(labels, name="labels")
+    def _unique(x):
+      u = array_ops.unique(x)
+      y = array_ops.pad(
+          u.y, [[0, _get_dim(u.idx, 0) - _get_dim(u.y, 0)]])
+      y = math_ops.cast(y, dtypes.int64)
+      return [y, u.idx]
+    return functional_ops.map_fn(
+        _unique, labels, dtype=[dtypes.int64, dtypes.int32])
+
+
+def _sum_states(idx, states):
+  """Take logsumexp for each unique state out of all label states.
+
+  Args:
+    idx: tensor of shape [batch, label_length]
+      For each sequence, indices into a set of unique labels as computed by
+      calling unique.
+    states: tensor of shape [frames, batch, label_length]
+      Log probabilities for each label state.
+
+  Returns:
+    tensor of shape [frames, batch_size, label_length], log probabilites summed
+      for each unique label of the sequence.
+  """
+
+  with ops.name_scope("sum_states"):
+    idx = ops.convert_to_tensor(idx, name="idx")
+    num_states = _get_dim(states, 2)
+    states = array_ops.expand_dims(states, axis=2)
+    one_hot = array_ops.one_hot(
+        idx, depth=num_states, on_value=0.0, off_value=math_ops.log(0.0),
+        axis=1)
+    return math_ops.reduce_logsumexp(states + one_hot, axis=-1)
+
+
+def _forward_backward_log(state_trans_log_probs, initial_state_log_probs,
+                          final_state_log_probs, observed_log_probs,
+                          sequence_length):
+  """Forward-backward algorithm computed in log domain.
+
+  Args:
+    state_trans_log_probs: tensor of shape [states, states] or
+      if different transition matrix per batch [batch_size, states, states]
+    initial_state_log_probs: tensor of shape [batch_size, states]
+    final_state_log_probs: tensor of shape [batch_size, states]
+    observed_log_probs: tensor of shape [frames, batch_size, states]
+    sequence_length: tensor of shape [batch_size]
+
+  Returns:
+    forward backward log probabilites: tensor of shape [frames, batch, states]
+    log_likelihood: tensor of shape [batch_size]
+
+  Raises:
+    ValueError: If state_trans_log_probs has unknown or incorrect rank.
+  """
+
+  if state_trans_log_probs.shape.ndims == 2:
+    perm = [1, 0]
+  elif state_trans_log_probs.shape.ndims == 3:
+    perm = [0, 2, 1]
+  else:
+    raise ValueError(
+        "state_trans_log_probs rank must be known and == 2 or 3, is: %s" %
+        state_trans_log_probs.shape.ndims)
+
+  bwd_state_trans_log_probs = array_ops.transpose(state_trans_log_probs, perm)
+  batch_size = _get_dim(observed_log_probs, 1)
+
+  def _forward(state_log_prob, obs_log_prob):
+    state_log_prob = array_ops.expand_dims(state_log_prob, axis=1)  # Broadcast.
+    state_log_prob += state_trans_log_probs
+    state_log_prob = math_ops.reduce_logsumexp(state_log_prob, axis=-1)
+    state_log_prob += obs_log_prob
+    log_prob_sum = math_ops.reduce_logsumexp(
+        state_log_prob, axis=-1, keepdims=True)
+    state_log_prob -= log_prob_sum
+    return state_log_prob
+
+  fwd = _scan(_forward, observed_log_probs, initial_state_log_probs,
+              inclusive=True)
+
+  def _backward(accs, elems):
+    """Calculate log probs and cumulative sum masked for sequence length."""
+    state_log_prob, cum_log_sum = accs
+    obs_log_prob, mask = elems
+    state_log_prob += obs_log_prob
+    state_log_prob = array_ops.expand_dims(state_log_prob, axis=1)  # Broadcast.
+    state_log_prob += bwd_state_trans_log_probs
+    state_log_prob = math_ops.reduce_logsumexp(state_log_prob, axis=-1)
+
+    log_prob_sum = math_ops.reduce_logsumexp(
+        state_log_prob, axis=-1, keepdims=True)
+    state_log_prob -= log_prob_sum
+
+    cum_log_sum += array_ops.squeeze(log_prob_sum) * mask
+    batched_mask = array_ops.expand_dims(mask, axis=1)
+    out = state_log_prob * batched_mask
+    out += final_state_log_probs * (1.0 - batched_mask)
+    return out, cum_log_sum
+
+  zero_log_sum = array_ops.zeros([batch_size])
+  maxlen = _get_dim(observed_log_probs, 0)
+  mask = array_ops.sequence_mask(sequence_length, maxlen, dtypes.float32)
+  mask = array_ops.transpose(mask, perm=[1, 0])
+
+  bwd, cum_log_sum = _scan(_backward, (observed_log_probs, mask),
+                           (final_state_log_probs, zero_log_sum),
+                           reverse=True, inclusive=True)
+
+  fwd_bwd_log_probs = fwd[1:] + bwd[1:]
+  fwd_bwd_log_probs_sum = math_ops.reduce_logsumexp(
+      fwd_bwd_log_probs, axis=2, keepdims=True)
+  fwd_bwd_log_probs -= fwd_bwd_log_probs_sum
+  fwd_bwd_log_probs += math_ops.log(array_ops.expand_dims(mask, axis=2))
+
+  log_likelihood = bwd[0, :, 0] + cum_log_sum[0]
+
+  return fwd_bwd_log_probs, log_likelihood
+
+
+# TODO(tombagby): This is currently faster for the ctc implementation than using
+# functional_ops.scan, but could be replaced by that or something similar if
+# things change.
+def _scan(fn, elems, initial, reverse=False, inclusive=False, final_only=False):
+  """Repeatedly applies callable `fn` to a sequence of elements.
+
+  Implemented by functional_ops.While, tpu friendly, no gradient.
+
+  This is similar to functional_ops.scan but significantly faster on tpu/gpu
+  for the forward backward use case.
+
+  Examples:
+    scan(lambda a, e: a + e, [1.0, 2.0, 3.0], 1.0) => [2.0, 3.0, 4.0]
+
+    Multiple accumulators:
+      scan(lambda a, e: (a[0] + e, a[1] * e), [1.0, 2.0, 3.0], (0.0, 1.0))
+
+    Multiple inputs:
+      scan(lambda a, e: a + (e[0] * e[1]), (elems1, elems2), 0.0)
+
+  Args:
+    fn: callable, fn(accumulators, element) return new accumulator values.
+      The (possibly nested) sequence of accumulators is the same as `initial`
+      and the return value must have the same structure.
+    elems: A (possibly nested) tensor which will be unpacked along the first
+      dimension. The resulting slices will be the second argument to fn. The
+      first dimension of all nested input tensors must be the same.
+    initial: A tensor or (possibly nested) sequence of tensors with initial
+      values for the accumulators.
+    reverse: (optional) True enables scan and output elems in reverse order.
+    inclusive: (optional) True includes the initial accumulator values in the
+      output. Length of output will be len(elem sequence) + 1. Not meaningful
+      if final_only is True.
+    final_only: (optional) When True, return only the final accumulated values,
+      not the concatenation of accumulated values for each input.
+
+  Returns:
+    A (possibly nested) sequence of tensors with the results of applying fn
+    to tensors unpacked from elems and previous accumulator values.
+  """
+
+  flat_elems = [ops.convert_to_tensor(x) for x in nest.flatten(elems)]
+  num_elems = array_ops.shape(flat_elems[0])[0]
+  pack_elems = lambda x: nest.pack_sequence_as(structure=elems, flat_sequence=x)
+  flat_initial = [ops.convert_to_tensor(x) for x in nest.flatten(initial)]
+  pack = lambda x: nest.pack_sequence_as(structure=initial, flat_sequence=x)
+  accum_dtypes = [x.dtype for x in flat_initial]
+  num_accums = len(flat_initial)
+
+  # Types for counter, [outputs], [accumulators] loop arguments.
+  if final_only:
+    loop_dtypes = [dtypes.int32, dtypes.int32] + accum_dtypes
+  else:
+    loop_dtypes = [dtypes.int32, dtypes.int32] + accum_dtypes + accum_dtypes
+
+  # TODO(tombagby): Update to tfe.defun
+  @function.Defun(*loop_dtypes)
+  def cond(i, num_elems, *args):
+    del args
+    return i >= 0 if reverse else i < num_elems
+
+  # The loop *args are [output tensors] + [accumulator tensors] which must
+  # be paired. Each output corresponds to one accumulator.
+  @function.Defun(*loop_dtypes)
+  def body(i, num_elems, *args):
+    """Loop body."""
+    i.set_shape([])
+    if final_only:
+      accum = args
+    else:
+      out, accum = args[:num_accums], args[num_accums:]
+    slices = [array_ops.gather(e, i) for e in flat_elems]
+    accum = fn(pack(accum), pack_elems(slices))
+    flat_accum = nest.flatten(accum)
+    if final_only:
+      new_out = []
+    else:
+      update_i = i + 1 if inclusive and not reverse else i
+      new_out = [inplace_ops.alias_inplace_update(x, update_i, y)
+                 for x, y in zip(out, flat_accum)]
+    i = i - 1 if reverse else i + 1
+    return [i, num_elems] + new_out + flat_accum
+
+  init_i = (array_ops.shape(flat_elems[0])[0] - 1 if reverse
+            else constant_op.constant(0, dtype=dtypes.int32))
+  outputs = []
+  if not final_only:
+    num_outputs = array_ops.shape(flat_elems[0])[0] + (1 if inclusive else 0)
+    for initial_accum in flat_initial:
+      out_shape = array_ops.concat(
+          [[num_outputs], array_ops.shape(initial_accum)], 0)
+      out = inplace_ops.empty(out_shape, dtype=initial_accum.dtype, init=True)
+      if inclusive:
+        out = inplace_ops.alias_inplace_add(
+            out, init_i + (1 if reverse else 0), initial_accum)
+      outputs.append(out)
+  loop_in = [init_i, num_elems] + outputs + flat_initial
+  hostmem = [
+      i for i, x in enumerate(loop_in)
+      if x.dtype.base_dtype in (dtypes.int32, dtypes.int64)
+  ]
+
+  # TODO(tombagby): Update to while_v2.
+  loop_results = functional_ops.While(loop_in, cond, body, hostmem=hostmem)
+  out = loop_results[2:num_accums + 2]
+  return pack(out)
+
+
+def _get_dim(tensor, i):
+  """Get value of tensor shape[i] preferring static value if available."""
+  return tensor.shape[i].value or array_ops.shape(tensor)[i]
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index d7834ba350f81356db55d5e7f832764bc850d81c..d96601ac21c7d7d62423b65a2e43d08449e23129 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -18,9 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape as tape_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
@@ -33,6 +35,45 @@ from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
+def copy_handle_data(source_t, target_t):
+  """Copies HandleData for variant and resource type tensors if available.
+
+  The CppShapeInferenceResult::HandleData proto contains information about the
+  shapes and types of the element tensors of resource/variant type tensors.
+  We need to copy this across function boundaries, i.e., when capturing a
+  placeholder or when returning a function tensor as output. If we don't do this
+  the element tensors will have unknown shapes, e.g., if a TensorList variant
+  tensor is captured as a placeholder, elements popped from that list would have
+  unknown shape.
+
+  Args:
+    source_t: The tensor to copy HandleData from.
+    target_t: The tensor to copy HandleData to.
+  """
+  if (target_t.dtype == dtypes.resource or
+      target_t.dtype == dtypes.variant):
+    if isinstance(source_t, ops.EagerTensor):
+      handle_data = source_t._handle_data  # pylint: disable=protected-access
+    else:
+      handle_data = resource_variable_ops.get_resource_handle_data(source_t)
+    if handle_data is not None and handle_data.is_set:
+      # pylint: disable=protected-access
+      pywrap_tensorflow.SetHandleShapeAndType(target_t.graph._c_graph,
+                                              target_t._as_tf_output(),
+                                              handle_data.SerializeToString())
+      # pylint: enable=protected-access
+      # Ensure that shapes and dtypes are propagated.
+      shapes, types = zip(*[(pair.shape, pair.dtype)
+                            for pair in handle_data.shape_and_type])
+      ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
+      shapes = [[d.size for d in s.dim]
+                if not s.unknown_rank else None for s in shapes]
+      pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+          target_t._op._graph._c_graph,  # pylint: disable=protected-access
+          target_t._as_tf_output(),  # pylint: disable=protected-access
+          shapes, ranks, types)
+
+
 @tf_export("custom_gradient")
 def custom_gradient(f):
   """Decorator to define a function with a custom gradient.
@@ -90,7 +131,15 @@ def custom_gradient(f):
          a list of `Tensor`s - the derivatives of `Tensor`s in `y` with respect
          to the `Tensor`s in `x`.  `grad_ys` is a `Tensor` or sequence of
          `Tensor`s the same size as `y` holding the initial value gradients for
-         each `Tensor` in `y`. If `f` uses `Variable`s (that are not part of the
+         each `Tensor` in `y`. In a pure mathematical sense, a vector-argument
+         vector-valued function `f`'s derivatives should be its Jacobian matrix
+         `J`. Here we are expressing the Jacobian `J` as a function `grad_fn`
+         which defines how `J` will transform a vector `grad_ys` when
+         left-multiplied with it (`grad_ys * J`). This functional representation
+         of a matrix is convenient to use for chain-rule calculation
+         (in e.g. the back-propagation algorithm).
+
+         If `f` uses `Variable`s (that are not part of the
          inputs), i.e. through `get_variable`, then `grad_fn` should have
          signature `g(*grad_ys, variables=None)`, where `variables` is a list of
          the `Variable`s, and return a 2-tuple `(grad_xs, grad_vars)`, where
@@ -161,8 +210,7 @@ def _graph_mode_decorator(f, *args, **kwargs):
   flat_result = nest.flatten(result)
   all_tensors = flat_result + args + variables
 
-  @ops.RegisterGradient(name)
-  def internal_grad_fn(unused_op, *result_grads):  # pylint: disable=unused-variable
+  def tape_grad_fn(*result_grads):
     """Custom grad fn wrapper."""
     result_grads = result_grads[:len(flat_result)]
     if variables:
@@ -180,8 +228,22 @@ def _graph_mode_decorator(f, *args, **kwargs):
     input_grads = nest.flatten(input_grads)
     return ([None] * len(flat_result)) + input_grads + variable_grads
 
+  @ops.RegisterGradient(name)
+  def internal_grad_fn(unused_op, *result_grads):  # pylint: disable=unused-variable
+    """Custom grad fn wrapper."""
+    return tape_grad_fn(*result_grads)
+
+  original_tensors = all_tensors
   with ops.get_default_graph().gradient_override_map({"IdentityN": name}):
     all_tensors = array_ops.identity_n(all_tensors)
+  # Propagate handle data for happier shape inference for resource variables.
+  for i, t in enumerate(original_tensors):
+    if t.dtype == dtypes.resource and hasattr(t, "_handle_data"):
+      all_tensors[i]._handle_data = t._handle_data  # pylint: disable=protected-access
+  tape_lib.record_operation(
+      f.__name__, all_tensors, original_tensors, tape_grad_fn)
+  for ot, t in zip(original_tensors, all_tensors):
+    copy_handle_data(ot, t)
   return nest.pack_sequence_as(
       structure=result, flat_sequence=all_tensors[:len(flat_result)])
 
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 97b6f3bd9c9d2613e410f103364c5c5da0ab91c7..2030332e4eaec8574010217d26ef6ac52dd988d5 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -79,7 +79,7 @@ def _as_shape_list(shapes,
     shapes = [shapes]
   shapes = [tensor_shape.as_shape(shape) for shape in shapes]
   if not unknown_dim_allowed:
-    if any([not shape.is_fully_defined() for shape in shapes]):
+    if any(not shape.is_fully_defined() for shape in shapes):
       raise ValueError("All shapes must be fully defined: %s" % shapes)
   if not unknown_rank_allowed:
     if any([shape.dims is None for shape in shapes]):
@@ -113,7 +113,7 @@ def _shape_common(s1, s2):
 
 
 # pylint: disable=protected-access
-@tf_export("io.QueueBase", "QueueBase")
+@tf_export("io.QueueBase", v1=["io.QueueBase", "QueueBase"])
 @deprecation.deprecated_endpoints("QueueBase")
 class QueueBase(object):
   """Base class for queue implementations.
@@ -171,7 +171,10 @@ class QueueBase(object):
       self._names = None
     self._queue_ref = queue_ref
     if context.executing_eagerly():
-      self._name = context.context().scope_name
+      if context.context().scope_name:
+        self._name = context.context().scope_name
+      else:
+        self._name = "Empty"
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           queue_ref, None)
     else:
@@ -198,11 +201,11 @@ class QueueBase(object):
       raise TypeError("A list of queues expected")
 
     dtypes = queues[0].dtypes
-    if not all([dtypes == q.dtypes for q in queues[1:]]):
+    if not all(dtypes == q.dtypes for q in queues[1:]):
       raise TypeError("Queues do not have matching component dtypes.")
 
     names = queues[0].names
-    if not all([names == q.names for q in queues[1:]]):
+    if not all(names == q.names for q in queues[1:]):
       raise TypeError("Queues do not have matching component names.")
 
     queue_shapes = [q.shapes for q in queues]
@@ -376,10 +379,16 @@ class QueueBase(object):
 
       # NOTE(mrry): Not using a shape function because we need access to
       # the `QueueBase` object.
-      batch_dim = vals[0].get_shape().with_rank_at_least(1)[0]
+      # NOTE(fchollet): the code that follow is verbose because it needs to be
+      # compatible with both TF v1 TensorShape behavior and TF v2 behavior.
+      batch_dim = tensor_shape.dimension_value(
+          vals[0].get_shape().with_rank_at_least(1)[0])
+      batch_dim = tensor_shape.Dimension(batch_dim)
       for val, shape in zip(vals, self._shapes):
-        batch_dim = batch_dim.merge_with(
+        val_batch_dim = tensor_shape.dimension_value(
             val.get_shape().with_rank_at_least(1)[0])
+        val_batch_dim = tensor_shape.Dimension(val_batch_dim)
+        batch_dim = batch_dim.merge_with(val_batch_dim)
         val.get_shape()[1:].assert_is_compatible_with(shape)
 
       return gen_data_flow_ops.queue_enqueue_many_v2(
@@ -606,7 +615,8 @@ def _shared_name(shared_name):
   return shared_name
 
 
-@tf_export("io.RandomShuffleQueue", "RandomShuffleQueue")
+@tf_export(
+    "io.RandomShuffleQueue", v1=["io.RandomShuffleQueue", "RandomShuffleQueue"])
 @deprecation.deprecated_endpoints("RandomShuffleQueue")
 class RandomShuffleQueue(QueueBase):
   """A queue implementation that dequeues elements in a random order.
@@ -749,7 +759,8 @@ class FIFOQueue(QueueBase):
     super(FIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
-@tf_export("io.PaddingFIFOQueue", "PaddingFIFOQueue")
+@tf_export(
+    "io.PaddingFIFOQueue", v1=["io.PaddingFIFOQueue", "PaddingFIFOQueue"])
 @deprecation.deprecated_endpoints("PaddingFIFOQueue")
 class PaddingFIFOQueue(QueueBase):
   """A FIFOQueue that supports batching variable-sized tensors by padding.
@@ -824,7 +835,7 @@ class PaddingFIFOQueue(QueueBase):
     super(PaddingFIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
-@tf_export("io.PriorityQueue", "PriorityQueue")
+@tf_export("io.PriorityQueue", v1=["io.PriorityQueue", "PriorityQueue"])
 @deprecation.deprecated_endpoints("PriorityQueue")
 class PriorityQueue(QueueBase):
   """A queue implementation that dequeues elements in prioritized order.
@@ -1140,7 +1151,7 @@ class Barrier(object):
         self._barrier_ref, name=name)
 
 
-@tf_export("ConditionalAccumulatorBase")
+@tf_export(v1=["ConditionalAccumulatorBase"])
 class ConditionalAccumulatorBase(object):
   """A conditional accumulator for aggregating gradients.
 
@@ -1219,7 +1230,7 @@ class ConditionalAccumulatorBase(object):
         name=name)
 
 
-@tf_export("ConditionalAccumulator")
+@tf_export(v1=["ConditionalAccumulator"])
 class ConditionalAccumulator(ConditionalAccumulatorBase):
   """A conditional accumulator for aggregating gradients.
 
@@ -1305,8 +1316,9 @@ class ConditionalAccumulator(ConditionalAccumulatorBase):
     return out
 
 
-@tf_export("sparse.SparseConditionalAccumulator",
-           "SparseConditionalAccumulator")
+@tf_export(
+    "sparse.SparseConditionalAccumulator",
+    v1=["sparse.SparseConditionalAccumulator", "SparseConditionalAccumulator"])
 @deprecation.deprecated_endpoints("SparseConditionalAccumulator")
 class SparseConditionalAccumulator(ConditionalAccumulatorBase):
   """A conditional accumulator for aggregating sparse gradients.
diff --git a/tensorflow/python/ops/dequantize_op_test.py b/tensorflow/python/ops/dequantize_op_test.py
index 13e50273d863f3c157ee7a089532df0c925c0e5f..794985b2dbb77e4d7691753432c53ddf3ad31377 100644
--- a/tensorflow/python/ops/dequantize_op_test.py
+++ b/tensorflow/python/ops/dequantize_op_test.py
@@ -35,7 +35,7 @@ class DequantizeOpTest(test.TestCase):
     with self.cached_session():
       input_op = constant_op.constant(inputs, shape=[len(inputs)], dtype=dtype)
       dequantized = array_ops.dequantize(input_op, min_range, max_range)
-      tf_ans = dequantized.eval()
+      tf_ans = self.evaluate(dequantized)
 
     # TODO(vrv): Add support for DT_QINT32 quantization if needed.
     type_dict = {
diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py
index baecc321d3824d550d5b7d9fc86caf4ec93c6c64..4fb598aef4d725bfd0d5a1ce99af7e7a1ea86fb0 100644
--- a/tensorflow/python/ops/distributions/bernoulli.py
+++ b/tensorflow/python/ops/distributions/bernoulli.py
@@ -32,7 +32,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("distributions.Bernoulli")
+@tf_export(v1=["distributions.Bernoulli"])
 class Bernoulli(distribution.Distribution):
   """Bernoulli distribution.
 
diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py
index 51c4f6eb3d0ba9e4ec28bde5189b4dba44471990..1d1a666317f83c91ae3d3aaac77596ca8d0f8680 100644
--- a/tensorflow/python/ops/distributions/beta.py
+++ b/tensorflow/python/ops/distributions/beta.py
@@ -47,7 +47,7 @@ _beta_sample_note = """Note: `x` must have dtype `self.dtype` and be in
 `[0, 1].` It must have a shape compatible with `self.batch_shape()`."""
 
 
-@tf_export("distributions.Beta")
+@tf_export(v1=["distributions.Beta"])
 class Beta(distribution.Distribution):
   """Beta distribution.
 
diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py
index 26a3da2fb64d39e9353821effebff85c9610a93a..33a843562506e3e57dcba9bc6922f6ae6fa1900a 100644
--- a/tensorflow/python/ops/distributions/categorical.py
+++ b/tensorflow/python/ops/distributions/categorical.py
@@ -59,7 +59,7 @@ def _broadcast_cat_event_and_params(event, params, base_dtype):
   return event, params
 
 
-@tf_export("distributions.Categorical")
+@tf_export(v1=["distributions.Categorical"])
 class Categorical(distribution.Distribution):
   """Categorical distribution.
 
@@ -214,9 +214,9 @@ class Categorical(distribution.Distribution):
           self._batch_rank = array_ops.rank(self._logits) - 1
 
       logits_shape = array_ops.shape(self._logits, name="logits_shape")
-      if logits_shape_static[-1].value is not None:
+      if tensor_shape.dimension_value(logits_shape_static[-1]) is not None:
         self._event_size = ops.convert_to_tensor(
-            logits_shape_static[-1].value,
+            logits_shape_static.dims[-1].value,
             dtype=dtypes.int32,
             name="event_size")
       else:
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 675c30b383391cf6f3c9cbdf5405945b5e36c66e..971ce46efbc7aaa268c2a61a0da62d64d67668ee 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -45,7 +45,7 @@ dtype `self.dtype` and be in the `(self.event_shape() - 1)`-simplex, i.e.,
 `self.batch_shape() + self.event_shape()`."""
 
 
-@tf_export("distributions.Dirichlet")
+@tf_export(v1=["distributions.Dirichlet"])
 class Dirichlet(distribution.Distribution):
   """Dirichlet distribution.
 
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index 2e3151a5ab4ac94dff6d25ef494f70d8e338482d..8ce01f6b95777248ab772f5903bf061efdcabdce 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -51,7 +51,7 @@ fractional components, and such that
 with `self.concentration` and `self.total_count`."""
 
 
-@tf_export("distributions.DirichletMultinomial")
+@tf_export(v1=["distributions.DirichletMultinomial"])
 class DirichletMultinomial(distribution.Distribution):
   """Dirichlet-Multinomial compound distribution.
 
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 4741370cd80dd41ef2f05ca2f2d8c9b49af75011..d551830fb84784fd6503e4386b587ef1fb3c8101 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -212,7 +212,7 @@ class _DistributionMeta(abc.ABCMeta):
     return abc.ABCMeta.__new__(mcs, classname, baseclasses, attrs)
 
 
-@tf_export("distributions.ReparameterizationType")
+@tf_export(v1=["distributions.ReparameterizationType"])
 class ReparameterizationType(object):
   """Instances of this class represent how sampling is reparameterized.
 
@@ -263,7 +263,7 @@ class ReparameterizationType(object):
 # reparameterized distribution support straight-through gradients with
 # respect to all parameters.
 FULLY_REPARAMETERIZED = ReparameterizationType("FULLY_REPARAMETERIZED")
-tf_export("distributions.FULLY_REPARAMETERIZED").export_constant(
+tf_export(v1=["distributions.FULLY_REPARAMETERIZED"]).export_constant(
     __name__, "FULLY_REPARAMETERIZED")
 
 
@@ -271,12 +271,12 @@ tf_export("distributions.FULLY_REPARAMETERIZED").export_constant(
 # reparameterized distribution do not support straight-through gradients for
 # at least some of the parameters.
 NOT_REPARAMETERIZED = ReparameterizationType("NOT_REPARAMETERIZED")
-tf_export("distributions.NOT_REPARAMETERIZED").export_constant(
+tf_export(v1=["distributions.NOT_REPARAMETERIZED"]).export_constant(
     __name__, "NOT_REPARAMETERIZED")
 
 
 @six.add_metaclass(_DistributionMeta)
-@tf_export("distributions.Distribution")
+@tf_export(v1=["distributions.Distribution"])
 class Distribution(_BaseDistribution):
   """A generic probability distribution base class.
 
@@ -1315,7 +1315,7 @@ class Distribution(_BaseDistribution):
       return static_shape.ndims == 0
     shape = dynamic_shape_fn()
     if (shape.get_shape().ndims is not None and
-        shape.get_shape()[0].value is not None):
+        shape.get_shape().dims[0].value is not None):
       # If the static_shape is correctly written then we should never execute
       # this branch. We keep it just in case there's some unimagined corner
       # case.
diff --git a/tensorflow/python/ops/distributions/distributions.py b/tensorflow/python/ops/distributions/distributions.py
index 59ed455e43806dedf34818e63235dee5c4440fd2..b18caa5b2eb87141659c310c92c5c221e1f2cb3f 100644
--- a/tensorflow/python/ops/distributions/distributions.py
+++ b/tensorflow/python/ops/distributions/distributions.py
@@ -17,21 +17,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.util import deprecation
 
-# pylint: disable=wildcard-import,unused-import
-from tensorflow.python.ops.distributions.bernoulli import Bernoulli
-from tensorflow.python.ops.distributions.beta import Beta
-from tensorflow.python.ops.distributions.categorical import Categorical
-from tensorflow.python.ops.distributions.dirichlet import Dirichlet
-from tensorflow.python.ops.distributions.dirichlet_multinomial import DirichletMultinomial
-from tensorflow.python.ops.distributions.distribution import *
-from tensorflow.python.ops.distributions.exponential import Exponential
-from tensorflow.python.ops.distributions.gamma import Gamma
-from tensorflow.python.ops.distributions.kullback_leibler import *
-from tensorflow.python.ops.distributions.laplace import Laplace
-from tensorflow.python.ops.distributions.multinomial import Multinomial
-from tensorflow.python.ops.distributions.normal import Normal
-from tensorflow.python.ops.distributions.student_t import StudentT
-from tensorflow.python.ops.distributions.uniform import Uniform
-# pylint: enable=wildcard-import,unused-import
 
+# pylint: disable=wildcard-import,unused-import,g-import-not-at-top
+with deprecation.silence():
+  from tensorflow.python.ops.distributions.bernoulli import Bernoulli
+  from tensorflow.python.ops.distributions.beta import Beta
+  from tensorflow.python.ops.distributions.categorical import Categorical
+  from tensorflow.python.ops.distributions.dirichlet import Dirichlet
+  from tensorflow.python.ops.distributions.dirichlet_multinomial import DirichletMultinomial
+  from tensorflow.python.ops.distributions.distribution import *
+  from tensorflow.python.ops.distributions.exponential import Exponential
+  from tensorflow.python.ops.distributions.gamma import Gamma
+  from tensorflow.python.ops.distributions.kullback_leibler import *
+  from tensorflow.python.ops.distributions.laplace import Laplace
+  from tensorflow.python.ops.distributions.multinomial import Multinomial
+  from tensorflow.python.ops.distributions.normal import Normal
+  from tensorflow.python.ops.distributions.student_t import StudentT
+  from tensorflow.python.ops.distributions.uniform import Uniform
+# pylint: enable=wildcard-import,unused-import
+del deprecation
diff --git a/tensorflow/python/ops/distributions/exponential.py b/tensorflow/python/ops/distributions/exponential.py
index 6a52af8c33e8a62ef5ab18640a77660faefa52c2..8b79a5d4abdbb20086ac9cba49370a9b084fe2b6 100644
--- a/tensorflow/python/ops/distributions/exponential.py
+++ b/tensorflow/python/ops/distributions/exponential.py
@@ -37,7 +37,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.Exponential")
+@tf_export(v1=["distributions.Exponential"])
 class Exponential(gamma.Gamma):
   """Exponential distribution.
 
diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py
index 4a2db208d40982f7f2c6a933145deb4528a6853b..57505d1b1311054f4d837e5e0b958df855df4881 100644
--- a/tensorflow/python/ops/distributions/gamma.py
+++ b/tensorflow/python/ops/distributions/gamma.py
@@ -43,7 +43,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.Gamma")
+@tf_export(v1=["distributions.Gamma"])
 class Gamma(distribution.Distribution):
   """Gamma distribution.
 
diff --git a/tensorflow/python/ops/distributions/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py
index 12743fa23d6a3d69ee65d4d653f0d195d77fb0fc..5c6745b0fe0c4036f91885a30e428fe76f316b8f 100644
--- a/tensorflow/python/ops/distributions/kullback_leibler.py
+++ b/tensorflow/python/ops/distributions/kullback_leibler.py
@@ -60,7 +60,7 @@ def _registered_kl(type_a, type_b):
     "should update all references to use `tfp.distributions` "
     "instead of `tf.distributions`.",
     warn_once=True)
-@tf_export("distributions.kl_divergence")
+@tf_export(v1=["distributions.kl_divergence"])
 def kl_divergence(distribution_a, distribution_b,
                   allow_nan_stats=True, name=None):
   """Get the KL-divergence KL(distribution_a || distribution_b).
@@ -161,7 +161,7 @@ def cross_entropy(ref, other,
         ref, other, allow_nan_stats=allow_nan_stats)
 
 
-@tf_export("distributions.RegisterKL")
+@tf_export(v1=["distributions.RegisterKL"])
 class RegisterKL(object):
   """Decorator to register a KL divergence implementation function.
 
diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py
index 4f6a8f587d1e921413069a82df9bedf58730c310..a96b58ba1a64246e6f7f2d4a44bdbdae1f8d0cf8 100644
--- a/tensorflow/python/ops/distributions/laplace.py
+++ b/tensorflow/python/ops/distributions/laplace.py
@@ -43,7 +43,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.Laplace")
+@tf_export(v1=["distributions.Laplace"])
 class Laplace(distribution.Distribution):
   """The Laplace distribution with location `loc` and `scale` parameters.
 
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 8397353cd5ed38ac2afb1a773367d97db7185d90..97d2b1b26c68dc53f0a77120c9d3820c1d0f017b 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -52,7 +52,7 @@ fractional components, and such that
 with `self.probs` and `self.total_count`."""
 
 
-@tf_export("distributions.Multinomial")
+@tf_export(v1=["distributions.Multinomial"])
 class Multinomial(distribution.Distribution):
   """Multinomial distribution.
 
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 9f511709b90b039e739d6f14b4f293ef029a4dbf..9acc0469885c2463e84f875314f07d1f3d55481a 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -42,7 +42,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.Normal")
+@tf_export(v1=["distributions.Normal"])
 class Normal(distribution.Distribution):
   """The Normal distribution with location `loc` and `scale` parameters.
 
diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py
index b69e61925c122d6e29b76fe5de89a546b1695325..351f5605e24770c152ad01f09b9ee78b59c3ddf5 100644
--- a/tensorflow/python/ops/distributions/student_t.py
+++ b/tensorflow/python/ops/distributions/student_t.py
@@ -43,7 +43,7 @@ __all__ = [
 ]
 
 
-@tf_export("distributions.StudentT")
+@tf_export(v1=["distributions.StudentT"])
 class StudentT(distribution.Distribution):
   """Student's t-distribution.
 
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index b6b24187cc537809dc167205baf4c2a76e06c8d5..8fac0167778b824c9621462ce4981f6d767bedf2 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -33,7 +33,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("distributions.Uniform")
+@tf_export(v1=["distributions.Uniform"])
 class Uniform(distribution.Distribution):
   """Uniform distribution with `low` and `high` parameters.
 
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index ad848dfee6b9a4cc18a159e80d97da081fa48266..24314e8fc92b3aef2718dd6668ca5564764aa8f4 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -343,7 +343,7 @@ def embed_check_categorical_event_shape(
     x_dtype = x.dtype.base_dtype
     max_event_size = (_largest_integer_by_dtype(x_dtype)
                       if x_dtype.is_floating else 0)
-    if max_event_size is 0:
+    if max_event_size == 0:
       raise TypeError("Unable to validate size of unrecognized dtype "
                       "({}).".format(x_dtype.name))
     try:
@@ -351,8 +351,8 @@ def embed_check_categorical_event_shape(
     except ValueError:
       raise ValueError("A categorical-distribution parameter must have "
                        "at least 1 dimension.")
-    if x_shape_static[-1].value is not None:
-      event_size = x_shape_static[-1].value
+    if tensor_shape.dimension_value(x_shape_static[-1]) is not None:
+      event_size = x_shape_static.dims[-1].value
       if event_size < 2:
         raise ValueError("A categorical-distribution parameter must have at "
                          "least 2 events.")
@@ -831,9 +831,10 @@ def fill_triangular(x, upper=False, name=None):
 
   with ops.name_scope(name, "fill_triangular", values=[x]):
     x = ops.convert_to_tensor(x, name="x")
-    if x.shape.with_rank_at_least(1)[-1].value is not None:
+    if tensor_shape.dimension_value(
+        x.shape.with_rank_at_least(1)[-1]) is not None:
       # Formula derived by solving for n: m = n(n+1)/2.
-      m = np.int32(x.shape[-1].value)
+      m = np.int32(x.shape.dims[-1].value)
       n = np.sqrt(0.25 + 2. * m) - 0.5
       if n != np.floor(n):
         raise ValueError("Input right-most shape ({}) does not "
@@ -943,8 +944,9 @@ def fill_triangular_inverse(x, upper=False, name=None):
 
   with ops.name_scope(name, "fill_triangular_inverse", values=[x]):
     x = ops.convert_to_tensor(x, name="x")
-    if x.shape.with_rank_at_least(2)[-1].value is not None:
-      n = np.int32(x.shape[-1].value)
+    if tensor_shape.dimension_value(
+        x.shape.with_rank_at_least(2)[-1]) is not None:
+      n = np.int32(x.shape.dims[-1].value)
       m = np.int32((n * (n + 1)) // 2)
       static_final_shape = x.shape[:-2].concatenate([m])
     else:
@@ -1197,7 +1199,8 @@ def dimension_size(x, axis):
   """Returns the size of a specific dimension."""
   # Since tf.gather isn't "constant-in, constant-out", we must first check the
   # static shape or fallback to dynamic shape.
-  s = x.shape.with_rank_at_least(np.abs(axis))[axis].value
+  s = tensor_shape.dimension_value(
+      x.shape.with_rank_at_least(np.abs(axis))[axis])
   if s is not None:
     return s
   return array_ops.shape(x)[axis]
@@ -1247,7 +1250,7 @@ def process_quadrature_grid_and_probs(
 
     def _static_event_size(x):
       """Returns the static size of a specific dimension or `None`."""
-      return x.shape.with_rank_at_least(1)[-1].value
+      return tensor_shape.dimension_value(x.shape.with_rank_at_least(1)[-1])
 
     m, n = _static_event_size(probs), _static_event_size(grid)
     if m is not None and n is not None:
@@ -1316,7 +1319,8 @@ def pad(x, axis, front=False, back=False, value=0, count=1, name=None):
         head = x.shape[:axis]
         middle = tensor_shape.TensorShape(
             None if count_ is None
-            else (x.shape[axis] + count_ * (front + back)))
+            else (tensor_shape.dimension_at_index(
+                x.shape, axis) + count_ * (front + back)))
         tail = x.shape[axis+1:]
         final_shape = head.concatenate(middle.concatenate(tail))
       else:
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 6263041b8d703c2b37c41b41c4e7323882bc777c..d0291e2095bdb6574c707c7458e4cc335fc4b825 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -155,16 +155,19 @@ def _embedding_lookup_and_transform(params,
         # Compute num_total_ids as the sum of dim-0 of params, then assign to
         # partitions based on a constant number of ids per partition. Optimize
         # if we already know the full shape statically.
-        dim_0_size = params[0].get_shape()[0]
+        dim_0_size = tensor_shape.Dimension(tensor_shape.dimension_value(
+            params[0].get_shape()[0]))
         for p in xrange(1, np):
-          dim_0_size += params[p].get_shape()[0]
+          dim_0_size += tensor_shape.Dimension(tensor_shape.dimension_value(
+              params[p].get_shape()[0]))
         if dim_0_size.value:
           num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
         else:
           dim_0_sizes = []
           for p in xrange(np):
-            if params[p].get_shape()[0].value is not None:
-              dim_0_sizes.append(params[p].get_shape()[0].value)
+            param_p_dim = tensor_shape.dimension_value(params[p].get_shape()[0])
+            if param_p_dim is not None:
+              dim_0_sizes.append(param_p_dim)
             else:
               with ops.colocate_with(params[p]):
                 dim_0_sizes.append(array_ops.shape(params[p])[0])
@@ -244,7 +247,7 @@ def _embedding_lookup_and_transform(params,
       return ret
 
 
-@tf_export("nn.embedding_lookup")
+@tf_export(v1=["nn.embedding_lookup"])
 def embedding_lookup(
     params,
     ids,
@@ -313,7 +316,66 @@ def embedding_lookup(
       transform_fn=None)
 
 
-@tf_export("nn.embedding_lookup_sparse")
+@tf_export("nn.embedding_lookup", v1=[])
+def embedding_lookup_v2(
+    params,
+    ids,
+    partition_strategy="mod",
+    max_norm=None,
+    name=None):
+  """Looks up `ids` in a list of embedding tensors.
+
+  This function is used to perform parallel lookups on the list of
+  tensors in `params`.  It is a generalization of
+  `tf.gather`, where `params` is
+  interpreted as a partitioning of a large embedding tensor.  `params` may be
+  a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  partitioner.
+
+  If `len(params) > 1`, each element `id` of `ids` is partitioned between
+  the elements of `params` according to the `partition_strategy`.
+  In all strategies, if the id space does not evenly divide the number of
+  partitions, each of the first `(max_id + 1) % len(params)` partitions will
+  be assigned one more id.
+
+  If `partition_strategy` is `"mod"`, we assign each id to partition
+  `p = id % len(params)`. For instance,
+  13 ids are split across 5 partitions as:
+  `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`
+
+  If `partition_strategy` is `"div"`, we assign ids to partitions in a
+  contiguous manner. In this case, 13 ids are split across 5 partitions as:
+  `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`
+
+  The results of the lookup are concatenated into a dense
+  tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
+
+  Args:
+    params: A single tensor representing the complete embedding tensor,
+      or a list of P tensors all of same shape except for the first dimension,
+      representing sharded embedding tensors.  Alternatively, a
+      `PartitionedVariable`, created by partitioning along dimension 0. Each
+      element must be appropriately sized for the given `partition_strategy`.
+    ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
+      up in `params`.
+    partition_strategy: A string specifying the partitioning strategy, relevant
+      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
+      is `"mod"`.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is
+      larger than this value.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` with the same type as the tensors in `params`.
+
+  Raises:
+    ValueError: If `params` is empty.
+  """
+  return embedding_lookup(params, ids, partition_strategy, name,
+                          max_norm=max_norm)
+
+
+@tf_export(v1=["nn.embedding_lookup_sparse"])
 def embedding_lookup_sparse(params,
                             sp_ids,
                             sp_weights,
@@ -488,7 +550,85 @@ def embedding_lookup_sparse(params,
     return embeddings
 
 
-@tf_export("nn.safe_embedding_lookup_sparse")
+@tf_export("nn.embedding_lookup_sparse", v1=[])
+def embedding_lookup_sparse_v2(params,
+                               sp_ids,
+                               sp_weights,
+                               partition_strategy="mod",
+                               combiner=None,
+                               max_norm=None,
+                               name=None):
+  return embedding_lookup_sparse_v2(
+      params, sp_ids, sp_weights, partition_strategy, name, combiner, max_norm)
+
+
+embedding_lookup_sparse_v2.__doc__ = embedding_lookup_sparse.__doc__
+
+
+@tf_export("nn.safe_embedding_lookup_sparse", v1=[])
+def safe_embedding_lookup_sparse_v2(embedding_weights,
+                                    sparse_ids,
+                                    sparse_weights=None,
+                                    combiner="mean",
+                                    default_id=None,
+                                    max_norm=None,
+                                    name=None):
+  """Lookup embedding results, accounting for invalid IDs and empty features.
+
+  The partitioned embedding in `embedding_weights` must all be the same shape
+  except for the first dimension. The first dimension is allowed to vary as the
+  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
+  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  partitioner.
+
+  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
+  with non-positive weight. For an entry with no features, the embedding vector
+  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.
+
+  The ids and weights may be multi-dimensional. Embeddings are always aggregated
+  along the last dimension.
+
+  Note: when doing embedding lookup on `embedding_weights`, "div" partition
+  strategy will be used. Support for other partition strategy will be added
+  later.
+
+  Args:
+    embedding_weights:  A list of `P` float `Tensor`s or values representing
+      partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
+      created by partitioning along dimension 0.  The total unpartitioned shape
+      should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size
+      and `e_1, ..., e_m` are the embedding dimensions.
+    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
+      ids. `d_0` is typically batch size.
+    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
+      float weights corresponding to `sparse_ids`, or `None` if all weights are
+      be assumed to be 1.0.
+    combiner: A string specifying how to combine embedding results for each
+      entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
+      default.
+    default_id: The id to use for an entry with no features.
+    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
+      combining.
+    name: A name for this operation (optional).
+
+  Returns:
+    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
+
+  Raises:
+    ValueError: if `embedding_weights` is empty.
+  """
+  return safe_embedding_lookup_sparse(
+      embedding_weights,
+      sparse_ids,
+      sparse_weights=sparse_weights,
+      combiner=combiner,
+      default_id=default_id,
+      name=name,
+      partition_strategy="div",
+      max_norm=max_norm)
+
+
+@tf_export(v1=["nn.safe_embedding_lookup_sparse"])
 def safe_embedding_lookup_sparse(embedding_weights,
                                  sparse_ids,
                                  sparse_weights=None,
@@ -551,7 +691,10 @@ def safe_embedding_lookup_sparse(embedding_weights,
 
   dtype = sparse_weights.dtype if sparse_weights is not None else None
   embedding_weights = [
-      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
+      w if (isinstance(w, resource_variable_ops.ResourceVariable)
+            and dtype in (None, w.dtype))
+      else ops.convert_to_tensor(w, dtype=dtype)
+      for w in embedding_weights
   ]
 
   with ops.name_scope(name, 'embedding_lookup',
@@ -559,11 +702,12 @@ def safe_embedding_lookup_sparse(embedding_weights,
                                            sparse_weights]) as scope:
     # Reshape higher-rank sparse ids and weights to linear segment ids.
     original_shape = sparse_ids.dense_shape
-    original_rank_dim = sparse_ids.dense_shape.get_shape()[0]
+    original_rank_dim = tensor_shape.dimension_value(
+        sparse_ids.dense_shape.get_shape()[0])
     original_rank = (
         array_ops.size(original_shape)
-        if original_rank_dim.value is None
-        else original_rank_dim.value)
+        if original_rank_dim is None
+        else original_rank_dim)
     sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
         math_ops.reduce_prod(
             array_ops.slice(original_shape, [0], [original_rank - 1])),
@@ -617,7 +761,8 @@ def safe_embedding_lookup_sparse(embedding_weights,
             array_ops.slice(array_ops.shape(result), [1], [-1])
         ], 0))
     final_result.set_shape(tensor_shape.unknown_shape(
-        (original_rank_dim - 1).value).concatenate(result.get_shape()[1:]))
+        (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
+            result.get_shape()[1:]))
     return final_result
 
 
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 119d9522bd58276dc73e5d73978adb77a77eb053..57542e3c7baa0f4eb3dc53431c9a3060f0998c5b 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -13,16 +13,13 @@
 # limitations under the License.
 # =============================================================================
 
-"""Functional operations.
-
-See the [Higher Order
-Functions](https://tensorflow.org/api_guides/python/functional_ops) guide.
-"""
+"""Functional operations."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -42,6 +39,7 @@ from tensorflow.python.ops.gen_functional_ops import remote_call
 # pylint: enable=unused-import
 from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -124,7 +122,8 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     elems_flat = [
         ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
     ]
-    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+    n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
+         or array_ops.shape(elems_flat[0])[0])
 
     elems_ta = nest.map_structure(create_ta, elems)
 
@@ -231,7 +230,8 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
     elems_flat = [
         ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
     ]
-    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+    n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
+         or array_ops.shape(elems_flat[0])[0])
 
     elems_ta = nest.map_structure(create_ta, elems)
 
@@ -445,7 +445,8 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=None, back_prop=True,
         raise ValueError(
             "elements in elems must be 1+ dimensional Tensors, not scalars"
         )
-    n = static_shape[0].value or array_ops.shape(elems_flat[0])[0]
+    n = (tensor_shape.dimension_value(static_shape[0])
+         or array_ops.shape(elems_flat[0])[0])
 
     # TensorArrays are always flat
     elems_ta = [
@@ -494,9 +495,11 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=None, back_prop=True,
         maximum_iterations=n)
     results_flat = [r.stack() for r in r_a]
 
-    n_static = elems_flat[0].get_shape().with_rank_at_least(1)[0]
+    n_static = tensor_shape.Dimension(tensor_shape.dimension_value(
+        elems_flat[0].get_shape().with_rank_at_least(1)[0]))
     for elem in elems_flat[1:]:
-      n_static.merge_with(elem.get_shape().with_rank_at_least(1)[0])
+      n_static.merge_with(tensor_shape.Dimension(tensor_shape.dimension_value(
+          elem.get_shape().with_rank_at_least(1)[0])))
     for r in results_flat:
       r.set_shape(tensor_shape.TensorShape(n_static).concatenate(
           r.get_shape()[1:]))
@@ -643,7 +646,8 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]
 
     # Convert elems to tensor array. n may be known statically.
-    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+    n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
+         or array_ops.shape(elems_flat[0])[0])
 
     # TensorArrays are always flat
     elems_ta = [
@@ -719,9 +723,11 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
 
     results_flat = [r.stack() for r in r_a]
 
-    n_static = elems_flat[0].get_shape().with_rank_at_least(1)[0]
+    n_static = tensor_shape.Dimension(tensor_shape.dimension_value(
+        elems_flat[0].get_shape().with_rank_at_least(1)[0]))
     for elem in elems_flat[1:]:
-      n_static.merge_with(elem.get_shape().with_rank_at_least(1)[0])
+      n_static.merge_with(tensor_shape.Dimension(tensor_shape.dimension_value(
+          elem.get_shape().with_rank_at_least(1)[0])))
     for r in results_flat:
       r.set_shape(tensor_shape.TensorShape(n_static).concatenate(
           r.get_shape()[1:]))
@@ -796,6 +802,29 @@ def Gradient(inputs, f, name=None):
   return symbolic_gradient(input=inputs, Tout=tlist, f=f, name=name)
 
 
+def _LoopBodyCaptureWrapper(func):
+  """Returns a wrapper for `func` that handles loop-carried captured inputs."""
+
+  @function.Defun(
+      *func.declared_input_types, func_name="%s_Wrapper" % func.name)
+  def Wrapper(*args):
+    """A wrapper that handles loop-carried captured inputs."""
+    result = func(*args)
+    extra_args = tuple(function.get_extra_args())
+    # Nullary functions return an Operation. Normal functions can't do this
+    # because their return values are converted to Tensors.
+    if isinstance(result, ops.Operation):
+      return extra_args
+    # Unary functions return a single Tensor value.
+    elif not isinstance(result, tuple):
+      return (result,) + extra_args
+    # N-ary functions return a tuple of Tensors.
+    else:
+      return result + extra_args
+
+  return Wrapper
+
+
 # pylint: disable=invalid-name,protected-access
 def While(input_, cond, body, name=None, hostmem=None):
   r"""output = input; While (Cond(output)) { output = Body(output) }.
@@ -817,11 +846,41 @@ def While(input_, cond, body, name=None, hostmem=None):
     hostmem: A list of integer. If i is in the list, input[i] is a
       host memory tensor.
 
+  Raises:
+    ValueError: if `cond` has implicitly captured inputs or if `cond` and `body`
+      have different signatures.
+
   Returns:
     A list of `Tensor` objects. Has the same type as `input`.
     A list of output tensors whose types are T.
   """
-  ret = gen_functional_ops._while(input_, cond, body, name=name)
+  if cond.captured_inputs:
+    raise ValueError("While op 'cond' argument must be a function "
+                     "without implicitly captured inputs.")
+
+  if cond.declared_input_types != body.declared_input_types:
+    raise ValueError(
+        "While op 'cond' and 'body' signatures do not match. %r vs %r" %
+        (cond.declared_input_types, body.declared_input_types))
+
+  if body.captured_inputs:
+    cond_dtypes = list(
+        body.declared_input_types) + [t.dtype for t in body.captured_inputs]
+
+    @function.Defun(*cond_dtypes, func_name="%s_Wrapper" % cond.name)
+    def CondWrapper(*args):
+      """A wrapper that handles loop-carried captured inputs."""
+      return cond(*args[:len(body.declared_input_types)])
+
+    ret = gen_functional_ops._while(
+        input_ + body.captured_inputs,
+        CondWrapper,
+        _LoopBodyCaptureWrapper(body),
+        name=name)
+    # Slice off the loop-carried captured inputs.
+    ret = ret[:-len(body.captured_inputs)]
+  else:
+    ret = gen_functional_ops._while(input_, cond, body, name=name)
   if hostmem:
     input_attr = attr_value_pb2.AttrValue()
     input_attr.list.i.extend(hostmem)
@@ -870,11 +929,10 @@ def _ForUsingWhile(start,
   # must have identical inputs, we have to augment the cond signature to take
   # the same types as the carried loop variables.
   body_sig = [dtypes.int32] * 4 + list(forbody.declared_input_types)[1:]
-  cond_sig = body_sig + [t.dtype for t in forbody.captured_inputs]
 
   cond_name = "%s_Cond" % forbody.name
 
-  @function.Defun(*cond_sig, func_name=cond_name)
+  @function.Defun(*body_sig, func_name=cond_name)
   def WhileCond(i, n, *args):
     del args
     return i < n
@@ -892,8 +950,7 @@ def _ForUsingWhile(start,
     # Unary functions return a single Tensor value.
     elif isinstance(for_result, ops.Tensor):
       for_result = (for_result,)
-    extra_args = tuple(function.get_extra_args())
-    return (i + 1, n, start, delta) + tuple(for_result) + extra_args
+    return (i + 1, n, start, delta) + tuple(for_result)
 
   if hostmem is not None:
     hostmem = [0, 1, 2, 3] + [(4 + _) for _ in hostmem]
@@ -901,13 +958,13 @@ def _ForUsingWhile(start,
     hostmem = [0, 1, 2, 3]
 
   results = While(
-      input_=[0, n, start, delta] + inputs + WhileBody.captured_inputs,
+      input_=[0, n, start, delta] + inputs,
       cond=WhileCond,
       body=WhileBody,
       name=name,
       hostmem=hostmem)
   # Slice off the loop-carried captured inputs.
-  return list(results[4:len(results) - len(WhileBody.captured_inputs)])
+  return list(results[4:len(results)])
 
 
 def For(start,
@@ -941,29 +998,15 @@ def For(start,
   if rewrite_with_while:
     return _ForUsingWhile(start, limit, delta, inputs, body, name, hostmem)
   if body.captured_inputs:
-    wrapper_name = "%s_BodyWrapper" % body.name
-
-    @function.Defun(*body.declared_input_types, func_name=wrapper_name)
-    def BodyWrapper(*args):
-      """A wrapper for body that handles loop-carried captured inputs."""
-      body_result = body(*args)
-      extra_args = tuple(function.get_extra_args())
-      # Nullary functions return an Operation. Normal functions can't do this
-      # because their return values are converted to Tensors.
-      if isinstance(body_result, ops.Operation):
-        return extra_args
-      # Unary functions return a single Tensor value.
-      elif not isinstance(body_result, tuple):
-        return (body_result,) + extra_args
-      # N-ary functions return a tuple of Tensors.
-      else:
-        return body_result + extra_args
-
-    inputs += BodyWrapper.captured_inputs
     ret = gen_functional_ops._for(
-        start, limit, delta, inputs, BodyWrapper, name=name)
+        start,
+        limit,
+        delta,
+        inputs + body.captured_inputs,
+        _LoopBodyCaptureWrapper(body),
+        name=name)
     # Slice off the loop-carried captured inputs.
-    ret = ret[:-len(BodyWrapper.captured_inputs)]
+    ret = ret[:-len(body.captured_inputs)]
   else:
     ret = gen_functional_ops._for(start, limit, delta, inputs, body, name=name)
   if hostmem:
@@ -979,8 +1022,20 @@ def For(start,
   return ret
 # pylint: enable=invalid-name,protected-access
 
+_rewriter_config_optimizer_disabled = None
+
+def _get_disabled_rewriter_config():
+  global _rewriter_config_optimizer_disabled
+  if _rewriter_config_optimizer_disabled is None:
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
+    rewriter_config.disable_meta_optimizer = True
+    _rewriter_config_optimizer_disabled = config.SerializeToString()
+  return _rewriter_config_optimizer_disabled
+
 
-def partitioned_call(args, f, tout=None, executing_eagerly=None):
+def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
+                     executor_type=None):
   """Executes a function while respecting device annotations.
 
   Currently, only those functions that execute within the same address space
@@ -994,6 +1049,12 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None):
       the signature of `f`.
     executing_eagerly: (Optional) A boolean indicating whether the context is
       executing eagerly. If `None`, fetched from the global context.
+    config: (Optional) A `tensorflow::ConfigProto` proto, serialized. If
+      `None`, all optimizations are disabled. Currently only handled for eager
+      defined functions.
+    executor_type: (Optional) A string for the name of the executor to be used
+      in the function call. If not set, or set to an empty string, the default
+      tensorflow executor will be used.
 
   Returns:
     The list of `Tensor`s returned by invoking `f(args)`. If the function does
@@ -1007,12 +1068,21 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None):
   if executing_eagerly is None:
     executing_eagerly = context.executing_eagerly()
 
+  if config is None:
+    config = _get_disabled_rewriter_config()
+
+  if executor_type is None:
+    executor_type = ""
+
   if executing_eagerly or len(tout):
     if f.stateful_ops:
       outputs = gen_functional_ops.stateful_partitioned_call(
-          args=args, Tout=tout, f=f)
+          args=args, Tout=tout, f=f, config_proto=config,
+          executor_type=executor_type)
     else:
-      outputs = gen_functional_ops.partitioned_call(args=args, Tout=tout, f=f)
+      outputs = gen_functional_ops.partitioned_call(
+          args=args, Tout=tout, f=f, config_proto=config,
+          executor_type=executor_type)
     return outputs if outputs else None
 
   # The generated binding returns an empty list for functions that don't
@@ -1025,6 +1095,13 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None):
       list=attr_value_pb2.AttrValue.ListValue(type=tout))
   func_attr = attr_value_pb2.AttrValue(
       func=attr_value_pb2.NameAttrList(name=f.name))
+  executor_type_attr = attr_value_pb2.AttrValue(
+      s=compat.as_bytes(executor_type))
+
+  # When running in graph mode, the graph and function graphs are optimized
+  # (i.e. run through grappler) per the session options, so we can disable any
+  # eager-specific rewriting.
+  config_proto = attr_value_pb2.AttrValue(s=_get_disabled_rewriter_config())
 
   graph = ops.get_default_graph()
   f.add_to_graph(graph)
@@ -1035,6 +1112,12 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None):
       tout,
       compute_shapes=False,
       name="PartitionedFunctionCall",
-      attrs={"Tin": tin_attr, "Tout": tout_attr, "f": func_attr})
+      attrs={
+          "Tin": tin_attr,
+          "Tout": tout_attr,
+          "f": func_attr,
+          "config_proto": config_proto,
+          "executor_type": executor_type_attr,
+      })
   outputs = op.outputs
   return outputs if outputs else op
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 94c8d7933523a315523cf7b2d34d8263785b6eeb..683f78ce9b21c5a1b5d8b60017588ee8a09686f2 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -157,7 +157,8 @@ def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
   # as delta. Convert to float32 here. Since numeric_jacobian is expected to
   # be the groundtruth to compare against, it shouldn't lose any information.
   if x.dtype == dtypes.bfloat16:
-    x = math_ops.cast(x, dtypes.float32)
+    x = math_ops.cast(x, dtypes.float32)  # TODO(wangpeng): Now that the new x
+            # is an output of the old x, isn't feeding to the new x a mistake?
   if y.dtype == dtypes.bfloat16:
     y = math_ops.cast(y, dtypes.float32)
   if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
@@ -265,7 +266,7 @@ def _compute_gradient_list(x,
   return ret
 
 
-@tf_export("test.compute_gradient")
+@tf_export(v1=["test.compute_gradient"])
 def compute_gradient(x,
                      x_shape,
                      y,
@@ -300,7 +301,6 @@ def compute_gradient(x,
       as the initial value.
     delta: (optional) the amount of perturbation.
     init_targets: list of targets to run to initialize model params.
-      TODO(mrry): remove this argument.
     extra_feed_dict: dict that allows fixing specified tensor values
       during the Jacobian calculation.
 
@@ -310,6 +310,7 @@ def compute_gradient(x,
     where "x_size" is the number of elements in x and "y_size" is the
     number of elements in y. If x is a list, returns a list of two numpy arrays.
   """
+  # TODO(mrry): remove argument `init_targets`
   if extra_feed_dict is None:
     extra_feed_dict = {}
 
@@ -327,7 +328,17 @@ def compute_gradient(x,
     return ret
 
 
-@tf_export("test.compute_gradient_error")
+def _compute_error(grad):
+  if isinstance(grad, tuple):
+    grad = [grad]
+  error = 0
+  for j_t, j_n in grad:
+    if j_t.size or j_n.size:  # Handle zero size tensors correctly
+      error = np.maximum(error, np.fabs(j_t - j_n).max())
+  return error
+
+
+@tf_export(v1=["test.compute_gradient_error"])
 def compute_gradient_error(x,
                            x_shape,
                            y,
@@ -369,10 +380,4 @@ def compute_gradient_error(x,
   """
   grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta,
                           init_targets, extra_feed_dict=extra_feed_dict)
-  if isinstance(grad, tuple):
-    grad = [grad]
-  error = 0
-  for j_t, j_n in grad:
-    if j_t.size or j_n.size:  # Handle zero size tensors correctly
-      error = np.maximum(error, np.fabs(j_t - j_n).max())
-  return error
+  return _compute_error(grad)
diff --git a/tensorflow/python/ops/gradient_checker_test.py b/tensorflow/python/ops/gradient_checker_test.py
index fbb84b9018765b4a31f4eab1001641ce9d2a53ae..4d2b5efac7beb258f2720055bb3db56e9790042f 100644
--- a/tensorflow/python/ops/gradient_checker_test.py
+++ b/tensorflow/python/ops/gradient_checker_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
@@ -46,9 +47,10 @@ def _nan_grad(unused_op, grad):
 
 class GradientCheckerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAddSimple(self):
     np.random.seed(1)  # Fix seed to avoid flakiness
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       # a test case for Add operation
       size = (2, 3)
       x1 = constant_op.constant(2.0, shape=size, name="x1")
@@ -60,9 +62,10 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("x1 error = %f", error)
     assert error < 1e-4
 
+  @test_util.run_deprecated_v1
   def testAddSimpleGPU(self):
     np.random.seed(2)  # Fix seed to avoid flakiness
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # a test case for Add operation
       size = (2, 3)
       x1 = constant_op.constant(2.0, shape=size, name="x1")
@@ -74,6 +77,7 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("x1 error = %f", error)
     assert error < 1e-4
 
+  @test_util.run_deprecated_v1
   def testAddCustomized(self):
     np.random.seed(3)  # Fix seed to avoid flakiness
     with self.cached_session():
@@ -92,6 +96,7 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("x2 error = %f", error)
     assert error < 1e-10
 
+  @test_util.run_deprecated_v1
   def testGather(self):
     np.random.seed(4)  # Fix seed to avoid flakiness
     with self.cached_session():
@@ -109,6 +114,7 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("gather error = %f", error)
     assert error < 1e-4
 
+  @test_util.run_deprecated_v1
   def testNestedGather(self):
     np.random.seed(5)  # Fix seed to avoid flakiness
     with self.cached_session():
@@ -130,6 +136,7 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("nested gather error = %f", error)
     assert error < 1e-4
 
+  @test_util.run_deprecated_v1
   def testComplexMul(self):
     with self.cached_session():
       size = ()
@@ -144,6 +151,7 @@ class GradientCheckerTest(test.TestCase):
       self.assertLess(
           gradient_checker.compute_gradient_error(x, size, y, size), 2e-4)
 
+  @test_util.run_deprecated_v1
   def testComplexConj(self):
     with self.cached_session():
       size = ()
@@ -157,6 +165,7 @@ class GradientCheckerTest(test.TestCase):
       self.assertLess(
           gradient_checker.compute_gradient_error(x, size, y, size), 2e-5)
 
+  @test_util.run_deprecated_v1
   def testEmptySucceeds(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32)
@@ -216,7 +225,7 @@ class MiniMNISTTest(test.TestCase):
     s = label_data.sum(axis=1)
     label_data /= s[:, None]
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       # We treat the inputs as "parameters" here
       inp = constant_op.constant(
           inp_data.tolist(),
@@ -279,18 +288,23 @@ class MiniMNISTTest(test.TestCase):
     tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
     return err
 
+  @test_util.run_deprecated_v1
   def testInputGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(0, "input"), 1e-8)
 
+  @test_util.run_deprecated_v1
   def testHiddenWeightGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(1, "hidden_weight"), 1e-8)
 
+  @test_util.run_deprecated_v1
   def testHiddenBiasGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(2, "hidden_bias"), 1e-8)
 
+  @test_util.run_deprecated_v1
   def testSoftmaxWeightGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(3, "softmax_weight"), 1e-8)
 
+  @test_util.run_deprecated_v1
   def testSoftmaxBiasGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(4, "softmax_bias"), 1e-8)
 
diff --git a/tensorflow/python/ops/gradient_checker_v2.py b/tensorflow/python/ops/gradient_checker_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d473eeb5f4f00087672da53c5fef3ab63bdbd08
--- /dev/null
+++ b/tensorflow/python/ops/gradient_checker_v2.py
@@ -0,0 +1,329 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Gradient checker for functions.
+
+The gradient checker verifies numerically that an function properly
+computes the gradients
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _product(t):
+  if isinstance(t, int):
+    return t
+  else:
+    y = 1
+    for x in t:
+      y *= x
+    return y
+
+
+def _eval_indexed_slices(a):
+  """Converts IndexedSlices to IndexedSlicesValue with numpy indices/values.
+
+  When eager execution is enabled, converts IndexedSlices
+  to IndexedSlicesValue with numpy indices/values.
+
+  Args:
+    a: any value.
+
+  Returns:
+    If a is IndexedSlices and eager execution is enabled, calls numpy() on a's
+    fields. Otherwise returns a unchanged.
+  """
+  if isinstance(a, ops.IndexedSlices) and context.executing_eagerly():
+    return ops.IndexedSlicesValue(
+        indices=[x.numpy() for x in a.indices],
+        values=[x.numpy() for x in a.values],
+        dense_shape=a.dense_shape)
+  return a
+
+
+def _to_numpy(a):
+  """Converts Tensors and EagerTensors to numpy arrays.
+
+  Args:
+    a: any value.
+
+  Returns:
+    If a is EagerTensor or Tensor, returns the evaluation of a by calling
+    numpy() or run(). Otherwise returns a unchanged.
+  """
+  if isinstance(a, ops.EagerTensor):
+    return a.numpy()
+  if isinstance(a, ops.Tensor):
+    sess = ops.get_default_session()
+    return sess.run(a)
+  return a
+
+
+def _prepare(f, xs_dtypes):
+  """Return a function that executes 'f'.
+
+    In TF 2.x, this is the same as `f`.
+    In TF 1.x, returns a Python function that executes the graph defined by `f`
+    in a Session.
+
+  Args:
+    f: the function.
+    xs_dtypes: dtypes of f's arguments.
+
+  Returns:
+    a function that will be evaluated in both graph and eager mode
+  """
+  if context.executing_eagerly():
+
+    def decorated_eager(*xs_data):
+      return f(*map(ops.convert_to_tensor, xs_data))
+
+    return decorated_eager
+  xs = [array_ops.placeholder(x_dtype) for x_dtype in xs_dtypes]
+  y = f(*xs)
+  sess = ops.get_default_session()
+  def decorated_graph(*xs_data):
+    xs_data = [_to_numpy(a) for a in xs_data]
+    return sess.run(y, feed_dict=dict(zip(xs, xs_data)))
+  return decorated_graph
+
+
+def _compute_theoretical_jacobian(f, y_shape, y_dtype, xs, param):
+  """Computes the theoretical Jacobian for f regarding xs[param].
+
+  One can think of the relation among f, xs and y as y = f(xs).
+
+  Args:
+    f: the function.
+    y_shape: the shape of the result.
+    y_dtype: the dtype of the result.
+    xs: a list of tensors.
+    param: the index of the target parameter.
+
+  Returns:
+    A 2-d numpy array representing the Jacobian. It has "x_size" rows
+    and "y_size" columns where "x_size" is the number of elements in xs[param]
+    and "y_size" is the number of elements in the result.
+
+  Raises:
+    ValueError: If result is empty but the gradient is nonzero.
+  """
+  x = xs[param]
+  # Complex vectors are treated as vectors of twice as many reals.
+  x_shape = tuple(x.shape) + (2,) if x.dtype.is_complex else x.shape
+  y_factor = 2 if y_dtype.is_complex else 1
+
+  # To compute the jacobian, we treat x and y as one-dimensional vectors.
+  x_size = _product(x_shape)
+  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
+  y_size = _product(y_shape) * y_factor
+
+  # Allocate 2-D Jacobian, with x dimensions smashed into the first
+  # dimension and y dimensions smashed into the second.
+  jacobian = np.zeros((x_size, y_size), dtype=x.dtype.real_dtype.as_numpy_dtype)
+
+  # For each of the entry of dy, we set this to be 1 and
+  # everything else to be 0 and compute the gradients -- this will give us one
+  # one column of the Jacobian matrix.
+  dy_data = np.zeros(y_shape, dtype=y_dtype.as_numpy_dtype)
+  dy_data_flat = dy_data.ravel().view(y_dtype.real_dtype.as_numpy_dtype)
+  grad_fn_unprep = backprop.gradients_function(f, [param])
+  grad_fn = _prepare(lambda dy, *xs: grad_fn_unprep(*xs, dy=dy),
+                     [y_dtype] + [x.dtype for x in xs])
+  for col in range(y_size):
+    dy_data_flat[col] = 1
+    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
+    grad = _eval_indexed_slices(grad)
+    dy_data_flat[col] = 0
+    if isinstance(grad, ops.IndexedSlicesValue):
+      for i, v in zip(grad.indices, grad.values):
+        r_begin = i * x_val_size
+        r_end = r_begin + x_val_size
+        jacobian[r_begin:r_end, col] += v.flat
+    else:
+      jacobian[:, col] = grad.ravel().view(jacobian.dtype)
+
+  # If the output is empty, run the gradients at least once and make sure
+  # they produce zeros.
+  if y_size == 0:  # don't use 'not y_size', because y_size may not be an int
+    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
+    if grad.shape != x.shape:
+      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
+                       (x.shape, grad.shape))
+    if np.any(grad):
+      raise ValueError("Empty tensor with nonzero gradients")
+
+  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
+  return jacobian
+
+
+def _compute_numeric_jacobian(f, y_size, y_dtype, xs, param,
+                              delta):
+  """Computes the numeric Jacobian for f regarding xs[param].
+
+  One can think of the relation among f, xs and y as y = f(xs).
+
+  Args:
+    f: the function.
+    y_size: the number of elements of the result.
+    y_dtype: the dtype of the result.
+    xs: a list of tensors.
+    param: the index of the target parameter.
+    delta: the amount of perturbation we give to the input.
+
+  Returns:
+    A 2-d numpy array representing the Jacobian. It has "x_size" rows
+    and "y_size" columns where "x_size" is the number of elements in xs[param]
+    and "y_size" is the number of elements in the result.
+  """
+  # bfloat16 doesn't have enough bits to represent high precision numbers such
+  # as delta. Convert to float32 here. Since numeric_jacobian is expected to
+  # be the groundtruth to compare against, it shouldn't lose any information.
+  x_shape = xs[param].shape
+  x_dtype = xs[param].dtype
+  if y_dtype == dtypes.bfloat16:
+    f = lambda *xs: math_ops.cast(f(*xs), dtypes.float32)
+    y_dtype = dtypes.float32
+
+  # To compute the jacobian, we treat x and y as one-dimensional vectors
+  x_size = _product(x_shape) * (2 if x_dtype.is_complex else 1)
+  y_size = y_size * (2 if y_dtype.is_complex else 1)
+  x_dtype = x_dtype.real_dtype.as_numpy_dtype
+  y_dtype = y_dtype.real_dtype.as_numpy_dtype
+
+  xs_dtypes = [x.dtype for x in xs]
+  # Converts xs to numpy arrays to do in-place perturbation.
+  # Calls asarray() to avoid copying in ravel() later.
+  xs = [np.asarray(_to_numpy(x)) for x in xs]
+  x = xs[param]
+
+  # Make sure we have the right types
+  scale = np.asarray(2 * delta, dtype=y_dtype)[()]
+
+  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
+  # For each of the entry of x, we slightly perturbs this by adding and
+  # subtracting a delta and then compute difference between the outputs. This
+  # will give us one row of the Jacobian matrix.
+
+  f = _prepare(f, xs_dtypes)
+  for row in range(x_size):
+    original = x.ravel().view(x_dtype)[row]
+    x.ravel().view(x_dtype)[row] += delta
+    y_pos = _to_numpy(f(*xs))
+    x.ravel().view(x_dtype)[row] = original
+    x.ravel().view(x_dtype)[row] -= delta
+    y_neg = _to_numpy(f(*xs))
+    x.ravel().view(x_dtype)[row] = original
+    diff = (y_pos - y_neg) / scale
+    jacobian[row, :] = diff.ravel().view(y_dtype)
+
+  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
+  return jacobian
+
+
+def _compute_gradient(f,
+                      y_shape,
+                      y_dtype,
+                      xs,
+                      param,
+                      delta):
+  """Computes the theoretical and numerical jacobian."""
+  x = xs[param]
+  t = x.dtype
+  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.complex64, dtypes.complex128]
+  assert t.base_dtype in allowed_types, ("Cannot compute gradient for"
+                                         "unsupported type %s of argument %s" %
+                                         (t.name, param))
+  t2 = y_dtype
+  assert t2.base_dtype in allowed_types, ("Cannot compute gradient for"
+                                          "unsupported type %s of y" % t2.name)
+  y_size = _product(y_shape)
+  jacob_t = _compute_theoretical_jacobian(f, y_shape, y_dtype,
+                                          xs, param)
+  jacob_n = _compute_numeric_jacobian(f, y_size, y_dtype, xs,
+                                      param, delta)
+  return jacob_t, jacob_n
+
+
+def _compute_gradient_list(f, xs, delta):
+  """Compute gradients for a list of x values."""
+  # convert xs to tensors so that dtype and shape have uniform types
+  xs = list(map(ops.convert_to_tensor, xs))
+  # run the function to get info of the result
+  xs_dtypes = [x.dtype for x in xs]
+  f_temp = _prepare(f, xs_dtypes)
+  y = f_temp(*xs)
+  return zip(*[_compute_gradient(f, y.shape, dtypes.as_dtype(y.dtype),
+                                 xs, i, delta) for i in range(len(xs))])
+
+
+@tf_export("test.compute_gradient", v1=[])
+def compute_gradient(f, x, delta=1e-3):
+  """Computes the theoretical and numeric Jacobian of f.
+
+  With y = f(x), computes the theoretical and numeric Jacobian dy/dx.
+
+  Args:
+    f: the function.
+    x: a list of tensors.
+    delta: (optional) perturbation used to compute numeric Jacobian.
+
+  Returns:
+    A pair of lists, where the first is a list of 2-d numpy arrays representing
+    the theoretical Jacobians for each argument, and the second list is the
+    numerical ones. Each 2-d array has "x_size" rows
+    and "y_size" columns where "x_size" is the number of elements in the
+    corresponding argument and "y_size" is the number of elements in f(x).
+
+  Raises:
+    ValueError: If result is empty but the gradient is nonzero.
+  """
+  if not isinstance(x, list):
+    raise ValueError(
+        "`x` must be a list of Tensors (arguments to `f`), not a %s" % type(x))
+  return _compute_gradient_list(f, x, delta)
+
+
+def max_error(grad1, grad2):
+  """Computes maximum elementwise gap.
+
+  Computes the maximum elementwise gap between two lists of tensors of the same
+  shape.
+
+  Args:
+    grad1: a lists of tensors.
+    grad2: a lists of tensors with the same shape as grad1.
+
+  Returns:
+    The maximum elementwise gap between the two.
+  """
+  error = 0
+  for j_t, j_n in zip(grad1, grad2):
+    if j_t.size or j_n.size:  # Handle zero size tensors correctly
+      error = np.maximum(error, np.fabs(j_t - j_n).max())
+  return error
diff --git a/tensorflow/python/ops/gradient_checker_v2_test.py b/tensorflow/python/ops/gradient_checker_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..191b2b6568104b7cf49aa2844f7929284c00d74d
--- /dev/null
+++ b/tensorflow/python/ops/gradient_checker_v2_test.py
@@ -0,0 +1,300 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for compute_gradient.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import custom_gradient
+from tensorflow.python.ops import \
+gradient_checker_v2 as gradient_checker
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+# needs this to register gradient for SoftmaxCrossEntropyWithLogits:
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def _random_complex(shape, dtype):
+  data = np.random.random_sample(shape).astype(dtype.as_numpy_dtype)
+  if dtype.is_complex:
+    data.imag = np.random.random_sample(shape)
+  return data
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class GradientCheckerTest(test.TestCase):
+
+  def testAddSimple(self):
+    size = (2, 3)
+    x1 = constant_op.constant(2.0, shape=size, name="x1")
+    x2 = constant_op.constant(3.0, shape=size, name="x2")
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        lambda x1: math_ops.add(x1, x2), [x1]))
+    tf_logging.info("x1 error = %f", error)
+    assert error < 1e-4
+
+  def testAddCustomized(self):
+    size = (2, 3)
+    x1 = constant_op.constant(
+        2.0, shape=size, dtype=dtypes.float64, name="x1")
+    x2 = np.asarray(np.arange(6, dtype=np.float64).reshape(2, 3))
+    # checkint gradients for x2 using a special delta
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        lambda x2: math_ops.add(x1, x2),
+        [x2], delta=1e-2))
+    tf_logging.info("x2 error = %f", error)
+    assert error < 1e-10
+
+  def testGather(self):
+    def f(params):
+      index_values = [1, 3]
+      indices = constant_op.constant(index_values, name="i")
+      return array_ops.gather(params, indices, name="y")
+    p_shape = (4, 2)
+    p_size = 8
+    params = constant_op.constant(
+        np.arange(p_size).astype(np.float), shape=p_shape, name="p")
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [params]))
+    tf_logging.info("gather error = %f", error)
+    assert error < 1e-4
+
+  def testNestedGather(self):
+    def f(params):
+      index_values = [1, 3, 5, 6]
+      indices = constant_op.constant(index_values, name="i")
+      y = array_ops.gather(params, indices, name="y")
+      index_values2 = [0, 2]
+      indices2 = constant_op.constant(index_values2, name="i2")
+      return array_ops.gather(y, indices2, name="y2")
+    p_shape = (8, 2)
+    p_size = 16
+    params = constant_op.constant(
+        np.arange(p_size).astype(np.float), shape=p_shape, name="p")
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [params]))
+    tf_logging.info("nested gather error = %f", error)
+    assert error < 1e-4
+
+  def testComplexMul(self):
+    c = constant_op.constant(5 + 7j, dtype=dtypes.complex64)
+    def f(x):
+      return c * x
+    x_shape = c.shape
+    x_dtype = c.dtype
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    analytical, numerical = gradient_checker.compute_gradient(
+        f, [x])
+    correct = np.array([[5, 7], [-7, 5]])
+    self.assertAllEqual(correct, analytical[0])
+    self.assertAllClose(correct, numerical[0], rtol=1e-4)
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    self.assertLess(
+        gradient_checker.max_error(*gradient_checker.compute_gradient(
+            f, [x])), 3e-4)
+
+  def testComplexConj(self):
+    def f(x):
+      return math_ops.conj(x)
+    x_shape = ()
+    x_dtype = dtypes.complex64
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    analytical, numerical = gradient_checker.compute_gradient(
+        f, [x])
+    correct = np.array([[1, 0], [0, -1]])
+    self.assertAllEqual(correct, analytical[0])
+    self.assertAllClose(correct, numerical[0], rtol=2e-5)
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    self.assertLess(
+        gradient_checker.max_error(*gradient_checker.compute_gradient(
+            f, [x])), 2e-5)
+
+  def testEmptySucceeds(self):
+    def f(x):
+      return array_ops.identity(x)
+    x = constant_op.constant(np.random.random_sample((0, 3)),
+                             dtype=dtypes.float32)
+    for grad in gradient_checker.compute_gradient(f, [x]):
+      self.assertEqual(grad[0].shape, (0, 0))
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [x]))
+    self.assertEqual(error, 0)
+
+  def testEmptyFails(self):
+    @custom_gradient.custom_gradient
+    def id_bad_grad(x):
+      y = array_ops.identity(x)
+      def grad_fn(dy):
+        # dx = constant_op.constant(np.zeros((1, 4)), dtype=dtypes.float32)
+        dx = array_ops.transpose(dy)
+        return dx
+      return y, grad_fn
+    def f(x):
+      return id_bad_grad(x)
+    x = constant_op.constant(np.random.random_sample((0, 3)),
+                             dtype=dtypes.float32)
+    bad = r"Empty gradient has wrong shape: expected \(0, 3\), got \(3, 0\)"
+    with self.assertRaisesRegexp(ValueError, bad):
+      gradient_checker.compute_gradient(f, [x])
+
+  def testNaNGradFails(self):
+    @custom_gradient.custom_gradient
+    def id_nan_grad(x):
+      y = array_ops.identity(x)
+      def grad_fn(dy):
+        dx = np.nan * dy
+        # dx = dy
+        return dx
+      return y, grad_fn
+    def f(x):
+      return id_nan_grad(x)
+    x = constant_op.constant(np.random.random_sample((1, 1)),
+                             dtype=dtypes.float32)
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [x]))
+    # Typical test would assert error < max_err, so assert this test would
+    # raise AssertionError, since NaN is not < 1.0.
+    with self.assertRaisesRegexp(AssertionError, "False is not true"):
+      self.assertTrue(error < 1.0)
+
+  def testGradGrad(self):
+
+    def f(x):
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = math_ops.square(x)
+        z = math_ops.square(y)
+      return tape.gradient(z, x)
+
+    analytical, numerical = gradient_checker.compute_gradient(f, [2.0])
+    self.assertAllEqual([[[48.]]], analytical)
+    self.assertAllClose([[[48.]]], numerical, rtol=1e-4)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MiniMNISTTest(test.TestCase):
+
+  # Gradient checker for MNIST.
+  def _BuildAndTestMiniMNIST(self, param_index, tag):
+    # Fix seed to avoid occasional flakiness
+    np.random.seed(6)
+
+    # Hyperparameters
+    batch = 3
+    inputs = 16
+    features = 32
+    classes = 10
+
+    # Define the parameters
+    inp_data = np.random.random_sample(inputs * batch)
+    hidden_weight_data = np.random.randn(inputs * features) / np.sqrt(inputs)
+    hidden_bias_data = np.random.random_sample(features)
+    sm_weight_data = np.random.randn(features * classes) / np.sqrt(features)
+    sm_bias_data = np.random.random_sample(classes)
+
+    # special care for labels since they need to be normalized per batch
+    label_data = np.random.random(batch * classes).reshape((batch, classes))
+    s = label_data.sum(axis=1)
+    label_data /= s[:, None]
+
+    # We treat the inputs as "parameters" here
+    inp = constant_op.constant(
+        inp_data.tolist(),
+        shape=[batch, inputs],
+        dtype=dtypes.float64,
+        name="inp")
+    hidden_weight = constant_op.constant(
+        hidden_weight_data.tolist(),
+        shape=[inputs, features],
+        dtype=dtypes.float64,
+        name="hidden_weight")
+    hidden_bias = constant_op.constant(
+        hidden_bias_data.tolist(),
+        shape=[features],
+        dtype=dtypes.float64,
+        name="hidden_bias")
+    softmax_weight = constant_op.constant(
+        sm_weight_data.tolist(),
+        shape=[features, classes],
+        dtype=dtypes.float64,
+        name="softmax_weight")
+    softmax_bias = constant_op.constant(
+        sm_bias_data.tolist(),
+        shape=[classes],
+        dtype=dtypes.float64,
+        name="softmax_bias")
+
+    # List all the parameter so that we can test them one at a time
+    all_params = [
+        inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias
+    ]
+
+    # Now, Building MNIST
+    def f(inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias):
+      features = nn_ops.relu(
+          nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias), name="features")
+      logits = nn_ops.xw_plus_b(
+          features, softmax_weight, softmax_bias, name="logits")
+      labels = constant_op.constant(
+          label_data.tolist(),
+          shape=[batch, classes],
+          dtype=dtypes.float64,
+          name="labels")
+      cost = nn_ops.softmax_cross_entropy_with_logits(
+          labels=labels, logits=logits, name="cost")
+      return cost
+
+    def f_restricted(x):
+      xs = all_params
+      i = param_index
+      # use x for the i-th parameter
+      xs = xs[0:i]+[x]+xs[i+1:]
+      return f(*xs)
+    # Test the gradients.
+    err = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f_restricted, [all_params[param_index]], delta=1e-5))
+
+    tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
+    return err
+
+  def testInputGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(0, "input"), 1e-8)
+
+  def testHiddenWeightGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(1, "hidden_weight"), 1e-8)
+
+  def testHiddenBiasGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(2, "hidden_bias"), 1e-8)
+
+  def testSoftmaxWeightGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(3, "softmax_weight"), 1e-8)
+
+  def testSoftmaxBiasGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(4, "softmax_bias"), 1e-8)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py
index 794465b10e3fd2195e74d7fa3144329140bb111e..cd11447e1f963a62d79855cfd8af42a35e978c79 100644
--- a/tensorflow/python/ops/gradients.py
+++ b/tensorflow/python/ops/gradients.py
@@ -25,5 +25,5 @@ from tensorflow.python.ops.custom_gradient import custom_gradient
 from tensorflow.python.ops.gradients_impl import AggregationMethod
 from tensorflow.python.ops.gradients_impl import gradients
 from tensorflow.python.ops.gradients_impl import hessians
-from tensorflow.python.ops.gradients_impl import UnconnectedGradients
+from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
 # pylint: enable=unused-import
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index aac95037dce306ff5ec25cf3162c78bb789e5a1a..0a70d6ee61e64f94c41c1f1d0a5b6c3610b45c04 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import collections
 import contextlib
-import enum  # pylint: disable=g-bad-import-order
-import sys
 import warnings
 
 import numpy as np
@@ -36,10 +34,10 @@ from tensorflow.python.framework import function as framework_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework.func_graph import FuncGraph
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops  # pylint: disable=unused-import
-from tensorflow.python.ops import cond_v2_impl
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
@@ -51,21 +49,20 @@ from tensorflow.python.ops import logging_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import manip_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import optional_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import random_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import spectral_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
+
 # This is to avoid a circular dependency (eager.function depends on
 # gradients_impl). This is set in eager/function.py.
 _function = None
 
-# This is to avoid a circular dependency with cond_v2_impl.
-cond_v2_impl._gradients_impl = sys.modules[__name__]  # pylint: disable=protected-access
-
 # Warn the user if we convert a sparse representation to dense with at
 # least this number of elements.
 _LARGE_SPARSE_NUM_ELEMENTS = 100000000
@@ -126,7 +123,7 @@ def _MarkReachedOps(from_ops, reached_ops, func_graphs):
   Args:
     from_ops: list of Operations.
     reached_ops: set of Operations.
-    func_graphs: list of _function.FuncGraphs. This method will traverse through
+    func_graphs: list of FuncGraphs. This method will traverse through
       these functions if they capture from_ops or any reachable ops.
   """
   queue = collections.deque()
@@ -151,7 +148,7 @@ def _PendingCount(to_ops, from_ops, colocate_gradients_with_ops, func_graphs,
     to_ops: list of Operations.
     from_ops: list of Operations.
     colocate_gradients_with_ops: Python bool.  See docstring of gradients().
-    func_graphs: list of _function.FuncGraphs. This method will traverse through
+    func_graphs: list of FuncGraphs. This method will traverse through
       these functions if they capture from_ops or any reachable ops. This is
       useful if to_ops occur in a function and from_ops are in an outer function
       or graph.
@@ -267,6 +264,12 @@ def _DefaultGradYs(grad_ys,
               "Gradient type %s generated for variant "
               "tensor %s with type %s must be variant" % (dtypes.as_dtype(
                   grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
+      elif y.dtype == dtypes.resource:
+        # We assume y is the handle of a ResourceVariable. The gradient of a
+        # ResourceVariable should be a numeric value, not another resource.
+        if grad_y.dtype == dtypes.resource:
+          raise TypeError("Input gradient %s for resource tensor %s should not "
+                          "be a resource" % (grad_y, y))
       else:
         raise TypeError(
             "Tensor %s with type %s must be numeric "
@@ -294,18 +297,22 @@ def _DefaultGradYs(grad_ys,
   return new_grad_ys
 
 
-def _IsTrainable(tensor):
-  dtype = dtypes.as_dtype(tensor.dtype)
+def IsTrainable(tensor_or_dtype):
+  if isinstance(tensor_or_dtype, ops.Tensor):
+    dtype = tensor_or_dtype.dtype
+  else:
+    dtype = tensor_or_dtype
+  dtype = dtypes.as_dtype(dtype)
   return dtype.base_dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
                               dtypes.complex64, dtypes.complex128,
-                              dtypes.resource)
+                              dtypes.resource, dtypes.variant)
 
 
 def _IsBackpropagatable(tensor):
-  if _IsTrainable(tensor):
+  if IsTrainable(tensor):
     return True
   dtype = dtypes.as_dtype(tensor.dtype)
-  return dtype.base_dtype in (dtypes.bfloat16, dtypes.variant)
+  return dtype.base_dtype == dtypes.bfloat16
 
 
 def _VerifyGeneratedGradients(grads, op):
@@ -319,6 +326,10 @@ def _VerifyGeneratedGradients(grads, op):
     ValueError: if sizes of gradients and inputs don't match.
     TypeError: if type of any gradient is not valid for its input.
   """
+  # While ops have inputs added to them during the gradient computation, so we
+  # skip the below check. See while_v2 for details.
+  if op.type == "While": return
+
   if len(grads) != len(op.inputs):
     raise ValueError("Num gradients %d generated for op %s do not match num "
                      "inputs %d" % (len(grads), op.node_def, len(op.inputs)))
@@ -453,12 +464,12 @@ def _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs):
 
 
 def _IsFunction(graph):
-  return (isinstance(graph, _function.FuncGraph) or
+  return (isinstance(graph, FuncGraph) or
           isinstance(graph, framework_function._FuncGraph))  # pylint: disable=protected-access
 
 
 def _Captures(func_graph):
-  if isinstance(func_graph, _function.FuncGraph):
+  if isinstance(func_graph, FuncGraph):
     return func_graph.captures
   else:
     assert isinstance(func_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
@@ -472,7 +483,7 @@ def _MaybeCaptured(t):
     t: Tensor
 
   Returns:
-    A tensor, potentially from a different Graph/_function.FuncGraph.
+    A tensor, potentially from a different Graph/FuncGraph.
   """
   # pylint: disable=protected-access
   if (not isinstance(t, ops.EagerTensor) and
@@ -497,9 +508,8 @@ def _NonEagerInputs(op, xs):
     xs: list of Tensors we are differentiating w.r.t.
 
   Returns:
-    A list of tensors. The tensors may be from multiple
-    Graph/_function.FuncGraphs if op is in a _function.FuncGraph and has
-    captured inputs.
+    A list of tensors. The tensors may be from multiple Graph/FuncGraphs if op
+    is in a FuncGraph and has captured inputs.
   """
   if _IsFunction(op.graph):  # pylint: disable=protected-access
     inputs = []
@@ -524,7 +534,7 @@ def _Consumers(t, func_graphs):
 
   Args:
     t: Tensor
-    func_graphs: a list of _function.FuncGraphs that may have captured t.
+    func_graphs: a list of FuncGraphs that may have captured t.
 
   Returns:
     A list of tensors. The tensors will be from the current graph and/or
@@ -538,27 +548,7 @@ def _Consumers(t, func_graphs):
   return consumers
 
 
-@tf_export("UnconnectedGradients")
-class UnconnectedGradients(enum.Enum):
-  """Controls how gradient computation behaves when y does not depend on x.
-
-  The gradient of y with respect to x can be zero in two different ways: there
-  could be no differentiable path in the graph connecting x to y (and so we can
-  statically prove that the gradient is zero) or it could be that runtime values
-  of tensors in a particular execution lead to a gradient of zero (say, if a
-  relu unit happens to not be activated). To allow you to distinguish between
-  these two cases you can choose what value gets returned for the gradient when
-  there is no path in the graph from x to y:
-
-  * `NONE`: Indicates that [None] will be returned if there is no path from x
-    to y
-  * `ZERO`: Indicates that a zero tensor will be returned in the shape of x.
-  """
-  NONE = "none"
-  ZERO = "zero"
-
-
-@tf_export("gradients")
+@tf_export(v1=["gradients"])
 def gradients(ys,
               xs,
               grad_ys=None,
@@ -674,6 +664,119 @@ def gradients(ys,
                             unconnected_gradients)
 
 
+@tf_export("gradients", v1=[])
+def gradients_v2(ys,  # pylint: disable=invalid-name
+                 xs,
+                 grad_ys=None,
+                 name="gradients",
+                 gate_gradients=False,
+                 aggregation_method=None,
+                 stop_gradients=None,
+                 unconnected_gradients=UnconnectedGradients.NONE):
+  """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.
+
+  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
+  is a list of `Tensor`, holding the gradients received by the
+  `ys`. The list must be the same length as `ys`.
+
+  `gradients()` adds ops to the graph to output the derivatives of `ys` with
+  respect to `xs`.  It returns a list of `Tensor` of length `len(xs)` where
+  each tensor is the `sum(dy/dx)` for y in `ys`.
+
+  `grad_ys` is a list of tensors of the same length as `ys` that holds
+  the initial gradients for each y in `ys`.  When `grad_ys` is None,
+  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
+  user can provide their own initial `grad_ys` to compute the
+  derivatives using a different initial gradient for each y (e.g., if
+  one wanted to weight the gradient differently for each value in
+  each y).
+
+  `stop_gradients` is a `Tensor` or a list of tensors to be considered constant
+  with respect to all `xs`. These tensors will not be backpropagated through,
+  as though they had been explicitly disconnected using `stop_gradient`.  Among
+  other things, this allows computation of partial derivatives as opposed to
+  total derivatives. For example:
+
+  ```python
+  a = tf.constant(0.)
+  b = 2 * a
+  g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
+  ```
+
+  Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
+  total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
+  influence of `a` on `b` and evaluate to `[3.0, 1.0]`.  Note that the above is
+  equivalent to:
+
+  ```python
+  a = tf.stop_gradient(tf.constant(0.))
+  b = tf.stop_gradient(2 * a)
+  g = tf.gradients(a + b, [a, b])
+  ```
+
+  `stop_gradients` provides a way of stopping gradient after the graph has
+  already been constructed, as compared to `tf.stop_gradient` which is used
+  during graph construction.  When the two approaches are combined,
+  backpropagation stops at both `tf.stop_gradient` nodes and nodes in
+  `stop_gradients`, whichever is encountered first.
+
+  All integer tensors are considered constant with respect to all `xs`, as if
+  they were included in `stop_gradients`.
+
+  `unconnected_gradients` determines the value returned for each x in xs if it
+  is unconnected in the graph to ys. By default this is None to safeguard
+  against errors. MAthematically these gradients are zero which can be requested
+  using the `'zero'` option. `tf.UnconnectedGradients` provides the
+  following options and behaviors:
+
+  ```python
+  a = tf.ones([1, 2])
+  b = tf.ones([3, 1])
+  g1 = tf.gradients([b], [a], unnconnected_gradients='none')
+  sess.run(g1)  # [None]
+
+  g2 = tf.gradients([b], [a], unconnected_gradients='zero')
+  sess.run(g2)  # [array([[0., 0.]], dtype=float32)]
+  ```
+
+
+  Args:
+    ys: A `Tensor` or list of tensors to be differentiated.
+    xs: A `Tensor` or list of tensors to be used for differentiation.
+    grad_ys: Optional. A `Tensor` or list of tensors the same size as
+      `ys` and holding the gradients computed for each y in `ys`.
+    name: Optional name to use for grouping all the gradient ops together.
+      defaults to 'gradients'.
+    gate_gradients: If True, add a tuple around the gradients returned
+      for an operations.  This avoids some race conditions.
+    aggregation_method: Specifies the method used to combine gradient terms.
+      Accepted values are constants defined in the class `AggregationMethod`.
+    stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate
+      through.
+    unconnected_gradients: Optional. Specifies the gradient value returned when
+      the given input tensors are unconnected. Accepted values are constants
+      defined in the class `tf.UnconnectedGradients` and the default value is
+      `none`.
+
+  Returns:
+    A list of `sum(dy/dx)` for each x in `xs`.
+
+  Raises:
+    LookupError: if one of the operations between `x` and `y` does not
+      have a registered gradient function.
+    ValueError: if the arguments are invalid.
+    RuntimeError: if called in Eager mode.
+
+  """
+  # Creating the gradient graph for control flow mutates Operations.
+  # _mutation_lock ensures a Session.run call cannot occur between creating and
+  # mutating new ops.
+  with ops.get_default_graph()._mutation_lock():  # pylint: disable=protected-access
+    return _GradientsHelper(ys, xs, grad_ys, name, True, gate_gradients,
+                            aggregation_method, stop_gradients,
+                            unconnected_gradients)
+
+
 def _GradientsHelper(ys,
                      xs,
                      grad_ys=None,
@@ -702,7 +805,7 @@ def _GradientsHelper(ys,
   curr_graph = src_graph
   while _IsFunction(curr_graph):
     func_graphs.append(curr_graph)
-    if isinstance(curr_graph, _function.FuncGraph):
+    if isinstance(curr_graph, FuncGraph):
       curr_graph = curr_graph.outer_graph
     else:
       assert isinstance(curr_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
@@ -775,7 +878,7 @@ def _GradientsHelper(ys,
     if loop_state:
       loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
       for y in loop_exits:
-        if _IsTrainable(y):
+        if IsTrainable(y):
           _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
           queue.append(y.op)
 
@@ -800,23 +903,21 @@ def _GradientsHelper(ys,
         # pylint: enable=protected-access
         has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
         if has_out_grads and (op not in stop_ops):
-          if is_func_call:
-            if is_partitioned_call:
-              func_call = src_graph._get_function(  # pylint: disable=protected-access
-                  compat.as_bytes(op.get_attr("f").name))
+          try:
+            grad_fn = ops.get_gradient_function(op)
+          except LookupError:
+            if is_func_call:
+              if is_partitioned_call:
+                func_call = src_graph._get_function(  # pylint: disable=protected-access
+                    compat.as_bytes(op.get_attr("f").name))
+              else:
+                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
+              # Note that __defun is not set if the graph is
+              # imported. If it's set, we prefer to access the original
+              # defun.
+              func_call = getattr(op, "__defun", func_call)
+              grad_fn = func_call.python_grad_func
             else:
-              func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
-            # Note that __defun is not set if the graph is
-            # imported. If it's set, we prefer to access the original
-            # defun.
-            func_call = getattr(op, "__defun", func_call)
-            grad_fn = func_call.python_grad_func
-          else:
-            # A grad_fn must be defined, either as a function or as None
-            # for ops that do not have gradients.
-            try:
-              grad_fn = ops.get_gradient_function(op)
-            except LookupError:
               raise LookupError(
                   "No gradient defined for operation '%s' (op type: %s)" %
                   (op.name, op.type))
@@ -843,7 +944,7 @@ def _GradientsHelper(ys,
           # therefore dC/doutput[i] is 0.
           for i, out_grad in enumerate(out_grads):
             if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
-                (not grad_fn and is_func_call) or _IsTrainable(op.outputs[i])):
+                (not grad_fn and is_func_call) or IsTrainable(op.outputs[i])):
               # Only trainable outputs or outputs for a function call that
               # will use SymbolicGradient get a zero gradient. Gradient
               # functions should ignore the gradient for other outputs.
@@ -916,7 +1017,7 @@ def _HasAnyNotNoneGrads(grads, op):
     if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
       return True
     if out_grad and isinstance(out_grad, collections.Sequence):
-      if any([g is not None for g in out_grad]):
+      if any(g is not None for g in out_grad):
         return True
   return False
 
@@ -948,7 +1049,7 @@ def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
             # For an unused exit, if it has trainable outputs, backprop
             # a zero gradient. Otherwise, just ignore it.
             for y in grad_state.unused_exits:
-              if _IsTrainable(y):
+              if IsTrainable(y):
                 _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
               queue.append(y.op)
           else:
@@ -980,7 +1081,8 @@ def _GetGrad(grads, t, unconnected_gradients):
   op_grads = grads.get(op)
   if not op_grads:
     if unconnected_gradients == UnconnectedGradients.ZERO:
-      return array_ops.zeros_like(t)
+      t_dtype = t.dtype if t.dtype != dtypes.resource else dtypes.float32
+      return array_ops.zeros_like(t, dtype=t_dtype)
     elif unconnected_gradients == UnconnectedGradients.NONE:
       return None
     else:
@@ -1130,11 +1232,11 @@ def _AggregatedGrads(grads,
         assert control_flow_util.IsLoopSwitch(op)
         continue
     # Grads have to be Tensors or IndexedSlices
-    if (isinstance(out_grad, collections.Sequence) and not all([
+    if (isinstance(out_grad, collections.Sequence) and not all(
         isinstance(g, (ops.Tensor, ops.IndexedSlices))
         for g in out_grad
         if g is not None
-    ])):
+    )):
       raise TypeError("gradients have to be either all Tensors "
                       "or all IndexedSlices")
     # Aggregate multiple gradients, and convert [] to None.
@@ -1142,7 +1244,7 @@ def _AggregatedGrads(grads,
       if len(out_grad) < 2:
         used = "nop"
         out_grads[i] = out_grad[0]
-      elif all([isinstance(g, ops.Tensor) for g in out_grad if g is not None]):
+      elif all(isinstance(g, ops.Tensor) for g in out_grad if g is not None):
         tensor_shape = _AccumulatorShape(out_grad)
         if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
             and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
@@ -1259,7 +1361,7 @@ def _hessian_vector_product(ys, xs, v):
   return gradients(elemwise_products, xs)
 
 
-@tf_export("hessians")
+@tf_export(v1=["hessians"])
 def hessians(ys,
              xs,
              name="hessians",
@@ -1324,3 +1426,16 @@ def hessians(ys,
                                           array_ops.concat((_shape, _shape), 0))
     hessians.append(_reshaped_hessian)
   return hessians
+
+
+@tf_export("hessians", v1=[])
+def HessiansV2(ys,
+               xs,
+               gate_gradients=False,
+               aggregation_method=None,
+               name="hessians"):
+  return hessians(ys, xs, name=name, gate_gradients=gate_gradients,
+                  aggregation_method=aggregation_method)
+
+
+HessiansV2.__doc__ = hessians.__doc__
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index c93e2493ee7eb1dfa5240297c759524cd20a23b0..abdcbc7a3ac3b2e6d42bacf4ae454e277220f497 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -144,7 +144,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
                                  gate_gradients=True)[0]
       with session.Session():
         # Make sure the placer doesn't complain.
-        gz_x.eval()
+        self.evaluate(gz_x)
 
   def testBoundaryStop(self):
     # Test that we don't differentiate 'x'. The gradient function for 'x' is
@@ -158,6 +158,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       grads = gradients.gradients(z, [x])
       self.assertTrue(all(x is not None for x in grads))
 
+  @test_util.run_v1_only("b/120545219")
   def testBoundaryContinue(self):
     # Test that we differentiate both 'x' and 'y' correctly when x is a
     # predecessor of y.
@@ -169,6 +170,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertTrue(all(x is not None for x in grads))
       self.assertEqual(6.0, grads[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAggregationMethodAccumulateN(self):
     with self.cached_session():
       x = constant(1.0)
@@ -182,6 +184,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(20.0, grads[0].eval())
       self.assertEqual(10.0, grads[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAggregationMethodAddN(self):
     with self.cached_session():
       x = constant(1.0)
@@ -193,6 +196,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(20.0, grads[0].eval())
       self.assertEqual(10.0, grads[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAggregationMethodTree(self):
     with self.cached_session():
       x = constant(1.0)
@@ -239,6 +243,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
             [dx, dy], feed_dict={x: [1.0], dy.indices: [0], dy.values: [2.0]})
       self.assertEqual(vdx, vdy)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonDifferentiableSwitchInWhileLoop(self):
     with ops.Graph().as_default():
       v = array_ops.placeholder(dtypes.float32, [])
@@ -270,6 +275,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       gradient = gradients.gradients(graph.as_graph_element(var), var)
       self.assertIsNotNone(gradient)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableRefGradient(self):
     with ops.Graph().as_default():
       init = constant_op.constant(100.0)
@@ -277,6 +283,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       gradient = gradients.gradients(var._ref(), var)
       self.assertIsNotNone(gradient)
 
+  @test_util.run_v1_only("b/120545219")
   def testDependentYs(self):
     with self.cached_session():
       x = constant_op.constant(3.0)
@@ -292,6 +299,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       g = gradients.gradients([z, z2], x)
       self.assertAllClose(17502.0, g[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testPartialDerivatives(self):
     with self.cached_session():
       x = constant_op.constant(1.)
@@ -302,6 +310,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       partialg = gradients.gradients(z, [x, y], stop_gradients=[x, y])
       self.assertEqual([1.0, 1.0], [g.eval() for g in partialg])
 
+  @test_util.run_v1_only("b/120545219")
   def testStopGradients(self):
     def _MakeGraph(rng, stop_gradients=()):
       def _FunctionOf(xs, k=3):
@@ -365,7 +374,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       grads = gradients.gradients(
           [y], [x], unconnected_gradients="zero")
       with self.cached_session() as sess:
-        self.assertAllEqual([[0.0, 0.0], [0.0, 0.0]], sess.run(grads)[0])
+        self.assertAllEqual([[0.0, 0.0], [0.0, 0.0]], self.evaluate(grads)[0])
 
   def testUnconnectedGradientsZeroConnectedGradients(self):
     with ops.Graph().as_default():
@@ -374,7 +383,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       grad = gradients.gradients(
           [y], [x], unconnected_gradients="zero")
       with self.cached_session() as sess:
-        self.assertEquals(3.0, sess.run(grad)[0])
+        self.assertEquals(3.0, self.evaluate(grad)[0])
 
   def testUnknownUnconnectedGradientsValueGiven(self):
     with ops.Graph().as_default():
@@ -438,8 +447,8 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
       grads = gradients.gradients(y, [x, b1])
 
       with self.cached_session() as sess:
-        self.assertAllEqual([40.0], sess.run(grads)[0])
-        self.assertAllEqual([10.0], sess.run(grads)[1])
+        self.assertAllEqual([40.0], self.evaluate(grads)[0])
+        self.assertAllEqual([10.0], self.evaluate(grads)[1])
 
   def testFunctionGradientsWithGradFunc(self):
     g = ops.Graph()
@@ -487,7 +496,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       f = Foo()
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(f), 2.0)
+        self.assertEqual(self.evaluate(f), 2.0)
 
   def testGradientOfCaptured(self):
     with ops.Graph().as_default():
@@ -501,7 +510,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       f = Foo()
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(f), 2.0)
+        self.assertEqual(self.evaluate(f), 2.0)
 
   def testCapturedResourceVariable(self):
     with ops.Graph().as_default():
@@ -515,8 +524,8 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       f = Foo()
       with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        self.assertEqual(sess.run(f), 2.0)
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(self.evaluate(f), 2.0)
 
   def testCapturedNested(self):
     with ops.Graph().as_default():
@@ -541,9 +550,9 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
       x1_grad, x2_grad = Outer()
       with self.cached_session() as sess:
         # 1.0 + None + 2.0 + 1.0 = 4.0
-        self.assertEqual(sess.run(x1_grad), 4.0)
+        self.assertEqual(self.evaluate(x1_grad), 4.0)
         # None + 1.0 + 1.0 + None = 2.0
-        self.assertEqual(sess.run(x2_grad), 2.0)
+        self.assertEqual(self.evaluate(x2_grad), 2.0)
 
   def testCapturedFromFunction(self):
     with ops.Graph().as_default():
@@ -563,7 +572,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       z_grad = Outer()
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(z_grad), 3.0)
+        self.assertEqual(self.evaluate(z_grad), 3.0)
 
   def testCapturedEagerTensors(self):
     # Test that we can handle captured eager tensors unrelated to the gradient
@@ -606,6 +615,7 @@ class PreventGradientTest(test_util.TensorFlowTestCase):
 
 class HessianVectorProductTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testHessianVectorProduct(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that HessianVectorProduct matches multiplication by the
@@ -621,19 +631,20 @@ class HessianVectorProductTest(test_util.TensorFlowTestCase):
     hess_value = mat_value + mat_value.T
     hess_v_value = np.dot(hess_value, v_value)
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         mat = constant_op.constant(mat_value)
         v = constant_op.constant(v_value)
         x = constant_op.constant(x_value)
         mat_x = math_ops.matmul(mat, x, name="Ax")
         x_mat_x = math_ops.matmul(array_ops.transpose(x), mat_x, name="xAx")
         hess_v = gradients_impl._hessian_vector_product(x_mat_x, [x], [v])[0]
-        hess_v_actual = hess_v.eval()
+        hess_v_actual = self.evaluate(hess_v)
       self.assertAllClose(hess_v_value, hess_v_actual)
 
 
 class HessianTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian1D(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that `hessian` matches. Specifically, the Hessian of
@@ -643,14 +654,15 @@ class HessianTest(test_util.TensorFlowTestCase):
     mat_value = rng.randn(m, m).astype("float32")
     x_value = rng.randn(m).astype("float32")
     hess_value = mat_value + mat_value.T
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       mat = constant_op.constant(mat_value)
       x = constant_op.constant(x_value)
       x_mat_x = math_ops.reduce_sum(x[:, None] * mat * x[None, :])
       hess = gradients.hessians(x_mat_x, x)[0]
-      hess_actual = hess.eval()
+      hess_actual = self.evaluate(hess)
     self.assertAllClose(hess_value, hess_actual)
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian1D_multi(self):
     # Test the computation of the hessian with respect to multiple tensors
     m = 4
@@ -659,7 +671,7 @@ class HessianTest(test_util.TensorFlowTestCase):
     mat_values = [rng.randn(m, m).astype("float32") for _ in range(n)]
     x_values = [rng.randn(m).astype("float32") for _ in range(n)]
     hess_values = [mat_value + mat_value.T for mat_value in mat_values]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       mats = [constant_op.constant(mat_value) for mat_value in mat_values]
       xs = [constant_op.constant(x_value) for x_value in x_values]
       xs_mats_xs = [
@@ -671,14 +683,16 @@ class HessianTest(test_util.TensorFlowTestCase):
     for hess_value, hess_actual in zip(hess_values, hessians_actual):
       self.assertAllClose(hess_value, hess_actual)
 
+  @test_util.run_v1_only("b/120545219")
   def testHessianInvalidDimension(self):
     for shape in [(10, 10), None]:
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x = array_ops.placeholder(dtypes.float32, shape)
         # Expect a ValueError because the dimensions are wrong
         with self.assertRaises(ValueError):
           gradients.hessians(x, x)
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian2D_square_matrix(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that `hessian` matches. Specifically, the Hessian of
@@ -686,13 +700,13 @@ class HessianTest(test_util.TensorFlowTestCase):
     m = 3
     rng = np.random.RandomState([1, 2, 3])
     x_value = rng.randn(m, m).astype("float32")
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant(x_value)
       x_square = math_ops.reduce_sum(
           math_ops.matmul(array_ops.transpose(x), x) * 0.5
       )
       hess = gradients.hessians(x_square, x)[0]
-      hess_actual = hess.eval()
+      hess_actual = self.evaluate(hess)
     hess_value = np.bmat([
         [elem*np.ones((m, m)) for elem in vec]
         for vec in np.eye(m)
@@ -700,18 +714,19 @@ class HessianTest(test_util.TensorFlowTestCase):
     self.assertAllEqual((m, m, m, m), hess_actual.shape)
     self.assertAllClose(hess_value, hess_actual.reshape((m * m, m * m)))
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian2D_non_square_matrix(self):
     m = 3
     n = 4
     rng = np.random.RandomState([1, 2, 3])
     x_value = rng.randn(m, n).astype("float32")
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant(x_value)
       x_square = math_ops.reduce_sum(
           math_ops.matmul(array_ops.transpose(x), x) * 0.5
       )
       hess = gradients.hessians(x_square, x)[0]
-      hess_actual = hess.eval()
+      hess_actual = self.evaluate(hess)
     hess_value = np.bmat([
         [elem*np.ones((n, n)) for elem in vec]
         for vec in np.eye(m)
@@ -722,6 +737,7 @@ class HessianTest(test_util.TensorFlowTestCase):
 
 class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesToTensor(self):
     with self.cached_session():
       np_val = np.random.rand(4, 4, 4, 4).astype(np.float32)
@@ -729,8 +745,9 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
       c_sparse = math_ops._as_indexed_slices(c)
       self.assertAllEqual(np_val.shape, c_sparse.dense_shape.eval())
       c_dense = math_ops.multiply(c_sparse, 1.0)
-      self.assertAllClose(np_val, c_dense.eval())
+      self.assertAllClose(np_val, self.evaluate(c_dense))
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesToTensorList(self):
     with self.cached_session():
       numpy_list = []
@@ -745,8 +762,9 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
         sparse_list.append(c_sparse)
       packed_dense = array_ops.stack(dense_list)
       packed_sparse = array_ops.stack(sparse_list)
-      self.assertAllClose(packed_dense.eval(), packed_sparse.eval())
+      self.assertAllClose(packed_dense.eval(), self.evaluate(packed_sparse))
 
+  @test_util.run_v1_only("b/120545219")
   def testInt64Indices(self):
     with self.cached_session():
       np_val = np.random.rand(4, 4, 4, 4).astype(np.float32)
@@ -757,8 +775,9 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
           math_ops.cast(c_sparse.indices, dtypes.int64), c_sparse.dense_shape)
       self.assertAllEqual(np_val.shape, c_sparse.dense_shape.eval())
       c_dense = math_ops.multiply(c_sparse, 1.0)
-      self.assertAllClose(np_val, c_dense.eval())
+      self.assertAllClose(np_val, self.evaluate(c_dense))
 
+  @test_util.run_v1_only("b/120545219")
   def testWarnings(self):
     # TODO(gunan) Reenable after this issue is fixed:
     # https://github.com/google/protobuf/issues/2812
@@ -802,6 +821,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
 
 class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRealOnly(self):
     x = constant_op.constant(7+3j, dtype=dtypes.complex64)
     y = math_ops.square(x)
@@ -814,6 +834,7 @@ class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
 
 class ResourceCondTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     gamma = resource_variable_ops.ResourceVariable(
         np.random.random((3,)),
@@ -853,7 +874,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       y = MyIdentity(MyIdentity(x))
       dy = gradients.gradients(y, x)[0]
       with session.Session():
-        self.assertEqual(9., dy.eval())
+        self.assertEqual(9., self.evaluate(dy))
 
   def testCustomGradient(self):
 
@@ -873,7 +894,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       y = MyMultiply(x1, x2)
       dy = gradients.gradients(y, [x1, x2])
       with session.Session() as sess:
-        self.assertAllEqual([3., 5.], sess.run(dy))
+        self.assertAllEqual([3., 5.], self.evaluate(dy))
 
   def testCustomGradientErrors(self):
 
@@ -914,7 +935,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       for g in grads:
         self.assertTrue(g is not None)
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         dw = sess.run(math_ops.reduce_sum(grads[1]))
         self.assertEqual(12., dw)
 
@@ -943,6 +964,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       self.assertEqual(6., math_ops.reduce_sum(dx).numpy())
       self.assertEqual(8., math_ops.reduce_sum(dw).numpy())
 
+  @test_util.run_v1_only("b/120545219")
   def testCustomGradientErrorsWithNonResourceVariables(self):
 
     def F(x, use_resource=False):
@@ -993,6 +1015,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       # Smoke test to ensure numpy inputs are accepted
       F(x)
 
+  @test_util.run_v1_only("b/120545219")
   def testRVGradientsDynamicCond(self):
     with self.cached_session():
       alpha = resource_variable_ops.ResourceVariable(
@@ -1074,7 +1097,7 @@ class TensorListGradientsTest(test_util.TensorFlowTestCase):
 
       grad = gradients.gradients(tl, a, grad_ys=grad_tl)[0]
       with self.cached_session() as sess:
-        self.assertEquals(sess.run(grad), 5.)
+        self.assertEquals(self.evaluate(grad), 5.)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/ops/histogram_ops_test.py b/tensorflow/python/ops/histogram_ops_test.py
index 1ba805dbb4469c0d23e783a01f3184906c88209a..b48ef67196bd7d1d56f51b61bc0b28ca2054d28d 100644
--- a/tensorflow/python/ops/histogram_ops_test.py
+++ b/tensorflow/python/ops/histogram_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import histogram_ops
@@ -39,7 +40,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
   def test_1d_values_int32_output(self):
     # Bins will be:
@@ -51,7 +52,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5, dtype=dtypes.int64)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
   def test_1d_float64_values_int32_output(self):
     # Bins will be:
@@ -63,7 +64,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
   def test_2d_values(self):
     # Bins will be:
@@ -76,7 +77,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
 
 class HistogramFixedWidthTest(test.TestCase):
@@ -84,6 +85,7 @@ class HistogramFixedWidthTest(test.TestCase):
   def setUp(self):
     self.rng = np.random.RandomState(0)
 
+  @test_util.run_deprecated_v1
   def test_with_invalid_value_range(self):
     values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
     with self.assertRaisesRegexp(
@@ -92,6 +94,7 @@ class HistogramFixedWidthTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Dimension must be 2 but is 3"):
       histogram_ops.histogram_fixed_width(values, [1.0, 2.0, 3.0])
 
+  @test_util.run_deprecated_v1
   def test_with_invalid_nbins(self):
     values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
     with self.assertRaisesRegexp(
@@ -107,10 +110,10 @@ class HistogramFixedWidthTest(test.TestCase):
     value_range = [0.0, 5.0]
     values = []
     expected_bin_counts = [0, 0, 0, 0, 0]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_1d_values_int64_output(self):
     # Bins will be:
@@ -118,11 +121,11 @@ class HistogramFixedWidthTest(test.TestCase):
     value_range = [0.0, 5.0]
     values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
     expected_bin_counts = [2, 1, 1, 0, 2]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(
           values, value_range, nbins=5, dtype=dtypes.int64)
       self.assertEqual(dtypes.int64, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_1d_float64_values(self):
     # Bins will be:
@@ -130,10 +133,10 @@ class HistogramFixedWidthTest(test.TestCase):
     value_range = np.float64([0.0, 5.0])
     values = np.float64([-1.0, 0.0, 1.5, 2.0, 5.0, 15])
     expected_bin_counts = [2, 1, 1, 0, 2]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_2d_values(self):
     # Bins will be:
@@ -141,26 +144,27 @@ class HistogramFixedWidthTest(test.TestCase):
     value_range = [0.0, 5.0]
     values = [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]]
     expected_bin_counts = [2, 1, 1, 0, 2]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
+  @test_util.run_deprecated_v1
   def test_shape_inference(self):
     value_range = [0.0, 5.0]
     values = [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]]
     expected_bin_counts = [2, 1, 1, 0, 2]
     placeholder = array_ops.placeholder(dtypes.int32)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertAllEqual(hist.shape.as_list(), (5,))
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
       hist = histogram_ops.histogram_fixed_width(
           values, value_range, nbins=placeholder)
       self.assertEquals(hist.shape.ndims, 1)
-      self.assertIs(hist.shape[0].value, None)
+      self.assertIs(hist.shape.dims[0].value, None)
       self.assertEqual(dtypes.int32, hist.dtype)
       self.assertAllClose(expected_bin_counts, hist.eval({placeholder: 5}))
 
diff --git a/tensorflow/python/ops/image_grad_test.py b/tensorflow/python/ops/image_grad_test.py
index fddde75f6b646461bc382bf2d985690d5033f47e..c481266dd71c1300612dbc384d240d34b98b3599 100644
--- a/tensorflow/python/ops/image_grad_test.py
+++ b/tensorflow/python/ops/image_grad_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import image_ops
@@ -38,15 +39,16 @@ class ResizeNearestNeighborOpTest(test.TestCase):
     for nptype in self.TYPES:
       x = np.arange(0, 4).reshape(in_shape).astype(nptype)
 
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         input_tensor = constant_op.constant(x, shape=in_shape)
         resize_out = image_ops.resize_nearest_neighbor(input_tensor,
                                                        out_shape[1:3])
         self.assertEqual(out_shape, list(resize_out.get_shape()))
 
-        resize_out = sess.run(resize_out)
+        resize_out = self.evaluate(resize_out)
       self.assertEqual(out_shape, list(resize_out.shape))
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToLargerInBothDims(self):
     in_shape = [1, 2, 3, 1]
     out_shape = [1, 4, 6, 1]
@@ -54,7 +56,7 @@ class ResizeNearestNeighborOpTest(test.TestCase):
     for nptype in self.TYPES:
       x = np.arange(0, 6).reshape(in_shape).astype(nptype)
 
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         input_tensor = constant_op.constant(x, shape=in_shape)
         resize_out = image_ops.resize_nearest_neighbor(input_tensor,
                                                        out_shape[1:3])
@@ -62,6 +64,7 @@ class ResizeNearestNeighborOpTest(test.TestCase):
             input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToSmallerInBothDims(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -69,7 +72,7 @@ class ResizeNearestNeighborOpTest(test.TestCase):
     for nptype in self.TYPES:
       x = np.arange(0, 24).reshape(in_shape).astype(nptype)
 
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         input_tensor = constant_op.constant(x, shape=in_shape)
         resize_out = image_ops.resize_nearest_neighbor(input_tensor,
                                                        out_shape[1:3])
@@ -77,6 +80,7 @@ class ResizeNearestNeighborOpTest(test.TestCase):
             input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testCompareGpuVsCpu(self):
     in_shape = [1, 4, 6, 3]
     out_shape = [1, 8, 16, 3]
@@ -84,14 +88,14 @@ class ResizeNearestNeighborOpTest(test.TestCase):
     for nptype in self.TYPES:
       x = np.arange(0, np.prod(in_shape)).reshape(in_shape).astype(nptype)
       for align_corners in [True, False]:
-        with self.test_session(use_gpu=False):
+        with self.cached_session(use_gpu=False):
           input_tensor = constant_op.constant(x, shape=in_shape)
           resize_out = image_ops.resize_nearest_neighbor(
               input_tensor, out_shape[1:3], align_corners=align_corners)
           grad_cpu = gradient_checker.compute_gradient(
               input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
 
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           input_tensor = constant_op.constant(x, shape=in_shape)
           resize_out = image_ops.resize_nearest_neighbor(
               input_tensor, out_shape[1:3], align_corners=align_corners)
@@ -113,9 +117,10 @@ class ResizeBilinearOpTest(test.TestCase):
       resize_out = image_ops.resize_bilinear(input_tensor, out_shape[1:3])
       self.assertEqual(out_shape, list(resize_out.get_shape()))
 
-      resize_out = sess.run(resize_out)
+      resize_out = self.evaluate(resize_out)
       self.assertEqual(out_shape, list(resize_out.shape))
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToLargerInBothDims(self):
     in_shape = [1, 2, 3, 1]
     out_shape = [1, 4, 6, 1]
@@ -129,6 +134,7 @@ class ResizeBilinearOpTest(test.TestCase):
           input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToSmallerInBothDims(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -142,6 +148,7 @@ class ResizeBilinearOpTest(test.TestCase):
           input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testCompareGpuVsCpu(self):
     in_shape = [2, 4, 6, 3]
     out_shape = [2, 8, 16, 3]
@@ -151,7 +158,7 @@ class ResizeBilinearOpTest(test.TestCase):
     for align_corners in [True, False]:
       grad = {}
       for use_gpu in [False, True]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           input_tensor = constant_op.constant(x, shape=in_shape)
           resized_tensor = image_ops.resize_bilinear(
               input_tensor, out_shape[1:3], align_corners=align_corners)
@@ -160,6 +167,7 @@ class ResizeBilinearOpTest(test.TestCase):
 
       self.assertAllClose(grad[False], grad[True], rtol=1e-4, atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testTypes(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -196,9 +204,10 @@ class ResizeBicubicOpTest(test.TestCase):
                                               align_corners=align_corners)
         self.assertEqual(out_shape, list(resize_out.get_shape()))
 
-        resize_out = sess.run(resize_out)
+        resize_out = self.evaluate(resize_out)
         self.assertEqual(out_shape, list(resize_out.shape))
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToLargerInBothDims(self):
     in_shape = [1, 2, 3, 1]
     out_shape = [1, 4, 6, 1]
@@ -214,6 +223,7 @@ class ResizeBicubicOpTest(test.TestCase):
             input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToSmallerInBothDims(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -229,6 +239,7 @@ class ResizeBicubicOpTest(test.TestCase):
             input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradOnUnsupportedType(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -262,7 +273,7 @@ class CropAndResizeOpTest(test.TestCase):
     boxes = np.array([[0, 0, 1, 1], [.1, .2, .7, .8]], dtype=np.float32)
     box_ind = np.array([0, 1], dtype=np.int32)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.session(use_gpu=True) as sess:
       crops = image_ops.crop_and_resize(
           constant_op.constant(
               image, shape=image_shape),
@@ -273,7 +284,7 @@ class CropAndResizeOpTest(test.TestCase):
           constant_op.constant(
               crop_size, shape=[2]))
       self.assertEqual(crops_shape, list(crops.get_shape()))
-      crops = sess.run(crops)
+      crops = self.evaluate(crops)
       self.assertEqual(crops_shape, list(crops.shape))
 
   def _randomUniformAvoidAnchors(self, low, high, anchors, radius, num_samples):
@@ -306,6 +317,7 @@ class CropAndResizeOpTest(test.TestCase):
         samples.append(sample)
     return samples
 
+  @test_util.run_deprecated_v1
   def testGradRandomBoxes(self):
     """Test that the gradient is correct for randomly generated boxes.
 
@@ -351,7 +363,7 @@ class CropAndResizeOpTest(test.TestCase):
               boxes = np.array(boxes, dtype=np.float32)
               box_ind = np.arange(batch, dtype=np.int32)
 
-              with self.test_session(use_gpu=True):
+              with self.cached_session(use_gpu=True):
                 image_tensor = constant_op.constant(image, shape=image_shape)
                 boxes_tensor = constant_op.constant(boxes, shape=[num_boxes, 4])
                 box_ind_tensor = constant_op.constant(
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 1c75aab5787ca193e788b9a52db111805fe4cc60..24d049b726fb93401d916d60c0d37fe85de30719 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -24,6 +24,7 @@ from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -37,6 +38,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 ops.NotDifferentiable('RandomCrop')
@@ -511,15 +513,20 @@ def _rot90_4D(images, k, name_scope):
   result.set_shape([shape[0], None, None, shape[3]])
   return result
 
-@tf_export('image.transpose_image')
+
+@tf_export(v1=['image.transpose', 'image.transpose_image'])
 def transpose_image(image):
-  """Transpose image(s) by swapping the height and width dimension.
+  return transpose(image=image, name=None)
 
-  See also `transpose()`.
+
+@tf_export('image.transpose', v1=[])
+def transpose(image, name=None):
+  """Transpose image(s) by swapping the height and width dimension.
 
   Args:
     image: 4-D Tensor of shape `[batch, height, width, channels]` or
            3-D Tensor of shape `[height, width, channels]`.
+    name: A name for this operation (optional).
 
   Returns:
     If `image` was 4-D, a 4-D float Tensor of shape
@@ -530,14 +537,14 @@ def transpose_image(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'transpose_image', [image]):
+  with ops.name_scope(name, 'transpose', [image]):
     image = ops.convert_to_tensor(image, name='image')
     image = _AssertAtLeast3DImage(image)
     shape = image.get_shape()
     if shape.ndims == 3 or shape.ndims is None:
-      return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
+      return array_ops.transpose(image, [1, 0, 2], name=name)
     elif shape.ndims == 4:
-      return array_ops.transpose(image, [0, 2, 1, 3], name='transpose_image')
+      return array_ops.transpose(image, [0, 2, 1, 3], name=name)
     else:
       raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
@@ -587,7 +594,7 @@ def central_crop(image, central_fraction):
     # Helper method to return the `idx`-th dimension of `tensor`, along with
     # a boolean signifying if the dimension is dynamic.
     def _get_dim(tensor, idx):
-      static_shape = tensor.get_shape()[idx].value
+      static_shape = tensor.get_shape().dims[idx].value
       if static_shape is not None:
         return static_shape, False
       return array_ops.shape(tensor)[idx], True
@@ -938,12 +945,28 @@ class ResizeMethod(object):
   AREA = 3
 
 
-@tf_export('image.resize_images')
+@tf_export(v1=['image.resize_images', 'image.resize'])
 def resize_images(images,
                   size,
                   method=ResizeMethod.BILINEAR,
                   align_corners=False,
                   preserve_aspect_ratio=False):
+  return resize_images_v2(
+      images=images,
+      size=size,
+      method=method,
+      align_corners=align_corners,
+      preserve_aspect_ratio=preserve_aspect_ratio,
+      name=None)
+
+
+@tf_export('image.resize', v1=[])
+def resize_images_v2(images,
+                     size,
+                     method=ResizeMethod.BILINEAR,
+                     align_corners=False,
+                     preserve_aspect_ratio=False,
+                     name=None):
   """Resize `images` to `size` using the specified `method`.
 
   Resized images will be distorted if their original aspect ratio is not
@@ -979,6 +1002,7 @@ def resize_images(images,
       then `images` will be resized to a size that fits in `size` while
       preserving the aspect ratio of the original image. Scales up the image if
       `size` is bigger than the current size of the `image`. Defaults to False.
+    name: A name for this operation (optional).
 
   Raises:
     ValueError: if the shape of `images` is incompatible with the
@@ -992,7 +1016,7 @@ def resize_images(images,
     If `images` was 3-D, a 3-D float Tensor of shape
     `[new_height, new_width, channels]`.
   """
-  with ops.name_scope(None, 'resize_images', [images, size]):
+  with ops.name_scope(name, 'resize', [images, size]):
     images = ops.convert_to_tensor(images, name='images')
     if images.get_shape().ndims is None:
       raise ValueError('\'images\' contains no shape.')
@@ -1014,8 +1038,8 @@ def resize_images(images,
       raise ValueError('\'size\' must be a 1-D Tensor of 2 elements: '
                        'new_height, new_width')
     size_const_as_shape = tensor_util.constant_value_as_shape(size)
-    new_height_const = size_const_as_shape[0].value
-    new_width_const = size_const_as_shape[1].value
+    new_height_const = size_const_as_shape.dims[0].value
+    new_width_const = size_const_as_shape.dims[1].value
 
     if preserve_aspect_ratio:
       # Get the current shapes of the image, even if dynamic.
@@ -1036,8 +1060,8 @@ def resize_images(images,
       size = ops.convert_to_tensor([scaled_height_const, scaled_width_const],
                                    dtypes.int32, name='size')
       size_const_as_shape = tensor_util.constant_value_as_shape(size)
-      new_height_const = size_const_as_shape[0].value
-      new_width_const = size_const_as_shape[1].value
+      new_height_const = size_const_as_shape.dims[0].value
+      new_width_const = size_const_as_shape.dims[1].value
 
     # If we can determine that the height and width will be unmodified by this
     # transformation, we avoid performing the resize.
@@ -1184,7 +1208,8 @@ def per_image_standardization(image):
   away from zero to protect against division by 0 when handling uniform images.
 
   Args:
-    image: 3-D tensor of shape `[height, width, channels]`.
+    image: An n-D Tensor where the last 3 dimensions are
+           `[height, width, channels]`.
 
   Returns:
     The standardized image with same shape as `image`.
@@ -1194,14 +1219,15 @@ def per_image_standardization(image):
   """
   with ops.name_scope(None, 'per_image_standardization', [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = _Assert3DImage(image)
-    num_pixels = math_ops.reduce_prod(array_ops.shape(image))
+    image = _AssertAtLeast3DImage(image)
+    num_pixels = math_ops.reduce_prod(array_ops.shape(image)[-3:])
 
     image = math_ops.cast(image, dtype=dtypes.float32)
-    image_mean = math_ops.reduce_mean(image)
+    image_mean = math_ops.reduce_mean(image, axis=[-1, -2, -3], keepdims=True)
 
     variance = (
-        math_ops.reduce_mean(math_ops.square(image)) -
+        math_ops.reduce_mean(
+            math_ops.square(image), axis=[-1, -2, -3], keepdims=True) -
         math_ops.square(image_mean))
     variance = gen_nn_ops.relu(variance)
     stddev = math_ops.sqrt(variance)
@@ -1734,7 +1760,7 @@ def adjust_saturation(image, saturation_factor, name=None):
         orig_dtype)
 
 
-@tf_export('image.is_jpeg')
+@tf_export('io.is_jpeg', 'image.is_jpeg', v1=['io.is_jpeg', 'image.is_jpeg'])
 def is_jpeg(contents, name=None):
   r"""Convenience function to check if the 'contents' encodes a JPEG image.
 
@@ -1769,8 +1795,28 @@ def _is_png(contents, name=None):
     substr = string_ops.substr(contents, 0, 3)
     return math_ops.equal(substr, b'\211PN', name=name)
 
+tf_export('io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg',
+          v1=['io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg'])(
+              gen_image_ops.decode_and_crop_jpeg)
 
-@tf_export('image.decode_image')
+tf_export('io.decode_bmp', 'image.decode_bmp',
+          v1=['io.decode_bmp', 'image.decode_bmp'])(gen_image_ops.decode_bmp)
+tf_export('io.decode_gif', 'image.decode_gif',
+          v1=['io.decode_gif', 'image.decode_gif'])(gen_image_ops.decode_gif)
+tf_export('io.decode_jpeg', 'image.decode_jpeg',
+          v1=['io.decode_jpeg', 'image.decode_jpeg'])(gen_image_ops.decode_jpeg)
+tf_export('io.decode_png', 'image.decode_png',
+          v1=['io.decode_png', 'image.decode_png'])(gen_image_ops.decode_png)
+
+tf_export('io.encode_jpeg', 'image.encode_jpeg',
+          v1=['io.encode_jpeg', 'image.encode_jpeg'])(gen_image_ops.encode_jpeg)
+tf_export('io.extract_jpeg_shape', 'image.extract_jpeg_shape',
+          v1=['io.extract_jpeg_shape', 'image.extract_jpeg_shape'])(
+              gen_image_ops.extract_jpeg_shape)
+
+
+@tf_export('io.decode_image', 'image.decode_image',
+           v1=['io.decode_image', 'image.decode_image'])
 def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
   """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`,
   and `decode_png`.
@@ -1940,7 +1986,113 @@ def total_variation(images, name=None):
   return tot_var
 
 
-@tf_export('image.sample_distorted_bounding_box')
+@tf_export('image.sample_distorted_bounding_box', v1=[])
+def sample_distorted_bounding_box_v2(image_size,
+                                     bounding_boxes,
+                                     seed=0,
+                                     min_object_covered=0.1,
+                                     aspect_ratio_range=None,
+                                     area_range=None,
+                                     max_attempts=None,
+                                     use_image_if_no_bounding_boxes=None,
+                                     name=None):
+  """Generate a single randomly distorted bounding box for an image.
+
+  Bounding box annotations are often supplied in addition to ground-truth labels
+  in image recognition or object localization tasks. A common technique for
+  training such a system is to randomly distort an image while preserving
+  its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+  localization of an object, i.e. bounding box, given an `image_size`,
+  `bounding_boxes` and a series of constraints.
+
+  The output of this Op is a single bounding box that may be used to crop the
+  original image. The output is returned as 3 tensors: `begin`, `size` and
+  `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+  image. The latter may be supplied to `tf.image.draw_bounding_boxes` to
+  visualize what the bounding box looks like.
+
+  Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`.
+  The bounding box coordinates are floats in `[0.0, 1.0]` relative to the width
+  and height of the underlying image.
+
+  For example,
+
+  ```python
+      # Generate a single distorted bounding box.
+      begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+          tf.shape(image),
+          bounding_boxes=bounding_boxes,
+          min_object_covered=0.1)
+
+      # Draw the bounding box in an image summary.
+      image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+                                                    bbox_for_draw)
+      tf.summary.image('images_with_box', image_with_box)
+
+      # Employ the bounding box to distort the image.
+      distorted_image = tf.slice(image, begin, size)
+  ```
+
+  Note that if no bounding box information is available, setting
+  `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+  bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+  false and no bounding boxes are supplied, an error is raised.
+
+  Args:
+    image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
+      `int16`, `int32`, `int64`.
+      1-D, containing `[height, width, channels]`.
+    bounding_boxes: A `Tensor` of type `float32`.
+      3-D with shape `[batch, N, 4]` describing the N bounding boxes
+      associated with the image.
+    seed: An optional `int`. Defaults to `0`.
+      If `seed` is set to non-zero, the random number generator is seeded by
+      the given `seed`.  Otherwise, it is seeded by a random seed.
+    min_object_covered: A Tensor of type `float32`. Defaults to `0.1`.
+      The cropped area of the image must contain at least this
+      fraction of any bounding box supplied. The value of this parameter should
+      be non-negative. In the case of 0, the cropped area does not need to
+      overlap any of the bounding boxes supplied.
+    aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75,
+      1.33]`.
+      The cropped area of the image must have an aspect `ratio =
+      width / height` within this range.
+    area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
+      The cropped area of the image must contain a fraction of the
+      supplied image within this range.
+    max_attempts: An optional `int`. Defaults to `100`.
+      Number of attempts at generating a cropped region of the image
+      of the specified constraints. After `max_attempts` failures, return the
+      entire image.
+    use_image_if_no_bounding_boxes: An optional `bool`. Defaults to `False`.
+      Controls behavior if no bounding boxes supplied.
+      If true, assume an implicit bounding box covering the whole input. If
+      false, raise an error.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `Tensor` objects (begin, size, bboxes).
+
+    begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[offset_height, offset_width, 0]`. Provide as input to
+      `tf.slice`.
+    size: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[target_height, target_width, -1]`. Provide as input to
+      `tf.slice`.
+    bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing
+    the distorted bounding box.
+    Provide as input to `tf.image.draw_bounding_boxes`.
+  """
+  seed1, seed2 = random_seed.get_seed(seed) if seed else (0, 0)
+  return sample_distorted_bounding_box(
+      image_size, bounding_boxes, seed1, seed2, min_object_covered,
+      aspect_ratio_range, area_range, max_attempts,
+      use_image_if_no_bounding_boxes, name)
+
+
+@tf_export(v1=['image.sample_distorted_bounding_box'])
+@deprecation.deprecated(date=None, instructions='`seed2` arg is deprecated.'
+                        'Use sample_distorted_bounding_box_v2 instead.')
 def sample_distorted_bounding_box(image_size,
                                   bounding_boxes,
                                   seed=None,
@@ -2208,7 +2360,7 @@ def non_max_suppression_with_overlaps(overlaps,
     overlap_threshold = ops.convert_to_tensor(
         overlap_threshold, name='overlap_threshold')
     # pylint: disable=protected-access
-    return gen_image_ops._non_max_suppression_v3(
+    return gen_image_ops.non_max_suppression_with_overlaps(
         overlaps, scores, max_output_size, overlap_threshold, score_threshold)
     # pylint: enable=protected-access
 
@@ -2343,7 +2495,8 @@ def _verify_compatible_image_shapes(img1, img2):
   shape1[-3:].assert_is_compatible_with(shape2[-3:])
 
   if shape1.ndims is not None and shape2.ndims is not None:
-    for dim1, dim2 in zip(reversed(shape1[:-3]), reversed(shape2[:-3])):
+    for dim1, dim2 in zip(reversed(shape1.dims[:-3]),
+                          reversed(shape2.dims[:-3])):
       if not (dim1 == 1 or dim2 == 1 or dim1.is_compatible_with(dim2)):
         raise ValueError(
             'Two images are not compatible: %s and %s' % (shape1, shape2))
@@ -2805,3 +2958,102 @@ def sobel_edges(image):
   output = array_ops.reshape(output, shape=shape)
   output.set_shape(static_image_shape.concatenate([num_kernels]))
   return output
+
+
+resize_area_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.AREA...)` instead.'))
+tf_export(v1=['image.resize_area'])(
+    resize_area_deprecation(gen_image_ops.resize_area))
+
+resize_bicubic_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.BICUBIC...)` instead.'))
+tf_export(v1=['image.resize_bicubic'])(
+    resize_bicubic_deprecation(gen_image_ops.resize_bicubic))
+
+resize_bilinear_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.'))
+tf_export(v1=['image.resize_bilinear'])(
+    resize_bilinear_deprecation(gen_image_ops.resize_bilinear))
+
+resize_nearest_neighbor_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.NEAREST_NEIGHBOR...)` '
+        'instead.'))
+tf_export(v1=['image.resize_nearest_neighbor'])(
+    resize_nearest_neighbor_deprecation(gen_image_ops.resize_nearest_neighbor))
+
+
+@tf_export('image.crop_and_resize', v1=[])
+def crop_and_resize_v2(
+    image,
+    boxes,
+    box_indices,
+    crop_size,
+    method='bilinear',
+    extrapolation_value=0,
+    name=None):
+  """Extracts crops from the input image tensor and resizes them.
+
+  Extracts crops from the input image tensor and resizes them using bilinear
+  sampling or nearest neighbor sampling (possibly with aspect ratio change) to a
+  common output size specified by `crop_size`. This is more general than the
+  `crop_to_bounding_box` op which extracts a fixed size slice from the input
+  image and does not allow resizing or aspect ratio change.
+
+  Returns a tensor with `crops` from the input `image` at positions defined at
+  the bounding box locations in `boxes`. The cropped boxes are all resized (with
+  bilinear or nearest neighbor interpolation) to a fixed
+  `size = [crop_height, crop_width]`. The result is a 4-D tensor
+  `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned.
+  In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical
+  results to using `tf.image.resize_bilinear()` or
+  `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with
+  `align_corners=True`.
+
+  Args:
+    image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+      Both `image_height` and `image_width` need to be positive.
+    boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+      specifies the coordinates of a box in the `box_ind[i]` image and is
+      specified in normalized coordinates `[y1, x1, y2, x2]`. A normalized
+      coordinate value of `y` is mapped to the image coordinate at `y *
+      (image_height - 1)`, so as the `[0, 1]` interval of normalized image
+      height is mapped to `[0, image_height - 1]` in image height coordinates.
+      We do allow `y1` > `y2`, in which case the sampled crop is an up-down
+      flipped version of the original image. The width dimension is treated
+      similarly. Normalized coordinates outside the `[0, 1]` range are allowed,
+      in which case we use `extrapolation_value` to extrapolate the input image
+      values.
+    box_indices: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0,
+      batch)`. The value of `box_ind[i]` specifies the image that the `i`-th box
+      refers to.
+    crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`.
+      All cropped image patches are resized to this size. The aspect ratio of
+      the image content is not preserved. Both `crop_height` and `crop_width`
+      need to be positive.
+    method: An optional string specifying the sampling method for resizing. It
+      can be either `"bilinear"` or `"nearest"` and default to `"bilinear"`.
+      Currently two sampling methods are supported: Bilinear and Nearest
+      Neighbor.
+    extrapolation_value: An optional `float`. Defaults to `0`. Value used for
+      extrapolation, when applicable.
+    name: A name for the operation (optional).
+
+  Returns:
+    A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+  """
+  return gen_image_ops.crop_and_resize(
+      image, boxes, box_indices, crop_size, method, extrapolation_value, name)
+
+
+crop_and_resize_deprecation = deprecation.deprecated_args(
+    None, 'box_ind is deprecated, use box_indices instead', 'box_ind')
+tf_export(v1=['image.crop_and_resize'])(
+    crop_and_resize_deprecation(gen_image_ops.crop_and_resize))
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 35fdee4fad8c2d375a2ec09d400321d0e8c5d262..e7249333bd35d07821004a39c3c78e52c1ee904d 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -70,7 +70,8 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
         split2 = list(map(image_ops.hsv_to_rgb, split1))
         join1 = array_ops.stack(split1)
         join2 = array_ops.stack(split2)
-        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+        batch1, batch2, join1, join2 = self.evaluate(
+            [batch1, batch2, join1, join2])
 
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1)
@@ -84,7 +85,7 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
       with self.test_session(use_gpu=True):
         hsv = image_ops.rgb_to_hsv(rgb_np)
         rgb = image_ops.hsv_to_rgb(hsv)
-        rgb_tf = rgb.eval()
+        rgb_tf = self.evaluate(rgb)
       self.assertAllClose(rgb_tf, rgb_np)
 
 
@@ -109,7 +110,8 @@ class RGBToYIQTest(test_util.TensorFlowTestCase):
         split2 = list(map(image_ops.yiq_to_rgb, split1))
         join1 = array_ops.stack(split1)
         join2 = array_ops.stack(split2)
-        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+        batch1, batch2, join1, join2 = self.evaluate(
+            [batch1, batch2, join1, join2])
 
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
@@ -138,7 +140,8 @@ class RGBToYUVTest(test_util.TensorFlowTestCase):
         split2 = list(map(image_ops.yuv_to_rgb, split1))
         join1 = array_ops.stack(split1)
         join2 = array_ops.stack(split2)
-        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+        batch1, batch2, join1, join2 = self.evaluate(
+            [batch1, batch2, join1, join2])
 
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
@@ -173,7 +176,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.rgb_to_grayscale(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testBasicRGBToGrayscale(self):
@@ -195,7 +198,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
     # 3-D input with no batch dimension.
@@ -205,9 +208,10 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     # Shape inference works and produces expected output where possible
     rgb_shape = [7, None, 19, 3]
@@ -245,7 +249,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_gamma(x, gamma=1)
 
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       y_np = x_np
 
       self.assertAllClose(y_tf, y_np, 1e-6)
@@ -268,6 +272,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       else:
         raise AssertionError("Exception not raised: %s" % err_msg)
 
+  @test_util.run_deprecated_v1
   def test_adjust_gamma_less_zero_tensor(self):
     """White image should be returned for gamma equal to zero"""
     with self.cached_session():
@@ -281,7 +286,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
       err_msg = "Gamma should be a non-negative real number."
       try:
-        image.eval()
+        self.evaluate(image)
       except Exception as e:
         if err_msg not in str(e):
           raise
@@ -297,7 +302,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_gamma(x, gamma=0)
 
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
 
       dtype = x.dtype.as_numpy_dtype
       y_np = np.array([dtypes.dtype_range[dtype][1]] * x_np.size)
@@ -305,6 +310,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
       self.assertAllClose(y_tf, y_np, 1e-6)
 
+  @test_util.run_deprecated_v1
   def test_adjust_gamma_less_one(self):
     """Verifying the output with expected results for gamma
     correction with gamma equal to half"""
@@ -326,6 +332,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
       self.assertAllClose(y_tf, y_np, 1e-6)
 
+  @test_util.run_deprecated_v1
   def test_adjust_gamma_greater_one(self):
     """Verifying the output with expected results for gamma
     correction with gamma equal to two"""
@@ -360,7 +367,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testAdjustPositiveHue(self):
@@ -375,7 +382,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testBatchAdjustHue(self):
@@ -390,7 +397,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def _adjustHueNp(self, x_np, delta_h):
@@ -415,7 +422,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_hue(x, delta_h)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
     return y_tf
 
   def testAdjustRandomHue(self):
@@ -488,11 +495,11 @@ class FlipImageBenchmark(test.Benchmark):
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in xrange(warmup_rounds + benchmark_rounds):
           if i == warmup_rounds:
             start = time.time()
-          sess.run(run_op)
+          self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -518,11 +525,11 @@ class FlipImageBenchmark(test.Benchmark):
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.random_flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in xrange(warmup_rounds + benchmark_rounds):
           if i == warmup_rounds:
             start = time.time()
-          sess.run(run_op)
+          self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -548,11 +555,11 @@ class FlipImageBenchmark(test.Benchmark):
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.random_flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in xrange(warmup_rounds + benchmark_rounds):
           if i == warmup_rounds:
             start = time.time()
-          sess.run(run_op)
+          self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -602,20 +609,19 @@ class AdjustHueBenchmark(test.Benchmark):
     if cpu_count is not None:
       config.inter_op_parallelism_threads = 1
       config.intra_op_parallelism_threads = cpu_count
-    with session.Session("", graph=ops.Graph(), config=config) as sess:
-      with ops.device(device):
-        inputs = variables.Variable(
-            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
-            trainable=False,
-            dtype=dtypes.float32)
-        delta = constant_op.constant(0.1, dtype=dtypes.float32)
-        outputs = image_ops.adjust_hue(inputs, delta)
-        run_op = control_flow_ops.group(outputs)
-        sess.run(variables.global_variables_initializer())
-        for i in xrange(warmup_rounds + benchmark_rounds):
-          if i == warmup_rounds:
-            start = time.time()
-          sess.run(run_op)
+    with self.benchmark_session(config=config, device=device) as sess:
+      inputs = variables.Variable(
+          random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
+          trainable=False,
+          dtype=dtypes.float32)
+      delta = constant_op.constant(0.1, dtype=dtypes.float32)
+      outputs = image_ops.adjust_hue(inputs, delta)
+      run_op = control_flow_ops.group(outputs)
+      self.evaluate(variables.global_variables_initializer())
+      for i in xrange(warmup_rounds + benchmark_rounds):
+        if i == warmup_rounds:
+          start = time.time()
+        self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -646,21 +652,20 @@ class AdjustSaturationBenchmark(test.Benchmark):
     if cpu_count is not None:
       config.inter_op_parallelism_threads = 1
       config.intra_op_parallelism_threads = cpu_count
-    with session.Session("", graph=ops.Graph(), config=config) as sess:
-      with ops.device(device):
-        inputs = variables.Variable(
-            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
-            trainable=False,
-            dtype=dtypes.float32)
-        delta = constant_op.constant(0.1, dtype=dtypes.float32)
-        outputs = image_ops.adjust_saturation(inputs, delta)
-        run_op = control_flow_ops.group(outputs)
-        sess.run(variables.global_variables_initializer())
-        for _ in xrange(warmup_rounds):
-          sess.run(run_op)
-        start = time.time()
-        for _ in xrange(benchmark_rounds):
-          sess.run(run_op)
+    with self.benchmark_session(config=config, device=device) as sess:
+      inputs = variables.Variable(
+          random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
+          trainable=False,
+          dtype=dtypes.float32)
+      delta = constant_op.constant(0.1, dtype=dtypes.float32)
+      outputs = image_ops.adjust_saturation(inputs, delta)
+      run_op = control_flow_ops.group(outputs)
+      self.evaluate(variables.global_variables_initializer())
+      for _ in xrange(warmup_rounds):
+        self.evaluate(run_op)
+      start = time.time()
+      for _ in xrange(benchmark_rounds):
+        self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -699,8 +704,8 @@ class ResizeBilinearBenchmark(test.Benchmark):
         deps = [resize_op]
       benchmark_op = control_flow_ops.group(*deps)
 
-    with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.benchmark_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
@@ -747,8 +752,8 @@ class ResizeBicubicBenchmark(test.Benchmark):
         deps = [resize_op]
       benchmark_op = control_flow_ops.group(*deps)
 
-    with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.benchmark_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
@@ -804,8 +809,8 @@ class ResizeAreaBenchmark(test.Benchmark):
         deps = [resize_op]
       benchmark_op = control_flow_ops.group(*deps)
 
-    with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+    with self.benchmark_session() as sess:
+      self.evaluate(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
@@ -848,7 +853,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testTwiceSaturation(self):
@@ -863,7 +868,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testBatchSaturation(self):
@@ -878,7 +883,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def _adjust_saturation(self, image, saturation_factor):
@@ -901,7 +906,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = self._adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testTwiceSaturationFused(self):
@@ -916,7 +921,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = self._adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def _adjustSaturationNp(self, x_np, scale):
@@ -937,6 +942,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
       y_v[i][2] = b
     return y_v.reshape(x_np.shape)
 
+  @test_util.run_deprecated_v1
   def testAdjustRandomSaturation(self):
     x_shapes = [
         [2, 2, 3],
@@ -982,7 +988,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionLeftRightWithBatch(self):
@@ -992,9 +998,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
+  @test_util.run_deprecated_v1
   def testLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
@@ -1003,7 +1010,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
       self.assertTrue(y.op.name.startswith("flip_left_right"))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testLeftRightWithBatch(self):
@@ -1017,9 +1024,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
+  @test_util.run_deprecated_v1
   def testRandomFlipLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
@@ -1033,7 +1041,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
           count_unflipped += 1
@@ -1048,6 +1056,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertGreaterEqual(count_flipped, 20)
       self.assertGreaterEqual(count_unflipped, 20)
 
+  @test_util.run_deprecated_v1
   def testRandomFlipLeftRightWithBatch(self):
     batch_size = 16
     seed = 42
@@ -1072,7 +1081,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
 
         # check every element of the batch
         for i in range(batch_size):
@@ -1098,7 +1107,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionUpDownWithBatch(self):
@@ -1109,9 +1118,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
+  @test_util.run_deprecated_v1
   def testUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
@@ -1120,7 +1130,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
       self.assertTrue(y.op.name.startswith("flip_up_down"))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testUpDownWithBatch(self):
@@ -1134,9 +1144,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
+  @test_util.run_deprecated_v1
   def testRandomFlipUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
@@ -1150,7 +1161,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
           count_unflipped += 1
@@ -1165,6 +1176,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertGreaterEqual(count_flipped, 20)
       self.assertGreaterEqual(count_unflipped, 20)
 
+  @test_util.run_deprecated_v1
   def testRandomFlipUpDownWithBatch(self):
     batch_size = 16
     seed = 42
@@ -1189,7 +1201,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
 
         # check every element of the batch
         for i in range(batch_size):
@@ -1215,7 +1227,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionTransposeWithBatch(self):
@@ -1226,9 +1238,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
+  @test_util.run_deprecated_v1
   def testTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.uint8).reshape([3, 2, 1])
@@ -1236,8 +1249,8 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
-      self.assertTrue(y.op.name.startswith("transpose_image"))
-      y_tf = y.eval()
+      self.assertTrue(y.op.name.startswith("transpose"))
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testTransposeWithBatch(self):
@@ -1252,9 +1265,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
+  @test_util.run_deprecated_v1
   def testPartialShapes(self):
     p_unknown_rank = array_ops.placeholder(dtypes.uint8)
     p_unknown_dims_3 = array_ops.placeholder(
@@ -1303,7 +1317,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
-      self.assertAllEqual(image, rotated.eval())
+      self.assertAllEqual(image, self.evaluate(rotated))
 
   def testRot90GroupOrderWithBatch(self):
     image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
@@ -1311,8 +1325,9 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
-      self.assertAllEqual(image, rotated.eval())
+      self.assertAllEqual(image, self.evaluate(rotated))
 
+  @test_util.run_deprecated_v1
   def testRot90NumpyEquivalence(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
     with self.test_session(use_gpu=True):
@@ -1322,6 +1337,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         y_np = np.rot90(image, k=k)
         self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
 
+  @test_util.run_deprecated_v1
   def testRot90NumpyEquivalenceWithBatch(self):
     image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
     with self.test_session(use_gpu=True):
@@ -1337,7 +1353,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_contrast(x, contrast_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllClose(y_tf, y_np, 1e-6)
 
   def testDoubleContrastUint8(self):
@@ -1392,7 +1408,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_contrast(x, contrast_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
     return y_tf
 
   def testRandomContrast(self):
@@ -1410,6 +1426,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
       y_tf = self._adjustContrastTf(x_np, contrast_factor)
       self.assertAllClose(y_tf, y_np, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testContrastFactorShape(self):
     x_shape = [1, 2, 2, 3]
     x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
@@ -1425,7 +1442,7 @@ class AdjustBrightnessTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_brightness(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllClose(y_tf, y_np, 1e-6)
 
   def testPositiveDeltaUint8(self):
@@ -1473,6 +1490,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     y /= stddev
     return y
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     x_shape = [13, 9, 3]
     x_np = np.arange(0, np.prod(x_shape), dtype=np.int32).reshape(x_shape)
@@ -1482,7 +1500,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.per_image_standardization(x)
       self.assertTrue(y.op.name.startswith("per_image_standardization"))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllClose(y_tf, y_np, atol=1e-4)
 
   def testUniformImage(self):
@@ -1490,9 +1508,19 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     im = constant_op.constant(im_np)
     whiten = image_ops.per_image_standardization(im)
     with self.test_session(use_gpu=True):
-      whiten_np = whiten.eval()
+      whiten_np = self.evaluate(whiten)
       self.assertFalse(np.any(np.isnan(whiten_np)))
 
+  def testBatchWhitening(self):
+    imgs_np = np.random.uniform(0., 255., [4, 24, 24, 3])
+    whiten_np = [self._NumpyPerImageWhitening(img) for img in imgs_np]
+    with self.test_session(use_gpu=True):
+      imgs = constant_op.constant(imgs_np)
+      whiten = image_ops.per_image_standardization(imgs)
+      whiten_tf = self.evaluate(whiten)
+      for w_tf, w_np in zip(whiten_tf, whiten_np):
+        self.assertAllClose(w_tf, w_np, atol=1e-4)
+
 
 class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
 
@@ -1563,11 +1591,13 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     y = image_ops.crop_to_bounding_box(image, 0, 0, height, width)
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
     self._assertReturns(x, x_shape, 0, 0, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testCrop(self):
     x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
@@ -1592,6 +1622,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     y = [1, 2, 4, 5, 7, 8]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     self._assertShapeInference([55, 66, 3], 55, 66, [55, 66, 3])
     self._assertShapeInference([59, 69, 3], 55, 66, [55, 66, 3])
@@ -1605,6 +1636,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     self._assertShapeInference([None, None, None], 55, 66, [55, 66, None])
     self._assertShapeInference(None, 55, 66, [55, 66, None])
 
+  @test_util.run_deprecated_v1
   def testNon3DInput(self):
     # Input image is not 3D
     x = [0] * 15
@@ -1616,6 +1648,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
                          target_width,
                          "'image' must have either 3 or 4 dimensions.")
 
+  @test_util.run_deprecated_v1
   def testZeroLengthInput(self):
     # Input image has 0-length dimension(s).
     # Each line is a test configuration:
@@ -1647,6 +1680,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
           "assertion failed:",
           use_tensor_inputs_options=[True])
 
+  @test_util.run_deprecated_v1
   def testBadParams(self):
     x_shape = [4, 4, 1]
     x = np.zeros(x_shape)
@@ -1664,6 +1698,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     for params, err_msg in test_config:
       self._assertRaises(x, x_shape, *params, err_msg=err_msg)
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
     y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66)
@@ -1680,6 +1715,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     else:
       self.assertEqual(y.get_shape().as_list(), post_shape)
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shapes = [[13, 9, 3], [5, 13, 9, 3]]
     for x_shape in x_shapes:
@@ -1688,7 +1724,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
         with self.test_session(use_gpu=use_gpu):
           x = constant_op.constant(x_np, shape=x_shape)
           y = image_ops.central_crop(x, 1.0)
-          y_tf = y.eval()
+          y_tf = self.evaluate(y)
           self.assertAllEqual(y_tf, x_np)
           self.assertEqual(y.op.name, x.op.name)
 
@@ -1703,7 +1739,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
       with self.test_session(use_gpu=use_gpu):
         x = constant_op.constant(x_np, shape=x_shape)
         y = image_ops.central_crop(x, 0.5)
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         self.assertAllEqual(y_tf, y_np)
         self.assertAllEqual(y_tf.shape, y_np.shape)
 
@@ -1719,10 +1755,11 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.central_crop(x, 0.5)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
       self.assertAllEqual(y_tf.shape, y_np.shape)
 
+  @test_util.run_deprecated_v1
   def testCropping2(self):
     # Test case for 10315
     x_shapes = [[240, 320, 3], [5, 240, 320, 3]]
@@ -1739,6 +1776,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(y_tf, y_np)
           self.assertAllEqual(y_tf.shape, y_np.shape)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     # Test no-op fraction=1.0, with 3-D tensors.
     self._assertShapeInference([50, 60, 3], 1.0, [50, 60, 3])
@@ -1799,6 +1837,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
           with self.assertRaises(ValueError):
             _ = image_ops.central_crop(x, 0.5)
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     x_shape = [13, 9, 3]
     x_np = np.ones(x_shape, dtype=np.float32)
@@ -1889,14 +1928,16 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     i = constant_op.constant([1, 0, 4, 3], dtype=dtypes.int64)
     y_tf = image_ops.pad_to_bounding_box(x, i[0], i[1], i[2], i[3])
     with self.test_session(use_gpu=True):
-      self.assertAllClose(y, y_tf.eval())
+      self.assertAllClose(y, self.evaluate(y_tf))
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
     offset_height, offset_width = [0, 0]
     self._assertReturns(x, x_shape, offset_height, offset_width, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testPadding(self):
     x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
@@ -1921,6 +1962,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     y_shape = [3, 4, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     self._assertShapeInference([55, 66, 3], 55, 66, [55, 66, 3])
     self._assertShapeInference([50, 60, 3], 55, 66, [55, 66, 3])
@@ -1934,6 +1976,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     self._assertShapeInference([None, None, None], 55, 66, [55, 66, None])
     self._assertShapeInference(None, 55, 66, [55, 66, None])
 
+  @test_util.run_deprecated_v1
   def testNon3DInput(self):
     # Input image is not 3D
     x = [0] * 15
@@ -1945,6 +1988,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
                          target_width,
                          "'image' must have either 3 or 4 dimensions.")
 
+  @test_util.run_deprecated_v1
   def testZeroLengthInput(self):
     # Input image has 0-length dimension(s).
     # Each line is a test configuration:
@@ -1977,6 +2021,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
           "all dims of \\'image.shape\\' must be > 0",
           use_tensor_inputs_options=[True])
 
+  @test_util.run_deprecated_v1
   def testBadParams(self):
     x_shape = [3, 3, 1]
     x = np.zeros(x_shape)
@@ -1991,6 +2036,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     for config_item in test_config:
       self._assertRaises(x, x_shape, *config_item)
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
     y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66)
@@ -2032,7 +2078,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       y = array_ops.strided_slice(image_tf, begin, begin + size)
 
       for _ in xrange(num_iter):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         crop_height = y_tf.shape[0]
         crop_width = y_tf.shape[1]
         aspect_ratio = float(crop_width) / float(crop_height)
@@ -2098,6 +2144,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
     # TODO(wicke, shlens, dga): Restore this test so that it is no longer flaky.
     # self.assertGreaterEqual(min(fraction_object_covered), min_object_covered)
 
+  @test_util.run_deprecated_v1
   def testWholeImageBoundingBox(self):
     height = 40
     width = 50
@@ -2112,6 +2159,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
         aspect_ratio_range=(0.75, 1.33),
         area_range=(0.05, 1.0))
 
+  @test_util.run_deprecated_v1
   def testWithBoundingBox(self):
     height = 40
     width = 50
@@ -2142,6 +2190,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
         aspect_ratio_range=(0.75, 1.33),
         area_range=(0.05, 1.0))
 
+  @test_util.run_deprecated_v1
   def testSampleDistortedBoundingBoxShape(self):
     with self.test_session(use_gpu=True):
       image_size = constant_op.constant(
@@ -2163,9 +2212,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
       # Actual run to make sure shape is correct inside Compute().
-      begin = begin.eval()
-      end = end.eval()
-      bbox_for_drawing = bbox_for_drawing.eval()
+      begin = self.evaluate(begin)
+      end = self.evaluate(end)
+      bbox_for_drawing = self.evaluate(bbox_for_drawing)
 
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
           image_size=image_size,
@@ -2199,9 +2248,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
       # Actual run to make sure shape is correct inside Compute().
-      begin = begin.eval()
-      end = end.eval()
-      bbox_for_drawing = bbox_for_drawing.eval()
+      begin = self.evaluate(begin)
+      end = self.evaluate(end)
+      bbox_for_drawing = self.evaluate(bbox_for_drawing)
 
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
@@ -2237,6 +2286,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     else:
       return False
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     img_shape = [1, 6, 4, 1]
     single_shape = [6, 4, 1]
@@ -2257,7 +2307,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(image, [target_height, target_width], opt)
           yshape = array_ops.shape(y)
-          resized, newshape = sess.run([y, yshape])
+          resized, newshape = self.evaluate([y, yshape])
           self.assertAllEqual(img_shape, newshape)
           self.assertAllClose(resized, img_np, atol=1e-5)
 
@@ -2268,9 +2318,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         y = image_ops.resize_images(image, [target_height, target_width],
                                     self.OPTIONS[0])
         yshape = array_ops.shape(y)
-        newshape = yshape.eval()
+        newshape = self.evaluate(yshape)
         self.assertAllEqual(single_shape, newshape)
 
+  @test_util.run_deprecated_v1
   def testTensorArguments(self):
     img_shape = [1, 6, 4, 1]
     single_shape = [6, 4, 1]
@@ -2332,6 +2383,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       _ = image_ops.resize_images(image, [6, None],
                                   image_ops.ResizeMethod.BILINEAR)
 
+  @test_util.run_deprecated_v1
   def testReturnDtype(self):
     target_shapes = [[6, 4], [3, 2], [
         array_ops.placeholder(dtypes.int32),
@@ -2371,7 +2423,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         image = constant_op.constant(img_np, shape=img_shape)
         y = image_ops.resize_images(image, [height, width], opt)
         yshape = array_ops.shape(y)
-        resized, newshape = sess.run([y, yshape])
+        resized, newshape = self.evaluate([y, yshape])
         self.assertAllEqual(img_shape, newshape)
         self.assertAllClose(resized, img_np, atol=1e-5)
 
@@ -2403,7 +2455,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
               y = image_ops.resize_images(image, [target_height, target_width],
                                           opt)
               expected = np.array(expected_data).reshape(target_shape)
-              resized = y.eval()
+              resized = self.evaluate(y)
               self.assertAllClose(resized, expected, atol=1e-5)
 
   def testResizeUpAlignCornersFalse(self):
@@ -2438,7 +2490,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
               image, [target_height, target_width], opt, align_corners=False)
-          resized = y.eval()
+          resized = self.evaluate(y)
           expected = np.array(expected_data[opt]).reshape(
               [1, target_height, target_width, 1])
           self.assertAllClose(resized, expected, atol=1e-05)
@@ -2474,7 +2526,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
               image, [target_height, target_width], opt, align_corners=True)
-          resized = y.eval()
+          resized = self.evaluate(y)
           expected = np.array(expected_data[opt]).reshape(
               [1, target_height, target_width, 1])
           self.assertAllClose(resized, expected, atol=1e-05)
@@ -2501,7 +2553,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       image = constant_op.constant(img_np, shape=img_shape)
       y = image_ops.resize_images(image, [target_height, target_width],
                                   image_ops.ResizeMethod.BICUBIC)
-      resized = y.eval()
+      resized = self.evaluate(y)
       expected = np.array(expected_data).reshape(
           [1, target_height, target_width, 1])
       self.assertAllClose(resized, expected, atol=1)
@@ -2526,7 +2578,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                                   image_ops.ResizeMethod.AREA)
       expected = np.array(expected_data).reshape(
           [1, target_height, target_width, 1])
-      resized = y.eval()
+      resized = self.evaluate(y)
       self.assertAllClose(resized, expected, atol=1)
 
   def testCompareNearestNeighbor(self):
@@ -2546,7 +2598,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                 new_size,
                 image_ops.ResizeMethod.NEAREST_NEIGHBOR,
                 align_corners=align_corners)
-            gpu_val = out_op.eval()
+            gpu_val = self.evaluate(out_op)
           with self.test_session(use_gpu=False):
             image = constant_op.constant(img_np, shape=input_shape)
             new_size = constant_op.constant([target_height, target_width])
@@ -2555,7 +2607,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                 new_size,
                 image_ops.ResizeMethod.NEAREST_NEIGHBOR,
                 align_corners=align_corners)
-            cpu_val = out_op.eval()
+            cpu_val = self.evaluate(out_op)
           self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
 
   def testCompareBilinear(self):
@@ -2577,9 +2629,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                   new_size,
                   image_ops.ResizeMethod.BILINEAR,
                   align_corners=align_corners)
-              value[use_gpu] = out_op.eval()
+              value[use_gpu] = self.evaluate(out_op)
           self.assertAllClose(value[True], value[False], rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     self._assertShapeInference([50, 60, 3], [55, 66], [55, 66, 3])
     self._assertShapeInference([55, 66, 3], [55, 66], [55, 66, 3])
@@ -2600,12 +2653,13 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     self._assertShapeInference([59, 60, None], [55, 66], [55, 66, None])
     self._assertShapeInference([None, None, None], [55, 66], [55, 66, None])
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     img_shape = [1, 3, 2, 1]
     with self.test_session(use_gpu=True):
       single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
       y = image_ops.resize_images(single_image, [55, 66])
-      self.assertTrue(y.op.name.startswith("resize_images"))
+      self.assertTrue(y.op.name.startswith("resize"))
 
   def _ResizeImageCall(self, x, max_h, max_w, preserve_aspect_ratio,
                        use_tensor_inputs):
@@ -2650,6 +2704,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                                    preserve_aspect_ratio, use_tensor_inputs)
       self.assertShapeEqual(y, ops.convert_to_tensor(y_tf))
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioMultipleImages(self):
     x_shape = [10, 100, 100, 10]
     x = np.random.uniform(size=x_shape)
@@ -2657,36 +2712,42 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     self._assertResizeCheckShape(x, x_shape, [250, 250], [10, 250, 250, 10],
                                  preserve_aspect_ratio=False)
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeEqual(x, x_shape, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioSmaller(self):
     x_shape = [100, 100, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeCheckShape(x, x_shape, [75, 50], [50, 50, 10])
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioSmallerMultipleImages(self):
     x_shape = [10, 100, 100, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeCheckShape(x, x_shape, [75, 50], [10, 50, 50, 10])
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioLarger(self):
     x_shape = [100, 100, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeCheckShape(x, x_shape, [150, 200], [150, 150, 10])
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioSameRatio(self):
     x_shape = [1920, 1080, 3]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeCheckShape(x, x_shape, [3840, 2160], [3840, 2160, 3])
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioSquare(self):
     x_shape = [299, 299, 3]
     x = np.random.uniform(size=x_shape)
@@ -2756,12 +2817,14 @@ class ResizeImageWithPadTest(test_util.TensorFlowTestCase):
     y = image_ops.resize_image_with_pad(image, height, width)
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertReturns(x, x_shape, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testPad(self):
     # Reduce vertical dimension
     x = [1, 2, 3, 4, 5, 6, 7, 8]
@@ -2852,12 +2915,14 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     y = image_ops.resize_image_with_crop_or_pad(image, height, width)
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertReturns(x, x_shape, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testPad(self):
     # Pad even along col.
     x = [1, 2, 3, 4, 5, 6, 7, 8]
@@ -2895,6 +2960,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
     self._assertReturns(x, x_shape, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testCrop(self):
     # Crop even along col.
     x = [1, 2, 3, 4, 5, 6, 7, 8]
@@ -2932,6 +2998,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
     self._assertReturns(x, x_shape, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testCropAndPad(self):
     # Pad along row but crop along col.
     x = [1, 2, 3, 4, 5, 6, 7, 8]
@@ -2951,6 +3018,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
     self._assertReturns(x, x_shape, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     self._assertShapeInference([50, 60, 3], 55, 66, [55, 66, 3])
     self._assertShapeInference([55, 66, 3], 55, 66, [55, 66, 3])
@@ -2972,6 +3040,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     self._assertShapeInference([None, None, None], 55, 66, [55, 66, None])
     self._assertShapeInference(None, 55, 66, [55, 66, None])
 
+  @test_util.run_deprecated_v1
   def testNon3DInput(self):
     # Input image is not 3D
     x = [0] * 15
@@ -2985,6 +3054,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
       self._assertRaises(x, x_shape, target_height, target_width,
                          "'image' must have either 3 or 4 dimensions.")
 
+  @test_util.run_deprecated_v1
   def testZeroLengthInput(self):
     # Input image has 0-length dimension(s).
     target_height, target_width = [1, 1]
@@ -3010,6 +3080,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
           "all dims of \\'image.shape\\' must be > 0",
           use_tensor_inputs_options=[True])
 
+  @test_util.run_deprecated_v1
   def testBadParams(self):
     x_shape = [4, 4, 1]
     x = np.zeros(x_shape)
@@ -3024,6 +3095,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     self._assertRaises(x, x_shape, target_height, target_width,
                        "target_width must be > 0")
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
     y = image_ops.resize_image_with_crop_or_pad(image, 55, 66)
@@ -3058,7 +3130,7 @@ class JpegTest(test_util.TensorFlowTestCase):
       jpeg0 = io_ops.read_file(path)
       image0 = image_ops.decode_jpeg(jpeg0)
       image1 = image_ops.decode_jpeg(image_ops.encode_jpeg(image0))
-      jpeg0, image0, image1 = sess.run([jpeg0, image0, image1])
+      jpeg0, image0, image1 = self.evaluate([jpeg0, image0, image1])
       self.assertEqual(len(jpeg0), 3771)
       self.assertEqual(image0.shape, (256, 128, 3))
       self.assertLess(self.averageError(image0, image1), 1.4)
@@ -3075,7 +3147,7 @@ class JpegTest(test_util.TensorFlowTestCase):
             io_ops.read_file(rgb_path), channels=channels)
         cmyk = image_ops.decode_jpeg(
             io_ops.read_file(cmyk_path), channels=channels)
-        rgb, cmyk = sess.run([rgb, cmyk])
+        rgb, cmyk = self.evaluate([rgb, cmyk])
         self.assertEqual(rgb.shape, shape)
         self.assertEqual(cmyk.shape, shape)
         error = self.averageError(rgb, cmyk)
@@ -3104,9 +3176,10 @@ class JpegTest(test_util.TensorFlowTestCase):
                             image2.get_shape().as_list())
 
         # CropAndDecode should be equal to DecodeJpeg+Crop.
-        image1_crop, image2 = sess.run([image1_crop, image2])
+        image1_crop, image2 = self.evaluate([image1_crop, image2])
         self.assertAllEqual(image1_crop, image2)
 
+  @test_util.run_deprecated_v1
   def testCropAndDecodeJpegWithInvalidCropWindow(self):
     with self.cached_session() as sess:
       # Encode it, then decode it, then encode it
@@ -3123,7 +3196,7 @@ class JpegTest(test_util.TensorFlowTestCase):
         with self.assertRaisesWithPredicateMatch(
             errors.InvalidArgumentError,
             lambda e: "Invalid JPEG data or crop window" in str(e)):
-          sess.run(result)
+          self.evaluate(result)
 
   def testSynthetic(self):
     with self.test_session(use_gpu=True) as sess:
@@ -3133,7 +3206,8 @@ class JpegTest(test_util.TensorFlowTestCase):
       image1 = image_ops.decode_jpeg(jpeg0, dct_method="INTEGER_ACCURATE")
       image2 = image_ops.decode_jpeg(
           image_ops.encode_jpeg(image1), dct_method="INTEGER_ACCURATE")
-      jpeg0, image0, image1, image2 = sess.run([jpeg0, image0, image1, image2])
+      jpeg0, image0, image1, image2 = self.evaluate(
+          [jpeg0, image0, image1, image2])
 
       # The decoded-encoded image should be similar to the input
       self.assertLess(self.averageError(image0, image1), 0.6)
@@ -3153,7 +3227,8 @@ class JpegTest(test_util.TensorFlowTestCase):
       image1 = image_ops.decode_jpeg(jpeg0, dct_method="INTEGER_FAST")
       image2 = image_ops.decode_jpeg(
           image_ops.encode_jpeg(image1), dct_method="INTEGER_FAST")
-      jpeg0, image0, image1, image2 = sess.run([jpeg0, image0, image1, image2])
+      jpeg0, image0, image1, image2 = self.evaluate(
+          [jpeg0, image0, image1, image2])
 
       # The decoded-encoded image should be similar to the input, but
       # note this is worse than the slower algorithm because it is
@@ -3176,11 +3251,12 @@ class JpegTest(test_util.TensorFlowTestCase):
       jpeg0 = image_ops.encode_jpeg(image0)
       image1 = image_ops.decode_jpeg(jpeg0, dct_method="INTEGER_FAST")
       image2 = image_ops.decode_jpeg(jpeg0)
-      image1, image2 = sess.run([image1, image2])
+      image1, image2 = self.evaluate([image1, image2])
 
       # The images should be the same.
       self.assertAllClose(image1, image2)
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     with self.test_session(use_gpu=True) as sess:
       jpeg = constant_op.constant("nonsense")
@@ -3189,6 +3265,7 @@ class JpegTest(test_util.TensorFlowTestCase):
         self.assertEqual(image.get_shape().as_list(),
                          [None, None, channels or None])
 
+  @test_util.run_deprecated_v1
   def testExtractJpegShape(self):
     # Read a real jpeg and verify shape.
     path = ("tensorflow/core/lib/jpeg/testdata/"
@@ -3199,6 +3276,7 @@ class JpegTest(test_util.TensorFlowTestCase):
       [image_shape] = sess.run([image_ops.extract_jpeg_shape(jpeg)])
       self.assertEqual(image_shape.tolist(), [256, 128, 3])
 
+  @test_util.run_deprecated_v1
   def testExtractJpegShapeforCmyk(self):
     # Read a cmyk jpeg image, and verify its shape.
     path = ("tensorflow/core/lib/jpeg/testdata/"
@@ -3222,11 +3300,11 @@ class PngTest(test_util.TensorFlowTestCase):
         with self.test_session(use_gpu=True) as sess:
           png0 = io_ops.read_file(prefix + filename)
           image0 = image_ops.decode_png(png0, channels=channels)
-          png0, image0 = sess.run([png0, image0])
+          png0, image0 = self.evaluate([png0, image0])
           self.assertEqual(image0.shape, (26, 51, channels or channels_in))
           if channels == channels_in:
             image1 = image_ops.decode_png(image_ops.encode_png(image0))
-            self.assertAllEqual(image0, image1.eval())
+            self.assertAllEqual(image0, self.evaluate(image1))
 
   def testSynthetic(self):
     with self.test_session(use_gpu=True) as sess:
@@ -3234,7 +3312,7 @@ class PngTest(test_util.TensorFlowTestCase):
       image0 = constant_op.constant(_SimpleColorRamp())
       png0 = image_ops.encode_png(image0, compression=7)
       image1 = image_ops.decode_png(png0)
-      png0, image0, image1 = sess.run([png0, image0, image1])
+      png0, image0, image1 = self.evaluate([png0, image0, image1])
 
       # PNG is lossless
       self.assertAllEqual(image0, image1)
@@ -3249,7 +3327,7 @@ class PngTest(test_util.TensorFlowTestCase):
       image0 = constant_op.constant(_SimpleColorRamp(), dtype=dtypes.uint16)
       png0 = image_ops.encode_png(image0, compression=7)
       image1 = image_ops.decode_png(png0, dtype=dtypes.uint16)
-      png0, image0, image1 = sess.run([png0, image0, image1])
+      png0, image0, image1 = self.evaluate([png0, image0, image1])
 
       # PNG is lossless
       self.assertAllEqual(image0, image1)
@@ -3265,7 +3343,7 @@ class PngTest(test_util.TensorFlowTestCase):
       image0 = constant_op.constant(gray_alpha)
       png0 = image_ops.encode_png(image0, compression=7)
       image1 = image_ops.decode_png(png0)
-      png0, image0, image1 = sess.run([png0, image0, image1])
+      png0, image0, image1 = self.evaluate([png0, image0, image1])
       self.assertEqual(2, image0.shape[-1])
       self.assertAllEqual(image0, image1)
 
@@ -3276,10 +3354,11 @@ class PngTest(test_util.TensorFlowTestCase):
       image0 = constant_op.constant(gray_alpha, dtype=dtypes.uint16)
       png0 = image_ops.encode_png(image0, compression=7)
       image1 = image_ops.decode_png(png0, dtype=dtypes.uint16)
-      png0, image0, image1 = sess.run([png0, image0, image1])
+      png0, image0, image1 = self.evaluate([png0, image0, image1])
       self.assertEqual(2, image0.shape[-1])
       self.assertAllEqual(image0, image1)
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     with self.test_session(use_gpu=True):
       png = constant_op.constant("nonsense")
@@ -3302,7 +3381,7 @@ class GifTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True) as sess:
       gif0 = io_ops.read_file(prefix + filename)
       image0 = image_ops.decode_gif(gif0)
-      gif0, image0 = sess.run([gif0, image0])
+      gif0, image0 = self.evaluate([gif0, image0])
 
       self.assertEqual(image0.shape, shape)
 
@@ -3324,6 +3403,7 @@ class GifTest(test_util.TensorFlowTestCase):
     self._testValid("scan.gif")
     self._testValid("optimized.gif")
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     with self.test_session(use_gpu=True) as sess:
       gif = constant_op.constant("nonsense")
@@ -3350,6 +3430,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
         self.assertTrue(y_saturate.dtype == output_dtype)
         self.assertAllClose(y_saturate.eval(), y_np, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testNoConvert(self):
     # Make sure converting to the same data type creates only an identity op
     with self.test_session(use_gpu=True):
@@ -3359,6 +3440,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       self.assertEquals(y.op.type, "Identity")
       self.assertEquals(y.op.inputs[0], image)
 
+  @test_util.run_deprecated_v1
   def testConvertBetweenInteger(self):
     # Make sure converting to between integer types scales appropriately
     with self.test_session(use_gpu=True):
@@ -3367,6 +3449,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       self._convert([0, 2**32], dtypes.int64, dtypes.int32, [0, 1])
       self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2**32])
 
+  @test_util.run_deprecated_v1
   def testConvertBetweenFloat(self):
     # Make sure converting to between float types does nothing interesting
     with self.test_session(use_gpu=True):
@@ -3375,6 +3458,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       self._convert([-1.0, 0, 1.0, 200000], dtypes.float64, dtypes.float32,
                     [-1.0, 0, 1.0, 200000])
 
+  @test_util.run_deprecated_v1
   def testConvertBetweenIntegerAndFloat(self):
     # Make sure converting from and to a float type scales appropriately
     with self.test_session(use_gpu=True):
@@ -3383,6 +3467,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       self._convert([0, 1.1 / 255.0, 1], dtypes.float32, dtypes.uint8,
                     [0, 1, 255])
 
+  @test_util.run_deprecated_v1
   def testConvertBetweenInt16AndInt8(self):
     with self.test_session(use_gpu=True):
       # uint8, uint16
@@ -3423,7 +3508,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
       y = image_ops.total_variation(images=x_tf)
 
       # Run the TensorFlow session to calculate the result.
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
 
       # Assert that the results are as expected within
       # some small error-bound in case they are float-values.
@@ -3574,6 +3659,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
 
 class FormatTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testFormats(self):
     prefix = "tensorflow/core/lib"
     paths = ("png/testdata/lena_gray.png", "jpeg/testdata/jpeg_merge_test1.jpg",
@@ -3606,6 +3692,7 @@ class FormatTest(test_util.TensorFlowTestCase):
 
 class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testSelectFromThreeClusters(self):
     boxes_np = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
                 [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
@@ -3621,6 +3708,7 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
           boxes, scores, max_output_size, iou_threshold).eval()
       self.assertAllClose(selected_indices, [3, 0, 5])
 
+  @test_util.run_deprecated_v1
   def testInvalidShape(self):
     # The boxes should be 2D of shape [num_boxes, 4].
     with self.assertRaisesRegexp(ValueError,
@@ -3663,6 +3751,7 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
       scores = constant_op.constant([0.9])
       image_ops.non_max_suppression(boxes, scores, 3, [[0.5]])
 
+  @test_util.run_deprecated_v1
   def testDataTypes(self):
     # Test case for GitHub issue 20199.
     boxes_np = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
@@ -3701,12 +3790,13 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
         iou_threshold = constant_op.constant(iou_threshold_np)
         selected_indices, _ = gen_image_ops.non_max_suppression_v4(
             boxes, scores, max_output_size, iou_threshold, score_threshold)
-        selected_indices = selected_indices.eval()
+        selected_indices = self.evaluate(selected_indices)
         self.assertAllClose(selected_indices, [3, 0, 5])
 
 
 class NonMaxSuppressionPaddedTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testSelectFromThreeClusters(self):
     boxes_np = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
                 [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
@@ -3739,10 +3829,61 @@ class NonMaxSuppressionPaddedTest(test_util.TensorFlowTestCase):
       self.assertAllClose(selected_indices.eval(), [3, 0, 5])
       self.assertEqual(num_valid.eval(), 3)
 
+  @test_util.run_deprecated_v1
+  def testSelectFromContinuousOverLap(self):
+    boxes_np = [[0, 0, 1, 1], [0, 0.2, 1, 1.2], [0, 0.4, 1, 1.4],
+                [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]]
+    scores_np = [0.9, 0.75, 0.6, 0.5, 0.4, 0.3]
+    max_output_size_np = 3
+    iou_threshold_np = 0.5
+    score_threshold_np = 0.1
+    boxes = constant_op.constant(boxes_np)
+    scores = constant_op.constant(scores_np)
+    max_output_size = constant_op.constant(max_output_size_np)
+    iou_threshold = constant_op.constant(iou_threshold_np)
+    score_threshold = constant_op.constant(score_threshold_np)
+    selected_indices, num_valid = image_ops.non_max_suppression_padded(
+        boxes,
+        scores,
+        max_output_size,
+        iou_threshold,
+        score_threshold)
+    # The output shape of the padded operation must be fully defined.
+    self.assertEqual(selected_indices.shape.is_fully_defined(), False)
+    with self.cached_session():
+      self.assertAllClose(selected_indices.eval(), [0, 2, 4])
+      self.assertEqual(num_valid.eval(), 3)
+
+
+class NonMaxSuppressionWithOverlapsTest(test_util.TensorFlowTestCase):
+
+  @test_util.run_deprecated_v1
+  def testSelectOneFromThree(self):
+    overlaps_np = [
+        [1.0, 0.7, 0.2],
+        [0.7, 1.0, 0.0],
+        [0.2, 0.0, 1.0],
+    ]
+    scores_np = [0.7, 0.9, 0.1]
+    max_ouput_size_np = 3
+
+    overlaps = constant_op.constant(overlaps_np)
+    scores = constant_op.constant(scores_np)
+    max_output_size = constant_op.constant(max_ouput_size_np)
+    overlap_threshold = 0.6
+    score_threshold = 0.4
+
+    selected_indices = image_ops.non_max_suppression_with_overlaps(
+        overlaps, scores, max_output_size, overlap_threshold, score_threshold)
+
+    with self.cached_session():
+      self.assertAllClose(selected_indices.eval(), [1])
+
 
 class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
   """Tests utility function used by ssim() and psnr()."""
 
+  @test_util.run_deprecated_v1
   def testWrongDims(self):
     img = array_ops.placeholder(dtype=dtypes.float32)
     img_np = np.array((2, 2))
@@ -3752,6 +3893,7 @@ class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(checks, {img: img_np})
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     img1 = array_ops.placeholder(dtype=dtypes.float32)
     img2 = array_ops.placeholder(dtype=dtypes.float32)
@@ -3773,7 +3915,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
         "tensorflow/core/lib/psnr/testdata", filename))
     im = image_ops.decode_jpeg(content, dct_method="INTEGER_ACCURATE")
     im = image_ops.convert_image_dtype(im, dtypes.float32)
-    im, = sess.run([im])
+    im, = self.evaluate([im])
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
@@ -3792,6 +3934,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     """Returns an image or image batch with given shape."""
     return np.random.rand(*shape).astype(np.float32) * max_val
 
+  @test_util.run_deprecated_v1
   def testPSNRSingleImage(self):
     image1 = self._RandomImage((8, 8, 1), 1)
     image2 = self._RandomImage((8, 8, 1), 1)
@@ -3805,6 +3948,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
       tf_psnr = image_ops.psnr(tf_image1, tf_image2, 1.0, "psnr").eval()
       self.assertAllClose(psnr, tf_psnr, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testPSNRMultiImage(self):
     image1 = self._RandomImage((10, 8, 8, 1), 1)
     image2 = self._RandomImage((10, 8, 8, 1), 1)
@@ -3818,6 +3962,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
       tf_psnr = image_ops.psnr(tf_image1, tf_image2, 1, "psnr").eval()
       self.assertAllClose(psnr, tf_psnr, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testGoldenPSNR(self):
     q20, q72, q95 = self._LoadTestImages()
 
@@ -3842,6 +3987,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
       self.assertAllClose(psnr2, tf_psnr2, atol=0.001)
       self.assertAllClose(psnr3, tf_psnr3, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     q20, _, _ = self._LoadTestImages()
     psnr = self._PSNR_NumPy(q20, q20, 1)
@@ -3850,6 +3996,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
       tf_psnr = image_ops.psnr(tf_q20, tf_q20, 1, "psnr").eval()
       self.assertAllClose(psnr, tf_psnr, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     img1 = self._RandomImage((10, 8, 8, 1), 255)
     img2 = self._RandomImage((10, 8, 8, 1), 255)
@@ -3860,7 +4007,8 @@ class PSNRTest(test_util.TensorFlowTestCase):
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     psnr_float32 = image_ops.psnr(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(psnr_uint8.eval(), psnr_float32.eval(), atol=0.001)
+      self.assertAllClose(
+          psnr_uint8.eval(), self.evaluate(psnr_float32), atol=0.001)
 
 
 class SSIMTest(test_util.TensorFlowTestCase):
@@ -3879,7 +4027,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
         "tensorflow/core/lib/ssim/testdata", filename))
     im = image_ops.decode_png(content)
     im = image_ops.convert_image_dtype(im, dtypes.float32)
-    im, = sess.run([im])
+    im, = self.evaluate([im])
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
@@ -3890,6 +4038,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     """Returns an image or image batch with given shape."""
     return np.random.rand(*shape).astype(np.float32) * max_val
 
+  @test_util.run_deprecated_v1
   def testAgainstMatlab(self):
     """Tests against values produced by Matlab."""
     img = self._LoadTestImages()
@@ -3913,7 +4062,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     ssim = image_ops.ssim(constant_op.constant(img1),
                           constant_op.constant(img2), 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, ssim.eval(), atol=1e-4)
+      self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
   def testBroadcast(self):
     img = self._LoadTestImages()[:2]
@@ -3925,8 +4074,9 @@ class SSIMTest(test_util.TensorFlowTestCase):
 
     ssim = image_ops.ssim(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, ssim.eval(), atol=1e-4)
+      self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testNegative(self):
     """Tests against negative SSIM index."""
     step = np.expand_dims(np.arange(0, 256, 16, dtype=np.uint8), axis=0)
@@ -3941,6 +4091,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       self.assertLess(ssim.eval(), 0)
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     img1 = self._RandomImage((1, 16, 16, 3), 255)
     img2 = self._RandomImage((1, 16, 16, 3), 255)
@@ -3951,7 +4102,8 @@ class SSIMTest(test_util.TensorFlowTestCase):
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(ssim_uint8.eval(), ssim_float32.eval(), atol=0.001)
+      self.assertAllClose(
+          ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
 
 class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
@@ -3970,7 +4122,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
         "tensorflow/core/lib/ssim/testdata", filename))
     im = image_ops.decode_png(content)
     im = image_ops.convert_image_dtype(im, dtypes.float32)
-    im, = sess.run([im])
+    im, = self.evaluate([im])
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
@@ -3981,6 +4133,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     """Returns an image or image batch with given shape."""
     return np.random.rand(*shape).astype(np.float32) * max_val
 
+  @test_util.run_deprecated_v1
   def testAgainstMatlab(self):
     """Tests against MS-SSIM computed with Matlab implementation.
 
@@ -3997,6 +4150,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
 
     self.assertAllClose(expected, np.squeeze(scores), atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testUnweightedIsDifferentiable(self):
     img = self._LoadTestImages()
     ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
@@ -4021,7 +4175,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     msssim = image_ops.ssim_multiscale(constant_op.constant(img1),
                                        constant_op.constant(img2), 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, msssim.eval(), 1e-4)
+      self.assertAllClose(expected, self.evaluate(msssim), 1e-4)
 
   def testBroadcast(self):
     """Tests MS-SSIM broadcasting."""
@@ -4034,7 +4188,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
 
     score_tensor = image_ops.ssim_multiscale(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, score_tensor.eval(), 1e-4)
+      self.assertAllClose(expected, self.evaluate(score_tensor), 1e-4)
 
   def testRange(self):
     """Tests against low MS-SSIM score.
@@ -4052,12 +4206,13 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
       images = [ops.convert_to_tensor(x, dtype=dtypes.float32) for x in images]
       msssim_ops = [image_ops.ssim_multiscale(x, y, 1.0)
                     for x, y in itertools.combinations(images, 2)]
-      msssim = sess.run(msssim_ops)
+      msssim = self.evaluate(msssim_ops)
       msssim = np.squeeze(msssim)
 
     self.assertTrue(np.all(msssim >= 0.0))
     self.assertTrue(np.all(msssim <= 1.0))
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     img1 = self._RandomImage((1, 180, 240, 3), 255)
     img2 = self._RandomImage((1, 180, 240, 3), 255)
@@ -4068,7 +4223,8 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim_multiscale(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(ssim_uint8.eval(), ssim_float32.eval(), atol=0.001)
+      self.assertAllClose(
+          ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
 
 class ImageGradientsTest(test_util.TensorFlowTestCase):
@@ -4083,8 +4239,8 @@ class ImageGradientsTest(test_util.TensorFlowTestCase):
 
     dy, dx = image_ops.image_gradients(img)
     with self.cached_session():
-      actual_dy = dy.eval()
-      actual_dx = dx.eval()
+      actual_dy = self.evaluate(dy)
+      actual_dx = self.evaluate(dx)
       self.assertAllClose(expected_dy, actual_dy)
       self.assertAllClose(expected_dx, actual_dx)
 
@@ -4108,8 +4264,8 @@ class ImageGradientsTest(test_util.TensorFlowTestCase):
     assert batch.get_shape().as_list() == [2, 2, 3, 2]
     dy, dx = image_ops.image_gradients(batch)
     with self.test_session(use_gpu=True):
-      actual_dy = dy.eval()
-      actual_dx = dx.eval()
+      actual_dy = self.evaluate(dy)
+      actual_dx = self.evaluate(dx)
       self.assertAllClose(expected_dy, actual_dy)
       self.assertAllClose(expected_dx, actual_dx)
 
@@ -4129,7 +4285,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
                            [[0, 0], [0, 12], [0, 0]]], [1, 2, 3, 1, 2])
     sobel = image_ops.sobel_edges(img)
     with self.test_session(use_gpu=True):
-      actual_sobel = sobel.eval()
+      actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected, actual_sobel)
 
   def testSobelEdges5x3x4x2(self):
@@ -4151,7 +4307,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
 
     sobel = image_ops.sobel_edges(img)
     with self.test_session(use_gpu=True):
-      actual_sobel = sobel.eval()
+      actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected_batch, actual_sobel)
 
 
@@ -4164,7 +4320,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16)
       image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0),
                                              dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testPngUint16(self):
@@ -4174,7 +4330,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(png0, dtype=dtypes.uint16)
       image1 = image_ops.convert_image_dtype(
           image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testGifUint16(self):
@@ -4184,7 +4340,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16)
       image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0),
                                              dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testBmpUint16(self):
@@ -4194,7 +4350,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16)
       image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0),
                                              dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testJpegFloat32(self):
@@ -4204,7 +4360,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32)
       image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0),
                                              dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testPngFloat32(self):
@@ -4214,7 +4370,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(png0, dtype=dtypes.float32)
       image1 = image_ops.convert_image_dtype(
           image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testGifFloat32(self):
@@ -4224,7 +4380,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(gif0, dtype=dtypes.float32)
       image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0),
                                              dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testBmpFloat32(self):
@@ -4234,7 +4390,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32)
       image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0),
                                              dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 65bb77b47475d2481cd2cc2eb1113e92f71c41e1..c0a4bcd51dd10f352366b74955241e5f97133130 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -55,6 +55,15 @@ class Initializer(object):
   """
 
   def __call__(self, shape, dtype=None, partition_info=None):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. If not provided use the initializer
+        dtype.
+      partition_info: Optional information about the possible partitioning of a
+        tensor.
+    """
     raise NotImplementedError
 
   def get_config(self):
@@ -143,7 +152,8 @@ class Constant(Initializer):
     value: A Python scalar, list or tuple of values, or a N-dimensional numpy
       array. All elements of the initialized variable will be set to the
       corresponding value in the `value` argument.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer.
     verify_shape: Boolean that enables verification of the shape of `value`. If
       `True`, the initializer will throw an error if the shape of `value` is not
       compatible with the shape of the initialized tensor.
@@ -216,7 +226,7 @@ class Constant(Initializer):
       dtype = self.dtype
     if verify_shape is None:
       verify_shape = self._verify_shape
-    return constant_op.constant(
+    return constant_op.constant_v1(
         self.value, dtype=dtype, shape=shape, verify_shape=verify_shape)
 
   def get_config(self):
@@ -239,7 +249,8 @@ class RandomUniform(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer.
   """
 
   def __init__(self, minval=0, maxval=None, seed=None, dtype=dtypes.float32):
@@ -275,7 +286,8 @@ class RandomNormal(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
   """
 
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
@@ -316,7 +328,8 @@ class TruncatedNormal(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
   """
 
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
@@ -340,8 +353,11 @@ class TruncatedNormal(Initializer):
     }
 
 
-@tf_export("initializers.uniform_unit_scaling",
-           "uniform_unit_scaling_initializer")
+@tf_export(
+    "initializers.uniform_unit_scaling",
+    v1=[
+        "initializers.uniform_unit_scaling", "uniform_unit_scaling_initializer"
+    ])
 @deprecation.deprecated_endpoints("uniform_unit_scaling_initializer")
 class UniformUnitScaling(Initializer):
   """Initializer that generates tensors without scaling variance.
@@ -357,8 +373,7 @@ class UniformUnitScaling(Initializer):
   A similar calculation for convolutional networks gives an analogous result
   with `dim` equal to the product of the first 3 dimensions.  When
   nonlinearities are present, we need to multiply this by a constant `factor`.
-  See [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558)
-  ([pdf](http://arxiv.org/pdf/1412.6558.pdf)) for deeper motivation, experiments
+  See (Sussillo et al., 2014) for deeper motivation, experiments
   and the calculation of constants. In section 2.3 there, the constants were
   numerically computed: for a linear layer it's 1.0, relu: ~1.43, tanh: ~1.15.
 
@@ -367,7 +382,12 @@ class UniformUnitScaling(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558)
+      ([pdf](http://arxiv.org/pdf/1412.6558.pdf))
   """
 
   @deprecated(None,
@@ -401,8 +421,13 @@ class UniformUnitScaling(Initializer):
     return {"factor": self.factor, "seed": self.seed, "dtype": self.dtype.name}
 
 
-@tf_export("keras.initializers.VarianceScaling",
-           "initializers.variance_scaling", "variance_scaling_initializer")
+@tf_export(
+    "keras.initializers.VarianceScaling",
+    "initializers.variance_scaling",
+    v1=[
+        "keras.initializers.VarianceScaling", "initializers.variance_scaling",
+        "variance_scaling_initializer"
+    ])
 @deprecation.deprecated_endpoints("variance_scaling_initializer")
 class VarianceScaling(Initializer):
   """Initializer capable of adapting its scale to the shape of weights tensors.
@@ -426,7 +451,8 @@ class VarianceScaling(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
 
   Raises:
     ValueError: In case of an invalid value for the "scale", mode" or
@@ -472,7 +498,7 @@ class VarianceScaling(Initializer):
     else:
       scale /= max(1., (fan_in + fan_out) / 2.)
     if self.distribution == "normal" or self.distribution == "truncated_normal":
-      # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+    # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
       stddev = math.sqrt(scale) / .87962566103423978
       return random_ops.truncated_normal(
           shape, 0.0, stddev, dtype, seed=self.seed)
@@ -495,8 +521,14 @@ class VarianceScaling(Initializer):
     }
 
 
-@tf_export("keras.initializers.Orthogonal", "initializers.orthogonal",
-           "orthogonal_initializer", "keras.initializers.orthogonal")
+@tf_export(
+    "keras.initializers.Orthogonal",
+    "initializers.orthogonal",
+    "keras.initializers.orthogonal",
+    v1=[
+        "keras.initializers.Orthogonal", "initializers.orthogonal",
+        "orthogonal_initializer", "keras.initializers.orthogonal"
+    ])
 @deprecation.deprecated_endpoints("orthogonal_initializer")
 class Orthogonal(Initializer):
   """Initializer that generates an orthogonal matrix.
@@ -517,7 +549,12 @@ class Orthogonal(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C)
+      ([pdf](https://arxiv.org/pdf/1312.6120.pdf))
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -562,16 +599,21 @@ class ConvolutionDeltaOrthogonal(Initializer):
   The shape of the tensor must have length 3, 4 or 5. The number of input
   filters must not exceed the number of output filters. The center pixels of the
   tensor form an orthogonal matrix. Other pixels are set to be zero. See
-  algorithm 2 in [Xiao et al., 2018]: https://arxiv.org/abs/1806.05393
+  algorithm 2 in (Xiao et al., 2018).
 
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -599,7 +641,7 @@ class ConvolutionDeltaOrthogonal(Initializer):
     d = array_ops.diag_part(r)
     q *= math_ops.sign(d)
     q = q[:shape[-2], :]
-    q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    q *= math_ops.cast(self.gain, dtype=dtype)
     if len(shape) == 3:
       weight = array_ops.scatter_nd([[(shape[0]-1)//2]],
                                     array_ops.expand_dims(q, 0), shape)
@@ -622,12 +664,17 @@ class ConvolutionOrthogonal(Initializer):
   Base class used to construct 1D, 2D and 3D orthogonal kernels for convolution.
 
   Args:
-    gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -684,15 +731,20 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
   filters must not exceed the number of output filters.
   The orthogonality(==isometry) is exact when the inputs are circular padded.
   There are finite-width effects with non-circular padding (e.g. zero padding).
-  See algorithm 1 in [Xiao et al., 2018]: https://arxiv.org/abs/1806.05393
+  See algorithm 1 in (Xiao et al., 2018).
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      This has the effect of scaling the output 2-norm by a factor of
-      `sqrt(gain)`.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. This has the effect of scaling the output 2-norm by
+      a factor of `gain`.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __call__(self, shape, dtype=None, partition_info=None):
@@ -708,7 +760,7 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
       raise ValueError("Kernel sizes must be equal.")
 
     kernel = self._orthogonal_kernel(shape[0], shape[2], shape[3])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    kernel *= math_ops.cast(self.gain, dtype=dtype)
     return kernel
 
   def _dict_to_tensor(self, x, k1, k2):
@@ -820,16 +872,21 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
   filters must not exceed the number of output filters.
   The orthogonality(==isometry) is exact when the inputs are circular padded.
   There are finite-width effects with non-circular padding (e.g. zero padding).
-  See algorithm 1 in [Xiao et al., 2018]: https://arxiv.org/abs/1806.05393
+  See algorithm 1 in (Xiao et al., 2018).
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __call__(self, shape, dtype=None, partition_info=None):
@@ -842,7 +899,7 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
       raise ValueError("In_filters cannot be greater than out_filters.")
 
     kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    kernel *= math_ops.cast(self.gain, dtype=dtype)
     return kernel
 
   def _dict_to_tensor(self, x, k):
@@ -937,15 +994,20 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
   filters must not exceed the number of output filters.
   The orthogonality(==isometry) is exact when the inputs are circular padded.
   There are finite-width effects with non-circular padding (e.g. zero padding).
-  See algorithm 1 [Xiao et al., 2018] in: https://arxiv.org/abs/1806.05393
+  See algorithm 1 (Xiao et al., 2018).
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __call__(self, shape, dtype=None, partition_info=None):
@@ -961,7 +1023,7 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
       raise ValueError("Kernel sizes must be equal.")
 
     kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    kernel *= math_ops.cast(self.gain, dtype=dtype)
     return kernel
 
   def _dict_to_tensor(self, x, k1, k2, k3):
@@ -1091,7 +1153,8 @@ class Identity(Initializer):
 
   Args:
     gain: Multiplicative factor to apply to the identity matrix.
-    dtype: The type of the output.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
   """
 
   def __init__(self, gain=1.0, dtype=dtypes.float32):
@@ -1125,18 +1188,19 @@ class GlorotUniform(VarianceScaling):
   where `fan_in` is the number of input units in the weight tensor
   and `fan_out` is the number of output units in the weight tensor.
 
-  Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
-
   Args:
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+      ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
 
-  def __init__(self,
-               seed=None,
-               dtype=dtypes.float32):
+  def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotUniform, self).__init__(
         scale=1.0,
         mode="fan_avg",
@@ -1145,14 +1209,16 @@ class GlorotUniform(VarianceScaling):
         dtype=dtype)
 
   def get_config(self):
-    return {
-        "seed": self.seed,
-        "dtype": self.dtype.name
-    }
+    return {"seed": self.seed, "dtype": self.dtype.name}
 
 
-@tf_export("glorot_normal_initializer", "keras.initializers.glorot_normal",
-           "initializers.glorot_normal")
+@tf_export(
+    "keras.initializers.glorot_normal",
+    "initializers.glorot_normal",
+    v1=[
+        "glorot_normal_initializer", "keras.initializers.glorot_normal",
+        "initializers.glorot_normal"
+    ])
 @deprecation.deprecated_endpoints("glorot_normal_initializer")
 class GlorotNormal(VarianceScaling):
   """The Glorot normal initializer, also called Xavier normal initializer.
@@ -1162,18 +1228,18 @@ class GlorotNormal(VarianceScaling):
   where `fan_in` is the number of input units in the weight tensor
   and `fan_out` is the number of output units in the weight tensor.
 
-  Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
-
   Args:
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
-    dtype: The data type. Only floating point types are supported.
+      `tf.set_random_seed` for behavior.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+      ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
 
-  def __init__(self,
-               seed=None,
-               dtype=dtypes.float32):
+  def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotNormal, self).__init__(
         scale=1.0,
         mode="fan_avg",
@@ -1182,10 +1248,7 @@ class GlorotNormal(VarianceScaling):
         dtype=dtype)
 
   def get_config(self):
-    return {
-        "seed": self.seed,
-        "dtype": self.dtype.name
-    }
+    return {"seed": self.seed, "dtype": self.dtype.name}
 
 
 # Aliases.
@@ -1225,9 +1288,11 @@ def lecun_normal(seed=None):
       An initializer.
 
   References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-      - [Efficient
-      Backprop](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
+      - Self-Normalizing Neural Networks,
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
+      - Efficient Backprop,
+      [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
   """
   return VarianceScaling(
       scale=1., mode="fan_in", distribution="truncated_normal", seed=seed)
@@ -1248,8 +1313,11 @@ def lecun_uniform(seed=None):
       An initializer.
 
   References:
-      LeCun 98, Efficient Backprop,
-      http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
+      - Self-Normalizing Neural Networks,
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
+      - Efficient Backprop,
+      [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
   """
   return VarianceScaling(
       scale=1., mode="fan_in", distribution="uniform", seed=seed)
@@ -1270,7 +1338,8 @@ def he_normal(seed=None):
       An initializer.
 
   References:
-      He et al., http://arxiv.org/abs/1502.01852
+      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
       scale=2., mode="fan_in", distribution="truncated_normal", seed=seed)
@@ -1291,7 +1360,8 @@ def he_uniform(seed=None):
       An initializer.
 
   References:
-      He et al., http://arxiv.org/abs/1502.01852
+      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
       scale=2., mode="fan_in", distribution="uniform", seed=seed)
diff --git a/tensorflow/python/ops/init_ops_test.py b/tensorflow/python/ops/init_ops_test.py
index 5693c3caaf5ca80fd6528c94bb952acc7bc8957c..1f22248004697438d2c8c05dc0c6762a20902d31 100644
--- a/tensorflow/python/ops/init_ops_test.py
+++ b/tensorflow/python/ops/init_ops_test.py
@@ -45,8 +45,8 @@ class InitializersTest(test.TestCase):
       output = variable.numpy()
     else:
       sess = ops.get_default_session()
-      sess.run(variable.initializer)
-      output = sess.run(variable)
+      self.evaluate(variable.initializer)
+      output = self.evaluate(variable)
     lim = 3e-2
     if target_std is not None:
       self.assertGreater(lim, abs(output.std() - target_std))
diff --git a/tensorflow/python/ops/linalg/BUILD b/tensorflow/python/ops/linalg/BUILD
index c7314d77749130e4696d58896249b73cc2de4a12..5df2d6b83816334f46ef45eec675ed9b7e35bd00 100644
--- a/tensorflow/python/ops/linalg/BUILD
+++ b/tensorflow/python/ops/linalg/BUILD
@@ -18,6 +18,7 @@ py_library(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/ops/signal",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/python/ops/linalg/cholesky_registrations.py b/tensorflow/python/ops/linalg/cholesky_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5284cf22ac2981f79c0d3c7a6a60635c9d0bf02
--- /dev/null
+++ b/tensorflow/python/ops/linalg/cholesky_registrations.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.cholesky."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_block_diag
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_kronecker
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular
+
+
+# By default, compute the Cholesky of the dense matrix, and return a
+# LowerTriangular operator. Methods below specialize this registration.
+@linear_operator_algebra.RegisterCholesky(linear_operator.LinearOperator)
+def _cholesky_linear_operator(linop):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      linalg_ops.cholesky(linop.to_dense()),
+      is_non_singular=True,
+      is_self_adjoint=False,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_diag.LinearOperatorDiag)
+def _cholesky_diag(diag_operator):
+  return linear_operator_diag.LinearOperatorDiag(
+      math_ops.sqrt(diag_operator.diag),
+      is_non_singular=True,
+      is_self_adjoint=True,
+      is_positive_definite=True,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_identity.LinearOperatorIdentity)
+def _cholesky_identity(identity_operator):
+  return linear_operator_identity.LinearOperatorIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      batch_shape=identity_operator.batch_shape,
+      dtype=identity_operator.dtype,
+      is_non_singular=True,
+      is_self_adjoint=True,
+      is_positive_definite=True,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _cholesky_scaled_identity(identity_operator):
+  return linear_operator_identity.LinearOperatorScaledIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      multiplier=math_ops.sqrt(identity_operator.multiplier),
+      is_non_singular=True,
+      is_self_adjoint=True,
+      is_positive_definite=True,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_block_diag.LinearOperatorBlockDiag)
+def _cholesky_block_diag(block_diag_operator):
+    # We take the cholesky of each block on the diagonal.
+  return linear_operator_block_diag.LinearOperatorBlockDiag(
+      operators=[
+          operator.cholesky() for operator in block_diag_operator.operators],
+      is_non_singular=True,
+      is_self_adjoint=False,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_kronecker.LinearOperatorKronecker)
+def _cholesky_kronecker(kronecker_operator):
+    # Cholesky decomposition of a Kronecker product is the Kronecker product
+    # of cholesky decompositions.
+  return linear_operator_kronecker.LinearOperatorKronecker(
+      operators=[
+          operator.cholesky() for operator in kronecker_operator.operators],
+      is_non_singular=True,
+      is_self_adjoint=False,
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index c29b5033bb137e8376e1c19985755b4fc72e8834..ac4fd4ebc6059a187828c757c852a470d8ee69a8 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -20,6 +20,9 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops.linalg import cholesky_registrations as _cholesky_registrations
+from tensorflow.python.ops.linalg import linear_operator_algebra as _linear_operator_algebra
+from tensorflow.python.ops.linalg import matmul_registrations as _matmul_registrations
 from tensorflow.python.ops.linalg.linalg_impl import *
 from tensorflow.python.ops.linalg.linear_operator import *
 from tensorflow.python.ops.linalg.linear_operator_block_diag import *
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index 1e3d81798060548d98487f3426184df2df72f123..df2bd887cdde6f651db572c2bdfebd2bc0170716 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -44,12 +44,14 @@ einsum = special_math_ops.einsum
 eye = linalg_ops.eye
 inv = linalg_ops.matrix_inverse
 logm = gen_linalg_ops.matrix_logarithm
+lu = gen_linalg_ops.lu
 tf_export('linalg.logm')(logm)
 lstsq = linalg_ops.matrix_solve_ls
 norm = linalg_ops.norm
 qr = linalg_ops.qr
 set_diag = array_ops.matrix_set_diag
 solve = linalg_ops.matrix_solve
+sqrtm = linalg_ops.matrix_square_root
 svd = linalg_ops.svd
 tensordot = math_ops.tensordot
 trace = math_ops.trace
@@ -87,7 +89,7 @@ def logdet(matrix, name=None):
     chol = gen_linalg_ops.cholesky(matrix)
     return 2.0 * math_ops.reduce_sum(
         math_ops.log(math_ops.real(array_ops.matrix_diag_part(chol))),
-        reduction_indices=[-1])
+        axis=[-1])
 
 
 @tf_export('linalg.adjoint')
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 20c46fbb82b0671c6cc586eafdd7fa346d8b4e6d..8efafda3a1e7424442163a76aca95d14af4b8a70 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -22,14 +22,17 @@ import abc
 import contextlib
 
 import numpy as np
+import six
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator_algebra
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
@@ -39,6 +42,7 @@ __all__ = ["LinearOperator"]
 
 # TODO(langmore) Use matrix_solve_ls for singular or non-square matrices.
 @tf_export("linalg.LinearOperator")
+@six.add_metaclass(abc.ABCMeta)
 class LinearOperator(object):
   """Base class defining a [batch of] linear operator[s].
 
@@ -139,7 +143,6 @@ class LinearOperator(object):
   * If `is_X == None` (the default), callers should have no expectation either
     way.
   """
-  __metaclass__ = abc.ABCMeta
 
   def __init__(self,
                dtype,
@@ -282,7 +285,7 @@ class LinearOperator(object):
     `[B1,...,Bb, M, N]`, equivalent to `tf.shape(A)`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `int32` `Tensor`
@@ -316,7 +319,7 @@ class LinearOperator(object):
     `[B1,...,Bb]`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `int32` `Tensor`
@@ -338,7 +341,7 @@ class LinearOperator(object):
     `A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       Python integer, or None if the tensor rank is undefined.
@@ -354,7 +357,7 @@ class LinearOperator(object):
     `A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `int32` `Tensor`, determined at runtime.
@@ -397,8 +400,9 @@ class LinearOperator(object):
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
       # Prefer to use statically defined shape if available.
-      if self.domain_dimension.value is not None:
-        return ops.convert_to_tensor(self.domain_dimension.value)
+      dim_value = tensor_shape.dimension_value(self.domain_dimension)
+      if dim_value is not None:
+        return ops.convert_to_tensor(dim_value)
       else:
         return self.shape_tensor()[-1]
 
@@ -413,7 +417,10 @@ class LinearOperator(object):
       `Dimension` object.
     """
     # Derived classes get this "for free" once .shape is implemented.
-    return self.shape[-2]
+    if self.shape.dims:
+      return self.shape.dims[-2]
+    else:
+      return tensor_shape.Dimension(None)
 
   def range_dimension_tensor(self, name="range_dimension_tensor"):
     """Dimension (in the sense of vector spaces) of the range of this operator.
@@ -432,8 +439,9 @@ class LinearOperator(object):
     # Derived classes get this "for free" once .shape() is implemented.
     with self._name_scope(name):
       # Prefer to use statically defined shape if available.
-      if self.range_dimension.value is not None:
-        return ops.convert_to_tensor(self.range_dimension.value)
+      dim_value = tensor_shape.dimension_value(self.range_dimension)
+      if dim_value is not None:
+        return ops.convert_to_tensor(dim_value)
       else:
         return self.shape_tensor()[-2]
 
@@ -574,23 +582,38 @@ class LinearOperator(object):
     ```
 
     Args:
-      x: `Tensor` with compatible shape and same `dtype` as `self`.
-        See class docstring for definition of compatibility.
+      x: `LinearOperator` or `Tensor` with compatible shape and same `dtype` as
+        `self`. See class docstring for definition of compatibility.
       adjoint: Python `bool`.  If `True`, left multiply by the adjoint: `A^H x`.
       adjoint_arg:  Python `bool`.  If `True`, compute `A x^H` where `x^H` is
         the hermitian transpose (transposition and complex conjugation).
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
-      A `Tensor` with shape `[..., M, R]` and same `dtype` as `self`.
+      A `LinearOperator` or `Tensor` with shape `[..., M, R]` and same `dtype`
+        as `self`.
     """
+    if isinstance(x, LinearOperator):
+      if adjoint or adjoint_arg:
+        raise ValueError(".matmul not supported with adjoints.")
+      if (x.range_dimension is not None and
+          self.domain_dimension is not None and
+          x.range_dimension != self.domain_dimension):
+        raise ValueError(
+            "Operators are incompatible. Expected `x` to have dimension"
+            " {} but got {}.".format(self.domain_dimension, x.range_dimension))
+      with self._name_scope(name):
+        return linear_operator_algebra.matmul(self, x)
+
     with self._name_scope(name, values=[x]):
       x = ops.convert_to_tensor(x, name="x")
       self._check_input_dtype(x)
 
       self_dim = -2 if adjoint else -1
       arg_dim = -1 if adjoint_arg else -2
-      self.shape[self_dim].assert_is_compatible_with(x.get_shape()[arg_dim])
+      tensor_shape.dimension_at_index(
+          self.shape, self_dim).assert_is_compatible_with(
+              x.get_shape()[arg_dim])
 
       return self._matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
 
@@ -621,7 +644,7 @@ class LinearOperator(object):
         dimensions, the last dimension defines a vector.
         See class docstring for definition of compatibility.
       adjoint: Python `bool`.  If `True`, left multiply by the adjoint: `A^H x`.
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       A `Tensor` with shape `[..., M]` and same `dtype` as `self`.
@@ -630,7 +653,8 @@ class LinearOperator(object):
       x = ops.convert_to_tensor(x, name="x")
       self._check_input_dtype(x)
       self_dim = -2 if adjoint else -1
-      self.shape[self_dim].assert_is_compatible_with(x.get_shape()[-1])
+      tensor_shape.dimension_at_index(
+          self.shape, self_dim).assert_is_compatible_with(x.get_shape()[-1])
       return self._matvec(x, adjoint=adjoint)
 
   def _determinant(self):
@@ -639,13 +663,13 @@ class LinearOperator(object):
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     if self._can_use_cholesky():
       return math_ops.exp(self.log_abs_determinant())
-    return linalg_ops.matrix_determinant(self._matrix)
+    return linalg_ops.matrix_determinant(self.to_dense())
 
   def determinant(self, name="det"):
     """Determinant for every batch member.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `Tensor` with shape `self.batch_shape` and same `dtype` as `self`.
@@ -666,15 +690,15 @@ class LinearOperator(object):
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     if self._can_use_cholesky():
       diag = array_ops.matrix_diag_part(linalg_ops.cholesky(self.to_dense()))
-      return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1])
-    _, log_abs_det = linalg.slogdet(self._matrix)
+      return 2 * math_ops.reduce_sum(math_ops.log(diag), axis=[-1])
+    _, log_abs_det = linalg.slogdet(self.to_dense())
     return log_abs_det
 
   def log_abs_determinant(self, name="log_abs_det"):
     """Log absolute value of determinant for every batch member.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `Tensor` with shape `self.batch_shape` and same `dtype` as `self`.
@@ -759,7 +783,9 @@ class LinearOperator(object):
 
       self_dim = -1 if adjoint else -2
       arg_dim = -1 if adjoint_arg else -2
-      self.shape[self_dim].assert_is_compatible_with(rhs.get_shape()[arg_dim])
+      tensor_shape.dimension_at_index(
+          self.shape, self_dim).assert_is_compatible_with(
+              rhs.get_shape()[arg_dim])
 
       return self._solve(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
 
@@ -812,10 +838,37 @@ class LinearOperator(object):
       rhs = ops.convert_to_tensor(rhs, name="rhs")
       self._check_input_dtype(rhs)
       self_dim = -1 if adjoint else -2
-      self.shape[self_dim].assert_is_compatible_with(rhs.get_shape()[-1])
+      tensor_shape.dimension_at_index(
+          self.shape, self_dim).assert_is_compatible_with(
+              rhs.get_shape()[-1])
 
       return self._solvevec(rhs, adjoint=adjoint)
 
+  def cholesky(self, name="cholesky"):
+    """Returns a Cholesky factor as a `LinearOperator`.
+
+    Given `A` representing this `LinearOperator`, if `A` is positive definite
+    self-adjoint, return `L`, where `A = L L^T`, i.e. the cholesky
+    decomposition.
+
+    Args:
+      name:  A name for this `Op`.
+
+    Returns:
+      `LinearOperator` which represents the lower triangular matrix
+      in the Cholesky decomposition.
+
+    Raises:
+      ValueError: When the `LinearOperator` is not hinted to be positive
+        definite and self adjoint.
+    """
+
+    if not self._can_use_cholesky():
+      raise ValueError("Cannot take the Cholesky decomposition: "
+                       "Not a positive definite self adjoint matrix.")
+    with self._name_scope(name):
+      return linear_operator_algebra.cholesky(self)
+
   def _to_dense(self):
     """Generic and often inefficient implementation.  Override often."""
     logging.warn("Using (possibly slow) default implementation of to_dense."
@@ -825,8 +878,9 @@ class LinearOperator(object):
     else:
       batch_shape = self.batch_shape_tensor()
 
-    if self.domain_dimension.value is not None:
-      n = self.domain_dimension.value
+    dim_value = tensor_shape.dimension_value(self.domain_dimension)
+    if dim_value is not None:
+      n = dim_value
     else:
       n = self.domain_dimension_tensor()
 
@@ -907,6 +961,4 @@ class LinearOperator(object):
       return self._add_to_tensor(x)
 
   def _can_use_cholesky(self):
-    # TODO(langmore) Add complex types when tf.cholesky can use them.
-    return (not self.dtype.is_complex and self.is_self_adjoint and
-            self.is_positive_definite)
+    return self.is_self_adjoint and self.is_positive_definite
diff --git a/tensorflow/python/ops/linalg/linear_operator_addition.py b/tensorflow/python/ops/linalg/linear_operator_addition.py
index 86130a2c077ce14a7539b281ec809029bc05e071..50baf03c124b29afc49e788e628cad58f177a400 100644
--- a/tensorflow/python/ops/linalg/linear_operator_addition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_addition.py
@@ -23,6 +23,7 @@ import abc
 import six
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops.linalg import linear_operator
@@ -194,14 +195,18 @@ def _static_check_for_same_dimensions(operators):
   if len(operators) < 2:
     return
 
-  domain_dimensions = [(op.name, op.domain_dimension.value) for op in operators
-                       if op.domain_dimension.value is not None]
+  domain_dimensions = [
+      (op.name, tensor_shape.dimension_value(op.domain_dimension))
+      for op in operators
+      if tensor_shape.dimension_value(op.domain_dimension) is not None]
   if len(set(value for name, value in domain_dimensions)) > 1:
     raise ValueError("Operators must have the same domain dimension. Found: %s"
                      % domain_dimensions)
 
-  range_dimensions = [(op.name, op.range_dimension.value) for op in operators
-                      if op.range_dimension.value is not None]
+  range_dimensions = [
+      (op.name, tensor_shape.dimension_value(op.range_dimension))
+      for op in operators
+      if tensor_shape.dimension_value(op.range_dimension) is not None]
   if len(set(value for name, value in range_dimensions)) > 1:
     raise ValueError("Operators must have the same range dimension. Found: %s" %
                      range_dimensions)
diff --git a/tensorflow/python/ops/linalg/linear_operator_adjoint.py b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..858e224b9adda57b4d472ae2f61b2b6cda74c243
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
@@ -0,0 +1,207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Takes the adjoint of a `LinearOperator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = []
+
+
+@tf_export("linalg.LinearOperatorAdjoint")
+class LinearOperatorAdjoint(linear_operator.LinearOperator):
+  """`LinearOperator` representing the adjoint of another operator.
+
+  This operator represents the adjoint of another operator.
+
+  ```python
+  # Create a 2 x 2 linear operator.
+  operator = LinearOperatorFullMatrix([[1 - i., 3.], [0., 1. + i]])
+  operator_adjoint = LinearOperatorAdjoint(operator)
+
+  operator_adjoint.to_dense()
+  ==> [[1. + i, 0.]
+       [3., 1 - i]]
+
+  operator_adjoint.shape
+  ==> [2, 2]
+
+  operator_adjoint.log_abs_determinant()
+  ==> - log(2)
+
+  x = ... Shape [2, 4] Tensor
+  operator_adjoint.matmul(x)
+  ==> Shape [2, 4] Tensor, equal to operator.matmul(x, adjoint=True)
+  ```
+
+  #### Performance
+
+  The performance of `LinearOperatorAdjoint` depends on the underlying
+  operators performance.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               operator,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name=None):
+    r"""Initialize a `LinearOperatorAdjoint`.
+
+    `LinearOperatorAdjoint` is initialized with an operator `A`.  The `solve`
+    and `matmul` methods effectively flip the `adjoint` argument.  E.g.
+
+    ```
+    A = MyLinearOperator(...)
+    B = LinearOperatorAdjoint(A)
+    x = [....]  # a vector
+
+    assert A.matvec(x, adjoint=True) == B.matvec(x, adjoint=False)
+    ```
+
+    Args:
+      operator: `LinearOperator` object.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name: A name for this `LinearOperator`. Default is `operator.name +
+        "_adjoint"`.
+
+    Raises:
+      ValueError:  If `operator.is_non_singular` is False.
+    """
+
+    self._operator = operator
+
+    # The congruency of is_non_singular and is_self_adjoint was checked in the
+    # base operator.
+    def _combined_hint(hint_str, provided_hint_value, message):
+      """Get combined hint in the case where operator.hint should equal hint."""
+      op_hint = getattr(operator, hint_str)
+      if op_hint is False and provided_hint_value:
+        raise ValueError(message)
+      if op_hint and provided_hint_value is False:
+        raise ValueError(message)
+      return (op_hint or provided_hint_value) or None
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its adjoint is square.")
+
+    is_non_singular = _combined_hint(
+        "is_non_singular", is_non_singular,
+        "An operator is non-singular if and only if its adjoint is "
+        "non-singular.")
+
+    is_self_adjoint = _combined_hint(
+        "is_self_adjoint", is_self_adjoint,
+        "An operator is self-adjoint if and only if its adjoint is "
+        "self-adjoint.")
+
+    is_positive_definite = _combined_hint(
+        "is_positive_definite", is_positive_definite,
+        "An operator is positive-definite if and only if its adjoint is "
+        "positive-definite.")
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its adjoint is square.")
+
+    # Initialization.
+    if name is None:
+      name = operator.name + "_adjoint"
+    with ops.name_scope(name, values=operator.graph_parents):
+      super(LinearOperatorAdjoint, self).__init__(
+          dtype=operator.dtype,
+          graph_parents=operator.graph_parents,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  @property
+  def operator(self):
+    """The operator before taking the adjoint."""
+    return self._operator
+
+  def _assert_non_singular(self):
+    return self.operator.assert_non_singular()
+
+  def _assert_positive_definite(self):
+    return self.operator.assert_positive_definite()
+
+  def _assert_self_adjoint(self):
+    return self.operator.assert_self_adjoint()
+
+  def _shape(self):
+    return self.operator.shape
+
+  def _shape_tensor(self):
+    return self.operator.shape_tensor()
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    return self.operator.matmul(
+        x, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
+  def _determinant(self):
+    if self.is_self_adjoint:
+      return self.operator.determinant()
+    return math_ops.conj(self.operator.determinant())
+
+  def _log_abs_determinant(self):
+    return self.operator.log_abs_determinant()
+
+  def _trace(self):
+    if self.is_self_adjoint:
+      return self.operator.trace()
+    return math_ops.conj(self.operator.trace())
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    return self.operator.solve(
+        rhs, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
+  def _to_dense(self):
+    if self.is_self_adjoint:
+      return self.operator.to_dense()
+    return linalg.adjoint(self.operator.to_dense())
diff --git a/tensorflow/python/ops/linalg/linear_operator_algebra.py b/tensorflow/python/ops/linalg/linear_operator_algebra.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b99066e4c121ebd7546dfad1039c0dfa46bca11
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_algebra.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Registration mechanisms for various n-ary operations on LinearOperators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.python.framework import ops
+from tensorflow.python.util import tf_inspect
+
+
+_CHOLESKY_DECOMPS = {}
+_MATMUL = {}
+
+
+def _registered_function(type_list, registry):
+  """Given a list of classes, finds the most specific function registered."""
+  enumerated_hierarchies = [enumerate(tf_inspect.getmro(t)) for t in type_list]
+  # Get all possible combinations of hierarchies.
+  cls_combinations = list(itertools.product(*enumerated_hierarchies))
+
+  def hierarchy_distance(cls_combination):
+    candidate_distance = sum(c[0] for c in cls_combination)
+    if tuple(c[1] for c in cls_combination) in registry:
+      return candidate_distance
+    return 10000
+
+  registered_combination = min(cls_combinations, key=hierarchy_distance)
+  return registry.get(tuple(r[1] for r in registered_combination), None)
+
+
+def _registered_cholesky(type_a):
+  """Get the Cholesky function registered for class a."""
+  return _registered_function([type_a], _CHOLESKY_DECOMPS)
+
+
+def _registered_matmul(type_a, type_b):
+  """Get the Matmul function registered for classes a and b."""
+  return _registered_function([type_a, type_b], _MATMUL)
+
+
+def cholesky(lin_op_a, name=None):
+  """Get the Cholesky factor associated to lin_op_a.
+
+  Args:
+    lin_op_a: The LinearOperator to decompose.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the lower Cholesky factor of `lin_op_a`.
+
+  Raises:
+    NotImplementedError: If no Cholesky method is defined for the LinearOperator
+      type of `lin_op_a`.
+  """
+  cholesky_fn = _registered_cholesky(type(lin_op_a))
+  if cholesky_fn is None:
+    raise ValueError("No cholesky decomposition registered for {}".format(
+        type(lin_op_a)))
+
+  with ops.name_scope(name, "Cholesky"):
+    return cholesky_fn(lin_op_a)
+
+
+def matmul(lin_op_a, lin_op_b, name=None):
+  """Compute lin_op_a.matmul(lin_op_b).
+
+  Args:
+    lin_op_a: The LinearOperator on the left.
+    lin_op_b: The LinearOperator on the right.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the matmul between `lin_op_a` and
+      `lin_op_b`.
+
+  Raises:
+    NotImplementedError: If no matmul method is defined between types of
+      `lin_op_a` and `lin_op_b`.
+  """
+  matmul_fn = _registered_matmul(type(lin_op_a), type(lin_op_b))
+  if matmul_fn is None:
+    raise ValueError("No matmul registered for {}.matmul({})".format(
+        type(lin_op_a), type(lin_op_b)))
+
+  with ops.name_scope(name, "Matmul"):
+    return matmul_fn(lin_op_a, lin_op_b)
+
+
+class RegisterCholesky(object):
+  """Decorator to register a Cholesky implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterCholesky(lin_op.LinearOperatorIdentity)
+  def _cholesky_identity(lin_op_a):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to decompose.
+    """
+    self._key = (lin_op_cls_a,)
+
+  def __call__(self, cholesky_fn):
+    """Perform the Cholesky registration.
+
+    Args:
+      cholesky_fn: The function to use for the Cholesky.
+
+    Returns:
+      cholesky_fn
+
+    Raises:
+      TypeError: if cholesky_fn is not a callable.
+      ValueError: if a Cholesky function has already been registered for
+        the given argument classes.
+    """
+    if not callable(cholesky_fn):
+      raise TypeError(
+          "cholesky_fn must be callable, received: {}".format(cholesky_fn))
+    if self._key in _CHOLESKY_DECOMPS:
+      raise ValueError("Cholesky({}) has already been registered to: {}".format(
+          self._key[0].__name__, _CHOLESKY_DECOMPS[self._key]))
+    _CHOLESKY_DECOMPS[self._key] = cholesky_fn
+    return cholesky_fn
+
+
+class RegisterMatmul(object):
+  """Decorator to register a Matmul implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterMatmul(
+    lin_op.LinearOperatorIdentity,
+    lin_op.LinearOperatorIdentity)
+  def _matmul_identity(a, b):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a, lin_op_cls_b):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to multiply.
+      lin_op_cls_b: the class of the second LinearOperator to multiply.
+    """
+    self._key = (lin_op_cls_a, lin_op_cls_b)
+
+  def __call__(self, matmul_fn):
+    """Perform the Matmul registration.
+
+    Args:
+      matmul_fn: The function to use for the Matmul.
+
+    Returns:
+      matmul_fn
+
+    Raises:
+      TypeError: if matmul_fn is not a callable.
+      ValueError: if a Matmul function has already been registered for
+        the given argument classes.
+    """
+    if not callable(matmul_fn):
+      raise TypeError(
+          "matmul_fn must be callable, received: {}".format(matmul_fn))
+    if self._key in _MATMUL:
+      raise ValueError("Matmul({}, {}) has already been registered.".format(
+          self._key[0].__name__,
+          self._key[1].__name__))
+    _MATMUL[self._key] = matmul_fn
+    return matmul_fn
diff --git a/tensorflow/python/ops/linalg/linear_operator_block_diag.py b/tensorflow/python/ops/linalg/linear_operator_block_diag.py
index 438c3496bdf4277e239c488d947ac743165179a5..b0b418c99706ad9468668d52e48e79f2add7552d 100644
--- a/tensorflow/python/ops/linalg/linear_operator_block_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_block_diag.py
@@ -29,9 +29,7 @@ from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
-__all__ = [
-    "LinearOperatorBlockDiag",
-]
+__all__ = ["LinearOperatorBlockDiag"]
 
 
 @tf_export("linalg.LinearOperatorBlockDiag")
diff --git a/tensorflow/python/ops/linalg/linear_operator_circulant.py b/tensorflow/python/ops/linalg/linear_operator_circulant.py
index 021ef47383673dd1ccd42e58d04631ef2f3b2e7a..b74baa5dfdb0a70f035ee5a2633ba571147aa5e6 100644
--- a/tensorflow/python/ops/linalg/linear_operator_circulant.py
+++ b/tensorflow/python/ops/linalg/linear_operator_circulant.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -39,8 +40,8 @@ __all__ = [
 ]
 
 # Different FFT Ops will be used for different block depths.
-_FFT_OP = {1: math_ops.fft, 2: math_ops.fft2d, 3: math_ops.fft3d}
-_IFFT_OP = {1: math_ops.ifft, 2: math_ops.ifft2d, 3: math_ops.ifft3d}
+_FFT_OP = {1: fft_ops.fft, 2: fft_ops.fft2d, 3: fft_ops.fft3d}
+_IFFT_OP = {1: fft_ops.ifft, 2: fft_ops.ifft2d, 3: fft_ops.ifft3d}
 
 # This is the only dtype allowed with fft ops.
 # TODO(langmore) Add other types once available.
@@ -417,15 +418,13 @@ class _BaseLinearOperatorCirculant(linear_operator.LinearOperator):
     return math_ops.cast(y, self.dtype)
 
   def _determinant(self):
-    reduction_indices = [-(i + 1) for i in range(self.block_depth)]
-    det = math_ops.reduce_prod(
-        self.spectrum, reduction_indices=reduction_indices)
+    axis = [-(i + 1) for i in range(self.block_depth)]
+    det = math_ops.reduce_prod(self.spectrum, axis=axis)
     return math_ops.cast(det, self.dtype)
 
   def _log_abs_determinant(self):
-    reduction_indices = [-(i + 1) for i in range(self.block_depth)]
-    lad = math_ops.reduce_sum(
-        math_ops.log(self._abs_spectrum), reduction_indices=reduction_indices)
+    axis = [-(i + 1) for i in range(self.block_depth)]
+    lad = math_ops.reduce_sum(math_ops.log(self._abs_spectrum), axis=axis)
     return math_ops.cast(lad, self.dtype)
 
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index 0292bc51dcf9809941087dd4aa1ea4c760c064d1..f499b3066129bce83706a94d93d943422ccc1ffd 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -275,6 +275,3 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
     for operator in solve_order_list[1:]:
       solution = operator.solve(solution, adjoint=adjoint)
     return solution
-
-  def _add_to_tensor(self, x):
-    return self.to_dense() + x
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index ed53decc00dc90df5c6c97d9fd9d5cb124ddf660..be893c705c970bcf100a686d64171806e2d9ace6 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -228,11 +228,11 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
     return diag_mat * x
 
   def _determinant(self):
-    return math_ops.reduce_prod(self._diag, reduction_indices=[-1])
+    return math_ops.reduce_prod(self._diag, axis=[-1])
 
   def _log_abs_determinant(self):
     log_det = math_ops.reduce_sum(
-        math_ops.log(math_ops.abs(self._diag)), reduction_indices=[-1])
+        math_ops.log(math_ops.abs(self._diag)), axis=[-1])
     if self.dtype.is_complex:
       log_det = math_ops.cast(log_det, dtype=self.dtype)
     return log_det
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 45929eb4e2e91218784a9fabba23b57851ae3cc8..32b222cb2a685ee3254065dfc26a230482004182 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -76,8 +76,8 @@ class BaseLinearOperatorIdentity(linear_operator.LinearOperator):
 
   def _min_matrix_dim(self):
     """Minimum of domain/range dimension, if statically available, else None."""
-    domain_dim = self.domain_dimension.value
-    range_dim = self.range_dimension.value
+    domain_dim = tensor_shape.dimension_value(self.domain_dimension)
+    range_dim = tensor_shape.dimension_value(self.range_dimension)
     if domain_dim is None or range_dim is None:
       return None
     return min(domain_dim, range_dim)
diff --git a/tensorflow/python/ops/linalg/linear_operator_inversion.py b/tensorflow/python/ops/linalg/linear_operator_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aa4b40e16bd82941357e394101a0a9d55c7a7fe
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_inversion.py
@@ -0,0 +1,207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inverts a non-singular `LinearOperator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = []
+
+
+@tf_export("linalg.LinearOperatorInversion")
+class LinearOperatorInversion(linear_operator.LinearOperator):
+  """`LinearOperator` representing the inverse of another operator.
+
+  This operator represents the inverse of another operator.
+
+  ```python
+  # Create a 2 x 2 linear operator.
+  operator = LinearOperatorFullMatrix([[1., 0.], [0., 2.]])
+  operator_inv = LinearOperatorInversion(operator)
+
+  operator_inv.to_dense()
+  ==> [[1., 0.]
+       [0., 0.5]]
+
+  operator_inv.shape
+  ==> [2, 2]
+
+  operator_inv.log_abs_determinant()
+  ==> - log(2)
+
+  x = ... Shape [2, 4] Tensor
+  operator_inv.matmul(x)
+  ==> Shape [2, 4] Tensor, equal to operator.solve(x)
+  ```
+
+  #### Performance
+
+  The performance of `LinearOperatorInversion` depends on the underlying
+  operators performance:  `solve` and `matmul` are swapped, and determinant is
+  inverted.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               operator,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name=None):
+    r"""Initialize a `LinearOperatorInversion`.
+
+    `LinearOperatorInversion` is initialized with an operator `A`.  The `solve`
+    and `matmul` methods are effectively swapped.  E.g.
+
+    ```
+    A = MyLinearOperator(...)
+    B = LinearOperatorInversion(A)
+    x = [....]  # a vector
+
+    assert A.matvec(x) == B.solvevec(x)
+    ```
+
+    Args:
+      operator: `LinearOperator` object. If `operator.is_non_singular == False`,
+        an exception is raised.  We do allow `operator.is_non_singular == None`,
+        in which case this operator will have `is_non_singular == None`.
+        Similarly for `is_self_adjoint` and `is_positive_definite`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name: A name for this `LinearOperator`. Default is `operator.name +
+        "_inv"`.
+
+    Raises:
+      ValueError:  If `operator.is_non_singular` is False.
+    """
+
+    self._operator = operator
+
+    # Auto-set and check hints.
+    if operator.is_non_singular is False or is_non_singular is False:
+      raise ValueError(
+          "operator and supplied hints must have `is_non_singular` equal to "
+          "`True` or `None`.  Found %s, %s" % (operator.is_non_singular,
+                                               is_non_singular))
+    if operator.is_square is False or is_square is False:
+      raise ValueError(
+          "operator and supplied hints must have `is_square` equal to "
+          "`True` or `None`.  Found %s, %s" % (operator.is_square, is_square))
+
+    # The congruency of is_non_singular and is_self_adjoint was checked in the
+    # base operator.  Other hints are, in this special case of inversion, ones
+    # that must be the same for base/derived operator.
+    def _combined_hint(hint_str, provided_hint_value, message):
+      """Get combined hint in the case where operator.hint should equal hint."""
+      op_hint = getattr(operator, hint_str)
+      if op_hint is False and provided_hint_value:
+        raise ValueError(message)
+      if op_hint and provided_hint_value is False:
+        raise ValueError(message)
+      return (op_hint or provided_hint_value) or None
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its inverse is square.")
+
+    is_non_singular = _combined_hint(
+        "is_non_singular", is_non_singular,
+        "An operator is non-singular if and only if its inverse is "
+        "non-singular.")
+
+    is_self_adjoint = _combined_hint(
+        "is_self_adjoint", is_self_adjoint,
+        "An operator is self-adjoint if and only if its inverse is "
+        "self-adjoint.")
+
+    is_positive_definite = _combined_hint(
+        "is_positive_definite", is_positive_definite,
+        "An operator is positive-definite if and only if its inverse is "
+        "positive-definite.")
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its inverse is square.")
+
+    # Initialization.
+    if name is None:
+      name = operator.name + "_inv"
+    with ops.name_scope(name, values=operator.graph_parents):
+      super(LinearOperatorInversion, self).__init__(
+          dtype=operator.dtype,
+          graph_parents=operator.graph_parents,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  @property
+  def operator(self):
+    """The operator before inversion."""
+    return self._operator
+
+  def _assert_non_singular(self):
+    return self.operator.assert_non_singular()
+
+  def _assert_positive_definite(self):
+    return self.operator.assert_positive_definite()
+
+  def _assert_self_adjoint(self):
+    return self.operator.assert_self_adjoint()
+
+  def _shape(self):
+    return self.operator.shape
+
+  def _shape_tensor(self):
+    return self.operator.shape_tensor()
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    return self.operator.solve(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+  def _determinant(self):
+    return 1. / self.operator.determinant()
+
+  def _log_abs_determinant(self):
+    return -1. * self.operator.log_abs_determinant()
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    return self.operator.matmul(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
diff --git a/tensorflow/python/ops/linalg/linear_operator_kronecker.py b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
index 1fd5073c17832f0689616f2842c33c95d186e487..f7e785caa5d8cc290f037944378f709633423a74 100644
--- a/tensorflow/python/ops/linalg/linear_operator_kronecker.py
+++ b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
@@ -30,9 +30,7 @@ from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.util.tf_export import tf_export
 
-__all__ = [
-    "LinearOperatorKronecker",
-]
+__all__ = ["LinearOperatorKronecker"]
 
 
 def _vec(x):
diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
index 2b2bf80f276a62d20aae717ac9fa08f9769f455e..aa0500aff06e0c9eddf7a3059ebf9480b670ca9d 100644
--- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
+++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -275,11 +276,13 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     batch_shape = array_ops.broadcast_static_shape(
         self.base_operator.batch_shape, uv_shape[:-2])
 
-    self.base_operator.domain_dimension.assert_is_compatible_with(
-        uv_shape[-2])
+    tensor_shape.Dimension(
+        self.base_operator.domain_dimension).assert_is_compatible_with(
+            uv_shape[-2])
 
     if self._diag_update is not None:
-      uv_shape[-1].assert_is_compatible_with(self._diag_update.get_shape()[-1])
+      tensor_shape.dimension_at_index(uv_shape, -1).assert_is_compatible_with(
+          self._diag_update.get_shape()[-1])
       array_ops.broadcast_static_shape(
           batch_shape, self._diag_update.get_shape()[:-1])
 
@@ -291,8 +294,8 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
       self._diag_inv_operator = linear_operator_diag.LinearOperatorDiag(
           1. / self._diag_update, is_positive_definite=is_diag_update_positive)
     else:
-      if self.u.get_shape()[-1].value is not None:
-        r = self.u.get_shape()[-1].value
+      if tensor_shape.dimension_value(self.u.shape[-1]) is not None:
+        r = tensor_shape.dimension_value(self.u.shape[-1])
       else:
         r = array_ops.shape(self.u)[-1]
       self._diag_operator = linear_operator_identity.LinearOperatorIdentity(
@@ -388,7 +391,7 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     if self._use_cholesky:
       chol_cap_diag = array_ops.matrix_diag_part(self._chol_capacitance)
       log_abs_det_c = 2 * math_ops.reduce_sum(
-          math_ops.log(chol_cap_diag), reduction_indices=[-1])
+          math_ops.log(chol_cap_diag), axis=[-1])
     else:
       det_c = linalg_ops.matrix_determinant(self._capacitance)
       log_abs_det_c = math_ops.log(math_ops.abs(det_c))
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index ca6d3f54051d7bf0ff748804d3cd314b144c2f88..d33fe17e042bfc53ab2f53aa6f79ee5dfa24c4a2 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -195,11 +195,11 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
         self._tril, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _determinant(self):
-    return math_ops.reduce_prod(self._diag, reduction_indices=[-1])
+    return math_ops.reduce_prod(self._diag, axis=[-1])
 
   def _log_abs_determinant(self):
     return math_ops.reduce_sum(
-        math_ops.log(math_ops.abs(self._diag)), reduction_indices=[-1])
+        math_ops.log(math_ops.abs(self._diag)), axis=[-1])
 
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
     rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index 76d659f1097579a9b5c92a90938f71b90268503f..e50f572b5f431ae8b7cf3470ee799f170e83656c 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -102,7 +102,9 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     raise NotImplementedError("operator_build_infos has not been implemented.")
 
   @abc.abstractmethod
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     """Build a batch matrix and an Operator that should have similar behavior.
 
     Every operator acts like a (batch) matrix.  This method returns both
@@ -114,6 +116,11 @@ class LinearOperatorDerivedClassTest(test.TestCase):
       dtype:  Numpy dtype.  Data type of returned array/operator.
       use_placeholder:  Python bool.  If True, initialize the operator with a
         placeholder of undefined shape and correct dtype.
+      ensure_self_adjoint_and_pd: If `True`,
+        construct this operator to be Hermitian Positive Definite, as well
+        as ensuring the hints `is_positive_definite` and `is_self_adjoint`
+        are set.
+        This is useful for testing methods such as `cholesky`.
 
     Returns:
       operator:  `LinearOperator` subclass instance.
@@ -271,6 +278,21 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
     self._test_matmul(with_batch=False)
 
+  def test_cholesky(self):
+    self._skip_if_tests_to_skip_contains("cholesky")
+    for use_placeholder in self._use_placeholder_options:
+      for build_info in self._operator_build_infos:
+        for dtype in self._dtypes_to_test:
+          with self.test_session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat = self._operator_and_matrix(
+                build_info, dtype, use_placeholder=use_placeholder,
+                ensure_self_adjoint_and_pd=True)
+            op_chol = operator.cholesky().to_dense()
+            mat_chol = linalg_ops.cholesky(mat)
+            op_chol_v, mat_chol_v = sess.run([op_chol, mat_chol])
+            self.assertAC(mat_chol_v, op_chol_v)
+
   def _test_solve(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
@@ -441,7 +463,7 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["solve", "solve_with_broadcast", "det", "log_abs_det"]
+    return ["cholesky", "solve", "solve_with_broadcast", "det", "log_abs_det"]
 
   @property
   def _operator_build_infos(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py
index 9dd40765c20222c6998260547b7e8fa341e65437..54d04e4a70bc65e0053575e7761680894e3702e5 100644
--- a/tensorflow/python/ops/linalg/linear_operator_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_util.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -25,6 +27,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
 
 
 def assert_no_entries_with_modulus_zero(
@@ -233,9 +236,9 @@ def matmul_with_broadcast(a,
   """Multiplies matrix `a` by matrix `b`, producing `a @ b`.
 
   Works identically to `tf.matmul`, but broadcasts batch dims
-  of `a` and `b` (by replicating) if they are determined statically to be
-  different, or if static shapes are not fully defined.  Thus, this may result
-  in an inefficient replication of data.
+  of `a` and `b` if they are determined statically to be different, or if static
+  shapes are not fully defined. Attempts are made to avoid unnecessary
+  replication of data, but this is not always possible.
 
   The inputs must be matrices (or tensors of rank > 2, representing batches of
   matrices).
@@ -308,23 +311,51 @@ def matmul_with_broadcast(a,
       are both set to True.
   """
   with ops.name_scope(name, "MatMulWithBroadcast", [a, b]):
-    a, b = broadcast_matrix_batch_dims([a, b])
-    return math_ops.matmul(
+    a = ops.convert_to_tensor(a, name="a")
+    b = ops.convert_to_tensor(b, name="b", dtype=a.dtype)
+
+    # If either a or b has extra dims, we can reshape to get rid of them.
+    a, b, reshape_inv, still_need_to_transpose = _reshape_for_efficiency(
         a,
         b,
         transpose_a=transpose_a,
         transpose_b=transpose_b,
         adjoint_a=adjoint_a,
-        adjoint_b=adjoint_b,
+        adjoint_b=adjoint_b)
+
+    # This will broadcast by brute force if we still need to.
+    a, b = broadcast_matrix_batch_dims([a, b])
+
+    a_times_b = math_ops.matmul(
+        a,
+        b,
+        transpose_a=transpose_a and still_need_to_transpose,
+        transpose_b=transpose_b and still_need_to_transpose,
+        adjoint_a=adjoint_a and still_need_to_transpose,
+        adjoint_b=adjoint_b and still_need_to_transpose,
         a_is_sparse=a_is_sparse,
         b_is_sparse=b_is_sparse)
 
+    return reshape_inv(a_times_b)
+
 
 def matrix_solve_with_broadcast(matrix, rhs, adjoint=False, name=None):
   """Solve systems of linear equations."""
   with ops.name_scope(name, "MatrixSolveWithBroadcast", [matrix, rhs]):
+    matrix = ops.convert_to_tensor(matrix, name="matrix")
+    rhs = ops.convert_to_tensor(rhs, name="rhs", dtype=matrix.dtype)
+
+    # If either matrix/rhs has extra dims, we can reshape to get rid of them.
+    matrix, rhs, reshape_inv, still_need_to_transpose = _reshape_for_efficiency(
+        matrix, rhs, adjoint_a=adjoint)
+
+    # This will broadcast by brute force if we still need to.
     matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
-    return linalg_ops.matrix_solve(matrix, rhs, adjoint=adjoint)
+
+    solution = linalg_ops.matrix_solve(
+        matrix, rhs, adjoint=adjoint and still_need_to_transpose)
+
+    return reshape_inv(solution)
 
 
 def matrix_triangular_solve_with_broadcast(matrix,
@@ -354,9 +385,119 @@ def matrix_triangular_solve_with_broadcast(matrix,
     `Tensor` with same `dtype` as `matrix` and shape `[..., M, K]`.
   """
   with ops.name_scope(name, "MatrixTriangularSolve", [matrix, rhs]):
+    matrix = ops.convert_to_tensor(matrix, name="matrix")
+    rhs = ops.convert_to_tensor(rhs, name="rhs", dtype=matrix.dtype)
+
+    # If either matrix/rhs has extra dims, we can reshape to get rid of them.
+    matrix, rhs, reshape_inv, still_need_to_transpose = _reshape_for_efficiency(
+        matrix, rhs, adjoint_a=adjoint)
+
+    # lower indicates whether the matrix is lower triangular. If we have
+    # manually taken adjoint inside _reshape_for_efficiency, it is now upper tri
+    if not still_need_to_transpose and adjoint:
+      lower = not lower
+
+    # This will broadcast by brute force if we still need to.
     matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
-    return linalg_ops.matrix_triangular_solve(
+
+    solution = linalg_ops.matrix_triangular_solve(
         matrix,
         rhs,
         lower=lower,
-        adjoint=adjoint)
+        adjoint=adjoint and still_need_to_transpose)
+
+    return reshape_inv(solution)
+
+
+def _reshape_for_efficiency(a,
+                            b,
+                            transpose_a=False,
+                            transpose_b=False,
+                            adjoint_a=False,
+                            adjoint_b=False):
+  """Maybe reshape a, b, and return an inverse map.  For matmul/solve."""
+  def identity(x):
+    return x
+
+  # At this point, we have not taken transpose/adjoint of a/b.
+  still_need_to_transpose = True
+
+  if a.shape.ndims is None or b.shape.ndims is None:
+    return a, b, identity, still_need_to_transpose
+
+  # This could be handled in the future, but seems less common.
+  if a.shape.ndims >= b.shape.ndims:
+    return a, b, identity, still_need_to_transpose
+
+  # From now on, we might modify b, but will not modify a.
+
+  # Suppose:
+  #   a.shape =     C + [m, n], b.shape =
+  #   b.shape = S + C + [n, r]
+  b_extra_ndims = b.shape.ndims - a.shape.ndims
+
+  # b_extra_sh = S, b_main_sh = C + [n, r]
+  b_extra_sh = array_ops.shape(b)[:b_extra_ndims]
+  b_main_sh = array_ops.shape(b)[b_extra_ndims:]
+
+  # No reason to flip unless the extra dims of b are big enough.  Why?
+  # Assume adjoint/transpose = False.  Then...
+  # By not flipping, we have to replicate a to shape
+  #   b_extra_sh + a.shape,
+  # which could use extra memory.  But in all cases, the final output has shape
+  #   b_extra_sh + a.shape[:-1] + [b.shape[-1]]
+  # So we only end up creating a larger object if the end dim of b is smaller
+  # than the end dim of a.  This often happens, e.g. if b was a vector that was
+  # expanded to a matrix (by appending a singleton).
+
+  # Since adjoint/transpose may not be False, we must make adjustments here.
+  # The dim of b that holds the multiple equations.
+  a_domain_sz_ = a.shape[-2 if adjoint_a or transpose_a else -1]
+  b_eq_sz_ = b.shape[-2 if adjoint_b or transpose_b else -1]
+  b_extra_sz_ = (
+      np.prod(b.shape[:b_extra_ndims].as_list())
+      if b.shape[:b_extra_ndims].is_fully_defined() else None)
+  if (a_domain_sz_ is not None and b_eq_sz_ is not None and
+      b_extra_sz_ is not None):
+    if b_extra_sz_ < 2 or a_domain_sz_ <= b_eq_sz_:
+      return a, b, identity, still_need_to_transpose
+
+  # At this point, we're flipping for sure!
+  # Any transposes/adjoints will happen here explicitly, rather than in calling
+  # code.  Why?  To avoid having to write separate complex code for each case.
+  if adjoint_a:
+    a = linalg.adjoint(a)
+  elif transpose_a:
+    a = linalg.transpose(a)
+  if adjoint_b:
+    b = linalg.adjoint(b)
+  elif transpose_b:
+    b = linalg.transpose(b)
+  still_need_to_transpose = False
+
+  # Recompute shapes, since the transpose/adjoint may have changed them.
+  b_extra_sh = array_ops.shape(b)[:b_extra_ndims]
+  b_main_sh = array_ops.shape(b)[b_extra_ndims:]
+
+  # Permutation to put the extra dims at the end.
+  perm = (
+      array_ops.concat(
+          (math_ops.range(b_extra_ndims, b.shape.ndims),
+           math_ops.range(0, b_extra_ndims)), 0))
+  b_extra_on_end = array_ops.transpose(b, perm=perm)
+
+  # Now squash this end into one long dim.
+  b_squashed_end = array_ops.reshape(
+      b_extra_on_end, array_ops.concat((b_main_sh[:-1], [-1]), 0))
+
+  def reshape_inv(y):
+    # Expand the extra dims hanging off the end, "b_extra_sh".
+    # Note we use y_sh[:-1] + [b_main_sh[-1]] rather than b_main_sh, because y
+    # Could have different batch dims than a and b, because of broadcasting.
+    y_extra_shape = array_ops.concat(
+        (array_ops.shape(y)[:-1], [b_main_sh[-1]], b_extra_sh), 0)
+    y_extra_on_end = array_ops.reshape(y, y_extra_shape)
+    return array_ops.transpose(
+        y_extra_on_end, perm=array_ops.invert_permutation(perm))
+
+  return a, b_squashed_end, reshape_inv, still_need_to_transpose
diff --git a/tensorflow/python/ops/linalg/matmul_registrations.py b/tensorflow/python/ops/linalg/matmul_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0ac988ba274dd99b03733eff38b07055d68543b
--- /dev/null
+++ b/tensorflow/python/ops/linalg/matmul_registrations.py
@@ -0,0 +1,252 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.matmul."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_composition
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular
+from tensorflow.python.ops.linalg import linear_operator_zeros
+
+
+def _combined_self_adjoint_hint(operator_a, operator_b):
+  """Get combined hint for self-adjoint-ness."""
+  # Note: only use this method in the commuting case.
+  # The property is preserved under composition when the operators commute.
+  if operator_a.is_self_adjoint and operator_b.is_self_adjoint:
+    return True
+
+  # The property is not preserved when an operator with the property is composed
+  # with an operator without the property.
+  if ((operator_a.is_self_adjoint is True and
+       operator_b.is_self_adjoint is False) or
+      (operator_a.is_self_adjoint is False and
+       operator_b.is_self_adjoint is True)):
+    return False
+
+  # The property is not known when operators are not known to have the property
+  # or both operators don't have the property (the property for the complement
+  # class is not closed under composition).
+  return None
+
+
+def _is_square(operator_a, operator_b):
+  """Return a hint to whether the composition is square."""
+  if operator_a.is_square and operator_b.is_square:
+    return True
+  if operator_a.is_square is False and operator_b.is_square is False:
+    # Let A have shape [B, M, N], B have shape [B, N, L].
+    m = operator_a.range_dimension
+    l = operator_b.domain_dimension
+    if m is not None and l is not None:
+      return m == l
+
+    return None
+
+
+def _combined_positive_definite_hint(operator_a, operator_b):
+  """Get combined PD hint for compositions."""
+  # Note: Positive definiteness is only guaranteed to be preserved
+  # when the operators commute and are symmetric. Only use this method in
+  # commuting cases.
+
+  if (operator_a.is_positive_definite is True and
+      operator_a.is_self_adjoint is True and
+      operator_b.is_positive_definite is True and
+      operator_b.is_self_adjoint is True):
+    return True
+
+  return None
+
+
+def _combined_non_singular_hint(operator_a, operator_b):
+  """Get combined hint for when ."""
+  # If either operator is not-invertible the composition isn't.
+  if (operator_a.is_non_singular is False or
+      operator_b.is_non_singular is False):
+    return False
+
+  return operator_a.is_non_singular and operator_b.is_non_singular
+
+
+# By default, use a LinearOperatorComposition to delay the computation.
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator.LinearOperator, linear_operator.LinearOperator)
+def _matmul_linear_operator(linop_a, linop_b):
+  """Generic matmul of two `LinearOperator`s."""
+  is_square = _is_square(linop_a, linop_b)
+  is_non_singular = None
+  is_self_adjoint = None
+  is_positive_definite = None
+
+  if is_square:
+    is_non_singular = _combined_non_singular_hint(linop_a, linop_b)
+    is_self_adjoint = _combined_self_adjoint_hint(linop_a, linop_b)
+  elif is_square is False:
+    is_non_singular = False
+    is_self_adjoint = False
+    is_positive_definite = False
+
+  return linear_operator_composition.LinearOperatorComposition(
+      operators=[linop_a, linop_b],
+      is_non_singular=is_non_singular,
+      is_self_adjoint=is_self_adjoint,
+      is_positive_definite=is_positive_definite,
+      is_square=is_square,
+  )
+
+# Identity
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_identity.LinearOperatorIdentity,
+    linear_operator.LinearOperator)
+def _matmul_linear_operator_identity_left(identity, linop):
+  del identity
+  return linop
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator.LinearOperator,
+    linear_operator_identity.LinearOperatorIdentity)
+def _matmul_linear_operator_identity_right(linop, identity):
+  del identity
+  return linop
+
+
+# Zeros
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator.LinearOperator,
+    linear_operator_zeros.LinearOperatorZeros)
+def _matmul_linear_operator_zeros_right(linop, zeros):
+  if not zeros.is_square or not linop.is_square:
+    raise ValueError("Matmul with non-square `LinearOperator`s or non-square "
+                     "`LinearOperatorZeros` not supported at this time.")
+  return zeros
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_zeros.LinearOperatorZeros,
+    linear_operator.LinearOperator)
+def _matmul_linear_operator_zeros_left(zeros, linop):
+  if not zeros.is_square or not linop.is_square:
+    raise ValueError("Matmul with non-square `LinearOperator`s or non-square "
+                     "`LinearOperatorZeros` not supported at this time.")
+  return zeros
+
+
+# Diag.
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_diag.LinearOperatorDiag)
+def _matmul_linear_operator_diag(linop_a, linop_b):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_a.diag * linop_b.diag,
+      is_non_singular=_combined_non_singular_hint(linop_a, linop_b),
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_a, linop_b),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_a, linop_b),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _matmul_linear_operator_diag_scaled_identity_right(
+    linop_diag, linop_scaled_identity):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_diag.diag * linop_scaled_identity.multiplier,
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_scaled_identity),
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_scaled_identity),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_diag, linop_scaled_identity),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_identity.LinearOperatorScaledIdentity,
+    linear_operator_diag.LinearOperatorDiag)
+def _matmul_linear_operator_diag_scaled_identity_left(
+    linop_scaled_identity, linop_diag):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_diag.diag * linop_scaled_identity.multiplier,
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_scaled_identity),
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_scaled_identity),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_diag, linop_scaled_identity),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_lower_triangular.LinearOperatorLowerTriangular)
+def _matmul_linear_operator_diag_tril(linop_diag, linop_triangular):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      tril=linop_diag.diag[..., None] * linop_triangular.to_dense(),
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_triangular),
+      # This is safe to do since the Triangular matrix is only self-adjoint
+      # when it is a diagonal matrix, and hence commutes.
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_triangular),
+      is_positive_definite=None,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_lower_triangular.LinearOperatorLowerTriangular,
+    linear_operator_diag.LinearOperatorDiag)
+def _matmul_linear_operator_tril_diag(linop_triangular, linop_diag):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      tril=linop_triangular.to_dense() * linop_diag.diag,
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_triangular),
+      # This is safe to do since the Triangular matrix is only self-adjoint
+      # when it is a diagonal matrix, and hence commutes.
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_triangular),
+      is_positive_definite=None,
+      is_square=True)
+
+# Circulant.
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_circulant.LinearOperatorCirculant,
+    linear_operator_circulant.LinearOperatorCirculant)
+def _matmul_linear_operator_circulant_circulant(linop_a, linop_b):
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=linop_a.spectrum * linop_b.spectrum,
+      is_non_singular=_combined_non_singular_hint(linop_a, linop_b),
+      is_self_adjoint=_combined_self_adjoint_hint(linop_a, linop_b),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_a, linop_b),
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index b6b98d5c86fd3285b35377c9158dcdb649b88a83..290d1fc5030023c426d45116f57b263576833fc3 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -55,6 +55,71 @@ def _MatrixDeterminantGrad(op, grad):
   return multipliers * a_adj_inv
 
 
+@ops.RegisterGradient("MatrixSquareRoot")
+def _MatrixSquareRootGrad(op, grad):
+  """Gradient for MatrixSquareRoot."""
+
+  # Let A be an m x m square matrix (or batch of matrices)
+  # Let R = sqrtm(A)
+  # By definition, A = RR
+  # Take the differential: dA = d(RR) = RdR + dRR
+  # Solve the resulting Sylvester equation for dR
+
+  # Used to find Kronecker products within the Sylvester equation
+  def _KroneckerProduct(b1, b2):
+    """Computes the Kronecker product of two batches of square matrices"""
+    b1_shape = array_ops.shape(b1)
+    b2_shape = array_ops.shape(b2)
+    b1_order = b1_shape[-1]
+    b2_order = b2_shape[-1]
+
+    shape_slice_size = [math_ops.subtract(array_ops.size(b1_shape), 2)]
+    shape_slice = array_ops.slice(b1_shape, [0],
+                                  shape_slice_size)  # Same for both batches
+    b1_reshape_shape = array_ops.concat(
+        [shape_slice, [b1_order], [1], [b1_order], [1]], 0)
+    b2_reshape_shape = array_ops.concat(
+        [shape_slice, [1], [b2_order], [1], [b2_order]], 0)
+
+    b1_reshape = array_ops.reshape(b1, b1_reshape_shape)
+    b2_reshape = array_ops.reshape(b2, b2_reshape_shape)
+
+    order_prod = b1_order * b2_order
+    kprod_shape = array_ops.concat([shape_slice, [order_prod], [order_prod]], 0)
+    return array_ops.reshape(b1_reshape * b2_reshape, kprod_shape)
+
+  sqrtm = op.outputs[0]  # R
+  shape = array_ops.shape(sqrtm)
+  order = shape[-1]  # m
+  matrix_count = math_ops.reduce_prod(shape[0:-2])
+
+  # Get batch of m x m identity matrices
+  eye = linalg_ops.eye(order, dtype=sqrtm.dtype)  # m x m identity matrix
+  eye_flat = array_ops.reshape(eye, [-1])
+  eye_tiled = array_ops.tile(eye_flat, [matrix_count])
+  eye_batch = array_ops.reshape(eye_tiled, shape)
+
+  # The transpose of R is taken in the k1 term instead of k2 in
+  # order to prevent redundant transposition of R (i.e. (R')' = R)
+  sqrtm_transpose = array_ops.matrix_transpose(sqrtm)
+  k1 = _KroneckerProduct(eye_batch, sqrtm_transpose)
+  k2 = _KroneckerProduct(sqrtm, eye_batch)
+  ksum = math_ops.add(k1, k2)
+
+  # Vectorize dA
+  shape_slice_size = [math_ops.subtract(array_ops.size(shape), 2)]
+  shape_slice = array_ops.slice(shape, [0], shape_slice_size)
+  shape_vec_da = array_ops.concat([shape_slice, [order * order], [1]], 0)
+  vec_da = array_ops.reshape(array_ops.matrix_transpose(grad), shape_vec_da)
+
+  # Solve for vec(dR)
+  vec_dsqrtm = linalg_ops.matrix_solve(ksum, vec_da)
+
+  # Solve for dR by inverse vectorizing vec(dR)
+  dsqrtm_transpose = array_ops.reshape(vec_dsqrtm, shape)
+  return array_ops.matrix_transpose(dsqrtm_transpose)
+
+
 @ops.RegisterGradient("LogMatrixDeterminant")
 def _LogMatrixDeterminantGrad(op, _, grad_b):
   """Gradient for LogMatrixDeterminant."""
@@ -101,7 +166,7 @@ def _QrGrad(op, dq, dr):
   if (r.shape.ndims is None or r.shape.as_list()[-2] is None or
       r.shape.as_list()[-1] is None):
     raise NotImplementedError("QrGrad not implemented with dynamic shapes.")
-  if r.shape[-2].value != r.shape[-1].value:
+  if r.shape.dims[-2].value != r.shape.dims[-1].value:
     raise NotImplementedError("QrGrad not implemented when ncols > nrows "
                               "or full_matrices is true and ncols != nrows.")
 
@@ -305,14 +370,14 @@ def _SvdGrad(op, grad_s, grad_u, grad_v):
         "compute_uv=True.")
   grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
   grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
-  m = a_shape[-2].merge_with(grad_u_shape[-2])
-  n = a_shape[-1].merge_with(grad_v_shape[-2])
+  m = a_shape.dims[-2].merge_with(grad_u_shape[-2])
+  n = a_shape.dims[-1].merge_with(grad_v_shape[-2])
   batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
       grad_v_shape[:-2])
   a_shape = batch_shape.concatenate([m, n])
 
-  m = a_shape[-2].value
-  n = a_shape[-1].value
+  m = a_shape.dims[-2].value
+  n = a_shape.dims[-1].value
   # TODO(rmlarsen): Make this work with placeholders.
   if m is None or n is None:
     raise NotImplementedError(
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index bf4354fa7313e16af30c5974aff2316143d10b2f..1a9e7112b45cacb711ac176b92cb3bef0dc72f00 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -79,7 +79,8 @@ def _RegularizedGramianCholesky(matrix, l2_regularizer, first_kind):
   return gen_linalg_ops.cholesky(gramian)
 
 
-@tf_export('cholesky_solve', 'linalg.cholesky_solve')
+@tf_export(
+    'linalg.cholesky_solve', v1=['linalg.cholesky_solve', 'cholesky_solve'])
 @deprecation.deprecated_endpoints('cholesky_solve')
 def cholesky_solve(chol, rhs, name=None):
   """Solves systems of linear eqns `A X = RHS`, given Cholesky factorizations.
@@ -168,7 +169,7 @@ def eye(num_rows,
                              name=name)
 
 
-@tf_export('linalg.lstsq', 'matrix_solve_ls')
+@tf_export('linalg.lstsq', v1=['linalg.lstsq', 'matrix_solve_ls'])
 @deprecation.deprecated_endpoints('matrix_solve_ls')
 def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None):
   r"""Solves one or more linear least-squares problems.
@@ -305,7 +306,7 @@ def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None):
         matrix, rhs, l2_regularizer, fast=fast, name=name)
 
 
-@tf_export('linalg.eigh', 'self_adjoint_eig')
+@tf_export('linalg.eigh', v1=['linalg.eigh', 'self_adjoint_eig'])
 @deprecation.deprecated_endpoints('self_adjoint_eig')
 def self_adjoint_eig(tensor, name=None):
   """Computes the eigen decomposition of a batch of self-adjoint matrices.
@@ -328,7 +329,7 @@ def self_adjoint_eig(tensor, name=None):
   return e, v
 
 
-@tf_export('linalg.eigvalsh', 'self_adjoint_eigvals')
+@tf_export('linalg.eigvalsh', v1=['linalg.eigvalsh', 'self_adjoint_eigvals'])
 @deprecation.deprecated_endpoints('self_adjoint_eigvals')
 def self_adjoint_eigvals(tensor, name=None):
   """Computes the eigenvalues of one or more self-adjoint matrices.
@@ -351,7 +352,7 @@ def self_adjoint_eigvals(tensor, name=None):
   return e
 
 
-@tf_export('svd', 'linalg.svd')
+@tf_export('linalg.svd', v1=['linalg.svd', 'svd'])
 @deprecation.deprecated_endpoints('svd')
 def svd(tensor, full_matrices=False, compute_uv=True, name=None):
   r"""Computes the singular value decompositions of one or more matrices.
@@ -422,7 +423,78 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
 
 
 # pylint: disable=redefined-builtin
-@tf_export('norm', 'linalg.norm')
+@tf_export('norm', 'linalg.norm', v1=[])
+def norm_v2(tensor,
+            ord='euclidean',
+            axis=None,
+            keepdims=None,
+            name=None):
+  r"""Computes the norm of vectors, matrices, and tensors.
+
+  This function can compute several different vector norms (the 1-norm, the
+  Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
+  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).
+
+  Args:
+    tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
+    ord: Order of the norm. Supported values are 'fro', 'euclidean',
+      `1`, `2`, `np.inf` and any positive real number yielding the corresponding
+      p-norm. Default is 'euclidean' which is equivalent to Frobenius norm if
+      `tensor` is a matrix and equivalent to 2-norm for vectors.
+      Some restrictions apply:
+        a) The Frobenius norm `fro` is not defined for vectors,
+        b) If axis is a 2-tuple (matrix norm), only 'euclidean', 'fro', `1`,
+           `2`, `np.inf` are supported.
+      See the description of `axis` on how to compute norms for a batch of
+      vectors or matrices stored in a tensor.
+    axis: If `axis` is `None` (the default), the input is considered a vector
+      and a single vector norm is computed over the entire set of values in the
+      tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
+      `norm(reshape(tensor, [-1]), ord=ord)`.
+      If `axis` is a Python integer, the input is considered a batch of vectors,
+      and `axis` determines the axis in `tensor` over which to compute vector
+      norms.
+      If `axis` is a 2-tuple of Python integers it is considered a batch of
+      matrices and `axis` determines the axes in `tensor` over which to compute
+      a matrix norm.
+      Negative indices are supported. Example: If you are passing a tensor that
+      can be either a matrix or a batch of matrices at runtime, pass
+      `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
+      computed.
+    keepdims: If True, the axis indicated in `axis` are kept with size 1.
+      Otherwise, the dimensions in `axis` are removed from the output shape.
+    name: The name of the op.
+
+  Returns:
+    output: A `Tensor` of the same type as tensor, containing the vector or
+      matrix norms. If `keepdims` is True then the rank of output is equal to
+      the rank of `tensor`. Otherwise, if `axis` is none the output is a scalar,
+      if `axis` is an integer, the rank of `output` is one less than the rank
+      of `tensor`, if `axis` is a 2-tuple the rank of `output` is two less
+      than the rank of `tensor`.
+
+  Raises:
+    ValueError: If `ord` or `axis` is invalid.
+
+  @compatibility(numpy)
+  Mostly equivalent to numpy.linalg.norm.
+  Not supported: ord <= 0, 2-norm for matrices, nuclear norm.
+  Other differences:
+    a) If axis is `None`, treats the flattened `tensor` as a vector
+     regardless of rank.
+    b) Explicitly supports 'euclidean' norm as the default, including for
+     higher order tensors.
+  @end_compatibility
+  """
+  return norm(tensor=tensor,
+              ord=ord,
+              axis=axis,
+              keepdims=keepdims,
+              name=name)
+
+
+# pylint: disable=redefined-builtin
+@tf_export(v1=['norm', 'linalg.norm'])
 @deprecation.deprecated_args(
     None, 'keep_dims is deprecated, use keepdims instead', 'keep_dims')
 def norm(tensor,
diff --git a/tensorflow/python/ops/linalg_ops_impl.py b/tensorflow/python/ops/linalg_ops_impl.py
index e7c89f6ae3e9c517920e6c9afce99a8b192be164..37c724e032512c9edbdb6514df627cd0c9b7de32 100644
--- a/tensorflow/python/ops/linalg_ops_impl.py
+++ b/tensorflow/python/ops/linalg_ops_impl.py
@@ -44,22 +44,31 @@ def eye(num_rows,
     is_square = num_columns is None
     batch_shape = [] if batch_shape is None else batch_shape
     num_columns = num_rows if num_columns is None else num_columns
-    if isinstance(num_rows, ops.Tensor) or isinstance(
-        num_columns, ops.Tensor) or isinstance(batch_shape, ops.Tensor):
-      batch_shape = ops.convert_to_tensor(
-          batch_shape, name='shape', dtype=dtypes.int32)
+
+    # We cannot statically infer what the diagonal size should be:
+    if (isinstance(num_rows, ops.Tensor) or
+        isinstance(num_columns, ops.Tensor)):
       diag_size = math_ops.minimum(num_rows, num_columns)
-      diag_shape = array_ops.concat((batch_shape, [diag_size]), 0)
-      if not is_square:
-        shape = array_ops.concat((batch_shape, [num_rows, num_columns]), 0)
     else:
+      # We can statically infer the diagonal size, and whether it is square.
       if not isinstance(num_rows, compat.integral_types) or not isinstance(
           num_columns, compat.integral_types):
         raise TypeError(
             'num_rows and num_columns must be positive integer values.')
-      batch_shape = [dim for dim in batch_shape]
       is_square = num_rows == num_columns
-      diag_shape = batch_shape + [np.minimum(num_rows, num_columns)]
+      diag_size = np.minimum(num_rows, num_columns)
+
+    # We can not statically infer the shape of the tensor.
+    if isinstance(batch_shape, ops.Tensor) or isinstance(diag_size, ops.Tensor):
+      batch_shape = ops.convert_to_tensor(
+          batch_shape, name='shape', dtype=dtypes.int32)
+      diag_shape = array_ops.concat((batch_shape, [diag_size]), axis=0)
+      if not is_square:
+        shape = array_ops.concat((batch_shape, [num_rows, num_columns]), axis=0)
+    # We can statically infer everything.
+    else:
+      batch_shape = list(batch_shape)
+      diag_shape = batch_shape + [diag_size]
       if not is_square:
         shape = batch_shape + [num_rows, num_columns]
 
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index 145a5f358c1707b333f6167bd02496353e3b2e82..dbaae886d43e46ac193d1e7f28a6367192d2a640 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_list_ops
 # go/tf-wildcard-import
@@ -29,10 +30,56 @@ from tensorflow.python.ops.gen_list_ops import *
 # pylint: enable=wildcard-import
 
 
-ops.NotDifferentiable("TensorListConcat")
+ops.NotDifferentiable("TensorListConcatLists")
+ops.NotDifferentiable("TensorListElementShape")
+ops.NotDifferentiable("TensorListLength")
 ops.NotDifferentiable("TensorListPushBackBatch")
 
 
+def empty_tensor_list(element_shape,
+                      element_dtype,
+                      max_num_elements=None,
+                      name=None):
+  if max_num_elements is None:
+    max_num_elements = -1
+
+  return gen_list_ops.empty_tensor_list(
+      element_shape=_build_element_shape(element_shape),
+      element_dtype=element_dtype,
+      max_num_elements=max_num_elements,
+      name=name)
+
+
+def tensor_list_reserve(element_shape, num_elements, element_dtype, name=None):
+  return gen_list_ops.tensor_list_reserve(
+      element_shape=_build_element_shape(element_shape),
+      num_elements=num_elements,
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_from_tensor(tensor, element_shape, name=None):
+  return gen_list_ops.tensor_list_from_tensor(
+      tensor=tensor,
+      element_shape=_build_element_shape(element_shape),
+      name=name)
+
+
+def tensor_list_concat(input_handle, element_dtype, name=None):
+  # Ignore the lengths output of TensorListConcat. It is only used during
+  # gradient computation.
+  return gen_list_ops.tensor_list_concat(
+      input_handle=input_handle, element_dtype=element_dtype, name=name)[0]
+
+
+def tensor_list_split(tensor, element_shape, lengths, name=None):
+  return gen_list_ops.tensor_list_split(
+      tensor=tensor,
+      element_shape=_build_element_shape(element_shape),
+      lengths=lengths,
+      name=name)
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
@@ -42,7 +89,7 @@ def _PushBackGrad(op, dresult):
 @ops.RegisterGradient("TensorListPopBack")
 def _PopBackGrad(op, dlist, delement):
   if dlist is None:
-    dlist = gen_list_ops.empty_tensor_list(
+    dlist = empty_tensor_list(
         element_dtype=delement.dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
@@ -51,19 +98,37 @@ def _PopBackGrad(op, dlist, delement):
 
 @ops.RegisterGradient("TensorListStack")
 def _TensorListStackGrad(unused_op, dtensor):
-  return gen_list_ops.tensor_list_from_tensor(dtensor,
-                                              element_shape=dtensor.shape[1:])
+  return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:])
+
+
+@ops.RegisterGradient("TensorListConcat")
+def _TensorListConcatGrad(op, dtensor, unused_dlengths):
+  # TODO(srbs): We lose the element_shape information in tensor_list_concat.
+  # Consider providing that as an output of TensorListConcat?
+  if dtensor.shape.rank is None:
+    element_shape = None
+  else:
+    element_shape = [None] + dtensor.shape.as_list()[1:]
+  return tensor_list_split(
+      dtensor,
+      element_shape=_build_element_shape(element_shape),
+      lengths=op.outputs[1])
+
+
+@ops.RegisterGradient("TensorListSplit")
+def _TensorListSplitGrad(op, dlist):
+  return tensor_list_concat(dlist, element_dtype=op.inputs[0].dtype), None, None
 
 
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape[0].value is not None:
-    num_elements = op.inputs[0].shape[0].value
+  if op.inputs[0].shape.dims and op.inputs[0].shape.dims[0].value is not None:
+    num_elements = op.inputs[0].shape.dims[0].value
   else:
     num_elements = None
   if dlist is None:
-    dlist = gen_list_ops.empty_tensor_list(
+    dlist = empty_tensor_list(
         element_dtype=op.inputs[0].dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
@@ -112,3 +177,40 @@ def _TensorListScatterGrad(op, dlist):
   t, indices, _ = op.inputs
   return gen_list_ops.tensor_list_gather(
       dlist, indices, element_dtype=t.dtype), None
+
+
+def _build_element_shape(shape):
+  """Converts shape to a format understood by list_ops for element_shape.
+
+  If `shape` is already a `Tensor` it is returned as-is. We do not perform a
+  type check here.
+
+  If shape is None or a TensorShape with unknown rank, -1 is returned.
+
+  If shape is a scalar, an int32 tensor with empty list is returned. Note we
+  do directly return an empty list since ops.convert_to_tensor would conver it
+  to a float32 which is not a valid type for element_shape.
+
+  If shape is a sequence of dims, None's in the list are replaced with -1. We
+  do not check the dtype of the other dims.
+
+  Args:
+    shape: Could be None, Tensor, TensorShape or a list of dims (each dim could
+      be a None, scalar or Tensor).
+
+  Returns:
+    A None-free shape that can be converted to a tensor.
+  """
+  if isinstance(shape, ops.Tensor):
+    return shape
+  if isinstance(shape, tensor_shape.TensorShape):
+    # `TensorShape.as_list` requires rank to be known.
+    shape = shape.as_list() if shape else None
+  # Shape is unknown.
+  if shape is None:
+    return -1
+  # Shape is a scalar.
+  if not shape:
+    return ops.convert_to_tensor(shape, dtype=dtypes.int32)
+  # Shape is a sequence of dimensions. Convert None dims to -1.
+  return [d if d is not None else -1 for d in shape]
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 4c53f33af122eecbd88a91915ea97117c2071087..5a948a21946d0b9ce867901a00425857e4f06b1f 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -114,6 +114,11 @@ def _generate_placeholder_string(x, default_placeholder="{}"):
   return placeholder
 
 
+def _is_filepath(output_stream):
+  """Returns True if output_stream is a file path."""
+  return isinstance(output_stream, str) and output_stream.startswith("file://")
+
+
 # Temporarily disable pylint g-doc-args error to allow giving more context
 # about what the kwargs are.
 # Because we are using arbitrary-length positional arguments, python 2
@@ -198,9 +203,11 @@ def print_v2(*inputs, **kwargs):
       primitives, tensors, data structures such as dicts and lists that
       may contain tensors (with the data structures possibly nested in
       arbitrary ways), and printable python objects.
-    output_stream: The output stream or logging level to print to. Defaults to
-      sys.stderr, but sys.stdout, tf.logging.info, tf.logging.warning, and
-      tf.logging.error are also supported.
+    output_stream: The output stream, logging level, or file to print to.
+      Defaults to sys.stderr, but sys.stdout, tf.logging.info,
+      tf.logging.warning, and tf.logging.error are also supported. To print to
+      a file, pass a string started with "file://" followed by the file path,
+      e.g., "file:///tmp/foo.out".
     summarize: The first and last `summarize` elements within each dimension are
       recursively printed per Tensor. If None, then the first 3 and last 3
       elements of each dimension are printed for each tensor. If set to -1, it
@@ -241,13 +248,17 @@ def print_v2(*inputs, **kwargs):
       tf_logging.error: "log(error)",
   }
 
-  output_stream_string = output_stream_to_constant.get(output_stream)
-  if not output_stream_string:
-    raise ValueError(
-        "Unsupported output stream or logging level " +
-        str(output_stream) + ". Supported streams are sys.stdout, "
-                             "sys.stderr, tf.logging.info, "
-                             "tf.logging.warning, tf.logging.error")
+  if _is_filepath(output_stream):
+    output_stream_string = output_stream
+  else:
+    output_stream_string = output_stream_to_constant.get(output_stream)
+    if not output_stream_string:
+      raise ValueError(
+          "Unsupported output stream, logging level, or file." +
+          str(output_stream) + ". Supported streams are sys.stdout, "
+          "sys.stderr, tf.logging.info, "
+          "tf.logging.warning, tf.logging.error. " +
+          "File needs to be in the form of 'file://<filepath>'.")
 
   # If we are only printing a single string scalar, there is no need to format
   if (len(inputs) == 1 and tensor_util.is_tensor(inputs[0])
@@ -612,11 +623,12 @@ def scalar_summary(tags, values, collections=None, name=None):
     _Collect(val, collections, [ops.GraphKeys.SUMMARIES])
   return val
 
-
 ops.NotDifferentiable("HistogramSummary")
 ops.NotDifferentiable("ImageSummary")
 ops.NotDifferentiable("AudioSummary")
 ops.NotDifferentiable("AudioSummaryV2")
 ops.NotDifferentiable("MergeSummary")
 ops.NotDifferentiable("ScalarSummary")
+ops.NotDifferentiable("TensorSummary")
+ops.NotDifferentiable("TensorSummaryV2")
 ops.NotDifferentiable("Timestamp")
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index cffaa983d486c94504032789d69315437263a340..758cb8041da63956c7a451e2030b9e9d98016f42 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -39,12 +39,14 @@ from tensorflow.python.ops import string_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_lookup_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.training.checkpointable import base as checkpointable_base
+from tensorflow.python.training.checkpointable import tracking as checkpointable
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("initialize_all_tables")
+@tf_export(v1=["initialize_all_tables"])
 @deprecated(None, "Use `tf.tables_initializer` instead.")
 def initialize_all_tables(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
@@ -59,7 +61,7 @@ def initialize_all_tables(name="init_all_tables"):
   return tables_initializer(name)
 
 
-@tf_export("initializers.tables_initializer", "tables_initializer")
+@tf_export(v1=["initializers.tables_initializer", "tables_initializer"])
 def tables_initializer(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
 
@@ -96,20 +98,22 @@ def _check_table_dtypes(table, key_dtype, value_dtype):
                     (table.value_dtype, value_dtype))
 
 
-class LookupInterface(object):
+class LookupInterface(checkpointable.TrackableResource):
   """Represent a lookup table that persists across different steps."""
 
-  def __init__(self, key_dtype, value_dtype, name):
+  def __init__(self, key_dtype, value_dtype):
     """Construct a lookup table interface.
 
     Args:
       key_dtype: The table key type.
       value_dtype: The table value type.
-      name: A name for the operation (optional).
     """
     self._key_dtype = dtypes.as_dtype(key_dtype)
     self._value_dtype = dtypes.as_dtype(value_dtype)
-    self._name = name
+    super(LookupInterface, self).__init__()
+
+  def create_resource(self):
+    raise NotImplementedError
 
   @property
   def key_dtype(self):
@@ -124,12 +128,7 @@ class LookupInterface(object):
   @property
   def name(self):
     """The name of the table."""
-    return self._name
-
-  @property
-  def init(self):
-    """The table initialization op."""
-    raise NotImplementedError
+    return NotImplementedError
 
   def size(self, name=None):
     """Compute the number of elements in this table."""
@@ -146,7 +145,7 @@ class InitializableLookupTableBase(LookupInterface):
   An initializable lookup tables persist across different steps.
   """
 
-  def __init__(self, table_ref, default_value, initializer):
+  def __init__(self, default_value, initializer):
     """Construct a table object from a table reference.
 
     If requires a table initializer object (subclass of `TableInitializerBase`).
@@ -154,38 +153,37 @@ class InitializableLookupTableBase(LookupInterface):
     the table. The caller is responsible to execute the initialization op.
 
     Args:
-      table_ref: The table reference, i.e. the output of the lookup table ops.
       default_value: The value to use if a key is missing in the table.
       initializer: The table initializer to use.
     """
-    if context.executing_eagerly():
-      name = context.context().scope_name
-    else:
-      name = table_ref.op.name.split("/")[-1]
-    super(InitializableLookupTableBase,
-          self).__init__(initializer.key_dtype, initializer.value_dtype,
-                         name)
-    self._table_ref = table_ref
+    super(InitializableLookupTableBase, self).__init__(initializer.key_dtype,
+                                                       initializer.value_dtype)
     self._default_value = ops.convert_to_tensor(
         default_value, dtype=self._value_dtype)
     self._default_value.get_shape().merge_with(tensor_shape.scalar())
-    self._init = initializer.initialize(self)
+    if isinstance(initializer, checkpointable_base.CheckpointableBase):
+      self._initializer = self._track_checkpointable(
+          initializer, "_initializer")
+    self._resource_handle = self.create_resource()
+    self._init_op = self.initialize()
+
+  def initialize(self):
+    return self._initializer.initialize(self)
 
   @property
-  def table_ref(self):
-    """Get the underlying table reference."""
-    return self._table_ref
+  def initializer(self):
+    return self._init_op
+
+  @property
+  @deprecated("2018-12-15", "Use `initializer` instead.")
+  def init(self):
+    return self.initializer
 
   @property
   def default_value(self):
     """The default value of the table."""
     return self._default_value
 
-  @property
-  def init(self):
-    """The table initialization op."""
-    return self._init
-
   def size(self, name=None):
     """Compute the number of elements in this table.
 
@@ -195,9 +193,10 @@ class InitializableLookupTableBase(LookupInterface):
     Returns:
       A scalar tensor containing the number of elements in this table.
     """
-    with ops.name_scope(name, "%s_Size" % self._name,
-                        [self._table_ref]) as scope:
-      return gen_lookup_ops.lookup_table_size_v2(self._table_ref, name=scope)
+    with ops.name_scope(name, "%s_Size" % self.name,
+                        [self.resource_handle]) as scope:
+      return gen_lookup_ops.lookup_table_size_v2(
+          self.resource_handle, name=scope)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -223,11 +222,11 @@ class InitializableLookupTableBase(LookupInterface):
       raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
                       (self._key_dtype, keys.dtype))
 
-    with ops.name_scope(name, "%s_Lookup" % self._name,
-                        (self._table_ref, key_tensor,
-                         self._default_value)) as scope:
+    with ops.name_scope(
+        name, "%s_Lookup" % self.name,
+        (self.resource_handle, key_tensor, self._default_value)) as scope:
       values = gen_lookup_ops.lookup_table_find_v2(
-          self._table_ref, key_tensor, self._default_value, name=scope)
+          self.resource_handle, key_tensor, self._default_value, name=scope)
 
     values.set_shape(key_tensor.get_shape())
     if isinstance(keys, sparse_tensor.SparseTensor):
@@ -269,16 +268,28 @@ class HashTable(InitializableLookupTableBase):
     Returns:
       A `HashTable` object.
     """
-    with ops.name_scope(name, "hash_table", (initializer,
-                                             default_value)) as scope:
+    self._initializer = initializer
+    self._default_value = default_value
+    self._shared_name = shared_name
+    self._name = name
+    self._table_name = ""
+    super(HashTable, self).__init__(default_value, initializer)
+    self._value_shape = self._default_value.get_shape()
+
+  def create_resource(self):
+    with ops.name_scope(self._name, "hash_table",
+                        (self._initializer, self._default_value)) as scope:
       table_ref = gen_lookup_ops.hash_table_v2(
-          shared_name=shared_name,
-          key_dtype=initializer.key_dtype,
-          value_dtype=initializer.value_dtype,
+          shared_name=self._shared_name,
+          key_dtype=self._initializer.key_dtype,
+          value_dtype=self._initializer.value_dtype,
           name=scope)
+      self._table_name = scope.split("/")[-2]
+    return table_ref
 
-      super(HashTable, self).__init__(table_ref, default_value, initializer)
-      self._value_shape = self._default_value.get_shape()
+  @property
+  def name(self):
+    return self._table_name
 
   def export(self, name=None):
     """Returns tensors of all keys and values in the table.
@@ -290,18 +301,18 @@ class HashTable(InitializableLookupTableBase):
       A pair of tensors with the first tensor containing all keys and the
         second tensors containing all values in the table.
     """
-    with ops.name_scope(name, "%s_Export" % self._name,
-                        [self._table_ref]) as name:
-      with ops.colocate_with(self._table_ref):
+    with ops.name_scope(name, "%s_Export" % self.name,
+                        [self.resource_handle]) as name:
+      with ops.colocate_with(self.resource_handle):
         exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self._table_ref, self._key_dtype, self._value_dtype, name=name)
+            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
         self._value_shape))
     return exported_keys, exported_values
 
 
-class TableInitializerBase(object):
+class TableInitializerBase(checkpointable_base.CheckpointableBase):
   """Base class for lookup table initializers."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -366,7 +377,7 @@ class KeyValueTensorInitializer(TableInitializerBase):
     """
     _check_table_dtypes(table, self._keys.dtype, self._values.dtype)
     with ops.name_scope(
-        self._name, values=(table.table_ref, self._keys,
+        self._name, values=(table.resource_handle, self._keys,
                             self._values)) as scope:
       if context.executing_eagerly():
         # Ensure a unique name when eager execution is enabled to avoid spurious
@@ -374,11 +385,11 @@ class KeyValueTensorInitializer(TableInitializerBase):
         scope += str(ops.uid())
       if fwd_compat.forward_compatible(2018, 9, 19):
         init_op = gen_lookup_ops.lookup_table_import_v2(
-            table.table_ref, self._keys, self._values, name=scope)
+            table.resource_handle, self._keys, self._values, name=scope)
       else:
         # To maintain forward compatibiltiy, use the old implementation.
         init_op = gen_lookup_ops.initialize_table_v2(
-            table.table_ref, self._keys, self._values, name=scope)
+            table.resource_handle, self._keys, self._values, name=scope)
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     return init_op
 
@@ -514,12 +525,14 @@ class TextFileInitializer(TableInitializerBase):
     if (vocab_size is not None) and (vocab_size <= 0):
       raise ValueError("Invalid vocab_size %s." % vocab_size)
 
-    self._filename = filename
     self._key_index = key_index
     self._value_index = value_index
     self._vocab_size = vocab_size
     self._delimiter = delimiter
     self._name = name
+    self._filename = self._track_checkpointable(
+        checkpointable.TrackableAsset(filename),
+        "_filename")
 
     super(TextFileInitializer, self).__init__(key_dtype, value_dtype)
 
@@ -538,11 +551,11 @@ class TextFileInitializer(TableInitializerBase):
     """
     _check_table_dtypes(table, self.key_dtype, self.value_dtype)
     with ops.name_scope(self._name, "text_file_init",
-                        (table.table_ref,)) as scope:
+                        (table.resource_handle,)) as scope:
       filename = ops.convert_to_tensor(
           self._filename, dtypes.string, name="asset_filepath")
       init_op = gen_lookup_ops.initialize_table_from_text_file_v2(
-          table.table_ref,
+          table.resource_handle,
           filename,
           self._key_index,
           self._value_index,
@@ -806,36 +819,42 @@ class IdTableWithHashBuckets(LookupInterface):
       raise TypeError(
           "hasher_spec must be of type HasherSpec, got %s" % hasher_spec)
     self._hasher_spec = hasher_spec
-    super(IdTableWithHashBuckets, self).__init__(key_dtype, dtypes.int64,
-                                                 name.split("/")[-1])
+    self._table_name = name.split("/")[-1]
+    super(IdTableWithHashBuckets, self).__init__(key_dtype, dtypes.int64)
 
-  @property
-  def init(self):
-    """The table initialization op."""
-    if self._table:
-      return self._table.init
+  def create_resource(self):
+    if self._table is not None:
+      return self._table.create_resource()
+    return None
+
+  def initialize(self):
+    if self._table is not None:
+      return self._table.initialize()
     with ops.name_scope(None, "init"):
       return control_flow_ops.no_op()
 
   @property
-  def table_ref(self):
-    """Returns the table_ref of the underlying table, if one exists.
-
-    Only use the table_ref directly if you know what you are doing. The
-    table_ref does not have the "hash bucket" functionality, as that is provided
-    by this class.
+  def initializer(self):
+    if self._table is not None:
+      return self._table._init_op  # pylint: disable=protected-access
+    with ops.name_scope(None, "init"):
+      return control_flow_ops.no_op()
 
-    One possible use of the table_ref is subtokenization, i.e. ops which
-    dynamically decompose tokens into subtokens based on the contents of the
-    table_ref.
+  @property
+  @deprecated("2018-12-15", "Use `initializer` instead.")
+  def init(self):
+    return self.initializer
 
-    Returns:
-      the underlying table_ref, or None if there is no underlying table
-    """
+  @property
+  def resource_handle(self):
     if self._table is not None:
-      return self._table.table_ref
+      return self._table.resource_handle
     return None
 
+  @property
+  def name(self):
+    return self._table_name
+
   def size(self, name=None):
     """Compute the number of elements in this table."""
     with ops.name_scope(name, "%s_Size" % self.name) as scope:
@@ -1139,7 +1158,6 @@ def index_table_from_tensor(vocabulary_list,
           hasher_spec=hasher_spec,
           name=feat_to_id_scope,
           key_dtype=dtype)
-
     return table
 
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 806539747e5e74cf1c5f40ab47aa84dcbb364344..20397612bca9a9b81d9816ac1626ce15024d45f6 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -33,28 +33,50 @@ from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("losses.Reduction")
-class Reduction(object):
+@tf_export("losses.Reduction", "keras.losses.Reduction", v1=[])
+class ReductionV2(object):
   """Types of loss reduction.
 
   Contains the following values:
-  `NONE`: Un-reduced weighted losses with the same shape as input.
-  `SUM`: Scalar sum of weighted losses.
-  `MEAN`: Scalar `SUM` divided by sum of weights.
-  `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
-  `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
-     weights.
-  `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
+
+  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `SUM`: Scalar sum of weighted losses.
+  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
   """
 
   NONE = "none"
+  SUM = "sum"
+  SUM_OVER_BATCH_SIZE = "sum_over_batch_size"
 
-  SUM = "weighted_sum"
+  @classmethod
+  def all(cls):
+    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
 
-  MEAN = "weighted_mean"
+  @classmethod
+  def validate(cls, key):
+    if key not in cls.all():
+      raise ValueError("Invalid Reduction Key %s." % key)
 
-  SUM_OVER_BATCH_SIZE = "weighted_sum_over_batch_size"
 
+@tf_export(v1=["losses.Reduction"])
+class Reduction(object):
+  """Types of loss reduction.
+
+  Contains the following values:
+
+  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `SUM`: Scalar sum of weighted losses.
+  * `MEAN`: Scalar `SUM` divided by sum of weights. DEPRECATED.
+  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+  * `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
+     weights. DEPRECATED.
+  * `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
+  """
+
+  NONE = "none"
+  SUM = "weighted_sum"
+  SUM_OVER_BATCH_SIZE = "weighted_sum_over_batch_size"
+  MEAN = "weighted_mean"
   SUM_BY_NONZERO_WEIGHTS = "weighted_sum_by_nonzero_weights"
   SUM_OVER_NONZERO_WEIGHTS = SUM_BY_NONZERO_WEIGHTS
 
@@ -71,32 +93,7 @@ class Reduction(object):
   @classmethod
   def validate(cls, key):
     if key not in cls.all():
-      raise ValueError("Invalid ReductionKey %s." % key)
-
-
-def _safe_div(numerator, denominator, name="value"):
-  """Computes a safe divide which returns 0 if the denominator is zero.
-
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
-
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
-    name: An optional name for the returned op.
-
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator, array_ops.where(
-          math_ops.equal(denominator, 0),
-          array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator),
-      name=name)
+      raise ValueError("Invalid Reduction Key %s." % key)
 
 
 def _safe_mean(losses, num_present):
@@ -111,7 +108,7 @@ def _safe_mean(losses, num_present):
       then zero is returned.
   """
   total_loss = math_ops.reduce_sum(losses)
-  return _safe_div(total_loss, num_present)
+  return math_ops.div_no_nan(total_loss, num_present, name="value")
 
 
 def _num_present(losses, weights, per_batch=False):
@@ -162,7 +159,7 @@ def _num_elements(losses):
     return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
 
 
-@tf_export("losses.compute_weighted_loss")
+@tf_export(v1=["losses.compute_weighted_loss"])
 def compute_weighted_loss(
     losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
     reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -201,7 +198,7 @@ def compute_weighted_loss(
   Reduction.validate(reduction)
   with ops.name_scope(scope, "weighted_loss", (losses, weights)):
     # Save the `reduction` argument for loss normalization when distributing
-    # to multiple towers.
+    # to multiple replicas.
     # TODO(josh11b): Associate it with the returned op for more precision.
     ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
 
@@ -232,7 +229,7 @@ def compute_weighted_loss(
       return loss
 
 
-@tf_export("losses.absolute_difference")
+@tf_export(v1=["losses.absolute_difference"])
 def absolute_difference(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -285,7 +282,7 @@ def absolute_difference(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.cosine_distance")
+@tf_export(v1=["losses.cosine_distance"])
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def cosine_distance(
     labels, predictions, axis=None, weights=1.0, scope=None,
@@ -341,7 +338,7 @@ def cosine_distance(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.hinge_loss")
+@tf_export(v1=["losses.hinge_loss"])
 def hinge_loss(labels, logits, weights=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -391,7 +388,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.huber_loss")
+@tf_export(v1=["losses.huber_loss"])
 def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -469,7 +466,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.log_loss")
+@tf_export(v1=["losses.log_loss"])
 def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
              loss_collection=ops.GraphKeys.LOSSES,
              reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -526,7 +523,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
 
 
 # TODO(b/37208492): Add reduction arg.
-@tf_export("losses.mean_pairwise_squared_error")
+@tf_export(v1=["losses.mean_pairwise_squared_error"])
 def mean_pairwise_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES):
@@ -591,22 +588,24 @@ def mean_pairwise_squared_error(
 
       diffs = math_ops.subtract(predictions, labels)
 
-      reduction_indices = math_ops.range(1, array_ops.rank(diffs))
+      axis = math_ops.range(1, array_ops.rank(diffs))
 
       sum_squares_diff_per_batch = math_ops.reduce_sum(
-          math_ops.square(diffs),
-          reduction_indices=reduction_indices,
-          keepdims=True)
+          math_ops.square(diffs), axis=axis, keepdims=True)
       num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-      term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                              num_present_per_batch - 1)
+      term1 = 2.0 * math_ops.div_no_nan(
+          sum_squares_diff_per_batch,
+          math_ops.maximum(num_present_per_batch - 1, 0),
+          name="value")
 
-      sum_diff = math_ops.reduce_sum(
-          diffs, reduction_indices=reduction_indices, keepdims=True)
-      term2 = 2.0 * _safe_div(
+      sum_diff = math_ops.reduce_sum(diffs, axis=axis, keepdims=True)
+      term2 = 2.0 * math_ops.div_no_nan(
           math_ops.square(sum_diff),
-          math_ops.multiply(num_present_per_batch, num_present_per_batch - 1))
+          math_ops.maximum(
+              math_ops.multiply(num_present_per_batch,
+                                num_present_per_batch - 1), 0),
+          name="value")
 
       weighted_losses = math_ops.multiply(term1 - term2, weights)
       loss = math_ops.reduce_sum(weighted_losses)
@@ -620,7 +619,7 @@ def mean_pairwise_squared_error(
       return mean_loss
 
 
-@tf_export("losses.mean_squared_error")
+@tf_export(v1=["losses.mean_squared_error"])
 def mean_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -673,7 +672,7 @@ def mean_squared_error(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.sigmoid_cross_entropy")
+@tf_export(v1=["losses.sigmoid_cross_entropy"])
 def sigmoid_cross_entropy(
     multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -737,7 +736,7 @@ def sigmoid_cross_entropy(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.softmax_cross_entropy")
+@tf_export(v1=["losses.softmax_cross_entropy"])
 def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -859,7 +858,7 @@ def _remove_squeezable_dimensions(
   return labels, predictions, weights
 
 
-@tf_export("losses.sparse_softmax_cross_entropy")
+@tf_export(v1=["losses.sparse_softmax_cross_entropy"])
 def sparse_softmax_cross_entropy(
     labels, logits, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
diff --git a/tensorflow/python/ops/losses/util_test.py b/tensorflow/python/ops/losses/util_test.py
index df2e60e2e45c9aa38184e36e126f519fdb8beb5e..22a8eaae2666634c7132bdbd537fac5d731ed2f6 100644
--- a/tensorflow/python/ops/losses/util_test.py
+++ b/tensorflow/python/ops/losses/util_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops.losses import util
 from tensorflow.python.platform import test
 
 
 class LossesUtilTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGetRegularizationLoss(self):
     # Empty regularization collection should evaluate to 0.0.
     with self.cached_session():
diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py
index d9d07282871a44e796a15c9dff768481f6920502..046ea0dfb1e592cc53dec4f42457fa9a8357f7b5 100644
--- a/tensorflow/python/ops/manip_ops.py
+++ b/tensorflow/python/ops/manip_ops.py
@@ -24,7 +24,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access
-@tf_export('roll', 'manip.roll')
+@tf_export('roll', v1=['roll', 'manip.roll'])
 @deprecation.deprecated_endpoints('manip.roll')
 def roll(input, shift, axis):  # pylint: disable=redefined-builtin
   return _gen_manip_ops.roll(input, shift, axis)
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 35278d9680408aa44c81ec3276e61cd382a58c57..c7ec1c57d1b07232e2bdb05fc30f5456b792890f 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -1041,11 +1041,12 @@ def _PowGrad(op, grad):
   # Avoid false singularity at x = 0
   if x.dtype.is_complex:
     # real(x) < 0 is fine for the complex case
-    log_x = array_ops.where(
-        math_ops.not_equal(x, 0), math_ops.log(x), array_ops.zeros_like(x))
+    mask = math_ops.not_equal(x, 0)
   else:
     # There's no sensible real value to return if x < 0, so return 0
-    log_x = array_ops.where(x > 0, math_ops.log(x), array_ops.zeros_like(x))
+    mask = x > 0
+  safe_x = array_ops.where(mask, x, array_ops.ones_like(x))
+  log_x = array_ops.where(mask, math_ops.log(safe_x), array_ops.zeros_like(x))
   gy = array_ops.reshape(math_ops.reduce_sum(grad * z * log_x, ry), sy)
   return gx, gy
 
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 9cfb0509424c9c3c2947ac4e224b33b035d53248..822f89768c53c45def3bb93a53382b2375944528 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -20,9 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import execution_callbacks
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients
@@ -41,7 +45,7 @@ class SquaredDifferenceOpTest(test.TestCase):
     l = np.random.randn(*left_shape)
     r = np.random.randn(*right_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       left_tensor = constant_op.constant(l, shape=left_shape)
       right_tensor = constant_op.constant(r, shape=right_shape)
       output = math_ops.squared_difference(left_tensor, right_tensor)
@@ -52,6 +56,7 @@ class SquaredDifferenceOpTest(test.TestCase):
     self.assertLess(left_err, 1e-10)
     self.assertLess(right_err, 1e-10)
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     self._testGrad([1, 2, 3, 2], [3, 2])
     self._testGrad([2, 4], [3, 2, 4])
@@ -77,12 +82,13 @@ class AbsOpTest(test.TestCase):
           self._biasedRandN(
               shape, bias=bias), dtype=dtype)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       output = math_ops.abs(value)
       error = gradient_checker.compute_gradient_error(
           value, shape, output, output.get_shape().as_list())
     self.assertLess(error, max_error)
 
+  @test_util.run_deprecated_v1
   def testComplexAbs(self):
     # Bias random test values away from zero to avoid numeric instabilities.
     self._testGrad(
@@ -99,6 +105,7 @@ class AbsOpTest(test.TestCase):
 
 class MinOrMaxGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testMinGradient(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     outputs = math_ops.reduce_min(array_ops.concat([inputs, inputs], 0))
@@ -106,6 +113,7 @@ class MinOrMaxGradientTest(test.TestCase):
       error = gradient_checker.compute_gradient_error(inputs, [1], outputs, [])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testMaxGradient(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     outputs = math_ops.reduce_max(array_ops.concat([inputs, inputs], 0))
@@ -116,6 +124,7 @@ class MinOrMaxGradientTest(test.TestCase):
 
 class MaximumOrMinimumGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testMaximumGradient(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
     outputs = math_ops.maximum(inputs, 3.0)
@@ -123,6 +132,7 @@ class MaximumOrMinimumGradientTest(test.TestCase):
       error = gradient_checker.compute_gradient_error(inputs, [4], outputs, [4])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testMinimumGradient(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
     outputs = math_ops.minimum(inputs, 2.0)
@@ -133,6 +143,7 @@ class MaximumOrMinimumGradientTest(test.TestCase):
 
 class ProdGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testProdGradient(self):
     inputs = constant_op.constant([[1., 2.], [3., 4.]],
                                   dtype=dtypes.float32)
@@ -143,6 +154,7 @@ class ProdGradientTest(test.TestCase):
           outputs, outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testProdGradientForNegativeAxis(self):
     inputs = constant_op.constant([[1., 2.], [3., 4.]],
                                   dtype=dtypes.float32)
@@ -153,6 +165,7 @@ class ProdGradientTest(test.TestCase):
           outputs, outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testProdGradientComplex(self):
     for dtype in dtypes.complex64, dtypes.complex128:
       inputs = constant_op.constant([[1 + 3j, 2 - 1j], [3j, 4]],
@@ -164,6 +177,7 @@ class ProdGradientTest(test.TestCase):
             outputs, outputs.get_shape().as_list())
         self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testProdGradientForNegativeAxisComplex(self):
     for dtype in dtypes.complex64, dtypes.complex128:
       inputs = constant_op.constant([[1 + 3j, 2 - 1j], [3j, 4]],
@@ -178,6 +192,7 @@ class ProdGradientTest(test.TestCase):
 
 class SegmentMinOrMaxGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSegmentMinGradient(self):
     data = constant_op.constant([1.0, 2.0, 3.0], dtype=dtypes.float32)
     segment_ids = constant_op.constant([0, 0, 1], dtype=dtypes.int64)
@@ -187,6 +202,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
                                                       [2])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testSegmentMaxGradient(self):
     data = constant_op.constant([1.0, 2.0, 3.0], dtype=dtypes.float32)
     segment_ids = constant_op.constant([0, 0, 1], dtype=dtypes.int64)
@@ -196,6 +212,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
                                                       [2])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testSegmentMinGradientWithTies(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     data = array_ops.concat([inputs, inputs], 0)
@@ -206,6 +223,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
                                                       [1])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testSegmentMaxGradientWithTies(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     data = array_ops.concat([inputs, inputs], 0)
@@ -219,6 +237,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
 
 class FloorModGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testFloorModGradient(self):
     # Making sure the input is not near the discontinuity point where
     # x/y == floor(x/y)
@@ -233,6 +252,7 @@ class FloorModGradientTest(test.TestCase):
 
 class DivNoNanGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicGradient(self):
     inputs = constant_op.constant(np.arange(-3, 3),
                                   dtype=dtypes.float32)
@@ -244,6 +264,7 @@ class DivNoNanGradientTest(test.TestCase):
           outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithDenominatorIsZero(self):
     x = constant_op.constant(np.arange(-3, 3),
                              dtype=dtypes.float32)
@@ -263,6 +284,7 @@ class XlogyTest(test.TestCase):
     xlogy_ygrad = self.evaluate(gradients.gradients(math_ops.xlogy(x, y), y)[0])
     return xlogy_xgrad, xlogy_ygrad
 
+  @test_util.run_deprecated_v1
   def testNonZeroValuesGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0.1, dtype=dtype)
@@ -273,6 +295,7 @@ class XlogyTest(test.TestCase):
       self.assertAllClose(xlogy_expected_xgrad, xlogy_xgrad)
       self.assertAllClose(xlogy_expected_ygrad, xlogy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroXGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0., dtype=dtype)
@@ -282,6 +305,7 @@ class XlogyTest(test.TestCase):
       self.assertAllClose(zero, xlogy_xgrad)
       self.assertAllClose(zero, xlogy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroYGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0.1, dtype=dtype)
@@ -290,6 +314,7 @@ class XlogyTest(test.TestCase):
       self.assertAllClose(-np.inf, xlogy_xgrad)
       self.assertAllClose(np.inf, xlogy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroXYGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0., dtype=dtype)
@@ -307,6 +332,7 @@ class XdivyTest(test.TestCase):
     xdivy_ygrad = self.evaluate(gradients.gradients(math_ops.xdivy(x, y), y)[0])
     return xdivy_xgrad, xdivy_ygrad
 
+  @test_util.run_deprecated_v1
   def testNonZeroValuesGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0.1, dtype=dtype)
@@ -317,6 +343,7 @@ class XdivyTest(test.TestCase):
       self.assertAllClose(xdivy_expected_xgrad, xdivy_xgrad)
       self.assertAllClose(xdivy_expected_ygrad, xdivy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroXGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0., dtype=dtype)
@@ -326,6 +353,7 @@ class XdivyTest(test.TestCase):
       self.assertAllClose(zero, xdivy_xgrad)
       self.assertAllClose(zero, xdivy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroYGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0.1, dtype=dtype)
@@ -334,6 +362,7 @@ class XdivyTest(test.TestCase):
       self.assertAllClose(np.inf, xdivy_xgrad)
       self.assertAllClose(-np.inf, xdivy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroXYGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0., dtype=dtype)
@@ -344,5 +373,25 @@ class XdivyTest(test.TestCase):
       self.assertAllClose(zero, xdivy_ygrad)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class PowGradTest(test.TestCase):
+
+  def test_zero_grad_tf_gradients(self):
+    if context.executing_eagerly():
+      self.skipTest("tf.gradients not supported in eager.")
+
+    x = constant_op.constant([-1., 0., 1.])
+    g = self.evaluate(gradients.gradients(math_ops.pow(x, 2), x)[0])
+    self.assertAllClose([-2., 0., 2.], g)
+
+  def test_zero_grad_tape(self):
+    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.RAISE):
+      x = constant_op.constant([-1, 0., 1.])
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        g = tape.gradient(math_ops.pow(x, 2), x)
+      g = self.evaluate(g)
+      self.assertAllClose([-2., 0., 2.], g)
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 83b8b5a3a49ce604e50fd2e2998e40625a7a7631..e2b634ee8f8d18e1e0e43a9e10cb7f2532bbbf12 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gen_sparse_ops
-from tensorflow.python.ops import gen_spectral_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_math_ops import *
@@ -44,16 +43,17 @@ from tensorflow.python.ops.gen_math_ops import *
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
 linspace = gen_math_ops.lin_space
 
-arg_max = deprecation.deprecated(None, "Use `argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
-arg_min = deprecation.deprecated(None, "Use `argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
-tf_export("arg_max")(arg_max)
-tf_export("arg_min")(arg_min)
+arg_max = deprecation.deprecated(None, "Use `tf.math.argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
+arg_min = deprecation.deprecated(None, "Use `tf.math.argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
+tf_export(v1=["arg_max"])(arg_max)
+tf_export(v1=["arg_min"])(arg_min)
 
 # This is set by resource_variable_ops.py. It is included in this way since
 # there is a circular dependency between math_ops and resource_variable_ops
@@ -70,7 +70,7 @@ def _set_doc(doc):
 
 
 # pylint: disable=redefined-builtin
-@tf_export("math.argmax", "argmax")
+@tf_export(v1=["math.argmax", "argmax"])
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
@@ -83,12 +83,39 @@ def argmax(input,
            output_type=dtypes.int64):
   axis = deprecation.deprecated_argument_lookup(
       "axis", axis, "dimension", dimension)
+  return argmax_v2(input, axis, output_type, name)
+
+
+@tf_export("math.argmax", "argmax", v1=[])
+def argmax_v2(input,
+              axis=None,
+              output_type=dtypes.int64,
+              name=None):
+  """Returns the index with the largest value across axes of a tensor.
+
+  Note that in case of ties the identity of the return value is not guaranteed.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+    `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`, `qint8`, `quint8`,
+    `qint32`, `bfloat16`, `uint16`, `complex128`, `half`, `uint32`, `uint64`.
+    axis: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      int32 or int64, must be in the range `-rank(input), rank(input))`.
+      Describes which axis of the input Tensor to reduce across. For vectors,
+      use axis = 0.
+    output_type: An optional `tf.DType` from: `tf.int32, tf.int64`.
+      Defaults to `tf.int64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `output_type`.
+  """
   if axis is None:
     axis = 0
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
 
-@tf_export("math.argmin", "argmin")
+@tf_export(v1=["math.argmin", "argmin"])
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "dimension")
 @_set_doc(
@@ -101,6 +128,33 @@ def argmin(input,
            output_type=dtypes.int64):
   axis = deprecation.deprecated_argument_lookup(
       "axis", axis, "dimension", dimension)
+  return argmin_v2(input, axis, output_type, name)
+
+
+@tf_export("math.argmin", "argmin", v1=[])
+def argmin_v2(input,
+              axis=None,
+              output_type=dtypes.int64,
+              name=None):
+  """Returns the index with the smallest value across axes of a tensor.
+
+  Note that in case of ties the identity of the return value is not guaranteed.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+    `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`, `qint8`, `quint8`,
+    `qint32`, `bfloat16`, `uint16`, `complex128`, `half`, `uint32`, `uint64`.
+    axis: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      int32 or int64, must be in the range `-rank(input), rank(input))`.
+      Describes which axis of the input Tensor to reduce across. For vectors,
+      use axis = 0.
+    output_type: An optional `tf.DType` from: `tf.int32, tf.int64`.
+      Defaults to `tf.int64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `output_type`.
+  """
   if axis is None:
     axis = 0
   return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type)
@@ -112,6 +166,7 @@ def argmin(input,
 # pylint: disable=anomalous-backslash-in-string,protected-access
 # pylint: disable=g-docstring-has-escape
 @tf_export("math.abs", "abs")
+@dispatch.add_dispatch_support
 def abs(x, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the absolute value of a tensor.
 
@@ -136,22 +191,10 @@ def abs(x, name=None):  # pylint: disable=redefined-builtin
       of type `float32` or `float64`, respectively.
   """
   with ops.name_scope(name, "Abs", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      if x.values.dtype.is_complex:
-        x_abs = gen_math_ops.complex_abs(
-            x.values, Tout=x.values.dtype.real_dtype, name=name)
-        return sparse_tensor.SparseTensor(
-            indices=x.indices, values=x_abs, dense_shape=x.dense_shape)
-      x_abs = gen_math_ops._abs(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_abs, dense_shape=x.dense_shape)
-    else:
-      x = ops.convert_to_tensor(x, name="x")
-      if x.dtype.is_complex:
-        return gen_math_ops.complex_abs(x, Tout=x.dtype.real_dtype, name=name)
-      return gen_math_ops._abs(x, name=name)
-
-
+    x = ops.convert_to_tensor(x, name="x")
+    if x.dtype.is_complex:
+      return gen_math_ops.complex_abs(x, Tout=x.dtype.real_dtype, name=name)
+    return gen_math_ops._abs(x, name=name)
 # pylint: enable=g-docstring-has-escape
 
 
@@ -187,6 +230,7 @@ class DivideDelegateWithName(object):
 
 
 @tf_export("math.divide", "divide")
+@dispatch.add_dispatch_support
 def divide(x, y, name=None):
   """Computes Python style division of `x` by `y`."""
 
@@ -199,6 +243,7 @@ def divide(x, y, name=None):
 
 
 @tf_export("math.multiply", "multiply")
+@dispatch.add_dispatch_support
 def multiply(x, y, name=None):
   return gen_math_ops.mul(x, y, name)
 
@@ -219,6 +264,7 @@ _mul.__doc__ = (
 
 
 @tf_export("math.subtract", "subtract")
+@dispatch.add_dispatch_support
 def subtract(x, y, name=None):
   return gen_math_ops.sub(x, y, name)
 
@@ -238,31 +284,7 @@ _sub.__doc__ = (
     gen_math_ops.sub.__doc__ + ("" if _sub.__doc__ is None else _sub.__doc__))
 
 
-# pylint: disable=g-docstring-has-escape
-@tf_export("math.negative", "negative")
-def negative(x, name=None):
-  """Computes numerical negative value element-wise.
-
-  I.e., \\(y = -x\\).
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Neg", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_neg = gen_math_ops.neg(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_neg, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.neg(x, name=name)
-
-
-# pylint: enable=g-docstring-has-escape
+negative = gen_math_ops.neg
 
 
 # pylint: disable=g-docstring-has-escape
@@ -288,107 +310,8 @@ def _neg(x, name=None):
 # pylint: enable=g-docstring-has-escape
 
 
-@tf_export("math.sign", "sign")
-def sign(x, name=None):
-  """Returns an element-wise indication of the sign of a number.
-
-  `y = sign(x) = -1` if `x < 0`; 0 if `x == 0` or `tf.is_nan(x)`; 1 if `x > 0`.
-
-  Zero is returned for NaN inputs.
-
-  For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-
-  @compatibility(numpy)
-  Equivalent to numpy.sign except for the behavior for input values of NaN.
-  @end_compatibility
-  """
-  with ops.name_scope(name, "Sign", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_sign = gen_math_ops.sign(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_sign, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.sign(x, name=name)
-
-
-@tf_export("math.square", "square")
-def square(x, name=None):
-  r"""Computes square of x element-wise.
-
-  I.e., \\(y = x * x = x^2\\).
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Square", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_square = gen_math_ops.square(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_square, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.square(x, name=name)
-
-
-@tf_export("math.sqrt", "sqrt")
-def sqrt(x, name=None):
-  r"""Computes square root of x element-wise.
-
-  I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Sqrt", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_sqrt = gen_math_ops.sqrt(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_sqrt, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.sqrt(x, name=name)
-
-
-@tf_export("math.erf", "erf")
-@deprecation.deprecated_endpoints("erf")
-def erf(x, name=None):
-  """Computes the Gauss error function of `x` element-wise.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Erf", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_erf = gen_math_ops.erf(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_erf, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.erf(x, name=name)
-
-
-@tf_export("math.scalar_mul", "scalar_mul")
-def scalar_mul(scalar, x):
+@tf_export(v1=["math.scalar_mul", "scalar_mul"])
+def scalar_mul(scalar, x, name=None):
   """Multiplies a scalar times a `Tensor` or `IndexedSlices` object.
 
   Intended for use in gradient code which might deal with `IndexedSlices`
@@ -398,6 +321,7 @@ def scalar_mul(scalar, x):
   Args:
     scalar: A 0-D scalar `Tensor`. Must have known shape.
     x: A `Tensor` or `IndexedSlices` to be scaled.
+    name: A name for the operation (optional).
 
   Returns:
     `scalar * x` of the same type (`Tensor` or `IndexedSlices`) as `x`.
@@ -410,14 +334,23 @@ def scalar_mul(scalar, x):
   shape = scalar.get_shape()
   if shape.ndims == 0:
     if isinstance(x, ops.IndexedSlices):
-      return ops.IndexedSlices(scalar * x.values, x.indices, x.dense_shape)
+      return ops.IndexedSlices(gen_math_ops.mul(scalar, x.values, name),
+                               x.indices, x.dense_shape)
     else:
-      return scalar * x
+      return gen_math_ops.mul(scalar, x, name)
   else:
     raise ValueError("Only scalar multiply works, got shape %s" % shape)
 
 
+@tf_export("math.scalar_mul", "scalar_mul", v1=[])
+@_set_doc(scalar_mul.__doc__)
+def scalar_mul_v2(scalar, x, name=None):
+  with ops.name_scope(name, "scalar_mul", [x]) as name:
+    return scalar_mul(scalar, x, name)
+
+
 @tf_export("math.pow", "pow")
+@dispatch.add_dispatch_support
 def pow(x, y, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the power of one value to another.
 
@@ -446,6 +379,7 @@ def pow(x, y, name=None):  # pylint: disable=redefined-builtin
 
 # pylint: disable=redefined-builtin,redefined-outer-name
 @tf_export("dtypes.complex", "complex")
+@dispatch.add_dispatch_support
 def complex(real, imag, name=None):
   r"""Converts two real numbers to a complex number.
 
@@ -487,8 +421,9 @@ def complex(real, imag, name=None):
     return gen_math_ops._complex(real, imag, Tout=Tout, name=name)
 
 
-@tf_export("math.real", "real")
+@tf_export("math.real", v1=["math.real", "real"])
 @deprecation.deprecated_endpoints("real")
+@dispatch.add_dispatch_support
 def real(input, name=None):
   r"""Returns the real part of a complex (or real) tensor.
 
@@ -519,8 +454,9 @@ def real(input, name=None):
       return input
 
 
-@tf_export("math.imag", "imag")
+@tf_export("math.imag", v1=["math.imag", "imag"])
 @deprecation.deprecated_endpoints("imag")
+@dispatch.add_dispatch_support
 def imag(input, name=None):
   r"""Returns the imaginary part of a complex (or real) tensor.
 
@@ -550,8 +486,9 @@ def imag(input, name=None):
       return array_ops.zeros_like(input)
 
 
-@tf_export("math.angle", "angle")
+@tf_export("math.angle", v1=["math.angle", "angle"])
 @deprecation.deprecated_endpoints("angle")
+@dispatch.add_dispatch_support
 def angle(input, name=None):
   r"""Returns the element-wise argument of a complex (or real) tensor.
 
@@ -591,6 +528,7 @@ def angle(input, name=None):
 
 
 @tf_export("math.round", "round")
+@dispatch.add_dispatch_support
 def round(x, name=None):  # pylint: disable=redefined-builtin
   """Rounds the values of a tensor to the nearest integer, element-wise.
 
@@ -618,6 +556,7 @@ def round(x, name=None):  # pylint: disable=redefined-builtin
 
 
 @tf_export("dtypes.cast", "cast")
+@dispatch.add_dispatch_support
 def cast(x, dtype, name=None):
   """Casts a tensor to a new type.
 
@@ -681,6 +620,7 @@ def cast(x, dtype, name=None):
 
 
 @tf_export("dtypes.saturate_cast", "saturate_cast")
+@dispatch.add_dispatch_support
 def saturate_cast(value, dtype, name=None):
   """Performs a safe saturating cast of `value` to `dtype`.
 
@@ -713,8 +653,8 @@ def saturate_cast(value, dtype, name=None):
                                        name="max"))
     return cast(value, dtype, name=name)
 
-
-@tf_export("to_float")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_float"])
 def to_float(x, name="ToFloat"):
   """Casts a tensor to type `float32`.
 
@@ -732,7 +672,8 @@ def to_float(x, name="ToFloat"):
   return cast(x, dtypes.float32, name=name)
 
 
-@tf_export("to_double")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_double"])
 def to_double(x, name="ToDouble"):
   """Casts a tensor to type `float64`.
 
@@ -750,7 +691,8 @@ def to_double(x, name="ToDouble"):
   return cast(x, dtypes.float64, name=name)
 
 
-@tf_export("to_int32")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_int32"])
 def to_int32(x, name="ToInt32"):
   """Casts a tensor to type `int32`.
 
@@ -768,7 +710,8 @@ def to_int32(x, name="ToInt32"):
   return cast(x, dtypes.int32, name=name)
 
 
-@tf_export("to_int64")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_int64"])
 def to_int64(x, name="ToInt64"):
   """Casts a tensor to type `int64`.
 
@@ -786,7 +729,8 @@ def to_int64(x, name="ToInt64"):
   return cast(x, dtypes.int64, name=name)
 
 
-@tf_export("to_bfloat16")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_bfloat16"])
 def to_bfloat16(x, name="ToBFloat16"):
   """Casts a tensor to type `bfloat16`.
 
@@ -804,7 +748,8 @@ def to_bfloat16(x, name="ToBFloat16"):
   return cast(x, dtypes.bfloat16, name=name)
 
 
-@tf_export("to_complex64")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_complex64"])
 def to_complex64(x, name="ToComplex64"):
   """Casts a tensor to type `complex64`.
 
@@ -822,7 +767,8 @@ def to_complex64(x, name="ToComplex64"):
   return cast(x, dtypes.complex64, name=name)
 
 
-@tf_export("to_complex128")
+@deprecation.deprecated(date=None, instructions="Use tf.cast instead.")
+@tf_export(v1=["to_complex128"])
 def to_complex128(x, name="ToComplex128"):
   """Casts a tensor to type `complex128`.
 
@@ -1000,6 +946,7 @@ def _div_python2(x, y, name=None):
 
 
 @tf_export("math.truediv", "truediv")
+@dispatch.add_dispatch_support
 def truediv(x, y, name=None):
   """Divides x / y elementwise (using Python 3 division operator semantics).
 
@@ -1031,7 +978,10 @@ def truediv(x, y, name=None):
   return _truediv_python3(x, y, name)
 
 
-@tf_export("div")
+@deprecation.deprecated(
+    date=None,
+    instructions="Deprecated in favor of operator or tf.math.divide.")
+@tf_export(v1=["div"])
 def div(x, y, name=None):
   """Divides x / y elementwise (using Python 2 division operator semantics).
 
@@ -1054,6 +1004,7 @@ def div(x, y, name=None):
 
 
 @tf_export("div_no_nan")
+@dispatch.add_dispatch_support
 def div_no_nan(x, y, name=None):
   """Computes an unsafe divide which returns 0 if the y is zero.
 
@@ -1082,7 +1033,8 @@ mod = gen_math_ops.floor_mod
 
 # TODO(aselle): Deprecate this once all internal functionality uses
 # tf.truncatediv
-@tf_export("math.floordiv", "floordiv")
+@tf_export("math.floordiv", v1=["math.floordiv", "floordiv"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("floordiv")
 def floordiv(x, y, name=None):
   """Divides `x / y` elementwise, rounding toward the most negative integer.
@@ -1112,16 +1064,11 @@ def floordiv(x, y, name=None):
 
 
 realdiv = gen_math_ops.real_div
-tf_export("realdiv")(realdiv)
 truncatediv = gen_math_ops.truncate_div
-tf_export("truncatediv")(truncatediv)
 # TODO(aselle): Rename this to floordiv when we can.
 floor_div = gen_math_ops.floor_div
-tf_export("floor_div")(floor_div)
 truncatemod = gen_math_ops.truncate_mod
-tf_export("truncatemod")(truncatemod)
 floormod = gen_math_ops.floor_mod
-tf_export("floormod", "mod")(floormod)
 
 
 def _mul_dispatch(x, y, name=None):
@@ -1156,7 +1103,8 @@ _OverrideBinaryOperatorHelper(gen_math_ops.floor_mod, "mod")
 _OverrideBinaryOperatorHelper(pow, "pow")
 
 
-@tf_export("math.logical_xor", "logical_xor")
+@tf_export("math.logical_xor", v1=["math.logical_xor", "logical_xor"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("logical_xor")
 def logical_xor(x, y, name="LogicalXor"):
   """x ^ y = (x | y) & ~(x & y)."""
@@ -1252,7 +1200,7 @@ def range(start, limit=None, delta=1, dtype=None, name="range"):  # pylint: disa
 
 
 # Reduction operations
-def _ReductionDims(x, axis, reduction_indices):
+def _ReductionDims(x, axis, reduction_indices=None):  # pylint: disable=invalid-name
   """Returns range(0, rank(x)) if reduction_indices is None."""
   # TODO(aselle): Remove this after deprecation
   if reduction_indices is not None:
@@ -1267,31 +1215,31 @@ def _ReductionDims(x, axis, reduction_indices):
     if rank is not None:
       return constant_op.constant(np.arange(rank), dtype=dtypes.int32)
     if (isinstance(x, sparse_tensor.SparseTensor) and
-        x.dense_shape.get_shape().is_fully_defined()):
-      rank = x.dense_shape.get_shape()[0].value  # sparse.dense_shape is 1-D.
+        x.dense_shape.shape.is_fully_defined()):
+      rank = x.dense_shape.shape.dims[0].value  # sparse.dense_shape is 1-D.
       return constant_op.constant(np.arange(rank), dtype=dtypes.int32)
 
     # Otherwise, we rely on Range and Rank to do the right thing at run-time.
     return range(0, array_ops.rank(x))
 
 
-def _may_reduce_to_scalar(keepdims, axis, reduction_indices, output):
+def _may_reduce_to_scalar(keepdims, axis, output):
   """Set a reduction's output shape to be a scalar if we are certain."""
   if not common_shapes.has_fully_defined_shape(output) and (not keepdims) and (
-      axis is None) and (reduction_indices is None):
+      axis is None):
     output.set_shape(())
   return output
 
 
-@tf_export("math.reduce_sum", "reduce_sum")
+@tf_export(v1=["math.reduce_sum", "reduce_sum"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_sum(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
+def reduce_sum_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
   """Computes the sum of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1331,21 +1279,62 @@ def reduce_sum(input_tensor,
   int64 while tensorflow returns the same dtype as the input.
   @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
+  return reduce_sum(input_tensor, axis, keepdims, name)
+
+
+@tf_export("math.reduce_sum", "reduce_sum", v1=[])
+@dispatch.add_dispatch_support
+def reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the sum of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
 
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._sum(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1, 1, 1], [1, 1, 1]])
+  tf.reduce_sum(x)  # 6
+  tf.reduce_sum(x, 0)  # [2, 2, 2]
+  tf.reduce_sum(x, 1)  # [3, 3]
+  tf.reduce_sum(x, 1, keepdims=True)  # [[3], [3]]
+  tf.reduce_sum(x, [0, 1])  # 6
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
 
+  Returns:
+    The reduced tensor, of the same dtype as the input_tensor.
 
-@tf_export("math.count_nonzero", "count_nonzero")
+  @compatibility(numpy)
+  Equivalent to np.sum apart the fact that numpy upcast uint8 and int32 to
+  int64 while tensorflow returns the same dtype as the input.
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._sum(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
+@tf_export(v1=["math.count_nonzero", "count_nonzero"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def count_nonzero(input_tensor,
@@ -1406,32 +1395,89 @@ def count_nonzero(input_tensor,
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis,
+      "reduction_indices", reduction_indices
+      )
+
+  return count_nonzero_v2(input_tensor, axis, keepdims, dtype, name)
+
+
+@tf_export("math.count_nonzero", v1=[])
+def count_nonzero_v2(input,  # pylint: disable=redefined-builtin
+                     axis=None,
+                     keepdims=None,
+                     dtype=dtypes.int64,
+                     name=None):
+  """Computes number of nonzero elements across dimensions of a tensor.
+
+  Reduces `input` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  **NOTE** Floating point comparison to zero is done by exact floating point
+  equality check.  Small values are **not** rounded to zero for purposes of
+  the nonzero check.
+
+  For example:
+
+  ```python
+  x = tf.constant([[0, 1, 0], [1, 1, 0]])
+  tf.count_nonzero(x)  # 3
+  tf.count_nonzero(x, 0)  # [1, 2, 0]
+  tf.count_nonzero(x, 1)  # [1, 2]
+  tf.count_nonzero(x, 1, keepdims=True)  # [[1], [2]]
+  tf.count_nonzero(x, [0, 1])  # 3
+  ```
+
+  **NOTE** Strings are compared against zero-length empty string `""`. Any
+  string with a size greater than zero is already considered as nonzero.
+
+  For example:
+  ```python
+  x = tf.constant(["", "a", "  ", "b", ""])
+  tf.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
+  ```
+
+  Args:
+    input: The tensor to reduce. Should be of numeric type, `bool`,
+      or `string`.
+    axis: The dimensions to reduce. If `None` (the default),
+      reduces all dimensions. Must be in the range
+      `[-rank(input), rank(input))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    dtype: The output dtype; defaults to `tf.int64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor (number of nonzero values).
+  """
   if keepdims is None:
     keepdims = False
-
-  with ops.name_scope(name, "count_nonzero", [input_tensor]):
-    input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
+  with ops.name_scope(name, "count_nonzero", [input]):
+    input = ops.convert_to_tensor(input, name="input")
     # A scalar of 'zero' is enough as `not_equal` will broadcast.
-    zero = array_ops.zeros([], dtype=input_tensor.dtype)
+    zero = array_ops.zeros([], dtype=input.dtype)
     return cast(
         reduce_sum(
             # int64 reduction happens on GPU
-            to_int64(gen_math_ops.not_equal(input_tensor, zero)),
+            to_int64(gen_math_ops.not_equal(input, zero)),
             axis=axis,
-            keepdims=keepdims,
-            reduction_indices=reduction_indices),
+            keepdims=keepdims),
         dtype=dtype)
 
 
-@tf_export("math.reduce_mean", "reduce_mean")
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_mean(input_tensor,
-                axis=None,
-                keepdims=None,
-                name=None,
-                reduction_indices=None,
-                keep_dims=None):
+@tf_export(v1=["math.reduce_mean", "reduce_mean"])
+def reduce_mean_v1(input_tensor,
+                   axis=None,
+                   keepdims=None,
+                   name=None,
+                   reduction_indices=None,
+                   keep_dims=None):
   """Computes the mean of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1481,29 +1527,168 @@ def reduce_mean(input_tensor,
 
   @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
+  return reduce_mean(input_tensor, axis, keepdims, name)
 
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops.mean(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
 
+@tf_export("math.reduce_mean", "reduce_mean", v1=[])
+@dispatch.add_dispatch_support
+def reduce_mean(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the mean of elements across dimensions of a tensor.
 
-@tf_export("math.reduce_prod", "reduce_prod")
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_prod(input_tensor,
-                axis=None,
-                keepdims=None,
-                name=None,
-                reduction_indices=None,
-                keep_dims=None):
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1., 1.], [2., 2.]])
+  tf.reduce_mean(x)  # 1.5
+  tf.reduce_mean(x, 0)  # [1.5, 1.5]
+  tf.reduce_mean(x, 1)  # [1.,  2.]
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.mean
+
+  Please note that `np.mean` has a `dtype` parameter that could be used to
+  specify the output type. By default this is `dtype=float64`. On the other
+  hand, `tf.reduce_mean` has an aggressive type inference from `input_tensor`,
+  for example:
+
+  ```python
+  x = tf.constant([1, 0, 1, 0])
+  tf.reduce_mean(x)  # 0
+  y = tf.constant([1., 0., 1., 0.])
+  tf.reduce_mean(y)  # 0.5
+  ```
+
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops.mean(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
+@tf_export("math.reduce_variance")
+def reduce_variance(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the variance of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1., 2.], [3., 4.]])
+  tf.reduce_variance(x)  # 1.25
+  tf.reduce_variance(x, 0)  # [1., 1.]
+  tf.reduce_variance(x, 1)  # [0.25,  0.25]
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name scope for the associated operations (optional).
+
+  Returns:
+    The reduced tensor, of the same dtype as the input_tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.var
+
+  Please note that `np.var` has a `dtype` parameter that could be used to
+  specify the output type. By default this is `dtype=float64`. On the other
+  hand, `tf.reduce_variance` has an aggressive type inference from
+  `input_tensor`,
+  @end_compatibility
+  """
+  name = name if name else "reduce_variance"
+  with ops.name_scope(name):
+    means = reduce_mean(input_tensor, axis=axis, keepdims=True)
+    squared_deviations = gen_math_ops.square(input_tensor - means)
+    return reduce_mean(squared_deviations, axis=axis, keepdims=keepdims)
+
+
+@tf_export("math.reduce_std")
+def reduce_std(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the standard deviation of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1., 2.], [3., 4.]])
+  tf.reduce_std(x)  # 1.1180339887498949
+  tf.reduce_std(x, 0)  # [1., 1.]
+  tf.reduce_std(x, 1)  # [0.5,  0.5]
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name scope for the associated operations (optional).
+
+  Returns:
+    The reduced tensor, of the same dtype as the input_tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.std
+
+  Please note that `np.std` has a `dtype` parameter that could be used to
+  specify the output type. By default this is `dtype=float64`. On the other
+  hand, `tf.reduce_std` has an aggressive type inference from `input_tensor`,
+  @end_compatibility
+  """
+  name = name if name else "reduce_std"
+  with ops.name_scope(name):
+    variance = reduce_variance(input_tensor, axis=axis, keepdims=keepdims)
+    return gen_math_ops.sqrt(variance)
+
+
+@tf_export("math.reduce_prod", "reduce_prod", v1=[])
+@dispatch.add_dispatch_support
+def reduce_prod(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the product of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1521,6 +1706,48 @@ def reduce_prod(input_tensor,
       `[-rank(input_tensor), rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.prod
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops.prod(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
+@tf_export(v1=["math.reduce_prod", "reduce_prod"])
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+def reduce_prod_v1(input_tensor,
+                   axis=None,
+                   keepdims=None,
+                   name=None,
+                   reduction_indices=None,
+                   keep_dims=None):
+  """Computes the product of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
     keep_dims: Deprecated alias for `keepdims`.
 
@@ -1531,29 +1758,22 @@ def reduce_prod(input_tensor,
   Equivalent to np.prod
   @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops.prod(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  return reduce_prod(input_tensor, axis, keepdims, name)
 
 
-@tf_export("math.reduce_min", "reduce_min")
+@tf_export(v1=["math.reduce_min", "reduce_min"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_min(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
+def reduce_min_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
   """Computes the minimum of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1566,9 +1786,9 @@ def reduce_min(input_tensor,
 
   Args:
     input_tensor: The tensor to reduce. Should have real numeric type.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1581,28 +1801,58 @@ def reduce_min(input_tensor,
   Equivalent to np.min
   @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._min(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  return reduce_min(input_tensor, axis, keepdims, name)
+
+
+@tf_export("math.reduce_min", "reduce_min", v1=[])
+@dispatch.add_dispatch_support
+def reduce_min(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the minimum of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  Args:
+    input_tensor: The tensor to reduce. Should have real numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
 
+  Returns:
+    The reduced tensor.
 
-@tf_export("math.reduce_max", "reduce_max")
+  @compatibility(numpy)
+  Equivalent to np.min
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._min(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
+@tf_export(v1=["math.reduce_max", "reduce_max"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_max(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
+def reduce_max_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
   """Computes the maximum of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1627,32 +1877,159 @@ def reduce_max(input_tensor,
     The reduced tensor.
 
   @compatibility(numpy)
-  Equivalent to np.max
+  Equivalent to np.max
+  @end_compatibility
+  """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  return reduce_max(input_tensor, axis, keepdims, name)
+
+
+@tf_export("math.reduce_max", "reduce_max", v1=[])
+@dispatch.add_dispatch_support
+def reduce_max(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the maximum of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  Args:
+    input_tensor: The tensor to reduce. Should have real numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.max
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._max(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
+@tf_export(v1=["math.reduce_all", "reduce_all"])
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+def reduce_all_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
+  """Computes the "logical and" of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[True,  True], [False, False]])
+  tf.reduce_all(x)  # False
+  tf.reduce_all(x, 0)  # [False, False]
+  tf.reduce_all(x, 1)  # [True, False]
+  ```
+
+  Args:
+    input_tensor: The boolean tensor to reduce.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+    reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.all
+  @end_compatibility
+  """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  return reduce_all(input_tensor, axis, keepdims, name)
+
+
+@tf_export("reduce_all", "math.reduce_all", v1=[])
+@dispatch.add_dispatch_support
+def reduce_all(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the "logical and" of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[True,  True], [False, False]])
+  tf.reduce_all(x)  # False
+  tf.reduce_all(x, 0)  # [False, False]
+  tf.reduce_all(x, 1)  # [True, False]
+  ```
+
+  Args:
+    input_tensor: The boolean tensor to reduce.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.all
   @end_compatibility
   """
-  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
-                                                    "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._max(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._all(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
 
 
-@tf_export("math.reduce_all", "reduce_all")
+@tf_export(v1=["math.reduce_any", "reduce_any"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_all(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
-  """Computes the "logical and" of elements across dimensions of a tensor.
+def reduce_any_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
+  """Computes the "logical or" of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
   Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
@@ -1666,16 +2043,16 @@ def reduce_all(input_tensor,
 
   ```python
   x = tf.constant([[True,  True], [False, False]])
-  tf.reduce_all(x)  # False
-  tf.reduce_all(x, 0)  # [False, False]
-  tf.reduce_all(x, 1)  # [True, False]
+  tf.reduce_any(x)  # True
+  tf.reduce_any(x, 0)  # [True, True]
+  tf.reduce_any(x, 1)  # [True, False]
   ```
 
   Args:
     input_tensor: The boolean tensor to reduce.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1685,31 +2062,19 @@ def reduce_all(input_tensor,
     The reduced tensor.
 
   @compatibility(numpy)
-  Equivalent to np.all
+  Equivalent to np.any
   @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._all(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  return reduce_any(input_tensor, axis, keepdims, name)
 
 
-@tf_export("math.reduce_any", "reduce_any")
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_any(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
+@tf_export("math.reduce_any", "reduce_any", v1=[])
+@dispatch.add_dispatch_support
+def reduce_any(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the "logical or" of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1731,13 +2096,11 @@ def reduce_any(input_tensor,
 
   Args:
     input_tensor: The boolean tensor to reduce.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
-    reduction_indices: The old (deprecated) name for axis.
-    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1746,28 +2109,23 @@ def reduce_any(input_tensor,
   Equivalent to np.any
   @end_compatibility
   """
-  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
-                                                    "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._any(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._any(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
 
 
-@tf_export("math.reduce_logsumexp", "reduce_logsumexp")
+@tf_export(v1=["math.reduce_logsumexp", "reduce_logsumexp"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_logsumexp(input_tensor,
-                     axis=None,
-                     keepdims=None,
-                     name=None,
-                     reduction_indices=None,
-                     keep_dims=None):
+def reduce_logsumexp_v1(input_tensor,
+                        axis=None,
+                        keepdims=None,
+                        name=None,
+                        reduction_indices=None,
+                        keep_dims=None):
   """Computes log(sum(exp(elements across dimensions of a tensor))).
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1795,9 +2153,9 @@ def reduce_logsumexp(input_tensor,
 
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1806,16 +2164,57 @@ def reduce_logsumexp(input_tensor,
   Returns:
     The reduced tensor.
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
+  return reduce_logsumexp(input_tensor, axis, keepdims, name)
+
+
+@tf_export("math.reduce_logsumexp", "reduce_logsumexp", v1=[])
+def reduce_logsumexp(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes log(sum(exp(elements across dimensions of a tensor))).
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  This function is more numerically stable than log(sum(exp(input))). It avoids
+  overflows caused by taking the exp of large inputs and underflows caused by
+  taking the log of small inputs.
+
+  For example:
+
+  ```python
+  x = tf.constant([[0., 0., 0.], [0., 0., 0.]])
+  tf.reduce_logsumexp(x)  # log(6)
+  tf.reduce_logsumexp(x, 0)  # [log(2), log(2), log(2)]
+  tf.reduce_logsumexp(x, 1)  # [log(3), log(3)]
+  tf.reduce_logsumexp(x, 1, keepdims=True)  # [[log(3)], [log(3)]]
+  tf.reduce_logsumexp(x, [0, 1])  # log(6)
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor.
+  """
+  keepdims = False if keepdims is None else keepdims
   input_tensor = ops.convert_to_tensor(input_tensor)
   with ops.name_scope(name, "ReduceLogSumExp", [input_tensor]) as name:
     raw_max = reduce_max(
         input_tensor,
         axis=axis,
-        reduction_indices=reduction_indices,
         keepdims=True)
     my_max = array_ops.stop_gradient(
         array_ops.where(
@@ -1825,15 +2224,14 @@ def reduce_logsumexp(input_tensor,
         reduce_sum(
             gen_math_ops.exp(gen_math_ops.sub(input_tensor, my_max)),
             axis,
-            keepdims=keepdims,
-            reduction_indices=reduction_indices))
+            keepdims=keepdims))
     if not keepdims:
       my_max = array_ops.reshape(my_max, array_ops.shape(result))
     result = gen_math_ops.add(result, my_max)
-    return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result)
+    return _may_reduce_to_scalar(keepdims, axis, result)
 
 
-@tf_export("linalg.trace", "trace")
+@tf_export("linalg.trace", v1=["linalg.trace", "trace"])
 @deprecation.deprecated_endpoints("trace")
 def trace(x, name=None):
   """Compute the trace of a tensor `x`.
@@ -2057,10 +2455,109 @@ def matmul(a,
           a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
 
 
+@tf_export("linalg.matvec")
+def matvec(a,
+           b,
+           transpose_a=False,
+           adjoint_a=False,
+           a_is_sparse=False,
+           b_is_sparse=False,
+           name=None):
+  """Multiplies matrix `a` by vector `b`, producing `a` * `b`.
+
+  The matrix `a` must, following any transpositions, be a tensor of rank >= 2,
+  and we must have `shape(b) = shape(a)[:-2] + [shape(a)[-1]]`.
+
+  Both `a` and `b` must be of the same type. The supported types are:
+  `float16`, `float32`, `float64`, `int32`, `complex64`, `complex128`.
+
+  Matrix `a` can be transposed or adjointed (conjugated and transposed) on
+  the fly by setting one of the corresponding flag to `True`. These are `False`
+  by default.
+
+  If one or both of the inputs contain a lot of zeros, a more efficient
+  multiplication algorithm can be used by setting the corresponding
+  `a_is_sparse` or `b_is_sparse` flag to `True`. These are `False` by default.
+  This optimization is only available for plain matrices/vectors (rank-2/1
+  tensors) with datatypes `bfloat16` or `float32`.
+
+  For example:
+
+  ```python
+  # 2-D tensor `a`
+  # [[1, 2, 3],
+  #  [4, 5, 6]]
+  a = tf.constant([1, 2, 3, 4, 5, 6], shape=[2, 3])
+
+  # 1-D tensor `b`
+  # [7, 9, 11]
+  b = tf.constant([7, 9, 11], shape=[3])
+
+  # `a` * `b`
+  # [ 58,  64]
+  c = tf.matvec(a, b)
+
+
+  # 3-D tensor `a`
+  # [[[ 1,  2,  3],
+  #   [ 4,  5,  6]],
+  #  [[ 7,  8,  9],
+  #   [10, 11, 12]]]
+  a = tf.constant(np.arange(1, 13, dtype=np.int32),
+                  shape=[2, 2, 3])
+
+  # 2-D tensor `b`
+  # [[13, 14, 15],
+  #  [16, 17, 18]]
+  b = tf.constant(np.arange(13, 19, dtype=np.int32),
+                  shape=[2, 3])
+
+  # `a` * `b`
+  # [[ 86, 212],
+  #  [410, 563]]
+  c = tf.matvec(a, b)
+  ```
+
+  Args:
+    a: `Tensor` of type `float16`, `float32`, `float64`, `int32`, `complex64`,
+      `complex128` and rank > 1.
+    b: `Tensor` with same type and rank = `rank(a) - 1`.
+    transpose_a: If `True`, `a` is transposed before multiplication.
+    adjoint_a: If `True`, `a` is conjugated and transposed before
+      multiplication.
+    a_is_sparse: If `True`, `a` is treated as a sparse matrix.
+    b_is_sparse: If `True`, `b` is treated as a sparse matrix.
+    name: Name for the operation (optional).
+
+  Returns:
+    A `Tensor` of the same type as `a` and `b` where each inner-most vector is
+    the product of the corresponding matrices in `a` and vectors in `b`, e.g. if
+    all transpose or adjoint attributes are `False`:
+
+    `output`[..., i] = sum_k (`a`[..., i, k] * `b`[..., k]), for all indices i.
+
+    Note: This is matrix-vector product, not element-wise product.
+
+
+  Raises:
+    ValueError: If transpose_a and adjoint_a are both set to True.
+  """
+  with ops.name_scope(name, "MatVec", [a, b]) as name:
+    output = matmul(
+        a,
+        array_ops.expand_dims(b, axis=-1),
+        transpose_a=transpose_a,
+        adjoint_a=adjoint_a,
+        a_is_sparse=a_is_sparse,
+        b_is_sparse=b_is_sparse)
+    return array_ops.squeeze(output, axis=-1)
+
+
 _OverrideBinaryOperatorHelper(matmul, "matmul")
 
-sparse_matmul = gen_math_ops.sparse_mat_mul
-tf_export("sparse_matmul")(sparse_matmul)
+sparse_matmul = deprecation.deprecated(None, "Use `tf.linalg.matmul` instead")(
+    gen_math_ops.sparse_mat_mul)
+tf_export(v1=["sparse_matmul"])(sparse_matmul)
 
 
 @ops.RegisterStatistics("MatMul", "flops")
@@ -2139,6 +2636,7 @@ def _as_indexed_slices_list(inputs, optimize=True):
 
 
 @tf_export("math.add_n", "add_n")
+@dispatch.add_dispatch_support
 def add_n(inputs, name=None):
   """Adds all input tensors element-wise.
 
@@ -2173,7 +2671,7 @@ def add_n(inputs, name=None):
   return gen_math_ops.add_n(inputs, name=name)
 
 
-@tf_export("math.accumulate_n", "accumulate_n")
+@tf_export("math.accumulate_n", v1=["math.accumulate_n", "accumulate_n"])
 @deprecation.deprecated_endpoints("accumulate_n")
 def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   """Returns the element-wise sum of a list of tensors.
@@ -2283,7 +2781,8 @@ def sigmoid(x, name=None):
     return gen_math_ops.sigmoid(x, name=name)
 
 
-@tf_export("math.log_sigmoid", "log_sigmoid")
+@tf_export("math.log_sigmoid", v1=["math.log_sigmoid", "log_sigmoid"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("log_sigmoid")
 def log_sigmoid(x, name=None):
   """Computes log sigmoid of `x` element-wise.
@@ -2303,34 +2802,64 @@ def log_sigmoid(x, name=None):
     return gen_math_ops.neg(gen_nn_ops.softplus(-x), name=name)
 
 
-@tf_export("math.tanh", "nn.tanh", "tanh")
-def tanh(x, name=None):
-  """Computes hyperbolic tangent of `x` element-wise.
+@tf_export("math.bincount", v1=[])
+def bincount(arr,
+             weights=None,
+             minlength=None,
+             maxlength=None,
+             dtype=dtypes.int32,
+             name=None):
+  """Counts the number of occurrences of each value in an integer array.
+
+  If `minlength` and `maxlength` are not given, returns a vector with length
+  `tf.reduce_max(arr) + 1` if `arr` is non-empty, and length 0 otherwise.
+  If `weights` are non-None, then index `i` of the output stores the sum of the
+  value in `weights` at each index where the corresponding value in `arr` is
+  `i`.
 
   Args:
-    x: A Tensor or SparseTensor with type `float16`, `float32`, `double`,
-      `complex64`, or `complex128`.
-    name: A name for the operation (optional).
+    arr: An int32 tensor of non-negative values.
+    weights: If non-None, must be the same shape as arr. For each value in
+      `arr`, the bin will be incremented by the corresponding weight instead of
+      1.
+    minlength: If given, ensures the output has length at least `minlength`,
+      padding with zeros at the end if necessary.
+    maxlength: If given, skips values in `arr` that are equal or greater than
+      `maxlength`, ensuring that the output has length at most `maxlength`.
+    dtype: If `weights` is None, determines the type of the output bins.
+    name: A name scope for the associated operations (optional).
 
   Returns:
-    A Tensor or SparseTensor respectively with the same type as `x`.
+    A vector with the same dtype as `weights` or the given `dtype`. The bin
+    values.
   """
-  with ops.name_scope(name, "Tanh", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_tanh = gen_math_ops.tanh(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_tanh, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.tanh(x, name=name)
-
-
-@tf_export("math.bincount", "bincount")
+  name = "bincount" if name is None else name
+  with ops.name_scope(name):
+    arr = ops.convert_to_tensor(arr, name="arr", dtype=dtypes.int32)
+    array_is_nonempty = reduce_prod(array_ops.shape(arr)) > 0
+    output_size = cast(array_is_nonempty, dtypes.int32) * (reduce_max(arr) + 1)
+    if minlength is not None:
+      minlength = ops.convert_to_tensor(
+          minlength, name="minlength", dtype=dtypes.int32)
+      output_size = gen_math_ops.maximum(minlength, output_size)
+    if maxlength is not None:
+      maxlength = ops.convert_to_tensor(
+          maxlength, name="maxlength", dtype=dtypes.int32)
+      output_size = gen_math_ops.minimum(maxlength, output_size)
+    if weights is not None:
+      weights = ops.convert_to_tensor(weights, name="weights")
+      return gen_math_ops.unsorted_segment_sum(weights, arr, output_size)
+    weights = constant_op.constant([], dtype)
+    return gen_math_ops.bincount(arr, output_size, weights)
+
+
+@tf_export(v1=["math.bincount", "bincount"])
 @deprecation.deprecated_endpoints("bincount")
-def bincount(arr,
-             weights=None,
-             minlength=None,
-             maxlength=None,
-             dtype=dtypes.int32):
+def bincount_v1(arr,
+                weights=None,
+                minlength=None,
+                maxlength=None,
+                dtype=dtypes.int32):
   """Counts the number of occurrences of each value in an integer array.
 
   If `minlength` and `maxlength` are not given, returns a vector with length
@@ -2342,34 +2871,19 @@ def bincount(arr,
   Args:
     arr: An int32 tensor of non-negative values.
     weights: If non-None, must be the same shape as arr. For each value in
-        `arr`, the bin will be incremented by the corresponding weight instead
-        of 1.
+      `arr`, the bin will be incremented by the corresponding weight instead of
+      1.
     minlength: If given, ensures the output has length at least `minlength`,
-        padding with zeros at the end if necessary.
+      padding with zeros at the end if necessary.
     maxlength: If given, skips values in `arr` that are equal or greater than
-        `maxlength`, ensuring that the output has length at most `maxlength`.
+      `maxlength`, ensuring that the output has length at most `maxlength`.
     dtype: If `weights` is None, determines the type of the output bins.
 
   Returns:
     A vector with the same dtype as `weights` or the given `dtype`. The bin
     values.
   """
-  arr = ops.convert_to_tensor(arr, name="arr", dtype=dtypes.int32)
-  array_is_nonempty = reduce_prod(array_ops.shape(arr)) > 0
-  output_size = cast(array_is_nonempty, dtypes.int32) * (reduce_max(arr) + 1)
-  if minlength is not None:
-    minlength = ops.convert_to_tensor(
-        minlength, name="minlength", dtype=dtypes.int32)
-    output_size = gen_math_ops.maximum(minlength, output_size)
-  if maxlength is not None:
-    maxlength = ops.convert_to_tensor(
-        maxlength, name="maxlength", dtype=dtypes.int32)
-    output_size = gen_math_ops.minimum(maxlength, output_size)
-  if weights is not None:
-    weights = ops.convert_to_tensor(weights, name="weights")
-    return gen_math_ops.unsorted_segment_sum(weights, arr, output_size)
-  weights = constant_op.constant([], dtype)
-  return gen_math_ops.bincount(arr, output_size, weights)
+  return bincount(arr, weights, minlength, maxlength, dtype)
 
 
 @tf_export("math.cumsum", "cumsum")
@@ -2424,7 +2938,7 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None):
         x, axis, exclusive=exclusive, reverse=reverse, name=name)
 
 
-@tf_export("math.cumprod", "cumprod")
+@tf_export("math.cumprod", v1=["math.cumprod", "cumprod"])
 @deprecation.deprecated_endpoints("cumprod")
 def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
   """Compute the cumulative product of the tensor `x` along `axis`.
@@ -2477,7 +2991,8 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
         x, axis, exclusive=exclusive, reverse=reverse, name=name)
 
 
-@tf_export("math.conj", "conj")
+@tf_export("math.conj", v1=["math.conj", "conj"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("conj")
 def conj(x, name=None):
   r"""Returns the complex conjugate of a complex number.
@@ -2578,8 +3093,11 @@ def _unsorted_segment_N(data, segment_ids, num_segments):
   return gen_math_ops.maximum(N, 1)
 
 
-@tf_export("math.unsorted_segment_mean", "unsorted_segment_mean")
+@tf_export(
+    "math.unsorted_segment_mean",
+    v1=["math.unsorted_segment_mean", "unsorted_segment_mean"])
 @deprecation.deprecated_endpoints("unsorted_segment_mean")
+@dispatch.add_dispatch_support
 def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
   r"""Computes the mean along segments of a tensor.
 
@@ -2621,8 +3139,11 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
     return summed / N
 
 
-@tf_export("math.unsorted_segment_sqrt_n", "unsorted_segment_sqrt_n")
+@tf_export(
+    "math.unsorted_segment_sqrt_n",
+    v1=["math.unsorted_segment_sqrt_n", "unsorted_segment_sqrt_n"])
 @deprecation.deprecated_endpoints("unsorted_segment_sqrt_n")
+@dispatch.add_dispatch_support
 def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
   r"""Computes the sum along segments of a tensor divided by the sqrt(N).
 
@@ -2667,7 +3188,7 @@ def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
     return summed / gen_math_ops.sqrt(N)
 
 
-@tf_export("sparse.segment_sum", "sparse_segment_sum")
+@tf_export(v1=["sparse.segment_sum", "sparse_segment_sum"])
 @deprecation.deprecated_endpoints("sparse_segment_sum")
 def sparse_segment_sum(data, indices, segment_ids, name=None,
                        num_segments=None):
@@ -2741,7 +3262,17 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
-@tf_export("sparse.segment_mean", "sparse_segment_mean")
+@tf_export("sparse.segment_sum", v1=[])
+def sparse_segment_sum_v2(data,
+                          indices,
+                          segment_ids,
+                          num_segments=None,
+                          name=None):
+  return sparse_segment_mean(
+      data, indices, segment_ids, name=name, num_segments=num_segments)
+
+
+@tf_export(v1=["sparse.segment_mean", "sparse_segment_mean"])
 @deprecation.deprecated_endpoints("sparse_segment_mean")
 def sparse_segment_mean(data,
                         indices,
@@ -2787,7 +3318,44 @@ def sparse_segment_mean(data,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
-@tf_export("sparse.segment_sqrt_n", "sparse_segment_sqrt_n")
+@tf_export("sparse.segment_mean", v1=[])
+def sparse_segment_mean_v2(data,
+                           indices,
+                           segment_ids,
+                           num_segments=None,
+                           name=None):
+  r"""Computes the mean along sparse segments of a tensor.
+
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  for an explanation of segments.
+
+  Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+  dimension, selecting a subset of dimension 0, specified by `indices`.
+  `segment_ids` is allowed to have missing ids, in which case the output will
+  be zeros at those indices. In those cases `num_segments` is used to determine
+  the size of the output.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`. Values
+      should be sorted and can be repeated.
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  return sparse_segment_mean(
+      data, indices, segment_ids, name=name, num_segments=num_segments)
+
+
+@tf_export(v1=["sparse.segment_sqrt_n", "sparse_segment_sqrt_n"])
 @deprecation.deprecated_endpoints("sparse_segment_sqrt_n")
 def sparse_segment_sqrt_n(data,
                           indices,
@@ -2825,6 +3393,35 @@ def sparse_segment_sqrt_n(data,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
+@tf_export("sparse.segment_sqrt_n", v1=[])
+def sparse_segment_sqrt_n_v2(data,
+                             indices,
+                             segment_ids,
+                             num_segments=None,
+                             name=None):
+  r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
+
+  `N` is the size of the segment being reduced.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`. Values
+      should be sorted and can be repeated.
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  return sparse_segment_sqrt_n(
+      data, indices, segment_ids, name=name, num_segments=num_segments)
+
+
 @tf_export("tensordot", "linalg.tensordot")
 def tensordot(a, b, axes, name=None):
   r"""Tensor contraction of a and b along specified axes.
@@ -2858,12 +3455,11 @@ def tensordot(a, b, axes, name=None):
     a: `Tensor` of type `float32` or `float64`.
     b: `Tensor` with the same type as `a`.
     axes: Either a scalar `N`, or a list or an `int32` `Tensor` of shape [2, k].
-     If axes is a scalar, sum over the last N axes of a and the first N axes
-     of b in order.
-     If axes is a list or `Tensor` the first and second row contain the set of
-     unique integers specifying axes along which the contraction is computed,
-     for `a` and `b`, respectively. The number of axes for `a` and `b` must
-     be equal.
+      If axes is a scalar, sum over the last N axes of a and the first N axes of
+      b in order. If axes is a list or `Tensor` the first and second row contain
+      the set of unique integers specifying axes along which the contraction is
+      computed, for `a` and `b`, respectively. The number of axes for `a` and
+      `b` must be equal.
     name: A name for the operation (optional).
 
   Returns:
@@ -3035,73 +3631,3 @@ def polyval(coeffs, x, name=None):
     for c in coeffs[1:]:
       p = c + p * x
     return p
-
-
-@tf_export("math.bessel_i0e")
-def bessel_i0e(x, name=None):
-  """Computes the Bessel i0e function of `x` element-wise.
-
-  Exponentially scaled modified Bessel function of order 0 defined as
-  `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
-
-  This function is faster and numerically stabler than `bessel_i0(x)`.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-
-  @compatibility(scipy)
-  Equivalent to scipy.special.i0e
-  @end_compatibility
-  """
-  with ops.name_scope(name, "bessel_i0e", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_i0e = gen_math_ops.bessel_i0e(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_i0e, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.bessel_i0e(x, name=name)
-
-
-@tf_export("math.bessel_i1e")
-def bessel_i1e(x, name=None):
-  """Computes the Bessel i1e function of `x` element-wise.
-
-  Exponentially scaled modified Bessel function of order 1 defined as
-  `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
-
-  This function is faster and numerically stabler than `bessel_i1(x)`.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-
-  @compatibility(scipy)
-  Equivalent to scipy.special.i1e
-  @end_compatibility
-  """
-  with ops.name_scope(name, "bessel_i1e", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_i1e = gen_math_ops.bessel_i1e(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_i1e, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.bessel_i1e(x, name=name)
-
-
-# FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
-# 1.0 API so we leave these here for backwards compatibility.
-fft = gen_spectral_ops.fft
-ifft = gen_spectral_ops.ifft
-fft2d = gen_spectral_ops.fft2d
-ifft2d = gen_spectral_ops.ifft2d
-fft3d = gen_spectral_ops.fft3d
-ifft3d = gen_spectral_ops.ifft3d
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index f051850d9235d8ac3b0ff08644491e6b4e021764..e185dbcd230906270b6c92fe70e6a350c34f030f 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -69,13 +69,34 @@ class ReduceTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "must be at most rank 1"):
       math_ops.reduce_sum(x, axis)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testReduceVar(self):
+    x = np.array([[0, 0, 0], [0, 0, 0]], "float32")
+    self.assertAllClose(self.evaluate(math_ops.reduce_variance(x)), 0)
+    self.assertAllClose(
+        self.evaluate(math_ops.reduce_variance(x, axis=0)), [0, 0, 0])
+
+    x = np.array([[0, 2, 1, 1], [1, 2, 0, 1]], "float32")
+    self.assertAllClose(self.evaluate(math_ops.reduce_variance(x)), 0.5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testReduceStd(self):
+    x = np.array([[0, 0, 0], [0, 0, 0]], "float32")
+    self.assertAllClose(self.evaluate(math_ops.reduce_std(x)), 0)
+    self.assertAllClose(
+        self.evaluate(math_ops.reduce_std(x, axis=0)), [0, 0, 0])
+
+    x = np.array([[1, 2, 1, 1], [1, 1, 0, 1]], "float32")
+    self.assertAllClose(self.evaluate(math_ops.reduce_std(x)), 0.5)
+
 
 class LogSumExpTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testReduceLogSumExp(self):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         y_tf_np = math_ops.reduce_logsumexp(x_np).eval()
         y_np = log(np.sum(exp(x_np)))
         self.assertAllClose(y_tf_np, y_np)
@@ -83,32 +104,34 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
   def testReductionIndices(self):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
-      with self.test_session(use_gpu=True):
-        y_tf = math_ops.reduce_logsumexp(x_np, reduction_indices=[0])
+      with self.cached_session(use_gpu=True):
+        y_tf = math_ops.reduce_logsumexp(x_np, axis=[0])
         y_np = log(np.sum(exp(x_np), axis=0))
         self.assertShapeEqual(y_np, y_tf)
-        y_tf_np = y_tf.eval()
+        y_tf_np = self.evaluate(y_tf)
         self.assertAllClose(y_tf_np, y_np)
 
   def testReductionIndices2(self):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
-      with self.test_session(use_gpu=True):
-        y_tf = math_ops.reduce_logsumexp(x_np, reduction_indices=0)
+      with self.cached_session(use_gpu=True):
+        y_tf = math_ops.reduce_logsumexp(x_np, axis=0)
         y_np = log(np.sum(exp(x_np), axis=0))
         self.assertShapeEqual(y_np, y_tf)
-        y_tf_np = y_tf.eval()
+        y_tf_np = self.evaluate(y_tf)
         self.assertAllClose(y_tf_np, y_np)
 
+  @test_util.run_deprecated_v1
   def testKeepDims(self):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         y_tf_np = math_ops.reduce_logsumexp(x_np, keepdims=True).eval()
         self.assertEqual(y_tf_np.ndim, x_np.ndim)
         y_np = log(np.sum(exp(x_np), keepdims=True))
         self.assertAllClose(y_tf_np, y_np)
 
+  @test_util.run_deprecated_v1
   def testOverflow(self):
     x = [1000, 1001, 1002, 1003]
     for dtype in [np.float16, np.float32, np.double]:
@@ -120,12 +143,13 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
         if out == np.inf:
           raise RuntimeWarning("overflow encountered in exp")
 
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x_tf = constant_op.constant(x_np, shape=x_np.shape)
         y_tf_np = math_ops.reduce_logsumexp(x_tf).eval()
         y_np = log(np.sum(exp(x_np - max_np))) + max_np
         self.assertAllClose(y_tf_np, y_np)
 
+  @test_util.run_deprecated_v1
   def testUnderflow(self):
     x = [-1000, -1001, -1002, -1003]
     for dtype in [np.float16, np.float32, np.double]:
@@ -137,14 +161,15 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
         if out == -np.inf:
           raise RuntimeWarning("divide by zero encountered in log")
 
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         x_tf = constant_op.constant(x_np, shape=x_np.shape)
         y_tf_np = math_ops.reduce_logsumexp(x_tf).eval()
         y_np = log(np.sum(exp(x_np - max_np))) + max_np
         self.assertAllClose(y_tf_np, y_np)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       res = math_ops.reduce_logsumexp(-np.inf).eval()
       self.assertEqual(-np.inf, res)
 
@@ -166,16 +191,17 @@ class RoundTest(test_util.TensorFlowTestCase):
 
 class ModTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     x = [0.5, 0.7, 0.3]
     for dtype in [np.float32, np.double]:
       # Test scalar and vector versions.
       for denom in [x[0], [x[0]] * 3]:
         x_np = np.array(x, dtype=dtype)
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           x_tf = constant_op.constant(x_np, shape=x_np.shape)
           y_tf = math_ops.mod(x_tf, denom)
-          y_tf_np = y_tf.eval()
+          y_tf_np = self.evaluate(y_tf)
           y_np = np.fmod(x_np, denom)
         self.assertAllClose(y_tf_np, y_np, atol=1e-2)
 
@@ -185,10 +211,10 @@ class ModTest(test_util.TensorFlowTestCase):
       # Test scalar and vector versions.
       for denom in [x[0], x]:
         x_np = np.array(x, dtype=dtype)
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           x_tf = constant_op.constant(x_np, shape=x_np.shape)
           y_tf = math_ops.mod(x_tf, denom)
-          y_tf_np = y_tf.eval()
+          y_tf_np = self.evaluate(y_tf)
           y_np = np.mod(x_np, denom)
         self.assertAllClose(y_tf_np, y_np)
 
@@ -197,7 +223,7 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testSquaredDifference(self):
-    for dtype in [np.int32, np.float16]:
+    for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
       x = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)
       y = np.array([-3, -2, -1], dtype=dtype)
       z = (x - y) * (x - y)
@@ -205,6 +231,17 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase):
         z_tf = self.evaluate(math_ops.squared_difference(x, y))
         self.assertAllClose(z, z_tf)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testComplexSquaredDifference(self):
+    for dtype in [np.complex64, np.complex128]:
+      x = np.array([[1 + 3j, 2 + 2j, 3 + 1j], [4 - 1j, 5 - 2j, 6 - 3j]],
+                   dtype=dtype)
+      y = np.array([-3 + 1j, -2 + 2j, -1 + 3j], dtype=dtype)
+      z = np.conj(x - y) * (x - y)
+      with test_util.device(use_gpu=False):
+        z_tf = self.evaluate(math_ops.squared_difference(x, y))
+        self.assertAllClose(z, z_tf)
+
 
 class ApproximateEqualTest(test_util.TensorFlowTestCase):
 
@@ -236,6 +273,7 @@ class ApproximateEqualTest(test_util.TensorFlowTestCase):
         z_tf = self.evaluate(math_ops.approximate_equal(x, y, tolerance=0.0001))
         self.assertAllEqual(z, z_tf)
 
+  @test_util.run_deprecated_v1
   def testApproximateEqualShape(self):
     for dtype in [np.float32, np.double]:
       x = np.array([1, 2], dtype=dtype)
@@ -289,25 +327,28 @@ class ScalarMulTest(test_util.TensorFlowTestCase):
 
 class AccumulateNTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     np.random.seed(12345)
     x = [np.random.random((1, 2, 3, 4, 5)) - 0.5 for _ in range(5)]
     tf_x = ops.convert_n_to_tensor(x)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllClose(sum(x), math_ops.accumulate_n(tf_x).eval())
       self.assertAllClose(x[0] * 5, math_ops.accumulate_n([tf_x[0]] * 5).eval())
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     np.random.seed(54321)
     x = [np.random.randint(-128, 128, (5, 4, 3, 2, 1)) for _ in range(6)]
     tf_x = ops.convert_n_to_tensor(x)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(sum(x), math_ops.accumulate_n(tf_x).eval())
       self.assertAllEqual(x[0] * 6, math_ops.accumulate_n([tf_x[0]] * 6).eval())
 
 
 class AddNTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testPartials(self):
     """Test that previously revealed a bug in buffer forwarding for AddN."""
     partials = []
@@ -318,19 +359,21 @@ class AddNTest(test_util.TensorFlowTestCase):
                         constant_op.constant(1)]))
 
     res = math_ops.add_n(partials) + constant_op.constant(0)
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllEqual(res.eval(), 100)
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     np.random.seed(12345)
     for num_inputs in range(1, 10):
       x = [np.random.random((1, 2, 3, 4, 5)) - 0.5 for _ in range(num_inputs)]
       tf_x = ops.convert_n_to_tensor(x)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         self.assertAllClose(sum(x), math_ops.add_n(tf_x).eval())
         self.assertAllClose(x[0] * num_inputs,
                             math_ops.add_n([tf_x[0]] * num_inputs).eval())
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     np.random.seed(54321)
     for num_inputs in range(1, 10):
@@ -339,21 +382,22 @@ class AddNTest(test_util.TensorFlowTestCase):
           for _ in range(num_inputs)
       ]
       tf_x = ops.convert_n_to_tensor(x)
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         self.assertAllEqual(sum(x), math_ops.add_n(tf_x).eval())
         self.assertAllEqual(x[0] * num_inputs,
                             math_ops.add_n([tf_x[0]] * num_inputs).eval())
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     np.random.seed(42)
     for num_inputs in range(1, 10):
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         input_vars = [
             variables.Variable(10.0 * np.random.random())
             for i in range(0, num_inputs)
         ]
         addn = math_ops.add_n(input_vars)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         add_n_grad = gradients.gradients(addn, input_vars)
         self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1
                             [g.eval() for g in add_n_grad])
@@ -372,6 +416,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
     divs = np.arange(-3, 0, .25).reshape(1, 12)
     return nums, divs
 
+  @test_util.run_deprecated_v1
   def testFloorModInt(self):
     nums, divs = self.intTestData()
     with self.cached_session():
@@ -381,6 +426,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = nums % divs
       self.assertAllEqual(tf_result, np_result)
 
+  @test_util.run_deprecated_v1
   def testFloorModFloat(self):
     nums, divs = self.floatTestData()
     with self.cached_session():
@@ -392,6 +438,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       #               % array_ops.constant(divs)).eval()
       # self.assertAllEqual(tf2_result, tf_result)
 
+  @test_util.run_deprecated_v1
   def testTruncateModInt(self):
     nums, divs = self.intTestData()
     with self.cached_session():
@@ -399,6 +446,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = np.fmod(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
+  @test_util.run_deprecated_v1
   def testTruncateModFloat(self):
     nums, divs = self.floatTestData()
     with self.cached_session():
@@ -406,6 +454,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = np.fmod(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
+  @test_util.run_deprecated_v1
   def testDivideInt(self):
     nums, divs = self.intTestData()
     with self.cached_session():
@@ -417,12 +466,14 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       #               // array_ops.constant(divs)).eval()
       # self.assertAllEqual(tf2_result, tf_result)
 
+  @test_util.run_deprecated_v1
   def testDivideName(self):
     with self.cached_session():
       op = math_ops.divide(
           array_ops.constant(3), array_ops.constant(4), name="my_cool_divide")
       self.assertEqual(op.name, "my_cool_divide:0")
 
+  @test_util.run_deprecated_v1
   def testRealDiv(self):
     nums, divs = self.floatTestData()
     with self.cached_session():
@@ -430,26 +481,30 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = np.divide(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
+  @test_util.run_deprecated_v1
   def testComplexDiv(self):
     foo = array_ops.constant([1. + 3.j])
     with self.cached_session():
       _ = math_ops.divide(foo, 1.).eval()
       _ = math_ops.div(foo, 2.).eval()
 
+  @test_util.run_deprecated_v1
   def testFloorDivGrad(self):
     with self.cached_session():
       a = variables.Variable(2.)
       b = variables.Variable(4.)
       with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         c_grad = gradients.gradients(math_ops.divide(a, b), [a, b])
         self.assertAllEqual([x.eval() for x in c_grad], [.25, -.125])
         c_grad = gradients.gradients(math_ops.div(a, b), [a, b])
         self.assertAllEqual([x.eval() for x in c_grad], [.25, -.125])
         c_grad = gradients.gradients(math_ops.floordiv(a, b), [a, b])
-        self.assertAllEqual([None if x is None else x.eval()
-                             for x in c_grad], [None, None])
+        self.assertAllEqual(
+            [None if x is None else self.evaluate(x) for x in c_grad],
+            [None, None])
 
+  @test_util.run_deprecated_v1
   def testConsistent(self):
     nums, divs = self.intTestData()
     with self.cached_session():
@@ -476,6 +531,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
 
 class DivNoNanTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     for dtype in [np.float32, np.float64]:
       nums = np.arange(-10, 10, .25, dtype=dtype).reshape(80, 1)
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 763877c2d236b4f1f4ddc4032314f3b38e353c75..ec39b1790e340a0d194dea8ab3419ca78fc9d126 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -34,7 +35,6 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -43,25 +43,24 @@ def metric_variable(shape, dtype, validate_shape=True, name=None):
   """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES)` collections.
 
   If running in a `DistributionStrategy` context, the variable will be
-  "tower local". This means:
+  "replica local". This means:
 
   *   The returned object will be a container with separate variables
-      per replica/tower of the model.
+      per replica of the model.
 
   *   When writing to the variable, e.g. using `assign_add` in a metric
       update, the update will be applied to the variable local to the
-      replica/tower.
+      replica.
 
   *   To get a metric's result value, we need to sum the variable values
-      across the replicas/towers before computing the final answer.
-      Furthermore, the final answer should be computed once instead of
-      in every replica/tower. Both of these are accomplished by
-      running the computation of the final result value inside
-      `tf.contrib.distribution_strategy_context.get_tower_context(
-      ).merge_call(fn)`.
+      across the replicas before computing the final answer. Furthermore,
+      the final answer should be computed once instead of in every
+      replica. Both of these are accomplished by running the computation
+      of the final result value inside
+      `distribution_strategy_context.get_replica_context().merge_call(fn)`.
       Inside the `merge_call()`, ops are only added to the graph once
-      and access to a tower-local variable in a computation returns
-      the sum across all replicas/towers.
+      and access to a replica-local variable in a computation returns
+      the sum across all replicas.
 
   Args:
     shape: Shape of the created variable.
@@ -72,7 +71,7 @@ def metric_variable(shape, dtype, validate_shape=True, name=None):
 
   Returns:
     A (non-trainable) variable initialized to zero, or if inside a
-    `DistributionStrategy` scope a tower-local variable container.
+    `DistributionStrategy` scope a replica-local variable container.
   """
   # Note that synchronization "ON_READ" implies trainable=False.
   return variable_scope.variable(
@@ -213,24 +212,6 @@ def _maybe_expand_labels(labels, predictions):
         lambda: array_ops.expand_dims(labels, -1, name=scope), lambda: labels)
 
 
-def _safe_div(numerator, denominator, name):
-  """Divides two tensors element-wise, returning 0 if the denominator is <= 0.
-
-  Args:
-    numerator: A real `Tensor`.
-    denominator: A real `Tensor`, with dtype matching `numerator`.
-    name: Name for the returned op.
-
-  Returns:
-    0 if `denominator` <= 0, else `numerator` / `denominator`
-  """
-  t = math_ops.truediv(numerator, denominator)
-  zero = array_ops.zeros_like(t, dtype=denominator.dtype)
-  condition = math_ops.greater(denominator, zero)
-  zero = math_ops.cast(zero, t.dtype)
-  return array_ops.where(condition, t, zero, name=name)
-
-
 def _safe_scalar_div(numerator, denominator, name):
   """Divides two values, returning 0 if the denominator is 0.
 
@@ -244,12 +225,7 @@ def _safe_scalar_div(numerator, denominator, name):
   """
   numerator.get_shape().with_rank_at_most(1)
   denominator.get_shape().with_rank_at_most(1)
-  return control_flow_ops.cond(
-      math_ops.equal(
-          array_ops.constant(0.0, dtype=dtypes.float64), denominator),
-      lambda: array_ops.constant(0.0, dtype=dtypes.float64),
-      lambda: math_ops.div(numerator, denominator),
-      name=name)
+  return math_ops.div_no_nan(numerator, denominator, name=name)
 
 
 def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
@@ -301,11 +277,11 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
   return total_cm, update_op
 
 
-def _aggregate_across_towers(metrics_collections, metric_value_fn, *args):
-  """Aggregate metric value across towers."""
+def _aggregate_across_replicas(metrics_collections, metric_value_fn, *args):
+  """Aggregate metric value across replicas."""
   def fn(distribution, *a):
     """Call `metric_value_fn` in the correct control flow context."""
-    if hasattr(distribution, '_outer_control_flow_context'):
+    if hasattr(distribution.extended, '_outer_control_flow_context'):
       # If there was an outer context captured before this method was called,
       # then we enter that context to create the metric value op. If the
       # caputred context is `None`, ops.control_dependencies(None) gives the
@@ -318,13 +294,13 @@ def _aggregate_across_towers(metrics_collections, metric_value_fn, *args):
       # once the update ops have been evaluted.
 
       # pylint: disable=protected-access
-      if distribution._outer_control_flow_context is None:
+      if distribution.extended._outer_control_flow_context is None:
         with ops.control_dependencies(None):
           metric_value = metric_value_fn(distribution, *a)
       else:
-        distribution._outer_control_flow_context.Enter()
+        distribution.extended._outer_control_flow_context.Enter()
         metric_value = metric_value_fn(distribution, *a)
-        distribution._outer_control_flow_context.Exit()
+        distribution.extended._outer_control_flow_context.Exit()
         # pylint: enable=protected-access
     else:
       metric_value = metric_value_fn(distribution, *a)
@@ -332,10 +308,11 @@ def _aggregate_across_towers(metrics_collections, metric_value_fn, *args):
       ops.add_to_collections(metrics_collections, metric_value)
     return metric_value
 
-  return distribution_strategy_context.get_tower_context().merge_call(fn, *args)
+  return distribution_strategy_context.get_replica_context().merge_call(
+      fn, args=args)
 
 
-@tf_export('metrics.mean')
+@tf_export(v1=['metrics.mean'])
 def mean(values,
          weights=None,
          metrics_collections=None,
@@ -402,11 +379,13 @@ def mean(values,
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    compute_mean = lambda _, t, c: _safe_div(t, c, 'value')
+    def compute_mean(_, t, c):
+      return math_ops.div_no_nan(t, math_ops.maximum(c, 0), name='value')
 
-    mean_t = _aggregate_across_towers(
+    mean_t = _aggregate_across_replicas(
         metrics_collections, compute_mean, total, count)
-    update_op = _safe_div(update_total_op, update_count_op, 'update_op')
+    update_op = math_ops.div_no_nan(
+        update_total_op, math_ops.maximum(update_count_op, 0), name='update_op')
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
@@ -414,7 +393,7 @@ def mean(values,
     return mean_t, update_op
 
 
-@tf_export('metrics.accuracy')
+@tf_export(v1=['metrics.accuracy'])
 def accuracy(labels,
              predictions,
              weights=None,
@@ -643,10 +622,10 @@ def _confusion_matrix_at_thresholds(labels,
 
 def _aggregate_variable(v, collections):
   f = lambda distribution, value: distribution.read_var(value)
-  return _aggregate_across_towers(collections, f, v)
+  return _aggregate_across_replicas(collections, f, v)
 
 
-@tf_export('metrics.auc')
+@tf_export(v1=['metrics.auc'])
 def auc(labels,
         predictions,
         weights=None,
@@ -778,16 +757,21 @@ def auc(labels,
       """
       dtp = tp[:num_thresholds - 1] - tp[1:]
       p = tp + fp
-      prec_slope = _safe_div(dtp, p[:num_thresholds - 1] - p[1:], 'prec_slope')
+      prec_slope = math_ops.div_no_nan(
+          dtp,
+          math_ops.maximum(p[:num_thresholds - 1] - p[1:], 0),
+          name='prec_slope')
       intercept = tp[1:] - math_ops.multiply(prec_slope, p[1:])
       safe_p_ratio = array_ops.where(
           math_ops.logical_and(p[:num_thresholds - 1] > 0, p[1:] > 0),
-          _safe_div(p[:num_thresholds - 1], p[1:], 'recall_relative_ratio'),
-          array_ops.ones_like(p[1:]))
+          math_ops.div_no_nan(
+              p[:num_thresholds - 1],
+              math_ops.maximum(p[1:], 0),
+              name='recall_relative_ratio'), array_ops.ones_like(p[1:]))
       return math_ops.reduce_sum(
-          _safe_div(
+          math_ops.div_no_nan(
               prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
-              tp[1:] + fn[1:],
+              math_ops.maximum(tp[1:] + fn[1:], 0),
               name='pr_auc_increment'),
           name='interpolate_pr_auc')
 
@@ -835,7 +819,7 @@ def auc(labels,
       return compute_auc(values['tp'], values['fn'], values['tn'], values['fp'],
                          'value')
 
-    auc_value = _aggregate_across_towers(
+    auc_value = _aggregate_across_replicas(
         metrics_collections, compute_auc_value, values)
     update_op = compute_auc(update_ops['tp'], update_ops['fn'],
                             update_ops['tn'], update_ops['fp'], 'update_op')
@@ -846,7 +830,7 @@ def auc(labels,
     return auc_value, update_op
 
 
-@tf_export('metrics.mean_absolute_error')
+@tf_export(v1=['metrics.mean_absolute_error'])
 def mean_absolute_error(labels,
                         predictions,
                         weights=None,
@@ -907,7 +891,7 @@ def mean_absolute_error(labels,
               updates_collections, name or 'mean_absolute_error')
 
 
-@tf_export('metrics.mean_cosine_distance')
+@tf_export(v1=['metrics.mean_cosine_distance'])
 def mean_cosine_distance(labels,
                          predictions,
                          dim,
@@ -964,7 +948,7 @@ def mean_cosine_distance(labels,
       predictions=predictions, labels=labels, weights=weights)
   radial_diffs = math_ops.multiply(predictions, labels)
   radial_diffs = math_ops.reduce_sum(
-      radial_diffs, reduction_indices=[
+      radial_diffs, axis=[
           dim,
       ], keepdims=True)
   mean_distance, update_op = mean(radial_diffs, weights, None, None, name or
@@ -981,7 +965,7 @@ def mean_cosine_distance(labels,
   return mean_distance, update_op
 
 
-@tf_export('metrics.mean_per_class_accuracy')
+@tf_export(v1=['metrics.mean_per_class_accuracy'])
 def mean_per_class_accuracy(labels,
                             predictions,
                             num_classes,
@@ -1068,22 +1052,24 @@ def mean_per_class_accuracy(labels,
     update_count_op = state_ops.scatter_add(count, labels, is_correct)
 
     def compute_mean_accuracy(_, count, total):
-      per_class_accuracy = _safe_div(count, total, None)
+      per_class_accuracy = math_ops.div_no_nan(
+          count, math_ops.maximum(total, 0), name=None)
       mean_accuracy_v = math_ops.reduce_mean(
           per_class_accuracy, name='mean_accuracy')
       return mean_accuracy_v
 
-    mean_accuracy_v = _aggregate_across_towers(
+    mean_accuracy_v = _aggregate_across_replicas(
         metrics_collections, compute_mean_accuracy, count, total)
 
-    update_op = _safe_div(update_count_op, update_total_op, name='update_op')
+    update_op = math_ops.div_no_nan(
+        update_count_op, math_ops.maximum(update_total_op, 0), name='update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
     return mean_accuracy_v, update_op
 
 
-@tf_export('metrics.mean_iou')
+@tf_export(v1=['metrics.mean_iou'])
 def mean_iou(labels,
              predictions,
              num_classes,
@@ -1175,7 +1161,7 @@ def mean_iou(labels,
       return result
 
     # TODO(priyag): Use outside_compilation if in TPU context.
-    mean_iou_v = _aggregate_across_towers(
+    mean_iou_v = _aggregate_across_replicas(
         metrics_collections, compute_mean_iou, total_cm)
 
     if updates_collections:
@@ -1184,7 +1170,7 @@ def mean_iou(labels,
     return mean_iou_v, update_op
 
 
-@tf_export('metrics.mean_relative_error')
+@tf_export(v1=['metrics.mean_relative_error'])
 def mean_relative_error(labels,
                         predictions,
                         normalizer,
@@ -1253,7 +1239,7 @@ def mean_relative_error(labels,
               updates_collections, name or 'mean_relative_error')
 
 
-@tf_export('metrics.mean_squared_error')
+@tf_export(v1=['metrics.mean_squared_error'])
 def mean_squared_error(labels,
                        predictions,
                        weights=None,
@@ -1314,7 +1300,7 @@ def mean_squared_error(labels,
               name or 'mean_squared_error')
 
 
-@tf_export('metrics.mean_tensor')
+@tf_export(v1=['metrics.mean_tensor'])
 def mean_tensor(values,
                 weights=None,
                 metrics_collections=None,
@@ -1385,19 +1371,21 @@ def mean_tensor(values,
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    compute_mean = lambda _, t, c: _safe_div(t, c, 'value')
+    compute_mean = lambda _, t, c: math_ops.div_no_nan(  # pylint: disable=g-long-lambda
+        t, math_ops.maximum(c, 0), name='value')
 
-    mean_t = _aggregate_across_towers(
+    mean_t = _aggregate_across_replicas(
         metrics_collections, compute_mean, total, count)
 
-    update_op = _safe_div(update_total_op, update_count_op, 'update_op')
+    update_op = math_ops.div_no_nan(
+        update_total_op, math_ops.maximum(update_count_op, 0), name='update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
     return mean_t, update_op
 
 
-@tf_export('metrics.percentage_below')
+@tf_export(v1=['metrics.percentage_below'])
 def percentage_below(values,
                      threshold,
                      weights=None,
@@ -1497,7 +1485,7 @@ def _count_condition(values,
   return value_tensor, update_op
 
 
-@tf_export('metrics.false_negatives')
+@tf_export(v1=['metrics.false_negatives'])
 def false_negatives(labels,
                     predictions,
                     weights=None,
@@ -1549,7 +1537,7 @@ def false_negatives(labels,
                             updates_collections)
 
 
-@tf_export('metrics.false_negatives_at_thresholds')
+@tf_export(v1=['metrics.false_negatives_at_thresholds'])
 def false_negatives_at_thresholds(labels,
                                   predictions,
                                   thresholds,
@@ -1605,7 +1593,7 @@ def false_negatives_at_thresholds(labels,
     return fn_value, update_ops['fn']
 
 
-@tf_export('metrics.false_positives')
+@tf_export(v1=['metrics.false_positives'])
 def false_positives(labels,
                     predictions,
                     weights=None,
@@ -1658,7 +1646,7 @@ def false_positives(labels,
                             updates_collections)
 
 
-@tf_export('metrics.false_positives_at_thresholds')
+@tf_export(v1=['metrics.false_positives_at_thresholds'])
 def false_positives_at_thresholds(labels,
                                   predictions,
                                   thresholds,
@@ -1714,7 +1702,7 @@ def false_positives_at_thresholds(labels,
     return fp_value, update_ops['fp']
 
 
-@tf_export('metrics.true_negatives')
+@tf_export(v1=['metrics.true_negatives'])
 def true_negatives(labels,
                    predictions,
                    weights=None,
@@ -1767,7 +1755,7 @@ def true_negatives(labels,
                             updates_collections)
 
 
-@tf_export('metrics.true_negatives_at_thresholds')
+@tf_export(v1=['metrics.true_negatives_at_thresholds'])
 def true_negatives_at_thresholds(labels,
                                  predictions,
                                  thresholds,
@@ -1823,7 +1811,7 @@ def true_negatives_at_thresholds(labels,
     return tn_value, update_ops['tn']
 
 
-@tf_export('metrics.true_positives')
+@tf_export(v1=['metrics.true_positives'])
 def true_positives(labels,
                    predictions,
                    weights=None,
@@ -1876,7 +1864,7 @@ def true_positives(labels,
                             updates_collections)
 
 
-@tf_export('metrics.true_positives_at_thresholds')
+@tf_export(v1=['metrics.true_positives_at_thresholds'])
 def true_positives_at_thresholds(labels,
                                  predictions,
                                  thresholds,
@@ -1932,7 +1920,7 @@ def true_positives_at_thresholds(labels,
     return tp_value, update_ops['tp']
 
 
-@tf_export('metrics.precision')
+@tf_export(v1=['metrics.precision'])
 def precision(labels,
               predictions,
               weights=None,
@@ -2013,11 +2001,11 @@ def precision(labels,
       return array_ops.where(
           math_ops.greater(tp + fp, 0), math_ops.div(tp, tp + fp), 0, name)
 
-    def once_across_towers(_, true_p, false_p):
+    def once_across_replicas(_, true_p, false_p):
       return compute_precision(true_p, false_p, 'value')
 
-    p = _aggregate_across_towers(metrics_collections, once_across_towers,
-                                 true_p, false_p)
+    p = _aggregate_across_replicas(metrics_collections, once_across_replicas,
+                                   true_p, false_p)
 
     update_op = compute_precision(true_positives_update_op,
                                   false_positives_update_op, 'update_op')
@@ -2027,7 +2015,7 @@ def precision(labels,
     return p, update_op
 
 
-@tf_export('metrics.precision_at_thresholds')
+@tf_export(v1=['metrics.precision_at_thresholds'])
 def precision_at_thresholds(labels,
                             predictions,
                             thresholds,
@@ -2094,11 +2082,11 @@ def precision_at_thresholds(labels,
     def compute_precision(tp, fp, name):
       return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name)
 
-    def precision_across_towers(_, values):
+    def precision_across_replicas(_, values):
       return compute_precision(values['tp'], values['fp'], 'value')
 
-    prec = _aggregate_across_towers(
-        metrics_collections, precision_across_towers, values)
+    prec = _aggregate_across_replicas(
+        metrics_collections, precision_across_replicas, values)
 
     update_op = compute_precision(update_ops['tp'], update_ops['fp'],
                                   'update_op')
@@ -2108,7 +2096,7 @@ def precision_at_thresholds(labels,
     return prec, update_op
 
 
-@tf_export('metrics.recall')
+@tf_export(v1=['metrics.recall'])
 def recall(labels,
            predictions,
            weights=None,
@@ -2187,11 +2175,11 @@ def recall(labels,
           math_ops.greater(true_p + false_n, 0),
           math_ops.div(true_p, true_p + false_n), 0, name)
 
-    def once_across_towers(_, true_p, false_n):
+    def once_across_replicas(_, true_p, false_n):
       return compute_recall(true_p, false_n, 'value')
 
-    rec = _aggregate_across_towers(
-        metrics_collections, once_across_towers, true_p, false_n)
+    rec = _aggregate_across_replicas(
+        metrics_collections, once_across_replicas, true_p, false_n)
 
     update_op = compute_recall(true_positives_update_op,
                                false_negatives_update_op, 'update_op')
@@ -2459,7 +2447,7 @@ def _streaming_sparse_false_negative_at_k(labels,
     return var, state_ops.assign_add(var, batch_total_fn, name='update')
 
 
-@tf_export('metrics.recall_at_k')
+@tf_export(v1=['metrics.recall_at_k'])
 def recall_at_k(labels,
                 predictions,
                 k,
@@ -2552,7 +2540,7 @@ def recall_at_k(labels,
         name=scope)
 
 
-@tf_export('metrics.recall_at_top_k')
+@tf_export(v1=['metrics.recall_at_top_k'])
 def recall_at_top_k(labels,
                     predictions_idx,
                     k=None,
@@ -2626,7 +2614,7 @@ def recall_at_top_k(labels,
     def compute_recall(_, tp, fn):
       return math_ops.div(tp, math_ops.add(tp, fn), name=scope)
 
-    metric = _aggregate_across_towers(
+    metric = _aggregate_across_replicas(
         metrics_collections, compute_recall, tp, fn)
 
     update = math_ops.div(
@@ -2636,7 +2624,7 @@ def recall_at_top_k(labels,
     return metric, update
 
 
-@tf_export('metrics.recall_at_thresholds')
+@tf_export(v1=['metrics.recall_at_thresholds'])
 def recall_at_thresholds(labels,
                          predictions,
                          thresholds,
@@ -2701,11 +2689,11 @@ def recall_at_thresholds(labels,
     def compute_recall(tp, fn, name):
       return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name)
 
-    def recall_across_towers(_, values):
+    def recall_across_replicas(_, values):
       return compute_recall(values['tp'], values['fn'], 'value')
 
-    rec = _aggregate_across_towers(
-        metrics_collections, recall_across_towers, values)
+    rec = _aggregate_across_replicas(
+        metrics_collections, recall_across_replicas, values)
 
     update_op = compute_recall(update_ops['tp'], update_ops['fn'], 'update_op')
     if updates_collections:
@@ -2714,7 +2702,7 @@ def recall_at_thresholds(labels,
     return rec, update_op
 
 
-@tf_export('metrics.root_mean_squared_error')
+@tf_export(v1=['metrics.root_mean_squared_error'])
 def root_mean_squared_error(labels,
                             predictions,
                             weights=None,
@@ -2774,8 +2762,9 @@ def root_mean_squared_error(labels,
                                           None, name or
                                           'root_mean_squared_error')
 
-  once_across_towers = lambda _, mse: math_ops.sqrt(mse)
-  rmse = _aggregate_across_towers(metrics_collections, once_across_towers, mse)
+  once_across_replicas = lambda _, mse: math_ops.sqrt(mse)
+  rmse = _aggregate_across_replicas(
+      metrics_collections, once_across_replicas, mse)
 
   update_rmse_op = math_ops.sqrt(update_mse_op)
   if updates_collections:
@@ -2784,7 +2773,7 @@ def root_mean_squared_error(labels,
   return rmse, update_rmse_op
 
 
-@tf_export('metrics.sensitivity_at_specificity')
+@tf_export(v1=['metrics.sensitivity_at_specificity'])
 def sensitivity_at_specificity(labels,
                                predictions,
                                specificity,
@@ -2870,12 +2859,12 @@ def sensitivity_at_specificity(labels,
       return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + kepsilon,
                           name)
 
-    def sensitivity_across_towers(_, values):
+    def sensitivity_across_replicas(_, values):
       return compute_sensitivity_at_specificity(
           values['tp'], values['tn'], values['fp'], values['fn'], 'value')
 
-    sensitivity = _aggregate_across_towers(
-        metrics_collections, sensitivity_across_towers, values)
+    sensitivity = _aggregate_across_replicas(
+        metrics_collections, sensitivity_across_replicas, values)
 
     update_op = compute_sensitivity_at_specificity(
         update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
@@ -3056,7 +3045,7 @@ def _sparse_average_precision_at_top_k(labels, predictions_idx):
 
     # Reduce along k dimension to get the sum, yielding a [D1, ... DN] tensor.
     precision_sum = math_ops.reduce_sum(
-        relevant_precision_per_k, reduction_indices=(-1,), name='precision_sum')
+        relevant_precision_per_k, axis=(-1,), name='precision_sum')
 
     # Divide by number of relevant items to get average precision. These are
     # the "num_relevant_items" and "AveP" terms from the formula above.
@@ -3144,11 +3133,11 @@ def _streaming_sparse_average_precision_at_top_k(labels,
       total_update = state_ops.assign_add(total_var, batch_total, name='update')
 
     # Divide total by max to get mean, for both vars and the update ops.
-    def precision_across_towers(_, total_var, max_var):
+    def precision_across_replicas(_, total_var, max_var):
       return _safe_scalar_div(total_var, max_var, name='mean')
 
-    mean_average_precision = _aggregate_across_towers(
-        metrics_collections, precision_across_towers, total_var, max_var)
+    mean_average_precision = _aggregate_across_replicas(
+        metrics_collections, precision_across_replicas, total_var, max_var)
 
     update = _safe_scalar_div(total_update, max_update, name=scope)
     if updates_collections:
@@ -3157,7 +3146,7 @@ def _streaming_sparse_average_precision_at_top_k(labels,
     return mean_average_precision, update
 
 
-@tf_export('metrics.sparse_average_precision_at_k')
+@tf_export(v1=['metrics.sparse_average_precision_at_k'])
 @deprecated(None, 'Use average_precision_at_k instead')
 def sparse_average_precision_at_k(labels,
                                   predictions,
@@ -3177,7 +3166,7 @@ def sparse_average_precision_at_k(labels,
       name=name)
 
 
-@tf_export('metrics.average_precision_at_k')
+@tf_export(v1=['metrics.average_precision_at_k'])
 def average_precision_at_k(labels,
                            predictions,
                            k,
@@ -3351,7 +3340,7 @@ def _streaming_sparse_false_positive_at_k(labels,
     return var, state_ops.assign_add(var, batch_total_fp, name='update')
 
 
-@tf_export('metrics.precision_at_top_k')
+@tf_export(v1=['metrics.precision_at_top_k'])
 def precision_at_top_k(labels,
                        predictions_idx,
                        k=None,
@@ -3427,11 +3416,11 @@ def precision_at_top_k(labels,
         class_id=class_id,
         weights=weights)
 
-    def precision_across_towers(_, tp, fp):
+    def precision_across_replicas(_, tp, fp):
       return math_ops.div(tp, math_ops.add(tp, fp), name=scope)
 
-    metric = _aggregate_across_towers(
-        metrics_collections, precision_across_towers, tp, fp)
+    metric = _aggregate_across_replicas(
+        metrics_collections, precision_across_replicas, tp, fp)
 
     update = math_ops.div(
         tp_update, math_ops.add(tp_update, fp_update), name='update')
@@ -3440,7 +3429,7 @@ def precision_at_top_k(labels,
     return metric, update
 
 
-@tf_export('metrics.sparse_precision_at_k')
+@tf_export(v1=['metrics.sparse_precision_at_k'])
 @deprecated(None, 'Use precision_at_k instead')
 def sparse_precision_at_k(labels,
                           predictions,
@@ -3462,7 +3451,7 @@ def sparse_precision_at_k(labels,
       name=name)
 
 
-@tf_export('metrics.precision_at_k')
+@tf_export(v1=['metrics.precision_at_k'])
 def precision_at_k(labels,
                    predictions,
                    k,
@@ -3556,7 +3545,7 @@ def precision_at_k(labels,
         name=scope)
 
 
-@tf_export('metrics.specificity_at_sensitivity')
+@tf_export(v1=['metrics.specificity_at_sensitivity'])
 def specificity_at_sensitivity(labels,
                                predictions,
                                sensitivity,
@@ -3662,12 +3651,12 @@ def specificity_at_sensitivity(labels,
       return math_ops.div(tn[tf_index], tn[tf_index] + fp[tf_index] + kepsilon,
                           name)
 
-    def specificity_across_towers(_, values):
+    def specificity_across_replicas(_, values):
       return compute_specificity_at_sensitivity(
           values['tp'], values['tn'], values['fp'], values['fn'], 'value')
 
-    specificity = _aggregate_across_towers(
-        metrics_collections, specificity_across_towers, values)
+    specificity = _aggregate_across_replicas(
+        metrics_collections, specificity_across_replicas, values)
 
     update_op = compute_specificity_at_sensitivity(
         update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops.py b/tensorflow/python/ops/nccl_ops.py
similarity index 89%
rename from tensorflow/contrib/nccl/python/ops/nccl_ops.py
rename to tensorflow/python/ops/nccl_ops.py
index fa597cf3efcf915311047f3a483772c45cc314fd..6259ce0f948427cace576dbc3e21a410f531f4e2 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops.py
+++ b/tensorflow/python/ops/nccl_ops.py
@@ -19,15 +19,11 @@ from __future__ import print_function
 
 import threading
 
-from tensorflow.contrib.nccl.ops import gen_nccl_ops
-from tensorflow.contrib.util import loader
-from tensorflow.python.eager import context
 from tensorflow.python.framework import device
 from tensorflow.python.framework import ops
-from tensorflow.python.platform import resource_loader
+from tensorflow.python.ops import gen_nccl_ops
 
 
-_nccl_ops_so = None
 _module_lock = threading.Lock()
 _shared_name_counter = 0
 
@@ -182,7 +178,6 @@ def broadcast(tensor):
     A tensor with the value of `src_tensor`, which can be used as input to
     ops on other GPU devices.
   """
-  _validate_and_load_nccl_so()
   _check_device(tensor)
 
   with ops.device(tensor.device):
@@ -214,7 +209,6 @@ def _apply_all_reduce(reduction, tensors):
   """Helper function for all_* functions."""
   if not tensors:
     raise ValueError('Must pass >0 tensors to all reduce operations')
-  _validate_and_load_nccl_so()
 
   shared_name = _get_shared_name()
   res = []
@@ -236,7 +230,6 @@ def _apply_reduce(reduction, tensors):
   """Helper function for reduce_* functions."""
   if not tensors:
     raise ValueError('Must pass >0 tensors to reduce operations')
-  _validate_and_load_nccl_so()
 
   for t in tensors:
     _check_device(t)
@@ -262,27 +255,3 @@ def _check_device(tensor, expected=None):
     raise ValueError('Device assignment required for nccl collective ops')
   if expected and expected != tensor.device:
     raise ValueError('Expected device %s, got %s' % (expected, tensor.device))
-
-
-def _maybe_load_nccl_ops_so():
-  """Loads nccl ops so if it hasn't been loaded already."""
-
-  with _module_lock:
-    global _nccl_ops_so
-    if not _nccl_ops_so:
-      _nccl_ops_so = loader.load_op_library(
-          resource_loader.get_path_to_datafile('_nccl_ops.so'))
-
-
-def _validate_and_load_nccl_so():
-  """Validates calling context and loads nccl ops so file.
-
-  Raises:
-    ValueError: Ops are not supported.
-    errors_impl.NotFoundError: nccl library is not installed.
-  """
-
-  if context.executing_eagerly():
-    raise ValueError('Nccl ops are not supported in eager mode')
-
-  _maybe_load_nccl_ops_so()
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/python/ops/nccl_ops_test.py
similarity index 86%
rename from tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
rename to tensorflow/python/ops/nccl_ops_test.py
index 423a8689aeee062fb58eaf9d6d9b980b0998754e..3b2e2b0175f109bf698cf52e695d452ae5eae3ec 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/python/ops/nccl_ops_test.py
@@ -21,11 +21,11 @@ from __future__ import print_function
 from functools import partial
 import numpy as np
 
-from tensorflow.contrib import nccl
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
+from tensorflow.python.ops import nccl_ops
 from tensorflow.python.platform import test
 
 
@@ -51,7 +51,7 @@ def _NcclBroadcast(tensors, devices):
   sender = np.random.randint(0, len(devices))
   with ops.device(devices[sender]):
     tensor = array_ops.identity(tensors[0])
-    broadcast = nccl.broadcast(tensor)
+    broadcast = nccl_ops.broadcast(tensor)
   return _DeviceTensors([broadcast] * len(devices), devices)
 
 
@@ -102,7 +102,7 @@ class NcclTestCase(test.TestCase):
             continue
 
           # Test execution and results.
-          for t in sess.run(result_tensors):
+          for t in self.evaluate(result_tensors):
             self.assertAllClose(t, np_ans)
 
   def _TestGradient(self, nccl_reduce, numpy_fn):
@@ -130,29 +130,30 @@ class NcclTestCase(test.TestCase):
 class AllReduceTest(NcclTestCase):
 
   def testAllReduce(self):
-    self._Test(partial(_NcclAllReduce, nccl.all_sum), lambda x, y: x + y)
-    self._Test(partial(_NcclAllReduce, nccl.all_prod), lambda x, y: x * y)
-    self._Test(partial(_NcclAllReduce, nccl.all_min), np.minimum)
-    self._Test(partial(_NcclAllReduce, nccl.all_max), np.maximum)
+    self._Test(partial(_NcclAllReduce, nccl_ops.all_sum), lambda x, y: x + y)
+    self._Test(partial(_NcclAllReduce, nccl_ops.all_prod), lambda x, y: x * y)
+    self._Test(partial(_NcclAllReduce, nccl_ops.all_min), np.minimum)
+    self._Test(partial(_NcclAllReduce, nccl_ops.all_max), np.maximum)
 
   def testAllSumGrad(self):
     self._TestGradient(
-        partial(_NcclAllReduce, nccl.all_sum), lambda x, y: x + y)
+        partial(_NcclAllReduce, nccl_ops.all_sum), lambda x, y: x + y)
 
   def testErrors(self):
     with self.assertRaisesRegexp(ValueError, 'Device assignment required'):
-      nccl.all_sum([array_ops.identity(np.random.random_sample((3, 4)))])
+      nccl_ops.all_sum([array_ops.identity(np.random.random_sample((3, 4)))])
     with self.assertRaisesRegexp(ValueError, 'Must pass >0 tensors'):
-      nccl.all_sum([])
+      nccl_ops.all_sum([])
 
 
 class SingleReduceTest(NcclTestCase):
 
   def testSum(self):
-    self._Test(partial(_NcclReduce, nccl.reduce_sum), lambda x, y: x + y)
+    self._Test(partial(_NcclReduce, nccl_ops.reduce_sum), lambda x, y: x + y)
 
   def testSumGrad(self):
-    self._TestGradient(partial(_NcclReduce, nccl.reduce_sum), lambda x, y: x)
+    self._TestGradient(partial(_NcclReduce, nccl_ops.reduce_sum),
+                       lambda x, y: x)
 
 
 class BroadcastTest(NcclTestCase):
@@ -183,8 +184,8 @@ class CombinedTest(NcclTestCase):
   """Test all-reduce vs. single-reduce plus broadcast in one session.run."""
 
   def _Combined(self, tensors, devices):
-    all_reduce_tensors = _NcclAllReduce(nccl.all_sum, tensors, devices)
-    single_reduce_tensors = _NcclReduce(nccl.reduce_sum, tensors, devices)
+    all_reduce_tensors = _NcclAllReduce(nccl_ops.all_sum, tensors, devices)
+    single_reduce_tensors = _NcclReduce(nccl_ops.reduce_sum, tensors, devices)
     broadcast_tensors = _NcclBroadcast(single_reduce_tensors, devices)
     return all_reduce_tensors + broadcast_tensors
 
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index a7467aa943c4650c956acd805f9e6e511196c093..e978f1d32601890f8eb9b54fdd5738f626b7f863 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -71,6 +71,7 @@ class BatchNormalizationTest(test.TestCase):
                                        gamma if scale_after_normalization else
                                        None, epsilon)
 
+  @test_util.run_deprecated_v1
   def testBatchNorm(self):
     x_shape = [3, 5, 4, 2]
     param_shape = [2]
@@ -80,7 +81,7 @@ class BatchNormalizationTest(test.TestCase):
     beta_val = np.random.random_sample(param_shape).astype(np.float32)
     gamma_val = np.random.random_sample(param_shape).astype(np.float32)
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         x = constant_op.constant(x_val, name="x")
         m = constant_op.constant(m_val, name="m")
         v = constant_op.constant(v_val, name="v")
@@ -169,16 +170,20 @@ class BatchNormalizationTest(test.TestCase):
                                       shift_after_normalization, v,
                                       err_tolerance)
 
+  @test_util.run_deprecated_v1
   def testBatchNormInputGradient(self):
     self._testBatchNormGradientInAllNeedConfigs(0, "x")
 
+  @test_util.run_deprecated_v1
   def testBatchNormMeanGradient(self):
     self._testBatchNormGradientInAllNeedConfigs(1, "mean")
 
+  @test_util.run_deprecated_v1
   def testBatchNormVarianceGradient(self):
     self._testBatchNormGradientInAllNeedConfigs(
         2, "variance", err_tolerance=1e-03)
 
+  @test_util.run_deprecated_v1
   def testBatchNormBetaGradient(self):
     # Since beta does not exist when scale_after_normalization=False, we only
     # test for scale_after_normalization=True.
@@ -187,6 +192,7 @@ class BatchNormalizationTest(test.TestCase):
         self._testBatchNormGradient(3, "beta", scale_after_normalization, True,
                                     v)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGammaGradient(self):
     # If scale_after_normalization is False, backprop for gamma in v1
     # will be 0. In version 2 of the API, if scale_after_normalization is False,
@@ -199,6 +205,7 @@ class BatchNormalizationTest(test.TestCase):
       self._testBatchNormGradient(4, "gamma", True, shift_after_normalization,
                                   2)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradImpl(self):
     x_shape = [7, 5, 4, 6]
     param_shape = [6]
@@ -210,7 +217,7 @@ class BatchNormalizationTest(test.TestCase):
     gamma_val = np.random.random_sample(param_shape).astype(np.float32)
     backprop_val = np.random.random_sample(x_shape).astype(np.float32)
     for use_gpu in [False, True]:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         x = constant_op.constant(x_val, name="x")
         m = constant_op.constant(m_val, name="m")
         v = constant_op.constant(v_val, name="v")
@@ -235,15 +242,17 @@ class BatchNormalizationTest(test.TestCase):
           odx, odm, odv, odb, odg = gradients_impl.gradients(
               [on], [x, m, v, beta, gamma], [backprop])
           if scale_after_normalization:
-            all_grads = sess.run([dx, dm, dv, db, dg, odx, odm, odv, odb, odg])
+            all_grads = self.evaluate(
+                [dx, dm, dv, db, dg, odx, odm, odv, odb, odg])
             to_check = ["dx", "dm", "dv", "db", "dg"]
           else:
-            all_grads = sess.run([dx, dm, dv, db, odx, odm, odv, odb])
+            all_grads = self.evaluate([dx, dm, dv, db, odx, odm, odv, odb])
             to_check = ["dx", "dm", "dv", "db"]
           for i, _ in enumerate(to_check):
             self.assertAllClose(
                 all_grads[i + len(to_check)], all_grads[i], atol=0.000001)
 
+  @test_util.run_deprecated_v1
   def testBatchNormKeepDims(self):
     """Test for tf.nn.moments(..., keep_dims=True / False).
 
@@ -259,7 +268,7 @@ class BatchNormalizationTest(test.TestCase):
     beta_val = np.random.random_sample(param_shape).astype(np.float32)
     gamma_val = np.random.random_sample(param_shape).astype(np.float32)
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         x = constant_op.constant(x_val, name="x")
         m = constant_op.constant(m_val, name="m")
         v = constant_op.constant(v_val, name="v")
@@ -302,7 +311,7 @@ class BatchNormalizationTest(test.TestCase):
     beta_val = np.random.random_sample(param_shape).astype(numpy_param_dtype)
     gamma_val = np.random.random_sample(param_shape).astype(numpy_param_dtype)
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         x = constant_op.constant(x_val, name="x")
         m = constant_op.constant(m_val, name="m")
         v = constant_op.constant(v_val, name="v")
@@ -318,7 +327,7 @@ class BatchNormalizationTest(test.TestCase):
                                               gamma_val, epsilon,
                                               scale_after_normalization,
                                               shift_after_normalization)
-            [tf_batch_norm] = sess.run([bn])
+            [tf_batch_norm] = self.evaluate([bn])
             self.assertEquals(x_shape, np_batch_norm.shape)
             self.assertEquals(x_shape, tf_batch_norm.shape)
             self.assertAllClose(np_batch_norm, tf_batch_norm, atol=atol)
@@ -365,15 +374,15 @@ class SufficientStatisticsTest(test.TestCase):
     x_val = np.random.random_sample(x_shape).astype(np.float32)
     np_c, np_m, np_v, np_s = self._npSuffStats(x_val, axes, shift, keep_dims)
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         if has_shape:
           x = constant_op.constant(x_val, name="x")
           x.set_shape(x_shape)
           op_c, op_m, op_v, op_s = self._opSuffStats(x, axes, shift, keep_dims)
           if shift:
-            tf_c, tf_m, tf_v, tf_s = sess.run([op_c, op_m, op_v, op_s])
+            tf_c, tf_m, tf_v, tf_s = self.evaluate([op_c, op_m, op_v, op_s])
           else:
-            tf_c, tf_m, tf_v = sess.run([op_c, op_m, op_v])
+            tf_c, tf_m, tf_v = self.evaluate([op_c, op_m, op_v])
         else:
           x = array_ops.placeholder(
               dtype=dtypes.float32, shape=[None] * len(x_shape), name="x")
@@ -390,6 +399,7 @@ class SufficientStatisticsTest(test.TestCase):
         if shift:
           self.assertAllClose(np_s, tf_s, atol=0.000001)
 
+  @test_util.run_deprecated_v1
   def testSuffStats(self):
     for has_shape in [True, False]:
       for keep_dims in [True, False]:
@@ -422,7 +432,7 @@ class NormalizeMomentsTest(test.TestCase):
       shift_v = None
     npm, npv = self._npNormalizeMoments(counts, mean_ss, variance_ss, shift_v)
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         tf_counts = constant_op.constant(counts, name="counts")
         tf_mean_ss = constant_op.constant(mean_ss, name="mean_ss")
         tf_variance_ss = constant_op.constant(variance_ss, name="variance_ss")
@@ -432,7 +442,7 @@ class NormalizeMomentsTest(test.TestCase):
           tf_shift_v = None
         opm, opv = self._opNormalizeMoments(tf_counts, tf_mean_ss,
                                             tf_variance_ss, tf_shift_v)
-        tfm, tfv = sess.run([opm, opv])
+        tfm, tfv = self.evaluate([opm, opv])
         self.assertAllClose(npm, tfm, atol=0.000001)
         self.assertAllClose(npv, tfv, atol=0.000001)
 
@@ -507,9 +517,10 @@ class MomentsTest(test.TestCase):
       expected_variance = expected_x_squared - expected_mean_squared
 
       # Check that the moments are correct.
-      self.assertAllCloseAccordingToType(expected_mean, mean.eval())
-      self.assertAllCloseAccordingToType(expected_variance, var.eval())
+      self.assertAllCloseAccordingToType(expected_mean, self.evaluate(mean))
+      self.assertAllCloseAccordingToType(expected_variance, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     for keep_dims in [False, True]:
       for dtype in [dtypes.float32, dtypes.float16]:
@@ -518,6 +529,7 @@ class MomentsTest(test.TestCase):
         self.RunMomentTestWithDynamicShape(
             shape=[2, 3, 5, 4], axes=[0], keep_dims=keep_dims, dtype=dtype)
 
+  @test_util.run_deprecated_v1
   def testGlobalNormalization(self):
     for keep_dims in [False, True]:
       for dtype in [dtypes.float32, dtypes.float16]:
@@ -532,6 +544,7 @@ class MomentsTest(test.TestCase):
             keep_dims=keep_dims,
             dtype=dtype)
 
+  @test_util.run_deprecated_v1
   def testAxes(self):
     for keep_dims in [False, True]:
       for dtype in [dtypes.float32, dtypes.float16]:
@@ -572,9 +585,11 @@ class MomentsTest(test.TestCase):
         print("Moments %s gradient err vs input %d = %g" % (from_y, i, err))
         self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testMeanGlobalGradient(self):
     self._testGlobalGradient(from_y="mean")
 
+  @test_util.run_deprecated_v1
   def testVarGlobalGradient(self):
     self._testGlobalGradient(from_y="var")
 
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index a08b836025d12178ab7acfbd70fcc7a47bc99532..4bc33ff8bdb845510a9872db26c8adfdf1f50995 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -50,7 +50,7 @@ class BatchNormalizationTest(test.TestCase):
     y = self._batch_norm(x, mean, var, offset, scale, epsilon)
     if data_format == 'NCHW':
       y = array_ops.transpose(y, [0, 3, 1, 2])
-    return y.eval()
+    return self.evaluate(y)
 
   def _test_inference(self,
                       x_shape,
@@ -66,7 +66,7 @@ class BatchNormalizationTest(test.TestCase):
     mean_val = np.random.random_sample(scale_shape).astype(scale_dtype)
     var_val = np.random.random_sample(scale_shape).astype(scale_dtype)
 
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       x = constant_op.constant(x_val, name='x')
       scale = constant_op.constant(scale_val, name='scale')
       offset = constant_op.constant(offset_val, name='offset')
@@ -82,7 +82,7 @@ class BatchNormalizationTest(test.TestCase):
           epsilon=epsilon,
           data_format=data_format,
           is_training=False)
-      y_val = sess.run(y)
+      y_val = self.evaluate(y)
       y_ref = self._inference_ref(x, scale, offset, mean, var, epsilon,
                                   data_format)
     # An atol value of 1e-3 is too small for float16's, because some adjacent
@@ -102,7 +102,7 @@ class BatchNormalizationTest(test.TestCase):
     y = self._batch_norm(x, mean, var, offset, scale, epsilon)
     if data_format == 'NCHW':
       y = array_ops.transpose(y, [0, 3, 1, 2])
-    return y.eval(), mean.eval(), var.eval()
+    return self.evaluate(y), self.evaluate(mean), self.evaluate(var)
 
   def _test_training(self,
                      x_shape,
@@ -115,7 +115,7 @@ class BatchNormalizationTest(test.TestCase):
     x_val = np.random.random_sample(x_shape).astype(x_dtype)
     scale_val = np.random.random_sample(scale_shape).astype(scale_dtype)
     offset_val = np.random.random_sample(scale_shape).astype(scale_dtype)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       x = constant_op.constant(x_val, name='x')
       scale = constant_op.constant(scale_val, name='scale')
       offset = constant_op.constant(offset_val, name='offset')
@@ -127,7 +127,7 @@ class BatchNormalizationTest(test.TestCase):
           epsilon=epsilon,
           data_format=data_format,
           is_training=True)
-      y_val, mean_val, var_val = sess.run([y, mean, var])
+      y_val, mean_val, var_val = self.evaluate([y, mean, var])
       y_ref, mean_ref, var_ref = self._training_ref(x, scale, offset, epsilon,
                                                     data_format)
     y_atol = 2e-3 if x_dtype == np.float16 else 1e-3
@@ -190,7 +190,7 @@ class BatchNormalizationTest(test.TestCase):
     scale_val = np.random.random_sample(scale_shape).astype(scale_dtype)
     offset_val = np.random.random_sample(scale_shape).astype(scale_dtype)
 
-    with self.test_session(use_gpu=use_gpu):
+    with self.cached_session(use_gpu=use_gpu):
       x = constant_op.constant(x_val, name='x')
       scale = constant_op.constant(scale_val, name='scale')
       offset = constant_op.constant(offset_val, name='offset')
@@ -252,7 +252,7 @@ class BatchNormalizationTest(test.TestCase):
     scale_val = np.random.random_sample(scale_shape).astype(scale_dtype)
     offset_val = np.random.random_sample(scale_shape).astype(scale_dtype)
 
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.cached_session(use_gpu=use_gpu) as sess:
       x = constant_op.constant(x_val, name='x')
       grad_y = constant_op.constant(grad_y_val, name='grad_y')
       scale = constant_op.constant(scale_val, name='scale')
@@ -277,10 +277,10 @@ class BatchNormalizationTest(test.TestCase):
       if is_training:
         epsilon = y.op.get_attr('epsilon')
         data_format = y.op.get_attr('data_format')
-        grad_vals = sess.run([grad_x, grad_scale, grad_offset])
+        grad_vals = self.evaluate([grad_x, grad_scale, grad_offset])
         grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean,
                                                pop_var, epsilon, data_format)
-        grad_internal_vals = sess.run(list(grad_internal))
+        grad_internal_vals = self.evaluate(list(grad_internal))
         for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals):
           self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance)
 
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index e1a01ab4c3250ea1488a9545b03befdae7524d71..34404edc9a1250710d4cd7a50e04ad8d187a5d7f 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -18,13 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 
@@ -389,6 +389,21 @@ def _Relu6GradGrad(op, grad):
           array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
 
 
+@ops.RegisterGradient("LeakyRelu")
+def _LeakyReluGrad(op, grad):
+  x = op.inputs[0]
+  alpha = op.get_attr("alpha")
+  return gen_nn_ops.leaky_relu_grad(grad, x, alpha=alpha)
+
+
+@ops.RegisterGradient("LeakyReluGrad")
+def _LeakyReluGradGrad(op, grad):
+  x = op.inputs[1]
+  alpha = op.get_attr("alpha")
+  return (gen_nn_ops.leaky_relu_grad(grad, x, alpha=alpha),
+          array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype))
+
+
 @ops.RegisterGradient("Elu")
 def _EluGrad(op, grad):
   return gen_nn_ops.elu_grad(grad, op.outputs[0])
@@ -933,10 +948,14 @@ def _FusedBatchNormGradGrad(op, *grad):
   grad_grad_x = grad[0]
   grad_grad_scale = grad[1]
   grad_grad_offset = grad[2]
-  grad_x, grad_scale, grad_offset = _BatchNormGrad(
-      grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
-  grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
-  grad_grad_y, grad_x, grad_scale = gradients_impl.gradients(
+  with backprop.GradientTape() as tape:
+    tape.watch(grad_y)
+    tape.watch(x)
+    tape.watch(scale)
+    grad_x, grad_scale, grad_offset = _BatchNormGrad(
+        grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
+    grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
+  grad_grad_y, grad_x, grad_scale = tape.gradient(
       [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
   return grad_grad_y, grad_x, grad_scale, None, None
 
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index 8065df4b1658dc1bac068bee1ae7c6052f82d4f1..95e05a977b856505f0b608442e85fda8468ead1f 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
@@ -31,6 +32,7 @@ from tensorflow.python.platform import test
 
 class Relu6OpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRelu6GradGrad(self):
     inputs = constant_op.constant(
         [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.float32)
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 453848fc00bde443ffda1265f1ee4f0e97640e92..48dcab4842864b7322610e4328c1771f95ee352d 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import candidate_sampling_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gen_array_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gen_nn_ops
@@ -261,7 +262,7 @@ def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
         name=name)
 
 
-@tf_export("nn.relu_layer")
+@tf_export(v1=["nn.relu_layer"])
 def relu_layer(x, weights, biases, name=None):
   """Computes Relu(x * weight + biases).
 
@@ -328,7 +329,7 @@ def swish(features):
   return features * math_ops.sigmoid(features)
 
 
-@tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize")
+@tf_export(v1=["math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize"])
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
   """Normalizes along dimension `axis` using an L2 norm.
@@ -349,17 +350,63 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
     name: A name for this operation (optional).
     dim: Deprecated alias for axis.
 
+  Returns:
+    A `Tensor` with the same shape as `x`.
+  """
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  return l2_normalize_v2(x, axis, epsilon, name)
+
+
+@tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize", v1=[])
+def l2_normalize_v2(x, axis=None, epsilon=1e-12, name=None):
+  """Normalizes along dimension `axis` using an L2 norm.
+
+  For a 1-D tensor with `axis = 0`, computes
+
+      output = x / sqrt(max(sum(x**2), epsilon))
+
+  For `x` with more dimensions, independently normalizes each 1-D slice along
+  dimension `axis`.
+
+  Args:
+    x: A `Tensor`.
+    axis: Dimension along which to normalize.  A scalar or a vector of
+      integers.
+    epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
+      divisor if `norm < sqrt(epsilon)`.
+    name: A name for this operation (optional).
+
   Returns:
     A `Tensor` with the same shape as `x`.
   """
   with ops.name_scope(name, "l2_normalize", [x]) as name:
-    axis = deprecated_argument_lookup("axis", axis, "dim", dim)
     x = ops.convert_to_tensor(x, name="x")
     square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
     x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
     return math_ops.multiply(x, x_inv_norm, name=name)
 
 
+def _count_nonzero(input_tensor, dtype=dtypes.int64):
+  """Same as math_ops.count_nonzero.
+
+  The reduction is done in dtype, which can be faster for 32-bit dtypes.
+
+  Args:
+      input_tensor: numeric tensor
+      dtype: reduction dtype
+
+  Returns:
+      number of nonzero values with type dtype
+  """
+  with ops.name_scope("count_nonzero", [input_tensor]):
+    zero = array_ops.zeros([], dtype=input_tensor.dtype)
+    nonzero_count = math_ops.reduce_sum(
+        math_ops.cast(
+            math_ops.not_equal(input_tensor, zero),
+            dtype=dtype), name="nonzero_count")
+    return nonzero_count
+
+
 @tf_export("math.zero_fraction", "nn.zero_fraction")
 def zero_fraction(value, name=None):
   """Returns the fraction of zeros in `value`.
@@ -382,13 +429,27 @@ def zero_fraction(value, name=None):
   """
   with ops.name_scope(name, "zero_fraction", [value]):
     value = ops.convert_to_tensor(value, name="value")
-    zero = constant_op.constant(0, dtype=value.dtype, name="zero")
-    return math_ops.reduce_mean(
-        math_ops.cast(math_ops.equal(value, zero), dtypes.float32))
+    size = array_ops.size(value, out_type=dtypes.int64)
+    # If the count is small, we can save memory/CPU with an int32 reduction.
+    num_nonzero = control_flow_ops.cond(
+        size <= dtypes.int32.max,
+        # pylint: disable=g-long-lambda
+        true_fn=lambda: math_ops.cast(
+            _count_nonzero(value, dtype=dtypes.int32),
+            dtype=dtypes.int64),
+        false_fn=lambda: _count_nonzero(value, dtype=dtypes.int64))
+
+    with ops.name_scope("counts_to_fraction"):
+      num_zero = size - num_nonzero
+      num_zero_float32 = math_ops.cast(num_zero, dtype=dtypes.float32)
+      size_float32 = math_ops.cast(size, dtype=dtypes.float32)
+      zero_fraction_float32 = num_zero_float32 / size_float32
+
+    return array_ops.identity(zero_fraction_float32, "fraction")
 
 
 # pylint: disable=redefined-builtin
-@tf_export("nn.depthwise_conv2d")
+@tf_export(v1=["nn.depthwise_conv2d"])
 def depthwise_conv2d(input,
                      filter,
                      strides,
@@ -461,11 +522,68 @@ def depthwise_conv2d(input,
         op=op)
 
 
+@tf_export("nn.depthwise_conv2d", v1=[])
+def depthwise_conv2d_v2(input,
+                        filter,
+                        strides,
+                        padding,
+                        data_format=None,
+                        dilations=None,
+                        name=None):
+  """Depthwise 2-D convolution.
+
+  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
+  and a filter tensor of shape
+  `[filter_height, filter_width, in_channels, channel_multiplier]`
+  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
+  applies a different filter to each input channel (expanding from 1 channel
+  to `channel_multiplier` channels for each), then concatenates the results
+  together.  The output has `in_channels * channel_multiplier` channels.
+
+  In detail,
+
+      output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
+           filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
+                                           strides[2] * j + rate[1] * dj, k]
+
+  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
+  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
+  If any value in `rate` is greater than 1, we perform atrous depthwise
+  convolution, in which case all values in the `strides` tensor must be equal
+  to 1.
+
+  Args:
+    input: 4-D with shape according to `data_format`.
+    filter: 4-D with shape
+      `[filter_height, filter_width, in_channels, channel_multiplier]`.
+    strides: 1-D of size 4.  The stride of the sliding window for each
+      dimension of `input`.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: 1-D of size 2. The dilation rate in which we sample input values
+      across the `height` and `width` dimensions in atrous convolution. If it is
+      greater than 1, then all values of strides must be 1.
+    name: A name for this operation (optional).
+
+  Returns:
+    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
+    "NHWC" format, shape is
+    `[batch, out_height, out_width, in_channels * channel_multiplier].`
+  """
+  return depthwise_conv2d(input=input,
+                          filter=filter,
+                          strides=strides,
+                          padding=padding,
+                          rate=dilations,
+                          name=name,
+                          data_format=data_format)
+
 # pylint: enable=redefined-builtin
 
 
 # pylint: disable=redefined-builtin,line-too-long
-@tf_export("nn.separable_conv2d")
+@tf_export(v1=["nn.separable_conv2d"])
 def separable_conv2d(input,
                      depthwise_filter,
                      pointwise_filter,
@@ -528,8 +646,8 @@ def separable_conv2d(input,
         pointwise_filter, name="pointwise_filter")
 
     pointwise_filter_shape = pointwise_filter.get_shape().with_rank(4)
-    pointwise_filter_shape[0].assert_is_compatible_with(1)
-    pointwise_filter_shape[1].assert_is_compatible_with(1)
+    pointwise_filter_shape.dims[0].assert_is_compatible_with(1)
+    pointwise_filter_shape.dims[1].assert_is_compatible_with(1)
 
     if rate is None:
       rate = [1, 1]
@@ -563,10 +681,76 @@ def separable_conv2d(input,
         name=name)
 
 
+@tf_export("nn.separable_conv2d", v1=[])
+def separable_conv2d_v2(
+    input,
+    depthwise_filter,
+    pointwise_filter,
+    strides,
+    padding,
+    data_format=None,
+    dilations=None,
+    name=None,
+):
+  """2-D convolution with separable filters.
+
+  Performs a depthwise convolution that acts separately on channels followed by
+  a pointwise convolution that mixes channels.  Note that this is separability
+  between dimensions `[1, 2]` and `3`, not spatial separability between
+  dimensions `1` and `2`.
+
+  In detail,
+
+      output[b, i, j, k] = sum_{di, dj, q, r}
+          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+          depthwise_filter[di, dj, q, r] *
+          pointwise_filter[0, 0, q * channel_multiplier + r, k]
+
+  `strides` controls the strides for the depthwise convolution only, since
+  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
+  `strides[0] = strides[3] = 1`.  For the most common case of the same
+  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
+  If any value in `rate` is greater than 1, we perform atrous depthwise
+  convolution, in which case all values in the `strides` tensor must be equal
+  to 1.
+
+  Args:
+    input: 4-D `Tensor` with shape according to `data_format`.
+    depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width,
+      in_channels, channel_multiplier]`. Contains `in_channels` convolutional
+      filters of depth 1.
+    pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier *
+      in_channels, out_channels]`.  Pointwise filter to mix channels after
+      `depthwise_filter` has convolved spatially.
+    strides: 1-D of size 4.  The strides for the depthwise convolution for each
+      dimension of `input`.
+    padding: A string, either `'VALID'` or `'SAME'`.  The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: 1-D of size 2. The dilation rate in which we sample input values
+      across the `height` and `width` dimensions in atrous convolution. If it is
+      greater than 1, then all values of strides must be 1.
+    name: A name for this operation (optional).
+
+  Returns:
+    A 4-D `Tensor` with shape according to 'data_format'. For
+      example, with data_format="NHWC", shape is [batch, out_height,
+      out_width, out_channels].
+  """
+  return separable_conv2d(
+      input,
+      depthwise_filter,
+      pointwise_filter,
+      strides,
+      padding,
+      rate=dilations,
+      name=name,
+      data_format=data_format)
+
 # pylint: enable=redefined-builtin,line-too-long
 
 
-@tf_export("nn.sufficient_statistics")
+@tf_export(v1=["nn.sufficient_statistics"])
 def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
   """Calculate the sufficient statistics for the mean and variance of `x`.
 
@@ -595,10 +779,10 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
   with ops.name_scope(name, "sufficient_statistics", [x, shift]):
     x = ops.convert_to_tensor(x, name="x")
     x_shape = x.get_shape()
-    if all(x_shape[d].value is not None for d in axes):
+    if all(x_shape.dims[d].value is not None for d in axes):
       counts = 1
       for d in axes:
-        counts *= x_shape[d].value
+        counts *= x_shape.dims[d].value
       counts = constant_op.constant(counts, dtype=x.dtype)
     else:  # shape needs to be inferred at runtime.
       x_dims = array_ops.gather(
@@ -616,6 +800,35 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
   return counts, m_ss, v_ss, shift
 
 
+@tf_export("nn.sufficient_statistics", v1=[])
+def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None):
+  """Calculate the sufficient statistics for the mean and variance of `x`.
+
+  These sufficient statistics are computed using the one pass algorithm on
+  an input that's optionally shifted. See:
+  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data
+
+  Args:
+    x: A `Tensor`.
+    axes: Array of ints. Axes along which to compute mean and variance.
+    shift: A `Tensor` containing the value by which to shift the data for
+      numerical stability, or `None` if no shift is to be performed. A shift
+      close to the true mean provides the most numerically stable results.
+    keepdims: produce statistics with the same dimensionality as the input.
+    name: Name used to scope the operations that compute the sufficient stats.
+
+  Returns:
+    Four `Tensor` objects of the same type as `x`:
+
+    * the count (number of elements to average over).
+    * the (possibly shifted) sum of the elements in the array.
+    * the (possibly shifted) sum of squares of the elements in the array.
+    * the shift by which the mean must be corrected or None if `shift` is None.
+  """
+  return sufficient_statistics(
+      x=x, axes=axes, shift=shift, keep_dims=keepdims, name=name)
+
+
 @tf_export("nn.normalize_moments")
 def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   """Calculate the mean and variance of based on the sufficient statistics.
@@ -648,7 +861,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   return (mean, variance)
 
 
-@tf_export("nn.moments")
+@tf_export(v1=["nn.moments"])
 def moments(
     x,
     axes,
@@ -707,7 +920,43 @@ def moments(
       return (mean, variance)
 
 
-@tf_export("nn.weighted_moments")
+@tf_export("nn.moments", v1=[])
+def moments_v2(
+    x,
+    axes,
+    shift=None,
+    keepdims=False,
+    name=None):
+  """Calculates the mean and variance of `x`.
+
+  The mean and variance are calculated by aggregating the contents of `x`
+  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
+  and variance of a vector.
+
+  Note: shift is currently not used; the true mean is computed and used.
+
+  When using these moments for batch normalization (see
+  `tf.nn.batch_normalization`):
+
+   * for so-called "global normalization", used with convolutional filters with
+     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
+   * for simple batch normalization pass `axes=[0]` (batch only).
+
+  Args:
+    x: A `Tensor`.
+    axes: Array of ints.  Axes along which to compute mean and
+      variance.
+    shift: Not used in the current implementation.
+    keepdims: produce moments with the same dimensionality as the input.
+    name: Name used to scope the operations that compute the moments.
+
+  Returns:
+    Two `Tensor` objects: `mean` and `variance`.
+  """
+  return moments(x=x, axes=axes, shift=shift, name=name, keep_dims=keepdims)
+
+
+@tf_export(v1=["nn.weighted_moments"])
 def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
   """Returns the frequency-weighted mean and variance of `x`.
 
@@ -779,6 +1028,30 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     return weighted_mean, weighted_variance
 
 
+@tf_export("nn.weighted_moments", v1=[])
+def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None):
+  """Returns the frequency-weighted mean and variance of `x`.
+
+  Args:
+    x: A tensor.
+    axes: 1-d tensor of int32 values; these are the axes along which
+      to compute mean and variance.
+    frequency_weights: A tensor of positive weights which can be
+      broadcast with x.
+    keepdims: Produce moments with the same dimensionality as the input.
+    name: Name used to scope the operation.
+
+  Returns:
+    Two tensors: `weighted_mean` and `weighted_variance`.
+  """
+  return weighted_moments(
+      x=x,
+      axes=axes,
+      frequency_weights=frequency_weights,
+      name=name,
+      keep_dims=keepdims)
+
+
 @tf_export("nn.batch_normalization")
 def batch_normalization(x,
                         mean,
@@ -839,7 +1112,7 @@ def batch_normalization(x,
         offset - mean * inv if offset is not None else -mean * inv, x.dtype)
 
 
-@tf_export("nn.fused_batch_norm")
+@tf_export(v1=["nn.fused_batch_norm"])
 def fused_batch_norm(
     x,
     scale,
@@ -910,7 +1183,7 @@ def fused_batch_norm(
   return y, batch_mean, batch_var
 
 
-@tf_export("nn.batch_norm_with_global_normalization")
+@tf_export(v1=["nn.batch_norm_with_global_normalization"])
 def batch_norm_with_global_normalization(t,
                                          m,
                                          v,
@@ -948,6 +1221,53 @@ def batch_norm_with_global_normalization(t,
                              else None, variance_epsilon, name)
 
 
+# pylint: disable=redefined-builtin,line-too-long
+@tf_export("nn.batch_norm_with_global_normalization", v1=[])
+def batch_norm_with_global_normalization_v2(input,
+                                            mean,
+                                            variance,
+                                            beta,
+                                            gamma,
+                                            variance_epsilon,
+                                            scale_after_normalization,
+                                            name=None):
+  """Batch normalization.
+
+  This op is deprecated. See `tf.nn.batch_normalization`.
+
+  Args:
+    input: A 4D input Tensor.
+    mean: A 1D mean Tensor with size matching the last dimension of t.
+      This is the first output from tf.nn.moments,
+      or a saved moving average thereof.
+    variance: A 1D variance Tensor with size matching the last dimension of t.
+      This is the second output from tf.nn.moments,
+      or a saved moving average thereof.
+    beta: A 1D beta Tensor with size matching the last dimension of t.
+      An offset to be added to the normalized tensor.
+    gamma: A 1D gamma Tensor with size matching the last dimension of t.
+      If "scale_after_normalization" is true, this tensor will be multiplied
+      with the normalized tensor.
+    variance_epsilon: A small float number to avoid dividing by 0.
+    scale_after_normalization: A bool indicating whether the resulted tensor
+      needs to be multiplied with gamma.
+    name: A name for this operation (optional).
+
+  Returns:
+     A batch-normalized `t`.
+  """
+  return batch_norm_with_global_normalization(t=input,
+                                              m=mean,
+                                              v=variance,
+                                              beta=beta,
+                                              gamma=gamma,
+                                              variance_epsilon=variance_epsilon,
+                                              scale_after_normalization=scale_after_normalization,
+                                              name=name)
+
+# pylint: enable=redefined-builtin,line-too-long
+
+
 def _sum_rows(x):
   """Returns a vector summing up each row of the matrix x."""
   # _sum_rows(x) is equivalent to math_ops.reduce_sum(x, 1) when x is
@@ -1142,7 +1462,111 @@ def _compute_sampled_logits(weights,
     return out_logits, out_labels
 
 
-@tf_export("nn.nce_loss")
+@tf_export("nn.nce_loss", v1=[])
+def nce_loss_v2(weights,
+                biases,
+                labels,
+                inputs,
+                num_sampled,
+                num_classes,
+                num_true=1,
+                sampled_values=None,
+                remove_accidental_hits=False,
+                name="nce_loss"):
+  """Computes and returns the noise-contrastive estimation training loss.
+
+  See [Noise-contrastive estimation: A new estimation principle for
+  unnormalized statistical
+  models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+  Also see our [Candidate Sampling Algorithms
+  Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+  A common use case is to use this method for training, and calculate the full
+  sigmoid loss for evaluation or inference as in the following example:
+
+  ```python
+  if mode == "train":
+    loss = tf.nn.nce_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...)
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    labels_one_hot = tf.one_hot(labels, n_classes)
+    loss = tf.nn.sigmoid_cross_entropy_with_logits(
+        labels=labels_one_hot,
+        logits=logits)
+    loss = tf.reduce_sum(loss, axis=1)
+  ```
+
+  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
+  strategy will be used. Support for other partition strategy will be added
+  later.
+
+  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
+  so your labels must be sorted in order of decreasing frequency to achieve
+  good results.  For more details, see
+  `tf.nn.log_uniform_candidate_sampler`.
+
+  Note: In the case where `num_true` > 1, we assign to each target class
+  the target probability 1 / `num_true` so that the target probabilities
+  sum to 1 per-example.
+
+  Note: It would be useful to allow a variable number of target classes per
+  example.  We hope to provide this functionality in a future release.
+  For now, if you have a variable number of target classes, you can pad them
+  out to a constant number by either repeating them or by padding
+  with an otherwise unused class.
+
+  Args:
+    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+      objects whose concatenation along dimension 0 has shape [num_classes,
+      dim].  The (possibly-partitioned) class embeddings.
+    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
+      target classes.
+    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
+      the input network.
+    num_sampled: An `int`.  The number of negative classes to randomly sample
+      per batch. This single sample of negative classes is evaluated for each
+      element in the batch.
+    num_classes: An `int`. The number of possible classes.
+    num_true: An `int`.  The number of target classes per training example.
+    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+      (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
+      where a sampled class equals one of the target classes.  If set to `True`,
+      this is a "Sampled Logistic" loss instead of NCE, and we are learning to
+      generate log-odds instead of log probabilities.  See our [Candidate
+      Sampling Algorithms Reference]
+        (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is
+          False.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example NCE losses.
+  """
+  # TODO(yuefengz): get partition_strategy from either variables or distribution
+  # strategies.
+  return nce_loss(
+      weights,
+      biases,
+      labels,
+      inputs,
+      num_sampled,
+      num_classes,
+      num_true=num_true,
+      sampled_values=sampled_values,
+      remove_accidental_hits=remove_accidental_hits,
+      partition_strategy="div",
+      name=name)
+
+
+@tf_export(v1=["nn.nce_loss"])
 def nce_loss(weights,
              biases,
              labels,
@@ -1253,7 +1677,98 @@ def nce_loss(weights,
   return _sum_rows(sampled_losses)
 
 
-@tf_export("nn.sampled_softmax_loss")
+@tf_export("nn.sampled_softmax_loss", v1=[])
+def sampled_softmax_loss_v2(weights,
+                            biases,
+                            labels,
+                            inputs,
+                            num_sampled,
+                            num_classes,
+                            num_true=1,
+                            sampled_values=None,
+                            remove_accidental_hits=True,
+                            seed=None,
+                            name="sampled_softmax_loss"):
+  """Computes and returns the sampled softmax training loss.
+
+  This is a faster way to train a softmax classifier over a huge number of
+  classes.
+
+  This operation is for training only.  It is generally an underestimate of
+  the full softmax loss.
+
+  A common use case is to use this method for training, and calculate the full
+  sigmoid loss for evaluation or inference as in the following example:
+
+  ```python
+  if mode == "train":
+    loss = tf.nn.sampled_softmax_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...)
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    labels_one_hot = tf.one_hot(labels, n_classes)
+    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
+        labels=labels_one_hot,
+        logits=logits)
+  ```
+
+  See our [Candidate Sampling Algorithms Reference]
+  (https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
+  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
+
+  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
+  strategy will be used. Support for other partition strategy will be added
+  later.
+
+  Args:
+    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+      objects whose concatenation along dimension 0 has shape [num_classes,
+      dim].  The (possibly-sharded) class embeddings.
+    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
+      target classes.  Note that this format differs from the `labels` argument
+      of `nn.softmax_cross_entropy_with_logits_v2`.
+    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
+      the input network.
+    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_classes: An `int`. The number of possible classes.
+    num_true: An `int`.  The number of target classes per training example.
+    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+      (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
+      where a sampled class equals one of the target classes.  Default is True.
+    seed: random seed for candidate sampling. Default to None, which doesn't set
+      the op-level random seed for candidate sampling.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example sampled softmax losses.
+
+  """
+  return sampled_softmax_loss(
+      weights,
+      biases,
+      labels,
+      inputs,
+      num_sampled,
+      num_classes,
+      num_true=num_true,
+      sampled_values=sampled_values,
+      remove_accidental_hits=remove_accidental_hits,
+      partition_strategy="div",
+      name=name,
+      seed=seed)
+
+
+@tf_export(v1=["nn.sampled_softmax_loss"])
 def sampled_softmax_loss(weights,
                          biases,
                          labels,
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 1fbe31a098db59c9bc2e9dfe4ced9685ac8abd03..611bfdac9a1b10a808cafeed585ac6e3427d18e9 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -22,10 +22,13 @@ import numbers
 
 import numpy as np
 
+from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -33,13 +36,14 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_nn_ops import *
 # pylint: enable=wildcard-import
-
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
+
 from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
@@ -204,6 +208,73 @@ class _NonAtrousConvolution(object):
         name=self.name)
 
 
+@tf_export("nn.dilation2d", v1=[])
+def dilation2d_v2(
+    input,   # pylint: disable=redefined-builtin
+    filters,  # pylint: disable=redefined-builtin
+    strides,
+    padding,
+    data_format,
+    dilations,
+    name=None):
+  """Computes the grayscale dilation of 4-D `input` and 3-D `filters` tensors.
+
+  The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+  `filters` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+  input channel is processed independently of the others with its own
+  structuring function. The `output` tensor has shape
+  `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+  tensor depend on the `padding` algorithm. We currently only support the
+  default "NHWC" `data_format`.
+
+  In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+  (for consistency with `conv2d`, we use unmirrored filters):
+
+      output[b, y, x, c] =
+         max_{dy, dx} input[b,
+                            strides[1] * y + rates[1] * dy,
+                            strides[2] * x + rates[2] * dx,
+                            c] +
+                      filters[dy, dx, c]
+
+  Max-pooling is a special case when the filter has size equal to the pooling
+  kernel size and contains all zeros.
+
+  Note on duality: The dilation of `input` by the `filters` is equal to the
+  negation of the erosion of `-input` by the reflected `filters`.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+      `int32`, `uint8`, `int16`, `int8`, `int64`, `bfloat16`, `uint16`, `half`,
+      `uint32`, `uint64`.
+      4-D with shape `[batch, in_height, in_width, depth]`.
+    filters: A `Tensor`. Must have the same type as `input`.
+      3-D with shape `[filter_height, filter_width, depth]`.
+    strides: A list of `ints` that has length `>= 4`.
+      The stride of the sliding window for each dimension of the input
+      tensor. Must be: `[1, stride_height, stride_width, 1]`.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: A `string`, only `"NCHW"` is currently supported.
+    dilations: A list of `ints` that has length `>= 4`.
+      The input stride for atrous morphological dilation. Must be:
+      `[1, rate_height, rate_width, 1]`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  if data_format != "NCHW":
+    raise ValueError("Data formats other than NCHW are not yet supported")
+
+  return gen_nn_ops.dilation2d(input=input,
+                               filter=filters,
+                               strides=strides,
+                               rates=dilations,
+                               padding=padding,
+                               name=name)
+
+
 @tf_export("nn.with_space_to_batch")
 def with_space_to_batch(
     input,  # pylint: disable=redefined-builtin
@@ -401,7 +472,7 @@ class _WithSpaceToBatch(object):
     if not dilation_rate.get_shape().is_fully_defined():
       raise ValueError("rate must have known shape")
 
-    num_spatial_dims = rate_shape[0].value
+    num_spatial_dims = rate_shape.dims[0].value
 
     if data_format is not None and data_format.startswith("NC"):
       starting_spatial_dim = 2
@@ -509,7 +580,7 @@ class _WithSpaceToBatch(object):
 
     # Recover channel information for output shape if channels are not last.
     if self.data_format is not None and self.data_format.startswith("NC"):
-      if not result_converted.shape[1].value and filter is not None:
+      if not result_converted.shape.dims[1].value and filter is not None:
         output_shape = result_converted.shape.as_list()
         output_shape[1] = filter.shape[-1]
         result_converted.set_shape(output_shape)
@@ -642,7 +713,7 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
   return strides, dilation_rate
 
 
-@tf_export("nn.convolution")
+@tf_export(v1=["nn.convolution"])
 def convolution(
     input,  # pylint: disable=redefined-builtin
     filter,  # pylint: disable=redefined-builtin
@@ -710,12 +781,12 @@ def convolution(
   It is required that 1 <= N <= 3.
 
   Args:
-    input: An N-D `Tensor` of type `T`, of shape
+    input: An (N+2)-D `Tensor` of type `T`, of shape
       `[batch_size] + input_spatial_shape + [in_channels]` if data_format does
       not start with "NC" (default), or
       `[batch_size, in_channels] + input_spatial_shape` if data_format starts
       with "NC".
-    filter: An N-D `Tensor` with the same type as `input` and shape
+    filter: An (N+2)-D `Tensor` with the same type as `input` and shape
       `spatial_filter_shape + [in_channels, out_channels]`.
     padding: A string, either `"VALID"` or `"SAME"`. The padding algorithm.
     strides: Optional.  Sequence of N ints >= 1.  Specifies the output stride.
@@ -780,6 +851,30 @@ def convolution(
     return op(input, filter)
 
 
+@tf_export("nn.convolution", v1=[])
+def convolution_v2(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    strides=None,
+    padding="VALID",
+    data_format=None,
+    dilations=None,
+    name=None):
+  return convolution(
+      input,  # pylint: disable=redefined-builtin
+      filters,
+      padding=padding,
+      strides=strides,
+      dilation_rate=dilations,
+      name=name,
+      data_format=data_format)
+
+convolution_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        convolution.__doc__, "dilation_rate", "dilations"),
+    "filter", "filters")
+
+
 class Convolution(object):
   """Helper class for convolution.
 
@@ -827,10 +922,11 @@ class Convolution(object):
           "filter tensor must have rank %d" % (num_spatial_dims + 2))
 
     if data_format is None or not data_format.startswith("NC"):
-      input_channels_dim = input_shape[num_spatial_dims + 1]
+      input_channels_dim = tensor_shape.dimension_at_index(
+          input_shape, num_spatial_dims + 1)
       spatial_dims = range(1, num_spatial_dims + 1)
     else:
-      input_channels_dim = input_shape[1]
+      input_channels_dim = tensor_shape.dimension_at_index(input_shape, 1)
       spatial_dims = range(2, num_spatial_dims + 2)
 
     if not input_channels_dim.is_compatible_with(
@@ -870,7 +966,7 @@ class Convolution(object):
     return self.conv_op(inp, filter)
 
 
-@tf_export("nn.pool")
+@tf_export(v1=["nn.pool"])
 def pool(
     input,  # pylint: disable=redefined-builtin
     window_shape,
@@ -1041,6 +1137,105 @@ def pool(
         filter_shape=window_shape)
 
 
+@tf_export("nn.pool", v1=[])
+def pool_v2(
+    input,  # pylint: disable=redefined-builtin
+    window_shape,
+    pooling_type,
+    strides=None,
+    padding="VALID",
+    data_format=None,
+    dilations=None,
+    name=None):
+  # pylint: disable=line-too-long
+  """Performs an N-D pooling operation.
+
+  In the case that `data_format` does not start with "NC", computes for
+      0 <= b < batch_size,
+      0 <= x[i] < output_spatial_shape[i],
+      0 <= c < num_channels:
+
+  ```
+    output[b, x[0], ..., x[N-1], c] =
+      REDUCE_{z[0], ..., z[N-1]}
+        input[b,
+              x[0] * strides[0] - pad_before[0] + dilation_rate[0]*z[0],
+              ...
+              x[N-1]*strides[N-1] - pad_before[N-1] + dilation_rate[N-1]*z[N-1],
+              c],
+  ```
+
+  where the reduction function REDUCE depends on the value of `pooling_type`,
+  and pad_before is defined based on the value of `padding` as described in
+  the "returns" section of `tf.nn.convolution` for details.
+  The reduction never includes out-of-bounds positions.
+
+  In the case that `data_format` starts with `"NC"`, the `input` and output are
+  simply transposed as follows:
+
+  ```
+    pool(input, data_format, **kwargs) =
+      tf.transpose(pool(tf.transpose(input, [0] + range(2,N+2) + [1]),
+                        **kwargs),
+                   [0, N+1] + range(1, N+1))
+  ```
+
+  Args:
+    input: Tensor of rank N+2, of shape `[batch_size] + input_spatial_shape +
+      [num_channels]` if data_format does not start with "NC" (default), or
+      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
+      with "NC".  Pooling happens over the spatial dimensions only.
+    window_shape: Sequence of N ints >= 1.
+    pooling_type: Specifies pooling operation, must be "AVG" or "MAX".
+    strides: Optional. Sequence of N ints >= 1.  Defaults to [1]*N. If any value of
+      strides is > 1, then all values of dilation_rate must be 1.
+    padding: The padding algorithm, must be "SAME" or "VALID". Defaults to "SAME".
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string or None.  Specifies whether the channel dimension of
+      the `input` and output is the last dimension (default, or if `data_format`
+      does not start with "NC"), or the second dimension (if `data_format`
+      starts with "NC").  For N=1, the valid values are "NWC" (default) and
+      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW". For
+      N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    dilations: Optional.  Dilation rate.  List of N ints >= 1. Defaults to
+      [1]*N.  If any value of dilation_rate is > 1, then all values of strides
+      must be 1.
+    name: Optional. Name of the op.
+
+  Returns:
+    Tensor of rank N+2, of shape
+      [batch_size] + output_spatial_shape + [num_channels]
+
+    if data_format is None or does not start with "NC", or
+
+      [batch_size, num_channels] + output_spatial_shape
+
+    if data_format starts with "NC",
+    where `output_spatial_shape` depends on the value of padding:
+
+    If padding = "SAME":
+      output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides[i])
+
+    If padding = "VALID":
+      output_spatial_shape[i] =
+        ceil((input_spatial_shape[i] - (window_shape[i] - 1) * dilation_rate[i])
+             / strides[i]).
+
+  Raises:
+    ValueError: if arguments are invalid.
+
+  """
+  return pool(
+      input=input,
+      window_shape=window_shape,
+      pooling_type=pooling_type,
+      padding=padding,
+      dilation_rate=dilations,
+      strides=strides,
+      name=name,
+      data_format=data_format)
+
+
 @tf_export("nn.atrous_conv2d")
 def atrous_conv2d(value, filters, rate, padding, name=None):
   """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
@@ -1178,7 +1373,208 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
       name=name)
 
 
-@tf_export("nn.conv2d_transpose")
+@tf_export("nn.conv2d", v1=[])
+def conv2d_v2(input,  # pylint: disable=redefined-builtin
+              filters,
+              strides,
+              padding,
+              data_format="NHWC",
+              dilations=None,
+              name=None):
+  # pylint: disable=line-too-long
+  r"""Computes a 2-D convolution given 4-D `input` and `filters` tensors.
+
+  Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+  and a filter / kernel tensor of shape
+  `[filter_height, filter_width, in_channels, out_channels]`, this op
+  performs the following:
+
+  1. Flattens the filter to a 2-D matrix with shape
+     `[filter_height * filter_width * in_channels, output_channels]`.
+  2. Extracts image patches from the input tensor to form a *virtual*
+     tensor of shape `[batch, out_height, out_width,
+     filter_height * filter_width * in_channels]`.
+  3. For each patch, right-multiplies the filter matrix and the image patch
+     vector.
+
+  In detail, with the default NHWC format,
+
+      output[b, i, j, k] =
+          sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+                          filter[di, dj, q, k]
+
+  Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+  horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      A 4-D tensor. The dimension order is interpreted according to the value
+      of `data_format`, see below for details.
+    filters: A `Tensor`. Must have the same type as `input`.
+      A 4-D tensor of shape
+      `[filter_height, filter_width, in_channels, out_channels]`
+    strides: A list of `ints`.
+      1-D tensor of length 4.  The stride of the sliding window for each
+      dimension of `input`. The dimension order is determined by the value of
+      `data_format`, see below for details.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, height, width, channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, channels, height, width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by the
+      value of `data_format`, see above for details. Dilations in the batch and
+      depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  # pylint: enable=line-too-long
+  if dilations is None:
+    dilations = [1, 1, 1, 1]
+  return gen_nn_ops.conv2d(input,  # pylint: disable=redefined-builtin
+                           filters,
+                           strides,
+                           padding,
+                           use_cudnn_on_gpu=True,
+                           data_format=data_format,
+                           dilations=dilations,
+                           name=name)
+tf_export(v1=["nn.conv2d"])(gen_nn_ops.conv2d)
+
+
+@tf_export("nn.conv2d_backprop_filter", v1=[])
+def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
+                              filter_sizes,
+                              out_backprop,
+                              strides,
+                              padding,
+                              data_format="NHWC",
+                              dilations=None,
+                              name=None):
+  r"""Computes the gradients of convolution with respect to the filter.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape `[batch, in_height, in_width, in_channels]`.
+    filter_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the tensor shape of `filter`,
+      where `filter` is a 4-D
+      `[filter_height, filter_width, in_channels, out_channels]` tensor.
+    out_backprop: A `Tensor`. Must have the same type as `input`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  if dilations is None:
+    dilations = [1, 1, 1, 1]
+  return gen_nn_ops.conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
+                                           filter_sizes,
+                                           out_backprop,
+                                           strides,
+                                           padding,
+                                           use_cudnn_on_gpu=True,
+                                           data_format=data_format,
+                                           dilations=dilations,
+                                           name=name)
+tf_export(v1=["nn.conv2d_backprop_filter"])(
+    gen_nn_ops.conv2d_backprop_filter)
+
+
+@tf_export("nn.conv2d_backprop_input", v1=[])
+def conv2d_backprop_input_v2(input_sizes,
+                             filters,
+                             out_backprop,
+                             strides,
+                             padding,
+                             data_format="NHWC",
+                             dilations=None,
+                             name=None):
+  r"""Computes the gradients of convolution with respect to the input.
+
+  Args:
+    input_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the shape of `input`,
+      where `input` is a 4-D `[batch, height, width, channels]` tensor.
+    filters: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape
+      `[filter_height, filter_width, in_channels, out_channels]`.
+    out_backprop: A `Tensor`. Must have the same type as `filters`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `filters`.
+  """
+  if dilations is None:
+    dilations = [1, 1, 1, 1]
+  return gen_nn_ops.conv2d_backprop_input(input_sizes,
+                                          filters,
+                                          out_backprop,
+                                          strides,
+                                          padding,
+                                          use_cudnn_on_gpu=True,
+                                          data_format=data_format,
+                                          dilations=dilations,
+                                          name=name)
+tf_export(v1=["nn.conv2d_backprop_input"])(
+    gen_nn_ops.conv2d_backprop_input)
+
+
+@tf_export(v1=["nn.conv2d_transpose"])
 def conv2d_transpose(
     value,
     filter,  # pylint: disable=redefined-builtin
@@ -1224,7 +1620,8 @@ def conv2d_transpose(
     value = ops.convert_to_tensor(value, name="value")
     filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     axis = 3 if data_format == "NHWC" else 1
-    if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[3]):
+    if not value.get_shape().dims[axis].is_compatible_with(
+        filter.get_shape()[3]):
       raise ValueError("input channels does not match filter's input channels, "
                        "{} != {}".format(value.get_shape()[axis],
                                          filter.get_shape()[3]))
@@ -1236,7 +1633,8 @@ def conv2d_transpose(
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
-      if not filter.get_shape()[2].is_compatible_with(output_shape[axis]):
+      if not filter.get_shape().dims[2].is_compatible_with(
+          output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
             "{} != {}".format(output_shape[axis],
@@ -1256,6 +1654,31 @@ def conv2d_transpose(
         name=name)
 
 
+# pylint: disable=redefined-builtin
+@tf_export("nn.conv2d_transpose", v1=[])
+def conv2d_transpose_v2(
+    input,
+    filters,  # pylint: disable=redefined-builtin
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NHWC",
+    name=None):
+  return conv2d_transpose(
+      input,
+      filters,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+# pylint: enable=redefined-builtin
+conv2d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        conv2d_transpose.__doc__, "filter", "filters"),
+    "value", "input")
+
+
 @tf_export("nn.atrous_conv2d_transpose")
 def atrous_conv2d_transpose(value,
                             filters,
@@ -1303,7 +1726,7 @@ def atrous_conv2d_transpose(value,
                       [value, filters, output_shape]) as name:
     value = ops.convert_to_tensor(value, name="value")
     filters = ops.convert_to_tensor(filters, name="filters")
-    if not value.get_shape()[3].is_compatible_with(filters.get_shape()[3]):
+    if not value.get_shape().dims[3].is_compatible_with(filters.get_shape()[3]):
       raise ValueError(
           "value's input channels does not match filters' input channels, "
           "{} != {}".format(value.get_shape()[3],
@@ -1327,7 +1750,7 @@ def atrous_conv2d_transpose(value,
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [4] if reached this point.
-      if not filters.get_shape()[2].is_compatible_with(output_shape[3]):
+      if not filters.get_shape().dims[2].is_compatible_with(output_shape[3]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
             "{} != {}".format(output_shape[3],
@@ -1404,7 +1827,29 @@ def atrous_conv2d_transpose(value,
         input=value, crops=batch_to_space_crop, block_size=rate)
 
 
-@tf_export("nn.conv3d_transpose")
+@tf_export("nn.conv3d", v1=[])
+def conv3d_v2(input,  # pylint: disable=redefined-builtin,missing-docstring
+              filters,
+              strides,
+              padding,
+              data_format="NDHWC",
+              dilations=None,
+              name=None):
+  if dilations is None:
+    dilations = [1, 1, 1, 1, 1]
+  return gen_nn_ops.conv3d(input,  # pylint: disable=redefined-builtin
+                           filters,
+                           strides,
+                           padding,
+                           data_format=data_format,
+                           dilations=dilations,
+                           name=name)
+tf_export(v1=["nn.conv3d"])(gen_nn_ops.conv3d)
+conv3d_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    gen_nn_ops.conv3d.__doc__, "filter", "filters")
+
+
+@tf_export(v1=["nn.conv3d_transpose"])
 def conv3d_transpose(
     value,
     filter,  # pylint: disable=redefined-builtin
@@ -1448,7 +1893,8 @@ def conv3d_transpose(
     value = ops.convert_to_tensor(value, name="value")
     filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
     axis = 1 if data_format == "NCDHW" else 4
-    if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[4]):
+    if not value.get_shape().dims[axis].is_compatible_with(
+        filter.get_shape()[4]):
       raise ValueError("input channels does not match filter's input channels, "
                        "{} != {}".format(value.get_shape()[axis],
                                          filter.get_shape()[4]))
@@ -1460,7 +1906,8 @@ def conv3d_transpose(
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [5] if reached this point.
-      if not filter.get_shape()[3].is_compatible_with(output_shape[axis]):
+      if not filter.get_shape().dims[3].is_compatible_with(
+          output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
             "{} != {}".format(output_shape[axis],
@@ -1480,6 +1927,31 @@ def conv3d_transpose(
         name=name)
 
 
+# pylint: disable=redefined-builtin
+@tf_export("nn.conv3d_transpose", v1=[])
+def conv3d_transpose_v2(
+    input,
+    filters,
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NDHWC",
+    name=None):
+  return conv3d_transpose(
+      input,
+      filters,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+# pylint: enable=redefined-builtin
+conv3d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        conv3d_transpose.__doc__, "filter", "filters"),
+    "value", "input")
+
+
 @tf_export("nn.bias_add")
 def bias_add(value, bias, data_format=None, name=None):
   """Adds `bias` to `value`.
@@ -1535,7 +2007,7 @@ def bias_add_v1(value, bias, name=None):
     return gen_nn_ops.bias_add_v1(value, bias, name=name)
 
 
-@tf_export("nn.crelu")
+@tf_export(v1=["nn.crelu"])
 def crelu(features, name=None, axis=-1):
   """Computes Concatenated ReLU.
 
@@ -1561,6 +2033,12 @@ def crelu(features, name=None, axis=-1):
     return gen_nn_ops.relu(c)
 
 
+@tf_export("nn.crelu", v1=[])
+def crelu_v2(features, axis=-1, name=None):
+  return crelu(features, name=name, axis=axis)
+crelu_v2.__doc__ = crelu.__doc__
+
+
 @tf_export("nn.relu6")
 def relu6(features, name=None):
   """Computes Rectified Linear 6: `min(max(features, 0), 6)`.
@@ -1602,6 +2080,10 @@ def leaky_relu(features, alpha=0.2, name=None):
     features = ops.convert_to_tensor(features, name="features")
     if features.dtype.is_integer:
       features = math_ops.to_float(features)
+    if compat.forward_compatible(2018, 11, 1):
+      if isinstance(alpha, np.ndarray):
+        alpha = np.asscalar(alpha)
+      return gen_nn_ops.leaky_relu(features, alpha=alpha, name=name)
     alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
     return math_ops.maximum(alpha * features, features, name=name)
 
@@ -1674,6 +2156,16 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   if is_last_dim:
     return compute_op(logits, name=name)
 
+  dim_val = dim
+  if isinstance(dim, ops.Tensor):
+    dim_val = tensor_util.constant_value(dim)
+  if dim_val is not None and (dim_val < -shape.ndims or dim_val >= shape.ndims):
+    raise errors_impl.InvalidArgumentError(
+        None, None,
+        "Dimension (%d) must be in the range [%d, %d) where %d is the number of"
+        " dimensions in the input." % (dim_val, -shape.ndims, shape.ndims,
+                                       shape.ndims))
+
   # If dim is not the last dimension, we have to do a transpose so that we can
   # still perform softmax on its last dimension.
 
@@ -1694,7 +2186,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   return output
 
 
-@tf_export("nn.softmax", "math.softmax")
+@tf_export(v1=["nn.softmax", "math.softmax"])
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def softmax(logits, axis=None, name=None, dim=None):
   """Computes softmax activations.
@@ -1724,7 +2216,34 @@ def softmax(logits, axis=None, name=None, dim=None):
   return _softmax(logits, gen_nn_ops.softmax, axis, name)
 
 
-@tf_export("nn.log_softmax", "math.log_softmax")
+@tf_export("nn.softmax", "math.softmax", v1=[])
+def softmax_v2(logits, axis=None, name=None):
+  """Computes softmax activations.
+
+  This function performs the equivalent of
+
+      softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)
+
+  Args:
+    logits: A non-empty `Tensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    axis: The dimension softmax would be performed on. The default is -1 which
+      indicates the last dimension.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type and shape as `logits`.
+
+  Raises:
+    InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
+      dimension of `logits`.
+  """
+  if axis is None:
+    axis = -1
+  return _softmax(logits, gen_nn_ops.softmax, axis, name)
+
+
+@tf_export(v1=["nn.log_softmax", "math.log_softmax"])
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def log_softmax(logits, axis=None, name=None, dim=None):
   """Computes log softmax activations.
@@ -1754,6 +2273,33 @@ def log_softmax(logits, axis=None, name=None, dim=None):
   return _softmax(logits, gen_nn_ops.log_softmax, axis, name)
 
 
+@tf_export("nn.log_softmax", "math.log_softmax", v1=[])
+def log_softmax_v2(logits, axis=None, name=None):
+  """Computes log softmax activations.
+
+  For each batch `i` and class `j` we have
+
+      logsoftmax = logits - log(reduce_sum(exp(logits), axis))
+
+  Args:
+    logits: A non-empty `Tensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    axis: The dimension softmax would be performed on. The default is -1 which
+      indicates the last dimension.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `logits`. Same shape as `logits`.
+
+  Raises:
+    InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
+      dimension of `logits`.
+  """
+  if axis is None:
+    axis = -1
+  return _softmax(logits, gen_nn_ops.log_softmax, axis, name)
+
+
 def _ensure_xent_args(name, sentinel, labels, logits):
   # Make sure that all arguments were passed as named arguments.
   if sentinel is not None:
@@ -1763,13 +2309,8 @@ def _ensure_xent_args(name, sentinel, labels, logits):
     raise ValueError("Both labels and logits must be provided.")
 
 
-@tf_export("nn.softmax_cross_entropy_with_logits_v2")
-def softmax_cross_entropy_with_logits_v2(
-    _sentinel=None,  # pylint: disable=invalid-name
-    labels=None,
-    logits=None,
-    dim=-1,
-    name=None):
+@tf_export("nn.softmax_cross_entropy_with_logits", v1=[])
+def softmax_cross_entropy_with_logits_v2(labels, logits, axis=-1, name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1791,7 +2332,7 @@ def softmax_cross_entropy_with_logits_v2(
 
   A common use case is to have logits and labels of shape
   `[batch_size, num_classes]`, but higher dimensions are supported, with
-  the `dim` argument specifying the class dimension.
+  the `axis` argument specifying the class dimension.
 
   `logits` and `labels` must have the same dtype (either `float16`, `float32`,
   or `float64`).
@@ -1804,13 +2345,12 @@ def softmax_cross_entropy_with_logits_v2(
   this function.**
 
   Args:
-    _sentinel: Used to prevent positional parameters. Internal, do not use.
     labels: Each vector along the class dimension should hold a valid
       probability distribution e.g. for the case in which labels are of shape
       `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
       probability distribution.
     logits: Unscaled log probabilities.
-    dim: The class dimension. Defaulted to -1 which is the last dimension.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
 
   Returns:
@@ -1818,12 +2358,69 @@ def softmax_cross_entropy_with_logits_v2(
     same as `logits` and its shape is the same as `labels` except that it does
     not have the last dimension of `labels`.
   """
-  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
-                    logits)
+  return softmax_cross_entropy_with_logits_v2_helper(
+      labels=labels, logits=logits, axis=axis, name=name)
+
+
+@tf_export(v1=["nn.softmax_cross_entropy_with_logits_v2"])
+@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+def softmax_cross_entropy_with_logits_v2_helper(
+    labels, logits, axis=None, name=None, dim=None):
+  """Computes softmax cross entropy between `logits` and `labels`.
+
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class).  For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+
+  **NOTE:**  While the classes are mutually exclusive, their probabilities
+  need not be.  All that is required is that each row of `labels` is
+  a valid probability distribution.  If they are not, the computation of the
+  gradient will be incorrect.
+
+  If using exclusive `labels` (wherein one and only
+  one class is true at a time), see `sparse_softmax_cross_entropy_with_logits`.
+
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency.  Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
 
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `axis` argument specifying the class dimension.
+
+  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
+  or `float64`).
+
+  Backpropagation will happen into both `logits` and `labels`.  To disallow
+  backpropagation into `labels`, pass label tensors through `tf.stop_gradient`
+  before feeding it to this function.
+
+  **Note that to avoid confusion, it is required to pass only named arguments to
+  this function.**
+
+  Args:
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
+    logits: Unscaled log probabilities.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
+    name: A name for the operation (optional).
+    dim: Deprecated alias for axis.
+
+  Returns:
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
+  """
   # TODO(pcmurray) Raise an error when the labels do not sum to 1. Note: This
   # could break users who call this with bad labels, but disregard the bad
   # results.
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  del dim
+  if axis is None:
+    axis = -1
 
   with ops.name_scope(name, "softmax_cross_entropy_with_logits",
                       [logits, labels]) as name:
@@ -1840,7 +2437,7 @@ def softmax_cross_entropy_with_logits_v2(
     shape = logits.get_shape()
 
     # Move the dim to the end if dim is not the last dimension.
-    if dim is not -1:
+    if axis != -1:
 
       def _move_dim_to_end(tensor, dim_index, rank):
         return array_ops.transpose(
@@ -1850,8 +2447,8 @@ def softmax_cross_entropy_with_logits_v2(
                 math_ops.range(dim_index + 1, rank), [dim_index]
             ], 0))
 
-      precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
-      labels = _move_dim_to_end(labels, dim, input_rank)
+      precise_logits = _move_dim_to_end(precise_logits, axis, input_rank)
+      labels = _move_dim_to_end(labels, axis, input_rank)
 
     input_shape = array_ops.shape(precise_logits)
 
@@ -1865,7 +2462,7 @@ def softmax_cross_entropy_with_logits_v2(
     cost, unused_backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
         precise_logits, labels, name=name)
 
-    # The output cost shape should be the input minus dim.
+    # The output cost shape should be the input minus axis.
     output_shape = array_ops.slice(input_shape, [0],
                                    [math_ops.subtract(input_rank, 1)])
     cost = array_ops.reshape(cost, output_shape)
@@ -1875,7 +2472,7 @@ def softmax_cross_entropy_with_logits_v2(
     if not context.executing_eagerly(
     ) and shape is not None and shape.dims is not None:
       shape = shape.as_list()
-      del shape[dim]
+      del shape[axis]
       cost.set_shape(shape)
 
     if convert_to_float32:
@@ -1892,7 +2489,7 @@ See `tf.nn.softmax_cross_entropy_with_logits_v2`.
 """
 
 
-@tf_export("nn.softmax_cross_entropy_with_logits")
+@tf_export(v1=["nn.softmax_cross_entropy_with_logits"])
 @deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION)
 def softmax_cross_entropy_with_logits(
     _sentinel=None,  # pylint: disable=invalid-name
@@ -1953,7 +2550,7 @@ def softmax_cross_entropy_with_logits(
     labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
 
   return softmax_cross_entropy_with_logits_v2(
-      labels=labels, logits=logits, dim=dim, name=name)
+      labels=labels, logits=logits, axis=dim, name=name)
 
 
 @tf_export("nn.sparse_softmax_cross_entropy_with_logits")
@@ -1980,8 +2577,9 @@ def sparse_softmax_cross_entropy_with_logits(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  A common use case is to have logits and labels of shape
-  `[batch_size, num_classes]`, but higher dimensions are supported, in which
+  A common use case is to have logits of shape
+  `[batch_size, num_classes]` and have labels of shape
+  `[batch_size]`, but higher dimensions are supported, in which
   case the `dim`-th dimension is assumed to be of size `num_classes`.
   `logits` must have the dtype of `float16`, `float32`, or `float64`, and
   `labels` must have the dtype of `int32` or `int64`.
@@ -2142,6 +2740,67 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
         name=name)
 
 
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool_with_argmax", v1=[])
+def max_pool_with_argmax_v2(input,
+                            ksize,
+                            strides,
+                            padding,
+                            data_format="NHWC",
+                            output_dtype=dtypes.int64,
+                            name=None):
+  """Performs max pooling on the input and outputs both max values and indices.
+
+  The indices in `argmax` are flattened, so that a maximum value at position
+  `[b, y, x, c]` becomes flattened index
+  `((b * height + y) * width + x) * channels + c`.
+
+  The indices returned are always in `[0, height) x [0, width)` before
+  flattening, even if padding is involved and the mathematically correct answer
+  is outside (either negative or too large).  This is a bug, but fixing it is
+  difficult to do in a safe backwards compatible way, especially due to
+  flattening.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+      `int32`, `uint8`, `int16`, `int8`, `int64`, `bfloat16`, `uint16`, `half`,
+      `uint32`, `uint64`.
+      4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+    ksize: A list of `ints` that has length `>= 4`.
+      The size of the window for each dimension of the input tensor.
+    strides: A list of `ints` that has length `>= 4`.
+      The stride of the sliding window for each dimension of the
+      input tensor.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string`, must be set to `"NHWC"`. Defaults to
+      `"NHWC"`.
+      Specify the data format of the input and output data.
+    output_dtype: An optional `tf.DType` from: `tf.int32, tf.int64`.
+      Defaults to `tf.int64`.
+      The dtype of the returned argmax tensor.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `Tensor` objects (output, argmax).
+
+    output: A `Tensor`. Has the same type as `input`.
+    argmax: A `Tensor` of type `output_dtype`.
+  """
+
+  if data_format != "NHWC":
+    raise ValueError("Data formats other than 'NHWC' are not yet supported")
+
+  return gen_nn_ops.max_pool_with_argmax(input=input,
+                                         ksize=ksize,
+                                         strides=strides,
+                                         padding=padding,
+                                         Targmax=output_dtype,
+                                         name=name)
+
+# pylint: enable=redefined-builtin
+
+
 @ops.RegisterStatistics("Conv2D", "flops")
 def _calc_conv_flops(graph, node):
   """Calculates the compute resources needed for Conv2D."""
@@ -2186,7 +2845,7 @@ def _calc_bias_add_flops(graph, node):
   return ops.OpStats("flops", input_count)
 
 
-@tf_export("nn.xw_plus_b")
+@tf_export(v1=["nn.xw_plus_b"])
 def xw_plus_b(x, weights, biases, name=None):  # pylint: disable=invalid-name
   """Computes matmul(x, weights) + biases.
 
@@ -2258,12 +2917,16 @@ def _get_noise_shape(x, noise_shape):
   return noise_shape
 
 
-@tf_export("nn.dropout")
-def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
+@tf_export(v1=["nn.dropout"])
+@deprecation.deprecated_args(None, "Please use `rate` instead of `keep_prob`. "
+                             "Rate should be set to `rate = 1 - keep_prob`.",
+                             "keep_prob")
+def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None,
+            rate=None):  # pylint: disable=invalid-name
   """Computes dropout.
 
-  With probability `keep_prob`, outputs the input element scaled up by
-  `1 / keep_prob`, otherwise outputs `0`.  The scaling is so that the expected
+  For each element of `x`, with probability `rate`, outputs `0`, and otherwise
+  scales up the input by `1 / (1-rate)`. The scaling is such that the expected
   sum is unchanged.
 
   By default, each element is kept or dropped independently.  If `noise_shape`
@@ -2276,8 +2939,59 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
 
   Args:
     x: A floating point tensor.
-    keep_prob: A scalar `Tensor` with the same type as x. The probability
-      that each element is kept.
+    keep_prob: (deprecated) A deprecated alias for `(1-rate)`.
+    noise_shape: A 1-D `Tensor` of type `int32`, representing the
+      shape for randomly generated keep/drop flags.
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed` for behavior.
+    name: A name for this operation (optional).
+    rate: A scalar `Tensor` with the same type as `x`. The probability that each
+      element of `x` is discarded.
+
+  Returns:
+    A Tensor of the same shape of `x`.
+
+  Raises:
+    ValueError: If `rate` is not in `[0, 1)` or if `x` is not a floating
+      point tensor.
+  """
+  try:
+    keep = 1. - keep_prob if keep_prob is not None else None
+  except TypeError:
+    raise ValueError("keep_prob must be a floating point number or Tensor "
+                     "(got %r)" % keep_prob)
+
+  rate = deprecation.deprecated_argument_lookup(
+      "rate", rate,
+      "keep_prob", keep)
+
+  if rate is None:
+    raise ValueError("You must provide a rate to dropout.")
+
+  return dropout_v2(x, rate, noise_shape=noise_shape, seed=seed, name=name)
+
+
+@tf_export("nn.dropout", v1=[])
+def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
+  """Computes dropout.
+
+  With probability `rate`, drops elements of `x`. Input that are kept are
+  scaled up by `1 / (1 - rate)`, otherwise outputs `0`.  The scaling is so that
+  the expected sum is unchanged.
+
+  By default, each element is kept or dropped independently.  If `noise_shape`
+  is specified, it must be
+  [broadcastable](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+  to the shape of `x`, and only dimensions with `noise_shape[i] == shape(x)[i]`
+  will make independent decisions.  For example, if `shape(x) = [k, l, m, n]`
+  and `noise_shape = [k, 1, 1, n]`, each batch and channel component will be
+  kept independently and each row and column will be kept or not kept together.
+
+  Args:
+    x: A floating point tensor.
+    rate: A scalar `Tensor` with the same type as x. The probability
+      that each element is dropped. For example, setting rate=0.1 would drop
+      10% of input elements.
     noise_shape: A 1-D `Tensor` of type `int32`, representing the
       shape for randomly generated keep/drop flags.
     seed: A Python integer. Used to create random seeds. See
@@ -2297,35 +3011,36 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     if not x.dtype.is_floating:
       raise ValueError("x has to be a floating point tensor since it's going to"
                        " be scaled. Got a %s tensor instead." % x.dtype)
-    if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
-      raise ValueError("keep_prob must be a scalar tensor or a float in the "
-                       "range (0, 1], got %g" % keep_prob)
+    if isinstance(rate, numbers.Real) and not (rate >= 0 and rate < 1):
+      raise ValueError("rate must be a scalar tensor or a float in the "
+                       "range [0, 1), got %g" % rate)
 
     # Early return if nothing needs to be dropped.
-    if isinstance(keep_prob, float) and keep_prob == 1:
+    if isinstance(rate, numbers.Real) and rate == 0:
       return x
     if context.executing_eagerly():
-      if isinstance(keep_prob, ops.EagerTensor):
-        if keep_prob.numpy() == 1:
+      if isinstance(rate, ops.EagerTensor):
+        if rate.numpy() == 0:
           return x
     else:
-      keep_prob = ops.convert_to_tensor(
-          keep_prob, dtype=x.dtype, name="keep_prob")
-      keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())
+      rate = ops.convert_to_tensor(
+          rate, dtype=x.dtype, name="rate")
+      rate.get_shape().assert_is_compatible_with(tensor_shape.scalar())
 
-      # Do nothing if we know keep_prob == 1
-      if tensor_util.constant_value(keep_prob) == 1:
+      # Do nothing if we know rate == 0
+      if tensor_util.constant_value(rate) == 0:
         return x
 
     noise_shape = _get_noise_shape(x, noise_shape)
 
+    keep_prob = 1 - rate
     # uniform [keep_prob, 1.0 + keep_prob)
     random_tensor = keep_prob
     random_tensor += random_ops.random_uniform(
         noise_shape, seed=seed, dtype=x.dtype)
     # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
     binary_tensor = math_ops.floor(random_tensor)
-    ret = math_ops.div(x, keep_prob) * binary_tensor
+    ret = math_ops.divide(x, keep_prob) * binary_tensor
     if not context.executing_eagerly():
       ret.set_shape(x.get_shape())
     return ret
@@ -2389,7 +3104,293 @@ def nth_element(input, n, reverse=False, name=None):  # pylint: disable=redefine
   return gen_nn_ops.nth_element(input, n, reverse=reverse, name=name)
 
 
-@tf_export("nn.conv1d")
+@tf_export(v1=["nn.fractional_max_pool"])
+@deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` "
+                        "args are deprecated.  Use fractional_max_pool_v2.")
+def fractional_max_pool(value,
+                        pooling_ratio,
+                        pseudo_random=False,
+                        overlapping=False,
+                        deterministic=False,
+                        seed=0,
+                        seed2=0,
+                        name=None):   # pylint: disable=redefined-builtin
+  r"""Performs fractional max pooling on the input.
+
+  This is a deprecated version of `fractional_max_pool`.
+
+  Fractional max pooling is slightly different than regular max pooling.  In
+  regular max pooling, you downsize an input set by taking the maximum value of
+  smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+  a factor of N, where N is an integer.  Fractional max pooling, as you might
+  expect from the word "fractional", means that the overall reduction ratio N
+  does not have to be an integer.
+
+  The sizes of the pooling regions are generated randomly but are fairly
+  uniform.  For example, let's look at the height dimension, and the constraints
+  on the list of rows that will be pool boundaries.
+
+  First we define the following:
+
+  1.  input_row_length : the number of rows from the input set
+  2.  output_row_length : which will be smaller than the input
+  3.  alpha = input_row_length / output_row_length : our reduction ratio
+  4.  K = floor(alpha)
+  5.  row_pooling_sequence : this is the result list of pool boundary rows
+
+  Then, row_pooling_sequence should satisfy:
+
+  1.  a[0] = 0 : the first value of the sequence is 0
+  2.  a[end] = input_row_length : the last value of the sequence is the size
+  3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+  4.  length(row_pooling_sequence) = output_row_length+1
+
+  For more details on fractional max pooling, see this paper: [Benjamin Graham,
+  Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional max pooling.
+    deterministic: An optional `bool`.  Deprecated; use `fractional_max_pool_v2`
+      instead.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    seed2: An optional `int`.  Deprecated; use `fractional_max_pool_v2` instead.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional max pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
+                                        overlapping, deterministic, seed, seed2,
+                                        name)
+
+
+@tf_export("nn.fractional_max_pool", v1=[])
+def fractional_max_pool_v2(value,
+                           pooling_ratio,
+                           pseudo_random=False,
+                           overlapping=False,
+                           seed=0,
+                           name=None):  # pylint: disable=redefined-builtin
+  r"""Performs fractional max pooling on the input.
+
+  Fractional max pooling is slightly different than regular max pooling.  In
+  regular max pooling, you downsize an input set by taking the maximum value of
+  smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+  a factor of N, where N is an integer.  Fractional max pooling, as you might
+  expect from the word "fractional", means that the overall reduction ratio N
+  does not have to be an integer.
+
+  The sizes of the pooling regions are generated randomly but are fairly
+  uniform.  For example, let's look at the height dimension, and the constraints
+  on the list of rows that will be pool boundaries.
+
+  First we define the following:
+
+  1.  input_row_length : the number of rows from the input set
+  2.  output_row_length : which will be smaller than the input
+  3.  alpha = input_row_length / output_row_length : our reduction ratio
+  4.  K = floor(alpha)
+  5.  row_pooling_sequence : this is the result list of pool boundary rows
+
+  Then, row_pooling_sequence should satisfy:
+
+  1.  a[0] = 0 : the first value of the sequence is 0
+  2.  a[end] = input_row_length : the last value of the sequence is the size
+  3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+  4.  length(row_pooling_sequence) = output_row_length+1
+
+  For more details on fractional max pooling, see this paper: [Benjamin Graham,
+  Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional max pooling.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional max pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  if seed == 0:
+    return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=False,
+                                          seed=0, seed2=0, name=name)
+  else:
+    seed1, seed2 = random_seed.get_seed(seed)
+    return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=True,
+                                          seed=seed1, seed2=seed2, name=name)
+
+
+@tf_export(v1=["nn.fractional_avg_pool"])
+@deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` "
+                        "args are deprecated.  Use fractional_avg_pool_v2.")
+def fractional_avg_pool(value,
+                        pooling_ratio,
+                        pseudo_random=False,
+                        overlapping=False,
+                        deterministic=False,
+                        seed=0,
+                        seed2=0,
+                        name=None):  # pylint: disable=redefined-builtin
+  r"""Performs fractional average pooling on the input.
+
+  This is a deprecated version of `fractional_avg_pool`.
+
+  Fractional average pooling is similar to Fractional max pooling in the pooling
+  region generation step. The only difference is that after pooling regions are
+  generated, a mean operation is performed instead of a max operation in each
+  pooling region.
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional avg pooling.
+    deterministic: An optional `bool`.  Deprecated; use `fractional_avg_pool_v2`
+      instead.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    seed2: An optional `int`.  Deprecated; use `fractional_avg_pool_v2` instead.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional avg pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random,
+                                        overlapping, deterministic, seed, seed2,
+                                        name=name)
+
+
+@tf_export("nn.fractional_avg_pool", v1=[])
+def fractional_avg_pool_v2(value,
+                           pooling_ratio,
+                           pseudo_random=False,
+                           overlapping=False,
+                           seed=0,
+                           name=None):  # pylint: disable=redefined-builtin
+  r"""Performs fractional average pooling on the input.
+
+  Fractional average pooling is similar to Fractional max pooling in the pooling
+  region generation step. The only difference is that after pooling regions are
+  generated, a mean operation is performed instead of a max operation in each
+  pooling region.
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional avg pooling.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional avg pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  if seed == 0:
+    return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=False,
+                                          seed=0, seed2=0, name=name)
+  else:
+    seed1, seed2 = random_seed.get_seed(seed)
+    return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=True,
+                                          seed=seed1, seed2=seed2, name=name)
+
+
+@tf_export(v1=["nn.conv1d"])
 @deprecation.deprecated_arg_values(
     None,
     "`NCHW` for data_format is deprecated, use `NCW` instead",
@@ -2474,6 +3475,64 @@ def conv1d(value,
     return array_ops.squeeze(result, [spatial_start_dim])
 
 
+@tf_export("nn.conv1d", v1=[])
+def conv1d_v2(input,  # pylint: disable=redefined-builtin
+              filters,
+              stride,
+              padding,
+              data_format=None,
+              name=None):
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
+
+  Given an input tensor of shape
+    [batch, in_width, in_channels]
+  if data_format is "NWC", or
+    [batch, in_channels, in_width]
+  if data_format is "NCW",
+  and a filter / kernel tensor of shape
+  [filter_width, in_channels, out_channels], this op reshapes
+  the arguments to pass them to conv2d to perform the equivalent
+  convolution operation.
+
+  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
+  For example, if `data_format` does not start with "NC", a tensor of shape
+    [batch, in_width, in_channels]
+  is reshaped to
+    [batch, 1, in_width, in_channels],
+  and the filter is reshaped to
+    [1, filter_width, in_channels, out_channels].
+  The result is then reshaped back to
+    [batch, out_width, out_channels]
+  \(where out_width is a function of the stride and padding as in conv2d\) and
+  returned to the caller.
+
+  Args:
+    input: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
+    filters: A 3D `Tensor`.  Must have the same type as `input`.
+    stride: An `integer`.  The number of entries by which
+      the filter is moved right at each step.
+    padding: 'SAME' or 'VALID'
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
+      to `"NWC"`, the data is stored in the order of
+      [batch, in_width, in_channels].  The `"NCW"` format stores
+      data as [batch, in_channels, in_width].
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`.  Has the same type as input.
+
+  Raises:
+    ValueError: if `data_format` is invalid.
+  """
+  return conv1d(input,  # pylint: disable=redefined-builtin
+                filters,
+                stride,
+                padding,
+                use_cudnn_on_gpu=True,
+                data_format=data_format,
+                name=name)
+
+
 def conv1d_transpose(
     value,
     filter,  # pylint: disable=redefined-builtin
@@ -2529,14 +3588,16 @@ def conv1d_transpose(
     else:
       raise ValueError("data_format must be \"NWC\" or \"NCW\".")
 
-    if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[2]):
+    if not value.get_shape().dims[axis].is_compatible_with(
+        filter.get_shape()[2]):
       raise ValueError("input channels does not match filter's input channels, "
                        "{} != {}".format(value.get_shape()[axis],
                                          filter.get_shape()[2]))
 
     if isinstance(output_shape, (list, np.ndarray)):
       # output_shape's shape should be == [3] if reached this point.
-      if not filter.get_shape()[1].is_compatible_with(output_shape[axis]):
+      if not filter.get_shape().dims[1].is_compatible_with(
+          output_shape[axis]):
         raise ValueError(
             "output_shape does not match filter's output channels, "
             "{} != {}".format(output_shape[axis],
@@ -2587,7 +3648,7 @@ def _calc_dilation2d_flops(graph, node):
   return ops.OpStats("flops", (output_count * filter_height * filter_width * 2))
 
 
-@tf_export("nn.erosion2d")
+@tf_export(v1=["nn.erosion2d"])
 def erosion2d(value, kernel, strides, rates, padding, name=None):
   """Computes the grayscale erosion of 4-D `value` and 3-D `kernel` tensors.
 
@@ -2646,7 +3707,76 @@ def erosion2d(value, kernel, strides, rates, padding, name=None):
             name=name))
 
 
-@tf_export("math.in_top_k", "nn.in_top_k")
+@tf_export("nn.erosion2d", v1=[])
+def erosion2d_v2(value,
+                 filters,
+                 strides,
+                 padding,
+                 data_format,
+                 dilations,
+                 name=None):
+  """Computes the grayscale erosion of 4-D `value` and 3-D `filters` tensors.
+
+  The `value` tensor has shape `[batch, in_height, in_width, depth]` and the
+  `filters` tensor has shape `[filters_height, filters_width, depth]`, i.e.,
+  each input channel is processed independently of the others with its own
+  structuring function. The `output` tensor has shape
+  `[batch, out_height, out_width, depth]`. The spatial dimensions of the
+  output tensor depend on the `padding` algorithm. We currently only support the
+  default "NHWC" `data_format`.
+
+  In detail, the grayscale morphological 2-D erosion is given by:
+
+      output[b, y, x, c] =
+         min_{dy, dx} value[b,
+                            strides[1] * y - dilations[1] * dy,
+                            strides[2] * x - dilations[2] * dx,
+                            c] -
+                      filters[dy, dx, c]
+
+  Duality: The erosion of `value` by the `filters` is equal to the negation of
+  the dilation of `-value` by the reflected `filters`.
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, in_height, in_width, depth]`.
+    filters: A `Tensor`. Must have the same type as `value`.
+      3-D with shape `[filters_height, filters_width, depth]`.
+    strides: A list of `ints` that has length `>= 4`.
+      1-D of length 4. The stride of the sliding window for each dimension of
+      the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: A `string`, only `"NHWC"` is currently supported.
+    dilations: A list of `ints` that has length `>= 4`.
+      1-D of length 4. The input stride for atrous morphological dilation.
+      Must be: `[1, rate_height, rate_width, 1]`.
+    name: A name for the operation (optional). If not specified "erosion2d"
+      is used.
+
+  Returns:
+    A `Tensor`. Has the same type as `value`.
+    4-D with shape `[batch, out_height, out_width, depth]`.
+
+  Raises:
+    ValueError: If the `value` depth does not match `filters`' shape, or if
+      padding is other than `'VALID'` or `'SAME'`.
+  """
+  if data_format != "NHWC":
+    raise ValueError("Data formats other than NHWC are not yet supported")
+
+  with ops.name_scope(name, "erosion2d", [value, filters]) as name:
+    # Reduce erosion to dilation by duality.
+    return math_ops.negative(
+        gen_nn_ops.dilation2d(
+            input=math_ops.negative(value),
+            filter=array_ops.reverse_v2(filters, [0, 1]),
+            strides=strides,
+            rates=dilations,
+            padding=padding,
+            name=name))
+
+
+@tf_export(v1=["math.in_top_k", "nn.in_top_k"])
 def in_top_k(predictions, targets, k, name=None):
   r"""Says whether the targets are in the top `K` predictions.
 
@@ -2678,3 +3808,17 @@ def in_top_k(predictions, targets, k, name=None):
   """
   with ops.name_scope(name, "in_top_k"):
     return gen_nn_ops.in_top_kv2(predictions, targets, k, name=name)
+
+
+@tf_export("math.in_top_k", "nn.in_top_k", v1=[])
+def in_top_k_v2(targets, predictions, k, name=None):
+  return in_top_k(predictions, targets, k, name)
+
+
+in_top_k_v2.__doc__ = in_top_k.__doc__
+
+
+tf_export(v1=["nn.quantized_avg_pool"])(gen_nn_ops.quantized_avg_pool)
+tf_export(v1=["nn.quantized_conv2d"])(gen_nn_ops.quantized_conv2d)
+tf_export(v1=["nn.quantized_relu_x"])(gen_nn_ops.quantized_relu_x)
+tf_export(v1=["nn.quantized_max_pool"])(gen_nn_ops.quantized_max_pool)
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 2fabb2e966aea4ff02cc3e0326567a8b69335c2b..82fab741830fddd4ee0ba5c8e2644702ec199b4d 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -49,23 +49,46 @@ class ZeroFractionTest(test_lib.TestCase):
     nonzeros = np.count_nonzero(x.flatten())
     return 1.0 - nonzeros / total_elements
 
+  @test_util.run_deprecated_v1
   def testZeroFraction(self):
     x_shape = [5, 17]
     x_np = np.random.randint(0, 2, size=x_shape).astype(np.float32)
     y_np = self._ZeroFraction(x_np)
-    with self.cached_session():
-      x_tf = constant_op.constant(x_np)
-      x_tf.set_shape(x_shape)
-      y_tf = nn_impl.zero_fraction(x_tf)
-      y_tf_np = y_tf.eval()
+
+    x_tf = constant_op.constant(x_np)
+    x_tf.set_shape(x_shape)
+    y_tf = nn_impl.zero_fraction(x_tf)
+    y_tf_np = self.evaluate(y_tf)
+
     eps = 1e-8
     self.assertAllClose(y_tf_np, y_np, eps)
 
+  @test_util.run_deprecated_v1
   def testZeroFractionEmpty(self):
-    with self.cached_session():
-      x = np.zeros(0)
-      y = nn_impl.zero_fraction(x).eval()
-      self.assertTrue(np.isnan(y))
+    x = np.zeros(0)
+    y = self.evaluate(nn_impl.zero_fraction(x))
+    self.assertTrue(np.isnan(y))
+
+  @test_util.run_deprecated_v1
+  def testZeroFraction2_27Zeros(self):
+    sparsity = nn_impl.zero_fraction(
+        array_ops.zeros([int(2**27 * 1.01)], dtype=dtypes.int8))
+    self.assertAllClose(1.0, self.evaluate(sparsity))
+
+  @test_util.run_deprecated_v1
+  def testZeroFraction2_27Ones(self):
+    sparsity = nn_impl.zero_fraction(
+        array_ops.ones([int(2**27 * 1.01)], dtype=dtypes.int8))
+    self.assertAllClose(0.0, self.evaluate(sparsity))
+
+  @test_util.run_deprecated_v1
+  def testUnknownSize(self):
+    value = array_ops.placeholder(dtype=dtypes.float32)
+    sparsity = nn_impl.zero_fraction(value)
+    with self.cached_session() as sess:
+      self.assertAllClose(
+          0.25,
+          sess.run(sparsity, {value: [[0., 1.], [0.3, 2.]]}))
 
 
 class SoftmaxTest(test_lib.TestCase, parameterized.TestCase):
@@ -83,8 +106,8 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     x_np = np.random.randn(*x_shape).astype(np.float32)
     y_np = self._softmax(x_np)
     x_tf = constant_op.constant(x_np)
-    y_tf = nn_ops.softmax(x_tf)
-    y_tf_last_dim = nn_ops.softmax(x_tf, 1)
+    y_tf = nn_ops.softmax_v2(x_tf)
+    y_tf_last_dim = nn_ops.softmax_v2(x_tf, 1)
     y_tf_np = self.evaluate(y_tf)
     y_tf_last_dim_np = self.evaluate(y_tf_last_dim)
     eps = 1e-3
@@ -93,9 +116,9 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase):
 
   def testSoftmaxAxes(self):
     arr = np.linspace(0., 1, 12).reshape(3, 4)
-    x_neg_axis = nn_ops.softmax(arr, axis=-2)
-    y_pos_axis = nn_ops.softmax(arr, axis=0)
-    z_gt_axis = nn_ops.softmax(arr, axis=4)
+    x_neg_axis = nn_ops.softmax_v2(arr, axis=-2)
+    y_pos_axis = nn_ops.softmax_v2(arr, axis=0)
+    z_gt_axis = nn_ops.softmax_v2(arr, axis=0)
     x_neg_axis_tf = self.evaluate(x_neg_axis)
     y_pos_axis_tf = self.evaluate(y_pos_axis)
     z_gt_axis_tf = self.evaluate(z_gt_axis)
@@ -104,11 +127,12 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
 
   @parameterized.parameters(((5, 10),), ((2, 3, 4),))
+  @test_util.run_deprecated_v1
   def testGradient(self, x_shape):
     x_np = np.random.randn(*x_shape).astype(np.float64)
     with self.cached_session():
       x_tf = constant_op.constant(x_np)
-      y_tf = nn_ops.softmax(x_tf)
+      y_tf = nn_ops.softmax_v2(x_tf)
       err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                     x_shape)
     eps = 2e-8
@@ -139,6 +163,7 @@ class LogPoissonLossTest(test_lib.TestCase):
     self.assertAllClose(y_tf_np, y_np, eps)
     self.assertAllClose(y_tf_np_stirling, y_np_stirling, eps)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [5, 10]
     x_np = np.random.randn(*x_shape).astype(np.float64)
@@ -171,16 +196,16 @@ class LogSoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     x_np = np.random.randn(*x_shape).astype(np.float32)
     y_np = self._log_softmax(x_np)
     x_tf = constant_op.constant(x_np)
-    y_tf = nn_ops.log_softmax(x_tf)
+    y_tf = nn_ops.log_softmax_v2(x_tf)
     y_tf_np = self.evaluate(y_tf)
     eps = 1e-3
     self.assertAllClose(y_tf_np, y_np, eps)
 
   def testLogSoftmaxAxes(self):
     arr = np.linspace(0., 1, 12).reshape(3, 4)
-    x_neg_axis = nn_ops.log_softmax(arr, axis=-2)
-    y_pos_axis = nn_ops.log_softmax(arr, axis=0)
-    z_gt_axis = nn_ops.log_softmax(arr, axis=4)
+    x_neg_axis = nn_ops.log_softmax_v2(arr, axis=-2)
+    y_pos_axis = nn_ops.log_softmax_v2(arr, axis=0)
+    z_gt_axis = nn_ops.log_softmax_v2(arr, axis=0)
     x_neg_axis_tf = self.evaluate(x_neg_axis)
     y_pos_axis_tf = self.evaluate(y_pos_axis)
     z_gt_axis_tf = self.evaluate(z_gt_axis)
@@ -189,11 +214,12 @@ class LogSoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
 
   @parameterized.parameters(((5, 10),), ((2, 3, 4),))
+  @test_util.run_deprecated_v1
   def testGradient(self, x_shape):
     x_np = np.random.randn(*x_shape).astype(np.float64)
     with self.cached_session():
       x_tf = constant_op.constant(x_np)
-      y_tf = nn_ops.log_softmax(x_tf)
+      y_tf = nn_ops.log_softmax_v2(x_tf)
       err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                     x_shape)
     eps = 1e-7
@@ -211,6 +237,7 @@ class L2LossTest(test_lib.TestCase):
       value = self.evaluate(l2loss)
       self.assertAllClose(7.0, value)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [20, 7, 3]
     np.random.seed(1)  # Make it reproducible.
@@ -244,7 +271,7 @@ class L2NormalizeTest(test_lib.TestCase):
     for dim in range(len(x_shape)):
       y_np = self._l2Normalize(x_np, dim)
       x_tf = constant_op.constant(x_np, name="x")
-      y_tf = nn_impl.l2_normalize(x_tf, dim)
+      y_tf = nn_impl.l2_normalize_v2(x_tf, dim)
       self.assertAllClose(y_np, self.evaluate(y_tf))
 
   @test_util.run_in_graph_and_eager_modes
@@ -255,9 +282,10 @@ class L2NormalizeTest(test_lib.TestCase):
     dim = [1, 2]
     y_np = self._l2Normalize(x_np, dim)
     x_tf = constant_op.constant(x_np, name="x")
-    y_tf = nn_impl.l2_normalize(x_tf, dim)
+    y_tf = nn_impl.l2_normalize_v2(x_tf, dim)
     self.assertAllClose(y_np, self.evaluate(y_tf))
 
+  @test_util.run_deprecated_v1
   def testL2NormalizeGradient(self):
     x_shape = [20, 7, 3]
     np.random.seed(1)
@@ -265,7 +293,7 @@ class L2NormalizeTest(test_lib.TestCase):
     for dim in range(len(x_shape)):
       with self.cached_session():
         x_tf = constant_op.constant(x_np, name="x")
-        y_tf = nn_impl.l2_normalize(x_tf, dim)
+        y_tf = nn_impl.l2_normalize_v2(x_tf, dim)
         err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                       x_shape)
       print("L2Normalize gradient err = %g " % err)
@@ -282,19 +310,18 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 30
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        dropout = nn_ops.dropout(t, keep_prob)
-        final_count = 0
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          final_count += np.count_nonzero(value)
-          # Verifies that there are only two values: 0 and 1/keep_prob.
-          sorted_value = np.unique(np.sort(value))
-          self.assertEqual(0, sorted_value[0])
-          self.assertAllClose(1 / keep_prob, sorted_value[1])
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      dropout = nn_ops.dropout(t, keep_prob)
+      final_count = 0
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        final_count += np.count_nonzero(value)
+        # Verifies that there are only two values: 0 and 1/keep_prob.
+        sorted_value = np.unique(np.sort(value))
+        self.assertEqual(0, sorted_value[0])
+        self.assertAllClose(1 / keep_prob, sorted_value[1])
+
       # Check that we are in the 15% error range
       expected_count = x_dim * y_dim * keep_prob * num_iter
       rel_error = math.fabs(final_count - expected_count) / expected_count
@@ -310,19 +337,18 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 3
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        final_count = 0
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          final_count += np.count_nonzero(value)
-          # Verifies that there are only two values: 0 and 1/keep_prob.
-          sorted_value = np.unique(np.sort(value))
-          self.assertEqual(0, sorted_value[0])
-          self.assertAllClose(1 / keep_prob, sorted_value[1])
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      final_count = 0
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        final_count += np.count_nonzero(value)
+        # Verifies that there are only two values: 0 and 1/keep_prob.
+        sorted_value = np.unique(np.sort(value))
+        self.assertEqual(0, sorted_value[0])
+        self.assertAllClose(1 / keep_prob, sorted_value[1])
+
       # Check that we are in the 15% error range
       expected_count = x_dim * y_dim * keep_prob * num_iter
       rel_error = math.fabs(final_count - expected_count) / expected_count
@@ -335,18 +361,17 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 30
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          # Verifies that each y column as only one type of activation.
-          for i in xrange(x_dim):
-            sorted_value = np.unique(np.sort(value[i, :]))
-            self.assertEqual(sorted_value.size, 1)
-
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        # Verifies that each y column as only one type of activation.
+        for i in xrange(x_dim):
+          sorted_value = np.unique(np.sort(value[i, :]))
+          self.assertEqual(sorted_value.size, 1)
+
+  @test_util.run_deprecated_v1
   def testDropoutPlaceholderKeepProb(self):
     # Runs dropout with 0-1 tensor 10 times, sum the number of ones and validate
     # that it is producing approximately the right number of ones over a large
@@ -375,6 +400,7 @@ class DropoutTest(test_lib.TestCase):
       print(rel_error)
       self.assertTrue(rel_error < 0.15)
 
+  @test_util.run_deprecated_v1
   def testShapedDropoutUnknownShape(self):
     x_dim = 40
     y_dim = 30
@@ -389,26 +415,26 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 3
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        # Set noise_shape=[None, 1] which means [x_dim, 1].
-        dropout = nn_ops.dropout(t, keep_prob, noise_shape=[None, 1])
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        final_count = 0
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          final_count += np.count_nonzero(value)
-          # Verifies that there are only two values: 0 and 1/keep_prob.
-          sorted_value = np.unique(np.sort(value))
-          self.assertEqual(0, sorted_value[0])
-          self.assertAllClose(1 / keep_prob, sorted_value[1])
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      # Set noise_shape=[None, 1] which means [x_dim, 1].
+      dropout = nn_ops.dropout(t, keep_prob, noise_shape=[None, 1])
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      final_count = 0
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        final_count += np.count_nonzero(value)
+        # Verifies that there are only two values: 0 and 1/keep_prob.
+        sorted_value = np.unique(np.sort(value))
+        self.assertEqual(0, sorted_value[0])
+        self.assertAllClose(1 / keep_prob, sorted_value[1])
+
       # Check that we are in the 15% error range
       expected_count = x_dim * y_dim * keep_prob * num_iter
       rel_error = math.fabs(final_count - expected_count) / expected_count
       print(rel_error)
       self.assertTrue(rel_error < 0.15)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepProb(self):
     x_dim = 40
     y_dim = 30
@@ -424,6 +450,19 @@ class DropoutTest(test_lib.TestCase):
     with self.assertRaises(ValueError):
       nn_ops.dropout(t, array_ops.placeholder(dtypes.float32, shape=[2]))
 
+  @test_util.run_deprecated_v1
+  def testInvalidRate(self):
+    x_dim = 40
+    y_dim = 30
+    t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+    with self.assertRaises(ValueError):
+      nn_ops.dropout_v2(t, -1.0)
+    with self.assertRaises(ValueError):
+      nn_ops.dropout_v2(t, 1.1)
+    with self.assertRaises(ValueError):
+      nn_ops.dropout_v2(t, [0.0, 1.0])
+
+  @test_util.run_deprecated_v1
   def testShapedDropoutShapeError(self):
     # Runs shaped dropout and verifies an error is thrown on misshapen noise.
     x_dim = 40
@@ -446,9 +485,11 @@ class DropoutTest(test_lib.TestCase):
 
   def testNoDropoutFast(self):
     x = array_ops.zeros((5,))
-    for p in 1, constant_op.constant(1.0):
-      y = nn_ops.dropout(x, keep_prob=p)
-      self.assertTrue(x is y)
+    y = nn_ops.dropout(x, keep_prob=1)
+    self.assertTrue(x is y)
+
+    y = nn_ops.dropout_v2(x, rate=0)
+    self.assertTrue(x is y)
 
   def testDropoutWithIntegerInputs(self):
     x = constant_op.constant([1, 1, 1, 1, 1])
@@ -543,78 +584,78 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
           initializer=constant_op.constant(biases))
       with self.session(graph=g) as sess:
         variables.global_variables_initializer().run()
-        return sess.run([list(sharded_weights), list(sharded_biases)])
+        return self.evaluate([list(sharded_weights), list(sharded_biases)])
 
   def testShapes(self):
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=False)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_basic_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertEqual(exp_logits.shape, got_logits.shape, self._eps)
-        self.assertEqual(exp_labels.shape, got_labels.shape, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=False)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_basic_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertEqual(exp_logits.shape, got_logits.shape, self._eps)
+      self.assertEqual(exp_labels.shape, got_labels.shape, self._eps)
 
   def testBasic(self):
     """Without accidental hit removal or subtract_log_q."""
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=False)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_basic_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertAllClose(exp_logits, got_logits, self._eps)
-        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=False)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_basic_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertAllClose(exp_logits, got_logits, self._eps)
+      self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testAccidentalHitRemoval(self):
     """With accidental hit removal, no subtract_log_q."""
@@ -622,118 +663,118 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     num_classes = 5
     batch_size = 3
     sampled = [1, 0, 2, 3]
-    with self.cached_session():
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, _,
-         _) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=sampled,
-             subtract_log_q=False)
-        logits_tensor, _ = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=len(sampled),
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=True,
-            partition_strategy="div",
-            name="sampled_logits_accidental_hit_removal_num_true_%d" % num_true)
-        # Test that the exponentiated logits of accidental hits are near 0.
-        # First we need to find the hits in this random test run:
-        labels_reshape = labels.reshape((batch_size, num_true))
-        got_logits = logits_tensor.eval()
-        for row in xrange(batch_size):
-          row_labels = labels_reshape[row, :]
-          for col in xrange(len(sampled)):
-            if sampled[col] in row_labels:
-              # We need to add the num_true_test offset into logits_*
-              self.assertNear(
-                  np.exp(got_logits[row, col + num_true]), 0., self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, _,
+       _) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=sampled,
+           subtract_log_q=False)
+      logits_tensor, _ = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=len(sampled),
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=True,
+          partition_strategy="div",
+          name="sampled_logits_accidental_hit_removal_num_true_%d" % num_true)
+      # Test that the exponentiated logits of accidental hits are near 0.
+      # First we need to find the hits in this random test run:
+      labels_reshape = labels.reshape((batch_size, num_true))
+      got_logits = self.evaluate(logits_tensor)
+      for row in xrange(batch_size):
+        row_labels = labels_reshape[row, :]
+        for col in xrange(len(sampled)):
+          if sampled[col] in row_labels:
+            # We need to add the num_true_test offset into logits_*
+            self.assertNear(
+                np.exp(got_logits[row, col + num_true]), 0., self._eps)
 
   def testSubtractLogQ(self):
     """With subtract_log_q, no accidental hit removal."""
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=True)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=True,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_subtract_log_q_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertAllClose(exp_logits, got_logits, self._eps)
-        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=True)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=True,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_subtract_log_q_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertAllClose(exp_logits, got_logits, self._eps)
+      self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testSharded(self):
     """With sharded weights and sharded biases."""
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=False)
-        weight_shards, bias_shards = self._ShardTestEmbeddings(
-            weights, biases, num_shards=3)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=[constant_op.constant(shard) for shard in weight_shards],
-            biases=[constant_op.constant(shard) for shard in bias_shards],
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_sharded_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertAllClose(exp_logits, got_logits, self._eps)
-        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=False)
+      weight_shards, bias_shards = self._ShardTestEmbeddings(
+          weights, biases, num_shards=3)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=[constant_op.constant(shard) for shard in weight_shards],
+          biases=[constant_op.constant(shard) for shard in bias_shards],
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_sharded_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertAllClose(exp_logits, got_logits, self._eps)
+      self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testNCELoss(self):
     # A simple test to verify the numerics.
@@ -762,35 +803,32 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_nce_loss = np.sum(
         _SigmoidCrossEntropyWithLogits(exp_logits, exp_labels), 1)
 
-    with self.cached_session():
-      got_nce_loss = nn_impl.nce_loss(
-          weights=constant_op.constant(weights),
-          biases=constant_op.constant(biases),
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_nce_loss, got_nce_loss.eval(), 1e-4)
-
-      # Test with sharded weights and sharded biases.
-      weight_shards, bias_shards = self._ShardTestEmbeddings(
-          weights, biases, num_shards=3)
-      got_nce_loss = nn_impl.nce_loss(
-          weights=[constant_op.constant(shard) for shard in weight_shards],
-          biases=[constant_op.constant(shard) for shard in bias_shards],
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_nce_loss, got_nce_loss.eval(), 1e-4)
+    got_nce_loss = nn_impl.nce_loss_v2(
+        weights=constant_op.constant(weights),
+        biases=constant_op.constant(biases),
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals)
+
+    self.assertAllClose(exp_nce_loss, self.evaluate(got_nce_loss), 1e-4)
+
+    # Test with sharded weights and sharded biases.
+    weight_shards, bias_shards = self._ShardTestEmbeddings(
+        weights, biases, num_shards=3)
+    got_nce_loss = nn_impl.nce_loss_v2(
+        weights=[constant_op.constant(shard) for shard in weight_shards],
+        biases=[constant_op.constant(shard) for shard in bias_shards],
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals)
+
+    self.assertAllClose(exp_nce_loss, self.evaluate(got_nce_loss), 1e-4)
 
   def testSampledSoftmaxLoss(self):
     # A simple test to verify the numerics.
@@ -819,39 +857,36 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
         exp_logits, exp_labels)
 
-    with self.cached_session():
-      got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
-          weights=constant_op.constant(weights),
-          biases=constant_op.constant(biases),
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          remove_accidental_hits=False,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_sampled_softmax_loss,
-                          got_sampled_softmax_loss.eval(), 1e-4)
-
-      # Test with sharded weights and sharded biases.
-      weight_shards, bias_shards = self._ShardTestEmbeddings(
-          weights, biases, num_shards=3)
-      got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
-          weights=[constant_op.constant(shard) for shard in weight_shards],
-          biases=[constant_op.constant(shard) for shard in bias_shards],
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          remove_accidental_hits=False,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_sampled_softmax_loss,
-                          got_sampled_softmax_loss.eval(), 1e-4)
+    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss_v2(
+        weights=constant_op.constant(weights),
+        biases=constant_op.constant(biases),
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals,
+        remove_accidental_hits=False)
+
+    self.assertAllClose(exp_sampled_softmax_loss,
+                        self.evaluate(got_sampled_softmax_loss), 1e-4)
+
+    # Test with sharded weights and sharded biases.
+    weight_shards, bias_shards = self._ShardTestEmbeddings(
+        weights, biases, num_shards=3)
+    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss_v2(
+        weights=[constant_op.constant(shard) for shard in weight_shards],
+        biases=[constant_op.constant(shard) for shard in bias_shards],
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals,
+        remove_accidental_hits=False)
+
+    self.assertAllClose(exp_sampled_softmax_loss,
+                        self.evaluate(got_sampled_softmax_loss), 1e-4)
 
   def testSampledSoftmaxLossBf16(self):
     # A simple test to verify the numerics for bfloat16.
@@ -880,29 +915,29 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
         exp_logits, exp_labels)
 
-    with self.cached_session():
-      true_exp_bf16 = np.full(
-          [batch_size, 1], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
-      sampled_exp_bf16 = np.full(
-          [len(sampled)], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
-      sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
-
-      got_sampled_softmax_loss = math_ops.cast(
-          nn_impl.sampled_softmax_loss(
-              weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
-              biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
-              labels=constant_op.constant(
-                  labels, shape=(batch_size, 1), dtype=dtypes.bfloat16),
-              inputs=constant_op.constant(hidden_acts, dtype=dtypes.bfloat16),
-              num_sampled=4,
-              num_classes=num_classes,
-              num_true=1,
-              sampled_values=sampled_vals_bf16,
-              remove_accidental_hits=False,
-              partition_strategy="div"), dtypes.float32)
-
-      self.assertAllClose(exp_sampled_softmax_loss,
-                          got_sampled_softmax_loss.eval(), 1e-1)
+    true_exp_bf16 = np.full([batch_size, 1],
+                            fill_value=0.5,
+                            dtype=dtypes.bfloat16.as_numpy_dtype)
+    sampled_exp_bf16 = np.full([len(sampled)],
+                               fill_value=0.5,
+                               dtype=dtypes.bfloat16.as_numpy_dtype)
+    sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
+
+    got_sampled_softmax_loss = math_ops.cast(
+        nn_impl.sampled_softmax_loss_v2(
+            weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
+            biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
+            labels=constant_op.constant(
+                labels, shape=(batch_size, 1), dtype=dtypes.bfloat16),
+            inputs=constant_op.constant(hidden_acts, dtype=dtypes.bfloat16),
+            num_sampled=4,
+            num_classes=num_classes,
+            num_true=1,
+            sampled_values=sampled_vals_bf16,
+            remove_accidental_hits=False), dtypes.float32)
+
+    self.assertAllClose(exp_sampled_softmax_loss,
+                        self.evaluate(got_sampled_softmax_loss), 1e-1)
 
 
 class CReluTest(test_lib.TestCase):
@@ -911,9 +946,9 @@ class CReluTest(test_lib.TestCase):
     np.random.seed(1)  # Make it reproducible.
     x = np.random.randn(3, 4).astype(np.float32)
     y = np.concatenate([x * (x > 0), -x * (x < 0)], axis=1)
-    with self.cached_session():
-      z = nn_ops.crelu(constant_op.constant(x)).eval()
-      self.assertAllClose(y, z, 1e-4)
+
+    z = self.evaluate(nn_ops.crelu(constant_op.constant(x)))
+    self.assertAllClose(y, z, 1e-4)
 
 
 class ReluTest(test_lib.TestCase):
@@ -922,10 +957,11 @@ class ReluTest(test_lib.TestCase):
     np.random.seed(1)  # Make it reproducible.
     x = np.random.randn(3, 4).astype(np.float32)
     y = np.maximum(x, 0.0)
-    with self.cached_session():
-      z = nn_ops.relu(constant_op.constant(x)).eval()
-      self.assertAllEqual(y, z)
 
+    z = self.evaluate(nn_ops.relu(constant_op.constant(x)))
+    self.assertAllEqual(y, z)
+
+  @test_util.run_deprecated_v1
   def testNaNs(self):
     # Test that relu(nan) = nan for various sizes.
     for i in range(18):
@@ -947,22 +983,26 @@ class LeakyReluTest(test_lib.TestCase):
 
     outputs = nn_ops.leaky_relu(inputs)
     self.assertEquals(inputs.shape, outputs.shape)
-    with self.cached_session() as sess:
-      inputs, outputs = sess.run([inputs, outputs])
+
+    inputs, outputs = self.evaluate([inputs, outputs])
+
     self.assertGreaterEqual(outputs.min(), 0.0)
     self.assertLessEqual(outputs.max(), 1.0)
     self.assertAllClose(inputs, outputs)
 
+  @test_util.run_deprecated_v1
   def testValues(self):
     for dtype in [np.int32, np.int64, np.float16, np.float32, np.float64]:
       np_values = np.array([-2, -1, 0, 1, 2], dtype=dtype)
       outputs = nn_ops.leaky_relu(constant_op.constant(np_values))
-      with self.cached_session() as sess:
-        outputs = sess.run(outputs)
+
+      outputs = self.evaluate(outputs)
+
       tol = 2e-3 if dtype == np.float16 else 1e-6
       self.assertAllClose(
           outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testName(self):
     np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64)
     outputs_with_name_set = nn_ops.leaky_relu(
@@ -976,6 +1016,7 @@ class LeakyReluTest(test_lib.TestCase):
 
 class SwishTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testValues(self):
     np_values = np.array(
         [np.linspace(-10.0, 0.0, 100),
@@ -984,11 +1025,13 @@ class SwishTest(test_lib.TestCase):
     tf_values = constant_op.constant(np_values)
     actual_tf_outputs = nn_impl.swish(tf_values)
     expected_tf_outputs = tf_values * math_ops.sigmoid(tf_values)
-    with self.cached_session() as sess:
-      actual_outputs, expected_outputs = sess.run(
-          [actual_tf_outputs, expected_tf_outputs])
+
+    actual_outputs, expected_outputs = self.evaluate(
+        [actual_tf_outputs, expected_tf_outputs])
+
     self.assertAllClose(actual_outputs, expected_outputs)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     shape = [5, 3, 4]
     sigma = 5
@@ -1019,8 +1062,8 @@ class MomentsTest(test_lib.TestCase):
             with self.session(graph=g) as sess:
               inputs = constant_op.constant(
                   input_values, shape=input_shape, dtype=dtypes.float32)
-              mean, variance = nn_impl.moments(
-                  inputs, moments_axes, keep_dims=keep_dims)
+              mean, variance = nn_impl.moments_v2(
+                  inputs, moments_axes, keepdims=keep_dims)
 
               if check_gradients:
                 err = gradient_checker.compute_gradient_error(
@@ -1031,7 +1074,7 @@ class MomentsTest(test_lib.TestCase):
                 self.assertLess(err, 1e-3)
 
               # Evaluate.
-              [mean, variance] = sess.run([mean, variance])
+              [mean, variance] = self.evaluate([mean, variance])
               # Make sure that there are no NaNs
               self.assertFalse(np.isnan(mean).any())
               self.assertFalse(np.isnan(variance).any())
@@ -1074,9 +1117,9 @@ class DataFormatDimMapTest(test_lib.TestCase):
   def _test(self, x_val, y_val_expected):
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x)
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
-      self.assertAllEqual(y_val, y_val_expected)
+
+    y_val = self.evaluate(y)
+    self.assertAllEqual(y_val, y_val_expected)
 
   def test(self):
     self._test(0, 0)
@@ -1097,8 +1140,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [2, 2, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="NCHW")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
   def testNHWCtoHWNC(self):
@@ -1106,8 +1149,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [2, 0, 1, 3, 2, 0, 1, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="HWNC")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
   def testNHWCtoWHCN(self):
@@ -1115,8 +1158,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [3, 1, 0, 2, 3, 1, 0, 2]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="WHCN")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
   def testArbitraryASCII(self):
@@ -1124,8 +1167,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [3, 2, 1, 0, 3, 2, 1, 0]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="qwer", dst_format="rewq")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
 
@@ -1135,64 +1178,64 @@ class DataFormatVectorPermuteTest(test_lib.TestCase):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x)
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [7, 3, 4, 9])
 
   def testNCHWToNHWC(self):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [7, 9, 3, 4])
 
   def testNHWCToHWNC(self):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NHWC", dst_format="HWNC")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [4, 9, 7, 3])
 
   def testHWNCToNHWC(self):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="HWNC", dst_format="NHWC")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [9, 7, 4, 3])
 
   def testNHWCToNCHW2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x)
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[7, 4], [5, 1], [9, 3], [4, 5]])
 
   def testNHWCToHWNC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NHWC", dst_format="HWNC")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[9, 3], [4, 5], [7, 4], [5, 1]])
 
   def testHWNCToNHWC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="HWNC", dst_format="NHWC")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[4, 5], [7, 4], [9, 3], [5, 1]])
 
   def testNCHWToNHWC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
-    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[7, 4], [4, 5], [5, 1], [9, 3]])
 
 
diff --git a/tensorflow/python/ops/nn_xent_test.py b/tensorflow/python/ops/nn_xent_test.py
index 54a0e26bfb415dc16e5553caf0f40279a4f5e29d..3e5c198fc6a6658c7dcdc3bf3ead9df65db63607 100644
--- a/tensorflow/python/ops/nn_xent_test.py
+++ b/tensorflow/python/ops/nn_xent_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_impl
@@ -53,6 +54,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
     losses = np.array(self._SigmoidCrossEntropyWithLogits(x, y)).reshape(*sizes)
     return logits, targets, losses
 
+  @test_util.run_deprecated_v1
   def testConstructionNamed(self):
     with self.cached_session():
       logits, targets, _ = self._Inputs()
@@ -63,25 +65,26 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
   def testLogisticOutput(self):
     for use_gpu in [True, False]:
       for dtype in [dtypes.float32, dtypes.float16]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           logits, targets, losses = self._Inputs(dtype=dtype)
           loss = nn_impl.sigmoid_cross_entropy_with_logits(
               labels=targets, logits=logits)
           np_loss = np.array(losses).astype(np.float32)
-          tf_loss = loss.eval()
+          tf_loss = self.evaluate(loss)
         self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
   def testLogisticOutputMultiDim(self):
     for use_gpu in [True, False]:
       for dtype in [dtypes.float32, dtypes.float16]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           logits, targets, losses = self._Inputs(dtype=dtype, sizes=[2, 2, 2])
           loss = nn_impl.sigmoid_cross_entropy_with_logits(
               labels=targets, logits=logits)
           np_loss = np.array(losses).astype(np.float32)
-          tf_loss = loss.eval()
+          tf_loss = self.evaluate(loss)
         self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     sizes = [4, 2]
     with self.cached_session():
@@ -92,6 +95,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
     print("logistic loss gradient err = ", err)
     self.assertLess(err, 1e-7)
 
+  @test_util.run_deprecated_v1
   def testGradientAtZero(self):
     with self.cached_session():
       logits = constant_op.constant([0.0, 0.0], dtype=dtypes.float64)
@@ -129,6 +133,7 @@ class WeightedCrossEntropyTest(test.TestCase):
     losses = np.array(self._WeightedCrossEntropy(x, y, q)).reshape(*sizes)
     return logits, targets, q, losses
 
+  @test_util.run_deprecated_v1
   def testConstructionNamed(self):
     with self.cached_session():
       logits, targets, pos_weight, _ = self._Inputs()
@@ -138,25 +143,26 @@ class WeightedCrossEntropyTest(test.TestCase):
 
   def testOutput(self):
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         logits, targets, pos_weight, losses = self._Inputs(dtype=dtypes.float32)
         loss = nn_impl.weighted_cross_entropy_with_logits(
             targets=targets, logits=logits, pos_weight=pos_weight)
         np_loss = np.array(losses).astype(np.float32)
-        tf_loss = loss.eval()
+        tf_loss = self.evaluate(loss)
       self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
   def testOutputMultiDim(self):
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         logits, targets, pos_weight, losses = self._Inputs(
             dtype=dtypes.float32, sizes=[2, 2, 2])
         loss = nn_impl.weighted_cross_entropy_with_logits(
             targets=targets, logits=logits, pos_weight=pos_weight)
         np_loss = np.array(losses).astype(np.float32)
-        tf_loss = loss.eval()
+        tf_loss = self.evaluate(loss)
       self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     sizes = [4, 2]
     with self.cached_session():
diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py
index 002e87b4119e67ae0a183ceab4671b91d1d4186c..0ab39ad0a8edd60c78a6bea3ae31e4f025c9e0bd 100644
--- a/tensorflow/python/ops/numerics.py
+++ b/tensorflow/python/ops/numerics.py
@@ -28,7 +28,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("debugging.assert_all_finite", "verify_tensor_all_finite")
+@tf_export(v1=["debugging.assert_all_finite", "verify_tensor_all_finite"])
 @deprecation.deprecated_endpoints("verify_tensor_all_finite")
 def verify_tensor_all_finite(t, msg, name=None):
   """Assert that the tensor does not contain any NaN's or Inf's.
@@ -41,15 +41,30 @@ def verify_tensor_all_finite(t, msg, name=None):
   Returns:
     Same tensor as `t`.
   """
-  with ops.name_scope(name, "VerifyFinite", [t]) as name:
-    t = ops.convert_to_tensor(t, name="t")
-    with ops.colocate_with(t):
-      verify_input = array_ops.check_numerics(t, message=msg)
-      out = control_flow_ops.with_dependencies([verify_input], t)
+  return verify_tensor_all_finite_v2(t, msg, name)
+
+
+@tf_export("debugging.assert_all_finite", v1=[])
+def verify_tensor_all_finite_v2(x, message, name=None):
+  """Assert that the tensor does not contain any NaN's or Inf's.
+
+  Args:
+    x: Tensor to check.
+    message: Message to log on failure.
+    name: A name for this operation (optional).
+
+  Returns:
+    Same tensor as `x`.
+  """
+  with ops.name_scope(name, "VerifyFinite", [x]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    with ops.colocate_with(x):
+      verify_input = array_ops.check_numerics(x, message=message)
+      out = control_flow_ops.with_dependencies([verify_input], x)
   return out
 
 
-@tf_export("add_check_numerics_ops")
+@tf_export(v1=["add_check_numerics_ops"])
 def add_check_numerics_ops():
   """Connect a `check_numerics` to every floating point tensor.
 
diff --git a/tensorflow/python/ops/optional_grad.py b/tensorflow/python/ops/optional_grad.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d1eae3cda4bd9b6558313abc8abe7f4e815b816
--- /dev/null
+++ b/tensorflow/python/ops/optional_grad.py
@@ -0,0 +1,33 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gradient functions for optional ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_dataset_ops
+
+
+@ops.RegisterGradient("OptionalFromValue")
+def _OptionalFromValueGrad(op, grad):
+  return gen_dataset_ops.optional_get_value(
+      grad, [t.dtype for t in op.inputs], [t.shape for t in op.inputs])
+
+
+@ops.RegisterGradient("OptionalGetValue")
+def _OptionalGetValueGrad(unused_op, *grads):
+  return gen_dataset_ops.optional_from_value(grads)
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops.py b/tensorflow/python/ops/parallel_for/control_flow_ops.py
index ead7ae5478c74aad4f67296ed68895c1f54f7333..8f652e9c5097db318a77c3cec8c6597c6bb1d87c 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops.py
@@ -17,16 +17,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.parallel_for.pfor import PFor
 from tensorflow.python.util import nest
 
 
-def for_loop(loop_fn, loop_fn_dtypes, iters):
+def for_loop(loop_fn, loop_fn_dtypes, iters, parallel_iterations=None):
   """Runs `loop_fn` `iters` times and stacks the outputs.
 
 
@@ -39,6 +43,8 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
       objects. The shape of these outputs should not depend on the input.
     loop_fn_dtypes: dtypes for the outputs of loop_fn.
     iters: Number of iterations for which to run loop_fn.
+    parallel_iterations: The number of iterations that can be dispatched in
+      parallel. This knob can be used to control the total memory usage.
 
   Returns:
     Returns a nested structure of stacked output tensor objects with the same
@@ -66,11 +72,16 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
       outputs.append(ta)
     return tuple([i + 1] + outputs)
 
+  if parallel_iterations is not None:
+    extra_args = {"parallel_iterations": parallel_iterations}
+  else:
+    extra_args = {}
   ta_list = control_flow_ops.while_loop(
-      lambda i, *ta: i < iters, while_body, [0] + [
-          tensor_array_ops.TensorArray(dtype, iters)
-          for dtype in flat_loop_fn_dtypes
-      ])[1:]
+      lambda i, *ta: i < iters,
+      while_body,
+      [0] + [tensor_array_ops.TensorArray(dtype, iters)
+             for dtype in flat_loop_fn_dtypes],
+      **extra_args)[1:]
 
   # TODO(rachelim): enable this for sparse tensors
 
@@ -79,7 +90,15 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
   return nest.pack_sequence_as(loop_fn_dtypes, output)
 
 
-def pfor(loop_fn, iters):
+def _flatten_first_two_dims(x):
+  """Flattens the first two dimensions of x into a single dimension."""
+  old_shape = array_ops.shape(x)
+  new_shape = array_ops.concat([[old_shape[0] * old_shape[1]], old_shape[2:]],
+                               axis=0)
+  return array_ops.reshape(x, new_shape)
+
+
+def pfor(loop_fn, iters, parallel_iterations=None):
   """Equivalent to running `loop_fn` `iters` times and stacking the outputs.
 
   `pfor` has functionality similar to `for_loop`, i.e. running `loop_fn` `iters`
@@ -99,8 +118,8 @@ def pfor(loop_fn, iters):
       reads, etc).
     - Conversion works only on a limited set of kernels for which a converter
       has been registered.
-    - loop_fn cannot currently contain control flow operations like
-      tf.while_loop or tf.cond.
+    - loop_fn has limited support for control flow operations. tf.cond in
+      particular is not supported.
     - `loop_fn` should return nested structure of Tensors or Operations. However
       if an Operation is returned, it should have zero outputs.
     - The shape and dtype of `loop_fn` outputs should not depend on the input
@@ -109,22 +128,92 @@ def pfor(loop_fn, iters):
   Args:
     loop_fn: A function that takes an int32 scalar tf.Tensor object representing
       the iteration number, and returns a possibly nested structure of Tensor or
-      Operation objects.
+      Operation objects. Note that if setting `parallel_iterations` argument to
+      something other than None, `loop_fn` may be called more than once during
+      graph construction. So it may need to avoid mutating global state.
     iters: Number of iterations for which to run loop_fn.
+    parallel_iterations: A knob to control how many iterations are vectorized
+      and dispatched in parallel. The default value of None corresponds to
+      vectorizing all the iterations.  If `parallel_iterations` is smaller than
+      `iters`, then chunks of at most that many iterations are dispatched in
+      sequence. This knob can be used to control the total memory usage.
 
   Returns:
     Returns a nested structure of stacked tensor objects with the same nested
     structure as the output of `loop_fn`.
+  Raises:
+    ValueError: If parallel_iterations is not None and not an integer > 1.
   """
+  def f():
+    return _pfor_impl(loop_fn, iters, parallel_iterations=parallel_iterations)
+  if context.executing_eagerly():
+    f = function.defun(f)
+  return f()
+
+
+def _pfor_impl(loop_fn, iters, parallel_iterations=None):
+  """Implementation of pfor."""
   existing_ops = set(ops.get_default_graph().get_operations())
   with ops.name_scope("loop_body"):
     loop_var = array_ops.placeholder(dtypes.int32, shape=[])
     loop_fn_outputs = loop_fn(loop_var)
   new_ops = set(ops.get_default_graph().get_operations()) - existing_ops
   iters = ops.convert_to_tensor(iters)
-  with ops.name_scope("pfor"):
-    converter = PFor(loop_var, iters, new_ops)
-    outputs = []
-    for loop_fn_output in nest.flatten(loop_fn_outputs):
-      outputs.append(converter.convert(loop_fn_output))
-    return nest.pack_sequence_as(loop_fn_outputs, outputs)
+  if parallel_iterations is not None:
+    if parallel_iterations < 1:
+      raise ValueError("parallel_iterations must be None or a positive integer")
+    if parallel_iterations == 1:
+      raise ValueError("Found parallel_iterations == 1. Use for_loop instead.")
+    iters_value = tensor_util.constant_value(iters)
+    if iters_value is not None and iters_value < parallel_iterations:
+      parallel_iterations = None
+  if parallel_iterations is None:
+    with ops.name_scope("pfor"):
+      converter = PFor(loop_var, iters, new_ops)
+      outputs = []
+      for loop_fn_output in nest.flatten(loop_fn_outputs):
+        outputs.append(converter.convert(loop_fn_output))
+      return nest.pack_sequence_as(loop_fn_outputs, outputs)
+  else:
+    num_tiled_iterations = iters // parallel_iterations
+    num_remaining_iterations = iters % parallel_iterations
+    # TODO(agarwal): Avoid calling loop_fn twice. Generate the loop body inside
+    # a tf.function and extract the graph from there to vectorize it.
+    with ops.name_scope("pfor_untiled"):
+      converter = PFor(loop_var, num_remaining_iterations, new_ops)
+      remaining_outputs = []
+      flattened_loop_fn_outputs = nest.flatten(loop_fn_outputs)
+      for loop_fn_output in flattened_loop_fn_outputs:
+        remaining_outputs.append(converter.convert(loop_fn_output))
+
+    with ops.name_scope("pfor_tiled"):
+      loop_fn_dtypes = [ops.convert_to_tensor(x).dtype
+                        for x in flattened_loop_fn_outputs]
+
+      def tiled_loop_body(j):
+        offset = j * parallel_iterations + num_remaining_iterations
+
+        def tiled_loop_fn(i):
+          return nest.flatten(loop_fn(i + offset))
+
+        return pfor(tiled_loop_fn, parallel_iterations)
+
+      tiled_outputs = for_loop(tiled_loop_body, loop_fn_dtypes,
+                               num_tiled_iterations, parallel_iterations=1)
+      tiled_outputs = [_flatten_first_two_dims(y) for y in tiled_outputs]
+
+    with ops.name_scope("pfor"):
+      iters_value = tensor_util.constant_value(iters)
+      if iters_value is None or iters_value % parallel_iterations:
+        outputs = control_flow_ops.cond(
+            math_ops.equal(num_remaining_iterations, 0),
+            lambda: tiled_outputs,
+            lambda: [array_ops.concat([x, y], axis=0)
+                     for x, y in zip(remaining_outputs, tiled_outputs)])
+      else:
+        outputs = tiled_outputs
+      return nest.pack_sequence_as(loop_fn_outputs, nest.flatten(outputs))
+
+
+
+
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
index 6e276dee55dc477a75bb9ab3457e2f5c827c5cb1..933bddd8ccaa830a394c8d69e4f1b33311315c99 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
@@ -26,15 +26,18 @@ import numpy as np
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import bitwise_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients as gradient_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
@@ -51,6 +54,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import nest
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class PForTest(test.TestCase):
 
   def _run_targets(self, targets1, targets2=None, run_init=True):
@@ -72,9 +76,13 @@ class PForTest(test.TestCase):
       else:
         self.assertAllEqual(outputs[i + n], outputs[i])
 
-  def _test_loop_fn(self, loop_fn, iters, loop_fn_dtypes=dtypes.float32):
-    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters)
-    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters)
+  def _test_loop_fn(self, loop_fn, iters,
+                    loop_fn_dtypes=dtypes.float32,
+                    parallel_iterations=None):
+    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters,
+                                    parallel_iterations=parallel_iterations)
+    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters,
+                                        parallel_iterations=parallel_iterations)
     self.run_and_assert_equal(t1, t2)
 
   def test_op_conversion_fallback_to_while_loop(self):
@@ -95,7 +103,32 @@ class PForTest(test.TestCase):
         loop_fn, 3, loop_fn_dtypes=[dtypes.float32, dtypes.int32])
     flags.FLAGS.op_conversion_fallback_to_while_loop = False
 
+  def test_parallel_iterations(self):
+    for parallel_iterations in [2, 3, 8, 10]:
+      x = random_ops.random_uniform([8, 3])
 
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        return array_ops.gather(x, i)
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 8, parallel_iterations=parallel_iterations)
+      self._test_loop_fn(loop_fn, 4 * constant_op.constant(2),
+                         parallel_iterations=parallel_iterations)
+
+  def test_parallel_iterations_zero(self):
+    with self.assertRaisesRegexp(ValueError, "positive integer"):
+      pfor_control_flow_ops.pfor(lambda i: 1, 8, parallel_iterations=0)
+    with self.assertRaisesRegexp(TypeError, "positive integer"):
+      pfor_control_flow_ops.for_loop(lambda i: 1, dtypes.int32, 8,
+                                     parallel_iterations=0)
+
+  def test_parallel_iterations_one(self):
+    with self.assertRaisesRegexp(ValueError, "Use for_loop instead"):
+      pfor_control_flow_ops.pfor(lambda i: 1, 8, parallel_iterations=1)
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class ArrayTest(PForTest):
 
   def test_gather(self):
@@ -244,6 +277,16 @@ class ArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 5)
 
+  def test_split_v(self):
+    x = random_ops.random_uniform([3, 6, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return (array_ops.split(x1, [2, 1, 3], axis=0),
+              array_ops.split(x1, [3], axis=-1))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 4)
+
   def test_transpose(self):
     x = random_ops.random_uniform([3, 2, 3, 4])
 
@@ -277,31 +320,54 @@ class ArrayTest(PForTest):
 
   def test_unary_cwise_ops(self):
     for op in [array_ops.identity, array_ops.stop_gradient]:
-      x = random_ops.random_uniform([3, 5])
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
 
       # pylint: disable=cell-var-from-loop
       def loop_fn(i):
-        x1 = array_ops.gather(x, i)
-        y = op(x1) + x1
-        loss = nn.l2_loss(y)
-        return op(x), y, gradient_ops.gradients(loss, x1)
+        with g:
+          x1 = array_ops.gather(x, i)
+          y = op(x1) + x1
+          loss = nn.l2_loss(y)
+        return op(x), y, g.gradient(loss, x1)
 
       # pylint: enable=cell-var-from-loop
 
       self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
 
+  def test_identity_n(self):
+    x = random_ops.random_uniform([3, 4])
+
+    def loop_fn(i):
+      return array_ops.identity_n([x, array_ops.gather(x, i)])
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_matrix_diag_part(self):
+    x = random_ops.random_uniform([3, 4, 2])
+
+    def loop_fn(i):
+      return array_ops.matrix_diag_part(array_ops.gather(x, i))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
+
   def test_strided_slice(self):
-    x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
+      g.watch(x)
 
     def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
-      loss = nn.l2_loss(y)
-      return y, gradient_ops.gradients(loss, x_i)
+      with g:
+        x_i = array_ops.gather(x, i)
+        y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
+        loss = nn.l2_loss(y)
+      return y, g.gradient(loss, x_i)
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BitwiseTest(PForTest):
 
   def test_unary_cwise(self):
@@ -341,6 +407,7 @@ class BitwiseTest(PForTest):
       self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MathTest(PForTest):
 
   def test_unary_cwise_ops(self):
@@ -397,22 +464,29 @@ class MathTest(PForTest):
         nn.softsign,
     ]
     for op in complex_ops + real_ops:
-      x = random_ops.random_uniform([3, 5])
-      if op in complex_ops:
-        y = random_ops.random_uniform([3, 5])
-        x = math_ops.complex(x, y)
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
+        if op in complex_ops:
+          y = random_ops.random_uniform([3, 5])
+          g.watch(y)
+          x = math_ops.complex(x, y)
 
       # pylint: disable=cell-var-from-loop
       output_dtypes = []
       def loop_fn(i):
-        x1 = array_ops.gather(x, i)
-        y1 = op(x1)
-        outputs = [op(x), y1]
-        if y1.dtype == dtypes.float32:
-          loss = math_ops.reduce_sum(y1 * y1)
-          grad = gradient_ops.gradients(loss, x1)
-          if grad and grad[0] is not None:
-            outputs.extend(grad)
+        with g:
+          x1 = array_ops.gather(x, i)
+          y1 = op(x1)
+          outputs = [op(x), y1]
+          if y1.dtype == dtypes.float32:
+            loss = math_ops.reduce_sum(y1 * y1)
+          else:
+            loss = None
+        if loss is not None:
+          grad = g.gradient(loss, x1)
+          if grad is not None:
+            outputs.append(grad)
         del output_dtypes[:]
         output_dtypes.extend([t.dtype for t in outputs])
         return outputs
@@ -629,17 +703,19 @@ class MathTest(PForTest):
     x_shape = [2, 3, 4, 5, 6]
     x = random_ops.random_uniform(x_shape)
     for data_format in ("NCHW", "NHWC"):
-      bias_dim = 2 if data_format == "NCHW" else -1
-      bias_shape = x_shape[bias_dim]
-      bias = random_ops.random_uniform([bias_shape])
+      with backprop.GradientTape(persistent=True) as g:
+        bias_dim = 2 if data_format == "NCHW" else -1
+        bias_shape = x_shape[bias_dim]
+        bias = random_ops.random_uniform([bias_shape])
+        g.watch(bias)
 
       # pylint: disable=cell-var-from-loop
       def loop_fn(i):
-        a = array_ops.gather(x, i)
-        y = nn.bias_add(a, bias, data_format=data_format)
-        loss = math_ops.reduce_sum(y * y)
-        return y, gradient_ops.gradients(loss, bias)
-
+        with g:
+          a = array_ops.gather(x, i)
+          y = nn.bias_add(a, bias, data_format=data_format)
+          loss = math_ops.reduce_sum(y * y)
+        return y, g.gradient(loss, bias)
       # pylint: enable=cell-var-from-loop
 
       self._test_loop_fn(
@@ -700,6 +776,7 @@ class MathTest(PForTest):
       self._test_loop_fn(loop_fn, 2)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class NNTest(PForTest):
 
   def test_conv2d(self):
@@ -752,30 +829,63 @@ class NNTest(PForTest):
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
   def test_avg_pool(self):
-    x = random_ops.random_uniform([3, 2, 12, 12, 3])
-    ksize = [1, 3, 3, 1]
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 2, 12, 12, 3])
+      g.watch(x)
+      ksize = [1, 3, 3, 1]
 
     def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      output = nn.avg_pool(
-          x1, ksize, strides=[1, 2, 2, 1], padding="VALID", data_format="NHWC")
-      loss = nn.l2_loss(output)
-      return output, gradient_ops.gradients(loss, x1)
+      with g:
+        x1 = array_ops.gather(x, i)
+        output = nn.avg_pool(
+            x1, ksize, strides=[1, 2, 2, 1], padding="VALID",
+            data_format="NHWC")
+        loss = nn.l2_loss(output)
+      return output, g.gradient(loss, x1)
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
   def test_max_pool(self):
-    x = random_ops.random_uniform([3, 2, 12, 12, 3])
-    ksize = [1, 3, 3, 1]
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 2, 12, 12, 3])
+      g.watch(x)
+      ksize = [1, 3, 3, 1]
+      strides = [1, 2, 2, 1]
 
     def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      output = nn.max_pool(
-          x1, ksize, strides=[1, 2, 2, 1], padding="VALID", data_format="NHWC")
-      loss = nn.l2_loss(output)
-      return output, gradient_ops.gradients(loss, x1)
+      with g:
+        x1 = array_ops.gather(x, i)
+        output = nn.max_pool(
+            x1, ksize, strides=strides, padding="VALID", data_format="NHWC")
+        loss = nn.l2_loss(output)
+        ones = array_ops.ones_like(output)
+        g.watch(ones)
+        grad = g.gradient(loss, x1, output_gradients=ones)
+      grad_grad = g.gradient(grad, ones)
+      return output, grad, grad_grad
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
+
+  def test_max_pool3d(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 3, 2, 12, 12, 3])
+      g.watch(x)
+      ksize = [1, 1, 3, 3, 1]
+      strides = [1, 1, 2, 2, 1]
 
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+    def loop_fn(i):
+      with g:
+        x1 = array_ops.gather(x, i)
+        output = nn.max_pool3d(
+            x1, ksize, strides=strides, padding="VALID", data_format="NDHWC")
+        loss = nn.l2_loss(output)
+        ones = array_ops.ones_like(output)
+        g.watch(ones)
+        grad = g.gradient(loss, x1, output_gradients=ones)
+      grad_grad = g.gradient(grad, ones)
+      return output, grad, grad_grad
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
 
   def test_fused_batch_norm(self):
     data_formats = ["NHWC"]
@@ -783,36 +893,44 @@ class NNTest(PForTest):
       data_formats.append("NCHW")
     for is_training in (True, False):
       for data_format in data_formats:
-        if data_format == "NCHW":
-          x = random_ops.random_uniform([3, 1, 2, 5, 5])
-        else:
-          x = random_ops.random_uniform([3, 1, 5, 5, 2])
-        scale = random_ops.random_uniform([2])
-        offset = random_ops.random_uniform([2])
-        mean = None if is_training else random_ops.random_uniform([2])
-        variance = None if is_training else random_ops.random_uniform([2])
+        with backprop.GradientTape(persistent=True) as g:
+          if data_format == "NCHW":
+            x = random_ops.random_uniform([3, 1, 2, 5, 5])
+          else:
+            x = random_ops.random_uniform([3, 1, 5, 5, 2])
+          g.watch(x)
+          scale = random_ops.random_uniform([2])
+          g.watch(scale)
+          offset = random_ops.random_uniform([2])
+          g.watch(offset)
+          mean = None if is_training else random_ops.random_uniform([2])
+          variance = None if is_training else random_ops.random_uniform([2])
 
         # pylint: disable=cell-var-from-loop
         def loop_fn(i):
-          x1 = array_ops.gather(x, i)
-          outputs = nn.fused_batch_norm(
-              x1,
-              scale,
-              offset,
-              mean=mean,
-              variance=variance,
-              epsilon=0.01,
-              data_format=data_format,
-              is_training=is_training)
-          outputs = list(outputs)
-          # We only test the first value of outputs when is_training is False.
-          # It looks like CPU and GPU have different outputs for batch_mean and
-          # batch_variance for this case.
-          if not is_training:
-            outputs[1] = constant_op.constant(0.)
-            outputs[2] = constant_op.constant(0.)
-          loss = nn.l2_loss(outputs[0])
-          gradients = gradient_ops.gradients(loss, [x1, scale, offset])
+          with g:
+            x1 = array_ops.gather(x, i)
+            outputs = nn.fused_batch_norm(
+                x1,
+                scale,
+                offset,
+                mean=mean,
+                variance=variance,
+                epsilon=0.01,
+                data_format=data_format,
+                is_training=is_training)
+            outputs = list(outputs)
+            # We only test the first value of outputs when is_training is False.
+            # It looks like CPU and GPU have different outputs for batch_mean
+            # and batch_variance for this case.
+            if not is_training:
+              outputs[1] = constant_op.constant(0.)
+              outputs[2] = constant_op.constant(0.)
+            loss = nn.l2_loss(outputs[0])
+          if is_training:
+            gradients = g.gradient(loss, [x1, scale, offset])
+          else:
+            gradients = [constant_op.constant(0.)] * 3
           return outputs + gradients
 
         # pylint: enable=cell-var-from-loop
@@ -820,16 +938,20 @@ class NNTest(PForTest):
         self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 6)
 
   def test_softmax_cross_entropy_with_logits(self):
-    logits = random_ops.random_uniform([3, 2, 4])
-    labels = random_ops.random_uniform([3, 2, 4])
-    labels /= math_ops.reduce_sum(labels, axis=[2], keepdims=True)
+    with backprop.GradientTape(persistent=True) as g:
+      logits = random_ops.random_uniform([3, 2, 4])
+      g.watch(logits)
+      labels = random_ops.random_uniform([3, 2, 4])
+      labels /= math_ops.reduce_sum(labels, axis=[2], keepdims=True)
 
     def loop_fn(i):
-      logits_i = array_ops.gather(logits, i)
-      labels_i = array_ops.gather(labels, i)
-      loss = nn.softmax_cross_entropy_with_logits(
-          labels=labels_i, logits=logits_i)
-      return loss, gradient_ops.gradients(math_ops.reduce_sum(loss), logits_i)
+      with g:
+        logits_i = array_ops.gather(logits, i)
+        labels_i = array_ops.gather(labels, i)
+        loss = nn.softmax_cross_entropy_with_logits(
+            labels=labels_i, logits=logits_i)
+        total_loss = math_ops.reduce_sum(loss)
+      return loss, g.gradient(total_loss, logits_i)
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
@@ -1248,13 +1370,12 @@ class ControlFlowTest(PForTest):
     pfor_out, pfor_out_grad = pfor_control_flow_ops.pfor(loop_fn, 4)
     # Note that tf.while_loop does not work in the setup above. So we manually
     # construct the equivalent computation of the above loops here.
-    real_out = math_ops.reduce_sum(inp, reduction_indices=[0])
-    real_out = math_ops.reduce_prod(real_out, reduction_indices=[1])
+    real_out = math_ops.reduce_sum(inp, axis=[0])
+    real_out = math_ops.reduce_prod(real_out, axis=[1])
     # Note that gradients of real_out will accumulate the gradients across the
     # output value. Hence we do the same aggregation on pfor_out_grad.
     real_out_grad = gradient_ops.gradients(real_out, inp)[0]
-    sum_pfor_out_grad = math_ops.reduce_sum(
-        pfor_out_grad, reduction_indices=[0])
+    sum_pfor_out_grad = math_ops.reduce_sum(pfor_out_grad, axis=[0])
 
     with session.Session() as sess:
       v1, v2, v1_grad, v2_grad = sess.run(
@@ -1358,15 +1479,78 @@ class Benchmarks(test.Benchmark):
     with sess:
       init = variables.global_variables_initializer()
       sess.run(init)
-      sess.run(targets)
+      run_fn = sess.make_callable(targets)
+      run_fn()  # Warm up
       begin = time.time()
       for _ in range(iters):
-        sess.run(targets)
+        run_fn()
       end = time.time()
     avg_time_ms = 1000 * (end - begin) / iters
     self.report_benchmark(iters=iters, wall_time=avg_time_ms, name=name)
     return avg_time_ms
 
+  def benchmark_sess_run_overhead(self):
+    with ops.Graph().as_default():
+      x = constant_op.constant(1.0)
+      self._run(x, 10000, name="session_run_overhead")
+
+  def benchmark_add(self):
+    with ops.Graph().as_default():
+      n = 256
+      params = 1000
+      x = random_ops.random_normal([n, params])
+      y = random_ops.random_normal([n, params])
+
+      def loop_fn(i):
+        x_i = array_ops.gather(x, i)
+        y_i = array_ops.gather(y, i)
+        return x_i + y_i
+
+      pfor_outputs = pfor_control_flow_ops.pfor(loop_fn, n)
+      while_outputs = pfor_control_flow_ops.for_loop(loop_fn, dtypes.float32, n)
+      manual = x + y
+
+      self._run(manual, 1000, name="manual_add")
+      self._run(pfor_outputs, 1000, name="pfor_add")
+      self._run(while_outputs, 100, name="while_add")
+
+  def benchmark_matmul(self):
+    with ops.Graph().as_default():
+      n = 1024
+      params = 1000
+      x = random_ops.random_normal([n, params])
+      y = random_ops.random_normal([params, params])
+
+      def loop_fn(i):
+        x_i = array_ops.expand_dims(array_ops.gather(x, i), 0)
+        return math_ops.matmul(x_i, y)
+
+      pfor_outputs = pfor_control_flow_ops.pfor(loop_fn, n)
+      while_outputs = pfor_control_flow_ops.for_loop(loop_fn, dtypes.float32, n)
+      manual = math_ops.matmul(x, y)
+
+      self._run(manual, 1000, name="manual_matmul")
+      self._run(pfor_outputs, 1000, name="pfor_matmul")
+      self._run(while_outputs, 100, name="while_matmul")
+
+  def benchmark_map_fn(self):
+    with ops.Graph().as_default():
+      b = 256
+      params = 1000
+      inp = random_ops.random_normal((b, params))
+      map_fn = lambda x: x * x
+
+      def pfor_map_fn(f, x):
+        return pfor_control_flow_ops.pfor(
+            lambda i: f(array_ops.gather(x, i)),
+            array_ops.shape(x)[0])
+
+      map_output = functional_ops.map_fn(map_fn, inp)
+      pfor_output = pfor_map_fn(map_fn, inp)
+
+      self._run(map_output, 100, name="tf_map_fn")
+      self._run(pfor_output, 100, name="pfor_map_fn")
+
   def benchmark_basic_while(self):
     with ops.Graph().as_default():
 
@@ -1391,13 +1575,6 @@ class Benchmarks(test.Benchmark):
       self._run(pfor_outputs, 100, name="pfor_rnn")
       self._run(tf_outputs, 100, name="tf_rnn")
 
-  def benchmark_dynamic_lstm(self):
-    with ops.Graph().as_default():
-      pfor_outputs, tf_outputs = create_dynamic_lstm(rnn_cell.BasicLSTMCell,
-                                                     128, 512, 16)
-      self._run(pfor_outputs, 100, name="pfor_lstm")
-      self._run(tf_outputs, 100, name="tf_lstm")
-
 
 class SparseTest(PForTest):
 
diff --git a/tensorflow/python/ops/parallel_for/gradients.py b/tensorflow/python/ops/parallel_for/gradients.py
index 1f026b3660c39066b3a8cf741b0fbd1929b22665..3ba1bde347698acf3b1229808fe63cef2e3255af 100644
--- a/tensorflow/python/ops/parallel_for/gradients.py
+++ b/tensorflow/python/ops/parallel_for/gradients.py
@@ -25,7 +25,7 @@ from tensorflow.python.ops.parallel_for import control_flow_ops
 from tensorflow.python.util import nest
 
 
-def jacobian(output, inputs, use_pfor=True):
+def jacobian(output, inputs, use_pfor=True, parallel_iterations=None):
   """Computes jacobian of `output` w.r.t. `inputs`.
 
   Args:
@@ -33,6 +33,8 @@ def jacobian(output, inputs, use_pfor=True):
     inputs: A tensor or a nested structure of tensor objects.
     use_pfor: If true, uses pfor for computing the jacobian. Else uses
       tf.while_loop.
+    parallel_iterations: A knob to control how many iterations and dispatched in
+      parallel. This knob can be used to control the total memory usage.
 
   Returns:
     A tensor or a nested strucutre of tensors with the same structure as
@@ -56,10 +58,14 @@ def jacobian(output, inputs, use_pfor=True):
     output_size = array_ops.shape(output)[0]
 
   if use_pfor:
-    pfor_outputs = control_flow_ops.pfor(loop_fn, output_size)
+    pfor_outputs = control_flow_ops.pfor(
+        loop_fn, output_size, parallel_iterations=parallel_iterations)
   else:
     pfor_outputs = control_flow_ops.for_loop(
-        loop_fn, [output.dtype] * len(flat_inputs), output_size)
+        loop_fn,
+        [output.dtype] * len(flat_inputs),
+        output_size,
+        parallel_iterations=parallel_iterations)
 
   for i, out in enumerate(pfor_outputs):
     if out is not None:
@@ -72,7 +78,7 @@ def jacobian(output, inputs, use_pfor=True):
   return nest.pack_sequence_as(inputs, pfor_outputs)
 
 
-def batch_jacobian(output, inp, use_pfor=True):
+def batch_jacobian(output, inp, use_pfor=True, parallel_iterations=None):
   """Computes and stacks jacobians of `output[i,...]` w.r.t. `input[i,...]`.
 
   e.g.
@@ -87,6 +93,8 @@ def batch_jacobian(output, inp, use_pfor=True):
     inp: A tensor with shape [b, x1, ..., x_m]
     use_pfor: If true, uses pfor for computing the Jacobian. Else uses a
       tf.while_loop.
+    parallel_iterations: A knob to control how many iterations and dispatched in
+      parallel. This knob can be used to control the total memory usage.
 
   Returns:
     A tensor `t` with shape [b, y_1, ..., y_n, x1, ..., x_m] where `t[i, ...]`
@@ -118,10 +126,13 @@ def batch_jacobian(output, inp, use_pfor=True):
     return gradient_ops.gradients(y, inp)[0]
 
   if use_pfor:
-    pfor_output = control_flow_ops.pfor(loop_fn, output_row_size)
+    pfor_output = control_flow_ops.pfor(loop_fn, output_row_size,
+                                        parallel_iterations=parallel_iterations)
   else:
-    pfor_output = control_flow_ops.for_loop(loop_fn, output.dtype,
-                                            output_row_size)
+    pfor_output = control_flow_ops.for_loop(
+        loop_fn, output.dtype,
+        output_row_size,
+        parallel_iterations=parallel_iterations)
   if pfor_output is None:
     return None
   pfor_output = array_ops.reshape(pfor_output,
diff --git a/tensorflow/python/ops/parallel_for/gradients_test.py b/tensorflow/python/ops/parallel_for/gradients_test.py
index 5467f55af68053f59718e61af6c9a9dde6000f2a..4342833e3eb362e81ff9f60b4649cc5b8de6250f 100644
--- a/tensorflow/python/ops/parallel_for/gradients_test.py
+++ b/tensorflow/python/ops/parallel_for/gradients_test.py
@@ -258,6 +258,22 @@ class Mnist(keras_training.Model):
     return self.fc2(y)
 
 
+def create_mnist_autobatch(batch_size, data_format, training):
+  images = random_ops.random_uniform([batch_size, 28, 28])
+  model = Mnist(data_format)
+  manual = model(images, training=training)
+
+  def loop_fn(i):
+    image = array_ops.gather(images, i)
+    return model(image, training=training)
+
+  pfor_outputs = control_flow_ops.pfor(loop_fn, batch_size)
+  while_outputs = control_flow_ops.for_loop(
+      loop_fn, dtypes.float32, batch_size)
+
+  return pfor_outputs, while_outputs, manual
+
+
 def create_mnist_per_eg_grad(batch_size, data_format, training):
   images = random_ops.random_uniform([batch_size, 28, 28])
   sparse_labels = np.random.randint(
@@ -400,6 +416,12 @@ class GradientsTest(test.TestCase):
       self.assertAllClose(ans, pfor_value)
       self.assertAllClose(ans, while_value)
 
+  def test_jacobian_parallel_iterations(self):
+    x = constant_op.constant([[1., 2], [3, 4]])
+    y = math_ops.matmul(x, x)
+    self.assertAllClose(gradients.jacobian(y, x, parallel_iterations=2),
+                        gradients.jacobian(y, x, parallel_iterations=3))
+
   def test_batch_jacobian_bad_shapes(self):
     x = random_ops.random_uniform([2, 2])
     y = random_ops.random_uniform([3, 2])
@@ -443,6 +465,13 @@ class GradientsTest(test.TestCase):
       self.assertAllClose(ans, pfor_value)
       self.assertAllClose(ans, while_value)
 
+  def test_batch_jacobian_parallel_iterations(self):
+    x = constant_op.constant([[1., 2], [3, 4]])
+    w = constant_op.constant([[1., 2, 3, 4], [5, 6, 7, 8]])
+    y = math_ops.matmul(x, w)
+    self.assertAllClose(gradients.batch_jacobian(y, x, parallel_iterations=2),
+                        gradients.batch_jacobian(y, x, parallel_iterations=3))
+
   def test_fc_batch_jacobian(self):
     pfor_jacobian, while_jacobian = create_fc_batch_jacobian(8, 4, 2)
     self.run_and_assert_equal(pfor_jacobian, while_jacobian)
@@ -455,8 +484,8 @@ class GradientsTest(test.TestCase):
     pfor_jacobian, while_gradients = create_dynamic_lstm_batch_jacobian(8, 4, 3)
     with session.Session() as sess:
       init = variables.global_variables_initializer()
-      sess.run(init)
-      pfor = sess.run(pfor_jacobian)
+      self.evaluate(init)
+      pfor = self.evaluate(pfor_jacobian)
       for i in range(4):
         while_i = sess.run(while_gradients[i])
         self.assertAllClose(while_i, pfor[:, i, ...])
@@ -531,11 +560,11 @@ class GradientsBenchmarks(test.Benchmark):
     sess = session.Session()
     with sess:
       init = variables.global_variables_initializer()
-      sess.run(init)
-      sess.run(targets)
+      self.evaluate(init)
+      self.evaluate(targets)
       begin = time.time()
       for _ in range(iters):
-        sess.run(targets)
+        self.evaluate(targets)
       end = time.time()
     avg_time_ms = 1000 * (end - begin) / iters
     self.report_benchmark(iters=iters, wall_time=avg_time_ms, name=name)
@@ -577,6 +606,16 @@ class GradientsBenchmarks(test.Benchmark):
       self._run(pfor_outputs, 100, name="lstm_per_eg_grad_pfor")
       self._run(while_outputs, 20, name="lstm_per_eg_grad_while")
 
+  def benchmark_mnist_autobatch(self):
+    with ops.Graph().as_default():
+      data_format = ("channels_first"
+                     if test.is_gpu_available() else "channels_last")
+      pfor_outputs, while_outputs, manual = create_mnist_autobatch(
+          100, data_format, training=False)
+      self._run(pfor_outputs, 100, name="mnist_pfor")
+      self._run(while_outputs, 20, name="mnist_while")
+      self._run(manual, 100, name="mnist_manual")
+
   def benchmark_mnist_per_eg_grad(self):
     with ops.Graph().as_default():
       data_format = ("channels_first"
diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index 83cbe64ff21d6fa4380ddc9effb18b80feb5536b..a22c1126c93915da7acc5221594567f855557b84 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -1033,7 +1033,7 @@ class PFor(object):
         *[self._unwrap_or_tile(w) for w in outputs])
 
   def _restack_sparse_tensor_logically(self, indices, values, shape):
-    sparse_tensor_rank = indices.get_shape()[-1].value
+    sparse_tensor_rank = indices.get_shape().dims[-1].value
     if sparse_tensor_rank is not None:
       sparse_tensor_rank += 1
 
@@ -1152,9 +1152,8 @@ class PFor(object):
           continue
 
         converted_inputs = [self._conversion_map[inp] for inp in y_op.inputs]
-        some_input_converted = any(
-            [self._was_converted(x) for x in y_op.inputs])
-        some_input_stacked = any([x.is_stacked for x in converted_inputs])
+        some_input_converted = any(self._was_converted(x) for x in y_op.inputs)
+        some_input_stacked = any(x.is_stacked for x in converted_inputs)
 
         converted_control_ops = set()
         some_control_input_converted = False
@@ -1198,7 +1197,7 @@ class PFor(object):
           # All inputs are unstacked or uncoverted but some control inputs are
           # converted.
           # TODO(rachelim): Handle the case where some inputs are sparsely
-          # stacked (i.e. any([x.is_sparse_stacked for x in converted_inputs]))
+          # stacked (i.e. any(x.is_sparse_stacked for x in converted_inputs))
           new_op = _create_op(y_op.type, [x.t for x in converted_inputs],
                               [x.dtype for x in y_op.outputs],
                               y_op.node_def.attr)
@@ -1303,7 +1302,11 @@ def _inputs_with_flattening(pfor_input, input_indices):
 @RegisterPForWithArgs("Conv2D", dims=[0])
 @RegisterPForWithArgs("AvgPool", dims=[0])
 @RegisterPForWithArgs("MaxPool", dims=[0])
+@RegisterPForWithArgs("MaxPool3D", dims=[0])
+@RegisterPForWithArgs("MaxPool3DGrad", dims=[0, 1, 2])
 @RegisterPForWithArgs("MaxPoolGrad", dims=[0, 1, 2])
+@RegisterPForWithArgs("MaxPool3DGradGrad", dims=[0, 1, 2])
+@RegisterPForWithArgs("MaxPoolGradGrad", dims=[0, 1, 2])
 @RegisterPForWithArgs("SoftmaxCrossEntropyWithLogits", dims=[0, 1])
 def _convert_flatten_batch(pfor_input, op_type, dims):
   del op_type
@@ -1532,11 +1535,19 @@ def _convert_conv2d_backprop_filter(pfor_input):
 
 @RegisterPForWithArgs("Identity", array_ops.identity)
 @RegisterPForWithArgs("StopGradient", array_ops.stop_gradient)
+@RegisterPForWithArgs("MatrixDiagPart", array_ops.matrix_diag_part)
 def _convert_identity(pfor_input, op_type, op_func):
   del op_type
   return wrap(op_func(*[x.t for x in pfor_input.inputs]), True)
 
 
+@RegisterPFor("IdentityN")
+def _convert_identity_n(pfor_input):
+  outputs = array_ops.identity_n([x.t for x in pfor_input.inputs])
+  return [wrap(out, inp.is_stacked) for out, inp in
+          zip(outputs, pfor_input.inputs)]
+
+
 @RegisterPFor("Reshape")
 def _convert_reshape(pfor_input):
   t = pfor_input.stacked_input(0)
@@ -1609,6 +1620,15 @@ def _convert_split(pfor_input):
   return [wrap(x, True) for x in array_ops.split(t, num_split, axis=split_dim)]
 
 
+@RegisterPFor("SplitV")
+def _convert_split_v(pfor_input):
+  t = pfor_input.stacked_input(0)
+  splits = pfor_input.unstacked_input(1)
+  split_dim = pfor_input.unstacked_input(2)
+  split_dim += math_ops.cast(split_dim >= 0, dtypes.int32)
+  return [wrap(x, True) for x in array_ops.split(t, splits, axis=split_dim)]
+
+
 @RegisterPFor("Transpose")
 def _convert_transpose(pfor_input):
   t = pfor_input.stacked_input(0)
@@ -1641,8 +1661,8 @@ def _convert_gather(pfor_input):
       axis = axis_value
   if indices_stacked and not param_stacked:
     if indices == pfor_input.pfor.all_indices and axis == 0:
-      param_shape0 = param.shape[0].value
-      indices_shape0 = indices.shape[0].value
+      param_shape0 = param.shape.dims[0].value
+      indices_shape0 = indices.shape.dims[0].value
       if param_shape0 is not None and indices_shape0 == param_shape0:
         # Note that with loops and conditionals, indices may not be contiguous.
         # However they will be sorted and unique. So if the shape matches, then
@@ -1908,7 +1928,8 @@ def _convert_unsortedsegmentsum(pfor_input):
   segment_offset = array_ops.reshape(segment_offset,
                                      array_ops.concat([[n], ones], axis=0))
   segment_ids += segment_offset
-  num_segments *= n
+  num_segments = math_ops.cast(num_segments, dtypes.int64) * math_ops.cast(
+      n, dtypes.int64)
   output = math_ops.unsorted_segment_sum(data, segment_ids, num_segments)
   new_output_shape = array_ops.concat(
       [[n, -1], array_ops.shape(output)[1:]], axis=0)
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index ff50fe0d096c46580cccea287cf68067f187f8d6..a84af6c5cf27f2e021b3950f4a60a87cb5324942 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -46,7 +46,7 @@ ops.NotDifferentiable("SerializeTensor")
 ops.NotDifferentiable("StringToNumber")
 
 
-@tf_export("io.VarLenFeature", "VarLenFeature")
+@tf_export("io.VarLenFeature", v1=["VarLenFeature", "io.VarLenFeature"])
 class VarLenFeature(collections.namedtuple("VarLenFeature", ["dtype"])):
   """Configuration for parsing a variable-length input feature.
 
@@ -56,7 +56,7 @@ class VarLenFeature(collections.namedtuple("VarLenFeature", ["dtype"])):
   pass
 
 
-@tf_export("io.SparseFeature", "SparseFeature")
+@tf_export("io.SparseFeature", v1=["io.SparseFeature", "SparseFeature"])
 class SparseFeature(
     collections.namedtuple(
         "SparseFeature",
@@ -131,7 +131,7 @@ class SparseFeature(
         cls, index_key, value_key, dtype, size, already_sorted)
 
 
-@tf_export("io.FixedLenFeature", "FixedLenFeature")
+@tf_export("io.FixedLenFeature", v1=["io.FixedLenFeature", "FixedLenFeature"])
 class FixedLenFeature(collections.namedtuple(
     "FixedLenFeature", ["shape", "dtype", "default_value"])):
   """Configuration for parsing a fixed-length input feature.
@@ -151,7 +151,8 @@ class FixedLenFeature(collections.namedtuple(
         cls, shape, dtype, default_value)
 
 
-@tf_export("io.FixedLenSequenceFeature", "FixedLenSequenceFeature")
+@tf_export("io.FixedLenSequenceFeature",
+           v1=["io.FixedLenSequenceFeature", "FixedLenSequenceFeature"])
 class FixedLenSequenceFeature(collections.namedtuple(
     "FixedLenSequenceFeature",
     ["shape", "dtype", "allow_missing", "default_value"])):
@@ -217,21 +218,21 @@ def _features_to_raw_params(features, types):
       feature = features[key]
       if isinstance(feature, VarLenFeature):
         if VarLenFeature not in types:
-          raise ValueError("Unsupported VarLenFeature %s." % feature)
+          raise ValueError("Unsupported VarLenFeature %s." % (feature,))
         if not feature.dtype:
           raise ValueError("Missing type for feature %s." % key)
         sparse_keys.append(key)
         sparse_types.append(feature.dtype)
       elif isinstance(feature, SparseFeature):
         if SparseFeature not in types:
-          raise ValueError("Unsupported SparseFeature %s." % feature)
+          raise ValueError("Unsupported SparseFeature %s." % (feature,))
 
         if not feature.index_key:
           raise ValueError(
-              "Missing index_key for SparseFeature %s." % feature)
+              "Missing index_key for SparseFeature %s." % (feature,))
         if not feature.value_key:
           raise ValueError(
-              "Missing value_key for SparseFeature %s." % feature)
+              "Missing value_key for SparseFeature %s." % (feature,))
         if not feature.dtype:
           raise ValueError("Missing type for feature %s." % key)
         index_keys = feature.index_key
@@ -260,7 +261,7 @@ def _features_to_raw_params(features, types):
           sparse_types.append(feature.dtype)
       elif isinstance(feature, FixedLenFeature):
         if FixedLenFeature not in types:
-          raise ValueError("Unsupported FixedLenFeature %s." % feature)
+          raise ValueError("Unsupported FixedLenFeature %s." % (feature,))
         if not feature.dtype:
           raise ValueError("Missing type for feature %s." % key)
         if feature.shape is None:
@@ -281,7 +282,8 @@ def _features_to_raw_params(features, types):
           dense_defaults[key] = feature.default_value
       elif isinstance(feature, FixedLenSequenceFeature):
         if FixedLenSequenceFeature not in types:
-          raise ValueError("Unsupported FixedLenSequenceFeature %s." % feature)
+          raise ValueError("Unsupported FixedLenSequenceFeature %s." % (
+              feature,))
         if not feature.dtype:
           raise ValueError("Missing type for feature %s." % key)
         if feature.shape is None:
@@ -361,7 +363,7 @@ def _prepend_none_dimension(features):
     return features
 
 
-@tf_export("io.parse_example", "parse_example")
+@tf_export(v1=["io.parse_example", "parse_example"])
 def parse_example(serialized, features, name=None, example_names=None):
   # pylint: disable=line-too-long
   """Parses `Example` protos into a `dict` of tensors.
@@ -572,6 +574,223 @@ def parse_example(serialized, features, name=None, example_names=None):
   Returns:
     A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
 
+  Raises:
+    ValueError: if any feature is invalid.
+  """
+  return parse_example_v2(serialized, features, example_names, name)
+
+
+@tf_export("io.parse_example", v1=[])
+def parse_example_v2(serialized, features, example_names=None, name=None):
+  # pylint: disable=line-too-long
+  """Parses `Example` protos into a `dict` of tensors.
+
+  Parses a number of serialized [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+  protos given in `serialized`. We refer to `serialized` as a batch with
+  `batch_size` many entries of individual `Example` protos.
+
+  `example_names` may contain descriptive names for the corresponding serialized
+  protos. These may be useful for debugging purposes, but they have no effect on
+  the output. If not `None`, `example_names` must be the same length as
+  `serialized`.
+
+  This op parses serialized examples into a dictionary mapping keys to `Tensor`
+  and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`,
+  `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature`
+  and `SparseFeature` is mapped to a `SparseTensor`, and each
+  `FixedLenFeature` is mapped to a `Tensor`.
+
+  Each `VarLenFeature` maps to a `SparseTensor` of the specified type
+  representing a ragged matrix. Its indices are `[batch, index]` where `batch`
+  identifies the example in `serialized`, and `index` is the value's index in
+  the list of values associated with that feature and example.
+
+  Each `SparseFeature` maps to a `SparseTensor` of the specified type
+  representing a Tensor of `dense_shape` `[batch_size] + SparseFeature.size`.
+  Its `values` come from the feature in the examples with key `value_key`.
+  A `values[i]` comes from a position `k` in the feature of an example at batch
+  entry `batch`. This positional information is recorded in `indices[i]` as
+  `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of
+  the feature in the example at with key `SparseFeature.index_key[j]`.
+  In other words, we split the indices (except the first index indicating the
+  batch entry) of a `SparseTensor` by dimension into different features of the
+  `Example`. Due to its complexity a `VarLenFeature` should be preferred over a
+  `SparseFeature` whenever possible.
+
+  Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or
+  `tf.float32` if not specified) and shape `(serialized.size(),) + df.shape`.
+
+  `FixedLenFeature` entries with a `default_value` are optional. With no default
+  value, we will fail if that `Feature` is missing from any example in
+  `serialized`.
+
+  Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type
+  (or `tf.float32` if not specified) and shape
+  `(serialized.size(), None) + df.shape`.
+  All examples in `serialized` will be padded with `default_value` along the
+  second dimension.
+
+  Examples:
+
+  For example, if one expects a `tf.float32` `VarLenFeature` `ft` and three
+  serialized `Example`s are provided:
+
+  ```
+  serialized = [
+    features
+      { feature { key: "ft" value { float_list { value: [1.0, 2.0] } } } },
+    features
+      { feature []},
+    features
+      { feature { key: "ft" value { float_list { value: [3.0] } } }
+  ]
+  ```
+
+  then the output will look like:
+
+  ```python
+  {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]],
+                      values=[1.0, 2.0, 3.0],
+                      dense_shape=(3, 2)) }
+  ```
+
+  If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and
+  `shape=[]` is used then the output will look like:
+
+  ```python
+  {"ft": [[1.0, 2.0], [3.0, -1.0]]}
+  ```
+
+  Given two `Example` input protos in `serialized`:
+
+  ```
+  [
+    features {
+      feature { key: "kw" value { bytes_list { value: [ "knit", "big" ] } } }
+      feature { key: "gps" value { float_list { value: [] } } }
+    },
+    features {
+      feature { key: "kw" value { bytes_list { value: [ "emmy" ] } } }
+      feature { key: "dank" value { int64_list { value: [ 42 ] } } }
+      feature { key: "gps" value { } }
+    }
+  ]
+  ```
+
+  And arguments
+
+  ```
+  example_names: ["input0", "input1"],
+  features: {
+      "kw": VarLenFeature(tf.string),
+      "dank": VarLenFeature(tf.int64),
+      "gps": VarLenFeature(tf.float32),
+  }
+  ```
+
+  Then the output is a dictionary:
+
+  ```python
+  {
+    "kw": SparseTensor(
+        indices=[[0, 0], [0, 1], [1, 0]],
+        values=["knit", "big", "emmy"]
+        dense_shape=[2, 2]),
+    "dank": SparseTensor(
+        indices=[[1, 0]],
+        values=[42],
+        dense_shape=[2, 1]),
+    "gps": SparseTensor(
+        indices=[],
+        values=[],
+        dense_shape=[2, 0]),
+  }
+  ```
+
+  For dense results in two serialized `Example`s:
+
+  ```
+  [
+    features {
+      feature { key: "age" value { int64_list { value: [ 0 ] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+     },
+     features {
+      feature { key: "age" value { int64_list { value: [] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+    }
+  ]
+  ```
+
+  We can use arguments:
+
+  ```
+  example_names: ["input0", "input1"],
+  features: {
+      "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
+      "gender": FixedLenFeature([], dtype=tf.string),
+  }
+  ```
+
+  And the expected output is:
+
+  ```python
+  {
+    "age": [[0], [-1]],
+    "gender": [["f"], ["f"]],
+  }
+  ```
+
+  An alternative to `VarLenFeature` to obtain a `SparseTensor` is
+  `SparseFeature`. For example, given two `Example` input protos in
+  `serialized`:
+
+  ```
+  [
+    features {
+      feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } }
+      feature { key: "ix" value { int64_list { value: [ 3, 20 ] } } }
+    },
+    features {
+      feature { key: "val" value { float_list { value: [ 0.0 ] } } }
+      feature { key: "ix" value { int64_list { value: [ 42 ] } } }
+    }
+  ]
+  ```
+
+  And arguments
+
+  ```
+  example_names: ["input0", "input1"],
+  features: {
+      "sparse": SparseFeature(
+          index_key="ix", value_key="val", dtype=tf.float32, size=100),
+  }
+  ```
+
+  Then the output is a dictionary:
+
+  ```python
+  {
+    "sparse": SparseTensor(
+        indices=[[0, 3], [0, 20], [1, 42]],
+        values=[0.5, -1.0, 0.0]
+        dense_shape=[2, 100]),
+  }
+  ```
+
+  Args:
+    serialized: A vector (1-D Tensor) of strings, a batch of binary
+      serialized `Example` protos.
+    features: A `dict` mapping feature keys to `FixedLenFeature`,
+      `VarLenFeature`, and `SparseFeature` values.
+    example_names: A vector (1-D Tensor) of strings (optional), the names of
+      the serialized protos in the batch.
+    name: A name for this operation (optional).
+
+  Returns:
+    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
+
   Raises:
     ValueError: if any feature is invalid.
   """
@@ -730,7 +949,7 @@ def _process_raw_parameters(names, dense_defaults, sparse_keys, sparse_types,
     default_value = dense_defaults.get(key)
     dense_shape = dense_shapes[i]
     if (dense_shape.ndims is not None and dense_shape.ndims > 0 and
-        dense_shape[0].value is None):
+        dense_shape.dims[0].value is None):
       # Variable stride dense shape, the default value should be a
       # scalar padding value
       if default_value is None:
@@ -762,7 +981,7 @@ def _process_raw_parameters(names, dense_defaults, sparse_keys, sparse_types,
           dense_shapes_as_proto, dense_shapes)
 
 
-@tf_export("io.parse_single_example", "parse_single_example")
+@tf_export(v1=["io.parse_single_example", "parse_single_example"])
 def parse_single_example(serialized, features, name=None, example_names=None):
   """Parses a single `Example` proto.
 
@@ -792,6 +1011,48 @@ def parse_single_example(serialized, features, name=None, example_names=None):
   Returns:
     A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
 
+  Raises:
+    ValueError: if any feature is invalid.
+  """
+  return parse_single_example_v2_unoptimized(
+      serialized, features, example_names, name
+      )
+
+
+# TODO(b/70890287): Combine the implementation of this op and
+# `parse_single_example_v2()` after 1/10/2018.
+@tf_export("io.parse_single_example", v1=[])
+def parse_single_example_v2_unoptimized(
+    serialized, features, example_names=None, name=None
+    ):
+  """Parses a single `Example` proto.
+
+  Similar to `parse_example`, except:
+
+  For dense tensors, the returned `Tensor` is identical to the output of
+  `parse_example`, except there is no batch dimension, the output shape is the
+  same as the shape given in `dense_shape`.
+
+  For `SparseTensor`s, the first (batch) column of the indices matrix is removed
+  (the indices matrix is a column vector), the values vector is unchanged, and
+  the first (`batch_size`) entry of the shape vector is removed (it is now a
+  single element vector).
+
+  One might see performance advantages by batching `Example` protos with
+  `parse_example` instead of using this function directly.
+
+  Args:
+    serialized: A scalar string Tensor, a single serialized Example.
+      See `_parse_single_example_raw` documentation for more details.
+    features: A `dict` mapping feature keys to `FixedLenFeature` or
+      `VarLenFeature` values.
+    example_names: (Optional) A scalar string Tensor, the associated name.
+      See `_parse_single_example_raw` documentation for more details.
+    name: A name for this operation (optional).
+
+  Returns:
+    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
+
   Raises:
     ValueError: if any feature is invalid.
   """
@@ -1245,7 +1506,9 @@ def _parse_sequence_example_raw(serialized,
 
 # TODO(sundberg): rewrite this method to call the batch version, which is more
 # efficient especially for large inputs.
-@tf_export("io.parse_single_sequence_example", "parse_single_sequence_example")
+@tf_export("io.parse_single_sequence_example",
+           v1=["io.parse_single_sequence_example",
+               "parse_single_sequence_example"])
 def parse_single_sequence_example(
     serialized, context_features=None, sequence_features=None,
     example_name=None, name=None):
@@ -1565,7 +1828,7 @@ def _parse_single_sequence_example_raw(serialized,
 
 
 # Swap `name` and `na_value` for backward compatibility.
-@tf_export("io.decode_csv", "decode_csv")
+@tf_export(v1=["io.decode_csv", "decode_csv"])
 @deprecation.deprecated_endpoints("decode_csv")
 def decode_csv(records,
                record_defaults,
@@ -1604,6 +1867,54 @@ def decode_csv(records,
     A list of `Tensor` objects. Has the same type as `record_defaults`.
     Each tensor will have the same shape as records.
 
+  Raises:
+    ValueError: If any of the arguments is malformed.
+  """
+  return decode_csv_v2(
+      records, record_defaults,
+      field_delim, use_quote_delim,
+      na_value, select_cols, name
+      )
+
+
+@tf_export("io.decode_csv", v1=[])
+def decode_csv_v2(records,
+                  record_defaults,
+                  field_delim=",",
+                  use_quote_delim=True,
+                  na_value="",
+                  select_cols=None,
+                  name=None):
+  """Convert CSV records to tensors. Each column maps to one tensor.
+
+  RFC 4180 format is expected for the CSV records.
+  (https://tools.ietf.org/html/rfc4180)
+  Note that we allow leading and trailing spaces with int or float field.
+
+  Args:
+    records: A `Tensor` of type `string`.
+      Each string is a record/row in the csv and all records should have
+      the same format.
+    record_defaults: A list of `Tensor` objects with specific types.
+      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
+      One tensor per column of the input record, with either a
+      scalar default value for that column or an empty vector if the column is
+      required.
+    field_delim: An optional `string`. Defaults to `","`.
+      char delimiter to separate fields in a record.
+    use_quote_delim: An optional `bool`. Defaults to `True`.
+      If false, treats double quotation marks as regular
+      characters inside of the string fields (ignoring RFC 4180, Section 2,
+      Bullet 5).
+    na_value: Additional string to recognize as NA/NaN.
+    select_cols: Optional sorted list of column indices to select. If specified,
+      only this subset of columns will be parsed and returned.
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `record_defaults`.
+    Each tensor will have the same shape as records.
+
   Raises:
     ValueError: If any of the arguments is malformed.
   """
@@ -1769,7 +2080,7 @@ def _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types,
       default_value = dense_defaults.get(key)
       dense_shape = dense_shapes[i]
       if (dense_shape.ndims is not None and dense_shape.ndims > 0 and
-          dense_shape[0].value is None):
+          dense_shape.dims[0].value is None):
         # Variable stride dense shape, the default value should be a
         # scalar padding value
         if default_value is None:
diff --git a/tensorflow/python/ops/partitioned_variables.py b/tensorflow/python/ops/partitioned_variables.py
index 174cabdf8027e75c780441d06a98a24c19be0cfc..c1084c25592045734ae016c9d5a84b5264a38032 100644
--- a/tensorflow/python/ops/partitioned_variables.py
+++ b/tensorflow/python/ops/partitioned_variables.py
@@ -57,7 +57,7 @@ import math
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -68,7 +68,7 @@ __all__ = [
 ]
 
 
-@tf_export("variable_axis_size_partitioner")
+@tf_export(v1=["variable_axis_size_partitioner"])
 def variable_axis_size_partitioner(
     max_shard_bytes, axis=0, bytes_per_string_element=16, max_shards=None):
   """Get a partitioner for VariableScope to keep shards below `max_shard_bytes`.
@@ -96,7 +96,7 @@ def variable_axis_size_partitioner(
 
   Returns:
     A partition function usable as the `partitioner` argument to
-    `variable_scope`, `get_variable`, and `get_partitioned_variable_list`.
+    `variable_scope` and `get_variable`.
 
   Raises:
     ValueError: If any of the byte counts are non-positive.
@@ -136,13 +136,14 @@ def variable_axis_size_partitioner(
 
     partitions = [1] * shape.ndims
     bytes_per_slice = 1.0 * (
-        shape.num_elements() / shape[axis].value) * element_size
+        shape.num_elements() / shape.dims[axis].value) * element_size
     # How many slices can we fit on one shard of size at most max_shard_bytes?
     # At least one slice is required.
     slices_per_shard = max(1, math.floor(max_shard_bytes / bytes_per_slice))
     # How many shards do we need for axis given that each shard fits
-    # slices_per_shard slices from a total of shape[axis].value slices?
-    axis_shards = int(math.ceil(1.0 * shape[axis].value / slices_per_shard))
+    # slices_per_shard slices from a total of shape[axis] slices?
+    axis_shards = int(math.ceil(
+        1.0 * shape.dims[axis].value / slices_per_shard))
     if max_shards:
       axis_shards = min(max_shards, axis_shards)
 
@@ -153,7 +154,7 @@ def variable_axis_size_partitioner(
   return _partitioner
 
 
-@tf_export("min_max_variable_partitioner")
+@tf_export(v1=["min_max_variable_partitioner"])
 def min_max_variable_partitioner(max_partitions=1, axis=0,
                                  min_slice_size=256 << 10,
                                  bytes_per_string_element=16):
@@ -174,7 +175,7 @@ def min_max_variable_partitioner(max_partitions=1, axis=0,
 
   Returns:
     A partition function usable as the `partitioner` argument to
-    `variable_scope`, `get_variable`, and `get_partitioned_variable_list`.
+    `variable_scope` and `get_variable`.
 
   """
   def _partitioner(shape, dtype):
@@ -210,14 +211,14 @@ def min_max_variable_partitioner(max_partitions=1, axis=0,
     partitions_list = [1] * len(shape)
     # We can not partition the variable beyond what its shape or
     # `max_partitions` allows.
-    partitions_list[axis] = max(1, min(shape[axis].value,
+    partitions_list[axis] = max(1, min(shape.dims[axis].value,
                                        max_partitions,
                                        int(math.ceil(partitions))))
     return partitions_list
   return _partitioner
 
 
-@tf_export("fixed_size_partitioner")
+@tf_export(v1=["fixed_size_partitioner"])
 def fixed_size_partitioner(num_shards, axis=0):
   """Partitioner to specify a fixed number of shards along given axis.
 
@@ -227,16 +228,19 @@ def fixed_size_partitioner(num_shards, axis=0):
 
   Returns:
     A partition function usable as the `partitioner` argument to
-    `variable_scope`, `get_variable`, and `get_partitioned_variable_list`.
+    `variable_scope` and `get_variable`.
   """
   def _partitioner(shape, **unused_args):
     partitions_list = [1] * len(shape)
-    partitions_list[axis] = min(num_shards, shape[axis].value)
+    partitions_list[axis] = min(num_shards, shape.dims[axis].value)
     return partitions_list
   return _partitioner
 
 
-@tf_export("create_partitioned_variables")
+@tf_export(v1=["create_partitioned_variables"])
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.get_variable with a partitioner set.")
 def create_partitioned_variables(
     shape, slicing, initializer, dtype=dtypes.float32,
     trainable=True, collections=None, name=None, reuse=None):
@@ -281,11 +285,6 @@ def create_partitioned_variables(
   Raises:
     ValueError: If any of the arguments is malformed.
   """
-  logging.warn(
-      "create_partitioned_variables is deprecated.  Use "
-      "tf.get_variable with a partitioner set, or "
-      "tf.get_partitioned_variable_list, instead.")
-
   if len(shape) != len(slicing):
     raise ValueError("The 'shape' and 'slicing' of a partitioned Variable "
                      "must have the length: shape: %s, slicing: %s" %
diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py
index 4ac2a8f634bb201c9aaecb74432f2e6e78ee840f..6b469a954f6531641f4bc61396563581b7c368fe 100644
--- a/tensorflow/python/ops/quantized_conv_ops_test.py
+++ b/tensorflow/python/ops/quantized_conv_ops_test.py
@@ -60,7 +60,7 @@ class Conv2DTest(test.TestCase):
     x2 = x2.astype(np.uint8).reshape(filter_in_sizes)
     x2_min = 0.0
     x2_max = 255.0
-    with self.test_session(use_gpu=False) as sess:
+    with self.cached_session(use_gpu=False) as sess:
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtypes.quint8)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtypes.quint8)
       conv = nn_ops.quantized_conv2d(
@@ -73,7 +73,7 @@ class Conv2DTest(test.TestCase):
           max_input=x1_max,
           min_filter=x2_min,
           max_filter=x2_max)
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     quantized_output = value[0]
     output_min = value[1]
     output_max = value[2]
diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py
index d590bc4be6d520cbaa000d9802b84cbfbf8e90b9..b81843d17482bdff910827125ed8affd4094b942 100644
--- a/tensorflow/python/ops/quantized_ops_test.py
+++ b/tensorflow/python/ops/quantized_ops_test.py
@@ -33,7 +33,7 @@ class QuantizedOpsTest(test.TestCase):
 
   def testQuantizeOp(self):
     expected_output = [1, 1, 2, 127, 255, 255]
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       x = constant_op.constant(
           [1.0, 1.25, 1.75, 127.0, 255.0, 500.0],
           shape=[6],
@@ -41,18 +41,18 @@ class QuantizedOpsTest(test.TestCase):
       x_min = 0.0
       x_max = 255.0
       op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST")
-      value = sess.run(op)
+      value = self.evaluate(op)
       self.assertArrayNear(expected_output, value.output, 0.1)
 
   def testDequantizeOp(self):
     expected_output = [1.0, 2.0, 4.0, 8.0, 16.0, 255.0]
     inp = np.array([1, 2, 4, 8, 16, 255]).astype(np.uint8)
-    with self.test_session(use_gpu=False) as sess:
+    with self.session(use_gpu=False) as sess:
       x = constant_op.constant(inp, shape=[6], dtype=dtypes.quint8)
       x_min = 0.0
       x_max = 255.0
       op = array_ops.dequantize(x, x_min, x_max, mode="MIN_FIRST")
-      value = sess.run(op)
+      value = self.evaluate(op)
       self.assertArrayNear(expected_output, value, 0.1)
 
 
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..d88543c400f2432ea620ccddcab983337abe3fc2
--- /dev/null
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -0,0 +1,800 @@
+package(
+    default_visibility = [
+        "//intelligence/datum/prensor:__pkg__",
+        "//learning/brain/contrib/text:__pkg__",
+        "//nlp/projects/atc/tf/ops:__pkg__",
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+#-------------------------------------------------------------------------------
+# RaggedTensor
+#-------------------------------------------------------------------------------
+
+py_library(
+    name = "ragged",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    tags = ["nofixdeps"],
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_dispatch",
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_getitem",
+        ":ragged_map_ops",
+        ":ragged_math_ops",
+        ":ragged_operators",
+        ":ragged_string_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_tensor_value",
+        ":ragged_util",
+        ":segment_id_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "ragged_array_ops",
+    srcs = ["ragged_array_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_conversion_ops",
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        ":segment_id_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_array_ops_gen",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
+py_library(
+    name = "ragged_conversion_ops",
+    srcs = ["ragged_conversion_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_conversion_ops_gen",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "ragged_factory_ops",
+    srcs = ["ragged_factory_ops.py"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python/ops/ragged:ragged_tensor",
+        "//tensorflow/python/ops/ragged:ragged_tensor_value",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "ragged_functional_ops",
+    srcs = ["ragged_functional_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+py_library(
+    name = "ragged_getitem",
+    srcs = ["ragged_getitem.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "ragged_math_ops",
+    srcs = ["ragged_math_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        ":segment_id_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_math_ops_gen",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
+py_library(
+    name = "ragged_operators",
+    srcs = ["ragged_operators.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_getitem",
+        ":ragged_tensor",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "ragged_string_ops",
+    srcs = ["ragged_string_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_conversion_ops",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "ragged_tensor",
+    srcs = ["ragged_tensor.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_tensor_value",
+        ":ragged_util",
+        ":segment_id_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "ragged_tensor_shape",
+    srcs = ["ragged_tensor_shape.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
+py_library(
+    name = "ragged_tensor_value",
+    srcs = ["ragged_tensor_value.py"],
+    srcs_version = "PY2AND3",
+    deps = ["//third_party/py/numpy"],
+)
+
+py_library(
+    name = "ragged_util",
+    srcs = ["ragged_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_math_ops_gen",
+    ],
+)
+
+py_library(
+    name = "segment_id_ops",
+    srcs = ["segment_id_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
+py_library(
+    name = "ragged_map_ops",
+    srcs = ["ragged_map_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_library(
+    name = "ragged_dispatch",
+    srcs = ["ragged_dispatch.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+#-------------------------------------------------------------------------------
+# RaggedTensor Tests
+#-------------------------------------------------------------------------------
+
+py_library(
+    name = "ragged_test_util",
+    srcs = ["ragged_test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_tensor",
+        ":ragged_tensor_value",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_tensor_test",
+    size = "medium",
+    srcs = ["ragged_tensor_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_eager_test",
+    size = "medium",
+    srcs = ["ragged_eager_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_range_op_test",
+    srcs = ["ragged_range_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_tensor_bounding_shape_op_test",
+    srcs = ["ragged_tensor_bounding_shape_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_row_lengths_op_test",
+    srcs = ["ragged_row_lengths_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_gather_op_test",
+    srcs = ["ragged_gather_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_batch_gather_op_test",
+    srcs = ["ragged_batch_gather_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_gather_nd_op_test",
+    srcs = ["ragged_gather_nd_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_row_splits_to_segment_ids_op_test",
+    srcs = ["ragged_row_splits_to_segment_ids_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_segment_ids_to_row_splits_op_test",
+    srcs = ["ragged_segment_ids_to_row_splits_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_from_tensor_op_test",
+    srcs = ["ragged_from_tensor_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_to_sparse_op_test",
+    srcs = ["ragged_to_sparse_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients_impl",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_from_sparse_op_test",
+    srcs = ["ragged_from_sparse_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
+    ],
+)
+
+py_test(
+    name = "ragged_to_tensor_op_test",
+    srcs = ["ragged_to_tensor_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_segment_op_test",
+    srcs = ["ragged_segment_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_reduce_op_test",
+    srcs = ["ragged_reduce_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_map_flat_values_op_test",
+    srcs = ["ragged_map_flat_values_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_const_op_test",
+    srcs = ["ragged_const_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_constant_value_op_test",
+    srcs = ["ragged_constant_value_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "convert_to_tensor_or_ragged_tensor_op_test",
+    srcs = ["convert_to_tensor_or_ragged_tensor_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_boolean_mask_op_test",
+    srcs = ["ragged_boolean_mask_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_concat_op_test",
+    srcs = ["ragged_concat_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_stack_op_test",
+    srcs = ["ragged_stack_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_tile_op_test",
+    srcs = ["ragged_tile_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_util_test",
+    srcs = ["ragged_util_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_test_util",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_expand_dims_op_test",
+    srcs = ["ragged_expand_dims_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_where_op_test",
+    srcs = ["ragged_where_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_dispatch_test",
+    srcs = ["ragged_dispatch_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:string_ops",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_operators_test",
+    srcs = ["ragged_operators_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
+py_test(
+    name = "ragged_map_fn_op_test",
+    size = "small",
+    srcs = ["ragged_map_fn_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python/keras:backend",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_tensor_shape_test",
+    srcs = ["ragged_tensor_shape_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/ops/ragged/__init__.py b/tensorflow/python/ops/ragged/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d915ee269b45571c9338ea1d734ddaa4b884a98
--- /dev/null
+++ b/tensorflow/python/ops/ragged/__init__.py
@@ -0,0 +1,140 @@
+"""Ragged Tensors.
+
+This package defines the `tf.RaggedTensor` class, which
+represents tensors with non-uniform shapes.  In particular, each `RaggedTensor`
+has one or more *ragged dimensions*, which are dimensions whose slices may have
+different lengths.  For example, the inner (column) dimension of
+`rt=[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is ragged, since the column slices
+(`rt[0, :]`, ..., `rt[4, :]`) have different lengths.  For a more detailed
+description of ragged tensors, see the `tf.RaggedTensor`
+class documentation.
+
+<!-- Ragged Classes & related helper functions -->
+@@RaggedTensor
+@@RaggedTensorType
+@@RaggedTensorValue
+@@is_ragged
+
+<!-- Factory Ops -->
+@@ragged_factory_ops
+@@constant
+@@constant_value
+@@convert_to_tensor_or_ragged_tensor
+
+<!-- Conversion Ops -->
+@@from_tensor
+@@to_tensor
+@@from_sparse
+@@to_sparse
+@@row_splits_to_segment_ids
+@@segment_ids_to_row_splits
+
+<!-- Array Ops -->
+@@gather
+@@batch_gather
+@@gather_nd
+@@boolean_mask
+@@concat
+@@stack
+@@tile
+@@expand_dims
+@@where
+
+<!-- Math Ops -->
+@@range
+
+@@segment_sum
+@@segment_prod
+@@segment_min
+@@segment_max
+@@segment_mean
+@@segment_sqrt_n
+
+@@reduce_sum
+@@reduce_prod
+@@reduce_min
+@@reduce_max
+@@reduce_mean
+@@reduce_all
+@@reduce_any
+
+<!-- Functional Ops -->
+@@map_flat_values
+@@map_fn
+
+<!-- Shape & broadcasting -->
+@@RaggedTensorDynamicShape
+@@broadcast_to
+@@broadcast_dynamic_shape
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.ragged import ragged_dispatch
+from tensorflow.python.ops.ragged import ragged_operators
+from tensorflow.python.ops.ragged import ragged_string_ops
+
+from tensorflow.python.ops.ragged.ragged_array_ops import batch_gather
+from tensorflow.python.ops.ragged.ragged_array_ops import boolean_mask
+from tensorflow.python.ops.ragged.ragged_array_ops import concat
+from tensorflow.python.ops.ragged.ragged_array_ops import expand_dims
+from tensorflow.python.ops.ragged.ragged_array_ops import gather
+from tensorflow.python.ops.ragged.ragged_array_ops import gather_nd
+from tensorflow.python.ops.ragged.ragged_array_ops import stack
+from tensorflow.python.ops.ragged.ragged_array_ops import tile
+from tensorflow.python.ops.ragged.ragged_array_ops import where
+
+from tensorflow.python.ops.ragged.ragged_conversion_ops import from_sparse
+from tensorflow.python.ops.ragged.ragged_conversion_ops import from_tensor
+from tensorflow.python.ops.ragged.ragged_conversion_ops import to_sparse
+from tensorflow.python.ops.ragged.ragged_conversion_ops import to_tensor
+
+from tensorflow.python.ops.ragged.ragged_factory_ops import constant
+from tensorflow.python.ops.ragged.ragged_factory_ops import constant_value
+
+from tensorflow.python.ops.ragged.ragged_functional_ops import map_flat_values
+
+from tensorflow.python.ops.ragged.ragged_map_ops import map_fn
+
+from tensorflow.python.ops.ragged.ragged_math_ops import range  # pylint: disable=redefined-builtin
+
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_all
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_any
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_max
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_mean
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_min
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_prod
+from tensorflow.python.ops.ragged.ragged_math_ops import reduce_sum
+
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_max
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_mean
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_min
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_prod
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_sqrt_n
+from tensorflow.python.ops.ragged.ragged_math_ops import segment_sum
+
+from tensorflow.python.ops.ragged.ragged_tensor import convert_to_tensor_or_ragged_tensor
+from tensorflow.python.ops.ragged.ragged_tensor import is_ragged
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensorType
+
+from tensorflow.python.ops.ragged.ragged_tensor_shape import broadcast_dynamic_shape
+from tensorflow.python.ops.ragged.ragged_tensor_shape import broadcast_to
+from tensorflow.python.ops.ragged.ragged_tensor_shape import RaggedTensorDynamicShape
+
+from tensorflow.python.ops.ragged.ragged_tensor_value import RaggedTensorValue
+
+from tensorflow.python.ops.ragged.segment_id_ops import row_splits_to_segment_ids
+from tensorflow.python.ops.ragged.segment_id_ops import segment_ids_to_row_splits
+
+from tensorflow.python.util import all_util as _all_util
+
+
+# Register OpDispatchers that override standard TF ops to work w/ RaggedTensors.
+__doc__ += ragged_dispatch.register_dispatchers()  # pylint: disable=redefined-builtin
+
+# Any symbol that is not referenced (with "@@name") in the module docstring
+# above will be removed.
+_all_util.remove_undocumented(__name__)
diff --git a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b88f18c8b61a2fbc33aeca1f799c8e518cac4bf6
--- /dev/null
+++ b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
@@ -0,0 +1,208 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.convert_to_tensor_or_ragged."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConvertToTensorOrRaggedTensorTest(
+    ragged_test_util.RaggedTensorTestCase, parameterized.TestCase):
+
+  #=============================================================================
+  # Tests where the 'value' param is a RaggedTensor
+  #=============================================================================
+  @parameterized.parameters([
+      dict(pylist=[[1, 2], [3]]),
+      dict(pylist=[[1, 2], [3]], preferred_dtype=dtypes.float32),
+      dict(pylist=[[1, 2], [3]], preferred_dtype=dtypes.string),
+  ])
+  def testConvertRaggedTensor(self, pylist, dtype=None, preferred_dtype=None):
+    rt = ragged.constant(pylist)
+    converted = ragged.convert_to_tensor_or_ragged_tensor(
+        rt, dtype, preferred_dtype)
+    self.assertIs(converted, rt)
+
+  @parameterized.parameters([
+      dict(
+          pylist=[[1, 2], [3, 4]],
+          dtype=dtypes.float32,
+          message=('Tensor conversion requested dtype float32 for '
+                   'RaggedTensor with dtype int32')),
+      dict(
+          pylist=[[1, 2], [3, 4]],
+          dtype=dtypes.string,
+          message=('Tensor conversion requested dtype string for '
+                   'RaggedTensor with dtype .*')),
+  ])
+  def testConvertRaggedTensorError(self,
+                                   pylist,
+                                   message,
+                                   dtype=None,
+                                   preferred_dtype=None):
+    rt = ragged.constant(pylist)
+
+    with self.assertRaisesRegexp(ValueError, message):
+      ragged.convert_to_tensor_or_ragged_tensor(rt, dtype, preferred_dtype)
+
+  #=============================================================================
+  # Tests where the 'value' param is a RaggedTensorValue
+  #=============================================================================
+  @parameterized.parameters([
+      dict(
+          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
+          expected_dtype=dtypes.int32),
+      dict(
+          value=ragged.constant_value([[b'a', b'b'], [b'c']]),
+          expected_dtype=dtypes.string),
+      dict(
+          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
+          dtype=dtypes.float32,
+          expected_dtype=dtypes.float32),
+      dict(
+          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
+          preferred_dtype=dtypes.float32,
+          expected_dtype=dtypes.float32),
+      dict(
+          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
+          preferred_dtype=dtypes.string,
+          expected_dtype=dtypes.int32),
+  ])
+  def testConvertRaggedTensorValue(self,
+                                   value,
+                                   dtype=None,
+                                   preferred_dtype=None,
+                                   expected_dtype=None):
+    if expected_dtype is None:
+      expected_dtype = value.dtype if dtype is None else dtype
+    converted = ragged.convert_to_tensor_or_ragged_tensor(
+        value, dtype, preferred_dtype)
+    self.assertEqual(value.ragged_rank, converted.ragged_rank)
+    self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
+    self.assertEqual(value.to_list(), self.eval_to_list(converted))
+
+  @parameterized.parameters([
+      dict(
+          value=ragged.constant_value([['a', 'b'], ['c']], dtype=str),
+          dtype=dtypes.int32,
+          message=r"invalid literal for int\(\) with base 10: 'a'"),
+  ])
+  def testConvertRaggedTensorValueError(self,
+                                        value,
+                                        message,
+                                        dtype=None,
+                                        preferred_dtype=None):
+    with self.assertRaisesRegexp(ValueError, message):
+      ragged.convert_to_tensor_or_ragged_tensor(value, dtype, preferred_dtype)
+
+  #=============================================================================
+  # Tests where the 'value' param is a Tensor
+  #=============================================================================
+  @parameterized.parameters([
+      dict(pylist=[[1, 2], [3, 4]]),
+      dict(pylist=[[1, 2], [3, 4]], preferred_dtype=dtypes.float32),
+      dict(pylist=[[1, 2], [3, 4]], preferred_dtype=dtypes.string),
+  ])
+  def testConvertTensor(self, pylist, dtype=None, preferred_dtype=None):
+    tensor = constant_op.constant(pylist)
+    converted = ragged.convert_to_tensor_or_ragged_tensor(
+        tensor, dtype, preferred_dtype)
+    self.assertIs(tensor, converted)
+
+  @parameterized.parameters([
+      dict(
+          pylist=[[1, 2], [3, 4]],
+          dtype=dtypes.float32,
+          message=('Tensor conversion requested dtype float32 for '
+                   'Tensor with dtype int32')),
+      dict(
+          pylist=[[1, 2], [3, 4]],
+          dtype=dtypes.string,
+          message=('Tensor conversion requested dtype string for '
+                   'Tensor with dtype int32')),
+  ])
+  def testConvertTensorError(self,
+                             pylist,
+                             message,
+                             dtype=None,
+                             preferred_dtype=None):
+    tensor = constant_op.constant(pylist)
+    with self.assertRaisesRegexp(ValueError, message):
+      ragged.convert_to_tensor_or_ragged_tensor(tensor, dtype, preferred_dtype)
+
+  #=============================================================================
+  # Tests where the 'value' param is a np.array
+  #=============================================================================
+  @parameterized.parameters([
+      dict(
+          value=np.array([[1, 2], [3, 4]], dtype=np.int32),
+          expected_dtype=dtypes.int32),
+      dict(
+          value=np.array([[b'a', b'b'], [b'c', b'd']]),
+          expected_dtype=dtypes.string),
+      dict(
+          value=np.array([[1, 2], [3, 4]], dtype=np.int32),
+          dtype=dtypes.float32,
+          expected_dtype=dtypes.float32),
+      dict(
+          value=np.array([[1, 2], [3, 4]], dtype=np.int32),
+          preferred_dtype=dtypes.float32,
+          expected_dtype=dtypes.float32),
+      dict(
+          value=np.array([[1, 2], [3, 4]], dtype=np.int32),
+          preferred_dtype=dtypes.string,
+          expected_dtype=dtypes.int32),
+  ])
+  def testConvertNumpyArray(self,
+                            value,
+                            dtype=None,
+                            preferred_dtype=None,
+                            expected_dtype=None):
+    if expected_dtype is None:
+      expected_dtype = value.dtype if dtype is None else dtype
+    converted = ragged.convert_to_tensor_or_ragged_tensor(
+        value, dtype, preferred_dtype)
+    self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
+    self.assertAllEqual(value, converted)
+
+  @parameterized.parameters([
+      dict(
+          value=np.array([['a', 'b'], ['c', 'd']], dtype=str),
+          dtype=dtypes.int32,
+          message=r"invalid literal for int\(\) with base 10: 'a'"),
+  ])
+  def testConvertNumpyArrayError(self,
+                                 value,
+                                 message,
+                                 dtype=None,
+                                 preferred_dtype=None):
+    with self.assertRaisesRegexp(ValueError, message):
+      ragged.convert_to_tensor_or_ragged_tensor(value, dtype, preferred_dtype)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5917bc4ee6f6f5fb1d46f3e75cbdb66ef156bad
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -0,0 +1,1223 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Array operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import gen_ragged_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import segment_id_ops
+
+
+#===============================================================================
+# ragged_gather
+#===============================================================================
+# TODO(edloper): Add an `axis` argument
+def gather(params, indices, validate_indices=None, axis=0, name=None):
+  """Gathers ragged slices from `params` axis `0` according to `indices`.
+
+  Returns `RaggedTensor` output, such that:
+
+  ```python
+  output.shape = indices.shape + params.shape[1:]
+  output.ragged_rank = indices.shape.ndims + params.ragged_rank
+  output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+  ```
+
+  `params` may be ragged.  `indices` may be ragged.
+  `indices` must have dtype `int32` or `int64`. If any index is out of bounds,
+  then an error is returned.
+
+  Examples:
+
+  ```python
+  >>> params = tf.constant(['a', 'b', 'c', 'd', 'e'])
+  >>> indices = tf.constant([3, 1, 2, 1, 0])
+  >>> ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+  >>> ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
+
+  >>> print ragged.gather(params, ragged_indices)
+  [['d', 'b', 'c'], ['b'], [], ['a']]
+
+  >>> print ragged.gather(ragged_params, indices)
+  [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
+
+  >>> print ragged.gather(ragged_params, ragged_indices)
+  [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
+  ```
+
+  Args:
+    params: The potentially ragged tensor from which to gather values. Must be
+      at least rank 1.
+    indices: The potentially ragged tensor indicating which values to gather.
+      Must have dtype `int32` or `int64`.  Values must be in the range `[0,
+      params.shape[0]]`.
+    validate_indices: Ignored.
+    axis: Must be zero.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `RaggedTensor`, where `output.dtype=params.dtype` and
+    `output.shape=indices.shape + params.shape[1:]` and
+    `output.ragged_rank=indices.shape.ndims + params.ragged_rank`.
+
+  Raises:
+    ValueError: If indices.shape.ndims is not known statically.
+  """
+  del validate_indices
+  if not isinstance(axis, int) or axis != 0:
+    raise ValueError('axis>0 is not supported for ragged gather yet.')
+  with ops.name_scope(name, 'RaggedGather', [params, indices]):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+
+    if ragged_tensor.is_ragged(indices):
+      return indices.with_values(gather(params, indices.values))
+
+    if not ragged_tensor.is_ragged(params):
+      return array_ops.gather(params, indices)
+
+    indices = ops.convert_to_tensor(indices)
+    if indices.shape.ndims is None:
+      raise ValueError('indices.shape.ndims must be known statically')
+
+    result = gen_ragged_array_ops.ragged_gather(
+        indices=indices,
+        params_dense_values=params.flat_values,
+        params_nested_splits=params.nested_row_splits,
+        OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
+        1)
+
+    # Compose the RaggedTensor from splits & values.
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        result.output_dense_values, result.output_nested_splits)
+
+
+#===============================================================================
+# ragged.batch_gather
+#===============================================================================
+def batch_gather(params, indices, name=None):
+  """Gathers slices from `params` according to `indices` with batch dims.
+
+  This operation is similar to `gather`, but it assumes that the leading `N`
+  dimensions of `indices` and `params` are batch dimensions, and performs a
+  gather within each batch.  In particular, when using this operation with `N`
+  batch dimensions `B1...BN`:
+
+  * `indices` has shape `[B1...BN, I]`
+  * `params` has shape `[B1...BN, P1...PM]`.
+  * `result` has shape `[B1...BN, I, P2...PM]`.
+  * `result[b1...bN, i, p2...pM] =
+    params[b1...bN, indices[b1...bN, i], p2...pM]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
+      `M>0`).
+    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
+    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
+
+  #### Example:
+    ```python
+    >>> params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+    >>> indices = ragged.constant([[1, 2, 0], [], [], [0, 0]])
+    >>> ragged.batch_gather(params, indices)
+    [['b', 'c', 'a'], [], [], ['e', 'e']]
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.batch_gather(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_ndims = indices.shape.ndims
+    if indices_ndims is None:
+      raise ValueError(
+          'batch_gather does not allow indices with unknown shape.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+
+    if ragged_tensor.is_ragged(indices):
+      # If the outermost ragged dimension is a batch dimension, recurse.
+      if indices_ndims > 2:
+        if not ragged_tensor.is_ragged(params):
+          raise ValueError('batch shape from indices does '
+                           'not match params shape')
+        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
+        with ops.control_dependencies(checks):
+          return ragged_tensor.RaggedTensor.from_row_splits(
+              batch_gather(params.values, indices.values), indices.row_splits)
+
+      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
+      else:
+        # Ensure that `params` is ragged and has at least 2 dimensions.
+        if not ragged_tensor.is_ragged(params):
+          if params.shape.ndims is not None and params.shape.ndims < 2:
+            raise ValueError('batch shape from indices does '
+                             'not match params shape')
+          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)
+
+        # Adjust indices from within-batch to global (in params.values), and
+        # then use ragged.gather to gather them.
+        num_indices = indices.row_lengths()
+        params_starts = params.row_starts()
+        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
+        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
+        return ragged_tensor.RaggedTensor.from_row_splits(
+            gather(params.values, adjusted_index_values), indices.row_splits)
+
+    else:  # params is a RaggedTensor and indices is a Tensor.
+      if indices_ndims == 1:
+        return gather(params, indices)
+      elif indices_ndims == 2:
+        # Adjust indices from batch-local to global (in params.values)
+        adjustments = array_ops.expand_dims(params.row_starts(), 1)
+        adjusted_indices = math_ops.to_int64(indices) + adjustments
+        return gather(params.values, adjusted_indices)
+      else:
+        raise ValueError('batch shape from indices does not match params shape')
+
+
+#===============================================================================
+# ragged.gather_nd
+#===============================================================================
+def gather_nd(params, indices, name=None):
+  """Gather slices from `params` using `n`-dimensional indices.
+
+  This operation is similar to `gather`, but it uses the innermost dimension
+  of `indices` to define a slice into `params`.  In particular, if:
+
+  * `indices` has shape `[A1...AN, I]`
+  * `params` has shape `[B1...BM]`
+
+  Then:
+
+  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
+  * `result[a1...aN] = params[indices[a1...aN, :]]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[A1...AN, I]`.
+    indices: A potentially ragged tensor with shape `[B1...BM]`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.
+
+  #### Examples:
+    ```python
+    >>> params = tf.ragged.constant_value(
+    ...     [ [ ['000', '001'], ['010'              ]          ],
+    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
+    ...       [ [            ], ['210'              ]          ] ])
+
+    >>> # Gather 2D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2], [0]])
+    [ [ [            ], ['210'] ]
+      [ ['000', '001'], ['010'] ] ]
+
+    >>> # Gather 1D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
+    [['210'], ['000', '001']]
+
+    >>> # Gather scalars from a 3D tensor
+    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
+    ['001', '112']
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.gather_nd(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
+
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_shape = indices.shape
+    indices_ndims = indices_shape.ndims
+    if indices_ndims is None:
+      raise ValueError('indices.rank be statically known.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+    if (ragged_tensor.is_ragged(indices) and
+        indices_ndims == indices.ragged_rank + 1):
+      raise ValueError('The innermost dimension of indices may not be ragged')
+
+    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
+    # that each index slices into.
+    index_size = tensor_shape.dimension_value(indices_shape[-1])
+    if index_size is None:
+      raise ValueError('indices.shape[-1] must be statically known.')
+
+    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
+    # dense, then we convert it to ragged before recursing, and then convert
+    # the result back to `dense` if appropriate.
+    if indices_ndims > 2:
+      indices_is_dense = not ragged_tensor.is_ragged(indices)
+      if indices_is_dense:
+        indices = ragged_conversion_ops.from_tensor(
+            indices, ragged_rank=indices_ndims - 2)
+      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
+      if (indices_is_dense and ragged_tensor.is_ragged(result) and
+          result.ragged_rank == indices_ndims - 2):
+        result = ragged_conversion_ops.to_tensor(result)
+      return result
+
+    # indices_ndims <= 2, and the innermost dimension of indices may not be
+    # ragged, so `indices` must not be ragged.
+    assert not ragged_tensor.is_ragged(indices)
+    assert ragged_tensor.is_ragged(params)
+
+    # Handle corner case: An empty index tuple selects the entire `params`
+    # value.  So if `index_size` is zero, then tile `params`.
+    if index_size == 0:
+      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
+      for dim in range(indices_ndims - 1):
+        params = expand_dims(params, axis=0)
+      multiples = array_ops.concat([
+          array_ops.shape(indices)[:-1],
+          array_ops.ones([params_ndims], dtypes.int32)
+      ],
+                                   axis=0)
+      return tile(params, multiples)
+
+    # When index_size=1, we can just flatten the index tuples and use gather.
+    elif index_size == 1:
+      flattened_index_tuples = array_ops.reshape(indices, [-1])
+      return gather(params, flattened_index_tuples)
+
+    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
+    # Flatten both the index tuples and the params, such that the flattened
+    # index tuples point to the correct values in the flattened params; and
+    # then use ragged.gather on the flattened index tuples & params.
+    else:
+      indices = math_ops.to_int64(indices)
+
+      # Flatten the outermost 2 dimensions of the index tuples & params.
+      flattened_index_tuples = array_ops.gather(params.row_splits,
+                                                indices[..., 0])
+      flattened_index_tuples += indices[..., 1]
+      flattened_params = params.values
+
+      # Flatten any remaining dimensions.
+      for dim in range(2, index_size):
+        if not ragged_tensor.is_ragged(flattened_params):
+          flattened_index_tuples = array_ops.expand_dims(
+              flattened_index_tuples, axis=1)
+          flattened_index_tuples = array_ops.concat(
+              [flattened_index_tuples, indices[..., dim:]], axis=1)
+          return array_ops.gather_nd(flattened_params, flattened_index_tuples)
+
+        flattened_index_tuples = array_ops.gather(
+            flattened_params.row_starts(), flattened_index_tuples)
+        flattened_index_tuples += indices[..., dim]
+        flattened_params = flattened_params.values
+
+      # Gather using the flattened index tuples and params.
+      return gather(flattened_params, flattened_index_tuples)
+
+
+#===============================================================================
+# Masking
+#===============================================================================
+def boolean_mask(data, mask, keepdims=False, name=None):
+  """Applies a boolean mask to `data`.
+
+  Returns a potentially ragged tensor that is formed by retaining the elements
+  in `data` where the corresponding value in `mask` is `True`.
+
+  If `keepdims` is true then outer dimensions (corresponding to the `mask`
+  dimensions) are preserved, and:
+
+  * `output[a1...aA, i, b1...bB] = data[a1...aA, j, b1...bB]`
+
+     Where `j` is the `i`th `True` entry of `mask[a1...aA]`.
+
+  If `keepdims` is false, then the outer dimensions are collapsed (similar to
+  the behavior of `tf.boolean_mask`), and:
+
+  * `output[i, b1...bB] = data[a1...aA, b1...bB]`
+
+     Where `(a1...aA)` is the `i`th `True` entry of `mask`
+     (in row-major order).
+
+  Args:
+    data: A potentially ragged tensor.
+    mask: A potentially ragged boolean tensor.  `mask`'s shape must be a prefix
+      of `data`'s shape.  `rank(mask)` must be known statically.
+    keepdims: Whether to preserve the outer dimensions (`keepdims=True`) or
+      flatten them (`keepdims=False`).
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A potentially ragged tensor that is formed by retaining the elements in
+    `data` where the corresponding value in `mask` is `True`.
+
+    If `keepdims` is false:
+
+    * `rank(output) = rank(data) - rank(mask) + 1`.
+    * `output.ragged_rank = max(data.ragged_rank - rank(mask) + 1, 0)`.
+
+    If `keepdims` is true:
+
+    * `rank(output) = rank(data)`.
+    * `output.ragged_rank = max(data.ragged_rank, rank(mask) - 1)`.
+
+  Raises:
+    ValueError: if `rank(mask)` is not known statically; or if `mask.shape` is
+      not a prefix of `data.shape`.
+
+  #### Examples:
+    ```python
+    >>> # Aliases for True & False so data and mask line up.
+    >>> T, F = (True, False)
+
+    >>> tf.ragged.boolean_mask(  # Mask a 2D Tensor.  Flatten outer dims.
+    ...     data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+    ...     mask=[[T, F, T], [F, F, F], [T, F, F]],
+    ...     keepdims=False).tolist()
+    [1, 3, 7]
+
+    >>> tf.ragged.boolean_mask(  # Mask a 2D Tensor.  Preserve outer dims.
+    ...     data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+    ...     mask=[[T, F, T], [F, F, F], [T, F, F]],
+    ...     keepdims=True).tolist()
+    [[1, 3], [], [7]]
+
+    >>> tf.ragged.boolean_mask(  # Mask a 2D RaggedTensor.  Flatten outer dims.
+    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
+    ...     tf.ragged.constant([[F, F, T], [F], [T, T]]),
+    ...     keepdims=False).tolist()
+    [3, 5, 6]
+
+    >>> tf.ragged.boolean_mask(  # Mask a 2D RaggedTensor.  Preserve outer dims.
+    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
+    ...     tf.ragged.constant([[F, F, T], [F], [T, T]]),
+    ...     keepdims=True).tolist()
+    [[3], [], [5, 6]]
+
+    >>> tf.ragged.boolean_mask(  # Mask rows of a 2D RaggedTensor.
+    ...     tf.ragged.constant([[1, 2, 3], [4], [5, 6]]),
+    ...     tf.ragged.constant([True, False, True]),
+    ...     keepdims=True).tolist()
+    [[1, 2, 3], [5, 6]]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedMask', [data, mask]):
+    # Convert inputs to tensors.
+    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
+    mask = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        mask, dtypes.bool, name='mask')
+
+    # Get static rank of mask.
+    if mask.shape.ndims is None:
+      raise ValueError('mask.shape.ndims must be kown statically.')
+    elif mask.shape.ndims == 0:
+      raise ValueError('mask cannot be scalar.')
+
+    # If mask is ragged, then recurse with a non-ragged mask.
+    if ragged_tensor.is_ragged(mask):
+      if not ragged_tensor.is_ragged(data):
+        data = ragged_conversion_ops.from_tensor(
+            data, ragged_rank=mask.ragged_rank)
+      # Check that mask.nested_row_splits is a prefix of
+      # data.nested_row_splits.
+      splits_list = [
+          mask.nested_row_splits, data.nested_row_splits[:mask.ragged_rank]
+      ]
+      with ops.control_dependencies(
+          ragged_util.assert_splits_match(splits_list)):
+        # Strip off ragged `splits` until `mask` is non-ragged.  Keep the splits
+        # that we strip off in `splits`, so we can add them back on after
+        # we recursively mask the non-ragged data.
+        splits = []
+        while ragged_tensor.is_ragged(mask):
+          if mask.shape.ndims > 2:
+            splits.append(mask.row_splits)
+          else:
+            # Count the number of True mask values in each row to find the
+            # lengths of the filtered rows; then convert to splits.
+            int_mask = ragged_functional_ops.map_flat_values(
+                math_ops.cast, mask, dtype=dtypes.int64)
+            masked_row_lengths = ragged_math_ops.reduce_sum(int_mask, axis=1)
+            splits.append(ragged_util.lengths_to_splits(masked_row_lengths))
+          mask = mask.values
+          data = data.values
+
+        # Recursively apply the nested non-ragged mask to the nested data.
+        masked_values = boolean_mask(data, mask, keepdims)
+
+        # Add the ragged `splits` back to the result.
+        if keepdims:
+          masked_values = ragged_tensor.RaggedTensor.from_nested_row_splits(
+              masked_values, splits)
+
+        return masked_values
+
+    # If mask is non-ragged and has rank 1, and data is ragged, then build a
+    # ragged tensor with the indicated rows.
+    elif ragged_tensor.is_ragged(data) and mask.shape.ndims == 1:
+      # Get the masked splits: first get the length of each row, then filter
+      # out the rows that we are deleting, and convert that filtered set of
+      # masks back to a splits tensor.
+      lengths = data.row_lengths()
+      masked_lengths = array_ops.boolean_mask(lengths, mask)
+      masked_splits = ragged_util.lengths_to_splits(masked_lengths)
+
+      # Get the masked values: first get row ids corresponding to each
+      # value, then use tf.gather to build a boolean mask that's false for
+      # values that come from rows that we are deleting, and use that mask to
+      # construct the masked values tensor.
+      segment_ids = segment_id_ops.row_splits_to_segment_ids(data.row_splits)
+      segment_mask = array_ops.gather(mask, segment_ids)
+      masked_values = boolean_mask(data.values, segment_mask, keepdims=False)
+
+      return ragged_tensor.RaggedTensor.from_row_splits(masked_values,
+                                                        masked_splits)
+
+    # If mask is non-ragged and has rank>1, then convert it to be ragged,
+    # with a ragged rank matching data.
+    if ragged_tensor.is_ragged(data):
+      mask = ragged_conversion_ops.from_tensor(
+          mask, ragged_rank=min(data.ragged_rank, mask.shape.ndims - 1))
+      return boolean_mask(data, mask, keepdims)
+
+    # Otherwise, data and mask are both `Tensor`s.
+    else:
+      # Apply `boolean_mask` to get the masked values.
+      masked_values = array_ops.boolean_mask(data, mask)
+
+      if mask.shape.ndims >= 2 and keepdims:
+        # Add the innermost ragged dimension.  For each innermost cell, get the
+        # number of values it contains.  Then flatten that to get a list of
+        # cell lengths, and convert it to splits.  Finally, combine the splits
+        # and values to get the innermost ragged tensor.
+        masked_lengths = math_ops.count_nonzero(mask, axis=-1)
+        flattened_masked_lengths = array_ops.reshape(masked_lengths, [-1])
+        masked_values = ragged_tensor.RaggedTensor.from_row_lengths(
+            masked_values, flattened_masked_lengths)
+
+        # Wrap remaining ragged dimensions.
+        if mask.shape.ndims > 2 and keepdims:
+          mask_shape = array_ops.shape(mask, out_type=dtypes.int64)
+          split_size = math_ops.cumprod(mask_shape) + 1
+          for dim in range(mask.shape.ndims - 3, -1, -1):
+            elt_size = mask_shape[dim + 1]
+            masked_splits = math_ops.range(split_size[dim]) * elt_size
+            masked_values = ragged_tensor.RaggedTensor.from_row_splits(
+                masked_values, masked_splits)
+
+      return masked_values
+
+
+#===============================================================================
+# Concatenation and Stacking
+#===============================================================================
+def concat(values, axis, name=None):
+  """Concatenates potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  concatenation of `[rt[i0...iaxis] for rt in values]`.
+
+  Args:
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to concatenate.
+      (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `values` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
+
+  Raises:
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.concat([t1, t2], axis=0)
+    [[1, 2], [3, 4, 5], [6], [7, 8, 9]]
+    >>> ragged.concat([t1, t2], axis=1)
+    [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
+    ```
+  """
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=False)
+
+
+def stack(values, axis, name=None):
+  """Stacks potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  list `[rt[i0...iaxis] for rt in values]`.
+
+  Args:
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to stack.
+      (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `values` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K+1`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
+
+  Raises:
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.stack([t1, t2], axis=0)
+    [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]]
+    >>> ragged.stack([t1, t2], axis=1)
+    [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
+    ```
+  """
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=True)
+
+
+def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
+  """Helper function to concatenate or stack ragged tensors.
+
+  Args:
+    rt_inputs: A list of RaggedTensors or Tensors to combine.
+    axis: The axis along which to concatenate or stack.
+    stack_values: A boolean -- if true, then stack values; otherwise,
+      concatenate them.
+
+  Returns:
+    A RaggedTensor.
+  Raises:
+    ValueError: If rt_inputs is empty, or if axis is out of range.
+  """
+  # Validate parameters.
+  if not rt_inputs:
+    raise ValueError('rt_inputs may not be empty.')
+
+  # Convert input tensors.
+  rt_inputs = [
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(
+          rt_input, name='rt_input') for rt_input in rt_inputs
+  ]
+
+  # Special case: if there's only one input, then return it as-is.
+  if len(rt_inputs) == 1:
+    if stack_values:
+      return expand_dims(rt_inputs[0], axis=0)
+    else:
+      return rt_inputs[0]
+
+  # Check the rank (number of dimensions) of the input tensors.
+  ndims = None
+  for rt in rt_inputs:
+    if ndims is None:
+      ndims = rt.shape.ndims
+    else:
+      rt.shape.assert_has_rank(ndims)
+
+  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
+  axis = ragged_util.get_positive_axis(axis, out_ndims)
+
+  # If all the inputs are Tensors, and we're combining the final dimension,
+  # then we can delegate to the tf.stack/tf.concat operation, and return a
+  # Tensor.
+  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
+    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
+      if stack_values:
+        return array_ops.stack(rt_inputs, axis)
+      else:
+        return array_ops.concat(rt_inputs, axis)
+
+  # Convert any Tensor inputs to RaggedTensors.  This makes it
+  # possible to concatenate Tensors and RaggedTensors together.
+  for i in range(len(rt_inputs)):
+    if not ragged_tensor.is_ragged(rt_inputs[i]):
+      rt_inputs[i] = ragged_conversion_ops.from_tensor(
+          rt_inputs[i], ragged_rank=1)
+
+  # Convert the input tensors to all have the same ragged_rank.
+  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
+  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]
+
+  if axis == 0:
+    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
+  elif axis == 1:
+    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
+  else:  # axis > 1: recurse.
+    values = [rt.values for rt in rt_inputs]
+    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
+    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
+      return ragged_tensor.RaggedTensor.from_row_splits(
+          _ragged_stack_concat_helper(values, axis - 1, stack_values),
+          splits[0][0])
+
+
+def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 0.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  # Concatenate the inner values together.
+  flat_values = [rt.flat_values for rt in rt_inputs]
+  concatenated_flat_values = array_ops.concat(flat_values, axis=0)
+
+  # Concatenate the splits together for each ragged dimension (adjusting
+  # split offsets as necessary).
+  nested_splits = [rt.nested_row_splits for rt in rt_inputs]
+  ragged_rank = rt_inputs[0].ragged_rank
+  concatenated_nested_splits = [
+      _concat_ragged_splits([ns[dim]
+                             for ns in nested_splits])
+      for dim in range(ragged_rank)
+  ]
+
+  # If we are performing a stack operation, then add another splits.
+  if stack_values:
+    stack_lengths = array_ops.stack([_nrows(rt) for rt in rt_inputs])
+    stack_splits = ragged_util.lengths_to_splits(stack_lengths)
+    concatenated_nested_splits.insert(0, stack_splits)
+
+  return ragged_tensor.RaggedTensor.from_nested_row_splits(
+      concatenated_flat_values, concatenated_nested_splits)
+
+
+def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 1.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  num_inputs = len(rt_inputs)
+
+  rt_nrows = _nrows(rt_inputs[0])
+  nrows_msg = 'Input tensors have incompatible shapes.'
+  nrows_checks = [
+      check_ops.assert_equal(_nrows(rt), rt_nrows, message=nrows_msg)
+      for rt in rt_inputs[1:]
+  ]
+
+  with ops.control_dependencies(nrows_checks):
+    # Concatentate the inputs together to put them in a single ragged tensor.
+    concatenated_rt = _ragged_stack_concat_axis_0(rt_inputs, stack_values=False)
+
+    # Use ragged.gather to permute the rows of concatenated_rt.  In particular,
+    #   permuted_rt = [rt_inputs[0][0], ..., rt_inputs[N][0],
+    #                  rt_inputs[0][1], ..., rt_inputs[N][1],
+    #                      ...,
+    #                  rt_inputs[0][M], ..., rt_input[N][M]]
+    # where `N=num_inputs-1` and `M=rt_nrows-1`.
+    row_indices = math_ops.range(rt_nrows * num_inputs)
+    row_index_matrix = array_ops.reshape(row_indices, [num_inputs, -1])
+    transposed_row_index_matrix = array_ops.transpose(row_index_matrix)
+    row_permutation = array_ops.reshape(transposed_row_index_matrix, [-1])
+    permuted_rt = gather(concatenated_rt, row_permutation)
+
+    if stack_values:
+      # Add a new splits tensor to group together the values.
+      stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
+      _copy_row_shape(rt_inputs, stack_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
+                                                        stack_splits)
+    else:
+      # Merge together adjacent rows by dropping the row-split indices that
+      # separate them.
+      concat_splits = permuted_rt.row_splits[::num_inputs]
+      _copy_row_shape(rt_inputs, concat_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
+                                                        concat_splits)
+
+
+def _copy_row_shape(rt_inputs, splits):
+  """Sets splits.shape to [rt[shape[0]+1] for each rt in rt_inputs."""
+  for rt in rt_inputs:
+    if rt.shape[0] is not None:
+      splits.set_shape(tensor_shape.TensorShape(rt.shape[0] + 1))
+
+
+#===============================================================================
+# Tiling
+#===============================================================================
+def tile(input, multiples, name=None):  # pylint: disable=redefined-builtin
+  """Constructs a `RaggedTensor` by tiling a given `RaggedTensor`.
+
+  The values of `input` are replicated `multiples[i]` times along the
+  `i`th dimension (for each dimension `i`).  For every dimension `axis` in
+  `input`, the length of each output element in that dimension is the
+  length of corresponding input element multiplied by `multiples[axis]`.
+
+  Args:
+    input: A `RaggedTensor`.
+    multiples: A 1-D integer `Tensor`.  Length must be the same as the number of
+      dimensions in `input`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `RaggedTensor` with the same type, rank, and ragged_rank as `input`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> ragged.tile(rt, [3, 2])
+    [[1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3]]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedTile', [input, multiples]):
+    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        input, name='input')
+    multiples = ragged_util.convert_to_int_tensor(
+        multiples, name='multiples', dtype=dtypes.int64)
+    multiples.shape.assert_has_rank(1)
+    if not ragged_tensor.is_ragged(input):
+      return array_ops.tile(input, multiples, name)
+
+    # If the constant value of `multiples` is available, then we can use it
+    # to skip tiling dimensions where `multiples=1`.
+    const_multiples = tensor_util.constant_value(multiples)
+
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        _tile_ragged_values(input, multiples, const_multiples),
+        _tile_ragged_splits(input, multiples, const_multiples))
+
+
+def _tile_ragged_values(rt_input, multiples, const_multiples=None):
+  """Builds flat_values tensor for a tiled `RaggedTensor`.
+
+  Returns a tensor that repeats the values in
+  `rt_input.flat_values` in the
+  appropriate pattern to construct a `RaggedTensor` that tiles `rt_input` as
+  specified by `multiples`.
+
+  Args:
+    rt_input: The `RaggedTensor` whose values should be repeated.
+    multiples: A 1-D integer `tensor`, indicating how many times each dimension
+      should be repeated.
+    const_multiples: Optional constant value for multiples.  Used to skip tiling
+      dimensions where `multiples=1`.
+
+  Returns:
+    A `Tensor` with the same type and rank as `rt_input.flat_values`.
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> _tile_ragged_values(rt, [3, 2])
+    [1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3]
+    ```
+  """
+  ragged_rank = rt_input.ragged_rank
+  nested_splits = rt_input.nested_row_splits
+
+  # Pointers to the values in `rt_input.flat_values`.
+  inner_value_ids = math_ops.range(nested_splits[-1][-1])
+
+  # For each ragged dimension (working from the innermost to outermost),
+  # expand `inner_value_ids` as necessary to tile that dimension.
+  prev_splits = None
+  for axis in range(ragged_rank, 0, -1):
+    # Ragged splits for this dimension.
+    splits = nested_splits[axis - 1]
+
+    # Adjust splits so they point into `inner_value_ids` (instead of just
+    # pointing into the next dimension's values).
+    if prev_splits is not None:  # Not the first pass through the loop.
+      splits = array_ops.gather(prev_splits * multiples[axis + 1], splits)
+
+    # Repeat each element in this ragged dimension `multiples[axis]` times.
+    if const_multiples is None or const_multiples[axis] != 1:
+      inner_value_ids = ragged_util.repeat_ranges(inner_value_ids, splits,
+                                                  multiples[axis])
+
+    prev_splits = splits
+
+  # Gather the tiled inner values.
+  ragged_tiled_values = array_ops.gather(rt_input.flat_values, inner_value_ids)
+
+  # Tile the flat_values for the uniform dimensions (i.e., for `axis=0` plus
+  # `axis=range(ragged_rank, rank)`).
+  inner_repeats = array_ops.concat([multiples[:1], multiples[ragged_rank + 1:]],
+                                   axis=0)
+  return array_ops.tile(ragged_tiled_values, inner_repeats)
+
+
+def _tile_ragged_splits(rt_input, multiples, const_multiples=None):
+  """Builds nested_split tensors for a tiled `RaggedTensor`.
+
+  Returns a list of split tensors that can be used to construct the
+  `RaggedTensor` that tiles `rt_input` as specified by `multiples`.
+
+  Args:
+    rt_input: The `RaggedTensor` that is being tiled.
+    multiples: A 1-D integer `tensor`, indicating how many times each dimension
+      should be repeated.
+    const_multiples: Optional constant value for multiples.  Used to skip tiling
+      dimensions where `multiples=1`.
+
+  Returns:
+    A list of 1-D `int64` `Tensor`s (one for each ragged dimension in
+    `rt_input`).
+
+  #### Example:
+    ```python
+    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> _tile_ragged_splits(rt, [3, 2])
+    [0, 4, 6, 10, 12, 16, 18]
+    ```
+  """
+  ragged_rank = rt_input.ragged_rank
+  nested_splits = rt_input.nested_row_splits
+
+  # projected_splits[src_axis, dst_axis] contains the split points that divide
+  # the rows from src_axis in the list of dst_axis values.  E.g.,
+  # projected_splits[i, i] = nested_splits[i], and
+  # projected_splits[i, i+1] = gather(nested_splits[i+1], nested_splits[i]).
+  projected_splits = [{i: nested_splits[i]} for i in range(ragged_rank)]
+  for src_axis in range(ragged_rank):
+    for dst_axis in range(src_axis + 1, ragged_rank - 1):
+      projected_splits[src_axis][dst_axis] = array_ops.gather(
+          nested_splits[dst_axis],
+          projected_splits[src_axis][dst_axis - 1])
+
+  # For each ragged dimension: nested_splits[axis] -> result_splits[axis].
+  result_splits = []
+  for axis in range(ragged_rank):
+    # Get the length of each row for the input tensor for this dimension.
+    input_lengths = nested_splits[axis][1:] - nested_splits[axis][:-1]
+
+    # Multiply those lengths by the `multiples` of dimension axis+1, since
+    # each value will be repeated that number of times.
+    output_lengths = input_lengths * multiples[axis + 1]
+
+    # Repeat ranges of the row lengths as necessary for them to be tiled in
+    # each ragged dimension `d < axis`.  (Start with dimension d=axis-1, and
+    # work our way up to dimension d=0.)
+    repeats = 1
+    for d in range(axis - 1, -1, -1):
+      if const_multiples is None or const_multiples[d + 1] != 1:
+        splits = projected_splits[d][axis - 1] * repeats
+        output_lengths = ragged_util.repeat_ranges(output_lengths, splits,
+                                                   multiples[d + 1])
+      repeats *= multiples[d + 1]
+
+    # Tile splits for the outermost (uniform) dimension.
+    output_lengths = array_ops.tile(output_lengths, multiples[:1])
+
+    # Convert to splits.
+    result_splits.append(ragged_util.lengths_to_splits(output_lengths))
+
+  return result_splits
+
+
+#===============================================================================
+# Reshaping
+#===============================================================================
+
+
+def expand_dims(input, axis, name=None):  # pylint: disable=redefined-builtin
+  """Inserts a dimension with shape 1 into a potentially ragged tensor's shape.
+
+  Given a potentially ragged tenor `input`, this operation inserts a
+  dimension with size 1 at the dimension `axis` of `input`'s shape.
+
+  * If `input` is a `Tensor`, then this is equivalent to
+    `tf.expand_dims`.
+  * If `input` is ragged, and `axis=0`, then the new dimension will be
+    uniform; but the previously outermost dimension will become ragged.
+  * If `input` is ragged, and `0 < axis < input.ragged_rank`, then the
+    new dimension will be ragged.
+  * If `input` is ragged, and axis >= input.ragged_rank`, then the new
+    dimension will be uniform.
+
+  The following table gives some examples showing how `ragged.expand_dims`
+  impacts the shapes of different input tensors.  Ragged dimensions are
+  indicated by enclosing them in parentheses.
+
+  input.shape             | axis | result.shape
+  ----------------------- | ---- | -----------------------------
+  `[D1, D2]`              |  `0` | `[1, D1, D2]`
+  `[D1, D2]`              |  `1` | `[D1, 1, D2]`
+  `[D1, D2]`              |  `2` | `[D1, D2, 1]`
+  `[D1, (D2), (D3), D4]`  |  `0` | `[1, (D1), (D2), (D3), D4]`
+  `[D1, (D2), (D3), D4]`  |  `1` | `[D1, (1), (D2), (D3), D4]`
+  `[D1, (D2), (D3), D4]`  |  `2` | `[D1, (D2), (1), (D3), D4]`
+  `[D1, (D2), (D3), D4]`  |  `3` | `[D1, (D2), (D3), 1, D4]`
+  `[D1, (D2), (D3), D4]`  |  `4` | `[D1, (D2), (D3), D4, 1]`
+
+  Args:
+    input: The potentially tensor that should be expanded with a new
+      dimension.
+    axis: An integer constant indicating where the new dimension should be
+      inserted.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tensor with the same values as `input`, with an added dimension of
+    size 1 at `axis`.
+
+  #### Examples:
+    ```python
+    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> print rt.shape
+    TensorShape([2, None])
+
+    >>> expanded = ragged.expand_dims(rt, axis=0)
+    >>> print(expanded.shape, expanded)
+    TensorShape([1, None, None]) [[[1, 2], [3]]]
+
+    >>> expanded = ragged.expand_dims(rt, axis=1)
+    >>> print(expanded.shape, expanded)
+    TensorShape([2, None, None]) [[[1, 2]], [[3]]]
+
+    >>> expanded = ragged.expand_dims(rt, axis=2)
+    >>> print(expanded.shape, expanded)
+    TensorShape([2, None, 1]) [[[1], [2]], [[3]]]
+    ```
+  """
+  with ops.name_scope(name, 'RaggedExpandDims', [input]):
+    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        input, name='input')
+
+    if not ragged_tensor.is_ragged(input):
+      return array_ops.expand_dims(input, axis)
+
+    ndims = None if input.shape.ndims is None else input.shape.ndims + 1
+    axis = ragged_util.get_positive_axis(axis, ndims)
+    if axis == 0:
+      values = input
+      splits = array_ops.stack([0, input.nrows()])
+    elif axis == 1:
+      values = input
+      splits = math_ops.range(input.nrows() + 1)
+    else:
+      values = expand_dims(input.values, axis - 1)
+      splits = input.row_splits
+
+    return ragged_tensor.RaggedTensor.from_row_splits(values, splits)
+
+
+#===============================================================================
+# ragged.where
+#===============================================================================
+def where(condition, x=None, y=None, name=None):
+  """Return the elements, either from `x` or `y`, depending on the `condition`.
+
+  : If both `x` and `y` are `None`:
+    Returns the coordinates of true elements of `condition`. The coordinates
+    are returned in a 2-D tensor with shape
+    `[num_true_values, dim_size(condition)]`, where `result[i]` is the
+    coordinates of the `i`th true value (in row-major order).
+
+  : If both `x` and `y` are non-`None`:
+    Returns a tensor formed by selecting values from `x` where condition is
+    true, and from `y` when condition is false.  In particular:
+
+    : If `condition`, `x`, and `y` all have the same shape:
+
+      * `result[i1...iN] = x[i1...iN]` if `condition[i1...iN]` is true.
+      * `result[i1...iN] = y[i1...iN]` if `condition[i1...iN]` is false.
+
+    : Otherwise:
+
+      * `condition` must be a vector.
+      * `x` and `y` must have the same number of dimensions.
+      * The outermost dimensions of `condition`, `x`, and `y` must all have the
+        same size.
+      * `result[i] = x[i]` if `condition[i]` is true.
+      * `result[i] = y[i]` if `condition[i]` is false.
+
+  Args:
+    condition: A potentially ragged tensor of type `bool`
+    x: A potentially ragged tensor (optional).
+    y: A potentially ragged tensor (optional).  Must be specified if `x` is
+      specified.  Must have the same rank and type as `x`.
+    name: A name of the operation (optional)
+
+  Returns:
+    : If both `x` and `y` are `None`:
+      A `Tensor` with shape `(num_true, dim_size(condition))`.
+    : Otherwise:
+      A potentially ragged tensor with the same type, rank, and outermost
+      dimension size as `x` and `y`.
+      `result.ragged_rank = max(x.ragged_rank, y.ragged_rank)`.
+
+  Raises:
+    ValueError: When exactly one of `x` or `y` is non-`None`; or when
+      `condition`, `x`, and `y` have incompatible shapes.
+
+  #### Examples:
+    ```python
+    >>> # Coordinates where condition is true.
+    >>> condition = ragged.constant_value([[True, False, True], [False, True]])
+    >>> ragged.where(condition)
+    [[0, 0], [0, 2], [1, 1]]
+
+    >>> # Elementwise selection between x and y, based on condition.
+    >>> condition = ragged.constant_value([[True, False, True], [False, True]])
+    >>> x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'b', 'C'], ['d', 'E']]
+
+    >>> # Row selection between x and y, based on condition.
+    >>> condition = [True, False]
+    >>> x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'B', 'C'], ['d', 'e']]
+    ```
+  """
+  if (x is None) != (y is None):
+    raise ValueError('x and y must be either both None or both non-None')
+  with ops.name_scope('RaggedWhere', name, [condition, x, y]):
+    condition = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        condition, name='condition')
+    if x is None:
+      return _coordinate_where(condition)
+    else:
+      x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
+      y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
+      return _elementwise_where(condition, x, y)
+
+
+def _elementwise_where(condition, x, y):
+  """Ragged version of tf.where(condition, x, y)."""
+  condition_is_ragged = isinstance(condition, ragged_tensor.RaggedTensor)
+  x_is_ragged = isinstance(x, ragged_tensor.RaggedTensor)
+  y_is_ragged = isinstance(y, ragged_tensor.RaggedTensor)
+
+  if not (condition_is_ragged or x_is_ragged or y_is_ragged):
+    return array_ops.where(condition, x, y)
+
+  elif condition_is_ragged and x_is_ragged and y_is_ragged:
+    return ragged_functional_ops.map_flat_values(array_ops.where, condition, x,
+                                                 y)
+  elif not condition_is_ragged:
+    # Concatenate x and y, and then use `gather` to assemble the selected rows.
+    condition.shape.assert_has_rank(1)
+    x_nrows = _nrows(x)
+    x_and_y = concat([x, y], axis=0)
+    indices = array_ops.where(condition, math_ops.range(x_nrows),
+                              x_nrows + math_ops.range(_nrows(y)))
+    return gather(x_and_y, indices)
+
+  else:
+    raise ValueError('Input shapes do not match.')
+
+
+def _coordinate_where(condition):
+  """Ragged version of tf.where(condition)."""
+  if not isinstance(condition, ragged_tensor.RaggedTensor):
+    return array_ops.where(condition)
+
+  # The coordinate for each `true` value in condition.values.
+  selected_coords = _coordinate_where(condition.values)
+
+  # Convert the first index in each coordinate to a row index and column index.
+  first_index = selected_coords[:, 0]
+  selected_rows = array_ops.gather(condition.value_rowids(), first_index)
+  selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
+  selected_cols = first_index - selected_row_starts
+
+  # Assemble the row & column index with the indices for inner dimensions.
+  return array_ops.concat([
+      array_ops.expand_dims(selected_rows, 1),
+      array_ops.expand_dims(selected_cols, 1), selected_coords[:, 1:]
+  ],
+                          axis=1)
+
+
+#===============================================================================
+# Internal Helper Functions
+#===============================================================================
+
+
+def _increase_ragged_rank_to(rt_input, ragged_rank):
+  """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
+  if ragged_rank > 0:
+    if not ragged_tensor.is_ragged(rt_input):
+      rt_input = ragged_conversion_ops.from_tensor(rt_input)
+    if rt_input.ragged_rank < ragged_rank:
+      rt_input = rt_input.with_values(
+          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
+  return rt_input
+
+
+def _concat_ragged_splits(splits_list):
+  """Concatenates a list of RaggedTensor splits to form a single splits."""
+  pieces = [splits_list[0]]
+  splits_offset = splits_list[0][-1]
+  for splits in splits_list[1:]:
+    pieces.append(splits[1:] + splits_offset)
+    splits_offset += splits[-1]
+  return array_ops.concat(pieces, axis=0)
+
+
+def _nrows(rt_input, out_type=dtypes.int64, name=None):
+  if isinstance(rt_input, ragged_tensor.RaggedTensor):
+    return rt_input.nrows(out_type=out_type, name=name)
+  else:
+    with ops.name_scope(name, 'RaggedNRows', [rt_input]):
+      return array_ops.shape(rt_input, out_type=out_type)[0]
+
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f1ae591f9f2c9dfcf5b405b1c4d7370ab853a6
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
@@ -0,0 +1,201 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.ragged.batch_gather."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
+                              parameterized.TestCase):
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring Example
+      #=========================================================================
+      dict(
+          descr='Docstring example',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d'], [], ['e']]),
+          indices=ragged.constant_value([[1, 2, 0], [], [], [0, 0]]),
+          expected=ragged.constant_value([[b'b', b'c', b'a'], [], [],
+                                          [b'e', b'e']])),
+      #=========================================================================
+      # 0 Batch Dimensions
+      #=========================================================================
+      dict(
+          descr='params: [P1], indices: [I], result: [I]',
+          params=['a', 'b', 'c', 'd'],
+          indices=[3, 2],
+          expected=[b'd', b'c']),
+      dict(
+          descr='params: [P1, (P2)], indices: [I], result: [I, (P2)]',
+          params=ragged.constant_value([['a', 'b'], [], ['c'], ['d', 'e']]),
+          indices=[3, 2],
+          expected=ragged.constant_value([[b'd', b'e'], [b'c']])),
+      #=========================================================================
+      # 1 Batch Dimension
+      #=========================================================================
+      dict(
+          descr='params: [B1, P1], indices: [B1, I], result: [B1, I]',
+          params=[['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']],
+          indices=[[2, 0], [0, 1], [1, 0]],
+          expected=[[b'c', b'a'], [b'd', b'e'], [b'h', b'g']]),
+      dict(
+          descr='params: [B1, (P1)], indices: [B1, I], result: [B1, I]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d', 'e'], ['g']]),
+          indices=[[2, 0], [0, 1], [0, 0]],
+          expected=[[b'c', b'a'], [b'd', b'e'], [b'g', b'g']]),
+      dict(
+          descr='params: [B1, P1], indices: [B1, (I)], result: [B1, (I)]',
+          params=[['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']],
+          indices=ragged.constant_value([[2, 0, 2], [0], [1]]),
+          expected=ragged.constant_value([[b'c', b'a', b'c'], [b'd'], [b'h']])),
+      dict(
+          descr=('params: [B1, (P1), (P2), P3], indices: [B1, I], '
+                 'result: [B1, I, (P2), P3]'),
+          params=ragged.constant_value(
+              [[[['a']], [['b'], ['c']]], [[['d'], ['e']], [['f']]], [[['g']]]],
+              ragged_rank=2),
+          indices=[[1, 0], [0, 1], [0, 0]],
+          expected=ragged.constant_value(
+              [[[[b'b'], [b'c']], [[b'a']]], [[[b'd'], [b'e']], [[b'f']]],
+               [[[b'g']], [[b'g']]]],
+              ragged_rank=2)),
+      #=========================================================================
+      # 2 Batch Dimensions
+      #=========================================================================
+      dict(
+          descr=('params: [B1, B2, P1], indices: [B1, B2, I], '
+                 'result: [B1, B2, I]'),
+          params=[[['a', 'b', 'c']], [['d', 'e', 'f']], [['g', 'h', 'i']]],
+          indices=[[[2, 0]], [[0, 1]], [[1, 0]]],
+          expected=[[[b'c', b'a']], [[b'd', b'e']], [[b'h', b'g']]]),
+      dict(
+          descr=('params: [B1, (B2), P1], indices: [B1, (B2), I], '
+                 'result: [B1, (B2), I]'),
+          params=ragged.constant_value(
+              [[['a', 'b', 'c'], ['d', 'e', 'f']], [['g', 'h', 'i']]],
+              ragged_rank=1),
+          indices=ragged.constant_value([[[2, 0], [0, 1]], [[1, 0]]],
+                                        ragged_rank=1),
+          expected=ragged.constant_value(
+              [[[b'c', b'a'], [b'd', b'e']], [[b'h', b'g']]], ragged_rank=1)),
+      dict(
+          descr=('params: [B1, (B2), (P1)], indices: [B1, (B2), I], '
+                 'result: [B1, (B2), I]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']], [['e', 'f']]],
+                                       ragged_rank=2),
+          indices=ragged.constant_value([[[2, 0], [0, 0]], [[1, 0]]],
+                                        ragged_rank=1),
+          expected=ragged.constant_value(
+              [[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]], ragged_rank=1)),
+      dict(
+          descr=('params: [B1, (B2), P1], indices: [B1, (B2), (I)], '
+                 'result: [B1, (B2), (I)]'),
+          params=ragged.constant_value(
+              [[['a', 'b', 'c'], ['d', 'e', 'f']], [['g', 'h', 'i']]],
+              ragged_rank=1),
+          indices=ragged.constant_value([[[2, 1, 0], [0]], [[1, 1]]],
+                                        ragged_rank=2),
+          expected=ragged.constant_value(
+              [[[b'c', b'b', b'a'], [b'd']], [[b'h', b'h']]], ragged_rank=2)),
+      #=========================================================================
+      # 3 Batch Dimensions
+      #=========================================================================
+      dict(
+          descr=(
+              'params: [B1, (B2), (B3), (P1)], indices: [B1, (B2), (B3), I], '
+              'result: [B1, (B2), (B3), I]'),
+          params=ragged.constant_value(
+              [[[['a', 'b', 'c'], ['d']], [['e', 'f']]]], ragged_rank=3),
+          indices=ragged.constant_value([[[[2, 0], [0, 0]], [[1, 0]]]],
+                                        ragged_rank=2),
+          expected=ragged.constant_value(
+              [[[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]]], ragged_rank=2)),
+  ])
+  def testRaggedBatchGather(self, descr, params, indices, expected):
+    result = ragged.batch_gather(params, indices)
+    self.assertRaggedEqual(result, expected)
+
+  def testRaggedBatchGatherUnknownRankError(self):
+    if context.executing_eagerly():
+      return
+    params = [['a', 'b'], ['c', 'd']]
+    indices = array_ops.placeholder(dtypes.int32, shape=None)
+    ragged_indices = ragged.RaggedTensor.from_row_splits(indices, [0, 2, 4])
+
+    with self.assertRaisesRegexp(
+        ValueError, 'batch_gather does not allow indices with unknown shape.'):
+      ragged.batch_gather(params, indices)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'batch_gather does not allow indices with unknown shape.'):
+      ragged.batch_gather(params, ragged_indices)
+
+  @parameterized.parameters([
+      dict(
+          params=ragged.constant_value([['a'], ['b'], ['c']]),
+          indices=ragged.constant_value([[0], [0]]),
+          message='Dimensions 3 and 2 are not compatible'),
+      dict(
+          params=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          indices=ragged.constant_value([[[0, 0], [0, 0, 0]], [[0]]]),
+          message='batch shape from indices does not match params shape'),
+      dict(  # rank mismatch
+          params=ragged.constant_value([[[0, 0], [0, 0, 0]], [[0]]]),
+          indices=ragged.constant_value([[[0, 0]], [[0, 0, 0]], [[0]]]),
+          error=(ValueError, errors.InvalidArgumentError)),
+      dict(
+          params=ragged.constant_value([[[0, 0], [0, 0, 0]], [[0]], [[0]]]),
+          indices=ragged.constant_value([[[0, 0]], [[0, 0, 0]], [[0]]]),
+          error=errors.InvalidArgumentError,
+          message='.*Condition x == y did not hold.*'),
+      dict(
+          params=ragged.constant_value(['a', 'b', 'c']),
+          indices=ragged.constant_value([[0], [0]]),
+          message='batch shape from indices does not match params shape'),
+      dict(
+          params=ragged.constant_value([['a']]),
+          indices=0,
+          message='indices.rank must be at least 1.'),
+      dict(
+          params=ragged.constant_value([['a']]),
+          indices=[[[0]]],
+          message='batch shape from indices does not match params shape'),
+  ])
+  def testRaggedBatchGatherStaticError(self,
+                                       params,
+                                       indices,
+                                       message=None,
+                                       error=ValueError):
+    with self.assertRaisesRegexp(error, message):
+      ragged.batch_gather(params, indices)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0f7459322792aeafaadd4db18ecd30105e8e74c
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
@@ -0,0 +1,339 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.boolean_mask."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
+                              parameterized.TestCase):
+  # Define short constants for true & false, so the data & mask can be lined
+  # up in the examples below.  This makes it easier to read the examples, to
+  # see which values should be kept vs. masked.
+  T = True
+  F = False
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring examples
+      #=========================================================================
+      dict(
+          descr='Docstring example 1',
+          data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          mask=[[T, F, T], [F, F, F], [T, F, F]],
+          keepdims=False,
+          expected=[1, 3, 7]),
+      dict(
+          descr='Docstring example 2',
+          data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          mask=[[T, F, T], [F, F, F], [T, F, F]],
+          keepdims=True,
+          expected=ragged.constant_value([[1, 3], [], [7]])),
+      dict(
+          descr='Docstring example 3',
+          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=ragged.constant_value([[F, F, T], [F], [T, T]]),
+          keepdims=False,
+          expected=[3, 5, 6]),
+      dict(
+          descr='Docstring example 4',
+          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=ragged.constant_value([[F, F, T], [F], [T, T]]),
+          keepdims=True,
+          expected=ragged.constant_value([[3], [], [5, 6]])),
+      dict(
+          descr='Docstring example 5',
+          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=[True, False, True],
+          keepdims=False,
+          expected=ragged.constant_value([[1, 2, 3], [5, 6]])),
+      #=========================================================================
+      # Uniform data and uniform mask.
+      #=========================================================================
+      dict(
+          descr='data.shape=[7]; mask.shape=[7]; keepdims=True',
+          data=[1, 2, 3, 4, 5, 6, 7],
+          mask=[T, F, T, T, F, F, F],
+          keepdims=True,
+          expected=[1, 3, 4]),
+      dict(
+          descr='data.shape=[5, 3]; mask.shape=[5]; keepdims=True',
+          data=[[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]],
+          mask=[True, False, True, True, False],
+          keepdims=True,
+          expected=[[1, 2, 3], [7, 8, 9], [10, 11, 12]]),
+      dict(
+          descr='data.shape=[5, 3]; mask.shape=[5, 3]; keepdims=True',
+          data=[[1, 2, 3], [4, 5, 6], [7, 8, 9], [0, 1, 2], [3, 4, 5]],
+          mask=[[F, F, F], [T, F, T], [T, T, T], [F, F, F], [T, T, F]],
+          keepdims=True,
+          expected=ragged.constant_value([[], [4, 6], [7, 8, 9], [], [3, 4]])),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3]; keepdims=True',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[F, F, T],
+          keepdims=True,
+          expected=[[[2, 4], [6, 8]]]),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3]; keepdims=False',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[F, F, T],
+          keepdims=False,
+          expected=[[[2, 4], [6, 8]]]),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3, 2]; keepdims=True',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[[T, F], [T, T], [F, F]],
+          keepdims=True,
+          expected=ragged.constant_value([[[1, 2]], [[5, 6], [7, 8]], []],
+                                         ragged_rank=1)),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3, 2]; keepdims=False',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[[T, F], [T, T], [F, F]],
+          keepdims=False,
+          expected=[[1, 2], [5, 6], [7, 8]]),
+      dict(
+          descr='data.shape=[3, 2, 2]; mask.shape=[3, 2, 2]; keepdims=True',
+          data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+          mask=[[[T, T], [F, T]], [[F, F], [F, F]], [[T, F], [T, T]]],
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2], [4]], [[], []], [[2], [6, 8]]])),
+      dict(
+          descr='data.shape=mask.shape=[2, 2, 2, 2]; keepdims=True',
+          data=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                [[[2, 4], [6, 8]], [[1, 3], [5, 7]]]],
+          mask=[[[[T, T], [F, F]], [[T, F], [F, F]]],
+                [[[F, F], [F, F]], [[T, T], [T, F]]]],
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[[1, 2], []], [[5], []]], [[[], []], [[1, 3], [5]]]])),
+      dict(
+          descr='data.shape=mask.shape=[2, 2, 2, 2]; keepdims=False',
+          data=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                [[[2, 4], [6, 8]], [[1, 3], [5, 7]]]],
+          mask=[[[[T, T], [F, F]], [[T, F], [F, F]]],
+                [[[F, F], [F, F]], [[T, T], [T, F]]]],
+          keepdims=False,
+          expected=[1, 2, 5, 1, 3, 5]),
+
+      #=========================================================================
+      # Ragged data and ragged mask.
+      #=========================================================================
+      dict(
+          descr='data.shape=[5, (D2)]; mask.shape=[5, (D2)]',
+          data=ragged.constant_value(
+              [[1, 2], [3, 4, 5, 6], [7, 8, 9], [], [1, 2, 3]]),
+          mask=ragged.constant_value(
+              [[F, F], [F, T, F, T], [F, F, F], [], [T, F, T]]),
+          keepdims=True,
+          expected=ragged.constant_value([[], [4, 6], [], [], [1, 3]])),
+      dict(
+          descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]]),
+          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2]], [[5, 6], [7, 8]], []])),
+      dict(
+          descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]]),
+          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          keepdims=False,
+          expected=ragged.constant_value([[1, 2], [5, 6], [7, 8]])),
+      dict(
+          descr='data.shape=[3, (D2), D3]; mask.shape=[3, (D2)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8], [2, 4]], [[6, 8]]],
+              ragged_rank=1),
+          mask=ragged.constant_value([[T, F], [T, T, F], [F]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2]], [[5, 6], [7, 8]], []],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[3, (D2), D3]; mask.shape=[3, (D2)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
+              ragged_rank=1),
+          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          keepdims=False,
+          expected=[[1, 2], [5, 6], [7, 8]]),
+      dict(
+          descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2), (D3)]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4]]]),
+          mask=ragged.constant_value(
+              [[[T, T], [F, T]], [[F, F], [F, F]], [[T, F]]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2], [4]], [[], []], [[2]]])),
+      dict(
+          descr=('data.shape=[3, (D2), (D3), (D4)]; '
+                 'mask.shape=[3, (D2), (D3), (D4)]'),
+          data=ragged.constant_value(
+              [[[[1, 2], [3, 4]], [[5, 6]]], [[[2, 4], [6, 8]]]]),
+          mask=ragged.constant_value(
+              [[[[T, T], [F, F]], [[T, F]]], [[[F, F], [T, T]]]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[[1, 2], []], [[5]]], [[[], [6, 8]]]])),
+
+      #=========================================================================
+      # Ragged mask and uniform data
+      #=========================================================================
+      dict(
+          descr='data.shape=[2, 3]; mask.shape=[2, (3)]',
+          data=[[1, 2, 3], [4, 5, 6]],
+          mask=ragged.constant_value([[T, F, F], [F, T, T]]),
+          keepdims=True,
+          expected=ragged.constant_value([[1], [5, 6]])),
+      dict(
+          descr='data.shape=[2, 3, 2]; mask.shape=[2, (3)]',
+          data=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 0], [2, 4]]],
+          mask=ragged.constant_value([[T, F, F], [F, T, T]]),
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2]], [[9, 0], [2, 4]]],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[2, 3, 2]; mask.shape=[2, (3), 2]',
+          data=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 0], [2, 4]]],
+          mask=ragged.constant_value(
+              [[[T, F], [F, F], [T, T]], [[T, F], [F, T], [F, F]]],
+              ragged_rank=1),
+          keepdims=True,
+          expected=ragged.constant_value([[[1], [], [5, 6]], [[7], [0], []]])),
+
+      #=========================================================================
+      # Ragged data and uniform mask.
+      #=========================================================================
+      dict(
+          descr='data.shape=[4, (D2)]; mask.shape=[4]',
+          data=ragged.constant_value([[1, 2, 3], [4], [], [5, 6]]),
+          mask=[T, F, T, F],
+          keepdims=False,
+          expected=ragged.constant_value([[1, 2, 3], []])),
+      dict(
+          descr='data.shape=[4, (D2), (D3)]; mask.shape=[4]',
+          data=ragged.constant_value(
+              [[[1, 2, 3]], [[4], []], [[5, 6]], []]),
+          mask=[T, F, T, T],
+          keepdims=False,
+          expected=ragged.constant_value([[[1, 2, 3]], [[5, 6]], []])),
+      dict(
+          descr='data.shape=[4, (D2), 2]; mask.shape=[4]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [], [[5, 6]], [[7, 8], [9, 0], [1, 2]]],
+              ragged_rank=1),
+          mask=[T, F, F, T],
+          keepdims=False,
+          expected=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[7, 8], [9, 0], [1, 2]]],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[4, (D2), 2]; mask.shape=[4]',
+          data=ragged.constant_value(
+              [[[1, 2], [3, 4]], [], [[5, 6]], [[7, 8], [9, 0], [1, 2]]],
+              ragged_rank=1),
+          mask=[T, F, F, T],
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2], [3, 4]], [[7, 8], [9, 0], [1, 2]]],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[1, (2)]; mask.shape=[1, 2]',
+          data=ragged.constant_value([[1, 2]]),
+          mask=[[T, F]],
+          keepdims=True,
+          expected=ragged.constant_value([[1]])),
+      dict(
+          descr='data.shape=[2, (2), (D3)]; mask.shape=[2, 2]',
+          data=ragged.constant_value([[[1], [2, 3]], [[], [4, 5, 6]]]),
+          mask=[[T, F], [T, T]],
+          keepdims=True,
+          expected=ragged.constant_value([[[1]], [[], [4, 5, 6]]])),
+      dict(
+          descr='data.shape=[2, (2), 3]; mask.shape=[2, 2]',
+          data=ragged.constant_value(
+              [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]],
+              ragged_rank=1),
+          mask=[[T, F], [T, T]],
+          keepdims=True,
+          expected=ragged.constant_value(
+              [[[1, 2, 3]], [[7, 8, 9], [2, 4, 6]]],
+              ragged_rank=1)),
+      dict(
+          descr='data.shape=[2, (2), 3]; mask.shape=[2, 2, 3]',
+          data=ragged.constant_value(
+              [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]],
+              ragged_rank=1),
+          mask=[[[T, F, F], [T, F, T]], [[T, F, T], [F, F, F]]],
+          keepdims=True,
+          expected=ragged.constant_value([[[1], [4, 6]], [[7, 9], []]])),
+  ])  # pyformat: disable
+  def testBooleanMask(self, descr, data, mask, keepdims, expected):
+    actual = ragged.boolean_mask(data, mask, keepdims=keepdims)
+    self.assertRaggedEqual(actual, expected)
+
+  def testErrors(self):
+    if not context.executing_eagerly():
+      self.assertRaisesRegexp(ValueError,
+                              r'mask\.shape\.ndims must be kown statically',
+                              ragged.boolean_mask, [[1, 2]],
+                              array_ops.placeholder(dtypes.bool))
+
+    self.assertRaises(TypeError, ragged.boolean_mask, [[1, 2]], [[0, 1]])
+    self.assertRaisesRegexp(
+        ValueError, 'Tensor conversion requested dtype bool for '
+        'RaggedTensor with dtype int32', ragged.boolean_mask,
+        ragged.constant([[1, 2]]), ragged.constant([[0, 0]]))
+
+    self.assertRaisesRegexp(
+        ValueError, r'Shapes \(1, 2\) and \(1, 3\) are incompatible',
+        ragged.boolean_mask, [[1, 2]], [[True, False, True]])
+
+    self.assertRaisesRegexp(errors.InvalidArgumentError,
+                            r'Inputs must have identical ragged splits',
+                            ragged.boolean_mask, ragged.constant([[1, 2]]),
+                            ragged.constant([[True, False, True]]))
+
+    self.assertRaisesRegexp(ValueError, 'mask cannot be scalar',
+                            ragged.boolean_mask, [[1, 2]], True)
+
+    self.assertRaisesRegexp(ValueError,
+                            'mask cannot be scalar', ragged.boolean_mask,
+                            ragged.constant([[1, 2]]), True)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_concat_op_test.py b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e72afb0448f5e7f7f4ab9aebefb712bfd7816133
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
@@ -0,0 +1,323 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.concat."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
+                         parameterized.TestCase):
+
+  def _rt_inputs_to_tensors(self, rt_inputs, ragged_ranks=None):
+    if ragged_ranks is None:
+      ragged_ranks = [None] * len(rt_inputs)
+    return [
+        ragged.constant(rt_input, ragged_rank=rrank)
+        if rrank != 0 else constant_op.constant(rt_input)
+        for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
+    ]
+
+  @parameterized.parameters(
+      dict(
+          descr='Two rank-2 inputs with empty value axis=1',
+          rt_inputs=([[]], [[]]),
+          axis=1,
+          expected=[[]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],   # shape=(3, None)
+              [['b00'], ['b10']]),                    # shape=(2, None)
+          axis=0,
+          expected=[[b'a00', b'a01'], [], [b'a20', b'a21'], [b'b00'],
+                    [b'b10']]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']]),    # shape=(3, None)
+          axis=1,
+          expected=[
+              [b'a00', b'a01', b'b00'],
+              [b'b10', b'b11', b'b12'],
+              [b'a20', b'a21', b'a22', b'b20']]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],   # shape=(3, None)
+              [['b00'], ['b10']]),                    # shape=(2, None)
+          axis=-2,
+          expected=[[b'a00', b'a01'], [], [b'a20', b'a21'], [b'b00'],
+                    [b'b10']]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']]),    # shape=(3, None)
+          axis=-1,
+          expected=[
+              [b'a00', b'a01', b'b00'],
+              [b'b10', b'b11', b'b12'],
+              [b'a20', b'a21', b'a22', b'b20']],
+          expected_shape=[3, None]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10']],                            # shape=(2, None)
+              [['c00'], ['c10', 'c11'], ['c21']]),           # shape=(3, None)
+          axis=0,
+          expected=[[b'a00', b'a01'], [], [b'a20', b'a21', b'a22'], [b'b00'],
+                    [b'b10'], [b'c00'], [b'c10', b'c11'], [b'c21']]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']],     # shape=(3, None)
+              [[], ['c10', 'c11'], ['c20', 'c21']]),         # shape=(3, None)
+          axis=1,
+          expected=[
+              [b'a00', b'a01', b'b00'],
+              [b'b10', b'b11', b'b12', b'c10', b'c11'],
+              [b'a20', b'a21', b'a22', b'b20', b'c20', b'c21']]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=0',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[['b000']], [['b100', 'b101'], ['b110']]],
+              [[], [['c100', 'c101', 'c102', 'c103']], [[], ['c210', 'c211']]]),
+          axis=0,
+          expected=[
+              [[b'a000', b'a001'], [b'a010']],
+              [[b'a100', b'a101', b'a102'], [b'a110', b'a111']],
+              [[b'b000']],
+              [[b'b100', b'b101'], [b'b110']],
+              [],
+              [[b'c100', b'c101', b'c102', b'c103']],
+              [[], [b'c210', b'c211']]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=1',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[['b000']], [['b100', 'b101'], ['b110']]],
+              [[], [[], ['c110', 'c111']]]),
+          axis=1,
+          expected=[
+              [[b'a000', b'a001'], [b'a010'], [b'b000']],
+              [[b'a100', b'a101', b'a102'], [b'a110', b'a111'],
+               [b'b100', b'b101'], [b'b110'], [], [b'c110', b'c111']]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=2',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[[], ['b010', 'b011']], [['b100', 'b101'], ['b110']]],
+              [[['c000'], ['c010']], [[], ['c110', 'c111']]]),
+          axis=2,
+          expected=[
+              [[b'a000', b'a001', b'c000'],
+               [b'a010', b'b010', b'b011', b'c010']],
+              [[b'a100', b'a101', b'a102', b'b100', b'b101'],
+               [b'a110', b'a111', b'b110', b'c110', b'c111']]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=-1',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[[], ['b010', 'b011']], [['b100', 'b101'], ['b110']]],
+              [[['c000'], ['c010']], [[], ['c110', 'c111']]]),
+          axis=-1,
+          expected=[
+              [[b'a000', b'a001', b'c000'],
+               [b'a010', b'b010', b'b011', b'c010']],
+              [[b'a100', b'a101', b'a102', b'b100', b'b101'],
+               [b'a110', b'a111', b'b110', b'c110', b'c111']]]),
+      dict(
+          descr='ragged_concat([uniform, ragged, uniform], axis=1)',
+          ragged_ranks=[0, 1, 0],
+          rt_inputs=(
+              [['0('], ['1('], ['2(']],                   # shape=(3, 1)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']],  # shape=(3, None)
+              [[')0'], [')1'], [')2']]),                  # shape=(3, 1)
+          axis=1,
+          expected=[
+              [b'0(', b'b00', b')0'],
+              [b'1(', b'b10', b'b11', b'b12', b')1'],
+              [b'2(', b'b20', b')2']]),
+      dict(
+          descr='ragged_concat([uniform, uniform], axis=0)',
+          ragged_ranks=[0, 0],
+          rt_inputs=(
+              [['a00', 'a01'], ['a10', 'a11'], ['a20', 'a21']],  # shape=(3, 2)
+              [['b00', 'b01', 'b02'], ['b10', 'b11', 'b12']]),   # shape=(2, 3)
+          axis=0,
+          expected=[
+              [b'a00', b'a01'], [b'a10', b'a11'], [b'a20', b'a21'],
+              [b'b00', b'b01', b'b02'], [b'b10', b'b11', b'b12']],
+          expected_ragged_rank=1),
+      dict(
+          descr='ragged_concat([uniform, ragged], axis=0)',
+          ragged_ranks=[0, 1],
+          rt_inputs=(
+              [['a00', 'a01'], ['a10', 'a11'], ['a20', 'a21']],  # shape=(3, 2)
+              [['b00', 'b01', 'b02'], ['b10', 'b11', 'b12']]),   # shape=(2, 3)
+          axis=0,
+          expected=[
+              [b'a00', b'a01'], [b'a10', b'a11'], [b'a20', b'a21'],
+              [b'b00', b'b01', b'b02'], [b'b10', b'b11', b'b12']]),
+      dict(
+          descr='ragged_concat([uniform, ragged], axis=0) with rank-3 inputs',
+          ragged_ranks=[0, 2],
+          rt_inputs=(
+              [[[0, 1], [2, 3]], [[4, 5], [6, 7]]],  # shape = (2, 2, 2)
+              [[[8], [8, 8]]]),                      # shape = (2, None, None)
+          axis=0,
+          expected=[[[0, 1], [2, 3]], [[4, 5], [6, 7]], [[8], [8, 8]]]),
+      dict(
+          descr='Two rank-3 inputs with ragged_rank=1, axis=-1',
+          ragged_ranks=[1, 1],
+          rt_inputs=(
+              [[[0, 1], [2, 3], [4, 5]], [], [[6, 7], [8, 9]]],
+              [[[9, 8], [7, 6], [5, 4]], [], [[3, 2], [1, 0]]]),
+          axis=-1,
+          expected=[
+              [[0, 1, 9, 8], [2, 3, 7, 6], [4, 5, 5, 4]], [],
+              [[6, 7, 3, 2], [8, 9, 1, 0]]],
+          expected_ragged_rank=1),
+      dict(
+          descr='ragged_concat([vector, vector], axis=0)',
+          ragged_ranks=[0, 0],
+          rt_inputs=([1, 2, 3], [4, 5, 6]),
+          axis=0,
+          expected=[1, 2, 3, 4, 5, 6]),
+      dict(
+          descr='One input (so ragged_conat is a noop)',
+          rt_inputs=([['a00', 'a01'], [], ['a20', 'a21']],),
+          axis=0,
+          expected=[[b'a00', b'a01'], [], [b'a20', b'a21']]),
+  )   # pyformat: disable
+  def testRaggedConcat(self,
+                       descr,
+                       rt_inputs,
+                       axis,
+                       expected,
+                       ragged_ranks=None,
+                       expected_ragged_rank=None,
+                       expected_shape=None):
+    rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
+    concatenated = ragged.concat(rt_inputs, axis)
+    if expected_ragged_rank is not None:
+      self.assertEqual(concatenated.ragged_rank, expected_ragged_rank)
+    if expected_shape is not None:
+      self.assertEqual(concatenated.shape.as_list(), expected_shape)
+    self.assertRaggedEqual(concatenated, expected)
+
+  @parameterized.parameters(
+      dict(
+          rt_inputs=(),
+          axis=0,
+          error=ValueError,
+          message=r'rt_inputs may not be empty\.'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=r'foo',
+          error=TypeError,
+          message='axis must be an int'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=-3,
+          error=ValueError,
+          message='axis=-3 out of bounds: expected -2<=axis<2'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=2,
+          error=ValueError,
+          message='axis=2 out of bounds: expected -2<=axis<2'),
+      dict(
+          ragged_ranks=(0, 0),
+          rt_inputs=([[1, 2]], [[3, 4], [5, 6]]),
+          axis=1,
+          error=(ValueError, errors.InvalidArgumentError)),
+  )
+  def testStaticError(self,
+                      rt_inputs,
+                      axis,
+                      error,
+                      message=None,
+                      ragged_ranks=None):
+    rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
+    self.assertRaisesRegexp(error, message, ragged.concat, rt_inputs, axis)
+
+  @parameterized.parameters([
+      dict(
+          ragged_ranks=(1, 1),
+          rt_inputs=([[1, 2]], [[3, 4], [5, 6]]),
+          axis=1,
+          error=errors.InvalidArgumentError,
+          message='Input tensors have incompatible shapes'),
+  ])
+  def testRuntimeError(self, rt_inputs, axis, error, message,
+                       ragged_ranks=None):
+    if context.executing_eagerly():
+      return
+    rt_inputs = [
+        array_ops.placeholder_with_default(rt, shape=None) for rt in rt_inputs
+    ]
+    concatenated = ragged.concat(rt_inputs, axis)
+    with self.assertRaisesRegexp(error, message):
+      self.evaluate(concatenated)
+
+  def testNegativeAxisWithUnknownRankError(self):
+    if context.executing_eagerly():
+      return
+    rt_inputs = [
+        array_ops.placeholder(dtypes.int64),
+        array_ops.placeholder(dtypes.int64)
+    ]
+    self.assertRaisesRegexp(
+        ValueError, r'axis may only be negative if ndims is statically known.',
+        ragged.concat, rt_inputs, -1)
+
+  def testSingleTensorInput(self):
+    """Tests ragged_concat with a single tensor input.
+
+    Usually, we pass a list of values in for rt_inputs.  However, you can
+    also pass in a single value (as with tf.concat), in which case it simply
+    returns that tensor.  This test exercises that path.
+    """
+    rt_inputs = ragged.constant([[1, 2], [3, 4]])
+    concatenated = ragged.concat(rt_inputs, 0)
+    self.assertRaggedEqual(concatenated, [[1, 2], [3, 4]])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_const_op_test.py b/tensorflow/python/ops/ragged/ragged_const_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c014f7103016104d3cc2e3ecbd18bbf3337a0153
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_const_op_test.py
@@ -0,0 +1,358 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.constant."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConstOpTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
+
+  @parameterized.parameters(
+      #=========================================================================
+      # 0-dimensional tensors.
+      dict(pylist=b'x', expected_shape=()),
+
+      #=========================================================================
+      # 1-dimensional tensors.
+      dict(pylist=[1, 2, 3], expected_shape=(3,)),
+
+      #=========================================================================
+      # 2-dimensional tensors.
+      dict(pylist=[[1, 2, 3], [4], [5, 6]], expected_shape=(3, None)),
+      dict(pylist=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], expected_shape=(3, None)),
+
+      #=========================================================================
+      # 3-dimensional tensors.
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          expected_shape=(3, None, None)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      #=========================================================================
+      # 4-dimensional tensors.
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          expected_shape=(2, None, None, None)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          ragged_rank=1,
+          expected_shape=(2, None, 2, 2)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          inner_shape=(2,),
+          expected_shape=(2, None, None, 2)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          inner_shape=(2, 2),
+          expected_shape=(2, None, 2, 2)),
+
+      #=========================================================================
+      # Empty tensors (no scalar values) w/ default ragged_rank and inner_shape
+      dict(pylist=[], expected_shape=(0,)),
+      dict(pylist=[[], [], []], expected_shape=(3, None)),
+      dict(
+          pylist=[[[], []], [], [[], [[]]]],
+          expected_shape=(3, None, None, None)),
+
+      #=========================================================================
+      # Empty tensors (no scalar values) w/ explicit ragged_rank or inner_shape
+      dict(pylist=[], ragged_rank=1, expected_shape=(0, None)),
+      dict(pylist=[], ragged_rank=2, expected_shape=(0, None, None)),
+      dict(pylist=[], inner_shape=(0, 100, 20), expected_shape=(0, 100, 20)),
+      dict(
+          pylist=[],
+          ragged_rank=1,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, 100, 20)),
+      dict(
+          pylist=[],
+          ragged_rank=2,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, None, 100, 20)),
+      dict(pylist=[[], [], []], ragged_rank=2, expected_shape=(3, None, None)),
+      dict(pylist=[], inner_shape=(0,), expected_shape=(0,)),
+      dict(pylist=[[]], inner_shape=(1, 0), expected_shape=(1, 0)),
+
+      #=========================================================================
+      # default/inferred dtypes
+      dict(pylist=[], expected_dtype=dtypes.float32),
+      dict(pylist=[[[], [[[]], []]]], expected_dtype=dtypes.float32),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], expected_dtype=dtypes.int32),
+      dict(pylist=[[1., 2.], [], [4., 5., 6.]], expected_dtype=dtypes.float32),
+      dict(pylist=[[1, 2], [3.], [4, 5, 6]], expected_dtype=dtypes.float32),
+      dict(pylist=[[b'a', b'b'], [b'c']], expected_dtype=dtypes.string),
+      dict(pylist=[[True]], expected_dtype=dtypes.bool),
+
+      #=========================================================================
+      # explicit dtypes
+      dict(pylist=[], dtype=dtypes.float32),
+      dict(pylist=[], dtype=dtypes.string),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=dtypes.int64),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=dtypes.int32),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=dtypes.float32),
+      dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=dtypes.float16),
+      dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=dtypes.float32),
+      dict(pylist=[[b'a', b'b'], [b'c'], [b'd', b'e', b'f']],
+           dtype=dtypes.string),
+  )
+  def testRaggedConst(self,
+                      pylist,
+                      dtype=None,
+                      ragged_rank=None,
+                      inner_shape=None,
+                      expected_shape=None,
+                      expected_dtype=None):
+    """Tests that `ragged_const(pylist).eval().tolist() == pylist`.
+
+    Args:
+      pylist: The `pylist` argument for `ragged_const()`.
+      dtype: The `dtype` argument for `ragged_const()`.  If not None, then also
+        test that the resulting ragged tensor has this `dtype`.
+      ragged_rank: The `ragged_rank` argument for `ragged_const()`.  If not
+        None, then also test that the resulting ragged tensor has this
+        `ragged_rank`.
+      inner_shape: The `inner_shape` argument for `ragged_const()`.  If not
+        None, then also test that the resulting ragged tensor has this
+        `inner_shape`.
+      expected_shape: The expected shape for the resulting ragged tensor.
+      expected_dtype: The expected dtype for the resulting ragged tensor (used
+        to test default/inferred types when dtype=None).
+    """
+    rt = ragged.constant(
+        pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
+
+    # If dtype was explicitly specified, check it.
+    if dtype is not None:
+      self.assertEqual(rt.dtype, dtype)
+    if expected_dtype is not None:
+      self.assertEqual(rt.dtype, expected_dtype)
+
+    # If ragged_rank was explicitly specified, check it.
+    if ragged_rank is not None:
+      if isinstance(rt, ragged.RaggedTensor):
+        self.assertEqual(rt.ragged_rank, ragged_rank)
+      else:
+        self.assertEqual(0, ragged_rank)
+
+    # If inner_shape was explicitly specified, check it.
+    if inner_shape is not None:
+      if isinstance(rt, ragged.RaggedTensor):
+        self.assertEqual(rt.flat_values.shape.as_list()[1:], list(inner_shape))
+      else:
+        self.assertEqual(rt.shape.as_list(), list(inner_shape))
+
+    if expected_shape is not None:
+      self.assertEqual(tuple(rt.shape.as_list()), expected_shape)
+
+    self.assertRaggedEqual(rt, pylist)
+
+  @parameterized.parameters(
+      dict(
+          pylist=12,
+          ragged_rank=1,
+          exception=ValueError,
+          message='Invalid pylist=12: incompatible with ragged_rank=1'),
+      dict(
+          pylist=12,
+          inner_shape=(1,),
+          exception=ValueError,
+          message='Invalid pylist=12: incompatible with '
+          'dim\\(inner_shape\\)=1'),
+      dict(
+          pylist=[[[1], [2]]],
+          ragged_rank=-1,
+          exception=ValueError,
+          message='Invalid ragged_rank=-1: must be nonnegative'),
+      dict(
+          pylist=[[1, [2]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[[1]], [[[2]]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[1], [[]]],
+          exception=ValueError,
+          message='Invalid pylist=.*: empty list nesting is greater '
+          'than scalar value nesting'),
+      dict(
+          pylist=[1, 2, 3],
+          ragged_rank=1,
+          exception=ValueError,
+          message='pylist has scalar values depth 1, but ragged_rank=1 '
+          'requires scalar value depth greater than 1'),
+      dict(
+          pylist=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          ragged_rank=2,
+          exception=ValueError,
+          message='pylist has scalar values depth 2, but ragged_rank=2 '
+          'requires scalar value depth greater than 2'),
+      dict(pylist=[1, 2, 3], inner_shape=(1, 1), exception=TypeError),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          inner_shape=(2, 2),
+          ragged_rank=1,
+          exception=ValueError,
+          message='Invalid pylist=.*: incompatible with ragged_rank=1 and '
+          'dim\\(inner_shape\\)=2'),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8, 9]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+      dict(
+          pylist=[[[], [[]]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+  )
+  def testRaggedConstError(self,
+                           pylist,
+                           dtype=None,
+                           ragged_rank=None,
+                           inner_shape=None,
+                           exception=None,
+                           message=None):
+    """Tests that `ragged_const()` raises an expected exception."""
+    self.assertRaisesRegexp(
+        exception,
+        message,
+        ragged.constant,
+        pylist,
+        dtype=dtype,
+        ragged_rank=ragged_rank,
+        inner_shape=inner_shape)
+
+  @parameterized.parameters([
+      dict(pylist=9, scalar_depth=0, max_depth=0),
+      dict(pylist=[9], scalar_depth=1, max_depth=1),
+      dict(pylist=[1, 2, 3], scalar_depth=1, max_depth=1),
+      dict(pylist=[[1], [2]], scalar_depth=2, max_depth=2),
+      dict(pylist=[[[1], [2]], [[3]]], scalar_depth=3, max_depth=3),
+      dict(pylist=[], scalar_depth=None, max_depth=1),
+      dict(pylist=[[]], scalar_depth=None, max_depth=2),
+      dict(pylist=[[], [], []], scalar_depth=None, max_depth=2),
+      dict(pylist=[[[], []], [[], [[[]]]], []], scalar_depth=None, max_depth=5),
+      dict(
+          pylist=[1, [2]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[1], 2],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[[[1]], []], [[2]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+  ])
+  def testScalarAndMaxDepthHelper(self,
+                                  pylist,
+                                  scalar_depth=None,
+                                  max_depth=None,
+                                  exception=None,
+                                  message=None):
+    """Tests for the _find_scalar_and_max_depth helper function."""
+    if exception is not None:
+      self.assertRaisesRegexp(
+          exception, message,
+          ragged.ragged_factory_ops._find_scalar_and_max_depth, pylist)
+    else:
+      self.assertEqual(
+          ragged.ragged_factory_ops._find_scalar_and_max_depth(pylist),
+          (scalar_depth, max_depth))
+
+  @parameterized.parameters([
+      dict(pylist=[[1], [2, 3]], ragged_rank=1, inner_shape=()),
+      dict(
+          pylist=[[[1], [2]], [[3], [4], [5]]], ragged_rank=1,
+          inner_shape=(1,)),
+      dict(pylist=[[[1], [2]], [[3], [4], [5]]], ragged_rank=2, inner_shape=()),
+      dict(
+          pylist=[[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]]],
+          ragged_rank=1,
+          inner_shape=(2, 3)),
+      dict(
+          pylist=[[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]]],
+          ragged_rank=2,
+          inner_shape=(3,)),
+      dict(
+          pylist=[[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]]],
+          ragged_rank=3,
+          inner_shape=()),
+      dict(
+          pylist=[[[1], [2, 3]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+      dict(
+          pylist=[[[1], [[2]]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+      dict(
+          pylist=[[[[1]], [2]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+  ])
+  def testDefaultInnerShapeForPylistHelper(self,
+                                           pylist,
+                                           ragged_rank,
+                                           inner_shape=None,
+                                           exception=None,
+                                           message=None):
+    """Tests for the _default_inner_shape_for_pylist helper function."""
+    if exception is not None:
+      self.assertRaisesRegexp(
+          exception, message,
+          ragged.ragged_factory_ops._default_inner_shape_for_pylist, pylist,
+          ragged_rank)
+    else:
+      self.assertEqual(
+          ragged.ragged_factory_ops._default_inner_shape_for_pylist(
+              pylist, ragged_rank), inner_shape)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..56768a9a479d0d3b568f4ff4b7f102837e26171d
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
@@ -0,0 +1,272 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.constant_value."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConstantValueOpTest(ragged_test_util.RaggedTensorTestCase,
+                                parameterized.TestCase):
+
+  @parameterized.parameters(
+      #=========================================================================
+      # 0-dimensional tensors.
+      dict(pylist='x', expected_shape=()),
+
+      #=========================================================================
+      # 1-dimensional tensors.
+      dict(pylist=[1, 2, 3], expected_shape=(3,)),
+
+      #=========================================================================
+      # 2-dimensional tensors.
+      dict(pylist=[[1, 2, 3], [4], [5, 6]], expected_shape=(3, None)),
+      dict(pylist=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], expected_shape=(3, None)),
+
+      #=========================================================================
+      # 3-dimensional tensors.
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          expected_shape=(3, None, None)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          ragged_rank=1,
+          inner_shape=(2,),
+          expected_shape=(3, None, 2)),
+      #=========================================================================
+      # 4-dimensional tensors.
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          expected_shape=(2, None, None, None)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          ragged_rank=1,
+          expected_shape=(2, None, 2, 2)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          inner_shape=(2,),
+          expected_shape=(2, None, None, 2)),
+      dict(
+          pylist=[[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+                  [[[2, 4], [6, 8]], [[1, 5], [7, 9]]]],
+          inner_shape=(2, 2),
+          expected_shape=(2, None, 2, 2)),
+
+      #=========================================================================
+      # Empty tensors (no scalar values) w/ default ragged_rank and inner_shape
+      dict(pylist=[], expected_shape=(0,)),
+      dict(pylist=[[], [], []], expected_shape=(3, None)),
+      dict(
+          pylist=[[[], []], [], [[], [[]]]],
+          expected_shape=(3, None, None, None)),
+
+      #=========================================================================
+      # Empty tensors (no scalar values) w/ explicit ragged_rank or inner_shape
+      dict(pylist=[], ragged_rank=1, expected_shape=(0, None)),
+      dict(pylist=[], ragged_rank=2, expected_shape=(0, None, None)),
+      dict(pylist=[], inner_shape=(0, 100, 20), expected_shape=(0, 100, 20)),
+      dict(
+          pylist=[],
+          ragged_rank=1,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, 100, 20)),
+      dict(
+          pylist=[],
+          ragged_rank=2,
+          inner_shape=(100, 20),
+          expected_shape=(0, None, None, 100, 20)),
+      dict(pylist=[[], [], []], ragged_rank=2, expected_shape=(3, None, None)),
+      dict(pylist=[], inner_shape=(0,), expected_shape=(0,)),
+      dict(pylist=[[]], inner_shape=(1, 0), expected_shape=(1, 0)),
+
+      #=========================================================================
+      # default/inferred dtypes.
+      #
+      # Note: numpy has different default/inferred types than tensorflow.
+      # Since we are using values, not tensors, we get the default numpy types
+      # here.
+      dict(pylist=[], expected_dtype=np.float64),
+      dict(pylist=[[[], [[[]], []]]], expected_dtype=np.float64),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], expected_dtype=np.int64),
+      dict(pylist=[[1., 2.], [], [4., 5., 6.]], expected_dtype=np.float64),
+      dict(pylist=[[1, 2], [3.], [4, 5, 6]], expected_dtype=np.float64),
+      dict(pylist=[[b'a', b'b'], [b'c']], expected_dtype=np.dtype('S1')),
+      dict(pylist=[[True]], expected_dtype=np.bool),
+
+      #=========================================================================
+      # explicit dtypes
+      dict(pylist=[], dtype=np.float32),
+      dict(pylist=[], dtype=np.dtype('S1')),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=np.int64),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=np.int32),
+      dict(pylist=[[1, 2], [3], [4, 5, 6]], dtype=np.float32),
+      dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=np.float16),
+      dict(pylist=[[1., 2.], [3.], [4., 5., 6.]], dtype=np.float32),
+      dict(
+          pylist=[[b'a', b'b'], [b'c'], [b'd', b'e', b'f']],
+          dtype=np.dtype('S1')),
+  )
+  def testRaggedValues(self,
+                       pylist,
+                       dtype=None,
+                       ragged_rank=None,
+                       inner_shape=None,
+                       expected_shape=None,
+                       expected_dtype=None):
+    """Tests that `ragged_value(pylist).to_list() == pylist`."""
+    rt = ragged.constant_value(
+        pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
+
+    # If dtype was explicitly specified, check it.
+    if dtype is not None:
+      self.assertEqual(rt.dtype, dtype)
+    if expected_dtype is not None:
+      self.assertEqual(rt.dtype, expected_dtype)
+
+    # If ragged_rank was explicitly specified, check it.
+    if ragged_rank is not None:
+      if isinstance(rt, ragged.RaggedTensorValue):
+        self.assertEqual(rt.ragged_rank, ragged_rank)
+      else:
+        self.assertEqual(0, ragged_rank)
+
+    # If inner_shape was explicitly specified, check it.
+    if inner_shape is not None:
+      if isinstance(rt, ragged.RaggedTensorValue):
+        self.assertEqual(rt.flat_values.shape[1:], inner_shape)
+      else:
+        self.assertEqual(rt.shape, inner_shape)
+
+    if expected_shape is not None:
+      self.assertEqual(tuple(rt.shape), expected_shape)
+
+    if rt.shape:
+      if isinstance(rt, ragged.RaggedTensorValue):
+        self.assertEqual(rt.to_list(), pylist)
+      else:
+        self.assertEqual(rt.tolist(), pylist)
+      if expected_shape is not None:
+        self.assertEqual(rt.shape, expected_shape)
+    else:
+      self.assertEqual(rt, pylist)
+      if expected_shape is not None:
+        self.assertEqual((), expected_shape)
+
+  @parameterized.parameters(
+      dict(
+          pylist=12,
+          ragged_rank=1,
+          exception=ValueError,
+          message='Invalid pylist=12: incompatible with ragged_rank=1'),
+      dict(
+          pylist=12,
+          inner_shape=(1,),
+          exception=ValueError,
+          message='Invalid pylist=12: incompatible with '
+          'dim\\(inner_shape\\)=1'),
+      dict(
+          pylist=[[[1], [2]]],
+          ragged_rank=-1,
+          exception=ValueError,
+          message='Invalid ragged_rank=-1: must be nonnegative'),
+      dict(
+          pylist=[[1, [2]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[[1]], [[[2]]]],
+          exception=ValueError,
+          message='all scalar values must have the same nesting depth'),
+      dict(
+          pylist=[[1], [[]]],
+          exception=ValueError,
+          message='Invalid pylist=.*: empty list nesting is greater '
+          'than scalar value nesting'),
+      dict(
+          pylist=[1, 2, 3],
+          ragged_rank=1,
+          exception=ValueError,
+          message='pylist has scalar values depth 1, but ragged_rank=1 '
+          'requires scalar value depth greater than 1'),
+      dict(
+          pylist=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          ragged_rank=2,
+          exception=ValueError,
+          message='pylist has scalar values depth 2, but ragged_rank=2 '
+          'requires scalar value depth greater than 2'),
+      dict(
+          pylist=[1, 2, 3],
+          inner_shape=(1, 1),
+          exception=ValueError,
+          message='cannot reshape array'),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          inner_shape=(2, 2),
+          ragged_rank=1,
+          exception=ValueError,
+          message='Invalid pylist=.*: incompatible with ragged_rank=1 and '
+          'dim\\(inner_shape\\)=2'),
+      dict(
+          pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8, 9]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+      dict(
+          pylist=[[[], [[]]]],
+          ragged_rank=1,
+          exception=ValueError,
+          message='inner values have inconsistent shape'),
+  )
+  def testRaggedValuesError(self,
+                            pylist,
+                            dtype=None,
+                            ragged_rank=None,
+                            inner_shape=None,
+                            exception=None,
+                            message=None):
+    """Tests that `ragged.constant_value()` raises an expected exception."""
+    self.assertRaisesRegexp(
+        exception,
+        message,
+        ragged.constant_value,
+        pylist,
+        dtype=dtype,
+        ragged_rank=ragged_rank,
+        inner_shape=inner_shape)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_conversion_ops.py b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..854c5b303c81d089baf78119ca8525a51e7a83c4
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
@@ -0,0 +1,44 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops to convert between RaggedTensors and other tensor types."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+def from_tensor(tensor, lengths=None, padding=None, ragged_rank=1, name=None):
+  if ragged_tensor.is_ragged(tensor):
+    return tensor
+  else:
+    return ragged_tensor.RaggedTensor.from_tensor(tensor, lengths, padding,
+                                                  ragged_rank, name)
+
+
+def to_tensor(rt_input, default_value=None, name=None):
+  if ragged_tensor.is_ragged(rt_input):
+    return rt_input.to_tensor(default_value, name)
+  else:
+    return rt_input
+
+
+def to_sparse(rt_input, name=None):
+  return rt_input.to_sparse(name)
+
+
+def from_sparse(st_input, name=None):
+  return ragged_tensor.RaggedTensor.from_sparse(st_input, name)
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c74f7be62de0746418f57b2b2c06c31f2a5a4f5
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_dispatch.py
@@ -0,0 +1,440 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operator dispatch for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_shape
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.util import dispatch
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+
+# @TODO(edloper): Set this to True in the CL that exports RaggedTensors.
+_UPDATE_DOCSTRINGS = False
+
+# Information about an argument to an operation: The name of the argument, its
+# position in the argument list, and a boolean flag indicating whether it
+# expects a list of tensors.
+_ArgInfo = collections.namedtuple('ArgInfo', ['name', 'position', 'is_list'])
+
+
+def _get_arg_infos(func, arg_names):
+  """Returns an `_ArgInfo` for each argument of `func` specified by `arg_names`.
+
+  Args:
+    func: The function whose arguments should be described.
+    arg_names: The names of the arguments to get info for.
+
+  Returns:
+    A tuple of `_ArgInfo`s.
+  """
+  arg_infos = []
+
+  # Inspect the func's argspec to find the position of each arg.
+  arg_spec = tf_inspect.getargspec(func)
+  for argname in arg_names:
+    assert isinstance(argname, str)
+    is_list = argname.startswith('[') and argname.endswith(']')
+    if is_list:
+      argname = argname[1:-1]
+    if argname not in arg_spec.args:
+      raise ValueError('Argument %r not found function in %s.  Args=%s' %
+                       (argname, func, arg_spec.args))
+    arg_infos.append(_ArgInfo(argname, arg_spec.args.index(argname), is_list))
+  return arg_infos
+
+
+def _is_convertible_to_tensor(value):
+  """Returns true if `value` is convertible to a `Tensor`."""
+  if isinstance(value,
+                (ops.Tensor, variables.Variable, np.ndarray, int, float, str)):
+    return True
+  elif isinstance(value, (sparse_tensor.SparseTensor,)):
+    return False
+  else:
+    try:
+      ops.convert_to_tensor(value)
+      return True
+    except (TypeError, ValueError):
+      return False
+
+
+class UnaryRaggedElementwiseDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for unary ops that map a base op across ragged values."""
+
+  def __init__(self, original_op, arg_is_list=False):
+    self._original_op = original_op
+    self._arg_is_list = arg_is_list
+    arg_names = tf_inspect.getfullargspec(original_op)[0]
+    self._x = arg_names[0]
+    if _UPDATE_DOCSTRINGS:
+      original_op.__doc__ = (
+          original_op.__doc__.rstrip() + '\n\n' +
+          '    `{x}` may be a `tf.RaggedTensor`.\n'.format(x=self._x))
+
+  def handle(self, args, kwargs):
+    if args:
+      x, args = args[0], args[1:]
+    else:
+      kwargs = kwargs.copy()
+      x = kwargs.pop(self._x, None)
+    if x is None:
+      return self.NOT_SUPPORTED
+    if self._arg_is_list:
+      found_ragged = False
+      for elt in x:
+        if ragged_tensor.is_ragged(elt):
+          found_ragged = True
+        elif not _is_convertible_to_tensor(elt):
+          return self.NOT_SUPPORTED
+      if found_ragged:
+        nested_splits_lists = [
+            elt.nested_row_splits for elt in x if ragged_tensor.is_ragged(elt)
+        ]
+        flat_values = [
+            elt.flat_values if ragged_tensor.is_ragged(elt) else elt
+            for elt in x
+        ]
+        with ops.control_dependencies(
+            ragged_util.assert_splits_match(nested_splits_lists)):
+          return ragged_tensor.RaggedTensor.from_nested_row_splits(
+              self._original_op(flat_values, *args, **kwargs),
+              nested_splits_lists[0])
+      else:
+        return self.NOT_SUPPORTED
+    else:
+      found_ragged = ragged_tensor.is_ragged(x)
+      if found_ragged:
+        mapped_values = self._original_op(x.flat_values, *args, **kwargs)
+        return x.with_flat_values(mapped_values)
+      else:
+        return self.NOT_SUPPORTED
+
+
+class BinaryRaggedElementwiseDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for binary ops that map a base op across ragged values.
+
+  Supports broadcasting.
+  """
+
+  def __init__(self, original_op):
+    self._original_op = original_op
+    arg_names = tf_inspect.getfullargspec(original_op)[0]
+    self._x = arg_names[0]
+    self._y = arg_names[1]
+    if _UPDATE_DOCSTRINGS:
+      original_op.__doc__ = (
+          original_op.__doc__.rstrip() + '\n\n' +
+          '    `{x}` and `{y}` may be a `tf.RaggedTensor`.\n'.format(
+              x=self._x, y=self._y))
+
+  def handle(self, args, kwargs):
+    # Extract the binary args.
+    if len(args) > 1:
+      x = args[0]
+      y = args[1]
+      args = args[2:]
+    elif args:
+      kwargs = kwargs.copy()
+      x = args[0]
+      y = kwargs.pop(self._y, None)
+      args = args[1:]
+    else:
+      kwargs = kwargs.copy()
+      x = kwargs.pop(self._x, None)
+      y = kwargs.pop(self._y, None)
+
+    # Bail if we don't have at least one ragged argument.
+    x_is_ragged = ragged_tensor.is_ragged(x)
+    y_is_ragged = ragged_tensor.is_ragged(y)
+    if not (x_is_ragged or y_is_ragged):
+      return self.NOT_SUPPORTED
+
+    # Convert args to tensors.  Bail if conversion fails.
+    try:
+      if not x_is_ragged:
+        x = ops.convert_to_tensor(x, name=self._x, preferred_dtype=y.dtype)
+      if not y_is_ragged:
+        y = ops.convert_to_tensor(y, name=self._y, preferred_dtype=x.dtype)
+    except (TypeError, ValueError):
+      return self.NOT_SUPPORTED
+
+    if ((x_is_ragged and y_is_ragged) or
+        (x_is_ragged and x.flat_values.shape.ndims <= y.shape.ndims) or
+        (y_is_ragged and y.flat_values.shape.ndims <= x.shape.ndims)):
+      bcast_shape = ragged_tensor_shape.broadcast_dynamic_shape(
+          ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(x),
+          ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(y))
+      x = ragged_tensor_shape.broadcast_to(
+          x, bcast_shape, broadcast_inner_dimensions=False)
+      y = ragged_tensor_shape.broadcast_to(
+          y, bcast_shape, broadcast_inner_dimensions=False)
+
+    x_values = x.flat_values if ragged_tensor.is_ragged(x) else x
+    y_values = y.flat_values if ragged_tensor.is_ragged(y) else y
+    mapped_values = self._original_op(x_values, y_values, *args, **kwargs)
+    if ragged_tensor.is_ragged(x):
+      return x.with_flat_values(mapped_values)
+    else:
+      return y.with_flat_values(mapped_values)
+
+
+class RaggedDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for ragged ops.
+
+  Dispatches to a wrapped op-handler if at least one of the `tensor_args`
+  arguments is a RaggedTensor or a RaggedTensorValue; and all of the
+  `tensor_args` arguments are convertible to Tensor or RaggedTensor.
+  """
+
+  def __init__(self, original_op, ragged_op, ragged_args):
+    op_arg_names = tf_inspect.getfullargspec(original_op)[0]
+    ragged_arg_names = tf_inspect.getfullargspec(ragged_op)[0]
+    if op_arg_names != ragged_arg_names:
+      raise AssertionError(
+          'Signature must exactly match when overriding %s with %s: %s vs %s' %
+          (original_op, ragged_op, op_arg_names, ragged_arg_names))
+    self._ragged_op = ragged_op
+    self._ragged_args = _get_arg_infos(ragged_op, ragged_args)
+    if _UPDATE_DOCSTRINGS:
+      arg_list = ' and '.join('`%s`' % arg for arg in ragged_args)
+      original_op.__doc__ = (
+          original_op.__doc__.rstrip() + '\n\n' +
+          '    {0} may be a `tf.RaggedTensor`.\n'.format(arg_list))
+
+  def handle(self, args, kwargs):
+    if self.is_supported(args, kwargs):
+      return self._ragged_op(*args, **kwargs)
+    else:
+      return self.NOT_SUPPORTED
+
+  def is_supported(self, args, kwargs):
+    found_ragged = False
+    for arg_info in self._ragged_args:
+      if arg_info.position < len(args):
+        arg = args[arg_info.position]
+      else:
+        arg = kwargs.get(arg_info.name, None)
+
+      if arg_info.is_list:
+        if not isinstance(arg, (list, tuple)):
+          return False
+        for elt in arg:
+          if ragged_tensor.is_ragged(elt):
+            found_ragged = True
+          elif not _is_convertible_to_tensor(elt):
+            return False
+      else:
+        if ragged_tensor.is_ragged(arg):
+          found_ragged = True
+        elif not _is_convertible_to_tensor(arg):
+          return False
+    return found_ragged
+
+
+def ragged_dispatch(original_op, tensor_args):
+
+  def decorator(ragged_op):
+    dispatch.RaggedDispatcher(original_op, ragged_op,
+                              tensor_args).register(original_op)
+    return ragged_op
+
+  return decorator
+
+
+_UNARY_ELEMENTWISE_OPS = [
+    array_ops.check_numerics,
+    array_ops.identity,
+    array_ops.ones_like,
+    array_ops.ones_like_v2,
+    array_ops.zeros_like,
+    array_ops.zeros_like_v2,
+    clip_ops.clip_by_value,
+    math_ops.abs,
+    math_ops.acos,
+    math_ops.acosh,
+    math_ops.angle,
+    math_ops.asin,
+    math_ops.asinh,
+    math_ops.atan,
+    math_ops.atanh,
+    math_ops.cast,
+    math_ops.ceil,
+    math_ops.conj,
+    math_ops.cos,
+    math_ops.cosh,
+    math_ops.digamma,
+    math_ops.erf,
+    math_ops.erfc,
+    math_ops.exp,
+    math_ops.expm1,
+    math_ops.floor,
+    math_ops.imag,
+    math_ops.is_finite,
+    math_ops.is_inf,
+    math_ops.is_nan,
+    math_ops.lgamma,
+    math_ops.log,
+    math_ops.log1p,
+    math_ops.log_sigmoid,
+    math_ops.logical_not,
+    math_ops.negative,
+    math_ops.real,
+    math_ops.reciprocal,
+    math_ops.rint,
+    math_ops.round,
+    math_ops.rsqrt,
+    math_ops.saturate_cast,
+    math_ops.sign,
+    math_ops.sin,
+    math_ops.sinh,
+    math_ops.sqrt,
+    math_ops.square,
+    math_ops.tan,
+    parsing_ops.decode_compressed,
+    string_ops.string_to_number,
+    string_ops.string_to_hash_bucket,
+    string_ops.as_string,
+    string_ops.decode_base64,
+    string_ops.encode_base64,
+    string_ops.regex_full_match,
+    string_ops.regex_replace,
+    string_ops.string_strip,
+    string_ops.string_to_hash_bucket,
+    string_ops.string_to_hash_bucket_fast,
+    string_ops.string_to_hash_bucket_strong,
+    string_ops.substr,
+    string_ops.substr_v2,
+    string_ops.string_length,
+    string_ops.string_length_v2,
+    string_ops.unicode_script,
+]
+
+_UNARY_LIST_ELEMENTWISE_OPS = [
+    math_ops.add_n,
+    string_ops.string_join,
+]
+
+_BINARY_ELEMENTWISE_OPS = [
+    math_ops.add,
+    math_ops.atan2,
+    math_ops.complex,
+    math_ops.div_no_nan,
+    math_ops.divide,
+    math_ops.equal,
+    math_ops.floordiv,
+    math_ops.floormod,
+    math_ops.greater,
+    math_ops.greater_equal,
+    math_ops.less,
+    math_ops.less_equal,
+    math_ops.logical_and,
+    math_ops.logical_or,
+    math_ops.logical_xor,
+    math_ops.maximum,
+    math_ops.minimum,
+    math_ops.multiply,
+    math_ops.not_equal,
+    math_ops.pow,
+    math_ops.realdiv,
+    math_ops.squared_difference,
+    math_ops.subtract,
+    math_ops.truediv,
+    math_ops.truncatediv,
+    math_ops.truncatemod,
+]
+
+# (original_op, ragged_op, ragged_args)
+_RAGGED_DISPATCH_OPS = [
+    (array_ops.batch_gather, ragged_array_ops.batch_gather,
+     ['params', 'indices']),
+    (array_ops.concat, ragged_array_ops.concat, ['values']),
+    (array_ops.expand_dims_v2, ragged_array_ops.expand_dims, ['input']),
+    (array_ops.gather_v2, ragged_array_ops.gather, ['params', 'indices']),
+    (array_ops.gather_nd, ragged_array_ops.gather_nd, ['params', 'indices']),
+    (array_ops.stack, ragged_array_ops.stack, ['values']),
+    (array_ops.tile, ragged_array_ops.tile, ['input']),
+    (array_ops.where, ragged_array_ops.where, ['condition', 'x', 'y']),
+    (math_ops.unsorted_segment_sum, ragged_math_ops.segment_sum,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_prod, ragged_math_ops.segment_prod,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_min, ragged_math_ops.segment_min,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_max, ragged_math_ops.segment_max,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_mean, ragged_math_ops.segment_mean,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_sqrt_n, ragged_math_ops.segment_sqrt_n,
+     ['data', 'segment_ids']),
+    (math_ops.reduce_sum, ragged_math_ops.reduce_sum, ['input_tensor']),
+    (math_ops.reduce_prod, ragged_math_ops.reduce_prod, ['input_tensor']),
+    (math_ops.reduce_min, ragged_math_ops.reduce_min, ['input_tensor']),
+    (math_ops.reduce_max, ragged_math_ops.reduce_max, ['input_tensor']),
+    (math_ops.reduce_mean, ragged_math_ops.reduce_mean, ['input_tensor']),
+    (math_ops.reduce_any, ragged_math_ops.reduce_any, ['input_tensor']),
+    (math_ops.reduce_all, ragged_math_ops.reduce_all, ['input_tensor']),
+]
+
+
+def register_dispatchers():
+  """Constructs & registers OpDispatchers for ragged ops."""
+
+  op_list = (
+      _UNARY_ELEMENTWISE_OPS + _UNARY_LIST_ELEMENTWISE_OPS +
+      _BINARY_ELEMENTWISE_OPS + [x[0] for x in _RAGGED_DISPATCH_OPS])
+  for op in op_list:
+    _, undecorated_op = tf_decorator.unwrap(op)
+    if not hasattr(undecorated_op, tf_export.API_ATTRS['tensorflow'].names):
+      raise AssertionError('Expected %s to be an exported symbol '
+                           '(while adding a RaggedTensor dispatcher)')
+
+  for op in _UNARY_ELEMENTWISE_OPS:
+    UnaryRaggedElementwiseDispatcher(op).register(op)
+
+  for op in _UNARY_LIST_ELEMENTWISE_OPS:
+    UnaryRaggedElementwiseDispatcher(op, True).register(op)
+
+  for op in _BINARY_ELEMENTWISE_OPS:
+    BinaryRaggedElementwiseDispatcher(op).register(op)
+
+  for (original_op, ragged_op, args) in _RAGGED_DISPATCH_OPS:
+    RaggedDispatcher(original_op, ragged_op, args).register(original_op)
+
+  docstring = (
+      '\n\n### Additional ops that support `RaggedTensor`\n\n' + '\n'.join([
+          '* `tf.%s`' % tf_export.get_canonical_name_for_symbol(op)
+          for op in op_list
+      ]))
+
+  return docstring
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..82827aa2aafe22e7d6c61977ca6321cb69bd0db5
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
@@ -0,0 +1,451 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RaggedTensor operator dispatch."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+# Constants listing various op types to test.  Each operation
+# should be included in at least one list below, or tested separately if
+# necessary (e.g., because it expects additional arguments).
+UNARY_FLOAT_OPS = [
+    math_ops.abs,
+    math_ops.acos,
+    math_ops.acosh,
+    math_ops.angle,
+    math_ops.asin,
+    math_ops.asinh,
+    math_ops.atan,
+    math_ops.atanh,
+    math_ops.ceil,
+    math_ops.conj,
+    math_ops.cos,
+    math_ops.cosh,
+    math_ops.digamma,
+    math_ops.erf,
+    math_ops.erfc,
+    math_ops.exp,
+    math_ops.expm1,
+    math_ops.floor,
+    math_ops.imag,
+    math_ops.is_finite,
+    math_ops.is_inf,
+    math_ops.is_nan,
+    math_ops.lgamma,
+    math_ops.log,
+    math_ops.log1p,
+    math_ops.log_sigmoid,
+    math_ops.negative,
+    math_ops.real,
+    math_ops.reciprocal,
+    math_ops.rint,
+    math_ops.round,
+    math_ops.rsqrt,
+    math_ops.sign,
+    math_ops.sin,
+    math_ops.sinh,
+    math_ops.sqrt,
+    math_ops.square,
+    math_ops.tan,
+    array_ops.identity,
+    array_ops.ones_like,
+    array_ops.zeros_like,
+]
+UNARY_BOOL_OPS = [
+    math_ops.logical_not,
+]
+UNARY_STRING_OPS = [
+    string_ops.decode_base64,
+    string_ops.encode_base64,
+    string_ops.string_strip,
+    parsing_ops.decode_compressed,
+]
+BINARY_FLOAT_OPS = [
+    math_ops.add,
+    math_ops.atan2,
+    math_ops.complex,
+    math_ops.div_no_nan,
+    math_ops.divide,
+    math_ops.equal,
+    math_ops.floordiv,
+    math_ops.floormod,
+    math_ops.greater,
+    math_ops.greater_equal,
+    math_ops.less,
+    math_ops.less_equal,
+    math_ops.maximum,
+    math_ops.minimum,
+    math_ops.multiply,
+    math_ops.not_equal,
+    math_ops.pow,
+    math_ops.realdiv,
+    math_ops.squared_difference,
+    math_ops.subtract,
+    math_ops.truediv,
+]
+BINARY_BOOL_OPS = [
+    math_ops.logical_and,
+    math_ops.logical_or,
+    math_ops.logical_xor,
+]
+UNARY_INT_OPS = [
+    string_ops.unicode_script,
+]
+BINARY_INT_OPS = [
+    math_ops.truncatediv,
+    math_ops.truncatemod,
+]
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
+                               parameterized.TestCase):
+
+  def assertSameShape(self, x, y):
+    """Checks that x and y have the same shape (including ragged shapes)."""
+    if isinstance(x, ragged.RaggedTensor):
+      self.assertIsInstance(y, ragged.RaggedTensor)
+      self.assertEqual(x.ragged_rank, y.ragged_rank)
+      for (x_splits, y_splits) in zip(x.nested_row_splits, y.nested_row_splits):
+        self.assertAllEqual(x_splits, y_splits)
+      self.assertAllEqual(
+          array_ops.shape(x.flat_values), array_ops.shape(y.flat_values))
+    else:
+      self.assertIsInstance(y, ops.Tensor)
+      self.assertAllEqual(array_ops.shape(x), array_ops.shape(y))
+
+  @parameterized.parameters(
+      #=========================================================================
+      # Test different input shapes.
+      #=========================================================================
+      [
+          # 0-dimensional input
+          {'x': 12},
+          # 1-dimensional input
+          {'x': [1, -2, 3]},
+          # 2-dimensional input
+          {'x': [[-2, 3], [-3, 4]]},
+          {'x': ragged.constant_value([[-2, 3], [-3]], ragged_rank=1)},
+          # 3-dimensional inputs
+          {'x': [[[-2, 3], [3, 4]], [[7, 6], [5, 4]]]},
+          {'x': ragged.constant_value([[[-2, 3], [3, 4]], [[7, 6]]],
+                                      ragged_rank=1)},
+          {'x': ragged.constant_value([[[-2, 3, 4], []], [[7, 6]], []],
+                                      ragged_rank=2)},
+          ] +
+      #=========================================================================
+      # Test each unary op.
+      #=========================================================================
+      [{'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]), 'op': op}
+       for op in UNARY_FLOAT_OPS] +
+      [{'x': ragged.constant_value([[True, False], [True]]), 'op': op}
+       for op in UNARY_BOOL_OPS] +
+      [{'x': ragged.constant_value([[18, 512], [12412]], np.int32), 'op': op}
+       for op in UNARY_INT_OPS] +
+      [{'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]), 'op': op}
+       for op in UNARY_STRING_OPS] +
+      [
+          {'op': clip_ops.clip_by_value,
+           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'clip_value_min': 0.1, 'clip_value_max': 4.0},
+          {'op': math_ops.cast,
+           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'dtype': dtypes.int32},
+          {'op': math_ops.saturate_cast,
+           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'dtype': dtypes.int32},
+          {'op': string_ops.string_to_hash_bucket,
+           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
+           'num_buckets': 1000},
+          {'op': string_ops.string_to_hash_bucket_fast,
+           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
+           'num_buckets': 1000},
+          {'op': string_ops.string_to_hash_bucket_strong,
+           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
+           'num_buckets': 1000,
+           'key': [1231, 12512]},
+          {'op': string_ops.string_to_number,
+           'x': ragged.constant_value([['-2.0', '3.0'], ['-3.0']])},
+          {'op': string_ops.regex_full_match,
+           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
+           'pattern': r'\w+'},
+          {'op': string_ops.regex_replace,
+           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
+           'pattern': r'\d',
+           'rewrite': '#'},
+          {'op': string_ops.substr,
+           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
+           'pos': 2, 'len': 3},
+          {'op': array_ops.check_numerics,
+           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'message': 'check-numerics'},
+      ]
+      )  # pyformat: disable
+  def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args):
+    x = ragged.convert_to_tensor_or_ragged_tensor(x)
+    result = op(x, **extra_args)
+
+    # Run the wrapped op on the dense values, for comparison.
+    dense_x = x.flat_values if isinstance(x, ragged.RaggedTensor) else x
+    expected_flat_values = array_ops.reshape(op(dense_x, **extra_args), [-1])
+
+    # Check that the result has the expected shape.
+    self.assertSameShape(x, result)
+
+    # Check that the result has the expected (flattened) values.
+    if isinstance(result, ragged.RaggedTensor):
+      result_flat_values = array_ops.reshape(result.flat_values, [-1])
+    else:
+      result_flat_values = array_ops.reshape(result, [-1])
+    self.assertAllEqual(expected_flat_values, result_flat_values)
+
+  @parameterized.parameters(
+      [
+          #=====================================================================
+          # Without broadcasting -- i.e., shapes match exactly.
+          #=====================================================================
+          # Shapes: x:(), y:()
+          {'x': 12,
+           'y': 8},
+          # Shapes: x:(3,), y:(3,)
+          {'x': [7, 8, 9],
+           'y': [1, -2, 3]},
+          # Shapes: x:(2, 2), y:(2, 2)
+          {'x': [[-2, 3], [-3, -4]],
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(2, None), y:(2, None)
+          {'x': ragged.constant_value([[-2, 3], [-3]]),
+           'y': ragged.constant_value([[5, 6], [7]])},
+          # Shapes: x:(2, 2, 2), y:(2, 2, 2)
+          {'x': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+           'y': [[[9, 3], [3, 4]], [[5, 2], [7, 6]]]},
+          # Shapes: x:(2, None, None), y: (2, None, None)
+          {'x': ragged.constant_value([[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
+           'y': ragged.constant_value([[[3, 8], [2], [5]], [[], [1, 9, 8]]])},
+          # Shapes: x:(2, None, 2), y: (2, None, 2)
+          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+                                      ragged_rank=1),
+           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+                                      ragged_rank=1)},
+
+          #=====================================================================
+          # With broadcasting
+          #=====================================================================
+          # Shapes: x:(), y:(3,)
+          {'x': 12,                                 # Broadcast () -> (3,)
+           'y': [1, -2, 3]},
+          # Shapes: x:(1,), y:(3,)
+          {'x': [12],                               # Broadcast (1,) -> (3,)
+           'y': [1, -2, 3]},
+          # Shapes: x:(), y:(2, 2)
+          {'x': 12,                                 # Broadcast () -> (2, 2)
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(1,), y:(2, 2)
+          {'x': 12,                                 # Broadcast (1,) -> (2, 2)
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(2, 1), y:(2, 2)
+          {'x': [[10], [20]],                       # Broadcast (2, 1) -> (2, 2)
+           'y': [[1, 2], [3, 4]]},
+          # Shapes: x:(), y:(2, None)
+          {'x': 10,                                 # Broadcast () -> (2, None)
+           'y': ragged.constant_value([[1, 2], [3]], dtype=np.int32)},
+          # TODO(edloper): Add tests for more advanced broadcasting, once we add
+          # support for it.
+
+          #=====================================================================
+          # Keyword Args
+          #=====================================================================
+          {'x': ragged.constant_value([[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
+           'y': ragged.constant_value([[[3, 8], [2], [5]], [[], [1, 9, 8]]]),
+           'use_kwargs': ('x', 'y')},
+          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+                                      ragged_rank=1),
+           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+                                      ragged_rank=1),
+           'use_kwargs': ('x', 'y')},
+          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+                                      ragged_rank=1),
+           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+                                      ragged_rank=1),
+           'use_kwargs': ('x',)},
+      ] +
+      #=========================================================================
+      # Test each unary op.
+      #=========================================================================
+      [{'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+        'y': ragged.constant_value([[5.0, 1.0], [12.0]]),
+        'op': op}
+       for op in BINARY_FLOAT_OPS] +
+      [{'x': ragged.constant_value([[-2, 3], [-3]]),
+        'y': ragged.constant_value([[5, 1], [12]]),
+        'op': op}
+       for op in BINARY_INT_OPS] +
+      [{'x': ragged.constant_value([[True, True], [False]]),
+        'y': ragged.constant_value([[False, True], [False]]),
+        'op': op}
+       for op in BINARY_BOOL_OPS]
+      )  # pyformat: disable
+  def testBinaryElementwiseOp(self, x, y, op=math_ops.add, **extra_args):
+    use_kwargs = extra_args.pop('use_kwargs', ())
+    x = ragged.convert_to_tensor_or_ragged_tensor(x)
+    y = ragged.convert_to_tensor_or_ragged_tensor(y)
+    if 'x' in use_kwargs and 'y' in use_kwargs:
+      result = op(x=x, y=y, **extra_args)
+    elif 'y' in use_kwargs:
+      result = op(x, y=y, **extra_args)
+    else:
+      result = op(x, y, **extra_args)
+
+    # Run the wrapped op on the dense values, for comparison.
+    dense_x = x.flat_values if isinstance(x, ragged.RaggedTensor) else x
+    dense_y = y.flat_values if isinstance(y, ragged.RaggedTensor) else y
+    expected_flat_values = array_ops.reshape(
+        op(dense_x, dense_y, **extra_args), [-1])
+
+    # Check that the result has the expected shape.
+    self.assertSameShape(y, result)
+
+    # Check that the result has the expected (flattened) values.
+    if isinstance(result, ragged.RaggedTensor):
+      result_flat_values = array_ops.reshape(result.flat_values, [-1])
+    else:
+      result_flat_values = array_ops.reshape(result, [-1])
+    self.assertAllEqual(expected_flat_values, result_flat_values)
+
+  @parameterized.parameters(
+      [
+          {'inputs': (12, 8, 3)},
+          {'inputs': ([1, 2, 3], [7, 8, 9], [3, 6, 9])},
+          {'inputs': ([[1, 2]], [[3, 4]], [[5, 6]])},
+          {'inputs': (ragged.constant_value([[1, 3], [-3]]),
+                      ragged.constant_value([[4, 7], [88]]),
+                      ragged.constant_value([[2, 9], [12]]))},
+          {'inputs': (ragged.constant_value([[[1, 3], [-3]], [[1]]]),
+                      ragged.constant_value([[[4, 7], [88]], [[2]]]),
+                      ragged.constant_value([[[2, 9], [12]], [[8]]]))},
+          {'inputs': (ragged.constant_value([[[1, 3], [3, 4]], [[1, 5]]],
+                                            ragged_rank=1),
+                      ragged.constant_value([[[4, 7], [1, 2]], [[2, 2]]],
+                                            ragged_rank=1),
+                      ragged.constant_value([[[2, 9], [5, 2]], [[8, 0]]],
+                                            ragged_rank=1))},
+          {'inputs': (ragged.constant_value([[[1, 3], [-3]], [[1]]]),
+                      ragged.constant_value([[[4, 7], [88]], [[2]]]),
+                      ragged.constant_value([[[2, 9], [12]], [[8]]])),
+           'use_kwargs': True},
+      ] + [
+          {'op': math_ops.add_n,
+           'inputs': (ragged.constant_value([[1, 3], [-3]]),
+                      ragged.constant_value([[4, 7], [88]]),
+                      ragged.constant_value([[2, 9], [12]]))},
+          {'op': string_ops.string_join,
+           'inputs': (ragged.constant_value([['a', 'b'], ['c']]),
+                      ragged.constant_value([['foo', 'bar'], ['baz']]),
+                      ragged.constant_value([['2', '9'], ['12']]))},
+      ])  # pyformat: disable
+  def testListValuedElementwiseOp(self, inputs, op=math_ops.add_n,
+                                  **extra_args):
+    use_kwargs = extra_args.pop('use_kwargs', False)
+    inputs = [ragged.convert_to_tensor_or_ragged_tensor(x) for x in inputs]
+    if use_kwargs:
+      result = op(inputs=inputs, **extra_args)
+    else:
+      result = op(inputs, **extra_args)
+
+    # Run the wrapped op on the dense values, for comparison.
+    dense_inputs = [
+        x.flat_values if isinstance(x, ragged.RaggedTensor) else x
+        for x in inputs
+    ]
+    expected_flat_values = array_ops.reshape(
+        op(dense_inputs, **extra_args), [-1])
+
+    # Check that the result has the expected shape.
+    self.assertSameShape(inputs[0], result)
+
+    # Check that the result has the expected (flattened) values.
+    if isinstance(result, ragged.RaggedTensor):
+      result_flat_values = array_ops.reshape(result.flat_values, [-1])
+    else:
+      result_flat_values = array_ops.reshape(result, [-1])
+    self.assertAllEqual(expected_flat_values, result_flat_values)
+
+  def testElementwiseOpUnknownRankError(self):
+    if context.executing_eagerly():
+      return
+    x = ragged.constant([[1, 2], [3]])
+    y = ragged.RaggedTensor.from_row_splits(
+        array_ops.placeholder_with_default([1, 2, 3], shape=None), x.row_splits)
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Unable to broadcast: unknown rank'):
+      math_ops.add(x, y)
+
+  @parameterized.parameters([
+      dict(
+          x=ragged.constant_value([[1, 2], [3]]),
+          y=[[10]],
+          expected=[[11, 12], [13]]),
+      dict(
+          x=ragged.constant_value([[[1, 2], [3, 4]], [[5]]], ragged_rank=2),
+          y=ragged.constant_value([[[10], [20]], [[30]]], ragged_rank=1),
+          expected=[[[11, 12], [23, 24]], [[35]]]),
+      dict(
+          x=ragged.constant_value([[[1]]]),
+          y=ragged.constant_value([[1]]),
+          expected=[[[2]]]),
+  ])
+  def testElementwiseOpBroadcast(self, x, y, expected):
+    x = ragged.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
+    y = ragged.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
+    result = x + y
+    self.assertRaggedEqual(result, expected)
+
+  def testElementwiseOpShapeMismatch(self):
+    x = ragged.constant([[1, 2, 3], [4, 5]])
+    y = ragged.constant([[1, 2, 3], [4, 5, 6]])
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(math_ops.add(x, y))
+
+  def testBinaryOpSparseAndRagged(self):
+    x = ragged.constant([[1, 2, 3], [4, 5]])
+    y = sparse_tensor.SparseTensor([[0, 0], [0, 1], [2, 0]], [1, 2, 3], [3, 2])
+    with self.assertRaises((TypeError, ValueError)):
+      self.evaluate(math_ops.add(x, y))
+
+    with self.assertRaises((TypeError, ValueError)):
+      self.evaluate(math_ops.add_n([x, y]))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_eager_test.py b/tensorflow/python/ops/ragged/ragged_eager_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1befbf9613fefc4efd5efd3d8ebf17db9038581
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_eager_test.py
@@ -0,0 +1,52 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.ragged in eager execution mode."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+
+  @parameterized.parameters([
+      dict(pylist=[[b'a', b'b'], [b'c']]),
+      dict(pylist=[[[1, 2], [3]], [[4, 5, 6], [], [7]]]),
+      dict(pylist=[[[1, 2], [3, 4]], [[5, 6], [], [7, 8]]], ragged_rank=1),
+  ])
+  def testRaggedTensorToList(self, pylist, ragged_rank=None):
+    rt = ragged.constant(pylist, ragged_rank)
+    self.assertRaggedEqual(rt, pylist)
+
+  @parameterized.parameters([
+      dict(pylist=[[b'a', b'b'], [b'c']]),
+      dict(pylist=[[[1, 2], [3]], [[4, 5, 6], [], [7]]]),
+  ])
+  def testRaggedTensorStr(self, pylist):
+    rt = ragged.constant(pylist)
+    self.assertEqual(str(rt), '<tf.RaggedTensor %s>' % pylist)
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..072f330e3c1c0a20ac7cecd84ec6b0e47003a3a0
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
@@ -0,0 +1,126 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.expand_dims."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedExpandDimsOpTest(ragged_test_util.RaggedTensorTestCase,
+                             parameterized.TestCase):
+
+  # An example 4-d ragged tensor with shape [3, (D2), (D3), 2], and the
+  # expected result calling for expand_dims on each axis.  c.f. the table of
+  # expected result shapes in the ragged.expand_dims docstring.
+  EXAMPLE4D = [[[[1, 1], [2, 2]], [[3, 3]]],
+               [],
+               [[], [[4, 4], [5, 5], [6, 6]]]]  # pyformat: disable
+  EXAMPLE4D_EXPAND_AXIS = {
+      0: [EXAMPLE4D],
+      1: [[d0] for d0 in EXAMPLE4D],
+      2: [[[d1] for d1 in d0] for d0 in EXAMPLE4D],
+      3: [[[[d2] for d2 in d1] for d1 in d0] for d0 in EXAMPLE4D],
+      4: [[[[[d3] for d3 in d2] for d2 in d1] for d1 in d0] for d0 in EXAMPLE4D]
+  }
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring examples: 2D Ragged Inputs
+      dict(rt_input=[[1, 2], [3]],
+           axis=0,
+           expected=[[[1, 2], [3]]],
+           expected_shape=[1, None, None]),
+      dict(rt_input=[[1, 2], [3]],
+           axis=1,
+           expected=[[[1, 2]], [[3]]],
+           expected_shape=[2, None, None]),
+      dict(rt_input=[[1, 2], [3]],
+           axis=2,
+           expected=[[[1], [2]], [[3]]],
+           expected_shape=[2, None, 1]),
+
+      #=========================================================================
+      # 2D Tensor Inputs
+      dict(rt_input=[[1, 2], [3, 4], [5, 6]],
+           ragged_rank=0,
+           axis=0,
+           expected=[[[1, 2], [3, 4], [5, 6]]],
+           expected_shape=[1, 3, 2]),
+      dict(rt_input=[[1, 2], [3, 4], [5, 6]],
+           ragged_rank=0,
+           axis=1,
+           expected=[[[1, 2]], [[3, 4]], [[5, 6]]],
+           expected_shape=[3, 1, 2]),
+      dict(rt_input=[[1, 2], [3, 4], [5, 6]],
+           ragged_rank=0,
+           axis=2,
+           expected=[[[1], [2]], [[3], [4]], [[5], [6]]],
+           expected_shape=[3, 2, 1]),
+
+      #=========================================================================
+      # 4D Ragged Inputs: [3, (D2), (D3), 2]
+      # c.f. the table of expected result shapes in the expand_dims docstring.
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=0,
+           expected=EXAMPLE4D_EXPAND_AXIS[0],
+           expected_shape=[1, None, None, None, 2]),
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=1,
+           expected=EXAMPLE4D_EXPAND_AXIS[1],
+           expected_shape=[3, None, None, None, 2]),
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=2,
+           expected=EXAMPLE4D_EXPAND_AXIS[2],
+           expected_shape=[3, None, None, None, 2]),
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=3,
+           expected=EXAMPLE4D_EXPAND_AXIS[3],
+           expected_shape=[3, None, None, 1, 2]),
+      dict(rt_input=EXAMPLE4D,
+           ragged_rank=2,
+           axis=4,
+           expected=EXAMPLE4D_EXPAND_AXIS[4],
+           expected_shape=[3, None, None, 2, 1]),
+  ])  # pyformat: disable
+  def testRaggedExpandDims(self,
+                           rt_input,
+                           axis,
+                           expected,
+                           ragged_rank=None,
+                           expected_shape=None):
+    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    expanded = ragged.expand_dims(rt, axis=axis)
+    self.assertEqual(expanded.shape.ndims, rt.shape.ndims + 1)
+    if expected_shape is not None:
+      self.assertEqual(expanded.shape.as_list(), expected_shape)
+
+    self.assertRaggedEqual(expanded, expected)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c63e1c7994c31b6ed53e37e65498a843e2bb595
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py
@@ -0,0 +1,293 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations for constructing RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_value
+
+
+#===============================================================================
+# Op to construct a constant RaggedTensor from a nested Python list.
+#===============================================================================
+def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
+  """Constructs a constant RaggedTensor from a nested Python list.
+
+  Example:
+
+  ```python
+  >>> ragged.constant([[1, 2], [3], [4, 5, 6]]).eval()
+  RaggedTensorValue(values=[1, 2, 3, 4, 5, 6], splits=[0, 2, 3, 6])
+  ```
+
+  All scalar values in `pylist` must have the same nesting depth `K`, and the
+  returned `RaggedTensor` will have rank `K`.  If `pylist` contains no scalar
+  values, then `K` is one greater than the maximum depth of empty lists in
+  `pylist`.  All scalar values in `pylist` must be compatible with `dtype`.
+
+  Args:
+    pylist: A nested `list` or `tuple`.  Any nested element that is not a `list`
+      or `tuple` must be a scalar value compatible with `dtype`.
+    dtype: The type of elements for the returned `RaggedTensor`.  If not
+      specified, then a default is chosen based on the scalar values in
+      `pylist`.
+    ragged_rank: An integer specifying the ragged rank of the returned
+      `RaggedTensor`.  Must be nonnegative and less than `K`. Defaults to
+      `max(0, K - 1)` if `inner_shape` is not specified.  Defaults to `max(0, K
+      - 1 - len(inner_shape))` if `inner_shape` is specified.
+    inner_shape: A tuple of integers specifying the shape for individual inner
+      values in the returned `RaggedTensor`.  Defaults to `()` if `ragged_rank`
+      is not specified.  If `ragged_rank` is specified, then a default is chosen
+      based on the contents of `pylist`.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A potentially ragged tensor with rank `K` and the specified `ragged_rank`,
+    containing the values from `pylist`.
+
+  Raises:
+    ValueError: If the scalar values in `pylist` have inconsistent nesting
+      depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
+  """
+  with ops.name_scope(name, "RaggedConstant"):
+    return _constant_value(ragged_tensor.RaggedTensor.from_row_splits,
+                           constant_op.constant, pylist, dtype, ragged_rank,
+                           inner_shape)
+
+
+def constant_value(pylist, dtype=None, ragged_rank=None, inner_shape=None):
+  """Constructs a RaggedTensorValue from a nested Python list.
+
+  > Warning: This function returns a `RaggedTensorValue`, not a `RaggedTensor`.
+  > If you wish to construct a constant `RaggedTensor`, use
+  > [`ragged.constant(...)`](constant.md) instead.
+
+  Example:
+
+  ```python
+  >>> ragged.constant_value([[1, 2], [3], [4, 5, 6]])
+  RaggedTensorValue(values=[1, 2, 3, 4, 5, 6], splits=[0, 2, 3, 6])
+  ```
+
+  All scalar values in `pylist` must have the same nesting depth `K`, and the
+  returned `RaggedTensorValue` will have rank `K`.  If `pylist` contains no
+  scalar values, then `K` is one greater than the maximum depth of empty lists
+  in `pylist`.  All scalar values in `pylist` must be compatible with `dtype`.
+
+  Args:
+    pylist: A nested `list` or `tuple`.  Any nested element that is not a `list`
+      or `tuple` must be a scalar value compatible with `dtype`.
+    dtype: `numpy.dtype`.  The type of elements for the returned `RaggedTensor`.
+      If not specified, then a default is chosen based on the scalar values in
+      `pylist`.
+    ragged_rank: An integer specifying the ragged rank of the returned
+      `RaggedTensorValue`.  Must be nonnegative and less than `K`. Defaults to
+      `max(0, K - 1)` if `inner_shape` is not specified.  Defaults to `max(0, K
+      - 1 - len(inner_shape))` if `inner_shape` is specified.
+    inner_shape: A tuple of integers specifying the shape for individual inner
+      values in the returned `RaggedTensorValue`.  Defaults to `()` if
+      `ragged_rank` is not specified.  If `ragged_rank` is specified, then a
+      default is chosen based on the contents of `pylist`.
+
+  Returns:
+    A `RaggedTensorValue` or `numpy.array` with rank `K` and the specified
+    `ragged_rank`, containing the values from `pylist`.
+
+  Raises:
+    ValueError: If the scalar values in `pylist` have inconsistent nesting
+      depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
+  """
+
+  def _ragged_factory(values, row_splits):
+    row_splits = np.array(row_splits, dtype=np.int64)
+    return ragged_tensor_value.RaggedTensorValue(values, row_splits)
+
+  def _inner_factory(pylist, dtype, shape, name=None):  # pylint: disable=unused-argument
+    return np.reshape(np.array(pylist, dtype=dtype), shape)
+
+  return _constant_value(_ragged_factory, _inner_factory, pylist, dtype,
+                         ragged_rank, inner_shape)
+
+
+def _constant_value(ragged_factory, inner_factory, pylist, dtype, ragged_rank,
+                    inner_shape):
+  """Constructs a constant RaggedTensor or RaggedTensorValue.
+
+  Args:
+    ragged_factory: A factory function with the signature:
+      `ragged_factory(values, row_splits)`
+    inner_factory: A factory function with the signature: `inner_factory(pylist,
+      dtype, shape, name)`
+    pylist: A nested `list` or `tuple`.
+    dtype: Data type for returned value.
+    ragged_rank: Ragged rank for returned value.
+    inner_shape: Inner value shape for returned value.
+
+  Returns:
+    A value returned by `ragged_factory` or `inner_factory`.
+
+  Raises:
+    ValueError: If the scalar values in `pylist` have inconsistent nesting
+      depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
+  """
+  if ragged_tensor.is_ragged(pylist):
+    raise TypeError("pylist may not be a RaggedTensor or RaggedTensorValue.")
+
+  if not isinstance(pylist, (list, tuple)):
+    # Scalar value
+    if ragged_rank is not None and ragged_rank != 0:
+      raise ValueError("Invalid pylist=%r: incompatible with ragged_rank=%d" %
+                       (pylist, ragged_rank))
+    if inner_shape is not None and inner_shape:
+      raise ValueError(
+          "Invalid pylist=%r: incompatible with dim(inner_shape)=%d" %
+          (pylist, len(inner_shape)))
+    return inner_factory(pylist, dtype, ())
+
+  if ragged_rank is not None and ragged_rank < 0:
+    raise ValueError(
+        "Invalid ragged_rank=%r: must be nonnegative" % ragged_rank)
+
+  # Find the depth of scalar values in `pylist`.
+  scalar_depth, max_depth = _find_scalar_and_max_depth(pylist)
+  if scalar_depth is not None:
+    if max_depth > scalar_depth:
+      raise ValueError("Invalid pylist=%r: empty list nesting is greater "
+                       "than scalar value nesting" % pylist)
+
+  # If both inner_shape and ragged_rank were specified, then check that
+  # they are compatible with pylist.
+  if inner_shape is not None and ragged_rank is not None:
+    expected_depth = ragged_rank + len(inner_shape) + 1
+    if ((scalar_depth is not None and expected_depth != scalar_depth) or
+        (scalar_depth is None and expected_depth < max_depth)):
+      raise ValueError(
+          "Invalid pylist=%r: incompatible with ragged_rank=%d "
+          "and dim(inner_shape)=%d" % (pylist, ragged_rank, len(inner_shape)))
+
+  # Check if the result is a `Tensor`.
+  if (ragged_rank == 0 or
+      (ragged_rank is None and
+       ((max_depth < 2) or
+        (inner_shape is not None and max_depth - len(inner_shape) < 2)))):
+    return inner_factory(pylist, dtype, inner_shape)
+
+  # Compute default value for inner_shape.
+  if inner_shape is None:
+    if ragged_rank is None:
+      inner_shape = ()
+    else:
+      inner_shape = _default_inner_shape_for_pylist(pylist, ragged_rank)
+
+  # Compute default value for ragged_rank.
+  if ragged_rank is None:
+    if scalar_depth is None:
+      ragged_rank = max(1, max_depth - 1)
+    else:
+      ragged_rank = max(1, scalar_depth - 1 - len(inner_shape))
+
+  # Build the splits for each ragged rank, and concatenate the inner values
+  # into a single list.
+  nested_splits = []
+  values = pylist
+  for dim in range(ragged_rank):
+    nested_splits.append([0])
+    concatenated_values = []
+    for row in values:
+      nested_splits[dim].append(nested_splits[dim][-1] + len(row))
+      concatenated_values.extend(row)
+    values = concatenated_values
+
+  values = inner_factory(
+      values, dtype=dtype, shape=(len(values),) + inner_shape, name="values")
+  for row_splits in reversed(nested_splits):
+    values = ragged_factory(values, row_splits)
+  return values
+
+
+def _find_scalar_and_max_depth(pylist):
+  """Finds nesting depth of scalar values in pylist.
+
+  Args:
+    pylist: A nested python `list` or `tuple`.
+
+  Returns:
+    A tuple `(scalar_depth, max_depth)`.  `scalar_depth` is the nesting
+    depth of scalar values in `pylist`, or `None` if `pylist` contains no
+    scalars.  `max_depth` is the maximum depth of `pylist` (including
+    empty lists).
+
+  Raises:
+    ValueError: If pylist has inconsistent nesting depths for scalars.
+  """
+  if isinstance(pylist, (list, tuple)):
+    scalar_depth = None
+    max_depth = 1
+    for child in pylist:
+      child_scalar_depth, child_max_depth = _find_scalar_and_max_depth(child)
+      if child_scalar_depth is not None:
+        if scalar_depth is not None and scalar_depth != child_scalar_depth + 1:
+          raise ValueError("all scalar values must have the same nesting depth")
+        scalar_depth = child_scalar_depth + 1
+      max_depth = max(max_depth, child_max_depth + 1)
+    return (scalar_depth, max_depth)
+  else:
+    return (0, 0)
+
+
+def _default_inner_shape_for_pylist(pylist, ragged_rank):
+  """Computes a default inner shape for the given python list."""
+
+  def get_inner_shape(item):
+    """Returns the inner shape for a python list `item`."""
+    if not isinstance(item, (list, tuple)):
+      return ()
+    elif item:
+      return (len(item),) + get_inner_shape(item[0])
+    else:
+      return (0,)
+
+  def check_inner_shape(item, shape):
+    """Checks that `item` has a consistent shape matching `shape`."""
+    is_nested = isinstance(item, (list, tuple))
+    if is_nested != bool(shape):
+      raise ValueError("inner values have inconsistent shape")
+    if is_nested:
+      if shape[0] != len(item):
+        raise ValueError("inner values have inconsistent shape")
+      for child in item:
+        check_inner_shape(child, shape[1:])
+
+  # Collapse the ragged layers to get the list of inner values.
+  flat_values = pylist
+  for dim in range(ragged_rank):
+    if not all(isinstance(v, (list, tuple)) for v in flat_values):
+      raise ValueError("pylist has scalar values depth %d, but ragged_rank=%d "
+                       "requires scalar value depth greater than %d" %
+                       (dim + 1, ragged_rank, ragged_rank))
+    flat_values = sum((list(v) for v in flat_values), [])
+
+  # Compute the inner shape looking only at the leftmost elements; and then
+  # use check_inner_shape to verify that other elements have the same shape.
+  inner_shape = get_inner_shape(flat_values)
+  check_inner_shape(flat_values, inner_shape)
+  return inner_shape[1:]
diff --git a/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..07cf910202770192f146328844dec8c12be542a7
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
@@ -0,0 +1,113 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RaggedTensor.from_sparse."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import RaggedTensor
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
+
+  def testDocStringExample(self):
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0]],
+        values=[1, 2, 3, 4, 5],
+        dense_shape=[4, 3])
+    rt = RaggedTensor.from_sparse(st)
+
+    self.assertRaggedEqual(rt, [[1, 2, 3], [4], [], [5]])
+
+  def testEmpty(self):
+    st = sparse_tensor.SparseTensor(
+        indices=array_ops.zeros([0, 2], dtype=dtypes.int64),
+        values=[],
+        dense_shape=[4, 3])
+    rt = RaggedTensor.from_sparse(st)
+
+    self.assertRaggedEqual(rt, [[], [], [], []])
+
+  def testBadSparseTensorRank(self):
+    st1 = sparse_tensor.SparseTensor(indices=[[0]], values=[0], dense_shape=[3])
+    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                            RaggedTensor.from_sparse, st1)
+
+    st2 = sparse_tensor.SparseTensor(
+        indices=[[0, 0, 0]], values=[0], dense_shape=[3, 3, 3])
+    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                            RaggedTensor.from_sparse, st2)
+
+    if not context.executing_eagerly():
+      st3 = sparse_tensor.SparseTensor(
+          indices=array_ops.placeholder(dtypes.int64),
+          values=[0],
+          dense_shape=array_ops.placeholder(dtypes.int64))
+      self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                              RaggedTensor.from_sparse, st3)
+
+  def testGoodPartialSparseTensorRank(self):
+    if not context.executing_eagerly():
+      st1 = sparse_tensor.SparseTensor(
+          indices=[[0, 0]],
+          values=[0],
+          dense_shape=array_ops.placeholder(dtypes.int64))
+      st2 = sparse_tensor.SparseTensor(
+          indices=array_ops.placeholder(dtypes.int64),
+          values=[0],
+          dense_shape=[4, 3])
+
+      # Shouldn't throw ValueError
+      RaggedTensor.from_sparse(st1)
+      RaggedTensor.from_sparse(st2)
+
+  def testNonRaggedSparseTensor(self):
+    # "index_suffix" means the value of the innermost dimension of the index
+    # (i.e., indices[i][-1]).
+    # See comments in _assert_sparse_indices_are_ragged_right() for more
+    # details/background.
+
+    # index_suffix of first index is not zero.
+    st1 = sparse_tensor.SparseTensor(
+        indices=[[0, 1], [0, 2], [2, 0]], values=[1, 2, 3], dense_shape=[3, 3])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*SparseTensor is not right-ragged'):
+      self.evaluate(RaggedTensor.from_sparse(st1))
+    # index_suffix of an index that starts a new row is not zero.
+    st2 = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [0, 1], [2, 1]], values=[1, 2, 3], dense_shape=[3, 3])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*SparseTensor is not right-ragged'):
+      self.evaluate(RaggedTensor.from_sparse(st2))
+    # index_suffix of an index that continues a row skips a cell.
+    st3 = sparse_tensor.SparseTensor(
+        indices=[[0, 1], [0, 1], [0, 3]], values=[1, 2, 3], dense_shape=[3, 3])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*SparseTensor is not right-ragged'):
+      self.evaluate(RaggedTensor.from_sparse(st3))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a3d639c5e35f23db7d53994e0a0bfe5231e664b
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
@@ -0,0 +1,457 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RaggedTensor.from_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import RaggedTensor
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase,
+                                 parameterized.TestCase):
+
+  def testDocStringExamples(self):
+    # The examples from RaggedTensor.from_tensor.__doc__.
+    dt = constant_op.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt), [[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt, lengths=[1, 0, 3]), [[5], [], [6, 0, 0]])
+
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt, padding=0), [[5, 7], [0, 3], [6]])
+
+  @parameterized.parameters(
+      # 2D test cases, no length or padding.
+      {
+          'tensor': [[]],
+          'expected': [[]],
+      },
+      {
+          'tensor': [[1]],
+          'expected': [[1]],
+      },
+      {
+          'tensor': [[1, 2]],
+          'expected': [[1, 2]],
+      },
+      {
+          'tensor': [[1], [2], [3]],
+          'expected': [[1], [2], [3]],
+      },
+      {
+          'tensor': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          'expected': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+      },
+      # 3D test cases, no length or padding
+      {
+          'tensor': [[[]]],
+          'expected': [[[]]],
+      },
+      {
+          'tensor': [[[]]],
+          'expected': [[[]]],
+          'ragged_rank': 1,
+      },
+      {
+          'tensor': [[[1]]],
+          'expected': [[[1]]],
+      },
+      {
+          'tensor': [[[1, 2]]],
+          'expected': [[[1, 2]]],
+      },
+      {
+          'tensor': [[[1, 2], [3, 4]]],
+          'expected': [[[1, 2], [3, 4]]],
+      },
+      {
+          'tensor': [[[1, 2]], [[3, 4]], [[5, 6]], [[7, 8]]],
+          'expected': [[[1, 2]], [[3, 4]], [[5, 6]], [[7, 8]]],
+      },
+      {
+          'tensor': [[[1], [2]], [[3], [4]], [[5], [6]], [[7], [8]]],
+          'expected': [[[1], [2]], [[3], [4]], [[5], [6]], [[7], [8]]],
+      },
+      # 2D test cases, with length
+      {
+          'tensor': [[1]],
+          'lengths': [1],
+          'expected': [[1]]
+      },
+      {
+          'tensor': [[1]],
+          'lengths': [0],
+          'expected': [[]]
+      },
+      {
+          'tensor': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          'lengths': [0, 1, 2],
+          'expected': [[], [4], [7, 8]]
+      },
+      {
+          'tensor': [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+          'lengths': [0, 0, 0],
+          'expected': [[], [], []]
+      },
+      {
+          'tensor': [[1, 2], [3, 4]],
+          'lengths': [2, 2],
+          'expected': [[1, 2], [3, 4]]
+      },
+      {
+          'tensor': [[1, 2], [3, 4]],
+          'lengths': [7, 8],  # lengths > ncols: truncated to ncols
+          'expected': [[1, 2], [3, 4]]
+      },
+      {
+          'tensor': [[1, 2], [3, 4]],
+          'lengths': [-2, -1],  # lengths < 0: treated as zero
+          'expected': [[], []]
+      },
+      # 3D test cases, with length
+      {
+          'tensor': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          'lengths': [0, 0],
+          'expected': [[], []]
+      },
+      {
+          'tensor': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          'lengths': [1, 2],
+          'expected': [[[1, 2]], [[5, 6], [7, 8]]]
+      },
+      {
+          'tensor': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          'lengths': [2, 2],
+          'expected': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]
+      },
+      # 2D test cases, with padding
+      {
+          'tensor': [[1]],
+          'padding': 0,
+          'expected': [[1]]
+      },
+      {
+          'tensor': [[0]],
+          'padding': 0,
+          'expected': [[]]
+      },
+      {
+          'tensor': [[0, 1]],
+          'padding': 0,
+          'expected': [[0, 1]]
+      },
+      {
+          'tensor': [[1, 0]],
+          'padding': 0,
+          'expected': [[1]]
+      },
+      {
+          'tensor': [[1, 0, 1, 0, 0, 1, 0, 0]],
+          'padding': 0,
+          'expected': [[1, 0, 1, 0, 0, 1]]
+      },
+      {
+          'tensor': [[3, 7, 0, 0], [2, 0, 0, 0], [5, 0, 0, 0]],
+          'padding': 0,
+          'expected': [[3, 7], [2], [5]]
+      },
+      # 3D test cases, with padding
+      {
+          'tensor': [[[1]]],
+          'padding': [0],
+          'expected': [[[1]]]
+      },
+      {
+          'tensor': [[[0]]],
+          'padding': [0],
+          'expected': [[]]
+      },
+      {
+          'tensor': [[[0, 0], [1, 2]], [[3, 4], [0, 0]]],
+          'padding': [0, 0],
+          'expected': [[[0, 0], [1, 2]], [[3, 4]]]
+      },
+      # 4D test cases, with padding
+      {
+          'tensor': [
+              [[[1, 2], [3, 4]], [[0, 0], [0, 0]], [[0, 0], [0, 0]]],
+              [[[0, 0], [0, 0]], [[5, 6], [7, 8]], [[0, 0], [0, 0]]],
+              [[[0, 0], [0, 0]], [[0, 0], [0, 0]], [[0, 0], [0, 0]]]
+          ],
+          'padding': [[0, 0], [0, 0]],
+          'expected': [
+              [[[1, 2], [3, 4]]],
+              [[[0, 0], [0, 0]], [[5, 6], [7, 8]]],
+              []
+          ]
+      },
+      # 3D test cases, with ragged_rank=2.
+      {
+          'tensor': [[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+          'ragged_rank': 2,
+          'expected': [[[1, 0], [2, 3]], [[0, 0], [4, 0]]]
+      },
+      {
+          'tensor': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+          'ragged_rank': 2,
+          'lengths': [2, 0, 2, 1],
+          'expected': [[[1, 2], []], [[5, 6], [7]]]
+      },
+      {
+          'tensor': [[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+          'ragged_rank': 2,
+          'padding': 0,
+          'expected': [[[1], [2, 3]], [[], [4]]]
+      },
+      # 4D test cases, with ragged_rank>1
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'ragged_rank': 2,
+          'expected': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                       [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]]
+      },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'ragged_rank': 3,
+          'expected': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                       [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]]
+      },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'ragged_rank': 2,
+          'padding': [0, 0],
+          'expected': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                       [[[5, 6], [7, 0]], [[0, 8]]]]
+      },
+      {
+          'tensor': [[[[1, 0], [2, 3]], [[0, 0], [4, 0]]],
+                     [[[5, 6], [7, 0]], [[0, 8], [0, 0]]]],
+          'ragged_rank': 3,
+          'padding': 0,
+          'expected': [[[[1], [2, 3]], [[], [4]]],
+                       [[[5, 6], [7]], [[0, 8], []]]]
+      },
+  )  # pyformat: disable
+  def testRaggedFromTensor(self,
+                           tensor,
+                           expected,
+                           lengths=None,
+                           padding=None,
+                           ragged_rank=1):
+    dt = constant_op.constant(tensor)
+    rt = RaggedTensor.from_tensor(dt, lengths, padding, ragged_rank)
+    self.assertEqual(type(rt), RaggedTensor)
+    self.assertEqual(rt.ragged_rank, ragged_rank)
+    self.assertTrue(
+        dt.shape.is_compatible_with(rt.shape),
+        '%s is incompatible with %s' % (dt.shape, rt.shape))
+    self.assertRaggedEqual(rt, expected)
+
+  def testHighDimensions(self):
+    # Use distinct prime numbers for all dimension shapes in this test, so
+    # we can see any errors that are caused by mixing up dimension sizes.
+    dt = array_ops.reshape(
+        math_ops.range(3 * 5 * 7 * 11 * 13 * 17), [3, 5, 7, 11, 13, 17])
+    for ragged_rank in range(1, 4):
+      rt = RaggedTensor.from_tensor(dt, ragged_rank=ragged_rank)
+      self.assertEqual(type(rt), RaggedTensor)
+      self.assertEqual(rt.ragged_rank, ragged_rank)
+      self.assertTrue(
+          dt.shape.is_compatible_with(rt.shape),
+          '%s is incompatible with %s' % (dt.shape, rt.shape))
+      self.assertRaggedEqual(rt, self.evaluate(dt).tolist())
+
+  @parameterized.parameters(
+      # With no padding or lengths
+      {
+          'dt_shape': [0, 0],
+          'expected': []
+      },
+      {
+          'dt_shape': [0, 3],
+          'expected': []
+      },
+      {
+          'dt_shape': [3, 0],
+          'expected': [[], [], []]
+      },
+      {
+          'dt_shape': [0, 2, 3],
+          'expected': []
+      },
+      {
+          'dt_shape': [2, 0, 3],
+          'expected': [[], []]
+      },
+      {
+          'dt_shape': [2, 3, 0],
+          'expected': [[[], [], []], [[], [], []]]
+      },
+      {
+          'dt_shape': [2, 3, 0, 1],
+          'expected': [[[], [], []], [[], [], []]]
+      },
+      {
+          'dt_shape': [2, 3, 1, 0],
+          'expected': [[[[]], [[]], [[]]], [[[]], [[]], [[]]]]
+      },
+      # With padding
+      {
+          'dt_shape': [0, 0],
+          'padding': 0,
+          'expected': []
+      },
+      {
+          'dt_shape': [0, 3],
+          'padding': 0,
+          'expected': []
+      },
+      {
+          'dt_shape': [3, 0],
+          'padding': 0,
+          'expected': [[], [], []]
+      },
+      {
+          'dt_shape': [0, 2, 3],
+          'padding': [0, 0, 0],
+          'expected': []
+      },
+      {
+          'dt_shape': [2, 0, 3],
+          'padding': [0, 0, 0],
+          'expected': [[], []]
+      },
+      {
+          'dt_shape': [2, 3, 0],
+          'padding': [],
+          'expected': [[], []]
+      },
+      # With lengths
+      {
+          'dt_shape': [0, 0],
+          'lengths': [],
+          'expected': []
+      },
+      {
+          'dt_shape': [0, 3],
+          'lengths': [],
+          'expected': []
+      },
+      {
+          'dt_shape': [3, 0],
+          'lengths': [0, 0, 0],
+          'expected': [[], [], []]
+      },
+      {
+          'dt_shape': [3, 0],
+          'lengths': [2, 3, 4],  # lengths > ncols: truncated to ncols
+          'expected': [[], [], []]
+      },
+      {
+          'dt_shape': [0, 2, 3],
+          'lengths': [],
+          'expected': []
+      },
+      {
+          'dt_shape': [2, 0, 3],
+          'lengths': [0, 0],
+          'expected': [[], []]
+      },
+      {
+          'dt_shape': [2, 3, 0],
+          'lengths': [0, 0],
+          'expected': [[], []]
+      },
+  )
+  def testEmpty(self, dt_shape, expected, lengths=None, padding=None):
+    dt = array_ops.zeros(dt_shape)
+    rt = RaggedTensor.from_tensor(dt, lengths, padding)
+    self.assertEqual(type(rt), RaggedTensor)
+    self.assertEqual(rt.ragged_rank, 1)
+    self.assertTrue(dt.shape.is_compatible_with(rt.shape))
+    self.assertRaggedEqual(rt, expected)
+
+  @parameterized.parameters(
+      {
+          'tensor': [[1]],
+          'lengths': [0],
+          'padding': 0,
+          'error': (ValueError, 'Specify lengths or padding, but not both')
+      },
+      {
+          'tensor': [[1]],
+          'lengths': [0.5],
+          'error': (TypeError, 'lengths must be an integer tensor')
+      },
+      {
+          'tensor': [[1]],
+          'padding': 'a',
+          'error': (TypeError, '.*')
+      },
+      {
+          'tensor': [[1]],
+          'padding': [1],
+          'error': (ValueError, r'Shapes \(1,\) and \(\) are incompatible')
+      },
+      {
+          'tensor': [[[1]]],
+          'padding': 1,
+          'error': (ValueError, r'Shapes \(\) and \(1,\) are incompatible')
+      },
+      {
+          'tensor': [[1]],
+          'ragged_rank': 'bad',
+          'error': (TypeError, r'ragged_rank expected int, got \'bad\'')
+      },
+      {
+          'tensor': [[1]],
+          'ragged_rank': 0,
+          'error': (ValueError, r'ragged_rank must be greater than 0; got 0')
+      },
+      {
+          'tensor': [[1]],
+          'ragged_rank': -1,
+          'error': (ValueError, r'ragged_rank must be greater than 0; got -1')
+      },
+  )
+  def testErrors(self,
+                 tensor,
+                 lengths=None,
+                 padding=None,
+                 ragged_rank=1,
+                 error=None):
+    dt = constant_op.constant(tensor)
+    self.assertRaisesRegexp(error[0], error[1], RaggedTensor.from_tensor, dt,
+                            lengths, padding, ragged_rank)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_functional_ops.py b/tensorflow/python/ops/ragged/ragged_functional_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..751f2c73592c676d0dd5eec4f9dc45430cd646b1
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_functional_ops.py
@@ -0,0 +1,114 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for ragged tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+def map_flat_values(op, *args, **kwargs):
+  """Applies `op` to the inner values of one or more RaggedTensors.
+
+  Replaces any `RaggedTensor` in `args` or `kwargs` with its `flat_values`
+  tensor, and then calls `op`.  Returns a `RaggedTensor` that is constructed
+  from the input `RaggedTensor`s' `splits` and the value returned by
+  the `op`.
+
+  If the input arguments contain multiple `RaggedTensor`s, then they must have
+  identical `splits`.
+
+  Examples:
+
+  ```python
+  >>> rt = ragged.constant([[1, 2, 3], [], [4, 5], [6]])
+  >>> ragged.map_flat_values(tf.ones_like, rt).eval().tolist()
+  [[1, 1, 1], [], [1, 1], [1]]
+  >>> ragged.map_flat_values(tf.multiply, rt, rt).eval().tolist()
+  [[1, 4, 9], [], [16, 25], [36]]
+  >>> ragged.map_flat_values(tf.add, rt, 5).eval().tolist()
+  [[6, 7, 8], [], [9, 10], [11]]
+  ```
+
+  Args:
+    op: The operation that should be applied to the RaggedTensor `flat_values`.
+      `op` is typically an element-wise operation (such as math_ops.add), but
+      any operation that preserves the size of the outermost dimension can be
+      used.  I.e., `shape[0]` of the value returned by `op` must match
+      `shape[0]` of the `RaggedTensor`s' `flat_values` tensors.
+    *args: Arguments for `op`.
+    **kwargs: Keyword arguments for `op`.
+
+  Returns:
+    A `RaggedTensor` whose `ragged_rank` matches the `ragged_rank` of all
+    input `RaggedTensor`s.
+  Raises:
+    ValueError: If args contains no `RaggedTensors`, or if the `nested_splits`
+      of the input `RaggedTensor`s are not identical.
+  """
+  # Replace RaggedTensors with their values; and collect the splits tensors
+  # from each RaggedTensor.
+  nested_splits_lists = []
+  inner_args = _replace_ragged_with_flat_values(args, nested_splits_lists)
+  inner_kwargs = _replace_ragged_with_flat_values(kwargs, nested_splits_lists)
+  if not nested_splits_lists:
+    return op(*args, **kwargs)
+
+  with ops.control_dependencies(
+      ragged_util.assert_splits_match(nested_splits_lists)):
+    # Delegate to op, and then compose the result from the transformed values
+    # and the splits.
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        op(*inner_args, **inner_kwargs), nested_splits_lists[0])
+
+
+def _replace_ragged_with_flat_values(value, nested_splits_lists):
+  """Replace RaggedTensors with their flat_values, and record their splits.
+
+  Returns a copy of `value`, with any nested `RaggedTensor`s replaced by their
+  `flat_values` tensor.  Looks inside lists, tuples, and dicts.
+
+  Appends each `RaggedTensor`'s `nested_splits` to `nested_splits_lists`.
+
+  Args:
+    value: The value that should be transformed by replacing `RaggedTensors`.
+    nested_splits_lists: An output parameter used to record the `nested_splits`
+      for any `RaggedTensors` that were replaced.
+
+  Returns:
+    A copy of `value` with nested `RaggedTensors` replaced by their `values`.
+  """
+  # Base case
+  if ragged_tensor.is_ragged(value):
+    value = ragged_tensor.convert_to_tensor_or_ragged_tensor(value)
+    nested_splits_lists.append(value.nested_row_splits)
+    return value.flat_values
+
+  # Recursion cases
+  def recurse(v):
+    return _replace_ragged_with_flat_values(v, nested_splits_lists)
+
+  if isinstance(value, list):
+    return [recurse(v) for v in value]
+  elif isinstance(value, tuple):
+    return tuple(recurse(v) for v in value)
+  elif isinstance(value, dict):
+    return dict((k, recurse(v)) for (k, v) in value.items())
+  else:
+    return value
diff --git a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6673192752e613f671c175193fce83fbba60e48d
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
@@ -0,0 +1,232 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.ragged.gather_nd."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
+                           parameterized.TestCase):
+
+  DOCSTRING_PARAMS = [[['000', '001'], ['010']],
+                      [['100'], ['110', '111', '112'], ['120']],
+                      [[], ['210']]]  # pyformat: disable
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring Examples
+      #=========================================================================
+      dict(
+          descr='Docstring example 1',
+          params=ragged.constant_value(DOCSTRING_PARAMS),
+          indices=[[2], [0]],
+          expected=ragged.constant_value([[[], [b'210']],
+                                          [[b'000', b'001'], [b'010']]])),
+      dict(
+          descr='Docstring example 2',
+          params=ragged.constant_value(DOCSTRING_PARAMS),
+          indices=[[2, 1], [0, 0]],
+          expected=ragged.constant_value([[b'210'], [b'000', b'001']])),
+      dict(
+          descr='Docstring example 3',
+          params=ragged.constant_value(DOCSTRING_PARAMS),
+          indices=[[0, 0, 1], [1, 1, 2]],
+          expected=[b'001', b'112']),
+      #=========================================================================
+      # Indices with 0 values (selects the entire params)
+      #=========================================================================
+      dict(
+          descr='params: [B1, (B2)], indices: [0], result: [B1, (B2)]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=np.zeros([0], dtype=np.int32),
+          expected=ragged.constant_value([[b'a', b'b', b'c'], [b'd']])),
+      dict(
+          descr='params: [B1, (B2)], indices: [A1, 0], result: [A1, B1, (B2)]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=np.zeros([3, 0], dtype=np.int32),
+          expected=ragged.constant_value([[[b'a', b'b', b'c'], [b'd']],
+                                          [[b'a', b'b', b'c'], [b'd']],
+                                          [[b'a', b'b', b'c'], [b'd']]])),
+      dict(
+          descr=('params: [B1, (B2)], indices: [A1, A2, 0], '
+                 'result: [A1, A2, B1, (B2)]'),
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=np.zeros([1, 3, 0], dtype=np.int32),
+          expected=ragged.constant_value([[[[b'a', b'b', b'c'], [b'd']],
+                                           [[b'a', b'b', b'c'], [b'd']],
+                                           [[b'a', b'b', b'c'], [b'd']]]])),
+      dict(
+          descr='params: [B1], indices: [A1, (A2), 0], result: [A1, (A2), B1]',
+          params=['a'],
+          indices=ragged.constant_value([[[], []], [[]]],
+                                        ragged_rank=1,
+                                        dtype=np.int32),
+          expected=ragged.constant_value([[[b'a'], [b'a']], [[b'a']]],
+                                         ragged_rank=1)),
+      #=========================================================================
+      # Indices with 1 value (selects row from params)
+      #=========================================================================
+      dict(
+          descr='params: [B1, (B2)], indices: [A1, 1], result: [A1, (B2)]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=[[1], [0]],
+          expected=ragged.constant_value([[b'd'], [b'a', b'b', b'c']])),
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, 1], '
+                 'result: [A1, (B2), (B3)]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=[[1], [1]],
+          expected=ragged.constant_value([[[b'e', b'f']], [[b'e', b'f']]])),
+      dict(
+          descr=('params: [B1, B2, B3], indices: [A1, (A2), 1], '
+                 'result: [A1, (A2), B2, B3]'),
+          params=[[['a']], [['b']]],
+          indices=ragged.constant_value([[[0]]], ragged_rank=1),
+          expected=ragged.constant_value([[[[b'a']]]], ragged_rank=1)),
+      #=========================================================================
+      # Indices with 2 values (selects row & col from params)
+      #=========================================================================
+      dict(
+          descr='params: [B1, (B2)], indices: [A1, 2], result: [A1]',
+          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          indices=[[1, 0], [0, 0], [0, 2]],
+          expected=ragged.constant_value([b'd', b'a', b'c'])),
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, 2], '
+                 'result: [A1, (B3)]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=[[1, 0], [0, 1], [0, 0]],
+          expected=ragged.constant_value([[b'e', b'f'], [b'd'],
+                                          [b'a', b'b', b'c']])),
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, A2, 2], '
+                 'result: [A1, (A2), (B3)]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=[[[1, 0], [0, 1], [0, 0]]],
+          expected=ragged.constant_value([[[b'e', b'f'], [b'd'],
+                                           [b'a', b'b', b'c']]])),
+      dict(
+          descr=('params: [B1, (B2), B3], indices: [A1, A2, 2], '
+                 'result: [A1, A2, B3]'),
+          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
+                                        [['e', 'f']]],
+                                       ragged_rank=1),
+          indices=[[[1, 0], [0, 1], [0, 0]]],
+          expected=[[[b'e', b'f'], [b'c', b'd'], [b'a', b'b']]]),
+      dict(
+          descr=('params: [B1, (B2), B3], indices: [A1, A2, A3, 2], '
+                 'result: [A1, A2, A3, B3]'),
+          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
+                                        [['e', 'f']]],
+                                       ragged_rank=1),
+          indices=[[[[1, 0], [0, 1], [0, 0]]]],
+          expected=[[[[b'e', b'f'], [b'c', b'd'], [b'a', b'b']]]]),
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, (A2), 2], '
+                 'result: [A1, (A2), (B3)]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=ragged.constant_value([[[1, 0], [0, 1]], [[0, 0]]],
+                                        ragged_rank=1),
+          expected=ragged.constant_value([[[b'e', b'f'], [b'd']],
+                                          [[b'a', b'b', b'c']]])),
+      #=========================================================================
+      # Indices with 3 values
+      #=========================================================================
+      dict(
+          descr=('params: [B1, (B2), (B3)], indices: [A1, 3], '
+                 'result: [A1]'),
+          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
+                                        [['e', 'f']]]),
+          indices=[[1, 0, 1], [0, 0, 0], [0, 1, 0]],
+          expected=[b'f', b'a', b'd']),
+      dict(
+          descr=('params: [B1, (B2), B3], indices: [A1, 3], '
+                 'result: [A1]'),
+          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
+                                        [['e', 'f']]],
+                                       ragged_rank=1),
+          indices=[[1, 0, 1], [0, 0, 0], [0, 1, 1]],
+          expected=[b'f', b'a', b'd']),
+      dict(
+          descr=('params: [B1, (B2), (B3), B4], indices: [A1, 3], '
+                 'result: [A1, B4]'),
+          params=ragged.constant_value([[[['a', 'b'], ['c', 'd']],
+                                         [['e', 'f']]]],
+                                       ragged_rank=2),
+          indices=[[0, 0, 1], [0, 0, 0], [0, 1, 0]],
+          expected=[[b'c', b'd'], [b'a', b'b'], [b'e', b'f']]),
+  ])  # pyformat: disable
+  def testRaggedGatherNd(self, descr, params, indices, expected):
+    result = ragged.gather_nd(params, indices)
+    self.assertRaggedEqual(result, expected)
+
+  def testRaggedGatherNdUnknownRankError(self):
+    if context.executing_eagerly():
+      return
+    params = ragged.constant([['a', 'b'], ['c', 'd']])
+    indices1 = array_ops.placeholder(dtypes.int32, shape=None)
+    indices2 = array_ops.placeholder(dtypes.int32, shape=[None])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'indices.rank be statically known.'):
+      ragged.gather_nd(params, indices1)
+    with self.assertRaisesRegexp(
+        ValueError, r'indices.shape\[-1\] must be statically known.'):
+      ragged.gather_nd(params, indices2)
+
+  @parameterized.parameters([
+      dict(
+          params=['a'],
+          indices=0,
+          error=(ValueError, errors.InvalidArgumentError)),
+      dict(
+          params=ragged.constant_value([['a']]),
+          indices=0,
+          message='indices.rank must be at least 1.'),
+      dict(
+          params=['a', 'b', 'c'],
+          indices=ragged.constant_value([[0]]),
+          message='The innermost dimension of indices may not be ragged'),
+  ])
+  def testRaggedGatherNdStaticError(self,
+                                    params,
+                                    indices,
+                                    message=None,
+                                    error=ValueError):
+    with self.assertRaisesRegexp(error, message):
+      ragged.gather_nd(params, indices)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..42efdc8a7d384744041454b5e0bb90e5618b7184
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
@@ -0,0 +1,138 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.gather."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
+
+  def testDocStringExamples(self):
+    params = constant_op.constant(['a', 'b', 'c', 'd', 'e'])
+    indices = constant_op.constant([3, 1, 2, 1, 0])
+    ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+    ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
+    self.assertRaggedEqual(
+        ragged.gather(params, ragged_indices),
+        [[b'd', b'b', b'c'], [b'b'], [], [b'a']])
+    self.assertRaggedEqual(
+        ragged.gather(ragged_params, indices),
+        [[b'e'], [b'd'], [], [b'd'], [b'a', b'b', b'c']])
+    self.assertRaggedEqual(
+        ragged.gather(ragged_params, ragged_indices),
+        [[[b'e'], [b'd'], []], [[b'd']], [], [[b'a', b'b', b'c']]])
+
+  def testTensorParamsAndTensorIndices(self):
+    params = ['a', 'b', 'c', 'd', 'e']
+    indices = [2, 0, 2, 1]
+    self.assertRaggedEqual(
+        ragged.gather(params, indices), [b'c', b'a', b'c', b'b'])
+    self.assertIsInstance(ragged.gather(params, indices), ops.Tensor)
+
+  def testRaggedParamsAndTensorIndices(self):
+    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    indices = [2, 0, 2, 1]
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[b'f'], [b'a', b'b'], [b'f'], [b'c', b'd', b'e']])
+
+  def testTensorParamsAndRaggedIndices(self):
+    params = ['a', 'b', 'c', 'd', 'e']
+    indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[b'c', b'b'], [b'b', b'c', b'a'], [b'd']])
+
+  def testRaggedParamsAndRaggedIndices(self):
+    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[[b'f'], [b'c', b'd', b'e']],                # [[p[2], p[1]      ],
+         [[b'c', b'd', b'e'], [b'f'], [b'a', b'b']],  #  [p[1], p[2], p[0]],
+         [[]]]                                        #  [p[3]            ]]
+    )  # pyformat: disable
+
+  def testRaggedParamsAndScalarIndices(self):
+    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    indices = 1
+    self.assertRaggedEqual(ragged.gather(params, indices), [b'c', b'd', b'e'])
+
+  def test3DRaggedParamsAnd2DTensorIndices(self):
+    params = ragged.constant([[['a', 'b'], []], [['c', 'd'], ['e'], ['f']],
+                              [['g']]])
+    indices = [[1, 2], [0, 1], [2, 2]]
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[[[b'c', b'd'], [b'e'], [b'f']], [[b'g']]],            # [[p1, p2],
+         [[[b'a', b'b'], []], [[b'c', b'd'], [b'e'], [b'f']]],  #  [p0, p1],
+         [[[b'g']], [[b'g']]]]                                  #  [p2, p2]]
+    )  # pyformat: disable
+
+  def testTensorParamsAnd4DRaggedIndices(self):
+    indices = ragged.constant(
+        [[[[3, 4], [0, 6]], []], [[[2, 1], [1, 0]], [[2, 5]], [[2, 3]]],
+         [[[1, 0]]]],  # pyformat: disable
+        ragged_rank=2,
+        inner_shape=(2,))
+    params = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[[[b'd', b'e'], [b'a', b'g']], []],
+         [[[b'c', b'b'], [b'b', b'a']], [[b'c', b'f']], [[b'c', b'd']]],
+         [[[b'b', b'a']]]])  # pyformat: disable
+
+  def testOutOfBoundsError(self):
+    tensor_params = ['a', 'b', 'c']
+    tensor_indices = [0, 1, 2]
+    ragged_params = ragged.constant([['a', 'b'], ['c']])
+    ragged_indices = ragged.constant([[0, 3]])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'indices\[1\] = 3 is not in \[0, 3\)'):
+      self.evaluate(ragged.gather(tensor_params, ragged_indices))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'indices\[2\] = 2 is not in \[0, 2\)'):
+      self.evaluate(ragged.gather(ragged_params, tensor_indices))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'indices\[1\] = 3 is not in \[0, 2\)'):
+      self.evaluate(ragged.gather(ragged_params, ragged_indices))
+
+  def testUnknownIndicesRankError(self):
+    if context.executing_eagerly():
+      return
+    params = ragged.constant([], ragged_rank=1)
+    indices = constant_op.constant([0], dtype=dtypes.int64)
+    indices = array_ops.placeholder_with_default(indices, None)
+    self.assertRaisesRegexp(ValueError,
+                            r'indices\.shape\.ndims must be known statically',
+                            ragged.gather, params, indices)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fa72a36581150cd9408aa7bf12467bfaaab8893
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_getitem.py
@@ -0,0 +1,388 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python-style indexing and slicing for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+def ragged_tensor_getitem(self, key):
+  """Returns the specified piece of this RaggedTensor.
+
+  Supports multidimensional indexing and slicing, with one restriction:
+  indexing into a ragged inner dimension is not allowed.  This case is
+  problematic because the indicated value may exist in some rows but not
+  others.  In such cases, it's not obvious whether we should (1) report an
+  IndexError; (2) use a default value; or (3) skip that value and return a
+  tensor with fewer rows than we started with.  Following the guiding
+  principles of Python ("In the face of ambiguity, refuse the temptation to
+  guess" <go/pep20>), we simply disallow this operation.
+
+  Any dimensions added by `array_ops.newaxis` will be ragged if the following
+  dimension is ragged.
+
+  Args:
+    self: The RaggedTensor to slice.
+    key: Indicates which piece of the RaggedTensor to return, using standard
+      Python semantics (e.g., negative values index from the end).  `key`
+      may have any of the following types:
+
+      * `int` constant
+      * Scalar integer `Tensor`
+      * `slice` containing integer constants and/or scalar integer
+        `Tensor`s
+      * `Ellipsis`
+      * `tf.newaxis`
+      * `tuple` containing any of the above (for multidimentional indexing)
+
+  Returns:
+    A `Tensor` or `RaggedTensor` object.  Values that include at least one
+    ragged dimension are returned as `RaggedTensor`.  Values that include no
+    ragged dimensions are returned as `Tensor`.  See above for examples of
+    expressions that return `Tensor`s vs `RaggedTensor`s.
+
+  Raises:
+    ValueError: If `key` is out of bounds.
+    ValueError: If `key` is not supported.
+    TypeError: If the indices in `key` have an unsupported type.
+
+  Examples:
+
+    ```python
+    >>> # A 2-D ragged tensor with 1 ragged dimension.
+    >>> rt = ragged.constant([['a', 'b', 'c'], ['d', 'e'], ['f'], ['g']])
+    >>> rt[0].eval().tolist()       # First row (1-D `Tensor`)
+    ['a', 'b', 'c']
+    >>> rt[:3].eval().tolist()      # First three rows (2-D RaggedTensor)
+    [['a', 'b', 'c'], ['d', 'e'], '[f'], [g']]
+    >>> rt[3, 0].eval().tolist()    # 1st element of 4th row (scalar)
+    'g'
+
+    >>> # A 3-D ragged tensor with 2 ragged dimensions.
+    >>> rt = ragged.constant([[[1, 2, 3], [4]],
+    ...                    [[5], [], [6]],
+    ...                    [[7]],
+    ...                    [[8, 9], [10]]])
+    >>> rt[1].eval().tolist()       # Second row (2-D RaggedTensor)
+    [[5], [], [6]]
+    >>> rt[3, 0].eval().tolist()    # First element of fourth row (1-D Tensor)
+    [8, 9]
+    >>> rt[:, 1:3].eval().tolist()  # Items 1-3 of each row (3-D RaggedTensor)
+    [[[4]], [[], [6]], [], [[10]]]
+    >>> rt[:, -1:].eval().tolist()  # Last item of each row (3-D RaggedTensor)
+    [[[4]], [[6]], [[7]], [[10]]]
+    ```
+  """
+  scope_tensors = [self] + list(_tensors_in_key_list(key))
+  if isinstance(key, (list, tuple)):
+    key = list(key)
+  else:
+    key = [key]
+  with ops.name_scope(None, "RaggedGetItem", scope_tensors):
+    return _ragged_getitem(self, key)
+
+
+def _ragged_getitem(rt_input, key_list):
+  """Helper for indexing and slicing ragged tensors with __getitem__().
+
+  Extracts the specified piece of the `rt_input`.  See
+  `RaggedTensor.__getitem__` for examples and restrictions.
+
+  Args:
+    rt_input: The `RaggedTensor` from which a piece should be returned.
+    key_list: The list of keys specifying which piece to return. Each key
+      corresponds with a separate dimension.
+
+  Returns:
+    The indicated piece of rt_input.
+
+  Raises:
+    ValueError: If `key_list` is not supported.
+    TypeError: If any keys in `key_list` have an unsupported type.
+  """
+  if not key_list:
+    return rt_input
+  row_key = key_list[0]
+  inner_keys = key_list[1:]
+
+  if row_key is Ellipsis:
+    expanded_key_list = _expand_ellipsis(key_list, rt_input.shape.ndims)
+    return _ragged_getitem(rt_input, expanded_key_list)
+
+  # Adding a new axis: Get rt_input[inner_keys], and wrap it in a RaggedTensor
+  # that puts all values in a single row.
+  if row_key is array_ops.newaxis:
+    inner_rt = _ragged_getitem(rt_input, inner_keys)
+    nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
+    return ragged_tensor.RaggedTensor.from_row_splits(
+        inner_rt, array_ops.stack([0, nsplits - 1]))
+
+  # Slicing a range of rows: first slice the outer dimension, and then
+  # call `_ragged_getitem_inner_dimensions` to handle the inner keys.
+  if isinstance(row_key, slice):
+    sliced_rt_input = _slice_ragged_row_dimension(rt_input, row_key)
+    return _ragged_getitem_inner_dimensions(sliced_rt_input, inner_keys)
+
+  # Indexing a single row: slice values to get the indicated row, and then
+  # use a recursive call to __getitem__ to handle the inner keys.
+  else:
+    starts = rt_input.row_splits[:-1]
+    limits = rt_input.row_splits[1:]
+    row = rt_input.values[starts[row_key]:limits[row_key]]
+    return row.__getitem__(inner_keys)
+
+
+def _slice_ragged_row_dimension(rt_input, row_key):
+  """Slice the outer dimension of `rt_input` according to the given `slice`.
+
+  Args:
+    rt_input: The `RaggedTensor` to slice.
+    row_key: The `slice` object that should be used to slice `rt_input`.
+
+  Returns:
+    A `RaggedTensor` containing the indicated slice of `rt_input`.
+  """
+  if row_key.start is None and row_key.stop is None and row_key.step is None:
+    return rt_input
+
+  # Use row_key to slice the starts & limits.
+  new_starts = rt_input.row_splits[:-1][row_key]
+  new_limits = rt_input.row_splits[1:][row_key]
+  zero_pad = array_ops.zeros([1], dtypes.int64)
+
+  # If there's no slice step, then we can just select a single continuous
+  # span of `ragged.values(rt_input)`.
+  if row_key.step is None or row_key.step == 1:
+    # Construct the new splits.  If new_starts and new_limits are empty,
+    # then this reduces to [0].  Otherwise, this reduces to:
+    #   concat([[new_starts[0]], new_limits])
+    new_splits = array_ops.concat(
+        [zero_pad[array_ops.size(new_starts):], new_starts[:1], new_limits],
+        axis=0)
+    values_start = new_splits[0]
+    values_limit = new_splits[-1]
+    return ragged_tensor.RaggedTensor.from_row_splits(
+        rt_input.values[values_start:values_limit], new_splits - values_start)
+
+  # If there is a slice step (aka a strided slice), then use ragged_gather to
+  # collect the necessary elements of `ragged.values(rt_input)`.
+  else:
+    return _build_ragged_tensor_from_value_ranges(new_starts, new_limits, 1,
+                                                  rt_input.values)
+
+
+def _ragged_getitem_inner_dimensions(rt_input, key_list):
+  """Retrieve inner dimensions, keeping outermost dimension unchanged.
+
+  Args:
+    rt_input: The `RaggedTensor` or `Tensor` from which a piece should be
+      extracted.
+    key_list: The __getitem__ keys for slicing the inner dimensions.
+
+  Returns:
+    A `RaggedTensor`.
+
+  Raises:
+    ValueError: If key_list is not supported.
+  """
+  if not key_list:
+    return rt_input
+
+  if isinstance(rt_input, ops.Tensor):
+    return rt_input.__getitem__([slice(None, None, None)] + key_list)
+
+  column_key = key_list[0]
+  if column_key is Ellipsis:
+    expanded_key_list = _expand_ellipsis(key_list, rt_input.values.shape.ndims)
+    return _ragged_getitem_inner_dimensions(rt_input, expanded_key_list)
+
+  # Adding a new axis to a ragged inner dimension: recursively get the inner
+  # dimensions of rt_input with key_list[1:], and then wrap the result in a
+  # RaggedTensor that puts each value in its own row.
+  if column_key is array_ops.newaxis:
+    inner_rt = _ragged_getitem_inner_dimensions(rt_input, key_list[1:])
+    nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
+    return ragged_tensor.RaggedTensor.from_row_splits(inner_rt,
+                                                      math_ops.range(nsplits))
+
+  # Slicing a range of columns in a ragged inner dimension.  We use a
+  # recursive call to process the values, and then assemble a RaggedTensor
+  # with those values.
+  if isinstance(column_key, slice):
+    if (column_key.start is None and column_key.stop is None and
+        column_key.step is None):
+      # Trivial slice: recursively process all values, & splits is unchanged.
+      return rt_input.with_values(
+          _ragged_getitem_inner_dimensions(rt_input.values, key_list[1:]))
+    else:
+      # Nontrivial slice: use ragged_gather to extract the indicated slice as
+      # a new RaggedTensor (inner_rt), and then recursively process its values.
+      # The splits can be taken from inner_rt.row_splits().
+      inner_rt_starts = rt_input.row_splits[:-1]
+      inner_rt_limits = rt_input.row_splits[1:]
+      if column_key.start is not None and column_key.start != 0:
+        inner_rt_starts = _add_offset_to_ranges(
+            column_key.start, rt_input.row_splits[:-1], rt_input.row_splits[1:])
+      if column_key.stop is not None and column_key.stop != 0:
+        inner_rt_limits = _add_offset_to_ranges(
+            column_key.stop, rt_input.row_splits[:-1], rt_input.row_splits[1:])
+      inner_rt = _build_ragged_tensor_from_value_ranges(
+          inner_rt_starts, inner_rt_limits, column_key.step, rt_input.values)
+      return inner_rt.with_values(
+          _ragged_getitem_inner_dimensions(inner_rt.values, key_list[1:]))
+
+  # Indexing a single column in a ragged inner dimension: raise an Exception.
+  # See RaggedTensor.__getitem__.__doc__ for an explanation of why indexing
+  # into a ragged inner dimension is problematic.
+  else:
+    raise ValueError("Cannot index into an inner ragged dimension.")
+
+
+def _expand_ellipsis(key_list, num_remaining_dims):
+  """Expands the ellipsis at the start of `key_list`.
+
+  Assumes that the first element of `key_list` is Ellipsis.  This will either
+  remove the Ellipsis (if it corresponds to zero indices) or prepend a new
+  `slice(None, None, None)` (if it corresponds to more than zero indices).
+
+  Args:
+    key_list: The arguments to `__getitem__()`.
+    num_remaining_dims: The number of dimensions remaining.
+
+  Returns:
+    A copy of `key_list` with he ellipsis expanded.
+  Raises:
+    ValueError: If ragged_rank.shape.ndims is None
+    IndexError: If there are too many elements in `key_list`.
+  """
+  if num_remaining_dims is None:
+    raise ValueError("Ellipsis not supported for unknown shape RaggedTensors")
+  num_indices = sum(1 for idx in key_list if idx is not array_ops.newaxis)
+  if num_indices > num_remaining_dims + 1:
+    raise IndexError("Too many indices for RaggedTensor")
+  elif num_indices == num_remaining_dims + 1:
+    return key_list[1:]
+  else:
+    return [slice(None, None, None)] + key_list
+
+
+def _tensors_in_key_list(key_list):
+  """Generates all Tensors in the given slice spec."""
+  if isinstance(key_list, ops.Tensor):
+    yield key_list
+  if isinstance(key_list, (list, tuple)):
+    for v in key_list:
+      for tensor in _tensors_in_key_list(v):
+        yield tensor
+  if isinstance(key_list, slice):
+    for tensor in _tensors_in_key_list(key_list.start):
+      yield tensor
+    for tensor in _tensors_in_key_list(key_list.stop):
+      yield tensor
+    for tensor in _tensors_in_key_list(key_list.step):
+      yield tensor
+
+
+def _build_ragged_tensor_from_value_ranges(starts, limits, step, values):
+  """Returns a `RaggedTensor` containing the specified sequences of values.
+
+  Returns a RaggedTensor `output` where:
+
+  ```python
+  output.shape[0] = starts.shape[0]
+  output[i] = values[starts[i]:limits[i]:step]
+  ```
+
+  Requires that `starts.shape == limits.shape` and
+  `0 <= starts[i] <= limits[i] <= values.shape[0]`.
+
+  Args:
+    starts: 1D integer Tensor specifying the start indices for the sequences of
+      values to include.
+    limits: 1D integer Tensor specifying the limit indices for the sequences of
+      values to include.
+    step: Integer value specifying the step size for strided slices.
+    values: The set of values to select from.
+
+  Returns:
+    A `RaggedTensor`.
+
+  Raises:
+    ValueError: Until the prerequisite ops are checked in.
+  """
+  # Use `ragged_range` to get the index of each value we should include.
+  if step is None:
+    step = 1
+  step = ops.convert_to_tensor(step, name="step")
+  if step.dtype.is_integer:
+    step = math_ops.cast(step, dtypes.int64)
+  else:
+    raise TypeError("slice strides must be integers or None")
+  value_indices = ragged_math_ops.range(starts, limits, step)
+
+  # Use `ragged_gather` or `array_ops.gather` to collect the values.
+  if isinstance(values, ragged_tensor.RaggedTensor):
+    gathered_values = ragged_array_ops.gather(
+        params=values, indices=value_indices.values)
+  else:
+    gathered_values = array_ops.gather(
+        params=values, indices=value_indices.values)
+
+  # Assemble the RaggedTensor from splits & values.
+  return value_indices.with_values(gathered_values)
+
+
+def _add_offset_to_ranges(offset, starts, limits):
+  """Adds an indexing offset to each of the specified ranges.
+
+  If offset>=0, then return output[i]=min(starts[i]+offset, limits[i])
+  If offset<0, then return output[i]=max(limits[i]+offset, starts[i])
+
+  Args:
+    offset: The offset to add.  None, or an int, or a scalar Tensor.
+    starts: 1-D int64 tensor containing start indices.
+    limits: 1-D int64 tensor containing limit indices.
+
+  Returns:
+    A 1-D int64 tensor.
+  """
+
+  def map_positive_offset(offset):
+    return math_ops.minimum(starts + offset, limits)
+
+  def map_negative_offset(offset):
+    return math_ops.maximum(limits + offset, starts)
+
+  if isinstance(offset, ops.Tensor):
+    offset = math_ops.cast(offset, dtypes.int64)
+    return control_flow_ops.cond(offset >= 0,
+                                 lambda: map_positive_offset(offset),
+                                 lambda: map_negative_offset(offset))
+  elif isinstance(offset, int):
+    return (map_positive_offset(offset)
+            if offset > 0 else map_negative_offset(offset))
+
+  else:
+    raise TypeError("slice offsets must be integers or None")
diff --git a/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py b/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b28cac99db29e9ab2a2758db3449413b83cd747
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
@@ -0,0 +1,205 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.map_flat_values."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedMapInnerValuesOpTest(ragged_test_util.RaggedTensorTestCase):
+
+  def assertRaggedMapInnerValuesReturns(self,
+                                        op,
+                                        expected,
+                                        args=(),
+                                        kwargs=None):
+    kwargs = kwargs or {}
+    result = ragged.map_flat_values(op, *args, **kwargs)
+    with self.test_session():
+      self.assertRaggedEqual(result, expected)
+
+  def testDocStringExamples(self):
+    """Test the examples in apply_op_to_ragged_values.__doc__."""
+    rt = ragged.constant([[1, 2, 3], [], [4, 5], [6]])
+    v1 = ragged.map_flat_values(array_ops.ones_like, rt)
+    v2 = ragged.map_flat_values(math_ops.multiply, rt, rt)
+    v3 = ragged.map_flat_values(math_ops.add, rt, 5)
+    with self.test_session():
+      self.assertRaggedEqual(v1, [[1, 1, 1], [], [1, 1], [1]])
+      self.assertRaggedEqual(v2, [[1, 4, 9], [], [16, 25], [36]])
+      self.assertRaggedEqual(v3, [[6, 7, 8], [], [9, 10], [11]])
+
+  def testOpWithSingleRaggedTensorArg(self):
+    tensor = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=array_ops.zeros_like,
+        args=(tensor,),
+        expected=[[0, 0, 0], [], [0, 0]])
+
+  def testOpWithTwoRaggedTensorArgs(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply, args=(x, y), expected=[[3, 2, 12], [], [4, 25]])
+
+  def testOpWithRaggedTensorAndScalarArgs(self):
+    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply, args=(5, y), expected=[[5, 10, 15], [], [20, 25]])
+
+  def testOpWithThreeRaggedTensorArgs(self):
+    condition = ragged.constant(
+        [[True, True, False], [], [True, False]])  # pyformat: disable
+    x = ragged.constant([['a', 'b', 'c'], [], ['d', 'e']])
+    y = ragged.constant([['A', 'B', 'C'], [], ['D', 'E']])
+    self.assertRaggedMapInnerValuesReturns(
+        op=array_ops.where,
+        args=(condition, x, y),
+        expected=[[b'a', b'b', b'C'], [], [b'd', b'E']])
+
+  def testOpWithRaggedTensorListArg(self):
+    x = ragged.constant([[1, 2, 3], [], [4, 5]])
+    y = ragged.constant([[10, 20, 30], [], [40, 50]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.add_n,
+        args=([x, y, x],),
+        expected=[[12, 24, 36], [], [48, 60]])
+
+  def testOpWithKeywordArgs(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        kwargs=dict(x=x, y=y),
+        expected=[[3, 2, 12], [], [4, 25]])
+
+  def testOpWithMixedPositionalAndKeywordArgs(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x,),
+        kwargs=dict(y=y),
+        expected=[[3, 2, 12], [], [4, 25]])
+
+  def testNonElementWiseOp(self):
+    x = ragged.constant(
+        [[[3, 1, 4], [1, 5, 9], [2, 6, 5]], [], [[3, 5, 8], [9, 7, 9]]],
+        ragged_rank=1)
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.reduce_sum,
+        kwargs={
+            'input_tensor': x,
+            'axis': 1,
+        },
+        expected=[[8, 15, 13], [], [16, 25]])
+
+  def testOpWithRaggedRankGreaterThanOne(self):
+    # ragged_rank=0
+    x0 = [3, 1, 4, 1, 5, 9, 2, 6, 5]
+    y0 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    with self.test_session():
+      self.assertRaggedEqual(
+          math_ops.multiply(x0, y0), [3, 2, 12, 4, 25, 54, 14, 48, 45])
+
+    # ragged_rank=1
+    x1 = ragged.constant([[3, 1, 4], [], [1, 5], [9, 2], [6, 5]])
+    y1 = ragged.constant([[1, 2, 3], [], [4, 5], [6, 7], [8, 9]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x1, y1),
+        expected=[[3, 2, 12], [], [4, 25], [54, 14], [48, 45]])
+
+    # ragged_rank=2
+    x2 = ragged.constant([[[3, 1, 4]], [], [[], [1, 5]], [[9, 2], [6, 5]]])
+    y2 = ragged.constant([[[1, 2, 3]], [], [[], [4, 5]], [[6, 7], [8, 9]]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x2, y2),
+        expected=[[[3, 2, 12]],          # row 0
+                  [],                    # row 1
+                  [[], [4, 25]],         # row 2
+                  [[54, 14], [48, 45]]   # row 3
+                 ])  # pyformat: disable
+
+    # ragged_rank=3
+    x3 = ragged.constant([[[[3, 1, 4]], []], [], [[[], [1, 5]]],
+                          [[[9, 2], [6, 5]]]])
+    y3 = ragged.constant([[[[1, 2, 3]], []], [], [[[], [4, 5]]],
+                          [[[6, 7], [8, 9]]]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x3, y3),
+        expected=[
+            [[[3, 2, 12]], []],       # row 0
+            [],                       # row 1
+            [[[], [4, 25]]],          # row 2
+            [[[54, 14], [48, 45]]]    # row 3
+        ])  # pyformat: disable
+
+  def testOpWithRaggedRankThree(self):
+    x = ragged.constant([[[3, 1, 4]], [], [[], [1, 5]]])
+    y = ragged.constant([[[1, 2, 3]], [], [[], [4, 5]]])
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply,
+        args=(x, y),
+        expected=[[[3, 2, 12]], [], [[], [4, 25]]])
+
+  def testOpWithInnerValuesOnly(self):
+    x = constant_op.constant([[1, 2], [3, 4], [5, 6]])
+    y = constant_op.constant(2)
+    self.assertRaggedMapInnerValuesReturns(
+        op=math_ops.multiply, args=(x, y), expected=[[2, 4], [6, 8], [10, 12]])
+
+  def testRaggedTensorSplitsRaggedRankMismatchError(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[[3, 1, 4], []], [], [[1, 5]]])
+    self.assertRaisesRegexp(ValueError,
+                            r'Inputs must have identical ragged splits.*',
+                            ragged.map_flat_values, math_ops.add, x, y)
+
+  def testRaggedTensorSplitsValueMismatchError(self):
+    x = ragged.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged.constant([[1], [2, 3], [4, 5]])
+    self.assertRaisesRegexp(errors.InvalidArgumentError,
+                            r'Inputs must have identical ragged splits.*',
+                            ragged.map_flat_values, math_ops.add, x, y)
+
+  def testRaggedTensorSplitsMismatchErrorAtRuntime(self):
+    splits1 = array_ops.placeholder_with_default(
+        constant_op.constant([0, 3, 3, 5], dtypes.int64), None)
+    splits2 = array_ops.placeholder_with_default(
+        constant_op.constant([0, 1, 3, 5], dtypes.int64), None)
+    x = ragged.RaggedTensor.from_row_splits([3, 1, 4, 1, 5], splits1)
+    y = ragged.RaggedTensor.from_row_splits([1, 2, 3, 4, 5], splits2)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*Inputs must have identical ragged splits'):
+      self.evaluate(ragged.map_flat_values(math_ops.add, x, y))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..49c0996b24f30dd33219d3292446239717bbf487
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
@@ -0,0 +1,275 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.map_fn."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops as mo
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
+                      parameterized.TestCase):
+
+  @parameterized.parameters([
+      # The following test sets map over a RaggedTensor and apply a
+      # transformation that returns with shape:
+      # [d1, (d2)] -> [d1]
+      dict(
+          fn=mo.reduce_mean,
+          elems=[[1, 2, 3], [4, 5], [6, 7]],
+          expected_output=[2, 4, 6],
+      ),
+      dict(
+          fn=string_ops.reduce_join,
+          elems=[['foo', 'bar', 'baz'], ['a'], ['b', 'c']],
+          expected_output=[b'foobarbaz', b'a', b'bc'],
+          dtype=dtypes.string,
+      ),
+      # [d1, (d2)] -> [d1, 2]
+      dict(
+          fn=lambda x: array_ops.stack([mo.reduce_mean(x), mo.reduce_sum(x)]),
+          # fn=self.stack_mean_and_sum,
+          elems=[[1, 2, 3], [4, 5], [6, 7]],
+          expected_output=[[2, 6], [4.5, 9], [6.5, 13]],
+          dtype=dtypes.float32,
+          expected_ragged_rank=0,
+      ),
+      # [d1, (d2)] -> [d1, (d2)]
+      dict(
+          fn=lambda x: x + np.int64(1),
+          elems=[[1, 2, 3], [4, 5], [6, 7]],
+          expected_output=[[2, 3, 4], [5, 6], [7, 8]],
+          dtype=dtypes.int64,
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
+      ),
+      # [d1, (d2), d3] -> [d1, (d2), d3]
+      dict(
+          fn=lambda x: x + np.int64(1),
+          elems=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
+          elems_ragged_rank=1,
+          expected_ragged_rank=1,
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
+          expected_output=[[[2, 3], [4, 5]], [], [[6, 7], [8, 9], [10, 1]]],
+      ),
+      # [d1, (d2)] -> [d1, (d2), (d3)]
+      dict(
+          fn=lambda x: ragged.RaggedTensor.from_row_starts(x, [0]),
+          elems=[[1, 2, 3], [4, 5], [6, 7]],
+          expected_output=[[[1, 2, 3]], [[4, 5]], [[6, 7]]],
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=2),
+      ),
+      # [d1, (d2), (d3)] -> [d1, (d2), (d3)]
+      dict(
+          fn=lambda x: ragged.map_flat_values(mo.add, x, 1),
+          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
+          expected_output=[[[2, 3, 4]], [[5, 6], [7, 8]]],
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=2),
+      ),
+      # [d1, (d2), (d3)] -> [d1, (d2)]
+      dict(
+          fn=lambda x: ragged.reduce_sum(x, axis=1),
+          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
+          expected_output=[[6], [9, 13]],
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
+      ),
+      # [d1, (d2), (d3)] -> [d1, (d3)]
+      dict(
+          fn=lambda x: ragged.reduce_sum(x, axis=0),
+          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
+          expected_output=[[1, 2, 3], [10, 12]],
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
+      ),
+      # [d1, (d2), (d3)] -> [d1]
+      dict(
+          fn=ragged.reduce_sum,
+          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
+          expected_output=[6, 22],
+          result_dtype=dtypes.int64,
+      ),
+      # [d1] -> [d1, (d2)]
+      dict(
+          fn=mo.range,
+          elems=[4, 0, 2],
+          expected_output=[[0, 1, 2, 3], [], [0, 1]],
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
+      ),
+      # [d1] -> [d1, (d2), (d3)]
+      dict(
+          fn=lambda x: ragged.range(mo.range(x)),
+          elems=[5, 0, 3],
+          expected_output=[[[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]], [],
+                           [[], [0], [0, 1]]],
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=2),
+      ),
+      # [d1, (d2), (d3), (d4a), (d5)] ->  [d1, (d2), (d3), (d4b), (d5)]
+      dict(
+          fn=lambda x: x + np.int64(1),
+          elems=[[[[[1, 2, 3]], [[4], [5]]]], [[[[6, 7]]], [[[8], []]]]],
+          expected_output=[[[[[2, 3, 4]], [[5], [6]]]], [[[[7, 8]]], [[[9],
+                                                                       []]]]],
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=4),
+      ),
+  ])
+
+  def testRaggedMap(
+      self,
+      fn,
+      elems,
+      expected_output,
+      expected_ragged_rank=None,
+      result_ragged_rank=None,
+      elems_ragged_rank=None,
+      dtype=dtypes.int64,
+      result_dtype=None,
+      infer_shape=False,
+  ):
+    elems = ragged.constant(elems, dtype, elems_ragged_rank)
+    output = ragged.map_fn(
+        fn=fn, elems=elems, dtype=result_dtype, infer_shape=infer_shape)
+
+    expected_rt = ragged.constant(
+        expected_output, ragged_rank=expected_ragged_rank)
+    self.assertRaggedEqual(expected_rt, output)
+
+  def testRaggedMapOnStructure(self):
+    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
+    # [[10, 20, 30], [40], [50, 60, 70]]
+    robin = ragged.map_flat_values(mo.multiply, batman, 10)
+
+    features = {'batman': batman, 'robin': robin}
+
+    def _reduce_sum_from_all(f):
+      return mo.reduce_sum(f['batman']) + mo.reduce_sum(f['robin'])
+
+    output = ragged.map_fn(
+        fn=_reduce_sum_from_all,
+        elems=features,
+        dtype=dtypes.int32,
+    )
+
+    self.assertRaggedEqual(output, [66, 44, 198])
+
+  # Test mapping over a dict of RTs can produce a dict of RTs.
+  def testRaggedMapOnStructure_RaggedOutputs(self):
+    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
+    # [[10, 20, 30], [40], [50, 60, 70]]
+    robin = ragged.map_flat_values(mo.multiply, batman, 10)
+
+    features = {'batman': batman, 'robin': robin}
+
+    def _increment(f):
+      return {
+          'batman': f['batman'] + 1,
+          'robin': f['robin'] + 1,
+      }
+
+    output = ragged.map_fn(
+        fn=_increment,
+        elems=features,
+        infer_shape=False,
+        dtype={
+            'batman':
+                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1),
+            'robin':
+                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1)
+        },
+    )
+
+    self.assertRaggedEqual(output['batman'], [[2, 3, 4], [5], [6, 7, 8]])
+    self.assertRaggedEqual(output['robin'], [[11, 21, 31], [41], [51, 61, 71]])
+
+  def testZip(self):
+    x = ragged.constant([[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]],
+                        dtypes.int64)
+    y = array_ops.expand_dims(mo.range(x.nrows(), dtype=dtypes.int64), axis=1)
+
+    def _zip(foo):
+      y_val, x_val = foo
+      bar = backend.tile(y_val, array_ops.shape(x_val))
+      return array_ops.stack([bar, x_val], axis=1)
+
+    output = ragged.map_fn(
+        _zip, (y, x),
+        dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=1),
+        infer_shape=False)
+
+    self.assertRaggedEqual(
+        output, [[[0, 10], [0, 20]], [[1, 30], [1, 40]], [[2, 50], [2, 60]],
+                 [[3, 70]], [[4, 80], [4, 90], [4, 100]]])
+
+  def testBatchGather(self):
+    tokens = ragged.constant([['hello', '.', 'there'], ['merhaba'],
+                              ['bonjour', '.', 'ca va', '?']])
+    indices = ragged.constant([[0, 2], [0], [0, 2]])
+
+    def gather(x):
+      tokens_val, indices_val = x
+      return array_ops.gather(tokens_val, indices_val)
+
+    data = tokens, indices
+    out = ragged.map_fn(
+        gather,
+        data,
+        dtype=ragged.RaggedTensorType(dtype=dtypes.string, ragged_rank=1),
+        infer_shape=False)
+
+    self.assertRaggedEqual(
+        out, [[b'hello', b'there'], [b'merhaba'], [b'bonjour', b'ca va']])
+
+  def testMismatchRaggedRank(self):
+    elems = ragged.constant([[[1, 2, 3]], [[4, 5], [6, 7]]])
+    fn = lambda x: ragged.reduce_sum(x, axis=0)
+    with self.assertRaisesWithLiteralMatch(
+        ValueError, r'The declared ragged rank (23) mismatches the result (1)'):
+      _ = ragged.map_fn(
+          fn,
+          elems,
+          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=23))
+
+  def testMismatchRaggedRank2(self):
+    elems = ragged.constant([[1, 2, 3], [4, 5], [6, 7]])
+    fn = lambda x: ragged.RaggedTensor.from_row_starts(x, [0])
+    with self.assertRaisesWithLiteralMatch(
+        ValueError, r'The declared ragged rank (10) mismatches the result (1)'):
+      _ = ragged.map_fn(
+          fn,
+          elems,
+          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=10))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_map_ops.py b/tensorflow/python/ops/ragged/ragged_map_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..af40352b1d02fe8ccce242d31fb33e2f8a21f1ce
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_map_ops.py
@@ -0,0 +1,450 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional operations.
+
+See the [Higher Order
+Functions](https://tensorflow.org/api_guides/python/functional_ops) guide.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import collections
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+
+def map_fn(fn,
+           elems,
+           dtype=None,
+           parallel_iterations=None,
+           back_prop=True,
+           swap_memory=False,
+           infer_shape=True,
+           name=None):
+  """map on the list of tensors unpacked from `elems` on dimension 0.
+
+  The simplest version of `map_fn` repeatedly applies the callable `fn` to a
+  sequence of elements from first to last. The elements are made of the
+  tensors unpacked from `elems`. `dtype` is the data type of the return
+  value of `fn`. Users must provide `dtype` if it is different from
+  the data type of `elems`.
+
+  Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
+  of the result tensor is `[values.shape[0]] + fn(values[0]).shape`.
+
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The signature of `fn` may
+  match the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
+  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
+
+  Furthermore, `fn` may emit a different structure than its input.  For example,
+  `fn` may look like: `fn = lambda t1: return (t1 + 1, t1 - 1)`.  In this case,
+  the `dtype` parameter is not optional: `dtype` must be a type or (possibly
+  nested) tuple of types matching the output of `fn`.
+
+  To apply a functional operation to the nonzero elements of a SparseTensor
+  one of the following methods is recommended. First, if the function is
+  expressible as TensorFlow ops, use
+
+  ```python
+    result = SparseTensor(input.indices, fn(input.values), input.dense_shape)
+  ```
+
+  If, however, the function is not expressible as a TensorFlow op, then use
+
+  ```python
+  result = SparseTensor(
+    input.indices, map_fn(fn, input.values), input.dense_shape)
+  ```
+
+  instead.
+
+  When executing eagerly, map_fn does not execute in parallel even if
+  `parallel_iterations` is set to a value > 1. You can still get the
+  performance benefits of running a function in parallel by using the
+  `tf.contrib.eager.defun` decorator,
+
+  ```python
+  # Assume the function being used in map_fn is fn.
+  # To ensure map_fn calls fn in parallel, use the defun decorator.
+  @tf.contrib.eager.defun
+  def func(tensor):
+    return tf.map_fn(fn, tensor)
+  ```
+
+  Note that if you use the defun decorator, any non-TensorFlow Python code
+  that you may have written in your function won't get executed. See
+  `tf.contrib.eager.defun` for more details. The recommendation would be to
+  debug without defun but switch to defun to get performance benefits of
+  running map_fn in parallel.
+
+  Args:
+    fn: The callable to be performed.  It accepts one argument, which will have
+      the same (possibly nested) structure as `elems`.  Its output must have the
+      same structure as `dtype` if one is provided, otherwise it must have the
+      same structure as `elems`.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which will
+      be unpacked along their first dimension.  The nested sequence of the
+      resulting slices will be applied to `fn`.
+    dtype: (optional) The output type(s) of `fn`.  If `fn` returns a structure
+      of Tensors differing from the structure of `elems`, then `dtype` is not
+      optional and must have the same structure as the output of `fn`. Use
+      `RaggedTensorType` to declare an output of type `RaggedTensor`.
+    parallel_iterations: (optional) The number of iterations allowed to run in
+      parallel. When graph building, the default value is 10. While executing
+      eagerly, the default value is set to 1.
+    back_prop: (optional) True enables support for back propagation.
+    swap_memory: (optional) True enables GPU-CPU memory swapping.
+    infer_shape: (optional) False disables tests for consistent output shapes.
+    name: (optional) Name prefix for the returned tensors.
+
+  Returns:
+    A possibly nested sequence of potentially ragged tensors.  Each
+    tensor packs the results of applying `fn` to tensors unpacked from `elems`
+    along the first dimension, from first to last.
+
+  Raises:
+    TypeError: if `fn` is not callable or the structure of the output of
+      `fn` and `dtype` do not match, or if elems is a SparseTensor.
+    ValueError: if the lengths of the output of `fn` and `dtype` do not match.
+
+  #### Examples:
+
+    ```python
+    elems = np.array([1, 2, 3, 4, 5, 6])
+    squares = map_fn(lambda x: x * x, elems)
+    # squares == [1, 4, 9, 16, 25, 36]
+    ```
+
+    ```python
+    elems = (np.array([1, 2, 3]), np.array([-1, 1, -1]))
+    alternate = map_fn(lambda x: x[0] * x[1], elems, dtype=tf.int64)
+    # alternate == [-1, 2, -3]
+    ```
+
+    ```python
+    elems = np.array([1, 2, 3])
+    alternates = map_fn(lambda x: (x, -x), elems, dtype=(tf.int64, tf.int64))
+    # alternates[0] == [1, 2, 3]
+    # alternates[1] == [-1, -2, -3]
+    ```
+
+    ```python
+    elems=ragged.constant([[1, 2, 3], [4, 5], [6, 7]])
+    mean = map_fn(tf.reduce_mean, elems)
+    # mean == [2, 4, 6]
+    ```
+
+    ```python
+    elems=ragged.constant([[1, 2, 3], [4, 5], [6, 7]], dtype=tf.int64)
+    out = map_fn(fn=lambda x: x+1, elems,
+      dtype=ragged.RaggedTensorType(type=tf.int64, ragged_rank=0))
+    # out = ragged.constant([[2, 3, 4], [5, 6], [7, 8]])
+    ```
+  """
+  if not callable(fn):
+    raise TypeError("fn must be callable.")
+
+  if isinstance(elems, sparse_tensor.SparseTensor):
+    raise TypeError(
+        "To perform a map on the values of a sparse tensor use either "
+        " SparseTensor(input.indices, fn(input.values), input.dense_shape) or "
+        " SparseTensor(input.indices, map_fn(fn, input.values), "
+        "input.dense_shape)")
+
+  in_graph_mode = not context.executing_eagerly()
+  # Set the default number of parallel_iterations depending on graph/eager mode.
+  if in_graph_mode and not parallel_iterations:
+    parallel_iterations = 10
+  elif not in_graph_mode and not parallel_iterations:
+    parallel_iterations = 1
+
+  if not in_graph_mode and parallel_iterations > 1:
+    logging.log_first_n(logging.WARN, "Setting parallel_iterations > 1 has no "
+                        "effect when executing eagerly. Consider calling map_fn"
+                        " with tf.contrib.eager.defun to execute fn in "
+                        "parallel.", 1)
+    parallel_iterations = 1
+
+  input_is_sequence = nest.is_sequence(elems)
+  input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x]
+
+  def input_pack(x):
+    return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]
+
+  elems_flat = input_flatten(elems)
+
+  with ops.name_scope(name, "map", elems_flat):
+    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
+    # supported in Eager
+    if in_graph_mode:
+      # Any get_variable calls in fn will cache the first call locally
+      # and not issue repeated network I/O requests for each iteration.
+      varscope = vs.get_variable_scope()
+      varscope_caching_device_was_none = False
+      if varscope.caching_device is None:
+        # TODO(ebrevdo): Change to using colocate_with here and in other
+        # methods.
+        varscope.set_caching_device(lambda op: op.device)
+        varscope_caching_device_was_none = True
+
+    elems_flat = [
+        ragged_tensor.convert_to_tensor_or_ragged_tensor(elem, name="elem")
+        for elem in elems_flat
+    ]
+
+    # We can either infer the output, or we can assume that it will be the same
+    # as the input structure.
+    dtype = dtype or input_pack([elem.dtype for elem in elems_flat])
+
+    # Find the number of iterations, n may be known statically.
+    if isinstance(elems_flat[0], ragged_tensor.RaggedTensor):
+      n = elems_flat[0].nrows(out_type=dtypes.int32)
+    else:
+      static_shape = elems_flat[0].shape
+      if static_shape.ndims is not None and static_shape.ndims < 1:
+        if len(elems_flat) == 1:
+          raise ValueError(
+              "elems must be a 1+ dimensional Tensor, not a scalar")
+        else:
+          raise ValueError(
+              "elements in elems must be 1+ dimensional Tensors, not scalars")
+      n = (tensor_shape.dimension_value(static_shape[0]) or
+           array_ops.shape(elems_flat[0])[0])
+
+    # Create a flat list of TAs.
+
+    # Flatten the dtype structure to a list.
+    dtype_flat = nest.flatten(dtype)
+
+    # decompose to components
+    dtype_components = [_maybe_decompose_dtype(d) for d in dtype_flat]
+    dtype_components_flat = nest.flatten(dtype_components)
+
+    # Create TensorArrays.
+    accs_ta = [
+        tensor_array_ops.TensorArray(
+            dtype=t, dynamic_size=False, infer_shape=infer_shape, size=n)
+        for t in dtype_components_flat
+    ]
+
+    i = constant_op.constant(0)
+
+    def compute(i, tas):
+      """The loop body of map_fn.
+
+      Args:
+        i: the loop counter
+        tas: the flat TensorArray accumulator list
+
+      Returns:
+        (i + 1, tas): the updated counter + updated TensorArrays
+
+      Raises:
+        TypeError: if dtype and packed_fn_values structure do not match
+        ValueType: if dtype and packed_fn_values lengths do not match
+      """
+      # Get Tensors or RaggedTensors sliced at i, then pack it back to the
+      # original structure.
+      packed_values = input_pack([elem_flat[i] for elem_flat in elems_flat])
+      packed_fn_values = fn(packed_values)
+
+      # Check that the structure of the output matches what was declared or
+      # inferred.
+      # nest.assert_same_structure(dtype or elems, packed_fn_values)
+
+      # Flatten and decompose to a list of Tensors
+      flat_fn_values = nest.flatten(packed_fn_values)
+
+      # If we declared that we are expecting a RaggedTensor output, but we get a
+      # Tensor output. We should try to convert it to a RaggedTensor.
+      flat_fn_composite_tensors = list(
+          _convert_declared(flat_fn_values, dtype_flat))
+
+      flat_fn_components = [
+          _maybe_decompose_tensor(t) for t in flat_fn_composite_tensors
+      ]
+      flat_fn_tensors = nest.flatten(flat_fn_components)
+
+      # Write to TAs.
+      tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_fn_tensors)]
+
+      return (i + 1, tas)
+
+    _, r_a = control_flow_ops.while_loop(
+        lambda i, _: i < n, compute, (i, accs_ta),
+        parallel_iterations=parallel_iterations,
+        back_prop=back_prop,
+        swap_memory=swap_memory,
+        maximum_iterations=n)
+
+    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
+    # supported in Eager
+    if in_graph_mode and varscope_caching_device_was_none:
+      varscope.set_caching_device(None)
+
+    # Pack back into a list of components
+    results_as_components = nest.pack_sequence_as(dtype_components, r_a)
+
+    # Stack TensorArrays for Tensor outputs, and concat RaggedTensor outputs.
+    def _stack_or_concat(e):
+      if isinstance(e, _RaggedTensorComponents):
+        return _concat_ragged_tensor_components(e)
+      else:
+        result = e.stack()
+        return result
+
+    results_flat_components = [
+        _stack_or_concat(e) for e in results_as_components
+    ]
+
+    results_packed = [
+        _maybe_recompose_tensor(c) for c in results_flat_components
+    ]
+    results_packed = nest.pack_sequence_as(dtype, results_packed)
+    return results_packed
+
+
+class _RaggedTensorComponents(
+    collections.namedtuple(
+        "_RaggedTensorComponents",
+        ["flat_values", "nested_row_lengths", "outer_row_length"])):
+  """A namedtuple of components which represent a `RaggedTensor`.
+
+  _RaggedTensorComponents is a list of components which can be used to create a
+  `RaggedTensor`. Use this class to represent a `RaggedTensor` in situations
+  where nest.flatten and nest.pack_sequence_as should decompose ragged tensors
+  into their components..
+
+  The following are a list of components for a `RaggedTensor`:
+
+  flat_values: The flat and inner values of a RaggedTensor. This could be
+    a `Tensor`, a `TensorArray`, or a data type.
+  nested_row_lengths: a tuple containing the row lengths of each rank. The
+    elements of the tuple could be `Tensor`s or `TensorArray`s.
+  outer_row_length: a `Tensor` or `TensorArray` containing the row length of the
+    `RaggedTensor`'s outermost dimension.
+
+  See `RaggedTensor` for more details of the use of each component.
+  """
+  __slots__ = ()
+
+
+def _concat_ragged_tensor_components(rt_ta):
+  flat_values = rt_ta.flat_values.concat()
+  nested_row_lengths = tuple(
+      row_lengths_ta.concat() for row_lengths_ta in rt_ta.nested_row_lengths)
+  outer_row_length = rt_ta.outer_row_length.concat()
+  return _RaggedTensorComponents(
+      flat_values=flat_values,
+      nested_row_lengths=nested_row_lengths,
+      outer_row_length=outer_row_length)
+
+
+def _maybe_decompose_tensor(rt):
+  """Decompose tensors to their composite tensors."""
+  if not isinstance(rt, ragged_tensor.RaggedTensor):
+    return rt
+
+  # The three component pieces we need:
+  # - inner values
+  flat_values = rt.flat_values
+
+  # - row_splits of the RT
+  splits = rt.nested_row_splits
+  nested_row_lengths = tuple(split[1:] - split[:-1] for split in splits)
+
+  # - outer row length
+  outer_row_length = array_ops.expand_dims(rt.nrows(), axis=0)
+
+  return _RaggedTensorComponents(
+      flat_values=flat_values,
+      nested_row_lengths=nested_row_lengths,
+      outer_row_length=outer_row_length,
+  )
+
+
+def _maybe_recompose_tensor(t):
+  """Reconstructs a _RaggedTensorComponents into a RaggedTensor."""
+  if not isinstance(t, _RaggedTensorComponents):
+    return t
+
+  values = t.flat_values
+  nested_row_lengths = tuple(t.nested_row_lengths)
+  for nested_row_length in reversed(nested_row_lengths):
+    values = ragged_tensor.RaggedTensor.from_row_lengths(
+        values, nested_row_length)
+  return ragged_tensor.RaggedTensor.from_row_lengths(values, t.outer_row_length)
+
+
+def _maybe_decompose_dtype(d):
+  """Decompose dtypes into composite tensors (if necessary)."""
+  if not isinstance(d, ragged_tensor.RaggedTensorType):
+    return d
+
+  result = _RaggedTensorComponents(
+      flat_values=d.dtype,
+      nested_row_lengths=tuple(dtypes.int64 for i in range(d.ragged_rank - 1)),
+      outer_row_length=dtypes.int64,
+  )
+  return result
+
+
+def _convert_declared(fn_output_flat, output_declared):
+  """Convert outputs which are `Tensor`s into `_RaggedTensorComponents`."""
+  for current, declared in zip(fn_output_flat, output_declared):
+    if isinstance(declared, ragged_tensor.RaggedTensorType):
+      if isinstance(current, ragged_tensor.RaggedTensor):
+        # Check that the ragged ranks match up.
+        # + 1 to account for the rank of the outermost dimension.
+        if declared.ragged_rank != current.ragged_rank + 1:
+          raise ValueError(
+              "The declared ragged rank (%d) mismatches the result (%d)" %
+              (declared.ragged_rank, current.ragged_rank))
+        yield current
+      else:
+        # We the output is a Tensor, but the caller has declared that we are
+        # expecting an RaggedTensor output.
+        if declared.ragged_rank != 1:
+          raise ValueError(
+              "The declared ragged rank (%d) mismatches the result (1)" %
+              declared.ragged_rank)
+
+        if isinstance(current, ragged_tensor.RaggedTensor):
+          nrows = current.nrows()
+        else:
+          nrows = array_ops.shape(current, out_type=dtypes.int64)[0]
+        row_length = array_ops.expand_dims(nrows, axis=0)
+        rt = _RaggedTensorComponents(
+            flat_values=current,
+            nested_row_lengths=(),
+            outer_row_length=row_length)
+        yield rt
+    else:
+      yield current
diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..92f82be84aca06ae723f00103dccbdeb5c64371f
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_math_ops.py
@@ -0,0 +1,586 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for ragged tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import gen_ragged_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import segment_id_ops
+
+
+#===============================================================================
+# ragged.range
+#===============================================================================
+# pylint: disable=redefined-builtin
+def range(starts, limits=None, deltas=1, dtype=None, name=None):
+  """Returns a `RaggedTensor` containing the specified sequences of numbers.
+
+  Each row of the returned `RaggedTensor` contains a single sequence:
+
+  ```python
+  ragged.range(starts, limits, deltas)[i] ==
+      tf.range(starts[i], limits[i], deltas[i])
+  ```
+
+  If `start[i] < limits[i] and deltas[i] > 0`, then `output[i]` will be an
+  empty list.  Similarly, if `start[i] > limits[i] and deltas[i] < 0`, then
+  `output[i]` will be an empty list.  This behavior is consistent with the
+  Python `range` function, but differs from the `tf.range` op, which returns
+  an error for these cases.
+
+  Examples:
+
+  ```python
+  >>> ragged.range([3, 5, 2]).eval().tolist()
+  [[0, 1, 2], [0, 1, 2, 3, 4], [0, 1]]
+  >>> ragged.range([0, 5, 8], [3, 3, 12]).eval().tolist()
+  [[0, 1, 2], [], [8, 9, 10, 11]]
+  >>> ragged.range([0, 5, 8], [3, 3, 12], 2).eval().tolist()
+  [[0, 2], [], [8, 10]]
+  ```
+
+  The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
+  The vector inputs must all have the same size.  Scalar inputs are broadcast
+  to match the size of the vector inputs.
+
+  Args:
+    starts: Vector or scalar `Tensor`.  Specifies the first entry for each range
+      if `limits` is not `None`; otherwise, specifies the range limits, and the
+      first entries default to `0`.
+    limits: Vector or scalar `Tensor`.  Specifies the exclusive upper limits for
+      each range.
+    deltas: Vector or scalar `Tensor`.  Specifies the increment for each range.
+      Defaults to `1`.
+    dtype: The type of the elements of the resulting tensor.  If not specified,
+      then a value is chosen based on the other args.
+    name: A name for the operation.
+
+  Returns:
+    A `RaggedTensor` of type `dtype` with `ragged_rank=1`.
+  """
+  if limits is None:
+    starts, limits = 0, starts
+
+  with ops.name_scope(name, 'RaggedRange', [starts, limits, deltas]) as name:
+    starts = ops.convert_to_tensor(starts, dtype=dtype, name='starts')
+    limits = ops.convert_to_tensor(limits, dtype=dtype, name='limits')
+    deltas = ops.convert_to_tensor(deltas, dtype=dtype, name='deltas')
+
+    # infer dtype if not explicitly provided
+    if dtype is None:
+      starts, limits, deltas = _infer_matching_dtype(
+          [starts, limits, deltas],
+          [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64])
+
+    result = gen_ragged_math_ops.ragged_range(starts, limits, deltas, name=name)
+    return ragged_tensor.RaggedTensor.from_row_splits(result.rt_dense_values,
+                                                      result.rt_nested_splits)
+
+
+def _infer_matching_dtype(tensors, dtype_hierarchy):
+  """Infers a matching dtype for tensors, and casts them to that dtype."""
+  assert all(t.dtype in dtype_hierarchy for t in tensors)
+  inferred_dtype = max([t.dtype for t in tensors], key=dtype_hierarchy.index)
+  return [math_ops.cast(t, inferred_dtype) for t in tensors]
+
+
+#===============================================================================
+# ragged_segment_<AGGREGATE>
+#===============================================================================
+
+# Docstring template used for the raggged_segment_<AGGREGATE> ops.
+_RAGGED_SEGMENT_DOCSTRING = """\
+Computes the %(combination)s along segments of a RaggedTensor.
+
+  Returns a RaggedTensor `output` with `num_segments` rows, where the row
+  `output[i]` is formed by taking the %(combination)s of all rows of `data`
+  whose corresponding `segment_id` is `i`.
+
+  The length of the row `output[i]` will be the maximum of the lengths of
+  all rows of `data` whose corresponding `segment_id` is `i`.  If no `data`
+  rows correspond to a given segment ID, then the output row for that segment
+  ID will be empty.
+
+  Args:
+    data: A `RaggedTensor` containing the values to combine.
+    segment_ids: A `Tensor` or `RaggedTensor`.  Must have type `int64` or
+      `int32`.  `segment_ids.shape` must be a prefix of `data.shape`.
+      Must be greater than or equal to zero, and less than `num_segments`.
+      `segment_ids` is not required to be sorted.
+    num_segments: An `int32` or `int64` scalar specifying the number of
+      distinct segment ids.
+    name: A name prefix for the returned tensor (optional).
+  Returns:
+    A `RaggedTensor` containing the %(combined)s values.  The returned tensor
+    has the same dtype as `data`, and its shape is
+    `[num_segments] + data.shape[segment_ids.rank:]`.
+  Raises:
+    ValueError: If `segment_ids.shape` is not a prefix of `data.shape`.
+"""
+
+
+def _ragged_segment_aggregate(unsorted_segment_op,
+                              data,
+                              segment_ids,
+                              num_segments,
+                              name=None):
+  """Aggregates along segments of a RaggedTensor using `unsorted_segment_op`.
+
+  Returns a RaggedTensor `output` with `num_segments` rows, where the row
+  `output[i]` is formed by combining all rows of `data` whose corresponding
+  `segment_id` is `i`.  The values in each row are combined using
+  `unsorted_segment_op`.
+
+  The length of the row `output[i]` will be the maximum of the lengths of
+  all rows of `data` whose corresponding `segment_id` is `i`.  If no `data`
+  rows correspond to a given segment ID, then the output row for that segment
+  ID will be empty.
+
+  Args:
+    unsorted_segment_op: The tensorflow `op` that should be used to combine
+      values in each row.  Must have the same signature and basic behavior as
+      `unsorted_segment_sum`, `unsorted_segment_max`, etc.
+    data: A `RaggedTensor` containing the values to be combined.
+    segment_ids: A `Tensor` or `RaggedTensor`.  Must have type `int64` or
+      `int32`.  `segment_ids.shape` must be a prefix of `data.shape`.
+      `segment_ids` is not required to be sorted.
+    num_segments: An `int32` or `int64` scalar.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` containing the aggregated values.  The returned tensor
+    has the same dtype as `data`, and its shape is
+    `[num_segments] + data.shape[segment_ids.rank:]`.
+  Raises:
+    ValueError: If segment_ids.shape is not a prefix of data.shape.
+  """
+  if not (ragged_tensor.is_ragged(data) or
+          ragged_tensor.is_ragged(segment_ids)):
+    return unsorted_segment_op(data, segment_ids, num_segments, name)
+
+  with ops.name_scope(name, 'RaggedSegment',
+                      [data, segment_ids, num_segments]) as name:
+    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
+    segment_ids = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        segment_ids, name='segment_ids')
+
+    if ragged_tensor.is_ragged(segment_ids):
+      if not ragged_tensor.is_ragged(data):
+        raise ValueError('segment_ids.shape must be a prefix of data.shape, '
+                         'but segment_ids is ragged and data is not.')
+      check_splits = check_ops.assert_equal(
+          segment_ids.row_splits,
+          data.row_splits,
+          message='segment_ids.shape must be a prefix of data.shape')
+      with ops.control_dependencies([check_splits]):
+        return _ragged_segment_aggregate(unsorted_segment_op, data.values,
+                                         segment_ids.values, num_segments, name)
+
+    segment_ids = math_ops.cast(segment_ids, dtypes.int64)
+
+    # Find the length of each row in data.  (dtype=int64, shape=[data_nrows])
+    data_row_lengths = data.row_splits[1:] - data.row_splits[:-1]
+
+    # Find the length that each output row will have.  The length of the row
+    # corresponding to segment `id` is `max(data_row_lengths[i])` where
+    # `segment_ids[i]=id`.  (dtype=int64, shape=[output_nrows])
+    output_row_lengths = math_ops.maximum(
+        math_ops.unsorted_segment_max(data_row_lengths, segment_ids,
+                                      num_segments), 0)
+    assert output_row_lengths.dtype == dtypes.int64
+
+    # Build the splits tensor for the output RaggedTensor.
+    output_splits = array_ops.concat([
+        array_ops.zeros([1], dtypes.int64),
+        math_ops.cumsum(output_row_lengths)
+    ],
+                                     axis=0)
+
+    # For each row in `data`, find the start & limit position where that row's
+    # values will be aggregated in output.values.
+    data_row_to_out_row_start = array_ops.gather(output_splits, segment_ids)
+    data_row_to_out_row_limit = data_row_to_out_row_start + data_row_lengths
+
+    # For each value in `data.values`, find the position where it will
+    # aggregated in `output.values`.
+    # Get the target output values index for each data values index.
+    data_val_to_out_val_index = range(data_row_to_out_row_start,
+                                      data_row_to_out_row_limit).values
+
+    # Recursively aggregate the values.
+    output_values = _ragged_segment_aggregate(unsorted_segment_op, data.values,
+                                              data_val_to_out_val_index,
+                                              output_splits[-1])
+    return ragged_tensor.RaggedTensor.from_row_splits(output_values,
+                                                      output_splits)
+
+
+def segment_sum(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  return _ragged_segment_aggregate(math_ops.unsorted_segment_sum, data,
+                                   segment_ids, num_segments, name or
+                                   'RaggedSegmentSum')
+
+
+def segment_prod(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  return _ragged_segment_aggregate(math_ops.unsorted_segment_prod, data,
+                                   segment_ids, num_segments, name or
+                                   'RaggedSegmentProd')
+
+
+def segment_min(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  return _ragged_segment_aggregate(math_ops.unsorted_segment_min, data,
+                                   segment_ids, num_segments, name or
+                                   'RaggedSegmentMin')
+
+
+def segment_max(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  return _ragged_segment_aggregate(math_ops.unsorted_segment_max, data,
+                                   segment_ids, num_segments, name or
+                                   'RaggedSegmentMax')
+
+
+def segment_mean(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  with ops.name_scope(name, 'RaggedSegmentMean',
+                      [data, segment_ids, num_segments]):
+    total = segment_sum(data, segment_ids, num_segments)
+    ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
+        array_ops.ones_like(data.flat_values), data.nested_row_splits)
+    count = segment_sum(ones, segment_ids, num_segments)
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        total.flat_values / count.flat_values, total.nested_row_splits)
+
+
+def segment_sqrt_n(data, segment_ids, num_segments, name=None):
+  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  with ops.name_scope(name, 'RaggedSegmentSqrtN',
+                      [data, segment_ids, num_segments]):
+    total = segment_sum(data, segment_ids, num_segments)
+    ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
+        array_ops.ones_like(data.flat_values), data.nested_row_splits)
+    count = segment_sum(ones, segment_ids, num_segments)
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        total.flat_values / math_ops.sqrt(count.flat_values),
+        total.nested_row_splits)
+
+
+def _set_ragged_segment_docstring(func, combination, combined):
+  func.__doc__ = _RAGGED_SEGMENT_DOCSTRING % dict(
+      combination=combination, combined=combined)
+
+
+_set_ragged_segment_docstring(segment_sum, 'sum', 'summed')
+_set_ragged_segment_docstring(segment_prod, 'product', 'multiplied')
+_set_ragged_segment_docstring(segment_min, 'minimum', 'minimized')
+_set_ragged_segment_docstring(segment_max, 'maximum', 'maximized')
+_set_ragged_segment_docstring(segment_mean, 'mean', 'averaged')
+_set_ragged_segment_docstring(segment_sqrt_n, 'sum divided by sqrt(N)',
+                              'summed')
+
+#===============================================================================
+# ragged_reduce_<AGGREGATE>
+#===============================================================================
+
+# Docstring template used for ragged_reduce_<AGGREGATE> ops.
+_RAGGED_REDUCE_DOCSTRING = """\
+Computes the %(combination)s of elements across dimensions of a `RaggedTensor`.
+
+  Reduces `input_tensor` along the dimensions given in `axis` by taking the
+  %(combination)s of values.  If a reduced dimension has no elements for
+  some index, then the value for that index will be %(default)s.
+
+  The rank of the tensor is reduced by `1` for each entry in `axis`.  If
+  `axis` is not specified, then all dimensions are reduced, and a scalar
+  value is returned.
+  Args:
+    input_tensor: A `RaggedTensor` containing the values to be %(combined)s.
+    axis: The dimensions to reduce.  May be `None` (to reduce all axes), an
+      `int` (to reduce a single axis), a `list` or `tuple` of `int` (to reduce
+      a given set of axes), or a `Tensor` with a constant value.  Must be in
+      the range `[0, input_tensor.rank]`.
+    name: A name prefix for the returned tensor (optional).
+  Returns:
+    A `RaggedTensor` containing the %(combined)s values.  The returned tensor
+    has the same dtype as `data`, and its shape is given by removing the
+    dimensions specified in `axis` from `input_tensor.shape`.  The `ragged_rank`
+    of the returned tensor is given by substracting any ragged dimensions
+    specified in `axis` from `input_tensor.ragged_rank`.
+  Raises:
+    ValueError: If `axis` contains a `Tensor` whose value is not constant.
+  ####Example:
+    ```python%(example)s    ```
+"""
+_RAGGED_REDUCE_SUM_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_sum(rt, axis=0).eval().tolist()
+    [15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
+    >>> ragged.reduce_sum(rt, axis=1).eval().tolist()
+    [8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
+"""
+_RAGGED_REDUCE_PROD_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_prod(rt, axis=0).eval().tolist()
+    [54, 30, 4]  # = [3*1*9*2, 1*5*6, 4]
+    >>> ragged.reduce_prod(rt, axis=1).eval().tolist()
+    [12, 5, 9, 12]  # = [3*1*4, 1*5, 9, 2*6]
+"""
+_RAGGED_REDUCE_MIN_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_min(rt, axis=0).eval().tolist()
+    [1, 1, 4]  # = [min(3, 1, 9, 2), min(1, 5, 6), 4]
+    >>> ragged.reduce_min(rt, axis=1).eval().tolist()
+    [1, 1, 9, 2]  # = [min(3, 1, 4), min(1, 5), 9, min(2, 6)]
+"""
+_RAGGED_REDUCE_MAX_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_max(rt, axis=0).eval().tolist()
+    [9, 6, 4]  # = [max(3, 1, 9, 2), max(1, 5, 6), 4]
+    >>> ragged.reduce_max(rt, axis=1).eval().tolist()
+    [4, 5, 9, 6]  # = [max(3, 1, 4), max(1, 5), 9, max(2, 6)]
+"""
+_RAGGED_REDUCE_MEAN_EXAMPLE = """
+    >>> rt = ragged.constant([[3, 1, 4], [1, 5], [9], [2, 6]])
+    >>> ragged.reduce_mean(rt, axis=0).eval().tolist()
+    [3.75, 4, 4]  # = [mean(3, 1, 9, 2), mean(1, 5, 6), 4]
+    >>> ragged.reduce_mean(rt, axis=1).eval().tolist()
+    [2.66666, 3, 9, 4]  # = [mean(3, 1, 4), mean(1, 5), 9, mean(2, 6)]
+"""
+_RAGGED_REDUCE_ALL_EXAMPLE = """
+    >>> rt = ragged.constant([[True, True], [True, True, False, True], [False, True]])
+    >>> ragged.reduce_all(rt, axis=0).eval().tolist()
+    [False, True, False, True]
+    >>> ragged.reduce_all(rt, axis=1).eval().tolist()
+    [True, False, False]
+"""
+_RAGGED_REDUCE_ANY_EXAMPLE = """
+    >>> rt = ragged.constant([[True, True], [True, True, False, True], [False, True]])
+    >>> ragged.reduce_any(rt, axis=0).eval().tolist()
+    [True, True, False, True]
+    >>> ragged.reduce_any(rt, axis=1).eval().tolist()
+    [True, True, True]
+"""
+
+
+def _ragged_reduce_aggregate(reduce_op,
+                             unsorted_segment_op,
+                             rt_input,
+                             axis,
+                             keepdims,
+                             name=None):
+  """Aggregates across axes of a RaggedTensor using the given `Tensor` ops.
+
+  Reduces `rt_input` along the dimensions given in `axis`.  The rank of the
+  tensor is reduced by 1 for each entry in `axis`.  If `axis` is not specified,
+  then all dimensions are reduced, and a scalar value is returned.
+
+  This op assumes that `reduce_op` and `unsorted_segment_op` are associative;
+  if not, then reducing multiple axes will return incorrect results.  (In
+  particular, reducing multiple axes is currently implemented by reducing the
+  axes one at a time.)
+
+  Args:
+    reduce_op: The tensorflow `op` that should be used to reduce values in
+      uniform dimensions.  Must have the same signature and basic behavior as
+      `reduce_sum`, `reduce_max`, etc.
+    unsorted_segment_op: The tensorflow `op` that should be used to combine
+      values in ragged dimensions.  Must have the same signature and basic
+      behavior as `unsorted_segment_sum`, `unsorted_segment_max`, etc.
+    rt_input: A `Tensor` or `RaggedTensor` containing the values to be reduced.
+    axis: The axis or axes to reduce.  May be `None` (to reduce all axes), an
+      `int` (to reduce a single axis), a `list` or `tuple` of `int` (to reduce a
+      given set of axes), or a `Tensor` with a constant value.  Must be in the
+      range `[0, rt_input.rank)`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` containing the reduced values.  The returned tensor
+    has the same dtype as `data`, and its shape is given by removing the
+    dimensions specified in `axis` from `rt_input.shape`.  The `ragged_rank`
+    of the returned tensor is given by substracting any ragged dimensions
+    specified in `axis` from `rt_input.ragged_rank`.
+  Raises:
+    ValueError: If `axis` contains a `Tensor` whose value is not constant.
+  """
+  if not ragged_tensor.is_ragged(rt_input):
+    return reduce_op(rt_input, axis, name=name)
+
+  if keepdims:
+    raise ValueError('keepdims=True is not supported for RaggedTensors.')
+
+  if isinstance(axis, ops.Tensor):
+    axis = tensor_util.constant_value(axis)
+    if axis is None:
+      raise ValueError('axis must be known at graph construction time.')
+    if isinstance(axis, np.ndarray):
+      axis = axis.tolist()
+
+  # When reducing all axes, just ignore splits & reduce the inner values.
+  if axis is None:
+    return reduce_op(rt_input.flat_values, None, name=name)
+
+  with ops.name_scope(name, 'RaggedReduce', [rt_input, axis]):
+    if isinstance(axis, (tuple, list)):
+      if not axis:
+        return rt_input
+      elif len(axis) == 1:
+        axis = axis[0]
+      else:
+        # When reducing multiple axes, just reduce one at a time.  This is less
+        # efficient, and only works for associative ops.  (In particular, it
+        # does not work for reduce_mean.)  However, reducing multiple axes at
+        # once will probably require a nontrivial c++ op.
+        axis = sorted(axis)
+        inner_reduced = _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
+                                                 rt_input, axis[-1], keepdims)
+        return _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
+                                        inner_reduced, axis[:-1], keepdims)
+
+    axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
+
+    rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        rt_input, name='rt_input')
+
+    if axis == 0:
+      # out[i_1, i_2, ..., i_N] = sum_{j} rt_input[j, i_1, i_2, ..., i_N]
+      row_lengths = rt_input.row_splits[1:] - rt_input.row_splits[:-1]
+      num_segments = math_ops.maximum(math_ops.reduce_max(row_lengths), 0)
+      segment_ids = range(row_lengths).values
+      return _ragged_segment_aggregate(unsorted_segment_op, rt_input.values,
+                                       segment_ids, num_segments)
+    elif axis == 1:
+      # out[i_0, i_1, i_2, ..., i_N] = sum_{j} rt_input[i_0, j, i_2, ..., i_N]
+      num_segments = array_ops.shape(rt_input.row_splits)[0] - 1
+      segment_ids = segment_id_ops.row_splits_to_segment_ids(
+          rt_input.row_splits)
+      return _ragged_segment_aggregate(unsorted_segment_op, rt_input.values,
+                                       segment_ids, num_segments)
+    else:
+      # out[i_0, ..., i_[axis-1], i_axis+1], ..., i_N] =
+      #     sum_{j} rt_input [i_0, ..., i_[axis-1], j, i_axis+1], ..., i_N]
+      return rt_input.with_values(
+          _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
+                                   rt_input.values, axis - 1, keepdims))
+
+
+def reduce_sum(input_tensor, axis=None, keepdims=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  return _ragged_reduce_aggregate(math_ops.reduce_sum,
+                                  math_ops.unsorted_segment_sum, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceSum')
+
+
+def reduce_prod(input_tensor, axis=None, keepdims=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  return _ragged_reduce_aggregate(math_ops.reduce_prod,
+                                  math_ops.unsorted_segment_prod, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceProd')
+
+
+def reduce_min(input_tensor, axis=None, keepdims=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  return _ragged_reduce_aggregate(math_ops.reduce_min,
+                                  math_ops.unsorted_segment_min, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceMin')
+
+
+def reduce_max(input_tensor, axis=None, keepdims=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  return _ragged_reduce_aggregate(math_ops.reduce_max,
+                                  math_ops.unsorted_segment_max, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceMax')
+
+
+def reduce_mean(input_tensor, axis=None, keepdims=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  with ops.name_scope(name, 'RaggedReduceMean', [input_tensor, axis]):
+    total = reduce_sum(input_tensor, axis, keepdims)
+    if ragged_tensor.is_ragged(input_tensor):
+      ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
+          array_ops.ones_like(input_tensor.flat_values),
+          input_tensor.nested_row_splits)
+    else:
+      ones = array_ops.ones_like(input_tensor)
+    count = reduce_sum(ones, axis, keepdims)
+    if ragged_tensor.is_ragged(total):
+      return ragged_tensor.RaggedTensor.from_nested_row_splits(
+          total.flat_values / count.flat_values, total.nested_row_splits)
+    else:
+      return total / count
+
+
+def _cast(input_tensor, dtype):
+  return ragged_functional_ops.map_flat_values(math_ops.cast, input_tensor,
+                                               dtype)
+
+
+def reduce_all(input_tensor, axis=None, keepdims=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  with ops.name_scope(name, 'RaggedReduceAll', [input_tensor, axis]):
+    return _cast(
+        reduce_prod(_cast(input_tensor, dtypes.int32), axis, keepdims),
+        dtypes.bool)
+
+
+def reduce_any(input_tensor, axis=None, keepdims=None, name=None):
+  """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
+  with ops.name_scope(name, 'RaggedReduceAny', [input_tensor, axis]):
+    return _cast(
+        reduce_sum(_cast(input_tensor, dtypes.int32), axis, keepdims),
+        dtypes.bool)
+
+
+def _set_ragged_reduce_docstring(func, combination, combined, default, example):
+  func.__doc__ = _RAGGED_REDUCE_DOCSTRING % dict(
+      combination=combination,
+      combined=combined,
+      default=default,
+      example=example)
+
+
+_set_ragged_reduce_docstring(reduce_sum, 'sum', 'summed', '0',
+                             _RAGGED_REDUCE_SUM_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_prod, 'product', 'multiplied', '1',
+                             _RAGGED_REDUCE_PROD_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_min, 'minimum', 'minimized',
+                             '`input_tensor.dtype.min`',
+                             _RAGGED_REDUCE_MIN_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_max, 'maximum', 'maximized',
+                             '`input_tensor.dtype.max`',
+                             _RAGGED_REDUCE_MAX_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_mean, 'mean', 'averaged', 'NaN',
+                             _RAGGED_REDUCE_MEAN_EXAMPLE)
+
+_set_ragged_reduce_docstring(reduce_all, 'logical and', 'and-ed', 'True',
+                             _RAGGED_REDUCE_ALL_EXAMPLE)
+_set_ragged_reduce_docstring(reduce_any, 'logical or', 'or-ed', 'False',
+                             _RAGGED_REDUCE_ANY_EXAMPLE)
diff --git a/tensorflow/python/ops/ragged/ragged_operators.py b/tensorflow/python/ops/ragged/ragged_operators.py
new file mode 100644
index 0000000000000000000000000000000000000000..7654fa22b1e3a6d783a7a3295bca2d1a0b2ea757
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_operators.py
@@ -0,0 +1,78 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operator overloads for `RaggedTensor`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_getitem
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.util import tf_decorator
+
+
+def _right(operator):
+  """Right-handed version of an operator: swap args x and y."""
+  return tf_decorator.make_decorator(operator, lambda y, x: operator(x, y))
+
+
+# Indexing
+ragged_tensor.RaggedTensor.__getitem__ = ragged_getitem.ragged_tensor_getitem
+
+# Ordering operators
+ragged_tensor.RaggedTensor.__ge__ = math_ops.greater_equal
+ragged_tensor.RaggedTensor.__gt__ = math_ops.greater
+ragged_tensor.RaggedTensor.__le__ = math_ops.less_equal
+ragged_tensor.RaggedTensor.__lt__ = math_ops.less
+
+# Logical operators
+ragged_tensor.RaggedTensor.__and__ = math_ops.logical_and
+ragged_tensor.RaggedTensor.__rand__ = _right(math_ops.logical_and)
+ragged_tensor.RaggedTensor.__invert__ = math_ops.logical_not
+ragged_tensor.RaggedTensor.__ror__ = _right(math_ops.logical_or)
+ragged_tensor.RaggedTensor.__or__ = math_ops.logical_or
+ragged_tensor.RaggedTensor.__xor__ = math_ops.logical_xor
+ragged_tensor.RaggedTensor.__rxor__ = _right(math_ops.logical_xor)
+
+# Arithmetic operators
+ragged_tensor.RaggedTensor.__abs__ = math_ops.abs
+ragged_tensor.RaggedTensor.__add__ = math_ops.add
+ragged_tensor.RaggedTensor.__radd__ = _right(math_ops.add)
+ragged_tensor.RaggedTensor.__div__ = math_ops.div
+ragged_tensor.RaggedTensor.__rdiv__ = _right(math_ops.div)
+ragged_tensor.RaggedTensor.__floordiv__ = math_ops.floordiv
+ragged_tensor.RaggedTensor.__rfloordiv__ = _right(math_ops.floordiv)
+ragged_tensor.RaggedTensor.__mod__ = math_ops.floormod
+ragged_tensor.RaggedTensor.__rmod__ = _right(math_ops.floormod)
+ragged_tensor.RaggedTensor.__mul__ = math_ops.multiply
+ragged_tensor.RaggedTensor.__rmul__ = _right(math_ops.multiply)
+ragged_tensor.RaggedTensor.__neg__ = math_ops.negative
+ragged_tensor.RaggedTensor.__pow__ = math_ops.pow
+ragged_tensor.RaggedTensor.__rpow__ = _right(math_ops.pow)
+ragged_tensor.RaggedTensor.__sub__ = math_ops.subtract
+ragged_tensor.RaggedTensor.__rsub__ = _right(math_ops.subtract)
+ragged_tensor.RaggedTensor.__truediv__ = math_ops.truediv
+ragged_tensor.RaggedTensor.__rtruediv__ = _right(math_ops.truediv)
+
+
+# Dummy methods
+def _dummy_bool(_):
+  """Dummy method to prevent a RaggedTensor from being used as a Python bool."""
+  raise TypeError("RaggedTensor may not be used as a boolean.")
+
+
+ragged_tensor.RaggedTensor.__bool__ = _dummy_bool
+ragged_tensor.RaggedTensor.__nonzero__ = _dummy_bool
diff --git a/tensorflow/python/ops/ragged/ragged_operators_test.py b/tensorflow/python/ops/ragged/ragged_operators_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..78bb37c341e9261a972445cbd34f8e1b0fc674d9
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_operators_test.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for overloaded RaggedTensor operators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase):
+
+  def testOrderingOperators(self):
+    x = ragged.constant([[1, 5], [3]])
+    y = ragged.constant([[4, 5], [1]])
+    self.assertRaggedEqual((x > y), [[False, False], [True]])
+    self.assertRaggedEqual((x >= y), [[False, True], [True]])
+    self.assertRaggedEqual((x < y), [[True, False], [False]])
+    self.assertRaggedEqual((x <= y), [[True, True], [False]])
+
+  def testArithmeticOperators(self):
+    x = ragged.constant([[1.0, -2.0], [8.0]])
+    y = ragged.constant([[4.0, 4.0], [2.0]])
+    self.assertRaggedEqual(abs(x), [[1.0, 2.0], [8.0]])
+
+    self.assertRaggedEqual((-x), [[-1.0, 2.0], [-8.0]])
+
+    self.assertRaggedEqual((x + y), [[5.0, 2.0], [10.0]])
+    self.assertRaggedEqual((3.0 + y), [[7.0, 7.0], [5.0]])
+    self.assertRaggedEqual((x + 3.0), [[4.0, 1.0], [11.0]])
+
+    self.assertRaggedEqual((x - y), [[-3.0, -6.0], [6.0]])
+    self.assertRaggedEqual((3.0 - y), [[-1.0, -1.0], [1.0]])
+    self.assertRaggedEqual((x + 3.0), [[4.0, 1.0], [11.0]])
+
+    self.assertRaggedEqual((x * y), [[4.0, -8.0], [16.0]])
+    self.assertRaggedEqual((3.0 * y), [[12.0, 12.0], [6.0]])
+    self.assertRaggedEqual((x * 3.0), [[3.0, -6.0], [24.0]])
+
+    self.assertRaggedEqual((x / y), [[0.25, -0.5], [4.0]])
+    self.assertRaggedEqual((y / x), [[4.0, -2.0], [0.25]])
+    self.assertRaggedEqual((2.0 / y), [[0.5, 0.5], [1.0]])
+    self.assertRaggedEqual((x / 2.0), [[0.5, -1.0], [4.0]])
+
+    self.assertRaggedEqual((x // y), [[0.0, -1.0], [4.0]])
+    self.assertRaggedEqual((y // x), [[4.0, -2.0], [0.0]])
+    self.assertRaggedEqual((2.0 // y), [[0.0, 0.0], [1.0]])
+    self.assertRaggedEqual((x // 2.0), [[0.0, -1.0], [4.0]])
+
+    self.assertRaggedEqual((x % y), [[1.0, 2.0], [0.0]])
+    self.assertRaggedEqual((y % x), [[0.0, -0.0], [2.0]])
+    self.assertRaggedEqual((2.0 % y), [[2.0, 2.0], [0.0]])
+    self.assertRaggedEqual((x % 2.0), [[1.0, 0.0], [0.0]])
+
+  def testLogicalOperators(self):
+    a = ragged.constant([[True, True], [False]])
+    b = ragged.constant([[True, False], [False]])
+    self.assertRaggedEqual((~a), [[False, False], [True]])
+
+    self.assertRaggedEqual((a & b), [[True, False], [False]])
+    self.assertRaggedEqual((a & True), [[True, True], [False]])
+    self.assertRaggedEqual((True & b), [[True, False], [False]])
+
+    self.assertRaggedEqual((a | b), [[True, True], [False]])
+    self.assertRaggedEqual((a | False), [[True, True], [False]])
+    self.assertRaggedEqual((False | b), [[True, False], [False]])
+
+    self.assertRaggedEqual((a ^ b), [[False, True], [False]])
+    self.assertRaggedEqual((a ^ True), [[False, False], [True]])
+    self.assertRaggedEqual((True ^ b), [[False, True], [True]])
+
+  def testDummyOperators(self):
+    a = ragged.constant([[True, True], [False]])
+    with self.assertRaisesRegexp(TypeError,
+                                 'RaggedTensor may not be used as a boolean.'):
+      bool(a)
+    with self.assertRaisesRegexp(TypeError,
+                                 'RaggedTensor may not be used as a boolean.'):
+      if a:
+        pass
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_range_op_test.py b/tensorflow/python/ops/ragged/ragged_range_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ab3d4abc3988b05add4bf98e31e472d2d5b2e88
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_range_op_test.py
@@ -0,0 +1,124 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged_range op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedRangeOpTest(ragged_test_util.RaggedTensorTestCase):
+
+  def testDocStringExamples(self):
+    """Examples from ragged_range.__doc__."""
+    rt1 = ragged.range([3, 5, 2])
+    self.assertRaggedEqual(rt1, [[0, 1, 2], [0, 1, 2, 3, 4], [0, 1]])
+
+    rt2 = ragged.range([0, 5, 8], [3, 3, 12])
+    self.assertRaggedEqual(rt2, [[0, 1, 2], [], [8, 9, 10, 11]])
+
+    rt3 = ragged.range([0, 5, 8], [3, 3, 12], 2)
+    self.assertRaggedEqual(rt3, [[0, 2], [], [8, 10]])
+
+  def testBasicRanges(self):
+    # Specify limits only.
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5]),
+        [list(range(0)), list(range(3)),
+         list(range(5))])
+
+    # Specify starts and limits.
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5], [2, 3, 10]),
+        [list(range(0, 2)),
+         list(range(3, 3)),
+         list(range(5, 10))])
+
+    # Specify starts, limits, and deltas.
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5], [4, 4, 15], [2, 3, 4]),
+        [list(range(0, 4, 2)),
+         list(range(3, 4, 3)),
+         list(range(5, 15, 4))])
+
+  def testFloatRanges(self):
+    expected = [[0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8, 3.2, 3.6], [3.0],
+                [5.0, 7.2, 9.4, 11.6, 13.8]]
+    actual = ragged.range([0.0, 3.0, 5.0], [3.9, 4.0, 15.0], [0.4, 1.5, 2.2])
+    self.assertEqual(
+        expected,
+        [[round(v, 5) for v in row] for row in self.eval_to_list(actual)])
+
+  def testNegativeDeltas(self):
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5], limits=0, deltas=-1),
+        [list(range(0, 0, -1)),
+         list(range(3, 0, -1)),
+         list(range(5, 0, -1))])
+
+    self.assertRaggedEqual(
+        ragged.range([0, -3, 5], limits=0, deltas=[-1, 1, -2]),
+        [list(range(0, 0, -1)),
+         list(range(-3, 0, 1)),
+         list(range(5, 0, -2))])
+
+  def testBroadcast(self):
+    # Specify starts and limits, broadcast deltas.
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5], [4, 4, 15], 3),
+        [list(range(0, 4, 3)),
+         list(range(3, 4, 3)),
+         list(range(5, 15, 3))])
+
+    # Broadcast all arguments.
+    self.assertRaggedEqual(ragged.range(0, 5, 1), [list(range(0, 5, 1))])
+
+  def testEmptyRanges(self):
+    rt1 = ragged.range([0, 5, 3], [0, 3, 5])
+    rt2 = ragged.range([0, 5, 5], [0, 3, 5], -1)
+    self.assertRaggedEqual(rt1, [[], [], [3, 4]])
+    self.assertRaggedEqual(rt2, [[], [5, 4], []])
+
+  def testShapeFnErrors(self):
+    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
+                      [[0]], 5)
+    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
+                      0, [[5]])
+    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
+                      0, 5, [[0]])
+    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
+                      [0], [1, 2])
+
+  def testKernelErrors(self):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'Requires delta != 0'):
+      self.evaluate(ragged.range(0, 0, 0))
+
+  def testShape(self):
+    self.assertRaggedEqual(ragged.range(0, 0, 1).shape.as_list(), [1, None])
+    self.assertRaggedEqual(ragged.range([1, 2, 3]).shape.as_list(), [3, None])
+    self.assertRaggedEqual(
+        ragged.range([1, 2, 3], [4, 5, 6]).shape.as_list(), [3, None])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..890460221bf9fdebe134d6ced77b1fca2dbdffd5
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
@@ -0,0 +1,346 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.reduce_<AGGREGATE> ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+_MAX_INT32 = dtypes.int32.max
+_MIN_INT32 = dtypes.int32.min
+_NAN = np.nan
+
+
+def mean(*values):
+  return 1.0 * sum(values) / len(values)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedReduceOpsTest(ragged_test_util.RaggedTensorTestCase,
+                          parameterized.TestCase):
+
+  @parameterized.parameters(
+      #=========================================================================
+      # Docstring examples.  RaggedTensor for testing is:
+      #   [[3, 1, 4],
+      #    [1, 5,  ],
+      #    [9,     ],
+      #    [2, 6   ]]
+      #=========================================================================
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=-2,
+          expected=[15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=1,
+          expected=[8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=-1,
+          expected=[8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[54, 30, 4]  # = [3*1*9*2, 1*5*6, 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=1,
+          expected=[12, 5, 9, 12]  # = [3*1*4, 1*5, 9, 2*6]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[1, 1, 4]  # = [min(3, 1, 9, 2), min(1, 5, 6), 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=1,
+          expected=[1, 1, 9, 2]  # = [min(3, 1, 4), min(1, 5), 9, min(2, 6)]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[9, 6, 4]  # = [max(3, 1, 9, 2), max(1, 5, 6), 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=1,
+          expected=[4, 5, 9, 6]  # = [max(3, 1, 4), max(1, 5), 9, max(2, 6)]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
+          axis=0,
+          expected=[3.75, 4, 4]  # = [mean(3, 1, 9, 2), mean(1, 5, 6), 4]
+      ),
+      dict(
+          ragged_reduce_op=ragged.reduce_any,
+          rt_input=[[True, True], [True, True, False, True], [False, True]],
+          axis=0,
+          expected=[True, True, False, True]),
+      dict(
+          ragged_reduce_op=ragged.reduce_any,
+          rt_input=[[True, True], [True, True, False, True], [False, True]],
+          axis=1,
+          expected=[True, True, True]),
+      dict(
+          ragged_reduce_op=ragged.reduce_all,
+          rt_input=[[True, True], [True, True, False, True], [False, True]],
+          axis=0,
+          expected=[False, True, False, True]),
+      dict(
+          ragged_reduce_op=ragged.reduce_all,
+          rt_input=[[True, True], [True, True, False, True], [False, True]],
+          axis=1,
+          expected=[True, False, False]),
+
+      #=========================================================================
+      # Examples with the following RaggedTensor (ragged_rank=1):
+      #   [[0, 1, 2, 3],
+      #    [4         ],
+      #    [          ],
+      #    [5, 6      ],
+      #    [7         ],
+      #    [8, 9      ]]
+      #=========================================================================
+
+      # axis=None
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=0 * 1 * 2 * 3 * 4 * 5 * 6 * 7 * 8 * 9),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=min(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=max(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=None,
+          expected=mean(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
+      # axis=0
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[0 + 4 + 5 + 7 + 8, 1 + 6 + 9, 2, 3]),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[0 * 4 * 5 * 7 * 8, 1 * 6 * 9, 2, 3]),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[min(0, 4, 5, 7, 8), min(1, 6, 9), 2, 3]),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[max(0, 4, 5, 7, 8), max(1, 6, 9), 2, 3]),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=0,
+          expected=[mean(0, 4, 5, 7, 8),
+                    mean(1, 6, 9), 2, 3]),
+      # axis=1
+      # Note: we don't test mean here because it gives a NaN, and this will
+      # cause assertEqual to fail (since NaN != NaN).  See testMeanNan().
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=1,
+          expected=[0 + 1 + 2 + 3, 4, 0, 5 + 6, 7, 8 + 9]),
+      dict(
+          ragged_reduce_op=ragged.reduce_prod,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=1,
+          expected=[0 * 1 * 2 * 3, 4, 1, 5 * 6, 7, 8 * 9]),
+      dict(
+          ragged_reduce_op=ragged.reduce_min,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=1,
+          expected=[min(0, 1, 2, 3), 4, _MAX_INT32,
+                    min(5, 6), 7,
+                    min(8, 9)]),
+      dict(
+          ragged_reduce_op=ragged.reduce_max,
+          rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
+          axis=1,
+          expected=[max(0, 1, 2, 3), 4, _MIN_INT32,
+                    max(5, 6), 7,
+                    max(8, 9)]),
+
+      #=========================================================================
+      # Examples with ragged_rank=2:
+      # [[[1, 2], [ ], [3, 4, 5]],
+      #  [[6, 7], [ ], [8      ]],
+      #  [                      ],
+      #  [[9   ]                ]]
+      #=========================================================================
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[],
+          expected=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=None,
+          expected=sum([1, 2, 3, 4, 5, 6, 7, 8, 9])),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=0,
+          expected=[[1 + 6 + 9, 2 + 7], [], [3 + 8, 4, 5]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=1,
+          expected=[[1 + 3, 2 + 4, 5], [6 + 8, 7], [], [9]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=2,
+          expected=[[1 + 2, 0, 3 + 4 + 5], [6 + 7, 0, 8], [], [9]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[0, 1],
+          expected=[1 + 3 + 6 + 8 + 9, 2 + 4 + 7, 5]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[0, 2],
+          expected=[1 + 6 + 9 + 2 + 7, 0, 3 + 8 + 4 + 5]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[1, 2],
+          expected=[1 + 2 + 3 + 4 + 5, 6 + 7 + 8, 0, 9]),
+      dict(
+          ragged_reduce_op=ragged.reduce_sum,
+          rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
+          axis=[0, 1, 2],
+          expected=sum([1, 2, 3, 4, 5, 6, 7, 8, 9])),
+
+      #=========================================================================
+      # Examples for ragged_reduce_mean ragged_rank=2:
+      # [[[1, 2], [3, 4, 5]],
+      #  [[6, 7], [8      ]],
+      #  [[9   ]          ]]
+      #=========================================================================
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
+          axis=0,
+          expected=[[mean(1, 6, 9), mean(2, 7)], [mean(3, 8), 4, 5]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
+          axis=1,
+          expected=[[mean(1, 3), mean(2, 4), 5], [mean(6, 8), 7], [9]]),
+      dict(
+          ragged_reduce_op=ragged.reduce_mean,
+          rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
+          axis=2,
+          expected=[[mean(1, 2), mean(3, 4, 5)], [mean(6, 7), 8], [9]]),
+  )
+  def testReduce(self, ragged_reduce_op, rt_input, axis, expected):
+    rt_input = ragged.constant(rt_input)
+    reduced = ragged_reduce_op(rt_input, axis)
+    self.assertRaggedEqual(reduced, expected)
+
+  def assertEqualWithNan(self, actual, expected):
+    """Like assertEqual, but NaN==NaN."""
+    self.assertTrue(
+        ((actual == expected) | (np.isnan(actual) & np.isnan(expected))).all())
+
+  def testMeanNan(self):
+    rt_as_list = [[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]]
+    expected = (
+        np.array([0 + 1 + 2 + 3, 4, 0, 5 + 6, 7, 8 + 9]) / np.array(
+            [4, 1, 0, 2, 1, 2]))
+    rt_input = ragged.constant(rt_as_list)
+    reduced = ragged.reduce_mean(rt_input, axis=1)
+    self.assertEqualWithNan(self.evaluate(reduced), expected)
+
+  def testMeanWithTensorInputs(self):
+    tensor = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
+    expected = [2.0, 20.0]
+    reduced = ragged.reduce_mean(tensor, axis=1)
+    self.assertRaggedEqual(reduced, expected)
+
+  def testErrors(self):
+    rt_input = ragged.constant([[1, 2, 3], [4, 5]])
+    axis = array_ops.placeholder_with_default(constant_op.constant([0]), None)
+
+    if not context.executing_eagerly():
+      self.assertRaisesRegexp(
+          ValueError, r'axis must be known at graph construction time.',
+          ragged.reduce_sum, rt_input, axis)
+    self.assertRaisesRegexp(TypeError,
+                            r'axis must be an int; got str.*',
+                            ragged.reduce_sum, rt_input, ['x'])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..15112d6c9c56b0e15247fc7c2f0b8410a5b9d376
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
@@ -0,0 +1,146 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.row_lengths."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedRowLengthsOp(ragged_test_util.RaggedTensorTestCase,
+                         parameterized.TestCase):
+
+  @parameterized.parameters([
+      # Docstring Example
+      dict(
+          rt_input=[[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []],
+          expected=[2, 0, 2, 1, 0]),
+      dict(
+          rt_input=[[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []],
+          axis=2,
+          expected=[[3, 1], [], [2, 1], [1], []]),
+
+      # 2D Tensor (1 ragged dimension)
+      dict(
+          rt_input=[['a'], ['b', 'c', 'd'], ['e'], [], ['f']],
+          expected=[1, 3, 1, 0, 1]),
+      dict(
+          rt_input=[['a'], ['b', 'c', 'd'], ['e'], [], ['f']],
+          axis=0,
+          expected=5),
+      dict(
+          rt_input=[['a', 'b', 'c', 'd', 'e', 'f', 'g']],
+          expected=[7]),
+      dict(
+          rt_input=[[], ['a', 'b', 'c', 'd', 'e', 'f', 'g'], []],
+          expected=[0, 7, 0]),
+      dict(
+          rt_input=[],
+          ragged_rank=1,
+          expected=[]),
+      dict(
+          rt_input=[],
+          ragged_rank=1,
+          axis=0,
+          expected=0),
+
+      # 3D Tensor (1 ragged dimension)
+      dict(
+          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10]]],
+          ragged_rank=1,
+          axis=0,
+          expected=2),
+      dict(
+          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10]]],
+          ragged_rank=1,
+          axis=1,
+          expected=[3, 2]),
+      dict(
+          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10]]],
+          ragged_rank=1,
+          axis=2,
+          expected=[[2, 2, 2], [2, 2]],
+          expected_ragged_rank=1),
+
+      # 3D Tensor (2 ragged dimensions)
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=0,
+          expected=2),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=-3,
+          expected=2),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=1,
+          expected=[3, 2]),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=-2,
+          expected=[3, 2]),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=2,
+          expected=[[2, 3, 0], [4, 1]],
+          expected_ragged_rank=1),
+      dict(
+          rt_input=[[[1, 2], [3, 4, 5], []], [[6, 7, 8, 9], [10]]],
+          axis=-1,
+          expected=[[2, 3, 0], [4, 1]],
+          expected_ragged_rank=1),
+  ])  # pyformat: disable
+  def testRowLengths(self,
+                     rt_input,
+                     expected,
+                     axis=1,
+                     ragged_rank=None,
+                     expected_ragged_rank=None):
+    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    lengths = rt.row_lengths(axis)
+    self.assertRaggedEqual(lengths, expected)
+    if expected_ragged_rank is not None:
+      if isinstance(lengths, ragged.RaggedTensor):
+        self.assertEqual(lengths.ragged_rank, expected_ragged_rank)
+      else:
+        self.assertEqual(0, expected_ragged_rank)
+
+  @parameterized.parameters([
+      dict(  # axis=2 out of bounds: expected -2<=axis<2.
+          rt_input=[[10, 20], [30]],
+          axis=2,
+          exception=(ValueError, errors.InvalidArgumentError)),
+      dict(  # axis=-3 out of bounds: expected -2<=axis<2.
+          rt_input=[[2, 3, 0], [4, 1, 2]],
+          axis=-3,
+          exception=(ValueError, errors.InvalidArgumentError)),
+  ])
+  def testErrors(self, rt_input, exception, message=None, axis=1):
+    rt = ragged.constant(rt_input)
+    with self.assertRaisesRegexp(exception, message):
+      rt.row_lengths(axis)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2970540f3e585a7e9399dbe561f148a5abc9ee2c
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
@@ -0,0 +1,56 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ragged.row_splits_to_segment_ids() op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSplitsToSegmentIdsOpTest(ragged_test_util.RaggedTensorTestCase):
+
+  def testDocStringExample(self):
+    splits = [0, 3, 3, 5, 6, 9]
+    expected = [0, 0, 0, 2, 2, 3, 4, 4, 4]
+    segment_ids = ragged.row_splits_to_segment_ids(splits)
+    self.assertAllEqual(segment_ids, expected)
+
+  def testEmptySplits(self):
+    # Note: the splits for an empty ragged tensor contains a single zero.
+    segment_ids = ragged.row_splits_to_segment_ids([0])
+    self.assertAllEqual(segment_ids, [])
+
+  def testErrors(self):
+    self.assertRaisesRegexp(ValueError, r'Invalid row_splits: \[\]',
+                            ragged.row_splits_to_segment_ids, [])
+    self.assertRaisesRegexp(
+        ValueError, r'Tensor conversion requested dtype int64 for '
+        'Tensor with dtype float32', ragged.row_splits_to_segment_ids,
+        constant_op.constant([0.5]))
+    self.assertRaisesRegexp(ValueError, r'Shape \(\) must have rank 1',
+                            ragged.row_splits_to_segment_ids, 0)
+    self.assertRaisesRegexp(ValueError, r'Shape \(1, 1\) must have rank 1',
+                            ragged.row_splits_to_segment_ids, [[0]])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ed962676700ade62adb76b035a9b4e1dc5c5d73
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ragged.segment_ids_to_row_splits() op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSplitsToSegmentIdsOpTest(ragged_test_util.RaggedTensorTestCase):
+
+  def testDocStringExample(self):
+    segment_ids = [0, 0, 0, 2, 2, 3, 4, 4, 4]
+    expected = [0, 3, 3, 5, 6, 9]
+    splits = ragged.segment_ids_to_row_splits(segment_ids)
+    self.assertAllEqual(splits, expected)
+
+  def testEmptySegmentIds(self):
+    # Note: the splits for an empty ragged tensor contains a single zero.
+    segment_ids = ragged.segment_ids_to_row_splits([])
+    self.assertAllEqual(segment_ids, [0])
+
+  def testErrors(self):
+    self.assertRaisesRegexp(TypeError,
+                            r'segment_ids must be an integer tensor.*',
+                            ragged.segment_ids_to_row_splits,
+                            constant_op.constant([0.5]))
+    self.assertRaisesRegexp(ValueError, r'Shape \(\) must have rank 1',
+                            ragged.segment_ids_to_row_splits, 0)
+    self.assertRaisesRegexp(ValueError, r'Shape \(1, 1\) must have rank 1',
+                            ragged.segment_ids_to_row_splits, [[0]])
+
+  def testNumSegments(self):
+    segment_ids = [0, 0, 0, 2, 2, 3, 4, 4, 4]
+    num_segments = 7
+    expected = [0, 3, 3, 5, 6, 9, 9, 9]
+    splits = ragged.segment_ids_to_row_splits(segment_ids, num_segments)
+    self.assertAllEqual(splits, expected)
+
+  def testUnsortedSegmentIds(self):
+    # Segment ids are not required to be sorted.
+    segment_ids = [0, 4, 3, 2, 4, 4, 2, 0, 0]
+    splits1 = ragged.segment_ids_to_row_splits(segment_ids)
+    expected1 = [0, 3, 3, 5, 6, 9]
+
+    splits2 = ragged.segment_ids_to_row_splits(segment_ids, 7)
+    expected2 = [0, 3, 3, 5, 6, 9, 9, 9]
+    self.assertAllEqual(splits1, expected1)
+    self.assertAllEqual(splits2, expected2)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_segment_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..be1f39afef0e720c0c23d9d8571fc70907696d6d
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
@@ -0,0 +1,219 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged_range op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+def prod(values):
+  val = 1
+  for v in values:
+    val *= v
+  return val
+  # return reduce(lambda x, y: x * y, values, 1)
+
+
+def mean(values):
+  return 1.0 * sum(values) / len(values)
+
+
+def sqrt_n(values):
+  return 1.0 * sum(values) / math.sqrt(len(values))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSegmentOpsTest(ragged_test_util.RaggedTensorTestCase,
+                           parameterized.TestCase):
+
+  def expected_value(self, data, segment_ids, num_segments, combiner):
+    """Find the expected value for a call to ragged_segment_<aggregate>.
+
+    Args:
+      data: The input RaggedTensor, expressed as a nested python list.
+      segment_ids: The segment ids, as a python list of ints.
+      num_segments: The number of segments, as a python int.
+      combiner: The Python function used to combine values.
+    Returns:
+      The expected value, as a nested Python list.
+    """
+    self.assertEqual(len(data), len(segment_ids))
+
+    # Build an empty (num_segments x ncols) "grouped" matrix
+    ncols = max(len(row) for row in data)
+    grouped = [[[] for _ in range(ncols)] for row in range(num_segments)]
+
+    # Append values from data[row] to grouped[segment_ids[row]]
+    for row in range(len(data)):
+      for col in range(len(data[row])):
+        grouped[segment_ids[row]][col].append(data[row][col])
+
+    # Combine the values.
+    return [[combiner(values)
+             for values in grouped_row
+             if values]
+            for grouped_row in grouped]
+
+  @parameterized.parameters(
+      (ragged.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_min, min, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_min, min, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_min, min, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_min, min, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_max, max, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_max, max, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_max, max, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_max, max, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
+  )
+  def testRaggedSegment_Int(self, segment_op, combiner, segment_ids):
+    rt_as_list = [[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]]
+    rt = ragged.constant(rt_as_list)
+    num_segments = max(segment_ids) + 1
+    expected = self.expected_value(rt_as_list, segment_ids, num_segments,
+                                   combiner)
+
+    segmented = segment_op(rt, segment_ids, num_segments)
+    self.assertRaggedEqual(segmented, expected)
+
+  @parameterized.parameters(
+      (ragged.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_min, min, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_min, min, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_min, min, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_min, min, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_max, max, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_max, max, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_max, max, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_max, max, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
+      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 1, 1, 2, 2]),
+      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 0, 1, 1, 1]),
+      (ragged.segment_sqrt_n, sqrt_n, [5, 4, 3, 2, 1, 0]),
+      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 0, 10, 10, 10]),
+  )
+  def testRaggedSegment_Float(self, segment_op, combiner, segment_ids):
+    rt_as_list = [[0., 1., 2., 3.], [4.], [], [5., 6.], [7.], [8., 9.]]
+    rt = ragged.constant(rt_as_list)
+    num_segments = max(segment_ids) + 1
+    expected = self.expected_value(rt_as_list, segment_ids, num_segments,
+                                   combiner)
+
+    segmented = segment_op(rt, segment_ids, num_segments)
+    self.assertRaggedAlmostEqual(segmented, expected, places=5)
+
+  def testRaggedRankTwo(self):
+    rt = ragged.constant([
+        [[111, 112, 113, 114], [121],],  # row 0
+        [],                              # row 1
+        [[], [321, 322], [331]],         # row 2
+        [[411, 412]]                     # row 3
+    ])  # pyformat: disable
+    segment_ids1 = [0, 2, 2, 2]
+    segmented1 = ragged.segment_sum(rt, segment_ids1, 3)
+    expected1 = [[[111, 112, 113, 114], [121]],     # row 0
+                 [],                                # row 1
+                 [[411, 412], [321, 322], [331]]    # row 2
+                ]  # pyformat: disable
+    self.assertRaggedEqual(segmented1, expected1)
+
+    segment_ids2 = [1, 2, 1, 1]
+    segmented2 = ragged.segment_sum(rt, segment_ids2, 3)
+    expected2 = [[],
+                 [[111+411, 112+412, 113, 114], [121+321, 322], [331]],
+                 []]  # pyformat: disable
+    self.assertRaggedEqual(segmented2, expected2)
+
+  def testRaggedSegmentIds(self):
+    rt = ragged.constant([
+        [[111, 112, 113, 114], [121],],  # row 0
+        [],                              # row 1
+        [[], [321, 322], [331]],         # row 2
+        [[411, 412]]                     # row 3
+    ])  # pyformat: disable
+    segment_ids = ragged.constant([[1, 2], [], [1, 1, 2], [2]])
+    segmented = ragged.segment_sum(rt, segment_ids, 3)
+    expected = [[],
+                [111+321, 112+322, 113, 114],
+                [121+331+411, 412]]  # pyformat: disable
+    self.assertRaggedEqual(segmented, expected)
+
+  def testShapeMismatchError1(self):
+    dt = constant_op.constant([1, 2, 3, 4, 5, 6])
+    segment_ids = ragged.constant([[1, 2], []])
+    self.assertRaisesRegexp(
+        ValueError, 'segment_ids.shape must be a prefix of data.shape, '
+        'but segment_ids is ragged and data is not.', ragged.segment_sum, dt,
+        segment_ids, 3)
+
+  def testShapeMismatchError2(self):
+    rt = ragged.constant([
+        [[111, 112, 113, 114], [121]],  # row 0
+        [],                             # row 1
+        [[], [321, 322], [331]],        # row 2
+        [[411, 412]]                    # row 3
+    ])  # pyformat: disable
+    segment_ids = ragged.constant([[1, 2], [1], [1, 1, 2], [2]])
+
+    # Error is raised at graph-building time if we can detect it then.
+    self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        'segment_ids.shape must be a prefix of data.shape.*',
+        ragged.segment_sum, rt, segment_ids, 3)
+
+    # Otherwise, error is raised when we run the graph.
+    segment_ids2 = ragged.RaggedTensor.from_row_splits(
+        array_ops.placeholder_with_default(segment_ids.values, None),
+        array_ops.placeholder_with_default(segment_ids.row_splits, None))
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        'segment_ids.shape must be a prefix of data.shape.*'):
+      self.evaluate(ragged.segment_sum(rt, segment_ids2, 3))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_stack_op_test.py b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..17d80b5aadc936cfe11c3f65628cc57bf2c60361
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
@@ -0,0 +1,331 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.stack."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
+
+  @parameterized.parameters(
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],   # shape=(3, None)
+              [['b00'], ['b10']]),                    # shape=(2, None)
+          axis=0,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']], [[b'b00'],
+                                                               [b'b10']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']]),    # shape=(3, None)
+          axis=1,
+          expected=[
+              [[b'a00', b'a01'], [b'b00']],
+              [[], [b'b10', b'b11', b'b12']],
+              [[b'a20', b'a21', b'a22'], [b'b20']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00', 'b01'], [], ['b20', 'b21', 'b22']]),  # shape=(3, None)
+          axis=2,
+          expected=[
+              [[b'a00', b'b00'], [b'a01', b'b01']], [],
+              [[b'a20', b'b20'], [b'a21', b'b21'], [b'a22', b'b22']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-3',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],   # shape=(3, None)
+              [['b00'], ['b10']]),                    # shape=(2, None)
+          axis=-3,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']], [[b'b00'],
+                                                               [b'b10']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']]),    # shape=(3, None)
+          axis=-2,
+          expected=[
+              [[b'a00', b'a01'], [b'b00']],
+              [[], [b'b10', b'b11', b'b12']],
+              [[b'a20', b'a21', b'a22'], [b'b20']]]),
+      dict(
+          descr='Two rank-2 inputs (ragged_rank=1), axis=-1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00', 'b01'], [], ['b20', 'b21', 'b22']]),  # shape=(3, None)
+          axis=-1,
+          expected=[
+              [[b'a00', b'b00'], [b'a01', b'b01']], [],
+              [[b'a20', b'b20'], [b'a21', b'b21'], [b'a22', b'b22']]]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10']],                            # shape=(2, None)
+              [['c00'], ['c10', 'c11'], ['c21']]),           # shape=(3, None)
+          axis=0,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21', b'a22']],
+                    [[b'b00'], [b'b10']],
+                    [[b'c00'], [b'c10', b'c11'], [b'c21']]]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']],     # shape=(3, None)
+              [[], ['c10', 'c11'], ['c20', 'c21']]),         # shape=(3, None)
+          axis=1,
+          expected=[
+              [[b'a00', b'a01'], [b'b00'], []],
+              [[], [b'b10', b'b11', b'b12'], [b'c10', b'c11']],
+              [[b'a20', b'a21', b'a22'], [b'b20'], [b'c20', b'c21']]],
+          expected_shape=[3, None, None]),
+      dict(
+          descr='Three rank-2 inputs (ragged_rank=1), axis=2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],   # shape=(3, None)
+              [['b00', 'b01'], [], ['b20', 'b21', 'b22']],   # shape=(3, None)
+              [['c00', 'c01'], [], ['c20', 'c21', 'c22']]),  # shape=(3, None)
+          axis=2,
+          expected=[
+              [[b'a00', b'b00', b'c00'], [b'a01', b'b01', b'c01']], [],
+              [[b'a20', b'b20', b'c20'], [b'a21', b'b21', b'c21'],
+               [b'a22', b'b22', b'c22']]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=0',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[['b000']], [['b100', 'b101'], ['b110']]],
+              [[], [['c100', 'c101', 'c102', 'c103']], [[], ['c210', 'c211']]]),
+          axis=0,
+          expected=[
+              [[[b'a000', b'a001'], [b'a010']],
+               [[b'a100', b'a101', b'a102'], [b'a110', b'a111']]],
+              [[[b'b000']],
+               [[b'b100', b'b101'], [b'b110']]],
+              [[],
+               [[b'c100', b'c101', b'c102', b'c103']],
+               [[], [b'c210', b'c211']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=1',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[['b000']], [['b100', 'b101'], ['b110']]],
+              [[], [[], ['c110', 'c111']]]),
+          axis=1,
+          expected=[
+              [[[b'a000', b'a001'], [b'a010']], [[b'b000']], []],
+              [[[b'a100', b'a101', b'a102'], [b'a110', b'a111']],
+               [[b'b100', b'b101'], [b'b110']],
+               [[], [b'c110', b'c111']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=2',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[[], ['b010', 'b011']], [['b100', 'b101'], ['b110']]],
+              [[['c000'], ['c010']], [[], ['c110', 'c111']]]),
+          axis=2,
+          expected=[
+              [[[b'a000', b'a001'], [], [b'c000']],
+               [[b'a010'], [b'b010', b'b011'], [b'c010']]],
+              [[[b'a100', b'a101', b'a102'], [b'b100', b'b101'], []],
+               [[b'a110', b'a111'], [b'b110'], [b'c110', b'c111']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=3',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']]],
+              [[['b000', 'b001'], ['b010']]],
+              [[['c000', 'c001'], ['c010']]]),
+          axis=3,
+          expected=[[
+              [[b'a000', b'b000', b'c000'], [b'a001', b'b001', b'c001']],
+              [[b'a010', b'b010', b'c010']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=-2',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']],
+               [['a100', 'a101', 'a102'], ['a110', 'a111']]],
+              [[[], ['b010', 'b011']], [['b100', 'b101'], ['b110']]],
+              [[['c000'], ['c010']], [[], ['c110', 'c111']]]),
+          axis=-2,
+          expected=[
+              [[[b'a000', b'a001'], [], [b'c000']],
+               [[b'a010'], [b'b010', b'b011'], [b'c010']]],
+              [[[b'a100', b'a101', b'a102'], [b'b100', b'b101'], []],
+               [[b'a110', b'a111'], [b'b110'], [b'c110', b'c111']]]]),
+      dict(
+          descr='Three rank-3 inputs (ragged_rank=2), axis=-1',
+          rt_inputs=(
+              [[['a000', 'a001'], ['a010']]],
+              [[['b000', 'b001'], ['b010']]],
+              [[['c000', 'c001'], ['c010']]]),
+          axis=-1,
+          expected=[[
+              [[b'a000', b'b000', b'c000'], [b'a001', b'b001', b'c001']],
+              [[b'a010', b'b010', b'c010']]]]),
+      dict(
+          descr='ragged_stack([uniform, ragged, uniform], axis=1)',
+          ragged_ranks=[0, 1, 0],
+          rt_inputs=(
+              [['0('], ['1('], ['2(']],                   # shape=(3, 1)
+              [['b00'], ['b10', 'b11', 'b12'], ['b20']],  # shape=(3, None)
+              [[')0'], [')1'], [')2']]),                  # shape=(3, 1)
+          axis=1,
+          expected=[
+              [[b'0('], [b'b00'], [b')0']],
+              [[b'1('], [b'b10', b'b11', b'b12'], [b')1']],
+              [[b'2('], [b'b20'], [b')2']]]),
+      dict(
+          descr='ragged_stack([uniform, uniform], axis=0)',
+          ragged_ranks=[0, 0],
+          rt_inputs=(
+              [['a00', 'a01'], ['a10', 'a11'], ['a20', 'a21']],  # shape=(3, 2)
+              [['b00', 'b01', 'b02'], ['b10', 'b11', 'b12']]),   # shape=(2, 3)
+          axis=0,
+          expected=[
+              [[b'a00', b'a01'], [b'a10', b'a11'], [b'a20', b'a21']],
+              [[b'b00', b'b01', b'b02'], [b'b10', b'b11', b'b12']]]),
+      dict(
+          descr='ragged_stack([uniform, ragged], axis=0)',
+          ragged_ranks=[0, 1],
+          rt_inputs=(
+              [['a00', 'a01'], ['a10', 'a11'], ['a20', 'a21']],  # shape=(3, 2)
+              [['b00', 'b01', 'b02'], ['b10', 'b11', 'b12']]),   # shape=(2, 3)
+          axis=0,
+          expected=[
+              [[b'a00', b'a01'], [b'a10', b'a11'], [b'a20', b'a21']],
+              [[b'b00', b'b01', b'b02'], [b'b10', b'b11', b'b12']]]),
+      dict(
+          descr='ragged_stack([uniform, ragged], axis=0) with rank-3 inputs',
+          ragged_ranks=[0, 2],
+          rt_inputs=(
+              [[[0, 1], [2, 3]], [[4, 5], [6, 7]]],  # shape = (2, 2, 2)
+              [[[8], [8, 8]]]),                      # shape = (2, None, None)
+          axis=0,
+          expected=[[[[0, 1], [2, 3]], [[4, 5], [6, 7]]], [[[8], [8, 8]]]]),
+      dict(
+          descr='Two rank-3 inputs with ragged_rank=1, axis=-1',
+          ragged_ranks=[1, 1],
+          rt_inputs=(
+              [[[0, 1], [2, 3], [4, 5]], [], [[6, 7], [8, 9]]],
+              [[[9, 8], [7, 6], [5, 4]], [], [[3, 2], [1, 0]]]),
+          axis=-1,
+          expected=[
+              [[[0, 9], [1, 8]], [[2, 7], [3, 6]], [[4, 5], [5, 4]]],
+              [],
+              [[[6, 3], [7, 2]], [[8, 1], [9, 0]]]],
+          expected_shape=[3, None, 2, 2]),
+      dict(
+          descr='Two rank-3 inputs with ragged_rank=1, axis=-2',
+          ragged_ranks=[1, 1],
+          rt_inputs=(
+              [[[0, 1], [2, 3], [4, 5]], [], [[6, 7], [8, 9]]],
+              [[[9, 8], [7, 6], [5, 4]], [], [[3, 2], [1, 0]]]),
+          axis=-2,
+          expected=[
+              [[[0, 1], [9, 8]], [[2, 3], [7, 6]], [[4, 5], [5, 4]]], [],
+              [[[6, 7], [3, 2]], [[8, 9], [1, 0]]]]),
+      dict(
+          descr='ragged_stack([vector, vector], axis=0)',
+          ragged_ranks=[0, 0],
+          rt_inputs=([1, 2, 3], [4, 5, 6]),
+          axis=0,
+          expected=[[1, 2, 3], [4, 5, 6]]),
+      dict(
+          descr='One input (so just adds an outer dimension)',
+          rt_inputs=([['a00', 'a01'], [], ['a20', 'a21']],),
+          axis=0,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']]]),
+  )   # pyformat: disable
+  def testRaggedStack(self,
+                      descr,
+                      rt_inputs,
+                      axis,
+                      expected,
+                      ragged_ranks=None,
+                      expected_ragged_rank=None,
+                      expected_shape=None):
+    if ragged_ranks is None:
+      ragged_ranks = [None] * len(rt_inputs)
+    rt_inputs = [
+        ragged.constant(rt_input, ragged_rank=rrank)
+        if rrank != 0 else constant_op.constant(rt_input)
+        for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
+    ]
+    stacked = ragged.stack(rt_inputs, axis)
+    if expected_ragged_rank is not None:
+      self.assertEqual(stacked.ragged_rank, expected_ragged_rank)
+    if expected_shape is not None:
+      self.assertEqual(stacked.shape.as_list(), expected_shape)
+    self.assertRaggedEqual(stacked, expected)
+
+  @parameterized.parameters(
+      dict(
+          rt_inputs=(),
+          axis=0,
+          error=ValueError,
+          message=r'rt_inputs may not be empty\.'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=r'foo',
+          error=TypeError,
+          message='axis must be an int'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=-4,
+          error=ValueError,
+          message='axis=-4 out of bounds: expected -3<=axis<3'),
+      dict(
+          rt_inputs=([[1, 2]], [[3, 4]]),
+          axis=3,
+          error=ValueError,
+          message='axis=3 out of bounds: expected -3<=axis<3'),
+  )
+  def testError(self, rt_inputs, axis, error, message):
+    self.assertRaisesRegexp(error, message, ragged.stack, rt_inputs, axis)
+
+  def testSingleTensorInput(self):
+    """Tests ragged_stack with a single tensor input.
+
+    Usually, we pass a list of values in for rt_inputs.  However, you can
+    also pass in a single value (as with tf.stack), in which case it is
+    equivalent to expand_dims(axis=0).  This test exercises that path.
+    """
+    rt_inputs = ragged.constant([[1, 2], [3, 4]])
+    stacked = ragged.stack(rt_inputs, 0)
+    self.assertRaggedEqual(stacked, [[[1, 2], [3, 4]]])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f9f0abe4f04bf0a9a2822df28af842cd18fc553
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_string_ops.py
@@ -0,0 +1,118 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ragged operations for working with string Tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_string_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.util.tf_export import tf_export
+
+
+# pylint: disable=redefined-builtin
+@tf_export("strings.unicode_encode")
+def unicode_encode(input, output_encoding, errors="replace",
+                   replacement_char=65533, name=None):
+  r"""Encodes each sequence of Unicode code points in `input` into a string.
+
+  `result[i1...iN]` is the string formed by concatenating the Unicode
+  codepoints `input[1...iN, :]`, encoded using `output_encoding`.
+
+  Args:
+    input: An `N+1` dimensional potentially ragged integer tensor with
+        shape `[D1...DN, num_chars]`.
+    output_encoding: Unicode encoding that should be used to encode each
+      codepoint sequence.  Can be `"UTF-8"`, `"UTF-16-BE"`, or `"UTF-32-BE"`.
+    errors: Specifies the response when an invalid codepoint is encountered
+      (optional). One of:
+            * `'replace'`: Replace invalid codepoint with the
+              `replacement_char`. (default)
+            * `'ignore'`: Skip invalid codepoints.
+            * `'strict'`: Raise an exception for any invalid codepoint.
+    replacement_char: The replacement character codepoint to be used in place of
+      any invalid input when `errors='replace'`. Any valid unicode codepoint may
+      be used. The default value is the default unicode replacement character
+      which is 0xFFFD (U+65533).
+    name: A name for the operation (optional).
+
+  Returns:
+    A `N` dimensional `string` tensor with shape `[D1...DN]`.
+
+  #### Example:
+    ```python
+      >>> input = [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
+      >>> unicode_encode(input, 'UTF8')
+      ['G\xc3\xb6\xc3\xb6dnight', '\xf0\x9f\x98\x8a']
+    ```
+  """
+  with ops.name_scope(name, "UnicodeEncode", [input]):
+    input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
+    if input_tensor.shape.ndims is None:
+      raise ValueError("Rank of input_tensor must be statically known.")
+    if ragged_tensor.is_ragged(input_tensor):
+      if input_tensor.flat_values.shape.ndims > 1:
+        # If the flat_values of our ragged tensor is multi-dimensional, we can
+        # process it separately and our output will have the same nested splits
+        # as our input.
+        return input_tensor.with_flat_values(
+            unicode_encode(input_tensor.flat_values, output_encoding, errors,
+                           replacement_char))
+      elif input_tensor.ragged_rank > 1:
+        # Recursively process the values of the ragged tensor.
+        return input_tensor.with_values(
+            unicode_encode(input_tensor.values, output_encoding, errors,
+                           replacement_char))
+      else:
+        # Our ragged tensor is of the correct shape (rank 1 flat_values tensor
+        # with ragged_rank of 1) so we can process it as normal.
+        return gen_string_ops.unicode_encode(
+            input_values=input_tensor.values,
+            input_splits=input_tensor.row_splits,
+            output_encoding=output_encoding,
+            errors=errors,
+            replacement_char=replacement_char)
+    else:
+      if input_tensor.shape.ndims == 2:
+        # The input tensor is of the correct 2-D shape, it's just not ragged.
+        return unicode_encode(ragged_conversion_ops.from_tensor(input_tensor),
+                              output_encoding, errors, replacement_char)
+      elif input_tensor.shape.ndims > 2:
+        # We need to initially flatten the input tensor to 2-D, and then can
+        # reshape the output of our processed flattened tensor.
+        flat_input_tensor = array_ops.reshape(
+            input_tensor,
+            array_ops.stack([-1, array_ops.shape(input_tensor)[-1]]))
+        flat_output_tensor = unicode_encode(flat_input_tensor, output_encoding,
+                                            errors, replacement_char)
+        return array_ops.reshape(flat_output_tensor, input_tensor.shape[:-1])
+      elif input_tensor.shape.ndims == 0:
+        raise ValueError("input_tensor's rank must be at least 1.")
+      else:
+        # Our input tensor is rank 1, so we create a ragged tensor with an added
+        # dimension to create the correct input shape & type, and then remove
+        # the additional dimension from the output and return the string scalar.
+        ragged_input_tensor = ragged_tensor.RaggedTensor.from_row_splits(
+            input_tensor,
+            array_ops.stack(
+                [0, array_ops.shape(input_tensor, out_type=dtypes.int64)[0]]))
+        output_tensor = unicode_encode(ragged_input_tensor, output_encoding,
+                                       errors, replacement_char)
+        return array_ops.reshape(output_tensor, [])
diff --git a/tensorflow/python/ops/ragged/ragged_tensor.py b/tensorflow/python/ops/ragged/ragged_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..567c50203af592e57168063e20787b3ed621b8c8
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor.py
@@ -0,0 +1,1608 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes for storing ragged tensors and their values."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_ragged_conversion_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import segment_id_ops
+
+# pylint: disable=protected-access
+_eval_using_default_session = ops._eval_using_default_session
+
+# pylint: enable=protected-access
+
+#===============================================================================
+# RaggedTensor
+#===============================================================================
+
+
+class RaggedTensor(object):
+  """Represents a ragged tensor (go/ragged).
+
+  A `RaggedTensor` is a tensor with one or more *ragged dimensions*, which are
+  dimensions whose slices may have different lengths.  For example, the inner
+  (column) dimension of `rt=[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is ragged,
+  since the column slices (`rt[0, :]`, ..., `rt[4, :]`) have different lengths.
+  Dimensions whose slices all have the same length are called *uniform
+  dimensions*.  The outermost dimension of a `RaggedTensor` is always uniform,
+  since it consists of a single slice (and so there is no possibility for
+  differing slice lengths).
+
+  The total number of dimensions in a `RaggedTensor` is called its *rank*,
+  and the number of ragged dimensions in a `RaggedTensor` is called its
+  *ragged-rank*.  A `RaggedTensor`'s ragged-rank is fixed at graph creation
+  time: it can't depend on the runtime values of `Tensor`s, and can't vary
+  dynamically for different session runs.
+
+  ### Potentially Ragged Tensors
+
+  Many ops support both `Tensor`s and `RaggedTensor`s.  The term "potentially
+  ragged tensor" may be used to refer to a tensor that might be either a
+  `Tensor` or a `RaggedTensor`.  The ragged-rank of a `Tensor` is zero.
+
+  ### Documenting RaggedTensor Shapes
+
+  When documenting the shape of a RaggedTensor, ragged dimensions can be
+  indicated by enclosing them in parentheses.  For example, the shape of
+  a 3-D `RaggedTensor` that stores the fixed-size word embedding for each
+  word in a sentence, for each sentence in a batch, could be written as
+  `[num_sentences, (num_words), embedding_size]`.  The parentheses around
+  `(num_words)` indicate that dimension is ragged, and that the length
+  of each element list in that dimension may vary for each item.
+
+  ### Component Tensors
+
+  Internally, a `RaggedTensor` consists of a concatenated list of values that
+  are partitioned into variable-length rows.  In particular, each `RaggedTensor`
+  consists of:
+
+    * A `values` tensor, which concatenates the variable-length rows into a
+      flattened list.  For example, the `values` tensor for
+      `[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is `[3, 1, 4, 1, 5, 9, 2, 6]`.
+
+    * A `row_splits` vector, which indicates how those flattened values are
+      divided into rows.  In particular, the values for row `rt[i]` are stored
+      in the slice `rt.values[rt.row_splits[i]:rt.row_splits[i+1]]`.
+
+  Example:
+
+  ```python
+  >>> print(tf.RaggedTensor.from_row_splits(
+  ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+  ...     row_splits=[0, 4, 4, 7, 8, 8]))
+  <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+  ```
+
+  ### Alternative Row-Partitioning Schemes
+
+  In addition to `row_splits`, ragged tensors provide support for four other
+  row-partitioning schemes:
+
+    * `row_lengths`: a vector with shape `[nrows]`, which specifies the length
+      of each row.
+
+    * `value_rowids` and `nrows`: `value_rowids` is a vector with shape
+      `[nvals]`, corresponding one-to-one with `values`, which specifies
+      each value's row index.  In particular, the row `rt[row]` consists of the
+      values `rt.values[j]` where `value_rowids[j]==row`.  `nrows` is an
+      int64 scalar that specifies the number of rows in the `RaggedTensor`.
+      (`nrows` is used to indicate trailing empty rows.)
+
+    * `row_starts`: a vector with shape `[nrows]`, which specifies the start
+      offset of each row.  Equivalent to `row_splits[:-1]`.
+
+    * `row_limits`: a vector with shape `[nrows]`, which specifies the stop
+      offset of each row.  Equivalent to `row_splits[1:]`.
+
+  Example: The following ragged tensors are equivalent, and all represent the
+  nested list `[[3, 1, 4, 1], [], [5, 9, 2], [6], []]`.
+
+  ```python
+  >>> values = [3, 1, 4, 1, 5, 9, 2, 6]
+  >>> rt1 = RaggedTensor.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
+  >>> rt2 = RaggedTensor.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
+  >>> rt3 = RaggedTensor.from_value_rowids(
+  ...     values, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5)
+  >>> rt4 = RaggedTensor.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
+  >>> rt5 = RaggedTensor.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
+  ```
+
+  ### Multiple Ragged Dimensions
+
+  `RaggedTensor`s with multiple ragged dimensions can be defined by using
+  a nested `RaggedTensor` for the `values` tensor.  Each nested `RaggedTensor`
+  adds a single ragged dimension.
+
+  ```python
+  >>> inner_rt = RaggedTensor.from_row_splits(  # =rt1 from above
+  ...     values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
+  >>> outer_rt = RaggedTensor.from_row_splits(
+  ...     values=inner_rt, row_splits=[0, 3, 3, 5])
+  >>> print outer_rt.to_list()
+  [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]
+  >>> print outer_rt.ragged_rank
+  2
+  ```
+
+  The factory function `RaggedTensor.from_nested_row_splits` may be used to
+  construct a `RaggedTensor` with multiple ragged dimensions directly, by
+  providing a list of `row_splits` tensors:
+
+  ```python
+  >>> RaggedTensor.from_nested_row_splits(
+  ...     flat_values=[3, 1, 4, 1, 5, 9, 2, 6],
+  ...     nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8])).to_list()
+  [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]
+  ```
+
+  ### Uniform Inner Dimensions
+
+  `RaggedTensor`s with uniform inner dimensions can be defined
+  by using a multidimensional `Tensor` for `values`.
+
+  ```python
+  >>> rt = RaggedTensor.from_row_splits(values=tf.ones([5, 3]),
+  ..                                    row_splits=[0, 2, 5])
+  >>> print rt.to_list()
+  [[[1, 1, 1], [1, 1, 1]],
+   [[1, 1, 1], [1, 1, 1], [1, 1, 1]]]
+   >>> print rt.shape
+   (2, ?, 3)
+  ```
+
+  ### RaggedTensor Shape Restrictions
+
+  The shape of a RaggedTensor is currently restricted to have the following
+  form:
+
+    * A single uniform dimension
+    * Followed by one or more ragged dimensions
+    * Followed by zero or more uniform dimensions.
+
+  This restriction follows from the fact that each nested `RaggedTensor`
+  replaces the uniform outermost dimension of its `values` with a uniform
+  dimension followed by a ragged dimension.
+  """
+
+  #=============================================================================
+  # Constructor (private)
+  #=============================================================================
+  def __init__(self,
+               values,
+               row_splits,
+               cached_row_lengths=None,
+               cached_value_rowids=None,
+               cached_nrows=None,
+               internal=False):
+    """Creates a `RaggedTensor` with a specified partitioning for `values`.
+
+    This constructor is private -- please use one of the following ops to
+    build `RaggedTensor`s:
+
+      * `tf.RaggedTensor.from_row_lengths`
+      * `tf.RaggedTensor.from_value_rowids`
+      * `tf.RaggedTensor.from_row_splits`
+      * `tf.RaggedTensor.from_row_starts`
+      * `tf.RaggedTensor.from_row_limits`
+      * `tf.RaggedTensor.from_nested_row_splits`
+      * `tf.RaggedTensor.from_nested_row_lengths`
+      * `tf.RaggedTensor.from_nested_value_rowids`
+
+    Args:
+      values: A potentially ragged tensor of any dtype and shape `[nvals, ...]`.
+      row_splits: A 1-D int64 tensor with shape `[nrows+1]`.
+      cached_row_lengths: A 1-D int64 tensor with shape `[nrows]`
+      cached_value_rowids: A 1-D int64 tensor with shape `[nvals]`.
+      cached_nrows: A 1-D int64 scalar tensor.
+      internal: True if the constructor is being called by one of the factory
+        methods.  If false, an exception will be raised.
+
+    Raises:
+      TypeError: If a row partitioning tensor has an inappropriate dtype.
+      TypeError: If exactly one row partitioning argument was not specified.
+      ValueError: If a row partitioning tensor has an inappropriate shape.
+      ValueError: If multiple partitioning arguments are specified.
+      ValueError: If nrows is specified but value_rowids is not None.
+    """
+    if not internal:
+      raise ValueError("RaggedTensor constructor is private; please use one "
+                       "of the factory methods instead (e.g., "
+                       "RaggedTensor.from_row_lengths())")
+
+    # Validate the arguments.
+    if not isinstance(values, (RaggedTensor, ops.Tensor)):
+      raise TypeError("values must be a Tensor or RaggedTensor.")
+    if not isinstance(row_splits, ops.Tensor):
+      raise TypeError("Row-partitioning argument must be a Tensor.")
+    values.shape.with_rank_at_least(1)
+    row_splits.shape.assert_has_rank(1)
+    row_splits.set_shape([None])
+
+    self._values = values
+    self._row_splits = row_splits
+
+    # Store any cached tensors.  These are used to avoid unnecessary
+    # round-trip conversions when a RaggedTensor is constructed from
+    # lengths or rowids, and we later want those lengths/rowids back.
+    for tensor in [cached_row_lengths, cached_value_rowids, cached_nrows]:
+      if tensor is not None and not isinstance(tensor, ops.Tensor):
+        raise TypeError("Cached value must be a Tensor or None.")
+    self._cached_row_lengths = cached_row_lengths
+    self._cached_value_rowids = cached_value_rowids
+    self._cached_nrows = cached_nrows
+
+  #=============================================================================
+  # Factory Methods
+  #=============================================================================
+
+  @classmethod
+  def from_value_rowids(cls, values, value_rowids, nrows=None, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `value_rowids`.
+
+    The returned `RaggedTensor` corresponds with the python list defined by:
+
+    ```python
+    result = [[values[i] for i in range(len(values)) if value_rowids[i] == row]
+              for row in range(nrows)]
+    ```
+
+    Warning: currently, this needs to cast value_rowids to int64 before
+    converting, since `tf.bincount` only supports `int32`.
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      value_rowids: A 1-D int64 tensor with shape `[nvals]`, which corresponds
+        one-to-one with `values`, and specifies each value's row index.  Must be
+        nonnegative, and must be sorted in ascending order.
+      nrows: An int64 scalar specifying the number of rows.  This should be
+        specified if the `RaggedTensor` may containing empty training rows. Must
+        be greater than `value_rowids[-1]` (or zero if `value_rowids` is empty).
+        Defaults to `value_rowids[-1]` (or zero if `value_rowids` is empty).
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    Raises:
+      ValueError: If `nrows` is incompatible with `value_rowids`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_value_rowids(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
+      ...     nrows=5))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromValueRowIds",
+                        [values, value_rowids, nrows]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      value_rowids = ops.convert_to_tensor(
+          value_rowids, dtypes.int64, name="value_rowids")
+      if nrows is None:
+        const_rowids = tensor_util.constant_value(value_rowids)
+        if const_rowids is None:
+          nrows = array_ops.concat([value_rowids[-1:], [-1]], axis=0)[0] + 1
+          const_nrows = None
+        else:
+          const_nrows = const_rowids[-1] + 1 if const_rowids.size > 0 else 0
+          nrows = ops.convert_to_tensor(const_nrows, dtypes.int64, name="nrows")
+      else:
+        nrows = ops.convert_to_tensor(nrows, dtypes.int64, "nrows")
+        const_nrows = tensor_util.constant_value(nrows)
+        if const_nrows is not None:
+          if const_nrows < 0:
+            raise ValueError("Expected nrows >= 0; got %d" % const_nrows)
+          const_rowids = tensor_util.constant_value(value_rowids)
+          if const_rowids is not None and const_rowids.size > 0:
+            if not const_nrows >= const_rowids[-1] + 1:
+              raise ValueError(
+                  "Expected nrows >= value_rowids[-1] + 1; got nrows=%d, "
+                  "value_rowids[-1]=%d" % (const_nrows, const_rowids[-1]))
+
+      value_rowids.shape.assert_has_rank(1)
+      nrows.shape.assert_has_rank(0)
+      values.shape[:1].assert_is_compatible_with(value_rowids.shape)
+
+      # Convert value_rowids & nrows to row_splits.
+      # Note: we don't use segment_ids_to_row_splits() here because we want
+      # to save the intermediate value `row_lengths`, so we can cache it.
+      # TODO(b/116708836) Upgrade bincount to accept int64 so we can skip the
+      # cast (Remove the warning in the docstring when we do.)
+      value_rowids_int32 = math_ops.cast(value_rowids, dtypes.int32)
+      nrows_int32 = math_ops.cast(nrows, dtypes.int32)
+      row_lengths = math_ops.bincount(
+          value_rowids_int32,
+          minlength=nrows_int32,
+          maxlength=nrows_int32,
+          dtype=dtypes.int64)
+      row_splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
+      if const_nrows is not None:
+        row_lengths.set_shape([const_nrows])
+        row_splits.set_shape([const_nrows + 1])
+
+      return cls(
+          values,
+          row_splits,
+          cached_row_lengths=row_lengths,
+          cached_value_rowids=value_rowids,
+          cached_nrows=nrows,
+          internal=True)
+
+  @classmethod
+  def from_row_splits(cls, values, row_splits, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_splits`.
+
+    The returned `RaggedTensor` corresponds with the python list defined by:
+
+    ```python
+    result = [values[row_splits[i]:row_splits[i + 1]]
+              for i in range(len(row_splits) - 1)]
+    ```
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_splits: A 1-D int64 tensor with shape `[nrows+1]`.  Must not be empty,
+        and must be sorted in ascending order.  `row_splits[0]` must be zero and
+        `row_splits[-1]` must be `nvals`.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    Raises:
+      ValueError: If `row_splits` is an empty list.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_splits(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_splits=[0, 4, 4, 7, 8, 8]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    if isinstance(row_splits, (list, tuple)) and not row_splits:
+      raise ValueError("row_splits tensor may not be empty.")
+    with ops.name_scope(name, "RaggedFromRowSplits", [values, row_splits]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_splits = ops.convert_to_tensor(row_splits, dtypes.int64, "row_splits")
+      row_splits.shape.assert_has_rank(1)
+      return cls(values=values, row_splits=row_splits, internal=True)
+
+  @classmethod
+  def from_row_lengths(cls, values, row_lengths, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_lengths`.
+
+    The returned `RaggedTensor` corresponds with the python list defined by:
+
+    ```python
+    result = [[values.pop(0) for i in range(length)]
+              for length in row_lengths]
+    ```
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_lengths: A 1-D int64 tensor with shape `[nrows]`.  Must be
+        nonnegative.  `sum(row_lengths)` must be `nvals`.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_lengths(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_lengths=[4, 0, 3, 1, 0]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []])>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromRowLengths", [values, row_lengths]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_lengths = ops.convert_to_tensor(row_lengths, dtypes.int64,
+                                          "row_lengths")
+      row_lengths.shape.assert_has_rank(1)
+      row_limits = math_ops.cumsum(row_lengths)
+      row_splits = array_ops.concat([[0], row_limits], axis=0)
+      return cls(
+          values=values,
+          row_splits=row_splits,
+          cached_row_lengths=row_lengths,
+          internal=True)
+
+  @classmethod
+  def from_row_starts(cls, values, row_starts, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_starts`.
+
+    Equivalent to: `from_row_splits(values, concat([row_starts, nvals]))`.
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_starts: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative
+        and sorted in ascending order.  If `nrows>0`, then `row_starts[0]` must
+        be zero.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_starts(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_starts=[0, 4, 4, 7, 8]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromRowStarts", [values, row_starts]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_starts = ops.convert_to_tensor(row_starts, dtypes.int64, "row_starts")
+      row_starts.shape.assert_has_rank(1)
+      nvals = array_ops.shape(values, out_type=dtypes.int64)[:1]
+      row_splits = array_ops.concat([row_starts, nvals], axis=0)
+      return cls(values=values, row_splits=row_splits, internal=True)
+
+  @classmethod
+  def from_row_limits(cls, values, row_limits, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_limits`.
+
+    Equivalent to: `from_row_splits(values, concat([0, row_limits]))`.
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_limits: A 1-D int64 tensor with shape `[nrows]`.  Must be sorted in
+        ascending order.  If `nrows>0`, then `row_limits[-1]` must be `nvals`.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_limits(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_limits=[4, 4, 7, 8, 8]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromRowLimits", [values, row_limits]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_limits = ops.convert_to_tensor(row_limits, dtypes.int64, "row_limits")
+      row_limits.shape.assert_has_rank(1)
+      zero = array_ops.zeros([1], dtypes.int64)
+      row_splits = array_ops.concat([zero, row_limits], axis=0)
+      return cls(values=values, row_splits=row_splits, internal=True)
+
+  @classmethod
+  def from_nested_value_rowids(cls,
+                               flat_values,
+                               nested_value_rowids,
+                               nested_nrows=None,
+                               name=None):
+    """Creates a `RaggedTensor` from a nested list of `value_rowids` tensors.
+
+    Equivalent to:
+
+    ```python
+    result = flat_values
+    for (rowids, nrows) in reversed(zip(nested_value_rowids, nested_nrows)):
+      result = from_value_rowids(result, rowids, nrows)
+    ```
+
+    Args:
+      flat_values: A potentially ragged tensor.
+      nested_value_rowids: A list of 1-D int64 tensors.  The `i`th tensor is
+        used as the `value_rowids` for the `i`th ragged dimension.
+      nested_nrows: A list of int64 scalars.  The `i`th scalar is used as the
+        `nrows` for the `i`th ragged dimension.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor` (or `flat_values` if `nested_value_rowids` is empty).
+
+    Raises:
+      ValueError: If `len(nested_values_rowids) != len(nested_nrows)`.
+    """
+    if isinstance(nested_value_rowids, ops.Tensor):
+      raise TypeError("nested_value_rowids must be a list of Tensors")
+    if nested_nrows is None:
+      nested_nrows = [None] * len(nested_value_rowids)
+    else:
+      if isinstance(nested_nrows, ops.Tensor):
+        raise TypeError("nested_nrows must be a list of Tensors")
+      if len(nested_nrows) != len(nested_value_rowids):
+        raise ValueError("nested_nrows must have the same length as "
+                         "nested_value_rowids")
+
+    with ops.name_scope(
+        name, "RaggedFromNestedValueRowIds",
+        [flat_values] + list(nested_value_rowids) + list(nested_nrows)):
+      result = flat_values
+      for value_rowids, nrows in reversed(
+          list(zip(nested_value_rowids, nested_nrows))):
+        result = cls.from_value_rowids(result, value_rowids, nrows)
+      return result
+
+  @classmethod
+  def from_nested_row_splits(cls, flat_values, nested_row_splits, name=None):
+    """Creates a `RaggedTensor` from a nested list of `row_splits` tensors.
+
+    Equivalent to:
+
+    ```python
+    result = flat_values
+    for row_splits in reversed(nested_row_splits):
+      result = from_row_splits(result, row_splits)
+    ```
+
+    Args:
+      flat_values: A potentially ragged tensor.
+      nested_row_splits: A list of 1-D int64 tensors.  The `i`th tensor is used
+        as the `row_splits` for the `i`th ragged dimension.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor` (or `flat_values` if `nested_row_splits` is empty).
+    """
+    if isinstance(nested_row_splits, ops.Tensor):
+      raise TypeError("nested_row_splits must be a list of Tensors")
+    with ops.name_scope(name, "RaggedFromNestedRowSplits",
+                        [flat_values] + list(nested_row_splits)):
+      result = flat_values
+      for splits in reversed(nested_row_splits):
+        result = cls.from_row_splits(result, splits)
+      return result
+
+  @classmethod
+  def from_nested_row_lengths(cls, flat_values, nested_row_lengths, name=None):
+    """Creates a `RaggedTensor` from a nested list of `row_lengths` tensors.
+
+    Equivalent to:
+
+    ```python
+    result = flat_values
+    for row_lengths in reversed(nested_row_lengths):
+      result = from_row_lengths(result, row_lengths)
+    ```
+
+    Args:
+      flat_values: A potentially ragged tensor.
+      nested_row_lengths: A list of 1-D int64 tensors.  The `i`th tensor is used
+        as the `row_lengths` for the `i`th ragged dimension.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor` (or `flat_values` if `nested_row_lengths` is empty).
+    """
+    if isinstance(nested_row_lengths, ops.Tensor):
+      raise TypeError("nested_row_lengths must be a list of Tensors")
+    with ops.name_scope(name, "RaggedFromNestedRowlengths",
+                        [flat_values] + list(nested_row_lengths)):
+      result = flat_values
+      for lengths in reversed(nested_row_lengths):
+        result = cls.from_row_lengths(result, lengths)
+      return result
+
+  #=============================================================================
+  # Accessors
+  #=============================================================================
+
+  @property
+  def dtype(self):
+    """The `DType` of values in this tensor."""
+    return self._values.dtype
+
+  @property
+  def shape(self):
+    """The statically known shape of this ragged tensor.
+
+    Returns:
+      A `TensorShape` containing the statically known shape of this ragged
+      tensor.  Ragged dimensions have a size of `None`.
+
+    Examples:
+
+      ```python
+      >>> ragged.constant([[0], [1, 2]]).shape
+      TensorShape([Dimension(2), Dimension(None)])
+
+      >>> ragged.constant([[[0, 1]], [[1, 2], [3, 4]]], ragged_rank=1).shape
+      TensorShape([Dimension(2), Dimension(None), Dimension(2)
+      ```
+    """
+    nrows = tensor_shape.dimension_at_index(self._row_splits.shape, 0) - 1
+
+    values_shape = self._values.shape
+    value_shape = values_shape[1:]
+    return tensor_shape.TensorShape([nrows, None]).concatenate(value_shape)
+
+  @property
+  def ragged_rank(self):
+    """The number of ragged dimensions in this ragged tensor.
+
+    Returns:
+      A Python `int` indicating the number of ragged dimensions in this ragged
+      tensor.  The outermost dimension is not considered ragged.
+    """
+    values_is_ragged = isinstance(self._values, RaggedTensor)
+    return self._values.ragged_rank + 1 if values_is_ragged else 1
+
+  @property
+  def values(self):
+    """The concatenated rows for this ragged tensor.
+
+    `rt.values` is a potentially ragged tensor formed by flattening the two
+    outermost dimensions of `rt` into a single dimension.
+
+    `rt.values.shape = [nvals] + rt.shape[2:]` (where `nvals` is the
+    number of items in the outer two dimensions of `rt`).
+
+    `rt.ragged_rank = self.ragged_rank - 1`
+
+    Returns:
+      A potentially ragged tensor.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> print rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      ```
+    """
+    return self._values
+
+  @property
+  def row_splits(self):
+    """The row-split indices for this ragged tensor's `values`.
+
+    `rt.row_splits` specifies where the values for each row begin and end in
+    `rt.values`.  In particular, the values for row `rt[i]` are stored in
+    the slice `rt.values[rt.row_splits[i]:rt.row_splits[i+1]]`.
+
+    Returns:
+      A 1-D `int64` `Tensor` with shape `[self.nrows+1]`.
+      The returned tensor is non-empty, and is sorted in ascending order.
+      `self.row_splits[0]` is zero, and `self.row_splits[-1]` is equal to
+      `self.values.shape[0]`.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> print rt.row_splits  # indices of row splits in rt.values
+      tf.Tensor([0, 4, 4, 7, 8, 8])
+      ```
+    """
+    return self._row_splits
+
+  @property
+  def flat_values(self):
+    """The innermost `values` tensor for this ragged tensor.
+
+    Concretely, if `rt.values` is a `Tensor`, then `rt.flat_values` is
+    `rt.values`; otherwise, `rt.flat_values` is `rt.values.flat_values`.
+
+    Conceptually, `flat_values` is the tensor formed by flattening the
+    outermost dimension and all of the ragged dimensions into a single
+    dimension.
+
+    `rt.flat_values.shape = [nvals] + rt.shape[rt.ragged_rank + 1:]`
+    (where `nvals` is the number of items in the flattened dimensions).
+
+    Returns:
+      A `Tensor`.
+
+    #### Example:
+
+      ```python
+      >>> rt = ragged.constant([[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
+      >>> print rt.flat_values()
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      ```
+    """
+    rt_values = self.values
+    while isinstance(rt_values, RaggedTensor):
+      rt_values = rt_values.values
+    return rt_values
+
+  @property
+  def nested_row_splits(self):
+    """A tuple containing the row_splits for all ragged dimensions.
+
+    `rt.nested_row_splits` is a tuple containing the `row_splits` tensors for
+    all ragged dimensions in `rt`, ordered from outermost to innermost.  In
+    particular, `rt.nested_row_splits = (rt.row_splits,) + value_splits` where:
+
+        * `value_splits = ()` if `rt.values` is a `Tensor`.
+        * `value_splits = rt.values.nested_row_splits` otherwise.
+
+    Returns:
+      A `tuple` of 1-D `int64` `Tensor`s.
+
+    #### Example:
+
+      ```python
+      >>> rt = ragged.constant([[[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]])
+      >>> for i, splits in enumerate(rt.nested_row_splits()):
+      ...   print('Splits for dimension %d: %s' % (i+1, splits))
+      Splits for dimension 1: [0, 1]
+      Splits for dimension 2: [0, 3, 3, 5]
+      Splits for dimension 3: [0, 4, 4, 7, 8, 8]
+      ```
+
+    """
+    rt_nested_splits = [self.row_splits]
+    rt_values = self.values
+    while isinstance(rt_values, RaggedTensor):
+      rt_nested_splits.append(rt_values.row_splits)
+      rt_values = rt_values.values
+    return tuple(rt_nested_splits)
+
+  def value_rowids(self, name=None):
+    """Returns the row indices for the `values` in this ragged tensor.
+
+    `rt.value_rowids()` corresponds one-to-one with the outermost dimension of
+    `rt.values`, and specifies the row containing each value.  In particular,
+    the row `rt[row]` consists of the values `rt.values[j]` where
+    `rt.value_rowids()[j] == row`.
+
+    Args:
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A 1-D `int64` `Tensor` with shape `self.values.shape[:1]`.
+      The returned tensor is nonnegative, and is sorted in ascending order.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      >>> rt.value_rowids()
+      tf.Tensor([0, 0, 0, 0, 2, 2, 2, 3])  # corresponds 1:1 with rt.values
+      ```
+    """
+    if self._cached_value_rowids is not None:
+      return self._cached_value_rowids
+
+    with ops.name_scope(name, "RaggedValueRowIds", [self]):
+      return segment_id_ops.row_splits_to_segment_ids(self.row_splits)
+
+  def nrows(self, out_type=dtypes.int64, name=None):
+    """Returns the number of rows in this ragged tensor.
+
+    I.e., the size of the outermost dimension of the tensor.
+
+    Args:
+      out_type: `dtype` for the returned tensor.
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A scalar `Tensor` with dtype `out_type`.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.nrows()  # rt has 5 rows.
+      5
+      ```
+    """
+    if self._cached_nrows is not None:
+      return self._cached_nrows
+
+    with ops.name_scope(name, "RaggedNRows", [self]):
+      return array_ops.shape(self.row_splits, out_type=out_type)[0] - 1
+
+  def row_starts(self, name=None):
+    """Returns the start indices for rows in this ragged tensor.
+
+    These indices specify where the values for each row begin in
+    `self.values`.  `rt.row_starts()` is equal to `rt.row_splits[:-1]`.
+
+    Args:
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A 1-D Tensor of int64 with shape `[nrows]`.
+      The returned tensor is nonnegative, and is sorted in ascending order.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      >>> rt.row_starts()  # indices of row starts in rt.values
+      tf.Tensor([0, 4, 4, 7, 8])
+      ```
+    """
+    with ops.name_scope(name, "RaggedRowStarts", [self]):
+      return self.row_splits[:-1]
+
+  def row_limits(self, name=None):
+    """Returns the limit indices for rows in this ragged tensor.
+
+    These indices specify where the values for each row end in
+    `self.values`.  `rt.row_limits(self)` is equal to `rt.row_splits[:-1]`.
+
+    Args:
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A 1-D Tensor of int64 with shape `[nrows]`.
+      The returned tensor is nonnegative, and is sorted in ascending order.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      >>> rt.row_limits()  # indices of row limits in rt.values
+      tf.Tensor([4, 4, 7, 8, 8])
+      ```
+    """
+    with ops.name_scope(name, "RaggedRowLimits", [self]):
+      return self.row_splits[1:]
+
+  def row_lengths(self, axis=1, name=None):
+    """Returns the lengths of the rows in this ragged tensor.
+
+    `rt.row_lengths()[i]` indicates the number of values in the
+    `i`th row of `rt`.
+
+    Args:
+      axis: An integer constant indicating the axis whose row lengths should be
+        returned.
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A potentially ragged Tensor of int64 with shape `self.shape[:axis]`.
+
+    Raises:
+      ValueError: If `axis` is out of bounds.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []])
+      >>> rt.row_lengths(rt)  # lengths of rows in rt
+      tf.Tensor([2, 0, 2, 1, 0])
+      >>> rt.row_lengths(axis=2)  # lengths of axis=2 rows.
+      <tf.RaggedTensor [[3, 1], [], [2, 1], [1], []]>
+      ```
+    """
+    if self._cached_row_lengths is not None:
+      return self._cached_row_lengths
+
+    with ops.name_scope(name, "RaggedRowLengths", [self]):
+      axis = ragged_util.get_positive_axis(axis, self.shape.ndims)
+      if axis == 0:
+        return self.nrows()
+      elif axis == 1:
+        splits = self.row_splits
+        return splits[1:] - splits[:-1]
+      elif isinstance(self.values, RaggedTensor):
+        return self.with_values(self.values.row_lengths(axis - 1))
+      else:
+        shape = array_ops.shape(self.values, out_type=dtypes.int64)
+        return self.with_values(
+            array_ops.ones(shape[:axis - 1], dtypes.int64) * shape[axis - 1])
+
+  def nested_row_lengths(self, name=None):
+    """Returns a tuple containing the row_lengths for all ragged dimensions.
+
+    `rtnested_row_lengths()` is a tuple containing the `row_lengths` tensors for
+    all ragged dimensions in `rt`, ordered from outermost to innermost.
+
+    Args:
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `tuple` of 1-D `int64` `Tensors`.  The length of the tuple is equal to
+      `self.ragged_rank`.
+    """
+    with ops.name_scope(name, "RaggedNestedRowLengths", [self]):
+      rt_nested_row_lengths = []
+      rt = self
+      while isinstance(rt, RaggedTensor):
+        rt_nested_row_lengths.append(rt.row_lengths())
+        rt = rt.values
+      return tuple(rt_nested_row_lengths)
+
+  def bounding_shape(self, axis=None, name=None):
+    """Returns the tight bounding box shape for this `RaggedTensor`.
+
+    Args:
+      axis: An integer scalar or vector indicating which axes to return the
+        bounding box for.  If not specified, then the full bounding box is
+        returned.
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      An int64 `Tensor`.  If `axis` is not specified, then `output`
+      is a vector with `output.shape=[self.shape.ndims]`.  If `axis` is a
+      scalar, then the `output` is a scalar.  If `axis` is a vector, then
+      `output` is a vector, where `output[i]` is the bounding size for
+      dimension `axis[i]`.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
+      >>> rt.bounding_shape()
+      [5, 4]
+      ```
+    """
+    with ops.name_scope(name, "RaggedBoundingBox", [self, axis]):
+      nested_splits = self.nested_row_splits
+      rt_flat_values = self.flat_values
+
+      # Optimized special cases for when axis=0 or axis=1:
+      if isinstance(axis, int):
+        if axis == 0:
+          return array_ops.shape(nested_splits[0], out_type=dtypes.int64)[0] - 1
+        elif axis == 1:
+          return math_ops.maximum(math_ops.reduce_max(self.row_lengths()), 0)
+
+      splits_shape = array_ops.shape(self.row_splits, out_type=dtypes.int64)
+      flat_values_shape = array_ops.shape(rt_flat_values, out_type=dtypes.int64)
+
+      ragged_dimensions = array_ops.stack([splits_shape[0] - 1] + [
+          math_ops.maximum(math_ops.reduce_max(splits[1:] - splits[:-1]), 0)
+          for splits in nested_splits
+      ])
+      inner_dimensions = flat_values_shape[1:]
+
+      bbox = array_ops.concat([ragged_dimensions, inner_dimensions], axis=0)
+      return bbox if axis is None else array_ops.gather(bbox, axis)
+
+  #=============================================================================
+  # Transformation
+  #=============================================================================
+
+  def with_values(self, new_values):
+    """Returns a copy of `self` with `values` replaced by `new_value`.
+
+    Preserves cached row-partitioning tensors such as `self.cached_nrows` and
+    `self.cached_value_rowids` if they have values.
+
+    Args:
+      new_values: Potentially ragged tensor to use as the `values` for the
+        returned `RaggedTensor`.  Must have `rank > 0`, and must have the same
+        number of rows as `self.values`.
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = 1 + new_values.rank`.
+      `result.ragged_rank = 1 + new_values.ragged_rank`
+    """
+    new_values.shape.with_rank_at_least(1)
+    self.values.shape[:1].assert_is_compatible_with(new_values.shape[:1])
+    return RaggedTensor(
+        new_values,
+        self._row_splits,
+        self._cached_row_lengths,
+        self._cached_value_rowids,
+        self._cached_nrows,
+        internal=True)
+
+  def with_flat_values(self, new_values):
+    """Returns a copy of `self` with `flat_values` replaced by `new_value`.
+
+    Preserves cached row-partitioning tensors such as `self.cached_nrows` and
+    `self.cached_value_rowids` if they have values.
+
+    Args:
+      new_values: Potentially ragged tensor that should replace
+      `self.flat_values`.  Must have `rank > 0`, and must have the same
+      number of rows as `self.flat_values`.
+
+    Returns:
+      A `RaggedTensor`.
+      `result.rank = self.ragged_rank + new_values.rank`.
+      `result.ragged_rank = self.ragged_rank + new_values.ragged_rank`.
+    """
+    if isinstance(self._values, ops.Tensor):
+      return self.with_values(new_values)
+    else:
+      return self.with_values(self.values.with_flat_values(new_values))
+
+  #=============================================================================
+  # Tensor Type Conversions
+  #=============================================================================
+
+  @classmethod
+  def from_tensor(cls,
+                  tensor,
+                  lengths=None,
+                  padding=None,
+                  ragged_rank=1,
+                  name=None):
+    """Converts a `tf.Tensor` into a `RaggedTensor`.
+
+    The set of absent/default values may be specified using a vector of lengths
+    or a padding value (but not both).  If `lengths` is specified, then the
+    output tensor will satisfy `output[row] = tensor[row][:lengths[row]]`.
+    If `padding` is specified, then any row *suffix* consisting entirely of
+    `padding` will be excluded from the returned `RaggedTensor`.  If neither
+    `lengths` nor `padding` is specified, then the returned `RaggedTensor` will
+    have no absent/default values.
+
+    Examples:
+
+    ```python
+    >>> dt = tf.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+    >>> tf.RaggedTensor.from_tensor(dt)
+    <tf.RaggedTensor [[5, 7, 0], [0, 3, 0], [6, 0, 0]]>
+    >>> tf.RaggedTensor.from_tensor(dt, lengths=[2, 0, 3])
+    <tf.RaggedTensor [[5, 7], [], [6, 0, 0]]>
+    >>> tf.RaggedTensor.from_tensor(dt, padding=0)
+    <tf.RaggedTensor [[5, 7], [0, 3], [6]]>
+    ```
+
+    Args:
+      tensor: The `Tensor` to convert.  Must have rank `ragged_rank + 1` or
+        higher.
+      lengths: An optional set of row lengths, specified using a 1-D integer
+        `Tensor` whose length is equal to `tensor.shape[0]` (the number of rows
+        in `tensor`).  If specified, then `output[row]` will contain
+        `tensor[row][:lengths[row]]`.  Negative lengths are treated as zero.
+      padding: An optional padding value.  If specified, then any row suffix
+        consisting entirely of `padding` will be excluded from the returned
+        RaggedTensor.  `padding` is a `Tensor` with the same dtype as `tensor`
+        and with `shape=tensor.shape[ragged_rank + 1:]`.
+      ragged_rank: Integer specifying the ragged rank for the returned
+        `RaggedTensor`.  Must be greater than zero.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `RaggedTensor` with the specified `ragged_rank`.  The shape of the
+      returned ragged tensor is compatible with the shape of `tensor`.
+    Raises:
+      ValueError: If both `lengths` and `padding` are specified.
+    """
+    if lengths is not None and padding is not None:
+      raise ValueError("Specify lengths or padding, but not both")
+    if not isinstance(ragged_rank, int):
+      raise TypeError("ragged_rank expected int, got %r" % ragged_rank)
+    if ragged_rank <= 0:
+      raise ValueError(
+          "ragged_rank must be greater than 0; got %s" % ragged_rank)
+
+    with ops.name_scope(name, "RaggedFromTensor", [tensor, lengths, padding]):
+      tensor = ops.convert_to_tensor(tensor, name="tensor")
+      tensor.shape.with_rank_at_least(ragged_rank + 1)
+      input_shape = array_ops.shape(tensor, out_type=dtypes.int64)
+      ncols = input_shape[1]
+
+      # Handle ragged_rank>1 via recursion:
+      # If the output should have multiple ragged dimensions, then first
+      # flatten the tensor to eliminate all but the last ragged dimension,
+      # and recursively convert that flattened tensor.  Then add on the splits
+      # for the dimensions that we flattened out.
+      if ragged_rank > 1:
+        # Flatten `tensor` to eliminate all but the last ragged dimension.
+        new_shape = array_ops.concat([
+            constant_op.constant([-1], dtypes.int64), input_shape[ragged_rank:]
+        ],
+                                     axis=0)
+        flattened = array_ops.reshape(tensor, new_shape)
+        # Recursively convert the flattened tensor.
+        values = cls.from_tensor(flattened, lengths, padding)
+        # The total number of elements in each  dimension.  E.g., if
+        # input_shape=[3, 4, 5, 6], then dim[2] has 3*4*5 elements in total.
+        dim_size = math_ops.cumprod(input_shape)
+        # Construct splits tensors for the dimensions that were flattened.
+        new_splits = [
+            math_ops.range(0, dim_size[dim - 1] + 1) * input_shape[dim]
+            for dim in range(1, ragged_rank)
+        ]
+        return cls.from_nested_row_splits(values, new_splits)
+
+      # If padding was specified, then use it to find row lengths.
+      if padding is not None:
+        padding = ops.convert_to_tensor(
+            padding, name="padding", dtype=tensor.dtype)
+        padding.shape.assert_is_compatible_with(tensor.shape[2:])
+
+        # Find places where the padding is equal to the tensor.  (This will
+        # broadcast `padding` across the outermost 2 dimensions of `tensor`,
+        # so `has_default_value.shape = tensor.shape`.)
+        has_default_value = math_ops.equal(padding, tensor)
+
+        # If the padding isn't a scalar, then require that all values in the
+        # padding match each item in the tensor.  After this block of code,
+        # `has_default.shape = tensor.shape[:2]`.  (Unfortunately, we can't just
+        # use reduce_all for both cases, becaue when you pass an empty `axis`
+        # list to reduce_all, it reduces all axes; but we want it to reduce no
+        # axes -- i.e., to be a no-op.)
+        tensor_rank = array_ops.rank(tensor)
+        reduce_axis = math_ops.range(2, tensor_rank)
+        has_default = control_flow_ops.cond(
+            tensor_rank > 2,
+            lambda: math_ops.reduce_all(has_default_value, axis=reduce_axis),
+            lambda: has_default_value)
+        has_default.set_shape(tensor_shape.TensorShape([None, None]))
+        has_default.set_shape(tensor.shape[:2])
+
+        # Use has_default it to find the length of each row: for each
+        # non-default item in a row, calculate the length that the row needs to
+        # have to include that item; and then take the max of those values
+        # (across each row).
+        has_nondefault = math_ops.logical_not(has_default)
+        has_nondefault = math_ops.cast(has_nondefault, dtypes.int64)
+        length_for_nondefault_value = (
+            has_nondefault * array_ops.expand_dims(
+                math_ops.range(1, ncols + 1), 0))
+        lengths = math_ops.reduce_max(length_for_nondefault_value, axis=1)
+
+      # If we have lengths (either directly supplied, or computed from
+      # paddings), then use those to construct splits; and then use masking
+      # to get the corresponding values.
+      if lengths is not None:
+        lengths = ragged_util.convert_to_int_tensor(lengths, "lengths",
+                                                    dtypes.int64)
+        lengths.shape.assert_has_rank(1)
+        lengths = math_ops.minimum(lengths, ncols)
+        lengths = math_ops.maximum(lengths, 0)
+        limits = math_ops.cumsum(lengths)
+        splits = array_ops.concat([array_ops.zeros([1], dtypes.int64), limits],
+                                  axis=0)
+        mask = array_ops.sequence_mask(lengths, maxlen=ncols)
+        values = array_ops.boolean_mask(tensor, mask)
+        return cls.from_row_splits(values, splits)
+
+      # If neither padding nor lengths were specified, then create a splits
+      # vector that contains no default values, and reshape the input tensor
+      # to form the values for the RaggedTensor.
+      nrows = input_shape[0]
+      nvals = nrows * ncols
+      splits = math_ops.range(nrows + 1) * ncols
+      values_shape = array_ops.concat([[nvals], input_shape[2:]], axis=0)
+      values = array_ops.reshape(tensor, values_shape)
+      return cls.from_row_splits(values, splits)
+
+  def to_tensor(self, default_value=None, name=None):
+    """Converts this `RaggedTensor` into a `tf.Tensor`.
+
+    Example:
+
+    ```python
+    >>> rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
+    >>> print rt.to_tensor()
+    [[9 8 7]
+     [0 0 0]
+     [6 5 0]
+     [4 0 0]]
+    ```
+
+    Args:
+      default_value: Value to set for indices not specified in `self`. Defaults
+        to zero.  `default_value` must be broadcastable to
+        `self.shape[self.ragged_rank + 1:]`.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `Tensor` with shape `ragged.bounding_shape(self)` and the
+      values specified by the non-empty values in `self`.  Empty values are
+      assigned `default_value`.
+    """
+    with ops.name_scope(name, "RaggedToTensor", [self, default_value]):
+      if default_value is not None:
+        default_value = ops.convert_to_tensor(
+            default_value, name="default_value", dtype=self.dtype)
+
+      # If ragged_rank > 1, then recursively convert the ragged values into a
+      # `Tensor` before we proceed.
+      values = self.values
+      if is_ragged(values):
+        values = values.to_tensor(default_value)
+
+      # Tile the default value, if necessary.
+      if default_value is not None:
+        if values.shape.ndims is not None:
+          default_value.shape.with_rank_at_most(values.shape.ndims - 1)
+        if (values.shape.ndims is None or default_value.shape.ndims is None or
+            values.shape.ndims != default_value.shape.ndims + 1):
+          value_shape = array_ops.shape(values)[1:]
+          default_value = array_ops.broadcast_to(default_value, value_shape)
+        default_value.shape.assert_is_compatible_with(values.shape[1:])
+
+      # Get the expected dense shape ([nrows, ncols] + value_shape).
+      rt_row_lengths = [self.row_splits[1:] - self.row_splits[:-1]]
+      nrows = array_ops.shape(self.row_splits, out_type=dtypes.int64)[0] - 1
+      ncols = math_ops.maximum(math_ops.reduce_max(rt_row_lengths), 0)
+      values_shape = array_ops.shape(values, out_type=dtypes.int64)
+      value_shape = values_shape[1:]
+      nvals = values_shape[0]
+
+      # Build a default value if none was supplied.
+      if default_value is None:
+        default_value = array_ops.zeros(value_shape, dtype=values.dtype)
+      default_value.shape.assert_is_compatible_with(values.shape[1:])
+      default_value.set_shape(values.shape[1:])
+
+      # Get the row start indices, and expand to shape=[nrows, 1].
+      starts = array_ops.expand_dims(self.row_splits[:-1], 1)
+
+      # Get the row limit indices, and expand to shape=[nrows, 1].
+      limits = array_ops.expand_dims(self.row_splits[1:], 1)
+
+      # Get the column indices, and expand to shape=[1, ncols].
+      columns = array_ops.expand_dims(math_ops.range(0, ncols), 0)
+
+      # Build a list containing the values plus the default value.  We will use
+      # tf.gather to collect values from this list for the `Tensor` (using
+      # nvals as the index for the default value).
+      values_and_default = array_ops.concat(
+          [values, array_ops.stack([default_value])], axis=0)
+
+      # Construct a matrix "indices" pointing into values_and_default.  I.e.,
+      # output[r, c] = values_and_default[indices[r, c].
+      nondefault_index = starts + columns
+      has_value = nondefault_index < limits
+      default_index = array_ops.fill(array_ops.stack([nrows, ncols]), nvals)
+      indices = array_ops.where(has_value, nondefault_index, default_index)
+
+      # Gather the results into a `Tensor`.
+      return array_ops.gather(values_and_default, indices)
+
+  @classmethod
+  def from_sparse(cls, st_input, name=None):
+    """Converts a 2D `tf.SparseTensor` to a `RaggedTensor`.
+
+    Each row of the `output` `RaggedTensor` will contain the explicit values
+    from the same row in `st_input`.  `st_input` must be ragged-right.  If not
+    it is not ragged-right, then an error will be generated.
+
+    Example:
+
+    ```python
+    >>> st = SparseTensor(indices=[[0, 1], [0, 2], [0, 3], [1, 0], [3, 0]],
+    ...                   values=[1, 2, 3, 4, 5],
+    ...                   dense_shape=[4, 3])
+    >>> rt.RaggedTensor.from_sparse(st).eval().tolist()
+    [[1, 2, 3], [4], [], [5]]
+    ```
+
+    Currently, only two-dimensional `SparseTensors` are supported.
+
+    Args:
+      st_input: The sparse tensor to convert.  Must have rank 2.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `RaggedTensor` with the same values as `st_input`.
+      `output.ragged_rank = rank(st_input) - 1`.
+      `output.shape = [st_input.dense_shape[0], None]`.
+    Raises:
+      ValueError: If the number of dimensions in `st_input` is not known
+        statically, or is not two.
+    """
+    if not sparse_tensor.is_sparse(st_input):
+      raise TypeError("Expected SparseTensor, got %s" % type(st_input).__name__)
+    with ops.name_scope(name, "RaggedFromSparse", [st_input]):
+      st_input = sparse_tensor.convert_to_tensor_or_sparse_tensor(
+          st_input, name="st_input")
+
+      if st_input.dense_shape.shape.ndims is None:
+        static_rank_from_dense_shape = None
+      else:
+        static_rank_from_dense_shape = st_input.dense_shape.shape.dims[0].value
+
+      if st_input.indices.shape.ndims is None:
+        static_rank_from_indices = None
+      else:
+        static_rank_from_indices = st_input.indices.shape.dims[1].value
+
+      if static_rank_from_dense_shape != 2 and static_rank_from_indices != 2:
+        raise ValueError("rank(st_input) must be 2")
+
+      with ops.control_dependencies(
+          _assert_sparse_indices_are_ragged_right(st_input.indices)):
+        # Treat sparse row indices as segment ids to generate a splits tensor
+        # thta we can pair with the sparse tensor values.  (Ignore sparse column
+        # indices.)
+        segment_ids = st_input.indices[:, 0]
+        num_segments = st_input.dense_shape[0]
+        return cls.from_value_rowids(st_input.values, segment_ids, num_segments)
+
+  def to_sparse(self, name=None):
+    """Converts this `RaggedTensor` into a `tf.SparseTensor`.
+
+    Example:
+
+    ```python
+    >>> rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
+    >>> rt.to_sparse().eval()
+    SparseTensorValue(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]],
+                      values=[1, 2, 3, 4, 5, 6],
+                      dense_shape=[4, 3])
+    ```
+
+    Args:
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A SparseTensor with the same values as `self`.
+    """
+    with ops.name_scope(name, "RaggedToSparse", [self]):
+      result = gen_ragged_conversion_ops.ragged_tensor_to_sparse(
+          self.nested_row_splits, self.flat_values, name=name)
+      return sparse_tensor.SparseTensor(result.sparse_indices,
+                                        result.sparse_values,
+                                        result.sparse_dense_shape)
+
+  #=============================================================================
+  # String Encoding
+  #=============================================================================
+  def __str__(self):
+    if self._is_eager():
+      return "<tf.RaggedTensor %s>" % self.to_list()
+    else:
+      return self.__repr__()
+
+  def __repr__(self):
+    return "tf.RaggedTensor(values=%s, row_splits=%s)" % (self._values,
+                                                          self._row_splits)
+
+  #=============================================================================
+  # Eager Execution Mode
+  #=============================================================================
+
+  def to_list(self):
+    """Returns a nested Python `list` with the values for this `RaggedTensor`.
+
+    Requires that `rt` was constructed in eager execution mode.
+
+    Returns:
+      A nested Python `list`.
+    """
+    if self._is_eager():
+      return self._eager_value().to_list()
+    else:
+      raise ValueError("RaggedTensor.to_list() is only supported in eager "
+                       "mode; in graph mode, evaluate the RaggedTensor first "
+                       "and then use RaggedTensorValue.to_list().")
+
+  def _eager_value(self):
+    """Returns a RaggedTensorValue for self.  Requires self._is_eager()=true."""
+    value = self.flat_values.numpy()
+    for row_splits in reversed(self.nested_row_splits):
+      value = ragged_tensor_value.RaggedTensorValue(value, row_splits.numpy())
+    return value
+
+  def _is_eager(self):
+    """Returns True if values & row_splits Tensors are all `EagerTensor`s."""
+    rt = self
+    while isinstance(rt, RaggedTensor):
+      if not isinstance(rt.row_splits, ops.EagerTensor):
+        return False
+      rt = rt.values
+    return isinstance(rt, ops.EagerTensor)
+
+  #=============================================================================
+  # Indexing & Slicing
+  #=============================================================================
+  def __getitem__(self, key):
+    """Returns the specified piece of this RaggedTensor."""
+    # See ragged_getitem.py for the documentation and implementation of this
+    # method.
+    #
+    # Note: the imports in ragged/__init__.py ensure that this method always
+    # gets overridden before it is called.
+
+  #=============================================================================
+  # Name Scope
+  #=============================================================================
+
+  # This private function is used by ops.name_scope to ensure that all of the
+  # input tensors for the scope belong to the same graph.  Defining this means
+  # that you may include `RaggedTensor` objects in the name_scope `values`
+  # list.
+  def _as_graph_element(self):
+    """Convert `self` to a graph element."""
+    values = self.values
+    while isinstance(values, RaggedTensor):
+      values = values.values
+    return values
+
+
+def is_ragged(value):
+  """Returns true if `value` is a ragged tensor or ragged tensor value."""
+  return isinstance(value,
+                    (RaggedTensor, ragged_tensor_value.RaggedTensorValue))
+
+
+#===============================================================================
+# Convert value -> tensor
+#===============================================================================
+def convert_to_tensor_or_ragged_tensor(value,
+                                       dtype=None,
+                                       preferred_dtype=None,
+                                       name=None):
+  """Converts value to a `RaggedTensor` or `Tensor`.
+
+  * If `value` is a `RaggedTensor`, then return it as-is.
+  * If `value` is a `RaggedTensorValue`, return a corresponding constant
+    `RaggedTensor`.
+  * Otherwise, use `convert_to_tensor` to convert `value` to a `Tensor`.
+
+  Args:
+    value: A `RaggedTensor`, a `RaggedTensorValue`, or an object whose type has
+      a registered `Tensor` conversion function.
+    dtype: Optional element type for the returned tensor.  If missing the type
+      is inferred from the type of `value`.
+    preferred_dtype: Optional element type for the returned tensor, used when
+      dtype is None.  This argument has no effect if `value` is already a
+      tensor, or when conversion is not possible.
+    name: Optional name to use if a new `Tensor` is created.
+
+  Returns:
+    A `Tensor` or `RaggedTensor`.
+  """
+  if isinstance(value, RaggedTensor):
+    if dtype and not dtype.is_compatible_with(value.dtype):
+      raise ValueError("Tensor conversion requested dtype %s for "
+                       "RaggedTensor with dtype %s: %r" %
+                       (dtype.name, value.dtype.name, value))
+    return value
+  elif isinstance(value, ragged_tensor_value.RaggedTensorValue):
+    with ops.name_scope(name, "ConvertToTensorOrRaggedTensor", []):
+      flat_values = ops.convert_to_tensor(
+          value=value.flat_values,
+          dtype=dtype,
+          preferred_dtype=preferred_dtype,
+          name="flat_values")
+      return RaggedTensor.from_nested_row_splits(flat_values,
+                                                 value.nested_row_splits)
+  else:
+    return ops.convert_to_tensor(
+        value=value, dtype=dtype, preferred_dtype=preferred_dtype, name=name)
+
+
+#===============================================================================
+# Register RaggedTensor for use with session.run.
+#===============================================================================
+def _ragged_tensor_value_from_components(components):
+  components = list(components)
+  value = components.pop()
+  while components:
+    value = ragged_tensor_value.RaggedTensorValue(value, components.pop())
+  return value
+
+
+def _ragged_tensor_session_fetch(rt):
+  components = rt.nested_row_splits + (rt.flat_values,)
+  return (components, _ragged_tensor_value_from_components)
+
+
+def _ragged_tensor_session_feed(feed_key, feed_val):
+  key_components = feed_key.nested_row_splits + (feed_key.flat_values,)
+  val_components = feed_val.nested_row_splits + (feed_val.flat_values,)
+  return zip(key_components, val_components)
+
+
+def _ragged_tensor_session_feed_for_partial_run(feed_key):
+  return feed_key.nested_row_splits + (feed_key.flat_values,)
+
+
+session.register_session_run_conversion_functions(
+    RaggedTensor, _ragged_tensor_session_fetch, _ragged_tensor_session_feed,
+    _ragged_tensor_session_feed_for_partial_run)
+
+
+#===============================================================================
+# RaggedTensorType
+#===============================================================================
+class RaggedTensorType(object):
+  """Encoding of a static type for a `RaggedTensor`.
+
+  Use this type to express/declare that an output must have the type of
+  `RaggedTensor`.
+  """
+
+  def __init__(self, dtype, ragged_rank):
+    """Initializes a RaggedTensorType object.
+
+    Args:
+      dtype: data type of the `RaggedTensor`'s inner values.
+      ragged_rank: ragged_rank of the declared `RaggedTensor`.
+    """
+    self._dtype = dtype
+    self._ragged_rank = ragged_rank
+
+  dtype = property(lambda self: self._dtype)
+  ragged_rank = property(lambda self: self._ragged_rank)
+
+
+#===============================================================================
+# Helper Functions
+#===============================================================================
+def _assert_sparse_indices_are_ragged_right(indices):
+  """Checks that the given SparseTensor.indices tensor is ragged-right.
+
+  Example: `indices = [[0, 0], [0, 1], [2, 0], [3, 1]]` is not ragged right
+  because the entry `[3, 1]` skips a cell.
+
+  Args:
+    indices: The SparseTensor indices to check.
+
+  Returns:
+    A list of control dependency op tensors.
+  """
+  index_prefix = indices[:, :-1]
+  index_suffix = indices[:, -1]
+
+  # Check whether each index is starting a new row in the innermost dimension
+  # (prefix[i] != prefix[i-1]) or continuing a row (prefix[i] == prefix[i-1]).
+  # (Note: this skips the first index; we will check that separately below.)
+  index_prefix_changed = math_ops.reduce_any(
+      math_ops.not_equal(index_prefix[1:], index_prefix[:-1]), axis=1)
+
+  # Check two cases:
+  #   * For indices that start a new row: index_suffix[i] must be zero.
+  #   * For indices that continue a row: index_suffix[i] must be equal to
+  #     index_suffix[i-1]+1.
+  index_ok = array_ops.where(
+      index_prefix_changed, math_ops.equal(index_suffix[1:], 0),
+      math_ops.equal(index_suffix[1:], index_suffix[:-1] + 1))
+
+  # Also check that the very first index didn't skip any cells.  The first
+  # index starts a new row (by definition), so its suffix should be zero.
+  sparse_indices_are_ragged_right = math_ops.logical_and(
+      math_ops.reduce_all(math_ops.equal(index_suffix[:1], 0)),
+      math_ops.reduce_all(index_ok))
+
+  message = [
+      "SparseTensor is not right-ragged", "SparseTensor.indices =", indices
+  ]
+  return [control_flow_ops.Assert(sparse_indices_are_ragged_right, message)]
+
+
+@ops.RegisterGradient("RaggedTensorToSparse")
+def _ragged_tensor_to_sparse_gradient(op, unused_sparse_indices_grad,
+                                      sparse_values_grad,
+                                      unused_sparse_shape_grad):
+  """Gradient for RaggedTensorToSparse."""
+  op_inputs_nested_row_splits = op.inputs[:-1]
+  op_inputs_flat_values = op.inputs[-1]
+
+  # No gradient for the RaggedTensor's nested_row_splits.
+  nested_row_splits_gradient = [None] * len(op_inputs_nested_row_splits)
+
+  # Gradient for the RaggedTensor's flat_values is formed by reshaping
+  # the gradient for the SparseTensor's values.
+  flat_values_shape = array_ops.shape(op_inputs_flat_values)
+  flat_values_gradient = array_ops.reshape(sparse_values_grad,
+                                           flat_values_shape)
+
+  return nested_row_splits_gradient + [flat_values_gradient]
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e6ebdf332e6f53b7a3af5679af1cbf27ec9f792
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.bounding_shape."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase):
+
+  def testDocStringExample(self):
+    # This is the example from ragged.bounding_shape.__doc__.
+    rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
+    self.assertRaggedEqual(rt.bounding_shape(), [5, 4])
+
+  def test2DRaggedTensorWithOneRaggedDimension(self):
+    values = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
+    rt1 = ragged.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged.RaggedTensor.from_row_splits(values, [0, 7])
+    rt3 = ragged.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
+    self.assertRaggedEqual(rt1.bounding_shape(), [5, 3])
+    self.assertRaggedEqual(rt2.bounding_shape(), [1, 7])
+    self.assertRaggedEqual(rt3.bounding_shape(), [3, 7])
+
+  def test3DRaggedTensorWithOneRaggedDimension(self):
+    values = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]]
+    rt1 = ragged.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged.RaggedTensor.from_row_splits(values, [0, 7])
+    rt3 = ragged.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
+    self.assertRaggedEqual(rt1.bounding_shape(), [5, 3, 2])
+    self.assertRaggedEqual(rt2.bounding_shape(), [1, 7, 2])
+    self.assertRaggedEqual(rt3.bounding_shape(), [3, 7, 2])
+
+  def testExplicitAxisOptimizations(self):
+    rt = ragged.RaggedTensor.from_row_splits(b'a b c d e f g'.split(),
+                                             [0, 2, 5, 6, 6, 7])
+    self.assertRaggedEqual(rt.bounding_shape(0), 5)
+    self.assertRaggedEqual(rt.bounding_shape(1), 3)
+    self.assertRaggedEqual(rt.bounding_shape([1, 0]), [3, 5])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_shape.py b/tensorflow/python/ops/ragged/ragged_tensor_shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..706881da74a46137171d4d4771b82e652d4ad4c8
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_shape.py
@@ -0,0 +1,572 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Shapes & broadcasting for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+class RaggedTensorDynamicShape(object):
+  """A collection of tensors encoding the shape of a potentially ragged tensor.
+
+  Each `RaggedTensorDynamicShape` consists of an ordered list of dimension
+  sizes.  There are two dimension types:
+
+    * "Uniform dimensions" are dimenisons where all slices have the same
+      length.  `RaggedTensorDynamicShape` records the size of each uniform
+      dimension using a single scalar integer.
+
+    * "Ragged dimensions" are dimensions whose slices may have different
+      lengths.  `RaggedTensorDynamicShape` records the size of each ragged
+      dimension using an integer vector containing the slice lengths for all
+      the slices across that dimension.
+
+  Furthermore, there are two ways a dimension might be encoded:
+
+    * "Partitioned dimensions" are dimensions that are encoded using a
+      `RaggedTensor`'s `nested_row_splits`.  The outermostmost partitioned
+      dimension must be uniform, and the innermost partitioned dimension must
+      be ragged.
+
+    * "Inner dimensions" are dimensions that are encoded using a
+      `RaggedTensor`'s `flat_values`.  Inner dimensions are always uniform.
+
+  The sizes of partitioned dimensions are recorded using `partitioned_dim_sizes`
+  and `inner_dim_sizes`:
+
+    * `paritioned_dim_sizes` is a list of tensors (one for each partitioned
+      dimension).
+
+      * For uniform dimensions, the tensor is an integer scalar specifying the
+        size of all slices across that dimension.
+      * For ragged dimensions, the tensor is an integer vector specifying the
+        size of each slice across that dimension.
+
+    * `inner_dim_sizes` is a single integer vector, where each element
+      specifies the size of a single inner dimension.
+
+  Examples:
+
+  Tensor                         | Ragged | Partitioned Dim Sizes  | Inner Dim
+                                 : Rank   :                        : Sizes
+  ------------------------------ | ------ | ---------------------- | ----------
+  `[[1, 2, 3], [4, 5, 6]]`       |      0 |                        | `2, 3`
+  `[[1, 2], [], [3, 4, 5]]`      |      1 | `3, (2, 0, 3)`         |
+  `[[[1, 2], [3, 4]], [[5, 6]]]` |      1 | `2, (2, 1)`            | 2
+  `[[[1, 2], [3]], [[4, 5]]]`    |      2 | `2, (2, 1), (2, 1, 2)` |
+  """
+
+  def __init__(self, partitioned_dim_sizes, inner_dim_sizes):
+    """Creates a RaggedTensorDynamicShape.
+
+    Args:
+      partitioned_dim_sizes: A `list` of 0-D or 1-D integer `Tensor`, one for
+        each partitioned dimension.  If dimension `d` is uniform, then
+        `partitioned_dim_sizes[d]` must be an integer scalar, specifying the
+        size of all slices across dimension `d`.  If dimension `d` is ragged,
+        then `partitioned_dim_sizes[d]` must be an integer vector, specifying
+        the size of each slice across dimension `d`.
+      inner_dim_sizes: A 1-D integer `Tensor`, whose length is equal to the
+        number of inner dimensions.  `inner_dim_sizes[n]` is the size of all
+        slices across the `n`th inner dimension (which is the
+        `(len(partitioned_dim_sizes)+n)`th dimension in the overall tensor.
+    """
+    assert isinstance(partitioned_dim_sizes, (list, tuple))
+    with ops.name_scope(None, 'RaggedTensorDynamicShape',
+                        (partitioned_dim_sizes, inner_dim_sizes)):
+      partitioned_dim_sizes = tuple(
+          ragged_util.convert_to_int_tensor(
+              size, dtype=dtypes.int64, name='partitioned_dimension_size')
+          for size in partitioned_dim_sizes)
+      inner_dim_sizes = ragged_util.convert_to_int_tensor(
+          inner_dim_sizes, dtype=dtypes.int64, name='inner_dim_sizes')
+
+      # Validate shapes.
+      if partitioned_dim_sizes:
+        for axis, dimension_size in enumerate(partitioned_dim_sizes):
+          if dimension_size.shape.ndims is None:
+            raise ValueError(
+                'rank of partitioned_dim_sizes[%d] is unknown' % axis)
+          dimension_size.shape.with_rank_at_most(1)
+        if partitioned_dim_sizes[0].shape.ndims == 1:
+          raise ValueError('outermost partitioned dimension must be uniform')
+        if partitioned_dim_sizes[-1].shape.ndims == 0:
+          raise ValueError('innermost partitioned dimension must be ragged')
+      inner_dim_sizes.shape.assert_has_rank(1)
+
+      self._partitioned_dim_sizes = partitioned_dim_sizes
+      self._inner_dim_sizes = inner_dim_sizes
+
+  def __repr__(self):
+    return ('RaggedTensorDynamicShape'
+            '(partitioned_dim_sizes=%r, inner_dim_sizes=%r)' %
+            (self._partitioned_dim_sizes, self._inner_dim_sizes))
+
+  @staticmethod
+  def from_dim_sizes(dim_sizes):
+    """Constructs a ragged shape from a list of dimension sizes.
+
+    This list contains a single tensor for each dimension, where the tensor
+    is a scalar if the dimension is uniform, or a vector if the dimension is
+    ragged.
+
+    Args:
+      dim_sizes: List of int64 scalars or vectors.
+
+    Returns:
+      A RaggedTensorDynamicShape.
+    """
+    with ops.name_scope(None, 'RaggedTensorDynamicShapeFromDimensionSizes',
+                        [dim_sizes]):
+      dim_sizes = tuple(
+          ragged_util.convert_to_int_tensor(
+              size, dtype=dtypes.int64, name='dim_sizes') for size in dim_sizes)
+      # Split the dimensions into partitioned & inner dimensions.
+      inner_split = 0
+      for dim, dim_size in enumerate(dim_sizes):
+        if dim_size.shape.ndims == 1:
+          inner_split = dim + 1
+        elif dim_size.shape.ndims != 0:
+          raise ValueError('Each dim_size must be a scalar or a vector')
+      return RaggedTensorDynamicShape(dim_sizes[:inner_split],
+                                      dim_sizes[inner_split:])
+
+  @classmethod
+  def from_tensor(cls, rt_input):
+    """Constructs a ragged shape for a potentially ragged tensor."""
+    with ops.name_scope(None, 'RaggedTensorDynamicShapeFromTensor', [rt_input]):
+      rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(rt_input)
+      if not ragged_tensor.is_ragged(rt_input):
+        return cls([], array_ops.shape(rt_input))
+      else:
+        partitioned_dim_sizes = (
+            (rt_input.nrows(),) + rt_input.nested_row_lengths())
+        return RaggedTensorDynamicShape(
+            partitioned_dim_sizes,
+            array_ops.shape(rt_input.flat_values)[1:])
+
+  def dimension_size(self, axis):
+    """Returns the size of slices across the specified dimension."""
+    if not isinstance(axis, int):
+      raise TypeError('axis must be an integer')
+    partitioned_ndims = len(self._partitioned_dim_sizes)
+    if axis < partitioned_ndims:
+      return self._partitioned_dim_sizes[axis]
+    else:
+      return self._inner_dim_sizes[axis - partitioned_ndims]
+
+  def is_ragged(self, axis):
+    """Returns true if the indicated dimension is ragged."""
+    if not isinstance(axis, int):
+      raise TypeError('axis must be an integer')
+    rank = self.rank
+    if axis < 0:
+      raise ValueError('Negative axis values are not supported')
+    elif rank is not None and axis >= rank:
+      raise ValueError('Expected axis=%s < rank=%s' % (axis, rank))
+    else:
+      return (axis > 0 and axis < len(self._partitioned_dim_sizes) and
+              self._partitioned_dim_sizes[axis].shape.ndims == 1)
+
+  @property
+  def rank(self):
+    """The number of dimensions in this shape, or None if unknown."""
+    inner_ndims = tensor_shape.dimension_value(self._inner_dim_sizes.shape[0])
+    if inner_ndims is None:
+      return None
+    else:
+      return len(self._partitioned_dim_sizes) + inner_ndims
+
+  @property
+  def partitioned_dim_sizes(self):
+    """The partitioned dimension sizes for this shape.
+
+    Returns:
+      A `list` of 0-D or 1-D integer `Tensor`.
+    """
+    return self._partitioned_dim_sizes
+
+  @property
+  def inner_dim_sizes(self):
+    """The inner dimension sizes for this shape.
+
+    Returns:
+      A 1-D integer `Tensor`.
+    """
+    return self._inner_dim_sizes
+
+  @property
+  def num_partitioned_dimensions(self):
+    """The number of partitioned dimensions in this shape."""
+    return len(self._partitioned_dim_sizes)
+
+  @property
+  def num_inner_dimensions(self):
+    """The number of inner dimensions, or `None` if not statically known."""
+    return tensor_shape.dimension_value(self._inner_dim_sizes.shape[0])
+
+  def broadcast_to_rank(self, rank):
+    """Adds leading size-1 dimensions to broadcast `self` to the given rank.
+
+    E.g., if `shape1` is `[3, (D2), 4]`, then `shape1.broadcast_to_rank(5)`
+    is `[1, 1, 3, (D2), 4]`.
+
+    Args:
+      rank: The rank for the returned shape.
+
+    Returns:
+      A RaggedTensorDynamicShape with `rank` dimensions, whose inner dimensions
+      have the same size as `self` and whose outer dimensions have size `1`.
+
+    Raises:
+      ValueError: If `self.rank` is unknown or greater than `rank`.
+    """
+    if self.rank is None:
+      raise ValueError('Unable to broadcast: self.rank is unknown')
+    dims_to_add = rank - self.rank
+    if dims_to_add < 0:
+      raise ValueError('Unable to broadcast: rank=%d must be greater than '
+                       'self.rank=%d.' % (rank, self.rank))
+    elif dims_to_add == 0:
+      return self
+    elif self._partitioned_dim_sizes:
+      partitioned_dims = (1,) * dims_to_add + self._partitioned_dim_sizes
+      return RaggedTensorDynamicShape(partitioned_dims, self._inner_dim_sizes)
+    else:
+      inner_dims = array_ops.concat(
+          [array_ops.ones([dims_to_add], dtypes.int64), self.inner_dim_sizes],
+          axis=0)
+      return RaggedTensorDynamicShape([], inner_dims)
+
+  def broadcast_dimension(self, axis, lengths):
+    """Returns a shape that is broadcast-compatible with self & lengths.
+
+    * If dimension[axis] is uniform and lengths is a scalar, the check
+      that either lengths==1 or axis==1 or lengths==axis, and tile
+      dimension[axis] with tf.where(lengths==axis, 1, axis) repeats.
+
+    * If dimension[axis] is uniform and lengths is a vector, then check
+      that dimension[axis]==1, and raggedly tile dimension[axis] with
+      lengths repeats.  (we can skip tiling if we statically know that
+      slice_lengths == 1??)
+
+    * If dimension[axis] is ragged and lengths is a scalar, then check
+      that lengths==1.
+
+    * If dimension[axis] is ragged and lengths is a vector, then check
+      that self.dimension_size(axis) == lengths.
+
+    Args:
+      axis: `int`.  The dimension to broadcast.
+      lengths: 0-D or 1-D integer `Tensor`.
+
+    Returns:
+      A `RaggedTensorDynamicShape`.
+    """
+    lengths = ragged_util.convert_to_int_tensor(
+        lengths, name='lengths', dtype=dtypes.int64)
+    # Check whether lengths is a scalar (for uniform dimensions) or
+    # vector (for ragged dimensions).
+    if lengths.shape.ndims is None:
+      raise ValueError('lengths must have a known rank.')
+    elif lengths.shape.ndims > 1:
+      raise ValueError('lengths must be a scalar or vector')
+    else:
+      lengths_is_scalar = (lengths.shape.ndims == 0)
+
+    # Verify that the shapes are compatible.
+    if self.is_ragged(axis):
+      if lengths_is_scalar:
+        condition = math_ops.equal(lengths, 1)
+      else:
+        condition = math_ops.reduce_all(
+            math_ops.equal(lengths, self.dimension_size(axis)))
+    else:
+      axis_dim_size = self.dimension_size(axis)
+      if lengths_is_scalar:
+        condition = (
+            math_ops.equal(lengths, 1) | math_ops.equal(axis_dim_size, 1)
+            | math_ops.equal(axis_dim_size, lengths))
+      else:
+        condition = math_ops.equal(axis_dim_size, 1)
+    broadcast_err = [
+        'Unable to broadcast: dimension size mismatch in dimension', axis,
+        'lengths=', lengths, 'dim_size=',
+        self.dimension_size(axis)
+    ]
+    broadcast_check = control_flow_ops.Assert(
+        condition, data=broadcast_err, summarize=10)
+
+    with ops.control_dependencies([broadcast_check]):
+      # Partitioned dimensions:
+      if axis < self.num_partitioned_dimensions:
+        if self.is_ragged(axis):
+          # Use an identity op to make sure the check actually gets run.
+          return RaggedTensorDynamicShape(
+              self._partitioned_dim_sizes,
+              array_ops.identity(self.inner_dim_sizes))
+        else:
+          return self._broadcast_uniform_partitioned_dimension(axis, lengths)
+
+      # Inner dimensions:
+      else:
+        if lengths_is_scalar:
+          return self._broadcast_inner_dimension_to_uniform(axis, lengths)
+        else:
+          if axis == 0:
+            raise ValueError('Unable to broadcast: '
+                             'outermost dimension must be uniform.')
+          return self._broadcast_inner_dimension_to_ragged(axis, lengths)
+
+  def num_slices_in_dimension(self, axis):
+    """Returns the total number of slices across the indicated dimension."""
+    if axis < 0:
+      return constant_op.constant(1, dtype=dtypes.int64)
+    elif self.is_ragged(axis):
+      return math_ops.reduce_sum(self._partitioned_dim_sizes[axis])
+    else:
+      return self.dimension_size(axis) * self.num_slices_in_dimension(axis - 1)
+
+  def _broadcast_uniform_partitioned_dimension(self, axis, lengths):
+    """Broadcasts the partitioned dimension `axis` to match `lengths`."""
+    axis_dim_size = self.dimension_size(axis)
+    partitioned_sizes = list(self._partitioned_dim_sizes[:axis])
+
+    if lengths.shape.ndims == 0:
+      lengths = array_ops.where(
+          math_ops.equal(axis_dim_size, 1), lengths, axis_dim_size)
+      repeats = array_ops.where(math_ops.equal(axis_dim_size, 1), lengths, 1)
+      splits = array_ops.stack([0, self.num_slices_in_dimension(axis)])
+    else:
+      splits = math_ops.range(
+          array_ops.size(lengths, out_type=dtypes.int64) + 1)
+      repeats = lengths
+
+    partitioned_sizes.append(lengths)
+
+    for dim_size in self._partitioned_dim_sizes[axis + 1:]:
+      if dim_size.shape.ndims == 0:
+        partitioned_sizes.append(dim_size)
+        splits *= dim_size
+      else:
+        partitioned_sizes.append(
+            ragged_util.repeat_ranges(dim_size, splits, repeats))
+        splits = array_ops.gather(
+            ragged_util.lengths_to_splits(dim_size), splits)
+    inner_sizes = self._inner_dim_sizes
+    return RaggedTensorDynamicShape(partitioned_sizes, inner_sizes)
+
+  def _broadcast_inner_dimension_to_uniform(self, axis, length):
+    """Broadcasts the inner dimension `axis` to match `lengths`."""
+    dim_size = self.dimension_size(axis)
+    axis_in_inner_dims = axis - self.num_partitioned_dimensions
+    partitioned_sizes = self._partitioned_dim_sizes
+    inner_sizes = array_ops.concat([
+        self._inner_dim_sizes[:axis_in_inner_dims],
+        [array_ops.where(math_ops.equal(dim_size, 1), length, dim_size)],
+        self._inner_dim_sizes[axis_in_inner_dims + 1:]
+    ],
+                                   axis=0)
+    return RaggedTensorDynamicShape(partitioned_sizes, inner_sizes)
+
+  def _broadcast_inner_dimension_to_ragged(self, axis, lengths):
+    axis_in_inner_dims = axis - self.num_partitioned_dimensions
+    partitioned_sizes = (
+        self._partitioned_dim_sizes + tuple([
+            self._inner_dim_sizes[i] for i in range(axis_in_inner_dims)
+        ]) + (lengths,))
+    inner_sizes = self._inner_dim_sizes[axis_in_inner_dims + 1:]
+    return RaggedTensorDynamicShape(partitioned_sizes, inner_sizes)
+
+
+def broadcast_dynamic_shape(shape_x, shape_y):
+  """Returns the shape formed by broadcasting two shapes to be compatible.
+
+  Args:
+    shape_x: A `RaggedTensorDynamicShape`
+    shape_y: A `RaggedTensorDynamicShape`
+
+  Returns:
+    A `RaggedTensorDynamicShape`.
+  Raises:
+    ValueError: If `shape_x` and `shape_y` are not broadcast-compatible.
+  """
+  if not isinstance(shape_x, RaggedTensorDynamicShape):
+    raise TypeError('shape_x must be a RaggedTensorDynamicShape')
+  if not isinstance(shape_y, RaggedTensorDynamicShape):
+    raise TypeError('shape_y must be a RaggedTensorDynamicShape')
+
+  # Broadcast both shapes to have the same rank.
+  if shape_x.rank is None or shape_y.rank is None:
+    raise ValueError('Unable to broadcast: unknown rank')
+  broadcast_rank = max(shape_x.rank, shape_y.rank)
+  shape_x = shape_x.broadcast_to_rank(broadcast_rank)
+  shape_y = shape_y.broadcast_to_rank(broadcast_rank)
+
+  # Broadcast dimensions one at a time, starting from the outermost dimension.
+  for axis in range(broadcast_rank):
+    shape_x = shape_x.broadcast_dimension(axis, shape_y.dimension_size(axis))
+    shape_y = shape_y.broadcast_dimension(axis, shape_x.dimension_size(axis))
+
+  return shape_x
+
+
+def broadcast_to(rt_input, shape, broadcast_inner_dimensions=True):
+  """Broadcasts a potentially ragged tensor to a ragged shape.
+
+  Tiles `rt_input` as necessary to match the given shape.
+
+  Behavior is undefined if `rt_input` is not broadcast-compatible with `shape`.
+
+  Args:
+    rt_input: The potentially ragged tensor to broadcast.
+    shape: A `RaggedTensorDynamicShape`
+    broadcast_inner_dimensions: If false, then inner dimensions will not be
+      tiled.
+
+  Returns:
+    A potentially ragged tensor whose values are taken from
+    `rt_input`, and whose shape matches `shape`.
+  """
+  if not isinstance(shape, RaggedTensorDynamicShape):
+    raise TypeError('shape must be a RaggedTensorDynamicShape')
+  rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(rt_input)
+
+  # Broadcasting to a uniform shape.
+  if shape.num_partitioned_dimensions == 0:
+    return _broadcast_to_uniform_shape(rt_input, shape,
+                                       broadcast_inner_dimensions)
+  else:
+    return _broadcast_to_ragged_shape(rt_input, shape,
+                                      broadcast_inner_dimensions)
+
+
+def _broadcast_to_uniform_shape(rt_input, shape, broadcast_inner_dimensions):
+  """Broadcasts rt_input to the uniform shape `shape`."""
+  if isinstance(rt_input, ragged_tensor.RaggedTensor):
+    raise ValueError('Incompatible with shape: ragged rank mismatch')
+  if broadcast_inner_dimensions:
+    return array_ops.broadcast_to(rt_input, shape.inner_dim_sizes)
+  else:
+    return rt_input
+
+
+def _broadcast_to_ragged_shape(rt_input, dst_shape, broadcast_inner_dimensions):
+  """Broadcasts rt_input to the ragged shape `dst_shape`."""
+  # dst_shape's rank and ragged_rank must be greater than or equal to rt_input's
+  if rt_input.shape.ndims is None or dst_shape.rank is None:
+    raise ValueError('Unable to broadcast: unknown rank')
+  if rt_input.shape.ndims > dst_shape.rank:
+    raise ValueError('Incompatible with shape: rank mismatch')
+  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
+      rt_input.ragged_rank >= dst_shape.num_partitioned_dimensions):
+    raise ValueError('Incompatible with shape: ragged rank mismatch')
+
+  src_shape = RaggedTensorDynamicShape.from_tensor(rt_input)
+  src_shape = src_shape.broadcast_to_rank(dst_shape.rank)
+
+  # Add dimensions to rt_input so its rank and ragged_rank matches dst_shape.
+  if dst_shape.rank > rt_input.shape.ndims:
+    if rt_input.shape.ndims < dst_shape.num_inner_dimensions + 1:
+      rt_input = array_ops.reshape(
+          rt_input, array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0))
+    for _ in range(dst_shape.rank - rt_input.shape.ndims):
+      if ragged_tensor.is_ragged(rt_input):
+        nrows = rt_input.nrows()
+      else:
+        nrows = array_ops.shape(rt_input, out_type=dtypes.int64)[0]
+      rt_input = ragged_tensor.RaggedTensor.from_row_lengths(rt_input, [nrows])
+
+  # Add ragged dimensions to match dst_shape.
+  if ragged_tensor.is_ragged(rt_input):
+    inner_rank_diff = (
+        rt_input.flat_values.shape.ndims - 1 - dst_shape.num_inner_dimensions)
+    if inner_rank_diff > 0:
+      rt_input = rt_input.with_flat_values(
+          ragged_conversion_ops.from_tensor(
+              rt_input.flat_values, ragged_rank=inner_rank_diff))
+  else:
+    rt_input = ragged_conversion_ops.from_tensor(
+        rt_input, ragged_rank=dst_shape.num_partitioned_dimensions - 1)
+
+  # Do broadcasting for any dimensions that will remain uniform.  We can do
+  # these all at once, since they're independent of one another.
+  multiples = [1] * dst_shape.rank
+  for axis in range(dst_shape.num_partitioned_dimensions):
+    if not src_shape.is_ragged(axis) and not dst_shape.is_ragged(axis):
+      src_size = src_shape.dimension_size(axis)
+      dst_size = dst_shape.dimension_size(axis)
+      if ((tensor_util.constant_value(src_size) in (1, None)) and
+          (tensor_util.constant_value(dst_size) != 1)):
+        multiples[axis] = array_ops.where(
+            math_ops.equal(src_size, 1), dst_size, 1)
+  if not all(isinstance(v, int) and v == 1 for v in multiples):
+    multiples = array_ops.stack(multiples, axis=0)
+    rt_input = ragged_array_ops.tile(rt_input, multiples)
+
+  if broadcast_inner_dimensions:
+    rt_input = rt_input.with_flat_values(
+        array_ops.reshape(
+            rt_input.flat_values,
+            array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0)))
+
+  # Do broadcasting for dimensions that become ragged.  We must do these from
+  # outermost to innermost.
+  for axis in range(dst_shape.num_partitioned_dimensions):
+    if not src_shape.is_ragged(axis) and dst_shape.is_ragged(axis):
+      dst_size = dst_shape.dimension_size(axis)
+      rt_input = _ragged_tile_axis(rt_input, axis, dst_size)
+
+  return rt_input
+
+
+def _ragged_tile_axis(rt_input, axis, repeats):
+  """Tile a dimension of a RaggedTensor to match a ragged shape."""
+  assert axis > 0  # Outermost dimension may not be ragged.
+
+  if not ragged_tensor.is_ragged(rt_input):
+    rt_input = ragged_conversion_ops.from_tensor(rt_input, ragged_rank=1)
+
+  if axis > 1:
+    return rt_input.with_values(
+        _ragged_tile_axis(rt_input.values, axis - 1, repeats))
+  else:
+    src_row_splits = rt_input.nested_row_splits
+    src_row_lengths = rt_input.nested_row_lengths()
+    splits = src_row_splits[0]
+
+    dst_row_lengths = [repeats]
+    for i in range(1, len(src_row_lengths)):
+      dst_row_lengths.append(
+          ragged_util.repeat_ranges(src_row_lengths[i], splits, repeats))
+      splits = array_ops.gather(src_row_splits[i], splits)
+    dst_values = ragged_util.repeat_ranges(rt_input.flat_values, splits,
+                                           repeats)
+    return ragged_tensor.RaggedTensor.from_nested_row_lengths(
+        dst_values, dst_row_lengths)
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec06aeaea546d679d65c7c8d64357393afd3eae2
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
@@ -0,0 +1,479 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.ragged.ragged_tensor_shape."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase,
+                                  parameterized.TestCase):
+
+  def assertShapeEq(self, x, y):
+    assert isinstance(x, ragged.RaggedTensorDynamicShape)
+    assert isinstance(y, ragged.RaggedTensorDynamicShape)
+    x_partitioned_dim_sizes = [
+        self.eval_to_list(splits)  #
+        for splits in x.partitioned_dim_sizes
+    ]
+    y_partitioned_dim_sizes = [
+        self.eval_to_list(splits)  #
+        for splits in y.partitioned_dim_sizes
+    ]
+    self.assertEqual(x_partitioned_dim_sizes, y_partitioned_dim_sizes)
+    self.assertAllEqual(x.inner_dim_sizes, y.inner_dim_sizes)
+
+  @parameterized.parameters([
+      dict(value='x', expected_dim_sizes=[]),
+      dict(value=['a', 'b', 'c'], expected_dim_sizes=[3]),
+      dict(value=[['a', 'b', 'c'], ['d', 'e', 'f']], expected_dim_sizes=[2, 3]),
+      dict(
+          value=[[['a', 'b', 'c'], ['d', 'e', 'f']]],
+          expected_dim_sizes=[1, 2, 3]),
+      dict(
+          value=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected_dim_sizes=[2, [3, 2]]),
+      dict(
+          value=ragged.constant_value([[['a', 'b', 'c'], ['d', 'e']]]),
+          expected_dim_sizes=[1, [2], [3, 2]]),
+      dict(
+          value=ragged.constant_value([[['a', 'b', 'c'], ['d', 'e', 'f']]],
+                                      ragged_rank=1),
+          expected_dim_sizes=[1, [2], 3]),
+      dict(
+          value=ragged.constant_value([[[[1], [2]], [[3], [4]]],
+                                       [[[5], [6]]]], ragged_rank=1),
+          expected_dim_sizes=[2, [2, 1], 2, 1]),
+      dict(
+          value=ragged.constant_value([[10, 20], [30]]),
+          expected_dim_sizes=[2, [2, 1]]),
+      # Docstring examples:
+      dict(value=[[1, 2, 3], [4, 5, 6]], expected_dim_sizes=[2, 3]),
+      dict(
+          value=ragged.constant_value([[1, 2], [], [3, 4, 5]]),
+          expected_dim_sizes=[3, [2, 0, 3]]),
+      dict(
+          value=ragged.constant_value([[[1, 2], [3, 4]], [[5, 6]]],
+                                      ragged_rank=1),
+          expected_dim_sizes=[2, [2, 1], 2]),
+      dict(
+          value=ragged.constant_value([[[1, 2], [3]], [[4, 5]]]),
+          expected_dim_sizes=[2, [2, 1], [2, 1, 2]]),
+  ])
+  def testFromTensor(self, value, expected_dim_sizes):
+    shape = ragged.RaggedTensorDynamicShape.from_tensor(value)
+    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(
+        expected_dim_sizes)
+    self.assertShapeEq(shape, expected)
+
+  @parameterized.parameters([
+      dict(dim_sizes=[], rank=0, expected_dim_sizes=[]),
+      dict(dim_sizes=[], rank=3, expected_dim_sizes=[1, 1, 1]),
+      dict(dim_sizes=[3], rank=1, expected_dim_sizes=[3]),
+      dict(dim_sizes=[3], rank=3, expected_dim_sizes=[1, 1, 3]),
+      dict(dim_sizes=[2, 3], rank=3, expected_dim_sizes=[1, 2, 3]),
+      dict(dim_sizes=[3, [3, 2, 4]], rank=2, expected_dim_sizes=[3, [3, 2, 4]]),
+      dict(
+          dim_sizes=[3, [3, 2, 4]],
+          rank=4,
+          expected_dim_sizes=[1, 1, 3, [3, 2, 4]]),
+      dict(
+          dim_sizes=[3, [3, 2, 4], 2, 3],
+          rank=5,
+          expected_dim_sizes=[1, 3, [3, 2, 4], 2, 3]),
+  ])
+  def testBroadcastToRank(self, dim_sizes, rank, expected_dim_sizes):
+    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
+    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(
+        expected_dim_sizes)
+    broadcasted_shape = shape.broadcast_to_rank(rank)
+    self.assertShapeEq(broadcasted_shape, expected)
+    self.assertEqual(broadcasted_shape.rank, rank)
+
+  @parameterized.parameters([
+      #=========================================================================
+      # dimension[axis] is uniform inner; and row_lengths is a scalar
+      #=========================================================================
+      # shape: [BROADCAST(UNIFORM), UNIFORM, UNIFORM]
+      dict(axis=0,
+           row_length=3,
+           original_dim_sizes=[1, 4, 5],
+           broadcast_dim_sizes=[3, 4, 5]),
+
+      # shape: [UNIFORM, UNIFORM, BROADCAST(UNIFORM)]
+      dict(axis=2,
+           row_length=5,
+           original_dim_sizes=[3, 4, 1],
+           broadcast_dim_sizes=[3, 4, 5]),
+
+      # shape: [UNIFORM, RAGGED, BROADCAST(UNIFORM)]
+      dict(axis=2,
+           row_length=5,
+           original_dim_sizes=[3, [3, 2, 8], 1],
+           broadcast_dim_sizes=[3, [3, 2, 8], 5]),
+
+      # shape: [UNIFORM, RAGGED, RAGGED, UNIFORM, UNIFORM, BROADCAST(UNIFORM)]
+      dict(axis=5,
+           row_length=5,
+           original_dim_sizes=[2, [2, 1], [3, 2, 8], 3, 4, 1],
+           broadcast_dim_sizes=[2, [2, 1], [3, 2, 8], 3, 4, 5]),
+
+      #=========================================================================
+      # dimension[axis] is uniform inner; and row_lengths is a vector
+      #=========================================================================
+      # shape: [UNIFORM, BROADCAST(UNIFORM)]
+      dict(axis=1,
+           row_length=[2, 0, 1],
+           original_dim_sizes=[3, 1],
+           broadcast_dim_sizes=[3, [2, 0, 1]]),
+      # shape: [UNIFORM, BROADCAST(UNIFORM), UNIFORM]
+      dict(axis=1,
+           row_length=[2, 0, 1],
+           original_dim_sizes=[3, 1, 5],
+           broadcast_dim_sizes=[3, [2, 0, 1], 5]),
+
+      # shape: [UNIFORM, UNIFORM, BROADCAST(UNIFORM)]
+      dict(axis=2,
+           row_length=[2, 0, 1, 3, 8, 2, 3, 4, 1, 8, 7, 0],
+           original_dim_sizes=[4, 3, 1],
+           broadcast_dim_sizes=[4, 3, [2, 0, 1, 3, 8, 2, 3, 4, 1, 8, 7, 0]]),
+
+      # shape: [UNIFORM, RAGGED, BROADCAST(UNIFORM)]
+      dict(axis=2,
+           row_length=[2, 5, 3],
+           original_dim_sizes=[2, [2, 1], 1],
+           broadcast_dim_sizes=[2, [2, 1], [2, 5, 3]]),
+
+      # shape: [UNIFORM, RAGGED, UNIFORM, UNIFORM, BROADCAST(UNIFORM), UNIFORM]
+      dict(axis=4,
+           row_length=list(range(18)),
+           original_dim_sizes=[2, [2, 1], 3, 2, 1, 8],
+           broadcast_dim_sizes=[2, [2, 1], 3, 2, list(range(18)), 8]),
+
+      #=========================================================================
+      # dimension[axis] is uniform partitioned; and row_lengths is a scalar
+      #=========================================================================
+      # shape: [BROADCAST(UNIFORM), RAGGED]
+      dict(axis=0,
+           row_length=3,
+           original_dim_sizes=[1, [5]],
+           broadcast_dim_sizes=[3, [5, 5, 5]]),
+
+      # shape: [BROADCAST(UNIFORM), UNIFORM, RAGGED]
+      dict(axis=0,
+           row_length=2,
+           original_dim_sizes=[1, 3, [3, 0, 2]],
+           broadcast_dim_sizes=[2, 3, [3, 0, 2, 3, 0, 2]]),
+
+      # shape: [BROADCAST(UNIFORM), RAGGED, RAGGED, UNIFORM, UNIFORM]
+      dict(axis=0,
+           row_length=3,
+           original_dim_sizes=[1, [3], [3, 5, 2], 9, 4, 5],
+           broadcast_dim_sizes=[3, [3, 3, 3], [3, 5, 2, 3, 5, 2, 3, 5, 2],
+                                9, 4, 5]),
+
+      # shape: [BROADCAST(UNIFORM), UNIFORM, RAGGED, UNIFORM]
+      dict(axis=0,
+           row_length=2,
+           original_dim_sizes=[1, 2, [2, 1], [3, 5, 2], 2],
+           broadcast_dim_sizes=[2, 2, [2, 1, 2, 1], [3, 5, 2, 3, 5, 2], 2]),
+
+      # shape: [UNIFORM, BROADCAST(UNIFORM), RAGGED, UNIFORM]
+      dict(axis=1,
+           row_length=2,
+           original_dim_sizes=[3, 1, [4, 0, 2], 5],
+           broadcast_dim_sizes=[3, 2, [4, 0, 2, 4, 0, 2], 5]),
+
+      # shape: [UNIFORM, BROADCAST(UNIFORM), RAGGED]
+      dict(axis=1,
+           row_length=1,
+           original_dim_sizes=[2, 3, (1, 2, 3, 4, 5, 6)],
+           broadcast_dim_sizes=[2, 3, (1, 2, 3, 4, 5, 6)]),
+
+      #=========================================================================
+      # dimension[axis] is uniform partitioned; and row_lengths is a vector
+      #=========================================================================
+      # shape: [UNIFORM, BROADCAST(UNIFORM), RAGGED, UNIFORM]
+      dict(axis=1,
+           row_length=[4, 1, 2],
+           original_dim_sizes=[
+               3,                          # axis=0
+               1,                          # axis=1 (broadcast)
+               [3, 1, 2],                  # axis=2
+               5],                         # axis=3
+           broadcast_dim_sizes=[
+               3,                          # axis=0
+               [4, 1, 2],                  # axis=1 (broadcast)
+               [3, 3, 3, 3, 1, 2, 2],      # axis=2
+               5]),                        # axis=3
+
+      # shape: [UNIFORM, BROADCAST(UNIFORM), RAGGED, RAGGED]
+      dict(axis=1,
+           row_length=[2, 0, 3],
+           original_dim_sizes=[
+               3,                                         # axis=0
+               1,                                         # axis=1 (broadcast)
+               [3, 1, 2],                                 # axis=2
+               [3, 1, 4, 1, 5, 9]],                       # axis=3
+           broadcast_dim_sizes=[
+               3,                                         # axis=0
+               [2, 0, 3],                                 # axis=1 (broadcast)
+               [3, 3, 2, 2, 2],                           # axis=2
+               [3, 1, 4, 3, 1, 4, 5, 9, 5, 9, 5, 9]]),    # axis=3
+
+      # shape: [UNIFORM, RAGGED, BROADCAST(UNIFORM), RAGGED, RAGGED, UNIFORM]
+      dict(axis=2,
+           row_length=[4, 1, 2],
+           original_dim_sizes=[
+               3,                                         # axis=0
+               [2, 0, 1],                                 # axis=1
+               1,                                         # axis=2 (broadcast)
+               [3, 2, 1],                                 # axis=3
+               [1, 0, 1, 0, 2, 3],                        # axis=4
+               5],                                        # axis=5
+           broadcast_dim_sizes=[
+               3,                                         # axis=0
+               [2, 0, 1],                                 # axis=2
+               [4, 1, 2],                                 # axis=2 (broadcast)
+               [3, 3, 3, 3, 2, 1, 1],                     # axis=3
+               [1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,    # axis=4
+                2, 3, 3],
+               5]),                                       # axis=5
+
+      dict(axis=0,
+           row_length=2,
+           original_dim_sizes=[1, 1, 2, (2, 1)],
+           broadcast_dim_sizes=[2, 1, 2, (2, 1, 2, 1)]),
+      dict(axis=1,
+           row_length=(2, 1),
+           original_dim_sizes=[2, 1, 2, (2, 1, 2, 1)],
+           broadcast_dim_sizes=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
+      dict(axis=2,
+           row_length=2,
+           original_dim_sizes=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)],
+           broadcast_dim_sizes=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
+      dict(axis=3,
+           row_length=(2, 1, 2, 1, 2, 1),
+           original_dim_sizes=[2, (2, 1), 2, 1],
+           broadcast_dim_sizes=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
+  ])  # pyformat: disable
+  def testBroadcastDimension(self, axis, row_length, original_dim_sizes,
+                             broadcast_dim_sizes):
+    """Tests for the broadcast_dimension method.
+
+    Verifies that:
+
+    * `original.broadcast_dimension(axis, row_length) == broadcast`
+    * `broadcast.broadcast_dimension(axis, row_length) == broadcast`
+    * `broadcast.broadcast_dimension(axis, 1) == broadcast`
+
+    Args:
+      axis: The axis to broadcast
+      row_length: The slice lengths to broadcast to.
+      original_dim_sizes: The dimension sizes before broadcasting.
+        original_dim_sizes[axis] should be equal to `1` or `row_length`.
+      broadcast_dim_sizes: THe dimension sizes after broadcasting.
+    """
+    original_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(
+        original_dim_sizes)
+    broadcast_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(
+        broadcast_dim_sizes)
+    self.assertEqual(original_shape.rank, broadcast_shape.rank)
+    # shape[axis].value == 1 and row_length > 1:
+    bcast1 = original_shape.broadcast_dimension(axis, row_length)
+    # shape[axis].value > 1 and row_length == shape[axis].value:
+    bcast2 = broadcast_shape.broadcast_dimension(axis, row_length)
+    # shape[axis].value > 1 and row_length == 1:
+    bcast3 = broadcast_shape.broadcast_dimension(axis, 1)
+
+    self.assertShapeEq(bcast1, broadcast_shape)
+    self.assertShapeEq(bcast2, broadcast_shape)
+    self.assertShapeEq(bcast3, broadcast_shape)
+
+  @parameterized.parameters(
+      [
+          # Broadcast scalar
+          dict(x_dims=[], y_dims=[], expected_dims=[]),
+          dict(x_dims=[], y_dims=[2], expected_dims=[2]),
+          dict(x_dims=[], y_dims=[2, 3], expected_dims=[2, 3]),
+          dict(
+              x_dims=[],
+              y_dims=[2, (2, 3), (5, 7, 2, 0, 9)],
+              expected_dims=[2, (2, 3), (5, 7, 2, 0, 9)]),
+          # Broadcast vector
+          dict(x_dims=[3], y_dims=[4, 2, 3], expected_dims=[4, 2, 3]),
+          dict(x_dims=[1], y_dims=[4, 2, 3], expected_dims=[4, 2, 3]),
+          dict(x_dims=[3], y_dims=[4, 2, 1], expected_dims=[4, 2, 3]),
+          dict(
+              x_dims=[3],
+              y_dims=[3, (2, 3, 1), 1],
+              expected_dims=[3, (2, 3, 1), 3]),
+          dict(x_dims=[1], y_dims=[3, (2, 1, 3)], expected_dims=[3, (2, 1, 3)]),
+          dict(
+              x_dims=[1],
+              y_dims=[3, (2, 1, 3), 8],
+              expected_dims=[3, (2, 1, 3), 8]),
+          dict(
+              x_dims=[1],
+              y_dims=[2, (2, 3), (5, 7, 2, 0, 9)],
+              expected_dims=[2, (2, 3), (5, 7, 2, 0, 9)]),
+          # Mixed broadcasting
+          dict(
+              x_dims=[
+                  1,  # axis=0
+                  3,  # axis=1
+                  (3, 0, 2),  # axis=2
+                  1,  # axis=3
+                  2,  # axis=4
+              ],
+              y_dims=[
+                  2,  # axis=0
+                  1,  # axis=1
+                  1,  # axis=2
+                  (7, 2),  # axis=3
+                  1,  # axis=4
+              ],
+              expected_dims=[
+                  2,  # axis=0
+                  3,  # axis=1
+                  (3, 0, 2, 3, 0, 2),  # axis=2
+                  (7, 7, 7, 7, 7, 2, 2, 2, 2, 2),  # axis=3
+                  2,  # axis=4
+              ]),
+          dict(
+              x_dims=[2, (2, 1), 2, 1],
+              y_dims=[1, 1, 2, (2, 1)],
+              expected_dims=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
+      ])
+  def testBroadcastDynamicShape(self, x_dims, y_dims, expected_dims):
+    x_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(x_dims)
+    y_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(y_dims)
+    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(expected_dims)
+    result1 = ragged.broadcast_dynamic_shape(x_shape, y_shape)
+    result2 = ragged.broadcast_dynamic_shape(y_shape, x_shape)
+    self.assertShapeEq(expected, result1)
+    self.assertShapeEq(expected, result2)
+
+  def testRepr(self):
+    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes([2, (2, 1), 2, 1])
+    self.assertRegexpMatches(
+        repr(shape),
+        r'RaggedTensorDynamicShape\('
+        r'partitioned_dim_sizes=\(<[^>]+>, <[^>]+>\), '
+        r'inner_dim_sizes=<[^>]+>\)')
+
+  @parameterized.parameters([
+      dict(
+          x=[[10], [20], [30]],  # shape=[3, 1]
+          dim_sizes=[3, 2],
+          expected=[[10, 10], [20, 20], [30, 30]]),
+      dict(
+          x=[[10], [20], [30]],  # shape=[3, 1]
+          dim_sizes=[3, [3, 0, 2]],
+          expected=ragged.constant_value([[10, 10, 10], [], [30, 30]],
+                                         dtype=np.int32)),
+      dict(
+          x=[[[1, 2, 3]], [[4, 5, 6]]],  # shape = [2, 1, 3]
+          dim_sizes=[2, [2, 3], 3],
+          expected=ragged.constant_value(
+              [[[1, 2, 3], [1, 2, 3]], [[4, 5, 6], [4, 5, 6], [4, 5, 6]]],
+              dtype=np.int32,
+              ragged_rank=1)),
+      dict(
+          x=[[[1]], [[2]]],  # shape = [2, 1, 1]
+          dim_sizes=[2, [2, 3], [0, 2, 1, 2, 0]],
+          expected=ragged.constant_value([[[], [1, 1]], [[2], [2, 2], []]],
+                                         dtype=np.int32,
+                                         ragged_rank=2)),
+      dict(
+          x=10,
+          dim_sizes=[3, [3, 0, 2]],
+          expected=ragged.constant_value([[10, 10, 10], [], [10, 10]])),
+  ])
+  def testRaggedBroadcastTo(self, x, dim_sizes, expected):
+    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
+    result = ragged.broadcast_to(x, shape)
+    self.assertEqual(
+        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
+    self.assertRaggedEqual(result, expected)
+
+  @parameterized.parameters([
+      dict(
+          doc='x.shape=[3, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
+          x=ragged.constant_value([[1, 2, 3], [], [4, 5]], dtype=np.int32),
+          y=[[10], [20], [30]],
+          expected=ragged.constant_value([[11, 12, 13], [], [34, 35]])),
+      dict(
+          doc='x.shape=[3, (D1)]; y.shape=[]; bcast.shape=[3, (D1)]',
+          x=ragged.constant_value([[1, 2, 3], [], [4, 5]], dtype=np.int32),
+          y=10,
+          expected=ragged.constant_value([[11, 12, 13], [], [14, 15]])),
+      dict(
+          doc='x.shape=[1, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
+          x=ragged.constant_value([[1, 2, 3]], dtype=np.int32),
+          y=[[10], [20], [30]],
+          expected=ragged.constant_value(
+              [[11, 12, 13], [21, 22, 23], [31, 32, 33]], dtype=np.int32)),
+      dict(
+          doc=('x.shape=[2, (D1), 1]; y.shape=[1, (D2)]; '
+               'bcast.shape=[2, (D1), (D2)]'),
+          x=ragged.constant_value([[[1], [2], [3]], [[4]]], ragged_rank=1),
+          y=ragged.constant_value([[10, 20, 30]]),
+          expected=ragged.constant_value([[[11, 21, 31], [12, 22, 32],
+                                           [13, 23, 33]], [[14, 24, 34]]])),
+      dict(
+          doc=('x.shape=[2, (D1), 1]; y.shape=[1, 1, 4]; '
+               'bcast.shape=[2, (D1), 4]'),
+          x=ragged.constant_value([[[10], [20]], [[30]]], ragged_rank=1),
+          y=[[[1, 2, 3, 4]]],
+          expected=ragged.constant_value(
+              [[[11, 12, 13, 14], [21, 22, 23, 24]], [[31, 32, 33, 34]]],
+              ragged_rank=1)),
+      dict(
+          doc=('x.shape=[2, (D1), 2, 1]; y.shape=[2, (D2)]; '
+               'bcast.shape=[2, (D1), (2), (D2)'),
+          x=ragged.constant_value([[[[1], [2]], [[3], [4]]],
+                                   [[[5], [6]]]],
+                                  ragged_rank=1),
+          y=ragged.constant_value([[10, 20], [30]]),
+          expected=ragged.constant_value(
+              [[[[11, 21], [32]], [[13, 23], [34]]],
+               [[[15, 25], [36]]]])),
+  ])
+  def testRaggedAddWithBroadcasting(self, x, y, expected, doc):
+    expected_rrank = getattr(expected, 'ragged_rank', 0)
+    x = ragged.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
+    y = ragged.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
+    result = x + y
+    result_rrank = getattr(result, 'ragged_rank', 0)
+    self.assertEqual(expected_rrank, result_rrank)
+    if hasattr(expected, 'tolist'):
+      expected = expected.tolist()
+    self.assertRaggedEqual(result, expected)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8f1d97137d22376a39d9fa0e098f8c364383b65
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py
@@ -0,0 +1,1207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for third_party.tensorflow.python.ops.ragged_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import RaggedTensor
+from tensorflow.python.platform import googletest
+
+
+class _SliceBuilder(object):
+  """Helper to construct arguments for __getitem__.
+
+  Usage: _SliceBuilder()[<expr>] slice_spec Python generates for <expr>.
+  """
+
+  def __getitem__(self, slice_spec):
+    return slice_spec
+
+
+SLICE_BUILDER = _SliceBuilder()
+
+
+def _make_tensor_slice_spec(slice_spec, use_constant=True):
+  """Wraps all integers in an extended slice spec w/ a tensor.
+
+  This function is used to help test slicing when the slice spec contains
+  tensors, rather than integers.
+
+  Args:
+    slice_spec: The extended slice spec.
+    use_constant: If true, then wrap each integer with a tf.constant.  If false,
+      then wrap each integer with a tf.placeholder.
+
+  Returns:
+    A copy of slice_spec, but with each integer i replaced with tf.constant(i).
+  """
+
+  def make_piece_scalar(piece):
+    if isinstance(piece, int):
+      scalar = constant_op.constant(piece)
+      if use_constant:
+        return scalar
+      else:
+        return array_ops.placeholder_with_default(scalar, [])
+    elif isinstance(piece, slice):
+      return slice(
+          make_piece_scalar(piece.start), make_piece_scalar(piece.stop),
+          make_piece_scalar(piece.step))
+    else:
+      return piece
+
+  if isinstance(slice_spec, tuple):
+    return tuple(make_piece_scalar(piece) for piece in slice_spec)
+  else:
+    return make_piece_scalar(slice_spec)
+
+
+# Example 2D ragged tensor value with one ragged dimension and with scalar
+# values, expressed as nested python lists and as splits+values.
+EXAMPLE_RAGGED_TENSOR_2D = [[b'a', b'b'], [b'c', b'd', b'e'], [b'f'], [],
+                            [b'g']]
+EXAMPLE_RAGGED_TENSOR_2D_SPLITS = [0, 2, 5, 6, 6, 7]
+EXAMPLE_RAGGED_TENSOR_2D_VALUES = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
+
+# Example 4D ragged tensor value, with two ragged dimensions and with values
+# whose shape is [2], expressed as nested python lists and as splits+values.
+EXAMPLE_RAGGED_TENSOR_4D = [
+    [                                       # rt[0]
+        [[1, 2], [3, 4], [5, 6]],           # rt[0][0]
+        [[7, 8], [9, 10], [11, 12]]],       # rt[0][1]
+    [],                                     # rt[1]
+    [                                       # rt[2]
+        [[13, 14], [15, 16], [17, 18]]],    # rt[2][0]
+    [                                       # rt[3]
+        [[19, 20]]]                         # rt[3][0]
+]  # pyformat: disable
+EXAMPLE_RAGGED_TENSOR_4D_SPLITS1 = [0, 2, 2, 3, 4]
+EXAMPLE_RAGGED_TENSOR_4D_SPLITS2 = [0, 3, 6, 9, 10]
+EXAMPLE_RAGGED_TENSOR_4D_VALUES = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
+                                   [11, 12], [13, 14], [15, 16], [17, 18],
+                                   [19, 20]]
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+  longMessage = True  # Property in unittest.Testcase. pylint: disable=invalid-name
+
+  #=============================================================================
+  # RaggedTensor class docstring examples
+  #=============================================================================
+
+  def testClassDocStringExamples(self):
+    # From section: "Component Tensors"
+    rt = RaggedTensor.from_row_splits(
+        values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
+    self.assertRaggedEqual(rt, [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    del rt
+
+    # From section: "Alternative Row-Partitioning Schemes"
+    values = [3, 1, 4, 1, 5, 9, 2, 6]
+    rt1 = RaggedTensor.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
+    rt2 = RaggedTensor.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
+    rt3 = RaggedTensor.from_value_rowids(
+        values, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5)
+    rt4 = RaggedTensor.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
+    rt5 = RaggedTensor.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
+    for rt in (rt1, rt2, rt3, rt4, rt5):
+      self.assertRaggedEqual(rt, [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    del rt1, rt2, rt3, rt4, rt5
+
+    # From section: "Multiple Ragged Dimensions"
+    inner_rt = RaggedTensor.from_row_splits(
+        values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
+    outer_rt = RaggedTensor.from_row_splits(
+        values=inner_rt, row_splits=[0, 3, 3, 5])
+    self.assertEqual(outer_rt.ragged_rank, 2)
+    self.assertEqual(
+        self.eval_to_list(outer_rt),
+        [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
+    del inner_rt, outer_rt
+
+    # From section: "Multiple Ragged Dimensions"
+    rt = RaggedTensor.from_nested_row_splits(
+        flat_values=[3, 1, 4, 1, 5, 9, 2, 6],
+        nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8]))
+    self.assertEqual(
+        self.eval_to_list(rt), [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
+    del rt
+
+    # From section: "Uniform Inner Dimensions"
+    rt = RaggedTensor.from_row_splits(
+        values=array_ops.ones([5, 3]), row_splits=[0, 2, 5])
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[[1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]])
+    self.assertEqual(rt.shape.as_list(), [2, None, 3])
+    del rt
+
+  #=============================================================================
+  # RaggedTensorValue Constructor
+  #=============================================================================
+
+  def testRaggedTensorValueConstruction(self):
+    values = np.array(b'a b c d e f g'.split())
+    splits = np.array([0, 2, 5, 6, 6, 7], dtype=np.int64)
+    splits2 = np.array([0, 3, 5], dtype=np.int64)
+
+    # Test construction of a RaggedTensorValue with ragged_rank=1.
+    rt_value = ragged.RaggedTensorValue(values, splits)
+    self.assertEqual(rt_value.row_splits.dtype, np.int64)
+    self.assertEqual(rt_value.shape, (5, None))
+    self.assertEqual(len(rt_value.nested_row_splits), 1)
+    self.assertAllEqual(splits, rt_value.row_splits)
+    self.assertAllEqual(values, rt_value.values)
+    self.assertAllEqual(splits, rt_value.nested_row_splits[0])
+    self.assertAllEqual(values, rt_value.flat_values)
+
+    # Test construction of a RaggedTensorValue with ragged_rank=2.
+    rt_value = ragged.RaggedTensorValue(
+        values=ragged.RaggedTensorValue(values, splits), row_splits=splits2)
+    self.assertEqual(rt_value.row_splits.dtype, np.int64)
+    self.assertEqual(rt_value.shape, (2, None, None))
+    self.assertEqual(len(rt_value.nested_row_splits), 2)
+    self.assertAllEqual(splits2, rt_value.row_splits)
+    self.assertAllEqual(splits, rt_value.values.row_splits)
+    self.assertAllEqual(splits2, rt_value.nested_row_splits[0])
+    self.assertAllEqual(splits, rt_value.nested_row_splits[1])
+    self.assertAllEqual(values, rt_value.values.values)
+    self.assertAllEqual(values, rt_value.flat_values)
+
+  #=============================================================================
+  # RaggedTensor Constructor (private)
+  #=============================================================================
+
+  def testRaggedTensorConstruction(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    rt = RaggedTensor(values=values, row_splits=row_splits, internal=True)
+
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testRaggedTensorConstructionErrors(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'RaggedTensor constructor is private'):
+      RaggedTensor(values=values, row_splits=row_splits)
+
+    with self.assertRaisesRegexp(TypeError,
+                                 'values must be a Tensor or RaggedTensor'):
+      RaggedTensor(values=range(7), row_splits=row_splits, internal=True)
+
+    with self.assertRaisesRegexp(TypeError,
+                                 'Row-partitioning argument must be a Tensor'):
+      RaggedTensor(values=values, row_splits=[0, 2, 2, 5, 6, 7], internal=True)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Shape \(6, 1\) must have rank 1'):
+      RaggedTensor(
+          values=values,
+          row_splits=array_ops.expand_dims(row_splits, 1),
+          internal=True)
+
+    with self.assertRaisesRegexp(TypeError,
+                                 'Cached value must be a Tensor or None.'):
+      RaggedTensor(
+          values=values,
+          row_splits=row_splits,
+          cached_row_lengths=[2, 3, 4],
+          internal=True)
+
+
+#=============================================================================
+# RaggedTensor Factory Ops
+#=============================================================================
+
+  def testFromValueRowIdsWithDerivedNRows(self):
+    # nrows is known at graph creation time.
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+
+    rt = RaggedTensor.from_value_rowids(values, value_rowids)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
+    self.assertAllEqual(rt_value_rowids, value_rowids)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromValueRowIdsWithDerivedNRowsDynamic(self):
+    # nrows is not known at graph creation time.
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    value_rowids = array_ops.placeholder_with_default(value_rowids, shape=None)
+
+    rt = RaggedTensor.from_value_rowids(values, value_rowids)
+    self.assertEqual(rt.dtype, dtypes.string)
+    if context.executing_eagerly():
+      self.assertEqual(rt.shape.as_list(), [5, None])
+    else:
+      self.assertEqual(rt.shape.as_list(), [None, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
+    self.assertAllEqual(rt_value_rowids, value_rowids)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromValueRowIdsWithExplicitNRows(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    nrows = constant_op.constant(7, dtypes.int64)
+
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [7, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
+    self.assertIs(rt_nrows, nrows)  # cached_nrows
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g'], [], []])
+
+  def testFromValueRowIdsWithExplicitNRowsEqualToDefault(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    nrows = constant_op.constant(5, dtypes.int64)
+
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
+    self.assertIs(rt_nrows, nrows)  # cached_nrows
+    self.assertAllEqual(rt_value_rowids, value_rowids)
+    self.assertAllEqual(rt_nrows, nrows)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromValueRowIdsWithEmptyValues(self):
+    rt = RaggedTensor.from_value_rowids([], [])
+    rt_nrows = rt.nrows()
+    self.assertEqual(rt.dtype, dtypes.float32)
+    self.assertEqual(rt.shape.as_list(), [0, None])
+    self.assertEqual(rt.ragged_rank, 1)
+    self.assertEqual(rt.values.shape.as_list(), [0])
+    self.assertEqual(rt.value_rowids().shape.as_list(), [0])
+    self.assertEqual(self.eval_to_list(rt_nrows), 0)
+    self.assertEqual(self.eval_to_list(rt), [])
+
+  def testFromRowSplits(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+
+    rt = RaggedTensor.from_row_splits(values, row_splits)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_row_splits = rt.row_splits
+    rt_nrows = rt.nrows()
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_row_splits, row_splits)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromRowSplitsWithEmptySplits(self):
+    err_msg = 'row_splits tensor may not be empty'
+    with self.assertRaisesRegexp(ValueError, err_msg):
+      RaggedTensor.from_row_splits([], [])
+
+  def testFromRowStarts(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_starts = constant_op.constant([0, 2, 2, 5, 6], dtypes.int64)
+
+    rt = RaggedTensor.from_row_starts(values, row_starts)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_row_starts = rt.row_starts()
+    rt_nrows = rt.nrows()
+
+    self.assertIs(rt_values, values)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertAllEqual(rt_row_starts, row_starts)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromRowLimits(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_limits = constant_op.constant([2, 2, 5, 6, 7], dtypes.int64)
+
+    rt = RaggedTensor.from_row_limits(values, row_limits)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_row_limits = rt.row_limits()
+    rt_nrows = rt.nrows()
+
+    self.assertIs(rt_values, values)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertAllEqual(rt_row_limits, row_limits)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromRowLengths(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_lengths = constant_op.constant([2, 0, 3, 1, 1], dtypes.int64)
+
+    rt = RaggedTensor.from_row_lengths(values, row_lengths)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [5, None])
+    self.assertEqual(rt.ragged_rank, 1)
+
+    rt_values = rt.values
+    rt_row_lengths = rt.row_lengths()
+    rt_nrows = rt.nrows()
+
+    self.assertIs(rt_values, values)
+    self.assertIs(rt_row_lengths, row_lengths)  # cached_nrows
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertAllEqual(rt_row_lengths, row_lengths)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+
+  def testFromNestedValueRowIdsWithDerivedNRows(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_value_rowids = [
+        constant_op.constant([0, 0, 1, 3, 3], dtypes.int64),
+        constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    ]
+
+    rt = RaggedTensor.from_nested_value_rowids(values, nested_value_rowids)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [4, None, None])
+    self.assertEqual(rt.ragged_rank, 2)
+
+    rt_values = rt.values
+    rt_value_rowids = rt.value_rowids()
+    rt_values_values = rt_values.values
+    rt_values_value_rowids = rt_values.value_rowids()
+
+    self.assertIs(rt_values_values, values)
+    self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
+    self.assertAllEqual(rt_values_value_rowids, nested_value_rowids[1])
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
+
+  def testFromNestedValueRowIdsWithExplicitNRows(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_value_rowids = [
+        constant_op.constant([0, 0, 1, 3, 3, 3], dtypes.int64),
+        constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    ]
+    nrows = [
+        constant_op.constant(6, dtypes.int64),
+        constant_op.constant(6, dtypes.int64)
+    ]
+
+    rt = RaggedTensor.from_nested_value_rowids(values, nested_value_rowids,
+                                               nrows)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [6, None, None])
+    self.assertEqual(rt.ragged_rank, 2)
+
+    rt_values = rt.values
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
+    rt_values_values = rt_values.values
+    rt_values_value_rowids = rt_values.value_rowids()
+    rt_values_nrows = rt_values.nrows()
+
+    self.assertIs(rt_values_values, values)
+    self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
+    self.assertAllEqual(rt_values_value_rowids, nested_value_rowids[1])
+    self.assertAllEqual(rt_nrows, nrows[0])
+    self.assertAllEqual(rt_values_nrows, nrows[1])
+    self.assertEqual(
+        self.eval_to_list(rt), [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [],
+                                [[b'f'], [b'g'], []], [], []])
+
+  def testFromNestedValueRowIdsWithExplicitNRowsMismatch(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_value_rowids = [
+        constant_op.constant([0, 0, 1, 3, 3, 3], dtypes.int64),
+        constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    ]
+    nrows = [constant_op.constant(6, dtypes.int64)]
+    with self.assertRaisesRegexp(
+        ValueError, 'nested_nrows must have the same '
+        'length as nested_value_rowids'):
+      RaggedTensor.from_nested_value_rowids(values, nested_value_rowids, nrows)
+
+  def testFromNestedValueRowIdsWithNonListInput(self):
+    with self.assertRaisesRegexp(
+        TypeError, 'nested_value_rowids must be a list of Tensors'):
+      RaggedTensor.from_nested_value_rowids(
+          [1, 2, 3], constant_op.constant([[0, 1, 2], [0, 1, 2]], dtypes.int64))
+    with self.assertRaisesRegexp(TypeError,
+                                 'nested_nrows must be a list of Tensors'):
+      RaggedTensor.from_nested_value_rowids([1, 2, 3], [[0, 1, 2], [0, 1, 2]],
+                                            constant_op.constant([3, 3]))
+
+  def testFromNestedRowSplits(self):
+    flat_values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_row_splits = [
+        constant_op.constant([0, 2, 3, 3, 5], dtypes.int64),
+        constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    ]
+
+    rt = RaggedTensor.from_nested_row_splits(flat_values, nested_row_splits)
+    self.assertEqual(rt.dtype, dtypes.string)
+    self.assertEqual(rt.shape.as_list(), [4, None, None])
+    self.assertEqual(rt.ragged_rank, 2)
+
+    rt_values = rt.values
+    rt_row_splits = rt.row_splits
+    rt_values_values = rt_values.values
+    rt_values_row_splits = rt_values.row_splits
+
+    self.assertIs(rt_values_values, flat_values)
+    self.assertIs(rt_row_splits, nested_row_splits[0])
+    self.assertIs(rt_values_row_splits, nested_row_splits[1])
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
+
+  def testFromNestedRowSplitsWithNonListInput(self):
+    with self.assertRaisesRegexp(TypeError,
+                                 'nested_row_splits must be a list of Tensors'):
+      RaggedTensor.from_nested_row_splits(
+          [1, 2], constant_op.constant([[0, 1, 2], [0, 1, 2]], dtypes.int64))
+
+  def testFromValueRowIdsWithBadNRows(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    nrows = constant_op.constant(5, dtypes.int64)
+
+    with self.assertRaisesRegexp(ValueError, r'Expected nrows >= 0; got -2'):
+      RaggedTensor.from_value_rowids(
+          values=values,
+          value_rowids=array_ops.placeholder_with_default(value_rowids, None),
+          nrows=-2)
+
+    with self.assertRaisesRegexp(
+        ValueError, r'Expected nrows >= value_rowids\[-1\] \+ 1; got nrows=2, '
+        r'value_rowids\[-1\]=4'):
+      RaggedTensor.from_value_rowids(
+          values=values, value_rowids=value_rowids, nrows=2)
+
+    with self.assertRaisesRegexp(
+        ValueError, r'Expected nrows >= value_rowids\[-1\] \+ 1; got nrows=4, '
+        r'value_rowids\[-1\]=4'):
+      RaggedTensor.from_value_rowids(
+          values=values, value_rowids=value_rowids, nrows=4)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Shape \(7, 1\) must have rank 1'):
+      RaggedTensor.from_value_rowids(
+          values=values,
+          value_rowids=array_ops.expand_dims(value_rowids, 1),
+          nrows=nrows)
+
+    with self.assertRaisesRegexp(ValueError, r'Shape \(1,\) must have rank 0'):
+      RaggedTensor.from_value_rowids(
+          values=values,
+          value_rowids=value_rowids,
+          nrows=array_ops.expand_dims(nrows, 0))
+
+  def testGraphMismatch(self):
+    if not context.executing_eagerly():
+      with ops.Graph().as_default():
+        values = constant_op.constant([1, 2, 3], dtypes.int64)
+      with ops.Graph().as_default():
+        splits = constant_op.constant([0, 2, 3], dtypes.int64)
+      self.assertRaisesRegexp(ValueError,
+                              '.* must be from the same graph as .*',
+                              RaggedTensor.from_row_splits, values, splits)
+
+  #=============================================================================
+  # Ragged Value & Row-Partitioning Tensor Accessors
+  #=============================================================================
+
+  def testRaggedTensorAccessors_2d(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    rt1 = RaggedTensor.from_row_splits(values, row_splits)
+    rt2 = RaggedTensor.from_value_rowids(values, value_rowids)
+
+    for rt in [rt1, rt2]:
+      self.assertRaggedEqual(
+          rt, [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+      self.assertAllEqual(rt.values, [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+      self.assertEqual(rt.values.shape.dims[0].value, 7)
+      self.assertAllEqual(rt.value_rowids(), [0, 0, 2, 2, 2, 3, 4])
+      self.assertAllEqual(rt.nrows(), 5)
+      self.assertAllEqual(rt.row_splits, [0, 2, 2, 5, 6, 7])
+      self.assertAllEqual(rt.row_starts(), [0, 2, 2, 5, 6])
+      self.assertAllEqual(rt.row_limits(), [2, 2, 5, 6, 7])
+      self.assertAllEqual(rt.row_lengths(), [2, 0, 3, 1, 1])
+      self.assertAllEqual(rt.flat_values,
+                          [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+      self.assertLen(rt.nested_row_splits, 1)
+      self.assertAllEqual(rt.nested_row_splits[0], [0, 2, 2, 5, 6, 7])
+
+  def testRaggedTensorAccessors_3d_with_ragged_rank_1(self):
+    values = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]]
+    row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    rt1 = RaggedTensor.from_row_splits(values, row_splits)
+    rt2 = RaggedTensor.from_value_rowids(values, value_rowids)
+
+    for rt in [rt1, rt2]:
+      self.assertEqual(
+          self.eval_to_list(rt),
+          [[[0, 1], [2, 3]], [], [[4, 5], [6, 7], [8, 9]], [[10, 11]],
+           [[12, 13]]])
+      self.assertEqual(
+          self.eval_to_list(rt.values),
+          [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
+      self.assertEqual(rt.values.shape.dims[0].value, 7)
+      self.assertEqual(
+          self.eval_to_list(rt.value_rowids()), [0, 0, 2, 2, 2, 3, 4])
+      self.assertEqual(self.eval_to_list(rt.nrows()), 5)
+      self.assertEqual(self.eval_to_list(rt.row_splits), [0, 2, 2, 5, 6, 7])
+      self.assertEqual(self.eval_to_list(rt.row_starts()), [0, 2, 2, 5, 6])
+      self.assertEqual(self.eval_to_list(rt.row_limits()), [2, 2, 5, 6, 7])
+      self.assertEqual(self.eval_to_list(rt.row_lengths()), [2, 0, 3, 1, 1])
+      self.assertEqual(
+          self.eval_to_list(rt.flat_values),
+          [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
+      self.assertEqual([self.eval_to_list(s) for s in rt.nested_row_splits],
+                       [[0, 2, 2, 5, 6, 7]])
+
+  def testRaggedTensorAccessors_3d_with_ragged_rank_2(self):
+    values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    nested_row_splits = [
+        constant_op.constant([0, 2, 3, 3, 5], dtypes.int64),
+        constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
+    ]
+    nested_value_rowids = [
+        constant_op.constant([0, 0, 1, 3, 3], dtypes.int64),
+        constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
+    ]
+    rt1 = RaggedTensor.from_nested_row_splits(values, nested_row_splits)
+    rt2 = RaggedTensor.from_nested_value_rowids(values, nested_value_rowids)
+
+    for rt in [rt1, rt2]:
+      self.assertEqual(
+          self.eval_to_list(rt),
+          [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
+      self.assertEqual(
+          self.eval_to_list(rt.values),
+          [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+      self.assertEqual(rt.values.shape.dims[0].value, 5)
+      self.assertEqual(self.eval_to_list(rt.value_rowids()), [0, 0, 1, 3, 3])
+      self.assertEqual(self.eval_to_list(rt.nrows()), 4)
+      self.assertEqual(self.eval_to_list(rt.row_splits), [0, 2, 3, 3, 5])
+      self.assertEqual(self.eval_to_list(rt.row_starts()), [0, 2, 3, 3])
+      self.assertEqual(self.eval_to_list(rt.row_limits()), [2, 3, 3, 5])
+      self.assertEqual(self.eval_to_list(rt.row_lengths()), [2, 1, 0, 2])
+      self.assertEqual(
+          self.eval_to_list(rt.flat_values),
+          [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+      self.assertEqual([self.eval_to_list(s) for s in rt.nested_row_splits],
+                       [[0, 2, 3, 3, 5], [0, 2, 2, 5, 6, 7]])
+
+  #=============================================================================
+  # RaggedTensor.shape
+  #=============================================================================
+
+  def testShape(self):
+    """Tests for RaggedTensor.shape."""
+    rt1 = RaggedTensor.from_row_splits(b'a b c d e f g'.split(),
+                                       [0, 2, 5, 6, 6, 7])
+    self.assertEqual(rt1.shape.as_list(), [5, None])
+
+    rt2 = RaggedTensor.from_row_splits(
+        [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]],
+        [0, 2, 5, 6, 6, 7])
+    self.assertEqual(rt2.shape.as_list(), [5, None, 2])
+
+    rt3 = RaggedTensor.from_row_splits(
+        [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], [0, 2, 2, 3])
+    self.assertEqual(rt3.shape.as_list(), [3, None, 2, 2])
+
+    rt4 = RaggedTensor.from_row_splits(rt3, [0, 1, 3, 3])
+    self.assertEqual(rt4.shape.as_list(), [3, None, None, 2, 2])
+
+    if not context.executing_eagerly():
+      rt5 = RaggedTensor.from_row_splits(
+          array_ops.placeholder(dtype=dtypes.string), [0, 2, 3, 5])
+      self.assertEqual(rt5.shape.ndims, None)
+
+      rt6 = RaggedTensor.from_row_splits(
+          [1, 2, 3], array_ops.placeholder(dtype=dtypes.int64))
+      self.assertEqual(rt6.shape.as_list(), [None, None])
+
+  #=============================================================================
+  # RaggedTensor.__getitem__
+  #=============================================================================
+
+  def _TestGetItem(self, rt, slice_spec, expected):
+    """Helper function for testing RaggedTensor.__getitem__.
+
+    Checks that calling `rt.__getitem__(slice_spec) returns the expected value.
+    Checks three different configurations for each slice spec:
+
+      * Call __getitem__ with the slice spec as-is (with int values)
+      * Call __getitem__ with int values in the slice spec wrapped in
+        `tf.constant()`.
+      * Call __getitem__ with int values in the slice spec wrapped in
+        `tf.placeholder()` (so value is not known at graph construction time).
+
+    Args:
+      rt: The RaggedTensor to test.
+      slice_spec: The slice spec.
+      expected: The expected value of rt.__getitem__(slice_spec), as a python
+        list; or an exception class.
+    """
+    tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True)
+    tensor_slice_spec2 = _make_tensor_slice_spec(slice_spec, False)
+    value1 = self.eval_to_list(rt.__getitem__(slice_spec))
+    value2 = self.eval_to_list(rt.__getitem__(tensor_slice_spec1))
+    value3 = self.eval_to_list(rt.__getitem__(tensor_slice_spec2))
+    self.assertEqual(value1, expected, 'slice_spec=%s' % (slice_spec,))
+    self.assertEqual(value2, expected, 'slice_spec=%s' % (slice_spec,))
+    self.assertEqual(value3, expected, 'slice_spec=%s' % (slice_spec,))
+
+  def _TestGetItemException(self, rt, slice_spec, expected, message):
+    """Helper function for testing RaggedTensor.__getitem__ exceptions."""
+    tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True)
+    self.assertRaisesRegexp(expected, message, rt.__getitem__, slice_spec)
+    self.assertRaisesRegexp(expected, message, rt.__getitem__,
+                            tensor_slice_spec1)
+
+  @parameterized.parameters(
+      # Tests for rt[i]
+      (SLICE_BUILDER[-5], EXAMPLE_RAGGED_TENSOR_2D[-5]),
+      (SLICE_BUILDER[-4], EXAMPLE_RAGGED_TENSOR_2D[-4]),
+      (SLICE_BUILDER[-1], EXAMPLE_RAGGED_TENSOR_2D[-1]),
+      (SLICE_BUILDER[0], EXAMPLE_RAGGED_TENSOR_2D[0]),
+      (SLICE_BUILDER[1], EXAMPLE_RAGGED_TENSOR_2D[1]),
+      (SLICE_BUILDER[4], EXAMPLE_RAGGED_TENSOR_2D[4]),
+
+      # Tests for rt[i:]
+      (SLICE_BUILDER[-6:], EXAMPLE_RAGGED_TENSOR_2D[-6:]),
+      (SLICE_BUILDER[-3:], EXAMPLE_RAGGED_TENSOR_2D[-3:]),
+      (SLICE_BUILDER[-1:], EXAMPLE_RAGGED_TENSOR_2D[-1:]),
+      (SLICE_BUILDER[0:], EXAMPLE_RAGGED_TENSOR_2D[0:]),
+      (SLICE_BUILDER[3:], EXAMPLE_RAGGED_TENSOR_2D[3:]),
+      (SLICE_BUILDER[5:], EXAMPLE_RAGGED_TENSOR_2D[5:]),
+
+      # Tests for rt[:j]
+      (SLICE_BUILDER[:-6], EXAMPLE_RAGGED_TENSOR_2D[:-6]),
+      (SLICE_BUILDER[:-3], EXAMPLE_RAGGED_TENSOR_2D[:-3]),
+      (SLICE_BUILDER[:-1], EXAMPLE_RAGGED_TENSOR_2D[:-1]),
+      (SLICE_BUILDER[:0], EXAMPLE_RAGGED_TENSOR_2D[:0]),
+      (SLICE_BUILDER[:3], EXAMPLE_RAGGED_TENSOR_2D[:3]),
+      (SLICE_BUILDER[:5], EXAMPLE_RAGGED_TENSOR_2D[:5]),
+
+      # Tests for rt[i:j]
+      (SLICE_BUILDER[0:3], EXAMPLE_RAGGED_TENSOR_2D[0:3]),
+      (SLICE_BUILDER[3:5], EXAMPLE_RAGGED_TENSOR_2D[3:5]),
+      (SLICE_BUILDER[-5:3], EXAMPLE_RAGGED_TENSOR_2D[-5:3]),
+      (SLICE_BUILDER[3:1], EXAMPLE_RAGGED_TENSOR_2D[3:1]),
+      (SLICE_BUILDER[-1:1], EXAMPLE_RAGGED_TENSOR_2D[-1:1]),
+      (SLICE_BUILDER[1:-1], EXAMPLE_RAGGED_TENSOR_2D[1:-1]),
+
+      # Tests for rt[i, j]
+      (SLICE_BUILDER[0, 1], EXAMPLE_RAGGED_TENSOR_2D[0][1]),
+      (SLICE_BUILDER[1, 2], EXAMPLE_RAGGED_TENSOR_2D[1][2]),
+      (SLICE_BUILDER[-1, 0], EXAMPLE_RAGGED_TENSOR_2D[-1][0]),
+      (SLICE_BUILDER[-3, 0], EXAMPLE_RAGGED_TENSOR_2D[-3][0]),
+      (SLICE_BUILDER[:], EXAMPLE_RAGGED_TENSOR_2D),
+      (SLICE_BUILDER[:, :], EXAMPLE_RAGGED_TENSOR_2D),
+
+      # Empty slice spec.
+      ([], EXAMPLE_RAGGED_TENSOR_2D),
+
+      # Test for ellipsis
+      (SLICE_BUILDER[...], EXAMPLE_RAGGED_TENSOR_2D),
+      (SLICE_BUILDER[2, ...], EXAMPLE_RAGGED_TENSOR_2D[2]),
+      (SLICE_BUILDER[..., :], EXAMPLE_RAGGED_TENSOR_2D),
+      (SLICE_BUILDER[..., 2, 0], EXAMPLE_RAGGED_TENSOR_2D[2][0]),
+      (SLICE_BUILDER[2, ..., 0], EXAMPLE_RAGGED_TENSOR_2D[2][0]),
+      (SLICE_BUILDER[2, 0, ...], EXAMPLE_RAGGED_TENSOR_2D[2][0]),
+
+      # Test for array_ops.newaxis
+      (SLICE_BUILDER[array_ops.newaxis, :], [EXAMPLE_RAGGED_TENSOR_2D]),
+      (SLICE_BUILDER[:, array_ops.newaxis],
+       [[row] for row in EXAMPLE_RAGGED_TENSOR_2D]),
+
+      # Slicing inner ragged dimensions.
+      (SLICE_BUILDER[-1:, 1:4],
+       [row[1:4] for row in EXAMPLE_RAGGED_TENSOR_2D[-1:]]),
+      (SLICE_BUILDER[:, 1:4], [row[1:4] for row in EXAMPLE_RAGGED_TENSOR_2D]),
+      (SLICE_BUILDER[:, -2:], [row[-2:] for row in EXAMPLE_RAGGED_TENSOR_2D]),
+      # TODO(edloper): Add tests for strided slices, once support is added.
+  )
+  def testRaggedTensorGetItemWithRaggedRank1(self, slice_spec, expected):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    # Ragged tensor
+    rt = RaggedTensor.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
+                                      EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
+
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_2D)
+    self._TestGetItem(rt, slice_spec, expected)
+
+  # pylint: disable=invalid-slice-index
+  @parameterized.parameters(
+      # Tests for out-of-bound errors
+      (SLICE_BUILDER[5],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[-6],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 2],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[3, 0],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+
+      # Indexing into an inner ragged dimension
+      (SLICE_BUILDER[:, 3], ValueError,
+       'Cannot index into an inner ragged dimension'),
+      (SLICE_BUILDER[:1, 3], ValueError,
+       'Cannot index into an inner ragged dimension'),
+      (SLICE_BUILDER[..., 3], ValueError,
+       'Cannot index into an inner ragged dimension'),
+
+      # Tests for type errors
+      (SLICE_BUILDER[0.5], TypeError, re.escape(array_ops._SLICE_TYPE_ERROR)),
+      (SLICE_BUILDER[1:3:0.5], TypeError, re.escape(
+          array_ops._SLICE_TYPE_ERROR)),
+      (SLICE_BUILDER[:, 1:3:0.5], TypeError,
+       'slice strides must be integers or None'),
+      (SLICE_BUILDER[:, 0.5:1.5], TypeError,
+       'slice offsets must be integers or None'),
+      (SLICE_BUILDER['foo'], TypeError, re.escape(array_ops._SLICE_TYPE_ERROR)),
+      (SLICE_BUILDER[:, 'foo':'foo'], TypeError,
+       'slice offsets must be integers or None'),
+
+      # Tests for other errors
+      (SLICE_BUILDER[..., 0, 0, 0], IndexError,
+       'Too many indices for RaggedTensor'),
+  )
+  def testRaggedTensorGetItemErrorsWithRaggedRank1(self, slice_spec, expected,
+                                                   message):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    # Ragged tensor
+    rt = RaggedTensor.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
+                                      EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
+
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_2D)
+    self._TestGetItemException(rt, slice_spec, expected, message)
+
+  @parameterized.parameters(
+      # Tests for rt[index, index, ...]
+      (SLICE_BUILDER[2, 0], EXAMPLE_RAGGED_TENSOR_4D[2][0]),
+      (SLICE_BUILDER[2, 0, 1], EXAMPLE_RAGGED_TENSOR_4D[2][0][1]),
+      (SLICE_BUILDER[2, 0, 1, 1], EXAMPLE_RAGGED_TENSOR_4D[2][0][1][1]),
+      (SLICE_BUILDER[2, 0, 1:], EXAMPLE_RAGGED_TENSOR_4D[2][0][1:]),
+      (SLICE_BUILDER[2, 0, 1:, 1:], [[16], [18]]),
+      (SLICE_BUILDER[2, 0, :, 1], [14, 16, 18]),
+      (SLICE_BUILDER[2, 0, 1, :], EXAMPLE_RAGGED_TENSOR_4D[2][0][1]),
+
+      # Tests for rt[index, slice, ...]
+      (SLICE_BUILDER[0, :], EXAMPLE_RAGGED_TENSOR_4D[0]),
+      (SLICE_BUILDER[1, :], EXAMPLE_RAGGED_TENSOR_4D[1]),
+      (SLICE_BUILDER[0, :, :, 1], [[2, 4, 6], [8, 10, 12]]),
+      (SLICE_BUILDER[1, :, :, 1], []),
+      (SLICE_BUILDER[2, :, :, 1], [[14, 16, 18]]),
+      (SLICE_BUILDER[3, :, :, 1], [[20]]),
+
+      # Tests for rt[slice, slice, ...]
+      (SLICE_BUILDER[:, :], EXAMPLE_RAGGED_TENSOR_4D),
+      (SLICE_BUILDER[:, :, :, 1], [[[2, 4, 6], [8, 10, 12]], [], [[14, 16, 18]],
+                                   [[20]]]),
+      (SLICE_BUILDER[1:, :, :, 1], [[], [[14, 16, 18]], [[20]]]),
+      (SLICE_BUILDER[-3:, :, :, 1], [[], [[14, 16, 18]], [[20]]]),
+
+      # Test for ellipsis
+      (SLICE_BUILDER[...], EXAMPLE_RAGGED_TENSOR_4D),
+      (SLICE_BUILDER[2, ...], EXAMPLE_RAGGED_TENSOR_4D[2]),
+      (SLICE_BUILDER[2, 0, ...], EXAMPLE_RAGGED_TENSOR_4D[2][0]),
+      (SLICE_BUILDER[..., 0], [[[1, 3, 5], [7, 9, 11]], [], [[13, 15, 17]],
+                               [[19]]]),
+      (SLICE_BUILDER[2, ..., 0], [[13, 15, 17]]),
+      (SLICE_BUILDER[2, 0, ..., 0], [13, 15, 17]),
+
+      # Test for array_ops.newaxis
+      (SLICE_BUILDER[array_ops.newaxis, :], [EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, array_ops.newaxis],
+       [[row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+
+      # Empty slice spec.
+      ([], EXAMPLE_RAGGED_TENSOR_4D),
+
+      # Slicing inner ragged dimensions.
+      (SLICE_BUILDER[:, 1:4], [row[1:4] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, -2:], [row[-2:] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, :, :-1],
+       [[v[:-1] for v in row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, :, 1:2],
+       [[v[1:2] for v in row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[1:, 1:3, 1:2],
+       [[v[1:2] for v in row[1:3]] for row in EXAMPLE_RAGGED_TENSOR_4D[1:]]),
+
+      # Strided slices
+      (SLICE_BUILDER[::2], EXAMPLE_RAGGED_TENSOR_4D[::2]),
+      (SLICE_BUILDER[1::2], EXAMPLE_RAGGED_TENSOR_4D[1::2]),
+      (SLICE_BUILDER[:, ::2], [row[::2] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, 1::2], [row[1::2] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, :, ::2],
+       [[v[::2] for v in row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+      (SLICE_BUILDER[:, :, 1::2],
+       [[v[1::2] for v in row] for row in EXAMPLE_RAGGED_TENSOR_4D]),
+
+      # TODO(edloper): Add tests for strided slices, once support is added.
+      # TODO(edloper): Add tests slicing inner ragged dimensions, one support
+      # is added.
+  )
+  def testRaggedTensorGetItemWithRaggedRank2(self, slice_spec, expected):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    rt = RaggedTensor.from_nested_row_splits(
+        EXAMPLE_RAGGED_TENSOR_4D_VALUES,
+        [EXAMPLE_RAGGED_TENSOR_4D_SPLITS1, EXAMPLE_RAGGED_TENSOR_4D_SPLITS2])
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_4D)
+    self._TestGetItem(rt, slice_spec, expected)
+
+  @parameterized.parameters(
+      # Test for errors in unsupported cases
+      (SLICE_BUILDER[:, 0], ValueError,
+       'Cannot index into an inner ragged dimension.'),
+      (SLICE_BUILDER[:, :, 0], ValueError,
+       'Cannot index into an inner ragged dimension.'),
+
+      # Test for out-of-bounds errors.
+      (SLICE_BUILDER[1, 0],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 0, 3],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[5],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 5],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+  )
+  def testRaggedTensorGetItemErrorsWithRaggedRank2(self, slice_spec, expected,
+                                                   message):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    rt = RaggedTensor.from_nested_row_splits(
+        EXAMPLE_RAGGED_TENSOR_4D_VALUES,
+        [EXAMPLE_RAGGED_TENSOR_4D_SPLITS1, EXAMPLE_RAGGED_TENSOR_4D_SPLITS2])
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_4D)
+    self._TestGetItemException(rt, slice_spec, expected, message)
+
+  @parameterized.parameters(
+      (SLICE_BUILDER[:], []),
+      (SLICE_BUILDER[2:], []),
+      (SLICE_BUILDER[:-3], []),
+  )
+  def testRaggedTensorGetItemWithEmptyTensor(self, slice_spec, expected):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    rt = RaggedTensor.from_row_splits([], [0])
+    self._TestGetItem(rt, slice_spec, expected)
+
+  @parameterized.parameters(
+      (SLICE_BUILDER[0],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[-1],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+  )
+  def testRaggedTensorGetItemErrorsWithEmptyTensor(self, slice_spec, expected,
+                                                   message):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    rt = RaggedTensor.from_row_splits([], [0])
+    self._TestGetItemException(rt, slice_spec, expected, message)
+
+  @parameterized.parameters(
+      (SLICE_BUILDER[-4], EXAMPLE_RAGGED_TENSOR_2D[-4]),
+      (SLICE_BUILDER[0], EXAMPLE_RAGGED_TENSOR_2D[0]),
+      (SLICE_BUILDER[-3:], EXAMPLE_RAGGED_TENSOR_2D[-3:]),
+      (SLICE_BUILDER[:3], EXAMPLE_RAGGED_TENSOR_2D[:3]),
+      (SLICE_BUILDER[3:5], EXAMPLE_RAGGED_TENSOR_2D[3:5]),
+      (SLICE_BUILDER[0, 1], EXAMPLE_RAGGED_TENSOR_2D[0][1]),
+      (SLICE_BUILDER[-3, 0], EXAMPLE_RAGGED_TENSOR_2D[-3][0]),
+  )
+  def testRaggedTensorGetItemWithPlaceholderShapes(self, slice_spec, expected):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    # Intentionally use an unknown shape for `splits`, to force the code path
+    # that deals with having nrows unknown at graph construction time.
+    splits = constant_op.constant(
+        EXAMPLE_RAGGED_TENSOR_2D_SPLITS, dtype=dtypes.int64)
+    splits = array_ops.placeholder_with_default(splits, None)
+    rt = RaggedTensor.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES, splits)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_2D)
+    self._TestGetItem(rt, slice_spec, expected)
+
+  @parameterized.parameters(
+      (SLICE_BUILDER[..., 2], ValueError,
+       'Ellipsis not supported for unknown shape RaggedTensors'),)
+  def testRaggedTensorGetItemErrorsWithPlaceholderShapes(
+      self, slice_spec, expected, message):
+    """Test that rt.__getitem__(slice_spec) == expected."""
+    if not context.executing_eagerly():
+      # Intentionally use an unknown shape for `values`.
+      values = array_ops.placeholder_with_default([0], None)
+      rt = RaggedTensor.from_row_splits(values, [0, 1])
+      self._TestGetItemException(rt, slice_spec, expected, message)
+
+  def testGetItemNewAxis(self):
+    # rt: [[[['a', 'b'], ['c', 'd']], [], [['e', 'f']]], []]
+    splits1 = [0, 3, 3]
+    splits2 = [0, 2, 2, 3]
+    values = constant_op.constant([['a', 'b'], ['c', 'd'], ['e', 'f']])
+    rt = RaggedTensor.from_nested_row_splits(values, [splits1, splits2])
+    rt_newaxis0 = rt[array_ops.newaxis]
+    rt_newaxis1 = rt[:, array_ops.newaxis]
+    rt_newaxis2 = rt[:, :, array_ops.newaxis]
+    rt_newaxis3 = rt[:, :, :, array_ops.newaxis]
+    rt_newaxis4 = rt[:, :, :, :, array_ops.newaxis]
+
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis0),
+        [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []]])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis1),
+        [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]]], [[]]])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis2),
+        [[[[[b'a', b'b'], [b'c', b'd']]], [[]], [[[b'e', b'f']]]], []])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis3),
+        [[[[[b'a', b'b']], [[b'c', b'd']]], [], [[[b'e', b'f']]]], []])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis4),
+        [[[[[b'a'], [b'b']], [[b'c'], [b'd']]], [], [[[b'e'], [b'f']]]], []])
+
+    self.assertEqual(rt.ragged_rank, 2)
+    self.assertEqual(rt_newaxis0.ragged_rank, 3)
+    self.assertEqual(rt_newaxis1.ragged_rank, 3)
+    self.assertEqual(rt_newaxis2.ragged_rank, 3)
+    self.assertEqual(rt_newaxis3.ragged_rank, 2)
+    self.assertEqual(rt_newaxis4.ragged_rank, 2)
+
+    self.assertEqual(rt_newaxis0.shape.as_list(), [1, None, None, None, 2])
+    self.assertEqual(rt_newaxis1.shape.as_list(), [2, None, None, None, 2])
+    self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, None, None, 2])
+    self.assertEqual(rt_newaxis3.shape.as_list(), [2, None, None, 1, 2])
+    self.assertEqual(rt_newaxis4.shape.as_list(), [2, None, None, 2, 1])
+
+  #=============================================================================
+  # RaggedTensor.__str__
+  #=============================================================================
+  def testRaggedTensorStr(self):
+    values = [b'a', b'b', b'c', b'd', b'e', b'f', b'g']
+    row_splits = [0, 2, 5, 6, 6, 7]
+    rt = RaggedTensor.from_row_splits(values, row_splits)
+    if context.executing_eagerly():
+      expected_str = '<tf.RaggedTensor {}>'.format([[b'a', b'b'],
+                                                    [b'c', b'd', b'e'], [b'f'],
+                                                    [], [b'g']])
+      expected_repr = (
+          'tf.RaggedTensor(values=tf.Tensor([{}], shape=(7,), dtype=string), '
+          'row_splits=tf.Tensor([{}], shape=(6,), dtype=int64))'.format(
+              ' '.join(repr(x) for x in values), ' '.join(
+                  repr(x) for x in row_splits)))
+      self.assertEqual(str(rt), expected_str)
+      self.assertEqual(repr(rt), expected_repr)
+    else:
+      expected_repr = (
+          'tf.RaggedTensor(values=Tensor("RaggedFromRowSplits/values:0", '
+          'shape=(7,), dtype=string), row_splits='
+          'Tensor("RaggedFromRowSplits/row_splits:0", '
+          'shape=(6,), dtype=int64))')
+      self.assertEqual(repr(rt), expected_repr)
+      self.assertEqual(str(rt), expected_repr)
+
+  def testRaggedTensorValueStr(self):
+    values = [b'a', b'b', b'c', b'd', b'e', b'f', b'g']
+    row_splits = [0, 2, 5, 6, 6, 7]
+    rt = ragged.RaggedTensorValue(
+        np.array(values), np.array(row_splits, dtype=np.int64))
+    expected_str = '<tf.RaggedTensorValue {}>'.format([[b'a', b'b'],
+                                                       [b'c', b'd', b'e'],
+                                                       [b'f'], [], [b'g']])
+    expected_repr = ("tf.RaggedTensorValue(values=array({}, dtype='|S1'), "
+                     'row_splits=array({}))'.format(values, row_splits))
+    self.assertEqual(' '.join(str(rt).split()), expected_str)
+    self.assertEqual(' '.join(repr(rt).split()), expected_repr)
+
+  #=============================================================================
+  # RaggedTensor.with_values() and RaggedTensor.with_flat_values().
+  #=============================================================================
+
+  def testWithValues(self):
+    rt1 = ragged.constant([[1, 2], [3, 4, 5], [6], [], [7]])
+    rt2 = ragged.constant([[[1, 2], [3, 4, 5]], [[6]], [], [[], [7]]])
+
+    rt1_plus_10 = rt1.with_values(rt1.values + 10)
+    rt2_times_10 = rt2.with_flat_values(rt2.flat_values * 10)
+    rt1_expanded = rt1.with_values(array_ops.expand_dims(rt1.values, axis=1))
+
+    self.assertEqual(
+        self.eval_to_list(rt1_plus_10),
+        [[11, 12], [13, 14, 15], [16], [], [17]])
+    self.assertEqual(
+        self.eval_to_list(rt2_times_10),
+        [[[10, 20], [30, 40, 50]], [[60]], [], [[], [70]]])
+    self.assertEqual(
+        self.eval_to_list(rt1_expanded),
+        [[[1], [2]], [[3], [4], [5]], [[6]], [], [[7]]])
+
+  #=============================================================================
+  # Session.run
+  #=============================================================================
+  def testSessionRun(self):
+    if context.executing_eagerly():
+      return
+
+    rt1 = ragged.constant([[1, 2, 3], [4]])
+    rt2 = ragged.constant([[[], [1, 2]], [[3]]])
+    with self.test_session() as session:
+      result = session.run({'rt1': rt1, 'rt2': rt2})
+      self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
+      self.assertEqual(result['rt1'].to_list(), [[1, 2, 3], [4]])
+      self.assertEqual(result['rt2'].to_list(), [[[], [1, 2]], [[3]]])
+
+  def testSessionRunFeed(self):
+    if context.executing_eagerly():
+      return
+
+    rt1 = RaggedTensor.from_row_splits(
+        array_ops.placeholder(dtypes.int32),
+        array_ops.placeholder(dtypes.int64))
+    rt2 = RaggedTensor.from_nested_row_splits(
+        array_ops.placeholder(dtypes.int32), [
+            array_ops.placeholder(dtypes.int64),
+            array_ops.placeholder(dtypes.int64)
+        ])
+
+    rt1_feed_val = ragged.constant_value([[1, 2, 3], [4]])
+    rt2_feed_val = ragged.constant_value([[[], [1, 2]], [[3]]])
+
+    with self.test_session() as session:
+      result = session.run({
+          'rt1': rt1,
+          'rt2': rt2
+      },
+                           feed_dict={
+                               rt1: rt1_feed_val,
+                               rt2: rt2_feed_val
+                           })
+      self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
+      self.assertEqual(result['rt1'].to_list(), [[1, 2, 3], [4]])
+      self.assertEqual(result['rt2'].to_list(), [[[], [1, 2]], [[3]]])
+
+  def testSessionPartialRunFeed(self):
+    if context.executing_eagerly():
+      return
+
+    # Placeholder inputs.
+    a = RaggedTensor.from_row_splits(
+        array_ops.placeholder(dtypes.int32, shape=[None], name='a.values'),
+        array_ops.placeholder(dtypes.int64, name='a.row_splits'))
+    b = RaggedTensor.from_row_splits(
+        array_ops.placeholder(dtypes.int32, shape=[None], name='b.values'),
+        array_ops.placeholder(dtypes.int64, name='b.row_splits'))
+    c = array_ops.placeholder(dtypes.int32, shape=[], name='c')
+
+    # Feed values for placeholder inputs.
+    a_val = ragged.constant_value([[1, 2, 3], [4]])
+    b_val = ragged.constant_value([[5, 4, 3], [2]])
+    c_val = 3
+
+    # Compute some values.
+    r1 = ragged.reduce_sum(a * b, axis=1)
+    r2 = ragged.reduce_sum(a + c, axis=1)
+
+    with self.test_session() as session:
+      handle = session.partial_run_setup([r1, r2], [a, b, c])
+
+      res1 = session.partial_run(handle, r1, feed_dict={a: a_val, b: b_val})
+      self.assertAllEqual(res1, [22, 8])
+
+      res2 = session.partial_run(handle, r2, feed_dict={c: c_val})
+      self.assertAllEqual(res2, [15, 7])
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_value.py b/tensorflow/python/ops/ragged/ragged_tensor_value.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf0ac4482a1c12eb620d33471a6474d10af11875
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_value.py
@@ -0,0 +1,100 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Value for RaggedTensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+class RaggedTensorValue(object):
+  """Represents the value of a `RaggedTensor`.
+
+  See `RaggedTensor` for a description of ragged tensors.
+  """
+
+  def __init__(self, values, row_splits):
+    """Creates a `RaggedTensorValue`.
+
+    Args:
+      values: A numpy array of any type and shape; or a RaggedTensorValue.
+      row_splits: A 1-D int64 numpy array.
+    """
+    if not (isinstance(row_splits, (np.ndarray, np.generic)) and
+            row_splits.dtype == np.int64 and row_splits.ndim == 1):
+      raise TypeError("row_splits must be a 1D int64 numpy array")
+    if not isinstance(values, (np.ndarray, np.generic, RaggedTensorValue)):
+      raise TypeError("values must be a numpy array or a RaggedTensorValue")
+    self._values = values
+    self._row_splits = row_splits
+
+  row_splits = property(
+      lambda self: self._row_splits,
+      doc="""The split indices for the ragged tensor value.""")
+  values = property(
+      lambda self: self._values,
+      doc="""The concatenated values for all rows in this tensor.""")
+  dtype = property(
+      lambda self: self._values.dtype,
+      doc="""The numpy dtype of values in this tensor.""")
+
+  @property
+  def flat_values(self):
+    """The innermost `values` array for this ragged tensor value."""
+    rt_values = self.values
+    while isinstance(rt_values, RaggedTensorValue):
+      rt_values = rt_values.values
+    return rt_values
+
+  @property
+  def nested_row_splits(self):
+    """The row_splits for all ragged dimensions in this ragged tensor value."""
+    rt_nested_splits = [self.row_splits]
+    rt_values = self.values
+    while isinstance(rt_values, RaggedTensorValue):
+      rt_nested_splits.append(rt_values.row_splits)
+      rt_values = rt_values.values
+    return tuple(rt_nested_splits)
+
+  @property
+  def ragged_rank(self):
+    """The number of ragged dimensions in this ragged tensor value."""
+    values_is_ragged = isinstance(self._values, RaggedTensorValue)
+    return self._values.ragged_rank + 1 if values_is_ragged else 1
+
+  @property
+  def shape(self):
+    """A tuple indicating the shape of this RaggedTensorValue."""
+    return (self._row_splits.shape[0] - 1,) + (None,) + self._values.shape[1:]
+
+  def __str__(self):
+    return "<tf.RaggedTensorValue %s>" % self.to_list()
+
+  def __repr__(self):
+    return "tf.RaggedTensorValue(values=%r, row_splits=%r)" % (self._values,
+                                                               self._row_splits)
+
+  def to_list(self):
+    """Returns this ragged tensor value as a nested Python list."""
+    if isinstance(self._values, RaggedTensorValue):
+      values_as_list = self._values.to_list()
+    else:
+      values_as_list = self._values.tolist()
+    return [
+        values_as_list[self._row_splits[i]:self._row_splits[i + 1]]
+        for i in range(len(self._row_splits) - 1)
+    ]
diff --git a/tensorflow/python/ops/ragged/ragged_test_util.py b/tensorflow/python/ops/ragged/ragged_test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..027417664d23683e0eb3906892b81c29c8847f6a
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_test_util.py
@@ -0,0 +1,95 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# pylint: disable=invalid-name
+"""Test utils for tensorflow RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+
+
+class RaggedTensorTestCase(test_util.TensorFlowTestCase):
+  """Base class for RaggedTensor test cases."""
+
+  def _GetPyList(self, a):
+    """Converts a to a nested python list."""
+    if isinstance(a, ragged.RaggedTensor):
+      return self.evaluate(a).to_list()
+    elif isinstance(a, ops.Tensor):
+      a = self.evaluate(a)
+      return a.tolist() if isinstance(a, np.ndarray) else a
+    elif isinstance(a, np.ndarray):
+      return a.tolist()
+    elif isinstance(a, ragged.RaggedTensorValue):
+      return a.to_list()
+    else:
+      return np.array(a).tolist()
+
+  def assertRaggedEqual(self, a, b):
+    """Asserts that two potentially ragged tensors are equal."""
+    a_list = self._GetPyList(a)
+    b_list = self._GetPyList(b)
+    self.assertEqual(a_list, b_list)
+
+    if not (isinstance(a, (list, tuple)) or isinstance(b, (list, tuple))):
+      a_ragged_rank = a.ragged_rank if ragged.is_ragged(a) else 0
+      b_ragged_rank = b.ragged_rank if ragged.is_ragged(b) else 0
+      self.assertEqual(a_ragged_rank, b_ragged_rank)
+
+  def assertRaggedAlmostEqual(self, a, b, places=7):
+    a_list = self._GetPyList(a)
+    b_list = self._GetPyList(b)
+    self.assertNestedListAlmostEqual(a_list, b_list, places, context='value')
+
+    if not (isinstance(a, (list, tuple)) or isinstance(b, (list, tuple))):
+      a_ragged_rank = a.ragged_rank if ragged.is_ragged(a) else 0
+      b_ragged_rank = b.ragged_rank if ragged.is_ragged(b) else 0
+      self.assertEqual(a_ragged_rank, b_ragged_rank)
+
+  def assertNestedListAlmostEqual(self, a, b, places=7, context='value'):
+    self.assertEqual(type(a), type(b))
+    if isinstance(a, (list, tuple)):
+      self.assertLen(a, len(b), 'Length differs for %s' % context)
+      for i in range(len(a)):
+        self.assertNestedListAlmostEqual(a[i], b[i], places,
+                                         '%s[%s]' % (context, i))
+    else:
+      self.assertAlmostEqual(
+          a, b, places,
+          '%s != %s within %s places at %s' % (a, b, places, context))
+
+  def eval_to_list(self, tensor):
+    value = self.evaluate(tensor)
+    if ragged.is_ragged(value):
+      return value.to_list()
+    elif isinstance(value, np.ndarray):
+      return value.tolist()
+    else:
+      return value
+
+  def _eval_tensor(self, tensor):
+    if ragged.is_ragged(tensor):
+      return ragged.RaggedTensorValue(
+          self._eval_tensor(tensor.values),
+          self._eval_tensor(tensor.row_splits))
+    else:
+      return test_util.TensorFlowTestCase._eval_tensor(self, tensor)
diff --git a/tensorflow/python/ops/ragged/ragged_tile_op_test.py b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3445571bff6c75e7a22e458bdf99d3886cd9614
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
@@ -0,0 +1,224 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.tile."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTileOpTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring Example
+      #=========================================================================
+      dict(
+          descr='docstring example: ragged_rank=1, repeat axes 0 and 1',
+          rt_input=[[1, 2], [3]],
+          multiples=[3, 2],
+          expected=[
+              [1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3]],
+      ),
+
+      #=========================================================================
+      # rank=3, ragged_rank=2
+      #=========================================================================
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axis 0',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[2, 1, 1],
+          expected=[[[1, 2], [3]], [], [[4]],
+                    [[1, 2], [3]], [], [[4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axis 1',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[1, 2, 1],
+          expected=[[[1, 2], [3], [1, 2], [3]], [], [[4], [4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axis 2',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[1, 1, 2],
+          expected=[[[1, 2, 1, 2], [3, 3]], [], [[4, 4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axes 0 and 1',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[2, 2, 1],
+          expected=[[[1, 2], [3], [1, 2], [3]], [], [[4], [4]],
+                    [[1, 2], [3], [1, 2], [3]], [], [[4], [4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axes 0 and 2',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[2, 1, 2],
+          expected=[[[1, 2, 1, 2], [3, 3]], [], [[4, 4]],
+                    [[1, 2, 1, 2], [3, 3]], [], [[4, 4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat axes 1 and 2',
+          rt_input=[[[1, 2], [3]], [], [[4]]],
+          multiples=[1, 2, 2],
+          expected=[[[1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3]],
+                    [], [[4, 4], [4, 4]]]),
+      dict(
+          descr='rank=3, ragged_rank=2, repeat all axes',
+          rt_input=[[['a', 'b'], ['c']], [], [['d']]],
+          multiples=[4, 3, 2],
+          expected=[[[b'a', b'b']*2, [b'c']*2]*3, []*3, [[b'd']*2]*3]*4),
+      #=========================================================================
+      # rank=3, ragged_rank=1
+      #=========================================================================
+      dict(
+          descr='rank=3, ragged_rank=1, repeat axis 0',
+          ragged_rank=1,
+          rt_input=[[[1, 2], [3, 4]], [], [[5, 6]]],
+          multiples=[2, 1, 1],
+          expected=[[[1, 2], [3, 4]], [], [[5, 6]],
+                    [[1, 2], [3, 4]], [], [[5, 6]]]),
+      dict(
+          descr='rank=3, ragged_rank=1, repeat axis 1',
+          ragged_rank=1,
+          rt_input=[[[1, 2], [3, 4]], [], [[5, 6]]],
+          multiples=[1, 2, 1],
+          expected=[[[1, 2], [3, 4], [1, 2], [3, 4]], [], [[5, 6], [5, 6]]]),
+      dict(
+          descr='rank=3, ragged_rank=1, repeat axis 2',
+          ragged_rank=1,
+          rt_input=[[[1, 2], [3, 4]], [], [[5, 6]]],
+          multiples=[1, 1, 2],
+          expected=[[[1, 2, 1, 2], [3, 4, 3, 4]], [], [[5, 6, 5, 6]]]),
+      #=========================================================================
+      # rank=4, ragged_rank=3
+      #=========================================================================
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 0',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[2, 1, 1, 1],
+          expected=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]],
+                    [[[1], [2]], [[3]]], [[]], [[[4, 5]]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 1',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 2, 1, 1],
+          expected=[[[[1], [2]], [[3]], [[1], [2]], [[3]]],
+                    [[], []],
+                    [[[4, 5]], [[4, 5]]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 2',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 1, 2, 1],
+          expected=[[[[1], [2], [1], [2]], [[3], [3]]],
+                    [[]],
+                    [[[4, 5], [4, 5]]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 3',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 1, 1, 2],
+          expected=[[[[1, 1], [2, 2]], [[3, 3]]], [[]], [[[4, 5, 4, 5]]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat all axes',
+          rt_input=[[[['a'], ['b']], [['c']]], [[]], [[['d', 'e']]]],
+          multiples=[5, 4, 3, 2],
+          expected=[[[[b'a']*2, [b'b']*2]*3, [[b'c']*2]*3]*4,
+                    [[]*3]*4,
+                    [[[b'd', b'e']*2]*3]*4]*5),
+      dict(
+          descr='rank=5, ragged_rank=4, repeat all axes',
+          rt_input=[[[[['a']]]]],
+          multiples=[6, 5, 4, 3, 2],
+          expected=[[[[[b'a']*2]*3]*4]*5]*6),
+      #=========================================================================
+      # multiple=0
+      #=========================================================================
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 0 (multiple=0)',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[0, 1, 1, 1],
+          expected=[]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 1 (multiple=0)',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 0, 1, 1],
+          expected=[[], [], []]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 2 (multiple=0)',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 1, 0, 1],
+          expected=[[[], []], [[]], [[]]]),
+      dict(
+          descr='rank=4, ragged_rank=3, repeat axis 3 (multiple=0)',
+          rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
+          multiples=[1, 1, 1, 0],
+          expected=[[[[], []], [[]]], [[]], [[[]]]]),
+      #=========================================================================
+      # multiple=1
+      #=========================================================================
+      dict(
+          descr='rank=4, multiples=1 (no repeats)',
+          rt_input=[[[[1], [2]], [[3], [4]]], [[[5], [6]]]],
+          multiples=[1, 1, 1, 1],
+          expected=[[[[1], [2]], [[3], [4]]],
+                    [[[5], [6]]]]),
+
+  ])  # pyformat: disable
+  def testRaggedTile(self,
+                     descr,
+                     rt_input,
+                     multiples,
+                     expected,
+                     ragged_rank=None):
+    rt = ragged.constant(rt_input, ragged_rank)
+
+    expected_shape = [
+        None if dim is None else dim * multiple
+        for (dim, multiple) in zip(rt.shape.as_list(), multiples)
+    ]
+
+    # Test with both const & non-const multiples: ragged_tile has a few code
+    # paths that optimize the case where multiples[d] is known to be 1.
+    const_multiples = constant_op.constant(multiples, dtypes.int64)
+    non_const_multiples = array_ops.placeholder_with_default(
+        const_multiples, shape=[len(multiples)])
+
+    for multiples_tensor in (const_multiples, non_const_multiples):
+      tiled = ragged.tile(rt, multiples_tensor)
+      self.assertEqual(tiled.ragged_rank, rt.ragged_rank)
+      self.assertEqual(tiled.shape.ndims, rt.shape.ndims)
+      if multiples_tensor is const_multiples:
+        self.assertEqual(tiled.shape.as_list(), expected_shape)
+      self.assertRaggedEqual(tiled, expected)
+
+  def testRaggedTileWithTensorInput(self):
+    # When the input is a `Tensor`, ragged_tile just delegates to tf.tile.
+    dt = constant_op.constant([[1, 2], [3, 4]])
+    tiled = ragged.tile(dt, [3, 2])
+    expected = [[1, 2, 1, 2], [3, 4, 3, 4],
+                [1, 2, 1, 2], [3, 4, 3, 4],
+                [1, 2, 1, 2], [3, 4, 3, 4]]  # pyformat: disable
+    self.assertRaggedEqual(tiled, expected)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..46d7a56a7c8e0fa7a008625314e30786ffbbfefe
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
@@ -0,0 +1,194 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.to_sparse op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
+
+  def testDocStringExample(self):
+    rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(st.indices,
+                        [[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]])
+    self.assertAllEqual(st.values, [1, 2, 3, 4, 5, 6])
+    self.assertAllEqual(st.dense_shape, [4, 3])
+
+  def test2DRaggedTensorWithOneRaggedDimension(self):
+    rt = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(
+        st.indices, [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [2, 0], [4, 0]])
+    self.assertAllEqual(st.values, b'a b c d e f g'.split())
+    self.assertAllEqual(st.dense_shape, [5, 3])
+
+  def test3DRaggedTensorWithOneRaggedDimension(self):
+    rt = ragged.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]],
+                          [[11, 12]], [], [[13, 14]]],
+                         ragged_rank=1)
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(st.indices,
+                        [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0],
+                         [1, 0, 1], [1, 1, 0], [1, 1, 1], [1, 2, 0], [1, 2, 1],
+                         [2, 0, 0], [2, 0, 1], [4, 0, 0], [4, 0, 1]])
+    self.assertAllEqual(st.values,
+                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
+    self.assertAllEqual(st.dense_shape, [5, 3, 2])
+
+  def test4DRaggedTensorWithOneRaggedDimension(self):
+    rt = ragged.constant(
+        [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]], [], [[[9, 10], [11, 12]]]],
+        ragged_rank=1)
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(st.values, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
+    self.assertAllEqual(
+        st.indices,
+        [
+            [0, 0, 0, 0],  # index for value=1
+            [0, 0, 0, 1],  # index for value=2
+            [0, 0, 1, 0],  # index for value=3
+            [0, 0, 1, 1],  # index for value=4
+            [0, 1, 0, 0],  # index for value=5
+            [0, 1, 0, 1],  # index for value=6
+            [0, 1, 1, 0],  # index for value=7
+            [0, 1, 1, 1],  # index for value=8
+            [2, 0, 0, 0],  # index for value=9
+            [2, 0, 0, 1],  # index for value=10
+            [2, 0, 1, 0],  # index for value=11
+            [2, 0, 1, 1],  # index for value=12
+        ])
+    self.assertAllEqual(st.dense_shape, [3, 2, 2, 2])
+
+  def test4DRaggedTensorWithTwoRaggedDimensions(self):
+    rt = ragged.constant([[[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]]],
+                          [[[11, 12]], [], [[13, 14]]], []],
+                         ragged_rank=2)
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(
+        st.indices,
+        [
+            [0, 0, 0, 0],  # index for value=1
+            [0, 0, 0, 1],  # index for value=2
+            [0, 0, 1, 0],  # index for value=3
+            [0, 0, 1, 1],  # index for value=4
+            [0, 1, 0, 0],  # index for value=5
+            [0, 1, 0, 1],  # index for value=6
+            [0, 1, 1, 0],  # index for value=7
+            [0, 1, 1, 1],  # index for value=8
+            [0, 1, 2, 0],  # index for value=9
+            [0, 1, 2, 1],  # index for value=10
+            [1, 0, 0, 0],  # index for value=11
+            [1, 0, 0, 1],  # index for value=12
+            [1, 2, 0, 0],  # index for value=13
+            [1, 2, 0, 1],  # index for value=14
+        ])
+    self.assertAllEqual(st.values,
+                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
+    self.assertAllEqual(st.dense_shape, [3, 3, 3, 2])
+
+  def testShape(self):
+    rt = ragged.constant([[1, 2], [3, 4, 5], [6], [], [7]])
+    st = rt.to_sparse()
+    self.assertEqual(st.indices.shape.as_list(), [7, 2])
+    self.assertEqual(st.values.shape.as_list(), [7])
+    self.assertEqual(st.dense_shape.shape.as_list(), [2])
+
+    rt = ragged.constant([[[1, 2]], [], [[3, 4]], []], ragged_rank=1)
+    st = rt.to_sparse()
+    self.assertEqual(st.indices.shape.as_list(), [4, 3])
+    self.assertEqual(st.values.shape.as_list(), [4])
+    self.assertEqual(st.dense_shape.shape.as_list(), [3])
+
+    rt = ragged.constant([[[1], [2, 3, 4, 5, 6, 7]], [[]]])
+    st = rt.to_sparse()
+    self.assertEqual(st.indices.shape.as_list(), [7, 3])
+    self.assertEqual(st.values.shape.as_list(), [7])
+    self.assertEqual(st.dense_shape.shape.as_list(), [3])
+
+  def testKernelErrors(self):
+    # An empty vector, defined using a placeholder to ensure that we can't
+    # determine that it's invalid at graph-construction time.
+    empty_vector = array_ops.placeholder_with_default(
+        array_ops.zeros([0], dtypes.int64), shape=None)
+
+    bad_rt1 = ragged.RaggedTensor.from_row_splits(
+        row_splits=[2, 3], values=[1, 2, 3])
+    bad_split0 = r'First value of ragged splits must be 0.*'
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, bad_split0):
+      self.evaluate(bad_rt1.to_sparse())
+
+    bad_rt2 = ragged.RaggedTensor.from_row_splits(
+        row_splits=[0, 5], values=empty_vector)
+    bad_rt3 = ragged.RaggedTensor.from_row_splits(
+        row_splits=[0, 1],
+        values=ragged.RaggedTensor.from_row_splits(
+            row_splits=[0, 5], values=empty_vector))
+    split_mismatch1_error = r'Final value of ragged splits must match.*'
+    for rt in [bad_rt2, bad_rt3]:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   split_mismatch1_error):
+        self.evaluate(rt.to_sparse())
+
+    bad_rt4 = ragged.RaggedTensor.from_row_splits(
+        row_splits=[0, 5],
+        values=ragged.RaggedTensor.from_row_splits(
+            row_splits=[0], values=empty_vector))
+    split_mismatch2_error = r'Final value of ragged splits must match.*'
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 split_mismatch2_error):
+      self.evaluate(bad_rt4.to_sparse())
+
+    bad_rt5 = ragged.RaggedTensor.from_row_splits(
+        row_splits=empty_vector, values=[])
+    empty_splits_error = (r'ragged splits may not be empty.*')
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 empty_splits_error):
+      self.evaluate(bad_rt5.to_sparse())
+
+  def testGradient(self):
+    if context.executing_eagerly():
+      return
+    # rt1.shape == rt2.shape == [2, (D2), (D3), 2].
+    rt1 = ragged.constant([[[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0]]]],
+                          ragged_rank=2)
+    rt2 = ragged.constant([[[[9.0, 8.0], [7.0, 6.0]], [[5.0, 4.0]]]],
+                          ragged_rank=2)
+    rt = ragged.map_flat_values(math_ops.add, rt1, rt2 * 2.0)
+    st = rt.to_sparse()
+
+    g1, g2 = gradients_impl.gradients(st.values,
+                                      [rt1.flat_values, rt2.flat_values])
+    print(g1, g2)
+    self.assertRaggedEqual(g1, [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]])
+    self.assertRaggedEqual(g2, [[2.0, 2.0], [2.0, 2.0], [2.0, 2.0]])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffcc2be52e5538c6d99ee8bcb0ed5d368ac5ed42
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
@@ -0,0 +1,138 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.to_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToTensorOpTest(ragged_test_util.RaggedTensorTestCase,
+                                 parameterized.TestCase):
+
+  def testDocStringExamples(self):
+    """Example from ragged_to_tensor.__doc__."""
+    rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
+    dt = rt.to_tensor()
+    self.assertAllEqual(dt, [[9, 8, 7], [0, 0, 0], [6, 5, 0], [4, 0, 0]])
+
+  @parameterized.parameters(
+      {
+          'rt_input': [],
+          'ragged_rank': 1,
+          'expected': [],
+          'expected_shape': [0, 0],
+      },
+      {
+          'rt_input': [[1, 2, 3], [], [4], [5, 6]],
+          'expected': [[1, 2, 3], [0, 0, 0], [4, 0, 0], [5, 6, 0]]
+      },
+      {
+          'rt_input': [[1, 2, 3], [], [4], [5, 6]],
+          'default': 9,
+          'expected': [[1, 2, 3], [9, 9, 9], [4, 9, 9], [5, 6, 9]]
+      },
+      {
+          'rt_input': [[[1], [2], [3]], [], [[4]], [[5], [6]]],
+          'ragged_rank':
+              1,
+          'default': [9],
+          'expected': [[[1], [2], [3]], [[9], [9], [9]], [[4], [9], [9]],
+                       [[5], [6], [9]]]
+      },
+      {
+          'rt_input': [[[1, 2], [], [3, 4]], [], [[5]], [[6, 7], [8]]],
+          'expected': [
+              [[1, 2], [0, 0], [3, 4]],  #
+              [[0, 0], [0, 0], [0, 0]],  #
+              [[5, 0], [0, 0], [0, 0]],  #
+              [[6, 7], [8, 0], [0, 0]],  #
+          ]
+      },
+      {
+          'rt_input': [[[1, 2], [], [3, 4]], [], [[5]], [[6, 7], [8]]],
+          'default':
+              9,
+          'expected': [
+              [[1, 2], [9, 9], [3, 4]],  #
+              [[9, 9], [9, 9], [9, 9]],  #
+              [[5, 9], [9, 9], [9, 9]],  #
+              [[6, 7], [8, 9], [9, 9]],  #
+          ]
+      },
+      {
+          'rt_input': [[[1], [2], [3]]],
+          'ragged_rank': 1,
+          'default': 0,
+          'expected': [[[1], [2], [3]]],
+      },
+      {
+          'rt_input': [[[[1], [2]], [], [[3]]]],
+          'default': 9,
+          'expected': [[[[1], [2]], [[9], [9]], [[3], [9]]]],
+      },
+  )
+  def testRaggedTensorToTensor(self,
+                               rt_input,
+                               expected,
+                               ragged_rank=None,
+                               default=None,
+                               expected_shape=None):
+    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    dt = rt.to_tensor(default)
+    self.assertIsInstance(dt, ops.Tensor)
+    self.assertEqual(rt.dtype, dt.dtype)
+    self.assertTrue(dt.shape.is_compatible_with(rt.shape))
+    self.assertAllEqual(self.eval_to_list(dt), expected)
+    if expected_shape is not None:
+      dt_shape = array_ops.shape(dt)
+      self.assertAllEqual(dt_shape, expected_shape)
+
+  @parameterized.parameters(
+      {
+          'rt_input': [[1, 2, 3]],
+          'default': [0],
+          'error': (ValueError, r'Shape \(1,\) must have rank at most 0'),
+      },
+      {
+          'rt_input': [[[1, 2], [3, 4]], [[5, 6]]],
+          'ragged_rank': 1,
+          'default': [7, 8, 9],
+          'error': (ValueError, r'Shapes \(3,\) and \(2,\) are incompatible'),
+      },
+      {
+          'rt_input': [[1, 2, 3]],
+          'default': 'a',
+          'error': (TypeError, '.*'),
+      },
+  )
+  def testError(self, rt_input, default, error, ragged_rank=None):
+    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    with self.assertRaisesRegexp(error[0], error[1]):
+      rt.to_tensor(default)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_util.py b/tensorflow/python/ops/ragged/ragged_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..a832f937d16a876a5c7c88866249101785122fb1
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_util.py
@@ -0,0 +1,280 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Private convenience functions for RaggedTensors.
+
+None of these methods are exposed in the main "ragged" package.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import gen_ragged_math_ops
+from tensorflow.python.ops import math_ops
+
+
+def convert_to_int_tensor(tensor, name, dtype=dtypes.int32):
+  """Converts the given value to an integer Tensor."""
+  tensor = ops.convert_to_tensor(tensor, name=name, preferred_dtype=dtype)
+  if tensor.dtype.is_integer:
+    tensor = math_ops.cast(tensor, dtype)
+  else:
+    raise TypeError(
+        "%s must be an integer tensor; dtype=%s" % (name, tensor.dtype))
+  return tensor
+
+
+def get_positive_axis(axis, ndims):
+  """Validate an `axis` parameter, and normalize it to be positive.
+
+  If `ndims` is known (i.e., not `None`), then check that `axis` is in the
+  range `-ndims <= axis < ndims`, and return `axis` (if `axis >= 0`) or
+  `axis + ndims` (otherwise).
+  If `ndims` is not known, and `axis` is positive, then return it as-is.
+  If `ndims` is not known, and `axis` is negative, then report an error.
+
+  Args:
+    axis: An integer constant
+    ndims: An integer constant, or `None`
+
+  Returns:
+    The normalized `axis` value.
+
+  Raises:
+    ValueError: If `axis` is out-of-bounds, or if `axis` is negative and
+      `ndims is None`.
+  """
+  if not isinstance(axis, int):
+    raise TypeError("axis must be an int; got %s" % type(axis).__name__)
+  if ndims is not None:
+    if 0 <= axis < ndims:
+      return axis
+    elif -ndims <= axis < 0:
+      return axis + ndims
+    else:
+      raise ValueError(
+          "axis=%s out of bounds: expected %s<=axis<%s" % (axis, -ndims, ndims))
+  elif axis < 0:
+    raise ValueError("axis may only be negative if ndims is statically known.")
+  return axis
+
+
+def assert_splits_match(nested_splits_lists):
+  """Checks that the given splits lists are identical.
+
+  Performs static tests to ensure that the given splits lists are identical,
+  and returns a list of control dependency op tensors that check that they are
+  fully identical.
+
+  Args:
+    nested_splits_lists: A list of nested_splits_lists, where each split_list is
+      a list of `splits` tensors from a `RaggedTensor`, ordered from outermost
+      ragged dimension to innermost ragged dimension.
+
+  Returns:
+    A list of control dependency op tensors.
+  Raises:
+    ValueError: If the splits are not identical.
+  """
+  error_msg = "Inputs must have identical ragged splits"
+  for splits_list in nested_splits_lists:
+    if len(splits_list) != len(nested_splits_lists[0]):
+      raise ValueError(error_msg)
+  return [
+      check_ops.assert_equal(s1, s2, message=error_msg)
+      for splits_list in nested_splits_lists[1:]
+      for (s1, s2) in zip(nested_splits_lists[0], splits_list)
+  ]
+
+
+# This op is intended to exactly match the semantics of numpy.repeat, with
+# one exception: numpy.repeat has special (and somewhat non-intuitive) behavior
+# when axis is not specified.  Rather than implement that special behavior, we
+# simply make `axis` be a required argument.
+#
+# External (OSS) `tf.repeat` feature request:
+# https://github.com/tensorflow/tensorflow/issues/8246
+def repeat(data, repeats, axis, name=None):
+  """Repeats elements of `data`.
+
+  Args:
+    data: An `N`-dimensional tensor.
+    repeats: A 1-D integer tensor specifying how many times each element in
+      `axis` should be repeated.  `len(repeats)` must equal `data.shape[axis]`.
+      Supports broadcasting from a scalar value.
+    axis: `int`.  The axis along which to repeat values.  Must be less than
+      `max(N, 1)`.
+    name: A name for the operation.
+
+  Returns:
+    A tensor with `max(N, 1)` dimensions.  Has the same shape as `data`,
+    except that dimension `axis` has size `sum(repeats)`.
+
+  #### Examples:
+    ```python
+    >>> repeat(['a', 'b', 'c'], repeats=[3, 0, 2], axis=0)
+    ['a', 'a', 'a', 'c', 'c']
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=0)
+    [[1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]
+    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=1)
+    [[1, 1, 2, 2, 2], [3, 3, 4, 4, 4]]
+    ```
+  """
+  if not isinstance(axis, int):
+    raise TypeError("axis must be an int; got %s" % type(axis).__name__)
+
+  with ops.name_scope(name, "Repeat", [data, repeats]):
+    data = ops.convert_to_tensor(data, name="data")
+    repeats = convert_to_int_tensor(repeats, name="repeats")
+    repeats.shape.with_rank_at_most(1)
+
+    # If `data` is a scalar, then upgrade it to a vector.
+    data = _with_nonzero_rank(data)
+    data_shape = array_ops.shape(data)
+
+    # If `axis` is negative, then convert it to a positive value.
+    axis = get_positive_axis(axis, data.shape.ndims)
+
+    # Check data Tensor shapes.
+    if repeats.shape.ndims == 1:
+      data.shape.dims[axis].assert_is_compatible_with(repeats.shape[0])
+
+    # If we know that `repeats` is a scalar, then we can just tile & reshape.
+    if repeats.shape.ndims == 0:
+      expanded = array_ops.expand_dims(data, axis + 1)
+      tiled = tile_one_dimension(expanded, axis + 1, repeats)
+      result_shape = array_ops.concat(
+          [data_shape[:axis], [-1], data_shape[axis + 1:]], axis=0)
+      return array_ops.reshape(tiled, result_shape)
+
+    # Broadcast the `repeats` tensor so rank(repeats) == axis + 1.
+    if repeats.shape.ndims != axis + 1:
+      repeats_shape = array_ops.shape(repeats)
+      repeats_ndims = array_ops.rank(repeats)
+      broadcast_shape = array_ops.concat(
+          [data_shape[:axis + 1 - repeats_ndims], repeats_shape], axis=0)
+      repeats = array_ops.broadcast_to(repeats, broadcast_shape)
+      repeats.set_shape([None] * (axis + 1))
+
+    # Create a "sequence mask" based on `repeats`, where slices across `axis`
+    # contain one `True` value for each repetition.  E.g., if
+    # `repeats = [3, 1, 2]`, then `mask = [[1, 1, 1], [1, 0, 0], [1, 1, 0]]`.
+    max_repeat = math_ops.maximum(0, math_ops.reduce_max(repeats))
+    mask = array_ops.sequence_mask(repeats, max_repeat)
+
+    # Add a new dimension around each value that needs to be repeated, and
+    # then tile that new dimension to match the maximum number of repetitions.
+    expanded = array_ops.expand_dims(data, axis + 1)
+    tiled = tile_one_dimension(expanded, axis + 1, max_repeat)
+
+    # Use `boolean_mask` to discard the extra repeated values.  This also
+    # flattens all dimensions up through `axis`.
+    masked = array_ops.boolean_mask(tiled, mask)
+
+    # Reshape the output tensor to add the outer dimensions back.
+    if axis == 0:
+      result = masked
+    else:
+      result_shape = array_ops.concat(
+          [data_shape[:axis], [-1], data_shape[axis + 1:]], axis=0)
+      result = array_ops.reshape(masked, result_shape)
+
+    # Preserve shape information.
+    if data.shape.ndims is not None:
+      new_axis_size = 0 if repeats.shape[0] == 0 else None
+      result.set_shape(data.shape[:axis].concatenate(
+          [new_axis_size]).concatenate(data.shape[axis + 1:]))
+
+    return result
+
+
+def tile_one_dimension(data, axis, multiple):
+  """Tiles a single dimension of a tensor."""
+  # Assumes axis is a nonnegative int.
+  if data.shape.ndims is not None:
+    multiples = [1] * data.shape.ndims
+    multiples[axis] = multiple
+  else:
+    ones = array_ops.ones(array_ops.rank(data), dtypes.int32)
+    multiples = array_ops.concat([ones[:axis], [multiple], ones[axis + 1:]],
+                                 axis=0)
+  return array_ops.tile(data, multiples)
+
+
+def _with_nonzero_rank(data):
+  """If `data` is scalar, then add a dimension; otherwise return as-is."""
+  if data.shape.ndims is not None:
+    if data.shape.ndims == 0:
+      return array_ops.stack([data])
+    else:
+      return data
+  else:
+    data_shape = array_ops.shape(data)
+    data_ndims = array_ops.rank(data)
+    return array_ops.reshape(
+        data,
+        array_ops.concat([[1], data_shape], axis=0)[-data_ndims:])
+
+
+def lengths_to_splits(lengths):
+  """Returns splits corresponding to the given lengths."""
+  return array_ops.concat([[0], math_ops.cumsum(lengths)], axis=-1)
+
+
+def repeat_ranges(params, splits, repeats):
+  """Repeats each range of `params` (as specified by `splits`) `repeats` times.
+
+  Let the `i`th range of `params` be defined as
+  `params[splits[i]:splits[i + 1]]`.  Then this function returns a tensor
+  containing range 0 repeated `repeats[0]` times, followed by range 1 repeated
+  `repeats[1]`, ..., followed by the last range repeated `repeats[-1]` times.
+
+  Args:
+    params: The `Tensor` whose values should be repeated.
+    splits: A splits tensor indicating the ranges of `params` that should be
+      repeated.
+    repeats: The number of times each range should be repeated.  Supports
+      broadcasting from a scalar value.
+
+  Returns:
+    A `Tensor` with the same rank and type as `params`.
+
+  #### Example:
+    ```python
+    >>> repeat_ranges(['a', 'b', 'c'], [0, 2, 3], 3)
+    ['a', 'b', 'a', 'b', 'a', 'b', 'c', 'c', 'c']
+    ```
+  """
+  # Divide `splits` into starts and limits, and repeat them `repeats` times.
+  if repeats.shape.ndims != 0:
+    repeated_starts = repeat(splits[:-1], repeats, axis=0)
+    repeated_limits = repeat(splits[1:], repeats, axis=0)
+  else:
+    # Optimization: we can just call repeat once, and then slice the result.
+    repeated_splits = repeat(splits, repeats, axis=0)
+    n_splits = array_ops.shape(repeated_splits, out_type=dtypes.int64)[0]
+    repeated_starts = repeated_splits[:n_splits - repeats]
+    repeated_limits = repeated_splits[repeats:]
+
+  # Get indices for each range from starts to limits, and use those to gather
+  # the values in the desired repetition pattern.
+  one = array_ops.ones((), repeated_starts.dtype)
+  offsets = gen_ragged_math_ops.ragged_range(
+      repeated_starts, repeated_limits, one)
+  return array_ops.gather(params, offsets.rt_dense_values)
diff --git a/tensorflow/python/ops/ragged/ragged_util_test.py b/tensorflow/python/ops/ragged/ragged_util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..72a4155930708a0e8eb5808807bf788c67de862f
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_util_test.py
@@ -0,0 +1,231 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged_util."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.platform import googletest
+
+# Example 3d tensor for test cases.  Has shape [4, 2, 3].
+TENSOR_3D = [[[('%d%d%d' % (i, j, k)).encode('utf-8')
+               for k in range(3)]
+              for j in range(2)]
+             for i in range(4)]
+
+# Example 4d tensor for test cases.  Has shape [4, 2, 3, 5].
+TENSOR_4D = [[[[('%d%d%d%d' % (i, j, k, l)).encode('utf-8')
+                for l in range(5)]
+               for k in range(3)]
+              for j in range(2)]
+             for i in range(4)]
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedUtilTest(ragged_test_util.RaggedTensorTestCase,
+                     parameterized.TestCase):
+
+  @parameterized.parameters([
+      # Docstring examples
+      dict(
+          data=['a', 'b', 'c'],
+          repeats=[3, 0, 2],
+          axis=0,
+          expected=[b'a', b'a', b'a', b'c', b'c']),
+      dict(
+          data=[[1, 2], [3, 4]],
+          repeats=[2, 3],
+          axis=0,
+          expected=[[1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]),
+      dict(
+          data=[[1, 2], [3, 4]],
+          repeats=[2, 3],
+          axis=1,
+          expected=[[1, 1, 2, 2, 2], [3, 3, 4, 4, 4]]),
+
+      # Scalar repeats value
+      dict(
+          data=['a', 'b', 'c'],
+          repeats=2,
+          axis=0,
+          expected=[b'a', b'a', b'b', b'b', b'c', b'c']),
+      dict(
+          data=[[1, 2], [3, 4]],
+          repeats=2,
+          axis=0,
+          expected=[[1, 2], [1, 2], [3, 4], [3, 4]]),
+      dict(
+          data=[[1, 2], [3, 4]],
+          repeats=2,
+          axis=1,
+          expected=[[1, 1, 2, 2], [3, 3, 4, 4]]),
+
+      # data & repeats are broadcast to have at least one dimension,
+      # so these are all equivalent:
+      dict(data=3, repeats=4, axis=0, expected=[3, 3, 3, 3]),
+      dict(data=[3], repeats=4, axis=0, expected=[3, 3, 3, 3]),
+      dict(data=3, repeats=[4], axis=0, expected=[3, 3, 3, 3]),
+      dict(data=[3], repeats=[4], axis=0, expected=[3, 3, 3, 3]),
+      # Empty tensor
+      dict(data=[], repeats=[], axis=0, expected=[]),
+  ])
+  def testRepeat(self, data, repeats, expected, axis=None):
+    result = ragged_util.repeat(data, repeats, axis)
+    with self.test_session():
+      self.assertAllEqual(result, expected)
+
+  @parameterized.parameters([
+      dict(mode=mode, **args)
+      for mode in ['constant', 'dynamic', 'unknown_shape']
+      for args in [
+          # data & repeats are broadcast to have at least one dimension,
+          # so these are all equivalent:
+          dict(data=3, repeats=4, axis=0),
+          dict(data=[3], repeats=4, axis=0),
+          dict(data=3, repeats=[4], axis=0),
+          dict(data=[3], repeats=[4], axis=0),
+
+          # 1-dimensional data tensor.
+          dict(data=[], repeats=5, axis=0),
+          dict(data=[1, 2, 3], repeats=5, axis=0),
+          dict(data=[1, 2, 3], repeats=[3, 0, 2], axis=0),
+          dict(data=[1, 2, 3], repeats=[3, 0, 2], axis=-1),
+          dict(data=[b'a', b'b', b'c'], repeats=[3, 0, 2], axis=0),
+
+          # 2-dimensional data tensor.
+          dict(data=[[1, 2, 3], [4, 5, 6]], repeats=3, axis=0),
+          dict(data=[[1, 2, 3], [4, 5, 6]], repeats=3, axis=1),
+          dict(data=[[1, 2, 3], [4, 5, 6]], repeats=[3, 5], axis=0),
+          dict(data=[[1, 2, 3], [4, 5, 6]], repeats=[3, 5, 7], axis=1),
+
+          # 3-dimensional data tensor: shape=[4, 2, 3].
+          dict(data=TENSOR_3D, repeats=2, axis=0),
+          dict(data=TENSOR_3D, repeats=2, axis=1),
+          dict(data=TENSOR_3D, repeats=2, axis=2),
+          dict(data=TENSOR_3D, repeats=[2, 0, 4, 1], axis=0),
+          dict(data=TENSOR_3D, repeats=[3, 2], axis=1),
+          dict(data=TENSOR_3D, repeats=[1, 3, 1], axis=2),
+
+          # 4-dimensional data tensor: shape=[4, 2, 3, 5].
+          dict(data=TENSOR_4D, repeats=2, axis=0),
+          dict(data=TENSOR_4D, repeats=2, axis=1),
+          dict(data=TENSOR_4D, repeats=2, axis=2),
+          dict(data=TENSOR_4D, repeats=2, axis=3),
+          dict(data=TENSOR_4D, repeats=[2, 0, 4, 1], axis=0),
+          dict(data=TENSOR_4D, repeats=[3, 2], axis=1),
+          dict(data=TENSOR_4D, repeats=[1, 3, 1], axis=2),
+          dict(data=TENSOR_4D, repeats=[1, 3, 0, 0, 2], axis=3),
+      ]
+  ])
+  def testValuesMatchesNumpy(self, mode, data, repeats, axis):
+    # Exception: we can't handle negative axis if data.ndims is unknown.
+    if axis < 0 and mode == 'unknown_shape':
+      return
+
+    expected = np.repeat(data, repeats, axis)
+
+    if mode == 'constant':
+      data = constant_op.constant(data)
+      repeats = constant_op.constant(repeats)
+    elif mode == 'dynamic':
+      data = constant_op.constant(data)
+      repeats = constant_op.constant(repeats)
+      data = array_ops.placeholder_with_default(data, data.shape)
+      repeats = array_ops.placeholder_with_default(repeats, repeats.shape)
+    elif mode == 'unknown_shape':
+      data = array_ops.placeholder_with_default(data, None)
+      repeats = array_ops.placeholder_with_default(repeats, None)
+
+    result = ragged_util.repeat(data, repeats, axis)
+    with self.test_session():
+      self.assertAllEqual(result, expected)
+
+  @parameterized.parameters([
+      dict(
+          descr='axis >= rank(data)',
+          mode='dynamic',
+          data=[1, 2, 3],
+          repeats=[3, 0, 2],
+          axis=1,
+          error='axis=1 out of bounds: expected -1<=axis<1'),
+      dict(
+          descr='axis < -rank(data)',
+          mode='dynamic',
+          data=[1, 2, 3],
+          repeats=[3, 0, 2],
+          axis=-2,
+          error='axis=-2 out of bounds: expected -1<=axis<1'),
+      dict(
+          descr='len(repeats) != data.shape[axis]',
+          mode='dynamic',
+          data=[[1, 2, 3], [4, 5, 6]],
+          repeats=[2, 3],
+          axis=1,
+          error='Dimensions 3 and 2 are not compatible'),
+      dict(
+          descr='rank(repeats) > 1',
+          mode='dynamic',
+          data=[[1, 2, 3], [4, 5, 6]],
+          repeats=[[3], [5]],
+          axis=1,
+          error=r'Shape \(2, 1\) must have rank at most 1'),
+      dict(
+          descr='non-integer axis',
+          mode='constant',
+          data=[1, 2, 3],
+          repeats=2,
+          axis='foo',
+          exception=TypeError,
+          error='axis must be an int'),
+  ])
+  def testError(self,
+                descr,
+                mode,
+                data,
+                repeats,
+                axis,
+                exception=ValueError,
+                error=None):
+    # Make sure that this is also an error case for numpy.
+    with self.assertRaises(exception):
+      np.repeat(data, repeats, axis)
+
+    if mode == 'constant':
+      data = constant_op.constant(data)
+      repeats = constant_op.constant(repeats)
+    elif mode == 'dynamic':
+      data = constant_op.constant(data)
+      repeats = constant_op.constant(repeats)
+      data = array_ops.placeholder_with_default(data, data.shape)
+      repeats = array_ops.placeholder_with_default(repeats, repeats.shape)
+    elif mode == 'unknown_shape':
+      data = array_ops.placeholder_with_default(data, None)
+      repeats = array_ops.placeholder_with_default(repeats, None)
+
+    with self.assertRaisesRegexp(exception, error):
+      ragged_util.repeat(data, repeats, axis)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_where_op_test.py b/tensorflow/python/ops/ragged/ragged_where_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3cd5a2debe0db0b1bac2b6396c78b9e94c3f671
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_where_op_test.py
@@ -0,0 +1,194 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.where."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Docstring Examples
+      #=========================================================================
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          expected=[[0, 0], [0, 2], [1, 1]]),
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged.constant_value([[b'A', b'b', b'C'], [b'd', b'E']])),
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([True, False]),
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged.constant_value([[b'A', b'B', b'C'], [b'd', b'e']])),
+      #=========================================================================
+      # Coordinate-retrieval mode
+      #=========================================================================
+      dict(  # shape=[D1]
+          condition=[True, False, True, False, True],
+          expected=[[0], [2], [4]]),
+      dict(  # shape=[D1, D2]
+          condition=[[True, False], [False, True]],
+          expected=[[0, 0], [1, 1]]),
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          expected=[[0, 0], [0, 2], [1, 1]]),
+      dict(  # shape=[D1, (D2), (D3)]
+          condition=ragged.constant_value([
+              [[True, False, True], [False, True]],
+              [[True], [], [False], [False, True, False]]
+          ]),
+          expected=[[0, 0, 0], [0, 0, 2], [0, 1, 1],
+                    [1, 0, 0], [1, 3, 1]]),
+      dict(  # shape=[D1, (D2), D3]
+          condition=ragged.constant_value([
+              [[True, False], [False, True]],
+              [[True, False], [False, False], [True, False], [False, True]]
+          ], ragged_rank=1),
+          expected=[[0, 0, 0], [0, 1, 1],
+                    [1, 0, 0], [1, 2, 0], [1, 3, 1]]),
+      dict(  # shape=[D1, (D2), (D3), (D4)]
+          condition=ragged.constant_value([
+              [[[], [True]]],
+              [[[True, False, True], [False, True]],
+               [[True], [], [False], [False, True, False]]]
+          ]),
+          expected=[[0, 0, 1, 0],
+                    [1, 0, 0, 0], [1, 0, 0, 2], [1, 0, 1, 1],
+                    [1, 1, 0, 0], [1, 1, 3, 1]]),
+
+      #=========================================================================
+      # Elementwise value-selection mode
+      #=========================================================================
+      dict(  # shape=[]
+          condition=True, x='A', y='a', expected=b'A'),
+      dict(  # shape=[]
+          condition=False, x='A', y='a', expected=b'a'),
+      dict(  # shape=[D1]
+          condition=[True, False, True],
+          x=['A', 'B', 'C'],
+          y=['a', 'b', 'c'],
+          expected=[b'A', b'b', b'C']),
+      dict(  # shape=[D1, D2]
+          condition=[[True, False], [False, True]],
+          x=[['A', 'B'], ['D', 'E']],
+          y=[['a', 'b'], ['d', 'e']],
+          expected=[[b'A', b'b'], [b'd', b'E']]),
+      dict(  # shape=[D1, (D2)]
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged.constant_value([[b'A', b'b', b'C'], [b'd', b'E']])),
+      dict(  # shape=[D1, (D2), D3]
+          condition=ragged.constant_value([
+              [[True, False], [False, True]],
+              [[True, False], [False, False], [True, False], [False, True]]
+          ], ragged_rank=1),
+          x=ragged.constant_value([
+              [['A', 'B'], ['C', 'D']],
+              [['E', 'F'], ['G', 'H'], ['I', 'J'], ['K', 'L']]
+          ], ragged_rank=1),
+          y=ragged.constant_value([
+              [['a', 'b'], ['c', 'd']],
+              [['e', 'f'], ['g', 'h'], ['i', 'j'], ['k', 'l']]
+          ], ragged_rank=1),
+          expected=ragged.constant_value([
+              [[b'A', b'b'], [b'c', b'D']],
+              [[b'E', b'f'], [b'g', b'h'], [b'I', b'j'], [b'k', b'L']]
+          ], ragged_rank=1)),
+      dict(  # shape=[D1, (D2), (D3), (D4)]
+          condition=ragged.constant_value([
+              [[[], [True]]],
+              [[[True, False, True], [False, True]],
+               [[True], [], [False], [False, True, False]]]
+          ]),
+          x=ragged.constant_value([
+              [[[], ['A']]],
+              [[['B', 'C', 'D'], ['E', 'F']],
+               [['G'], [], ['H'], ['I', 'J', 'K']]]
+          ]),
+          y=ragged.constant_value([
+              [[[], ['a']]],
+              [[['b', 'c', 'd'], ['e', 'f']],
+               [['g'], [], ['h'], ['i', 'j', 'k']]]
+          ]),
+          expected=ragged.constant_value([
+              [[[], [b'A']]],
+              [[[b'B', b'c', b'D'], [b'e', b'F']],
+               [[b'G'], [], [b'h'], [b'i', b'J', b'k']]]
+          ])),
+
+      #=========================================================================
+      # Elementwise row-selection mode
+      #=========================================================================
+      dict(  # shape=[D1, D2]
+          condition=[True, False, True],
+          x=[['A', 'B'], ['C', 'D'], ['E', 'F']],
+          y=[['a', 'b'], ['c', 'd'], ['e', 'f']],
+          expected=[[b'A', b'B'], [b'c', b'd'], [b'E', b'F']]),
+      dict(  # shape=[D1, (D2)]
+          condition=[True, False, True],
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E'], ['F', 'G']]),
+          y=ragged.constant_value([['a', 'b'], ['c'], ['d', 'e']]),
+          expected=ragged.constant_value([[b'A', b'B', b'C'], [b'c'],
+                                          [b'F', b'G']])),
+      dict(  # shape=[D1, (D2), (D3), (D4)]
+          condition=ragged.constant_value([True, False]),
+          x=ragged.constant_value([
+              [[[], ['A']]],
+              [[['B', 'C', 'D'], ['E', 'F']],
+               [['G'], [], ['H'], ['I', 'J', 'K']]]
+          ]),
+          y=ragged.constant_value([[[['a']]], [[['b']]]]),
+          expected=ragged.constant_value([[[[], [b'A']]], [[[b'b']]]])),
+  ])   # pyformat: disable
+  def testRaggedWhere(self, condition, expected, x=None, y=None):
+    result = ragged.where(condition, x, y)
+    self.assertRaggedEqual(result, expected)
+
+  @parameterized.parameters([
+      dict(
+          condition=[True, False],
+          x=[1, 2],
+          error=ValueError,
+          message='x and y must be either both None or both non-None'),
+      dict(
+          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=[['a', 'b'], ['d', 'e']],
+          error=ValueError,
+          message='Input shapes do not match.'),
+  ])
+  def testRaggedWhereErrors(self, condition, error, message, x=None, y=None):
+    with self.assertRaisesRegexp(error, message):
+      ragged.where(condition, x, y)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/segment_id_ops.py b/tensorflow/python/ops/ragged/segment_id_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa2970c3e75af36d3f042ab23ab70c8d2cdb36ca
--- /dev/null
+++ b/tensorflow/python/ops/ragged/segment_id_ops.py
@@ -0,0 +1,107 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for converting between row_splits and segment_ids."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_util
+
+
+# For background on "segments" and "segment ids", see:
+# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+def row_splits_to_segment_ids(splits, name=None):
+  """Generates the segmentation corresponding to a RaggedTensor `splits` vector.
+
+  Returns an integer vector `segment_ids`, where `segment_ids[i] == j` if
+  `splits[j] <= i < splits[j+1]`.  Example:
+
+  ```python
+  >>> ragged.row_splits_to_segment_ids([0, 3, 3, 5, 6, 9]).eval()
+  [ 0 0 0 2 2 3 4 4 4 ]
+  ```
+
+  Args:
+    splits: A sorted 1-D int64 Tensor.  `splits[0]` must be zero.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A sorted 1-D int64 Tensor, with `shape=[splits[-1]]`
+
+  Raises:
+    ValueError: If `splits` is invalid.
+  """
+  with ops.name_scope(name, "RaggedSplitsToSegmentIds", [splits]) as name:
+    splits = ops.convert_to_tensor(splits, dtype=dtypes.int64, name="splits")
+    splits.shape.assert_has_rank(1)
+    if tensor_shape.dimension_value(splits.shape[0]) == 0:
+      raise ValueError("Invalid row_splits: []")
+    row_lengths = splits[1:] - splits[:-1]
+    nrows = array_ops.shape(splits, out_type=dtypes.int64)[-1] - 1
+    indices = math_ops.range(nrows)
+    return ragged_util.repeat(indices, repeats=row_lengths, axis=0)
+
+
+# For background on "segments" and "segment ids", see:
+# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+def segment_ids_to_row_splits(segment_ids, num_segments=None, name=None):
+  """Generates the RaggedTensor `splits` vector corresponding to a segmentation.
+
+  Returns an integer vector `splits`, where `splits[0] = 0` and
+  `splits[i] = splits[i-1] + count(segment_ids==i)`.  Example:
+
+  ```python
+  >>> ragged.segment_ids_to_row_splits([0, 0, 0, 2, 2, 3, 4, 4, 4]).eval()
+  [ 0 3 3 5 6 9 ]
+  ```
+
+  Args:
+    segment_ids: A 1-D integer Tensor.
+    num_segments: A scalar integer indicating the number of segments.  Defaults
+      to `max(segment_ids) + 1` (or zero if `segment_ids` is empty).
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A sorted 1-D int64 Tensor, with `shape=[num_segments + 1]`.
+  """
+  with ops.name_scope(name, "SegmentIdsToRaggedSplits", [segment_ids]) as name:
+    segment_ids = ragged_util.convert_to_int_tensor(segment_ids, "segment_ids")
+    segment_ids.shape.assert_has_rank(1)
+    if num_segments is not None:
+      num_segments = ragged_util.convert_to_int_tensor(num_segments,
+                                                       "num_segments")
+      num_segments.shape.assert_has_rank(0)
+
+    row_lengths = math_ops.bincount(
+        segment_ids,
+        minlength=num_segments,
+        maxlength=num_segments,
+        dtype=dtypes.int64)
+    splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
+
+    # Update shape information, if possible.
+    if num_segments is not None:
+      const_num_segments = tensor_util.constant_value(num_segments)
+      if const_num_segments is not None:
+        splits.set_shape(tensor_shape.TensorShape([const_num_segments + 1]))
+
+    return splits
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index c2eb9dfc5dce3dec497c2abd4ca29d912240c49c..62e2f6d1025bb9802a5b2a09a4dbffbe15921ace 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -44,7 +44,8 @@ def _ShapeTensor(shape):
   return ops.convert_to_tensor(shape, dtype=dtype, name="shape")
 
 
-@tf_export("random.normal", "random_normal")
+@tf_export("random.normal", v1=["random.normal", "random_normal"])
+@deprecation.deprecated_endpoints("random_normal")
 def random_normal(shape,
                   mean=0.0,
                   stddev=1.0,
@@ -137,7 +138,9 @@ def parameterized_truncated_normal(shape,
     return rnd
 
 
-@tf_export("random.truncated_normal", "truncated_normal")
+@tf_export("random.truncated_normal",
+           v1=["random.truncated_normal", "truncated_normal"])
+@deprecation.deprecated_endpoints("truncated_normal")
 def truncated_normal(shape,
                      mean=0.0,
                      stddev=1.0,
@@ -182,7 +185,8 @@ ops.NotDifferentiable("ParameterizedTruncatedNormal")
 ops.NotDifferentiable("TruncatedNormal")
 
 
-@tf_export("random.uniform", "random_uniform")
+@tf_export("random.uniform", v1=["random.uniform", "random_uniform"])
+@deprecation.deprecated_endpoints("random_uniform")
 def random_uniform(shape,
                    minval=0,
                    maxval=None,
@@ -247,7 +251,8 @@ def random_uniform(shape,
 ops.NotDifferentiable("RandomUniform")
 
 
-@tf_export("random.shuffle", "random_shuffle")
+@tf_export("random.shuffle", v1=["random.shuffle", "random_shuffle"])
+@deprecation.deprecated_endpoints("random_shuffle")
 def random_shuffle(value, seed=None, name=None):
   """Randomly shuffles a tensor along its first dimension.
 
@@ -278,7 +283,8 @@ def random_shuffle(value, seed=None, name=None):
       value, seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export("image.random_crop", "random_crop")
+@tf_export("image.random_crop", v1=["image.random_crop", "random_crop"])
+@deprecation.deprecated_endpoints("random_crop")
 def random_crop(value, size, seed=None, name=None):
   """Randomly crops a tensor to a given size.
 
@@ -321,7 +327,9 @@ def random_crop(value, size, seed=None, name=None):
     return array_ops.slice(value, offset, size, name=name)
 
 
-@tf_export("random.multinomial", "multinomial")
+@tf_export(v1=["random.multinomial", "multinomial"])
+@deprecation.deprecated(
+    date=None, instructions="Use tf.random.categorical instead.")
 def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
   """Draws samples from a multinomial distribution.
 
@@ -338,9 +346,7 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
       `[i, :]` represents the unnormalized log-probabilities for all classes.
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     seed: A Python integer. Used to create a random seed for the distribution.
-      See
-      `tf.set_random_seed`
-      for behavior.
+      See `tf.set_random_seed` for behavior.
     name: Optional name for the operation.
     output_dtype: integer type to use for the output. Defaults to int64.
 
@@ -348,16 +354,49 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
     The drawn samples of shape `[batch_size, num_samples]`.
   """
   with ops.name_scope(name, "multinomial", [logits]):
-    logits = ops.convert_to_tensor(logits, name="logits")
-    seed1, seed2 = random_seed.get_seed(seed)
-    return gen_random_ops.multinomial(
-        logits, num_samples, seed=seed1, seed2=seed2, output_dtype=output_dtype)
+    return multinomial_categorical_impl(logits, num_samples, output_dtype, seed)
+
+
+@tf_export("random.categorical")
+def categorical(logits, num_samples, dtype=None, seed=None, name=None):
+  """Draws samples from a categorical distribution.
+
+  Example:
+
+  ```python
+  # samples has shape [1, 5], where each value is either 0 or 1 with equal
+  # probability.
+  samples = tf.random.categorical(tf.log([[10., 10.]]), 5)
+  ```
+
+  Args:
+    logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
+      `[i, :]` represents the unnormalized log-probabilities for all classes.
+    num_samples: 0-D.  Number of independent samples to draw for each row slice.
+    dtype: integer type to use for the output. Defaults to int64.
+    seed: A Python integer. Used to create a random seed for the distribution.
+      See `tf.set_random_seed` for behavior.
+    name: Optional name for the operation.
+
+  Returns:
+    The drawn samples of shape `[batch_size, num_samples]`.
+  """
+  with ops.name_scope(name, "categorical", [logits]):
+    return multinomial_categorical_impl(logits, num_samples, dtype, seed)
+
+
+def multinomial_categorical_impl(logits, num_samples, dtype, seed):
+  """Implementation for random.multinomial (v1) and random.categorical (v2)."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  seed1, seed2 = random_seed.get_seed(seed)
+  return gen_random_ops.multinomial(
+      logits, num_samples, seed=seed1, seed2=seed2, output_dtype=dtype)
 
 
 ops.NotDifferentiable("Multinomial")
 
 
-@tf_export("random.gamma", "random_gamma")
+@tf_export("random.gamma", v1=["random.gamma", "random_gamma"])
 @deprecation.deprecated_endpoints("random_gamma")
 def random_gamma(shape,
                  alpha,
@@ -441,7 +480,7 @@ def random_gamma(shape,
             shape, alpha_broadcast, seed=seed1, seed2=seed2) / beta)
 
 
-@tf_export("random.poisson", "random_poisson")
+@tf_export(v1=["random.poisson", "random_poisson"])
 @deprecation.deprecated_endpoints("random_poisson")
 def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
   """Draws `shape` samples from each of the given Poisson distribution(s).
@@ -474,6 +513,45 @@ def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
       for behavior.
     name: Optional name for the operation.
 
+  Returns:
+    samples: a `Tensor` of shape `tf.concat([shape, tf.shape(lam)], axis=0)`
+      with values of type `dtype`.
+  """
+  return random_poisson_v2(shape, lam, dtype, seed, name)
+
+
+@tf_export("random.poisson", v1=[])
+def random_poisson_v2(shape, lam, dtype=dtypes.float32, seed=None, name=None):
+  """Draws `shape` samples from each of the given Poisson distribution(s).
+
+  `lam` is the rate parameter describing the distribution(s).
+
+  Example:
+
+  ```python
+  samples = tf.random_poisson([10], [0.5, 1.5])
+  # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents
+  # the samples drawn from each distribution
+
+  samples = tf.random_poisson([7, 5], [12.2, 3.3])
+  # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1]
+  # represents the 7x5 samples drawn from each of the two distributions
+  ```
+
+  Args:
+    shape: A 1-D integer Tensor or Python array. The shape of the output samples
+      to be drawn per "rate"-parameterized distribution.
+    lam: A Tensor or Python value or N-D array of type `dtype`.
+      `lam` provides the rate parameter(s) describing the poisson
+      distribution(s) to sample.
+    dtype: The type of the output: `float16`, `float32`, `float64`, `int32` or
+      `int64`.
+    seed: A Python integer. Used to create a random seed for the distributions.
+      See
+      `tf.set_random_seed`
+      for behavior.
+    name: Optional name for the operation.
+
   Returns:
     samples: a `Tensor` of shape `tf.concat([shape, tf.shape(lam)], axis=0)`
       with values of type `dtype`.
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 4a126e9d7ad426d4e9e6072e2c98f57eeaa9db74..1066b357b43bb60d5e5b078846fcd82e12e941c3 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -26,6 +26,7 @@ from tensorflow.core.framework import variable_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -45,7 +46,6 @@ from tensorflow.python.util import compat
 
 
 def get_resource_handle_data(graph_op):
-  assert ops._USE_C_SHAPES  # pylint: disable=protected-access
   assert type(graph_op) == ops.Tensor  # pylint: disable=unidiomatic-typecheck
 
   handle_data = pywrap_tensorflow.GetHandleShapeAndType(
@@ -65,6 +65,7 @@ def eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
                                                    name=name,
                                                    container=container)
   if graph_mode:
+    handle._handle_data = get_resource_handle_data(handle)  # pylint: disable=protected-access
     return handle
 
   # We do not want two distinct ResourceVariable objects for the same
@@ -87,12 +88,7 @@ def eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     # shape inference doesn't run in eager mode we copy this data here for when
     # the handle is captured by an eager mode function.
     # pylint: disable=protected-access
-    if ops._USE_C_SHAPES:
-      handle._handle_data = get_resource_handle_data(h)
-    else:
-      if h._handle_data is None:
-        ops.set_shape_and_handle_data_for_outputs(h.op)
-      handle._handle_data = h._handle_data
+    handle._handle_data = get_resource_handle_data(h)
     # pylint: enable=protected-access
   # Clean up op->graph->op reference cycles.
   ops.dismantle_graph(graph)
@@ -525,7 +521,10 @@ class ResourceVariable(variables.RefVariable):
       snapshot = g.as_graph_element(
           ops.prepend_name_scope(
               variable_def.snapshot_name, import_scope=import_scope))
-      self._cached_value = snapshot
+      if snapshot.op.type != "ReadVariableOp":
+        self._cached_value = snapshot
+      else:
+        self._cached_value = None
       while snapshot.op.type != "ReadVariableOp":
         snapshot = snapshot.op.inputs[0]
       self._graph_element = snapshot
@@ -808,16 +807,6 @@ class ResourceVariable(variables.RefVariable):
     return ResourceVariable(
         variable_def=variable_def, import_scope=import_scope)
 
-  @staticmethod
-  def _OverloadAllOperators():  # pylint: disable=invalid-name
-    """Register overloads for all operators."""
-    for operator in ops.Tensor.OVERLOADABLE_OPERATORS:
-      ResourceVariable._OverloadOperator(operator)
-    # For slicing, bind getitem differently than a tensor (use SliceHelperVar
-    # instead)
-    # pylint: disable=protected-access
-    setattr(ResourceVariable, "__getitem__", array_ops._SliceHelperVar)
-
   def _AsTensor(self):
     return self.value()
 
@@ -829,30 +818,6 @@ class ResourceVariable(variables.RefVariable):
     """Unsupported."""
     raise NotImplementedError("ResourceVariable does not implement set_shape()")
 
-  @staticmethod
-  def _OverloadOperator(operator):  # pylint: disable=invalid-name
-    """Defer an operator overload to `ops.Tensor`.
-
-    We pull the operator out of ops.Tensor dynamically to avoid ordering issues.
-
-    Args:
-      operator: string. The operator name.
-    """
-
-    tensor_oper = getattr(ops.Tensor, operator)
-    def _run_op(a, *args):
-      # pylint: disable=protected-access
-      value = a._AsTensor()
-      return tensor_oper(value, *args)
-
-    # Propagate __doc__ to wrapper
-    try:
-      _run_op.__doc__ = tensor_oper.__doc__
-    except AttributeError:
-      pass
-
-    setattr(ResourceVariable, operator, _run_op)
-
   __array_priority__ = 100
 
   def is_initialized(self, name=None):
@@ -1438,7 +1403,6 @@ ops.register_tensor_conversion_function(
     variables.Variable, variables.Variable._TensorConversionFunction)  # pylint: disable=protected-access
 
 # pylint: disable=protected-access
-ResourceVariable._OverloadAllOperators()
 ops.register_dense_tensor_like_type(ResourceVariable)
 
 
@@ -1448,13 +1412,23 @@ def _ReadGrad(_, grad):
   return grad
 
 
+def variable_shape(handle, out_type=dtypes.int32):
+  if getattr(
+      handle, "_handle_data", None) is None or not handle._handle_data.is_set:
+    return gen_resource_variable_ops.variable_shape(handle, out_type=out_type)
+  shape_proto = handle._handle_data.shape_and_type[0].shape
+  if shape_proto.unknown_rank or any(x.size == -1 for x in shape_proto.dim):
+    return gen_resource_variable_ops.variable_shape(handle, out_type=out_type)
+  return constant_op.constant([x.size for x in shape_proto.dim], dtype=out_type)
+
+
 @ops.RegisterGradient("ResourceGather")
 def _GatherGrad(op, grad):
   """Gradient for gather op."""
   # Build appropriately shaped IndexedSlices
   handle = op.inputs[0]
   indices = op.inputs[1]
-  params_shape = gen_resource_variable_ops.variable_shape(handle)
+  params_shape = variable_shape(handle)
   size = array_ops.expand_dims(array_ops.size(indices), 0)
   values_shape = array_ops.concat([size, params_shape[1:]], 0)
   values = array_ops.reshape(grad, values_shape)
@@ -1510,3 +1484,24 @@ def is_resource_variable(var):
   """"Returns True if `var` is to be considered a ResourceVariable."""
   return isinstance(var, ResourceVariable) or hasattr(
       var, "_should_act_as_resource_variable")
+
+
+def copy_to_graph_uninitialized(var):
+  """Copies an existing variable to a new graph, with no initializer."""
+  # Like ResourceVariable.__deepcopy__, but does not set an initializer on the
+  # new variable.
+  # pylint: disable=protected-access
+  new_variable = ResourceVariable(
+      initial_value=array_ops.placeholder(
+          shape=var.shape, dtype=var.dtype,
+          name="unused_initial_variable_value"),
+      trainable=var.trainable,
+      constraint=var._constraint,
+      dtype=var.dtype,
+      name=var._shared_name)
+  new_variable._maybe_initialize_checkpointable()
+  # pylint: enable=protected-access
+  return new_variable
+
+ops.NotDifferentiable("VarIsInitializedOp")
+ops.NotDifferentiable("VariableShape")
diff --git a/tensorflow/python/ops/resources.py b/tensorflow/python/ops/resources.py
index db6740643cffd9ca852d75653c837a39a1731d42..86477c924777e7fe7a093b72fc2c6acea1fdaa0e 100644
--- a/tensorflow/python/ops/resources.py
+++ b/tensorflow/python/ops/resources.py
@@ -21,6 +21,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import os
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -86,7 +87,9 @@ def report_uninitialized_resources(resource_list=None,
     resource_list = shared_resources() + local_resources()
   with ops.name_scope(name):
     # Run all operations on CPU
-    with ops.device("/cpu:0"):
+    local_device = os.environ.get(
+        "TF_DEVICE_FOR_UNINITIALIZED_VARIABLE_REPORTING", "/cpu:0")
+    with ops.device(local_device):
       if not resource_list:
         # Return an empty tensor so we only need to check for returned tensor
         # size being 0 as an indication of model ready.
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 5a3a5cc225bc605dbeb09a2f7791decf0545fedd..ec48cab91d172c54b2f927d946312f086e808c9c 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -54,7 +55,7 @@ def _transpose_batch_time(x):
     x transposed along the first two dimensions.
   """
   x_static_shape = x.get_shape()
-  if x_static_shape.ndims is not None and x_static_shape.ndims < 2:
+  if x_static_shape.rank is not None and x_static_shape.rank < 2:
     return x
 
   x_rank = array_ops.rank(x)
@@ -63,7 +64,7 @@ def _transpose_batch_time(x):
           ([1, 0], math_ops.range(2, x_rank)), axis=0))
   x_t.set_shape(
       tensor_shape.TensorShape([
-          x_static_shape[1].value, x_static_shape[0].value
+          x_static_shape.dims[1].value, x_static_shape.dims[0].value
       ]).concatenate(x_static_shape[2:]))
   return x_t
 
@@ -84,12 +85,12 @@ def _best_effort_input_batch_size(flat_input):
   """
   for input_ in flat_input:
     shape = input_.shape
-    if shape.ndims is None:
+    if shape.rank is None:
       continue
-    if shape.ndims < 2:
+    if shape.rank < 2:
       raise ValueError(
           "Expected input tensor %s to have rank at least 2" % input_)
-    batch_size = shape[1].value
+    batch_size = shape.dims[1].value
     if batch_size is not None:
       return batch_size
   # Fallback to the dynamic batch size of the first input.
@@ -116,7 +117,7 @@ def _infer_state_dtype(explicit_dtype, state):
     inferred_dtypes = [element.dtype for element in nest.flatten(state)]
     if not inferred_dtypes:
       raise ValueError("Unable to infer dtype from empty state.")
-    all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes])
+    all_same = all(x == inferred_dtypes[0] for x in inferred_dtypes)
     if not all_same:
       raise ValueError(
           "State has tensors of different inferred_dtypes. Unable to infer a "
@@ -233,7 +234,7 @@ def _rnn_step(
     # TensorArray and scalar get passed through.
     if isinstance(output, tensor_array_ops.TensorArray):
       return new_output
-    if output.shape.ndims == 0:
+    if output.shape.rank == 0:
       return new_output
     # Otherwise propagate the old or the new value.
     with ops.colocate_with(new_output):
@@ -326,7 +327,7 @@ def _reverse_seq(input_seq, lengths):
   flat_results = [[] for _ in range(len(input_seq))]
   for sequence in zip(*flat_input_seq):
     input_shape = tensor_shape.unknown_shape(
-        ndims=sequence[0].get_shape().ndims)
+        rank=sequence[0].get_shape().rank)
     for input_ in sequence:
       input_shape.merge_with(input_.get_shape())
       input_.set_shape(input_shape)
@@ -347,7 +348,10 @@ def _reverse_seq(input_seq, lengths):
   return results
 
 
-@tf_export("nn.bidirectional_dynamic_rnn")
+@deprecation.deprecated(None, "Please use `keras.layers.Bidirectional("
+                        "keras.layers.RNN(cell))`, which is equivalent to "
+                        "this API")
+@tf_export(v1=["nn.bidirectional_dynamic_rnn"])
 def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
                               initial_state_fw=None, initial_state_bw=None,
                               dtype=None, parallel_iterations=None,
@@ -480,7 +484,10 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
   return (outputs, output_states)
 
 
-@tf_export("nn.dynamic_rnn")
+@deprecation.deprecated(
+    None,
+    "Please use `keras.layers.RNN(cell)`, which is equivalent to this API")
+@tf_export(v1=["nn.dynamic_rnn"])
 def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
                 dtype=None, parallel_iterations=None, swap_memory=False,
                 time_major=False, scope=None):
@@ -617,7 +624,7 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
     parallel_iterations = parallel_iterations or 32
     if sequence_length is not None:
       sequence_length = math_ops.to_int32(sequence_length)
-      if sequence_length.get_shape().ndims not in (None, 1):
+      if sequence_length.get_shape().rank not in (None, 1):
         raise ValueError(
             "sequence_length must be a vector of length batch_size, "
             "but saw shape: %s" % sequence_length.get_shape())
@@ -737,8 +744,8 @@ def _dynamic_rnn_loop(cell,
       raise ValueError(
           "Input size (depth of inputs) must be accessible via shape inference,"
           " but saw value None.")
-    got_time_steps = shape[0].value
-    got_batch_size = shape[1].value
+    got_time_steps = shape.dims[0].value
+    got_batch_size = shape.dims[1].value
     if const_time_steps != got_time_steps:
       raise ValueError(
           "Time steps is not the same for all the elements in the input in a "
@@ -891,7 +898,7 @@ def _dynamic_rnn_loop(cell,
   return (final_outputs, final_state)
 
 
-@tf_export("nn.raw_rnn")
+@tf_export(v1=["nn.raw_rnn"])
 def raw_rnn(cell, loop_fn,
             parallel_iterations=None, swap_memory=False, scope=None):
   """Creates an `RNN` specified by RNNCell `cell` and loop function `loop_fn`.
@@ -1087,13 +1094,14 @@ def raw_rnn(cell, loop_fn,
                   else constant_op.constant(0, dtype=dtypes.int32))
 
     input_shape = [input_.get_shape() for input_ in flat_input]
-    static_batch_size = input_shape[0][0]
+    static_batch_size = tensor_shape.dimension_at_index(input_shape[0], 0)
 
     for input_shape_i in input_shape:
       # Static verification that batch sizes all match
-      static_batch_size.merge_with(input_shape_i[0])
+      static_batch_size.merge_with(
+          tensor_shape.dimension_at_index(input_shape_i, 0))
 
-    batch_size = static_batch_size.value
+    batch_size = tensor_shape.dimension_value(static_batch_size)
     const_batch_size = batch_size
     if batch_size is None:
       batch_size = array_ops.shape(flat_input[0])[0]
@@ -1176,7 +1184,7 @@ def raw_rnn(cell, loop_fn,
           # TensorArray and scalar get passed through.
           if isinstance(cur_i, tensor_array_ops.TensorArray):
             return cand_i
-          if cur_i.shape.ndims == 0:
+          if cur_i.shape.rank == 0:
             return cand_i
           # Otherwise propagate the old or the new value.
           with ops.colocate_with(cand_i):
@@ -1209,7 +1217,10 @@ def raw_rnn(cell, loop_fn,
     return (emit_ta, final_state, final_loop_state)
 
 
-@tf_export("nn.static_rnn")
+@deprecation.deprecated(
+    None, "Please use `keras.layers.RNN(cell, unroll=True)`, "
+    "which is equivalent to this API")
+@tf_export(v1=["nn.static_rnn"])
 def static_rnn(cell,
                inputs,
                initial_state=None,
@@ -1296,26 +1307,27 @@ def static_rnn(cell,
 
     # Temporarily avoid EmbeddingWrapper and seq2seq badness
     # TODO(lukaszkaiser): remove EmbeddingWrapper
-    if first_input.get_shape().ndims != 1:
+    if first_input.get_shape().rank != 1:
 
       input_shape = first_input.get_shape().with_rank_at_least(2)
-      fixed_batch_size = input_shape[0]
+      fixed_batch_size = input_shape.dims[0]
 
       flat_inputs = nest.flatten(inputs)
       for flat_input in flat_inputs:
         input_shape = flat_input.get_shape().with_rank_at_least(2)
-        batch_size, input_size = input_shape[0], input_shape[1:]
+        batch_size, input_size = tensor_shape.dimension_at_index(
+            input_shape, 0), input_shape[1:]
         fixed_batch_size.merge_with(batch_size)
-        for i, size in enumerate(input_size):
-          if size.value is None:
+        for i, size in enumerate(input_size.dims):
+          if tensor_shape.dimension_value(size) is None:
             raise ValueError(
                 "Input size (dimension %d of inputs) must be accessible via "
                 "shape inference, but saw value None." % i)
     else:
       fixed_batch_size = first_input.get_shape().with_rank_at_least(1)[0]
 
-    if fixed_batch_size.value:
-      batch_size = fixed_batch_size.value
+    if tensor_shape.dimension_value(fixed_batch_size):
+      batch_size = tensor_shape.dimension_value(fixed_batch_size)
     else:
       batch_size = array_ops.shape(first_input)[0]
     if initial_state is not None:
@@ -1333,7 +1345,7 @@ def static_rnn(cell,
     if sequence_length is not None:  # Prepare variables
       sequence_length = ops.convert_to_tensor(
           sequence_length, name="sequence_length")
-      if sequence_length.get_shape().ndims not in (None, 1):
+      if sequence_length.get_shape().rank not in (None, 1):
         raise ValueError(
             "sequence_length must be a vector of length batch_size")
 
@@ -1342,7 +1354,9 @@ def static_rnn(cell,
         size = _concat(batch_size, output_size)
         output = array_ops.zeros(
             array_ops.stack(size), _infer_state_dtype(dtype, state))
-        shape = _concat(fixed_batch_size.value, output_size, static=True)
+        shape = _concat(tensor_shape.dimension_value(fixed_batch_size),
+                        output_size,
+                        static=True)
         output.set_shape(tensor_shape.TensorShape(shape))
         return output
 
@@ -1479,7 +1493,10 @@ def static_state_saving_rnn(cell,
   return (outputs, state)
 
 
-@tf_export("nn.static_bidirectional_rnn")
+@deprecation.deprecated(None, "Please use `keras.layers.Bidirectional("
+                        "keras.layers.RNN(cell, unroll=True))`, which is "
+                        "equivalent to this API")
+@tf_export(v1=["nn.static_bidirectional_rnn"])
 def static_bidirectional_rnn(cell_fw,
                              cell_bw,
                              inputs,
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index dd4f3d7a99793390de26d14a31a44c26ba1cdbb7..ffc45619a74e9b527047f3e55e94664581cb6591 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -36,6 +36,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
@@ -276,11 +277,11 @@ class RNNCell(base_layer.Layer):
               batch_size, partial=True)
         else:
           static_batch_size = batch_size
-        if inputs.shape[0].value != static_batch_size:
+        if inputs.shape.dims[0].value != static_batch_size:
           raise ValueError(
               "batch size from input tensor is different from the "
               "input param. Input tensor batch: {}, batch_size: {}".format(
-                  inputs.shape[0].value, batch_size))
+                  inputs.shape.dims[0].value, batch_size))
 
       if dtype is not None and inputs.dtype != dtype:
         raise ValueError(
@@ -288,7 +289,7 @@ class RNNCell(base_layer.Layer):
             "input param. Input tensor dtype: {}, dtype: {}".format(
                 inputs.dtype, dtype))
 
-      batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
+      batch_size = inputs.shape.dims[0].value or array_ops.shape(inputs)[0]
       dtype = inputs.dtype
     if None in [batch_size, dtype]:
       raise ValueError(
@@ -410,7 +411,7 @@ class BasicRNNCell(LayerRNNCell):
                    "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     if activation:
@@ -462,7 +463,7 @@ class BasicRNNCell(LayerRNNCell):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export("nn.rnn_cell.GRUCell")
+@tf_export(v1=["nn.rnn_cell.GRUCell"])
 class GRUCell(LayerRNNCell):
   """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).
 
@@ -488,6 +489,8 @@ class GRUCell(LayerRNNCell):
       `trainable` etc when constructing the cell from configs of get_config().
   """
 
+  @deprecated(None, "This class is equivalent as tf.keras.layers.GRUCell,"
+                    " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self,
                num_units,
                activation=None,
@@ -505,7 +508,7 @@ class GRUCell(LayerRNNCell):
                    "Please use tf.contrib.cudnn_rnn.CudnnGRU for better "
                    "performance on GPU.", self)
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     if activation:
@@ -610,8 +613,7 @@ class LSTMStateTuple(_LSTMStateTuple):
     return c.dtype
 
 
-# TODO(scottzhu): Stop exporting this class in TF 2.0.
-@tf_export("nn.rnn_cell.BasicLSTMCell")
+@tf_export(v1=["nn.rnn_cell.BasicLSTMCell"])
 class BasicLSTMCell(LayerRNNCell):
   """DEPRECATED: Please use `tf.nn.rnn_cell.LSTMCell` instead.
 
@@ -634,10 +636,8 @@ class BasicLSTMCell(LayerRNNCell):
   better performance on CPU.
   """
 
-  @deprecated(None, "This class is deprecated, please use "
-                    "tf.nn.rnn_cell.LSTMCell, which supports all the feature "
-                    "this cell currently has. Please replace the existing code "
-                    "with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').")
+  @deprecated(None, "This class is equivalent as tf.keras.layers.LSTMCell,"
+                    " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self,
                num_units,
                forget_bias=1.0,
@@ -684,7 +684,7 @@ class BasicLSTMCell(LayerRNNCell):
                    "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._forget_bias = forget_bias
@@ -779,7 +779,7 @@ class BasicLSTMCell(LayerRNNCell):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export("nn.rnn_cell.LSTMCell")
+@tf_export(v1=["nn.rnn_cell.LSTMCell"])
 class LSTMCell(LayerRNNCell):
   """Long short-term memory unit (LSTM) recurrent network cell.
 
@@ -807,6 +807,8 @@ class LSTMCell(LayerRNNCell):
   better performance on CPU.
   """
 
+  @deprecated(None, "This class is equivalent as tf.keras.layers.LSTMCell,"
+                    " and will be replaced by that in Tensorflow 2.0.")
   def __init__(self, num_units,
                use_peepholes=False, cell_clip=None,
                initializer=None, num_proj=None, proj_clip=None,
@@ -870,7 +872,7 @@ class LSTMCell(LayerRNNCell):
                    "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._use_peepholes = use_peepholes
@@ -986,8 +988,8 @@ class LSTMCell(LayerRNNCell):
       c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
       m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
 
-    input_size = inputs.get_shape().with_rank(2)[1]
-    if input_size.value is None:
+    input_size = inputs.get_shape().with_rank(2).dims[1].value
+    if input_size is None:
       raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
 
     # i = input_gate, j = new_input, f = forget_gate, o = output_gate
@@ -1393,7 +1395,7 @@ class DeviceWrapper(RNNCell):
       return self._cell(inputs, state, scope=scope)
 
 
-@tf_export("nn.rnn_cell.MultiRNNCell")
+@tf_export(v1=["nn.rnn_cell.MultiRNNCell"])
 class MultiRNNCell(RNNCell):
   """RNN cell composed sequentially of multiple simple cells.
 
@@ -1406,6 +1408,9 @@ class MultiRNNCell(RNNCell):
   ```
   """
 
+  @deprecated(None, "This class is equivalent as "
+                    "tf.keras.layers.StackedRNNCells, and will be replaced by "
+                    "that in Tensorflow 2.0.")
   def __init__(self, cells, state_is_tuple=True):
     """Create a RNN cell composed sequentially of a number of RNNCells.
 
@@ -1451,7 +1456,7 @@ class MultiRNNCell(RNNCell):
     if self._state_is_tuple:
       return tuple(cell.state_size for cell in self._cells)
     else:
-      return sum([cell.state_size for cell in self._cells])
+      return sum(cell.state_size for cell in self._cells)
 
   @property
   def output_size(self):
@@ -1466,6 +1471,30 @@ class MultiRNNCell(RNNCell):
         # presumably does not contain TensorArrays or anything else fancy
         return super(MultiRNNCell, self).zero_state(batch_size, dtype)
 
+  @property
+  def trainable_weights(self):
+    if not self.trainable:
+      return []
+    weights = []
+    for cell in self._cells:
+      if isinstance(cell, base_layer.Layer):
+        weights += cell.trainable_weights
+    return weights
+
+  @property
+  def non_trainable_weights(self):
+    weights = []
+    for cell in self._cells:
+      if isinstance(cell, base_layer.Layer):
+        weights += cell.non_trainable_weights
+    if not self.trainable:
+      trainable_weights = []
+      for cell in self._cells:
+        if isinstance(cell, base_layer.Layer):
+          trainable_weights += cell.trainable_weights
+      return trainable_weights + weights
+    return weights
+
   def call(self, inputs, state):
     """Run this multi-layer cell on inputs, starting from state."""
     cur_state_pos = 0
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 2ec4b540fb612f776da83d55d011db66ad4e3bbe..a5b31aff91660a6ac79c980dffb543e87fd40dfa 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_script_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -305,13 +306,14 @@ def _EagerPyFuncGrad(op, *dy):
         is_grad_func=True)
 
 
+@tf_export("py_function")
 def eager_py_func(func, inp, Tout, name=None):
   """Wraps a python function into a TensorFlow op that executes it eagerly.
 
   This function allows expressing computations in a TensorFlow graph as
   Python functions. In particular, it wraps a Python function `func`
   in a once-differentiable TensorFlow operation that executes it with eager
-  exeuction enabled. As a consequence, `tf.contrib.eager.py_func` makes it
+  execution enabled. As a consequence, `tf.contrib.eager.py_func` makes it
   possible to express control flow using Python constructs (`if`, `while`,
   `for`, etc.), instead of TensorFlow control flow constructs (`tf.cond`,
   `tf.while_loop`). For example, you might use `tf.contrib.eager.py_func` to
@@ -387,7 +389,16 @@ def eager_py_func(func, inp, Tout, name=None):
   return _internal_py_func(func=func, inp=inp, Tout=Tout, eager=True, name=name)
 
 
-@tf_export("py_func")
+@deprecation.deprecated(
+    date=None,
+    instructions="""tf.py_func is deprecated in TF V2. Instead, use
+    tf.py_function, which takes a python function which manipulates tf eager
+    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
+    an ndarray (just call tensor.numpy()) but having access to eager tensors
+    means `tf.py_function`s can use accelerators such as GPUs as well as
+    being differentiable using a gradient tape.
+    """)
+@tf_export(v1=["py_func"])
 def py_func(func, inp, Tout, stateful=True, name=None):
   """Wraps a python function and uses it as a TensorFlow op.
 
diff --git a/tensorflow/python/ops/sdca_ops.py b/tensorflow/python/ops/sdca_ops.py
index 4d5aeec59125a53866195aef2c09a314d9961836..a1c68343edd866d6987393edb0da3af53536a155 100644
--- a/tensorflow/python/ops/sdca_ops.py
+++ b/tensorflow/python/ops/sdca_ops.py
@@ -29,4 +29,5 @@ from tensorflow.python.ops.gen_sdca_ops import *
 
 ops.NotDifferentiable("SdcaFprint")
 ops.NotDifferentiable("SdcaOptimizer")
+ops.NotDifferentiable("SdcaOptimizerV2")
 ops.NotDifferentiable("SdcaShrinkL1")
diff --git a/tensorflow/python/ops/session_ops.py b/tensorflow/python/ops/session_ops.py
index e229501c10f30792841953a39abc5b3cf943af96..c6cf2fe9adf58bef84ec677466f01bb16dd61f8b 100644
--- a/tensorflow/python/ops/session_ops.py
+++ b/tensorflow/python/ops/session_ops.py
@@ -13,11 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Tensor Handle Operations.
-
-See the [Session Ops](https://tensorflow.org/api_guides/python/session_ops)
-guide.
-"""
+"""Tensor Handle Operations."""
 
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
@@ -140,7 +136,7 @@ class TensorHandle(object):
     return feeder.op.name + ";" + TensorHandle._get_reader_key(handle)
 
 
-@tf_export("get_session_handle")
+@tf_export(v1=["get_session_handle"])
 def get_session_handle(data, name=None):
   """Return the handle of `data`.
 
@@ -183,7 +179,7 @@ def get_session_handle(data, name=None):
     return gen_data_flow_ops.get_session_handle(data, name=name)
 
 
-@tf_export("get_session_tensor")
+@tf_export(v1=["get_session_tensor"])
 def get_session_tensor(handle, dtype, name=None):
   """Get the tensor of type `dtype` by feeding a tensor handle.
 
@@ -224,7 +220,7 @@ def get_session_tensor(handle, dtype, name=None):
   return (holder, tensor)
 
 
-@tf_export("delete_session_tensor")
+@tf_export(v1=["delete_session_tensor"])
 def delete_session_tensor(handle, name=None):
   """Delete the tensor for the given tensor handle.
 
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py
index 21e08d03d213c173d12dfc6676fe7f009811e93f..ee9c9b6bc0b36a374957178653eaae4c91ad733c 100644
--- a/tensorflow/python/ops/sets_impl.py
+++ b/tensorflow/python/ops/sets_impl.py
@@ -31,7 +31,7 @@ _VALID_DTYPES = set([
     dtypes.uint8, dtypes.uint16, dtypes.string])
 
 
-@tf_export("sets.set_size")
+@tf_export("sets.size", v1=["sets.size", "sets.set_size"])
 def set_size(a, validate_indices=True):
   """Compute number of unique elements along last dimension of `a`.
 
@@ -133,7 +133,8 @@ def _set_operation(a, b, set_operation, validate_indices=True):
   return sparse_tensor.SparseTensor(indices, values, shape)
 
 
-@tf_export("sets.set_intersection")
+@tf_export(
+    "sets.intersection", v1=["sets.intersection", "sets.set_intersection"])
 def set_intersection(a, b, validate_indices=True):
   """Compute set intersection of elements in last dimension of `a` and `b`.
 
@@ -200,7 +201,8 @@ def set_intersection(a, b, validate_indices=True):
   return _set_operation(a, b, "intersection", validate_indices)
 
 
-@tf_export("sets.set_difference")
+@tf_export(
+	   "sets.difference", v1=["sets.difference", "sets.set_difference"])
 def set_difference(a, b, aminusb=True, validate_indices=True):
   """Compute set difference of elements in last dimension of `a` and `b`.
 
@@ -271,7 +273,8 @@ def set_difference(a, b, aminusb=True, validate_indices=True):
   return _set_operation(a, b, "a-b" if aminusb else "b-a", validate_indices)
 
 
-@tf_export("sets.set_union")
+@tf_export(
+	   "sets.union", v1=["sets.union", "sets.set_union"])
 def set_union(a, b, validate_indices=True):
   """Compute set union of elements in last dimension of `a` and `b`.
 
diff --git a/tensorflow/python/ops/signal/BUILD b/tensorflow/python/ops/signal/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..da2bf9c1d2d73aeae8dd2d61c4e690bb1ab93b70
--- /dev/null
+++ b/tensorflow/python/ops/signal/BUILD
@@ -0,0 +1,36 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "signal",
+    srcs = [
+        "dct_ops.py",
+        "fft_ops.py",
+        "mel_ops.py",
+        "mfcc_ops.py",
+        "reconstruction_ops.py",
+        "shape_ops.py",
+        "signal.py",
+        "spectral_ops.py",
+        "util_ops.py",
+        "window_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:spectral_ops_gen",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/python/ops/signal/dct_ops.py b/tensorflow/python/ops/signal/dct_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d042c95c049538354836ef83f0b21d8babccedc8
--- /dev/null
+++ b/tensorflow/python/ops/signal/dct_ops.py
@@ -0,0 +1,192 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Discrete Cosine Transform ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math as _math
+
+from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.framework import ops as _ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops as _array_ops
+from tensorflow.python.ops import math_ops as _math_ops
+from tensorflow.python.ops.signal import fft_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _validate_dct_arguments(input_tensor, dct_type, n, axis, norm):
+  """Checks that DCT/IDCT arguments are compatible and well formed."""
+  if n is not None:
+    raise NotImplementedError("The DCT length argument is not implemented.")
+  if axis != -1:
+    raise NotImplementedError("axis must be -1. Got: %s" % axis)
+  if dct_type not in (1, 2, 3):
+    raise ValueError("Only Types I, II and III (I)DCT are supported.")
+  if dct_type == 1:
+    if norm == "ortho":
+      raise ValueError("Normalization is not supported for the Type-I DCT.")
+    if input_tensor.shape[-1] is not None and input_tensor.shape[-1] < 2:
+      raise ValueError(
+          "Type-I DCT requires the dimension to be greater than one.")
+
+  if norm not in (None, "ortho"):
+    raise ValueError(
+        "Unknown normalization. Expected None or 'ortho', got: %s" % norm)
+
+
+# TODO(rjryan): Implement `n` and `axis` parameters.
+@tf_export("signal.dct", v1=["signal.dct", "spectral.dct"])
+def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
+  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.
+
+  Currently only Types I, II and III are supported.
+  Type I is implemented using a length `2N` padded `tf.spectral.rfft`.
+  Type II is implemented using a length `2N` padded `tf.spectral.rfft`, as
+  described here:
+  https://dsp.stackexchange.com/a/10606.
+  Type III is a fairly straightforward inverse of Type II
+  (i.e. using a length `2N` padded `tf.spectral.irfft`).
+
+  @compatibility(scipy)
+  Equivalent to scipy.fftpack.dct for Type-I, Type-II and Type-III DCT.
+  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
+  @end_compatibility
+
+  Args:
+    input: A `[..., samples]` `float32` `Tensor` containing the signals to
+      take the DCT of.
+    type: The DCT type to perform. Must be 1, 2 or 3.
+    n: For future expansion. The length of the transform. Must be `None`.
+    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
+    norm: The normalization to apply. `None` for no normalization or `'ortho'`
+      for orthonormal normalization.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.
+
+  Raises:
+    ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is
+      not `-1`, or `norm` is not `None` or `'ortho'`.
+    ValueError: If `type` is `1` and `norm` is `ortho`.
+
+  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
+  """
+  _validate_dct_arguments(input, type, n, axis, norm)
+  with _ops.name_scope(name, "dct", [input]):
+    # We use the RFFT to compute the DCT and TensorFlow only supports float32
+    # for FFTs at the moment.
+    input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)
+
+    axis_dim = (tensor_shape.dimension_value(input.shape[-1])
+                or _array_ops.shape(input)[-1])
+    axis_dim_float = _math_ops.to_float(axis_dim)
+
+    if type == 1:
+      dct1_input = _array_ops.concat([input, input[..., -2:0:-1]], axis=-1)
+      dct1 = _math_ops.real(fft_ops.rfft(dct1_input))
+      return dct1
+
+    if type == 2:
+      scale = 2.0 * _math_ops.exp(
+          _math_ops.complex(
+              0.0, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
+              axis_dim_float))
+
+      # TODO(rjryan): Benchmark performance and memory usage of the various
+      # approaches to computing a DCT via the RFFT.
+      dct2 = _math_ops.real(
+          fft_ops.rfft(
+              input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale)
+
+      if norm == "ortho":
+        n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
+        n2 = n1 * _math_ops.sqrt(2.0)
+        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
+        weights = _array_ops.pad(
+            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
+            constant_values=n2)
+        dct2 *= weights
+
+      return dct2
+
+    elif type == 3:
+      if norm == "ortho":
+        n1 = _math_ops.sqrt(axis_dim_float)
+        n2 = n1 * _math_ops.sqrt(0.5)
+        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
+        weights = _array_ops.pad(
+            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
+            constant_values=n2)
+        input *= weights
+      else:
+        input *= axis_dim_float
+      scale = 2.0 * _math_ops.exp(
+          _math_ops.complex(
+              0.0,
+              _math_ops.range(axis_dim_float) * _math.pi * 0.5 /
+              axis_dim_float))
+      dct3 = _math_ops.real(
+          fft_ops.irfft(
+              scale * _math_ops.complex(input, 0.0),
+              fft_length=[2 * axis_dim]))[..., :axis_dim]
+
+      return dct3
+
+
+# TODO(rjryan): Implement `n` and `axis` parameters.
+@tf_export("signal.idct", v1=["signal.idct", "spectral.idct"])
+def idct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
+  """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`.
+
+  Currently only Types I, II and III are supported. Type III is the inverse of
+  Type II, and vice versa.
+
+  Note that you must re-normalize by 1/(2n) to obtain an inverse if `norm` is
+  not `'ortho'`. That is:
+  `signal == idct(dct(signal)) * 0.5 / signal.shape[-1]`.
+  When `norm='ortho'`, we have:
+  `signal == idct(dct(signal, norm='ortho'), norm='ortho')`.
+
+  @compatibility(scipy)
+  Equivalent to scipy.fftpack.idct for Type-I, Type-II and Type-III DCT.
+  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.idct.html
+  @end_compatibility
+
+  Args:
+    input: A `[..., samples]` `float32` `Tensor` containing the signals to take
+      the DCT of.
+    type: The IDCT type to perform. Must be 1, 2 or 3.
+    n: For future expansion. The length of the transform. Must be `None`.
+    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
+    norm: The normalization to apply. `None` for no normalization or `'ortho'`
+      for orthonormal normalization.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., samples]` `float32` `Tensor` containing the IDCT of `input`.
+
+  Raises:
+    ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is
+      not `-1`, or `norm` is not `None` or `'ortho'`.
+
+  [idct]:
+  https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms
+  """
+  _validate_dct_arguments(input, type, n, axis, norm)
+  inverse_type = {1: 1, 2: 3, 3: 2}[type]
+  return dct(input, type=inverse_type, n=n, axis=axis, norm=norm, name=name)
diff --git a/tensorflow/python/ops/signal/fft_ops.py b/tensorflow/python/ops/signal/fft_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d14b2bbd75864b6477bccc5cef562b617674c08
--- /dev/null
+++ b/tensorflow/python/ops/signal/fft_ops.py
@@ -0,0 +1,330 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Fast-Fourier Transform ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.framework import ops as _ops
+from tensorflow.python.framework import tensor_util as _tensor_util
+from tensorflow.python.ops import array_ops as _array_ops
+from tensorflow.python.ops import gen_spectral_ops
+from tensorflow.python.ops import math_ops as _math_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _infer_fft_length_for_rfft(input_tensor, fft_rank):
+  """Infers the `fft_length` argument for a `rank` RFFT from `input_tensor`."""
+  # A TensorShape for the inner fft_rank dimensions.
+  fft_shape = input_tensor.get_shape()[-fft_rank:]
+
+  # If any dim is unknown, fall back to tensor-based math.
+  if not fft_shape.is_fully_defined():
+    return _array_ops.shape(input_tensor)[-fft_rank:]
+
+  # Otherwise, return a constant.
+  return _ops.convert_to_tensor(fft_shape.as_list(), _dtypes.int32)
+
+
+def _infer_fft_length_for_irfft(input_tensor, fft_rank):
+  """Infers the `fft_length` argument for a `rank` IRFFT from `input_tensor`."""
+  # A TensorShape for the inner fft_rank dimensions.
+  fft_shape = input_tensor.get_shape()[-fft_rank:]
+
+  # If any dim is unknown, fall back to tensor-based math.
+  if not fft_shape.is_fully_defined():
+    fft_length = _array_ops.unstack(_array_ops.shape(input_tensor)[-fft_rank:])
+    fft_length[-1] = _math_ops.maximum(0, 2 * (fft_length[-1] - 1))
+    return _array_ops.stack(fft_length)
+
+  # Otherwise, return a constant.
+  fft_length = fft_shape.as_list()
+  if fft_length:
+    fft_length[-1] = max(0, 2 * (fft_length[-1] - 1))
+  return _ops.convert_to_tensor(fft_length, _dtypes.int32)
+
+
+def _maybe_pad_for_rfft(input_tensor, fft_rank, fft_length, is_reverse=False):
+  """Pads `input_tensor` to `fft_length` on its inner-most `fft_rank` dims."""
+  fft_shape = _tensor_util.constant_value_as_shape(fft_length)
+
+  # Edge case: skip padding empty tensors.
+  if (input_tensor.shape.ndims is not None and
+      any(dim.value == 0 for dim in input_tensor.shape.dims)):
+    return input_tensor
+
+  # If we know the shapes ahead of time, we can either skip or pre-compute the
+  # appropriate paddings. Otherwise, fall back to computing paddings in
+  # TensorFlow.
+  if fft_shape.is_fully_defined() and input_tensor.shape.ndims is not None:
+    # Slice the last FFT-rank dimensions from input_tensor's shape.
+    input_fft_shape = input_tensor.shape[-fft_shape.ndims:]
+
+    if input_fft_shape.is_fully_defined():
+      # In reverse, we only pad the inner-most dimension to fft_length / 2 + 1.
+      if is_reverse:
+        fft_shape = fft_shape[:-1].concatenate(
+            fft_shape.dims[-1].value // 2 + 1)
+
+      paddings = [[0, max(fft_dim.value - input_dim.value, 0)]
+                  for fft_dim, input_dim in zip(
+                      fft_shape.dims, input_fft_shape.dims)]
+      if any(pad > 0 for _, pad in paddings):
+        outer_paddings = [[0, 0]] * max((input_tensor.shape.ndims -
+                                         fft_shape.ndims), 0)
+        return _array_ops.pad(input_tensor, outer_paddings + paddings)
+      return input_tensor
+
+  # If we can't determine the paddings ahead of time, then we have to pad. If
+  # the paddings end up as zero, tf.pad has a special-case that does no work.
+  input_rank = _array_ops.rank(input_tensor)
+  input_fft_shape = _array_ops.shape(input_tensor)[-fft_rank:]
+  outer_dims = _math_ops.maximum(0, input_rank - fft_rank)
+  outer_paddings = _array_ops.zeros([outer_dims], fft_length.dtype)
+  # In reverse, we only pad the inner-most dimension to fft_length / 2 + 1.
+  if is_reverse:
+    fft_length = _array_ops.concat([fft_length[:-1],
+                                    fft_length[-1:] // 2 + 1], 0)
+  fft_paddings = _math_ops.maximum(0, fft_length - input_fft_shape)
+  paddings = _array_ops.concat([outer_paddings, fft_paddings], 0)
+  paddings = _array_ops.stack([_array_ops.zeros_like(paddings), paddings],
+                              axis=1)
+  return _array_ops.pad(input_tensor, paddings)
+
+
+def _rfft_wrapper(fft_fn, fft_rank, default_name):
+  """Wrapper around gen_spectral_ops.rfft* that infers fft_length argument."""
+
+  def _rfft(input_tensor, fft_length=None, name=None):
+    """Wrapper around gen_spectral_ops.rfft* that infers fft_length argument."""
+    with _ops.name_scope(name, default_name,
+                         [input_tensor, fft_length]) as name:
+      input_tensor = _ops.convert_to_tensor(input_tensor, _dtypes.float32)
+      input_tensor.shape.with_rank_at_least(fft_rank)
+      if fft_length is None:
+        fft_length = _infer_fft_length_for_rfft(input_tensor, fft_rank)
+      else:
+        fft_length = _ops.convert_to_tensor(fft_length, _dtypes.int32)
+      input_tensor = _maybe_pad_for_rfft(input_tensor, fft_rank, fft_length)
+      return fft_fn(input_tensor, fft_length, name)
+  _rfft.__doc__ = fft_fn.__doc__
+  return _rfft
+
+
+def _irfft_wrapper(ifft_fn, fft_rank, default_name):
+  """Wrapper around gen_spectral_ops.irfft* that infers fft_length argument."""
+
+  def _irfft(input_tensor, fft_length=None, name=None):
+    """Wrapper irfft* that infers fft_length argument."""
+    with _ops.name_scope(name, default_name,
+                         [input_tensor, fft_length]) as name:
+      input_tensor = _ops.convert_to_tensor(input_tensor, _dtypes.complex64)
+      input_tensor.shape.with_rank_at_least(fft_rank)
+      if fft_length is None:
+        fft_length = _infer_fft_length_for_irfft(input_tensor, fft_rank)
+      else:
+        fft_length = _ops.convert_to_tensor(fft_length, _dtypes.int32)
+      input_tensor = _maybe_pad_for_rfft(input_tensor, fft_rank, fft_length,
+                                         is_reverse=True)
+      return ifft_fn(input_tensor, fft_length, name)
+  _irfft.__doc__ = ifft_fn.__doc__
+  return _irfft
+
+
+# FFT/IFFT 1/2/3D are exported via
+# third_party/tensorflow/core/api_def/python_api/
+fft = gen_spectral_ops.fft
+ifft = gen_spectral_ops.ifft
+fft2d = gen_spectral_ops.fft2d
+ifft2d = gen_spectral_ops.ifft2d
+fft3d = gen_spectral_ops.fft3d
+ifft3d = gen_spectral_ops.ifft3d
+rfft = _rfft_wrapper(gen_spectral_ops.rfft, 1, "rfft")
+tf_export("signal.rfft", v1=["signal.rfft", "spectral.rfft"])(rfft)
+irfft = _irfft_wrapper(gen_spectral_ops.irfft, 1, "irfft")
+tf_export("signal.irfft", v1=["signal.irfft", "spectral.irfft"])(irfft)
+rfft2d = _rfft_wrapper(gen_spectral_ops.rfft2d, 2, "rfft2d")
+tf_export("signal.rfft2d", v1=["signal.rfft2d", "spectral.rfft2d"])(rfft2d)
+irfft2d = _irfft_wrapper(gen_spectral_ops.irfft2d, 2, "irfft2d")
+tf_export("signal.irfft2d", v1=["signal.irfft2d", "spectral.irfft2d"])(irfft2d)
+rfft3d = _rfft_wrapper(gen_spectral_ops.rfft3d, 3, "rfft3d")
+tf_export("signal.rfft3d", v1=["signal.rfft3d", "spectral.rfft3d"])(rfft3d)
+irfft3d = _irfft_wrapper(gen_spectral_ops.irfft3d, 3, "irfft3d")
+tf_export("signal.irfft3d", v1=["signal.irfft3d", "spectral.irfft3d"])(irfft3d)
+
+
+def _fft_size_for_grad(grad, rank):
+  return _math_ops.reduce_prod(_array_ops.shape(grad)[-rank:])
+
+
+@_ops.RegisterGradient("FFT")
+def _fft_grad(_, grad):
+  size = _math_ops.cast(_fft_size_for_grad(grad, 1), grad.dtype)
+  return ifft(grad) * size
+
+
+@_ops.RegisterGradient("IFFT")
+def _ifft_grad(_, grad):
+  rsize = _math_ops.cast(
+      1. / _math_ops.cast(_fft_size_for_grad(grad, 1), grad.dtype.real_dtype),
+      grad.dtype)
+  return fft(grad) * rsize
+
+
+@_ops.RegisterGradient("FFT2D")
+def _fft2d_grad(_, grad):
+  size = _math_ops.cast(_fft_size_for_grad(grad, 2), grad.dtype)
+  return ifft2d(grad) * size
+
+
+@_ops.RegisterGradient("IFFT2D")
+def _ifft2d_grad(_, grad):
+  rsize = _math_ops.cast(
+      1. / _math_ops.cast(_fft_size_for_grad(grad, 2), grad.dtype.real_dtype),
+      grad.dtype)
+  return fft2d(grad) * rsize
+
+
+@_ops.RegisterGradient("FFT3D")
+def _fft3d_grad(_, grad):
+  size = _math_ops.cast(_fft_size_for_grad(grad, 3), grad.dtype)
+  return ifft3d(grad) * size
+
+
+@_ops.RegisterGradient("IFFT3D")
+def _ifft3d_grad(_, grad):
+  rsize = _math_ops.cast(
+      1. / _math_ops.cast(_fft_size_for_grad(grad, 3), grad.dtype.real_dtype),
+      grad.dtype)
+  return fft3d(grad) * rsize
+
+
+def _rfft_grad_helper(rank, irfft_fn):
+  """Returns a gradient function for an RFFT of the provided rank."""
+  # Can't happen because we don't register a gradient for RFFT3D.
+  assert rank in (1, 2), "Gradient for RFFT3D is not implemented."
+
+  def _grad(op, grad):
+    """A gradient function for RFFT with the provided `rank` and `irfft_fn`."""
+    fft_length = op.inputs[1]
+    input_shape = _array_ops.shape(op.inputs[0])
+    is_even = _math_ops.cast(1 - (fft_length[-1] % 2), _dtypes.complex64)
+
+    def _tile_for_broadcasting(matrix, t):
+      expanded = _array_ops.reshape(
+          matrix,
+          _array_ops.concat([
+              _array_ops.ones([_array_ops.rank(t) - 2], _dtypes.int32),
+              _array_ops.shape(matrix)
+          ], 0))
+      return _array_ops.tile(
+          expanded, _array_ops.concat([_array_ops.shape(t)[:-2], [1, 1]], 0))
+
+    def _mask_matrix(length):
+      """Computes t_n = exp(sqrt(-1) * pi * n^2 / line_len)."""
+      # TODO(rjryan): Speed up computation of twiddle factors using the
+      # following recurrence relation and cache them across invocations of RFFT.
+      #
+      # t_n = exp(sqrt(-1) * pi * n^2 / line_len)
+      # for n = 0, 1,..., line_len-1.
+      # For n > 2, use t_n = t_{n-1}^2 / t_{n-2} * t_1^2
+      a = _array_ops.tile(
+          _array_ops.expand_dims(_math_ops.range(length), 0), (length, 1))
+      b = _array_ops.transpose(a, [1, 0])
+      return _math_ops.exp(
+          -2j * np.pi * _math_ops.cast(a * b, _dtypes.complex64) /
+          _math_ops.cast(length, _dtypes.complex64))
+
+    def _ymask(length):
+      """A sequence of [1+0j, -1+0j, 1+0j, -1+0j, ...] with length `length`."""
+      return _math_ops.cast(1 - 2 * (_math_ops.range(length) % 2),
+                            _dtypes.complex64)
+
+    y0 = grad[..., 0:1]
+    if rank == 1:
+      ym = grad[..., -1:]
+      extra_terms = y0 + is_even * ym * _ymask(input_shape[-1])
+    elif rank == 2:
+      # Create a mask matrix for y0 and ym.
+      base_mask = _mask_matrix(input_shape[-2])
+
+      # Tile base_mask to match y0 in shape so that we can batch-matmul the
+      # inner 2 dimensions.
+      tiled_mask = _tile_for_broadcasting(base_mask, y0)
+
+      y0_term = _math_ops.matmul(tiled_mask, _math_ops.conj(y0))
+      extra_terms = y0_term
+
+      ym = grad[..., -1:]
+      ym_term = _math_ops.matmul(tiled_mask, _math_ops.conj(ym))
+
+      inner_dim = input_shape[-1]
+      ym_term = _array_ops.tile(
+          ym_term,
+          _array_ops.concat([
+              _array_ops.ones([_array_ops.rank(grad) - 1], _dtypes.int32),
+              [inner_dim]
+          ], 0)) * _ymask(inner_dim)
+
+      extra_terms += is_even * ym_term
+
+    # The gradient of RFFT is the IRFFT of the incoming gradient times a scaling
+    # factor, plus some additional terms to make up for the components dropped
+    # due to Hermitian symmetry.
+    input_size = _math_ops.to_float(_fft_size_for_grad(op.inputs[0], rank))
+    the_irfft = irfft_fn(grad, fft_length)
+    return 0.5 * (the_irfft * input_size + _math_ops.real(extra_terms)), None
+
+  return _grad
+
+
+def _irfft_grad_helper(rank, rfft_fn):
+  """Returns a gradient function for an IRFFT of the provided rank."""
+  # Can't happen because we don't register a gradient for IRFFT3D.
+  assert rank in (1, 2), "Gradient for IRFFT3D is not implemented."
+
+  def _grad(op, grad):
+    """A gradient function for IRFFT with the provided `rank` and `rfft_fn`."""
+    # Generate a simple mask like [1.0, 2.0, ..., 2.0, 1.0] for even-length FFTs
+    # and [1.0, 2.0, ..., 2.0] for odd-length FFTs. To reduce extra ops in the
+    # graph we special-case the situation where the FFT length and last
+    # dimension of the input are known at graph construction time.
+    fft_length = op.inputs[1]
+    is_odd = _math_ops.mod(fft_length[-1], 2)
+    input_last_dimension = _array_ops.shape(op.inputs[0])[-1]
+    mask = _array_ops.concat(
+        [[1.0], 2.0 * _array_ops.ones([input_last_dimension - 2 + is_odd]),
+         _array_ops.ones([1 - is_odd])], 0)
+
+    rsize = _math_ops.reciprocal(_math_ops.to_float(
+        _fft_size_for_grad(grad, rank)))
+
+    # The gradient of IRFFT is the RFFT of the incoming gradient times a scaling
+    # factor and a mask. The mask scales the gradient for the Hermitian
+    # symmetric components of the RFFT by a factor of two, since these
+    # components are de-duplicated in the RFFT.
+    the_rfft = rfft_fn(grad, fft_length)
+    return the_rfft * _math_ops.cast(rsize * mask, _dtypes.complex64), None
+
+  return _grad
+
+
+_ops.RegisterGradient("RFFT")(_rfft_grad_helper(1, irfft))
+_ops.RegisterGradient("IRFFT")(_irfft_grad_helper(1, rfft))
+_ops.RegisterGradient("RFFT2D")(_rfft_grad_helper(2, irfft2d))
+_ops.RegisterGradient("IRFFT2D")(_irfft_grad_helper(2, rfft2d))
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/python/ops/signal/mel_ops.py
similarity index 98%
rename from tensorflow/contrib/signal/python/ops/mel_ops.py
rename to tensorflow/python/ops/signal/mel_ops.py
index ecc2fedb9f82151511bab3f3c0496bc4e290903f..6488e1df59b4a0bd801ebb23dc3b5ea5b31e00c2 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/python/ops/signal/mel_ops.py
@@ -18,11 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.signal.python.ops import shape_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import shape_ops
+from tensorflow.python.util.tf_export import tf_export
+
 
 # mel spectrum constants.
 _MEL_BREAK_FREQUENCY_HERTZ = 700.0
@@ -85,6 +87,7 @@ def _validate_arguments(num_mel_bins, sample_rate,
     raise ValueError('dtype must be a floating point type. Got: %s' % dtype)
 
 
+@tf_export('signal.linear_to_mel_weight_matrix')
 def linear_to_mel_weight_matrix(num_mel_bins=20,
                                 num_spectrogram_bins=129,
                                 sample_rate=8000,
diff --git a/tensorflow/contrib/signal/python/ops/mfcc_ops.py b/tensorflow/python/ops/signal/mfcc_ops.py
similarity index 88%
rename from tensorflow/contrib/signal/python/ops/mfcc_ops.py
rename to tensorflow/python/ops/signal/mfcc_ops.py
index 4e842f7f10ae07448cc07e5f636ae80a820e656f..601409dea901f34cca02861971850c3238378163 100644
--- a/tensorflow/contrib/signal/python/ops/mfcc_ops.py
+++ b/tensorflow/python/ops/signal/mfcc_ops.py
@@ -22,9 +22,11 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.signal import dct_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('signal.mfccs_from_log_mel_spectrograms')
 def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
   """Computes [MFCCs][mfcc] of `log_mel_spectrograms`.
 
@@ -48,14 +50,14 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
   pcm = tf.placeholder(tf.float32, [None, None])
 
   # A 1024-point STFT with frames of 64 ms and 75% overlap.
-  stfts = tf.contrib.signal.stft(pcm, frame_length=1024, frame_step=256,
-                                 fft_length=1024)
+  stfts = tf.signal.stft(pcm, frame_length=1024, frame_step=256,
+                         fft_length=1024)
   spectrograms = tf.abs(stfts)
 
   # Warp the linear scale spectrograms into the mel-scale.
   num_spectrogram_bins = stfts.shape[-1].value
   lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
-  linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
+  linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
     num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
     upper_edge_hertz)
   mel_spectrograms = tf.tensordot(
@@ -67,7 +69,7 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
   log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
 
   # Compute MFCCs from log_mel_spectrograms and take the first 13.
-  mfccs = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
+  mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
     log_mel_spectrograms)[..., :13]
   ```
 
@@ -96,13 +98,13 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
     log_mel_spectrograms = ops.convert_to_tensor(log_mel_spectrograms,
                                                  dtype=dtypes.float32)
     if (log_mel_spectrograms.shape.ndims and
-        log_mel_spectrograms.shape[-1].value is not None):
-      num_mel_bins = log_mel_spectrograms.shape[-1].value
+        log_mel_spectrograms.shape.dims[-1].value is not None):
+      num_mel_bins = log_mel_spectrograms.shape.dims[-1].value
       if num_mel_bins == 0:
         raise ValueError('num_mel_bins must be positive. Got: %s' %
                          log_mel_spectrograms)
     else:
       num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
 
-    dct2 = spectral_ops.dct(log_mel_spectrograms)
+    dct2 = dct_ops.dct(log_mel_spectrograms, type=2)
     return dct2 * math_ops.rsqrt(math_ops.to_float(num_mel_bins) * 2.0)
diff --git a/tensorflow/python/ops/signal/reconstruction_ops.py b/tensorflow/python/ops/signal/reconstruction_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4eaab4e0a0cd7958d56c9af3ccf2c5f69b35ee9b
--- /dev/null
+++ b/tensorflow/python/ops/signal/reconstruction_ops.py
@@ -0,0 +1,155 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Signal reconstruction via overlapped addition of frames."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("signal.overlap_and_add")
+def overlap_and_add(signal, frame_step, name=None):
+  """Reconstructs a signal from a framed representation.
+
+  Adds potentially overlapping frames of a signal with shape
+  `[..., frames, frame_length]`, offsetting subsequent frames by `frame_step`.
+  The resulting tensor has shape `[..., output_size]` where
+
+      output_size = (frames - 1) * frame_step + frame_length
+
+  Args:
+    signal: A [..., frames, frame_length] `Tensor`. All dimensions may be
+      unknown, and rank must be at least 2.
+    frame_step: An integer or scalar `Tensor` denoting overlap offsets. Must be
+      less than or equal to `frame_length`.
+    name: An optional name for the operation.
+
+  Returns:
+    A `Tensor` with shape `[..., output_size]` containing the overlap-added
+    frames of `signal`'s inner-most two dimensions.
+
+  Raises:
+    ValueError: If `signal`'s rank is less than 2, or `frame_step` is not a
+      scalar integer.
+  """
+  with ops.name_scope(name, "overlap_and_add", [signal, frame_step]):
+    signal = ops.convert_to_tensor(signal, name="signal")
+    signal.shape.with_rank_at_least(2)
+    frame_step = ops.convert_to_tensor(frame_step, name="frame_step")
+    frame_step.shape.assert_has_rank(0)
+    if not frame_step.dtype.is_integer:
+      raise ValueError("frame_step must be an integer. Got %s" %
+                       frame_step.dtype)
+
+    signal_shape = array_ops.shape(signal)
+
+    # All dimensions that are not part of the overlap-and-add. Can be empty for
+    # rank 2 inputs.
+    outer_dimensions = signal_shape[:-2]
+    outer_rank = array_ops.size(outer_dimensions)
+
+    def full_shape(inner_shape):
+      return array_ops.concat([outer_dimensions, inner_shape], 0)
+
+    frame_length = signal_shape[-1]
+    frames = signal_shape[-2]
+
+    # Compute output length.
+    output_length = frame_length + frame_step * (frames - 1)
+
+    # If frame_length is equal to frame_step, there's no overlap so just
+    # reshape the tensor.
+    frame_step_static = tensor_util.constant_value(frame_step)
+    if (frame_step_static is not None and signal.shape.dims is not None and
+        frame_step_static == signal.shape.dims[-1].value):
+      output_shape = full_shape([output_length])
+      return array_ops.reshape(signal, output_shape, name="fast_path")
+
+    # The following code is documented using this example:
+    #
+    # frame_step = 2
+    # signal.shape = (3, 5)
+    # a b c d e
+    # f g h i j
+    # k l m n o
+
+    # Compute the number of segments, per frame.
+    segments = -(-frame_length // frame_step)  # Divide and round up.
+
+    # Pad the frame_length dimension to a multiple of the frame step.
+    # Pad the frames dimension by `segments` so that signal.shape = (6, 6)
+    # a b c d e 0
+    # f g h i j 0
+    # k l m n o 0
+    # 0 0 0 0 0 0
+    # 0 0 0 0 0 0
+    # 0 0 0 0 0 0
+    paddings = [[0, segments], [0, segments * frame_step - frame_length]]
+    outer_paddings = array_ops.zeros([outer_rank, 2], dtypes.int32)
+    paddings = array_ops.concat([outer_paddings, paddings], 0)
+    signal = array_ops.pad(signal, paddings)
+
+    # Reshape so that signal.shape = (3, 6, 2)
+    # ab cd e0
+    # fg hi j0
+    # kl mn o0
+    # 00 00 00
+    # 00 00 00
+    # 00 00 00
+    shape = full_shape([frames + segments, segments, frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Transpose dimensions so that signal.shape = (3, 6, 2)
+    # ab fg kl 00 00 00
+    # cd hi mn 00 00 00
+    # e0 j0 o0 00 00 00
+    perm = array_ops.concat(
+        [math_ops.range(outer_rank), outer_rank + [1, 0, 2]], 0)
+    signal = array_ops.transpose(signal, perm)
+
+    # Reshape so that signal.shape = (18, 2)
+    # ab fg kl 00 00 00 cd hi mn 00 00 00 e0 j0 o0 00 00 00
+    shape = full_shape([(frames + segments) * segments, frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Truncate so that signal.shape = (15, 2)
+    # ab fg kl 00 00 00 cd hi mn 00 00 00 e0 j0 o0
+    signal = signal[..., :(frames + segments - 1) * segments, :]
+
+    # Reshape so that signal.shape = (3, 5, 2)
+    # ab fg kl 00 00
+    # 00 cd hi mn 00
+    # 00 00 e0 j0 o0
+    shape = full_shape([segments, (frames + segments - 1), frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Now, reduce over the columns, to achieve the desired sum.
+    signal = math_ops.reduce_sum(signal, -3)
+
+    # Flatten the array.
+    shape = full_shape([(frames + segments - 1) * frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Truncate to final length.
+    signal = signal[..., :output_length]
+
+    return signal
diff --git a/tensorflow/contrib/signal/python/ops/shape_ops.py b/tensorflow/python/ops/signal/shape_ops.py
similarity index 97%
rename from tensorflow/contrib/signal/python/ops/shape_ops.py
rename to tensorflow/python/ops/signal/shape_ops.py
index 91862f0cc0ba53c6b3bc31d7f5e93cbbbd7ae494..ae9c2ef28e4f1c857519838f22a4844ac2c9e7b4 100644
--- a/tensorflow/contrib/signal/python/ops/shape_ops.py
+++ b/tensorflow/python/ops/signal/shape_ops.py
@@ -18,13 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
-from tensorflow.contrib.signal.python.ops import util_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
-
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import util_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 def _infer_frame_shape(signal, frame_length, frame_step, pad_end, axis):
@@ -53,6 +52,7 @@ def _infer_frame_shape(signal, frame_length, frame_step, pad_end, axis):
   return outer_dimensions + [num_frames, frame_length] + inner_dimensions
 
 
+@tf_export("signal.frame")
 def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1,
           name=None):
   """Expands `signal`'s `axis` dimension into frames of `frame_length`.
@@ -70,8 +70,8 @@ def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1,
 
   ```python
   pcm = tf.placeholder(tf.float32, [None, 9152])
-  frames = tf.contrib.signal.frame(pcm, 512, 180)
-  magspec = tf.abs(tf.spectral.rfft(frames, [512]))
+  frames = tf.signal.frame(pcm, 512, 180)
+  magspec = tf.abs(tf.signal.rfft(frames, [512]))
   image = tf.expand_dims(magspec, 3)
   ```
 
diff --git a/tensorflow/python/ops/signal/signal.py b/tensorflow/python/ops/signal/signal.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdc4d1c1911a8570ba28a0b42bd6da5d83fd40e1
--- /dev/null
+++ b/tensorflow/python/ops/signal/signal.py
@@ -0,0 +1,65 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Signal processing operations.
+
+See the [tf.signal](https://tensorflow.org/api_guides/python/contrib.signal)
+guide.
+
+@@frame
+@@hamming_window
+@@hann_window
+@@inverse_stft
+@@inverse_stft_window_fn
+@@mfccs_from_log_mel_spectrograms
+@@linear_to_mel_weight_matrix
+@@overlap_and_add
+@@stft
+
+[hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
+[hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
+[mel]: https://en.wikipedia.org/wiki/Mel_scale
+[mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+[stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.ops.signal.dct_ops import dct
+from tensorflow.python.ops.signal.dct_ops import idct
+from tensorflow.python.ops.signal.fft_ops import fft
+from tensorflow.python.ops.signal.fft_ops import fft2d
+from tensorflow.python.ops.signal.fft_ops import fft3d
+from tensorflow.python.ops.signal.fft_ops import ifft
+from tensorflow.python.ops.signal.fft_ops import ifft2d
+from tensorflow.python.ops.signal.fft_ops import ifft3d
+from tensorflow.python.ops.signal.fft_ops import irfft
+from tensorflow.python.ops.signal.fft_ops import irfft2d
+from tensorflow.python.ops.signal.fft_ops import irfft3d
+from tensorflow.python.ops.signal.fft_ops import rfft
+from tensorflow.python.ops.signal.fft_ops import rfft2d
+from tensorflow.python.ops.signal.fft_ops import rfft3d
+from tensorflow.python.ops.signal.mel_ops import linear_to_mel_weight_matrix
+from tensorflow.python.ops.signal.mfcc_ops import mfccs_from_log_mel_spectrograms
+from tensorflow.python.ops.signal.reconstruction_ops import overlap_and_add
+from tensorflow.python.ops.signal.shape_ops import frame
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft_window_fn
+from tensorflow.python.ops.signal.spectral_ops import stft
+from tensorflow.python.ops.signal.window_ops import hamming_window
+from tensorflow.python.ops.signal.window_ops import hann_window
+# pylint: enable=unused-import
diff --git a/tensorflow/python/ops/signal/spectral_ops.py b/tensorflow/python/ops/signal/spectral_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..f029e0a8b59777b50e38ab4d8f801e811467c561
--- /dev/null
+++ b/tensorflow/python/ops/signal/spectral_ops.py
@@ -0,0 +1,287 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Spectral operations (e.g. Short-time Fourier Transform)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.signal import fft_ops
+from tensorflow.python.ops.signal import reconstruction_ops
+from tensorflow.python.ops.signal import shape_ops
+from tensorflow.python.ops.signal import window_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('signal.stft')
+def stft(signals, frame_length, frame_step, fft_length=None,
+         window_fn=window_ops.hann_window,
+         pad_end=False, name=None):
+  """Computes the [Short-time Fourier Transform][stft] of `signals`.
+
+  Implemented with GPU-compatible ops and supports gradients.
+
+  Args:
+    signals: A `[..., samples]` `float32` `Tensor` of real-valued signals.
+    frame_length: An integer scalar `Tensor`. The window length in samples.
+    frame_step: An integer scalar `Tensor`. The number of samples to step.
+    fft_length: An integer scalar `Tensor`. The size of the FFT to apply.
+      If not provided, uses the smallest power of 2 enclosing `frame_length`.
+    window_fn: A callable that takes a window length and a `dtype` keyword
+      argument and returns a `[window_length]` `Tensor` of samples in the
+      provided datatype. If set to `None`, no windowing is used.
+    pad_end: Whether to pad the end of `signals` with zeros when the provided
+      frame length and step produces a frame that lies partially past its end.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., frames, fft_unique_bins]` `Tensor` of `complex64` STFT values where
+    `fft_unique_bins` is `fft_length // 2 + 1` (the unique components of the
+    FFT).
+
+  Raises:
+    ValueError: If `signals` is not at least rank 1, `frame_length` is
+      not scalar, or `frame_step` is not scalar.
+
+  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
+  """
+  with ops.name_scope(name, 'stft', [signals, frame_length,
+                                     frame_step]):
+    signals = ops.convert_to_tensor(signals, name='signals')
+    signals.shape.with_rank_at_least(1)
+    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
+    frame_length.shape.assert_has_rank(0)
+    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
+    frame_step.shape.assert_has_rank(0)
+
+    if fft_length is None:
+      fft_length = _enclosing_power_of_two(frame_length)
+    else:
+      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
+
+    framed_signals = shape_ops.frame(
+        signals, frame_length, frame_step, pad_end=pad_end)
+
+    # Optionally window the framed signals.
+    if window_fn is not None:
+      window = window_fn(frame_length, dtype=framed_signals.dtype)
+      framed_signals *= window
+
+    # fft_ops.rfft produces the (fft_length/2 + 1) unique components of the
+    # FFT of the real windowed signals in framed_signals.
+    return fft_ops.rfft(framed_signals, [fft_length])
+
+
+@tf_export('signal.inverse_stft_window_fn')
+def inverse_stft_window_fn(frame_step,
+                           forward_window_fn=window_ops.hann_window,
+                           name=None):
+  """Generates a window function that can be used in `inverse_stft`.
+
+  Constructs a window that is equal to the forward window with a further
+  pointwise amplitude correction.  `inverse_stft_window_fn` is equivalent to
+  `forward_window_fn` in the case where it would produce an exact inverse.
+
+  See examples in `inverse_stft` documentation for usage.
+
+  Args:
+    frame_step: An integer scalar `Tensor`. The number of samples to step.
+    forward_window_fn: window_fn used in the forward transform, `stft`.
+    name: An optional name for the operation.
+
+  Returns:
+    A callable that takes a window length and a `dtype` keyword argument and
+      returns a `[window_length]` `Tensor` of samples in the provided datatype.
+      The returned window is suitable for reconstructing original waveform in
+      inverse_stft.
+  """
+  with ops.name_scope(name, 'inverse_stft_window_fn', [forward_window_fn]):
+    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
+    frame_step.shape.assert_has_rank(0)
+
+  def inverse_stft_window_fn_inner(frame_length, dtype):
+    """Computes a window that can be used in `inverse_stft`.
+
+    Args:
+      frame_length: An integer scalar `Tensor`. The window length in samples.
+      dtype: Data type of waveform passed to `stft`.
+
+    Returns:
+      A window suitable for reconstructing original waveform in `inverse_stft`.
+
+    Raises:
+      ValueError: If `frame_length` is not scalar, `forward_window_fn` is not a
+      callable that takes a window length and a `dtype` keyword argument and
+      returns a `[window_length]` `Tensor` of samples in the provided datatype
+      `frame_step` is not scalar, or `frame_step` is not scalar.
+    """
+    with ops.name_scope(name, 'inverse_stft_window_fn', [forward_window_fn]):
+      frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
+      frame_length.shape.assert_has_rank(0)
+
+      # Use equation 7 from Griffin + Lim.
+      forward_window = forward_window_fn(frame_length, dtype=dtype)
+      denom = math_ops.square(forward_window)
+      overlaps = -(-frame_length // frame_step)  # Ceiling division.
+      denom = array_ops.pad(denom, [(0, overlaps * frame_step - frame_length)])
+      denom = array_ops.reshape(denom, [overlaps, frame_step])
+      denom = math_ops.reduce_sum(denom, 0, keepdims=True)
+      denom = array_ops.tile(denom, [overlaps, 1])
+      denom = array_ops.reshape(denom, [overlaps * frame_step])
+
+      return forward_window / denom[:frame_length]
+  return inverse_stft_window_fn_inner
+
+
+@tf_export('signal.inverse_stft')
+def inverse_stft(stfts,
+                 frame_length,
+                 frame_step,
+                 fft_length=None,
+                 window_fn=window_ops.hann_window,
+                 name=None):
+  """Computes the inverse [Short-time Fourier Transform][stft] of `stfts`.
+
+  To reconstruct an original waveform, a complimentary window function should
+  be used in inverse_stft. Such a window function can be constructed with
+  tf.signal.inverse_stft_window_fn.
+
+  Example:
+
+  ```python
+  frame_length = 400
+  frame_step = 160
+  waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
+  stft = tf.signal.stft(waveform, frame_length, frame_step)
+  inverse_stft = tf.signal.inverse_stft(
+      stft, frame_length, frame_step,
+      window_fn=tf.signal.inverse_stft_window_fn(frame_step))
+  ```
+
+  if a custom window_fn is used in stft, it must be passed to
+  inverse_stft_window_fn:
+
+  ```python
+  frame_length = 400
+  frame_step = 160
+  window_fn = functools.partial(window_ops.hamming_window, periodic=True),
+  waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
+  stft = tf.signal.stft(
+      waveform, frame_length, frame_step, window_fn=window_fn)
+  inverse_stft = tf.signal.inverse_stft(
+      stft, frame_length, frame_step,
+      window_fn=tf.signal.inverse_stft_window_fn(
+         frame_step, forward_window_fn=window_fn))
+  ```
+
+  Implemented with GPU-compatible ops and supports gradients.
+
+  Args:
+    stfts: A `complex64` `[..., frames, fft_unique_bins]` `Tensor` of STFT bins
+      representing a batch of `fft_length`-point STFTs where `fft_unique_bins`
+      is `fft_length // 2 + 1`
+    frame_length: An integer scalar `Tensor`. The window length in samples.
+    frame_step: An integer scalar `Tensor`. The number of samples to step.
+    fft_length: An integer scalar `Tensor`. The size of the FFT that produced
+      `stfts`. If not provided, uses the smallest power of 2 enclosing
+      `frame_length`.
+    window_fn: A callable that takes a window length and a `dtype` keyword
+      argument and returns a `[window_length]` `Tensor` of samples in the
+      provided datatype. If set to `None`, no windowing is used.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., samples]` `Tensor` of `float32` signals representing the inverse
+    STFT for each input STFT in `stfts`.
+
+  Raises:
+    ValueError: If `stfts` is not at least rank 2, `frame_length` is not scalar,
+      `frame_step` is not scalar, or `fft_length` is not scalar.
+
+  [stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
+  """
+  with ops.name_scope(name, 'inverse_stft', [stfts]):
+    stfts = ops.convert_to_tensor(stfts, name='stfts')
+    stfts.shape.with_rank_at_least(2)
+    frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
+    frame_length.shape.assert_has_rank(0)
+    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
+    frame_step.shape.assert_has_rank(0)
+    if fft_length is None:
+      fft_length = _enclosing_power_of_two(frame_length)
+    else:
+      fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
+      fft_length.shape.assert_has_rank(0)
+
+    real_frames = fft_ops.irfft(stfts, [fft_length])
+
+    # frame_length may be larger or smaller than fft_length, so we pad or
+    # truncate real_frames to frame_length.
+    frame_length_static = tensor_util.constant_value(frame_length)
+    # If we don't know the shape of real_frames's inner dimension, pad and
+    # truncate to frame_length.
+    if (frame_length_static is None or
+        real_frames.shape.ndims is None or
+        real_frames.shape[-1].value is None):
+      real_frames = real_frames[..., :frame_length]
+      real_frames_rank = array_ops.rank(real_frames)
+      real_frames_shape = array_ops.shape(real_frames)
+      paddings = array_ops.concat(
+          [array_ops.zeros([real_frames_rank - 1, 2],
+                           dtype=frame_length.dtype),
+           [[0, math_ops.maximum(0, frame_length - real_frames_shape[-1])]]], 0)
+      real_frames = array_ops.pad(real_frames, paddings)
+    # We know real_frames's last dimension and frame_length statically. If they
+    # are different, then pad or truncate real_frames to frame_length.
+    elif real_frames.shape[-1].value > frame_length_static:
+      real_frames = real_frames[..., :frame_length_static]
+    elif real_frames.shape[-1].value < frame_length_static:
+      pad_amount = frame_length_static - real_frames.shape[-1].value
+      real_frames = array_ops.pad(real_frames,
+                                  [[0, 0]] * (real_frames.shape.ndims - 1) +
+                                  [[0, pad_amount]])
+
+    # The above code pads the inner dimension of real_frames to frame_length,
+    # but it does so in a way that may not be shape-inference friendly.
+    # Restore shape information if we are able to.
+    if frame_length_static is not None and real_frames.shape.ndims is not None:
+      real_frames.set_shape([None] * (real_frames.shape.ndims - 1) +
+                            [frame_length_static])
+
+    # Optionally window and overlap-add the inner 2 dimensions of real_frames
+    # into a single [samples] dimension.
+    if window_fn is not None:
+      window = window_fn(frame_length, dtype=stfts.dtype.real_dtype)
+      real_frames *= window
+    return reconstruction_ops.overlap_and_add(real_frames, frame_step)
+
+
+def _enclosing_power_of_two(value):
+  """Return 2**N for integer N such that 2**N >= value."""
+  value_static = tensor_util.constant_value(value)
+  if value_static is not None:
+    return constant_op.constant(
+        int(2**np.ceil(np.log(value_static) / np.log(2.0))), value.dtype)
+  return math_ops.cast(
+      math_ops.pow(2.0, math_ops.ceil(
+          math_ops.log(math_ops.to_float(value)) / math_ops.log(2.0))),
+      value.dtype)
diff --git a/tensorflow/contrib/signal/python/ops/util_ops.py b/tensorflow/python/ops/signal/util_ops.py
similarity index 100%
rename from tensorflow/contrib/signal/python/ops/util_ops.py
rename to tensorflow/python/ops/signal/util_ops.py
diff --git a/tensorflow/contrib/signal/python/ops/window_ops.py b/tensorflow/python/ops/signal/window_ops.py
similarity index 97%
rename from tensorflow/contrib/signal/python/ops/window_ops.py
rename to tensorflow/python/ops/signal/window_ops.py
index 59e67e8ba414df1f9c777d1f5a3f3dba975648a2..730c989cfe9866f6e0a22d6e5eeda46dab0ab94b 100644
--- a/tensorflow/contrib/signal/python/ops/window_ops.py
+++ b/tensorflow/python/ops/signal/window_ops.py
@@ -27,8 +27,10 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('signal.hann_window')
 def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None):
   """Generate a [Hann window][hann].
 
@@ -53,6 +55,7 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None):
                                dtype, 0.5, 0.5)
 
 
+@tf_export('signal.hamming_window')
 def hamming_window(window_length, periodic=True, dtype=dtypes.float32,
                    name=None):
   """Generate a [Hamming][hamming] window.
diff --git a/tensorflow/python/ops/sort_ops.py b/tensorflow/python/ops/sort_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3e23d701ed546ca76e2dd08e999ff869e87c816
--- /dev/null
+++ b/tensorflow/python/ops/sort_ops.py
@@ -0,0 +1,197 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for sorting tensors.
+
+@@argsort
+@@sort
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops as framework_ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('sort')
+def sort(values, axis=-1, direction='ASCENDING', name=None):
+  """Sorts a tensor.
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+      axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+      `'DESCENDING'`).
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same dtype and shape as `values`, with the elements
+        sorted along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  with framework_ops.name_scope(name, 'sort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=False)
+
+
+@tf_export('argsort')
+def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
+  """Returns the indices of a tensor that give its sorted order along an axis.
+
+  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
+  `tf.sort(values)`. For higher dimensions, the output has the same shape as
+  `values`, but along the given axis, values represent the index of the sorted
+  element in that slice of the tensor at the given position.
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+      axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+      `'DESCENDING'`).
+    stable: If True, equal elements in the original tensor will not be
+      re-ordered in the returned order. Unstable sort is not yet implemented,
+      but will eventually be the default for performance reasons. If you require
+      a stable order, pass `stable=True` for forwards compatibility.
+    name: Optional name for the operation.
+
+  Returns:
+    An int32 `Tensor` with the same shape as `values`. The indices that would
+        sort each slice of the given `values` along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  del stable  # Unused.
+  with framework_ops.name_scope(name, 'argsort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=True)
+
+
+def _sort_or_argsort(values, axis, direction, return_argsort):
+  """Internal sort/argsort implementation.
+
+  Args:
+    values: The input values.
+    axis: The axis along which to sort.
+    direction: 'ASCENDING' or 'DESCENDING'.
+    return_argsort: Whether to return the argsort result.
+
+  Returns:
+    Either the sorted values, or the indices of the sorted values in the
+        original tensor. See the `sort` and `argsort` docstrings.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  if direction not in _SORT_IMPL:
+    raise ValueError('%s should be one of %s' % (direction, ', '.join(
+        sorted(_SORT_IMPL.keys()))))
+  # Axis must be an integer, not a Tensor.
+  axis = framework_ops.convert_to_tensor(axis, name='axis')
+  axis_static = tensor_util.constant_value(axis)
+  if axis.shape.ndims != 0 or axis_static is None:
+    raise ValueError('axis must be a constant scalar')
+  axis_static = int(axis_static)  # Avoids NumPy casting error
+
+  values = framework_ops.convert_to_tensor(values, name='values')
+
+  return _SORT_IMPL[direction](values, axis_static, return_argsort)
+
+
+def _descending_sort(values, axis, return_argsort=False):
+  """Sorts values in reverse using `top_k`.
+
+  Args:
+    values: Tensor of numeric values.
+    axis: Index of the axis which values should be sorted along.
+    return_argsort: If False, return the sorted values. If True, return the
+      indices that would sort the values.
+
+  Returns:
+    The sorted values.
+  """
+  k = array_ops.shape(values)[axis]
+  rank = array_ops.rank(values)
+  static_rank = values.shape.ndims
+  # Fast path: sorting the last axis.
+  if axis == -1 or axis + 1 == values.get_shape().ndims:
+    top_k_input = values
+    transposition = None
+  else:
+    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
+    if axis < 0:
+      # Calculate the actual axis index if counting from the end. Use the static
+      # rank if available, or else make the axis back into a tensor.
+      axis += static_rank or rank
+    if static_rank is not None:
+      # Prefer to calculate the transposition array in NumPy and make it a
+      # constant.
+      transposition = constant_op.constant(
+          np.r_[
+              # Axes up to axis are unchanged.
+              np.arange(axis),
+              # Swap axis and rank - 1.
+              [static_rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              np.arange(axis + 1, static_rank - 1),
+              # Swap axis and rank - 1.
+              [axis]],
+          name='transposition')
+    else:
+      # Generate the transposition array from the tensors.
+      transposition = array_ops.concat(
+          [
+              # Axes up to axis are unchanged.
+              math_ops.range(axis),
+              # Swap axis and rank - 1.
+              [rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              math_ops.range(axis + 1, rank - 1),
+              # Swap axis and rank - 1.
+              [axis]
+          ],
+          axis=0)
+    top_k_input = array_ops.transpose(values, transposition)
+
+  values, indices = nn_ops.top_k(top_k_input, k)
+  return_value = indices if return_argsort else values
+  if transposition is not None:
+    # transposition contains a single cycle of length 2 (swapping 2 elements),
+    # so it is an involution (it is its own inverse).
+    return_value = array_ops.transpose(return_value, transposition)
+  return return_value
+
+
+def _ascending_sort(values, axis, return_argsort=False):
+  # Negate the values to get the ascending order from descending sort.
+  values_or_indices = _descending_sort(-values, axis, return_argsort)
+  # If not argsort, negate the values again.
+  return values_or_indices if return_argsort else -values_or_indices
+
+
+_SORT_IMPL = {
+    'ASCENDING': _ascending_sort,
+    'DESCENDING': _descending_sort,
+}
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops_test.py b/tensorflow/python/ops/sort_ops_test.py
similarity index 90%
rename from tensorflow/contrib/framework/python/ops/sort_ops_test.py
rename to tensorflow/python/ops/sort_ops_test.py
index 791b32cd1e2eea9f466a14585a8b15d085bd450f..17ce604cbf195427033aa71e4c7b4d7ceed61c50 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops_test.py
+++ b/tensorflow/python/ops/sort_ops_test.py
@@ -20,22 +20,25 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import sort_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sort_ops
 from tensorflow.python.platform import test
 
 
 class SortTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRandom_lowDimensionality(self):
     self._testRandom_lowDimensionality(negative_axis=False)
 
+  @test_util.run_deprecated_v1
   def testRandom_lowDimensionality_negative(self):
     self._testRandom_lowDimensionality(negative_axis=True)
 
@@ -53,6 +56,7 @@ class SortTest(test.TestCase):
             np.sort(arr, axis=sort_axis),
             sort_ops.sort(constant_op.constant(arr), axis=sort_axis).eval())
 
+  @test_util.run_deprecated_v1
   def testRandom_highDimensionality(self):
     np.random.seed(100)
     for _ in range(20):
@@ -65,6 +69,7 @@ class SortTest(test.TestCase):
             np.sort(arr, axis=sort_axis),
             sort_ops.sort(constant_op.constant(arr), axis=sort_axis).eval())
 
+  @test_util.run_deprecated_v1
   def testScalar(self):
     # Create an empty scalar where the static shape is unknown.
     zeros_length_1 = array_ops.zeros(
@@ -77,21 +82,22 @@ class SortTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sort.eval()
 
+  @test_util.run_deprecated_v1
   def testNegativeOutOfBounds_staticShape(self):
     arr = constant_op.constant([3, 4, 5])
     with self.assertRaises(ValueError):
       sort_ops.sort(arr, axis=-4)
 
+  @test_util.run_deprecated_v1
   def testDescending(self):
     arr = np.random.random((10, 5, 5))
     with self.cached_session():
       self.assertAllEqual(
           np.sort(arr, axis=0)[::-1],
           sort_ops.sort(
-              constant_op.constant(arr),
-              axis=0,
-              direction='DESCENDING').eval())
+              constant_op.constant(arr), axis=0, direction='DESCENDING').eval())
 
+  @test_util.run_deprecated_v1
   def testSort_staticallyKnownRank_constantTransposition(self):
     # The transposition array should be a constant if the rank of "values" is
     # statically known.
@@ -109,6 +115,7 @@ class SortTest(test.TestCase):
         tensor_util.constant_value(transposition),
         [0, 4, 2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testArgsort_1d(self):
     arr = np.random.random(42)
     with self.cached_session():
@@ -116,6 +123,7 @@ class SortTest(test.TestCase):
           np.sort(arr),
           array_ops.gather(arr, sort_ops.argsort(arr)).eval())
 
+  @test_util.run_deprecated_v1
   def testArgsort(self):
     arr = np.random.random((5, 6, 7, 8))
     for axis in range(4):
diff --git a/tensorflow/python/ops/sparse_grad.py b/tensorflow/python/ops/sparse_grad.py
index 1223b290ff6cfcfba27f40c05556c85b59e77148..2ca9c0c647d14b792b2575c8f977d9dbe39efb4b 100644
--- a/tensorflow/python/ops/sparse_grad.py
+++ b/tensorflow/python/ops/sparse_grad.py
@@ -195,7 +195,7 @@ def _SparseTensorDenseMatMulGrad(op, grad):
   parts_a = array_ops.gather(grad, rows if not adj_a else cols)
   parts_b = array_ops.gather(b if not adj_b else array_ops.transpose(b),
                              cols if not adj_a else rows)
-  a_values_grad = math_ops.reduce_sum(parts_a * parts_b, reduction_indices=1)
+  a_values_grad = math_ops.reduce_sum(parts_a * parts_b, axis=1)
 
   # gradients w.r.t. (a_indices, a_values, a_shape, b)
   return (None, a_values_grad, None, b_grad)
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 7e3dbdbad4c651089ff88d0e5570010db3e5ecdf..097b485a115fb8153f77d0ad24c63b872fb2e8ca 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -16,7 +16,7 @@
 # pylint: disable=g-short-docstring-punctuation
 """Sparse Tensor Representation.
 
-See the [Sparse Ops](https://tensorflow.org/api_guides/python/sparse_ops) guide.
+See also `tf.SparseTensor`.
 """
 
 from __future__ import absolute_import
@@ -31,6 +31,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
@@ -43,6 +44,9 @@ from tensorflow.python.ops.gen_sparse_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
+from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import get_canonical_name_for_symbol
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -185,7 +189,7 @@ def sparse_eye(num_rows,
 
 
 # pylint: disable=protected-access
-@tf_export("sparse.concat", "sparse_concat")
+@tf_export(v1=["sparse.concat", "sparse_concat"])
 @deprecation.deprecated_endpoints("sparse_concat")
 @deprecation.deprecated_args(
     None, "concat_dim is deprecated, use axis instead", "concat_dim")
@@ -291,6 +295,11 @@ def sparse_concat(axis,
   """
   axis = deprecation.deprecated_argument_lookup("axis", axis, "concat_dim",
                                                 concat_dim)
+  return sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dim, name)
+
+
+@tf_export("sparse.concat", v1=[])
+def sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dims=False, name=None):  # pylint: disable=missing-docstring
   sp_inputs = _convert_to_sparse_tensors(sp_inputs)
 
   if len(sp_inputs) == 1:  # Degenerate case of one tensor.
@@ -300,7 +309,7 @@ def sparse_concat(axis,
   vals = [sp_input.values for sp_input in sp_inputs]
   shapes = [sp_input.dense_shape for sp_input in sp_inputs]
 
-  if expand_nonconcat_dim:
+  if expand_nonconcat_dims:
     max_shape = math_ops.reduce_max(
         array_ops.concat(
             [array_ops.reshape(shape, [1, -1]) for shape in shapes], 0), 0)
@@ -318,9 +327,15 @@ def sparse_concat(axis,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse.add", "sparse_add")
+sparse_concat_v2.__doc__ = sparse_concat.__doc__.replace(
+    "    concat_dim: The old (deprecated) name for axis.\n", "")
+
+
+@tf_export(v1=["sparse.add", "sparse_add"])
 @deprecation.deprecated_endpoints("sparse_add")
-def sparse_add(a, b, thresh=0):
+@deprecation.deprecated_args(
+    None, "thresh is deprecated, use threshold instead", "thresh")
+def sparse_add(a, b, threshold=None, thresh=None):
   """Adds two tensors, at least one of each is a `SparseTensor`.
 
   If one `SparseTensor` and one `Tensor` are passed in, returns a `Tensor`.  If
@@ -358,12 +373,74 @@ def sparse_add(a, b, thresh=0):
 
   Args:
     a: The first operand; `SparseTensor` or `Tensor`.
-    b: The second operand; `SparseTensor` or `Tensor`.  At least one operand
+    b: The second operand; `SparseTensor` or `Tensor`. At least one operand
       must be sparse.
-    thresh: A 0-D `Tensor`.  The magnitude threshold that determines if an
-    output value/index pair takes space.  Its dtype should match that of the
-    values if they are real; if the latter are complex64/complex128, then the
-    dtype should be float32/float64, correspondingly.
+    threshold: An optional 0-D `Tensor` (defaults to `0`). The magnitude
+      threshold that determines if an output value/index pair takes space. Its
+      dtype should match that of the values if they are real; if the latter are
+      complex64/complex128, then the dtype should be float32/float64,
+      correspondingly.
+    thresh: Deprecated alias for `threshold`.
+
+  Returns:
+    A `SparseTensor` or a `Tensor`, representing the sum.
+
+  Raises:
+    TypeError: If both `a` and `b` are `Tensor`s.  Use `tf.add()` instead.
+  """
+  threshold = deprecation.deprecated_argument_lookup("threshold", threshold,
+                                                     "thresh", thresh)
+  if threshold is None:
+    threshold = 0
+  return sparse_add_v2(a, b, threshold)
+
+
+@tf_export("sparse.add", v1=[])
+def sparse_add_v2(a, b, threshold=0):
+  """Adds two tensors, at least one of each is a `SparseTensor`.
+
+  If one `SparseTensor` and one `Tensor` are passed in, returns a `Tensor`.  If
+  both arguments are `SparseTensor`s, this returns a `SparseTensor`.  The order
+  of arguments does not matter.  Use vanilla `tf.add()` for adding two dense
+  `Tensor`s.
+
+  The shapes of the two operands must match: broadcasting is not supported.
+
+  The indices of any input `SparseTensor` are assumed ordered in standard
+  lexicographic order.  If this is not the case, before this step run
+  `SparseReorder` to restore index ordering.
+
+  If both arguments are sparse, we perform "clipping" as follows.  By default,
+  if two values sum to zero at some index, the output `SparseTensor` would still
+  include that particular location in its index, storing a zero in the
+  corresponding value slot.  To override this, callers can specify `threshold`,
+  indicating that if the sum has a magnitude strictly smaller than `threshold`,
+  its corresponding value and index would then not be included.  In particular,
+  `threshold == 0.0` (default) means everything is kept and actual thresholding
+  happens only for a positive value.
+
+  For example, suppose the logical sum of two sparse operands is (densified):
+
+      [       2]
+      [.1     0]
+      [ 6   -.2]
+
+  Then,
+
+      * `threshold == 0` (the default): all 5 index/value pairs will be
+          returned.
+      * `threshold == 0.11`: only .1 and 0 will vanish, and the remaining three
+          index/value pairs will be returned.
+      * `threshold == 0.21`: .1, 0, and -.2 will vanish.
+
+  Args:
+    a: The first operand; `SparseTensor` or `Tensor`.
+    b: The second operand; `SparseTensor` or `Tensor`. At least one operand
+      must be sparse.
+    threshold: A 0-D `Tensor`. The magnitude threshold that determines if an
+      output value/index pair takes space. Its dtype should match that of the
+      values if they are real; if the latter are complex64/complex128, then the
+      dtype should be float32/float64, correspondingly.
 
   Returns:
     A `SparseTensor` or a `Tensor`, representing the sum.
@@ -379,11 +456,12 @@ def sparse_add(a, b, thresh=0):
   if all(isinstance(inp, sparse_classes) for inp in [a, b]):
     a = _convert_to_sparse_tensor(a)
     b = _convert_to_sparse_tensor(b)
-    thresh = ops.convert_to_tensor(
-        thresh, dtype=a.values.dtype.real_dtype.base_dtype, name="thresh")
+    threshold = ops.convert_to_tensor(
+        threshold, dtype=a.values.dtype.real_dtype.base_dtype, name="threshold")
     output_ind, output_val, output_shape = (
         gen_sparse_ops.sparse_add(a.indices, a.values, a.dense_shape,
-                                  b.indices, b.values, b.dense_shape, thresh))
+                                  b.indices, b.values, b.dense_shape,
+                                  threshold))
 
     # Attempt to get output_shape statically.
     a.get_shape().assert_is_compatible_with(b.get_shape())
@@ -559,7 +637,7 @@ def sparse_dense_cwise_add(sp_t, dense_t):
   return sparse_tensor.SparseTensor(sp_t.indices, result, sp_t.dense_shape)
 
 
-@tf_export("sparse.reorder", "sparse_reorder")
+@tf_export("sparse.reorder", v1=["sparse.reorder", "sparse_reorder"])
 @deprecation.deprecated_endpoints("sparse_reorder")
 def sparse_reorder(sp_input, name=None):
   """Reorders a `SparseTensor` into the canonical, row-major ordering.
@@ -610,7 +688,7 @@ def sparse_reorder(sp_input, name=None):
   return sparse_tensor.SparseTensor(reordered_ind, reordered_val, dense_shape)
 
 
-@tf_export("sparse.reshape", "sparse_reshape")
+@tf_export("sparse.reshape", v1=["sparse.reshape", "sparse_reshape"])
 @deprecation.deprecated_endpoints("sparse_reshape")
 def sparse_reshape(sp_input, shape, name=None):
   """Reshapes a `SparseTensor` to represent values in a new dense shape.
@@ -704,7 +782,7 @@ class KeywordRequired(object):
     return "KeywordRequired()"
 
 
-@tf_export("sparse.split", "sparse_split")
+@tf_export(v1=["sparse.split", "sparse_split"])
 @deprecation.deprecated_endpoints("sparse_split")
 @deprecation.deprecated_args(
     None, "split_dim is deprecated, use axis instead", "split_dim")
@@ -778,7 +856,52 @@ def sparse_split(keyword_required=KeywordRequired(),
   return sparse_tensors
 
 
-@tf_export("sparse.slice", "sparse_slice")
+@tf_export("sparse.split", v1=[])
+def sparse_split_v2(sp_input=None,
+                    num_split=None,
+                    axis=None,
+                    name=None):
+  """Split a `SparseTensor` into `num_split` tensors along `axis`.
+
+  If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split`
+  each slice starting from 0:`shape[axis] % num_split` gets extra one
+  dimension. For example, if `axis = 1` and `num_split = 2` and the
+  input is:
+
+      input_tensor = shape = [2, 7]
+      [    a   d e  ]
+      [b c          ]
+
+  Graphically the output tensors are:
+
+      output_tensor[0] =
+      [    a ]
+      [b c   ]
+
+      output_tensor[1] =
+      [ d e  ]
+      [      ]
+
+  Args:
+    sp_input: The `SparseTensor` to split.
+    num_split: A Python integer. The number of ways to split.
+    axis: A 0-D `int32` `Tensor`. The dimension along which to split.
+    name: A name for the operation (optional).
+
+  Returns:
+    `num_split` `SparseTensor` objects resulting from splitting `value`.
+
+  Raises:
+    TypeError: If `sp_input` is not a `SparseTensor`.
+  """
+  return sparse_split(sp_input=sp_input,
+                      num_split=num_split,
+                      axis=axis,
+                      name=name,
+                      split_dim=None)
+
+
+@tf_export("sparse.slice", v1=["sparse.slice", "sparse_slice"])
 @deprecation.deprecated_endpoints("sparse_slice")
 def sparse_slice(sp_input, start, size, name=None):
   """Slice a `SparseTensor` based on the `start` and `size.
@@ -828,7 +951,7 @@ def sparse_slice(sp_input, start, size, name=None):
                                       output_shape)
 
 
-@tf_export("sparse_to_dense")
+@tf_export(v1=["sparse_to_dense"])
 @deprecation.deprecated(
     None,
     "Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.")
@@ -887,10 +1010,92 @@ def sparse_to_dense(sparse_indices,
       name=name)
 
 
-@tf_export("sparse.reduce_max", "sparse_reduce_max")
+@tf_export("sparse.reduce_max", v1=[])
+def sparse_reduce_max_v2(
+    sp_input, axis=None, keepdims=None, output_is_sparse=False, name=None):
+  """Computes the max of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+  if `output_is_sparse` is `False`, or a `SparseTensor` if `output_is_sparse`
+  is `True`.
+
+  Note: A gradient is not defined for this function, so it can't be used
+  in training models that need gradient descent.
+
+  Reduces `sp_input` along the dimensions given in `axis`.  Unless
+  `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `axis`. If `keepdims` is true, the reduced dimensions are retained
+  with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  similar to the indexing rules in Python.
+
+  The values not defined in `sp_input` don't participate in the reduce max,
+  as opposed to be implicitly assumed 0 -- hence it can return negative values
+  for sparse `axis`. But, in case there are no values in
+  `axis`, it will reduce to 0. See second example below.
+
+  For example:
+
+  ```python
+  # 'x' represents [[1, ?, 2]
+  #                 [?, 3, ?]]
+  # where ? is implicitly-zero.
+  tf.sparse.reduce_max(x) ==> 3
+  tf.sparse.reduce_max(x, 0) ==> [1, 3, 2]
+  tf.sparse.reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
+  tf.sparse.reduce_max(x, 1, keepdims=True) ==> [[2], [3]]
+  tf.sparse.reduce_max(x, [0, 1]) ==> 3
+
+  # 'y' represents [[-7, ?]
+  #                 [ 4, 3]
+  #                 [ ?, ?]
+  tf.sparse.reduce_max(x, 1) ==> [-7, 4, 0]
+  ```
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keepdims: If true, retain reduced dimensions with length 1.
+    output_is_sparse: If true, returns a `SparseTensor` instead of a dense
+      `Tensor` (the default).
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced Tensor or the reduced SparseTensor if `output_is_sparse` is
+    True.
+  """
+  if keepdims is None:
+    keepdims = False
+
+  # reduction_axes is the deprecated name for axis.
+  reduction_axes = None
+
+  if output_is_sparse:
+    output_ind, output_val, output_shape = (
+        gen_sparse_ops.sparse_reduce_max_sparse(
+            sp_input.indices, sp_input.values, sp_input.dense_shape,
+            math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+            name=name))
+
+    return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
+
+  return gen_sparse_ops.sparse_reduce_max(
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+      name=name)
+
+
+@tf_export(v1=["sparse.reduce_max", "sparse_reduce_max"])
 @deprecation.deprecated_endpoints("sparse_reduce_max")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(
+    None, "reduction_axes is deprecated, use axis instead",
+    "reduction_axes")
 def sparse_reduce_max(sp_input, axis=None, keepdims=None,
                       reduction_axes=None, keep_dims=None):
   """Computes the max of elements across dimensions of a SparseTensor.
@@ -939,7 +1144,7 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
     keepdims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis.
+    reduction_axes: Deprecated name of `axis`.
     keep_dims:  Deprecated alias for `keepdims`.
 
   Returns:
@@ -955,7 +1160,7 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
       math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
-@tf_export("sparse.reduce_max_sparse", "sparse_reduce_max_sparse")
+@tf_export(v1=["sparse.reduce_max_sparse", "sparse_reduce_max_sparse"])
 @deprecation.deprecated_endpoints("sparse_reduce_max_sparse")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -1006,10 +1211,80 @@ def sparse_reduce_max_sparse(sp_input,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse.reduce_sum", "sparse_reduce_sum")
+@tf_export("sparse.reduce_sum", v1=[])
+def sparse_reduce_sum_v2(
+    sp_input, axis=None, keepdims=None, output_is_sparse=False, name=None):
+  """Computes the sum of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+  if `output_is_sparse` is `False`, or a `SparseTensor` if `output_is_sparse`
+  is `True`.
+
+  Note: if `output_is_sparse` is True, a gradient is not defined for this
+  function, so it can't be used in training models that need gradient descent.
+
+  Reduces `sp_input` along the dimensions given in `axis`.  Unless `keepdims` is
+  true, the rank of the tensor is reduced by 1 for each entry in `axis`. If
+  `keepdims` is true, the reduced dimensions are retained with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  similar to the indexing rules in Python.
+
+  For example:
+
+  ```python
+  # 'x' represents [[1, ?, 1]
+  #                 [?, 1, ?]]
+  # where ? is implicitly-zero.
+  tf.sparse.reduce_sum(x) ==> 3
+  tf.sparse.reduce_sum(x, 0) ==> [1, 1, 1]
+  tf.sparse.reduce_sum(x, 1) ==> [2, 1]  # Can also use -1 as the axis.
+  tf.sparse.reduce_sum(x, 1, keepdims=True) ==> [[2], [1]]
+  tf.sparse.reduce_sum(x, [0, 1]) ==> 3
+  ```
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keepdims: If true, retain reduced dimensions with length 1.
+    output_is_sparse: If true, returns a `SparseTensor` instead of a dense
+      `Tensor` (the default).
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced Tensor or the reduced SparseTensor if `output_is_sparse` is
+    True.
+  """
+  if keepdims is None:
+    keepdims = False
+
+  # reduction_axes is the deprecated name for axis.
+  reduction_axes = None
+
+  if output_is_sparse:
+    output_ind, output_val, output_shape = (
+        gen_sparse_ops.sparse_reduce_sum_sparse(
+            sp_input.indices, sp_input.values, sp_input.dense_shape,
+            math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+            name=name))
+    return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
+
+  return gen_sparse_ops.sparse_reduce_sum(
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+      name=name)
+
+
+@tf_export(v1=["sparse.reduce_sum", "sparse_reduce_sum"])
 @deprecation.deprecated_endpoints("sparse_reduce_sum")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(
+    None, "reduction_axes is deprecated, use axis instead",
+    "reduction_axes")
 def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
                       reduction_axes=None, keep_dims=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
@@ -1045,7 +1320,7 @@ def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
     keepdims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis.
+    reduction_axes: Deprecated name of `axis`.
     keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
@@ -1061,7 +1336,7 @@ def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
       math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
-@tf_export("sparse.reduce_sum_sparse", "sparse_reduce_sum_sparse")
+@tf_export(v1=["sparse.reduce_sum_sparse", "sparse_reduce_sum_sparse"])
 @deprecation.deprecated_endpoints("sparse_reduce_sum_sparse")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -1112,7 +1387,7 @@ def sparse_reduce_sum_sparse(sp_input,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse.to_dense", "sparse_tensor_to_dense")
+@tf_export("sparse.to_dense", v1=["sparse.to_dense", "sparse_tensor_to_dense"])
 @deprecation.deprecated_endpoints("sparse_tensor_to_dense")
 def sparse_tensor_to_dense(sp_input,
                            default_value=0,
@@ -1156,7 +1431,7 @@ def sparse_tensor_to_dense(sp_input,
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  return sparse_to_dense(
+  return gen_sparse_ops.sparse_to_dense(
       sp_input.indices,
       sp_input.dense_shape,
       sp_input.values,
@@ -1165,7 +1440,8 @@ def sparse_tensor_to_dense(sp_input,
       name=name)
 
 
-@tf_export("sparse.to_indicator", "sparse_to_indicator")
+@tf_export(
+    "sparse.to_indicator", v1=["sparse.to_indicator", "sparse_to_indicator"])
 @deprecation.deprecated_endpoints("sparse_to_indicator")
 def sparse_to_indicator(sp_input, vocab_size, name=None):
   """Converts a `SparseTensor` of ids into a dense bool indicator tensor.
@@ -1229,8 +1505,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None):
         sp_new, default_value=False, validate_indices=False, name=name)
 
 
-@tf_export("sparse.merge", "sparse_merge")
-@deprecation.deprecated_endpoints("sparse_merge")
+@tf_export(v1=["sparse.merge", "sparse_merge"])
+@deprecation.deprecated(None, "No similar op available at this time.")
 def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
                  already_sorted=False):
   """Combines a batch of feature ids and values into a single `SparseTensor`.
@@ -1374,7 +1650,7 @@ def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
         sorted_result.indices, sorted_result.values, new_shape)
 
 
-@tf_export("sparse.retain", "sparse_retain")
+@tf_export("sparse.retain", v1=["sparse.retain", "sparse_retain"])
 @deprecation.deprecated_endpoints("sparse_retain")
 def sparse_retain(sp_input, to_retain):
   """Retains specified non-empty values within a `SparseTensor`.
@@ -1410,7 +1686,9 @@ def sparse_retain(sp_input, to_retain):
   # Shape checking, if shape is known at graph construction time
   retain_shape = to_retain.get_shape()
   retain_shape.assert_has_rank(1)
-  sp_input.values.get_shape()[0].merge_with(retain_shape[0])
+  if sp_input.values.get_shape().dims is not None:
+    sp_input.values.get_shape().dims[0].merge_with(
+        tensor_shape.dimension_at_index(retain_shape, 0))
 
   where_true = array_ops.reshape(array_ops.where(to_retain), [-1])
   new_indices = array_ops.gather(sp_input.indices, where_true)
@@ -1419,7 +1697,8 @@ def sparse_retain(sp_input, to_retain):
                                     array_ops.identity(sp_input.dense_shape))
 
 
-@tf_export("sparse.reset_shape", "sparse_reset_shape")
+@tf_export(
+    "sparse.reset_shape", v1=["sparse.reset_shape", "sparse_reset_shape"])
 @deprecation.deprecated_endpoints("sparse_reset_shape")
 def sparse_reset_shape(sp_input, new_shape=None):
   """Resets the shape of a `SparseTensor` with indices and values unchanged.
@@ -1521,7 +1800,9 @@ def sparse_reset_shape(sp_input, new_shape=None):
   return sparse_tensor.SparseTensor(in_indices, in_values, output_shape_tensor)
 
 
-@tf_export("sparse.fill_empty_rows", "sparse_fill_empty_rows")
+@tf_export(
+    "sparse.fill_empty_rows",
+    v1=["sparse.fill_empty_rows", "sparse_fill_empty_rows"])
 @deprecation.deprecated_endpoints("sparse_fill_empty_rows")
 def sparse_fill_empty_rows(sp_input, default_value, name=None):
   """Fills empty rows in the input 2-D `SparseTensor` with a default value.
@@ -1586,7 +1867,7 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
         dense_shape=sp_input.dense_shape), empty_row_indicator)
 
 
-@tf_export("io.serialize_sparse", "serialize_sparse")
+@tf_export(v1=["io.serialize_sparse", "serialize_sparse"])
 @deprecation.deprecated_endpoints("serialize_sparse")
 def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
   """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object.
@@ -1600,6 +1881,25 @@ def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
     A 3-vector (1-D `Tensor`), with each column representing the serialized
     `SparseTensor`'s indices, values, and shape (respectively).
 
+  Raises:
+    TypeError: If `sp_input` is not a `SparseTensor`.
+  """
+  return serialize_sparse_v2(sp_input, out_type, name)
+
+
+@tf_export("io.serialize_sparse", v1=[])
+def serialize_sparse_v2(sp_input, out_type=dtypes.string, name=None):
+  """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object.
+
+  Args:
+    sp_input: The input `SparseTensor`.
+    out_type: The `dtype` to use for serialization.
+    name: A name prefix for the returned tensors (optional).
+
+  Returns:
+    A 3-vector (1-D `Tensor`), with each column representing the serialized
+    `SparseTensor`'s indices, values, and shape (respectively).
+
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
   """
@@ -1613,7 +1913,7 @@ def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
       out_type=out_type)
 
 
-@tf_export("io.serialize_many_sparse", "serialize_many_sparse")
+@tf_export(v1=["io.serialize_many_sparse", "serialize_many_sparse"])
 @deprecation.deprecated_endpoints("serialize_many_sparse")
 def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string):
   """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`.
@@ -1636,6 +1936,34 @@ def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string):
     represents serialized `SparseTensor`'s indices, values, and shape
     (respectively).
 
+  Raises:
+    TypeError: If `sp_input` is not a `SparseTensor`.
+  """
+  return serialize_many_sparse_v2(sp_input, out_type, name)
+
+
+@tf_export("io.serialize_many_sparse", v1=[])
+def serialize_many_sparse_v2(sp_input, out_type=dtypes.string, name=None):
+  """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`.
+
+  The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+  is treated as the minibatch dimension.  Elements of the `SparseTensor`
+  must be sorted in increasing order of this first dimension.  The serialized
+  `SparseTensor` objects going into each row of the output `Tensor` will have
+  rank `R-1`.
+
+  The minibatch size `N` is extracted from `sparse_shape[0]`.
+
+  Args:
+    sp_input: The input rank `R` `SparseTensor`.
+    out_type: The `dtype` to use for serialization.
+    name: A name prefix for the returned tensors (optional).
+
+  Returns:
+    A matrix (2-D `Tensor`) with `N` rows and `3` columns. Each column
+    represents serialized `SparseTensor`'s indices, values, and shape
+    (respectively).
+
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
   """
@@ -1715,7 +2043,9 @@ def deserialize_sparse(serialized_sparse, dtype, rank=None, name=None):
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
 
 
-@tf_export("io.deserialize_many_sparse", "deserialize_many_sparse")
+@tf_export(
+    "io.deserialize_many_sparse",
+    v1=["io.deserialize_many_sparse", "deserialize_many_sparse"])
 @deprecation.deprecated_endpoints("deserialize_many_sparse")
 def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
   """Deserialize and concatenate `SparseTensors` from a serialized minibatch.
@@ -1786,7 +2116,9 @@ def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
 
 
-@tf_export("sparse.matmul", "sparse_tensor_dense_matmul")
+@tf_export("sparse.sparse_dense_matmul",
+           v1=["sparse.sparse_dense_matmul", "sparse.matmul",
+               "sparse_tensor_dense_matmul"])
 @deprecation.deprecated_endpoints("sparse_tensor_dense_matmul")
 def sparse_tensor_dense_matmul(sp_a,
                                b,
@@ -2004,7 +2336,7 @@ def sparse_tensor_dense_matmul(sp_a,
         adjoint_b=adjoint_b)
 
 
-@tf_export("sparse.softmax", "sparse_softmax")
+@tf_export("sparse.softmax", v1=["sparse.softmax", "sparse_softmax"])
 @deprecation.deprecated_endpoints("sparse_softmax")
 def sparse_softmax(sp_input, name=None):
   """Applies softmax to a batched N-D `SparseTensor`.
@@ -2060,7 +2392,7 @@ def sparse_softmax(sp_input, name=None):
                                       sp_input.dense_shape)
 
 
-@tf_export("sparse.maximum", "sparse_maximum")
+@tf_export("sparse.maximum", v1=["sparse.maximum", "sparse_maximum"])
 @deprecation.deprecated_endpoints("sparse_maximum")
 def sparse_maximum(sp_a, sp_b, name=None):
   """Returns the element-wise max of two SparseTensors.
@@ -2098,7 +2430,7 @@ def sparse_maximum(sp_a, sp_b, name=None):
   return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.dense_shape)
 
 
-@tf_export("sparse.minimum", "sparse_minimum")
+@tf_export("sparse.minimum", v1=["sparse.minimum", "sparse_minimum"])
 @deprecation.deprecated_endpoints("sparse_minimum")
 def sparse_minimum(sp_a, sp_b, name=None):
   """Returns the element-wise min of two SparseTensors.
@@ -2136,7 +2468,7 @@ def sparse_minimum(sp_a, sp_b, name=None):
   return sparse_tensor.SparseTensor(out_indices, out_values, sp_a.dense_shape)
 
 
-@tf_export("sparse.transpose", "sparse_transpose")
+@tf_export("sparse.transpose", v1=["sparse.transpose", "sparse_transpose"])
 @deprecation.deprecated_endpoints("sparse_transpose")
 def sparse_transpose(sp_input, perm=None, name=None):
   """Transposes a `SparseTensor`
@@ -2350,3 +2682,48 @@ def _take_many_sparse_from_tensors_map(sparse_map_op,
   output_shape.set_shape([rank])
 
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
+
+
+class _UnaryMapValueDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for unary ops that maps base function across sparse values."""
+
+  def __init__(self, original_func):
+    self._original_func = original_func
+    func_name = get_canonical_name_for_symbol(original_func)
+    arg_names = tf_inspect.getfullargspec(original_func)[0]
+    self._x = arg_names[0]
+    original_func.__doc__ = (
+        original_func.__doc__.rstrip() + "\n\n" +
+        ("    If `{x}` is a `SparseTensor`, returns\n"
+         "    `SparseTensor({x}.indices, tf.{func}({x}.values, ...), "
+         "{x}.dense_shape)`").format(x=self._x, func=func_name))
+
+  def handle(self, args, kwargs):
+    if args:
+      x, args = args[0], args[1:]
+    else:
+      kwargs = kwargs.copy()
+      x = kwargs.pop(self._x, None)
+    if isinstance(x, sparse_tensor.SparseTensor):
+      return sparse_tensor.SparseTensor(
+          indices=x.indices,
+          values=self._original_func(x.values, *args, **kwargs),
+          dense_shape=x.dense_shape)
+    else:
+      return self.NOT_SUPPORTED
+
+
+_UNARY_OPS = [
+    # TODO(b/120307967) Add dispatchers for additional TensorFlow ops.
+    math_ops.abs,
+    math_ops.negative,
+    math_ops.sign,
+    math_ops.square,
+    math_ops.sqrt,
+    math_ops.erf,
+    math_ops.tanh,
+    math_ops.bessel_i0e,
+    math_ops.bessel_i1e,
+]
+for unary_op in _UNARY_OPS:
+  _UnaryMapValueDispatcher(unary_op).register(unary_op)
diff --git a/tensorflow/python/ops/sparse_ops_test.py b/tensorflow/python/ops/sparse_ops_test.py
index 4ee1569249b5ccd3b38de7bb6c2bb5bce761c513..031069a0f017c5d7e80999d2aa6a3e5fd2cf10e6 100644
--- a/tensorflow/python/ops/sparse_ops_test.py
+++ b/tensorflow/python/ops/sparse_ops_test.py
@@ -18,18 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import googletest
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class SparseOpsTest(test_util.TensorFlowTestCase):
+class SparseOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testSparseEye(self):
     def test_one(n, m, as_tensors):
@@ -77,5 +79,23 @@ class SparseOpsTest(test_util.TensorFlowTestCase):
           d = sparse_ops.sparse_to_dense(s.indices, s.dense_shape, s.values)
           self.assertAllEqual(self.evaluate(d), expected_after)
 
+  @parameterized.parameters([
+      (math_ops.abs, [1.0, -1.0, 3.0, -4.0], [1.0, 1.0, 3.0, 4.0]),
+      (math_ops.negative, [1.0, -1.0, 3.0, -4.0], [-1.0, 1.0, -3.0, 4.0]),
+      (math_ops.sign, [3.0, -2.0, 0.0, -4.0], [1.0, -1.0, 0.0, -1.0]),
+      (math_ops.square, [1.0, -1.0, 3.0, -4.0], [1.0, 1.0, 9.0, 16.0]),
+  ])
+  def testUnarySparseDispatch(self, op, values, expected):
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [0, 1], [2, 0], [2, 4]],
+        values=values,
+        dense_shape=[3, 6])
+    result = op(st)
+    result_value = self.evaluate(result)
+    self.assertAllEqual(result_value.indices, st.indices)
+    self.assertAllEqual(result_value.values, expected)
+    self.assertAllEqual(result_value.dense_shape, st.dense_shape)
+
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index cfab94389615fe95daf88517a8490a550572801b..21f4996798eda29c8c9090c12b096d888c0b12d8 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -34,7 +34,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 # TODO(b/27419586) Change docstring for required dtype of x once int allowed
-@tf_export('math.lbeta', 'lbeta')
+@tf_export('math.lbeta', v1=['math.lbeta', 'lbeta'])
 @deprecation.deprecated_endpoints('lbeta')
 def lbeta(x, name=None):
   r"""Computes \\(ln(|Beta(x)|)\\), reducing along the last dimension.
@@ -70,8 +70,7 @@ def lbeta(x, name=None):
     x = ops.convert_to_tensor(x, name='x')
 
     # Note reduce_sum([]) = 0.
-    log_prod_gamma_x = math_ops.reduce_sum(
-        math_ops.lgamma(x), reduction_indices=[-1])
+    log_prod_gamma_x = math_ops.reduce_sum(math_ops.lgamma(x), axis=[-1])
 
     # Note lgamma(0) = infinity, so if x = []
     # log_gamma_sum_x = lgamma(0) = infinity, and
@@ -182,7 +181,6 @@ def einsum(equation, *inputs, **kwargs):
   * Ellipses (subscripts like `ij...,jk...->ik...`)
   * Subscripts where an axis appears more than once for a single input
     (e.g. `ijj,k->ik`).
-  * Subscripts that are summed across multiple inputs (e.g., `ij,ij,jk->ik`).
 
   Args:
     equation: a `str` describing the contraction, in the same format as
@@ -238,6 +236,13 @@ def einsum(equation, *inputs, **kwargs):
       output_axis_labels = ''.join(
           sorted(ax for ax in indices if counts[ax] == 1))
 
+    for a in axis_labels:
+      for input_labels in input_axis_labels:
+        if input_labels.count(a) > 1:
+          raise ValueError(
+              'Subscript not supported: an axis appears more than once: %s' %
+              input_labels)
+
     for a in axis_labels:
       input_count = sum(1 for s in input_axis_labels if a in s)
       if input_count > 2 and a not in output_axis_labels:
@@ -258,11 +263,11 @@ def einsum(equation, *inputs, **kwargs):
 
     missing_indices = set(temp_axis_labels) - set(output_axis_labels)
     if missing_indices:
-      reduction_indices = [
+      axis = [
           i for i, a in enumerate(temp_axis_labels)
           if a not in output_axis_labels
       ]
-      temp = math_ops.reduce_sum(temp, reduction_indices=reduction_indices)
+      temp = math_ops.reduce_sum(temp, axis=axis)
       temp_axis_labels = ''.join(
           a for a in temp_axis_labels if a in output_axis_labels)
 
@@ -413,7 +418,7 @@ def _reshape_if_necessary(tensor, new_shape):
   """Like reshape(), but avoids creating a new tensor if possible."""
   # Accept None as an alias for -1 in new_shape.
   new_shape = tuple(-1 if x is None else x for x in new_shape)
-  cur_shape = tuple(x.value for x in tensor.get_shape())
+  cur_shape = tuple(x.value for x in tensor.get_shape().dims)
   if (len(new_shape) == len(cur_shape) and
       all(d0 == d1 or d1 == -1 for d0, d1 in zip(cur_shape, new_shape))):
     return tensor
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 9bc4098d5b63c3e8ee4f9c14332e65b3d2875d8b..94aaebed951a96a4aade8d05d36b3366e59708a5 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -39,24 +39,26 @@ class LBetaTest(test.TestCase):
     # Should evaluate to 1 and 1/2.
     x_one = [1, 1.]
     x_one_half = [2, 1.]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllClose(
           1, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_one))))
       self.assertAllClose(
           0.5, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_one_half))))
       self.assertEqual([], special_math_ops.lbeta(x_one).get_shape())
 
+  @test_util.run_deprecated_v1
   def test_one_dimensional_arg_dynamic(self):
     # Should evaluate to 1 and 1/2.
     x_one = [1, 1.]
     x_one_half = [2, 1.]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ph = array_ops.placeholder(dtypes.float32)
       beta_ph = math_ops.exp(special_math_ops.lbeta(ph))
       self.assertAllClose(1, beta_ph.eval(feed_dict={ph: x_one}))
       self.assertAllClose(0.5,
                           beta_ph.eval(feed_dict={ph: x_one_half}))
 
+  @test_util.run_deprecated_v1
   def test_four_dimensional_arg_with_partial_shape_dynamic(self):
     x_ = np.ones((3, 2, 3, 4))
     # Gamma(1) = 0! = 1
@@ -65,7 +67,7 @@ class LBetaTest(test.TestCase):
     #     = Gamma(1) * Gamma(1) * Gamma(1) * Gamma(1) / Gamma(1 + 1 + 1 + 1)
     #     = 1 / 6
     expected_beta_x = 1 / 6 * np.ones((3, 2, 3))
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x_ph = array_ops.placeholder(dtypes.float32, [3, 2, 3, None])
       beta_ph = math_ops.exp(special_math_ops.lbeta(x_ph))
       self.assertAllClose(expected_beta_x,
@@ -75,16 +77,17 @@ class LBetaTest(test.TestCase):
   def test_two_dimensional_arg(self):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllClose(
           [0.5, 0.5],
           self.evaluate(math_ops.exp(special_math_ops.lbeta(x_one_half))))
       self.assertEqual((2,), special_math_ops.lbeta(x_one_half).get_shape())
 
+  @test_util.run_deprecated_v1
   def test_two_dimensional_arg_dynamic(self):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       ph = array_ops.placeholder(dtypes.float32)
       beta_ph = math_ops.exp(special_math_ops.lbeta(ph))
       self.assertAllClose([0.5, 0.5],
@@ -94,7 +97,7 @@ class LBetaTest(test.TestCase):
   def test_two_dimensional_proper_shape(self):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllClose(
           [0.5, 0.5],
           self.evaluate(math_ops.exp(special_math_ops.lbeta(x_one_half))))
@@ -107,7 +110,7 @@ class LBetaTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_complicated_shape(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = ops.convert_to_tensor(np.random.rand(3, 2, 2))
       self.assertAllEqual(
           (3, 2), self.evaluate(array_ops.shape(special_math_ops.lbeta(x))))
@@ -121,7 +124,7 @@ class LBetaTest(test.TestCase):
     # as the answer, always.
     x_a = [5.5]
     x_b = [0.1]
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       self.assertAllClose(
           1, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_a))))
       self.assertAllClose(
@@ -130,7 +133,7 @@ class LBetaTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_empty_rank1_returns_negative_infinity(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       x = constant_op.constant([], shape=[0])
       lbeta_x = special_math_ops.lbeta(x)
       expected_result = constant_op.constant(-np.inf, shape=())
@@ -141,7 +144,7 @@ class LBetaTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_empty_rank2_with_zero_last_dim_returns_negative_infinity(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       event_size = 0
       for batch_size in [0, 1, 2]:
         x = constant_op.constant([], shape=[batch_size, event_size])
@@ -154,7 +157,7 @@ class LBetaTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_empty_rank2_with_zero_batch_dim_returns_empty(self):
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       batch_size = 0
       for event_size in [0, 1, 2]:
         x = constant_op.constant([], shape=[batch_size, event_size])
@@ -240,7 +243,7 @@ class EinsumTest(test.TestCase):
       'aef,fbc,dca->bde',
       'iJ,Jk->ik',
       'iJ,Ki->JK',
-      'iJk,Jklm->Jk'
+      'iJk,Jklm->Jk',
       'ij, jk, kl -> il',
       'a, ab, abc -> abc',
       'ab, ab, cd, cd, ef, ef -> ',
@@ -280,7 +283,7 @@ class EinsumTest(test.TestCase):
 
   dim_mismatch_cases = [('ijk,jkl->il', [(2, 3, 4), (3, 5, 6)])]
 
-  def disabled_test_simple(self):
+  def test_simple(self):
     for case in self.simple_cases:
       self.run_test(case)
 
@@ -288,6 +291,7 @@ class EinsumTest(test.TestCase):
     for case in self.long_cases:
       self.run_test(case)
 
+  @test_util.run_deprecated_v1
   def test_invalid(self):
     for axes in self.invalid_cases:
       inputs = [
@@ -297,6 +301,7 @@ class EinsumTest(test.TestCase):
       with self.assertRaises(ValueError):
         _ = special_math_ops.einsum(axes, *inputs)
 
+  @test_util.run_deprecated_v1
   def test_invalid_keyword_arguments(self):
     m0 = array_ops.placeholder(dtypes.int32, shape=(1, None))
     m1 = array_ops.placeholder(dtypes.int32, shape=(None, 1))
@@ -311,6 +316,13 @@ class EinsumTest(test.TestCase):
           invalid1='value1',
           invalid2='value2')
 
+  @test_util.run_deprecated_v1
+  def test_repeated_axis_single_input(self):
+    x = array_ops.placeholder(dtypes.float32, shape=[2, 2])
+    with self.assertRaises(ValueError):
+      _ = special_math_ops.einsum('ii->', x)
+
+  @test_util.run_deprecated_v1
   def test_dim_mismatch(self):
     for axes, input_shapes in self.dim_mismatch_cases:
       inputs = [
@@ -333,7 +345,7 @@ class EinsumTest(test.TestCase):
     input_tensors = [constant_op.constant(val) for val in input_vals]
     output_tensor = special_math_ops.einsum(axes, *input_tensors)
 
-    with self.test_session(use_gpu=True):
+    with self.session(use_gpu=True):
       output_value = self.evaluate(output_tensor)
 
     correct_value = np.einsum(axes, *input_vals)
diff --git a/tensorflow/python/ops/spectral_grad.py b/tensorflow/python/ops/spectral_grad.py
deleted file mode 100644
index 0af24114acbe5fa6283191f9d71e32805eba3f29..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/spectral_grad.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Gradients for operators defined in spectral_ops.py."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
-
-
-def _FFTSizeForGrad(grad, rank):
-  return math_ops.reduce_prod(array_ops.shape(grad)[-rank:])
-
-
-@ops.RegisterGradient("FFT")
-def _FFTGrad(_, grad):
-  size = math_ops.cast(_FFTSizeForGrad(grad, 1), grad.dtype)
-  return spectral_ops.ifft(grad) * size
-
-
-@ops.RegisterGradient("IFFT")
-def _IFFTGrad(_, grad):
-  rsize = math_ops.cast(
-      1. / math_ops.cast(_FFTSizeForGrad(grad, 1), grad.dtype.real_dtype),
-      grad.dtype)
-  return spectral_ops.fft(grad) * rsize
-
-
-@ops.RegisterGradient("FFT2D")
-def _FFT2DGrad(_, grad):
-  size = math_ops.cast(_FFTSizeForGrad(grad, 2), grad.dtype)
-  return spectral_ops.ifft2d(grad) * size
-
-
-@ops.RegisterGradient("IFFT2D")
-def _IFFT2DGrad(_, grad):
-  rsize = math_ops.cast(
-      1. / math_ops.cast(_FFTSizeForGrad(grad, 2), grad.dtype.real_dtype),
-      grad.dtype)
-  return spectral_ops.fft2d(grad) * rsize
-
-
-@ops.RegisterGradient("FFT3D")
-def _FFT3DGrad(_, grad):
-  size = math_ops.cast(_FFTSizeForGrad(grad, 3), grad.dtype)
-  return spectral_ops.ifft3d(grad) * size
-
-
-@ops.RegisterGradient("IFFT3D")
-def _IFFT3DGrad(_, grad):
-  rsize = math_ops.cast(
-      1. / math_ops.cast(_FFTSizeForGrad(grad, 3), grad.dtype.real_dtype),
-      grad.dtype)
-  return spectral_ops.fft3d(grad) * rsize
-
-
-def _RFFTGradHelper(rank, irfft_fn):
-  """Returns a gradient function for an RFFT of the provided rank."""
-  # Can't happen because we don't register a gradient for RFFT3D.
-  assert rank in (1, 2), "Gradient for RFFT3D is not implemented."
-
-  def _Grad(op, grad):
-    """A gradient function for RFFT with the provided `rank` and `irfft_fn`."""
-    fft_length = op.inputs[1]
-    input_shape = array_ops.shape(op.inputs[0])
-    is_even = math_ops.cast(1 - (fft_length[-1] % 2), dtypes.complex64)
-
-    def _TileForBroadcasting(matrix, t):
-      expanded = array_ops.reshape(
-          matrix,
-          array_ops.concat([
-              array_ops.ones([array_ops.rank(t) - 2], dtypes.int32),
-              array_ops.shape(matrix)
-          ], 0))
-      return array_ops.tile(
-          expanded, array_ops.concat([array_ops.shape(t)[:-2], [1, 1]], 0))
-
-    def _MaskMatrix(length):
-      # TODO(rjryan): Speed up computation of twiddle factors using the
-      # following recurrence relation and cache them across invocations of RFFT.
-      #
-      # t_n = exp(sqrt(-1) * pi * n^2 / line_len)
-      # for n = 0, 1,..., line_len-1.
-      # For n > 2, use t_n = t_{n-1}^2 / t_{n-2} * t_1^2
-      a = array_ops.tile(
-          array_ops.expand_dims(math_ops.range(length), 0), (length, 1))
-      b = array_ops.transpose(a, [1, 0])
-      return math_ops.exp(-2j * np.pi * math_ops.cast(a * b, dtypes.complex64) /
-                          math_ops.cast(length, dtypes.complex64))
-
-    def _YMMask(length):
-      """A sequence of [1+0j, -1+0j, 1+0j, -1+0j, ...] with length `length`."""
-      return math_ops.cast(1 - 2 * (math_ops.range(length) % 2),
-                           dtypes.complex64)
-
-    y0 = grad[..., 0:1]
-    if rank == 1:
-      ym = grad[..., -1:]
-      extra_terms = y0 + is_even * ym * _YMMask(input_shape[-1])
-    elif rank == 2:
-      # Create a mask matrix for y0 and ym.
-      base_mask = _MaskMatrix(input_shape[-2])
-
-      # Tile base_mask to match y0 in shape so that we can batch-matmul the
-      # inner 2 dimensions.
-      tiled_mask = _TileForBroadcasting(base_mask, y0)
-
-      y0_term = math_ops.matmul(tiled_mask, math_ops.conj(y0))
-      extra_terms = y0_term
-
-      ym = grad[..., -1:]
-      ym_term = math_ops.matmul(tiled_mask, math_ops.conj(ym))
-
-      inner_dim = input_shape[-1]
-      ym_term = array_ops.tile(
-          ym_term,
-          array_ops.concat([
-              array_ops.ones([array_ops.rank(grad) - 1], dtypes.int32),
-              [inner_dim]
-          ], 0)) * _YMMask(inner_dim)
-
-      extra_terms += is_even * ym_term
-
-    # The gradient of RFFT is the IRFFT of the incoming gradient times a scaling
-    # factor, plus some additional terms to make up for the components dropped
-    # due to Hermitian symmetry.
-    input_size = math_ops.to_float(_FFTSizeForGrad(op.inputs[0], rank))
-    irfft = irfft_fn(grad, fft_length)
-    return 0.5 * (irfft * input_size + math_ops.real(extra_terms)), None
-
-  return _Grad
-
-
-def _IRFFTGradHelper(rank, rfft_fn):
-  """Returns a gradient function for an IRFFT of the provided rank."""
-  # Can't happen because we don't register a gradient for IRFFT3D.
-  assert rank in (1, 2), "Gradient for IRFFT3D is not implemented."
-
-  def _Grad(op, grad):
-    """A gradient function for IRFFT with the provided `rank` and `rfft_fn`."""
-    # Generate a simple mask like [1.0, 2.0, ..., 2.0, 1.0] for even-length FFTs
-    # and [1.0, 2.0, ..., 2.0] for odd-length FFTs. To reduce extra ops in the
-    # graph we special-case the situation where the FFT length and last
-    # dimension of the input are known at graph construction time.
-    fft_length = op.inputs[1]
-    is_odd = math_ops.mod(fft_length[-1], 2)
-    input_last_dimension = array_ops.shape(op.inputs[0])[-1]
-    mask = array_ops.concat(
-        [[1.0], 2.0 * array_ops.ones([input_last_dimension - 2 + is_odd]),
-         array_ops.ones([1 - is_odd])], 0)
-
-    rsize = math_ops.reciprocal(math_ops.to_float(_FFTSizeForGrad(grad, rank)))
-
-    # The gradient of IRFFT is the RFFT of the incoming gradient times a scaling
-    # factor and a mask. The mask scales the gradient for the Hermitian
-    # symmetric components of the RFFT by a factor of two, since these
-    # components are de-duplicated in the RFFT.
-    rfft = rfft_fn(grad, fft_length)
-    return rfft * math_ops.cast(rsize * mask, dtypes.complex64), None
-
-  return _Grad
-
-
-ops.RegisterGradient("RFFT")(_RFFTGradHelper(1, spectral_ops.irfft))
-ops.RegisterGradient("IRFFT")(_IRFFTGradHelper(1, spectral_ops.rfft))
-ops.RegisterGradient("RFFT2D")(_RFFTGradHelper(2, spectral_ops.irfft2d))
-ops.RegisterGradient("IRFFT2D")(_IRFFTGradHelper(2, spectral_ops.rfft2d))
diff --git a/tensorflow/python/ops/spectral_ops.py b/tensorflow/python/ops/spectral_ops.py
deleted file mode 100644
index da5884e74626b493fb71c50ff040ce4fc4a97ce3..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/spectral_ops.py
+++ /dev/null
@@ -1,306 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Spectral operators (e.g. DCT, FFT, RFFT)."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math as _math
-
-from tensorflow.python.framework import dtypes as _dtypes
-from tensorflow.python.framework import ops as _ops
-from tensorflow.python.framework import tensor_util as _tensor_util
-from tensorflow.python.ops import array_ops as _array_ops
-from tensorflow.python.ops import gen_spectral_ops
-from tensorflow.python.ops import math_ops as _math_ops
-from tensorflow.python.util.tf_export import tf_export
-
-
-def _infer_fft_length_for_rfft(input_tensor, fft_rank):
-  """Infers the `fft_length` argument for a `rank` RFFT from `input_tensor`."""
-  # A TensorShape for the inner fft_rank dimensions.
-  fft_shape = input_tensor.get_shape()[-fft_rank:]
-
-  # If any dim is unknown, fall back to tensor-based math.
-  if not fft_shape.is_fully_defined():
-    return _array_ops.shape(input_tensor)[-fft_rank:]
-
-  # Otherwise, return a constant.
-  return _ops.convert_to_tensor(fft_shape.as_list(), _dtypes.int32)
-
-
-def _infer_fft_length_for_irfft(input_tensor, fft_rank):
-  """Infers the `fft_length` argument for a `rank` IRFFT from `input_tensor`."""
-  # A TensorShape for the inner fft_rank dimensions.
-  fft_shape = input_tensor.get_shape()[-fft_rank:]
-
-  # If any dim is unknown, fall back to tensor-based math.
-  if not fft_shape.is_fully_defined():
-    fft_length = _array_ops.unstack(_array_ops.shape(input_tensor)[-fft_rank:])
-    fft_length[-1] = _math_ops.maximum(0, 2 * (fft_length[-1] - 1))
-    return _array_ops.stack(fft_length)
-
-  # Otherwise, return a constant.
-  fft_length = fft_shape.as_list()
-  if fft_length:
-    fft_length[-1] = max(0, 2 * (fft_length[-1] - 1))
-  return _ops.convert_to_tensor(fft_length, _dtypes.int32)
-
-
-def _maybe_pad_for_rfft(input_tensor, fft_rank, fft_length, is_reverse=False):
-  """Pads `input_tensor` to `fft_length` on its inner-most `fft_rank` dims."""
-  fft_shape = _tensor_util.constant_value_as_shape(fft_length)
-
-  # Edge case: skip padding empty tensors.
-  if (input_tensor.shape.ndims is not None and
-      any(dim.value == 0 for dim in input_tensor.shape)):
-    return input_tensor
-
-  # If we know the shapes ahead of time, we can either skip or pre-compute the
-  # appropriate paddings. Otherwise, fall back to computing paddings in
-  # TensorFlow.
-  if fft_shape.is_fully_defined() and input_tensor.shape.ndims is not None:
-    # Slice the last FFT-rank dimensions from input_tensor's shape.
-    input_fft_shape = input_tensor.shape[-fft_shape.ndims:]
-
-    if input_fft_shape.is_fully_defined():
-      # In reverse, we only pad the inner-most dimension to fft_length / 2 + 1.
-      if is_reverse:
-        fft_shape = fft_shape[:-1].concatenate(fft_shape[-1].value // 2 + 1)
-
-      paddings = [[0, max(fft_dim.value - input_dim.value, 0)]
-                  for fft_dim, input_dim in zip(fft_shape, input_fft_shape)]
-      if any(pad > 0 for _, pad in paddings):
-        outer_paddings = [[0, 0]] * max((input_tensor.shape.ndims -
-                                         fft_shape.ndims), 0)
-        return _array_ops.pad(input_tensor, outer_paddings + paddings)
-      return input_tensor
-
-  # If we can't determine the paddings ahead of time, then we have to pad. If
-  # the paddings end up as zero, tf.pad has a special-case that does no work.
-  input_rank = _array_ops.rank(input_tensor)
-  input_fft_shape = _array_ops.shape(input_tensor)[-fft_rank:]
-  outer_dims = _math_ops.maximum(0, input_rank - fft_rank)
-  outer_paddings = _array_ops.zeros([outer_dims], fft_length.dtype)
-  # In reverse, we only pad the inner-most dimension to fft_length / 2 + 1.
-  if is_reverse:
-    fft_length = _array_ops.concat([fft_length[:-1],
-                                    fft_length[-1:] // 2 + 1], 0)
-  fft_paddings = _math_ops.maximum(0, fft_length - input_fft_shape)
-  paddings = _array_ops.concat([outer_paddings, fft_paddings], 0)
-  paddings = _array_ops.stack([_array_ops.zeros_like(paddings), paddings],
-                              axis=1)
-  return _array_ops.pad(input_tensor, paddings)
-
-
-def _rfft_wrapper(fft_fn, fft_rank, default_name):
-  """Wrapper around gen_spectral_ops.rfft* that infers fft_length argument."""
-
-  def _rfft(input_tensor, fft_length=None, name=None):
-    with _ops.name_scope(name, default_name,
-                         [input_tensor, fft_length]) as name:
-      input_tensor = _ops.convert_to_tensor(input_tensor, _dtypes.float32)
-      input_tensor.shape.with_rank_at_least(fft_rank)
-      if fft_length is None:
-        fft_length = _infer_fft_length_for_rfft(input_tensor, fft_rank)
-      else:
-        fft_length = _ops.convert_to_tensor(fft_length, _dtypes.int32)
-      input_tensor = _maybe_pad_for_rfft(input_tensor, fft_rank, fft_length)
-      return fft_fn(input_tensor, fft_length, name)
-  _rfft.__doc__ = fft_fn.__doc__
-  return _rfft
-
-
-def _irfft_wrapper(ifft_fn, fft_rank, default_name):
-  """Wrapper around gen_spectral_ops.irfft* that infers fft_length argument."""
-
-  def _irfft(input_tensor, fft_length=None, name=None):
-    with _ops.name_scope(name, default_name,
-                         [input_tensor, fft_length]) as name:
-      input_tensor = _ops.convert_to_tensor(input_tensor, _dtypes.complex64)
-      input_tensor.shape.with_rank_at_least(fft_rank)
-      if fft_length is None:
-        fft_length = _infer_fft_length_for_irfft(input_tensor, fft_rank)
-      else:
-        fft_length = _ops.convert_to_tensor(fft_length, _dtypes.int32)
-      input_tensor = _maybe_pad_for_rfft(input_tensor, fft_rank, fft_length,
-                                         is_reverse=True)
-      return ifft_fn(input_tensor, fft_length, name)
-  _irfft.__doc__ = ifft_fn.__doc__
-  return _irfft
-
-
-fft = gen_spectral_ops.fft
-ifft = gen_spectral_ops.ifft
-fft2d = gen_spectral_ops.fft2d
-ifft2d = gen_spectral_ops.ifft2d
-fft3d = gen_spectral_ops.fft3d
-ifft3d = gen_spectral_ops.ifft3d
-rfft = _rfft_wrapper(gen_spectral_ops.rfft, 1, "rfft")
-tf_export("spectral.rfft")(rfft)
-irfft = _irfft_wrapper(gen_spectral_ops.irfft, 1, "irfft")
-tf_export("spectral.irfft")(irfft)
-rfft2d = _rfft_wrapper(gen_spectral_ops.rfft2d, 2, "rfft2d")
-tf_export("spectral.rfft2d")(rfft2d)
-irfft2d = _irfft_wrapper(gen_spectral_ops.irfft2d, 2, "irfft2d")
-tf_export("spectral.irfft2d")(irfft2d)
-rfft3d = _rfft_wrapper(gen_spectral_ops.rfft3d, 3, "rfft3d")
-tf_export("spectral.rfft3d")(rfft3d)
-irfft3d = _irfft_wrapper(gen_spectral_ops.irfft3d, 3, "irfft3d")
-tf_export("spectral.irfft3d")(irfft3d)
-
-
-def _validate_dct_arguments(dct_type, n, axis, norm):
-  if n is not None:
-    raise NotImplementedError("The DCT length argument is not implemented.")
-  if axis != -1:
-    raise NotImplementedError("axis must be -1. Got: %s" % axis)
-  if dct_type not in (2, 3):
-    raise ValueError("Only Types II and III (I)DCT are supported.")
-  if norm not in (None, "ortho"):
-    raise ValueError(
-        "Unknown normalization. Expected None or 'ortho', got: %s" % norm)
-
-
-# TODO(rjryan): Implement `type`, `n` and `axis` parameters.
-@tf_export("spectral.dct")
-def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
-  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.
-
-  Currently only Types II and III are supported. Type II is implemented using a
-  length `2N` padded `tf.spectral.rfft`, as described here:
-  https://dsp.stackexchange.com/a/10606. Type III is a fairly straightforward
-  inverse of Type II (i.e. using a length `2N` padded `tf.spectral.irfft`).
-
-  @compatibility(scipy)
-  Equivalent to scipy.fftpack.dct for Type-II and Type-III DCT.
-  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
-  @end_compatibility
-
-  Args:
-    input: A `[..., samples]` `float32` `Tensor` containing the signals to
-      take the DCT of.
-    type: The DCT type to perform. Must be 2 or 3.
-    n: For future expansion. The length of the transform. Must be `None`.
-    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
-    norm: The normalization to apply. `None` for no normalization or `'ortho'`
-      for orthonormal normalization.
-    name: An optional name for the operation.
-
-  Returns:
-    A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.
-
-  Raises:
-    ValueError: If `type` is not `2` or `3`, `n` is not `None, `axis` is not
-      `-1`, or `norm` is not `None` or `'ortho'`.
-
-  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
-  """
-  _validate_dct_arguments(type, n, axis, norm)
-  with _ops.name_scope(name, "dct", [input]):
-    # We use the RFFT to compute the DCT and TensorFlow only supports float32
-    # for FFTs at the moment.
-    input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)
-
-    axis_dim = input.shape[-1].value or _array_ops.shape(input)[-1]
-    axis_dim_float = _math_ops.to_float(axis_dim)
-    if type == 2:
-      scale = 2.0 * _math_ops.exp(
-          _math_ops.complex(
-              0.0, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
-              axis_dim_float))
-
-      # TODO(rjryan): Benchmark performance and memory usage of the various
-      # approaches to computing a DCT via the RFFT.
-      dct2 = _math_ops.real(
-          rfft(input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale)
-
-      if norm == "ortho":
-        n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
-        n2 = n1 * _math_ops.sqrt(2.0)
-        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
-        weights = _array_ops.pad(
-            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
-            constant_values=n2)
-        dct2 *= weights
-
-      return dct2
-
-    elif type == 3:
-      if norm == "ortho":
-        n1 = _math_ops.sqrt(axis_dim_float)
-        n2 = n1 * _math_ops.sqrt(0.5)
-        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
-        weights = _array_ops.pad(
-            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
-            constant_values=n2)
-        input *= weights
-      else:
-        input *= axis_dim_float
-      scale = 2.0 * _math_ops.exp(
-          _math_ops.complex(
-              0.0,
-              _math_ops.range(axis_dim_float) * _math.pi * 0.5 /
-              axis_dim_float))
-      dct3 = _math_ops.real(
-          irfft(
-              scale * _math_ops.complex(input, 0.0),
-              fft_length=[2 * axis_dim]))[..., :axis_dim]
-
-      return dct3
-
-
-# TODO(rjryan): Implement `type`, `n` and `axis` parameters.
-@tf_export("spectral.idct")
-def idct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
-  """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`.
-
-  Currently only Types II and III are supported. Type III is the inverse of
-  Type II, and vice versa.
-
-  Note that you must re-normalize by 1/(2n) to obtain an inverse if `norm` is
-  not `'ortho'`. That is:
-  `signal == idct(dct(signal)) * 0.5 / signal.shape[-1]`.
-  When `norm='ortho'`, we have:
-  `signal == idct(dct(signal, norm='ortho'), norm='ortho')`.
-
-  @compatibility(scipy)
-  Equivalent to scipy.fftpack.idct for Type-II and Type-III DCT.
-  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.idct.html
-  @end_compatibility
-
-  Args:
-    input: A `[..., samples]` `float32` `Tensor` containing the signals to take
-      the DCT of.
-    type: The IDCT type to perform. Must be 2 or 3.
-    n: For future expansion. The length of the transform. Must be `None`.
-    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
-    norm: The normalization to apply. `None` for no normalization or `'ortho'`
-      for orthonormal normalization.
-    name: An optional name for the operation.
-
-  Returns:
-    A `[..., samples]` `float32` `Tensor` containing the IDCT of `input`.
-
-  Raises:
-    ValueError: If `type` is not `2` or `3`, `n` is not `None, `axis` is not
-      `-1`, or `norm` is not `None` or `'ortho'`.
-
-  [idct]:
-  https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms
-  """
-  _validate_dct_arguments(type, n, axis, norm)
-  inverse_type = {2: 3, 3: 2}[type]
-  return dct(input, type=inverse_type, n=n, axis=axis, norm=norm, name=name)
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index d0e5f700254fa5273cb707e59ac0d141fdc13627..c614d072badbdf7927d6c889288e1cf4e8d988ef 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import math_grad
 from tensorflow.python.ops import random_grad
 from tensorflow.python.ops import sparse_grad
-from tensorflow.python.ops import spectral_grad
 from tensorflow.python.ops import state_grad
 from tensorflow.python.ops import tensor_array_grad
 
@@ -51,6 +50,7 @@ from tensorflow.python.ops.control_flow_ops import group
 from tensorflow.python.ops.control_flow_ops import no_op
 from tensorflow.python.ops.control_flow_ops import tuple  # pylint: disable=redefined-builtin
 # pylint: enable=redefined-builtin
+from tensorflow.python.eager import wrap_function
 from tensorflow.python.ops.control_flow_ops import while_loop
 from tensorflow.python.ops.data_flow_ops import *
 from tensorflow.python.ops.functional_ops import *
@@ -72,6 +72,7 @@ from tensorflow.python.ops.partitioned_variables import *
 from tensorflow.python.ops.random_ops import *
 from tensorflow.python.ops.script_ops import py_func
 from tensorflow.python.ops.session_ops import *
+from tensorflow.python.ops.sort_ops import *
 from tensorflow.python.ops.sparse_ops import *
 from tensorflow.python.ops.state_ops import assign
 from tensorflow.python.ops.state_ops import assign_add
@@ -91,6 +92,7 @@ from tensorflow.python.ops.state_ops import scatter_nd_sub
 # from tensorflow.python.ops.state_ops import scatter_nd_mul
 # from tensorflow.python.ops.state_ops import scatter_nd_div
 from tensorflow.python.ops.state_ops import scatter_nd_update
+from tensorflow.python.ops.stateless_random_ops import *
 from tensorflow.python.ops.string_ops import *
 from tensorflow.python.ops.template import *
 from tensorflow.python.ops.tensor_array_ops import *
@@ -98,4 +100,3 @@ from tensorflow.python.ops.variable_scope import *
 from tensorflow.python.ops.variables import *
 # pylint: enable=wildcard-import
 # pylint: enable=g-bad-import-order
-
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 920047f38b07e62ec832f2cf411d83180b6fa160..3ac69c1c202d71b91e42f0f4a5bdd80c881ef97d 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -15,7 +15,7 @@
 
 """Variables.
 
-See the [Variables](https://tensorflow.org/api_guides/python/state_ops) guide.
+See the [Variables](https://www.tensorflow.org/guide/variables) guide.
 """
 
 from __future__ import absolute_import
@@ -32,6 +32,7 @@ from tensorflow.python.ops import gen_state_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_state_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -595,7 +596,9 @@ def scatter_nd_sub(ref, indices, updates, use_locking=False, name=None):
       name=name))
 
 
-@tf_export("batch_scatter_update")
+@tf_export(v1=["batch_scatter_update"])
+@deprecation.deprecated(
+    "2018-11-29", "Use the batch_scatter_update method of Variable instead.")
 def batch_scatter_update(ref, indices, updates, use_locking=True, name=None):
   """Generalization of `tf.scatter_update` to axis different than 0.
 
diff --git a/tensorflow/python/ops/stateless_random_ops.py b/tensorflow/python/ops/stateless_random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b119049b163dd57aee08f078e5ab5ca913f61706
--- /dev/null
+++ b/tensorflow/python/ops/stateless_random_ops.py
@@ -0,0 +1,267 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stateless random ops which take seed as a tensor input."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_stateless_random_ops
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
+from tensorflow.python.util.tf_export import tf_export
+
+ops.NotDifferentiable("StatelessMultinomial")
+ops.NotDifferentiable("StatelessRandomNormal")
+ops.NotDifferentiable("StatelessRandomUniform")
+ops.NotDifferentiable("StatelessRandomUniformInt")
+ops.NotDifferentiable("StatelessTruncatedNormal")
+
+
+@tf_export("random.stateless_uniform")
+def stateless_random_uniform(shape,
+                             seed,
+                             minval=0,
+                             maxval=None,
+                             dtype=dtypes.float32,
+                             name=None):
+  """Outputs deterministic pseudorandom values from a uniform distribution.
+
+  This is a stateless version of `tf.random_uniform`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  The generated values follow a uniform distribution in the range
+  `[minval, maxval)`. The lower bound `minval` is included in the range, while
+  the upper bound `maxval` is excluded.
+
+  For floats, the default range is `[0, 1)`.  For ints, at least `maxval` must
+  be specified explicitly.
+
+  In the integer case, the random integers are slightly biased unless
+  `maxval - minval` is an exact power of two.  The bias is small for values of
+  `maxval - minval` significantly smaller than the range of the output (either
+  `2**32` or `2**64`).
+
+  Args:
+    shape: A 1-D integer Tensor or Python array. The shape of the output tensor.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    minval: A 0-D Tensor or Python value of type `dtype`. The lower bound on the
+      range of random values to generate.  Defaults to 0.
+    maxval: A 0-D Tensor or Python value of type `dtype`. The upper bound on the
+      range of random values to generate.  Defaults to 1 if `dtype` is floating
+      point.
+    dtype: The type of the output: `float16`, `float32`, `float64`, `int32`, or
+      `int64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tensor of the specified shape filled with random uniform values.
+
+  Raises:
+    ValueError: If `dtype` is integral and `maxval` is not specified.
+  """
+  dtype = dtypes.as_dtype(dtype)
+  if dtype not in (dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.int32, dtypes.int64):
+    raise ValueError("Invalid dtype %r" % dtype)
+  if maxval is None:
+    if dtype.is_integer:
+      raise ValueError("Must specify maxval for integer dtype %r" % dtype)
+    maxval = 1
+  with ops.name_scope(name, "stateless_random_uniform",
+                      [shape, seed, minval, maxval]) as name:
+    shape = random_ops._ShapeTensor(shape)  # pylint: disable=protected-access
+    minval = ops.convert_to_tensor(minval, dtype=dtype, name="min")
+    maxval = ops.convert_to_tensor(maxval, dtype=dtype, name="max")
+    if dtype.is_integer:
+      return gen_stateless_random_ops.stateless_random_uniform_int(
+          shape, seed=seed, minval=minval, maxval=maxval, name=name)
+    else:
+      rnd = gen_stateless_random_ops.stateless_random_uniform(
+          shape, seed=seed, dtype=dtype)
+      return math_ops.add(rnd * (maxval - minval), minval, name=name)
+
+
+@tf_export("random.stateless_normal")
+def stateless_random_normal(shape,
+                            seed,
+                            mean=0.0,
+                            stddev=1.0,
+                            dtype=dtypes.float32,
+                            name=None):
+  """Outputs deterministic pseudorandom values from a normal distribution.
+
+  This is a stateless version of `tf.random_normal`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  Args:
+    shape: A 1-D integer Tensor or Python array. The shape of the output tensor.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    mean: A 0-D Tensor or Python value of type `dtype`. The mean of the normal
+      distribution.
+    stddev: A 0-D Tensor or Python value of type `dtype`. The standard deviation
+      of the normal distribution.
+    dtype: The type of the output.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tensor of the specified shape filled with random normal values.
+  """
+  with ops.name_scope(name, "stateless_random_normal",
+                      [shape, seed, mean, stddev]) as name:
+    shape = random_ops._ShapeTensor(shape)  # pylint: disable=protected-access
+    mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
+    stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
+    rnd = gen_stateless_random_ops.stateless_random_normal(shape, seed, dtype)
+    return math_ops.add(rnd * stddev, mean, name=name)
+
+
+@tf_export("random.stateless_truncated_normal")
+def stateless_truncated_normal(shape,
+                               seed,
+                               mean=0.0,
+                               stddev=1.0,
+                               dtype=dtypes.float32,
+                               name=None):
+  """Outputs deterministic pseudorandom values, truncated normally distributed.
+
+  This is a stateless version of `tf.truncated_normal`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  The generated values follow a normal distribution with specified mean and
+  standard deviation, except that values whose magnitude is more than 2 standard
+  deviations from the mean are dropped and re-picked.
+
+  Args:
+    shape: A 1-D integer Tensor or Python array. The shape of the output tensor.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    mean: A 0-D Tensor or Python value of type `dtype`. The mean of the
+      truncated normal distribution.
+    stddev: A 0-D Tensor or Python value of type `dtype`. The standard deviation
+      of the normal distribution, before truncation.
+    dtype: The type of the output.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tensor of the specified shape filled with random truncated normal values.
+  """
+  with ops.name_scope(name, "stateless_truncated_normal",
+                      [shape, seed, mean, stddev]) as name:
+    shape = random_ops._ShapeTensor(shape)  # pylint: disable=protected-access
+    mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
+    stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
+    rnd = gen_stateless_random_ops.stateless_truncated_normal(
+        shape, seed, dtype)
+    return math_ops.add(rnd * stddev, mean, name=name)
+
+
+@tf_export(v1=["random.stateless_multinomial"])
+@deprecation.deprecated(
+    date=None, instructions="Use tf.random.stateless_categorical instead.")
+def stateless_multinomial(logits,
+                          num_samples,
+                          seed,
+                          output_dtype=dtypes.int64,
+                          name=None):
+  """Draws deterministic pseudorandom samples from a multinomial distribution.
+
+  This is a stateless version of `tf.multinomial`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  Example:
+
+  ```python
+  # samples has shape [1, 5], where each value is either 0 or 1 with equal
+  # probability.
+  samples = tf.random.stateless_multinomial(
+      tf.log([[10., 10.]]), 5, seed=[7, 17])
+  ```
+
+  Args:
+    logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
+      `[i, :]` represents the unnormalized log-probabilities for all classes.
+    num_samples: 0-D.  Number of independent samples to draw for each row slice.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    output_dtype: integer type to use for the output. Defaults to int64.
+    name: Optional name for the operation.
+
+  Returns:
+    The drawn samples of shape `[batch_size, num_samples]`.
+  """
+  with ops.name_scope(name, "stateless_multinomial", [logits, seed]):
+    return stateless_multinomial_categorical_impl(logits, num_samples,
+                                                  output_dtype, seed)
+
+
+@tf_export("random.stateless_categorical")
+def stateless_categorical(logits,
+                          num_samples,
+                          seed,
+                          dtype=dtypes.int64,
+                          name=None):
+  """Draws deterministic pseudorandom samples from a categorical distribution.
+
+  This is a stateless version of `tf.categorical`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  Example:
+
+  ```python
+  # samples has shape [1, 5], where each value is either 0 or 1 with equal
+  # probability.
+  samples = tf.random.stateless_categorical(
+      tf.log([[10., 10.]]), 5, seed=[7, 17])
+  ```
+
+  Args:
+    logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
+      `[i, :]` represents the unnormalized log-probabilities for all classes.
+    num_samples: 0-D.  Number of independent samples to draw for each row slice.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    dtype: integer type to use for the output. Defaults to int64.
+    name: Optional name for the operation.
+
+  Returns:
+    The drawn samples of shape `[batch_size, num_samples]`.
+  """
+  with ops.name_scope(name, "stateless_categorical", [logits, seed]):
+    return stateless_multinomial_categorical_impl(logits, num_samples, dtype,
+                                                  seed)
+
+
+def stateless_multinomial_categorical_impl(logits, num_samples, dtype, seed):
+  """Implementation for stateless multinomial/categorical ops (v1/v2)."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  return gen_stateless_random_ops.stateless_multinomial(
+      logits, num_samples, seed, output_dtype=dtype)
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index f26388efeae5442e149e0f0615c4b304902973f1..046459706c0881bd9a3cbd68e4d5553d0547947c 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -13,10 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Operations for working with string Tensors.
-
-See the [Strings](https://tensorflow.org/api_guides/python/string_ops) guide.
-"""
+"""Operations for working with string Tensors."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -31,6 +28,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_parsing_ops
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import math_ops
 
@@ -40,6 +38,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.gen_string_ops import *
 from tensorflow.python.util import compat as util_compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=g-bad-import-order
 # pylint: enable=wildcard-import
@@ -47,6 +46,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=redefined-builtin
 @tf_export("strings.regex_full_match")
+@dispatch.add_dispatch_support
 def regex_full_match(input, pattern, name=None):
   r"""Match elements of `input` with regex `pattern`.
 
@@ -75,8 +75,10 @@ def regex_full_match(input, pattern, name=None):
 regex_full_match.__doc__ = gen_string_ops.regex_full_match.__doc__
 
 
-@tf_export("strings.regex_replace", "regex_replace")
+@tf_export(
+    "strings.regex_replace", v1=["strings.regex_replace", "regex_replace"])
 @deprecation.deprecated_endpoints("regex_replace")
+@dispatch.add_dispatch_support
 def regex_replace(input, pattern, rewrite, replace_global=True, name=None):
   r"""Replace elements of `input` matching regex `pattern` with `rewrite`.
 
@@ -313,7 +315,7 @@ def _reduce_join_reduction_dims(x, axis, reduction_indices):
     return math_ops.range(array_ops.rank(x) - 1, -1, -1)
 
 
-@tf_export("strings.reduce_join", "reduce_join")
+@tf_export(v1=["strings.reduce_join", "reduce_join"])
 @deprecation.deprecated_endpoints("reduce_join")
 def reduce_join(inputs, axis=None,  # pylint: disable=missing-docstring
                 keep_dims=False,
@@ -331,6 +333,17 @@ def reduce_join(inputs, axis=None,  # pylint: disable=missing-docstring
       name=name)
 
 
+@tf_export("strings.reduce_join", v1=[])
+def reduce_join_v2(  # pylint: disable=missing-docstring
+    inputs,
+    axis=None,
+    keepdims=False,
+    separator="",
+    name=None):
+  return reduce_join(
+      inputs, axis, keep_dims=keepdims, separator=separator, name=name)
+
+
 reduce_join.__doc__ = deprecation.rewrite_argument_docstring(
     gen_string_ops.reduce_join.__doc__, "reduction_indices", "axis")
 reduce_join.__doc__ = reduce_join.__doc__.replace("tf.reduce_join(",
@@ -339,11 +352,18 @@ reduce_join.__doc__ = reduce_join.__doc__.replace("tf.reduce_join(",
 
 # This wrapper provides backwards compatibility for code that predates the
 # unit argument and that passed 'name' as a positional argument.
-@tf_export("strings.length")
+@tf_export(v1=["strings.length"])
+@dispatch.add_dispatch_support
 def string_length(input, name=None, unit="BYTE"):
   return gen_string_ops.string_length(input, unit=unit, name=name)
 
 
+@tf_export("strings.length", v1=[])
+@dispatch.add_dispatch_support
+def string_length_v2(input, unit="BYTE", name=None):
+  return string_length(input, name, unit)
+
+
 string_length.__doc__ = gen_string_ops.string_length.__doc__
 
 
@@ -355,11 +375,18 @@ def substr_deprecated(input, pos, len, name=None, unit="BYTE"):
 substr_deprecated.__doc__ = gen_string_ops.substr.__doc__
 
 
-@tf_export("strings.substr")
+@tf_export(v1=["strings.substr"])
+@dispatch.add_dispatch_support
 def substr(input, pos, len, name=None, unit="BYTE"):
   return gen_string_ops.substr(input, pos, len, unit=unit, name=name)
 
 
+@tf_export("strings.substr", v1=[])
+@dispatch.add_dispatch_support
+def substr_v2(input, pos, len, unit="BYTE", name=None):
+  return substr(input, pos, len, name=name, unit=unit)
+
+
 substr.__doc__ = gen_string_ops.substr.__doc__
 
 
@@ -373,3 +400,55 @@ ops.NotDifferentiable("StringSplit")
 ops.NotDifferentiable("AsString")
 ops.NotDifferentiable("EncodeBase64")
 ops.NotDifferentiable("DecodeBase64")
+
+
+@tf_export("strings.to_number", v1=[])
+@dispatch.add_dispatch_support
+def string_to_number(input, out_type=dtypes.float32, name=None):
+  r"""Converts each string in the input Tensor to the specified numeric type.
+
+  (Note that int32 overflow results in an error while float overflow
+  results in a rounded value.)
+
+  Args:
+    input: A `Tensor` of type `string`.
+    out_type: An optional `tf.DType` from: `tf.float32, tf.float64, tf.int32,
+      tf.int64`. Defaults to `tf.float32`.
+      The numeric type to interpret each string in `string_tensor` as.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `out_type`.
+  """
+  return gen_parsing_ops.string_to_number(input, out_type, name)
+tf_export(v1=["strings.to_number", "string_to_number"])(
+    gen_parsing_ops.string_to_number
+    )
+
+
+@tf_export("strings.to_hash_bucket", v1=[])
+@dispatch.add_dispatch_support
+def string_to_hash_bucket(input, num_buckets, name=None):
+  # pylint: disable=line-too-long
+  r"""Converts each string in the input Tensor to its hash mod by a number of buckets.
+
+  The hash function is deterministic on the content of the string within the
+  process.
+
+  Note that the hash function may change from time to time.
+  This functionality will be deprecated and it's recommended to use
+  `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+
+  Args:
+    input: A `Tensor` of type `string`.
+    num_buckets: An `int` that is `>= 1`. The number of buckets.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `int64`.
+  """
+  # pylint: enable=line-too-long
+  return gen_string_ops.string_to_hash_bucket(input, num_buckets, name)
+tf_export(v1=["strings.to_hash_bucket", "string_to_hash_bucket"])(
+    gen_string_ops.string_to_hash_bucket
+    )
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index b382c3b7ce57e3b07d7a6e598ef86948f3abe3a6..93d8d50842ba681688e6d42890445ab4e6879124 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -21,9 +21,10 @@ from __future__ import print_function
 import contextlib
 import re
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import distribution_strategy_context
 
 
 def collect(val, collections, default_collections):
@@ -44,13 +45,27 @@ _INVALID_TAG_CHARACTERS = re.compile(r'[^-/\w\.]')
 
 
 def skip_summary():
-  # If using multiple towers in distributed strategy, skip summaries on all
-  # towers except the first one (tower_id=0).
+  """Determines if summary should be skipped.
+
+  If using multiple replicas in distributed strategy, skip summaries on all
+  replicas except the first one (replica_id=0).
+
+  Returns:
+    True if the summary is skipped; False otherwise.
+  """
+
   # TODO(priyag): Add a new optional argument that will provide multiple
-  # alternatives to override default behavior. (e.g. run on last tower,
-  # compute sum or mean across towers).
-  tower_context = distribution_strategy_context.get_tower_context()
-  return tower_context and tower_context.tower_id > 0
+  # alternatives to override default behavior. (e.g. run on last replica,
+  # compute sum or mean across replicas).
+  replica_context = distribution_strategy_context.get_replica_context()
+  if not replica_context:
+    return False
+  # TODO(b/118385803): when replica_id of _TPUReplicaContext is properly
+  # initialized, remember to change here as well.
+  replica_id = replica_context.replica_id_in_sync_group
+  if isinstance(replica_id, ops.Tensor):
+    replica_id = tensor_util.constant_value(replica_id)
+  return replica_id and replica_id > 0
 
 
 def clean_tag(name):
diff --git a/tensorflow/python/ops/summary_ops.py b/tensorflow/python/ops/summary_ops.py
deleted file mode 100644
index ec4d4a6e9242107fd7f4bebe1416198457e32cee..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/summary_ops.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Summary Operations."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_logging_ops
-from tensorflow.python.ops import summary_op_util
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.python.ops.gen_logging_ops import *
-from tensorflow.python.util.tf_export import tf_export
-# pylint: enable=wildcard-import
-
-
-@tf_export("summary.tensor_summary")
-def tensor_summary(name,
-                   tensor,
-                   summary_description=None,
-                   collections=None,
-                   summary_metadata=None,
-                   family=None,
-                   display_name=None):
-  """Outputs a `Summary` protocol buffer with a serialized tensor.proto.
-
-  Args:
-    name: A name for the generated node. If display_name is not set, it will
-      also serve as the tag name in TensorBoard. (In that case, the tag
-      name will inherit tf name scopes.)
-    tensor: A tensor of any type and shape to serialize.
-    summary_description: A long description of the summary sequence. Markdown
-      is supported.
-    collections: Optional list of graph collections keys. The new summary op is
-      added to these collections. Defaults to `[GraphKeys.SUMMARIES]`.
-    summary_metadata: Optional SummaryMetadata proto (which describes which
-      plugins may use the summary value).
-    family: Optional; if provided, used as the prefix of the summary tag,
-      which controls the name used for display on TensorBoard when
-      display_name is not set.
-    display_name: A string used to name this data in TensorBoard. If this is
-      not set, then the node name will be used instead.
-
-  Returns:
-    A scalar `Tensor` of type `string`. The serialized `Summary` protocol
-    buffer.
-  """
-
-  if summary_metadata is None:
-    summary_metadata = summary_pb2.SummaryMetadata()
-
-  if summary_description is not None:
-    summary_metadata.summary_description = summary_description
-
-  if display_name is not None:
-    summary_metadata.display_name = display_name
-
-  serialized_summary_metadata = summary_metadata.SerializeToString()
-
-  if summary_op_util.skip_summary():
-    return constant_op.constant("")
-  with summary_op_util.summary_scope(
-      name, family, values=[tensor]) as (tag, scope):
-    val = gen_logging_ops.tensor_summary_v2(
-        tensor=tensor,
-        tag=tag,
-        name=scope,
-        serialized_summary_metadata=serialized_summary_metadata)
-    summary_op_util.collect(val, collections, [ops.GraphKeys.SUMMARIES])
-  return val
-
-ops.NotDifferentiable("TensorSummary")
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index a40450762796f2094f5659d555884efe90e962ab..3f99b9f8773b3d26cf334044e0d127bf7443bfea 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -40,14 +40,18 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import summary_op_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util.tf_export import tf_export
 
 
-# Name for a collection which is expected to have at most a single boolean
-# Tensor. If this tensor is True the summary ops will record summaries.
-_SHOULD_RECORD_SUMMARIES_NAME = "ShouldRecordSummaries"
+# Dictionary mapping graph keys to a boolean Tensor (or callable returning
+# a boolean Tensor) indicating whether we should record summaries for the
+# graph identified by the key of the dictionary.
+_SHOULD_RECORD_SUMMARIES = {}
 
-_SUMMARY_WRITER_INIT_COLLECTION_NAME = "_SUMMARY_WRITER_V2"
+# A global dictionary mapping graph keys to a list of summary writer init ops.
+_SUMMARY_WRITER_INIT_OP = {}
 
 _EXPERIMENT_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,256}$")
 _RUN_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,512}$")
@@ -56,62 +60,69 @@ _USER_NAME_PATTERNS = re.compile(r"^[a-z]([-a-z0-9]{0,29}[a-z0-9])?$", re.I)
 
 def should_record_summaries():
   """Returns boolean Tensor which is true if summaries should be recorded."""
-  should_record_collection = ops.get_collection(_SHOULD_RECORD_SUMMARIES_NAME)
-  if not should_record_collection:
-    return False
-  if len(should_record_collection) != 1:
-    raise ValueError(
-        "More than one tensor specified for whether summaries "
-        "should be recorded: %s" % should_record_collection)
-  return should_record_collection[0]
+  global _SHOULD_RECORD_SUMMARIES
+  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+  should = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
+  return should() if callable(should) else should
 
 
-# TODO(apassos) consider how to handle local step here.
 @tf_contextlib.contextmanager
+def _record_summaries(boolean=True):
+  """Sets summary recording on or off per the provided boolean value.
+
+  The provided value can be a python boolean, a scalar boolean Tensor, or
+  or a callable providing such a value; if a callable is passed it will be
+  invoked each time should_record_summaries() is called to determine whether
+  summary writing should be enabled.
+
+  Args:
+    boolean: can be True, False, a bool Tensor, or a callable providing such.
+      Defaults to True.
+
+  Yields:
+    Returns a context manager that sets this value on enter and restores the
+    previous value on exit.
+  """
+  # TODO(nickfelt): make this threadlocal
+  global _SHOULD_RECORD_SUMMARIES
+  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
+  try:
+    _SHOULD_RECORD_SUMMARIES[key] = boolean
+    yield
+  finally:
+    _SHOULD_RECORD_SUMMARIES[key] = old
+
+
+# TODO(apassos) consider how to handle local step here.
 def record_summaries_every_n_global_steps(n, global_step=None):
   """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
   if global_step is None:
     global_step = training_util.get_or_create_global_step()
-  collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
-  old = collection_ref[:]
-  try:
-    with ops.device("cpu:0"):
-      collection_ref[:] = [math_ops.equal(global_step % n, 0)]
-    yield
-  finally:
-    collection_ref[:] = old
+  with ops.device("cpu:0"):
+    should = lambda: math_ops.equal(global_step % n, 0)
+    if not context.executing_eagerly():
+      should = should()
+  return _record_summaries(should)
 
 
-@tf_contextlib.contextmanager
 def always_record_summaries():
   """Sets the should_record_summaries Tensor to always true."""
-  collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
-  old = collection_ref[:]
-  try:
-    collection_ref[:] = [True]
-    yield
-  finally:
-    collection_ref[:] = old
+  return _record_summaries(True)
 
 
-@tf_contextlib.contextmanager
 def never_record_summaries():
   """Sets the should_record_summaries Tensor to always false."""
-  collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
-  old = collection_ref[:]
-  try:
-    collection_ref[:] = [False]
-    yield
-  finally:
-    collection_ref[:] = old
+  return _record_summaries(False)
 
 
+@tf_export("summary.SummaryWriter", v1=[])
 class SummaryWriter(object):
   """Encapsulates a stateful summary writer resource.
 
   See also:
-  - `tf.contrib.summary.create_file_writer`
-  - `tf.contrib.summary.create_db_writer`
+  - `tf.summary.create_file_writer`
+  - `tf.summary.create_db_writer`
   """
 
   def  __init__(self, resource, init_op_fn):
@@ -143,7 +154,6 @@ class SummaryWriter(object):
       finally:
         context.context().summary_writer_resource = old
 
-
   def init(self):
     """Operation to initialize the summary writer resource."""
     if self._resource is not None:
@@ -207,6 +217,7 @@ def initialize(
     session.run(_graph(x, 0), feed_dict={x: data})
 
 
+@tf_export("summary.create_file_writer", v1=[])
 def create_file_writer(logdir,
                        max_queue=None,
                        flush_millis=None,
@@ -282,7 +293,7 @@ def create_db_writer(db_uri,
       `tf.Graph`.
 
   Returns:
-    A `tf.contrib.summary.SummaryWriter` instance.
+    A `tf.summary.SummaryWriter` instance.
   """
   with ops.device("cpu:0"):
     if experiment_name is None:
@@ -311,7 +322,9 @@ def _make_summary_writer(name, factory, **kwargs):
   if not context.executing_eagerly():
     # TODO(apassos): Consider doing this instead.
     #   ops.get_default_session().run(init_op)
-    ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME, init_op)
+    global _SUMMARY_WRITER_INIT_OP
+    key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+    _SUMMARY_WRITER_INIT_OP.setdefault(key, []).append(init_op)
   return SummaryWriter(resource, init_op_fn)
 
 
@@ -329,7 +342,7 @@ def _nothing():
 def all_summary_ops():
   """Graph-mode only. Returns all summary ops.
 
-  Please note this excludes `tf.contrib.summary.graph` ops.
+  Please note this excludes `tf.summary.graph` ops.
 
   Returns:
     The summary ops.
@@ -352,7 +365,9 @@ def summary_writer_initializer_op():
     raise RuntimeError(
         "tf.contrib.summary.summary_writer_initializer_op is only "
         "supported in graph mode.")
-  return ops.get_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME)
+  global _SUMMARY_WRITER_INIT_OP
+  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
+  return _SUMMARY_WRITER_INIT_OP.setdefault(key, [])
 
 
 def summary_writer_function(name, tensor, function, family=None):
@@ -495,7 +510,7 @@ def graph(param, step=None, name=None):
   """Writes a TensorFlow graph to the summary interface.
 
   The graph summary is, strictly speaking, not a summary. Conditions
-  like `tf.contrib.summary.never_record_summaries` do not apply. Only
+  like `tf.summary.should_record_summaries` do not apply. Only
   a single graph can be associated with a particular run. If multiple
   graphs are written, then only the last one will be considered by
   TensorBoard.
@@ -539,14 +554,13 @@ def graph(param, step=None, name=None):
 _graph = graph  # for functions with a graph parameter
 
 
+@tf_export("summary.import_event", v1=[])
 def import_event(tensor, name=None):
   """Writes a `tf.Event` binary proto.
 
-  When using create_db_writer(), this can be used alongside
-  `tf.TFRecordReader` to load event logs into the database. Please
-  note that this is lower level than the other summary functions and
-  will ignore any conditions set by methods like
-  `tf.contrib.summary.should_record_summaries`.
+  This can be used to import existing event logs into a new summary writer sink.
+  Please note that this is lower level than the other summary functions and
+  will ignore the `tf.summary.should_record_summaries` setting.
 
   Args:
     tensor: A `tf.Tensor` of type `string` containing a serialized
@@ -560,13 +574,14 @@ def import_event(tensor, name=None):
       context.context().summary_writer_resource, tensor, name=name)
 
 
+@tf_export("summary.flush", v1=[])
 def flush(writer=None, name=None):
   """Forces summary writer to send any buffered data to storage.
 
   This operation blocks until that finishes.
 
   Args:
-    writer: The `tf.contrib.summary.SummaryWriter` resource to flush.
+    writer: The `tf.summary.SummaryWriter` resource to flush.
       The thread default will be used if this parameter is None.
       Otherwise a `tf.no_op` is returned.
     name: A name for the operation (optional).
@@ -593,6 +608,8 @@ def eval_dir(model_dir, name=None):
   return os.path.join(model_dir, "eval" if not name else "eval_" + name)
 
 
+@deprecation.deprecated(date=None,
+                        instructions="Renamed to create_file_writer().")
 def create_summary_file_writer(*args, **kwargs):
   """Please use `tf.contrib.summary.create_file_writer`."""
   logging.warning("Deprecation Warning: create_summary_file_writer was renamed "
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index e7ad261615f57c1e0ff967d0f7cd498571d21bc7..7c2d3be338766a4e25a817f824e06c665059bc01 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -37,7 +37,7 @@ from tensorflow.python.util.tf_export import tf_export
 __all__ = ["make_template"]
 
 
-@tf_export("make_template")
+@tf_export(v1=["make_template"])
 def make_template(name_, func_, create_scope_now_=False, unique_name_=None,
                   custom_getter_=None, **kwargs):
   """Given an arbitrary function, wrap it so that it does variable sharing.
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index f86dfb35276f608c5cb323fe5deceb58733be007..d1516949517f1f5df9291add96756eeacea29f51 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -20,8 +20,10 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import os
 import weakref
 
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -30,12 +32,18 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
+from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.tf_export import tf_export
 
 
+ENABLE_TENSOR_ARRAY_V2 = (
+    tf2.enabled() or os.getenv("TF_ENABLE_TENSOR_ARRAY_V2") is not None)
+
+
 # _GraphTensorArray accesses many of the hidden generated ops, but is in
 # fact built to wrap these methods.
 # pylint: disable=protected-access
@@ -393,6 +401,273 @@ class _GraphTensorArray(object):
     return gen_data_flow_ops.tensor_array_close_v3(
         handle=self._handle, name=name)
 
+
+class _GraphTensorArrayV2(object):
+  """Graph-mode implementation of TensorArray backed by TensorLists.
+
+  The backing tensor of this TensorArray is a TensorList variant tensor which is
+  stored in the `flow`. The `handle` is always none here. The reason we use the
+  `flow` field and not the `handle` field is to ensure backwards compatibility
+  with legacy control flow.
+  """
+
+  def __init__(self,
+               dtype,
+               size=None,
+               dynamic_size=None,
+               clear_after_read=None,
+               tensor_array_name=None,
+               handle=None,
+               flow=None,
+               infer_shape=True,
+               element_shape=None,
+               colocate_with_first_write_call=True,
+               name=None):
+    """Constructs a graph mode TensorArray.
+
+    Args:
+      dtype: (required) data type of the TensorArray.
+      size: (optional) int32 scalar `Tensor`: the size of the TensorArray.
+        Required if flow is not provided.
+      dynamic_size: (optional) Python bool: If true, writes to the TensorArray
+        can grow the TensorArray past its initial size.  Default: False.
+      clear_after_read: (optional) unused. Not supported in TensorLists.
+      tensor_array_name: (optional) unused.
+      handle: (optional) Must always be None.
+      flow: (optional) A variant `Tensor` scalar for a TensorList.
+      infer_shape: (optional, default: True) If True, shape inference is
+        enabled.  In this case, all elements must have the same shape.
+      element_shape: (optional, default: None) A `TensorShape` object specifying
+        the shape constraints of each of the elements of the TensorArray. Need
+        not be fully defined.
+      colocate_with_first_write_call: (optional). unused.
+      name: (optional) A name for the operation.
+
+    Raises:
+      ValueError: if both handle and tensor_array_name are provided.
+      TypeError: if handle is provided but is not a Tensor.
+    """
+    assert handle is None
+    del handle
+    del clear_after_read
+    del tensor_array_name
+    del colocate_with_first_write_call
+
+    del dynamic_size  # TODO(b/117943489): Unused for now.
+
+    if (flow is not None and
+        (not isinstance(flow, ops.Tensor) or flow.dtype != dtypes.variant)):
+      raise TypeError("flow must be a variant tensor")
+    if flow is None and size is None:
+      raise ValueError("Size must be provided if flow is not provided")
+    if flow is not None and size is not None:
+      raise ValueError("Cannot provide both a flow and size "
+                       "at the same time")
+    if flow is not None and element_shape is not None:
+      raise ValueError("Cannot provide both a flow and element_shape "
+                       "at the same time")
+
+    self._dtype = dtype
+
+    # Record the current static shape for the array elements. The element
+    # shape is defined either by `element_shape` or the shape of the tensor
+    # of the first write. If `infer_shape` is true, all writes checks for
+    # shape equality.
+    if element_shape is None:
+      self._infer_shape = infer_shape
+      self._element_shape = []
+    else:
+      self._infer_shape = True
+      self._element_shape = [tensor_shape.TensorShape(element_shape)]
+    with ops.name_scope(name, "TensorArrayV2", [size, flow]) as scope:
+      if flow is None:
+        self._flow = list_ops.tensor_list_reserve(
+            element_shape=element_shape,
+            num_elements=size,
+            element_dtype=dtype,
+            name=scope)
+      else:
+        self._flow = flow
+
+    # For backwards compatibility.
+    self._colocate_with_first_write_call = None
+    self._colocate_with = None
+
+  @property
+  def flow(self):
+    return self._flow
+
+  @property
+  def dtype(self):
+    return self._dtype
+
+  @property
+  def handle(self):
+    # We intentionally do not raise an error so that legacy while_loop does not
+    # complain.
+    return None
+
+  def _merge_element_shape(self, shape):
+    """Changes the element shape of the array given a shape to merge with.
+
+    Args:
+      shape: A `TensorShape` object to merge with.
+
+    Raises:
+      ValueError: if the provided shape is incompatible with the current
+          element shape of the `TensorArray`.
+    """
+
+    if self._element_shape:
+      if not shape.is_compatible_with(self._element_shape[0]):
+        raise ValueError(
+            "Inconsistent shapes: saw %s but expected %s "
+            "(and infer_shape=True)" % (shape, self._element_shape[0]))
+      self._element_shape[0] = self._element_shape[0].merge_with(shape)
+    else:
+      self._element_shape.append(shape)
+
+  def identity(self):
+    """See TensorArray."""
+    flow = array_ops.identity(self._flow)
+    ta = TensorArray(
+        dtype=self._dtype, flow=flow, infer_shape=self._infer_shape)
+    ta._element_shape = self._element_shape
+    return ta
+
+  def grad(self, source, flow=None, name=None):
+    """Not supported."""
+    raise NotImplementedError()
+
+  def read(self, index, name=None):
+    """See TensorArray."""
+    value = list_ops.tensor_list_get_item(
+        input_handle=self._flow,
+        index=index,
+        element_dtype=self._dtype,
+        name=name)
+    if self._element_shape:
+      value.set_shape(self._element_shape[0].dims)
+    return value
+
+  @tf_should_use.should_use_result
+  def write(self, index, value, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayV2Write", [self._flow, index, value]):
+      value = ops.convert_to_tensor(value, name="value")
+      if self._infer_shape:
+        self._merge_element_shape(value.shape)
+      flow_out = list_ops.tensor_list_set_item(
+          input_handle=self._flow, index=index, item=value, name=name)
+      ta = TensorArray(dtype=self._dtype, handle=None, flow=flow_out)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      return ta
+
+  def stack(self, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayV2Stack", [self._flow]):
+      value = list_ops.tensor_list_stack(
+          input_handle=self._flow, element_dtype=self._dtype)
+      if self._element_shape and self._element_shape[0].dims is not None:
+        value.set_shape([None] + self._element_shape[0].dims)
+      return value
+
+  def gather(self, indices, name=None):
+    """See TensorArray."""
+    value = list_ops.tensor_list_gather(
+        input_handle=self._flow,
+        indices=indices,
+        element_dtype=self._dtype,
+        name=name)
+    if self._element_shape and self._element_shape[0].dims is not None:
+      value.set_shape([None] + self._element_shape[0].dims)
+    return value
+
+  def concat(self, name=None):
+    """See TensorArray."""
+    value = list_ops.tensor_list_concat(
+        input_handle=self._flow, element_dtype=self._dtype, name=name)
+    if self._element_shape and self._element_shape[0].dims is not None:
+      value.set_shape([None] + self._element_shape[0].dims[1:])
+    return value
+
+  @tf_should_use.should_use_result
+  def unstack(self, value, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayUnstack", [self._flow, value]):
+      value = ops.convert_to_tensor(value, name="value")
+      if self._infer_shape and not context.executing_eagerly():
+        self._merge_element_shape(value.shape[1:])
+      flow_out = list_ops.tensor_list_from_tensor(
+          tensor=value, element_shape=value.shape[1:])
+      ta = TensorArray(
+          dtype=self._dtype,
+          handle=self.handle,
+          flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
+      return ta
+
+  @tf_should_use.should_use_result
+  def scatter(self, indices, value, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayScatter",
+                        [self._flow, value, indices]):
+      value = ops.convert_to_tensor(value, name="value")
+      if self._infer_shape and not context.executing_eagerly():
+        self._merge_element_shape(value.shape[1:])
+      flow_out = list_ops.tensor_list_scatter(
+          tensor=value, indices=indices, element_shape=-1)
+      ta = TensorArray(
+          dtype=self._dtype,
+          handle=self.handle,
+          flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
+      return ta
+
+  @tf_should_use.should_use_result
+  def split(self, value, lengths, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArraySplit", [self._flow, value, lengths]):
+      value = ops.convert_to_tensor(value, name="value")
+      lengths_64 = math_ops.to_int64(lengths)
+      if self._infer_shape and not context.executing_eagerly():
+        clengths = tensor_util.constant_value(lengths_64)
+        if value.shape.dims is not None:
+          if clengths is not None and clengths.max() == clengths.min():
+            self._merge_element_shape(
+                tensor_shape.TensorShape([clengths[0]]).concatenate(
+                    value.shape[1:]))
+      flow_out = list_ops.tensor_list_split(
+          tensor=value,
+          lengths=lengths_64,
+          element_shape=self._element_shape[0] if self._element_shape else None,
+          name=name)
+      ta = TensorArray(
+          dtype=self._dtype,
+          handle=self.handle,
+          flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
+      return ta
+
+  def size(self, name=None):
+    """See TensorArray."""
+    return list_ops.tensor_list_length(input_handle=self._flow, name=name)
+
+  @tf_should_use.should_use_result
+  def close(self, name=None):
+    """See TensorArray."""
+    return gen_control_flow_ops.no_op(name=name)
+
 # pylint: enable=protected-access
 
 
@@ -738,8 +1013,10 @@ class TensorArray(object):
     if context.executing_eagerly():
       implementation = _EagerTensorArray
     else:
-      implementation = _GraphTensorArray
-
+      if ENABLE_TENSOR_ARRAY_V2:
+        implementation = _GraphTensorArrayV2
+      else:
+        implementation = _GraphTensorArray
     self._implementation = implementation(
         dtype,
         size=size,
@@ -768,7 +1045,7 @@ class TensorArray(object):
   @property
   def handle(self):
     """The reference to the TensorArray."""
-    return self._implementation._handle
+    return self._implementation.handle
 
   @property
   def _infer_shape(self):
@@ -953,4 +1230,16 @@ class TensorArray(object):
     """Close the current TensorArray."""
     return self._implementation.close(name=name)
 
+
+def build_ta_with_new_flow(old_ta, flow):
+  ta = TensorArray(
+      dtype=old_ta.dtype,
+      handle=old_ta.handle,
+      flow=flow,
+      infer_shape=old_ta._infer_shape,
+      colocate_with_first_write_call=old_ta._colocate_with_first_write_call)
+  ta._colocate_with = old_ta._colocate_with
+  ta._element_shape = old_ta._element_shape
+  return ta
+
 # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/tensor_forest_ops.py b/tensorflow/python/ops/tensor_forest_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..842f0c648b12551624fc6306a6fa869392dd4465
--- /dev/null
+++ b/tensorflow/python/ops/tensor_forest_ops.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for tensor_forest."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import ops
+from tensorflow.python.ops import gen_tensor_forest_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.training import saver
+
+
+class TreeVariableSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Resource that holds a tree."""
+
+  def __init__(self, type_name, name, container, config, resource_handle_func,
+               create_op_func, is_initialized_op_func, serialize_op_func,
+               deserialize_op_func):
+
+    with ops.name_scope(name, type_name) as name:
+      self._resource_handle = resource_handle_func(
+          container, shared_name=name, name=name)
+
+    self._is_initialized_op = is_initialized_op_func(self._resource_handle)
+    tensor = serialize_op_func(self._resource_handle)
+    self._create_op = create_op_func(self._resource_handle, config)
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful the tree variable. So we just pass an empty
+    # value.
+    slice_spec = ''
+    specs = [saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name)]
+    super(TreeVariableSaveable, self).__init__(self._resource_handle, specs,
+                                               name)
+
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+
+    resources.register_resource(self._resource_handle, self._create_op,
+                                self._is_initialized_op)
+    self._deserialize_op_func = deserialize_op_func
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return self._deserialize_op_func(
+          self._resource_handle,
+          restored_tensors[0],
+      )
+
+  @property
+  def resource(self):
+    return self._resource_handle
+
+
+def tree_variable(tree_config, name, container=None):
+  return TreeVariableSaveable(
+      'TreeVariable', name, container, tree_config,
+      gen_tensor_forest_ops.tensor_forest_tree_resource_handle_op,
+      gen_tensor_forest_ops.tensor_forest_create_tree_variable,
+      gen_tensor_forest_ops.tensor_forest_tree_is_initialized_op,
+      gen_tensor_forest_ops.tensor_forest_tree_serialize,
+      gen_tensor_forest_ops.tensor_forest_tree_deserialize).resource
+
+
+class ForestVariables(object):
+  """Resource that holds all trees from a forest."""
+
+  def __init__(self, params, tree_configs=None):
+
+    self._variables = []
+
+    for i in range(params.n_trees):
+      tree_config = ''
+      if tree_configs is not None:
+        tree_config = tree_configs[i]
+      self._variables.append(tree_variable(
+          tree_config,
+          'tree-%s' % i,
+      ))
+
+  def __getitem__(self, t):
+    return self._variables[t]
diff --git a/tensorflow/python/ops/unconnected_gradients.py b/tensorflow/python/ops/unconnected_gradients.py
new file mode 100644
index 0000000000000000000000000000000000000000..03189a9f52ddfee3f10c1321e63dc985904f07f5
--- /dev/null
+++ b/tensorflow/python/ops/unconnected_gradients.py
@@ -0,0 +1,43 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utilities for calculating gradients."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import enum
+
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("UnconnectedGradients")
+class UnconnectedGradients(enum.Enum):
+  """Controls how gradient computation behaves when y does not depend on x.
+
+  The gradient of y with respect to x can be zero in two different ways: there
+  could be no differentiable path in the graph connecting x to y (and so we can
+  statically prove that the gradient is zero) or it could be that runtime values
+  of tensors in a particular execution lead to a gradient of zero (say, if a
+  relu unit happens to not be activated). To allow you to distinguish between
+  these two cases you can choose what value gets returned for the gradient when
+  there is no path in the graph from x to y:
+
+  * `NONE`: Indicates that [None] will be returned if there is no path from x
+    to y
+  * `ZERO`: Indicates that a zero tensor will be returned in the shape of x.
+  """
+  NONE = "none"
+  ZERO = "zero"
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 5032ca79f9b2c20205e6edadd0ddb807435c61a0..ccce9e2f93bac26a69d8cadab9ece4cc2482c4e1 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -31,6 +31,7 @@ import six
 from six import iteritems
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -206,7 +207,7 @@ it does exist, simply return it.
 """
 
 
-_DEFAULT_USE_RESOURCE = False
+_DEFAULT_USE_RESOURCE = tf2.enabled()
 
 
 @tf_export(v1=["enable_resource_variables"])
@@ -436,37 +437,43 @@ class _VariableStore(object):
           raise ValueError(
               "Partitioner must be callable, but received: %s" % partitioner)
         with ops.name_scope(None):
-          return self._get_partitioned_variable(name=name,
-                                                shape=shape,
-                                                dtype=dtype,
-                                                initializer=initializer,
-                                                regularizer=regularizer,
-                                                reuse=reuse,
-                                                trainable=trainable,
-                                                collections=collections,
-                                                caching_device=caching_device,
-                                                partitioner=partitioner,
-                                                validate_shape=validate_shape,
-                                                use_resource=use_resource,
-                                                constraint=constraint)
+          return self._get_partitioned_variable(
+              name=name,
+              shape=shape,
+              dtype=dtype,
+              initializer=initializer,
+              regularizer=regularizer,
+              reuse=reuse,
+              trainable=trainable,
+              collections=collections,
+              caching_device=caching_device,
+              partitioner=partitioner,
+              validate_shape=validate_shape,
+              use_resource=use_resource,
+              constraint=constraint,
+              synchronization=synchronization,
+              aggregation=aggregation)
 
       # Special case for partitioned variable to allow reuse without having to
       # specify partitioner.
       if (reuse is True and partitioner is None
           and name in self._partitioned_vars):
-        return self._get_partitioned_variable(name=name,
-                                              shape=shape,
-                                              dtype=dtype,
-                                              initializer=initializer,
-                                              regularizer=regularizer,
-                                              reuse=reuse,
-                                              trainable=trainable,
-                                              collections=collections,
-                                              caching_device=caching_device,
-                                              partitioner=None,
-                                              validate_shape=validate_shape,
-                                              use_resource=use_resource,
-                                              constraint=constraint)
+        return self._get_partitioned_variable(
+            name=name,
+            shape=shape,
+            dtype=dtype,
+            initializer=initializer,
+            regularizer=regularizer,
+            reuse=reuse,
+            trainable=trainable,
+            collections=collections,
+            caching_device=caching_device,
+            partitioner=None,
+            validate_shape=validate_shape,
+            use_resource=use_resource,
+            constraint=constraint,
+            synchronization=synchronization,
+            aggregation=aggregation)
 
       # Single variable case
       if "%s/part_0" % name in self._vars:
@@ -552,7 +559,9 @@ class _VariableStore(object):
                                 caching_device=None,
                                 validate_shape=True,
                                 use_resource=None,
-                                constraint=None):
+                                constraint=None,
+                                synchronization=VariableSynchronization.AUTO,
+                                aggregation=VariableAggregation.NONE):
     """Gets or creates a sharded variable list with these parameters.
 
     The `partitioner` must be a callable that accepts a fully defined
@@ -618,6 +627,15 @@ class _VariableStore(object):
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
+      synchronization: Indicates when a distributed a variable will be
+        aggregated. Accepted values are constants defined in the class
+        `tf.VariableSynchronization`. By default the synchronization is set to
+        `AUTO` and the current `DistributionStrategy` chooses
+        when to synchronize. If `synchronization` is set to `ON_READ`,
+        `trainable` must not be set to `True`.
+      aggregation: Indicates how a distributed variable will be aggregated.
+        Accepted values are constants defined in the class
+        `tf.VariableAggregation`.
 
     Returns:
       A `PartitionedVariable` object.
@@ -628,14 +646,8 @@ class _VariableStore(object):
         when violating reuse during variable creation, or if an existing
         sharded variable exists for the given name but with different sharding.
     """
-    if context.executing_eagerly():
-      raise NotImplementedError("Partitioned variables are not yet supported "
-                                "when eager execution is enabled.")
-
     initializing_from_value = initializer is not None and isinstance(
         initializer, ops.Tensor)
-    reuse_without_partition = reuse and not partitioner
-
     if name in self._vars:
       raise ValueError(
           "A partitioner was provided, but an unpartitioned version of the "
@@ -646,30 +658,9 @@ class _VariableStore(object):
     if initializing_from_value:
       shape = shape.merge_with(initializer.get_shape())
 
-    if not reuse_without_partition:
-      if not shape.is_fully_defined():
-        raise ValueError("Shape of a new partitioned variable (%s) must be "
-                         "fully defined, but instead was %s." % (name, shape))
-
-      if shape.ndims < 1:
-        raise ValueError("A partitioned Variable must have rank at least 1, "
-                         "shape: %s" % shape)
-
-      partitions = partitioner(shape=shape, dtype=dtype)
-
-      if not isinstance(partitions, collections_lib.Sequence):
-        raise ValueError("Partitioner must return a sequence, but saw: %s"
-                         % partitions)
-
-      if len(partitions) != shape.ndims:
-        raise ValueError(
-            "Partitioner returned a partition list that does not match the "
-            "Variable's rank: %s vs. %s" % (partitions, shape))
-
-      if any([p < 1 for p in partitions]):
-        raise ValueError(
-            "Partitioner returned zero partitions for some axes: %s" %
-            partitions)
+    partitions = None
+    if not reuse or partitioner:
+      partitions = _call_partitioner(partitioner, shape, dtype)
 
     if name in self._partitioned_vars:
       if reuse is False:
@@ -691,7 +682,7 @@ class _VariableStore(object):
             % (name, dtype.name, existing_var.dtype.name))
 
       # pylint: disable=protected-access
-      if (not reuse_without_partition and
+      if (partitions is not None and
           existing_var._get_partitions() != partitions):
         raise ValueError(
             "Trying to reuse partitioned variable %s, but specified partitions "
@@ -706,14 +697,7 @@ class _VariableStore(object):
                        "created with tf.get_variable(). Did you mean to set "
                        "reuse=False or reuse=tf.AUTO_REUSE in VarScope?" % name)
 
-    slice_dim, slice_shape = _compute_slice_dim_and_shape(
-        shape.as_list(), partitions)
-
-    vs = []
-    num_slices = partitions[slice_dim]
-    num_slices_with_excess = shape[slice_dim].value % num_slices
-
-    slice_offset = [0] * shape.ndims
+    slice_dim, num_slices = _get_slice_dim_and_num_slices(partitions)
 
     if "%s/part_0" % name in self._vars:
       if "%s/part_%d" % (name, num_slices - 1) not in self._vars:
@@ -729,15 +713,14 @@ class _VariableStore(object):
             "%s/part_0 was found, but so was the extra shard %s/part_%d."
             % (num_slices, name, name, num_slices))
 
-    for i in xrange(num_slices):
-      var_shape = slice_shape[:]
-      var_offset = slice_offset[:]
+    vs = []
+    for i, (var_offset, var_shape) in enumerate(_iter_slices(
+        shape.as_list(),
+        num_slices,
+        slice_dim
+    )):
       partition_info = _PartitionInfo(
           full_shape=shape.as_list(), var_offset=var_offset)
-      if i < num_slices_with_excess:
-        var_shape[slice_dim] += 1
-      slice_offset[slice_dim] += var_shape[slice_dim]
-
       var_full_name = "%s/part_%d" % (name, i)
       with ops.name_scope(var_full_name + "/PartitionedInitializer"):
         # Create the tensor to initialize the variable with default value.
@@ -775,7 +758,9 @@ class _VariableStore(object):
             caching_device=caching_device,
             validate_shape=validate_shape,
             use_resource=use_resource,
-            constraint=constraint)
+            constraint=constraint,
+            synchronization=synchronization,
+            aggregation=aggregation)
 
       # pylint: disable=protected-access
       var._set_save_slice_info(variables.Variable.SaveSliceInfo(
@@ -783,15 +768,13 @@ class _VariableStore(object):
       vs.append(var)
       # pylint: enable=protected-access
 
-      # pylint: disable=protected-access
     partitioned_var = variables.PartitionedVariable(name=name,
                                                     shape=shape,
                                                     dtype=dtype,
                                                     variable_list=vs,
                                                     partitions=partitions)
-    # pylint: enable=protected-access
-
-    self._partitioned_vars[name] = partitioned_var
+    if not context.executing_eagerly() or self._store_eager_variables:
+      self._partitioned_vars[name] = partitioned_var
     return partitioned_var
 
   def _get_single_variable(self,
@@ -851,14 +834,18 @@ class _VariableStore(object):
     if name in self._vars:
       # Here we handle the case when returning an existing variable.
       if reuse is False:
-        tb = self._vars[name].op.traceback[::-1]
+        var = self._vars[name]
+        err_msg = ("Variable %s already exists, disallowed."
+                   " Did you mean to set reuse=True or "
+                   "reuse=tf.AUTO_REUSE in VarScope?" % name)
+        # ResourceVariables don't have an op associated with so no traceback
+        if isinstance(var, resource_variable_ops.ResourceVariable):
+          raise ValueError(err_msg)
+        tb = var.op.traceback[::-1]
         # Throw away internal tf entries and only take a few lines.
         tb = [x for x in tb if "tensorflow/python" not in x[0]][:3]
-        raise ValueError("Variable %s already exists, disallowed."
-                         " Did you mean to set reuse=True or "
-                         "reuse=tf.AUTO_REUSE in VarScope? "
-                         "Originally defined at:\n\n%s" % (
-                             name, "".join(traceback.format_list(tb))))
+        raise ValueError("%s Originally defined at:\n\n%s" % (err_msg, "".join(
+            traceback.format_list(tb))))
       found_var = self._vars[name]
       if not shape.is_compatible_with(found_var.get_shape()):
         raise ValueError("Trying to share variable %s, but specified shape %s"
@@ -889,20 +876,22 @@ class _VariableStore(object):
         variable_dtype = None
       else:
         # Instantiate initializer if provided initializer is a type object.
-        if isinstance(initializer, type(init_ops.Initializer)):
+        if tf_inspect.isclass(initializer):
           initializer = initializer(dtype=dtype)
-        if shape and shape.is_fully_defined():
+        if shape is not None and shape.is_fully_defined():
           init_val = lambda: initializer(  # pylint: disable=g-long-lambda
               shape.as_list(), dtype=dtype, partition_info=partition_info)
-        elif not tf_inspect.getargspec(initializer).args:
+          variable_dtype = dtype.base_dtype
+        elif len(tf_inspect.getargspec(initializer).args) == len(
+            tf_inspect.getargspec(initializer).defaults or []):
           init_val = initializer
+          variable_dtype = None
         else:
-          raise ValueError("You can only pass an initializer function that "
-                           "expects no arguments to its callable when the "
-                           "shape is not fully defined. The given initializer "
-                           "function expects the following args %s" %
-                           tf_inspect.getargspec(initializer).args)
-        variable_dtype = dtype.base_dtype
+          raise ValueError("The initializer passed is not valid. It should "
+                           "be a callable with no arguments and the "
+                           "shape should not be provided or an instance of "
+                           "`tf.keras.initializers.*' and `shape` should be "
+                           "fully defined.")
 
     # Create the variable.
     if use_resource is None:
@@ -1056,9 +1045,6 @@ class VariableScope(object):
       if self._caching_device is not None:
         raise NotImplementedError("Caching devices is not yet supported "
                                   "when eager execution is enabled.")
-      if self._partitioner is not None:
-        raise NotImplementedError("Partitioned variables are not yet supported "
-                                  "when eager execution is enabled.")
       self._reuse = AUTO_REUSE
       self._use_resource = True
 
@@ -1138,9 +1124,6 @@ class VariableScope(object):
 
   def set_partitioner(self, partitioner):
     """Set partitioner for this scope."""
-    if partitioner and context.executing_eagerly():
-      raise NotImplementedError("Partitioned variables are not yet supported "
-                                "when eager execution is enabled.")
     self._partitioner = partitioner
 
   def set_custom_getter(self, custom_getter):
@@ -1249,11 +1232,10 @@ class VariableScope(object):
                                 partitioner=None,
                                 validate_shape=True,
                                 use_resource=None,
-                                constraint=None):
+                                constraint=None,
+                                synchronization=VariableSynchronization.AUTO,
+                                aggregation=VariableAggregation.NONE):
     """Gets an existing variable with this name or create a new one."""
-    if context.executing_eagerly():
-      raise NotImplementedError("Partitioned variables are not yet supported "
-                                "when eager execution is enabled.")
     if initializer is None:
       initializer = self._initializer
     if regularizer is None:
@@ -1295,11 +1277,21 @@ class VariableScope(object):
     with ops.name_scope(None):
       # pylint: disable=protected-access
       return var_store._get_partitioned_variable(
-          full_name, shape=shape, dtype=dtype, initializer=initializer,
-          regularizer=regularizer, reuse=self.reuse, trainable=trainable,
-          collections=collections, caching_device=caching_device,
-          partitioner=partitioner, validate_shape=validate_shape,
-          use_resource=use_resource, constraint=constraint)
+          full_name,
+          shape=shape,
+          dtype=dtype,
+          initializer=initializer,
+          regularizer=regularizer,
+          reuse=self.reuse,
+          trainable=trainable,
+          collections=collections,
+          caching_device=caching_device,
+          partitioner=partitioner,
+          validate_shape=validate_shape,
+          use_resource=use_resource,
+          constraint=constraint,
+          synchronization=synchronization,
+          aggregation=aggregation)
       # pylint: enable=protected-access
 
 
@@ -1656,7 +1648,9 @@ def _get_partitioned_variable(name,
                               partitioner=None,
                               validate_shape=True,
                               use_resource=None,
-                              constraint=None):
+                              constraint=None,
+                              synchronization=VariableSynchronization.AUTO,
+                              aggregation=VariableAggregation.NONE):
   """Gets or creates a sharded variable list with these parameters.
 
   The `partitioner` must be a callable that accepts a fully defined
@@ -1714,6 +1708,15 @@ def _get_partitioned_variable(name,
       variable and return the Tensor for the projected value
       (which must have the same shape). Constraints are not safe to
       use when doing asynchronous distributed training.
+    synchronization: Indicates when a distributed a variable will be
+      aggregated. Accepted values are constants defined in the class
+      `tf.VariableSynchronization`. By default the synchronization is set to
+      `AUTO` and the current `DistributionStrategy` chooses
+      when to synchronize. If `synchronization` is set to `ON_READ`,
+      `trainable` must not be set to `True`.
+    aggregation: Indicates how a distributed variable will be aggregated.
+      Accepted values are constants defined in the class
+      `tf.VariableAggregation`.
 
   Returns:
     A tuple `(shards, partitions)` where `shards` is the list of `Variable`
@@ -1735,11 +1738,21 @@ def _get_partitioned_variable(name,
         "If so, consider instead using get_variable with a non-empty "
         "partitioner parameter instead." % scope.custom_getter)
   return scope._get_partitioned_variable(
-      _get_default_variable_store(), name, shape=shape, dtype=dtype,
-      initializer=initializer, regularizer=regularizer, trainable=trainable,
-      collections=collections, caching_device=caching_device,
-      partitioner=partitioner, validate_shape=validate_shape,
-      use_resource=use_resource, constraint=constraint)
+      _get_default_variable_store(),
+      name,
+      shape=shape,
+      dtype=dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      trainable=trainable,
+      collections=collections,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      validate_shape=validate_shape,
+      use_resource=use_resource,
+      constraint=constraint,
+      synchronization=synchronization,
+      aggregation=aggregation)
   # pylint: enable=protected-access
 
 
@@ -2189,9 +2202,10 @@ class variable_scope(object):
 
     try:
       return self._enter_scope_uncached()
-    except:
-      if self._graph_context_manager is not None:
-        self._graph_context_manager.__exit__(*sys.exc_info())
+    except Exception:
+      if self._in_graph_mode and not self._building_function:
+        if self._graph_context_manager is not None:
+          self._graph_context_manager.__exit__(*sys.exc_info())
       raise
 
   def _enter_scope_uncached(self):
@@ -2355,34 +2369,71 @@ def variable_op_scope(values,
     yield scope
 
 
-def _compute_slice_dim_and_shape(full_shape, slicing):
-  """Computes which dimension is being sliced and the typical slice shape."""
-
-  slice_shape = [0] * len(full_shape)
-  slice_dim = None
-  for dim, num_slices in enumerate(slicing):
-    dim_size = full_shape[dim]
-    if num_slices <= 0 or dim_size < num_slices:
-      raise ValueError("Cannot create %d slices for size %d. shape: %s, "
-                       "slicing: %s" %
-                       (num_slices, full_shape[dim], full_shape, slicing))
-    if num_slices == 1:
-      # Not slicing in this dimension.
-      slice_shape[dim] = dim_size
-    elif slice_dim is not None:
-      # We only support slicing along one of the dimensions.
-      raise ValueError("Can only slice a variable along one dimension: "
-                       "shape: %s, slicing: %s" % (full_shape, slicing))
-    else:
-      # Note: We will add any extras onto the last slice, later.
-      slice_dim = dim
-      slice_shape[dim] = dim_size // num_slices
+def _call_partitioner(partitioner, shape, dtype):
+  """Call partitioner validating its inputs/output.
 
-  # Degenerate case: If "slicing" was all ones, pretend we are slicing along
-  # the first dimension.
-  if slice_dim is None:
+  Args:
+    partitioner: a function mapping `Tensor` shape and dtype to a
+        list of partitions.
+    shape: shape of the `Tensor` to partition, must have at least two
+        dimensions.
+    dtype: dtype of the elements in the `Tensor`.
+
+  Returns:
+    A list with elements >=1 and exactly one >1. The index of that
+    element corresponds to the partitioning axis.
+  """
+  if not shape.is_fully_defined():
+    raise ValueError("Shape of a new partitioned variable must be "
+                     "fully defined, but instead was %s." % (shape,))
+  if shape.ndims < 1:
+    raise ValueError("A partitioned Variable must have rank at least 1, "
+                     "shape: %s" % shape)
+
+  slicing = partitioner(shape=shape, dtype=dtype)
+  if not isinstance(slicing, collections_lib.Sequence):
+    raise ValueError("Partitioner must return a sequence, but saw: %s"
+                     % slicing)
+  if len(slicing) != shape.ndims:
+    raise ValueError(
+        "Partitioner returned a partition list that does not match the "
+        "Variable's rank: %s vs. %s" % (slicing, shape))
+  if any(p < 1 for p in slicing):
+    raise ValueError(
+        "Partitioner returned zero partitions for some axes: %s" %
+        slicing)
+  if sum(p > 1 for p in slicing) > 1:
+    raise ValueError(
+        "Can only slice a variable along one dimension: "
+        "shape: %s, partitioning: %s" % (shape, slicing))
+  return slicing
+
+
+# TODO(slebedev): could be inlined, but
+# `_VariableStore._get_partitioned_variable` is too complex even
+# without this logic.
+def _get_slice_dim_and_num_slices(slicing):
+  """Get slicing dimension and number of slices from the partitioner output."""
+  for slice_dim, num_slices in enumerate(slicing):
+    if num_slices > 1:
+      break
+  else:
+    # Degenerate case: no partitioning applied.
     slice_dim = 0
-  return slice_dim, slice_shape
+    num_slices = 1
+  return slice_dim, num_slices
+
+
+def _iter_slices(full_shape, num_slices, slice_dim):
+  """Slices a given a shape along the specified dimension."""
+  num_slices_with_excess = full_shape[slice_dim] % num_slices
+  offset = [0] * len(full_shape)
+  min_slice_len = full_shape[slice_dim] // num_slices
+  for i in xrange(num_slices):
+    shape = full_shape[:]
+    shape[slice_dim] = min_slice_len + bool(i < num_slices_with_excess)
+    yield offset[:], shape
+    offset[slice_dim] += shape[slice_dim]
 
 
 def _get_trainable_value(synchronization, trainable):
@@ -2558,7 +2609,7 @@ def variable_creator_scope_v1(variable_creator):
 
 
 # Note: only the docstrings differ between this and v1.
-@tf_export(v2=["variable_creator_scope"])
+@tf_export("variable_creator_scope", v1=[])
 @tf_contextlib.contextmanager
 def variable_creator_scope(variable_creator):
   """Scope which defines a variable creation function to be used by variable().
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 45c8618610687d1350605192d8908338e0fc9588..a31ce655183f8fb7e6331c2d6a4b3af8076902c8 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -18,7 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import enum  # pylint: disable=g-bad-import-order
-
+import functools
+import os
 import six
 
 from tensorflow.core.framework import attr_value_pb2
@@ -58,6 +59,21 @@ def _make_getter(captured_getter, captured_previous):
   return getter
 
 
+def _has_cycle(op, path):
+  """Detect cycles in the dependencies of `initial_value`."""
+  if op.name in path:
+    return True
+  path.add(op.name)
+  for op_input in op.inputs:
+    if _has_cycle(op_input.op, path):
+      return True
+  for op_control_input in op.control_inputs:
+    if _has_cycle(op_control_input, path):
+      return True
+  path.remove(op.name)
+  return False
+
+
 @tf_export("VariableSynchronization")
 class VariableSynchronization(enum.Enum):
   """Indicates when a distributed variable will be synced.
@@ -79,28 +95,42 @@ class VariableSynchronization(enum.Enum):
   ON_READ = 3
 
 
-@tf_export("VariableAggregation")
-class VariableAggregation(enum.Enum):
+@tf_export("VariableAggregation", v1=[])
+class VariableAggregationV2(enum.Enum):
   """Indicates how a distributed variable will be aggregated.
 
   `tf.contrib.distribute.DistributionStrategy` distributes a model by making
-  multiple copies (called "towers") acting data-parallel on different elements
+  multiple copies (called "replicas") acting data-parallel on different elements
   of the input batch. When performing some variable-update operation, say
   `var.assign_add(x)`, in a model, we need to resolve how to combine the
-  different values for `x` computed in the different towers.
+  different values for `x` computed in the different replicas.
 
   * `NONE`: This is the default, giving an error if you use a
-    variable-update operation with multiple towers.
-  * `SUM`: Add the updates across towers.
-  * `MEAN`: Take the arithmetic mean ("average") of the updates across towers.
-  * `ONLY_FIRST_TOWER`: This is for when every tower is performing the same
+    variable-update operation with multiple replicas.
+  * `SUM`: Add the updates across replicas.
+  * `MEAN`: Take the arithmetic mean ("average") of the updates across replicas.
+  * `ONLY_FIRST_REPLICA`: This is for when every replica is performing the same
     update, but we only want to perform the update once. Used, e.g., for the
     global step counter.
   """
   NONE = 0
   SUM = 1
   MEAN = 2
-  ONLY_FIRST_TOWER = 3
+  ONLY_FIRST_REPLICA = 3
+
+
+@tf_export(v1=["VariableAggregation"])
+class VariableAggregation(enum.Enum):
+  NONE = 0
+  SUM = 1
+  MEAN = 2
+  ONLY_FIRST_REPLICA = 3
+  ONLY_FIRST_TOWER = 3  # DEPRECATED
+
+
+VariableAggregation.__doc__ = (
+    VariableAggregationV2.__doc__ +
+    "* `ONLY_FIRST_TOWER`: Deprecated alias for `ONLY_FIRST_REPLICA`.\n  ")
 
 
 class VariableMetaclass(type):
@@ -187,7 +217,7 @@ class VariableMetaclass(type):
       return super(VariableMetaclass, cls).__call__(*args, **kwargs)
 
 
-@tf_export(v2=["Variable"])
+@tf_export("Variable", v1=[])
 class Variable(six.with_metaclass(VariableMetaclass,
                                   checkpointable.CheckpointableBase)):
   """See the [Variables Guide](https://tensorflow.org/guide/variables).
@@ -607,37 +637,84 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
+  def batch_scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable batch-wise.
+
+    Analogous to `batch_gather`. This assumes that this variable and the
+    sparse_delta IndexedSlices have a series of leading dimensions that are the
+    same for all of them, and the updates are performed on the last dimension of
+    indices. In other words, the dimensions should be the following:
+
+    `num_prefix_dims = sparse_delta.indices.ndims - 1`
+    `batch_dim = num_prefix_dims + 1`
+    `sparse_delta.updates.shape = sparse_delta.indices.shape + var.shape[
+         batch_dim:]`
+
+    where
+
+    `sparse_delta.updates.shape[:num_prefix_dims]`
+    `== sparse_delta.indices.shape[:num_prefix_dims]`
+    `== var.shape[:num_prefix_dims]`
+
+    And the operation performed can be expressed as:
+
+    `var[i_1, ..., i_n,
+         sparse_delta.indices[i_1, ..., i_n, j]] = sparse_delta.updates[
+            i_1, ..., i_n, j]`
+
+    When sparse_delta.indices is a 1D tensor, this operation is equivalent to
+    `scatter_update`.
+
+    To avoid this operation one can looping over the first `ndims` of the
+    variable and using `scatter_update` on the subtensors that result of slicing
+    the first dimension. This is a valid option for `ndims = 1`, but less
+    efficient than this implementation.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    raise NotImplementedError
+
   def scatter_nd_sub(self, indices, updates, name=None):
     """Applies sparse subtraction to individual values or slices in a Variable.
 
-    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+    Assuming the variable has rank `P` and `indices` is a `Tensor` of rank `Q`.
 
-    `indices` must be integer tensor, containing indices into `ref`.
+    `indices` must be integer tensor, containing indices into self.
     It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
     The innermost dimension of `indices` (with length `K`) corresponds to
     indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-    dimension of `ref`.
+    dimension of self.
 
     `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
     ```
-    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    [d_0, ..., d_{Q-2}, self.shape[K], ..., self.shape[P-1]].
     ```
 
     For example, say we want to add 4 scattered elements to a rank-1 tensor to
     8 elements. In Python, that update would look like this:
 
     ```python
-        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        v = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
-        op = ref.scatter_nd_sub(indices, updates)
+        op = v.scatter_nd_sub(indices, updates)
         with tf.Session() as sess:
           print sess.run(op)
     ```
 
-    The resulting update to ref would look like this:
+    The resulting update to v would look like this:
 
         [1, -9, 3, -6, -6, 6, 7, -4]
 
@@ -661,34 +738,34 @@ class Variable(six.with_metaclass(VariableMetaclass,
   def scatter_nd_add(self, indices, updates, name=None):
     """Applies sparse addition to individual values or slices in a Variable.
 
-    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+    The Variable has rank `P` and `indices` is a `Tensor` of rank `Q`.
 
-    `indices` must be integer tensor, containing indices into `ref`.
+    `indices` must be integer tensor, containing indices into self.
     It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
     The innermost dimension of `indices` (with length `K`) corresponds to
     indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-    dimension of `ref`.
+    dimension of self.
 
     `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
     ```
-    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    [d_0, ..., d_{Q-2}, self.shape[K], ..., self.shape[P-1]].
     ```
 
     For example, say we want to add 4 scattered elements to a rank-1 tensor to
     8 elements. In Python, that update would look like this:
 
     ```python
-        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        v = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
-        add = ref.scatter_nd_add(indices, updates)
+        add = v.scatter_nd_add(indices, updates)
         with tf.Session() as sess:
           print sess.run(add)
     ```
 
-    The resulting update to ref would look like this:
+    The resulting update to v would look like this:
 
         [1, 13, 3, 14, 14, 6, 7, 20]
 
@@ -712,34 +789,34 @@ class Variable(six.with_metaclass(VariableMetaclass,
   def scatter_nd_update(self, indices, updates, name=None):
     """Applies sparse assignment to individual values or slices in a Variable.
 
-    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+    The Variable has rank `P` and `indices` is a `Tensor` of rank `Q`.
 
-    `indices` must be integer tensor, containing indices into `ref`.
+    `indices` must be integer tensor, containing indices into self.
     It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
     The innermost dimension of `indices` (with length `K`) corresponds to
     indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-    dimension of `ref`.
+    dimension of self.
 
     `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
     ```
-    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    [d_0, ..., d_{Q-2}, self.shape[K], ..., self.shape[P-1]].
     ```
 
     For example, say we want to add 4 scattered elements to a rank-1 tensor to
     8 elements. In Python, that update would look like this:
 
     ```python
-        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        v = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
-        op = ref.scatter_nd_assign(indices, updates)
+        op = v.scatter_nd_assign(indices, updates)
         with tf.Session() as sess:
           print sess.run(op)
     ```
 
-    The resulting update to ref would look like this:
+    The resulting update to v would look like this:
 
         [1, 11, 3, 10, 9, 6, 7, 12]
 
@@ -831,18 +908,18 @@ class Variable(six.with_metaclass(VariableMetaclass,
     else:
       return v.value()
 
-  @staticmethod
-  def _OverloadAllOperators():  # pylint: disable=invalid-name
+  @classmethod
+  def _OverloadAllOperators(cls):  # pylint: disable=invalid-name
     """Register overloads for all operators."""
     for operator in ops.Tensor.OVERLOADABLE_OPERATORS:
-      Variable._OverloadOperator(operator)
+      cls._OverloadOperator(operator)
     # For slicing, bind getitem differently than a tensor (use SliceHelperVar
     # instead)
     # pylint: disable=protected-access
-    setattr(Variable, "__getitem__", array_ops._SliceHelperVar)
+    setattr(cls, "__getitem__", array_ops._SliceHelperVar)
 
-  @staticmethod
-  def _OverloadOperator(operator):  # pylint: disable=invalid-name
+  @classmethod
+  def _OverloadOperator(cls, operator):  # pylint: disable=invalid-name
     """Defer an operator overload to `ops.Tensor`.
 
     We pull the operator out of ops.Tensor dynamically to avoid ordering issues.
@@ -850,17 +927,26 @@ class Variable(six.with_metaclass(VariableMetaclass,
     Args:
       operator: string. The operator name.
     """
+    tensor_oper = getattr(ops.Tensor, operator)
 
-    def _run_op(a, *args):
+    def _run_op(a, *args, **kwargs):
       # pylint: disable=protected-access
-      return getattr(ops.Tensor, operator)(a._AsTensor(), *args)
-    # Propagate __doc__ to wrapper
-    try:
-      _run_op.__doc__ = getattr(ops.Tensor, operator).__doc__
-    except AttributeError:
-      pass
+      return tensor_oper(a._AsTensor(), *args, **kwargs)
+
+    functools.update_wrapper(_run_op, tensor_oper)
+    setattr(cls, operator, _run_op)
+
+  def __iter__(self):
+    """Dummy method to prevent iteration. Do not call.
+
+    NOTE(mrry): If we register __getitem__ as an overloaded operator,
+    Python will valiantly attempt to iterate over the variable's Tensor from 0
+    to infinity.  Declaring this method prevents this unintended behavior.
 
-    setattr(Variable, operator, _run_op)
+    Raises:
+      TypeError: when invoked.
+    """
+    raise TypeError("'Variable' object is not iterable.")
 
   # NOTE(mrry): This enables the Variable's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
@@ -1016,27 +1102,6 @@ class Variable(six.with_metaclass(VariableMetaclass,
       else:
         return None
 
-  def __iadd__(self, other):
-    raise NotImplementedError
-
-  def __isub__(self, other):
-    raise NotImplementedError
-
-  def __imul__(self, other):
-    raise NotImplementedError
-
-  def __idiv__(self, other):
-    raise NotImplementedError
-
-  def __itruediv__(self, other):
-    raise NotImplementedError
-
-  def __irealdiv__(self, other):
-    raise NotImplementedError
-
-  def __ipow__(self, other):
-    raise NotImplementedError
-
 
 @tf_export(v1=["Variable"])
 class VariableV1(Variable):
@@ -1547,18 +1612,6 @@ class RefVariable(VariableV1):
     """
     return self._snapshot
 
-  def __iter__(self):
-    """Dummy method to prevent iteration. Do not call.
-
-    NOTE(mrry): If we register __getitem__ as an overloaded operator,
-    Python will valiantly attempt to iterate over the variable's Tensor from 0
-    to infinity.  Declaring this method prevents this unintended behavior.
-
-    Raises:
-      TypeError: when invoked.
-    """
-    raise TypeError("'Variable' object is not iterable.")
-
   def value(self):
     """Returns the last snapshot of this variable.
 
@@ -1836,6 +1889,55 @@ class RefVariable(VariableV1):
         use_locking=use_locking,
         name=name)
 
+  def batch_scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable batch-wise.
+
+    Analogous to `batch_gather`. This assumes that this variable and the
+    sparse_delta IndexedSlices have a series of leading dimensions that are the
+    same for all of them, and the updates are performed on the last dimension of
+    indices. In other words, the dimensions should be the following:
+
+    `num_prefix_dims = sparse_delta.indices.ndims - 1`
+    `batch_dim = num_prefix_dims + 1`
+    `sparse_delta.updates.shape = sparse_delta.indices.shape + var.shape[
+         batch_dim:]`
+
+    where
+
+    `sparse_delta.updates.shape[:num_prefix_dims]`
+    `== sparse_delta.indices.shape[:num_prefix_dims]`
+    `== var.shape[:num_prefix_dims]`
+
+    And the operation performed can be expressed as:
+
+    `var[i_1, ..., i_n,
+         sparse_delta.indices[i_1, ..., i_n, j]] = sparse_delta.updates[
+            i_1, ..., i_n, j]`
+
+    When sparse_delta.indices is a 1D tensor, this operation is equivalent to
+    `scatter_update`.
+
+    To avoid this operation one can looping over the first `ndims` of the
+    variable and using `scatter_update` on the subtensors that result of slicing
+    the first dimension. This is a valid option for `ndims = 1`, but less
+    efficient than this implementation.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return state_ops.batch_scatter_update(
+        self, sparse_delta.indices, sparse_delta.values,
+        use_locking=use_locking, name=name)
+
   def scatter_nd_sub(self, indices, updates, name=None):
     """Applies sparse subtraction to individual values or slices in a Variable.
 
@@ -2094,37 +2196,6 @@ class RefVariable(VariableV1):
     else:
       return v.value()
 
-  @staticmethod
-  def _OverloadAllOperators():  # pylint: disable=invalid-name
-    """Register overloads for all operators."""
-    for operator in ops.Tensor.OVERLOADABLE_OPERATORS:
-      Variable._OverloadOperator(operator)  # pylint: disable=protected-access
-    # For slicing, bind getitem differently than a tensor (use SliceHelperVar
-    # instead)
-    # pylint: disable=protected-access
-    setattr(Variable, "__getitem__", array_ops._SliceHelperVar)
-
-  @staticmethod
-  def _OverloadOperator(operator):  # pylint: disable=invalid-name
-    """Defer an operator overload to `ops.Tensor`.
-
-    We pull the operator out of ops.Tensor dynamically to avoid ordering issues.
-
-    Args:
-      operator: string. The operator name.
-    """
-
-    def _run_op(a, *args):
-      # pylint: disable=protected-access
-      return getattr(ops.Tensor, operator)(a._AsTensor(), *args)
-    # Propagate __doc__ to wrapper
-    try:
-      _run_op.__doc__ = getattr(ops.Tensor, operator).__doc__
-    except AttributeError:
-      pass
-
-    setattr(Variable, operator, _run_op)
-
   def _gather_saveables_for_checkpoint(self):
     """For implementing `Checkpointable`. This object is saveable on its own."""
     return {checkpointable.VARIABLE_VALUE_KEY: self}
@@ -2158,20 +2229,7 @@ class RefVariable(VariableV1):
       raise TypeError("initial_value needs to be a Tensor: %s" % initial_value)
 
     # Don't modify initial_value if it contains any cyclic dependencies.
-    def has_cycle(op, path):
-      """Detect cycles in the dependencies of `initial_value`."""
-      if op.name in path:
-        return True
-      path.add(op.name)
-      for op_input in op.inputs:
-        if has_cycle(op_input.op, path):
-          return True
-      for op_control_input in op.control_inputs:
-        if has_cycle(op_control_input, path):
-          return True
-      path.remove(op.name)
-      return False
-    if has_cycle(initial_value.op, path=set()):
+    if _has_cycle(initial_value.op, path=set()):
       return initial_value
 
     return self._safe_initial_value_from_tensor(initial_value, op_cache={})
@@ -2441,34 +2499,6 @@ class PartitionedVariable(object):
   @end_compatibility
   """
 
-  class PartitionedVariableIterator(object):
-    """An iterator that allows accessing the underlying `Variable` objects.
-
-    This iterator is necessary to control order of access when Variables
-    are not partitioned in a standard way along a single axis.
-
-    Allows e.g. `list(partitioned_variable)` to return a proper list.
-    """
-
-    def __init__(self, partitioned_variable):
-      self._ix = 0
-      self._partitioned_variable = partitioned_variable
-
-    def __iter__(self):
-      return self
-
-    def __next__(self):  # For python3 compatibility.
-      return self.next()
-
-    def next(self):
-      # pylint: disable=protected-access
-      if self._ix >= len(self._partitioned_variable._variable_list):
-        raise StopIteration()
-      variable = self._partitioned_variable._variable_list[self._ix]
-      # pylint: enable=protected-access
-      self._ix += 1
-      return variable
-
   def __init__(self, name, shape, dtype, variable_list, partitions):
     """Creates a new partitioned variable wrapper.
 
@@ -2488,31 +2518,27 @@ class PartitionedVariable(object):
         `partitions` is not a list.
       ValueError: If `variable_list` is empty, or the `Variable` shape
         information does not match `shape`, or `partitions` has invalid values.
-      RuntimeError: If eager execution is enabled
     """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "tf.PartitionedVariable not supported with eager execution enabled.")
     if not isinstance(variable_list, (list, tuple)):
       raise TypeError(
           "variable_list is not a list or tuple: %s" % variable_list)
     if not isinstance(partitions, (list, tuple)):
       raise TypeError("partitions is not a list or tuple: %s" % partitions)
-    if not all([p >= 1 for p in partitions]):
+    if not all(p >= 1 for p in partitions):
       raise ValueError("partition values must be positive: %s" % partitions)
     if not variable_list:
       raise ValueError("variable_list may not be empty")
     # pylint: disable=protected-access
     for v in variable_list:
       # Sort the variable_list lexicographically according to var offset value.
-      if not all([v._get_save_slice_info() is not None for v in variable_list]):
+      if not all(v._get_save_slice_info() is not None for v in variable_list):
         raise ValueError(
             "All variables must have a save_slice_info available: %s"
             % [v.name for v in variable_list])
       if len(shape) != len(partitions):
         raise ValueError("len(shape) != len(partitions): %s vs. %s"
                          % (shape, partitions))
-      if not all([v._get_save_slice_info().full_shape == shape]):
+      if v._get_save_slice_info().full_shape != shape:
         raise ValueError(
             "All variables' full shapes must match shape: %s; "
             "but full shapes were: %s"
@@ -2529,7 +2555,7 @@ class PartitionedVariable(object):
 
   def __iter__(self):
     """Return an iterable for accessing the underlying partition Variables."""
-    return self.PartitionedVariableIterator(self)
+    return iter(self._variable_list)
 
   def __len__(self):
     num_partition_axes = len(self._partition_axes())
@@ -2539,7 +2565,7 @@ class PartitionedVariable(object):
     return len(self._variable_list)
 
   def _partition_axes(self):
-    if all([p == 1 for p in self._partitions]):
+    if all(p == 1 for p in self._partitions):
       return [0]
     else:
       return [i for i, p in enumerate(self._partitions) if p > 1]
@@ -2627,45 +2653,52 @@ class PartitionedVariable(object):
           "Cannot do assign action along more than one dimension: %s.  "
           "Multi-axis partition assign action is not supported " %
           str(partition_axes))
-    partition_ix = partition_axes[0]
-    size_splits_list = [
-        var.shape[partition_ix].value for var in self._variable_list
-    ]
-    value_list = array_ops.split(value, size_splits_list, axis=partition_ix)
+    if isinstance(value, list):
+      assert len(value) == len(self._variable_list)
+      value_list = value
+    elif isinstance(value, PartitionedVariable):
+      value_list = [var_part for var_part in value]
+    else:
+      partition_ix = partition_axes[0]
+      size_splits_list = [
+          tensor_shape.dimension_value(var.shape[partition_ix])
+          for var in self._variable_list
+      ]
+      value_list = array_ops.split(value, size_splits_list, axis=partition_ix)
+
     op_list = [
-        assign_fn(var, value_list[idx], idx)
+        assign_fn(var, value_list[idx])
         for idx, var in enumerate(self._variable_list)
     ]
     return op_list
 
   def assign(self, value, use_locking=False, name=None, read_value=True):
-    assign_fn = lambda var, r_value, idx: var.assign(
+    assign_fn = lambda var, r_value: var.assign(
         r_value, use_locking=use_locking,
-        name="%s_%d" % (name, idx), read_value=read_value)
+        name=name, read_value=read_value)
     assign_list = self._apply_assign_fn(assign_fn, value)
     if read_value:
       return assign_list
     return [assign.op for assign in assign_list]
 
   def assign_add(self, value, use_locking=False, name=None, read_value=True):
-    assign_fn = lambda var, r_value, idx: var.assign_add(
+    assign_fn = lambda var, r_value: var.assign_add(
         r_value, use_locking=use_locking,
-        name="%s_%d" % (name, idx), read_value=read_value)
+        name=name, read_value=read_value)
     assign_list = self._apply_assign_fn(assign_fn, value)
     if read_value:
       return assign_list
     return [assign.op for assign in assign_list]
 
   def assign_sub(self, value, use_locking=False, name=None, read_value=True):
-    assign_fn = lambda var, r_value, idx: var.assign_sub(
+    assign_fn = lambda var, r_value: var.assign_sub(
         r_value, use_locking=use_locking,
-        name="%s_%d" % (name, idx), read_value=read_value)
+        name=name, read_value=read_value)
     assign_list = self._apply_assign_fn(assign_fn, value)
     if read_value:
       return assign_list
     return [assign.op for assign in assign_list]
 
-
 @tf_export(v1=["global_variables"])
 def global_variables(scope=None):
   """Returns global variables.
@@ -2972,7 +3005,9 @@ def report_uninitialized_variables(var_list=None,
     # Run all operations on CPU
     if var_list:
       init_vars = [state_ops.is_variable_initialized(v) for v in var_list]
-    with ops.device("/cpu:0"):
+    local_device = os.environ.get(
+        "TF_DEVICE_FOR_UNINITIALIZED_VARIABLE_REPORTING", "/cpu:0")
+    with ops.device(local_device):
       if not var_list:
         # Return an empty tensor so we only need to check for returned tensor
         # size being 0 as an indication of model ready.
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 8e88a84d6057e282570432b8bc535a39af6240ce..d00c158d156b225553b52437324accd019c76aee 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -23,82 +23,120 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import sys
-
 from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import cond_v2_impl as cond_v2
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import control_flow_util_v2 as util
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
 
 # pylint: disable=protected-access
 
-control_flow_ops._while_v2 = sys.modules[__name__]
-
 # TODO(b/79881896): Handle external control dependencies. tf.while_loop allows
 # control dependencies on external nodes with at least 1 output.
 # Another idea is to create const nodes outside the loop and add control edges
 # to them and then pass those in as data inputs. This should probably be
 # handled in the CapturingGraph itself.
 
-
-def while_loop(cond, body, loop_vars, shape_invariants=None, name=None):
+# Op types that output a resource tensor representing a TensorArray handle.
+TENSOR_ARRAY_HANDLE_OPS = (
+    "TensorArrayV3",
+    "TensorArrayGradV3",
+    "TensorArrayGradWithShape",
+)
+
+
+def while_loop(cond,
+               body,
+               loop_vars,
+               shape_invariants=None,
+               maximum_iterations=None,
+               name=None,
+               return_same_structure=True):
   """Like tf.while_loop, except emits a single While op."""
-  flattened_loop_vars = nest.flatten(loop_vars)
+  maximum_iterations = _validate_and_convert_to_tensor(maximum_iterations)
+  # Keep the original loop_vars around to know which args were TensorArrays.
+  orig_loop_vars = loop_vars
+  # Cache its length since we use it at multiple places below.
+  len_orig_loop_vars = len(orig_loop_vars)
+
+  # Convert TensorArrays to their flow variables. These get converted back to
+  # TensorArrays before calling `cond` and `body`. See `wrapped_cond` and
+  # `wrapped_body` below.
+  loop_vars = list(_tensor_array_to_flow(orig_loop_vars))
+  loop_vars = nest.map_structure(
+      ops.internal_convert_to_tensor_or_indexed_slices, loop_vars)
   if shape_invariants is not None:
-    nest.assert_same_structure(loop_vars, shape_invariants)
-    flattened_shapes = nest.flatten(shape_invariants)
+    nest.assert_same_structure(orig_loop_vars, shape_invariants)
   else:
-    flattened_shapes = [t.shape for t in flattened_loop_vars]
-
-  del shape_invariants
+    shape_invariants = nest.map_structure(lambda t: t.shape, loop_vars)
 
   if not name:
     name = "while"
 
   with ops.name_scope(name) as scope:
     with ops.name_scope(None):
-      cond_name = _get_unique_name(("%scond" % scope).replace("/", "_"))
-      body_name = _get_unique_name(("%sbody" % scope).replace("/", "_"))
-
-    num_outputs = len(flattened_loop_vars)
-
+      cond_name = util.unique_fn_name(scope, "cond")
+      body_name = util.unique_fn_name(scope, "body")
+
+    loop_counter = constant_op.constant(
+        0,
+        dtype=maximum_iterations.dtype
+        if maximum_iterations is not None else None,
+        name="loop_counter")
     # Add loop counter needed for computing gradients.
-    flattened_loop_vars = [constant_op.constant(0., name="loop_counter")
-                          ] + flattened_loop_vars
+    loop_vars = [loop_counter] + loop_vars
 
-    flattened_shapes = [tensor_shape.scalar()] + flattened_shapes
+    shape_invariants = type(shape_invariants)([tensor_shape.scalar()
+                                              ]) + shape_invariants
 
-    # Build a `cond` wrapper that can handle the extra counter loop_var.
-    def wrapped_cond(unused_loop_counter, *loop_vars):
-      return cond(*loop_vars)
+    # Automatic control dependencies are added in defuns, but not in v1
+    # graphs. Propagate that behavior here.
+    add_control_dependencies = util.in_defun()
 
-    signature = [
-        tensor_spec.TensorSpec(shape, t.dtype)
-        for shape, t in zip(flattened_shapes, flattened_loop_vars)
-    ]
-    cond_graph = function.func_graph_from_py_func(
-        cond_name, wrapped_cond, flattened_loop_vars, {}, signature=signature)
+    # Build a `cond` wrapper that can handle the extra counter loop_var.
+    def wrapped_cond(loop_counter, *args):
+      # Convert the flow variables in `args` to TensorArrays. `args` should
+      # already have the same structure as `orig_loop_vars` but currently there
+      # is no nest.zip so we call `_pack_sequence_as` which flattens both
+      # `orig_loop_vars` and `args`, converts flows in `args` to TensorArrays
+      # and packs it into the structure of `orig_loop_vars`.
+      if maximum_iterations is None:
+        return cond(*_pack_sequence_as(orig_loop_vars, args))
+      else:
+        return math_ops.logical_and(
+            loop_counter < maximum_iterations,
+            cond(*_pack_sequence_as(orig_loop_vars, args)))
+
+    cond_graph = func_graph_module.func_graph_from_py_func(
+        cond_name,
+        wrapped_cond,
+        loop_vars, {},
+        signature=_build_signature(loop_vars, shape_invariants),
+        func_graph=util.WhileCondFuncGraph(cond_name),
+        add_control_dependencies=add_control_dependencies)
 
     # Add external_captures of cond to the list of loop vars.
     # Note that external tensors will be treated as loop invariants, i.e.,
     # the value of that tensor in each iteration is the same as it was at the
     # beginning of the loop execution.
-    flattened_loop_vars = flattened_loop_vars + cond_graph.external_captures
-    flattened_shapes = flattened_shapes + [
-        t.shape for t in cond_graph.external_captures
-    ]
+    loop_vars = loop_vars + cond_graph.external_captures
+    shape_invariants = shape_invariants + type(shape_invariants)(
+        [t.shape for t in cond_graph.external_captures])
 
     def wrapped_body(loop_counter, *args):
       """Loop body augmented with counter update.
@@ -106,287 +144,429 @@ def while_loop(cond, body, loop_vars, shape_invariants=None, name=None):
       Args:
         loop_counter: Loop counter which needs to be incremented in the body.
         *args: List of args
-          args[:num_outputs] - Args for the original loop body.
-          args[num_outputs:] - External captures of cond. These get passed
-            through as is.
+          args[:len_orig_loop_vars] - Args for the original loop body.
+          args[len_orig_loop_vars:] - External captures of cond. These get
+            passed through as is.
 
       Returns:
         A list of tensors the same length as args.
       """
-      outputs = body(*args[:num_outputs])
-      if not isinstance(outputs, collections.Sequence):
+      # Convert the flow variables in `args` to TensorArrays. `args` should
+      # already have the same structure as `orig_loop_vars` but currently there
+      # is no nest.zip so we call `_pack_sequence_as` which flattens both
+      # `orig_loop_vars` and `args`, converts flows in `args` to TensorArrays
+      # and packs it into the structure of `orig_loop_vars`.
+      outputs = body(
+          *_pack_sequence_as(orig_loop_vars, args[:len_orig_loop_vars]))
+      if not nest.is_sequence(outputs):
         outputs = [outputs]
+      # Compare the structure of input and output of body converting the
+      # top-level tuples to list to be compatible with legacy while_loop.
+      nest.assert_same_structure(list(outputs), list(orig_loop_vars))
+
+      outputs = _tensor_array_to_flow(outputs)
 
       # Return the external_captures of cond_graph as is, i.e., treat them as
       # loop invariants.
       # TODO(srbs): Update lowering code to create _Enter nodes with
       # is_constant=True for inputs that are directly passed to outputs.
-      return [loop_counter + 1] + list(outputs) + list(args[num_outputs:])
-
-    signature = [
-        tensor_spec.TensorSpec(shape, t.dtype)
-        for shape, t in zip(flattened_shapes, flattened_loop_vars)
-    ]
-    body_graph = function.func_graph_from_py_func(
-        body_name, wrapped_body, flattened_loop_vars, {}, signature=signature)
+      return [loop_counter + 1] + list(outputs) + list(
+          args[len_orig_loop_vars:])
+
+    body_graph = func_graph_module.func_graph_from_py_func(
+        body_name,
+        wrapped_body,
+        loop_vars, {},
+        signature=_build_signature(loop_vars, shape_invariants),
+        func_graph=util.WhileBodyFuncGraph(body_name),
+        add_control_dependencies=add_control_dependencies)
     # Add external captures of body to the list of loop vars.
     # Note that external tensors will be treated as loop invariants, i.e.,
     # the value of that tensor in each iteration is the same as it was at the
     # beginning of the loop execution.
-    flattened_loop_vars = flattened_loop_vars + body_graph.external_captures
+    loop_vars = loop_vars + body_graph.external_captures
     # TODO(srbs): Update lowering code to create _Enter nodes with
     # is_constant=True for inputs that are directly passed to outputs.
     body_graph.outputs.extend(body_graph.internal_captures)
 
     # Capture `external_captures` of `body_graph` in `cond_graph` so that it
     # expects to receive those as arguments.
-    # TODO(srbs): Dedup tensors that are captured in both the cond and body.
-    # This logic already exists in cond_v2.
+    # TODO(b/118457764): Dedup tensors that are captured in both the cond and
+    # body. This logic already exists in cond_v2.
     with cond_graph.as_default():
       for external_capture in body_graph.external_captures:
+        assert external_capture not in cond_graph.captures, (
+            "Looks like both cond and body are capturing the same tensor %s. "
+            "This is not supported yet. For now consider passing,"
+            " this as a loop variable." % str(external_capture))
         cond_graph.capture(external_capture)
 
-    # Export all tensors in the loop body that may be needed for gradient
-    # computation. We do this by accumulating the intermediate values in
-    # TensorLists.
-    intermediate_tensors = _get_intermediates(body_graph)
-
-    for intermediate_tensor in intermediate_tensors:
-      # TODO(srbs): Cache and re-use empty tensor lists.
-      tensor_list = list_ops.empty_tensor_list(
-          element_dtype=intermediate_tensor.dtype,
-          element_shape=_get_tensor_convertible_shape(
-              intermediate_tensor.shape))
-      flattened_loop_vars.append(tensor_list)
-      with cond_graph.as_default():
-        # Add a placeholder to cond_graph's inputs corresponding to the
-        # tensor_list.
-        cond_graph.capture(tensor_list)
-      with body_graph.as_default():
-        # Push the intermediate tensor to the tensor list. This captures the
-        # `tensor_list` as well.
-        appended_tensor_list = list_ops.tensor_list_push_back(
-            tensor_list,
-            intermediate_tensor)
-        # Add this modified tensor list to the list of outputs.
-        body_graph.outputs.append(appended_tensor_list)
-
     # Make sure that the shapes of the loop outputs are compatible with the
     # shape invariants, or the shapes of the loop vars if the invariants are not
     # specified.
-    _check_shapes_compat(body_graph.outputs[1:1 + num_outputs],
-                         flattened_shapes[1:1 + num_outputs],
-                         flattened_loop_vars[1:1 + num_outputs])
+    num_flattened_outputs = len(nest.flatten(orig_loop_vars))
+    _check_shapes_compat(
+        body_graph.outputs[1:1 + num_flattened_outputs],
+        nest.flatten(shape_invariants[1:1 + len_orig_loop_vars]),
+        nest.flatten(loop_vars[1:1 + len_orig_loop_vars]))
+    flattened_loop_vars = nest.flatten(loop_vars)
+    _check_num_inputs_outputs(cond_graph, body_graph,
+                              len(flattened_loop_vars))
+
     outputs = gen_functional_ops._while(
         flattened_loop_vars,
-        cond_v2._create_new_tf_function(cond_graph),
-        cond_v2._create_new_tf_function(body_graph),
+        util.create_new_tf_function(cond_graph),
+        util.create_new_tf_function(body_graph),
         output_shapes=[t.shape for t in body_graph.outputs],
         name=scope)
 
     _copy_handle_data(body_graph.outputs, outputs)
-    _maybe_set_lowering_attr(outputs[0].op)
+    util.maybe_set_lowering_attr(outputs[0].op)
+    _maybe_set_maximum_iterations_attr(outputs[0].op, maximum_iterations)
+
+    # Return identities for each output of the While op, rather than the output
+    # of the While op directly. This makes pruning work if the output of
+    # while_loop() is fetched: the lowering pass converts the While outputs into
+    # IdentityN outputs, which if fetched will cause all ops in the body to be
+    # run (since it takes all exit ops as input). After lowering, each output
+    # identity op will end up with only the appropriate exit op as input.
+    outputs = tuple(array_ops.identity(t) for t in outputs)
 
   # First var is loop counter.
-  if num_outputs == 1:
-    return outputs[1]
+  outputs = _pack_sequence_as(orig_loop_vars,
+                              outputs[1:1 + num_flattened_outputs])
+
+  if return_same_structure:
+    return outputs
+
+  flattened_outputs = nest.flatten(outputs)
+  if len(flattened_outputs) == 1:
+    return flattened_outputs[0]
   else:
-    return nest.pack_sequence_as(loop_vars, outputs[1:1 + num_outputs])
+    return outputs
 
 
 @ops.RegisterGradient("While")
 def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   """The gradient of a While op produced by while_loop."""
-  body_graph = _get_body_graph(op)
-
-  # Replace None gradients with zeros. This is needed because `grads` could have
-  # None incoming gradients for the TensorLists. If we pass None's through, the
-  # custom gradient of TensorListPopBack will create an EmptyTensorList inside
-  # the FuncGraph which is undesirable.
-  # TODO(b/80444525): There might be an issue with treating no gradient as zero
-  # gradient in certain cases. Consider replacing None gradients with Zeros
-  # for accumulators only.
+  cond_graph = _get_graph(op, "cond")
+  body_graph = _get_graph(op, "body")
+  orig_num_params = len(body_graph.outputs)
+
+  maximum_iterations = op.get_attr(
+      "_maximum_iterations") if _is_in_xla_context() else None
+  assert not _is_in_xla_context() or maximum_iterations is not None
+
+  # Set the incoming gradient of TensorArray handles to None. The gradient
+  # implementation currently assumes all resource tensors correspond to float32
+  # ResourceVariables, which can lead to runtime shape errors when used with a
+  # TensorArray. This is a workaround until TensorArrays are reimplemented with
+  # TensorLists instead of resources.
+  # Also set the incoming gradient of non-trainable inputs to None. It is
+  # possible that we receive non-None gradients for non-trainable types in
+  # nested while loops because we accumulate outputs of the inner while as
+  # variant tensors which are trainable and hence receive zeros_like tensors in
+  # the gradient pass. The non-trainable tensors then receive the popped zeros
+  # tensor from this zeros variant. The gradient for the loop vars corresponding
+  # to these tensors is None or zeros (this happens only if the loop var is
+  # accumulated as well) in _grad_fn so we reset these.
+  # TODO(b/118712257): Remove the IsTrainable filter once we can handle None
+  # output grads in _grad_fn.
   grads = [
-      g if g is not None else array_ops.zeros_like(output)
-      for g, output in zip(grads, op.outputs)
+      None if _is_tensor_array_handle(output) or not _is_trainable(output)
+      else grad for grad, output in zip(grads, body_graph.outputs)
   ]
 
+  # Ensure that all non-resource trainable outputs have incoming gradients.
+  assert all(g is not None or o.dtype == dtypes.resource or not _is_trainable(o)
+             for o, g in zip(body_graph.outputs, grads)
+            ), "All trainable loop vars must receive incoming gradients."
+  # We compute the gradient for the sub-graph between trainable ys and xs
+  # with non-None incoming gradients. We later pad the None's to the list of
+  # outputs.
+  ys, xs, non_none_grads = zip(*[(y, x, grad) for (y, x, grad) in zip(
+      body_graph.outputs, body_graph.inputs, grads) if grad is not None])
+
   body_grad_graph, args = _create_grad_func(
-      body_graph, grads,
-      _get_unique_name("%s_grad" % body_graph.name), op)
-
-  intermediate_tensors = _get_intermediates(body_grad_graph)
-
-  for intermediate_tensor in intermediate_tensors:
-    tensor_list = list_ops.empty_tensor_list(
-        element_dtype=intermediate_tensor.dtype,
-        element_shape=_get_tensor_convertible_shape(intermediate_tensor.shape))
-    with body_grad_graph.as_default():
-      tensor_list_ph = body_grad_graph.capture(tensor_list, whitelisted=True)
-      # Push the intermediate tensor to the tensor list.
-      appended_tensor_list = list_ops.tensor_list_push_back(tensor_list_ph,
-                                                            intermediate_tensor)
-      # Add this modified tensor list to the list of outputs.
-      body_grad_graph.outputs.append(appended_tensor_list)
+      ys, xs, non_none_grads, cond_graph, body_graph,
+      util.unique_grad_fn_name(body_graph.name), op, maximum_iterations)
+
+  if body_grad_graph.while_op_needs_rewrite:
+    # Modify 'op' to output the intermediate accumulators needed by the grad
+    # function.
+    # NOTE(skyewm): if there are any active sessions, this modification to `op`
+    # may make them unrunnable!
+
+    cond_graph.name += "_rewritten"
+    body_graph.name += "_rewritten"
+
+    new_inputs = body_grad_graph.empty_tensor_lists
+    new_outputs = body_graph.outputs[orig_num_params:]
+
+    op._set_func_attr("cond", util.create_new_tf_function(cond_graph))
+    op._set_func_attr("body", util.create_new_tf_function(body_graph))
+    op._set_type_list_attr("T", body_graph.output_types)
+    op._set_shape_list_attr("output_shapes", body_graph.output_shapes)
+    op._add_while_inputs(new_inputs)
+    op._add_outputs([t.dtype for t in new_outputs],
+                    [t.shape for t in new_outputs])
+    _copy_handle_data(new_outputs, op.outputs[orig_num_params:])
+
+  captured_inputs = _resolve_grad_captures(body_graph, body_grad_graph, op)
+  loop_vars = args + captured_inputs
 
   def grad_cond(counter, max_iters, *unused_args):
     return counter < max_iters
 
-  loop_vars = args + body_grad_graph.external_captures
-  cond_grad_graph = function.func_graph_from_py_func(
-      _get_unique_name("%s_grad_cond" % op.name),
-      grad_cond, loop_vars, {})
+  grad_cond_name = util.unique_grad_fn_name(op.get_attr("cond").name)
+  cond_grad_graph = func_graph_module.func_graph_from_py_func(
+      grad_cond_name, grad_cond, loop_vars, {},
+      func_graph=util.WhileCondFuncGraph(grad_cond_name))
 
-  assert len(loop_vars) == len(body_grad_graph.inputs)
-  assert len(loop_vars) == len(body_grad_graph.outputs)
-  assert len(loop_vars) == len(cond_grad_graph.inputs)
+  _check_num_inputs_outputs(cond_grad_graph, body_grad_graph, len(loop_vars))
 
   outputs = gen_functional_ops._while(
       loop_vars,
-      cond_v2._create_new_tf_function(cond_grad_graph),
-      cond_v2._create_new_tf_function(body_grad_graph),
+      util.create_new_tf_function(cond_grad_graph),
+      util.create_new_tf_function(body_grad_graph),
       output_shapes=[t.shape for t in body_grad_graph.outputs],
-      name=_get_unique_name("%s_grad" % op.name))
+      name="%s_grad" % op.name)
 
   _copy_handle_data(body_grad_graph.outputs, outputs)
-  _maybe_set_lowering_attr(outputs[0].op)
+  util.maybe_set_lowering_attr(outputs[0].op)
+  _maybe_set_maximum_iterations_attr(outputs[0].op, maximum_iterations)
+
+  # See comment in while_loop.
+  outputs = [array_ops.identity(t) for t in outputs]
 
+  # Set None as the output gradient for tensors with None input gradient
+  # e.g. TensorArray handles.
   # outputs[0] is the loop counter.
   # outputs[1] is the total number of loop iterations.
-  return outputs[2:2 + len(op.inputs)]
+  index = 2
+  none_padded_outputs = []
+  for g in grads:
+    if g is None:
+      none_padded_outputs.append(None)
+    else:
+      none_padded_outputs.append(outputs[index])
+      index += 1
+  return none_padded_outputs
+
+
+def _is_trainable(tensor):
+  """Returns whether the given tensor is trainable."""
+  if not gradients_impl.IsTrainable(tensor):
+    return False
+
+  # Special case: untrainable accumulator output. The gradients algorithm
+  # doesn't know about tensor lists of untrainable elements. In theory the
+  # tensor list gradient functions should return None as appropriate, but
+  # because we can't return None from the gradient function we filter out
+  # untrainable accumulator output here to avoid computing the gradient at all.
+  if tensor.op.type == "TensorListPopBack" and tensor.value_index == 0:
+    assert tensor.dtype == dtypes.variant
+    element_type = tensor.op.get_attr("element_dtype")
+    return gradients_impl.IsTrainable(element_type)
+
+  return True
+
+
+def _validate_and_convert_to_tensor(maximum_iterations):
+  """Checks that `maximum_iterations` is valid.
+
+  In XLA context, `maximum_iterations` is required and must be statically
+  inferable, e.g. output tensor of a Const node.
+
+  Args:
+    maximum_iterations: The maximum_iterations passed to while_loop.
+
+  Returns:
+    A scalar valued tensor of type int32 or None.
+
+  Raises:
+    ValueError: If `maximum_iterations` is invalid.
+  """
+  if _is_in_xla_context():
+    if maximum_iterations is None:
+      raise ValueError("maximum_iterations is None. It is required and must "
+                       "be statically known (e.g. a constant value or known "
+                       "shape dimension) when building while_loop in XLA "
+                       "context.")
+    if isinstance(maximum_iterations, ops.Tensor):
+      # Get the constant value from the `maximum_iterations` tensor to avoid
+      # capturing a Const tensor from outside this graph.
+      maximum_iterations = tensor_util.constant_value(maximum_iterations)
+      if maximum_iterations is None:
+        raise ValueError("maximum_iterations must be statically known (e.g. a "
+                         "constant value or known shape dimension) when "
+                         "building while_loop in XLA context.")
+
+  if maximum_iterations is not None:
+    # EmptyTensorList expects `max_num_elements` to be of type int32.
+    maximum_iterations = ops.convert_to_tensor(
+        maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
+    if maximum_iterations.shape.ndims != 0:
+      raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
+                       maximum_iterations.shape)
+  return maximum_iterations
 
 
 # TODO(srbs): Pull this into common utils for cond_v2 and while_v2.
-def _get_body_graph(while_op):
-  """Returns `FuncGraph` for the while body.
+def _get_graph(while_op, func_attr_name):
+  """Returns `FuncGraph` for the given function attribute.
 
   Args:
     while_op: The While Operation.
+    func_attr_name: string
 
   Returns:
-    `FuncGraph` for the while body.
+    `FuncGraph`
   """
   # TODO(srbs): Handle TensorShapeProto in function_def_to_graph.input_shapes.
   input_shapes = [
       tensor_shape.TensorShape(s) for s in while_op.get_attr("output_shapes")
   ]
-  func_name = while_op.get_attr("body").name
+  func_name = while_op.get_attr(func_attr_name).name
   fdef = while_op.graph._get_function(func_name).definition
-  func_graph = function_def_to_graph.function_def_to_graph(fdef, input_shapes)
+  # `while_op.graph` may not be the same as `ops.get_default_graph()` e.g.
+  # if the `while_op` is in the body of another if/while/defun. We build the
+  # `func_graph` with `while_op.graph` as its `outer_graph`. This resembles how
+  # the `FuncGraph` was built in the forward pass. We need this so that we can
+  # appropriately capture references to outer tensors in the nested grad graphs.
+  with while_op.graph.as_default():
+    func_graph = function_def_to_graph.function_def_to_graph(fdef, input_shapes)
   func_graph._while = while_op
   return func_graph
 
 
-def _create_grad_func(func_graph, grads, name, while_op):
+def _create_grad_func(ys, xs, grads, cond_graph, body_graph, name, while_op,
+                      max_iters):
   """Builds and returns the gradient FuncGraph of `func_graph` and its args.
 
   The returned grad_func_graph must be called with the returned
   args + grad_func_graph.captures.
 
   Args:
-    func_graph: FuncGraph for the forward body function.
-    grads: The incoming grads for `func_graph`'s outputs.
+    ys: A `Tensor` or list of tensors to be differentiated.
+    xs: A `Tensor` or list of tensors to be used for differentiation.
+    grads: The incoming grads for `ys`.
+    cond_graph: FuncGraph for the forward cond function.
+    body_graph: FuncGraph for the forward body function.
     name: Name of the returned gradient function.
     while_op: The forward While op.
+    max_iters: the maximum number of iterations, or None if no limit.
 
   Returns:
     2-tuple of (grad_func_graph, args).
   """
-  assert len(func_graph.outputs) == len(grads)
+  assert len(ys) == len(grads)
 
-  loop_counter = constant_op.constant(0.)
-  # TODO(srbs): For nested while loops will need to lookup this value from
-  # the accumulator of the enclosing while loop. For now use as is assuming
-  # there is no nesting.
-  num_iters_t = while_op.outputs[0]
-
-  args = [loop_counter, num_iters_t] + grads
+  total_iters = while_op.outputs[0]
+  counter = constant_op.constant(
+      0, dtype=total_iters.dtype, name="grad_counter")
 
+  args = [counter, total_iters] + list(grads)
   # Note: The returned function does not have `args` in the list of
   # `external_captures`.
-  grad_func_graph = function.func_graph_from_py_func(
+  grad_func_graph = func_graph_module.func_graph_from_py_func(
       name,
-      lambda *args: _grad_fn(func_graph, args),
+      lambda *args: _grad_fn(ys, xs, args, body_graph),
       args, {},
-      func_graph=_WhileBodyGradFuncGraph(name, func_graph))
+      func_graph=_WhileBodyGradFuncGraph(name, cond_graph, body_graph,
+                                         max_iters))
 
   # Add the popped accumulators to the list of outputs.
   for internal_capture in grad_func_graph.internal_captures:
-    grad_func_graph.outputs.append(
-        grad_func_graph.popped_tensor_lists[internal_capture])
+    if internal_capture in grad_func_graph.popped_tensor_lists:
+      grad_func_graph.outputs.append(
+          grad_func_graph.popped_tensor_lists[internal_capture])
+    elif internal_capture.dtype == dtypes.resource:
+      grad_func_graph.outputs.append(internal_capture)
+    else:
+      raise ValueError("Tensor %s is in list of internal_captures but is"
+                       " neither a resource nor is in popped_tensor_lists." %
+                       str(internal_capture))
 
   return grad_func_graph, args
 
 
-def _grad_fn(func_graph, args):
+def _grad_fn(ys, xs, args, func_graph):
   """Computes the gradient of `func_graph` in the current graph.
 
   This function builds the gradient graph of the corresponding forward-pass
   `func_graph` by differentiating `func_graph`'s outputs w.r.t. its inputs.
 
   Args:
+    ys: A `Tensor` or list of tensors to be differentiated.
+    xs: A `Tensor` or list of tensors to be used for differentiation.
+    args: The input arguments.
+      args[0] - Loop counter
+      args[1] - Total number of iterations.
+      args[2:] - Incoming gradients for `ys`.
     func_graph: function.FuncGraph. The corresponding forward-pass function.
-    args: The input arguments. args[0] - Loop counter args[1] - Total number of
-      iterations.
-      args[2:] - Incoming gradients for `func_graph.outputs`.
 
   Returns:
     The output gradient Tensors.
   """
-  xs = func_graph.inputs
-  ys = func_graph.outputs
   grad_ys = args[2:]
 
   # Build the gradient graph. Note that this builds the gradient computation of
   # func_graph in the current graph, which requires capturing tensors from
   # func_graph. The captured func_graph tensors are resolved to external tensors
-  # in _resolve_grad_inputs.
+  # after the forward While op has been rewritten in _resolve_grad_captures.
   # TODO(srbs): Mark GradientsHelper as public?
   grad_outs = gradients_impl._GradientsHelper(
-      ys, xs, grad_ys=grad_ys, src_graph=func_graph)
+      ys, xs, grad_ys=grad_ys, src_graph=func_graph,
+      unconnected_gradients="zero")
 
-  assert all([g is not None for g in grad_outs])
+  # TODO(b/118712257): Handle the case when grad_outs has None's e.g. when there
+  # is a tf.StopGradient in the loop body.
+  assert all(g is not None for g in grad_outs)
   counter = args[0]
   total_iters = args[1]
   return [counter + 1, total_iters] + grad_outs
 
 
-def _get_intermediates(func_graph):
-  """Returns all tensors in `func_graph` that should be accumulated."""
-  # We currently accumulate output tensors of most ops in the function and rely
-  # on the pruning pass to get rid of the unused accumulators at runtime.
-  # However, this can bloat the GraphDef and make debugging harder so we perform
-  # some optimizations.
-  #
-  # Optimization we currently perform:
-  # 1. We do not accumulate tensors which already have an accumulator
-  #    in the loop body.
-  # 2. We do not accumulate outputs of Identity nodes. When building the
-  #    FuncGraph, we add an Identity node for each output (see
-  #    `AutomaticControlDependencies.mark_as_return`). Accumulating outputs
-  #    of all these nodes bloats the GraphDef quite a bit so we remove those.
-  #    Since the gradient of an Identity node does not rely on its forward op's
-  #    input this is safe to do.
-  #
-  # Other possible optimizations:
-  # 1. Only accumulate tensors that will be required by the backward pass.
-  #    This will require running the gradient pass and hence would increase the
-  #    graph building time for the forward pass.
-  # 2. Do not accumulate Const nodes created inside the loop body.
-  # 3. Do not accumulate inputs that are passed as-is, e.g. loop invariants.
-  # TODO(srbs): 2 and 3 may be hard optimizations for the runtime optimizer
-  # since it requires knowledge of the while loop semantics. If so, consider
-  # doing those here.
-  intermediates = []
-
-  for op in func_graph.get_operations():
-    if op.type == "Identity":
-      continue
-    for o in op.outputs:
-      if (o != func_graph.inputs[0] and  # Loop counter.
-          _get_accumulator(o) is None):  # Has existing accumulator.
-        intermediates.append(o)
-  return intermediates
+def _resolve_grad_captures(body_graph, body_grad_graph, while_op):
+  """Returns the tensors to pass as captured inputs to `body_grad_graph`.
+
+  `body_grad_graph` may have external references to:
+  1. Its outer graph containing the input gradients. These are left as-is.
+  2. Accumulators captured from the forward-pass graph. These should have been
+     added as `while_op` outputs after the gradient graph was built. We replace
+     these with the corresponding output of `while_op`, i.e. a tensor in
+     `body_graph.outer_graph`. In the case of nested control flow or functions,
+     the gradient logic handling `body_grad_graph.outer_graph` will make sure
+     the tensor from `body_graph.outer_graph` is also correctly captured.
+
+  Args:
+    body_graph: FuncGraph. The forward-pass body function.
+    body_grad_graph: FuncGraph. The body gradients function.
+    while_op: The forward-pass While Operation calling `body_graph`.
+
+  Returns:
+    A list of input tensors to be passed as the captured inputs to
+      `body_grad_graph`.
+  """
+  new_capture_inputs = []
+  for t in body_grad_graph.external_captures:
+    # All values captured by gradient computation should be from the forward
+    # graph or a captured resource variable (note that input gradients are
+    # regular non-captured inputs).
+    if t.graph == body_graph:
+      # Captured accumulator
+      t = while_op.outputs[t.graph.outputs.index(t)]
+      # Note: We rely on the capturing logic of the gradient While op graph to
+      # correctly capture the tensors in `body_graph.outer_graph`. Both cond_v2
+      # and while_v2 handle this while building their gradient functions.
+      assert t.graph == body_graph.outer_graph
+    else:
+      # Captured resource variable
+      assert t.dtype == dtypes.resource
+
+    new_capture_inputs.append(t)
+  return new_capture_inputs
 
 
 def _get_accumulator(tensor):
@@ -416,7 +596,7 @@ def _get_accumulator(tensor):
     A variant tensor in the same graph as `tensor` or None if no accumulator is
     found.
   """
-  assert isinstance(tensor.graph, function.FuncGraph)
+  assert isinstance(tensor.graph, func_graph_module.FuncGraph)
 
   def get_func_graph_output(t):
     """Returns t or Identity(t) whichever exists in graph outputs else None."""
@@ -449,30 +629,23 @@ def _get_accumulator(tensor):
   return None
 
 
-# TODO(srbs): Add to common utils for cond_v2 and while_v2.
-def _get_unique_name(name):
-  """Returns a name that is unique in the root graph of `func_graph`.
-
-  Args:
-    name: String to uniquify.
-
-  Returns:
-    A string.
-  """
-  with ops.init_scope():
-    return ops.get_default_graph().unique_name(name)
-
-
-class _WhileBodyGradFuncGraph(function.FuncGraph):
+class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
   """FuncGraph for the gradient function of the body of a While op.
 
   Contains the logic for capturing the tensors from the body of the forward
   While op which is as follows:
-  1. Find the accumulator for that tensor.
-  2. Capture the forward While op output tensor corresponding to the
-     accumulator in this FuncGraph.
-  3. Pop a value from the captured placeholder and use it as the captured value
-     for the forward pass tensor.
+  1. If the tensor is of resource type (these are not accumulated):
+     a. Ensure that the tensor is a loop invariant, i.e., it exists in both loop
+        inputs and outputs at the same index.
+     b. Lookup the corresponding resource tensor in the forward outer graph and
+        try to capture that.
+  2. If the tensor is not of resource type:
+     a. Create an accumulator for that tensor and output it from the forward
+        pass. Note this also requires adding it as an input to the forward pass.
+     b. Capture the accumulator from the forward pass in this FuncGraph. This
+        will later be resolved to the correct output of the forward While op.
+     c. Pop a value from the captured placeholder and use it as the captured
+        value for the forward pass tensor.
 
   This only allows capturing tensors in the forward graph. A ValueError is
   raised if an attempt is made to capture a tensor not in the forward graph.
@@ -484,22 +657,37 @@ class _WhileBodyGradFuncGraph(function.FuncGraph):
   tensor.
 
   Attributes:
-    popped_tensor_lists: Dict from the captured accumulator placeholder to the
+    while_op_needs_rewrite: True if any non-resource intermediates were
+      captured, meaning the forward While op needs to be rewritten to output the
+      corresponding accumulators.
+    empty_tensor_lists: list of EmptyTensorList tensors to be used as initial
+      input to the new accumulators in the forward graph.
+    popped_tensor_lists: dict from the captured accumulator placeholder to the
       TensorList obtained after popping the intermediate tensor from it. The
       values of this dict need to be added to the list of outputs.
   """
 
-  def __init__(self, name, forward_graph):
+  def __init__(self, name, forward_cond_graph, forward_body_graph, max_iters):
     super(_WhileBodyGradFuncGraph, self).__init__(name)
+    self.empty_tensor_lists = []
     self.popped_tensor_lists = {}
     # FuncGraph for the body of the forward While op.
-    self._forward_graph = forward_graph
-    # Dict from forward intermediate tensor to the corresponding "popped" tensor
-    # in this graph.
+    self._forward_graph = forward_body_graph
+    # FuncGraph for the cond of the forward While op.
+    self._forward_cond_graph = forward_cond_graph
+    self._maximum_iterations = max_iters
+    # Dict from forward intermediate tensor to its indirectly captured tensor
+    # in this graph. Indirect capturing happens in two ways:
+    # 1. For non-resource tensors we capture their accumulators from the forward
+    #    outer graph and pop values from that accumulator inside this graph
+    #    using TensorListPopBack.
+    # 2. For resource tensors we directly capture their corresponding tensor
+    #    in the forward outer graph.
     self._indirect_captures = {}
-    # Dict from forward graph tensor to the While op output corresponding to its
-    # accumulator.
-    self._tensor_to_accumulator = {}
+
+  @property
+  def while_op_needs_rewrite(self):
+    return self.empty_tensor_lists
 
   def capture(self, tensor, name=None, whitelisted=False):
     """Selectively captures external tensors.
@@ -531,39 +719,78 @@ class _WhileBodyGradFuncGraph(function.FuncGraph):
     if tensor.graph is not self._forward_graph:
       return super(_WhileBodyGradFuncGraph, self)._capture_helper(tensor, name)
 
+    while tensor.op.type == "Identity":
+      # We do not accumulate the output of identity nodes so we try to capture
+      # the input of the Identity node instead.
+      tensor = tensor.op.inputs[0]
+
     captured_tensor = self._indirect_captures.get(tensor)
     if captured_tensor is not None:
-      # For GradientTape housekeeping.
-      assert self._tensor_to_accumulator[tensor] in self.captures
-      super(_WhileBodyGradFuncGraph, self)._capture_helper(
-          self._tensor_to_accumulator[tensor], name)
       return captured_tensor
 
-    assert tensor not in self._tensor_to_accumulator
-
-    accumulator = None
-
-    # Find the TensorList that was used to accumulate the tensors of this
-    # intermediate tensor.
+    if tensor.dtype == dtypes.resource:
+      # Resource-type tensors are not accumulated.
+      # If a resource tensor exists in the loop body it must either be a loop
+      # input or an output of a nested While op inside the loop body which
+      # had captured the external resource.
+      if tensor in self._forward_graph.inputs:
+        index = self._forward_graph.inputs.index(tensor)
+      elif tensor.op.type == "While":
+        # Captured resources occur at the same index in the lists of inputs and
+        # outputs of a while op. So we lookup the input of `tensor.op` at the
+        # same index as the index of `tensor` in the `tensor.op.outputs`.
+        index = self._forward_graph.inputs.index(
+            tensor.op.inputs[tensor.value_index])
+      else:
+        raise ValueError(
+            "Taking gradient of a while loop which creates"
+            " a resource in its body is not supported: %s" % str(tensor))
+      # This must be a loop invariant.
+      assert self._forward_graph.inputs[index] == self._forward_graph.outputs[
+          index], "Resource tensors must be loop invariants %s." % str(
+              self._forward_graph._while.inputs[index])
+      tensor_in_outer_graph = self._forward_graph._while.inputs[index]
+      self._indirect_captures[tensor] = self.capture(
+          tensor_in_outer_graph, whitelisted=True)
+      return self._indirect_captures[tensor]
+
+    # Create or find an existing accumulator output for `tensor` in the forward
+    # graph, and fetch from this accumulator in the gradient graph to get the
+    # raw intermediate value.
     accumulator = _get_accumulator(tensor)
     if accumulator is None:
-      raise ValueError("Reference to un-accumulated intermediate tensor: ",
-                       tensor.name)
-    assert accumulator.graph == self._forward_graph
-    # Get the While op output corresponding to the accumulator.
-    accumulator = self._forward_graph._while.outputs[self._forward_graph.outputs
-                                                     .index(accumulator)]
-
-    assert accumulator.graph == self._forward_graph.outer_graph
-    self._tensor_to_accumulator[tensor] = accumulator
-
-    # Capture the `accumulator`.
-    accumulator_ph = super(_WhileBodyGradFuncGraph, self)._capture_helper(
+      # Create the initial empty tensor list.
+      with self._forward_graph.outer_graph.as_default():
+        tensor_list = list_ops.empty_tensor_list(
+            element_dtype=tensor.dtype, element_shape=tensor.shape,
+            max_num_elements=self._maximum_iterations)
+      self.empty_tensor_lists.append(tensor_list)
+
+      # Push the intermediate tensor to the tensor list. This captures
+      # `tensor_list`.
+      with self._forward_graph.as_default():
+        accumulator = list_ops.tensor_list_push_back(tensor_list, tensor)
+      # Add the modified tensor list to the list of outputs. This output will be
+      # all the accumulated values.
+      self._forward_graph.outputs.append(accumulator)
+
+      # Capture in the cond graph as well so the forward cond and body inputs
+      # match.
+      with self._forward_cond_graph.as_default():
+        self._forward_cond_graph.capture(tensor_list)
+
+    # Capture the accumulator tensor list in the gradient graph directly from
+    # the forward graph -- we'll later modify this to capture the final list
+    # output by the forward While op instead.
+    captured_accumulator = super(_WhileBodyGradFuncGraph, self)._capture_helper(
         accumulator, name)
+
+    # Pop the intermediate value from the tensor list in the gradient graph.
     new_tensor_list, captured_tensor = list_ops.tensor_list_pop_back(
-        accumulator_ph, element_dtype=tensor.dtype)
+        captured_accumulator, element_dtype=tensor.dtype)
+
     self._indirect_captures[tensor] = captured_tensor
-    self.popped_tensor_lists[accumulator_ph] = new_tensor_list
+    self.popped_tensor_lists[captured_accumulator] = new_tensor_list
     return captured_tensor
 
 
@@ -578,50 +805,116 @@ def _check_shapes_compat(output_tensors, shape_invariants, input_tensors):
           "specify a less-specific shape." % (input_t.name, shape, t.shape))
 
 
-def _copy_handle_data(src_tensors, tgt_tensors):
-  for src_t, tgt_t in zip(src_tensors, tgt_tensors):
-    function._copy_handle_data(src_t, tgt_t)
+def _check_num_inputs_outputs(cond_graph, body_graph, num_flattened_loop_vars):
+  """Checks the number of inputs/outputs of `cond_graph` and `body_graph`."""
+  assert len(cond_graph.inputs) == num_flattened_loop_vars, (
+      "cond_graph takes %d inputs; Expected: %d" % (len(cond_graph.inputs),
+                                                    num_flattened_loop_vars))
+  assert len(cond_graph.outputs) == 1, (
+      "cond_graph has %d outputs; Expected: 1" % len(cond_graph.outputs))
+  assert len(body_graph.inputs) == num_flattened_loop_vars, (
+      "body_graph takes %d inputs; Expected: %d" % (len(cond_graph.inputs),
+                                                    num_flattened_loop_vars))
+  assert len(body_graph.outputs) == num_flattened_loop_vars, (
+      "body_graph has %d outputs; Expected: %d" % (len(body_graph.outputs),
+                                                   num_flattened_loop_vars))
 
 
-# TODO(srbs): Move to common utils for cond_v2 and while_v2.
-def _maybe_set_lowering_attr(op):
-  """Sets the flag to enable lowering on the `While` op if necessary.
-
-  Lowering allows while_v2 to avoid some of the limitations of Functions,
-  allowing users to specify devices & colocation inside of while_v2
-  branches, and enabling non-strict evaluation & partial pruning of while_v2
-  branches. This brings while_v2 closer to feature parity with
-  tf.while_loop.
+def _copy_handle_data(src_tensors, tgt_tensors):
+  for src_t, tgt_t in zip(src_tensors, tgt_tensors):
+    custom_gradient.copy_handle_data(src_t, tgt_t)
 
-  However, we do not lower `While` in the XLA context because it is easier
-  for XLA to apply its own optimizations when dealing with un-lowered
-  `While` operators than with low-level control flow primitives.
 
-  Args:
-    op: The While op.
-  """
-  if not control_flow_util.IsInXLAContext(op):
-    # pylint: disable=protected-access
-    op._set_attr("_lower_using_switch_merge", attr_value_pb2.AttrValue(b=True))
-    # pylint: enable=protected-access
+def _maybe_set_maximum_iterations_attr(op, maximum_iterations):
+  if control_flow_util.IsInXLAContext(op):
+    # Store the maximum_iterations to use in the gradient pass.
+    op._set_attr(  # pylint: disable=protected-access
+        "_maximum_iterations",
+        attr_value_pb2.AttrValue(
+            i=tensor_util.constant_value(maximum_iterations)))
 
 
-def _get_tensor_convertible_shape(shape):
-  assert isinstance(shape, tensor_shape.TensorShape)
-  if shape.is_fully_defined():
-    return shape
-  if not shape:  # Unknown shape.
-    return -1
-  # Partially defined shape.
-  shape_list = shape.as_list()
-  shape_list = [s if s is not None else -1 for s in shape_list]
-  return ops.convert_to_tensor(shape_list)
+# TODO(srbs): This method should be in control_flow_util but that introduces
+# a circular dependency ops -> control_flow_util -> ops.
+def _is_in_xla_context():
+  """Returns whether the current context is inside an XLA context."""
+  outer_graph = ops.get_default_graph()
+  # The `_control_flow_context` is not copied when building a FuncGraph so
+  # we look it up from the base graph.
+  while isinstance(outer_graph, func_graph_module.FuncGraph):
+    outer_graph = outer_graph.outer_graph
+  cur_ctxt = outer_graph._get_control_flow_context()  # pylint: disable=protected-access
+  return control_flow_util.GetContainingXLAContext(cur_ctxt) is not None
 
 
 def _graph_name(graph):
-  if isinstance(graph, function.FuncGraph):
+  if isinstance(graph, func_graph_module.FuncGraph):
     return graph.name
   return "Base"
 
 
+def _is_tensor_array_handle(tensor):
+  """Returns whether tensor is a TensorArray handle."""
+  if tensor.dtype != dtypes.resource:
+    return False
+
+  if tensor.op.type == "While":
+    # We assume that any resource outputs of a While op correspond to a captured
+    # resource input (as opposed to a loop variable specified by the user).
+    # NOTE(skyewm): we could actually check this, but I can't think of when you
+    # would have a resource loop variable.
+    tensor = tensor.op.inputs[tensor.value_index]
+
+  # TODO(b/118452219): add test coverage for this.
+  tensor = func_graph_module.maybe_captured(tensor)
+
+  if isinstance(tensor, ops.EagerTensor):
+    # Eager execution doesn't quite support legacy tensorarray
+    return False
+
+  return tensor.op.type in TENSOR_ARRAY_HANDLE_OPS
+
+
+def _pack_sequence_as(structure_with_tas, loop_vars):
+  """Like `nest.pack_sequence_as` but also replaces flows with TensorArrays."""
+
+  def flow_to_tensor_array(flow, ta):  # pylint: disable=missing-docstring
+    if isinstance(ta, tensor_array_ops.TensorArray):
+      # pylint: disable=protected-access
+      new_ta = tensor_array_ops.TensorArray(
+          dtype=ta.dtype,
+          handle=ta.handle,
+          flow=flow,
+          infer_shape=ta._infer_shape,
+          colocate_with_first_write_call=ta._colocate_with_first_write_call)
+      new_ta._colocate_with = ta._colocate_with
+      new_ta._element_shape = ta._element_shape
+      # pylint: enable=protected-access
+      return new_ta
+    return flow
+
+  flattened_loop_vars = [
+      flow_to_tensor_array(*z)
+      for z in zip(nest.flatten(loop_vars), nest.flatten(structure_with_tas))
+  ]
+  return nest.pack_sequence_as(structure_with_tas, flattened_loop_vars)
+
+
+def _tensor_array_to_flow(loop_vars):
+
+  def f(maybe_ta):
+    if isinstance(maybe_ta, tensor_array_ops.TensorArray):
+      return maybe_ta.flow
+    return maybe_ta
+
+  return nest.map_structure(f, loop_vars)
+
+
+def _build_signature(loop_vars, shape_invariants):
+  return nest.pack_sequence_as(loop_vars, [
+      tensor_spec.TensorSpec(s, t.dtype, name=t.op.name)
+      for s, t in zip(nest.flatten(shape_invariants), nest.flatten(loop_vars))
+  ])
+
+
 # pylint: enable=protected-access
diff --git a/tensorflow/examples/tutorials/estimators/__init__.py b/tensorflow/python/platform/__init__.py
similarity index 100%
rename from tensorflow/examples/tutorials/estimators/__init__.py
rename to tensorflow/python/platform/__init__.py
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 4c91bc3652dc77274acfbf43859c03fad8a46a38..7b917235c0a73421552b7aebaa3192de969e5f3a 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -108,7 +108,7 @@ def _define_help_flags():
     _define_help_flags_called = True
 
 
-@tf_export('app.run')
+@tf_export(v1=['app.run'])
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
 
diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py
index fa17b17d104221990ed7847b725c4b741cb4aca7..d6773d7b8136f93080b122f52b77513305aecdb6 100644
--- a/tensorflow/python/platform/benchmark.py
+++ b/tensorflow/python/platform/benchmark.py
@@ -27,8 +27,10 @@ import time
 import six
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.util import test_log_pb2
 from tensorflow.python.client import timeline
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
@@ -182,6 +184,19 @@ class Benchmark(six.with_metaclass(_BenchmarkRegistrar, object)):
         throughput=throughput, extras=extras)
 
 
+@tf_export("test.benchmark_config")
+def benchmark_config():
+  """Returns a tf.ConfigProto for disabling the dependency optimizer.
+
+    Returns:
+      A TensorFlow ConfigProto object.
+  """
+  config = config_pb2.ConfigProto()
+  config.graph_options.rewrite_options.dependency_optimization = (
+      rewriter_config_pb2.RewriterConfig.OFF)
+  return config
+
+
 @tf_export("test.Benchmark")
 class TensorFlowBenchmark(Benchmark):
   """Abstract class that provides helpers for TensorFlow benchmarks."""
@@ -285,6 +300,18 @@ class TensorFlowBenchmark(Benchmark):
     benchmark_values["extras"].update(unreported_extras)
     return benchmark_values
 
+  def evaluate(self, tensors):
+    """Evaluates tensors and returns numpy values.
+
+    Args:
+      tensors: A Tensor or a nested list/tuple of Tensors.
+
+    Returns:
+      tensors numpy values.
+    """
+    sess = ops.get_default_session() or self.cached_session()
+    return sess.run(tensors)
+
 
 def _run_benchmarks(regex):
   """Run benchmarks that match regex `regex`.
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index 5927bc2409bb2744c2f6f003b90c0682e5ba5eb9..d0159e9e9816ba730c843d2b46936b142d47ff79 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -37,7 +37,7 @@ from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('gfile.GFile', 'gfile.Open')
+@tf_export(v1=['gfile.GFile', 'gfile.Open'], v2=['io.gfile.GFile'])
 class GFile(_FileIO):
   """File I/O wrappers without thread locking.
 
@@ -52,7 +52,7 @@ class GFile(_FileIO):
     super(GFile, self).__init__(name=name, mode=mode)
 
 
-@tf_export('gfile.FastGFile')
+@tf_export(v1=['gfile.FastGFile'])
 class FastGFile(_FileIO):
   """File I/O wrappers without thread locking.
 
diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py
index 8141cf92c568f257a5e9810318182d71f445dfa1..5b20e36a693b2ae283ffe4cefa2210c0cb61dcfc 100644
--- a/tensorflow/python/platform/googletest.py
+++ b/tensorflow/python/platform/googletest.py
@@ -104,10 +104,13 @@ def GetTempDir():
   """Return a temporary directory for tests to use."""
   global _googletest_temp_dir
   if not _googletest_temp_dir:
-    first_frame = tf_inspect.stack()[-1][0]
-    temp_dir = os.path.join(tempfile.gettempdir(),
-                            os.path.basename(tf_inspect.getfile(first_frame)))
-    temp_dir = tempfile.mkdtemp(prefix=temp_dir.rstrip('.py'))
+    if os.environ.get('TEST_TMPDIR'):
+      temp_dir = tempfile.mkdtemp(prefix=os.environ['TEST_TMPDIR'])
+    else:
+      first_frame = tf_inspect.stack()[-1][0]
+      temp_dir = os.path.join(tempfile.gettempdir(),
+                              os.path.basename(tf_inspect.getfile(first_frame)))
+      temp_dir = tempfile.mkdtemp(prefix=temp_dir.rstrip('.py'))
 
     def delete_temp_dir(dirname=temp_dir):
       try:
@@ -139,7 +142,7 @@ def StatefulSessionAvailable():
   return False
 
 
-@tf_export('test.StubOutForTesting')
+@tf_export(v1=['test.StubOutForTesting'])
 class StubOutForTesting(object):
   """Support class for stubbing methods out for unit testing.
 
diff --git a/tensorflow/python/platform/resource_loader.py b/tensorflow/python/platform/resource_loader.py
index b2d95518552de3a170d1c04bfc3f061dc8f8f54a..8f4c5c190ccaa5a06beaf89430c33ad935c1df9d 100644
--- a/tensorflow/python/platform/resource_loader.py
+++ b/tensorflow/python/platform/resource_loader.py
@@ -24,7 +24,7 @@ from tensorflow.python.util import tf_inspect as _inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('resource_loader.load_resource')
+@tf_export(v1=['resource_loader.load_resource'])
 def load_resource(path):
   """Load the resource at given path, where path is relative to tensorflow/.
 
@@ -46,7 +46,7 @@ def load_resource(path):
 
 
 # pylint: disable=protected-access
-@tf_export('resource_loader.get_data_files_path')
+@tf_export(v1=['resource_loader.get_data_files_path'])
 def get_data_files_path():
   """Get a direct path to the data files colocated with the script.
 
@@ -57,7 +57,7 @@ def get_data_files_path():
   return _os.path.dirname(_inspect.getfile(_sys._getframe(1)))
 
 
-@tf_export('resource_loader.get_root_dir_with_all_resources')
+@tf_export(v1=['resource_loader.get_root_dir_with_all_resources'])
 def get_root_dir_with_all_resources():
   """Get a root directory containing all the data attributes in the build rule.
 
@@ -97,7 +97,7 @@ def get_root_dir_with_all_resources():
   return data_files_dir or script_dir
 
 
-@tf_export('resource_loader.get_path_to_datafile')
+@tf_export(v1=['resource_loader.get_path_to_datafile'])
 def get_path_to_datafile(path):
   """Get the path to the specified file in the data dependencies.
 
@@ -117,7 +117,7 @@ def get_path_to_datafile(path):
   return _os.path.join(data_files_path, path)
 
 
-@tf_export('resource_loader.readahead_file_path')
+@tf_export(v1=['resource_loader.readahead_file_path'])
 def readahead_file_path(path, readahead='128M'):  # pylint: disable=unused-argument
   """Readahead files not implemented; simply returns given path."""
   return path
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index 5dc4037d62b478648baf2d57838c85aeda6cc738..943832af7a2c58d40cb2143048ddd6517596e406 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -46,9 +46,9 @@ from tensorflow.python.util.tf_export import tf_export
 if sys.version_info.major == 2:
   import mock                # pylint: disable=g-import-not-at-top,unused-import
 else:
-  from unittest import mock  # pylint: disable=g-import-not-at-top
+  from unittest import mock  # pylint: disable=g-import-not-at-top,g-importing-member
 
-tf_export('test.mock')(mock)
+tf_export(v1=['test.mock'])(mock)
 
 # Import Benchmark class
 Benchmark = _googletest.Benchmark  # pylint: disable=invalid-name
@@ -64,7 +64,7 @@ def main(argv=None):
   return _googletest.main(argv)
 
 
-@tf_export('test.get_temp_dir')
+@tf_export(v1=['test.get_temp_dir'])
 def get_temp_dir():
   """Returns a temporary directory for use during tests.
 
@@ -76,7 +76,7 @@ def get_temp_dir():
   return _googletest.GetTempDir()
 
 
-@tf_export('test.test_src_dir_path')
+@tf_export(v1=['test.test_src_dir_path'])
 def test_src_dir_path(relative_path):
   """Creates an absolute test srcdir path given a relative path.
 
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 59e60856ae80db76caa7ecd23db0db597bf60c6f..813bcb89beac01de97f53f9cb9ff97119f552a09 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -37,7 +37,7 @@ import six
 
 from tensorflow.python.util.tf_export import tf_export
 
-# Don't use this directly. Use _get_logger() instead.
+# Don't use this directly. Use get_logger() instead.
 _logger = None
 _logger_lock = threading.Lock()
 
@@ -78,7 +78,8 @@ else:
       return '(unknown file)', 0, '(unknown function)'
 
 
-def _get_logger():
+@tf_export('get_logger')
+def get_logger():
   """Return TF logger instance."""
   global _logger
 
@@ -130,39 +131,39 @@ def _get_logger():
     _logger_lock.release()
 
 
-@tf_export('logging.log')
+@tf_export(v1=['logging.log'])
 def log(level, msg, *args, **kwargs):
-  _get_logger().log(level, msg, *args, **kwargs)
+  get_logger().log(level, msg, *args, **kwargs)
 
 
-@tf_export('logging.debug')
+@tf_export(v1=['logging.debug'])
 def debug(msg, *args, **kwargs):
-  _get_logger().debug(msg, *args, **kwargs)
+  get_logger().debug(msg, *args, **kwargs)
 
 
-@tf_export('logging.error')
+@tf_export(v1=['logging.error'])
 def error(msg, *args, **kwargs):
-  _get_logger().error(msg, *args, **kwargs)
+  get_logger().error(msg, *args, **kwargs)
 
 
-@tf_export('logging.fatal')
+@tf_export(v1=['logging.fatal'])
 def fatal(msg, *args, **kwargs):
-  _get_logger().fatal(msg, *args, **kwargs)
+  get_logger().fatal(msg, *args, **kwargs)
 
 
-@tf_export('logging.info')
+@tf_export(v1=['logging.info'])
 def info(msg, *args, **kwargs):
-  _get_logger().info(msg, *args, **kwargs)
+  get_logger().info(msg, *args, **kwargs)
 
 
-@tf_export('logging.warn')
+@tf_export(v1=['logging.warn'])
 def warn(msg, *args, **kwargs):
-  _get_logger().warn(msg, *args, **kwargs)
+  get_logger().warn(msg, *args, **kwargs)
 
 
-@tf_export('logging.warning')
+@tf_export(v1=['logging.warning'])
 def warning(msg, *args, **kwargs):
-  _get_logger().warning(msg, *args, **kwargs)
+  get_logger().warning(msg, *args, **kwargs)
 
 
 _level_names = {
@@ -183,20 +184,20 @@ _log_prefix = None  # later set to google2_log_prefix
 _log_counter_per_token = {}
 
 
-@tf_export('logging.TaskLevelStatusMessage')
+@tf_export(v1=['logging.TaskLevelStatusMessage'])
 def TaskLevelStatusMessage(msg):
   error(msg)
 
 
-@tf_export('logging.flush')
+@tf_export(v1=['logging.flush'])
 def flush():
   raise NotImplementedError()
 
 
 # Code below is taken from pyglib/logging
-@tf_export('logging.vlog')
+@tf_export(v1=['logging.vlog'])
 def vlog(level, msg, *args, **kwargs):
-  _get_logger().log(level, msg, *args, **kwargs)
+  get_logger().log(level, msg, *args, **kwargs)
 
 
 def _GetNextLogCountPerToken(token):
@@ -214,7 +215,7 @@ def _GetNextLogCountPerToken(token):
   return _log_counter_per_token[token]
 
 
-@tf_export('logging.log_every_n')
+@tf_export(v1=['logging.log_every_n'])
 def log_every_n(level, msg, n, *args):
   """Log 'msg % args' at level 'level' once per 'n' times.
 
@@ -231,7 +232,7 @@ def log_every_n(level, msg, n, *args):
   log_if(level, msg, not (count % n), *args)
 
 
-@tf_export('logging.log_first_n')
+@tf_export(v1=['logging.log_first_n'])
 def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   """Log 'msg % args' at level 'level' only first 'n' times.
 
@@ -247,7 +248,7 @@ def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   log_if(level, msg, count < n, *args)
 
 
-@tf_export('logging.log_if')
+@tf_export(v1=['logging.log_if'])
 def log_if(level, msg, condition, *args):
   """Log 'msg % args' at level 'level' only if condition is fulfilled."""
   if condition:
@@ -296,16 +297,16 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None):
   return s
 
 
-@tf_export('logging.get_verbosity')
+@tf_export(v1=['logging.get_verbosity'])
 def get_verbosity():
   """Return how much logging output will be produced."""
-  return _get_logger().getEffectiveLevel()
+  return get_logger().getEffectiveLevel()
 
 
-@tf_export('logging.set_verbosity')
+@tf_export(v1=['logging.set_verbosity'])
 def set_verbosity(v):
   """Sets the threshold for what messages will be logged."""
-  _get_logger().setLevel(v)
+  get_logger().setLevel(v)
 
 
 def _get_thread_id():
@@ -318,8 +319,8 @@ def _get_thread_id():
 
 _log_prefix = google2_log_prefix
 
-tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG')
-tf_export('logging.ERROR').export_constant(__name__, 'ERROR')
-tf_export('logging.FATAL').export_constant(__name__, 'FATAL')
-tf_export('logging.INFO').export_constant(__name__, 'INFO')
-tf_export('logging.WARN').export_constant(__name__, 'WARN')
+tf_export(v1=['logging.DEBUG']).export_constant(__name__, 'DEBUG')
+tf_export(v1=['logging.ERROR']).export_constant(__name__, 'ERROR')
+tf_export(v1=['logging.FATAL']).export_constant(__name__, 'FATAL')
+tf_export(v1=['logging.INFO']).export_constant(__name__, 'INFO')
+tf_export(v1=['logging.WARN']).export_constant(__name__, 'WARN')
diff --git a/tensorflow/python/profiler/internal/flops_registry.py b/tensorflow/python/profiler/internal/flops_registry.py
index 147711b1d9b864c195f17b50eb3e7bc37ee1ecd0..d859c4853d4737930ed4a58ed988f650e71e7780 100644
--- a/tensorflow/python/profiler/internal/flops_registry.py
+++ b/tensorflow/python/profiler/internal/flops_registry.py
@@ -407,7 +407,7 @@ def _conv_2d_backprop_input_flops(graph, node):
   return ops.OpStats("flops",
                      (2 * out_shape.num_elements()
                       * kernel_shape.num_elements()
-                      / (out_shape[-1].value * strides_product)))
+                      / (out_shape.dims[-1].value * strides_product)))
 
 
 @ops.RegisterStatistics("Conv2DBackpropFilter", "flops")
@@ -430,7 +430,7 @@ def _conv_2d_backprop_filter_flops(graph, node):
   return ops.OpStats("flops",
                      (2 * image_shape.num_elements()
                       * kernel_shape.num_elements()
-                      / (image_shape[-1].value * strides_product)))
+                      / (image_shape.dims[-1].value * strides_product)))
 
 ################################################################################
 # Other ops
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index 216cc3dd54b7851810baee23be91e321ede06b42..f96d721f46e162ee6753377569aacb439cd591d5 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -26,6 +26,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -154,6 +155,7 @@ class RunMetadataTest(test.TestCase):
     # deallocates the memory after matmul started.
     self.assertGreater(random_allocs[1].alloc_micros, mm.all_start_micros)
 
+  @test_util.run_deprecated_v1
   def testCPU(self):
     ops.reset_default_graph()
     with ops.device('/cpu:0'):
@@ -167,6 +169,7 @@ class RunMetadataTest(test.TestCase):
     ret = _extract_node(run_meta, 'MatMul:MatMul')
     self.assertEqual(len(ret), 0)
 
+  @test_util.run_v1_only('b/120545219')
   def testLoopCPU(self):
     ops.reset_default_graph()
     with ops.device('/cpu:0'):
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
index acf02096fffe8b38e68824878fa698ed69d3895c..4b2d9052b7879ceaf4a250ba56f438f3798b669b 100644
--- a/tensorflow/python/profiler/model_analyzer.py
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -122,7 +122,7 @@ def _build_advisor_options(options):
   return opts
 
 
-@tf_export('profiler.Profiler')
+@tf_export(v1=['profiler.Profiler'])
 class Profiler(object):
   """TensorFlow multi-step profiler.
 
@@ -306,7 +306,7 @@ class Profiler(object):
     print_mdl.WriteProfile(filename)
 
 
-@tf_export('profiler.profile')
+@tf_export(v1=['profiler.profile'])
 def profile(graph=None,
             run_meta=None,
             op_log=None,
@@ -381,7 +381,7 @@ def profile(graph=None,
   return tfprof_node
 
 
-@tf_export('profiler.advise')
+@tf_export(v1=['profiler.advise'])
 def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
   """Auto profile and advise.
 
@@ -398,7 +398,7 @@ def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
   Returns:
     Returns AdviceProto proto
   """
-  if not graph and context.in_eager_execution():
+  if not graph and not context.executing_eagerly():
     graph = ops.get_default_graph()
 
   if options == _DEFAULT_ADVISE_OPTIONS:
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 94c685274a764bb099da6c0501b397d73d239f35..1c7c15be4fe5920ff06241175aff57bc52ac338e 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -76,6 +76,7 @@ class PrintModelAnalysisTest(test.TestCase):
                          '  ScalarW (1, 1/1 params)\n',
                          lib.CheckAndRemoveDoc(f.read()))
 
+  @test_util.run_v1_only('b/120545219')
   def testSelectEverythingDetail(self):
     ops.reset_default_graph()
     dev = '/device:GPU:0' if test.is_gpu_available() else '/device:CPU:0'
@@ -93,10 +94,10 @@ class PrintModelAnalysisTest(test.TestCase):
           config=self._no_rewrite_session_config()) as sess, ops.device(dev):
         x = lib.BuildSmallModel()
 
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         pctx.trace_next_step()
         pctx.dump_next_step()
-        _ = sess.run(x)
+        _ = self.evaluate(x)
 
         pctx.profiler.profile_name_scope(options=opts)
 
@@ -160,7 +161,7 @@ class PrintModelAnalysisTest(test.TestCase):
                         ) as sess, ops.device('/device:CPU:0'):
       x = lib.BuildSmallModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -186,7 +187,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -203,6 +204,7 @@ class PrintModelAnalysisTest(test.TestCase):
             lib.CheckAndRemoveDoc(f.read())[0:80])
         # pylint: enable=line-too-long
 
+  @test_util.run_v1_only('b/120545219')
   def testComplexCodeView(self):
     ops.reset_default_graph()
     outfile = os.path.join(test.get_temp_dir(), 'dump')
@@ -220,9 +222,9 @@ class PrintModelAnalysisTest(test.TestCase):
       with session.Session(config=self._no_rewrite_session_config()) as sess:
         x = lib.BuildFullModel()
 
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         pctx.trace_next_step()
-        _ = sess.run(x)
+        _ = self.evaluate(x)
         tfprof_node = pctx.profiler.profile_python(options=opts)
 
         # pylint: disable=line-too-long
@@ -281,7 +283,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -309,7 +311,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(
           x,
@@ -345,7 +347,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -391,7 +393,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(
           x,
@@ -424,7 +426,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(
           x,
@@ -490,7 +492,7 @@ class PrintModelAnalysisTest(test.TestCase):
 
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -555,7 +557,7 @@ class PrintModelAnalysisTest(test.TestCase):
 
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -587,10 +589,10 @@ class PrintModelAnalysisTest(test.TestCase):
   def _trainLoop(self, train_op, train_steps, time_dir, time_step,
                  memory_dir, memory_step, profile_dir, dump_step):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       # start from 1 because variable_initializer took one step.
       for i in range(1, train_steps + 1):
-        _ = sess.run(train_op)
+        _ = self.evaluate(train_op)
         if i in time_step:
           ret = gfile.ListDirectory(time_dir)
           self.assertEqual(len(ret), 1)
@@ -619,6 +621,7 @@ class PrintModelAnalysisTest(test.TestCase):
           else:
             self.assertEqual(len(gfile.ListDirectory(profile_dir)), 0)
 
+  @test_util.run_v1_only('b/120545219')
   def testAutoProfiling(self):
     ops.reset_default_graph()
     time_dir = os.path.join(test.get_temp_dir(), 'time')
@@ -706,6 +709,7 @@ class PrintModelAnalysisTest(test.TestCase):
                       exception_str)
       self.assertTrue(mat is None)
 
+  @test_util.run_v1_only('b/120545219')
   def testTrackPersistentBytes(self):
     ops.reset_default_graph()
     a = array_ops.constant(np.ones((100, 100)))
diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
index 2ad7adf76933df65ca795dca361397f436adb995..9d8f7683a658e74c649d9ea337e7dbc10f870ef2 100644
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@@ -23,7 +23,7 @@ from tensorflow.python.profiler import tfprof_logger
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('profiler.ProfileOptionBuilder')
+@tf_export(v1=['profiler.ProfileOptionBuilder'])
 class ProfileOptionBuilder(object):
   # pylint: disable=line-too-long
   """Option Builder for Profiling API.
diff --git a/tensorflow/python/profiler/pprof_profiler_test.py b/tensorflow/python/profiler/pprof_profiler_test.py
index 11a3487360c1396f86e150bfba47357a6c28a5fd..3f5bd9e79be2254779e4b64507ef91baec3db49c 100644
--- a/tensorflow/python/profiler/pprof_profiler_test.py
+++ b/tensorflow/python/profiler/pprof_profiler_test.py
@@ -24,6 +24,7 @@ from proto import profile_pb2
 from tensorflow.core.framework import step_stats_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -135,6 +136,7 @@ comment: 9
       profile.ParseFromString(profile_contents)
       self.assertEquals(expected_proto, str(profile))
 
+  @test_util.run_v1_only('b/120545219')
   def testProfileWithWhileLoop(self):
     options = config_pb2.RunOptions()
     options.trace_level = config_pb2.RunOptions.FULL_TRACE
diff --git a/tensorflow/python/profiler/profile_context_test.py b/tensorflow/python/profiler/profile_context_test.py
index 107ad443c32e20ab69f3c2fb71c652d97a9c0cc6..885f08ca4b9c049aa78d0d8a202cca48aa813bce 100644
--- a/tensorflow/python/profiler/profile_context_test.py
+++ b/tensorflow/python/profiler/profile_context_test.py
@@ -21,6 +21,7 @@ import os
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -35,6 +36,7 @@ builder = option_builder.ProfileOptionBuilder
 
 class ProfilerContextTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasics(self):
     ops.reset_default_graph()
     outfile = os.path.join(test.get_temp_dir(), "dump")
@@ -48,10 +50,10 @@ class ProfilerContextTest(test.TestCase):
     with profile_context.ProfileContext(test.get_temp_dir()) as pctx:
       pctx.add_auto_profiling("op", options=opts, profile_steps=[15, 50, 100])
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         total_steps = 101
         for i in range(total_steps):
-          sess.run(x)
+          self.evaluate(x)
           if i == 14 or i == 49:
             self.assertTrue(gfile.Exists(outfile))
             gfile.Remove(outfile)
@@ -69,45 +71,47 @@ class ProfilerContextTest(test.TestCase):
       with gfile.Open(outfile, "r") as f:
         self.assertEqual(profile_str, f.read())
 
+  @test_util.run_deprecated_v1
   def testAutoTracingInDeubMode(self):
     ops.reset_default_graph()
     x = lib.BuildFullModel()
 
     with profile_context.ProfileContext(test.get_temp_dir(), debug=True):
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
-          sess.run(x)
+          self.evaluate(x)
           for f in gfile.ListDirectory(test.get_temp_dir()):
             # Warm up, no tracing.
             self.assertFalse("run_meta" in f)
-        sess.run(x)
+        self.evaluate(x)
         self.assertTrue(
             gfile.Exists(os.path.join(test.get_temp_dir(), "run_meta_11")))
         gfile.Remove(os.path.join(test.get_temp_dir(), "run_meta_11"))
         # fetched already.
-        sess.run(x)
+        self.evaluate(x)
         for f in gfile.ListDirectory(test.get_temp_dir()):
           self.assertFalse("run_meta" in f)
 
+  @test_util.run_deprecated_v1
   def testDisabled(self):
     ops.reset_default_graph()
     x = lib.BuildFullModel()
     with profile_context.ProfileContext(test.get_temp_dir(),
                                         enabled=False) as pctx:
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
-          sess.run(x)
+          self.evaluate(x)
       self.assertTrue(pctx.profiler is None)
       self.assertTrue(
           getattr(session.BaseSession, "profile_context", None) is None)
 
     with profile_context.ProfileContext(test.get_temp_dir()) as pctx:
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
-          sess.run(x)
+          self.evaluate(x)
       self.assertFalse(pctx.profiler is None)
       self.assertFalse(
           getattr(session.BaseSession, "profile_context", None) is None)
diff --git a/tensorflow/python/profiler/profiler.py b/tensorflow/python/profiler/profiler.py
index efbdd1ba6842d85e82149346e9b4559527a1aacd..5f62690b54e2ff6e2c655eb5256299cce169f59a 100644
--- a/tensorflow/python/profiler/profiler.py
+++ b/tensorflow/python/profiler/profiler.py
@@ -49,7 +49,7 @@ _allowed_symbols.extend([
 ])
 
 # Export protos
-tf_export('profiler.GraphNodeProto')(GraphNodeProto)
-tf_export('profiler.MultiGraphNodeProto')(MultiGraphNodeProto)
-tf_export('profiler.AdviceProto')(AdviceProto)
-tf_export('profiler.OpLogProto')(OpLogProto)
+tf_export(v1=['profiler.GraphNodeProto'])(GraphNodeProto)
+tf_export(v1=['profiler.MultiGraphNodeProto'])(MultiGraphNodeProto)
+tf_export(v1=['profiler.AdviceProto'])(AdviceProto)
+tf_export(v1=['profiler.OpLogProto'])(OpLogProto)
diff --git a/tensorflow/python/profiler/profiler_test.py b/tensorflow/python/profiler/profiler_test.py
index eacb7d21e6aeb1c803165762e5f8f40e23247f64..e4f7361e5d711b58c5786a9e43e9d459c43dff4b 100644
--- a/tensorflow/python/profiler/profiler_test.py
+++ b/tensorflow/python/profiler/profiler_test.py
@@ -21,6 +21,7 @@ import os
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -35,6 +36,7 @@ builder = option_builder.ProfileOptionBuilder
 
 class ProfilerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testProfileBasic(self):
     ops.reset_default_graph()
     outfile = os.path.join(test.get_temp_dir(), 'dump')
@@ -171,6 +173,7 @@ class ProfilerTest(test.TestCase):
       checker = advice_pb.checkers['ExpensiveOperationChecker']
       self.assertGreater(len(checker.reports), 0)
 
+  @test_util.run_deprecated_v1
   def testMultipleProfilePerStep(self):
     ops.reset_default_graph()
     opts = (builder(builder.trainable_variables_parameter())
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
index e651de32ea3bce32a965bfbeefc76ff08a79ac38..6ccd0e0ff3b5f9f067f49b7a1b64e62af7c7af5d 100644
--- a/tensorflow/python/profiler/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -188,7 +188,7 @@ def merge_default_with_oplog(graph, op_log=None, run_meta=None,
   return tmp_op_log
 
 
-@tf_export('profiler.write_op_log')
+@tf_export(v1=['profiler.write_op_log'])
 def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
   """Log provided 'op_log', and add additional model information below.
 
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 61e0abbfcbf612e3a7104bc014f531fb6925538e..adbce95c6f9f54909bbca2fdd3e31142bb2e6bc9 100755
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -209,6 +209,7 @@ limitations under the License.
     SWIG_fail;
   } else {
     int num_outputs = $1->size();
+    Py_CLEAR($result);
     $result = PyList_New(num_outputs);
     for (int i = 0; i < num_outputs; ++i) {
       PyObject *output;
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index c9bc33e21824232df33c267d7b2baa824f5db0a3..53d0640542f257bff707047cd405a0dad5055449 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -12,6 +12,8 @@ licenses(["notice"])  # Apache 2.0
 exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 
 py_library(
     name = "saved_model",
@@ -21,8 +23,10 @@ py_library(
     deps = [
         ":builder",
         ":constants",
+        ":load",
         ":loader",
         ":main_op",
+        ":save",
         ":signature_constants",
         ":signature_def_utils",
         ":simple_save",
@@ -82,12 +86,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":constants",
+        ":signature_def_utils",
         ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
-        "//tensorflow/python:training",
+        "//tensorflow/python:saver",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
@@ -113,6 +118,7 @@ py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -165,14 +171,15 @@ py_test(
         ":signature_def_utils",
         ":tag_constants",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:saver_test_utils",
+        "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:test_ops",
         "//tensorflow/python:training",
@@ -263,5 +270,85 @@ py_test(
     ],
 )
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
+tf_proto_library(
+    name = "saved_object_graph",
+    srcs = ["saved_object_graph.proto"],
+    cc_api_version = 2,
+    protodeps = tf_additional_all_protos(),
+    visibility = ["//tensorflow:internal"],
+)
+
+py_library(
+    name = "save",
+    srcs = [
+        "save.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":builder",
+        ":constants",
+        ":loader",
+        ":saved_object_graph_py",
+        ":signature_constants",
+        ":signature_def_utils",
+        ":tag_constants",
+        ":utils",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/checkpointable:util",
+    ],
+)
+
+py_test(
+    name = "save_test",
+    srcs = ["save_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":loader",
+        ":save",
+        ":signature_constants",
+        ":tag_constants",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_library(
+    name = "load",
+    srcs = [
+        "load.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":loader",
+        ":saved_object_graph_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
+        "//tensorflow/python/training/checkpointable:tracking",
+    ],
+)
+
+py_test(
+    name = "load_test",
+    srcs = ["load_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":load",
+        ":save",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/checkpointable:tracking",
+    ],
+)
diff --git a/tensorflow/python/saved_model/builder.py b/tensorflow/python/saved_model/builder.py
index be49c70c60476ae8b95c07007abb32a222466958..b929934eebb14a340d89fbb570a322b2b7144154 100644
--- a/tensorflow/python/saved_model/builder.py
+++ b/tensorflow/python/saved_model/builder.py
@@ -24,5 +24,6 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
+from tensorflow.python.saved_model.builder_impl import _SavedModelBuilder
 from tensorflow.python.saved_model.builder_impl import SavedModelBuilder
 # pylint: enable=unused-import
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 8bf057f69db3352cd3c946bb1b31f4460f94ddf9..f37d283a2a2cbb50faf62f1ae24cd69bd0f29d74 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
 
 from google.protobuf.any_pb2 import Any
@@ -32,18 +33,17 @@ from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated_args
-from tensorflow.python.util.deprecation import deprecated_endpoints
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("saved_model.Builder",
-           "saved_model.builder.SavedModelBuilder")
-@deprecated_endpoints("saved_model.builder.SavedModelBuilder")
-class SavedModelBuilder(object):
+# Base class for the SavedModelBuilder that is only used by Tensorflow
+# internally. Please use tf.compat.v1.saved_model.SavedModelBuilder instead.
+class _SavedModelBuilder(object):
   """Builds the `SavedModel` protocol buffer and saves variables and assets.
 
   The `SavedModelBuilder` class provides functionality to build a `SavedModel`
@@ -71,7 +71,7 @@ class SavedModelBuilder(object):
     builder.add_meta_graph_and_variables(sess,
                                     ["foo-tag"],
                                     signature_def_map=foo_signatures,
-                                    assets_collection=foo_assets)
+                                    assets_list=foo_assets)
   ...
 
   with tf.Session(graph=tf.Graph()) as sess:
@@ -81,6 +81,11 @@ class SavedModelBuilder(object):
 
   builder.save()
   ```
+
+  Note: This function will only be available through the v1 compatibility
+  library as tf.compat.v1.saved_model.builder.SavedModelBuilder or
+  tf.compat.v1.saved_model.Builder. Tensorflow 2.0 will introduce a new
+  object-based method of creating SavedModels.
   """
 
   def __init__(self, export_dir):
@@ -103,82 +108,24 @@ class SavedModelBuilder(object):
     # weights.
     self._has_saved_variables = False
 
-  def _save_and_write_assets(self, assets_collection_to_add=None):
+  def _save_and_write_assets(self, meta_graph_def, assets_list=None):
     """Saves asset to the meta graph and writes asset files to disk.
 
     Args:
-      assets_collection_to_add: The collection where the asset paths are setup.
+      meta_graph_def: The meta graph def to which the assets will be added.
+      assets_list: The list where the asset paths are setup.
     """
-    asset_filename_map = _maybe_save_assets(assets_collection_to_add)
+    # Creates a function that adds assets into the meta graph def.
+    write_fn = functools.partial(_add_asset_to_metagraph, meta_graph_def)
+    asset_filename_map = _maybe_save_assets(write_fn, assets_list)
 
     # Return if there are no assets to write.
     if not asset_filename_map:
       tf_logging.info("No assets to write.")
       return
 
-    assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
-        self._export_dir)
-
-    # Copy each asset from source path to destination path.
-    for asset_basename, asset_source_filepath in asset_filename_map.items():
-      asset_destination_filepath = os.path.join(
-          compat.as_bytes(assets_destination_dir),
-          compat.as_bytes(asset_basename))
-
-      # Only copy the asset file to the destination if it does not already
-      # exist. This is to ensure that an asset with the same name defined as
-      # part of multiple graphs is only copied the first time.
-      if not file_io.file_exists(asset_destination_filepath):
-        file_io.copy(asset_source_filepath, asset_destination_filepath)
-
-    tf_logging.info("Assets written to: %s",
-                    compat.as_text(assets_destination_dir))
-
-  def _maybe_add_main_op(self, main_op):
-    """Adds main op to the SavedModel.
-
-    Args:
-      main_op: Main op to run as part of graph initialization. If None, no
-        main op will be added to the graph.
-
-    Raises:
-      TypeError: if main op is provided but is not of type `Operation`.
-      ValueError: if the Graph already contains an init op.
-    """
-    if main_op is None:
-      return
-
-    if not isinstance(main_op, ops.Operation):
-      raise TypeError("main_op needs to be an Operation: %r" % main_op)
-
-    # Validate that no other init ops have been added to this graph already.
-    # We check main_op and legacy_init_op for thoroughness and explicitness.
-    for init_op_key in (constants.MAIN_OP_KEY, constants.LEGACY_INIT_OP_KEY):
-      if ops.get_collection(init_op_key):
-        raise ValueError(
-            "Graph already contains one or more main ops under the "
-            "collection {}.".format(init_op_key))
-
-    ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
-
-  def _add_train_op(self, train_op):
-    """Add train op to the SavedModel.
-
-    Note that this functionality is in development, and liable to be
-    moved elsewhere.
-
-    Args:
-      train_op: Op or group of ops that are used for training. These are
-        stored as a collection with key TRAIN_OP_KEY, but not executed.
-
-    Raises:
-      TypeError if Train op is not of type `Operation`.
-    """
-    if train_op is not None:
-      if (not isinstance(train_op, ops.Tensor) and
-          not isinstance(train_op, ops.Operation)):
-        raise TypeError("train_op needs to be a Tensor or Op: %r" % train_op)
-      ops.add_to_collection(constants.TRAIN_OP_KEY, train_op)
+    # Copy assets from source path to destination path.
+    copy_assets_to_destination_dir(asset_filename_map, self._export_dir)
 
   def _tag_and_add_meta_graph(self, meta_graph_def, tags, signature_def_map):
     """Tags the meta graph def and adds it to the SavedModel.
@@ -235,30 +182,32 @@ class SavedModelBuilder(object):
 
     Validation of entries in the signature def map includes ensuring that the
     `name` and `dtype` fields of the TensorInfo protos of the `inputs` and
-    `outputs` of each `SignatureDef` are populated.
+    `outputs` of each `SignatureDef` are populated. Also ensures that reserved
+    SigantureDef keys for the initialization and train ops are not used.
 
     Args:
       signature_def_map: The map of signature defs to be validated.
-    """
-    if signature_def_map is not None:
-      for signature_def_key in signature_def_map:
-        signature_def = signature_def_map[signature_def_key]
-        inputs = signature_def.inputs
-        outputs = signature_def.outputs
-        for inputs_key in inputs:
-          self._validate_tensor_info(inputs[inputs_key])
-        for outputs_key in outputs:
-          self._validate_tensor_info(outputs[outputs_key])
-
-  def _add_collections(
-      self, assets_collection, main_op, train_op):
-    """Add asset and op collections to be saved."""
-    # Save asset files and write them to disk, if any.
-    self._save_and_write_assets(assets_collection)
 
-    self._maybe_add_main_op(main_op)
-
-    self._add_train_op(train_op)
+    Raises:
+      AssertionError: If a TensorInfo is not valid.
+      KeyError: If a reserved signature key is used in the map.
+    """
+    for signature_def_key in signature_def_map:
+      signature_def = signature_def_map[signature_def_key]
+      inputs = signature_def.inputs
+      outputs = signature_def.outputs
+      for inputs_key in inputs:
+        self._validate_tensor_info(inputs[inputs_key])
+      for outputs_key in outputs:
+        self._validate_tensor_info(outputs[outputs_key])
+    if constants.INIT_OP_SIGNATURE_KEY in signature_def_map:
+      raise KeyError(
+          "SignatureDef map key \"{}\" is reserved for initialization. Please "
+          "use a different key.".format(constants.INIT_OP_SIGNATURE_KEY))
+    if constants.TRAIN_OP_SIGNATURE_KEY in signature_def_map:
+      raise KeyError(
+          "SignatureDef map key \"{}\" is reserved for the train op. Please "
+          "use a different key.".format(constants.TRAIN_OP_SIGNATURE_KEY))
 
   def _maybe_create_saver(self, saver=None):
     """Creates a sharded saver if one does not already exist."""
@@ -272,19 +221,14 @@ class SavedModelBuilder(object):
           allow_empty=True)
     return saver
 
-  @deprecated_args(None,
-                   "Pass your op to the equivalent parameter main_op instead.",
-                   "legacy_init_op")
   def add_meta_graph(self,
                      tags,
                      signature_def_map=None,
-                     assets_collection=None,
-                     legacy_init_op=None,
+                     assets_list=None,
                      clear_devices=False,
-                     main_op=None,
-                     strip_default_attrs=False,
+                     init_op=None,
+                     train_op=None,
                      saver=None):
-    # pylint: disable=line-too-long
     """Adds the current meta graph to the SavedModel.
 
     Creates a Saver in the current scope and uses the Saver to export the meta
@@ -295,19 +239,17 @@ class SavedModelBuilder(object):
       tags: The set of tags to annotate the meta graph def with.
       signature_def_map: The map of signature defs to be added to the meta graph
           def.
-      assets_collection: Assets collection to be saved with SavedModel. Note
-          that this collection should be a subset of the assets saved as part of
+      assets_list: Assets to be saved with SavedModel. Note
+          that this list should be a subset of the assets saved as part of
           the first meta graph in the SavedModel.
-      legacy_init_op: Legacy support for op or group of ops to execute after the
-          restore op upon a load. Deprecated; please use main_op instead.
       clear_devices: Set to true if the device info on the default graph should
           be cleared.
-      main_op: Op or group of ops to execute when the graph is loaded. Note
-          that when the main_op is specified it is run after the restore op at
+      init_op: Op or group of ops to execute when the graph is loaded. Note
+          that when the init_op is specified it is run after the restore op at
           load-time.
-      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-        removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      train_op: Op or group of opts that trains the model when run. This will
+        not be run automatically when the graph is loaded, instead saved in
+        a SignatureDef accessible through the exported MetaGraph.
       saver: An instance of tf.train.Saver that will be used to export the
         metagraph. If None, a sharded Saver that restores all variables will
         be used.
@@ -316,7 +258,6 @@ class SavedModelBuilder(object):
       AssertionError: If the variables for the SavedModel have not been saved
           yet, or if the graph already contains one or more legacy init ops.
     """
-    # pylint: enable=line-too-long
     if not self._has_saved_variables:
       raise AssertionError(
           "Graph state including variables and assets has not been saved yet. "
@@ -324,14 +265,15 @@ class SavedModelBuilder(object):
 
     # Validate the signature def map to ensure all included TensorInfos are
     # properly populated.
+    signature_def_map = signature_def_map or {}
     self._validate_signature_def_map(signature_def_map)
 
-    # legacy_init_op is deprecated, and going away in TF 2.0.
-    # Re-mapping to main_op, as treatment is identical regardless.
-    main_op = main_op or legacy_init_op
-
-    # Add assets and ops
-    self._add_collections(assets_collection, main_op, None)
+    # Create a SignatureDef pointing to the graph initialization op, which will
+    # be added to the MetaGraphDef.
+    _add_op_to_signature_def_map(signature_def_map, init_op,
+                                 constants.INIT_OP_SIGNATURE_KEY)
+    _add_op_to_signature_def_map(signature_def_map, train_op,
+                                 constants.TRAIN_OP_SIGNATURE_KEY)
 
     saver = self._maybe_create_saver(saver)
 
@@ -343,22 +285,22 @@ class SavedModelBuilder(object):
     # resolved, we just leave the option set to False for now.
     # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
     meta_graph_def = saver.export_meta_graph(
-        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
+        clear_devices=clear_devices, strip_default_attrs=True)
+
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(meta_graph_def, assets_list)
 
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
 
-  @deprecated_args(None,
-                   "Pass your op to the equivalent parameter main_op instead.",
-                   "legacy_init_op")
   def add_meta_graph_and_variables(self,
                                    sess,
                                    tags,
                                    signature_def_map=None,
-                                   assets_collection=None,
-                                   legacy_init_op=None,
+                                   assets_list=None,
                                    clear_devices=False,
-                                   main_op=None,
+                                   init_op=None,
+                                   train_op=None,
                                    strip_default_attrs=False,
                                    saver=None):
     # pylint: disable=line-too-long
@@ -376,14 +318,15 @@ class SavedModelBuilder(object):
       tags: The set of tags with which to save the meta graph.
       signature_def_map: The map of signature def map to add to the meta graph
         def.
-      assets_collection: Assets collection to be saved with SavedModel.
-      legacy_init_op: Legacy support for op or group of ops to execute after the
-          restore op upon a load. Deprecated; please use main_op instead.
+      assets_list: Assets to be saved with SavedModel.
       clear_devices: Set to true if the device info on the default graph should
           be cleared.
-      main_op: Op or group of ops to execute when the graph is loaded. Note
-          that when the main_op is specified it is run after the restore op at
+      init_op: Op or group of ops to execute when the graph is loaded. Note
+          that when the init_op is specified it is run after the restore op at
           load-time.
+      train_op: Op or group of ops that trains the model when run. This will
+        not be run automatically when the graph is loaded, instead saved in
+        a SignatureDef accessible through the exported MetaGraph.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
@@ -400,14 +343,15 @@ class SavedModelBuilder(object):
 
     # Validate the signature def map to ensure all included TensorInfos are
     # properly populated.
+    signature_def_map = signature_def_map or {}
     self._validate_signature_def_map(signature_def_map)
 
-    # legacy_init_op is deprecated, and going away in TF 2.0.
-    # Re-mapping to main_op, as treatment is identical regardless.
-    main_op = main_op or legacy_init_op
-
-    # Add assets and ops
-    self._add_collections(assets_collection, main_op, None)
+    # Create a SignatureDef pointing to the graph initialization op, which will
+    # be added to the MetaGraphDef.
+    _add_op_to_signature_def_map(signature_def_map, init_op,
+                                 constants.INIT_OP_SIGNATURE_KEY)
+    _add_op_to_signature_def_map(signature_def_map, train_op,
+                                 constants.TRAIN_OP_SIGNATURE_KEY)
 
     saved_model_utils.get_or_create_variables_dir(self._export_dir)
     variables_path = saved_model_utils.get_variables_path(self._export_dir)
@@ -432,6 +376,9 @@ class SavedModelBuilder(object):
     meta_graph_def = saver.export_meta_graph(
         clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
 
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(meta_graph_def, assets_list)
+
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
 
@@ -469,11 +416,205 @@ class SavedModelBuilder(object):
     return path
 
 
-def _maybe_save_assets(assets_collection_to_add=None):
+@tf_export(v1=["saved_model.Builder", "saved_model.builder.SavedModelBuilder"])  # pylint: disable=missing-docstring
+class SavedModelBuilder(_SavedModelBuilder):
+  __doc__ = _SavedModelBuilder.__doc__.replace("assets_list",
+                                               "assets_collection")
+
+  def __init__(self, export_dir):
+    super(SavedModelBuilder, self).__init__(export_dir=export_dir)
+
+  def _add_collections(self, assets_collection, main_op, train_op):
+    """Add asset and op collections to be saved."""
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(assets_collection)
+
+    self._maybe_add_main_op(main_op)
+
+    self._add_train_op(train_op)
+
+  def _save_and_write_assets(self, assets_collection_to_add=None):
+    """Saves asset to the meta graph and writes asset files to disk.
+
+    Args:
+      assets_collection_to_add: The collection where the asset paths are setup.
+    """
+    # Add assets to the collection with key `constants.ASSETS_KEY`, in the
+    # graph.
+    asset_filename_map = _maybe_save_assets(_add_asset_to_collection,
+                                            assets_collection_to_add)
+
+    # Return if there are no assets to write.
+    if not asset_filename_map:
+      tf_logging.info("No assets to write.")
+      return
+
+    # Copy assets from source path to destination path.
+    copy_assets_to_destination_dir(asset_filename_map, self._export_dir)
+
+  def _maybe_add_main_op(self, main_op):
+    """Adds main op to the SavedModel.
+
+    Args:
+      main_op: Main op to run as part of graph initialization. If None, no main
+        op will be added to the graph.
+
+    Raises:
+      TypeError: if main op is provided but is not of type `Operation`.
+      ValueError: if the Graph already contains an init op.
+    """
+    if main_op is None:
+      return
+
+    if not isinstance(main_op, ops.Operation):
+      raise TypeError("main_op needs to be an Operation: %r" % main_op)
+
+    # Validate that no other init ops have been added to this graph already.
+    # We check main_op and legacy_init_op for thoroughness and explicitness.
+    for init_op_key in (constants.MAIN_OP_KEY, constants.LEGACY_INIT_OP_KEY):
+      if ops.get_collection(init_op_key):
+        raise ValueError(
+            "Graph already contains one or more main ops under the "
+            "collection {}.".format(init_op_key))
+
+    ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
+
+  def _add_train_op(self, train_op):
+    """Add train op to the SavedModel.
+
+    Note that this functionality is in development, and liable to be
+    moved elsewhere.
+
+    Args:
+      train_op: Op or group of ops that are used for training. These are stored
+        as a collection with key TRAIN_OP_KEY, but not executed.
+
+    Raises:
+      TypeError if Train op is not of type `Operation`.
+    """
+    if train_op is not None:
+      if (not isinstance(train_op, ops.Tensor) and
+          not isinstance(train_op, ops.Operation)):
+        raise TypeError("train_op needs to be a Tensor or Op: %r" % train_op)
+      ops.add_to_collection(constants.TRAIN_OP_KEY, train_op)
+
+  @deprecated_args(None,
+                   "Pass your op to the equivalent parameter main_op instead.",
+                   "legacy_init_op")
+  def add_meta_graph(self,
+                     tags,
+                     signature_def_map=None,
+                     assets_collection=None,
+                     legacy_init_op=None,
+                     clear_devices=False,
+                     main_op=None,
+                     strip_default_attrs=False,
+                     saver=None):
+    if not self._has_saved_variables:
+      raise AssertionError(
+          "Graph state including variables and assets has not been saved yet. "
+          "Please invoke `add_meta_graph_and_variables()` first.")
+
+    # Validate the signature def map to ensure all included TensorInfos are
+    # properly populated.
+    signature_def_map = signature_def_map or {}
+    self._validate_signature_def_map(signature_def_map)
+
+    # legacy_init_op is deprecated, and going away in TF 2.0.
+    # Re-mapping to main_op, as treatment is identical regardless.
+    main_op = main_op or legacy_init_op
+
+    # Add assets and ops
+    self._add_collections(assets_collection, main_op, None)
+
+    saver = self._maybe_create_saver(saver)
+
+    # The graph almost certainly previously contained at least one Saver, and
+    # possibly several (e.g. one for loading a pretrained embedding, and another
+    # for the model weights).  Removing the preexisting ones was the
+    # motivation for the clear_extraneous_savers option, but it turns out that
+    # there are edge cases where that option breaks the graph.  Until that is
+    # resolved, we just leave the option set to False for now.
+    # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
+    meta_graph_def = saver.export_meta_graph(
+        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
+
+    # Tag the meta graph def and add it to the SavedModel.
+    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
+
+  @deprecated_args(None,
+                   "Pass your op to the equivalent parameter main_op instead.",
+                   "legacy_init_op")
+  def add_meta_graph_and_variables(self,
+                                   sess,
+                                   tags,
+                                   signature_def_map=None,
+                                   assets_collection=None,
+                                   legacy_init_op=None,
+                                   clear_devices=False,
+                                   main_op=None,
+                                   strip_default_attrs=False,
+                                   saver=None):
+    if self._has_saved_variables:
+      raise AssertionError("Graph state including variables and assets has "
+                           "already been saved. Please invoke "
+                           "`add_meta_graph()` instead.")
+
+    # Validate the signature def map to ensure all included TensorInfos are
+    # properly populated.
+    signature_def_map = signature_def_map or {}
+    self._validate_signature_def_map(signature_def_map)
+
+    # legacy_init_op is deprecated, and going away in TF 2.0.
+    # Re-mapping to main_op, as treatment is identical regardless.
+    main_op = main_op or legacy_init_op
+
+    # Add assets and ops
+    self._add_collections(assets_collection, main_op, None)
+
+    saved_model_utils.get_or_create_variables_dir(self._export_dir)
+    variables_path = saved_model_utils.get_variables_path(self._export_dir)
+
+    saver = self._maybe_create_saver(saver)
+
+    # Save the variables. Also, disable writing the checkpoint state proto. The
+    # file is not used during SavedModel loading. In addition, since a
+    # SavedModel can be copied or moved, this avoids the checkpoint state to
+    # become outdated.
+    saver.save(sess, variables_path, write_meta_graph=False, write_state=False)
+
+    # Export the meta graph def.
+
+    # The graph almost certainly previously contained at least one Saver, and
+    # possibly several (e.g. one for loading a pretrained embedding, and another
+    # for the model weights).  Removing the preexisting ones was the
+    # motivation for the clear_extraneous_savers option, but it turns out that
+    # there are edge cases where that option breaks the graph.  Until that is
+    # resolved, we just leave the option set to False for now.
+    # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
+    meta_graph_def = saver.export_meta_graph(
+        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
+
+    # Tag the meta graph def and add it to the SavedModel.
+    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
+
+    # Mark this instance of SavedModel as having saved variables, such that
+    # subsequent attempts to save variables will fail.
+    self._has_saved_variables = True
+
+  add_meta_graph.__doc__ = _SavedModelBuilder.add_meta_graph.__doc__.replace(
+      "assets_list", "assets_collection")
+  add_meta_graph_and_variables.__doc__ = \
+      _SavedModelBuilder.add_meta_graph_and_variables.__doc__.replace(
+          "assets_list", "assets_collection")
+
+
+def _maybe_save_assets(write_fn, assets_to_add=None):
   """Saves assets to the meta graph.
 
   Args:
-    assets_collection_to_add: The collection where the asset paths are setup.
+    write_fn: A function callback that writes asset into meta graph.
+    assets_to_add: The list where the asset paths are setup.
 
   Returns:
     A dict of asset basenames for saving to the original full path to the asset.
@@ -484,25 +625,25 @@ def _maybe_save_assets(assets_collection_to_add=None):
   # Map of target file names to original filenames
   asset_filename_map = {}
 
-  if assets_collection_to_add is None:
+  if assets_to_add is None:
     tf_logging.info("No assets to save.")
     return asset_filename_map
 
-  # Iterate over the supplied asset collection, build the `AssetFile` proto
-  # and add them to the collection with key `constants.ASSETS_KEY`, in the
-  # graph.
-  for asset_tensor in assets_collection_to_add:
+  # Iterate over the supplied assets, build the `AssetFile` proto and add them
+  # to the meta graph.
+  for asset_tensor in assets_to_add:
     asset_source_filepath = _asset_path_from_tensor(asset_tensor)
     if not asset_source_filepath:
       raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)
 
-    asset_filename = _get_asset_filename_to_add(
+    asset_filename = get_asset_filename_to_add(
         asset_source_filepath, asset_filename_map)
 
-    # Build `AssetFile` proto and add it to the asset collection in the graph.
+    # Call the passed-in function that builds AssetFileDef proto and adds it
+    # to either the collection or asset_file_def field of the meta graph.
     # Note that this should be done even when the file is a duplicate of an
     # already-added file, as the tensor reference should still exist.
-    _add_asset_to_collection(asset_filename, asset_tensor)
+    write_fn(asset_filename, asset_tensor)
 
     # In the cases where we are adding a duplicate, this will result in the
     # last of the filepaths being the one used for copying the file to the
@@ -514,7 +655,7 @@ def _maybe_save_assets(assets_collection_to_add=None):
   return asset_filename_map
 
 
-def _get_asset_filename_to_add(asset_filepath, asset_filename_map):
+def get_asset_filename_to_add(asset_filepath, asset_filename_map):
   """Get a unique basename to add to the SavedModel if this file is unseen.
 
   Assets come from users as full paths, and we save them out to the
@@ -540,7 +681,7 @@ def _get_asset_filename_to_add(asset_filepath, asset_filename_map):
 
   other_asset_filepath = asset_filename_map[asset_filename]
   if other_asset_filepath == asset_filepath:
-    # This is the same file, stored twice in the collection list. No need
+    # This is the same file, stored twice in the list. No need
     # to make unique.
     return asset_filename
 
@@ -587,6 +728,41 @@ def _asset_path_from_tensor(path_tensor):
   return str_values[0]
 
 
+def _add_asset_to_metagraph(meta_graph_def, asset_filename, asset_tensor):
+  """Builds an asset proto and adds it to the meta graph def.
+
+  Args:
+    meta_graph_def: The meta graph def to which the asset will be added.
+    asset_filename: The filename of the asset to be added.
+    asset_tensor: The asset tensor used to populate the tensor info of the asset
+      proto.
+  """
+  asset_proto = meta_graph_def.asset_file_def.add()
+  asset_proto.filename = asset_filename
+  asset_proto.tensor_info.name = asset_tensor.name
+
+
+def copy_assets_to_destination_dir(asset_filename_map, destination_dir):
+  """Copy all assets from source path to destination path."""
+  assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
+      destination_dir)
+
+  # Copy each asset from source path to destination path.
+  for asset_basename, asset_source_filepath in asset_filename_map.items():
+    asset_destination_filepath = os.path.join(
+        compat.as_bytes(assets_destination_dir),
+        compat.as_bytes(asset_basename))
+
+    # Only copy the asset file to the destination if it does not already
+    # exist. This is to ensure that an asset with the same name defined as
+    # part of multiple graphs is only copied the first time.
+    if not file_io.file_exists(asset_destination_filepath):
+      file_io.copy(asset_source_filepath, asset_destination_filepath)
+
+  tf_logging.info("Assets written to: %s",
+                  compat.as_text(assets_destination_dir))
+
+
 def _add_asset_to_collection(asset_filename, asset_tensor):
   """Builds an asset proto and adds it to the asset collection of the graph.
 
@@ -602,3 +778,8 @@ def _add_asset_to_collection(asset_filename, asset_tensor):
   asset_any_proto = Any()
   asset_any_proto.Pack(asset_proto)
   ops.add_to_collection(constants.ASSETS_KEY, asset_any_proto)
+
+
+def _add_op_to_signature_def_map(signature_def_map, op, key):
+  if op is not None:
+    signature_def_map[key] = signature_def_utils.op_signature_def(op, key)
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index cb251f08bb56fd5496ea4f3aaedfd2822ae1565c..90511a409ed7eb34bede12f1ce9d665e0f1cc913 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -23,42 +23,68 @@ from tensorflow.python.util.tf_export import tf_export
 
 # Subdirectory name containing the asset files.
 ASSETS_DIRECTORY = "assets"
-tf_export("saved_model.constants.ASSETS_DIRECTORY").export_constant(
-    __name__, "ASSETS_DIRECTORY")
+tf_export(
+    "saved_model.ASSETS_DIRECTORY",
+    v1=[
+        "saved_model.ASSETS_DIRECTORY", "saved_model.constants.ASSETS_DIRECTORY"
+    ]).export_constant(__name__, "ASSETS_DIRECTORY")
+
+# Subdirectory name containing unmanaged files from higher-level APIs.
+EXTRA_ASSETS_DIRECTORY = "assets.extra"
 
 # CollectionDef key containing SavedModel assets.
 ASSETS_KEY = "saved_model_assets"
-tf_export("saved_model.constants.ASSETS_KEY").export_constant(
-    __name__, "ASSETS_KEY")
+tf_export(
+    "saved_model.ASSETS_KEY",
+    v1=["saved_model.ASSETS_KEY",
+        "saved_model.constants.ASSETS_KEY"]).export_constant(
+            __name__, "ASSETS_KEY")
 
 # CollectionDef key for the legacy init op.
 LEGACY_INIT_OP_KEY = "legacy_init_op"
-tf_export("saved_model.constants.LEGACY_INIT_OP_KEY").export_constant(
-    __name__, "LEGACY_INIT_OP_KEY")
+tf_export(
+    v1=[
+        "saved_model.LEGACY_INIT_OP_KEY",
+        "saved_model.constants.LEGACY_INIT_OP_KEY"
+    ]).export_constant(__name__, "LEGACY_INIT_OP_KEY")
 
 # CollectionDef key for the SavedModel main op.
 MAIN_OP_KEY = "saved_model_main_op"
-tf_export("saved_model.constants.MAIN_OP_KEY").export_constant(
-    __name__, "MAIN_OP_KEY")
+tf_export(
+    v1=["saved_model.MAIN_OP_KEY",
+        "saved_model.constants.MAIN_OP_KEY"]).export_constant(
+            __name__, "MAIN_OP_KEY")
 
 # CollectionDef key for the SavedModel train op.
-# Not exported while export_all_saved_models is in contrib.
+# Not exported while export_all_saved_models is experimental.
 TRAIN_OP_KEY = "saved_model_train_op"
 
 # Schema version for SavedModel.
 SAVED_MODEL_SCHEMA_VERSION = 1
-tf_export("saved_model.constants.SAVED_MODEL_SCHEMA_VERSION").export_constant(
-    __name__, "SAVED_MODEL_SCHEMA_VERSION")
+tf_export(
+    "saved_model.SAVED_MODEL_SCHEMA_VERSION",
+    v1=[
+        "saved_model.SAVED_MODEL_SCHEMA_VERSION",
+        "saved_model.constants.SAVED_MODEL_SCHEMA_VERSION"
+    ]).export_constant(__name__, "SAVED_MODEL_SCHEMA_VERSION")
 
 # File name for SavedModel protocol buffer.
 SAVED_MODEL_FILENAME_PB = "saved_model.pb"
-tf_export("saved_model.constants.SAVED_MODEL_FILENAME_PB").export_constant(
-    __name__, "SAVED_MODEL_FILENAME_PB")
+tf_export(
+    "saved_model.SAVED_MODEL_FILENAME_PB",
+    v1=[
+        "saved_model.SAVED_MODEL_FILENAME_PB",
+        "saved_model.constants.SAVED_MODEL_FILENAME_PB"
+    ]).export_constant(__name__, "SAVED_MODEL_FILENAME_PB")
 
 # File name for text version of SavedModel protocol buffer.
 SAVED_MODEL_FILENAME_PBTXT = "saved_model.pbtxt"
-tf_export("saved_model.constants.SAVED_MODEL_FILENAME_PBTXT").export_constant(
-    __name__, "SAVED_MODEL_FILENAME_PBTXT")
+tf_export(
+    "saved_model.SAVED_MODEL_FILENAME_PBTXT",
+    v1=[
+        "saved_model.SAVED_MODEL_FILENAME_PBTXT",
+        "saved_model.constants.SAVED_MODEL_FILENAME_PBTXT"
+    ]).export_constant(__name__, "SAVED_MODEL_FILENAME_PBTXT")
 
 # File name for json format of SavedModel.
 # Not exported while keras_saved_model is in contrib.
@@ -66,10 +92,23 @@ SAVED_MODEL_FILENAME_JSON = "saved_model.json"
 
 # Subdirectory name containing the variables/checkpoint files.
 VARIABLES_DIRECTORY = "variables"
-tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
-    __name__, "VARIABLES_DIRECTORY")
+tf_export(
+    "saved_model.VARIABLES_DIRECTORY",
+    v1=[
+        "saved_model.VARIABLES_DIRECTORY",
+        "saved_model.constants.VARIABLES_DIRECTORY"
+    ]).export_constant(__name__, "VARIABLES_DIRECTORY")
 
 # File name used for variables.
 VARIABLES_FILENAME = "variables"
-tf_export("saved_model.constants.VARIABLES_FILENAME").export_constant(
-    __name__, "VARIABLES_FILENAME")
+tf_export(
+    "saved_model.VARIABLES_FILENAME",
+    v1=[
+        "saved_model.VARIABLES_FILENAME",
+        "saved_model.constants.VARIABLES_FILENAME"
+    ]).export_constant(__name__, "VARIABLES_FILENAME")
+
+# The initialization and train ops for a MetaGraph are stored in the
+# signature def map. The ops are added to the map with the following keys.
+INIT_OP_SIGNATURE_KEY = "__saved_model_init_op"
+TRAIN_OP_SIGNATURE_KEY = "__saved_model_train_op"
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3095f4ee5e09ae0973164acc748e2d922e8a991
--- /dev/null
+++ b/tensorflow/python/saved_model/load.py
@@ -0,0 +1,96 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Import a checkpointable object from a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import saved_object_graph_pb2
+from tensorflow.python.saved_model import utils_impl as saved_model_utils
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.util import compat
+
+
+class _Loader(object):
+  """Helper class to load an object-based SavedModel."""
+
+  def __init__(self, object_graph_proto, saved_model_proto, export_dir):
+    self._asset_file_def = saved_model_proto.meta_graphs[0].asset_file_def
+    self._proto = object_graph_proto
+    self._export_dir = export_dir
+    self._load_all()
+
+  def _load_all(self):
+    self._nodes = [self._recreate(proto) for proto in self._proto.nodes]
+    # After creating the objects, construct the edges between the objects.
+    for obj, object_proto in zip(self._nodes, self._proto.nodes):
+      for reference in object_proto.children:
+        setattr(obj, reference.local_name, self._nodes[reference.node_id])
+
+  def get(self, node_id):
+    return self._nodes[node_id]
+
+  def _recreate(self, proto):
+    factory = {
+        "user_object": lambda: self._recreate_user_object(proto.user_object),
+        "asset": lambda: self._recreate_asset(proto.asset),
+    }
+    kind = proto.WhichOneof("kind")
+    if kind not in factory:
+      raise ValueError("Unknown SavedObject type: %r" % kind)
+    return factory[kind]()
+
+  def _recreate_user_object(self, proto):
+    del proto
+    return tracking.Checkpointable()
+
+  def _recreate_asset(self, proto):
+    filename = os.path.join(
+        saved_model_utils.get_assets_dir(self._export_dir),
+        self._asset_file_def[proto.asset_file_def_index].filename)
+    return tracking.TrackableAsset(filename)
+
+
+def _load_saved_object_graph_proto(filename):
+  with file_io.FileIO(filename, "rb") as f:
+    contents = f.read()
+    return saved_object_graph_pb2.SavedObjectGraph.FromString(contents)
+
+
+def load(export_dir):
+  """Load a SavedModel from `export_dir`."""
+  saved_model_proto = loader_impl.parse_saved_model(export_dir)
+  object_graph_filename = os.path.join(
+      compat.as_bytes(export_dir),
+      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY),
+      compat.as_bytes("object_graph.pb"))
+  if file_io.file_exists(object_graph_filename):
+    object_graph_proto = _load_saved_object_graph_proto(object_graph_filename)
+    loader = _Loader(object_graph_proto,
+                     saved_model_proto,
+                     export_dir)
+    root = loader.get(0)
+  else:
+    raise NotImplementedError(
+        "Currently only SavedModels exported with `tf.saved_model.save` may be "
+        "imported. Other SavedModels may eventually be supported via load().")
+  # TODO(allenl): load functions from the SavedModel into the eager context
+  return root
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2971101cdb5ae93613df65f0379866244a7a3fe
--- /dev/null
+++ b/tensorflow/python/saved_model/load_test.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for checkpointable object SavedModel loading."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import load
+from tensorflow.python.saved_model import save
+from tensorflow.python.training.checkpointable import tracking
+
+
+class LoadTest(test.TestCase):
+
+  def test_structure_import(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.dep_one = tracking.Checkpointable()
+    root.dep_two = tracking.Checkpointable()
+    root.dep_two.dep = tracking.Checkpointable()
+    root.dep_three = root.dep_two.dep
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir)
+    imported = load.load(save_dir)
+    self.assertIs(imported.dep_three, imported.dep_two.dep)
+    self.assertIsNot(imported.dep_one, imported.dep_two)
+
+  def _make_asset(self, contents):
+    filename = tempfile.mktemp(prefix=self.get_temp_dir())
+    with open(filename, "w") as f:
+      f.write(contents)
+    return filename
+
+  def test_assets_import(self):
+    file1 = self._make_asset("contents 1")
+    file2 = self._make_asset("contents 2")
+
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.asset1 = tracking.TrackableAsset(file1)
+    root.asset2 = tracking.TrackableAsset(file2)
+
+    save_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, save_dir)
+
+    file_io.delete_file(file1)
+    file_io.delete_file(file2)
+    load_dir = os.path.join(self.get_temp_dir(), "load_dir")
+    file_io.rename(save_dir, load_dir)
+
+    imported = load.load(load_dir)
+    with open(imported.asset1.asset_path.numpy(), "r") as f:
+      self.assertEquals("contents 1", f.read())
+    with open(imported.asset2.asset_path.numpy(), "r") as f:
+      self.assertEquals("contents 2", f.read())
+
+  def test_assets_dedup(self):
+    vocab = self._make_asset("contents")
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    root.asset1 = tracking.TrackableAsset(vocab)
+    root.asset2 = tracking.TrackableAsset(vocab)
+
+    export_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, export_dir)
+    imported = load.load(export_dir)
+
+    self.assertEqual(imported.asset1.asset_path.numpy(),
+                     imported.asset2.asset_path.numpy())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 895644a0305d99f086a3682d41c90e6af0be32d2..e5be03aae4905f4465ac87590da610a7d46e2ae4 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -31,6 +31,7 @@ from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
@@ -38,7 +39,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _parse_saved_model(export_dir):
+def parse_saved_model(export_dir):
   """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`.
 
   Args:
@@ -82,6 +83,11 @@ def _parse_saved_model(export_dir):
                    constants.SAVED_MODEL_FILENAME_PB))
 
 
+# TODO(b/120594573): Make this symbol also available as private, so that
+# tensorflow_transform and tensorflow_estimator do not break.
+_parse_saved_model = parse_saved_model
+
+
 def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   """Gets the asset tensors, if defined in the meta graph def to load.
 
@@ -99,22 +105,29 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   collection_def = meta_graph_def_to_load.collection_def
 
   asset_tensor_dict = {}
-  if constants.ASSETS_KEY in collection_def:
-    # Location of the assets for SavedModel.
-    assets_directory = os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes(constants.ASSETS_DIRECTORY))
+  asset_protos = []
+
+  if meta_graph_def_to_load.asset_file_def:
+    asset_protos = meta_graph_def_to_load.asset_file_def
+  elif constants.ASSETS_KEY in collection_def:
     assets_any_proto = collection_def[constants.ASSETS_KEY].any_list.value
-    # Process each asset and add it to the asset tensor dictionary.
     for asset_any_proto in assets_any_proto:
       asset_proto = meta_graph_pb2.AssetFileDef()
       asset_any_proto.Unpack(asset_proto)
-      tensor_name = asset_proto.tensor_info.name
-      if import_scope:
-        tensor_name = "%s/%s" % (import_scope, tensor_name)
-      asset_tensor_dict[tensor_name] = os.path.join(
-          compat.as_bytes(assets_directory),
-          compat.as_bytes(asset_proto.filename))
+      asset_protos.append(asset_proto)
+
+  # Location of the assets for SavedModel.
+  assets_directory = os.path.join(
+      compat.as_bytes(export_dir), compat.as_bytes(constants.ASSETS_DIRECTORY))
+  # Process each asset and add it to the asset tensor dictionary.
+  for asset_proto in asset_protos:
+    tensor_name = asset_proto.tensor_info.name
+    if import_scope:
+      tensor_name = "%s/%s" % (import_scope, tensor_name)
+    asset_tensor_dict[tensor_name] = os.path.join(
+        compat.as_bytes(assets_directory),
+        compat.as_bytes(asset_proto.filename))
+
   return asset_tensor_dict
 
 
@@ -134,19 +147,53 @@ def _get_main_op_tensor(
     RuntimeError: If the collection def corresponding to the main op key has
         other than exactly one tensor.
   """
+  # TODO(kathywu): Rename this method to _get_op_from_collection when
+  # dependency from SavedModelEstimator is removed.
   collection_def = meta_graph_def_to_load.collection_def
-  main_op_tensor = None
+  init_op = None
   if init_op_key in collection_def:
-    main_ops = collection_def[init_op_key].node_list.value
-    if len(main_ops) != 1:
-      raise RuntimeError("Expected exactly one SavedModel main op. "
-                         "Found: {}".format(main_ops))
-    main_op_tensor = ops.get_collection(init_op_key)[0]
-  return main_op_tensor
+    init_op_list = collection_def[init_op_key].node_list.value
+    if len(init_op_list) != 1:
+      raise RuntimeError("Expected exactly one SavedModel init op. "
+                         "Found: {}".format(init_op_list))
+    init_op = ops.get_collection(init_op_key)[0]
+  return init_op
 
 
-@tf_export("saved_model.maybe_saved_model_directory",
-           "saved_model.loader.maybe_saved_model_directory")
+def _get_op_from_collection(meta_graph_def, op_key):
+  return _get_main_op_tensor(meta_graph_def, op_key)
+
+
+def _get_op_from_signature_def(meta_graph_def, op_signature_key, import_scope):
+  """Retrieve op stored in the imported meta graph's signature def."""
+  if op_signature_key in meta_graph_def.signature_def:
+    return signature_def_utils.load_op_from_signature_def(
+        meta_graph_def.signature_def[op_signature_key], op_signature_key,
+        import_scope)
+  else:
+    return None
+
+
+def get_init_op(meta_graph_def, import_scope=None):
+  return (_get_op_from_signature_def(
+      meta_graph_def, constants.INIT_OP_SIGNATURE_KEY, import_scope) or
+          _get_op_from_collection(meta_graph_def, constants.MAIN_OP_KEY) or
+          _get_op_from_collection(meta_graph_def, constants.LEGACY_INIT_OP_KEY))
+
+
+def get_train_op(meta_graph_def, import_scope=None):
+  train_op = _get_op_from_signature_def(
+      meta_graph_def, constants.TRAIN_OP_SIGNATURE_KEY, import_scope)
+  if train_op is None:
+    train_op = _get_op_from_collection(meta_graph_def, constants.TRAIN_OP_KEY)
+  return train_op
+
+
+@tf_export(v1=[
+    "saved_model.contains_saved_model",
+    "saved_model.maybe_saved_model_directory",
+    "saved_model.loader.maybe_saved_model_directory"
+])
 @deprecation.deprecated_endpoints(
     "saved_model.loader.maybe_saved_model_directory")
 def maybe_saved_model_directory(export_dir):
@@ -169,7 +216,32 @@ def maybe_saved_model_directory(export_dir):
   return file_io.file_exists(txt_path) or file_io.file_exists(pb_path)
 
 
-@tf_export("saved_model.load", "saved_model.loader.load")
+@tf_export("saved_model.contains_saved_model", v1=[])
+def contains_saved_model(export_dir):
+  """Checks whether the provided export directory could contain a SavedModel.
+
+  Note that the method does not load any data by itself. If the method returns
+  `false`, the export directory definitely does not contain a SavedModel. If the
+  method returns `true`, the export directory may contain a SavedModel but
+  provides no guarantee that it can be loaded.
+
+  Args:
+    export_dir: Absolute string path to possible export location. For example,
+                '/my/foo/model'.
+
+  Returns:
+    True if the export directory contains SavedModel files, False otherwise.
+  """
+  return maybe_saved_model_directory(export_dir)
+
+
+@tf_export(v1=["saved_model.load", "saved_model.loader.load"])
+@deprecation.deprecated(
+    None,
+    "This function will only be available through the v1 compatibility "
+    "library as tf.compat.v1.saved_model.loader.load or "
+    "tf.compat.v1.saved_model.load. There will be a new function for importing "
+    "SavedModels in Tensorflow 2.0.")
 def load(sess, tags, export_dir, import_scope=None, **saver_kwargs):
   """Loads the model from a SavedModel as specified by tags.
 
@@ -209,7 +281,7 @@ class SavedModelLoader(object):
     """
     self._export_dir = export_dir
     self._variables_path = saved_model_utils.get_variables_path(export_dir)
-    self._saved_model = _parse_saved_model(export_dir)
+    self._saved_model = parse_saved_model(export_dir)
 
   @property
   def export_dir(self):
@@ -324,11 +396,9 @@ class SavedModelLoader(object):
       asset_tensors_dictionary = _get_asset_tensors(
           self._export_dir, meta_graph_def, import_scope=import_scope)
 
-      main_op_tensor = (
-          _get_main_op_tensor(meta_graph_def, constants.MAIN_OP_KEY) or
-          _get_main_op_tensor(meta_graph_def, constants.LEGACY_INIT_OP_KEY))
-      if main_op_tensor is not None:
-        sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
+      init_op = get_init_op(meta_graph_def, import_scope)
+      if init_op is not None:
+        sess.run(fetches=[init_op], feed_dict=asset_tensors_dictionary)
 
   def load(self, sess, tags, import_scope=None, **saver_kwargs):
     """Load the MetaGraphDef graph and restore variable values into the session.
diff --git a/tensorflow/python/saved_model/loader_test.py b/tensorflow/python/saved_model/loader_test.py
index 924b2e7c0655130df9c0f7c5fe7742fc5ebaddc6..3e27c0801cd43eb43d1e0636f8aac1b1bc054485 100644
--- a/tensorflow/python/saved_model/loader_test.py
+++ b/tensorflow/python/saved_model/loader_test.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import shutil
+
+from absl.testing import parameterized
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.lib.io import file_io
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -42,55 +45,74 @@ SIMPLE_ADD_SAVED_MODEL = _get_export_dir("simple_add_saved_model")
 SAVED_MODEL_WITH_MAIN_OP = _get_export_dir("saved_model_with_main_op")
 
 
-class SavedModelLoaderTest(test.TestCase):
-
-  def setUp(self):
-    """Write test SavedModels to a temp directory."""
-    with session.Session(graph=ops.Graph()) as sess:
-      x = variables.VariableV1(5, name="x")
-      y = variables.VariableV1(11, name="y")
-      z = x + y
-      sess.run(variables.global_variables_initializer())
-
-      foo_sig_def = signature_def_utils.build_signature_def(
-          {"foo_input": utils.build_tensor_info(x)},
-          {"foo_output": utils.build_tensor_info(z)})
-      bar_sig_def = signature_def_utils.build_signature_def(
-          {"bar_x": utils.build_tensor_info(x),
-           "bar_y": utils.build_tensor_info(y)},
-          {"bar_z": utils.build_tensor_info(z)})
-
-      builder = saved_model_builder.SavedModelBuilder(SIMPLE_ADD_SAVED_MODEL)
-      builder.add_meta_graph_and_variables(
-          sess, ["foo_graph"], {"foo": foo_sig_def, "bar": bar_sig_def})
+def build_graph_helper():
+  g = ops.Graph()
+  with g.as_default():
+    x = variables.VariableV1(5, name="x")
+    y = variables.VariableV1(11, name="y")
+    z = x + y
+
+    foo_sig_def = signature_def_utils.build_signature_def({
+        "foo_input": utils.build_tensor_info(x)
+    }, {"foo_output": utils.build_tensor_info(z)})
+    bar_sig_def = signature_def_utils.build_signature_def({
+        "bar_x": utils.build_tensor_info(x),
+        "bar_y": utils.build_tensor_info(y)
+    }, {"bar_z": utils.build_tensor_info(z)})
+  return g, {"foo": foo_sig_def, "bar": bar_sig_def}, y
+
+
+@parameterized.parameters((saved_model_builder.SavedModelBuilder,),
+                          (saved_model_builder._SavedModelBuilder,))
+class SavedModelLoaderTest(test.TestCase, parameterized.TestCase):
+
+  def export_simple_graph(self, builder_cls):
+    g, sig_def_map, _ = build_graph_helper()
+    with session.Session(graph=g) as sess:
+      self.evaluate(variables.global_variables_initializer())
+      builder = builder_cls(SIMPLE_ADD_SAVED_MODEL)
+      builder.add_meta_graph_and_variables(sess, ["foo_graph"], sig_def_map)
       builder.save()
 
-      # Write SavedModel with a main_op
+  def export_graph_with_main_op(self, builder_cls):
+    g, sig_def_map, y = build_graph_helper()
+    with session.Session(graph=g) as sess:
+      self.evaluate(variables.global_variables_initializer())
       assign_op = control_flow_ops.group(state_ops.assign(y, 7))
 
-      builder = saved_model_builder.SavedModelBuilder(SAVED_MODEL_WITH_MAIN_OP)
-      builder.add_meta_graph_and_variables(
-          sess, ["foo_graph"], {"foo": foo_sig_def, "bar": bar_sig_def},
-          main_op=assign_op)
+      builder = builder_cls(SAVED_MODEL_WITH_MAIN_OP)
+
+      if builder_cls == saved_model_builder._SavedModelBuilder:
+        builder.add_meta_graph_and_variables(
+            sess, ["foo_graph"], sig_def_map, init_op=assign_op)
+      else:
+        builder.add_meta_graph_and_variables(
+            sess, ["foo_graph"], sig_def_map, main_op=assign_op)
       builder.save()
 
   def tearDown(self):
-    file_io.delete_recursively(test.get_temp_dir())
+    super(SavedModelLoaderTest, self).tearDown()
+    shutil.rmtree(test.get_temp_dir(), ignore_errors=True)
 
-  def test_load_function(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_load_function(self, builder_cls):
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo_graph"])
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(11, sess.graph.get_tensor_by_name("y:0").eval())
 
+    self.export_graph_with_main_op(builder_cls)
     loader2 = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     with self.session(graph=ops.Graph()) as sess:
       loader2.load(sess, ["foo_graph"])
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("y:0").eval())
 
-  def test_load_graph(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_load_graph(self, builder_cls):
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     graph = ops.Graph()
     loader.load_graph(graph, ["foo_graph"])
@@ -101,14 +123,16 @@ class SavedModelLoaderTest(test.TestCase):
     with self.assertRaises(KeyError):
       graph.get_tensor_by_name("z:0")
 
-    with self.session(graph=graph) as sess:
+    with self.session(graph=graph):
       # Check that x and y are not initialized
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(x)
+        self.evaluate(x)
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(y)
+        self.evaluate(y)
 
-  def test_load_with_import_scope(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_load_with_import_scope(self, builder_cls):
+    self.export_graph_with_main_op(builder_cls)
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     with self.session(graph=ops.Graph()) as sess:
       saver, _ = loader.load_graph(
@@ -119,7 +143,13 @@ class SavedModelLoaderTest(test.TestCase):
         loader.restore_variables(sess, tf_saver.Saver())
 
       loader.restore_variables(sess, saver)
-      loader.run_init_ops(sess, ["foo_graph"])
+
+      if builder_cls == saved_model_builder._SavedModelBuilder:
+        with self.assertRaises(errors.NotFoundError):
+          loader.run_init_ops(sess, ["foo_graph"])
+        loader.run_init_ops(sess, ["foo_graph"], import_scope="baz")
+      else:
+        loader.run_init_ops(sess, ["foo_graph"])
 
       self.assertEqual(5, sess.graph.get_tensor_by_name("baz/x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("baz/y:0").eval())
@@ -131,23 +161,27 @@ class SavedModelLoaderTest(test.TestCase):
       self.assertEqual(5, sess.graph.get_tensor_by_name("baa/x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("baa/y:0").eval())
 
-  def test_restore_variables(self):
+  @test_util.run_deprecated_v1
+  def test_restore_variables(self, builder_cls):
+    self.export_graph_with_main_op(builder_cls)
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     with self.session(graph=ops.Graph()) as sess:
       x = variables.VariableV1(0, name="x")
       y = variables.VariableV1(0, name="y")
       z = x * y
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       # There are variables to restore, so a saver must be created.
       with self.assertRaises(ValueError):
         loader.restore_variables(sess, None)
 
       loader.restore_variables(sess, tf_saver.Saver())
-      self.assertEqual(55, z.eval())
+      self.assertEqual(55, self.evaluate(z))
 
-  def test_run_init_op(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_run_init_op(self, builder_cls):
+    self.export_graph_with_main_op(builder_cls)
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     graph = ops.Graph()
     saver, _ = loader.load_graph(graph, ["foo_graph"])
@@ -160,14 +194,16 @@ class SavedModelLoaderTest(test.TestCase):
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("y:0").eval())
 
-  def test_parse_saved_model(self):
+  def test_parse_saved_model(self, builder_cls):
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     meta_graph = loader.get_meta_graph_def_from_tags(["foo_graph"])
     self.assertIsNotNone(meta_graph)
     self.assertIn("foo", meta_graph.signature_def)
     self.assertIn("bar", meta_graph.signature_def)
 
-  def test_load_invalid_meta_graph(self):
+  def test_load_invalid_meta_graph(self, builder_cls):
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     with self.assertRaises(RuntimeError):
       loader.get_meta_graph_def_from_tags([])
@@ -176,13 +212,17 @@ class SavedModelLoaderTest(test.TestCase):
     with self.assertRaises(RuntimeError):
       loader.get_meta_graph_def_from_tags(["not_a_graph"])
 
-  def test_load_saved_model_with_no_variables(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_load_saved_model_with_no_variables(self, builder_cls):
     """Test that SavedModel runs saver when there appear to be no variables.
 
     When no variables are detected, this may mean that the variables were saved
     to different collections, or the collections weren't saved to the
     SavedModel. If the SavedModel MetaGraphDef contains a saver, it should still
     run in either of these cases.
+
+    Args:
+      builder_cls: SavedModelBuilder or _SavedModelBuilder class
     """
     path = _get_export_dir("no_variable_saved_model")
     with session.Session(graph=ops.Graph()) as sess:
@@ -192,7 +232,7 @@ class SavedModelLoaderTest(test.TestCase):
           11, name="y", collections=["not_global_variable"])
       self.assertFalse(variables._all_saveable_objects())
       z = x + y
-      sess.run(variables.variables_initializer([x, y]))
+      self.evaluate(variables.variables_initializer([x, y]))
 
       foo_sig_def = signature_def_utils.build_signature_def(
           {"foo_input": utils.build_tensor_info(x)},
@@ -215,8 +255,9 @@ class SavedModelLoaderTest(test.TestCase):
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(11, sess.graph.get_tensor_by_name("y:0").eval())
 
-  def test_load_saved_model_graph_with_return_elements(self):
+  def test_load_saved_model_graph_with_return_elements(self, builder_cls):
     """Ensure that the correct elements are returned."""
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     graph = ops.Graph()
     _, ret = loader.load_graph(graph, ["foo_graph"],
@@ -228,5 +269,6 @@ class SavedModelLoaderTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "not found in graph"):
       loader.load_graph(graph, ["foo_graph"], return_elements=["z:0"])
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/main_op_impl.py b/tensorflow/python/saved_model/main_op_impl.py
index ad4511b28e5859bb9abc02c3bd3177e45a5df819..bc0d38930eb8facdcb8d20c24c0f14c81bde8ba7 100644
--- a/tensorflow/python/saved_model/main_op_impl.py
+++ b/tensorflow/python/saved_model/main_op_impl.py
@@ -26,7 +26,11 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('saved_model.main_op.main_op')
+@tf_export(v1=['saved_model.main_op.main_op'])
+@deprecation.deprecated(
+    None,
+    'This function will only be available through the v1 compatibility '
+    'library as tf.compat.v1.saved_model.main_op.main_op.')
 def main_op():
   """Returns a main op to init variables and tables.
 
@@ -43,9 +47,13 @@ def main_op():
 
 
 # TODO(sukritiramesh): Integrate with Saver for complete restore functionality.
-@tf_export('saved_model.main_op_with_restore',
-           'saved_model.main_op.main_op_with_restore')
-@deprecation.deprecated_endpoints('saved_model.main_op.main_op_with_restore')
+@tf_export(v1=['saved_model.main_op_with_restore',
+               'saved_model.main_op.main_op_with_restore'])
+@deprecation.deprecated(
+    None,
+    'This function will only be available through the v1 compatibility '
+    'library as tf.compat.v1.saved_model.main_op_with_restore or '
+    'tf.compat.v1.saved_model.main_op.main_op_with_restore.')
 def main_op_with_restore(restore_op_name):
   """Returns a main op to init variables, tables and restore the graph.
 
diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab6fcb7196fcc243d69b53b595b53b0dd00071f4
--- /dev/null
+++ b/tensorflow/python/saved_model/save.py
@@ -0,0 +1,734 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Exports a SavedModel from a Checkpointable Python object."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import functools
+import os
+
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.saved_model import builder_impl
+from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import saved_object_graph_pb2
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.saved_model import utils_impl
+from tensorflow.python.training.checkpointable import base
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _check_for_functional_keras_model(root):
+  """Makes an export signature for `root` if it's a functional Keras Model."""
+  # If nothing is decorated yet but this is a functional Keras Model (duck
+  # typed), we'll try to make a signature ourselves.
+  try:
+    inputs = root.inputs
+    input_names = root.input_names
+  except AttributeError:
+    return None
+  input_signature = []
+  for input_tensor, input_name in zip(inputs, input_names):
+    input_signature.append(tensor_spec.TensorSpec(
+        shape=input_tensor.shape, dtype=input_tensor.dtype,
+        name=input_name))
+
+  @def_function.function(input_signature=input_signature)
+  def _wrapped_model(*args):
+    outputs_list = nest.flatten(root(inputs=list(args)))
+    return {name: output for name, output
+            in zip(root.output_names, outputs_list)}
+  return _wrapped_model
+
+
+def _find_function_to_export(root):
+  """Iterate over `root`'s attributes, finding traced functions."""
+  exported_function = None
+  previous_attribute_name = None
+  for attribute_name in dir(root):
+    attribute_value = getattr(root, attribute_name, None)
+    if isinstance(attribute_value, def_function.PolymorphicFunction):
+      if exported_function is not None:
+        raise ValueError(
+            ("Exporting an object with no "
+             "tf.saved_model.save(..., signatures=...) "
+             "argument specified, and with more than one "
+             "@tf.function-decorated method attached to it: {}. The signature "
+             "keys for these functions are ambiguous. Specify signature "
+             "functions explicitly.").format(
+                 [previous_attribute_name, attribute_name]))
+      exported_function = attribute_value
+      previous_attribute_name = attribute_name
+  if exported_function is None:
+    exported_function = _check_for_functional_keras_model(root)
+  if exported_function is None:
+    raise ValueError(
+        ("Exporting an object with no tf.saved_model.save(..., signatures=...) "
+         "argument specified, and with no @tf.function-decorated methods "
+         "attached to it. In the future this will be a supported use-case for "
+         "Python re-import, but at the moment saving a SavedModel without "
+         "signatures does not make sense, as the only consumers will expect "
+         "signatures. Either decorate a method or specify a signature function "
+         "explicitly."))
+  return exported_function
+
+
+def _canonicalize_signatures(signatures):
+  """Converts `signatures` into a dictionary of concrete functions."""
+  if not isinstance(signatures, collections.Mapping):
+    signatures = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
+  concrete_signatures = {}
+  for serving_key, signature_function in signatures.items():
+    if isinstance(signature_function, (defun.PolymorphicFunction,
+                                       def_function.PolymorphicFunction)):
+      input_signature = signature_function._input_signature  # pylint: disable=protected-access
+      if input_signature is None:
+        raise ValueError(
+            ("Unable to use the function {} as a signature directly. Functions "
+             "used to generate serving signatures must either have an "
+             "`input_signature=` specified when constructed, or must be "
+             "converted to concrete functions using "
+             "`f.get_concrete_function(...)`.").format(signature_function))
+      signature_function = signature_function.get_concrete_function()
+    elif not isinstance(signature_function, defun.Function):
+      raise ValueError(
+          ("Expected a TensorFlow function to generate a signature for, but "
+           "got {}. Python functions may be decorated with "
+           "`@tf.function(input_signature=...)` and passed as signatures "
+           "directly, or created without a signature using `@tf.function` "
+           "and then converted to a concrete TensorFlow function using "
+           "`f.get_concrete_function(...)`.").format(signature_function))
+    concrete_signatures[serving_key] = signature_function
+  return concrete_signatures
+
+
+def _is_flat(sequence):
+  sequence_flat = nest.flatten(sequence)
+  try:
+    nest.assert_same_structure(sequence_flat, sequence)
+    return True
+  except ValueError:
+    return False
+  except TypeError:
+    return False
+
+
+def _normalize_outputs(outputs, function_name, signature_key):
+  """Construct an output dictionary from unnormalized function outputs."""
+  if isinstance(outputs, collections.Mapping):
+    for key, value in outputs.items():
+      if not isinstance(value, ops.Tensor):
+        raise ValueError(
+            ("Got a dictionary containing non-Tensor value {} for key {} "
+             "in the output of the function {} used to generate a SavedModel "
+             "signature. Dictionaries outputs for functions used as signatures "
+             "should have one Tensor output per string key.")
+            .format(value, key, compat.as_str_any(function_name)))
+    return outputs
+  else:
+    original_outputs = outputs
+    if not isinstance(outputs, collections.Sequence):
+      outputs = [outputs]
+    if not _is_flat(outputs):
+      raise ValueError(
+          ("Got non-flat outputs '{}' from '{}' for SavedModel "
+           "signature '{}'. Signatures have one Tensor per output, so "
+           "to have predictable names Python functions used to generate "
+           "these signatures should avoid outputting Tensors in nested "
+           "structures.")
+          .format(original_outputs, function_name, signature_key))
+    return {("output_{}".format(output_index)): output
+            for output_index, output
+            in enumerate(outputs)}
+
+
+def _tensor_dict_to_tensorinfo(tensor_dict):
+  return {key: utils_impl.build_tensor_info(value)
+          for key, value in tensor_dict.items()}
+
+
+def _map_captures_to_created_tensors(
+    original_captures, resource_map):
+  """Maps eager tensors captured by a function to Graph resources for export.
+
+  Args:
+    original_captures: A dictionary mapping from tensors captured by the
+      function to interior placeholders for those tensors (inside the function
+      body).
+    resource_map: A dictionary mapping from resource tensors owned by the eager
+      context to resource tensors in the exported graph.
+
+  Returns:
+    A list of stand-in tensors which belong to the exported graph, corresponding
+    to the function's captures.
+
+  Raises:
+    AssertionError: If the function references a resource which is not part of
+      `resource_map`.
+  """
+  export_captures = []
+  for exterior, interior in original_captures.items():
+    mapped_resource = resource_map.get(exterior, None)
+    if mapped_resource is None:
+      if exterior.dtype == dtypes.resource:
+        raise AssertionError(
+            ("Tried to export a function which references untracked stateful "
+             "object {}. Stateful TensorFlow objects (e.g. tf.Variable) must "
+             "be tracked by the main object. Objects may be tracked by "
+             "assigning them to an attribute of another tracked object, or to "
+             "an attribute of the main object directly.")
+            .format(interior))
+      else:
+        # This is a captured Tensor, but it's not a resource. We'll just add it
+        # to the graph as a constant.
+        mapped_resource = constant_op.constant(exterior.numpy())
+    export_captures.append(mapped_resource)
+  return export_captures
+
+
+def _map_function_arguments_to_created_inputs(
+    function_arguments, signature_key, function_name):
+  """Creates exterior placeholders in the exported graph for function arguments.
+
+  Functions have two types of inputs: tensors captured from the outside (eager)
+  context, and arguments to the function which we expect to receive from the
+  user at each call. `_map_captures_to_created_tensors` replaces
+  captured tensors with stand-ins (typically these are resource dtype tensors
+  associated with variables). `_map_function_inputs_to_created_inputs` runs over
+  every argument, creating a new placeholder for each which will belong to the
+  exported graph rather than the function body.
+
+  Args:
+    function_arguments: A list of argument placeholders in the function body.
+    signature_key: The name of the signature being exported, for error messages.
+    function_name: The name of the function, for error messages.
+
+  Returns:
+    A tuple of (mapped_inputs, exterior_placeholders)
+      mapped_inputs: A list with entries corresponding to `function_arguments`
+        containing all of the inputs of the function gathered from the exported
+        graph (both captured resources and arguments).
+      exterior_argument_placeholders: A dictionary mapping from argument names
+        to placeholders in the exported graph, containing the explicit arguments
+        to the function which a user is expected to provide.
+
+  Raises:
+    ValueError: If argument names are not unique.
+  """
+  # `exterior_argument_placeholders` holds placeholders which are outside the
+  # function body, directly contained in a MetaGraph of the SavedModel. The
+  # function body itself contains nearly identical placeholders used when
+  # running the function, but these exterior placeholders allow Session-based
+  # APIs to call the function using feeds and fetches which name Tensors in the
+  # MetaGraph.
+  exterior_argument_placeholders = {}
+  mapped_inputs = []
+  for placeholder in function_arguments:
+    # `export_captures` contains an exhaustive set of captures, so if we don't
+    # find the input there then we now know we have an argument.
+    user_input_name = compat.as_str_any(
+        placeholder.op.get_attr("_user_specified_name"))
+    # If the internal placeholders for a function have names which were
+    # uniquified by TensorFlow, then a single user-specified argument name
+    # must refer to multiple Tensors. The resulting signatures would be
+    # confusing to call. Instead, we throw an exception telling the user to
+    # specify explicit names.
+    if user_input_name != placeholder.op.name:
+      # This should be unreachable, since concrete functions may not be
+      # generated with non-unique argument names.
+      raise ValueError(
+          ("Got non-flat/non-unique argument names for SavedModel "
+           "signature '{}': more than one argument to '{}' was named '{}'. "
+           "Signatures have one Tensor per named input, so to have "
+           "predictable names Python functions used to generate these "
+           "signatures should avoid *args and Tensors in nested "
+           "structures unless unique names are specified for each. Use "
+           "tf.TensorSpec(..., name=...) to provide a name for a Tensor "
+           "input.")
+          .format(signature_key, compat.as_str_any(function_name),
+                  user_input_name))
+    arg_placeholder = array_ops.placeholder(
+        shape=placeholder.shape,
+        dtype=placeholder.dtype,
+        name="{}_{}".format(signature_key, user_input_name))
+    exterior_argument_placeholders[user_input_name] = arg_placeholder
+    mapped_inputs.append(arg_placeholder)
+  return mapped_inputs, exterior_argument_placeholders
+
+
+def _call_function_with_mapped_captures(function, args, resource_map):
+  """Calls `function` in the exported graph, using mapped resource captures."""
+  export_captures = _map_captures_to_created_tensors(
+      function.graph.captures, resource_map)
+  mapped_inputs = args + export_captures
+  # Calls the function quite directly, since we have new captured resource
+  # tensors we need to feed in which weren't part of the original function
+  # definition.
+  # pylint: disable=protected-access
+  outputs = function._build_call_outputs(
+      function._inference_function.call(context.context(), mapped_inputs))
+  return outputs
+
+
+def _generate_signatures(signature_functions, resource_map):
+  """Validates and calls `signature_functions` in the default graph.
+
+  Args:
+    signature_functions: A dictionary mapping string keys to concrete TensorFlow
+      functions (e.g. from `_canonicalize_signatures`) which will be used to
+      generate SignatureDefs.
+    resource_map: A dictionary mapping from resource tensors in the eager
+      context to resource tensors in the Graph being exported. This dictionary
+      is used to re-bind resources captured by functions to tensors which will
+      exist in the SavedModel.
+
+  Returns:
+    Each function in the `signature_functions` dictionary is called with
+    placeholder Tensors, generating a function call operation and output
+    Tensors. The placeholder Tensors, the function call operation, and the
+    output Tensors from the function call are part of the default Graph.
+
+    This function then returns a dictionary with the same structure as
+    `signature_functions`, with the concrete functions replaced by SignatureDefs
+    implicitly containing information about how to call each function from a
+    TensorFlow 1.x Session / the C++ Loader API. These SignatureDefs reference
+    the generated placeholders and Tensor outputs by name.
+
+    The caller is expected to include the default Graph set while calling this
+    function as a MetaGraph in a SavedModel, including the returned
+    SignatureDefs as part of that MetaGraph.
+  """
+  signatures = {}
+  for signature_key, function in sorted(signature_functions.items()):
+    if function.graph.captures:
+      argument_inputs = function.graph.inputs[:-len(function.graph.captures)]
+    else:
+      argument_inputs = function.graph.inputs
+    mapped_inputs, exterior_argument_placeholders = (
+        _map_function_arguments_to_created_inputs(
+            argument_inputs, signature_key, function.name))
+    outputs = _normalize_outputs(
+        _call_function_with_mapped_captures(
+            function, mapped_inputs, resource_map),
+        function.name, signature_key)
+    signatures[signature_key] = signature_def_utils.build_signature_def(
+        _tensor_dict_to_tensorinfo(exterior_argument_placeholders),
+        _tensor_dict_to_tensorinfo(outputs))
+  return signatures
+
+
+def _trace_resource_initializers(accessible_objects):
+  """Create concrete functions from `TrackableResource` objects."""
+  resource_initializers = []
+
+  def _wrap_initializer(obj):
+    obj.initialize()
+    return constant_op.constant(1.)  # Dummy control output
+
+  for obj in accessible_objects:
+    if isinstance(obj, tracking.TrackableResource):
+      resource_initializers.append(def_function.function(
+          functools.partial(_wrap_initializer, obj),
+          # All inputs are captures.
+          input_signature=[]).get_concrete_function())
+  return resource_initializers
+
+
+_AssetInfo = collections.namedtuple(
+    "_AssetInfo", [
+        # List of AssetFileDef protocol buffers
+        "asset_defs",
+        # Map from asset variable resource Tensors to their init ops
+        "asset_initializers_by_resource",
+        # Map from base asset filenames to full paths
+        "asset_filename_map",
+        # Map from TrackableAsset to index of corresponding AssetFileDef
+        "asset_index"])
+
+
+def _process_asset(trackable_asset, asset_info, resource_map):
+  """Add `trackable_asset` to `asset_info` and `resource_map`."""
+  original_variable = trackable_asset.asset_path
+  with context.eager_mode():
+    original_path = original_variable.numpy()
+  path = builder_impl.get_asset_filename_to_add(
+      asset_filepath=original_path,
+      asset_filename_map=asset_info.asset_filename_map)
+  # TODO(andresp): Instead of mapping 1-1 between trackable asset
+  # and asset in the graph def consider deduping the assets that
+  # point to the same file.
+  asset_path_initializer = array_ops.placeholder(
+      shape=original_variable.shape,
+      dtype=dtypes.string,
+      name="asset_path_initializer")
+  asset_variable = resource_variable_ops.ResourceVariable(
+      asset_path_initializer)
+  asset_info.asset_filename_map[path] = original_path
+  asset_def = meta_graph_pb2.AssetFileDef()
+  asset_def.filename = path
+  asset_def.tensor_info.name = asset_path_initializer.name
+  asset_info.asset_defs.append(asset_def)
+  asset_info.asset_initializers_by_resource[original_variable.handle] = (
+      asset_variable.initializer)
+  asset_info.asset_index[trackable_asset] = len(asset_info.asset_defs) - 1
+  resource_map[original_variable.handle] = asset_variable.handle
+
+
+def _map_resources(accessible_objects):
+  """Makes new resource handle ops corresponding to existing resource tensors.
+
+  Creates resource handle ops in the current default graph, whereas
+  `accessible_objects` will be from an eager context. Resource mapping adds
+  resource handle ops to the main GraphDef of a SavedModel, which allows the C++
+  loader API to interact with variables.
+
+  Args:
+    accessible_objects: A list of objects, some of which may contain resources,
+      to create replacements for.
+
+  Returns:
+    A tuple of (object_map, resource_map, asset_info):
+      object_map: A dictionary mapping from object in `accessible_objects` to
+        replacement objects created to hold the new resource tensors.
+      resource_map: A dictionary mapping from resource tensors extracted from
+        `accessible_objects` to newly created resource tensors.
+      asset_info: An _AssetInfo tuple describing external assets referenced from
+        accessible_objects.
+  """
+  # TODO(allenl): Handle MirroredVariables and other types of variables which
+  # may need special casing.
+  object_map = {}
+  resource_map = {}
+  asset_info = _AssetInfo(
+      asset_defs=[],
+      asset_initializers_by_resource={},
+      asset_filename_map={},
+      asset_index={})
+  for obj in accessible_objects:
+    if isinstance(obj, tracking.TrackableResource):
+      new_resource = obj.create_resource()
+      resource_map[obj.resource_handle] = new_resource
+    elif resource_variable_ops.is_resource_variable(obj):
+      new_variable = resource_variable_ops.copy_to_graph_uninitialized(obj)
+      object_map[obj] = new_variable
+      resource_map[obj.handle] = new_variable.handle
+    elif isinstance(obj, tracking.TrackableAsset):
+      _process_asset(obj, asset_info, resource_map)
+  return object_map, resource_map, asset_info
+
+
+def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
+                         object_saver):
+  """Generates a MetaGraph which calls `signature_functions`.
+
+  Args:
+    meta_graph_def: The MetaGraphDef proto to fill.
+    obj: The checkpointable object being exported.
+    signature_functions: A dictionary mapping signature keys to concrete
+      functions containing signatures to add to the MetaGraph.
+    object_saver: A CheckpointableSaver to add to the MetaGraph.
+
+  Returns:
+    An _AssetInfo, which contains information to help creating the SavedModel.
+  """
+  signatures = {}
+  # List objects from the eager context to make sure Optimizers give us the
+  # right Graph-dependent variables.
+  accessible_objects = util.list_objects(obj)
+  resource_initializer_functions = _trace_resource_initializers(
+      accessible_objects)
+  exported_graph = ops.Graph()
+  resource_initializer_ops = []
+  with exported_graph.as_default():
+    object_map, resource_map, asset_info = _map_resources(accessible_objects)
+    for resource_initializer_function in resource_initializer_functions:
+      asset_dependencies = []
+      for capture in resource_initializer_function.graph.external_captures:
+        asset_initializer = asset_info.asset_initializers_by_resource.get(
+            capture, None)
+        if asset_initializer is not None:
+          asset_dependencies.append(asset_initializer)
+      with ops.control_dependencies(asset_dependencies):
+        resource_initializer_ops.append(
+            _call_function_with_mapped_captures(
+                resource_initializer_function, [], resource_map))
+    with ops.control_dependencies(resource_initializer_ops):
+      init_op = control_flow_ops.no_op()
+    # Add the same op to the main_op collection and to the init_op
+    # signature. The collection is for compatibility with older loader APIs;
+    # only one will be executed.
+    meta_graph_def.collection_def[constants.MAIN_OP_KEY].node_list.value.append(
+        init_op.name)
+    meta_graph_def.signature_def[constants.INIT_OP_SIGNATURE_KEY].CopyFrom(
+        signature_def_utils.op_signature_def(
+            init_op, constants.INIT_OP_SIGNATURE_KEY))
+
+  # Saving an object-based checkpoint again gathers variables. We need to do the
+  # gathering from the eager context so Optimizers save the right set of
+  # variables, but want any operations associated with the save/restore to be in
+  # the exported graph (thus the `to_graph` argument).
+  saver = object_saver.freeze(object_map=object_map, to_graph=exported_graph)
+  with exported_graph.as_default():
+    signatures = _generate_signatures(signature_functions, resource_map)
+    saver_def = saver.to_proto()
+    meta_graph_def.saver_def.CopyFrom(saver_def)
+  graph_def = exported_graph.as_graph_def(add_shapes=True)
+  # Clean reference cycles so repeated export()s don't make work for the garbage
+  # collector.
+  ops.dismantle_graph(exported_graph)
+
+  meta_graph_def.graph_def.CopyFrom(graph_def)
+  meta_graph_def.meta_info_def.tags.append(tag_constants.SERVING)
+  meta_graph_def.asset_file_def.extend(asset_info.asset_defs)
+  for signature_key, signature in signatures.items():
+    meta_graph_def.signature_def[signature_key].CopyFrom(signature)
+  meta_graph.strip_graph_default_valued_attrs(meta_graph_def)
+  return asset_info
+
+
+def _write_object_graph(root, export_dir, asset_file_def_index):
+  """Save a SavedObjectGraph proto for `root`."""
+  # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
+  # checkpoint. It will eventually go into the SavedModel.
+  proto = saved_object_graph_pb2.SavedObjectGraph()
+
+  checkpointable_objects, node_ids, slot_variables = util.find_objects(root)
+  util.fill_object_graph_proto(checkpointable_objects, node_ids, slot_variables,
+                               proto)
+
+  for obj, obj_proto in zip(checkpointable_objects, proto.nodes):
+    _write_object_proto(obj, obj_proto, asset_file_def_index)
+
+  extra_asset_dir = os.path.join(
+      compat.as_bytes(export_dir),
+      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY))
+  file_io.recursive_create_dir(extra_asset_dir)
+  object_graph_filename = os.path.join(
+      extra_asset_dir, compat.as_bytes("object_graph.pb"))
+  file_io.write_string_to_file(object_graph_filename, proto.SerializeToString())
+
+
+def _write_object_proto(obj, proto, asset_file_def_index):
+  """Saves an object into SavedObject proto."""
+  if isinstance(obj, tracking.TrackableAsset):
+    proto.asset.SetInParent()
+    proto.asset.asset_file_def_index = asset_file_def_index[obj]
+  else:
+    proto.user_object.SetInParent()
+
+
+@tf_export("saved_model.save",
+           v1=["saved_model.save", "saved_model.experimental.save"])
+def save(obj, export_dir, signatures=None):
+  # pylint: disable=line-too-long
+  """Exports the Checkpointable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
+
+  Example usage:
+
+  ```python
+  class Adder(tf.train.Checkpoint):
+
+    @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])
+    def add(self, x):
+      return x + x + 1.
+
+  to_export = Adder()
+  tf.saved_model.save(to_export, '/tmp/adder')
+  ```
+
+  The resulting SavedModel is then servable with an input named "x", its value
+  having any shape and dtype float32.
+
+  The optional `signatures` argument controls which methods in `obj` will be
+  available to programs which consume `SavedModel`s, for example serving
+  APIs. Python functions may be decorated with
+  `@tf.function(input_signature=...)` and passed as signatures directly, or
+  lazily with a call to `get_concrete_function` on the method decorated with
+  `@tf.function`.
+
+  If the `signatures` argument is omitted, `obj` will be searched for
+  `@tf.function`-decorated methods. If exactly one `@tf.function` is found, that
+  method will be used as the default signature for the SavedModel. This behavior
+  is expected to change in the future, when a corresponding
+  `tf.saved_model.load` symbol is added. At that point signatures will be
+  completely optional, and any `@tf.function` attached to `obj` or its
+  dependencies will be exported for use with `load`.
+
+  When invoking a signature in an exported SavedModel, `Tensor` arguments are
+  identified by name. These names will come from the Python function's argument
+  names by default. They may be overridden by specifying a `name=...` argument
+  in the corresponding `tf.TensorSpec` object. Explicit naming is required if
+  multiple `Tensor`s are passed through a single argument to the Python
+  function.
+
+  The outputs of functions used as `signatures` must either be flat lists, in
+  which case outputs will be numbered, or a dictionary mapping string keys to
+  `Tensor`, in which case the keys will be used to name outputs.
+
+  Since `tf.keras.Model` objects are also Checkpointable, this function can be
+  used to export Keras models. For example, exporting with a signature
+  specified:
+
+  ```python
+  class Model(tf.keras.Model):
+
+    @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.string)])
+    def serve(self, serialized):
+      ...
+
+  m = Model()
+  tf.saved_model.save(m, '/tmp/saved_model/')
+  ```
+
+  Exporting from a function without a fixed signature:
+
+  ```python
+  class Model(tf.keras.Model):
+
+    @tf.function
+    def call(self, x):
+      ...
+
+  m = Model()
+  tf.saved_model.save(
+      m, '/tmp/saved_model/',
+      signatures=m.call.get_concrete_function(
+          tf.TensorSpec(shape=[None, 3], dtype=tf.float32, name="inp")))
+  ```
+
+  `tf.keras.Model` instances constructed from inputs and outputs already have a
+  signature and so do not require a `@tf.function` decorator or a `signatures`
+  argument. If neither are specified, the model's forward pass is exported.
+
+  ```python
+  x = input_layer.Input((4,), name="x")
+  y = core.Dense(5, name="out")(x)
+  model = training.Model(x, y)
+  tf.saved_model.save(model, '/tmp/saved_model/')
+  # The exported SavedModel takes "x" with shape [None, 4] and returns "out"
+  # with shape [None, 5]
+  ```
+
+  Variables must be tracked by assigning them to an attribute of a tracked
+  object or to an attribute of `obj` directly. TensorFlow objects (e.g. layers
+  from `tf.keras.layers`, optimizers from `tf.train`) track their variables
+  automatically. This is the same tracking scheme that `tf.train.Checkpoint`
+  uses, and an exported `Checkpoint` object may be restored as a training
+  checkpoint by pointing `tf.train.Checkpoint.restore` to the SavedModel's
+  "variables/" subdirectory. Currently variables are the only stateful objects
+  supported by `tf.saved_model.save`, but others (e.g. tables) will be supported
+  in the future.
+
+  `tf.function` does not hard-code device annotations from outside the function
+  body, instead using the calling context's device. This means for example that
+  exporting a model which runs on a GPU and serving it on a CPU will generally
+  work, with some exceptions. `tf.device` annotations inside the body of the
+  function will be hard-coded in the exported model; this type of annotation is
+  discouraged. Device-specific operations, e.g. with "cuDNN" in the name or with
+  device-specific layouts, may cause issues. Currently a `DistributionStrategy`
+  is another exception: active distribution strategies will cause device
+  placements to be hard-coded in a function. Exporting a single-device
+  computation and importing under a `DistributionStrategy` is not currently
+  supported, but may be in the future.
+
+  SavedModels exported with `tf.saved_model.save` [strip default-valued
+  attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes)
+  automatically, which removes one source of incompatibilities when the consumer
+  of a SavedModel is running an older TensorFlow version than the
+  producer. There are however other sources of incompatibilities which are not
+  handled automatically, such as when the exported model contains operations
+  which the consumer does not have definitions for.
+
+  The current implementation of `tf.saved_model.save` targets serving use-cases,
+  but omits information which will be necessary for the planned future
+  implementation of `tf.saved_model.load`. Exported models using the current
+  `save` implementation, and other existing SavedModels, will not be compatible
+  with `tf.saved_model.load` when it is implemented. Further, `save` will in the
+  future attempt to export `@tf.function`-decorated methods which it does not
+  currently inspect, so some objects which are exportable today will raise
+  exceptions on export in the future (e.g. due to complex/non-serializable
+  default arguments). Such backwards-incompatible API changes are expected only
+  prior to the TensorFlow 2.0 release.
+
+  Args:
+    obj: A checkpointable object to export.
+    export_dir: A directory in which to write the SavedModel.
+    signatures: Optional, either a `tf.function` with an input signature
+      specified or the result of `f.get_concrete_function` on a
+      `@tf.function`-decorated function `f`, in which case `f` will be used to
+      generate a signature for the SavedModel under the default serving
+      signature key. `signatures` may also be a dictionary, in which case it
+      maps from signature keys to either `tf.function` instances with input
+      signatures or concrete functions. The keys of such a dictionary may be
+      arbitrary strings, but will typically be from the
+      `tf.saved_model.signature_constants` module.
+
+  Raises:
+    ValueError: If `obj` is not checkpointable.
+  """
+  # pylint: enable=line-too-long
+  if not isinstance(obj, base.CheckpointableBase):
+    raise ValueError(
+        "Expected a Checkpointable object for export, got {}.".format(obj))
+  if signatures is None:
+    # Note that we run this before saving the checkpoint, since looping over
+    # attributes may have the side effect of creating variables in some cases.
+    signatures = _find_function_to_export(obj)
+
+  signatures = _canonicalize_signatures(signatures)
+  # TODO(allenl): Factor out some subset of SavedModelBuilder which is 2.x
+  # compatible (no sessions) and share it with this export API rather than
+  # making a SavedModel proto and writing it directly.
+  saved_model = saved_model_pb2.SavedModel()
+  meta_graph_def = saved_model.meta_graphs.add()
+  object_saver = util.CheckpointableSaver(obj)
+  asset_info = _fill_meta_graph_def(
+      meta_graph_def, obj, signatures, object_saver)
+  saved_model.saved_model_schema_version = (
+      constants.SAVED_MODEL_SCHEMA_VERSION)
+  # So far we've just been generating protocol buffers with no I/O. Now we write
+  # the checkpoint, copy assets into the assets directory, and write out the
+  # SavedModel proto itself.
+  utils_impl.get_or_create_variables_dir(export_dir)
+  object_saver.save(utils_impl.get_variables_path(export_dir))
+  builder_impl.copy_assets_to_destination_dir(asset_info.asset_filename_map,
+                                              export_dir)
+  path = os.path.join(
+      compat.as_bytes(export_dir),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
+  file_io.write_string_to_file(path, saved_model.SerializeToString())
+  _write_object_graph(obj, export_dir, asset_info.asset_index)
diff --git a/tensorflow/python/saved_model/save_test.py b/tensorflow/python/saved_model/save_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..97218a98eae38decc9c296a420074b7d4ec1f5e3
--- /dev/null
+++ b/tensorflow/python/saved_model/save_test.py
@@ -0,0 +1,389 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for checkpointable object SavedModel save."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+import numpy
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import input_layer
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.layers import merge
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import save
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import adam
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util
+
+
+class _ModelWithOptimizer(training.Model):
+
+  def __init__(self):
+    super(_ModelWithOptimizer, self).__init__()
+    self.dense = core.Dense(1)
+    self.optimizer = adam.AdamOptimizer(0.01)
+
+  @def_function.function(
+      input_signature=(tensor_spec.TensorSpec([None, 2], dtypes.float32),
+                       tensor_spec.TensorSpec([None], dtypes.float32)))
+  def call(self, x, y):
+    with backprop.GradientTape() as tape:
+      loss = math_ops.reduce_mean((self.dense(x) - y) ** 2.)
+    trainable_variables = self.trainable_variables
+    gradients = tape.gradient(loss, trainable_variables)
+    self.optimizer.apply_gradients(zip(gradients, trainable_variables))
+    return {"loss": loss}
+
+
+def _import_and_infer(
+    save_dir, inputs,
+    signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY):
+  """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
+  graph = ops.Graph()
+  with graph.as_default(), session_lib.Session() as session:
+    model = loader.load(session, [tag_constants.SERVING], save_dir)
+    signature = model.signature_def[signature_key]
+    assert set(inputs.keys()) == set(signature.inputs.keys())
+    feed_dict = {}
+    for arg_name in inputs.keys():
+      feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = (
+          inputs[arg_name])
+    output_dict = {}
+    for output_name, output_tensor_info in signature.outputs.items():
+      output_dict[output_name] = graph.get_tensor_by_name(
+          output_tensor_info.name)
+    return session.run(output_dict, feed_dict=feed_dict)
+
+
+class SaveTest(test.TestCase):
+
+  def test_method_save_signature(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.f(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir, root.f)
+    self.assertEqual(
+        {"output_0": 2.},
+        _import_and_infer(save_dir, {"x": 1.}))
+
+  def test_method_save_concrete(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda z: {"out": 2. * z})
+    root.f(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(
+        root,
+        save_dir,
+        {"non_default_key": root.f.get_concrete_function(
+            tensor_spec.TensorSpec(None, dtypes.float32))})
+    self.assertEqual(
+        {"out": 2.},
+        _import_and_infer(
+            save_dir, {"z": 1.}, signature_key="non_default_key"))
+
+  def test_non_concrete_error(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(lambda x: 2. * x)
+    root.f(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    with self.assertRaisesRegexp(
+        ValueError, "must be converted to concrete functions"):
+      save.save(root, save_dir, root.f)
+
+  def test_nested_inputs(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x[0],
+        input_signature=([tensor_spec.TensorSpec(None, dtypes.float32),
+                          tensor_spec.TensorSpec(None, dtypes.float32)],))
+    root.f([constant_op.constant(1.), constant_op.constant(1.)])
+    # Concrete functions must always have uniquely named Tensor inputs. Save
+    # relies on this.
+    with self.assertRaisesRegexp(
+        ValueError, "two arguments named 'x'"):
+      root.f.get_concrete_function()
+
+  def test_nested_outputs(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(lambda x: (2. * x, (3. * x, 4. * x)))
+    root.f(constant_op.constant(1.))
+    to_save = root.f.get_concrete_function(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    with self.assertRaisesRegexp(
+        ValueError, "non-flat outputs"):
+      save.save(root, save_dir, to_save)
+
+  def test_nested_dict_outputs(self):
+    root = util.Checkpoint(
+        f=def_function.function(
+            lambda x: {"a": 2. * x, "b": (3. * x, 4. * x)}))
+    root.f(constant_op.constant(1.))
+    to_save = root.f.get_concrete_function(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    with self.assertRaisesRegexp(
+        ValueError, "dictionary containing non-Tensor value"):
+      save.save(root, save_dir, to_save)
+
+  def test_variable(self):
+    root = tracking.Checkpointable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(
+        lambda x: root.v1 * root.v2 * x)
+    root.f(constant_op.constant(1.))
+    to_save = root.f.get_concrete_function(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir, to_save)
+    self.assertAllEqual({"output_0": 12.},
+                        _import_and_infer(save_dir, {"x": 2.}))
+
+  def test_optimizer(self):
+    x = constant_op.constant([[3., 4.]])
+    y = constant_op.constant([2.])
+    model = _ModelWithOptimizer()
+    first_loss = model(x, y)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir, model.call)
+    second_loss = model(x, y)
+    self.assertNotEqual(first_loss, second_loss)
+    self.assertAllClose(
+        second_loss,
+        _import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]}))
+
+  def test_trivial_save_exception(self):
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    with self.assertRaisesRegexp(ValueError, "signature"):
+      save.save(tracking.Checkpointable(), save_dir)
+
+  def test_single_method_default_signature(self):
+    model = _ModelWithOptimizer()
+    x = constant_op.constant([[3., 4.]])
+    y = constant_op.constant([2.])
+    model(x, y)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    self.assertIn("loss",
+                  _import_and_infer(save_dir,
+                                    {"x": [[3., 4.]], "y": [2.]}))
+
+  def test_single_function_default_signature(self):
+    model = tracking.Checkpointable()
+    model.f = def_function.function(lambda: 3., input_signature=())
+    model.f()
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    self.assertAllClose({"output_0": 3.},
+                        _import_and_infer(save_dir, {}))
+
+  def test_ambiguous_signatures(self):
+    model = _ModelWithOptimizer()
+    x = constant_op.constant([[3., 4.]])
+    y = constant_op.constant([2.])
+    model(x, y)
+    model.second_function = def_function.function(lambda: 1.)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    with self.assertRaisesRegexp(ValueError, "call.*second_function"):
+      save.save(model, save_dir)
+
+  def test_subclassed_no_signature(self):
+
+    class Subclassed(training.Model):
+
+      def call(self, inputs):
+        return inputs * 2.
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    model = Subclassed()
+    with self.assertRaisesRegexp(
+        ValueError, "no @tf.function-decorated methods"):
+      save.save(model, save_dir)
+
+  def test_docstring(self):
+
+    class Adder(util.Checkpoint):
+
+      @def_function.function(input_signature=[tensor_spec.TensorSpec(
+          shape=None, dtype=dtypes.float32)])
+      def add(self, x):
+        return x + x + 1.
+
+    to_save = Adder()
+    to_save.add(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(to_save, save_dir)
+    self.assertAllClose({"output_0": 7.},
+                        _import_and_infer(save_dir, {"x": 3.}))
+
+  def test_default_attr_stripping(self):
+
+    class Complex(util.Checkpoint):
+
+      @def_function.function(input_signature=[])
+      def __call__(self):
+        return math_ops.complex(
+            constant_op.constant(1.),
+            constant_op.constant(2.),
+            name="complex")
+
+    to_save = Complex()
+    to_save()
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(to_save, save_dir)
+    graph = ops.Graph()
+    with graph.as_default(), self.session(graph) as session:
+      loader.load(session, [tag_constants.SERVING], save_dir)
+      func, = graph._functions.values()
+      complex_node, = [
+          node for node in func.definition.node_def if node.op == "Complex"]
+      self.assertNotIn("T", complex_node.attr)
+      self.assertNotIn("Tout", complex_node.attr)
+
+  def test_export_functional_keras_model(self):
+    x = input_layer.Input((4,), name="x")
+    y = core.Dense(4, name="out")(x)
+    model = training.Model(x, y)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    self.assertAllClose(
+        {"out": model(array_ops.ones([1, 4]))},
+        _import_and_infer(save_dir, {"x": [[1., 1., 1., 1.]]}))
+
+  @test_util.run_v1_only("b/120545219")
+  def test_export_functional_keras_model_after_fit(self):
+    x = input_layer.Input((1,))
+    y = core.Dense(1, name="y")(x)
+    model = training.Model(x, y)
+    model.compile(optimizer="sgd", loss="mse")
+    model.fit(x=numpy.array([[1.]]),
+              y=numpy.array([2.]), epochs=2)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    self.assertAllClose(
+        {"y": model(constant_op.constant([[1.], [2.]]))},
+        _import_and_infer(save_dir, {"input_1": [[1.], [2.]]}))
+
+  def test_export_multi_input_functional_keras_model(self):
+    x1 = input_layer.Input((2,), name="x1")
+    x2 = input_layer.Input((2,), name="x2")
+    y1 = core.Dense(4)(merge.Add()([x1, x2]))
+    y2 = core.Dense(4)(merge.Multiply()([x1, x2]))
+    model = training.Model([x1, x2], [y1, y2])
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    outputs = model([array_ops.ones([1, 2]), 2. * array_ops.ones([1, 2])])
+    self.assertAllClose(
+        {"dense": outputs[0], "dense_1": outputs[1]},
+        _import_and_infer(
+            save_dir,
+            {"x1": [[1., 1.]],
+             "x2": [[2., 2.]]}))
+
+
+class AssetTests(test.TestCase):
+
+  def setUp(self):
+    super(AssetTests, self).setUp()
+    self._vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
+    with open(self._vocab_path, "w") as f:
+      f.write("alpha\nbeta\ngamma\n")
+
+  def test_table(self):
+    initializer = lookup_ops.TextFileInitializer(
+        self._vocab_path,
+        key_dtype=dtypes.string,
+        key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
+        value_dtype=dtypes.int64,
+        value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
+    root = util.Checkpoint(table=lookup_ops.HashTable(
+        initializer, default_value=-1))
+    root.table_user = def_function.function(
+        root.table.lookup,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
+    self.assertEqual(
+        2,
+        self.evaluate(root.table_user(constant_op.constant("gamma"))))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir)
+    file_io.delete_file(self._vocab_path)
+    self.assertAllClose(
+        {"output_0": [2, 0]},
+        _import_and_infer(save_dir, {"keys": ["gamma", "alpha"]}))
+    second_dir = os.path.join(self.get_temp_dir(), "second_dir")
+    # Asset paths should track the location the SavedModel is loaded from.
+    file_io.rename(save_dir, second_dir)
+    self.assertAllClose(
+        {"output_0": [2, 1]},
+        _import_and_infer(second_dir, {"keys": ["gamma", "beta"]}))
+
+  def test_unused_asset(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.asset = tracking.TrackableAsset(self._vocab_path)
+
+    export_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, export_dir)
+    self.assertAllClose(
+        {"output_0": [0.2]},
+        _import_and_infer(export_dir, {"x": [0.1]}))
+
+
+class MemoryTests(test.TestCase):
+
+  def setUp(self):
+    self._model = _ModelWithOptimizer()
+
+  @test_util.assert_no_garbage_created
+  def test_no_reference_cycles(self):
+    x = constant_op.constant([[3., 4.]])
+    y = constant_op.constant([2.])
+    self._model(x, y)
+    if sys.version_info[0] < 3:
+      # TODO(allenl): debug reference cycles in Python 2.x
+      self.skipTest("This test only works in Python 3+. Reference cycles are "
+                    "created in older Python versions.")
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(self._model, save_dir, self._model.call)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/saved_model.py b/tensorflow/python/saved_model/saved_model.py
index 6702c99607136475cdf096f863ccd0bbddd57845..fcde6b47e4ff10dbd84801e08597591a10818d51 100644
--- a/tensorflow/python/saved_model/saved_model.py
+++ b/tensorflow/python/saved_model/saved_model.py
@@ -29,8 +29,8 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils
+from tensorflow.python.saved_model.save import save
 # pylint: enable=unused-import
 # pylint: disable=wildcard-import
 from tensorflow.python.saved_model.simple_save import *
 # pylint: enable=wildcard-import
-
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 80b75b7ee65031a0afcfd4123bfb56a0d29a4b22..8d94c7c989d12df965bd5cc5954d30972238ff3c 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -54,15 +54,15 @@ def tearDownModule():
   file_io.delete_recursively(test.get_temp_dir())
 
 
-class SavedModelTest(test.TestCase):
+class SavedModelTestBase(test.TestCase):
 
   def _get_export_dir(self, label):
     return os.path.join(test.get_temp_dir(), label)
 
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.VariableV1(variable_value, name=variable_name)
-    sess.run(variables.global_variables_initializer())
-    self.assertEqual(variable_value, v.eval())
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(variable_value, self.evaluate(v))
 
   def _build_asset_collection(self, asset_file_name, asset_file_contents,
                               asset_file_tensor_name, asset_subdir=""):
@@ -78,14 +78,16 @@ class SavedModelTest(test.TestCase):
     asset_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
     return asset_collection
 
-  def _validate_asset_collection(self, export_dir, graph_collection_def,
-                                 expected_asset_file_name,
-                                 expected_asset_file_contents,
-                                 expected_asset_tensor_name,
-                                 asset_id=0):
-    assets_any = graph_collection_def[constants.ASSETS_KEY].any_list.value
-    asset = meta_graph_pb2.AssetFileDef()
-    assets_any[asset_id].Unpack(asset)
+
+class SavedModelTest(SavedModelTestBase):
+
+  def _validate_assets(self,
+                       export_dir,
+                       asset_file_def,
+                       expected_asset_file_name,
+                       expected_asset_file_contents,
+                       expected_asset_tensor_name,
+                       asset_id=0):
     assets_path = os.path.join(
         compat.as_bytes(export_dir),
         compat.as_bytes(constants.ASSETS_DIRECTORY),
@@ -93,8 +95,10 @@ class SavedModelTest(test.TestCase):
     actual_asset_contents = file_io.read_file_to_string(assets_path)
     self.assertEqual(expected_asset_file_contents,
                      compat.as_text(actual_asset_contents))
-    self.assertEqual(expected_asset_file_name, asset.filename)
-    self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
+    self.assertEqual(expected_asset_file_name,
+                     asset_file_def[asset_id].filename)
+    self.assertEqual(expected_asset_tensor_name,
+                     asset_file_def[asset_id].tensor_info.name)
 
   def _validate_inputs_tensor_info_fail(self, builder, tensor_info):
     with self.session(graph=ops.Graph()) as sess:
@@ -142,6 +146,18 @@ class SavedModelTest(test.TestCase):
           sess, ["foo"],
           signature_def_map={"foo_key": foo_signature})
 
+  def _validate_sig_def_keys(self, builder, valid_tensor_info, invalid_key):
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+
+      foo_signature = signature_def_utils.build_signature_def(
+          dict(), {"foo_key": valid_tensor_info}, "foo")
+      self.assertRaises(
+          KeyError,
+          builder.add_meta_graph_and_variables,
+          sess, ["foo"],
+          signature_def_map={invalid_key: foo_signature})
+
   def testMaybeSavedModelDir(self):
     base_path = test.test_src_dir_path("/python/saved_model")
     self.assertFalse(loader.maybe_saved_model_directory(base_path))
@@ -183,9 +199,10 @@ class SavedModelTest(test.TestCase):
                                    constants.SAVED_MODEL_FILENAME_PBTXT):
         loader.load(sess, ["foo"], export_dir)
 
+  @test_util.run_deprecated_v1
   def testVerifySessionGraphUsage(self):
     export_dir = self._get_export_dir("test_verify_session_graph_usage")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
@@ -203,9 +220,10 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+  @test_util.run_deprecated_v1
   def testSequence(self):
     export_dir = self._get_export_dir("test_sequence")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Expect an assertion error since add_meta_graph_and_variables() should be
     # invoked before any add_meta_graph() calls.
@@ -220,9 +238,10 @@ class SavedModelTest(test.TestCase):
       self.assertRaises(AssertionError, builder.add_meta_graph_and_variables,
                         sess, ["baz"])
 
+  @test_util.run_deprecated_v1
   def testTags(self):
     export_dir = self._get_export_dir("test_tags")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
@@ -309,9 +328,10 @@ class SavedModelTest(test.TestCase):
       self.assertRaises(RuntimeError, loader.load, sess, ["foo", "baz"],
                         export_dir)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariables(self):
     export_dir = self._get_export_dir("test_variables")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with two variables. SavedModel invoked to:
     # - add with weights.
@@ -361,9 +381,10 @@ class SavedModelTest(test.TestCase):
       self.assertRaises(errors.NotFoundError, loader.load, sess, ["baz"],
                         export_dir)
 
+  @test_util.run_deprecated_v1
   def testGraphWithoutVariables(self):
     export_dir = self._get_export_dir("test_graph_has_variables")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with no variables.
     with self.session(graph=ops.Graph()) as sess:
@@ -385,7 +406,7 @@ class SavedModelTest(test.TestCase):
       a = ops.get_default_graph().get_tensor_by_name(constant_5_name)
       b = constant_op.constant(6.0)
       c = a * b
-      self.assertEqual(30.0, sess.run(c))
+      self.assertEqual(30.0, self.evaluate(c))
 
     # Restore the graph with tag "bar".
     with self.session(graph=ops.Graph()) as sess:
@@ -394,11 +415,12 @@ class SavedModelTest(test.TestCase):
       a = ops.get_default_graph().get_tensor_by_name(constant_6_name)
       b = constant_op.constant(5.0)
       c = a * b
-      self.assertEqual(30.0, sess.run(c))
+      self.assertEqual(30.0, self.evaluate(c))
 
+  @test_util.run_deprecated_v1
   def testNoOverwrite(self):
     export_dir = self._get_export_dir("test_no_overwrite")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
@@ -417,12 +439,13 @@ class SavedModelTest(test.TestCase):
 
     # An attempt to create another builder with the same export directory should
     # result in an assertion error.
-    self.assertRaises(AssertionError, saved_model_builder.SavedModelBuilder,
+    self.assertRaises(AssertionError, saved_model_builder._SavedModelBuilder,
                       export_dir)
 
+  @test_util.run_deprecated_v1
   def testSaveAsText(self):
     export_dir = self._get_export_dir("test_astext")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
@@ -451,17 +474,18 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testCollections(self):
     export_dir = self._get_export_dir("test_collections")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable added to a collection. SavedModel invoked to:
     # - add with weights.
     with self.session(graph=ops.Graph()) as sess:
       v = variables.VariableV1(42, name="v")
       ops.add_to_collection("foo_vars", v)
-      sess.run(variables.global_variables_initializer())
-      self.assertEqual(42, v.eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertEqual(42, self.evaluate(v))
       builder.add_meta_graph_and_variables(sess, ["foo"])
 
     # Graph with the same single variable added to a different collection.
@@ -470,8 +494,8 @@ class SavedModelTest(test.TestCase):
     with self.session(graph=ops.Graph()) as sess:
       v = variables.VariableV1(43, name="v")
       ops.add_to_collection("bar_vars", v)
-      sess.run(variables.global_variables_initializer())
-      self.assertEqual(43, v.eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertEqual(43, self.evaluate(v))
       builder.add_meta_graph(["bar"])
 
     # Save the SavedModel to disk.
@@ -501,9 +525,10 @@ class SavedModelTest(test.TestCase):
 
       self.assertEqual(len(ops.get_collection("foo_vars")), 0)
 
+  @test_util.run_deprecated_v1
   def testSignatureDefs(self):
     export_dir = self._get_export_dir("test_signature_defs")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable and a single entry in the signature def map.
     # SavedModel is invoked to add with weights.
@@ -563,7 +588,7 @@ class SavedModelTest(test.TestCase):
 
   def testSignatureDefValidationFails(self):
     export_dir = self._get_export_dir("test_signature_def_validation_fail")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     tensor_without_encoding = meta_graph_pb2.TensorInfo()
     tensor_without_encoding.dtype = types_pb2.DT_FLOAT
@@ -579,19 +604,30 @@ class SavedModelTest(test.TestCase):
     self._validate_inputs_tensor_info_fail(builder, tensor_empty)
     self._validate_outputs_tensor_info_fail(builder, tensor_empty)
 
+    valid_tensor_info = meta_graph_pb2.TensorInfo()
+    valid_tensor_info.name = "foo"
+    valid_tensor_info.dtype = types_pb2.DT_FLOAT
+
+    self._validate_sig_def_keys(builder, valid_tensor_info,
+                                constants.INIT_OP_SIGNATURE_KEY)
+    self._validate_sig_def_keys(builder, valid_tensor_info,
+                                constants.TRAIN_OP_SIGNATURE_KEY)
+
+  @test_util.run_deprecated_v1
   def testSignatureDefValidationSucceedsWithName(self):
     tensor_with_name = meta_graph_pb2.TensorInfo()
     tensor_with_name.name = "foo"
     tensor_with_name.dtype = types_pb2.DT_FLOAT
 
     export_dir = self._get_export_dir("test_signature_def_validation_name_1")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_inputs_tensor_info_accept(builder, tensor_with_name)
 
     export_dir = self._get_export_dir("test_signature_def_validation_name_2")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_outputs_tensor_info_accept(builder, tensor_with_name)
 
+  @test_util.run_deprecated_v1
   def testSignatureDefValidationSucceedsWithCoo(self):
     tensor_with_coo = meta_graph_pb2.TensorInfo()
     # TODO(soergel) test validation of each of the fields of coo_sparse
@@ -599,16 +635,17 @@ class SavedModelTest(test.TestCase):
     tensor_with_coo.dtype = types_pb2.DT_FLOAT
 
     export_dir = self._get_export_dir("test_signature_def_validation_coo_1")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_inputs_tensor_info_accept(builder, tensor_with_coo)
 
     export_dir = self._get_export_dir("test_signature_def_validation_coo_2")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_outputs_tensor_info_accept(builder, tensor_with_coo)
 
+  @test_util.run_deprecated_v1
   def testAssets(self):
     export_dir = self._get_export_dir("test_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
@@ -618,145 +655,151 @@ class SavedModelTest(test.TestCase):
           compat.as_bytes(test.get_temp_dir()), compat.as_bytes("ignored.txt"))
       file_io.write_string_to_file(ignored_filepath, "will be ignored")
 
-      asset_collection = self._build_asset_collection("hello42.txt",
-                                                      "foo bar baz",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("hello42.txt", "foo bar baz",
+                                                "asset_file_tensor")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz", "asset_file_tensor:0")
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
           compat.as_bytes("ignored.txt"))
       self.assertFalse(file_io.file_exists(ignored_asset_path))
 
+  @test_util.run_deprecated_v1
   def testAssetsNameCollisionDiffFile(self):
     export_dir = self._get_export_dir("test_assets_name_collision_diff_file")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar bak", "asset_file_tensor",
-          asset_subdir="1")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar bak", "asset_file_tensor", asset_subdir="1")
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor_1",
-          asset_subdir="2")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor_1", asset_subdir="2")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar bak",
-                                      "asset_file_tensor:0")
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt_1", "foo bar baz",
-                                      "asset_file_tensor_1:0",
-                                      asset_id=1)
-
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar bak", "asset_file_tensor:0")
+      self._validate_assets(
+          export_dir,
+          foo_graph.asset_file_def,
+          "hello42.txt_1",
+          "foo bar baz",
+          "asset_file_tensor_1:0",
+          asset_id=1)
+
+  @test_util.run_deprecated_v1
   def testAssetsNameCollisionSameFilepath(self):
     export_dir = self._get_export_dir("test_assets_name_collision_same_path")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor")
+      asset_list = self._build_asset_collection("hello42.txt", "foo bar baz",
+                                                "asset_file_tensor")
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor_1")
+      asset_list = self._build_asset_collection("hello42.txt", "foo bar baz",
+                                                "asset_file_tensor_1")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz", "asset_file_tensor:0")
       # The second tensor should be recorded, but the same.
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor_1:0",
-                                      asset_id=1)
+      self._validate_assets(
+          export_dir,
+          foo_graph.asset_file_def,
+          "hello42.txt",
+          "foo bar baz",
+          "asset_file_tensor_1:0",
+          asset_id=1)
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
           compat.as_bytes("hello42.txt_1"))
       self.assertFalse(file_io.file_exists(ignored_asset_path))
 
+  @test_util.run_deprecated_v1
   def testAssetsNameCollisionSameFile(self):
     export_dir = self._get_export_dir("test_assets_name_collision_same_file")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor",
-          asset_subdir="1")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor", asset_subdir="1")
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor_1",
-          asset_subdir="2")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor_1", asset_subdir="2")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz", "asset_file_tensor:0")
       # The second tensor should be recorded, but the same.
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor_1:0",
-                                      asset_id=1)
+      self._validate_assets(
+          export_dir,
+          foo_graph.asset_file_def,
+          "hello42.txt",
+          "foo bar baz",
+          "asset_file_tensor_1:0",
+          asset_id=1)
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
           compat.as_bytes("hello42.txt_1"))
       self.assertFalse(file_io.file_exists(ignored_asset_path))
 
+  @test_util.run_deprecated_v1
   def testAssetsNameCollisionManyFiles(self):
     export_dir = self._get_export_dir("test_assets_name_collision_many_files")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       for i in range(5):
         idx = str(i)
-        asset_collection = self._build_asset_collection(
-            "hello42.txt", "foo bar baz " + idx, "asset_file_tensor_" + idx,
+        asset_list = self._build_asset_collection(
+            "hello42.txt",
+            "foo bar baz " + idx,
+            "asset_file_tensor_" + idx,
             asset_subdir=idx)
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -765,18 +808,21 @@ class SavedModelTest(test.TestCase):
       foo_graph = loader.load(sess, ["foo"], export_dir)
       for i in range(1, 5):
         idx = str(i)
-        self._validate_asset_collection(
-            export_dir, foo_graph.collection_def, "hello42.txt_" + idx,
-            "foo bar baz " + idx, "asset_file_tensor_{}:0".format(idx),
+        self._validate_assets(
+            export_dir,
+            foo_graph.asset_file_def,
+            "hello42.txt_" + idx,
+            "foo bar baz " + idx,
+            "asset_file_tensor_{}:0".format(idx),
             asset_id=i)
 
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz 0",
-                                      "asset_file_tensor_0:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz 0", "asset_file_tensor_0:0")
 
-  def testCustomMainOp(self):
+  @test_util.run_v1_only("b/120545219")
+  def testCustomInitOp(self):
     export_dir = self._get_export_dir("test_main_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -792,11 +838,11 @@ class SavedModelTest(test.TestCase):
       # Set up an assignment op to be run as part of the main_op.
       with ops.control_dependencies([main_op.main_op()]):
         add_v1_v2 = math_ops.add(v1._ref(), v2._ref())
-        custom_main_op = control_flow_ops.group(state_ops.assign(v3, add_v1_v2))
+        custom_init_op = control_flow_ops.group(state_ops.assign(v3, add_v1_v2))
 
-      sess.run(custom_main_op)
+      self.evaluate(custom_init_op)
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], main_op=custom_main_op)
+          sess, ["foo"], init_op=custom_init_op)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -809,83 +855,10 @@ class SavedModelTest(test.TestCase):
       # the main_op, following a restore.
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
-  def testLegacyInitOp(self):
-    export_dir = self._get_export_dir("test_legacy_init_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
-
-    with self.session(graph=ops.Graph()) as sess:
-      # Add `v1` and `v2` variables to the graph.
-      v1 = variables.VariableV1(1, name="v1")
-      ops.add_to_collection("v", v1)
-      v2 = variables.VariableV1(2, name="v2")
-      ops.add_to_collection("v", v2)
-
-      # Initialize another variable `v3` to 42.
-      v3 = variables.VariableV1(42, name="v3", trainable=False, collections=[])
-      ops.add_to_collection("v", v3)
-
-      # Set up an assignment op to be run as part of the legacy_init_op.
-      assign_v3 = state_ops.assign(v3, math_ops.add(v1, v2))
-      legacy_init_op = control_flow_ops.group(assign_v3, name="legacy_init_op")
-
-      sess.run(variables.global_variables_initializer())
-      builder.add_meta_graph_and_variables(
-          sess, ["foo"], legacy_init_op=legacy_init_op)
-
-    # Save the SavedModel to disk.
-    builder.save()
-
-    with self.session(graph=ops.Graph()) as sess:
-      loader.load(sess, ["foo"], export_dir)
-      self.assertEqual(1, ops.get_collection("v")[0].eval())
-      self.assertEqual(2, ops.get_collection("v")[1].eval())
-      # Evaluates to the sum of the first two variables and assigned as part of
-      # the legacy_init_op, following a restore.
-      self.assertEqual(3, ops.get_collection("v")[2].eval())
-
-  def testLegacyInitOpWithNonEmptyCollection(self):
-    export_dir = self._get_export_dir(
-        "test_legacy_init_op_with_non_empty_collection")
-    self._testInitOpsWithNonEmptyCollection(
-        export_dir, constants.LEGACY_INIT_OP_KEY)
-
-  def testMainOpWithNonEmptyCollection(self):
-    export_dir = self._get_export_dir(
-        "test_main_op_with_non_empty_collection")
-    self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY)
-
-  def _testInitOpsWithNonEmptyCollection(self, export_dir, key):
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
-
-    g = ops.Graph()
-    with self.session(graph=g) as sess:
-      # Initialize variable `v1` to 1.
-      v1 = variables.VariableV1(1, name="v1")
-      ops.add_to_collection("v", v1)
-
-      # Initialize another variable `v2` to 42.
-      v2 = variables.VariableV1(42, name="v2", trainable=False, collections=[])
-      ops.add_to_collection("v", v2)
-
-      # Set up an assignment op to be run as part of the init op.
-      assign_v2 = state_ops.assign(v2, v1)
-      init_op = control_flow_ops.group(assign_v2, name="init_op")
-
-      sess.run(variables.global_variables_initializer())
-
-      ops.add_to_collection(key, control_flow_ops.no_op())
-      # ValueError should be raised since the LEGACY_INIT_OP_KEY collection
-      # is not empty and we don't support multiple init ops.
-      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
-        builder.add_meta_graph_and_variables(
-            sess, ["foo"], legacy_init_op=init_op)
-      # We shouldn't be able to add as MAIN_OP, either.
-      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
-        builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op)
-
+  @test_util.run_v1_only("b/120545219")
   def testTrainOp(self):
     export_dir = self._get_export_dir("test_train_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -894,27 +867,26 @@ class SavedModelTest(test.TestCase):
       v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       train_op = state_ops.assign_add(v1, v2)
 
-      sess.run(train_op)
-      # TODO(karmel): remove explicit call when in the public method.
-      builder._add_train_op(train_op)
-      builder.add_meta_graph_and_variables(sess, ["foo"])
+      self.evaluate(train_op)
+      builder.add_meta_graph_and_variables(sess, ["foo"], train_op=train_op)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
-      loader.load(sess, ["foo"], export_dir)
+      meta_graph_def = loader.load(sess, ["foo"], export_dir)
       self.assertEqual(3, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
       self.assertIsInstance(
-          ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Tensor)
+          loader_impl.get_train_op(meta_graph_def), ops.Tensor)
 
+  @test_util.run_v1_only("b/120545219")
   def testTrainOpGroup(self):
     export_dir = self._get_export_dir("test_train_op_group")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -923,27 +895,26 @@ class SavedModelTest(test.TestCase):
       v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       train_op = control_flow_ops.group()
 
-      sess.run(train_op)
-      # TODO(karmel): remove explicit call when in the public method.
-      builder._add_train_op(train_op)
-      builder.add_meta_graph_and_variables(sess, ["foo"])
+      self.evaluate(train_op)
+      builder.add_meta_graph_and_variables(sess, ["foo"], train_op=train_op)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
-      loader.load(sess, ["foo"], export_dir)
+      meta_graph_def = loader.load(sess, ["foo"], export_dir)
       self.assertEqual(1, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
       self.assertIsInstance(
-          ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Operation)
+          loader_impl.get_train_op(meta_graph_def), ops.Operation)
 
+  @test_util.run_v1_only("b/120545219")
   def testTrainOpAfterVariables(self):
     export_dir = self._get_export_dir("test_train_op_after_variables")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -952,51 +923,50 @@ class SavedModelTest(test.TestCase):
       v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["pre_foo"])
 
       train_op = state_ops.assign_add(v1, v2)
-      sess.run(train_op)
-      # TODO(karmel): remove explicit call when in the public method.
-      builder._add_train_op(train_op)
-      builder.add_meta_graph(["foo"])
+      self.evaluate(train_op)
+      builder.add_meta_graph(["foo"], train_op=train_op)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
-      loader.load(sess, ["foo"], export_dir)
+      meta_graph_def = loader.load(sess, ["foo"], export_dir)
       self.assertIsInstance(
-          ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Tensor)
+          loader_impl.get_train_op(meta_graph_def), ops.Tensor)
 
     with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["pre_foo"], export_dir)
       self.assertFalse(ops.get_collection(constants.TRAIN_OP_KEY))
 
+  @test_util.run_deprecated_v1
   def testMultipleAssets(self):
     export_dir = self._get_export_dir("test_multiple_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection specific to `foo` graph.
-      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_foo",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "foo".
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection specific to `bar` graph.
-      asset_collection = self._build_asset_collection("bar.txt", "content_bar",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("bar.txt", "content_bar",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "bar".
-      builder.add_meta_graph(["bar"], assets_collection=asset_collection)
+      builder.add_meta_graph(["bar"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -1004,43 +974,42 @@ class SavedModelTest(test.TestCase):
     # Check assets restored for graph with tag "foo".
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
     # Check assets restored for graph with tag "bar".
     with self.session(graph=ops.Graph()) as sess:
       bar_graph = loader.load(sess, ["bar"], export_dir)
-      self._validate_asset_collection(export_dir, bar_graph.collection_def,
-                                      "bar.txt", "content_bar",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, bar_graph.asset_file_def, "bar.txt",
+                            "content_bar", "asset_file_tensor:0")
 
+  @test_util.run_deprecated_v1
   def testDuplicateAssets(self):
     export_dir = self._get_export_dir("test_duplicate_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection with `foo.txt` that has `foo` specific
       # content.
-      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_foo",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "foo".
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection with `foo.txt` that has `bar` specific
       # content.
-      asset_collection = self._build_asset_collection("foo.txt", "content_bar",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_bar",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "bar".
-      builder.add_meta_graph(["bar"], assets_collection=asset_collection)
+      builder.add_meta_graph(["bar"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -1048,9 +1017,8 @@ class SavedModelTest(test.TestCase):
     # Check assets restored for graph with tag "foo".
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
     # Check assets restored for graph with tag "bar".
     with self.session(graph=ops.Graph()) as sess:
@@ -1059,13 +1027,13 @@ class SavedModelTest(test.TestCase):
       # Validate the assets for `bar` graph. `foo.txt` should contain the
       # original contents corresponding to `foo` graph since an asset with the
       # same name across multiple graphs is only stored the first time
-      self._validate_asset_collection(export_dir, bar_graph.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, bar_graph.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
+  @test_util.run_v1_only("b/120545219")
   def testOp(self):
     export_dir = self._get_export_dir("test_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with session.Session(
         graph=ops.Graph(),
@@ -1086,7 +1054,7 @@ class SavedModelTest(test.TestCase):
       ops.add_to_collection("v", v3)
       ops.add_to_collection("init_op", init_op)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(1, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
 
@@ -1108,7 +1076,7 @@ class SavedModelTest(test.TestCase):
 
   def testCustomSaveable(self):
     export_dir = self._get_export_dir("custom_saveable")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with session.Session(
         graph=ops.Graph(),
@@ -1135,13 +1103,14 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(b"k1", v1.keys().eval())
       self.assertEqual(3.0, v1.values().eval())
 
+  @test_util.run_deprecated_v1
   def testCustomSaver(self):
     export_dir = self._get_export_dir("test_custom_saver")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       variables.VariableV1(1, name="v1")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       custom_saver = training.Saver(name="my_saver")
       builder.add_meta_graph_and_variables(sess, ["tag"], saver=custom_saver)
 
@@ -1157,13 +1126,14 @@ class SavedModelTest(test.TestCase):
         self.assertEqual(
             saved_graph.saver_def.restore_op_name, "my_saver/restore_all")
 
+  @test_util.run_deprecated_v1
   def testNoCustomSaver(self):
     export_dir = self._get_export_dir("test_no_custom_saver")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       variables.VariableV1(1, name="v1")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       training.Saver(name="my_saver")
       builder.add_meta_graph_and_variables(sess, ["tag"])
 
@@ -1179,13 +1149,14 @@ class SavedModelTest(test.TestCase):
         self.assertEqual(
             saved_graph.saver_def.restore_op_name, "save/restore_all")
 
+  @test_util.run_deprecated_v1
   def testMultipleCustomSavers(self):
     export_dir = self._get_export_dir("test_multiple_custom_savers")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       variables.VariableV1(1, name="v1")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["tag_0"])
 
       saver_1 = training.Saver()
@@ -1209,21 +1180,22 @@ class SavedModelTest(test.TestCase):
     _validate_custom_saver("tag_1", "save_1/restore_all")
     _validate_custom_saver("tag_2", "save_2/restore_all")
 
+  @test_util.run_deprecated_v1
   def testImportScope(self):
     export_dir = self._get_export_dir("test_scoped_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Build a SavedModel with a variable, an asset, and a constant tensor.
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
-      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_foo",
+                                                "asset_file_tensor")
       constant_op.constant("constant value", name="constant_tensor_name")
       builder.add_meta_graph_and_variables(
-          sess, ["tag_name"], assets_collection=asset_collection)
+          sess, ["tag_name"], assets_list=asset_list)
 
       # Save the asset file path for later comparison.
-      asset_file_path = asset_collection[0].eval()
+      asset_file_path = asset_list[0].eval()
 
     # Save the SavedModel to disk.
     builder.save()
@@ -1244,16 +1216,14 @@ class SavedModelTest(test.TestCase):
 
       # The loaded asset tensor should be scoped, but the asset file path and
       # contents should be unchanged.
-      asset_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
-      self.assertEqual(1, len(asset_collection))
-      self.assertEqual(asset_file_path, asset_collection[0].eval())
-      self.assertEqual("scope_name/asset_file_tensor:0",
-                       asset_collection[0].name)
+      asset_list = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
+      self.assertEqual(1, len(asset_list))
+      self.assertEqual(asset_file_path, asset_list[0].eval())
+      self.assertEqual("scope_name/asset_file_tensor:0", asset_list[0].name)
       # The static asset data inside graph_proto.collection_def should not be
       # scoped.
-      self._validate_asset_collection(export_dir, graph_proto.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, graph_proto.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
       # The constant tensor should be scoped, but its contents should be
       # unchanged.
@@ -1262,9 +1232,10 @@ class SavedModelTest(test.TestCase):
           ops.get_default_graph().get_tensor_by_name(
               "scope_name/constant_tensor_name:0").eval())
 
+  @test_util.run_deprecated_v1
   def testClearDevices(self):
     export_dir = self._get_export_dir("test_clear_devices")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Specify a device and save a variable.
     ops.reset_default_graph()
@@ -1286,6 +1257,174 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+  # Tests the behavior of loading SavedModels that having missing attrs or attrs
+  # with incorrect types.
+  def testInconsistentConsumerDefaultAttrs(self):
+    export_dir = self._get_export_dir(
+        "test_strip_default_attrs_no_consumer_defaults")
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
+
+    # Add a graph with a single variable and a test op with a defaultless
+    # float32 attr, "test_attr".
+    with session.Session(graph=ops.Graph()) as sess:
+      variables.VariableV1(1.0, dtype=dtypes.float64, name="var")
+      test_ops.test_attr(T=dtypes.float32, name="test_attr")
+      self.evaluate(variables.global_variables_initializer())
+      builder.add_meta_graph_and_variables(sess, ["foo"])
+
+    # Save the SavedModel to disk in text format.
+    builder.save(as_text=True)
+
+    # Rewrite the SavedModel to remove the T attr from "test_attr".
+    saved_model_file = os.path.join(
+        export_dir, constants.SAVED_MODEL_FILENAME_PBTXT)
+    with open(saved_model_file) as f:
+      original_saved_model = f.read()
+
+    no_attr_saved_model = original_saved_model.replace("""
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }""", "")
+    with open(saved_model_file, "w") as f:
+      f.write(no_attr_saved_model)
+
+    # Loading the SavedModel via the loader must fail because the SavedModel
+    # does not have any attr values for the "TestAttr" node, and there is no
+    # default specified in the TestAttr OpDef.
+    sess = session.Session(graph=ops.Graph())
+    with self.assertRaisesRegexp(
+        ValueError, "NodeDef missing attr 'T' from Op<name=TestAttr"):
+      loader.load(sess, ["foo"], export_dir)
+
+    # Rewrite the SavedModel to change the type of the T attr in "test_attr"
+    bad_type_saved_model = original_saved_model.replace("""
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }""", """
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }""")
+    with open(saved_model_file, "w") as f:
+      f.write(bad_type_saved_model)
+
+    # Loading the SavedModel via the loader must fail because there is no
+    # OpKernel registered to handle T = double.
+    sess = session.Session(graph=ops.Graph())
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "No OpKernel was registered to support Op 'TestAttr' used by node "
+        "test_attr \\(defined at .*\\) with these attrs: \\[.*\\]\n"
+        "Registered devices:.*\n"
+        "Registered kernels:.*"
+    ):
+      loader.load(sess, ["foo"], export_dir)
+
+
+class SavedModelV1Test(SavedModelTestBase):
+
+  def _validate_asset_collection(self,
+                                 export_dir,
+                                 graph_collection_def,
+                                 expected_asset_file_name,
+                                 expected_asset_file_contents,
+                                 expected_asset_tensor_name,
+                                 asset_id=0):
+    assets_any = graph_collection_def[constants.ASSETS_KEY].any_list.value
+    asset = meta_graph_pb2.AssetFileDef()
+    assets_any[asset_id].Unpack(asset)
+    assets_path = os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes(constants.ASSETS_DIRECTORY),
+        compat.as_bytes(expected_asset_file_name))
+    actual_asset_contents = file_io.read_file_to_string(assets_path)
+    self.assertEqual(expected_asset_file_contents,
+                     compat.as_text(actual_asset_contents))
+    self.assertEqual(expected_asset_file_name, asset.filename)
+    self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
+
+  @test_util.run_deprecated_v1
+  def testWritingAssetsToCollection(self):
+    export_dir = self._get_export_dir("test_writing_assets_to_collection")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+
+      # Build an asset list.
+      ignored_filepath = os.path.join(
+          compat.as_bytes(test.get_temp_dir()), compat.as_bytes("ignored.txt"))
+      file_io.write_string_to_file(ignored_filepath, "will be ignored")
+
+      asset_collection = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor")
+
+      builder.add_meta_graph_and_variables(
+          sess, ["foo"], assets_collection=asset_collection)
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    with self.session(graph=ops.Graph()) as sess:
+      foo_graph = loader.load(sess, ["foo"], export_dir)
+      self._validate_asset_collection(export_dir, foo_graph.collection_def,
+                                      "hello42.txt", "foo bar baz",
+                                      "asset_file_tensor:0")
+      ignored_asset_path = os.path.join(
+          compat.as_bytes(export_dir),
+          compat.as_bytes(constants.ASSETS_DIRECTORY),
+          compat.as_bytes("ignored.txt"))
+      self.assertFalse(file_io.file_exists(ignored_asset_path))
+
+  @test_util.run_deprecated_v1
+  def testLegacyInitOpWithNonEmptyCollection(self):
+    export_dir = self._get_export_dir(
+        "test_legacy_init_op_with_non_empty_collection")
+    self._testInitOpsWithNonEmptyCollection(export_dir,
+                                            constants.LEGACY_INIT_OP_KEY)
+
+  @test_util.run_deprecated_v1
+  def testMainOpWithNonEmptyCollection(self):
+    export_dir = self._get_export_dir("test_main_op_with_non_empty_collection")
+    self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY)
+
+  def _testInitOpsWithNonEmptyCollection(self, export_dir, key):
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    g = ops.Graph()
+    with self.session(graph=g) as sess:
+      # Initialize variable `v1` to 1.
+      v1 = variables.VariableV1(1, name="v1")
+      ops.add_to_collection("v", v1)
+
+      # Initialize another variable `v2` to 42.
+      v2 = variables.VariableV1(42, name="v2", trainable=False, collections=[])
+      ops.add_to_collection("v", v2)
+
+      # Set up an assignment op to be run as part of the init op.
+      assign_v2 = state_ops.assign(v2, v1)
+      init_op = control_flow_ops.group(assign_v2, name="init_op")
+
+      self.evaluate(variables.global_variables_initializer())
+
+      ops.add_to_collection(key, control_flow_ops.no_op())
+      # ValueError should be raised since the LEGACY_INIT_OP_KEY collection
+      # is not empty and we don't support multiple init ops.
+      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
+        builder.add_meta_graph_and_variables(
+            sess, ["foo"], legacy_init_op=init_op)
+      # We shouldn't be able to add as MAIN_OP, either.
+      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
+        builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op)
+
   def testStripDefaultAttrs(self):
     export_dir = self._get_export_dir("test_strip_default_attrs")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
@@ -1296,7 +1435,7 @@ class SavedModelTest(test.TestCase):
       real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
       imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(
           sess, ["foo"], strip_default_attrs=True)
 
@@ -1306,7 +1445,7 @@ class SavedModelTest(test.TestCase):
       real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
       imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph(["bar"], strip_default_attrs=False)
 
     # Save the SavedModel to disk in text format.
@@ -1322,10 +1461,8 @@ class SavedModelTest(test.TestCase):
     self.assertIn("Tout", complex_node.attr)
 
     # Load graph "foo" from disk as-is to verify default attrs are stripped.
-    # pylint: disable=protected-access
-    saved_model_pb = loader_impl._parse_saved_model(export_dir)
+    saved_model_pb = loader_impl.parse_saved_model(export_dir)
     self.assertIsNotNone(saved_model_pb)
-    # pylint: enable=protected-access
 
     meta_graph_foo_def = None
     meta_graph_bar_def = None
@@ -1356,73 +1493,40 @@ class SavedModelTest(test.TestCase):
     self.assertIn("T", node_def.attr)
     self.assertIn("Tout", node_def.attr)
 
-  # Tests the behavior of loading SavedModels that having missing attrs or attrs
-  # with incorrect types.
-  def testInconsistentConsumerDefaultAttrs(self):
-    export_dir = self._get_export_dir(
-        "test_strip_default_attrs_no_consumer_defaults")
+  @test_util.run_v1_only("b/120545219")
+  def testLegacyInitOp(self):
+    export_dir = self._get_export_dir("test_legacy_init_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    # Add a graph with a single variable and a test op with a defaultless
-    # float32 attr, "test_attr".
-    with session.Session(graph=ops.Graph()) as sess:
-      variables.VariableV1(1.0, dtype=dtypes.float64, name="var")
-      test_ops.test_attr(T=dtypes.float32, name="test_attr")
-      sess.run(variables.global_variables_initializer())
-      builder.add_meta_graph_and_variables(sess, ["foo"])
-
-    # Save the SavedModel to disk in text format.
-    builder.save(as_text=True)
+    with self.session(graph=ops.Graph()) as sess:
+      # Add `v1` and `v2` variables to the graph.
+      v1 = variables.VariableV1(1, name="v1")
+      ops.add_to_collection("v", v1)
+      v2 = variables.VariableV1(2, name="v2")
+      ops.add_to_collection("v", v2)
 
-    # Rewrite the SavedModel to remove the T attr from "test_attr".
-    saved_model_file = os.path.join(
-        export_dir, constants.SAVED_MODEL_FILENAME_PBTXT)
-    with open(saved_model_file) as f:
-      original_saved_model = f.read()
+      # Initialize another variable `v3` to 42.
+      v3 = variables.VariableV1(42, name="v3", trainable=False, collections=[])
+      ops.add_to_collection("v", v3)
 
-    no_attr_saved_model = original_saved_model.replace("""
-      attr {
-        key: "T"
-        value {
-          type: DT_FLOAT
-        }
-      }""", "")
-    with open(saved_model_file, "w") as f:
-      f.write(no_attr_saved_model)
+      # Set up an assignment op to be run as part of the init_op.
+      assign_v3 = state_ops.assign(v3, math_ops.add(v1, v2))
+      legacy_init_op = control_flow_ops.group(assign_v3, name="legacy_init_op")
 
-    # Loading the SavedModel via the loader must fail because the SavedModel
-    # does not have any attr values for the "TestAttr" node, and there is no
-    # default specified in the TestAttr OpDef.
-    sess = session.Session(graph=ops.Graph())
-    with self.assertRaisesRegexp(
-        ValueError, "NodeDef missing attr 'T' from Op<name=TestAttr"):
-      loader.load(sess, ["foo"], export_dir)
+      self.evaluate(variables.global_variables_initializer())
+      builder.add_meta_graph_and_variables(
+          sess, ["foo"], legacy_init_op=legacy_init_op)
 
-    # Rewrite the SavedModel to change the type of the T attr in "test_attr"
-    bad_type_saved_model = original_saved_model.replace("""
-      attr {
-        key: "T"
-        value {
-          type: DT_FLOAT
-        }
-      }""", """
-      attr {
-        key: "T"
-        value {
-          type: DT_DOUBLE
-        }
-      }""")
-    with open(saved_model_file, "w") as f:
-      f.write(bad_type_saved_model)
+    # Save the SavedModel to disk.
+    builder.save()
 
-    # Loading the SavedModel via the loader must fail because there is no
-    # OpKernel registered to handle T = double.
-    sess = session.Session(graph=ops.Graph())
-    with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        ".*No OpKernel was registered to support Op \'TestAttr\' with these "
-        "attrs..*"):
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
+      self.assertEqual(1, ops.get_collection("v")[0].eval())
+      self.assertEqual(2, ops.get_collection("v")[1].eval())
+      # Evaluates to the sum of the first two variables and assigned as part of
+      # the legacy_init_op, following a restore.
+      self.assertEqual(3, ops.get_collection("v")[2].eval())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/saved_model/saved_object_graph.proto b/tensorflow/python/saved_model/saved_object_graph.proto
new file mode 100644
index 0000000000000000000000000000000000000000..3991fbede42655e39bec93226b6295603c394cf4
--- /dev/null
+++ b/tensorflow/python/saved_model/saved_object_graph.proto
@@ -0,0 +1,73 @@
+syntax = "proto3";
+
+import "tensorflow/core/protobuf/checkpointable_object_graph.proto";
+
+option cc_enable_arenas = true;
+
+package tensorflow;
+
+// A SavedObjectGraph is part of object-based SavedModels in TF 2.0. It
+// describes the directed graph of Python objects (or equivalent in other
+// languages) that make up a model, with nodes[0] at the root.
+
+// SavedObjectGraph shares some structure with CheckpointableObjectGraph, but
+// ObjectGraph belongs to the SavedModel and contains pointers to functions and
+// type information, while CheckpointableObjectGraph lives in the checkpoint and
+// contains pointers only to variable values.
+
+// NOTE: This protocol buffer format is experimental and subject to change.
+
+message SavedObjectGraph {
+  // List of objects in the SavedModel.
+  //
+  // The position of the object in this list indicates its id.
+  // Nodes[0] is considered the root node.
+  repeated SavedObject nodes = 1;
+}
+
+message SavedObject {
+  // Objects which this object depends on: named edges in the dependency
+  // graph.
+  //
+  // Note: only valid if kind == "object".
+  repeated CheckpointableObjectGraph.CheckpointableObject.ObjectReference
+      children = 1;
+
+  // Removed when forking from CheckpointableObjectGraph.
+  reserved "attributes";
+  reserved 2;
+
+  // Slot variables owned by this object. This describes the three-way
+  // (optimizer, variable, slot variable) relationship; none of the three
+  // depend on the others directly.
+  //
+  // Note: only valid if kind == "object".
+  repeated CheckpointableObjectGraph.CheckpointableObject.SlotVariableReference
+      slot_variables = 3;
+
+  oneof kind {
+    SavedUserObject user_object = 4;
+    SavedAsset asset = 5;
+  }
+}
+
+// A SavedUserObject is an object (in the object-oriented language of the
+// TensorFlow program) of some user- or framework-defined class other than
+// those handled specifically by the other kinds of SavedObjects.
+//
+// This object cannot be evaluated as a tensor, and therefore cannot be bound
+// to an input of a function.
+message SavedUserObject {}
+
+// A SavedAsset represents a file in a SavedModel.
+//
+// When bound to a function this object evaluates to a Variable from which the
+// absolute filename can be read. Users should not expect the filename to be
+// maintained.
+message SavedAsset {
+  // Index into `MetaGraphDef.asset_file_def[]` that describes the Asset.
+  //
+  // Only the field `AssetFileDef.filename` is used. Other fields, such as
+  // `AssetFileDef.tensor_info`, MUST be ignored.
+  uint32 asset_file_def_index = 1;
+}
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 99007a9634b27dc1ad6d06b8f002ffa35a96dbcc..0efe1763430eade223801b63f958405212eebe34 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -26,76 +26,116 @@ from tensorflow.python.util.tf_export import tf_export
 # signature is used in inference requests where a specific signature was not
 # specified.
 DEFAULT_SERVING_SIGNATURE_DEF_KEY = "serving_default"
-tf_export("saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY"
-         ).export_constant(__name__, "DEFAULT_SERVING_SIGNATURE_DEF_KEY")
+tf_export(
+    "saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY",
+    v1=[
+        "saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY",
+        "saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY"
+    ],
+).export_constant(__name__, "DEFAULT_SERVING_SIGNATURE_DEF_KEY")
 
 ################################################################################
 # Classification API constants.
 
 # Classification inputs.
 CLASSIFY_INPUTS = "inputs"
-tf_export("saved_model.signature_constants.CLASSIFY_INPUTS").export_constant(
-    __name__, "CLASSIFY_INPUTS")
+tf_export(
+    "saved_model.CLASSIFY_INPUTS",
+    v1=[
+        "saved_model.CLASSIFY_INPUTS",
+        "saved_model.signature_constants.CLASSIFY_INPUTS"
+    ]).export_constant(__name__, "CLASSIFY_INPUTS")
 
 # Classification method name used in a SignatureDef.
 CLASSIFY_METHOD_NAME = "tensorflow/serving/classify"
 tf_export(
-    "saved_model.signature_constants.CLASSIFY_METHOD_NAME").export_constant(
-        __name__, "CLASSIFY_METHOD_NAME")
+    "saved_model.CLASSIFY_METHOD_NAME",
+    v1=[
+        "saved_model.CLASSIFY_METHOD_NAME",
+        "saved_model.signature_constants.CLASSIFY_METHOD_NAME"
+    ]).export_constant(__name__, "CLASSIFY_METHOD_NAME")
 
 # Classification classes output.
 CLASSIFY_OUTPUT_CLASSES = "classes"
 tf_export(
-    "saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES").export_constant(
-        __name__, "CLASSIFY_OUTPUT_CLASSES")
+    "saved_model.CLASSIFY_OUTPUT_CLASSES",
+    v1=[
+        "saved_model.CLASSIFY_OUTPUT_CLASSES",
+        "saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES"
+    ]).export_constant(__name__, "CLASSIFY_OUTPUT_CLASSES")
 
 # Classification scores output.
 CLASSIFY_OUTPUT_SCORES = "scores"
 tf_export(
-    "saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES").export_constant(
-        __name__, "CLASSIFY_OUTPUT_SCORES")
+    "saved_model.CLASSIFY_OUTPUT_SCORES",
+    v1=[
+        "saved_model.CLASSIFY_OUTPUT_SCORES",
+        "saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES"
+    ]).export_constant(__name__, "CLASSIFY_OUTPUT_SCORES")
 
 ################################################################################
 # Prediction API constants.
 
 # Predict inputs.
 PREDICT_INPUTS = "inputs"
-tf_export("saved_model.signature_constants.PREDICT_INPUTS").export_constant(
-    __name__, "PREDICT_INPUTS")
+tf_export(
+    "saved_model.PREDICT_INPUTS",
+    v1=[
+        "saved_model.PREDICT_INPUTS",
+        "saved_model.signature_constants.PREDICT_INPUTS"
+    ]).export_constant(__name__, "PREDICT_INPUTS")
 
 # Prediction method name used in a SignatureDef.
 PREDICT_METHOD_NAME = "tensorflow/serving/predict"
 tf_export(
-    "saved_model.signature_constants.PREDICT_METHOD_NAME").export_constant(
-        __name__, "PREDICT_METHOD_NAME")
+    "saved_model.PREDICT_METHOD_NAME",
+    v1=[
+        "saved_model.PREDICT_METHOD_NAME",
+        "saved_model.signature_constants.PREDICT_METHOD_NAME"
+    ]).export_constant(__name__, "PREDICT_METHOD_NAME")
 
 # Predict outputs.
 PREDICT_OUTPUTS = "outputs"
-tf_export("saved_model.signature_constants.PREDICT_OUTPUTS").export_constant(
-    __name__, "PREDICT_OUTPUTS")
+tf_export(
+    "saved_model.PREDICT_OUTPUTS",
+    v1=[
+        "saved_model.PREDICT_OUTPUTS",
+        "saved_model.signature_constants.PREDICT_OUTPUTS"
+    ]).export_constant(__name__, "PREDICT_OUTPUTS")
 
 ################################################################################
 # Regression API constants.
 
 # Regression inputs.
 REGRESS_INPUTS = "inputs"
-tf_export("saved_model.signature_constants.REGRESS_INPUTS").export_constant(
-    __name__, "REGRESS_INPUTS")
+tf_export(
+    "saved_model.REGRESS_INPUTS",
+    v1=[
+        "saved_model.REGRESS_INPUTS",
+        "saved_model.signature_constants.REGRESS_INPUTS"
+    ]).export_constant(__name__, "REGRESS_INPUTS")
 
 # Regression method name used in a SignatureDef.
 REGRESS_METHOD_NAME = "tensorflow/serving/regress"
 tf_export(
-    "saved_model.signature_constants.REGRESS_METHOD_NAME").export_constant(
-        __name__, "REGRESS_METHOD_NAME")
+    "saved_model.REGRESS_METHOD_NAME",
+    v1=[
+        "saved_model.REGRESS_METHOD_NAME",
+        "saved_model.signature_constants.REGRESS_METHOD_NAME"
+    ]).export_constant(__name__, "REGRESS_METHOD_NAME")
 
 # Regression outputs.
 REGRESS_OUTPUTS = "outputs"
-tf_export("saved_model.signature_constants.REGRESS_OUTPUTS").export_constant(
-    __name__, "REGRESS_OUTPUTS")
+tf_export(
+    "saved_model.REGRESS_OUTPUTS",
+    v1=[
+        "saved_model.REGRESS_OUTPUTS",
+        "saved_model.signature_constants.REGRESS_OUTPUTS"
+    ]).export_constant(__name__, "REGRESS_OUTPUTS")
 
 ################################################################################
 # Train/Eval API constants.
-# Not exported while export_all_saved_models is in contrib.
+# Not exported while export_all_saved_models is experimental.
 
 SUPERVISED_TRAIN_METHOD_NAME = "tensorflow/supervised/training"
 
diff --git a/tensorflow/python/saved_model/signature_def_utils.py b/tensorflow/python/saved_model/signature_def_utils.py
index 27d6b70e9dce3ff67b7912efcea8e2994d138dc6..6a3c0aaf385ec360f90f748c5aadcae7e354b621 100644
--- a/tensorflow/python/saved_model/signature_def_utils.py
+++ b/tensorflow/python/saved_model/signature_def_utils.py
@@ -24,6 +24,8 @@ from __future__ import print_function
 from tensorflow.python.saved_model.signature_def_utils_impl import build_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import classification_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import is_valid_signature
+from tensorflow.python.saved_model.signature_def_utils_impl import load_op_from_signature_def
+from tensorflow.python.saved_model.signature_def_utils_impl import op_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import regression_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import supervised_eval_signature_def
diff --git a/tensorflow/python/saved_model/signature_def_utils_impl.py b/tensorflow/python/saved_model/signature_def_utils_impl.py
index a1034416e9bf04962f3e921a6c164ac41bfa7143..f6e6e1d13ecdea684f14dcaaa39f1c66f72ac352 100644
--- a/tensorflow/python/saved_model/signature_def_utils_impl.py
+++ b/tensorflow/python/saved_model/signature_def_utils_impl.py
@@ -21,15 +21,20 @@ from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import utils
+from tensorflow.python.saved_model import utils_impl as utils
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('saved_model.build_signature_def',
-           'saved_model.signature_def_utils.build_signature_def')
+@tf_export(
+    'saved_model.build_signature_def',
+    v1=[
+        'saved_model.build_signature_def',
+        'saved_model.signature_def_utils.build_signature_def'
+    ])
 @deprecation.deprecated_endpoints(
     'saved_model.signature_def_utils.build_signature_def')
 def build_signature_def(inputs=None, outputs=None, method_name=None):
@@ -57,8 +62,12 @@ def build_signature_def(inputs=None, outputs=None, method_name=None):
   return signature_def
 
 
-@tf_export('saved_model.regression_signature_def',
-           'saved_model.signature_def_utils.regression_signature_def')
+@tf_export(
+    'saved_model.regression_signature_def',
+    v1=[
+        'saved_model.regression_signature_def',
+        'saved_model.signature_def_utils.regression_signature_def'
+    ])
 @deprecation.deprecated_endpoints(
     'saved_model.signature_def_utils.regression_signature_def')
 def regression_signature_def(examples, predictions):
@@ -102,8 +111,12 @@ def regression_signature_def(examples, predictions):
   return signature_def
 
 
-@tf_export('saved_model.classification_signature_def',
-           'saved_model.signature_def_utils.classification_signature_def')
+@tf_export(
+    'saved_model.classification_signature_def',
+    v1=[
+        'saved_model.classification_signature_def',
+        'saved_model.signature_def_utils.classification_signature_def'
+    ])
 @deprecation.deprecated_endpoints(
     'saved_model.signature_def_utils.classification_signature_def')
 def classification_signature_def(examples, classes, scores):
@@ -158,8 +171,12 @@ def classification_signature_def(examples, classes, scores):
   return signature_def
 
 
-@tf_export('saved_model.predict_signature_def',
-           'saved_model.signature_def_utils.predict_signature_def')
+@tf_export(
+    'saved_model.predict_signature_def',
+    v1=[
+        'saved_model.predict_signature_def',
+        'saved_model.signature_def_utils.predict_signature_def'
+    ])
 @deprecation.deprecated_endpoints(
     'saved_model.signature_def_utils.predict_signature_def')
 def predict_signature_def(inputs, outputs):
@@ -252,8 +269,12 @@ def _supervised_signature_def(
   return signature_def
 
 
-@tf_export('saved_model.is_valid_signature',
-           'saved_model.signature_def_utils.is_valid_signature')
+@tf_export(
+    'saved_model.is_valid_signature',
+    v1=[
+        'saved_model.is_valid_signature',
+        'saved_model.signature_def_utils.is_valid_signature'
+    ])
 @deprecation.deprecated_endpoints(
     'saved_model.signature_def_utils.is_valid_signature')
 def is_valid_signature(signature_def):
@@ -329,3 +350,51 @@ def _is_valid_classification_signature(signature_def):
     return False
 
   return True
+
+
+def op_signature_def(op, key):
+  """Creates a signature def with the output pointing to an op.
+
+  Note that op isn't strictly enforced to be an Op object, and may be a Tensor.
+  It is recommended to use the build_signature_def() function for Tensors.
+
+  Args:
+    op: An Op (or possibly Tensor).
+    key: Key to graph element in the SignatureDef outputs.
+
+  Returns:
+    A SignatureDef with a single output pointing to the op.
+  """
+  # Use build_tensor_info_from_op, which creates a TensorInfo from the element's
+  # name.
+  return build_signature_def(outputs={key: utils.build_tensor_info_from_op(op)})
+
+
+def load_op_from_signature_def(signature_def, key, import_scope=None):
+  """Load an Op from a SignatureDef created by op_signature_def().
+
+  Args:
+    signature_def: a SignatureDef proto
+    key: string key to op in the SignatureDef outputs.
+    import_scope: Scope used to import the op
+
+  Returns:
+    Op (or possibly Tensor) in the graph with the same name as saved in the
+      SignatureDef.
+
+  Raises:
+    NotFoundError: If the op could not be found in the graph.
+  """
+  tensor_info = signature_def.outputs[key]
+  try:
+    # The init and train ops are not strictly enforced to be operations, so
+    # retrieve any graph element (can be either op or tensor).
+    return utils.get_element_from_tensor_info(
+        tensor_info, import_scope=import_scope)
+  except KeyError:
+    raise errors.NotFoundError(
+        None, None,
+        'The {0} could not be found in the graph. Please make sure the '
+        'SavedModel was created by the internal _SavedModelBuilder. If you '
+        'are using the public API, please make sure the SignatureDef in the '
+        'SavedModel does not contain the key "{0}".'.format(key))
diff --git a/tensorflow/python/saved_model/signature_def_utils_test.py b/tensorflow/python/saved_model/signature_def_utils_test.py
index 18c55d8d33221054f033c8baf73c757b3e03a849..d1347eb0178423f9293022e4f36eeb90caac833e 100644
--- a/tensorflow/python/saved_model/signature_def_utils_test.py
+++ b/tensorflow/python/saved_model/signature_def_utils_test.py
@@ -22,7 +22,9 @@ from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils_impl
@@ -58,6 +60,7 @@ def _make_signature(inputs, outputs, name=None):
 
 class SignatureDefUtilsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBuildSignatureDef(self):
     x = array_ops.placeholder(dtypes.float32, 1, name="x")
     x_tensor_info = utils.build_tensor_info(x)
@@ -88,6 +91,7 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_FLOAT, y_tensor_info_actual.dtype)
     self.assertEqual(0, len(y_tensor_info_actual.tensor_shape.dim))
 
+  @test_util.run_deprecated_v1
   def testRegressionSignatureDef(self):
     input1 = constant_op.constant("a", name="input-1")
     output1 = constant_op.constant(2.2, name="output-1")
@@ -113,6 +117,7 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_FLOAT, y_tensor_info_actual.dtype)
     self.assertEqual(0, len(y_tensor_info_actual.tensor_shape.dim))
 
+  @test_util.run_deprecated_v1
   def testClassificationSignatureDef(self):
     input1 = constant_op.constant("a", name="input-1")
     output1 = constant_op.constant("b", name="output-1")
@@ -144,6 +149,7 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_FLOAT, scores_tensor_info_actual.dtype)
     self.assertEqual(0, len(scores_tensor_info_actual.tensor_shape.dim))
 
+  @test_util.run_deprecated_v1
   def testPredictionSignatureDef(self):
     input1 = constant_op.constant("a", name="input-1")
     input2 = constant_op.constant("b", name="input-2")
@@ -180,11 +186,13 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_STRING, output2_tensor_info_actual.dtype)
     self.assertEqual(0, len(output2_tensor_info_actual.tensor_shape.dim))
 
+  @test_util.run_deprecated_v1
   def testTrainSignatureDef(self):
     self._testSupervisedSignatureDef(
         signature_def_utils_impl.supervised_train_signature_def,
         signature_constants.SUPERVISED_TRAIN_METHOD_NAME)
 
+  @test_util.run_deprecated_v1
   def testEvalSignatureDef(self):
     self._testSupervisedSignatureDef(
         signature_def_utils_impl.supervised_eval_signature_def,
@@ -238,11 +246,13 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(
         types_pb2.DT_FLOAT, signature_def.outputs["metrics/value"].dtype)
 
+  @test_util.run_deprecated_v1
   def testTrainSignatureDefMissingInputs(self):
     self._testSupervisedSignatureDefMissingInputs(
         signature_def_utils_impl.supervised_train_signature_def,
         signature_constants.SUPERVISED_TRAIN_METHOD_NAME)
 
+  @test_util.run_deprecated_v1
   def testEvalSignatureDefMissingInputs(self):
     self._testSupervisedSignatureDefMissingInputs(
         signature_def_utils_impl.supervised_eval_signature_def,
@@ -413,5 +423,24 @@ class SignatureDefUtilsTest(test.TestCase):
         {},
         signature_constants.PREDICT_METHOD_NAME)
 
+  @test_util.run_v1_only("b/120545219")
+  def testOpSignatureDef(self):
+    key = "adding_1_and_2_key"
+    add_op = math_ops.add(1, 2, name="adding_1_and_2")
+    signature_def = signature_def_utils_impl.op_signature_def(add_op, key)
+    self.assertIn(key, signature_def.outputs)
+    self.assertEqual(add_op.name, signature_def.outputs[key].name)
+
+  @test_util.run_v1_only("b/120545219")
+  def testLoadOpFromSignatureDef(self):
+    key = "adding_1_and_2_key"
+    add_op = math_ops.add(1, 2, name="adding_1_and_2")
+    signature_def = signature_def_utils_impl.op_signature_def(add_op, key)
+
+    self.assertEqual(
+        add_op,
+        signature_def_utils_impl.load_op_from_signature_def(signature_def, key))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/simple_save.py b/tensorflow/python/saved_model/simple_save.py
index 042b8fa8e22703d8ffb5e12de3f844d22fb1b1ce..169504ec89131506a0f2da9f4bfca1eaba9d9fed 100644
--- a/tensorflow/python/saved_model/simple_save.py
+++ b/tensorflow/python/saved_model/simple_save.py
@@ -23,10 +23,15 @@ from tensorflow.python.saved_model import builder
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('saved_model.simple_save')
+@tf_export(v1=['saved_model.simple_save'])
+@deprecation.deprecated(
+    None,
+    'This function will only be available through the v1 compatibility '
+    'library as tf.compat.v1.saved_model.simple_save.')
 def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None):
   """Convenience function to build a SavedModel suitable for serving.
 
@@ -81,6 +86,6 @@ def simple_save(session, export_dir, inputs, outputs, legacy_init_op=None):
       tags=[tag_constants.SERVING],
       signature_def_map=signature_def_map,
       assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
-      legacy_init_op=legacy_init_op,
+      main_op=legacy_init_op,
       clear_devices=True)
   b.save()
diff --git a/tensorflow/python/saved_model/simple_save_test.py b/tensorflow/python/saved_model/simple_save_test.py
index 18f82daadad6ae7142c249c66e61ea13782b33ac..21c2e9df2fae9f1d078b9ca95ffa52242b6756f7 100644
--- a/tensorflow/python/saved_model/simple_save_test.py
+++ b/tensorflow/python/saved_model/simple_save_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import loader
@@ -33,8 +34,8 @@ class SimpleSaveTest(test.TestCase):
 
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.Variable(variable_value, name=variable_name)
-    sess.run(variables.global_variables_initializer())
-    self.assertEqual(variable_value, v.eval())
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(variable_value, self.evaluate(v))
     return v
 
   def _check_variable_info(self, actual_variable, expected_variable):
@@ -53,6 +54,7 @@ class SimpleSaveTest(test.TestCase):
       self.assertEqual(actual_tensor_info.tensor_shape.dim[i].size,
                        expected_tensor.shape[i])
 
+  @test_util.run_deprecated_v1
   def testSimpleSave(self):
     """Test simple_save that uses the default parameters."""
     export_dir = os.path.join(test.get_temp_dir(),
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index c82154e7b93aaec9bea826c319e3a93fc3b48a5b..8c84c9fbe4d8e65273433dc98f9da34a2183f90e 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -24,23 +24,33 @@ from tensorflow.python.util.tf_export import tf_export
 
 # Tag for the `serving` graph.
 SERVING = "serve"
-tf_export("saved_model.tag_constants.SERVING").export_constant(
-    __name__, "SERVING")
+tf_export(
+    "saved_model.SERVING",
+    v1=["saved_model.SERVING",
+        "saved_model.tag_constants.SERVING"]).export_constant(
+            __name__, "SERVING")
 
 # Tag for the `training` graph.
 TRAINING = "train"
-tf_export("saved_model.tag_constants.TRAINING").export_constant(
-    __name__, "TRAINING")
+tf_export(
+    "saved_model.TRANING",
+    v1=["saved_model.TRAINING",
+        "saved_model.tag_constants.TRAINING"]).export_constant(
+            __name__, "TRAINING")
 
 # Tag for the `eval` graph. Not exported while the export logic is in contrib.
 EVAL = "eval"
 
 # Tag for the `gpu` graph.
 GPU = "gpu"
-tf_export("saved_model.tag_constants.GPU").export_constant(__name__, "GPU")
+tf_export(
+    "saved_model.GPU", v1=["saved_model.GPU",
+                           "saved_model.tag_constants.GPU"]).export_constant(
+                               __name__, "GPU")
 
 # Tag for the `tpu` graph.
 TPU = "tpu"
-tf_export("saved_model.tag_constants.TPU").export_constant(__name__, "TPU")
-
-
+tf_export(
+    "saved_model.TPU", v1=["saved_model.TPU",
+                           "saved_model.tag_constants.TPU"]).export_constant(
+                               __name__, "TPU")
diff --git a/tensorflow/python/saved_model/utils.py b/tensorflow/python/saved_model/utils.py
index 27c355490934e7d20ee72ae10eca9fdb8bbfca14..9bd0126ae3aac4130f0ef2f6a38cfb9abd2c6f8b 100644
--- a/tensorflow/python/saved_model/utils.py
+++ b/tensorflow/python/saved_model/utils.py
@@ -22,5 +22,6 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 from tensorflow.python.saved_model.utils_impl import build_tensor_info
+from tensorflow.python.saved_model.utils_impl import build_tensor_info_from_op
 from tensorflow.python.saved_model.utils_impl import get_tensor_from_tensor_info
 # pylint: enable=unused-import
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index 0bba7b6fac6ee93fd7b120a04df87b3dea7f8cde..5caabe59fec1a0819629bd9ff16ad5be19f0890a 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -20,10 +20,12 @@ from __future__ import print_function
 
 import os
 
+from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.saved_model import constants
 from tensorflow.python.util import compat
@@ -34,11 +36,15 @@ from tensorflow.python.util.tf_export import tf_export
 # TensorInfo helpers.
 
 
-@tf_export("saved_model.build_tensor_info",
-           "saved_model.utils.build_tensor_info")
-@deprecation.deprecated_endpoints("saved_model.utils.build_tensor_info")
+@tf_export(v1=["saved_model.build_tensor_info",
+               "saved_model.utils.build_tensor_info"])
+@deprecation.deprecated(
+    None,
+    "This function will only be available through the v1 compatibility "
+    "library as tf.compat.v1.saved_model.utils.build_tensor_info or "
+    "tf.compat.v1.saved_model.build_tensor_info.")
 def build_tensor_info(tensor):
-  """Utility function to build TensorInfo proto.
+  """Utility function to build TensorInfo proto from a Tensor.
 
   Args:
     tensor: Tensor or SparseTensor whose name, dtype and shape are used to
@@ -60,10 +66,48 @@ def build_tensor_info(tensor):
   return tensor_info
 
 
-@tf_export("saved_model.get_tensor_from_tensor_info",
-           "saved_model.utils.get_tensor_from_tensor_info")
-@deprecation.deprecated_endpoints(
-    "saved_model.utils.get_tensor_from_tensor_info")
+def build_tensor_info_from_op(op):
+  """Utility function to build TensorInfo proto from an Op.
+
+  Note that this function should be used with caution. It is strictly restricted
+  to TensorFlow internal use-cases only. Please make sure you do need it before
+  using it.
+
+  This utility function overloads the TensorInfo proto by setting the name to
+  the Op's name, dtype to DT_INVALID and tensor_shape as None. One typical usage
+  is for the Op of the call site for the defunned function:
+  ```python
+    @function.defun
+    def some_vairable_initialiation_fn(value_a, value_b):
+      a = value_a
+      b = value_b
+
+    value_a = constant_op.constant(1, name="a")
+    value_b = constant_op.constant(2, name="b")
+    op_info = utils.build_op_info(
+        some_vairable_initialiation_fn(value_a, value_b))
+  ```
+
+  Args:
+    op: An Op whose name is used to build the TensorInfo. The name that points
+        to the Op could be fetched at run time in the Loader session.
+
+  Returns:
+    A TensorInfo protocol buffer constructed based on the supplied argument.
+  """
+  return meta_graph_pb2.TensorInfo(
+      dtype=types_pb2.DT_INVALID,
+      tensor_shape=tensor_shape.unknown_shape().as_proto(),
+      name=op.name)
+
+
+@tf_export(v1=["saved_model.get_tensor_from_tensor_info",
+               "saved_model.utils.get_tensor_from_tensor_info"])
+@deprecation.deprecated(
+    None,
+    "This function will only be available through the v1 compatibility "
+    "library as tf.compat.v1.saved_model.utils.get_tensor_from_tensor_info or "
+    "tf.compat.v1.saved_model.get_tensor_from_tensor_info.")
 def get_tensor_from_tensor_info(tensor_info, graph=None, import_scope=None):
   """Returns the Tensor or SparseTensor described by a TensorInfo proto.
 
@@ -97,6 +141,27 @@ def get_tensor_from_tensor_info(tensor_info, graph=None, import_scope=None):
     raise ValueError("Invalid TensorInfo.encoding: %s" % encoding)
 
 
+def get_element_from_tensor_info(tensor_info, graph=None, import_scope=None):
+  """Returns the element in the graph described by a TensorInfo proto.
+
+  Args:
+    tensor_info: A TensorInfo proto describing an Op or Tensor by name.
+    graph: The tf.Graph in which tensors are looked up. If None, the current
+      default graph is used.
+    import_scope: If not None, names in `tensor_info` are prefixed with this
+      string before lookup.
+
+  Returns:
+    Op or tensor in `graph` described by `tensor_info`.
+
+  Raises:
+    KeyError: If `tensor_info` does not correspond to an op or tensor in `graph`
+  """
+  graph = graph or ops.get_default_graph()
+  return graph.as_graph_element(
+      ops.prepend_name_scope(tensor_info.name, import_scope=import_scope))
+
+
 # Path helpers.
 
 
diff --git a/tensorflow/python/saved_model/utils_test.py b/tensorflow/python/saved_model/utils_test.py
index c9b38ed60323332e430ef109c039898e1f8c8130..2afe8abfd646f26f0562d7cc56b82c5781a586ef 100644
--- a/tensorflow/python/saved_model/utils_test.py
+++ b/tensorflow/python/saved_model/utils_test.py
@@ -19,16 +19,45 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import utils
 
 
 class UtilsTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
+  def testBuildTensorInfoOp(self):
+    x = constant_op.constant(1, name="x")
+    y = constant_op.constant(2, name="y")
+    z = control_flow_ops.group([x, y], name="op_z")
+    z_op_info = utils.build_tensor_info_from_op(z)
+    self.assertEqual("op_z", z_op_info.name)
+    self.assertEqual(types_pb2.DT_INVALID, z_op_info.dtype)
+    self.assertEqual(0, len(z_op_info.tensor_shape.dim))
+
+  @test_util.run_v1_only("b/120545219")
+  def testBuildTensorInfoDefunOp(self):
+    @function.defun
+    def my_init_fn(x, y):
+      self.x_var = x
+      self.y_var = y
+
+    x = constant_op.constant(1, name="x")
+    y = constant_op.constant(2, name="y")
+    init_op_info = utils.build_tensor_info_from_op(my_init_fn(x, y))
+    self.assertEqual("PartitionedFunctionCall", init_op_info.name)
+    self.assertEqual(types_pb2.DT_INVALID, init_op_info.dtype)
+    self.assertEqual(0, len(init_op_info.tensor_shape.dim))
+
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoDense(self):
     x = array_ops.placeholder(dtypes.float32, 1, name="x")
     x_tensor_info = utils.build_tensor_info(x)
@@ -37,6 +66,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(1, len(x_tensor_info.tensor_shape.dim))
     self.assertEqual(1, x_tensor_info.tensor_shape.dim[0].size)
 
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoSparse(self):
     x = array_ops.sparse_placeholder(dtypes.float32, [42, 69], name="x")
     x_tensor_info = utils.build_tensor_info(x)
@@ -51,6 +81,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(42, x_tensor_info.tensor_shape.dim[0].size)
     self.assertEqual(69, x_tensor_info.tensor_shape.dim[1].size)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoDense(self):
     expected = array_ops.placeholder(dtypes.float32, 1, name="x")
     tensor_info = utils.build_tensor_info(expected)
@@ -58,6 +89,7 @@ class UtilsTest(test.TestCase):
     self.assertIsInstance(actual, ops.Tensor)
     self.assertEqual(expected.name, actual.name)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoSparse(self):
     expected = array_ops.sparse_placeholder(dtypes.float32, name="x")
     tensor_info = utils.build_tensor_info(expected)
@@ -97,6 +129,7 @@ class UtilsTest(test.TestCase):
                                                  import_scope="foo")
       self.assertEqual(expected.name, actual.name)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoRaisesErrors(self):
     expected = array_ops.placeholder(dtypes.float32, 1, name="x")
     tensor_info = utils.build_tensor_info(expected)
diff --git a/tensorflow/python/summary/README.md b/tensorflow/python/summary/README.md
index 8a5fea0d9a130aae59e3cdc6093977f51c27e832..ab6e89e5c95e4d76f4b86ca9355428e93704d78b 100644
--- a/tensorflow/python/summary/README.md
+++ b/tensorflow/python/summary/README.md
@@ -8,8 +8,3 @@ events files.
 If you wish to load TensorFlow events, you should use an EventAccumulator
 (to load from a single events file) or an EventMultiplexer (to load from
 multiple events files).
-
-The API around these tools has not solidified, and we may make backwards-
-incompatible changes without warning.
-
-If you have questions or requests, please contact danmane@google.com
diff --git a/tensorflow/python/summary/plugin_asset.py b/tensorflow/python/summary/plugin_asset.py
index 998fb30fa491bd35bd449fc0f7e3bc358c668074..82d3a618304fb914f81c72c452e57a7d553ff186 100644
--- a/tensorflow/python/summary/plugin_asset.py
+++ b/tensorflow/python/summary/plugin_asset.py
@@ -32,6 +32,8 @@ from __future__ import print_function
 
 import abc
 
+import six
+
 from tensorflow.python.framework import ops
 
 _PLUGIN_ASSET_PREFIX = "__tensorboard_plugin_asset__"
@@ -107,6 +109,7 @@ def get_all_plugin_assets(graph=None):
   return out
 
 
+@six.add_metaclass(abc.ABCMeta)
 class PluginAsset(object):
   """This abstract base class allows TensorBoard to serialize assets to disk.
 
@@ -124,7 +127,6 @@ class PluginAsset(object):
     writer calls assets and the PluginAsset instance provides its contents to be
     written to disk.
   """
-  __metaclass__ = abc.ABCMeta
 
   plugin_name = None
 
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index fbae2b77fafaac921f4419df4b8fa4378f9554b1..0c13016712f316e113723c4c0c250ef636a3fcf0 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -28,12 +28,12 @@ from google.protobuf import json_format as _json_format
 # pylint: disable=unused-import
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.framework.summary_pb2 import SummaryDescription
+from tensorflow.core.framework.summary_pb2 import SummaryMetadata as _SummaryMetadata  # pylint: enable=unused-import
 from tensorflow.core.util.event_pb2 import Event
 from tensorflow.core.util.event_pb2 import SessionLog
 from tensorflow.core.util.event_pb2 import TaggedRunMetadata
 # pylint: enable=unused-import
 
-
 from tensorflow.python.eager import context as _context
 from tensorflow.python.framework import constant_op as _constant_op
 from tensorflow.python.framework import dtypes as _dtypes
@@ -42,16 +42,6 @@ from tensorflow.python.ops import gen_logging_ops as _gen_logging_ops
 from tensorflow.python.ops import gen_summary_ops as _gen_summary_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import summary_op_util as _summary_op_util
 
-# exports tensor-related summaries
-# pylint: disable=unused-import
-from tensorflow.python.ops.summary_ops import tensor_summary
-# pylint: enable=unused-import
-
-# exports text
-# pylint: disable=unused-import
-from tensorflow.python.summary.text_summary import text_summary as text
-# pylint: enable=unused-import
-
 # exports FileWriter, FileWriterCache
 # pylint: disable=unused-import
 from tensorflow.python.summary.writer.writer import FileWriter
@@ -62,7 +52,7 @@ from tensorflow.python.util import compat as _compat
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('summary.scalar')
+@tf_export(v1=['summary.scalar'])
 def scalar(name, tensor, collections=None, family=None):
   """Outputs a `Summary` protocol buffer containing a single scalar value.
 
@@ -92,7 +82,7 @@ def scalar(name, tensor, collections=None, family=None):
   return val
 
 
-@tf_export('summary.image')
+@tf_export(v1=['summary.image'])
 def image(name, tensor, max_outputs=3, collections=None, family=None):
   """Outputs a `Summary` protocol buffer with images.
 
@@ -148,7 +138,7 @@ def image(name, tensor, max_outputs=3, collections=None, family=None):
   return val
 
 
-@tf_export('summary.histogram')
+@tf_export(v1=['summary.histogram'])
 def histogram(name, values, collections=None, family=None):
   # pylint: disable=line-too-long
   """Outputs a `Summary` protocol buffer with a histogram.
@@ -189,7 +179,7 @@ def histogram(name, values, collections=None, family=None):
   return val
 
 
-@tf_export('summary.audio')
+@tf_export(v1=['summary.audio'])
 def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
           family=None):
   # pylint: disable=line-too-long
@@ -238,7 +228,104 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
   return val
 
 
-@tf_export('summary.merge')
+@tf_export(v1=['summary.text'])
+def text(name, tensor, collections=None):
+  """Summarizes textual data.
+
+  Text data summarized via this plugin will be visible in the Text Dashboard
+  in TensorBoard. The standard TensorBoard Text Dashboard will render markdown
+  in the strings, and will automatically organize 1d and 2d tensors into tables.
+  If a tensor with more than 2 dimensions is provided, a 2d subarray will be
+  displayed along with a warning message. (Note that this behavior is not
+  intrinsic to the text summary api, but rather to the default TensorBoard text
+  plugin.)
+
+  Args:
+    name: A name for the generated node. Will also serve as a series name in
+      TensorBoard.
+    tensor: a string-type Tensor to summarize.
+    collections: Optional list of ops.GraphKeys.  The collections to add the
+      summary to.  Defaults to [_ops.GraphKeys.SUMMARIES]
+
+  Returns:
+    A TensorSummary op that is configured so that TensorBoard will recognize
+    that it contains textual data. The TensorSummary is a scalar `Tensor` of
+    type `string` which contains `Summary` protobufs.
+
+  Raises:
+    ValueError: If tensor has the wrong type.
+  """
+  if tensor.dtype != _dtypes.string:
+    raise ValueError('Expected tensor %s to have dtype string, got %s' %
+                     (tensor.name, tensor.dtype))
+
+  summary_metadata = _SummaryMetadata(
+      plugin_data=_SummaryMetadata.PluginData(plugin_name='text'))
+  t_summary = tensor_summary(
+      name=name,
+      tensor=tensor,
+      summary_metadata=summary_metadata,
+      collections=collections)
+  return t_summary
+
+
+@tf_export(v1=['summary.tensor_summary'])
+def tensor_summary(name,
+                   tensor,
+                   summary_description=None,
+                   collections=None,
+                   summary_metadata=None,
+                   family=None,
+                   display_name=None):
+  """Outputs a `Summary` protocol buffer with a serialized tensor.proto.
+
+  Args:
+    name: A name for the generated node. If display_name is not set, it will
+      also serve as the tag name in TensorBoard. (In that case, the tag
+      name will inherit tf name scopes.)
+    tensor: A tensor of any type and shape to serialize.
+    summary_description: A long description of the summary sequence. Markdown
+      is supported.
+    collections: Optional list of graph collections keys. The new summary op is
+      added to these collections. Defaults to `[GraphKeys.SUMMARIES]`.
+    summary_metadata: Optional SummaryMetadata proto (which describes which
+      plugins may use the summary value).
+    family: Optional; if provided, used as the prefix of the summary tag,
+      which controls the name used for display on TensorBoard when
+      display_name is not set.
+    display_name: A string used to name this data in TensorBoard. If this is
+      not set, then the node name will be used instead.
+
+  Returns:
+    A scalar `Tensor` of type `string`. The serialized `Summary` protocol
+    buffer.
+  """
+
+  if summary_metadata is None:
+    summary_metadata = _SummaryMetadata()
+
+  if summary_description is not None:
+    summary_metadata.summary_description = summary_description
+
+  if display_name is not None:
+    summary_metadata.display_name = display_name
+
+  serialized_summary_metadata = summary_metadata.SerializeToString()
+
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
+  with _summary_op_util.summary_scope(
+      name, family, values=[tensor]) as (tag, scope):
+    val = _gen_logging_ops.tensor_summary_v2(
+        tensor=tensor,
+        tag=tag,
+        name=scope,
+        serialized_summary_metadata=serialized_summary_metadata)
+    _summary_op_util.collect(val, collections, [_ops.GraphKeys.SUMMARIES])
+  return val
+
+
+@tf_export(v1=['summary.merge'])
 def merge(inputs, collections=None, name=None):
   # pylint: disable=line-too-long
   """Merges summaries.
@@ -284,7 +371,7 @@ def merge(inputs, collections=None, name=None):
   return val
 
 
-@tf_export('summary.merge_all')
+@tf_export(v1=['summary.merge_all'])
 def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None, name=None):
   """Merges all summaries collected in the default graph.
 
@@ -317,7 +404,7 @@ def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None, name=None):
     return merge(summary_ops, name=name)
 
 
-@tf_export('summary.get_summary_description')
+@tf_export(v1=['summary.get_summary_description'])
 def get_summary_description(node_def):
   """Given a TensorSummary node_def, retrieve its SummaryDescription.
 
diff --git a/tensorflow/python/summary/summary_test.py b/tensorflow/python/summary/summary_test.py
index ac5eb4dbbe3b652dc69d34922f4dc5d33de5e28a..64f0f315c5888b9dd7d2217693962f30e77b3b08 100644
--- a/tensorflow/python/summary/summary_test.py
+++ b/tensorflow/python/summary/summary_test.py
@@ -12,6 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""Tests for the API surface of the V1 tf.summary ops.
+
+These tests don't check the actual serialized proto summary value for the
+more complex summaries (e.g. audio, image).  Those test live separately in
+tensorflow/python/kernel_tests/summary_v1_*.py.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,16 +27,19 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary as summary_lib
 
 
-class ScalarSummaryTest(test.TestCase):
+class SummaryTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testScalarSummary(self):
     with self.cached_session() as s:
       i = constant_op.constant(3)
@@ -44,6 +53,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(values[0].tag, 'outer/inner')
     self.assertEqual(values[0].simple_value, 3.0)
 
+  @test_util.run_deprecated_v1
   def testScalarSummaryWithFamily(self):
     with self.cached_session() as s:
       i = constant_op.constant(7)
@@ -67,6 +77,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(values[0].tag, 'family/outer/family/inner_1')
     self.assertEqual(values[0].simple_value, 7.0)
 
+  @test_util.run_deprecated_v1
   def testSummarizingVariable(self):
     with self.cached_session() as s:
       c = constant_op.constant(42.0)
@@ -82,6 +93,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(value.tag, 'summary')
     self.assertEqual(value.simple_value, 42.0)
 
+  @test_util.run_deprecated_v1
   def testImageSummary(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 4, 4, 3))
@@ -96,6 +108,7 @@ class ScalarSummaryTest(test.TestCase):
     expected = sorted('outer/inner/image/{}'.format(i) for i in xrange(3))
     self.assertEqual(tags, expected)
 
+  @test_util.run_deprecated_v1
   def testImageSummaryWithFamily(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 2, 3, 1))
@@ -112,6 +125,7 @@ class ScalarSummaryTest(test.TestCase):
                       for i in xrange(3))
     self.assertEqual(tags, expected)
 
+  @test_util.run_deprecated_v1
   def testHistogramSummary(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 4, 4, 3))
@@ -123,6 +137,7 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(len(summary.value), 1)
     self.assertEqual(summary.value[0].tag, 'outer/inner')
 
+  @test_util.run_deprecated_v1
   def testHistogramSummaryWithFamily(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 4, 4, 3))
@@ -135,6 +150,13 @@ class ScalarSummaryTest(test.TestCase):
     self.assertEqual(len(summary.value), 1)
     self.assertEqual(summary.value[0].tag, 'family/outer/family/inner')
 
+  def testHistogramSummaryTypes(self):
+    for dtype in (dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.int32,
+                  dtypes.float32, dtypes.float64):
+      const = constant_op.constant(10, dtype=dtype)
+      summary_lib.histogram('h', const)
+
+  @test_util.run_deprecated_v1
   def testAudioSummary(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 3, 4))
@@ -149,6 +171,7 @@ class ScalarSummaryTest(test.TestCase):
     expected = sorted('outer/inner/audio/{}'.format(i) for i in xrange(3))
     self.assertEqual(tags, expected)
 
+  @test_util.run_deprecated_v1
   def testAudioSummaryWithFamily(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 3, 4))
@@ -165,6 +188,23 @@ class ScalarSummaryTest(test.TestCase):
                       for i in xrange(3))
     self.assertEqual(tags, expected)
 
+  @test_util.run_deprecated_v1
+  def testTextSummary(self):
+    with self.cached_session():
+      with self.assertRaises(ValueError):
+        num = array_ops.constant(1)
+        summary_lib.text('foo', num)
+
+      # The API accepts vectors.
+      arr = array_ops.constant(['one', 'two', 'three'])
+      summ = summary_lib.text('foo', arr)
+      self.assertEqual(summ.op.type, 'TensorSummaryV2')
+
+      # the API accepts scalars
+      summ = summary_lib.text('foo', array_ops.constant('one'))
+      self.assertEqual(summ.op.type, 'TensorSummaryV2')
+
+  @test_util.run_deprecated_v1
   def testSummaryNameConversion(self):
     c = constant_op.constant(3)
     s = summary_lib.scalar('name with spaces', c)
@@ -176,6 +216,7 @@ class ScalarSummaryTest(test.TestCase):
     s3 = summary_lib.scalar('/name/with/leading/slash', c)
     self.assertEqual(s3.op.name, 'name/with/leading/slash')
 
+  @test_util.run_deprecated_v1
   def testSummaryWithFamilyMetaGraphExport(self):
     with ops.name_scope('outer'):
       i = constant_op.constant(11)
diff --git a/tensorflow/python/summary/text_summary.py b/tensorflow/python/summary/text_summary.py
deleted file mode 100644
index 6418c847f3c819cf2491bb449921d15c39eae288..0000000000000000000000000000000000000000
--- a/tensorflow/python/summary/text_summary.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Implements text_summary in TensorFlow, with TensorBoard support.
-
-The text_summary is a wrapper around the generic tensor_summary that takes a
-string-type tensor and emits a TensorSummary op with SummaryMetadata that
-notes that this summary is textual data for the TensorBoard text plugin.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.core.framework import summary_pb2
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops.summary_ops import tensor_summary
-from tensorflow.python.util.tf_export import tf_export
-
-PLUGIN_NAME = "text"
-
-
-@tf_export("summary.text")
-def text_summary(name, tensor, collections=None):
-  """Summarizes textual data.
-
-  Text data summarized via this plugin will be visible in the Text Dashboard
-  in TensorBoard. The standard TensorBoard Text Dashboard will render markdown
-  in the strings, and will automatically organize 1d and 2d tensors into tables.
-  If a tensor with more than 2 dimensions is provided, a 2d subarray will be
-  displayed along with a warning message. (Note that this behavior is not
-  intrinsic to the text summary api, but rather to the default TensorBoard text
-  plugin.)
-
-  Args:
-    name: A name for the generated node. Will also serve as a series name in
-      TensorBoard.
-    tensor: a string-type Tensor to summarize.
-    collections: Optional list of ops.GraphKeys.  The collections to add the
-      summary to.  Defaults to [_ops.GraphKeys.SUMMARIES]
-
-  Returns:
-    A TensorSummary op that is configured so that TensorBoard will recognize
-    that it contains textual data. The TensorSummary is a scalar `Tensor` of
-    type `string` which contains `Summary` protobufs.
-
-  Raises:
-    ValueError: If tensor has the wrong type.
-  """
-  if tensor.dtype != dtypes.string:
-    raise ValueError("Expected tensor %s to have dtype string, got %s" %
-                     (tensor.name, tensor.dtype))
-
-  summary_metadata = summary_pb2.SummaryMetadata(
-      plugin_data=summary_pb2.SummaryMetadata.PluginData(
-          plugin_name=PLUGIN_NAME))
-  t_summary = tensor_summary(
-      name=name,
-      tensor=tensor,
-      summary_metadata=summary_metadata,
-      collections=collections)
-  return t_summary
diff --git a/tensorflow/python/summary/text_summary_test.py b/tensorflow/python/summary/text_summary_test.py
deleted file mode 100644
index 5b0db43cc1caeb7eb847ea53df57b8d49a302e08..0000000000000000000000000000000000000000
--- a/tensorflow/python/summary/text_summary_test.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import googletest
-from tensorflow.python.summary import text_summary
-
-
-class TextPluginTest(test_util.TensorFlowTestCase):
-  """Test the Text Summary API.
-
-  These tests are focused on testing the API design of the text_summary method.
-  It doesn't test the PluginAsset and tensors registry functionality, because
-  that is better tested by the text_plugin test that actually consumes that
-  metadata.
-  """
-
-  def testTextSummaryAPI(self):
-    with self.cached_session():
-
-      with self.assertRaises(ValueError):
-        num = array_ops.constant(1)
-        text_summary.text_summary("foo", num)
-
-      # The API accepts vectors.
-      arr = array_ops.constant(["one", "two", "three"])
-      summ = text_summary.text_summary("foo", arr)
-      self.assertEqual(summ.op.type, "TensorSummaryV2")
-
-      # the API accepts scalars
-      summ = text_summary.text_summary("foo", array_ops.constant("one"))
-      self.assertEqual(summ.op.type, "TensorSummaryV2")
-
-
-if __name__ == "__main__":
-  googletest.main()
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 16b8626476eb1d43a800c9f41704971ecf5992ae..78217b503ffac90811c6ae8316bc0c0b907e7bf7 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import os.path
 import time
+import warnings
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import summary_pb2
@@ -364,6 +365,8 @@ class FileWriter(SummaryToEventTransformer):
     else:
       event_writer = EventFileWriter(logdir, max_queue, flush_secs,
                                      filename_suffix)
+
+    self._closed = False
     super(FileWriter, self).__init__(event_writer, graph, graph_def)
 
   def __enter__(self):
@@ -378,12 +381,23 @@ class FileWriter(SummaryToEventTransformer):
     """Returns the directory where event file will be written."""
     return self.event_writer.get_logdir()
 
+  def _warn_if_event_writer_is_closed(self):
+    if self._closed:
+      warnings.warn("Attempting to use a closed FileWriter. "
+                    "The operation will be a noop unless the FileWriter "
+                    "is explicitly reopened.")
+
+  def _add_event(self, event, step):
+    self._warn_if_event_writer_is_closed()
+    super(FileWriter, self)._add_event(event, step)
+
   def add_event(self, event):
     """Adds an event to the event file.
 
     Args:
       event: An `Event` protocol buffer.
     """
+    self._warn_if_event_writer_is_closed()
     self.event_writer.add_event(event)
 
   def flush(self):
@@ -392,6 +406,9 @@ class FileWriter(SummaryToEventTransformer):
     Call this method to make sure that all pending events have been written to
     disk.
     """
+    # Flushing a closed EventFileWriterV2 raises an exception. It is,
+    # however, a noop for EventFileWriter.
+    self._warn_if_event_writer_is_closed()
     self.event_writer.flush()
 
   def close(self):
@@ -400,6 +417,7 @@ class FileWriter(SummaryToEventTransformer):
     Call this method when you do not need the summary writer anymore.
     """
     self.event_writer.close()
+    self._closed = True
 
   def reopen(self):
     """Reopens the EventFileWriter.
@@ -410,3 +428,4 @@ class FileWriter(SummaryToEventTransformer):
     Does nothing if the EventFileWriter was not closed.
     """
     self.event_writer.reopen()
+    self._closed = False
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 670230e917eb3325636f05b90cc363190e96738c..d702ddc0a274cc22798519319220dbd37046c580 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -22,6 +22,7 @@ import glob
 import os.path
 import shutil
 import time
+import warnings
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import summary_pb2
@@ -34,6 +35,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -99,6 +101,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testAddingSummaryGraphAndRunMetadata(self):
     test_dir = self._CleanTestDir("basics")
     sw = self._FileWriter(test_dir)
@@ -172,6 +175,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testGraphAsNamed(self):
     test_dir = self._CleanTestDir("basics_named_graph")
     with ops.Graph().as_default() as g:
@@ -180,6 +184,7 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
+  @test_util.run_deprecated_v1
   def testGraphAsPositional(self):
     test_dir = self._CleanTestDir("basics_positional_graph")
     with ops.Graph().as_default() as g:
@@ -188,6 +193,7 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
+  @test_util.run_deprecated_v1
   def testGraphDefAsNamed(self):
     test_dir = self._CleanTestDir("basics_named_graph_def")
     with ops.Graph().as_default() as g:
@@ -197,6 +203,7 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
+  @test_util.run_deprecated_v1
   def testGraphDefAsPositional(self):
     test_dir = self._CleanTestDir("basics_positional_graph_def")
     with ops.Graph().as_default() as g:
@@ -206,6 +213,7 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
+  @test_util.run_deprecated_v1
   def testGraphAndGraphDef(self):
     with self.assertRaises(ValueError):
       test_dir = self._CleanTestDir("basics_graph_and_graph_def")
@@ -215,12 +223,14 @@ class FileWriterTestCase(test.TestCase):
       sw = self._FileWriter(test_dir, graph=g, graph_def=gd)
       sw.close()
 
+  @test_util.run_deprecated_v1
   def testNeitherGraphNorGraphDef(self):
     with self.assertRaises(TypeError):
       test_dir = self._CleanTestDir("basics_string_instead_of_graph")
       sw = self._FileWriter(test_dir, "string instead of graph object")
       sw.close()
 
+  @test_util.run_deprecated_v1
   def testCloseAndReopen(self):
     test_dir = self._CleanTestDir("close_and_reopen")
     sw = self._FileWriter(test_dir)
@@ -264,6 +274,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testNonBlockingClose(self):
     test_dir = self._CleanTestDir("non_blocking_close")
     sw = self._FileWriter(test_dir)
@@ -273,6 +284,23 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertRecent(time_before_close)
 
+  @test_util.run_deprecated_v1
+  def testUseAfterClose(self):
+    test_dir = self._CleanTestDir("use_after_close")
+    sw = self._FileWriter(test_dir)
+    sw.close()
+    with warnings.catch_warnings(record=True) as triggered:
+      warnings.simplefilter("always")
+      self.assertFalse(triggered)
+      sw.add_summary(summary_pb2.Summary())
+      sw.add_session_log(event_pb2.SessionLog())
+      sw.add_graph(ops.Graph())
+
+    self.assertEqual(len(triggered), 3)
+    for w in triggered:
+      self.assertEqual(w.category, UserWarning)
+
+  @test_util.run_deprecated_v1
   def testWithStatement(self):
     test_dir = self._CleanTestDir("with_statement")
     with self._FileWriter(test_dir) as sw:
@@ -283,6 +311,7 @@ class FileWriterTestCase(test.TestCase):
   # Checks that values returned from session Run() calls are added correctly to
   # summaries.  These are numpy types so we need to check they fit in the
   # protocol buffers correctly.
+  @test_util.run_deprecated_v1
   def testAddingSummariesFromSessionRunCalls(self):
     test_dir = self._CleanTestDir("global_step")
     sw = self._FileWriter(test_dir)
@@ -293,12 +322,11 @@ class FileWriterTestCase(test.TestCase):
       summ = summary_pb2.Summary(
           value=[summary_pb2.Summary.Value(
               tag="i", simple_value=1.0)])
-      sw.add_summary(summ.SerializeToString(), i.eval())
+      sw.add_summary(summ.SerializeToString(), self.evaluate(i))
       sw.add_summary(
           summary_pb2.Summary(
-              value=[summary_pb2.Summary.Value(
-                  tag="l", simple_value=2.0)]),
-          l.eval())
+              value=[summary_pb2.Summary.Value(tag="l", simple_value=2.0)]),
+          self.evaluate(l))
       sw.close()
 
     rr = self._EventsReader(test_dir)
@@ -330,6 +358,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testPluginMetadataStrippedFromSubsequentEvents(self):
     test_dir = self._CleanTestDir("basics")
     sw = self._FileWriter(test_dir)
@@ -389,6 +418,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testFileWriterWithSuffix(self):
     test_dir = self._CleanTestDir("test_suffix")
     sw = self._FileWriter(test_dir, filename_suffix="_test_suffix")
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index 39174fa5890c9cfbaf0f7139f0ba6f853bc303e5..4e1bf3d8362dbcf78a1aa93b620694603d3a9532 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -38,7 +38,6 @@ limitations under the License.
 
 %include "tensorflow/python/lib/io/file_io.i"
 %include "tensorflow/python/training/quantize_training.i"
-%include "tensorflow/python/training/server_lib.i"
 
 %include "tensorflow/python/framework/python_op_gen.i"
 
diff --git a/tensorflow/python/tf2.py b/tensorflow/python/tf2.py
new file mode 100644
index 0000000000000000000000000000000000000000..75748f8f2c5ba2b78a2d220011e3e28e12276b62
--- /dev/null
+++ b/tensorflow/python/tf2.py
@@ -0,0 +1,45 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tools to help with the TensorFlow 2.0 transition.
+
+This module is meant for TensorFlow internal implementation, not for users of
+the TensorFlow library. For that see tf.compat instead.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+
+_force_enable = False
+
+
+def enable():
+  """Enables v2 behaviors."""
+  global _force_enable
+  _force_enable = True
+
+
+def disable():
+  """Disables v2 behaviors (TF2_BEHAVIOR env variable is still respected)."""
+  global _force_enable
+  _force_enable = False
+
+
+def enabled():
+  """Returns True iff TensorFlow 2.0 behavior should be enabled."""
+  return _force_enable or os.getenv("TF2_BEHAVIOR", "0") != "0"
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 384c7a82d27b786839545a6ad979e12a73ee88c1..901d6bc335f3a10439e2f02d0db2b237a89fece0 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -29,6 +29,8 @@ py_library(
         ":optimize_for_inference_lib",
         ":selective_registration_header_lib",
         ":strip_unused_lib",
+        # Include the TF upgrade script to users can run it directly after install TF
+        "//tensorflow/tools/compatibility:tf_upgrade_v2",
     ],
 )
 
diff --git a/tensorflow/python/tools/api/generator/BUILD b/tensorflow/python/tools/api/generator/BUILD
index 90be2cc4f74d652863d138df36061028f8f78380..9fd069c5be0e61083e38ecdb2f974f9d38ee9216 100644
--- a/tensorflow/python/tools/api/generator/BUILD
+++ b/tensorflow/python/tools/api/generator/BUILD
@@ -4,7 +4,6 @@
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
-load("//tensorflow/python/tools/api/generator:api_gen.bzl", "ESTIMATOR_API_INIT_FILES")
 load("//tensorflow/python/tools/api/generator:api_init_files.bzl", "TENSORFLOW_API_INIT_FILES")
 load("//tensorflow/python/tools/api/generator:api_init_files_v1.bzl", "TENSORFLOW_API_INIT_FILES_V1")
 
@@ -66,23 +65,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "estimator_doc_srcs_test",
-    srcs = ["doc_srcs_test.py"],
-    args = [
-        "--package=tensorflow.python.estimator",
-        "--api_name=estimator",
-    ] + ESTIMATOR_API_INIT_FILES,
-    main = "doc_srcs_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":doc_srcs",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:no_contrib",
-        "//tensorflow/python/estimator:estimator_py",
-    ],
-)
-
 py_test(
     name = "output_init_files_test",
     srcs = ["output_init_files_test.py"],
diff --git a/tensorflow/python/tools/api/generator/api_gen.bzl b/tensorflow/python/tools/api/generator/api_gen.bzl
index 271cf2afafbd4ed0d031e73264cf2c344782cfb6..5e64cc64d2408fa459b6daa0c9134793bd9d5327 100644
--- a/tensorflow/python/tools/api/generator/api_gen.bzl
+++ b/tensorflow/python/tools/api/generator/api_gen.bzl
@@ -2,16 +2,6 @@
 
 load("//tensorflow/python/tools/api/generator:api_init_files.bzl", "TENSORFLOW_API_INIT_FILES")
 
-# keep sorted
-ESTIMATOR_API_INIT_FILES = [
-    # BEGIN GENERATED ESTIMATOR FILES
-    "__init__.py",
-    "estimator/__init__.py",
-    "estimator/export/__init__.py",
-    "estimator/inputs/__init__.py",
-    # END GENERATED ESTIMATOR FILES
-]
-
 def get_compat_files(
         file_paths,
         compat_api_version):
@@ -26,10 +16,12 @@ def gen_api_init_files(
         api_name = "tensorflow",
         api_version = 2,
         compat_api_versions = [],
-        package = "tensorflow.python",
-        package_dep = "//tensorflow/python:no_contrib",
+        compat_init_templates = [],
+        packages = ["tensorflow.python", "tensorflow.lite.python.lite"],
+        package_deps = ["//tensorflow/python:no_contrib"],
         output_package = "tensorflow",
-        output_dir = ""):
+        output_dir = "",
+        root_file_name = "__init__.py"):
     """Creates API directory structure and __init__.py files.
 
     Creates a genrule that generates a directory structure with __init__.py
@@ -53,36 +45,52 @@ def gen_api_init_files(
       api_version: TensorFlow API version to generate. Must be either 1 or 2.
       compat_api_versions: Older TensorFlow API versions to generate under
         compat/ directory.
-      package: Python package containing the @tf_export decorators you want to
+      compat_init_templates: Python init file that should be used as template
+        for top level __init__.py files under compat/vN directories.
+        "# API IMPORTS PLACEHOLDER" comment inside this
+        template will be replaced with root imports collected by this genrule.
+      packages: Python packages containing the @tf_export decorators you want to
         process
-      package_dep: Python library target containing your package.
+      package_deps: Python library target containing your packages.
       output_package: Package where generated API will be added to.
       output_dir: Subdirectory to output API to.
         If non-empty, must end with '/'.
+      root_file_name: Name of the root file with all the root imports.
     """
     root_init_template_flag = ""
     if root_init_template:
         root_init_template_flag = "--root_init_template=$(location " + root_init_template + ")"
 
-    api_gen_binary_target = ("create_" + package + "_api_%d") % api_version
+    primary_package = packages[0]
+    api_gen_binary_target = ("create_" + primary_package + "_api_%d_%s") % (api_version, name)
     native.py_binary(
         name = api_gen_binary_target,
         srcs = ["//tensorflow/python/tools/api/generator:create_python_api.py"],
         main = "//tensorflow/python/tools/api/generator:create_python_api.py",
         srcs_version = "PY2AND3",
         visibility = ["//visibility:public"],
-        deps = [
-            package_dep,
+        deps = package_deps + [
             "//tensorflow/python:util",
             "//tensorflow/python/tools/api/generator:doc_srcs",
         ],
     )
 
+    # Replace name of root file with root_file_name.
+    output_files = [
+        root_file_name if f == "__init__.py" else f
+        for f in output_files
+    ]
     all_output_files = ["%s%s" % (output_dir, f) for f in output_files]
     compat_api_version_flags = ""
     for compat_api_version in compat_api_versions:
         compat_api_version_flags += " --compat_apiversion=%d" % compat_api_version
 
+    compat_init_template_flags = ""
+    for compat_init_template in compat_init_templates:
+        compat_init_template_flags += (
+            " --compat_init_template=$(location %s)" % compat_init_template
+        )
+
     native.genrule(
         name = name,
         outs = all_output_files,
@@ -90,7 +98,8 @@ def gen_api_init_files(
             "$(location :" + api_gen_binary_target + ") " +
             root_init_template_flag + " --apidir=$(@D)" + output_dir +
             " --apiname=" + api_name + " --apiversion=" + str(api_version) +
-            compat_api_version_flags + " --package=" + package +
+            compat_api_version_flags + " " + compat_init_template_flags +
+            " --package=" + ",".join(packages) +
             " --output_package=" + output_package + " $(OUTS)"
         ),
         srcs = srcs,
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 533a138a39c6c3ad24a8587e6c2eee3a5f52e961..0245ac50a65a99a4e93733de17d680fe816e7db1 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -4,17 +4,17 @@
 TENSORFLOW_API_INIT_FILES = [
     # BEGIN GENERATED FILES
     "__init__.py",
-    "app/__init__.py",
     "bitwise/__init__.py",
     "compat/__init__.py",
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
-    "distributions/__init__.py",
+    "distribute/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
+    "experimental/__init__.py",
     "feature_column/__init__.py",
-    "gfile/__init__.py",
+    "io/gfile/__init__.py",
     "graph_util/__init__.py",
     "image/__init__.py",
     "io/__init__.py",
@@ -44,6 +44,7 @@ TENSORFLOW_API_INIT_FILES = [
     "keras/datasets/mnist/__init__.py",
     "keras/datasets/reuters/__init__.py",
     "keras/estimator/__init__.py",
+    "keras/experimental/__init__.py",
     "keras/initializers/__init__.py",
     "keras/layers/__init__.py",
     "keras/losses/__init__.py",
@@ -58,37 +59,24 @@ TENSORFLOW_API_INIT_FILES = [
     "keras/utils/__init__.py",
     "keras/wrappers/__init__.py",
     "keras/wrappers/scikit_learn/__init__.py",
-    "layers/__init__.py",
     "linalg/__init__.py",
-    "logging/__init__.py",
+    "lite/__init__.py",
+    "lite/constants/__init__.py",
     "losses/__init__.py",
-    "manip/__init__.py",
     "math/__init__.py",
-    "metrics/__init__.py",
     "nn/__init__.py",
     "nn/rnn_cell/__init__.py",
-    "profiler/__init__.py",
-    "python_io/__init__.py",
     "quantization/__init__.py",
     "random/__init__.py",
-    "resource_loader/__init__.py",
-    "strings/__init__.py",
     "saved_model/__init__.py",
-    "saved_model/builder/__init__.py",
-    "saved_model/constants/__init__.py",
-    "saved_model/loader/__init__.py",
-    "saved_model/main_op/__init__.py",
-    "saved_model/signature_constants/__init__.py",
-    "saved_model/signature_def_utils/__init__.py",
-    "saved_model/tag_constants/__init__.py",
-    "saved_model/utils/__init__.py",
     "sets/__init__.py",
+    "signal/__init__.py",
     "sparse/__init__.py",
-    "spectral/__init__.py",
+    "strings/__init__.py",
     "summary/__init__.py",
     "sysconfig/__init__.py",
     "test/__init__.py",
     "train/__init__.py",
-    "user_ops/__init__.py",
+    "version/__init__.py",
     # END GENERATED FILES
 ]
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index 0747424eabbd9b63d30ac4c990a8d071fea91f28..e35b9c43740d4e59e9478cca978b15c7451ac96e 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -10,11 +10,14 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
+    "distribute/__init__.py",
     "distributions/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
+    "experimental/__init__.py",
     "feature_column/__init__.py",
     "gfile/__init__.py",
+    "io/gfile/__init__.py",
     "graph_util/__init__.py",
     "image/__init__.py",
     "io/__init__.py",
@@ -44,6 +47,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "keras/datasets/mnist/__init__.py",
     "keras/datasets/reuters/__init__.py",
     "keras/estimator/__init__.py",
+    "keras/experimental/__init__.py",
     "keras/initializers/__init__.py",
     "keras/layers/__init__.py",
     "keras/losses/__init__.py",
@@ -59,7 +63,10 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "keras/wrappers/__init__.py",
     "keras/wrappers/scikit_learn/__init__.py",
     "layers/__init__.py",
+    "layers/experimental/__init__.py",
     "linalg/__init__.py",
+    "lite/__init__.py",
+    "lite/constants/__init__.py",
     "logging/__init__.py",
     "losses/__init__.py",
     "manip/__init__.py",
@@ -76,6 +83,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "saved_model/__init__.py",
     "saved_model/builder/__init__.py",
     "saved_model/constants/__init__.py",
+    "saved_model/experimental/__init__.py",
     "saved_model/loader/__init__.py",
     "saved_model/main_op/__init__.py",
     "saved_model/signature_constants/__init__.py",
@@ -83,6 +91,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "saved_model/tag_constants/__init__.py",
     "saved_model/utils/__init__.py",
     "sets/__init__.py",
+    "signal/__init__.py",
     "sparse/__init__.py",
     "spectral/__init__.py",
     "summary/__init__.py",
@@ -91,5 +100,6 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "train/__init__.py",
     "train/queue_runner/__init__.py",
     "user_ops/__init__.py",
+    "version/__init__.py",
     # END GENERATED FILES
 ]
diff --git a/tensorflow/python/tools/api/generator/create_python_api.py b/tensorflow/python/tools/api/generator/create_python_api.py
index ab749f28cd182dee4fd153a711ea086f400a32ba..51c2bfba7c13dee1c321f157fda3e221726f79b8 100644
--- a/tensorflow/python/tools/api/generator/create_python_api.py
+++ b/tensorflow/python/tools/api/generator/create_python_api.py
@@ -45,10 +45,10 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit.
 \"\"\"%s
 \"\"\"
 
-from __future__ import print_function
+from __future__ import print_function as _print_function
 
 """
-_GENERATED_FILE_FOOTER = '\n\ndel print_function\n'
+_GENERATED_FILE_FOOTER = '\n\ndel _print_function\n'
 
 
 class SymbolExposedTwiceError(Exception):
@@ -261,15 +261,18 @@ def add_imports_for_symbol(
           id(symbol), dest_module, source_module_name, source_name, dest_name)
 
 
-def get_api_init_text(
-    package, output_package, api_name, api_version, compat_api_versions=None):
+def get_api_init_text(packages,
+                      output_package,
+                      api_name,
+                      api_version,
+                      compat_api_versions=None):
   """Get a map from destination module to __init__.py code for that module.
 
   Args:
-    package: Base python package containing python with target tf_export
+    packages: Base python packages containing python with target tf_export
       decorators.
-    output_package: Base output python package where generated API will
-      be added.
+    output_package: Base output python package where generated API will be
+      added.
     api_name: API you want to generate (e.g. `tensorflow` or `estimator`).
     api_version: API version you want to generate (1 or 2).
     compat_api_versions: Additional API versions to generate under compat/
@@ -286,13 +289,18 @@ def get_api_init_text(
   module_code_builder = _ModuleInitCodeBuilder(output_package)
   # Traverse over everything imported above. Specifically,
   # we want to traverse over TensorFlow Python modules.
+
+  def in_packages(m):
+    return any(package in m for package in packages)
+
   for module in list(sys.modules.values()):
     # Only look at tensorflow modules.
     if (not module or not hasattr(module, '__name__') or
-        module.__name__ is None or package not in module.__name__):
+        module.__name__ is None or not in_packages(module.__name__)):
       continue
     # Do not generate __init__.py files for contrib modules for now.
-    if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
+    if (('.contrib.' in module.__name__ or module.__name__.endswith('.contrib'))
+        and '.lite' not in module.__name__):
       continue
 
     for module_contents_name in dir(module):
@@ -378,20 +386,14 @@ def get_module_docstring(module_name, package, api_name):
   return 'Public API for tf.%s namespace.' % module_name
 
 
-def create_api_files(
-    output_files,
-    package,
-    root_init_template,
-    output_dir,
-    output_package,
-    api_name,
-    api_version,
-    compat_api_versions):
+def create_api_files(output_files, packages, root_init_template, output_dir,
+                     output_package, api_name, api_version,
+                     compat_api_versions, compat_init_templates):
   """Creates __init__.py files for the Python API.
 
   Args:
     output_files: List of __init__.py file paths to create.
-    package: Base python package containing python with target tf_export
+    packages: Base python packages containing python with target tf_export
       decorators.
     root_init_template: Template for top-level __init__.py file.
       "# API IMPORTS PLACEHOLDER" comment in the template file will be replaced
@@ -402,6 +404,8 @@ def create_api_files(
     api_version: API version to generate (`v1` or `v2`).
     compat_api_versions: Additional API versions to generate in compat/
       subdirectory.
+    compat_init_templates: List of templates for top level compat init files
+      in the same order as compat_api_versions.
 
   Raises:
     ValueError: if output_files list is missing a required file.
@@ -417,14 +421,18 @@ def create_api_files(
       os.makedirs(os.path.dirname(file_path))
     open(file_path, 'a').close()
 
-  module_text_map = get_api_init_text(
-      package, output_package, api_name, api_version, compat_api_versions)
+  module_text_map = get_api_init_text(packages, output_package, api_name,
+                                      api_version, compat_api_versions)
 
   # Add imports to output files.
   missing_output_files = []
   # Root modules are "" and "compat.v*".
-  root_modules = set(_COMPAT_MODULE_TEMPLATE % v for v in compat_api_versions)
-  root_modules.add('')
+  root_module = ''
+  compat_module_to_template = {
+      _COMPAT_MODULE_TEMPLATE % v: t
+      for v, t in zip(compat_api_versions, compat_init_templates)
+  }
+
   for module, text in module_text_map.items():
     # Make sure genrule output file list is in sync with API exports.
     if module not in module_name_to_file_path:
@@ -434,23 +442,30 @@ def create_api_files(
       continue
 
     contents = ''
-    if module not in root_modules or not root_init_template:
-      contents = (
-          _GENERATED_FILE_HEADER %
-          get_module_docstring(module, package, api_name) +
-          text + _GENERATED_FILE_FOOTER)
-    else:
-      # Read base init file
+    if module == root_module and root_init_template:
+      # Read base init file for root module
       with open(root_init_template, 'r') as root_init_template_file:
         contents = root_init_template_file.read()
         contents = contents.replace('# API IMPORTS PLACEHOLDER', text)
+    elif module in compat_module_to_template:
+      # Read base init file for compat module
+      with open(compat_module_to_template[module], 'r') as init_template_file:
+        contents = init_template_file.read()
+        contents = contents.replace('# API IMPORTS PLACEHOLDER', text)
+    else:
+      contents = (
+          _GENERATED_FILE_HEADER % get_module_docstring(
+              module, packages[0], api_name) + text + _GENERATED_FILE_FOOTER)
     with open(module_name_to_file_path[module], 'w') as fp:
       fp.write(contents)
 
   if missing_output_files:
     raise ValueError(
-        'Missing outputs for genrule:\n%s.' %
-        ',\n'.join(sorted(missing_output_files)))
+        """Missing outputs for genrule:\n%s. Be sure to add these targets to
+tensorflow/python/tools/api/generator/api_init_files_v1.bzl and
+tensorflow/python/tools/api/generator/api_init_files.bzl (tensorflow repo), or
+tensorflow_estimator/python/estimator/api/api_gen.bzl (estimator repo)"""
+        % ',\n'.join(sorted(missing_output_files)))
 
 
 def main():
@@ -462,9 +477,11 @@ def main():
       'output. If multiple files are passed in, then we assume output files '
       'are listed directly as arguments.')
   parser.add_argument(
-      '--package', default=_DEFAULT_PACKAGE, type=str,
-      help='Base package that imports modules containing the target tf_export '
-           'decorators.')
+      '--packages',
+      default=_DEFAULT_PACKAGE,
+      type=str,
+      help='Base packages that import modules containing the target tf_export '
+      'decorators.')
   parser.add_argument(
       '--root_init_template', default='', type=str,
       help='Template for top level __init__.py file. '
@@ -486,6 +503,11 @@ def main():
       '--compat_apiversions', default=[], type=int, action='append',
       help='Additional versions to generate in compat/ subdirectory. '
            'If set to 0, then no additional version would be generated.')
+  parser.add_argument(
+      '--compat_init_templates', default=[], type=str, action='append',
+      help='Templates for top-level __init__ files under compat modules. '
+           'The list of init file templates must be in the same order as '
+           'list of versions passed with compat_apiversions.')
   parser.add_argument(
       '--output_package', default='tensorflow', type=str,
       help='Root output package.')
@@ -500,10 +522,12 @@ def main():
     outputs = args.outputs
 
   # Populate `sys.modules` with modules containing tf_export().
-  importlib.import_module(args.package)
-  create_api_files(outputs, args.package, args.root_init_template,
-                   args.apidir, args.output_package, args.apiname,
-                   args.apiversion, args.compat_apiversions)
+  packages = args.packages.split(',')
+  for package in packages:
+    importlib.import_module(package)
+  create_api_files(outputs, packages, args.root_init_template, args.apidir,
+                   args.output_package, args.apiname, args.apiversion,
+                   args.compat_apiversions, args.compat_init_templates)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/tools/api/generator/create_python_api_test.py b/tensorflow/python/tools/api/generator/create_python_api_test.py
index 95ef8bbb0f6aa83e99e0b702f4a70a909f05d741..3c0df6365c59c7f710ae88eadea67ba9cc041b21 100644
--- a/tensorflow/python/tools/api/generator/create_python_api_test.py
+++ b/tensorflow/python/tools/api/generator/create_python_api_test.py
@@ -57,9 +57,10 @@ class CreatePythonApiTest(test.TestCase):
 
   def testFunctionImportIsAdded(self):
     imports = create_python_api.get_api_init_text(
-        package=create_python_api._DEFAULT_PACKAGE,
+        packages=[create_python_api._DEFAULT_PACKAGE],
         output_package='tensorflow',
-        api_name='tensorflow', api_version=1)
+        api_name='tensorflow',
+        api_version=1)
     expected_import = (
         'from tensorflow.python.test_module '
         'import test_op as test_op1')
@@ -78,9 +79,10 @@ class CreatePythonApiTest(test.TestCase):
 
   def testClassImportIsAdded(self):
     imports = create_python_api.get_api_init_text(
-        package=create_python_api._DEFAULT_PACKAGE,
+        packages=[create_python_api._DEFAULT_PACKAGE],
         output_package='tensorflow',
-        api_name='tensorflow', api_version=2)
+        api_name='tensorflow',
+        api_version=2)
     expected_import = ('from tensorflow.python.test_module '
                        'import TestClass')
     self.assertTrue(
@@ -89,9 +91,10 @@ class CreatePythonApiTest(test.TestCase):
 
   def testConstantIsAdded(self):
     imports = create_python_api.get_api_init_text(
-        package=create_python_api._DEFAULT_PACKAGE,
+        packages=[create_python_api._DEFAULT_PACKAGE],
         output_package='tensorflow',
-        api_name='tensorflow', api_version=1)
+        api_name='tensorflow',
+        api_version=1)
     expected = ('from tensorflow.python.test_module '
                 'import _TEST_CONSTANT')
     self.assertTrue(expected in str(imports),
@@ -99,7 +102,7 @@ class CreatePythonApiTest(test.TestCase):
 
   def testCompatModuleIsAdded(self):
     imports = create_python_api.get_api_init_text(
-        package=create_python_api._DEFAULT_PACKAGE,
+        packages=[create_python_api._DEFAULT_PACKAGE],
         output_package='tensorflow',
         api_name='tensorflow',
         api_version=2,
diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index fbec9c6635c060aa846c704f49921a4b5ceed42c..abb5886deb3d9dd2e6981ee5822b0323a87eef1d 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -35,10 +35,11 @@ DocSource.__new__.__defaults__ = (None,) * len(DocSource._fields)
 
 _TENSORFLOW_DOC_SOURCES = {
     'app': DocSource(docstring_module_name='platform.app'),
+    'bitwise': DocSource(docstring_module_name='ops.bitwise_ops'),
     'compat': DocSource(docstring_module_name='util.compat'),
+    'distribute': DocSource(docstring_module_name='distribute.distribute_lib'),
     'distributions': DocSource(
         docstring_module_name='ops.distributions.distributions'),
-    'bitwise': DocSource(docstring_module_name='ops.bitwise_ops'),
     'errors': DocSource(docstring_module_name='framework.errors'),
     'gfile': DocSource(docstring_module_name='platform.gfile'),
     'graph_util': DocSource(docstring_module_name='framework.graph_util'),
@@ -56,8 +57,8 @@ _TENSORFLOW_DOC_SOURCES = {
     'resource_loader': DocSource(
         docstring_module_name='platform.resource_loader'),
     'sets': DocSource(docstring_module_name='ops.sets'),
+    'signal': DocSource(docstring_module_name='ops.signal.signal'),
     'sparse': DocSource(docstring_module_name='ops.sparse_ops'),
-    'spectral': DocSource(docstring_module_name='ops.spectral_ops'),
     'strings': DocSource(docstring_module_name='ops.string_ops'),
     'sysconfig': DocSource(docstring_module_name='platform.sysconfig'),
     'test': DocSource(docstring_module_name='platform.test'),
diff --git a/tensorflow/python/tools/api/generator/output_init_files_test.py b/tensorflow/python/tools/api/generator/output_init_files_test.py
index 602ad165c0c1a0e39cce5e7f4eac8cabad2c2e7b..ab154af9101e32ecacda276004b0e2c39ced0b83 100644
--- a/tensorflow/python/tools/api/generator/output_init_files_test.py
+++ b/tensorflow/python/tools/api/generator/output_init_files_test.py
@@ -19,6 +19,12 @@ from __future__ import print_function
 
 import sys
 
+# The unused imports are needed so that the python and lite modules are
+# available in sys.modules
+# pylint: disable=unused-import
+from tensorflow import python as _tf_for_api_traversal
+from tensorflow.lite.python import lite as _tflite_for_api_traversal
+# pylint: enable=unused-import
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_decorator
 
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index 5dc14a6961e1ba7f1c317519a2d3b63eacba2220..de2672db3c4c4e6b94d3803767a749a943910d2c 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -161,9 +161,11 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
             },)
         builder.save(as_text=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testFreezeGraphV1(self):
     self._testFreezeGraph(saver_pb2.SaverDef.V1)
 
+  @test_util.run_v1_only("b/120545219")
   def testFreezeGraphV2(self):
     self._testFreezeGraph(saver_pb2.SaverDef.V2)
 
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index 6504fbc10755c5c543016b8d56d6d53f3311b249..ea1f6aa55553f0d35e526557ca114f9929b8af7d 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -63,7 +63,7 @@ def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors,
       print("It's likely that your checkpoint file has been compressed "
             "with SNAPPY.")
     if ("Data loss" in str(e) and
-        (any([e in file_name for e in [".index", ".meta", ".data"]]))):
+        any(e in file_name for e in [".index", ".meta", ".data"])):
       proposed_file = ".".join(file_name.split(".")[0:-1])
       v2_file_error_template = """
 It's likely that this is a V2 checkpoint and you need to provide the filename
diff --git a/tensorflow/python/tools/optimize_for_inference.py b/tensorflow/python/tools/optimize_for_inference.py
index dac6a06a89c7596dd66d0ed7a2e5a59a0ba9b9dd..fbf8c2d70999cc5a92c220754b0f8e2287fb6644 100644
--- a/tensorflow/python/tools/optimize_for_inference.py
+++ b/tensorflow/python/tools/optimize_for_inference.py
@@ -88,7 +88,7 @@ def main(unused_args):
       input_graph_def,
       FLAGS.input_names.split(","),
       FLAGS.output_names.split(","),
-      FLAGS.placeholder_type_enum,
+      _parse_placeholder_types(FLAGS.placeholder_type_enum),
       FLAGS.toco_compatible)
 
   if FLAGS.frozen_graph:
@@ -101,6 +101,12 @@ def main(unused_args):
   return 0
 
 
+def _parse_placeholder_types(values):
+  """Extracts placeholder types from a comma separate list."""
+  values = [int(value) for value in values.split(",")]
+  return values if len(values) > 1 else values[0]
+
+
 def parse_args():
   """Parses command line arguments."""
   parser = argparse.ArgumentParser()
@@ -137,9 +143,12 @@ def parse_args():
       """)
   parser.add_argument(
       "--placeholder_type_enum",
-      type=int,
-      default=dtypes.float32.as_datatype_enum,
-      help="The AttrValue enum to use for placeholders.")
+      type=str,
+      default=str(dtypes.float32.as_datatype_enum),
+      help="""\
+      The AttrValue enum to use for placeholders.
+      Or a comma separated list, one value for each placeholder.\
+      """)
   parser.add_argument(
       "--toco_compatible",
       type=bool,
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index a39c04676111168baeaf10a43f0b6a9273777697..310776ff1b06a9d210e271b7c31ee6e00903da84 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -128,6 +128,7 @@ class OptimizeForInferenceTest(test.TestCase):
         graph_def, [], [add_name], dtypes.float32.as_datatype_enum)
     self.assertProtoEquals(expected_output, output)
 
+  @test_util.run_deprecated_v1
   def testFoldBatchNorms(self):
     with self.cached_session() as sess:
       inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
@@ -171,9 +172,10 @@ class OptimizeForInferenceTest(test.TestCase):
     for node in optimized_graph_def.node:
       self.assertNotEqual("BatchNormWithGlobalNormalization", node.op)
 
+  @test_util.run_deprecated_v1
   def testFoldFusedBatchNorms(self):
     for data_format, use_gpu in [("NHWC", False), ("NCHW", True)]:
-      with self.test_session(use_gpu=use_gpu) as sess:
+      with self.cached_session(use_gpu=use_gpu) as sess:
         inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
         input_op = constant_op.constant(
             np.array(inputs),
@@ -212,10 +214,9 @@ class OptimizeForInferenceTest(test.TestCase):
       optimized_graph_def = optimize_for_inference_lib.fold_batch_norms(
           original_graph_def)
 
-      with self.test_session(use_gpu=use_gpu) as sess:
-        _ = importer.import_graph_def(
-            optimized_graph_def, input_map={}, name="optimized")
-        optimized_result = sess.run(["optimized/output:0"])
+      _ = importer.import_graph_def(
+          optimized_graph_def, input_map={}, name="optimized")
+      optimized_result = sess.run(["optimized/output:0"])
 
       self.assertAllClose(
           original_result, optimized_result, rtol=1e-04, atol=1e-06)
@@ -223,6 +224,7 @@ class OptimizeForInferenceTest(test.TestCase):
       for node in optimized_graph_def.node:
         self.assertNotEqual("FusedBatchNorm", node.op)
 
+  @test_util.run_deprecated_v1
   def testFuseResizePadAndConv(self):
     with self.cached_session() as sess:
       inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
@@ -254,6 +256,7 @@ class OptimizeForInferenceTest(test.TestCase):
       self.assertNotEqual("MirrorPad", node.op)
       self.assertNotEqual("ResizeBilinear", node.op)
 
+  @test_util.run_deprecated_v1
   def testFuseResizeAndConv(self):
     with self.cached_session() as sess:
       inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
@@ -283,6 +286,7 @@ class OptimizeForInferenceTest(test.TestCase):
       self.assertNotEqual("MirrorPad", node.op)
 
 
+  @test_util.run_deprecated_v1
   def testFusePadAndConv(self):
     with self.cached_session() as sess:
       inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 2fcb0fa02929eddf17b53518abd146b6b94f869e..c4c3756c0407f2ed6a6a411b6778b2431428eea6 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -45,7 +45,7 @@ from tensorflow.python.saved_model import loader
 from tensorflow.python.tools import saved_model_utils
 
 # Set of ops to blacklist.
-_OP_BLACKLIST = set(['WriteFile', 'ReadFile'])
+_OP_BLACKLIST = set(['WriteFile', 'ReadFile', 'PrintV2'])
 
 
 def _show_tag_sets(saved_model_dir):
diff --git a/tensorflow/python/tools/strip_unused_test.py b/tensorflow/python/tools/strip_unused_test.py
index 7cf0c3e3ed9b5748b263913566150eff8acf857a..e906ff94ba8c0ad5ebb5014f244b0ef128d23a7a 100644
--- a/tensorflow/python/tools/strip_unused_test.py
+++ b/tensorflow/python/tools/strip_unused_test.py
@@ -50,7 +50,7 @@ class StripUnusedTest(test_util.TensorFlowTestCase):
           wanted_input_node, 2.0, name="output_node")
       math_ops.add(output_node, 2.0, name="later_node")
       sess = session.Session()
-      output = sess.run(output_node)
+      output = self.evaluate(output_node)
       self.assertNear(-4.0, output, 0.00001)
       graph_io.write_graph(sess.graph, self.get_temp_dir(), input_graph_name)
 
@@ -113,7 +113,7 @@ class StripUnusedTest(test_util.TensorFlowTestCase):
           input_node1, input_node2, name="output_node")
       math_ops.add(output_node, 2.0, name="later_node")
       sess = session.Session()
-      output = sess.run(output_node)
+      output = self.evaluate(output_node)
       self.assertNear(6.0, output, 0.00001)
       graph_io.write_graph(sess.graph, self.get_temp_dir(), input_graph_name)
 
diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py
index 95eca76496992f7ac66643a4c94d7e9e812cecf8..dd210160004760f1fe8cde945c6a728a530ebf33 100644
--- a/tensorflow/python/training/adadelta.py
+++ b/tensorflow/python/training/adadelta.py
@@ -25,7 +25,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.AdadeltaOptimizer")
+@tf_export(v1=["train.AdadeltaOptimizer"])
 class AdadeltaOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adadelta algorithm.
 
diff --git a/tensorflow/python/training/adadelta_test.py b/tensorflow/python/training/adadelta_test.py
index a14ac895ac096e351cad91aa8a53ca0026b18c9d..0e5af5a92224a5c3a54cc45eef11cf728c78945c 100644
--- a/tensorflow/python/training/adadelta_test.py
+++ b/tensorflow/python/training/adadelta_test.py
@@ -166,6 +166,7 @@ class AdadeltaOptimizerTest(test.TestCase):
     with context.eager_mode():
       self.doTestBasic(use_resource=True, use_callable_params=True)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -177,12 +178,11 @@ class AdadeltaOptimizerTest(test.TestCase):
             1.0, 1.0, 1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval())
+        self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py
index cc0da26b2792bde3dec5fdbbafdd069eef1d81d7..10c043bae175d1da60f54a31caff37329641d86b 100644
--- a/tensorflow/python/training/adagrad.py
+++ b/tensorflow/python/training/adagrad.py
@@ -28,7 +28,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.AdagradOptimizer")
+@tf_export(v1=["train.AdagradOptimizer"])
 class AdagradOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adagrad algorithm.
 
diff --git a/tensorflow/python/training/adagrad_da.py b/tensorflow/python/training/adagrad_da.py
index 5ba403554f570d9df33a5d525a40de2eb0d11138..e23b7134b3bb609b4a217c68e2bc30caee7b0f8a 100644
--- a/tensorflow/python/training/adagrad_da.py
+++ b/tensorflow/python/training/adagrad_da.py
@@ -26,7 +26,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.AdagradDAOptimizer")
+@tf_export(v1=["train.AdagradDAOptimizer"])
 class AdagradDAOptimizer(optimizer.Optimizer):
   """Adagrad Dual Averaging algorithm for sparse linear models.
 
diff --git a/tensorflow/python/training/adagrad_da_test.py b/tensorflow/python/training/adagrad_da_test.py
index 00801be3b4da878619cac753707b088352afe803..aacfe6faf4eff2b334197d86794380a273bcbb5e 100644
--- a/tensorflow/python/training/adagrad_da_test.py
+++ b/tensorflow/python/training/adagrad_da_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -54,14 +55,14 @@ class AdagradDAOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllClose([0.0, 0.0], v0_val)
         self.assertAllClose([0.0, 0.0], v1_val)
 
         # Run a step of AdagradDA
         update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         # Let g to be gradient accumulator, gg to be gradient squared
         # accumulator, T be the global step, lr is the learning rate, and k the
         # initial gradient squared accumulator value.
@@ -73,12 +74,15 @@ class AdagradDAOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType(
             np.array([-0.094821, -0.189358]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testAdagradDAWithoutRegularizationBasic1(self):
     self.doTestAdagradDAwithoutRegularizationBasic1()
 
+  @test_util.run_deprecated_v1
   def testResourceAdagradDAWithoutRegularizationBasic1(self):
     self.doTestAdagradDAwithoutRegularizationBasic1(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -92,13 +96,15 @@ class AdagradDAOptimizerTest(test.TestCase):
             1.0, global_step).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-1, -1]], var0.eval(), rtol=0.01)
+        self.assertAllCloseAccordingToType([[-1, -1]],
+                                           self.evaluate(var0),
+                                           rtol=0.01)
 
+  @test_util.run_deprecated_v1
   def testAdagradDAwithoutRegularizationBasic2(self):
     for dtype in [dtypes.float64, dtypes.float32]:
       with self.cached_session() as sess:
@@ -118,19 +124,20 @@ class AdagradDAOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run a step of AdagradDA
         update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.904534, -1.603567]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.094821, -0.189358]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testAdagradDAWithL1(self):
     for dtype in [dtypes.float64, dtypes.float32]:
       with self.cached_session() as sess:
@@ -150,19 +157,20 @@ class AdagradDAOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run a step of AdagradDA
         update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.895489, -1.59555]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.085339, -0.17989]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testAdagradDAWithL1_L2(self):
     for dtype in [dtypes.float64, dtypes.float32]:
       with self.cached_session() as sess:
@@ -182,14 +190,14 @@ class AdagradDAOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run a step of AdagradDA
         update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.046907, -0.093659]), v0_val)
         self.assertAllCloseAccordingToType(
diff --git a/tensorflow/python/training/adagrad_test.py b/tensorflow/python/training/adagrad_test.py
index 7caf01f64d5e1cf7a4084444721aff9c55a9fb0b..1e2d29b337338985fb8ac27ab11d65667d22ee21 100644
--- a/tensorflow/python/training/adagrad_test.py
+++ b/tensorflow/python/training/adagrad_test.py
@@ -96,6 +96,7 @@ class AdagradOptimizerTest(test.TestCase):
   def testBasicLocked(self):
     self.doTestBasic(use_locking=True)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -107,14 +108,16 @@ class AdagradOptimizerTest(test.TestCase):
         sgd_op = adagrad.AdagradOptimizer(1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType(
-            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]],
+                                           self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0, 1], [3, 4]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -128,17 +131,20 @@ class AdagradOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 3 steps of adagrad
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -159,17 +165,18 @@ class AdagradOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([[1.0], [2.0]], var0.eval())
-        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        self.assertAllClose([[1.0], [2.0]], self.evaluate(var0))
+        self.assertAllClose([[3.0], [4.0]], self.evaluate(var1))
         # Run 3 step of sgd
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([[-1.6026098728179932], [2.0]]), var0.eval())
+            np.array([[-1.6026098728179932], [2.0]]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([[3.0], [3.715679168701172]]), var1.eval())
+            np.array([[3.0], [3.715679168701172]]), self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -193,13 +200,14 @@ class AdagradOptimizerTest(test.TestCase):
             [(grad_aggregated, aggregated_update_var)])
         variables.global_variables_initializer().run()
         self.assertAllClose(aggregated_update_var.eval(),
-                            repeated_index_update_var.eval())
+                            self.evaluate(repeated_index_update_var))
         for _ in range(3):
           repeated_update.run()
           aggregated_update.run()
           self.assertAllClose(aggregated_update_var.eval(),
-                              repeated_index_update_var.eval())
+                              self.evaluate(repeated_index_update_var))
 
+  @test_util.run_deprecated_v1
   def testSparseRepeatedIndicesResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -217,13 +225,14 @@ class AdagradOptimizerTest(test.TestCase):
             2.0).minimize(loss_aggregated)
         variables.global_variables_initializer().run()
         self.assertAllCloseAccordingToType(
-            var_repeated.eval(), var_aggregated.eval())
+            self.evaluate(var_repeated), self.evaluate(var_aggregated))
         for _ in range(3):
           update_op_repeated.run()
           update_op_aggregated.run()
           self.assertAllCloseAccordingToType(
-              var_repeated.eval(), var_aggregated.eval())
+              self.evaluate(var_repeated), self.evaluate(var_aggregated))
 
+  @test_util.run_deprecated_v1
   def testSparseStability(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -253,13 +262,14 @@ class AdagradOptimizerTest(test.TestCase):
           init.run()
           ada_update.run()
           self.assertAllCloseAccordingToType(
-              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), self.evaluate(slot0))
           self.assertAllCloseAccordingToType(
               np.array([[
                   0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
                   -0.01029443
-              ]]), var0.eval())
+              ]]), self.evaluate(var0))
 
+  @test_util.run_deprecated_v1
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -282,18 +292,21 @@ class AdagradOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values.
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Mix the first and the second adagrad for 3 steps.
         ada_update1.run()
         ada_update2.run()
         ada_update1.run()
         # Validate updated params (the same as with only 1 Adagrad).
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1))
 
+  @test_util.run_v1_only("b/120545219")
   def testDynamicShapeVariable_Ok(self):
     with self.cached_session():
       v = variable_scope.get_variable("v", initializer=constant_op.constant(1.),
@@ -302,6 +315,7 @@ class AdagradOptimizerTest(test.TestCase):
       # Creating optimizer should cause no exception.
       adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
 
+  @test_util.run_v1_only("b/120545219")
   def testDynamicShapeVariableWithCallableInit(self):
     var0 = variable_scope.get_variable("var0",
                                        initializer=constant_op.constant(1.),
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 704ad6d3fe8a03b74012d260a54c64da67a1b0a3..0c701f47122caf7ae561ddfa84b98925226930e0 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -29,7 +29,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.AdamOptimizer")
+@tf_export(v1=["train.AdamOptimizer"])
 class AdamOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adam algorithm.
 
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index 48db6e37335f006f7ac8becd7bfdd5a0a553cbcb..b0bae275773cf05b4e6233706b60f60ca13c9ac0 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -83,33 +83,37 @@ class AdamOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run()
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparse(self):
     self.doTestSparse(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceSparse(self):
     self.doTestSparse(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testSparseDevicePlacement(self):
     for index_dtype in [dtypes.int32, dtypes.int64]:
-      with self.test_session(force_gpu=test.is_gpu_available()):
+      with self.cached_session(force_gpu=test.is_gpu_available()):
         # If a GPU is available, tests that all optimizer ops can be placed on
         # it (i.e. they have GPU kernels).
         var = variables.Variable([[1.0], [2.0]])
@@ -120,6 +124,7 @@ class AdamOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
         minimize_op.run()
 
+  @test_util.run_deprecated_v1
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -143,12 +148,12 @@ class AdamOptimizerTest(test.TestCase):
             [(grad_aggregated, aggregated_update_var)])
         variables.global_variables_initializer().run()
         self.assertAllClose(aggregated_update_var.eval(),
-                            repeated_index_update_var.eval())
+                            self.evaluate(repeated_index_update_var))
         for _ in range(3):
           repeated_update.run()
           aggregated_update.run()
           self.assertAllClose(aggregated_update_var.eval(),
-                              repeated_index_update_var.eval())
+                              self.evaluate(repeated_index_update_var))
 
   def doTestBasic(self, use_resource=False, use_callable_params=False):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
@@ -235,6 +240,7 @@ class AdamOptimizerTest(test.TestCase):
     with context.eager_mode():
       self.doTestBasic(use_resource=True, use_callable_params=True)
 
+  @test_util.run_deprecated_v1
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -254,24 +260,26 @@ class AdamOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run()
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -294,13 +302,14 @@ class AdamOptimizerTest(test.TestCase):
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 3 steps of intertwined Adam1 and Adam2.
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           if t % 2 == 0:
             update1.run()
           else:
@@ -310,8 +319,8 @@ class AdamOptimizerTest(test.TestCase):
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testTwoSessions(self):
     optimizer = adam.AdamOptimizer()
diff --git a/tensorflow/python/training/basic_loops.py b/tensorflow/python/training/basic_loops.py
index 7af821c81928e67e0f258bc064d582a4186995c1..68fcb97a1c32f00ee059d78f1198d63218192763 100644
--- a/tensorflow/python/training/basic_loops.py
+++ b/tensorflow/python/training/basic_loops.py
@@ -21,7 +21,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.basic_train_loop")
+@tf_export(v1=["train.basic_train_loop"])
 def basic_train_loop(supervisor, train_step_fn, args=None,
                      kwargs=None, master=""):
   """Basic loop to train a model.
diff --git a/tensorflow/python/training/basic_loops_test.py b/tensorflow/python/training/basic_loops_test.py
index 5f5718e64a6c356e9fd4207c6a71a5b2628e3cb9..511a8334d56e60308c25927f47e3485d49b75dc6 100644
--- a/tensorflow/python/training/basic_loops_test.py
+++ b/tensorflow/python/training/basic_loops_test.py
@@ -23,6 +23,7 @@ import shutil
 
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.training import basic_loops
 from tensorflow.python.training import supervisor
@@ -37,6 +38,7 @@ def _test_dir(test_name):
 
 class BasicTrainLoopTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicTrainLoop(self):
     logdir = _test_dir("basic_train_loop")
     sv = supervisor.Supervisor(logdir=logdir)
@@ -55,6 +57,7 @@ class BasicTrainLoopTest(test.TestCase):
           sv, train_fn, args=(sv, "y"), kwargs={"a": "A"})
       self.assertEqual(3, num_calls[0])
 
+  @test_util.run_deprecated_v1
   def testBasicTrainLoopExceptionAborts(self):
     logdir = _test_dir("basic_train_loop_exception_aborts")
     sv = supervisor.Supervisor(logdir=logdir)
@@ -71,6 +74,7 @@ class BasicTrainLoopTest(test.TestCase):
       with self.assertRaisesRegexp(RuntimeError, "Failed"):
         basic_loops.basic_train_loop(sv, train_fn)
 
+  @test_util.run_deprecated_v1
   def testBasicTrainLoopRetryOnAborted(self):
     logdir = _test_dir("basic_train_loop_exception_aborts")
     sv = supervisor.Supervisor(logdir=logdir)
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 1efabcd854d7f72c51e39dcf1f5ce65b0168cbcc..86718ab45fc539d6c7d90878860ca510cda31e47 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -83,7 +83,7 @@ class _HookTimer(object):
     raise NotImplementedError
 
 
-@tf_export("train.SecondOrStepTimer")
+@tf_export(v1=["train.SecondOrStepTimer"])
 class SecondOrStepTimer(_HookTimer):
   """Timer that triggers at most once every N seconds or once every N steps.
   """
@@ -163,7 +163,7 @@ class NeverTriggerTimer(_HookTimer):
     return None
 
 
-@tf_export("train.LoggingTensorHook")
+@tf_export(v1=["train.LoggingTensorHook"])
 class LoggingTensorHook(session_run_hook.SessionRunHook):
   """Prints the given tensors every N local steps, every N seconds, or at end.
 
@@ -373,7 +373,7 @@ class _MultiStepStopAtStepHook(session_run_hook.SessionRunHook):
       self._update_steps_per_run_variable(global_step, run_context.session)
 
 
-@tf_export("train.StopAtStepHook")
+@tf_export(v1=["train.StopAtStepHook"])
 class StopAtStepHook(session_run_hook.SessionRunHook):
   """Hook that requests stop at a specified step."""
 
@@ -429,7 +429,7 @@ class StopAtStepHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
-@tf_export("train.CheckpointSaverListener")
+@tf_export(v1=["train.CheckpointSaverListener"])
 class CheckpointSaverListener(object):
   """Interface for listeners that take action before or after checkpoint save.
 
@@ -495,7 +495,7 @@ class CheckpointSaverListener(object):
     pass
 
 
-@tf_export("train.CheckpointSaverHook")
+@tf_export(v1=["train.CheckpointSaverHook"])
 class CheckpointSaverHook(session_run_hook.SessionRunHook):
   """Saves checkpoints every N steps or seconds."""
 
@@ -634,7 +634,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     return savers[0]
 
 
-@tf_export("train.StepCounterHook")
+@tf_export(v1=["train.StepCounterHook"])
 class StepCounterHook(session_run_hook.SessionRunHook):
   """Hook that counts steps per second."""
 
@@ -718,14 +718,14 @@ class StepCounterHook(session_run_hook.SessionRunHook):
     self._last_global_step = stale_global_step
 
 
-@tf_export("train.NanLossDuringTrainingError")
+@tf_export(v1=["train.NanLossDuringTrainingError"])
 class NanLossDuringTrainingError(RuntimeError):
 
   def __str__(self):
     return "NaN loss during training."
 
 
-@tf_export("train.NanTensorHook")
+@tf_export(v1=["train.NanTensorHook"])
 class NanTensorHook(session_run_hook.SessionRunHook):
   """Monitors the loss tensor and stops training if loss is NaN.
 
@@ -757,7 +757,7 @@ class NanTensorHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
-@tf_export("train.SummarySaverHook")
+@tf_export(v1=["train.SummarySaverHook"])
 class SummarySaverHook(session_run_hook.SessionRunHook):
   """Saves summaries every N steps."""
 
@@ -866,7 +866,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook):
     return summary_op
 
 
-@tf_export("train.GlobalStepWaiterHook")
+@tf_export(v1=["train.GlobalStepWaiterHook"])
 class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
   """Delays execution until global step reaches `wait_until_step`.
 
@@ -914,7 +914,7 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
       time.sleep(0.5)
 
 
-@tf_export("train.FinalOpsHook")
+@tf_export(v1=["train.FinalOpsHook"])
 class FinalOpsHook(session_run_hook.SessionRunHook):
   """A hook which evaluates `Tensors` at the end of a session."""
 
@@ -958,7 +958,7 @@ class FinalOpsHook(session_run_hook.SessionRunHook):
         raise e
 
 
-@tf_export("train.FeedFnHook")
+@tf_export(v1=["train.FeedFnHook"])
 class FeedFnHook(session_run_hook.SessionRunHook):
   """Runs `feed_fn` and sets the `feed_dict` accordingly."""
 
@@ -976,7 +976,7 @@ class FeedFnHook(session_run_hook.SessionRunHook):
         fetches=None, feed_dict=self.feed_fn())
 
 
-@tf_export("train.ProfilerHook")
+@tf_export(v1=["train.ProfilerHook"])
 class ProfilerHook(session_run_hook.SessionRunHook):
   """Captures CPU/GPU profiling information every N steps or seconds.
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 2d469634e0ec99d71e244eb85c8f493759c79738..1af27626ba764b0bf4a2787e492983a72c1491e9 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 import os.path
 import shutil
 import tempfile
-import threading
 import time
 
 from tensorflow.contrib.framework.python.framework import checkpoint_utils
@@ -35,6 +34,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
@@ -52,6 +52,11 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 
 
+# Provide a realistic start time for unit tests where we need to mock out
+# calls to time.time().
+MOCK_START_TIME = 1484695987.209386
+
+
 class MockCheckpointSaverListener(
     basic_session_run_hooks.CheckpointSaverListener):
 
@@ -87,15 +92,19 @@ class MockCheckpointSaverListener(
 
 class SecondOrStepTimerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_raise_in_both_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.SecondOrStepTimer(every_secs=2.0, every_steps=10)
 
+  @test_util.run_deprecated_v1
   def test_raise_in_none_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.SecondOrStepTimer()
 
-  def test_every_secs(self):
+  @test.mock.patch.object(time, 'time')
+  def test_every_secs(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     timer = basic_session_run_hooks.SecondOrStepTimer(every_secs=1.0)
     self.assertTrue(timer.should_trigger_for_step(1))
 
@@ -103,7 +112,7 @@ class SecondOrStepTimerTest(test.TestCase):
     self.assertFalse(timer.should_trigger_for_step(1))
     self.assertFalse(timer.should_trigger_for_step(2))
 
-    time.sleep(1.0)
+    mock_time.return_value += 1.0
     self.assertFalse(timer.should_trigger_for_step(1))
     self.assertTrue(timer.should_trigger_for_step(2))
 
@@ -243,7 +252,7 @@ class LoggingTensorHookTest(test.TestCase):
           tensors=[t.name], at_end=True)
       hook.begin()
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       self.logged_message = ''
       for _ in range(3):
         mon_sess.run(train_op)
@@ -261,7 +270,7 @@ class LoggingTensorHookTest(test.TestCase):
         tensors=[t.name], every_n_iter=10, at_end=at_end)
     hook.begin()
     mon_sess = monitored_session._HookedSession(sess, [hook])
-    sess.run(variables_lib.global_variables_initializer())
+    self.evaluate(variables_lib.global_variables_initializer())
     mon_sess.run(train_op)
     self.assertRegexpMatches(str(self.logged_message), t.name)
     for _ in range(3):
@@ -308,13 +317,13 @@ class LoggingTensorHookTest(test.TestCase):
           tensors={'foo': t}, every_n_iter=1)
       hook.begin()
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess.run(train_op)
       self.assertRegexpMatches(str(self.logged_message), 'foo')
       # in first run, elapsed time is None.
       self.assertEqual(str(self.logged_message).find('sec'), -1)
 
-  def _validate_print_every_n_secs(self, sess, at_end):
+  def _validate_print_every_n_secs(self, sess, at_end, mock_time):
     t = constant_op.constant(42.0, name='foo')
     train_op = constant_op.constant(3)
 
@@ -322,7 +331,7 @@ class LoggingTensorHookTest(test.TestCase):
         tensors=[t.name], every_n_secs=1.0, at_end=at_end)
     hook.begin()
     mon_sess = monitored_session._HookedSession(sess, [hook])
-    sess.run(variables_lib.global_variables_initializer())
+    self.evaluate(variables_lib.global_variables_initializer())
 
     mon_sess.run(train_op)
     self.assertRegexpMatches(str(self.logged_message), t.name)
@@ -331,7 +340,7 @@ class LoggingTensorHookTest(test.TestCase):
     self.logged_message = ''
     mon_sess.run(train_op)
     self.assertEqual(str(self.logged_message).find(t.name), -1)
-    time.sleep(1.0)
+    mock_time.return_value += 1.0
 
     self.logged_message = ''
     mon_sess.run(train_op)
@@ -345,17 +354,21 @@ class LoggingTensorHookTest(test.TestCase):
       # assertNotRegexpMatches is not supported by python 3.1 and later
       self.assertEqual(str(self.logged_message).find(t.name), -1)
 
-  def test_print_every_n_secs(self):
+  @test.mock.patch.object(time, 'time')
+  def test_print_every_n_secs(self, mock_time):
     with ops.Graph().as_default(), session_lib.Session() as sess:
-      self._validate_print_every_n_secs(sess, at_end=False)
+      mock_time.return_value = MOCK_START_TIME
+      self._validate_print_every_n_secs(sess, at_end=False, mock_time=mock_time)
       # Verify proper reset.
-      self._validate_print_every_n_secs(sess, at_end=False)
+      self._validate_print_every_n_secs(sess, at_end=False, mock_time=mock_time)
 
-  def test_print_every_n_secs_and_end(self):
+  @test.mock.patch.object(time, 'time')
+  def test_print_every_n_secs_and_end(self, mock_time):
     with ops.Graph().as_default(), session_lib.Session() as sess:
-      self._validate_print_every_n_secs(sess, at_end=True)
+      mock_time.return_value = MOCK_START_TIME
+      self._validate_print_every_n_secs(sess, at_end=True, mock_time=mock_time)
       # Verify proper reset.
-      self._validate_print_every_n_secs(sess, at_end=True)
+      self._validate_print_every_n_secs(sess, at_end=True, mock_time=mock_time)
 
   def test_print_formatter(self):
     with ops.Graph().as_default(), session_lib.Session() as sess:
@@ -366,7 +379,7 @@ class LoggingTensorHookTest(test.TestCase):
           formatter=lambda items: 'qqq=%s' % items[t.name])
       hook.begin()
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess.run(train_op)
       self.assertEqual(self.logged_message[0], 'qqq=42.0')
 
@@ -403,11 +416,13 @@ class CheckpointSaverHookTest(test.TestCase):
       basic_session_run_hooks.CheckpointSaverHook(
           self.model_dir, saver=self.scaffold.saver, scaffold=self.scaffold)
 
+  @test_util.run_deprecated_v1
   def test_raise_in_both_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.CheckpointSaverHook(
           self.model_dir, save_secs=10, save_steps=20)
 
+  @test_util.run_deprecated_v1
   def test_raise_in_none_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.CheckpointSaverHook(self.model_dir)
@@ -562,11 +577,8 @@ class CheckpointSaverHookTest(test.TestCase):
 
   @test.mock.patch.object(time, 'time')
   def test_save_secs_saves_periodically(self, mock_time):
-    # Let's have a realistic start time
-    current_time = 1484695987.209386
-
     with self.graph.as_default():
-      mock_time.return_value = current_time
+      mock_time.return_value = MOCK_START_TIME
       hook = basic_session_run_hooks.CheckpointSaverHook(
           self.model_dir, save_secs=2, scaffold=self.scaffold)
       hook.begin()
@@ -576,10 +588,10 @@ class CheckpointSaverHookTest(test.TestCase):
         sess.run(self.scaffold.init_op)
         mon_sess = monitored_session._HookedSession(sess, [hook])
 
-        mock_time.return_value = current_time
+        mock_time.return_value = MOCK_START_TIME
         mon_sess.run(self.train_op)  # Saved.
 
-        mock_time.return_value = current_time + 0.5
+        mock_time.return_value = MOCK_START_TIME + 0.5
         mon_sess.run(self.train_op)  # Not saved.
 
         self.assertEqual(1,
@@ -587,13 +599,13 @@ class CheckpointSaverHookTest(test.TestCase):
                                                         self.global_step.name))
 
         # Simulate 2.5 seconds of sleep.
-        mock_time.return_value = current_time + 2.5
+        mock_time.return_value = MOCK_START_TIME + 2.5
         mon_sess.run(self.train_op)  # Saved.
 
-        mock_time.return_value = current_time + 2.6
+        mock_time.return_value = MOCK_START_TIME + 2.6
         mon_sess.run(self.train_op)  # Not saved.
 
-        mock_time.return_value = current_time + 2.7
+        mock_time.return_value = MOCK_START_TIME + 2.7
         mon_sess.run(self.train_op)  # Not saved.
 
         self.assertEqual(3,
@@ -601,7 +613,7 @@ class CheckpointSaverHookTest(test.TestCase):
                                                         self.global_step.name))
 
         # Simulate 7.5 more seconds of sleep (10 seconds from start.
-        mock_time.return_value = current_time + 10
+        mock_time.return_value = MOCK_START_TIME + 10
         mon_sess.run(self.train_op)  # Saved.
         self.assertEqual(6,
                          checkpoint_utils.load_variable(self.model_dir,
@@ -609,11 +621,8 @@ class CheckpointSaverHookTest(test.TestCase):
 
   @test.mock.patch.object(time, 'time')
   def test_save_secs_calls_listeners_periodically(self, mock_time):
-    # Let's have a realistic start time
-    current_time = 1484695987.209386
-
     with self.graph.as_default():
-      mock_time.return_value = current_time
+      mock_time.return_value = MOCK_START_TIME
       listener = MockCheckpointSaverListener()
       hook = basic_session_run_hooks.CheckpointSaverHook(
           self.model_dir,
@@ -626,28 +635,28 @@ class CheckpointSaverHookTest(test.TestCase):
         sess.run(self.scaffold.init_op)
         mon_sess = monitored_session._HookedSession(sess, [hook])
 
-        mock_time.return_value = current_time + 0.5
+        mock_time.return_value = MOCK_START_TIME + 0.5
         mon_sess.run(self.train_op)  # hook runs here
 
-        mock_time.return_value = current_time + 0.5
+        mock_time.return_value = MOCK_START_TIME + 0.5
         mon_sess.run(self.train_op)
 
-        mock_time.return_value = current_time + 3.0
+        mock_time.return_value = MOCK_START_TIME + 3.0
         mon_sess.run(self.train_op)  # hook runs here
 
-        mock_time.return_value = current_time + 3.5
+        mock_time.return_value = MOCK_START_TIME + 3.5
         mon_sess.run(self.train_op)
 
-        mock_time.return_value = current_time + 4.0
+        mock_time.return_value = MOCK_START_TIME + 4.0
         mon_sess.run(self.train_op)
 
-        mock_time.return_value = current_time + 6.5
+        mock_time.return_value = MOCK_START_TIME + 6.5
         mon_sess.run(self.train_op)  # hook runs here
 
-        mock_time.return_value = current_time + 7.0
+        mock_time.return_value = MOCK_START_TIME + 7.0
         mon_sess.run(self.train_op)  # hook won't run here, so it does at end
 
-        mock_time.return_value = current_time + 7.5
+        mock_time.return_value = MOCK_START_TIME + 7.5
         hook.end(sess)  # hook runs here
       self.assertEqual({
           'begin': 1,
@@ -913,7 +922,9 @@ class StepCounterHookTest(test.TestCase):
   def tearDown(self):
     shutil.rmtree(self.log_dir, ignore_errors=True)
 
-  def test_step_counter_every_n_steps(self):
+  @test.mock.patch.object(time, 'time')
+  def test_step_counter_every_n_steps(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       variables.get_or_create_global_step()
       train_op = training_util._increment_global_step(1)
@@ -921,11 +932,11 @@ class StepCounterHookTest(test.TestCase):
       hook = basic_session_run_hooks.StepCounterHook(
           summary_writer=summary_writer, every_n_steps=10)
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       with test.mock.patch.object(tf_logging, 'warning') as mock_log:
         for _ in range(30):
-          time.sleep(0.01)
+          mock_time.return_value += 0.01
           mon_sess.run(train_op)
         # logging.warning should not be called.
         self.assertIsNone(mock_log.call_args)
@@ -941,7 +952,9 @@ class StepCounterHookTest(test.TestCase):
         self.assertEqual('global_step/sec', summary_value.tag)
         self.assertGreater(summary_value.simple_value, 0)
 
-  def test_step_counter_every_n_secs(self):
+  @test.mock.patch.object(time, 'time')
+  def test_step_counter_every_n_secs(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       variables.get_or_create_global_step()
       train_op = training_util._increment_global_step(1)
@@ -950,12 +963,12 @@ class StepCounterHookTest(test.TestCase):
           summary_writer=summary_writer, every_n_steps=None, every_n_secs=0.1)
 
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       mon_sess.run(train_op)
-      time.sleep(0.2)
+      mock_time.return_value += 0.2
       mon_sess.run(train_op)
-      time.sleep(0.2)
+      mock_time.return_value += 0.2
       mon_sess.run(train_op)
       hook.end(sess)
 
@@ -987,7 +1000,7 @@ class StepCounterHookTest(test.TestCase):
           summary_writer=summary_writer, every_n_steps=1, every_n_secs=None)
 
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       mon_sess.run(train_op)
       mon_sess.run(train_op)
@@ -1007,7 +1020,7 @@ class StepCounterHookTest(test.TestCase):
     with ops.Graph().as_default(), session_lib.Session() as sess:
       variables.get_or_create_global_step()
       train_op = training_util._increment_global_step(0)  # keep same.
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       hook = basic_session_run_hooks.StepCounterHook(
           every_n_steps=1, every_n_secs=None)
       hook.begin()
@@ -1034,16 +1047,18 @@ class StepCounterHookTest(test.TestCase):
         summary_writer=self.summary_writer, every_n_steps=every_n_steps)
     self.hook._set_steps_per_run(steps_per_run)
     self.hook.begin()
-    sess.run(variables_lib.global_variables_initializer())
+    self.evaluate(variables_lib.global_variables_initializer())
     self.mon_sess = monitored_session._HookedSession(sess, [self.hook])
 
-  def test_steps_per_run_less_than_every_n_steps(self):
+  @test.mock.patch.object(time, 'time')
+  def test_steps_per_run_less_than_every_n_steps(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       self._setup_steps_per_run_test(10, 5, g, sess)
 
       # Logs at 15, 25
       for _ in range(5):
-        time.sleep(0.01)
+        mock_time.return_value += 0.01
         self.mon_sess.run(self.train_op)
 
       self.hook.end(sess)
@@ -1058,13 +1073,15 @@ class StepCounterHookTest(test.TestCase):
         self.assertEqual('global_step/sec', summary_value.tag)
         self.assertGreater(summary_value.simple_value, 0)
 
-  def test_steps_per_run_equal_every_n_steps(self):
+  @test.mock.patch.object(time, 'time')
+  def test_steps_per_run_equal_every_n_steps(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       self._setup_steps_per_run_test(5, 5, g, sess)
 
       # Logs at 10, 15, 20, 25
       for _ in range(5):
-        time.sleep(0.01)
+        mock_time.return_value += 0.01
         self.mon_sess.run(self.train_op)
 
       self.hook.end(sess)
@@ -1080,13 +1097,15 @@ class StepCounterHookTest(test.TestCase):
         self.assertEqual('global_step/sec', summary_value.tag)
         self.assertGreater(summary_value.simple_value, 0)
 
-  def test_steps_per_run_greater_than_every_n_steps(self):
+  @test.mock.patch.object(time, 'time')
+  def test_steps_per_run_greater_than_every_n_steps(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       self._setup_steps_per_run_test(5, 10, g, sess)
 
       # Logs at 20, 30, 40, 50
       for _ in range(5):
-        time.sleep(0.01)
+        mock_time.return_value += 0.01
         self.mon_sess.run(self.train_op)
 
       self.hook.end(sess)
@@ -1103,6 +1122,7 @@ class StepCounterHookTest(test.TestCase):
         self.assertGreater(summary_value.simple_value, 0)
 
 
+@test_util.run_v1_only('b/120545219')
 class SummarySaverHookTest(test.TestCase):
 
   def setUp(self):
@@ -1147,7 +1167,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(30):
         mon_sess.run(self.train_op)
@@ -1179,7 +1199,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(10):
         mon_sess.run(self.train_op)
@@ -1199,7 +1219,9 @@ class SummarySaverHookTest(test.TestCase):
             },
         })
 
-  def test_save_secs_saving_once_every_step(self):
+  @test.mock.patch.object(time, 'time')
+  def test_save_secs_saving_once_every_step(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     hook = basic_session_run_hooks.SummarySaverHook(
         save_secs=0.5,
         summary_writer=self.summary_writer,
@@ -1207,11 +1229,11 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(4):
         mon_sess.run(self.train_op)
-        time.sleep(0.5)
+        mock_time.return_value += 0.5
       hook.end(sess)
 
     self.summary_writer.assert_summaries(
@@ -1242,7 +1264,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(8):
         mon_sess.run(self.train_op)
@@ -1279,27 +1301,43 @@ class GlobalStepWaiterHookTest(test.TestCase):
             session_run_hook.SessionRunContext(
                 original_args=None, session=sess))
 
-  def test_wait_for_step(self):
+  @test.mock.patch.object(time, 'sleep')
+  def test_wait_for_step(self, mock_sleep):
     with ops.Graph().as_default():
       gstep = variables.get_or_create_global_step()
       hook = basic_session_run_hooks.GlobalStepWaiterHook(wait_until_step=1000)
       hook.begin()
+
       with session_lib.Session() as sess:
-        sess.run(variables_lib.global_variables_initializer())
-        waiter = threading.Thread(
-            target=hook.before_run,
-            args=(session_run_hook.SessionRunContext(
-                original_args=None, session=sess),))
-        waiter.daemon = True
-        waiter.start()
-        time.sleep(1.0)
-        self.assertTrue(waiter.is_alive())
-        sess.run(state_ops.assign(gstep, 500))
-        time.sleep(1.0)
-        self.assertTrue(waiter.is_alive())
-        sess.run(state_ops.assign(gstep, 1100))
-        time.sleep(1.2)
-        self.assertFalse(waiter.is_alive())
+        # Mock out calls to time.sleep() to update the global step.
+
+        class Context(object):
+          counter = 0
+
+        def mock_sleep_side_effect(seconds):
+          del seconds  # argument is ignored
+          Context.counter += 1
+          if Context.counter == 1:
+            # The first time sleep() is called, we update the global_step from
+            # 0 to 500.
+            sess.run(state_ops.assign(gstep, 500))
+          elif Context.counter == 2:
+            # The second time sleep() is called, we update the global_step from
+            # 500 to 1100.
+            sess.run(state_ops.assign(gstep, 1100))
+          else:
+            raise AssertionError(
+                'Expected before_run() to terminate after the second call to '
+                'time.sleep()')
+
+        mock_sleep.side_effect = mock_sleep_side_effect
+
+        # Run the mocked-out interaction with the hook.
+        self.evaluate(variables_lib.global_variables_initializer())
+        run_context = session_run_hook.SessionRunContext(
+            original_args=None, session=sess)
+        hook.before_run(run_context)
+        self.assertEqual(Context.counter, 2)
 
 
 class FinalOpsHookTest(test.TestCase):
@@ -1333,7 +1371,7 @@ class FinalOpsHookTest(test.TestCase):
   def test_final_ops_triggers_out_of_range_error(self):
     with ops.Graph().as_default():
       dataset = dataset_ops.Dataset.range(1)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       read_ops = iterator.get_next()
       final_ops = read_ops
 
@@ -1366,6 +1404,7 @@ class FinalOpsHookTest(test.TestCase):
                              hook.final_ops_values.tolist())
 
 
+@test_util.run_v1_only('b/120545219')
 class ResourceSummarySaverHookTest(test.TestCase):
 
   def setUp(self):
@@ -1390,7 +1429,7 @@ class ResourceSummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(30):
         mon_sess.run(self.train_op)
@@ -1446,10 +1485,12 @@ class ProfilerHookTest(test.TestCase):
   def _count_timeline_files(self):
     return len(gfile.Glob(self.filepattern))
 
+  @test_util.run_deprecated_v1
   def test_raise_in_both_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.ProfilerHook(save_secs=10, save_steps=20)
 
+  @test_util.run_deprecated_v1
   def test_raise_in_none_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.ProfilerHook(save_secs=None, save_steps=None)
@@ -1465,29 +1506,27 @@ class ProfilerHookTest(test.TestCase):
   @test.mock.patch.object(time, 'time')
   def test_save_secs_saves_periodically(self, mock_time):
     # Pick a fixed start time.
-    current_time = 1484863632.
-
     with self.graph.as_default():
-      mock_time.return_value = current_time
+      mock_time.return_value = MOCK_START_TIME
       hook = basic_session_run_hooks.ProfilerHook(
           save_secs=2, output_dir=self.output_dir)
       with monitored_session.SingularMonitoredSession(hooks=[hook]) as sess:
         sess.run(self.train_op)  # Not saved.
         self.assertEqual(0, self._count_timeline_files())
         # Simulate 2.5 seconds of sleep.
-        mock_time.return_value = current_time + 2.5
+        mock_time.return_value = MOCK_START_TIME + 2.5
         sess.run(self.train_op)  # Saved.
         self.assertEqual(1, self._count_timeline_files())
 
         # Pretend some small amount of time has passed.
-        mock_time.return_value = current_time + 2.6
+        mock_time.return_value = MOCK_START_TIME + 2.6
         sess.run(self.train_op)  # Not saved.
         # Edge test just before we should save the timeline.
-        mock_time.return_value = current_time + 4.4
+        mock_time.return_value = MOCK_START_TIME + 4.4
         sess.run(self.train_op)  # Not saved.
         self.assertEqual(1, self._count_timeline_files())
 
-        mock_time.return_value = current_time + 4.5
+        mock_time.return_value = MOCK_START_TIME + 4.5
         sess.run(self.train_op)  # Saved.
         self.assertEqual(2, self._count_timeline_files())
 
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index 38910fb246d6dc149520f41aa161635497fd5cca..f745ab4824ac364b51758e6c3fb60a5679d210fb 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -36,6 +36,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -55,7 +56,11 @@ def _GetCheckpointFilename(save_dir, latest_filename):
   return os.path.join(save_dir, latest_filename)
 
 
-@tf_export("train.generate_checkpoint_state_proto")
+@deprecation.deprecated(
+    date=None,
+    instructions=("Use tf.train.CheckpointManager to manage checkpoints rather "
+                  "than editing the Checkpoint proto manually."))
+@tf_export(v1=["train.generate_checkpoint_state_proto"])
 def generate_checkpoint_state_proto(save_dir,
                                     model_checkpoint_path,
                                     all_model_checkpoint_paths=None,
@@ -121,7 +126,11 @@ def generate_checkpoint_state_proto(save_dir,
   return coord_checkpoint_proto
 
 
-@tf_export("train.update_checkpoint_state")
+@deprecation.deprecated(
+    date=None,
+    instructions=("Use tf.train.CheckpointManager to manage checkpoints rather "
+                  "than manually editing the Checkpoint proto."))
+@tf_export(v1=["train.update_checkpoint_state"])
 def update_checkpoint_state(save_dir,
                             model_checkpoint_path,
                             all_model_checkpoint_paths=None,
@@ -344,7 +353,10 @@ def latest_checkpoint(checkpoint_dir, latest_filename=None):
   return None
 
 
-@tf_export("train.checkpoint_exists")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use standard file APIs to check for files with this prefix.")
+@tf_export(v1=["train.checkpoint_exists"])
 def checkpoint_exists(checkpoint_prefix):
   """Checks whether a V1 or V2 checkpoint exists with the specified prefix.
 
@@ -369,7 +381,10 @@ def checkpoint_exists(checkpoint_prefix):
     return False
 
 
-@tf_export("train.get_checkpoint_mtimes")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use standard file utilities to get mtimes.")
+@tf_export(v1=["train.get_checkpoint_mtimes"])
 def get_checkpoint_mtimes(checkpoint_prefixes):
   """Returns the mtimes (modification timestamps) of the checkpoints.
 
@@ -408,7 +423,10 @@ def get_checkpoint_mtimes(checkpoint_prefixes):
   return mtimes
 
 
-@tf_export("train.remove_checkpoint")
+@deprecation.deprecated(
+    date=None,
+    instructions="Use standard file APIs to delete files with this prefix.")
+@tf_export(v1=["train.remove_checkpoint"])
 def remove_checkpoint(checkpoint_prefix,
                       checkpoint_format_version=saver_pb2.SaverDef.V2,
                       meta_graph_suffix="meta"):
@@ -458,6 +476,7 @@ def meta_graph_filename(checkpoint_filename, meta_graph_suffix="meta"):
 
 
 # TODO(allenl): Allow tf.keras.Model instances in the constructor directly?
+@tf_export("train.CheckpointManager")
 class CheckpointManager(object):
   """Deletes old checkpoints.
 
@@ -634,13 +653,10 @@ class CheckpointManager(object):
     """
     return self._checkpoint_prefix
 
-  def save(self, session=None, checkpoint_number=None):
+  def save(self, checkpoint_number=None):
     """Creates a new checkpoint and manages it.
 
     Args:
-      session: The session to evaluate variables in. Ignored when executing
-        eagerly. If not provided when graph building, the default session is
-        used.
       checkpoint_number: An optional integer, or an integer-dtype `Variable` or
         `Tensor`, used to number the checkpoint. If `None` (default),
         checkpoints are numbered using `checkpoint.save_counter`. Even if
@@ -657,9 +673,9 @@ class CheckpointManager(object):
     if context.executing_eagerly():
       save_counter = self._checkpoint.save_counter
       save_counter.assign_add(1)
+      session = None
     else:
-      if session is None:
-        session = ops.get_default_session()
+      session = ops.get_default_session()
 
       def _initializing_creator(next_creator, **kwargs):
         """Initialize the save counter if it has been newly created."""
diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py
index 3a061bcb35c1c1a6ef31645c8e0ef892e9d9aa62..8606ec4a206ffbce85cf4071934deeb5a545b055 100644
--- a/tensorflow/python/training/checkpoint_management_test.py
+++ b/tensorflow/python/training/checkpoint_management_test.py
@@ -62,6 +62,7 @@ class LatestCheckpointWithRelativePaths(test.TestCase):
     finally:
       shutil.rmtree(tempdir)
 
+  @test_util.run_deprecated_v1
   def testNameCollision(self):
     # Make sure we have a clean directory to work in.
     with self.tempDir() as tempdir:
@@ -99,6 +100,7 @@ class LatestCheckpointWithRelativePaths(test.TestCase):
           self.assertIsNotNone(
               checkpoint_management.latest_checkpoint(traindir))
 
+  @test_util.run_deprecated_v1
   def testRelativePath(self):
     # Make sure we have a clean directory to work in.
     with self.tempDir() as tempdir:
@@ -123,9 +125,9 @@ class LatestCheckpointWithRelativePaths(test.TestCase):
           # Record a short training history.
           variables.global_variables_initializer().run()
           save.save(sess, filepath, global_step=0)
-          inc.eval()
+          self.evaluate(inc)
           save.save(sess, filepath, global_step=1)
-          inc.eval()
+          self.evaluate(inc)
           save.save(sess, filepath, global_step=2)
 
         with self.cached_session() as sess:
@@ -270,6 +272,7 @@ class SaverUtilsTest(test.TestCase):
   def tearDown(self):
     gfile.DeleteRecursively(self._base_dir)
 
+  @test_util.run_deprecated_v1
   def testCheckpointExists(self):
     for sharded in (False, True):
       for version in (saver_pb2.SaverDef.V2, saver_pb2.SaverDef.V1):
@@ -288,6 +291,7 @@ class SaverUtilsTest(test.TestCase):
           ckpt_prefix = checkpoint_management.latest_checkpoint(self._base_dir)
           self.assertTrue(checkpoint_management.checkpoint_exists(ckpt_prefix))
 
+  @test_util.run_deprecated_v1
   def testGetCheckpointMtimes(self):
     prefixes = []
     for version in (saver_pb2.SaverDef.V2, saver_pb2.SaverDef.V1):
@@ -302,6 +306,7 @@ class SaverUtilsTest(test.TestCase):
     self.assertEqual(2, len(mtimes))
     self.assertTrue(mtimes[1] >= mtimes[0])
 
+  @test_util.run_deprecated_v1
   def testRemoveCheckpoint(self):
     for sharded in (False, True):
       for version in (saver_pb2.SaverDef.V2, saver_pb2.SaverDef.V1):
diff --git a/tensorflow/python/training/checkpoint_ops_test.py b/tensorflow/python/training/checkpoint_ops_test.py
index dde84314977f6ffc8c93e90f6ad76e13c2f02cb0..c48154713929b91050e070051add9fee7c428805 100644
--- a/tensorflow/python/training/checkpoint_ops_test.py
+++ b/tensorflow/python/training/checkpoint_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import partitioned_variables
@@ -33,6 +34,7 @@ from tensorflow.python.training import checkpoint_ops
 from tensorflow.python.training import saver as saver_lib
 
 
+@test_util.run_v1_only('b/120545219')
 class LoadAndRemapWrappersTest(test.TestCase):
   """Tests for the functionality of the Python wrappers."""
 
@@ -47,7 +49,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
       with variable_scope.variable_scope('some_scope'):
         variable_scope.get_variable(name='embeddings', shape=[5, 16],
                                     initializer=initializer)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver = saver_lib.Saver()
       saver.save(sess, checkpoint_prefix, global_step=5)
     self.checkpoint_file = '{}-5'.format(checkpoint_prefix)
@@ -115,7 +117,8 @@ class LoadAndRemapWrappersTest(test.TestCase):
         axis=1)
 
     with self.cached_session():
-      self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval())
+      self.assertAllClose(expected_remapped_matrix,
+                          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_output_layer_weight_initializer_linear(self):
     """Tests for the output layer initializer in the linear multi-class case."""
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index e6118177fd1004b0f6f807666302289de6b7d2f6..74b46179e75423b530191cce5a52034879712eaa 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import six
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -29,8 +30,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribution_strategy_context
-from tensorflow.python.training import saver
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -101,9 +101,12 @@ def list_variables(ckpt_dir_or_file):
   return result
 
 
-@tf_export("train.init_from_checkpoint")
+@tf_export(v1=["train.init_from_checkpoint"])
 def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
-  """Initializes current variables with tensors loaded from given checkpoint.
+  """Replaces `tf.Variable` initializers so they load from a checkpoint file.
+
+  Values are not loaded immediately, but when the initializer is run
+  (typically by running a `tf.global_variables_initializer` op).
 
   Note: This overrides default initialization ops of specified variables and
   redefines dtype.
@@ -180,11 +183,11 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
     tf.errors.OpError: If missing checkpoints or tensors in checkpoints.
     ValueError: If missing variables in current graph.
   """
-  if distribution_strategy_context.get_cross_tower_context():
+  if distribution_strategy_context.get_cross_replica_context():
     _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
   else:
-    distribution_strategy_context.get_tower_context().merge_call(
-        _init_from_checkpoint, ckpt_dir_or_file, assignment_map)
+    distribution_strategy_context.get_replica_context().merge_call(
+        _init_from_checkpoint, args=(ckpt_dir_or_file, assignment_map))
 
 
 def _init_from_checkpoint(_, ckpt_dir_or_file, assignment_map):
@@ -308,20 +311,20 @@ def _set_checkpoint_initializer(variable,
     restore_op = io_ops.restore_v2(
         ckpt_file, [tensor_name], [slice_spec], [base_type], name=name)[0]
 
-    names_to_saveables = saver.BaseSaverBuilder.OpListToDict([variable])
+    names_to_saveables = saveable_object_util.op_list_to_dict([variable])
     saveable_objects = []
     for name, op in names_to_saveables.items():
-      for s in saver.BaseSaverBuilder.SaveableObjectsForOp(op, name):
+      for s in saveable_object_util.saveable_objects_for_op(op, name):
         saveable_objects.append(s)
 
     assert len(saveable_objects) == 1  # Should be only one variable.
-    init_op = saveable_objects[0].restore([restore_op], restored_shapes=None)
+  init_op = saveable_objects[0].restore([restore_op], restored_shapes=None)
 
-    # pylint:disable=protected-access
-    variable._initializer_op = init_op
-    restore_op.set_shape(variable.shape)
-    variable._initial_value = restore_op
-    # pylint:enable=protected-access
+  # pylint:disable=protected-access
+  variable._initializer_op = init_op
+  restore_op.set_shape(variable.shape)
+  variable._initial_value = restore_op
+  # pylint:enable=protected-access
 
 
 def _set_variable_or_list_initializer(variable_or_list, ckpt_file,
diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py
index 61dcbdb2b8f92256c97b0bd62c1944516646fa03..a3e58de4a31bca7ccb20606bd43ebdb732137f4b 100644
--- a/tensorflow/python/training/checkpoint_utils_test.py
+++ b/tensorflow/python/training/checkpoint_utils_test.py
@@ -207,9 +207,6 @@ class CheckpointsTest(test.TestCase):
 
       checkpoint_utils.init_from_checkpoint(checkpoint_dir,
                                             {"useful_scope/": "useful_scope/"})
-      # initializer runs on the same task but always on CPU.
-      self.assertEqual(my4._initializer_op.op.inputs[1].device,
-                       "/job:ps/device:CPU:0")
 
   def testInitFromRootCheckpoint(self):
     checkpoint_dir = self.get_temp_dir()
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/checkpointable/BUILD
index d26932c1aae7831f8e266d04777db53baa13330f..26a0ac35b763e4b8a2c9143d88a2a97259715262 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/checkpointable/BUILD
@@ -25,9 +25,9 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:io_ops_gen",
         "//tensorflow/python:platform",
-        "//tensorflow/python:saveable_object",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/saving:saveable_object",
     ],
 )
 
@@ -114,7 +114,6 @@ py_library(
         "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops_gen",
         "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python:saveable_object",
         "//tensorflow/python:saver",
         "//tensorflow/python:session",
         "//tensorflow/python:tensor_shape",
@@ -122,6 +121,10 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/training/saving:functional_saver",
+        "//tensorflow/python/training/saving:saveable_object",
+        "//tensorflow/python/training/saving:saveable_object_util",
     ],
 )
 
@@ -152,7 +155,7 @@ py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/checkpointable/base.py
index 095a90ddd4f831e5af63f8eb7e231eacb5a91975..3cd1c6f9c8b0b5b5acf517e5f5801db66d0045b2 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/checkpointable/base.py
@@ -25,7 +25,6 @@ import weakref
 
 import six
 
-from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -34,7 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import saveable_object
+from tensorflow.python.training.saving import saveable_object
 from tensorflow.python.util import nest
 from tensorflow.python.util import serialization
 from tensorflow.python.util import tf_decorator
@@ -374,41 +373,10 @@ class _CheckpointPosition(object):
       eagerly.
     """
     (restore_ops,
-     named_saveables,
+     tensor_saveables,
      python_saveables) = self._gather_ops_or_named_saveables()
-
-    # Eagerly run restorations for Python state.
-    reader = pywrap_tensorflow.NewCheckpointReader(
-        self._checkpoint.save_path_string)
-    for saveable in python_saveables:
-      spec_names = [spec.name for spec in saveable.specs]
-      saveable.python_restore(
-          [reader.get_tensor(name) for name in spec_names])
-
-    # If we have new SaveableObjects, extract and cache restore ops.
-    if named_saveables:
-      validated_saveables = (
-          self._checkpoint.builder._ValidateAndSliceInputs(named_saveables))  # pylint: disable=protected-access
-      validated_names = set(saveable.name for saveable in validated_saveables)
-      if set(named_saveables.keys()) != validated_names:
-        raise AssertionError(
-            ("Saveable keys changed when validating. Got back %s, was "
-             "expecting %s") % (named_saveables.keys(), validated_names))
-      all_tensors = self._checkpoint.builder.bulk_restore(
-          filename_tensor=self._checkpoint.save_path_tensor,
-          saveables=validated_saveables, preferred_shard=-1,
-          restore_sequentially=False)
-      saveable_index = 0
-      for saveable in validated_saveables:
-        num_specs = len(saveable.specs)
-        saveable_tensors = all_tensors[
-            saveable_index:saveable_index + num_specs]
-        saveable_index += num_specs
-        restore_op = saveable.restore(saveable_tensors, restored_shapes=None)
-        if not context.executing_eagerly():
-          assert saveable.name not in self._checkpoint.restore_ops_by_name
-          self._checkpoint.restore_ops_by_name[saveable.name] = restore_op
-          restore_ops.append(restore_op)
+    restore_ops.extend(self._checkpoint.restore_saveables(
+        tensor_saveables, python_saveables))
     return restore_ops
 
   @property
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/checkpointable/data_structures.py
index c29e5db0753c0a2d96ebb0ed43e4e78aac629526..817552f32696e34d123d1da5057388c1bd96139c 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/checkpointable/data_structures.py
@@ -111,9 +111,6 @@ class CheckpointableDataStructure(base.CheckpointableBase):
   """Base class for data structures which contain checkpointable objects."""
 
   def __init__(self):
-    # An append-only ordered set
-    self._layers = []
-
     self.trainable = True
     self._extra_variables = []
 
@@ -128,21 +125,30 @@ class CheckpointableDataStructure(base.CheckpointableBase):
           ("Only checkpointable objects (such as Layers or Optimizers) may be "
            "stored in a List object. Got %s, which does not inherit from "
            "CheckpointableBase.") % (value,))
-    if (isinstance(value, CheckpointableDataStructure)
-        or layer_utils.is_layer(value)
-        or layer_utils.has_weights(value)):
-      # Check for object-identity rather than with __eq__ to avoid
-      # de-duplicating empty container types. Automatically generated list
-      # wrappers keep things like "[] == []" true, which means "[] in [[]]" is
-      # also true. This becomes not true once one of the lists is mutated.
-      if not any((layer is value for layer in self._layers)):
-        self._layers.append(value)
-        if hasattr(value, "_use_resource_variables"):
-          # In subclassed models, legacy layers (tf.layers) must always use
-          # resource variables.
-          value._use_resource_variables = True  # pylint: disable=protected-access
+    if hasattr(value, "_use_resource_variables"):
+      # In subclassed models, legacy layers (tf.layers) must always use
+      # resource variables.
+      value._use_resource_variables = True  # pylint: disable=protected-access
     return value
 
+  @property
+  def _values(self):
+    """An iterable/sequence which may contain checkpointable objects."""
+    raise NotImplementedError("Abstract method")
+
+  @property
+  def _layers(self):
+    """All Layers and Layer containers, including empty containers."""
+    # Filter objects on demand so that wrapper objects use values from the thing
+    # they're wrapping if out of sync.
+    collected = []
+    for obj in self._values:
+      if (isinstance(obj, CheckpointableDataStructure)
+          or layer_utils.is_layer(obj)
+          or layer_utils.has_weights(obj)):
+        collected.append(obj)
+    return collected
+
   @property
   def layers(self):
     return layer_utils.filter_empty_layer_containers(self._layers)
@@ -265,6 +271,10 @@ class List(CheckpointableDataStructure, collections.Sequence):
   def _name_element(self, index):
     return "%d" % (index,)
 
+  @property
+  def _values(self):
+    return self
+
   def append(self, value):
     """Add a new checkpointable value."""
     value = self._track_value(value, self._name_element(len(self._storage)))
@@ -479,6 +489,14 @@ class Mapping(CheckpointableDataStructure, collections.Mapping):
   def _make_storage(self, *args, **kwargs):
     return dict(*args, **kwargs)
 
+  @property
+  def _values(self):
+    # Sort items deterministically by key
+    ordered = list(zip(*sorted(self.items(), key=lambda it: it[0])))
+    if ordered:
+      return ordered[1]
+    return []
+
   def _name_element(self, key):
     if not isinstance(key, six.string_types):
       raise TypeError(
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/checkpointable/data_structures_test.py
index 5597c7c7727d5ab6b8bc04fd7c94293a3c19477e..bcec6e01001eec6c164cf4bb17db3d4ed55b0935 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/checkpointable/data_structures_test.py
@@ -24,6 +24,7 @@ import six
 
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
@@ -72,6 +73,7 @@ class HasList(training.Model):
 class ListTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTracking(self):
     model = HasList()
     output = model(array_ops.ones([32, 2]))
@@ -104,6 +106,7 @@ class ListTests(test.TestCase):
     self.assertIn(v, model.trainable_variables)
     self.assertNotIn(v, model.non_trainable_variables)
 
+  @test_util.run_v1_only("b/120545219")
   def testUpdatesForwarded(self):
     with context.graph_mode():
       model = HasList()
@@ -120,6 +123,7 @@ class ListTests(test.TestCase):
       self.assertEqual(0, len(model.updates))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testLossesForwarded(self):
     model = HasList()
     model_input = array_ops.ones([32, 2])
@@ -156,6 +160,23 @@ class ListTests(test.TestCase):
     with self.assertRaises(AttributeError):
       data_structures.List().pop()
 
+  @test_util.run_in_graph_and_eager_modes
+  def testTensorConversion(self):
+
+    class ListToTensor(training.Model):
+
+      def __init__(self):
+        super(ListToTensor, self).__init__()
+        self.l = [1., 2., 3.]
+
+    self.assertAllEqual(
+        [1., 2., 3.],
+        self.evaluate(constant_op.constant(ListToTensor().l)))
+
+    self.assertAllEqual(
+        [1., 2., 3.],
+        self.evaluate(array_ops.pack(ListToTensor().l)))
+
   def testNesting(self):
     with context.graph_mode():
       inner = data_structures.List()
@@ -235,6 +256,13 @@ class ListTests(test.TestCase):
     l.append(1)
     self.assertEqual([1], l_wrapper)
 
+  def testLayerCollectionWithExternalMutation(self):
+    l = []
+    l_wrapper = data_structures._ListWrapper(l)
+    layer = core.Dense(1)
+    l.append(layer)
+    self.assertEqual([layer], l_wrapper.layers)
+
   def testHashing(self):
     has_sequences = set([data_structures.List(),
                          data_structures.List()])
@@ -270,6 +298,7 @@ class HasMapping(training.Model):
 class MappingTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTracking(self):
     model = HasMapping()
     output = model(array_ops.ones([32, 2]))
@@ -306,6 +335,20 @@ class MappingTests(test.TestCase):
     with self.assertRaises(TypeError):
       mapping[1] = data_structures.List()
 
+  def testLayerCollectionWithExternalMutation(self):
+    d = {}
+    root = tracking.Checkpointable()
+    root.wrapper = d
+    self.assertEqual([], root.wrapper.layers)
+    self.assertEqual([], root.wrapper.trainable_weights)
+    layer1 = core.Dense(1)
+    layer2 = core.Dense(1)
+    d["a"] = layer1
+    d["b"] = layer2
+    self.assertEqual([layer1, layer2], root.wrapper.layers)
+    # The layers have still not created variables
+    self.assertEqual([], root.wrapper.trainable_weights)
+
   def testHashing(self):
     has_mappings = set([data_structures.Mapping(),
                         data_structures.Mapping()])
diff --git a/tensorflow/python/training/checkpointable/tracking.py b/tensorflow/python/training/checkpointable/tracking.py
index bd0bed9d46f2e75633e3bf1230eded3708ec1c8b..4e96aee0c51d441c4a32ce68943e27dbf592349c 100644
--- a/tensorflow/python/training/checkpointable/tracking.py
+++ b/tensorflow/python/training/checkpointable/tracking.py
@@ -17,8 +17,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training.checkpointable import base
 from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.util import tf_contextlib
+
+
+# global _RESOURCE_TRACKER_STACK
+_RESOURCE_TRACKER_STACK = []
 
 
 class NotCheckpointable(object):
@@ -70,3 +79,106 @@ class Checkpointable(base.CheckpointableBase):
   def _no_dependency(self, value):
     """Override to allow CheckpointableBase to disable dependency tracking."""
     return data_structures.NoDependency(value)
+
+
+class ResourceTracker(object):
+  """An object that tracks a list of resources."""
+
+  def __init__(self):
+    self._resources = []
+
+  @property
+  def resources(self):
+    return self._resources
+
+  def add_resource(self, resource):
+    self._resources.append(resource)
+
+
+@tf_contextlib.contextmanager
+def resource_tracker_scope(resource_tracker):
+  """A context to manage resource trackers.
+
+  Use this in order to collect up all resources created within a block of code.
+  Example usage:
+
+  ```python
+  resource_tracker = ResourceTracker()
+  with resource_tracker_scope(resource_tracker):
+    resource = TrackableResource()
+
+  assert resource_tracker.resources == [resource]
+
+  Args:
+    resource_tracker: The passed in ResourceTracker object
+
+  Yields:
+    A scope in which the resource_tracker is active.
+  """
+  global _RESOURCE_TRACKER_STACK
+  old = list(_RESOURCE_TRACKER_STACK)
+  _RESOURCE_TRACKER_STACK.append(resource_tracker)
+  try:
+    yield
+  finally:
+    _RESOURCE_TRACKER_STACK = old
+
+
+class TrackableResource(base.CheckpointableBase):
+  """Base class for all resources that need to be tracked."""
+
+  def __init__(self):
+    global _RESOURCE_TRACKER_STACK
+    for resource_tracker in _RESOURCE_TRACKER_STACK:
+      resource_tracker.add_resource(self)
+
+    self._resource_handle = None
+
+  def create_resource(self):
+    """A function that creates a resource handle."""
+    raise NotImplementedError("TrackableResource.create_resource not "
+                              "implemented.")
+
+  def initialize(self):
+    """A function that initializes the resource. Optional."""
+    pass
+
+  @property
+  def resource_handle(self):
+    """Returns the resource handle associated with this Resource."""
+    if self._resource_handle is None:
+      self._resource_handle = self.create_resource()
+    return self._resource_handle
+
+
+class TrackableAsset(base.CheckpointableBase):
+  """Base class for asset files which need to be tracked."""
+
+  def __init__(self, path):
+    """Record the full path to the asset."""
+    # We use a variable here so that @tf.functions do not capture a literal
+    # value. The init_scope prevents functions from capturing `path` in an
+    # initialization graph, since it is transient and should not end up in a
+    # serialized function body. When serialized in a SavedModel, the variable
+    # will be set during the loading process to its location in the assets/
+    # directory.
+    with ops.init_scope():
+      if context.executing_eagerly():
+        self._path = self._no_dependency(
+            resource_variable_ops.ResourceVariable(
+                path, dtype=dtypes.string,
+                name="asset_path"))
+      else:
+        # Adding a variable is too disruptive when v1-style graph building,
+        # since things may get fed and local variable initializers would then
+        # need to be run.
+        self._path = path
+
+  @property
+  def asset_path(self):
+    """Fetch the current asset path."""
+    return self._path
+
+ops.register_tensor_conversion_function(
+    TrackableAsset,
+    lambda asset, **kw: ops.internal_convert_to_tensor(asset.asset_path, **kw))
diff --git a/tensorflow/python/training/checkpointable/tracking_test.py b/tensorflow/python/training/checkpointable/tracking_test.py
index a44c570fb9fe4104e44588c40069ddaa4b97c282..17c5461bc25e5e409cc04d0182603e8406dc7d47 100644
--- a/tensorflow/python/training/checkpointable/tracking_test.py
+++ b/tensorflow/python/training/checkpointable/tracking_test.py
@@ -193,5 +193,62 @@ class InterfaceTests(test.TestCase):
     self.assertAllClose({"k": [numpy.ones([2, 2]), numpy.zeros([3, 3])]},
                         self.evaluate(a.tensors))
 
+
+class _DummyResource(tracking.TrackableResource):
+
+  def __init__(self, handle_name):
+    self._handle_name = handle_name
+    super(_DummyResource, self).__init__()
+
+  def create_resource(self):
+    return self._handle_name
+
+
+class ResourceTrackerTest(test.TestCase):
+
+  def testBasic(self):
+    resource_tracker = tracking.ResourceTracker()
+    with tracking.resource_tracker_scope(resource_tracker):
+      dummy_resource1 = _DummyResource("test1")
+      dummy_resource2 = _DummyResource("test2")
+
+    self.assertEqual(2, len(resource_tracker.resources))
+    self.assertEqual("test1", resource_tracker.resources[0].resource_handle)
+    self.assertEqual("test2", resource_tracker.resources[1].resource_handle)
+
+  def testTwoScopes(self):
+    resource_tracker1 = tracking.ResourceTracker()
+    with tracking.resource_tracker_scope(resource_tracker1):
+      dummy_resource1 = _DummyResource("test1")
+
+    resource_tracker2 = tracking.ResourceTracker()
+    with tracking.resource_tracker_scope(resource_tracker2):
+      dummy_resource2 = _DummyResource("test2")
+
+    self.assertEqual(1, len(resource_tracker1.resources))
+    self.assertEqual("test1", resource_tracker1.resources[0].resource_handle)
+    self.assertEqual(1, len(resource_tracker1.resources))
+    self.assertEqual("test2", resource_tracker2.resources[0].resource_handle)
+
+  def testNestedScopesScopes(self):
+    resource_tracker = tracking.ResourceTracker()
+    with tracking.resource_tracker_scope(resource_tracker):
+      resource_tracker1 = tracking.ResourceTracker()
+      with tracking.resource_tracker_scope(resource_tracker1):
+        dummy_resource1 = _DummyResource("test1")
+
+      resource_tracker2 = tracking.ResourceTracker()
+      with tracking.resource_tracker_scope(resource_tracker2):
+        dummy_resource2 = _DummyResource("test2")
+
+    self.assertEqual(1, len(resource_tracker1.resources))
+    self.assertEqual("test1", resource_tracker1.resources[0].resource_handle)
+    self.assertEqual(1, len(resource_tracker1.resources))
+    self.assertEqual("test2", resource_tracker2.resources[0].resource_handle)
+    self.assertEqual(2, len(resource_tracker.resources))
+    self.assertEqual("test1", resource_tracker.resources[0].resource_handle)
+    self.assertEqual("test2", resource_tracker.resources[1].resource_handle)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index edab6cc6ebb8daee2389fa774f4dbbc172830892..a54f41a54fa1364af417a85e7faa9ee0693fada1 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -26,11 +26,13 @@ from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.ops import init_ops
@@ -38,11 +40,14 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.training import saveable_object as saveable_object_lib
-from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import saver as v1_saver_lib
 from tensorflow.python.training.checkpointable import base
 from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.saving import functional_saver
+from tensorflow.python.training.saving import saveable_object as saveable_object_lib
+from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
@@ -87,7 +92,6 @@ class _CheckpointRestoreCoordinator(object):
         referenced every restore (e.g. for Python state); otherwise they would
         create their own ops every restore.
     """
-    self.builder = saver_lib.BulkSaverBuilder()
     self.object_graph_proto = object_graph_proto
     self.restore_uid = ops.uid()
     # Maps from objects to lists of attributes which were in the checkpoint but
@@ -142,6 +146,57 @@ class _CheckpointRestoreCoordinator(object):
     if self.new_restore_ops_callback:
       self.new_restore_ops_callback(new_ops)  # pylint: disable=not-callable
 
+  def restore_saveables(self, tensor_saveables, python_saveables):
+    """Run or build restore operations for SaveableObjects.
+
+    Args:
+      tensor_saveables: `SaveableObject`s which correspond to Tensors.
+      python_saveables: `PythonStateSaveable`s which correspond to Python
+        values.
+
+    Returns:
+      When graph building, a list of restore operations, either cached or newly
+      created, to restore `tensor_saveables`.
+    """
+    restore_ops = []
+    # Eagerly run restorations for Python state.
+    reader = pywrap_tensorflow.NewCheckpointReader(
+        self.save_path_string)
+    for saveable in python_saveables:
+      spec_names = [spec.name for spec in saveable.specs]
+      saveable.python_restore(
+          [reader.get_tensor(name) for name in spec_names])
+
+    # If we have new SaveableObjects, extract and cache restore ops.
+    if tensor_saveables:
+      validated_saveables = saveable_object_util.validate_and_slice_inputs(
+          tensor_saveables)
+      validated_names = set(saveable.name for saveable in validated_saveables)
+      if set(tensor_saveables.keys()) != validated_names:
+        raise AssertionError(
+            ("Saveable keys changed when validating. Got back %s, was "
+             "expecting %s") % (tensor_saveables.keys(), validated_names))
+      for saveable in validated_saveables:
+        if saveable.device:
+          device = saveable_object_util.set_cpu0(saveable.device)
+        else:
+          device = None
+        with ops.device(device):
+          tensors = []
+          for spec in saveable.specs:
+            tensors.append(
+                io_ops.restore_v2(
+                    self.save_path_tensor,
+                    [spec.name],
+                    [spec.slice_spec],
+                    [spec.dtype])[0])
+          restore_op = saveable.restore(tensors, restored_shapes=None)
+        if not context.executing_eagerly():
+          assert saveable.name not in self.restore_ops_by_name
+          self.restore_ops_by_name[saveable.name] = restore_op
+          restore_ops.append(restore_op)
+    return restore_ops
+
 
 class _NameBasedRestoreCoordinator(object):
   """Keeps the status of a name-based checkpoint restore."""
@@ -181,11 +236,11 @@ class _NameBasedRestoreCoordinator(object):
           continue
       else:
         saveable = saveable_factory
-      names_to_saveables = saver_lib.BaseSaverBuilder.OpListToDict(
+      names_to_saveables = saveable_object_util.op_list_to_dict(
           [saveable],
           convert_variable_to_tensor=False)
       for name, op in names_to_saveables.items():
-        for saveable_object in saver_lib.BaseSaverBuilder.SaveableObjectsForOp(
+        for saveable_object in saveable_object_util.saveable_objects_for_op(
             op=op, name=name):
           yield saveable_object
 
@@ -549,13 +604,11 @@ def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
   return slot_variables
 
 
-def _serialize_checkpointables(
-    checkpointable_objects, node_ids, object_names, slot_variables,
-    saveables_cache):
-  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
-  object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  named_saveables = []
+def _add_attributes_to_object_graph(
+    checkpointable_objects, object_graph_proto, node_ids, object_names,
+    saveables_cache, object_map):
+  """Create SaveableObjects and corresponding SerializedTensor protos."""
+  named_saveable_objects = []
   if saveables_cache is None:
     # No SaveableObject caching. Either we're executing eagerly, or building a
     # static save which is specialized to the current Python state.
@@ -564,17 +617,21 @@ def _serialize_checkpointables(
     # If we are caching SaveableObjects, we need to build up a feed_dict with
     # functions computing volatile Python state to be saved with the checkpoint.
     feed_additions = {}
-  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+  for checkpoint_id, (checkpointable, object_proto) in enumerate(
+      zip(checkpointable_objects, object_graph_proto.nodes)):
     assert node_ids[checkpointable] == checkpoint_id
-    object_proto = object_graph_proto.nodes.add()
-    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
     object_name = object_names[checkpointable]
+    if object_map:
+      object_to_save = object_map.get(checkpointable, checkpointable)
+    else:
+      object_to_save = checkpointable
     if saveables_cache is not None:
-      cached_attributes = saveables_cache.setdefault(checkpointable, {})
+      cached_attributes = saveables_cache.setdefault(object_to_save, {})
     else:
       cached_attributes = None
+
     for name, saveable_factory in (
-        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+        object_to_save._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
       attribute = object_proto.attributes.add()
       attribute.name = name
       attribute.checkpoint_key = "%s/%s/%s" % (
@@ -602,10 +659,10 @@ def _serialize_checkpointables(
           # Figure out the name-based Saver's name for this variable. If it's
           # already a SaveableObject we'd just get the checkpoint key back, so
           # we leave full_name blank.
-          saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+          saver_dict = saveable_object_util.op_list_to_dict(
               [maybe_saveable], convert_variable_to_tensor=False)
           full_name, = saver_dict.keys()
-          saveables = tuple(saver_lib.BaseSaverBuilder.SaveableObjectsForOp(
+          saveables = tuple(saveable_object_util.saveable_objects_for_op(
               op=maybe_saveable, name=attribute.checkpoint_key))
           for saveable in saveables:
             saveable.full_name = full_name
@@ -640,14 +697,55 @@ def _serialize_checkpointables(
                      "value.")
                     % (checkpointable, new_feed_key))
             feed_additions.update(saveable_feed_dict)
-        named_saveables.append(saveable)
+        named_saveable_objects.append(saveable)
 
+  return named_saveable_objects, feed_additions
+
+
+def fill_object_graph_proto(checkpointable_objects,
+                            node_ids,
+                            slot_variables,
+                            object_graph_proto=None):
+  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
+  if object_graph_proto is None:
+    object_graph_proto = (
+        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+    assert node_ids[checkpointable] == checkpoint_id
+    object_proto = object_graph_proto.nodes.add()
+    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
     for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
       child_proto = object_proto.children.add()
       child_proto.node_id = node_ids[child.ref]
       child_proto.local_name = child.name
+  return object_graph_proto
+
 
-  return named_saveables, object_graph_proto, feed_additions
+def _serialize_gathered_objects(
+    checkpointable_objects, path_to_root, saveables_cache, object_map):
+  """Create SaveableObjects and protos for gathered objects."""
+  object_names = _ObjectIdentityDictionary()
+  for obj, path in path_to_root.items():
+    object_names[obj] = _object_prefix_from_path(path)
+  node_ids = _ObjectIdentityDictionary()
+  for node_id, node in enumerate(checkpointable_objects):
+    node_ids[node] = node_id
+  slot_variables = _serialize_slot_variables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names)
+  object_graph_proto = fill_object_graph_proto(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      slot_variables=slot_variables)
+  named_saveable_objects, feed_additions = _add_attributes_to_object_graph(
+      checkpointable_objects=checkpointable_objects,
+      object_graph_proto=object_graph_proto,
+      node_ids=node_ids,
+      object_names=object_names,
+      saveables_cache=saveables_cache,
+      object_map=object_map)
+  return named_saveable_objects, object_graph_proto, feed_additions
 
 
 def _serialize_object_graph(root_checkpointable, saveables_cache):
@@ -678,6 +776,19 @@ def _serialize_object_graph(root_checkpointable, saveables_cache):
   Raises:
     ValueError: If there are invalid characters in an optimizer's slot names.
   """
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  return _serialize_gathered_objects(
+      checkpointable_objects, path_to_root, saveables_cache, object_map=None)
+
+
+def named_saveables(root_checkpointable):
+  """Gather list of all SaveableObjects in the Checkpointable object."""
+  return _serialize_object_graph(root_checkpointable, None)[0]
+
+
+def find_objects(root_checkpointable):
+  """Find and number objects which are dependencies of `root_checkpointable`."""
   checkpointable_objects, path_to_root = (
       _breadth_first_checkpointable_traversal(root_checkpointable))
   object_names = _ObjectIdentityDictionary()
@@ -690,17 +801,7 @@ def _serialize_object_graph(root_checkpointable, saveables_cache):
       checkpointable_objects=checkpointable_objects,
       node_ids=node_ids,
       object_names=object_names)
-  return _serialize_checkpointables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names,
-      slot_variables=slot_variables,
-      saveables_cache=saveables_cache)
-
-
-def named_saveables(root_checkpointable):
-  """Gather list of all SaveableObjects in the Checkpointable object."""
-  return _serialize_object_graph(root_checkpointable, None)[0]
+  return checkpointable_objects, node_ids, slot_variables
 
 
 def list_objects(root_checkpointable):
@@ -717,20 +818,7 @@ def list_objects(root_checkpointable):
   Returns:
     A flat list of objects.
   """
-  # TODO(allenl): Extract out gathering logic so the naming logic doesn't have
-  # to run.
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = _ObjectIdentityDictionary()
-  for obj, path in path_to_root.items():
-    object_names[obj] = _object_prefix_from_path(path)
-  node_ids = _ObjectIdentityDictionary()
-  for node_id, node in enumerate(checkpointable_objects):
-    node_ids[node] = node_id
-  _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
+  checkpointable_objects, _, _ = find_objects(root_checkpointable)
   return checkpointable_objects
 
 
@@ -808,7 +896,7 @@ def capture_dependencies(template):
     """
     def _call_next_creator_renaming_initializer(initializer, **inner_kwargs):
       inner_kwargs.pop("name")  # Ignored; this is the scope-stripped name which
-                                # we don't want to propagate.
+      # we don't want to propagate.
       return next_creator(
           initial_value=initializer,
           name=name,
@@ -969,6 +1057,12 @@ class CheckpointLoadStatus(_LoadStatus):
         raise AssertionError(
             "Object not assigned a value from checkpoint: %s" % (node,))
     for checkpointable_object in list_objects(self._root_checkpointable):
+      # Remove data structures that do not contain any variables from
+      # restoration checks.
+      if (isinstance(checkpointable_object,
+                     data_structures.CheckpointableDataStructure) and
+          not checkpointable_object._checkpoint_dependencies):
+        continue
       self._checkpoint.all_python_objects.add(checkpointable_object)
     unused_python_objects = (
         _ObjectIdentitySet(self._checkpoint.all_python_objects)
@@ -1185,7 +1279,7 @@ class NameBasedSaverStatus(_LoadStatus):
       session = ops.get_default_session()
     with ops.device("/cpu:0"):
       saveables = self._gather_saveable_objects()
-      saver_lib.Saver(saveables).restore(
+      v1_saver_lib.Saver(saveables).restore(
           sess=session, save_path=self._checkpoint.save_path)
 
   def initialize_or_restore(self, session=None):
@@ -1210,18 +1304,6 @@ class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
         fetches=fetches, feed_dict=feed_dict, **kwargs)
 
 
-def _copy_saver_with_new_var_list(old_saver, new_var_list):
-  """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
-  new_saver = saver_lib.Saver(var_list=new_var_list, max_to_keep=None)
-  # TODO(allenl): Move to copying functionality to Saver?
-  # pylint: disable=protected-access
-  new_saver._last_checkpoints = old_saver._last_checkpoints
-  new_saver._checkpoints_to_be_deleted = old_saver._checkpoints_to_be_deleted
-  new_saver._next_checkpoint_time = old_saver._next_checkpoint_time
-  # pylint: enable=protected-access
-  return new_saver
-
-
 class CheckpointableSaver(object):
   """Saves and restores a `Checkpointable` object and its dependencies.
 
@@ -1260,7 +1342,8 @@ class CheckpointableSaver(object):
     # Op caching for save
     self._object_graph_feed_tensor = None
     self._last_save_object_graph = None
-    self._last_save_saver = None
+    self._file_prefix_feed_tensor = None
+    self._cached_save_operation = None
 
     # Op caching for restore, shared between _CheckpointRestoreCoordinators
     self._restore_op_cache = {}
@@ -1306,16 +1389,37 @@ class CheckpointableSaver(object):
             name=base.OBJECT_GRAPH_PROTO_KEY))
     return named_saveable_objects, graph_proto, feed_additions
 
-  def freeze(self):
+  def freeze(self, object_map=None, to_graph=None):
     """Creates a `tf.train.Saver` with the current object graph frozen."""
-    named_saveable_objects, _, _ = self._gather_saveables(
-        object_graph_tensor=None, saveable_object_cache=None)
-    return saver_lib.Saver(
-        var_list=named_saveable_objects, max_to_keep=None)
-
-  def _prepare_save(self,
-                    object_graph_tensor=None,
-                    saveable_object_cache=None):
+    checkpointable_objects, path_to_root = (
+        _breadth_first_checkpointable_traversal(self._root_checkpointable))
+    if to_graph:
+      target_context = to_graph.as_default
+    else:
+      target_context = ops.NullContextmanager
+    with target_context():
+      named_saveable_objects, graph_proto, _ = _serialize_gathered_objects(
+          checkpointable_objects,
+          path_to_root,
+          saveables_cache=None,
+          object_map=object_map)
+      with ops.device("/cpu:0"):
+        object_graph_tensor = constant_op.constant(
+            graph_proto.SerializeToString(), dtype=dtypes.string)
+      named_saveable_objects.append(
+          base.NoRestoreSaveable(
+              tensor=object_graph_tensor,
+              name=base.OBJECT_GRAPH_PROTO_KEY))
+      # TODO(allenl): Swap in a function-based saver here once it can serialize
+      # to a SaverDef.
+      return v1_saver_lib.Saver(
+          var_list=named_saveable_objects, max_to_keep=None)
+
+  def _save_cached_when_graph_building(
+      self,
+      file_prefix,
+      object_graph_tensor=None,
+      saveable_object_cache=None):
     """Create or retrieve save ops.
 
     When graph building, `saveable_object_cache` will typically be non-`None`,
@@ -1324,15 +1428,17 @@ class CheckpointableSaver(object):
     unnecessarily re-creating save ops.
 
     Args:
+      file_prefix: The prefix for saved checkpoint files.
       object_graph_tensor: A `Tensor` to which the current object graph will be
         fed.
       saveable_object_cache: A dictionary; if specified, used to cache
         `SaveableObject`s.
 
     Returns:
-      A two-element tuple with a `tf.train.Saver` and a feed_dict of `Tensor`s
-      to feed when running save ops. The feed dict contains the current object
-      graph and any Python state to be saved in the checkpoint.
+      A two-element tuple with a filename tensor and a feed_dict of tensors to
+      feed when running it (if graph building). The feed dict contains the
+      current object graph and any Python state to be saved in the
+      checkpoint. When executing eagerly only the first argument is meaningful.
     """
     (named_saveable_objects, graph_proto,
      feed_additions) = self._gather_saveables(
@@ -1344,15 +1450,11 @@ class CheckpointableSaver(object):
         # constructors. That means the Saver needs to be copied with a new
         # var_list.
         or context.executing_eagerly()):
-      if self._last_save_object_graph is not None:
-        self._last_save_saver = _copy_saver_with_new_var_list(
-            old_saver=self._last_save_saver,
-            new_var_list=named_saveable_objects)
-      else:
-        self._last_save_saver = saver_lib.Saver(
-            var_list=named_saveable_objects, max_to_keep=None)
+      saver = functional_saver.Saver(named_saveable_objects)
+      with ops.device("/cpu:0"):
+        self._cached_save_operation = saver.save(file_prefix)
       self._last_save_object_graph = graph_proto
-    return self._last_save_saver, feed_additions
+    return self._cached_save_operation, feed_additions
 
   def save(self, file_prefix, checkpoint_number=None, session=None):
     """Save a training checkpoint.
@@ -1376,35 +1478,42 @@ class CheckpointableSaver(object):
     Returns:
       The full path to the checkpoint.
     """
-    feed_additions = {}
+    feed_dict = {}
     graph_building = not context.executing_eagerly()
+    if checkpoint_number:
+      file_prefix = "%s-%d" % (file_prefix, checkpoint_number)
     if graph_building:
       if self._object_graph_feed_tensor is None:
         with ops.device("/cpu:0"):
           self._object_graph_feed_tensor = constant_op.constant(
               "", dtype=dtypes.string)
+          self._file_prefix_feed_tensor = constant_op.constant(
+              "", dtype=dtypes.string)
       object_graph_tensor = self._object_graph_feed_tensor
+      file_prefix_tensor = self._file_prefix_feed_tensor
+      feed_dict[file_prefix_tensor] = file_prefix
     else:
+      with ops.device("/cpu:0"):
+        file_prefix_tensor = constant_op.constant(
+            file_prefix, dtype=dtypes.string)
       object_graph_tensor = None
 
-    saver, new_feed_additions = self._prepare_save(
+    file_io.recursive_create_dir(os.path.dirname(file_prefix))
+    save_path, new_feed_additions = self._save_cached_when_graph_building(
+        file_prefix=file_prefix_tensor,
         object_graph_tensor=object_graph_tensor,
         saveable_object_cache=self._saveable_object_cache)
     if new_feed_additions:
-      feed_additions.update(new_feed_additions)
+      feed_dict.update(new_feed_additions)
     if not graph_building:
       session = None
     elif session is None:
       session = ops.get_default_session()
 
-    with ops.device("/cpu:0"):
-      save_path = saver.save(
-          sess=_SessionWithFeedDictAdditions(
-              session=session, feed_additions=feed_additions),
-          save_path=file_prefix,
-          write_meta_graph=False,
-          write_state=False,
-          global_step=checkpoint_number)
+    if session:
+      save_path = session.run(save_path, feed_dict=feed_dict)
+    else:
+      save_path = save_path.numpy()
     return save_path
 
   def restore(self, save_path):
@@ -1647,7 +1756,8 @@ class Checkpoint(tracking.Checkpointable):
     """
     super(Checkpoint, self).__init__()
     for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      if not isinstance(v, base.CheckpointableBase):
+      if not isinstance(v, (base.CheckpointableBase,
+                            def_function.PolymorphicFunction)):
         raise ValueError(
             ("`Checkpoint` was expecting a checkpointable object (an object "
              "derived from `CheckpointableBase`), got %s. If you believe this "
@@ -1692,9 +1802,9 @@ class Checkpoint(tracking.Checkpointable):
     Returns:
       The full path to the checkpoint (i.e. `file_prefix`).
     """
-    return self._saver.save(
+    return compat.as_str(self._saver.save(
         file_prefix=file_prefix,
-        session=session)
+        session=session))
 
   @property
   def save_counter(self):
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py
index 14b47a194019f63fa128e2d43556f400ead1ba44..3bdab4cb0bf990543a18cab885f540b8d1f78ed8 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -26,7 +26,7 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -41,6 +41,7 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.training import adam
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
@@ -534,8 +535,7 @@ class CheckpointingTests(test.TestCase):
     num_training_steps = 10
     checkpoint_directory = self.get_temp_dir()
     for training_continuation in range(3):
-      with ops.Graph().as_default(), self.test_session(
-          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+      with test_util.device(use_gpu=True):
         model = MyModel()
         optimizer = adam.AdamOptimizer(0.001)
         root = checkpointable_utils.Checkpoint(
@@ -616,13 +616,13 @@ class CheckpointingTests(test.TestCase):
 
   # pylint: disable=cell-var-from-loop
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testWithDefun(self):
     num_training_steps = 2
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     for training_continuation in range(3):
-      with ops.Graph().as_default(), self.test_session(
-          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+      with test_util.device(use_gpu=True):
         model = MyModel()
         # Don't actually train so we can test variable values
         optimizer = adam.AdamOptimizer(0.)
@@ -633,7 +633,7 @@ class CheckpointingTests(test.TestCase):
             checkpoint_directory)
         status = root.restore(save_path=checkpoint_path)
         def train_fn():
-          @function.defun
+          @def_function.function
           def _call_model(x):
             return model(x)
           with backprop.GradientTape() as tape:
@@ -1017,33 +1017,26 @@ class CheckpointingTests(test.TestCase):
   def testRestoreOnAssign(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_graph = ops.Graph()
-    with save_graph.as_default(), self.test_session(save_graph):
-      first = tracking.Checkpointable()
-      first.var1 = variable_scope.get_variable(
-          name="outside_var", initializer=0.)
-      first.var2 = variable_scope.get_variable(
-          name="blah", initializer=0.)
-      self.evaluate(first.var1.assign(4.))
-      self.evaluate(first.var2.assign(8.))
-      save_path = checkpointable_utils.CheckpointableSaver(first).save(
-          checkpoint_prefix)
-    restore_graph = ops.Graph()
-    with restore_graph.as_default(), self.test_session(restore_graph):
-      second = tracking.Checkpointable()
-      second.var2 = variable_scope.get_variable(
-          name="blah", initializer=0.)
-      status = checkpointable_utils.CheckpointableSaver(
-          second).restore(save_path)
-      recreated_var1 = variable_scope.get_variable(
-          name="outside_var", initializer=0.)
-      status.run_restore_ops()
-      self.assertEqual(8., self.evaluate(second.var2))
-      self.evaluate(recreated_var1.assign(-2.))
-      self.assertEqual(-2., self.evaluate(recreated_var1))
-      second.var1 = recreated_var1
-      status.run_restore_ops()
-      self.assertEqual(4., self.evaluate(recreated_var1))
+    first = tracking.Checkpointable()
+    first.var1 = variables.Variable(0., name="outside_var")
+    first.var2 = variables.Variable(0., name="blah")
+    self.evaluate(first.var1.assign(4.))
+    self.evaluate(first.var2.assign(8.))
+    save_path = checkpointable_utils.CheckpointableSaver(first).save(
+        checkpoint_prefix)
+
+    second = tracking.Checkpointable()
+    second.var2 = variables.Variable(0., name="blah")
+    status = checkpointable_utils.CheckpointableSaver(
+        second).restore(save_path)
+    recreated_var1 = variables.Variable(0., name="outside_var")
+    status.run_restore_ops()
+    self.assertEqual(8., self.evaluate(second.var2))
+    self.evaluate(recreated_var1.assign(-2.))
+    self.assertEqual(-2., self.evaluate(recreated_var1))
+    second.var1 = recreated_var1
+    status.run_restore_ops()
+    self.assertEqual(4., self.evaluate(recreated_var1))
 
   def testManySavesGraph(self):
     """Saves after the first should not modify the graph."""
@@ -1248,8 +1241,7 @@ class CheckpointingTests(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     optimizer_only_prefix = os.path.join(checkpoint_directory, "opt")
-    with ops.Graph().as_default(), self.test_session(
-        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+    with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
       root = checkpointable_utils.Checkpoint(
@@ -1276,8 +1268,7 @@ class CheckpointingTests(test.TestCase):
       optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix)
 
     # Restore into a graph with the optimizer
-    with ops.Graph().as_default(), self.test_session(
-        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+    with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
       root = checkpointable_utils.Checkpoint(
@@ -1299,8 +1290,7 @@ class CheckpointingTests(test.TestCase):
         status.assert_consumed()
 
     # Make sure initialization doesn't clobber later restores
-    with ops.Graph().as_default(), self.test_session(
-        graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+    with test_util.device(use_gpu=True):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001, beta1=1.0)
       root = checkpointable_utils.Checkpoint(
@@ -1324,6 +1314,24 @@ class CheckpointingTests(test.TestCase):
       train_fn()
       self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_restore_after_adding_empty_checkpointable_data_structure(self):
+    model = NonLayerCheckpointable()
+    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    checkpoint.restore(None).initialize_or_restore()
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpoint.save(checkpoint_prefix)
+
+    del model, checkpoint
+
+    model = NonLayerCheckpointable()
+    model.dict = {"a": 1}
+    model.list = {"b": 1}
+    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    load_status = checkpoint.restore(save_path)
+    load_status.assert_existing_objects_matched().run_restore_ops()
+
 
 class _ManualScope(tracking.Checkpointable):
 
@@ -1483,7 +1491,7 @@ class CheckpointCompatibilityTests(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     with context.graph_mode():
       save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
+      with save_graph.as_default(), self.session(
           graph=save_graph) as session:
         root = self._initialized_model()
         name_saver = saver_lib.Saver()
@@ -1539,7 +1547,7 @@ class CheckpointCompatibilityTests(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     with context.graph_mode():
       save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
+      with save_graph.as_default(), self.session(
           graph=save_graph) as session:
         root = self._initialized_model()
         save_path = root.save(session=session, file_prefix=checkpoint_prefix)
@@ -1557,7 +1565,7 @@ class CheckpointCompatibilityTests(test.TestCase):
       save_path = root.save(file_prefix=checkpoint_prefix)
     with context.graph_mode():
       save_graph = ops.Graph()
-      with save_graph.as_default(), self.test_session(
+      with save_graph.as_default(), self.session(
           graph=save_graph):
         root = self._initialized_model()
         self._set_sentinels(root)
diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py
index 0ff97d85e37e6167f1200ba56940f4a663c259a2..b7e5c98c78ec91078de03de82ecc70d522e59f2b 100644
--- a/tensorflow/python/training/coordinator.py
+++ b/tensorflow/python/training/coordinator.py
@@ -408,7 +408,7 @@ class Coordinator(object):
 
 
 # Threads for the standard services.
-@tf_export("train.LooperThread")
+@tf_export(v1=["train.LooperThread"])
 class LooperThread(threading.Thread):
   """A thread that runs code repeatedly, optionally on a timer.
 
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index be80c3657158b52d063b5d2b7731f25d184794a0..5874a1ff4152d835263cdc1ad87002b64c026eb8 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -130,7 +130,7 @@ class _ReplicaDeviceChooser(object):
     return worker_device.to_string()
 
 
-@tf_export("train.replica_device_setter")
+@tf_export(v1=["train.replica_device_setter"])
 def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
                           worker_device="/job:worker", merge_devices=True,
                           cluster=None, ps_ops=None, ps_strategy=None):
diff --git a/tensorflow/python/training/device_setter_test.py b/tensorflow/python/training/device_setter_test.py
index 85b75502ab0943013f12a34002e72b71d187bf68..3cff87b326f1de8243a230bf87d64cc6963026d3 100644
--- a/tensorflow/python/training/device_setter_test.py
+++ b/tensorflow/python/training/device_setter_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -33,6 +34,7 @@ class DeviceSetterTest(test.TestCase):
       "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
   })
 
+  @test_util.run_deprecated_v1
   def testCPUOverride(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec)):
@@ -47,12 +49,14 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker/cpu:0", a.device)
 
+  @test_util.run_deprecated_v1
   def testResource(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec)):
       v = resource_variable_ops.ResourceVariable([1, 2])
       self.assertDeviceEqual("/job:ps/task:0", v.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithClusterSpecClass(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec)):
@@ -65,6 +69,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksPinVariableToJob(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec)):
@@ -82,6 +87,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", x.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksUseCpuForPS(self):
     with ops.device(
         device_setter.replica_device_setter(ps_tasks=1, ps_device="/cpu:0")):
@@ -95,6 +101,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:moon/cpu:0", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksNoMerging(self):
     with ops.device(
         device_setter.replica_device_setter(
@@ -109,6 +116,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithClusterSpecDict(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec.as_dict(
@@ -122,6 +130,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithClusterDef(self):
     with ops.device(
         device_setter.replica_device_setter(
@@ -135,6 +144,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithDevice(self):
     cluster_spec = server_lib.ClusterSpec({
         "sun": ["sun0:2222", "sun1:2222", "sun2:2222"],
@@ -155,6 +165,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:moon/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:sun", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithCPUConstraint(self):
     cluster_spec = server_lib.ClusterSpec({
         "sun": ["sun0:2222", "sun1:2222", "sun2:2222"],
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 144b1671702472dab8f2f1f3879d8492c9a16c76..ad27bc8a7025f060b25c47c5391dd8e473c0e466 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -12,1239 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Class DistributionStrategy, TowerContext, and supporting APIs."""
+"""Deprecated, please use ../distribute/distribute_lib.py."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context as eager_context
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses_impl
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribution_strategy_context
-from tensorflow.python.util import deprecation
-from tensorflow.python.util import nest
-
-
-# ------------------------------------------------------------------------------
-# Context tracking whether in a distribution.update() or .update_non_slot()
-# call.
-
-
-_update_device = threading.local()
-
-
-def get_update_device():
-  """Get the current device if in a `DistributionStrategy.update()` call."""
-  try:
-    return _update_device.current
-  except AttributeError:
-    return None
-
-
-class UpdateContext(object):
-  """Context manager when you are in `update()` or `update_non_slot()`."""
-
-  def __init__(self, device):
-    self._device = device
-    self._old_device = None
-
-  def __enter__(self):
-    self._old_device = get_update_device()
-    _update_device.current = self._device
-
-  def __exit__(self, exception_type, exception_value, traceback):
-    del exception_type, exception_value, traceback
-    _update_device.current = self._old_device
-
-
-# ------------------------------------------------------------------------------
-# Public utility functions.
-
-
-def get_loss_reduction():
-  """Reduce `aggregation` corresponding to the last loss reduction."""
-  loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
-  if loss_reduction == losses_impl.Reduction.SUM:
-    return variable_scope.VariableAggregation.SUM
-  return variable_scope.VariableAggregation.MEAN
-
-
-# ------------------------------------------------------------------------------
-# Internal API for validating the current thread mode
-
-
-def _require_cross_tower_context(distribution_strategy):
-  """Verify in cross-tower context for `distribution_strategy`."""
-  context = _get_per_thread_mode()
-  if context.cross_tower_context is distribution_strategy: return
-  # We have an error to report, figure out the right message.
-  if context.distribution_strategy is not distribution_strategy:
-    if (context.distribution_strategy is
-        distribution_strategy_context._get_default_distribution_strategy()):  # pylint: disable=protected-access
-      raise RuntimeError(
-          'Need to be inside "with distribution_strategy.scope()" for %s' %
-          (distribution_strategy,))
-    else:
-      raise RuntimeError(
-          "Mixing different DistributionStrategy objects: %s is not %s" %
-          (context.distribution_strategy, distribution_strategy))
-  assert context.cross_tower_context is None
-  raise RuntimeError("Method requires being in cross-tower context, use "
-                     "get_tower_context().merge_call()")
-
-
-def require_tower_context(tower_ctx):
-  """Verify in `tower_ctx` tower context."""
-  context = _get_per_thread_mode()
-  if context.tower_context is tower_ctx: return
-  # We have an error to report, figure out the right message.
-  if context.tower_context is None:
-    raise RuntimeError("Need to be inside `call_for_each_tower()`")
-  if context.distribution_strategy is tower_ctx.distribution_strategy:
-    # Two different TowerContexts with the same DistributionStrategy.
-    raise RuntimeError("Mismatching tower context.")
-  raise RuntimeError(
-      "Mismatching DistributionStrategy objects: %s is not %s." %
-      (context.distribution_strategy, tower_ctx.distribution_strategy))
-
-
-def _require_distribution_strategy_scope(distribution_strategy):
-  """Verify in a `distribution_strategy.scope()` in this thread."""
-  context = _get_per_thread_mode()
-  if context.distribution_strategy is distribution_strategy: return
-  # We have an error to report, figure out the right message.
-  if (context.distribution_strategy is
-      distribution_strategy_context._get_default_distribution_strategy()):  # pylint: disable=protected-access
-    raise RuntimeError(
-        'Need to be inside "with distribution_strategy.scope()" for %s' %
-        (distribution_strategy,))
-  else:
-    raise RuntimeError(
-        "Mixing different DistributionStrategy objects: %s is not %s" %
-        (context.distribution_strategy, distribution_strategy))
-
-
-# ------------------------------------------------------------------------------
-# Internal context managers used to implement the DistributionStrategy
-# base class
-
-
-class _CurrentDistributionContext(object):
-  """Context manager for setting the `DistributionStrategy` and var creator."""
-
-  def __init__(self,
-               distribution_strategy,
-               var_creator_scope,
-               var_scope=None,
-               default_device=None):
-    self._context = distribution_strategy_context._CrossTowerThreadMode(  # pylint: disable=protected-access
-        distribution_strategy)
-    self._var_creator_scope = var_creator_scope
-    self._var_scope = var_scope
-    if default_device:
-      self._device_scope = ops.device(default_device)
-    else:
-      self._device_scope = None
-
-  def __enter__(self):
-    _push_per_thread_mode(self._context)
-    if self._var_scope:
-      self._var_scope.__enter__()
-    self._var_creator_scope.__enter__()
-    if self._device_scope:
-      self._device_scope.__enter__()
-    return self._context.distribution_strategy
-
-  def __exit__(self, exception_type, exception_value, traceback):
-    if self._device_scope:
-      self._device_scope.__exit__(exception_type, exception_value, traceback)
-    self._var_creator_scope.__exit__(exception_type, exception_value, traceback)
-    if self._var_scope:
-      self._var_scope.__exit__(exception_type, exception_value, traceback)
-    _pop_per_thread_mode()
-
-
-class _SameScopeAgainContext(object):
-  """Trivial context manager when you are already in `scope()`."""
-
-  def __init__(self, distribution_strategy):
-    self._distribution_strategy = distribution_strategy
-
-  def __enter__(self):
-    return self._distribution_strategy
-
-  def __exit__(self, exception_type, exception_value, traceback):
-    del exception_type, exception_value, traceback
-
-
-# ------------------------------------------------------------------------------
-# Base classes for all distribution strategies.
-
-
-class DistributionStrategy(object):
-  """A list of devices with a state & compute distribution policy.
-
-  See [tensorflow/contrib/distribute/README.md](
-  https://www.tensorflow.org/code/tensorflow/contrib/distribute/README.md)
-  for overview and examples.
-
-  The intent is that you can write an algorithm in a stylized way and
-  it will be usable with a variety of different `DistributionStrategy`
-  implementations. Each descendant will implement a different strategy
-  for distributing the algorithm across multiple devices/machines.
-  Furthermore, these changes can be hidden inside the specific layers
-  and other library classes that need special treatment to run in a
-  distributed setting, so that most users' model definition code can
-  run unchanged. The `DistributionStrategy` API works the same way
-  with eager and graph execution.
-
-  First let's introduce a few high-level concepts:
-
-  * _Data parallelism_ is where we run multiple copies of the model
-    on different slices of the input data. This is in contrast to
-    _model parallelism_ where we divide up a single copy of a model
-    across multiple devices.
-    Note: we only support data parallelism for now, but
-    hope to add support for model parallelism in the future.
-  * A _tower_ is one copy of the model, running on one slice of the
-    input data.
-  * _Synchronous_, or more commonly _sync_, training is where the
-    updates from each tower are aggregated together before updating
-    the model variables. This is in contrast to _asynchronous_, or
-    _async_ training, where each tower updates the model variables
-    independently.
-  * Furthermore you might run your computation on multiple devices
-    on one machine (or "host"), or on multiple machines/hosts.
-    If you are running on multiple machines, you might have a
-    single master host that drives computation across all of them,
-    or you might have multiple clients driving the computation
-    asynchronously.
-
-  To distribute an algorithm, we might use some of these ingredients:
-
-  * Parameter servers: These are hosts that hold a single copy of
-    parameters/variables. All towers that want to operate on a variable
-    retrieve it at the beginning of a step and send an update to be
-    applied at the end of the step. Can support either sync or async
-    training.
-  * Mirrored variables: These are variables that are copied to multiple
-    devices, where we keep the copies in sync by applying the same
-    updates to every copy. Normally would only be used with sync training.
-  * Reductions and Allreduce: A _reduction_ is some method of
-    aggregating multiple values into one value, like "sum" or
-    "mean". If doing sync training, we will perform a reduction on the
-    gradients to a parameter from all towers before applying the
-    update. Allreduce is an algorithm for performing a reduction on
-    values from multiple devices and making the result available on
-    all of those devices.
-  * In the future we will have support for TensorFlow's partitioned
-    variables, where a single variable is split across multiple
-    devices.
-
-  We have then a few approaches we want to support:
-
-  * Code written (as if) with no knowledge of class `DistributionStrategy`.
-    This code should work as before, even if some of the layers, etc.
-    used by that code are written to be distribution-aware. This is done
-    by having a default `DistributionStrategy` that gives ordinary behavior,
-    and by default being in a single tower context.
-  * Ordinary model code that you want to run using a specific
-    `DistributionStrategy`. This can be as simple as:
-
-    ```
-    with my_distribution.scope():
-      iterator = my_distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
-      tower_train_ops = my_distribution.call_for_each_tower(
-          tower_fn, iterator.get_next())
-      train_op = tf.group(my_distribution.unwrap(tower_train_ops))
-    ```
-
-    This takes an ordinary `dataset` and `tower_fn` and runs it
-    distributed using a particular `DistributionStrategy` in
-    `my_distribution`. Any variables created in `tower_fn` are created
-    using `my_distribution`'s policy, and library functions called by
-    `tower_fn` can use the `get_tower_context()` API to get enhanced
-    behavior in this case.
-
-    You can also create an initializable iterator instead of a one-shot
-    iterator. In that case, you will need to ensure that you initialize the
-    iterator before calling get_next.
-    ```
-    iterator = my_distribution.distribute_dataset(
-        dataset).make_initializable_iterator())
-    session.run(iterator.initializer)
-    ```
-
-  * If you want to write a distributed algorithm, you may use any of
-    the `DistributionStrategy` APIs inside a
-    `with my_distribution.scope():` block of code.
-
-  Lower-level concepts:
-
-  * Wrapped values: In order to represent values parallel across devices
-    (either towers or the devices associated with a particular value), we
-    wrap them in a "PerDevice" or "Mirrored" object that contains a map
-    from device to values. "PerDevice" is used when the value may be
-    different across devices, and "Mirrored" when the value are the same.
-  * Unwrapping and merging: Consider calling a function `fn` on
-    multiple devices, like `call_for_each_tower(fn, w)` with an
-    argument `w` that is a wrapped value. This means `w` will have a
-    map taking tower device `d0` to `w0`, tower device `d1` to `w1`,
-    etc. `call_for_each_tower()` unwraps `w` before calling `fn`, so
-    it calls `fn(w0)` on `d0`, `fn(w1)` on `d1`, etc.  It then merges
-    the return values from `fn()`, which can possibly result in
-    wrapped values. For example, let's say `fn()` returns a tuple with
-    three components: `(x, a, v0)` from tower 0, `(x, b, v1)` on tower 1,
-    etc. If the first component is the same object `x` from every
-    tower, then the first component of the merged result will also be
-    `x`. If the second component is different (`a`, `b`, ...)  from
-    each tower, then the merged value will have a wrapped map from
-    tower device to the different values. If the third component is
-    the members of a mirrored variable (`v` maps `d0` to `v0`, `d1` to
-    `v1`, etc.), then the merged result will be that mirrored variable
-    (`v`).
-  * Tower context vs. Cross-tower context: _tower context_ is when we
-    are in some function that is being called once for each tower.
-    Otherwise we are in cross-tower context, which is useful for
-    calling `DistributionStrategy` methods which operate across the
-    towers (like `reduce()`). By default you start in a tower context
-    (the default "single tower context") and then some methods can
-    switch you back and forth, as described below.
-  * Worker devices vs. parameter devices: Most tower computations will
-    happen on worker devices. Since we don't yet support model
-    parallelism, there will be one worker device per tower. When using
-    parameter servers (see above), the set of devices holding
-    variables may be different, otherwise the parameter devices might
-    match the worker devices.
-  * Non-slot devices are some subset of the parameter devices where we
-    put all the non-slot variables. We need to ensure that all
-    non-slot variables are allocated on the same device, or mirrored
-    across the same set of devices. If you have some variable you want
-    to colocate all the non-slot variables with, you can use
-    `colocate_vars_with()` to get the remaining non-slot variables on
-    the same device.  Otherwise you can use `non_slot_devices()` to
-    pick a consistent set of devices to pass to both
-    `colocate_vars_with()` and `update_non_slot()`.
-
-  When using a `DistributionStrategy`, we have a new type dimension
-  called _locality_ that says what values are compatible with which
-  APIs:
-
-  * T: different value for each tower (e.g. a PerDevice-wrapped value).
-  * M: value is "mirrored" across towers, i.e. there are copies with the
-    same value on each tower (e.g. a Mirrored-wrapped value).
-  * V(`v`): value is "mirrored" across all the devices which have a
-    copy of variable `v` (also a Mirrored-wrapped value, but over
-    parameter devices instead of worker devices).
-  * N: value is "mirrored" across all the "non-slot" devices
-
-  Rules for methods with respect to locality and single-tower vs.
-  cross-tower context:
-
-  * `with d.scope()`: default single-tower context -> cross-tower context for
-    `d`
-  * `with d.colocate_vars_with(v)`: in tower/cross-tower context, variables
-    will be created with locality V(`v`). That is, if we write
-    `with d.colocate_vars_with(v1): v2 = tf.get_variable(...)`, then
-    `v2` will have locality V(`v1`), i.e. locality V(`v2`) will equal
-    V(`v1`).
-  * `with d.colocate_vars_with(d.non_slot_devices(...))`: in
-    tower/cross-tower context, variables will be created with locality N
-  * `v = tf.get_variable(...)`: in tower/cross-tower context, creates
-    a variable (which by definition will have locality V(`v`), though
-    will match another locality if inside a `colocate_vars_with`
-    scope).
-  * `d.distribute_dataset(dataset).make_one_shot_iterator()`: in cross-tower
-    context, produces an iterator with locality T
-  * `d.broadcast(t)`: in cross-tower context, produces a value with locality M
-  * `d.broadcast(t, v)`: in cross-tower context, produces a value with
-    locality V(`v`)
-  * `d.call_for_each_tower(fn, ...)`: in cross-tower context, runs
-    `fn()` in a tower context (and so may call `get_tower_context()` and
-    use its API, including `merge_call()` to get back to cross-tower
-    context), once for each tower. May use values with locality T or
-    M, and any variable.
-  * `d.reduce(m, t, t)`: in cross-tower context, accepts t with locality T
-    and produces a value with locality M.
-  * `d.reduce(m, t, v)`: in cross-tower context, accepts t with
-    locality T and produces a value with locality V(`v`).
-  * `d.batch_reduce(m, [(t, v)]): see `d.reduce()`
-  * `d.update(v, fn, ...)`: in cross-tower context, runs `fn()` once
-    for each device `v` is copied to, all inputs should have locality
-    V(`v`), output will have locality V(`v`) as well.
-  * `d.update_non_slot(d.non_slot_devices(), fn)`: in cross-tower
-    context, like `d.update()` except with locality N.
-  * `d.read_var(v)`: Gets the (read-only) value of the variable `v` (on
-    the device determined by the current device scope), aggregating
-    across towers for tower-local variables. Frequently, this will be
-    done automatically when using `v` in an expression or fetching it in
-    a cross-tower context, but this function can be used to force that
-    conversion happens at a particular point in time (for example, to
-    add the result of the conversion to a graph collection).
-
-  The standard pattern for updating variables is to:
-
-  1. Wrap your input dataset in `d.distribute_dataset()` and create an iterator.
-  2. Define each tower `d.call_for_each_tower()` up to the point of
-     getting a list of gradient, variable pairs.
-  3. Call `d.reduce(VariableAggregation.SUM, t, v)` or `d.batch_reduce()` to sum
-     the gradients (with locality T) into values with locality V(`v`).
-  4. Call `d.update(v)` for each variable to update its value.
-
-  Steps 3 and 4 are done automatically by class `Optimizer` if you call
-  its `apply_gradients` method in a tower context. Otherwise you can
-  manually call its `_distributed_apply` method in a cross-tower context.
-
-  Another thing you might want to do in the middle of your tower function
-  is an all-reduce of some intermediate value, using `d.reduce()` or
-  `d.batch_reduce()`. You simply provide the same tensor as the input and
-  destination.
-
-  Layers should expect to be called in a tower context, and can use
-  the `get_tower_context()` function to get a `TowerContext` object. The
-  `TowerContext` object has a `merge_call()` method for entering
-  cross-tower context where you can use `reduce()` (or
-  `batch_reduce()`) and then optionally `update()` to update state.
-
-  You may use this API whether or not a `DistributionStrategy` is
-  being used, since there is a default implementation of
-  `TowerContext` and `DistributionStrategy`. Or you can use the
-  `get_tower_context().is_single_tower` property to run different code
-  in the distributed vs. single tower cases.
-  """
-
-  # TODO(josh11b): Raise an exception if variable partitioning requested before
-  #   we add support.
-  # TODO(josh11b): Also `parameter_device_index` property?
-  # TODO(josh11b): `map()`
-  # TODO(josh11b): ClusterSpec/ClusterResolver
-  # TODO(josh11b): Partitioned computations, state; sharding
-  # TODO(josh11b): Model parallelism: "towers" with multiple devices; shuffling
-  # TODO(josh11b): List of towers with their worker and parameter devices
-  #   (where the parameter devices may overlap in the ps case).
-
-  def __init__(self):
-    self._default_device = None
-    # This property is used to determine if we should set drop_remainder=True
-    # when creating Datasets from numpy array inputs.
-    self._require_static_shapes = False
-
-  def scope(self):
-    """Returns a context manager selecting this DistributionStrategy as current.
-
-    Inside a `with distribution_strategy.scope():` code block, this thread
-    will use a variable creator set by `distribution_strategy`, and will
-    enter its "cross-tower context".
-
-    Returns:
-      A context manager.
-    """
-    if distribution_strategy_context.has_distribution_strategy():
-      _require_cross_tower_context(self)
-      return _SameScopeAgainContext(self)
-
-    def creator_with_resource_vars(*args, **kwargs):
-      _require_distribution_strategy_scope(self)
-      kwargs["use_resource"] = True
-      return self._create_variable(*args, **kwargs)
-
-    def disable_partitioned_variables(getter, *args, **kwargs):
-      if kwargs.pop("partitioner", None) is not None:
-        tf_logging.log_first_n(
-            tf_logging.WARN, "Partitioned variables are disabled when using "
-            "DistributionStrategy.", 1)
-      return getter(*args, **kwargs)
-
-    return _CurrentDistributionContext(
-        self, variable_scope.variable_creator_scope(creator_with_resource_vars),
-        variable_scope.variable_scope(
-            variable_scope.get_variable_scope(),
-            custom_getter=disable_partitioned_variables),
-        self._default_device)
-
-  def _create_variable(self, next_creator, *args, **kwargs):
-    # Note: should support "colocate_with" argument.
-    raise NotImplementedError("must be implemented in descendants")
-
-  def read_var(self, v):
-    """Reads the value of a variable.
-
-    Returns the aggregate value of a tower-local variable, or the
-    (read-only) value of any other variable.
-
-    Args:
-      v: A variable allocated within the scope of this `DistributionStrategy`.
-
-    Returns:
-      A tensor representing the value of `v`, aggregated across towers if
-      necessary.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  def colocate_vars_with(self, colocate_with_variable):
-    """Scope that controls which devices variables will be created on.
-
-    No operations should be added to the graph inside this scope, it
-    should only be used when creating variables (some implementations
-    work by changing variable creation, others work by using a
-    tf.colocate_with() scope).
-
-    This may only be used inside `self.scope()`.
-
-    Example usage:
-
-    ```
-    with distribution_strategy.scope():
-      var1 = tf.get_variable(...)
-      with distribution_strategy.colocate_vars_with(v1):
-        # var2 and var3 will be created on the same device(s) as var1
-        var2 = tf.get_variable(...)
-        var3 = tf.get_variable(...)
-
-      def fn(v1, v2, v3):
-        # operates on v1 from var1, v2 from var2, and v3 from var3
-
-      # `fn` runs on every device `v1` is on, `v2` and `v3` will be there too.
-      distribution_strategy.update(v1, fn, v2, v3)
-    ```
-
-    Args:
-      colocate_with_variable: A created in `self.scope()`. Variables created
-        while in the returned context manager will be on the same set of
-        devices as `colocate_with_variable`.
-
-    Returns:
-      A context manager.
-    """
-    def create_colocated_variable(next_creator, *args, **kwargs):
-      _require_distribution_strategy_scope(self)
-      kwargs["use_resource"] = True
-      kwargs["colocate_with"] = colocate_with_variable
-      return next_creator(*args, **kwargs)
-
-    _require_distribution_strategy_scope(self)
-    return variable_scope.variable_creator_scope(create_colocated_variable)
-
-  def _call_dataset_fn(self, dataset_fn):
-    result = dataset_fn()
-    if not isinstance(result, dataset_ops.Dataset):
-      raise ValueError(
-          "dataset_fn() must return a tf.data.Dataset when using a "
-          "DistributionStrategy.")
-    return result
-
-  # TODO(josh11b): `PerDeviceDataset` currently only implements a few methods of
-  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
-  # Extend to implement more functionality of datasets.
-  def distribute_dataset(self, dataset_fn):
-    """Return a `dataset` split across all towers.
-
-    Suitable for providing input to for `call_for_each_tower()` by creating an
-    iterator:
-
-    ```
-    def dataset_fn():
-      return tf.data.Dataset.from_tensors([[1.]]).repeat()
-    with distribution_strategy.scope():
-      distributed_dataset = distribution_strategy.distribute_dataset(dataset_fn)
-      iterator = distributed_dataset.make_one_shot_iterator()
-      tower_results = distribution_strategy.call_for_each_tower(
-          tower_fn, iterator.get_next())
-    ```
-
-    Args:
-      dataset_fn: A function that returns a `tf.data.Dataset`.
-
-    Returns:
-      A `PerDeviceDataset` that will produce data for each tower.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  def broadcast(self, tensor, destinations=None):
-    """Mirror a tensor on one device to all worker devices.
-
-    Args:
-      tensor: A Tensor value to broadcast.
-      destinations: An optional mirrored variable, device string, or
-        list of device strings, specifying the destination devices
-        to copy `tensor` to. Defaults to `self.worker_devices`.
-
-    Returns:
-      A value mirrored to `destinations` devices.
-    """
-    # TODO(josh11b): More docstring
-    _require_cross_tower_context(self)
-    return self._broadcast(tensor, destinations)
-
-  def _broadcast(self, tensor, destinations):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def initialize(self):
-    """Any initialization to be done before running any computations.
-
-    In eager mode, it executes any initialization as a side effect.
-    In graph mode, it creates the initialization ops and returns them.
-
-    For example, TPU initialize_system ops.
-
-    Returns:
-      In eager mode, returns `None`.
-      In graph mode, a list of ops to execute. Empty list if nothing to be done.
-    """
-    if eager_context.executing_eagerly():
-      return
-    else:
-      return []
-
-  def finalize(self):
-    """Any final actions to be done at the end of all computations.
-
-    In eager mode, it executes any finalize actions as a side effect.
-    In graph mode, it creates the finalize ops and returns them.
-
-    For example, TPU shutdown ops.
-
-    Returns:
-      In eager mode, returns `None`.
-      In graph mode, a list of ops to execute. Empty list if nothing to be done.
-    """
-    if eager_context.executing_eagerly():
-      return
-    else:
-      return []
-
-  def run_steps_on_dataset(self, fn, iterator, iterations=1,
-                           initial_loop_values=None):
-    """Run `fn` with input from `iterator` for `iterations` times.
-
-    This method can be used to run a step function for training a number of
-    times using input from a dataset.
-
-    Args:
-      fn: function to run using this distribution strategy. The function must
-        have the following signature: `def fn(context, *inputs)`.
-        `context` is an instance of `MultiStepContext` that will be passed when
-        `fn` is run. `context` can be used to specify the outputs to be returned
-        from `fn` by calling `context.set_last_step_output`. It can also be used
-        to capture non tensor outputs by `context.set_non_tensor_output`.
-        See `MultiStepContext` documentation for more information.
-        `inputs` will have same type/structure as `iterator.get_next()`. If the
-        `iterator.get_next()` returns a tuple say `return x, y` then whose will
-        be unpacked and passed to the `step_fn`; and step_fn signature would
-        look like `def step_fn(context, x, y)`. If the iterator returns a single
-        value say `return x` then the value is passed as is; the step_fn
-        signature would look like `def step_fn(context, x)`.
-        Typically, `fn` will use `call_for_each_tower` method of the strategy
-        to distribute the computation over multiple towers.
-      iterator: Iterator of a dataset that represents the input for `fn`. The
-        caller is responsible for initializing the iterator as needed.
-      iterations: (Optional) Number of iterations that `fn` should be run.
-        Defaults to 1.
-      initial_loop_values: (Optional) Initial values to be passed into the
-        loop that runs `fn`. Defaults to `None`. # TODO(priyag): Remove
-        initial_loop_values argument when we have a mechanism to infer the
-        outputs of `fn`.
-
-    Returns:
-      Returns the `MultiStepContext` object which has the following properties,
-      among other things:
-        - run_op: An op that runs `fn` `iterations` times.
-        - last_step_outputs: A dictionary containing tensors set using
-        `context.set_last_step_output`. Evaluating this returns the value of
-        the tensors after the last iteration.
-        - non_tensor_outputs: A dictionatry containing anything that was set by
-          `fn` by calling `context.set_non_tensor_output`.
-    """
-    _require_cross_tower_context(self)
-    return self._run_steps_on_dataset(fn, iterator, iterations,
-                                      initial_loop_values)
-
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def call_for_each_tower(self, fn, *args, **kwargs):
-    """Run `fn` once per tower.
-
-    `fn` may call `tf.get_tower_context()` to access methods such as
-    `tower_id()` and `merge_call()`.
-
-    `merge_call()` is used to communicate between the towers and
-    re-enter the cross-tower context. All towers pause their execution
-    having encountered a `merge_call()` call. After that the
-    `merge_fn`-function is executed. Its results are then unwrapped and
-    given back to each tower call. After that execution resumes until
-    `fn` is complete or encounters another `merge_call()`.  Example:
-
-    ```python
-    # Called once in "cross-tower" context.
-    def merge_fn(distribution, three_plus_tower_id):
-      # sum the values across towers
-      return sum(distribution.unwrap(three_plus_tower_id))
-
-    # Called once per tower in `distribution`, in a "tower" context.
-    def fn(three):
-      tower_ctx = tf.get_tower_context()
-      v = three + tower_ctx.tower_id
-      # Computes the sum of the `v` values across all towers.
-      s = tower_ctx.merge_call(merge_fn, v)
-      return s + v
-
-    with distribution.scope():
-      # in "cross-tower" context
-      ...
-      merged_results = distribution.call_for_each_tower(fn, 3)
-      # merged_results has the values from every tower execution of `fn`.
-      print(distribution.unwrap(merged_results))  # Prints a list
-    ```
-
-    Args:
-      fn: function to run (will be run once per tower).
-      *args: positional arguments for `fn`
-      **kwargs: keyword arguments for `fn`.
-          `"run_concurrently"`: Boolean indicating whether executions of `fn`
-             can be run concurrently (under eager execution only), defaults to
-             `True`.
-
-    Returns:
-      Merged return value of `fn` across all towers.
-    """
-    _require_cross_tower_context(self)
-    return self._call_for_each_tower(fn, *args, **kwargs)
-
-  def _call_for_each_tower(self, fn, *args, **kwargs):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def reduce(self, aggregation, value, destinations):
-    """Combine (via e.g. sum or mean) values across towers.
-
-    Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`,
-        `tf.VariableAggregation.ONLY_FIRST_TOWER`.
-      value: A per-device value with one value per tower.
-      destinations: A mirrored variable, a per-device tensor, a device string,
-        or list of device strings. The return value will be copied to all
-        destination devices (or all the devices where the `destinations` value
-        resides). To perform an all-reduction, pass `value` to `destinations`.
-
-    Returns:
-      A value mirrored to `destinations`.
-    """
-    # TODO(josh11b): More docstring
-    # TODO(josh11b): Return an unwrapped value if colocate_with is a
-    # single device.
-    _require_cross_tower_context(self)
-    assert aggregation in [
-        variable_scope.VariableAggregation.SUM,
-        variable_scope.VariableAggregation.MEAN,
-        variable_scope.VariableAggregation.ONLY_FIRST_TOWER
-    ]
-    return self._reduce(aggregation, value, destinations)
-
-  def _reduce(self, aggregation, value, destinations):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def batch_reduce(self, aggregation, value_destination_pairs):
-    """Combine multiple `reduce` calls into one for faster execution.
-
-    Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`,
-        `tf.VariableAggregation.ONLY_FIRST_TOWER`.
-      value_destination_pairs: A sequence of (value, destinations)
-        pairs. See `reduce()` for a description.
-
-    Returns:
-      A list of mirrored values, one per pair in `value_destination_pairs`.
-    """
-    # TODO(josh11b): More docstring
-    _require_cross_tower_context(self)
-    assert aggregation in [
-        variable_scope.VariableAggregation.SUM,
-        variable_scope.VariableAggregation.MEAN,
-        variable_scope.VariableAggregation.ONLY_FIRST_TOWER
-    ]
-    return self._batch_reduce(aggregation, value_destination_pairs)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    return [
-        self.reduce(aggregation, t, destinations=v)
-        for t, v in value_destination_pairs
-    ]
-
-  def update(self, var, fn, *args, **kwargs):
-    """Run `fn` to update `var` using inputs mirrored to the same devices.
-
-    If `var` is mirrored across multiple devices, then this implements
-    logic like:
-
-    ```
-    results = {}
-    for device, v in var:
-      with tf.device(device):
-        # *args and **kwargs will be unwrapped if they are mirrored.
-        results[device] = fn(v, *args, **kwargs)
-    return merged(results)
-    ```
-
-    Otherwise this returns `fn(var, *args, **kwargs)` colocated with `var`.
-
-    Neither `*args` nor `**kwargs` may contain per-device values.
-    If they contain mirrored values, they will be unwrapped before
-    calling `fn`.
-
-    Args:
-      var: Variable, possibly mirrored to multiple devices, to operate on.
-      fn: Function to call. Should take the variable as the first argument.
-      *args: Additional positional arguments to pass to `fn()`.
-      **kwargs: Keyword arguments to pass to `fn()`. If "grouped=False" is
-        specified, the return value will be unwrapped.
-
-    Returns:
-      By default, the merged return value of `fn` across all towers.  The merged
-      result has dependencies to make sure that if it is evaluated at all, the
-      side effects (updates) will happen on every tower. If instead
-      "grouped=False" is specified, this function will return a nest of lists
-      where each list has an element per tower, and the caller is responsible
-      for ensuring all elements are executed.
-    """
-    _require_cross_tower_context(self)
-    options = {"grouped": kwargs.pop("grouped", True)}
-    return self._update(var, options, fn, *args, **kwargs)
-
-  def _update(self, var, options, fn, *args, **kwargs):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
-    """Runs `fn(*args, **kwargs)` on `colocate_with` devices.
-
-    Args:
-      colocate_with: The return value of `non_slot_devices()`.
-      fn: Function to execute.
-      *args: Positional arguments to pass to `fn()`.
-      **kwargs: Keyword arguments to pass to `fn()`. If "grouped=False" is
-        specified, the return value will be unwrapped and the caller is
-        responsible for ensuring all elements are executed.
-
-    Returns:
-      Return value of `fn`, possibly merged across devices.
-    """
-    _require_cross_tower_context(self)
-    options = {"grouped": kwargs.pop("grouped", True)}
-    return self._update_non_slot(colocate_with, options, fn, *args, **kwargs)
-
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def unwrap(self, value):
-    """Returns the list of all per-device values contained in `value`.
-
-    Args:
-      value: A value returned by `call_for_each_tower()` or a variable
-        created in `scope()`.
-
-    Returns:
-      A list of values contained in `value`. If `value` represents a single
-      value, this returns `[value].`
-    """
-    return self._unwrap(value)
-
-  def value_container(self, value):
-    """Returns the container that this per-device `value` belongs to.
-
-    Args:
-      value: A value returned by `call_for_each_tower()` or a variable
-        created in `scope()`.
-
-    Returns:
-      A container that `value` belongs to.
-      If value does not belong to any container (including the case of
-      container having been destroyed), returns the value itself.
-      `value in unwrap(value_container(value))` will always be true.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  def _unwrap(self, distributed_value):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def group(self, value, name=None):
-    """Shortcut for `tf.group(distribution.unwrap(value))`."""
-    value = nest.flatten(self.unwrap(value))
-
-    if len(value) != 1 or name is not None:
-      return control_flow_ops.group(value, name=name)
-    # Special handling for the common case of one op.
-    v, = value
-    if hasattr(v, "op"):
-      v = v.op
-    return v
-
-  @property
-  def is_single_tower(self):
-    """Returns whether there is a single tower or multiple.
-
-    Returns:
-      A boolean. If `True`, `call_for_each_tower(fn)` will only call `fn` once.
-      If `False`, `call_for_each_tower(fn)` may call `fn` multiple times.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def require_static_shapes(self):
-    return self._require_static_shapes
-
-  @property
-  def num_towers(self):
-    """Returns number of towers, for purposes of averaging across towers."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def worker_devices(self):
-    """Returns the list of devices used to run `call_for_each_tower()` calls."""
-    # TODO(josh11b): More docstring
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def parameter_devices(self):
-    """Returns the list of devices used for variable and `update` placement."""
-    # TODO(josh11b): More docstring
-    raise NotImplementedError("must be implemented in descendants")
-
-  def non_slot_devices(self, var_list):
-    """Device(s) for non-slot variables.
-
-    Create variables on these devices in a
-    `with colocate_vars_with(non_slot_devices(...)):` block.
-    Update those using `update_non_slot()`.
-
-    Args:
-      var_list: The list of variables being optimized, needed with the
-        default `DistributionStrategy`.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def worker_device_index(self):
-    """An object mapping worker device to an id.
-
-    This might be passed as an argument to `call_for_each_tower()`, as in:
-
-    ```
-    with distribution_strategy.scope():
-
-      def fn(device_id):
-        # device_id is an integer. `fn` is being executed on device:
-        #    distribution_strategy.worker_devices[device_id].
-
-      distribution_strategy.call_for_each_tower(
-          fn, distribution_strategy.worker_device_index)
-    ```
-
-    Returns:
-      An index object, or the integer 0 if there is only a single tower.
-    """
-    _require_cross_tower_context(self)
-    return self._worker_device_index()
-
-  def _worker_device_index(self):
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def between_graph(self):
-    """Whether the strategy uses between-graph replication or not.
-
-      This is expected to return a constant value that will not be changed
-      throughout its life cycle.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
-    """Configures the strategy class."""
-    del session_config, cluster_spec, task_type, task_id
-
-  @property
-  def should_init(self):
-    """Whether initialization is needed."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def should_checkpoint(self):
-    """Whether checkpointing is needed."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def should_save_summary(self):
-    """Whether saving summaries is needed."""
-    raise NotImplementedError("must be implemented in descendants")
-
-
-# A note about the difference between the context managers
-# `TowerContext` (defined here) and `_CurrentDistributionContext`
-# (defined above) used by `DistributionStrategy.scope()`:
-#
-# * a TowerContext is only present during a `call_for_each_tower()`
-#   call (except during a `merge_run` call) and in such a scope it
-#   will be returned by calls to `get_tower_context()`.  Implementers of new
-#   DistributionStrategy descendants will frequently also need to
-#   define a descendant of TowerContext, and are responsible for
-#   entering and exiting this context.
-#
-# * DistributionStrategy.scope() sets up a variable_creator scope that
-#   changes variable creation calls (e.g. to make mirrored
-#   variables). This is intended as an outer scope that users enter once
-#   around their model creation and graph definition. There is no
-#   anticipated need to define descendants of _CurrentDistributionContext.
-#   It sets the current DistributionStrategy for purposes of
-#   `get_distribution_strategy()` and `has_distribution_strategy()`
-#   and switches the thread mode to a "cross-tower context".
-class TowerContext(object):
-  """DistributionStrategy API inside a `call_for_each_tower()` call."""
-
-  def __init__(self, distribution_strategy, tower_id):
-    self._distribution_strategy = distribution_strategy
-    self._thread_context = distribution_strategy_context._InTowerThreadMode(  # pylint: disable=protected-access
-        self)
-    self._tower_id = tower_id
-
-  def __enter__(self):
-    _push_per_thread_mode(self._thread_context)
-
-  def __exit__(self, exception_type, exception_value, traceback):
-    _pop_per_thread_mode()
-
-  def merge_call(self, merge_fn, *args, **kwargs):
-    """Merge args across towers and run `merge_fn` in a cross-tower context.
-
-    This allows communication and coordination when there are multiple calls
-    to a model function triggered by a call to
-    `distribution.call_for_each_tower(model_fn, ...)`.
-
-    See `MirroredDistribution.call_for_each_tower()` for an explanation.
-
-    Otherwise, this is equivalent to:
-
-    ```
-    distribution = get_distribution_strategy()
-    with cross-tower-context(distribution):
-      return merge_fn(distribution, *args, **kwargs)
-    ```
-
-    Args:
-      merge_fn: function that joins arguments from threads that are given as
-        PerDevice. It accepts `DistributionStrategy` object as the first
-        argument.
-      *args: positional per-thread arguments for `merge_fn`
-      **kwargs: keyword per-thread arguments for `merge_fn`.
-
-    Returns:
-      The return value of `merge_fn`, except for `PerDevice` values which are
-      unpacked.
-    """
-    require_tower_context(self)
-    return self._merge_call(merge_fn, *args, **kwargs)
-
-  def _merge_call(self, merge_fn, *args, **kwargs):
-    """Default implementation for single tower."""
-    _push_per_thread_mode(  # thread-local, so not needed with multiple threads
-        distribution_strategy_context._CrossTowerThreadMode(  # pylint: disable=protected-access
-            self._distribution_strategy))
-    try:
-      return merge_fn(self._distribution_strategy, *args, **kwargs)
-    finally:
-      _pop_per_thread_mode()
-
-  @property
-  def is_single_tower(self):
-    """Returns whether there is a single tower or multiple."""
-    require_tower_context(self)
-    return self._distribution_strategy.is_single_tower
-
-  @property
-  def num_towers(self):
-    """Returns number of towers, for purposes of averaging across towers."""
-    return self._distribution_strategy.num_towers
-
-  @property
-  def tower_id(self):
-    """Which tower is being defined, a number from 0 to `num_towers - 1`."""
-    require_tower_context(self)
-    return self._tower_id
-
-  @property
-  def distribution_strategy(self):
-    """The current `DistributionStrategy` object."""
-    return self._distribution_strategy
-
-  @property
-  def device(self):
-    """The device this tower is to be executed on, as a string."""
-    require_tower_context(self)
-    return device_util.current()
-
-  # TODO(josh11b): Implement `start_all_reduce(method, t)` for efficient
-  # all-reduce. It would return a function returning the result of reducing `t`
-  # across all towers. The caller would wait to call this function until they
-  # needed the reduce result, allowing an efficient implementation:
-  # * With eager execution, the reduction could be performed asynchronously
-  #   in the background, not blocking until the result was needed.
-  # * When constructing a graph, it could batch up all reduction requests up
-  #   to that point that the first result is needed. Most likely this can be
-  #   implemented in terms of `merge_call()` and `batch_reduce()`.
-
-# ------------------------------------------------------------------------------
-
-
-class _DefaultDistributionStrategy(DistributionStrategy):
-  """Default `DistributionStrategy` if none is explicitly selected."""
-
-  def scope(self):
-    """Context manager setting a variable creator and `self` as current."""
-    if distribution_strategy_context.has_distribution_strategy():
-      raise RuntimeError("Must not nest DistributionStrategy scopes.")
-
-    def creator(next_creator, *args, **kwargs):
-      _require_distribution_strategy_scope(self)
-      return next_creator(*args, **kwargs)
-
-    return _CurrentDistributionContext(
-        self, variable_scope.variable_creator_scope(creator))
-
-  def colocate_vars_with(self, colocate_with_variable):
-    """Does not require `self.scope`."""
-    _require_distribution_strategy_scope(self)
-    return ops.colocate_with(colocate_with_variable)
-
-  def distribute_dataset(self, dataset_fn):
-    return self._call_dataset_fn(dataset_fn)
-
-  def _broadcast(self, tensor, destinations):
-    if destinations is None:
-      return tensor
-    else:
-      raise NotImplementedError("TODO")
-
-  def _call_for_each_tower(self, fn, *args, **kwargs):
-    # We don't run `fn` in multiple threads in _DefaultDistributionStrategy.
-    kwargs.pop("run_concurrently", None)
-    with TowerContext(self, tower_id=0):
-      return fn(*args, **kwargs)
-
-  def _reduce(self, aggregation, value, destinations):
-    # TODO(josh11b): Use destinations?
-    del aggregation, destinations
-    return value
-
-  def _update(self, var, options, fn, *args, **kwargs):
-    # The implementations of _update() and _update_non_slot() are identical
-    # except _update() passes `var` as the first argument to `fn()`.
-    return self._update_non_slot(var, options, fn, var, *args, **kwargs)
-
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-    # TODO(josh11b): Figure out what we should be passing to UpdateContext()
-    # once that value is used for something.
-    with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
-      result = fn(*args, **kwargs)
-      if should_group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  def read_var(self, tower_local_var):
-    return array_ops.identity(tower_local_var)
-
-  def _unwrap(self, distributed_value):
-    return [distributed_value]
-
-  def value_container(self, value):
-    return value
-
-  @property
-  def is_single_tower(self):
-    return True
-
-  @property
-  def num_towers(self):
-    return 1
-
-  @property
-  def worker_devices(self):
-    raise RuntimeError(
-        "worker_devices() method unsupported by _DefaultDistributionStrategy.")
-
-  @property
-  def parameter_devices(self):
-    raise RuntimeError("parameter_devices() method unsupported by "
-                       "_DefaultDistributionStrategy.")
-
-  def non_slot_devices(self, var_list):
-    return min(var_list, key=lambda x: x.name)
-
-  def _worker_device_index(self):
-    raise RuntimeError("worker_device_index() method unsupported by "
-                       "_DefaultDistributionStrategy.")
-
-
-# ------------------------------------------------------------------------------
-# Deprecated, use v.assign_add(amount) instead.  Internal API, so expect
-# it to be deleted soon.
-
-
-@deprecation.deprecated(None,
-                        "Use v.assign_add(amount) instead. You may need to set "
-                        "aggregation=tf.VariableAggregation.ONLY_FIRST_TOWER "
-                        "when creating the variable.")
-def increment_var(v, amount=1):
-  """`v += amount`, distributed-aware version."""
-  def update(vu):
-    return vu.assign_add(amount, read_value=False)
-
-  def merge_fn(dist, vm):
-    return dist.update(vm, update)
-
-  tower_context = distribution_strategy_context.get_tower_context()
-  return tower_context.merge_call(merge_fn, v)
-
-
-# ------------------------------------------------------------------------------
-# We haven't yet implemented deserialization for DistributedVariables.
-# So here we catch any attempts to deserialize variables
-# when using distribution strategies.
-# pylint: disable=protected-access
-_original_from_proto = resource_variable_ops._from_proto_fn
-
-
-def _from_proto_fn(v, import_scope=None):
-  if distribution_strategy_context.has_distribution_strategy():
-    raise NotImplementedError(
-        "Deserialization of variables is not yet supported when using"
-        "distributed strategies.")
-  else:
-    return _original_from_proto(v, import_scope=import_scope)
-
-resource_variable_ops._from_proto_fn = _from_proto_fn
-# pylint: enable=protected-access
-
-
-#-------------------------------------------------------------------------------
-# Shorthand for some methods from distribution_strategy_context.
-_push_per_thread_mode = distribution_strategy_context._push_per_thread_mode  # pylint: disable=protected-access
-_get_per_thread_mode = distribution_strategy_context._get_per_thread_mode  # pylint: disable=protected-access
-_pop_per_thread_mode = distribution_strategy_context._pop_per_thread_mode  # pylint: disable=protected-access
+# pylint: disable=wildcard-import
+from tensorflow.python.distribute.distribute_lib import *
diff --git a/tensorflow/python/training/distribute_test.py b/tensorflow/python/training/distribute_test.py
deleted file mode 100644
index f03bd3910055d3022e5dc4d22ebb5ffc1a19cef8..0000000000000000000000000000000000000000
--- a/tensorflow/python/training/distribute_test.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Test DistributionStrategy, TowerContext, and supporting APIs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import test
-from tensorflow.python.training import distribute
-from tensorflow.python.training import distribution_strategy_context
-
-
-class _TestTowerContext(distribute.TowerContext):
-
-  def merge_call(self, fn, *args, **kwargs):
-    return kwargs["test_arg"]
-
-
-def _get_test_variable(name, synchronization, aggregation):
-  return {
-      "name": name,
-      "synchronization": synchronization,
-      "aggregation": aggregation
-  }
-
-
-class _TestStrategy(distribute.DistributionStrategy):
-
-  def _call_for_each_tower(self, fn, *args, **kwargs):
-    with _TestTowerContext(self, tower_id=0):
-      return fn(*args, **kwargs)
-
-  def _create_variable(self, next_creator, *args, **kwargs):
-    return _get_test_variable(kwargs["name"], kwargs["synchronization"],
-                              kwargs["aggregation"])
-
-
-def _assert_in_default_state(t):
-  t.assertIs(distribution_strategy_context._get_default_tower_context(),
-             distribution_strategy_context.get_tower_context())
-  t.assertIs(None, distribution_strategy_context.get_cross_tower_context())
-  t.assertIs(distribution_strategy_context._get_default_distribution_strategy(),
-             distribution_strategy_context.get_distribution_strategy())
-  t.assertFalse(distribution_strategy_context.has_distribution_strategy())
-
-
-class TestStrategyTest(test.TestCase):
-
-  def testCallForEachTower(self):
-    _assert_in_default_state(self)
-    dist = _TestStrategy()
-
-    def run_fn():
-      tower_context = distribution_strategy_context.get_tower_context()
-      self.assertTrue(tower_context is not None)
-      self.assertIs(None,
-                    distribution_strategy_context.get_cross_tower_context())
-      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_distribution_strategy())
-      self.assertEqual("foo", tower_context.merge_call(None, test_arg="foo"))
-      expected_value = _get_test_variable(
-          "bar", variable_scope.VariableSynchronization.AUTO,
-          variable_scope.VariableAggregation.NONE)
-      self.assertDictEqual(expected_value,
-                           variable_scope.variable(1.0, name="bar"))
-
-    with self.assertRaises(RuntimeError):
-      dist.call_for_each_tower(run_fn)
-    with dist.scope():
-      dist.call_for_each_tower(run_fn)
-    _assert_in_default_state(self)
-
-  def testScope(self):
-    _assert_in_default_state(self)
-    dist = _TestStrategy()
-    with dist.scope():
-      self.assertIs(None, distribution_strategy_context.get_tower_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_cross_tower_context())
-      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_distribution_strategy())
-      expected_value = _get_test_variable(
-          "baz", variable_scope.VariableSynchronization.AUTO,
-          variable_scope.VariableAggregation.NONE)
-      self.assertDictEqual(expected_value,
-                           variable_scope.variable(1.0, name="baz"))
-    _assert_in_default_state(self)
-
-  def testSettingSynchronizationAndAggregation(self):
-    _assert_in_default_state(self)
-    dist = _TestStrategy()
-    with dist.scope():
-      expected_value = _get_test_variable(
-          "baz", variable_scope.VariableSynchronization.ON_WRITE,
-          variable_scope.VariableAggregation.MEAN)
-      self.assertDictEqual(
-          expected_value,
-          variable_scope.variable(
-              1.0,
-              name="baz",
-              synchronization=variable_scope.VariableSynchronization.ON_WRITE,
-              aggregation=variable_scope.VariableAggregation.MEAN))
-    _assert_in_default_state(self)
-
-
-class DefaultDistributionStrategyTest(test.TestCase):
-
-  def testMergeCall(self):
-    _assert_in_default_state(self)
-
-    def merge_fn(dist, s):
-      self.assertIs(
-          distribution_strategy_context._get_default_distribution_strategy(),
-          dist)
-      self.assertIs(None, distribution_strategy_context.get_tower_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_cross_tower_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_distribution_strategy())
-      self.assertFalse(
-          distribution_strategy_context.has_distribution_strategy())
-      return "foo_" + s
-
-    tower_ctx = distribution_strategy_context.get_tower_context()
-    self.assertIs(distribution_strategy_context._get_default_tower_context(),
-                  tower_ctx)
-    self.assertEqual("foo_bar", tower_ctx.merge_call(merge_fn, "bar"))
-    _assert_in_default_state(self)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/training/distribution_strategy_context.py b/tensorflow/python/training/distribution_strategy_context.py
index ce580a406f750d6194673fd2d8db166e6a8e2ae5..7391bf3b22dfd1a6f1b76e287132828fcc570c67 100644
--- a/tensorflow/python/training/distribution_strategy_context.py
+++ b/tensorflow/python/training/distribution_strategy_context.py
@@ -12,194 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utility to get distribution strategy related contexts."""
+"""Deprecated, please use ../distribute/distribution_strategy_context.py."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
-from tensorflow.python.util.lazy_loader import LazyLoader
-
-
-# There is a circular dependency between this and `distribute` module. So we
-# load it lazily to workaround this.
-distribute_lib = LazyLoader(
-    "distribute_lib", globals(),
-    "tensorflow.python.training.distribute")
-
-# ------------------------------------------------------------------------------
-# Internal API for setting the current thread mode as being either in a
-# tower or cross-tower context for a particular distribution strategy.
-
-
-class _ThreadMode(object):
-
-  def __init__(self, dist, cross, tower):
-    self.distribution_strategy = dist
-    self.cross_tower_context = cross
-    self.tower_context = tower
-
-
-class _CrossTowerThreadMode(_ThreadMode):
-
-  def __init__(self, distribution_strategy):
-    _ThreadMode.__init__(
-        self, distribution_strategy, distribution_strategy, None)
-
-
-class _InTowerThreadMode(_ThreadMode):
-
-  def __init__(self, tower_ctx):
-    _ThreadMode.__init__(
-        self, tower_ctx.distribution_strategy, None, tower_ctx)
-
-
-def _push_per_thread_mode(context):
-  ops.get_default_graph()._distribution_strategy_stack.append(context)  # pylint: disable=protected-access
-
-
-def _pop_per_thread_mode():
-  ops.get_default_graph()._distribution_strategy_stack.pop(-1)  # pylint: disable=protected-access
-
-
-class _DefaultTowerThreadMode(_ThreadMode):
-  """Type of default value returned by `_get_per_thread_mode()`.
-
-  Used when the thread-local stack is empty.
-  """
-
-  def __init__(self):
-    _ThreadMode.__init__(self, _get_default_distribution_strategy(), None,
-                         _get_default_tower_context())
-
-
-def _get_per_thread_mode():
-  try:
-    return ops.get_default_graph()._distribution_strategy_stack[-1]  # pylint: disable=protected-access
-  except (AttributeError, IndexError):
-    return _get_default_tower_mode()
-
-
-# ------------------------------------------------------------------------------
-# Public API for accessing the current thread mode
-
-
-def get_tower_context():
-  """Returns the current TowerContext or None if in a cross-tower context.
-
-  Note that execution:
-
-  1. starts in the default (single-tower) tower context (this function
-     will return the default TowerContext object);
-  2. switches to cross-tower context (in which case this will return
-     None) when entering a `with DistributionStrategy.scope():` block;
-  3. switches to a (non-default) tower context inside
-     `call_for_each_tower(fn, ...)`;
-  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
-     inside `merge_fn` you are back in the cross-tower context (and again
-     this function will return None).
-
-  Note that you can also go directly from step 1 to 4 to switch to a
-  cross-tower context for the default `DistributionStrategy`. You may
-  also switch from the cross-tower context of 4 to a tower context by
-  calling `call_for_each_tower()`, jumping back to step 3.
-
-  Most `DistributionStrategy` methods may only be executed in
-  a cross-tower context, in a tower context you should use the
-  `TowerContext` API instead.
-
-  Returns:
-    The current `TowerContext` object when in a tower context scope, else None.
-
-    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
-    will return None in a particular block.
-  """
-  return _get_per_thread_mode().tower_context
-
-
-def get_cross_tower_context():
-  """Returns the current DistributionStrategy if in a cross-tower context.
-
-  Note that execution:
-
-  1. starts in the default (single-tower) tower context;
-  2. switches to cross-tower context when entering a
-     `with DistributionStrategy.scope():` block;
-  3. switches to a (non-default) tower context inside
-     `call_for_each_tower(fn, ...)`;
-  4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
-     inside `merge_fn` you are back in the cross-tower context.
-
-  Note that you can also go directly from step 1 to 4 to switch to a
-  cross-tower context for the default `DistributionStrategy`. You may
-  also switch from the cross-tower context of 4 to a tower context by
-  calling `call_for_each_tower()`, jumping back to step 3.
-
-  Most `DistributionStrategy` methods may only be executed in
-  a cross-tower context.
-
-  Returns:
-    Returns the current `DistributionStrategy` object in a cross-tower
-    context, or None.
-
-    Exactly one of `get_tower_context()` and `get_cross_tower_context()`
-    will return None in a particular block.
-  """
-  return _get_per_thread_mode().cross_tower_context
-
-
-def get_distribution_strategy():
-  """Returns the current `DistributionStrategy` object.
-
-  Prefer to use `get_tower_context()` or `get_cross_tower_context()`
-  instead when possible.
-
-  Returns:
-    A `DistributionStrategy` object. Inside a
-    `with distribution_strategy.scope()` block, it returns
-    `distribution_strategy`, otherwise it returns the default
-    (single-tower) `DistributionStrategy` object.
-  """
-  return _get_per_thread_mode().distribution_strategy
-
-
-def has_distribution_strategy():
-  """Return if there is a current non-default `DistributionStrategy`.
-
-  Returns:
-    True if inside a `with distribution_strategy.scope():`.
-  """
-  return get_distribution_strategy() is not _get_default_distribution_strategy()
-
-
-# ------------------------------------------------------------------------------
-# Defaults that are used when no distribution strategy is explicitly created.
-# We create them lazily in a function so that we can workaround the circular
-# dependency on distribute_lib. See lazy loader at the top of this file.
-
-_defaults = {
-    "distribution_strategy": None,
-    "tower_context": None,
-    "tower_mode": None
-}
-
-
-def _get_default_distribution_strategy():
-  if _defaults["distribution_strategy"] is None:
-    _defaults["distribution_strategy"] = (
-        distribute_lib._DefaultDistributionStrategy())  # pylint: disable=protected-access
-  return _defaults["distribution_strategy"]
-
-
-def _get_default_tower_context():
-  if _defaults["tower_context"] is None:
-    _defaults["tower_context"] = distribute_lib.TowerContext(
-        _get_default_distribution_strategy(), tower_id=0)
-  return _defaults["tower_context"]
-
-
-def _get_default_tower_mode():
-  if _defaults["tower_mode"] is None:
-    _defaults["tower_mode"] = _DefaultTowerThreadMode()
-  return _defaults["tower_mode"]
+# pylint: disable=wildcard-import
+from tensorflow.python.distribute.distribution_strategy_context import *
diff --git a/tensorflow/python/training/evaluation.py b/tensorflow/python/training/evaluation.py
index 2c4eb02d533201cd1be2ea655c8823198dd714d5..a10178f8cfe3af1ac45a5084b8e16abe1beee267 100644
--- a/tensorflow/python/training/evaluation.py
+++ b/tensorflow/python/training/evaluation.py
@@ -230,7 +230,7 @@ def _evaluate_once(checkpoint_path,
   hooks = list(hooks or [])
 
   if eval_ops is not None:
-    if any([isinstance(h, _MultiStepStopAfterNEvalsHook) for h in hooks]):
+    if any(isinstance(h, _MultiStepStopAfterNEvalsHook) for h in hooks):
       steps_per_run_variable = \
           basic_session_run_hooks.get_or_create_steps_per_run_variable()
       update_eval_step = state_ops.assign_add(
diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py
index 832c10d454e6083be9715ef0af4642ad3e936f97..a2ef3c76b4e79b0ddefd26fcc54fb1afa27a94dd 100644
--- a/tensorflow/python/training/ftrl.py
+++ b/tensorflow/python/training/ftrl.py
@@ -25,7 +25,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.FtrlOptimizer")
+@tf_export(v1=["train.FtrlOptimizer"])
 class FtrlOptimizer(optimizer.Optimizer):
   """Optimizer that implements the FTRL algorithm.
 
@@ -52,6 +52,9 @@ class FtrlOptimizer(optimizer.Optimizer):
     Args:
       learning_rate: A float value or a constant float `Tensor`.
       learning_rate_power: A float value, must be less or equal to zero.
+        Controls how the learning rate decreases during training. Use zero for
+        a fixed learning rate. See section 3.1 in the
+        [paper](https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf).
       initial_accumulator_value: The starting value for accumulators.
         Only zero or positive values are allowed.
       l1_regularization_strength: A float value, must be greater than or
diff --git a/tensorflow/python/training/ftrl_test.py b/tensorflow/python/training/ftrl_test.py
index 15c50bc8788c3939a135920b8f917a2bb46f3ceb..39b299c64a35a907859416961fb72932423c18e3 100644
--- a/tensorflow/python/training/ftrl_test.py
+++ b/tensorflow/python/training/ftrl_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -54,7 +55,7 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllClose([0.0, 0.0], v0_val)
         self.assertAllClose([0.0, 0.0], v1_val)
 
@@ -62,18 +63,21 @@ class FtrlOptimizerTest(test.TestCase):
         for _ in range(3):
           update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-2.60260963, -4.29698515]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.28432083, -0.56694895]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithoutRegularization(self):
     self.doTestFtrlwithoutRegularization(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceFtrlWithoutRegularization(self):
     self.doTestFtrlwithoutRegularization(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testFtrlwithoutRegularization2(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session() as sess:
@@ -90,19 +94,20 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run 3 steps FTRL
         for _ in range(3):
           update.run()
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-2.55607247, -3.98729396]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.28232238, -0.56096673]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -113,12 +118,15 @@ class FtrlOptimizerTest(test.TestCase):
         sgd_op = ftrl.FtrlOptimizer(1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType([[0, 1]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0, 1]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL1(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session() as sess:
@@ -135,19 +143,20 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run 10 steps FTRL
         for _ in range(10):
           update.run()
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-7.66718769, -10.91273689]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.93460727, -1.86147261]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL1_L2(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session() as sess:
@@ -164,7 +173,7 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
@@ -172,12 +181,13 @@ class FtrlOptimizerTest(test.TestCase):
         for _ in range(10):
           update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.24059935, -0.46829352]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.02406147, -0.04830509]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL1_L2_L2Shrinkage(self):
     """Test the new FTRL op with support for l2 shrinkage.
 
@@ -201,7 +211,7 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
@@ -209,12 +219,13 @@ class FtrlOptimizerTest(test.TestCase):
         for _ in range(10):
           update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.22578995, -0.44345796]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.14378493, -0.13229476]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL1_L2_L2ShrinkageSparse(self):
     """Tests the new FTRL op with support for l2 shrinkage on sparse grads."""
     for dtype in [dtypes.half, dtypes.float32]:
@@ -237,7 +248,7 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([[1.0], [2.0]], v0_val)
         self.assertAllCloseAccordingToType([[4.0], [3.0]], v1_val)
 
@@ -245,10 +256,11 @@ class FtrlOptimizerTest(test.TestCase):
         for _ in range(10):
           update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([[-0.22578995], [2.]], v0_val)
         self.assertAllCloseAccordingToType([[4.], [-0.13229476]], v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self):
     """Verifies that l2 shrinkage in FTRL does not change lr schedule."""
     for dtype in [dtypes.half, dtypes.float32]:
@@ -273,7 +285,7 @@ class FtrlOptimizerTest(test.TestCase):
         update1 = opt1.apply_gradients([(grads1, var1)])
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([1.0, 2.0], v1_val)
 
@@ -282,12 +294,12 @@ class FtrlOptimizerTest(test.TestCase):
           update0.run()
           update1.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         # var0 is experiencing L2 shrinkage so it should be smaller than var1
         # in magnitude.
         self.assertTrue((v0_val**2 < v1_val**2).all())
-        accum0 = list(sess.run(opt0._slots)["accum"].values())[0]
-        accum1 = list(sess.run(opt1._slots)["accum"].values())[0]
+        accum0 = list(self.evaluate(opt0._slots)["accum"].values())[0]
+        accum1 = list(self.evaluate(opt1._slots)["accum"].values())[0]
         # L2 shrinkage should not change how we update grad accumulator.
         self.assertAllCloseAccordingToType(accum0, accum1)
 
@@ -311,7 +323,7 @@ class FtrlOptimizerTest(test.TestCase):
     variables.global_variables_initializer().run()
 
     sess = ops.get_default_session()
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     if is_sparse:
       self.assertAllCloseAccordingToType([[0.0], [0.0]], v0_val)
       self.assertAllCloseAccordingToType([[0.0], [0.0]], v1_val)
@@ -323,7 +335,7 @@ class FtrlOptimizerTest(test.TestCase):
     for _ in range(steps):
       update.run()
 
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     return v0_val, v1_val
 
   # When variables are initialized with Zero, FTRL-Proximal has two properties:
@@ -333,6 +345,7 @@ class FtrlOptimizerTest(test.TestCase):
   # with Adagrad.
   # So, basing on these two properties, we test if our implementation of
   # FTRL-Proximal performs same updates as Adagrad or GradientDescent.
+  @test_util.run_deprecated_v1
   def testEquivAdagradwithoutRegularization(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session():
@@ -353,6 +366,7 @@ class FtrlOptimizerTest(test.TestCase):
       self.assertAllCloseAccordingToType(val0, val2)
       self.assertAllCloseAccordingToType(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivSparseAdagradwithoutRegularization(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session():
@@ -376,6 +390,7 @@ class FtrlOptimizerTest(test.TestCase):
       self.assertAllCloseAccordingToType(val0, val2)
       self.assertAllCloseAccordingToType(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivSparseGradientDescentwithoutRegularization(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session():
@@ -399,6 +414,7 @@ class FtrlOptimizerTest(test.TestCase):
       self.assertAllCloseAccordingToType(val0, val2)
       self.assertAllCloseAccordingToType(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivGradientDescentwithoutRegularization(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session():
diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py
index ef50f6315dd623647e000b9b713d3ae557c31427..1a527345ef6bdbefa1e2b2a679fa1d0072c3e515 100644
--- a/tensorflow/python/training/gradient_descent.py
+++ b/tensorflow/python/training/gradient_descent.py
@@ -26,7 +26,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.GradientDescentOptimizer")
+@tf_export(v1=["train.GradientDescentOptimizer"])
 class GradientDescentOptimizer(optimizer.Optimizer):
   """Optimizer that implements the gradient descent algorithm.
   """
diff --git a/tensorflow/python/training/gradient_descent_test.py b/tensorflow/python/training/gradient_descent_test.py
index 1ddea598e52b3b86b821553b0cc74674fe5389d5..5a6c5cfa7470d66c3710ba11ad0ae8772234d2c9 100644
--- a/tensorflow/python/training/gradient_descent_test.py
+++ b/tensorflow/python/training/gradient_descent_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -35,6 +36,7 @@ from tensorflow.python.training import gradient_descent
 
 class GradientDescentOptimizerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -47,17 +49,18 @@ class GradientDescentOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
         self.assertEqual(0, len(optimizer.variables()))
 
+  @test_util.run_deprecated_v1
   def testBasicResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -73,16 +76,17 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         resources.initialize_resources([var0, var1]).run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testBasicCallableParams(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -99,16 +103,17 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         resources.initialize_resources([var0, var1]).run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testMinimizeResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -124,17 +129,18 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         resources.initialize_resources([var0, var1]).run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
         np_grad = 2 * np_pred
         self.assertAllCloseAccordingToType(
-            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -151,17 +157,18 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
         np_grad = 2 * np_pred
         self.assertAllCloseAccordingToType(
-            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -174,16 +181,17 @@ class GradientDescentOptimizerTest(test.TestCase):
             lrate).apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testGradWrtRef(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -193,8 +201,9 @@ class GradientDescentOptimizerTest(test.TestCase):
         grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
         variables.global_variables_initializer().run()
         for grad, _ in grads_and_vars:
-          self.assertAllCloseAccordingToType([1.0], grad.eval())
+          self.assertAllCloseAccordingToType([1.0], self.evaluate(grad))
 
+  @test_util.run_deprecated_v1
   def testWithGlobalStep(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -207,17 +216,18 @@ class GradientDescentOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params and global_step
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-        self.assertAllCloseAccordingToType(1, global_step.eval())
+                                           self.evaluate(var1))
+        self.assertAllCloseAccordingToType(1, self.evaluate(global_step))
 
+  @test_util.run_deprecated_v1
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -237,15 +247,15 @@ class GradientDescentOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([[3.0], [4.0]], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
   def testCapturingInDefunWhileExecutingEagerly(self):
     with context.eager_mode():
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index eb131ac9f7c5ce8c8c6efe30ccda6dce2808cd43..d3441a2f872681c4563a58c0f4808ec8be708bbc 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -56,7 +56,9 @@ _restore_sparse = sparse_ops._take_many_sparse_from_tensors_map
 # pylint: enable=protected-access
 
 
-@tf_export("io.match_filenames_once", "train.match_filenames_once")
+@tf_export(
+    "io.match_filenames_once",
+    v1=["io.match_filenames_once", "train.match_filenames_once"])
 @deprecation.deprecated_endpoints("train.match_filenames_once")
 def match_filenames_once(pattern, name=None):
   """Save the list of files matching pattern, so it is only computed once.
@@ -397,7 +399,7 @@ class _SparseMetaData(object):
     """
     self._sparse = sparse
     self._map_op = map_op
-    self._rank = rank
+    self._rank = tensor_shape.Dimension(rank)
 
   def __eq__(self, other):
     if self.sparse != other.sparse:
@@ -510,7 +512,7 @@ def _store_sparse_tensors(tensor_list, enqueue_many, keep_input,
   def _sparse_meta_data(t, storing_op, map_op):
     if not isinstance(t, sparse_tensor.SparseTensor):
       return _SparseMetaData(False, None, None)
-    rank = t.dense_shape.shape.with_rank(1)[0]
+    rank = t.dense_shape.shape.with_rank(1).dims[0]
     if enqueue_many:
       rank -= 1
     # If a shared map_op was provided, use that. Otherwise use the name of
@@ -604,7 +606,7 @@ def _restore_sparse_tensors(stored_list, sparse_info_list):
   tensors = [
       _restore_sparse(sparse_map_op=info.map_op,
                       sparse_handles=array_ops.squeeze(s, [1]),
-                      rank=(info.rank + 1).value)
+                      rank=tensor_shape.dimension_value(info.rank + 1))
       if info.sparse else s
       for (s, info) in zip(stored_list, sparse_info_list)]
   has_st = any(isinstance(x, sparse_tensor.SparseTensor) for x in tensors)
diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py
index a9b05dcc736e7c3178c06424677da7866265bcf7..d89f5f3bbd879a32ab55cf70e366c5c82ef0f266 100644
--- a/tensorflow/python/training/input_test.py
+++ b/tensorflow/python/training/input_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
@@ -41,6 +42,7 @@ from tensorflow.python.util import compat
 
 class MatchFilenamesOnceTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     temp_dir = self.get_temp_dir()
     filenames = [os.path.join(temp_dir, n) for n in os.listdir(temp_dir)]
@@ -58,35 +60,41 @@ class MatchFilenamesOnceTest(test_lib.TestCase):
       one = inp.match_filenames_once(additional[1])
       variables.global_variables_initializer().run()
       variables.local_variables_initializer().run()
-      self.assertItemsEqual(map(compat.as_bytes, filenames), star.eval())
-      self.assertItemsEqual(map(compat.as_bytes, additional), question.eval())
-      self.assertItemsEqual([compat.as_bytes(additional[1])], one.eval())
+      self.assertItemsEqual(
+          map(compat.as_bytes, filenames), self.evaluate(star))
+      self.assertItemsEqual(
+          map(compat.as_bytes, additional), self.evaluate(question))
+      self.assertItemsEqual([compat.as_bytes(additional[1])],
+                            self.evaluate(one))
 
 
 class LimitEpochsTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoLimit(self):
     with self.cached_session():
       seven = constant_op.constant(7)
       seven_forever = inp.limit_epochs(seven)
       variables.local_variables_initializer().run()
       for _ in range(100):
-        self.assertEqual(7, seven_forever.eval())
+        self.assertEqual(7, self.evaluate(seven_forever))
 
+  @test_util.run_deprecated_v1
   def testLimit(self):
     with self.cached_session():
       love_me = constant_op.constant("Love Me")
       love_me_two_times = inp.limit_epochs(love_me, num_epochs=2)
       variables.global_variables_initializer().run()
       variables.local_variables_initializer().run()
-      self.assertEqual(b"Love Me", love_me_two_times.eval())
-      self.assertEqual(b"Love Me", love_me_two_times.eval())
+      self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
+      self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
       with self.assertRaises(errors_impl.OutOfRangeError):
-        love_me_two_times.eval()
+        self.evaluate(love_me_two_times)
 
 
 class InputProducerTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoShuffle(self):
     with self.cached_session():
       input_tensor = [[1, 2, 3, 4],
@@ -102,14 +110,16 @@ class InputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      self.assertAllEqual(input_tensor * num_epochs, dequeue_many.eval())
+      self.assertAllEqual(input_tensor * num_epochs,
+                          self.evaluate(dequeue_many))
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testNoShapeInference(self):
     with self.cached_session():
       # Disable shape inference for the input.
@@ -127,14 +137,15 @@ class InputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      self.assertAllEqual(input_value * num_epochs, dequeue_many.eval())
+      self.assertAllEqual(input_value * num_epochs, self.evaluate(dequeue_many))
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testShapeError(self):
     input_tensor = array_ops.placeholder(dtypes.float32, None)
     with self.assertRaisesRegexp(ValueError, "fully defined shape"):
@@ -143,6 +154,7 @@ class InputProducerTest(test_lib.TestCase):
 
 class StringInputProducerTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoShuffle(self):
     with self.cached_session():
       strings = [b"to", b"be", b"or", b"not", b"to", b"be"]
@@ -156,15 +168,16 @@ class StringInputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      output = dequeue_many.eval()
+      output = self.evaluate(dequeue_many)
       self.assertAllEqual(strings * num_epochs, output)
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testShuffle(self):
     with self.cached_session():
       strings = [b"a", b"b", b"c"]
@@ -184,7 +197,7 @@ class StringInputProducerTest(test_lib.TestCase):
       for e in expected:
         frequency[e] = 0
       for _ in range(num_epochs):
-        output = dequeue_many.eval()
+        output = self.evaluate(dequeue_many)
         key = b"".join(output)
         self.assertIn(key, expected)
         frequency[key] += 1
@@ -200,7 +213,7 @@ class StringInputProducerTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
@@ -210,6 +223,7 @@ class StringInputProducerTest(test_lib.TestCase):
       with self.assertRaises(ValueError):
         _ = inp.string_input_producer([])
 
+  @test_util.run_deprecated_v1
   def testNullString(self):
     # Runtime check for empty string list.  This is slightly oblique:
     # The queue runner should die with an assertion error on the null
@@ -224,11 +238,12 @@ class StringInputProducerTest(test_lib.TestCase):
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       coord.request_stop()
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       strings = [b"to", b"be", b"or", b"not", b"to", b"be"]
@@ -237,6 +252,7 @@ class StringInputProducerTest(test_lib.TestCase):
       self.assertProtoEquals("s: 'SHARED_NAME_XYZ'",
                              queue.queue_ref.op.node_def.attr["shared_name"])
 
+  @test_util.run_deprecated_v1
   def testConstructionRace(self):
     with self.cached_session() as sess:
       strings = [b"to", b"be", b"or", b"not", b"to", b"be"]
@@ -252,13 +268,14 @@ class StringInputProducerTest(test_lib.TestCase):
           # writing of the `tf.Graph` object. However, many users
           # write code this way, so we include this test to ensure
           # that we can support it.
-          self.assertEquals(string, sess.run(queue.dequeue()))
+          self.assertEquals(string, self.evaluate(queue.dequeue()))
       coord.request_stop()
       coord.join(threads)
 
 
 class RangeInputProducerTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoShuffle(self):
     with self.cached_session():
       num_epochs = 3
@@ -272,15 +289,16 @@ class RangeInputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      output = dequeue_many.eval()
+      output = self.evaluate(dequeue_many)
       self.assertAllEqual(list(xrange(range_size)) * num_epochs, output)
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testShuffle(self):
     with self.cached_session():
       num_epochs = 200
@@ -300,7 +318,7 @@ class RangeInputProducerTest(test_lib.TestCase):
       for e in expected:
         frequency[e] = 0
       for _ in range(num_epochs):
-        output = dequeue_many.eval()
+        output = self.evaluate(dequeue_many)
         key = 10 * (output[0] + 1) + (output[1] + 1)
         self.assertIn(key, expected)
         frequency[key] += 1
@@ -316,10 +334,11 @@ class RangeInputProducerTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       range_size = 5
@@ -331,6 +350,7 @@ class RangeInputProducerTest(test_lib.TestCase):
 
 class SliceInputProducerTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoShuffle(self):
     with self.cached_session() as sess:
       num_epochs = 3
@@ -344,17 +364,18 @@ class SliceInputProducerTest(test_lib.TestCase):
 
       # No randomness, so just see repeated copies of the input.
       num_items = len(source_strings) * num_epochs
-      output = [sess.run(slices) for _ in range(num_items)]
+      output = [self.evaluate(slices) for _ in range(num_items)]
       out_strings, out_ints = zip(*output)
       self.assertAllEqual(source_strings * num_epochs, out_strings)
       self.assertAllEqual(source_ints * num_epochs, out_ints)
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(slices)
+        self.evaluate(slices)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testShuffle(self):
     with self.cached_session() as sess:
       num_epochs = 1200
@@ -379,7 +400,7 @@ class SliceInputProducerTest(test_lib.TestCase):
       for e in expected:
         frequency[e] = 0
       for _ in range(num_epochs):
-        output = [sess.run(slices) for _ in range(len(source_strings))]
+        output = [self.evaluate(slices) for _ in range(len(source_strings))]
         key = b",".join([s + compat.as_bytes(str(i)) for s, i in output])
         self.assertIn(key, expected)
         frequency[key] += 1
@@ -395,10 +416,11 @@ class SliceInputProducerTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(slices)
+        self.evaluate(slices)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       source_strings = ["A", "B", "D", "G"]
@@ -470,7 +492,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertAllEqual(results[0],
                             np.arange(i * batch_size, (i + 1) * batch_size))
         self.assertAllEqual(
@@ -487,38 +509,43 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThread(self):
     self._testOneThreadHelper(use_dict=False)
 
+  @test_util.run_deprecated_v1
   def testOneThreadDict(self):
     self._testOneThreadHelper(use_dict=True)
 
+  @test_util.run_deprecated_v1
   def testUint32DataTypes(self):
     values = constant_op.constant([0, 1, 2, 3, 4, 5], dtype=dtypes.uint32)
     batched = inp.batch([values], batch_size=2)
     with self.cached_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
-      sess.run(batched)
+      self.evaluate(batched)
       coord.request_stop()
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testUint64DataTypes(self):
     values = constant_op.constant([0, 1, 2, 3, 4, 5], dtype=dtypes.uint64)
     batched = inp.batch([values], batch_size=2)
     with self.cached_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
-      sess.run(batched)
+      self.evaluate(batched)
       coord.request_stop()
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThreadDynamicPad(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -535,7 +562,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         expected_results = np.arange(i * batch_size, (i + 1) * batch_size)
         max_len = expected_results[-1]
         self.assertAllEqual(results[0], expected_results)
@@ -545,10 +572,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThreadEnqueueMany(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -567,7 +595,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual(results[0],
                             np.arange(i * batch_size, (i + 1) * batch_size))
         self.assertAllEqual(
@@ -580,10 +608,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testManyThreads(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -606,7 +635,7 @@ class BatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertAllEqual(results[0], results[1].values)
@@ -620,10 +649,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThreadSmallerBatch(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -647,7 +677,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual(results[0],
                             np.arange(i * batch_size, (i + 1) * batch_size))
         self.assertAllEqual(
@@ -663,7 +693,7 @@ class BatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertAllEqual(results[0],
                           np.arange(num_batches * batch_size,
                                     num_batches * batch_size + extra_elements))
@@ -677,10 +707,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testManyThreadsSmallerBatch(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -705,7 +736,7 @@ class BatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertAllEqual(results[0], results[1].values)
@@ -717,7 +748,7 @@ class BatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       tf_logging.info("Last Batch: %s", results[0])
       self.assertEqual(len(results[0]), extra_elements)
       self.assertAllEqual(results[0], results[1].values)
@@ -732,10 +763,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       batch_size = 10
@@ -753,12 +785,14 @@ class BatchTest(test_lib.TestCase):
           "s: 'SHARED_NAME_XYZ'",
           batched[0].op.inputs[0].op.node_def.attr["shared_name"])
 
+  @test_util.run_deprecated_v1
   def testCannotInferRankError(self):
     with self.cached_session():
       x = array_ops.placeholder(dtype=dtypes.int64)
       with self.assertRaisesRegexp(ValueError, "Cannot infer Tensor's rank"):
         inp.batch([x], batch_size=2)
 
+  @test_util.run_deprecated_v1
   def testBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -766,6 +800,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.batch([sparse], batch_size=2)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -773,6 +808,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.batch([sparse], batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -782,6 +818,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.batch([sparse], batch_size=2)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -791,6 +828,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.batch([sparse], batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testSingleElementDict(self):
     x = inp.batch({"c": [12, 12]}, batch_size=8)
     self.assertAllEqual((8, 2), x["c"].get_shape().as_list())
@@ -823,35 +861,42 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual([0] * batch_size, np.mod(results[0], 2))
         self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2))
         self.assertAllEqual([b"string"] * batch_size, results[2])
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
+  @test_util.run_deprecated_v1
   def testMaybeEnqueuePerExample(self):
     self._testKeepInputHelper(1, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testMultipleThreadMaybeEnqueuePerExample(self):
     self._testKeepInputHelper(5, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepInputVector(self):
     # Can't have vector `keep_input` with `enqueue_many=False`.
     with self.assertRaisesRegexp(ValueError, "`keep_input` cannot be a vector"):
@@ -873,6 +918,7 @@ class BatchTest(test_lib.TestCase):
                       batch_size=1,
                       enqueue_many=True)
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -880,6 +926,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.maybe_batch([sparse], keep_input=True, batch_size=2)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -888,6 +935,7 @@ class BatchTest(test_lib.TestCase):
         [sparse], keep_input=True, batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueManyPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0], [0]], values=[1.0, 2.0], dense_shape=[2])
@@ -896,6 +944,7 @@ class BatchTest(test_lib.TestCase):
         [sparse], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -905,6 +954,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.maybe_batch([sparse], keep_input=True, batch_size=2)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -915,6 +965,7 @@ class BatchTest(test_lib.TestCase):
         [sparse], keep_input=True, batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -925,6 +976,7 @@ class BatchTest(test_lib.TestCase):
         [sparse], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchCorrectValues(self):
     sparse_t = sparse_tensor.SparseTensor(
         indices=[[0, 1], [0, 2], [1, 0], [1, 3]],
@@ -938,7 +990,7 @@ class BatchTest(test_lib.TestCase):
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
-      batched_np = batched.eval()
+      batched_np = self.evaluate(batched)
 
       coord.request_stop()
       for thread in threads:
@@ -1016,7 +1068,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(3, len(results))
         self.assertEqual(batch_size, len(results[0]))
         self.assertEqual(batch_size, len(results[2]))
@@ -1035,8 +1087,11 @@ class BatchJoinTest(test_lib.TestCase):
         self.assertAllEqual([99] * len(which_b),
                             [results[0][i] for i in which_b])
 
-      # Some minimum level of mixing of the results of both threads.
-      self.assertGreater(saw_both, 1)
+      # We'd like to see some minimum level of mixing of the results of both
+      # threads, but we can't rely on fair thread scheduling, so we just log.
+      # self.assertGreater(saw_both, 1)
+      tf_logging.info("testTwoThreads%s saw both count: %s",
+                      "Dict" if use_dict else "", saw_both)
 
       # Verify the order of results from "a" were preserved.
       self.assertAllEqual(all_a, np.arange(num_a))
@@ -1044,16 +1099,19 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
-  def DISABLED_testTwoThreads(self):
+  @test_util.run_deprecated_v1
+  def testTwoThreads(self):
     self._testTwoThreadsHelper(use_dict=False)
 
-  def DISABLED_testTwoThreadsDict(self):
+  @test_util.run_deprecated_v1
+  def testTwoThreadsDict(self):
     self._testTwoThreadsHelper(use_dict=True)
 
+  @test_util.run_deprecated_v1
   def testMismatchedDictKeys(self):
     with self.assertRaisesRegexp(ValueError, "must have the same keys"):
       inp.batch_join(
@@ -1068,7 +1126,8 @@ class BatchJoinTest(test_lib.TestCase):
           }],
           batch_size=8)
 
-  def DISABLED_testTwoThreadsDynamicPad(self):
+  @test_util.run_deprecated_v1
+  def testTwoThreadsDynamicPad(self):
     with self.cached_session() as sess:
       # Two threads, the first generates (0..69, ["a"] * 1..70).
       num_a = 70
@@ -1109,7 +1168,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertEqual(2, len(results))
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[1]), batch_size)
@@ -1128,8 +1187,10 @@ class BatchJoinTest(test_lib.TestCase):
         self.assertAllEqual([99] * len(which_b),
                             [results[0][i] for i in which_b])
 
-      # Some minimum level of mixing of the results of both threads.
-      self.assertGreater(saw_both, 1)
+      # We'd like to see some minimum level of mixing of the results of both
+      # threads, but we can't rely on fair thread scheduling, so we just log.
+      # self.assertGreater(saw_both, 1)
+      tf_logging.info("testTwoThreadsDynamicPad saw both count: %s", saw_both)
 
       # Verify the order of results from "a" were preserved.
       self.assertAllEqual(  # tiled "a" with counter + 1
@@ -1139,11 +1200,12 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
-  def DISABLED_testTwoThreadsSmallerBatch(self):
+  @test_util.run_deprecated_v1
+  def testTwoThreadsSmallerBatch(self):
     with self.cached_session() as sess:
       extra_elements = 2
       # Two threads, the first generates (0..69, "a").
@@ -1192,7 +1254,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[2]), batch_size)
@@ -1212,7 +1274,7 @@ class BatchJoinTest(test_lib.TestCase):
                             [results[0][i] for i in which_b])
 
       # Reached the final batch with 2 * extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       tf_logging.info("Last Batch: %s", results[0])
       self.assertEqual(len(results[0]), 2 * extra_elements)
       self.assertEqual(len(results[2]), 2 * extra_elements)
@@ -1229,8 +1291,10 @@ class BatchJoinTest(test_lib.TestCase):
       all_a.extend([results[0][i] for i in which_a])
       seen_b += len(which_b)
 
-      # Some minimum level of mixing of the results of both threads.
-      self.assertGreater(saw_both, 1)
+      # We'd like to see some minimum level of mixing of the results of both
+      # threads, but we can't rely on fair thread scheduling, so we just log.
+      # self.assertGreater(saw_both, 1)
+      tf_logging.info("testTwoThreadsSmallerBatch saw both count: %s", saw_both)
 
       # Verify the order of results from "a" were preserved.
       self.assertAllEqual(all_a, np.arange(num_a))
@@ -1238,11 +1302,12 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
-  def DISABLED_testTwoThreadsDynamicPadSmallerBatch(self):
+  @test_util.run_deprecated_v1
+  def testTwoThreadsDynamicPadSmallerBatch(self):
     with self.cached_session() as sess:
       extra_elements = 2
       # Two threads, the first generates (0..69, ["a"] * 1..70).
@@ -1285,7 +1350,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[1]), batch_size)
@@ -1305,7 +1370,7 @@ class BatchJoinTest(test_lib.TestCase):
                             [results[0][i] for i in which_b])
 
       # Reached the final batch with 2 * extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       tf_logging.info("Last Batch: %s", results[0])
       self.assertEqual(len(results[0]), 2 * extra_elements)
       self.assertEqual(len(results[1]), 2 * extra_elements)
@@ -1322,8 +1387,11 @@ class BatchJoinTest(test_lib.TestCase):
       all_a.extend([results[0][i] for i in which_a])
       seen_b += len(which_b)
 
-      # Some minimum level of mixing of the results of both threads.
-      self.assertGreater(saw_both, 1)
+      # We'd like to see some minimum level of mixing of the results of both
+      # threads, but we can't rely on fair thread scheduling, so we just log.
+      # self.assertGreater(saw_both, 1)
+      tf_logging.info("testTwoThreadsDynamicPadSmallerBatch saw both count: %s",
+                      saw_both)
 
       # Verify the order of results from "a" were preserved.
       self.assertAllEqual(  # tiled "a" with counter + 1
@@ -1333,10 +1401,11 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       batch_size = 10
@@ -1359,12 +1428,14 @@ class BatchJoinTest(test_lib.TestCase):
           "s: 'SHARED_NAME_XYZ'",
           batched[0].op.inputs[0].op.node_def.attr["shared_name"])
 
+  @test_util.run_deprecated_v1
   def testCannotInferRankError(self):
     with self.cached_session():
       x = array_ops.placeholder(dtype=dtypes.int64)
       with self.assertRaisesRegexp(ValueError, "Cannot infer Tensor's rank"):
         inp.batch_join([[x]], batch_size=2)
 
+  @test_util.run_deprecated_v1
   def testSingleElementDict(self):
     x = inp.batch_join([{"c": [12, 12]}], batch_size=8)
     self.assertAllEqual((8, 2), x["c"].get_shape().as_list())
@@ -1396,7 +1467,7 @@ class BatchJoinTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual(
             [0] * batch_size,
             np.mod(results[0], 2),)
@@ -1407,28 +1478,35 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
+  @test_util.run_deprecated_v1
   def testSingleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(1, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testMultipleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(5, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepInputVector(self):
     # Can't have vector `keep_input` with `enqueue_many=False`.
     with self.assertRaisesRegexp(ValueError, "`keep_input` cannot be a vector"):
@@ -1450,6 +1528,7 @@ class BatchJoinTest(test_lib.TestCase):
                            batch_size=1,
                            enqueue_many=True)
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -1457,6 +1536,7 @@ class BatchJoinTest(test_lib.TestCase):
     batched = inp.maybe_batch_join([[sparse]], keep_input=True, batch_size=2)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -1465,6 +1545,7 @@ class BatchJoinTest(test_lib.TestCase):
         [[sparse]], keep_input=True, batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueManyPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0], [0]], values=[1.0, 2.0], dense_shape=[2])
@@ -1473,6 +1554,7 @@ class BatchJoinTest(test_lib.TestCase):
         [[sparse]], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1482,6 +1564,7 @@ class BatchJoinTest(test_lib.TestCase):
     batched = inp.maybe_batch_join([[sparse]], keep_input=True, batch_size=2)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1492,6 +1575,7 @@ class BatchJoinTest(test_lib.TestCase):
         [[sparse]], keep_input=True, batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1502,6 +1586,7 @@ class BatchJoinTest(test_lib.TestCase):
         [[sparse]], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchCorrectValues(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0, 1], [0, 2], [1, 0], [1, 3]],
@@ -1515,7 +1600,7 @@ class BatchJoinTest(test_lib.TestCase):
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
-      batched_np = batched.eval()
+      batched_np = self.evaluate(batched)
 
       coord.request_stop()
       for thread in threads:
@@ -1565,7 +1650,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
         self.assertAllEqual(
@@ -1583,16 +1668,19 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThread(self):
     self._testOneThreadHelper(use_dict=False)
 
+  @test_util.run_deprecated_v1
   def testOneThreadDict(self):
     self._testOneThreadHelper(use_dict=True)
 
+  @test_util.run_deprecated_v1
   def testOneThreadSmallerBatch(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -1620,7 +1708,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for _ in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
         self.assertAllEqual(
@@ -1631,7 +1719,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertAllEqual(results[1].dense_shape, [extra_elements, 1])
       self.assertAllEqual(results[2], [b"string"] * extra_elements)
       all_counts.extend(results[0])
@@ -1645,10 +1733,11 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testManyThreads(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -1673,7 +1762,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
@@ -1692,10 +1781,11 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testManyThreadsSmallerBatch(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -1723,7 +1813,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
@@ -1735,7 +1825,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertAllEqual(results[0].shape, [extra_elements])
       self.assertAllEqual(results[1].dense_shape, [extra_elements, 1])
       self.assertAllEqual(results[2], [b"string"] * extra_elements)
@@ -1750,10 +1840,11 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       batch_size = 10
@@ -1803,35 +1894,42 @@ class ShuffleBatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual([0] * batch_size, np.mod(results[0], 2))
         self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2))
         self.assertAllEqual([b"string"] * batch_size, results[2])
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
+  @test_util.run_deprecated_v1
   def testSingleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(1, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testMultipleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(5, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepInputVector(self):
     # Can't have vector `keep_input` with `enqueue_many=False`.
     with self.assertRaisesRegexp(ValueError, "`keep_input` cannot be a vector"):
@@ -1850,6 +1948,7 @@ class ShuffleBatchTest(test_lib.TestCase):
                               keep_input=array_ops.placeholder(dtypes.bool),
                               enqueue_many=True)
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -1857,6 +1956,7 @@ class ShuffleBatchTest(test_lib.TestCase):
     batched = inp.maybe_shuffle_batch([sparse], 2, 10, 1, True)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -1865,6 +1965,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         [sparse], 2, 10, 1, True, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueManyPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0], [0]], values=[1.0, 2.0], dense_shape=[2])
@@ -1873,6 +1974,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         [sparse], 2, 10, 1, [True, False], enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1882,6 +1984,7 @@ class ShuffleBatchTest(test_lib.TestCase):
     batched = inp.maybe_shuffle_batch([sparse], 2, 10, 1, True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1892,6 +1995,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         [sparse], 2, 10, 1, True, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1976,7 +2080,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(3, len(results))
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[2]), batch_size)
@@ -2006,16 +2110,19 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testTwoThreads(self):
     self._testTwoThreadsHelper(use_dict=False)
 
+  @test_util.run_deprecated_v1
   def testTwoThreadsDict(self):
     self._testTwoThreadsHelper(use_dict=True)
 
+  @test_util.run_deprecated_v1
   def testTwoThreadsSmallerBatch(self):
     with self.cached_session() as sess:
       # Two threads, the first generates (0..26, "a").
@@ -2068,7 +2175,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[2]), batch_size)
@@ -2088,7 +2195,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
                             [results[0][i] for i in which_b])
 
       # Reached end with 2 * extra_elements left
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertEqual(len(results[0]), 2 * extra_elements)
       self.assertAllEqual(results[1].dense_shape, [2 * extra_elements, 1])
       self.assertEqual(len(results[2]), 2 * extra_elements)
@@ -2115,10 +2222,11 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testMismatchedDictKeys(self):
     with self.assertRaisesRegexp(ValueError, "must have the same keys"):
       inp.shuffle_batch_join(
@@ -2136,6 +2244,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
           min_after_dequeue=16,
           seed=223607)
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       batch_size = 10
@@ -2189,35 +2298,42 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual([0] * batch_size, np.mod(results[0], 2))
         self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2))
         self.assertAllEqual([b"string"] * batch_size, results[2])
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
+  @test_util.run_deprecated_v1
   def testSingleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(1, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testMultipleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(5, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepInputVector(self):
     # Can't have vector `keep_input` with `enqueue_many=False`.
     with self.assertRaisesRegexp(ValueError, "`keep_input` cannot be a vector"):
@@ -2239,6 +2355,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
           keep_input=array_ops.placeholder(dtypes.bool),
           enqueue_many=True)
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -2246,6 +2363,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
     batched = inp.maybe_shuffle_batch_join([[sparse]], 2, 10, 1, True)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -2254,6 +2372,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
         [[sparse]], 2, 10, 1, True, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueManyPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0], [0]], values=[1.0, 2.0], dense_shape=[2])
@@ -2262,6 +2381,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
         [[sparse]], 2, 10, 1, [True, False], enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -2271,6 +2391,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
     batched = inp.maybe_shuffle_batch_join([[sparse]], 2, 10, 1, True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -2281,6 +2402,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
         [[sparse]], 2, 10, 1, True, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 29b546532193320961393ccf00fd3b190802c11a..c52e89db1f47eb303b7160cef77c01bcb46aebba 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -100,7 +100,7 @@ def exponential_decay(learning_rate,
   return decayed_lr
 
 
-@tf_export(v1=["train.piecewise_constant"])
+@tf_export(v1=["train.piecewise_constant_decay", "train.piecewise_constant"])
 def piecewise_constant(x, boundaries, values, name=None):
   """Piecewise constant from boundaries and interval values.
 
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py
index 03a32f6ca099a4b02de950d7e4dde6f88944695d..1029d4cea8f67d0e8614983ff106ccc57ccb9064 100644
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@@ -61,24 +61,24 @@ class LRDecayTest(test_util.TensorFlowTestCase):
       self.evaluate(step.assign(100))
       self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testVariables(self):
-    with self.cached_session():
-      step = variables.VariableV1(1)
-      assign_1 = step.assign(1)
-      assign_2 = step.assign(2)
-      assign_100 = step.assign(100)
-      decayed_lr = learning_rate_decay.exponential_decay(.1, step, 3, 0.96,
-                                                         staircase=True)
-      variables.global_variables_initializer().run()
-      # No change to learning rate
-      assign_1.op.run()
-      self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
-      assign_2.op.run()
-      self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
-      # Decayed learning rate
-      assign_100.op.run()
-      expected = .1 * 0.96 ** (100 // 3)
-      self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+    step = variables.VariableV1(1)
+    assign_1 = step.assign(1)
+    assign_2 = step.assign(2)
+    assign_100 = step.assign(100)
+    decayed_lr = learning_rate_decay.exponential_decay(
+        .1, step, 3, 0.96, staircase=True)
+    self.evaluate(variables.global_variables_initializer())
+    # No change to learning rate
+    self.evaluate(assign_1.op)
+    self.assertAllClose(self.evaluate(decayed_lr), .1, 1e-6)
+    self.evaluate(assign_2.op)
+    self.assertAllClose(self.evaluate(decayed_lr), .1, 1e-6)
+    # Decayed learning rate
+    self.evaluate(assign_100.op)
+    expected = .1 * 0.96**(100 // 3)
+    self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6)
 
   @test_util.run_in_graph_and_eager_modes
   def testPiecewiseConstant(self):
@@ -101,6 +101,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
     self.assertAllClose(self.evaluate(decayed_lr), 0.001, 1e-6)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testPiecewiseConstantEdgeCases(self):
     x_int = resource_variable_ops.ResourceVariable(
         0, dtype=variables.dtypes.int32)
diff --git a/tensorflow/python/training/learning_rate_decay_v2.py b/tensorflow/python/training/learning_rate_decay_v2.py
index 9c5e144be6b0f1aa70d58ab90850ca18e2c90d57..eb69feb17d3983ddb494cdf63ae30edee7062915 100644
--- a/tensorflow/python/training/learning_rate_decay_v2.py
+++ b/tensorflow/python/training/learning_rate_decay_v2.py
@@ -117,7 +117,7 @@ def exponential_decay(learning_rate,
                            decay_rate, staircase, name)
 
 
-@tf_export("train.piecewise_constant", v1=[])
+@tf_export("train.piecewise_constant_decay", v1=[])
 def piecewise_constant(x, boundaries, values, name=None):
   """Piecewise constant from boundaries and interval values.
 
diff --git a/tensorflow/python/training/learning_rate_decay_v2_test.py b/tensorflow/python/training/learning_rate_decay_v2_test.py
index b2ac93f06fe3c3e9ada6d0ef6254078d3f444975..cb96773e299a37db1d5792c84d6a837147e09d04 100644
--- a/tensorflow/python/training/learning_rate_decay_v2_test.py
+++ b/tensorflow/python/training/learning_rate_decay_v2_test.py
@@ -61,24 +61,24 @@ class LRDecayTestV2(test_util.TensorFlowTestCase):
       self.evaluate(step.assign(100))
       self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testVariables(self):
-    with self.cached_session():
-      step = variables.Variable(1)
-      assign_1 = step.assign(1)
-      assign_2 = step.assign(2)
-      assign_100 = step.assign(100)
-      decayed_lr = learning_rate_decay_v2.exponential_decay(.1, step, 3, 0.96,
-                                                            staircase=True)
-      variables.global_variables_initializer().run()
-      # No change to learning rate
-      assign_1.op.run()
-      self.assertAllClose(decayed_lr().eval(), .1, 1e-6)
-      assign_2.op.run()
-      self.assertAllClose(decayed_lr().eval(), .1, 1e-6)
-      # Decayed learning rate
-      assign_100.op.run()
-      expected = .1 * 0.96 ** (100 // 3)
-      self.assertAllClose(decayed_lr().eval(), expected, 1e-6)
+    step = variables.Variable(1)
+    assign_1 = step.assign(1)
+    assign_2 = step.assign(2)
+    assign_100 = step.assign(100)
+    decayed_lr = learning_rate_decay_v2.exponential_decay(
+        .1, step, 3, 0.96, staircase=True)
+    self.evaluate(variables.global_variables_initializer())
+    # No change to learning rate
+    self.evaluate(assign_1.op)
+    self.assertAllClose(self.evaluate(decayed_lr()), .1, 1e-6)
+    self.evaluate(assign_2.op)
+    self.assertAllClose(self.evaluate(decayed_lr()), .1, 1e-6)
+    # Decayed learning rate
+    self.evaluate(assign_100.op)
+    expected = .1 * 0.96**(100 // 3)
+    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
 
   @test_util.run_in_graph_and_eager_modes
   def testPiecewiseConstant(self):
diff --git a/tensorflow/python/training/localhost_cluster_performance_test.py b/tensorflow/python/training/localhost_cluster_performance_test.py
index 7c097b943d05cd1a049886af6ef1d018d7b2c9ab..c4cbc8a55dc5d40b9aeae2fed400b1d29d6c7499 100644
--- a/tensorflow/python/training/localhost_cluster_performance_test.py
+++ b/tensorflow/python/training/localhost_cluster_performance_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -34,6 +35,7 @@ from tensorflow.python.training import device_setter
 
 class CreateLocalClusterTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testCreateLocalCluster(self):
     workers, _ = test.create_local_cluster(num_workers=2, num_ps=2)
     worker_sessions = [session_lib.Session(w.target) for w in workers]
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index cb3ec6f053e2e7f5aa80152ed233c8fbb6920be0..f3bc83bbfa1fe6a225b1302655a187bca52c995c 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -25,7 +25,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.MomentumOptimizer")
+@tf_export(v1=["train.MomentumOptimizer"])
 class MomentumOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Momentum algorithm.
 
@@ -59,6 +59,11 @@ class MomentumOptimizer(optimizer.Optimizer):
         This implementation always computes gradients at the value of the
         variable(s) passed to the optimizer. Using Nesterov Momentum makes the
         variable(s) track the values called `theta_t + mu*v_t` in the paper.
+        This implementation is an approximation of the original formula, valid 
+        for high values of momentum. It will compute the "adjusted gradient" 
+        in NAG by assuming that the new gradient will be estimated by the 
+        current average gradient plus the product of momentum and the change 
+        in the average gradient.
 
     @compatibility(eager)
     When eager execution is enabled, `learning_rate` and `momentum` can each be
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 8a21c39d32344d0027793f1eac3d4f9f43a8d920..ba155fa6c646df5935feddbacffb2a4f9763c90a 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -160,6 +160,7 @@ class MomentumOptimizerTest(test.TestCase):
       self.assertStartsWith(optimizer_variables[1].name, "var3")
       self.assertEquals(2, len(optimizer_variables))
 
+  @test_util.run_deprecated_v1
   def testNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -183,9 +184,10 @@ class MomentumOptimizerTest(test.TestCase):
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
                                                                     accum1_np,
                                                                     3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparseNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -224,8 +226,8 @@ class MomentumOptimizerTest(test.TestCase):
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
                                                                     accum1_np,
                                                                     3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
@@ -280,6 +282,7 @@ class MomentumOptimizerTest(test.TestCase):
     self.evaluate(sgd_op)
     self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
 
+  @test_util.run_deprecated_v1
   def testTensorLearningRateAndMomentum(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -303,37 +306,43 @@ class MomentumOptimizerTest(test.TestCase):
         self.assertFalse(slot1 in variables.trainable_variables())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
   def _dbParamsMom01(self):
     """Return dist-belief momentum values.
@@ -434,6 +443,7 @@ class MomentumOptimizerTest(test.TestCase):
     # pylint: enable=line-too-long
     return db_grad, db_out
 
+  @test_util.run_deprecated_v1
   def testLikeDistBeliefMom01(self):
     with self.cached_session():
       db_grad, db_out = self._dbParamsMom01()
@@ -445,8 +455,9 @@ class MomentumOptimizerTest(test.TestCase):
       variables.global_variables_initializer().run()
       for i in xrange(num_samples):
         mom_update.run(feed_dict={grads0: db_grad[i]})
-        self.assertAllClose(np.array(db_out[i]), var0.eval())
+        self.assertAllClose(np.array(db_out[i]), self.evaluate(var0))
 
+  @test_util.run_deprecated_v1
   def testSparse(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -476,46 +487,59 @@ class MomentumOptimizerTest(test.TestCase):
         self.assertEquals(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
-        self.assertAllClose([0, 0], var0.eval()[0])
-        self.assertAllClose([0, 0], var0.eval()[1])
-        self.assertAllClose([1, 1], var1.eval()[2])
+        self.assertAllClose([0, 0], self.evaluate(var0)[0])
+        self.assertAllClose([0, 0], self.evaluate(var0)[1])
+        self.assertAllClose([1, 1], self.evaluate(var1)[2])
 
         # Step 1: the momentum accumulators are 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0, 0]), slot0.eval()[0])
-        self.assertAllCloseAccordingToType(np.array([.1, .1]), slot0.eval()[1])
         self.assertAllCloseAccordingToType(
-            np.array([.01, .01]), slot1.eval()[2])
+            np.array([0, 0]),
+            self.evaluate(slot0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([.1, .1]),
+            self.evaluate(slot0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([.01, .01]),
+            self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(np.array([0, 0]), var0.eval()[0])
         self.assertAllCloseAccordingToType(
-            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]), var0.eval()[1])
+            np.array([0, 0]),
+            self.evaluate(var0)[0])
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]), var1.eval()[2])
+            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]),
+            self.evaluate(var0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]),
+            self.evaluate(var1)[2])
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllClose(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()[1])
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0)[1])
         self.assertAllCloseAccordingToType(
             np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
-            slot1.eval()[2])
+            self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
-        self.assertAllClose(np.array([0, 0]), var0.eval()[0])
+        self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
         self.assertAllCloseAccordingToType(
             np.array([
-                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), -(0.1 * 2.0) - (
-                    (0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval()[1])
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]),
+            self.evaluate(var0)[1])
         self.assertAllCloseAccordingToType(
             np.array([
-                0.98 - ((0.9 * 0.01 + 0.01) * 2.0), 0.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval()[2])
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]),
+            self.evaluate(var1)[2])
 
+  @test_util.run_deprecated_v1
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -538,37 +562,43 @@ class MomentumOptimizerTest(test.TestCase):
         self.assertEquals(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update1.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the second momentum accumulators contain the previous update.
         mom_update2.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 82f0e3be52048c35ded6a8fe30a6c6f8723efe8b..6a7d27df5c322bfad37cf1ef207f66353d636111 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -54,7 +54,7 @@ _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError)
 USE_DEFAULT = object()
 
 
-@tf_export('train.Scaffold')
+@tf_export(v1=['train.Scaffold'])
 class Scaffold(object):
   """Structure to create or gather pieces commonly needed to train a model.
 
@@ -195,8 +195,12 @@ class Scaffold(object):
           default_ready_op)
     if self._ready_for_local_init_op is None:
       def default_ready_for_local_init_op():
-        return variables.report_uninitialized_variables(
-            variables.global_variables())
+        return array_ops.concat([
+            variables.report_uninitialized_variables(
+                variables.global_variables()),
+            resources.report_uninitialized_resources(
+                resources.shared_resources())
+        ], 0)
       self._ready_for_local_init_op = Scaffold.get_or_default(
           'ready_for_local_init_op', ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
           default_ready_for_local_init_op)
@@ -342,7 +346,7 @@ def _create_monitored_session_with_worker_context(worker_context,  # pylint: dis
       stop_grace_period_secs=stop_grace_period_secs)
 
 
-@tf_export('train.MonitoredTrainingSession')
+@tf_export(v1=['train.MonitoredTrainingSession'])
 def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              is_chief=True,
                              checkpoint_dir=None,
@@ -504,7 +508,8 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
       stop_grace_period_secs=stop_grace_period_secs)
 
 
-@tf_export('train.SessionCreator')
+@tf_export(v1=['train.SessionCreator'])
+@six.add_metaclass(abc.ABCMeta)
 class SessionCreator(object):
   """A factory for tf.Session."""
 
@@ -514,7 +519,7 @@ class SessionCreator(object):
         'create_session is not implemented for {}.'.format(self))
 
 
-@tf_export('train.ChiefSessionCreator')
+@tf_export(v1=['train.ChiefSessionCreator'])
 class ChiefSessionCreator(SessionCreator):
   """Creates a tf.Session for a chief."""
 
@@ -566,7 +571,7 @@ class ChiefSessionCreator(SessionCreator):
         init_fn=self._scaffold.init_fn)
 
 
-@tf_export('train.WorkerSessionCreator')
+@tf_export(v1=['train.WorkerSessionCreator'])
 class WorkerSessionCreator(SessionCreator):
   """Creates a tf.Session for a worker."""
 
@@ -835,10 +840,18 @@ class _MonitoredSession(object):
     return self._coordinated_creator.tf_sess is None
 
   def _tf_sess(self):
+    """Return underlying tf.Session object.
+
+    Warning: accessing the returned object in user code is likely to cause races
+    or "flaky tests".
+
+    Returns:
+      A tf.Session object.
+    """
     return self._coordinated_creator.tf_sess
 
 
-@tf_export('train.MonitoredSession')
+@tf_export(v1=['train.MonitoredSession'])
 class MonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, recovery and hooks.
 
@@ -921,7 +934,7 @@ class MonitoredSession(_MonitoredSession):
         stop_grace_period_secs=stop_grace_period_secs)
 
 
-@tf_export('train.SingularMonitoredSession')
+@tf_export(v1=['train.SingularMonitoredSession'])
 class SingularMonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, restoring, and hooks.
 
@@ -1067,8 +1080,10 @@ class _WrappedSession(object):
     if self._sess:
       try:
         self._sess.close()
-      except _PREEMPTION_ERRORS:
-        pass
+      except _PREEMPTION_ERRORS as e:
+        logging.warning('An error occurred when attempting to close the '
+                        'session. This may be due to a preemption in a '
+                        'connected worker or parameter server. Error: %s', e)
       finally:
         self._sess = None
 
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index c870d99de9e3daed2c167455e6ee6ab5efa33a7b..99ee9ea7e2e4d32f9a24513d9c46f9de4fa2d797 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -382,6 +383,16 @@ class MonitoredTrainingSessionTest(test.TestCase):
         self.assertEqual(0, session.run(gstep))
 
 
+class MockExtended(object):
+
+  def __init__(self, between_graph, should_init, should_checkpoint,
+               should_save_summary):
+    self.experimental_between_graph = between_graph
+    self.experimental_should_init = should_init
+    self.should_checkpoint = should_checkpoint
+    self.should_save_summary = should_save_summary
+
+
 class MockStrategy(object):
 
   def __init__(self,
@@ -389,26 +400,8 @@ class MockStrategy(object):
                should_init=True,
                should_checkpoint=None,
                should_save_summary=None):
-    self._between_graph = between_graph
-    self._should_init = should_init
-    self._should_checkpoint = should_checkpoint
-    self._should_save_summary = should_save_summary
-
-  @property
-  def between_graph(self):
-    return self._between_graph
-
-  @property
-  def should_init(self):
-    return self._should_init
-
-  @property
-  def should_checkpoint(self):
-    return self._should_checkpoint
-
-  @property
-  def should_save_summary(self):
-    return self._should_save_summary
+    self.extended = MockExtended(between_graph, should_init, should_checkpoint,
+                                 should_save_summary)
 
 
 class MonitoredTrainingSessionWithDistributeCoordinatorTest(test.TestCase):
@@ -512,6 +505,7 @@ class StopAtNSession(monitored_session._WrappedSession):
 class WrappedSessionTest(test.TestCase):
   """_WrappedSession tests."""
 
+  @test_util.run_deprecated_v1
   def test_properties(self):
     with self.cached_session() as sess:
       constant_op.constant(0.0)
@@ -519,6 +513,7 @@ class WrappedSessionTest(test.TestCase):
       self.assertEquals(sess.graph, wrapped_sess.graph)
       self.assertEquals(sess.sess_str, wrapped_sess.sess_str)
 
+  @test_util.run_deprecated_v1
   def test_should_stop_on_close(self):
     with self.cached_session() as sess:
       wrapped_sess = monitored_session._WrappedSession(sess)
@@ -526,6 +521,7 @@ class WrappedSessionTest(test.TestCase):
       wrapped_sess.close()
       self.assertTrue(wrapped_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_should_stop_uses_check_stop(self):
     with self.cached_session() as sess:
       wrapped_sess = StopAtNSession(sess, 3)
@@ -534,6 +530,7 @@ class WrappedSessionTest(test.TestCase):
       self.assertFalse(wrapped_sess.should_stop())
       self.assertTrue(wrapped_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_should_stop_delegates_to_wrapped_session(self):
     with self.cached_session() as sess:
       wrapped_sess0 = StopAtNSession(sess, 4)
@@ -544,6 +541,7 @@ class WrappedSessionTest(test.TestCase):
       self.assertFalse(wrapped_sess1.should_stop())
       self.assertTrue(wrapped_sess1.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_close_twice(self):
     with self.cached_session() as sess:
       wrapped_sess = monitored_session._WrappedSession(sess)
@@ -552,6 +550,7 @@ class WrappedSessionTest(test.TestCase):
       wrapped_sess.close()
       self.assertTrue(wrapped_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_run(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -569,6 +568,7 @@ def busy_wait_for_coord_stop(coord):
 class CoordinatedSessionTest(test.TestCase):
   """_CoordinatedSession tests."""
 
+  @test_util.run_deprecated_v1
   def test_properties(self):
     with self.cached_session() as sess:
       constant_op.constant(0.0)
@@ -577,6 +577,7 @@ class CoordinatedSessionTest(test.TestCase):
       self.assertEquals(sess.graph, coord_sess.graph)
       self.assertEquals(sess.sess_str, coord_sess.sess_str)
 
+  @test_util.run_deprecated_v1
   def test_run(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -585,6 +586,7 @@ class CoordinatedSessionTest(test.TestCase):
       coord_sess = monitored_session._CoordinatedSession(sess, coord)
       self.assertEqual(42, coord_sess.run(v, feed_dict={c: 42}))
 
+  @test_util.run_deprecated_v1
   def test_should_stop_on_close(self):
     with self.cached_session() as sess:
       coord = coordinator.Coordinator()
@@ -593,6 +595,7 @@ class CoordinatedSessionTest(test.TestCase):
       coord_sess.close()
       self.assertTrue(coord_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_should_stop_on_coord_stop(self):
     with self.cached_session() as sess:
       coord = coordinator.Coordinator()
@@ -601,6 +604,7 @@ class CoordinatedSessionTest(test.TestCase):
       coord.request_stop()
       self.assertTrue(coord_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_dont_request_stop_on_exception_in_main_thread(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -615,6 +619,7 @@ class CoordinatedSessionTest(test.TestCase):
       self.assertFalse(coord.should_stop())
       self.assertFalse(coord_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_stop_threads_on_close_after_exception(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -662,6 +667,7 @@ class CoordinatedSessionTest(test.TestCase):
       self.assertTrue(coord.should_stop())
       self.assertTrue(coord_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_propagates_exception_trace(self):
     assertion = control_flow_ops.Assert(False, ['This should fail.'])
     with self.cached_session() as sess:
@@ -809,6 +815,7 @@ class RecoverableSessionTest(test.TestCase):
     def create_session(self):
       return self._sess
 
+  @test_util.run_deprecated_v1
   def test_properties(self):
     with self.cached_session() as sess:
       constant_op.constant(0.0)
@@ -817,6 +824,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertEquals(sess.graph, recoverable_sess.graph)
       self.assertEquals(sess.sess_str, recoverable_sess.sess_str)
 
+  @test_util.run_deprecated_v1
   def test_run(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -825,6 +833,7 @@ class RecoverableSessionTest(test.TestCase):
           self._SessionReturner(sess))
       self.assertEqual(51, recoverable_sess.run(v, feed_dict={c: 51}))
 
+  @test_util.run_deprecated_v1
   def test_recovery(self):
     with self.cached_session() as sess:
 
@@ -871,6 +880,7 @@ class RecoverableSessionTest(test.TestCase):
       with self.assertRaisesRegexp(IndexError, 'pop from empty list'):
         recoverable_sess.run(v, feed_dict={c: -12})
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_coordinator_exception(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -896,6 +906,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertFalse(session.should_stop())
       self.assertEqual(2, session_creator.number_of_sessions_created)
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_non_preemption_in_coordinator(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -925,6 +936,7 @@ class RecoverableSessionTest(test.TestCase):
       with self.assertRaises(errors_impl.UnknownError):
         session.close()
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_session_getting_stuck(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -949,6 +961,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertFalse(session.should_stop())
       self.assertEqual(2, session_creator.number_of_sessions_created)
 
+  @test_util.run_deprecated_v1
   def test_step_fn_recovery_from_coordinator_exception_when_run_hooks(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -979,6 +992,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertFalse(session.should_stop())
       self.assertEqual(2, session_creator.number_of_sessions_created)
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_non_preemption_in_coordinator_when_run_hooks(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1013,6 +1027,7 @@ class RecoverableSessionTest(test.TestCase):
       with self.assertRaises(errors_impl.UnknownError):
         session.close()
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_session_getting_stuck_when_run_hooks(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1057,6 +1072,7 @@ class RecoverableSessionTest(test.TestCase):
     # exception.
     return session
 
+  @test_util.run_deprecated_v1
   def test_step_fn_recovery_from_coordinator_exception_with_raw_session(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1089,6 +1105,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertFalse(session.should_stop())
       self.assertEqual(2, session_creator.number_of_sessions_created)
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_non_preemption_in_coordinator_with_raw_session(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1126,6 +1143,7 @@ class RecoverableSessionTest(test.TestCase):
       with self.assertRaises(errors_impl.UnknownError):
         session.close()
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_session_getting_stuck_with_raw_session(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1178,7 +1196,7 @@ class HookedSessionTest(test.TestCase):
       mock_run = FakeSession(sess)
       mon_sess = monitored_session._HookedSession(sess=mock_run, hooks=[])
       a_tensor = constant_op.constant([0], name='a_tensor')
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       output = mon_sess.run(fetches=a_tensor,
                             feed_dict='a_feed',
                             options='an_option',
@@ -1197,7 +1215,7 @@ class HookedSessionTest(test.TestCase):
       mon_sess = monitored_session._HookedSession(
           sess=sess, hooks=[mock_hook, mock_hook2])
       a_tensor = constant_op.constant([0], name='a_tensor')
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       mon_sess.run(a_tensor)
 
       for hook in [mock_hook, mock_hook2]:
@@ -1222,7 +1240,7 @@ class HookedSessionTest(test.TestCase):
       mon_sess = monitored_session._HookedSession(
           sess=sess, hooks=[mock_hook, mock_hook2])
       constant_op.constant([0], name='a_tensor')
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       mon_sess.run(fetches='a_tensor')
       self.assertFalse(mon_sess.should_stop())
@@ -1242,7 +1260,7 @@ class HookedSessionTest(test.TestCase):
       third_tensor = constant_op.constant([10], name='third_tensor')
       mock_hook.request = session_run_hook.SessionRunArgs([another_tensor])
       mock_hook2.request = session_run_hook.SessionRunArgs([third_tensor])
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       output = mon_sess.run(fetches=a_tensor)
       self.assertEqual(output, [0])
@@ -1262,7 +1280,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={b_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual(mon_sess.run(fetches=add_tensor), [15])
 
@@ -1280,7 +1298,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={b_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       feed_dict = {c_tensor: [20]}
       self.assertEqual(
@@ -1301,7 +1319,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={a_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       with self.assertRaisesRegexp(RuntimeError, 'Same tensor is fed'):
         mon_sess.run(fetches=add_tensor)
@@ -1319,7 +1337,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={b_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       with self.assertRaisesRegexp(RuntimeError, 'Same tensor is fed'):
         mon_sess.run(fetches=add_tensor, feed_dict={b_tensor: [10]})
@@ -1451,6 +1469,7 @@ class MonitoredSessionTest(test.TestCase):
   # This set of tests, verifies the supervised session behavior when exceptions
   # are raised next to the innermost session run() call.
 
+  @test_util.run_deprecated_v1
   def test_recovery(self):
     logdir = _test_dir(self.get_temp_dir(), 'test_recovery')
     with ops.Graph().as_default():
@@ -1803,6 +1822,7 @@ class MonitoredSessionTest(test.TestCase):
             isinstance(hook.run_metadata_list[0], config_pb2.RunMetadata))
         self.assertGreater(len(hook.run_metadata_list[0].partition_graphs), 0)
 
+  @test_util.run_deprecated_v1
   def test_with_statement_and_close(self):
     # Test case for https://github.com/tensorflow/tensorflow/issues/12224
     # where close() inside the with should have a better error message.
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 041266da3e4c95c024846b92c0ef4b57cf0a89d5..72670f0ca39f67b151abcb1813ede7ee36c6544b 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
@@ -36,9 +38,8 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
   The moving average of 'variable' updated with 'value' is:
     variable * decay + value * (1 - decay)
 
-  The returned Operation sets 'variable' to the newly computed moving average.
-
-  The new value of 'variable' can be set with the 'AssignSub' op as:
+  The returned Operation sets 'variable' to the newly computed moving average,
+  by performing this subtraction:
      variable -= (1 - decay) * (variable - value)
 
   Since variables that are initialized to a `0` value will be `0` biased,
@@ -50,7 +51,7 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
 
   The names of the debias shadow variables, by default, include both the scope
   they were created in and the scope of the variables they debias. They are also
-  given a uniqifying-suffix.
+  given a uniquifying-suffix.
 
   E.g.:
 
@@ -58,8 +59,8 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
     with tf.variable_scope('scope1'):
       with tf.variable_scope('scope2'):
         var = tf.get_variable('foo')
-        tf.assign_moving_average(var, 0.0, 1.0)
-        tf.assign_moving_average(var, 0.0, 0.9)
+        update_1 = tf.assign_moving_average(var, 0.0, 1.0)
+        update_2 = tf.assign_moving_average(var, 0.0, 0.9)
 
     # var.name: 'scope1/scope2/foo'
     # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
@@ -76,20 +77,33 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
     name: Optional name of the returned operation.
 
   Returns:
-    A reference to the input 'variable' tensor with the newly computed
-    moving average.
+    A tensor which if evaluated will compute and return the new moving average.
   """
+  def update_fn(v, value, decay=decay):
+    decay = ops.convert_to_tensor(1.0 - decay, name="decay")
+    if decay.dtype != v.dtype.base_dtype:
+      decay = math_ops.cast(decay, v.dtype.base_dtype)
+    if zero_debias:
+      update_delta = _zero_debias(v, value, decay)
+    else:
+      update_delta = (v - value) * decay
+    return state_ops.assign_sub(v, update_delta, name=scope)
+
   with ops.name_scope(name, "AssignMovingAvg",
                       [variable, value, decay]) as scope:
-    with ops.colocate_with(variable):
-      decay = ops.convert_to_tensor(1.0 - decay, name="decay")
-      if decay.dtype != variable.dtype.base_dtype:
-        decay = math_ops.cast(decay, variable.dtype.base_dtype)
-      if zero_debias:
-        update_delta = _zero_debias(variable, value, decay)
-      else:
-        update_delta = (variable - value) * decay
-      return state_ops.assign_sub(variable, update_delta, name=scope)
+    replica_context = distribution_strategy_context.get_replica_context()
+    if replica_context:
+      # In a replica context, we update variable using the mean of value across
+      # replicas.
+      def merge_fn(strategy, v, value):
+        value = strategy.extended.reduce_to(
+            ds_reduce_util.ReduceOp.MEAN, value, v)
+        return strategy.update(v, update_fn, value)
+
+      return replica_context.merge_call(merge_fn, args=(variable, value))
+    else:
+      strategy = distribution_strategy_context.get_cross_replica_context()
+      return strategy.update(variable, update_fn, value)
 
 
 def weighted_moving_average(value,
@@ -379,8 +393,6 @@ class ExponentialMovingAverage(object):
 
     Raises:
       TypeError: If the arguments are not an allowed type.
-      ValueError: If the moving average of one of the variables is already
-        being computed.
     """
     # TODO(touts): op_scope
     if var_list is None:
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index bb2fca66e3c1eed8f3143fa98fe0100a8eb71bbe..03bcde9c8498ed03d2eaf52c7f1e2d4211e0ddc6 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.training import saver as saver_lib
 
 class MovingAveragesTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAssignMovingAverageWithoutZeroDebias(self):
     with self.cached_session():
       var = variables.Variable([10.0, 11.0])
@@ -43,12 +44,13 @@ class MovingAveragesTest(test.TestCase):
       assign = moving_averages.assign_moving_average(
           var, val, decay, zero_debias=False)
       variables.global_variables_initializer().run()
-      self.assertAllClose([10.0, 11.0], var.eval())
+      self.assertAllClose([10.0, 11.0], self.evaluate(var))
       assign.op.run()
       self.assertAllClose(
           [10.0 * 0.25 + 1.0 * (1.0 - 0.25), 11.0 * 0.25 + 2.0 * (1.0 - 0.25)],
-          var.eval())
+          self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testAssignMovingAverage(self):
     with self.cached_session():
       var = variables.Variable([0.0, 0.0])
@@ -56,12 +58,13 @@ class MovingAveragesTest(test.TestCase):
       decay = 0.25
       assign = moving_averages.assign_moving_average(var, val, decay)
       variables.global_variables_initializer().run()
-      self.assertAllClose([0.0, 0.0], var.eval())
+      self.assertAllClose([0.0, 0.0], self.evaluate(var))
       assign.op.run()
-      self.assertAllClose([
-          1.0 * (1.0 - 0.25) / (1 - 0.25), 2.0 * (1.0 - 0.25) / (1 - 0.25)
-      ], var.eval())
+      self.assertAllClose(
+          [1.0 * (1.0 - 0.25) / (1 - 0.25), 2.0 * (1.0 - 0.25) / (1 - 0.25)],
+          self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testAssignMovingAverageNewNamingMultipleCalls(self):
     with variable_scope.variable_scope("scope1") as vs1:
       with variable_scope.variable_scope("scope2"):
@@ -76,6 +79,7 @@ class MovingAveragesTest(test.TestCase):
     actual_names = [v.name for v in vs1.global_variables()]
     self.assertSetEqual(set(expected_names), set(actual_names))
 
+  @test_util.run_deprecated_v1
   def testAssignMovingAverageNewNamingMultipleCallsWithReuse(self):
     with variable_scope.variable_scope("scope1") as vs1:
       var = variable_scope.get_variable("Var", shape=[])
@@ -86,6 +90,7 @@ class MovingAveragesTest(test.TestCase):
       moving_averages.assign_moving_average(var, 0.0, 0.99)
       moving_averages.assign_moving_average(var, 0.0, 0.99)
 
+  @test_util.run_deprecated_v1
   def testWeightedMovingAverage(self):
     with self.cached_session() as sess:
       decay = 0.5
@@ -111,6 +116,7 @@ class MovingAveragesTest(test.TestCase):
       denominator_2 = denominator_1 * decay + weight_2 * (1.0 - decay)
       self.assertAllClose(numerator_2 / denominator_2, wma_array)
 
+  @test_util.run_deprecated_v1
   def testWeightedMovingAverageBfloat16(self):
     bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
     with self.cached_session() as sess:
@@ -179,66 +185,72 @@ class ExponentialMovingAverageTest(test.TestCase):
     self.assertEqual("add/ExponentialMovingAverage:0", avg2.name)
 
     # Check initial values.
-    self.assertAllClose(tens, var0.eval())
-    self.assertAllClose(thirties, var1.eval())
-    self.assertAllClose(_Repeat(10.0 + 30.0, dim), tensor2.eval())
+    self.assertAllClose(tens, self.evaluate(var0))
+    self.assertAllClose(thirties, self.evaluate(var1))
+    self.assertAllClose(_Repeat(10.0 + 30.0, dim), self.evaluate(tensor2))
 
     # Check that averages are initialized correctly.
-    self.assertAllClose(tens, avg0.eval())
-    self.assertAllClose(thirties, avg1.eval())
+    self.assertAllClose(tens, self.evaluate(avg0))
+    self.assertAllClose(thirties, self.evaluate(avg1))
     # Note that averages of Tensor's initialize to zeros_like since no value
     # of the Tensor is known because the Op has not been run (yet).
-    self.assertAllClose(_Repeat(0.0, dim), avg2.eval())
+    self.assertAllClose(_Repeat(0.0, dim), self.evaluate(avg2))
 
     # Update the averages and check.
     update.run()
     dk = actual_decay
 
     expected = _Repeat(10.0 * dk + 10.0 * (1 - dk), dim)
-    self.assertAllClose(expected, avg0.eval())
+    self.assertAllClose(expected, self.evaluate(avg0))
     expected = _Repeat(30.0 * dk + 30.0 * (1 - dk), dim)
-    self.assertAllClose(expected, avg1.eval())
+    self.assertAllClose(expected, self.evaluate(avg1))
     expected = _Repeat(0.0 * dk + (10.0 + 30.0) * (1 - dk) / _Scale(dk, 1), dim)
-    self.assertAllClose(expected, avg2.eval())
+    self.assertAllClose(expected, self.evaluate(avg2))
 
     # Again, update the averages and check.
     update.run()
     expected = _Repeat((10.0 * dk + 10.0 * (1 - dk)) * dk + 10.0 * (1 - dk),
                        dim)
-    self.assertAllClose(expected, avg0.eval())
+    self.assertAllClose(expected, self.evaluate(avg0))
     expected = _Repeat((30.0 * dk + 30.0 * (1 - dk)) * dk + 30.0 * (1 - dk),
                        dim)
-    self.assertAllClose(expected, avg1.eval())
+    self.assertAllClose(expected, self.evaluate(avg1))
     expected = _Repeat(((0.0 * dk + (10.0 + 30.0) * (1 - dk)) * dk +
                         (10.0 + 30.0) * (1 - dk)) / _Scale(dk, 2), dim)
-    self.assertAllClose(expected, avg2.eval())
+    self.assertAllClose(expected, self.evaluate(avg2))
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Scalar(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25)
       self._CheckDecay(ema, actual_decay=0.25, dim=1)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Scalar_Debias(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.25, dim=1)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Vector(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25)
       self._CheckDecay(ema, actual_decay=0.25, dim=5)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Vector_Debias(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.25, dim=5)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Scalar(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
       ema = moving_averages.ExponentialMovingAverage(0.25, num_updates=1)
       self._CheckDecay(ema, actual_decay=0.181818, dim=1)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Scalar_Debias(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
@@ -246,12 +258,14 @@ class ExponentialMovingAverageTest(test.TestCase):
           0.25, num_updates=1, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.181818, dim=1)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Vector(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
       ema = moving_averages.ExponentialMovingAverage(0.25, num_updates=1)
       self._CheckDecay(ema, actual_decay=0.181818, dim=5)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Vector_Debias(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
@@ -259,6 +273,7 @@ class ExponentialMovingAverageTest(test.TestCase):
           0.25, num_updates=1, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.181818, dim=5)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesWithControlDeps(self):
     with self.cached_session() as sess:
       v0 = variables.Variable(0, name="v0")
@@ -274,16 +289,17 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual([], v1_avg.value().op.control_inputs)
       self.assertEqual([], v1_avg.value().op.control_inputs)
       # We should be able to initialize v1_avg before v0.
-      sess.run(v1_avg.initializer)
-      sess.run(v0.initializer)
-      self.assertEqual([10.0], sess.run(v1_avg))
+      self.evaluate(v1_avg.initializer)
+      self.evaluate(v0.initializer)
+      self.assertEqual([10.0], self.evaluate(v1_avg))
       # running ema_op should add to v0 (in addition to updating v1_avg)
-      sess.run(assign_to_v1)
-      sess.run(ema_op)
-      self.assertEqual(1, sess.run(v0))
-      self.assertEqual([17.5], sess.run(v1_avg))
+      self.evaluate(assign_to_v1)
+      self.evaluate(ema_op)
+      self.assertEqual(1, self.evaluate(v0))
+      self.assertEqual([17.5], self.evaluate(v1_avg))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testBasicEager(self):
     v0 = variables.Variable(1.0)
     v1 = variables.Variable(2.0)
@@ -339,9 +355,11 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual(ema.average(v1).op.name, ema.average_name(v1))
       self.assertEqual(ema.average(tensor2).op.name, ema.average_name(tensor2))
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNames(self):
     self.averageVariablesNamesHelper(zero_debias=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNamesNoDebias(self):
     self.averageVariablesNamesHelper(zero_debias=False)
 
@@ -387,12 +405,15 @@ class ExponentialMovingAverageTest(test.TestCase):
         self.assertEqual(
             ema.average(tensor2).op.name, ema.average_name(tensor2))
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNamesRespectScope(self):
     self.averageVariablesNamesRespectScopeHelper(zero_debias=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNamesRespectScopeNoDebias(self):
     self.averageVariablesNamesRespectScopeHelper(zero_debias=False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSubsetAverageVariablesNames(self):
     with self.cached_session():
       v0 = variables.Variable(10.0, name="v0")
@@ -421,6 +442,7 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual(ema.average(v1).op.name, ema.average_name(v1))
       self.assertEqual(ema.average(tensor2).op.name, ema.average_name(tensor2))
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesDeviceAssignment(self):
     with ops.device("/job:dev_v0"):
       v0 = variables.Variable(10.0, name="v0")
@@ -451,6 +473,7 @@ class ExponentialMovingAverageTest(test.TestCase):
       _ = saver_lib.import_meta_graph(meta_graph)
     return graph_copy
 
+  @test_util.run_deprecated_v1
   def testImportedGraphVariablesToRestore(self):
     g = ops.Graph()
     with g.as_default():
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 47034919e123a07542912f31e06725025c62b9a1..d9ebdcad1f3c83c0e0d4b8496d601fce2669fbff 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -22,6 +22,11 @@ from __future__ import print_function
 
 import abc
 
+import six
+
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -34,8 +39,6 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
@@ -84,6 +87,7 @@ def _var_key(var):
   return var._unique_id  # pylint: disable=protected-access
 
 
+@six.add_metaclass(abc.ABCMeta)
 class _OptimizableVariable(object):
   """Interface for abstracting over variables in the optimizers."""
 
@@ -197,8 +201,7 @@ def _get_processor(v):
       return _TensorProcessor(v)
     else:
       return _DenseResourceVariableProcessor(v)
-  if isinstance(
-      v, resource_variable_ops.ResourceVariable) and not v._in_graph_mode:  # pylint: disable=protected-access
+  if resource_variable_ops.is_resource_variable(v) and not v._in_graph_mode:  # pylint: disable=protected-access
     # True if and only if `v` was initialized eagerly.
     return _DenseResourceVariableProcessor(v)
   if v.op.type == "VarHandleOp":
@@ -210,7 +213,7 @@ def _get_processor(v):
   raise NotImplementedError("Trying to optimize unsupported type ", v)
 
 
-@tf_export("train.Optimizer")
+@tf_export(v1=["train.Optimizer"])
 class Optimizer(
     # Optimizers inherit from CheckpointableBase rather than Checkpointable
     # since they do most of their dependency management themselves (slot
@@ -340,10 +343,10 @@ class Optimizer(
     self._deferred_slot_restorations = {}
 
     # TODO(isaprykin): When using a DistributionStrategy, and when an
-    # optimizer is created in each tower, it might be dangerous to
-    # rely on some Optimer methods.  When such methods are called on a
-    # per-tower optimizer, an exception needs to be thrown.  We do
-    # allow creation per-tower optimizers however, because the
+    # optimizer is created in each replica, it might be dangerous to
+    # rely on some Optimizer methods.  When such methods are called on a
+    # per-replica optimizer, an exception needs to be thrown.  We do
+    # allow creation per-replica optimizers however, because the
     # compute_gradients()->apply_gradients() sequence is safe.
 
   def get_name(self):
@@ -458,16 +461,11 @@ class Optimizer(
           tape.watch(var_list)
         loss_value = loss()
 
-        # Scale loss if using a "mean" loss reduction and multiple towers.
+        # Scale loss if using a "mean" loss reduction and multiple replicas.
         # Have to be careful to call distribute_lib.get_loss_reduction()
         # *after* loss() is evaluated, so we know what loss reduction it uses.
         # TODO(josh11b): Test that we handle weight decay in a reasonable way.
-        if (distribute_lib.get_loss_reduction() ==
-            variable_scope.VariableAggregation.MEAN):
-          num_towers = distribution_strategy_context.get_distribution_strategy(
-          ).num_towers
-          if num_towers > 1:
-            loss_value *= (1. / num_towers)
+        loss_value = self._scale_loss(loss_value)
 
       if var_list is None:
         var_list = tape.watched_variables()
@@ -483,13 +481,8 @@ class Optimizer(
           "`loss` passed to Optimizer.compute_gradients should "
           "be a function when eager execution is enabled.")
 
-    # Scale loss if using a "mean" loss reduction and multiple towers.
-    if (distribute_lib.get_loss_reduction() ==
-        variable_scope.VariableAggregation.MEAN):
-      num_towers = distribution_strategy_context.get_distribution_strategy(
-      ).num_towers
-      if num_towers > 1:
-        loss *= (1. / num_towers)
+    # Scale loss if using a "mean" loss reduction and multiple replicas.
+    loss = self._scale_loss(loss)
 
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
@@ -525,6 +518,15 @@ class Optimizer(
          if g is not None and v.dtype != dtypes.resource])
     return grads_and_vars
 
+  @staticmethod
+  def _scale_loss(loss_value):
+    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
+      num_replicas = \
+        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      if num_replicas > 1:
+        loss_value *= (1. / num_replicas)
+    return loss_value
+
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     """Apply gradients to variables.
 
@@ -553,16 +555,16 @@ class Optimizer(
     # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
 
     # Handle DistributionStrategy case.
-    if distribution_strategy_context.get_cross_tower_context():
+    if distribute_ctx.get_cross_replica_context():
       raise RuntimeError("Use `_distributed_apply()` instead of "
-                         "`apply_gradients()` in a cross-tower context.")
+                         "`apply_gradients()` in a cross-replica context.")
     # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
     # always calling _distributed_apply(), using the default distribution
     # as needed.
-    if distribution_strategy_context.has_distribution_strategy():
+    if distribute_ctx.has_distribution_strategy():
       grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
-      return distribution_strategy_context.get_tower_context().merge_call(
-          self._distributed_apply, grads_and_vars, global_step, name)
+      return distribute_ctx.get_replica_context().merge_call(
+          self._distributed_apply, args=(grads_and_vars, global_step, name))
 
     # No DistributionStrategy case.
     grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
@@ -637,16 +639,16 @@ class Optimizer(
                          grads_and_vars,
                          global_step=None,
                          name=None):
-    """A version of `apply_gradients` for cross-tower context.
+    """A version of `apply_gradients` for cross-replica context.
 
     This is a version of `apply_gradients()` for when you are using a
-    `DistributionStrategy` and are in a cross-tower context. If in a
-    tower context, use `apply_gradients()` as normal.
+    `DistributionStrategy` and are in a cross-replica context. If in a
+    replica context, use `apply_gradients()` as normal.
 
     Args:
       distribution: A `DistributionStrategy` object.
       grads_and_vars: List of (gradient, variable) pairs as returned by
-        `compute_gradients()`, and then aggregated across towers.
+        `compute_gradients()`, and then aggregated across replicas.
       global_step: Optional (mirrored) `Variable` to increment by one
         after the variables have been updated.
       name: Optional name for the returned operation.  Default to the
@@ -654,15 +656,17 @@ class Optimizer(
 
     Returns:
       An `Operation` that applies the specified gradients across all
-      towers. If `global_step` was not None, that operation also
-      increments `global_step`.
+      replicas. If `global_step` was not None, that operation also
+      increments `global_step`
     """
-    reduced_grads = distribution.batch_reduce(
-        variable_scope.VariableAggregation.SUM, grads_and_vars)
+    reduced_grads = distribution.extended.batch_reduce_to(
+        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
     var_list = [v for _, v in grads_and_vars]
     grads_and_vars = zip(reduced_grads, var_list)
-    # Note that this is called in a cross-tower context.
-    self._create_slots(var_list)
+
+    # Note that this is called in a cross-replica context.
+    with ops.init_scope():
+      self._create_slots(var_list)
 
     def update(v, g):
       """Apply gradients to a replica variable."""
@@ -679,7 +683,13 @@ class Optimizer(
             "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
       p = _get_processor(v)
 
-      scope_name = "" if context.executing_eagerly() else v.op.name
+      if context.executing_eagerly() or (
+          resource_variable_ops.is_resource_variable(v) and
+          not v._in_graph_mode):  # pylint: disable=protected-access
+        scope_name = v.name.split(":")[0]
+      else:
+        scope_name = v.op.name
+
       # device_policy is set because non-mirrored tensors will be read in
       # `update_op`. `_resource_apply_dense`, `lr_t`, `beta1_t` and `beta2_t`
       # is an example.
@@ -692,21 +702,23 @@ class Optimizer(
       update_ops = [
           op
           for grad, var in grads_and_vars
-          for op in distribution.update(var, update, grad, grouped=False)
+          for op in distribution.extended.update(
+              var, update, args=(grad,), group=False)
       ]
 
       def finish(self, update_ops):
         return self._finish(update_ops, "update")
 
-      non_slot_devices = distribution.non_slot_devices(var_list)
-      finish_updates = distribution.update_non_slot(
-          non_slot_devices, finish, self, update_ops, grouped=False)
+      non_slot_devices = distribution.extended.non_slot_devices(var_list)
+      finish_updates = distribution.extended.update_non_slot(
+          non_slot_devices, finish, args=(self, update_ops), group=False)
       if global_step is None:
         apply_updates = distribution.group(finish_updates, name=name)
       else:
         with ops.control_dependencies(finish_updates):
-          apply_updates = distribution.update(
-              global_step, state_ops.assign_add, 1, name=name)
+          apply_updates = distribution.extended.update(
+              global_step, state_ops.assign_add, args=(1,),
+              kwargs={"name": name})
 
       if not context.executing_eagerly():
         if isinstance(apply_updates, ops.Tensor):
@@ -744,7 +756,7 @@ class Optimizer(
       # `_resource_apply_dense`.
       distributed_container = var._distributed_container()
       assert distributed_container is not None
-      if context.executing_eagerly():
+      if ops.executing_eagerly_outside_functions():
         key = distributed_container._unique_id
       else:
         key = (distributed_container.graph, distributed_container._shared_name)
@@ -803,8 +815,7 @@ class Optimizer(
     v = self._non_slot_dict.get(key, None)
     if v is None:
       self._maybe_initialize_checkpointable()
-      distribution_strategy = (
-          distribution_strategy_context.get_distribution_strategy())
+      distribution_strategy = distribute_ctx.get_distribution_strategy()
       with distribution_strategy.colocate_vars_with(colocate_with):
         if eager:
           restored_initial_value = self._preload_simple_restoration(
diff --git a/tensorflow/python/training/optimizer_test.py b/tensorflow/python/training/optimizer_test.py
index 7a7d01d50e0b6dc639d0d511f03d121c3a9e5c73..e175b5a79989e4c7b6b4c736eefe0250e9ebbcc9 100644
--- a/tensorflow/python/training/optimizer_test.py
+++ b/tensorflow/python/training/optimizer_test.py
@@ -62,6 +62,7 @@ class OptimizerTest(test.TestCase):
       self.assertAllClose([-14., -13.], self.evaluate(var0))
       self.assertAllClose([-6., -5.], self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testAggregationMethod(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -79,14 +80,15 @@ class OptimizerTest(test.TestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd through optimizer
         opt_op.run()
         # Validate updated params
-        self.assertAllClose([-14., -13.], var0.eval())
-        self.assertAllClose([-6., -5.], var1.eval())
+        self.assertAllClose([-14., -13.], self.evaluate(var0))
+        self.assertAllClose([-6., -5.], self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testPrecomputedGradient(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -102,15 +104,15 @@ class OptimizerTest(test.TestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd through optimizer
         opt_op.run()
         # Validate updated params
         self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)],
-                            var0.eval())
+                            self.evaluate(var0))
         self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)],
-                            var1.eval())
+                            self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes
   def testNoVariables(self):
@@ -230,6 +232,7 @@ class OptimizerTest(test.TestCase):
     with self.assertRaises(NotImplementedError):
       sgd_op.apply_gradients(grads_and_vars)
 
+  @test_util.run_deprecated_v1
   def testTrainOp(self):
     with self.cached_session():
       var0 = variables.Variable([1.0, 2.0])
@@ -241,6 +244,7 @@ class OptimizerTest(test.TestCase):
       opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
       self.assertTrue(opt_op in ops.get_collection(ops.GraphKeys.TRAIN_OP))
 
+  @test_util.run_deprecated_v1
   def testConstraint(self):
     constraint_01 = lambda x: clip_ops.clip_by_value(x, -0.1, 0.)
     constraint_0 = lambda x: clip_ops.clip_by_value(x, 0., 1.)
@@ -257,13 +261,13 @@ class OptimizerTest(test.TestCase):
 
       variables.global_variables_initializer().run()
       # Fetch params to validate initial values
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([3.0, 4.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
       # Run 1 step of sgd through optimizer
       opt_op.run()
       # Validate updated params
-      self.assertAllClose([-0.1, -0.1], var0.eval())
-      self.assertAllClose([0., 0.], var1.eval())
+      self.assertAllClose([-0.1, -0.1], self.evaluate(var0))
+      self.assertAllClose([0., 0.], self.evaluate(var1))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py
index 9bd677b8efcd447f74ec2a3cbe94d63eeb9a4dd1..2ea628a56b47b36a423b5ebdd3d8afef5f41c6bc 100644
--- a/tensorflow/python/training/proximal_adagrad.py
+++ b/tensorflow/python/training/proximal_adagrad.py
@@ -26,7 +26,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.ProximalAdagradOptimizer")
+@tf_export(v1=["train.ProximalAdagradOptimizer"])
 class ProximalAdagradOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the Proximal Adagrad algorithm.
diff --git a/tensorflow/python/training/proximal_adagrad_test.py b/tensorflow/python/training/proximal_adagrad_test.py
index 74e06a5e2e68adc1b214110c6fc2268e50b30879..ce214ac418a01455b113ad261971434727994a3e 100644
--- a/tensorflow/python/training/proximal_adagrad_test.py
+++ b/tensorflow/python/training/proximal_adagrad_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -48,7 +49,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([0.0, 0.0], v0_val)
       self.assertAllClose([0.0, 0.0], v1_val)
 
@@ -56,7 +57,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       for _ in range(3):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-2.60260963, -4.29698515]), v0_val)
       self.assertAllClose(np.array([-0.28432083, -0.56694895]), v1_val)
       opt_vars = opt.variables()
@@ -64,12 +65,15 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       self.assertStartsWith(opt_vars[1].name, var1._shared_name)
       self.assertEqual(2, len(opt_vars))
 
+  @test_util.run_deprecated_v1
   def testProximalAdagradwithoutRegularization(self):
     self.doTestProximalAdagradwithoutRegularization(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceProximalAdagradwithoutRegularization(self):
     self.doTestProximalAdagradwithoutRegularization(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testProximalAdagradwithoutRegularization2(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -85,17 +89,18 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
       # Run 3 steps Proximal Adagrad.
       for _ in range(3):
         update.run()
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-1.60261, -2.296985]), v0_val)
       self.assertAllClose(np.array([3.715679, 2.433051]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -106,13 +111,15 @@ class ProximalAdagradOptimizerTest(test.TestCase):
         sgd_op = proximal_adagrad.ProximalAdagradOptimizer(1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0, 1]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0, 1]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testProximalAdagradWithL1(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -128,17 +135,18 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
       # Run 10 steps Proximal Adagrad
       for _ in range(10):
         update.run()
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-6.663634, -9.190331]), v0_val)
       self.assertAllClose(np.array([2.959304, 1.029232]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testProximalAdagradWithL1_L2(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -154,7 +162,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
@@ -162,7 +170,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       for _ in range(10):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-0.0495, -0.0995]), v0_val)
       self.assertAllClose(np.array([-0.0045, -0.0095]), v1_val)
 
@@ -190,7 +198,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
     variables.global_variables_initializer().run()
 
     sess = ops.get_default_session()
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     if is_sparse:
       self.assertAllClose([[1.0], [2.0]], v0_val)
       self.assertAllClose([[3.0], [4.0]], v1_val)
@@ -202,9 +210,10 @@ class ProximalAdagradOptimizerTest(test.TestCase):
     for _ in range(steps):
       update.run()
 
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     return v0_val, v1_val
 
+  @test_util.run_deprecated_v1
   def testEquivAdagradwithoutRegularization(self):
     with self.cached_session():
       val0, val1 = self.applyOptimizer(
@@ -222,6 +231,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
     self.assertAllClose(val0, val2)
     self.assertAllClose(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivSparseAdagradwithoutRegularization(self):
     with self.cached_session():
       val0, val1 = self.applyOptimizer(
diff --git a/tensorflow/python/training/proximal_gradient_descent_test.py b/tensorflow/python/training/proximal_gradient_descent_test.py
index f77f68b23432a59f509e73158ee6893021bbc138..25b206605dc7315216e48a22d597f7342742a5ca 100644
--- a/tensorflow/python/training/proximal_gradient_descent_test.py
+++ b/tensorflow/python/training/proximal_gradient_descent_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -50,7 +51,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([0.0, 0.0], v0_val)
       self.assertAllClose([0.0, 0.0], v1_val)
 
@@ -58,16 +59,19 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       for _ in range(3):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-0.9, -1.8]), v0_val)
       self.assertAllClose(np.array([-0.09, -0.18]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testProximalGradientDescentwithoutRegularization(self):
     self.doTestProximalGradientDescentwithoutRegularization(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceProximalGradientDescentwithoutRegularization(self):
     self.doTestProximalGradientDescentwithoutRegularization(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testProximalGradientDescentwithoutRegularization2(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -80,7 +84,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
@@ -88,10 +92,11 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       for _ in range(3):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([0.1, 0.2]), v0_val)
       self.assertAllClose(np.array([3.91, 2.82]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -103,13 +108,15 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
             1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[-111, -138]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testProximalGradientDescentWithL1_L2(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -122,7 +129,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
@@ -130,7 +137,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       for _ in range(10):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-0.0495, -0.0995]), v0_val)
       self.assertAllClose(np.array([-0.0045, -0.0095]), v1_val)
 
@@ -158,7 +165,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
     variables.global_variables_initializer().run()
 
     sess = ops.get_default_session()
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     if is_sparse:
       self.assertAllClose([[1.0], [2.0]], v0_val)
       self.assertAllClose([[3.0], [4.0]], v1_val)
@@ -170,9 +177,10 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
     for _ in range(steps):
       update.run()
 
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     return v0_val, v1_val
 
+  @test_util.run_deprecated_v1
   def testEquivSparseGradientDescentwithoutRegularization(self):
     with self.cached_session():
       val0, val1 = self.applyOptimizer(
@@ -189,6 +197,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
     self.assertAllClose(val0, val2)
     self.assertAllClose(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivGradientDescentwithoutRegularization(self):
     with self.cached_session():
       val0, val1 = self.applyOptimizer(
diff --git a/tensorflow/python/training/quantize_training_test.py b/tensorflow/python/training/quantize_training_test.py
index 6edbf7665fbd59eea04294551452b764856563a9..2352af7e99b5bab99826fb9a628a98846e25444c 100644
--- a/tensorflow/python/training/quantize_training_test.py
+++ b/tensorflow/python/training/quantize_training_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -52,6 +53,7 @@ class PywrapQuantizeTrainingTest(test.TestCase):
 
   # Test that save/restoring works for EMA variables generated in the
   # quantized training rewrite.
+  @test_util.run_v1_only('b/120545219')
   def testQuantizedSaveRestore(self):
     save_path = os.path.join(self.get_temp_dir(), 'quantized_save_restore')
 
@@ -73,11 +75,11 @@ class PywrapQuantizeTrainingTest(test.TestCase):
       _ = importer.import_graph_def(result, name='')
 
       # Initialize the variable.
-      sess.run(g.get_operation_by_name(init_op.name))
+      self.evaluate(g.get_operation_by_name(init_op.name))
 
       # Run the graph for one step to assign values to the quantization min/max
       # variables.
-      sess.run(g.get_tensor_by_name(c.name))
+      self.evaluate(g.get_tensor_by_name(c.name))
 
       saver.save(sess, save_path)
 
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index 15fe42bbd851fec831ef2a84401c1c7f1cac1973..2f6e924f98e5068d9f50e6efe93c58771b9acade 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import variables
@@ -40,6 +41,7 @@ _MockOp = collections.namedtuple("MockOp", ["name"])
 
 class QueueRunnerTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -58,8 +60,9 @@ class QueueRunnerTest(test.TestCase):
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 3.
-      self.assertEqual(3, var.eval())
+      self.assertEqual(3, self.evaluate(var))
 
+  @test_util.run_v1_only("b/120545219")
   def testTwoOps(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -80,9 +83,10 @@ class QueueRunnerTest(test.TestCase):
       for t in threads:
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
-      self.assertEqual(3, var0.eval())
-      self.assertEqual(30, var1.eval())
+      self.assertEqual(3, self.evaluate(var0))
+      self.assertEqual(30, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testExceptionsCaptured(self):
     with self.cached_session() as sess:
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
@@ -99,6 +103,7 @@ class QueueRunnerTest(test.TestCase):
       self.assertTrue("Operation not in the graph" in str(exceptions[0]))
       self.assertTrue("Operation not in the graph" in str(exceptions[1]))
 
+  @test_util.run_deprecated_v1
   def testRealDequeueEnqueue(self):
     with self.cached_session() as sess:
       q0 = data_flow_ops.FIFOQueue(3, dtypes.float32)
@@ -121,12 +126,13 @@ class QueueRunnerTest(test.TestCase):
       # It should have terminated cleanly.
       self.assertEqual(0, len(qr.exceptions_raised))
       # The 2 values should be in queue1.
-      self.assertEqual(10.0, dequeue1.eval())
-      self.assertEqual(10.0, dequeue1.eval())
+      self.assertEqual(10.0, self.evaluate(dequeue1))
+      self.assertEqual(10.0, self.evaluate(dequeue1))
       # And queue1 should now be closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError, "is closed"):
-        dequeue1.eval()
+        self.evaluate(dequeue1)
 
+  @test_util.run_v1_only("b/120545219")
   def testRespectCoordShouldStop(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -149,8 +155,9 @@ class QueueRunnerTest(test.TestCase):
       coord.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 0.
-      self.assertEqual(0, var.eval())
+      self.assertEqual(0, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testRequestStopOnException(self):
     with self.cached_session() as sess:
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
@@ -163,6 +170,7 @@ class QueueRunnerTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, "Operation not in the graph"):
         coord.join()
 
+  @test_util.run_deprecated_v1
   def testGracePeriod(self):
     with self.cached_session() as sess:
       # The enqueue will quickly block.
@@ -180,6 +188,7 @@ class QueueRunnerTest(test.TestCase):
       # the queue to be closed and the enqueue to terminate.
       coord.join(stop_grace_period_secs=1.0)
 
+  @test_util.run_deprecated_v1
   def testMultipleSessions(self):
     with self.cached_session() as sess:
       with session.Session() as other_sess:
@@ -195,6 +204,7 @@ class QueueRunnerTest(test.TestCase):
         other_threads = qr.create_threads(other_sess, coord=coord)
         self.assertEqual(len(threads), len(other_threads))
 
+  @test_util.run_deprecated_v1
   def testIgnoreMultiStarts(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -211,6 +221,7 @@ class QueueRunnerTest(test.TestCase):
       new_threads = qr.create_threads(sess, coord=coord)
       self.assertEqual([], new_threads)
 
+  @test_util.run_v1_only("b/120545219")
   def testThreads(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -238,6 +249,7 @@ class QueueRunnerTest(test.TestCase):
       self.assertEqual(1, len(exceptions))
       self.assertTrue("Operation not in the graph" in str(exceptions[0]))
 
+  @test_util.run_deprecated_v1
   def testName(self):
     with ops.name_scope("scope"):
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32, name="queue")
@@ -247,6 +259,7 @@ class QueueRunnerTest(test.TestCase):
     self.assertEqual(
         1, len(ops.get_collection(ops.GraphKeys.QUEUE_RUNNERS, "scope")))
 
+  @test_util.run_deprecated_v1
   def testStartQueueRunners(self):
     # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
@@ -263,8 +276,9 @@ class QueueRunnerTest(test.TestCase):
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 3.
-      self.assertEqual(3, var.eval())
+      self.assertEqual(3, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testStartQueueRunnersRaisesIfNotASession(self):
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
     var = variables.VariableV1(zero64)
@@ -278,6 +292,7 @@ class QueueRunnerTest(test.TestCase):
       with self.assertRaisesRegexp(TypeError, "tf.Session"):
         queue_runner_impl.start_queue_runners("NotASession")
 
+  @test_util.run_deprecated_v1
   def testStartQueueRunnersIgnoresMonitoredSession(self):
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
     var = variables.VariableV1(zero64)
@@ -292,6 +307,7 @@ class QueueRunnerTest(test.TestCase):
           monitored_session.MonitoredSession())
       self.assertFalse(threads)
 
+  @test_util.run_deprecated_v1
   def testStartQueueRunnersNonDefaultGraph(self):
     # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
     graph = ops.Graph()
@@ -310,7 +326,7 @@ class QueueRunnerTest(test.TestCase):
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 3.
-      self.assertEqual(3, var.eval())
+      self.assertEqual(3, self.evaluate(var))
 
   def testQueueRunnerSerializationRoundTrip(self):
     graph = ops.Graph()
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index f38c9861d64aa258cde07ccd3041d3c50932c33b..fb53b5883f5b0246e3e99cb00f972dcf4eb9c409 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -50,7 +50,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.RMSPropOptimizer")
+@tf_export(v1=["train.RMSPropOptimizer"])
 class RMSPropOptimizer(optimizer.Optimizer):
   """Optimizer that implements the RMSProp algorithm.
 
diff --git a/tensorflow/python/training/rmsprop_test.py b/tensorflow/python/training/rmsprop_test.py
index 4f5f96e2b477249853b9f7d8c0e7b67c811a94a3..8f029d5310e9422e4f6dbc1c874f118d3c05d95d 100644
--- a/tensorflow/python/training/rmsprop_test.py
+++ b/tensorflow/python/training/rmsprop_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -88,11 +89,12 @@ class RMSPropOptimizerTest(test.TestCase):
       var_t[gindex] = var[gindex] - mom_t[gindex]
     return var_t, mg_t, rms_t, mom_t
 
+  @test_util.run_deprecated_v1
   def testDense(self):
     # TODO(yori): Use ParameterizedTest when available
     for (dtype, learning_rate, decay, momentum,
          epsilon, centered, use_resource) in _TESTPARAMS:
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         # Initialize variables for numpy implementation.
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
@@ -115,7 +117,7 @@ class RMSPropOptimizerTest(test.TestCase):
             centered=centered)
 
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         mg0 = opt.get_slot(var0, "mg")
         self.assertEqual(mg0 is not None, centered)
@@ -138,12 +140,12 @@ class RMSPropOptimizerTest(test.TestCase):
         mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 4 steps of RMSProp
         for _ in range(1, 5):
-          update.run()
+          self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
               var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
@@ -154,15 +156,16 @@ class RMSPropOptimizerTest(test.TestCase):
 
           # Validate updated params
           if centered:
-            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -176,15 +179,17 @@ class RMSPropOptimizerTest(test.TestCase):
             momentum=0.0,
             epsilon=0.0,
             centered=False).minimize(loss)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0., 1.]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0., 1.]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariableCentered(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -198,20 +203,22 @@ class RMSPropOptimizerTest(test.TestCase):
             momentum=0.0,
             epsilon=1.0,
             centered=True).minimize(loss)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[-111, -138]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testSparse(self):
     # TODO(yori): Use ParameterizedTest when available
     for (dtype, learning_rate, decay,
          momentum, epsilon, centered, _) in _TESTPARAMS:
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         # Initialize variables for numpy implementation.
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
@@ -235,7 +242,7 @@ class RMSPropOptimizerTest(test.TestCase):
             epsilon=epsilon,
             centered=centered)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         mg0 = opt.get_slot(var0, "mg")
         self.assertEqual(mg0 is not None, centered)
@@ -258,12 +265,12 @@ class RMSPropOptimizerTest(test.TestCase):
         mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 4 steps of RMSProp
         for _ in range(1, 5):
-          update.run()
+          self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
               var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
@@ -274,18 +281,19 @@ class RMSPropOptimizerTest(test.TestCase):
 
           # Validate updated params
           if centered:
-            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
   def testWithoutMomentum(self):
     for dtype in [dtypes.half, dtypes.float32]:
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -293,7 +301,7 @@ class RMSPropOptimizerTest(test.TestCase):
         opt = rmsprop.RMSPropOptimizer(
             learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         rms0 = opt.get_slot(var0, "rms")
         self.assertTrue(rms0 is not None)
@@ -305,34 +313,36 @@ class RMSPropOptimizerTest(test.TestCase):
         self.assertTrue(mom1 is not None)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the rms accumulators where 1. So we should see a normal
         # update: v -= grad * learning_rate
-        update.run()
+        self.evaluate(update)
         # Check the root mean square accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901, 0.901]), rms0.eval())
+            np.array([0.901, 0.901]), self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001, 0.90001]), rms1.eval())
+            np.array([0.90001, 0.90001]), self.evaluate(rms1))
         # Check the parameters.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
         # Step 2: the root mean square accumulators contain the previous update.
-        update.run()
+        self.evaluate(update)
         # Check the rms accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]),
+            self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]),
+            self.evaluate(rms1))
         # Check the parameters.
         self.assertAllCloseAccordingToType(
             np.array([
@@ -340,18 +350,19 @@ class RMSPropOptimizerTest(test.TestCase):
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testWithMomentum(self):
     for dtype in [dtypes.half, dtypes.float32]:
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -360,7 +371,7 @@ class RMSPropOptimizerTest(test.TestCase):
         opt = rmsprop.RMSPropOptimizer(
             learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         rms0 = opt.get_slot(var0, "rms")
         self.assertTrue(rms0 is not None)
@@ -372,57 +383,61 @@ class RMSPropOptimizerTest(test.TestCase):
         self.assertTrue(mom1 is not None)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: rms = 1, mom = 0. So we should see a normal
         # update: v -= grad * learning_rate
-        update.run()
+        self.evaluate(update)
         # Check the root mean square accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901, 0.901]), rms0.eval())
+            np.array([0.901, 0.901]), self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001, 0.90001]), rms1.eval())
+            np.array([0.90001, 0.90001]), self.evaluate(rms1))
         # Check the momentum accumulators
         self.assertAllCloseAccordingToType(
             np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]),
+            self.evaluate(mom0))
         self.assertAllCloseAccordingToType(
             np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]),
+            self.evaluate(mom1))
 
         # Check that the parameters.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
         # Step 2: the root mean square accumulators contain the previous update.
-        update.run()
+        self.evaluate(update)
         # Check the rms accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]),
+            self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]),
+            self.evaluate(rms1))
         self.assertAllCloseAccordingToType(
             np.array([
                 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
                 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
-            ]), mom0.eval())
+            ]), self.evaluate(mom0))
         self.assertAllCloseAccordingToType(
             np.array([
                 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
                 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
-            ]), mom1.eval())
+            ]), self.evaluate(mom1))
 
         # Check the parameters.
         self.assertAllCloseAccordingToType(
@@ -433,7 +448,7 @@ class RMSPropOptimizerTest(test.TestCase):
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
                 (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
                  (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
 
         self.assertAllCloseAccordingToType(
             np.array([
@@ -443,7 +458,7 @@ class RMSPropOptimizerTest(test.TestCase):
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
                 (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
                  (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
   def testCallableParams(self):
     with context.eager_mode():
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 5b2b19e9130ab8ac2e95e5f4d680387b8dbcc335..348b8bf1ef0a89a971eb26c9cb7e5f9d01c51a4b 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -14,7 +14,11 @@
 # ==============================================================================
 
 # pylint: disable=invalid-name
-"""Save and restore variables."""
+"""Save and restore variables.
+
+Symbols in this file are deprecated. See replacements in
+tensorflow/python/training/checkpointable and tensorflow/python/training/saving.
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -25,7 +29,6 @@ import time
 import uuid
 
 import numpy as np
-import six
 
 from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
@@ -42,16 +45,15 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import saveable_object
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -67,31 +69,6 @@ get_checkpoint_mtimes = checkpoint_management.get_checkpoint_mtimes
 remove_checkpoint = checkpoint_management.remove_checkpoint
 
 
-# Op names which identify variable reads which should be saved.
-_VARIABLE_OPS = set(["Variable",
-                     "VariableV2",
-                     "AutoReloadVariable",
-                     "VarHandleOp",
-                     "ReadVariableOp"])
-
-
-def _set_cpu0(device_string):
-  """Creates a new device string based on `device_string` but using /CPU:0.
-
-  If the device is already on /CPU:0, this is a no-op.
-
-  Args:
-    device_string: A device string.
-
-  Returns:
-    A device string.
-  """
-  parsed_device = pydev.DeviceSpec.from_string(device_string)
-  parsed_device.device_type = "CPU"
-  parsed_device.device_index = 0
-  return parsed_device.to_string()
-
-
 class BaseSaverBuilder(object):
   """Base class for Savers.
 
@@ -101,64 +78,9 @@ class BaseSaverBuilder(object):
   SaveSpec = saveable_object.SaveSpec
   SaveableObject = saveable_object.SaveableObject
 
-  class VariableSaveable(SaveableObject):
-    """SaveableObject implementation that handles Variables."""
-
-    def __init__(self, var, slice_spec, name):
-      spec = BaseSaverBuilder.SaveSpec(var, slice_spec, name, dtype=var.dtype)
-      super(BaseSaverBuilder.VariableSaveable, self).__init__(var, [spec], name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      restored_tensor = restored_tensors[0]
-      if restored_shapes is not None:
-        restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
-      return state_ops.assign(
-          self.op,
-          restored_tensor,
-          validate_shape=restored_shapes is None and
-          self.op.get_shape().is_fully_defined())
-
-  class ResourceVariableSaveable(SaveableObject):
-    """SaveableObject implementation that handles ResourceVariables."""
-
-    def __init__(self, var, slice_spec, name):
-      self._var_device = var.device
-      self._var_shape = var.shape
-      if isinstance(var, ops.Tensor):
-        self.handle_op = var.op.inputs[0]
-        tensor = var
-      elif isinstance(var, resource_variable_ops.ResourceVariable):
-
-        def _read_variable_closure(v):
-          def f():
-            with ops.device(v.device):
-              x = v.read_value()
-              # To allow variables placed on non-CPU devices to be checkpointed,
-              # we copy them to CPU on the same machine first.
-              with ops.device("/device:CPU:0"):
-                return array_ops.identity(x)
-          return f
-
-        self.handle_op = var.handle
-        tensor = _read_variable_closure(var)
-      else:
-        raise ValueError(
-            "Saveable is neither a resource variable nor a read operation."
-            " Got: %s" % repr(var))
-      spec = BaseSaverBuilder.SaveSpec(tensor, slice_spec, name,
-                                       dtype=var.dtype)
-      super(BaseSaverBuilder.ResourceVariableSaveable, self).__init__(
-          var, [spec], name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      restored_tensor = restored_tensors[0]
-      if restored_shapes is not None:
-        restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
-      # Copy the restored tensor to the variable's device.
-      with ops.device(self._var_device):
-        restored_tensor = array_ops.identity(restored_tensor)
-        return resource_variable_ops.shape_safe_assign_variable_handle(
-            self.handle_op, self._var_shape, restored_tensor)
+  # Aliases for code which was moved but still has lots of users.
+  VariableSaveable = saveable_object_util.ReferenceVariableSaveable
+  ResourceVariableSaveable = saveable_object_util.ResourceVariableSaveable
 
   def __init__(self, write_version=saver_pb2.SaverDef.V2):
     self._write_version = write_version
@@ -224,7 +146,11 @@ class BaseSaverBuilder(object):
     del restore_sequentially
     all_tensors = []
     for saveable in saveables:
-      with ops.device(_set_cpu0(saveable.device) if saveable.device else None):
+      if saveable.device:
+        device = saveable_object_util.set_cpu0(saveable.device)
+      else:
+        device = None
+      with ops.device(device):
         all_tensors.extend(
             self.restore_op(filename_tensor, saveable, preferred_shard))
     return all_tensors
@@ -336,7 +262,7 @@ class BaseSaverBuilder(object):
     last_device = None
     for shard, (device, saveables) in enumerate(per_device):
       last_device = device
-      with ops.device(_set_cpu0(device)):
+      with ops.device(saveable_object_util.set_cpu0(device)):
         sharded_filename = self.sharded_filename(tmp_checkpoint_prefix, shard,
                                                  num_shards_tensor)
         sharded_prefixes.append(sharded_filename)
@@ -344,7 +270,7 @@ class BaseSaverBuilder(object):
 
     with ops.control_dependencies([x.op for x in sharded_saves]):
       # Co-locates the merge step with the last device.
-      with ops.device(_set_cpu0(last_device)):
+      with ops.device(saveable_object_util.set_cpu0(last_device)):
         # V2 format write path consists of a metadata merge step.  Once merged,
         # attempts to delete the temporary directory, "<user-fed prefix>_temp".
         merge_step = gen_io_ops.merge_v2_checkpoints(
@@ -459,10 +385,6 @@ class BaseSaverBuilder(object):
                 name="restore_shard"))
     return control_flow_ops.group(*sharded_restores, name="restore_all")
 
-  @staticmethod
-  def _IsVariable(v):
-    return isinstance(v, ops.Tensor) and v.op.type in _VARIABLE_OPS
-
   def _GroupByDevices(self, saveables):
     """Group Variable tensor slices per device.
 
@@ -490,215 +412,6 @@ class BaseSaverBuilder(object):
       per_device[canonical_device.pop()].append(saveable)
     return sorted(per_device.items(), key=lambda t: t[0])
 
-  @staticmethod
-  def OpListToDict(op_list, convert_variable_to_tensor=True):
-    """Create a dictionary of names to operation lists.
-
-    Args:
-      op_list: A list, tuple, or set of Variables or SaveableObjects.
-      convert_variable_to_tensor: Whether or not to convert single Variables
-        with no slice info into Tensors.
-
-    Returns:
-      A dictionary of names to the operations that must be saved under
-      that name.  Variables with save_slice_info are grouped together under the
-      same key in no particular order.
-
-    Raises:
-      TypeError: If the type of op_list or its elements is not supported.
-      ValueError: If at least two saveables share the same name.
-    """
-    if not isinstance(op_list, (list, tuple, set)):
-      raise TypeError("Variables to save should be passed in a dict or a "
-                      "list: %s" % op_list)
-    # When ResourceVariables are converted to Tensors, read ops are added to the
-    # graph. Sorting the op_list ensures that the resulting graph is always
-    # constructed in a deterministic way:
-    op_list = sorted(op_list, key=lambda x: x.name)
-    names_to_saveables = {}
-    # pylint: disable=protected-access
-    for var in op_list:
-      if isinstance(var, BaseSaverBuilder.SaveableObject):
-        names_to_saveables[var.name] = var
-      elif isinstance(var, variables.PartitionedVariable):
-        if var.name in names_to_saveables:
-          raise ValueError("At least two variables have the same name: %s" %
-                           var.name)
-        names_to_saveables[var.name] = var
-      elif isinstance(var, variables.Variable) and var._save_slice_info:
-        name = var._save_slice_info.full_name
-        if name in names_to_saveables:
-          if not isinstance(names_to_saveables[name], list):
-            raise ValueError("Mixing slices and non-slices with the same name: "
-                             "%s" % name)
-          names_to_saveables[name].append(var)
-        else:
-          names_to_saveables[name] = [var]
-      elif (isinstance(var, checkpointable.CheckpointableBase)
-            and not isinstance(var, variables.Variable)):
-        checkpointable_saveables = [
-            (factory() if callable(factory) else factory)
-            for factory in var._gather_saveables_for_checkpoint().values()]
-        names_to_saveables.update(
-            BaseSaverBuilder.OpListToDict(checkpointable_saveables))
-      else:
-        if context.executing_eagerly():
-          if not isinstance(var, resource_variable_ops.ResourceVariable):
-            raise ValueError(
-                "Can only save/restore ResourceVariables when eager execution "
-                "is enabled, type: %s." % type(var))
-          set_var = names_to_saveables.setdefault(var._shared_name, var)
-          if set_var is not var:
-            raise ValueError(
-                ("Two different ResourceVariable objects with the same "
-                 "shared_name '%s' were passed to the Saver. This likely means "
-                 "that they were created in different Graphs or isolation "
-                 "contexts, and may not be checkpointed together.") %
-                (var._shared_name,))
-        else:
-          if convert_variable_to_tensor:
-            if isinstance(var, resource_variable_ops.ResourceVariable):
-              var = var._graph_element  # pylint: disable=protected-access
-            else:
-              var = ops.internal_convert_to_tensor(var, as_ref=True)
-            if not BaseSaverBuilder._IsVariable(var):
-              raise TypeError("Variable to save is not a Variable: %s" % var)
-          if var.op.type == "ReadVariableOp":
-            name = var.op.inputs[0].op.name
-          else:
-            name = var.op.name
-          if name in names_to_saveables:
-            raise ValueError("At least two variables have the same name: %s" %
-                             name)
-          names_to_saveables[name] = var
-
-      # pylint: enable=protected-access
-    return names_to_saveables
-
-  @staticmethod
-  def SaveableObjectsForOp(op, name):
-    """Create `SaveableObject`s from an operation.
-
-    Args:
-      op: A variable, operation, or SaveableObject to coerce into a
-        SaveableObject.
-      name: A string name for the SaveableObject.
-
-    Yields:
-      `SaveableObject`s which together save/restore `op`.
-
-    Raises:
-      TypeError: If `name` is not a string.
-      ValueError: For operations with no known conversion to SaveableObject.
-    """
-    if not isinstance(name, six.string_types):
-      raise TypeError(
-          "names_to_saveables must be a dict mapping string names to "
-          "checkpointable operations. Name is not a string: %s" % name)
-    if isinstance(op, BaseSaverBuilder.SaveableObject):
-      yield op
-    elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
-      if isinstance(op, variables.PartitionedVariable):
-        op = list(op)
-      # A set of slices.
-      slice_name = None
-      # pylint: disable=protected-access
-      for variable in op:
-        if not isinstance(variable, variables.Variable):
-          raise ValueError("Slices must all be Variables: %s" % variable)
-        if not variable._save_slice_info:
-          raise ValueError("Slices must all be slices: %s" % variable)
-        if slice_name is None:
-          slice_name = variable._save_slice_info.full_name
-        elif slice_name != variable._save_slice_info.full_name:
-          raise ValueError(
-              "Slices must all be from the same tensor: %s != %s" %
-              (slice_name, variable._save_slice_info.full_name))
-        if variable.op.type in ["Variable", "VariableV2",
-                                "AutoReloadVariable"]:
-          yield BaseSaverBuilder.VariableSaveable(
-              variable, variable._save_slice_info.spec, name)
-        else:
-          yield BaseSaverBuilder.ResourceVariableSaveable(
-              variable, variable._save_slice_info.spec, name)
-      # pylint: enable=protected-access
-    elif isinstance(op, checkpointable.CheckpointableBase) and not isinstance(
-        op, variables.Variable):
-      # pylint: disable=protected-access
-      for attr, factory in op._gather_saveables_for_checkpoint().items():
-        op = (factory(name + "_" + attr) if callable(factory) else factory)
-        for op in BaseSaverBuilder.SaveableObjectsForOp(op, op.name):
-          yield op
-      # pylint: enable=protected-access
-    else:
-      # A variable or tensor.
-      if context.executing_eagerly():
-        if not isinstance(op, resource_variable_ops.ResourceVariable):
-          raise ValueError("Can only save/restore ResourceVariable eager "
-                           "mode is enabled, type: %s." % type(op))
-        yield BaseSaverBuilder.ResourceVariableSaveable(op, "", name)
-      else:
-        if isinstance(op, resource_variable_ops.ResourceVariable):
-          variable = op._graph_element  # pylint: disable=protected-access
-        else:
-          variable = ops.internal_convert_to_tensor(op, as_ref=True)
-        if not BaseSaverBuilder._IsVariable(variable):
-          raise TypeError("names_to_saveables must be a dict mapping string "
-                          "names to Tensors/Variables. Not a variable: %s" %
-                          variable)
-        if variable.op.type in ["Variable", "VariableV2",
-                                "AutoReloadVariable"]:
-          yield BaseSaverBuilder.VariableSaveable(variable, "", name)
-        else:
-          yield BaseSaverBuilder.ResourceVariableSaveable(
-              variable, "", name)
-
-  def _ValidateAndSliceInputs(self, names_to_saveables):
-    """Returns the variables and names that will be used for a Saver.
-
-    Args:
-      names_to_saveables: A dict (k, v) where k is the name of an operation and
-         v is an operation to save or a BaseSaverBuilder.Saver.
-
-    Returns:
-      A list of BaseSaverBuilder.SaveableObject objects.
-
-    Raises:
-      TypeError: If any of the keys are not strings or any of the
-        values are not one of Tensor or Variable or a checkpointable operation.
-      ValueError: If the same operation is given in more than one value
-        (this also applies to slices of SlicedVariables).
-    """
-    if not isinstance(names_to_saveables, dict):
-      names_to_saveables = BaseSaverBuilder.OpListToDict(names_to_saveables)
-
-    saveables = []
-    seen_ops = set()
-    for name, op in sorted(names_to_saveables.items(),
-                           # Avoid comparing ops, sort only by name.
-                           key=lambda x: x[0]):
-      for converted_saveable_object in self.SaveableObjectsForOp(op, name):
-        self._AddSaveable(saveables, seen_ops, converted_saveable_object)
-    return saveables
-
-  def _AddSaveable(self, saveables, seen_ops, saveable):
-    """Adds the saveable to the saveables list.
-
-    Args:
-      saveables: List to append the SaveableObject to.
-      seen_ops: Set of the ops of the saveables already processed.  Used to
-        check that each saveable is only saved once.
-      saveable: The saveable.
-
-    Raises:
-      ValueError: If the saveable has already been processed.
-    """
-    if saveable.op in seen_ops:
-      raise ValueError("The same saveable will be restored with two names: %s" %
-                       saveable.name)
-    saveables.append(saveable)
-    seen_ops.add(saveable.op)
-
   def build(self,
             names_to_saveables,
             reshape=False,
@@ -770,14 +483,19 @@ class BaseSaverBuilder(object):
       raise ValueError("save and restore operations need to be built together "
                        " when eager execution is not enabled.")
 
-    saveables = self._ValidateAndSliceInputs(names_to_saveables)
+    saveables = saveable_object_util.validate_and_slice_inputs(
+        names_to_saveables)
     if max_to_keep is None:
       max_to_keep = 0
 
     with ops.name_scope(name, "save",
                         [saveable.op for saveable in saveables]) as name:
-      # Add the Constant string tensor for the filename.
-      filename_tensor = constant_op.constant(filename or "model")
+      # Add a placeholder string tensor for the filename.
+      filename_tensor = array_ops.placeholder_with_default(
+          filename or "model", shape=(), name="filename")
+      # Keep the name "Const" for backwards compatibility.
+      filename_tensor = array_ops.placeholder_with_default(
+          filename_tensor, shape=(), name="Const")
 
       # Add the save ops.
       if sharded:
@@ -889,7 +607,7 @@ def _get_saver_or_default():
   return saver
 
 
-@tf_export("train.Saver")
+@tf_export(v1=["train.Saver"])
 class Saver(object):
   """Saves and restores variables.
 
@@ -1068,16 +786,28 @@ class Saver(object):
     @compatibility(eager)
     When eager execution is enabled, `var_list` must specify a `list` or `dict`
     of variables to save. Otherwise, a `RuntimeError` will be raised.
+
+    Although Saver works in some cases when executing eagerly, it is
+    fragile. Please switch to `tf.train.Checkpoint` or
+    `tf.keras.Model.save_weights`, which perform a more robust object-based
+    saving. These APIs will load checkpoints written by `Saver`.
     @end_compatibility
     """
     if defer_build and var_list:
       raise ValueError(
           "If `var_list` is provided then build cannot be deferred. "
           "Either set defer_build=False or var_list=None.")
-    if context.executing_eagerly() and var_list is None:
-      raise RuntimeError(
-          "When eager execution is enabled, `var_list` must specify a list or "
-          "dict of variables to save")
+    if context.executing_eagerly():
+      logging.warning(
+          "Saver is deprecated, please switch to tf.train.Checkpoint or "
+          "tf.keras.Model.save_weights for training checkpoints. When "
+          "executing eagerly variables do not necessarily have unique names, "
+          "and so the variable.name-based lookups Saver performs are "
+          "error-prone.")
+      if var_list is None:
+        raise RuntimeError(
+            "When eager execution is enabled, `var_list` must specify a list "
+            "or dict of variables to save")
     self._var_list = var_list
     self._reshape = reshape
     self._sharded = sharded
@@ -1594,7 +1324,7 @@ class Saver(object):
                                   export_scope=export_scope)
 
 
-@tf_export("train.import_meta_graph")
+@tf_export(v1=["train.import_meta_graph"])
 def import_meta_graph(meta_graph_or_file, clear_devices=False,
                       import_scope=None, **kwargs):
   """Recreates a Graph saved in a `MetaGraphDef` proto.
@@ -1647,6 +1377,37 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
   NOTE: Restarting training from saved `meta_graph` only works if the
   device assignments have not changed.
 
+  Example 2:
+  Variables, placeholders, and independent operations can also be stored, as
+  shown in the following example.
+
+  ```Python
+  # Saving contents and operations.
+  v1 = tf.placeholder(tf.float32, name="v1")
+  v2 = tf.placeholder(tf.float32, name="v2")
+  v3 = tf.mul(v1, v2)
+  vx = tf.Variable(10.0, name="vx")
+  v4 = tf.add(v3, vx, name="v4")
+  saver = tf.train.Saver([vx])
+  sess = tf.Session()
+  sess.run(tf.initialize_all_variables())
+  sess.run(vx.assign(tf.add(vx, vx)))
+  result = sess.run(v4, feed_dict={v1:12.0, v2:3.3})
+  print(result)
+  saver.save(sess, "./model_ex1")
+  ```
+
+  Later this model can be restored and contents loaded.
+
+  ```Python
+  # Restoring variables and running operations.
+  saver = tf.train.import_meta_graph("./model_ex1.meta")
+  sess = tf.Session()
+  saver.restore(sess, "./model_ex1")
+  result = sess.run("v4:0", feed_dict={"v1:0": 12.0, "v2:0": 3.3})
+  print(result)
+  ```
+
   Args:
     meta_graph_or_file: `MetaGraphDef` protocol buffer or filename (including
       the path) containing a `MetaGraphDef`.
@@ -1724,7 +1485,7 @@ def _create_saver_from_imported_meta_graph(
       return None
 
 
-@tf_export("train.export_meta_graph")
+@tf_export(v1=["train.export_meta_graph"])
 def export_meta_graph(filename=None,
                       meta_info_def=None,
                       graph_def=None,
@@ -1889,17 +1650,41 @@ def saver_from_object_based_checkpoint(
   if builder is None:
     builder = BulkSaverBuilder()
 
-  saveables = builder._ValidateAndSliceInputs(var_list)  # pylint: disable=protected-access
+  saveables = saveable_object_util.validate_and_slice_inputs(var_list)
+  current_names = set()
+  for saveable in saveables:
+    for spec in saveable.specs:
+      current_names.add(spec.name)
+  previous_names = set(names_to_keys.keys())
+  missing_names = current_names - previous_names
+  if missing_names:
+    extra_names = previous_names - current_names
+    intersecting_names = previous_names.intersection(current_names)
+    raise errors.NotFoundError(
+        None, None,
+        message=(
+            "\n\nExisting variables not in the checkpoint: %s\n\n"
+            "Variables names when this checkpoint was written which don't "
+            "exist now: %s\n\n"
+            "(%d variable name(s) did match)\n\n"
+            "Could not find some variables in the checkpoint (see names "
+            "above). Saver was attempting to load an object-based checkpoint "
+            "(saved using tf.train.Checkpoint or tf.keras.Model.save_weights) "
+            "using variable names. If the checkpoint was written with eager "
+            "execution enabled, it's possible that variable names have "
+            "changed (for example missing a '_1' suffix). It's also "
+            "possible that there are new variables which did not exist "
+            "when the checkpoint was written. You can construct a "
+            "Saver(var_list=...) with only the variables which previously "
+            "existed, and if variable names have changed you may need to "
+            "make this a dictionary with the old names as keys. If you're "
+            "using an Estimator, you'll need to return a tf.train.Saver "
+            "inside a tf.train.Scaffold from your model_fn.")
+        % (", ".join(sorted(missing_names)), ", ".join(sorted(extra_names)),
+           len(intersecting_names)))
   for saveable in saveables:
     for spec in saveable.specs:
-      if spec.name not in names_to_keys:
-        raise errors.NotFoundError(
-            None, None,
-            message=("Attempting to load an object-based checkpoint using "
-                     "variable names, but could not find %s in the "
-                     "checkpoint.") % spec.name)
       spec.name = names_to_keys[spec.name]
-
   if cached_saver is None:
     return Saver(saveables)
   return cached_saver
diff --git a/tensorflow/python/training/saver_large_partitioned_variable_test.py b/tensorflow/python/training/saver_large_partitioned_variable_test.py
index 1a44511cfeb99e350f8c3394fa51c5cfbf0f3b6c..84458836d0613ea632f3ffcd13315f4e7d7c3927 100644
--- a/tensorflow/python/training/saver_large_partitioned_variable_test.py
+++ b/tensorflow/python/training/saver_large_partitioned_variable_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
@@ -44,8 +45,12 @@ class SaverLargePartitionedVariableTest(test.TestCase):
         # split into smaller sized variables.
         init = lambda shape, dtype, partition_info: constant_op.constant(
             True, dtype, shape)
-        partitioned_var = partitioned_variables.create_partitioned_variables(
-            [1 << 31], [4], init, dtype=dtypes.bool, name=var_name)
+        partitioned_var = list(variable_scope.get_variable(
+            var_name,
+            shape=[1 << 31],
+            partitioner=partitioned_variables.fixed_size_partitioner(4),
+            initializer=init,
+            dtype=dtypes.bool))
         variables.global_variables_initializer().run()
         save = saver.Saver(partitioned_var)
         val = save.save(sess, save_path)
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 49e6e6546d29409818b49782a60e6ff7f6095726..95c21cb815fd8cf9aa5e9efb98efd6be7108f51a 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -170,6 +170,7 @@ class SaverTest(test.TestCase):
   def testResourceBasic(self):
     self.basicSaveRestore(resource_variable_ops.ResourceVariable)
 
+  @test_util.run_deprecated_v1
   def testResourceColocation(self):
     partitioner = partitioned_variables.fixed_size_partitioner(num_shards=2)
     with ops_lib.device("/job:ps/device:GPU:0"):
@@ -227,7 +228,7 @@ class SaverTest(test.TestCase):
         w1 = resource_variable_ops.ResourceVariable(1.0, name="w1")
         w2 = resource_variable_ops.ResourceVariable(2.0, name="w2")
         graph_saver = saver_module.Saver([w1, w2])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         graph_saver.save(sess, graph_ckpt_prefix)
 
     with context.eager_mode():
@@ -260,7 +261,7 @@ class SaverTest(test.TestCase):
         w3 = resource_variable_ops.ResourceVariable(0.0, name="w3")
         w4 = resource_variable_ops.ResourceVariable(0.0, name="w4")
         graph_saver = saver_module.Saver([w3, w4])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         graph_saver.restore(sess, eager_ckpt_prefix)
         self.assertAllEqual(w3.eval(), 3.0)
         self.assertAllEqual(w4.eval(), 4.0)
@@ -300,6 +301,7 @@ class SaverTest(test.TestCase):
             not op.name.startswith("saver2/save/"))]
     self.assertEqual(ops_in_saver2_scope_but_not_save_scope, [])
 
+  @test_util.run_deprecated_v1
   def testSaveCopyRestoreWithSaveRelativePaths(self):
     """Save, copy checkpoint dir and restore from copied dir.
 
@@ -326,7 +328,7 @@ class SaverTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Initialize all variables
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
       self.assertEqual(10.0, v0.eval())
@@ -369,6 +371,7 @@ class SaverTest(test.TestCase):
       self.assertEqual(b"k1", v2.keys().eval())
       self.assertEqual(30.0, v2.values().eval())
 
+  @test_util.run_deprecated_v1
   def testFilenameTensor(self):
     v0 = variables.VariableV1(0, name="v0")
     filename = b"somerandomfilename"
@@ -376,7 +379,7 @@ class SaverTest(test.TestCase):
     with self.cached_session() as sess:
       tensor = sess.graph.get_tensor_by_name(
           save.saver_def.filename_tensor_name)
-      self.assertEqual(sess.run(tensor), filename)
+      self.assertEqual(self.evaluate(tensor), filename)
 
   def testInvalidPath(self):
     v0 = variables.VariableV1(0, name="v0")
@@ -387,6 +390,7 @@ class SaverTest(test.TestCase):
             ValueError, "The passed save_path is not a valid checkpoint:"):
           save.restore(sess, "invalid path")
 
+  @test_util.run_v1_only("b/120545219")
   def testInt64(self):
     save_path = os.path.join(self.get_temp_dir(), "int64")
 
@@ -407,7 +411,7 @@ class SaverTest(test.TestCase):
 
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v" in e.message):
-        sess.run(v)
+        self.evaluate(v)
 
       # Restore the saved values in the parameter nodes.
       save.restore(sess, save_path)
@@ -462,6 +466,7 @@ class SaverTest(test.TestCase):
       # Verify non-duplicate names work.
       saver_module.Saver({"v0": v0, "v2": v2.saveable})
 
+  @test_util.run_v1_only("b/120545219")
   def testBasicsWithListOfVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "basics_with_list")
 
@@ -497,10 +502,10 @@ class SaverTest(test.TestCase):
 
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v0" in e.message):
-        sess.run(v0)
+        self.evaluate(v0)
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v1" in e.message):
-        sess.run(v1)
+        self.evaluate(v1)
       self.assertEqual(0, len(v2.keys().eval()))
       self.assertEqual(0, len(v2.values().eval()))
 
@@ -557,6 +562,7 @@ class SaverTest(test.TestCase):
     # The cached readers should know to re-read the file.
     self._SaveAndLoad("var1", 1.1, 2.2, save_path)
 
+  @test_util.run_deprecated_v1
   def testAllowEmpty(self):
     save_path = os.path.join(self.get_temp_dir(), "allow_empty")
     with self.cached_session() as sess:
@@ -661,6 +667,7 @@ class SaverTest(test.TestCase):
       self.assertAllClose(1.0, one.eval())
       self.assertAllClose([2.0, 2.0, 2.0], twos.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testReshape(self):
     save_path = os.path.join(self.get_temp_dir(), "variables_reshape")
     with session.Session("", graph=ops_lib.Graph()) as sess:
@@ -719,6 +726,7 @@ class SaverTest(test.TestCase):
   def testSaveWithGlobalStepWithPadding(self):
     self.testSaveWithGlobalStep(pad_step_number=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testSaveToNonexistingPath(self):
     file_io.write_string_to_file(
         os.path.join(self.get_temp_dir(), "actually_a_file"), "")
@@ -742,7 +750,7 @@ class SaverTest(test.TestCase):
       try:
         with self.cached_session() as sess:
           # Initialize all variables
-          sess.run(init_all_op)
+          self.evaluate(init_all_op)
 
           # Check that the parameter nodes have been initialized.
           self.assertEqual(10.0, v0.eval())
@@ -761,6 +769,7 @@ class SaverTest(test.TestCase):
         error_msg_template = "Parent directory of {} doesn't exist, can't save."
         self.assertEqual(error_msg_template.format(save_path), str(exc))
 
+  @test_util.run_deprecated_v1
   def testSaveToURI(self):
     # ParseURI functions don't work on Windows yet.
     # TODO(jhseu): Remove this check when it works.
@@ -777,7 +786,7 @@ class SaverTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Initialize all variables
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
       self.assertEqual(10.0, v0.eval())
@@ -824,19 +833,19 @@ class SaverTest(test.TestCase):
     save_graph = ops_lib.Graph()
     with save_graph.as_default(), self.session(graph=save_graph) as sess:
       orig_vars = _model()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       save = saver_module.Saver(max_to_keep=1)
       variables.global_variables_initializer().run()
       save.save(sess, save_dir)
-      orig_vals = sess.run(orig_vars)
+      orig_vals = self.evaluate(orig_vars)
 
     restore_graph = ops_lib.Graph()
-    with restore_graph.as_default(), self.test_session(
+    with restore_graph.as_default(), self.session(
         graph=restore_graph) as sess:
       restored_vars = _model()
       save = saver_module.Saver(max_to_keep=1)
       save.restore(sess, save_dir)
-      restored_vals = sess.run(restored_vars)
+      restored_vals = self.evaluate(restored_vars)
 
     for orig, restored in zip(orig_vals, restored_vals):
       self.assertAllEqual(orig, restored)
@@ -982,6 +991,7 @@ class SaveRestoreShardedTest(test.TestCase):
           checkpoint_management.latest_checkpoint(self.get_temp_dir()),
           os.path.join(self.get_temp_dir(), "sharded_basics"))
 
+  @test_util.run_deprecated_v1
   def testSaverDef(self):
     with self.cached_session():
       v0 = variables.VariableV1(123, name="v0")
@@ -998,19 +1008,12 @@ class SaveRestoreShardedTest(test.TestCase):
 
     call_saver_with_dict = False  # updated by test loop below
 
-    def _save(slices=None, partitioner=None):
+    def _save(partitioner=None):
       with self.session(graph=ops_lib.Graph()) as sess:
         # Calls .eval() to return the ndarray that makes up the full variable.
         rnd = random_ops.random_uniform(var_full_shape).eval()
 
-        if slices:
-          assert not partitioner
-          # TODO(apassos): make create_partitioned_variables take use_resource
-          # option to make this test passable without creating a named
-          # variable_scope.
-          vs = partitioned_variables.create_partitioned_variables(
-              var_full_shape, slices, rnd, name=var_name)
-        elif partitioner:
+        if partitioner:
           vs = [
               variable_scope.get_variable(
                   var_name,
@@ -1027,7 +1030,7 @@ class SaveRestoreShardedTest(test.TestCase):
 
         variables.global_variables_initializer().run()
         if call_saver_with_dict:
-          saver = saver_module.Saver({var_name: (vs if slices else vs[0])})
+          saver = saver_module.Saver({var_name: vs[0]})
         else:
           saver = saver_module.Saver(vs)
         actual_path = saver.save(sess, saved_path)
@@ -1035,16 +1038,9 @@ class SaveRestoreShardedTest(test.TestCase):
 
         return rnd
 
-    def _restore(slices=None, partitioner=None):
+    def _restore(partitioner=None):
       with self.session(graph=ops_lib.Graph()) as sess:
-        if slices:
-          assert not partitioner
-          new_vs = partitioned_variables.create_partitioned_variables(
-              var_full_shape,
-              slices,
-              array_ops.zeros(var_full_shape),  # != original contents.
-              name=var_name)
-        elif partitioner:
+        if partitioner:
           new_vs = [
               variable_scope.get_variable(
                   var_name,
@@ -1063,7 +1059,7 @@ class SaveRestoreShardedTest(test.TestCase):
         variables.global_variables_initializer().run()
         if call_saver_with_dict:
           saver = saver_module.Saver({
-              var_name: (new_vs if slices else new_vs[0])
+              var_name: new_vs[0]
           })
         else:
           saver = saver_module.Saver(new_vs)
@@ -1071,11 +1067,7 @@ class SaveRestoreShardedTest(test.TestCase):
 
         if partitioner:
           return new_vs[0].as_tensor().eval()
-        elif slices and slices[0] != 1:
-          return array_ops.concat(new_vs, 0).eval()
-        elif slices and slices[1] != 1:
-          return array_ops.concat(new_vs, 1).eval()
-        else:  # Non-sliced.
+        else:
           return new_vs[0].eval()
 
     for call_saver_with_dict in {False, True}:
@@ -1086,32 +1078,30 @@ class SaveRestoreShardedTest(test.TestCase):
       restored_full = _restore()
       self.assertAllEqual(saved_full, restored_full)
 
-      # Saves 10 horizontal parts of a partitioned variable.
-      # Restores into a full variable, non-sliced.
-      saved_full = _save(slices=[10, 1])
-      restored_full = _restore()
-      self.assertAllEqual(saved_full, restored_full)
-
-      # Restores into a different number/orientation of slices.
-      restored_full = _restore(slices=[2, 1])  # 2 horizon parts.
-      self.assertAllEqual(saved_full, restored_full)
-      restored_full = _restore(slices=[1, 3])  # 3 vertical parts.
+      # Restores into the same number of partitions.
+      restored_full = _restore(
+          partitioner=partitioned_variables.fixed_size_partitioner(
+              num_shards=2))
       self.assertAllEqual(saved_full, restored_full)
 
-      # Restores into a PartitionedVariable
+      # Restores into a different number of partitions.
       restored_full = _restore(
           partitioner=partitioned_variables.fixed_size_partitioner(
-              num_shards=2))
+              num_shards=3))
       self.assertAllEqual(saved_full, restored_full)
 
-      # Now, saves a full variable and restores in slices.
+      # Now, saves a full variable and restores PartitionedVariable.
       saved_full = _save()
-      restored_full = _restore(slices=[1, 3])
+      restored_full = _restore(
+          partitioner=partitioned_variables.fixed_size_partitioner(
+              num_shards=3))
       self.assertAllEqual(saved_full, restored_full)
 
+  @test_util.run_deprecated_v1
   def testPartitionedVariable(self):
     self._testPartitionedVariables(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testPartitionedResourceVariable(self):
     self._testPartitionedVariables(use_resource=True)
 
@@ -1206,6 +1196,7 @@ class MaxToKeepTest(test.TestCase):
       # Deleted by the first helper.
       self.assertFalse(checkpoint_management.checkpoint_exists(s3))
 
+  @test_util.run_deprecated_v1
   def testNonSharded(self):
     save_dir = self._get_test_dir("max_to_keep_non_sharded")
 
@@ -1443,6 +1434,7 @@ class MaxToKeepTest(test.TestCase):
       self.assertTrue(
           gfile.Exists(checkpoint_management.meta_graph_filename(s3)))
 
+  @test_util.run_deprecated_v1
   def testNoMaxToKeep(self):
     save_dir = self._get_test_dir("no_max_to_keep")
     save_dir2 = self._get_test_dir("max_to_keep_0")
@@ -1471,6 +1463,7 @@ class MaxToKeepTest(test.TestCase):
       self.assertEqual([], save2.last_checkpoints)
       self.assertTrue(checkpoint_management.checkpoint_exists(s2))
 
+  @test_util.run_deprecated_v1
   def testNoMetaGraph(self):
     save_dir = self._get_test_dir("no_meta_graph")
 
@@ -1494,6 +1487,7 @@ class KeepCheckpointEveryNHoursTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   @test.mock.patch.object(saver_module, "time")
+  @test_util.run_deprecated_v1
   def testNonSharded(self, mock_time):
     save_dir = self._get_test_dir("keep_checkpoint_every_n_hours")
 
@@ -1613,6 +1607,7 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
       self.assertEqual(20.0, self.evaluate(v1))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testNonReshapeResourceVariable(self):
     self._testNonReshape(resource_variable_ops.ResourceVariable)
 
@@ -1627,6 +1622,7 @@ class MetaGraphTest(test.TestCase):
     gfile.MakeDirs(test_dir)
     return test_dir
 
+  @test_util.run_v1_only("b/120545219")
   def testAddCollectionDef(self):
     test_dir = self._get_test_dir("good_collection")
     filename = os.path.join(test_dir, "metafile")
@@ -1769,18 +1765,20 @@ class MetaGraphTest(test.TestCase):
       self.assertEqual([], v1.get_shape())
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v1" in e.message):
-        sess.run(v1)
+        self.evaluate(v1)
       # Retrieves saver1. Verifies that new_saver1 can restore v1.
       new_saver1 = savers[1]
       new_saver1.restore(sess, saver1_ckpt)
       v1 = sess.graph.get_tensor_by_name("v1:0")
       self.assertEqual(11.0, v1.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultiSaverCollection(self):
     test_dir = self._get_test_dir("saver_collection")
     self._testMultiSaverCollectionSave(test_dir)
     self._testMultiSaverCollectionRestore(test_dir)
 
+  @test_util.run_v1_only("b/120545219")
   def testClearExtraneousSavers(self):
     test_dir = self._get_test_dir("clear_extraneous_savers")
     filename = os.path.join(test_dir, "metafile")
@@ -1832,9 +1830,10 @@ class MetaGraphTest(test.TestCase):
       self.assertEqual(1, len(savers.value))
 
       # Verifies that saver0 graph nodes are omitted from the saver1 export
-      self.assertEqual(29, len(meta_graph_def0.graph_def.node))
-      self.assertEqual(19, len(meta_graph_def1.graph_def.node))
+      self.assertEqual(33, len(meta_graph_def0.graph_def.node))
+      self.assertEqual(21, len(meta_graph_def1.graph_def.node))
 
+  @test_util.run_deprecated_v1
   def testBinaryAndTextFormat(self):
     test_dir = self._get_test_dir("binary_and_text")
     filename = os.path.join(test_dir, "metafile")
@@ -1867,6 +1866,7 @@ class MetaGraphTest(test.TestCase):
                                                lambda e: "does not exist"):
         saver_module.import_meta_graph(filename)
 
+  @test_util.run_v1_only("b/120545219")
   def testSliceVariable(self):
     test_dir = self._get_test_dir("slice_saver")
     filename = os.path.join(test_dir, "metafile")
@@ -1949,9 +1949,9 @@ class MetaGraphTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Initializes all the variables.
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
       # Runs to logit.
-      sess.run(logits)
+      self.evaluate(logits)
       # Creates a saver.
       saver0 = saver_module.Saver()
       saver0.save(sess, saver0_ckpt)
@@ -1991,7 +1991,7 @@ class MetaGraphTest(test.TestCase):
       ops_lib.add_to_collection("train_op", train_op)
 
       # Runs train_op.
-      sess.run(train_op)
+      self.evaluate(train_op)
 
       # Generates MetaGraphDef.
       saver_module.export_meta_graph(train_filename)
@@ -2005,8 +2005,9 @@ class MetaGraphTest(test.TestCase):
       # Restores from checkpoint.
       new_saver.restore(sess, saver0_ckpt)
       train_op = ops_lib.get_collection("train_op")[0]
-      sess.run(train_op)
+      self.evaluate(train_op)
 
+  @test_util.run_deprecated_v1
   def testGraphExtension(self):
     test_dir = self._get_test_dir("graph_extension")
     self._testGraphExtensionSave(test_dir)
@@ -2037,8 +2038,8 @@ class MetaGraphTest(test.TestCase):
 
       # Generate a MetaGraphDef containing the while loop.
       with session.Session() as sess:
-        sess.run(init_op)
-        sess.run(output)
+        self.evaluate(init_op)
+        self.evaluate(output)
         saver = saver_module.Saver()
         saver.save(sess, saver_ckpt)
         saver.export_meta_graph(filename)
@@ -2053,8 +2054,8 @@ class MetaGraphTest(test.TestCase):
       no_constfold_config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
       with session.Session(config=no_constfold_config) as sess:
-        sess.run(init_op)
-        expected_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        expected_grad_value = self.evaluate(grad)
 
     # Restore the MetaGraphDef into a new Graph.
     with ops_lib.Graph().as_default():
@@ -2070,8 +2071,8 @@ class MetaGraphTest(test.TestCase):
       init_op = variables.global_variables_initializer()
 
       with session.Session(config=no_constfold_config) as sess:
-        sess.run(init_op)
-        actual_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        actual_grad_value = self.evaluate(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
   def _testWhileLoopAndGradientSerDes(self, outer_body_fn):
@@ -2092,6 +2093,7 @@ class MetaGraphTest(test.TestCase):
       return i + 1, x + r
     self._testWhileLoopAndGradientSerDes(body)
 
+  @test_util.run_deprecated_v1
   def testNestedControlFlowSerDes(self):
     # Test while loop in a cond in a while loop.
     # pylint: disable=g-long-lambda
@@ -2120,6 +2122,7 @@ class MetaGraphTest(test.TestCase):
                                       lambda: math_ops.multiply(x, -1.0))))
     # pylint: enable=g-long-lambda
 
+  @test_util.run_v1_only("b/120545219")
   def testStrippedOpListDef(self):
     with self.cached_session():
       # Creates a graph.
@@ -2140,13 +2143,14 @@ class MetaGraphTest(test.TestCase):
       ops = [o.name for o in meta_graph_def.meta_info_def.stripped_op_list.op]
       if save._write_version is saver_pb2.SaverDef.V1:
         self.assertEqual(ops, [
-            "Add", "Assign", "Const", "Identity", "NoOp", "RestoreV2",
-            "SaveSlices", "Sub", "VariableV2"
+            "Add", "Assign", "Const", "Identity", "NoOp",
+            "PlaceholderWithDefault", "RestoreV2", "SaveSlices", "Sub",
+            "VariableV2"
         ])
       else:
         self.assertEqual(ops, [
-            "Add", "Assign", "Const", "Identity", "NoOp", "RestoreV2", "SaveV2",
-            "Sub", "VariableV2"
+            "Add", "Assign", "Const", "Identity", "NoOp",
+            "PlaceholderWithDefault", "RestoreV2", "SaveV2", "Sub", "VariableV2"
         ])
 
       # Test calling stripped_op_list_for_graph directly
@@ -2156,6 +2160,7 @@ class MetaGraphTest(test.TestCase):
         self.assertEqual(o.summary, "")
         self.assertEqual(o.description, "")
 
+  @test_util.run_deprecated_v1
   def testStripDefaultValuedAttrs(self):
     """Verifies that default valued attrs are stripped, unless disabled."""
 
@@ -2192,6 +2197,7 @@ class MetaGraphTest(test.TestCase):
       self.assertIn("T", node_def.attr)
       self.assertIn("Tout", node_def.attr)
 
+  @test_util.run_deprecated_v1
   def testImportIntoNamescope(self):
     # Test that we can import a meta graph into a namescope.
     test_dir = self._get_test_dir("import_into_namescope")
@@ -2208,7 +2214,7 @@ class MetaGraphTest(test.TestCase):
                                                       logits=logit, name="cost")
       adam.AdamOptimizer().minimize(cost, name="optimize")
       saver = saver_module.Saver()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver.save(sess, filename)
 
     graph = ops_lib.Graph()
@@ -2245,7 +2251,7 @@ class MetaGraphTest(test.TestCase):
 
       # Create a variable in graph_2 under scope "my_scope".
       variables.VariableV1(array_ops.zeros([10]), name="my_scope/my_var")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       # Restore the checkpoint into a different scope "subgraph_2".
       new_saver_2 = saver_module.import_meta_graph(
           filename + ".meta", graph=graph_2, import_scope="subgraph_2")
@@ -2262,6 +2268,7 @@ class MetaGraphTest(test.TestCase):
           filename + ".meta", graph=graph_2, import_scope="my_scope")
       self.assertIsInstance(new_saver_3, saver_module.Saver)
 
+  @test_util.run_deprecated_v1
   def testImportIntoImplicitNamescope(self):
     # Test that we can import a meta graph into an implicit namescope.
     test_dir = self._get_test_dir("import_into_namescope")
@@ -2278,7 +2285,7 @@ class MetaGraphTest(test.TestCase):
                                                       logits=logit, name="cost")
       adam.AdamOptimizer().minimize(cost, name="optimize")
       saver = saver_module.Saver()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver.save(sess, filename)
 
     graph = ops_lib.Graph()
@@ -2315,12 +2322,12 @@ class MetaGraphTest(test.TestCase):
           meta_graph_def, clear_devices=False, import_scope="new_model")
       # Device refers to GPU, which is not available here.
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
 
     with session.Session(graph=ops_lib.Graph()) as sess:
       saver_module.import_meta_graph(
           meta_graph_def, clear_devices=True, import_scope="new_model")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(["new_model/optimize"], {
           "new_model/image:0": np.random.random([1, 784]),
           "new_model/label:0": np.random.randint(
@@ -2347,7 +2354,7 @@ class MetaGraphTest(test.TestCase):
 
     with session.Session(graph=ops_lib.Graph()) as sess:
       saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(["new_model/optimize"], {
           "new_model/image:0": np.random.random([1, 784]),
           "new_model/label:0": np.random.randint(
@@ -2357,7 +2364,7 @@ class MetaGraphTest(test.TestCase):
   def testPreserveDatasetAndFunctions(self):
     with ops_lib.Graph().as_default() as g:
       dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
       _ = array_ops.identity(next_element, name="output")
 
@@ -2373,7 +2380,7 @@ class MetaGraphTest(test.TestCase):
                            meta_graph_def_from_graph_def]:
       with session.Session(graph=ops_lib.Graph()) as sess:
         saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in range(10):
           self.assertEqual(i * i, sess.run("new_model/output:0"))
         with self.assertRaises(errors.OutOfRangeError):
@@ -2384,6 +2391,7 @@ class CheckpointReaderTest(test.TestCase):
 
   _WRITE_VERSION = saver_pb2.SaverDef.V1
 
+  @test_util.run_deprecated_v1
   def testDebugString(self):
     # Builds a graph.
     v0 = variables.VariableV1(
@@ -2399,7 +2407,7 @@ class CheckpointReaderTest(test.TestCase):
     save_path = os.path.join(self.get_temp_dir(),
                              "ckpt_for_debug_string" + str(self._WRITE_VERSION))
     with self.cached_session() as sess:
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
       # Saves a checkpoint.
       save.save(sess, save_path)
 
@@ -2545,7 +2553,7 @@ class ScopedGraphTest(test.TestCase):
       self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))
 
     with self.session(graph=graph) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
       saver.save(sess, os.path.join(test_dir, ckpt_filename), write_state=False)
 
@@ -2608,13 +2616,14 @@ class ScopedGraphTest(test.TestCase):
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
       saver.restore(sess, os.path.join(test_dir, ckpt_filename))
       # Verify that we have restored weights1 and biases1.
-      sess.run([weights1, biases1])
+      self.evaluate([weights1, biases1])
       # Initialize the rest of the variables and run logits.
-      sess.run(init_rest_op)
-      sess.run(logits)
+      self.evaluate(init_rest_op)
+      self.evaluate(logits)
 
   # Verifies that we can save the subgraph under "hidden1" and restore it
   # into "new_hidden1" in the new graph.
+  @test_util.run_deprecated_v1
   def testScopedSaveAndRestore(self):
     test_dir = self._get_test_dir("scoped_export_import")
     ckpt_filename = "ckpt"
@@ -2624,6 +2633,7 @@ class ScopedGraphTest(test.TestCase):
 
   # Verifies that we can copy the subgraph under "hidden1" and copy it
   # to different name scope in the same graph or different graph.
+  @test_util.run_deprecated_v1
   def testCopyScopedGraph(self):
     test_dir = self._get_test_dir("scoped_copy")
     saver0_ckpt = os.path.join(test_dir, "saver0.ckpt")
@@ -2639,7 +2649,7 @@ class ScopedGraphTest(test.TestCase):
 
     # Run the graph and save scoped checkpoint.
     with self.session(graph=graph1) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       _, var_list_1 = meta_graph.export_scoped_meta_graph(
           export_scope="hidden1")
       saver = saver_module.Saver(var_list=var_list_1, max_to_keep=1)
@@ -2680,6 +2690,7 @@ class ScopedGraphTest(test.TestCase):
       saver3.restore(sess, saver0_ckpt)
       self.assertAllClose(expected, sess.run("new_hidden1/relu:0"))
 
+  @test_util.run_deprecated_v1
   def testExportGraphDefWithScope(self):
     test_dir = self._get_test_dir("export_graph_def")
     saver0_ckpt = os.path.join(test_dir, "saver0.ckpt")
@@ -2695,7 +2706,7 @@ class ScopedGraphTest(test.TestCase):
 
     # Run the graph and save scoped checkpoint.
     with self.session(graph=graph1) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       _, var_list_1 = meta_graph.export_scoped_meta_graph(
           graph_def=graph1.as_graph_def(), export_scope="hidden1")
       saver = saver_module.Saver(var_list=var_list_1, max_to_keep=1)
@@ -2716,6 +2727,7 @@ class ScopedGraphTest(test.TestCase):
       saver3.restore(sess, saver0_ckpt)
       self.assertAllClose(expected, sess.run("new_hidden1/relu:0"))
 
+  @test_util.run_deprecated_v1
   def testSerializeSaverWithScope(self):
     test_dir = self._get_test_dir("export_graph_def")
     saver1_ckpt = os.path.join(test_dir, "saver1.ckpt")
@@ -2963,7 +2975,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
     a_saver = saver_module.Saver([a])
     b_saver = saver_module.Saver([b])
     with self.cached_session() as sess:
-      sess.run(a.initializer)
+      self.evaluate(a.initializer)
       save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
       with self.assertRaisesRegexp(
           errors.NotFoundError, "Key b not found in checkpoint"):
@@ -2976,6 +2988,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
       # exception" block in Python 3.
       self.assertNotIn("NewCheckpointReader", cs.exception.message)
 
+  @test_util.run_v1_only("b/120545219")
   def testGraphChangedForRestoreErrorRaised(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -2985,7 +2998,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
       a_saver = saver_module.Saver([a])
 
       with self.session(graph=g) as sess:
-        sess.run(a.initializer)
+        self.evaluate(a.initializer)
         save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
 
     with ops_lib.Graph().as_default() as g:
@@ -2997,6 +3010,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
             "a mismatch between the current graph and the graph"):
           a_saver.restore(sess=sess, save_path=save_path)
 
+  @test_util.run_v1_only("b/120545219")
   def testLoadFromObjectBasedGraph(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -3015,7 +3029,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
           checkpoint_directory, "second"))
 
     restore_graph = ops_lib.Graph()
-    with restore_graph.as_default(), self.test_session(
+    with restore_graph.as_default(), self.session(
         graph=restore_graph) as sess:
       root = self._initialized_model()
       self._set_sentinels(root)
@@ -3028,7 +3042,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
       self.assertEqual(before_second_restore_ops,
                        restore_graph.get_operations())
       with self.assertRaisesRegexp(errors.NotFoundError,
-                                   "could not find a_variable"):
+                                   "Could not find some variables"):
         saver.restore(sess=sess, save_path=second_path)
 
   def testLoadFromObjectBasedEager(self):
diff --git a/tensorflow/python/training/saving/BUILD b/tensorflow/python/training/saving/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..67ccd59b88c289a11791c9098a2014c48e6c33fb
--- /dev/null
+++ b/tensorflow/python/training/saving/BUILD
@@ -0,0 +1,55 @@
+# Description:
+#   Low-level utilities for reading and writing checkpoints.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+py_library(
+    name = "functional_saver",
+    srcs = ["functional_saver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":saveable_object",
+        ":saveable_object_util",
+        "//tensorflow/python/eager:def_function",
+    ],
+)
+
+cuda_py_test(
+    name = "functional_saver_test",
+    size = "medium",
+    srcs = [
+        "functional_saver_test.py",
+    ],
+    additional_deps = [
+        ":functional_saver",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "saveable_object",
+    srcs = ["saveable_object.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "saveable_object_util",
+    srcs = ["saveable_object_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/training/checkpointable:base",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/python/training/saving/functional_saver.py b/tensorflow/python/training/saving/functional_saver.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eed3336626ef63942a40702f9787e6b5847b97b
--- /dev/null
+++ b/tensorflow/python/training/saving/functional_saver.py
@@ -0,0 +1,101 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Saves and restore variables inside traced @tf.functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.saving import saveable_object_util
+
+
+class Saver(object):
+  """A minimal utility class for saving and restoring checkpoints.
+
+  Note that this is a low-level utility which stores Tensors in the keys
+  specified by `SaveableObject`s. Higher-level utilities for object-based
+  checkpointing are built on top of it.
+  """
+
+  def __init__(self, saveable_objects):
+    """Specify a list of `SaveableObject`s to save and restore.
+
+    Args:
+      saveable_objects: A list of `SaveableObject`s.
+    """
+    saveable_objects = list(saveable_objects)
+    for saveable in saveable_objects:
+      if not isinstance(saveable, saveable_object.SaveableObject):
+        raise ValueError(
+            "Saver expected a list of SaveableObjects, got %s." % (saveable,))
+    self._saveable_objects = saveable_objects
+
+  # TODO(b/120569892): Use tf.function here
+  def save(self, file_prefix):
+    """Save the saveable objects to a checkpoint with `file_prefix`.
+
+    Args:
+      file_prefix: A string or scalar string Tensor containing the prefix to
+        save under.
+    Returns:
+      A scalar string Tensor containing `file_prefix` with control dependencies
+      on the save ops.
+    """
+    tensor_names = []
+    tensors = []
+    tensor_slices = []
+    for saveable in self._saveable_objects:
+      for spec in saveable.specs:
+        tensor_names.append(spec.name)
+        tensors.append(spec.tensor)
+        tensor_slices.append(spec.slice_spec)
+    with ops.control_dependencies(
+        [io_ops.save_v2(file_prefix, tensor_names, tensor_slices, tensors)]):
+      return array_ops.identity(file_prefix)
+
+  # TODO(b/120569892): Use tf.function here
+  def restore(self, file_prefix):
+    """Restore the saveable objects from a checkpoint with `file_prefix`.
+
+    Args:
+      file_prefix: A string or scalar string Tensor containing the prefix for
+        files to read from.
+
+    Returns:
+      An operation which restores the `Saver`'s `SaveableObject`s when run, or
+      None if executing eagerly.
+    """
+    restore_ops = []
+    for saveable in self._saveable_objects:
+      if saveable.device:
+        device = saveable_object_util.set_cpu0(saveable.device)
+      else:
+        device = None
+      with ops.device(device):
+        tensors = []
+        for spec in saveable.specs:
+          tensors.append(
+              io_ops.restore_v2(
+                  file_prefix,
+                  [spec.name],
+                  [spec.slice_spec],
+                  [spec.dtype])[0])
+        restore_ops.append(saveable.restore(tensors, restored_shapes=None))
+    return control_flow_ops.group(restore_ops)
diff --git a/tensorflow/python/training/saving/functional_saver_test.py b/tensorflow/python/training/saving/functional_saver_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..40002255aacd4b3579bab6ea44bc9e5ee98f9177
--- /dev/null
+++ b/tensorflow/python/training/saving/functional_saver_test.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests for the functional saver."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training.saving import functional_saver
+from tensorflow.python.training.saving import saveable_object_util
+
+
+class SaverTest(test.TestCase):
+
+  def test_resource_variable(self):
+    v1 = resource_variable_ops.ResourceVariable(2.)
+    saver = functional_saver.Saver(
+        saveable_object_util.saveable_objects_for_op(v1, "x"))
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    save_path = saver.save(constant_op.constant(prefix))
+    v1.assign(1.)
+    saver.restore(save_path)
+    self.assertEqual(2., self.evaluate(v1))
+
+    v2 = resource_variable_ops.ResourceVariable(3.)
+    second_saver = functional_saver.Saver(
+        saveable_object_util.saveable_objects_for_op(v2, "x"))
+    second_saver.restore(save_path)
+    self.assertEqual(2., self.evaluate(v2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/saveable_object.py b/tensorflow/python/training/saving/saveable_object.py
similarity index 100%
rename from tensorflow/python/training/saveable_object.py
rename to tensorflow/python/training/saving/saveable_object.py
diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa88d2c6ebd2f29c2d2de7583a918dcbc6b28b51
--- /dev/null
+++ b/tensorflow/python/training/saving/saveable_object_util.py
@@ -0,0 +1,340 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for working with and creating SaveableObjects."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.saving import saveable_object
+
+
+# Op names which identify variable reads which should be saved.
+_VARIABLE_OPS = set(["Variable",
+                     "VariableV2",
+                     "AutoReloadVariable",
+                     "VarHandleOp",
+                     "ReadVariableOp"])
+
+
+def set_cpu0(device_string):
+  """Creates a new device string based on `device_string` but using /CPU:0.
+
+  If the device is already on /CPU:0, this is a no-op.
+
+  Args:
+    device_string: A device string.
+
+  Returns:
+    A device string.
+  """
+  parsed_device = pydev.DeviceSpec.from_string(device_string)
+  parsed_device.device_type = "CPU"
+  parsed_device.device_index = 0
+  return parsed_device.to_string()
+
+
+class ReferenceVariableSaveable(saveable_object.SaveableObject):
+  """SaveableObject implementation that handles reference variables."""
+
+  def __init__(self, var, slice_spec, name):
+    spec = saveable_object.SaveSpec(var, slice_spec, name, dtype=var.dtype)
+    super(ReferenceVariableSaveable, self).__init__(var, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    restored_tensor = restored_tensors[0]
+    if restored_shapes is not None:
+      restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
+    return state_ops.assign(
+        self.op,
+        restored_tensor,
+        validate_shape=restored_shapes is None and
+        self.op.get_shape().is_fully_defined())
+
+
+class ResourceVariableSaveable(saveable_object.SaveableObject):
+  """SaveableObject implementation that handles ResourceVariables."""
+
+  def __init__(self, var, slice_spec, name):
+    self._var_device = var.device
+    self._var_shape = var.shape
+    if isinstance(var, ops.Tensor):
+      self.handle_op = var.op.inputs[0]
+      tensor = var
+    elif isinstance(var, resource_variable_ops.ResourceVariable):
+
+      def _read_variable_closure(v):
+        def f():
+          with ops.device(v.device):
+            x = v.read_value()
+            # To allow variables placed on non-CPU devices to be checkpointed,
+            # we copy them to CPU on the same machine first.
+            with ops.device("/device:CPU:0"):
+              return array_ops.identity(x)
+        return f
+
+      self.handle_op = var.handle
+      tensor = _read_variable_closure(var)
+    else:
+      raise ValueError(
+          "Saveable is neither a resource variable nor a read operation."
+          " Got: %s" % repr(var))
+    spec = saveable_object.SaveSpec(tensor, slice_spec, name,
+                                    dtype=var.dtype)
+    super(ResourceVariableSaveable, self).__init__(var, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    restored_tensor = restored_tensors[0]
+    if restored_shapes is not None:
+      restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
+    # Copy the restored tensor to the variable's device.
+    with ops.device(self._var_device):
+      restored_tensor = array_ops.identity(restored_tensor)
+      return resource_variable_ops.shape_safe_assign_variable_handle(
+          self.handle_op, self._var_shape, restored_tensor)
+
+
+def _tensor_comes_from_variable(v):
+  return isinstance(v, ops.Tensor) and v.op.type in _VARIABLE_OPS
+
+
+def saveable_objects_for_op(op, name):
+  """Create `SaveableObject`s from an operation.
+
+  Args:
+    op: A variable, operation, or SaveableObject to coerce into a
+      SaveableObject.
+    name: A string name for the SaveableObject.
+
+  Yields:
+    `SaveableObject`s which together save/restore `op`.
+
+  Raises:
+    TypeError: If `name` is not a string.
+    ValueError: For operations with no known conversion to SaveableObject.
+  """
+  if not isinstance(name, six.string_types):
+    raise TypeError(
+        "names_to_saveables must be a dict mapping string names to "
+        "checkpointable operations. Name is not a string: %s" % name)
+  if isinstance(op, saveable_object.SaveableObject):
+    yield op
+  elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
+    if isinstance(op, variables.PartitionedVariable):
+      op = list(op)
+    # A set of slices.
+    slice_name = None
+    # pylint: disable=protected-access
+    for variable in op:
+      if not isinstance(variable, variables.Variable):
+        raise ValueError("Slices must all be Variables: %s" % variable)
+      if not variable._save_slice_info:
+        raise ValueError("Slices must all be slices: %s" % variable)
+      if slice_name is None:
+        slice_name = variable._save_slice_info.full_name
+      elif slice_name != variable._save_slice_info.full_name:
+        raise ValueError(
+            "Slices must all be from the same tensor: %s != %s" %
+            (slice_name, variable._save_slice_info.full_name))
+      if variable.op.type in ["Variable", "VariableV2",
+                              "AutoReloadVariable"]:
+        yield ReferenceVariableSaveable(
+            variable, variable._save_slice_info.spec, name)
+      else:
+        yield ResourceVariableSaveable(
+            variable, variable._save_slice_info.spec, name)
+    # pylint: enable=protected-access
+  elif isinstance(op, checkpointable.CheckpointableBase) and not isinstance(
+      op, variables.Variable):
+    # pylint: disable=protected-access
+    for attr, factory in op._gather_saveables_for_checkpoint().items():
+      if attr == checkpointable.VARIABLE_VALUE_KEY:
+        # Keep original name for classes masquerading as variables.
+        full_name = name
+      else:
+        full_name = name + "_" + attr
+      op = (factory(full_name) if callable(factory) else factory)
+      for op in saveable_objects_for_op(op, op.name):
+        yield op
+    # pylint: enable=protected-access
+  else:
+    # A variable or tensor.
+    if isinstance(op, resource_variable_ops.ResourceVariable):
+      # pylint: disable=protected-access
+      if op._in_graph_mode:
+        variable = op._graph_element
+      else:
+        variable = op
+      # pylint: enable=protected-access
+      yield ResourceVariableSaveable(variable, "", name)
+    else:
+      with ops.init_scope():
+        if context.executing_eagerly():
+          raise ValueError("Can only save/restore ResourceVariables when "
+                           "executing eagerly, got type: %s." % type(op))
+
+      variable = ops.internal_convert_to_tensor(op, as_ref=True)
+      if not _tensor_comes_from_variable(variable):
+        raise TypeError("names_to_saveables must be a dict mapping string "
+                        "names to Tensors/Variables. Not a variable: %s" %
+                        variable)
+      if variable.op.type in ["Variable", "VariableV2",
+                              "AutoReloadVariable"]:
+        yield ReferenceVariableSaveable(variable, "", name)
+      else:
+        yield ResourceVariableSaveable(
+            variable, "", name)
+
+
+def op_list_to_dict(op_list, convert_variable_to_tensor=True):
+  """Create a dictionary of names to operation lists.
+
+  Args:
+    op_list: A list, tuple, or set of Variables or SaveableObjects.
+    convert_variable_to_tensor: Whether or not to convert single Variables
+      with no slice info into Tensors.
+
+  Returns:
+    A dictionary of names to the operations that must be saved under
+    that name.  Variables with save_slice_info are grouped together under the
+    same key in no particular order.
+
+  Raises:
+    TypeError: If the type of op_list or its elements is not supported.
+    ValueError: If at least two saveables share the same name.
+  """
+  if not isinstance(op_list, (list, tuple, set)):
+    raise TypeError("Variables to save should be passed in a dict or a "
+                    "list: %s" % op_list)
+  # When ResourceVariables are converted to Tensors, read ops are added to the
+  # graph. Sorting the op_list ensures that the resulting graph is always
+  # constructed in a deterministic way:
+  op_list = sorted(op_list, key=lambda x: x.name)
+  names_to_saveables = {}
+  # pylint: disable=protected-access
+  for var in op_list:
+    if isinstance(var, saveable_object.SaveableObject):
+      names_to_saveables[var.name] = var
+    elif isinstance(var, variables.PartitionedVariable):
+      if var.name in names_to_saveables:
+        raise ValueError("At least two variables have the same name: %s" %
+                         var.name)
+      names_to_saveables[var.name] = var
+    elif isinstance(var, variables.Variable) and var._save_slice_info:
+      name = var._save_slice_info.full_name
+      if name in names_to_saveables:
+        if not isinstance(names_to_saveables[name], list):
+          raise ValueError("Mixing slices and non-slices with the same name: "
+                           "%s" % name)
+        names_to_saveables[name].append(var)
+      else:
+        names_to_saveables[name] = [var]
+    elif (isinstance(var, checkpointable.CheckpointableBase)
+          and not isinstance(var, variables.Variable)):
+      checkpointable_saveables = [
+          (factory() if callable(factory) else factory)
+          for factory in var._gather_saveables_for_checkpoint().values()]
+      names_to_saveables.update(
+          op_list_to_dict(checkpointable_saveables))
+    else:
+      if context.executing_eagerly():
+        if not isinstance(var, resource_variable_ops.ResourceVariable):
+          raise ValueError(
+              "Can only save/restore ResourceVariables when eager execution "
+              "is enabled, type: %s." % type(var))
+        set_var = names_to_saveables.setdefault(var._shared_name, var)
+        if set_var is not var:
+          raise ValueError(
+              ("Two different ResourceVariable objects with the same "
+               "shared_name '%s' were passed to the Saver. This likely means "
+               "that they were created in different Graphs or isolation "
+               "contexts, and may not be checkpointed together.") %
+              (var._shared_name,))
+      else:
+        if convert_variable_to_tensor:
+          if isinstance(var, resource_variable_ops.ResourceVariable):
+            var = var._graph_element  # pylint: disable=protected-access
+          else:
+            var = ops.internal_convert_to_tensor(var, as_ref=True)
+          if not _tensor_comes_from_variable(var):
+            raise TypeError("Variable to save is not a Variable: %s" % var)
+        if var.op.type == "ReadVariableOp":
+          name = var.op.inputs[0].op.name
+        else:
+          name = var.op.name
+        if name in names_to_saveables:
+          raise ValueError("At least two variables have the same name: %s" %
+                           name)
+        names_to_saveables[name] = var
+
+    # pylint: enable=protected-access
+  return names_to_saveables
+
+
+def _add_saveable(saveables, seen_ops, saveable):
+  """Adds the saveable to the saveables list.
+
+  Args:
+    saveables: List to append the SaveableObject to.
+    seen_ops: Set of the ops of the saveables already processed.  Used to
+      check that each saveable is only saved once.
+    saveable: The saveable.
+
+  Raises:
+    ValueError: If the saveable has already been processed.
+  """
+  if saveable.op in seen_ops:
+    raise ValueError("The same saveable will be restored with two names: %s" %
+                     saveable.name)
+  saveables.append(saveable)
+  seen_ops.add(saveable.op)
+
+
+def validate_and_slice_inputs(names_to_saveables):
+  """Returns the variables and names that will be used for a Saver.
+
+  Args:
+    names_to_saveables: A dict (k, v) where k is the name of an operation and
+       v is an operation to save or a BaseSaverBuilder.Saver.
+
+  Returns:
+    A list of SaveableObjects.
+
+  Raises:
+    TypeError: If any of the keys are not strings or any of the
+      values are not one of Tensor or Variable or a checkpointable operation.
+    ValueError: If the same operation is given in more than one value
+      (this also applies to slices of SlicedVariables).
+  """
+  if not isinstance(names_to_saveables, dict):
+    names_to_saveables = op_list_to_dict(names_to_saveables)
+
+  saveables = []
+  seen_ops = set()
+  for name, op in sorted(names_to_saveables.items(),
+                         # Avoid comparing ops, sort only by name.
+                         key=lambda x: x[0]):
+    for converted_saveable_object in saveable_objects_for_op(op, name):
+      _add_saveable(saveables, seen_ops, converted_saveable_object)
+  return saveables
diff --git a/tensorflow/python/training/server_lib.i b/tensorflow/python/training/server_lib.i
deleted file mode 100644
index 94250304f853ba1f942506bcfb3240a4adab3797..0000000000000000000000000000000000000000
--- a/tensorflow/python/training/server_lib.i
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-%nothread tensorflow::ServerInterface::Join;
-
-%include "tensorflow/python/platform/base.i"
-
-//%newobject tensorflow::NewServer;
-
-%typemap(in) const ServerDef& (tensorflow::ServerDef temp) {
-  char* c_string;
-  Py_ssize_t py_size;
-  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
-    // Python has raised an error (likely TypeError or UnicodeEncodeError).
-    SWIG_fail;
-  }
-
-  if (!temp.ParseFromString(string(c_string, py_size))) {
-    PyErr_SetString(
-        PyExc_TypeError,
-        "The ServerDef could not be parsed as a valid protocol buffer");
-    SWIG_fail;
-  }
-  $1 = &temp;
-}
-
-%typemap(in, numinputs=0)
-    std::unique_ptr<tensorflow::ServerInterface>* out_server (
-        std::unique_ptr<tensorflow::ServerInterface> temp) {
-  $1 = &temp;
-}
-
-%typemap(argout) std::unique_ptr<tensorflow::ServerInterface>* out_server {
-  // TODO(mrry): Convert this to SWIG_POINTER_OWN when the issues with freeing
-  // a server are fixed.
-  $result = SWIG_NewPointerObj($1->release(),
-                               $descriptor(tensorflow::ServerInterface*),
-                               0);
-}
-
-%feature("except") tensorflow::ServerInterface::Join {
-  // Let other threads run while we wait for the server to shut down.
-  Py_BEGIN_ALLOW_THREADS
-  $action
-  Py_END_ALLOW_THREADS
-}
-
-%{
-#include "tensorflow/c/tf_status_helper.h"
-#include "tensorflow/core/distributed_runtime/server_lib.h"
-#include "tensorflow/core/lib/core/status.h"
-
-using tensorflow::ServerDef;
-
-static void PyServer_New(const ServerDef& server_def,
-                         std::unique_ptr<tensorflow::ServerInterface>* out_server,
-                         TF_Status* out_status) {
-  tensorflow::Status status =
-      tensorflow::NewServer(server_def, out_server);
-  tensorflow::Set_TF_Status_from_Status(out_status, status);
-}
-
-static void PyServer_Start(
-    tensorflow::ServerInterface* in_server,
-    TF_Status* out_status) {
-  tensorflow::Set_TF_Status_from_Status(out_status, in_server->Start());
-}
-
-static void PyServer_Stop(
-    tensorflow::ServerInterface* in_server,
-    TF_Status* out_status) {
-  tensorflow::Set_TF_Status_from_Status(out_status, in_server->Stop());
-}
-
-static void PyServer_Join(
-    tensorflow::ServerInterface* in_server,
-    TF_Status* out_status) {
-  tensorflow::Set_TF_Status_from_Status(out_status, in_server->Join());
-}
-%}
-
-// Wrap this function.
-void PyServer_New(const ServerDef& server_def,
-                  std::unique_ptr<tensorflow::ServerInterface>* out_server,
-                  TF_Status* out_status);
-void PyServer_Start(tensorflow::ServerInterface* in_server,
-                    TF_Status* out_status);
-void PyServer_Stop(tensorflow::ServerInterface* in_server,
-                   TF_Status* out_status);
-void PyServer_Join(tensorflow::ServerInterface* in_server,
-                   TF_Status* out_status);
-
-%ignoreall
-
-%unignore tensorflow;
-%unignore tensorflow::ServerDef;
-%unignore tensorflow::ServerInterface;
-%unignore tensorflow::ServerInterface::~ServerInterface;
-%unignore tensorflow::ServerInterface::target;
-
-%unignore PyServer_New;
-%unignore PyServer_Start;
-%unignore PyServer_Stop;
-%unignore PyServer_Join;
-
-%include "tensorflow/core/distributed_runtime/server_lib.h"
-
-%unignoreall
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 46543413e40a5a212b180b0cdeb2280148d606c5..302ca2dd44b99d2a5cfeffa163d95634513f9eaa 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import tensorflow_server_pb2
-from tensorflow.python import pywrap_tensorflow
+from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
@@ -143,12 +143,24 @@ class Server(object):
     """
     self._server_def = _make_server_def(server_or_cluster_def,
                                         job_name, task_index, protocol, config)
-    with errors.raise_exception_on_not_ok_status() as status:
-      self._server = pywrap_tensorflow.PyServer_New(
-          self._server_def.SerializeToString(), status)
+    self._server = c_api.TF_NewServer(self._server_def.SerializeToString())
     if start:
       self.start()
 
+  def __del__(self):
+    try:
+      c_api.TF_ServerStop(self._server)
+      # Clean shutdown of servers is not yet implemented, so
+      # we leak instead of calling c_api.TF_DeleteServer here.
+      # See:
+      # https://github.com/tensorflow/tensorflow/blob/0495317a6e9dd4cac577b9d5cf9525e62b571018/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h#L73
+    except errors.UnimplementedError:
+      pass
+    except AttributeError:
+      # At shutdown, `c_api` may have been garbage collected.
+      pass
+    self._server = None
+
   def start(self):
     """Starts this server.
 
@@ -156,8 +168,7 @@ class Server(object):
       tf.errors.OpError: Or one of its subclasses if an error occurs while
         starting the TensorFlow server.
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.PyServer_Start(self._server, status)
+    c_api.TF_ServerStart(self._server)
 
   def join(self):
     """Blocks until the server has shut down.
@@ -168,8 +179,7 @@ class Server(object):
       tf.errors.OpError: Or one of its subclasses if an error occurs while
         joining the TensorFlow server.
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.PyServer_Join(self._server, status)
+    c_api.TF_ServerJoin(self._server)
 
   @property
   def server_def(self):
@@ -198,7 +208,7 @@ class Server(object):
     Returns:
       A string containing a session target for this server.
     """
-    return self._server.target()
+    return c_api.TF_ServerTarget(self._server)
 
   @staticmethod
   def create_local_server(config=None, start=True):
diff --git a/tensorflow/python/training/server_lib_multiple_containers_test.py b/tensorflow/python/training/server_lib_multiple_containers_test.py
index f599e9b55b9f8d6ac3b66a9c72e6c33c7b127c58..fb6118942bdb7332a1f96a25927d4958796b6ba2 100644
--- a/tensorflow/python/training/server_lib_multiple_containers_test.py
+++ b/tensorflow/python/training/server_lib_multiple_containers_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.client import session
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -33,6 +34,7 @@ class MultipleContainersTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_deprecated_v1
   def testMultipleContainers(self):
     with ops.container("test0"):
       v0 = variables.Variable(1.0, name="v0")
diff --git a/tensorflow/python/training/server_lib_same_variables_clear_container_test.py b/tensorflow/python/training/server_lib_same_variables_clear_container_test.py
index 11e6f28ab05b5d7e7ca8b90a0407f1dbdb283738..e0ab21bbd979ab8c7e6d825573c584325bcdaf7b 100644
--- a/tensorflow/python/training/server_lib_same_variables_clear_container_test.py
+++ b/tensorflow/python/training/server_lib_same_variables_clear_container_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -32,6 +33,7 @@ class SameVariablesClearContainerTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_deprecated_v1
   def testSameVariablesClearContainer(self):
     # Starts two servers with different names so they map to different
     # resource "containers".
@@ -60,9 +62,9 @@ class SameVariablesClearContainerTest(test.TestCase):
     session.Session.reset(server0.target, ["local0"])
     sess = session.Session(server0.target)
     with self.assertRaises(errors_impl.FailedPreconditionError):
-      sess.run(v0)
+      self.evaluate(v0)
     # Reinitializes v0 for the following test.
-    sess.run(v0.initializer)
+    self.evaluate(v0.initializer)
 
     # Verifies that v1 is still valid.
     self.assertAllEqual(2.0, sess_1.run(v1))
@@ -71,10 +73,10 @@ class SameVariablesClearContainerTest(test.TestCase):
     session.Session.reset(server1.target, ["local1"])
     sess = session.Session(server1.target)
     with self.assertRaises(errors_impl.FailedPreconditionError):
-      sess.run(v1)
+      self.evaluate(v1)
     # Verifies that v0 is still valid.
     sess = session.Session(server0.target)
-    self.assertAllEqual(1.0, sess.run(v0))
+    self.assertAllEqual(1.0, self.evaluate(v0))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/server_lib_same_variables_clear_test.py b/tensorflow/python/training/server_lib_same_variables_clear_test.py
index 4682f1ab84d719cafd1d94669a9ee3ca5f1797fc..7b147af6c55894575e4f98436daaa3f3f33bd16c 100644
--- a/tensorflow/python/training/server_lib_same_variables_clear_test.py
+++ b/tensorflow/python/training/server_lib_same_variables_clear_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -32,6 +33,7 @@ class SameVariablesClearTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_deprecated_v1
   def testSameVariablesClear(self):
     server = server_lib.Server.create_local_server()
 
diff --git a/tensorflow/python/training/server_lib_same_variables_no_clear_test.py b/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
index 5aa7f45c2b350a795016ed645a981e34f7626561..ff3fab9f372aecae28adf84a3d800759e3487665 100644
--- a/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
+++ b/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -33,6 +34,7 @@ class SameVariablesNoClearTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_v1_only("b/120545219")
   def testSameVariablesNoClear(self):
     server = server_lib.Server.create_local_server()
 
diff --git a/tensorflow/python/training/server_lib_sparse_job_test.py b/tensorflow/python/training/server_lib_sparse_job_test.py
index 1a6b44b90e8d4d4c3faf9f0ac596942a7ff3d09f..93b06e621608f0754fd4560ec4faa6c530209ac7 100644
--- a/tensorflow/python/training/server_lib_sparse_job_test.py
+++ b/tensorflow/python/training/server_lib_sparse_job_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -30,13 +31,14 @@ class SparseJobTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_deprecated_v1
   def testSparseJob(self):
     server = server_lib.Server({"local": {37: "localhost:0"}})
     with ops.device("/job:local/task:37"):
       a = constant_op.constant(1.0)
 
     with session.Session(server.target) as sess:
-      self.assertEqual(1.0, sess.run(a))
+      self.assertEqual(1.0, self.evaluate(a))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index cf995707fc56448e7fe5354d162581947604f382..92cdc1c4ad0832fc3f8593bebabe76d4e6dc0cc0 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -55,6 +56,7 @@ class GrpcServerTest(test.TestCase):
       self.assertAllEqual([[4]], sess.run(e))
     # TODO(mrry): Add `server.stop()` and `server.join()` when these work.
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleSessions(self):
     server = self._cached_server
 
@@ -73,6 +75,7 @@ class GrpcServerTest(test.TestCase):
     # TODO(mrry): Add `server.stop()` and `server.join()` when these work.
 
   # Verifies various reset failures.
+  @test_util.run_v1_only("b/120545219")
   def testResetFails(self):
     # Creates variable with container name.
     with ops.container("test0"):
@@ -146,6 +149,7 @@ class GrpcServerTest(test.TestCase):
       self.assertEqual(0.5, min_val)
       self.assertEqual(0.5, max_val)
 
+  @test_util.run_v1_only("b/120545219")
   def testCloseCancelsBlockingOperation(self):
     server = self._cached_server
     sess = session.Session(server.target, config=self._useRPCConfig())
@@ -174,7 +178,7 @@ class GrpcServerTest(test.TestCase):
     # is not supported, but it should successfully ignore it.
     sess = session.InteractiveSession(server.target)
     c = constant_op.constant(42.0)
-    self.assertEqual(42.0, c.eval())
+    self.assertEqual(42.0, self.evaluate(c))
     sess.close()
 
   def testSetConfiguration(self):
@@ -207,6 +211,7 @@ class GrpcServerTest(test.TestCase):
               "local": ["localhost"]
           }, job_name="local", task_index=0)
 
+  @test_util.run_v1_only("b/120545219")
   def testTimeoutRaisesException(self):
     server = self._cached_server
     q = data_flow_ops.FIFOQueue(1, [dtypes.float32])
@@ -241,6 +246,7 @@ class GrpcServerTest(test.TestCase):
       queue_runner_impl.start_queue_runners(sess)
       sess.run(var.assign(3.0))
 
+  @test_util.run_v1_only("b/120545219")
   def testIsolateSessionState(self):
     server = self._cached_server
 
@@ -296,6 +302,7 @@ class GrpcServerTest(test.TestCase):
     self.assertAllEqual(37, isolate_sess_0.run(v))
     self.assertAllEqual([19, 86], isolate_sess_1.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testShapeChangingIsolateState(self):
     server = self._cached_server
     sharing_config = config_pb2.ConfigProto(isolate_session_state=False)
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index cd313c2ce053cdecd6b7856f55eb8969d31eac5a..0f68fcfe8bb4cb81e54ba27d35bfb0b2e3888a1b 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -25,7 +25,6 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -46,7 +45,7 @@ def _maybe_name(obj):
     return "<no name for %s>" % type(obj)
 
 
-@tf_export("train.SessionManager")
+@tf_export(v1=["train.SessionManager"])
 class SessionManager(object):
   """Training helper that restores from checkpoint and creates session.
 
@@ -183,12 +182,6 @@ class SessionManager(object):
     """
     self._target = master
     sess = session.Session(self._target, graph=self._graph, config=config)
-    # TODO(jhseu): Delete once tpu.initialize_system() goes away.
-    initialize_ops = (
-        distribution_strategy_context.get_distribution_strategy().initialize()
-    )
-    if initialize_ops:
-      sess.run(initialize_ops)
 
     if checkpoint_dir and checkpoint_filename_with_path:
       raise ValueError("Can not provide both checkpoint_dir and "
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index 2b5c3b01defeedd6f59fd9b1eee9385d3101b584..c9a0c56ffc1e78f1f654b4ec224bf8480d53ad9b 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variables
@@ -68,6 +69,7 @@ class SessionManagerTest(test.TestCase):
           "", init_fn=lambda sess: sess.run(v.initializer))
       self.assertAllClose([125], sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionFails(self):
     checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
     checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
@@ -152,6 +154,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSession(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
@@ -184,6 +187,7 @@ class SessionManagerTest(test.TestCase):
           checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
               checkpoint_dir))
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionReturnsNoneAfterTimeout(self):
     with ops.Graph().as_default():
       variables.VariableV1(1, name="v")
@@ -206,6 +210,7 @@ class SessionManagerTest(test.TestCase):
               variables.global_variables()),
           local_init_op=None)
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionWithReadyForLocalInitOp(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(self.get_temp_dir(),
@@ -259,6 +264,7 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(v))
       self.assertEquals(1, sess.run(w))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
     # We use ready_for_local_init_op=tf.report_uninitialized_variables(),
     # which causes recover_session to not run local_init_op, and to return
@@ -315,6 +321,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionNoChkptStillRunsLocalInitOp(self):
     # This test checks for backwards compatibility.
     # In particular, we continue to ensure that recover_session will execute
@@ -343,6 +350,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
       self.assertEquals(1, sess.run(w))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionFailsStillRunsLocalInitOp(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(
@@ -386,6 +394,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
       self.assertEquals(1, sess.run(w))
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionLocalInit(self):
     server = server_lib.Server.create_local_server()
     with ops.Graph().as_default() as graph:
@@ -437,6 +446,7 @@ class SessionManagerTest(test.TestCase):
         # because of overly restrictive ready_for_local_init_op
         sm.wait_for_session("", max_wait_secs=3)
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
     with ops.Graph().as_default() as graph:
       v = variables.VariableV1(1, name="v")
@@ -454,6 +464,7 @@ class SessionManagerTest(test.TestCase):
                                  "Session was not ready after waiting.*"):
       sm.wait_for_session("", max_wait_secs=3)
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithReadyForLocalInitOp(self):
     with ops.Graph().as_default():
       v = variables.VariableV1(1, name="v")
@@ -493,6 +504,7 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(w))
       self.assertEquals(3, sess.run(x))
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithPartialInitOp(self):
     with ops.Graph().as_default():
       v = variables.VariableV1(1, name="v")
@@ -559,6 +571,7 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(w_res))
       self.assertEquals(3, sess.run(x_res))
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithCyclicInitializer(self):
     # Regression test. Previously Variable._build_initializer_expr would enter
     # into an infinite recursion when the variable's initial_value involved
@@ -632,6 +645,7 @@ class SessionManagerTest(test.TestCase):
           "Init operations did not make model ready for local_init"):
         sm2.prepare_session("", init_op=None)
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithInsufficientReadyForLocalInitCheck(self):
     with ops.Graph().as_default():
       v = variables.VariableV1(1, name="v")
@@ -684,6 +698,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
           "", init_fn=lambda sess: sess.run(v.initializer))
       self.assertAllClose([125], sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionFails(self):
     checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
     checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
@@ -745,6 +760,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
           variables.is_variable_initialized(
               sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSession(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
@@ -783,6 +799,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionReturnsNoneAfterTimeout(self):
     with ops.Graph().as_default():
       variables.VariableV1(1, name="v")
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index 5daea9312886599f4119b088096434a8b2a258de..e9a61def7430fec0190c8f7b788fd7b72492e432 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -186,7 +186,7 @@ class SessionRunHook(object):
     pass
 
 
-@tf_export("train.SessionRunArgs")
+@tf_export(v1=["train.SessionRunArgs"])
 class SessionRunArgs(
     collections.namedtuple("SessionRunArgs",
                            ["fetches", "feed_dict", "options"])):
@@ -211,7 +211,7 @@ class SessionRunArgs(
     return super(SessionRunArgs, cls).__new__(cls, fetches, feed_dict, options)
 
 
-@tf_export("train.SessionRunContext")
+@tf_export(v1=["train.SessionRunContext"])
 class SessionRunContext(object):
   """Provides information about the `session.run()` call being made.
 
@@ -263,7 +263,7 @@ class SessionRunContext(object):
     self._stop_requested = True
 
 
-@tf_export("train.SessionRunValues")
+@tf_export(v1=["train.SessionRunValues"])
 class SessionRunValues(
     collections.namedtuple("SessionRunValues",
                            ["results", "options", "run_metadata"])):
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index d76b22acd86956e9b7bbd768299e3db7f630a4d5..ecf5a96ed49146fe4cafce6a809925aab5bdc6fb 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -39,13 +39,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context
 
 
 def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
diff --git a/tensorflow/python/training/slot_creator_test.py b/tensorflow/python/training/slot_creator_test.py
index 6d6364169fd4b9afa6f64fb9aadc283aab261cbb..f1f0d58a6913a542093ada7a948969f47928a43b 100644
--- a/tensorflow/python/training/slot_creator_test.py
+++ b/tensorflow/python/training/slot_creator_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
@@ -31,6 +32,7 @@ from tensorflow.python.training import slot_creator
 
 class SlotCreatorTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testCreateSlotFromVariable(self):
     with self.cached_session():
       v = variables.Variable([1.0, 2.5], name="var")
@@ -41,8 +43,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float32, slot.dtype.base_dtype)
-      self.assertAllEqual([1.0, 2.5], slot.eval())
+      self.assertAllEqual([1.0, 2.5], self.evaluate(slot))
 
+  @test_util.run_deprecated_v1
   def testCreateSlotFromTensor(self):
     with self.cached_session():
       v = constant_op.constant([1.0, 2.5], name="const")
@@ -53,8 +56,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float32, slot.dtype.base_dtype)
-      self.assertAllEqual([2.0, 5.0], slot.eval())
+      self.assertAllEqual([2.0, 5.0], self.evaluate(slot))
 
+  @test_util.run_deprecated_v1
   def testCreateZerosSlotFromVariable(self):
     with self.cached_session():
       v = variables.Variable([1.0, 2.5], name="var")
@@ -67,8 +71,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
+  @test_util.run_v1_only("b/120545219")
   def testCreateZerosSlotFromDynamicShapedVariable(self):
     with self.cached_session():
       dyn_shape = constant_op.constant([2], dtype=dtypes.int32)
@@ -88,8 +93,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
+  @test_util.run_deprecated_v1
   def testCreateZerosSlotFromTensor(self):
     with self.cached_session():
       v = constant_op.constant([1.0, 2.5], name="const")
@@ -101,8 +107,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float32, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
+  @test_util.run_deprecated_v1
   def testCreateZerosSlotFromDynamicShapedTensor(self):
     with self.cached_session():
       v = random_ops.random_uniform([2], dtype=dtypes.float64)
@@ -116,8 +123,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
+  @test_util.run_v1_only("b/120545219")
   def testCreateSlotFromVariableRespectsScope(self):
     # See discussion on #2740.
     with self.cached_session():
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index a5e626d3204f2e9a2993c07df6044ba99df0f68f..de60dd456ff81884398ba16abd03bdfde267d6f4 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -40,7 +40,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.Supervisor")
+@tf_export(v1=["train.Supervisor"])
 class Supervisor(object):
   """A training helper that checkpoints models and computes summaries.
 
diff --git a/tensorflow/python/training/supervisor_test.py b/tensorflow/python/training/supervisor_test.py
index 7cd99d86801e659b369419796848babb49ac9ff4..180ddb52876635c584a12aad26c3703f0fae9d9a 100644
--- a/tensorflow/python/training/supervisor_test.py
+++ b/tensorflow/python/training/supervisor_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
@@ -100,7 +101,7 @@ class SupervisorTest(test.TestCase):
       sv = supervisor.Supervisor(logdir=logdir)
       sess = sv.prepare_or_wait_for_session("")
       for _ in xrange(10):
-        sess.run(my_op)
+        self.evaluate(my_op)
       sess.close()
       sv.stop()
 
@@ -111,7 +112,7 @@ class SupervisorTest(test.TestCase):
       sv = supervisor.Supervisor(logdir=logdir)
       with sv.managed_session("") as sess:
         for _ in xrange(10):
-          sess.run(my_op)
+          self.evaluate(my_op)
       # Supervisor has been stopped.
       self.assertTrue(sv.should_stop())
 
@@ -128,7 +129,7 @@ class SupervisorTest(test.TestCase):
             if step == 1:
               raise RuntimeError("failing here")
             else:
-              sess.run(my_op)
+              self.evaluate(my_op)
       # Supervisor has been stopped.
       self.assertTrue(sv.should_stop())
       self.assertEqual(1, last_step)
@@ -146,7 +147,7 @@ class SupervisorTest(test.TestCase):
             raise errors_impl.OutOfRangeError(my_op.op.node_def, my_op.op,
                                               "all done")
           else:
-            sess.run(my_op)
+            self.evaluate(my_op)
       # Supervisor has been stopped.  OutOfRangeError was not thrown.
       self.assertTrue(sv.should_stop())
       self.assertEqual(3, last_step)
@@ -335,7 +336,7 @@ class SupervisorTest(test.TestCase):
       sess = sv.prepare_or_wait_for_session(
           "", config=config_pb2.ConfigProto(device_count={"CPU": 2}))
       for _ in xrange(10):
-        sess.run(my_op)
+        self.evaluate(my_op)
       sess.close()
       sv.stop()
 
@@ -420,6 +421,7 @@ class SupervisorTest(test.TestCase):
       with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"):
         sv.summary_computed(sess, sess.run(summ))
 
+  @test_util.run_v1_only("b/120545219")
   def testLogdirButExplicitlyNoSummaryWriter(self):
     logdir = self._test_dir("explicit_no_summary_writer")
     with ops.Graph().as_default():
@@ -505,6 +507,7 @@ class SupervisorTest(test.TestCase):
       sv = supervisor.Supervisor(logdir="", session_manager=sm)
       sv.prepare_or_wait_for_session("")
 
+  @test_util.run_v1_only("b/120545219")
   def testInitOp(self):
     logdir = self._test_dir("default_init_op")
     with ops.Graph().as_default():
@@ -514,6 +517,7 @@ class SupervisorTest(test.TestCase):
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
       sv.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testInitFn(self):
     logdir = self._test_dir("default_init_op")
     with ops.Graph().as_default():
@@ -527,6 +531,7 @@ class SupervisorTest(test.TestCase):
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
       sv.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testInitOpWithFeedDict(self):
     logdir = self._test_dir("feed_dict_init_op")
     with ops.Graph().as_default():
@@ -540,6 +545,7 @@ class SupervisorTest(test.TestCase):
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
       sv.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testReadyForLocalInitOp(self):
     server = server_lib.Server.create_local_server()
     logdir = self._test_dir("default_ready_for_local_init_op")
@@ -582,6 +588,7 @@ class SupervisorTest(test.TestCase):
     sv0.stop()
     sv1.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testReadyForLocalInitOpRestoreFromCheckpoint(self):
     server = server_lib.Server.create_local_server()
     logdir = self._test_dir("ready_for_local_init_op_restore")
@@ -713,6 +720,7 @@ class SupervisorTest(test.TestCase):
                                    "Variables not initialized: w"):
         sv.prepare_or_wait_for_session(server.target)
 
+  @test_util.run_v1_only("b/120545219")
   def testSetupFail(self):
     logdir = self._test_dir("setup_fail")
     with ops.Graph().as_default():
@@ -723,6 +731,7 @@ class SupervisorTest(test.TestCase):
       variables.VariableV1([1.0, 2.0, 3.0], name="v")
       supervisor.Supervisor(logdir=logdir, is_chief=False)
 
+  @test_util.run_v1_only("b/120545219")
   def testDefaultGlobalStep(self):
     logdir = self._test_dir("default_global_step")
     with ops.Graph().as_default():
@@ -732,6 +741,7 @@ class SupervisorTest(test.TestCase):
       self.assertEquals(287, sess.run(sv.global_step))
       sv.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testRestoreFromMetaGraph(self):
     logdir = self._test_dir("restore_from_meta_graph")
     with ops.Graph().as_default():
@@ -753,6 +763,7 @@ class SupervisorTest(test.TestCase):
   # This test is based on the fact that the standard services start
   # right away and get to run once before sv.stop() returns.
   # We still sleep a bit to make the test robust.
+  @test_util.run_v1_only("b/120545219")
   def testStandardServicesWithoutGlobalStep(self):
     logdir = self._test_dir("standard_services_without_global_step")
     # Create a checkpoint.
@@ -799,10 +810,11 @@ class SupervisorTest(test.TestCase):
       v = variables.VariableV1([10.10], name="foo")
       sav = saver_lib.Saver([v])
       sav.restore(sess, save_path)
-      self.assertEqual(1.0, v.eval()[0])
+      self.assertEqual(1.0, self.evaluate(v)[0])
 
   # Same as testStandardServicesNoGlobalStep but with a global step.
   # We should get a summary about the step time.
+  @test_util.run_v1_only("b/120545219")
   def testStandardServicesWithGlobalStep(self):
     logdir = self._test_dir("standard_services_with_global_step")
     # Create a checkpoint.
@@ -863,7 +875,7 @@ class SupervisorTest(test.TestCase):
       v = variables.VariableV1([-12], name="global_step")
       sav = saver_lib.Saver([v])
       sav.restore(sess, save_path)
-      self.assertEqual(123, v.eval()[0])
+      self.assertEqual(123, self.evaluate(v)[0])
 
   def testNoQueueRunners(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index 6a3756fba9fd97b9f2916075f606119360342f5b..cd4590db7f6550f8790ad683c9aaecf145ad12da 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -31,6 +32,7 @@ from tensorflow.python.training import optimizer
 from tensorflow.python.training import queue_runner
 from tensorflow.python.training import session_manager
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -39,10 +41,13 @@ from tensorflow.python.util.tf_export import tf_export
 # rate according to the number of replicas. This change is introduced to be
 # consistent with how gradients are aggregated (averaged) within a batch in a
 # replica.
-@tf_export("train.SyncReplicasOptimizer")
+@tf_export(v1=["train.SyncReplicasOptimizer"])
 class SyncReplicasOptimizer(optimizer.Optimizer):
   """Class to synchronize, aggregate gradients and pass them to the optimizer.
 
+  This class is deprecated. For synchrononous training, please use [Distribution
+  Strategies](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/distribute).
+
   In a typical asynchronous training environment, it's common to have some
   stale gradients. For example, with a N-replica asynchronous training,
   gradients will be applied to the variables N times independently. Depending
@@ -139,6 +144,12 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
   ```
   """
 
+  @deprecation.deprecated(
+      None,
+      "The `SyncReplicaOptimizer` class is deprecated. For synchrononous "
+      "training, please use [Distribution Strategies](https://github.com/"
+      "tensorflow/tensorflow/tree/master/tensorflow/contrib/distribute).",
+      warn_once=True)
   def __init__(self,
                opt,
                replicas_to_aggregate,
@@ -249,7 +260,9 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
     # local_anchor op will be placed on this worker task by default.
     local_anchor = control_flow_ops.no_op()
     # Colocating local_step variable prevents it being placed on the PS.
-    with ops.colocate_with(local_anchor):
+    distribution_strategy = (
+        distribution_strategy_context.get_distribution_strategy())
+    with distribution_strategy.colocate_vars_with(local_anchor):
       self._local_step = variable_scope.variable(
           initial_value=0,
           trainable=False,
diff --git a/tensorflow/python/training/sync_replicas_optimizer_test.py b/tensorflow/python/training/sync_replicas_optimizer_test.py
index 1ef8756ef671b652e2fb1b7616d813db7089fec2..428583d048ab30c8ccad0a5e32b47455c5c9bc3c 100644
--- a/tensorflow/python/training/sync_replicas_optimizer_test.py
+++ b/tensorflow/python/training/sync_replicas_optimizer_test.py
@@ -22,6 +22,7 @@ import time
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.test_util import create_local_cluster
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -88,6 +89,7 @@ class SyncReplicasOptimizerTest(test.TestCase):
   def _run(self, train_op, sess):
     sess.run(train_op)
 
+  @test_util.run_v1_only("b/120545219")
   def test2Workers(self):
     num_workers = 2
     replicas_to_aggregate = 2
@@ -178,6 +180,7 @@ class SyncReplicasOptimizerTest(test.TestCase):
                         sessions[1].run(var_1_g_1))
 
   # 3 workers and one of them is backup.
+  @test_util.run_v1_only("b/120545219")
   def test3Workers1Backup(self):
     num_workers = 3
     replicas_to_aggregate = 2
@@ -266,6 +269,7 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
                                  "apply_gradient should be called"):
       hook.begin()
 
+  @test_util.run_v1_only("b/120545219")
   def testCanCreatedBeforeMinimizeCalled(self):
     """This behavior is required to be integrated with Estimators."""
     opt = training.SyncReplicasOptimizer(
@@ -278,6 +282,7 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
     opt.minimize(v, global_step=global_step)
     hook.begin()
 
+  @test_util.run_v1_only("b/120545219")
   def testFetchVariableList(self):
     opt = training.SyncReplicasOptimizer(
         opt=adam.AdamOptimizer(0.01),
diff --git a/tensorflow/python/training/tensorboard_logging_test.py b/tensorflow/python/training/tensorboard_logging_test.py
index 5af6a0aa7b430cd6dc3d2e9f54392cf9ffafa63a..5088ab07e5e387c880aadc8de7385b53df911a29 100644
--- a/tensorflow/python/training/tensorboard_logging_test.py
+++ b/tensorflow/python/training/tensorboard_logging_test.py
@@ -25,6 +25,7 @@ import tempfile
 import time
 
 from tensorflow.core.util import event_pb2
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary_iterator
@@ -32,6 +33,7 @@ from tensorflow.python.summary.writer import writer
 from tensorflow.python.training import tensorboard_logging
 
 
+@test_util.run_v1_only("b/120545219")
 class EventLoggingTest(test.TestCase):
 
   def setUp(self):
@@ -85,6 +87,7 @@ class EventLoggingTest(test.TestCase):
                                   (event_pb2.LogMessage.ERROR, "format")])
     self.assertEqual(2, self.logged_message_count)
 
+  @test_util.run_v1_only("b/120545219")
   def testVerbosity(self):
     tensorboard_logging.set_summary_writer(self._sw)
     tensorboard_logging.set_verbosity(tensorboard_logging.ERROR)
@@ -112,6 +115,7 @@ class EventLoggingTest(test.TestCase):
     tensorboard_logging.warn("this should work")
     self.assertEqual(1, self.logged_message_count)
 
+  @test_util.run_v1_only("b/120545219")
   def testSummaryWriterFailsAfterClear(self):
     tensorboard_logging._clear_summary_writer()
     with self.assertRaises(RuntimeError):
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index 686c4be31ae35c7201a4e7e38c9e5f97028dc26c..ae71a628c1f9e1e7e86a25cbcacab0bd400ed279 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -138,7 +138,7 @@ tf_export("train.FeatureLists")(FeatureLists)
 tf_export("train.FloatList")(FloatList)
 tf_export("train.Int64List")(Int64List)
 tf_export("train.JobDef")(JobDef)
-tf_export("train.SaverDef")(SaverDef)
+tf_export(v1=["train.SaverDef"])(SaverDef)
 tf_export("train.SequenceExample")(SequenceExample)
 tf_export("train.ServerDef")(ServerDef)
 # pylint: enable=undefined-variable
diff --git a/tensorflow/python/training/training_ops_test.py b/tensorflow/python/training/training_ops_test.py
index f410ceaafffdb0171f65febf48bf77ed0ca7274c..ba0f40999b48ffb8411c2cd0e7f4608f84ff292b 100644
--- a/tensorflow/python/training/training_ops_test.py
+++ b/tensorflow/python/training/training_ops_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.test_util import TensorFlowTestCase
 # Import resource_variable_ops for the variables-to-tensor implicit conversion.
 from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
@@ -50,15 +51,16 @@ class TrainingOpsTest(TensorFlowTestCase):
 
   def _testTypes(self, x, alpha, delta, use_gpu=None):
     self.setUp()
-    with self.test_session(use_gpu=use_gpu):
+    with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
       variables.global_variables_initializer().run()
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
-      out = apply_sgd.eval()
+      out = self.evaluate(apply_sgd)
       self.assertShapeEqual(out, apply_sgd)
       self.assertAllCloseAccordingToType(x - alpha * delta, out)
 
+  @test_util.run_v1_only("b/120545219")
   def testApplyGradientDescent(self):
     for (dtype, use_gpu) in itertools.product(
         [np.float16, np.float32, np.float64], [False, True]):
@@ -69,18 +71,18 @@ class TrainingOpsTest(TensorFlowTestCase):
 
   def _testTypesForAdagrad(self, x, y, lr, grad, use_gpu=None):
     self.setUp()
-    with self.test_session(use_gpu=use_gpu):
+    with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_adagrad = training_ops.apply_adagrad(var, accum, lr, grad)
-      out = apply_adagrad.eval()
+      out = self.evaluate(apply_adagrad)
       self.assertShapeEqual(out, apply_adagrad)
       self.assertAllCloseAccordingToType(x - lr * grad * (y + grad * grad)**
                                          (-0.5), out)
-      self.assertAllCloseAccordingToType(y + grad * grad, accum.eval())
+      self.assertAllCloseAccordingToType(y + grad * grad, self.evaluate(accum))
 
   def _testTypesForFtrl(self,
                         x,
@@ -93,16 +95,16 @@ class TrainingOpsTest(TensorFlowTestCase):
                         l2=0.0,
                         lr_power=-0.5):
     self.setUp()
-    with self.test_session(use_gpu=use_gpu):
+    with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       linear = variables.VariableV1(z)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_ftrl = training_ops.apply_ftrl(var, accum, linear, grad, lr, l1, l2,
                                            lr_power)
-      out = apply_ftrl.eval()
+      out = self.evaluate(apply_ftrl)
       self.assertShapeEqual(out, apply_ftrl)
       accum_update = y + grad * grad
       linear_update = z + grad - (accum_update**(-lr_power) - y**
@@ -112,19 +114,22 @@ class TrainingOpsTest(TensorFlowTestCase):
           np.sign(linear_update[i]) * l1 - linear_update[i]) / (quadratic[i]) if
                                np.abs(linear_update[i]) > l1 else 0.0
                                for i in range(linear_update.size)])
-      self.assertAllCloseAccordingToType(accum_update, accum.eval())
+      self.assertAllCloseAccordingToType(accum_update, self.evaluate(accum))
       if x.dtype == np.float16:
         # The calculations here really are not very precise in float16.
-        self.assertAllClose(linear_update, linear.eval(), rtol=2e-2, atol=2e-2)
+        self.assertAllClose(
+            linear_update, self.evaluate(linear), rtol=2e-2, atol=2e-2)
         self.assertAllClose(expected_out, out, rtol=2e-2, atol=2e-2)
       elif x.dtype == np.float32:
         # The calculations here not sufficiently precise in float32.
-        self.assertAllClose(linear_update, linear.eval(), rtol=1e-5, atol=1e-5)
+        self.assertAllClose(
+            linear_update, self.evaluate(linear), rtol=1e-5, atol=1e-5)
         self.assertAllClose(expected_out, out, rtol=1e-5, atol=1e-5)
       else:
-        self.assertAllClose(linear_update, linear.eval())
+        self.assertAllClose(linear_update, self.evaluate(linear))
         self.assertAllClose(expected_out, out)
 
+  @test_util.run_v1_only("b/120545219")
   def testApplyAdagrad(self):
     for (dtype, use_gpu) in itertools.product(
         [np.float16, np.float32, np.float64], [False, True]):
@@ -134,6 +139,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       grad = np.arange(100).astype(dtype)
       self._testTypesForAdagrad(x, y, lr, grad, use_gpu)
 
+  @test_util.run_v1_only("b/120545219")
   def testApplyFtrl(self):
     for dtype in [np.float16, np.float32, np.float64]:
       x = np.arange(100).astype(dtype)
@@ -147,24 +153,24 @@ class TrainingOpsTest(TensorFlowTestCase):
 
   def _testTypesForSparseAdagrad(self, x, y, lr, grad, indices):
     self.setUp()
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_adagrad = training_ops.sparse_apply_adagrad(
           var, accum, lr, grad,
           constant_op.constant(indices, self._toType(indices.dtype)))
-      out = sparse_apply_adagrad.eval()
+      out = self.evaluate(sparse_apply_adagrad)
       self.assertShapeEqual(out, sparse_apply_adagrad)
 
       for (i, index) in enumerate(indices):
         self.assertAllCloseAccordingToType(
             x[index] - lr * grad[i] * (y[index] + grad[i] * grad[i])**(-0.5),
-            var.eval()[index])
+            self.evaluate(var)[index])
         self.assertAllCloseAccordingToType(y[index] + grad[i] * grad[i],
-                                           accum.eval()[index])
+                                           self.evaluate(accum)[index])
 
   def _testTypesForSparseFtrl(self,
                               x,
@@ -177,13 +183,13 @@ class TrainingOpsTest(TensorFlowTestCase):
                               l2=0.0,
                               lr_power=-0.5):
     self.setUp()
-    with self.test_session(use_gpu=False):
+    with self.session(use_gpu=False):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       linear = variables.VariableV1(z)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_ftrl = training_ops.sparse_apply_ftrl(
           var,
           accum,
@@ -194,16 +200,18 @@ class TrainingOpsTest(TensorFlowTestCase):
           l1,
           l2,
           lr_power=lr_power)
-      out = sparse_apply_ftrl.eval()
+      out = self.evaluate(sparse_apply_ftrl)
       self.assertShapeEqual(out, sparse_apply_ftrl)
 
       for (i, index) in enumerate(indices):
-        self.assertAllCloseAccordingToType(x[index] - lr * grad[i] *
-                                           (y[index] + grad[i] * grad[i])**
-                                           (lr_power), var.eval()[index])
+        self.assertAllCloseAccordingToType(
+            x[index] - lr * grad[i] * (y[index] + grad[i] * grad[i])**
+            (lr_power),
+            self.evaluate(var)[index])
         self.assertAllCloseAccordingToType(y[index] + grad[i] * grad[i],
-                                           accum.eval()[index])
+                                           self.evaluate(accum)[index])
 
+  @test_util.run_v1_only("b/120545219")
   def testSparseApplyAdagrad(self):
     for (dtype, index_type) in itertools.product(
         [np.float16, np.float32, np.float64], [np.int32, np.int64]):
@@ -217,6 +225,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       indices = np.array([0, 2]).astype(index_type)
       self._testTypesForSparseAdagrad(x, y, lr, grad, indices)
 
+  @test_util.run_v1_only("b/120545219")
   def testSparseApplyAdagradDim1(self):
     for (dtype, index_type) in itertools.product(
         [np.float16, np.float32, np.float64], [np.int32, np.int64]):
@@ -230,6 +239,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       indices = np.array([0, 2]).astype(index_type)
       self._testTypesForSparseAdagrad(x, y, lr, grad, indices)
 
+  @test_util.run_v1_only("b/120545219")
   def testSparseApplyFtrlDim1(self):
     for (dtype, index_type) in itertools.product(
         [np.float16, np.float32, np.float64], [np.int32, np.int64]):
@@ -245,6 +255,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       indices = np.array([0, 2]).astype(index_type)
       self._testTypesForSparseFtrl(x, y, z, lr, grad, indices)
 
+  @test_util.run_v1_only("b/120545219")
   def testApplyAdam(self):
     for dtype, use_gpu in itertools.product(
         [np.float16, np.float32, np.float64], [False, True]):
@@ -256,7 +267,7 @@ class TrainingOpsTest(TensorFlowTestCase):
 
   def _testTypesForAdam(self, var, m, v, grad, use_gpu):
     self.setUp()
-    with self.test_session(use_gpu=use_gpu):
+    with self.session(use_gpu=use_gpu):
       var_t = variables.VariableV1(var)
       m_t = variables.VariableV1(m)
       v_t = variables.VariableV1(v)
@@ -276,13 +287,13 @@ class TrainingOpsTest(TensorFlowTestCase):
       epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype), [])
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(var, var_t.eval())
+      self.assertAllCloseAccordingToType(var, self.evaluate(var_t))
       new_var, _, _ = self._adamUpdateNumpy(var, grad, t, m, v, lr, beta1,
                                             beta2, epsilon)
       apply_adam = training_ops.apply_adam(var_t, m_t, v_t, beta1_power_t,
                                            beta2_power_t, lr_t, beta1_t,
                                            beta2_t, epsilon_t, grad)
-      out = apply_adam.eval()
+      out = self.evaluate(apply_adam)
       self.assertShapeEqual(out, apply_adam)
       self.assertAllCloseAccordingToType(new_var, out)
 
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py
index d998d6af813e8d30942c7bc7ca6cfa7fd1ced89b..86f1b4d5aae31bacfe34141866ee4e7156eaa57b 100644
--- a/tensorflow/python/training/training_util.py
+++ b/tensorflow/python/training/training_util.py
@@ -39,7 +39,7 @@ GLOBAL_STEP_READ_KEY = 'global_step_read_op_cache'
 write_graph = graph_io.write_graph
 
 
-@tf_export('train.global_step')
+@tf_export(v1=['train.global_step'])
 def global_step(sess, global_step_tensor):
   """Small helper to get the global step.
 
@@ -69,7 +69,7 @@ def global_step(sess, global_step_tensor):
   return int(sess.run(global_step_tensor))
 
 
-@tf_export('train.get_global_step')
+@tf_export(v1=['train.get_global_step'])
 def get_global_step(graph=None):
   """Get the global step tensor.
 
@@ -104,7 +104,7 @@ def get_global_step(graph=None):
   return global_step_tensor
 
 
-@tf_export('train.create_global_step')
+@tf_export(v1=['train.create_global_step'])
 def create_global_step(graph=None):
   """Create global step tensor in graph.
 
@@ -129,7 +129,7 @@ def create_global_step(graph=None):
           dtype=dtypes.int64,
           initializer=init_ops.zeros_initializer(),
           trainable=False,
-          aggregation=variables.VariableAggregation.ONLY_FIRST_TOWER,
+          aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA,
           collections=[ops.GraphKeys.GLOBAL_VARIABLES,
                        ops.GraphKeys.GLOBAL_STEP])
   # Create in proper graph and base name_scope.
@@ -140,12 +140,12 @@ def create_global_step(graph=None):
         dtype=dtypes.int64,
         initializer=init_ops.zeros_initializer(),
         trainable=False,
-        aggregation=variables.VariableAggregation.ONLY_FIRST_TOWER,
+        aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA,
         collections=[ops.GraphKeys.GLOBAL_VARIABLES,
                      ops.GraphKeys.GLOBAL_STEP])
 
 
-@tf_export('train.get_or_create_global_step')
+@tf_export(v1=['train.get_or_create_global_step'])
 def get_or_create_global_step(graph=None):
   """Returns and create (if necessary) the global step tensor.
 
@@ -163,7 +163,7 @@ def get_or_create_global_step(graph=None):
   return global_step_tensor
 
 
-@tf_export('train.assert_global_step')
+@tf_export(v1=['train.assert_global_step'])
 def assert_global_step(global_step_tensor):
   """Asserts `global_step_tensor` is a scalar int `Variable` or `Tensor`.
 
diff --git a/tensorflow/python/training/training_util_test.py b/tensorflow/python/training/training_util_test.py
index ba64e785ac660a383e26651a37138f17e3e7cd17..3f9858a33bafc6ae0750695ec55e97ad5800119b 100644
--- a/tensorflow/python/training/training_util_test.py
+++ b/tensorflow/python/training/training_util_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import training_util
 
 
+@test_util.run_v1_only('b/120545219')
 class GlobalStepTest(test.TestCase):
 
   def _assert_global_step(self, global_step, expected_dtype=dtypes.int64):
@@ -90,6 +92,7 @@ class GlobalStepTest(test.TestCase):
       self._assert_global_step(training_util.get_or_create_global_step(g))
 
 
+@test_util.run_v1_only('b/120545219')
 class GlobalStepReadTest(test.TestCase):
 
   def test_global_step_read_is_none_if_there_is_no_global_step(self):
diff --git a/tensorflow/python/training/warm_starting_util.py b/tensorflow/python/training/warm_starting_util.py
index bea9bb6dffa02beecabaa2133c9ad2cc6f03bf36..1382b8ce72e93b19a16e60ac597a2413941b638e 100644
--- a/tensorflow/python/training/warm_starting_util.py
+++ b/tensorflow/python/training/warm_starting_util.py
@@ -28,11 +28,11 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_ops
 from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import saver
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.VocabInfo")
+@tf_export(v1=["train.VocabInfo"])
 class VocabInfo(
     collections.namedtuple("VocabInfo", [
         "new_vocab",
@@ -139,29 +139,26 @@ def _infer_var_name(var):
   Returns:
     Name of the `var`
   """
-  name_to_var_dict = saver.BaseSaverBuilder.OpListToDict(var)
+  name_to_var_dict = saveable_object_util.op_list_to_dict(var)
   if len(name_to_var_dict) > 1:
     raise TypeError("`var` = %s passed as arg violates the constraints.  "
                     "name_to_var_dict = %s" % (var, name_to_var_dict))
   return list(name_to_var_dict.keys())[0]
 
 
-def _warm_start_var(var, prev_ckpt, prev_tensor_name=None):
-  """Warm-starts given variable from `prev_tensor_name` tensor in `prev_ckpt`.
+def _get_var_info(var, prev_tensor_name=None):
+  """Helper method for standarizing Variable and naming.
 
   Args:
     var: Current graph's variable that needs to be warm-started (initialized).
-      Can be either of the following:
-      (i) `Variable`
-      (ii) `ResourceVariable`
+      Can be either of the following: (i) `Variable` (ii) `ResourceVariable`
       (iii) list of `Variable`: The list must contain slices of the same larger
-        variable.
-      (iv) `PartitionedVariable`
-    prev_ckpt: A string specifying the directory with checkpoint file(s) or path
-      to checkpoint. The given checkpoint must have tensor with name
-      `prev_tensor_name` (if not None) or tensor with name same as given `var`.
+        variable. (iv) `PartitionedVariable`
     prev_tensor_name: Name of the tensor to lookup in provided `prev_ckpt`. If
       None, we lookup tensor with same name as given `var`.
+
+  Returns:
+    A tuple of the Tensor name and var.
   """
   if checkpoint_utils._is_variable(var):  # pylint: disable=protected-access
     current_var_name = _infer_var_name([var])
@@ -178,7 +175,8 @@ def _warm_start_var(var, prev_ckpt, prev_tensor_name=None):
   if not prev_tensor_name:
     # Assume tensor name remains the same.
     prev_tensor_name = current_var_name
-  checkpoint_utils.init_from_checkpoint(prev_ckpt, {prev_tensor_name: var})
+
+  return prev_tensor_name, var
 
 
 # pylint: disable=protected-access
@@ -250,7 +248,7 @@ def _warm_start_var_with_vocab(var,
     prev_tensor_name = _infer_var_name(var)
 
   # TODO(eddz): Fix functionality for rank-1 Variables (like FC biases).
-  total_v_first_axis = sum([v.get_shape().as_list()[0] for v in var])
+  total_v_first_axis = sum(v.get_shape().as_list()[0] for v in var)
   for v in var:
     v_shape = v.get_shape().as_list()
     slice_info = v._get_save_slice_info()
@@ -335,12 +333,12 @@ def _get_grouped_variables(vars_to_warm_start):
         ops.GraphKeys.TRAINABLE_VARIABLES,
         scope=vars_to_warm_start)
   elif isinstance(vars_to_warm_start, list):
-    if all([isinstance(v, str) for v in vars_to_warm_start]):
+    if all(isinstance(v, str) for v in vars_to_warm_start):
       list_of_vars = []
       for v in vars_to_warm_start:
         list_of_vars += ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
                                            scope=v)
-    elif all([checkpoint_utils._is_variable(v) for v in vars_to_warm_start]):  # pylint: disable=protected-access
+    elif all(checkpoint_utils._is_variable(v) for v in vars_to_warm_start):  # pylint: disable=protected-access
       list_of_vars = vars_to_warm_start
     else:
       raise ValueError("If `vars_to_warm_start` is a list, it must be all "
@@ -362,7 +360,7 @@ def _get_grouped_variables(vars_to_warm_start):
   return grouped_variables
 
 
-@tf_export("train.warm_start")
+@tf_export(v1=["train.warm_start"])
 def warm_start(ckpt_to_initialize_from,
                vars_to_warm_start=".*",
                var_name_to_vocab_info=None,
@@ -380,23 +378,32 @@ def warm_start(ckpt_to_initialize_from,
 
       - A regular expression (string) that captures which variables to
         warm-start (see tf.get_collection).  This expression will only consider
-        variables in the TRAINABLE_VARIABLES collection.
-      - A list of Variables to warm-start.
-      - A list of strings, each representing a full variable name to warm-start.
+        variables in the TRAINABLE_VARIABLES collection -- if you need to
+        warm-start non_TRAINABLE vars (such as optimizer accumulators or batch
+        norm statistics), please use the below option.
+      - A list of Variables to warm-start.  If you do not have access to the
+        `Variable` objects at the call site, please use the below option.
+      - A list of strings, each a regex scope provided to tf.get_collection with
+        GLOBAL_VARIABLES (please see tf.get_collection).  For backwards
+        compatibility reasons, this is separate from the single-string argument
+        type.
       - `None`, in which case only variables specified in
         `var_name_to_vocab_info` will be warm-started.
 
       Defaults to `'.*'`, which warm-starts all variables in the
-      TRAINABLE_VARIABLES collection.  Note that this excludes variables such as
-      accumulators and moving statistics from batch norm.
+      TRAINABLE_VARIABLES collection.  Note that this excludes variables such
+      as accumulators and moving statistics from batch norm.
     var_name_to_vocab_info: [Optional] Dict of variable names (strings) to
-      VocabInfo. The variable names should be "full" variables, not the names
-      of the partitions.  If not explicitly provided, the variable is assumed to
-      have no vocabulary.
+      `tf.estimator.VocabInfo`. The variable names should be "full" variables,
+      not the names of the partitions.  If not explicitly provided, the variable
+      is assumed to have no (changes to) vocabulary.
     var_name_to_prev_var_name: [Optional] Dict of variable names (strings) to
       name of the previously-trained variable in `ckpt_to_initialize_from`. If
       not explicitly provided, the name of the variable is assumed to be same
-      between previous checkpoint and current model.
+      between previous checkpoint and current model.  Note that this has no
+      effect on the set of variables that is warm-started, and only controls
+      name mapping (use `vars_to_warm_start` for controlling what variables to
+      warm-start).
   Raises:
     ValueError: If the WarmStartSettings contains prev_var_name or VocabInfo
       configuration for variable names that are not used.  This is to ensure
@@ -418,6 +425,8 @@ def warm_start(ckpt_to_initialize_from,
   prev_var_name_used = set()
   vocab_info_used = set()
 
+  # Group the vocabless vars into one call to init_from_checkpoint.
+  vocabless_vars = {}
   for var_name, variable in six.iteritems(grouped_variables):
     prev_var_name = var_name_to_prev_var_name.get(var_name)
     if prev_var_name:
@@ -460,8 +469,10 @@ def warm_start(ckpt_to_initialize_from,
         # for init_from_checkpoint logic to work correctly.
         if len(variable) == 1:
           variable = variable[0]
-        _warm_start_var(variable, ckpt_to_initialize_from, prev_var_name)
+        prev_tensor_name, var = _get_var_info(variable, prev_var_name)
+        vocabless_vars[prev_tensor_name] = var
 
+  checkpoint_utils.init_from_checkpoint(ckpt_to_initialize_from, vocabless_vars)
   prev_var_name_not_used = set(
       var_name_to_prev_var_name.keys()) - prev_var_name_used
   vocab_info_not_used = set(var_name_to_vocab_info.keys()) - vocab_info_used
diff --git a/tensorflow/python/training/warm_starting_util_test.py b/tensorflow/python/training/warm_starting_util_test.py
index 3eddf79e341b9b663af6ca5646897ef886a05be8..fa1f370f41efeda0d823d85e4d755038362fd37e 100644
--- a/tensorflow/python/training/warm_starting_util_test.py
+++ b/tensorflow/python/training/warm_starting_util_test.py
@@ -22,7 +22,7 @@ import os
 import numpy as np
 import six
 
-from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -30,6 +30,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import warm_starting_util as ws_util
 
@@ -48,7 +49,7 @@ class WarmStartingUtilTest(test.TestCase):
     return vocab_file
 
   def _write_checkpoint(self, sess):
-    sess.run(variables.global_variables_initializer())
+    self.evaluate(variables.global_variables_initializer())
     saver = saver_lib.Saver()
     ckpt_prefix = os.path.join(self.get_temp_dir(), "model")
     saver.save(sess, ckpt_prefix, global_step=0)
@@ -69,7 +70,23 @@ class WarmStartingUtilTest(test.TestCase):
         if partitioner:
           self.assertTrue(isinstance(var, variables.PartitionedVariable))
           var = var._get_variable_list()
-        return var, sess.run(var)
+        return var, self.evaluate(var)
+
+  def _create_prev_run_vars(self,
+                            var_names,
+                            shapes,
+                            initializers):
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g) as sess:
+        all_vars = []
+        for var_name, shape, initializer in zip(var_names, shapes,
+                                                initializers):
+          all_vars.append(variable_scope.get_variable(
+              var_name,
+              shape=shape,
+              initializer=initializer))
+        self._write_checkpoint(sess)
+        return [self.evaluate(var) for var in all_vars]
 
   def _create_dummy_inputs(self):
     return {
@@ -105,8 +122,10 @@ class WarmStartingUtilTest(test.TestCase):
       with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
-        ws_util._warm_start_var(fruit_weights, self.get_temp_dir())
-        sess.run(variables.global_variables_initializer())
+        prev_tensor_name, var = ws_util._get_var_info(fruit_weights)
+        checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
+                                              {prev_tensor_name: var})
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose(prev_val, fruit_weights.eval(sess))
 
   def testWarmStartVarPrevVarPartitioned(self):
@@ -121,8 +140,10 @@ class WarmStartingUtilTest(test.TestCase):
       with self.session(graph=g) as sess:
         fruit_weights = variable_scope.get_variable(
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
-        ws_util._warm_start_var(fruit_weights, self.get_temp_dir())
-        sess.run(variables.global_variables_initializer())
+        prev_tensor_name, var = ws_util._get_var_info(fruit_weights)
+        checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
+                                              {prev_tensor_name: var})
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose(prev_val, fruit_weights.eval(sess))
 
   def testWarmStartVarCurrentVarPartitioned(self):
@@ -138,8 +159,10 @@ class WarmStartingUtilTest(test.TestCase):
             partitioner=lambda shape, dtype: [2, 1])
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
-        ws_util._warm_start_var(fruit_weights, self.get_temp_dir())
-        sess.run(variables.global_variables_initializer())
+        prev_tensor_name, var = ws_util._get_var_info(fruit_weights)
+        checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
+                                              {prev_tensor_name: var})
+        self.evaluate(variables.global_variables_initializer())
         fruit_weights = fruit_weights._get_variable_list()
         new_val = np.concatenate(
             [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
@@ -162,11 +185,11 @@ class WarmStartingUtilTest(test.TestCase):
             partitioner=lambda shape, dtype: [2, 1])
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
-        ws_util._warm_start_var(
-            fruit_weights,
-            self.get_temp_dir(),
-            prev_tensor_name="old_scope/fruit_weights")
-        sess.run(variables.global_variables_initializer())
+        prev_tensor_name, var = ws_util._get_var_info(
+            fruit_weights, prev_tensor_name="old_scope/fruit_weights")
+        checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
+                                              {prev_tensor_name: var})
+        self.evaluate(variables.global_variables_initializer())
         fruit_weights = fruit_weights._get_variable_list()
         new_val = np.concatenate(
             [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
@@ -188,7 +211,7 @@ class WarmStartingUtilTest(test.TestCase):
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
                                            self.get_temp_dir(), prev_vocab_path)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[2.], [1.5], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
 
@@ -213,7 +236,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[0.3, 0.5, 0.], [0.8, 1.0, 0.], [1.2, 1.5, 0.],
                              [2.3, 2., 0.]], fruit_output_layer.eval(sess))
 
@@ -238,7 +261,7 @@ class WarmStartingUtilTest(test.TestCase):
             self.get_temp_dir(),
             prev_vocab_path,
             previous_vocab_size=2)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Old vocabulary limited to ['apple', 'banana'].
         self.assertAllClose([[0.], [0.], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
@@ -262,7 +285,7 @@ class WarmStartingUtilTest(test.TestCase):
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
                                            self.get_temp_dir(), prev_vocab_path)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[2.], [1.5], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
 
@@ -289,7 +312,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[0.3, 0.5, 0.], [0.8, 1.0, 0.], [1.2, 1.5, 0.],
                              [2.3, 2., 0.]], fruit_output_layer.eval(sess))
 
@@ -317,7 +340,7 @@ class WarmStartingUtilTest(test.TestCase):
             self.get_temp_dir(),
             prev_vocab_path,
             current_oov_buckets=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
         fruit_weights_vars = fruit_weights._get_variable_list()
@@ -349,7 +372,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_output_layer, variables.PartitionedVariable))
         fruit_output_layer_vars = fruit_output_layer._get_variable_list()
@@ -381,7 +404,7 @@ class WarmStartingUtilTest(test.TestCase):
             partitioner=lambda shape, dtype: [2, 1])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 6,
                                            self.get_temp_dir(), prev_vocab_path)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
         fruit_weights_vars = fruit_weights._get_variable_list()
@@ -415,7 +438,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_output_layer, variables.PartitionedVariable))
         fruit_output_layer_vars = fruit_output_layer._get_variable_list()
@@ -440,7 +463,7 @@ class WarmStartingUtilTest(test.TestCase):
             shape=[10, 1],
             initializer=zeros())
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=[var])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started (init overridden to ones).
         self.assertAllEqual(var.eval(), prev_int_val)
 
@@ -460,10 +483,50 @@ class WarmStartingUtilTest(test.TestCase):
             shape=[10, 1],
             initializer=zeros())
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=["v1"])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started (init overridden to ones).
         self.assertAllEqual(var.eval(), prev_int_val)
 
+  def testWarmStart_ListOfRegexes(self):
+    # Save checkpoint from which to warm-start.
+    [prev_v1_val, prev_v1_momentum_val,
+     prev_v2_val, _] = self._create_prev_run_vars(
+         var_names=["v1", "v1/Momentum", "v2", "v2/Momentum"],
+         shapes=[[10, 1]] * 4,
+         initializers=[ones()] * 4)
+
+    # New graph, new session with warm-starting.
+    with ops.Graph().as_default() as g:
+      with self.session(graph=g) as sess:
+        # Initialize with zeros.
+        v1 = variable_scope.get_variable(
+            "v1",
+            shape=[10, 1],
+            initializer=zeros())
+        v1_momentum = variable_scope.get_variable(
+            "v1/Momentum",
+            shape=[10, 1],
+            initializer=zeros())
+        v2 = variable_scope.get_variable(
+            "v2",
+            shape=[10, 1],
+            initializer=zeros())
+        v2_momentum = variable_scope.get_variable(
+            "v2/Momentum",
+            shape=[10, 1],
+            initializer=zeros())
+        ws_util.warm_start(self.get_temp_dir(),
+                           # This warm-starts both v1 and v1/Momentum, but only
+                           # v2 (and not v2/Momentum).
+                           vars_to_warm_start=["v1", "v2[^/]"])
+        self.evaluate(variables.global_variables_initializer())
+        # Verify the selection of weights were correctly warm-started (init
+        # overridden to ones).
+        self.assertAllEqual(v1.eval(), prev_v1_val)
+        self.assertAllEqual(v1_momentum.eval(), prev_v1_momentum_val)
+        self.assertAllEqual(v2.eval(), prev_v2_val)
+        self.assertAllEqual(v2_momentum.eval(), np.zeros([10, 1]))
+
   def testWarmStart_SparseColumnIntegerized(self):
     # Create feature column.
     sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
@@ -479,7 +542,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_int: [np.zeros([10, 1])]},
@@ -490,7 +553,7 @@ class WarmStartingUtilTest(test.TestCase):
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=".*sc_int.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_int: [prev_int_val]}, sess)
 
@@ -508,7 +571,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_hash: [np.zeros([15, 1])]},
@@ -520,7 +583,7 @@ class WarmStartingUtilTest(test.TestCase):
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*sc_hash.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_hash: [prev_hash_val]},
                                   sess)
@@ -542,7 +605,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([4, 1])]},
@@ -556,7 +619,7 @@ class WarmStartingUtilTest(test.TestCase):
         # vocab is assumed to be same as new vocab.
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*sc_vocab.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
                                   sess)
@@ -578,7 +641,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([4, 1])]},
@@ -594,7 +657,7 @@ class WarmStartingUtilTest(test.TestCase):
             # Explicitly provide the file prefix instead of just the dir.
             os.path.join(self.get_temp_dir(), "model-0"),
             vars_to_warm_start=".*sc_vocab.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
                                   sess)
@@ -623,7 +686,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([2, 1])]},
@@ -645,7 +708,7 @@ class WarmStartingUtilTest(test.TestCase):
             var_name_to_vocab_info={
                 "linear_model/sc_vocab/weights": vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  'banana' isn't in the
         # first two entries of the old vocabulary, so it's newly initialized.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [[[1], [0]]]}, sess)
@@ -666,7 +729,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars,
@@ -678,7 +741,7 @@ class WarmStartingUtilTest(test.TestCase):
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*real_bucketized.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars,
                                   {real_bucket: [prev_bucket_val]}, sess)
@@ -737,7 +800,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, all weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {
@@ -763,7 +826,7 @@ class WarmStartingUtilTest(test.TestCase):
             var_name_to_vocab_info={
                 "linear_model/sc_vocab/weights": vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {
             sc_int: [prev_int_val],
@@ -802,12 +865,12 @@ class WarmStartingUtilTest(test.TestCase):
             "linear_model/sc_vocab/weights",
             initializer=[[0.5], [1.], [2.], [3.]])
         self._write_checkpoint(sess)
-        prev_keys_val = sess.run(sc_keys_weights)
+        prev_keys_val = self.evaluate(sc_keys_weights)
 
     def _partitioner(shape, dtype):  # pylint:disable=unused-argument
       # Partition each var into 2 equal slices.
       partitions = [1] * len(shape)
-      partitions[0] = min(2, shape[0].value)
+      partitions[0] = min(2, shape.dims[0].value)
       return partitions
 
     # New graph, new session with warm-starting.
@@ -829,7 +892,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_hash should not be warm-started.  Var corresponding to sc_vocab
         # should be correctly warm-started after vocab remapping.
@@ -870,7 +933,7 @@ class WarmStartingUtilTest(test.TestCase):
             "linear_model/sc_vocab/weights",
             initializer=[[0.5], [1.], [2.], [3.]])
         self._write_checkpoint(sess)
-        prev_keys_val = sess.run(sc_keys_weights)
+        prev_keys_val = self.evaluate(sc_keys_weights)
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
@@ -892,7 +955,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_hash should not be warm-started.  Var corresponding to sc_vocab
         # should be correctly warm-started after vocab remapping.
@@ -933,7 +996,7 @@ class WarmStartingUtilTest(test.TestCase):
     def _partitioner(shape, dtype):  # pylint:disable=unused-argument
       # Partition each var into 2 equal slices.
       partitions = [1] * len(shape)
-      partitions[0] = min(2, shape[0].value)
+      partitions[0] = min(2, shape.dims[0].value)
       return partitions
 
     # New graph, new session with warm-starting.
@@ -961,7 +1024,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_vocab should be correctly warm-started after vocab remapping,
         # and neither of the other two should be warm-started..
@@ -993,7 +1056,7 @@ class WarmStartingUtilTest(test.TestCase):
     def _partitioner(shape, dtype):  # pylint:disable=unused-argument
       # Partition each var into 2 equal slices.
       partitions = [1] * len(shape)
-      partitions[0] = min(2, shape[0].value)
+      partitions[0] = min(2, shape.dims[0].value)
       return partitions
 
     # Create feature columns.
@@ -1028,7 +1091,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[emb_vocab_column]):
                     vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started. Var corresponding to
         # emb_vocab_column should be correctly warm-started after vocab
         # remapping. Missing values are filled in with the EmbeddingColumn's
@@ -1063,7 +1126,7 @@ class WarmStartingUtilTest(test.TestCase):
     def _partitioner(shape, dtype):  # pylint:disable=unused-argument
       # Partition each var into 2 equal slices.
       partitions = [1] * len(shape)
-      partitions[0] = min(2, shape[0].value)
+      partitions[0] = min(2, shape.dims[0].value)
       return partitions
 
     # Create feature columns.
@@ -1100,7 +1163,7 @@ class WarmStartingUtilTest(test.TestCase):
             var_name_to_vocab_info={
                 "linear_model/sc_vocab_embedding/embedding_weights": vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started. Var corresponding to
         # emb_vocab should be correctly warm-started after vocab remapping.
         # Missing values are filled in with the EmbeddingColumn's initializer.
diff --git a/tensorflow/python/user_ops/user_ops.py b/tensorflow/python/user_ops/user_ops.py
index 20ea3b0f621dc74bd3778d565f8897e47a881d42..3dbacd09e62b65c31266dca94dee5382664833fa 100644
--- a/tensorflow/python/user_ops/user_ops.py
+++ b/tensorflow/python/user_ops/user_ops.py
@@ -26,7 +26,7 @@ from tensorflow.python.ops.gen_user_ops import *  # pylint: disable=wildcard-imp
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('user_ops.my_fact')
+@tf_export(v1=['user_ops.my_fact'])
 def my_fact():
   """Example of overriding the generated code for an Op."""
   return _gen_user_ops.fact()
diff --git a/tensorflow/python/util/decorator_utils.py b/tensorflow/python/util/decorator_utils.py
index 7b4363c0e40802779cf47c75c5a5e5a901da37e2..ab9641d96bc28949d9dc81fa91357793dc8fd6ad 100644
--- a/tensorflow/python/util/decorator_utils.py
+++ b/tensorflow/python/util/decorator_utils.py
@@ -75,13 +75,31 @@ def _normalize_docstring(docstring):
 
 def add_notice_to_docstring(
     doc, instructions, no_doc_str, suffix_str, notice):
-  """Adds a deprecation notice to a docstring."""
+  """Adds a deprecation notice to a docstring.
+
+  Args:
+    doc: The original docstring.
+    instructions: A string, describing how to fix the problem.
+    no_doc_str: The default value to use for `doc` if `doc` is empty.
+    suffix_str: Is added to the end of the first line.
+    notice: A list of strings. The main notice warning body.
+
+  Returns:
+    A new docstring, with the notice attached.
+
+  Raises:
+    ValueError: If `notice` is empty.
+  """
   if not doc:
     lines = [no_doc_str]
   else:
     lines = _normalize_docstring(doc).splitlines()
     lines[0] += ' ' + suffix_str
 
+  if not notice:
+    raise ValueError('The `notice` arg must not be empty.')
+
+  notice[0] = 'Warning: ' + notice[0]
   notice = [''] + notice + ([instructions] if instructions else [])
 
   if len(lines) > 1:
diff --git a/tensorflow/python/util/decorator_utils_test.py b/tensorflow/python/util/decorator_utils_test.py
index 64e0cc7f57effe98756cb08d738dc198d982b473..440dcbb6df3ffbaeb0aed4668033750e44518374 100644
--- a/tensorflow/python/util/decorator_utils_test.py
+++ b/tensorflow/python/util/decorator_utils_test.py
@@ -55,8 +55,9 @@ class AddNoticeToDocstringTest(test.TestCase):
         expected)
 
   def test_regular(self):
-    expected = ("Brief (suffix)\n\nGo away\nInstructions\n\nDocstring\n\n"
-                "Args:\n  arg1: desc")
+    expected = (
+        "Brief (suffix)\n\nWarning: Go away\nInstructions\n\nDocstring\n\n"
+        "Args:\n  arg1: desc")
     # No indent for main docstring
     self._check("Brief\n\nDocstring\n\nArgs:\n  arg1: desc", expected)
     # 2 space indent for main docstring, blank lines not indented
@@ -71,7 +72,7 @@ class AddNoticeToDocstringTest(test.TestCase):
                 expected)
 
   def test_brief_only(self):
-    expected = "Brief (suffix)\n\nGo away\nInstructions"
+    expected = "Brief (suffix)\n\nWarning: Go away\nInstructions"
     self._check("Brief", expected)
     self._check("Brief\n", expected)
     self._check("Brief\n  ", expected)
@@ -79,12 +80,12 @@ class AddNoticeToDocstringTest(test.TestCase):
     self._check("\n  Brief\n  ", expected)
 
   def test_no_docstring(self):
-    expected = "Nothing here\n\nGo away\nInstructions"
+    expected = "Nothing here\n\nWarning: Go away\nInstructions"
     self._check(None, expected)
     self._check("", expected)
 
   def test_no_empty_line(self):
-    expected = "Brief (suffix)\n\nGo away\nInstructions\n\nDocstring"
+    expected = "Brief (suffix)\n\nWarning: Go away\nInstructions\n\nDocstring"
     # No second line indent
     self._check("Brief\nDocstring", expected)
     # 2 space second line indent
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index c43589f5c4555180442a1962e25f82e51d677d1b..9aaf0c2de9756718645e77de416c653182994019 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -28,6 +28,7 @@ from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util import tf_stack
 
 
 # Allow deprecation warnings to be silenced temporarily with a context manager.
@@ -54,16 +55,39 @@ def _add_deprecated_function_notice_to_docstring(doc, date, instructions):
       '(deprecated)', main_text)
 
 
-def _add_deprecated_arg_notice_to_docstring(doc, date, instructions):
+def _add_deprecated_arg_notice_to_docstring(doc, date, instructions,
+                                            deprecated_names):
   """Adds a deprecation notice to a docstring for deprecated arguments."""
+
+  deprecation_string = ', '.join(sorted(deprecated_names))
+
   return decorator_utils.add_notice_to_docstring(
-      doc, instructions,
-      'DEPRECATED FUNCTION ARGUMENTS',
+      doc, instructions, 'DEPRECATED FUNCTION ARGUMENTS',
       '(deprecated arguments)', [
-          'SOME ARGUMENTS ARE DEPRECATED. '
-          'They will be removed %s.' % (
-              'in a future version' if date is None else ('after %s' % date)),
-          'Instructions for updating:'])
+          'SOME ARGUMENTS ARE DEPRECATED: `(%s)`. '
+          'They will be removed %s.' %
+          (deprecation_string, 'in a future version' if date is None else
+           ('after %s' % date)), 'Instructions for updating:'
+      ])
+
+
+def _add_deprecated_arg_value_notice_to_docstring(doc, date, instructions,
+                                                  deprecated_name_value_dict):
+  """Adds a deprecation notice to a docstring for deprecated arguments."""
+
+  deprecation_string = ', '.join(
+      '%s=%r' % (key, value)
+      for key, value in sorted(deprecated_name_value_dict.items()))
+
+  when = 'in a future version' if date is None else ('after %s' % date)
+
+  return decorator_utils.add_notice_to_docstring(
+      doc, instructions, 'DEPRECATED FUNCTION ARGUMENT VALUES',
+      '(deprecated argument values)', [
+          'SOME ARGUMENT VALUES ARE DEPRECATED: `(%s)`. '
+          'They will be removed %s.' % (deprecation_string, when),
+          'Instructions for updating:'
+      ])
 
 
 def _validate_deprecation_args(date, instructions):
@@ -75,21 +99,9 @@ def _validate_deprecation_args(date, instructions):
 
 def _call_location(outer=False):
   """Returns call location given level up from current call."""
-  frame = tf_inspect.currentframe()
-  if frame:
-    # CPython internals are available, use them for performance.
-    # walk back two frames to get to deprecated function caller.
-    frame = frame.f_back
-    if frame.f_back:
-      frame = frame.f_back
-    if outer and frame.f_back:
-      frame = frame.f_back
-    return '%s:%d' % (frame.f_code.co_filename, frame.f_lineno)
-  else:
-    # Slow fallback path
-    stack = tf_inspect.stack(0)  # 0 avoids generating unused context
-    entry = stack[3 if outer else 2]
-    return '%s:%d' % (entry[1], entry[2])
+  stack = tf_stack.extract_stack()
+  frame = stack[-4 if outer else -3]
+  return '{filename}:{lineno}'.format(filename=frame[0], lineno=frame[1])
 
 
 def _wrap_decorator(wrapped_function):
@@ -403,10 +415,11 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples,
             pos, spec.has_ok_value, spec.ok_value)
     return deprecated_positional_args
 
+  deprecated_arg_names = _get_arg_names_to_ok_vals()
+
   def deprecated_wrapper(func):
     """Deprecation decorator."""
     decorator_utils.validate_callable(func, 'deprecated_args')
-    deprecated_arg_names = _get_arg_names_to_ok_vals()
 
     arg_spec = tf_inspect.getfullargspec(func)
     deprecated_positions = _get_deprecated_positional_arguments(
@@ -486,9 +499,11 @@ def deprecated_args(date, instructions, *deprecated_arg_names_or_tuples,
                 'in a future version' if date is None else ('after %s' % date),
                 instructions)
       return func(*args, **kwargs)
-    return tf_decorator.make_decorator(func, new_func, 'deprecated',
-                                       _add_deprecated_arg_notice_to_docstring(
-                                           func.__doc__, date, instructions))
+
+    doc = _add_deprecated_arg_notice_to_docstring(
+        func.__doc__, date, instructions, sorted(deprecated_arg_names.keys()))
+    return tf_decorator.make_decorator(func, new_func, 'deprecated', doc)
+
   return deprecated_wrapper
 
 
@@ -551,9 +566,11 @@ def deprecated_arg_values(date, instructions, warn_once=True,
                   func.__module__, arg_name, arg_value, 'in a future version'
                   if date is None else ('after %s' % date), instructions)
       return func(*args, **kwargs)
-    return tf_decorator.make_decorator(func, new_func, 'deprecated',
-                                       _add_deprecated_arg_notice_to_docstring(
-                                           func.__doc__, date, instructions))
+
+    doc = _add_deprecated_arg_value_notice_to_docstring(
+        func.__doc__, date, instructions, deprecated_kwargs)
+    return tf_decorator.make_decorator(func, new_func, 'deprecated', doc)
+
   return deprecated_wrapper
 
 
diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py
index 90c73a0a58d129af44cc051874acda37d5c78394..035c416d793e04ab26adbe0f4b321594343a2286 100644
--- a/tensorflow/python/util/deprecation_test.py
+++ b/tensorflow/python/util/deprecation_test.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
@@ -153,7 +154,8 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed in a future version."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. "
+        "It will be removed in a future version."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -173,6 +175,7 @@ class DeprecationTest(test.TestCase):
                         set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -195,7 +198,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -213,6 +216,7 @@ class DeprecationTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_one_line_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -227,7 +231,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:\n%s" % (date, instructions), _fn.__doc__)
 
     # Assert calling new fn issues log warning.
@@ -238,6 +242,7 @@ class DeprecationTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_no_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -251,7 +256,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "DEPRECATED FUNCTION"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:"
         "\n%s" % (date, instructions), _fn.__doc__)
 
@@ -289,7 +294,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -326,7 +331,7 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:\n%s" % (date, instructions),
         getattr(_Object, "_fn").__doc__)
 
@@ -355,9 +360,10 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "DEPRECATED FUNCTION"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:"
-        "\n%s" % (date, instructions), getattr(_Object, "_fn").__doc__)
+        "\n%s" % (date, instructions),
+        getattr(_Object, "_fn").__doc__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual(3, _Object()._fn(1, 2))
@@ -406,12 +412,13 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "prop doc. (deprecated)"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:"
         "\n%s"
         "\n"
         "\nReturns:"
-        "\n  String." % (date, instructions), getattr(_Object, "_prop").__doc__)
+        "\n  String." % (date, instructions),
+        getattr(_Object, "_prop").__doc__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual("prop_with_doc", _Object()._prop)
@@ -439,9 +446,10 @@ class DeprecationTest(test.TestCase):
     self.assertEqual(
         "DEPRECATED FUNCTION"
         "\n"
-        "\nTHIS FUNCTION IS DEPRECATED. It will be removed after %s."
+        "\nWarning: THIS FUNCTION IS DEPRECATED. It will be removed after %s."
         "\nInstructions for updating:"
-        "\n%s" % (date, instructions), getattr(_Object, "_prop").__doc__)
+        "\n%s" % (date, instructions),
+        getattr(_Object, "_prop").__doc__)
 
     # Assert calling new fn issues log warning.
     self.assertEqual("prop_no_doc", _Object()._prop)
@@ -484,6 +492,7 @@ class DeprecatedArgsTest(test.TestCase):
       deprecation.deprecated_args(date, instructions, "missing")(_fn)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -507,7 +516,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated arguments)"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENTS ARE DEPRECATED: `(deprecated)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -530,6 +540,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_one_line_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -544,7 +555,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(
         "fn doc. (deprecated arguments)"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENTS ARE DEPRECATED: `(deprecated)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:\n%s" % (date, instructions), _fn.__doc__)
 
     # Assert calls without the deprecated argument log nothing.
@@ -559,6 +571,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_no_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -572,7 +585,8 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(
         "DEPRECATED FUNCTION ARGUMENTS"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENTS ARE DEPRECATED: `(deprecated)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:"
         "\n%s" % (date, instructions), _fn.__doc__)
 
@@ -588,6 +602,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_varargs(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -608,6 +623,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_kwargs(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -628,6 +644,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_positional_and_named(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -653,6 +670,7 @@ class DeprecatedArgsTest(test.TestCase):
                         set(args2[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_positional_and_named_with_ok_vals(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -685,6 +703,7 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(0, mock_warning.call_count)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_deprecated_args_once(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -701,6 +720,7 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(1, mock_warning.call_count)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_deprecated_multiple_args_once_each(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -745,6 +765,7 @@ class DeprecatedArgValuesTest(test.TestCase):
       deprecation.deprecated_arg_values(date, instructions)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -767,9 +788,10 @@ class DeprecatedArgValuesTest(test.TestCase):
     # Assert function docs are properly updated.
     self.assertEqual("_fn", _fn.__name__)
     self.assertEqual(
-        "fn doc. (deprecated arguments)"
+        "fn doc. (deprecated argument values)"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENT VALUES ARE DEPRECATED: `(deprecated=True)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:\n%s"
         "\n"
         "\nArgs:"
@@ -796,6 +818,7 @@ class DeprecatedArgValuesTest(test.TestCase):
     self.assertEqual(2, mock_warning.call_count)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_one_line_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -809,9 +832,10 @@ class DeprecatedArgValuesTest(test.TestCase):
     # Assert function docs are properly updated.
     self.assertEqual("_fn", _fn.__name__)
     self.assertEqual(
-        "fn doc. (deprecated arguments)"
+        "fn doc. (deprecated argument values)"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENT VALUES ARE DEPRECATED: `(deprecated=True)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:\n%s" % (date, instructions), _fn.__doc__)
 
     # Assert calling new fn with non-deprecated value logs nothing.
@@ -830,6 +854,7 @@ class DeprecatedArgValuesTest(test.TestCase):
     self.assertEqual(2, mock_warning.call_count)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_no_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -842,9 +867,10 @@ class DeprecatedArgValuesTest(test.TestCase):
     # Assert function docs are properly updated.
     self.assertEqual("_fn", _fn.__name__)
     self.assertEqual(
-        "DEPRECATED FUNCTION ARGUMENTS"
+        "DEPRECATED FUNCTION ARGUMENT VALUES"
         "\n"
-        "\nSOME ARGUMENTS ARE DEPRECATED. They will be removed after %s."
+        "\nWarning: SOME ARGUMENT VALUES ARE DEPRECATED: `(deprecated=True)`. "
+        "They will be removed after %s."
         "\nInstructions for updating:"
         "\n%s" % (date, instructions), _fn.__doc__)
 
diff --git a/tensorflow/python/util/dispatch.py b/tensorflow/python/util/dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..e94e3345348b119bc64dd487c3c2a14603a2ce09
--- /dev/null
+++ b/tensorflow/python/util/dispatch.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Type-based dispatch for TensorFlow ops.
+
+"Operation dispatchers" can be used to override the behavior for TensorFlow ops
+when they are called with otherwise unsupported argument types.  In particular,
+when an operation is called with arguments that would cause it to raise a
+TypeError, it falls back on its registered operation dispatchers.  If any
+registered dispatchers can handle the arguments, then its result is returned.
+Otherwise, the original TypeError is raised.
+
+By default, dispatch support is added to the generated op wrappers for any
+visible ops by default.  Ops that are implemented in Python can opt in to
+dispatch support using the `add_dispatch_support` decorator.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+# Private function attribute used to store a list of dispatchers.
+DISPATCH_ATTR = "_tf_dispatchers"
+
+
+class OpDispatcher(object):
+  """Abstract base class for TensorFlow operator dispatchers.
+
+  Each operation dispatcher acts as an override handler for a single
+  TensorFlow operation, and its results are used when the handler indicates
+  that it can handle the operation's arguments (by returning any value other
+  than `OpDispatcher.NOT_SUPPORTED`).
+  """
+
+  # Sentinel value that can be returned to indicate that an operation
+  # dispatcher does not support a given set of arguments.
+  NOT_SUPPORTED = object()
+
+  def handle(self, args, kwargs):  # pylint: disable=unused-argument
+    """Handle this dispatcher's operation with the specified arguments.
+
+    If this operation dispatcher can handle the given arguments, then
+    return an appropriate value (or raise an appropriate exception).
+
+    Args:
+      args: The arguments to the operation.
+      kwargs: They keyword arguments to the operation.
+
+    Returns:
+      The result of the operation, or `OpDispatcher.NOT_SUPPORTED` if this
+      dispatcher can not handle the given arguments.
+    """
+    return self.NOT_SUPPORTED
+
+  def register(self, op):
+    """Register this dispatcher as a handler for `op`.
+
+    Args:
+      op: Python function: the TensorFlow operation that should be handled. Must
+        have a dispatch list (which is added automatically for generated ops,
+        and can be added to Python ops using the `add_dispatch_support`
+        decorator).
+    """
+    if not hasattr(op, DISPATCH_ATTR):
+      raise AssertionError("Dispatching not enabled for %s" % op)
+    getattr(op, DISPATCH_ATTR).append(self)
+
+
+def dispatch(op, *args, **kwargs):
+  """Returns the result from the first successful dispatcher for a given op.
+
+  Calls the `handle` method of each `OpDispatcher` that has been registered
+  to handle `op`, and returns the value from the first successful handler.
+
+  Args:
+    op: Python function: the operation to dispatch for.
+    *args: The arguments to the operation.
+    **kwargs: They keyword arguments to the operation.
+
+  Returns:
+    The result of the operation, or `NOT_SUPPORTED` if no registered
+    dispatcher can handle the given arguments.
+  """
+  for dispatcher in getattr(op, DISPATCH_ATTR):
+    result = dispatcher.handle(args, kwargs)
+    if result is not OpDispatcher.NOT_SUPPORTED:
+      return result
+  return OpDispatcher.NOT_SUPPORTED
+
+
+class _TypeBasedDispatcher(OpDispatcher):
+  """Dispatcher that handles op if any arguments have a specified type.
+
+  Checks the types of the arguments and keyword arguments (including elements
+  of lists or tuples), and if any argument values have the indicated type(s),
+  then delegates to an override function.
+  """
+
+  def __init__(self, override_func, types):
+    self._types = types
+    self._override_func = override_func
+
+  def _handles(self, args, kwargs):
+    for arg in itertools.chain(args, kwargs.values()):
+      if (isinstance(arg, self._types) or
+          (isinstance(arg, (list, tuple)) and
+           any(isinstance(elt, self._types) for elt in arg))):
+        return True
+    return False
+
+  def handle(self, args, kwargs):
+    if self._handles(args, kwargs):
+      return self._override_func(*args, **kwargs)
+    else:
+      return self.NOT_SUPPORTED
+
+
+# pylint: disable=g-doc-return-or-yield
+def dispatch_for_types(op, *types):
+  """Decorator to declare that a Python function overrides an op for a type.
+
+  The decorated function is used to override `op` if any of the arguments or
+  keyword arguments (including elements of lists or tuples) have one of the
+  specified types.
+
+  Example:
+
+  ```python
+  @dispatch_for_types(math_ops.add, RaggedTensor, RaggedTensorValue)
+  def ragged_add(x, y, name=None): ...
+  ```
+
+  Args:
+    op: Python function: the operation that should be overridden.
+    *types: The argument types for which this function should be used.
+  """
+
+  def decorator(func):
+    if tf_inspect.getargspec(func) != tf_inspect.getargspec(op):
+      raise AssertionError("The decorated function's signature must exactly "
+                           "match the signature of the overridden op.")
+    _TypeBasedDispatcher(func, types).register(op)
+    return func
+
+  return decorator
+
+
+# pylint: enable=g-doc-return-or-yield
+
+
+def add_dispatch_list(target):
+  """Decorator that adds a dispatch_list attribute to an op."""
+  if hasattr(target, DISPATCH_ATTR):
+    raise AssertionError("%s already has a dispatch list" % target)
+  setattr(target, DISPATCH_ATTR, [])
+  return target
+
+
+def add_dispatch_support(target):
+  """Decorator that adds a dispatch handling wrapper to an op."""
+  def wrapper(*args, **kwargs):
+    """Call target, and fall back on dispatchers if there is a TypeError."""
+    try:
+      return target(*args, **kwargs)
+    except (TypeError, ValueError):
+      # Note: convert_to_eager_tensor currently raises a ValueError, not a
+      # TypeError, when given unexpected types.  So we need to catch both.
+      result = dispatch(wrapper, *args, **kwargs)
+      if result is not OpDispatcher.NOT_SUPPORTED:
+        return result
+      else:
+        raise
+
+  add_dispatch_list(wrapper)
+  return tf_decorator.make_decorator(target, wrapper)
diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7c5c8eca8dbb8c810351291d9445404897a9d5f
--- /dev/null
+++ b/tensorflow/python/util/dispatch_test.py
@@ -0,0 +1,120 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for operator dispatch."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import googletest
+from tensorflow.python.util import dispatch
+from tensorflow.python.util.tf_export import tf_export
+
+
+class CustomTensor(object):
+  """A fake composite tensor class, for testing type-based dispatching."""
+
+  def __init__(self, tensor, score):
+    self.tensor = ops.convert_to_tensor(tensor)
+    self.score = score
+
+
+@tf_export("test_op")
+@dispatch.add_dispatch_support
+def test_op(x, y, z):
+  """A fake op for testing dispatch of Python ops."""
+  return x + (2 * y) + (3 * z)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class DispatchTest(test_util.TensorFlowTestCase):
+
+  def testAddDispatchForTypes_With_CppOp(self):
+    original_handlers = gen_math_ops.add._tf_dispatchers[:]
+
+    # Override the behavior of gen_math_ops.add.
+    @dispatch.dispatch_for_types(gen_math_ops.add, CustomTensor)
+    def custom_add(x, y, name=None):  # pylint: disable=unused-variable
+      return CustomTensor(gen_math_ops.add(x.tensor, y.tensor, name),
+                          (x.score+y.score) / 2.0)
+    self.assertEqual(len(math_ops.add._tf_dispatchers),
+                     len(original_handlers) + 1)
+
+    # Test that we see the overridden behavior when using CustomTensors.
+    x = CustomTensor([1, 2, 3], 2.0)
+    y = CustomTensor([7, 8, 2], 0.0)
+    x_plus_y = gen_math_ops.add(x, y)
+    self.assertAllEqual(self.evaluate(x_plus_y.tensor), [8, 10, 5])
+    self.assertNear(x_plus_y.score, 1.0, 0.001)
+
+    # Test that we still get the right behavior when using normal Tensors.
+    a = [1, 2, 3]
+    b = [4, 5, 6]
+    a_plus_b = gen_math_ops.add(a, b)
+    self.assertAllEqual(a_plus_b, [5, 7, 9])
+
+    # Test that we still get a TypeError or ValueError if we pass some
+    # type that's not supported by any dispatcher.
+    with self.assertRaises((TypeError, ValueError)):
+      gen_math_ops.add(a, None)
+
+    # Clean up
+    gen_math_ops.add._tf_dispatchers = original_handlers
+
+  def testAddDispatchForTypes_With_PythonOp(self):
+    original_handlers = test_op._tf_dispatchers[:]
+
+    @dispatch.dispatch_for_types(test_op, CustomTensor)
+    def override_for_test_op(x, y, z):  # pylint: disable=unused-variable
+      return CustomTensor(test_op(x.tensor, y.tensor, z.tensor),
+                          (x.score + y.score + z.score) / 3.0)
+
+    x = CustomTensor([1, 2, 3], 0.2)
+    y = CustomTensor([7, 8, 2], 0.4)
+    z = CustomTensor([0, 1, 2], 0.6)
+
+    result = test_op(x, y, z)
+    self.assertAllEqual(self.evaluate(result.tensor), [15, 21, 13])
+    self.assertNear(result.score, 0.4, 0.001)
+
+    # Clean up
+    test_op._tf_dispatchers = original_handlers
+
+  def testDispatchForTypes_SignatureMismatch(self):
+    with self.assertRaisesRegexp(AssertionError, "The decorated function's "
+                                 "signature must exactly match.*"):
+      @dispatch.dispatch_for_types(test_op, CustomTensor)
+      def override_for_test_op(a, b, c):  # pylint: disable=unused-variable
+        return CustomTensor(test_op(a.tensor, b.tensor, c.tensor),
+                            (a.score + b.score + c.score) / 3.0)
+
+  def testDispatchForTypes_OpDoesNotSupportDispatch(self):
+    def some_op(x, y):
+      return x + y
+
+    with self.assertRaisesRegexp(AssertionError, "Dispatching not enabled for"):
+      @dispatch.dispatch_for_types(some_op, CustomTensor)
+      def override_for_some_op(x, y):  # pylint: disable=unused-variable
+        return x if x.score > 0 else y
+
+
+if __name__ == "__main__":
+  googletest.main()
+
+
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index d67dbde30473f8466f443d0180f62d85f54a848b..be8b0f1949ff7655d14c81ce29d643a919176fe6 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -503,7 +503,8 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
               "The two namedtuples don't have the same sequence type. Input "
               "structure has type %s, while shallow structure has type %s."
               % (type(input_tree), type(shallow_tree)))
-      else:
+      elif not (isinstance(shallow_tree, _collections.Mapping)
+                and isinstance(input_tree, _collections.Mapping)):
         raise TypeError(
             "The two structures don't have the same sequence type. Input "
             "structure has type %s, while shallow structure has type %s."
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index e03a8daaa19b4f2a39741cbc120f6317557e8474..d0d0c5f7935ba0a4d2b867b3c6fb6bd52c7cd54a 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -482,6 +482,7 @@ class NestTest(parameterized.TestCase, test.TestCase):
     self.assertEqual(nt.a[1][::-1], rev_nt.a[1])
     self.assertEqual(nt.b[::-1], rev_nt.b)
 
+  @test_util.run_deprecated_v1
   def testMapStructureOverPlaceholders(self):
     inp_a = (array_ops.placeholder(dtypes.float32, shape=[3, 4]),
              array_ops.placeholder(dtypes.float32, shape=[3, 7]))
@@ -706,6 +707,40 @@ class NestTest(parameterized.TestCase, test.TestCase):
         name_list, data_list)
     self.assertEqual(out, ["first_4_evens", ["first_5_odds", "first_3_primes"]])
 
+    # Dicts.
+    inp_val = dict(a=2, b=3)
+    inp_ops = dict(a=dict(add=1, mul=2), b=dict(add=2, mul=3))
+    out = nest.map_structure_up_to(
+        inp_val,
+        lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
+    self.assertEqual(out["a"], 6)
+    self.assertEqual(out["b"], 15)
+
+    # Non-equal dicts.
+    inp_val = dict(a=2, b=3)
+    inp_ops = dict(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
+    with self.assertRaisesRegexp(ValueError, "same keys"):
+      nest.map_structure_up_to(
+          inp_val,
+          lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
+
+    # Dict+custom mapping.
+    inp_val = dict(a=2, b=3)
+    inp_ops = _CustomMapping(a=dict(add=1, mul=2), b=dict(add=2, mul=3))
+    out = nest.map_structure_up_to(
+        inp_val,
+        lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
+    self.assertEqual(out["a"], 6)
+    self.assertEqual(out["b"], 15)
+
+    # Non-equal dict/mapping.
+    inp_val = dict(a=2, b=3)
+    inp_ops = _CustomMapping(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
+    with self.assertRaisesRegexp(ValueError, "same keys"):
+      nest.map_structure_up_to(
+          inp_val,
+          lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
+
   def testGetTraverseShallowStructure(self):
     scalar_traverse_input = [3, 4, (1, 2, [0]), [5, 6], {"a": (7,)}, []]
     scalar_traverse_r = nest.get_traverse_shallow_structure(
diff --git a/tensorflow/python/util/protobuf/compare.py b/tensorflow/python/util/protobuf/compare.py
index a0e6bf65cf5df73af2d30e7fb4349e87bc7ae222..3a3af4bffa5ccd97ebe3626cdbe8eb2602458113 100644
--- a/tensorflow/python/util/protobuf/compare.py
+++ b/tensorflow/python/util/protobuf/compare.py
@@ -63,6 +63,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import difflib
 
 import six
 
@@ -101,10 +102,19 @@ def assertProtoEqual(self, a, b, check_initialized=True,  # pylint: disable=inva
     if normalize_numbers:
       NormalizeNumberFields(pb)
 
-  self.assertMultiLineEqual(
-      text_format.MessageToString(a, descriptor_pool=pool),
-      text_format.MessageToString(b, descriptor_pool=pool),
-      msg=msg)
+  a_str = text_format.MessageToString(a, descriptor_pool=pool)
+  b_str = text_format.MessageToString(b, descriptor_pool=pool)
+
+  # Some Python versions would perform regular diff instead of multi-line
+  # diff if string is longer than 2**16. We substitute this behavior
+  # with a call to unified_diff instead to have easier-to-read diffs.
+  # For context, see: https://bugs.python.org/issue11763.
+  if len(a_str) < 2**16 and len(b_str) < 2**16:
+    self.assertMultiLineEqual(a_str, b_str, msg=msg)
+  else:
+    diff = '\n' + ''.join(difflib.unified_diff(a_str.splitlines(True),
+                                               b_str.splitlines(True)))
+    self.fail('%s : %s' % (msg, diff))
 
 
 def NormalizeNumberFields(pb):
diff --git a/tensorflow/python/util/py_checkpoint_reader.i b/tensorflow/python/util/py_checkpoint_reader.i
index 1c73f7f06f1937a8db0bd858421c2e884892e25b..a1b98a2a75991ee8555c3d3de3aca826fba07a7e 100644
--- a/tensorflow/python/util/py_checkpoint_reader.i
+++ b/tensorflow/python/util/py_checkpoint_reader.i
@@ -165,7 +165,6 @@ def NewCheckpointReader(filepattern):
     from tensorflow.python.util import compat
     return CheckpointReader(compat.as_bytes(filepattern), status)
 
-NewCheckpointReader._tf_api_names = ['train.NewCheckpointReader']
 NewCheckpointReader._tf_api_names_v1 = ['train.NewCheckpointReader']
 %}
 
diff --git a/tensorflow/python/util/serialization.py b/tensorflow/python/util/serialization.py
index faf5164faa7f1c2d085e13358c8266c4eef22524..cff864c0304b02aaa6339efb403388c65ab6fec4 100644
--- a/tensorflow/python/util/serialization.py
+++ b/tensorflow/python/util/serialization.py
@@ -43,7 +43,7 @@ def get_json_type(obj):
   # if obj is any numpy type
   if type(obj).__module__ == np.__name__:
     if isinstance(obj, np.ndarray):
-      return {'type': type(obj), 'value': obj.tolist()}
+      return obj.tolist()
     else:
       return obj.item()
 
diff --git a/tensorflow/python/util/tf_decorator.py b/tensorflow/python/util/tf_decorator.py
index 3d837a40449ece056c154e1b09636a8885047035..0cfc836246d2d885c28d168fe90b08a325cf6ded 100644
--- a/tensorflow/python/util/tf_decorator.py
+++ b/tensorflow/python/util/tf_decorator.py
@@ -101,6 +101,55 @@ def make_decorator(target,
   return decorator_func
 
 
+def rewrap(decorator_func, previous_target, new_target):
+  """Injects a new target into a function built by make_decorator.
+
+  This function allows replacing a function wrapped by `decorator_func`,
+  assuming the decorator that wraps the function is written as described below.
+
+  The decorator function must use `<decorator name>.__wrapped__` instead of the
+  wrapped function that is normally used:
+
+  Example:
+
+      # Instead of this:
+      def simple_parametrized_wrapper(*args, **kwds):
+        return wrapped_fn(*args, **kwds)
+
+      tf_decorator.make_decorator(simple_parametrized_wrapper, wrapped_fn)
+
+      # Write this:
+      def simple_parametrized_wrapper(*args, **kwds):
+        return simple_parametrized_wrapper.__wrapped__(*args, **kwds)
+
+      tf_decorator.make_decorator(simple_parametrized_wrapper, wrapped_fn)
+
+  Note that this process modifies decorator_func.
+
+  Args:
+    decorator_func: Callable returned by `wrap`.
+    previous_target: Callable that needs to be replaced.
+    new_target: Callable to replace previous_target with.
+  """
+  # Because the process mutates the decorator, we only need to alter the
+  # innermost function that wraps previous_target.
+  cur = decorator_func
+  innermost_decorator = None
+  target = None
+  while hasattr(cur, '_tf_decorator'):
+    innermost_decorator = cur
+    target = getattr(cur, '_tf_decorator')
+    if target.decorated_target is previous_target:
+      break
+    cur = target.decorated_target
+
+  if innermost_decorator is None:
+    return
+
+  target.decorated_target = new_target
+  innermost_decorator.__wrapped__ = new_target
+
+
 def unwrap(maybe_tf_decorator):
   """Unwraps an object into a list of TFDecorators and a final target.
 
@@ -163,6 +212,10 @@ class TFDecorator(object):
   def decorated_target(self):
     return self._decorated_target
 
+  @decorated_target.setter
+  def decorated_target(self, decorated_target):
+    self._decorated_target = decorated_target
+
   @property
   def decorator_name(self):
     return self._decorator_name
diff --git a/tensorflow/python/util/tf_decorator_test.py b/tensorflow/python/util/tf_decorator_test.py
index 0f9712c987d442358ecb4f81f46ef0898e380b01..9198f0b3fad1590bedac71b30cf332e35cb489fe 100644
--- a/tensorflow/python/util/tf_decorator_test.py
+++ b/tensorflow/python/util/tf_decorator_test.py
@@ -52,6 +52,22 @@ def test_decorator_increment_first_int_arg(target):
   return tf_decorator.make_decorator(target, wrapper)
 
 
+def test_injectable_decorator_square(target):
+
+  def wrapper(x):
+    return wrapper.__wrapped__(x)**2
+
+  return tf_decorator.make_decorator(target, wrapper)
+
+
+def test_injectable_decorator_increment(target):
+
+  def wrapper(x):
+    return wrapper.__wrapped__(x) + 1
+
+  return tf_decorator.make_decorator(target, wrapper)
+
+
 def test_function(x):
   """Test Function Docstring."""
   return x + 1
@@ -65,6 +81,12 @@ def test_decorated_function(x):
   return x * 2
 
 
+@test_injectable_decorator_square
+@test_injectable_decorator_increment
+def test_rewrappable_decorated(x):
+  return x * 2
+
+
 @test_tfdecorator('decorator')
 class TestDecoratedClass(object):
   """Test Decorated Class."""
@@ -215,6 +237,30 @@ class TfMakeDecoratorTest(test.TestCase):
     _ = tf_decorator.make_decorator(partial, test_wrapper)
 
 
+class TfDecoratorRewrapTest(test.TestCase):
+
+  def testRewrapMutatesAffectedFunction(self):
+
+    def new_target(x):
+      return x * 3
+
+    self.assertEqual((1 * 2 + 1) ** 2, test_rewrappable_decorated(1))
+    prev_target, _ = tf_decorator.unwrap(test_rewrappable_decorated)
+    tf_decorator.rewrap(test_rewrappable_decorated, prev_target, new_target)
+    self.assertEqual((1 * 3 + 1) ** 2, test_rewrappable_decorated(1))
+
+  def testRewrapOfDecoratorFunction(self):
+
+    def new_target(x):
+      return x * 3
+
+    prev_target = test_rewrappable_decorated._tf_decorator._decorated_target
+    # In this case, only the outer decorator (test_injectable_decorator_square)
+    # should be preserved.
+    tf_decorator.rewrap(test_rewrappable_decorated, prev_target, new_target)
+    self.assertEqual((1 * 3) ** 2, test_rewrappable_decorated(1))
+
+
 class TfDecoratorUnwrapTest(test.TestCase):
 
   def testUnwrapReturnsEmptyArrayForUndecoratedFunction(self):
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index a5ac430ce7e08c22ab44b3d86499964f547ad306..ec70cae7d2fc00f793e8ffa0aec331e32e11115f 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -34,7 +34,7 @@ tf_export('foo', 'bar.foo')(foo)
 Exporting a constant
 ```python
 foo = 1
-tf_export("consts.foo").export_constant(__name__, 'foo')
+tf_export('consts.foo').export_constant(__name__, 'foo')
 ```
 """
 from __future__ import absolute_import
@@ -50,6 +50,10 @@ from tensorflow.python.util import tf_decorator
 ESTIMATOR_API_NAME = 'estimator'
 TENSORFLOW_API_NAME = 'tensorflow'
 
+# List of subpackage names used by TensorFlow components. Have to check that
+# TensorFlow core repo does not export any symbols under these names.
+SUBPACKAGE_NAMESPACES = [ESTIMATOR_API_NAME]
+
 _Attributes = collections.namedtuple(
     'ExportedApiAttributes', ['names', 'constants'])
 
@@ -78,14 +82,21 @@ class SymbolAlreadyExposedError(Exception):
   pass
 
 
-def get_canonical_name_for_symbol(symbol, api_name=TENSORFLOW_API_NAME):
-  """Get canonical name for the API symbol.
+class InvalidSymbolNameError(Exception):
+  """Raised when trying to export symbol as an invalid or unallowed name."""
+  pass
+
 
-  Canonical name is the first non-deprecated endpoint name.
+def get_canonical_name_for_symbol(
+    symbol, api_name=TENSORFLOW_API_NAME,
+    add_prefix_to_v1_names=False):
+  """Get canonical name for the API symbol.
 
   Args:
     symbol: API function or class.
     api_name: API name (tensorflow or estimator).
+    add_prefix_to_v1_names: Specifies whether a name available only in V1
+      should be prefixed with compat.v1.
 
   Returns:
     Canonical name for the API symbol (for e.g. initializers.zeros) if
@@ -98,26 +109,42 @@ def get_canonical_name_for_symbol(symbol, api_name=TENSORFLOW_API_NAME):
   if api_names_attr not in undecorated_symbol.__dict__:
     return None
   api_names = getattr(undecorated_symbol, api_names_attr)
-  # TODO(annarev): may be add a separate deprecated attribute
-  # for estimator names.
   deprecated_api_names = undecorated_symbol.__dict__.get(
       '_tf_deprecated_api_names', [])
-  return get_canonical_name(api_names, deprecated_api_names)
+
+  canonical_name = get_canonical_name(api_names, deprecated_api_names)
+  if canonical_name:
+    return canonical_name
+
+  # If there is no V2 canonical name, get V1 canonical name.
+  api_names_attr = API_ATTRS_V1[api_name].names
+  api_names = getattr(undecorated_symbol, api_names_attr)
+  v1_canonical_name = get_canonical_name(api_names, deprecated_api_names)
+  if add_prefix_to_v1_names:
+    return 'compat.v1.%s' % v1_canonical_name
+  return v1_canonical_name
 
 
 def get_canonical_name(api_names, deprecated_api_names):
-  """Get first non-deprecated endpoint name.
+  """Get preferred endpoint name.
 
   Args:
     api_names: API names iterable.
     deprecated_api_names: Deprecated API names iterable.
   Returns:
-    Canonical name if there is at least one non-deprecated endpoint.
-    Otherwise returns None.
+    Returns one of the following in decreasing preference:
+    - first non-deprecated endpoint
+    - first endpoint
+    - None
   """
-  return next(
+  non_deprecated_name = next(
       (name for name in api_names if name not in deprecated_api_names),
       None)
+  if non_deprecated_name:
+    return non_deprecated_name
+  if api_names:
+    return api_names[0]
+  return None
 
 
 class api_export(object):  # pylint: disable=invalid-name
@@ -145,6 +172,37 @@ class api_export(object):  # pylint: disable=invalid-name
     self._overrides = kwargs.get('overrides', [])
     self._allow_multiple_exports = kwargs.get('allow_multiple_exports', False)
 
+    self._validate_symbol_names()
+
+  def _validate_symbol_names(self):
+    """Validate you are exporting symbols under an allowed package.
+
+    We need to ensure things exported by tf_export, estimator_export, etc.
+    export symbols under disjoint top-level package names.
+
+    For TensorFlow, we check that it does not export anything under subpackage
+    names used by components (estimator, keras, etc.).
+
+    For each component, we check that it exports everything under its own
+    subpackage.
+
+    Raises:
+      InvalidSymbolNameError: If you try to export symbol under disallowed name.
+    """
+    all_symbol_names = set(self._names) | set(self._names_v1)
+    if self._api_name == TENSORFLOW_API_NAME:
+      for subpackage in SUBPACKAGE_NAMESPACES:
+        if any(n.startswith(subpackage) for n in all_symbol_names):
+          raise InvalidSymbolNameError(
+              '@tf_export is not allowed to export symbols under %s.*' % (
+                  subpackage))
+    else:
+      if not all(n.startswith(self._api_name) for n in all_symbol_names):
+        raise InvalidSymbolNameError(
+            'Can only export symbols under package name of component. '
+            'e.g. tensorflow_estimator must export all symbols under '
+            'tf.estimator')
+
   def __call__(self, func):
     """Calls this decorator.
 
@@ -217,5 +275,4 @@ class api_export(object):  # pylint: disable=invalid-name
 
 
 tf_export = functools.partial(api_export, api_name=TENSORFLOW_API_NAME)
-estimator_export = functools.partial(
-    api_export, api_name=ESTIMATOR_API_NAME, allow_multiple_exports=True)
+estimator_export = functools.partial(api_export, api_name=ESTIMATOR_API_NAME)
diff --git a/tensorflow/python/util/tf_export_test.py b/tensorflow/python/util/tf_export_test.py
index 4ae1dc55e06b434aeb4a95e2ca9aa68e4eef56de..a0fac8bf362627e6802821e3b33c0f107c5c97ce 100644
--- a/tensorflow/python/util/tf_export_test.py
+++ b/tensorflow/python/util/tf_export_test.py
@@ -130,6 +130,26 @@ class ValidateExportTest(test.TestCase):
     with self.assertRaises(tf_export.SymbolAlreadyExposedError):
       export_decorator(_test_function)
 
+  def testRaisesExceptionIfInvalidSymbolName(self):
+    # TensorFlow code is not allowed to export symbols under package
+    # tf.estimator
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.tf_export('estimator.invalid')
+
+    # All symbols exported by Estimator must be under tf.estimator package.
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('invalid')
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('Estimator.invalid')
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('invalid.estimator')
+
+  def testRaisesExceptionIfInvalidV1SymbolName(self):
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.tf_export('valid', v1=['estimator.invalid'])
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('estimator.valid', v1=['invalid'])
+
   def testOverridesFunction(self):
     _test_function2._tf_api_names = ['abc']
 
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 444e44eaf141dadab7c1fab5d6091ec632d4bcc3..5f1e776640df3e2b75e6a0b8accfce40098cf36c 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -352,6 +352,11 @@ def isfunction(object):  # pylint: disable=redefined-builtin
   return _inspect.isfunction(tf_decorator.unwrap(object)[1])
 
 
+def isframe(object):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.ismodule."""
+  return _inspect.isframe(tf_decorator.unwrap(object)[1])
+
+
 def isgenerator(object):  # pylint: disable=redefined-builtin
   """TFDecorator-aware replacement for inspect.isgenerator."""
   return _inspect.isgenerator(tf_decorator.unwrap(object)[1])
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index ca6710bcf2178db0fcf63c9bdfdf27531651f7ed..63de4a7a96c162f38aa3cba1512cc639df09adcf 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -23,6 +23,7 @@ import traceback
 
 import six  # pylint: disable=unused-import
 
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import tf_decorator
 # pylint: enable=g-bad-import-order,g-import-not-at-top
@@ -32,7 +33,8 @@ class _TFShouldUseHelper(object):
   """Object stored in TFShouldUse-wrapped objects.
 
   When it is deleted it will emit a warning or error if its `sate` method
-  has not been called by time of deletion.
+  has not been called by time of deletion, and Tensorflow is not executing
+  eagerly outside of functions.
   """
 
   def __init__(self, type_, repr_, stack_frame, fatal_error_if_unsated):
@@ -50,6 +52,8 @@ class _TFShouldUseHelper(object):
     self._logging_module = None
 
   def __del__(self):
+    if ops.executing_eagerly_outside_functions():
+      return
     if self._sated:
       return
     if self._fatal_error_if_unsated:
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
index fedbe1dff6a7bd6e2524355e9946a99fa740f597..65d848cf2a530593857cd912f92a77983d35099b 100644
--- a/tensorflow/python/util/tf_should_use_test.py
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -24,6 +24,7 @@ import gc
 import sys
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import tf_should_use
@@ -39,6 +40,7 @@ def reroute_error():
 
 class TfShouldUseTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAddShouldUseWarningWhenNotUsed(self):
     c = constant_op.constant(0, name='blah0')
     def in_this_function():
@@ -52,6 +54,7 @@ class TfShouldUseTest(test.TestCase):
     self.assertIn('in_this_function', msg)
     self.assertFalse(gc.garbage)
 
+  @test_util.run_deprecated_v1
   def testAddShouldUseFatalWhenNotUsed(self):
     c = constant_op.constant(0, name='blah0')
     def in_this_function():
@@ -74,6 +77,7 @@ class TfShouldUseTest(test.TestCase):
     error.assert_not_called()
     fatal.assert_not_called()
 
+  @test_util.run_deprecated_v1
   def testAddShouldUseWarningWhenUsedWithAdd(self):
     def add(h):
       _ = h + 1
@@ -81,6 +85,7 @@ class TfShouldUseTest(test.TestCase):
     gc.collect()
     self.assertFalse(gc.garbage)
 
+  @test_util.run_deprecated_v1
   def testAddShouldUseWarningWhenUsedWithGetName(self):
     def get_name(h):
       _ = h.name
@@ -88,6 +93,7 @@ class TfShouldUseTest(test.TestCase):
     gc.collect()
     self.assertFalse(gc.garbage)
 
+  @test_util.run_deprecated_v1
   def testShouldUseResult(self):
     @tf_should_use.should_use_result
     def return_const(value):
@@ -101,6 +107,7 @@ class TfShouldUseTest(test.TestCase):
     gc.collect()
     self.assertFalse(gc.garbage)
 
+  @test_util.run_deprecated_v1
   def testShouldUseResultWhenNotReallyUsed(self):
     @tf_should_use.should_use_result
     def return_const(value):
@@ -111,7 +118,7 @@ class TfShouldUseTest(test.TestCase):
         # Creating another op and executing it does not mark the
         # unused op as being "used".
         v = constant_op.constant(1.0, name='meh')
-        v.eval()
+        self.evaluate(v)
     msg = '\n'.join(error.call_args[0])
     self.assertIn('Object was never used', msg)
     self.assertIn('blah3:0', msg)
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 7b3e618e84894194a299c9e6388dffe4356be72d..e69eec73a0ef8b37f042d9a0f5bf63569b6f5b39 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -172,7 +172,7 @@ class CachedTypeCheck {
     auto* type = Py_TYPE(o);
 
     {
-      mutex_lock l(type_to_sequence_map_mu_);
+      tf_shared_lock l(type_to_sequence_map_mu_);
       auto it = type_to_sequence_map_.find(type);
       if (it != type_to_sequence_map_.end()) {
         return it->second;
@@ -195,7 +195,12 @@ class CachedTypeCheck {
       mutex_lock l(type_to_sequence_map_mu_);
       if (type_to_sequence_map_.size() < kMaxItemsInCache) {
         Py_INCREF(type);
-        type_to_sequence_map_.insert({type, check_result});
+        auto insert_result = type_to_sequence_map_.insert({type, check_result});
+        if (!insert_result.second) {
+          // The type was added to the cache by a concurrent thread after we
+          // looked it up above.
+          Py_DECREF(type);
+        }
       }
     }
 
@@ -825,18 +830,16 @@ PyObject* IsNamedtuple(PyObject* o, bool strict) {
 }
 
 PyObject* SameNamedtuples(PyObject* o1, PyObject* o2) {
-  PyObject* f1 = PyObject_GetAttrString(o1, "_fields");
-  PyObject* f2 = PyObject_GetAttrString(o2, "_fields");
+  Safe_PyObjectPtr f1 = make_safe(PyObject_GetAttrString(o1, "_fields"));
+  Safe_PyObjectPtr f2 = make_safe(PyObject_GetAttrString(o2, "_fields"));
   if (f1 == nullptr || f2 == nullptr) {
-    Py_XDECREF(f1);
-    Py_XDECREF(f2);
     PyErr_SetString(
         PyExc_RuntimeError,
         "Expected namedtuple-like objects (that have _fields attr)");
     return nullptr;
   }
 
-  if (PyObject_RichCompareBool(f1, f2, Py_NE)) {
+  if (PyObject_RichCompareBool(f1.get(), f2.get(), Py_NE)) {
     Py_RETURN_FALSE;
   }
 
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index 3c0ec87fa4e25041aa2d626c180ace34cc788560..4d34d61eee65ea48ad4fbb2894699695110fc76c 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -31,6 +31,9 @@ limitations under the License.
 %unignore tensorflow::swig::RegisterType;
 %noexception tensorflow::swig::RegisterType;
 
+%unignore tensorflow::swig::IsTensor;
+%noexception tensorflow::swig::IsTensor;
+
 %feature("docstring") tensorflow::swig::IsSequence
 """Returns a true if its input is a collections.Sequence (except strings).
 
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index d4d97087ba48087acf2313ca16fa2144bca649be..4c764a7b099010a980c007c5cdff7f20f7ba2106 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -1,6 +1,8 @@
 licenses(["restricted"])
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 
@@ -13,6 +15,14 @@ STREAM_EXECUTOR_HEADERS = glob([
     "platform/**/*.h",
 ])
 
+tf_proto_library(
+    name = "dnn_proto",
+    srcs = ["dnn.proto"],
+    cc_api_version = 2,
+    default_header = True,
+    protodeps = tf_additional_all_protos(),
+)
+
 cc_library(
     name = "stream_executor_impl",
     srcs = glob(
@@ -35,8 +45,13 @@ cc_library(
     }),
     visibility = ["//visibility:public"],
     deps = [
+        ":dnn_proto_cc_impl",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
         "@local_config_cuda//cuda:cuda_headers",
     ],
     alwayslink = 1,
@@ -47,8 +62,10 @@ cc_library(
     hdrs = STREAM_EXECUTOR_HEADERS,
     visibility = ["//visibility:public"],
     deps = [
+        ":dnn_proto_cc",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
+        "@com_google_absl//absl/strings",
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_static([":stream_executor_impl"]),
 )
@@ -91,11 +108,8 @@ cc_library(
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_cuda_is_configured([
         "//tensorflow/core:cuda",
-        "@local_config_cuda//cuda:cublas",
         "@local_config_cuda//cuda:cuda_driver",
         "@local_config_cuda//cuda:cudnn",
-        "@local_config_cuda//cuda:cufft",
-        "@local_config_cuda//cuda:curand",
     ]),
     alwayslink = 1,
 )
diff --git a/tensorflow/stream_executor/blas.cc b/tensorflow/stream_executor/blas.cc
index 906d6fb7020ce35adb1438d394b34983c332f182..9b8fdf1efe827209eacc8d646358350c240a91c6 100644
--- a/tensorflow/stream_executor/blas.cc
+++ b/tensorflow/stream_executor/blas.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/blas.h"
 
-#include "tensorflow/stream_executor/lib/strcat.h"
+#include "absl/strings/str_cat.h"
 
 namespace stream_executor {
 namespace blas {
@@ -68,7 +68,7 @@ string SideString(Side s) {
 
 // -- AlgorithmConfig
 
-string AlgorithmConfig::ToString() const { return port::StrCat(algorithm_); }
+string AlgorithmConfig::ToString() const { return absl::StrCat(algorithm_); }
 
 string ComputationTypeString(ComputationType ty) {
   switch (ty) {
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index ab7091b3f54727874097f3887cfb63376ed34c9a..957f6c98da564500f81d7185ce6a151003549ee5 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include <assert.h>
 #include <complex>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
@@ -57,11 +58,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
 #include "tensorflow/stream_executor/device_memory.h"
+
+#ifndef PLATFORM_GOOGLE
+#include "tensorflow/stream_executor/dso_loader.h"
+#endif
+
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
@@ -76,21 +81,8 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
 
 namespace wrap {
 
-#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
-  struct WrapperShim__##__name {                                    \
-    static const char *kName;                                       \
-    template <typename... Args>                                     \
-    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};              \
-      return ::__name(args...);                                     \
-    }                                                               \
-  } __name;                                                         \
-  const char *WrapperShim__##__name::kName = #__name;
-
-#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
-  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
-
-#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
+// clang-format off
+#define CUBLAS_ROUTINE_EACH(__macro)      \
   __macro(cublasSnrm2)                    \
   __macro(cublasDnrm2)                    \
   __macro(cublasScnrm2)                   \
@@ -262,6 +254,58 @@ namespace wrap {
   __macro(cublasCdgmm)                    \
   __macro(cublasZdgmm)
 
+// clang-format off
+
+#ifdef PLATFORM_GOOGLE
+#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
+  struct WrapperShim__##__name {                                    \
+    static const char *kName;                                       \
+    template <typename... Args>                                     \
+    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
+      cuda::ScopedActivateExecutorContext sac{parent};              \
+      return ::__name(args...);                                     \
+    }                                                               \
+  } __name;                                                         \
+  const char *WrapperShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
+
+#else
+
+#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCublasDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in cublas DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    cublasStatus_t operator()(CUDAExecutor* parent, Args... args) {       \
+      cuda::ScopedActivateExecutorContext sac{parent};                    \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
+
+#endif
+
 STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasCreate)
 STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasDestroy)
 STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetStream)
@@ -271,7 +315,7 @@ STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmBatched)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasDgemmBatched)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasCgemmBatched)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasZgemmBatched)
-CUBLAS_BLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
+CUBLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
 
 #if CUDA_VERSION >= 7050
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmEx)
@@ -322,7 +366,7 @@ static string ToString(cublasStatus_t status) {
       return "CUBLAS_STATUS_LICENSE_ERROR";
 #endif
     default:
-      return port::StrCat("<invalid cublas status: ", status, ">");
+      return absl::StrCat("<invalid cublas status: ", status, ">");
   }
 }
 
@@ -424,7 +468,8 @@ class ScopedCublasMathMode {
   // Note that when false is returned, an appropriate error has already been
   // logged.
   bool Init(cublasMath_t new_mode) {
-    cublasStatus_t ret = wrap::cublasGetMathMode(parent_, handle_, &old_mode_);
+    cublasStatus_t ret =
+        wrap::cublasGetMathMode(parent_, handle_, &old_mode_);
     if (ret != CUBLAS_STATUS_SUCCESS) {
       LOG(ERROR) << "failed to get old cublas math mode: " << ToString(ret);
       return ok_ = false;
@@ -442,7 +487,8 @@ class ScopedCublasMathMode {
   // successful in the first place.
   ~ScopedCublasMathMode() {
     if (ok_) {
-      cublasStatus_t ret = wrap::cublasSetMathMode(parent_, handle_, old_mode_);
+      cublasStatus_t ret =
+          wrap::cublasSetMathMode(parent_, handle_, old_mode_);
       if (ret != CUBLAS_STATUS_SUCCESS) {
         LOG(ERROR) << "failed to set former cublas math mode: "
                    << ToString(ret);
@@ -675,16 +721,16 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
-      wrap::cublasScasum, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasScasum, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
-      wrap::cublasDzasum, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasDzasum, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
@@ -835,16 +881,16 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
-      wrap::cublasScnrm2, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasScnrm2, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
-      wrap::cublasDznrm2, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasDznrm2, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -1060,48 +1106,48 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<float>> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIcamax, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIcamax, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<double>> &x,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIzamax, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIzamax, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<float> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIsamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIsamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<double> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIdamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIdamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<float>> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIcamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIcamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<double>> &x,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIzamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIzamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 42b3fde5b0816f7277cb5d08902af0145a0852aa..0fb05089d7530aa298a332e4e6c714eddd7799e9 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -22,7 +22,6 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/host_or_device_scalar.h"
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/thread_annotations.h"
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 124d5905b91cbf839437e763728cc76ad0d671dc..6af71b6c9d194182e79decd3f1beeb96d8141974 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -39,17 +39,15 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/stream_executor/lib/process_state.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/numbers.h"
+#include "tensorflow/stream_executor/lib/process_state.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/lib/numbers.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
 
 namespace stream_executor {
 namespace cuda {
@@ -117,7 +115,7 @@ port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
 // -- class Diagnostician
 
 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
-  return port::StrCat("/dev/nvidia", dev_node_ordinal);
+  return absl::StrCat("/dev/nvidia", dev_node_ordinal);
 }
 
 void Diagnostician::LogDiagnosticInformation() {
@@ -282,7 +280,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
   if (offset == string::npos) {
     return port::Status(
         port::error::NOT_FOUND,
-        port::StrCat("could not find kernel module information in "
+        absl::StrCat("could not find kernel module information in "
                      "driver version file contents: \"",
                      driver_version_file_contents, "\""));
   }
@@ -345,7 +343,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
   CFRelease(kext_infos);
   auto status = port::Status(
       port::error::INTERNAL,
-      port::StrCat(
+      absl::StrCat(
           "failed to read driver bundle version: ",
           CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8)));
   return status;
@@ -359,12 +357,12 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
   if (driver_version_file == nullptr) {
     return port::Status(
         port::error::PERMISSION_DENIED,
-        port::StrCat("could not open driver version path for reading: ",
+        absl::StrCat("could not open driver version path for reading: ",
                      kDriverVersionPath));
   }
 
   static const int kContentsSize = 1024;
-  port::InlinedVector<char, 4> contents(kContentsSize);
+  absl::InlinedVector<char, 4> contents(kContentsSize);
   size_t retcode =
       fread(contents.begin(), 1, kContentsSize - 2, driver_version_file);
   if (retcode < kContentsSize - 1) {
@@ -381,7 +379,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
 
   auto status = port::Status(
       port::error::INTERNAL,
-      port::StrCat(
+      absl::StrCat(
           "failed to read driver version file contents: ", kDriverVersionPath,
           "; ferror: ", ferror(driver_version_file)));
   fclose(driver_version_file);
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index df8538a4b842995a0a5ce4ef71573ebde1f2e844..1f2e2f48bbddf5f638135129e502cfe233d5952f 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
+#include "absl/strings/str_cat.h"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -36,8 +37,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/mathutil.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
@@ -46,6 +45,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 // clang-format off
 #include "cuda/include/cudnn.h"
+#include "absl/strings/string_view.h"
 // clang-format on
 
 namespace stream_executor {
@@ -127,48 +127,11 @@ string ToString(cudnnStatus_t status) {
       return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
 #endif
     default:
-      return port::StrCat("<unknown cudnn status: ", static_cast<int>(status),
+      return absl::StrCat("<unknown cudnn status: ", static_cast<int>(status),
                           ">");
   }
 }
 
-template <typename T>
-cudnnDataType_t GetCudnnDataType(
-    dnn::DataLayout = dnn::DataLayout::kBatchDepthYX);
-
-template <>
-cudnnDataType_t GetCudnnDataType<double>(dnn::DataLayout) {
-  return CUDNN_DATA_DOUBLE;
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<float>(dnn::DataLayout) {
-  return CUDNN_DATA_FLOAT;
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<Eigen::half>(dnn::DataLayout) {
-  return CUDNN_DATA_HALF;
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<int8>(dnn::DataLayout layout) {
-  switch (layout) {
-    case dnn::DataLayout::kYXDepthBatch:
-    case dnn::DataLayout::kYXBatchDepth:
-    case dnn::DataLayout::kBatchYXDepth:
-    case dnn::DataLayout::kBatchDepthYX:
-      return CUDNN_DATA_INT8;
-    case dnn::DataLayout::kBatchDepthYX4:
-      return CUDNN_DATA_INT8x4;
-  }
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<int32>(dnn::DataLayout) {
-  return CUDNN_DATA_INT32;
-}
-
 // RAII wrapper for all calls to cuDNN with a cuDNN handle argument.
 //
 // See CudnnAccess::GetHandle() for details.
@@ -313,20 +276,18 @@ port::StatusOr<int> GetCudnnProperty(libraryPropertyType type) {
   return value;
 }
 
-cudnnRNNAlgo_t ToCudnnRNNAlgo(const dnn::AlgorithmDesc& algorithm) {
-  if (algorithm.is_default()) {
+cudnnRNNAlgo_t ToCudnnRNNAlgo(absl::optional<dnn::AlgorithmDesc> algorithm) {
+  if (!algorithm.has_value()) {
     return CUDNN_RNN_ALGO_STANDARD;
-  } else {
-    cudnnRNNAlgo_t algo = static_cast<cudnnRNNAlgo_t>(algorithm.algo_id());
-    switch (algo) {
-      case CUDNN_RNN_ALGO_STANDARD:
-      case CUDNN_RNN_ALGO_PERSIST_STATIC:
-      case CUDNN_RNN_ALGO_PERSIST_DYNAMIC:
-        return algo;
-      default:
-        LOG(FATAL) << "Unsupported Cudnn RNN algorithm: "
-                   << algorithm.algo_id();
-    }
+  }
+  cudnnRNNAlgo_t algo = static_cast<cudnnRNNAlgo_t>(algorithm->algo_id());
+  switch (algo) {
+    case CUDNN_RNN_ALGO_STANDARD:
+    case CUDNN_RNN_ALGO_PERSIST_STATIC:
+    case CUDNN_RNN_ALGO_PERSIST_DYNAMIC:
+      return algo;
+    default:
+      LOG(FATAL) << "Unsupported Cudnn RNN algorithm: " << algorithm->algo_id();
   }
 }
 
@@ -351,7 +312,7 @@ port::Status CudnnSupport::Init() {
     CudnnVersion loaded_version;
     TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&loaded_version));
     if (!IsSourceCompatibleWithCudnnLibrary(source_version, loaded_version)) {
-      const tensorflow::string error = port::StrCat(
+      const tensorflow::string error = absl::StrCat(
           "Loaded runtime CuDNN library: ", loaded_version.ToString(),
           " but source was compiled with: ", source_version.ToString(),
           ".  CuDNN library major and minor version needs to match or have "
@@ -384,7 +345,7 @@ port::Status CudnnSupport::Init() {
   }
 
   return port::Status(port::error::INTERNAL,
-                      port::StrCat("cudnn library could not create a handle: ",
+                      absl::StrCat("cudnn library could not create a handle: ",
                                    ToString(status)));
 }
 
@@ -505,13 +466,13 @@ RnnDescriptor CreateRnnDescriptor() {
   CHECK_CUDNN_OK(cudnnCreateRNNDescriptor(&result));
   return RnnDescriptor(result);
 }
-PersistentRnnPlan CreatePersistentRnnPlan(cudnnRNNDescriptor_t rnn_desc,
-                                          int batch_size,
-                                          cudnnDataType_t data_type) {
+
+port::StatusOr<PersistentRnnPlan> CreatePersistentRnnPlan(
+    cudnnRNNDescriptor_t rnn_desc, int batch_size, cudnnDataType_t data_type) {
   cudnnPersistentRNNPlan_t result;
-  CHECK_CUDNN_OK(
+  RETURN_IF_CUDNN_ERROR(
       cudnnCreatePersistentRNNPlan(rnn_desc, batch_size, data_type, &result));
-  return PersistentRnnPlan(result);
+  return port::StatusOr<PersistentRnnPlan>(PersistentRnnPlan(result));
 }
 
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a
@@ -595,7 +556,8 @@ class CudnnFilterDescriptor {
     std::vector<int> dims(2 + filter_descriptor.ndims());
     dims[0] = filter_descriptor.output_feature_map_count();
     dims[1] = filter_descriptor.input_feature_map_count();
-    const auto& spatial_dims = filter_descriptor.input_filter_dims();
+    absl::Span<const int64> spatial_dims =
+        filter_descriptor.input_filter_dims();
     std::copy(spatial_dims.begin(), spatial_dims.end(), dims.begin() + 2);
 
     CHECK_CUDNN_OK(cudnnSetFilterNdDescriptor(handle_.get(), elem_type, format,
@@ -663,9 +625,9 @@ class CudnnConvolutionDescriptor {
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       cudnnDataType_t data_type)
       : handle_(CreateConvolutionDescriptor()) {
-    const auto& strides64 = convolution_descriptor.strides();
-    const auto& padding64 = convolution_descriptor.padding();
-    const auto& dilations64 = convolution_descriptor.dilations();
+    absl::Span<const int64> strides64 = convolution_descriptor.strides();
+    absl::Span<const int64> padding64 = convolution_descriptor.padding();
+    absl::Span<const int64> dilations64 = convolution_descriptor.dilations();
     CHECK_NE(convolution_descriptor.pad_alignment(),
              dnn::PadAlignment::kTensorFlowPadding)
         << "TensorFlow padding alignment is not supported.";
@@ -686,10 +648,10 @@ class CudnnConvolutionDescriptor {
     CHECK_CUDNN_OK(cudnnSetConvolutionNdDescriptor(
         handle_.get(), convolution_descriptor.ndims(), padding.data(),
         strides.data(), dilations.data(),
-        // NOTE(keveman): cuDNN supports convolution and cross correlation.
-        // However, almost all the use cases do cross correlation, so just
-        // hard coding it here.
-        CUDNN_CROSS_CORRELATION, data_type));
+        convolution_descriptor.convolution_not_crosscorr()
+            ? CUDNN_CONVOLUTION
+            : CUDNN_CROSS_CORRELATION,
+        data_type));
 
     // NOTE(benbarsdell): This only applies if tensor op math is enabled
     //                      and algo selection is set to Default.
@@ -731,9 +693,9 @@ class CudnnPoolingDescriptor {
   explicit CudnnPoolingDescriptor(
       const dnn::PoolingDescriptor& pooling_descriptor)
       : handle_(CreatePoolingDescriptor()) {
-    const std::vector<int64> strides64 = pooling_descriptor.strides();
-    const std::vector<int64> padding64 = pooling_descriptor.padding();
-    const std::vector<int64> shape64 = pooling_descriptor.window();
+    absl::Span<const int64> strides64 = pooling_descriptor.strides();
+    absl::Span<const int64> padding64 = pooling_descriptor.padding();
+    absl::Span<const int64> shape64 = pooling_descriptor.window();
 
     const int nd = pooling_descriptor.ndims();
     std::vector<int> shape(nd);
@@ -862,11 +824,19 @@ cudnnDataType_t ToCudnnDataType(
     case dnn::DataType::kInt8:
       return data_layout == dnn::DataLayout::kBatchDepthYX4 ? CUDNN_DATA_INT8x4
                                                             : CUDNN_DATA_INT8;
+    case dnn::DataType::kInt32:
+      return CUDNN_DATA_INT32;
     default:
       LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
   }
 }
 
+template <typename T>
+cudnnDataType_t GetCudnnDataType(
+    dnn::DataLayout data_layout = dnn::DataLayout::kBatchDepthYX) {
+  return ToCudnnDataType(dnn::ToDataType<T>::value, data_layout);
+}
+
 cudnnRNNInputMode_t ToCudnnRnnInputMode(dnn::RnnInputMode input_mode) {
   switch (input_mode) {
     case dnn::RnnInputMode::kRnnLinearSkip:
@@ -1042,12 +1012,19 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
         /*mode=*/rnn_mode, /*algo=*/rnn_algo,
         /*dataType=*/compute_type));
 
+    port::StatusOr<PersistentRnnPlan> rnn_plan_wrapper;
     PersistentRnnPlan rnn_plan;
     if (rnn_algo == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) {
       CHECK_GE(batch_size, 0);
-      rnn_plan = CreatePersistentRnnPlan(rnn_desc.get(), batch_size, data_type);
-      RETURN_IF_CUDNN_ERROR(
-          cudnnSetPersistentRNNPlan(rnn_desc.get(), rnn_plan.get()));
+      rnn_plan_wrapper =
+          CreatePersistentRnnPlan(rnn_desc.get(), batch_size, data_type);
+      if (!rnn_plan_wrapper.ok()) {
+        return port::StatusOr<CudnnRnnDescriptor>(rnn_plan_wrapper.status());
+      } else {
+        rnn_plan = rnn_plan_wrapper.ConsumeValueOrDie();
+        RETURN_IF_CUDNN_ERROR(
+            cudnnSetPersistentRNNPlan(rnn_desc.get(), rnn_plan.get()));
+      }
     }
 
     // Create the params handle.
@@ -1064,10 +1041,9 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
     // in profile mode, which is run with algorithms returned from
     // GetRnnAlgorithms() (which are non-default and explicitly set whether to
     // use tensor ops).
-    if (RnnTensorOpMathEnabled() &&
-        !algorithm_config.algorithm().is_default()) {
+    if (RnnTensorOpMathEnabled() && algorithm_config.algorithm().has_value()) {
       cudnnMathType_t math_type =
-          algorithm_config.algorithm().tensor_ops_enabled()
+          algorithm_config.algorithm()->tensor_ops_enabled()
               ? CUDNN_TENSOR_OP_MATH
               : CUDNN_DEFAULT_MATH;
       CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
@@ -1505,7 +1481,7 @@ port::Status CudnnSupport::DoRnnForwardImpl(
     if (!timer->Stop(AsCUDAStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    auto algo_desc = rnn_desc.algorithm_config().algorithm();
+    auto algo_desc = *rnn_desc.algorithm_config().algorithm();
     output_profile_result->set_algorithm(algo_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
@@ -1608,7 +1584,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
     if (!timer->Stop(AsCUDAStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    auto algo_desc = rnn_desc.algorithm_config().algorithm();
+    auto algo_desc = *rnn_desc.algorithm_config().algorithm();
     output_profile_result->set_algorithm(algo_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
@@ -2008,12 +1984,13 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
     Stream* stream, const CudnnHandle& cudnn,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
     const CudnnConvolutionDescriptor& conv,
-    const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
+    const CudnnTensorDescriptor& output_nd,
+    const dnn::AlgorithmDesc& algorithm_desc,
     ScratchAllocator* scratch_allocator) {
   // TODO(csigg): This has side effects on the convolution descriptor. It is
   // functionally correct because the convolution is run with the algorithm of
   // the last call to this function, but should be fixed anyway.
-  conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2021,14 +1998,9 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
       cudnn.handle(),
       /*xDesc=*/input_nd.handle(),
       /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
-      /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(*algorithm_desc),
+      /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(algorithm_desc),
       /*sizeInBytes=*/&size_in_bytes));
 
-  if (TF_PREDICT_FALSE(!algorithm_desc)) {
-    return port::Status(port::error::INVALID_ARGUMENT,
-                        "No AlgorithmDesc provided");
-  }
-  algorithm_desc->set_scratch_size(size_in_bytes);
   int64 size_in_bytes_int64 = size_in_bytes;
 
   if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2055,12 +2027,13 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
     Stream* stream, const CudnnHandle& cudnn,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
     const CudnnConvolutionDescriptor& conv,
-    const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
+    const CudnnTensorDescriptor& output_nd,
+    const dnn::AlgorithmDesc& algorithm_desc,
     ScratchAllocator* scratch_allocator) {
   // TODO(csigg): This has side effects on the convolution descriptor. It is
   // functionally correct because the convolution is run with the algorithm of
   // the last call to this function, but should be fixed anyway.
-  conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2070,14 +2043,9 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
       /*dyDesc=*/output_nd.handle(),
       /*convDesc=*/conv.handle(),
       /*dxDesc=*/input_nd.handle(),
-      /*algo=*/ToConvBackwardDataAlgo(*algorithm_desc),
+      /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
       /*sizeInBytes=*/&size_in_bytes));
 
-  if (TF_PREDICT_FALSE(!algorithm_desc)) {
-    return port::Status(port::error::INVALID_ARGUMENT,
-                        "No AlgorithmDesc provided");
-  }
-  algorithm_desc->set_scratch_size(size_in_bytes);
   int64 size_in_bytes_int64 = size_in_bytes;
 
   if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2104,12 +2072,13 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
     Stream* stream, const CudnnHandle& cudnn,
     const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
     const CudnnConvolutionDescriptor& conv,
-    const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
+    const CudnnTensorDescriptor& output_nd,
+    const dnn::AlgorithmDesc& algorithm_desc,
     ScratchAllocator* scratch_allocator) {
   // TODO(csigg): This has side effects on the convolution descriptor. It is
   // functionally correct because the convolution is run with the algorithm of
   // the last call to this function, but should be fixed anyway.
-  conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
 
   // Query the size of the workspace and allocate it.
   size_t size_in_bytes;
@@ -2119,14 +2088,9 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
       /*dyDesc=*/output_nd.handle(),
       /*convDesc=*/conv.handle(),
       /*gradDesc=*/filter.handle(),
-      /*algo=*/ToConvBackwardFilterAlgo(*algorithm_desc),
+      /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
       /*sizeInBytes=*/&size_in_bytes));
 
-  if (TF_PREDICT_FALSE(!algorithm_desc)) {
-    return port::Status(port::error::INVALID_ARGUMENT,
-                        "No AlgorithmDesc provided");
-  }
-  algorithm_desc->set_scratch_size(size_in_bytes);
   int64 size_in_bytes_int64 = size_in_bytes;
 
   if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2155,8 +2119,8 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
     const CudnnConvolutionDescriptor& conv,
     const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
     DeviceMemory<uint8>* scratch) {
-  dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
-  if (algorithm_config.algorithm().is_default()) {
+  absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
+  if (!algo_desc.has_value()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
     // heuristics.
     bool specify_workspace_limit = scratch_allocator != nullptr;
@@ -2168,33 +2132,33 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
                         GetCudnnConvolutionForwardAlgo(
                             cudnn, input_nd, filter, conv, output_nd,
                             specify_workspace_limit, memory_limit_bytes));
-    algo_desc = dnn::AlgorithmDesc(
-        algo, algorithm_config.algorithm().tensor_ops_enabled());
+    algo_desc = dnn::AlgorithmDesc(algo, /*use_tensor_ops=*/true);
   }
 
   auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
-      stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
+      stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc,
       scratch_allocator);
 
   if (scratch_or.ok()) {
     *scratch = scratch_or.ValueOrDie();
-    return algo_desc;
+    return *algo_desc;
   }
 
+  algo_desc = algorithm_config.algorithm_no_scratch();
+
   // Failed to allocate workspace for the first algorithm, fall back to the
   // no_scratch algorithm.
-  if (algorithm_config.algorithm_no_scratch().is_default()) {
+  if (!algo_desc.has_value()) {
     return port::Status(
         port::error::INVALID_ARGUMENT,
         "The primary convolution algorithm failed memory allocation, "
         "while a secondary algorithm is not provided.");
   }
 
-  algo_desc = algorithm_config.algorithm_no_scratch();
   SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace(
                                     stream, cudnn, input_nd, filter, conv,
-                                    output_nd, &algo_desc, scratch_allocator));
-  return algo_desc;
+                                    output_nd, *algo_desc, scratch_allocator));
+  return *algo_desc;
 }
 
 port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
@@ -2204,8 +2168,8 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
     const CudnnConvolutionDescriptor& conv,
     const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
     DeviceMemory<uint8>* scratch) {
-  dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
-  if (algorithm_config.algorithm().is_default()) {
+  absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
+  if (!algo_desc.has_value()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
     // heuristics.
     bool specify_workspace_limit = scratch_allocator != nullptr;
@@ -2217,33 +2181,33 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
                         GetCudnnConvolutionBackwardDataAlgo(
                             cudnn, input_nd, filter, conv, output_nd,
                             specify_workspace_limit, memory_limit_bytes));
-    algo_desc = dnn::AlgorithmDesc(
-        algo, algorithm_config.algorithm().tensor_ops_enabled());
+    algo_desc = dnn::AlgorithmDesc(algo, /*use_tensor_ops=*/true);
   }
 
   auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
-      stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
+      stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc,
       scratch_allocator);
 
   if (scratch_or.ok()) {
     *scratch = scratch_or.ValueOrDie();
-    return algo_desc;
+    return *algo_desc;
   }
 
+  algo_desc = algorithm_config.algorithm_no_scratch();
+
   // Failed to allocate workspace for the first algorithm, fall back to the
   // no_scratch algorithm.
-  if (algorithm_config.algorithm_no_scratch().is_default()) {
+  if (!algo_desc.has_value()) {
     return port::Status(
         port::error::INVALID_ARGUMENT,
         "The primary convolution algorithm failed memory allocation, "
         "while a secondary algorithm is not provided.");
   }
 
-  algo_desc = algorithm_config.algorithm_no_scratch();
   SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
                                     stream, cudnn, input_nd, filter, conv,
-                                    output_nd, &algo_desc, scratch_allocator));
-  return algo_desc;
+                                    output_nd, *algo_desc, scratch_allocator));
+  return *algo_desc;
 }
 
 port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
@@ -2253,8 +2217,8 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
     const CudnnConvolutionDescriptor& conv,
     const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
     DeviceMemory<uint8>* scratch) {
-  dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
-  if (algorithm_config.algorithm().is_default()) {
+  absl::optional<dnn::AlgorithmDesc> algo_desc = algorithm_config.algorithm();
+  if (!algo_desc.has_value()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
     // heuristics.
     bool specify_workspace_limit = scratch_allocator != nullptr;
@@ -2266,33 +2230,33 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
                         GetCudnnConvolutionBackwardFilterAlgo(
                             cudnn, input_nd, filter, conv, output_nd,
                             specify_workspace_limit, memory_limit_bytes));
-    algo_desc = dnn::AlgorithmDesc(
-        algo, algorithm_config.algorithm().tensor_ops_enabled());
+    algo_desc = dnn::AlgorithmDesc(algo, /*use_tensor_ops=*/true);
   }
 
   auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(
-      stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
+      stream, cudnn, input_nd, filter, conv, output_nd, *algo_desc,
       scratch_allocator);
 
   if (scratch_or.ok()) {
     *scratch = scratch_or.ValueOrDie();
-    return algo_desc;
+    return *algo_desc;
   }
 
+  algo_desc = algorithm_config.algorithm_no_scratch();
+
   // Failed to allocate workspace for the first algorithm, fall back to the
   // no_scratch algorithm.
-  if (algorithm_config.algorithm_no_scratch().is_default()) {
+  if (!algo_desc.has_value()) {
     return port::Status(
         port::error::INVALID_ARGUMENT,
         "The primary convolution algorithm failed memory allocation, "
         "while a secondary algorithm is not provided.");
   }
 
-  algo_desc = algorithm_config.algorithm_no_scratch();
   SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace(
                                     stream, cudnn, input_nd, filter, conv,
-                                    output_nd, &algo_desc, scratch_allocator));
-  return algo_desc;
+                                    output_nd, *algo_desc, scratch_allocator));
+  return *algo_desc;
 }
 
 // A helper class to set env-vars and choose options for cudnn-related
@@ -2309,7 +2273,7 @@ class CudnnEnvVar {
   static bool IsEnabledImpl() {
     const char* tf_env_var_val = getenv(EnvVar::kName);
     if (tf_env_var_val != nullptr) {
-      port::StringPiece tf_env_var_val_str(tf_env_var_val);
+      absl::string_view tf_env_var_val_str(tf_env_var_val);
       if (tf_env_var_val_str == "0") {
         return false;
       }
@@ -2352,27 +2316,6 @@ struct ConvDoFP32ComputationFP16Input {
   static constexpr bool kDefaultFlag = true;
 };
 
-// A group of helper functions to return the internal compute type for
-// convolutions in cudnn.
-template <typename T>
-cudnnDataType_t GetConvComputeType() {
-  return CUDNN_DATA_FLOAT;
-}
-
-template <>
-cudnnDataType_t GetConvComputeType<Eigen::half>() {
-  if (CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()) {
-    return CUDNN_DATA_FLOAT;
-  } else {
-    return CUDNN_DATA_HALF;
-  }
-}
-
-template <>
-cudnnDataType_t GetConvComputeType<double>() {
-  return CUDNN_DATA_DOUBLE;
-}
-
 // A helper struct to decide whether to use FP32 as the internal compute type
 // for rnn when the input data type is FP16. At present it is turned off,
 // users can explicitly control them through an env-var
@@ -2444,7 +2387,7 @@ port::Status CudnnSupport::DoConvolveImpl(
     const DeviceMemory<T>& filter_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
-    ScratchAllocator* scratch_allocator,
+    dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
@@ -2452,7 +2395,7 @@ port::Status CudnnSupport::DoConvolveImpl(
   CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type);
   CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetConvComputeType<T>());
+                                  ToCudnnDataType(accumulator_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   // Alpha is the scaling factor for input.
@@ -2537,13 +2480,13 @@ port::Status CudnnSupport::DoConvolveImpl(
     output_profile_result->set_algorithm(algo_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
+    output_profile_result->set_scratch_size(scratch.size());
   }
 
   return port::Status::OK();
 }
 
-template <typename AccumulatorType, typename ElementType, typename BiasType,
-          typename ScaleType>
+template <typename ElementType, typename BiasType, typename ScaleType>
 port::Status CudnnSupport::DoFusedConvolveImpl(
     Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
     const DeviceMemory<ElementType>& conv_input_data,
@@ -2554,7 +2497,8 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
     ScaleType side_input_scale, const dnn::BatchDescriptor& bias_descriptor,
     const DeviceMemory<BiasType>& biases, dnn::ActivationMode activation_mode,
     const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<ElementType>* output_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<ElementType>* output_data, dnn::DataType accumulator_type,
+    ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   if (activation_mode != dnn::ActivationMode::kRelu &&
@@ -2575,7 +2519,7 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
       GetCudnnDataType<ElementType>(conv_input_descriptor.layout()));
   CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType<BiasType>());
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetCudnnDataType<AccumulatorType>());
+                                  ToCudnnDataType(accumulator_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
@@ -2653,6 +2597,7 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
     output_profile_result->set_algorithm(algo_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
+    output_profile_result->set_scratch_size(scratch.size());
   }
 
   return port::Status::OK();
@@ -2943,10 +2888,10 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveImpl<float>(
-          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output_data,
-          scratch_allocator, algorithm_config, output_profile_result),
+      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
+                     filter_data, convolution_descriptor, output_descriptor,
+                     output_data, dnn::DataType::kFloat, scratch_allocator,
+                     algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2961,10 +2906,10 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveImpl<double>(
-          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output_data,
-          scratch_allocator, algorithm_config, output_profile_result),
+      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
+                     filter_data, convolution_descriptor, output_descriptor,
+                     output_data, dnn::DataType::kDouble, scratch_allocator,
+                     algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2978,11 +2923,15 @@ bool CudnnSupport::DoConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveImpl<Eigen::half>(
-          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output_data,
-          scratch_allocator, algorithm_config, output_profile_result),
+      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
+                     filter_data, convolution_descriptor, output_descriptor,
+                     output_data, acc_type, scratch_allocator, algorithm_config,
+                     output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3000,12 +2949,13 @@ bool CudnnSupport::DoFusedConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoFusedConvolveImpl<double>(
-          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
+                          conv_input_scale, filter_descriptor, filter_data,
+                          convolution_descriptor, side_input_data,
+                          side_input_scale, bias_descriptor, biases,
+                          activation_mode, output_descriptor, output_data,
+                          dnn::DataType::kDouble, scratch_allocator,
+                          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3023,12 +2973,13 @@ bool CudnnSupport::DoFusedConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoFusedConvolveImpl<float>(
-          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
+                          conv_input_scale, filter_descriptor, filter_data,
+                          convolution_descriptor, side_input_data,
+                          side_input_scale, bias_descriptor, biases,
+                          activation_mode, output_descriptor, output_data,
+                          dnn::DataType::kFloat, scratch_allocator,
+                          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3046,13 +2997,17 @@ bool CudnnSupport::DoFusedConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoFusedConvolveImpl<float>(
+      DoFusedConvolveImpl(
           stream, conv_input_descriptor, conv_input_data, conv_input_scale,
           filter_descriptor, filter_data, convolution_descriptor,
           side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+          activation_mode, output_descriptor, output_data, acc_type,
+          scratch_allocator, algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3078,12 +3033,13 @@ bool CudnnSupport::DoFusedConvolve(
     return false;
   }
   return IsStatusOk(
-      DoFusedConvolveImpl<int32>(
-          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
+                          conv_input_scale, filter_descriptor, filter_data,
+                          convolution_descriptor, side_input_data,
+                          side_input_scale, bias_descriptor, biases,
+                          activation_mode, output_descriptor, output_data,
+                          dnn::DataType::kInt32, scratch_allocator,
+                          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3117,7 +3073,8 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
     DeviceMemory<T> backward_output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<T>* backward_input_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
+    ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
@@ -3138,7 +3095,7 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
   CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
   CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetConvComputeType<T>());
+                                  ToCudnnDataType(accumulator_type));
 
   const bool is_profiling = output_profile_result != nullptr;
 
@@ -3169,10 +3126,8 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
   // Cudnn 7.1.4 has a bug if the workspace of the following convolution is not
   // zero-initialized, nvbugs/2254619.
   if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 &&
-      algorithm_config.algorithm().algo_id() ==
-          CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
-      cudnn_type == CUDNN_DATA_HALF &&
-      algorithm_config.algorithm().tensor_ops_enabled() &&
+      algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
+      cudnn_type == CUDNN_DATA_HALF && algo_desc.tensor_ops_enabled() &&
       input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
       filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX &&
       output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX &&
@@ -3202,6 +3157,7 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
     output_profile_result->set_algorithm(algo_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
+    output_profile_result->set_scratch_size(scratch.size());
   }
 
   return port::Status::OK();
@@ -3219,11 +3175,11 @@ bool CudnnSupport::DoConvolveBackwardData(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                 output_descriptor, backward_output_data,
-                                 convolution_descriptor, input_descriptor,
-                                 backward_input_data, scratch_allocator,
-                                 algorithm_config, output_profile_result),
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, dnn::DataType::kDouble, scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3239,11 +3195,11 @@ bool CudnnSupport::DoConvolveBackwardData(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                 output_descriptor, backward_output_data,
-                                 convolution_descriptor, input_descriptor,
-                                 backward_input_data, scratch_allocator,
-                                 algorithm_config, output_profile_result),
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, dnn::DataType::kFloat, scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3258,12 +3214,16 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                 output_descriptor, backward_output_data,
-                                 convolution_descriptor, input_descriptor,
-                                 backward_input_data, scratch_allocator,
-                                 algorithm_config, output_profile_result),
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, acc_type, scratch_allocator, algorithm_config,
+          output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3275,7 +3235,8 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
     DeviceMemory<T> backward_output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<T>* backward_filter_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
+    ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
@@ -3296,7 +3257,7 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
   CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
   CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetConvComputeType<T>());
+                                  ToCudnnDataType(accumulator_type));
 
   const bool is_profiling = output_profile_result != nullptr;
 
@@ -3363,8 +3324,7 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
   //
   // See nvbugs/2379553.
   if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 &&
-      algorithm_config.algorithm().algo_id() ==
-          CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
+      algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
       cudnn_type == CUDNN_DATA_HALF &&
       input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
       filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput &&
@@ -3395,6 +3355,7 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
     output_profile_result->set_algorithm(algo_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
+    output_profile_result->set_scratch_size(scratch.size());
   }
 
   return port::Status::OK();
@@ -3412,11 +3373,12 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                   output_descriptor, backward_output_data,
-                                   convolution_descriptor, filter_descriptor,
-                                   backward_filter_data, scratch_allocator,
-                                   algorithm_config, output_profile_result),
+      DoConvolveBackwardFilterImpl(
+          stream, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, dnn::DataType::kDouble,
+
+          scratch_allocator, algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3431,13 +3393,14 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                   output_descriptor, backward_output_data,
-                                   convolution_descriptor, filter_descriptor,
-                                   backward_filter_data, scratch_allocator,
-                                   algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
+  return IsStatusOk(DoConvolveBackwardFilterImpl(
+                        stream, input_descriptor, input_data, output_descriptor,
+                        backward_output_data, convolution_descriptor,
+                        filter_descriptor, backward_filter_data,
+
+                        dnn::DataType::kFloat, scratch_allocator,
+                        algorithm_config, output_profile_result),
+                    /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
@@ -3451,12 +3414,16 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                   output_descriptor, backward_output_data,
-                                   convolution_descriptor, filter_descriptor,
-                                   backward_filter_data, scratch_allocator,
-                                   algorithm_config, output_profile_result),
+      DoConvolveBackwardFilterImpl(
+          stream, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, acc_type, scratch_allocator, algorithm_config,
+          output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 74f6f935b84cfbea27e1e9165b5f7241f74a9cbb..0641be140d2f19651696b0bcac498870a4db2960 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -670,12 +670,12 @@ class CudnnSupport : public dnn::DnnSupport {
       const DeviceMemory<T>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T>* output_data, ScratchAllocator* scratch_allocator,
+      DeviceMemory<T>* output_data, dnn::DataType accumulator_type,
+      ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
 
-  template <typename AccumulatorType, typename ElementType, typename BiasType,
-            typename ScaleType>
+  template <typename ElementType, typename BiasType, typename ScaleType>
   port::Status DoFusedConvolveImpl(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
       const DeviceMemory<ElementType>& conv_input_data,
@@ -687,7 +687,7 @@ class CudnnSupport : public dnn::DnnSupport {
       ScaleType side_input_scale, const dnn::BatchDescriptor& bias_descriptor,
       const DeviceMemory<BiasType>& biases, dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<ElementType>* output_data,
+      DeviceMemory<ElementType>* output_data, dnn::DataType accumulator_type,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
@@ -700,7 +700,8 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T> backward_output_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<T>* backward_input_data, ScratchAllocator* scratch_allocator,
+      DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
+      ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
 
@@ -712,7 +713,7 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T> backward_output_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<T>* backward_filter_data,
+      DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index f982f34b98eca60dbf50dbf7c970b079283d0b42..b34d1f722eaf60b21f2289a4b87b5653bfd43bb9 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -21,17 +21,17 @@ limitations under the License.
 #include <set>
 #include <utility>
 
+#include "absl/base/casts.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
-#include "tensorflow/stream_executor/lib/casts.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/human_readable.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
 #include "tensorflow/stream_executor/lib/notification.h"
 #include "tensorflow/stream_executor/lib/ptr_util.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/lib/static_threadlocal.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/platform/logging.h"
@@ -109,13 +109,13 @@ class CreatedContexts {
 string ToString(CUresult result) {
   const char *error_name;
   if (cuGetErrorName(result, &error_name)) {
-    return port::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
+    return absl::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
   }
   const char *error_string;
   if (cuGetErrorString(result, &error_string)) {
     return error_name;
   }
-  return port::StrCat(error_name, ": ", error_string);
+  return absl::StrCat(error_name, ": ", error_string);
 }
 
 // Returns the current context and checks that it is in the set of CUDA contexts
@@ -241,7 +241,7 @@ namespace {
 string CUDAPointerToDeviceString(CUdeviceptr pointer) {
   auto value = CUDADriver::GetPointerDevice(pointer);
   if (value.ok()) {
-    return port::StrCat(value.ValueOrDie());
+    return absl::StrCat(value.ValueOrDie());
   }
   LOG(ERROR) << "could not query device: " << value.status();
   return "?";
@@ -300,7 +300,7 @@ static port::Status InternalInit() {
   LOG(ERROR) << "failed call to cuInit: " << ToString(res);
   Diagnostician::LogDiagnosticInformation();
   return port::Status(port::error::ABORTED,
-                      port::StrCat("failed call to cuInit: ", ToString(res)));
+                      absl::StrCat("failed call to cuInit: ", ToString(res)));
 }
 
 }  // namespace
@@ -330,13 +330,13 @@ static port::Status InternalInit() {
 
   return port::Status(
       port::error::INTERNAL,
-      port::StrCat("failed call to cuDeviceGet: ", ToString(res)));
+      absl::StrCat("failed call to cuDeviceGet: ", ToString(res)));
 }
 
 /* static */ bool CUDADriver::GetDeviceName(CUdevice device,
                                             string *device_name) {
   static const size_t kCharLimit = 64;
-  port::InlinedVector<char, 4> chars(kCharLimit);
+  absl::InlinedVector<char, 4> chars(kCharLimit);
   CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get device name for " << device << ": "
@@ -439,9 +439,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   if (res == CUDA_ERROR_OUT_OF_MEMORY) {
     uint64 total_memory;
     if (GetDeviceTotalMemory(device, &total_memory)) {
-      port::StrAppend(&message, "; total memory reported: ", total_memory);
+      absl::StrAppend(&message, "; total memory reported: ", total_memory);
     } else {
-      port::StrAppend(&message, "; could not query total memory");
+      absl::StrAppend(&message, "; could not query total memory");
     }
   }
 
@@ -504,7 +504,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                << ", result: " << ToString(result);
     return port::Status(
         port::error::INTERNAL,
-        port::StrCat("failed to get shared memory config: ", ToString(result)));
+        absl::StrCat("failed to get shared memory config: ", ToString(result)));
   }
   return shared_mem_config;
 }
@@ -522,7 +522,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                << ", result: " << ToString(result);
     return port::Status(
         port::error::INTERNAL,
-        port::StrCat("failed to set shared memory config: ", ToString(result)));
+        absl::StrCat("failed to set shared memory config: ", ToString(result)));
   }
   return port::Status::OK();
 }
@@ -575,8 +575,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     static const unsigned int kLogBufferBytesLimit = 1024;
     unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
     unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
-    port::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
-    port::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
+    absl::InlinedVector<char, 4> error_log_buffer(error_log_buffer_bytes);
+    absl::InlinedVector<char, 4> info_log_buffer(info_log_buffer_bytes);
     bool log_verbose = true;
     CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
                               CU_JIT_ERROR_LOG_BUFFER,
@@ -585,11 +585,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     // Note that the driver API wants the contents of this values to be stored
     // in an array of void*s, so we coerce them accordingly.
     void *option_values[] = {
-        port::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)),
-        port::bit_cast<void *>(error_log_buffer.data()),
-        port::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
-        port::bit_cast<void *>(info_log_buffer.data()),
-        port::bit_cast<void *>(uintptr_t(log_verbose))};
+        absl::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)),
+        absl::bit_cast<void *>(error_log_buffer.data()),
+        absl::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
+        absl::bit_cast<void *>(info_log_buffer.data()),
+        absl::bit_cast<void *>(uintptr_t(log_verbose))};
     CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
 
     CUresult res;
@@ -758,7 +758,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 
   return port::Status(
       port::error::INTERNAL,
-      port::StrCat("failed to get device for context: ", ToString(result)));
+      absl::StrCat("failed to get device for context: ", ToString(result)));
 }
 
 /* static */ bool CUDADriver::CreateStream(CudaContext *context,
@@ -817,7 +817,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 /* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
                                                void *location) {
   ScopedActivateContext activation(context);
-  CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
+  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
   CUresult res = cuMemFree(pointer);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to free device memory at " << location
@@ -847,7 +847,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 /* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
                                                       void *location) {
   ScopedActivateContext activation(context);
-  CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
+  CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
   CUresult res = cuMemFree(pointer);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to free unified memory at " << location
@@ -1023,7 +1023,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   CUresult res = cuStreamSynchronize(stream);
   if (res != CUDA_SUCCESS) {
     port::Status status = port::InternalError(
-        port::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
+        absl::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
     LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
     return status;
   }
@@ -1058,7 +1058,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
         port::Printf("failed to synchronous memcpy from device to host: %s; "
                      "host dst: %p; GPU src: %p; size: %llu=0x%llx",
                      ToString(res).c_str(), host_dst,
-                     port::bit_cast<void *>(gpu_src), size, size));
+                     absl::bit_cast<void *>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
           << host_dst;
@@ -1075,7 +1075,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return port::InternalError(port::Printf(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
         " host src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size,
+        ToString(res).c_str(), absl::bit_cast<void *>(gpu_dst), host_src, size,
         size));
   }
   VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
@@ -1092,8 +1092,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return port::InternalError(port::Printf(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
         "GPU src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst),
-        port::bit_cast<void *>(gpu_src), size, size));
+        ToString(res).c_str(), absl::bit_cast<void *>(gpu_dst),
+        absl::bit_cast<void *>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
   return port::Status::OK();
@@ -1110,12 +1110,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
         "GPU src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), host_dst, port::bit_cast<void *>(gpu_src), size, size);
+        ToString(res).c_str(), host_dst, absl::bit_cast<void *>(gpu_src), size,
+        size);
     return false;
   }
   VLOG(2) << "successfully enqueued async memcpy d2h of " << size
-          << " bytes from " << port::bit_cast<void *>(gpu_src) << " to " << host_dst
-          << " on stream " << stream;
+          << " bytes from " << absl::bit_cast<void *>(gpu_src) << " to "
+          << host_dst << " on stream " << stream;
   return true;
 }
 
@@ -1130,7 +1131,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
         "host src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), port::bit_cast<void *>(gpu_dst), host_src, size, size);
+        ToString(res).c_str(), absl::bit_cast<void *>(gpu_dst), host_src, size,
+        size);
     return false;
   }
   VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
@@ -1151,9 +1153,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
         "; GPU dst: %p on %s %s"
         "; GPU src: %p on %s %s"
         "; can access? %s; size: %llu=0x%llx",
-        ToString(result).c_str(), port::bit_cast<void *>(gpu_dst),
+        ToString(result).c_str(), absl::bit_cast<void *>(gpu_dst),
         CUDAPointerToMemorySpaceString(gpu_dst).c_str(),
-        CUDAPointerToDeviceString(gpu_dst).c_str(), port::bit_cast<void *>(gpu_src),
+        CUDAPointerToDeviceString(gpu_dst).c_str(),
+        absl::bit_cast<void *>(gpu_src),
         CUDAPointerToMemorySpaceString(gpu_src).c_str(),
         CUDAPointerToDeviceString(gpu_src).c_str(),
         CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
@@ -1190,7 +1193,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   } else {
     return port::Status(
         port::error::FAILED_PRECONDITION,
-        port::StrCat("could not create CUDA event: ", ToString(res)));
+        absl::StrCat("could not create CUDA event: ", ToString(res)));
   }
 }
 
@@ -1220,7 +1223,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
 
   return port::Status(
       port::error::INTERNAL,
-      port::StrCat("failed to query device pointer for context: ",
+      absl::StrCat("failed to query device pointer for context: ",
                    ToString(result)));
 }
 
@@ -1238,13 +1241,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       default:
         return port::Status(
             port::error::INTERNAL,
-            port::StrCat("unknown memory space provided by CUDA API: ", value));
+            absl::StrCat("unknown memory space provided by CUDA API: ", value));
     }
   }
 
   return port::Status(
       port::error::INTERNAL,
-      port::StrCat("failed to query device pointer for memory space: ",
+      absl::StrCat("failed to query device pointer for memory space: ",
                    ToString(result)));
 }
 
@@ -1306,7 +1309,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   if (result != CUDA_SUCCESS) {
     return port::Status(
         port::error::NOT_FOUND,
-        port::StrCat("could not retrieve CUDA device attribute (", attribute,
+        absl::StrCat("could not retrieve CUDA device attribute (", attribute,
                      "): ", ToString(result)));
   }
   T converted = value;
@@ -1463,7 +1466,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
 /* static */ string CUDADriver::GetPCIBusID(CUdevice device) {
   string pci_bus_id;
   static const int kBufferSize = 64;
-  port::InlinedVector<char, 4> chars(kBufferSize);
+  absl::InlinedVector<char, 4> chars(kBufferSize);
   chars[kBufferSize - 1] = '\0';
   CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
   if (res != CUDA_SUCCESS) {
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index 013ca2d7f6d7f9d58837d9aa2ebbefc5906c5623..acac7d6368885537b1f5727779388d550680e90d 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -23,6 +23,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
+
+#ifndef PLATFORM_GOOGLE
+#include "tensorflow/stream_executor/dso_loader.h"
+#endif
+
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
@@ -38,6 +43,7 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
 
 namespace wrap {
 
+#ifdef PLATFORM_GOOGLE
 // This macro wraps a global identifier, given by __name, in a callable
 // structure that loads the DLL symbol out of the DSO handle in a thread-safe
 // manner on first use. This dynamic loading technique is used to avoid DSO
@@ -52,22 +58,69 @@ namespace wrap {
     }                                                            \
   } __name;
 
-#define CUFFT_ROUTINE_EACH(__macro)                                            \
-  __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d)           \
-      __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany)         \
-          __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C)    \
-              __macro(cufftExecC2R) __macro(cufftExecZ2Z)                      \
-                  __macro(cufftExecR2C) __macro(cufftCreate)                   \
-                      __macro(cufftSetAutoAllocation)                          \
-                          __macro(cufftSetWorkArea) __macro(cufftGetSize1d)    \
-                              __macro(cufftMakePlan1d) __macro(cufftGetSize2d) \
-                                  __macro(cufftMakePlan2d)                     \
-                                      __macro(cufftGetSize3d)                  \
-                                          __macro(cufftMakePlan3d)             \
-                                              __macro(cufftGetSizeMany)        \
-                                                  __macro(cufftMakePlanMany)
+#else
+
+#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                                \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCufftDsoHandle();            \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in cufft DSO; dlerror: " << s.error_message();   \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    cufftResult operator()(CUDAExecutor *parent, Args... args) {          \
+      cuda::ScopedActivateExecutorContext sac{parent};                    \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+// clang-format off
+
+#define CUFFT_ROUTINE_EACH(__macro)                                     \
+  __macro(cufftDestroy)                                                 \
+  __macro(cufftSetStream)                                               \
+  __macro(cufftPlan1d)                                                  \
+  __macro(cufftPlan2d)                                                  \
+  __macro(cufftPlan3d)                                                  \
+  __macro(cufftPlanMany)                                                \
+  __macro(cufftExecD2Z)                                                 \
+  __macro(cufftExecZ2D)                                                 \
+  __macro(cufftExecC2C)                                                 \
+  __macro(cufftExecC2R)                                                 \
+  __macro(cufftExecZ2Z)                                                 \
+  __macro(cufftExecR2C)                                                 \
+  __macro(cufftCreate)                                                  \
+  __macro(cufftSetAutoAllocation)                                       \
+  __macro(cufftSetWorkArea)                                             \
+  __macro(cufftGetSize1d)                                               \
+  __macro(cufftMakePlan1d)                                              \
+  __macro(cufftGetSize2d)                                               \
+  __macro(cufftMakePlan2d)                                              \
+  __macro(cufftGetSize3d)                                               \
+  __macro(cufftMakePlan3d)                                              \
+  __macro(cufftGetSizeMany)                                             \
+  __macro(cufftMakePlanMany)
+
+// clang-format on
 
 CUFFT_ROUTINE_EACH(STREAM_EXECUTOR_CUFFT_WRAP)
+#undef CUFFT_ROUTINE_EACH
 
 }  // namespace wrap
 
@@ -332,6 +385,7 @@ std::unique_ptr<fft::Plan> CUDAFft::Create1dPlan(Stream *stream, uint64 num_x,
   // TODO(yangzihao): In the future, send error msg back to TensorFlow
   // so it can fail gracefully,
   if (!status.ok()) {
+    LOG(ERROR) << "Plan Parameters: num_x: " << num_x;
     LOG(FATAL) << "failed to initialize cufft 1d plan: "
                << status.error_message();
   }
@@ -346,6 +400,7 @@ std::unique_ptr<fft::Plan> CUDAFft::Create1dPlanWithScratchAllocator(
   port::Status status = fft_plan_ptr->Initialize(parent_, stream, 1, elem_count,
                                                  type, scratch_allocator);
   if (!status.ok()) {
+    LOG(ERROR) << "Plan Parameters: num_x: " << num_x;
     LOG(FATAL)
         << "failed to initialize cufft 1d plan with customized allocator: "
         << status.error_message();
@@ -361,6 +416,7 @@ std::unique_ptr<fft::Plan> CUDAFft::Create2dPlan(Stream *stream, uint64 num_x,
   port::Status status = fft_plan_ptr->Initialize(
       parent_, stream, 1, elem_count, type, /*scratch_allocator=*/nullptr);
   if (!status.ok()) {
+    LOG(ERROR) << "Plan Parameters: num_x: " << num_x << " num_y: " << num_y;
     LOG(FATAL) << "failed to initialize cufft 2d plan: "
                << status.error_message();
   }
@@ -375,6 +431,7 @@ std::unique_ptr<fft::Plan> CUDAFft::Create2dPlanWithScratchAllocator(
   port::Status status = fft_plan_ptr->Initialize(parent_, stream, 2, elem_count,
                                                  type, scratch_allocator);
   if (!status.ok()) {
+    LOG(ERROR) << "Plan Parameters: num_x: " << num_x << " num_y: " << num_y;
     LOG(FATAL)
         << "failed to initialize cufft 2d plan with customized allocator: "
         << status.error_message();
@@ -391,6 +448,8 @@ std::unique_ptr<fft::Plan> CUDAFft::Create3dPlan(Stream *stream, uint64 num_x,
   port::Status status = fft_plan_ptr->Initialize(
       parent_, stream, 3, elem_count, type, /*scratch_allocator=*/nullptr);
   if (!status.ok()) {
+    LOG(ERROR) << "Plan Parameters: num_x: " << num_x << " num_y: " << num_y
+               << " num_z: " << num_z;
     LOG(FATAL) << "failed to initialize cufft 3d plan: "
                << status.error_message();
   }
@@ -405,6 +464,8 @@ std::unique_ptr<fft::Plan> CUDAFft::Create3dPlanWithScratchAllocator(
   port::Status status = fft_plan_ptr->Initialize(parent_, stream, 3, elem_count,
                                                  type, scratch_allocator);
   if (!status.ok()) {
+    LOG(ERROR) << "Plan Parameters: num_x: " << num_x << " num_y: " << num_y
+               << " num_z: " << num_z;
     LOG(FATAL)
         << "failed to initialize cufft 3d plan with customized allocator: "
         << status.error_message();
@@ -423,6 +484,15 @@ std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlan(
       input_distance, output_embed, output_stride, output_distance, type,
       batch_count, /*scratch_allocator=*/nullptr);
   if (!status.ok()) {
+    LOG(ERROR) << "Initialize Params: rank: " << rank
+               << " elem_count: " << *elem_count
+               << " input_embed: " << *input_embed
+               << " input_stride: " << input_stride
+               << " input_distance: " << input_distance
+               << " output_embed: " << *output_embed
+               << " output_stride: " << output_stride
+               << " output_distance: " << output_distance
+               << " batch_count: " << batch_count;
     LOG(FATAL) << "failed to initialize batched cufft plan: "
                << status.error_message();
   }
@@ -441,6 +511,15 @@ std::unique_ptr<fft::Plan> CUDAFft::CreateBatchedPlanWithScratchAllocator(
       input_distance, output_embed, output_stride, output_distance, type,
       batch_count, scratch_allocator);
   if (!status.ok()) {
+    LOG(ERROR) << "Initialize Params: rank: " << rank
+               << " elem_count: " << *elem_count
+               << " input_embed: " << *input_embed
+               << " input_stride: " << input_stride
+               << " input_distance: " << input_distance
+               << " output_embed: " << *output_embed
+               << " output_stride: " << output_stride
+               << " output_distance: " << output_distance
+               << " batch_count: " << batch_count;
     LOG(FATAL)
         << "failed to initialize batched cufft plan with customized allocator: "
         << status.error_message();
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 5cceb8983cb971e88e960b18c4babe1d6fe90c42..4874d096ad54fa352fd6e9ad3b7b87c1fff59f73 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -24,6 +24,8 @@ limitations under the License.
 #else
 #include <unistd.h>
 #endif
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/cuda/cuda_event.h"
@@ -31,17 +33,16 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
 #include "tensorflow/stream_executor/kernel_cache_config.h"
-#include "tensorflow/stream_executor/lib/casts.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/mathutil.h"
+#include "tensorflow/stream_executor/lib/numbers.h"
 #include "tensorflow/stream_executor/lib/path.h"
 #include "tensorflow/stream_executor/lib/process_state.h"
 #include "tensorflow/stream_executor/lib/ptr_util.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/logging.h"
@@ -51,7 +52,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/timer.h"
-#include "tensorflow/stream_executor/lib/numbers.h"
 
 #ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
 #error \
@@ -147,14 +147,14 @@ port::Status CUDAExecutor::Init(int device_ordinal,
 }
 
 bool CUDAExecutor::FindOnDiskForComputeCapability(
-    port::StringPiece filename, port::StringPiece canonical_suffix,
+    absl::string_view filename, absl::string_view canonical_suffix,
     string *found_filename) const {
   if (cc_major_ == 0 && cc_minor_ == 0) {
     return false;
   }
 
   string cc_specific =
-      port::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
+      absl::StrCat(filename, ".cc", cc_major_, cc_minor_, canonical_suffix);
   if (port::FileExists(cc_specific).ok()) {
     VLOG(2) << "found compute-capability-specific file, using that: "
             << cc_specific;
@@ -662,8 +662,13 @@ bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream,
 }
 
 bool CUDAExecutor::HostCallback(Stream *stream,
-                                std::function<void()> callback) {
-  auto callback_ptr = new std::function<void()>(callback);
+                                std::function<port::Status()> callback) {
+  auto callback_ptr = new std::function<void()>([callback]() {
+    port::Status s = callback();
+    if (!s.ok()) {
+      LOG(WARNING) << "Host callback failed: " << s;
+    }
+  });
   return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream),
                                        InternalHostCallback, callback_ptr);
 }
@@ -1085,7 +1090,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
   }
 
   builder.set_platform_version(
-      port::StrCat("Compute Capability ", cc_major_, ".", cc_minor_));
+      absl::StrCat("Compute Capability ", cc_major_, ".", cc_minor_));
 
   // TODO(leary) should be a way to query this from the driver, but this is
   // unlikely to change for us any time soon.
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 53b2a29ae7554cdc97efa853c9fd62e725d7e39d..ae8e4abf92024626bf3d2bd3d334244708f55737 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <set>
 #include <unordered_map>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/cuda/cuda_kernel.h"
 #include "tensorflow/stream_executor/event.h"
 #include "tensorflow/stream_executor/lib/status.h"
@@ -147,7 +148,8 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
                             const DeviceMemoryBase &gpu_src,
                             uint64 size) override;
 
-  bool HostCallback(Stream *stream, std::function<void()> callback) override;
+  bool HostCallback(Stream *stream,
+                    std::function<port::Status()> callback) override;
 
   bool AllocateStream(Stream *stream) override;
 
@@ -234,8 +236,8 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
   // filename by looking for compute-capability-specific suffixed versions; i.e.
   // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
   // we're on a compute capability 3.0 machine.
-  bool FindOnDiskForComputeCapability(port::StringPiece filename,
-                                      port::StringPiece canonical_suffix,
+  bool FindOnDiskForComputeCapability(absl::string_view filename,
+                                      absl::string_view canonical_suffix,
                                       string *found_filename) const;
 
   // Host callback landing routine invoked by CUDA.
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h
index beaebe8f1233533053c97bbac7eb283deaf96a2c..ec1dc51e57f5a928d54cb86b1cbcc217100df6d4 100644
--- a/tensorflow/stream_executor/cuda/cuda_kernel.h
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.h
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/kernel_cache_config.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/lib/casts.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "cuda/include/cuda.h"
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 622a4a4edb1fe4163831e9429c1a7ab9262f2727..b342e71bdd94f6112d500d86f6ed4051821d2d54 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -209,3 +209,5 @@ REGISTER_MODULE_INITIALIZER(cuda_platform,
 // Note that module initialization sequencing is not supported in the
 // open-source project, so this will be a no-op there.
 REGISTER_MODULE_INITIALIZER_SEQUENCE(cuda_platform, multi_platform_manager);
+REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+                                     cuda_platform);
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 88c4f15792737aac8dfafefba4c7fce74c434320..7f920719321637360fdf5c098e83dfaa49164e6c 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -21,6 +21,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
+
+#ifndef PLATFORM_GOOGLE
+#include "tensorflow/stream_executor/dso_loader.h"
+#endif
+
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
@@ -61,6 +66,7 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
 
 namespace wrap {
 
+#ifdef PLATFORM_GOOGLE
 #define STREAM_EXECUTOR_CURAND_WRAP(__name)                         \
   struct WrapperShim__##__name {                                    \
     template <typename... Args>                                     \
@@ -70,6 +76,36 @@ namespace wrap {
     }                                                               \
   } __name;
 
+#else
+#define STREAM_EXECUTOR_CURAND_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCurandDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in curand DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    curandStatus_t operator()(CUDAExecutor *parent, Args... args) {       \
+      cuda::ScopedActivateExecutorContext sac{parent};                    \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+#endif
+
 STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
 STREAM_EXECUTOR_CURAND_WRAP(curandDestroyGenerator);
 STREAM_EXECUTOR_CURAND_WRAP(curandSetStream);
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 726c4adf748cb81e45c0b3c1fa3033761203ed58..0b991b7ba8cdad7f342adc6c8ff25b88d91e2bd2 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/stream_executor/lib/human_readable.h"
 #include "tensorflow/stream_executor/lib/mathutil.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
 
 namespace stream_executor {
 
@@ -67,20 +67,20 @@ std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
 
   const ThreadDim &thread_dim = thread_dim_limit();
   result["ThreadDim Limit"] =
-      port::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
+      absl::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
   const BlockDim &block_dim = block_dim_limit();
   result["BlockDim Limit"] =
-      port::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
+      absl::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
 
-  result["Threads Per Core Limit"] = port::StrCat(threads_per_core_limit());
-  result["Threads Per Block Limit"] = port::StrCat(threads_per_block_limit());
+  result["Threads Per Core Limit"] = absl::StrCat(threads_per_core_limit());
+  result["Threads Per Block Limit"] = absl::StrCat(threads_per_block_limit());
   result["Registers Per Block Limit"] =
-      port::StrCat(registers_per_block_limit());
+      absl::StrCat(registers_per_block_limit());
 
-  result["Device Address Bits"] = port::StrCat(device_address_bits());
+  result["Device Address Bits"] = absl::StrCat(device_address_bits());
   result["Device Memory Size"] =
       port::HumanReadableNumBytes::ToString(device_memory_size());
-  result["Memory Bandwidth"] = port::StrCat(
+  result["Memory Bandwidth"] = absl::StrCat(
       port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
 
   result["Shared Memory Per Core"] =
@@ -88,14 +88,14 @@ std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
   result["Shared Memory Per Block"] =
       port::HumanReadableNumBytes::ToString(shared_memory_per_block_);
 
-  result["Clock Rate GHz"] = port::StrCat(clock_rate_ghz());
+  result["Clock Rate GHz"] = absl::StrCat(clock_rate_ghz());
 
-  result["CUDA Compute Capability"] = port::StrCat(
+  result["CUDA Compute Capability"] = absl::StrCat(
       cuda_compute_capability_major_, ".", cuda_compute_capability_minor_);
 
-  result["NUMA Node"] = port::StrCat(numa_node());
-  result["Core Count"] = port::StrCat(core_count());
-  result["ECC Enabled"] = port::StrCat(ecc_enabled());
+  result["NUMA Node"] = absl::StrCat(numa_node());
+  result["Core Count"] = absl::StrCat(core_count());
+  result["ECC Enabled"] = absl::StrCat(ecc_enabled());
   return owned_result;
 }
 
@@ -140,21 +140,11 @@ void CalculateDimensionality(const DeviceDescription &device_description,
                              uint64 element_count, uint64 *threads_per_block,
                              uint64 *block_count) {
   *threads_per_block = device_description.threads_per_block_limit();
-  *block_count = DivideCeil(element_count, *threads_per_block);
+  *block_count = port::MathUtil::CeilOfRatio(element_count, *threads_per_block);
   if (*block_count == 1) {
     CHECK_LE(element_count, *threads_per_block);
     *threads_per_block = element_count;
   }
 }
 
-// Round value up to a multiple of n.
-static uint64 RoundUp(uint64 value, uint64 n) {
-  return port::MathUtil::CeilOfRatio(value, n) * n;
-}
-
-// Round value down to a multiple of n.
-static uint64 RoundDown(uint64 value, uint64 n) {
-  return port::MathUtil::FloorOfRatio(value, n) * n;
-}
-
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_memory.h b/tensorflow/stream_executor/device_memory.h
index 5a5334e0f5f6e8744b92188de14d7fea0f2ff9a0..e575b5178bbd2f4ed3425823d497ea8b73bc7258 100644
--- a/tensorflow/stream_executor/device_memory.h
+++ b/tensorflow/stream_executor/device_memory.h
@@ -26,7 +26,6 @@ limitations under the License.
 
 #include <stddef.h>
 
-#include "tensorflow/stream_executor/lib/casts.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 namespace perftools {
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 2a30f922bca4d1dc7d8a9d4ee6e26f7bdf41251c..faa662211ebb366b8e20cdc3e33ca651c64cf73a 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -15,15 +15,15 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/dnn.h"
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
 namespace dnn {
 
 uint64 AlgorithmDesc::hash() const {
-  return ::tensorflow::Hash64Combine(algo_, tensor_ops_enabled_);
+  return ::tensorflow::Hash64Combine(algo_id(), tensor_ops_enabled());
 }
 
 bool DnnSupport::GetConvolveAlgorithms(
@@ -187,6 +187,9 @@ std::tuple<int, int, int> GetDimIndices(const DataLayout& layout,
       batch_idx = 0;
       spatial_idx = 2;
       break;
+
+    default:
+      LOG(FATAL) << "Unknown layout " << layout;
   }
 
   return std::make_tuple(depth_idx, batch_idx, spatial_idx);
@@ -219,35 +222,41 @@ std::vector<int64> ReorderDims(const std::vector<int64>& input,
 // -- AlgorithmConfig
 
 string AlgorithmConfig::ToString() const {
-  return port::StrCat(algorithm_.algo_id(), ", ",
-                      algorithm_no_scratch_.algo_id());
+  AlgorithmDesc::Index algo_id = -1;
+  if (algorithm().has_value()) {
+    algo_id = algorithm()->algo_id();
+  }
+  AlgorithmDesc::Index algo_id_no_scratch = -1;
+  if (algorithm_no_scratch().has_value()) {
+    algo_id_no_scratch = algorithm_no_scratch()->algo_id();
+  }
+  return absl::StrCat(algo_id, ", ", algo_id_no_scratch);
 }
 
 // -- BatchDescriptor
 
 BatchDescriptor::BatchDescriptor(int ndims)
-    : count_(0),
-      feature_map_count_(0),
-      spatial_size_(ndims, 0),
-      value_max_(0.0),
+    : value_max_(0.0),
       value_min_(0.0),
-      layout_(DataLayout::kYXDepthBatch),
-      ndims_(ndims),
-      quantized_activation_mode_(QuantizedActivationMode::k8Bit) {}
+      quantized_activation_mode_(QuantizedActivationMode::k8Bit) {
+  tensor_.mutable_dimensions()->Resize(ndims + 2, 0);
+  set_layout(DataLayout::kYXDepthBatch);
+}
 
 BatchDescriptor::BatchDescriptor() : BatchDescriptor(/*ndims=*/2) {}
 
 std::vector<int64> BatchDescriptor::full_dims(const DataLayout& layout) const {
-  std::vector<int64> bdyx_dims(ndims_ + 2);
+  std::vector<int64> bdyx_dims(ndims() + 2);
   bdyx_dims[0] = count();
   bdyx_dims[1] = feature_map_count();
-  std::copy(spatial_size_.begin(), spatial_size_.end(), bdyx_dims.begin() + 2);
+  std::copy(spatial_size().begin(), spatial_size().end(),
+            bdyx_dims.begin() + 2);
   return ReorderDims(bdyx_dims, DataLayout::kBatchDepthYX, layout);
 }
 
 std::vector<int64> BatchDescriptor::full_strides(
     const DataLayout& layout) const {
-  if (layout_ == DataLayout::kBatchDepthYX4) {
+  if (this->layout() == DataLayout::kBatchDepthYX4) {
     LOG(FATAL)
         << "Cannot compute full strides for batch descriptor " << ToString()
         << ", because its layout is kBatchDepthYX4. In fact, "
@@ -255,53 +264,49 @@ std::vector<int64> BatchDescriptor::full_strides(
            "Use cudnnSetTensor4DDescriptor to set cudnnTensorDescriptor_t "
            "instead.";
   }
-  std::vector<int64> phys_dims = full_dims(layout_);
+  std::vector<int64> phys_dims = full_dims(this->layout());
   std::vector<int64> phys_strides(phys_dims.size());
-  phys_strides[ndims_ + 1] = 1;
-  for (int i = ndims_; i >= 0; i--) {
+  phys_strides[ndims() + 1] = 1;
+  for (int i = ndims(); i >= 0; i--) {
     phys_strides[i] = phys_strides[i + 1] * phys_dims[i + 1];
   }
-  return ReorderDims(phys_strides, layout_, layout);
+  return ReorderDims(phys_strides, this->layout(), layout);
 }
 
 void BatchDescriptor::CloneFrom(const BatchDescriptor& other) {
-  count_ = other.count_;
-  feature_map_count_ = other.feature_map_count_;
-  spatial_size_ = other.spatial_size_;
+  tensor_ = other.tensor_;
   value_max_ = other.value_max_;
   value_min_ = other.value_min_;
-  layout_ = other.layout_;
-  ndims_ = other.ndims_;
   quantized_activation_mode_ = other.quantized_activation_mode_;
 }
 
 string BatchDescriptor::ToString() const {
   string spatial;
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&spatial, "%lld ", spatial_size_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&spatial, "%lld ", spatial_size()[i]);
   }
   return port::Printf(
       "{count: %lld feature_map_count: %lld spatial: %s "
       "value_min: %f value_max: %f layout: %s}",
-      count_, feature_map_count_, spatial.c_str(), value_min_, value_max_,
-      DataLayoutString(layout_).c_str());
+      count(), feature_map_count(), spatial.c_str(), value_min_, value_max_,
+      DataLayoutString(layout()).c_str());
 }
 
 string BatchDescriptor::ToShortString() const {
   // All the constituent strings are less than 15 characters, so the
   // small string optimization ensures that there will be at most one
   // heap memory allocation.
-  string depth = port::StrCat("d", feature_map_count());
-  string batch = port::StrCat("b", count());
+  string depth = absl::StrCat("d", feature_map_count());
+  string batch = absl::StrCat("b", count());
 
   string spatial = "s";
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&spatial, "%lld ", spatial_size_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&spatial, "%lld ", spatial_size()[i]);
   }
 
   string suffix;
   if (value_min() != value_max()) {
-    port::StrAppend(&suffix, "[", value_min(), ";", value_max(), "]");
+    absl::StrAppend(&suffix, "[", value_min(), ";", value_max(), "]");
   }
   if (quantized_activation_mode() == QuantizedActivationMode::k16Bit) {
     suffix += "_16bit";
@@ -309,15 +314,15 @@ string BatchDescriptor::ToShortString() const {
 
   switch (layout()) {
     case DataLayout::kYXDepthBatch:
-      return port::StrCat(spatial, depth, batch, suffix);
+      return absl::StrCat(spatial, depth, batch, suffix);
     case DataLayout::kYXBatchDepth:
-      return port::StrCat(spatial, batch, depth, suffix);
+      return absl::StrCat(spatial, batch, depth, suffix);
     case DataLayout::kBatchYXDepth:
-      return port::StrCat(batch, spatial, depth, suffix);
+      return absl::StrCat(batch, spatial, depth, suffix);
     case DataLayout::kBatchDepthYX:
-      return port::StrCat(batch, depth, spatial, suffix);
+      return absl::StrCat(batch, depth, spatial, suffix);
     case DataLayout::kBatchDepthYX4:
-      return port::StrCat(batch, depth, spatial, suffix, "(VECT_C)");
+      return absl::StrCat(batch, depth, spatial, suffix, "(VECT_C)");
     default:
       LOG(FATAL) << "Unknown layout " << static_cast<int32>(layout());
       return "";  // Avoid return warning (unreachable)
@@ -326,18 +331,18 @@ string BatchDescriptor::ToShortString() const {
 
 int64 BatchDescriptor::NodesPerFeatureMap() const {
   int64 ret = 1;
-  for (int i = 0; i < ndims_; i++) {
-    ret *= spatial_size_[i];
+  for (int i = 0; i < ndims(); i++) {
+    ret *= spatial_size()[i];
   }
   return ret;
 }
 
 int64 BatchDescriptor::NodesAcrossFeatureMaps() const {
-  return NodesPerFeatureMap() * feature_map_count_;
+  return NodesPerFeatureMap() * feature_map_count();
 }
 
 int64 BatchDescriptor::ElementCount() const {
-  return count_ * feature_map_count_ * NodesPerFeatureMap();
+  return count() * feature_map_count() * NodesPerFeatureMap();
 }
 
 int64 BatchDescriptor::FullyConnectedWeightCount(
@@ -365,35 +370,29 @@ BatchDescriptor BatchDescriptor::DepthConcatenateOutputDescriptor(
 
 // -- FilterDescriptor
 
-FilterDescriptor::FilterDescriptor(int ndims)
-    : output_feature_map_count_(0),
-      input_feature_map_count_(0),
-      input_filter_dims_(ndims, 0),
-      ndims_(ndims),
-      layout_(FilterLayout::kOutputInputYX) {}
+FilterDescriptor::FilterDescriptor(int ndims) {
+  tensor_.mutable_dimensions()->Resize(ndims + 2, 0);
+  set_layout(FilterLayout::kOutputInputYX);
+}
 
 FilterDescriptor::FilterDescriptor() : FilterDescriptor(/*ndims=*/2) {}
 
 FilterDescriptor::~FilterDescriptor() {}
 
 void FilterDescriptor::CloneFrom(const FilterDescriptor& other) {
-  set_output_feature_map_count(other.output_feature_map_count())
-      .set_input_feature_map_count(other.input_feature_map_count())
-      .set_layout(other.layout());
-  input_filter_dims_ = other.input_filter_dims_;
-  ndims_ = other.ndims_;
+  tensor_ = other.tensor_;
 }
 
 string FilterDescriptor::ToString() const {
   string desc = port::Printf(
       "{output_feature_map_count: %lld input_feature_map_count: %lld "
       "layout: %s shape: ",
-      output_feature_map_count_, input_feature_map_count_,
-      FilterLayoutString(layout_).c_str());
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&desc, "%lld ", input_filter_dims_[i]);
+      output_feature_map_count(), input_feature_map_count(),
+      FilterLayoutString(layout()).c_str());
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&desc, "%lld ", input_filter_dims()[i]);
   }
-  port::StrAppend(&desc, "}");
+  absl::StrAppend(&desc, "}");
 
   return desc;
 }
@@ -402,48 +401,48 @@ string FilterDescriptor::ToShortString() const {
   // All the constituent strings are less than 15 characters, so the
   // small string optimization ensures that there will be at most one
   // heap memory allocation.
-  string od = port::StrCat("od", output_feature_map_count_);
-  string id = port::StrCat("id", input_feature_map_count_);
+  string od = absl::StrCat("od", output_feature_map_count());
+  string id = absl::StrCat("id", input_feature_map_count());
 
   string spatial = "s";
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&spatial, "%lld ", input_filter_dims_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&spatial, "%lld ", input_filter_dims()[i]);
   }
 
-  switch (layout_) {
+  switch (layout()) {
     case FilterLayout::kOutputInputYX:
-      return port::StrCat(od, id, spatial);
+      return absl::StrCat(od, id, spatial);
     case FilterLayout::kOutputYXInput:
-      return port::StrCat(od, spatial, id);
+      return absl::StrCat(od, spatial, id);
     case FilterLayout::kOutputInputYX4:
-      return port::StrCat(od, id, spatial, "(VECT_C)");
+      return absl::StrCat(od, id, spatial, "(VECT_C)");
     case FilterLayout::kInputYXOutput:
-      return port::StrCat(id, spatial, od);
+      return absl::StrCat(id, spatial, od);
     case FilterLayout::kYXInputOutput:
-      return port::StrCat(spatial, id, od);
+      return absl::StrCat(spatial, id, od);
     default:
-      LOG(FATAL) << "Unknown layout " << static_cast<int32>(layout_);
+      LOG(FATAL) << "Unknown layout " << static_cast<int32>(layout());
       return "";  // Avoid return warning (unreachable)
   }
 }
 
 int64 FilterDescriptor::ComputeWeightCount() const {
-  int64 ret = output_feature_map_count_ * input_feature_map_count_;
-  for (int i = 0; i < ndims_; i++) {
-    ret *= input_filter_dims_[i];
+  int64 ret = output_feature_map_count() * input_feature_map_count();
+  for (int i = 0; i < ndims(); i++) {
+    ret *= input_filter_dims()[i];
   }
   return ret;
 }
 
 // -- ConvolutionDescriptor
 
-ConvolutionDescriptor::ConvolutionDescriptor(int ndims)
-    : zero_padding_(ndims, 0),
-      filter_strides_(ndims, 1),
-      dilation_rates_(ndims, 1),
-      pad_alignment_(PadAlignment::kDefault),
-      group_count_(1),
-      ndims_(ndims) {}
+ConvolutionDescriptor::ConvolutionDescriptor(int ndims) {
+  proto_.mutable_paddings()->Resize(ndims, 0);
+  proto_.mutable_strides()->Resize(ndims, 1);
+  proto_.mutable_dilations()->Resize(ndims, 1);
+  proto_.set_group_count(1);
+  proto_.set_convolution_mode(ConvolutionMode::CROSS_CORRELATION);
+}
 
 ConvolutionDescriptor::ConvolutionDescriptor()
     : ConvolutionDescriptor(/*ndims=*/2) {}
@@ -454,30 +453,30 @@ string ConvolutionDescriptor::ToString() const {
   string padding;
   string strides;
   string dilations;
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&padding, "%lld ", zero_padding_[i]);
-    port::Appendf(&strides, "%lld ", filter_strides_[i]);
-    port::Appendf(&dilations, "%lld ", dilation_rates_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&padding, "%lld ", this->padding()[i]);
+    port::Appendf(&strides, "%lld ", this->strides()[i]);
+    port::Appendf(&dilations, "%lld ", this->dilations()[i]);
   }
 
   return port::Printf(
       "{zero_padding: %s pad_alignment: %s filter_strides: %s dilation_rates: "
       "%s}",
-      padding.c_str(), PadAlignmentString(pad_alignment_).c_str(),
+      padding.c_str(), PadAlignmentString(pad_alignment()).c_str(),
       strides.c_str(), dilations.c_str());
 }
 
 string ConvolutionDescriptor::ToShortString() const {
   string desc;
-  for (int i = 0; i < ndims_; i++) {
+  for (int i = 0; i < ndims(); i++) {
     if (i > 0) port::Appendf(&desc, "_");
-    port::Appendf(&desc, "p%d:%lld", i, zero_padding_[i]);
+    port::Appendf(&desc, "p%d:%lld", i, padding()[i]);
   }
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&desc, "_s%d:%lld", i, filter_strides_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&desc, "_s%d:%lld", i, strides()[i]);
   }
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&desc, "_d%d:%lld", i, dilation_rates_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&desc, "_d%d:%lld", i, dilations()[i]);
   }
   return desc;
 }
@@ -529,7 +528,7 @@ string PoolingDescriptor::ToShortString() const {
     port::Appendf(&strides, "_s%d:%lld", i, strides_[i]);
     port::Appendf(&padding, "_p%d:%lld", i, padding_[i]);
   }
-  return port::StrCat(mode_ == dnn::PoolingMode::kMaximum ? "max" : "avg",
+  return absl::StrCat(mode_ == dnn::PoolingMode::kMaximum ? "max" : "avg",
                       window, strides, padding,
                       propagate_nans_ ? "propagate_nans" : "ignore_nans");
 }
@@ -561,9 +560,9 @@ string NormalizeDescriptor::ToString() const {
 }
 
 string NormalizeDescriptor::ToShortString() const {
-  return port::StrCat("bias:", bias_, "_range:", range_, "_alpha:", alpha_,
-                      "_beta:", beta_, "_wrap:", wrap_around_, "_size:",
-                      segment_size_);
+  return absl::StrCat("bias:", bias_, "_range:", range_, "_alpha:", alpha_,
+                      "_beta:", beta_, "_wrap:", wrap_around_,
+                      "_size:", segment_size_);
 }
 
 }  // namespace dnn
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 621b155240cc11c485b783237035e669ca0d6538..c044a356efb38c333c3153f024092a22fbdf56db 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -27,7 +27,11 @@ limitations under the License.
 #include <memory>
 #include <tuple>
 
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/dnn.pb.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
@@ -46,19 +50,6 @@ class ScratchAllocator;
 
 namespace dnn {
 
-// Describes how an input or output layer's data is formatted.
-// Specify int64 so there's no padding in BatchDescriptor.
-enum class DataLayout : int64 {
-  kYXDepthBatch = 0,  // Same as dist_belief::DF_DEPTH_MAJOR.
-  kYXBatchDepth,      // Same as dist_belief::DF_BATCH_MAJOR.
-  kBatchYXDepth,      // Same as run_brain output, and tensorflow's layout.
-  kBatchDepthYX,      // cuDNN's NCHW layout, data laid out as image, feature
-                      // maps, rows, columns.
-  kBatchDepthYX4,     // cuDNN's NCHW_VECT_C layout, data laid out the same as
-                      // kBatchDepthYX but each element is a vector of 4 feature
-                      // maps.
-};
-
 // Specifies an index to use when accessing specific spatial dimensions.
 enum class DimIndex : int {
   X = 0,
@@ -67,12 +58,31 @@ enum class DimIndex : int {
 };
 
 // Helper functions to make methods more readable.
-inline int64 GetDim(const std::vector<int64>& data, DimIndex dim) {
+inline int64 GetDim(absl::Span<const int64> data, DimIndex dim) {
   return data.rbegin()[static_cast<int64>(dim)];
 }
 
+inline void SetDim(absl::Span<int64> data, DimIndex dim, int64 value) {
+  data.rbegin()[static_cast<int64>(dim)] = value;
+}
+
 inline void SetDim(std::vector<int64>* data, DimIndex dim, int64 value) {
-  data->rbegin()[static_cast<int64>(dim)] = value;
+  return SetDim(absl::MakeSpan(*data), dim, value);
+}
+
+// tensorflow::int64 is not the same type as tensorflow::protobuf_int64 in
+// open-source. Wrapper function that gives an int64 array slice view of a
+// repeated int64 protobuf field.
+inline absl::Span<const int64> AsInt64Slice(
+    const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>& v) {
+  return absl::Span<const int64>(reinterpret_cast<const int64*>(v.data()),
+                                 v.size());
+}
+
+inline absl::Span<int64> AsInt64Slice(
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* v) {
+  return absl::Span<int64>(reinterpret_cast<int64*>(v->mutable_data()),
+                           v->size());
 }
 
 // Returns a string representation of the given data layout.
@@ -85,14 +95,6 @@ enum class QuantizedActivationMode {
   k32Bit = 4,
 };
 
-// Specifies the data type used by an operation.
-enum class DataType {
-  kFloat = 0,
-  kDouble = 1,
-  kHalf = 2,
-  kInt8 = 3,
-};
-
 // A helper class to convert C/C++ types to the proper enums.
 template <typename T>
 struct ToDataType;
@@ -112,6 +114,10 @@ template <>
 struct ToDataType<int8> {
   static constexpr DataType value = DataType::kInt8;
 };
+template <>
+struct ToDataType<int32> {
+  static constexpr DataType value = DataType::kInt32;
+};
 
 // Specifies the types of a RNN model.
 enum class RnnMode {
@@ -243,15 +249,15 @@ class BatchDescriptor {
   string ToShortString() const;
 
   // Accessors.
-  int64 count() const { return count_; }
-  int64 feature_map_count() const { return feature_map_count_; }
-  int64 height() const { return GetDim(spatial_size_, DimIndex::Y); }
-  int64 width() const { return GetDim(spatial_size_, DimIndex::X); }
-  int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size_, dim); }
-  int ndims() const { return ndims_; }
+  int64 count() const { return tensor_.dimensions(0); }
+  int64 feature_map_count() const { return tensor_.dimensions(1); }
+  int64 height() const { return GetDim(spatial_size(), DimIndex::Y); }
+  int64 width() const { return GetDim(spatial_size(), DimIndex::X); }
+  int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size(), dim); }
+  int ndims() const { return spatial_size().size(); }
   float value_max() const { return value_max_; }
   float value_min() const { return value_min_; }
-  DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return tensor_.data_layout(); }
   QuantizedActivationMode quantized_activation_mode() const {
     return quantized_activation_mode_;
   }
@@ -265,23 +271,23 @@ class BatchDescriptor {
 
   // Named-argument helpers for avoiding user error during construction.
   BatchDescriptor& set_count(int64 value) {
-    count_ = value;
+    tensor_.set_dimensions(0, value);
     return *this;
   }
   BatchDescriptor& set_feature_map_count(int64 value) {
-    feature_map_count_ = value;
+    tensor_.set_dimensions(1, value);
     return *this;
   }
   BatchDescriptor& set_height(int64 value) {
-    SetDim(&spatial_size_, DimIndex::Y, value);
+    SetDim(spatial_size(), DimIndex::Y, value);
     return *this;
   }
   BatchDescriptor& set_width(int64 value) {
-    SetDim(&spatial_size_, DimIndex::X, value);
+    SetDim(spatial_size(), DimIndex::X, value);
     return *this;
   }
   BatchDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
-    SetDim(&spatial_size_, dim, value);
+    SetDim(spatial_size(), dim, value);
     return *this;
   }
   BatchDescriptor& set_value_max(float value) {
@@ -293,7 +299,7 @@ class BatchDescriptor {
     return *this;
   }
   BatchDescriptor& set_layout(DataLayout layout) {
-    layout_ = layout;
+    tensor_.set_data_layout(layout);
     return *this;
   }
   BatchDescriptor& set_quantized_activation_mode(
@@ -332,31 +338,20 @@ class BatchDescriptor {
       port::ArraySlice<dnn::BatchDescriptor> inputs);
 
  private:
-  int64 count_;
-  int64 feature_map_count_;
-  // Stored as: ..., y, x.
-  std::vector<int64> spatial_size_;
+  absl::Span<const int64> spatial_size() const {
+    return AsInt64Slice(tensor_.dimensions()).subspan(2);
+  }
+
+  absl::Span<int64> spatial_size() {
+    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
+  }
+
+  TensorDescriptorProto tensor_;
   float value_max_;
   float value_min_;
-  DataLayout layout_;
-  int ndims_;
   QuantizedActivationMode quantized_activation_mode_;
 };
 
-// Describes how a filter is laid out in the memory.
-// Specify int64 so there's no padding in FilterDescriptor.
-enum class FilterLayout : int64 {
-  kOutputInputYX = 0,  // cuDNN's default filter layout, laid out as:
-                       // (major) output feature maps >> input feature maps >>
-                       // rows >> columns (minor).
-  kOutputYXInput,      // major to minor:
-                       //   (output features, row, columns, input features)
-  kOutputInputYX4,  // laid out the same as kOutputInputYX but each element is a
-                    // vector of 4 feature maps.
-  kInputYXOutput,   // Same as dist_belief's default filter layout.
-  kYXInputOutput,   // Same as tensorflow's default filter layout.
-};
-
 // Returns a string representation of the given filter layout.
 string FilterLayoutString(FilterLayout layout);
 
@@ -396,30 +391,30 @@ class FilterDescriptor {
 
   // Named-argument helpers for avoiding user error during construction.
   FilterDescriptor& set_output_feature_map_count(int64 value) {
-    output_feature_map_count_ = value;
+    tensor_.set_dimensions(0, value);
     return *this;
   }
   FilterDescriptor& set_input_feature_map_count(int64 value) {
-    input_feature_map_count_ = value;
+    tensor_.set_dimensions(1, value);
     return *this;
   }
   FilterDescriptor& set_input_filter_height(int64 value) {
-    SetDim(&input_filter_dims_, DimIndex::Y, value);
+    SetDim(input_filter_dims(), DimIndex::Y, value);
     return *this;
   }
   FilterDescriptor& set_input_filter_width(int64 value) {
-    SetDim(&input_filter_dims_, DimIndex::X, value);
+    SetDim(input_filter_dims(), DimIndex::X, value);
     return *this;
   }
   FilterDescriptor& set_layout(FilterLayout layout) {
-    layout_ = layout;
+    tensor_.set_filter_layout(layout);
     return *this;
   }
   FilterDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
-    SetDim(&input_filter_dims_, dim, value);
+    SetDim(input_filter_dims(), dim, value);
     return *this;
   }
-  int ndims() const { return ndims_; }
+  int ndims() const { return input_filter_dims().size(); }
 
   void CloneFrom(const FilterDescriptor& other);
 
@@ -432,30 +427,32 @@ class FilterDescriptor {
 
   // Returns the number of biases required as parameters for a convolution
   // using this filter descriptor.
-  int64 bias_count() const { return output_feature_map_count_; }
+  int64 bias_count() const { return output_feature_map_count(); }
 
-  int64 output_feature_map_count() const { return output_feature_map_count_; }
-  int64 input_feature_map_count() const { return input_feature_map_count_; }
+  int64 output_feature_map_count() const { return tensor_.dimensions(0); }
+  int64 input_feature_map_count() const { return tensor_.dimensions(1); }
   int64 input_filter_height() const {
-    return GetDim(input_filter_dims_, DimIndex::Y);
+    return GetDim(input_filter_dims(), DimIndex::Y);
   }
   int64 input_filter_width() const {
-    return GetDim(input_filter_dims_, DimIndex::X);
+    return GetDim(input_filter_dims(), DimIndex::X);
   }
   int64 input_filter_dim(DimIndex dim) const {
-    return GetDim(input_filter_dims_, dim);
+    return GetDim(input_filter_dims(), dim);
   }
 
-  FilterLayout layout() const { return layout_; }
-  std::vector<int64> input_filter_dims() const { return input_filter_dims_; }
+  FilterLayout layout() const { return tensor_.filter_layout(); }
+
+  absl::Span<const int64> input_filter_dims() const {
+    return AsInt64Slice(tensor_.dimensions()).subspan(2);
+  }
 
  private:
-  int64 output_feature_map_count_;
-  int64 input_feature_map_count_;
-  // Stored as: ..., y, x.
-  std::vector<int64> input_filter_dims_;
-  int ndims_;
-  FilterLayout layout_;
+  absl::Span<int64> input_filter_dims() {
+    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
+  }
+
+  TensorDescriptorProto tensor_;
 };
 
 // Describes how padding should be aligned when the total number of pad
@@ -496,6 +493,11 @@ std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment);
 //   cells between each filter element in the "y dimension".
 // - horizontal_dilation_rate: there will be (horizontal_dilation_rate - 1)
 //   skipped cells between each filter element in the "x dimension".
+// - convolution_not_crosscor: By default (convolution_not_crosscor == false),
+//   we perform cross correlation rather than convolution. With the flag set,
+//   we perform convolution. Convolution and cross correlation are related by
+//   rotating the filter by 180 degrees (or equivalently flipping all spatial
+//   dimensions).
 class ConvolutionDescriptor {
  public:
   // By default construction, there is no zero-padding and the filter stride is
@@ -509,87 +511,102 @@ class ConvolutionDescriptor {
   string ToShortString() const;
 
   ConvolutionDescriptor& set_zero_padding_height(int64 value) {
-    SetDim(&zero_padding_, DimIndex::Y, value);
+    SetDim(padding(), DimIndex::Y, value);
     return *this;
   }
   ConvolutionDescriptor& set_zero_padding_width(int64 value) {
-    SetDim(&zero_padding_, DimIndex::X, value);
+    SetDim(padding(), DimIndex::X, value);
     return *this;
   }
   ConvolutionDescriptor& set_zero_padding(DimIndex dim, int64 value) {
-    SetDim(&zero_padding_, dim, value);
+    SetDim(padding(), dim, value);
     return *this;
   }
   ConvolutionDescriptor& set_vertical_filter_stride(int64 value) {
-    SetDim(&filter_strides_, DimIndex::Y, value);
+    SetDim(strides(), DimIndex::Y, value);
     return *this;
   }
   ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) {
-    SetDim(&filter_strides_, DimIndex::X, value);
+    SetDim(strides(), DimIndex::X, value);
     return *this;
   }
   ConvolutionDescriptor& set_filter_stride(DimIndex dim, int64 value) {
-    SetDim(&filter_strides_, dim, value);
+    SetDim(strides(), dim, value);
     return *this;
   }
   ConvolutionDescriptor& set_vertical_dilation_rate(int64 value) {
-    SetDim(&dilation_rates_, DimIndex::Y, value);
+    SetDim(dilations(), DimIndex::Y, value);
     return *this;
   }
   ConvolutionDescriptor& set_horizontal_dilation_rate(int64 value) {
-    SetDim(&dilation_rates_, DimIndex::X, value);
+    SetDim(dilations(), DimIndex::X, value);
     return *this;
   }
   ConvolutionDescriptor& set_dilation_rate(DimIndex dim, int64 value) {
-    SetDim(&dilation_rates_, dim, value);
-    return *this;
-  }
-  ConvolutionDescriptor& set_pad_alignment(PadAlignment pad_alignment) {
-    pad_alignment_ = pad_alignment;
+    SetDim(dilations(), dim, value);
     return *this;
   }
   ConvolutionDescriptor& set_group_count(int group_count) {
-    group_count_ = group_count;
+    proto_.set_group_count(group_count);
     return *this;
   }
-  int64 zero_padding_height() const {
-    return GetDim(zero_padding_, DimIndex::Y);
-  }
-  int64 zero_padding_width() const {
-    return GetDim(zero_padding_, DimIndex::X);
+  ConvolutionDescriptor& set_convolution_not_crosscorr(bool conv) {
+    proto_.set_convolution_mode(conv ? ConvolutionMode::CONVOLUTION
+                                     : ConvolutionMode::CROSS_CORRELATION);
+    return *this;
   }
+  int64 zero_padding_height() const { return GetDim(padding(), DimIndex::Y); }
+  int64 zero_padding_width() const { return GetDim(padding(), DimIndex::X); }
   int64 vertical_filter_stride() const {
-    return GetDim(filter_strides_, DimIndex::Y);
+    return GetDim(strides(), DimIndex::Y);
   }
   int64 horizontal_filter_stride() const {
-    return GetDim(filter_strides_, DimIndex::X);
+    return GetDim(strides(), DimIndex::X);
   }
   int64 vertical_dilation_rate() const {
-    return GetDim(dilation_rates_, DimIndex::Y);
+    return GetDim(dilations(), DimIndex::Y);
   }
   int64 horizontal_dilation_rate() const {
-    return GetDim(dilation_rates_, DimIndex::X);
+    return GetDim(dilations(), DimIndex::X);
   }
 
-  int zero_padding(DimIndex dim) const { return GetDim(zero_padding_, dim); }
-  int filter_stride(DimIndex dim) const { return GetDim(filter_strides_, dim); }
-  int dilation_rate(DimIndex dim) const { return GetDim(dilation_rates_, dim); }
-  PadAlignment pad_alignment() const { return pad_alignment_; }
-  int group_count() const { return group_count_; }
-  int ndims() const { return ndims_; }
+  int zero_padding(DimIndex dim) const { return GetDim(padding(), dim); }
+  int filter_stride(DimIndex dim) const { return GetDim(strides(), dim); }
+  int dilation_rate(DimIndex dim) const { return GetDim(dilations(), dim); }
+  // TODO(timshen): remove this function. No users of this class is setting a
+  // non-default pad alignment.
+  PadAlignment pad_alignment() const { return PadAlignment::kDefault; }
+  int group_count() const { return proto_.group_count(); }
+  int ndims() const { return padding().size(); }
+  bool convolution_not_crosscorr() const {
+    return proto_.convolution_mode() == ConvolutionMode::CONVOLUTION;
+  }
 
-  std::vector<int64> strides() const { return filter_strides_; }
-  std::vector<int64> dilations() const { return dilation_rates_; }
-  std::vector<int64> padding() const { return zero_padding_; }
+  absl::Span<const int64> strides() const {
+    return AsInt64Slice(proto_.strides());
+  }
+
+  absl::Span<const int64> dilations() const {
+    return AsInt64Slice(proto_.dilations());
+  }
+
+  absl::Span<const int64> padding() const {
+    return AsInt64Slice(proto_.paddings());
+  }
 
  private:
-  // Stored as: .. y, x.
-  std::vector<int64> zero_padding_;
-  std::vector<int64> filter_strides_;
-  std::vector<int64> dilation_rates_;
-  PadAlignment pad_alignment_;
-  int group_count_;
-  int ndims_;
+  absl::Span<int64> strides() { return AsInt64Slice(proto_.mutable_strides()); }
+
+  absl::Span<int64> dilations() {
+    return AsInt64Slice(proto_.mutable_dilations());
+  }
+
+  absl::Span<int64> padding() {
+    return AsInt64Slice(proto_.mutable_paddings());
+  }
+
+  ConvolutionDescriptorProto proto_;
+
   // TODO(leary) cudnn provides these fields, but need to characterize what
   // their effect is -- they may be boolean rather than integral.
   // int64 upscale_input_x;
@@ -693,9 +710,9 @@ class PoolingDescriptor {
   int64 vertical_stride() const { return GetDim(strides_, DimIndex::Y); }
   int64 horizontal_stride() const { return GetDim(strides_, DimIndex::X); }
   int64 stride(DimIndex dim) const { return GetDim(strides_, dim); }
-  std::vector<int64> window() const { return window_; }
-  std::vector<int64> padding() const { return padding_; }
-  std::vector<int64> strides() const { return strides_; }
+  absl::Span<const int64> window() const { return window_; }
+  absl::Span<const int64> padding() const { return padding_; }
+  absl::Span<const int64> strides() const { return strides_; }
   bool propagate_nans() const { return propagate_nans_; }
 
  private:
@@ -713,31 +730,23 @@ class PoolingDescriptor {
 class AlgorithmDesc {
  public:
   typedef int64 Index;
-  AlgorithmDesc()
-      : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true), scratch_size_(0) {}
-  AlgorithmDesc(Index a, bool use_tensor_ops)
-      : algo_(a), tensor_ops_enabled_(use_tensor_ops), scratch_size_(0) {}
-  AlgorithmDesc(Index a, bool use_tensor_ops, size_t scratch_size)
-      : algo_(a),
-        tensor_ops_enabled_(use_tensor_ops),
-        scratch_size_(scratch_size) {}
-  bool is_default() const { return algo_ == kDefaultAlgorithm; }
-  bool tensor_ops_enabled() const { return tensor_ops_enabled_; }
-  Index algo_id() const { return algo_; }
-  size_t scratch_size() const { return scratch_size_; }
-  void set_scratch_size(size_t val) { scratch_size_ = val; }
+  AlgorithmDesc(Index a, bool use_tensor_ops) {
+    proto_.set_algo_id(a);
+    proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
+                                        : AlgorithmProto::DEFAULT_MATH);
+  }
+  bool tensor_ops_enabled() const {
+    return proto_.math_type() == AlgorithmProto::TENSOR_OP_MATH;
+  }
+  Index algo_id() const { return proto_.algo_id(); }
   bool operator==(const AlgorithmDesc& other) const {
-    return this->algo_ == other.algo_ &&
-           this->tensor_ops_enabled_ == other.tensor_ops_enabled_ &&
-           this->scratch_size_ == other.scratch_size_;
+    return algo_id() == other.algo_id() &&
+           tensor_ops_enabled() == other.tensor_ops_enabled();
   }
   uint64 hash() const;
 
  private:
-  enum { kDefaultAlgorithm = -1 };
-  Index algo_;
-  bool tensor_ops_enabled_;
-  size_t scratch_size_;
+  AlgorithmProto proto_;
 };
 
 // Describes the result from a perf experiment.
@@ -748,17 +757,25 @@ class AlgorithmDesc {
 class ProfileResult {
  public:
   bool is_valid() const {
-    return (!algorithm_.is_default() &&
-            elapsed_time_in_ms_ != std::numeric_limits<float>::max());
+    return algorithm_.has_value() &&
+           elapsed_time_in_ms() != std::numeric_limits<float>::max();
   }
-  AlgorithmDesc algorithm() const { return algorithm_; }
+
+  AlgorithmDesc algorithm() const { return *algorithm_; }
   void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }
+
   float elapsed_time_in_ms() const { return elapsed_time_in_ms_; }
   void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; }
 
+  size_t scratch_size() const { return scratch_size_; }
+  void set_scratch_size(size_t val) { scratch_size_ = val; }
+
  private:
-  AlgorithmDesc algorithm_;
+  absl::optional<AlgorithmDesc> algorithm_;
   float elapsed_time_in_ms_ = std::numeric_limits<float>::max();
+  // The scratch size algorithm_ requires. Currently it's only populated by
+  // convolutions.
+  size_t scratch_size_ = 0;
 };
 
 // Describes the configuration for the algorithms that will used.
@@ -773,9 +790,11 @@ class AlgorithmConfig {
   explicit AlgorithmConfig(AlgorithmDesc algorithm) : algorithm_(algorithm) {}
   AlgorithmConfig(AlgorithmDesc algorithm, AlgorithmDesc algorithm_no_scratch)
       : algorithm_(algorithm), algorithm_no_scratch_(algorithm_no_scratch) {}
-  AlgorithmDesc algorithm() const { return algorithm_; }
+  absl::optional<AlgorithmDesc> algorithm() const { return algorithm_; }
   void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }
-  AlgorithmDesc algorithm_no_scratch() const { return algorithm_no_scratch_; }
+  absl::optional<AlgorithmDesc> algorithm_no_scratch() const {
+    return algorithm_no_scratch_;
+  }
   void set_algorithm_no_scratch(AlgorithmDesc val) {
     algorithm_no_scratch_ = val;
   }
@@ -789,8 +808,8 @@ class AlgorithmConfig {
   string ToString() const;
 
  private:
-  AlgorithmDesc algorithm_;
-  AlgorithmDesc algorithm_no_scratch_;
+  absl::optional<AlgorithmDesc> algorithm_;
+  absl::optional<AlgorithmDesc> algorithm_no_scratch_;
 };
 
 // Describes a local response normalization (LRN). LRN is used e.g. in
@@ -871,24 +890,6 @@ class NormalizeDescriptor {
   int32 segment_size_;
 };
 
-// Describes a kind of non-linearity (threshold-like mathematical function).
-enum class ActivationMode {
-  kNone = 0,
-  kSigmoid,
-  // Rectified linear activation: f(x) = x < 0 ? 0 : x
-  kRelu,
-  // Rectified linear activation, where upper maximum is 6.0.
-  kRelu6,
-  // Rectified linear activation, where upper maximum specified by
-  // BatchDescriptor::value_max().
-  kReluX,
-  kTanh,
-  // Like ReluX, but passes all values in the range [-X,X].
-  kBandPass,
-
-  kNumActivationModes,  // Always in the end.
-};
-
 // Returns a string representation of the given activation mode.
 string ActivationModeString(ActivationMode mode);
 
@@ -917,6 +918,23 @@ class VersionInfo {
 // Suite of operations typically used for implementing Deep/Convolutional Neural
 // Nets. Note: A false return value of an operation indicates the
 // implementation is not available.
+//
+// TODO(b/118763918): this class (or rather dispatch table) has several
+// problems:
+// * Some overloads are missing. Ideally we want to have template virtual
+//   functions while the template arguments is a closed set. However, we don't
+//   get that from the language.
+// * The API is a union of cuDNN and another private backend. Only 10% of the
+//   functions are actually implemented by both backends, the rest are
+//   actually backend-specific. The massive interface creates extra mental
+//   burden.
+// * Poor error handling: the API should return Status objects.
+//
+// Things worth trying:
+// * Move functions that are not actually common back to the backends. Then,
+//   callers may use dynamic_cast to access specific backends. This may not be
+//   that hard, as many of the callers are Stream::ThenXxx functions.
+// * Change all the returned bools to Status.
 class DnnSupport {
  public:
   DnnSupport() {}
diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto
new file mode 100644
index 0000000000000000000000000000000000000000..56b079c3f5b962636e7c75b46449adca8e13a43e
--- /dev/null
+++ b/tensorflow/stream_executor/dnn.proto
@@ -0,0 +1,103 @@
+// LINT: LEGACY_NAMES
+syntax = "proto3";
+
+package stream_executor.dnn;
+
+// Specifies the data type used by an operation.
+enum DataType {
+  kFloat = 0;
+  kDouble = 1;
+  kHalf = 2;
+  kInt8 = 3;
+  kInt32 = 4;
+}
+
+// Describes how a convolution input or output layer's data is formatted.
+enum DataLayout {
+  // Naming convention:
+  // Y <-> row or height
+  // X <-> column or width
+  // Batch <-> batch, or N
+  // Depth <-> feature, or channel
+  // TODO(timshen): turn them into cuDNN names, e.g. kNCHW.
+  kYXDepthBatch = 0;
+  kYXBatchDepth = 1;
+  kBatchYXDepth = 2;   // cuDNN's NHWC layout
+  kBatchDepthYX = 3;   // cuDNN's NCHW layout
+  kBatchDepthYX4 = 4;  // cuDNN's NCHW_VECT_C layout
+}
+
+// Describes how a convolution filter is laid out in the memory.
+enum FilterLayout {
+  // Naming convention:
+  // Y <-> row or height
+  // X <-> column or width
+  // Output <-> output feature, or N
+  // Input <-> input feature, or N
+  // TODO(timshen): turn them into cuDNN names, e.g. kNCHW.
+  kOutputInputYX = 0;   // cuDNN's NCHW layout
+  kOutputYXInput = 1;   // cuDNN's NHWC layout
+  kOutputInputYX4 = 2;  // cuDNN's NCHW_VECT_C layout
+  kInputYXOutput = 3;
+  kYXInputOutput = 4;
+}
+
+// Describes a kind of non-linearity (threshold-like mathematical function).
+enum ActivationMode {
+  kNone = 0;
+  kSigmoid = 1;
+  // Rectified linear activation: f(x) = x < 0 ? 0 : x
+  kRelu = 2;
+  // Rectified linear activation; where upper maximum is 6.0.
+  kRelu6 = 3;
+  // Rectified linear activation; where upper maximum specified by
+  // BatchDescriptor::value_max().
+  kReluX = 4;
+  kTanh = 5;
+  // Like ReluX; but passes all values in the range [-X,X].
+  kBandPass = 6;
+}
+
+// Describe the math definition for the conv op. The popular behavior is
+// actually called cross-correlation in math, despite the operation is often
+// referred as convolution. See cuDNN cudnnConvolutionMode_t.
+enum ConvolutionMode {
+  CROSS_CORRELATION = 0;
+  CONVOLUTION = 1;
+}
+
+// Generic tensor representation.
+message TensorDescriptorProto {
+  repeated int64 dimensions = 1;
+  DataType data_type = 2;
+  oneof layout_oneof {
+    DataLayout data_layout = 3;
+    FilterLayout filter_layout = 4;
+  }
+}
+
+// Generic algorithm representation.
+message AlgorithmProto {
+  enum MathType {
+    DEFAULT_MATH = 0;
+    // The GPU may operate 4x4 matrix FMA.
+    // See cuDNN's documentation for CUDNN_TENSOR_OP_MATH.
+    TENSOR_OP_MATH = 1;
+  }
+  int64 algo_id = 1;
+  MathType math_type = 2;
+}
+
+// Convolution-specific parameters.
+message ConvolutionDescriptorProto {
+  repeated int64 paddings = 1;
+  repeated int64 strides = 2;
+  repeated int64 dilations = 3;
+  // The "accumulator" type. For example, use F32 as an accumulator for F16
+  // convolutions.
+  // See cuDNN's cudnnConvolutionMode_t.
+  DataType compute_mode = 4;
+  // See cuDNN's group count.
+  int32 group_count = 5;
+  ConvolutionMode convolution_mode = 6;
+}
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
index ea5dffd15e50969af45e3153e648dd47ab30610b..6dda5d63155d8f9cf8d068b3feae51b1fba88a51 100644
--- a/tensorflow/stream_executor/dso_loader.cc
+++ b/tensorflow/stream_executor/dso_loader.cc
@@ -23,17 +23,18 @@ limitations under the License.
 #include <initializer_list>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/platform/load_library.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/path.h"
 #include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 #if !defined(PLATFORM_GOOGLE)
+#include "absl/strings/string_view.h"
 #include "cuda/cuda_config.h"
 #endif
 
@@ -119,12 +120,12 @@ static mutex& GetRpathMutex() {
   return *mu;
 }
 
-/* static */ void DsoLoader::RegisterRpath(port::StringPiece path) {
+/* static */ void DsoLoader::RegisterRpath(absl::string_view path) {
   mutex_lock lock{GetRpathMutex()};
   GetRpaths()->emplace_back(path);
 }
 
-/* static */ port::Status DsoLoader::GetDsoHandle(port::StringPiece path,
+/* static */ port::Status DsoLoader::GetDsoHandle(absl::string_view path,
                                                   void** dso_handle,
                                                   LoadKind load_kind) {
   if (load_kind != LoadKind::kLocal) {
@@ -145,7 +146,7 @@ static mutex& GetRpathMutex() {
 #endif
     ;
     return port::Status(port::error::FAILED_PRECONDITION,
-                        port::StrCat("could not dlopen DSO: ", path,
+                        absl::StrCat("could not dlopen DSO: ", path,
                                      "; dlerror: ", s.error_message()));
   }
   LOG(INFO) << "successfully opened CUDA library " << path << " locally";
@@ -190,13 +191,13 @@ static std::vector<string>* CreatePrimordialRpaths() {
 #endif
 }
 
-/* static */ string DsoLoader::FindDsoPath(port::StringPiece library_name,
-                                           port::StringPiece runfiles_relpath) {
+/* static */ string DsoLoader::FindDsoPath(absl::string_view library_name,
+                                           absl::string_view runfiles_relpath) {
   // Keep a record of the paths we attempted so we can dump out meaningful
   // diagnostics if no path is found.
   std::vector<string> attempted;
 
-  using StringPieces = std::vector<port::StringPiece>;
+  using StringPieces = std::vector<absl::string_view>;
   string candidate;
 
   // Otherwise, try binary-plus-rpath locations.
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/dso_loader.h
index 9ee081cb3d64e8878fa9d7f0c33da7f6827da620..f063b68d6058f7b1faecfd83d3d21b899cf027a3 100644
--- a/tensorflow/stream_executor/dso_loader.h
+++ b/tensorflow/stream_executor/dso_loader.h
@@ -22,9 +22,9 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform/port.h"
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 
@@ -48,7 +48,7 @@ class DsoLoader {
   static port::Status GetLibcuptiDsoHandle(void** dso_handle);
 
   // Registers a new binary-relative path to use as a dlopen search path.
-  static void RegisterRpath(port::StringPiece path);
+  static void RegisterRpath(absl::string_view path);
 
  private:
   // Registered rpaths (singleton vector) and a mutex that guards it.
@@ -61,10 +61,9 @@ class DsoLoader {
   // Loads a DSO from the given "path" (which can technically be any dlopen-able
   // name). If the load kind is global, the symbols in the loaded DSO are
   // visible to subsequent DSO loading operations.
-  static port::Status GetDsoHandle(port::StringPiece path, void** dso_handle,
+  static port::Status GetDsoHandle(absl::string_view path, void** dso_handle,
                                    LoadKind load_kind = LoadKind::kLocal);
 
-
   // Returns the binary directory (or binary path) associated with the currently
   // executing program. If strip_executable_name is true, the executable file is
   // stripped off of the path.
@@ -80,8 +79,8 @@ class DsoLoader {
   //   library_name: the filename in tree; e.g. libOpenCL.so.1.0.0
   //   runfiles_relpath: where to look for the library relative to the runfiles
   //      root; e.g. third_party/gpus/cuda/lib64
-  static string FindDsoPath(port::StringPiece library_name,
-                            port::StringPiece runfiles_relpath);
+  static string FindDsoPath(absl::string_view library_name,
+                            absl::string_view runfiles_relpath);
 
   // Return platform dependent paths for DSOs
   static string GetCudaLibraryDirPath();
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 8adf739b170c42e5aeda5ccf3ea469f2c3cea07c..1396a83dfb1e0217b795d463323aafbcce081e65 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -148,8 +148,13 @@ port::Status HostExecutor::SynchronousMemcpyDeviceToDevice(
 }
 
 bool HostExecutor::HostCallback(Stream *stream,
-                                std::function<void()> callback) {
-  AsHostStream(stream)->EnqueueTask(callback);
+                                std::function<port::Status()> callback) {
+  AsHostStream(stream)->EnqueueTask([callback]() {
+    port::Status s = callback();
+    if (!s.ok()) {
+      LOG(WARNING) << "Host callback failed: " << s;
+    }
+  });
   return true;
 }
 
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index 7ba1f181015e057b66e7e7287a592d5f2af1ead2..56e3c2aa6a946357b588f84fdd4c2375ee7e50ff 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -103,7 +103,8 @@ class HostExecutor : public internal::StreamExecutorInterface {
                                                const DeviceMemoryBase &gpu_src,
                                                uint64 size) override;
 
-  bool HostCallback(Stream *stream, std::function<void()> callback) override;
+  bool HostCallback(Stream *stream,
+                    std::function<port::Status()> callback) override;
 
   port::Status AllocateEvent(Event *event) override {
     return port::Status(port::error::UNIMPLEMENTED, "");
diff --git a/tensorflow/stream_executor/host/host_platform.cc b/tensorflow/stream_executor/host/host_platform.cc
index 410dc9da899cc967b36c1738a6b7c128a98cf70c..d16cca8dcc041d1a78ed93c42aa14ba0ff692239 100644
--- a/tensorflow/stream_executor/host/host_platform.cc
+++ b/tensorflow/stream_executor/host/host_platform.cc
@@ -103,3 +103,5 @@ REGISTER_MODULE_INITIALIZER(host_platform,
 // Note that module initialization sequencing is not supported in the
 // open-source project, so this will be a no-op there.
 REGISTER_MODULE_INITIALIZER_SEQUENCE(host_platform, multi_platform_manager);
+REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+                                     host_platform);
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
index e84b7e6cc2fbf257fb4989a9496825c4f1fd0788..240e955b6ff3d83bc3721cc06af04dc2f9ac263a 100644
--- a/tensorflow/stream_executor/kernel.cc
+++ b/tensorflow/stream_executor/kernel.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/stream_executor/lib/demangle.h"
 #include "tensorflow/stream_executor/platform.h"
@@ -93,9 +94,9 @@ KernelCacheConfig KernelBase::GetPreferredCacheConfig() const {
 // Prefix stub functions emitted by the CUDA splitter.
 static const char *kStubPrefix = "__device_stub_";
 
-void KernelBase::set_name(port::StringPiece name) {
+void KernelBase::set_name(absl::string_view name) {
   name_ = string(name);
-  port::StringPiece stubless_name = name;
+  absl::string_view stubless_name = name;
   if (tensorflow::str_util::StartsWith(name, kStubPrefix)) {
     stubless_name.remove_prefix(strlen(kStubPrefix));
   }
diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h
index 2216884b873cda98f09782866f23c06088b73e09..9384db6858291d44c4767378fecff0505e53acbc 100644
--- a/tensorflow/stream_executor/kernel.h
+++ b/tensorflow/stream_executor/kernel.h
@@ -75,11 +75,10 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/kernel_cache_config.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 namespace stream_executor {
@@ -178,7 +177,7 @@ class KernelBase {
   // Gets the preferred cache configuration for a kernel.
   KernelCacheConfig GetPreferredCacheConfig() const;
 
-  void set_name(port::StringPiece name);
+  void set_name(absl::string_view name);
   const string &name() const { return name_; }
   const string &demangled_name() const { return demangled_name_; }
 
diff --git a/tensorflow/stream_executor/kernel_spec.cc b/tensorflow/stream_executor/kernel_spec.cc
index 1eaa0806993b1d9675421b78dea46ccf8e729d2e..2e090af7169ff59e3f1f6bf669128480181a8e59 100644
--- a/tensorflow/stream_executor/kernel_spec.cc
+++ b/tensorflow/stream_executor/kernel_spec.cc
@@ -14,26 +14,27 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/stream_executor/kernel_spec.h"
+#include "absl/strings/string_view.h"
 
 namespace stream_executor {
 
-KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname)
+KernelLoaderSpec::KernelLoaderSpec(absl::string_view kernelname)
     : kernelname_(string(kernelname)) {}
 
-OnDiskKernelLoaderSpec::OnDiskKernelLoaderSpec(port::StringPiece filename,
-                                               port::StringPiece kernelname)
+OnDiskKernelLoaderSpec::OnDiskKernelLoaderSpec(absl::string_view filename,
+                                               absl::string_view kernelname)
     : KernelLoaderSpec(kernelname), filename_(string(filename)) {}
 
-CudaPtxOnDisk::CudaPtxOnDisk(port::StringPiece filename,
-                             port::StringPiece kernelname)
+CudaPtxOnDisk::CudaPtxOnDisk(absl::string_view filename,
+                             absl::string_view kernelname)
     : OnDiskKernelLoaderSpec(filename, kernelname) {}
 
-CudaCubinOnDisk::CudaCubinOnDisk(port::StringPiece filename,
-                                 port::StringPiece kernelname)
+CudaCubinOnDisk::CudaCubinOnDisk(absl::string_view filename,
+                                 absl::string_view kernelname)
     : OnDiskKernelLoaderSpec(filename, kernelname) {}
 
 CudaCubinInMemory::CudaCubinInMemory(const char *bytes,
-                                     port::StringPiece kernelname)
+                                     absl::string_view kernelname)
     : KernelLoaderSpec(kernelname), bytes_(bytes) {}
 
 bool CompareComputeCapability(const std::tuple<int, int> &lhs,
@@ -45,8 +46,8 @@ bool CompareComputeCapability(const std::tuple<int, int> &lhs,
 
 const std::tuple<int, int> CudaPtxInMemory::kMinimumCapability{1, 0};
 
-CudaPtxInMemory::CudaPtxInMemory(port::StringPiece ptx,
-                                 port::StringPiece kernel_name,
+CudaPtxInMemory::CudaPtxInMemory(absl::string_view ptx,
+                                 absl::string_view kernel_name,
                                  bool ptx_compressed)
     : KernelLoaderSpec(kernel_name),
       ptx_by_compute_capability_(CompareComputeCapability) {
@@ -60,12 +61,12 @@ CudaPtxInMemory::CudaPtxInMemory(port::StringPiece ptx,
 
 CudaPtxInMemory::CudaPtxInMemory(
     const std::initializer_list<CudaPtxInMemory::PtxSpec> &spec_list,
-    port::StringPiece kernel_name, bool ptx_compressed)
+    absl::string_view kernel_name, bool ptx_compressed)
     : KernelLoaderSpec(kernel_name),
       ptx_by_compute_capability_(CompareComputeCapability) {
   for (const auto &spec : spec_list) {
     int major, minor;
-    port::StringPiece ptx;
+    absl::string_view ptx;
     std::tie(major, minor, ptx) = spec;
     if (ptx_compressed) {
       // Lazy decompression. Put an empty string in decompressed_ptx_ showing
@@ -155,62 +156,62 @@ const char *CudaPtxInMemory::original_text(int compute_capability_major,
   return ptx_iter->second;
 }
 
-OpenCLTextOnDisk::OpenCLTextOnDisk(port::StringPiece filename,
-                                   port::StringPiece kernelname)
+OpenCLTextOnDisk::OpenCLTextOnDisk(absl::string_view filename,
+                                   absl::string_view kernelname)
     : OnDiskKernelLoaderSpec(filename, kernelname) {}
 
-OpenCLTextInMemory::OpenCLTextInMemory(port::StringPiece text,
-                                       port::StringPiece kernelname)
+OpenCLTextInMemory::OpenCLTextInMemory(absl::string_view text,
+                                       absl::string_view kernelname)
     : KernelLoaderSpec(kernelname), text_(text) {}
 
-OpenCLBinaryOnDisk::OpenCLBinaryOnDisk(port::StringPiece filename,
-                                       port::StringPiece kernelname)
+OpenCLBinaryOnDisk::OpenCLBinaryOnDisk(absl::string_view filename,
+                                       absl::string_view kernelname)
     : OnDiskKernelLoaderSpec(filename, kernelname) {}
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextOnDisk(
-    port::StringPiece filename, port::StringPiece kernelname) {
+    absl::string_view filename, absl::string_view kernelname) {
   CHECK(ocl_text_on_disk_ == nullptr);
   ocl_text_on_disk_.reset(new OpenCLTextOnDisk{filename, kernelname});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLBinaryOnDisk(
-    port::StringPiece filename, port::StringPiece kernelname) {
+    absl::string_view filename, absl::string_view kernelname) {
   CHECK(ocl_binary_on_disk_ == nullptr);
   ocl_binary_on_disk_.reset(new OpenCLBinaryOnDisk{filename, kernelname});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextInMemory(
-    port::StringPiece filename, port::StringPiece kernelname) {
+    absl::string_view filename, absl::string_view kernelname) {
   CHECK(ocl_text_in_memory_ == nullptr);
   ocl_text_in_memory_.reset(new OpenCLTextInMemory{filename, kernelname});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxOnDisk(
-    port::StringPiece filename, port::StringPiece kernelname) {
+    absl::string_view filename, absl::string_view kernelname) {
   CHECK(cuda_ptx_on_disk_ == nullptr);
   cuda_ptx_on_disk_.reset(new CudaPtxOnDisk{filename, kernelname});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinInMemory(
-    const char *bytes, port::StringPiece kernelname) {
+    const char *bytes, absl::string_view kernelname) {
   CHECK(cuda_cubin_in_memory_ == nullptr);
   cuda_cubin_in_memory_.reset(new CudaCubinInMemory{bytes, kernelname});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinOnDisk(
-    port::StringPiece filename, port::StringPiece kernelname) {
+    absl::string_view filename, absl::string_view kernelname) {
   CHECK(cuda_cubin_on_disk_ == nullptr);
   cuda_cubin_on_disk_.reset(new CudaCubinOnDisk{filename, kernelname});
   return this;
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
-    port::StringPiece ptx, port::StringPiece kernelname) {
+    absl::string_view ptx, absl::string_view kernelname) {
   CHECK(cuda_ptx_in_memory_ == nullptr);
   cuda_ptx_in_memory_.reset(
       new CudaPtxInMemory{ptx, kernelname, false /* ptx_compressed */});
@@ -218,7 +219,7 @@ MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
 }
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
-    port::StringPiece ptx, port::StringPiece kernelname) {
+    absl::string_view ptx, absl::string_view kernelname) {
   CHECK(cuda_ptx_in_memory_ == nullptr);
   cuda_ptx_in_memory_.reset(
       new CudaPtxInMemory{ptx, kernelname, true /* ptx_compressed */});
@@ -227,7 +228,7 @@ MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
     std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
-    port::StringPiece kernelname) {
+    absl::string_view kernelname) {
   CHECK(cuda_ptx_in_memory_ == nullptr);
   cuda_ptx_in_memory_.reset(
       new CudaPtxInMemory{spec_list, kernelname, false /* ptx_compressed */});
@@ -236,7 +237,7 @@ MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
 
 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
     std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
-    port::StringPiece kernelname) {
+    absl::string_view kernelname) {
   CHECK(cuda_ptx_in_memory_ == nullptr);
   cuda_ptx_in_memory_.reset(
       new CudaPtxInMemory{spec_list, kernelname, true /* ptx_compressed */});
diff --git a/tensorflow/stream_executor/kernel_spec.h b/tensorflow/stream_executor/kernel_spec.h
index 7cc23bb4e64b45268f6bb00d9ea9ee4a686a0e25..04b2eab084c79bcf51bd58c15329dea4f28c2c8e 100644
--- a/tensorflow/stream_executor/kernel_spec.h
+++ b/tensorflow/stream_executor/kernel_spec.h
@@ -51,7 +51,7 @@ limitations under the License.
 #include <memory>
 #include "tensorflow/stream_executor/platform/port.h"
 
-#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
@@ -76,7 +76,7 @@ class KernelLoaderSpec {
   const string &kernelname() const { return kernelname_; }
 
  protected:
-  explicit KernelLoaderSpec(port::StringPiece kernelname);
+  explicit KernelLoaderSpec(absl::string_view kernelname);
 
  private:
   // The kernel name that should be loaded out of the program description given
@@ -101,8 +101,8 @@ class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
   virtual const char *CanonicalSuffix() const = 0;
 
  protected:
-  OnDiskKernelLoaderSpec(port::StringPiece filename,
-                         port::StringPiece kernelname);
+  OnDiskKernelLoaderSpec(absl::string_view filename,
+                         absl::string_view kernelname);
 
   string filename_;
 
@@ -113,7 +113,7 @@ class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
 // Kernel loader specification for PTX text that resides on disk.
 class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
  public:
-  CudaPtxOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+  CudaPtxOnDisk(absl::string_view filename, absl::string_view kernelname);
   ~CudaPtxOnDisk() override {}
 
   const char *CanonicalSuffix() const override { return ".ptx"; }
@@ -125,7 +125,7 @@ class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
 // Kernel loader specification for CUBIN binary that resides on disk.
 class CudaCubinOnDisk : public OnDiskKernelLoaderSpec {
  public:
-  CudaCubinOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+  CudaCubinOnDisk(absl::string_view filename, absl::string_view kernelname);
   ~CudaCubinOnDisk() override {}
 
   const string &filename() const { return filename_; }
@@ -143,7 +143,7 @@ class CudaPtxInMemory : public KernelLoaderSpec {
  public:
   // Components: compute capability major number, compute capability minor
   // number, and PTX source.
-  typedef std::tuple<int, int, port::StringPiece> PtxSpec;
+  typedef std::tuple<int, int, absl::string_view> PtxSpec;
 
   // Single-PTX constructor. Adds the provided PTX version with an unknown
   // compute capability. Since the CC is unknown, the PTX is assumed to be very
@@ -151,16 +151,16 @@ class CudaPtxInMemory : public KernelLoaderSpec {
   // likely to be used as the default! Note that the PTX can be compressed,
   // which is indicated by the argument ptx_compressed.
   //
-  // Warning: the string backing the provided port::StringPiece ptx must outlive this
-  // instance.
-  CudaPtxInMemory(port::StringPiece ptx, port::StringPiece kernelname,
+  // Warning: the string backing the provided absl::string_view ptx must outlive
+  // this instance.
+  CudaPtxInMemory(absl::string_view ptx, absl::string_view kernelname,
                   bool ptx_compressed = false);
 
   // Multiple-PTX-version constructor. Adds each item in spec_list to this
   // object. Note that the PTX can be compressed, which is indicated by the
   // argument ptx_compressed.
   CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list,
-                  port::StringPiece kernel_name, bool ptx_compressed = false);
+                  absl::string_view kernel_name, bool ptx_compressed = false);
   ~CudaPtxInMemory() override {}
 
   // Add the PTX implementation described by ptx_spec to this object. On
@@ -218,7 +218,7 @@ class CudaPtxInMemory : public KernelLoaderSpec {
 // Kernel loader specification for OpenCL text that resides on disk.
 class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec {
  public:
-  OpenCLTextOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+  OpenCLTextOnDisk(absl::string_view filename, absl::string_view kernelname);
   ~OpenCLTextOnDisk() override {}
 
   const char *CanonicalSuffix() const override { return ".ocl"; }
@@ -230,7 +230,7 @@ class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec {
 // Kernel loader specification for OpenCL binary that resides on disk.
 class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec {
  public:
-  OpenCLBinaryOnDisk(port::StringPiece filename, port::StringPiece kernelname);
+  OpenCLBinaryOnDisk(absl::string_view filename, absl::string_view kernelname);
   ~OpenCLBinaryOnDisk() override {}
 
   const char *CanonicalSuffix() const override { return ".aocx"; }
@@ -242,7 +242,7 @@ class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec {
 // Kernel loader specification for OpenCL text that resides in memory.
 class OpenCLTextInMemory : public KernelLoaderSpec {
  public:
-  OpenCLTextInMemory(port::StringPiece text, port::StringPiece kernelname);
+  OpenCLTextInMemory(absl::string_view text, absl::string_view kernelname);
   ~OpenCLTextInMemory() override {}
 
   // Returns the OpenCL text contents.
@@ -258,7 +258,7 @@ class OpenCLTextInMemory : public KernelLoaderSpec {
 // Kernel loader specification for a CUBIN blob that resides in memory.
 class CudaCubinInMemory : public KernelLoaderSpec {
  public:
-  CudaCubinInMemory(const char *bytes, port::StringPiece kernelname);
+  CudaCubinInMemory(const char *bytes, absl::string_view kernelname);
   ~CudaCubinInMemory() override {}
 
   const char *bytes() const { return bytes_; }
@@ -328,28 +328,28 @@ class MultiKernelLoaderSpec {
   // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
   // name may be mangled by the compiler if it is not declared in an
   // extern "C" scope.
-  MultiKernelLoaderSpec *AddOpenCLTextOnDisk(port::StringPiece filename,
-                                             port::StringPiece kernelname);
-  MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(port::StringPiece filename,
-                                               port::StringPiece kernelname);
-  MultiKernelLoaderSpec *AddOpenCLTextInMemory(port::StringPiece ocl_text,
-                                               port::StringPiece kernelname);
-  MultiKernelLoaderSpec *AddCudaPtxOnDisk(port::StringPiece filename,
-                                          port::StringPiece kernelname);
-  MultiKernelLoaderSpec *AddCudaCubinOnDisk(port::StringPiece filename,
-                                            port::StringPiece kernelname);
+  MultiKernelLoaderSpec *AddOpenCLTextOnDisk(absl::string_view filename,
+                                             absl::string_view kernelname);
+  MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(absl::string_view filename,
+                                               absl::string_view kernelname);
+  MultiKernelLoaderSpec *AddOpenCLTextInMemory(absl::string_view ocl_text,
+                                               absl::string_view kernelname);
+  MultiKernelLoaderSpec *AddCudaPtxOnDisk(absl::string_view filename,
+                                          absl::string_view kernelname);
+  MultiKernelLoaderSpec *AddCudaCubinOnDisk(absl::string_view filename,
+                                            absl::string_view kernelname);
   MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes,
-                                              port::StringPiece kernelname);
-  MultiKernelLoaderSpec *AddCudaPtxInMemory(port::StringPiece ptx,
-                                            port::StringPiece kernelname);
+                                              absl::string_view kernelname);
+  MultiKernelLoaderSpec *AddCudaPtxInMemory(absl::string_view ptx,
+                                            absl::string_view kernelname);
   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
-      port::StringPiece ptx, port::StringPiece kernelname);
+      absl::string_view ptx, absl::string_view kernelname);
   MultiKernelLoaderSpec *AddCudaPtxInMemory(
       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
-      port::StringPiece kernelname);
+      absl::string_view kernelname);
   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
-      port::StringPiece kernelname);
+      absl::string_view kernelname);
 
  private:
   std::unique_ptr<CudaPtxOnDisk>
diff --git a/tensorflow/stream_executor/launch_dim.h b/tensorflow/stream_executor/launch_dim.h
index 68f2f748407a87ec9cf3bdd411bf96c1a64b5681..4a3c882d9f7a034214a062b1e08a5d2b12d26f81 100644
--- a/tensorflow/stream_executor/launch_dim.h
+++ b/tensorflow/stream_executor/launch_dim.h
@@ -37,7 +37,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
-#include "tensorflow/stream_executor/lib/strcat.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 namespace stream_executor {
@@ -57,7 +57,7 @@ struct ThreadDim : public Dim3D {
 
   // Returns a string representation of the thread dimensionality.
   string ToString() const {
-    return port::StrCat("ThreadDim{", x, ", ", y, ", ", z, "}");
+    return absl::StrCat("ThreadDim{", x, ", ", y, ", ", z, "}");
   }
 };
 
@@ -69,7 +69,7 @@ struct BlockDim : public Dim3D {
 
   // Returns a string representation of the block dimensionality.
   string ToString() const {
-    return port::StrCat("BlockDim{", x, ", ", y, ", ", z, "}");
+    return absl::StrCat("BlockDim{", x, ", ", y, ", ", z, "}");
   }
 };
 
diff --git a/tensorflow/stream_executor/lib/casts.h b/tensorflow/stream_executor/lib/casts.h
deleted file mode 100644
index ec562e804fae51ac09e336b2e03b8ab0d7f1ca0e..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/lib/casts.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
-#define TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
-
-#include <stdlib.h>
-
-namespace stream_executor {
-namespace port {
-
-// port::bit_cast<Dest,Source> is a template function that implements the
-// equivalent of "*reinterpret_cast<Dest*>(&source)".  We need this in
-// very low-level functions like the protobuf library and fast math
-// support.
-//
-//   float f = 3.14159265358979;
-//   int i = port::bit_cast<int32>(f);
-//   // i = 0x40490fdb
-//
-// The classical address-casting method is:
-//
-//   // WRONG
-//   float f = 3.14159265358979;            // WRONG
-//   int i = * reinterpret_cast<int*>(&f);  // WRONG
-//
-// The address-casting method actually produces undefined behavior
-// according to ISO C++ specification section 3.10 -15 -.  Roughly, this
-// section says: if an object in memory has one type, and a program
-// accesses it with a different type, then the result is undefined
-// behavior for most values of "different type".
-//
-// This is true for any cast syntax, either *(int*)&f or
-// *reinterpret_cast<int*>(&f).  And it is particularly true for
-// conversions between integral lvalues and floating-point lvalues.
-//
-// The purpose of 3.10 -15- is to allow optimizing compilers to assume
-// that expressions with different types refer to different memory.  gcc
-// 4.0.1 has an optimizer that takes advantage of this.  So a
-// non-conforming program quietly produces wildly incorrect output.
-//
-// The problem is not the use of reinterpret_cast.  The problem is type
-// punning: holding an object in memory of one type and reading its bits
-// back using a different type.
-//
-// The C++ standard is more subtle and complex than this, but that
-// is the basic idea.
-//
-// Anyways ...
-//
-// port::bit_cast<> calls memcpy() which is blessed by the standard,
-// especially by the example in section 3.9 .  Also, of course,
-// port::bit_cast<> wraps up the nasty logic in one place.
-//
-// Fortunately memcpy() is very fast.  In optimized mode, with a
-// constant size, gcc 2.95.3, gcc 4.0.1, and msvc 7.1 produce inline
-// code with the minimal amount of data movement.  On a 32-bit system,
-// memcpy(d,s,4) compiles to one load and one store, and memcpy(d,s,8)
-// compiles to two loads and two stores.
-//
-// I tested this code with gcc 2.95.3, gcc 4.0.1, icc 8.1, and msvc 7.1.
-//
-// WARNING: if Dest or Source is a non-POD type, the result of the memcpy
-// is likely to surprise you.
-//
-// Props to Bill Gibbons for the compile time assertion technique and
-// Art Komninos and Igor Tandetnik for the msvc experiments.
-//
-// -- mec 2005-10-17
-
-template <class Dest, class Source>
-inline Dest bit_cast(const Source& source) {
-  // Compile time assertion: sizeof(Dest) == sizeof(Source)
-  // A compile error here means your Dest and Source have different sizes.
-  static_assert(sizeof(Dest) == sizeof(Source),
-                "src and dst types must have equal sizes");
-
-  Dest dest;
-  memcpy(&dest, &source, sizeof(dest));
-  return dest;
-}
-
-}  // namespace port
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_CASTS_H_
diff --git a/tensorflow/stream_executor/lib/env.h b/tensorflow/stream_executor/lib/env.h
index d78bbfd425925f9826c69621373b46b9fd4b46fc..a5eb8ef1d433becedc07c49ce05d05731f831332 100644
--- a/tensorflow/stream_executor/lib/env.h
+++ b/tensorflow/stream_executor/lib/env.h
@@ -16,9 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_ENV_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 namespace stream_executor {
@@ -31,7 +31,7 @@ inline Status FileExists(const string& filename) {
   return Env::Default()->FileExists(filename);
 }
 
-inline Status FileExists(const port::StringPiece& filename) {
+inline Status FileExists(const absl::string_view& filename) {
   return Env::Default()->FileExists(string(filename));
 }
 
diff --git a/tensorflow/stream_executor/lib/inlined_vector.h b/tensorflow/stream_executor/lib/inlined_vector.h
deleted file mode 100644
index 0198947e5badf9a3aa317027b7c6159cf6e282f2..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/lib/inlined_vector.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_INLINED_VECTOR_H_
-#define TENSORFLOW_STREAM_EXECUTOR_LIB_INLINED_VECTOR_H_
-
-#include "absl/container/inlined_vector.h"
-
-namespace stream_executor {
-namespace port {
-
-using absl::InlinedVector;
-
-}  // namespace port
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_INLINED_VECTOR_H_
diff --git a/tensorflow/stream_executor/lib/path.cc b/tensorflow/stream_executor/lib/path.cc
index 3d3da103e1e75b04a6502370272d54a36698b180..47eedbc6a163afd564f59b2d1f3626ae7b35b362 100644
--- a/tensorflow/stream_executor/lib/path.cc
+++ b/tensorflow/stream_executor/lib/path.cc
@@ -14,22 +14,23 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/stream_executor/lib/path.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 
 namespace stream_executor {
 namespace port {
 namespace internal {
 
-static bool IsAbsolutePath(port::StringPiece path) {
+static bool IsAbsolutePath(absl::string_view path) {
   return !path.empty() && path[0] == '/';
 }
 
 // For an array of paths of length count, append them all together,
 // ensuring that the proper path separators are inserted between them.
-string JoinPathImpl(std::initializer_list<port::StringPiece> paths) {
+string JoinPathImpl(std::initializer_list<absl::string_view> paths) {
   string result;
 
-  for (port::StringPiece path : paths) {
+  for (absl::string_view path : paths) {
     if (path.empty()) continue;
 
     if (result.empty()) {
@@ -39,15 +40,15 @@ string JoinPathImpl(std::initializer_list<port::StringPiece> paths) {
 
     if (result[result.size() - 1] == '/') {
       if (IsAbsolutePath(path)) {
-        StrAppend(&result, path.substr(1));
+        absl::StrAppend(&result, path.substr(1));
       } else {
-        StrAppend(&result, path);
+        absl::StrAppend(&result, path);
       }
     } else {
       if (IsAbsolutePath(path)) {
-        StrAppend(&result, path);
+        absl::StrAppend(&result, path);
       } else {
-        StrAppend(&result, "/", path);
+        absl::StrAppend(&result, "/", path);
       }
     }
   }
diff --git a/tensorflow/stream_executor/lib/path.h b/tensorflow/stream_executor/lib/path.h
index 325f04ff47552e052d81c96bec74e816378b8254..76a623cc033ac258bfc95c654db50a28ea55fffc 100644
--- a/tensorflow/stream_executor/lib/path.h
+++ b/tensorflow/stream_executor/lib/path.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_PATH_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_PATH_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
 namespace stream_executor {
@@ -28,7 +28,7 @@ using tensorflow::io::Dirname;
 namespace internal {
 // TODO(rspringer): Move to cc/implementation file.
 // Not part of the public API.
-string JoinPathImpl(std::initializer_list<port::StringPiece> paths);
+string JoinPathImpl(std::initializer_list<absl::string_view> paths);
 }  // namespace internal
 
 // Join multiple paths together.
@@ -44,7 +44,7 @@ string JoinPathImpl(std::initializer_list<port::StringPiece> paths);
 // All paths will be treated as relative paths, regardless of whether or not
 // they start with a leading '/'.  That is, all paths will be concatenated
 // together, with the appropriate path separator inserted in between.
-// Arguments must be convertible to port::StringPiece.
+// Arguments must be convertible to absl::string_view.
 //
 // Usage:
 // string path = file::JoinPath("/var/log", dirname, filename);
diff --git a/tensorflow/stream_executor/lib/status.h b/tensorflow/stream_executor/lib/status.h
index 407b71b405bc8a73e5aebcd18b043420b074b708..87269b4591a864b6b65fc303f046b231ceda8b28 100644
--- a/tensorflow/stream_executor/lib/status.h
+++ b/tensorflow/stream_executor/lib/status.h
@@ -18,9 +18,9 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUS_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/stream_executor/lib/error.h"  // IWYU pragma: export
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 
 namespace stream_executor {
@@ -33,13 +33,13 @@ using Status = tensorflow::Status;
   ASSERT_EQ(::stream_executor::port::Status::OK(), (val))
 
 // Define some canonical error helpers.
-inline Status UnimplementedError(StringPiece message) {
+inline Status UnimplementedError(absl::string_view message) {
   return Status(error::UNIMPLEMENTED, message);
 }
-inline Status InternalError(StringPiece message) {
+inline Status InternalError(absl::string_view message) {
   return Status(error::INTERNAL, message);
 }
-inline Status FailedPreconditionError(StringPiece message) {
+inline Status FailedPreconditionError(absl::string_view message) {
   return Status(error::FAILED_PRECONDITION, message);
 }
 
diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h
index e77dfcef768a38030a5bcaea9aab77583b83006d..e99dfa8399d95a5c43e6587af4e7ecf3d4333420 100644
--- a/tensorflow/stream_executor/lib/str_util.h
+++ b/tensorflow/stream_executor/lib/str_util.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STR_UTIL_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 
 namespace stream_executor {
 namespace port {
@@ -27,7 +27,8 @@ using tensorflow::str_util::Split;
 
 // Returns a copy of the input string 'str' with the given 'suffix'
 // removed. If the suffix doesn't match, returns a copy of the original string.
-inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix) {
+inline string StripSuffixString(absl::string_view str,
+                                absl::string_view suffix) {
   if (tensorflow::str_util::EndsWith(str, suffix)) {
     str.remove_suffix(suffix.size());
   }
diff --git a/tensorflow/stream_executor/lib/strcat.h b/tensorflow/stream_executor/lib/strcat.h
deleted file mode 100644
index 3688d7b4eba00c31d3b170da7a261ece6bc7ef45..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/lib/strcat.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// IWYU pragma: private, include "third_party/tensorflow/stream_executor/stream_executor.h"
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
-#define TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
-
-#include "absl/strings/str_cat.h"
-
-namespace stream_executor {
-namespace port {
-
-using absl::StrAppend;
-using absl::StrCat;
-
-}  // namespace port
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRCAT_H_
diff --git a/tensorflow/stream_executor/lib/stringpiece.h b/tensorflow/stream_executor/lib/stringpiece.h
deleted file mode 100644
index 76249101298588a02ff19bf2dd2c0c73ff9ac46e..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/lib/stringpiece.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPIECE_H_
-#define TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPIECE_H_
-
-#include "absl/strings/string_view.h"
-
-namespace stream_executor {
-namespace port {
-
-using StringPiece = absl::string_view;
-
-}  // namespace port
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_STRINGPIECE_H_
diff --git a/tensorflow/stream_executor/module_spec.h b/tensorflow/stream_executor/module_spec.h
index 75bdfed2d70364da4b191804d1e0973ee2658b70..e8a970283c55bb08858e3bd2dec8d423d914e7fc 100644
--- a/tensorflow/stream_executor/module_spec.h
+++ b/tensorflow/stream_executor/module_spec.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
 
 #include "tensorflow/stream_executor/lib/array_slice.h"
-#include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 
diff --git a/tensorflow/stream_executor/multi_platform_manager.cc b/tensorflow/stream_executor/multi_platform_manager.cc
index 5b51398d8cab5df7c7514bc3bedf87f5c33c6e5a..bbb56071f49c74973f360040db7d126ffe346075 100644
--- a/tensorflow/stream_executor/multi_platform_manager.cc
+++ b/tensorflow/stream_executor/multi_platform_manager.cc
@@ -15,62 +15,86 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/str_util.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
+namespace {
 
-/* static */ mutex MultiPlatformManager::platforms_mutex_{LINKER_INITIALIZED};
+class MultiPlatformManagerImpl {
+ public:
+  port::Status RegisterPlatform(std::unique_ptr<Platform> platform)
+      LOCKS_EXCLUDED(mu_);
 
-/* static */ port::StatusOr<Platform*> MultiPlatformManager::LookupByNameLocked(
-    const string& target) {
-  PlatformMap* platform_map = GetPlatformMap();
-  auto it = platform_map->find(port::Lowercase(target));
-  if (it == platform_map->end()) {
-    return port::Status(
-        port::error::NOT_FOUND,
-        "could not find registered platform with name: \"" + target + "\"");
-  }
-  return it->second;
-}
+  port::StatusOr<Platform*> PlatformWithName(absl::string_view target)
+      LOCKS_EXCLUDED(mu_);
 
-/* static */ port::StatusOr<Platform*> MultiPlatformManager::LookupByIdLocked(
-    const Platform::Id& id) {
-  PlatformIdMap* platform_map = GetPlatformByIdMap();
-  auto it = platform_map->find(id);
-  if (it == platform_map->end()) {
-    return port::Status(
-        port::error::NOT_FOUND,
-        port::Printf("could not find registered platform with id: 0x%p", id));
-  }
-  return it->second;
-}
+  port::StatusOr<Platform*> PlatformWithId(const Platform::Id& id)
+      LOCKS_EXCLUDED(mu_);
+
+  port::StatusOr<Platform*> InitializePlatformWithName(
+      absl::string_view target, const std::map<string, string>& options)
+      LOCKS_EXCLUDED(mu_);
+  port::StatusOr<Platform*> InitializePlatformWithId(
+      const Platform::Id& id, const std::map<string, string>& options)
+      LOCKS_EXCLUDED(mu_);
+
+  std::vector<Platform*> AllPlatforms() LOCKS_EXCLUDED(mu_);
+
+  using Listener = MultiPlatformManager::Listener;
+  port::Status RegisterListener(std::unique_ptr<Listener> listener)
+      LOCKS_EXCLUDED(mu_);
+
+ private:
+  // Looks up the platform object with the given name.  Assumes the Platforms
+  // mutex is held.
+  port::StatusOr<Platform*> LookupByNameLocked(absl::string_view target)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Looks up the platform object with the given id.  Assumes the Platforms
+  // mutex is held.
+  port::StatusOr<Platform*> LookupByIdLocked(const Platform::Id& id)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-/* static */ port::Status MultiPlatformManager::RegisterPlatform(
+  absl::Mutex mu_;
+  std::vector<std::unique_ptr<Listener>> listeners_ GUARDED_BY(mu_);
+  absl::flat_hash_map<Platform::Id, Platform*> id_map_ GUARDED_BY(mu_);
+  absl::flat_hash_map<string, Platform*> name_map_ GUARDED_BY(mu_);
+};
+
+port::Status MultiPlatformManagerImpl::RegisterPlatform(
     std::unique_ptr<Platform> platform) {
   CHECK(platform != nullptr);
   string key = port::Lowercase(platform->Name());
-  mutex_lock lock(platforms_mutex_);
-  if (GetPlatformMap()->find(key) != GetPlatformMap()->end()) {
+  absl::MutexLock lock(&mu_);
+  if (name_map_.find(key) != name_map_.end()) {
     return port::Status(port::error::INTERNAL,
                         "platform is already registered with name: \"" +
                             platform->Name() + "\"");
   }
-  GetPlatformByIdMap()->insert(std::make_pair(platform->id(), platform.get()));
+  Platform* platform_ptr = platform.get();
+  CHECK(id_map_.emplace(platform->id(), platform_ptr).second);
   // Release ownership/uniqueness to prevent destruction on program exit.
   // This avoids Platforms "cleaning up" on program exit, because otherwise,
   // there are _very_ tricky races between StreamExecutor and underlying
   // platforms (CUDA, OpenCL) during exit. Since these are fixed-size and 1x per
   // program, these are deemed acceptable.
-  (*GetPlatformMap())[key] = platform.release();
+  name_map_[key] = platform.release();
+  for (const auto& listener : listeners_) {
+    listener->PlatformRegistered(platform_ptr);
+  }
   return port::Status::OK();
 }
 
-/* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName(
-    const string& target) {
-  mutex_lock lock(platforms_mutex_);
+port::StatusOr<Platform*> MultiPlatformManagerImpl::PlatformWithName(
+    absl::string_view target) {
+  absl::MutexLock lock(&mu_);
 
   SE_ASSIGN_OR_RETURN(Platform * platform, LookupByNameLocked(target));
   if (!platform->Initialized()) {
@@ -80,9 +104,9 @@ namespace stream_executor {
   return platform;
 }
 
-/* static */ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId(
+port::StatusOr<Platform*> MultiPlatformManagerImpl::PlatformWithId(
     const Platform::Id& id) {
-  mutex_lock lock(platforms_mutex_);
+  absl::MutexLock lock(&mu_);
 
   SE_ASSIGN_OR_RETURN(Platform * platform, LookupByIdLocked(id));
   if (!platform->Initialized()) {
@@ -92,15 +116,15 @@ namespace stream_executor {
   return platform;
 }
 
-/* static */ port::StatusOr<Platform*>
-MultiPlatformManager::InitializePlatformWithName(
-    const string& target, const std::map<string, string>& options) {
-  mutex_lock lock(platforms_mutex_);
+port::StatusOr<Platform*> MultiPlatformManagerImpl::InitializePlatformWithName(
+    absl::string_view target, const std::map<string, string>& options) {
+  absl::MutexLock lock(&mu_);
 
   SE_ASSIGN_OR_RETURN(Platform * platform, LookupByNameLocked(target));
   if (platform->Initialized()) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "platform \"" + target + "\" is already initialized");
+    return port::Status(
+        port::error::FAILED_PRECONDITION,
+        absl::StrCat("platform \"", target, "\" is already initialized"));
   }
 
   SE_RETURN_IF_ERROR(platform->Initialize(options));
@@ -108,10 +132,9 @@ MultiPlatformManager::InitializePlatformWithName(
   return platform;
 }
 
-/* static */ port::StatusOr<Platform*>
-MultiPlatformManager::InitializePlatformWithId(
+port::StatusOr<Platform*> MultiPlatformManagerImpl::InitializePlatformWithId(
     const Platform::Id& id, const std::map<string, string>& options) {
-  mutex_lock lock(platforms_mutex_);
+  absl::MutexLock lock(&mu_);
 
   SE_ASSIGN_OR_RETURN(Platform * platform, LookupByIdLocked(id));
   if (platform->Initialized()) {
@@ -125,10 +148,90 @@ MultiPlatformManager::InitializePlatformWithId(
   return platform;
 }
 
-/* static */ void MultiPlatformManager::ClearPlatformRegistry() {
-  mutex_lock lock(platforms_mutex_);
-  GetPlatformMap()->clear();
-  GetPlatformByIdMap()->clear();
+port::Status MultiPlatformManagerImpl::RegisterListener(
+    std::unique_ptr<Listener> listener) {
+  absl::MutexLock lock(&mu_);
+  CHECK(id_map_.empty());
+  CHECK(name_map_.empty());
+  listeners_.push_back(std::move(listener));
+  return port::Status::OK();
+}
+
+std::vector<Platform*> MultiPlatformManagerImpl::AllPlatforms() {
+  absl::MutexLock lock(&mu_);
+  CHECK_EQ(id_map_.size(), name_map_.size());
+  std::vector<Platform*> platforms;
+  platforms.reserve(id_map_.size());
+  for (const auto& entry : id_map_) {
+    platforms.push_back(entry.second);
+  }
+  return platforms;
+}
+
+port::StatusOr<Platform*> MultiPlatformManagerImpl::LookupByNameLocked(
+    absl::string_view target) {
+  auto it = name_map_.find(port::Lowercase(target));
+  if (it == name_map_.end()) {
+    return port::Status(
+        port::error::NOT_FOUND,
+        absl::StrCat("Could not find registered platform with name: \"", target,
+                     "\""));
+  }
+  return it->second;
+}
+
+port::StatusOr<Platform*> MultiPlatformManagerImpl::LookupByIdLocked(
+    const Platform::Id& id) {
+  auto it = id_map_.find(id);
+  if (it == id_map_.end()) {
+    return port::Status(
+        port::error::NOT_FOUND,
+        port::Printf("could not find registered platform with id: 0x%p", id));
+  }
+  return it->second;
+}
+
+MultiPlatformManagerImpl& Impl() {
+  static MultiPlatformManagerImpl* impl = new MultiPlatformManagerImpl;
+  return *impl;
+}
+
+}  // namespace
+
+/*static*/ port::Status MultiPlatformManager::RegisterPlatform(
+    std::unique_ptr<Platform> platform) {
+  return Impl().RegisterPlatform(std::move(platform));
+}
+
+/*static*/ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithName(
+    absl::string_view target) {
+  return Impl().PlatformWithName(target);
+}
+
+/*static*/ port::StatusOr<Platform*> MultiPlatformManager::PlatformWithId(
+    const Platform::Id& id) {
+  return Impl().PlatformWithId(id);
+}
+
+/*static*/ port::StatusOr<Platform*>
+MultiPlatformManager::InitializePlatformWithName(
+    absl::string_view target, const std::map<string, string>& options) {
+  return Impl().InitializePlatformWithName(target, options);
+}
+
+/*static*/ port::StatusOr<Platform*>
+MultiPlatformManager::InitializePlatformWithId(
+    const Platform::Id& id, const std::map<string, string>& options) {
+  return Impl().InitializePlatformWithId(id, options);
+}
+
+/*static*/ port::Status MultiPlatformManager::RegisterListener(
+    std::unique_ptr<Listener> listener) {
+  return Impl().RegisterListener(std::move(listener));
+}
+
+/*static*/ std::vector<Platform*> MultiPlatformManager::AllPlatforms() {
+  return Impl().AllPlatforms();
 }
 
 }  // namespace stream_executor
@@ -141,3 +244,15 @@ REGISTER_MODULE_INITIALIZER(
         // purposes from Platform subclasses that register
         // themselves with the MultiPlatformManager.
     });
+
+REGISTER_MODULE_INITIALIZER(
+    multi_platform_manager_listener,
+    {
+        // Nothing -- this is just a module initializer definition to reference
+        // for sequencing registration of listeners with the
+        // MultiPlatformManager.
+    });
+
+// Listener registration should happen before platform registration.
+REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+                                     multi_platform_manager);
diff --git a/tensorflow/stream_executor/multi_platform_manager.h b/tensorflow/stream_executor/multi_platform_manager.h
index 146a128e85cfe84a844aae0fd50d5a329df2723c..06f5ae2c2baaee0444697d4096da7bf36e5c217d 100644
--- a/tensorflow/stream_executor/multi_platform_manager.h
+++ b/tensorflow/stream_executor/multi_platform_manager.h
@@ -67,14 +67,14 @@ limitations under the License.
 #include <functional>
 #include <map>
 #include <memory>
+#include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
 #include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
 
 namespace stream_executor {
 
@@ -84,9 +84,8 @@ class MultiPlatformManager {
   // Registers a platform object, returns an error status if the platform is
   // already registered. The associated listener, if not null, will be used to
   // trace events for ALL executors for that platform.
-  // Takes ownership of listener.
-  static port::Status RegisterPlatform(std::unique_ptr<Platform> platform)
-      LOCKS_EXCLUDED(platforms_mutex_);
+  // Takes ownership of platform.
+  static port::Status RegisterPlatform(std::unique_ptr<Platform> platform);
 
   // Retrieves the platform registered with the given platform name (e.g.
   // "CUDA", "OpenCL", ...) or id (an opaque, comparable value provided by the
@@ -98,10 +97,8 @@ class MultiPlatformManager {
   // If the requested platform is not registered, an error status is returned.
   // Ownership of the platform is NOT transferred to the caller --
   // the MultiPlatformManager owns the platforms in a singleton-like fashion.
-  static port::StatusOr<Platform*> PlatformWithName(const string& target)
-      LOCKS_EXCLUDED(platforms_mutex_);
-  static port::StatusOr<Platform*> PlatformWithId(const Platform::Id& id)
-      LOCKS_EXCLUDED(platforms_mutex_);
+  static port::StatusOr<Platform*> PlatformWithName(absl::string_view target);
+  static port::StatusOr<Platform*> PlatformWithId(const Platform::Id& id);
 
   // Retrieves the platform registered with the given platform name (e.g.
   // "CUDA", "OpenCL", ...) or id (an opaque, comparable value provided by the
@@ -114,14 +111,12 @@ class MultiPlatformManager {
   // Ownership of the platform is NOT transferred to the caller --
   // the MultiPlatformManager owns the platforms in a singleton-like fashion.
   static port::StatusOr<Platform*> InitializePlatformWithName(
-      const string& target, const std::map<string, string>& options)
-      LOCKS_EXCLUDED(platforms_mutex_);
+      absl::string_view target, const std::map<string, string>& options);
+
   static port::StatusOr<Platform*> InitializePlatformWithId(
-      const Platform::Id& id, const std::map<string, string>& options)
-      LOCKS_EXCLUDED(platforms_mutex_);
+      const Platform::Id& id, const std::map<string, string>& options);
 
-  // Clears the set of registered platforms, primarily used for testing.
-  static void ClearPlatformRegistry() LOCKS_EXCLUDED(platforms_mutex_);
+  static std::vector<Platform*> AllPlatforms();
 
   // Although the MultiPlatformManager "owns" its platforms, it holds them as
   // undecorated pointers to prevent races during program exit (between this
@@ -135,57 +130,32 @@ class MultiPlatformManager {
   // of any platforms registered with it, and leak checking should be disabled
   // during allocation of such Platforms, to avoid spurious reporting at program
   // exit.
-  using PlatformMap = std::map<string, Platform*>;
-
-  // Provides access to the available set of platforms under a lock.
-  static port::Status WithPlatforms(
-      std::function<port::Status(PlatformMap*)> callback)
-      LOCKS_EXCLUDED(platforms_mutex_) {
-    mutex_lock lock(platforms_mutex_);
-    return callback(GetPlatformMap());
-  }
-
- private:
-  using PlatformIdMap = std::map<Platform::Id, Platform*>;
-
-  static mutex platforms_mutex_;
-
-  // TODO(b/22689637): Clean up these two maps; make sure they coexist nicely.
-  // TODO(b/22689637): Move this (whatever the final/"official" map is) to
-  // plugin_regstry.h, along with the associated functionality.
-  // Platform-name-to-object mapping. These platforms are registered via module
-  // initializers, and linkage determines which platforms are available to a
-  // given target.
-  static PlatformMap* GetPlatformMap() {
-    static PlatformMap* instance = new PlatformMap;
-    return instance;
-  }
-
-  // Holds a Platform::Id-to-object mapping.
-  // Unlike platforms_ above, this map does not own its contents.
-  static PlatformIdMap* GetPlatformByIdMap() {
-    static PlatformIdMap* instance = new PlatformIdMap;
-    return instance;
-  }
-
-  // Looks up the platform object with the given name.  Assumes the Platforms
-  // mutex is held.
-  static port::StatusOr<Platform*> LookupByNameLocked(const string& target)
-      EXCLUSIVE_LOCKS_REQUIRED(platforms_mutex_);
-
-  // Looks up the platform object with the given id.  Assumes the Platforms
-  // mutex is held.
-  static port::StatusOr<Platform*> LookupByIdLocked(const Platform::Id& id)
-      EXCLUSIVE_LOCKS_REQUIRED(platforms_mutex_);
-
-  SE_DISALLOW_COPY_AND_ASSIGN(MultiPlatformManager);
+
+  // Interface for a listener that gets notfied at certain events.
+  class Listener {
+   public:
+    virtual ~Listener() = default;
+    // Callback that is invoked when a Platform is registered.
+    virtual void PlatformRegistered(Platform* platform) = 0;
+  };
+  // Registers a listeners to receive notifications about certain events.
+  // Precondition: No Platform has been registered yet.
+  static port::Status RegisterListener(std::unique_ptr<Listener> listener);
 };
 
 }  // namespace stream_executor
 
-// multi_platform_manager.cc will define this instance. Includers of this header
-// should use
+// multi_platform_manager.cc will define these instances.
+//
+// Registering a platform:
 // REGISTER_MODULE_INITIALIZER_SEQUENCE(my_platform, multi_platform_manager);
+// REGISTER_MODULE_INITIALIZER_SEQUENCE(multi_platform_manager_listener,
+// my_platform);
+//
+// Registering a listener:
+// REGISTER_MODULE_INITIALIZER_SEQUENCE(my_listener,
+// multi_platform_manager_listener);
 DECLARE_MODULE_INITIALIZER(multi_platform_manager);
+DECLARE_MODULE_INITIALIZER(multi_platform_manager_listener);
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_MULTI_PLATFORM_MANAGER_H_
diff --git a/tensorflow/stream_executor/platform.cc b/tensorflow/stream_executor/platform.cc
index 777abced8634410440fd5c2772a3b981a1489dd6..c0205abbee305edc23e24d79c53f9ed3b84049b5 100644
--- a/tensorflow/stream_executor/platform.cc
+++ b/tensorflow/stream_executor/platform.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
@@ -35,7 +35,7 @@ string PlatformKindString(PlatformKind kind) {
     case PlatformKind::kMock:
       return "Mock";
     default:
-      return port::StrCat("InvalidPlatformKind(", static_cast<int>(kind), ")");
+      return absl::StrCat("InvalidPlatformKind(", static_cast<int>(kind), ")");
   }
 }
 
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 69558fd14b12e9f011cf9997cde67275558040a5..3edc66cde8045d7f6ae53095e8136d1697fb1d23 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -17,12 +17,12 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
+#include "absl/strings/str_cat.h"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/host_buffer.h"
 #include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/rng.h"
@@ -119,20 +119,20 @@ string ToVlogString(const DeviceMemoryBase *memory) {
 }
 
 string ToVlogString(const Eigen::half &h) {
-  return port::StrCat(static_cast<float>(h));
+  return absl::StrCat(static_cast<float>(h));
 }
 
-string ToVlogString(int i) { return port::StrCat(i); }
+string ToVlogString(int i) { return absl::StrCat(i); }
 
-string ToVlogString(uint32 i) { return port::StrCat(i); }
+string ToVlogString(uint32 i) { return absl::StrCat(i); }
 
-string ToVlogString(uint64 i) { return port::StrCat(i); }
+string ToVlogString(uint64 i) { return absl::StrCat(i); }
 
-string ToVlogString(int64 i) { return port::StrCat(i); }
+string ToVlogString(int64 i) { return absl::StrCat(i); }
 
-string ToVlogString(float f) { return port::StrCat(f); }
+string ToVlogString(float f) { return absl::StrCat(f); }
 
-string ToVlogString(double d) { return port::StrCat(d); }
+string ToVlogString(double d) { return absl::StrCat(d); }
 
 template <typename T>
 string ToVlogString(const HostOrDeviceScalar<T> &memory_or_constant) {
@@ -144,7 +144,7 @@ string ToVlogString(const HostOrDeviceScalar<T> &memory_or_constant) {
 
 template <class T>
 string ToVlogString(port::ArraySlice<T> elements) {
-  string str = port::StrCat(
+  string str = absl::StrCat(
       ToVlogString(reinterpret_cast<const void *>(elements.data())), "[",
       elements.size(), "]{");
   const char *separator = "";
@@ -161,7 +161,7 @@ string ToVlogString(port::ArraySlice<T> elements) {
       str += ", ...";
       break;
     }
-    port::StrAppend(&str, separator, ToVlogString(elements[i]));
+    absl::StrAppend(&str, separator, ToVlogString(elements[i]));
     separator = ", ";
   }
   str += "}";
@@ -191,8 +191,11 @@ string ToVlogString(dnn::DataType data_type) {
       return "dnn::DataType::kHalf";
     case dnn::DataType::kInt8:
       return "dnn::DataType::kInt8";
+    case dnn::DataType::kInt32:
+      return "dnn::DataType::kInt32";
+    default:
+      return "unknown DataType";
   }
-  return "unknown DataType";
 }
 
 // Used together with PARAM to VLOG calls made to the stream. Intended
@@ -211,16 +214,16 @@ string CallStr(const char *function_name, Stream *stream,
   // constructing all the strings in params is expensive.
   CHECK(VLOG_IS_ON(1));
 
-  string str = port::StrCat(stream->DebugStreamPointers(),
+  string str = absl::StrCat(stream->DebugStreamPointers(),
                             " Called Stream::", function_name, "(");
   const char *separator = "";
   for (const auto &param : params) {
-    port::StrAppend(&str, separator, param.first, "=", param.second);
+    absl::StrAppend(&str, separator, param.first, "=", param.second);
     separator = ", ";
   }
-  port::StrAppend(&str, ")");
+  absl::StrAppend(&str, ")");
   if (VLOG_IS_ON(10)) {
-    port::StrAppend(&str, " ", port::CurrentStackTrace(), "\n");
+    absl::StrAppend(&str, " ", port::CurrentStackTrace(), "\n");
   }
   return str;
 }
@@ -5087,15 +5090,17 @@ Stream &Stream::ThenRnnForward(
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoRnnForward(
+      auto status = dnn->DoRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           is_training, reserve_space_allocator, workspace_allocator,
-          output_profile_result));
+          output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
     } else {
-      SetError();
-      LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -5121,15 +5126,17 @@ Stream &Stream::ThenRnnForward(
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoRnnForward(
+      auto status = dnn->DoRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           is_training, reserve_space_allocator, workspace_allocator,
-          output_profile_result));
+          output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
     } else {
-      SetError();
-      LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -5156,15 +5163,17 @@ Stream &Stream::ThenRnnForward(
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoRnnForward(
+      auto status = dnn->DoRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           is_training, reserve_space_allocator, workspace_allocator,
-          output_profile_result));
+          output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
     } else {
-      SetError();
-      LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
+      SetErrorAndLogNoDnnSupport();
     }
   }
   return *this;
@@ -5198,14 +5207,17 @@ Stream &Stream::ThenRnnBackward(
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoRnnBackward(
+      auto status = dnn->DoRnnBackward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
           params_backprop_data, reserve_space_data, workspace_allocator,
-          output_profile_result));
+          output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -5241,14 +5253,17 @@ Stream &Stream::ThenRnnBackward(
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoRnnBackward(
+      auto status = dnn->DoRnnBackward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
           params_backprop_data, reserve_space_data, workspace_allocator,
-          output_profile_result));
+          output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -5285,14 +5300,17 @@ Stream &Stream::ThenRnnBackward(
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoRnnBackward(
+      auto status = dnn->DoRnnBackward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
           params_backprop_data, reserve_space_data, workspace_allocator,
-          output_profile_result));
+          output_profile_result);
+      if (!status && !output_profile_result) {
+        SetError();
+      }
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -5485,7 +5503,7 @@ port::Status Stream::BlockHostUntilDone() {
 
 string Stream::DebugStreamPointers() const {
   // Relies on the ToVlogString(const void*) overload above.
-  return port::StrCat("[stream=", ToVlogString(this),
+  return absl::StrCat("[stream=", ToVlogString(this),
                       ",impl=", ToVlogString(implementation_.get()), "]");
 }
 
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index e1629b5b3084e6641bcdf80d1de00f33f1c81940..0fc90cf83d6b4e3e0ede84747f8149c1a25289ca 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -2033,9 +2033,20 @@ class Stream {
   // transferred to the caller.
   internal::StreamInterface *implementation() { return implementation_.get(); }
 
+  // Entrains onto the stream a callback to the host (from the device).
+  // Behaves as ThenDoHostCallbackWithStatus below, but the callback should
+  // never fail or its failure is inconsequential.
+  //
+  // This is kept for backward compatibility. Future code should use
+  // ThenDoHostCallbackWithStatus and explicitly return a success status.
+  // TODO(b/112125301): Eventually remove this method.
+  Stream &ThenDoHostCallback(std::function<void()> callback);
+
   // Entrains onto the stream a callback to the host (from the device).
   // Host callbacks block/occupy the stream just as device functions
   // (execute one at a time, block later stream operations).
+  // Whether the callback return status affects the result of BlockHostUntilDone
+  // is platform-dependent.
   //
   // Behavior is undefined when synchronizing using OpenCL user events.
   // Behavior is undefined if host callbacks call device routines or insert
@@ -2043,11 +2054,6 @@ class Stream {
   //
   // On certain platforms, ThenDoHostCallback is expected to have significant
   // negative effects on performance.
-  Stream &ThenDoHostCallback(std::function<void()> callback);
-
-  // Entrains onto the stream a callback to the host (from the device).
-  // Behaves as ThenDoHostCallback above, but returns a Status instead of void.
-  // This overload should be preferred if the callback could fail.
   Stream &ThenDoHostCallbackWithStatus(std::function<port::Status()> callback);
 
   // Returns the StreamExecutor (parent object) associated with this stream.
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 7df6a361c6810b9a15c97f15704435d145dccb8e..341c6edccd3c1bfd314127c5356f03a15a85e1d3 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -36,16 +36,15 @@ StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
 
 StreamExecutorFactory MakeHostExecutorImplementation;
 
-// TODO(b/112125301): Consolodate this down to one implementation of
-// HostCallback, taking a callback that returns a Status.
-bool StreamExecutorInterface::HostCallback(
-    Stream* stream, std::function<port::Status()> callback) {
-  return HostCallback(stream, [callback]() {
-    port::Status s = callback();
-    if (!s.ok()) {
-      LOG(WARNING) << "HostCallback failed: " << s;
-    }
-  });
+// The default implementation just calls the other HostCallback method.
+// It should make all existing code that uses a void() callback still work.
+bool StreamExecutorInterface::HostCallback(Stream* stream,
+                                           std::function<void()> callback) {
+  return HostCallback(
+      stream, std::function<port::Status()>([callback]() -> port::Status {
+        callback();
+        return port::Status::OK();
+      }));
 }
 
 }  // namespace internal
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 59a477b5c9c37f10d8f12645deb3cdb832a8d544..0c2c33cfca227b2d67fcdc633dd94274a65b92bb 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -36,7 +36,6 @@ limitations under the License.
 #include "tensorflow/stream_executor/kernel_cache_config.h"
 #include "tensorflow/stream_executor/kernel_spec.h"
 #include "tensorflow/stream_executor/launch_dim.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/module_spec.h"
@@ -238,9 +237,9 @@ class StreamExecutorInterface {
   virtual bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
                                     const DeviceMemoryBase &gpu_src,
                                     uint64 size) = 0;
-  virtual bool HostCallback(Stream *stream, std::function<void()> callback) = 0;
+  virtual bool HostCallback(Stream *stream, std::function<void()> callback);
   virtual bool HostCallback(Stream *stream,
-                            std::function<port::Status()> callback);
+                            std::function<port::Status()> callback) = 0;
   virtual port::Status AllocateEvent(Event *event) = 0;
   virtual port::Status DeallocateEvent(Event *event) = 0;
   virtual port::Status RecordEvent(Stream *stream, Event *event) = 0;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 10bf006787662fd662361b23da8576d2f875ad79..d1d0bd9bc21e0cdf6d5bb3dc4fc58bc42b30378f 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <atomic>
 #include <utility>
 
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/blas.h"
 #include "tensorflow/stream_executor/fft.h"
@@ -45,7 +46,7 @@ namespace {
 
 string StackTraceIfVLOG10() {
   if (VLOG_IS_ON(10)) {
-    return port::StrCat(" ", port::CurrentStackTrace(), "\n");
+    return absl::StrCat(" ", port::CurrentStackTrace(), "\n");
   } else {
     return "";
   }
@@ -501,13 +502,13 @@ port::StatusOr<DeviceMemoryBase> StreamExecutor::GetUntypedSymbol(
   if (static_cast<bool>(module_handle)) {
     return port::Status(
         port::error::NOT_FOUND,
-        port::StrCat("Check if module containing symbol ", symbol_name,
+        absl::StrCat("Check if module containing symbol ", symbol_name,
                      " is loaded (module_handle = ",
                      reinterpret_cast<uintptr_t>(module_handle.id()), ")"));
   } else {
     return port::Status(
         port::error::NOT_FOUND,
-        port::StrCat("Check if kernel using the symbol is loaded: ",
+        absl::StrCat("Check if kernel using the symbol is loaded: ",
                      symbol_name));
   }
 }
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 4a8a270afa9a3cc28f71d1ea5b13534c69d77023..d259a4ab635660982e9308bbf8f934fc5950d909 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/base/macros.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/logging.h"
diff --git a/tensorflow/stream_executor/temporary_memory_manager.cc b/tensorflow/stream_executor/temporary_memory_manager.cc
index 420dbb0933db3adf69a425f6223250e66f960261..cd6a3cd88b7d5a14fc6390707a5c85d6ee91325c 100644
--- a/tensorflow/stream_executor/temporary_memory_manager.cc
+++ b/tensorflow/stream_executor/temporary_memory_manager.cc
@@ -15,9 +15,10 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/temporary_memory_manager.h"
 
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 
@@ -97,7 +98,7 @@ TemporaryMemoryManager::AllocateArrayBase(uint64 element_count,
       stream_->parent()->AllocateArray<uint8>(byte_size);
   if (device_memory == nullptr) {
     return port::Status(port::error::RESOURCE_EXHAUSTED,
-                        port::StrCat("could not allocate temporary memory of ",
+                        absl::StrCat("could not allocate temporary memory of ",
                                      byte_size, " bytes"));
   }
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 8166ee9b423c25bb976b750b80bc1fb5a6be0996..9c3453fce00ab0559e2ce8b99d8723acfb812bcf 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -50,6 +50,9 @@ def register_extension_info(**kwargs):
 
 # if_cuda_is_configured def placeholder
 
+def if_cuda_is_configured_compat(x):
+    return if_cuda_is_configured(x)
+
 # Given a source file, generate a test name.
 # i.e. "common_runtime/direct_session_test.cc" becomes
 #      "common_runtime_direct_session_test"
@@ -171,10 +174,10 @@ def if_not_windows(a):
         "//conditions:default": a,
     })
 
-def if_windows(a):
+def if_windows(a, otherwise = []):
     return select({
         clean_dep("//tensorflow:windows"): a,
-        "//conditions:default": [],
+        "//conditions:default": otherwise,
     })
 
 def if_not_windows_cuda(a):
@@ -201,6 +204,13 @@ def if_override_eigen_strong_inline(a):
         "//conditions:default": [],
     })
 
+def if_nccl(a):
+    return select({
+        "//tensorflow:no_nccl_support": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": a,
+    })
+
 def get_win_copts(is_external = False):
     WINDOWS_COPTS = [
         "/DPLATFORM_WINDOWS",
@@ -529,12 +539,15 @@ def tf_gen_op_wrappers_cc(
         op_lib_names = [],
         other_srcs = [],
         other_hdrs = [],
+        other_srcs_internal = [],
+        other_hdrs_internal = [],
         pkg = "",
         deps = [
             clean_dep("//tensorflow/cc:ops"),
             clean_dep("//tensorflow/cc:scope"),
             clean_dep("//tensorflow/cc:const_op"),
         ],
+        deps_internal = [],
         op_gen = clean_dep("//tensorflow/cc:cc_op_gen_main"),
         include_internal_ops = 0,
         visibility = None,
@@ -542,16 +555,16 @@ def tf_gen_op_wrappers_cc(
         api_def_srcs = []):
     subsrcs = other_srcs[:]
     subhdrs = other_hdrs[:]
-    internalsrcs = []
-    internalhdrs = []
+    internalsrcs = other_srcs_internal[:]
+    internalhdrs = other_hdrs_internal[:]
     for n in op_lib_names:
         tf_gen_op_wrapper_cc(
             n,
             "ops/" + n,
-            pkg = pkg,
-            op_gen = op_gen,
-            include_internal_ops = include_internal_ops,
             api_def_srcs = api_def_srcs,
+            include_internal_ops = include_internal_ops,
+            op_gen = op_gen,
+            pkg = pkg,
         )
         subsrcs += ["ops/" + n + ".cc"]
         subhdrs += ["ops/" + n + ".h"]
@@ -578,7 +591,7 @@ def tf_gen_op_wrappers_cc(
         name = name + "_internal",
         srcs = internalsrcs,
         hdrs = internalhdrs,
-        deps = deps + if_not_android([
+        deps = deps + deps_internal + if_not_android([
             clean_dep("//tensorflow/core:core_cpu"),
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/core:lib"),
@@ -637,14 +650,14 @@ def tf_gen_op_wrapper_py(
         deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
     tf_cc_binary(
         name = tool_name,
-        linkopts = if_not_windows(["-lm", "-Wl,-ldl"]) + cc_linkopts,
         copts = tf_copts(),
+        linkopts = if_not_windows(["-lm", "-Wl,-ldl"]) + cc_linkopts,
         linkstatic = 1,  # Faster to link this one-time-use binary dynamically
+        visibility = [clean_dep("//tensorflow:internal")],
         deps = ([
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/python:python_op_gen_main"),
         ] + deps),
-        visibility = [clean_dep("//tensorflow:internal")],
     )
 
     # Invoke the previous cc_binary to generate a python file.
@@ -711,6 +724,10 @@ def tf_gen_op_wrapper_py(
         deps = [
             clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"),
         ],
+        # Instruct build_cleaner to try to avoid using this rule; typically ops
+        # creators will provide their own tf_custom_op_py_library based target
+        # that wraps this one.
+        tags = ["avoid_dep"],
     )
 
 # Define a bazel macro that creates cc_test for tensorflow.
@@ -792,12 +809,12 @@ def tf_cc_test_gpu(
         name,
         srcs,
         deps,
-        linkstatic = linkstatic,
-        tags = tags,
-        data = data,
         size = size,
-        suffix = suffix,
         args = args,
+        data = data,
+        linkstatic = linkstatic,
+        suffix = suffix,
+        tags = tags,
     )
 
 register_extension_info(
@@ -819,24 +836,26 @@ def tf_gpu_cc_test(
         linkopts = []):
     tf_cc_test(
         name = name,
+        size = size,
         srcs = srcs,
-        deps = deps,
-        tags = tags + ["manual"],
+        args = args,
         data = data,
-        size = size,
         extra_copts = extra_copts,
-        linkstatic = linkstatic,
-        linkopts = linkopts,
-        args = args,
         kernels = kernels,
+        linkopts = linkopts,
+        linkstatic = linkstatic,
+        tags = tags + ["manual"],
+        deps = deps,
     )
     tf_cc_test(
         name = name,
+        size = size,
         srcs = srcs,
-        suffix = "_gpu",
-        deps = deps + if_cuda([
-            clean_dep("//tensorflow/core:gpu_runtime"),
-        ]),
+        args = args,
+        data = data,
+        extra_copts = extra_copts,
+        kernels = kernels,
+        linkopts = linkopts,
         linkstatic = select({
             # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out.
             clean_dep("//tensorflow:darwin"): 1,
@@ -844,13 +863,11 @@ def tf_gpu_cc_test(
             "@local_config_cuda//cuda:using_clang": 1,
             "//conditions:default": 0,
         }),
+        suffix = "_gpu",
         tags = tags + tf_gpu_tests_tags(),
-        data = data,
-        size = size,
-        extra_copts = extra_copts,
-        linkopts = linkopts,
-        args = args,
-        kernels = kernels,
+        deps = deps + if_cuda([
+            clean_dep("//tensorflow/core:gpu_runtime"),
+        ]),
     )
 
 register_extension_info(
@@ -883,6 +900,7 @@ def tf_cuda_cc_test(
         linkopts = linkopts,
         args = args,
         kernels = kernels,
+        linkopts = linkopts,
     )
 
 register_extension_info(
@@ -907,15 +925,14 @@ def tf_gpu_only_cc_test(
         size = size,
         args = args,
         copts = _cuda_copts() + rocm_copts() + tf_copts(),
+        features = if_cuda(["-use_header_modules"]),
         data = data + tf_binary_dynamic_kernel_dsos(kernels),
-        deps = deps + tf_binary_dynamic_kernel_deps(kernels) +
-               if_cuda_is_configured([
-                   clean_dep("//tensorflow/core:cuda"),
-                   clean_dep("//tensorflow/core:gpu_lib"),
-               ]) +
-               if_rocm_is_configured([
-                   clean_dep("//tensorflow/core:gpu_lib"),
-               ]),
+        deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda_is_configured([
+            clean_dep("//tensorflow/core:cuda"),
+            clean_dep("//tensorflow/core:gpu_lib"),
+        ]) + if_rocm_is_configured([
+            clean_dep("//tensorflow/core:gpu_lib"),
+        ]),
         linkopts = if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
         linkstatic = linkstatic or select({
             # cc_tests with ".so"s in srcs incorrectly link on Darwin
@@ -977,15 +994,15 @@ def tf_cc_tests(
     for src in srcs:
         tf_cc_test(
             name = src_to_test_name(src),
-            srcs = [src],
-            deps = deps,
-            linkstatic = linkstatic,
-            tags = tags,
             size = size,
+            srcs = [src],
             args = args,
+            kernels = kernels,
             linkopts = linkopts,
+            linkstatic = linkstatic,
             nocopts = nocopts,
-            kernels = kernels,
+            tags = tags,
+            deps = deps,
         )
 
 def tf_cc_test_mkl(
@@ -1035,7 +1052,7 @@ def tf_cc_tests_gpu(
         size = "medium",
         kernels = [],
         args = None):
-    tf_cc_tests(srcs, deps, linkstatic, tags = tags, size = size, kernels = kernels, args = args)
+    tf_cc_tests(srcs, deps, linkstatic, size = size, args = args, kernels = kernels, tags = tags)
 
 def tf_gpu_cc_tests(
         srcs,
@@ -1050,14 +1067,14 @@ def tf_gpu_cc_tests(
     for src in srcs:
         tf_gpu_cc_test(
             name = src_to_test_name(src),
-            srcs = [src],
-            deps = deps,
-            tags = tags,
             size = size,
-            linkstatic = linkstatic,
+            srcs = [src],
             args = args,
             kernels = kernels,
             linkopts = linkopts,
+            linkstatic = linkstatic,
+            tags = tags,
+            deps = deps,
         )
 
 # terminology changes: saving tf_cuda_* definition for compatibility
@@ -1119,7 +1136,7 @@ def _cuda_copts(opts = []):
         "@local_config_cuda//cuda:using_clang": ([
             "-fcuda-flush-denormals-to-zero",
         ]),
-    }) + if_cuda_is_configured(opts)
+    }) + if_cuda_is_configured_compat(opts)
 
 # Build defs for TensorFlow kernels
 
@@ -1144,7 +1161,7 @@ def tf_gpu_kernel_library(
         srcs = srcs,
         hdrs = hdrs,
         copts = copts,
-        deps = deps + if_cuda_is_configured([
+        deps = deps + if_cuda_is_configured_compat([
             clean_dep("//tensorflow/core:cuda"),
             clean_dep("//tensorflow/core:gpu_lib"),
         ]) + if_rocm_is_configured([
@@ -1184,16 +1201,13 @@ def tf_gpu_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs):
 
     kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
     native.cc_library(
-        deps = deps + if_cuda(cuda_deps + [
+        deps = deps + if_cuda_is_configured_compat(cuda_deps + [
             clean_dep("//tensorflow/core:cuda"),
             "@local_config_cuda//cuda:cuda_headers",
         ]) + if_rocm_is_configured(cuda_deps + [
             # rocm_header placeholder
         ]),
-        copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + if_mkl(["-DINTEL_MKL=1"]) +
-                 if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) +
-                 if_enable_mkl(["-DENABLE_MKL"]) +
-                 if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
+        copts = (copts + if_cuda(["-DGOOGLE_CUDA=1"]) + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + if_mkl(["-DINTEL_MKL=1"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_enable_mkl(["-DENABLE_MKL"]) + if_tensorrt(["-DGOOGLE_TENSORRT=1"])),
         **kwargs
     )
 
@@ -1258,6 +1272,11 @@ def tf_kernel_library(
         copts = []
     textual_hdrs = []
     copts = copts + tf_copts(is_external = is_external)
+
+    # Override EIGEN_STRONG_INLINE to inline when
+    # --define=override_eigen_strong_inline=true to avoid long compiling time.
+    # See https://github.com/tensorflow/tensorflow/issues/10521
+    copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"])
     if prefix:
         if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]):
             if not gpu_srcs:
@@ -1313,8 +1332,8 @@ def tf_kernel_library(
         name = "libtfkernel_%s.so" % name,
         srcs = srcs + hdrs,
         copts = copts,
-        deps = deps,
         tags = ["manual", "notap"],
+        deps = deps,
     )
 
 register_extension_info(
@@ -1393,13 +1412,13 @@ def _py_wrap_cc_impl(ctx):
         ctx.outputs.py_out.dirname,
     ]
     args += ["-l" + f.path for f in ctx.files.swig_includes]
-    args += ["-I" + i for i in swig_include_dirs]
+    args += ["-I" + i for i in swig_include_dirs.to_list()]
     args += [src.path]
     outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
     ctx.action(
         executable = ctx.executable._swig,
         arguments = args,
-        inputs = list(inputs),
+        inputs = inputs.to_list(),
         outputs = outputs,
         mnemonic = "PythonSwig",
         progress_message = "SWIGing " + src.path,
@@ -1579,9 +1598,9 @@ check_deps = rule(
     },
 )
 
-# Helper to build a dynamic library (.so) from the sources containing
-# implementations of custom ops and kernels.
-def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = []):
+def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [], copts = [], **kwargs):
+    """Helper to build a dynamic library (.so) from the sources containing implementations of custom ops and kernels.
+    """
     cuda_deps = [
         clean_dep("//tensorflow/core:stream_executor_headers_lib"),
         "@local_config_cuda//cuda:cuda_headers",
@@ -1591,32 +1610,39 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
         clean_dep("//tensorflow/core:stream_executor_headers_lib"),
     ]
     deps = deps + tf_custom_op_library_additional_deps()
+
+    # Override EIGEN_STRONG_INLINE to inline when
+    # --define=override_eigen_strong_inline=true to avoid long compiling time.
+    # See https://github.com/tensorflow/tensorflow/issues/10521
+    copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"])
+
     if gpu_srcs:
         basename = name.split(".")[0]
         native.cc_library(
             name = basename + "_gpu",
             srcs = gpu_srcs,
-            copts = _cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
+            copts = copts + _cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
             features = if_cuda(["-use_header_modules"]),
-            deps = deps + if_cuda_is_configured(cuda_deps) + if_rocm_is_configured(rocm_deps),
+            deps = deps + if_cuda_is_configured_compat(cuda_deps) + if_rocm_is_configured(rocm_deps),
+            **kwargs
         )
         cuda_deps.extend([":" + basename + "_gpu"])
         rocm_deps.extend([":" + basename + "_gpu"])
 
     check_deps(
         name = name + "_check_deps",
-        deps = deps + if_cuda_is_configured(cuda_deps) + if_rocm_is_configured(rocm_deps),
         disallowed_deps = [
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/core:lib"),
         ],
+        deps = deps + if_cuda_is_configured_compat(cuda_deps) + if_rocm_is_configured(rocm_deps),
     )
     tf_cc_shared_object(
         name = name,
         srcs = srcs,
-        deps = deps + if_cuda_is_configured(cuda_deps) + if_rocm_is_configured(rocm_deps),
+        deps = deps + if_cuda_is_configured_compat(cuda_deps) + if_rocm_is_configured(rocm_deps),
         data = if_static([name + "_check_deps"]),
-        copts = tf_copts(is_external = True),
+        copts = copts + tf_copts(is_external = True),
         features = ["windows_export_all_symbols"],
         linkopts = linkopts + select({
             "//conditions:default": [
@@ -1625,6 +1651,7 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
             clean_dep("//tensorflow:windows"): [],
             clean_dep("//tensorflow:darwin"): [],
         }),
+        **kwargs
     )
 
 register_extension_info(
@@ -1640,7 +1667,7 @@ def tf_custom_op_py_library(
         srcs_version = "PY2AND3",
         visibility = None,
         deps = []):
-    kernels = kernels  # unused argument
+    _ignore = [kernels]
     native.py_library(
         name = name,
         data = dso,
@@ -1683,10 +1710,13 @@ def _append_init_to_versionscript_impl(ctx):
         )
 
 _append_init_to_versionscript = rule(
-    implementation = _append_init_to_versionscript_impl,
     attrs = {
         "module_name": attr.string(mandatory = True),
-        "template_file": attr.label(allow_files = True, single_file = True, mandatory = True),
+        "template_file": attr.label(
+            allow_files = True,
+            single_file = True,
+            mandatory = True,
+        ),
         "is_version_script": attr.bool(
             default = True,
             doc = "whether target is a ld version script or exported symbol list",
@@ -1694,6 +1724,7 @@ _append_init_to_versionscript = rule(
         ),
     },
     outputs = {"versionscript": "%{name}.lds"},
+    implementation = _append_init_to_versionscript_impl,
 )
 
 def tf_py_wrap_cc(
@@ -1715,20 +1746,20 @@ def tf_py_wrap_cc(
     _py_wrap_cc(
         name = name + "_py_wrap",
         srcs = srcs,
-        swig_includes = swig_includes,
-        deps = deps + extra_deps,
-        toolchain_deps = ["@bazel_tools//tools/cpp:current_cc_toolchain"],
         module_name = module_name,
         py_module_name = name,
+        swig_includes = swig_includes,
+        toolchain_deps = ["@bazel_tools//tools/cpp:current_cc_toolchain"],
+        deps = deps + extra_deps,
     )
     vscriptname = name + "_versionscript"
     _append_init_to_versionscript(
         name = vscriptname,
-        module_name = module_name,
         is_version_script = select({
             "@local_config_cuda//cuda:darwin": False,
             "//conditions:default": True,
         }),
+        module_name = module_name,
         template_file = select({
             "@local_config_cuda//cuda:darwin": clean_dep("//tensorflow:tf_exported_symbols.lds"),
             "//conditions:default": clean_dep("//tensorflow:tf_version_script.lds"),
@@ -1864,19 +1895,19 @@ def tf_py_test(
         name = name,
         size = size,
         srcs = srcs,
-        main = main,
         args = args,
+        data = data,
+        flaky = flaky,
+        kernels = kernels,
+        main = main,
+        shard_count = shard_count,
+        srcs_version = "PY2AND3",
         tags = tags,
         visibility = [clean_dep("//tensorflow:internal")],
-        shard_count = shard_count,
-        kernels = kernels,
-        data = data,
         deps = [
             clean_dep("//tensorflow/python:extra_py_tests_deps"),
             clean_dep("//tensorflow/python:gradient_checker"),
         ] + additional_deps,
-        flaky = flaky,
-        srcs_version = "PY2AND3",
     )
 
 register_extension_info(
@@ -1898,22 +1929,29 @@ def gpu_py_test(
         flaky = 0,
         xla_enabled = False,
         grpc_enabled = False):
-    test_tags = tags + tf_gpu_tests_tags()
-    tf_py_test(
-        name = name,
-        size = size,
-        srcs = srcs,
-        data = data,
-        main = main,
-        args = args,
-        tags = test_tags,
-        shard_count = shard_count,
-        additional_deps = additional_deps,
-        kernels = kernels,
-        flaky = flaky,
-        xla_enabled = xla_enabled,
-        grpc_enabled = grpc_enabled,
-    )
+    if main == None:
+        main = name + ".py"
+    for config in ["cpu", "gpu"]:
+        test_name = name
+        test_tags = tags
+        if config == "gpu":
+            test_name += "_gpu"
+            test_tags = test_tags + tf_gpu_tests_tags()
+        tf_py_test(
+            name = test_name,
+            size = size,
+            srcs = srcs,
+            additional_deps = additional_deps,
+            args = args,
+            data = data,
+            flaky = flaky,
+            grpc_enabled = grpc_enabled,
+            kernels = kernels,
+            main = main,
+            shard_count = shard_count,
+            tags = test_tags,
+            xla_enabled = xla_enabled,
+        )
 
 register_extension_info(
     extension_name = "gpu_py_test",
@@ -1975,16 +2013,16 @@ def sycl_py_test(
         name = name,
         size = size,
         srcs = srcs,
+        additional_deps = additional_deps,
+        args = args,
         data = data,
+        flaky = flaky,
+        grpc_enabled = grpc_enabled,
+        kernels = kernels,
         main = main,
-        args = args,
-        tags = test_tags,
         shard_count = shard_count,
-        additional_deps = additional_deps,
-        kernels = kernels,
-        flaky = flaky,
+        tags = test_tags,
         xla_enabled = xla_enabled,
-        grpc_enabled = grpc_enabled,
     )
 
 register_extension_info(
@@ -2012,14 +2050,14 @@ def py_tests(
             name = test_name,
             size = size,
             srcs = [src],
-            main = src,
-            tags = tags,
-            shard_count = shard_count,
-            data = data,
             additional_deps = additional_deps,
+            data = data,
+            grpc_enabled = grpc_enabled,
             kernels = kernels,
+            main = src,
+            shard_count = shard_count,
+            tags = tags,
             xla_enabled = xla_enabled,
-            grpc_enabled = grpc_enabled,
         )
 
 def gpu_py_tests(
@@ -2041,12 +2079,12 @@ def gpu_py_tests(
         srcs = srcs,
         additional_deps = additional_deps,
         data = data,
-        tags = test_tags,
-        shard_count = shard_count,
-        prefix = prefix,
+        grpc_enabled = grpc_enabled,
         kernels = kernels,
+        prefix = prefix,
+        shard_count = shard_count,
+        tags = test_tags,
         xla_enabled = xla_enabled,
-        grpc_enabled = grpc_enabled,
     )
 
 # terminology changes: saving cuda_* definition for compatibility
@@ -2130,9 +2168,9 @@ def tf_version_info_genrule():
         ],
         outs = ["util/version_info.cc"],
         cmd =
-            "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
+            "$(location //tensorflow/tools/git:gen_git_source) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
         local = 1,
-        tools = [clean_dep("//tensorflow/tools/git:gen_git_source.py")],
+        tools = [clean_dep("//tensorflow/tools/git:gen_git_source")],
     )
 
 def tf_py_build_info_genrule():
@@ -2140,9 +2178,9 @@ def tf_py_build_info_genrule():
         name = "py_build_info_gen",
         outs = ["platform/build_info.py"],
         cmd =
-            "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"),
+            "$(location //tensorflow/tools/build_info:gen_build_info) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu") + if_windows(" --key_value msvcp_dll_name=msvcp140.dll", ""),
         local = 1,
-        tools = [clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],
+        tools = [clean_dep("//tensorflow/tools/build_info:gen_build_info")],
     )
 
 def cc_library_with_android_deps(
@@ -2158,3 +2196,6 @@ register_extension_info(
     extension_name = "cc_library_with_android_deps",
     label_regex_for_dep = "{extension_name}",
 )
+
+def tensorflow_opensource_extra_deps():
+    return []
diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds
index 3ff824e5e1707c65b5ad3cc22dd32267953964c6..9f6114f503467fc12fcfb5dae07e75d2113e410d 100644
--- a/tensorflow/tf_exported_symbols.lds
+++ b/tensorflow/tf_exported_symbols.lds
@@ -5,3 +5,4 @@
 *TFE_*
 *nsync_*
 *pywrap_xla*
+*stream_executor*
diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds
index 6b28943f01cfdb174fd135c670a6bb409ee0e102..39d258c3b7edd1f5f7d0805c080e832aa1d6109a 100644
--- a/tensorflow/tf_version_script.lds
+++ b/tensorflow/tf_version_script.lds
@@ -6,6 +6,7 @@ tensorflow {
     *TFE_*;
     *nsync_*;
     *pywrap_xla*;
+    *stream_executor*;
   local:
     *;
 };
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
index 9f6dcd8fdb069703844f1e05294cf55cd83ba745..a1083d732a1bb1b3212457f445323e5e868ef162 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
@@ -14,7 +14,19 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_STRING
     }
-    reserved_range {
+    field {
+      name: "recv_buf_max_chunk"
+      number: 4
+      label: LABEL_OPTIONAL
+      type: TYPE_INT32
+    }
+    field {
+      name: "use_numa_affinity"
+      number: 5
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+     reserved_range {
       start: 2
       end: 3
     }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
index f3a515163df64297cbc25d6f9a33fc2b78648402..b505d813509c2049fa6e3f60df553492d6f66613 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
@@ -137,6 +137,18 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_STRING
       }
+      field {
+        name: "recv_buf_max_chunk"
+        number: 4
+        label: LABEL_OPTIONAL
+        type: TYPE_INT32
+      }
+      field {
+        name: "use_numa_affinity"
+        number: 5
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       reserved_range {
         start: 2
         end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
index 353e63127de174a79c209a05327da2de20bf0dd7..a2cc07483a4e10918891f555ca9459fb7503bb32 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
@@ -78,6 +78,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT32
       }
+      field {
+        name: "collective_ring_order"
+        number: 4
+        label: LABEL_OPTIONAL
+        type: TYPE_STRING
+      }
       nested_type {
         name: "VirtualDevices"
         field {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
index 2f4257a66a5b45c3890c18bc5b8c97c020c6a001..2299a009d3d5335553e1de025c42b23a57592de3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
@@ -6,9 +6,17 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'persistent\', \'watch_accessed_variables\'], varargs=None, keywords=None, defaults=[\'False\', \'True\'], "
   }
+  member_method {
+    name: "batch_jacobian"
+    argspec: "args=[\'self\', \'target\', \'source\', \'unconnected_gradients\', \'parallel_iterations\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'None\', \'True\'], "
+  }
   member_method {
     name: "gradient"
-    argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'UnconnectedGradients.NONE\'], "
+  }
+  member_method {
+    name: "jacobian"
+    argspec: "args=[\'self\', \'target\', \'sources\', \'unconnected_gradients\', \'parallel_iterations\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'None\', \'True\'], "
   }
   member_method {
     name: "reset"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
index 8e3598fb2470b327e6e3601969f055d4907f614a..d11e927bd55cea52d0dbdfd4b28b2c1bc24fdaa5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.TensorShape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShape\'>"
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV1\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dims"
@@ -10,6 +10,10 @@ tf_class {
     name: "ndims"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "rank"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'dims\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..493dcba8922d7f6c51a61d337f48e09d168e6bac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.TensorSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.tensor_spec.TensorSpec\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_spec"
+    argspec: "args=[\'cls\', \'spec\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'spec_or_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable-aggregation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable-aggregation.pbtxt
index 66a20547eb6d13ae60d71b07cbf150a4ca2abfe7..4f815b38ed79930c2d48e3d94cb36bcedb4ae9c0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable-aggregation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable-aggregation.pbtxt
@@ -9,6 +9,10 @@ tf_class {
     name: "NONE"
     mtype: "<enum \'VariableAggregation\'>"
   }
+  member {
+    name: "ONLY_FIRST_REPLICA"
+    mtype: "<enum \'VariableAggregation\'>"
+  }
   member {
     name: "ONLY_FIRST_TOWER"
     mtype: "<enum \'VariableAggregation\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
index af7fc9d4efebc62c282bb82f8a71cd0f5cdfb827..62d8ea9208f7f5f031b80be168cedfd538f18a22 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "assign_sub"
     argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
+  member_method {
+    name: "batch_scatter_update"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "count_up_to"
     argspec: "args=[\'self\', \'limit\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.__metaclass__.pbtxt
deleted file mode 100644
index af08c88d3333fa897c38cc2f6530a9c5cda15342..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.Dataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index 8b7f63e43e237864d4ef24d3b251b23199f9ee17..f7d388d33d050eac2c9f14682bc7068c745a46bc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -1,18 +1,19 @@
 path: "tensorflow.data.Dataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_shapes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_types"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
deleted file mode 100644
index f384323fc89bb7d21309e86ddaab2e6e1f9f212b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.FixedLengthRecordDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
index a7bfa82c650e0a511cb6c8eaffceaf49fbfeaa39..d73168b070e374a749a00f74b24b77a715d2f37e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.data.FixedLengthRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -17,7 +19,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
index d15dccc173b9c2633e4a4303a5c69f59b2cd4536..72fc2c3a9ee5b985723ce2dba9643ba796362dc7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
@@ -1,49 +1,30 @@
 path: "tensorflow.data.Options"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Options\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "experimental_autotune"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_filter_fusion"
+    name: "experimental_deterministic"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_hoist_random_uniform"
+    name: "experimental_numa_aware"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_latency_all_edges"
+    name: "experimental_optimization"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_map_and_batch_fusion"
+    name: "experimental_stats"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_map_and_filter_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_parallelization"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_vectorization"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_noop_elimination"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_shuffle_and_repeat_fusion"
+    name: "experimental_threading"
     mtype: "<type \'property\'>"
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
deleted file mode 100644
index b12dec8a70be5e0cd8346785b48f56b15155dd02..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.TFRecordDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
index 7b7a9ebaf08b1e9fdb5e4c5b7448175611a9b2c4..51224cd6b45f0a1efdfbb3ba6a3ca377d37fd00b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.data.TFRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
deleted file mode 100644
index 7ddcdce2663ca0ef6409fb3ab3c29555948d7302..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.TextLineDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
index 2817f900e15ccf8df2ca71aa0218ba07eef682e2..a10add1b7e38f9875e699903b3e3c103d73e647e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.data.TextLineDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
deleted file mode 100644
index 3eeaa1b185058c99d786bdad9e95c96c9ea5ebab..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.experimental.CsvDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
index 2520e28a3c708f45942eb2e73911b7a5226646e5..71b597c19c512879b8f18b34843b160efecc6bec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.experimental.CsvDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-dataset-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-dataset-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb304f763ea44d0d7314248170e615115b0794c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-dataset-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.DatasetStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'element_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-nested-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-nested-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4b066e563cc6196650b1ba561da7c16a80a8656
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-nested-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.NestedStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.NestedStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'nested_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b7ad64f51f88ae9c860e061db5c1ad6b5f2bcf8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -0,0 +1,50 @@
+path: "tensorflow.data.experimental.OptimizationOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.optimization_options.OptimizationOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "apply_default_optimizations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "hoist_random_uniform"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_and_batch_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_and_filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_parallelization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_vectorization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "noop_elimination"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shuffle_and_repeat_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf41c1d1d696d94ef9da5fc64272349d1533816e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.OptionalStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.optional_ops.OptionalStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
deleted file mode 100644
index 2991b12f64ca8812a6b04217dc49225edd157d92..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.experimental.RandomDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
index 1dd53b1eabdf15b662a839a07176ba4eaf8bda37..20646e87b5fbe23d89ad31ca632a64bf958339f6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.experimental.RandomDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f97376b328cf34eb04918bec7bacf08d254d8db5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.SparseTensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.SparseTensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
deleted file mode 100644
index 948e99ef86fa52a32b7028c69be190e384c1b658..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.experimental.SqlDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
index 8fdd9dc52e332abdeed039bd85d31f6318d013e9..86c5ff5b0bd7b42d61a92a44c8888852a48677be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.experimental.SqlDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt
index 0bcc8cf3e87ea8b78f28130da60a1749e2848806..6536a698b50efc9daaa72d8ae589855e30fbc601 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-aggregator.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.data.experimental.StatsAggregator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_ops.StatsAggregator\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_aggregator.StatsAggregator\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..892f8c1fb897dfc8bf4964c118aeb641dffd3caa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.data.experimental.StatsOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_options.StatsOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "aggregator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "counter_prefix"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "latency_all_edges"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "prefix"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a99db4542e0deb506d00c00f889299dd22d67e1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-structure.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.data.experimental.Structure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c8864a9dd98058c659e72ba8059182a666ea39
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.TensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.TensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-threading-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-threading-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b5ebf108018e75b6de28287a68a25a03b294b64
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-threading-options.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.ThreadingOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.threading_options.ThreadingOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "max_intra_op_parallelism"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "private_threadpool_size"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index 2a1f899dc00dc9426f538d8a1a867cc16e2068c2..2d115904925eb96164484300baf628d41d3fcff4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -1,37 +1,85 @@
 path: "tensorflow.data.experimental"
 tf_module {
+  member {
+    name: "AUTOTUNE"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "CheckpointInputPipelineHook"
     mtype: "<type \'type\'>"
   }
   member {
     name: "CsvDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "DatasetStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "INFINITE_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NestedStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OptimizationOptions"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "Optional"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OptionalStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RandomDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "Reducer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseTensorStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SqlDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "StatsAggregator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "StatsOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Structure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordWriter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ThreadingOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UNKNOWN_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
   member_method {
     name: "Counter"
     argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"<dtype: \'int64\'>\"], "
@@ -40,6 +88,10 @@ tf_module {
     name: "bucket_by_sequence_length"
     argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
   }
+  member_method {
+    name: "cardinality"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "choose_from_datasets"
     argspec: "args=[\'datasets\', \'choice_dataset\'], varargs=None, keywords=None, defaults=None"
@@ -56,6 +108,10 @@ tf_module {
     name: "enumerate_dataset"
     argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
+  member_method {
+    name: "filter_for_shard"
+    argspec: "args=[\'num_shards\', \'shard_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_next_as_optional"
     argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
@@ -82,7 +138,7 @@ tf_module {
   }
   member_method {
     name: "make_batched_features_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV1\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
   }
   member_method {
     name: "make_csv_dataset"
@@ -120,10 +176,6 @@ tf_module {
     name: "scan"
     argspec: "args=[\'initial_state\', \'scan_func\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "set_stats_aggregator"
-    argspec: "args=[\'stats_aggregator\', \'tag\', \'counter_prefix\'], varargs=None, keywords=None, defaults=[\'\', \'\'], "
-  }
   member_method {
     name: "shuffle_and_repeat"
     argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
index 3023276a1d3de86d76caa2749f7e85c45e6d9bd6..aa474680592a1a3996ca3db970b814ba167cd801 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
@@ -2,11 +2,11 @@ path: "tensorflow.data"
 tf_module {
   member {
     name: "Dataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "FixedLengthRecordDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "Iterator"
@@ -18,14 +18,22 @@ tf_module {
   }
   member {
     name: "TFRecordDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "TextLineDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "experimental"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "make_initializable_iterator"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_one_shot_iterator"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
index ab6287f8cd080621d76fc34e2cb437960a217800..8a7f1e9363b8211d83d39d31da11507cb4c805eb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
@@ -78,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'tensor\', \'name\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_type"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..583cbc66549223e5c954b715e2043efa5417ef18
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.distribute.InputContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.InputContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "input_pipeline_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_input_pipelines"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_input_pipelines\', \'input_pipeline_id\', \'num_replicas_in_sync\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'1\'], "
+  }
+  member_method {
+    name: "get_per_replica_batch_size"
+    argspec: "args=[\'self\', \'global_batch_size\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-replication-mode.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-replication-mode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a7a3a97aa0927b81708311d4b8b28fced217c00
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-replication-mode.pbtxt
@@ -0,0 +1,8 @@
+path: "tensorflow.distribute.InputReplicationMode"
+tf_class {
+  is_instance: "<enum \'InputReplicationMode\'>"
+  member {
+    name: "PER_WORKER"
+    mtype: "<enum \'InputReplicationMode\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a613e2d3d1dcefacdf0ec336587a46ff7e0bcb90
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -0,0 +1,138 @@
+path: "tensorflow.distribute.MirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduce-op.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduce-op.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4899f38cad253167ce0b94f79388cb97fe534197
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduce-op.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.distribute.ReduceOp"
+tf_class {
+  is_instance: "<enum \'ReduceOp\'>"
+  member {
+    name: "MEAN"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+  member {
+    name: "SUM"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..df707e8920e4488ed6b40a7f93f56b5624188c84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.ReplicaContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.ReplicaContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "distribution_strategy"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "replica_id_in_sync_group"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strategy"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge_call"
+    argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77706e57133e1186d9e98fcf9205ed4c91772eda
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
@@ -0,0 +1,81 @@
+path: "tensorflow.distribute.StrategyExtended"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategyExtended\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "experimental_between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'container_strategy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_to"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_steps_on_iterator"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9eb73d2c0d9069ec4b818abe1825503f0ea36fc9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
@@ -0,0 +1,137 @@
+path: "tensorflow.distribute.Strategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b0dd73ca1d4179b4a3323fa0a9be2fde4e22799c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.distribute"
+tf_module {
+  member {
+    name: "InputContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InputReplicationMode"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "MirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ReduceOp"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "ReplicaContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Strategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StrategyExtended"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "get_loss_reduction"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "has_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "in_cross_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
index ea23feca84802669e0cb1e0da0f77a6cdd50908c..01b870a81639807489ec2a09dcc185137aae1665 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
@@ -4,6 +4,110 @@ tf_module {
     name: "DType"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "QUANTIZED_DTYPES"
+    mtype: "<type \'frozenset\'>"
+  }
+  member {
+    name: "bfloat16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "bool"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "complex128"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "complex64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "double"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "float16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "float32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "float64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "half"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "qint16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "qint32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "qint8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "quint16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "quint8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "resource"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "string"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uint16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uint32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uint64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uint8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "variant"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
   member_method {
     name: "as_dtype"
     argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
index 082e26b99bfe797dea72d27e2b66f2cd1cc815fd..ee3a72bfce71d64cc6d780ac7c4e0091ad5f0da9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.BaselineClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.baseline.BaselineClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38b27f735ffa546d46b6e1cb0b2de3de06358184
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt
@@ -0,0 +1,67 @@
+path: "tensorflow.estimator.BaselineEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'model_dir\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
index 7cc4191eb32548ae48a49c6bc42ac78c7f79f5d0..3874b84d5a6204ee18c520fcdb0042e3175f63bb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.BaselineRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.baseline.BaselineRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-best-exporter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-best-exporter.pbtxt
index 9694268199a29c51f37bc73a2f92715c78854a2f..68145735bd528038187946db665a25a77143abc6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-best-exporter.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-best-exporter.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.BestExporter"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.exporter.BestExporter\'>"
-  is_instance: "<class \'tensorflow.python.estimator.exporter.Exporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.BestExporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.Exporter\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index ef3409b1b59dc1177676107f7628354141b7f417..e138ce936ec73c05f8f790fb63c381e56ae2f654 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.estimator.BoostedTreesClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -22,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,6 +33,10 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "experimental_feature_importances"
     argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
@@ -42,7 +47,7 @@ tf_class {
   }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 775130468fe9e753e5c22902274a0b238021a598..eae0a292a962680a53d8c683ee2d2b97e24937a6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.estimator.BoostedTreesRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -22,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,6 +33,10 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "experimental_feature_importances"
     argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
@@ -42,7 +47,7 @@ tf_class {
   }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9e1504b494e3863f770df23f9f9a92e004b8713
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.CheckpointSaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..111b7583f2cd005912c7f06d977565cd17f265b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.estimator.CheckpointSaverListener"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverListener\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "after_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
index 718f415a777a0f150972fd061f979dbabf8cd592..b54133b294e3283cb84316dd0f71670e5bf49333 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.DNNClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.dnn.DNNClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09e0d3819244da026762eb1c4f31d25aba68fd27
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt
@@ -0,0 +1,67 @@
+path: "tensorflow.estimator.DNNEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index b23c019d6c9af1865a53debc9940d7d957d5f183..5a1d85a9b1028b164250819203be49700d81b336 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.DNNLinearCombinedClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e311f96d3dca0007619df2322ee5ca0295c55ac6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
@@ -0,0 +1,67 @@
+path: "tensorflow.estimator.DNNLinearCombinedEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'input_layer_partitioner\', \'config\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'sum\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index caa9e3f1deb956a85ceefca6b12d89245f8c4ec6..db4780e4c0159b7ff553a736c24ac7701eae7f14 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.DNNLinearCombinedRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
index 1f5e650940259f78c56ab4d2e28260fb6f23db2b..a44e719099e5edd23e5e4bc4fd7eb0a1c3c5799d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.DNNRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.dnn.DNNRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator-spec.pbtxt
index aa6ac46613fbead7457b19e1aae5f2532afddef1..376becc3f9bb0c3d830f3ed2e5a05b8d17757299 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator-spec.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.EstimatorSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.model_fn.EstimatorSpec\'>"
-  is_instance: "<class \'tensorflow.python.estimator.model_fn.EstimatorSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.EstimatorSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.EstimatorSpec\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "eval_metric_ops"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
index ebd3869c9b093e45a0b61cf443f872a8ceb07327..bff6c86cd75358847430954c690fa021a027dca6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.estimator.Estimator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -30,9 +31,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-eval-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-eval-spec.pbtxt
index db83ba1bd8f0bd13c9048d62d74790ed2b729589..23c2544fe461f9760e873a0761059a6356e5f8fa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-eval-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-eval-spec.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.EvalSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.training.EvalSpec\'>"
-  is_instance: "<class \'tensorflow.python.estimator.training.EvalSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.training.EvalSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.training.EvalSpec\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "exporters"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-exporter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-exporter.pbtxt
index 035af70e52024f8d16e1cd12951af10aad355eda..6c3f0fd910829f0173ed78284cf92202d8186685 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-exporter.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-exporter.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.Exporter"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.exporter.Exporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.Exporter\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-feed-fn-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f24de493f24a363190cd1d323adaa75b32b0d8e3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-feed-fn-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.FeedFnHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FeedFnHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feed_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-exporter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-exporter.pbtxt
index ee37b1fa210ea816ef762590cfd1725c71262ed8..e030d401ea4122800535074192f1ed24e85af0e9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-exporter.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-exporter.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.FinalExporter"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.exporter.FinalExporter\'>"
-  is_instance: "<class \'tensorflow.python.estimator.exporter.Exporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.FinalExporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.Exporter\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-ops-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6651170ba33f491d5a5342bcd6e6814e1b973832
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-ops-hook.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.estimator.FinalOpsHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FinalOpsHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "final_ops_values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'final_ops\', \'final_ops_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-global-step-waiter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37db48bc64e2f0e955105e8094d51c851c25558b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-global-step-waiter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.GlobalStepWaiterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.GlobalStepWaiterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'wait_until_step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-latest-exporter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-latest-exporter.pbtxt
index 2a9d0290295114daa006d39f17a295a01e40da6b..d67f2bd625ed8b66565df1a68c9bb90c33c22efd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-latest-exporter.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-latest-exporter.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.LatestExporter"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.exporter.LatestExporter\'>"
-  is_instance: "<class \'tensorflow.python.estimator.exporter.Exporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.LatestExporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.Exporter\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
index 53ec5a0c781096a04e65ea6ae41cd755040615ef..2c8e82517beea5262e19503edfb480d9ec880f9c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.LinearClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.linear.LinearClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2148374fdee77af7c4bc669f4a1ebd65b45c5e13
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt
@@ -0,0 +1,67 @@
+path: "tensorflow.estimator.LinearEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearEstimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'partitioner\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'None\', \'sum\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
index 3791162619c0db1e205a7f6a028966e8f5dc2b68..1bdc6124fe93ff903ef55ce586e294cffc43e1be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.estimator.LinearRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.linear.LinearRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-logging-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..425f0167a161104891c3bb76816fe8c5094de28a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-logging-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.LoggingTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.LoggingTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tensors\', \'every_n_iter\', \'every_n_secs\', \'at_end\', \'formatter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
index 6a1c24fa63fc074c2b4ae9b3225a6abb47958b68..bf7c1abcd89b29c29f3487cab58cfdf28103119c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.ModeKeys"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.model_fn.ModeKeys\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.ModeKeys\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "EVAL"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6e17e4352b0f909b31327a57bbdca3bc0e02a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.estimator.NanLossDuringTrainingError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError\'>"
+  is_instance: "<type \'exceptions.RuntimeError\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..82293c2c0c4e7204d9aba83f43ed2fac6bc46b19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.NanTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'loss_tensor\', \'fail_on_nan_loss\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-profiler-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..65b5fb16b0874e7c6469ef11420db146be1f0b5f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-profiler-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.ProfilerHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.ProfilerHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'show_dataflow\', \'show_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'True\', \'False\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt
index 269e18a0a700548ce01b6eb215d936da4c718a65..827b1ac5a576208e090d2db7d702675db89de4a7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-run-config.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.RunConfig"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.run_config.RunConfig\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.run_config.RunConfig\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "cluster_spec"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-second-or-step-timer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64051d2bd6b69614cd210d902552ddeb8b6c8e5e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-second-or-step-timer.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.estimator.SecondOrStepTimer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SecondOrStepTimer\'>"
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks._HookTimer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_secs\', \'every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "last_triggered_step"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "should_trigger_for_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_last_triggered_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-step-counter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4368e04df3f86834b540bb5306bf66dd82ac440c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-step-counter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.StepCounterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StepCounterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_n_steps\', \'every_n_secs\', \'output_dir\', \'summary_writer\'], varargs=None, keywords=None, defaults=[\'100\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-stop-at-step-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..938b189a8c30237bb15bf73083a348e6366fbfc4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-stop-at-step-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.StopAtStepHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StopAtStepHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_steps\', \'last_step\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-summary-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..104157315f5982efb4f6b9f39e0ece905a225e10
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-summary-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.SummarySaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SummarySaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'summary_writer\', \'scaffold\', \'summary_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-train-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-train-spec.pbtxt
index 7d2f77438afa41f2d8391524470f82a22076313b..1d9f51a20e2f94803abec9fceb3c25c86122785b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-train-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-train-spec.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.TrainSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.training.TrainSpec\'>"
-  is_instance: "<class \'tensorflow.python.estimator.training.TrainSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.training.TrainSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.training.TrainSpec\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "hooks"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-warm-start-settings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-warm-start-settings.pbtxt
index 43f5343359aff3b856a2b3708e4cda7cec29e146..dca2c1fe11764b6b2023e7c1c33a3e190706c08b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-warm-start-settings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-warm-start-settings.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.WarmStartSettings"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.estimator.WarmStartSettings\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.WarmStartSettings\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.WarmStartSettings\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.WarmStartSettings\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "ckpt_to_initialize_from"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a2a01cd5325ba7e02d9b549293dd09a4a57e167
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.experimental.InMemoryEvaluatorHook"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.hooks.hooks.InMemoryEvaluatorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'estimator\', \'input_fn\', \'steps\', \'hooks\', \'name\', \'every_n_iter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'100\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-linear-s-d-c-a.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-linear-s-d-c-a.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85292e4d7ed4e448c40186ba05ef12c351068a39
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-linear-s-d-c-a.pbtxt
@@ -0,0 +1,13 @@
+path: "tensorflow.estimator.experimental.LinearSDCA"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearSDCA\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'example_id_column\', \'num_loss_partitions\', \'num_table_shards\', \'symmetric_l1_regularization\', \'symmetric_l2_regularization\', \'adaptive\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'0.0\', \'1.0\', \'False\'], "
+  }
+  member_method {
+    name: "get_train_step"
+    argspec: "args=[\'self\', \'state_manager\', \'weight_column_name\', \'loss_type\', \'feature_columns\', \'features\', \'targets\', \'bias_var\', \'global_step\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f0fd7ce782db71ff5e790fe50e93556bf5d19e1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
@@ -0,0 +1,39 @@
+path: "tensorflow.estimator.experimental"
+tf_module {
+  member {
+    name: "InMemoryEvaluatorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LinearSDCA"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "build_raw_supervised_input_receiver_fn"
+    argspec: "args=[\'features\', \'labels\', \'default_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_logit_fn"
+    argspec: "args=[\'logit_fn\', \'features\', \'mode\', \'params\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "dnn_logit_fn_builder"
+    argspec: "args=[\'units\', \'hidden_units\', \'feature_columns\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'batch_norm\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "linear_logit_fn_builder"
+    argspec: "args=[\'units\', \'feature_columns\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'sum\'], "
+  }
+  member_method {
+    name: "make_early_stopping_hook"
+    argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], "
+  }
+  member_method {
+    name: "make_stop_at_checkpoint_step_hook"
+    argspec: "args=[\'estimator\', \'last_step\', \'wait_after_file_check_secs\'], varargs=None, keywords=None, defaults=[\'30\'], "
+  }
+  member_method {
+    name: "stop_if_higher_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
index 3cf7af8da95479cf49469b2f328db0919fd5ce95..820afac8e10a7ceeefb351896077c7cf2af044d7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ClassificationOutput.__metaclass__"
 tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
+  is_instance: "<type \'type\'>"
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
index 2df1840c4a4f03fc08ba535b4f6557d49608fa5f..52874dd9b9316d9815c5aef51e272e6ffddb5224 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ClassificationOutput"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ClassificationOutput\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ClassificationOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
index 5d165ccbf91865e48f40f88ff817bff03881a03b..b811e1f3dab9437ee53f6bd8fb7215b35b121f9e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput.__metaclass__"
 tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
+  is_instance: "<type \'type\'>"
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
index fa62e8ced801d66951ef5a62ec4fdd9795226ebd..964c315e9730effac38d60f7242527e71cbf9846 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
index 743495ba98cf4db0abeba86e26b812d9e3c8695b..bdfcb9c8882f334dc2706d079918855d651484c1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.PredictOutput.__metaclass__"
 tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
+  is_instance: "<type \'type\'>"
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
index e0160b10ce13a0b3499143d151ee7e58ad858fb2..bb82bc9e58627318b897f0610c7d852db7f98c07 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.PredictOutput"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.PredictOutput\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.PredictOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "outputs"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
index dbf4e3dec85d7d00045bfe4e7086ba23edf61a84..dcd7cbf427e3d0219a8aa94621f9502fffc10ca6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.RegressionOutput.__metaclass__"
 tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
+  is_instance: "<type \'type\'>"
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
index 905f0e055350fe9a7d5790e531fb2b089332f279..8522834433f214e5d646ef6265b1047fb7f2cc4f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.RegressionOutput"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.RegressionOutput\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.RegressionOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "value"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-serving-input-receiver.pbtxt
index d71b2a430065740c376f8e90e3244d105ac2101f..a0371a16635161a69998a1901aec8f8962f98fd7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-serving-input-receiver.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-serving-input-receiver.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ServingInputReceiver"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export.ServingInputReceiver\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export.ServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export.ServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export.ServingInputReceiver\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "features"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
index 4fe92643bf9867765499d7bf475b9cdd1686aec5..da9d05df237397c4d0fa0746a6a4e835c5d42b0e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.TensorServingInputReceiver"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export.TensorServingInputReceiver\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export.TensorServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export.TensorServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export.TensorServingInputReceiver\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "features"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.pbtxt
index bd72f6cd79f7dffb9f0a7f8ae43751c4ecba939d..8df585a5d9b401d553652fa8168445730eb145ff 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.pbtxt
@@ -2,19 +2,19 @@ path: "tensorflow.estimator.export"
 tf_module {
   member {
     name: "ClassificationOutput"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "ExportOutput"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "PredictOutput"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "RegressionOutput"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "ServingInputReceiver"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
index f1d204a3ef96f35e31f642bcb0a61351b263d273..d3656ae0455971ccd98062a52ec0412bf6af06f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "BaselineClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BaselineEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "BaselineRegressor"
     mtype: "<type \'type\'>"
@@ -20,14 +24,30 @@ tf_module {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CheckpointSaverHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CheckpointSaverListener"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DNNEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNLinearCombinedClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DNNLinearCombinedEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNLinearCombinedRegressor"
     mtype: "<type \'type\'>"
@@ -52,10 +72,22 @@ tf_module {
     name: "Exporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FeedFnHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FinalExporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FinalOpsHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlobalStepWaiterHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LatestExporter"
     mtype: "<type \'type\'>"
@@ -64,18 +96,54 @@ tf_module {
     name: "LinearClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LoggingTensorHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ModeKeys"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NanLossDuringTrainingError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NanTensorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ProfilerHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RunConfig"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SecondOrStepTimer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StepCounterHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StopAtStepHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SummarySaverHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrainSpec"
     mtype: "<type \'type\'>"
@@ -88,6 +156,10 @@ tf_module {
     name: "WarmStartSettings"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "export"
     mtype: "<type \'module\'>"
@@ -96,6 +168,10 @@ tf_module {
     name: "inputs"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "add_metrics"
+    argspec: "args=[\'estimator\', \'metric_fn\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "classifier_parse_example_spec"
     argspec: "args=[\'feature_columns\', \'label_key\', \'label_dtype\', \'label_default\', \'weight_column\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c3f04e468c4c817cd474deb42149aee3021aa43
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.experimental"
+tf_module {
+  member_method {
+    name: "function_executor_type"
+    argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
index 0a231f1b65155b8662bb38943bfd97c5283b9385..15d0e099bab3052553671d52d396239b27383a8d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
@@ -172,6 +172,10 @@ tf_module {
     name: "random_saturation"
     argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "resize"
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
+  }
   member_method {
     name: "resize_area"
     argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -240,6 +244,10 @@ tf_module {
     name: "total_variation"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "transpose"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "transpose_image"
     argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cfa3372b12bfe32eed4311c89b6448c0359c0913
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.io.gfile"
+tf_module {
+  member_method {
+    name: "copy"
+    argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "exists"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "glob"
+    argspec: "args=[\'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "isdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "listdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "makedirs"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mkdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "remove"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rename"
+    argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "rmtree"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "stat"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "walk"
+    argspec: "args=[\'top\', \'topdown\', \'onerror\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
index dccf136788da44073160931707167b7d8baa0add..b760ec38906e7ae23445857b46e9941a418a5c29 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
@@ -44,10 +44,22 @@ tf_module {
     name: "VarLenFeature"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "gfile"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "decode_and_crop_jpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
   member_method {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_bmp"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
   member_method {
     name: "decode_compressed"
     argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
@@ -56,10 +68,26 @@ tf_module {
     name: "decode_csv"
     argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
   }
+  member_method {
+    name: "decode_gif"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "decode_image"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "decode_jpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
   member_method {
     name: "decode_json_example"
     argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_png"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
   member_method {
     name: "decode_raw"
     argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
@@ -72,6 +100,18 @@ tf_module {
     name: "encode_base64"
     argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "encode_jpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'95\', \'False\', \'False\', \'True\', \'in\', \'300\', \'300\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "extract_jpeg_shape"
+    argspec: "args=[\'contents\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "is_jpeg"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "match_filenames_once"
     argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -112,6 +152,10 @@ tf_module {
     name: "serialize_sparse"
     argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
   }
+  member_method {
+    name: "serialize_tensor"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "tf_record_iterator"
     argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index 0869de02432d1ed5c492641b3d5a3ddfb786d554..a3254cbd947d9ef70617131e9f4b17f44f059840 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -89,10 +101,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -107,7 +115,11 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "add_update"
@@ -115,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -229,6 +241,10 @@ tf_class {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -251,7 +267,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -263,6 +279,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index 20f39fae1eb70da00cfcc3b6ce0d2abb79189228..b70e9ee98d5bc4900420ddb1307abf9adcd8cad0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -90,10 +102,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -112,7 +120,11 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "add_update"
@@ -120,7 +132,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -246,6 +258,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -268,7 +284,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -280,6 +296,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt
index 2e9de9ebb21021ab82ed4409243e13db49d7327c..eb315e356dabc5a404740afe9d3c0c60d82fdbb5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.activations.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "elu"
     argspec: "args=[\'x\', \'alpha\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
   }
+  member_method {
+    name: "exponential"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get"
     argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
index a71a59e269fa6e24b01b81bb222fef528869db68..8cd0c6ea5f027fa1f30b60a742450b651242d406 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
@@ -46,7 +46,7 @@ tf_module {
   }
   member_method {
     name: "batch_normalization"
-    argspec: "args=[\'x\', \'mean\', \'var\', \'beta\', \'gamma\', \'epsilon\'], varargs=None, keywords=None, defaults=[\'0.001\'], "
+    argspec: "args=[\'x\', \'mean\', \'var\', \'beta\', \'gamma\', \'axis\', \'epsilon\'], varargs=None, keywords=None, defaults=[\'-1\', \'0.001\'], "
   }
   member_method {
     name: "batch_set_value"
@@ -98,7 +98,7 @@ tf_module {
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'x\', \'kernel\', \'output_shape\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'None\'], "
+    argspec: "args=[\'x\', \'kernel\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\'], "
   }
   member_method {
     name: "conv3d"
@@ -182,7 +182,7 @@ tf_module {
   }
   member_method {
     name: "function"
-    argspec: "args=[\'inputs\', \'outputs\', \'updates\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
+    argspec: "args=[\'inputs\', \'outputs\', \'updates\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "gather"
@@ -386,7 +386,7 @@ tf_module {
   }
   member_method {
     name: "resize_images"
-    argspec: "args=[\'x\', \'height_factor\', \'width_factor\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'x\', \'height_factor\', \'width_factor\', \'data_format\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'nearest\'], "
   }
   member_method {
     name: "resize_volumes"
@@ -398,7 +398,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\', \'zero_output_for_mask\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\', \'False\'], "
   }
   member_method {
     name: "round"
@@ -512,6 +512,10 @@ tf_module {
     name: "temporal_padding"
     argspec: "args=[\'x\', \'padding\'], varargs=None, keywords=None, defaults=[\'(1, 1)\'], "
   }
+  member_method {
+    name: "tile"
+    argspec: "args=[\'x\', \'n\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "to_dense"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
index 9eee9b378964a9947b067b7ec495ef6556ab6d0c..7d298e95135ebf41230d72ff488fef30be682edb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index 5bb949c5bb650acee91b14a4d6bf95b36029edf7..133205ab88b47afad32fc70ceca93513768a3b19 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
index a5340d52c1af6d69da30fd710bcee9d832917574..d766c09ac5efaa9d0e4ffba4e495385130c7e770 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
@@ -22,6 +22,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
index f71292856cd29b2e52194bec8a586686fbfad667..605f74e5602a63f5a18c31cb26113d300ec76e7a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -5,7 +5,11 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'monitor\', \'min_delta\', \'patience\', \'verbose\', \'mode\', \'baseline\'], varargs=None, keywords=None, defaults=[\'val_loss\', \'0\', \'0\', \'0\', \'auto\', \'None\'], "
+    argspec: "args=[\'self\', \'monitor\', \'min_delta\', \'patience\', \'verbose\', \'mode\', \'baseline\', \'restore_best_weights\'], varargs=None, keywords=None, defaults=[\'val_loss\', \'0\', \'0\', \'0\', \'auto\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_monitor_value"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "on_batch_begin"
@@ -23,6 +27,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
index ee400b31c43829efba156298d5ee807cdafc8a98..cd893e67269164781d6a6b6294a199014d40fed8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index df8d7b0ef7afca17338a26388c38827b5b306f95..50f2054cabb1b8f6c46a9537ea923a18f87e5c80 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index ce1a9b694d8708720e0eb677afd25607c6262e9c..9ed9db0a89b49b88098e15baca414ff78b6f10e6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index 48bb24a05274addca03f11acef99607f78b92e51..3d8d1363bb4e4de818788efbf3c997594350006a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
index d8bb8b2a7d0f491c7ec2b30096a1acaf04681a56..5012f1517d57dd646d82ab669cb279b6363dd6ec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index dc27af9552a88650261b4f0694ea0265e6bda05c..73652c2b61259f768eca76b995ae4592df868392 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -27,6 +27,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index 5a3b791c0adc0d61129d38b2995ee9077cf0988b..24db71de1182d58b78fec0419aa9cb48a2e315d2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
index e58ba18c1c0d06df3a53d93ae18f5bf0931df329..c5503c69a5f3cb6765c984778c0e3626369ee815 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'batch_size\', \'write_graph\', \'write_grads\', \'write_images\', \'embeddings_freq\', \'embeddings_layer_names\', \'embeddings_metadata\', \'embeddings_data\'], varargs=None, keywords=None, defaults=[\'./logs\', \'0\', \'32\', \'True\', \'False\', \'False\', \'0\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'batch_size\', \'write_graph\', \'write_grads\', \'write_images\', \'embeddings_freq\', \'embeddings_layer_names\', \'embeddings_metadata\', \'embeddings_data\', \'update_freq\'], varargs=None, keywords=None, defaults=[\'./logs\', \'0\', \'32\', \'True\', \'False\', \'False\', \'0\', \'None\', \'None\', \'None\', \'epoch\'], "
   }
   member_method {
     name: "on_batch_begin"
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index 5c2d336353aee7fc98b45620adac4f4bcda05ea0..de6e8ef072558e6d926ea125aa5056e3c229d37f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d814b2c8b553f1b2a07f9d9b97dc70ec0674969
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.experimental.PeepholeLSTMCell"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..164edbd66ab2487a980155eabcf18ed8446e2c14
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.experimental"
+tf_module {
+  member {
+    name: "PeepholeLSTMCell"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index 5510465d7b015e4989472b06c9d00ec9772373cf..b84629540e700f242f885064c92309c294693a11 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index 38ec8a0aff0b9321f3a7ab2cfd9e6b75a8228e4a..5918a13ad8629582829049485e896688ecad9579 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index 41cb8e30bfb57068ebe787f14f69ccc467047f26..599da06427dfe4f28e757a7aac8d8a14856a4556 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 9a7aaa8e961528aa750248e02f44403cab10a413..f9ff1538c8134d96051ad81d35c73e59c6a8cc57 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index c3dd2ad046ec087fd12553a2bb5243939c995e64..723fc9cdb0d0ad93470e22fd8c147d3ecc92af91 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index cc303bf7b98bb81cb0646fc18df0a4c5c70f1917..957ce2f0ce86f8df3eb8b57606229fb661eb52f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 628447ce3555628b651536d6c5b2a7716d59085c..a52c0af68175420dc2a1993d1f025d36705538e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index f03c986c22210906ad7bdc8b880753469b31aa1b..a004db62ddcaaae02a411d8db51f4026ece1384d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index c440604aae62b1ee1c7b7c0b5976ef509af54a7c..44f83d1387cb2ec681f50f7b1f0297f3f74594ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a01eaf8a12626257e97d135f50c06c7ea32fca27..8378faf7188ec594865d4b68c8ea8cae284183ca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 0d6698f2ef4c674bf8a4dfc026eb209a83dcb8e7..9d5655c9644e3a2394a346bed78fc478cf60ba8d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index f1b23be48f7fec2051f1985381058d769eb8c2f8..b3d3c84f92e6491601f670739b2b45f79313e8f5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV1\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
@@ -88,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index 0672cd5b7b8fdb1967e39c9163635372f73459b7..d37a6b47105225d7b83b6a264b944ceeb583a6c4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -97,6 +97,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index b25ae1e82e8a1f315553337a261a2d8a46301fa0..1ad7a91be0ba48d0dbab19da8c7cd9ca89095918 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index bb1918eba65659d9ede888400c24b3a5121d6052..cb9abc25396bb63a3c40de5cc52f9df7ed20071e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -178,6 +178,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index 16e0fd5a3131723b3ba3ef3ae6d93fa6426dbd47..47dba1d81f8f97a60fe72ec521f82a78ee5f3505 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 065bb4d35b422ca5ddaceec5726dd0e0bdb7027c..fd649418961301f150aac3dabc1bdf0ade4a9c28 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -84,12 +84,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 543bae6fa96fa3ae51775e865bf95ea6f79c8e94..1b1425d53197db8b59abf51fe93c0b0c45299956 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index c7ba6056f9683badbbf3423faa98277a57d4cc45..1741063fe8b09acf3865e0a135e96bb715dcdcfa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -84,12 +84,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index 072943dc2c709a7cee26c3439e02e11455187282..50feb4f458ad1a9cb2b2bfe5d67997b7551eed74 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index 222a1ef4fc5d19afe2c111c169c2f0bd38c331d6..faaa535df9fe03ad07862f0793f8ebea67b405ca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 8f4f7918ab3eb8f73751e6142d5a1ceadd37a6e2..4079329d1ee2a61270fee38426bb8a0859c38ce3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -84,12 +84,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index f93906717814d4df7dfbf983d6cdbef358e9a55c..32e56696e1617f7810792e3416a2ebb2037d23c2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 93c442bd55ace0f55fce81fd14e7f05cb13ea3cf..381abe73401fa3a588873d643324fc020c159e30 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -84,12 +84,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index 471b18ef8500a279fb07bc893e2c8100d76d7bf1..b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index 0f250a09b7eb69871e7e89d30da817aeb1d896fc..7aeff8003c322e8a8168dd70481a8b30b08762a8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index f52128483c67321e4f0e5f0cf5a9fd3c65794561..a1728d9d4f9a1e677646db04c4d0df9572e21208 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index 98daf3bab128357ffdde2e8ffa4f61fd5c6493f7..8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index 64e7a9046b0852bd44119c4711ef1e3627346aa8..7758209adf8fe7a1306fa5ef125935dafd925c3e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -128,7 +132,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 6fdffef776827f64eafaa914c1ba3938e124c816..7c463ff1257599366be049edce6cc06140906286 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -128,7 +132,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index 3ac3825759391b7ea21fd6e3b3b149bb9e731479..4960d0264e96e872ea5c49a8841cef20bd5eb37c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 280ec8c25fabe1be63c9aa9a2c7f168315c219d7..8fad7535f882718462a11e27e75732e3097cb87d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index 560f66f9c7a1f7e42e27c739a6c71671f8bd147b..5b425f2d4d7a8a897280490e26922766d8bf7065 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index c0543529c3884f20383911f32ea04c07fec4a050..f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index 04eb2824b9b14cf45eaef263282ffc6778bf709d..82b761fc1761bb3e7638f7a80bc80c6433162d04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index f400432915f8ce892a3297a23078f140eb96db7b..c9ff323877e06b6dff274644744d425e3a9b7932 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index ab176b441a246d93b88c00cd6decb34af175ad86..9b4165d4cbf88fefd2bb684dae70ea8afc01357b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index c3895a0ac127bc663f2a323661c1371a428159b0..f225f7c4309615919fb05df05f2ae664bde80097 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index a0fe598ab93a4e9712a1ef631283e8e552ab1e64..855d001700179fb634d1dff78585d340420abe7f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -179,7 +183,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -191,7 +195,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 55e0d7ef023ac4ca5e89f640c5ebb79199c31afa..2c404c99cd2175cdc8b60b229e4410bf280ebcb7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 38fbff5e4a3d2c892b0601c54e52690dae5760bd..6f109d59d0f6fcd2b4650719e3b4f653baec7d23 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 5ea61d118de15b1b18410abb3befe404a6ecaecd..69f8a9031d32eb73bb44291cdf330d738d745cf9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -111,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 929f48df23180a2c5e21c110e0e1d343596ecd76..4299f765e525b136e289bba169becec06e19ffb1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 2e6d59337f1df94e327b506248eb74ab11bd6013..9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 11dca17c6df94170f442a88da0c4459caa70d0c1..625e81fd2322ceba153fa65c138948ce43843089 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -111,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 4e3e258430cdacaf55aed5d46411d2b74c9bdf2e..2fc769742c70c5665c9cb77ad246fcdb49366d5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index fb9166316f6a641eb12a5664100e31d652148a84..e307a65c7c565660e1f2b6b6b74dc5970425eaa4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 278429af6febdfb9802d86992a1e46bf17633562..4394ad0364e89fd3531d6625e52540991cadf973 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 87b7f6797a0d5bef8c5a4ff582c30433eaced2d4..050ed39fe98dc7cfdf6febe45e235d3ae7cbf486 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 98bf96fa0c251c5f6de8878d48e651ac3346ff38..436191821ef4689351b6124cf2a20afad917e4ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 935a69ab2f3a93db608f6e18baa7359944a428a8..4ba540aa6adc72b572aa9340f89967d69ab78a3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index c9d4158d1c434655abb11b92269e6e70ad2d1f91..a2e9322cb3fd4e56af708d5c4e17b660f7bc2247 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 9953102ff991bfd4f0568120dd7aef07f75ea208..5d16a57fc1aeff9939220de8043fcae39e3d953e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index 2617f5a95fa631cf0b92e1fd2feef7457f96fd80..9dd29c1251ef2eacaf535a3f10f3d42dc36624a2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
index 5fd0a47a68c0d4ad218c4c64cc6be8f603d9673a..bc3ceb67a4e7506b42fccd6b227891b9eef8147f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_spec.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index e9f6ef45aaf1c775ea1b8dd157737f65c87e232f..0045d5775e2c19df21428bd4420b6e5612c8002b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index ecdbf48157f5c4aabab065cc99191b1cd6cf57f0..529c750f98715ec30313ed34c9023a845061a3df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -179,7 +183,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -191,7 +195,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index 2e0b6bac24fd63988b28c1099d40581989b783df..d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index 1e93d1118a4d306d5427d9b6873de1746d93b764..e1f5491180903f7d6931cc09755cabb715bbf233 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -87,6 +87,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bfd36012a7edb8a74198a87a86577278be3fdcd4..9b69d9a9447f42907236b5cc8c7672012f96c38a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 5ad5990d7e624c4f6b1dde92b4608c65aeb19db1..fd52259432577ac94dc702d4411ad5c0eed1ff10 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 40d03369a5235f394832e3e2f48710bb069e9aac..5fc8af0d03564c649dff6e9df70d10731319de40 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index 86666b51bb8c8dc22deb95f05cb9edfb10688015..7f8932270e63bc02852c5b64e53694e7e26be08b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 238d96cca62e6e8dc2de2b527dd8a80644ff32fa..4723b99cb0792e1ce0bdc45e46908da8c2b5359c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 85f23df671d2772995ec01bb09e191237d60e6a7..173c5d4a8b149c4e23683cf375e8d793db7faa5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 235806b96500473fe95dd1b25aafe7f091bdb36b..14e1899e145224e411d65cbf481060a3b2cec0f1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 4a45bf7997d819140d1c19907535ef2b2d818db9..a708e652bf0e82dea0f58034a81a040a39550dc9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index fda2562fc8c51623f5c4b33e23319ed35229905e..e6706b5cf9f32bda78adc4e2db5916a5750cc82e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index 71d2d09a8d1d7addf91d7dc4ca109f8c2d45aed9..a73c082d1bba0453b742f76bacf0ad6116ba79a7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index 12949b39a6f7affa657d1dccdc49ad0dc37e9c2f..f3f195554bbf4a43efaf2af0fd278a23bf270994 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index ab16d0021e627e6a2a821a0185ad71eb5bef1835..f345d1d67b2ce0200c64b1aeea5f39821d070bac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index 61ccbf5962791ee1c0b35cc4aba422ff5cacd456..31cb8bc177c7a9e365101e75108a29900fbda124 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index ce2320d7030d05ba1e065f5bbcf8a18014891b5e..44cccc92bd2f1ff0335c22f2967865dc88a96ff7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index 69848af8cf876ad1232a0bf7c419f52ed68af9f0..b55e191ff1ad6997550966bbb6154a81a489575d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index 68b6678d4887323f43bd0a886a76e02a056a4260..e9575436e5b14ac8c52a0b59c86937886eab5f40 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -110,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -122,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index 413f45f018ae0ce9ccf0e459b24d544c456e4c7c..98223b207f2ecfd5b7af8a53390166e53a7d4f73 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index 9c61ff602744c00f9105a3f297151b49a8a3dead..2df918b16b2552323d75083bfa80e328c0639cfe 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index baa91804c49f86a31093aed0c0a56613f7c1afee..ce5f9e21290eeddc0052257191ac4a6d068c1366 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 15a5d6ac9ea6e087dc0d76a2ab48b08448bfb6ee..a0bb917775fd9edb5d909bf850310e0596a88209 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index be43bd5b3c13632711a49cbbe6c85527d46d46ec..d7942f201bdbfa8d1577813be461a5905b5c6c90 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 6105992c7a3a92d00718fe3287412af3c752db1d..f7ac9042d46f46ab35d18c62e5d8841679a18ca9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 1b6cf1e9ecb08a789212da141971434bd63988a6..e5a92688220f6e227b317d71a70fde01df4c432b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 29488a37f8f29f953d2b8b7e447c331df3244c84..0fe2c974a762784a82a6b97e116357be2a61d84f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 182efb83b8621b86672d909ca9929380fad2e1dd..2ee5873f0f11688019dec3a6cd69db06d99b9caa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -149,6 +149,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -167,7 +171,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -179,7 +183,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index d29731ecf9d5387a324104865af5f563d287c60b..5b8f64aa35725d0ea44fc5c5b81952fd839503e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index a6d7494ca7d2230298a442b86766f46bc58a6d54..240cb6e562f77467d94ef95db2374150e318bc04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index c36e802693df564702100a652f3ccc2e95e4c40d..6226c469f8a534f96f6ea991fa5e7d2cf0019e3f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9c46cfe40fd6959b526d6ca271bda3182daa1188..34dabce6d8dd0b1b6fe50a008a981e1f06a77edf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 8982f787940dd65291580781b5dc95941d804071..0ddf628ace582db259ebe0b211aba6e6362b5d5b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index ec2cc502984d302b243803b04b4f9d60cee43d05..12eb35ad154a514afd9c900cb2dbece8af28c49f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index d7bc1980f32e523781a68e80312905bc355f0509..c41020c2b45cc88c9b63f3b7a45c35066794dfe2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index fec2de6b49ec1ffaf45b9ee9048bcce37425e919..479f89cf6ae93e8d6ae02e304a51a145164df7de 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -93,6 +93,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 3d285e7f17db3e8cdfbacf0056a4c56ffa7e67cb..233363ce02614f184b43a059889c7475b6a8c50b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index 40a56a0c948887493a8a4782f122c634da58aeb1..cb6228ac446bd236df88f94eb6e9e717ea38463d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -82,12 +82,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'size\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'(2, 2)\', \'None\'], "
+    argspec: "args=[\'self\', \'size\', \'data_format\', \'interpolation\'], varargs=None, keywords=kwargs, defaults=[\'(2, 2)\', \'None\', \'nearest\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 728eca415a80842291d5684e55632689ceea4099..03bad3ccb613a225ad56e128ea680fc9312151e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index da64e77c39c0e116ff725bb05526882541dd6056..158996792a47fab0e7aa26d21d4bb7f281ca76d2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 2f505f9293f429490543ba2c569668f4b2ba3ca4..63a56cd3eebe271f66258c9a0acb974764555b34 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index f82c77072e6969dd57f89f4a971e59e28b4bfc63..965a4cca04651e123c5bd93484200a58b39918ba 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 54e01a99177cde5fbfaf5e1e0ac310bef3ea8eae..1a624308878a68f1b48cb0f8b5e08dafbbfa0333 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f7da93f6f412ca559aec2f6acde2b80a5c93c86
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3a7cd80973259bd5cdfe382c656a9478f8933d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..712bb2ecd3526c354cbcf640e689526b2e415a13
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7fe362da89b47a925cd4708909e1c882a9a23aca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a5718533500d9508c558d25d13fc6b61518a73a0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..200006db355ca4dc8eb2f509bcb9da7543145548
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
index eca6b915388ebff0103f7ad16f43c6be0df60b7d..9e26ddbdca0c45df195dd566952379887dcfcff3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
@@ -1,5 +1,29 @@
 path: "tensorflow.keras.losses"
 tf_module {
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -22,11 +46,11 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_hinge"
@@ -106,7 +130,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2db07df5235e150f691a12d6b332c6d0d241ac19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.Accuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..904ad3a21a05895b23e30dab82a89a31c74dcfca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.BinaryAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17b74924fab4f596a010d6b9731b474433a8153e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.CategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49f577e1367aece126449923f77f4f6c89493e99
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.FalseNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalseNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8baf858669a446a11b44e044f36bfde61e440bb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.FalsePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalsePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40fe64bbd2cec45b9a8c4e9b041d3fa858af1327
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Mean"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ae6a85026da80cd071984aede8d0ec4e9cd571c5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Precision"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31068a51d510a7b95f62f61f03d37176c0fca55d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Recall"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa77d1972cea42184fbbdb91e117b08ba38328fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.SensitivityAtSpecificity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivityAtSpecificity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'specificity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c17452292a031d42f3da0d5844e99d1272dad25
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.SparseCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67857aa89f1769c736d810cf5f73739021afeddf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.SpecificityAtSensitivity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SpecificityAtSensitivity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'sensitivity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b5eb8d0de53960c3a98409119709c1307aa6379
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.TrueNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TrueNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b9c470e32d7e038f9ba11e4f96ab6eaa6b60a87
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.TruePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TruePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
index a296e131586504a3fadc9e6fe54079ee0f8270ba..905021dd790205e64a6f9839218200db98941927 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,57 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "Accuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalseNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalsePositives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Mean"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Precision"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Recall"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SensitivityAtSpecificity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SpecificityAtSensitivity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TrueNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TruePositives"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -26,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -34,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cosine"
@@ -110,7 +162,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index 4011719317051a2f153216591e4c571e8b0b2c15..c58c7bef22dd4bff95d8ff07a10e20bb1bc463ad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -89,10 +101,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -107,7 +115,11 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "add_update"
@@ -115,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -229,6 +241,10 @@ tf_class {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -251,7 +267,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -263,6 +279,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index 8a12ac1ad8d3267adf503d909b7f47d3c513a64f..473a1c16fb1edfbf37a7752e273566c1310853af 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -90,10 +102,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -112,7 +120,11 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "add_update"
@@ -120,7 +132,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -246,6 +258,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -268,7 +284,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -280,6 +296,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt
index 754b3b84b08b08c7d12eba4ddad0a483440055a9..ed9967856200d62fd152dfec85c8ec36403bcbc0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.pbtxt
@@ -36,6 +36,10 @@ tf_module {
     name: "estimator"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "initializers"
     mtype: "<type \'module\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-generator-enqueuer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-generator-enqueuer.pbtxt
index 939fd547d06bbd03b7e1a1db1404263ff01fd07c..6f5ad2dc963961a6ac7be7656cce4aeb77815e0b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-generator-enqueuer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-generator-enqueuer.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'generator\', \'use_multiprocessing\', \'wait_time\', \'seed\'], varargs=None, keywords=None, defaults=[\'False\', \'0.05\', \'None\'], "
+    argspec: "args=[\'self\', \'sequence\', \'use_multiprocessing\', \'random_seed\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "get"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt
index be4496e753f8bdcd76a4761f9bd1804a77380359..8177cc71ed34ed5d0ae57d25ee2da70067411ccc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\', \'stateful_metrics\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\', \'None\'], "
+    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\', \'stateful_metrics\', \'unit_name\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\', \'None\', \'step\'], "
   }
   member_method {
     name: "add"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-sequence-enqueuer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
index a9e499d1009b5a7458080db6c10a948af21c7b6c..aa36d66f9215f1c61f539af25378e71b079b02e0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
+    argspec: "args=[\'self\', \'sequence\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "get"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
index 81b91d2780faa6b8ee61fc1201d7ecaf17967b09..138d97b11f23873f98e6bbfc5d0402dc65fd98b3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.pbtxt
@@ -70,6 +70,6 @@ tf_module {
   }
   member_method {
     name: "to_categorical"
-    argspec: "args=[\'y\', \'num_classes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'y\', \'num_classes\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'float32\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
index c82e67526b21696a7d56517dc2cb6998882dc7a5..059c91f724aae187055f8323c7748dc99f153302 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
index 1d031cb5f8461145127b0f13d77e6b8774f5a0b3..d06c8e81ee5d2a8b487d7c3c3714a1f4ed2c8e80 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
index a8dda6655df1d06ca77b74f0a992c8fd7e7a357d..6be8e7c210f3f0a28ed8ad8a6672bc4323eb7f9d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
index 97f65ed89436bd0b4027bb0cbeb80b6f1419269c..16d9ecce10cfb3c28cd1cf47fd65c987680bda41 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV1\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
@@ -98,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
index ccd9578f0d62bd70ea252ddeac587d59c926b018..21c695935ce7751df67e09091c961e9e0cfbbf7c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
index 9cbb58d721bb49bde562a57728a9ee46968e611e..f24d0307207588610c1f764bf43912b64c3ea2c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
index c75ea3911e17bc879d140068ef54521effd2824e..0a510ece355435d8e75e39d5f7cdc6cebefe32cf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
index 5dc834e5141e58d255357e02d7446a06e6e2aa45..d0ee44bed3c739da27cc83f0e643e1ea9dd98078 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
index 96ab209874ac14d6acf2e8115e7f04fc35c4b2bd..546de3cdab3aa0519450f74c6c6d0fe74ddc000c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
index 7e9656b3525c1d53940b869607616ff414a466cf..3ad311581eba815c2d1b0155a1380db80dd61c5d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
index e9a2269a6e8de1f9a12f1b54d2e6dced3d4f8902..9b83271350cf90a2d430303dfecfd28facad272b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
index 7d2eaaab2a8cb9159214a16ba65473d0b6870ac4..87a7fb3d843e3e8e3e2fe5a56ec0b181355a6d7b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
index fd02c919aeb5a536bd052324618983af699e7c47..80834e08f7ada08f02c660017ae0b735bb31e20e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_spec.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
index 8bc3eb26e9ca0bf0f129db336b7ca23466fd036f..32b17e90ade7aa0054a390256e3abadfc7011cbe 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
index 6a0dcce56ac0184ffe995662fd62b89e16257a29..643c469717c258207046ddd93a318f47753de46b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
index b6c84edf2a2f86240369b4053cd7351d0b59442d..434e25adc12c2f2f704b07087b8552781ac2d024 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
index 062a02fa590537b9efbf540a874eeaa6d36697f3..089fc6f9243c85937500b6275da034eb0748ecd4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
index eaad0fb23ef7501c8c5b7acee6a9677665b7057f..bc3d58b9ca9789b43bc91f9283a81811f2b6a4e9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
index ece28a8ce962d8fafb3f7a397a814b903e915d48..fe7d71af3a4a46bed4ea9e62cbd7ad17987517c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea17901f8b2c9ff5f83e0c90b5c78d3788e7f16c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.layers.experimental"
+tf_module {
+  member_method {
+    name: "keras_style_scope"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_keras_style"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
index 0c24e9c7ddb2849732241c718bd08d31fe418e8c..5feedd2a8e283af7e21f520ce94164e4d2f07d76 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.pbtxt
@@ -76,6 +76,10 @@ tf_module {
     name: "SeparableConv2D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "average_pooling1d"
     argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
deleted file mode 100644
index b6dee6317604363275a128fe8d83aaa9473a257a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorBlockDiag.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 973705dae2fabbef0eafb38ad12e96c747aeee27..773c74e64d13ca4a840b7f599fc2cbe9c161cd03 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
deleted file mode 100644
index 3b33f3da97ec2ecb3f94e8bc309be2519fc79c62..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorCirculant.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
index de917706d55214cc59f3205f0778d600a356a5b1..533544d21f2753f785113a30518f4fcbcff96cd7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
deleted file mode 100644
index 591bc9631a1d8ecbbd6e133b99c67e432399d73f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorCirculant2D.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index c4e6a21c3ac9324f5dd445dc65415c2abb4c6e9f..e3926eb6d4714731d09ff9c5b75a89830c06e7c1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
deleted file mode 100644
index d643139a53fc501fe2997a2b9f2d11c57b96f2e4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorCirculant3D.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 2e085a8e289e21173789041efb9254e992bd723b..ba209df7824a9cc076499458e35acd7dcf1eaf35 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
deleted file mode 100644
index 1adbcb41adfac33acfdb415662ced7992e21385e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorComposition.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
index 42d22bce42d8850a784afae3f67771ef1cfe5403..081fb0e08bcd1b35ab44459d1c8eb0857dd14956 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
deleted file mode 100644
index 023d90ccdba8a8739a11f4691d33b7087bedcc0b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorDiag.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
index d6749fdcec69425e83a044409ec695d2661f782e..2014a04301618c20af5cf6f1144eb4dbda2479e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
deleted file mode 100644
index 381072e76c4d069ebf51fec44079b30f17cafc06..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorFullMatrix.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index d9f363d1336210623536e8293a6290d9ebfc2fe1..9a87ae9687741090485bd8d4d0d07d359a2015e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
deleted file mode 100644
index 5d115b35fb79cbc176a9e8a9bf1ec0f0edcc79e6..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorIdentity.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
index aac7ee31ed62c22b2e86d287d48c68c7e905fd00..33afb835ce1d524991c0024bfb87c29a72aac08e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -76,6 +76,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
deleted file mode 100644
index 5c6784dd02104129a9ac38fe171d87c115efbbf0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorKronecker.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index c11d39082939eda4520b3955b767022bd485b5be..a9078c8ab5cca078237a29febabdbbd4a8b6c89c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
deleted file mode 100644
index 1f0d33298a252a8b3da6eea9fd4bc096e8dd6745..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorLowRankUpdate.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 3ee800269e617390c25248a2c847cbe259b18e79..4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
deleted file mode 100644
index 2683430f4fc5d96d63c5b6fdb4035d6e5e8ba609..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorLowerTriangular.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index 63a1bc2321e35645700778c5906d1b8659eb4a32..a87649133fd207ad59f2124c6b0b5aa44916e5a5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
deleted file mode 100644
index 38bf7ad586a063046f260aca9b1c517a343c4c05..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorScaledIdentity.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index e2c5a505a7d2f9abbee5b3bb4f92ee8843198c51..32656467840fbbc0c8708ea68aac5aa75c11a540 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
deleted file mode 100644
index 49ff85728ffab559ec706691356ce071aab89083..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorZeros.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
index a1b0e06b4753488bc9fcbe9aeb0d260092745f9c..49d8890c8942bc0021886ee6c9bc4e7625452655 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
deleted file mode 100644
index 38da809b360e5ea69b4324a859ed69da679bc436..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperator.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
index 6d849dc040f61b498b100820bf7be3d4bc264bb4..c89dc067b331603e227d9d578147e2dd1ee4a900 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index 6ac95d96da8516ca762333f0ab30949d19904cd3..9f7b422fabcd55aed98bc93f01143d35698c0399 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -2,59 +2,59 @@ path: "tensorflow.linalg"
 tf_module {
   member {
     name: "LinearOperator"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorBlockDiag"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorCirculant"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorCirculant2D"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorCirculant3D"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorComposition"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorDiag"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorFullMatrix"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorIdentity"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorKronecker"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorLowRankUpdate"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorLowerTriangular"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorScaledIdentity"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorZeros"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member_method {
     name: "adjoint"
@@ -132,10 +132,18 @@ tf_module {
     name: "lstsq"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
   }
+  member_method {
+    name: "lu"
+    argspec: "args=[\'input\', \'output_idx_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
   member_method {
     name: "matmul"
     argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'adjoint_a\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "norm"
     argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
@@ -156,6 +164,10 @@ tf_module {
     name: "solve"
     argspec: "args=[\'matrix\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "sqrtm"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "svd"
     argspec: "args=[\'tensor\', \'full_matrices\', \'compute_uv\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-interpreter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-interpreter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec0d9522bca9e0a272cccb21c3acc814a7462923
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-interpreter.pbtxt
@@ -0,0 +1,49 @@
+path: "tensorflow.lite.Interpreter"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.interpreter.Interpreter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'model_path\', \'model_content\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "allocate_tensors"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_details"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_details"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_tensor"
+    argspec: "args=[\'self\', \'tensor_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_tensor_details"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "invoke"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_all_variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "resize_tensor_input"
+    argspec: "args=[\'self\', \'input_index\', \'tensor_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_tensor"
+    argspec: "args=[\'self\', \'tensor_index\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tensor"
+    argspec: "args=[\'self\', \'tensor_index\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fe179f6c1b64ebc2f7535719bc1598577ee7f03
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
@@ -0,0 +1,13 @@
+path: "tensorflow.lite.OpHint.OpHintArgumentTracker"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.op_hint.OpHintArgumentTracker\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'self\', \'arg\', \'tag\', \'name\', \'aggregate\', \'index_override\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66e692a5a379203cb491980802b7003072bfe76c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt
@@ -0,0 +1,69 @@
+path: "tensorflow.lite.OpHint"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.op_hint.OpHint\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "AGGREGATE_FIRST"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "AGGREGATE_LAST"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "AGGREGATE_STACK"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_AGGREGATE_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_INPUT_INDEX_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_NAME_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_OUTPUT_INDEX_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_SORT_INDEX_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_UUID_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "OpHintArgumentTracker"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFLITE_INPUT_INDICES"
+    mtype: "<type \'str\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'function_name\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_input"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_inputs"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_output"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_outputs"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68c651a3c9969f2f16fca39f4466cebbb44eea28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.OpsSet"
+tf_class {
+  is_instance: "<enum \'OpsSet\'>"
+  member {
+    name: "SELECT_TF_OPS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+  member {
+    name: "TFLITE_BUILTINS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-t-f-lite-converter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-t-f-lite-converter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c955b1a04a4b8af701a57ba2468145590c1a4a16
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-t-f-lite-converter.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.lite.TFLiteConverter"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.TFLiteConverter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'graph_def\', \'input_tensors\', \'output_tensors\', \'input_arrays_with_shape\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "convert"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_frozen_graph"
+    argspec: "args=[\'cls\', \'graph_def_file\', \'input_arrays\', \'output_arrays\', \'input_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_keras_model_file"
+    argspec: "args=[\'cls\', \'model_file\', \'input_arrays\', \'input_shapes\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_saved_model"
+    argspec: "args=[\'cls\', \'saved_model_dir\', \'input_arrays\', \'input_shapes\', \'output_arrays\', \'tag_set\', \'signature_key\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_session"
+    argspec: "args=[\'cls\', \'sess\', \'input_tensors\', \'output_tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_arrays"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-toco-converter.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-toco-converter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ef90b8bc4646a2adfcbeca2258ff5aa7cbf8894
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-toco-converter.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.lite.TocoConverter"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.TocoConverter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_frozen_graph"
+    argspec: "args=[\'cls\', \'graph_def_file\', \'input_arrays\', \'output_arrays\', \'input_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_keras_model_file"
+    argspec: "args=[\'cls\', \'model_file\', \'input_arrays\', \'input_shapes\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_saved_model"
+    argspec: "args=[\'cls\', \'saved_model_dir\', \'input_arrays\', \'input_shapes\', \'output_arrays\', \'tag_set\', \'signature_key\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_session"
+    argspec: "args=[\'cls\', \'sess\', \'input_tensors\', \'output_tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef6c777665c8662be3332dc74b7bd7dd5044c086
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.lite.constants"
+tf_module {
+  member {
+    name: "FLOAT"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "GRAPHVIZ_DOT"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "INT32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "INT64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "QUANTIZED_UINT8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "STRING"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "TFLITE"
+    mtype: "<type \'int\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..154dd00821794ef4a5118e98d67e32beca38bebf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.lite"
+tf_module {
+  member {
+    name: "Interpreter"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OpHint"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OpsSet"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "TFLiteConverter"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TocoConverter"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "constants"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "toco_convert"
+    argspec: "args=[\'input_data\', \'input_tensors\', \'output_tensors\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
index 459b9e3684d65e2497fecdea6b09ef7da06674da..f34e2c2aa5a5b30e037157bc84894da5dce78538 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
@@ -84,6 +84,10 @@ tf_module {
     name: "ceil"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "confusion_matrix"
+    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'dtype\', \'name\', \'weights\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\", \'None\', \'None\'], "
+  }
   member_method {
     name: "conj"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -172,6 +176,26 @@ tf_module {
     name: "invert_permutation"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_finite"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_inf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_nan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_non_decreasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_strictly_increasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "l2_normalize"
     argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
@@ -292,10 +316,18 @@ tf_module {
     name: "reduce_prod"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "reduce_std"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "reduce_sum"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "reduce_variance"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "rint"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -310,7 +342,7 @@ tf_module {
   }
   member_method {
     name: "scalar_mul"
-    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scalar\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "segment_max"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
index 9b28ce57464109f570148a2642c94bc6aab9c97a..40e20f8c919e64362e5697bd00ded70d0c2292a0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "bidirectional_dynamic_rnn"
     argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'sequence_length\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "collapse_repeated"
+    argspec: "args=[\'labels\', \'seq_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_accidental_hits"
     argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -72,6 +76,10 @@ tf_module {
     name: "conv3d"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
+  member_method {
+    name: "conv3d_backprop_filter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
   member_method {
     name: "conv3d_backprop_filter_v2"
     argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
@@ -92,6 +100,10 @@ tf_module {
     name: "ctc_beam_search_decoder"
     argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\'], varargs=None, keywords=None, defaults=[\'100\', \'1\', \'True\'], "
   }
+  member_method {
+    name: "ctc_beam_search_decoder_v2"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\'], varargs=None, keywords=None, defaults=[\'100\', \'1\'], "
+  }
   member_method {
     name: "ctc_greedy_decoder"
     argspec: "args=[\'inputs\', \'sequence_length\', \'merge_repeated\'], varargs=None, keywords=None, defaults=[\'True\'], "
@@ -100,6 +112,14 @@ tf_module {
     name: "ctc_loss"
     argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
   }
+  member_method {
+    name: "ctc_loss_v2"
+    argspec: "args=[\'labels\', \'logits\', \'label_length\', \'logit_length\', \'logits_time_major\', \'unique\', \'blank_index\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "ctc_unique_labels"
+    argspec: "args=[\'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "depth_to_space"
     argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
@@ -108,6 +128,14 @@ tf_module {
     name: "depthwise_conv2d"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "depthwise_conv2d_backprop_filter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "depthwise_conv2d_backprop_input"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
   member_method {
     name: "depthwise_conv2d_native"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
@@ -126,7 +154,7 @@ tf_module {
   }
   member_method {
     name: "dropout"
-    argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\', \'rate\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "dynamic_rnn"
@@ -298,7 +326,7 @@ tf_module {
   }
   member_method {
     name: "softmax_cross_entropy_with_logits_v2"
-    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
+    argspec: "args=[\'labels\', \'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "softplus"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 88b8f37c4ff0cfaf562293c845e505f06119e227..f7f9978c063ceae89c7228b476f54694e25bc249 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index a4483fefa279957ce503857021c063254a9abf83..f9e898484b9813373a49e6f117578f822cdeb156 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 381c4975d7d778599ce34a9023d0e46b20753cba..9e52a4252619ffc19b287fc1818fa6f772847335 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 912365a28b1277962f648b2b0655d280bca1427c..9836433d08cba809107f9bb5dbccf2e971865b8a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -110,6 +110,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index a4bb3219c792708cd02a8345541d8685485c8d05..5fd9b329bdeb40b5a57fe68564977f61b5349ae5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 715bfd5fc7c18993d4997caeefe3188ba88f741c..76c8cff22b1e65e65d0ac3d6705541dc3f16f80c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index b66c0f89cc904c1318787651a3e8e629319c14fb..f53567af52f7ed6baa78bcc75bfc0e38de02e548 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index faeb4f3513362919fca8f0c2ef7c491d7938cb92..d3b68e4f2976912ed65ba7916284c951fda03b05 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -105,6 +105,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index caa2e600800178e4b2d36ae263da23d0b4608dd2..1f7840ab919baeeb0077904592ba8dcc1d4c91fb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 247dfcc1cacad14cf234e57f1f6dd7cdf06c4f9c..584c74f99d896e45de06fa020413b8edd4440afb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -244,6 +244,10 @@ tf_module {
     name: "TensorShape"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorSpec"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TextLineReader"
     mtype: "<type \'type\'>"
@@ -312,10 +316,6 @@ tf_module {
     name: "constant_initializer"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "contrib"
-    mtype: "<class \'tensorflow.python.util.lazy_loader.LazyLoader\'>"
-  }
   member {
     name: "data"
     mtype: "<type \'module\'>"
@@ -324,6 +324,10 @@ tf_module {
     name: "debugging"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "distribute"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "distributions"
     mtype: "<type \'module\'>"
@@ -344,6 +348,10 @@ tf_module {
     name: "estimator"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "feature_column"
     mtype: "<type \'module\'>"
@@ -424,6 +432,10 @@ tf_module {
     name: "linalg"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "lite"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "logging"
     mtype: "<type \'module\'>"
@@ -528,6 +540,10 @@ tf_module {
     name: "sets"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "signal"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "sparse"
     mtype: "<type \'module\'>"
@@ -600,6 +616,10 @@ tf_module {
     name: "variant"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "version"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "zeros_initializer"
     mtype: "<type \'type\'>"
@@ -680,6 +700,10 @@ tf_module {
     name: "argmin"
     argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
   }
+  member_method {
+    name: "argsort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'stable\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'False\', \'None\'], "
+  }
   member_method {
     name: "as_dtype"
     argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
@@ -766,7 +790,7 @@ tf_module {
   }
   member_method {
     name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'tensor\', \'name\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_type"
@@ -1020,10 +1044,30 @@ tf_module {
     name: "digamma"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "dimension_at_index"
+    argspec: "args=[\'shape\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "dimension_value"
+    argspec: "args=[\'dimension\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "disable_eager_execution"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "disable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "disable_v2_behavior"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "disable_v2_tensorshape"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "div"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1060,6 +1104,14 @@ tf_module {
     name: "enable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "enable_v2_behavior"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "enable_v2_tensorshape"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "encode_base64"
     argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -1204,6 +1256,10 @@ tf_module {
     name: "get_local_variable"
     argspec: "args=[\'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\', \'constraint\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
+  member_method {
+    name: "get_logger"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_seed"
     argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
@@ -1438,7 +1494,7 @@ tf_module {
   }
   member_method {
     name: "make_tensor_proto"
-    argspec: "args=[\'values\', \'dtype\', \'shape\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
+    argspec: "args=[\'values\', \'dtype\', \'shape\', \'verify_shape\', \'allow_broadcast\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
   }
   member_method {
     name: "map_fn"
@@ -1484,6 +1540,10 @@ tf_module {
     name: "matrix_solve_ls"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
   }
+  member_method {
+    name: "matrix_square_root"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "matrix_transpose"
     argspec: "args=[\'a\', \'name\', \'conjugate\'], varargs=None, keywords=None, defaults=[\'matrix_transpose\', \'False\'], "
@@ -1532,6 +1592,10 @@ tf_module {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "no_gradient"
+    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "no_op"
     argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1612,6 +1676,10 @@ tf_module {
     name: "py_func"
     argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
   }
+  member_method {
+    name: "py_function"
+    argspec: "args=[\'func\', \'inp\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "qr"
     argspec: "args=[\'input\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -1770,7 +1838,7 @@ tf_module {
   }
   member_method {
     name: "scalar_mul"
-    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scalar\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "scan"
@@ -1908,6 +1976,10 @@ tf_module {
     name: "slice"
     argspec: "args=[\'input_\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'None\'], "
+  }
   member_method {
     name: "space_to_batch"
     argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1922,7 +1994,7 @@ tf_module {
   }
   member_method {
     name: "sparse_add"
-    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'a\', \'b\', \'threshold\', \'thresh\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_concat"
@@ -2116,6 +2188,18 @@ tf_module {
     name: "tanh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "tensor_scatter_add"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensor_scatter_sub"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensor_scatter_update"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "tensordot"
     argspec: "args=[\'a\', \'b\', \'axes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -2256,6 +2340,10 @@ tf_module {
     name: "while_loop"
     argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\', \'maximum_iterations\', \'return_same_structure\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\', \'False\'], "
   }
+  member_method {
+    name: "wrap_function"
+    argspec: "args=[\'fn\', \'signature\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "write_file"
     argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
index 77c92aeb0dad8898bccc28efba510509d1c351dd..632c2f8f83c8effb188d110bfacaf7f22c0c74cc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "quantize"
     argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'HALF_AWAY_FROM_ZERO\', \'None\'], "
   }
+  member_method {
+    name: "quantize_and_dequantize"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'HALF_TO_EVEN\', \'None\'], "
+  }
   member_method {
     name: "quantized_concat"
     argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
index a568dd4cd8a68ec3a3354aad911b370b1bf40cea..1eefb1c70ce4d825402155a5e068c736defff02f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
@@ -1,5 +1,17 @@
 path: "tensorflow.random"
 tf_module {
+  member_method {
+    name: "all_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fixed_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1.0\', \'0\', \'1\', \'0\', \'()\', \'None\', \'None\'], "
+  }
   member_method {
     name: "gamma"
     argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
@@ -8,6 +20,10 @@ tf_module {
     name: "get_seed"
     argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "learned_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "log_uniform_candidate_sampler"
     argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -32,6 +48,26 @@ tf_module {
     name: "shuffle"
     argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "stateless_categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "stateless_multinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "stateless_normal"
+    argspec: "args=[\'shape\', \'seed\', \'mean\', \'stddev\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "stateless_truncated_normal"
+    argspec: "args=[\'shape\', \'seed\', \'mean\', \'stddev\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "stateless_uniform"
+    argspec: "args=[\'shape\', \'seed\', \'minval\', \'maxval\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
   member_method {
     name: "truncated_normal"
     argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
index 67457de070830d45a48230835fc4827e36f70058..e4cc0061a953c81729d8499530e43f5b43a2210e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.saved_model.Builder"
 tf_class {
   is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.builder_impl._SavedModelBuilder\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
index 83bd7035409534abf036c7e2b0d66fcc060ada3a..44860b11720e1af87d8baa3aec5f4f3169410d82 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.saved_model.builder.SavedModelBuilder"
 tf_class {
   is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.builder_impl._SavedModelBuilder\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34343e7c039a373e704d0feb1df2564896fd319f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.saved_model.experimental"
+tf_module {
+  member_method {
+    name: "save"
+    argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
index 3f4965fc691944dd8957756d4524ae5e2921c4e1..2a7c78910526f83fdfcd963c21996b4f4dc4bc28 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
@@ -1,9 +1,105 @@
 path: "tensorflow.saved_model"
 tf_module {
+  member {
+    name: "ASSETS_DIRECTORY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "ASSETS_KEY"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "Builder"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CLASSIFY_INPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "CLASSIFY_METHOD_NAME"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "CLASSIFY_OUTPUT_CLASSES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "CLASSIFY_OUTPUT_SCORES"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "DEFAULT_SERVING_SIGNATURE_DEF_KEY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "GPU"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "LEGACY_INIT_OP_KEY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "MAIN_OP_KEY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "PREDICT_INPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "PREDICT_METHOD_NAME"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "PREDICT_OUTPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGRESS_INPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGRESS_METHOD_NAME"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGRESS_OUTPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVED_MODEL_FILENAME_PB"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVED_MODEL_FILENAME_PBTXT"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVED_MODEL_SCHEMA_VERSION"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SERVING"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TPU"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TRAINING"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "VARIABLES_DIRECTORY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "VARIABLES_FILENAME"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "builder"
     mtype: "<type \'module\'>"
@@ -12,6 +108,10 @@ tf_module {
     name: "constants"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "loader"
     mtype: "<type \'module\'>"
@@ -48,6 +148,10 @@ tf_module {
     name: "classification_signature_def"
     argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "contains_saved_model"
+    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_tensor_from_tensor_info"
     argspec: "args=[\'tensor_info\', \'graph\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -76,6 +180,10 @@ tf_module {
     name: "regression_signature_def"
     argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "save"
+    argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "simple_save"
     argspec: "args=[\'session\', \'export_dir\', \'inputs\', \'outputs\', \'legacy_init_op\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
index 8a196b1a556e283671cc75af28df3eaa62532975..09d6f1424b785e266854ede48b26ebbdf571288b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.sets"
 tf_module {
+  member_method {
+    name: "difference"
+    argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "intersection"
+    argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
   member_method {
     name: "set_difference"
     argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
@@ -16,4 +24,12 @@ tf_module {
     name: "set_union"
     argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
+  member_method {
+    name: "size"
+    argspec: "args=[\'a\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "union"
+    argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea717b4d719d6709e05182faca964ae544abc39c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt
@@ -0,0 +1,95 @@
+path: "tensorflow.signal"
+tf_module {
+  member_method {
+    name: "dct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "frame"
+    argspec: "args=[\'signal\', \'frame_length\', \'frame_step\', \'pad_end\', \'pad_value\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "hamming_window"
+    argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "hann_window"
+    argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "idct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "ifft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "inverse_stft"
+    argspec: "args=[\'stfts\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'None\'], "
+  }
+  member_method {
+    name: "inverse_stft_window_fn"
+    argspec: "args=[\'frame_step\', \'forward_window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'<function hann_window instance>\', \'None\'], "
+  }
+  member_method {
+    name: "irfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "linear_to_mel_weight_matrix"
+    argspec: "args=[\'num_mel_bins\', \'num_spectrogram_bins\', \'sample_rate\', \'lower_edge_hertz\', \'upper_edge_hertz\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'20\', \'129\', \'8000\', \'125.0\', \'3800.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "mfccs_from_log_mel_spectrograms"
+    argspec: "args=[\'log_mel_spectrograms\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "overlap_and_add"
+    argspec: "args=[\'signal\', \'frame_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "stft"
+    argspec: "args=[\'signals\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'pad_end\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'False\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
index 32bd8d5f8edb24ee1f5a5672487499337bd1c0dd..33e342bc75486be0bccffc1e36a94e147f934432 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
@@ -10,7 +10,7 @@ tf_module {
   }
   member_method {
     name: "add"
-    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'a\', \'b\', \'threshold\', \'thresh\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "concat"
@@ -112,6 +112,10 @@ tf_module {
     name: "softmax"
     argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sparse_dense_matmul"
+    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "split"
     argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index 5ba48e7f571434e80cc0de3c3cc425e7a147f80d..a1cd581a86bc2132bfa04ac3f3433e84b6365b19 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -52,8 +52,16 @@ tf_module {
     name: "to_number"
     argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unicode_encode"
+    argspec: "args=[\'input\', \'output_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
   member_method {
     name: "unicode_script"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unicode_transcode"
+    argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt
index 2f00aeac25f691d9767080251798248281e5edf5..811ca18cdb4e9c7a830bb3d7e8af45b341fb6a35 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.sysconfig"
 tf_module {
+  member {
+    name: "CXX11_ABI_FLAG"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "MONOLITHIC_BUILD"
+    mtype: "<type \'int\'>"
+  }
   member_method {
     name: "get_compile_flags"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
index df528e26b60f8d8ddcc1eaf0ed292cc7ff0ebd94..6fc489c86043d074ac832d0ec9dbefd2cbbb4f19 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
@@ -6,6 +6,10 @@ tf_class {
   member_method {
     name: "__init__"
   }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_abstract"
     argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt
index abe9b068ae95c08a2b72c9a5e164a097e6162dff..984c584c9e8ba75b371b8ff3ddf9f1b8184d1132 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.test.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "assert_equal_graph_def"
     argspec: "args=[\'actual\', \'expected\', \'checkpoint_v2\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "benchmark_config"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "compute_gradient"
     argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2538de661b357245ad18d9e1c4fc88d2e80eaeb0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.train.CheckpointManager"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.checkpoint_management.CheckpointManager\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "checkpoints"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "latest_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'checkpoint\', \'directory\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'checkpoint_number\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-nan-loss-during-training-error.pbtxt
index 25fd5e75a79f6e4fe2cf77ebc7aa0d1fef759e7f..e415819b3d76a13335163d0a9bf5b91217ca4354 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-nan-loss-during-training-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-nan-loss-during-training-error.pbtxt
@@ -6,10 +6,6 @@ tf_class {
     name: "args"
     mtype: "<type \'getset_descriptor\'>"
   }
-  member {
-    name: "message"
-    mtype: "<type \'getset_descriptor\'>"
-  }
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
index 45c81fdd3b1f70997b7bcb3dad9d40c391c7b80c..bdb3ea2197c78dd17357f2753f05638c3c054bd6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
@@ -24,6 +24,10 @@ tf_module {
     name: "Checkpoint"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CheckpointManager"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CheckpointSaverHook"
     mtype: "<type \'type\'>"
@@ -272,10 +276,6 @@ tf_module {
     name: "checkpoint_exists"
     argspec: "args=[\'checkpoint_prefix\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "confusion_matrix"
-    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'dtype\', \'name\', \'weights\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\", \'None\', \'None\'], "
-  }
   member_method {
     name: "cosine_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
@@ -396,6 +396,10 @@ tf_module {
     name: "piecewise_constant"
     argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "piecewise_constant_decay"
+    argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "polynomial_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.version.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.version.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd4506cb0b38809ea65b3f11c2c79fa40831dc57
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.version.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.version"
+tf_module {
+  member {
+    name: "COMPILER_VERSION"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "GIT_VERSION"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "GRAPH_DEF_VERSION"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_DEF_VERSION_MIN_CONSUMER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_DEF_VERSION_MIN_PRODUCER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VERSION"
+    mtype: "<type \'str\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt
deleted file mode 100644
index f1dffd595285098afaeb0ff04e5db35d594f7fac..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt
+++ /dev/null
@@ -1,70 +0,0 @@
-path: "tensorflow.AttrValue.ListValue"
-tf_proto {
-  descriptor {
-    name: "ListValue"
-    field {
-      name: "s"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_BYTES
-    }
-    field {
-      name: "i"
-      number: 3
-      label: LABEL_REPEATED
-      type: TYPE_INT64
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "f"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_FLOAT
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "b"
-      number: 5
-      label: LABEL_REPEATED
-      type: TYPE_BOOL
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "type"
-      number: 6
-      label: LABEL_REPEATED
-      type: TYPE_ENUM
-      type_name: ".tensorflow.DataType"
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "shape"
-      number: 7
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-    }
-    field {
-      name: "func"
-      number: 9
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NameAttrList"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt
deleted file mode 100644
index 6ccd64f428c3b87c807d0af82f67a884187f738c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt
+++ /dev/null
@@ -1,151 +0,0 @@
-path: "tensorflow.AttrValue"
-tf_proto {
-  descriptor {
-    name: "AttrValue"
-    field {
-      name: "s"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "i"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-      oneof_index: 0
-    }
-    field {
-      name: "f"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-      oneof_index: 0
-    }
-    field {
-      name: "b"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-      oneof_index: 0
-    }
-    field {
-      name: "type"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.DataType"
-      oneof_index: 0
-    }
-    field {
-      name: "shape"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-      oneof_index: 0
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-      oneof_index: 0
-    }
-    field {
-      name: "list"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AttrValue.ListValue"
-      oneof_index: 0
-    }
-    field {
-      name: "func"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NameAttrList"
-      oneof_index: 0
-    }
-    field {
-      name: "placeholder"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    nested_type {
-      name: "ListValue"
-      field {
-        name: "s"
-        number: 2
-        label: LABEL_REPEATED
-        type: TYPE_BYTES
-      }
-      field {
-        name: "i"
-        number: 3
-        label: LABEL_REPEATED
-        type: TYPE_INT64
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "f"
-        number: 4
-        label: LABEL_REPEATED
-        type: TYPE_FLOAT
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "b"
-        number: 5
-        label: LABEL_REPEATED
-        type: TYPE_BOOL
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "type"
-        number: 6
-        label: LABEL_REPEATED
-        type: TYPE_ENUM
-        type_name: ".tensorflow.DataType"
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "shape"
-        number: 7
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorShapeProto"
-      }
-      field {
-        name: "tensor"
-        number: 8
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorProto"
-      }
-      field {
-        name: "func"
-        number: 9
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.NameAttrList"
-      }
-    }
-    oneof_decl {
-      name: "value"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt
deleted file mode 100644
index c9a32c16b34a78bd5a182b7c0635a559bddc611d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt
+++ /dev/null
@@ -1,29 +0,0 @@
-path: "tensorflow.ConditionalAccumulatorBase"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "accumulator_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'shape\', \'accumulator_ref\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "num_accumulated"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_global_step"
-    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt
deleted file mode 100644
index 15e0ab76b6fd97b83019589e79ac290bbce11053..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.ConditionalAccumulator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulator\'>"
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "accumulator_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\', \'reduction_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'conditional_accumulator\', \'MEAN\'], "
-  }
-  member_method {
-    name: "apply_grad"
-    argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
-  }
-  member_method {
-    name: "num_accumulated"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_global_step"
-    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "take_grad"
-    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt
deleted file mode 100644
index d9b142682899bf5d9fd5d942437359adf8962466..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.ConfigProto.DeviceCountEntry"
-tf_proto {
-  descriptor {
-    name: "DeviceCountEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
deleted file mode 100644
index 9f6dcd8fdb069703844f1e05294cf55cd83ba745..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.ConfigProto.Experimental"
-tf_proto {
-  descriptor {
-    name: "Experimental"
-    field {
-      name: "collective_group_leader"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "executor_type"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    reserved_range {
-      start: 2
-      end: 3
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
deleted file mode 100644
index f3a515163df64297cbc25d6f9a33fc2b78648402..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
+++ /dev/null
@@ -1,146 +0,0 @@
-path: "tensorflow.ConfigProto"
-tf_proto {
-  descriptor {
-    name: "ConfigProto"
-    field {
-      name: "device_count"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ConfigProto.DeviceCountEntry"
-    }
-    field {
-      name: "intra_op_parallelism_threads"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "inter_op_parallelism_threads"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "use_per_session_threads"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "session_inter_op_thread_pool"
-      number: 12
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ThreadPoolOptionProto"
-    }
-    field {
-      name: "placement_period"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "device_filters"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "gpu_options"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GPUOptions"
-    }
-    field {
-      name: "allow_soft_placement"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "log_device_placement"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "graph_options"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphOptions"
-    }
-    field {
-      name: "operation_timeout_in_ms"
-      number: 11
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "rpc_options"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RPCOptions"
-    }
-    field {
-      name: "cluster_def"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ClusterDef"
-    }
-    field {
-      name: "isolate_session_state"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 16
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ConfigProto.Experimental"
-    }
-    nested_type {
-      name: "DeviceCountEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "collective_group_leader"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "executor_type"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      reserved_range {
-        start: 2
-        end: 3
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt
deleted file mode 100644
index 92e535c341447628a50d8941998a4065e78d12a5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt
+++ /dev/null
@@ -1,37 +0,0 @@
-path: "tensorflow.DeviceSpec"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.device.DeviceSpec\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "job"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "replica"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "task"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'job\', \'replica\', \'task\', \'device_type\', \'device_index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_string"
-    argspec: "args=[\'spec\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "merge_from"
-    argspec: "args=[\'self\', \'dev\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "parse_from_string"
-    argspec: "args=[\'self\', \'spec\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "to_string"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt
deleted file mode 100644
index a9ab27719b4d71f3d7ed10963ad896ccafa82f15..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt
+++ /dev/null
@@ -1,25 +0,0 @@
-path: "tensorflow.Dimension"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.Dimension\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "value"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "assert_is_compatible_with"
-    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_compatible_with"
-    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "merge_with"
-    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-feature.pbtxt
deleted file mode 100644
index 6933814a7b68f775e694fe940a7c65a8e31b9398..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-feature.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.FixedLenFeature"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenFeature\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "default_value"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "shape"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-sequence-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-sequence-feature.pbtxt
deleted file mode 100644
index c53878795190924e205a1e7efe1672f216869c41..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-fixed-len-sequence-feature.pbtxt
+++ /dev/null
@@ -1,31 +0,0 @@
-path: "tensorflow.FixedLenSequenceFeature"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.FixedLenSequenceFeature\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "allow_missing"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "default_value"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "shape"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt
deleted file mode 100644
index 353e63127de174a79c209a05327da2de20bf0dd7..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt
+++ /dev/null
@@ -1,92 +0,0 @@
-path: "tensorflow.GPUOptions"
-tf_proto {
-  descriptor {
-    name: "GPUOptions"
-    field {
-      name: "per_process_gpu_memory_fraction"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "allow_growth"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "allocator_type"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "deferred_deletion_bytes"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "visible_device_list"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "polling_active_delay_usecs"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "polling_inactive_delay_msecs"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "force_gpu_compatible"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GPUOptions.Experimental"
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "virtual_devices"
-        number: 1
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.GPUOptions.Experimental.VirtualDevices"
-      }
-      field {
-        name: "use_unified_memory"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-      field {
-        name: "num_dev_to_dev_copy_streams"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      nested_type {
-        name: "VirtualDevices"
-        field {
-          name: "memory_limit_mb"
-          number: 1
-          label: LABEL_REPEATED
-          type: TYPE_FLOAT
-        }
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
index 2f4257a66a5b45c3890c18bc5b8c97c020c6a001..2299a009d3d5335553e1de025c42b23a57592de3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
@@ -6,9 +6,17 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'persistent\', \'watch_accessed_variables\'], varargs=None, keywords=None, defaults=[\'False\', \'True\'], "
   }
+  member_method {
+    name: "batch_jacobian"
+    argspec: "args=[\'self\', \'target\', \'source\', \'unconnected_gradients\', \'parallel_iterations\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'None\', \'True\'], "
+  }
   member_method {
     name: "gradient"
-    argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'UnconnectedGradients.NONE\'], "
+  }
+  member_method {
+    name: "jacobian"
+    argspec: "args=[\'self\', \'target\', \'sources\', \'unconnected_gradients\', \'parallel_iterations\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'None\', \'True\'], "
   }
   member_method {
     name: "reset"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt
deleted file mode 100644
index 19eccff03d24719d95ea84ccdad4014aa777ccd5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt
+++ /dev/null
@@ -1,36 +0,0 @@
-path: "tensorflow.GraphDef"
-tf_proto {
-  descriptor {
-    name: "GraphDef"
-    field {
-      name: "node"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NodeDef"
-    }
-    field {
-      name: "versions"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.VersionDef"
-    }
-    field {
-      name: "version"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-      options {
-        deprecated: true
-      }
-    }
-    field {
-      name: "library"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.FunctionDefLibrary"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt
deleted file mode 100644
index ffe479093397a9bf98d10aa4e054c643e64d5f5d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt
+++ /dev/null
@@ -1,140 +0,0 @@
-path: "tensorflow.GraphKeys"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.ops.GraphKeys\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "ACTIVATIONS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "ASSET_FILEPATHS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "BIASES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "CONCATENATED_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "COND_CONTEXT"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "EVAL_STEP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "GLOBAL_STEP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "GLOBAL_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "INIT_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOCAL_INIT_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOCAL_RESOURCES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOCAL_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOSSES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "METRIC_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "MODEL_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "MOVING_AVERAGE_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "QUEUE_RUNNERS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "READY_FOR_LOCAL_INIT_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "READY_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "REGULARIZATION_LOSSES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "RESOURCES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SAVEABLE_OBJECTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SAVERS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SUMMARIES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SUMMARY_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TABLE_INITIALIZERS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAINABLE_RESOURCE_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAINABLE_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAIN_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "UPDATE_OPS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "WEIGHTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "WHILE_CONTEXT"
-    mtype: "<type \'str\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt
deleted file mode 100644
index a9f99bc171cc3661031981f467f583b122e43476..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt
+++ /dev/null
@@ -1,67 +0,0 @@
-path: "tensorflow.GraphOptions"
-tf_proto {
-  descriptor {
-    name: "GraphOptions"
-    field {
-      name: "enable_recv_scheduling"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "optimizer_options"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.OptimizerOptions"
-    }
-    field {
-      name: "build_cost_model"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "build_cost_model_after"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "infer_shapes"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "place_pruned_graph"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "enable_bfloat16_sendrecv"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "timeline_step"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "rewrite_options"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RewriterConfig"
-    }
-    reserved_range {
-      start: 1
-      end: 2
-    }
-    reserved_name: "skip_common_subexpression_elimination"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt
deleted file mode 100644
index d4402f330b8a28eaa61eb2b74c9ca412dce06b62..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt
+++ /dev/null
@@ -1,54 +0,0 @@
-path: "tensorflow.HistogramProto"
-tf_proto {
-  descriptor {
-    name: "HistogramProto"
-    field {
-      name: "min"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "max"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "num"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "sum"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "sum_squares"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "bucket_limit"
-      number: 6
-      label: LABEL_REPEATED
-      type: TYPE_DOUBLE
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "bucket"
-      number: 7
-      label: LABEL_REPEATED
-      type: TYPE_DOUBLE
-      options {
-        packed: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-interactive-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-interactive-session.pbtxt
deleted file mode 100644
index 0a3b81bf829f48e88e9c48ce26cdbb4207101a16..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-interactive-session.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.InteractiveSession"
-tf_class {
-  is_instance: "<class \'tensorflow.python.client.session.InteractiveSession\'>"
-  is_instance: "<class \'tensorflow.python.client.session.BaseSession\'>"
-  is_instance: "<class \'tensorflow.python.client.session.SessionInterface\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph_def"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "sess_str"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'target\', \'graph\', \'config\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "as_default"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "list_devices"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "make_callable"
-    argspec: "args=[\'self\', \'fetches\', \'feed_list\', \'accept_options\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
-  }
-  member_method {
-    name: "partial_run"
-    argspec: "args=[\'self\', \'handle\', \'fetches\', \'feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "partial_run_setup"
-    argspec: "args=[\'self\', \'fetches\', \'feeds\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "run"
-    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt
deleted file mode 100644
index 5023aa96bf3b4f3f550421db5f41872d9f62b70d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-path: "tensorflow.LogMessage"
-tf_proto {
-  descriptor {
-    name: "LogMessage"
-    field {
-      name: "level"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.LogMessage.Level"
-    }
-    field {
-      name: "message"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    enum_type {
-      name: "Level"
-      value {
-        name: "UNKNOWN"
-        number: 0
-      }
-      value {
-        name: "DEBUGGING"
-        number: 10
-      }
-      value {
-        name: "INFO"
-        number: 20
-      }
-      value {
-        name: "WARN"
-        number: 30
-      }
-      value {
-        name: "ERROR"
-        number: 40
-      }
-      value {
-        name: "FATAL"
-        number: 50
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
deleted file mode 100644
index 0ba09bec4b3fa6e9eaf59978beaa958ebc038b4c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.MetaGraphDef.CollectionDefEntry"
-tf_proto {
-  descriptor {
-    name: "CollectionDefEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.CollectionDef"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
deleted file mode 100644
index 41c62a407b8577288016f2376c35ba6ec1c3c1ca..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
+++ /dev/null
@@ -1,50 +0,0 @@
-path: "tensorflow.MetaGraphDef.MetaInfoDef"
-tf_proto {
-  descriptor {
-    name: "MetaInfoDef"
-    field {
-      name: "meta_graph_version"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "stripped_op_list"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.OpList"
-    }
-    field {
-      name: "any_info"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".google.protobuf.Any"
-    }
-    field {
-      name: "tags"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "tensorflow_version"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tensorflow_git_version"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "stripped_default_attrs"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
deleted file mode 100644
index 73dc414a779ded3d1f896e743b7f1f1a443352f0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.MetaGraphDef.SignatureDefEntry"
-tf_proto {
-  descriptor {
-    name: "SignatureDefEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SignatureDef"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt
deleted file mode 100644
index d71c2358c93e9597726665fdf8f92e648b2ea772..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt
+++ /dev/null
@@ -1,133 +0,0 @@
-path: "tensorflow.MetaGraphDef"
-tf_proto {
-  descriptor {
-    name: "MetaGraphDef"
-    field {
-      name: "meta_info_def"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.MetaGraphDef.MetaInfoDef"
-    }
-    field {
-      name: "graph_def"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphDef"
-    }
-    field {
-      name: "saver_def"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SaverDef"
-    }
-    field {
-      name: "collection_def"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.MetaGraphDef.CollectionDefEntry"
-    }
-    field {
-      name: "signature_def"
-      number: 5
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.MetaGraphDef.SignatureDefEntry"
-    }
-    field {
-      name: "asset_file_def"
-      number: 6
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AssetFileDef"
-    }
-    nested_type {
-      name: "MetaInfoDef"
-      field {
-        name: "meta_graph_version"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "stripped_op_list"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.OpList"
-      }
-      field {
-        name: "any_info"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".google.protobuf.Any"
-      }
-      field {
-        name: "tags"
-        number: 4
-        label: LABEL_REPEATED
-        type: TYPE_STRING
-      }
-      field {
-        name: "tensorflow_version"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "tensorflow_git_version"
-        number: 6
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "stripped_default_attrs"
-        number: 7
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-    }
-    nested_type {
-      name: "CollectionDefEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.CollectionDef"
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "SignatureDefEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.SignatureDef"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt
deleted file mode 100644
index b119b208772199e5c3596be142f3e0f62d3ed50e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.NameAttrList.AttrEntry"
-tf_proto {
-  descriptor {
-    name: "AttrEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AttrValue"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt
deleted file mode 100644
index fcdb411ffce9b68ac28696f86ca11a47f9e64e8f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.NameAttrList"
-tf_proto {
-  descriptor {
-    name: "NameAttrList"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "attr"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NameAttrList.AttrEntry"
-    }
-    nested_type {
-      name: "AttrEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.AttrValue"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt
deleted file mode 100644
index 622e4c3d0f60ce4842a6fd4cc421551aa795fcbf..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.NodeDef.AttrEntry"
-tf_proto {
-  descriptor {
-    name: "AttrEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AttrValue"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt
deleted file mode 100644
index 646fa8abb9b22dbd908ff821cbe66a33ad02ba64..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt
+++ /dev/null
@@ -1,56 +0,0 @@
-path: "tensorflow.NodeDef"
-tf_proto {
-  descriptor {
-    name: "NodeDef"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "op"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "input"
-      number: 3
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "device"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "attr"
-      number: 5
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NodeDef.AttrEntry"
-    }
-    nested_type {
-      name: "AttrEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.AttrValue"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-op-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-op-error.pbtxt
deleted file mode 100644
index 7e59615534fc2b3ed4fb128caf8ea092ebfd25f4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-op-error.pbtxt
+++ /dev/null
@@ -1,29 +0,0 @@
-path: "tensorflow.OpError"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.errors_impl.OpError\'>"
-  is_instance: "<type \'exceptions.Exception\'>"
-  member {
-    name: "args"
-    mtype: "<type \'getset_descriptor\'>"
-  }
-  member {
-    name: "error_code"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "message"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "node_def"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "op"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'node_def\', \'op\', \'message\', \'error_code\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt
deleted file mode 100644
index 3ccf9d459b133b48e5456f02e4780ade8d3042c8..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.OptimizerOptions"
-tf_proto {
-  descriptor {
-    name: "OptimizerOptions"
-    field {
-      name: "do_common_subexpression_elimination"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "do_constant_folding"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "max_folded_constant_in_bytes"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "do_function_inlining"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "opt_level"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.OptimizerOptions.Level"
-    }
-    field {
-      name: "global_jit_level"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.OptimizerOptions.GlobalJitLevel"
-    }
-    enum_type {
-      name: "Level"
-      value {
-        name: "L1"
-        number: 0
-      }
-      value {
-        name: "L0"
-        number: -1
-      }
-    }
-    enum_type {
-      name: "GlobalJitLevel"
-      value {
-        name: "DEFAULT"
-        number: 0
-      }
-      value {
-        name: "OFF"
-        number: -1
-      }
-      value {
-        name: "ON_1"
-        number: 1
-      }
-      value {
-        name: "ON_2"
-        number: 2
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-padding-f-i-f-o-queue.pbtxt
deleted file mode 100644
index 8fed133561544b91abfc64577e63a7088b43a007..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-padding-f-i-f-o-queue.pbtxt
+++ /dev/null
@@ -1,66 +0,0 @@
-path: "tensorflow.PaddingFIFOQueue"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PaddingFIFOQueue\'>"
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtypes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "names"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "queue_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "shapes"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'padding_fifo_queue\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "dequeue"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "dequeue_many"
-    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "dequeue_up_to"
-    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "enqueue"
-    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "enqueue_many"
-    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "from_list"
-    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_closed"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-priority-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-priority-queue.pbtxt
deleted file mode 100644
index ebb017e81bc29e062d804fbe9f50c62f7b615dab..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-priority-queue.pbtxt
+++ /dev/null
@@ -1,66 +0,0 @@
-path: "tensorflow.PriorityQueue"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PriorityQueue\'>"
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtypes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "names"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "queue_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "shapes"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'capacity\', \'types\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'priority_queue\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "dequeue"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "dequeue_many"
-    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "dequeue_up_to"
-    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "enqueue"
-    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "enqueue_many"
-    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "from_list"
-    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_closed"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-queue-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-queue-base.pbtxt
deleted file mode 100644
index 761f90989f316611d42580ee911e24bb3d0d2fec..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-queue-base.pbtxt
+++ /dev/null
@@ -1,65 +0,0 @@
-path: "tensorflow.QueueBase"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtypes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "names"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "queue_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "shapes"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtypes\', \'shapes\', \'names\', \'queue_ref\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "dequeue"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "dequeue_many"
-    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "dequeue_up_to"
-    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "enqueue"
-    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "enqueue_many"
-    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "from_list"
-    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_closed"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-random-shuffle-queue.pbtxt
deleted file mode 100644
index f3ca84139311bc05478e3dce876b53f7b9dec883..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-random-shuffle-queue.pbtxt
+++ /dev/null
@@ -1,66 +0,0 @@
-path: "tensorflow.RandomShuffleQueue"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.RandomShuffleQueue\'>"
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "dtypes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "names"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "queue_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "shapes"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'capacity\', \'min_after_dequeue\', \'dtypes\', \'shapes\', \'names\', \'seed\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'random_shuffle_queue\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "dequeue"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "dequeue_many"
-    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "dequeue_up_to"
-    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "enqueue"
-    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "enqueue_many"
-    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "from_list"
-    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_closed"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt
deleted file mode 100644
index 1287940326c0196e76fff2cf6363622226092504..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.RunMetadata"
-tf_proto {
-  descriptor {
-    name: "RunMetadata"
-    field {
-      name: "step_stats"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.StepStats"
-    }
-    field {
-      name: "cost_graph"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.CostGraphDef"
-    }
-    field {
-      name: "partition_graphs"
-      number: 3
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphDef"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
deleted file mode 100644
index 47b5b56faf63edba9ce4f08bf744f3acf4f67f5f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.RunOptions.Experimental"
-tf_proto {
-  descriptor {
-    name: "Experimental"
-    field {
-      name: "collective_graph_key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "use_run_handler_pool"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
deleted file mode 100644
index c0c2e7b9f8d71be9b96e7195b561d0a934d24057..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
+++ /dev/null
@@ -1,89 +0,0 @@
-path: "tensorflow.RunOptions"
-tf_proto {
-  descriptor {
-    name: "RunOptions"
-    field {
-      name: "trace_level"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.RunOptions.TraceLevel"
-    }
-    field {
-      name: "timeout_in_ms"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "inter_op_thread_pool"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "output_partition_graphs"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "debug_options"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.DebugOptions"
-    }
-    field {
-      name: "report_tensor_allocations_upon_oom"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RunOptions.Experimental"
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "collective_graph_key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "use_run_handler_pool"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-    }
-    enum_type {
-      name: "TraceLevel"
-      value {
-        name: "NO_TRACE"
-        number: 0
-      }
-      value {
-        name: "SOFTWARE_TRACE"
-        number: 1
-      }
-      value {
-        name: "HARDWARE_TRACE"
-        number: 2
-      }
-      value {
-        name: "FULL_TRACE"
-        number: 3
-      }
-    }
-    reserved_range {
-      start: 4
-      end: 5
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt
deleted file mode 100644
index 259f2418740cbfe47cdb4bd871d4f5c6306d25f5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt
+++ /dev/null
@@ -1,44 +0,0 @@
-path: "tensorflow.SessionLog"
-tf_proto {
-  descriptor {
-    name: "SessionLog"
-    field {
-      name: "status"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.SessionLog.SessionStatus"
-    }
-    field {
-      name: "checkpoint_path"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "msg"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    enum_type {
-      name: "SessionStatus"
-      value {
-        name: "STATUS_UNSPECIFIED"
-        number: 0
-      }
-      value {
-        name: "START"
-        number: 1
-      }
-      value {
-        name: "STOP"
-        number: 2
-      }
-      value {
-        name: "CHECKPOINT"
-        number: 3
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-session.pbtxt
deleted file mode 100644
index 1d6b037f9c3540653a8fb18b6508f74b01da66ab..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-session.pbtxt
+++ /dev/null
@@ -1,55 +0,0 @@
-path: "tensorflow.Session"
-tf_class {
-  is_instance: "<class \'tensorflow.python.client.session.Session\'>"
-  is_instance: "<class \'tensorflow.python.client.session.BaseSession\'>"
-  is_instance: "<class \'tensorflow.python.client.session.SessionInterface\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph_def"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "sess_str"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'target\', \'graph\', \'config\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "as_default"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "list_devices"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "make_callable"
-    argspec: "args=[\'self\', \'fetches\', \'feed_list\', \'accept_options\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
-  }
-  member_method {
-    name: "partial_run"
-    argspec: "args=[\'self\', \'handle\', \'fetches\', \'feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "partial_run_setup"
-    argspec: "args=[\'self\', \'fetches\', \'feeds\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "reset"
-    argspec: "args=[\'target\', \'containers\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "run"
-    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-conditional-accumulator.pbtxt
deleted file mode 100644
index 39ff336c4f57f2bc19e40bc5e6ab0330cb7df044..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-conditional-accumulator.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-path: "tensorflow.SparseConditionalAccumulator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.SparseConditionalAccumulator\'>"
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "accumulator_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\', \'reduction_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'sparse_conditional_accumulator\', \'MEAN\'], "
-  }
-  member_method {
-    name: "apply_grad"
-    argspec: "args=[\'self\', \'grad_indices\', \'grad_values\', \'grad_shape\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "apply_indexed_slices_grad"
-    argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
-  }
-  member_method {
-    name: "num_accumulated"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_global_step"
-    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "take_grad"
-    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "take_indexed_slices_grad"
-    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-feature.pbtxt
deleted file mode 100644
index d875394fb5de73f67629b77c902a2ed2a03dd982..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-feature.pbtxt
+++ /dev/null
@@ -1,35 +0,0 @@
-path: "tensorflow.SparseFeature"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.SparseFeature\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "already_sorted"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "index_key"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "value_key"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt
deleted file mode 100644
index d33fd4d5d7b6b3e2eb7454b5326d993c139f0490..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt
+++ /dev/null
@@ -1,26 +0,0 @@
-path: "tensorflow.SparseTensorValue"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensorValue\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "dense_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "indices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "values"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt
deleted file mode 100644
index 0064c8460cb374f1e3f108085a2efed4131dd205..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt
+++ /dev/null
@@ -1,24 +0,0 @@
-path: "tensorflow.TensorInfo.CooSparse"
-tf_proto {
-  descriptor {
-    name: "CooSparse"
-    field {
-      name: "values_tensor_name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "indices_tensor_name"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "dense_shape_tensor_name"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt
deleted file mode 100644
index 63566c808e55cb4d3b630f0a017fa3a2c8a30de3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt
+++ /dev/null
@@ -1,59 +0,0 @@
-path: "tensorflow.TensorInfo"
-tf_proto {
-  descriptor {
-    name: "TensorInfo"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    field {
-      name: "coo_sparse"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorInfo.CooSparse"
-      oneof_index: 0
-    }
-    field {
-      name: "dtype"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.DataType"
-    }
-    field {
-      name: "tensor_shape"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    nested_type {
-      name: "CooSparse"
-      field {
-        name: "values_tensor_name"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "indices_tensor_name"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "dense_shape_tensor_name"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-    oneof_decl {
-      name: "encoding"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
index 8e3598fb2470b327e6e3601969f055d4907f614a..bee19520b7736967533c6d30a1862e3c48d03fc2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.TensorShape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShape\'>"
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV2\'>"
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV1\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dims"
@@ -10,6 +11,10 @@ tf_class {
     name: "ndims"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "rank"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'dims\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..493dcba8922d7f6c51a61d337f48e09d168e6bac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.TensorSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.tensor_spec.TensorSpec\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_spec"
+    argspec: "args=[\'cls\', \'spec\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'spec_or_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-var-len-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-var-len-feature.pbtxt
deleted file mode 100644
index 54b66f43f8e7d714e82ae9d68b37ac348c476c97..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-var-len-feature.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.VarLenFeature"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
-  is_instance: "<class \'tensorflow.python.ops.parsing_ops.VarLenFeature\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable-aggregation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable-aggregation.pbtxt
index 66a20547eb6d13ae60d71b07cbf150a4ca2abfe7..6972b2be97170bd7d240cca8ebd64c53fb591a23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable-aggregation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable-aggregation.pbtxt
@@ -1,20 +1,20 @@
 path: "tensorflow.VariableAggregation"
 tf_class {
-  is_instance: "<enum \'VariableAggregation\'>"
+  is_instance: "<enum \'VariableAggregationV2\'>"
   member {
     name: "MEAN"
-    mtype: "<enum \'VariableAggregation\'>"
+    mtype: "<enum \'VariableAggregationV2\'>"
   }
   member {
     name: "NONE"
-    mtype: "<enum \'VariableAggregation\'>"
+    mtype: "<enum \'VariableAggregationV2\'>"
   }
   member {
-    name: "ONLY_FIRST_TOWER"
-    mtype: "<enum \'VariableAggregation\'>"
+    name: "ONLY_FIRST_REPLICA"
+    mtype: "<enum \'VariableAggregationV2\'>"
   }
   member {
     name: "SUM"
-    mtype: "<enum \'VariableAggregation\'>"
+    mtype: "<enum \'VariableAggregationV2\'>"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.-save-slice-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.-save-slice-info.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac3ccd468b216ab817c9ed05dcb292eaf1f44398
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.-save-slice-info.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.Variable.SaveSliceInfo"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.variables.SaveSliceInfo\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "spec"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'full_name\', \'full_shape\', \'var_offset\', \'var_shape\', \'save_slice_info_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "to_proto"
+    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6136c8fbe79ef8d3851c39b8f11ac3c33f6050f2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
@@ -0,0 +1,134 @@
+path: "tensorflow.Variable"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "SaveSliceInfo"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "device"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "initial_value"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "op"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_value\', \'trainable\', \'validate_shape\', \'caching_device\', \'name\', \'variable_def\', \'dtype\', \'import_scope\', \'constraint\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "assign"
+    argspec: "args=[\'self\', \'value\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "assign_add"
+    argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "assign_sub"
+    argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "batch_scatter_update"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "count_up_to"
+    argspec: "args=[\'self\', \'limit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "eval"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_proto"
+    argspec: "args=[\'variable_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_shape"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "initialized_value"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load"
+    argspec: "args=[\'self\', \'value\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "read_value"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scatter_add"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_add"
+    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_sub"
+    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scatter_nd_update"
+    argspec: "args=[\'self\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "scatter_sub"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "scatter_update"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "set_shape"
+    argspec: "args=[\'self\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_proto"
+    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "value"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt
deleted file mode 100644
index 85044a8987963126ae12aaa0e5eb5d1ecc134539..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt
+++ /dev/null
@@ -1,11 +0,0 @@
-path: "tensorflow.app"
-tf_module {
-  member {
-    name: "flags"
-    mtype: "<type \'module\'>"
-  }
-  member_method {
-    name: "run"
-    argspec: "args=[\'main\', \'argv\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.__metaclass__.pbtxt
deleted file mode 100644
index af08c88d3333fa897c38cc2f6530a9c5cda15342..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.Dataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index 8b7f63e43e237864d4ef24d3b251b23199f9ee17..d877339409d781f95f7ff75a553d21d82c27fc40 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -1,22 +1,21 @@
 path: "tensorflow.data.Dataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_shapes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_types"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "apply"
@@ -46,10 +45,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -66,14 +61,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -102,10 +89,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
deleted file mode 100644
index f384323fc89bb7d21309e86ddaab2e6e1f9f212b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.FixedLengthRecordDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index a7bfa82c650e0a511cb6c8eaffceaf49fbfeaa39..f1573512438b3f40db7653bf94fd4ad282a40acd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.data.FixedLengthRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDatasetV2\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -17,7 +18,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filenames\', \'record_bytes\', \'header_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -47,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -67,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -103,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt
deleted file mode 100644
index 4f0147a52381c748eccbfee29df0d3537ba5d14a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-path: "tensorflow.data.Iterator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "initializer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'iterator_resource\', \'initializer\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_string_handle"
-    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_structure"
-    argspec: "args=[\'output_types\', \'output_shapes\', \'shared_name\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_next"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_initializer"
-    argspec: "args=[\'self\', \'dataset\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "string_handle"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
index d15dccc173b9c2633e4a4303a5c69f59b2cd4536..72fc2c3a9ee5b985723ce2dba9643ba796362dc7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
@@ -1,49 +1,30 @@
 path: "tensorflow.data.Options"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Options\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "experimental_autotune"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_filter_fusion"
+    name: "experimental_deterministic"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_hoist_random_uniform"
+    name: "experimental_numa_aware"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_latency_all_edges"
+    name: "experimental_optimization"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_map_and_batch_fusion"
+    name: "experimental_stats"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_map_and_filter_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_parallelization"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_vectorization"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_noop_elimination"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_shuffle_and_repeat_fusion"
+    name: "experimental_threading"
     mtype: "<type \'property\'>"
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
deleted file mode 100644
index b12dec8a70be5e0cd8346785b48f56b15155dd02..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.TFRecordDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index 7b7a9ebaf08b1e9fdb5e4c5b7448175611a9b2c4..690da98b1ac2097c4241ba3218caa3b476dbf397 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.data.TFRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV2\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -47,10 +47,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -67,14 +63,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -103,10 +91,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
deleted file mode 100644
index 7ddcdce2663ca0ef6409fb3ab3c29555948d7302..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.TextLineDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index 2817f900e15ccf8df2ca71aa0218ba07eef682e2..fe0bc1a4db5d4a5e78ec7479e414545b522ec2df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.data.TextLineDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDatasetV2\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -47,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -67,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -103,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
deleted file mode 100644
index 3eeaa1b185058c99d786bdad9e95c96c9ea5ebab..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.experimental.CsvDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
index 2520e28a3c708f45942eb2e73911b7a5226646e5..261129b132189ef504678058f11651dd22bdce8c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.experimental.CsvDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -68,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-dataset-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-dataset-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb304f763ea44d0d7314248170e615115b0794c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-dataset-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.DatasetStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'element_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-nested-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-nested-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4b066e563cc6196650b1ba561da7c16a80a8656
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-nested-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.NestedStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.NestedStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'nested_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b7ad64f51f88ae9c860e061db5c1ad6b5f2bcf8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -0,0 +1,50 @@
+path: "tensorflow.data.experimental.OptimizationOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.optimization_options.OptimizationOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "apply_default_optimizations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "hoist_random_uniform"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_and_batch_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_and_filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_parallelization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_vectorization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "noop_elimination"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shuffle_and_repeat_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf41c1d1d696d94ef9da5fc64272349d1533816e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.OptionalStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.optional_ops.OptionalStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
deleted file mode 100644
index 2991b12f64ca8812a6b04217dc49225edd157d92..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.experimental.RandomDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
index 1dd53b1eabdf15b662a839a07176ba4eaf8bda37..0b34bbc94269280d6cca77bca789fb74f76629be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.experimental.RandomDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -68,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f97376b328cf34eb04918bec7bacf08d254d8db5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.SparseTensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.SparseTensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
deleted file mode 100644
index 948e99ef86fa52a32b7028c69be190e384c1b658..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.data.experimental.SqlDataset.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
index 8fdd9dc52e332abdeed039bd85d31f6318d013e9..0e61890eee42a8b5b0df7bda0f99d189c4911eb9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.experimental.SqlDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -68,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt
index 0bcc8cf3e87ea8b78f28130da60a1749e2848806..6536a698b50efc9daaa72d8ae589855e30fbc601 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-aggregator.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.data.experimental.StatsAggregator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_ops.StatsAggregator\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_aggregator.StatsAggregator\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..892f8c1fb897dfc8bf4964c118aeb641dffd3caa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.data.experimental.StatsOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_options.StatsOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "aggregator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "counter_prefix"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "latency_all_edges"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "prefix"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a99db4542e0deb506d00c00f889299dd22d67e1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-structure.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.data.experimental.Structure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c8864a9dd98058c659e72ba8059182a666ea39
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.TensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.TensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-threading-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-threading-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b5ebf108018e75b6de28287a68a25a03b294b64
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-threading-options.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.ThreadingOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.threading_options.ThreadingOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "max_intra_op_parallelism"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "private_threadpool_size"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index 2a1f899dc00dc9426f538d8a1a867cc16e2068c2..2d115904925eb96164484300baf628d41d3fcff4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -1,37 +1,85 @@
 path: "tensorflow.data.experimental"
 tf_module {
+  member {
+    name: "AUTOTUNE"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "CheckpointInputPipelineHook"
     mtype: "<type \'type\'>"
   }
   member {
     name: "CsvDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "DatasetStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "INFINITE_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NestedStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OptimizationOptions"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "Optional"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OptionalStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RandomDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "Reducer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseTensorStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SqlDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "StatsAggregator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "StatsOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Structure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordWriter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ThreadingOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UNKNOWN_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
   member_method {
     name: "Counter"
     argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"<dtype: \'int64\'>\"], "
@@ -40,6 +88,10 @@ tf_module {
     name: "bucket_by_sequence_length"
     argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
   }
+  member_method {
+    name: "cardinality"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "choose_from_datasets"
     argspec: "args=[\'datasets\', \'choice_dataset\'], varargs=None, keywords=None, defaults=None"
@@ -56,6 +108,10 @@ tf_module {
     name: "enumerate_dataset"
     argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
+  member_method {
+    name: "filter_for_shard"
+    argspec: "args=[\'num_shards\', \'shard_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_next_as_optional"
     argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
@@ -82,7 +138,7 @@ tf_module {
   }
   member_method {
     name: "make_batched_features_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV1\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
   }
   member_method {
     name: "make_csv_dataset"
@@ -120,10 +176,6 @@ tf_module {
     name: "scan"
     argspec: "args=[\'initial_state\', \'scan_func\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "set_stats_aggregator"
-    argspec: "args=[\'stats_aggregator\', \'tag\', \'counter_prefix\'], varargs=None, keywords=None, defaults=[\'\', \'\'], "
-  }
   member_method {
     name: "shuffle_and_repeat"
     argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
index 3023276a1d3de86d76caa2749f7e85c45e6d9bd6..4c3d6ddd85233c356aa22ebc9a9ec395ca0a8b74 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
@@ -2,14 +2,10 @@ path: "tensorflow.data"
 tf_module {
   member {
     name: "Dataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "FixedLengthRecordDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
-  }
-  member {
-    name: "Iterator"
     mtype: "<type \'type\'>"
   }
   member {
@@ -18,11 +14,11 @@ tf_module {
   }
   member {
     name: "TFRecordDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "TextLineDataset"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "experimental"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
index ab6287f8cd080621d76fc34e2cb437960a217800..314aedda909cda8b1d8a209333b85a7792c19bd5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
@@ -6,19 +6,19 @@ tf_module {
   }
   member_method {
     name: "assert_all_finite"
-    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'x\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "assert_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_greater"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_greater_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_integer"
@@ -26,35 +26,35 @@ tf_module {
   }
   member_method {
     name: "assert_less"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_less_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_near"
-    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_negative"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_non_negative"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_non_positive"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_none_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_positive"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_proper_iterable"
@@ -62,15 +62,15 @@ tf_module {
   }
   member_method {
     name: "assert_rank"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rank\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_rank_at_least"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rank\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_rank_in"
-    argspec: "args=[\'x\', \'ranks\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'ranks\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_same_float_dtype"
@@ -78,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_type"
@@ -88,28 +88,8 @@ tf_module {
     name: "check_numerics"
     argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "is_finite"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_inf"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_nan"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_non_decreasing"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "is_numeric_tensor"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "is_strictly_increasing"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..583cbc66549223e5c954b715e2043efa5417ef18
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.distribute.InputContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.InputContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "input_pipeline_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_input_pipelines"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_input_pipelines\', \'input_pipeline_id\', \'num_replicas_in_sync\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'1\'], "
+  }
+  member_method {
+    name: "get_per_replica_batch_size"
+    argspec: "args=[\'self\', \'global_batch_size\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-replication-mode.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-replication-mode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a7a3a97aa0927b81708311d4b8b28fced217c00
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-replication-mode.pbtxt
@@ -0,0 +1,8 @@
+path: "tensorflow.distribute.InputReplicationMode"
+tf_class {
+  is_instance: "<enum \'InputReplicationMode\'>"
+  member {
+    name: "PER_WORKER"
+    mtype: "<enum \'InputReplicationMode\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a613e2d3d1dcefacdf0ec336587a46ff7e0bcb90
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -0,0 +1,138 @@
+path: "tensorflow.distribute.MirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduce-op.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduce-op.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4899f38cad253167ce0b94f79388cb97fe534197
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduce-op.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.distribute.ReduceOp"
+tf_class {
+  is_instance: "<enum \'ReduceOp\'>"
+  member {
+    name: "MEAN"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+  member {
+    name: "SUM"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..df707e8920e4488ed6b40a7f93f56b5624188c84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.ReplicaContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.ReplicaContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "distribution_strategy"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "replica_id_in_sync_group"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strategy"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge_call"
+    argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77706e57133e1186d9e98fcf9205ed4c91772eda
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
@@ -0,0 +1,81 @@
+path: "tensorflow.distribute.StrategyExtended"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategyExtended\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "experimental_between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'container_strategy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_to"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_steps_on_iterator"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9eb73d2c0d9069ec4b818abe1825503f0ea36fc9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
@@ -0,0 +1,137 @@
+path: "tensorflow.distribute.Strategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b0dd73ca1d4179b4a3323fa0a9be2fde4e22799c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.distribute"
+tf_module {
+  member {
+    name: "InputContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InputReplicationMode"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "MirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ReduceOp"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "ReplicaContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Strategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StrategyExtended"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "get_loss_reduction"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "has_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "in_cross_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt
deleted file mode 100644
index ca96f4eaece0020235d24901f51306a65676c1c9..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-bernoulli.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Bernoulli"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.bernoulli.Bernoulli\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "logits"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "probs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'logits\', \'probs\', \'dtype\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int32\'>\", \'False\', \'True\', \'Bernoulli\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt
deleted file mode 100644
index d0508acd9f4f6c190b205301223599cf5b027955..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-beta.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.Beta"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.beta.Beta\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration0"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration1"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'concentration1\', \'concentration0\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'Beta\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt
deleted file mode 100644
index ff0fbb56cd4b9e4c288a168a7c3d9e83c552b0e2..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-categorical.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.Categorical"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.categorical.Categorical\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "logits"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "probs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'logits\', \'probs\', \'dtype\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int32\'>\", \'False\', \'True\', \'Categorical\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt
deleted file mode 100644
index d75e4a2f88b29ff7f638d72f98876a230b191dce..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet-multinomial.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.DirichletMultinomial"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.dirichlet_multinomial.DirichletMultinomial\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_count"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'total_count\', \'concentration\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'DirichletMultinomial\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt
deleted file mode 100644
index b838b9ae21decba0323211f08d09fe373ababf23..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-dirichlet.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Dirichlet"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.dirichlet.Dirichlet\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'concentration\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Dirichlet\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt
deleted file mode 100644
index 6f06b7d50dd9f5f405673d572503ff549f148f33..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-distribution.pbtxt
+++ /dev/null
@@ -1,134 +0,0 @@
-path: "tensorflow.distributions.Distribution"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'reparameterization_type\', \'validate_args\', \'allow_nan_stats\', \'parameters\', \'graph_parents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt
deleted file mode 100644
index d34f9cde5d4d4161883f6d1b4646f22f054d16ad..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-exponential.pbtxt
+++ /dev/null
@@ -1,144 +0,0 @@
-path: "tensorflow.distributions.Exponential"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.exponential.Exponential\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.gamma.Gamma\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "rate"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'rate\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Exponential\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt
deleted file mode 100644
index df268b8d99eb6bf22264ddb63231074413686efa..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-gamma.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Gamma"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.gamma.Gamma\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "concentration"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "rate"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'concentration\', \'rate\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Gamma\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt
deleted file mode 100644
index 303dcb4ed3bf8416b822bb010c2e87e8ef03b7c9..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-laplace.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Laplace"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.laplace.Laplace\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "loc"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scale"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Laplace\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt
deleted file mode 100644
index ecda8acb15c49c390eaae203a0082e78e53499bd..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-multinomial.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.Multinomial"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.multinomial.Multinomial\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "logits"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "probs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "total_count"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'total_count\', \'logits\', \'probs\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'Multinomial\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt
deleted file mode 100644
index 92b9eeea223b488cda1ebcabd31ec808e78fcf70..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-normal.pbtxt
+++ /dev/null
@@ -1,143 +0,0 @@
-path: "tensorflow.distributions.Normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.normal.Normal\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "loc"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scale"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'Normal\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt
deleted file mode 100644
index e3db443c2bdaa70f7651126a30caf2062a3c6f67..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-register-k-l.pbtxt
+++ /dev/null
@@ -1,9 +0,0 @@
-path: "tensorflow.distributions.RegisterKL"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.kullback_leibler.RegisterKL\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dist_cls_a\', \'dist_cls_b\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt
deleted file mode 100644
index 02e8d576ddd00aa21005fa39cd323a92392bf75a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-reparameterization-type.pbtxt
+++ /dev/null
@@ -1,9 +0,0 @@
-path: "tensorflow.distributions.ReparameterizationType"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.ReparameterizationType\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'rep_type\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt
deleted file mode 100644
index 9aa7f9a63465c78f79ae4a8a11bc63d92d027dab..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-student-t.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.StudentT"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.student_t.StudentT\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "df"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "loc"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scale"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'df\', \'loc\', \'scale\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'StudentT\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt
deleted file mode 100644
index d1b9d3069629c552d6c6048642934f422a13dce7..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.-uniform.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.distributions.Uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.distributions.uniform.Uniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution.Distribution\'>"
-  is_instance: "<class \'tensorflow.python.ops.distributions.distribution._BaseDistribution\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "allow_nan_stats"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "batch_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "event_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "high"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "low"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "parameters"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "reparameterization_type"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "validate_args"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'low\', \'high\', \'validate_args\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'False\', \'True\', \'Uniform\'], "
-  }
-  member_method {
-    name: "batch_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
-  }
-  member_method {
-    name: "cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'cdf\'], "
-  }
-  member_method {
-    name: "copy"
-    argspec: "args=[\'self\'], varargs=None, keywords=override_parameters_kwargs, defaults=None"
-  }
-  member_method {
-    name: "covariance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
-  }
-  member_method {
-    name: "cross_entropy"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
-  }
-  member_method {
-    name: "entropy"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
-  }
-  member_method {
-    name: "event_shape_tensor"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'event_shape_tensor\'], "
-  }
-  member_method {
-    name: "is_scalar_batch"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_batch\'], "
-  }
-  member_method {
-    name: "is_scalar_event"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
-  }
-  member_method {
-    name: "log_cdf"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
-  }
-  member_method {
-    name: "log_prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_prob\'], "
-  }
-  member_method {
-    name: "log_survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_survival_function\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mean\'], "
-  }
-  member_method {
-    name: "mode"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'mode\'], "
-  }
-  member_method {
-    name: "param_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'DistributionParamShapes\'], "
-  }
-  member_method {
-    name: "param_static_shapes"
-    argspec: "args=[\'cls\', \'sample_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "prob"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'prob\'], "
-  }
-  member_method {
-    name: "quantile"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'quantile\'], "
-  }
-  member_method {
-    name: "range"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range\'], "
-  }
-  member_method {
-    name: "sample"
-    argspec: "args=[\'self\', \'sample_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'sample\'], "
-  }
-  member_method {
-    name: "stddev"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'stddev\'], "
-  }
-  member_method {
-    name: "survival_function"
-    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'survival_function\'], "
-  }
-  member_method {
-    name: "variance"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'variance\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt
deleted file mode 100644
index 90b60ef074dd2eaf911291e6c725b98e2891e728..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.distributions.pbtxt
+++ /dev/null
@@ -1,75 +0,0 @@
-path: "tensorflow.distributions"
-tf_module {
-  member {
-    name: "Bernoulli"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Beta"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Categorical"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Dirichlet"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "DirichletMultinomial"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Distribution"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Exponential"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "FULLY_REPARAMETERIZED"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution.ReparameterizationType\'>"
-  }
-  member {
-    name: "Gamma"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Laplace"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Multinomial"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "NOT_REPARAMETERIZED"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution.ReparameterizationType\'>"
-  }
-  member {
-    name: "Normal"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "RegisterKL"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ReparameterizationType"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "StudentT"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member {
-    name: "Uniform"
-    mtype: "<class \'tensorflow.python.ops.distributions.distribution._DistributionMeta\'>"
-  }
-  member_method {
-    name: "kl_divergence"
-    argspec: "args=[\'distribution_a\', \'distribution_b\', \'allow_nan_stats\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
index ea23feca84802669e0cb1e0da0f77a6cdd50908c..01b870a81639807489ec2a09dcc185137aae1665 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
@@ -4,6 +4,110 @@ tf_module {
     name: "DType"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "QUANTIZED_DTYPES"
+    mtype: "<type \'frozenset\'>"
+  }
+  member {
+    name: "bfloat16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "bool"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "complex128"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "complex64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "double"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "float16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "float32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "float64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "half"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "int8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "qint16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "qint32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "qint8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "quint16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "quint8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "resource"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "string"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uint16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uint32"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uint64"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "uint8"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
+  member {
+    name: "variant"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
   member_method {
     name: "as_dtype"
     argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
index 082e26b99bfe797dea72d27e2b66f2cd1cc815fd..efe9e74697096b4a7bac912f10c1092470daadec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.BaselineClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.baseline.BaselineClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifierV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum_over_batch_size\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..382d392f39e4044916ff16718c9708b71043bcf4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt
@@ -0,0 +1,62 @@
+path: "tensorflow.estimator.BaselineEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineEstimatorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'model_dir\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
index 7cc4191eb32548ae48a49c6bc42ac78c7f79f5d0..a7300bf06bb5bbb01c02b9050f8779910b11919e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.BaselineRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.baseline.BaselineRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineRegressorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum_over_batch_size\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-best-exporter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-best-exporter.pbtxt
index 9694268199a29c51f37bc73a2f92715c78854a2f..68145735bd528038187946db665a25a77143abc6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-best-exporter.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-best-exporter.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.BestExporter"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.exporter.BestExporter\'>"
-  is_instance: "<class \'tensorflow.python.estimator.exporter.Exporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.BestExporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.Exporter\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index ef3409b1b59dc1177676107f7628354141b7f417..e138ce936ec73c05f8f790fb63c381e56ae2f654 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.estimator.BoostedTreesClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -22,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,6 +33,10 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "experimental_feature_importances"
     argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
@@ -42,7 +47,7 @@ tf_class {
   }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 775130468fe9e753e5c22902274a0b238021a598..eae0a292a962680a53d8c683ee2d2b97e24937a6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.estimator.BoostedTreesRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -22,7 +23,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\', \'quantile_sketch_epsilon\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\', \'0.01\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,6 +33,10 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "experimental_feature_importances"
     argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
@@ -42,7 +47,7 @@ tf_class {
   }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9e1504b494e3863f770df23f9f9a92e004b8713
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.CheckpointSaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..111b7583f2cd005912c7f06d977565cd17f265b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.estimator.CheckpointSaverListener"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverListener\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "after_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
index 718f415a777a0f150972fd061f979dbabf8cd592..a540085aba48c1d7c877b41831475cb2dacf8ec9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.dnn.DNNClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1b29d670a0cbd3628569ea1c401a329f336c960
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
@@ -0,0 +1,62 @@
+path: "tensorflow.estimator.DNNEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNEstimatorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index b23c019d6c9af1865a53debc9940d7d957d5f183..f6c3910a9fe5c76bafe03a636a4e91014055ce81 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNLinearCombinedClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifierV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b78527279ca32decc71185a98f9f8270b4cd41a2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
@@ -0,0 +1,62 @@
+path: "tensorflow.estimator.DNNLinearCombinedEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedEstimatorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'input_layer_partitioner\', \'config\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'sum\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index caa9e3f1deb956a85ceefca6b12d89245f8c4ec6..9133f0d3b280dc8d2d5a263e25731594e0be2ef0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNLinearCombinedRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
index 1f5e650940259f78c56ab4d2e28260fb6f23db2b..a58d733302da9e69fe0d46d7d327e1b7868e198e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.dnn.DNNRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNRegressorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator-spec.pbtxt
index aa6ac46613fbead7457b19e1aae5f2532afddef1..376becc3f9bb0c3d830f3ed2e5a05b8d17757299 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator-spec.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.EstimatorSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.model_fn.EstimatorSpec\'>"
-  is_instance: "<class \'tensorflow.python.estimator.model_fn.EstimatorSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.EstimatorSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.EstimatorSpec\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "eval_metric_ops"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
index ebd3869c9b093e45a0b61cf443f872a8ceb07327..a1f0e76c8b87bac01e21850528e035e1baa7f3d5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.Estimator"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,12 +31,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-eval-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-eval-spec.pbtxt
index db83ba1bd8f0bd13c9048d62d74790ed2b729589..23c2544fe461f9760e873a0761059a6356e5f8fa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-eval-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-eval-spec.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.EvalSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.training.EvalSpec\'>"
-  is_instance: "<class \'tensorflow.python.estimator.training.EvalSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.training.EvalSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.training.EvalSpec\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "exporters"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-exporter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-exporter.pbtxt
index 035af70e52024f8d16e1cd12951af10aad355eda..6c3f0fd910829f0173ed78284cf92202d8186685 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-exporter.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-exporter.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.Exporter"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.exporter.Exporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.Exporter\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-feed-fn-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f24de493f24a363190cd1d323adaa75b32b0d8e3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-feed-fn-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.FeedFnHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FeedFnHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feed_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-exporter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-exporter.pbtxt
index ee37b1fa210ea816ef762590cfd1725c71262ed8..e030d401ea4122800535074192f1ed24e85af0e9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-exporter.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-exporter.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.FinalExporter"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.exporter.FinalExporter\'>"
-  is_instance: "<class \'tensorflow.python.estimator.exporter.Exporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.FinalExporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.Exporter\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-ops-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6651170ba33f491d5a5342bcd6e6814e1b973832
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-ops-hook.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.estimator.FinalOpsHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FinalOpsHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "final_ops_values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'final_ops\', \'final_ops_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-global-step-waiter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37db48bc64e2f0e955105e8094d51c851c25558b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-global-step-waiter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.GlobalStepWaiterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.GlobalStepWaiterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'wait_until_step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-latest-exporter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-latest-exporter.pbtxt
index 2a9d0290295114daa006d39f17a295a01e40da6b..d67f2bd625ed8b66565df1a68c9bb90c33c22efd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-latest-exporter.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-latest-exporter.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.LatestExporter"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.exporter.LatestExporter\'>"
-  is_instance: "<class \'tensorflow.python.estimator.exporter.Exporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.LatestExporter\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.exporter.Exporter\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "name"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
index 53ec5a0c781096a04e65ea6ae41cd755040615ef..47de660a386c3362cf880ba9eed189f2bea047cd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.LinearClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.linear.LinearClassifier\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\', \'sum\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66a127606a5be7c356a48ff7eb0751dd7db0eb02
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
@@ -0,0 +1,62 @@
+path: "tensorflow.estimator.LinearEstimator"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearEstimatorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'head\', \'feature_columns\', \'model_dir\', \'optimizer\', \'config\', \'partitioner\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'Ftrl\', \'None\', \'None\', \'sum\'], "
+  }
+  member_method {
+    name: "eval_dir"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
index 3791162619c0db1e205a7f6a028966e8f5dc2b68..5c094fe1318565443fb0864750fdf532d465cc04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.LinearRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.canned.linear.LinearRegressor\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearRegressorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\', \'sum\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-logging-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..425f0167a161104891c3bb76816fe8c5094de28a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-logging-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.LoggingTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.LoggingTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tensors\', \'every_n_iter\', \'every_n_secs\', \'at_end\', \'formatter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
index 6a1c24fa63fc074c2b4ae9b3225a6abb47958b68..bf7c1abcd89b29c29f3487cab58cfdf28103119c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.ModeKeys"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.model_fn.ModeKeys\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.ModeKeys\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "EVAL"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6e17e4352b0f909b31327a57bbdca3bc0e02a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.estimator.NanLossDuringTrainingError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError\'>"
+  is_instance: "<type \'exceptions.RuntimeError\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..82293c2c0c4e7204d9aba83f43ed2fac6bc46b19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.NanTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'loss_tensor\', \'fail_on_nan_loss\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-profiler-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..65b5fb16b0874e7c6469ef11420db146be1f0b5f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-profiler-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.ProfilerHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.ProfilerHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'show_dataflow\', \'show_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'True\', \'False\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt
index 269e18a0a700548ce01b6eb215d936da4c718a65..827b1ac5a576208e090d2db7d702675db89de4a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-run-config.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.RunConfig"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.run_config.RunConfig\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.run_config.RunConfig\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "cluster_spec"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-second-or-step-timer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64051d2bd6b69614cd210d902552ddeb8b6c8e5e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-second-or-step-timer.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.estimator.SecondOrStepTimer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SecondOrStepTimer\'>"
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks._HookTimer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_secs\', \'every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "last_triggered_step"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "should_trigger_for_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_last_triggered_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-step-counter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4368e04df3f86834b540bb5306bf66dd82ac440c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-step-counter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.StepCounterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StepCounterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_n_steps\', \'every_n_secs\', \'output_dir\', \'summary_writer\'], varargs=None, keywords=None, defaults=[\'100\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-stop-at-step-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..938b189a8c30237bb15bf73083a348e6366fbfc4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-stop-at-step-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.StopAtStepHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StopAtStepHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_steps\', \'last_step\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-summary-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..104157315f5982efb4f6b9f39e0ece905a225e10
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-summary-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.SummarySaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SummarySaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'summary_writer\', \'scaffold\', \'summary_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-train-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-train-spec.pbtxt
index 7d2f77438afa41f2d8391524470f82a22076313b..1d9f51a20e2f94803abec9fceb3c25c86122785b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-train-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-train-spec.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.TrainSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.training.TrainSpec\'>"
-  is_instance: "<class \'tensorflow.python.estimator.training.TrainSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.training.TrainSpec\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.training.TrainSpec\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "hooks"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-warm-start-settings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-warm-start-settings.pbtxt
index 43f5343359aff3b856a2b3708e4cda7cec29e146..dca2c1fe11764b6b2023e7c1c33a3e190706c08b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-warm-start-settings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-warm-start-settings.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.WarmStartSettings"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.estimator.WarmStartSettings\'>"
-  is_instance: "<class \'tensorflow.python.estimator.estimator.WarmStartSettings\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.WarmStartSettings\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.WarmStartSettings\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "ckpt_to_initialize_from"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a2a01cd5325ba7e02d9b549293dd09a4a57e167
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.experimental.InMemoryEvaluatorHook"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.hooks.hooks.InMemoryEvaluatorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'estimator\', \'input_fn\', \'steps\', \'hooks\', \'name\', \'every_n_iter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'100\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-linear-s-d-c-a.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-linear-s-d-c-a.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85292e4d7ed4e448c40186ba05ef12c351068a39
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-linear-s-d-c-a.pbtxt
@@ -0,0 +1,13 @@
+path: "tensorflow.estimator.experimental.LinearSDCA"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearSDCA\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'example_id_column\', \'num_loss_partitions\', \'num_table_shards\', \'symmetric_l1_regularization\', \'symmetric_l2_regularization\', \'adaptive\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'0.0\', \'1.0\', \'False\'], "
+  }
+  member_method {
+    name: "get_train_step"
+    argspec: "args=[\'self\', \'state_manager\', \'weight_column_name\', \'loss_type\', \'feature_columns\', \'features\', \'targets\', \'bias_var\', \'global_step\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f0fd7ce782db71ff5e790fe50e93556bf5d19e1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
@@ -0,0 +1,39 @@
+path: "tensorflow.estimator.experimental"
+tf_module {
+  member {
+    name: "InMemoryEvaluatorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LinearSDCA"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "build_raw_supervised_input_receiver_fn"
+    argspec: "args=[\'features\', \'labels\', \'default_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_logit_fn"
+    argspec: "args=[\'logit_fn\', \'features\', \'mode\', \'params\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "dnn_logit_fn_builder"
+    argspec: "args=[\'units\', \'hidden_units\', \'feature_columns\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'batch_norm\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "linear_logit_fn_builder"
+    argspec: "args=[\'units\', \'feature_columns\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'sum\'], "
+  }
+  member_method {
+    name: "make_early_stopping_hook"
+    argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], "
+  }
+  member_method {
+    name: "make_stop_at_checkpoint_step_hook"
+    argspec: "args=[\'estimator\', \'last_step\', \'wait_after_file_check_secs\'], varargs=None, keywords=None, defaults=[\'30\'], "
+  }
+  member_method {
+    name: "stop_if_higher_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
index 3cf7af8da95479cf49469b2f328db0919fd5ce95..820afac8e10a7ceeefb351896077c7cf2af044d7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.__metaclass__.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ClassificationOutput.__metaclass__"
 tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
+  is_instance: "<type \'type\'>"
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
index 2df1840c4a4f03fc08ba535b4f6557d49608fa5f..52874dd9b9316d9815c5aef51e272e6ffddb5224 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ClassificationOutput"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ClassificationOutput\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ClassificationOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "classes"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
index 5d165ccbf91865e48f40f88ff817bff03881a03b..b811e1f3dab9437ee53f6bd8fb7215b35b121f9e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.__metaclass__.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput.__metaclass__"
 tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
+  is_instance: "<type \'type\'>"
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
index fa62e8ced801d66951ef5a62ec4fdd9795226ebd..964c315e9730effac38d60f7242527e71cbf9846 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
index 743495ba98cf4db0abeba86e26b812d9e3c8695b..bdfcb9c8882f334dc2706d079918855d651484c1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.__metaclass__.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.PredictOutput.__metaclass__"
 tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
+  is_instance: "<type \'type\'>"
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
index e0160b10ce13a0b3499143d151ee7e58ad858fb2..bb82bc9e58627318b897f0610c7d852db7f98c07 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.PredictOutput"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.PredictOutput\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.PredictOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "outputs"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
index dbf4e3dec85d7d00045bfe4e7086ba23edf61a84..dcd7cbf427e3d0219a8aa94621f9502fffc10ca6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.__metaclass__.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.RegressionOutput.__metaclass__"
 tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
+  is_instance: "<type \'type\'>"
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
index 905f0e055350fe9a7d5790e531fb2b089332f279..8522834433f214e5d646ef6265b1047fb7f2cc4f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.RegressionOutput"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.RegressionOutput\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.RegressionOutput\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "value"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-serving-input-receiver.pbtxt
index d71b2a430065740c376f8e90e3244d105ac2101f..a0371a16635161a69998a1901aec8f8962f98fd7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-serving-input-receiver.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-serving-input-receiver.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ServingInputReceiver"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export.ServingInputReceiver\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export.ServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export.ServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export.ServingInputReceiver\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "features"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
index 4fe92643bf9867765499d7bf475b9cdd1686aec5..da9d05df237397c4d0fa0746a6a4e835c5d42b0e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-tensor-serving-input-receiver.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.TensorServingInputReceiver"
 tf_class {
-  is_instance: "<class \'tensorflow.python.estimator.export.export.TensorServingInputReceiver\'>"
-  is_instance: "<class \'tensorflow.python.estimator.export.export.TensorServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export.TensorServingInputReceiver\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export.TensorServingInputReceiver\'>"
   is_instance: "<type \'tuple\'>"
   member {
     name: "features"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.pbtxt
index bd72f6cd79f7dffb9f0a7f8ae43751c4ecba939d..8df585a5d9b401d553652fa8168445730eb145ff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.pbtxt
@@ -2,19 +2,19 @@ path: "tensorflow.estimator.export"
 tf_module {
   member {
     name: "ClassificationOutput"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "ExportOutput"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "PredictOutput"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "RegressionOutput"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "ServingInputReceiver"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
index f1d204a3ef96f35e31f642bcb0a61351b263d273..d3656ae0455971ccd98062a52ec0412bf6af06f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "BaselineClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BaselineEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "BaselineRegressor"
     mtype: "<type \'type\'>"
@@ -20,14 +24,30 @@ tf_module {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CheckpointSaverHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CheckpointSaverListener"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DNNEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNLinearCombinedClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DNNLinearCombinedEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNLinearCombinedRegressor"
     mtype: "<type \'type\'>"
@@ -52,10 +72,22 @@ tf_module {
     name: "Exporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FeedFnHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FinalExporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FinalOpsHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlobalStepWaiterHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LatestExporter"
     mtype: "<type \'type\'>"
@@ -64,18 +96,54 @@ tf_module {
     name: "LinearClassifier"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearEstimator"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LoggingTensorHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ModeKeys"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NanLossDuringTrainingError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NanTensorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ProfilerHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RunConfig"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SecondOrStepTimer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StepCounterHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StopAtStepHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SummarySaverHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrainSpec"
     mtype: "<type \'type\'>"
@@ -88,6 +156,10 @@ tf_module {
     name: "WarmStartSettings"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "export"
     mtype: "<type \'module\'>"
@@ -96,6 +168,10 @@ tf_module {
     name: "inputs"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "add_metrics"
+    argspec: "args=[\'estimator\', \'metric_fn\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "classifier_parse_example_spec"
     argspec: "args=[\'feature_columns\', \'label_key\', \'label_dtype\', \'label_default\', \'weight_column\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c3f04e468c4c817cd474deb42149aee3021aa43
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.experimental"
+tf_module {
+  member_method {
+    name: "function_executor_type"
+    argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
index f06e7989537eef2b0e6fa4b720e90614366b41ee..3aadd7dc341ae97fdbfa83cd3fc96fc75249a4c2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
@@ -14,7 +14,7 @@ tf_module {
   }
   member_method {
     name: "categorical_column_with_vocabulary_file"
-    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\", \'None\', \'0\'], "
   }
   member_method {
     name: "categorical_column_with_vocabulary_list"
@@ -32,14 +32,6 @@ tf_module {
     name: "indicator_column"
     argspec: "args=[\'categorical_column\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "input_layer"
-    argspec: "args=[\'features\', \'feature_columns\', \'weight_collections\', \'trainable\', \'cols_to_vars\', \'cols_to_output_tensors\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "linear_model"
-    argspec: "args=[\'features\', \'feature_columns\', \'units\', \'sparse_combiner\', \'weight_collections\', \'trainable\', \'cols_to_vars\'], varargs=None, keywords=None, defaults=[\'1\', \'sum\', \'None\', \'True\', \'None\'], "
-  }
   member_method {
     name: "make_parse_example_spec"
     argspec: "args=[\'feature_columns\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt
deleted file mode 100644
index eecfaffd0a6f6e611eba8bf3f5bb709bc9e0157f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.gfile.FastGFile"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.gfile.FastGFile\'>"
-  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "mode"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read"
-    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "readline"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readlines"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "seek"
-    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tell"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt
deleted file mode 100644
index 305251059d90b52aa2e76e99a4ec65e68b73fb79..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.gfile.GFile"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
-  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "mode"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read"
-    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "readline"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readlines"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "seek"
-    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tell"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt
deleted file mode 100644
index 6e8894180a4a685d5a35ba02df53c6e054db01b9..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.gfile.Open"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
-  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "mode"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read"
-    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "readline"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readlines"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "seek"
-    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tell"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt
deleted file mode 100644
index 65b55a8b7c4e30e349c1ea256664002b19191c82..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt
+++ /dev/null
@@ -1,63 +0,0 @@
-path: "tensorflow.gfile"
-tf_module {
-  member {
-    name: "FastGFile"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GFile"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Open"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "Copy"
-    argspec: "args=[\'oldpath\', \'newpath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "DeleteRecursively"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Exists"
-    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Glob"
-    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "IsDirectory"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "ListDirectory"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "MakeDirs"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "MkDir"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Remove"
-    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Rename"
-    argspec: "args=[\'oldname\', \'newname\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "Stat"
-    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Walk"
-    argspec: "args=[\'top\', \'in_order\'], varargs=None, keywords=None, defaults=[\'True\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.glorot_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.glorot_normal_initializer.pbtxt
deleted file mode 100644
index 483d1f8ba0918b118c76156f6cd70a5ba8c9a7f6..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.glorot_normal_initializer.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.glorot_normal_initializer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
index 162ee76ee7f900d266498c297873177ada35b542..d0facad3809f48763a6827b9fc1c66ab16d8dce6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.graph_util.pbtxt
@@ -1,27 +1,7 @@
 path: "tensorflow.graph_util"
 tf_module {
-  member_method {
-    name: "convert_variables_to_constants"
-    argspec: "args=[\'sess\', \'input_graph_def\', \'output_node_names\', \'variable_names_whitelist\', \'variable_names_blacklist\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "extract_sub_graph"
-    argspec: "args=[\'graph_def\', \'dest_nodes\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "import_graph_def"
     argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "must_run_on_cpu"
-    argspec: "args=[\'node\', \'pin_variables_on_cpu\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "remove_training_nodes"
-    argspec: "args=[\'input_graph\', \'protected_nodes\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "tensor_shape_from_node_def_name"
-    argspec: "args=[\'graph\', \'input_name\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
index 0a231f1b65155b8662bb38943bfd97c5283b9385..3c6ed1cfb8340b6e8f2599360e3c321c562e37ff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
@@ -38,7 +38,7 @@ tf_module {
   }
   member_method {
     name: "crop_and_resize"
-    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
+    argspec: "args=[\'image\', \'boxes\', \'box_indices\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
   }
   member_method {
     name: "crop_to_bounding_box"
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "extract_image_patches"
-    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'images\', \'sizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "extract_jpeg_shape"
@@ -173,16 +173,8 @@ tf_module {
     argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "resize_area"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "resize_bicubic"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "resize_bilinear"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    name: "resize"
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
   }
   member_method {
     name: "resize_image_with_crop_or_pad"
@@ -192,14 +184,6 @@ tf_module {
     name: "resize_image_with_pad"
     argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
-  member_method {
-    name: "resize_images"
-    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
-  }
-  member_method {
-    name: "resize_nearest_neighbor"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
   member_method {
     name: "rgb_to_grayscale"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -222,7 +206,7 @@ tf_module {
   }
   member_method {
     name: "sample_distorted_bounding_box"
-    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sobel_edges"
@@ -241,8 +225,8 @@ tf_module {
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "transpose_image"
-    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+    name: "transpose"
+    argspec: "args=[\'image\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "yiq_to_rgb"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
index d49181714fe44cb6e27fb149948ce6eedd5e8ec5..e3c63fe737ee655169c00c7c0b2882c84f566244 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
@@ -64,8 +64,4 @@ tf_module {
     name: "lecun_uniform"
     argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "tables_initializer"
-    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cfa3372b12bfe32eed4311c89b6448c0359c0913
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.io.gfile"
+tf_module {
+  member_method {
+    name: "copy"
+    argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "exists"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "glob"
+    argspec: "args=[\'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "isdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "listdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "makedirs"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mkdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "remove"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rename"
+    argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "rmtree"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "stat"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "walk"
+    argspec: "args=[\'top\', \'topdown\', \'onerror\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
index dccf136788da44073160931707167b7d8baa0add..8906329742c61ed08a25bcc252ec0d1dfa9e374e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
@@ -44,22 +44,50 @@ tf_module {
     name: "VarLenFeature"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "gfile"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "decode_and_crop_jpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
   member_method {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_bmp"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
   member_method {
     name: "decode_compressed"
     argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
   }
   member_method {
     name: "decode_csv"
-    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'name\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "decode_gif"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "decode_image"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "decode_jpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
   }
   member_method {
     name: "decode_json_example"
     argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_png"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
   member_method {
     name: "decode_raw"
     argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
@@ -72,6 +100,18 @@ tf_module {
     name: "encode_base64"
     argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "encode_jpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'95\', \'False\', \'False\', \'True\', \'in\', \'300\', \'300\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "extract_jpeg_shape"
+    argspec: "args=[\'contents\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "is_jpeg"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "match_filenames_once"
     argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -82,7 +122,7 @@ tf_module {
   }
   member_method {
     name: "parse_example"
-    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'serialized\', \'features\', \'example_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "parse_sequence_example"
@@ -90,7 +130,7 @@ tf_module {
   }
   member_method {
     name: "parse_single_example"
-    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'serialized\', \'features\', \'example_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "parse_single_sequence_example"
@@ -106,15 +146,15 @@ tf_module {
   }
   member_method {
     name: "serialize_many_sparse"
-    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
+    argspec: "args=[\'sp_input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\", \'None\'], "
   }
   member_method {
     name: "serialize_sparse"
-    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
+    argspec: "args=[\'sp_input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\", \'None\'], "
   }
   member_method {
-    name: "tf_record_iterator"
-    argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "serialize_tensor"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "write_file"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index 0869de02432d1ed5c492641b3d5a3ddfb786d554..a3254cbd947d9ef70617131e9f4b17f44f059840 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -89,10 +101,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -107,7 +115,11 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "add_update"
@@ -115,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -229,6 +241,10 @@ tf_class {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -251,7 +267,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -263,6 +279,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index 20f39fae1eb70da00cfcc3b6ce0d2abb79189228..b70e9ee98d5bc4900420ddb1307abf9adcd8cad0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -90,10 +102,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -112,7 +120,11 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "add_update"
@@ -120,7 +132,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -246,6 +258,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -268,7 +284,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -280,6 +296,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
index 2e9de9ebb21021ab82ed4409243e13db49d7327c..eb315e356dabc5a404740afe9d3c0c60d82fdbb5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.activations.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "elu"
     argspec: "args=[\'x\', \'alpha\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
   }
+  member_method {
+    name: "exponential"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get"
     argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index a71a59e269fa6e24b01b81bb222fef528869db68..d200d3d26d7c1b7d54eda596a8056a66e29be0b6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -46,7 +46,7 @@ tf_module {
   }
   member_method {
     name: "batch_normalization"
-    argspec: "args=[\'x\', \'mean\', \'var\', \'beta\', \'gamma\', \'epsilon\'], varargs=None, keywords=None, defaults=[\'0.001\'], "
+    argspec: "args=[\'x\', \'mean\', \'var\', \'beta\', \'gamma\', \'axis\', \'epsilon\'], varargs=None, keywords=None, defaults=[\'-1\', \'0.001\'], "
   }
   member_method {
     name: "batch_set_value"
@@ -98,7 +98,7 @@ tf_module {
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'x\', \'kernel\', \'output_shape\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'None\'], "
+    argspec: "args=[\'x\', \'kernel\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\'], "
   }
   member_method {
     name: "conv3d"
@@ -182,16 +182,12 @@ tf_module {
   }
   member_method {
     name: "function"
-    argspec: "args=[\'inputs\', \'outputs\', \'updates\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
+    argspec: "args=[\'inputs\', \'outputs\', \'updates\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "gather"
     argspec: "args=[\'reference\', \'indices\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_session"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "get_uid"
     argspec: "args=[\'prefix\'], varargs=None, keywords=None, defaults=[\'\'], "
@@ -386,7 +382,7 @@ tf_module {
   }
   member_method {
     name: "resize_images"
-    argspec: "args=[\'x\', \'height_factor\', \'width_factor\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'x\', \'height_factor\', \'width_factor\', \'data_format\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'nearest\'], "
   }
   member_method {
     name: "resize_volumes"
@@ -398,7 +394,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\', \'zero_output_for_mask\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\', \'False\'], "
   }
   member_method {
     name: "round"
@@ -512,6 +508,10 @@ tf_module {
     name: "temporal_padding"
     argspec: "args=[\'x\', \'padding\'], varargs=None, keywords=None, defaults=[\'(1, 1)\'], "
   }
+  member_method {
+    name: "tile"
+    argspec: "args=[\'x\', \'n\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "to_dense"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
index 9eee9b378964a9947b067b7ec495ef6556ab6d0c..7d298e95135ebf41230d72ff488fef30be682edb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index 5bb949c5bb650acee91b14a4d6bf95b36029edf7..133205ab88b47afad32fc70ceca93513768a3b19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
index a5340d52c1af6d69da30fd710bcee9d832917574..d766c09ac5efaa9d0e4ffba4e495385130c7e770 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
@@ -22,6 +22,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
index f71292856cd29b2e52194bec8a586686fbfad667..605f74e5602a63f5a18c31cb26113d300ec76e7a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -5,7 +5,11 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'monitor\', \'min_delta\', \'patience\', \'verbose\', \'mode\', \'baseline\'], varargs=None, keywords=None, defaults=[\'val_loss\', \'0\', \'0\', \'0\', \'auto\', \'None\'], "
+    argspec: "args=[\'self\', \'monitor\', \'min_delta\', \'patience\', \'verbose\', \'mode\', \'baseline\', \'restore_best_weights\'], varargs=None, keywords=None, defaults=[\'val_loss\', \'0\', \'0\', \'0\', \'auto\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_monitor_value"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "on_batch_begin"
@@ -23,6 +27,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
index ee400b31c43829efba156298d5ee807cdafc8a98..cd893e67269164781d6a6b6294a199014d40fed8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index df8d7b0ef7afca17338a26388c38827b5b306f95..50f2054cabb1b8f6c46a9537ea923a18f87e5c80 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index ce1a9b694d8708720e0eb677afd25607c6262e9c..9ed9db0a89b49b88098e15baca414ff78b6f10e6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index 48bb24a05274addca03f11acef99607f78b92e51..3d8d1363bb4e4de818788efbf3c997594350006a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
index d8bb8b2a7d0f491c7ec2b30096a1acaf04681a56..5012f1517d57dd646d82ab669cb279b6363dd6ec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index dc27af9552a88650261b4f0694ea0265e6bda05c..73652c2b61259f768eca76b995ae4592df868392 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -27,6 +27,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index 5a3b791c0adc0d61129d38b2995ee9077cf0988b..24db71de1182d58b78fec0419aa9cb48a2e315d2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
index e58ba18c1c0d06df3a53d93ae18f5bf0931df329..c5503c69a5f3cb6765c984778c0e3626369ee815 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'batch_size\', \'write_graph\', \'write_grads\', \'write_images\', \'embeddings_freq\', \'embeddings_layer_names\', \'embeddings_metadata\', \'embeddings_data\'], varargs=None, keywords=None, defaults=[\'./logs\', \'0\', \'32\', \'True\', \'False\', \'False\', \'0\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'batch_size\', \'write_graph\', \'write_grads\', \'write_images\', \'embeddings_freq\', \'embeddings_layer_names\', \'embeddings_metadata\', \'embeddings_data\', \'update_freq\'], varargs=None, keywords=None, defaults=[\'./logs\', \'0\', \'32\', \'True\', \'False\', \'False\', \'0\', \'None\', \'None\', \'None\', \'epoch\'], "
   }
   member_method {
     name: "on_batch_begin"
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index 5c2d336353aee7fc98b45620adac4f4bcda05ea0..de6e8ef072558e6d926ea125aa5056e3c229d37f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -23,6 +23,14 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_train_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_train_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_begin"
     argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d814b2c8b553f1b2a07f9d9b97dc70ec0674969
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.experimental.PeepholeLSTMCell"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..164edbd66ab2487a980155eabcf18ed8446e2c14
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.experimental"
+tf_module {
+  member {
+    name: "PeepholeLSTMCell"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index 5510465d7b015e4989472b06c9d00ec9772373cf..b84629540e700f242f885064c92309c294693a11 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index 38ec8a0aff0b9321f3a7ab2cfd9e6b75a8228e4a..5918a13ad8629582829049485e896688ecad9579 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index 41cb8e30bfb57068ebe787f14f69ccc467047f26..599da06427dfe4f28e757a7aac8d8a14856a4556 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 9a7aaa8e961528aa750248e02f44403cab10a413..f9ff1538c8134d96051ad81d35c73e59c6a8cc57 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index c3dd2ad046ec087fd12553a2bb5243939c995e64..723fc9cdb0d0ad93470e22fd8c147d3ecc92af91 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index cc303bf7b98bb81cb0646fc18df0a4c5c70f1917..957ce2f0ce86f8df3eb8b57606229fb661eb52f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 628447ce3555628b651536d6c5b2a7716d59085c..a52c0af68175420dc2a1993d1f025d36705538e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index f03c986c22210906ad7bdc8b880753469b31aa1b..a004db62ddcaaae02a411d8db51f4026ece1384d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index c440604aae62b1ee1c7b7c0b5976ef509af54a7c..44f83d1387cb2ec681f50f7b1f0297f3f74594ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a01eaf8a12626257e97d135f50c06c7ea32fca27..8378faf7188ec594865d4b68c8ea8cae284183ca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 0d6698f2ef4c674bf8a4dfc026eb209a83dcb8e7..9d5655c9644e3a2394a346bed78fc478cf60ba8d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index f1b23be48f7fec2051f1985381058d769eb8c2f8..5da79268129fc5c08cbd37686333847cbb32730d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index 0672cd5b7b8fdb1967e39c9163635372f73459b7..d37a6b47105225d7b83b6a264b944ceeb583a6c4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -97,6 +97,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index b25ae1e82e8a1f315553337a261a2d8a46301fa0..1ad7a91be0ba48d0dbab19da8c7cd9ca89095918 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index bb1918eba65659d9ede888400c24b3a5121d6052..cb9abc25396bb63a3c40de5cc52f9df7ed20071e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -178,6 +178,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index 16e0fd5a3131723b3ba3ef3ae6d93fa6426dbd47..47dba1d81f8f97a60fe72ec521f82a78ee5f3505 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 065bb4d35b422ca5ddaceec5726dd0e0bdb7027c..fd649418961301f150aac3dabc1bdf0ade4a9c28 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -84,12 +84,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 543bae6fa96fa3ae51775e865bf95ea6f79c8e94..1b1425d53197db8b59abf51fe93c0b0c45299956 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index c7ba6056f9683badbbf3423faa98277a57d4cc45..1741063fe8b09acf3865e0a135e96bb715dcdcfa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -84,12 +84,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index 072943dc2c709a7cee26c3439e02e11455187282..50feb4f458ad1a9cb2b2bfe5d67997b7551eed74 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index 222a1ef4fc5d19afe2c111c169c2f0bd38c331d6..faaa535df9fe03ad07862f0793f8ebea67b405ca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 8f4f7918ab3eb8f73751e6142d5a1ceadd37a6e2..4079329d1ee2a61270fee38426bb8a0859c38ce3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -84,12 +84,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index f93906717814d4df7dfbf983d6cdbef358e9a55c..32e56696e1617f7810792e3416a2ebb2037d23c2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 93c442bd55ace0f55fce81fd14e7f05cb13ea3cf..381abe73401fa3a588873d643324fc020c159e30 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -84,12 +84,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'output_padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index 471b18ef8500a279fb07bc893e2c8100d76d7bf1..b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index 0f250a09b7eb69871e7e89d30da817aeb1d896fc..7aeff8003c322e8a8168dd70481a8b30b08762a8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index f52128483c67321e4f0e5f0cf5a9fd3c65794561..a1728d9d4f9a1e677646db04c4d0df9572e21208 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index 98daf3bab128357ffdde2e8ffa4f61fd5c6493f7..8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index 64e7a9046b0852bd44119c4711ef1e3627346aa8..7758209adf8fe7a1306fa5ef125935dafd925c3e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -128,7 +132,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 6fdffef776827f64eafaa914c1ba3938e124c816..7c463ff1257599366be049edce6cc06140906286 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -128,7 +132,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0781a93bd56c5ebc77e1fb650497621e49d7ee1f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
@@ -0,0 +1,179 @@
+path: "tensorflow.keras.layers.DenseFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\', \'cols_to_output_tensors\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index 3ac3825759391b7ea21fd6e3b3b149bb9e731479..4960d0264e96e872ea5c49a8841cef20bd5eb37c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 280ec8c25fabe1be63c9aa9a2c7f168315c219d7..8fad7535f882718462a11e27e75732e3097cb87d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index 560f66f9c7a1f7e42e27c739a6c71671f8bd147b..5b425f2d4d7a8a897280490e26922766d8bf7065 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index c0543529c3884f20383911f32ea04c07fec4a050..f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index 04eb2824b9b14cf45eaef263282ffc6778bf709d..82b761fc1761bb3e7638f7a80bc80c6433162d04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index f400432915f8ce892a3297a23078f140eb96db7b..c9ff323877e06b6dff274644744d425e3a9b7932 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index ab176b441a246d93b88c00cd6decb34af175ad86..9b4165d4cbf88fefd2bb684dae70ea8afc01357b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index c3895a0ac127bc663f2a323661c1371a428159b0..f225f7c4309615919fb05df05f2ae664bde80097 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index a0fe598ab93a4e9712a1ef631283e8e552ab1e64..855d001700179fb634d1dff78585d340420abe7f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -179,7 +183,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -191,7 +195,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 55e0d7ef023ac4ca5e89f640c5ebb79199c31afa..2c404c99cd2175cdc8b60b229e4410bf280ebcb7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 38fbff5e4a3d2c892b0601c54e52690dae5760bd..6f109d59d0f6fcd2b4650719e3b4f653baec7d23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 5ea61d118de15b1b18410abb3befe404a6ecaecd..69f8a9031d32eb73bb44291cdf330d738d745cf9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -111,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 929f48df23180a2c5e21c110e0e1d343596ecd76..4299f765e525b136e289bba169becec06e19ffb1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 2e6d59337f1df94e327b506248eb74ab11bd6013..9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 11dca17c6df94170f442a88da0c4459caa70d0c1..625e81fd2322ceba153fa65c138948ce43843089 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -111,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 4e3e258430cdacaf55aed5d46411d2b74c9bdf2e..2fc769742c70c5665c9cb77ad246fcdb49366d5a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index fb9166316f6a641eb12a5664100e31d652148a84..e307a65c7c565660e1f2b6b6b74dc5970425eaa4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 278429af6febdfb9802d86992a1e46bf17633562..4394ad0364e89fd3531d6625e52540991cadf973 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 87b7f6797a0d5bef8c5a4ff582c30433eaced2d4..050ed39fe98dc7cfdf6febe45e235d3ae7cbf486 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 98bf96fa0c251c5f6de8878d48e651ac3346ff38..436191821ef4689351b6124cf2a20afad917e4ab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 935a69ab2f3a93db608f6e18baa7359944a428a8..4ba540aa6adc72b572aa9340f89967d69ab78a3c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index c9d4158d1c434655abb11b92269e6e70ad2d1f91..a2e9322cb3fd4e56af708d5c4e17b660f7bc2247 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 9953102ff991bfd4f0568120dd7aef07f75ea208..5d16a57fc1aeff9939220de8043fcae39e3d953e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index 2617f5a95fa631cf0b92e1fd2feef7457f96fd80..9dd29c1251ef2eacaf535a3f10f3d42dc36624a2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
index 5fd0a47a68c0d4ad218c4c64cc6be8f603d9673a..bc3ceb67a4e7506b42fccd6b227891b9eef8147f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_spec.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index e9f6ef45aaf1c775ea1b8dd157737f65c87e232f..0045d5775e2c19df21428bd4420b6e5612c8002b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index ecdbf48157f5c4aabab065cc99191b1cd6cf57f0..529c750f98715ec30313ed34c9023a845061a3df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -179,7 +183,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -191,7 +195,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index 2e0b6bac24fd63988b28c1099d40581989b783df..d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index 1e93d1118a4d306d5427d9b6873de1746d93b764..e1f5491180903f7d6931cc09755cabb715bbf233 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -87,6 +87,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bfd36012a7edb8a74198a87a86577278be3fdcd4..9b69d9a9447f42907236b5cc8c7672012f96c38a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b66576c96b8503d3ebb90f02ed19233223a269a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
@@ -0,0 +1,289 @@
+path: "tensorflow.keras.layers.LinearModel"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.LinearModel\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'units\', \'sparse_combiner\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'sum\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 5ad5990d7e624c4f6b1dde92b4608c65aeb19db1..fd52259432577ac94dc702d4411ad5c0eed1ff10 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 40d03369a5235f394832e3e2f48710bb069e9aac..5fc8af0d03564c649dff6e9df70d10731319de40 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index 86666b51bb8c8dc22deb95f05cb9edfb10688015..7f8932270e63bc02852c5b64e53694e7e26be08b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 238d96cca62e6e8dc2de2b527dd8a80644ff32fa..4723b99cb0792e1ce0bdc45e46908da8c2b5359c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 85f23df671d2772995ec01bb09e191237d60e6a7..173c5d4a8b149c4e23683cf375e8d793db7faa5a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 235806b96500473fe95dd1b25aafe7f091bdb36b..14e1899e145224e411d65cbf481060a3b2cec0f1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 4a45bf7997d819140d1c19907535ef2b2d818db9..a708e652bf0e82dea0f58034a81a040a39550dc9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -83,12 +83,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'channels_last\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index fda2562fc8c51623f5c4b33e23319ed35229905e..e6706b5cf9f32bda78adc4e2db5916a5750cc82e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index 71d2d09a8d1d7addf91d7dc4ca109f8c2d45aed9..a73c082d1bba0453b742f76bacf0ad6116ba79a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index 12949b39a6f7affa657d1dccdc49ad0dc37e9c2f..f3f195554bbf4a43efaf2af0fd278a23bf270994 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index ab16d0021e627e6a2a821a0185ad71eb5bef1835..f345d1d67b2ce0200c64b1aeea5f39821d070bac 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index 61ccbf5962791ee1c0b35cc4aba422ff5cacd456..31cb8bc177c7a9e365101e75108a29900fbda124 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index ce2320d7030d05ba1e065f5bbcf8a18014891b5e..44cccc92bd2f1ff0335c22f2967865dc88a96ff7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index 69848af8cf876ad1232a0bf7c419f52ed68af9f0..b55e191ff1ad6997550966bbb6154a81a489575d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index 68b6678d4887323f43bd0a886a76e02a056a4260..e9575436e5b14ac8c52a0b59c86937886eab5f40 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -110,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -122,7 +126,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index 413f45f018ae0ce9ccf0e459b24d544c456e4c7c..98223b207f2ecfd5b7af8a53390166e53a7d4f73 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index 9c61ff602744c00f9105a3f297151b49a8a3dead..2df918b16b2552323d75083bfa80e328c0639cfe 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index baa91804c49f86a31093aed0c0a56613f7c1afee..ce5f9e21290eeddc0052257191ac4a6d068c1366 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 15a5d6ac9ea6e087dc0d76a2ab48b08448bfb6ee..a0bb917775fd9edb5d909bf850310e0596a88209 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index be43bd5b3c13632711a49cbbe6c85527d46d46ec..d7942f201bdbfa8d1577813be461a5905b5c6c90 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 6105992c7a3a92d00718fe3287412af3c752db1d..f7ac9042d46f46ab35d18c62e5d8841679a18ca9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 1b6cf1e9ecb08a789212da141971434bd63988a6..e5a92688220f6e227b317d71a70fde01df4c432b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 29488a37f8f29f953d2b8b7e447c331df3244c84..0fe2c974a762784a82a6b97e116357be2a61d84f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 182efb83b8621b86672d909ca9929380fad2e1dd..2ee5873f0f11688019dec3a6cd69db06d99b9caa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -149,6 +149,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -167,7 +171,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
@@ -179,7 +183,7 @@ tf_class {
   }
   member_method {
     name: "compute_output_shape"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "count_params"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index d29731ecf9d5387a324104865af5f563d287c60b..5b8f64aa35725d0ea44fc5c5b81952fd839503e7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index a6d7494ca7d2230298a442b86766f46bc58a6d54..240cb6e562f77467d94ef95db2374150e318bc04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index c36e802693df564702100a652f3ccc2e95e4c40d..6226c469f8a534f96f6ea991fa5e7d2cf0019e3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9c46cfe40fd6959b526d6ca271bda3182daa1188..34dabce6d8dd0b1b6fe50a008a981e1f06a77edf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 8982f787940dd65291580781b5dc95941d804071..0ddf628ace582db259ebe0b211aba6e6362b5d5b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index ec2cc502984d302b243803b04b4f9d60cee43d05..12eb35ad154a514afd9c900cb2dbece8af28c49f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index d7bc1980f32e523781a68e80312905bc355f0509..c41020c2b45cc88c9b63f3b7a45c35066794dfe2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index fec2de6b49ec1ffaf45b9ee9048bcce37425e919..479f89cf6ae93e8d6ae02e304a51a145164df7de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -93,6 +93,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 3d285e7f17db3e8cdfbacf0056a4c56ffa7e67cb..233363ce02614f184b43a059889c7475b6a8c50b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index 40a56a0c948887493a8a4782f122c634da58aeb1..cb6228ac446bd236df88f94eb6e9e717ea38463d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -82,12 +82,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'size\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'(2, 2)\', \'None\'], "
+    argspec: "args=[\'self\', \'size\', \'data_format\', \'interpolation\'], varargs=None, keywords=kwargs, defaults=[\'(2, 2)\', \'None\', \'nearest\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 728eca415a80842291d5684e55632689ceea4099..03bad3ccb613a225ad56e128ea680fc9312151e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index da64e77c39c0e116ff725bb05526882541dd6056..158996792a47fab0e7aa26d21d4bb7f281ca76d2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 2f505f9293f429490543ba2c569668f4b2ba3ca4..63a56cd3eebe271f66258c9a0acb974764555b34 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index f82c77072e6969dd57f89f4a971e59e28b4bfc63..965a4cca04651e123c5bd93484200a58b39918ba 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 54e01a99177cde5fbfaf5e1e0ac310bef3ea8eae..1a624308878a68f1b48cb0f8b5e08dafbbfa0333 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
index 9d7e5bb8c7808689bedd8abb835e61c1f38fdb1d..3b4724ef104878df0caada75b0ba68740dc93f8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
@@ -124,6 +124,10 @@ tf_module {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DenseFeatures"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DepthwiseConv2D"
     mtype: "<type \'type\'>"
@@ -240,6 +244,10 @@ tf_module {
     name: "LeakyReLU"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearModel"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LocallyConnected1D"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f7da93f6f412ca559aec2f6acde2b80a5c93c86
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3a7cd80973259bd5cdfe382c656a9478f8933d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..712bb2ecd3526c354cbcf640e689526b2e415a13
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7fe362da89b47a925cd4708909e1c882a9a23aca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a5718533500d9508c558d25d13fc6b61518a73a0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..200006db355ca4dc8eb2f509bcb9da7543145548
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f20ed26e2ea2819554159a9bcecb4141601e4a19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
@@ -0,0 +1,28 @@
+path: "tensorflow.keras.losses.Reduction"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "NONE"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SUM"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SUM_OVER_BATCH_SIZE"
+    mtype: "<type \'str\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "all"
+    argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "validate"
+    argspec: "args=[\'cls\', \'key\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
index eca6b915388ebff0103f7ad16f43c6be0df60b7d..c198096d252cd9a3706bcbf6f1e4a1199ec7a1f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
@@ -1,5 +1,33 @@
 path: "tensorflow.keras.losses"
 tf_module {
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Reduction"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -22,11 +50,11 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_hinge"
@@ -106,7 +134,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2db07df5235e150f691a12d6b332c6d0d241ac19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.Accuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..904ad3a21a05895b23e30dab82a89a31c74dcfca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.BinaryAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17b74924fab4f596a010d6b9731b474433a8153e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.CategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49f577e1367aece126449923f77f4f6c89493e99
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.FalseNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalseNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8baf858669a446a11b44e044f36bfde61e440bb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.FalsePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalsePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40fe64bbd2cec45b9a8c4e9b041d3fa858af1327
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Mean"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ae6a85026da80cd071984aede8d0ec4e9cd571c5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Precision"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31068a51d510a7b95f62f61f03d37176c0fca55d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Recall"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa77d1972cea42184fbbdb91e117b08ba38328fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.SensitivityAtSpecificity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivityAtSpecificity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'specificity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c17452292a031d42f3da0d5844e99d1272dad25
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.SparseCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67857aa89f1769c736d810cf5f73739021afeddf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.SpecificityAtSensitivity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SpecificityAtSensitivity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'sensitivity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b5eb8d0de53960c3a98409119709c1307aa6379
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.TrueNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TrueNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b9c470e32d7e038f9ba11e4f96ab6eaa6b60a87
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.TruePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TruePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
index a296e131586504a3fadc9e6fe54079ee0f8270ba..905021dd790205e64a6f9839218200db98941927 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,57 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "Accuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalseNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalsePositives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Mean"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Precision"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Recall"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SensitivityAtSpecificity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SpecificityAtSensitivity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TrueNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TruePositives"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -26,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -34,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cosine"
@@ -110,7 +162,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index 4011719317051a2f153216591e4c571e8b0b2c15..c58c7bef22dd4bff95d8ff07a10e20bb1bc463ad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -89,10 +101,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -107,7 +115,11 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "add_update"
@@ -115,7 +127,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -229,6 +241,10 @@ tf_class {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -251,7 +267,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -263,6 +279,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index 8a12ac1ad8d3267adf503d909b7f47d3c513a64f..473a1c16fb1edfbf37a7752e273566c1310853af 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -90,10 +102,6 @@ tf_class {
     name: "updates"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "uses_learning_phase"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -112,7 +120,11 @@ tf_class {
   }
   member_method {
     name: "add_loss"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "add_update"
@@ -120,7 +132,7 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -246,6 +258,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -268,7 +284,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -280,6 +296,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt
index 754b3b84b08b08c7d12eba4ddad0a483440055a9..ed9967856200d62fd152dfec85c8ec36403bcbc0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.pbtxt
@@ -36,6 +36,10 @@ tf_module {
     name: "estimator"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "initializers"
     mtype: "<type \'module\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-generator-enqueuer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-generator-enqueuer.pbtxt
index 939fd547d06bbd03b7e1a1db1404263ff01fd07c..6f5ad2dc963961a6ac7be7656cce4aeb77815e0b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-generator-enqueuer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-generator-enqueuer.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'generator\', \'use_multiprocessing\', \'wait_time\', \'seed\'], varargs=None, keywords=None, defaults=[\'False\', \'0.05\', \'None\'], "
+    argspec: "args=[\'self\', \'sequence\', \'use_multiprocessing\', \'random_seed\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "get"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt
index be4496e753f8bdcd76a4761f9bd1804a77380359..8177cc71ed34ed5d0ae57d25ee2da70067411ccc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\', \'stateful_metrics\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\', \'None\'], "
+    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\', \'stateful_metrics\', \'unit_name\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\', \'None\', \'step\'], "
   }
   member_method {
     name: "add"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-sequence-enqueuer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
index a9e499d1009b5a7458080db6c10a948af21c7b6c..aa36d66f9215f1c61f539af25378e71b079b02e0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-sequence-enqueuer.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
+    argspec: "args=[\'self\', \'sequence\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "get"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
index 81b91d2780faa6b8ee61fc1201d7ecaf17967b09..138d97b11f23873f98e6bbfc5d0402dc65fd98b3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.pbtxt
@@ -70,6 +70,6 @@ tf_module {
   }
   member_method {
     name: "to_categorical"
-    argspec: "args=[\'y\', \'num_classes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'y\', \'num_classes\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'float32\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling1-d.pbtxt
deleted file mode 100644
index c82e67526b21696a7d56517dc2cb6998882dc7a5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling1-d.pbtxt
+++ /dev/null
@@ -1,186 +0,0 @@
-path: "tensorflow.layers.AveragePooling1D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling2-d.pbtxt
deleted file mode 100644
index 1d031cb5f8461145127b0f13d77e6b8774f5a0b3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling2-d.pbtxt
+++ /dev/null
@@ -1,186 +0,0 @@
-path: "tensorflow.layers.AveragePooling2D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling3-d.pbtxt
deleted file mode 100644
index a8dda6655df1d06ca77b74f0a992c8fd7e7a357d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-average-pooling3-d.pbtxt
+++ /dev/null
@@ -1,186 +0,0 @@
-path: "tensorflow.layers.AveragePooling3D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-batch-normalization.pbtxt
deleted file mode 100644
index 97f65ed89436bd0b4027bb0cbeb80b6f1419269c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-batch-normalization.pbtxt
+++ /dev/null
@@ -1,185 +0,0 @@
-path: "tensorflow.layers.BatchNormalization"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'renorm\', \'renorm_clipping\', \'renorm_momentum\', \'fused\', \'trainable\', \'virtual_batch_size\', \'adjustment\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'<tensorflow.python.ops.init_ops.Ones object instance>\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'<tensorflow.python.ops.init_ops.Ones object instance>\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'0.99\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv1-d.pbtxt
deleted file mode 100644
index ccd9578f0d62bd70ea252ddeac587d59c926b018..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv1-d.pbtxt
+++ /dev/null
@@ -1,186 +0,0 @@
-path: "tensorflow.layers.Conv1D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d-transpose.pbtxt
deleted file mode 100644
index 9cbb58d721bb49bde562a57728a9ee46968e611e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ /dev/null
@@ -1,187 +0,0 @@
-path: "tensorflow.layers.Conv2DTranspose"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d.pbtxt
deleted file mode 100644
index c75ea3911e17bc879d140068ef54521effd2824e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv2-d.pbtxt
+++ /dev/null
@@ -1,186 +0,0 @@
-path: "tensorflow.layers.Conv2D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'(1, 1)\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d-transpose.pbtxt
deleted file mode 100644
index 5dc834e5141e58d255357e02d7446a06e6e2aa45..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ /dev/null
@@ -1,187 +0,0 @@
-path: "tensorflow.layers.Conv3DTranspose"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d.pbtxt
deleted file mode 100644
index 96ab209874ac14d6acf2e8115e7f04fc35c4b2bd..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-conv3-d.pbtxt
+++ /dev/null
@@ -1,186 +0,0 @@
-path: "tensorflow.layers.Conv3D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'(1, 1, 1)\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-dense.pbtxt
deleted file mode 100644
index 7e9656b3525c1d53940b869607616ff414a466cf..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-dense.pbtxt
+++ /dev/null
@@ -1,185 +0,0 @@
-path: "tensorflow.layers.Dense"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-dropout.pbtxt
deleted file mode 100644
index e9a2269a6e8de1f9a12f1b54d2e6dced3d4f8902..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-dropout.pbtxt
+++ /dev/null
@@ -1,185 +0,0 @@
-path: "tensorflow.layers.Dropout"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'rate\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.5\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-flatten.pbtxt
deleted file mode 100644
index 7d2eaaab2a8cb9159214a16ba65473d0b6870ac4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-flatten.pbtxt
+++ /dev/null
@@ -1,185 +0,0 @@
-path: "tensorflow.layers.Flatten"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-input-spec.pbtxt
deleted file mode 100644
index fd02c919aeb5a536bd052324618983af699e7c47..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-input-spec.pbtxt
+++ /dev/null
@@ -1,9 +0,0 @@
-path: "tensorflow.layers.InputSpec"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'shape\', \'ndim\', \'max_ndim\', \'min_ndim\', \'axes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-layer.pbtxt
deleted file mode 100644
index 8bc3eb26e9ca0bf0f129db336b7ca23466fd036f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-layer.pbtxt
+++ /dev/null
@@ -1,183 +0,0 @@
-path: "tensorflow.layers.Layer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling1-d.pbtxt
deleted file mode 100644
index 6a0dcce56ac0184ffe995662fd62b89e16257a29..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling1-d.pbtxt
+++ /dev/null
@@ -1,186 +0,0 @@
-path: "tensorflow.layers.MaxPooling1D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling2-d.pbtxt
deleted file mode 100644
index b6c84edf2a2f86240369b4053cd7351d0b59442d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling2-d.pbtxt
+++ /dev/null
@@ -1,186 +0,0 @@
-path: "tensorflow.layers.MaxPooling2D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling3-d.pbtxt
deleted file mode 100644
index 062a02fa590537b9efbf540a874eeaa6d36697f3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-max-pooling3-d.pbtxt
+++ /dev/null
@@ -1,186 +0,0 @@
-path: "tensorflow.layers.MaxPooling3D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv1-d.pbtxt
deleted file mode 100644
index eaad0fb23ef7501c8c5b7acee6a9677665b7057f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv1-d.pbtxt
+++ /dev/null
@@ -1,187 +0,0 @@
-path: "tensorflow.layers.SeparableConv1D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'None\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv2-d.pbtxt
deleted file mode 100644
index ece28a8ce962d8fafb3f7a397a814b903e915d48..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.-separable-conv2-d.pbtxt
+++ /dev/null
@@ -1,187 +0,0 @@
-path: "tensorflow.layers.SeparableConv2D"
-tf_class {
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'(1, 1)\', \'1\', \'None\', \'True\', \'None\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
deleted file mode 100644
index 0c24e9c7ddb2849732241c718bd08d31fe418e8c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.layers.pbtxt
+++ /dev/null
@@ -1,147 +0,0 @@
-path: "tensorflow.layers"
-tf_module {
-  member {
-    name: "AveragePooling1D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "AveragePooling2D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "AveragePooling3D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "BatchNormalization"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Conv1D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Conv2D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Conv2DTranspose"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Conv3D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Conv3DTranspose"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Dense"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Dropout"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Flatten"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "InputSpec"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Layer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "MaxPooling1D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "MaxPooling2D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "MaxPooling3D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SeparableConv1D"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SeparableConv2D"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "average_pooling1d"
-    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "average_pooling2d"
-    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "average_pooling3d"
-    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "batch_normalization"
-    argspec: "args=[\'inputs\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'training\', \'trainable\', \'name\', \'reuse\', \'renorm\', \'renorm_clipping\', \'renorm_momentum\', \'fused\', \'virtual_batch_size\', \'adjustment\'], varargs=None, keywords=None, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'<tensorflow.python.ops.init_ops.Ones object instance>\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'<tensorflow.python.ops.init_ops.Ones object instance>\', \'None\', \'None\', \'None\', \'None\', \'False\', \'True\', \'None\', \'None\', \'False\', \'None\', \'0.99\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "conv1d"
-    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "conv2d"
-    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'(1, 1)\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "conv2d_transpose"
-    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "conv3d"
-    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'(1, 1, 1)\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "conv3d_transpose"
-    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1, 1)\', \'valid\', \'channels_last\', \'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "dense"
-    argspec: "args=[\'inputs\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "dropout"
-    argspec: "args=[\'inputs\', \'rate\', \'noise_shape\', \'seed\', \'training\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'None\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "flatten"
-    argspec: "args=[\'inputs\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'channels_last\'], "
-  }
-  member_method {
-    name: "max_pooling1d"
-    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "max_pooling2d"
-    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "max_pooling3d"
-    argspec: "args=[\'inputs\', \'pool_size\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'valid\', \'channels_last\', \'None\'], "
-  }
-  member_method {
-    name: "separable_conv1d"
-    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'None\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "separable_conv2d"
-    argspec: "args=[\'inputs\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\', \'trainable\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\'(1, 1)\', \'valid\', \'channels_last\', \'(1, 1)\', \'1\', \'None\', \'True\', \'None\', \'None\', \'<tensorflow.python.ops.init_ops.Zeros object instance>\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
deleted file mode 100644
index b6dee6317604363275a128fe8d83aaa9473a257a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorBlockDiag.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 973705dae2fabbef0eafb38ad12e96c747aeee27..773c74e64d13ca4a840b7f599fc2cbe9c161cd03 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
deleted file mode 100644
index 3b33f3da97ec2ecb3f94e8bc309be2519fc79c62..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorCirculant.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
index de917706d55214cc59f3205f0778d600a356a5b1..533544d21f2753f785113a30518f4fcbcff96cd7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
deleted file mode 100644
index 591bc9631a1d8ecbbd6e133b99c67e432399d73f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorCirculant2D.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index c4e6a21c3ac9324f5dd445dc65415c2abb4c6e9f..e3926eb6d4714731d09ff9c5b75a89830c06e7c1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
deleted file mode 100644
index d643139a53fc501fe2997a2b9f2d11c57b96f2e4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorCirculant3D.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 2e085a8e289e21173789041efb9254e992bd723b..ba209df7824a9cc076499458e35acd7dcf1eaf35 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
deleted file mode 100644
index 1adbcb41adfac33acfdb415662ced7992e21385e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorComposition.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
index 42d22bce42d8850a784afae3f67771ef1cfe5403..081fb0e08bcd1b35ab44459d1c8eb0857dd14956 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
deleted file mode 100644
index 023d90ccdba8a8739a11f4691d33b7087bedcc0b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorDiag.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
index d6749fdcec69425e83a044409ec695d2661f782e..2014a04301618c20af5cf6f1144eb4dbda2479e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
deleted file mode 100644
index 381072e76c4d069ebf51fec44079b30f17cafc06..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorFullMatrix.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index d9f363d1336210623536e8293a6290d9ebfc2fe1..9a87ae9687741090485bd8d4d0d07d359a2015e7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
deleted file mode 100644
index 5d115b35fb79cbc176a9e8a9bf1ec0f0edcc79e6..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorIdentity.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
index aac7ee31ed62c22b2e86d287d48c68c7e905fd00..33afb835ce1d524991c0024bfb87c29a72aac08e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -76,6 +76,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
deleted file mode 100644
index 5c6784dd02104129a9ac38fe171d87c115efbbf0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorKronecker.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index c11d39082939eda4520b3955b767022bd485b5be..a9078c8ab5cca078237a29febabdbbd4a8b6c89c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
deleted file mode 100644
index 1f0d33298a252a8b3da6eea9fd4bc096e8dd6745..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorLowRankUpdate.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 3ee800269e617390c25248a2c847cbe259b18e79..4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
deleted file mode 100644
index 2683430f4fc5d96d63c5b6fdb4035d6e5e8ba609..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorLowerTriangular.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index 63a1bc2321e35645700778c5906d1b8659eb4a32..a87649133fd207ad59f2124c6b0b5aa44916e5a5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
deleted file mode 100644
index 38bf7ad586a063046f260aca9b1c517a343c4c05..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorScaledIdentity.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index e2c5a505a7d2f9abbee5b3bb4f92ee8843198c51..32656467840fbbc0c8708ea68aac5aa75c11a540 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
deleted file mode 100644
index 49ff85728ffab559ec706691356ce071aab89083..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperatorZeros.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
index a1b0e06b4753488bc9fcbe9aeb0d260092745f9c..49d8890c8942bc0021886ee6c9bc4e7625452655 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
deleted file mode 100644
index 38da809b360e5ea69b4324a859ed69da679bc436..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.__metaclass__.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.linalg.LinearOperator.__metaclass__"
-tf_class {
-  is_instance: "<class \'abc.ABCMeta\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "mro"
-  }
-  member_method {
-    name: "register"
-    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
index 6d849dc040f61b498b100820bf7be3d4bc264bb4..c89dc067b331603e227d9d578147e2dd1ee4a900 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index 6ac95d96da8516ca762333f0ab30949d19904cd3..3e1e2e3d54de3e2442299a783f933a60dfd2db6d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -2,59 +2,59 @@ path: "tensorflow.linalg"
 tf_module {
   member {
     name: "LinearOperator"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorBlockDiag"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorCirculant"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorCirculant2D"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorCirculant3D"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorComposition"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorDiag"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorFullMatrix"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorIdentity"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorKronecker"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorLowRankUpdate"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorLowerTriangular"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorScaledIdentity"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "LinearOperatorZeros"
-    mtype: "<class \'abc.ABCMeta\'>"
+    mtype: "<type \'type\'>"
   }
   member_method {
     name: "adjoint"
@@ -118,7 +118,7 @@ tf_module {
   }
   member_method {
     name: "l2_normalize"
-    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\'], "
   }
   member_method {
     name: "logdet"
@@ -132,13 +132,21 @@ tf_module {
     name: "lstsq"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
   }
+  member_method {
+    name: "lu"
+    argspec: "args=[\'input\', \'output_idx_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
   member_method {
     name: "matmul"
     argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'adjoint_a\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "norm"
-    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "qr"
@@ -156,6 +164,10 @@ tf_module {
     name: "solve"
     argspec: "args=[\'matrix\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "sqrtm"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "svd"
     argspec: "args=[\'tensor\', \'full_matrices\', \'compute_uv\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-interpreter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-interpreter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec0d9522bca9e0a272cccb21c3acc814a7462923
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-interpreter.pbtxt
@@ -0,0 +1,49 @@
+path: "tensorflow.lite.Interpreter"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.interpreter.Interpreter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'model_path\', \'model_content\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "allocate_tensors"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_details"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_details"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_tensor"
+    argspec: "args=[\'self\', \'tensor_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_tensor_details"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "invoke"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_all_variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "resize_tensor_input"
+    argspec: "args=[\'self\', \'input_index\', \'tensor_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_tensor"
+    argspec: "args=[\'self\', \'tensor_index\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tensor"
+    argspec: "args=[\'self\', \'tensor_index\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fe179f6c1b64ebc2f7535719bc1598577ee7f03
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
@@ -0,0 +1,13 @@
+path: "tensorflow.lite.OpHint.OpHintArgumentTracker"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.op_hint.OpHintArgumentTracker\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'self\', \'arg\', \'tag\', \'name\', \'aggregate\', \'index_override\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66e692a5a379203cb491980802b7003072bfe76c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt
@@ -0,0 +1,69 @@
+path: "tensorflow.lite.OpHint"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.op_hint.OpHint\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "AGGREGATE_FIRST"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "AGGREGATE_LAST"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "AGGREGATE_STACK"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_AGGREGATE_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_INPUT_INDEX_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_NAME_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_OUTPUT_INDEX_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_SORT_INDEX_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "FUNCTION_UUID_ATTR"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "OpHintArgumentTracker"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFLITE_INPUT_INDICES"
+    mtype: "<type \'str\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'function_name\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_input"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_inputs"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_output"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_outputs"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68c651a3c9969f2f16fca39f4466cebbb44eea28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.OpsSet"
+tf_class {
+  is_instance: "<enum \'OpsSet\'>"
+  member {
+    name: "SELECT_TF_OPS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+  member {
+    name: "TFLITE_BUILTINS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c955b1a04a4b8af701a57ba2468145590c1a4a16
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-t-f-lite-converter.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.lite.TFLiteConverter"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.TFLiteConverter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'graph_def\', \'input_tensors\', \'output_tensors\', \'input_arrays_with_shape\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "convert"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_frozen_graph"
+    argspec: "args=[\'cls\', \'graph_def_file\', \'input_arrays\', \'output_arrays\', \'input_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_keras_model_file"
+    argspec: "args=[\'cls\', \'model_file\', \'input_arrays\', \'input_shapes\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_saved_model"
+    argspec: "args=[\'cls\', \'saved_model_dir\', \'input_arrays\', \'input_shapes\', \'output_arrays\', \'tag_set\', \'signature_key\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_session"
+    argspec: "args=[\'cls\', \'sess\', \'input_tensors\', \'output_tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_arrays"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ef90b8bc4646a2adfcbeca2258ff5aa7cbf8894
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.lite.TocoConverter"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.TocoConverter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_frozen_graph"
+    argspec: "args=[\'cls\', \'graph_def_file\', \'input_arrays\', \'output_arrays\', \'input_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_keras_model_file"
+    argspec: "args=[\'cls\', \'model_file\', \'input_arrays\', \'input_shapes\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_saved_model"
+    argspec: "args=[\'cls\', \'saved_model_dir\', \'input_arrays\', \'input_shapes\', \'output_arrays\', \'tag_set\', \'signature_key\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_session"
+    argspec: "args=[\'cls\', \'sess\', \'input_tensors\', \'output_tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.constants.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.constants.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d5c4893b410120bf9d66e8d5d99ba0df5eaf164
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.constants.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.lite.constants"
+tf_module {
+  member {
+    name: "GRAPHVIZ_DOT"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "TFLITE"
+    mtype: "<type \'int\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..154dd00821794ef4a5118e98d67e32beca38bebf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.lite"
+tf_module {
+  member {
+    name: "Interpreter"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OpHint"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OpsSet"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "TFLiteConverter"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TocoConverter"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "constants"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "toco_convert"
+    argspec: "args=[\'input_data\', \'input_tensors\', \'output_tensors\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt
deleted file mode 100644
index 85bb15455da624962744a0cc856e79e0a6d57d7c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt
+++ /dev/null
@@ -1,83 +0,0 @@
-path: "tensorflow.logging"
-tf_module {
-  member {
-    name: "DEBUG"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "ERROR"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "FATAL"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "INFO"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "WARN"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "TaskLevelStatusMessage"
-    argspec: "args=[\'msg\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "debug"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "error"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "fatal"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_verbosity"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "info"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "log"
-    argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "log_every_n"
-    argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "log_first_n"
-    argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "log_if"
-    argspec: "args=[\'level\', \'msg\', \'condition\'], varargs=args, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_verbosity"
-    argspec: "args=[\'v\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "vlog"
-    argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "warn"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "warning"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
index 258ad5047eb6e82eeb9c0941b0acf0573e5ca61d..6a44e4ce66c9dfcb9912c96d0106e4f4fd9fdcff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
@@ -1,11 +1,7 @@
 path: "tensorflow.losses.Reduction"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.Reduction\'>"
+  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "MEAN"
-    mtype: "<type \'str\'>"
-  }
   member {
     name: "NONE"
     mtype: "<type \'str\'>"
@@ -14,18 +10,10 @@ tf_class {
     name: "SUM"
     mtype: "<type \'str\'>"
   }
-  member {
-    name: "SUM_BY_NONZERO_WEIGHTS"
-    mtype: "<type \'str\'>"
-  }
   member {
     name: "SUM_OVER_BATCH_SIZE"
     mtype: "<type \'str\'>"
   }
-  member {
-    name: "SUM_OVER_NONZERO_WEIGHTS"
-    mtype: "<type \'str\'>"
-  }
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
index c1d190ae116e94ec8f837237e54b6fcff7358254..233b1a0131a4d292574be161de2d547cb0060c23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
@@ -4,22 +4,10 @@ tf_module {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
-  member_method {
-    name: "absolute_difference"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
   member_method {
     name: "add_loss"
     argspec: "args=[\'loss\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'losses\'], "
   }
-  member_method {
-    name: "compute_weighted_loss"
-    argspec: "args=[\'losses\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "cosine_distance"
-    argspec: "args=[\'labels\', \'predictions\', \'axis\', \'weights\', \'scope\', \'loss_collection\', \'reduction\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\', \'None\'], "
-  }
   member_method {
     name: "get_losses"
     argspec: "args=[\'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'None\', \'losses\'], "
@@ -36,36 +24,4 @@ tf_module {
     name: "get_total_loss"
     argspec: "args=[\'add_regularization_losses\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'total_loss\'], "
   }
-  member_method {
-    name: "hinge_loss"
-    argspec: "args=[\'labels\', \'logits\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "huber_loss"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'delta\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "log_loss"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'epsilon\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1e-07\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "mean_pairwise_squared_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\'], "
-  }
-  member_method {
-    name: "mean_squared_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "sigmoid_cross_entropy"
-    argspec: "args=[\'multi_class_labels\', \'logits\', \'weights\', \'label_smoothing\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "softmax_cross_entropy"
-    argspec: "args=[\'onehot_labels\', \'logits\', \'weights\', \'label_smoothing\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "sparse_softmax_cross_entropy"
-    argspec: "args=[\'labels\', \'logits\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.manip.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.manip.pbtxt
deleted file mode 100644
index 9add462396ea526ae94678e969c9acf5bce86df1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.manip.pbtxt
+++ /dev/null
@@ -1,35 +0,0 @@
-path: "tensorflow.manip"
-tf_module {
-  member_method {
-    name: "batch_to_space_nd"
-    argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "gather_nd"
-    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "reshape"
-    argspec: "args=[\'tensor\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "reverse"
-    argspec: "args=[\'tensor\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "roll"
-    argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "scatter_nd"
-    argspec: "args=[\'indices\', \'updates\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "space_to_batch_nd"
-    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "tile"
-    argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index 459b9e3684d65e2497fecdea6b09ef7da06674da..4ac0484050054abee9496bcf09d90ff58bbfb9d7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -30,11 +30,11 @@ tf_module {
   }
   member_method {
     name: "argmax"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+    argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "argmin"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+    argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "asin"
@@ -78,12 +78,16 @@ tf_module {
   }
   member_method {
     name: "bincount"
-    argspec: "args=[\'arr\', \'weights\', \'minlength\', \'maxlength\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int32\'>\"], "
+    argspec: "args=[\'arr\', \'weights\', \'minlength\', \'maxlength\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int32\'>\", \'None\'], "
   }
   member_method {
     name: "ceil"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "confusion_matrix"
+    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int32\'>\", \'None\'], "
+  }
   member_method {
     name: "conj"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -98,7 +102,7 @@ tf_module {
   }
   member_method {
     name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'keepdims\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "cumprod"
@@ -166,15 +170,35 @@ tf_module {
   }
   member_method {
     name: "in_top_k"
-    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'targets\', \'predictions\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "invert_permutation"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_finite"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_inf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_nan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_non_decreasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_strictly_increasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "l2_normalize"
-    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\'], "
   }
   member_method {
     name: "lbeta"
@@ -206,7 +230,7 @@ tf_module {
   }
   member_method {
     name: "log_softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "logical_and"
@@ -266,35 +290,43 @@ tf_module {
   }
   member_method {
     name: "reduce_all"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_any"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_logsumexp"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_max"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_mean"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_min"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_prod"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_std"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_sum"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "reduce_variance"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "rint"
@@ -310,7 +342,7 @@ tf_module {
   }
   member_method {
     name: "scalar_mul"
-    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scalar\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "segment_max"
@@ -350,7 +382,7 @@ tf_module {
   }
   member_method {
     name: "softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "softplus"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
deleted file mode 100644
index e9b996c9f53e9062dcdd39ef22f99eef5175eb35..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
+++ /dev/null
@@ -1,135 +0,0 @@
-path: "tensorflow.metrics"
-tf_module {
-  member_method {
-    name: "accuracy"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "auc"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'curve\', \'name\', \'summation_method\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'ROC\', \'None\', \'trapezoidal\'], "
-  }
-  member_method {
-    name: "average_precision_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "false_negatives"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "false_negatives_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "false_positives"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "false_positives_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'values\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_absolute_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_cosine_distance"
-    argspec: "args=[\'labels\', \'predictions\', \'dim\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_iou"
-    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_per_class_accuracy"
-    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_relative_error"
-    argspec: "args=[\'labels\', \'predictions\', \'normalizer\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_squared_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_tensor"
-    argspec: "args=[\'values\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "percentage_below"
-    argspec: "args=[\'values\', \'threshold\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "precision"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "precision_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "precision_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "precision_at_top_k"
-    argspec: "args=[\'labels\', \'predictions_idx\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recall"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recall_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recall_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recall_at_top_k"
-    argspec: "args=[\'labels\', \'predictions_idx\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "root_mean_squared_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sensitivity_at_specificity"
-    argspec: "args=[\'labels\', \'predictions\', \'specificity\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_average_precision_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_precision_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "specificity_at_sensitivity"
-    argspec: "args=[\'labels\', \'predictions\', \'sensitivity\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "true_negatives"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "true_negatives_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "true_positives"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "true_positives_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
index 9b28ce57464109f570148a2642c94bc6aab9c97a..c75c75f2ef7ca50cce15fe1dffb4d0de3f6815de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
@@ -30,7 +30,7 @@ tf_module {
   }
   member_method {
     name: "batch_norm_with_global_normalization"
-    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'mean\', \'variance\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "batch_normalization"
@@ -41,8 +41,8 @@ tf_module {
     argspec: "args=[\'value\', \'bias\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
-    name: "bidirectional_dynamic_rnn"
-    argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'sequence_length\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
+    name: "collapse_repeated"
+    argspec: "args=[\'labels\', \'seq_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_accidental_hits"
@@ -50,47 +50,47 @@ tf_module {
   }
   member_method {
     name: "conv1d"
-    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'stride\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filters\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
   }
   member_method {
     name: "conv3d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv3d_backprop_filter_v2"
+    name: "conv3d_backprop_filter"
     argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv3d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
   }
   member_method {
     name: "convolution"
-    argspec: "args=[\'input\', \'filter\', \'padding\', \'strides\', \'dilation_rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'VALID\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "crelu"
-    argspec: "args=[\'features\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], "
+    argspec: "args=[\'features\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
   }
   member_method {
     name: "ctc_beam_search_decoder"
-    argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\'], varargs=None, keywords=None, defaults=[\'100\', \'1\', \'True\'], "
+    argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\'], varargs=None, keywords=None, defaults=[\'100\', \'1\'], "
   }
   member_method {
     name: "ctc_greedy_decoder"
@@ -98,39 +98,35 @@ tf_module {
   }
   member_method {
     name: "ctc_loss"
-    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
+    argspec: "args=[\'labels\', \'logits\', \'label_length\', \'logit_length\', \'logits_time_major\', \'unique\', \'blank_index\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "depth_to_space"
-    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+    name: "ctc_unique_labels"
+    argspec: "args=[\'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    name: "depth_to_space"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d_native"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    name: "depthwise_conv2d"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d_native_backprop_filter"
+    name: "depthwise_conv2d_backprop_filter"
     argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d_native_backprop_input"
+    name: "depthwise_conv2d_backprop_input"
     argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "dilation2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "dropout"
-    argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "dynamic_rnn"
-    argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
+    argspec: "args=[\'x\', \'rate\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "elu"
@@ -138,15 +134,15 @@ tf_module {
   }
   member_method {
     name: "embedding_lookup"
-    argspec: "args=[\'params\', \'ids\', \'partition_strategy\', \'name\', \'validate_indices\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'params\', \'ids\', \'partition_strategy\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\'], "
   }
   member_method {
     name: "embedding_lookup_sparse"
-    argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'partition_strategy\', \'name\', \'combiner\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'partition_strategy\', \'combiner\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "erosion2d"
-    argspec: "args=[\'value\', \'kernel\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'value\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "fixed_unigram_candidate_sampler"
@@ -154,19 +150,15 @@ tf_module {
   }
   member_method {
     name: "fractional_avg_pool"
-    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'0\', \'None\'], "
   }
   member_method {
     name: "fractional_max_pool"
-    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "fused_batch_norm"
-    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.001\', \'NHWC\', \'True\', \'None\'], "
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'0\', \'None\'], "
   }
   member_method {
     name: "in_top_k"
-    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'targets\', \'predictions\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "l2_loss"
@@ -174,7 +166,7 @@ tf_module {
   }
   member_method {
     name: "l2_normalize"
-    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\'], "
   }
   member_method {
     name: "leaky_relu"
@@ -194,11 +186,7 @@ tf_module {
   }
   member_method {
     name: "log_softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "log_uniform_candidate_sampler"
-    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "lrn"
@@ -214,15 +202,15 @@ tf_module {
   }
   member_method {
     name: "max_pool_with_argmax"
-    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'Targmax\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "moments"
-    argspec: "args=[\'x\', \'axes\', \'shift\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "nce_loss"
-    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'mod\', \'nce_loss\'], "
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'nce_loss\'], "
   }
   member_method {
     name: "normalize_moments"
@@ -230,27 +218,7 @@ tf_module {
   }
   member_method {
     name: "pool"
-    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'padding\', \'dilation_rate\', \'strides\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "quantized_avg_pool"
-    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "quantized_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'None\'], "
-  }
-  member_method {
-    name: "quantized_max_pool"
-    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "quantized_relu_x"
-    argspec: "args=[\'features\', \'max_value\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
-  }
-  member_method {
-    name: "raw_rnn"
-    argspec: "args=[\'cell\', \'loop_fn\', \'parallel_iterations\', \'swap_memory\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'VALID\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "relu"
@@ -260,17 +228,13 @@ tf_module {
     name: "relu6"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "relu_layer"
-    argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "safe_embedding_lookup_sparse"
-    argspec: "args=[\'embedding_weights\', \'sparse_ids\', \'sparse_weights\', \'combiner\', \'default_id\', \'name\', \'partition_strategy\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'mean\', \'None\', \'None\', \'div\', \'None\'], "
+    argspec: "args=[\'embedding_weights\', \'sparse_ids\', \'sparse_weights\', \'combiner\', \'default_id\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'mean\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sampled_softmax_loss"
-    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\', \'seed\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\', \'None\'], "
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'None\', \'sampled_softmax_loss\'], "
   }
   member_method {
     name: "selu"
@@ -278,7 +242,7 @@ tf_module {
   }
   member_method {
     name: "separable_conv2d"
-    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sigmoid"
@@ -290,15 +254,11 @@ tf_module {
   }
   member_method {
     name: "softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "softmax_cross_entropy_with_logits"
-    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
-  }
-  member_method {
-    name: "softmax_cross_entropy_with_logits_v2"
-    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
+    argspec: "args=[\'labels\', \'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
   }
   member_method {
     name: "softplus"
@@ -310,31 +270,23 @@ tf_module {
   }
   member_method {
     name: "space_to_batch"
-    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "space_to_depth"
-    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "sparse_softmax_cross_entropy_with_logits"
     argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "static_bidirectional_rnn"
-    argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "static_rnn"
-    argspec: "args=[\'cell\', \'inputs\', \'initial_state\', \'dtype\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "static_state_saving_rnn"
     argspec: "args=[\'cell\', \'inputs\', \'state_saver\', \'state_name\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sufficient_statistics"
-    argspec: "args=[\'x\', \'axes\', \'shift\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "tanh"
@@ -344,26 +296,18 @@ tf_module {
     name: "top_k"
     argspec: "args=[\'input\', \'k\', \'sorted\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'True\', \'None\'], "
   }
-  member_method {
-    name: "uniform_candidate_sampler"
-    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
   member_method {
     name: "weighted_cross_entropy_with_logits"
     argspec: "args=[\'targets\', \'logits\', \'pos_weight\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "weighted_moments"
-    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "with_space_to_batch"
     argspec: "args=[\'input\', \'dilation_rate\', \'padding\', \'op\', \'filter_shape\', \'spatial_dims\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "xw_plus_b"
-    argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "zero_fraction"
     argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
deleted file mode 100644
index 88b8f37c4ff0cfaf562293c845e505f06119e227..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ /dev/null
@@ -1,202 +0,0 @@
-path: "tensorflow.nn.rnn_cell.BasicLSTMCell"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 381c4975d7d778599ce34a9023d0e46b20753cba..9e52a4252619ffc19b287fc1818fa6f772847335 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 912365a28b1277962f648b2b0655d280bca1427c..9836433d08cba809107f9bb5dbccf2e971865b8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -110,6 +110,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
deleted file mode 100644
index a4bb3219c792708cd02a8345541d8685485c8d05..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ /dev/null
@@ -1,202 +0,0 @@
-path: "tensorflow.nn.rnn_cell.GRUCell"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.GRUCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
deleted file mode 100644
index 715bfd5fc7c18993d4997caeefe3188ba88f741c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ /dev/null
@@ -1,202 +0,0 @@
-path: "tensorflow.nn.rnn_cell.LSTMCell"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LSTMCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
deleted file mode 100644
index b66c0f89cc904c1318787651a3e8e629319c14fb..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ /dev/null
@@ -1,201 +0,0 @@
-path: "tensorflow.nn.rnn_cell.MultiRNNCell"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.MultiRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "activity_regularizer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "inbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "input_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "losses"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "non_trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "outbound_nodes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_mask"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "trainable_weights"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "updates"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "variables"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "weights"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'cells\', \'state_is_tuple\'], varargs=None, keywords=None, defaults=[\'True\'], "
-  }
-  member_method {
-    name: "add_loss"
-    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_update"
-    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_variable"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
-  }
-  member_method {
-    name: "apply"
-    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "compute_output_shape"
-    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "count_params"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_input_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_input_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_mask_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_output_shape_at"
-    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_updates_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_weights"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_weights"
-    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index faeb4f3513362919fca8f0c2ef7c491d7938cb92..d3b68e4f2976912ed65ba7916284c951fda03b05 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -105,6 +105,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index caa2e600800178e4b2d36ae263da23d0b4608dd2..1f7840ab919baeeb0077904592ba8dcc1d4c91fb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
index 24767e250f96da37ff078e9bf1b9b94fe0b1ed66..b1f687f52964e20a6dfa6f81f68e61d2a67513c9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
@@ -1,9 +1,5 @@
 path: "tensorflow.nn.rnn_cell"
 tf_module {
-  member {
-    name: "BasicLSTMCell"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "DeviceWrapper"
     mtype: "<type \'type\'>"
@@ -12,22 +8,10 @@ tf_module {
     name: "DropoutWrapper"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "GRUCell"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "LSTMCell"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "LSTMStateTuple"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "MultiRNNCell"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "RNNCell"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.orthogonal_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.orthogonal_initializer.pbtxt
deleted file mode 100644
index 13ec7454f41eac2b23e07ba62068bb48dddac90b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.orthogonal_initializer.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.orthogonal_initializer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Orthogonal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 978afcf98524c5cd3fc0e5f56c50bcbeaed87c2e..4432cae53b64b66e5a5c906f87af94f61bcf36bd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -4,42 +4,10 @@ tf_module {
     name: "AggregationMethod"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "AttrValue"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "COMPILER_VERSION"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "CXX11_ABI_FLAG"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "ConditionalAccumulator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ConditionalAccumulatorBase"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ConfigProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "DType"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "DeviceSpec"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Dimension"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Event"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -48,34 +16,6 @@ tf_module {
     name: "FIFOQueue"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "FixedLenFeature"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FixedLenSequenceFeature"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GIT_VERSION"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "GPUOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "GRAPH_DEF_VERSION"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GRAPH_DEF_VERSION_MIN_CONSUMER"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GRAPH_DEF_VERSION_MIN_PRODUCER"
-    mtype: "<type \'int\'>"
-  }
   member {
     name: "GradientTape"
     mtype: "<type \'type\'>"
@@ -84,118 +24,22 @@ tf_module {
     name: "Graph"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "GraphDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "GraphKeys"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GraphOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "HistogramProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "IndexedSlices"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "InteractiveSession"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "LogMessage"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "MONOLITHIC_BUILD"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "MetaGraphDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "NameAttrList"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "NodeDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "OpError"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Operation"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "OptimizerOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "PaddingFIFOQueue"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "PriorityQueue"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "QUANTIZED_DTYPES"
-    mtype: "<type \'frozenset\'>"
-  }
-  member {
-    name: "QueueBase"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "RandomShuffleQueue"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "RegisterGradient"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "RunMetadata"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "RunOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "Session"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SessionLog"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "SparseConditionalAccumulator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SparseFeature"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SparseTensor"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SparseTensorValue"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Summary"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -213,11 +57,11 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "TensorInfo"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+    name: "TensorShape"
+    mtype: "<type \'type\'>"
   }
   member {
-    name: "TensorShape"
+    name: "TensorSpec"
     mtype: "<type \'type\'>"
   }
   member {
@@ -225,12 +69,8 @@ tf_module {
     mtype: "<class \'enum.EnumMeta\'>"
   }
   member {
-    name: "VERSION"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "VarLenFeature"
-    mtype: "<type \'type\'>"
+    name: "Variable"
+    mtype: "<class \'tensorflow.python.ops.variables.VariableMetaclass\'>"
   }
   member {
     name: "VariableAggregation"
@@ -240,10 +80,6 @@ tf_module {
     name: "VariableSynchronization"
     mtype: "<class \'enum.EnumMeta\'>"
   }
-  member {
-    name: "app"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "bfloat16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -272,10 +108,6 @@ tf_module {
     name: "constant_initializer"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "contrib"
-    mtype: "<class \'tensorflow.python.util.lazy_loader.LazyLoader\'>"
-  }
   member {
     name: "data"
     mtype: "<type \'module\'>"
@@ -285,7 +117,7 @@ tf_module {
     mtype: "<type \'module\'>"
   }
   member {
-    name: "distributions"
+    name: "distribute"
     mtype: "<type \'module\'>"
   }
   member {
@@ -305,11 +137,11 @@ tf_module {
     mtype: "<type \'module\'>"
   }
   member {
-    name: "feature_column"
+    name: "experimental"
     mtype: "<type \'module\'>"
   }
   member {
-    name: "flags"
+    name: "feature_column"
     mtype: "<type \'module\'>"
   }
   member {
@@ -324,14 +156,6 @@ tf_module {
     name: "float64"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
-  member {
-    name: "gfile"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "glorot_normal_initializer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "glorot_uniform_initializer"
     mtype: "<type \'type\'>"
@@ -376,34 +200,22 @@ tf_module {
     name: "keras"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "layers"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "linalg"
     mtype: "<type \'module\'>"
   }
   member {
-    name: "logging"
+    name: "lite"
     mtype: "<type \'module\'>"
   }
   member {
     name: "losses"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "manip"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "math"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "metrics"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "name_scope"
     mtype: "<type \'type\'>"
@@ -420,22 +232,6 @@ tf_module {
     name: "ones_initializer"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "orthogonal_initializer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "profiler"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "python_io"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "pywrap_tensorflow"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "qint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -476,10 +272,6 @@ tf_module {
     name: "resource"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
-  member {
-    name: "resource_loader"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "saved_model"
     mtype: "<type \'module\'>"
@@ -489,11 +281,11 @@ tf_module {
     mtype: "<type \'module\'>"
   }
   member {
-    name: "sparse"
+    name: "signal"
     mtype: "<type \'module\'>"
   }
   member {
-    name: "spectral"
+    name: "sparse"
     mtype: "<type \'module\'>"
   }
   member {
@@ -541,21 +333,13 @@ tf_module {
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
   member {
-    name: "uniform_unit_scaling_initializer"
-    mtype: "<type \'type\'>"
+    name: "variant"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
   member {
-    name: "user_ops"
+    name: "version"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "variance_scaling_initializer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "variant"
-    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
-  }
   member {
     name: "zeros_initializer"
     mtype: "<type \'type\'>"
@@ -564,22 +348,10 @@ tf_module {
     name: "Assert"
     argspec: "args=[\'condition\', \'data\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "NoGradient"
-    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "NotDifferentiable"
-    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "abs"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "accumulate_n"
-    argspec: "args=[\'inputs\', \'shape\', \'tensor_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "acos"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -592,41 +364,21 @@ tf_module {
     name: "add"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "add_check_numerics_ops"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "add_n"
     argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "add_to_collection"
-    argspec: "args=[\'name\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "add_to_collections"
-    argspec: "args=[\'names\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "angle"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "arg_max"
-    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
-  }
-  member_method {
-    name: "arg_min"
-    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
-  }
   member_method {
     name: "argmax"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+    argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "argmin"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
+    argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "argsort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'stable\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'False\', \'None\'], "
   }
   member_method {
     name: "as_dtype"
@@ -646,79 +398,19 @@ tf_module {
   }
   member_method {
     name: "assert_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_greater"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_greater_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_integer"
-    argspec: "args=[\'x\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_less"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_less_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_near"
-    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_negative"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_non_negative"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_non_positive"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_none_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_positive"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_proper_iterable"
-    argspec: "args=[\'values\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_rank"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_rank_at_least"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_rank_in"
-    argspec: "args=[\'x\', \'ranks\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_same_float_dtype"
-    argspec: "args=[\'tensors\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "assert_type"
-    argspec: "args=[\'tensor\', \'tf_type\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rank\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "atan"
@@ -736,33 +428,17 @@ tf_module {
     name: "batch_gather"
     argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "batch_scatter_update"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
   member_method {
     name: "batch_to_space"
-    argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "batch_to_space_nd"
     argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "betainc"
-    argspec: "args=[\'a\', \'b\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "bincount"
-    argspec: "args=[\'arr\', \'weights\', \'minlength\', \'maxlength\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int32\'>\"], "
-  }
   member_method {
     name: "bitcast"
     argspec: "args=[\'input\', \'type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "boolean_mask"
-    argspec: "args=[\'tensor\', \'mask\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'boolean_mask\', \'None\'], "
+    argspec: "args=[\'tensor\', \'mask\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'boolean_mask\'], "
   }
   member_method {
     name: "broadcast_dynamic_shape"
@@ -784,26 +460,6 @@ tf_module {
     name: "cast"
     argspec: "args=[\'x\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "ceil"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "check_numerics"
-    argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "cholesky"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "cholesky_solve"
-    argspec: "args=[\'chol\', \'rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "clip_by_average_norm"
-    argspec: "args=[\'t\', \'clip_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "clip_by_global_norm"
     argspec: "args=[\'t_list\', \'clip_norm\', \'use_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -816,10 +472,6 @@ tf_module {
     name: "clip_by_value"
     argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "colocate_with"
-    argspec: "args=[\'op\', \'ignore_existing\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
   member_method {
     name: "complex"
     argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -830,23 +482,11 @@ tf_module {
   }
   member_method {
     name: "cond"
-    argspec: "args=[\'pred\', \'true_fn\', \'false_fn\', \'strict\', \'name\', \'fn1\', \'fn2\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "confusion_matrix"
-    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'dtype\', \'name\', \'weights\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\", \'None\', \'None\'], "
-  }
-  member_method {
-    name: "conj"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'pred\', \'true_fn\', \'false_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "constant"
-    argspec: "args=[\'value\', \'dtype\', \'shape\', \'name\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Const\', \'False\'], "
-  }
-  member_method {
-    name: "container"
-    argspec: "args=[\'container_name\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'value\', \'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Const\'], "
   }
   member_method {
     name: "control_dependencies"
@@ -854,15 +494,7 @@ tf_module {
   }
   member_method {
     name: "convert_to_tensor"
-    argspec: "args=[\'value\', \'dtype\', \'name\', \'preferred_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "convert_to_tensor_or_indexed_slices"
-    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "convert_to_tensor_or_sparse_tensor"
-    argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'value\', \'dtype\', \'dtype_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "cos"
@@ -873,84 +505,16 @@ tf_module {
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
   }
   member_method {
-    name: "create_partitioned_variables"
-    argspec: "args=[\'shape\', \'slicing\', \'initializer\', \'dtype\', \'trainable\', \'collections\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'True\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "cross"
-    argspec: "args=[\'a\', \'b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "cumprod"
-    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "cumsum"
-    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "custom_gradient"
-    argspec: "args=[\'f\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "decode_base64"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "decode_compressed"
-    argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
-  }
-  member_method {
-    name: "decode_csv"
-    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
-  }
-  member_method {
-    name: "decode_json_example"
-    argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "decode_raw"
-    argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-  member_method {
-    name: "delete_session_tensor"
-    argspec: "args=[\'handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "depth_to_space"
-    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
-  }
-  member_method {
-    name: "dequantize"
-    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'None\'], "
-  }
-  member_method {
-    name: "deserialize_many_sparse"
-    argspec: "args=[\'serialized_sparse\', \'dtype\', \'rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    name: "custom_gradient"
+    argspec: "args=[\'f\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "device"
-    argspec: "args=[\'device_name_or_function\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "diag"
-    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "diag_part"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "digamma"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "div"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'device_name\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "div_no_nan"
@@ -976,14 +540,6 @@ tf_module {
     name: "einsum"
     argspec: "args=[\'equation\'], varargs=inputs, keywords=kwargs, defaults=None"
   }
-  member_method {
-    name: "enable_eager_execution"
-    argspec: "args=[\'config\', \'device_policy\', \'execution_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "encode_base64"
-    argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
   member_method {
     name: "ensure_shape"
     argspec: "args=[\'x\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -992,14 +548,6 @@ tf_module {
     name: "equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "erf"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "erfc"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "executing_eagerly"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1010,15 +558,7 @@ tf_module {
   }
   member_method {
     name: "expand_dims"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "expm1"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "extract_image_patches"
-    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "extract_volume_patches"
@@ -1028,50 +568,10 @@ tf_module {
     name: "eye"
     argspec: "args=[\'num_rows\', \'num_columns\', \'batch_shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
-  member_method {
-    name: "fake_quant_with_min_max_args"
-    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'-6\', \'6\', \'8\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "fake_quant_with_min_max_args_gradient"
-    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'-6\', \'6\', \'8\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "fake_quant_with_min_max_vars"
-    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "fake_quant_with_min_max_vars_gradient"
-    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "fake_quant_with_min_max_vars_per_channel"
-    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "fake_quant_with_min_max_vars_per_channel_gradient"
-    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\', \'name\'], varargs=None, keywords=None, defaults=[\'8\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "fft"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "fft2d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "fft3d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "fill"
     argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "fixed_size_partitioner"
-    argspec: "args=[\'num_shards\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
-  }
   member_method {
     name: "floor"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1080,10 +580,6 @@ tf_module {
     name: "floor_div"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "floordiv"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "floormod"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1096,49 +592,25 @@ tf_module {
     name: "foldr"
     argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "function"
+    argspec: "args=[\'func\', \'input_signature\', \'autograph\', \'experimental_autograph_options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\'], "
+  }
   member_method {
     name: "gather"
-    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
     name: "gather_nd"
     argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "get_collection"
-    argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_collection_ref"
-    argspec: "args=[\'key\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_default_graph"
+    name: "get_logger"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_default_session"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_seed"
-    argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_session_handle"
-    argspec: "args=[\'data\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_session_tensor"
-    argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "global_norm"
-    argspec: "args=[\'t_list\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "gradients"
-    argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\', \'UnconnectedGradients.NONE\'], "
+    argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'None\', \'None\', \'UnconnectedGradients.NONE\'], "
   }
   member_method {
     name: "greater"
@@ -1158,7 +630,7 @@ tf_module {
   }
   member_method {
     name: "hessians"
-    argspec: "args=[\'ys\', \'xs\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'hessians\', \'False\', \'False\', \'None\'], "
+    argspec: "args=[\'ys\', \'xs\', \'gate_gradients\', \'aggregation_method\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'hessians\'], "
   }
   member_method {
     name: "histogram_fixed_width"
@@ -1176,30 +648,6 @@ tf_module {
     name: "identity_n"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "ifft"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "ifft2d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "ifft3d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "igamma"
-    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "igammac"
-    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "imag"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "import_graph_def"
     argspec: "args=[\'graph_def\', \'input_map\', \'return_elements\', \'name\', \'op_dict\', \'producer_op_list\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
@@ -1208,42 +656,6 @@ tf_module {
     name: "init_scope"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "initialize_all_tables"
-    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
-  }
-  member_method {
-    name: "invert_permutation"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_finite"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_inf"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_nan"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_non_decreasing"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_numeric_tensor"
-    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_strictly_increasing"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "lbeta"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1252,22 +664,10 @@ tf_module {
     name: "less_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "lgamma"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "lin_space"
-    argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "linspace"
     argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "load_file_system_library"
-    argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "load_library"
     argspec: "args=[\'library_location\'], varargs=None, keywords=None, defaults=None"
@@ -1276,18 +676,6 @@ tf_module {
     name: "load_op_library"
     argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "log"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "log1p"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "log_sigmoid"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "logical_and"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1300,74 +688,22 @@ tf_module {
     name: "logical_or"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "logical_xor"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'LogicalXor\'], "
-  }
   member_method {
     name: "make_ndarray"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "make_template"
-    argspec: "args=[\'name_\', \'func_\', \'create_scope_now_\', \'unique_name_\', \'custom_getter_\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "make_tensor_proto"
-    argspec: "args=[\'values\', \'dtype\', \'shape\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
-  }
   member_method {
     name: "map_fn"
     argspec: "args=[\'fn\', \'elems\', \'dtype\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'infer_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'False\', \'True\', \'None\'], "
   }
-  member_method {
-    name: "matching_files"
-    argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "matmul"
     argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
   }
   member_method {
-    name: "matrix_band_part"
-    argspec: "args=[\'input\', \'num_lower\', \'num_upper\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "matrix_determinant"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "matrix_diag"
-    argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "matrix_diag_part"
+    name: "matrix_square_root"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "matrix_inverse"
-    argspec: "args=[\'input\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "matrix_set_diag"
-    argspec: "args=[\'input\', \'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "matrix_solve"
-    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "matrix_solve_ls"
-    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "matrix_transpose"
-    argspec: "args=[\'a\', \'name\', \'conjugate\'], varargs=None, keywords=None, defaults=[\'matrix_transpose\', \'False\'], "
-  }
-  member_method {
-    name: "matrix_triangular_solve"
-    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
-  }
   member_method {
     name: "maximum"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1376,10 +712,6 @@ tf_module {
     name: "meshgrid"
     argspec: "args=[], varargs=args, keywords=kwargs, defaults=None"
   }
-  member_method {
-    name: "min_max_variable_partitioner"
-    argspec: "args=[\'max_partitions\', \'axis\', \'min_slice_size\', \'bytes_per_string_element\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'262144\', \'16\'], "
-  }
   member_method {
     name: "minimum"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1388,10 +720,6 @@ tf_module {
     name: "mod"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1400,6 +728,10 @@ tf_module {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "no_gradient"
+    argspec: "args=[\'op_type\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "no_op"
     argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1410,7 +742,7 @@ tf_module {
   }
   member_method {
     name: "norm"
-    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "not_equal"
@@ -1426,48 +758,16 @@ tf_module {
   }
   member_method {
     name: "ones_like"
-    argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
-  }
-  member_method {
-    name: "op_scope"
-    argspec: "args=[\'values\', \'name\', \'default_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "pad"
-    argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'name\', \'constant_values\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'None\', \'0\'], "
+    argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'constant_values\', \'name\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'0\', \'None\'], "
   }
   member_method {
     name: "parallel_stack"
     argspec: "args=[\'values\', \'name\'], varargs=None, keywords=None, defaults=[\'parallel_stack\'], "
   }
-  member_method {
-    name: "parse_example"
-    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "parse_single_example"
-    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "parse_single_sequence_example"
-    argspec: "args=[\'serialized\', \'context_features\', \'sequence_features\', \'example_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "parse_tensor"
-    argspec: "args=[\'serialized\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "placeholder"
-    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "placeholder_with_default"
-    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "polygamma"
-    argspec: "args=[\'a\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "pow"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1477,48 +777,8 @@ tf_module {
     argspec: "args=[], varargs=inputs, keywords=kwargs, defaults=None"
   }
   member_method {
-    name: "py_func"
-    argspec: "args=[\'func\', \'inp\', \'Tout\', \'stateful\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
-  member_method {
-    name: "qr"
-    argspec: "args=[\'input\', \'full_matrices\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "quantize"
-    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'HALF_AWAY_FROM_ZERO\', \'None\'], "
-  }
-  member_method {
-    name: "quantize_v2"
-    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'name\', \'round_mode\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'None\', \'HALF_AWAY_FROM_ZERO\'], "
-  }
-  member_method {
-    name: "quantized_concat"
-    argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "random_crop"
-    argspec: "args=[\'value\', \'size\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "random_gamma"
-    argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
-  }
-  member_method {
-    name: "random_normal"
-    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
-  }
-  member_method {
-    name: "random_poisson"
-    argspec: "args=[\'lam\', \'shape\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
-  }
-  member_method {
-    name: "random_shuffle"
-    argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "random_uniform"
-    argspec: "args=[\'shape\', \'minval\', \'maxval\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+    name: "py_function"
+    argspec: "args=[\'func\', \'inp\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "range"
@@ -1528,61 +788,41 @@ tf_module {
     name: "rank"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "read_file"
-    argspec: "args=[\'filename\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "real"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "realdiv"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "reciprocal"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "reduce_all"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_any"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "reduce_join"
-    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_logsumexp"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_max"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_mean"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_min"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_prod"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_sum"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "regex_replace"
-    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "register_tensor_conversion_function"
@@ -1592,10 +832,6 @@ tf_module {
     name: "required_space_to_batch_paddings"
     argspec: "args=[\'input_shape\', \'block_shape\', \'base_paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "reset_default_graph"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reshape"
     argspec: "args=[\'tensor\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1606,15 +842,7 @@ tf_module {
   }
   member_method {
     name: "reverse_sequence"
-    argspec: "args=[\'input\', \'seq_lengths\', \'seq_axis\', \'batch_axis\', \'name\', \'seq_dim\', \'batch_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "reverse_v2"
-    argspec: "args=[\'tensor\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "rint"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'seq_lengths\', \'seq_axis\', \'batch_axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "roll"
@@ -1624,17 +852,13 @@ tf_module {
     name: "round"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "rsqrt"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "saturate_cast"
     argspec: "args=[\'value\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "scalar_mul"
-    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scalar\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "scan"
@@ -1664,61 +888,13 @@ tf_module {
     name: "searchsorted"
     argspec: "args=[\'sorted_sequence\', \'values\', \'side\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'left\', \"<dtype: \'int32\'>\", \'None\'], "
   }
-  member_method {
-    name: "segment_max"
-    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "segment_mean"
-    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "segment_min"
-    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "segment_prod"
-    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "segment_sum"
-    argspec: "args=[\'data\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "self_adjoint_eig"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "self_adjoint_eigvals"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "sequence_mask"
     argspec: "args=[\'lengths\', \'maxlen\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'bool\'>\", \'None\'], "
   }
-  member_method {
-    name: "serialize_many_sparse"
-    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
-  }
-  member_method {
-    name: "serialize_sparse"
-    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
-  }
-  member_method {
-    name: "serialize_tensor"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_random_seed"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "setdiff1d"
-    argspec: "args=[\'x\', \'y\', \'index_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
-  }
   member_method {
     name: "shape"
-    argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
   member_method {
     name: "shape_n"
@@ -1742,135 +918,23 @@ tf_module {
   }
   member_method {
     name: "size"
-    argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
   member_method {
     name: "slice"
     argspec: "args=[\'input_\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "space_to_batch"
-    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "sort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'None\'], "
   }
   member_method {
-    name: "space_to_batch_nd"
+    name: "space_to_batch"
     argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "space_to_depth"
-    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
-  }
-  member_method {
-    name: "sparse_add"
-    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
-  }
-  member_method {
-    name: "sparse_concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_fill_empty_rows"
-    argspec: "args=[\'sp_input\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_mask"
-    argspec: "args=[\'a\', \'mask_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_matmul"
-    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_maximum"
-    argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_merge"
-    argspec: "args=[\'sp_ids\', \'sp_values\', \'vocab_size\', \'name\', \'already_sorted\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
-  }
-  member_method {
-    name: "sparse_minimum"
-    argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_placeholder"
-    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_sum"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_sum_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reorder"
-    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_reset_shape"
-    argspec: "args=[\'sp_input\', \'new_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_reshape"
-    argspec: "args=[\'sp_input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_retain"
-    argspec: "args=[\'sp_input\', \'to_retain\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "sparse_segment_mean"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_segment_sqrt_n"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_segment_sum"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_slice"
-    argspec: "args=[\'sp_input\', \'start\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_softmax"
-    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_split"
-    argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_tensor_dense_matmul"
-    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_tensor_to_dense"
-    argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_to_dense"
-    argspec: "args=[\'sparse_indices\', \'output_shape\', \'sparse_values\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_to_indicator"
-    argspec: "args=[\'sp_input\', \'vocab_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_transpose"
-    argspec: "args=[\'sp_input\', \'perm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    name: "space_to_batch_nd"
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "split"
@@ -1884,13 +948,9 @@ tf_module {
     name: "square"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "squared_difference"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "squeeze"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'squeeze_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "stack"
@@ -1904,34 +964,10 @@ tf_module {
     name: "strided_slice"
     argspec: "args=[\'input_\', \'begin\', \'end\', \'strides\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\', \'var\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'0\', \'0\', \'0\', \'0\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "string_join"
-    argspec: "args=[\'inputs\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
-  }
   member_method {
     name: "string_split"
     argspec: "args=[\'source\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=[\' \', \'True\'], "
   }
-  member_method {
-    name: "string_strip"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "string_to_hash_bucket"
-    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "string_to_hash_bucket_fast"
-    argspec: "args=[\'input\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "string_to_hash_bucket_strong"
-    argspec: "args=[\'input\', \'num_buckets\', \'key\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "string_to_number"
-    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
-  }
   member_method {
     name: "substr"
     argspec: "args=[\'input\', \'pos\', \'len\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
@@ -1940,14 +976,6 @@ tf_module {
     name: "subtract"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "svd"
-    argspec: "args=[\'tensor\', \'full_matrices\', \'compute_uv\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'None\'], "
-  }
-  member_method {
-    name: "tables_initializer"
-    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'init_all_tables\'], "
-  }
   member_method {
     name: "tan"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1957,61 +985,37 @@ tf_module {
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "tensordot"
-    argspec: "args=[\'a\', \'b\', \'axes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "tensor_scatter_add"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "tile"
-    argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "tensor_scatter_sub"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "timestamp"
-    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "tensor_scatter_update"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "to_bfloat16"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToBFloat16\'], "
-  }
-  member_method {
-    name: "to_complex128"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToComplex128\'], "
-  }
-  member_method {
-    name: "to_complex64"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToComplex64\'], "
-  }
-  member_method {
-    name: "to_double"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToDouble\'], "
-  }
-  member_method {
-    name: "to_float"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToFloat\'], "
-  }
-  member_method {
-    name: "to_int32"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToInt32\'], "
+    name: "tensordot"
+    argspec: "args=[\'a\', \'b\', \'axes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "to_int64"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'ToInt64\'], "
+    name: "tile"
+    argspec: "args=[\'input\', \'multiples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "trace"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "timestamp"
+    argspec: "args=[\'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "transpose"
-    argspec: "args=[\'a\', \'perm\', \'name\', \'conjugate\'], varargs=None, keywords=None, defaults=[\'None\', \'transpose\', \'False\'], "
+    argspec: "args=[\'a\', \'perm\', \'conjugate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'transpose\'], "
   }
   member_method {
     name: "truediv"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "truncated_normal"
-    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
-  }
   member_method {
     name: "truncatediv"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -2022,7 +1026,7 @@ tf_module {
   }
   member_method {
     name: "tuple"
-    argspec: "args=[\'tensors\', \'name\', \'control_inputs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'tensors\', \'control_inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "unique"
@@ -2036,41 +1040,13 @@ tf_module {
     name: "unravel_index"
     argspec: "args=[\'indices\', \'dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "unsorted_segment_max"
-    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "unsorted_segment_mean"
-    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "unsorted_segment_min"
-    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "unsorted_segment_prod"
-    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "unsorted_segment_sqrt_n"
-    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "unsorted_segment_sum"
-    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "unstack"
     argspec: "args=[\'value\', \'num\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'unstack\'], "
   }
   member_method {
-    name: "variable_axis_size_partitioner"
-    argspec: "args=[\'max_shard_bytes\', \'axis\', \'bytes_per_string_element\', \'max_shards\'], varargs=None, keywords=None, defaults=[\'0\', \'16\', \'None\'], "
-  }
-  member_method {
-    name: "verify_tensor_all_finite"
-    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "variable_creator_scope"
+    argspec: "args=[\'variable_creator\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "where"
@@ -2078,11 +1054,7 @@ tf_module {
   }
   member_method {
     name: "while_loop"
-    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\', \'maximum_iterations\', \'return_same_structure\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\', \'False\'], "
-  }
-  member_method {
-    name: "write_file"
-    argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'maximum_iterations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "zeros"
@@ -2090,10 +1062,6 @@ tf_module {
   }
   member_method {
     name: "zeros_like"
-    argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
-  }
-  member_method {
-    name: "zeta"
-    argspec: "args=[\'x\', \'q\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt
deleted file mode 100644
index e09c44cc9ce71305692740ba2d63b0940b2e0573..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.profiler.AdviceProto.Checker"
-tf_proto {
-  descriptor {
-    name: "Checker"
-    field {
-      name: "reports"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
deleted file mode 100644
index 87462435496fd2eedeb0bc8d92e8a833671b6531..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.profiler.AdviceProto.CheckersEntry"
-tf_proto {
-  descriptor {
-    name: "CheckersEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.AdviceProto.Checker"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt
deleted file mode 100644
index a8a8858ccd5af3fb3dac612eef44e5cb450df914..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt
+++ /dev/null
@@ -1,41 +0,0 @@
-path: "tensorflow.profiler.AdviceProto"
-tf_proto {
-  descriptor {
-    name: "AdviceProto"
-    field {
-      name: "checkers"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.AdviceProto.CheckersEntry"
-    }
-    nested_type {
-      name: "CheckersEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.tfprof.AdviceProto.Checker"
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "Checker"
-      field {
-        name: "reports"
-        number: 2
-        label: LABEL_REPEATED
-        type: TYPE_STRING
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
deleted file mode 100644
index afec73f537aadd5d1a274db8d57e37b8c6fa3e74..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.profiler.GraphNodeProto.InputShapesEntry"
-tf_proto {
-  descriptor {
-    name: "InputShapesEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt
deleted file mode 100644
index 3c83177005323a277f929d8c769cd7b1eeff4d51..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt
+++ /dev/null
@@ -1,191 +0,0 @@
-path: "tensorflow.profiler.GraphNodeProto"
-tf_proto {
-  descriptor {
-    name: "GraphNodeProto"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tensor_value"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.TFProfTensorProto"
-    }
-    field {
-      name: "run_count"
-      number: 21
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "exec_micros"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "accelerator_exec_micros"
-      number: 17
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "cpu_exec_micros"
-      number: 18
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "requested_bytes"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "peak_bytes"
-      number: 24
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "residual_bytes"
-      number: 25
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "output_bytes"
-      number: 26
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "parameters"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "float_ops"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "devices"
-      number: 10
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "total_definition_count"
-      number: 23
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_run_count"
-      number: 22
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_exec_micros"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_accelerator_exec_micros"
-      number: 19
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_cpu_exec_micros"
-      number: 20
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_requested_bytes"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_peak_bytes"
-      number: 27
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_residual_bytes"
-      number: 28
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_output_bytes"
-      number: 29
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_parameters"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_float_ops"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "shapes"
-      number: 11
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    field {
-      name: "input_shapes"
-      number: 16
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.GraphNodeProto.InputShapesEntry"
-    }
-    field {
-      name: "children"
-      number: 12
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.GraphNodeProto"
-    }
-    nested_type {
-      name: "InputShapesEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorShapeProto"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt
deleted file mode 100644
index 2b08a05437f90b91160fc08e670b2466ae163149..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt
+++ /dev/null
@@ -1,134 +0,0 @@
-path: "tensorflow.profiler.MultiGraphNodeProto"
-tf_proto {
-  descriptor {
-    name: "MultiGraphNodeProto"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "exec_micros"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "accelerator_exec_micros"
-      number: 12
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "cpu_exec_micros"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "requested_bytes"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "peak_bytes"
-      number: 16
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "residual_bytes"
-      number: 17
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "output_bytes"
-      number: 18
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "parameters"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "float_ops"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_exec_micros"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_accelerator_exec_micros"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_cpu_exec_micros"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_requested_bytes"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_peak_bytes"
-      number: 19
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_residual_bytes"
-      number: 20
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_output_bytes"
-      number: 21
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_parameters"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_float_ops"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "graph_nodes"
-      number: 10
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.GraphNodeProto"
-    }
-    field {
-      name: "children"
-      number: 11
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.MultiGraphNodeProto"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
deleted file mode 100644
index b3adc50c7e14152a81a148df9deccc5272189aad..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.profiler.OpLogProto.IdToStringEntry"
-tf_proto {
-  descriptor {
-    name: "IdToStringEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt
deleted file mode 100644
index 7510c566ba574e9370f5e54c29023ef4fb5ee804..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.profiler.OpLogProto"
-tf_proto {
-  descriptor {
-    name: "OpLogProto"
-    field {
-      name: "log_entries"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.OpLogEntry"
-    }
-    field {
-      name: "id_to_string"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.OpLogProto.IdToStringEntry"
-    }
-    nested_type {
-      name: "IdToStringEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt
deleted file mode 100644
index 19ff38a3900c2d358faaa40e7316cc3a9da73040..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt
+++ /dev/null
@@ -1,93 +0,0 @@
-path: "tensorflow.profiler.ProfileOptionBuilder"
-tf_class {
-  is_instance: "<class \'tensorflow.python.profiler.option_builder.ProfileOptionBuilder\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "account_displayed_op_only"
-    argspec: "args=[\'self\', \'is_true\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "float_operation"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "order_by"
-    argspec: "args=[\'self\', \'attribute\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "select"
-    argspec: "args=[\'self\', \'attributes\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "time_and_memory"
-    argspec: "args=[\'min_micros\', \'min_bytes\', \'min_accelerator_micros\', \'min_cpu_micros\', \'min_peak_bytes\', \'min_residual_bytes\', \'min_output_bytes\'], varargs=None, keywords=None, defaults=[\'1\', \'1\', \'0\', \'0\', \'0\', \'0\', \'0\'], "
-  }
-  member_method {
-    name: "trainable_variables_parameter"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_accounted_types"
-    argspec: "args=[\'self\', \'account_type_regexes\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_empty_output"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_file_output"
-    argspec: "args=[\'self\', \'outfile\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_max_depth"
-    argspec: "args=[\'self\', \'max_depth\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_min_execution_time"
-    argspec: "args=[\'self\', \'min_micros\', \'min_accelerator_micros\', \'min_cpu_micros\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\'], "
-  }
-  member_method {
-    name: "with_min_float_operations"
-    argspec: "args=[\'self\', \'min_float_ops\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_min_memory"
-    argspec: "args=[\'self\', \'min_bytes\', \'min_peak_bytes\', \'min_residual_bytes\', \'min_output_bytes\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\'], "
-  }
-  member_method {
-    name: "with_min_occurrence"
-    argspec: "args=[\'self\', \'min_occurrence\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_min_parameters"
-    argspec: "args=[\'self\', \'min_params\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_node_names"
-    argspec: "args=[\'self\', \'start_name_regexes\', \'show_name_regexes\', \'hide_name_regexes\', \'trim_name_regexes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "with_pprof_output"
-    argspec: "args=[\'self\', \'pprof_file\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_stdout_output"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_step"
-    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_timeline_output"
-    argspec: "args=[\'self\', \'timeline_file\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt
deleted file mode 100644
index acb61dae9f0d184ba998aa820ec40de5bc38c3eb..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt
+++ /dev/null
@@ -1,37 +0,0 @@
-path: "tensorflow.profiler.Profiler"
-tf_class {
-  is_instance: "<class \'tensorflow.python.profiler.model_analyzer.Profiler\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'graph\', \'op_log\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_step"
-    argspec: "args=[\'self\', \'step\', \'run_meta\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "advise"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_graph"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_name_scope"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_operations"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_python"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "serialize_to_string"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt
deleted file mode 100644
index 7b4d3ac522abc4229c5623da25c4ec818d86f829..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt
+++ /dev/null
@@ -1,39 +0,0 @@
-path: "tensorflow.profiler"
-tf_module {
-  member {
-    name: "AdviceProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "GraphNodeProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "MultiGraphNodeProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "OpLogProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "ProfileOptionBuilder"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Profiler"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "advise"
-    argspec: "args=[\'graph\', \'run_meta\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
-  }
-  member_method {
-    name: "profile"
-    argspec: "args=[\'graph\', \'run_meta\', \'op_log\', \'cmd\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'scope\', \'0\'], "
-  }
-  member_method {
-    name: "write_op_log"
-    argspec: "args=[\'graph\', \'log_dir\', \'op_log\', \'run_meta\', \'add_trace\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-compression-type.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-compression-type.pbtxt
deleted file mode 100644
index 4941dda50e4964f8400a4cb5033c8e918aeaea5d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-compression-type.pbtxt
+++ /dev/null
@@ -1,20 +0,0 @@
-path: "tensorflow.python_io.TFRecordCompressionType"
-tf_class {
-  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordCompressionType\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GZIP"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "ZLIB"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-options.pbtxt
deleted file mode 100644
index 614ba42d3e0cad3cf468ad38b39ceec92ce6c59c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-options.pbtxt
+++ /dev/null
@@ -1,17 +0,0 @@
-path: "tensorflow.python_io.TFRecordOptions"
-tf_class {
-  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordOptions\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "compression_type_map"
-    mtype: "<type \'dict\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'compression_type\', \'flush_mode\', \'input_buffer_size\', \'output_buffer_size\', \'window_bits\', \'compression_level\', \'compression_method\', \'mem_level\', \'compression_strategy\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_compression_type_string"
-    argspec: "args=[\'cls\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-writer.pbtxt
deleted file mode 100644
index 31775de2d12bcd2f214f5a04be7a92f49c594fde..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.python_io.-t-f-record-writer.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.python_io.TFRecordWriter"
-tf_class {
-  is_instance: "<class \'tensorflow.python.lib.io.tf_record.TFRecordWriter\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'record\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.python_io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.python_io.pbtxt
deleted file mode 100644
index 7c9953e5fe3c883fd5e6e19ae011cc464f4107af..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.python_io.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.python_io"
-tf_module {
-  member {
-    name: "TFRecordCompressionType"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "TFRecordOptions"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "TFRecordWriter"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "tf_record_iterator"
-    argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
index 77c92aeb0dad8898bccc28efba510509d1c351dd..632c2f8f83c8effb188d110bfacaf7f22c0c74cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "quantize"
     argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'HALF_AWAY_FROM_ZERO\', \'None\'], "
   }
+  member_method {
+    name: "quantize_and_dequantize"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'HALF_TO_EVEN\', \'None\'], "
+  }
   member_method {
     name: "quantized_concat"
     argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
index a568dd4cd8a68ec3a3354aad911b370b1bf40cea..d49c23e59cf036f05758f5c50208febf4b7381d5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
@@ -1,37 +1,61 @@
 path: "tensorflow.random"
 tf_module {
+  member_method {
+    name: "all_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fixed_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1.0\', \'0\', \'1\', \'0\', \'()\', \'None\', \'None\'], "
+  }
   member_method {
     name: "gamma"
     argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
-    name: "get_seed"
-    argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
+    name: "learned_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "log_uniform_candidate_sampler"
     argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "normal"
     argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
     name: "poisson"
-    argspec: "args=[\'lam\', \'shape\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
+    argspec: "args=[\'shape\', \'lam\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
-    name: "set_random_seed"
+    name: "set_seed"
     argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "shuffle"
     argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "stateless_categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "stateless_normal"
+    argspec: "args=[\'shape\', \'seed\', \'mean\', \'stddev\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "stateless_truncated_normal"
+    argspec: "args=[\'shape\', \'seed\', \'mean\', \'stddev\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "stateless_uniform"
+    argspec: "args=[\'shape\', \'seed\', \'minval\', \'maxval\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
   member_method {
     name: "truncated_normal"
     argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.resource_loader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.resource_loader.pbtxt
deleted file mode 100644
index 288b78b4cd0ad3f5d5bc1f9c773977d50a6db086..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.resource_loader.pbtxt
+++ /dev/null
@@ -1,23 +0,0 @@
-path: "tensorflow.resource_loader"
-tf_module {
-  member_method {
-    name: "get_data_files_path"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_path_to_datafile"
-    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_root_dir_with_all_resources"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "load_resource"
-    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readahead_file_path"
-    argspec: "args=[\'path\', \'readahead\'], varargs=None, keywords=None, defaults=[\'128M\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.-builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.-builder.pbtxt
deleted file mode 100644
index 67457de070830d45a48230835fc4827e36f70058..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.-builder.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.saved_model.Builder"
-tf_class {
-  is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'export_dir\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "add_meta_graph"
-    argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "add_meta_graph_and_variables"
-    argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "save"
-    argspec: "args=[\'self\', \'as_text\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.-saved-model-builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
deleted file mode 100644
index 83bd7035409534abf036c7e2b0d66fcc060ada3a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.saved_model.builder.SavedModelBuilder"
-tf_class {
-  is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'export_dir\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "add_meta_graph"
-    argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "add_meta_graph_and_variables"
-    argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "save"
-    argspec: "args=[\'self\', \'as_text\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.pbtxt
deleted file mode 100644
index adc697ad1c0bdd0c9b52be736fca3a19a2a82ef3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.builder.pbtxt
+++ /dev/null
@@ -1,7 +0,0 @@
-path: "tensorflow.saved_model.builder"
-tf_module {
-  member {
-    name: "SavedModelBuilder"
-    mtype: "<type \'type\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.constants.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.constants.pbtxt
deleted file mode 100644
index 20e10aa094f704f2168de37abb73f6edf6765f93..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.constants.pbtxt
+++ /dev/null
@@ -1,39 +0,0 @@
-path: "tensorflow.saved_model.constants"
-tf_module {
-  member {
-    name: "ASSETS_DIRECTORY"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "ASSETS_KEY"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LEGACY_INIT_OP_KEY"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "MAIN_OP_KEY"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SAVED_MODEL_FILENAME_PB"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SAVED_MODEL_FILENAME_PBTXT"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SAVED_MODEL_SCHEMA_VERSION"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "VARIABLES_DIRECTORY"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "VARIABLES_FILENAME"
-    mtype: "<type \'str\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.loader.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.loader.pbtxt
deleted file mode 100644
index 511e6b4712d3c55746a39fe9098fa3b649bc75dc..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.loader.pbtxt
+++ /dev/null
@@ -1,11 +0,0 @@
-path: "tensorflow.saved_model.loader"
-tf_module {
-  member_method {
-    name: "load"
-    argspec: "args=[\'sess\', \'tags\', \'export_dir\', \'import_scope\'], varargs=None, keywords=saver_kwargs, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "maybe_saved_model_directory"
-    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.main_op.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.main_op.pbtxt
deleted file mode 100644
index 176cb788c249e68f1221713e96c7e808c39c8f6d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.main_op.pbtxt
+++ /dev/null
@@ -1,11 +0,0 @@
-path: "tensorflow.saved_model.main_op"
-tf_module {
-  member_method {
-    name: "main_op"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "main_op_with_restore"
-    argspec: "args=[\'restore_op_name\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
index 3f4965fc691944dd8957756d4524ae5e2921c4e1..63bebb20bcae08c645d9aaaecab2ea2de4cc49aa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
@@ -1,73 +1,109 @@
 path: "tensorflow.saved_model"
 tf_module {
   member {
-    name: "Builder"
-    mtype: "<type \'type\'>"
+    name: "ASSETS_DIRECTORY"
+    mtype: "<type \'str\'>"
   }
   member {
-    name: "builder"
-    mtype: "<type \'module\'>"
+    name: "ASSETS_KEY"
+    mtype: "<type \'str\'>"
   }
   member {
-    name: "constants"
-    mtype: "<type \'module\'>"
+    name: "CLASSIFY_INPUTS"
+    mtype: "<type \'str\'>"
   }
   member {
-    name: "loader"
-    mtype: "<type \'module\'>"
+    name: "CLASSIFY_METHOD_NAME"
+    mtype: "<type \'str\'>"
   }
   member {
-    name: "main_op"
-    mtype: "<type \'module\'>"
+    name: "CLASSIFY_OUTPUT_CLASSES"
+    mtype: "<type \'str\'>"
   }
   member {
-    name: "signature_constants"
-    mtype: "<type \'module\'>"
+    name: "CLASSIFY_OUTPUT_SCORES"
+    mtype: "<type \'str\'>"
   }
   member {
-    name: "signature_def_utils"
-    mtype: "<type \'module\'>"
+    name: "DEFAULT_SERVING_SIGNATURE_DEF_KEY"
+    mtype: "<type \'str\'>"
   }
   member {
-    name: "tag_constants"
-    mtype: "<type \'module\'>"
+    name: "GPU"
+    mtype: "<type \'str\'>"
   }
   member {
-    name: "utils"
-    mtype: "<type \'module\'>"
+    name: "PREDICT_INPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "PREDICT_METHOD_NAME"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "PREDICT_OUTPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGRESS_INPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGRESS_METHOD_NAME"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "REGRESS_OUTPUTS"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVED_MODEL_FILENAME_PB"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVED_MODEL_FILENAME_PBTXT"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SAVED_MODEL_SCHEMA_VERSION"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "SERVING"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TPU"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "TRANING"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "VARIABLES_DIRECTORY"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "VARIABLES_FILENAME"
+    mtype: "<type \'str\'>"
   }
   member_method {
     name: "build_signature_def"
     argspec: "args=[\'inputs\', \'outputs\', \'method_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "build_tensor_info"
-    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "classification_signature_def"
     argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "get_tensor_from_tensor_info"
-    argspec: "args=[\'tensor_info\', \'graph\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    name: "contains_saved_model"
+    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "is_valid_signature"
     argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "load"
-    argspec: "args=[\'sess\', \'tags\', \'export_dir\', \'import_scope\'], varargs=None, keywords=saver_kwargs, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "main_op_with_restore"
-    argspec: "args=[\'restore_op_name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "maybe_saved_model_directory"
-    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "predict_signature_def"
     argspec: "args=[\'inputs\', \'outputs\'], varargs=None, keywords=None, defaults=None"
@@ -77,7 +113,7 @@ tf_module {
     argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "simple_save"
-    argspec: "args=[\'session\', \'export_dir\', \'inputs\', \'outputs\', \'legacy_init_op\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "save"
+    argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_constants.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_constants.pbtxt
deleted file mode 100644
index 478d410e066b1ce3a17bb3ef9cc6e4503991ad0b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_constants.pbtxt
+++ /dev/null
@@ -1,47 +0,0 @@
-path: "tensorflow.saved_model.signature_constants"
-tf_module {
-  member {
-    name: "CLASSIFY_INPUTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "CLASSIFY_METHOD_NAME"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "CLASSIFY_OUTPUT_CLASSES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "CLASSIFY_OUTPUT_SCORES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "DEFAULT_SERVING_SIGNATURE_DEF_KEY"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "PREDICT_INPUTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "PREDICT_METHOD_NAME"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "PREDICT_OUTPUTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "REGRESS_INPUTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "REGRESS_METHOD_NAME"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "REGRESS_OUTPUTS"
-    mtype: "<type \'str\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_def_utils.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_def_utils.pbtxt
deleted file mode 100644
index a5602464eeb09a290076ef102ed5502ea61b4ac3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.signature_def_utils.pbtxt
+++ /dev/null
@@ -1,23 +0,0 @@
-path: "tensorflow.saved_model.signature_def_utils"
-tf_module {
-  member_method {
-    name: "build_signature_def"
-    argspec: "args=[\'inputs\', \'outputs\', \'method_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "classification_signature_def"
-    argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_valid_signature"
-    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict_signature_def"
-    argspec: "args=[\'inputs\', \'outputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "regression_signature_def"
-    argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.tag_constants.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.tag_constants.pbtxt
deleted file mode 100644
index 6af72498d74d4bbc12e7ca68ad1e0a6f0c237e0a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.tag_constants.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.saved_model.tag_constants"
-tf_module {
-  member {
-    name: "GPU"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SERVING"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TPU"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAINING"
-    mtype: "<type \'str\'>"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.utils.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.utils.pbtxt
deleted file mode 100644
index d95c94668250e1de236462ccdcb134245eebf092..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.utils.pbtxt
+++ /dev/null
@@ -1,11 +0,0 @@
-path: "tensorflow.saved_model.utils"
-tf_module {
-  member_method {
-    name: "build_tensor_info"
-    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_tensor_from_tensor_info"
-    argspec: "args=[\'tensor_info\', \'graph\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
index 8a196b1a556e283671cc75af28df3eaa62532975..900d08ff47ca062fdda4f0f2f6ac20ee9822d1df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
@@ -1,19 +1,19 @@
 path: "tensorflow.sets"
 tf_module {
   member_method {
-    name: "set_difference"
+    name: "difference"
     argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
   }
   member_method {
-    name: "set_intersection"
+    name: "intersection"
     argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
   member_method {
-    name: "set_size"
+    name: "size"
     argspec: "args=[\'a\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
   member_method {
-    name: "set_union"
+    name: "union"
     argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea717b4d719d6709e05182faca964ae544abc39c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt
@@ -0,0 +1,95 @@
+path: "tensorflow.signal"
+tf_module {
+  member_method {
+    name: "dct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "fft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "frame"
+    argspec: "args=[\'signal\', \'frame_length\', \'frame_step\', \'pad_end\', \'pad_value\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "hamming_window"
+    argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "hann_window"
+    argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "idct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "ifft"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft2d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "ifft3d"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "inverse_stft"
+    argspec: "args=[\'stfts\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'None\'], "
+  }
+  member_method {
+    name: "inverse_stft_window_fn"
+    argspec: "args=[\'frame_step\', \'forward_window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'<function hann_window instance>\', \'None\'], "
+  }
+  member_method {
+    name: "irfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "linear_to_mel_weight_matrix"
+    argspec: "args=[\'num_mel_bins\', \'num_spectrogram_bins\', \'sample_rate\', \'lower_edge_hertz\', \'upper_edge_hertz\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'20\', \'129\', \'8000\', \'125.0\', \'3800.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "mfccs_from_log_mel_spectrograms"
+    argspec: "args=[\'log_mel_spectrograms\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "overlap_and_add"
+    argspec: "args=[\'signal\', \'frame_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "rfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "stft"
+    argspec: "args=[\'signals\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'pad_end\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'False\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
index 32bd8d5f8edb24ee1f5a5672487499337bd1c0dd..b8bd2c0b72c1a78fb2abbfb319073fec267f56fb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
@@ -10,11 +10,11 @@ tf_module {
   }
   member_method {
     name: "add"
-    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'a\', \'b\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'axis\', \'sp_inputs\', \'expand_nonconcat_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "cross"
@@ -40,41 +40,21 @@ tf_module {
     name: "mask"
     argspec: "args=[\'a\', \'mask_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "matmul"
-    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
-  }
   member_method {
     name: "maximum"
     argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "merge"
-    argspec: "args=[\'sp_ids\', \'sp_values\', \'vocab_size\', \'name\', \'already_sorted\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
-  }
   member_method {
     name: "minimum"
     argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "placeholder"
-    argspec: "args=[\'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
   member_method {
     name: "reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'output_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_sum"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "reduce_sum_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'output_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reorder"
@@ -94,15 +74,15 @@ tf_module {
   }
   member_method {
     name: "segment_mean"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "segment_sqrt_n"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "segment_sum"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "slice"
@@ -112,9 +92,13 @@ tf_module {
     name: "softmax"
     argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sparse_dense_matmul"
+    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "split"
-    argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'num_split\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "to_dense"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt
deleted file mode 100644
index 6a421ef12d58dc047905ec916cbe777b4ce19b9a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt
+++ /dev/null
@@ -1,59 +0,0 @@
-path: "tensorflow.spectral"
-tf_module {
-  member_method {
-    name: "dct"
-    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "fft"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "fft2d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "fft3d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "idct"
-    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "ifft"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "ifft2d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "ifft3d"
-    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "irfft"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "irfft2d"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "irfft3d"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "rfft"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "rfft2d"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "rfft3d"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index 5ba48e7f571434e80cc0de3c3cc425e7a147f80d..f6e32ed08c8339413374c11c6fc75aec92bffec2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -10,11 +10,11 @@ tf_module {
   }
   member_method {
     name: "length"
-    argspec: "args=[\'input\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
+    argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
   member_method {
     name: "reduce_join"
-    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+    argspec: "args=[\'inputs\', \'axis\', \'keepdims\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\'], "
   }
   member_method {
     name: "regex_full_match"
@@ -34,11 +34,11 @@ tf_module {
   }
   member_method {
     name: "substr"
-    argspec: "args=[\'input\', \'pos\', \'len\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
+    argspec: "args=[\'input\', \'pos\', \'len\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
   member_method {
     name: "to_hash_bucket"
-    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "to_hash_bucket_fast"
@@ -50,10 +50,18 @@ tf_module {
   }
   member_method {
     name: "to_number"
-    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "unicode_encode"
+    argspec: "args=[\'input\', \'output_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
   }
   member_method {
     name: "unicode_script"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unicode_transcode"
+    argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt
deleted file mode 100644
index 73de73869c8d1a6808b16fe8853fd21cc8891879..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt
+++ /dev/null
@@ -1,44 +0,0 @@
-path: "tensorflow.summary.SessionLog"
-tf_proto {
-  descriptor {
-    name: "SessionLog"
-    field {
-      name: "status"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.SessionLog.SessionStatus"
-    }
-    field {
-      name: "checkpoint_path"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "msg"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    enum_type {
-      name: "SessionStatus"
-      value {
-        name: "STATUS_UNSPECIFIED"
-        number: 0
-      }
-      value {
-        name: "START"
-        number: 1
-      }
-      value {
-        name: "STOP"
-        number: 2
-      }
-      value {
-        name: "CHECKPOINT"
-        number: 3
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6715c14e168d6a30ce8aa35470525521069de40a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
@@ -0,0 +1,29 @@
+path: "tensorflow.summary.SummaryWriter"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.summary_ops_v2.SummaryWriter\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'resource\', \'init_op_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "as_default"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "init"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_as_default"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index 7ed9cd77a01c2eadb5ea43a02306d60d505127a0..5cf4d7cfd9ac54eeccea5094ad789aede29540b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -12,10 +12,6 @@ tf_module {
     name: "FileWriterCache"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SessionLog"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "Summary"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -24,44 +20,24 @@ tf_module {
     name: "SummaryDescription"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "SummaryWriter"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TaggedRunMetadata"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
   member_method {
-    name: "audio"
-    argspec: "args=[\'name\', \'tensor\', \'sample_rate\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_summary_description"
-    argspec: "args=[\'node_def\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "histogram"
-    argspec: "args=[\'name\', \'values\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "image"
-    argspec: "args=[\'name\', \'tensor\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "merge"
-    argspec: "args=[\'inputs\', \'collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "merge_all"
-    argspec: "args=[\'key\', \'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "scalar"
-    argspec: "args=[\'name\', \'tensor\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    name: "create_file_writer"
+    argspec: "args=[\'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "tensor_summary"
-    argspec: "args=[\'name\', \'tensor\', \'summary_description\', \'collections\', \'summary_metadata\', \'family\', \'display_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    name: "flush"
+    argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
-    name: "text"
-    argspec: "args=[\'name\', \'tensor\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "import_event"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt
index 2f00aeac25f691d9767080251798248281e5edf5..811ca18cdb4e9c7a830bb3d7e8af45b341fb6a35 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.sysconfig"
 tf_module {
+  member {
+    name: "CXX11_ABI_FLAG"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "MONOLITHIC_BUILD"
+    mtype: "<type \'int\'>"
+  }
   member_method {
     name: "get_compile_flags"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
index df528e26b60f8d8ddcc1eaf0ed292cc7ff0ebd94..6fc489c86043d074ac832d0ec9dbefd2cbbb4f19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
@@ -6,6 +6,10 @@ tf_class {
   member_method {
     name: "__init__"
   }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_abstract"
     argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt
deleted file mode 100644
index e02a0c6097c5ea4dae905b25cd0e381f5e257105..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt
+++ /dev/null
@@ -1,28 +0,0 @@
-path: "tensorflow.test.StubOutForTesting"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.googletest.StubOutForTesting\'>"
-  member_method {
-    name: "CleanUp"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Set"
-    argspec: "args=[\'self\', \'parent\', \'child_name\', \'new_child\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "SmartSet"
-    argspec: "args=[\'self\', \'obj\', \'attr_name\', \'new_attr\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "SmartUnsetAll"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "UnsetAll"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
index abe9b068ae95c08a2b72c9a5e164a097e6162dff..980e96ac254aebf229ae52d98f607ed87d334e7a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
@@ -4,38 +4,22 @@ tf_module {
     name: "Benchmark"
     mtype: "<class \'tensorflow.python.platform.benchmark._BenchmarkRegistrar\'>"
   }
-  member {
-    name: "StubOutForTesting"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "TestCase"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "mock"
-    mtype: "<type \'module\'>"
-  }
   member_method {
     name: "assert_equal_graph_def"
-    argspec: "args=[\'actual\', \'expected\', \'checkpoint_v2\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "compute_gradient"
-    argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], "
+    argspec: "args=[\'actual\', \'expected\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "compute_gradient_error"
-    argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], "
+    name: "benchmark_config"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "create_local_cluster"
     argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\', \'worker_config\', \'ps_config\'], varargs=None, keywords=None, defaults=[\'grpc\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "get_temp_dir"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "gpu_device_name"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -52,8 +36,4 @@ tf_module {
     name: "main"
     argspec: "args=[\'argv\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "test_src_dir_path"
-    argspec: "args=[\'relative_path\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adadelta-optimizer.pbtxt
deleted file mode 100644
index 1f1d8b6f9e2cde4800cdef9c417191b1a0ce07b5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-adadelta-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.AdadeltaOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.adadelta.AdadeltaOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.95\', \'1e-08\', \'False\', \'Adadelta\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
deleted file mode 100644
index a7c05d484905a0af26c80a52d92623ef4a3eb6c4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.AdagradDAOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.adagrad_da.AdagradDAOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'global_step\', \'initial_gradient_squared_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'0.0\', \'0.0\', \'False\', \'AdagradDA\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-optimizer.pbtxt
deleted file mode 100644
index bc8b92389c6ed7dcb0fa23ff3abd86bb0d1c488a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.AdagradOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.adagrad.AdagradOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'False\', \'Adagrad\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adam-optimizer.pbtxt
deleted file mode 100644
index 5d17be9378fd130b89e199544f85e03a23a71d3c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-adam-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.AdamOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.adam.AdamOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'beta1\', \'beta2\', \'epsilon\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-08\', \'False\', \'Adam\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2538de661b357245ad18d9e1c4fc88d2e80eaeb0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.train.CheckpointManager"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.checkpoint_management.CheckpointManager\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "checkpoints"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "latest_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'checkpoint\', \'directory\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'checkpoint_number\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt
deleted file mode 100644
index c3037baa8c951ecd9b60267ee7cc8674ead88dbe..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.CheckpointSaverHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-listener.pbtxt
deleted file mode 100644
index 9d3688e565761758e765d00086de8b59dcc3801b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-listener.pbtxt
+++ /dev/null
@@ -1,24 +0,0 @@
-path: "tensorflow.train.CheckpointSaverListener"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverListener\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "after_save"
-    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_save"
-    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-chief-session-creator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-chief-session-creator.pbtxt
deleted file mode 100644
index abbe273be32c6fd20b1a6464f3e99966bd3c8953..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-chief-session-creator.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.train.ChiefSessionCreator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.ChiefSessionCreator\'>"
-  is_instance: "<class \'tensorflow.python.training.monitored_session.SessionCreator\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\', \'checkpoint_dir\', \'checkpoint_filename_with_path\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "create_session"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt
deleted file mode 100644
index 7bec4d032cedc0711ca07049d5d04490e8bc3f30..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.FeedFnHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FeedFnHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'feed_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt
deleted file mode 100644
index 31cf9aaeb2c640f8db205c0753f20acc75338fe0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-path: "tensorflow.train.FinalOpsHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FinalOpsHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "final_ops_values"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'final_ops\', \'final_ops_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-ftrl-optimizer.pbtxt
deleted file mode 100644
index d265fdeb01c38d8a1347e630d7f7bff111999634..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-ftrl-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.FtrlOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.ftrl.FtrlOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\', \'accum_name\', \'linear_name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=None, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'False\', \'Ftrl\', \'None\', \'None\', \'0.0\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt
deleted file mode 100644
index 147448618e2df9f71ac794e369b108629e10ce0a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.GlobalStepWaiterHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.GlobalStepWaiterHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'wait_until_step\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-gradient-descent-optimizer.pbtxt
deleted file mode 100644
index c673e29cd4dd6cd3c01582abfbc306c092818892..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-gradient-descent-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.GradientDescentOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.gradient_descent.GradientDescentOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'GradientDescent\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt
deleted file mode 100644
index 9801c05df181ee65cc8ce0ad2e886566c0145fd5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.LoggingTensorHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.LoggingTensorHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'tensors\', \'every_n_iter\', \'every_n_secs\', \'at_end\', \'formatter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-looper-thread.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-looper-thread.pbtxt
deleted file mode 100644
index c61859004e897a14b580dc0b55957edfa6ae6860..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-looper-thread.pbtxt
+++ /dev/null
@@ -1,73 +0,0 @@
-path: "tensorflow.train.LooperThread"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.coordinator.LooperThread\'>"
-  is_instance: "<class \'threading.Thread\'>"
-  member {
-    name: "daemon"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ident"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'coord\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "getName"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "isAlive"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "isDaemon"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_alive"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "join"
-    argspec: "args=[\'self\', \'timeout\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "loop"
-    argspec: "args=[\'coord\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "run"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run_loop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "setDaemon"
-    argspec: "args=[\'self\', \'daemonic\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "setName"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "start"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "start_loop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "stop_loop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-momentum-optimizer.pbtxt
deleted file mode 100644
index 8199f63b9b8c64c73a3d62294277838cdc240280..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-momentum-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.MomentumOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.momentum.MomentumOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'use_locking\', \'name\', \'use_nesterov\'], varargs=None, keywords=None, defaults=[\'False\', \'Momentum\', \'False\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.-step-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.-step-context.pbtxt
deleted file mode 100644
index 03efe6639e0e3d2c6c280bd30d2b59b5d654f995..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.-step-context.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.train.MonitoredSession.StepContext"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.StepContext\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "session"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'session\', \'run_with_hooks_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "request_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run_with_hooks"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.pbtxt
deleted file mode 100644
index 09b7b3fb538fb8d87dcfd622089818081a1fb79b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-path: "tensorflow.train.MonitoredSession"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.MonitoredSession\'>"
-  is_instance: "<class \'tensorflow.python.training.monitored_session._MonitoredSession\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "StepContext"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'session_creator\', \'hooks\', \'stop_grace_period_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'120\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run"
-    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "run_step_fn"
-    argspec: "args=[\'self\', \'step_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "should_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-loss-during-training-error.pbtxt
deleted file mode 100644
index 25fd5e75a79f6e4fe2cf77ebc7aa0d1fef759e7f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-loss-during-training-error.pbtxt
+++ /dev/null
@@ -1,16 +0,0 @@
-path: "tensorflow.train.NanLossDuringTrainingError"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError\'>"
-  is_instance: "<type \'exceptions.RuntimeError\'>"
-  member {
-    name: "args"
-    mtype: "<type \'getset_descriptor\'>"
-  }
-  member {
-    name: "message"
-    mtype: "<type \'getset_descriptor\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt
deleted file mode 100644
index 7d1c89f9b37b5e63ecf2cf766986cb8faa5872c4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.NanTensorHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanTensorHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'loss_tensor\', \'fail_on_nan_loss\'], varargs=None, keywords=None, defaults=[\'True\'], "
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-optimizer.pbtxt
deleted file mode 100644
index 876bb35e391885e751066a415967af848280c714..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-optimizer.pbtxt
+++ /dev/null
@@ -1,50 +0,0 @@
-path: "tensorflow.train.Optimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt
deleted file mode 100644
index 4df6c4156a8bfe6d3bc0fb6746512cb3025c2604..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.ProfilerHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.ProfilerHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'show_dataflow\', \'show_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'True\', \'False\'], "
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
deleted file mode 100644
index 14349a74efb61124fc7b5568d5ec023f08b1b62f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.ProximalAdagradOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.proximal_adagrad.ProximalAdagradOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'0.0\', \'0.0\', \'False\', \'ProximalAdagrad\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
deleted file mode 100644
index 906384a2875bf7b05ac26fc43207f4ef9b5a7472..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.RMSPropOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.rmsprop.RMSPropOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'decay\', \'momentum\', \'epsilon\', \'use_locking\', \'centered\', \'name\'], varargs=None, keywords=None, defaults=[\'0.9\', \'0.0\', \'1e-10\', \'False\', \'False\', \'RMSProp\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-saver-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-saver-def.pbtxt
deleted file mode 100644
index 4ec99469e4025603e7ab340b190cbebf7e33eed7..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-saver-def.pbtxt
+++ /dev/null
@@ -1,64 +0,0 @@
-path: "tensorflow.train.SaverDef"
-tf_proto {
-  descriptor {
-    name: "SaverDef"
-    field {
-      name: "filename_tensor_name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "save_tensor_name"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "restore_op_name"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "max_to_keep"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "sharded"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "keep_checkpoint_every_n_hours"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-    }
-    field {
-      name: "version"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.SaverDef.CheckpointFormatVersion"
-    }
-    enum_type {
-      name: "CheckpointFormatVersion"
-      value {
-        name: "LEGACY"
-        number: 0
-      }
-      value {
-        name: "V1"
-        number: 1
-      }
-      value {
-        name: "V2"
-        number: 2
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-saver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-saver.pbtxt
deleted file mode 100644
index 2cda458f468b2d748b43954b14b670df7145243f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-saver.pbtxt
+++ /dev/null
@@ -1,53 +0,0 @@
-path: "tensorflow.train.Saver"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.saver.Saver\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "last_checkpoints"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'var_list\', \'reshape\', \'sharded\', \'max_to_keep\', \'keep_checkpoint_every_n_hours\', \'name\', \'restore_sequentially\', \'saver_def\', \'builder\', \'defer_build\', \'allow_empty\', \'write_version\', \'pad_step_number\', \'save_relative_paths\', \'filename\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\', \'5\', \'10000.0\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'2\', \'False\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "as_saver_def"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "export_meta_graph"
-    argspec: "args=[\'self\', \'filename\', \'collection_list\', \'as_text\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\', \'False\', \'False\'], "
-  }
-  member_method {
-    name: "from_proto"
-    argspec: "args=[\'saver_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "recover_last_checkpoints"
-    argspec: "args=[\'self\', \'checkpoint_paths\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "restore"
-    argspec: "args=[\'self\', \'sess\', \'save_path\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "save"
-    argspec: "args=[\'self\', \'sess\', \'save_path\', \'global_step\', \'latest_filename\', \'meta_graph_suffix\', \'write_meta_graph\', \'write_state\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'meta\', \'True\', \'True\', \'False\'], "
-  }
-  member_method {
-    name: "set_last_checkpoints"
-    argspec: "args=[\'self\', \'last_checkpoints\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_last_checkpoints_with_time"
-    argspec: "args=[\'self\', \'last_checkpoints_with_time\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "to_proto"
-    argspec: "args=[\'self\', \'export_scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-scaffold.pbtxt
deleted file mode 100644
index 38cc98b48e78aa93f7614a9baff236f7b119f99d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-scaffold.pbtxt
+++ /dev/null
@@ -1,53 +0,0 @@
-path: "tensorflow.train.Scaffold"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.Scaffold\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "init_feed_dict"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "init_fn"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "local_init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ready_for_local_init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ready_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "saver"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "summary_op"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'init_op\', \'init_feed_dict\', \'init_fn\', \'ready_op\', \'ready_for_local_init_op\', \'local_init_op\', \'summary_op\', \'saver\', \'copy_from_scaffold\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "default_local_init_op"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_or_default"
-    argspec: "args=[\'arg_name\', \'collection_key\', \'default_constructor\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-second-or-step-timer.pbtxt
deleted file mode 100644
index 3c5a6ac13cc2d8a4d464ab48da6edaa0a9ccc14b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-second-or-step-timer.pbtxt
+++ /dev/null
@@ -1,26 +0,0 @@
-path: "tensorflow.train.SecondOrStepTimer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SecondOrStepTimer\'>"
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks._HookTimer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'every_secs\', \'every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "last_triggered_step"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reset"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "should_trigger_for_step"
-    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "update_last_triggered_step"
-    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-creator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-creator.pbtxt
deleted file mode 100644
index beb232715f725047dd8c03054b899a90fa81eec2..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-creator.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.train.SessionCreator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.SessionCreator\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "create_session"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-manager.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-manager.pbtxt
deleted file mode 100644
index 448764fe081b250e1e22633f118268ad638cb9dd..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-manager.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.train.SessionManager"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.session_manager.SessionManager\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'local_init_op\', \'ready_op\', \'ready_for_local_init_op\', \'graph\', \'recovery_wait_secs\', \'local_init_run_options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'30\', \'None\'], "
-  }
-  member_method {
-    name: "prepare_session"
-    argspec: "args=[\'self\', \'master\', \'init_op\', \'saver\', \'checkpoint_dir\', \'checkpoint_filename_with_path\', \'wait_for_checkpoint\', \'max_wait_secs\', \'config\', \'init_feed_dict\', \'init_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'7200\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recover_session"
-    argspec: "args=[\'self\', \'master\', \'saver\', \'checkpoint_dir\', \'checkpoint_filename_with_path\', \'wait_for_checkpoint\', \'max_wait_secs\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'7200\', \'None\'], "
-  }
-  member_method {
-    name: "wait_for_session"
-    argspec: "args=[\'self\', \'master\', \'config\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'inf\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-args.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-args.pbtxt
deleted file mode 100644
index 442990893e33c92bd05a72b198a6584bc979b2fe..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-args.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.train.SessionRunArgs"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "feed_dict"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "fetches"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "options"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-context.pbtxt
deleted file mode 100644
index d5adb15c95f8a6ebde4ca0e0c535dfebc5edfbf2..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-context.pbtxt
+++ /dev/null
@@ -1,25 +0,0 @@
-path: "tensorflow.train.SessionRunContext"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunContext\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "original_args"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "session"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "stop_requested"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'original_args\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "request_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-values.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-values.pbtxt
deleted file mode 100644
index 0b401d59c400f1d08f47daa2d264a9a5bfc91538..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-values.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.train.SessionRunValues"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "options"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "results"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "run_metadata"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.-step-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
deleted file mode 100644
index 36d8ce7ff82e02300b59705400be40d7cc3f65ae..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.train.SingularMonitoredSession.StepContext"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.StepContext\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "session"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'session\', \'run_with_hooks_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "request_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run_with_hooks"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.pbtxt
deleted file mode 100644
index de0f2c1c1a2497ef4e541ee6583d416e31f48826..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.train.SingularMonitoredSession"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.SingularMonitoredSession\'>"
-  is_instance: "<class \'tensorflow.python.training.monitored_session._MonitoredSession\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "StepContext"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'hooks\', \'scaffold\', \'master\', \'config\', \'checkpoint_dir\', \'stop_grace_period_secs\', \'checkpoint_filename_with_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'None\', \'None\', \'120\', \'None\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "raw_session"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run"
-    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "run_step_fn"
-    argspec: "args=[\'self\', \'step_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "should_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt
deleted file mode 100644
index 13261f6dde1cf8e6fd228950600303370947b7ea..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.StepCounterHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StepCounterHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'every_n_steps\', \'every_n_secs\', \'output_dir\', \'summary_writer\'], varargs=None, keywords=None, defaults=[\'100\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt
deleted file mode 100644
index e388599b0bf63379fa95a3276e3f4859eab86d6d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.StopAtStepHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StopAtStepHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'num_steps\', \'last_step\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt
deleted file mode 100644
index 697c3667b09f42f208dec38938f5a1ce0cc09029..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.train.SummarySaverHook"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SummarySaverHook\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'summary_writer\', \'scaffold\', \'summary_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "after_create_session"
-    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "after_run"
-    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "before_run"
-    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "begin"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "end"
-    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-supervisor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-supervisor.pbtxt
deleted file mode 100644
index 9677e5a98e4a8308093f51a84d8b1edae405cd2b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-supervisor.pbtxt
+++ /dev/null
@@ -1,153 +0,0 @@
-path: "tensorflow.train.Supervisor"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.supervisor.Supervisor\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "USE_DEFAULT"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "coord"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "global_step"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "init_feed_dict"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "is_chief"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ready_for_local_init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ready_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "save_model_secs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "save_path"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "save_summaries_secs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "saver"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "session_manager"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "summary_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "summary_writer"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "Loop"
-    argspec: "args=[\'self\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "PrepareSession"
-    argspec: "args=[\'self\', \'master\', \'config\', \'wait_for_checkpoint\', \'max_wait_secs\', \'start_standard_services\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'False\', \'7200\', \'True\'], "
-  }
-  member_method {
-    name: "RequestStop"
-    argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "ShouldStop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "StartQueueRunners"
-    argspec: "args=[\'self\', \'sess\', \'queue_runners\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "StartStandardServices"
-    argspec: "args=[\'self\', \'sess\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Stop"
-    argspec: "args=[\'self\', \'threads\', \'close_summary_writer\', \'ignore_live_threads\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'False\'], "
-  }
-  member_method {
-    name: "StopOnException"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "SummaryComputed"
-    argspec: "args=[\'self\', \'sess\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "WaitForStop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'graph\', \'ready_op\', \'ready_for_local_init_op\', \'is_chief\', \'init_op\', \'init_feed_dict\', \'local_init_op\', \'logdir\', \'summary_op\', \'saver\', \'global_step\', \'save_summaries_secs\', \'save_model_secs\', \'recovery_wait_secs\', \'stop_grace_secs\', \'checkpoint_basename\', \'session_manager\', \'summary_writer\', \'init_fn\', \'local_init_run_options\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'0\', \'True\', \'0\', \'None\', \'0\', \'None\', \'0\', \'0\', \'0\', \'120\', \'600\', \'30\', \'120\', \'model.ckpt\', \'None\', \'0\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "loop"
-    argspec: "args=[\'self\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "managed_session"
-    argspec: "args=[], varargs=args, keywords=kwds, defaults=None"
-  }
-  member_method {
-    name: "prepare_or_wait_for_session"
-    argspec: "args=[\'self\', \'master\', \'config\', \'wait_for_checkpoint\', \'max_wait_secs\', \'start_standard_services\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'False\', \'7200\', \'True\'], "
-  }
-  member_method {
-    name: "request_stop"
-    argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "should_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "start_queue_runners"
-    argspec: "args=[\'self\', \'sess\', \'queue_runners\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "start_standard_services"
-    argspec: "args=[\'self\', \'sess\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "stop"
-    argspec: "args=[\'self\', \'threads\', \'close_summary_writer\', \'ignore_live_threads\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'False\'], "
-  }
-  member_method {
-    name: "stop_on_exception"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "summary_computed"
-    argspec: "args=[\'self\', \'sess\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "wait_for_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt
deleted file mode 100644
index 2c0fda3c72b7e1f02265827b9dc1929500935cd1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-sync-replicas-optimizer.pbtxt
+++ /dev/null
@@ -1,63 +0,0 @@
-path: "tensorflow.train.SyncReplicasOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.sync_replicas_optimizer.SyncReplicasOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'opt\', \'replicas_to_aggregate\', \'total_num_replicas\', \'variable_averages\', \'variables_to_average\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'sync_replicas\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "get_chief_queue_runner"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_init_tokens_op"
-    argspec: "args=[\'self\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "make_session_run_hook"
-    argspec: "args=[\'self\', \'is_chief\', \'num_tokens\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-vocab-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-vocab-info.pbtxt
deleted file mode 100644
index 39b946b82f3d5caadbdeac6253e5554df69a2776..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-vocab-info.pbtxt
+++ /dev/null
@@ -1,43 +0,0 @@
-path: "tensorflow.train.VocabInfo"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.warm_starting_util.VocabInfo\'>"
-  is_instance: "<class \'tensorflow.python.training.warm_starting_util.VocabInfo\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "axis"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "backup_initializer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "new_vocab"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "new_vocab_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "num_oov_buckets"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "old_vocab"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "old_vocab_size"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-worker-session-creator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-worker-session-creator.pbtxt
deleted file mode 100644
index ac263580687e53bb3fcffd5268f73f8b67aa43a1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-worker-session-creator.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.train.WorkerSessionCreator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.WorkerSessionCreator\'>"
-  is_instance: "<class \'tensorflow.python.training.monitored_session.SessionCreator\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\', \'1800\'], "
-  }
-  member_method {
-    name: "create_session"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index 7e980fe44d29af7a54101f95f3457e812fccfcff..8c327f88f32357bc15b1cdcbbc2ffad674063f6b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -1,21 +1,5 @@
 path: "tensorflow.train"
 tf_module {
-  member {
-    name: "AdadeltaOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "AdagradDAOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "AdagradOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "AdamOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "BytesList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -25,15 +9,7 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "CheckpointSaverHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "CheckpointSaverListener"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ChiefSessionCreator"
+    name: "CheckpointManager"
     mtype: "<type \'type\'>"
   }
   member {
@@ -72,30 +48,10 @@ tf_module {
     name: "Features"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "FeedFnHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FinalOpsHook"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "FloatList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "FtrlOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GlobalStepWaiterHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GradientDescentOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Int64List"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -104,66 +60,10 @@ tf_module {
     name: "JobDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "LoggingTensorHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "LooperThread"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "MomentumOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "MonitoredSession"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "NanLossDuringTrainingError"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "NanTensorHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Optimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ProfilerHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ProximalAdagradOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ProximalGradientDescentOptimizer"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "RMSPropOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Saver"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SaverDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "Scaffold"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SecondOrStepTimer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SequenceExample"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -176,86 +76,10 @@ tf_module {
     name: "ServerDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "SessionCreator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SessionManager"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SessionRunArgs"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SessionRunContext"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SessionRunHook"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SessionRunValues"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SingularMonitoredSession"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "StepCounterHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "StopAtStepHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SummarySaverHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Supervisor"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SyncReplicasOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "VocabInfo"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "WorkerSessionCreator"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "MonitoredTrainingSession"
-    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\', \'max_wait_secs\', \'save_checkpoint_steps\', \'summary_dir\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'<object object instance>\', \'<object object instance>\', \'<object object instance>\', \'None\', \'120\', \'100\', \'7200\', \'<object object instance>\', \'None\'], "
-  }
-  member_method {
-    name: "NewCheckpointReader"
-    argspec: "args=[\'filepattern\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "assert_global_step"
-    argspec: "args=[\'global_step_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "basic_train_loop"
-    argspec: "args=[\'supervisor\', \'train_step_fn\', \'args\', \'kwargs\', \'master\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\'], "
-  }
-  member_method {
-    name: "checkpoint_exists"
-    argspec: "args=[\'checkpoint_prefix\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "confusion_matrix"
-    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'dtype\', \'name\', \'weights\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\", \'None\', \'None\'], "
-  }
   member_method {
     name: "cosine_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
@@ -264,50 +88,14 @@ tf_module {
     name: "cosine_decay_restarts"
     argspec: "args=[\'learning_rate\', \'global_step\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
   }
-  member_method {
-    name: "create_global_step"
-    argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "exponential_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
-  member_method {
-    name: "export_meta_graph"
-    argspec: "args=[\'filename\', \'meta_info_def\', \'graph_def\', \'saver_def\', \'collection_list\', \'as_text\', \'graph\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'False\'], "
-  }
-  member_method {
-    name: "generate_checkpoint_state_proto"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_checkpoint_mtimes"
-    argspec: "args=[\'checkpoint_prefixes\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "get_checkpoint_state"
     argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "get_global_step"
-    argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_or_create_global_step"
-    argspec: "args=[\'graph\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "global_step"
-    argspec: "args=[\'sess\', \'global_step_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "import_meta_graph"
-    argspec: "args=[\'meta_graph_or_file\', \'clear_devices\', \'import_scope\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "init_from_checkpoint"
-    argspec: "args=[\'ckpt_dir_or_file\', \'assignment_map\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "inverse_time_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -332,10 +120,6 @@ tf_module {
     name: "load_variable"
     argspec: "args=[\'ckpt_dir_or_file\', \'name\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "match_filenames_once"
-    argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "natural_exp_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -345,21 +129,13 @@ tf_module {
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'initial_variance\', \'variance_decay\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0.55\', \'0.5\', \'0.0\', \'0.001\', \'None\'], "
   }
   member_method {
-    name: "piecewise_constant"
+    name: "piecewise_constant_decay"
     argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "polynomial_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
   }
-  member_method {
-    name: "remove_checkpoint"
-    argspec: "args=[\'checkpoint_prefix\', \'checkpoint_format_version\', \'meta_graph_suffix\'], varargs=None, keywords=None, defaults=[\'2\', \'meta\'], "
-  }
-  member_method {
-    name: "replica_device_setter"
-    argspec: "args=[\'ps_tasks\', \'ps_device\', \'worker_device\', \'merge_devices\', \'cluster\', \'ps_ops\', \'ps_strategy\'], varargs=None, keywords=None, defaults=[\'0\', \'/job:ps\', \'/job:worker\', \'True\', \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "sdca_fprint"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -376,16 +152,4 @@ tf_module {
     name: "summary_iterator"
     argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_checkpoint_state"
-    argspec: "args=[\'save_dir\', \'model_checkpoint_path\', \'all_model_checkpoint_paths\', \'latest_filename\', \'all_model_checkpoint_timestamps\', \'last_preserved_timestamp\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "warm_start"
-    argspec: "args=[\'ckpt_to_initialize_from\', \'vars_to_warm_start\', \'var_name_to_vocab_info\', \'var_name_to_prev_var_name\'], varargs=None, keywords=None, defaults=[\'.*\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "write_graph"
-    argspec: "args=[\'graph_or_graph_def\', \'logdir\', \'name\', \'as_text\'], varargs=None, keywords=None, defaults=[\'True\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.uniform_unit_scaling_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.uniform_unit_scaling_initializer.pbtxt
deleted file mode 100644
index e1b18dc92fbee9565dba81e8c09534bea6734f23..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.uniform_unit_scaling_initializer.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.uniform_unit_scaling_initializer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.UniformUnitScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'factor\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.variance_scaling_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.variance_scaling_initializer.pbtxt
deleted file mode 100644
index 09d7bc03b4f238923db6778ec32ce78ae76eed61..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.variance_scaling_initializer.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.variance_scaling_initializer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'scale\', \'mode\', \'distribution\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'fan_in\', \'truncated_normal\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.version.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.version.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd4506cb0b38809ea65b3f11c2c79fa40831dc57
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.version.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.version"
+tf_module {
+  member {
+    name: "COMPILER_VERSION"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "GIT_VERSION"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "GRAPH_DEF_VERSION"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_DEF_VERSION_MIN_CONSUMER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "GRAPH_DEF_VERSION_MIN_PRODUCER"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "VERSION"
+    mtype: "<type \'str\'>"
+  }
+}
diff --git a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
index 3a48cf683c908021a6a87849601227283a8e2034..5102066730533c717a029c6fd52ef0e2d10a520d 100644
--- a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
+++ b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+import enum
 from google.protobuf import message
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import tf_decorator
@@ -27,11 +29,62 @@ from tensorflow.tools.api.lib import api_objects_pb2
 
 # Following object need to be handled individually.
 _CORNER_CASES = {
-    '': {'tools': {}},
+    '': {
+        'tools': {}
+    },
     'test.TestCase': {},
     'test.TestCase.failureException': {},
+    'train.NanLossDuringTrainingError': {
+        'message': {}
+    },
+    'estimator.NanLossDuringTrainingError': {
+        'message': {}
+    },
 }
 
+# Python 2 vs. 3 differences
+if sys.version_info.major == 3:
+  _NORMALIZE_TYPE = {}
+  for t in ('property', 'object', 'getset_descriptor', 'int', 'str', 'type',
+            'tuple', 'module', 'collections.defaultdict', 'set', 'dict',
+            'NoneType', 'frozenset'):
+    _NORMALIZE_TYPE["<class '%s'>" % t] = "<type '%s'>" % t
+  for e in 'Exception', 'RuntimeError':
+    _NORMALIZE_TYPE["<class '%s'>" % e] = "<type 'exceptions.%s'>" % e
+  _NORMALIZE_TYPE["<class 'abc.ABCMeta'>"] = "<type 'type'>"
+  _NORMALIZE_ISINSTANCE = {
+      "<class "
+      "'tensorflow.lite.python.op_hint.OpHint.OpHintArgumentTracker'>":  # pylint: disable=line-too-long
+          "<class "
+          "'tensorflow.lite.python.op_hint.OpHintArgumentTracker'>",
+      "<class "
+      "'tensorflow.python.training.monitored_session._MonitoredSession.StepContext'>":  # pylint: disable=line-too-long
+          "<class "
+          "'tensorflow.python.training.monitored_session.StepContext'>",
+      "<class "
+      "'tensorflow.python.ops.variables.Variable.SaveSliceInfo'>":
+          "<class "
+          "'tensorflow.python.ops.variables.SaveSliceInfo'>"
+  }
+
+  def _SkipMember(cls, member):
+    return (member == 'with_traceback' or member in ('name', 'value') and
+            isinstance(cls, type) and issubclass(cls, enum.Enum))
+else:
+  _NORMALIZE_TYPE = {"<class 'abc.ABCMeta'>": "<type 'type'>"}
+  _NORMALIZE_ISINSTANCE = {}
+
+  def _SkipMember(cls, member):  # pylint: disable=unused-argument
+    return False
+
+
+def _NormalizeType(ty):
+  return _NORMALIZE_TYPE.get(ty, ty)
+
+
+def _NormalizeIsInstance(ty):
+  return _NORMALIZE_ISINSTANCE.get(ty, ty)
+
 
 def _SanitizedArgSpec(obj):
   """Get an ArgSpec string that is free of addresses.
@@ -91,7 +144,7 @@ def _SanitizedMRO(obj):
     if cls.__name__ == '_NewClass':
       # Ignore class created by @deprecated_alias decorator.
       continue
-    str_repr = str(cls)
+    str_repr = _NormalizeType(str(cls))
     return_list.append(str_repr)
     if 'tensorflow' not in str_repr:
       break
@@ -130,6 +183,8 @@ class PythonObjectToProtoVisitor(object):
     def _AddMember(member_name, member_obj, proto):
       """Add the child object to the object being constructed."""
       _, member_obj = tf_decorator.unwrap(member_obj)
+      if _SkipMember(parent, member_name):
+        return
       if member_name == '__init__' or not member_name.startswith('_'):
         if tf_inspect.isroutine(member_obj):
           new_method = proto.member_method.add()
@@ -137,12 +192,12 @@ class PythonObjectToProtoVisitor(object):
           # If member_obj is a python builtin, there is no way to get its
           # argspec, because it is implemented on the C side. It also has no
           # func_code.
-          if getattr(member_obj, 'func_code', None):
+          if hasattr(member_obj, '__code__'):
             new_method.argspec = _SanitizedArgSpec(member_obj)
         else:
           new_member = proto.member.add()
           new_member.name = member_name
-          new_member.mtype = str(type(member_obj))
+          new_member.mtype = _NormalizeType(str(type(member_obj)))
 
     parent_corner_cases = _CORNER_CASES.get(path, {})
 
@@ -172,7 +227,8 @@ class PythonObjectToProtoVisitor(object):
       elif tf_inspect.isclass(parent):
         # Construct a class.
         class_obj = api_objects_pb2.TFAPIClass()
-        class_obj.is_instance.extend(_SanitizedMRO(parent))
+        class_obj.is_instance.extend(
+            _NormalizeIsInstance(i) for i in _SanitizedMRO(parent))
         for name, child in children:
           if name in parent_corner_cases:
             # If we have an empty entry, skip this object.
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 6487a6267e54999608732b668cc14a7e6c50535f..723fceef413d86675e885debd37e73e5facd7f7c 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -31,15 +31,15 @@ import argparse
 import os
 import re
 import sys
-import unittest
 
 import tensorflow as tf
-from tensorflow._api import v2 as tf_v2
+from tensorflow._api.v2 import v2 as tf_v2
 
 from google.protobuf import message
 from google.protobuf import text_format
 
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
@@ -83,6 +83,7 @@ def _KeyToFilePath(key, api_version):
 
   Filepath will be inside golden folder for api_version.
   """
+
   def _ReplaceCapsWithDash(matchobj):
     match = matchobj.group(0)
     return '-%s' % (match.lower())
@@ -95,22 +96,22 @@ def _KeyToFilePath(key, api_version):
 
 def _FileNameToKey(filename):
   """From a given filename, construct a key we use for api objects."""
+
   def _ReplaceDashWithCaps(matchobj):
     match = matchobj.group(0)
     return match[1].upper()
 
   base_filename = os.path.basename(filename)
   base_filename_without_ext = os.path.splitext(base_filename)[0]
-  api_object_key = re.sub(
-      '((-[a-z]){1})', _ReplaceDashWithCaps, base_filename_without_ext)
+  api_object_key = re.sub('((-[a-z]){1})', _ReplaceDashWithCaps,
+                          base_filename_without_ext)
   return api_object_key
 
 
 def _VerifyNoSubclassOfMessageVisitor(path, parent, unused_children):
   """A Visitor that crashes on subclasses of generated proto classes."""
   # If the traversed object is a proto Message class
-  if not (isinstance(parent, type) and
-          issubclass(parent, message.Message)):
+  if not (isinstance(parent, type) and issubclass(parent, message.Message)):
     return
   if parent is message.Message:
     return
@@ -124,11 +125,11 @@ def _VerifyNoSubclassOfMessageVisitor(path, parent, unused_children):
 def _FilterNonCoreGoldenFiles(golden_file_list):
   """Filter out non-core API pbtxt files."""
   filtered_file_list = []
-  filtered_package_prefixes = [
-      'tensorflow.%s.' % p for p in _NON_CORE_PACKAGES]
+  filtered_package_prefixes = ['tensorflow.%s.' % p for p in _NON_CORE_PACKAGES]
   for f in golden_file_list:
-    if any([f.rsplit('/')[-1].startswith(pre)
-            for pre in filtered_package_prefixes]):
+    if any(
+        f.rsplit('/')[-1].startswith(pre) for pre in filtered_package_prefixes
+    ):
       continue
     filtered_file_list.append(f)
   return filtered_file_list
@@ -140,14 +141,12 @@ class ApiCompatibilityTest(test.TestCase):
     super(ApiCompatibilityTest, self).__init__(*args, **kwargs)
 
     golden_update_warning_filename = os.path.join(
-        resource_loader.get_root_dir_with_all_resources(),
-        _UPDATE_WARNING_FILE)
+        resource_loader.get_root_dir_with_all_resources(), _UPDATE_WARNING_FILE)
     self._update_golden_warning = file_io.read_file_to_string(
         golden_update_warning_filename)
 
     test_readme_filename = os.path.join(
-        resource_loader.get_root_dir_with_all_resources(),
-        _TEST_README_FILE)
+        resource_loader.get_root_dir_with_all_resources(), _TEST_README_FILE)
     self._test_readme_message = file_io.read_file_to_string(
         test_readme_filename)
 
@@ -161,15 +160,14 @@ class ApiCompatibilityTest(test.TestCase):
     """Diff given dicts of protobufs and report differences a readable way.
 
     Args:
-      expected_dict: a dict of TFAPIObject protos constructed from golden
-          files.
+      expected_dict: a dict of TFAPIObject protos constructed from golden files.
       actual_dict: a ict of TFAPIObject protos constructed by reading from the
-          TF package linked to the test.
+        TF package linked to the test.
       verbose: Whether to log the full diffs, or simply report which files were
-          different.
+        different.
       update_goldens: Whether to update goldens when there are diffs found.
       additional_missing_object_message: Message to print when a symbol is
-          missing.
+        missing.
       api_version: TensorFlow API version to test.
     """
     diffs = []
@@ -257,8 +255,7 @@ class ApiCompatibilityTest(test.TestCase):
     visitor = public_api.PublicAPIVisitor(_VerifyNoSubclassOfMessageVisitor)
     visitor.do_not_descend_map['tf'].append('contrib')
     if FLAGS.only_test_core_api:
-      visitor.do_not_descend_map['tf'].extend(
-          _NON_CORE_PACKAGES)
+      visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
     traverse.traverse(tf_v2.compat.v1, visitor)
 
   def testNoSubclassOfMessageV2(self):
@@ -267,23 +264,25 @@ class ApiCompatibilityTest(test.TestCase):
     visitor = public_api.PublicAPIVisitor(_VerifyNoSubclassOfMessageVisitor)
     visitor.do_not_descend_map['tf'].append('contrib')
     if FLAGS.only_test_core_api:
-      visitor.do_not_descend_map['tf'].extend(
-          _NON_CORE_PACKAGES)
+      visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
     traverse.traverse(tf_v2, visitor)
 
-  def _checkBackwardsCompatibility(
-      self, root, golden_file_pattern, api_version,
-      additional_private_map=None):
+  def _checkBackwardsCompatibility(self,
+                                   root,
+                                   golden_file_pattern,
+                                   api_version,
+                                   additional_private_map=None):
     # Extract all API stuff.
     visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
 
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
-    public_api_visitor.do_not_descend_map['tf'].append('contrib')
-    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = [
-        'Experimental']
+    public_api_visitor.private_map['tf'] = ['contrib']
+    if api_version == 2:
+      public_api_visitor.private_map['tf'].append('enable_v2_behavior')
+
+    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
     if FLAGS.only_test_core_api:
-      public_api_visitor.do_not_descend_map['tf'].extend(
-          _NON_CORE_PACKAGES)
+      public_api_visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
     if additional_private_map:
       public_api_visitor.private_map.update(additional_private_map)
 
@@ -315,9 +314,7 @@ class ApiCompatibilityTest(test.TestCase):
         update_goldens=FLAGS.update_goldens,
         api_version=api_version)
 
-  @unittest.skipUnless(
-      sys.version_info.major == 2,
-      'API compabitility test goldens are generated using python2.')
+  @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibility(self):
     api_version = 1
     golden_file_pattern = os.path.join(
@@ -331,27 +328,29 @@ class ApiCompatibilityTest(test.TestCase):
         # in separate tests.
         additional_private_map={'tf.compat': ['v1', 'v2']})
 
-  @unittest.skipUnless(
-      sys.version_info.major == 2,
-      'API compabitility test goldens are generated using python2.')
+    # Also check that V1 API has contrib
+    self.assertTrue(
+        'tensorflow.python.util.lazy_loader.LazyLoader'
+        in str(type(tf.contrib)))
+
+  @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibilityV1(self):
     api_version = 1
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
-    self._checkBackwardsCompatibility(
-        tf_v2.compat.v1, golden_file_pattern, api_version)
+    self._checkBackwardsCompatibility(tf_v2.compat.v1, golden_file_pattern,
+                                      api_version)
 
-  @unittest.skipUnless(
-      sys.version_info.major == 2,
-      'API compabitility test goldens are generated using python2.')
   def testAPIBackwardsCompatibilityV2(self):
     api_version = 2
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
     self._checkBackwardsCompatibility(
-        tf_v2, golden_file_pattern, api_version,
+        tf_v2,
+        golden_file_pattern,
+        api_version,
         additional_private_map={'tf.compat': ['v1']})
 
 
@@ -362,7 +361,9 @@ if __name__ == '__main__':
   # TODO(mikecase): Create Estimator's own API compatibility test or
   # a more general API compatibility test for use for TF components.
   parser.add_argument(
-      '--only_test_core_api', type=bool, default=False,
+      '--only_test_core_api',
+      type=bool,
+      default=False,
       help=_ONLY_TEST_CORE_API_HELP)
   parser.add_argument(
       '--verbose_diffs', type=bool, default=True, help=_VERBOSE_DIFFS_HELP)
diff --git a/tensorflow/tools/build_info/BUILD b/tensorflow/tools/build_info/BUILD
index 730741780550bfe3fbccd7e62f5f7d9788f0a9a9..680e68b0b9b22f1e8547f52b3eac12f6c6384d6e 100644
--- a/tensorflow/tools/build_info/BUILD
+++ b/tensorflow/tools/build_info/BUILD
@@ -4,8 +4,8 @@ package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(
-    glob(["gen/*"]) + [
-        "gen_build_info.py",
-    ],
+py_binary(
+    name = "gen_build_info",
+    srcs = ["gen_build_info.py"],
+    srcs_version = "PY2AND3",
 )
diff --git a/tensorflow/tools/build_info/gen_build_info.py b/tensorflow/tools/build_info/gen_build_info.py
index 690214abfb155c083ef197e583afbafe9a3381f8..9ebd168d78141c5839eab58aa3729efd746e6ac8 100755
--- a/tensorflow/tools/build_info/gen_build_info.py
+++ b/tensorflow/tools/build_info/gen_build_info.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/tensorflow/tools/ci_build/Dockerfile.android b/tensorflow/tools/ci_build/Dockerfile.android
index dcf077791a9752f2e22999b082a9805bb3775c8d..7e72eb0cbfa99bbb11e90837b1d4cfec0d64a0b2 100644
--- a/tensorflow/tools/ci_build/Dockerfile.android
+++ b/tensorflow/tools/ci_build/Dockerfile.android
@@ -45,9 +45,14 @@ ENV ANDROID_NDK_FILENAME android-ndk-r14b-linux-x86_64.zip
 ENV ANDROID_NDK_URL https://dl.google.com/android/repository/${ANDROID_NDK_FILENAME}
 ENV ANDROID_NDK_HOME ${ANDROID_DEV_HOME}/ndk
 ENV PATH ${PATH}:${ANDROID_NDK_HOME}
+# Workaround for b/117156972: inject missing #include into NDK versions of
+# futex.h.
 RUN cd ${ANDROID_DEV_HOME} && \
     wget -q ${ANDROID_NDK_URL} && \
     unzip ${ANDROID_NDK_FILENAME} -d ${ANDROID_DEV_HOME} && \
+    sed -i 15i"#include <linux/compiler.h>" ${ANDROID_DEV_HOME}/android-ndk-r14b/platforms/android-14/arch-arm/usr/include/linux/futex.h && \
+    sed -i 15i"#include <linux/compiler.h>" ${ANDROID_DEV_HOME}/android-ndk-r14b/platforms/android-14/arch-mips/usr/include/linux/futex.h && \
+    sed -i 15i"#include <linux/compiler.h>" ${ANDROID_DEV_HOME}/android-ndk-r14b/platforms/android-14/arch-x86/usr/include/linux/futex.h && \
     rm ${ANDROID_NDK_FILENAME} && \
     bash -c "ln -s ${ANDROID_DEV_HOME}/android-ndk-* ${ANDROID_NDK_HOME}"
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.custom_op b/tensorflow/tools/ci_build/Dockerfile.custom_op
new file mode 100644
index 0000000000000000000000000000000000000000..4493b88348bef8e6f6ce80ed3d104d37b07c8268
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.custom_op
@@ -0,0 +1,19 @@
+FROM ubuntu:14.04
+
+LABEL maintainer="Yifei Feng <yifeif@google.com>"
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa && \
+    add-apt-repository -y ppa:george-edison55/cmake-3.x
+RUN /install/install_deb_packages.sh
+RUN /install/install_pip_packages.sh
+RUN /install/install_bazel.sh
+RUN /install/install_proto3.sh
+RUN /install/install_buildifier.sh
+RUN /install/install_auditwheel.sh
+RUN /install/install_golang.sh
+
+# Set up the master bazelrc configuration file.
+COPY install/.bazelrc /etc/bazel.bazelrc
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le
index 0a55b84ac4886ff76e72fc90116ea957e01066ee..b69588724cb3566b6a014af2767cb9a321d7ec10 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu.ppc64le
@@ -28,6 +28,3 @@ ENV TF_NEED_CUDA 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0
 ENV TF_CUDA_VERSION 9.2
 ENV CUDA_TOOLKIT_PATH /usr/local/cuda-9.2
-
-# TODO get NCCL 2 in the docker image
-ENV TF_NCCL_VERSION 1
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
new file mode 100644
index 0000000000000000000000000000000000000000..03de89b7176b702cf8fdee84bb4372002ad94707
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
@@ -0,0 +1,75 @@
+# To push a new version, run:
+# $ docker build -f Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 \
+#       --tag "gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04" .
+# $ docker push gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04
+
+FROM ubuntu:14.04
+LABEL maintainer="Manuel Klimek <klimek@google.com>"
+
+RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
+    rm -rf /var/lib/apt/lists/* && \
+    NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
+    NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
+    apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +2 > cudasign.pub && \
+    echo "$NVIDIA_GPGKEY_SUM  cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
+    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
+    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
+
+ENV CUDA_VERSION 10.0.130
+ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1
+ENV CUDNN_VERSION 7.3.1.20
+ENV NCCL_VERSION 2.3.5
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
+ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0,driver>=410"
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV PATH /usr/local/cuda/bin:${PATH}
+
+# TODO(b/110903506): /usr/loca/cuda/lib64/stubs should not be needed in
+# LD_LIBRARY_PATH. The stubs/libcuda.so is not meant to used at runtime. The
+# correct way to pass the path to bfd-ld is to pass
+# -Wl,-rpath-link=/usr/local/cuda/lib64/stubs to all binaries transitively
+# depending on libcuda. Optimally, builds targeting cuda would do that
+# internally.
+ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs
+
+LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        cuda-command-line-tools-$CUDA_PKG_VERSION \
+        cuda-compat-10-0=410.48-1 \
+        cuda-cudart-$CUDA_PKG_VERSION \
+        cuda-libraries-$CUDA_PKG_VERSION \
+        cuda-libraries-dev-$CUDA_PKG_VERSION \
+        cuda-minimal-build-$CUDA_PKG_VERSION \
+        cuda-nvml-dev-$CUDA_PKG_VERSION \
+        cuda-nvtx-$CUDA_PKG_VERSION \
+        libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
+        libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
+        libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 \
+        libnccl2=$NCCL_VERSION-2+cuda10.0 \
+        libnccl-dev=$NCCL_VERSION-2+cuda10.0 && \
+    ln -s cuda-10.0 /usr/local/cuda && \
+    apt-mark hold libcudnn7 && \
+    apt-mark hold libnccl2 && \
+    rm -rf /var/lib/apt/lists/*
+
+# TODO(b/110903506): Provide a link to the SONAME of libcuda.so.
+# https://github.com/NVIDIA/nvidia-docker/issues/775
+RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+# TODO(klimek): Once the TODO in tensorflow's configure.py to correctly find
+# libnccl is resolved, delete this block.
+RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
+ && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+ARG DEBIAN_FRONTEND=noninteractive
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa && \
+    add-apt-repository -y ppa:george-edison55/cmake-3.x
+RUN /install/install_deb_packages.sh
+RUN /install/install_pip_packages.sh
+RUN /install/install_golang.sh
+
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
index dd8d705331a58af8ec8cd4474bbedce47bba727f..eb6ca7c8f0fe27bd8bb9e5b11cf14e98ad67e530 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
@@ -6,7 +6,7 @@
 # TODO(klimek): Include clang in this image so we can also target clang
 # builds.
 
-FROM ubuntu:14.04
+FROM gcr.io/clang-docker-builder/clang-ubuntu14_04
 LABEL maintainer="Manuel Klimek <klimek@google.com>"
 
 RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
@@ -71,6 +71,15 @@ RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
  && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
 
+# Install a newer version of libstdc++, as new clang versions do not work
+# with the stock ubuntu 14.04 libstdc++.
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:ubuntu-toolchain-r/test -y && \
+    apt-get update && \
+    apt-get install -y libstdc++-7-dev && \
+    rm -rf /var/lib/apt/lists/*
+
 # Copy and run the install scripts.
 COPY install/*.sh /install/
 ARG DEBIAN_FRONTEND=noninteractive
diff --git a/tensorflow/tools/ci_build/builds/configured b/tensorflow/tools/ci_build/builds/configured
index 3eee11fd7e0f310261191099412d00c26ee6021f..f8a9311918f58f22de35a96897ff208f1a580516 100755
--- a/tensorflow/tools/ci_build/builds/configured
+++ b/tensorflow/tools/ci_build/builds/configured
@@ -33,7 +33,7 @@ COMMAND=("$@")
 export CI_BUILD_PYTHON="${CI_BUILD_PYTHON:-python}"
 export PYTHON_BIN_PATH="${PYTHON_BIN_PATH:-$(which ${CI_BUILD_PYTHON})}"
 # XLA currently does not build under Android, so disable it for now.
-if [[ "${CONTAINER_TYPE}" -eq 'android' ]]; then
+if [[ "${CONTAINER_TYPE}" == 'android' ]]; then
   export TF_ENABLE_XLA=0
 fi
 
diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
index 9b3ff0cba7dcacc0f68a417299c31f7a0f413430..44abcc309b9ff238059d6f298c42c7edb3fecd32 100755
--- a/tensorflow/tools/ci_build/builds/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -55,6 +55,7 @@ function build_libtensorflow_tarball() {
   export CC_OPT_FLAGS='-mavx'
   if [ "${TF_NEED_CUDA}" == "1" ]; then
     BAZEL_OPTS="${BAZEL_OPTS} --config=cuda"
+    export TF_NEED_ROCM=0
   fi
   bazel clean --expunge
   yes "" | ./configure
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 654377902262fe3d7dd096e5d49a541d2eecb378..d1fad98ed7e18d6558e1c3161e66bd31990064a7 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -321,6 +321,12 @@ create_activate_virtualenv_and_install_tensorflow() {
   # some versions in python
   curl https://bootstrap.pypa.io/get-pip.py | python
 
+  # Force upgrade of setuptools. This must happen before the pip install of the
+  # WHL_PATH, which pulls in absl-py, which uses install_requires notation
+  # introduced in setuptools >=20.5. The default version of setuptools is 5.5.1,
+  # which is too old for absl-py.
+  pip install --upgrade setuptools==39.1.0
+
   # Force tensorflow reinstallation. Otherwise it may not get installed from
   # last build if it had the same version number as previous build.
   PIP_FLAGS="--upgrade --force-reinstall"
@@ -328,9 +334,11 @@ create_activate_virtualenv_and_install_tensorflow() {
     die "pip install (forcing to reinstall tensorflow) FAILED"
   echo "Successfully installed pip package ${TF_WHEEL_PATH}"
 
-  # Force downgrade setuptools.
+  # Force downgrade of setuptools. This must happen after the pip install of the
+  # WHL_PATH, which ends up upgrading to the latest version of setuptools.
+  # Versions of setuptools >= 39.1.0 will cause tests to fail like this:
+  #   ImportError: cannot import name py31compat
   pip install --upgrade setuptools==39.1.0
-
 }
 
 ################################################################################
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index fdff867ff07f352c4e220f12ad8aa4916b56d09a..435ec7ca68fc28362b9b546f977b24e003e55d2f 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -128,8 +128,9 @@ NO_DOCKER_OPT_FLAG="--genrule_strategy=standalone"
 
 DO_DOCKER=1
 
-# Bazel uses defaults for all test sizes when given `-1`.
-TF_BUILD_TEST_TIMEOUT=${TF_BUILD_TEST_TIMEOUT:--1}
+# Default values for various settings.
+TF_BUILD_TEST_TIMEOUT=${TF_BUILD_TEST_TIMEOUT:--1}  # Use bazel defaults
+TF_GPU_COUNT=${TF_GPU_COUNT:-4}
 
 # Helpful flags:
 # --test_summary=detailed: Tell us more about which targets are being built
@@ -144,13 +145,28 @@ TF_BUILD_TEST_TIMEOUT=${TF_BUILD_TEST_TIMEOUT:--1}
 BAZEL_TEST_FLAGS=""\
 "--test_summary=detailed --build_tests_only --keep_going "\
 "--test_timeout=${TF_BUILD_TEST_TIMEOUT} "\
-"--test_env=TF_GPU_COUNT=${TF_GPU_COUNT} "\
-"--test_env=TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU} "\
+"--test_env=TF_GPU_COUNT=${TF_GPU_COUNT}"
+
+# Only set these environment variables if they're specified, to avoid causing
+# problems like b/118404869, where an envvar set to the empty string has
+# different semantics from an unset envvar.
+if [ -n "${TF_TESTS_PER_GPU}" ]; then
+  BAZEL_TEST_FLAGS="${BAZEL_TEST_FLAGS} "\
+"--test_env=TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU}"
+fi
+if [ -n "${TF_PER_DEVICE_MEMORY_LIMIT_MB}" ]; then
+  BAZEL_TEST_FLAGS="${BAZEL_TEST_FLAGS} "\
 "--test_env=TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB}"
+fi
+
 BAZEL_BUILD_FLAGS="--keep_going"
 
-BAZEL_CMD="bazel test ${BAZEL_TEST_FLAGS}"
-BAZEL_BUILD_ONLY_CMD="bazel build ${BAZEL_BUILD_FLAGS}"
+# Explicitly set jdk8 since that's what's installed in our images. Note that
+# bazel 0.16 and higher defaults to jdk9, which causes failures. See b/117634064
+BAZEL_JAVA_FLAGS="--java_toolchain=@bazel_tools//tools/jdk:toolchain_hostjdk8"
+
+BAZEL_CMD="bazel test ${BAZEL_TEST_FLAGS} ${BAZEL_JAVA_FLAGS}"
+BAZEL_BUILD_ONLY_CMD="bazel build ${BAZEL_BUILD_FLAGS} ${BAZEL_JAVA_FLAGS}"
 BAZEL_CLEAN_CMD="bazel clean"
 
 PIP_CMD="${CI_BUILD_DIR}/builds/pip.sh"
@@ -159,7 +175,6 @@ PIP_INTEGRATION_TESTS_FLAG="--integration_tests"
 ANDROID_CMD="${CI_BUILD_DIR}/builds/android.sh"
 ANDROID_FULL_CMD="${CI_BUILD_DIR}/builds/android_full.sh"
 
-TF_GPU_COUNT=${TF_GPU_COUNT:-4}
 PARALLEL_GPU_TEST_CMD='//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute'
 
 BENCHMARK_CMD="${CI_BUILD_DIR}/builds/benchmark.sh"
@@ -423,7 +438,7 @@ if [[ ${TF_BUILD_IS_PIP} == "no_pip" ]] ||
      [[ ${CTYPE} == "debian.jessie.cpu" ]]; then
     # CPU only command, fully parallel.
     NO_PIP_MAIN_CMD="${MAIN_CMD} ${BAZEL_CMD} ${OPT_FLAG} "\
-      "${EXTRA_ARGS} -- ${BAZEL_TARGET}"
+"${EXTRA_ARGS} -- ${BAZEL_TARGET}"
   elif [[ ${CTYPE} == gpu* ]]; then
     # GPU only command, run as many jobs as the GPU count only.
     NO_PIP_MAIN_CMD="${BAZEL_CMD} ${OPT_FLAG} "\
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index a98c15d961f7fb3b6e546de895e7cec26f1089d9..2c348a0e3390af05cffff5d9a73d0bd57caa92b4 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -104,7 +104,8 @@ do_pylint() {
 "^tensorflow/python/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
 "^tensorflow/python/keras/engine/base_layer.py.*\[E0203.*access-member-before-definition "\
 "^tensorflow/python/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition "\
-"^tensorflow/python/kernel_tests/constant_op_eager_test.py.*\[E0303.*invalid-length-returned"
+"^tensorflow/python/kernel_tests/constant_op_eager_test.py.*\[E0303.*invalid-length-returned "\
+"^tensorflow/python/keras/utils/data_utils.py.*\[E1102.*not-callable"
 
   echo "ERROR_WHITELIST=\"${ERROR_WHITELIST}\""
 
@@ -323,9 +324,12 @@ do_external_licenses_check(){
   LICENSES_FILE="$(mktemp)_licenses.log"
   MISSING_LICENSES_FILE="$(mktemp)_missing_licenses.log"
   EXTRA_LICENSES_FILE="$(mktemp)_extra_licenses.log"
+  TMP_FILE="$(mktemp)_tmp.log"
 
   echo "Getting external dependencies for ${BUILD_TARGET}"
- bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --keep_going \
+ bazel query "attr('licenses', 'notice', deps(${BUILD_TARGET}))" --keep_going > "${TMP_FILE}" 2>&1
+ cat "${TMP_FILE}" \
+  | grep -e "^\/\/" -e "^@" \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -334,7 +338,9 @@ do_external_licenses_check(){
 
   echo
   echo "Getting list of external licenses mentioned in ${LICENSES_TARGET}."
-  bazel query "deps(${LICENSES_TARGET})" --keep_going \
+  bazel query "deps(${LICENSES_TARGET})" --keep_going > "${TMP_FILE}" 2>&1
+ cat "${TMP_FILE}" \
+  | grep -e "^\/\/" -e "^@" \
   | grep -E -v "^//tensorflow" \
   | sed -e 's|:.*||' \
   | sort \
@@ -433,9 +439,9 @@ cmd_status(){
 # out by default in TF WORKSPACE file.
 do_bazel_nobuild() {
   BUILD_TARGET="//tensorflow/..."
-  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/java/demo/app/..."
-  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/examples/android/..."
-  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/schema/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/lite/java/demo/app/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/lite/examples/android/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/lite/schema/..."
   BUILD_CMD="bazel build --nobuild ${BAZEL_FLAGS} -- ${BUILD_TARGET}"
 
   ${BUILD_CMD}
@@ -524,11 +530,6 @@ do_check_load_py_test() {
   python check_load_py_test.py
 }
 
-do_cmake_python_sanity() {
-  cd "$ROOT_DIR/tensorflow/contrib/cmake"
-  python -m unittest -v python_sanity_test
-}
-
 do_check_futures_test() {
   cd "$ROOT_DIR/tensorflow/tools/test"
   python check_futures_test.py
@@ -540,8 +541,8 @@ do_check_file_name_test() {
 }
 
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_cmake_python_sanity" "do_check_file_name_test")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Test entries in /tensorflow/contrib/cmake/python_{modules|protos|protos_cc}.txt for validity and consistency" "Check file names for cases")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_check_file_name_test")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Check file names for cases")
 
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS=""
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 179fc42d60dc0ff25f2b61a721728e351a825378..989f2a92eb6e5940b0557452080c3b0f3cf706ae 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -38,6 +38,7 @@ if [[ "$ubuntu_version" == "14" ]]; then
   apt-get dist-upgrade -y
 fi
 
+## TODO(yifeif) remove ffmpeg once ffmpeg is removed from contrib
 apt-get install -y --no-install-recommends \
     autoconf \
     automake \
@@ -60,9 +61,9 @@ apt-get install -y --no-install-recommends \
     python3-setuptools \
     rsync \
     sudo \
-    subversion \
     swig \
     unzip \
+    vim \
     wget \
     zip \
     zlib1g-dev
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 329d05342a013f286cd535c8d76a69a990f45619..3470488cc55d3ec54af3546d33f4d1f8fc5e94d6 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -18,8 +18,10 @@ set -e
 
 # We don't apt-get install so that we can install a newer version of pip.
 # Only needed for Ubuntu 14.04 and 16.04; not needed for 18.04 and Debian 8,9?
-easy_install -U pip==9.0.3
+# Run easy_install before easy_install3, so that the default pip points to pip2,
+# to match the default python version of 2.7.
 easy_install3 -U pip==9.0.3
+easy_install -U pip==9.0.3
 
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
@@ -29,6 +31,11 @@ easy_install3 -U pip==9.0.3
 pip2 install wheel==0.31.1
 pip3 install wheel==0.31.1
 
+# Install last working version of setuptools. This must happen before we install
+# absl-py, which uses install_requires notation introduced in setuptools 20.5.
+pip2 install --upgrade setuptools==39.1.0
+pip3 install --upgrade setuptools==39.1.0
+
 pip2 install virtualenv
 pip3 install virtualenv
 
@@ -112,10 +119,6 @@ pip3 install --upgrade gast
 pip2 install --upgrade termcolor
 pip3 install --upgrade termcolor
 
-# Install last working version of setuptools.
-pip2 install --upgrade setuptools==39.1.0
-pip3 install --upgrade setuptools==39.1.0
-
 # Keras
 pip2 install keras_applications==1.0.6 --no-deps
 pip3 install keras_applications==1.0.6 --no-deps
@@ -127,7 +130,3 @@ pip3 install --upgrade h5py==2.8.0
 # Estimator
 pip2 install tensorflow_estimator --no-deps
 pip3 install tensorflow_estimator --no-deps
-
-# Install last working version of setuptools.
-pip2 install --upgrade setuptools==39.1.0
-pip3 install --upgrade setuptools==39.1.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 61d4fe3fe8b62b61f3ddd50293e02bacfa219854..62e04df717316ffc8cf211a6887730be115623be 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -41,6 +41,10 @@ fi
 set -e
 pip3.5 install --upgrade pip
 
+# Install last working version of setuptools. This must happen before we install
+# absl-py, which uses install_requires notation introduced in setuptools 20.5.
+pip3.5 install --upgrade setuptools==39.1.0
+
 pip3.5 install --upgrade virtualenv
 
 # Install six.
@@ -81,15 +85,9 @@ pip3.5 install --upgrade astor
 pip3.5 install --upgrade gast
 pip3.5 install --upgrade termcolor
 
-# Install last working version of setuptools.
-pip3.5 install --upgrade setuptools==39.1.0
-
 # Keras
 pip3.5 install keras_applications==1.0.6
 pip3.5 install keras_preprocessing==1.0.5
 pip3.5 install --upgrade h5py==2.8.0
 
-# Install last working version of setuptools.
-pip3.5 install --upgrade setuptools==39.1.0
-
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 8949af8a885116a4ca6f3bc7d81949fe920566b8..48d556b1dd8e3e17b763b9c71e78e1d551554703 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -51,6 +51,10 @@ ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
 
 pip3 install --upgrade pip
 
+# Install last working version of setuptools. This must happen before we install
+# absl-py, which uses install_requires notation introduced in setuptools 20.5.
+pip3 install --upgrade setuptools==39.1.0
+
 pip3 install --upgrade virtualenv
 
 set -e
@@ -97,8 +101,6 @@ pip3 install --upgrade astor
 pip3 install --upgrade gast
 pip3 install --upgrade termcolor
 
-# Install last working version of setuptools.
-pip3 install --upgrade setuptools==39.1.0
 pip3 install --upgrade h5py==2.8.0
 
 # Keras
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
index 5c5a36139f50e85e70ce4bff5ca8054f7570b0f5..3efd994d783d8f47b3471cc5ce177293b1e017cc 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
@@ -35,4 +35,4 @@ bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium --config=opt \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
-    //tensorflow/contrib/... -//tensorflow/contrib/lite/...
+    //tensorflow/contrib/... -//tensorflow/lite/...
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 177ef390dbd2f27a34f7a4e230f682b92648ca84..62e1eaa366865616c063d9f9785b863033a32706 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -58,6 +58,8 @@ PY_TEST_DIR="py_test_dir"
 SKIP_TEST=0
 RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
+PROJECT_NAME=""
+EXTRA_BUILD_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -65,16 +67,32 @@ TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 #                        ensure performance
 # --test_core_only       Use tensorflow/python/... as test target
 # --test_contrib_only    Use tensorflow/contrib/... as test target
-for ARG in "$@"; do
-  case "$ARG" in
+#for ARG in "$@"; do
+while [[ $# -gt 0 ]]; do
+  case "$1" in
     --tf_nightly) TF_NIGHTLY=1 ;;
     --skip_test) SKIP_TEST=1 ;;
     --enable_remote_cache) set_remote_cache_options ;;
     --release_build) RELEASE_BUILD=1 ;;
     --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
     --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+    --extra_build_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_BUILD_FLAGS="$1"
+      ;;
+    --project_name)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      PROJECT_NAME="$1"
+      ;;
     *)
   esac
+  shift
 done
 
 if [[ "$RELEASE_BUILD" == 1 ]]; then
@@ -88,7 +106,11 @@ fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   python tensorflow/tools/ci_build/update_version.py --nightly
-  EXTRA_PIP_FLAG="--nightly_flag"
+  if [ -z ${PROJECT_NAME} ]; then
+    EXTRA_PIP_FLAGS="--nightly_flag"
+  else
+    EXTRA_PIP_FLAGS="--project_name ${PROJECT_NAME} --nightly_flag"
+  fi
 fi
 
 # Enable short object file path to avoid long path issue on Windows.
@@ -100,7 +122,9 @@ fi
 
 run_configure_for_cpu_build
 
-bazel build --announce_rc --config=opt tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build --announce_rc --config=opt ${EXTRA_BUILD_FLAGS} \
+  tensorflow/tools/pip_package:build_pip_package \
+  --incompatible_remove_native_http_archive=false || exit $?
 
 if [[ "$SKIP_TEST" == 1 ]]; then
   exit 0
@@ -109,7 +133,7 @@ fi
 # Create a python test directory to avoid package name conflict
 create_python_test_dir "${PY_TEST_DIR}"
 
-./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" "${EXTRA_PIP_FLAG}"
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" "${EXTRA_PIP_FLAGS}"
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   exit 0
@@ -126,8 +150,8 @@ N_JOBS="${NUMBER_OF_PROCESSORS}"
 # which will result testing system installed tensorflow
 bazel test --announce_rc --config=opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
-  --test_tag_filters=-no_pip,-no_windows,-no_oss \
-  --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
+  --test_tag_filters=-no_pip,-no_windows,-no_oss,-gpu \
+  --build_tag_filters=-no_pip,-no_windows,-no_oss,-gpu --build_tests_only \
   --test_size_filters=small,medium \
   --jobs="${N_JOBS}" --test_timeout="300,450,1200,3600" \
   --flaky_test_attempts=3 \
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 34847e637a2ddf5a1a058093608091c0304dd900..acafd9ebce3afa634c1a1aafd4d9ac5c57935d80 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -58,6 +58,8 @@ PY_TEST_DIR="py_test_dir"
 SKIP_TEST=0
 RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
+PROJECT_NAME=""
+EXTRA_BUILD_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -65,7 +67,7 @@ TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 #                        ensure performance
 # --test_core_only       Use tensorflow/python/... as test target
 # --test_contrib_only    Use tensorflow/contrib/... as test target
-for ARG in "$@"; do
+while [[ $# -gt 0 ]]; do
   case "$ARG" in
     --tf_nightly) TF_NIGHTLY=1 ;;
     --skip_test) SKIP_TEST=1 ;;
@@ -73,8 +75,23 @@ for ARG in "$@"; do
     --release_build) RELEASE_BUILD=1 ;;
     --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
     --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+    --extra_build_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_BUILD_FLAGS="$1"
+      ;;
+    --project_name)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      PROJECT_NAME="$1"
+      ;;
     *)
   esac
+  shift
 done
 
 if [[ "$RELEASE_BUILD" == 1 ]]; then
@@ -88,7 +105,11 @@ fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   python tensorflow/tools/ci_build/update_version.py --nightly
-  EXTRA_PIP_FLAG="--nightly_flag"
+  if [ -z ${PROJECT_NAME} ]; then
+    EXTRA_PIP_FLAGS="--nightly_flag"
+  else
+    EXTRA_PIP_FLAGS="--project_name=${PROJECT_NAME} --nightly_flag"
+  fi
 fi
 
 # Enable short object file path to avoid long path issue on Windows.
@@ -103,7 +124,9 @@ fi
 
 run_configure_for_gpu_build
 
-bazel build --announce_rc --config=opt tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build --announce_rc --config=opt --define=no_tensorflow_py_deps=true \
+  ${EXTRA_BUILD_FLAGS} \
+  tensorflow/tools/pip_package:build_pip_package || exit $?
 
 if [[ "$SKIP_TEST" == 1 ]]; then
   exit 0
@@ -112,7 +135,8 @@ fi
 # Create a python test directory to avoid package name conflict
 create_python_test_dir "${PY_TEST_DIR}"
 
-./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" --gpu "${EXTRA_PIP_FLAG}"
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" \
+  --gpu "${EXTRA_PIP_FLAGS}"
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   exit 0
diff --git a/tensorflow/tools/common/traverse.py b/tensorflow/tools/common/traverse.py
index 9607f80686df5c9d9d23f32ffd8ae9b550356736..b121a87062f6bd54b91fb5fa8efd3b8e58a426bc 100644
--- a/tensorflow/tools/common/traverse.py
+++ b/tensorflow/tools/common/traverse.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import enum
 import sys
 
 from tensorflow.python.util import tf_inspect
@@ -34,6 +35,13 @@ def _traverse_internal(root, visit, stack, path):
 
   try:
     children = tf_inspect.getmembers(root)
+
+    # Add labels for duplicate values in Enum.
+    if tf_inspect.isclass(root) and issubclass(root, enum.Enum):
+      for enum_member in root.__members__.items():
+        if enum_member not in children:
+          children.append(enum_member)
+      children = sorted(children)
   except ImportError:
     # On some Python installations, some modules do not support enumerating
     # members (six in particular), leading to import errors.
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 55792c51fe87f0ded92730c13409169f6c67d035..a9902d77f5ec103fe2000a4a470d425e3998f45e 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -14,6 +14,18 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
+py_test(
+    name = "ast_edits_test",
+    srcs = ["ast_edits_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ast_edits",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "@six_archive//:six",
+    ],
+)
+
 py_binary(
     name = "tf_upgrade",
     srcs = ["tf_upgrade.py"],
@@ -33,14 +45,38 @@ py_test(
     ],
 )
 
+py_library(
+    name = "renames_v2",
+    srcs = ["renames_v2.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "reorders_v2",
+    srcs = ["reorders_v2.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "tf_upgrade_v2_lib",
+    srcs = ["tf_upgrade_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ast_edits",
+        ":renames_v2",
+        ":reorders_v2",
+    ],
+)
+
 py_binary(
     name = "tf_upgrade_v2",
-    srcs = [
-        "renames_v2.py",
-        "tf_upgrade_v2.py",
-    ],
+    srcs = ["tf_upgrade_v2_main.py"],
+    main = "tf_upgrade_v2_main.py",
     srcs_version = "PY2AND3",
-    deps = [":ast_edits"],
+    deps = [
+        ":ast_edits",
+        ":tf_upgrade_v2_lib",
+    ],
 )
 
 py_test(
@@ -49,8 +85,11 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":tf_upgrade_v2",
+        "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/tools/common:public_api",
+        "//tensorflow/tools/common:traverse",
         "@six_archive//:six",
     ],
 )
@@ -94,18 +133,28 @@ py_test(
 genrule(
     name = "generate_upgraded_file_v2",
     testonly = 1,
-    srcs = ["testdata/test_file_v1_10.py"],
+    srcs = ["testdata/test_file_v1_12.py"],
     outs = [
         "test_file_v2_0.py",
         "report_v2.txt",
     ],
     cmd = ("$(location :tf_upgrade_v2)" +
-           " --infile $(location testdata/test_file_v1_10.py)" +
+           " --infile $(location testdata/test_file_v1_12.py)" +
            " --outfile $(location test_file_v2_0.py)" +
            " --reportfile $(location report_v2.txt)"),
     tools = [":tf_upgrade_v2"],
 )
 
+py_test(
+    name = "test_file_v1_12",
+    size = "small",
+    srcs = ["testdata/test_file_v1_12.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 py_test(
     name = "test_file_v2_0",
     size = "small",
@@ -122,6 +171,6 @@ exports_files(
         "tf_upgrade.py",
         "renames_v2.py",
         "testdata/test_file_v0_11.py",
-        "testdata/test_file_v1_10.py",
+        "testdata/test_file_v1_12.py",
     ],
 )
diff --git a/tensorflow/tools/compatibility/README.md b/tensorflow/tools/compatibility/README.md
index aabc7b253d68eb43d3e6c1d5cecd55697a0cab59..6ff42b1fefe983d2119ddc7841d14d888443b49a 100644
--- a/tensorflow/tools/compatibility/README.md
+++ b/tensorflow/tools/compatibility/README.md
@@ -1,60 +1,77 @@
 # TensorFlow Python API Upgrade Utility
 
 This tool allows you to upgrade your existing TensorFlow Python scripts.
-This script can be run on a single Python file:
+Specifically: \
+`tf_upgrade_v2.py`: upgrades code from TensorFlow 1.12 to TensorFlow 2.0 preview. \
+`tf_upgrade.py`: upgrades code to TensorFlow 1.0 from TensorFlow 0.11.
+
+## Running the script from pip package
+
+First, install TensorFlow pip package. See
+https://www.tensorflow.org/install/pip.
+
+Upgrade script can be run on a single Python file:
 
 ```
-tf_upgrade.py --infile foo.py --outfile foo-upgraded.py
+tf_upgrade_v2 --infile foo.py --outfile foo-upgraded.py
 ```
 
 It will print a list of errors it finds that it can't fix. You can also run
 it on a directory tree:
 
 ```
+# upgrade the .py files and copy all the other files to the outtree
+tf_upgrade_v2 --intree coolcode --outtree coolcode-upgraded
+
 # just upgrade the .py files
-tf_upgrade.py --intree coolcode --outtree coolcode-upgraded
-# after upgrade the .py files, then copy all the other files to the outtree
-tf_upgrade.py --intree coolcode --outtree coolcode-upgraded --copyotherfiles True
+tf_upgrade_v2 --intree coolcode --outtree coolcode-upgraded --copyotherfiles False
 ```
 
-In either case, it will also dump out a report e.g. which will detail changes
+
+## Report
+
+The script will also dump out a report e.g. which will detail changes
 e.g.:
 
 ```
-third_party/tensorflow/tools/compatibility/test_file_v0.11.py Line 125
+'tensorflow/tools/compatibility/testdata/test_file_v1_12.py' Line 65
+--------------------------------------------------------------------------------
+
+Added keyword 'input' to reordered function 'tf.argmax'
+Renamed keyword argument from 'dimension' to 'axis'
 
-Renamed keyword argument from `dim` to `axis`
-Renamed keyword argument from `squeeze_dims` to `axis`
+    Old:         tf.argmax([[1, 3, 2]], dimension=0))
+                                        ~~~~~~~~~~
+    New:         tf.argmax(input=[[1, 3, 2]], axis=0))
 
-    Old:                   [[1, 2, 3]], dim=1), squeeze_dims=[1]).eval(),
-                                        ~~~~    ~~~~~~~~~~~~~
-    New:                   [[1, 2, 3]], axis=1), axis=[1]).eval(),
-                                        ~~~~~    ~~~~~
 ```
 
 ## Caveats
 
 - Don't update parts of your code manually before running this script. In
-particular, functions that have had reordered arguments like `tf.concat`
-or `tf.split` will cause the script to incorrectly add keyword arguments that
-mismap arguments.
+particular, functions that have had reordered arguments like `tf.argmax`
+or `tf.batch_to_space` will cause the script to incorrectly add keyword
+arguments that mismap arguments.
 
 - This script wouldn't actually reorder arguments. Instead, the script will add
 keyword arguments to functions that had their arguments reordered.
 
 - This script is not able to upgrade all functions. One notable example is
-`tf.reverse()` which has been changed to take a list of indices rather than
-a tensor of bools. If the script detects this, it will report this to stdout
+`tf.nn.conv2d` that no longer takes `use_cudnn_on_gpu` argument.
+If the script detects this, it will report this to stdout
 (and in the report), and you can fix it manually. For example if you have
-`tf.reverse(a, [False, True, True])` you will need to manually change it to
-`tf.reverse(a, [1, 2])`.
+`tf.nn.conv2d(inputs, filters, strides, padding, use_cudnn_on_gpu=True)`
+you will need to manually change it to
+`tf.nn.conv2d(input, filters, strides, padding)`.
 
 - There are some syntaxes that are not handleable with this script as this
-script was designed to use only standard python packages. If the script fails
-with "A necessary keyword argument failed to be inserted." or
+script was designed to use only standard python packages.
+There is an alternative available for TensorFlow 0.* to 1.0 upgrade script.
+If the script fails with "A necessary keyword argument failed to be inserted." or
 "Failed to find keyword lexicographically. Fix manually.", you can try
 [@machrisaa's fork of this script](https://github.com/machrisaa/tf0to1).
 [@machrisaa](https://github.com/machrisaa) has used the
 [RedBaron Python refactoring engine](https://redbaron.readthedocs.io/en/latest/)
 which is able to localize syntactic elements more reliably than the built-in
-`ast` module this script is based upon.
+`ast` module this script is based upon. Note that the alternative script is not
+available for TensorFlow 2.0 upgrade.
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
index 23cc4a21a9e6f81c8dc5016bc2cb6a2f151c7924..eac2150502d6511da127a42fbb46c92bea7fe364 100644
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ b/tensorflow/tools/compatibility/ast_edits.py
@@ -21,11 +21,16 @@ from __future__ import print_function
 import ast
 import collections
 import os
+import re
 import shutil
 import sys
 import tempfile
 import traceback
 
+# Some regular expressions we will need for parsing
+FIND_OPEN = re.compile(r"^\s*(\[).*$")
+FIND_STRING_CHARS = re.compile(r"['\"]")
+
 
 class APIChangeSpec(object):
   """This class defines the transformations that need to happen.
@@ -34,12 +39,16 @@ class APIChangeSpec(object):
 
   * `function_keyword_renames`: maps function names to a map of old -> new
     argument names
-  * `function_renames`: maps function names to new function names
+  * `symbol_renames`: maps function names to new function names
   * `change_to_function`: a set of function names that have changed (for
     notifications)
   * `function_reorders`: maps functions whose argument order has changed to the
     list of arguments in the new order
   * `function_handle`: maps function names to custom handlers for the function
+  * `function_warnings`: maps full names of functions to warnings that will be
+    printed out if the function is used. (e.g. tf.nn.convolution())
+  * `unrestricted_function_warnings`: maps names of functions to warnings that
+    will be printed out when the function is used (e.g. foo.convolution()).
 
   For an example, see `TFAPIChangeSpec`.
   """
@@ -53,7 +62,7 @@ class _FileEditTuple(
   Fields:
     comment: A description of the edit and why it was made.
     line: The line number in the file where the edit occurs (1-indexed).
-    start: The line number in the file where the edit occurs (0-indexed).
+    start: The column number in the file where the edit occurs (0-indexed).
     old: text string to remove (this must match what was in file).
     new: text string to add in place of `old`.
   """
@@ -176,14 +185,48 @@ class _ASTCallVisitor(ast.NodeVisitor):
     ast.NodeVisitor.generic_visit(self, node)
 
   def _rename_functions(self, node, full_name):
-    function_renames = self._api_change_spec.function_renames
+    symbol_renames = self._api_change_spec.symbol_renames
     try:
-      new_name = function_renames[full_name]
+      new_name = symbol_renames[full_name]
       self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
                           node.lineno, node.col_offset, full_name, new_name)
     except KeyError:
       pass
 
+  def _print_warning_for_function(self, node, full_name):
+    function_warnings = self._api_change_spec.function_warnings
+    try:
+      warning_message = function_warnings[full_name]
+      warning_message = warning_message.replace("<function name>", full_name)
+      self._file_edit.add(warning_message,
+                          node.lineno, node.col_offset, full_name, full_name,
+                          error="%s requires manual check." % full_name)
+    except KeyError:
+      pass
+
+  def _print_warning_for_function_unrestricted(self, node):
+    """Print a warning when specific functions are called.
+
+    The function _print_warning_for_function matches the full name of the called
+    function, e.g., tf.foo.bar(). This function matches the function name that
+    is called, as long as the function is an attribute. For example,
+    `tf.foo.bar()` and `foo.bar()` are matched, but not `bar()`.
+
+    Args:
+      node: ast.Call object
+    """
+    function_warnings = getattr(
+        self._api_change_spec, "unrestricted_function_warnings", {})
+    if isinstance(node.func, ast.Attribute):
+      function_name = node.func.attr
+      try:
+        warning_message = function_warnings[function_name]
+        self._file_edit.add(warning_message,
+                            node.lineno, node.col_offset, "", "",
+                            error="%s requires manual check." % function_name)
+      except KeyError:
+        pass
+
   def _get_attribute_full_path(self, node):
     """Traverse an attribute to generate a full name e.g. tf.foo.bar.
 
@@ -198,11 +241,11 @@ class _ASTCallVisitor(ast.NodeVisitor):
     items = []
     while not isinstance(curr, ast.Name):
       if not isinstance(curr, ast.Attribute):
-        return None
+        return None, None
       items.append(curr.attr)
       curr = curr.value
     items.append(curr.id)
-    return ".".join(reversed(items))
+    return ".".join(reversed(items)), items[0]
 
   def _find_true_position(self, node):
     """Return correct line number and column offset for a given node.
@@ -210,13 +253,12 @@ class _ASTCallVisitor(ast.NodeVisitor):
     This is necessary mainly because ListComp's location reporting reports
     the next token after the list comprehension list opening.
 
+    Returns:
+      lineno, offset for the given node
+
     Args:
       node: Node for which we wish to know the lineno and col_offset
     """
-    import re
-    find_open = re.compile("^\s*(\\[).*$")
-    find_string_chars = re.compile("['\"]")
-
     if isinstance(node, ast.ListComp):
       # Strangely, ast.ListComp returns the col_offset of the first token
       # after the '[' token which appears to be a bug. Workaround by
@@ -230,7 +272,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
         reversed_preceding_text = text[:col][::-1]
         # First find if a [ can be found with only whitespace between it and
         # col.
-        m = find_open.match(reversed_preceding_text)
+        m = FIND_OPEN.match(reversed_preceding_text)
         if m:
           new_col_offset = col - m.start(1) - 1
           return line, new_col_offset
@@ -249,7 +291,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
             comment_start = prev_line.find("#")
             if comment_start == -1:
               col = len(prev_line) - 1
-            elif find_string_chars.search(prev_line[comment_start:]) is None:
+            elif FIND_STRING_CHARS.search(prev_line[comment_start:]) is None:
               col = comment_start
             else:
               return None, None
@@ -265,9 +307,10 @@ class _ASTCallVisitor(ast.NodeVisitor):
     Args:
       node: Current Node
     """
+    self._print_warning_for_function_unrestricted(node)
 
     # Find a simple attribute name path e.g. "tf.foo.bar"
-    full_name = self._get_attribute_full_path(node.func)
+    full_name, name = self._get_attribute_full_path(node.func)
 
     # Make sure the func is marked as being part of a call
     node.func.is_function_for_call = True
@@ -275,6 +318,9 @@ class _ASTCallVisitor(ast.NodeVisitor):
     if full_name:
       # Call special handlers
       function_handles = self._api_change_spec.function_handle
+      glob_name = "*.{}".format(name)
+      if glob_name in function_handles:
+        function_handles[glob_name](self._file_edit, node)
       if full_name in function_handles:
         function_handles[full_name](self._file_edit, node)
 
@@ -347,8 +393,10 @@ class _ASTCallVisitor(ast.NodeVisitor):
     Args:
       node: Node that is of type ast.Attribute
     """
-    full_name = self._get_attribute_full_path(node)
+    full_name, _ = self._get_attribute_full_path(node)
     if full_name:
+      # Make sure the warning comes first, otherwise the name may have changed
+      self._print_warning_for_function(node, full_name)
       self._rename_functions(node, full_name)
     if full_name in self._api_change_spec.change_to_function:
       if not hasattr(node, "is_function_for_call"):
diff --git a/tensorflow/tools/compatibility/ast_edits_test.py b/tensorflow/tools/compatibility/ast_edits_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..99f20a026fcb9b60e0d4365dd2690946f0d833fc
--- /dev/null
+++ b/tensorflow/tools/compatibility/ast_edits_test.py
@@ -0,0 +1,420 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ast_edits which is used in tf upgraders.
+
+All of the tests assume that we want to change from an API containing
+
+    def f(a, b, kw1, kw2): ...
+    def g(a, b, kw1, c, kw1_alias): ...
+    def g2(a, b, kw1, c, d, kw1_alias): ...
+    def h(a, kw1, kw2, kw1_alias, kw2_alias): ...
+
+and the changes to the API consist of renaming, reordering, and/or removing
+arguments. Thus, we want to be able to generate changes to produce each of the
+following new APIs:
+
+    def f(a, b, kw1, kw3): ...
+    def f(a, b, kw2, kw1): ...
+    def f(a, b, kw3, kw1): ...
+    def g(a, b, kw1, c): ...
+    def g(a, b, c, kw1): ...
+    def g2(a, b, kw1, c, d): ...
+    def g2(a, b, c, d, kw1): ...
+    def h(a, kw1, kw2): ...
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import six
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+from tensorflow.tools.compatibility import ast_edits
+
+
+class NoUpdateSpec(ast_edits.APIChangeSpec):
+  """A specification of an API change which doesn't change anything."""
+
+  def __init__(self):
+    self.function_handle = {}
+    self.function_reorders = {}
+    self.function_keyword_renames = {}
+    self.symbol_renames = {}
+    self.function_warnings = {}
+    self.unrestricted_function_warnings = {}
+    self.change_to_function = {}
+
+
+class RenameKeywordSpec(NoUpdateSpec):
+  """A specification where kw2 gets renamed to kw3.
+
+  The new API is
+
+    def f(a, b, kw1, kw3): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.update_renames()
+
+  def update_renames(self):
+    self.function_keyword_renames["f"] = {"kw2": "kw3"}
+
+
+class ReorderKeywordSpec(NoUpdateSpec):
+  """A specification where kw2 gets moved in front of kw1.
+
+  The new API is
+
+    def f(a, b, kw2, kw1): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.update_reorders()
+
+  def update_reorders(self):
+    # Note that these should be in the old order.
+    self.function_reorders["f"] = ["a", "b", "kw1", "kw2"]
+
+
+class ReorderAndRenameKeywordSpec(ReorderKeywordSpec, RenameKeywordSpec):
+  """A specification where kw2 gets moved in front of kw1 and is changed to kw3.
+
+  The new API is
+
+    def f(a, b, kw3, kw1): ...
+
+  """
+
+  def __init__(self):
+    ReorderKeywordSpec.__init__(self)
+    RenameKeywordSpec.__init__(self)
+    self.update_renames()
+    self.update_reorders()
+
+
+class RemoveDeprecatedAliasKeyword(NoUpdateSpec):
+  """A specification where kw1_alias is removed in g.
+
+  The new API is
+
+    def g(a, b, kw1, c): ...
+    def g2(a, b, kw1, c, d): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.function_keyword_renames["g"] = {"kw1_alias": "kw1"}
+    self.function_keyword_renames["g2"] = {"kw1_alias": "kw1"}
+
+
+class RemoveDeprecatedAliasAndReorderRest(RemoveDeprecatedAliasKeyword):
+  """A specification where kw1_alias is removed in g.
+
+  The new API is
+
+    def g(a, b, c, kw1): ...
+    def g2(a, b, c, d, kw1): ...
+
+  """
+
+  def __init__(self):
+    RemoveDeprecatedAliasKeyword.__init__(self)
+    # Note that these should be in the old order.
+    self.function_reorders["g"] = ["a", "b", "kw1", "c"]
+    self.function_reorders["g2"] = ["a", "b", "kw1", "c", "d"]
+
+
+class RemoveMultipleKeywordArguments(NoUpdateSpec):
+  """A specification where both keyword aliases are removed from h.
+
+  The new API is
+
+    def h(a, kw1, kw2): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.function_keyword_renames["h"] = {
+        "kw1_alias": "kw1",
+        "kw2_alias": "kw2",
+    }
+
+
+class TestAstEdits(test_util.TensorFlowTestCase):
+
+  def _upgrade(self, spec, old_file_text):
+    in_file = six.StringIO(old_file_text)
+    out_file = six.StringIO()
+    upgrader = ast_edits.ASTCodeUpgrader(spec)
+    count, report, errors = (
+        upgrader.process_opened_file("test.py", in_file,
+                                     "test_out.py", out_file))
+    return (count, report, errors), out_file.getvalue()
+
+  def testNoTransformIfNothingIsSupplied(self):
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    _, new_text = self._upgrade(NoUpdateSpec(), text)
+    self.assertEqual(new_text, text)
+
+    text = "f(a, b, c, d)\n"
+    _, new_text = self._upgrade(NoUpdateSpec(), text)
+    self.assertEqual(new_text, text)
+
+  def testKeywordRename(self):
+    """Test that we get the expected result if renaming kw2 to kw3."""
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    expected = "f(a, b, kw1=c, kw3=d)\n"
+    _, new_text = self._upgrade(RenameKeywordSpec(), text)
+    self.assertEqual(new_text, expected)
+
+    # No keywords specified, no reordering, so we should get input as output
+    text = "f(a, b, c, d)\n"
+    _, new_text = self._upgrade(RenameKeywordSpec(), text)
+    self.assertEqual(new_text, text)
+
+  def testKeywordReorder(self):
+    """Test that we get the expected result if kw2 is now before kw1."""
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    acceptable_outputs = [
+        # No change is a valid output
+        text,
+        # Just reordering the kw.. args is also ok
+        "f(a, b, kw2=d, kw1=c)\n",
+        # Also cases where all arguments are fully specified are allowed
+        "f(a=a, b=b, kw1=c, kw2=d)\n",
+        "f(a=a, b=b, kw2=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "f(a, b, c, d)\n"
+    acceptable_outputs = [
+        "f(a, b, d, c)\n",
+        "f(a=a, b=b, kw1=c, kw2=d)\n",
+        "f(a=a, b=b, kw2=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testKeywordReorderAndRename(self):
+    """Test that we get the expected result if kw2 is renamed and moved."""
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    acceptable_outputs = [
+        "f(a, b, kw3=d, kw1=c)\n",
+        "f(a=a, b=b, kw1=c, kw3=d)\n",
+        "f(a=a, b=b, kw3=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderAndRenameKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "f(a, b, c, d)\n"
+    acceptable_outputs = [
+        "f(a, b, d, c)\n",
+        "f(a=a, b=b, kw1=c, kw3=d)\n",
+        "f(a=a, b=b, kw3=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderAndRenameKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveDeprecatedKeywordAlias(self):
+    """Test that we get the expected result if a keyword alias is removed."""
+    text = "g(a, b, kw1=x, c=c)\n"
+    acceptable_outputs = [
+        # Not using deprecated alias, so original is ok
+        text,
+        "g(a=a, b=b, kw1=x, c=c)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # No keyword used, should be no change
+    text = "g(a, b, x, c)\n"
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertEqual(new_text, text)
+
+    # If we used the alias, it should get renamed
+    text = "g(a, b, kw1_alias=x, c=c)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # It should get renamed even if it's last
+    text = "g(a, b, c=c, kw1_alias=x)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveDeprecatedKeywordAndReorder(self):
+    """Test for when a keyword alias is removed and args are reordered."""
+    text = "g(a, b, kw1=x, c=c)\n"
+    acceptable_outputs = [
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "g(a, b, x, c)\n"
+    # Don't accept an output which doesn't reorder c and d
+    acceptable_outputs = [
+        "g(a, b, c, x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # If we used the alias, it should get renamed
+    text = "g(a, b, kw1_alias=x, c=c)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # It should get renamed and reordered even if it's last
+    text = "g(a, b, c=c, kw1_alias=x)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveDeprecatedKeywordAndReorder2(self):
+    """Same as testRemoveDeprecatedKeywordAndReorder but on g2 (more args)."""
+    text = "g2(a, b, kw1=x, c=c, d=d)\n"
+    acceptable_outputs = [
+        "g2(a, b, c=c, d=d, kw1=x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "g2(a, b, x, c, d)\n"
+    # Don't accept an output which doesn't reorder c and d
+    acceptable_outputs = [
+        "g2(a, b, c, d, x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # If we used the alias, it should get renamed
+    text = "g2(a, b, kw1_alias=x, c=c, d=d)\n"
+    acceptable_outputs = [
+        "g2(a, b, kw1=x, c=c, d=d)\n",
+        "g2(a, b, c=c, d=d, kw1=x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+        "g2(a=a, b=b, c=c, d=d, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # It should get renamed and reordered even if it's not in order
+    text = "g2(a, b, d=d, c=c, kw1_alias=x)\n"
+    acceptable_outputs = [
+        "g2(a, b, kw1=x, c=c, d=d)\n",
+        "g2(a, b, c=c, d=d, kw1=x)\n",
+        "g2(a, b, d=d, c=c, kw1=x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+        "g2(a=a, b=b, c=c, d=d, kw1=x)\n",
+        "g2(a=a, b=b, d=d, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveMultipleKeywords(self):
+    """Remove multiple keywords at once."""
+    # Not using deprecated keywords -> no rename
+    text = "h(a, kw1=x, kw2=y)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertEqual(new_text, text)
+
+    # Using positional arguments (in proper order) -> no change
+    text = "h(a, x, y)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertEqual(new_text, text)
+
+    # Use only the old names, in order
+    text = "h(a, kw1_alias=x, kw2_alias=y)\n"
+    acceptable_outputs = [
+        "h(a, x, y)\n",
+        "h(a, kw1=x, kw2=y)\n",
+        "h(a=a, kw1=x, kw2=y)\n",
+        "h(a, kw2=y, kw1=x)\n",
+        "h(a=a, kw2=y, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Use only the old names, in reverse order, should give one of same outputs
+    text = "h(a, kw2_alias=y, kw1_alias=x)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Mix old and new names
+    text = "h(a, kw1=x, kw2_alias=y)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testUnrestrictedFunctionWarnings(self):
+    class FooWarningSpec(NoUpdateSpec):
+      """Usages of function attribute foo() prints out a warning."""
+
+      def __init__(self):
+        NoUpdateSpec.__init__(self)
+        self.unrestricted_function_warnings = {"foo": "not good"}
+    texts = ["object.foo()", "get_object().foo()",
+             "get_object().foo()", "object.foo().bar()"]
+    for text in texts:
+      (_, report, _), _ = self._upgrade(FooWarningSpec(), text)
+      self.assertIn("not good", report)
+
+    # Note that foo() won't result in a warning, because in this case foo is
+    # not an attribute, but a name.
+    false_alarms = ["foo", "foo()", "foo.bar()", "obj.run_foo()", "obj.foo"]
+    for text in false_alarms:
+      (_, report, _), _ = self._upgrade(FooWarningSpec(), text)
+      self.assertNotIn("not good", report)
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index 7e66ad816ac4b65962b0c0f40493a2b51feeaa41..b757ad4647c6d92e21feccd7d90da887df379531 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -25,64 +25,306 @@ from __future__ import division
 from __future__ import print_function
 
 renames = {
-    'tf.acos': 'tf.math.acos',
-    'tf.acosh': 'tf.math.acosh',
-    'tf.add': 'tf.math.add',
-    'tf.as_string': 'tf.dtypes.as_string',
-    'tf.asin': 'tf.math.asin',
-    'tf.asinh': 'tf.math.asinh',
-    'tf.atan': 'tf.math.atan',
-    'tf.atan2': 'tf.math.atan2',
-    'tf.atanh': 'tf.math.atanh',
-    'tf.batch_to_space_nd': 'tf.manip.batch_to_space_nd',
+    'tf.AUTO_REUSE': 'tf.compat.v1.AUTO_REUSE',
+    'tf.AttrValue': 'tf.compat.v1.AttrValue',
+    'tf.COMPILER_VERSION': 'tf.version.COMPILER_VERSION',
+    'tf.CXX11_ABI_FLAG': 'tf.sysconfig.CXX11_ABI_FLAG',
+    'tf.ConditionalAccumulator': 'tf.compat.v1.ConditionalAccumulator',
+    'tf.ConditionalAccumulatorBase': 'tf.compat.v1.ConditionalAccumulatorBase',
+    'tf.ConfigProto': 'tf.compat.v1.ConfigProto',
+    'tf.DeviceSpec': 'tf.compat.v1.DeviceSpec',
+    'tf.Dimension': 'tf.compat.v1.Dimension',
+    'tf.FixedLenFeature': 'tf.io.FixedLenFeature',
+    'tf.FixedLenSequenceFeature': 'tf.io.FixedLenSequenceFeature',
+    'tf.FixedLengthRecordReader': 'tf.compat.v1.FixedLengthRecordReader',
+    'tf.GIT_VERSION': 'tf.version.GIT_VERSION',
+    'tf.GPUOptions': 'tf.compat.v1.GPUOptions',
+    'tf.GRAPH_DEF_VERSION': 'tf.version.GRAPH_DEF_VERSION',
+    'tf.GRAPH_DEF_VERSION_MIN_CONSUMER': 'tf.version.GRAPH_DEF_VERSION_MIN_CONSUMER',
+    'tf.GRAPH_DEF_VERSION_MIN_PRODUCER': 'tf.version.GRAPH_DEF_VERSION_MIN_PRODUCER',
+    'tf.GraphDef': 'tf.compat.v1.GraphDef',
+    'tf.GraphKeys': 'tf.compat.v1.GraphKeys',
+    'tf.GraphOptions': 'tf.compat.v1.GraphOptions',
+    'tf.HistogramProto': 'tf.compat.v1.HistogramProto',
+    'tf.IdentityReader': 'tf.compat.v1.IdentityReader',
+    'tf.InteractiveSession': 'tf.compat.v1.InteractiveSession',
+    'tf.LMDBReader': 'tf.compat.v1.LMDBReader',
+    'tf.LogMessage': 'tf.compat.v1.LogMessage',
+    'tf.MONOLITHIC_BUILD': 'tf.sysconfig.MONOLITHIC_BUILD',
+    'tf.MetaGraphDef': 'tf.compat.v1.MetaGraphDef',
+    'tf.NameAttrList': 'tf.compat.v1.NameAttrList',
+    'tf.NoGradient': 'tf.no_gradient',
+    'tf.NodeDef': 'tf.compat.v1.NodeDef',
+    'tf.NotDifferentiable': 'tf.no_gradient',
+    'tf.OpError': 'tf.errors.OpError',
+    'tf.OptimizerOptions': 'tf.compat.v1.OptimizerOptions',
+    'tf.PaddingFIFOQueue': 'tf.io.PaddingFIFOQueue',
+    'tf.Print': 'tf.compat.v1.Print',
+    'tf.PriorityQueue': 'tf.io.PriorityQueue',
+    'tf.QUANTIZED_DTYPES': 'tf.dtypes.QUANTIZED_DTYPES',
+    'tf.QueueBase': 'tf.io.QueueBase',
+    'tf.RandomShuffleQueue': 'tf.io.RandomShuffleQueue',
+    'tf.ReaderBase': 'tf.compat.v1.ReaderBase',
+    'tf.RunMetadata': 'tf.compat.v1.RunMetadata',
+    'tf.RunOptions': 'tf.compat.v1.RunOptions',
+    'tf.Session': 'tf.compat.v1.Session',
+    'tf.SessionLog': 'tf.compat.v1.SessionLog',
+    'tf.SparseConditionalAccumulator': 'tf.sparse.SparseConditionalAccumulator',
+    'tf.SparseFeature': 'tf.io.SparseFeature',
+    'tf.SparseTensorValue': 'tf.compat.v1.SparseTensorValue',
+    'tf.TFRecordReader': 'tf.compat.v1.TFRecordReader',
+    'tf.TensorInfo': 'tf.compat.v1.TensorInfo',
+    'tf.TextLineReader': 'tf.compat.v1.TextLineReader',
+    'tf.VERSION': 'tf.version.VERSION',
+    'tf.VarLenFeature': 'tf.io.VarLenFeature',
+    'tf.VariableScope': 'tf.compat.v1.VariableScope',
+    'tf.WholeFileReader': 'tf.compat.v1.WholeFileReader',
+    'tf.accumulate_n': 'tf.math.accumulate_n',
+    'tf.add_check_numerics_ops': 'tf.compat.v1.add_check_numerics_ops',
+    'tf.add_to_collection': 'tf.compat.v1.add_to_collection',
+    'tf.add_to_collections': 'tf.compat.v1.add_to_collections',
+    'tf.all_variables': 'tf.compat.v1.all_variables',
+    'tf.angle': 'tf.math.angle',
+    'tf.app.run': 'tf.compat.v1.app.run',
+    'tf.assert_greater_equal': 'tf.compat.v1.assert_greater_equal',
+    'tf.assert_integer': 'tf.compat.v1.assert_integer',
+    'tf.assert_less_equal': 'tf.compat.v1.assert_less_equal',
+    'tf.assert_near': 'tf.compat.v1.assert_near',
+    'tf.assert_negative': 'tf.compat.v1.assert_negative',
+    'tf.assert_non_negative': 'tf.compat.v1.assert_non_negative',
+    'tf.assert_non_positive': 'tf.compat.v1.assert_non_positive',
+    'tf.assert_none_equal': 'tf.compat.v1.assert_none_equal',
+    'tf.assert_positive': 'tf.compat.v1.assert_positive',
+    'tf.assert_proper_iterable': 'tf.debugging.assert_proper_iterable',
+    'tf.assert_rank_at_least': 'tf.compat.v1.assert_rank_at_least',
+    'tf.assert_rank_in': 'tf.compat.v1.assert_rank_in',
+    'tf.assert_same_float_dtype': 'tf.debugging.assert_same_float_dtype',
+    'tf.assert_scalar': 'tf.compat.v1.assert_scalar',
+    'tf.assert_type': 'tf.compat.v1.assert_type',
+    'tf.assert_variables_initialized': 'tf.compat.v1.assert_variables_initialized',
+    'tf.assign': 'tf.compat.v1.assign',
+    'tf.assign_add': 'tf.compat.v1.assign_add',
+    'tf.assign_sub': 'tf.compat.v1.assign_sub',
+    'tf.batch_scatter_update': 'tf.compat.v1.batch_scatter_update',
     'tf.betainc': 'tf.math.betainc',
     'tf.ceil': 'tf.math.ceil',
     'tf.check_numerics': 'tf.debugging.check_numerics',
     'tf.cholesky': 'tf.linalg.cholesky',
-    'tf.cos': 'tf.math.cos',
-    'tf.cosh': 'tf.math.cosh',
+    'tf.cholesky_solve': 'tf.linalg.cholesky_solve',
+    'tf.clip_by_average_norm': 'tf.compat.v1.clip_by_average_norm',
+    'tf.colocate_with': 'tf.compat.v1.colocate_with',
+    'tf.conj': 'tf.math.conj',
+    'tf.container': 'tf.compat.v1.container',
+    'tf.convert_to_tensor_or_indexed_slices': 'tf.compat.v1.convert_to_tensor_or_indexed_slices',
+    'tf.convert_to_tensor_or_sparse_tensor': 'tf.compat.v1.convert_to_tensor_or_sparse_tensor',
+    'tf.count_nonzero': 'tf.compat.v1.count_nonzero',
+    'tf.count_up_to': 'tf.compat.v1.count_up_to',
+    'tf.create_partitioned_variables': 'tf.compat.v1.create_partitioned_variables',
     'tf.cross': 'tf.linalg.cross',
+    'tf.cumprod': 'tf.math.cumprod',
+    'tf.data.make_initializable_iterator': 'tf.compat.v1.data.make_initializable_iterator',
+    'tf.data.make_one_shot_iterator': 'tf.compat.v1.data.make_one_shot_iterator',
+    'tf.debugging.is_finite': 'tf.math.is_finite',
+    'tf.debugging.is_inf': 'tf.math.is_inf',
+    'tf.debugging.is_nan': 'tf.math.is_nan',
+    'tf.debugging.is_non_decreasing': 'tf.math.is_non_decreasing',
+    'tf.debugging.is_strictly_increasing': 'tf.math.is_strictly_increasing',
     'tf.decode_base64': 'tf.io.decode_base64',
     'tf.decode_compressed': 'tf.io.decode_compressed',
     'tf.decode_json_example': 'tf.io.decode_json_example',
     'tf.decode_raw': 'tf.io.decode_raw',
+    'tf.delete_session_tensor': 'tf.compat.v1.delete_session_tensor',
+    'tf.depth_to_space': 'tf.compat.v1.depth_to_space',
     'tf.dequantize': 'tf.quantization.dequantize',
+    'tf.deserialize_many_sparse': 'tf.io.deserialize_many_sparse',
     'tf.diag': 'tf.linalg.tensor_diag',
     'tf.diag_part': 'tf.linalg.tensor_diag_part',
     'tf.digamma': 'tf.math.digamma',
+    'tf.dimension_at_index': 'tf.compat.v1.dimension_at_index',
+    'tf.dimension_value': 'tf.compat.v1.dimension_value',
+    'tf.disable_eager_execution': 'tf.compat.v1.disable_eager_execution',
+    'tf.disable_resource_variables': 'tf.compat.v1.disable_resource_variables',
+    'tf.disable_v2_behavior': 'tf.compat.v1.disable_v2_behavior',
+    'tf.disable_v2_tensorshape': 'tf.compat.v1.disable_v2_tensorshape',
+    'tf.distributions.Bernoulli': 'tf.compat.v1.distributions.Bernoulli',
+    'tf.distributions.Beta': 'tf.compat.v1.distributions.Beta',
+    'tf.distributions.Categorical': 'tf.compat.v1.distributions.Categorical',
+    'tf.distributions.Dirichlet': 'tf.compat.v1.distributions.Dirichlet',
+    'tf.distributions.DirichletMultinomial': 'tf.compat.v1.distributions.DirichletMultinomial',
+    'tf.distributions.Distribution': 'tf.compat.v1.distributions.Distribution',
+    'tf.distributions.Exponential': 'tf.compat.v1.distributions.Exponential',
+    'tf.distributions.FULLY_REPARAMETERIZED': 'tf.compat.v1.distributions.FULLY_REPARAMETERIZED',
+    'tf.distributions.Gamma': 'tf.compat.v1.distributions.Gamma',
+    'tf.distributions.Laplace': 'tf.compat.v1.distributions.Laplace',
+    'tf.distributions.Multinomial': 'tf.compat.v1.distributions.Multinomial',
+    'tf.distributions.NOT_REPARAMETERIZED': 'tf.compat.v1.distributions.NOT_REPARAMETERIZED',
+    'tf.distributions.Normal': 'tf.compat.v1.distributions.Normal',
+    'tf.distributions.RegisterKL': 'tf.compat.v1.distributions.RegisterKL',
+    'tf.distributions.ReparameterizationType': 'tf.compat.v1.distributions.ReparameterizationType',
+    'tf.distributions.StudentT': 'tf.compat.v1.distributions.StudentT',
+    'tf.distributions.Uniform': 'tf.compat.v1.distributions.Uniform',
+    'tf.distributions.kl_divergence': 'tf.compat.v1.distributions.kl_divergence',
+    'tf.div': 'tf.compat.v1.div',
+    'tf.enable_eager_execution': 'tf.compat.v1.enable_eager_execution',
+    'tf.enable_resource_variables': 'tf.compat.v1.enable_resource_variables',
+    'tf.enable_v2_behavior': 'tf.compat.v1.enable_v2_behavior',
+    'tf.enable_v2_tensorshape': 'tf.compat.v1.enable_v2_tensorshape',
     'tf.encode_base64': 'tf.io.encode_base64',
-    'tf.equal': 'tf.math.equal',
+    'tf.erf': 'tf.math.erf',
     'tf.erfc': 'tf.math.erfc',
-    'tf.exp': 'tf.math.exp',
     'tf.expm1': 'tf.math.expm1',
-    'tf.extract_image_patches': 'tf.image.extract_image_patches',
     'tf.fake_quant_with_min_max_args': 'tf.quantization.fake_quant_with_min_max_args',
     'tf.fake_quant_with_min_max_args_gradient': 'tf.quantization.fake_quant_with_min_max_args_gradient',
     'tf.fake_quant_with_min_max_vars': 'tf.quantization.fake_quant_with_min_max_vars',
     'tf.fake_quant_with_min_max_vars_gradient': 'tf.quantization.fake_quant_with_min_max_vars_gradient',
     'tf.fake_quant_with_min_max_vars_per_channel': 'tf.quantization.fake_quant_with_min_max_vars_per_channel',
     'tf.fake_quant_with_min_max_vars_per_channel_gradient': 'tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient',
-    'tf.fft': 'tf.spectral.fft',
-    'tf.floor': 'tf.math.floor',
-    'tf.gather_nd': 'tf.manip.gather_nd',
-    'tf.GraphKeys.VARIABLES': 'tf.GraphKeys.GLOBAL_VARIABLES',
-    'tf.greater': 'tf.math.greater',
-    'tf.greater_equal': 'tf.math.greater_equal',
-    'tf.ifft': 'tf.spectral.ifft',
+    'tf.feature_column.input_layer': 'tf.compat.v1.feature_column.input_layer',
+    'tf.feature_column.linear_model': 'tf.compat.v1.feature_column.linear_model',
+    'tf.fft': 'tf.signal.fft',
+    'tf.fft2d': 'tf.signal.fft2d',
+    'tf.fft3d': 'tf.signal.fft3d',
+    'tf.fixed_size_partitioner': 'tf.compat.v1.fixed_size_partitioner',
+    'tf.floordiv': 'tf.math.floordiv',
+    'tf.get_collection': 'tf.compat.v1.get_collection',
+    'tf.get_collection_ref': 'tf.compat.v1.get_collection_ref',
+    'tf.get_default_graph': 'tf.compat.v1.get_default_graph',
+    'tf.get_default_session': 'tf.compat.v1.get_default_session',
+    'tf.get_local_variable': 'tf.compat.v1.get_local_variable',
+    'tf.get_seed': 'tf.compat.v1.get_seed',
+    'tf.get_session_handle': 'tf.compat.v1.get_session_handle',
+    'tf.get_session_tensor': 'tf.compat.v1.get_session_tensor',
+    'tf.get_variable': 'tf.compat.v1.get_variable',
+    'tf.get_variable_scope': 'tf.compat.v1.get_variable_scope',
+    'tf.gfile.FastGFile': 'tf.compat.v1.gfile.FastGFile',
+    'tf.gfile.GFile': 'tf.compat.v1.gfile.GFile',
+    'tf.gfile.Open': 'tf.compat.v1.gfile.Open',
+    'tf.global_norm': 'tf.linalg.global_norm',
+    'tf.global_variables': 'tf.compat.v1.global_variables',
+    'tf.global_variables_initializer': 'tf.compat.v1.global_variables_initializer',
+    'tf.glorot_normal_initializer': 'tf.keras.initializers.glorot_normal',
+    'tf.graph_util.convert_variables_to_constants': 'tf.compat.v1.graph_util.convert_variables_to_constants',
+    'tf.graph_util.extract_sub_graph': 'tf.compat.v1.graph_util.extract_sub_graph',
+    'tf.graph_util.must_run_on_cpu': 'tf.compat.v1.graph_util.must_run_on_cpu',
+    'tf.graph_util.remove_training_nodes': 'tf.compat.v1.graph_util.remove_training_nodes',
+    'tf.graph_util.tensor_shape_from_node_def_name': 'tf.compat.v1.graph_util.tensor_shape_from_node_def_name',
+    'tf.ifft': 'tf.signal.ifft',
+    'tf.ifft2d': 'tf.signal.ifft2d',
+    'tf.ifft3d': 'tf.signal.ifft3d',
     'tf.igamma': 'tf.math.igamma',
     'tf.igammac': 'tf.math.igammac',
+    'tf.imag': 'tf.math.imag',
+    'tf.image.resize_area': 'tf.compat.v1.image.resize_area',
+    'tf.image.resize_bicubic': 'tf.compat.v1.image.resize_bicubic',
+    'tf.image.resize_bilinear': 'tf.compat.v1.image.resize_bilinear',
+    'tf.image.resize_images': 'tf.compat.v1.image.resize_images',
+    'tf.image.resize_nearest_neighbor': 'tf.compat.v1.image.resize_nearest_neighbor',
+    'tf.image.transpose_image': 'tf.compat.v1.image.transpose_image',
+    'tf.initialize_all_tables': 'tf.compat.v1.initialize_all_tables',
+    'tf.initialize_all_variables': 'tf.compat.v1.initialize_all_variables',
+    'tf.initialize_local_variables': 'tf.compat.v1.initialize_local_variables',
+    'tf.initialize_variables': 'tf.compat.v1.initialize_variables',
+    'tf.initializers.global_variables': 'tf.compat.v1.initializers.global_variables',
+    'tf.initializers.local_variables': 'tf.compat.v1.initializers.local_variables',
+    'tf.initializers.tables_initializer': 'tf.compat.v1.initializers.tables_initializer',
+    'tf.initializers.variables': 'tf.compat.v1.initializers.variables',
     'tf.invert_permutation': 'tf.math.invert_permutation',
-    'tf.is_finite': 'tf.debugging.is_finite',
-    'tf.is_inf': 'tf.debugging.is_inf',
-    'tf.is_nan': 'tf.debugging.is_nan',
-    'tf.less': 'tf.math.less',
-    'tf.less_equal': 'tf.math.less_equal',
+    'tf.io.tf_record_iterator': 'tf.compat.v1.io.tf_record_iterator',
+    'tf.is_finite': 'tf.math.is_finite',
+    'tf.is_inf': 'tf.math.is_inf',
+    'tf.is_nan': 'tf.math.is_nan',
+    'tf.is_non_decreasing': 'tf.math.is_non_decreasing',
+    'tf.is_numeric_tensor': 'tf.debugging.is_numeric_tensor',
+    'tf.is_strictly_increasing': 'tf.math.is_strictly_increasing',
+    'tf.is_variable_initialized': 'tf.compat.v1.is_variable_initialized',
+    'tf.keras.backend.get_session': 'tf.compat.v1.keras.backend.get_session',
+    'tf.layers.AveragePooling1D': 'tf.compat.v1.layers.AveragePooling1D',
+    'tf.layers.AveragePooling2D': 'tf.compat.v1.layers.AveragePooling2D',
+    'tf.layers.AveragePooling3D': 'tf.compat.v1.layers.AveragePooling3D',
+    'tf.layers.BatchNormalization': 'tf.compat.v1.layers.BatchNormalization',
+    'tf.layers.Conv1D': 'tf.compat.v1.layers.Conv1D',
+    'tf.layers.Conv2D': 'tf.compat.v1.layers.Conv2D',
+    'tf.layers.Conv2DTranspose': 'tf.compat.v1.layers.Conv2DTranspose',
+    'tf.layers.Conv3D': 'tf.compat.v1.layers.Conv3D',
+    'tf.layers.Conv3DTranspose': 'tf.compat.v1.layers.Conv3DTranspose',
+    'tf.layers.Dense': 'tf.compat.v1.layers.Dense',
+    'tf.layers.Dropout': 'tf.compat.v1.layers.Dropout',
+    'tf.layers.Flatten': 'tf.compat.v1.layers.Flatten',
+    'tf.layers.InputSpec': 'tf.keras.layers.InputSpec',
+    'tf.layers.Layer': 'tf.compat.v1.layers.Layer',
+    'tf.layers.MaxPooling1D': 'tf.compat.v1.layers.MaxPooling1D',
+    'tf.layers.MaxPooling2D': 'tf.compat.v1.layers.MaxPooling2D',
+    'tf.layers.MaxPooling3D': 'tf.compat.v1.layers.MaxPooling3D',
+    'tf.layers.SeparableConv1D': 'tf.compat.v1.layers.SeparableConv1D',
+    'tf.layers.SeparableConv2D': 'tf.compat.v1.layers.SeparableConv2D',
+    'tf.layers.average_pooling1d': 'tf.compat.v1.layers.average_pooling1d',
+    'tf.layers.average_pooling2d': 'tf.compat.v1.layers.average_pooling2d',
+    'tf.layers.average_pooling3d': 'tf.compat.v1.layers.average_pooling3d',
+    'tf.layers.batch_normalization': 'tf.compat.v1.layers.batch_normalization',
+    'tf.layers.conv1d': 'tf.compat.v1.layers.conv1d',
+    'tf.layers.conv2d': 'tf.compat.v1.layers.conv2d',
+    'tf.layers.conv2d_transpose': 'tf.compat.v1.layers.conv2d_transpose',
+    'tf.layers.conv3d': 'tf.compat.v1.layers.conv3d',
+    'tf.layers.conv3d_transpose': 'tf.compat.v1.layers.conv3d_transpose',
+    'tf.layers.dense': 'tf.compat.v1.layers.dense',
+    'tf.layers.dropout': 'tf.compat.v1.layers.dropout',
+    'tf.layers.experimental.keras_style_scope': 'tf.compat.v1.layers.experimental.keras_style_scope',
+    'tf.layers.experimental.set_keras_style': 'tf.compat.v1.layers.experimental.set_keras_style',
+    'tf.layers.flatten': 'tf.compat.v1.layers.flatten',
+    'tf.layers.max_pooling1d': 'tf.compat.v1.layers.max_pooling1d',
+    'tf.layers.max_pooling2d': 'tf.compat.v1.layers.max_pooling2d',
+    'tf.layers.max_pooling3d': 'tf.compat.v1.layers.max_pooling3d',
+    'tf.layers.separable_conv1d': 'tf.compat.v1.layers.separable_conv1d',
+    'tf.layers.separable_conv2d': 'tf.compat.v1.layers.separable_conv2d',
+    'tf.lbeta': 'tf.math.lbeta',
     'tf.lgamma': 'tf.math.lgamma',
+    'tf.lin_space': 'tf.linspace',
+    'tf.local_variables': 'tf.compat.v1.local_variables',
+    'tf.local_variables_initializer': 'tf.compat.v1.local_variables_initializer',
     'tf.log': 'tf.math.log',
     'tf.log1p': 'tf.math.log1p',
-    'tf.logical_and': 'tf.math.logical_and',
-    'tf.logical_not': 'tf.math.logical_not',
-    'tf.logical_or': 'tf.math.logical_or',
+    'tf.log_sigmoid': 'tf.math.log_sigmoid',
+    'tf.logging.DEBUG': 'tf.compat.v1.logging.DEBUG',
+    'tf.logging.ERROR': 'tf.compat.v1.logging.ERROR',
+    'tf.logging.FATAL': 'tf.compat.v1.logging.FATAL',
+    'tf.logging.INFO': 'tf.compat.v1.logging.INFO',
+    'tf.logging.TaskLevelStatusMessage': 'tf.compat.v1.logging.TaskLevelStatusMessage',
+    'tf.logging.WARN': 'tf.compat.v1.logging.WARN',
+    'tf.logging.debug': 'tf.compat.v1.logging.debug',
+    'tf.logging.error': 'tf.compat.v1.logging.error',
+    'tf.logging.fatal': 'tf.compat.v1.logging.fatal',
+    'tf.logging.flush': 'tf.compat.v1.logging.flush',
+    'tf.logging.get_verbosity': 'tf.compat.v1.logging.get_verbosity',
+    'tf.logging.info': 'tf.compat.v1.logging.info',
+    'tf.logging.log': 'tf.compat.v1.logging.log',
+    'tf.logging.log_every_n': 'tf.compat.v1.logging.log_every_n',
+    'tf.logging.log_first_n': 'tf.compat.v1.logging.log_first_n',
+    'tf.logging.log_if': 'tf.compat.v1.logging.log_if',
+    'tf.logging.set_verbosity': 'tf.compat.v1.logging.set_verbosity',
+    'tf.logging.vlog': 'tf.compat.v1.logging.vlog',
+    'tf.logging.warn': 'tf.compat.v1.logging.warn',
+    'tf.logging.warning': 'tf.compat.v1.logging.warning',
+    'tf.logical_xor': 'tf.math.logical_xor',
+    'tf.losses.absolute_difference': 'tf.compat.v1.losses.absolute_difference',
+    'tf.losses.compute_weighted_loss': 'tf.compat.v1.losses.compute_weighted_loss',
+    'tf.losses.cosine_distance': 'tf.compat.v1.losses.cosine_distance',
+    'tf.losses.hinge_loss': 'tf.compat.v1.losses.hinge_loss',
+    'tf.losses.huber_loss': 'tf.compat.v1.losses.huber_loss',
+    'tf.losses.log_loss': 'tf.compat.v1.losses.log_loss',
+    'tf.losses.mean_pairwise_squared_error': 'tf.compat.v1.losses.mean_pairwise_squared_error',
+    'tf.losses.mean_squared_error': 'tf.compat.v1.losses.mean_squared_error',
+    'tf.losses.sigmoid_cross_entropy': 'tf.compat.v1.losses.sigmoid_cross_entropy',
+    'tf.losses.softmax_cross_entropy': 'tf.compat.v1.losses.softmax_cross_entropy',
+    'tf.losses.sparse_softmax_cross_entropy': 'tf.compat.v1.losses.sparse_softmax_cross_entropy',
+    'tf.make_template': 'tf.compat.v1.make_template',
+    'tf.make_tensor_proto': 'tf.compat.v1.make_tensor_proto',
+    'tf.manip.gather_nd': 'tf.gather_nd',
+    'tf.manip.reshape': 'tf.reshape',
+    'tf.manip.reverse': 'tf.reverse',
+    'tf.manip.roll': 'tf.roll',
+    'tf.manip.scatter_nd': 'tf.scatter_nd',
+    'tf.manip.space_to_batch_nd': 'tf.space_to_batch_nd',
+    'tf.manip.tile': 'tf.tile',
     'tf.matching_files': 'tf.io.matching_files',
     'tf.matrix_band_part': 'tf.linalg.band_part',
     'tf.matrix_determinant': 'tf.linalg.det',
@@ -91,45 +333,355 @@ renames = {
     'tf.matrix_inverse': 'tf.linalg.inv',
     'tf.matrix_set_diag': 'tf.linalg.set_diag',
     'tf.matrix_solve': 'tf.linalg.solve',
+    'tf.matrix_solve_ls': 'tf.linalg.lstsq',
+    'tf.matrix_transpose': 'tf.linalg.transpose',
     'tf.matrix_triangular_solve': 'tf.linalg.triangular_solve',
-    'tf.maximum': 'tf.math.maximum',
-    'tf.minimum': 'tf.math.minimum',
-    'tf.not_equal': 'tf.math.not_equal',
+    'tf.metrics.accuracy': 'tf.compat.v1.metrics.accuracy',
+    'tf.metrics.auc': 'tf.compat.v1.metrics.auc',
+    'tf.metrics.average_precision_at_k': 'tf.compat.v1.metrics.average_precision_at_k',
+    'tf.metrics.false_negatives': 'tf.compat.v1.metrics.false_negatives',
+    'tf.metrics.false_negatives_at_thresholds': 'tf.compat.v1.metrics.false_negatives_at_thresholds',
+    'tf.metrics.false_positives': 'tf.compat.v1.metrics.false_positives',
+    'tf.metrics.false_positives_at_thresholds': 'tf.compat.v1.metrics.false_positives_at_thresholds',
+    'tf.metrics.mean': 'tf.compat.v1.metrics.mean',
+    'tf.metrics.mean_absolute_error': 'tf.compat.v1.metrics.mean_absolute_error',
+    'tf.metrics.mean_cosine_distance': 'tf.compat.v1.metrics.mean_cosine_distance',
+    'tf.metrics.mean_iou': 'tf.compat.v1.metrics.mean_iou',
+    'tf.metrics.mean_per_class_accuracy': 'tf.compat.v1.metrics.mean_per_class_accuracy',
+    'tf.metrics.mean_relative_error': 'tf.compat.v1.metrics.mean_relative_error',
+    'tf.metrics.mean_squared_error': 'tf.compat.v1.metrics.mean_squared_error',
+    'tf.metrics.mean_tensor': 'tf.compat.v1.metrics.mean_tensor',
+    'tf.metrics.percentage_below': 'tf.compat.v1.metrics.percentage_below',
+    'tf.metrics.precision': 'tf.compat.v1.metrics.precision',
+    'tf.metrics.precision_at_k': 'tf.compat.v1.metrics.precision_at_k',
+    'tf.metrics.precision_at_thresholds': 'tf.compat.v1.metrics.precision_at_thresholds',
+    'tf.metrics.precision_at_top_k': 'tf.compat.v1.metrics.precision_at_top_k',
+    'tf.metrics.recall': 'tf.compat.v1.metrics.recall',
+    'tf.metrics.recall_at_k': 'tf.compat.v1.metrics.recall_at_k',
+    'tf.metrics.recall_at_thresholds': 'tf.compat.v1.metrics.recall_at_thresholds',
+    'tf.metrics.recall_at_top_k': 'tf.compat.v1.metrics.recall_at_top_k',
+    'tf.metrics.root_mean_squared_error': 'tf.compat.v1.metrics.root_mean_squared_error',
+    'tf.metrics.sensitivity_at_specificity': 'tf.compat.v1.metrics.sensitivity_at_specificity',
+    'tf.metrics.sparse_average_precision_at_k': 'tf.compat.v1.metrics.sparse_average_precision_at_k',
+    'tf.metrics.sparse_precision_at_k': 'tf.compat.v1.metrics.sparse_precision_at_k',
+    'tf.metrics.specificity_at_sensitivity': 'tf.compat.v1.metrics.specificity_at_sensitivity',
+    'tf.metrics.true_negatives': 'tf.compat.v1.metrics.true_negatives',
+    'tf.metrics.true_negatives_at_thresholds': 'tf.compat.v1.metrics.true_negatives_at_thresholds',
+    'tf.metrics.true_positives': 'tf.compat.v1.metrics.true_positives',
+    'tf.metrics.true_positives_at_thresholds': 'tf.compat.v1.metrics.true_positives_at_thresholds',
+    'tf.min_max_variable_partitioner': 'tf.compat.v1.min_max_variable_partitioner',
+    'tf.model_variables': 'tf.compat.v1.model_variables',
+    'tf.moving_average_variables': 'tf.compat.v1.moving_average_variables',
+    'tf.nn.bidirectional_dynamic_rnn': 'tf.compat.v1.nn.bidirectional_dynamic_rnn',
+    'tf.nn.conv3d_backprop_filter_v2': 'tf.nn.conv3d_backprop_filter',
+    'tf.nn.ctc_beam_search_decoder_v2': 'tf.nn.ctc_beam_search_decoder',
+    'tf.nn.ctc_loss_v2': 'tf.nn.ctc_loss',
+    'tf.nn.depthwise_conv2d_native': 'tf.compat.v1.nn.depthwise_conv2d_native',
+    'tf.nn.depthwise_conv2d_native_backprop_filter': 'tf.nn.depthwise_conv2d_backprop_filter',
+    'tf.nn.depthwise_conv2d_native_backprop_input': 'tf.nn.depthwise_conv2d_backprop_input',
+    'tf.nn.dynamic_rnn': 'tf.compat.v1.nn.dynamic_rnn',
+    'tf.nn.log_uniform_candidate_sampler': 'tf.random.log_uniform_candidate_sampler',
+    'tf.nn.quantized_avg_pool': 'tf.compat.v1.nn.quantized_avg_pool',
+    'tf.nn.quantized_conv2d': 'tf.compat.v1.nn.quantized_conv2d',
+    'tf.nn.quantized_max_pool': 'tf.compat.v1.nn.quantized_max_pool',
+    'tf.nn.quantized_relu_x': 'tf.compat.v1.nn.quantized_relu_x',
+    'tf.nn.raw_rnn': 'tf.compat.v1.nn.raw_rnn',
+    'tf.nn.relu_layer': 'tf.compat.v1.nn.relu_layer',
+    'tf.nn.rnn_cell.BasicLSTMCell': 'tf.compat.v1.nn.rnn_cell.BasicLSTMCell',
+    'tf.nn.rnn_cell.BasicRNNCell': 'tf.compat.v1.nn.rnn_cell.BasicRNNCell',
+    'tf.nn.rnn_cell.GRUCell': 'tf.compat.v1.nn.rnn_cell.GRUCell',
+    'tf.nn.rnn_cell.LSTMCell': 'tf.compat.v1.nn.rnn_cell.LSTMCell',
+    'tf.nn.rnn_cell.MultiRNNCell': 'tf.compat.v1.nn.rnn_cell.MultiRNNCell',
+    'tf.nn.static_bidirectional_rnn': 'tf.compat.v1.nn.static_bidirectional_rnn',
+    'tf.nn.static_rnn': 'tf.compat.v1.nn.static_rnn',
+    'tf.nn.uniform_candidate_sampler': 'tf.random.uniform_candidate_sampler',
+    'tf.nn.xw_plus_b': 'tf.compat.v1.nn.xw_plus_b',
+    'tf.op_scope': 'tf.compat.v1.op_scope',
+    'tf.orthogonal_initializer': 'tf.keras.initializers.Orthogonal',
+    'tf.parse_single_sequence_example': 'tf.io.parse_single_sequence_example',
     'tf.parse_tensor': 'tf.io.parse_tensor',
+    'tf.placeholder': 'tf.compat.v1.placeholder',
+    'tf.placeholder_with_default': 'tf.compat.v1.placeholder_with_default',
     'tf.polygamma': 'tf.math.polygamma',
+    'tf.profiler.AdviceProto': 'tf.compat.v1.profiler.AdviceProto',
+    'tf.profiler.GraphNodeProto': 'tf.compat.v1.profiler.GraphNodeProto',
+    'tf.profiler.MultiGraphNodeProto': 'tf.compat.v1.profiler.MultiGraphNodeProto',
+    'tf.profiler.OpLogProto': 'tf.compat.v1.profiler.OpLogProto',
+    'tf.profiler.ProfileOptionBuilder': 'tf.compat.v1.profiler.ProfileOptionBuilder',
+    'tf.profiler.Profiler': 'tf.compat.v1.profiler.Profiler',
+    'tf.profiler.advise': 'tf.compat.v1.profiler.advise',
+    'tf.profiler.profile': 'tf.compat.v1.profiler.profile',
+    'tf.profiler.write_op_log': 'tf.compat.v1.profiler.write_op_log',
+    'tf.py_func': 'tf.compat.v1.py_func',
+    'tf.python_io.TFRecordCompressionType': 'tf.io.TFRecordCompressionType',
+    'tf.python_io.TFRecordOptions': 'tf.io.TFRecordOptions',
+    'tf.python_io.TFRecordWriter': 'tf.io.TFRecordWriter',
+    'tf.python_io.tf_record_iterator': 'tf.compat.v1.python_io.tf_record_iterator',
     'tf.qr': 'tf.linalg.qr',
+    'tf.quantize': 'tf.quantization.quantize',
     'tf.quantized_concat': 'tf.quantization.quantized_concat',
+    'tf.random.get_seed': 'tf.compat.v1.random.get_seed',
+    'tf.random.set_random_seed': 'tf.compat.v1.random.set_random_seed',
+    'tf.random_crop': 'tf.image.random_crop',
+    'tf.random_gamma': 'tf.random.gamma',
+    'tf.random_normal': 'tf.random.normal',
+    'tf.random_poisson': 'tf.compat.v1.random_poisson',
+    'tf.random_shuffle': 'tf.random.shuffle',
+    'tf.random_uniform': 'tf.random.uniform',
     'tf.read_file': 'tf.io.read_file',
+    'tf.real': 'tf.math.real',
     'tf.reciprocal': 'tf.math.reciprocal',
     'tf.regex_replace': 'tf.strings.regex_replace',
-    'tf.reshape': 'tf.manip.reshape',
-    'tf.reverse': 'tf.manip.reverse',
-    'tf.reverse_v2': 'tf.manip.reverse',
+    'tf.report_uninitialized_variables': 'tf.compat.v1.report_uninitialized_variables',
+    'tf.reset_default_graph': 'tf.compat.v1.reset_default_graph',
+    'tf.resource_loader.get_data_files_path': 'tf.compat.v1.resource_loader.get_data_files_path',
+    'tf.resource_loader.get_path_to_datafile': 'tf.compat.v1.resource_loader.get_path_to_datafile',
+    'tf.resource_loader.get_root_dir_with_all_resources': 'tf.compat.v1.resource_loader.get_root_dir_with_all_resources',
+    'tf.resource_loader.load_resource': 'tf.compat.v1.resource_loader.load_resource',
+    'tf.resource_loader.readahead_file_path': 'tf.compat.v1.resource_loader.readahead_file_path',
+    'tf.reverse_v2': 'tf.reverse',
     'tf.rint': 'tf.math.rint',
     'tf.rsqrt': 'tf.math.rsqrt',
-    'tf.scatter_nd': 'tf.manip.scatter_nd',
+    'tf.saved_model.Builder': 'tf.compat.v1.saved_model.Builder',
+    'tf.saved_model.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.LEGACY_INIT_OP_KEY',
+    'tf.saved_model.MAIN_OP_KEY': 'tf.compat.v1.saved_model.MAIN_OP_KEY',
+    'tf.saved_model.TRAINING': 'tf.saved_model.TRANING',
+    'tf.saved_model.build_tensor_info': 'tf.compat.v1.saved_model.build_tensor_info',
+    'tf.saved_model.builder.SavedModelBuilder': 'tf.compat.v1.saved_model.builder.SavedModelBuilder',
+    'tf.saved_model.constants.ASSETS_DIRECTORY': 'tf.saved_model.ASSETS_DIRECTORY',
+    'tf.saved_model.constants.ASSETS_KEY': 'tf.saved_model.ASSETS_KEY',
+    'tf.saved_model.constants.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.constants.LEGACY_INIT_OP_KEY',
+    'tf.saved_model.constants.MAIN_OP_KEY': 'tf.compat.v1.saved_model.constants.MAIN_OP_KEY',
+    'tf.saved_model.constants.SAVED_MODEL_FILENAME_PB': 'tf.saved_model.SAVED_MODEL_FILENAME_PB',
+    'tf.saved_model.constants.SAVED_MODEL_FILENAME_PBTXT': 'tf.saved_model.SAVED_MODEL_FILENAME_PBTXT',
+    'tf.saved_model.constants.SAVED_MODEL_SCHEMA_VERSION': 'tf.saved_model.SAVED_MODEL_SCHEMA_VERSION',
+    'tf.saved_model.constants.VARIABLES_DIRECTORY': 'tf.saved_model.VARIABLES_DIRECTORY',
+    'tf.saved_model.constants.VARIABLES_FILENAME': 'tf.saved_model.VARIABLES_FILENAME',
+    'tf.saved_model.experimental.save': 'tf.saved_model.save',
+    'tf.saved_model.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.get_tensor_from_tensor_info',
+    'tf.saved_model.load': 'tf.compat.v1.saved_model.load',
+    'tf.saved_model.loader.load': 'tf.compat.v1.saved_model.loader.load',
+    'tf.saved_model.loader.maybe_saved_model_directory': 'tf.compat.v1.saved_model.loader.maybe_saved_model_directory',
+    'tf.saved_model.main_op.main_op': 'tf.compat.v1.saved_model.main_op.main_op',
+    'tf.saved_model.main_op.main_op_with_restore': 'tf.compat.v1.saved_model.main_op.main_op_with_restore',
+    'tf.saved_model.main_op_with_restore': 'tf.compat.v1.saved_model.main_op_with_restore',
+    'tf.saved_model.maybe_saved_model_directory': 'tf.compat.v1.saved_model.maybe_saved_model_directory',
+    'tf.saved_model.signature_constants.CLASSIFY_INPUTS': 'tf.saved_model.CLASSIFY_INPUTS',
+    'tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME': 'tf.saved_model.CLASSIFY_METHOD_NAME',
+    'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES': 'tf.saved_model.CLASSIFY_OUTPUT_CLASSES',
+    'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES': 'tf.saved_model.CLASSIFY_OUTPUT_SCORES',
+    'tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY': 'tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY',
+    'tf.saved_model.signature_constants.PREDICT_INPUTS': 'tf.saved_model.PREDICT_INPUTS',
+    'tf.saved_model.signature_constants.PREDICT_METHOD_NAME': 'tf.saved_model.PREDICT_METHOD_NAME',
+    'tf.saved_model.signature_constants.PREDICT_OUTPUTS': 'tf.saved_model.PREDICT_OUTPUTS',
+    'tf.saved_model.signature_constants.REGRESS_INPUTS': 'tf.saved_model.REGRESS_INPUTS',
+    'tf.saved_model.signature_constants.REGRESS_METHOD_NAME': 'tf.saved_model.REGRESS_METHOD_NAME',
+    'tf.saved_model.signature_constants.REGRESS_OUTPUTS': 'tf.saved_model.REGRESS_OUTPUTS',
+    'tf.saved_model.signature_def_utils.build_signature_def': 'tf.saved_model.build_signature_def',
+    'tf.saved_model.signature_def_utils.classification_signature_def': 'tf.saved_model.classification_signature_def',
+    'tf.saved_model.signature_def_utils.is_valid_signature': 'tf.saved_model.is_valid_signature',
+    'tf.saved_model.signature_def_utils.predict_signature_def': 'tf.saved_model.predict_signature_def',
+    'tf.saved_model.signature_def_utils.regression_signature_def': 'tf.saved_model.regression_signature_def',
+    'tf.saved_model.simple_save': 'tf.compat.v1.saved_model.simple_save',
+    'tf.saved_model.tag_constants.GPU': 'tf.saved_model.GPU',
+    'tf.saved_model.tag_constants.SERVING': 'tf.saved_model.SERVING',
+    'tf.saved_model.tag_constants.TPU': 'tf.saved_model.TPU',
+    'tf.saved_model.tag_constants.TRAINING': 'tf.saved_model.TRANING',
+    'tf.saved_model.utils.build_tensor_info': 'tf.compat.v1.saved_model.utils.build_tensor_info',
+    'tf.saved_model.utils.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.utils.get_tensor_from_tensor_info',
+    'tf.scatter_add': 'tf.compat.v1.scatter_add',
+    'tf.scatter_nd_add': 'tf.compat.v1.scatter_nd_add',
+    'tf.scatter_nd_sub': 'tf.compat.v1.scatter_nd_sub',
+    'tf.scatter_nd_update': 'tf.compat.v1.scatter_nd_update',
+    'tf.scatter_sub': 'tf.compat.v1.scatter_sub',
+    'tf.scatter_update': 'tf.compat.v1.scatter_update',
     'tf.segment_max': 'tf.math.segment_max',
     'tf.segment_mean': 'tf.math.segment_mean',
     'tf.segment_min': 'tf.math.segment_min',
     'tf.segment_prod': 'tf.math.segment_prod',
     'tf.segment_sum': 'tf.math.segment_sum',
-    'tf.sin': 'tf.math.sin',
-    'tf.sinh': 'tf.math.sinh',
-    'tf.space_to_batch_nd': 'tf.manip.space_to_batch_nd',
+    'tf.self_adjoint_eig': 'tf.linalg.eigh',
+    'tf.self_adjoint_eigvals': 'tf.linalg.eigvalsh',
+    'tf.serialize_many_sparse': 'tf.compat.v1.serialize_many_sparse',
+    'tf.serialize_sparse': 'tf.compat.v1.serialize_sparse',
+    'tf.serialize_tensor': 'tf.io.serialize_tensor',
+    'tf.set_random_seed': 'tf.compat.v1.set_random_seed',
+    'tf.setdiff1d': 'tf.compat.v1.setdiff1d',
+    'tf.sets.set_difference': 'tf.sets.difference',
+    'tf.sets.set_intersection': 'tf.sets.intersection',
+    'tf.sets.set_size': 'tf.sets.size',
+    'tf.sets.set_union': 'tf.sets.union',
+    'tf.space_to_depth': 'tf.compat.v1.space_to_depth',
+    'tf.sparse.matmul': 'tf.sparse.sparse_dense_matmul',
+    'tf.sparse.merge': 'tf.compat.v1.sparse.merge',
+    'tf.sparse.placeholder': 'tf.compat.v1.sparse.placeholder',
+    'tf.sparse.reduce_max_sparse': 'tf.compat.v1.sparse.reduce_max_sparse',
+    'tf.sparse.reduce_sum_sparse': 'tf.compat.v1.sparse.reduce_sum_sparse',
+    'tf.sparse_fill_empty_rows': 'tf.sparse.fill_empty_rows',
+    'tf.sparse_mask': 'tf.sparse.mask',
+    'tf.sparse_maximum': 'tf.sparse.maximum',
+    'tf.sparse_merge': 'tf.compat.v1.sparse_merge',
+    'tf.sparse_minimum': 'tf.sparse.minimum',
+    'tf.sparse_placeholder': 'tf.compat.v1.sparse_placeholder',
+    'tf.sparse_reduce_max': 'tf.compat.v1.sparse_reduce_max',
+    'tf.sparse_reduce_max_sparse': 'tf.compat.v1.sparse_reduce_max_sparse',
+    'tf.sparse_reduce_sum': 'tf.compat.v1.sparse_reduce_sum',
+    'tf.sparse_reduce_sum_sparse': 'tf.compat.v1.sparse_reduce_sum_sparse',
+    'tf.sparse_reorder': 'tf.sparse.reorder',
+    'tf.sparse_reset_shape': 'tf.sparse.reset_shape',
+    'tf.sparse_reshape': 'tf.sparse.reshape',
+    'tf.sparse_retain': 'tf.sparse.retain',
+    'tf.sparse_segment_mean': 'tf.compat.v1.sparse_segment_mean',
+    'tf.sparse_segment_sqrt_n': 'tf.compat.v1.sparse_segment_sqrt_n',
+    'tf.sparse_segment_sum': 'tf.compat.v1.sparse_segment_sum',
+    'tf.sparse_slice': 'tf.sparse.slice',
+    'tf.sparse_softmax': 'tf.sparse.softmax',
+    'tf.sparse_tensor_dense_matmul': 'tf.sparse.sparse_dense_matmul',
+    'tf.sparse_tensor_to_dense': 'tf.sparse.to_dense',
+    'tf.sparse_to_dense': 'tf.compat.v1.sparse_to_dense',
+    'tf.sparse_to_indicator': 'tf.sparse.to_indicator',
+    'tf.sparse_transpose': 'tf.sparse.transpose',
+    'tf.spectral.dct': 'tf.signal.dct',
+    'tf.spectral.fft': 'tf.signal.fft',
+    'tf.spectral.fft2d': 'tf.signal.fft2d',
+    'tf.spectral.fft3d': 'tf.signal.fft3d',
+    'tf.spectral.idct': 'tf.signal.idct',
+    'tf.spectral.ifft': 'tf.signal.ifft',
+    'tf.spectral.ifft2d': 'tf.signal.ifft2d',
+    'tf.spectral.ifft3d': 'tf.signal.ifft3d',
+    'tf.spectral.irfft': 'tf.signal.irfft',
+    'tf.spectral.irfft2d': 'tf.signal.irfft2d',
+    'tf.spectral.irfft3d': 'tf.signal.irfft3d',
+    'tf.spectral.rfft': 'tf.signal.rfft',
+    'tf.spectral.rfft2d': 'tf.signal.rfft2d',
+    'tf.spectral.rfft3d': 'tf.signal.rfft3d',
     'tf.squared_difference': 'tf.math.squared_difference',
     'tf.string_join': 'tf.strings.join',
     'tf.string_strip': 'tf.strings.strip',
-    'tf.string_to_hash_bucket': 'tf.strings.to_hash_bucket',
     'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
     'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
-    'tf.string_to_number': 'tf.strings.to_number',
-    'tf.substr': 'tf.strings.substr',
-    'tf.tan': 'tf.math.tan',
-    'tf.tile': 'tf.manip.tile',
+    'tf.summary.SessionLog': 'tf.compat.v1.summary.SessionLog',
+    'tf.summary.audio': 'tf.compat.v1.summary.audio',
+    'tf.summary.get_summary_description': 'tf.compat.v1.summary.get_summary_description',
+    'tf.summary.histogram': 'tf.compat.v1.summary.histogram',
+    'tf.summary.image': 'tf.compat.v1.summary.image',
+    'tf.summary.merge': 'tf.compat.v1.summary.merge',
+    'tf.summary.merge_all': 'tf.compat.v1.summary.merge_all',
+    'tf.summary.scalar': 'tf.compat.v1.summary.scalar',
+    'tf.summary.tensor_summary': 'tf.compat.v1.summary.tensor_summary',
+    'tf.summary.text': 'tf.compat.v1.summary.text',
+    'tf.svd': 'tf.linalg.svd',
+    'tf.tables_initializer': 'tf.compat.v1.tables_initializer',
+    'tf.test.StubOutForTesting': 'tf.compat.v1.test.StubOutForTesting',
+    'tf.test.compute_gradient': 'tf.compat.v1.test.compute_gradient',
+    'tf.test.compute_gradient_error': 'tf.compat.v1.test.compute_gradient_error',
+    'tf.test.get_temp_dir': 'tf.compat.v1.test.get_temp_dir',
+    'tf.test.mock': 'tf.compat.v1.test.mock',
+    'tf.test.test_src_dir_path': 'tf.compat.v1.test.test_src_dir_path',
+    'tf.to_bfloat16': 'tf.compat.v1.to_bfloat16',
+    'tf.to_complex128': 'tf.compat.v1.to_complex128',
+    'tf.to_complex64': 'tf.compat.v1.to_complex64',
+    'tf.to_double': 'tf.compat.v1.to_double',
+    'tf.to_float': 'tf.compat.v1.to_float',
+    'tf.to_int32': 'tf.compat.v1.to_int32',
+    'tf.to_int64': 'tf.compat.v1.to_int64',
+    'tf.trace': 'tf.linalg.trace',
+    'tf.train.AdadeltaOptimizer': 'tf.compat.v1.train.AdadeltaOptimizer',
+    'tf.train.AdagradDAOptimizer': 'tf.compat.v1.train.AdagradDAOptimizer',
+    'tf.train.AdagradOptimizer': 'tf.compat.v1.train.AdagradOptimizer',
+    'tf.train.AdamOptimizer': 'tf.compat.v1.train.AdamOptimizer',
+    'tf.train.CheckpointSaverHook': 'tf.estimator.CheckpointSaverHook',
+    'tf.train.CheckpointSaverListener': 'tf.estimator.CheckpointSaverListener',
+    'tf.train.ChiefSessionCreator': 'tf.compat.v1.train.ChiefSessionCreator',
+    'tf.train.FeedFnHook': 'tf.estimator.FeedFnHook',
+    'tf.train.FinalOpsHook': 'tf.estimator.FinalOpsHook',
+    'tf.train.FtrlOptimizer': 'tf.compat.v1.train.FtrlOptimizer',
+    'tf.train.GlobalStepWaiterHook': 'tf.estimator.GlobalStepWaiterHook',
+    'tf.train.GradientDescentOptimizer': 'tf.compat.v1.train.GradientDescentOptimizer',
+    'tf.train.LoggingTensorHook': 'tf.estimator.LoggingTensorHook',
+    'tf.train.LooperThread': 'tf.compat.v1.train.LooperThread',
+    'tf.train.MomentumOptimizer': 'tf.compat.v1.train.MomentumOptimizer',
+    'tf.train.MonitoredSession': 'tf.compat.v1.train.MonitoredSession',
+    'tf.train.MonitoredTrainingSession': 'tf.compat.v1.train.MonitoredTrainingSession',
+    'tf.train.NanLossDuringTrainingError': 'tf.estimator.NanLossDuringTrainingError',
+    'tf.train.NanTensorHook': 'tf.estimator.NanTensorHook',
+    'tf.train.NewCheckpointReader': 'tf.compat.v1.train.NewCheckpointReader',
+    'tf.train.Optimizer': 'tf.compat.v1.train.Optimizer',
+    'tf.train.ProfilerHook': 'tf.estimator.ProfilerHook',
+    'tf.train.ProximalAdagradOptimizer': 'tf.compat.v1.train.ProximalAdagradOptimizer',
+    'tf.train.QueueRunner': 'tf.compat.v1.train.QueueRunner',
+    'tf.train.RMSPropOptimizer': 'tf.compat.v1.train.RMSPropOptimizer',
+    'tf.train.Saver': 'tf.compat.v1.train.Saver',
+    'tf.train.SaverDef': 'tf.compat.v1.train.SaverDef',
+    'tf.train.Scaffold': 'tf.compat.v1.train.Scaffold',
+    'tf.train.SecondOrStepTimer': 'tf.estimator.SecondOrStepTimer',
+    'tf.train.SessionCreator': 'tf.compat.v1.train.SessionCreator',
+    'tf.train.SessionManager': 'tf.compat.v1.train.SessionManager',
+    'tf.train.SessionRunArgs': 'tf.compat.v1.train.SessionRunArgs',
+    'tf.train.SessionRunContext': 'tf.compat.v1.train.SessionRunContext',
+    'tf.train.SessionRunValues': 'tf.compat.v1.train.SessionRunValues',
+    'tf.train.SingularMonitoredSession': 'tf.compat.v1.train.SingularMonitoredSession',
+    'tf.train.StepCounterHook': 'tf.estimator.StepCounterHook',
+    'tf.train.StopAtStepHook': 'tf.estimator.StopAtStepHook',
+    'tf.train.SummarySaverHook': 'tf.estimator.SummarySaverHook',
+    'tf.train.Supervisor': 'tf.compat.v1.train.Supervisor',
+    'tf.train.SyncReplicasOptimizer': 'tf.compat.v1.train.SyncReplicasOptimizer',
+    'tf.train.VocabInfo': 'tf.estimator.VocabInfo',
+    'tf.train.WorkerSessionCreator': 'tf.compat.v1.train.WorkerSessionCreator',
+    'tf.train.add_queue_runner': 'tf.compat.v1.train.add_queue_runner',
+    'tf.train.assert_global_step': 'tf.compat.v1.train.assert_global_step',
+    'tf.train.basic_train_loop': 'tf.compat.v1.train.basic_train_loop',
+    'tf.train.batch': 'tf.compat.v1.train.batch',
+    'tf.train.batch_join': 'tf.compat.v1.train.batch_join',
+    'tf.train.checkpoint_exists': 'tf.compat.v1.train.checkpoint_exists',
+    'tf.train.create_global_step': 'tf.compat.v1.train.create_global_step',
+    'tf.train.do_quantize_training_on_graphdef': 'tf.compat.v1.train.do_quantize_training_on_graphdef',
+    'tf.train.export_meta_graph': 'tf.compat.v1.train.export_meta_graph',
+    'tf.train.generate_checkpoint_state_proto': 'tf.compat.v1.train.generate_checkpoint_state_proto',
+    'tf.train.get_checkpoint_mtimes': 'tf.compat.v1.train.get_checkpoint_mtimes',
+    'tf.train.get_global_step': 'tf.compat.v1.train.get_global_step',
+    'tf.train.get_or_create_global_step': 'tf.compat.v1.train.get_or_create_global_step',
+    'tf.train.global_step': 'tf.compat.v1.train.global_step',
+    'tf.train.import_meta_graph': 'tf.compat.v1.train.import_meta_graph',
+    'tf.train.init_from_checkpoint': 'tf.compat.v1.train.init_from_checkpoint',
+    'tf.train.input_producer': 'tf.compat.v1.train.input_producer',
+    'tf.train.limit_epochs': 'tf.compat.v1.train.limit_epochs',
+    'tf.train.match_filenames_once': 'tf.io.match_filenames_once',
+    'tf.train.maybe_batch': 'tf.compat.v1.train.maybe_batch',
+    'tf.train.maybe_batch_join': 'tf.compat.v1.train.maybe_batch_join',
+    'tf.train.maybe_shuffle_batch': 'tf.compat.v1.train.maybe_shuffle_batch',
+    'tf.train.maybe_shuffle_batch_join': 'tf.compat.v1.train.maybe_shuffle_batch_join',
+    'tf.train.piecewise_constant': 'tf.compat.v1.train.piecewise_constant',
+    'tf.train.queue_runner.QueueRunner': 'tf.compat.v1.train.queue_runner.QueueRunner',
+    'tf.train.queue_runner.add_queue_runner': 'tf.compat.v1.train.queue_runner.add_queue_runner',
+    'tf.train.queue_runner.start_queue_runners': 'tf.compat.v1.train.queue_runner.start_queue_runners',
+    'tf.train.range_input_producer': 'tf.compat.v1.train.range_input_producer',
+    'tf.train.remove_checkpoint': 'tf.compat.v1.train.remove_checkpoint',
+    'tf.train.replica_device_setter': 'tf.compat.v1.train.replica_device_setter',
+    'tf.train.shuffle_batch': 'tf.compat.v1.train.shuffle_batch',
+    'tf.train.shuffle_batch_join': 'tf.compat.v1.train.shuffle_batch_join',
+    'tf.train.slice_input_producer': 'tf.compat.v1.train.slice_input_producer',
+    'tf.train.start_queue_runners': 'tf.compat.v1.train.start_queue_runners',
+    'tf.train.string_input_producer': 'tf.compat.v1.train.string_input_producer',
+    'tf.train.update_checkpoint_state': 'tf.compat.v1.train.update_checkpoint_state',
+    'tf.train.warm_start': 'tf.compat.v1.train.warm_start',
+    'tf.train.write_graph': 'tf.io.write_graph',
+    'tf.trainable_variables': 'tf.compat.v1.trainable_variables',
+    'tf.truncated_normal': 'tf.random.truncated_normal',
+    'tf.uniform_unit_scaling_initializer': 'tf.initializers.uniform_unit_scaling',
     'tf.unsorted_segment_max': 'tf.math.unsorted_segment_max',
+    'tf.unsorted_segment_mean': 'tf.math.unsorted_segment_mean',
     'tf.unsorted_segment_min': 'tf.math.unsorted_segment_min',
     'tf.unsorted_segment_prod': 'tf.math.unsorted_segment_prod',
+    'tf.unsorted_segment_sqrt_n': 'tf.math.unsorted_segment_sqrt_n',
     'tf.unsorted_segment_sum': 'tf.math.unsorted_segment_sum',
+    'tf.variable_axis_size_partitioner': 'tf.compat.v1.variable_axis_size_partitioner',
+    'tf.variable_op_scope': 'tf.compat.v1.variable_op_scope',
+    'tf.variable_scope': 'tf.compat.v1.variable_scope',
+    'tf.variables_initializer': 'tf.compat.v1.variables_initializer',
+    'tf.variance_scaling_initializer': 'tf.keras.initializers.VarianceScaling',
+    'tf.verify_tensor_all_finite': 'tf.compat.v1.verify_tensor_all_finite',
+    'tf.wrap_function': 'tf.compat.v1.wrap_function',
     'tf.write_file': 'tf.io.write_file',
     'tf.zeta': 'tf.math.zeta'
 }
diff --git a/tensorflow/tools/compatibility/reorders_v2.py b/tensorflow/tools/compatibility/reorders_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..44494ac148cb878d500ef504eae8a6c388cc89df
--- /dev/null
+++ b/tensorflow/tools/compatibility/reorders_v2.py
@@ -0,0 +1,115 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+This file should be updated whenever a function is added to
+self.reordered_function_names in tf_upgrade_v2.py.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+reorders = {
+    'tf.argmax': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.argmin': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.batch_to_space': ['input', 'crops', 'block_size', 'name'],
+    'tf.boolean_mask': ['tensor', 'mask', 'name', 'axis'],
+    'tf.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
+    'tf.convert_to_tensor': ['value', 'dtype', 'name', 'preferred_dtype'],
+    'tf.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
+    'tf.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
+    'tf.feature_column.categorical_column_with_vocabulary_file': ['key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'default_value', 'dtype'],
+    'tf.io.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
+    'tf.io.parse_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.io.parse_single_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.io.serialize_many_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.io.serialize_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.linalg.norm': ['tensor', 'ord', 'axis', 'keepdims', 'name', 'keep_dims'],
+    'tf.math.argmax': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.math.argmin': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.math.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
+    'tf.math.in_top_k': ['predictions', 'targets', 'k', 'name'],
+    'tf.math.reduce_all': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_any': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_logsumexp': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_max': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_mean': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_min': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_prod': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_sum': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.multinomial': ['logits', 'num_samples', 'seed', 'name', 'output_dtype'],
+    'tf.nn.convolution': ['input', 'filter', 'padding', 'strides', 'dilation_rate', 'name', 'data_format'],
+    'tf.nn.crelu': ['features', 'name', 'axis'],
+    'tf.nn.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
+    'tf.nn.depthwise_conv2d': ['input', 'filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.embedding_lookup': ['params', 'ids', 'partition_strategy', 'name', 'validate_indices', 'max_norm'],
+    'tf.nn.embedding_lookup_sparse': ['params', 'sp_ids', 'sp_weights', 'partition_strategy', 'name', 'combiner', 'max_norm'],
+    'tf.nn.in_top_k': ['predictions', 'targets', 'k', 'name'],
+    'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims'],
+    'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format'],
+    'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.nn.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
+    'tf.nn.weighted_moments': ['x', 'axes', 'frequency_weights', 'name', 'keep_dims'],
+    'tf.norm': ['tensor', 'ord', 'axis', 'keepdims', 'name', 'keep_dims'],
+    'tf.pad': ['tensor', 'paddings', 'mode', 'name', 'constant_values'],
+    'tf.parse_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.parse_single_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.quantize_v2': ['input', 'min_range', 'max_range', 'T', 'mode', 'name', 'round_mode'],
+    'tf.random.multinomial': ['logits', 'num_samples', 'seed', 'name', 'output_dtype'],
+    'tf.random.poisson': ['lam', 'shape', 'dtype', 'seed', 'name'],
+    'tf.random_poisson': ['lam', 'shape', 'dtype', 'seed', 'name'],
+    'tf.reduce_all': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_any': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.reduce_logsumexp': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_max': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_mean': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_min': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_prod': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_sum': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reverse_sequence': ['input', 'seq_lengths', 'seq_axis', 'batch_axis', 'name', 'seq_dim', 'batch_dim'],
+    'tf.serialize_many_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.serialize_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.shape': ['input', 'name', 'out_type'],
+    'tf.size': ['input', 'name', 'out_type'],
+    'tf.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
+    'tf.sparse.add': ['a', 'b', 'threshold', 'thresh'],
+    'tf.sparse.concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse.reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
+    'tf.sparse.segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse.segment_sqrt_n': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse.segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse.split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
+    'tf.sparse_add': ['a', 'b', 'threshold', 'thresh'],
+    'tf.sparse_concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse_matmul': ['a', 'b', 'transpose_a', 'transpose_b', 'a_is_sparse', 'b_is_sparse', 'name'],
+    'tf.sparse_reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
+    'tf.sparse_segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse_segment_sqrt_n': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse_segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse_split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
+    'tf.strings.length': ['input', 'name', 'unit'],
+    'tf.strings.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.strings.substr': ['input', 'pos', 'len', 'name', 'unit'],
+    'tf.transpose': ['a', 'perm', 'name', 'conjugate'],
+    'tf.tuple': ['tensors', 'name', 'control_inputs'],
+    'tf.while_loop': ['cond', 'body', 'loop_vars', 'shape_invariants', 'parallel_iterations', 'back_prop', 'swap_memory', 'name', 'maximum_iterations', 'return_same_structure']
+}
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v0_11.py b/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
index 68ba7a2630cec9cf23e9fbe3d1e9822c31ae3c0c..917236da4b4b75a1a1ca65e11d49d722cc178571 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
+++ b/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
@@ -34,6 +34,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   a unit test if the converter is successful.
   """
 
+  @test_util.run_v1_only("b/120545219")
   def testArgRenames(self):
     with self.cached_session():
 
@@ -97,6 +98,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.expand_dims([[1, 2], [3, 4]], axis=1).eval(),
           [[[1, 2]], [[3, 4]]])
 
+  @test_util.run_v1_only("b/120545219")
   def testArgMinMax(self):
     with self.cached_session():
       self.assertAllEqual(
@@ -112,6 +114,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.argmax([[1, 2, 3], [4, 1, 0]], dimension=0).eval(),
           [1, 0, 0])
 
+  @test_util.run_v1_only("b/120545219")
   def testExpandAndSqueeze(self):
     with self.cached_session():
 
@@ -139,6 +142,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                   [[1, 2, 3]], dim=1), squeeze_dims=[1]).eval(),
           a)
 
+  @test_util.run_v1_only("b/120545219")
   def testArithmeticRenames(self):
     with self.cached_session() as s:
       stuff = tf.split(1, 2, [[1, 2, 3, 4], [4, 5, 6, 7]])
@@ -163,6 +167,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       #     # TODO(aselle): (tf.batch_*)
       # ]
 
+  @test_util.run_v1_only("b/120545219")
   def testBatchAndSvd(self):
     with self.cached_session():
       mat = [[1., 2.], [2., 3.]]
@@ -174,6 +179,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.svd(mat, False, True).eval(),
           tf.svd(mat, compute_uv=False, full_matrices=True).eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testCrossEntropy(self):
     # TODO(aselle): Test sparse_softmax_...
     with self.cached_session():
@@ -190,6 +196,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.nn.sigmoid_cross_entropy_with_logits(
               labels=labels, logits=logits).eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testVariables(self):
     with self.cached_session() as s:
 
@@ -200,6 +207,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       _ = [v.name for v in tf.all_variables()]
       _ = [v.name for v in tf.local_variables()]
 
+  @test_util.run_v1_only("b/120545219")
   def testSummaries(self):
     with self.cached_session() as s:
       var = tf.Variable([1, 2, 3], dtype=tf.float32)
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_10.py b/tensorflow/tools/compatibility/testdata/test_file_v1_10.py
deleted file mode 100644
index e5ca8d3e2e24161310fe9878b349dfd524d31efc..0000000000000000000000000000000000000000
--- a/tensorflow/tools/compatibility/testdata/test_file_v1_10.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for tf upgrader."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import tensorflow as tf
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import test as test_lib
-
-
-class TestUpgrade(test_util.TensorFlowTestCase):
-  """Test various APIs that have been changed in 2.0."""
-
-  def testRenames(self):
-    with self.cached_session():
-      self.assertAllClose(1.04719755, tf.acos(0.5).eval())
-      self.assertAllClose(0.5, tf.rsqrt(4.0).eval())
-
-if __name__ == "__main__":
-  test_lib.main()
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ce4dd49adc940dbc56e19915a188cdb6b8de1d1
--- /dev/null
+++ b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf upgrader."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+
+
+class TestUpgrade(test_util.TensorFlowTestCase):
+  """Test various APIs that have been changed in 2.0."""
+
+  def setUp(self):
+    tf.enable_eager_execution()
+
+  @test_util.run_v1_only("b/120545219")
+  def testRenames(self):
+    with self.cached_session():
+      self.assertAllClose(1.04719755, tf.acos(0.5))
+      self.assertAllClose(0.5, tf.rsqrt(4.0))
+
+  @test_util.run_v1_only("b/120545219")
+  def testSerializeSparseTensor(self):
+    sp_input = tf.SparseTensor(
+        indices=tf.constant([[1]], dtype=tf.int64),
+        values=tf.constant([2], dtype=tf.int64),
+        dense_shape=[2])
+
+    with self.cached_session():
+      serialized_sp = tf.serialize_sparse(sp_input, 'serialize_name', tf.string)
+      self.assertEqual((3,), serialized_sp.shape)
+      self.assertTrue(serialized_sp[0].numpy())  # check non-empty
+
+  @test_util.run_v1_only("b/120545219")
+  def testSerializeManySparse(self):
+    sp_input = tf.SparseTensor(
+        indices=tf.constant([[0, 1]], dtype=tf.int64),
+        values=tf.constant([2], dtype=tf.int64),
+        dense_shape=[1, 2])
+
+    with self.cached_session():
+      serialized_sp = tf.serialize_many_sparse(
+          sp_input, 'serialize_name', tf.string)
+      self.assertEqual((1, 3), serialized_sp.shape)
+
+  @test_util.run_v1_only("b/120545219")
+  def testArgMaxMin(self):
+    self.assertAllClose(
+        [1],
+        tf.argmax([[1, 3, 2]], name='abc', dimension=1))
+    self.assertAllClose(
+        [0, 0, 0],
+        tf.argmax([[1, 3, 2]], dimension=0))
+    self.assertAllClose(
+        [0],
+        tf.argmin([[1, 3, 2]], name='abc', dimension=1))
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 96705b1a4c27e72ba1d50f16dad10c35705b1782..287d1a5483c32379da1dc651aba62a86a3f6d0f9 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -102,7 +102,7 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     }
 
     # Mapping from function to the new name of the function
-    self.function_renames = {
+    self.symbol_renames = {
         "tf.inv": "tf.reciprocal",
         "tf.contrib.deprecated.scalar_summary": "tf.summary.scalar",
         "tf.contrib.deprecated.histogram_summary": "tf.summary.histogram",
@@ -178,6 +178,9 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     # Specially handled functions.
     self.function_handle = {"tf.reverse": self._reverse_handler}
 
+    # Warnings that should be printed if corresponding functions are used.
+    self.function_warnings = {}
+
   @staticmethod
   def _reverse_handler(file_edit_recorder, node):
     # TODO(aselle): Could check for a literal list of bools and try to convert
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index 53c546b10c01b0629bb9f92b88cb0545b4105833..ea86da42f6bbb8170c56d08e02ab38cf72acf3f7 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -18,11 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
-import functools
-
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import renames_v2
+from tensorflow.tools.compatibility import reorders_v2
 
 
 class TFAPIChangeSpec(ast_edits.APIChangeSpec):
@@ -31,117 +29,897 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
   def __init__(self):
     # Maps from a function name to a dictionary that describes how to
     # map from an old argument keyword to the new argument keyword.
-    self.function_keyword_renames = {}
+    self.function_keyword_renames = {
+        "tf.argmin": {
+            "dimension": "axis",
+        },
+        "tf.argmax": {
+            "dimension": "axis",
+        },
+        "tf.arg_min": {
+            "dimension": "axis",
+        },
+        "tf.arg_max": {
+            "dimension": "axis",
+        },
+        "tf.math.argmin": {
+            "dimension": "axis",
+        },
+        "tf.math.argmax": {
+            "dimension": "axis",
+        },
+        "tf.image.crop_and_resize": {
+            "box_ind": "box_indices",
+        },
+        "tf.image.extract_image_patches": {
+            "ksizes": "sizes",
+        },
+        "tf.extract_image_patches": {
+            "ksizes": "sizes",
+        },
+        "tf.expand_dims": {
+            "dim": "axis",
+        },
+        "tf.batch_to_space": {
+            "block_size": "block_shape",
+        },
+        "tf.space_to_batch": {
+            "block_size": "block_shape",
+        },
+        "tf.nn.space_to_batch": {
+            "block_size": "block_shape",
+        },
+        "tf.constant": {
+            "verify_shape": "verify_shape_is_now_always_true",
+        },
+        "tf.convert_to_tensor": {
+            "preferred_dtype": "dtype_hint"
+        },
+        "tf.nn.softmax_cross_entropy_with_logits_v2": {
+            "dim": "axis"
+        },
+        "tf.linalg.l2_normalize": {
+            "dim": "axis",
+        },
+        "tf.linalg.norm": {
+            "keep_dims": "keepdims",
+        },
+        "tf.norm": {
+            "keep_dims": "keepdims",
+        },
+        "tf.load_file_system_library": {
+            "library_filename": "library_location",
+        },
+        "tf.math.count_nonzero": {
+            "input_tensor": "input",
+            "keep_dims": "keepdims",
+            "reduction_indices": "axis",
+        },
+        "tf.nn.erosion2d": {
+            "kernel": "filters",
+            "rates": "dilations",
+        },
+        "tf.math.l2_normalize": {
+            "dim": "axis",
+        },
+        "tf.math.log_softmax": {
+            "dim": "axis",
+        },
+        "tf.math.softmax": {
+            "dim": "axis"
+        },
+        "tf.nn.l2_normalize": {
+            "dim": "axis",
+        },
+        "tf.nn.log_softmax": {
+            "dim": "axis",
+        },
+        "tf.nn.moments": {
+            "keep_dims": "keepdims",
+        },
+        "tf.nn.pool": {
+            "dilation_rate": "dilations"
+        },
+        "tf.nn.separable_conv2d": {
+            "rate": "dilations"
+        },
+        "tf.nn.depthwise_conv2d": {
+            "rate": "dilations"
+        },
+        "tf.nn.softmax": {
+            "dim": "axis"
+        },
+        "tf.nn.sufficient_statistics": {
+            "keep_dims": "keepdims"
+        },
+        "tf.debugging.assert_all_finite": {
+            "t": "x",
+            "msg": "message",
+        },
+        "tf.sparse.add": {
+            "thresh": "threshold",
+        },
+        "tf.sparse_add": {
+            "thresh": "threshold",
+        },
+        "tf.sparse.concat": {
+            "concat_dim": "axis",
+            "expand_nonconcat_dim": "expand_nonconcat_dims",
+        },
+        "tf.sparse_concat": {
+            "concat_dim": "axis",
+            "expand_nonconcat_dim": "expand_nonconcat_dims",
+        },
+        "tf.sparse.split": {
+            "split_dim": "axis",
+        },
+        "tf.sparse_split": {
+            "split_dim": "axis",
+        },
+        "tf.sparse.reduce_max": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.sparse_reduce_max": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.sparse.reduce_sum": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.sparse_reduce_sum": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.nn.max_pool_with_argmax": {
+            "Targmax": "output_dtype",
+        },
+        "tf.multinomial": {
+            "output_dtype": "dtype",
+        },
+        "tf.random.multinomial": {
+            "output_dtype": "dtype",
+        },
+        "tf.reverse_sequence": {
+            "seq_dim": "seq_axis",
+            "batch_dim": "batch_axis",
+        },
+        "tf.nn.batch_norm_with_global_normalization": {
+            "t": "input",
+            "m": "mean",
+            "v": "variance",
+        },
+        "tf.nn.dilation2d": {
+            "filter": "filters",
+            "rates": "dilations",
+        },
+        "tf.nn.conv3d": {
+            "filter": "filters"
+        },
+        "tf.zeros_like": {
+            "tensor": "input",
+        },
+        "tf.ones_like": {
+            "tensor": "input",
+        },
+        "tf.nn.conv2d_transpose": {
+            "value": "input",
+            "filter": "filters",
+        },
+        "tf.nn.conv3d_transpose": {
+            "value": "input",
+            "filter": "filters",
+        },
+        "tf.nn.convolution": {
+            "filter": "filters",
+            "dilation_rate": "dilations",
+        },
+        "tf.gfile.Exists": {
+            "filename": "path",
+        },
+        "tf.gfile.Remove": {
+            "filename": "path",
+        },
+        "tf.gfile.Stat": {
+            "filename": "path",
+        },
+        "tf.gfile.Glob": {
+            "filename": "pattern",
+        },
+        "tf.gfile.MkDir": {
+            "dirname": "path",
+        },
+        "tf.gfile.MakeDirs": {
+            "dirname": "path",
+        },
+        "tf.gfile.DeleteRecursively": {
+            "dirname": "path",
+        },
+        "tf.gfile.IsDirectory": {
+            "dirname": "path",
+        },
+        "tf.gfile.ListDirectory": {
+            "dirname": "path",
+        },
+        "tf.gfile.Copy": {
+            "oldpath": "src",
+            "newpath": "dst",
+        },
+        "tf.gfile.Rename": {
+            "oldname": "src",
+            "newname": "dst",
+        },
+        "tf.gfile.Walk": {
+            "in_order": "topdown",
+        },
+        "tf.random.stateless_multinomial": {
+            "output_dtype": "dtype",
+        },
+        "tf.string_to_number": {
+            "string_tensor": "input",
+        },
+        "tf.strings.to_number": {
+            "string_tensor": "input",
+        },
+        "tf.string_to_hash_bucket": {
+            "string_tensor": "input",
+        },
+        "tf.strings.to_hash_bucket": {
+            "string_tensor": "input",
+        },
+        "tf.reduce_all": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_all": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_any": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_any": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_min": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_min": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_max": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_max": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_sum": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_sum": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_mean": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_mean": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_prod": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_prod": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_logsumexp": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_logsumexp": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_join": {
+            "keep_dims": "keepdims",
+            "reduction_indices": "axis"
+        },
+        "tf.strings.reduce_join": {
+            "keep_dims": "keepdims",
+            "reduction_indices": "axis"
+        },
+        "tf.squeeze": {
+            "squeeze_dims": "axis",
+        },
+        "tf.nn.weighted_moments": {
+            "keep_dims": "keepdims"
+        },
+    }
+
+    # pylint: disable=line-too-long
+    # Add additional renames not in renames_v2.py here.
+    # IMPORTANT: For the renames in here, if you also need to add to
+    # function_reorders or function_keyword_renames, use the OLD function name.
+    # These renames happen after the arguments have been processed.
+    self.manual_symbol_renames = {
+        "tf.batch_to_space_nd":
+            "tf.batch_to_space",
+        "tf.space_to_batch_nd":
+            "tf.space_to_batch",
+        "tf.nn.space_to_batch":
+            "tf.space_to_batch",
+        "tf.extract_image_patches":
+            "tf.image.extract_image_patches",
+        "tf.gfile.Copy":
+            "tf.io.gfile.copy",
+        "tf.gfile.DeleteRecursively":
+            "tf.io.gfile.rmtree",
+        "tf.gfile.Exists":
+            "tf.io.gfile.exists",
+        "tf.gfile.Glob":
+            "tf.io.gfile.glob",
+        "tf.gfile.IsDirectory":
+            "tf.io.gfile.isdir",
+        "tf.gfile.ListDirectory":
+            "tf.io.gfile.listdir",
+        "tf.gfile.MakeDirs":
+            "tf.io.gfile.makedirs",
+        "tf.gfile.MkDir":
+            "tf.io.gfile.mkdir",
+        "tf.gfile.Remove":
+            "tf.io.gfile.remove",
+        "tf.gfile.Rename":
+            "tf.io.gfile.rename",
+        "tf.gfile.Stat":
+            "tf.io.gfile.stat",
+        "tf.gfile.Walk":
+            "tf.io.gfile.walk",
+        "tf.contrib.data.AUTOTUNE":
+            "tf.data.experimental.AUTOTUNE",
+        "tf.contrib.data.Counter":
+            "tf.data.experimental.Counter",
+        "tf.contrib.data.CheckpointInputPipelineHook":
+            "tf.data.experimental.CheckpointInputPipelineHook",
+        "tf.contrib.data.CsvDataset":
+            "tf.data.experimental.CsvDataset",
+        "tf.contrib.data.Optional":
+            "tf.data.experimental.Optional",
+        "tf.contrib.data.RandomDataset":
+            "tf.data.experimental.RandomDataset",
+        "tf.contrib.data.Reducer":
+            "tf.data.experimental.Reducer",
+        "tf.contrib.data.SqlDataset":
+            "tf.data.experimental.SqlDataset",
+        "tf.contrib.data.StatsAggregator":
+            "tf.data.experimental.StatsAggregator",
+        "tf.contrib.data.TFRecordWriter":
+            "tf.data.experimental.TFRecordWriter",
+        "tf.contrib.data.assert_element_shape":
+            "tf.data.experimental.assert_element_shape",
+        "tf.contrib.data.batch_and_drop_remainder":
+            "tf.compat.v1.contrib.data.batch_and_drop_remainder",
+        "tf.contrib.data.bucket_by_sequence_length":
+            "tf.data.experimental.bucket_by_sequence_length",
+        "tf.contrib.data.choose_from_datasets":
+            "tf.data.experimental.choose_from_datasets",
+        "tf.contrib.data.copy_to_device":
+            "tf.data.experimental.copy_to_device",
+        "tf.contrib.data.dense_to_sparse_batch":
+            "tf.data.experimental.dense_to_sparse_batch",
+        "tf.contrib.data.enumerate_dataset":
+            "tf.data.experimental.enumerate_dataset",
+        "tf.contrib.data.get_next_as_optional":
+            "tf.data.experimental.get_next_as_optional",
+        "tf.contrib.data.get_single_element":
+            "tf.data.experimental.get_single_element",
+        "tf.contrib.data.group_by_reducer":
+            "tf.data.experimental.group_by_reducer",
+        "tf.contrib.data.group_by_window":
+            "tf.data.experimental.group_by_window",
+        "tf.contrib.data.ignore_errors":
+            "tf.data.experimental.ignore_errors",
+        "tf.contrib.data.latency_stats":
+            "tf.data.experimental.latency_stats",
+        "tf.contrib.data.make_batched_features_dataset":
+            "tf.data.experimental.make_batched_features_dataset",
+        "tf.contrib.data.make_csv_dataset":
+            "tf.data.experimental.make_csv_dataset",
+        "tf.contrib.data.make_saveable_from_iterator":
+            "tf.data.experimental.make_saveable_from_iterator",
+        "tf.contrib.data.map_and_batch":
+            "tf.data.experimental.map_and_batch",
+        "tf.contrib.data.padded_batch_and_drop_remainder":
+            "tf.compat.v1.contrib.data.padded_batch_and_drop_remainder",
+        "tf.contrib.data.parallel_interleave":
+            "tf.data.experimental.parallel_interleave",
+        "tf.contrib.data.parse_example_dataset":
+            "tf.data.experimental.parse_example_dataset",
+        "tf.contrib.data.prefetch_to_device":
+            "tf.data.experimental.prefetch_to_device",
+        "tf.contrib.data.read_batch_features":
+            "tf.compat.v1.contrib.data.read_batch_features",
+        "tf.contrib.data.reduce_dataset":
+            "tf.compat.v1.contrib.data.reduce_dataset",
+        "tf.contrib.data.rejection_resample":
+            "tf.data.experimental.rejection_resample",
+        "tf.contrib.data.sample_from_datasets":
+            "tf.data.experimental.sample_from_datasets",
+        "tf.contrib.data.scan":
+            "tf.data.experimental.scan",
+        "tf.contrib.data.set_stats_aggregator":
+            "tf.data.experimental.set_stats_aggregator",
+        "tf.contrib.data.shuffle_and_repeat":
+            "tf.data.experimental.shuffle_and_repeat",
+        "tf.contrib.data.sliding_window_batch":
+            "tf.compat.v1.contrib.data.sliding_window_batch",
+        "tf.contrib.data.sloppy_interleave":
+            "tf.compat.v1.contrib.data.sloppy_interleave",
+        "tf.contrib.data.unbatch":
+            "tf.data.experimental.unbatch",
+        "tf.contrib.data.unique":
+            "tf.data.experimental.unique",
+        "tf.contrib.framework.sort":
+            "tf.sort",
+        "tf.contrib.framework.argsort":
+            "tf.argsort",
+        "tf.manip.batch_to_space_nd":
+            "tf.batch_to_space",
+        "tf.quantize_v2":
+            "tf.quantization.quantize",
+        "tf.sparse_add":
+            "tf.sparse.add",
+        "tf.sparse_concat":
+            "tf.sparse.concat",
+        "tf.sparse_split":
+            "tf.sparse.split",
+        "tf.sparse_matmul":
+            "tf.linalg.matmul",
+        "tf.sparse_reduce_sum":
+            "tf.sparse.reduce_sum",
+        "tf.sparse_reduce_max":
+            "tf.sparse.reduce_max",
+        "tf.random.stateless_multinomial":
+            "tf.random.stateless_categorical",
+        "tf.string_to_hash_bucket":
+            "tf.strings.to_hash_bucket",
+        "tf.string_to_number":
+            "tf.strings.to_number",
+        "tf.multinomial":
+            "tf.random.categorical",
+        "tf.random.multinomial":
+            "tf.random.categorical",
+        "tf.reduce_join":
+            "tf.strings.reduce_join",
+        "tf.load_file_system_library":
+            "tf.load_library",
+        "tf.pywrap_tensorflow":
+            "tf.compat.v1.pywrap_tensorflow",
+        "tf.bincount":
+            "tf.math.bincount",
+        "tf.confusion_matrix":
+            "tf.math.confusion_matrix",
+        "tf.train.confusion_matrix":
+            "tf.math.confusion_matrix",
+        "tf.decode_csv":
+            "tf.io.decode_csv",
+        "tf.data.Iterator":
+            "tf.compat.v1.data.Iterator",
+        "tf.parse_example":
+            "tf.io.parse_example",
+        "tf.parse_single_example":
+            "tf.io.parse_single_example",
+        "tf.nn.fused_batch_norm":
+            "tf.compat.v1.nn.fused_batch_norm",
+        "tf.nn.softmax_cross_entropy_with_logits_v2":
+            "tf.nn.softmax_cross_entropy_with_logits",
+        "tf.losses.Reduction.MEAN":
+            "tf.compat.v1.losses.Reduction.MEAN",
+        "tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS":
+            "tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS",
+        "tf.losses.Reduction.SUM_OVER_NONZERO_WEIGHTS":
+            "tf.compat.v1.losses.Reduction.SUM_OVER_NONZERO_WEIGHTS",
+        "tf.lite.constants.FLOAT":
+            "tf.float32",
+        "tf.lite.constants.INT32":
+            "tf.int32",
+        "tf.lite.constants.INT64":
+            "tf.int64",
+        "tf.lite.constants.STRING":
+            "tf.string",
+        "tf.lite.constants.QUANTIZED_UINT8":
+            "tf.uint8",
+        "tf.arg_max":
+            "tf.argmax",
+        "tf.arg_min":
+            "tf.argmin",
+        # tf.nn.ctc_loss is still available in 2.0 but behavior
+        # changed significantly.
+        "tf.nn.ctc_loss":
+            "tf.compat.v1.nn.ctc_loss",
+    }
+    # pylint: enable=line-too-long
 
     # Mapping from function to the new name of the function
-    self.function_renames = renames_v2.renames
+    self.symbol_renames = renames_v2.renames
+    self.symbol_renames.update(self.manual_symbol_renames)
 
     # Variables that should be changed to functions.
     self.change_to_function = {}
 
+    # pylint: disable=line-too-long
+    # This list should just contain names of functions that had
+    # their arguments reordered. After adding a function name to the list
+    # run the following to update reorders_v2.py:
+    # bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+    # bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+    # pylint: enable=line-too-long
+    self.reordered_function_names = {
+        "tf.io.serialize_sparse",
+        "tf.io.serialize_many_sparse",
+        "tf.argmax",
+        "tf.argmin",
+        "tf.batch_to_space",
+        "tf.nn.space_to_batch",
+        "tf.boolean_mask",
+        "tf.convert_to_tensor",
+        "tf.nn.moments",
+        "tf.nn.convolution",
+        "tf.nn.crelu",
+        "tf.nn.weighted_moments",
+        "tf.nn.pool",
+        "tf.nn.separable_conv2d",
+        "tf.nn.depthwise_conv2d",
+        "tf.multinomial",
+        "tf.random.multinomial",
+        "tf.pad",
+        "tf.quantize_v2",
+        "tf.feature_column.categorical_column_with_vocabulary_file",
+        "tf.shape",
+        "tf.size",
+        "tf.random.poisson",
+        "tf.sparse.add",
+        "tf.sparse_add",
+        "tf.sparse.concat",
+        "tf.sparse_concat",
+        "tf.sparse.segment_mean",
+        "tf.sparse.segment_sqrt_n",
+        "tf.sparse.segment_sum",
+        "tf.sparse_matmul",
+        "tf.sparse.reduce_max",
+        "tf.sparse_reduce_max",
+        "tf.io.decode_csv",
+        "tf.strings.substr",
+        "tf.strings.reduce_join",
+        "tf.strings.length",
+        "tf.transpose",
+        "tf.tuple",
+        "tf.parse_example",
+        "tf.parse_single_example",
+        "tf.io.parse_example",
+        "tf.io.parse_single_example",
+        "tf.while_loop",
+        "tf.reduce_all",
+        "tf.math.reduce_all",
+        "tf.reduce_any",
+        "tf.math.reduce_any",
+        "tf.reduce_min",
+        "tf.math.reduce_min",
+        "tf.reduce_max",
+        "tf.math.reduce_max",
+        "tf.reduce_sum",
+        "tf.math.reduce_sum",
+        "tf.reduce_mean",
+        "tf.math.reduce_mean",
+        "tf.reduce_prod",
+        "tf.math.reduce_prod",
+        "tf.reduce_logsumexp",
+        "tf.math.reduce_logsumexp",
+        "tf.reduce_join",
+        "tf.confusion_matrix",
+        "tf.math.confusion_matrix",
+        "tf.math.in_top_k",
+        "tf.nn.depth_to_space",
+        "tf.nn.embedding_lookup",
+        "tf.nn.embedding_lookup_sparse",
+        "tf.nn.in_top_k",
+        "tf.nn.space_to_depth",
+        "tf.linalg.norm",
+        "tf.norm",
+        "tf.reverse_sequence",
+        "tf.sparse_split",
+    }
+
     # Functions that were reordered should be changed to the new keyword args
     # for safety, if positional arguments are used. If you have reversed the
     # positional arguments yourself, this could do the wrong thing.
-    self.function_reorders = {}
+    self.function_reorders = reorders_v2.reorders
 
     # Specially handled functions.
-    self.function_handle = {}
-    for decay in ["tf.train.exponential_decay", "tf.train.piecewise_constant",
-                  "tf.train.polynomial_decay", "tf.train.natural_exp_decay",
-                  "tf.train.inverse_time_decay", "tf.train.cosine_decay",
-                  "tf.train.cosine_decay_restarts",
-                  "tf.train.linear_cosine_decay",
-                  "tf.train.noisy_linear_cosine_decay"]:
-      self.function_handle[decay] = functools.partial(
-          self._learning_rate_decay_handler, decay_name=decay)
+    self.function_handle = {
+        "tf.nn.dropout": self._dropout_handler,
+        "tf.gradients": self._colocate_handler("tf.gradients"),
+        "*.minimize": self._colocate_handler("Optimizer.minimize"),
+        "*.compute_gradients":
+            self._colocate_handler("Optimizer.compute_gradients"),
+    }
+
+    decay_function_comment = (
+        "WARNING: <function name> has been changed to return a callable instead"
+        " of a tensor when graph building, but its functionality remains "
+        "unchanged during eager execution (returns a callable like "
+        "before). The converter cannot detect and fix this reliably, so "
+        "this usage has been converted to compat.v1 (even though it may already"
+        " be correct).\n"
+    )
+
+    # TODO(b/118888586): add default value change to update script.
+    default_loss_reduction_changed = (
+        "WARNING: default value of loss_reduction has been changed to "
+        "SUM_OVER_BATCH_SIZE.\n"
+    )
+
+    assert_return_type_comment = (
+        "WARNING: assert_* functions have been changed to return None, the "
+        "data argument has been removed, and arguments have been reordered."
+        "\nThe calls have been converted to compat.v1 for safety (even though "
+        " they may already have been correct)."
+    )
+
+    assert_rank_comment = (
+        "WARNING: assert_rank_* functions have been changed to return None, and"
+        " the data and summarize arguments have been removed."
+        "\nThe calls have been converted to compat.v1 for safety (even though "
+        " they may already have been correct)."
+    )
+
+    tf_01s_like_no_optimize_comment = (
+        "WARNING: tf.zeros_like and tf.ones_like no longer have the optimize "
+        "argument in TF 2.0 or after (also, `tensor' argument is renamed to "
+        "`input')."
+        "\nThe calls have been converted to compat.v1 for safety (even though "
+        " they may already have been correct)."
+    )
+
+    deprecate_partition_strategy_comment = (
+        "WARNING: `partition_strategy` has been removed from `%s` "
+        " The 'div' strategy is used by default.")
+
+    # Function warnings. <function name> placeholder inside warnings will be
+    # replaced by function name.
+    self.function_warnings = {
+        "tf.assert_greater":
+            assert_return_type_comment,
+        "tf.assert_equal":
+            assert_return_type_comment,
+        "tf.assert_less":
+            assert_return_type_comment,
+        "tf.assert_rank":
+            assert_rank_comment,
+        "tf.cond": "tf.cond no longer takes 'strict'. "
+                   "Now 'strict' defaults to True."
+                   "fn1/fn2 arguments are replaced by true_fn/false_fn.",
+        "tf.debugging.assert_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_greater":
+            assert_return_type_comment,
+        "tf.debugging.assert_greater_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_integer":
+            assert_return_type_comment,
+        "tf.debugging.assert_less":
+            assert_return_type_comment,
+        "tf.debugging.assert_less_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_near":
+            assert_return_type_comment,
+        "tf.debugging.assert_negative":
+            assert_return_type_comment,
+        "tf.debugging.assert_non_negative":
+            assert_return_type_comment,
+        "tf.debugging.assert_non_positive":
+            assert_return_type_comment,
+        "tf.debugging.assert_none_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_positive":
+            assert_return_type_comment,
+        "tf.debugging.assert_rank":
+            assert_rank_comment,
+        "tf.debugging.assert_rank_at_least":
+            assert_rank_comment,
+        "tf.debugging.assert_rank_in":
+            assert_rank_comment,
+        "tf.device": "tf.device no longer takes function as an argument. "
+                     "'devide_name_or_function' argument has been renamed to "
+                     "'device_name'.",
+        "tf.flags":
+            "tf.flags has been removed, please use the argparse or absl"
+            " module if you need command line parsing.",
+        "tf.train.exponential_decay":
+            decay_function_comment,
+        "tf.train.piecewise_constant_decay":
+            decay_function_comment,
+        "tf.train.polynomial_decay":
+            decay_function_comment,
+        "tf.train.natural_exp_decay":
+            decay_function_comment,
+        "tf.train.inverse_time_decay":
+            decay_function_comment,
+        "tf.train.cosine_decay":
+            decay_function_comment,
+        "tf.train.cosine_decay_restarts":
+            decay_function_comment,
+        "tf.train.linear_cosine_decay":
+            decay_function_comment,
+        "tf.train.noisy_linear_cosine_decay":
+            decay_function_comment,
+        "tf.estimator.LinearClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.LinearRegressor":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNLinearCombinedClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNLinearCombinedRegressor":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNRegressor":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.BaselineClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.BaselineRegressor":
+            default_loss_reduction_changed,
+        "tf.hessians": "tf.hessians no longer takes "
+                       "'colocate_gradients_with_ops' argument. Also, "
+                       "arguments have been reordered so that 'name' is the "
+                       "last argument.",
+        "tf.nn.conv1d":
+            "WARNING: use_cudnn_on_gpu argument has been removed and \"value\""
+            " was renamed to \"input\"",
+        "tf.nn.conv2d":
+            "WARNING: use_cudnn_on_gpu argument has been removed and "
+            "\"filter\" was renamed to \"filters\"",
+        "tf.nn.conv2d_backprop_filter":
+            "WARNING: use_cudnn_on_gpu argument has been removed",
+        "tf.nn.conv2d_backprop_input":
+            "WARNING: use_cudnn_on_gpu argument has been removed and "
+            "\"filter\" was renamed to \"filters\"",
+        "tf.nn.erosion2d":
+            "WARNING: <function name> now requires a data_format argument",
+        "tf.nn.nce_loss":
+            deprecate_partition_strategy_comment % "tf.nn.nce_loss",
+        "tf.nn.safe_embedding_lookup_sparse":
+            deprecate_partition_strategy_comment %
+            "tf.nn.safe_embedding_lookup_sparse",
+        "tf.nn.sampled_softmax_loss":
+            deprecate_partition_strategy_comment % "tf.nn.sampled_softmax_loss",
+        "tf.zeros_like":
+            tf_01s_like_no_optimize_comment,
+        "tf.ones_like":
+            tf_01s_like_no_optimize_comment,
+        "tf.nn.embedding_lookup":
+            "WARNING: validate_indices argument has been removed.",
+        "tf.while_loop":
+            "tf.while_loop no longer takes 'return_same_structure' argument. "
+            "'return_same_structure' now defaults to True. Also, 'name'"
+            "argument is now the last argument.",
+        "tf.image.sample_distorted_bounding_box":
+            "tf.image.sample_distorted_bounding_box no longer takes 'seed2' "
+            "argument.",
+        "tf.nn.ctc_beam_search_decoder":
+            "tf.nn.ctc_beam_search_decoder no longer takes 'merge_repeated' "
+            "argument. 'merge_repeated' now defaults to False.",
+        "tf.nn.fractional_avg_pool":
+            "tf.nn.fractional_avg_pool no longer takes 'seed2' and "
+            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
+            "'seed' is zero, the execution is random and deterministic "
+            "otherwise",
+        "tf.nn.fractional_max_pool":
+            "tf.nn.fractional_max_pool no longer takes 'seed2' and "
+            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
+            "'seed' is zero, the execution is random and deterministic "
+            "otherwise",
+        "tf.nn.softmax_cross_entropy_with_logits":
+            "tf.nn.softmax_cross_entropy_with_logits behavior has changed. "
+            "'labels' needs to be wrapped with tf.stop_gradient to keep the "
+            "old behavior. Also, 'dim' argument has been renamed to 'axis'.",
+        "tf.test.assert_equal_graph_def":
+            "tf.assert_equal_graph_def no longer takes 'checkpoint_v2' "
+            "argument. 'checkpoint_v2' now defaults to True.",
+    }
+
+    self.symbol_renames = {
+        name: new_name
+        for name, new_name in self.symbol_renames.items()
+    }
+
+    export_saved_model_renamed = (
+        "(Manual edit required) Please rename the method export_savedmodel() "
+        "to export_saved_model(). Two things to note:\n\t(1) The argument "
+        "strip_default_attributes has been removed. The function will always "
+        "strip the default attributes from ops. If this breaks your code, "
+        "please switch to tf.compat.v1.estimator.Estimator.\n\t(2) This change "
+        "only effects core estimator. If you are using "
+        "tf.contrib.learn.Estimator, please switch to using core estimator.")
+
+    make_initializable_iterator_deprecation = (
+        "(Manual edit required) The "
+        "`tf.data.Dataset.make_initializable_iterator()` method has been "
+        "removed. If you are using the Estimator API, you can return a dataset "
+        "directly from your input functions without creating an iterator. "
+        "As a last resort, please replace calls to that method on `dataset` "
+        "with a call to "
+        "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
+
+    make_one_shot_iterator_deprecation = (
+        "(Manual edit required) The "
+        "`tf.data.Dataset.make_one_shot_iterator()` method has been "
+        "removed. If you are using eager execution, you can iterate over "
+        "`dataset` using a Python `for` loop. If you are using the Estimator "
+        "API, you can return a dataset directly from your input functions "
+        "without creating an iterator. As a last resort, please replace calls "
+        "to that method on `dataset` with a call to "
+        "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
+
+    # Specify warnings for functions that aren't restricted to the tf.x.y.z
+    # format. This should only be used for methods with unique names, e.g.
+    # export_savedmodel, which is only defined in Estimator objects.
+    self.unrestricted_function_warnings = {
+        "export_savedmodel": export_saved_model_renamed,
+        "make_initializable_iterator": make_initializable_iterator_deprecation,
+        "make_one_shot_iterator": make_one_shot_iterator_deprecation,
+    }
+
+  @staticmethod
+  def _dropout_handler(file_edit_recorder, node):
+    if len(node.args) < 2:
+      comment = ("ERROR: tf.nn.dropout did not take arguments, so automatic "
+                 "transformation was disabled. tf.nn.dropout has changed "
+                 "the semantics of the second argument.")
+      file_edit_recorder.add(
+          comment,
+          node.lineno,
+          node.col_offset,
+          "tf.nn.dropout",
+          "tf.nn.dropout",
+          error="tf.nn.dropout requires manual check.")
+    else:
+      comment = ("WARNING: tf.nn.dropout has changed the semantics of the "
+                 "second argument. Please check the transformation.\n")
+      file_edit_recorder.add(
+          comment,
+          node.args[1].lineno,
+          node.args[1].col_offset,
+          "",
+          "1 - ")
 
   @staticmethod
-  def _learning_rate_decay_handler(file_edit_recorder, node, decay_name):
-    comment = ("ERROR: %s has been changed to return a callable instead of a "
-               "tensor when graph building, but its functionality remains "
-               "unchanged during eager execution (returns a callable like "
-               "before). The converter cannot detect and fix this reliably, so "
-               "you need to inspect this usage manually.\n") % decay_name
-    file_edit_recorder.add(
-        comment,
-        node.lineno,
-        node.col_offset,
-        decay_name,
-        decay_name,
-        error="%s requires manual check." % decay_name)
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser(
-      formatter_class=argparse.RawDescriptionHelpFormatter,
-      description="""Convert a TensorFlow Python file to 2.0
-
-Simple usage:
-  tf_convert_v2.py --infile foo.py --outfile bar.py
-  tf_convert_v2.py --intree ~/code/old --outtree ~/code/new
-""")
-  parser.add_argument(
-      "--infile",
-      dest="input_file",
-      help="If converting a single file, the name of the file "
-      "to convert")
-  parser.add_argument(
-      "--outfile",
-      dest="output_file",
-      help="If converting a single file, the output filename.")
-  parser.add_argument(
-      "--intree",
-      dest="input_tree",
-      help="If converting a whole tree of files, the directory "
-      "to read from (relative or absolute).")
-  parser.add_argument(
-      "--outtree",
-      dest="output_tree",
-      help="If converting a whole tree of files, the output "
-      "directory (relative or absolute).")
-  parser.add_argument(
-      "--copyotherfiles",
-      dest="copy_other_files",
-      help=("If converting a whole tree of files, whether to "
-            "copy the other files."),
-      type=bool,
-      default=False)
-  parser.add_argument(
-      "--reportfile",
-      dest="report_filename",
-      help=("The name of the file where the report log is "
-            "stored."
-            "(default: %(default)s)"),
-      default="report.txt")
-  args = parser.parse_args()
-
-  upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec())
-  report_text = None
-  report_filename = args.report_filename
-  files_processed = 0
-  if args.input_file:
-    if not args.output_file:
-      raise ValueError(
-          "--outfile=<output file> argument is required when converting a "
-          "single file.")
-    files_processed, report_text, errors = upgrade.process_file(
-        args.input_file, args.output_file)
-    files_processed = 1
-  elif args.input_tree:
-    if not args.output_tree:
-      raise ValueError(
-          "--outtree=<output directory> argument is required when converting a "
-          "file tree.")
-    files_processed, report_text, errors = upgrade.process_tree(
-        args.input_tree, args.output_tree, args.copy_other_files)
-  else:
-    parser.print_help()
-  if report_text:
-    open(report_filename, "w").write(report_text)
-    print("TensorFlow 2.0 Upgrade Script")
-    print("-----------------------------")
-    print("Converted %d files\n" % files_processed)
-    print("Detected %d errors that require attention" % len(errors))
-    print("-" * 80)
-    print("\n".join(errors))
-    print("\nMake sure to read the detailed log %r\n" % report_filename)
+  def _colocate_handler(name):
+    def _helper(file_edit_recorder, node):
+      for keyword in node.keywords:
+        if keyword.arg == "colocate_gradients_with_ops":
+          # TODO(jhseu): Since ast_edit.py does string replacement, there's no
+          # straightforward way to remove the argument. Try to fix before 2.0 is
+          # final.
+          comment = ("For tf.gradients and tf.Optimizer.minimize, "
+                     "colocate_gradients_with_op has been removed and now "
+                     "defaults to True.")
+          file_edit_recorder.add(
+              comment,
+              node.lineno,
+              node.col_offset,
+              "",
+              "",
+              error="{} requires manual check.".format(name))
+    return _helper
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
new file mode 100644
index 0000000000000000000000000000000000000000..543d0786423f5b3f9bc59895c1325d19b6241cf7
--- /dev/null
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
@@ -0,0 +1,104 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Upgrader for Python scripts from 1.* TensorFlow to 2.0 TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+
+from tensorflow.tools.compatibility import ast_edits
+from tensorflow.tools.compatibility import tf_upgrade_v2
+
+
+def main():
+  parser = argparse.ArgumentParser(
+      formatter_class=argparse.RawDescriptionHelpFormatter,
+      description="""Convert a TensorFlow Python file to 2.0
+
+Simple usage:
+  tf_upgrade_v2.py --infile foo.py --outfile bar.py
+  tf_upgrade_v2.py --intree ~/code/old --outtree ~/code/new
+""")
+  parser.add_argument(
+      "--infile",
+      dest="input_file",
+      help="If converting a single file, the name of the file "
+      "to convert")
+  parser.add_argument(
+      "--outfile",
+      dest="output_file",
+      help="If converting a single file, the output filename.")
+  parser.add_argument(
+      "--intree",
+      dest="input_tree",
+      help="If converting a whole tree of files, the directory "
+      "to read from (relative or absolute).")
+  parser.add_argument(
+      "--outtree",
+      dest="output_tree",
+      help="If converting a whole tree of files, the output "
+      "directory (relative or absolute).")
+  parser.add_argument(
+      "--copyotherfiles",
+      dest="copy_other_files",
+      help=("If converting a whole tree of files, whether to "
+            "copy the other files."),
+      type=bool,
+      default=True)
+  parser.add_argument(
+      "--reportfile",
+      dest="report_filename",
+      help=("The name of the file where the report log is "
+            "stored."
+            "(default: %(default)s)"),
+      default="report.txt")
+  args = parser.parse_args()
+
+  upgrade = ast_edits.ASTCodeUpgrader(tf_upgrade_v2.TFAPIChangeSpec())
+  report_text = None
+  report_filename = args.report_filename
+  files_processed = 0
+  if args.input_file:
+    if not args.output_file:
+      raise ValueError(
+          "--outfile=<output file> argument is required when converting a "
+          "single file.")
+    files_processed, report_text, errors = upgrade.process_file(
+        args.input_file, args.output_file)
+    files_processed = 1
+  elif args.input_tree:
+    if not args.output_tree:
+      raise ValueError(
+          "--outtree=<output directory> argument is required when converting a "
+          "file tree.")
+    files_processed, report_text, errors = upgrade.process_tree(
+        args.input_tree, args.output_tree, args.copy_other_files)
+  else:
+    parser.print_help()
+  if report_text:
+    open(report_filename, "w").write(report_text)
+    print("TensorFlow 2.0 Upgrade Script")
+    print("-----------------------------")
+    print("Converted %d files\n" % files_processed)
+    print("Detected %d errors that require attention" % len(errors))
+    print("-" * 80)
+    print("\n".join(errors))
+    print("\nMake sure to read the detailed log %r\n" % report_filename)
+
+
+if __name__ == "__main__":
+  main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 3886c1e8b9e61f4008b7fec04526e7d92f7707d5..0fc7a18734219cd0216816873768dd9dada16cc5 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -17,15 +17,90 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+import inspect
 import os
 import tempfile
+
 import six
+import tensorflow as tf
+# OSS TF V2 import placeholder.
+
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test as test_lib
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+from tensorflow.tools.common import public_api
+from tensorflow.tools.common import traverse
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import tf_upgrade_v2
 
 
+_TENSORFLOW_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
+_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+_ESTIMATOR_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
+_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
+
+
+def get_v1_names(symbol):
+  names_v1 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
+  return names_v1
+
+
+def get_v2_names(symbol):
+  names_v2 = set()
+  if hasattr(symbol, _TENSORFLOW_API_ATTR):
+    names_v2.update(getattr(symbol, _TENSORFLOW_API_ATTR))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR):
+    names_v2.update(getattr(symbol, _ESTIMATOR_API_ATTR))
+  return list(names_v2)
+
+
+def get_symbol_for_name(root, name):
+  name_parts = name.split(".")
+  symbol = root
+  # Iterate starting with second item since 1st item is "tf.".
+  for part in name_parts[1:]:
+    symbol = getattr(symbol, part)
+  return symbol
+
+
+def get_args(symbol):
+  if hasattr(inspect, "signature"):
+    signature = inspect.signature(symbol)
+    # Ignore *args and **kwargs for now.
+    return [param.name for param in signature.parameters.values()
+            if param.kind == param.POSITIONAL_OR_KEYWORD]
+  return tf_inspect.getargspec(symbol)[0]
+
+
+def get_func_and_args_from_str(call_str):
+  """Parse call string to get function and argument names.
+
+  Args:
+    call_str: Call string must be in the form:
+              `tf.foo(arg1=val1, arg2=val2, ...)`.
+
+  Returns:
+    (function_name, list of arg names) tuple.
+  """
+  open_paren_index = call_str.find("(")
+  close_paren_index = call_str.rfind(")")
+
+  function_name = call_str[:call_str.find("(")]
+  args = call_str[open_paren_index+1:close_paren_index].split(",")
+  args = [arg.split("=")[0].strip() for arg in args]
+  args = [arg for arg in args if arg]  # filter out empty strings
+  return function_name, args
+
+
 class TestUpgrade(test_util.TensorFlowTestCase):
   """Test various APIs that have been changed in 2.0.
 
@@ -34,6 +109,22 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   work when run with current TensorFlow.
   """
 
+  @classmethod
+  def setUpClass(cls):
+    cls.v2_symbols = {}
+    if not hasattr(tf.compat, "v2"):
+      return
+
+    def symbol_collector(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        api_names_v2 = get_v2_names(attr)
+        for name in api_names_v2:
+          cls.v2_symbols["tf." + name] = attr
+
+    visitor = public_api.PublicAPIVisitor(symbol_collector)
+    traverse.traverse(tf.compat.v2, visitor)
+
   def _upgrade(self, old_file_text):
     in_file = six.StringIO(old_file_text)
     out_file = six.StringIO()
@@ -49,22 +140,240 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     self.assertTrue(report.find("Failed to parse") != -1)
 
   def testReport(self):
-    text = "tf.acos(a)\n"
+    text = "tf.assert_near(a)\n"
     _, report, unused_errors, unused_new_text = self._upgrade(text)
     # This is not a complete test, but it is a sanity test that a report
     # is generating information.
-    self.assertTrue(report.find("Renamed function `tf.acos` to `tf.math.acos`"))
+    self.assertTrue(report.find("Renamed function `tf.assert_near` to "
+                                "`tf.debugging.assert_near`"))
 
   def testRename(self):
-    text = "tf.acos(a)\n"
+    text = "tf.conj(a)\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, "tf.math.conj(a)\n")
+    text = "tf.rsqrt(tf.log_sigmoid(3.8))\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, "tf.math.rsqrt(tf.math.log_sigmoid(3.8))\n")
+
+  def testAllAPI(self):
+    if not hasattr(tf.compat, "v2"):
+      return
+
+    # Converts all symbols in the v1 namespace to the v2 namespace, raising
+    # an error if the target of the conversion is not in the v2 namespace.
+    # Please regenerate the renames file or edit any manual renames if this
+    # test fails.
+    def conversion_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        api_names = get_v1_names(attr)
+        for name in api_names:
+          _, _, _, text = self._upgrade("tf." + name)
+          if (text and
+              not text.startswith("tf.compat.v1") and
+              text not in self.v2_symbols):
+            self.assertFalse(
+                True, "Symbol %s generated from %s not in v2 API" % (
+                    text, name))
+
+    visitor = public_api.PublicAPIVisitor(conversion_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testAllAPIV1(self):
+    collect = True
+    v1_symbols = set([])
+
+    # Converts all symbols in the v1 namespace to the v2 namespace, raising
+    # an error if the target of the conversion is not in the v1 namespace.
+    def conversion_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        api_names = get_v1_names(attr)
+        for name in api_names:
+          if collect:
+            v1_symbols.add("tf." + name)
+          else:
+            _, _, _, text = self._upgrade("tf." + name)
+            if (text and
+                not text.startswith("tf.compat.v1") and
+                not text.startswith("tf.estimator") and
+                text not in v1_symbols):
+              self.assertFalse(
+                  True, "Symbol %s generated from %s not in v1 API" % (
+                      text, name))
+
+    visitor = public_api.PublicAPIVisitor(conversion_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+    collect = False
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testV1KeywordArgNames(self):
+    all_keyword_renames = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_keyword_renames)
+
+    # Visitor that verifies V1 argument names.
+    def arg_test_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        names_v1 = get_v1_names(attr)
+
+        for name in names_v1:
+          name = "tf.%s" % name
+          if name not in all_keyword_renames:
+            continue
+          arg_names_v1 = tf_inspect.getargspec(attr)[0]
+          keyword_renames = all_keyword_renames[name]
+          self.assertEqual(type(keyword_renames), dict)
+
+          # Assert that v1 function has valid v1 argument names.
+          for from_name, _ in keyword_renames.items():
+            self.assertIn(
+                from_name, arg_names_v1,
+                "%s not found in %s arguments: %s" %
+                (from_name, name, str(arg_names_v1)))
+
+    visitor = public_api.PublicAPIVisitor(arg_test_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testV2KeywordArgNames(self):
+    # This test converts a call of the form:
+    # tf.foo(arg1=0, arg2=1, ...)
+    # to 2.0. Then, checks that converted function has valid argument names.
+    if not hasattr(tf.compat, "v2"):
+      return
+    v2_arg_exceptions = {
+        "verify_shape_is_now_always_true",
+        # These arguments should not be used, they just specify
+        # that a function takes named arguments.
+        "keyword_required",
+        "_sentinel",
+    }
+    v1_name_exceptions = {
+        "tf.print",  # requires print_function import
+    }
+    function_warnings = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_warnings)
+    function_handles = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_handle)
+    keyword_renames = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_keyword_renames)
+
+    # Visitor that converts to V2 and checks V2 argument names.
+    def conversion_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        if not tf_inspect.isfunction(attr):
+          continue
+        names_v1 = get_v1_names(attr)
+        arg_names_v1 = get_args(attr)
+
+        for name in names_v1:
+          tf_name = "tf.%s" % name
+          if tf_name in function_warnings or tf_name in function_handles:
+            continue  # These require manual change
+          if tf_name in v1_name_exceptions:
+            continue
+          # Assert that arg names after converting to v2 are present in
+          # v2 function.
+          # 1. First, create an input of the form:
+          #    tf.foo(arg1=val1, arg2=val2, ...)
+          args = ",".join(
+              ["%s=%d" % (from_name, from_index)
+               for from_index, from_name in enumerate(arg_names_v1)])
+          text_input = "%s(%s)" % (tf_name, args)
+          # 2. Convert the input to V2.
+          _, _, _, text = self._upgrade(text_input)
+          new_function_name, new_args = get_func_and_args_from_str(text)
+          if new_function_name == "tf.compat.v1.%s" % name:
+            if tf_name in keyword_renames:
+              # If we rename arguments, new function must be available in 2.0.
+              # We should not be using compat.v1 in this case.
+              self.assertFalse(
+                  "Function '%s' is not in 2.0 when converting\n%s\nto\n%s" %
+                  (new_function_name, text_input, text))
+            continue
+          # 3. Verify V2 function and arguments.
+          args_v2 = get_args(self.v2_symbols[new_function_name])
+          args_v2.extend(v2_arg_exceptions)
+          for new_arg in new_args:
+            self.assertIn(
+                new_arg, args_v2,
+                "Invalid argument '%s' in 2.0 when converting\n%s\nto\n%s.\n"
+                "Supported arguments: %s" % (
+                    new_arg, text_input, text, str(args_v2)))
+
+    visitor = public_api.PublicAPIVisitor(conversion_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testReorderFileNeedsUpdate(self):
+    reordered_function_names = (
+        tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
+    function_reorders = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_reorders)
+
+    added_names_message = """Some function names in
+self.reordered_function_names are not in reorders_v2.py.
+Please run the following commands to update reorders_v2.py:
+bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+"""
+    removed_names_message = """%s in self.reorders_v2 does not match
+any name in self.reordered_function_names.
+Please run the following commands to update reorders_v2.py:
+bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+"""
+    self.assertTrue(
+        reordered_function_names.issubset(function_reorders),
+        added_names_message)
+    # function_reorders should contain reordered_function_names
+    # and their TensorFlow V1 aliases.
+    for name in function_reorders:
+      # get other names for this function
+      attr = get_symbol_for_name(tf.compat.v1, name)
+      _, attr = tf_decorator.unwrap(attr)
+      v1_names = get_v1_names(attr)
+      self.assertTrue(v1_names)
+      v1_names = ["tf.%s" % n for n in v1_names]
+      # check if any other name is in
+      self.assertTrue(
+          any(n in reordered_function_names for n in v1_names),
+          removed_names_message % name)
+
+  def testRenameConstant(self):
+    text = "tf.MONOLITHIC_BUILD\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, "tf.sysconfig.MONOLITHIC_BUILD\n")
+    text = "some_call(tf.MONOLITHIC_BUILD)\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, "some_call(tf.sysconfig.MONOLITHIC_BUILD)\n")
+
+  def testRenameArgs(self):
+    text = ("tf.nn.pool(input_a, window_shape_a, pooling_type_a, padding_a, "
+            "dilation_rate_a, strides_a, name_a, data_format_a)\n")
     _, unused_report, unused_errors, new_text = self._upgrade(text)
-    self.assertEqual(new_text, "tf.math.acos(a)\n")
-    text = "tf.rsqrt(tf.log(3.8))\n"
+    self.assertEqual(new_text,
+                     ("tf.nn.pool(input=input_a, window_shape=window_shape_a,"
+                      " pooling_type=pooling_type_a, padding=padding_a, "
+                      "dilations=dilation_rate_a, strides=strides_a, "
+                      "name=name_a, data_format=data_format_a)\n"))
+
+  def testReorder(self):
+    text = "tf.boolean_mask(a, b, c, d)\n"
     _, unused_report, unused_errors, new_text = self._upgrade(text)
-    self.assertEqual(new_text, "tf.math.rsqrt(tf.math.log(3.8))\n")
+    self.assertEqual(new_text,
+                     "tf.boolean_mask(tensor=a, mask=b, name=c, axis=d)\n")
 
   def testLearningRateDecay(self):
-    for decay in ["tf.train.exponential_decay", "tf.train.piecewise_constant",
+    for decay in ["tf.train.exponential_decay",
                   "tf.train.polynomial_decay", "tf.train.natural_exp_decay",
                   "tf.train.inverse_time_decay", "tf.train.cosine_decay",
                   "tf.train.cosine_decay_restarts",
@@ -72,9 +381,362 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                   "tf.train.noisy_linear_cosine_decay"]:
 
       text = "%s(a, b)\n" % decay
-      _, unused_report, errors, new_text = self._upgrade(text)
-      self.assertEqual(text, new_text)
+      _, report, errors, _ = self._upgrade(text)
       self.assertEqual(errors, ["test.py:1: %s requires manual check." % decay])
+      self.assertIn("%s has been changed" % decay, report)
+
+  def testPiecewiseDecay(self):
+    text = "tf.train.piecewise_constant_decay(a, b)\n"
+    _, report, errors, _ = self._upgrade(text)
+    self.assertEqual(
+        errors,
+        ["test.py:1: tf.train.piecewise_constant_decay requires manual check."])
+    self.assertIn("tf.train.piecewise_constant_decay has been changed", report)
+
+  def testEstimatorLossReductionChange(self):
+    classes = [
+        "LinearClassifier", "LinearRegressor", "DNNLinearCombinedClassifier",
+        "DNNLinearCombinedRegressor", "DNNRegressor", "DNNClassifier",
+        "BaselineClassifier", "BaselineRegressor"
+    ]
+    for c in classes:
+      ns = "tf.estimator." + c
+      text = ns + "(a, b)"
+      _, report, errors, new_text = self._upgrade(text)
+      self.assertEqual(text, new_text)
+      self.assertEqual(errors, ["test.py:1: %s requires manual check." % ns])
+      self.assertIn("loss_reduction has been changed", report)
+
+  def testDropout(self):
+    text = "tf.nn.dropout(x, keep_prob, name=\"foo\")\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.nn.dropout(x, 1 - keep_prob, name=\"foo\")\n",
+    )
+
+    text = "tf.nn.dropout(x)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, text)
+    self.assertEqual(
+        errors,
+        ["test.py:1: tf.nn.dropout requires manual check."]
+    )
+
+  def testCountNonZeroChanges(self):
+    text = (
+        "tf.math.count_nonzero(input_tensor=input, dtype=dtype, name=name, "
+        "reduction_indices=axis, keep_dims=keepdims)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.math.count_nonzero(input=input, dtype=dtype, name=name, "
+        "axis=axis, keepdims=keepdims)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
+  def testRandomMultinomialToRandomCategorical(self):
+    text = (
+        "tf.random.multinomial(logits, samples, seed, name, output_dtype)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.random.categorical(logits=logits, num_samples=samples, seed=seed, "
+        "name=name, dtype=output_dtype)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.multinomial(logits, samples, seed, name, output_dtype)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.random.categorical(logits=logits, num_samples=samples, seed=seed, "
+        "name=name, dtype=output_dtype)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
+  def testConvolutionOpUpdate(self):
+    text = (
+        "tf.nn.convolution(input, filter, padding, strides, dilation_rate, "
+        "name, data_format)"
+    )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.nn.convolution(input=input, filters=filter, padding=padding, "
+        "strides=strides, dilations=dilation_rate, name=name, "
+        "data_format=data_format)"
+    )
+    self.assertEqual(new_text, expected_text)
+
+  def testColocateGradientsWithOps(self):
+    text = "tf.gradients(a, foo=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors, [])
+
+    text = "tf.gradients(a, colocate_gradients_with_ops=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors, ["test.py:1: tf.gradients requires manual check."])
+
+    text = "optimizer.minimize(a, foo=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors, [])
+
+    text = "optimizer.minimize(a, colocate_gradients_with_ops=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors,
+                     ["test.py:1: Optimizer.minimize requires manual check."])
+
+    text = "optimizer.compute_gradients(a, foo=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors, [])
+
+    text = "optimizer.compute_gradients(a, colocate_gradients_with_ops=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors,
+                     ["test.py:1: Optimizer.compute_gradients "
+                      "requires manual check."])
+
+  def testExportSavedModelRename(self):
+    text = "self.est.export_savedmodel(path)"
+    _, report, unused_errors, unused_new_text = self._upgrade(text)
+    self.assertIn(
+        "rename the method export_savedmodel() to export_saved_model()",
+        report)
+
+  def testArgmin(self):
+    text = "tf.argmin(input, name=n, dimension=1, output_type=type)"
+    expected_text = "tf.argmin(input=input, name=n, axis=1, output_type=type)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.argmin(input, 0)"
+    expected_text = "tf.argmin(input=input, axis=0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.arg_min(input, 0)"
+    expected_text = "tf.argmin(input, 0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testArgmax(self):
+    text = "tf.argmax(input, name=n, dimension=1, output_type=type)"
+    expected_text = "tf.argmax(input=input, name=n, axis=1, output_type=type)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.argmax(input, 0)"
+    expected_text = "tf.argmax(input=input, axis=0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.arg_max(input, 0)"
+    expected_text = "tf.argmax(input, 0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testBatchToSpace(self):
+    text = "tf.batch_to_space_nd(input, block_shape, crops, name)"
+    expected_text = "tf.batch_to_space(input, block_shape, crops, name)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.batch_to_space(input, crops, block_size, name)"
+    expected_text = (
+        "tf.batch_to_space(input=input, crops=crops, block_shape=block_size, "
+        "name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.manip.batch_to_space_nd(input, block_shape, crops, name)"
+    expected_text = "tf.batch_to_space(input, block_shape, crops, name)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testExtractImagePatches(self):
+    text = (
+        "tf.extract_image_patches(images, ksizes=ksizes, strides=strides,"
+        "rates=rates, padding=padding, name=name)")
+    expected_text = (
+        "tf.image.extract_image_patches(images, sizes=ksizes, strides=strides,"
+        "rates=rates, padding=padding, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testStatelessMultinomial(self):
+    text = (
+        "tf.random.stateless_multinomial(logits, num_samples, seed, "
+        "output_dtype=dtype, name=name)")
+    expected_text = (
+        "tf.random.stateless_categorical(logits, num_samples, seed, "
+        "dtype=dtype, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSoftMaxCrossEntropyWithLogitsV2(self):
+    text = "tf.nn.softmax_cross_entropy_with_logits_v2(labels, logits, dim=2)"
+    expected_text = (
+        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, axis=2)")
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    self.assertFalse(errors)
+
+  def testSoftMaxCrossEntropyWithLogits(self):
+    text = "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)"
+    expected_text = (
+        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)")
+    _, report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+    self.assertIn(
+        "tf.nn.softmax_cross_entropy_with_logits requires manual check.",
+        errors[0])
+    self.assertIn(
+        "tf.nn.softmax_cross_entropy_with_logits behavior has changed. ",
+        report)
+
+  def testSparseMatmul(self):
+    text = ("tf.sparse_matmul(a, b, c, d, e, f, g)\n")
+    expected_text = ("tf.linalg.matmul(a=a, b=b, transpose_a=c, transpose_b=d, "
+                     "a_is_sparse=e, b_is_sparse=f, name=g)\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testWeightedMoments(self):
+    text = "tf.nn.weighted_moments(x, axes, freq, name, kd)"
+    expected_text = (
+        "tf.nn.weighted_moments(x=x, axes=axes, frequency_weights=freq, "
+        "name=name, keepdims=kd)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSparseAdd(self):
+    text = "tf.sparse.add(a, b, t)"
+    expected_text = "tf.sparse.add(a=a, b=b, threshold=t)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSparseConcat(self):
+    text = "tf.sparse.concat(ax, inp, name, exp, concat)"
+    expected_text = (
+        "tf.sparse.concat(axis=ax, sp_inputs=inp, name=name, "
+        "expand_nonconcat_dims=exp, axis=concat)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSeparableConv2D(self):
+    text = "tf.nn.separable_conv2d(inp, d, pt, strides, pad, rate, name, fmt)"
+    expected_text = (
+        "tf.nn.separable_conv2d(input=inp, depthwise_filter=d, "
+        "pointwise_filter=pt, strides=strides, padding=pad, "
+        "dilations=rate, name=name, data_format=fmt)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSpacetoBatch(self):
+    text = "tf.space_to_batch_nd(input, shape, paddings, name)"
+    expected_text = "tf.space_to_batch(input, shape, paddings, name)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.nn.space_to_batch(input, paddings, block_size, name)"
+    expected_text = (
+        "tf.space_to_batch(input=input, paddings=paddings, "
+        "block_shape=block_size, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testInTopK(self):
+    text = "tf.math.in_top_k(a, b, c, n)"
+    expected_text = (
+        "tf.math.in_top_k(predictions=a, targets=b, k=c, name=n)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testDepthToSpace(self):
+    text = "tf.nn.depth_to_space(input, block_size, name, data_format)"
+    expected_text = (
+        "tf.nn.depth_to_space(input=input, block_size=block_size, "
+        "name=name, data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testEmbeddingLookup(self):
+    text = ("tf.nn.embedding_lookup(params, ids, partition_strategy, name, "
+            "validate_indices, max_norm)")
+    expected_text = ("tf.nn.embedding_lookup(params=params, ids=ids, "
+                     "partition_strategy=partition_strategy, name=name, "
+                     "validate_indices=validate_indices, max_norm=max_norm)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testEmbeddingLookupSparse(self):
+    text = ("tf.nn.embedding_lookup_sparse(params, sp_ids, sp_weights, "
+            "partition_strategy, name, combiner, max_norm)")
+    expected_text = ("tf.nn.embedding_lookup_sparse(params=params, "
+                     "sp_ids=sp_ids, sp_weights=sp_weights, "
+                     "partition_strategy=partition_strategy, name=name, "
+                     "combiner=combiner, max_norm=max_norm)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testNnInTopK(self):
+    text = "tf.nn.in_top_k(predictions, targets, k, name)"
+    expected_text = ("tf.nn.in_top_k(predictions=predictions, "
+                     "targets=targets, k=k, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSpaceToDepth(self):
+    text = "tf.nn.space_to_depth(input, block_size, name, data_format)"
+    expected_text = ("tf.nn.space_to_depth(input=input, block_size=block_size, "
+                     "name=name, data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testPrint(self):
+    # tf.print() cannot be parsed unless we import print_function
+    text = """from __future__ import print_function
+tf.print()
+tf.print('abc')
+"""
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, text)  # Text should stay the same
+
+  def testSparseSplit(self):
+    text = (
+        "tf.sparse_split(sp_input=sp_input, num_split=num_split, axis=axis, "
+        "name=name)")
+    expected_text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, axis=axis, "
+        "name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.sparse_split(sp_input=sp_input, num_split=num_split, "
+        "name=name, split_dim=axis)")
+    expected_text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, "
+        "name=name, axis=axis)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, "
+        "name=name, split_dim=axis)")
+    expected_text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, "
+        "name=name, axis=axis)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
 
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
@@ -82,8 +744,8 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase):
   def testInplace(self):
     """Check to make sure we don't have a file system race."""
     temp_file = tempfile.NamedTemporaryFile("w", delete=False)
-    original = "tf.acos(a, b)\n"
-    upgraded = "tf.math.acos(a, b)\n"
+    original = "tf.conj(a)\n"
+    upgraded = "tf.math.conj(a)\n"
     temp_file.write(original)
     temp_file.close()
     upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade_v2.TFAPIChangeSpec())
diff --git a/tensorflow/tools/compatibility/update/BUILD b/tensorflow/tools/compatibility/update/BUILD
index feb37c902ec3359e6221937f4334ab2504394fa3..75bb0cfd2b7569c899fb72aa5ac9f4e608c3decc 100644
--- a/tensorflow/tools/compatibility/update/BUILD
+++ b/tensorflow/tools/compatibility/update/BUILD
@@ -9,7 +9,23 @@ py_binary(
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:lib",
+        "//tensorflow/python:no_contrib",
         "//tensorflow/tools/common:public_api",
         "//tensorflow/tools/common:traverse",
+        "//tensorflow/tools/compatibility:tf_upgrade_v2_lib",
+    ],
+)
+
+py_binary(
+    name = "generate_v2_reorders_map",
+    srcs = ["generate_v2_reorders_map.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:no_contrib",
+        "//tensorflow/tools/common:public_api",
+        "//tensorflow/tools/common:traverse",
+        "//tensorflow/tools/compatibility:tf_upgrade_v2_lib",
     ],
 )
diff --git a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
index 567eceb0b6595ceac624fe8211f22885a6490d85..19ad6c3a2a5c723cbbff2c76c8bfe6517ca4a4f0 100644
--- a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
+++ b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
@@ -20,14 +20,19 @@ To update renames_v2.py, run:
   bazel-bin/tensorflow/tools/compatibility/update/generate_v2_renames_map
 """
 # pylint: enable=line-too-long
+import sys
 
 import tensorflow as tf
 
+# This import is needed so that TensorFlow python modules are in sys.modules.
+from tensorflow import python as tf_python  # pylint: disable=unused-import
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.platform import app
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_export
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
+from tensorflow.tools.compatibility import tf_upgrade_v2
 
 
 _OUTPUT_FILE_PATH = 'third_party/tensorflow/tools/compatibility/renames_v2.py'
@@ -59,39 +64,173 @@ from __future__ import print_function
 
 """
 
+_TENSORFLOW_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
+_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+_TENSORFLOW_CONSTANTS_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].constants)
+_TENSORFLOW_CONSTANTS_ATTR = (
+    tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].constants)
+
+_ESTIMATOR_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
+_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
+_ESTIMATOR_CONSTANTS_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].constants)
+_ESTIMATOR_CONSTANTS_ATTR = (
+    tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].constants)
+
+
+def get_v1_names(symbol):
+  names_v1 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
+  return names_v1
+
+
+def get_v2_names(symbol):
+  names_v2 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR):
+    names_v2.extend(getattr(symbol, _TENSORFLOW_API_ATTR))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR):
+    names_v2.extend(getattr(symbol, _ESTIMATOR_API_ATTR))
+  return list(names_v2)
+
+
+def get_v1_constants(module):
+  constants_v1 = []
+  if hasattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1):
+    constants_v1.extend(getattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1))
+  if hasattr(module, _ESTIMATOR_CONSTANTS_ATTR_V1):
+    constants_v1.extend(getattr(module, _ESTIMATOR_CONSTANTS_ATTR_V1))
+  return constants_v1
+
+
+def get_v2_constants(module):
+  constants_v2 = []
+  if hasattr(module, _TENSORFLOW_CONSTANTS_ATTR):
+    constants_v2.extend(getattr(module, _TENSORFLOW_CONSTANTS_ATTR))
+  if hasattr(module, _ESTIMATOR_CONSTANTS_ATTR):
+    constants_v2.extend(getattr(module, _ESTIMATOR_CONSTANTS_ATTR))
+  return constants_v2
+
+
+def get_canonical_name(v2_names, v1_name):
+  if v2_names:
+    return v2_names[0]
+  return 'compat.v1.%s' % v1_name
+
+
+def get_all_v2_names():
+  """Get a set of function/class names available in TensorFlow 2.0."""
+  v2_names = set()  # All op names in TensorFlow 2.0
 
-def update_renames_v2(output_file_path):
-  """Writes a Python dictionary mapping deprecated to canonical API names.
+  def visit(unused_path, unused_parent, children):
+    """Visitor that collects TF 2.0 names."""
+    for child in children:
+      _, attr = tf_decorator.unwrap(child[1])
+      api_names_v2 = get_v2_names(attr)
+      for name in api_names_v2:
+        v2_names.add(name)
 
-  Args:
-    output_file_path: File path to write output to. Any existing contents
-      would be replaced.
+  visitor = public_api.PublicAPIVisitor(visit)
+  visitor.do_not_descend_map['tf'].append('contrib')
+  traverse.traverse(tf.compat.v2, visitor)
+  return v2_names
+
+
+def collect_constant_renames():
+  """Looks for constants that need to be renamed in TF 2.0.
+
+  Returns:
+    Set of tuples of the form (current name, new name).
+  """
+  renames = set()
+  for module in sys.modules.values():
+    constants_v1_list = get_v1_constants(module)
+    constants_v2_list = get_v2_constants(module)
+
+    # _tf_api_constants attribute contains a list of tuples:
+    # (api_names_list, constant_name)
+    # We want to find API names that are in V1 but not in V2 for the same
+    # constant_names.
+
+    # First, we convert constants_v1_list and constants_v2_list to
+    # dictionaries for easier lookup.
+    constants_v1 = {constant_name: api_names
+                    for api_names, constant_name in constants_v1_list}
+    constants_v2 = {constant_name: api_names
+                    for api_names, constant_name in constants_v2_list}
+    # Second, we look for names that are in V1 but not in V2.
+    for constant_name, api_names_v1 in constants_v1.items():
+      api_names_v2 = constants_v2[constant_name]
+      for name in api_names_v1:
+        if name not in api_names_v2:
+          renames.add((name, get_canonical_name(api_names_v2, name)))
+  return renames
+
+
+def collect_function_renames():
+  """Looks for functions/classes that need to be renamed in TF 2.0.
+
+  Returns:
+    Set of tuples of the form (current name, new name).
   """
   # Set of rename lines to write to output file in the form:
   #   'tf.deprecated_name': 'tf.canonical_name'
-  rename_line_set = set()
-  # _tf_api_names attribute name
-  tensorflow_api_attr = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+  renames = set()
 
   def visit(unused_path, unused_parent, children):
     """Visitor that collects rename strings to add to rename_line_set."""
     for child in children:
       _, attr = tf_decorator.unwrap(child[1])
-      if not hasattr(attr, '__dict__'):
-        continue
-      api_names = attr.__dict__.get(tensorflow_api_attr, [])
-      deprecated_api_names = attr.__dict__.get('_tf_deprecated_api_names', [])
-      canonical_name = tf_export.get_canonical_name(
-          api_names, deprecated_api_names)
+      api_names_v1 = get_v1_names(attr)
+      api_names_v2 = get_v2_names(attr)
+      deprecated_api_names = set(api_names_v1) - set(api_names_v2)
       for name in deprecated_api_names:
-        rename_line_set.add('    \'tf.%s\': \'tf.%s\'' % (name, canonical_name))
+        renames.add((name, get_canonical_name(api_names_v2, name)))
 
   visitor = public_api.PublicAPIVisitor(visit)
   visitor.do_not_descend_map['tf'].append('contrib')
+  visitor.do_not_descend_map['tf.compat'] = ['v1', 'v2']
   traverse.traverse(tf, visitor)
 
+  # It is possible that a different function is exported with the
+  # same name. For e.g. when creating a different function to
+  # rename arguments. Exclude it from renames in this case.
+  v2_names = get_all_v2_names()
+  renames = set((name, new_name) for name, new_name in renames
+                if name not in v2_names)
+  return renames
+
+
+def get_rename_line(name, canonical_name):
+  return '    \'tf.%s\': \'tf.%s\'' % (name, canonical_name)
+
+
+def update_renames_v2(output_file_path):
+  """Writes a Python dictionary mapping deprecated to canonical API names.
+
+  Args:
+    output_file_path: File path to write output to. Any existing contents
+      would be replaced.
+  """
+  function_renames = collect_function_renames()
+  constant_renames = collect_constant_renames()
+  all_renames = function_renames.union(constant_renames)
+  manual_renames = set(
+      tf_upgrade_v2.TFAPIChangeSpec().manual_symbol_renames.keys())
+
+  # List of rename lines to write to output file in the form:
+  #   'tf.deprecated_name': 'tf.canonical_name'
+  rename_lines = [
+      get_rename_line(name, canonical_name)
+      for name, canonical_name in all_renames
+      if 'tf.' + name not in manual_renames]
   renames_file_text = '%srenames = {\n%s\n}\n' % (
-      _FILE_HEADER, ',\n'.join(sorted(rename_line_set)))
+      _FILE_HEADER, ',\n'.join(sorted(rename_lines)))
   file_io.write_string_to_file(output_file_path, renames_file_text)
 
 
@@ -100,4 +239,4 @@ def main(unused_argv):
 
 
 if __name__ == '__main__':
-  tf.app.run(main=main)
+  app.run(main=main)
diff --git a/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
new file mode 100644
index 0000000000000000000000000000000000000000..63541771bf36fb243ae241fbf1b4c4a83cf19fd7
--- /dev/null
+++ b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
@@ -0,0 +1,166 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""Script for updating tensorflow/tools/compatibility/reorders_v2.py.
+
+To update reorders_v2.py, run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+"""
+# pylint: enable=line-too-long
+import tensorflow as tf
+
+# This import is needed so that TensorFlow python modules are in sys.modules.
+from tensorflow import python as tf_python  # pylint: disable=unused-import
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.platform import app
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+from tensorflow.tools.common import public_api
+from tensorflow.tools.common import traverse
+from tensorflow.tools.compatibility import tf_upgrade_v2
+
+
+_OUTPUT_FILE_PATH = 'third_party/tensorflow/tools/compatibility/reorders_v2.py'
+_FILE_HEADER = """# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+\"\"\"List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+This file should be updated whenever a function is added to
+self.reordered_function_names in tf_upgrade_v2.py.
+\"\"\"
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""
+
+_TENSORFLOW_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
+_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+_TENSORFLOW_CONSTANTS_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].constants)
+_TENSORFLOW_CONSTANTS_ATTR = (
+    tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].constants)
+
+_ESTIMATOR_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
+_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
+_ESTIMATOR_CONSTANTS_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].constants)
+_ESTIMATOR_CONSTANTS_ATTR = (
+    tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].constants)
+
+
+def get_v1_names(symbol):
+  names_v1 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
+  return names_v1
+
+
+def get_v2_names(symbol):
+  names_v2 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR):
+    names_v2.extend(getattr(symbol, _TENSORFLOW_API_ATTR))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR):
+    names_v2.extend(getattr(symbol, _ESTIMATOR_API_ATTR))
+  return list(names_v2)
+
+
+def collect_function_arg_names(function_names):
+  """Determines argument names for reordered function signatures.
+
+  Args:
+    function_names: Functions to collect arguments for.
+
+  Returns:
+    Dictionary mapping function name to its arguments.
+  """
+  # Map from reordered function name to its arguments.
+  function_to_args = {}
+
+  def visit(unused_path, unused_parent, children):
+    """Visitor that collects arguments for reordered functions."""
+    for child in children:
+      _, attr = tf_decorator.unwrap(child[1])
+      api_names_v1 = get_v1_names(attr)
+      api_names_v1 = ['tf.%s' % name for name in api_names_v1]
+      matches_function_names = any(
+          name in function_names for name in api_names_v1)
+      if matches_function_names:
+        arg_list = tf_inspect.getargspec(attr)[0]
+        for name in api_names_v1:
+          function_to_args[name] = arg_list
+
+  visitor = public_api.PublicAPIVisitor(visit)
+  visitor.do_not_descend_map['tf'].append('contrib')
+  visitor.do_not_descend_map['tf.compat'] = ['v1', 'v2']
+  traverse.traverse(tf, visitor)
+
+  return function_to_args
+
+
+def get_reorder_line(name, arg_list):
+  return '    \'%s\': %s' % (name, str(arg_list))
+
+
+def update_reorders_v2(output_file_path):
+  """Writes a Python dictionary mapping function name to argument order.
+
+  Args:
+    output_file_path: File path to write output to. Any existing contents
+      would be replaced.
+  """
+  reordered_function_names = (
+      tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
+
+  all_reorders = collect_function_arg_names(reordered_function_names)
+
+  # List of reorder lines to write to output file in the form:
+  #   'tf.function_name': ['arg1', 'arg2', ...]
+  rename_lines = [
+      get_reorder_line(name, arg_names)
+      for name, arg_names in all_reorders.items()]
+  renames_file_text = '%sreorders = {\n%s\n}\n' % (
+      _FILE_HEADER, ',\n'.join(sorted(rename_lines)))
+  file_io.write_string_to_file(output_file_path, renames_file_text)
+
+
+def main(unused_argv):
+  update_reorders_v2(_OUTPUT_FILE_PATH)
+
+
+if __name__ == '__main__':
+  app.run(main=main)
diff --git a/tensorflow/tools/dist_test/README.md b/tensorflow/tools/dist_test/README.md
index f8ed74aaf75b02c3047d9a8112a6d5701f1115ca..1e29977788176477492a03c4683cc489ec9fae44 100644
--- a/tensorflow/tools/dist_test/README.md
+++ b/tensorflow/tools/dist_test/README.md
@@ -1,6 +1,7 @@
 # Testing Distributed Runtime in TensorFlow
-This folder containers tools and test suites for the GRPC-based distributed
-runtime in TensorFlow.
+
+This folder contains tools and test suites for GRPC-based and Allreduce-based
+distributed runtimes in TensorFlow.
 
 There are three general modes of testing:
 
@@ -122,3 +123,37 @@ servers. For example:
 
 See [Kubernetes kubectl documentation](http://kubernetes.io/docs/user-guide/kubectl-overview/)
 for more details.
+
+**Create allreduce-based Tensorflow k8s deployment**
+
+The allreduce-based Tensorflow, Horovod, is an open source distributed deep
+learning framework for TensorFlow, detailed information can be found in
+https://arxiv.org/pdf/1802.05799.pdf.
+
+The script "scripts_allreduce/k8s_deploy_tensorflow.sh" can be used to create or
+delete an allreduce-based Tensorflow k8s deployment with specified number of
+containers.
+
+Create a deployment containing a number of containers and enable passwordless
+ssh between the containers (optional: enable host network mode with --hostnet
+and --port <container_ssh_port>):
+
+    scripts_allreduce/k8s_deploy_tensorflow.sh \
+        --num_containers <num_of_containers> \
+        --image <docker_image> \
+        --deployment <deployment_name> \
+        --config_map <config_map>
+
+Delete a deployment and config_map in k8s cluster:
+
+    scripts_allreduce/k8s_deploy_tensorflow.sh \
+        --deployment <deployment_name> \
+        --config_map <config_map> \
+        --delete
+
+Upload file or directory to all the containers of a deployment:
+
+    scripts_allreduce/k8s_deploy_tensorflow.sh \
+        --cp --src <path_to_local_directory> \
+        --dest <path_to_directory_on_containers> \
+        --deployment <deployment_name>
diff --git a/tensorflow/tools/dist_test/scripts_allreduce/k8s_deploy_tensorflow.sh b/tensorflow/tools/dist_test/scripts_allreduce/k8s_deploy_tensorflow.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2f83c36fad1b0e5cffb90e73e230cc23c21338f5
--- /dev/null
+++ b/tensorflow/tools/dist_test/scripts_allreduce/k8s_deploy_tensorflow.sh
@@ -0,0 +1,254 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+function usage {
+  script_name=$0
+  echo "Usage:"
+  echo "  $script_name [--image docker_image] [--num_containers num_of_containers]"
+  echo "               [--deployment deployment_name] [--config_map config_map]"
+  echo "               [--cp] [--src local_src_dir] [--dest container_dest_dir]"
+  echo "               [--port container_ssh_port] [--hostnet] [--shared_volume]"
+  echo "               [--delete] [--help]"
+  echo ""
+  echo "  Parameters:"
+  echo "    image:          docker image used to create container."
+  echo "    num_containers: number of containers that will be launched."
+  echo "    deployment:     deployment name. (default: k8s-ml-deployment)"
+  echo "    config_map:     config map name. (default: k8s-config-map)"
+  echo "    cp:             upload file to all containers. (src and dest must"
+  echo "                    be provided along with cp option)"
+  echo "    src:            path to local source file. (used for cp option)"
+  echo "    dest:           path to destination in container. (used for cp option)"
+  echo "    port:           ssh port in container. Set ssh port (other than 22)"
+  echo "                    when host network mode is enabled"
+  echo "    hostnet:        enable host network mode. (default: disable)"
+  echo "    shared_volume:  mount shared volume. (default: disable)"
+  echo "    delete:         delete deployment and configmap."
+  echo "                    (default: k8s-ml-deployment and k8s-config-map)"
+  echo "    help:           print usage."
+}
+
+# Create temporary directory
+TMP_DIR=$(mktemp -d)
+
+# Temporary k8s yaml file
+YAML_TMP_FILE="${TMP_DIR}/k8s_ml.yaml"
+
+# Temporary hostfile
+HOST_FILE="${TMP_DIR}/hostfile"
+
+# Docker image and number of containers
+DOCKER_IMAGE=""
+NUM_CONTAINERS=0
+
+# Default ssh port
+SSH_PORT=22
+
+# Default config map
+CONFIG_MAP="k8s-config-map"
+
+# Default Deployment
+DEPLOYMENT="k8s-ml-deployment"
+
+# Used for uploading file to all docker containers
+CP=0
+SRC=""
+DEST=""
+
+# Python script to generate yaml file for k8s TensorFlow cluster
+CUR_SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+K8S_GEN_ALLREDUCE_TF_YAML="${CUR_SCRIPT_DIR}/k8s_generate_yaml.py"
+
+# Create or delete tensorflow cluster
+# DELETE=0: Create cluster
+# DELETE=1: Delete cluster
+DELETE=0
+
+# Used to enable host network mode to achieve best performance
+# USE_HOSTNET=0: Flannel network mode
+# USE_HOSTNET=1: Host network mode
+USE_HOSTNET=0
+
+# Used to mount shared volume
+USE_SHARED_VOLUME=0
+
+if [[ $# -lt 1 ]]; then
+  echo "Error: illegal number of parameters"
+  usage
+  exit 1
+fi
+
+while [[ $# -ge 1 ]]; do
+  key="$1"
+  case $key in
+    --image)
+      DOCKER_IMAGE="$2"
+      shift
+      ;;
+    --num_containers)
+      NUM_CONTAINERS="$2"
+      shift
+      ;;
+    --config_map)
+      CONFIG_MAP="$2"
+      shift
+      ;;
+    --deployment)
+      DEPLOYMENT="$2"
+      shift
+      ;;
+    --cp)
+      CP=1
+      ;;
+    --src)
+      SRC="$2"
+      shift
+      ;;
+    --dest)
+      DEST="$2"
+      shift
+      ;;
+    --port)
+      SSH_PORT="$2"
+      shift
+      ;;
+    --hostnet)
+      USE_HOSTNET=1
+      ;;
+    --shared_volume)
+      USE_SHARED_VOLUME=1
+      ;;
+    --delete)
+      DELETE=1
+      ;;
+    --help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $key"
+      usage
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+function generate_yaml_file {
+  if [[ ! -f ${K8S_GEN_ALLREDUCE_TF_YAML} ]]; then
+    echo "Error: can not find yaml-generating script ${K8S_GEN_ALLREDUCE_TF_YAML}"
+    exit 1
+  fi
+
+  echo ""
+  echo "Generating k8s cluster yaml config file with the following settings"
+  echo "  Docker image: ${DOCKER_IMAGE}"
+  echo "  Number of containers: ${NUM_CONTAINERS}"
+  echo "  Config map: ${CONFIG_MAP}"
+  echo "  Deployment: ${DEPLOYMENT}"
+
+  if [[ $USE_HOSTNET -eq 1 ]]; then
+    echo "  Host network mode: True"
+    echo "  Container ssh port: ${SSH_PORT}"
+  fi
+
+  python ${K8S_GEN_ALLREDUCE_TF_YAML} \
+    --docker_image ${DOCKER_IMAGE} \
+    --num_containers ${NUM_CONTAINERS} \
+    --config_map ${CONFIG_MAP} \
+    --deployment ${DEPLOYMENT} \
+    --ssh_port ${SSH_PORT} \
+    --use_hostnet ${USE_HOSTNET} \
+    --use_shared_volume ${USE_SHARED_VOLUME} \
+    > ${YAML_TMP_FILE}
+}
+
+# Note: this function remove the yaml file to make sure that the key automatically
+# generated inside the container is not reused in other deployment
+function remove_yaml_file {
+  rm -rf ${YAML_TMP_FILE}
+}
+
+function upload_file_to_all_containers {
+  ${KUBECTL_BIN} get pods | grep ${DEPLOYMENT} \
+    | awk '{print $1}' | \
+    while read line;
+    do
+      echo "Uploading $1 to $line:$2"
+      ${KUBECTL_BIN} cp $1 $line:$2
+    done
+}
+
+function generate_container_hostfile {
+  # This line assumes that --output=wide prints the IP addresses
+  # in the 6th column
+  ${KUBECTL_BIN} get pods --output=wide | grep ${DEPLOYMENT} \
+      | awk '{print $6}' > ${HOST_FILE}
+
+  echo ""
+  echo "Containers hostfile locates at ${HOST_FILE}"
+}
+
+function launch_container {
+  generate_yaml_file
+  echo ""
+  echo "Launching k8s cluster..."
+  ${KUBECTL_BIN} create -f ${YAML_TMP_FILE}
+  generate_container_hostfile
+  remove_yaml_file
+}
+
+function delete_deployment_configmap {
+  ${KUBECTL_BIN} delete deployment ${DEPLOYMENT}
+  ${KUBECTL_BIN} delete configmap ${CONFIG_MAP}
+}
+
+# Check kubectl binary
+KUBECTL_BIN=kubectl
+if [[ ! -x "$(command -v ${KUBECTL_BIN})" ]]; then
+  echo 'Error: cannot find kubectl binary'
+  exit 1
+fi
+
+if [[ $DELETE -eq 1 ]]; then
+  echo "Deleting deployment ${DEPLOYMENT} and config map ${CONFIG_MAP}..."
+  delete_deployment_configmap
+elif [[ $CP -eq 1 || -n "$SRC" || -n "$DEST" ]] ; then
+  if [[ "$CP" -eq 1 && -n "$SRC" && -n "$DEST" ]]; then
+    upload_file_to_all_containers $SRC $DEST
+  else
+    echo "Error: all cp, src and dest are required to upload file to container"
+    exit 1
+  fi
+else
+  if [[ -z "$DOCKER_IMAGE" ]]; then
+    echo "Error: docker image is missing"
+    exit 1
+  fi
+
+  if [[ "$NUM_CONTAINERS" -le 0 ]]; then
+    echo "Error: illegal number of containers"
+    exit 1
+  fi
+
+  if [[ $USE_HOSTNET -eq 1 && $SSH_PORT -eq 22 ]]; then
+    echo "Error: please set container ssh port with --port (other than 22)" \
+        "when host network mode is enabled"
+    exit 1
+  fi
+
+  launch_container
+fi
diff --git a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml.py b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd3d49af9b3b7ae62447f56e98f34638b25705c1
--- /dev/null
+++ b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml.py
@@ -0,0 +1,93 @@
+#!/usr/bin/python
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Generates YAML configuration file for allreduce-based distributed TensorFlow.
+
+The workers will be run in a Kubernetes (k8s) container cluster.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+
+import k8s_generate_yaml_lib
+
+# Note: It is intentional that we do not import tensorflow in this script. The
+# machine that launches a TensorFlow k8s cluster does not have to have the
+# Python package of TensorFlow installed on it.
+
+DEFAULT_DOCKER_IMAGE = 'tensorflow/tensorflow:latest-devel'
+DEFAULT_PORT = 22
+
+DEFAULT_CONFIG_MAP = 'k8s-config-map'
+DEFAULT_DEPLOYMENT = 'k8s-ml-deployment'
+
+
+def main():
+  """Do arg parsing."""
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--docker_image',
+      type=str,
+      default=DEFAULT_DOCKER_IMAGE,
+      help='Override default docker image for the TensorFlow')
+  parser.add_argument(
+      '--num_containers',
+      type=int,
+      default=0,
+      help='How many docker containers to launch')
+  parser.add_argument(
+      '--config_map',
+      type=str,
+      default=DEFAULT_CONFIG_MAP,
+      help='Override default config map')
+  parser.add_argument(
+      '--deployment',
+      type=str,
+      default=DEFAULT_DEPLOYMENT,
+      help='Override default deployment')
+  parser.add_argument(
+      '--ssh_port',
+      type=int,
+      default=DEFAULT_PORT,
+      help='Override default ssh port (Default: %d)' % DEFAULT_PORT)
+  parser.add_argument(
+      '--use_hostnet',
+      type=int,
+      default=0,
+      help='Used to enable host network mode (Default: 0)')
+  parser.add_argument(
+      '--use_shared_volume',
+      type=int,
+      default=0,
+      help='Used to mount shared volume (Default: 0)')
+  args = parser.parse_args()
+
+  if args.num_containers <= 0:
+    sys.stderr.write('--num_containers must be greater than 0; received %d\n' %
+                     args.num_containers)
+    sys.exit(1)
+
+  # Generate contents of yaml config
+  yaml_config = k8s_generate_yaml_lib.GenerateConfig(
+      args.docker_image, args.num_containers, args.config_map, args.deployment,
+      args.ssh_port, args.use_hostnet, args.use_shared_volume)
+  print(yaml_config)  # pylint: disable=superfluous-parens
+
+
+if __name__ == '__main__':
+  main()
diff --git a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..c570d1a9f834bd9df57df62088a0c4562be9512c
--- /dev/null
+++ b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
@@ -0,0 +1,228 @@
+#!/usr/bin/python
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Generates YAML configuration file for allreduce-based distributed TensorFlow.
+
+The workers will be run in a Kubernetes (k8s) container cluster.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from Crypto.PublicKey import RSA
+
+# Note: It is intentional that we do not import tensorflow in this script. The
+# machine that launches a TensorFlow k8s cluster does not have to have the
+# Python package of TensorFlow installed on it.
+
+CONFIG_MAP = ("""apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {config_map}
+data:
+  privatekey: |+
+    {private_key}
+
+  publickey: |+
+    {public_key}
+
+  start: |+
+    mkdir /root/.ssh
+    mkdir /var/run/sshd
+    cp /tmp/configs/* /root/.ssh
+    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
+    chmod 600 -R /root/.ssh
+    {change_ssh_port}
+    /usr/bin/ssh-keygen -A
+    /usr/sbin/sshd -De
+
+  sshconfig: |+
+    Host *
+      Port {port}
+      StrictHostKeyChecking no
+
+""")
+
+DEPLOYMENT = ("""apiVersion: apps/v1beta1
+kind: Deployment
+metadata:
+  name: {deployment}
+  labels:
+    app: k8s-ml
+spec:
+  replicas: {num_containers}
+  selector:
+    matchLabels:
+      app: k8s-ml
+  template:
+    metadata:
+      labels:
+        app: k8s-ml
+    spec: {hostnet}
+      securityContext:
+        runAsUser: 0
+      containers:
+      - name: ml
+        image: {docker_image}
+        command:
+        - /bin/bash
+        - -x
+        - /tmp/scripts/start.sh
+        ports:
+        - containerPort: {port}
+        env: [{env_vars}]
+        securityContext:
+          privileged: true
+        volumeMounts: {volume_mounts}
+        - name: dshm
+          mountPath: /dev/shm
+        - name: sshkeys
+          mountPath: /tmp/configs
+        - name: scripts
+          mountPath: /tmp/scripts
+      volumes: {volumes}
+      - name: dshm
+        emptyDir:
+          medium: Memory
+      - name: sshkeys
+        configMap:
+          name: {config_map}
+          items:
+          - key: publickey
+            path: id_rsa.pub
+          - key: privatekey
+            path: id_rsa
+          - key: sshconfig
+            path: config
+      - name: scripts
+        configMap:
+          name: {config_map}
+          items:
+          - key: start
+            path: start.sh
+""")
+_ENV_VAR_TEMPLATE = '{name: "%s", value: "%s"}'
+
+
+def GenerateConfig(docker_image,
+                   num_containers,
+                   config_map,
+                   deployment,
+                   port,
+                   use_hostnet,
+                   use_shared_volume,
+                   env_vars=None):
+  """Generate configuration strings.
+
+  Args:
+    docker_image: docker image to use.
+    num_containers: number of containers.
+    config_map: config map.
+    deployment: deployment.
+    port: ssh port.
+    use_hostnet: Used to enable host network mode.
+    use_shared_volume: Used to mount shared volume.
+    env_vars: dictionary of environment variables to set.
+
+  Returns:
+    Kubernetes yaml config.
+  """
+
+  if env_vars is None:
+    env_vars = {}
+  env_str = ', '.join(
+      [_ENV_VAR_TEMPLATE % (name, value) for name, value in env_vars.items()])
+
+  private_key, public_key = generate_RSA(2048)
+
+  CHANGE_SSH_PORT = get_change_ssh_port(use_hostnet, port)
+
+  config = CONFIG_MAP.format(
+      port=port,
+      config_map=config_map,
+      private_key=private_key,
+      public_key=public_key,
+      change_ssh_port=CHANGE_SSH_PORT,
+      env_vars=env_str)
+  config += '---\n\n'
+
+  HOST_NET = get_hostnet(use_hostnet)
+  VOLUME_MOUNTS = get_volume_mounts(use_shared_volume)
+  VOLUMES = get_volumes(use_shared_volume)
+
+  config += DEPLOYMENT.format(
+      deployment=deployment,
+      num_containers=num_containers,
+      docker_image=docker_image,
+      port=port,
+      config_map=config_map,
+      hostnet=HOST_NET,
+      volume_mounts=VOLUME_MOUNTS,
+      volumes=VOLUMES,
+      env_vars=env_str)
+
+  return config
+
+
+def generate_RSA(bits=2048, exponent=65537):
+  key = RSA.generate(bits, e=exponent)
+  pubkey = key.publickey()
+
+  private_key = key.exportKey('PEM')
+  public_key = pubkey.exportKey('OpenSSH')
+
+  # Format private_key in yaml file
+  space_before = ' ' * 4
+  private_key_split = private_key.split('\n')
+  private_key = ''.join(('' if index == 0 else space_before) + line.strip() \
+        + ('\n' if index != len(private_key_split) - 1 else '') \
+        for index, line in enumerate(private_key_split))
+
+  return private_key, public_key
+
+
+def get_change_ssh_port(use_hostnet, port):
+  if use_hostnet == 1:
+    return "sed -i '/Port 22/c\Port {}' /etc/ssh/sshd_config".format(port)
+
+  return ''
+
+
+def get_hostnet(use_hostnet):
+  if use_hostnet == 1:
+    return """
+      hostNetwork: true
+      hostIPC: true"""
+
+  return ''
+
+
+def get_volume_mounts(use_shared_volume):
+  if use_shared_volume == 1:
+    return """
+        - name: shared
+          mountPath: /shared"""
+
+  return ''
+
+
+def get_volumes(use_shared_volume):
+  if use_shared_volume == 1:
+    return """
+       - name: shared
+         hostPath:
+           path: /shared"""
+
+  return ''
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 205128ad58a39d113d4f3414e009f547a5cb38af..6676de02a41dcd53e92b905e98e811cd71833e20 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         curl \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         python \
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 6f8e91fccf88b2d864e834ee151bdd39a97c0657..c256dd364ef5a29ba7f8a2afa6e772ee9c566cb8 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
@@ -9,7 +9,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         python-dev \
@@ -78,7 +78,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.11 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.12 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 69a117fda6d98d8dd9ca4a4c2bd9d3c814ae1b73..7f9b55b45595bc74b51e14883d1fd1dc19b9099c 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -100,7 +100,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.11 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.12 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl
index e433e9ebb2b8fb381826631c683891f30156ec1c..2341c0e8ccfc5f88356ed38f33cca356c207214f 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl
@@ -1,9 +1,9 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
 
 # These parameters can be overridden by parameterized_docker_build.sh
-ARG TF_BUILD_VERSION=r1.11
+ARG TF_BUILD_VERSION=r1.12
 ARG PYTHON="python"
 ARG PYTHON3_DEV=""
 ARG WHL_DIR="/tmp/pip"
@@ -16,7 +16,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         libssl-dev \
         pkg-config \
@@ -115,6 +115,7 @@ RUN export TAG_PREFIX="v" && \
     fi
 
 RUN yes "" | ${PYTHON} configure.py
+RUN cp .bazelrc /root/.bazelrc
 
 ENV CI_BUILD_PYTHON ${PYTHON}
 
@@ -125,7 +126,8 @@ ENV CI_BUILD_PYTHON ${PYTHON}
 # --copt=-march="avx" \
 # For haswell, broadwell, or skylake
 # --copt=-march="avx2" \
-COPY .bazelrc /root/.bazelrc
+COPY .bazelrc /root/.mkl.bazelrc
+RUN echo "import /root/.mkl.bazelrc" >>/root/.bazelrc
 
 RUN tensorflow/tools/ci_build/builds/configured CPU \
     bazel --bazelrc=/root/.bazelrc build -c opt \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
index 48f240056919ec5f6e523d9030bb25b14a849a07..5e24617b2190f1d564d63f4c9be6321aa03cd8fb 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
@@ -1,9 +1,9 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Cong Xu <cong.xu@intel.com>"
 
 # These parameters can be overridden by parameterized_docker_build.sh
-ARG TF_BUILD_VERSION=r1.9
+ARG TF_BUILD_VERSION=r1.11
 ARG PYTHON="python"
 ARG PYTHON3_DEV=""
 ARG WHL_DIR="/tmp/pip"
@@ -16,7 +16,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         python-dev \
@@ -29,7 +29,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         openjdk-8-jdk \
         openjdk-8-jre-headless \
         wget \
-        numactl \
+        libnuma-dev \
         openssh-client \
         openssh-server \
         && \
@@ -106,6 +106,7 @@ RUN export TAG_PREFIX="v" && \
     fi
 
 RUN yes "" | ${PYTHON} configure.py
+RUN cp .bazelrc /root/.bazelrc
 
 ENV CI_BUILD_PYTHON ${PYTHON}
 
@@ -116,7 +117,8 @@ ENV CI_BUILD_PYTHON ${PYTHON}
 # --copt=-march="avx" \
 # For haswell, broadwell, or skylake
 # --copt=-march="avx2" \
-COPY .bazelrc /root/.bazelrc
+COPY .bazelrc /root/.mkl.bazelrc
+RUN echo "import /root/.mkl.bazelrc" >>/root/.bazelrc
 
 RUN tensorflow/tools/ci_build/builds/configured CPU \
     bazel --bazelrc=/root/.bazelrc build -c opt \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl b/tensorflow/tools/docker/Dockerfile.mkl
index ac41cffe4bcc4742d7cc9256b11ceb0964515b84..dad27697fa142ac80d7237510b8b7d7ebda2b621 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl
+++ b/tensorflow/tools/docker/Dockerfile.mkl
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
 
@@ -17,7 +17,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         curl \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         ${PYTHON} \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl-horovod b/tensorflow/tools/docker/Dockerfile.mkl-horovod
index 4daf4fefffa8ddf8cde4eb1ccc56ec714cb202c3..19dc45c62cbc79bf931d89f275b5a7816e9924c8 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.mkl-horovod
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Cong Xu <cong.xu@intel.com>"
 
@@ -6,7 +6,7 @@ LABEL maintainer="Cong Xu <cong.xu@intel.com>"
 ARG TF_WHL_URL
 
 # Optional parameters
-ARG TF_BUILD_VERSION=r1.9
+ARG TF_BUILD_VERSION=r1.11
 ARG PYTHON="python"
 ARG PYTHON_DEV="python-dev"
 ARG PIP="pip"
@@ -17,7 +17,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         curl \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         python \
@@ -25,6 +25,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         rsync \
         software-properties-common \
         unzip \
+        wget \
+        libnuma-dev \
+        openssh-client \
+        openssh-server \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
diff --git a/tensorflow/tools/docker/LICENSE b/tensorflow/tools/docker/LICENSE
index 28711d7885dbc8013847e063fa6e1f922525388f..dea770e05eeb359ba155c1a207f80852ca7d27aa 100644
--- a/tensorflow/tools/docker/LICENSE
+++ b/tensorflow/tools/docker/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2015 The TensorFlow Authors.  All rights reserved.
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index 263f25bc482fec0b2e97780b87360337a2d9dc37..176094cebfe4ad9864bf9c4ac2096f21bc99b83f 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -2,6 +2,9 @@
 
 TensorFlow's Dockerfiles are now located in
 [`tensorflow/tools/dockerfiles/`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles).
+However, these Dockerfiles are still used to build
+[TensorFlow's official Docker images](https://hub.docker.com/r/tensorflow/tensorflow)
+while the internal infrastructure for the newer Dockerfiles is being developed.
 
 This directory will eventually be removed.
 
diff --git a/tensorflow/tools/docker/notebooks/LICENSE b/tensorflow/tools/docker/notebooks/LICENSE
index 28711d7885dbc8013847e063fa6e1f922525388f..dea770e05eeb359ba155c1a207f80852ca7d27aa 100644
--- a/tensorflow/tools/docker/notebooks/LICENSE
+++ b/tensorflow/tools/docker/notebooks/LICENSE
@@ -1,4 +1,4 @@
-Copyright 2015 The TensorFlow Authors.  All rights reserved.
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/tensorflow/tools/dockerfiles/.gitignore b/tensorflow/tools/dockerfiles/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d7efa472a92b23dfde1277acfe4b543f14842678
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/.gitignore
@@ -0,0 +1 @@
+dockerfiles/*.temp.Dockerfile
diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md
index 5996573cf187639fbbbb255a40a876f240a5e443..07bfd5960e686d1198548c080df9c733955a2903 100644
--- a/tensorflow/tools/dockerfiles/README.md
+++ b/tensorflow/tools/dockerfiles/README.md
@@ -1,9 +1,16 @@
 # TensorFlow Dockerfiles
 
-This directory houses TensorFlow's Dockerfiles. **DO NOT EDIT THE DOCKERFILES
-MANUALLY!** They are maintained by `assembler.py`, which builds Dockerfiles from
-the files in `partials/` and the rules in `spec.yml`. See [the Contributing
-section](#contributing) for more information.
+This directory houses TensorFlow's Dockerfiles and the infrastructure used to
+create and deploy them to
+[Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow).
+
+**DO NOT EDIT THE DOCKERFILES/ DIRECTORY MANUALLY!** The files within are
+maintained by `assembler.py`, which builds Dockerfiles from the files in
+`partials/` and the rules in `spec.yml`. See
+[the Contributing section](#contributing) for more information.
+
+These Dockerfiles are planned to replace the Dockerfiles used to generate
+[TensorFlow's official Docker images](https://hub.docker.com/r/tensorflow/tensorflow).
 
 ## Building
 
@@ -17,10 +24,10 @@ $ docker build -f ./dockerfiles/cpu.Dockerfile -t tf .
 Each Dockerfile has its own set of available `--build-arg`s which are documented
 in the Dockerfile itself.
 
-## Running
+## Running Locally Built Images
 
 After building the image with the tag `tf` (for example), use `docker run` to
-run the images. Examples are below.
+run the images.
 
 Note for new Docker users: the `-v` and `-u` flags share directories between
 the Docker container and your machine, and very important. Without
@@ -39,8 +46,10 @@ $ docker run -u $(id -u):$(id -g) -v $(pwd):/my-devel -it tf
 # GPU-based images (set up nvidia-docker2 first)
 $ docker run --runtime=nvidia -u $(id -u):$(id -g) -v $(pwd):/my-devel -it tf
 
-# Images with Jupyter run on port 8888, and needs a volume for notebooks
-$ docker run --user $(id -u):$(id -g) -p 8888:8888 -v $(pwd):/notebooks -it tf
+# Images with Jupyter run on port 8888 and need a volume for your notebooks
+# You can change $(PWD) to the full path to a directory if your notebooks
+# live outside the current directory.
+$ docker run --user $(id -u):$(id -g) -p 8888:8888 -v $(PWD):/tf/notebooks -it tf
 ```
 
 These images do not come with the TensorFlow source code -- but the development
@@ -57,11 +66,32 @@ You can use the `Dockerfile` in this directory to build an editing environment
 that has all of the Python dependencies you'll need:
 
 ```bash
-$ docker build -t tf-assembler -f assembler.Dockerfile .
+# Build the tools-helper image so you can run the assembler
+$ docker build -t tf-tools -f tools.Dockerfile .
 
 # Set --user to set correct permissions on generated files
-$ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-assembler bash 
+$ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-tools bash
+
+# Next you can make a handy alias depending on what you're doing. When building
+# Docker images, you need to run as root with docker.sock mounted so that the
+# container can run Docker commands. When assembling Dockerfiles, though, you'll
+# want to run as your user so that new files have the right permissions.
+
+# If you're BUILDING OR DEPLOYING DOCKER IMAGES, run as root with docker.sock:
+$ alias asm_images="docker run --rm -v $(pwd):/tf -v /var/run/docker.sock:/var/run/docker.sock tf-tools python3 assembler.py "
+
+# If you're REBUILDING OR ADDING DOCKERFILES, remove docker.sock and add -u:
+$ alias asm_dockerfiles="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/tf tf-tools python3 assembler.py "
+
+# Check flags
+$ asm_dockerfiles --help
+
+# Assemble all of the Dockerfiles
+$ asm_dockerfiles --release ubuntu-dockerfiles --construct_dockerfiles
+
+# Build all of the "nightly" images on your local machine:
+$ asm_images --release nightly --build_images
 
-# In the container...
-/tf $ python3 ./assembler.py -o dockerfiles -s spec.yml
+# Build version release for version 99.0, except "gpu" tags:
+$ asm_images --release versioned --arg _TAG_PREFIX=99.0 --build_images --exclude_tags_matching '*.gpu.*'
 ```
diff --git a/tensorflow/tools/dockerfiles/assembler.Dockerfile b/tensorflow/tools/dockerfiles/assembler.Dockerfile
deleted file mode 100644
index 7a8e07fced3465e188f47727013fa92d14424c7c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/assembler.Dockerfile
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# TensorFlow Dockerfile Development Container
-#
-# You can use this image to quickly develop changes to the Dockerfile assembler
-# or set of TF Docker partials. See README.md for usage instructions.
-FROM debian:stretch
-LABEL maintainer="Austin Anderson <angerson@google.com>"
-
-RUN apt-get update && apt-get install -y python3 python3-pip bash
-RUN pip3 install --upgrade pip setuptools pyyaml absl-py cerberus
-
-WORKDIR /tf
-VOLUME ["/tf"]
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py
index 9cdd9bb0cb0841e95d8d334293026207f093ab90..67a0320241d273bbb7a2439b2e09723905db0765 100644
--- a/tensorflow/tools/dockerfiles/assembler.py
+++ b/tensorflow/tools/dockerfiles/assembler.py
@@ -11,63 +11,144 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Assemble common TF Dockerfiles from many parts.
+# ============================================================================
+"""Multipurpose TensorFlow Docker Helper.
 
-This script constructs TF's Dockerfiles by aggregating partial
-Dockerfiles. See README.md for usage examples.
+- Assembles Dockerfiles
+- Builds images (and optionally runs image tests)
+- Pushes images to Docker Hub (provided with credentials)
+
+Read README.md (in this directory) for instructions!
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
 import errno
+import itertools
+import multiprocessing
 import os
-import os.path
 import re
 import shutil
-import textwrap
+import sys
 
 from absl import app
 from absl import flags
 import cerberus
+import docker
 import yaml
 
 FLAGS = flags.FLAGS
 
+flags.DEFINE_string('hub_username', None,
+                    'Dockerhub username, only used with --upload_to_hub')
+
+flags.DEFINE_string(
+    'hub_password', None,
+    ('Dockerhub password, only used with --upload_to_hub. Use from an env param'
+     'so your password isn\'t in your history.'))
+
+flags.DEFINE_integer('hub_timeout', 3600,
+                     'Abort Hub upload if it takes longer than this.')
+
+flags.DEFINE_string(
+    'repository', 'tensorflow',
+    'Tag local images as {repository}:tag (in addition to the '
+    'hub_repository, if uploading to hub)')
+
+flags.DEFINE_string(
+    'hub_repository', None,
+    'Push tags to this Docker Hub repository, e.g. tensorflow/tensorflow')
+
+flags.DEFINE_boolean(
+    'upload_to_hub',
+    False,
+    ('Push built images to Docker Hub (you must also provide --hub_username, '
+     '--hub_password, and --hub_repository)'),
+    short_name='u',
+)
+
+flags.DEFINE_boolean(
+    'construct_dockerfiles', False, 'Do not build images', short_name='d')
+
+flags.DEFINE_boolean(
+    'keep_temp_dockerfiles',
+    False,
+    'Retain .temp.Dockerfiles created while building images.',
+    short_name='k')
+
 flags.DEFINE_boolean(
-    'dry_run', False, 'Do not actually generate Dockerfiles', short_name='n')
+    'build_images', False, 'Do not build images', short_name='b')
 
 flags.DEFINE_string(
-    'spec_file',
-    './spec.yml',
-    'Path to a YAML specification file',
-    short_name='s')
+    'run_tests_path', None,
+    ('Execute test scripts on generated Dockerfiles before pushing them. '
+     'Flag value must be a full path to the "tests" directory, which is usually'
+     ' $(realpath ./tests). A failed tests counts the same as a failed build.'))
+
+flags.DEFINE_boolean(
+    'stop_on_failure', False,
+    ('Stop processing tags if any one build fails. If False or not specified, '
+     'failures are reported but do not affect the other images.'))
+
+flags.DEFINE_boolean(
+    'dry_run',
+    False,
+    'Do not build or deploy anything at all.',
+    short_name='n',
+)
 
 flags.DEFINE_string(
-    'output_dir',
-    './dockerfiles', ('Path to an output directory for Dockerfiles. '
-                      'Will be created if it doesn\'t exist.'),
+    'exclude_tags_matching',
+    None,
+    ('Regular expression that skips processing on any tag it matches. Must '
+     'match entire string, e.g. ".*gpu.*" ignores all GPU tags.'),
+    short_name='x')
+
+flags.DEFINE_string(
+    'only_tags_matching',
+    None,
+    ('Regular expression that skips processing on any tag it does not match. '
+     'Must match entire string, e.g. ".*gpu.*" includes only GPU tags.'),
+    short_name='i')
+
+flags.DEFINE_string(
+    'dockerfile_dir',
+    './dockerfiles', 'Path to an output directory for Dockerfiles.'
+    ' Will be created if it doesn\'t exist.'
+    ' Existing files in this directory will be deleted when new Dockerfiles'
+    ' are made.',
     short_name='o')
 
 flags.DEFINE_string(
     'partial_dir',
     './partials',
-    'Path to a directory containing foo.partial.Dockerfile partial files.',
+    'Path to a directory containing foo.partial.Dockerfile partial files.'
+    ' can have subdirectories, e.g. "bar/baz.partial.Dockerfile".',
     short_name='p')
 
-flags.DEFINE_boolean(
-    'quiet_dry_run',
-    True,
-    'Do not print contents of dry run Dockerfiles.',
-    short_name='q')
+flags.DEFINE_multi_string(
+    'release', [],
+    'Set of releases to build and tag. Defaults to every release type.',
+    short_name='r')
 
-flags.DEFINE_boolean(
-    'validate', True, 'Validate generated Dockerfiles', short_name='c')
+flags.DEFINE_multi_string(
+    'arg', [],
+    ('Extra build arguments. These are used for expanding tag names if needed '
+     '(e.g. --arg _TAG_PREFIX=foo) and for using as build arguments (unused '
+     'args will print a warning).'),
+    short_name='a')
 
-# Schema to verify the contents of spec.yml with Cerberus.
+flags.DEFINE_string(
+    'spec_file',
+    './spec.yml',
+    'Path to the YAML specification file',
+    short_name='s')
+
+# Schema to verify the contents of tag-spec.yml with Cerberus.
 # Must be converted to a dict from yaml to work.
 # Note: can add python references with e.g.
 # !!python/name:builtins.str
@@ -76,79 +157,78 @@ SCHEMA_TEXT = """
 header:
   type: string
 
-partials:
+slice_sets:
   type: dict
   keyschema:
     type: string
   valueschema:
-    type: dict
-    schema:
-      desc:
-        type: string
-      args:
+     type: list
+     schema:
         type: dict
-        keyschema:
-          type: string
-        valueschema:
-          anyof:
-            - type: [ boolean, number, string ]
-            - type: dict
-              schema:
-                 default:
-                    type: [ boolean, number, string ]
-                 desc:
-                    type: string
-                 options:
-                    type: list
-                    schema:
-                       type: string
-
-images:
+        schema:
+           add_to_name:
+             type: string
+           dockerfile_exclusive_name:
+             type: string
+           dockerfile_subdirectory:
+             type: string
+           partials:
+             type: list
+             schema:
+               type: string
+               ispartial: true
+           test_runtime:
+             type: string
+             required: false
+           tests:
+             type: list
+             default: []
+             schema:
+               type: string
+           args:
+             type: list
+             default: []
+             schema:
+               type: string
+               isfullarg: true
+
+releases:
+  type: dict
   keyschema:
     type: string
   valueschema:
     type: dict
     schema:
-      desc:
-        type: string
-      arg-defaults:
-        type: list
-        schema:
-          anyof:
-            - type: dict
-              keyschema:
-                type: string
-                arg_in_use: true
-              valueschema:
-                type: string
-            - type: string
-              isimage: true
-      create-dockerfile:
+      is_dockerfiles:
         type: boolean
-      partials:
+        required: false
+        default: false
+      upload_images:
+        type: boolean
+        required: false
+        default: true
+      tag_specs:
         type: list
+        required: true
         schema:
-          anyof:
-            - type: dict
-              keyschema:
-                type: string
-                regex: image
-              valueschema:
-                type: string
-                isimage: true
-            - type: string
-              ispartial: true
+          type: string
 """
 
 
-class TfDockerValidator(cerberus.Validator):
-  """Custom Cerberus validator for TF dockerfile spec.
+class TfDockerTagValidator(cerberus.Validator):
+  """Custom Cerberus validator for TF tag spec.
 
   Note: Each _validate_foo function's docstring must end with a segment
   describing its own validation schema, e.g. "The rule's arguments are...". If
   you add a new validator, you can copy/paste that section.
   """
 
+  def __init__(self, *args, **kwargs):
+    # See http://docs.python-cerberus.org/en/stable/customize.html
+    if 'partials' in kwargs:
+      self.partials = kwargs['partials']
+    super(cerberus.Validator, self).__init__(*args, **kwargs)
+
   def _validate_ispartial(self, ispartial, field, value):
     """Validate that a partial references an existing partial spec.
 
@@ -156,398 +236,431 @@ class TfDockerValidator(cerberus.Validator):
       ispartial: Value of the rule, a bool
       field: The field being validated
       value: The field's value
-
     The rule's arguments are validated against this schema:
     {'type': 'boolean'}
     """
-    if ispartial and value not in self.root_document.get('partials', dict()):
-      self._error(field, '{} is not an existing partial.'.format(value))
+    if ispartial and value not in self.partials:
+      self._error(field,
+                  '{} is not present in the partials directory.'.format(value))
 
-  def _validate_isimage(self, isimage, field, value):
-    """Validate that an image references an existing partial spec.
+  def _validate_isfullarg(self, isfullarg, field, value):
+    """Validate that a string is either a FULL=arg or NOT.
 
     Args:
-      isimage: Value of the rule, a bool
+      isfullarg: Value of the rule, a bool
       field: The field being validated
       value: The field's value
-
     The rule's arguments are validated against this schema:
     {'type': 'boolean'}
     """
-    if isimage and value not in self.root_document.get('images', dict()):
-      self._error(field, '{} is not an existing image.'.format(value))
-
-  def _validate_arg_in_use(self, arg_in_use, field, value):
-    """Validate that an arg references an existing partial spec's args.
-
-    Args:
-      arg_in_use: Value of the rule, a bool
-      field: The field being validated
-      value: The field's value
-
-    The rule's arguments are validated against this schema:
-    {'type': 'boolean'}
-    """
-    if arg_in_use:
-      for partial in self.root_document.get('partials', dict()).values():
-        if value in partial.get('args', tuple()):
-          return
-
-      self._error(field, '{} is not an arg used in any partial.'.format(value))
-
-
-def build_partial_description(partial_spec):
-  """Create the documentation lines for a specific partial.
-
-  Generates something like this:
-
-    # This is the partial's description, from spec.yml.
-    # --build-arg ARG_NAME=argdefault
-    #    this is one of the args.
-    # --build-arg ANOTHER_ARG=(some|choices)
-    #    another arg.
+    if isfullarg and '=' not in value:
+      self._error(field, '{} should be of the form ARG=VALUE.'.format(value))
+    if not isfullarg and '=' in value:
+      self._error(field, '{} should be of the form ARG (no =).'.format(value))
 
-  Args:
-    partial_spec: A dict representing one of the partials from spec.yml. Doesn't
-      include the name of the partial; is a dict like { desc: ..., args: ... }.
-
-  Returns:
-    A commented string describing this partial.
-  """
 
-  # Start from linewrapped desc field
-  lines = []
-  wrapper = textwrap.TextWrapper(
-      initial_indent='# ', subsequent_indent='# ', width=80)
-  description = wrapper.fill(partial_spec.get('desc', '( no comments )'))
-  lines.extend(['#', description])
-
-  # Document each arg
-  for arg, arg_data in partial_spec.get('args', dict()).items():
-    # Wrap arg description with comment lines
-    desc = arg_data.get('desc', '( no description )')
-    desc = textwrap.fill(
-        desc,
-        initial_indent='#    ',
-        subsequent_indent='#    ',
-        width=80,
-        drop_whitespace=False)
-
-    # Document (each|option|like|this)
-    if 'options' in arg_data:
-      arg_options = ' ({})'.format('|'.join(arg_data['options']))
-    else:
-      arg_options = ''
+def eprint(*args, **kwargs):
+  print(*args, file=sys.stderr, flush=True, **kwargs)
 
-    # Add usage sample
-    arg_use = '# --build-arg {}={}{}'.format(arg,
-                                             arg_data.get('default', '(unset)'),
-                                             arg_options)
-    lines.extend([arg_use, desc])
 
-  return '\n'.join(lines)
+def aggregate_all_slice_combinations(spec, slice_set_names):
+  """Figure out all of the possible slice groupings for a tag spec."""
+  slice_sets = copy.deepcopy(spec['slice_sets'])
 
+  for name in slice_set_names:
+    for slice_set in slice_sets[name]:
+      slice_set['set_name'] = name
 
-def construct_contents(partial_specs, image_spec):
-  """Assemble the dockerfile contents for an image spec.
+  slices_grouped_but_not_keyed = [slice_sets[name] for name in slice_set_names]
+  all_slice_combos = list(itertools.product(*slices_grouped_but_not_keyed))
+  return all_slice_combos
 
-  It assembles a concrete list of partial references into a single, large
-  string.
-  Also expands argument defaults, so that the resulting Dockerfile doesn't have
-  to be configured with --build-arg=... every time. That is, any ARG directive
-  will be updated with a new default value.
 
-  Args:
-    partial_specs: The dict from spec.yml["partials"].
-    image_spec: One of the dict values from spec.yml["images"].
+def build_name_from_slices(format_string, slices, args, is_dockerfile=False):
+  """Build the tag name (cpu-devel...) from a list of slices."""
+  name_formatter = copy.deepcopy(args)
+  name_formatter.update({s['set_name']: s['add_to_name'] for s in slices})
+  name_formatter.update({
+      s['set_name']: s['dockerfile_exclusive_name']
+      for s in slices
+      if is_dockerfile and 'dockerfile_exclusive_name' in s
+  })
+  name = format_string.format(**name_formatter)
+  return name
 
-  Returns:
-    A string containing a valid Dockerfile based on the partials listed in
-    image_spec.
-  """
-  processed_partial_strings = []
-  for partial_name in image_spec['partials']:
-    # Apply image arg-defaults to existing arg defaults
-    partial_spec = copy.deepcopy(partial_specs[partial_name])
-    args = partial_spec.get('args', dict())
-    for k_v in image_spec.get('arg-defaults', []):
-      arg, value = list(k_v.items())[0]
-      if arg in args:
-        args[arg]['default'] = value
-
-    # Read partial file contents
-    filename = partial_spec.get('file', partial_name)
-    partial_path = os.path.join(FLAGS.partial_dir,
-                                '{}.partial.Dockerfile'.format(filename))
-    with open(partial_path, 'r') as f_partial:
-      partial_contents = f_partial.read()
-
-    # Replace ARG FOO=BAR with ARG FOO=[new-default]
-    for arg, arg_data in args.items():
-      if 'default' in arg_data and arg_data['default']:
-        default = '={}'.format(arg_data['default'])
-      else:
-        default = ''
-      partial_contents = re.sub(r'ARG {}.*'.format(arg), 'ARG {}{}'.format(
-          arg, default), partial_contents)
-
-    # Store updated partial contents
-    processed_partial_strings.append(partial_contents)
-
-  # Join everything together
-  return '\n'.join(processed_partial_strings)
 
-
-def mkdir_p(path):
-  """Create a directory and its parents, even if it already exists."""
-  try:
-    os.makedirs(path)
-  except OSError as e:
-    if e.errno != errno.EEXIST:
-      raise
+def update_args_dict(args_dict, updater):
+  """Update a dict of arg values with more values from a list or dict."""
+  if isinstance(updater, list):
+    for arg in updater:
+      key, sep, value = arg.partition('=')
+      if sep == '=':
+        args_dict[key] = value
+  if isinstance(updater, dict):
+    for key, value in updater.items():
+      args_dict[key] = value
+  return args_dict
 
 
-def construct_documentation(header, partial_specs, image_spec):
-  """Assemble all of the documentation for a single dockerfile.
+def get_slice_sets_and_required_args(slice_sets, tag_spec):
+  """Extract used-slice-sets and required CLI arguments from a spec string.
 
-  Builds explanations of included partials and available build args.
+  For example, {FOO}{bar}{bat} finds FOO, bar, and bat. Assuming bar and bat
+  are both named slice sets, FOO must be specified on the command line.
 
   Args:
-    header: The string from spec.yml["header"]; will be commented and wrapped.
-    partial_specs: The dict from spec.yml["partials"].
-    image_spec: The spec for the dockerfile being built.
+     slice_sets: Dict of named slice sets
+     tag_spec: The tag spec string, e.g. {_FOO}{blep}
 
   Returns:
-    A string containing a commented header that documents the contents of the
-    dockerfile.
-
+     (used_slice_sets, required_args), a tuple of lists
   """
-  # Comment and wrap header and image description
-  commented_header = '\n'.join(
-      [('# ' + l).rstrip() for l in header.splitlines()])
-  commented_desc = '\n'.join(
-      ['# ' + l for l in image_spec.get('desc', '').splitlines()])
-  partial_descriptions = []
-
-  # Build documentation for each partial in the image
-  for partial in image_spec['partials']:
-    # Copy partial data for default args unique to this image
-    partial_spec = copy.deepcopy(partial_specs[partial])
-    args = partial_spec.get('args', dict())
-
-    # Overwrite any existing arg defaults
-    for k_v in image_spec.get('arg-defaults', []):
-      arg, value = list(k_v.items())[0]
-      if arg in args:
-        args[arg]['default'] = value
-
-    # Build the description from new args
-    partial_description = build_partial_description(partial_spec)
-    partial_descriptions.append(partial_description)
-
-  contents = [commented_header, '#', commented_desc] + partial_descriptions
-  return '\n'.join(contents) + '\n'
-
-
-def normalize_partial_args(partial_specs):
-  """Normalize the shorthand form of a partial's args specification.
-
-  Turns this:
-
-    partial:
-      args:
-        SOME_ARG: arg_value
-
-  Into this:
-
-    partial:
-       args:
-         SOME_ARG:
-            default: arg_value
-
-  Args:
-    partial_specs: The dict from spec.yml["partials"]. This dict is modified in
-      place.
-
-  Returns:
-    The modified contents of partial_specs.
-
-  """
-  for _, partial in partial_specs.items():
-    args = partial.get('args', dict())
-    for arg, value in args.items():
-      if not isinstance(value, dict):
-        new_value = {'default': value}
-        args[arg] = new_value
-
-  return partial_specs
-
-
-def flatten_args_references(image_specs):
-  """Resolve all default-args in each image spec to a concrete dict.
-
-  Turns this:
-
-    example-image:
-      arg-defaults:
-        - MY_ARG: ARG_VALUE
-
-    another-example:
-      arg-defaults:
-        - ANOTHER_ARG: ANOTHER_VALUE
-        - example_image
-
-  Into this:
+  required_args = []
+  used_slice_sets = []
+
+  extract_bracketed_words = re.compile(r'\{([^}]+)\}')
+  possible_args_or_slice_set_names = extract_bracketed_words.findall(tag_spec)
+  for name in possible_args_or_slice_set_names:
+    if name in slice_sets:
+      used_slice_sets.append(name)
+    else:
+      required_args.append(name)
 
-    example-image:
-      arg-defaults:
-        - MY_ARG: ARG_VALUE
+  return (used_slice_sets, required_args)
 
-    another-example:
-      arg-defaults:
-        - ANOTHER_ARG: ANOTHER_VALUE
-        - MY_ARG: ARG_VALUE
 
-  Args:
-    image_specs: A dict of image_spec dicts; should be the contents of the
-      "images" key in the global spec.yaml. This dict is modified in place and
-      then returned.
-
-  Returns:
-    The modified contents of image_specs.
-  """
-  for _, image_spec in image_specs.items():
-    too_deep = 0
-    while str in map(type, image_spec.get('arg-defaults', [])) and too_deep < 5:
-      new_args = []
-      for arg in image_spec['arg-defaults']:
-        if isinstance(arg, str):
-          new_args.extend(image_specs[arg]['arg-defaults'])
-        else:
-          new_args.append(arg)
+def gather_tag_args(slices, cli_input_args, required_args):
+  """Build a dictionary of all the CLI and slice-specified args for a tag."""
+  args = dict()
 
-      image_spec['arg-defaults'] = new_args
-      too_deep += 1
+  for s in slices:
+    args = update_args_dict(args, s['args'])
 
-  return image_specs
+  args = update_args_dict(args, cli_input_args)
+  for arg in required_args:
+    if arg not in args:
+      eprint(('> Error: {} is not a valid slice_set, and also isn\'t an arg '
+              'provided on the command line. If it is an arg, please specify '
+              'it with --arg. If not, check the slice_sets list.'.format(arg)))
+      exit(1)
 
+  return args
 
-def flatten_partial_references(image_specs):
-  """Resolve all partial references in each image spec to a concrete list.
 
-  Turns this:
+def gather_slice_list_items(slices, key):
+  """For a list of slices, get the flattened list of all of a certain key."""
+  return list(itertools.chain(*[s[key] for s in slices if key in s]))
 
-    example-image:
-      partials:
-        - foo
 
-    another-example:
-      partials:
-        - bar
-        - image: example-image
-        - bat
+def find_first_slice_value(slices, key):
+  """For a list of slices, get the first value for a certain key."""
+  for s in slices:
+    if key in s and s[key] is not None:
+      return s[key]
+  return None
 
-  Into this:
 
-    example-image:
-      partials:
-        - foo
+def assemble_tags(spec, cli_args, enabled_releases, all_partials):
+  """Gather all the tags based on our spec.
 
-    another-example:
-      partials:
-        - bar
-        - foo
-        - bat
   Args:
-    image_specs: A dict of image_spec dicts; should be the contents of the
-      "images" key in the global spec.yaml. This dict is modified in place and
-      then returned.
+    spec: Nested dict containing full Tag spec
+    cli_args: List of ARG=foo arguments to pass along to Docker build
+    enabled_releases: List of releases to parse. Empty list = all
+    all_partials: Dict of every partial, for reference
 
   Returns:
-    The modified contents of image_specs.
+    Dict of tags and how to build them
   """
-  for _, image_spec in image_specs.items():
-    too_deep = 0
-    while dict in map(type, image_spec['partials']) and too_deep < 5:
-      new_partials = []
-      for partial in image_spec['partials']:
-        if isinstance(partial, str):
-          new_partials.append(partial)
-        else:
-          new_partials.extend(image_specs[partial['image']]['partials'])
+  tag_data = collections.defaultdict(list)
+
+  for name, release in spec['releases'].items():
+    for tag_spec in release['tag_specs']:
+      if enabled_releases and name not in enabled_releases:
+        eprint('> Skipping release {}'.format(name))
+        continue
+
+      used_slice_sets, required_cli_args = get_slice_sets_and_required_args(
+          spec['slice_sets'], tag_spec)
+
+      slice_combos = aggregate_all_slice_combinations(spec, used_slice_sets)
+      for slices in slice_combos:
+
+        tag_args = gather_tag_args(slices, cli_args, required_cli_args)
+        tag_name = build_name_from_slices(tag_spec, slices, tag_args,
+                                          release['is_dockerfiles'])
+        used_partials = gather_slice_list_items(slices, 'partials')
+        used_tests = gather_slice_list_items(slices, 'tests')
+        test_runtime = find_first_slice_value(slices, 'test_runtime')
+        dockerfile_subdirectory = find_first_slice_value(
+            slices, 'dockerfile_subdirectory')
+        dockerfile_contents = merge_partials(spec['header'], used_partials,
+                                             all_partials)
+
+        tag_data[tag_name].append({
+            'release': name,
+            'tag_spec': tag_spec,
+            'is_dockerfiles': release['is_dockerfiles'],
+            'upload_images': release['upload_images'],
+            'cli_args': tag_args,
+            'dockerfile_subdirectory': dockerfile_subdirectory or '',
+            'partials': used_partials,
+            'tests': used_tests,
+            'test_runtime': test_runtime,
+            'dockerfile_contents': dockerfile_contents,
+        })
+
+  return tag_data
+
+
+def merge_partials(header, used_partials, all_partials):
+  """Merge all partial contents with their header."""
+  used_partials = list(used_partials)
+  return '\n'.join([header] + [all_partials[u] for u in used_partials])
+
+
+def upload_in_background(hub_repository, dock, image, tag):
+  """Upload a docker image (to be used by multiprocessing)."""
+  image.tag(hub_repository, tag=tag)
+  print(dock.images.push(hub_repository, tag=tag))
 
-      image_spec['partials'] = new_partials
-      too_deep += 1
 
-  return image_specs
+def mkdir_p(path):
+  """Create a directory and its parents, even if it already exists."""
+  try:
+    os.makedirs(path)
+  except OSError as e:
+    if e.errno != errno.EEXIST:
+      raise
 
 
-def construct_dockerfiles(tf_spec):
-  """Generate a mapping of {"cpu": <cpu dockerfile contents>, ...}.
+def gather_existing_partials(partial_path):
+  """Find and read all available partials.
 
   Args:
-    tf_spec: The full spec.yml loaded as a python object.
+    partial_path (string): read partials from this directory.
 
   Returns:
-    A string:string dict of short names ("cpu-devel") to Dockerfile contents.
+    Dict[string, string] of partial short names (like "ubuntu/python" or
+      "bazel") to the full contents of that partial.
   """
-  names_to_contents = dict()
-  image_specs = tf_spec['images']
-  image_specs = flatten_partial_references(image_specs)
-  image_specs = flatten_args_references(image_specs)
-  partial_specs = tf_spec['partials']
-  partial_specs = normalize_partial_args(partial_specs)
-
-  for name, image_spec in image_specs.items():
-    if not image_spec.get('create-dockerfile', True):
-      continue
-    documentation = construct_documentation(tf_spec['header'], partial_specs,
-                                            image_spec)
-    contents = construct_contents(partial_specs, image_spec)
-    names_to_contents[name] = '\n'.join([documentation, contents])
-
-  return names_to_contents
+  partials = dict()
+  for path, _, files in os.walk(partial_path):
+    for name in files:
+      fullpath = os.path.join(path, name)
+      if '.partial.Dockerfile' not in fullpath:
+        eprint(('> Probably not a problem: skipping {}, which is not a '
+                'partial.').format(fullpath))
+        continue
+      # partial_dir/foo/bar.partial.Dockerfile -> foo/bar
+      simple_name = fullpath[len(partial_path) + 1:-len('.partial.dockerfile')]
+      with open(fullpath, 'r') as f:
+        partial_contents = f.read()
+      partials[simple_name] = partial_contents
+  return partials
 
 
 def main(argv):
   if len(argv) > 1:
-    raise app.UsageError('Unexpected command line args found: {}'.format(argv))
+    raise app.UsageError('Too many command-line arguments.')
 
+  # Read the full spec file, used for everything
   with open(FLAGS.spec_file, 'r') as spec_file:
-    tf_spec = yaml.load(spec_file)
+    tag_spec = yaml.load(spec_file)
+
+  # Get existing partial contents
+  partials = gather_existing_partials(FLAGS.partial_dir)
 
   # Abort if spec.yaml is invalid
-  if FLAGS.validate:
-    schema = yaml.load(SCHEMA_TEXT)
-    v = TfDockerValidator(schema)
-    if not v.validate(tf_spec):
-      print('>> ERROR: {} is an invalid spec! The errors are:'.format(
-          FLAGS.spec_file))
-      print(yaml.dump(v.errors, indent=2))
+  schema = yaml.load(SCHEMA_TEXT)
+  v = TfDockerTagValidator(schema, partials=partials)
+  if not v.validate(tag_spec):
+    eprint('> Error: {} is an invalid spec! The errors are:'.format(
+        FLAGS.spec_file))
+    eprint(yaml.dump(v.errors, indent=2))
+    exit(1)
+  tag_spec = v.normalized(tag_spec)
+
+  # Assemble tags and images used to build them
+  all_tags = assemble_tags(tag_spec, FLAGS.arg, FLAGS.release, partials)
+
+  # Empty Dockerfile directory if building new Dockerfiles
+  if FLAGS.construct_dockerfiles:
+    eprint('> Emptying Dockerfile dir "{}"'.format(FLAGS.dockerfile_dir))
+    shutil.rmtree(FLAGS.dockerfile_dir, ignore_errors=True)
+    mkdir_p(FLAGS.dockerfile_dir)
+
+  # Set up Docker helper
+  dock = docker.from_env()
+
+  # Login to Docker if uploading images
+  if FLAGS.upload_to_hub:
+    if not FLAGS.hub_username:
+      eprint('> Error: please set --hub_username when uploading to Dockerhub.')
       exit(1)
-  else:
-    print('>> WARNING: Not validating {}'.format(FLAGS.spec_file))
-
-  # Generate mapping of { "cpu-devel": "<cpu-devel dockerfile contents>", ... }
-  names_to_contents = construct_dockerfiles(tf_spec)
-
-  # Write each completed Dockerfile
-  if not FLAGS.dry_run:
-    print('>> Emptying destination dir "{}"'.format(FLAGS.output_dir))
-    shutil.rmtree(FLAGS.output_dir, ignore_errors=True)
-    mkdir_p(FLAGS.output_dir)
-  else:
-    print('>> Skipping creation of {} (dry run)'.format(FLAGS.output_dir))
-  for name, contents in names_to_contents.items():
-    path = os.path.join(FLAGS.output_dir, name + '.Dockerfile')
-    if FLAGS.dry_run:
-      print('>> Skipping writing contents of {} (dry run)'.format(path))
-      print(contents)
-    else:
-      mkdir_p(FLAGS.output_dir)
-      print('>> Writing {}'.format(path))
-      with open(path, 'w') as f:
-        f.write(contents)
+    if not FLAGS.hub_repository:
+      eprint(
+          '> Error: please set --hub_repository when uploading to Dockerhub.')
+      exit(1)
+    if not FLAGS.hub_password:
+      eprint('> Error: please set --hub_password when uploading to Dockerhub.')
+      exit(1)
+    dock.login(
+        username=FLAGS.hub_username,
+        password=FLAGS.hub_password,
+    )
+
+  # Each tag has a name ('tag') and a definition consisting of the contents
+  # of its Dockerfile, its build arg list, etc.
+  failed_tags = []
+  for tag, tag_defs in all_tags.items():
+    for tag_def in tag_defs:
+      eprint('> Working on {}'.format(tag))
+
+      if FLAGS.exclude_tags_matching and re.match(FLAGS.exclude_tags_matching,
+                                                  tag):
+        eprint('>> Excluded due to match against "{}".'.format(
+            FLAGS.exclude_tags_matching))
+        continue
+
+      if FLAGS.only_tags_matching and not re.match(FLAGS.only_tags_matching,
+                                                   tag):
+        eprint('>> Excluded due to failure to match against "{}".'.format(
+            FLAGS.only_tags_matching))
+        continue
+
+      # Write releases marked "is_dockerfiles" into the Dockerfile directory
+      if FLAGS.construct_dockerfiles and tag_def['is_dockerfiles']:
+        path = os.path.join(FLAGS.dockerfile_dir,
+                            tag_def['dockerfile_subdirectory'],
+                            tag + '.Dockerfile')
+        eprint('>> Writing {}...'.format(path))
+        if not FLAGS.dry_run:
+          mkdir_p(os.path.dirname(path))
+          with open(path, 'w') as f:
+            f.write(tag_def['dockerfile_contents'])
+
+      # Don't build any images for dockerfile-only releases
+      if not FLAGS.build_images:
+        continue
+
+      # Generate a temporary Dockerfile to use to build, since docker-py
+      # needs a filepath relative to the build context (i.e. the current
+      # directory)
+      dockerfile = os.path.join(FLAGS.dockerfile_dir, tag + '.temp.Dockerfile')
+      if not FLAGS.dry_run:
+        with open(dockerfile, 'w') as f:
+          f.write(tag_def['dockerfile_contents'])
+      eprint('>> (Temporary) writing {}...'.format(dockerfile))
+
+      repo_tag = '{}:{}'.format(FLAGS.repository, tag)
+      eprint('>> Building {} using build args:'.format(repo_tag))
+      for arg, value in tag_def['cli_args'].items():
+        eprint('>>> {}={}'.format(arg, value))
+
+      # Note that we are NOT using cache_from, which appears to limit
+      # available cache layers to those from explicitly specified layers. Many
+      # of our layers are similar between local builds, so we want to use the
+      # implied local build cache.
+      tag_failed = False
+      image, logs = None, []
+      if not FLAGS.dry_run:
+        try:
+          image, logs = dock.images.build(
+              timeout=FLAGS.hub_timeout,
+              path='.',
+              dockerfile=dockerfile,
+              buildargs=tag_def['cli_args'],
+              tag=repo_tag)
+
+          # Print logs after finishing
+          log_lines = [l.get('stream', '') for l in logs]
+          eprint(''.join(log_lines))
+
+          # Run tests if requested, and dump output
+          # Could be improved by backgrounding, but would need better
+          # multiprocessing support to track failures properly.
+          if FLAGS.run_tests_path:
+            if not tag_def['tests']:
+              eprint('>>> No tests to run.')
+            for test in tag_def['tests']:
+              eprint('>> Testing {}...'.format(test))
+              container, = dock.containers.run(
+                  image,
+                  '/tests/' + test,
+                  working_dir='/',
+                  log_config={'type': 'journald'},
+                  detach=True,
+                  stderr=True,
+                  stdout=True,
+                  volumes={
+                      FLAGS.run_tests_path: {
+                          'bind': '/tests',
+                          'mode': 'ro'
+                      }
+                  },
+                  runtime=tag_def['test_runtime']),
+              ret = container.wait()
+              code = ret['StatusCode']
+              out = container.logs(stdout=True, stderr=False)
+              err = container.logs(stdout=False, stderr=True)
+              container.remove()
+              if out:
+                eprint('>>> Output stdout:')
+                eprint(out.decode('utf-8'))
+              else:
+                eprint('>>> No test standard out.')
+              if err:
+                eprint('>>> Output stderr:')
+                eprint(out.decode('utf-8'))
+              else:
+                eprint('>>> No test standard err.')
+              if code != 0:
+                eprint('>> {} failed tests with status: "{}"'.format(
+                    repo_tag, code))
+                failed_tags.append(tag)
+                tag_failed = True
+                if FLAGS.stop_on_failure:
+                  eprint('>> ABORTING due to --stop_on_failure!')
+                  exit(1)
+              else:
+                eprint('>> Tests look good!')
+
+        except docker.errors.BuildError as e:
+          eprint('>> {} failed to build with message: "{}"'.format(
+              repo_tag, e.msg))
+          eprint('>> Build logs follow:')
+          log_lines = [l.get('stream', '') for l in e.build_log]
+          eprint(''.join(log_lines))
+          failed_tags.append(tag)
+          tag_failed = True
+          if FLAGS.stop_on_failure:
+            eprint('>> ABORTING due to --stop_on_failure!')
+            exit(1)
+
+        # Clean temporary dockerfiles if they were created earlier
+        if not FLAGS.keep_temp_dockerfiles:
+          os.remove(dockerfile)
+
+      # Upload new images to DockerHub as long as they built + passed tests
+      if FLAGS.upload_to_hub:
+        if not tag_def['upload_images']:
+          continue
+        if tag_failed:
+          continue
+
+        eprint('>> Uploading to {}:{}'.format(FLAGS.hub_repository, tag))
+        if not FLAGS.dry_run:
+          p = multiprocessing.Process(
+              target=upload_in_background,
+              args=(FLAGS.hub_repository, dock, image, tag))
+          p.start()
+
+  if failed_tags:
+    eprint(
+        '> Some tags failed to build or failed testing, check scrollback for '
+        'errors: {}'.format(','.join(failed_tags)))
+    exit(1)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile
deleted file mode 100644
index dbbad7d03afa4fa6e6c39bb04818aa6f3df146d7..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for developing changes for TensorFlow, with Jupyter included.
-#
-# Start from Ubuntu, with TF development packages (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
-
-ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        python-dev \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-ARG USE_PYTHON_3_NOT_2=True
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-# Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
-
-RUN ${PIP} install jupyter
-
-RUN mkdir /notebooks && chmod a+rwx /notebooks
-RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
-EXPOSE 8888
-
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile
deleted file mode 100644
index 160d7c02e2909c4265a68784b7f773edd19b4191..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for developing changes for TensorFlow.
-#
-# Start from Ubuntu, with TF development packages (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
-
-ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        python-dev \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-ARG USE_PYTHON_3_NOT_2=True
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-# Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
index 8d5d653ab7973e9195db58723d7cfa57e252e165..d8fabadec280cc136bd6cc9a30e79390a9a167bd 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
@@ -16,54 +16,56 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for using TensorFlow, with Jupyter included.
-#
-# Start from Ubuntu (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
 
-ARG USE_PYTHON_3_NOT_2=True
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
 
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
 RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
 ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
index 35c41b49fd10bb98f557746c48bae9984b00c167..857b5e20471a82bd162e55b146854d0a5c165db8 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
@@ -16,41 +16,37 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for using TensorFlow
-#
-# Start from Ubuntu (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
 
-ARG USE_PYTHON_3_NOT_2=True
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
 
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
 RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
 ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..43265676f8b7ab19dc14f2c1475de1af67054c6a
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
@@ -0,0 +1,118 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+ 
+ENV CI_BUILD_PYTHON python
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..5c5b2f91634ff43fb2a047c66a856ac787858a47
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+ 
+ENV CI_BUILD_PYTHON python
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..8769e4e9cd619a2c31e37ee838e45ea050e42712
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
@@ -0,0 +1,151 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-dev-9-0 \
+        cuda-cudart-dev-9-0 \
+        cuda-cufft-dev-9-0 \
+        cuda-curand-dev-9-0 \
+        cuda-cusolver-dev-9-0 \
+        cuda-cusparse-dev-9-0 \
+        curl \
+        git \
+        libcudnn7=7.2.1.38-1+cuda9.0 \
+        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libnccl-dev=2.2.13-1+cuda9.0 \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        && \
+    rm -rf /var/lib/apt/lists/* && \
+    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
+        apt-get update && \
+        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
+        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
+
+# Link NCCL libray and header where the build script expects them.
+RUN mkdir /usr/local/cuda-9.0/lib &&  \
+    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
+    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
+
+# NCCL 2.x
+ENV TF_NCCL_VERSION=2
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..809cda679ea7e33b64e4b4180cfa1af2d05f8ff3
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
@@ -0,0 +1,134 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-dev-9-0 \
+        cuda-cudart-dev-9-0 \
+        cuda-cufft-dev-9-0 \
+        cuda-curand-dev-9-0 \
+        cuda-cusolver-dev-9-0 \
+        cuda-cusparse-dev-9-0 \
+        curl \
+        git \
+        libcudnn7=7.2.1.38-1+cuda9.0 \
+        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libnccl-dev=2.2.13-1+cuda9.0 \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        && \
+    rm -rf /var/lib/apt/lists/* && \
+    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
+        apt-get update && \
+        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
+        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
+
+# Link NCCL libray and header where the build script expects them.
+RUN mkdir /usr/local/cuda-9.0/lib &&  \
+    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
+    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
+
+# NCCL 2.x
+ENV TF_NCCL_VERSION=2
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..acfe4d8607d56b6192926eb50ef9a3d58a07efe2
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-9-0 \
+        cuda-cufft-9-0 \
+        cuda-curand-9-0 \
+        cuda-cusolver-9-0 \
+        cuda-cusparse-9-0 \
+        curl \
+        libcudnn7=7.2.1.38-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
+        apt-get update && \
+        apt-get install libnvinfer4=4.1.2-1+cuda9.0
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN ${PIP} install ${TF_PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f36a21eaf0cce02cf77db7c88358696c6f392cf4
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
@@ -0,0 +1,85 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-9-0 \
+        cuda-cufft-9-0 \
+        cuda-curand-9-0 \
+        cuda-cusolver-9-0 \
+        cuda-cusparse-9-0 \
+        curl \
+        libcudnn7=7.2.1.38-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
+        apt-get update && \
+        apt-get install libnvinfer4=4.1.2-1+cuda9.0
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN ${PIP} install ${TF_PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile
deleted file mode 100644
index 68c0e2f2bd1657269665f0eff72df52fe24d1a5c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for TensorFlow, with Jupyter included.
-#
-# Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF development
-# packages.
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
-
-ARG UBUNTU_VERSION=16.04
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        wget \
-        && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
-
-ARG USE_PYTHON_3_NOT_2=True
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-# Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
-
-RUN ${PIP} install jupyter
-
-RUN mkdir /notebooks && chmod a+rwx /notebooks
-RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
-EXPOSE 8888
-
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile
deleted file mode 100644
index 77be0dd287a31a6f0bd709671e1381bc975bad68..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for TensorFlow.
-#
-# Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF development
-# packages.
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
-
-ARG UBUNTU_VERSION=16.04
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        wget \
-        && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
-
-ARG USE_PYTHON_3_NOT_2=True
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-# Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile
deleted file mode 100644
index 5ff1fa917afd9ade2e0a848fe77d7044f304956f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow, with Jupyter included.
-#
-# NVIDIA with CUDA and CuDNN, no dev stuff
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow-gpu (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
-
-FROM nvidia/cuda:9.0-base-ubuntu16.04
-
-# Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-9-0 \
-        cuda-cufft-9-0 \
-        cuda-curand-9-0 \
-        cuda-cusolver-9-0 \
-        cuda-cusparse-9-0 \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0
-
-ARG USE_PYTHON_3_NOT_2=True
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
-
-ARG TF_PACKAGE=tensorflow-gpu
-RUN ${PIP} install ${TF_PACKAGE}
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
-
-RUN ${PIP} install jupyter
-
-RUN mkdir /notebooks && chmod a+rwx /notebooks
-RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
-EXPOSE 8888
-
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile
deleted file mode 100644
index 3df810b5fe67e9e4d5bf2d5aabc4a0f545d0e4e8..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-#
-# THIS IS A GENERATED DOCKERFILE.
-#
-# This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow.
-#
-# NVIDIA with CUDA and CuDNN, no dev stuff
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow-gpu (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
-
-FROM nvidia/cuda:9.0-base-ubuntu16.04
-
-# Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-9-0 \
-        cuda-cufft-9-0 \
-        cuda-curand-9-0 \
-        cuda-cusolver-9-0 \
-        cuda-cusparse-9-0 \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0
-
-ARG USE_PYTHON_3_NOT_2=True
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
-
-ARG TF_PACKAGE=tensorflow-gpu
-RUN ${PIP} install ${TF_PACKAGE}
-
-COPY bashrc /etc/bash.bashrc
-RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile
deleted file mode 100644
index b08d8bdd14b638b87ac8fbd57cf2b3e8c4564582..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile
+++ /dev/null
@@ -1,13 +0,0 @@
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    openjdk-8-jdk \
-    ${PYTHON}-dev \
-    swig
-
-# Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
diff --git a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
index 2c9b9f3f9a081e97c96cedf1bbdf0936a9961d46..c4ec6095c0cae43b9d5756cd4391ca3ddd329fbe 100644
--- a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
@@ -1,8 +1,16 @@
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile
deleted file mode 100644
index 45159f711fcbdd0e6bb7083169d2abb39ab8dea5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile
+++ /dev/null
@@ -1,49 +0,0 @@
-ARG UBUNTU_VERSION=16.04
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        wget \
-        && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
diff --git a/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile
deleted file mode 100644
index 6f346236a58c9acc88f93aa849ab92269e47a05d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile
+++ /dev/null
@@ -1,12 +0,0 @@
-ARG USE_PYTHON_3_NOT_2
-ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
-ARG PYTHON=python${_PY_SUFFIX}
-ARG PIP=pip${_PY_SUFFIX}
-
-RUN apt-get update && apt-get install -y \
-    ${PYTHON} \
-    ${PYTHON}-pip
-
-RUN ${PIP} install --upgrade \
-    pip \
-    setuptools
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
index 96e79547f0c67c232565019e0ae64d24d55d1516..76758bd147ef9d52b3db072bd0091190e132667c 100644
--- a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
@@ -1,2 +1,7 @@
-ARG TF_PACKAGE
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
diff --git a/tensorflow/examples/tutorials/monitors/__init__.py b/tensorflow/tools/dockerfiles/partials/test-import.partial.Dockerfile
similarity index 100%
rename from tensorflow/examples/tutorials/monitors/__init__.py
rename to tensorflow/tools/dockerfiles/partials/test-import.partial.Dockerfile
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile
deleted file mode 100644
index bc792722766e07d1af3d6944f14a8eb26f43dc1a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile
+++ /dev/null
@@ -1,24 +0,0 @@
-ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libhdf5-serial-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        python-dev \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile
deleted file mode 100644
index 0a50735bf83364446919254010f0acab0e26404c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile
+++ /dev/null
@@ -1,2 +0,0 @@
-ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..156bb019914554e650421fb23bcebc935658abdb
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
@@ -0,0 +1,27 @@
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+# Install bazel
+RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
+    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
+    apt-get update && \
+    apt-get install -y bazel
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/cpu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/cpu.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..d01b26e27f6ffb35affb95f8e40b7ce3b8e52d0a
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/cpu.partial.Dockerfile
@@ -0,0 +1 @@
+FROM ubuntu:${UBUNTU_VERSION} as base
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..a61dfbbe54eb163b25160490f3ee245c36d21ffe
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
@@ -0,0 +1,28 @@
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+ 
+ENV CI_BUILD_PYTHON python
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..95f9875012d2a552be4af6f59cb6a5c60d99dce5
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
@@ -0,0 +1,61 @@
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-dev-9-0 \
+        cuda-cudart-dev-9-0 \
+        cuda-cufft-dev-9-0 \
+        cuda-curand-dev-9-0 \
+        cuda-cusolver-dev-9-0 \
+        cuda-cusparse-dev-9-0 \
+        curl \
+        git \
+        libcudnn7=7.2.1.38-1+cuda9.0 \
+        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
+        libnccl2=2.2.13-1+cuda9.0 \
+        libnccl-dev=2.2.13-1+cuda9.0 \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        && \
+    rm -rf /var/lib/apt/lists/* && \
+    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+RUN apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
+        apt-get update && \
+        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
+        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
+
+# Link NCCL libray and header where the build script expects them.
+RUN mkdir /usr/local/cuda-9.0/lib &&  \
+    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
+    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
+
+# NCCL 2.x
+ENV TF_NCCL_VERSION=2
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
diff --git a/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
similarity index 78%
rename from tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile
rename to tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
index 1064390af3b5006a8e539ad2b006d692e51692ae..1dc8e43aaddc606efde2cbd84215f7ef7131e251 100644
--- a/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
@@ -1,6 +1,5 @@
-FROM nvidia/cuda:9.0-base-ubuntu16.04
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
 
-# Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         cuda-command-line-tools-9-0 \
@@ -9,6 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-curand-9-0 \
         cuda-cusolver-9-0 \
         cuda-cusparse-9-0 \
+        curl \
         libcudnn7=7.2.1.38-1+cuda9.0 \
         libnccl2=2.2.13-1+cuda9.0 \
         libfreetype6-dev \
@@ -16,6 +16,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
+        rsync \
         software-properties-common \
         unzip \
         && \
@@ -26,3 +27,6 @@ RUN apt-get update && \
         apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
         apt-get update && \
         apt-get install libnvinfer4=4.1.2-1+cuda9.0
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6af473195380801bded2e6849e97432caf07816b
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile
@@ -0,0 +1,18 @@
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/test-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/test-devel.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/version.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/version.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6ecd2b8b1acd59e50c172c3fc9c5574626ed5608
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/version.partial.Dockerfile
@@ -0,0 +1 @@
+ARG UBUNTU_VERSION=16.04
diff --git a/tensorflow/tools/dockerfiles/readme-for-jupyter.md b/tensorflow/tools/dockerfiles/readme-for-jupyter.md
new file mode 100644
index 0000000000000000000000000000000000000000..f104a7533b884bea06c46e9670d07d92bca87ea1
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/readme-for-jupyter.md
@@ -0,0 +1,3 @@
+Want more tutorials like these?
+
+Check out tensorflow.org/tutorials!
diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml
index 28bf9a55da123a0a45cd4b0e54971f14c355d794..19d96e7a3df4468ff82f2029a1945a02b1e58932 100644
--- a/tensorflow/tools/dockerfiles/spec.yml
+++ b/tensorflow/tools/dockerfiles/spec.yml
@@ -1,195 +1,148 @@
-# ======
-# HEADER
-# ======
-#
-# This is commented-out and prepended to each generated Dockerfile.
 header: |
-    Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-    ============================================================================
-
-    THIS IS A GENERATED DOCKERFILE.
-
-    This file was assembled from multiple pieces, whose use is documented
-    below. Please refer to the the TensorFlow dockerfiles documentation for
-    more information. Build args are documented as their default value.
-
-# ========
-# PARTIALS
-# ========
+    # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ============================================================================
+    #
+    # THIS IS A GENERATED DOCKERFILE.
+    #
+    # This file was assembled from multiple pieces, whose use is documented
+    # throughout. Please refer to the TensorFlow dockerfiles documentation
+    # for more information.
+
+# A combinatorial explosion of Docker images and Dockerfiles.
+# Each "release" defines all of the ways to combine related but separate chunks
+# of functionality ("slices") by listing all of the "slice sets" to use when
+# building.
 #
-# Represent and document pieces of a Dockerfile. Spec:
-# 
-# name: the name of the partial, is referenced from the images section
-#   desc: A description, inserted later into the Dockerfile
-#   file: Alternative file prefix, e.g. file.partial.Dockerfile. The default is
-#         the name of the partial.
-#   args: A dict of ARGs in the Dockerfile; each entry has the format
-#      ARG_NAME: VALUE where VALUE is one of:
-#         - a dict:
-#             desc: Documentation for the arg
-#             default: Default value for the arg; is written to the Dockerfile
-#             options: List of strings, part of documentation
-#         - a concrete value: the same as a dictionary with default: [value].
-
-partials:
-    ubuntu:
-        desc: Start from Ubuntu (no GPU support)
-        args:
-            UBUNTU_VERSION: 16.04
-
-    ubuntu-devel:
-        desc: Start from Ubuntu, with TF development packages (no GPU support)
-        args:
-            UBUNTU_VERSION: 16.04
-
-    bazel:
-        desc: Install the latest version of Bazel and Python development tools.
-
-    nvidia:
-        desc: NVIDIA with CUDA and CuDNN, no dev stuff
-        args:
-            UBUNTU_VERSION: 16.04
-
-    nvidia-devel:
-        desc: >
-            Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF
-            development packages.
-        args:
-            UBUNTU_VERSION: 16.04
+# For example, a release that uses {nightly}{py} would create 4 Dockerfiles
+# (which could become images or concrete Dockerfiles), because the "nightly"
+# and "py" slice sets both have two entries:
+#
+#   - nightly (no -py2 because the Python 2 slice set has add_to_name: ""
+#   - nightly-py3
+#   - nightly-gpu (similar)
+#   - nightly-gpu-py3
+#
+# Releases are all treated differently by TensorFlow's CI systems.
+releases:
+    # Built Nightly and pushed to tensorflow/tensorflow
+    nightly:
+        tag_specs:
+            - "{nightly}{py}{jupyter}"
+            - "{ubuntu-devel}{py}"
+
+    # Built per-release and pushed to tensorflow/tensorflow
+    # --arg _TAG_PREFIX=<val> should be set to "1.11" (for example) or "latest".
+    versioned:
+        tag_specs:
+            - "{_TAG_PREFIX}{ubuntu}{py}{jupyter}"
+
+    # Dockerfiles stored in the TF repo; not pushed anywhere
+    dockerfiles:
+        is_dockerfiles: true
+        upload_images: false
+        tag_specs:
+            - "{ubuntu}{jupyter}"
+            - "{ubuntu-devel}{jupyter}"
+
+slice_sets:
+
+    py:
+        - add_to_name: ""
+          args:
+              - USE_PYTHON_3_NOT_2=
+        - add_to_name: "-py3"
+          args:
+              - USE_PYTHON_3_NOT_2=1
 
-    python:
-        desc: Python is required for TensorFlow and other libraries.
-        args:
-            USE_PYTHON_3_NOT_2:
-                default: true
-                desc: Install python 3 over Python 2
-                
-    tensorflow:
-        desc: Install the TensorFlow Python package.
-        args:
-            TF_PACKAGE:
-                default: tensorflow
-                options:
-                    - tensorflow
-                    - tensorflow-gpu
-                    - tf-nightly
-                    - tf-nightly-gpu
-                desc: The specific TensorFlow Python package to install
-    shell:
-        desc: Configure TensorFlow's shell prompt and login tools.
     jupyter:
-        desc: Launch Jupyter on execution instead of a bash prompt.
-
-# ======
-# IMAGES
-# ======
-# 
-# Represent Dockerfiles. Spec:
-# 
-# name: the name of the image, possibly referenced by other images
-#   desc: A description, inserted later into the Dockerfile
-#   create-dockerfile: Create a dockerfile based on this. Useful for creating
-#      extensible base images that don't need a file. Default is true.
-#   partials: List of VALUEs, where a VALUE is either:
-#      - the name of a partial, which inserts that partial into this image
-#      - image: [name of another image], which inserts the partials from that
-#        image into this image
-#   arg-defaults: List of VALUEs, where a VALUE is either:
-#      - ARG_NAME: VALUE, which sets the ARG_NAME to VALUE wherever it appears
-#        in this image's partials
-#      - [name of another image], which loads the default args from that image
-images:
-
-    nodev:
-        create-dockerfile: false
-        partials:
-            - python
-            - tensorflow
-            - shell
-
-    dev:
-        create-dockerfile: false
-        partials:
-            - python
-            - bazel
-            - shell
-
-    cpu:
-      desc: Ubuntu-based, CPU-only environment for using TensorFlow
-      partials:
-        - ubuntu
-        - image: nodev
-
-    cpu-devel:
-      desc: >
-          Ubuntu-based, CPU-only environment for developing changes for
-          TensorFlow.
-      partials:
-        - ubuntu-devel
-        - image: dev
+        - add_to_name: ""
+        - add_to_name: "-jupyter"
+          partials:
+              - jupyter
 
-    nvidia:
-      desc: Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow.
-      arg-defaults: 
-        - TF_PACKAGE: tensorflow-gpu
-      partials:
-        - nvidia
-        - image: nodev
-
-    nvidia-devel:
-      desc: >
-          Ubuntu-based, Nvidia-GPU-enabled environment for developing changes
-          for TensorFlow.
-      arg-defaults: 
-        - TF_PACKAGE: tensorflow-gpu
-      partials:
-        - nvidia-devel
-        - image: dev
-
-    cpu-jupyter:
-      desc: >
-          Ubuntu-based, CPU-only environment for using TensorFlow, with Jupyter
-          included.
-      partials:
-        - image: cpu
-        - jupyter
-
-    cpu-devel-jupyter:
-      desc: >
-         Ubuntu-based, CPU-only environment for developing changes for
-         TensorFlow, with Jupyter included.
-      partials:
-        - image: cpu-devel
-        - jupyter
-
-    nvidia-jupyter:
-      desc: >
-        Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow, with
-        Jupyter included.
-      arg-defaults: 
-        - nvidia
-      partials:
-        - image: nvidia
-        - jupyter
+    ubuntu:
+        - add_to_name: ""
+          dockerfile_exclusive_name: "cpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/cpu
+              - ubuntu/python
+              - tensorflow
+              - shell
+        - add_to_name: "-gpu"
+          dockerfile_exclusive_name: "gpu"
+          args:
+              - TF_PACKAGE=tensorflow-gpu
+          partials:
+              - ubuntu/version
+              - ubuntu/nvidia
+              - ubuntu/python
+              - tensorflow
+              - shell
+          tests:
+              - import-gpu.sh
+          test_runtime: nvidia
 
-    nvidia-devel-jupyter:
-      desc: >
-        Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for
-        TensorFlow, with Jupyter included.
-      arg-defaults: 
-        - nvidia-devel
-      partials:
-        - image: nvidia-devel
-        - jupyter
+    ubuntu-devel:
+        - add_to_name: "devel"
+          dockerfile_exclusive_name: "devel-cpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-cpu
+              - ubuntu/python
+              - ubuntu/bazel
+              - shell
+          tests:
+              - build-cpu.sh
+          args:
+              - CHECKOUT_TF_SRC=1
+        - add_to_name: "devel-gpu"
+          dockerfile_exclusive_name: "devel-gpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-nvidia
+              - ubuntu/python
+              - ubuntu/bazel
+              - shell
+          tests:
+              - build-gpu.sh
+          test_runtime: nvidia
+          args:
+              - CHECKOUT_TF_SRC=1
+
+    nightly:
+        - add_to_name: "nightly"
+          partials:
+              - ubuntu/version
+              - ubuntu/cpu
+              - ubuntu/python
+              - tensorflow
+              - shell
+          args:
+              - TF_PACKAGE=tf-nightly
+          tests:
+              - import.sh
+        - add_to_name: "nightly-gpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/nvidia
+              - ubuntu/python
+              - tensorflow
+              - shell
+          test_runtime: nvidia
+          tests:
+              - import-gpu.sh
+          args:
+              - TF_PACKAGE=tf-nightly-gpu
diff --git a/tensorflow/tools/dockerfiles/tests/build-cpu.sh b/tensorflow/tools/dockerfiles/tests/build-cpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bcdc4c2139c83e65c15998d3dd6be2f29e27bff3
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/build-cpu.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Download and build TensorFlow.
+set -euxo pipefail
+git clone --branch=master --depth=1 https://github.com/tensorflow/tensorflow.git /tensorflow
+cd /tensorflow
+
+ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# For optimized builds appropriate for the hardware platform of your choosing, uncomment below...
+# For ivy-bridge or sandy-bridge
+# --copt=-march="ivybridge" \
+# for haswell, broadwell, or skylake
+# --copt=-march="haswell" \
+tensorflow/tools/ci_build/builds/configured CPU \
+  bazel build -c opt --copt=-mavx --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+      tensorflow/tools/pip_package:build_pip_package && \
+  bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
+  pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
+  rm -rf /tmp/pip && \
+  rm -rf /root/.cache
+
diff --git a/tensorflow/tools/dockerfiles/tests/build-gpu.sh b/tensorflow/tools/dockerfiles/tests/build-gpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..76b25d5a7419b9a07a6799f14fa5175fb6fa36d5
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/build-gpu.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Download and build TensorFlow.
+set -euxo pipefail
+git clone --branch=master --depth=1 https://github.com/tensorflow/tensorflow.git /tensorflow
+cd /tensorflow
+
+ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
+tensorflow/tools/ci_build/builds/configured GPU \
+bazel build -c opt --copt=-mavx --config=cuda \
+    --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+    tensorflow/tools/pip_package:build_pip_package && \
+rm /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
+bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
+pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
+rm -rf /tmp/pip && \
+rm -rf /root/.cache
diff --git a/tensorflow/tools/dockerfiles/tests/import-gpu.sh b/tensorflow/tools/dockerfiles/tests/import-gpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6559210dcbfbb5fe3c76c369c5ae211920f46d15
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/import-gpu.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+python -c 'import tensorflow as tf; tf.test.is_gpu_available() or exit(1)'
diff --git a/tensorflow/tools/dockerfiles/tests/import.sh b/tensorflow/tools/dockerfiles/tests/import.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b73bd86a8529e2b7634f0b027196b978f8245da0
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/import.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+set -euxo pipefail
+python -c 'import tensorflow as tf'
diff --git a/tensorflow/tools/dockerfiles/tools.Dockerfile b/tensorflow/tools/dockerfiles/tools.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e8929295a5ee397acbe46ebf96894174ca01fca2
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tools.Dockerfile
@@ -0,0 +1,31 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# TensorFlow Dockerfile Development Container
+#
+# You can use this image to quickly develop changes to the Dockerfile assembler
+# or set of TF Docker partials. See README.md for usage instructions.
+FROM debian:stretch
+LABEL maintainer="Austin Anderson <angerson@google.com>"
+
+RUN apt-get update && apt-get install -y python3 python3-pip bash curl
+RUN curl -sSL https://get.docker.com/ | sh
+RUN pip3 install --upgrade pip setuptools pyyaml absl-py cerberus docker
+
+WORKDIR /tf
+VOLUME ["/tf"]
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 1a53f241773a199a6fa1f5388d2c0a4dcf463503..b072853a4ec298ce5c15afc1307a966ecefb743f 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -142,17 +142,30 @@ py_test(
     ],
 )
 
-py_binary(
-    name = "generate_1_0",
-    srcs = ["generate_1_0.py"],
+py_test(
+    name = "generate2_test",
+    srcs = ["generate2_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        # No reason to run sanitizers or fastbuild for this test.
+        "noasan",
+        "nomsan",
+        "notsan",
+        "optonly",
+    ],
     deps = [
-        ":generate_lib",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python/debug:debug_py",
+        ":generate2",
     ],
 )
 
+py_binary(
+    name = "generate2",
+    srcs = ["generate2.py"],
+    srcs_version = "PY2AND3",
+    deps = ["//tensorflow:tensorflow_py"],
+)
+
 py_library(
     name = "py_guide_parser",
     srcs = ["py_guide_parser.py"],
diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba909d26defffad2d7dbaffa4463695685ae50c
--- /dev/null
+++ b/tensorflow/tools/docs/generate2.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""A tool to generate api_docs for TensorFlow2.
+
+```
+python generate2.py --output_dir=/tmp/out
+```
+
+Requires a local installation of:
+  https://github.com/tensorflow/docs/tree/master/tools
+  tf-nightly-2.0-preview
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+
+from absl import app
+from absl import flags
+
+import tensorflow as tf
+
+from tensorflow_docs.api_generator import generate_lib
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    "code_url_prefix",
+    "/code/stable/tensorflow/",
+    "A url to prepend to code paths when creating links to defining code")
+
+flags.DEFINE_string(
+    "output_dir", "/tmp/out",
+    "A directory, where the docs will be output to.")
+
+flags.DEFINE_bool("search_hints", True,
+                  "Include meta-data search hints at the top of each file.")
+
+
+def build_docs(output_dir, code_url_prefix, search_hints=True):
+  """Build api docs for tensorflow v2.
+
+  Args:
+    output_dir: A string path, where to put the files.
+    code_url_prefix: prefix for "Defined in" links.
+    search_hints: Bool. Include meta-data search hints at the top of each file.
+  """
+  base_dir = path.dirname(tf.__file__)
+  doc_generator = generate_lib.DocGenerator(
+      root_title="TensorFlow 2.0 Preview",
+      py_modules=[("tf", tf)],
+      base_dir=base_dir,
+      search_hints=search_hints,
+      code_url_prefix=code_url_prefix,
+      site_path="api_docs/")
+
+  doc_generator.build(output_dir)
+
+
+def main(argv):
+  del argv
+  build_docs(output_dir=FLAGS.output_dir,
+             code_url_prefix=FLAGS.code_url_prefix,
+             search_hints=FLAGS.search_hints)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/tools/docs/generate2_test.py b/tensorflow/tools/docs/generate2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..774d45c536ba158d6d4cf4f4ac6043e76b88912f
--- /dev/null
+++ b/tensorflow/tools/docs/generate2_test.py
@@ -0,0 +1,39 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.tools.docs.generate2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+from tensorflow.python.platform import googletest
+from tensorflow.tools.docs import generate2
+
+
+class Generate2Test(googletest.TestCase):
+
+  def test_end_to_end(self):
+    output_dir = os.path.join(googletest.GetTempDir(), 'output')
+    if os.path.exists(output_dir):
+      shutil.rmtree(output_dir)
+    os.makedirs(output_dir)
+    generate2.build_docs(output_dir=output_dir, code_url_prefix='')
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/tools/docs/generate_1_0.py b/tensorflow/tools/docs/generate_1_0.py
deleted file mode 100644
index f4384e0ced77718c80d4d146a2d72072588a0541..0000000000000000000000000000000000000000
--- a/tensorflow/tools/docs/generate_1_0.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Generate docs for the TensorFlow Python API."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-
-import tensorflow as tf
-
-from tensorflow.python import debug as tf_debug
-from tensorflow.python.util import tf_inspect
-from tensorflow.tools.docs import generate_lib
-
-if __name__ == '__main__':
-  doc_generator = generate_lib.DocGenerator()
-  doc_generator.add_output_dir_argument()
-  doc_generator.add_src_dir_argument()
-
-  # This doc generator works on the TensorFlow codebase. Since this script lives
-  # at tensorflow/tools/docs, and all code is defined somewhere inside
-  # tensorflow/, we can compute the base directory (two levels up), which is
-  # valid unless we're trying to apply this to a different code base, or are
-  # moving the script around.
-  script_dir = os.path.dirname(tf_inspect.getfile(tf_inspect.currentframe()))
-  default_base_dir = os.path.join(script_dir, '..', '..')
-  doc_generator.add_base_dir_argument(default_base_dir)
-
-  flags = doc_generator.parse_known_args()
-
-  # tf_debug is not imported with tf, it's a separate module altogether
-  doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
-
-  doc_generator.set_do_not_descend_map({
-      'tf': ['cli', 'lib', 'wrappers'],
-      'tf.contrib': [
-          'compiler',
-          'factorization',
-          'grid_rnn',
-          'labeled_tensor',
-          'quantization',
-          'session_bundle',
-          'slim',
-          'solvers',
-          'specs',
-          'tensor_forest',
-          'tensorboard',
-          'testing',
-          'training',
-          'tfprof',
-      ],
-      'tf.contrib.bayesflow': [
-          'entropy', 'monte_carlo', 'special_math',
-          'stochastic_gradient_estimators', 'stochastic_graph',
-          'stochastic_tensor', 'stochastic_variables', 'variational_inference'
-      ],
-      'tf.contrib.distributions': ['bijector'],
-      'tf.contrib.ffmpeg': ['ffmpeg_ops'],
-      'tf.contrib.graph_editor': [
-          'edit', 'match', 'reroute', 'subgraph', 'transform', 'select', 'util'
-      ],
-      'tf.contrib.layers': ['feature_column', 'summaries'],
-      'tf.contrib.learn': [
-          'datasets',
-          'head',
-          'graph_actions',
-          'io',
-          'models',
-          'monitors',
-          'ops',
-          'preprocessing',
-          'utils',
-      ],
-      'tf.contrib.util': ['loader'],
-  })
-
-  sys.exit(doc_generator.build(flags))
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 77a3ca2052e113cb8af9f5369996790a8e5db72f..0e1a682d582f213fcc1be531bff9da741053344d 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -230,6 +230,7 @@ def _get_default_private_map():
       'tf.contrib.autograph': ['utils', 'operators'],
       'tf.test': ['mock'],
       'tf.compat': ['v1', 'v2'],
+      'tf.contrib.estimator': ['python'],
   }
 
 
diff --git a/tensorflow/tools/git/BUILD b/tensorflow/tools/git/BUILD
index daa17fbd501651540c4c90c6354eb0a5b2f2b7aa..34a516794850547a108d67338ac12bbbddce8164 100644
--- a/tensorflow/tools/git/BUILD
+++ b/tensorflow/tools/git/BUILD
@@ -6,6 +6,8 @@ package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(
-    ["gen_git_source.py"],
+py_binary(
+    name = "gen_git_source",
+    srcs = ["gen_git_source.py"],
+    srcs_version = "PY2AND3",
 )
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index cc2288a7fa9202efcd077e54b941cc278b25993c..8e7cd9b10415740a554445edbb634706dd97857c 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index 1ad1895269022331bfd8156721778f4d68a10ee7..eb1ed1f2ca859df42809084c1ea47a6f3b21012e 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -97,6 +97,7 @@ cc_library(
         "fold_old_batch_norms.cc",
         "freeze_requantization_ranges.cc",
         "fuse_convolutions.cc",
+        "fuse_quantized_convolution.cc",
         "insert_logging.cc",
         "obfuscate_names.cc",
         "quantize_nodes.cc",
diff --git a/tensorflow/tools/graph_transforms/backports.cc b/tensorflow/tools/graph_transforms/backports.cc
index 5c153e8cefc900728c78340dd43a56737d887b21..041e7eedfb7a38f0eeb7ec17b44c92010041dc29 100644
--- a/tensorflow/tools/graph_transforms/backports.cc
+++ b/tensorflow/tools/graph_transforms/backports.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/flatten_atrous.cc b/tensorflow/tools/graph_transforms/flatten_atrous.cc
index a6f7cb0ed8b45dc537b6fe8c7b9d7e09685feef9..c80b28fbbca7e3d29f5abdef30a130934f17c9c0 100644
--- a/tensorflow/tools/graph_transforms/flatten_atrous.cc
+++ b/tensorflow/tools/graph_transforms/flatten_atrous.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fold_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
index 975b17380f6ca7fbd94783c6226f54c89e730cde..16a0f7d58df66be06224d58de623ee7e2dc41880 100644
--- a/tensorflow/tools/graph_transforms/fold_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 6df2718e61074daab7bdfd75ca923035ffe5fba4..dcc36b1a8557cf30ac030302fcb7545da55c7886 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index 156636ab8215d9abdc9e0ed461df550f1c7ed09c..fd546f812c0dafc5d2e71c94710c3c3f5b75250e 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fuse_convolutions.cc b/tensorflow/tools/graph_transforms/fuse_convolutions.cc
index df6e9e6dc2864872fa8f30741735a7d5985a3104..7754dde9c68753ea648ce31e0f87329826e10828 100644
--- a/tensorflow/tools/graph_transforms/fuse_convolutions.cc
+++ b/tensorflow/tools/graph_transforms/fuse_convolutions.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc b/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5aa2dd4f99b89f0ea03fe69db854c55f3f2f3c38
--- /dev/null
+++ b/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc
@@ -0,0 +1,225 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifdef INTEL_MKL
+#include <algorithm>
+
+#include "tensorflow/core/common_runtime/constant_folding.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/tools/graph_transforms/fold_constants_lib.h"
+#include "tensorflow/tools/graph_transforms/transform_utils.h"
+
+namespace tensorflow {
+namespace graph_transforms {
+
+Status FuseQuantizedConvolutionAndRequantize(
+    const GraphDef& input_graph_def, const TransformFuncContext& context,
+    GraphDef* output_graph_def) {
+  std::map<string, const NodeDef*> node_map;
+  MapNamesToNodes(input_graph_def, &node_map);
+  GraphDef replaced_graph_def;
+  TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
+      input_graph_def,  // clang-format off
+
+      {"Requantize",
+        {
+          {"QuantizedConv2D|QuantizedConv2DWithBias|QuantizedConv2DWithRelu|"
+            "QuantizedConv2DWithBiasAndRelu|QuantizedConv2DWithBiasSumAndRelu"},
+          {"QuantizedConv2D|QuantizedConv2DWithBias|QuantizedConv2DWithRelu|"
+           "QuantizedConv2DWithBiasAndRelu|QuantizedConv2DWithBiasSumAndRelu"},
+          {"QuantizedConv2D|QuantizedConv2DWithBias|QuantizedConv2DWithRelu|"
+           "QuantizedConv2DWithBiasAndRelu|QuantizedConv2DWithBiasSumAndRelu"},
+          {"Const"},
+          {"Const"}
+        }
+      },  // clang-format on */
+      [&node_map](const NodeMatch& match, const std::set<string>& input_nodes,
+         const std::set<string>& output_nodes,
+         std::vector<NodeDef>* new_nodes) {
+        // TODO(mdfaijul/sheng): Current implementation assumed all
+        // requantization cases have bias. Index of inputs need to be updated
+        // for non-bias cases.
+
+        // Find all the nodes we expect in the subgraph.
+        const NodeDef& requantize_node = match.node;
+        CHECK_EQ("Requantize", requantize_node.op());
+        const NodeDef& quantized_conv2D_node = match.inputs[0].node;
+        const NodeDef& const_requantize_range_min_node = match.inputs[3].node;
+        CHECK_EQ("Const", const_requantize_range_min_node.op());
+        const NodeDef& const_requantize_range_max_node = match.inputs[4].node;
+        CHECK_EQ("Const", const_requantize_range_max_node.op());
+
+        string quantized_conv2D_op_name = quantized_conv2D_node.op();
+        // Set up the new fused version of the convolution op.
+        NodeDef fused_conv;
+        fused_conv.set_op(quantized_conv2D_op_name + "AndRequantize");
+        fused_conv.set_name(match.node.name());
+        int n_input = quantized_conv2D_node.input_size();
+        if (quantized_conv2D_op_name.compare(
+                "QuantizedConv2DWithBiasSumAndRelu") == 0)
+          n_input -= 1;  // -1 since summand is moved after frozen min-max
+
+        for (int i=0; i < n_input; i++)
+          AddNodeInput(quantized_conv2D_node.input(i), &fused_conv);
+
+        AddNodeInput(const_requantize_range_min_node.name(), &fused_conv);
+        AddNodeInput(const_requantize_range_max_node.name(), &fused_conv);
+
+        // Add additional inputs to
+        // QuantizedConv2DWithBiasSumAndReluAndRequantize
+        if (quantized_conv2D_op_name.compare(
+              "QuantizedConv2DWithBiasSumAndRelu") == 0) {
+          const NodeDef *in_requantize = node_map[node_map[
+              quantized_conv2D_node.input(n_input)]->input(0)];
+          string summand(in_requantize->name());
+          string min_summand(in_requantize->name() + ":1");
+          string max_summand(in_requantize->name() + ":2");
+          AddNodeInput(summand, &fused_conv);
+          AddNodeInput(min_summand, &fused_conv);
+          AddNodeInput(max_summand, &fused_conv);
+
+          // Signed version QuantizedConv2DWithBiasSumAndReluAndRequantize
+          // if Relu does not follow the convolution operation
+          std::vector<string> signed_ops = {
+              "QuantizedConv2DWithBias",
+              "QuantizedConv2D"
+              };
+          bool is_signed_summand =
+              std::find(signed_ops.begin(), signed_ops.end(),
+              node_map[in_requantize->input(0)]->op()) != signed_ops.end();
+          if (is_signed_summand) {
+            fused_conv.set_op(
+                "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize");
+            SetNodeAttr("Tsummand", DT_QINT8, &fused_conv);
+          } else {
+            SetNodeAttr("Tsummand", DT_QUINT8, &fused_conv);
+          }
+        }
+        CopyNodeAttr(quantized_conv2D_node, "Tinput", "Tinput", &fused_conv);
+        CopyNodeAttr(quantized_conv2D_node, "Tfilter", "Tfilter", &fused_conv);
+        CopyNodeAttr(quantized_conv2D_node, "strides", "strides", &fused_conv);
+        CopyNodeAttr(quantized_conv2D_node, "padding", "padding", &fused_conv);
+
+        // Copy dilation attribute if exsit in the orginal node
+        if (HasNodeAttr(quantized_conv2D_node, "dilations"))
+          CopyNodeAttr(quantized_conv2D_node, "dilations",
+                       "dilations", &fused_conv);
+        if (quantized_conv2D_op_name.compare("QuantizedConv2D") == 0 ||
+           quantized_conv2D_op_name.compare("QuantizedConv2DWithBias") == 0)
+          SetNodeAttr("out_type", DT_QINT8, &fused_conv);
+        else
+          SetNodeAttr("out_type", DT_QUINT8, &fused_conv);
+        new_nodes->push_back(fused_conv);
+        new_nodes->push_back(const_requantize_range_min_node);
+        new_nodes->push_back(const_requantize_range_max_node);
+
+        return Status::OK();
+      },
+      {}, &replaced_graph_def));
+
+  // Convert bias float -> int32 on replaced_graph_def
+  std::vector<string> fused_requantized_bias_ops = {
+      "QuantizedConv2DWithBiasAndRequantize",
+      "QuantizedConv2DWithBiasAndReluAndRequantize",
+      "QuantizedConv2DWithBiasSumAndReluAndRequantize",
+      "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+      };
+  node_map.clear();
+  MapNamesToNodes(replaced_graph_def, &node_map);
+  for (auto& node_pair : node_map) {
+    const NodeDef *node = node_pair.second;
+    bool is_fused_requantized_conv_op =
+        std::find(fused_requantized_bias_ops.begin(),
+                  fused_requantized_bias_ops.end(),
+                  node->op()) != fused_requantized_bias_ops.end();
+    if (is_fused_requantized_conv_op) {
+      // If the op is not fed by Another Requantize op,
+      // then we coonvert bias as Int32
+      string input_op = node_map[NodeNameFromInput(node->input(0))]->op();
+      if (str_util::StartsWith(input_op, "QuantizedConv2D") &&
+          str_util::EndsWith(input_op, "AndRequantize")) {
+        NodeDef *bias_node = const_cast<NodeDef*>(node_map[NodeNameFromInput(
+            node->input(2))]);
+        const NodeDef *min_input_node = node_map[NodeNameFromInput(
+            node_map[node->input(0)]->input(7))];
+        const NodeDef *max_input_node = node_map[NodeNameFromInput(
+            node_map[node->input(0)]->input(8))];
+        const NodeDef *min_filter_node = node_map[NodeNameFromInput(
+            node->input(5))];
+        const NodeDef *max_filter_node = node_map[NodeNameFromInput(
+            node->input(6))];
+        const float min_input =
+            GetNodeTensorAttr(*min_input_node, "value").flat<float>()(0);
+        const float max_input =
+            GetNodeTensorAttr(*max_input_node, "value").flat<float>()(0);
+        const float min_filter =
+            GetNodeTensorAttr(*min_filter_node, "value").flat<float>()(0);
+        const float max_filter =
+            GetNodeTensorAttr(*max_filter_node, "value").flat<float>()(0);
+
+        TensorProto float_tensor_proto = bias_node->attr().at("value").tensor();
+        Tensor float_tensor;
+        if(!float_tensor.FromProto(float_tensor_proto)) {
+          TF_RETURN_IF_ERROR(::tensorflow::errors::InvalidArgument(
+              "TensorProto object is not valid."));
+        }
+        if (float_tensor.dtype() != DT_FLOAT) {
+          TF_RETURN_IF_ERROR(::tensorflow::errors::Unimplemented(
+              "Expected float tensor."));
+        }
+        float *p_bias_float = float_tensor.flat<float>().data();
+
+        Tensor int32_tensor = Tensor(DT_QINT32, float_tensor.shape());
+        qint32 *p_bias_int32 = int32_tensor.flat<qint32>().data();
+
+        float bias_scale = 255.0 * 127.0 /
+            (std::max(std::abs(max_input), std::abs(min_input)) *
+            std::max(std::abs(max_filter), std::abs(min_filter)));
+        int64 nelems = float_tensor.NumElements();
+        for (int64 n = 0; n < nelems; n++)
+          p_bias_int32[n] = (int32_t) (p_bias_float[n] * bias_scale);
+
+        bias_node->clear_attr();
+        AttrValue attr_type;
+        attr_type.set_type(int32_tensor.dtype());
+        bias_node->mutable_attr()->insert({"dtype", attr_type});
+
+        AttrValue attr_tensor;
+        TensorProto* t = attr_tensor.mutable_tensor();
+        int32_tensor.AsProtoTensorContent(t);
+        bias_node->mutable_attr()->insert({"value", attr_tensor});
+        SetNodeAttr("Tbias", DT_QINT32, const_cast<NodeDef*>(node));
+      } else {
+        SetNodeAttr("Tbias", DT_FLOAT, const_cast<NodeDef*>(node));
+      }
+    }
+  }
+  *output_graph_def = replaced_graph_def;
+  return Status::OK();
+}
+
+REGISTER_GRAPH_TRANSFORM("fuse_quantized_conv_and_requantize",
+                         FuseQuantizedConvolutionAndRequantize);
+
+}  // namespace graph_transforms
+}  // namespace tensorflow
+#endif // INTEL_MKL
diff --git a/tensorflow/tools/graph_transforms/insert_logging.cc b/tensorflow/tools/graph_transforms/insert_logging.cc
index 377665448c244aeace78f231ba0c263613afd9a0..ccc48540eb9731514ecbff41de86df956ff91a3b 100644
--- a/tensorflow/tools/graph_transforms/insert_logging.cc
+++ b/tensorflow/tools/graph_transforms/insert_logging.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/obfuscate_names.cc b/tensorflow/tools/graph_transforms/obfuscate_names.cc
index c470b51b96096a36eacdc67a74431ec02e0515d0..ee8ca3d097d71fef91d0ee50057ff6d215891596 100644
--- a/tensorflow/tools/graph_transforms/obfuscate_names.cc
+++ b/tensorflow/tools/graph_transforms/obfuscate_names.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index a022f5792676c62c52fd1197b0d8c436f7161a47..b139dad2ddd13ade70a4563a50b0db2db298ef36 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/quantize_weights.cc b/tensorflow/tools/graph_transforms/quantize_weights.cc
index cccae8a992a64b0f49798eda71513a2fe62ad656..a1a6e27171ee5a48dec91d64a3b15f6caa88dbf8 100644
--- a/tensorflow/tools/graph_transforms/quantize_weights.cc
+++ b/tensorflow/tools/graph_transforms/quantize_weights.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/remove_attribute.cc b/tensorflow/tools/graph_transforms/remove_attribute.cc
index b1a04c0f283bf6bc03da702447694558c5b98538..0a76c2b2052a2c26ee66691b361fff2be70bbf30 100644
--- a/tensorflow/tools/graph_transforms/remove_attribute.cc
+++ b/tensorflow/tools/graph_transforms/remove_attribute.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/remove_device.cc b/tensorflow/tools/graph_transforms/remove_device.cc
index 975fa3706335dd38e4f0992ff4c155addfc5e6a9..fdd43168a117b89884187e6b7a29e5f44f14fd33 100644
--- a/tensorflow/tools/graph_transforms/remove_device.cc
+++ b/tensorflow/tools/graph_transforms/remove_device.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/remove_nodes.cc b/tensorflow/tools/graph_transforms/remove_nodes.cc
index 05f036a86a09b2a6a94e9c1a1220803eabc64da5..aa0288689d9e093a39e8aa6b9156bac19ef40491 100644
--- a/tensorflow/tools/graph_transforms/remove_nodes.cc
+++ b/tensorflow/tools/graph_transforms/remove_nodes.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/rename_attribute.cc b/tensorflow/tools/graph_transforms/rename_attribute.cc
index bd066aab5b9ab69a38e313c0b0437457b3a2bb52..62897d43a8ca774418c7b45c1f886cd8cd7fd850 100644
--- a/tensorflow/tools/graph_transforms/rename_attribute.cc
+++ b/tensorflow/tools/graph_transforms/rename_attribute.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/rename_op.cc b/tensorflow/tools/graph_transforms/rename_op.cc
index e1e13c1be43a531355e5df4530183bd55836fe4c..9deee8bbffbbda41c1e59480c5e642d4c6ce1de9 100644
--- a/tensorflow/tools/graph_transforms/rename_op.cc
+++ b/tensorflow/tools/graph_transforms/rename_op.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/round_weights.cc b/tensorflow/tools/graph_transforms/round_weights.cc
index 72927e439b7f4177a8db035d022ba450a924ad98..3a145ac1f6b0ef238383f4eb75dd5de023503c47 100644
--- a/tensorflow/tools/graph_transforms/round_weights.cc
+++ b/tensorflow/tools/graph_transforms/round_weights.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/sort_by_execution_order.cc b/tensorflow/tools/graph_transforms/sort_by_execution_order.cc
index 43152d20fcc1aa477983c8d792dcab2e74664e73..548f5ba4820a82718676d995cbd7a09332051bf4 100644
--- a/tensorflow/tools/graph_transforms/sort_by_execution_order.cc
+++ b/tensorflow/tools/graph_transforms/sort_by_execution_order.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index cc82100148117c7846ba5781e1a97e172ad7f03c..bed51f89821032862ec3d24077cb51d9c676be94 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
diff --git a/tensorflow/tools/graph_transforms/strip_unused_nodes.cc b/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
index ae9d0aa20999c86fe2ea8902204604807f0f298c..d466f21c17ddfec9c0b0181f844b1b608f95246a 100644
--- a/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
+++ b/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 85514b8629d61b7d8a463daee5522ec5dc9a95b9..1186189844aa887ba011b532df3a73d89ffe52b8 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -112,6 +112,7 @@ pkg_tar(
 genrule(
     name = "clicenses_generate",
     srcs = [
+        "//third_party/icu/data:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
@@ -137,6 +138,24 @@ genrule(
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
     ] + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_aws_support": [],
+        "//conditions:default": [
+            "@aws//:LICENSE",
+        ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_gcp_support": [],
+        "//conditions:default": [
+            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
+        ],
+    }) + select({
         "//tensorflow/core/kernels:xsmm": [
             "@libxsmm_archive//:LICENSE.md",
         ],
@@ -150,17 +169,9 @@ genrule(
         "grpc",
         [
             "@grpc//:LICENSE",
-            "@grpc//third_party/nanopb:LICENSE.txt",
             "@grpc//third_party/address_sorting:LICENSE",
         ],
-    ) + select({
-        "//tensorflow:linux_s390x": [],
-        "//tensorflow:windows": [],
-        "//conditions:default": [
-            "@aws//:LICENSE",
-            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
-        ],
-    }),
+    ),
     outs = ["include/tensorflow/c/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
     tools = [":concat_licenses.sh"],
@@ -169,6 +180,7 @@ genrule(
 genrule(
     name = "jnilicenses_generate",
     srcs = [
+        "//third_party/icu/data:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
@@ -193,7 +205,27 @@ genrule(
         "@protobuf_archive//:LICENSE",
         "@snappy//:COPYING",
         "@zlib_archive//:zlib.h",
+        "@grpc//:LICENSE",
+        "@grpc//third_party/address_sorting:LICENSE",
     ] + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_aws_support": [],
+        "//conditions:default": [
+            "@aws//:LICENSE",
+        ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_gcp_support": [],
+        "//conditions:default": [
+            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
+        ],
+    }) + select({
         "//tensorflow/core/kernels:xsmm": [
             "@libxsmm_archive//:LICENSE.md",
         ],
@@ -203,14 +235,7 @@ genrule(
     ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
-    ]) + select({
-        "//tensorflow:linux_s390x": [],
-        "//tensorflow:windows": [],
-        "//conditions:default": [
-            "@aws//:LICENSE",
-            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
-        ],
-    }),
+    ]),
     outs = ["include/tensorflow/jni/LICENSE"],
     cmd = "$(location :concat_licenses.sh) $(SRCS) >$@",
     tools = [":concat_licenses.sh"],
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 164b3d83037304a174415c7982cf829ac8b53660..baacb8723961d0a78b29338f1c4f212e46573b2c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -78,7 +78,6 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
     "//tensorflow/contrib/signal:signal_py",
-    "//tensorflow/contrib/signal:test_util",
     "//tensorflow/contrib/slim:slim",
     "//tensorflow/contrib/slim/python/slim/data:data_pip",
     "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
@@ -89,6 +88,9 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/timeseries:timeseries_pip",
     "//tensorflow/contrib/tpu",
     "//tensorflow/examples/tutorials/mnist:package",
+    "//tensorflow/lite/python:interpreter_test_data",
+    "//tensorflow/lite/python:tflite_convert",
+    "//tensorflow/lite/toco/python:toco_from_protos",
     # "//tensorflow/python/autograph/converters:converters",
     # "//tensorflow/python/autograph/core:core",
     "//tensorflow/python/autograph/core:test_lib",
@@ -109,7 +111,9 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python/data/kernel_tests:test_base",
     "//tensorflow/python/debug:debug_pip",
     "//tensorflow/python/eager:eager_pip",
+    "//tensorflow/python/kernel_tests/signal:test_util",
     "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
+    "//tensorflow/python/ops/ragged:ragged_test_util",
     "//tensorflow/python/saved_model:saved_model",
     "//tensorflow/python/tools:tools_pip",
     "//tensorflow/python/tools/api/generator:create_python_api",
@@ -124,7 +128,7 @@ COMMON_PIP_DEPS = [
 py_binary(
     name = "simple_console_for_windows",
     srcs = ["simple_console_for_windows.py"],
-    data = COMMON_PIP_DEPS,
+    data = COMMON_PIP_DEPS + ["//tensorflow/python:pywrap_tensorflow_import_lib_file"],
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
@@ -132,6 +136,7 @@ py_binary(
 filegroup(
     name = "licenses",
     data = [
+        "//third_party/icu/data:LICENSE",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
@@ -153,6 +158,7 @@ filegroup(
         "@highwayhash//:LICENSE",
         "@icu//:icu4c/LICENSE",
         "@jpeg//:LICENSE.md",
+        "@keras_applications_archive//:LICENSE",
         "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@nasm//:LICENSE",
@@ -167,6 +173,34 @@ filegroup(
         "@zlib_archive//:zlib.h",
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_aws_support": [],
+        "//conditions:default": [
+            "@aws//:LICENSE",
+        ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_gcp_support": [],
+        "//conditions:default": [
+            "@com_github_googleapis_googleapis//:LICENSE",
+            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
+        ],
+    }) + select({
+        "//tensorflow:android": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_s390x": [],
+        "//tensorflow:windows": [],
+        "//tensorflow:no_kafka_support": [],
+        "//conditions:default": [
+            "@kafka//:LICENSE",
+        ],
+    }) + select({
         "//tensorflow/core/kernels:xsmm": [
             "@libxsmm_archive//:LICENSE.md",
         ],
@@ -181,7 +215,6 @@ filegroup(
         "grpc",
         [
             "@grpc//:LICENSE",
-            "@grpc//third_party/nanopb:LICENSE.txt",
             "@grpc//third_party/address_sorting:LICENSE",
         ],
     ) + if_ngraph([
@@ -189,16 +222,7 @@ filegroup(
         "@ngraph_tf//:LICENSE",
         "@nlohmann_json_lib//:LICENSE.MIT",
         "@tbb//:LICENSE",
-    ]) + tf_additional_license_deps() + select({
-        "//tensorflow:linux_s390x": [],
-        "//tensorflow:windows": [],
-        "//conditions:default": [
-            "@aws//:LICENSE",
-            "@com_github_googleapis_googleapis//:LICENSE",
-            "@com_github_googlecloudplatform_google_cloud_cpp//:LICENSE",
-            "@kafka//:LICENSE",
-        ],
-    }),
+    ]) + tf_additional_license_deps(),
 )
 
 sh_binary(
@@ -207,15 +231,9 @@ sh_binary(
     data = select({
         "//tensorflow:windows": [
             ":simple_console_for_windows",
-            "//tensorflow/contrib/lite/python:interpreter_test_data",
-            "//tensorflow/contrib/lite/python:tflite_convert",
-            "//tensorflow/contrib/lite/toco/python:toco_from_protos",
         ],
         "//conditions:default": COMMON_PIP_DEPS + [
             ":simple_console",
-            "//tensorflow/contrib/lite/python:interpreter_test_data",
-            "//tensorflow/contrib/lite/python:tflite_convert",
-            "//tensorflow/contrib/lite/toco/python:toco_from_protos",
         ],
     }) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]),
 )
diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in
index c4b4af93b807ae134573642932c25e760819121b..272ff4735c34b319589bd9302fcdb5cd91b6d1ec 100644
--- a/tensorflow/tools/pip_package/MANIFEST.in
+++ b/tensorflow/tools/pip_package/MANIFEST.in
@@ -1,5 +1,6 @@
 include README
 recursive-include * *.py
+recursive-include * *.pyd
 recursive-include * *.pd
 recursive-include * *.so
 recursive-include * *.dll
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index c62271c5cb17317d273235df11f328211a2df7f7..439b5428b3b7bff651689e08e783bf7875f16319 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -120,7 +120,7 @@ function prepare_src() {
     fi
     mkdir "${TMPDIR}/tensorflow/aux-bin"
     # Install toco as a binary in aux-bin.
-    cp bazel-bin/tensorflow/contrib/lite/python/tflite_convert ${TMPDIR}/tensorflow/aux-bin/
+    cp bazel-bin/tensorflow/lite/python/tflite_convert ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index c6ef82ccdc230503a54479dd01dd80c5903c9ead..ff821b864300c1eeb2f9d290ae47a25ce87a0884 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -37,7 +37,7 @@ def GetBuild(dir_base):
   for root, _, files in os.walk(dir_base):
     for name in files:
       if (name == "BUILD" and
-          root.find("tensorflow/contrib/lite/examples/android") == -1):
+          root.find("tensorflow/lite/examples/android") == -1):
         items.append("//" + root + ":all")
   return items
 
@@ -85,10 +85,14 @@ BLACKLIST = [
     # contrib
     "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
     "//tensorflow/contrib/keras:testing_utils",
-    "//tensorflow/contrib/lite/python:interpreter",
-    "//tensorflow/contrib/lite/python:interpreter_test",
-    "//tensorflow/contrib/lite/python:interpreter.py",
-    "//tensorflow/contrib/lite/python:interpreter_test.py",
+    "//tensorflow/lite/experimental/examples/lstm:tflite_lstm",
+    "//tensorflow/lite/experimental/examples/lstm:tflite_lstm.py",
+    "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test",  # pylint:disable=line-too-long
+    "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test.py",  # pylint:disable=line-too-long
+    "//tensorflow/lite/python:interpreter",
+    "//tensorflow/lite/python:interpreter_test",
+    "//tensorflow/lite/python:interpreter.py",
+    "//tensorflow/lite/python:interpreter_test.py",
     "//tensorflow/contrib/ffmpeg:test_data",
     "//tensorflow/contrib/fused_conv:fused_conv2d_bias_activation_op_test_base",
     "//tensorflow/contrib/hadoop:test_data",
@@ -142,7 +146,7 @@ def main():
 
   missing_dependencies = []
   # File extensions and endings to ignore
-  ignore_extensions = ["_test", "_test.py"]
+  ignore_extensions = ["_test", "_test.py", "_test_gpu", "_test_gpu.py"]
 
   ignored_files = 0
   blacklisted_files = len(BLACKLIST)
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index d864a7a039eca7feb76b4a351cdabaa6899d7bcd..85c913f158863c5ff3718ae3f305829e15237b22 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n')
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.11.0-rc1'
+_VERSION = '1.12.0-rc0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
@@ -56,7 +56,8 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.6.1',
-    'tensorboard >= 1.11.0, < 1.12.0',
+    'tensorboard >= 1.12.0, < 1.13.0',
+    'tensorflow_estimator >= 1.10.0',
     'termcolor >= 1.1.0',
 ]
 
@@ -85,8 +86,9 @@ else:
 if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
-      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.12.0a0, < 1.13.0a0'
-      break
+      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.13.0a0, < 1.14.0a0'
+    if 'tensorflow_estimator' in pkg:
+      REQUIRED_PACKAGES[i] = 'tf-estimator-nightly'
 
 # weakref.finalize and enum were introduced in Python 3.4
 if sys.version_info < (3, 4):
@@ -96,15 +98,16 @@ if sys.version_info < (3, 4):
 # pylint: disable=line-too-long
 CONSOLE_SCRIPTS = [
     'freeze_graph = tensorflow.python.tools.freeze_graph:run_main',
-    'toco_from_protos = tensorflow.contrib.lite.toco.python.toco_from_protos:main',
-    'tflite_convert = tensorflow.contrib.lite.python.tflite_convert:main',
-    'toco = tensorflow.contrib.lite.python.tflite_convert:main',
+    'toco_from_protos = tensorflow.lite.toco.python.toco_from_protos:main',
+    'tflite_convert = tensorflow.lite.python.tflite_convert:main',
+    'toco = tensorflow.lite.python.tflite_convert:main',
     'saved_model_cli = tensorflow.python.tools.saved_model_cli:main',
     # We need to keep the TensorBoard command, even though the console script
     # is now declared by the tensorboard pip package. If we remove the
     # TensorBoard command, pip will inappropriately remove it during install,
     # even though the command is not removed, just moved to a different wheel.
     'tensorboard = tensorboard.main:run_main',
+    'tf_upgrade_v2 = tensorflow.tools.compatibility.tf_upgrade_v2_main:main',
 ]
 # pylint: enable=line-too-long
 
@@ -226,13 +229,14 @@ if os.name == 'nt':
 else:
   EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.so'
 
-headers = (list(find_files('*.h', 'tensorflow/core')) +
-           list(find_files('*.h', 'tensorflow/stream_executor')) +
-           list(find_files('*.h', 'google/protobuf_archive/src')) +
-           list(find_files('*', 'third_party/eigen3')) +
-           list(find_files('*.h',
-                           'tensorflow/include/external/com_google_absl')) +
-           list(find_files('*', 'tensorflow/include/external/eigen_archive')))
+headers = (
+    list(find_files('*.h', 'tensorflow/core')) + list(
+        find_files('*.h', 'tensorflow/stream_executor')) +
+    list(find_files('*.h', 'google/protobuf_archive/src')) + list(
+        find_files('*', 'third_party/eigen3')) + list(
+            find_files('*.h', 'tensorflow/include/external/com_google_absl')) +
+    list(find_files('*.inc', 'tensorflow/include/external/com_google_absl')) +
+    list(find_files('*', 'tensorflow/include/external/eigen_archive')))
 
 setup(
     name=project_name,
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 72f3fd0cf87f6c0c38294675ff4e577be08ad62c..60dcca3207f88f4bba9e0d11c263f657d44ed1b5 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -20,12 +20,25 @@ load(
     "//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl",
     "def_file_filter_configure",
 )
+load("//third_party/aws:workspace.bzl", aws = "repo")
 load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
+load("//third_party/highwayhash:workspace.bzl", highwayhash = "repo")
 load("//third_party/icu:workspace.bzl", icu = "repo")
+load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
+load("//third_party/nasm:workspace.bzl", nasm = "repo")
+load("//third_party/kissfft:workspace.bzl", kissfft = "repo")
+load("//third_party/keras_applications_archive:workspace.bzl", keras_applications = "repo")
 
 def initialize_third_party():
+    """ Load third party repositories.  See above load() statements. """
+    aws()
     flatbuffers()
+    highwayhash()
     icu()
+    keras_applications()
+    kissfft()
+    jpeg()
+    nasm()
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
@@ -64,31 +77,31 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     mkl_repository(
         name = "mkl_linux",
         build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
-        sha256 = "e2233534a9d15c387e22260997af4312a39e9f86f791768409be273b5453c4e6",
-        strip_prefix = "mklml_lnx_2019.0.20180710",
+        sha256 = "f00dc3b142a5be399bdeebd7e7ea369545a35d4fb84c86f98b6b048d72685295",
+        strip_prefix = "mklml_lnx_2019.0.1.20180928",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_lnx_2019.0.20180710.tgz",
-            "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_lnx_2019.0.20180710.tgz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_lnx_2019.0.1.20180928.tgz",
+            "https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_lnx_2019.0.1.20180928.tgz",
         ],
     )
     mkl_repository(
         name = "mkl_windows",
         build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
-        sha256 = "3fdcff17b018a0082491adf3ba143358265336a801646e46e0191ec8d58d24a2",
-        strip_prefix = "mklml_win_2019.0.20180710",
+        sha256 = "efef90b7b9613fab10f44c8ac4ff28db613a112c64ed94826d7e44df09c44b0b",
+        strip_prefix = "mklml_win_2019.0.1.20180928",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_win_2019.0.20180710.zip",
-            "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_win_2019.0.20180710.zip",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_win_2019.0.1.20180928.zip",
+            "https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_win_2019.0.1.20180928.zip",
         ],
     )
     mkl_repository(
         name = "mkl_darwin",
         build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
-        sha256 = "411a30014a938eb83fb9f37b3dbe8e371b106fc1dd621fc23123cadc72737ce6",
-        strip_prefix = "mklml_mac_2019.0.20180710",
+        sha256 = "83f02938a0c095274db7b8b7b694157abafa3837c5cbaef740440d466c86a477",
+        strip_prefix = "mklml_mac_2019.0.1.20180928",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_mac_2019.0.20180710.tgz",
-            "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_mac_2019.0.20180710.tgz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_mac_2019.0.1.20180928.tgz",
+            "https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_mac_2019.0.1.20180928.tgz",
         ],
     )
 
@@ -99,33 +112,33 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "mkl_dnn",
         build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
-        sha256 = "363cc9239eacf8e7917753c6d8c94f767e4cd049160d0654a61ef32d5e1b3049",
-        strip_prefix = "mkl-dnn-4e333787e0d66a1dca1218e99a891d493dbc8ef1",
+        sha256 = "b100f57af4a2b59a3a37a1ba38f77b644d2107d758a1a7f4e51310063cd21e73",
+        strip_prefix = "mkl-dnn-733fc908874c71a5285043931a1cf80aa923165c",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz",
-            "https://github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/733fc908874c71a5285043931a1cf80aa923165c.tar.gz",
+            "https://github.com/intel/mkl-dnn/archive/733fc908874c71a5285043931a1cf80aa923165c.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "507903ef9353cb25cccd0a6840048fdd348fd20e98314d694f04a990c0f277e3",
-        strip_prefix = "abseil-cpp-f21d187b80e3b7f08fb279775ea9c8b48c636030",
+        sha256 = "3ad76de484192b2d5afd49d90492b5ed0bc59eb1a4e8e0deecc7a2a077a90251",
+        strip_prefix = "abseil-cpp-f197d7c72a54064cfde5a2058f1513a4a0ee36fb",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f21d187b80e3b7f08fb279775ea9c8b48c636030.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/f21d187b80e3b7f08fb279775ea9c8b48c636030.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f197d7c72a54064cfde5a2058f1513a4a0ee36fb.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/f197d7c72a54064cfde5a2058f1513a4a0ee36fb.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
-        sha256 = "d956415d784fa4e42b6a2a45c32556d6aec9d0a3d8ef48baee2522ab762556a9",
-        strip_prefix = "eigen-eigen-fd6845384b86",
+        sha256 = "aae7a680d141c978301dfae2c7945c06039f65849fcf64269595a9cdbba82638",
+        strip_prefix = "eigen-eigen-729d33d11c81",
         urls = [
-            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/fd6845384b86.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/fd6845384b86.tar.gz",
+            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/729d33d11c81.tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/729d33d11c81.tar.gz",
         ],
     )
 
@@ -153,39 +166,28 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
-    tf_http_archive(
-        name = "ortools_archive",
-        build_file = clean_dep("//third_party:ortools.BUILD"),
-        sha256 = "d025a95f78b5fc5eaa4da5f395f23d11c23cf7dbd5069f1f627f002de87b86b9",
-        strip_prefix = "or-tools-6.7.2/src",
-        urls = [
-            "https://mirror.bazel.build/github.com/google/or-tools/archive/v6.7.2.tar.gz",
-            "https://github.com/google/or-tools/archive/v6.7.2.tar.gz",
-        ],
-    )
-
     tf_http_archive(
         name = "com_googlesource_code_re2",
-        sha256 = "803c7811146edeef8f91064de37c6f19136ff01a2a8cdb3230e940b2fd9f07fe",
-        strip_prefix = "re2-2018-07-01",
+        sha256 = "a31397714a353587413d307337d0b58f8a2e20e2b9d02f2e24e3463fa4eeda81",
+        strip_prefix = "re2-2018-10-01",
         system_build_file = clean_dep("//third_party/systemlibs:re2.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/google/re2/archive/2018-07-01.tar.gz",
-            "https://github.com/google/re2/archive/2018-07-01.tar.gz",
+            "https://mirror.bazel.build/github.com/google/re2/archive/2018-10-01.tar.gz",
+            "https://github.com/google/re2/archive/2018-10-01.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_github_googlecloudplatform_google_cloud_cpp",
-        sha256 = "fdd3b3aecce60987e5525e55bf3a21d68a8695320bd5b980775af6507eec3944",
-        strip_prefix = "google-cloud-cpp-14760a86c4ffab9943b476305c4fe927ad95db1c",
+        sha256 = "3ade2072e6588ff56c0434abe6c63aa5f3f2d56be15a299bafc7e9cdf0a12c17",
+        strip_prefix = "google-cloud-cpp-0.3.0",
         system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD",
         },
         urls = [
-            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/14760a86c4ffab9943b476305c4fe927ad95db1c.tar.gz",
-            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/14760a86c4ffab9943b476305c4fe927ad95db1c.tar.gz",
+            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.3.0.tar.gz",
+            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.3.0.tar.gz",
         ],
     )
 
@@ -222,42 +224,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
-    tf_http_archive(
-        name = "highwayhash",
-        build_file = clean_dep("//third_party:highwayhash.BUILD"),
-        sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37",
-        strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968",
-        urls = [
-            "http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
-            "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
-        ],
-    )
-
-    tf_http_archive(
-        name = "nasm",
-        build_file = clean_dep("//third_party:nasm.BUILD"),
-        sha256 = "63ec86477ad3f0f6292325fd89e1d93aea2e2fd490070863f17d48f7cd387011",
-        strip_prefix = "nasm-2.13.03",
-        system_build_file = clean_dep("//third_party/systemlibs:nasm.BUILD"),
-        urls = [
-            "https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
-            "http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.13.03.tar.bz2/sha512/d7a6b4cee8dfd603d8d4c976e5287b5cc542fa0b466ff989b743276a6e28114e64289bf02a7819eca63142a5278aa6eed57773007e5f589e15768e6456a8919d/nasm-2.13.03.tar.bz2",
-            "http://www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
-        ],
-    )
-
-    tf_http_archive(
-        name = "jpeg",
-        build_file = clean_dep("//third_party/jpeg:jpeg.BUILD"),
-        sha256 = "f892fff427ab3adffc289363eac26d197ce3ccacefe5f5822377348a8166069b",
-        strip_prefix = "libjpeg-turbo-2.0.0",
-        system_build_file = clean_dep("//third_party/systemlibs:jpeg.BUILD"),
-        urls = [
-            "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
-            "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
-        ],
-    )
-
     tf_http_archive(
         name = "png_archive",
         build_file = clean_dep("//third_party:png.BUILD"),
@@ -373,24 +339,28 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         name = "org_python_license",
         licenses = ["notice"],  # Python 2.0
         sha256_urls = {
-            "b5556e921715ddb9242c076cae3963f483aa47266c5e37ea4c187f77cc79501c": [
-                "https://mirror.bazel.build/docs.python.org/2.7/_sources/license.txt",
-                "https://docs.python.org/2.7/_sources/license.txt",
+            "7ca8f169368827781684f7f20876d17b4415bbc5cb28baa4ca4652f0dda05e9f": [
+                "https://mirror.bazel.build/docs.python.org/2.7/_sources/license.rst.txt",
+                "https://docs.python.org/2.7/_sources/license.rst.txt",
             ],
         },
     )
 
     PROTOBUF_URLS = [
-        "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.0.tar.gz",
-        "https://github.com/google/protobuf/archive/v3.6.0.tar.gz",
+        "https://mirror.bazel.build/github.com/protocolbuffers/protobuf/archive/v3.6.1.2.tar.gz",
+        "https://github.com/protocolbuffers/protobuf/archive/v3.6.1.2.tar.gz",
     ]
-    PROTOBUF_SHA256 = "50a5753995b3142627ac55cfd496cebc418a2e575ca0236e29033c67bd5665f4"
-    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.0"
+    PROTOBUF_SHA256 = "2244b0308846bb22b4ff0bcc675e99290ff9f1115553ae9671eba1030af31bc0"
+    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.1.2"
 
     tf_http_archive(
         name = "protobuf_archive",
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"),
+        system_link_files = {
+            "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl",
+        },
         urls = PROTOBUF_URLS,
     )
 
@@ -401,6 +371,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         name = "com_google_protobuf",
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"),
+        system_link_files = {
+            "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl",
+        },
         urls = PROTOBUF_URLS,
     )
 
@@ -408,6 +382,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         name = "com_google_protobuf_cc",
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"),
+        system_link_files = {
+            "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl",
+        },
         urls = PROTOBUF_URLS,
     )
 
@@ -479,14 +457,26 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
+    # WARNING: make sure ncteisen@ and vpai@ are cc-ed on any CL to change the below rule
     tf_http_archive(
         name = "grpc",
-        sha256 = "50db9cf2221354485eb7c3bd55a4c27190caef7048a2a1a15fbe60a498f98b44",
-        strip_prefix = "grpc-1.13.0",
+        sha256 = "1aa84387232dda273ea8fdfe722622084f72c16f7b84bfc519ac7759b71cdc91",
+        strip_prefix = "grpc-69b6c047bc767b4d80e7af4d00ccb7c45b683dae",
         system_build_file = clean_dep("//third_party/systemlibs:grpc.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/grpc/grpc/archive/v1.13.0.tar.gz",
-            "https://github.com/grpc/grpc/archive/v1.13.0.tar.gz",
+            "https://mirror.bazel.build/github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
+            "https://github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
+        ],
+    )
+
+    tf_http_archive(
+        name = "com_github_nanopb_nanopb",
+        sha256 = "8bbbb1e78d4ddb0a1919276924ab10d11b631df48b657d960e0c795a25515735",
+        build_file = "@grpc//third_party:nanopb.BUILD",
+        strip_prefix = "nanopb-f8ac463766281625ad710900479130c7fcb4d63b",
+        urls = [
+            "https://mirror.bazel.build/github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz",
+            "https://github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz",
         ],
     )
 
@@ -506,11 +496,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "a4f8bfe7e3e69069934a87e612a1d4d3b8b6af13e0f1213a42a6046e1bcd50d8",
-        strip_prefix = "llvm-d3429e96fe1e45b1dc0106463832523f37faf271",
+        sha256 = "34170a4aa07e434dd537d98a705dcf1b3901f73820fe1d6b9370e8c1c94e9157",
+        strip_prefix = "llvm-0487bd8f42c8b38166ff825d56014d0ff49db604",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/d3429e96fe1e45b1dc0106463832523f37faf271.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/d3429e96fe1e45b1dc0106463832523f37faf271.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/0487bd8f42c8b38166ff825d56014d0ff49db604.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/0487bd8f42c8b38166ff825d56014d0ff49db604.tar.gz",
         ],
     )
 
@@ -585,12 +575,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "nccl_archive",
-        build_file = clean_dep("//third_party:nccl/nccl_archive.BUILD"),
-        sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
-        strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
+        build_file = clean_dep("//third_party:nccl/archive.BUILD"),
+        sha256 = "19132b5127fa8e02d95a09795866923f04064c8f1e0770b2b42ab551408882a4",
+        strip_prefix = "nccl-f93fe9bfd94884cec2ba711897222e0df5569a53",
         urls = [
-            "https://mirror.bazel.build/github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
-            "https://github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
+            "https://mirror.bazel.build/github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
+            "https://github.com/nvidia/nccl/archive/f93fe9bfd94884cec2ba711897222e0df5569a53.tar.gz",
         ],
     )
 
@@ -606,17 +596,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
-    tf_http_archive(
-        name = "aws",
-        build_file = clean_dep("//third_party:aws.BUILD"),
-        sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
-        strip_prefix = "aws-sdk-cpp-1.3.15",
-        urls = [
-            "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
-            "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
-        ],
-    )
-
     java_import_external(
         name = "junit",
         jar_sha256 = "59721f0805e223d84b90677887d9ff567dc534d7c502ca903c0c2b17f05c116a",
@@ -723,22 +702,22 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "bazel_toolchains",
-        sha256 = "3b604699685c5c65dd3f6f17425570a4b2f00ddba2f750db15acc72e55bb098b",
-        strip_prefix = "bazel-toolchains-37acf1841ab1475c98a152cb9e446460c8ae29e1",
+        sha256 = "07dfbe80638eb1fe681f7c07e61b34b579c6710c691e49ee90ccdc6e9e75ebbb",
+        strip_prefix = "bazel-toolchains-9a111bd82161c1fbe8ed17a593ca1023fd941c70",
         urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/37acf1841ab1475c98a152cb9e446460c8ae29e1.tar.gz",
-            "https://github.com/bazelbuild/bazel-toolchains/archive/37acf1841ab1475c98a152cb9e446460c8ae29e1.tar.gz",
+            "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/9a111bd82161c1fbe8ed17a593ca1023fd941c70.tar.gz",
+            "https://github.com/bazelbuild/bazel-toolchains/archive/9a111bd82161c1fbe8ed17a593ca1023fd941c70.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "arm_neon_2_x86_sse",
         build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
-        sha256 = "c8d90aa4357f8079d427e87a6f4c493da1fa4140aee926c05902d7ec1533d9a5",
-        strip_prefix = "ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d",
+        sha256 = "213733991310b904b11b053ac224fee2d4e0179e46b52fe7f8735b8831e04dcc",
+        strip_prefix = "ARM_NEON_2_x86_SSE-1200fe90bb174a6224a525ee60148671a786a71f",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
-            "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
+            "https://mirror.bazel.build/github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz",
+            "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz",
         ],
     )
 
@@ -755,12 +734,22 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     )
 
     tf_http_archive(
-        name = "tflite_mobilenet",
-        build_file = clean_dep("//third_party:tflite_mobilenet.BUILD"),
-        sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b",
+        name = "tflite_mobilenet_float",
+        build_file = clean_dep("//third_party:tflite_mobilenet_float.BUILD"),
+        sha256 = "2fadeabb9968ec6833bee903900dda6e61b3947200535874ce2fe42a8493abc0",
+        urls = [
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz",
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz",
+        ],
+    )
+
+    tf_http_archive(
+        name = "tflite_mobilenet_quant",
+        build_file = clean_dep("//third_party:tflite_mobilenet_quant.BUILD"),
+        sha256 = "d32432d28673a936b2d6281ab0600c71cf7226dfe4cdcef3012555f691744166",
         urls = [
-            "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
-            "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
         ],
     )
 
@@ -818,11 +807,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "tflite_ovic_testdata",
         build_file = clean_dep("//third_party:tflite_ovic_testdata.BUILD"),
-        sha256 = "a9a705d8d519220178e2e65d383fdb21da37fdb31d1e909b0a1acdac46479e9c",
+        sha256 = "21288dccc517acee47fa9648d4d3da28bf0fef5381911ed7b4d2ee36366ffa20",
         strip_prefix = "ovic",
         urls = [
-            "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/data/ovic.zip",
-            "https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip",
+            "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/data/ovic_2018_10_23.zip",
+            "https://storage.googleapis.com/download.tensorflow.org/data/ovic_2018_10_23.zip",
         ],
     )
 
@@ -850,11 +839,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "ngraph",
         build_file = clean_dep("//third_party/ngraph:ngraph.BUILD"),
-        sha256 = "bf9dcc88e5c66021e3aac80491a231711211540d613bf9b6bd28db3f5bb86b62",
-        strip_prefix = "ngraph-0.8.1",
+        sha256 = "2b28f9c9f063b96825a96d56d7f7978c9a1c55c9b25175c20dd49a8a77cb0305",
+        strip_prefix = "ngraph-0.9.1",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
-            "https://github.com/NervanaSystems/ngraph/archive/v0.8.1.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
+            "https://github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
         ],
     )
 
@@ -872,11 +861,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "ngraph_tf",
         build_file = clean_dep("//third_party/ngraph:ngraph_tf.BUILD"),
-        sha256 = "402f84c748c113780a60f35f39aab118435285543aee4900d712b76fbf8a21ee",
-        strip_prefix = "ngraph-tf-0.6.1",
+        sha256 = "89accbc702e68a09775f1011a99dd16561038fd1ce59d566d64450176abaae5c",
+        strip_prefix = "ngraph-tf-0.7.0",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
-            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.6.1.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
+            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
         ],
     )
 
@@ -891,7 +880,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # important since we have set GRPC_ARES=0 in .bazelrc
     native.bind(
         name = "cares",
-        actual = "@grpc//third_party/nanopb:nanopb",
+        actual = "@com_github_nanopb_nanopb//:nanopb",
     )
 
     # Needed by Protobuf
@@ -923,7 +912,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # Needed by gRPC
     native.bind(
         name = "nanopb",
-        actual = "@grpc//third_party/nanopb:nanopb",
+        actual = "@com_github_nanopb_nanopb//:nanopb",
     )
 
     # Needed by gRPC
diff --git a/third_party/aws/BUILD b/third_party/aws/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2f5d02becb930602574c4df02c51cec7662bc25d
--- /dev/null
+++ b/third_party/aws/BUILD
@@ -0,0 +1 @@
+# Dummy BUILD file to make this directory a package.
diff --git a/third_party/aws.BUILD b/third_party/aws/BUILD.bazel
similarity index 100%
rename from third_party/aws.BUILD
rename to third_party/aws/BUILD.bazel
diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..c2166381549a5cf6fb44912081ae9479bff69645
--- /dev/null
+++ b/third_party/aws/workspace.bzl
@@ -0,0 +1,15 @@
+"""loads the aws library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "aws",
+        urls = [
+            "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
+            "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
+        ],
+        sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
+        strip_prefix = "aws-sdk-cpp-1.3.15",
+        build_file = "//third_party/aws:BUILD.bazel",
+    )
diff --git a/third_party/clang_toolchain/download_clang.bzl b/third_party/clang_toolchain/download_clang.bzl
index e782739661396854bdfc0be1356b30fd98451d2f..7ced9027473e39ad9870ce138b64c7f7ec64ad01 100644
--- a/third_party/clang_toolchain/download_clang.bzl
+++ b/third_party/clang_toolchain/download_clang.bzl
@@ -39,15 +39,15 @@ def download_clang(repo_ctx, out_folder):
 
     # Latest CLANG_REVISION and CLANG_SUB_REVISION of the Chromiums's release
     # can be found in https://chromium.googlesource.com/chromium/src/tools/clang/+/master/scripts/update.py
-    CLANG_REVISION = "340427"
+    CLANG_REVISION = "347933"
     CLANG_SUB_REVISION = 1
 
     package_version = "%s-%s" % (CLANG_REVISION, CLANG_SUB_REVISION)
 
     checksums = {
-        "Linux_x64": "8a8f21fb624fc7be7e91e439a13114847185375bb932db51ba590174ecaf764b",
-        "Mac": "ba894536b7c8d37103a5ddba784f268d55e65bb2ea1200a2cf9f2ef1590eaacd",
-        "Win": "c3f5bd977266dfd011411c94a13e00974b643b70fb0225a5fb030f7f703fa474",
+        "Linux_x64": "cae3643fdf5d46fc9bc8731212bb37573547148d90b64b083165e090133d11b0",
+        "Mac": "083a0e91a38c06e568652313ac7372b17a101268f7d65533d721ca30413442b4",
+        "Win": "43160487cfc7e88076a369a2b6e8e4a0f42e104c28d8903f3aaa62d630aba949",
     }
 
     platform_folder = _get_platform_folder(repo_ctx.os.name)
diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD
index 759f8a9be92e14537d334c3ec37f036d369d8796..194a2272d5489c6e193dbae4b96e23ab3290c77a 100644
--- a/third_party/eigen.BUILD
+++ b/third_party/eigen.BUILD
@@ -65,6 +65,7 @@ cc_library(
         # code. We use it, but we do not rely on it, as evidenced above.
         "EIGEN_MPL2_ONLY",
         "EIGEN_MAX_ALIGN_BYTES=64",
+        "EIGEN_HAS_TYPE_TRAITS=0",
     ],
     includes = ["."],
     visibility = ["//visibility:public"],
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
index 5ab36649187a41507f1201804090a801d7f639f9..ff359cedced9610f423d899b3a95b2f8d5f8bba5 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
@@ -249,9 +249,7 @@ EIGEN_STRONG_INLINE QInt32& operator/=(QInt32& a, const QInt32 b) {
   a.value /= b.value;
   return a;
 }
-EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) {
-  return -a.value;
-}
+EIGEN_STRONG_INLINE QInt32 operator-(const QInt32 a) { return -a.value; }
 
 // Scaling QInt32 by double. We do the arithmetic in double because
 // float only has 23 bits of mantissa, so casting QInt32 to float might reduce
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
index e6f4080ae127a93fc7830a8dcded1b74f581188f..8477933e1baebaddf209a9c6c07fa1100d6b10cc 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
@@ -15,11 +15,9 @@ namespace internal {
 
 // Accumulate the product of 2 QInt8 inputs on 32 bits to prevent
 // overflows
-template<> struct scalar_product_traits<QInt8, QInt8>
-{
-  enum {
-    Defined = 1
-  };
+template <>
+struct scalar_product_traits<QInt8, QInt8> {
+  enum { Defined = 1 };
   typedef QInt32 ReturnType;
 };
 
@@ -33,11 +31,9 @@ struct scalar_product_traits<QInt16, QInt16> {
 
 // Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits
 // to prevent overflows
-template<> struct scalar_product_traits<QInt8, QUInt8>
-{
-  enum {
-    Defined = 1
-  };
+template <>
+struct scalar_product_traits<QInt8, QUInt8> {
+  enum { Defined = 1 };
   typedef QInt32 ReturnType;
 };
 
@@ -47,14 +43,16 @@ template<> struct scalar_product_traits<QInt8, QUInt8>
 // signed 8bit integers
 #ifndef EIGEN_USE_OPTIMIZED_INT8_INT8_MAT_MAT_PRODUCT
 
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt8, QInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QInt8 LhsScalar;
   typedef QInt8 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // register block size along the M and N directions
     // One for the current implementation
@@ -68,22 +66,24 @@ public:
 };
 
 // The signed 8bit Mat-Mat product itself.
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -113,18 +113,19 @@ void gebp_kernel<QInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjugat
 }
 #endif
 
-
 // This definition tackle the case where the lhs is encoded using signed 8bit
 // integers and the rhs using unsigned 8bit integers.
 #ifndef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QInt8 LhsScalar;
   typedef QUInt8 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // register block size along the M and N directions
     // One for the current implementation
@@ -138,22 +139,24 @@ public:
 };
 
 // Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QUInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -183,18 +186,19 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
 }
 #endif
 
-
 // This definition tackle the case where the khs is encoded using unsigned 8bit
 // integers and the rhs using signed 8bit integers.
 #ifndef EIGEN_USE_OPTIMIZED_UINT8_INT8_MAT_MAT_PRODUCT
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QUInt8, QInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QUInt8, QInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QUInt8 LhsScalar;
   typedef QInt8 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // register block size along the M and N directions
     // One for the current implementation
@@ -207,24 +211,25 @@ public:
   };
 };
 
-
 // Mat-Mat product of an unsigned 8bit lhs with a signed 8bit rhs
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QUInt8* blockA,
+                  const QInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QUInt8* blockA, const QInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -263,6 +268,9 @@ class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
   typedef QInt16 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // register block size along the M and N directions
     // One for the current implementation
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
index 66532fb60028789df7495bc54c833622187e79bf..8547dca1b32eb2d11b27b7854cb8ff77efe0a31e 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
@@ -28,6 +28,9 @@ class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
   typedef QInt16 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // Define register blocking scheme.
     nr = 16,
@@ -43,7 +46,7 @@ class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
 // Used by TensorContractionThreadPool, inputs must have dimensions that are
 // multiples of 32.
 template <typename Index, int ShardingType>
-class TensorContractionBlocking<QInt16, QInt16, Index, ShardingType> {
+class TensorContractionBlocking<QInt16, QInt16, QInt16, Index, ShardingType> {
  public:
   TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
       : kc_(((k + 15) / 16) * 16),
@@ -144,7 +147,7 @@ class gemm_blocking_space<ColMajor, QInt16, QInt16, MaxRows, MaxCols, MaxDepth,
 
 template <typename Index, typename DataMapper, int Pack1, int Pack2,
           bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2, ColMajor,
+struct gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2, QInt16, ColMajor,
                      Conjugate, PanelMode> {
   EIGEN_DONT_INLINE void operator()(QInt16* blockA, const DataMapper& lhs,
                                     Index depth, Index rows, Index stride = 0,
@@ -154,12 +157,14 @@ struct gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2, ColMajor,
 template <typename Index, typename DataMapper, int Pack1, int Pack2,
           bool Conjugate, bool PanelMode>
 EIGEN_DONT_INLINE void gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2,
-                                     ColMajor, Conjugate, PanelMode>::
+                                     QInt16, ColMajor, Conjugate, PanelMode>::
 operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows,
            Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QInt16>::type Packet;
+
   // Use alternate function for weird sizes
   if (rows % 16 != 0 || depth % 16 != 0) {
     assert(false &&
@@ -178,10 +183,10 @@ operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows,
     // Pack depth in sets of 4
     for (Index k = 0; k < depth; k += 4) {
       // Load vectors
-      __m256i L_A = lhs.loadPacket(m, k);
-      __m256i L_B = lhs.loadPacket(m, k + 1);
-      __m256i L_C = lhs.loadPacket(m, k + 2);
-      __m256i L_D = lhs.loadPacket(m, k + 3);
+      __m256i L_A = lhs.template loadPacket<Packet>(m, k);
+      __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
+      __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
+      __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
 
       // Rearrange the inputs as required by the kernel
       __m256i L_AB0_AB7 = _mm256_unpacklo_epi16(L_A, L_B);
@@ -236,13 +241,15 @@ struct gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor, Conjugate,
 
 template <typename Index, typename DataMapper, int nr, bool Conjugate,
           bool PanelMode>
-EIGEN_DONT_INLINE void
-gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::
+EIGEN_DONT_INLINE void gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor,
+                                     Conjugate, PanelMode>::
 operator()(QInt16* blockB, const DataMapper& rhs, Index depth, Index cols,
            Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QInt16>::type Packet;
+
   // Use alternate function for weird sizes
   if (cols % 16 != 0 || depth % 16 != 0) {
     assert(false &&
@@ -277,28 +284,28 @@ operator()(QInt16* blockB, const DataMapper& rhs, Index depth, Index cols,
   for (Index n = 0; n < cols; n += 16) {
     // Pack depth in sets of 16
     for (Index k = 0; k < depth; k += 16) {
-      __m256i R_A = rhs.loadPacket(k, n);
-      __m256i R_B = rhs.loadPacket(k, n + 1);
-      __m256i R_C = rhs.loadPacket(k, n + 2);
-      __m256i R_D = rhs.loadPacket(k, n + 3);
+      __m256i R_A = rhs.template loadPacket<Packet>(k, n);
+      __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
+      __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
+      __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 4);
-      R_B = rhs.loadPacket(k, n + 5);
-      R_C = rhs.loadPacket(k, n + 6);
-      R_D = rhs.loadPacket(k, n + 7);
+      R_A = rhs.template loadPacket<Packet>(k, n + 4);
+      R_B = rhs.template loadPacket<Packet>(k, n + 5);
+      R_C = rhs.template loadPacket<Packet>(k, n + 6);
+      R_D = rhs.template loadPacket<Packet>(k, n + 7);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 8);
-      R_B = rhs.loadPacket(k, n + 9);
-      R_C = rhs.loadPacket(k, n + 10);
-      R_D = rhs.loadPacket(k, n + 11);
+      R_A = rhs.template loadPacket<Packet>(k, n + 8);
+      R_B = rhs.template loadPacket<Packet>(k, n + 9);
+      R_C = rhs.template loadPacket<Packet>(k, n + 10);
+      R_D = rhs.template loadPacket<Packet>(k, n + 11);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 12);
-      R_B = rhs.loadPacket(k, n + 13);
-      R_C = rhs.loadPacket(k, n + 14);
-      R_D = rhs.loadPacket(k, n + 15);
+      R_A = rhs.template loadPacket<Packet>(k, n + 12);
+      R_B = rhs.template loadPacket<Packet>(k, n + 13);
+      R_C = rhs.template loadPacket<Packet>(k, n + 14);
+      R_D = rhs.template loadPacket<Packet>(k, n + 15);
       PACK_STEP;
 
       blockB_256 += 12;
@@ -476,9 +483,13 @@ operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
       for (Index j = n; j < n + 16; j++) {
         LinearMapper r0 = res.getLinearMapper(m, j);
         LinearMapper r1 = res.getLinearMapper(m + 8, j);
-
-        r0.storePacket(0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0)));
-        r1.storePacket(0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0)));
+        typedef typename packet_traits<QInt32>::type Packet;
+        r0.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r0.template loadPacket<Packet>(0)));
+        r1.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r1.template loadPacket<Packet>(0)));
       }
 
       // Zero the result block so it can be reused
@@ -496,14 +507,16 @@ operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
 #ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
 
 // Define quantized traits
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QInt8 LhsScalar;
   typedef QUInt8 RhsScalar;
   typedef QInt32 ResScalar;
 
+  typedef typename packet_traits<LhsScalar>::type LhsPacket;
+  typedef LhsPacket LhsPacket4Packing;
+
   enum {
     // Define register blocking scheme.
     nr = 32,
@@ -518,22 +531,28 @@ public:
 // Specialized blocking for quantized implementations.
 // Used by TensorContractionThreadPool, inputs must have dimensions that are
 // multiples of 32.
-template<typename Index,
-         typename LeftTensor,
-         typename left_nocontract_t, typename left_contract_t,
-         bool left_inner_dim_contiguous, bool left_inner_dim_reordered, int LeftAlignment,
-         typename RightTensor,
-         typename right_nocontract_t, typename right_contract_t,
-         bool right_inner_dim_contiguous, bool right_inner_dim_reordered, int RightAlignment, int ShardingType>
-class TensorContractionBlocking<TensorContractionInputMapper<QInt8, Index, Lhs, LeftTensor, left_nocontract_t, left_contract_t, 32, left_inner_dim_contiguous, left_inner_dim_reordered, LeftAlignment>, TensorContractionInputMapper<QUInt8, Index, Rhs, RightTensor, right_nocontract_t, right_contract_t, 32, right_inner_dim_contiguous, right_inner_dim_reordered, RightAlignment>, Index, ShardingType> {
+template <typename ResScalar, typename Index, typename LeftTensor,
+          typename left_nocontract_t, typename left_contract_t,
+          bool left_inner_dim_contiguous, bool left_inner_dim_reordered,
+          int LeftAlignment, typename RightTensor, typename right_nocontract_t,
+          typename right_contract_t, bool right_inner_dim_contiguous,
+          bool right_inner_dim_reordered, int RightAlignment, int ShardingType>
+class TensorContractionBlocking<
+    ResScalar,
+    TensorContractionInputMapper<
+        QInt8, Index, Lhs, LeftTensor, left_nocontract_t, left_contract_t, 32,
+        left_inner_dim_contiguous, left_inner_dim_reordered, LeftAlignment>,
+    TensorContractionInputMapper<QUInt8, Index, Rhs, RightTensor,
+                                 right_nocontract_t, right_contract_t, 32,
+                                 right_inner_dim_contiguous,
+                                 right_inner_dim_reordered, RightAlignment>,
+    Index, ShardingType> {
  public:
-
-  typedef QInt8  LhsScalar;
+  typedef QInt8 LhsScalar;
   typedef QUInt8 RhsScalar;
 
-  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
-      kc_(k), mc_(m), nc_(n)
-  {
+  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
+      : kc_(k), mc_(m), nc_(n) {
     eigen_assert(m % 32 == 0);
     eigen_assert(k % 32 == 0);
     if (!k || !m || !n) {
@@ -543,8 +562,7 @@ class TensorContractionBlocking<TensorContractionInputMapper<QInt8, Index, Lhs,
     if (ShardingType == ShardByCol) {
       eigen_assert(n % 32 == 0);
       nc_ = (((n / num_threads) + 31) / 32) * 32;
-    }
-    else {
+    } else {
       eigen_assert(n % 32 == 0 || n == 1);
       // Special case to avoid breaking the unimplemented matrix-vector case
       if (n == 1) {
@@ -599,7 +617,6 @@ class gemm_blocking_space<ColMajor, QInt8, QInt8, MaxRows, MaxCols, MaxDepth,
   }
 };
 
-
 template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
 class gemm_blocking_space<ColMajor, QInt8, QUInt8, MaxRows, MaxCols, MaxDepth,
                           KcFactor, false>
@@ -633,42 +650,60 @@ class gemm_blocking_space<ColMajor, QInt8, QUInt8, MaxRows, MaxCols, MaxDepth,
 };
 
 // Alternate templates for any input sizes
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template <typename Scalar, typename Index, typename DataMapper, int Pack1,
+          int Pack2, int StorageOrder, bool Conjugate = false,
+          bool PanelMode = false>
 struct gemm_pack_lhs_any;
-template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> {
-  EIGEN_DONT_INLINE void operator()
-      (QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2,
+          bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
+                         Conjugate, PanelMode> {
+  EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs,
+                                    Index depth, Index rows, Index stride = 0,
+                                    Index offset = 0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
+template <typename Scalar, typename Index, typename DataMapper, int nr,
+          int StorageOrder, bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_rhs_any;
-template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
-  EIGEN_DONT_INLINE void operator()
-      (QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate,
+          bool PanelMode>
+struct gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
+                         PanelMode> {
+  EIGEN_DONT_INLINE void operator()(QUInt8* blockB, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0);
 };
 
-template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+template <typename LhsScalar, typename RhsScalar, typename Index,
+          typename DataMapper, int mr, int nr, bool ConjugateLhs = false,
+          bool ConjugateRhs = false>
 struct gebp_kernel_any;
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                       ConjugateRhs> {
   typedef typename DataMapper::LinearMapper LinearMapper;
 
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QUInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
 // Alternate implementations for any input sizes
-template <typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>::
-operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+template <typename Index, typename DataMapper, int Pack1, int Pack2,
+          bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2,
+                                         ColMajor, Conjugate, PanelMode>::
+operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
+           Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QInt8>::type Packet;
+
   // Get vector pointer
   __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
 
@@ -690,15 +725,15 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
     // Pack depth in sets of 8
     for (Index k = 0; k < depth_8; k += 8) {
       // Load vectors
-      __m256i L_A = lhs.loadPacket(m, k);
-      __m256i L_B = lhs.loadPacket(m, k + 1);
+      __m256i L_A = lhs.template loadPacket<Packet>(m, k);
+      __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
 
       // Interleave 8-bit elements
       __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
       __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
 
-      __m256i L_C = lhs.loadPacket(m, k + 2);
-      __m256i L_D = lhs.loadPacket(m, k + 3);
+      __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
+      __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
       __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
       __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
 
@@ -719,12 +754,12 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
       _mm256_store_si256(blockA_256++, L_AD16);
       __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
       _mm256_store_si256(blockA_256++, L_AD24);
-      __m256i L_E = lhs.loadPacket(m, k + 4);
-      __m256i L_F = lhs.loadPacket(m, k + 5);
+      __m256i L_E = lhs.template loadPacket<Packet>(m, k + 4);
+      __m256i L_F = lhs.template loadPacket<Packet>(m, k + 5);
       __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
       __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
-      __m256i L_G = lhs.loadPacket(m, k + 6);
-      __m256i L_H = lhs.loadPacket(m, k + 7);
+      __m256i L_G = lhs.template loadPacket<Packet>(m, k + 6);
+      __m256i L_H = lhs.template loadPacket<Packet>(m, k + 7);
       __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
       __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
       __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
@@ -745,76 +780,76 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
     if (depth_8 < depth) {
       __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
       switch (depth - depth_8) {
-      case 1:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 2:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 3:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 4:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = lhs.loadPacket(m, depth_8 + 3);
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 5:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = lhs.loadPacket(m, depth_8 + 3);
-        L_E = lhs.loadPacket(m, depth_8 + 4);
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 6:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = lhs.loadPacket(m, depth_8 + 3);
-        L_E = lhs.loadPacket(m, depth_8 + 4);
-        L_F = lhs.loadPacket(m, depth_8 + 5);
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        break;
-      case 7:
-        L_A = lhs.loadPacket(m, depth_8);
-        L_B = lhs.loadPacket(m, depth_8 + 1);
-        L_C = lhs.loadPacket(m, depth_8 + 2);
-        L_D = lhs.loadPacket(m, depth_8 + 3);
-        L_E = lhs.loadPacket(m, depth_8 + 4);
-        L_F = lhs.loadPacket(m, depth_8 + 5);
-        L_G = lhs.loadPacket(m, depth_8 + 6);
-        L_H = _mm256_setzero_si256();
-        break;
+        case 1:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 2:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 3:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 4:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 5:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
+          L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 6:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
+          L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
+          L_F = lhs.template loadPacket<Packet>(m, depth_8 + 5);
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          break;
+        case 7:
+          L_A = lhs.template loadPacket<Packet>(m, depth_8);
+          L_B = lhs.template loadPacket<Packet>(m, depth_8 + 1);
+          L_C = lhs.template loadPacket<Packet>(m, depth_8 + 2);
+          L_D = lhs.template loadPacket<Packet>(m, depth_8 + 3);
+          L_E = lhs.template loadPacket<Packet>(m, depth_8 + 4);
+          L_F = lhs.template loadPacket<Packet>(m, depth_8 + 5);
+          L_G = lhs.template loadPacket<Packet>(m, depth_8 + 6);
+          L_H = _mm256_setzero_si256();
+          break;
       }
 
       // Interleave 8-bit elements
@@ -875,21 +910,21 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
       __m256i L_G = _mm256_setzero_si256();
       __m256i L_H = _mm256_setzero_si256();
       for (Index m = 0; m < rows - rows_32; m++) {
-        QInt8* ptr = (QInt8*) &L_A;
+        QInt8* ptr = (QInt8*)&L_A;
         ptr[m] = lhs(rows_32 + m, k);
-        ptr = (QInt8*) &L_B;
+        ptr = (QInt8*)&L_B;
         ptr[m] = lhs(rows_32 + m, k + 1);
-        ptr = (QInt8*) &L_C;
+        ptr = (QInt8*)&L_C;
         ptr[m] = lhs(rows_32 + m, k + 2);
-        ptr = (QInt8*) &L_D;
+        ptr = (QInt8*)&L_D;
         ptr[m] = lhs(rows_32 + m, k + 3);
-        ptr = (QInt8*) &L_E;
+        ptr = (QInt8*)&L_E;
         ptr[m] = lhs(rows_32 + m, k + 4);
-        ptr = (QInt8*) &L_F;
+        ptr = (QInt8*)&L_F;
         ptr[m] = lhs(rows_32 + m, k + 5);
-        ptr = (QInt8*) &L_G;
+        ptr = (QInt8*)&L_G;
         ptr[m] = lhs(rows_32 + m, k + 6);
-        ptr = (QInt8*) &L_H;
+        ptr = (QInt8*)&L_H;
         ptr[m] = lhs(rows_32 + m, k + 7);
       }
 
@@ -939,146 +974,146 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
       __m256i L_A, L_B, L_C, L_D, L_E, L_F, L_G, L_H;
       QInt8* ptr;
       switch (depth - depth_8) {
-      case 1:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          QInt8* ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-        }
-        break;
-      case 2:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-        }
-        break;
-      case 3:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-        }
-        break;
-      case 4:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-          ptr = (QInt8*) &L_D;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-        }
-        break;
-      case 5:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-          ptr = (QInt8*) &L_D;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-          ptr = (QInt8*) &L_E;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 4);
-        }
-        break;
-      case 6:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-          ptr = (QInt8*) &L_D;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-          ptr = (QInt8*) &L_E;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 4);
-          ptr = (QInt8*) &L_F;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 5);
-        }
-        break;
-      case 7:
-        L_A = _mm256_setzero_si256();
-        L_B = _mm256_setzero_si256();
-        L_C = _mm256_setzero_si256();
-        L_D = _mm256_setzero_si256();
-        L_E = _mm256_setzero_si256();
-        L_F = _mm256_setzero_si256();
-        L_G = _mm256_setzero_si256();
-        L_H = _mm256_setzero_si256();
-        for (Index m = 0; m < rows - rows_32; m++) {
-          ptr = (QInt8*) &L_A;
-          ptr[m] = lhs(rows_32 + m, depth_8);
-          ptr = (QInt8*) &L_B;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 1);
-          ptr = (QInt8*) &L_C;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 2);
-          ptr = (QInt8*) &L_D;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 3);
-          ptr = (QInt8*) &L_E;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 4);
-          ptr = (QInt8*) &L_F;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 5);
-          ptr = (QInt8*) &L_G;
-          ptr[m] = lhs(rows_32 + m, depth_8 + 6);
-        }
-        break;
+        case 1:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            QInt8* ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+          }
+          break;
+        case 2:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+          }
+          break;
+        case 3:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+          }
+          break;
+        case 4:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+            ptr = (QInt8*)&L_D;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
+          }
+          break;
+        case 5:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+            ptr = (QInt8*)&L_D;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
+            ptr = (QInt8*)&L_E;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 4);
+          }
+          break;
+        case 6:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+            ptr = (QInt8*)&L_D;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
+            ptr = (QInt8*)&L_E;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 4);
+            ptr = (QInt8*)&L_F;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 5);
+          }
+          break;
+        case 7:
+          L_A = _mm256_setzero_si256();
+          L_B = _mm256_setzero_si256();
+          L_C = _mm256_setzero_si256();
+          L_D = _mm256_setzero_si256();
+          L_E = _mm256_setzero_si256();
+          L_F = _mm256_setzero_si256();
+          L_G = _mm256_setzero_si256();
+          L_H = _mm256_setzero_si256();
+          for (Index m = 0; m < rows - rows_32; m++) {
+            ptr = (QInt8*)&L_A;
+            ptr[m] = lhs(rows_32 + m, depth_8);
+            ptr = (QInt8*)&L_B;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 1);
+            ptr = (QInt8*)&L_C;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 2);
+            ptr = (QInt8*)&L_D;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 3);
+            ptr = (QInt8*)&L_E;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 4);
+            ptr = (QInt8*)&L_F;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 5);
+            ptr = (QInt8*)&L_G;
+            ptr[m] = lhs(rows_32 + m, depth_8 + 6);
+          }
+          break;
       }
 
       // Interleave 8-bit elements
@@ -1124,12 +1159,17 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows, Index
   }
 }
 
-template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::
-operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+template <typename Index, typename DataMapper, int nr, bool Conjugate,
+          bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr,
+                                         ColMajor, Conjugate, PanelMode>::
+operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
+           Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QUInt8>::type Packet;
+
   // Get vector pointer
   __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
 
@@ -1158,52 +1198,52 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
   for (Index n = 0; n < cols_32; n += 32) {
     // Pack depth in sets of 32
     for (Index k = 0; k < depth_32; k += 32) {
-      __m256i R_A = rhs.loadPacket(k, n);
-      __m256i R_B = rhs.loadPacket(k, n + 1);
-      __m256i R_C = rhs.loadPacket(k, n + 2);
-      __m256i R_D = rhs.loadPacket(k, n + 3);
+      __m256i R_A = rhs.template loadPacket<Packet>(k, n);
+      __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
+      __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
+      __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 4);
-      R_B = rhs.loadPacket(k, n + 5);
-      R_C = rhs.loadPacket(k, n + 6);
-      R_D = rhs.loadPacket(k, n + 7);
+      R_A = rhs.template loadPacket<Packet>(k, n + 4);
+      R_B = rhs.template loadPacket<Packet>(k, n + 5);
+      R_C = rhs.template loadPacket<Packet>(k, n + 6);
+      R_D = rhs.template loadPacket<Packet>(k, n + 7);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 8);
-      R_B = rhs.loadPacket(k, n + 9);
-      R_C = rhs.loadPacket(k, n + 10);
-      R_D = rhs.loadPacket(k, n + 11);
+      R_A = rhs.template loadPacket<Packet>(k, n + 8);
+      R_B = rhs.template loadPacket<Packet>(k, n + 9);
+      R_C = rhs.template loadPacket<Packet>(k, n + 10);
+      R_D = rhs.template loadPacket<Packet>(k, n + 11);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 12);
-      R_B = rhs.loadPacket(k, n + 13);
-      R_C = rhs.loadPacket(k, n + 14);
-      R_D = rhs.loadPacket(k, n + 15);
+      R_A = rhs.template loadPacket<Packet>(k, n + 12);
+      R_B = rhs.template loadPacket<Packet>(k, n + 13);
+      R_C = rhs.template loadPacket<Packet>(k, n + 14);
+      R_D = rhs.template loadPacket<Packet>(k, n + 15);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 16);
-      R_B = rhs.loadPacket(k, n + 17);
-      R_C = rhs.loadPacket(k, n + 18);
-      R_D = rhs.loadPacket(k, n + 19);
+      R_A = rhs.template loadPacket<Packet>(k, n + 16);
+      R_B = rhs.template loadPacket<Packet>(k, n + 17);
+      R_C = rhs.template loadPacket<Packet>(k, n + 18);
+      R_D = rhs.template loadPacket<Packet>(k, n + 19);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 20);
-      R_B = rhs.loadPacket(k, n + 21);
-      R_C = rhs.loadPacket(k, n + 22);
-      R_D = rhs.loadPacket(k, n + 23);
+      R_A = rhs.template loadPacket<Packet>(k, n + 20);
+      R_B = rhs.template loadPacket<Packet>(k, n + 21);
+      R_C = rhs.template loadPacket<Packet>(k, n + 22);
+      R_D = rhs.template loadPacket<Packet>(k, n + 23);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 24);
-      R_B = rhs.loadPacket(k, n + 25);
-      R_C = rhs.loadPacket(k, n + 26);
-      R_D = rhs.loadPacket(k, n + 27);
+      R_A = rhs.template loadPacket<Packet>(k, n + 24);
+      R_B = rhs.template loadPacket<Packet>(k, n + 25);
+      R_C = rhs.template loadPacket<Packet>(k, n + 26);
+      R_D = rhs.template loadPacket<Packet>(k, n + 27);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 28);
-      R_B = rhs.loadPacket(k, n + 29);
-      R_C = rhs.loadPacket(k, n + 30);
-      R_D = rhs.loadPacket(k, n + 31);
+      R_A = rhs.template loadPacket<Packet>(k, n + 28);
+      R_B = rhs.template loadPacket<Packet>(k, n + 29);
+      R_C = rhs.template loadPacket<Packet>(k, n + 30);
+      R_D = rhs.template loadPacket<Packet>(k, n + 31);
       PACK_STEP;
 
       blockB_256 += 24;
@@ -1216,13 +1256,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       __m256i R_C = _mm256_setzero_si256();
       __m256i R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 1);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 2);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 3);
       }
       PACK_STEP;
@@ -1232,13 +1272,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 4);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 5);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 6);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 7);
       }
       PACK_STEP;
@@ -1248,13 +1288,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 8);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 9);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 10);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 11);
       }
       PACK_STEP;
@@ -1264,13 +1304,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 12);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 13);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 14);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 15);
       }
       PACK_STEP;
@@ -1280,13 +1320,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 16);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 17);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 18);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 19);
       }
       PACK_STEP;
@@ -1296,13 +1336,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 20);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 21);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 22);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 23);
       }
       PACK_STEP;
@@ -1312,13 +1352,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 24);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 25);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 26);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 27);
       }
       PACK_STEP;
@@ -1328,13 +1368,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       R_C = _mm256_setzero_si256();
       R_D = _mm256_setzero_si256();
       for (Index k = depth_32; k < depth; k++) {
-        ptr = (QUInt8*) &R_A;
+        ptr = (QUInt8*)&R_A;
         ptr[k - depth_32] = rhs(k, n + 28);
-        ptr = (QUInt8*) &R_B;
+        ptr = (QUInt8*)&R_B;
         ptr[k - depth_32] = rhs(k, n + 29);
-        ptr = (QUInt8*) &R_C;
+        ptr = (QUInt8*)&R_C;
         ptr[k - depth_32] = rhs(k, n + 30);
-        ptr = (QUInt8*) &R_D;
+        ptr = (QUInt8*)&R_D;
         ptr[k - depth_32] = rhs(k, n + 31);
       }
       PACK_STEP;
@@ -1350,34 +1390,34 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
       Index n;
       for (n = cols_32; n < cols; n += 4) {
         switch (cols - n) {
-        case 1:
-          R_A = rhs.loadPacket(k, n);
-          R_B = _mm256_setzero_si256();
-          R_C = _mm256_setzero_si256();
-          R_D = _mm256_setzero_si256();
-          PACK_STEP;
-          break;
-        case 2:
-          R_A = rhs.loadPacket(k, n);
-          R_B = rhs.loadPacket(k, n + 1);
-          R_C = _mm256_setzero_si256();
-          R_D = _mm256_setzero_si256();
-          PACK_STEP;
-          break;
-        case 3:
-          R_A = rhs.loadPacket(k, n);
-          R_B = rhs.loadPacket(k, n + 1);
-          R_C = rhs.loadPacket(k, n + 2);
-          R_D = _mm256_setzero_si256();
-          PACK_STEP;
-          break;
-        default:
-          R_A = rhs.loadPacket(k, n);
-          R_B = rhs.loadPacket(k, n + 1);
-          R_C = rhs.loadPacket(k, n + 2);
-          R_D = rhs.loadPacket(k, n + 3);
-          PACK_STEP;
-          break;
+          case 1:
+            R_A = rhs.template loadPacket<Packet>(k, n);
+            R_B = _mm256_setzero_si256();
+            R_C = _mm256_setzero_si256();
+            R_D = _mm256_setzero_si256();
+            PACK_STEP;
+            break;
+          case 2:
+            R_A = rhs.template loadPacket<Packet>(k, n);
+            R_B = rhs.template loadPacket<Packet>(k, n + 1);
+            R_C = _mm256_setzero_si256();
+            R_D = _mm256_setzero_si256();
+            PACK_STEP;
+            break;
+          case 3:
+            R_A = rhs.template loadPacket<Packet>(k, n);
+            R_B = rhs.template loadPacket<Packet>(k, n + 1);
+            R_C = rhs.template loadPacket<Packet>(k, n + 2);
+            R_D = _mm256_setzero_si256();
+            PACK_STEP;
+            break;
+          default:
+            R_A = rhs.template loadPacket<Packet>(k, n);
+            R_B = rhs.template loadPacket<Packet>(k, n + 1);
+            R_C = rhs.template loadPacket<Packet>(k, n + 2);
+            R_D = rhs.template loadPacket<Packet>(k, n + 3);
+            PACK_STEP;
+            break;
         }
       }
 
@@ -1394,46 +1434,46 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
         __m256i R_C = _mm256_setzero_si256();
         __m256i R_D = _mm256_setzero_si256();
         switch (cols - n) {
-        case 1:
-          for (Index k = depth_32; k < depth; k++) {
-            ptr = (QUInt8*) &R_A;
-            ptr[k - depth_32] = rhs(k, n);
-          }
-          PACK_STEP;
-          break;
-        case 2:
-          for (Index k = depth_32; k < depth; k++) {
-            ptr = (QUInt8*) &R_A;
-            ptr[k - depth_32] = rhs(k, n);
-            ptr = (QUInt8*) &R_B;
-            ptr[k - depth_32] = rhs(k, n + 1);
-          }
-          PACK_STEP;
-          break;
-        case 3:
-          for (Index k = depth_32; k < depth; k++) {
-            ptr = (QUInt8*) &R_A;
-            ptr[k - depth_32] = rhs(k, n);
-            ptr = (QUInt8*) &R_B;
-            ptr[k - depth_32] = rhs(k, n + 1);
-            ptr = (QUInt8*) &R_C;
-            ptr[k - depth_32] = rhs(k, n + 2);
-          }
-          PACK_STEP;
-          break;
-        default:
-          for (Index k = depth_32; k < depth; k++) {
-            ptr = (QUInt8*) &R_A;
-            ptr[k - depth_32] = rhs(k, n);
-            ptr = (QUInt8*) &R_B;
-            ptr[k - depth_32] = rhs(k, n + 1);
-            ptr = (QUInt8*) &R_C;
-            ptr[k - depth_32] = rhs(k, n + 2);
-            ptr = (QUInt8*) &R_D;
-            ptr[k - depth_32] = rhs(k, n + 3);
-          }
-          PACK_STEP;
-          break;
+          case 1:
+            for (Index k = depth_32; k < depth; k++) {
+              ptr = (QUInt8*)&R_A;
+              ptr[k - depth_32] = rhs(k, n);
+            }
+            PACK_STEP;
+            break;
+          case 2:
+            for (Index k = depth_32; k < depth; k++) {
+              ptr = (QUInt8*)&R_A;
+              ptr[k - depth_32] = rhs(k, n);
+              ptr = (QUInt8*)&R_B;
+              ptr[k - depth_32] = rhs(k, n + 1);
+            }
+            PACK_STEP;
+            break;
+          case 3:
+            for (Index k = depth_32; k < depth; k++) {
+              ptr = (QUInt8*)&R_A;
+              ptr[k - depth_32] = rhs(k, n);
+              ptr = (QUInt8*)&R_B;
+              ptr[k - depth_32] = rhs(k, n + 1);
+              ptr = (QUInt8*)&R_C;
+              ptr[k - depth_32] = rhs(k, n + 2);
+            }
+            PACK_STEP;
+            break;
+          default:
+            for (Index k = depth_32; k < depth; k++) {
+              ptr = (QUInt8*)&R_A;
+              ptr[k - depth_32] = rhs(k, n);
+              ptr = (QUInt8*)&R_B;
+              ptr[k - depth_32] = rhs(k, n + 1);
+              ptr = (QUInt8*)&R_C;
+              ptr[k - depth_32] = rhs(k, n + 2);
+              ptr = (QUInt8*)&R_D;
+              ptr[k - depth_32] = rhs(k, n + 3);
+            }
+            PACK_STEP;
+            break;
         }
       }
     }
@@ -1441,13 +1481,13 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols, Index
 #undef PACK_STEP
 }
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr,
+                                       ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   eigen_assert(alpha.value == 1);
@@ -1678,17 +1718,21 @@ void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Con
           LinearMapper r1 = res.getLinearMapper(m + 8, j);
           LinearMapper r2 = res.getLinearMapper(m + 16, j);
           LinearMapper r3 = res.getLinearMapper(m + 24, j);
-          r0.storePacket(
-              0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0)));
-          r1.storePacket(
-              0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0)));
-          r2.storePacket(
-              0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0)));
-          r3.storePacket(
-              0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0)));
+          typedef typename packet_traits<QInt32>::type Packet;
+          r0.template storePacket<Packet>(
+              0, _mm256_add_epi32(blockO_256[i++],
+                                  r0.template loadPacket<Packet>(0)));
+          r1.template storePacket<Packet>(
+              0, _mm256_add_epi32(blockO_256[i++],
+                                  r1.template loadPacket<Packet>(0)));
+          r2.template storePacket<Packet>(
+              0, _mm256_add_epi32(blockO_256[i++],
+                                  r2.template loadPacket<Packet>(0)));
+          r3.template storePacket<Packet>(
+              0, _mm256_add_epi32(blockO_256[i++],
+                                  r3.template loadPacket<Packet>(0)));
         }
-      }
-      else {
+      } else {
         for (Index j = n; j < cols; j++) {
           for (Index i = m; i < rows; i++) {
             res(i, j) = blockO[(j - n) * 32 + (i - m)];
@@ -1745,7 +1789,7 @@ void gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Con
 // madd both perform an adjacent addition in the kernel.
 template <typename Index, typename DataMapper, int Pack1, int Pack2,
           bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
+struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, QInt8, ColMajor,
                      Conjugate, PanelMode> {
   EIGEN_DONT_INLINE void operator()(QInt8* blockA, const DataMapper& lhs,
                                     Index depth, Index rows, Index stride = 0,
@@ -1755,15 +1799,18 @@ struct gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
 template <typename Index, typename DataMapper, int Pack1, int Pack2,
           bool Conjugate, bool PanelMode>
 EIGEN_DONT_INLINE void gemm_pack_lhs<QInt8, Index, DataMapper, Pack1, Pack2,
-                                     ColMajor, Conjugate, PanelMode>::
+                                     QInt8, ColMajor, Conjugate, PanelMode>::
 operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
            Index stride, Index offset) {
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QInt8>::type Packet;
+
   // Use alternate function for weird sizes
   if (rows % 32 != 0 || depth % 32 != 0) {
-    gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> lhs_pack;
+    gemm_pack_lhs_any<QInt8, Index, DataMapper, Pack1, Pack2, ColMajor,
+                      Conjugate, PanelMode> lhs_pack;
     return lhs_pack(blockA, lhs, depth, rows, stride, offset);
   }
 
@@ -1775,15 +1822,15 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
     // Pack depth in sets of 8
     for (Index k = 0; k < depth; k += 8) {
       // Load vectors
-      __m256i L_A = lhs.loadPacket(m, k);
-      __m256i L_B = lhs.loadPacket(m, k + 1);
+      __m256i L_A = lhs.template loadPacket<Packet>(m, k);
+      __m256i L_B = lhs.template loadPacket<Packet>(m, k + 1);
 
       // Interleave 8-bit elements
       __m256i L_AB0_AB16 = _mm256_unpacklo_epi8(L_A, L_B);
       __m256i L_AB8_AB24 = _mm256_unpackhi_epi8(L_A, L_B);
 
-      __m256i L_C = lhs.loadPacket(m, k + 2);
-      __m256i L_D = lhs.loadPacket(m, k + 3);
+      __m256i L_C = lhs.template loadPacket<Packet>(m, k + 2);
+      __m256i L_D = lhs.template loadPacket<Packet>(m, k + 3);
       __m256i L_CD0_CD16 = _mm256_unpacklo_epi8(L_C, L_D);
       __m256i L_CD8_CD24 = _mm256_unpackhi_epi8(L_C, L_D);
 
@@ -1804,12 +1851,12 @@ operator()(QInt8* blockA, const DataMapper& lhs, Index depth, Index rows,
       _mm256_store_si256(blockA_256++, L_AD16);
       __m256i L_AD24 = _mm256_permute2x128_si256(L_AD8_AD24, L_AD12_AD28, 0x31);
       _mm256_store_si256(blockA_256++, L_AD24);
-      __m256i L_E = lhs.loadPacket(m, k + 4);
-      __m256i L_F = lhs.loadPacket(m, k + 5);
+      __m256i L_E = lhs.template loadPacket<Packet>(m, k + 4);
+      __m256i L_F = lhs.template loadPacket<Packet>(m, k + 5);
       __m256i L_EF0_EF16 = _mm256_unpacklo_epi8(L_E, L_F);
       __m256i L_EF8_EF24 = _mm256_unpackhi_epi8(L_E, L_F);
-      __m256i L_G = lhs.loadPacket(m, k + 6);
-      __m256i L_H = lhs.loadPacket(m, k + 7);
+      __m256i L_G = lhs.template loadPacket<Packet>(m, k + 6);
+      __m256i L_H = lhs.template loadPacket<Packet>(m, k + 7);
       __m256i L_GH0_GH16 = _mm256_unpacklo_epi8(L_G, L_H);
       __m256i L_GH8_GH24 = _mm256_unpackhi_epi8(L_G, L_H);
       __m256i L_EH0_EH16 = _mm256_unpacklo_epi16(L_EF0_EF16, L_GH0_GH16);
@@ -1868,9 +1915,12 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
   eigen_assert(stride == 0);
   eigen_assert(offset == 0);
 
+  typedef typename packet_traits<QUInt8>::type Packet;
+
   // Use alternate function for weird sizes
   if (cols % 32 != 0 || depth % 32 != 0) {
-    gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> rhs_pack;
+    gemm_pack_rhs_any<QUInt8, Index, DataMapper, nr, ColMajor, Conjugate,
+                      PanelMode> rhs_pack;
     return rhs_pack(blockB, rhs, depth, cols, stride, offset);
   }
 
@@ -1898,52 +1948,52 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
   for (Index n = 0; n < cols; n += 32) {
     // Pack depth in sets of 32
     for (Index k = 0; k < depth; k += 32) {
-      __m256i R_A = rhs.loadPacket(k, n);
-      __m256i R_B = rhs.loadPacket(k, n + 1);
-      __m256i R_C = rhs.loadPacket(k, n + 2);
-      __m256i R_D = rhs.loadPacket(k, n + 3);
+      __m256i R_A = rhs.template loadPacket<Packet>(k, n);
+      __m256i R_B = rhs.template loadPacket<Packet>(k, n + 1);
+      __m256i R_C = rhs.template loadPacket<Packet>(k, n + 2);
+      __m256i R_D = rhs.template loadPacket<Packet>(k, n + 3);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 4);
-      R_B = rhs.loadPacket(k, n + 5);
-      R_C = rhs.loadPacket(k, n + 6);
-      R_D = rhs.loadPacket(k, n + 7);
+      R_A = rhs.template loadPacket<Packet>(k, n + 4);
+      R_B = rhs.template loadPacket<Packet>(k, n + 5);
+      R_C = rhs.template loadPacket<Packet>(k, n + 6);
+      R_D = rhs.template loadPacket<Packet>(k, n + 7);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 8);
-      R_B = rhs.loadPacket(k, n + 9);
-      R_C = rhs.loadPacket(k, n + 10);
-      R_D = rhs.loadPacket(k, n + 11);
+      R_A = rhs.template loadPacket<Packet>(k, n + 8);
+      R_B = rhs.template loadPacket<Packet>(k, n + 9);
+      R_C = rhs.template loadPacket<Packet>(k, n + 10);
+      R_D = rhs.template loadPacket<Packet>(k, n + 11);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 12);
-      R_B = rhs.loadPacket(k, n + 13);
-      R_C = rhs.loadPacket(k, n + 14);
-      R_D = rhs.loadPacket(k, n + 15);
+      R_A = rhs.template loadPacket<Packet>(k, n + 12);
+      R_B = rhs.template loadPacket<Packet>(k, n + 13);
+      R_C = rhs.template loadPacket<Packet>(k, n + 14);
+      R_D = rhs.template loadPacket<Packet>(k, n + 15);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 16);
-      R_B = rhs.loadPacket(k, n + 17);
-      R_C = rhs.loadPacket(k, n + 18);
-      R_D = rhs.loadPacket(k, n + 19);
+      R_A = rhs.template loadPacket<Packet>(k, n + 16);
+      R_B = rhs.template loadPacket<Packet>(k, n + 17);
+      R_C = rhs.template loadPacket<Packet>(k, n + 18);
+      R_D = rhs.template loadPacket<Packet>(k, n + 19);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 20);
-      R_B = rhs.loadPacket(k, n + 21);
-      R_C = rhs.loadPacket(k, n + 22);
-      R_D = rhs.loadPacket(k, n + 23);
+      R_A = rhs.template loadPacket<Packet>(k, n + 20);
+      R_B = rhs.template loadPacket<Packet>(k, n + 21);
+      R_C = rhs.template loadPacket<Packet>(k, n + 22);
+      R_D = rhs.template loadPacket<Packet>(k, n + 23);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 24);
-      R_B = rhs.loadPacket(k, n + 25);
-      R_C = rhs.loadPacket(k, n + 26);
-      R_D = rhs.loadPacket(k, n + 27);
+      R_A = rhs.template loadPacket<Packet>(k, n + 24);
+      R_B = rhs.template loadPacket<Packet>(k, n + 25);
+      R_C = rhs.template loadPacket<Packet>(k, n + 26);
+      R_D = rhs.template loadPacket<Packet>(k, n + 27);
       PACK_STEP;
 
-      R_A = rhs.loadPacket(k, n + 28);
-      R_B = rhs.loadPacket(k, n + 29);
-      R_C = rhs.loadPacket(k, n + 30);
-      R_D = rhs.loadPacket(k, n + 31);
+      R_A = rhs.template loadPacket<Packet>(k, n + 28);
+      R_B = rhs.template loadPacket<Packet>(k, n + 29);
+      R_C = rhs.template loadPacket<Packet>(k, n + 30);
+      R_D = rhs.template loadPacket<Packet>(k, n + 31);
       PACK_STEP;
 
       blockB_256 += 24;
@@ -1953,24 +2003,26 @@ operator()(QUInt8* blockB, const DataMapper& rhs, Index depth, Index cols,
 }
 
 // Perform the actual multiplication on packed inputs
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   typedef typename DataMapper::LinearMapper LinearMapper;
 
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QUInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   eigen_assert(alpha.value == 1);
@@ -1986,8 +2038,10 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
 
   // Use alternate function for weird sizes
   if (rows % 32 != 0 || cols % 32 != 0 || depth % 32 != 0) {
-    gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> gebp;
-    return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+    gebp_kernel_any<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                    ConjugateRhs> gebp;
+    return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB,
+                offsetA, offsetB);
   }
 
   // Create result block
@@ -2205,14 +2259,19 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
         LinearMapper r1 = res.getLinearMapper(m + 8, j);
         LinearMapper r2 = res.getLinearMapper(m + 16, j);
         LinearMapper r3 = res.getLinearMapper(m + 24, j);
-        r0.storePacket(
-            0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0)));
-        r1.storePacket(
-            0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0)));
-        r2.storePacket(
-            0, _mm256_add_epi32(blockO_256[i++], r2.loadPacket(0)));
-        r3.storePacket(
-            0, _mm256_add_epi32(blockO_256[i++], r3.loadPacket(0)));
+        typedef typename packet_traits<QInt32>::type Packet;
+        r0.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r0.template loadPacket<Packet>(0)));
+        r1.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r1.template loadPacket<Packet>(0)));
+        r2.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r2.template loadPacket<Packet>(0)));
+        r3.template storePacket<Packet>(
+            0, _mm256_add_epi32(blockO_256[i++],
+                                r3.template loadPacket<Packet>(0)));
       }
 
       // Zero the result block so it can be reused
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
index 9cd31570231173337ef0a7049171055bca897be4..9e0efae6c9b3516bbc130be44d87d18e62038237 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
@@ -14,15 +14,14 @@
 namespace Eigen {
 namespace internal {
 
-
-// AVX2 optimized implementation of the case where the lhs is encoded using signed 8bit
+// AVX2 optimized implementation of the case where the lhs is encoded using
+// signed 8bit
 // integers and the rhs using unsigned 8bit integers.
 #ifdef EIGEN_USE_OPTIMIZED_INT8_UINT8_MAT_MAT_PRODUCT
 
-template<bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs>
-{
-public:
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt8, QUInt8, _ConjLhs, _ConjRhs> {
+ public:
   typedef QInt8 LhsScalar;
   typedef QUInt8 RhsScalar;
   typedef QInt32 ResScalar;
@@ -40,22 +39,24 @@ public:
 };
 
 // Mat-Mat product of a signed 8bit lhs with an unsigned 8bit rhs
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
   EIGEN_DONT_INLINE
-  void operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-                  Index rows, Index depth, Index cols, QInt32 alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+  void operator()(const DataMapper& res, const QInt8* blockA,
+                  const QUInt8* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-::operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
-             Index rows, Index depth, Index cols, QInt32 alpha,
-             Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt8* blockA, const QUInt8* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
   EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -85,7 +86,6 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
 }
 #endif
 
-
 }  // namespace internal
 }  // namespace Eigen
 
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
index ad11d3d44b813830c87f2634a9234adfeac80329..f15200caba5d14e08c0bc3cc51f3f8bcc7f5debe 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
@@ -15,25 +15,23 @@ namespace internal {
 
 // Mat-Vec product
 // Both lhs and rhs are encoded as 8bit signed integers
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>
-{
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-  QInt32* res, Index resIncr,
-  QInt8 alpha);
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, QInt8, LhsMapper, ColMajor,
+                                     ConjugateLhs, QInt8, RhsMapper,
+                                     ConjugateRhs, Version> {
+  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
+                                    const LhsMapper& lhs, const RhsMapper& rhs,
+                                    QInt32* res, Index resIncr, QInt8 alpha);
 };
 
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run(
-    Index rows, Index cols,
-    const LhsMapper& lhs,
-    const RhsMapper& rhs,
-    QInt32* res, Index resIncr,
-    QInt8 alpha)
-{
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<
+    Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper,
+    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
+                                const RhsMapper& rhs, QInt32* res,
+                                Index resIncr, QInt8 alpha) {
   eigen_assert(alpha.value == 1);
   eigen_assert(resIncr == 1);
   eigen_assert(rows > 0);
@@ -78,26 +76,25 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<
 }
 
 // Mat-Vec product
-// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned integers
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version>
-{
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-  QInt32* res, Index resIncr,
-  QUInt8 alpha);
+// The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned
+// integers
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, QInt8, LhsMapper, ColMajor,
+                                     ConjugateLhs, QUInt8, RhsMapper,
+                                     ConjugateRhs, Version> {
+  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
+                                    const LhsMapper& lhs, const RhsMapper& rhs,
+                                    QInt32* res, Index resIncr, QUInt8 alpha);
 };
 
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMajor,ConjugateLhs,QUInt8,RhsMapper,ConjugateRhs,Version>::run(
-    Index rows, Index cols,
-    const LhsMapper& lhs,
-    const RhsMapper& rhs,
-    QInt32* res, Index resIncr,
-    QUInt8 alpha)
-{
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<
+    Index, QInt8, LhsMapper, ColMajor, ConjugateLhs, QUInt8, RhsMapper,
+    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
+                                const RhsMapper& rhs, QInt32* res,
+                                Index resIncr, QUInt8 alpha) {
   eigen_assert(alpha.value == 1);
   eigen_assert(resIncr == 1);
   eigen_assert(rows > 0);
@@ -110,28 +107,26 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMa
   }
 }
 
-
 // Mat-Vec product
-// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed integers
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>
-{
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsMapper& lhs,
-  const RhsMapper& rhs,
-  QInt32* res, Index resIncr,
-  QInt8 alpha);
+// The lhs is encoded using bit unsigned integers, the rhs using 8bit signed
+// integers
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, QUInt8, LhsMapper, ColMajor,
+                                     ConjugateLhs, QInt8, RhsMapper,
+                                     ConjugateRhs, Version> {
+  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
+                                    const LhsMapper& lhs, const RhsMapper& rhs,
+                                    QInt32* res, Index resIncr, QInt8 alpha);
 };
 
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QUInt8,LhsMapper,ColMajor,ConjugateLhs,QInt8,RhsMapper,ConjugateRhs,Version>::run(
-    Index rows, Index cols,
-    const LhsMapper& lhs,
-    const RhsMapper& rhs,
-    QInt32* res, Index resIncr,
-    QInt8 alpha)
-{
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<
+    Index, QUInt8, LhsMapper, ColMajor, ConjugateLhs, QInt8, RhsMapper,
+    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
+                                const RhsMapper& rhs, QInt32* res,
+                                Index resIncr, QInt8 alpha) {
   eigen_assert(alpha.value == 1);
   eigen_assert(resIncr == 1);
   eigen_assert(rows > 0);
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index 3abd4ee49c2a6596ff9545faddedf926b4da857f..223ea4d58bf4c40b2790e2f5d73e2a4fc1a79eec 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -8,24 +8,20 @@
 
 #endif
 
-inline int _mm256_extract_epi16_N0(const __m256i X)
-{
-	return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8);
+inline int _mm256_extract_epi16_N0(const __m256i X) {
+  return _mm_extract_epi16(_mm256_extractf128_si256(X, 0 >> 3), 0 % 8);
 }
 
-inline int _mm256_extract_epi16_N1(const __m256i X)
-{
-	return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8);
+inline int _mm256_extract_epi16_N1(const __m256i X) {
+  return _mm_extract_epi16(_mm256_extractf128_si256(X, 1 >> 3), 1 % 8);
 }
 
-inline int _mm256_extract_epi8_N0(const __m256i X)
-{
-	return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16);
+inline int _mm256_extract_epi8_N0(const __m256i X) {
+  return _mm_extract_epi8(_mm256_extractf128_si256((X), 0 >> 4), 0 % 16);
 }
 
-inline int _mm256_extract_epi8_N1(const __m256i X)
-{
-	return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16);
+inline int _mm256_extract_epi8_N1(const __m256i X) {
+  return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16);
 }
 
 namespace Eigen {
@@ -34,56 +30,56 @@ namespace internal {
 typedef struct Packet32q8i {
   __m256i val;
   operator __m256i() const { return val; }
-  Packet32q8i();
+  Packet32q8i() : val(_mm256_setzero_si256()){};
   Packet32q8i(__m256i val) : val(val) {}
 } Packet32q8i;
 
 typedef struct Packet16q16i {
   __m256i val;
   operator __m256i() const { return val; }
-  Packet16q16i();
+  Packet16q16i() : val(_mm256_setzero_si256()){};
   Packet16q16i(__m256i val) : val(val) {}
 } Packet16q16i;
 
 typedef struct Packet32q8u {
   __m256i val;
   operator __m256i() const { return val; }
-  Packet32q8u();
+  Packet32q8u() : val(_mm256_setzero_si256()){};
   Packet32q8u(__m256i val) : val(val) {}
 } Packet32q8u;
 
 typedef struct Packet16q8i {
   __m128i val;
   operator __m128i() const { return val; }
-  Packet16q8i();
+  Packet16q8i() : val(_mm_setzero_si128()) {}
   Packet16q8i(__m128i val) : val(val) {}
 } Packet16q8i;
 
 typedef struct Packet16q8u {
   __m128i val;
   operator __m128i() const { return val; }
-  Packet16q8u();
+  Packet16q8u() : val(_mm_setzero_si128()) {}
   Packet16q8u(__m128i val) : val(val) {}
 } Packet16q8u;
 
 typedef struct Packet8q16i {
   __m128i val;
   operator __m128i() const { return val; }
-  Packet8q16i();
+  Packet8q16i() : val(_mm_setzero_si128()) {}
   Packet8q16i(__m128i val) : val(val) {}
 } Packet8q16i;
 
 typedef struct Packet8q32i {
   __m256i val;
   operator __m256i() const { return val; }
-  Packet8q32i();
+  Packet8q32i() : val(_mm256_setzero_si256()){};
   Packet8q32i(__m256i val) : val(val) {}
 } Packet8q32i;
 
 typedef struct Packet4q32i {
   __m128i val;
   operator __m128i() const { return val; }
-  Packet4q32i();
+  Packet4q32i() : val(_mm_setzero_si128()) {}
   Packet4q32i(__m128i val) : val(val) {}
 } Packet4q32i;
 
@@ -182,25 +178,25 @@ template <>
 struct unpacket_traits<Packet32q8i> {
   typedef QInt8 type;
   typedef Packet16q8i half;
-  enum { size = 32, alignment=Aligned32 };
+  enum { size = 32, alignment = Aligned32 };
 };
 template <>
 struct unpacket_traits<Packet16q16i> {
   typedef QInt16 type;
   typedef Packet8q16i half;
-  enum { size = 16, alignment=Aligned32 };
+  enum { size = 16, alignment = Aligned32 };
 };
 template <>
 struct unpacket_traits<Packet32q8u> {
   typedef QUInt8 type;
   typedef Packet16q8u half;
-  enum { size = 32, alignment=Aligned32 };
+  enum { size = 32, alignment = Aligned32 };
 };
 template <>
 struct unpacket_traits<Packet8q32i> {
   typedef QInt32 type;
   typedef Packet4q32i half;
-  enum { size = 8, alignment=Aligned32 };
+  enum { size = 8, alignment = Aligned32 };
 };
 
 // Unaligned load
@@ -455,40 +451,47 @@ EIGEN_STRONG_INLINE QUInt8 predux_max<Packet32q8u>(const Packet32q8u& a) {
 template <>
 EIGEN_STRONG_INLINE QInt8 predux_min<Packet32q8i>(const Packet32q8i& a) {
   __m256i tmp = _mm256_min_epi8(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp =
+      _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   tmp = _mm256_min_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
-  tmp = _mm256_min_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp = _mm256_min_epi8(tmp,
+                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   return std::min(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
 }
 template <>
 EIGEN_STRONG_INLINE QInt8 predux_max<Packet32q8i>(const Packet32q8i& a) {
   __m256i tmp = _mm256_max_epi8(a, _mm256_permute2f128_si256(a, a, 1));
-  tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp =
+      _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   tmp = _mm256_max_epi8(tmp, _mm256_shuffle_epi32(tmp, 1));
-  tmp = _mm256_max_epi8(tmp, _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+  tmp = _mm256_max_epi8(tmp,
+                        _mm256_shufflelo_epi16(tmp, _MM_SHUFFLE(1, 0, 3, 2)));
   return std::max(_mm256_extract_epi8_N0(tmp), _mm256_extract_epi8_N1(tmp));
 }
 
 // Vectorized scaling of Packet32q8i by float.
-template<>
+template <>
 struct scalar_product_op<QInt32, double> : binary_op_base<QInt32, double> {
   typedef typename ScalarBinaryOpTraits<QInt32, double>::ReturnType result_type;
 #ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
   EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
 #else
-  scalar_product_op() {
-    EIGEN_SCALAR_BINARY_OP_PLUGIN
-  }
+  scalar_product_op() { EIGEN_SCALAR_BINARY_OP_PLUGIN }
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const QInt32& a, const double& b) const { return a * b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
+  operator()(const QInt32& a, const double& b) const {
+    return a * b;
+  }
 
-  EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a, const double& b) const {
+  EIGEN_STRONG_INLINE const Packet8q32i packetOp(const Packet8q32i& a,
+                                                 const double& b) const {
     __m256d scale = _mm256_set1_pd(b);
     __m256d a_lo = _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
     __m128i result_lo = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_lo));
     __m256d a_hi = _mm256_cvtepi32_pd(_mm256_extracti128_si256(a, 1));
     __m128i result_hi = _mm256_cvtpd_epi32(_mm256_mul_pd(scale, a_hi));
-    return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
+    return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi,
+                                   1);
   }
 };
 
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
index 2092ce1d4c92754ce52b78f6a6e5fe814d4b7aaa..84750c1945a6125bf6be92647106de535c90a21f 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
@@ -127,25 +127,25 @@ template <>
 struct unpacket_traits<Packet64q8i> {
   typedef QInt8 type;
   typedef Packet32q8i half;
-  enum { size = 64, alignment=Aligned64 };
+  enum { size = 64, alignment = Aligned64 };
 };
 template <>
 struct unpacket_traits<Packet32q16i> {
   typedef QInt16 type;
   typedef Packet16q16i half;
-  enum { size = 32, alignment=Aligned64 };
+  enum { size = 32, alignment = Aligned64 };
 };
 template <>
 struct unpacket_traits<Packet64q8u> {
   typedef QUInt8 type;
   typedef Packet32q8u half;
-  enum { size = 64, alignment=Aligned64 };
+  enum { size = 64, alignment = Aligned64 };
 };
 template <>
 struct unpacket_traits<Packet16q32i> {
   typedef QInt32 type;
   typedef Packet8q32i half;
-  enum { size = 16, alignment=Aligned64 };
+  enum { size = 16, alignment = Aligned64 };
 };
 
 // Unaligned load
@@ -244,7 +244,7 @@ EIGEN_STRONG_INLINE QInt32 pfirst<Packet16q32i>(const Packet16q32i& a) {
 template <>
 EIGEN_STRONG_INLINE QUInt8 pfirst<Packet64q8u>(const Packet64q8u& a) {
   return static_cast<uint8_t>(
-           _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0));
+      _mm_extract_epi8(_mm512_extracti32x4_epi32(a.val, 0), 0));
 }
 template <>
 EIGEN_STRONG_INLINE QInt8 pfirst<Packet64q8i>(const Packet64q8i& a) {
@@ -410,9 +410,7 @@ EIGEN_STRONG_INLINE QInt32 predux_min<Packet16q32i>(const Packet16q32i& a) {
       _mm_min_epi32(_mm_min_epi32(lane0, lane1), _mm_min_epi32(lane2, lane3));
   res = _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
   return pfirst(
-           _mm_min_epi32(
-             res,
-             _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+      _mm_min_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
 }
 template <>
 EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
@@ -424,9 +422,7 @@ EIGEN_STRONG_INLINE QInt32 predux_max<Packet16q32i>(const Packet16q32i& a) {
       _mm_max_epi32(_mm_max_epi32(lane0, lane1), _mm_max_epi32(lane2, lane3));
   res = _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
   return pfirst(
-           _mm_max_epi32(
-             res,
-             _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+      _mm_max_epi32(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
 }
 template <>
 EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
@@ -437,13 +433,10 @@ EIGEN_STRONG_INLINE QInt16 predux_min<Packet32q16i>(const Packet32q16i& a) {
   Packet4i res =
       _mm_min_epi16(_mm_min_epi16(lane0, lane1), _mm_min_epi16(lane2, lane3));
   res = _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::min({
-           static_cast<std::int16_t>(w >> 16),
-           static_cast<std::int16_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_min_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::min(
+      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
@@ -454,13 +447,10 @@ EIGEN_STRONG_INLINE QInt16 predux_max<Packet32q16i>(const Packet32q16i& a) {
   Packet4i res =
       _mm_max_epi16(_mm_max_epi16(lane0, lane1), _mm_max_epi16(lane2, lane3));
   res = _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::max({
-           static_cast<std::int16_t>(w >> 16),
-           static_cast<std::int16_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_max_epi16(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::max(
+      {static_cast<std::int16_t>(w >> 16), static_cast<std::int16_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
@@ -471,15 +461,11 @@ EIGEN_STRONG_INLINE QUInt8 predux_min<Packet64q8u>(const Packet64q8u& a) {
   Packet4i res =
       _mm_min_epu8(_mm_min_epu8(lane0, lane1), _mm_min_epu8(lane2, lane3));
   res = _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::min({
-           static_cast<std::uint8_t>(w >> 24),
-           static_cast<std::uint8_t>(w >> 16),
-           static_cast<std::uint8_t>(w >> 8),
-           static_cast<std::uint8_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_min_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::min(
+      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
+       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
@@ -490,15 +476,11 @@ EIGEN_STRONG_INLINE QUInt8 predux_max<Packet64q8u>(const Packet64q8u& a) {
   Packet4i res =
       _mm_max_epu8(_mm_max_epu8(lane0, lane1), _mm_max_epu8(lane2, lane3));
   res = _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::max({
-           static_cast<std::uint8_t>(w >> 24),
-           static_cast<std::uint8_t>(w >> 16),
-           static_cast<std::uint8_t>(w >> 8),
-           static_cast<std::uint8_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_max_epu8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::max(
+      {static_cast<std::uint8_t>(w >> 24), static_cast<std::uint8_t>(w >> 16),
+       static_cast<std::uint8_t>(w >> 8), static_cast<std::uint8_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
@@ -509,15 +491,11 @@ EIGEN_STRONG_INLINE QInt8 predux_min<Packet64q8i>(const Packet64q8i& a) {
   Packet4i res =
       _mm_min_epi8(_mm_min_epi8(lane0, lane1), _mm_min_epi8(lane2, lane3));
   res = _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::min({
-           static_cast<std::int8_t>(w >> 24),
-           static_cast<std::int8_t>(w >> 16),
-           static_cast<std::int8_t>(w >> 8),
-           static_cast<std::int8_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_min_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::min(
+      {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
+       static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
 }
 template <>
 EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
@@ -528,15 +506,11 @@ EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
   Packet4i res =
       _mm_max_epi8(_mm_max_epi8(lane0, lane1), _mm_max_epi8(lane2, lane3));
   res = _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 3, 2)));
-  std::uint32_t w =
-      pfirst(
-        _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
-  return std::min({
-           static_cast<std::int8_t>(w >> 24),
-           static_cast<std::int8_t>(w >> 16),
-           static_cast<std::int8_t>(w >> 8),
-           static_cast<std::int8_t>(w)
-         });
+  std::uint32_t w = pfirst(
+      _mm_max_epi8(res, _mm_shuffle_epi32(res, _MM_SHUFFLE(0, 0, 0, 1))));
+  return std::min(
+      {static_cast<std::int8_t>(w >> 24), static_cast<std::int8_t>(w >> 16),
+       static_cast<std::int8_t>(w >> 8), static_cast<std::int8_t>(w)});
 }
 
 }  // end namespace internal
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
index a09eac67070477ad4b7ad7fd041800d1d815cac3..d3b02402971145f0bee4eec0f02dae24431a1da5 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
@@ -33,28 +33,23 @@ struct type_casting_traits<float, QInt16> {
 };
 
 template <>
-EIGEN_STRONG_INLINE Packet32q16i
-pcast<Packet16f>(const Packet16f& a, const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16f>(const Packet16f& a,
+                                                  const Packet16f& b) {
   Packet16i a_int = _mm512_cvtps_epi32(a);
   Packet16i b_int = _mm512_cvtps_epi32(b);
 #ifdef EIGEN_VECTORIZE_AVX512BW
   return _mm512_packs_epi32(a_int, b_int);
 #else
-  Packet8i ab_int16_low =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_castsi512_si256(a_int),
-          _mm512_castsi512_si256(b_int)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i ab_int16_high =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_extracti32x8_epi32(a_int, 1),
-          _mm512_extracti32x8_epi32(b_int, 1)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  return _mm512_inserti32x8(
-           _mm512_castsi256_si512(ab_int16_low),
-           ab_int16_high, 1);
+  Packet8i ab_int16_low = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
+                         _mm512_castsi512_si256(b_int)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i ab_int16_high = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
+                         _mm512_extracti32x8_epi32(b_int, 1)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  return _mm512_inserti32x8(_mm512_castsi256_si512(ab_int16_low), ab_int16_high,
+                            1);
 #endif
 }
 
@@ -64,55 +59,41 @@ struct type_casting_traits<float, QInt8> {
 };
 
 template <>
-EIGEN_STRONG_INLINE Packet64q8i
-pcast<Packet16f>(const Packet16f& a,
-                 const Packet16f& b,
-                 const Packet16f& c,
-                 const Packet16f& d) {
+EIGEN_STRONG_INLINE Packet64q8i pcast<Packet16f>(const Packet16f& a,
+                                                 const Packet16f& b,
+                                                 const Packet16f& c,
+                                                 const Packet16f& d) {
   Packet16i a_int = _mm512_cvtps_epi32(a);
   Packet16i b_int = _mm512_cvtps_epi32(b);
   Packet16i c_int = _mm512_cvtps_epi32(c);
   Packet16i d_int = _mm512_cvtps_epi32(d);
 #ifdef EIGEN_VECTORIZE_AVX512BW
-  return _mm512_packs_epi16(
-           _mm512_packs_epi32(a_int, b_int),
-           _mm512_packs_epi32(c_int, d_int));
+  return _mm512_packs_epi16(_mm512_packs_epi32(a_int, b_int),
+                            _mm512_packs_epi32(c_int, d_int));
 #else
-  Packet8i ab_int16_low =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_castsi512_si256(a_int),
-          _mm512_castsi512_si256(b_int)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i cd_int16_low =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_castsi512_si256(c_int),
-          _mm512_castsi512_si256(d_int)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i ab_int16_high =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_extracti32x8_epi32(a_int, 1),
-          _mm512_extracti32x8_epi32(b_int, 1)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i cd_int16_high =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi32(
-          _mm512_extracti32x8_epi32(c_int, 1),
-          _mm512_extracti32x8_epi32(d_int, 1)),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  Packet8i abcd_int8_low =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi16(ab_int16_low, cd_int16_low),
-        _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i ab_int16_low = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_castsi512_si256(a_int),
+                         _mm512_castsi512_si256(b_int)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i cd_int16_low = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_castsi512_si256(c_int),
+                         _mm512_castsi512_si256(d_int)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i ab_int16_high = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_extracti32x8_epi32(a_int, 1),
+                         _mm512_extracti32x8_epi32(b_int, 1)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i cd_int16_high = _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm512_extracti32x8_epi32(c_int, 1),
+                         _mm512_extracti32x8_epi32(d_int, 1)),
+      _MM_SHUFFLE(0, 2, 1, 3));
+  Packet8i abcd_int8_low = _mm256_permute4x64_epi64(
+      _mm256_packs_epi16(ab_int16_low, cd_int16_low), _MM_SHUFFLE(0, 2, 1, 3));
   Packet8i abcd_int8_high =
-      _mm256_permute4x64_epi64(
-        _mm256_packs_epi16(ab_int16_high, cd_int16_high),
-        _MM_SHUFFLE(0, 2, 1, 3));
-  return _mm512_inserti32x8(
-           _mm512_castsi256_si512(abcd_int8_low),
-           abcd_int8_high, 1);
+      _mm256_permute4x64_epi64(_mm256_packs_epi16(ab_int16_high, cd_int16_high),
+                               _MM_SHUFFLE(0, 2, 1, 3));
+  return _mm512_inserti32x8(_mm512_castsi256_si512(abcd_int8_low),
+                            abcd_int8_high, 1);
 #endif
 }
 
@@ -128,10 +109,8 @@ struct type_casting_traits<QInt32, QInt16> {
 
 template <>
 EIGEN_STRONG_INLINE Packet64q8i
-pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a,
-                                 const Packet16q32i& b,
-                                 const Packet16q32i& c,
-                                 const Packet16q32i& d) {
+pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a, const Packet16q32i& b,
+                                 const Packet16q32i& c, const Packet16q32i& d) {
   __m128i a_part = _mm512_cvtsepi32_epi8(a);
   __m128i b_part = _mm512_cvtsepi32_epi8(b);
   __m128i c_part = _mm512_cvtsepi32_epi8(c);
@@ -145,9 +124,8 @@ pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a,
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet32q16i
-pcast<Packet16q32i, Packet32q16i>(const Packet16q32i& a,
-                                  const Packet16q32i& b) {
+EIGEN_STRONG_INLINE Packet32q16i pcast<Packet16q32i, Packet32q16i>(
+    const Packet16q32i& a, const Packet16q32i& b) {
   __m256i a_part = _mm512_cvtsepi32_epi16(a);
   __m256i b_part = _mm512_cvtsepi32_epi16(b);
   __m512i converted =
diff --git a/third_party/googleapis.BUILD b/third_party/googleapis.BUILD
index 95e999af1886576317aa59d133e8d5c88ba368d3..b8871eda7280becb7c3f53412120600d52c0fb54 100644
--- a/third_party/googleapis.BUILD
+++ b/third_party/googleapis.BUILD
@@ -13,7 +13,9 @@
 # limitations under the License.
 
 package(default_visibility = ["//visibility:public"])
+
 licenses(["notice"])  # Apache 2.0
+
 exports_files(["LICENSE"])
 
 load("@protobuf_archive//:protobuf.bzl", "cc_proto_library")
@@ -21,6 +23,9 @@ load("@protobuf_archive//:protobuf.bzl", "cc_proto_library")
 cc_proto_library(
     name = "bigtable_protos",
     srcs = [
+        "google/api/annotations.proto",
+        "google/api/auth.proto",
+        "google/api/http.proto",
         "google/bigtable/admin/v2/bigtable_instance_admin.proto",
         "google/bigtable/admin/v2/bigtable_table_admin.proto",
         "google/bigtable/admin/v2/common.proto",
@@ -31,15 +36,12 @@ cc_proto_library(
         "google/iam/v1/iam_policy.proto",
         "google/iam/v1/policy.proto",
         "google/longrunning/operations.proto",
-        "google/rpc/status.proto",
         "google/rpc/error_details.proto",
-        "google/api/annotations.proto",
-        "google/api/auth.proto",
-        "google/api/http.proto",
+        "google/rpc/status.proto",
     ],
     include = ".",
-    protoc = "@protobuf_archive//:protoc",
     default_runtime = "@protobuf_archive//:protobuf",
-    deps = ["@protobuf_archive//:cc_wkt_protos"],
+    protoc = "@protobuf_archive//:protoc",
     use_grpc_plugin = True,
+    deps = ["@protobuf_archive//:cc_wkt_protos"],
 )
diff --git a/third_party/gpus/crosstool/CROSSTOOL.tpl b/third_party/gpus/crosstool/CROSSTOOL.tpl
index 3189cf8e31610c432f03f8f3a30efc3ada4d9652..921188cbb431d925df69fbd0cc06aac07fe1a1a9 100644
--- a/third_party/gpus/crosstool/CROSSTOOL.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL.tpl
@@ -184,7 +184,8 @@ toolchain {
       action: "c++-link-dynamic-library"
       action: "c++-link-nodeps-dynamic-library"
       flag_group {
-        flag:"-no-canonical-prefixes"
+        flag: "-no-canonical-prefixes"
+        %{extra_no_canonical_prefixes_flags}
       }
     }
   }
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 69f4599c1699be83804d600125657ca50216859d..03c67bcb3d75aca19bcad8b824d79283193dc115 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -126,118 +126,141 @@ load(
 )
 
 def _get_python_bin(repository_ctx):
-    """Gets the python bin path."""
-    python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
-    if python_bin != None:
-        return python_bin
-    python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python"
-    python_bin_path = repository_ctx.which(python_bin_name)
-    if python_bin_path != None:
-        return str(python_bin_path)
-    auto_configure_fail("Cannot find python in PATH, please make sure " +
-                        "python is installed and add its directory in PATH, or --define " +
-                        "%s='/something/else'.\nPATH=%s" % (
-                            _PYTHON_BIN_PATH,
-                            repository_ctx.os.environ.get("PATH", ""),
-                        ))
+  """Gets the python bin path."""
+  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+  if python_bin != None:
+    return python_bin
+  python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python"
+  python_bin_path = repository_ctx.which(python_bin_name)
+  if python_bin_path != None:
+    return str(python_bin_path)
+  auto_configure_fail(
+      "Cannot find python in PATH, please make sure " +
+      "python is installed and add its directory in PATH, or --define " +
+      "%s='/something/else'.\nPATH=%s" % (
+          _PYTHON_BIN_PATH,
+          repository_ctx.os.environ.get("PATH", ""),
+      ))
+
 
 def _get_nvcc_tmp_dir_for_windows(repository_ctx):
-    """Return the tmp directory for nvcc to generate intermediate source files."""
-    escaped_tmp_dir = escape_string(
-        get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace("\\", "\\\\"),
-    )
-    return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir"
+  """Return the tmp directory for nvcc to generate intermediate source files."""
+  escaped_tmp_dir = escape_string(
+      get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
+          "\\", "\\\\"),)
+  return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir"
 
-def _get_msvc_compiler(repository_ctx):
-    vc_path = find_vc_path(repository_ctx)
-    return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/")
 
-def _get_win_cuda_defines(repository_ctx):
-    """Return CROSSTOOL defines for Windows"""
-
-    # If we are not on Windows, return empty vaules for Windows specific fields.
-    # This ensures the CROSSTOOL file parser is happy.
-    if not _is_windows(repository_ctx):
-        return {
-            "%{msvc_env_tmp}": "",
-            "%{msvc_env_path}": "",
-            "%{msvc_env_include}": "",
-            "%{msvc_env_lib}": "",
-            "%{msvc_cl_path}": "",
-            "%{msvc_ml_path}": "",
-            "%{msvc_link_path}": "",
-            "%{msvc_lib_path}": "",
-            "%{cxx_builtin_include_directory}": "",
-        }
-
-    vc_path = find_vc_path(repository_ctx)
-    if not vc_path:
-        auto_configure_fail("Visual C++ build tools not found on your machine." +
-                            "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using")
-        return {}
-
-    env = setup_vc_env_vars(repository_ctx, vc_path)
-    escaped_paths = escape_string(env["PATH"])
-    escaped_include_paths = escape_string(env["INCLUDE"])
-    escaped_lib_paths = escape_string(env["LIB"])
-    escaped_tmp_dir = escape_string(
-        get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace("\\", "\\\\"),
-    )
+def _get_msvc_compiler(repository_ctx):
+  vc_path = find_vc_path(repository_ctx)
+  return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/")
 
-    msvc_cl_path = "windows/msvc_wrapper_for_nvcc.bat"
-    msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace("\\", "/")
-    msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace("\\", "/")
-    msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace("\\", "/")
 
-    # nvcc will generate some temporary source files under %{nvcc_tmp_dir}
-    # The generated files are guranteed to have unique name, so they can share the same tmp directory
-    escaped_cxx_include_directories = ["cxx_builtin_include_directory: \"%s\"" % _get_nvcc_tmp_dir_for_windows(repository_ctx)]
-    for path in escaped_include_paths.split(";"):
-        if path:
-            escaped_cxx_include_directories.append("cxx_builtin_include_directory: \"%s\"" % path)
+def _get_win_cuda_defines(repository_ctx):
+  """Return CROSSTOOL defines for Windows"""
 
+  # If we are not on Windows, return empty vaules for Windows specific fields.
+  # This ensures the CROSSTOOL file parser is happy.
+  if not _is_windows(repository_ctx):
     return {
-        "%{msvc_env_tmp}": escaped_tmp_dir,
-        "%{msvc_env_path}": escaped_paths,
-        "%{msvc_env_include}": escaped_include_paths,
-        "%{msvc_env_lib}": escaped_lib_paths,
-        "%{msvc_cl_path}": msvc_cl_path,
-        "%{msvc_ml_path}": msvc_ml_path,
-        "%{msvc_link_path}": msvc_link_path,
-        "%{msvc_lib_path}": msvc_lib_path,
-        "%{cxx_builtin_include_directory}": "\n".join(escaped_cxx_include_directories),
+        "%{msvc_env_tmp}": "",
+        "%{msvc_env_path}": "",
+        "%{msvc_env_include}": "",
+        "%{msvc_env_lib}": "",
+        "%{msvc_cl_path}": "",
+        "%{msvc_ml_path}": "",
+        "%{msvc_link_path}": "",
+        "%{msvc_lib_path}": "",
+        "%{cxx_builtin_include_directory}": "",
     }
 
+  vc_path = find_vc_path(repository_ctx)
+  if not vc_path:
+    auto_configure_fail(
+        "Visual C++ build tools not found on your machine." +
+        "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using"
+    )
+    return {}
+
+  env = setup_vc_env_vars(repository_ctx, vc_path)
+  escaped_paths = escape_string(env["PATH"])
+  escaped_include_paths = escape_string(env["INCLUDE"])
+  escaped_lib_paths = escape_string(env["LIB"])
+  escaped_tmp_dir = escape_string(
+      get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
+          "\\", "\\\\"),)
+
+  msvc_cl_path = "windows/msvc_wrapper_for_nvcc.bat"
+  msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace(
+      "\\", "/")
+  msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace(
+      "\\", "/")
+  msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace(
+      "\\", "/")
+
+  # nvcc will generate some temporary source files under %{nvcc_tmp_dir}
+  # The generated files are guranteed to have unique name, so they can share the same tmp directory
+  escaped_cxx_include_directories = [
+      "cxx_builtin_include_directory: \"%s\"" %
+      _get_nvcc_tmp_dir_for_windows(repository_ctx)
+  ]
+  for path in escaped_include_paths.split(";"):
+    if path:
+      escaped_cxx_include_directories.append(
+          "cxx_builtin_include_directory: \"%s\"" % path)
+
+  return {
+      "%{msvc_env_tmp}":
+          escaped_tmp_dir,
+      "%{msvc_env_path}":
+          escaped_paths,
+      "%{msvc_env_include}":
+          escaped_include_paths,
+      "%{msvc_env_lib}":
+          escaped_lib_paths,
+      "%{msvc_cl_path}":
+          msvc_cl_path,
+      "%{msvc_ml_path}":
+          msvc_ml_path,
+      "%{msvc_link_path}":
+          msvc_link_path,
+      "%{msvc_lib_path}":
+          msvc_lib_path,
+      "%{cxx_builtin_include_directory}":
+          "\n".join(escaped_cxx_include_directories),
+  }
+
 # TODO(dzc): Once these functions have been factored out of Bazel's
 # cc_configure.bzl, load them from @bazel_tools instead.
 # BEGIN cc_configure common functions.
 def find_cc(repository_ctx):
-    """Find the C++ compiler."""
-    if _is_windows(repository_ctx):
-        return _get_msvc_compiler(repository_ctx)
-
-    if _use_cuda_clang(repository_ctx):
-        target_cc_name = "clang"
-        cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
-        if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG):
-            return "extra_tools/bin/clang"
-    else:
-        target_cc_name = "gcc"
-        cc_path_envvar = _GCC_HOST_COMPILER_PATH
-    cc_name = target_cc_name
-
-    if cc_path_envvar in repository_ctx.os.environ:
-        cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
-        if cc_name_from_env:
-            cc_name = cc_name_from_env
-    if cc_name.startswith("/"):
-        # Absolute path, maybe we should make this supported by our which function.
-        return cc_name
-    cc = repository_ctx.which(cc_name)
-    if cc == None:
-        fail(("Cannot find {}, either correct your path or set the {}" +
-              " environment variable").format(target_cc_name, cc_path_envvar))
-    return cc
+  """Find the C++ compiler."""
+  if _is_windows(repository_ctx):
+    return _get_msvc_compiler(repository_ctx)
+
+  if _use_cuda_clang(repository_ctx):
+    target_cc_name = "clang"
+    cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
+    if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG):
+      return "extra_tools/bin/clang"
+  else:
+    target_cc_name = "gcc"
+    cc_path_envvar = _GCC_HOST_COMPILER_PATH
+  cc_name = target_cc_name
+
+  if cc_path_envvar in repository_ctx.os.environ:
+    cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
+    if cc_name_from_env:
+      cc_name = cc_name_from_env
+  if cc_name.startswith("/"):
+    # Absolute path, maybe we should make this supported by our which function.
+    return cc_name
+  cc = repository_ctx.which(cc_name)
+  if cc == None:
+    fail(("Cannot find {}, either correct your path or set the {}" +
+          " environment variable").format(target_cc_name, cc_path_envvar))
+  return cc
+
 
 _INC_DIR_MARKER_BEGIN = "#include <...>"
 
@@ -246,80 +269,82 @@ _OSX_FRAMEWORK_SUFFIX = " (framework directory)"
 _OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX)
 
 def _cxx_inc_convert(path):
-    """Convert path returned by cc -E xc++ in a complete path."""
-    path = path.strip()
-    if path.endswith(_OSX_FRAMEWORK_SUFFIX):
-        path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
-    return path
+  """Convert path returned by cc -E xc++ in a complete path."""
+  path = path.strip()
+  if path.endswith(_OSX_FRAMEWORK_SUFFIX):
+    path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
+  return path
+
 
 def _normalize_include_path(repository_ctx, path):
-    """Normalizes include paths before writing them to the crosstool.
+  """Normalizes include paths before writing them to the crosstool.
 
     If path points inside the 'crosstool' folder of the repository, a relative
     path is returned.
     If path points outside the 'crosstool' folder, an absolute path is returned.
     """
-    path = str(repository_ctx.path(path))
-    crosstool_folder = str(repository_ctx.path(".").get_child("crosstool"))
+  path = str(repository_ctx.path(path))
+  crosstool_folder = str(repository_ctx.path(".").get_child("crosstool"))
+
+  if path.startswith(crosstool_folder):
+    # We drop the path to "$REPO/crosstool" and a trailing path separator.
+    return path[len(crosstool_folder) + 1:]
+  return path
 
-    if path.startswith(crosstool_folder):
-        # We drop the path to "$REPO/crosstool" and a trailing path separator.
-        return path[len(crosstool_folder) + 1:]
-    return path
 
 def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp):
-    """Compute the list of default C or C++ include directories."""
-    if lang_is_cpp:
-        lang = "c++"
-    else:
-        lang = "c"
-    result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"])
-    index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
-    if index1 == -1:
-        return []
-    index1 = result.stderr.find("\n", index1)
-    if index1 == -1:
-        return []
-    index2 = result.stderr.rfind("\n ")
-    if index2 == -1 or index2 < index1:
-        return []
-    index2 = result.stderr.find("\n", index2 + 1)
-    if index2 == -1:
-        inc_dirs = result.stderr[index1 + 1:]
-    else:
-        inc_dirs = result.stderr[index1 + 1:index2].strip()
+  """Compute the list of default C or C++ include directories."""
+  if lang_is_cpp:
+    lang = "c++"
+  else:
+    lang = "c"
+  result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"])
+  index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
+  if index1 == -1:
+    return []
+  index1 = result.stderr.find("\n", index1)
+  if index1 == -1:
+    return []
+  index2 = result.stderr.rfind("\n ")
+  if index2 == -1 or index2 < index1:
+    return []
+  index2 = result.stderr.find("\n", index2 + 1)
+  if index2 == -1:
+    inc_dirs = result.stderr[index1 + 1:]
+  else:
+    inc_dirs = result.stderr[index1 + 1:index2].strip()
+
+  return [
+      _normalize_include_path(repository_ctx, _cxx_inc_convert(p))
+      for p in inc_dirs.split("\n")
+  ]
 
-    return [
-        _normalize_include_path(repository_ctx, _cxx_inc_convert(p))
-        for p in inc_dirs.split("\n")
-    ]
 
 def get_cxx_inc_directories(repository_ctx, cc):
-    """Compute the list of default C and C++ include directories."""
-
-    # For some reason `clang -xc` sometimes returns include paths that are
-    # different from the ones from `clang -xc++`. (Symlink and a dir)
-    # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
-    includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
-    includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
-
-    includes_cpp_set = depset(includes_cpp)
-    return includes_cpp + [
-        inc
-        for inc in includes_c
-        if inc not in includes_cpp_set
-    ]
+  """Compute the list of default C and C++ include directories."""
+
+  # For some reason `clang -xc` sometimes returns include paths that are
+  # different from the ones from `clang -xc++`. (Symlink and a dir)
+  # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
+  includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
+  includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
+
+  includes_cpp_set = depset(includes_cpp)
+  return includes_cpp + [
+      inc for inc in includes_c if inc not in includes_cpp_set
+  ]
+
 
 def auto_configure_fail(msg):
-    """Output failure message when cuda configuration fails."""
-    red = "\033[0;31m"
-    no_color = "\033[0m"
-    fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg))
+  """Output failure message when cuda configuration fails."""
+  red = "\033[0;31m"
+  no_color = "\033[0m"
+  fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg))
 
 # END cc_configure common functions (see TODO above).
 
 def _host_compiler_includes(repository_ctx, cc):
-    """Generates the cxx_builtin_include_directory entries for gcc inc dirs.
+  """Generates the cxx_builtin_include_directory entries for gcc inc dirs.
 
     Args:
       repository_ctx: The repository context.
@@ -330,14 +355,15 @@ def _host_compiler_includes(repository_ctx, cc):
       host compiler include directories, which can be added to the CROSSTOOL
       file.
     """
-    inc_dirs = get_cxx_inc_directories(repository_ctx, cc)
-    inc_entries = []
-    for inc_dir in inc_dirs:
-        inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
-    return "\n".join(inc_entries)
+  inc_dirs = get_cxx_inc_directories(repository_ctx, cc)
+  inc_entries = []
+  for inc_dir in inc_dirs:
+    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
+  return "\n".join(inc_entries)
+
 
 def _cuda_include_path(repository_ctx, cuda_config):
-    """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
+  """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
 
     Args:
       repository_ctx: The repository context.
@@ -348,39 +374,41 @@ def _cuda_include_path(repository_ctx, cuda_config):
       host compiler include directories, which can be added to the CROSSTOOL
       file.
     """
-    nvcc_path = repository_ctx.path("%s/bin/nvcc%s" %
-                                    (
-                                        cuda_config.cuda_toolkit_path,
-                                        ".exe" if cuda_config.cpu_value == "Windows" else "",
-                                    ))
-    result = repository_ctx.execute([
-        nvcc_path,
-        "-v",
-        "/dev/null",
-        "-o",
-        "/dev/null",
-    ])
-    target_dir = ""
-    for one_line in result.stderr.splitlines():
-        if one_line.startswith("#$ _TARGET_DIR_="):
-            target_dir = (cuda_config.cuda_toolkit_path + "/" +
-                          one_line.replace("#$ _TARGET_DIR_=", "") + "/include")
-    inc_entries = []
-    if target_dir != "":
-        inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
-    default_include = cuda_config.cuda_toolkit_path + "/include"
-    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" %
-                       default_include)
-    return "\n".join(inc_entries)
+  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
+      cuda_config.cuda_toolkit_path,
+      ".exe" if cuda_config.cpu_value == "Windows" else "",
+  ))
+  result = repository_ctx.execute([
+      nvcc_path,
+      "-v",
+      "/dev/null",
+      "-o",
+      "/dev/null",
+  ])
+  target_dir = ""
+  for one_line in result.stderr.splitlines():
+    if one_line.startswith("#$ _TARGET_DIR_="):
+      target_dir = (
+          cuda_config.cuda_toolkit_path + "/" + one_line.replace(
+              "#$ _TARGET_DIR_=", "") + "/include")
+  inc_entries = []
+  if target_dir != "":
+    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
+  default_include = cuda_config.cuda_toolkit_path + "/include"
+  inc_entries.append(
+      "  cxx_builtin_include_directory: \"%s\"" % default_include)
+  return "\n".join(inc_entries)
+
 
 def _enable_cuda(repository_ctx):
-    if "TF_NEED_CUDA" in repository_ctx.os.environ:
-        enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
-        return enable_cuda == "1"
-    return False
+  if "TF_NEED_CUDA" in repository_ctx.os.environ:
+    enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
+    return enable_cuda == "1"
+  return False
+
 
-def _cuda_toolkit_path(repository_ctx):
-    """Finds the cuda toolkit directory.
+def cuda_toolkit_path(repository_ctx):
+  """Finds the cuda toolkit directory.
 
     Args:
       repository_ctx: The repository context.
@@ -388,27 +416,31 @@ def _cuda_toolkit_path(repository_ctx):
     Returns:
       A speculative real path of the cuda toolkit install directory.
     """
-    cuda_toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
-    if _CUDA_TOOLKIT_PATH in repository_ctx.os.environ:
-        cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
-    if not repository_ctx.path(cuda_toolkit_path).exists:
-        auto_configure_fail("Cannot find cuda toolkit path.")
-    return str(repository_ctx.path(cuda_toolkit_path).realpath)
+  cuda_toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
+  if _CUDA_TOOLKIT_PATH in repository_ctx.os.environ:
+    cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
+  if not repository_ctx.path(cuda_toolkit_path).exists:
+    auto_configure_fail("Cannot find cuda toolkit path.")
+  return str(repository_ctx.path(cuda_toolkit_path).realpath)
+
 
 def _cudnn_install_basedir(repository_ctx):
-    """Finds the cudnn install directory."""
-    cudnn_install_path = _DEFAULT_CUDNN_INSTALL_PATH
-    if _CUDNN_INSTALL_PATH in repository_ctx.os.environ:
-        cudnn_install_path = repository_ctx.os.environ[_CUDNN_INSTALL_PATH].strip()
-    if not repository_ctx.path(cudnn_install_path).exists:
-        auto_configure_fail("Cannot find cudnn install path.")
-    return cudnn_install_path
+  """Finds the cudnn install directory."""
+  cudnn_install_path = _DEFAULT_CUDNN_INSTALL_PATH
+  if _CUDNN_INSTALL_PATH in repository_ctx.os.environ:
+    cudnn_install_path = repository_ctx.os.environ[_CUDNN_INSTALL_PATH].strip()
+  if not repository_ctx.path(cudnn_install_path).exists:
+    auto_configure_fail("Cannot find cudnn install path.")
+  return cudnn_install_path
+
 
 def matches_version(environ_version, detected_version):
-    """Checks whether the user-specified version matches the detected version.
+  """Checks whether the user-specified version matches the detected version.
 
-    This function performs a weak matching so that if the user specifies only the
-    major or major and minor versions, the versions are still considered matching
+    This function performs a weak matching so that if the user specifies only
+    the
+    major or major and minor versions, the versions are still considered
+    matching
     if the version parts match. To illustrate:
 
         environ_version  detected_version  result
@@ -424,25 +456,25 @@ def matches_version(environ_version, detected_version):
         variables.
       detected_version: The version autodetected from the CUDA installation on
         the system.
-
     Returns: True if user-specified version matches detected version and False
       otherwise.
-    """
-    environ_version_parts = environ_version.split(".")
-    detected_version_parts = detected_version.split(".")
-    if len(detected_version_parts) < len(environ_version_parts):
-        return False
-    for i, part in enumerate(detected_version_parts):
-        if i >= len(environ_version_parts):
-            break
-        if part != environ_version_parts[i]:
-            return False
-    return True
+  """
+  environ_version_parts = environ_version.split(".")
+  detected_version_parts = detected_version.split(".")
+  if len(detected_version_parts) < len(environ_version_parts):
+    return False
+  for i, part in enumerate(detected_version_parts):
+    if i >= len(environ_version_parts):
+      break
+    if part != environ_version_parts[i]:
+      return False
+  return True
+
 
 _NVCC_VERSION_PREFIX = "Cuda compilation tools, release "
 
 def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
-    """Detects the version of CUDA installed on the system.
+  """Detects the version of CUDA installed on the system.
 
     Args:
       repository_ctx: The repository context.
@@ -452,64 +484,61 @@ def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
       String containing the version of CUDA.
     """
 
-    # Run nvcc --version and find the line containing the CUDA version.
-    nvcc_path = repository_ctx.path("%s/bin/nvcc%s" %
-                                    (
-                                        cuda_toolkit_path,
-                                        ".exe" if cpu_value == "Windows" else "",
-                                    ))
-    if not nvcc_path.exists:
-        auto_configure_fail("Cannot find nvcc at %s" % str(nvcc_path))
-    result = repository_ctx.execute([str(nvcc_path), "--version"])
-    if result.stderr:
-        auto_configure_fail("Error running nvcc --version: %s" % result.stderr)
-    lines = result.stdout.splitlines()
-    version_line = lines[len(lines) - 1]
-    if version_line.find(_NVCC_VERSION_PREFIX) == -1:
-        auto_configure_fail(
-            "Could not parse CUDA version from nvcc --version. Got: %s" %
-            result.stdout,
-        )
-
-    # Parse the CUDA version from the line containing the CUDA version.
-    prefix_removed = version_line.replace(_NVCC_VERSION_PREFIX, "")
-    parts = prefix_removed.split(",")
-    if len(parts) != 2 or len(parts[0]) < 2:
-        auto_configure_fail(
-            "Could not parse CUDA version from nvcc --version. Got: %s" %
-            result.stdout,
-        )
-    full_version = parts[1].strip()
-    if full_version.startswith("V"):
-        full_version = full_version[1:]
-
-    # Check whether TF_CUDA_VERSION was set by the user and fail if it does not
-    # match the detected version.
-    environ_version = ""
-    if _TF_CUDA_VERSION in repository_ctx.os.environ:
-        environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
-    if environ_version and not matches_version(environ_version, full_version):
-        auto_configure_fail(
-            ("CUDA version detected from nvcc (%s) does not match " +
-             "TF_CUDA_VERSION (%s)") % (full_version, environ_version),
-        )
-
-    # We only use the version consisting of the major and minor version numbers.
-    version_parts = full_version.split(".")
-    if len(version_parts) < 2:
-        auto_configure_fail("CUDA version detected from nvcc (%s) is incomplete.")
-    if cpu_value == "Windows":
-        version = "64_%s%s" % (version_parts[0], version_parts[1])
-    else:
-        version = "%s.%s" % (version_parts[0], version_parts[1])
-    return version
+  # Run nvcc --version and find the line containing the CUDA version.
+  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
+      cuda_toolkit_path,
+      ".exe" if cpu_value == "Windows" else "",
+  ))
+  if not nvcc_path.exists:
+    auto_configure_fail("Cannot find nvcc at %s" % str(nvcc_path))
+  result = repository_ctx.execute([str(nvcc_path), "--version"])
+  if result.stderr:
+    auto_configure_fail("Error running nvcc --version: %s" % result.stderr)
+  lines = result.stdout.splitlines()
+  version_line = lines[len(lines) - 1]
+  if version_line.find(_NVCC_VERSION_PREFIX) == -1:
+    auto_configure_fail(
+        "Could not parse CUDA version from nvcc --version. Got: %s" %
+        result.stdout,)
+
+  # Parse the CUDA version from the line containing the CUDA version.
+  prefix_removed = version_line.replace(_NVCC_VERSION_PREFIX, "")
+  parts = prefix_removed.split(",")
+  if len(parts) != 2 or len(parts[0]) < 2:
+    auto_configure_fail(
+        "Could not parse CUDA version from nvcc --version. Got: %s" %
+        result.stdout,)
+  full_version = parts[1].strip()
+  if full_version.startswith("V"):
+    full_version = full_version[1:]
+
+  # Check whether TF_CUDA_VERSION was set by the user and fail if it does not
+  # match the detected version.
+  environ_version = ""
+  if _TF_CUDA_VERSION in repository_ctx.os.environ:
+    environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
+  if environ_version and not matches_version(environ_version, full_version):
+    auto_configure_fail(
+        ("CUDA version detected from nvcc (%s) does not match " +
+         "TF_CUDA_VERSION (%s)") % (full_version, environ_version),)
+
+  # We only use the version consisting of the major and minor version numbers.
+  version_parts = full_version.split(".")
+  if len(version_parts) < 2:
+    auto_configure_fail("CUDA version detected from nvcc (%s) is incomplete.")
+  if cpu_value == "Windows":
+    version = "64_%s%s" % (version_parts[0], version_parts[1])
+  else:
+    version = "%s.%s" % (version_parts[0], version_parts[1])
+  return version
+
 
 _DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR"
 _DEFINE_CUDNN_MINOR = "#define CUDNN_MINOR"
 _DEFINE_CUDNN_PATCHLEVEL = "#define CUDNN_PATCHLEVEL"
 
 def find_cuda_define(repository_ctx, header_dir, header_file, define):
-    """Returns the value of a #define in a header file.
+  """Returns the value of a #define in a header file.
 
     Greps through a header file and returns the value of the specified #define.
     If the #define is not found, then raise an error.
@@ -524,52 +553,52 @@ def find_cuda_define(repository_ctx, header_dir, header_file, define):
       The value of the #define found in the header.
     """
 
-    # Confirm location of the header and grep for the line defining the macro.
-    h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
-    if not h_path.exists:
-        auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
-    result = repository_ctx.execute(
-        # Grep one more lines as some #defines are splitted into two lines.
-        ["grep", "--color=never", "-A1", "-E", define, str(h_path)],
-    )
-    if result.stderr:
-        auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
-
-    # Parse the version from the line defining the macro.
-    if result.stdout.find(define) == -1:
-        auto_configure_fail("Cannot find line containing '%s' in %s" %
-                            (define, h_path))
-
-    # Split results to lines
-    lines = result.stdout.split("\n")
-    num_lines = len(lines)
-    for l in range(num_lines):
-        line = lines[l]
-        if define in line:  # Find the line with define
-            version = line
-            if l != num_lines - 1 and line[-1] == "\\":  # Add next line, if multiline
-                version = version[:-1] + lines[l + 1]
-            break
-
-    # Remove any comments
-    version = version.split("//")[0]
-
-    # Remove define name
-    version = version.replace(define, "").strip()
-
-    # Remove the code after the version number.
-    version_end = version.find(" ")
-    if version_end != -1:
-        if version_end == 0:
-            auto_configure_fail(
-                "Cannot extract the version from line containing '%s' in %s" %
-                (define, str(h_path)),
-            )
-        version = version[:version_end].strip()
-    return version
+  # Confirm location of the header and grep for the line defining the macro.
+  h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
+  if not h_path.exists:
+    auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
+  result = repository_ctx.execute(
+      # Grep one more lines as some #defines are splitted into two lines.
+      ["grep", "--color=never", "-A1", "-E", define,
+       str(h_path)],)
+  if result.stderr:
+    auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
+
+  # Parse the version from the line defining the macro.
+  if result.stdout.find(define) == -1:
+    auto_configure_fail(
+        "Cannot find line containing '%s' in %s" % (define, h_path))
+
+  # Split results to lines
+  lines = result.stdout.split("\n")
+  num_lines = len(lines)
+  for l in range(num_lines):
+    line = lines[l]
+    if define in line:  # Find the line with define
+      version = line
+      if l != num_lines - 1 and line[-1] == "\\":  # Add next line, if multiline
+        version = version[:-1] + lines[l + 1]
+      break
+
+  # Remove any comments
+  version = version.split("//")[0]
+
+  # Remove define name
+  version = version.replace(define, "").strip()
+
+  # Remove the code after the version number.
+  version_end = version.find(" ")
+  if version_end != -1:
+    if version_end == 0:
+      auto_configure_fail(
+          "Cannot extract the version from line containing '%s' in %s" %
+          (define, str(h_path)),)
+    version = version[:version_end].strip()
+  return version
+
 
 def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
-    """Detects the version of cuDNN installed on the system.
+  """Detects the version of cuDNN installed on the system.
 
     Args:
       repository_ctx: The repository context.
@@ -579,68 +608,68 @@ def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
     Returns:
       A string containing the version of cuDNN.
     """
-    cudnn_header_dir = _find_cudnn_header_dir(
-        repository_ctx,
-        cudnn_install_basedir,
-    )
-    major_version = find_cuda_define(
-        repository_ctx,
-        cudnn_header_dir,
-        "cudnn.h",
-        _DEFINE_CUDNN_MAJOR,
-    )
-    minor_version = find_cuda_define(
-        repository_ctx,
-        cudnn_header_dir,
-        "cudnn.h",
-        _DEFINE_CUDNN_MINOR,
-    )
-    patch_version = find_cuda_define(
-        repository_ctx,
-        cudnn_header_dir,
-        "cudnn.h",
-        _DEFINE_CUDNN_PATCHLEVEL,
-    )
-    full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
-
-    # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
-    # match the detected version.
-    environ_version = ""
-    if _TF_CUDNN_VERSION in repository_ctx.os.environ:
-        environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
-    if environ_version and not matches_version(environ_version, full_version):
-        cudnn_h_path = repository_ctx.path("%s/include/cudnn.h" %
-                                           cudnn_install_basedir)
-        auto_configure_fail(
-            ("cuDNN version detected from %s (%s) does not match " +
-             "TF_CUDNN_VERSION (%s)") %
-            (str(cudnn_h_path), full_version, environ_version),
-        )
-
-    # We only use the major version since we use the libcudnn libraries that are
-    # only versioned with the major version (e.g. libcudnn.so.5).
-    version = major_version
-    if cpu_value == "Windows":
-        version = "64_" + version
-    return version
-
-def _compute_capabilities(repository_ctx):
-    """Returns a list of strings representing cuda compute capabilities."""
-    if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ:
-        return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
-    capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES]
-    capabilities = capabilities_str.split(",")
-    for capability in capabilities:
-        # Workaround for Skylark's lack of support for regex. This check should
-        # be equivalent to checking:
-        #     if re.match("[0-9]+.[0-9]+", capability) == None:
-        parts = capability.split(".")
-        if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
-            auto_configure_fail("Invalid compute capability: %s" % capability)
-    return capabilities
+  cudnn_header_dir = _find_cudnn_header_dir(
+      repository_ctx,
+      cudnn_install_basedir,
+  )
+  major_version = find_cuda_define(
+      repository_ctx,
+      cudnn_header_dir,
+      "cudnn.h",
+      _DEFINE_CUDNN_MAJOR,
+  )
+  minor_version = find_cuda_define(
+      repository_ctx,
+      cudnn_header_dir,
+      "cudnn.h",
+      _DEFINE_CUDNN_MINOR,
+  )
+  patch_version = find_cuda_define(
+      repository_ctx,
+      cudnn_header_dir,
+      "cudnn.h",
+      _DEFINE_CUDNN_PATCHLEVEL,
+  )
+  full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+
+  # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
+  # match the detected version.
+  environ_version = ""
+  if _TF_CUDNN_VERSION in repository_ctx.os.environ:
+    environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
+  if environ_version and not matches_version(environ_version, full_version):
+    cudnn_h_path = repository_ctx.path(
+        "%s/include/cudnn.h" % cudnn_install_basedir)
+    auto_configure_fail(("cuDNN version detected from %s (%s) does not match " +
+                         "TF_CUDNN_VERSION (%s)") %
+                        (str(cudnn_h_path), full_version, environ_version),)
+
+  # We only use the major version since we use the libcudnn libraries that are
+  # only versioned with the major version (e.g. libcudnn.so.5).
+  version = major_version
+  if cpu_value == "Windows":
+    version = "64_" + version
+  return version
+
+
+def compute_capabilities(repository_ctx):
+  """Returns a list of strings representing cuda compute capabilities."""
+  if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ:
+    return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
+  capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES]
+  capabilities = capabilities_str.split(",")
+  for capability in capabilities:
+    # Workaround for Skylark's lack of support for regex. This check should
+    # be equivalent to checking:
+    #     if re.match("[0-9]+.[0-9]+", capability) == None:
+    parts = capability.split(".")
+    if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
+      auto_configure_fail("Invalid compute capability: %s" % capability)
+  return capabilities
+
 
 def get_cpu_value(repository_ctx):
-    """Returns the name of the host operating system.
+  """Returns the name of the host operating system.
 
     Args:
       repository_ctx: The repository context.
@@ -648,20 +677,22 @@ def get_cpu_value(repository_ctx):
     Returns:
       A string containing the name of the host operating system.
     """
-    os_name = repository_ctx.os.name.lower()
-    if os_name.startswith("mac os"):
-        return "Darwin"
-    if os_name.find("windows") != -1:
-        return "Windows"
-    result = repository_ctx.execute(["uname", "-s"])
-    return result.stdout.strip()
+  os_name = repository_ctx.os.name.lower()
+  if os_name.startswith("mac os"):
+    return "Darwin"
+  if os_name.find("windows") != -1:
+    return "Windows"
+  result = repository_ctx.execute(["uname", "-s"])
+  return result.stdout.strip()
+
 
 def _is_windows(repository_ctx):
-    """Returns true if the host operating system is windows."""
-    return get_cpu_value(repository_ctx) == "Windows"
+  """Returns true if the host operating system is windows."""
+  return get_cpu_value(repository_ctx) == "Windows"
+
 
 def _lib_name(lib, cpu_value, version = "", static = False):
-    """Constructs the platform-specific name of a library.
+  """Constructs the platform-specific name of a library.
 
     Args:
       lib: The name of the library, such as "cudart"
@@ -672,23 +703,24 @@ def _lib_name(lib, cpu_value, version = "", static = False):
     Returns:
       The platform-specific name of the library.
     """
-    if cpu_value in ("Linux", "FreeBSD"):
-        if static:
-            return "lib%s.a" % lib
-        else:
-            if version:
-                version = ".%s" % version
-            return "lib%s.so%s" % (lib, version)
-    elif cpu_value == "Windows":
-        return "%s.lib" % lib
-    elif cpu_value == "Darwin":
-        if static:
-            return "lib%s.a" % lib
-        elif version:
-            version = ".%s" % version
-        return "lib%s%s.dylib" % (lib, version)
+  if cpu_value in ("Linux", "FreeBSD"):
+    if static:
+      return "lib%s.a" % lib
     else:
-        auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
+      if version:
+        version = ".%s" % version
+      return "lib%s.so%s" % (lib, version)
+  elif cpu_value == "Windows":
+    return "%s.lib" % lib
+  elif cpu_value == "Darwin":
+    if static:
+      return "lib%s.a" % lib
+    elif version:
+      version = ".%s" % version
+    return "lib%s%s.dylib" % (lib, version)
+  else:
+    auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
+
 
 def _find_cuda_lib(
         lib,
@@ -697,7 +729,7 @@ def _find_cuda_lib(
         basedir,
         version = "",
         static = False):
-    """Finds the given CUDA or cuDNN library on the system.
+  """Finds the given CUDA or cuDNN library on the system.
 
     Args:
       lib: The name of the library, such as "cudart"
@@ -712,15 +744,16 @@ def _find_cuda_lib(
         file_name: The basename of the library found on the system.
         path: The full path to the library.
     """
-    file_name = _lib_name(lib, cpu_value, version, static)
-    for relative_path in CUDA_LIB_PATHS:
-        path = repository_ctx.path("%s/%s%s" % (basedir, relative_path, file_name))
-        if path.exists:
-            return struct(file_name = file_name, path = str(path.realpath))
-    auto_configure_fail("Cannot find cuda library %s" % file_name)
+  file_name = _lib_name(lib, cpu_value, version, static)
+  for relative_path in CUDA_LIB_PATHS:
+    path = repository_ctx.path("%s/%s%s" % (basedir, relative_path, file_name))
+    if path.exists:
+      return struct(file_name=file_name, path=str(path.realpath))
+  auto_configure_fail("Cannot find cuda library %s" % file_name)
+
 
 def _find_cupti_header_dir(repository_ctx, cuda_config):
-    """Returns the path to the directory containing cupti.h
+  """Returns the path to the directory containing cupti.h
 
     On most systems, the cupti library is not installed in the same directory as
     the other CUDA libraries but rather in a special extras/CUPTI directory.
@@ -732,14 +765,17 @@ def _find_cupti_header_dir(repository_ctx, cuda_config):
     Returns:
       The path of the directory containing the cupti header.
     """
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    for relative_path in CUPTI_HEADER_PATHS:
-        if repository_ctx.path("%s/%scupti.h" % (cuda_toolkit_path, relative_path)).exists:
-            return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-    auto_configure_fail("Cannot find cupti.h under %s" % ", ".join([cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS]))
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  for relative_path in CUPTI_HEADER_PATHS:
+    if repository_ctx.path(
+        "%s/%scupti.h" % (cuda_toolkit_path, relative_path)).exists:
+      return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+  auto_configure_fail("Cannot find cupti.h under %s" % ", ".join(
+      [cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS]))
+
 
 def _find_cupti_lib(repository_ctx, cuda_config):
-    """Finds the cupti library on the system.
+  """Finds the cupti library on the system.
 
     On most systems, the cupti library is not installed in the same directory as
     the other CUDA libraries but rather in a special extras/CUPTI directory.
@@ -753,23 +789,23 @@ def _find_cupti_lib(repository_ctx, cuda_config):
         file_name: The basename of the library found on the system.
         path: The full path to the library.
     """
-    file_name = _lib_name(
-        "cupti",
-        cuda_config.cpu_value,
-        cuda_config.cuda_version,
-    )
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    for relative_path in CUPTI_LIB_PATHS:
-        path = repository_ctx.path(
-            "%s/%s%s" % (cuda_toolkit_path, relative_path, file_name),
-        )
-        if path.exists:
-            return struct(file_name = file_name, path = str(path.realpath))
+  file_name = _lib_name(
+      "cupti",
+      cuda_config.cpu_value,
+      cuda_config.cuda_version,
+  )
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  for relative_path in CUPTI_LIB_PATHS:
+    path = repository_ctx.path(
+        "%s/%s%s" % (cuda_toolkit_path, relative_path, file_name),)
+    if path.exists:
+      return struct(file_name=file_name, path=str(path.realpath))
+
+  auto_configure_fail("Cannot find cupti library %s" % file_name)
 
-    auto_configure_fail("Cannot find cupti library %s" % file_name)
 
 def _find_libs(repository_ctx, cuda_config):
-    """Returns the CUDA and cuDNN libraries on the system.
+  """Returns the CUDA and cuDNN libraries on the system.
 
     Args:
       repository_ctx: The repository context.
@@ -778,64 +814,75 @@ def _find_libs(repository_ctx, cuda_config):
     Returns:
       Map of library names to structs of filename and path.
     """
-    cpu_value = cuda_config.cpu_value
-    return {
-        "cuda": _find_cuda_lib("cuda", repository_ctx, cpu_value, cuda_config.cuda_toolkit_path),
-        "cudart": _find_cuda_lib(
-            "cudart",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "cudart_static": _find_cuda_lib(
-            "cudart_static",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-            static = True,
-        ),
-        "cublas": _find_cuda_lib(
-            "cublas",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "cusolver": _find_cuda_lib(
-            "cusolver",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "curand": _find_cuda_lib(
-            "curand",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "cufft": _find_cuda_lib(
-            "cufft",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cuda_toolkit_path,
-            cuda_config.cuda_version,
-        ),
-        "cudnn": _find_cuda_lib(
-            "cudnn",
-            repository_ctx,
-            cpu_value,
-            cuda_config.cudnn_install_basedir,
-            cuda_config.cudnn_version,
-        ),
-        "cupti": _find_cupti_lib(repository_ctx, cuda_config),
-    }
+  cpu_value = cuda_config.cpu_value
+  return {
+      "cuda":
+          _find_cuda_lib("cuda", repository_ctx, cpu_value,
+                         cuda_config.cuda_toolkit_path),
+      "cudart":
+          _find_cuda_lib(
+              "cudart",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "cudart_static":
+          _find_cuda_lib(
+              "cudart_static",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+              static=True,
+          ),
+      "cublas":
+          _find_cuda_lib(
+              "cublas",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "cusolver":
+          _find_cuda_lib(
+              "cusolver",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "curand":
+          _find_cuda_lib(
+              "curand",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "cufft":
+          _find_cuda_lib(
+              "cufft",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cuda_toolkit_path,
+              cuda_config.cuda_version,
+          ),
+      "cudnn":
+          _find_cuda_lib(
+              "cudnn",
+              repository_ctx,
+              cpu_value,
+              cuda_config.cudnn_install_basedir,
+              cuda_config.cudnn_version,
+          ),
+      "cupti":
+          _find_cupti_lib(repository_ctx, cuda_config),
+  }
+
 
 def _find_cuda_include_path(repository_ctx, cuda_config):
-    """Returns the path to the directory containing cuda.h
+  """Returns the path to the directory containing cuda.h
 
     Args:
       repository_ctx: The repository context.
@@ -844,14 +891,16 @@ def _find_cuda_include_path(repository_ctx, cuda_config):
     Returns:
       The path of the directory containing the CUDA headers.
     """
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    for relative_path in CUDA_INCLUDE_PATHS:
-        if repository_ctx.path("%s/%scuda.h" % (cuda_toolkit_path, relative_path)).exists:
-            return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-    auto_configure_fail("Cannot find cuda.h under %s" % cuda_toolkit_path)
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  for relative_path in CUDA_INCLUDE_PATHS:
+    if repository_ctx.path(
+        "%s/%scuda.h" % (cuda_toolkit_path, relative_path)).exists:
+      return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+  auto_configure_fail("Cannot find cuda.h under %s" % cuda_toolkit_path)
+
 
 def _find_cudnn_header_dir(repository_ctx, cudnn_install_basedir):
-    """Returns the path to the directory containing cudnn.h
+  """Returns the path to the directory containing cudnn.h
 
     Args:
       repository_ctx: The repository context.
@@ -861,15 +910,17 @@ def _find_cudnn_header_dir(repository_ctx, cudnn_install_basedir):
     Returns:
       The path of the directory containing the cudnn header.
     """
-    for relative_path in CUDA_INCLUDE_PATHS:
-        if repository_ctx.path("%s/%scudnn.h" % (cudnn_install_basedir, relative_path)).exists:
-            return ("%s/%s" % (cudnn_install_basedir, relative_path))[:-1]
-    if repository_ctx.path("/usr/include/cudnn.h").exists:
-        return "/usr/include"
-    auto_configure_fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
+  for relative_path in CUDA_INCLUDE_PATHS:
+    if repository_ctx.path(
+        "%s/%scudnn.h" % (cudnn_install_basedir, relative_path)).exists:
+      return ("%s/%s" % (cudnn_install_basedir, relative_path))[:-1]
+  if repository_ctx.path("/usr/include/cudnn.h").exists:
+    return "/usr/include"
+  auto_configure_fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
+
 
 def _find_nvvm_libdevice_dir(repository_ctx, cuda_config):
-    """Returns the path to the directory containing libdevice in bitcode format.
+  """Returns the path to the directory containing libdevice in bitcode format.
 
     Args:
       repository_ctx: The repository context.
@@ -878,19 +929,23 @@ def _find_nvvm_libdevice_dir(repository_ctx, cuda_config):
     Returns:
       The path of the directory containing the CUDA headers.
     """
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    for libdevice_file in NVVM_LIBDEVICE_FILES:
-        for relative_path in NVVM_LIBDEVICE_PATHS:
-            if repository_ctx.path("%s/%s%s" % (cuda_toolkit_path, relative_path, libdevice_file)).exists:
-                return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-    auto_configure_fail("Cannot find libdevice*.bc files under %s" % cuda_toolkit_path)
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  for libdevice_file in NVVM_LIBDEVICE_FILES:
+    for relative_path in NVVM_LIBDEVICE_PATHS:
+      if repository_ctx.path("%s/%s%s" % (cuda_toolkit_path, relative_path,
+                                          libdevice_file)).exists:
+        return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+  auto_configure_fail(
+      "Cannot find libdevice*.bc files under %s" % cuda_toolkit_path)
+
 
 def _cudart_static_linkopt(cpu_value):
-    """Returns additional platform-specific linkopts for cudart."""
-    return "" if cpu_value == "Darwin" else "\"-lrt\","
+  """Returns additional platform-specific linkopts for cudart."""
+  return "" if cpu_value == "Darwin" else "\"-lrt\","
+
 
 def _get_cuda_config(repository_ctx):
-    """Detects and returns information about the CUDA installation on the system.
+  """Detects and returns information about the CUDA installation on the system.
 
     Args:
       repository_ctx: The repository context.
@@ -904,35 +959,39 @@ def _get_cuda_config(repository_ctx):
         compute_capabilities: A list of the system's CUDA compute capabilities.
         cpu_value: The name of the host operating system.
     """
-    cpu_value = get_cpu_value(repository_ctx)
-    cuda_toolkit_path = _cuda_toolkit_path(repository_ctx)
-    cuda_version = _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value)
-    cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
-    cudnn_version = _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value)
-    return struct(
-        cuda_toolkit_path = cuda_toolkit_path,
-        cudnn_install_basedir = cudnn_install_basedir,
-        cuda_version = cuda_version,
-        cudnn_version = cudnn_version,
-        compute_capabilities = _compute_capabilities(repository_ctx),
-        cpu_value = cpu_value,
-    )
+  cpu_value = get_cpu_value(repository_ctx)
+  toolkit_path = cuda_toolkit_path(repository_ctx)
+  cuda_version = _cuda_version(repository_ctx, toolkit_path, cpu_value)
+  cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
+  cudnn_version = _cudnn_version(repository_ctx, cudnn_install_basedir,
+                                 cpu_value)
+  return struct(
+      cuda_toolkit_path=toolkit_path,
+      cudnn_install_basedir=cudnn_install_basedir,
+      cuda_version=cuda_version,
+      cudnn_version=cudnn_version,
+      compute_capabilities=compute_capabilities(repository_ctx),
+      cpu_value=cpu_value,
+  )
+
 
 def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
-    if not out:
-        out = tpl.replace(":", "/")
-    repository_ctx.template(
-        out,
-        Label("//third_party/gpus/%s.tpl" % tpl),
-        substitutions,
-    )
+  if not out:
+    out = tpl.replace(":", "/")
+  repository_ctx.template(
+      out,
+      Label("//third_party/gpus/%s.tpl" % tpl),
+      substitutions,
+  )
+
 
 def _file(repository_ctx, label):
-    repository_ctx.template(
-        label.replace(":", "/"),
-        Label("//third_party/gpus/%s.tpl" % label),
-        {},
-    )
+  repository_ctx.template(
+      label.replace(":", "/"),
+      Label("//third_party/gpus/%s.tpl" % label),
+      {},
+  )
+
 
 _DUMMY_CROSSTOOL_BZL_FILE = """
 def error_gpu_disabled():
@@ -960,81 +1019,99 @@ error_gpu_disabled()
 """
 
 def _create_dummy_repository(repository_ctx):
-    cpu_value = get_cpu_value(repository_ctx)
+  cpu_value = get_cpu_value(repository_ctx)
+
+  # Set up BUILD file for cuda/.
+  _tpl(
+      repository_ctx,
+      "cuda:build_defs.bzl",
+      {
+          "%{cuda_is_configured}": "False",
+          "%{cuda_extra_copts}": "[]",
+      },
+  )
+  _tpl(
+      repository_ctx,
+      "cuda:BUILD",
+      {
+          "%{cuda_driver_lib}":
+              _lib_name("cuda", cpu_value),
+          "%{cudart_static_lib}":
+              _lib_name(
+                  "cudart_static",
+                  cpu_value,
+                  static=True,
+              ),
+          "%{cudart_static_linkopt}":
+              _cudart_static_linkopt(cpu_value),
+          "%{cudart_lib}":
+              _lib_name("cudart", cpu_value),
+          "%{cublas_lib}":
+              _lib_name("cublas", cpu_value),
+          "%{cusolver_lib}":
+              _lib_name("cusolver", cpu_value),
+          "%{cudnn_lib}":
+              _lib_name("cudnn", cpu_value),
+          "%{cufft_lib}":
+              _lib_name("cufft", cpu_value),
+          "%{curand_lib}":
+              _lib_name("curand", cpu_value),
+          "%{cupti_lib}":
+              _lib_name("cupti", cpu_value),
+          "%{cuda_include_genrules}":
+              "",
+          "%{cuda_headers}":
+              "",
+      },
+  )
 
-    # Set up BUILD file for cuda/.
-    _tpl(
-        repository_ctx,
-        "cuda:build_defs.bzl",
-        {
-            "%{cuda_is_configured}": "False",
-            "%{cuda_extra_copts}": "[]",
-        },
-    )
-    _tpl(
-        repository_ctx,
-        "cuda:BUILD",
-        {
-            "%{cuda_driver_lib}": _lib_name("cuda", cpu_value),
-            "%{cudart_static_lib}": _lib_name(
-                "cudart_static",
-                cpu_value,
-                static = True,
-            ),
-            "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
-            "%{cudart_lib}": _lib_name("cudart", cpu_value),
-            "%{cublas_lib}": _lib_name("cublas", cpu_value),
-            "%{cusolver_lib}": _lib_name("cusolver", cpu_value),
-            "%{cudnn_lib}": _lib_name("cudnn", cpu_value),
-            "%{cufft_lib}": _lib_name("cufft", cpu_value),
-            "%{curand_lib}": _lib_name("curand", cpu_value),
-            "%{cupti_lib}": _lib_name("cupti", cpu_value),
-            "%{cuda_include_genrules}": "",
-            "%{cuda_headers}": "",
-        },
-    )
+  # Create dummy files for the CUDA toolkit since they are still required by
+  # tensorflow/core/platform/default/build_config:cuda.
+  repository_ctx.file("cuda/cuda/include/cuda.h", "")
+  repository_ctx.file("cuda/cuda/include/cublas.h", "")
+  repository_ctx.file("cuda/cuda/include/cudnn.h", "")
+  repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h", "")
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value))
+  repository_ctx.file(
+      "cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value))
+  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value))
+
+  # Set up cuda_config.h, which is used by
+  # tensorflow/stream_executor/dso_loader.cc.
+  _tpl(
+      repository_ctx,
+      "cuda:cuda_config.h",
+      {
+          "%{cuda_version}":
+              _DEFAULT_CUDA_VERSION,
+          "%{cudnn_version}":
+              _DEFAULT_CUDNN_VERSION,
+          "%{cuda_compute_capabilities}":
+              ",".join([
+                  "CudaVersion(\"%s\")" % c
+                  for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES
+              ]),
+          "%{cuda_toolkit_path}":
+              _DEFAULT_CUDA_TOOLKIT_PATH,
+      },
+      "cuda/cuda/cuda_config.h",
+  )
 
-    # Create dummy files for the CUDA toolkit since they are still required by
-    # tensorflow/core/platform/default/build_config:cuda.
-    repository_ctx.file("cuda/cuda/include/cuda.h", "")
-    repository_ctx.file("cuda/cuda/include/cublas.h", "")
-    repository_ctx.file("cuda/cuda/include/cudnn.h", "")
-    repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h", "")
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value))
-    repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value))
-
-    # Set up cuda_config.h, which is used by
-    # tensorflow/stream_executor/dso_loader.cc.
-    _tpl(
-        repository_ctx,
-        "cuda:cuda_config.h",
-        {
-            "%{cuda_version}": _DEFAULT_CUDA_VERSION,
-            "%{cudnn_version}": _DEFAULT_CUDNN_VERSION,
-            "%{cuda_compute_capabilities}": ",".join([
-                "CudaVersion(\"%s\")" % c
-                for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES
-            ]),
-            "%{cuda_toolkit_path}": _DEFAULT_CUDA_TOOLKIT_PATH,
-        },
-        "cuda/cuda/cuda_config.h",
-    )
+  # If cuda_configure is not configured to build with GPU support, and the user
+  # attempts to build with --config=cuda, add a dummy build rule to intercept
+  # this and fail with an actionable error message.
+  repository_ctx.file(
+      "crosstool/error_gpu_disabled.bzl",
+      _DUMMY_CROSSTOOL_BZL_FILE,
+  )
+  repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
-    # If cuda_configure is not configured to build with GPU support, and the user
-    # attempts to build with --config=cuda, add a dummy build rule to intercept
-    # this and fail with an actionable error message.
-    repository_ctx.file(
-        "crosstool/error_gpu_disabled.bzl",
-        _DUMMY_CROSSTOOL_BZL_FILE,
-    )
-    repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
 def _execute(
         repository_ctx,
@@ -1042,35 +1119,35 @@ def _execute(
         error_msg = None,
         error_details = None,
         empty_stdout_fine = False):
-    """Executes an arbitrary shell command.
+  """Executes an arbitrary shell command.
 
     Args:
       repository_ctx: the repository_ctx object
       cmdline: list of strings, the command to execute
       error_msg: string, a summary of the error if the command fails
       error_details: string, details about the error or steps to fix it
-      empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
-        it's an error
-    Return:
-      the result of repository_ctx.execute(cmdline)
-    """
-    result = repository_ctx.execute(cmdline)
-    if result.stderr or not (empty_stdout_fine or result.stdout):
-        auto_configure_fail(
-            "\n".join([
-                error_msg.strip() if error_msg else "Repository command failed",
-                result.stderr.strip(),
-                error_details if error_details else "",
-            ]),
-        )
-    return result
+      empty_stdout_fine: bool, if True, an empty stdout result is fine,
+        otherwise it's an error
+    Return: the result of repository_ctx.execute(cmdline)
+  """
+  result = repository_ctx.execute(cmdline)
+  if result.stderr or not (empty_stdout_fine or result.stdout):
+    auto_configure_fail(
+        "\n".join([
+            error_msg.strip() if error_msg else "Repository command failed",
+            result.stderr.strip(),
+            error_details if error_details else "",
+        ]),)
+  return result
+
 
 def _norm_path(path):
-    """Returns a path with '/' and remove the trailing slash."""
-    path = path.replace("\\", "/")
-    if path[-1] == "/":
-        path = path[:-1]
-    return path
+  """Returns a path with '/' and remove the trailing slash."""
+  path = path.replace("\\", "/")
+  if path[-1] == "/":
+    path = path[:-1]
+  return path
+
 
 def symlink_genrule_for_dir(
         repository_ctx,
@@ -1079,167 +1156,174 @@ def symlink_genrule_for_dir(
         genrule_name,
         src_files = [],
         dest_files = []):
-    """Returns a genrule to symlink(or copy if on Windows) a set of files.
+  """Returns a genrule to symlink(or copy if on Windows) a set of files.
 
     If src_dir is passed, files will be read from the given directory; otherwise
     we assume files are in src_files and dest_files
     """
-    if src_dir != None:
-        src_dir = _norm_path(src_dir)
-        dest_dir = _norm_path(dest_dir)
-        files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
-
-        # Create a list with the src_dir stripped to use for outputs.
-        dest_files = files.replace(src_dir, "").splitlines()
-        src_files = files.splitlines()
-    command = []
-    if not _is_windows(repository_ctx):
-        # We clear folders that might have been generated previously to avoid
-        # undesired inclusions
-        command.append('if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi')
-        command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
-        command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
-        command.append('if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi')
-    outs = []
-    for i in range(len(dest_files)):
-        if dest_files[i] != "":
-            # If we have only one file to link we do not want to use the dest_dir, as
-            # $(@D) will include the full path to the file.
-            dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i]
-
-            # Copy the headers to create a sandboxable setup.
-            cmd = "cp -f"
-            command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
-            outs.append('        "' + dest_dir + dest_files[i] + '",')
-    genrule = _genrule(
-        src_dir,
-        genrule_name,
-        " && ".join(command),
-        "\n".join(outs),
-    )
-    return genrule
+  if src_dir != None:
+    src_dir = _norm_path(src_dir)
+    dest_dir = _norm_path(dest_dir)
+    files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
+
+    # Create a list with the src_dir stripped to use for outputs.
+    dest_files = files.replace(src_dir, "").splitlines()
+    src_files = files.splitlines()
+  command = []
+  if not _is_windows(repository_ctx):
+    # We clear folders that might have been generated previously to avoid
+    # undesired inclusions
+    command.append('if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi')
+    command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
+    command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
+    command.append('if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi')
+  outs = []
+  for i in range(len(dest_files)):
+    if dest_files[i] != "":
+      # If we have only one file to link we do not want to use the dest_dir, as
+      # $(@D) will include the full path to the file.
+      dest = "$(@D)/" + dest_dir + dest_files[i] if len(
+          dest_files) != 1 else "$(@D)/" + dest_files[i]
+
+      # Copy the headers to create a sandboxable setup.
+      cmd = "cp -f"
+      command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
+      outs.append('        "' + dest_dir + dest_files[i] + '",')
+  genrule = _genrule(
+      src_dir,
+      genrule_name,
+      " && ".join(command),
+      "\n".join(outs),
+  )
+  return genrule
+
 
 def _genrule(src_dir, genrule_name, command, outs):
-    """Returns a string with a genrule.
+  """Returns a string with a genrule.
 
     Genrule executes the given command and produces the given outputs.
     """
-    return (
-        "genrule(\n" +
-        '    name = "' +
-        genrule_name + '",\n' +
-        "    outs = [\n" +
-        outs +
-        "\n    ],\n" +
-        '    cmd = """\n' +
-        command +
-        '\n   """,\n' +
-        ")\n"
-    )
+  return (
+      "genrule(\n" + '    name = "' + genrule_name + '",\n' + "    outs = [\n" +
+      outs + "\n    ],\n" + '    cmd = """\n' + command + '\n   """,\n' + ")\n")
+
 
 def _read_dir(repository_ctx, src_dir):
-    """Returns a string with all files in a directory.
+  """Returns a string with all files in a directory.
 
     Finds all files inside a directory, traversing subfolders and following
     symlinks. The returned string contains the full path of all files
     separated by line breaks.
     """
-    if _is_windows(repository_ctx):
-        src_dir = src_dir.replace("/", "\\")
-        find_result = _execute(
-            repository_ctx,
-            ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
-            empty_stdout_fine = True,
-        )
+  if _is_windows(repository_ctx):
+    src_dir = src_dir.replace("/", "\\")
+    find_result = _execute(
+        repository_ctx,
+        ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
+        empty_stdout_fine=True,
+    )
+
+    # src_files will be used in genrule.outs where the paths must
+    # use forward slashes.
+    result = find_result.stdout.replace("\\", "/")
+  else:
+    find_result = _execute(
+        repository_ctx,
+        ["find", src_dir, "-follow", "-type", "f"],
+        empty_stdout_fine=True,
+    )
+    result = find_result.stdout
+  return result
 
-        # src_files will be used in genrule.outs where the paths must
-        # use forward slashes.
-        result = find_result.stdout.replace("\\", "/")
-    else:
-        find_result = _execute(
-            repository_ctx,
-            ["find", src_dir, "-follow", "-type", "f"],
-            empty_stdout_fine = True,
-        )
-        result = find_result.stdout
-    return result
 
 def _flag_enabled(repository_ctx, flag_name):
-    if flag_name in repository_ctx.os.environ:
-        value = repository_ctx.os.environ[flag_name].strip()
-        return value == "1"
-    return False
+  if flag_name in repository_ctx.os.environ:
+    value = repository_ctx.os.environ[flag_name].strip()
+    return value == "1"
+  return False
+
 
 def _use_cuda_clang(repository_ctx):
-    return _flag_enabled(repository_ctx, "TF_CUDA_CLANG")
+  return _flag_enabled(repository_ctx, "TF_CUDA_CLANG")
+
 
 def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
-    if _use_cuda_clang(repository_ctx):
-        capability_flags = ["--cuda-gpu-arch=sm_" +
-                            cap.replace(".", "") for cap in compute_capabilities]
-    else:
-        # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
-        capability_flags = []
-    return str(capability_flags)
+  if _use_cuda_clang(repository_ctx):
+    capability_flags = [
+        "--cuda-gpu-arch=sm_" + cap.replace(".", "")
+        for cap in compute_capabilities
+    ]
+  else:
+    # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
+    # TODO(csigg): Make this consistent with cuda clang and pass to crosstool.
+    capability_flags = []
+  return str(capability_flags)
+
 
 def _create_local_cuda_repository(repository_ctx):
-    """Creates the repository containing files set up to build with CUDA."""
-    cuda_config = _get_cuda_config(repository_ctx)
+  """Creates the repository containing files set up to build with CUDA."""
+  cuda_config = _get_cuda_config(repository_ctx)
 
-    cuda_include_path = _find_cuda_include_path(repository_ctx, cuda_config)
-    cudnn_header_dir = _find_cudnn_header_dir(
-        repository_ctx,
-        cuda_config.cudnn_install_basedir,
-    )
-    cupti_header_dir = _find_cupti_header_dir(repository_ctx, cuda_config)
-    nvvm_libdevice_dir = _find_nvvm_libdevice_dir(repository_ctx, cuda_config)
-
-    # Set up symbolic links for the cuda toolkit by creating genrules to do
-    # symlinking. We create one genrule for each directory we want to track under
-    # cuda_toolkit_path
-    cuda_toolkit_path = cuda_config.cuda_toolkit_path
-    genrules = [symlink_genrule_for_dir(
-        repository_ctx,
-        cuda_include_path,
-        "cuda/include",
-        "cuda-include",
-    )]
-    genrules.append(symlink_genrule_for_dir(
-        repository_ctx,
-        nvvm_libdevice_dir,
-        "cuda/nvvm/libdevice",
-        "cuda-nvvm",
-    ))
-    genrules.append(symlink_genrule_for_dir(
-        repository_ctx,
-        cupti_header_dir,
-        "cuda/extras/CUPTI/include",
-        "cuda-extras",
-    ))
-
-    cuda_libs = _find_libs(repository_ctx, cuda_config)
-    cuda_lib_src = []
-    cuda_lib_dest = []
-    for lib in cuda_libs.values():
-        cuda_lib_src.append(lib.path)
-        cuda_lib_dest.append("cuda/lib/" + lib.file_name)
-    genrules.append(symlink_genrule_for_dir(
-        repository_ctx,
-        None,
-        "",
-        "cuda-lib",
-        cuda_lib_src,
-        cuda_lib_dest,
-    ))
-
-    # Set up the symbolic links for cudnn if cndnn was not installed to
-    # CUDA_TOOLKIT_PATH.
-    included_files = _read_dir(repository_ctx, cuda_include_path).replace(
-        cuda_include_path,
-        "",
-    ).splitlines()
-    if "/cudnn.h" not in included_files:
-        genrules.append(symlink_genrule_for_dir(
+  cuda_include_path = _find_cuda_include_path(repository_ctx, cuda_config)
+  cudnn_header_dir = _find_cudnn_header_dir(
+      repository_ctx,
+      cuda_config.cudnn_install_basedir,
+  )
+  cupti_header_dir = _find_cupti_header_dir(repository_ctx, cuda_config)
+  nvvm_libdevice_dir = _find_nvvm_libdevice_dir(repository_ctx, cuda_config)
+
+  # Set up symbolic links for the cuda toolkit by creating genrules to do
+  # symlinking. We create one genrule for each directory we want to track under
+  # cuda_toolkit_path
+  cuda_toolkit_path = cuda_config.cuda_toolkit_path
+  genrules = [
+      symlink_genrule_for_dir(
+          repository_ctx,
+          cuda_include_path,
+          "cuda/include",
+          "cuda-include",
+      )
+  ]
+  genrules.append(
+      symlink_genrule_for_dir(
+          repository_ctx,
+          nvvm_libdevice_dir,
+          "cuda/nvvm/libdevice",
+          "cuda-nvvm",
+      ))
+  genrules.append(
+      symlink_genrule_for_dir(
+          repository_ctx,
+          cupti_header_dir,
+          "cuda/extras/CUPTI/include",
+          "cuda-extras",
+      ))
+
+  cuda_libs = _find_libs(repository_ctx, cuda_config)
+  cuda_lib_src = []
+  cuda_lib_dest = []
+  for lib in cuda_libs.values():
+    cuda_lib_src.append(lib.path)
+    cuda_lib_dest.append("cuda/lib/" + lib.file_name)
+  genrules.append(
+      symlink_genrule_for_dir(
+          repository_ctx,
+          None,
+          "",
+          "cuda-lib",
+          cuda_lib_src,
+          cuda_lib_dest,
+      ))
+
+  # Set up the symbolic links for cudnn if cndnn was not installed to
+  # CUDA_TOOLKIT_PATH.
+  included_files = _read_dir(repository_ctx, cuda_include_path).replace(
+      cuda_include_path,
+      "",
+  ).splitlines()
+  if "/cudnn.h" not in included_files:
+    genrules.append(
+        symlink_genrule_for_dir(
             repository_ctx,
             None,
             "cuda/include/",
@@ -1247,204 +1331,238 @@ def _create_local_cuda_repository(repository_ctx):
             [cudnn_header_dir + "/cudnn.h"],
             ["cudnn.h"],
         ))
-    else:
-        genrules.append(
-            "filegroup(\n" +
-            '    name = "cudnn-include",\n' +
-            "    srcs = [],\n" +
-            ")\n",
-        )
-
-    # Set up BUILD file for cuda/
-    _tpl(
-        repository_ctx,
-        "cuda:build_defs.bzl",
-        {
-            "%{cuda_is_configured}": "True",
-            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
-                repository_ctx,
-                cuda_config.compute_capabilities,
-            ),
-        },
-    )
-    _tpl(
-        repository_ctx,
-        "cuda:BUILD.windows" if _is_windows(repository_ctx) else "cuda:BUILD",
-        {
-            "%{cuda_driver_lib}": cuda_libs["cuda"].file_name,
-            "%{cudart_static_lib}": cuda_libs["cudart_static"].file_name,
-            "%{cudart_static_linkopt}": _cudart_static_linkopt(
-                cuda_config.cpu_value,
-            ),
-            "%{cudart_lib}": cuda_libs["cudart"].file_name,
-            "%{cublas_lib}": cuda_libs["cublas"].file_name,
-            "%{cusolver_lib}": cuda_libs["cusolver"].file_name,
-            "%{cudnn_lib}": cuda_libs["cudnn"].file_name,
-            "%{cufft_lib}": cuda_libs["cufft"].file_name,
-            "%{curand_lib}": cuda_libs["curand"].file_name,
-            "%{cupti_lib}": cuda_libs["cupti"].file_name,
-            "%{cuda_include_genrules}": "\n".join(genrules),
-            "%{cuda_headers}": ('":cuda-include",\n' +
-                                '        ":cudnn-include",'),
-        },
-        "cuda/BUILD",
-    )
-
-    is_cuda_clang = _use_cuda_clang(repository_ctx)
+  else:
+    genrules.append(
+        "filegroup(\n" + '    name = "cudnn-include",\n' + "    srcs = [],\n" +
+        ")\n",)
+
+  # Set up BUILD file for cuda/
+  _tpl(
+      repository_ctx,
+      "cuda:build_defs.bzl",
+      {
+          "%{cuda_is_configured}":
+              "True",
+          "%{cuda_extra_copts}":
+              _compute_cuda_extra_copts(
+                  repository_ctx,
+                  cuda_config.compute_capabilities,
+              ),
+      },
+  )
+  _tpl(
+      repository_ctx,
+      "cuda:BUILD.windows" if _is_windows(repository_ctx) else "cuda:BUILD",
+      {
+          "%{cuda_driver_lib}":
+              cuda_libs["cuda"].file_name,
+          "%{cudart_static_lib}":
+              cuda_libs["cudart_static"].file_name,
+          "%{cudart_static_linkopt}":
+              _cudart_static_linkopt(cuda_config.cpu_value,),
+          "%{cudart_lib}":
+              cuda_libs["cudart"].file_name,
+          "%{cublas_lib}":
+              cuda_libs["cublas"].file_name,
+          "%{cusolver_lib}":
+              cuda_libs["cusolver"].file_name,
+          "%{cudnn_lib}":
+              cuda_libs["cudnn"].file_name,
+          "%{cufft_lib}":
+              cuda_libs["cufft"].file_name,
+          "%{curand_lib}":
+              cuda_libs["curand"].file_name,
+          "%{cupti_lib}":
+              cuda_libs["cupti"].file_name,
+          "%{cuda_include_genrules}":
+              "\n".join(genrules),
+          "%{cuda_headers}": ('":cuda-include",\n' + '        ":cudnn-include",'
+                             ),
+      },
+      "cuda/BUILD",
+  )
 
-    should_download_clang = is_cuda_clang and _flag_enabled(
-        repository_ctx,
-        _TF_DOWNLOAD_CLANG,
-    )
-    if should_download_clang:
-        download_clang(repository_ctx, "crosstool/extra_tools")
-
-    # Set up crosstool/
-    cc = find_cc(repository_ctx)
-    cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
-
-    host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
-    cuda_defines = {}
-    # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see
-    # https://github.com/bazelbuild/bazel/issues/760).
-    # However, this stops our custom clang toolchain from picking the provided
-    # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded
-    # toolchain.
-    # TODO: when bazel stops adding '-B/usr/bin' by default, remove this
-    #       flag from the CROSSTOOL completely (see
-    #       https://github.com/bazelbuild/bazel/issues/5634)
-    if should_download_clang:
-      cuda_defines["%{linker_bin_path_flag}"] = ""
-    else:
-      cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"'
+  is_cuda_clang = _use_cuda_clang(repository_ctx)
 
-    if is_cuda_clang:
-        cuda_defines["%{host_compiler_path}"] = str(cc)
-        cuda_defines["%{host_compiler_warnings}"] = """
+  should_download_clang = is_cuda_clang and _flag_enabled(
+      repository_ctx,
+      _TF_DOWNLOAD_CLANG,
+  )
+  if should_download_clang:
+    download_clang(repository_ctx, "crosstool/extra_tools")
+
+  # Set up crosstool/
+  cc = find_cc(repository_ctx)
+  cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
+
+  host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
+  cuda_defines = {}
+  # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see
+  # https://github.com/bazelbuild/bazel/issues/760).
+  # However, this stops our custom clang toolchain from picking the provided
+  # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded
+  # toolchain.
+  # TODO: when bazel stops adding '-B/usr/bin' by default, remove this
+  #       flag from the CROSSTOOL completely (see
+  #       https://github.com/bazelbuild/bazel/issues/5634)
+  if should_download_clang:
+    cuda_defines["%{linker_bin_path_flag}"] = ""
+  else:
+    cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"'
+
+  if is_cuda_clang:
+    cuda_defines["%{host_compiler_path}"] = str(cc)
+    cuda_defines["%{host_compiler_warnings}"] = """
         # Some parts of the codebase set -Werror and hit this warning, so
         # switch it off for now.
         flag: "-Wno-invalid-partial-specialization"
     """
-        cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
-        _tpl(repository_ctx, "crosstool:BUILD", {"%{linker_files}": ":empty", "%{win_linker_files}": ":empty"})
-        repository_ctx.file("crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "")
-        repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "")
-        repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.bat", "")
-    else:
-        cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
-        cuda_defines["%{host_compiler_warnings}"] = ""
-
-        # nvcc has the system include paths built in and will automatically
-        # search them; we cannot work around that, so we add the relevant cuda
-        # system paths to the allowed compiler specific include paths.
-        cuda_defines["%{host_compiler_includes}"] = (
-            host_compiler_includes + "\n" +
-            _cuda_include_path(repository_ctx, cuda_config) +
-            "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
-            "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir)
-        nvcc_path = str(repository_ctx.path("%s/bin/nvcc%s" %
-                                            (
-                                                cuda_config.cuda_toolkit_path,
-                                                ".exe" if _is_windows(repository_ctx) else "",
-                                            )))
-        _tpl(
-            repository_ctx,
-            "crosstool:BUILD",
-            {
-                "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc",
-                "%{win_linker_files}": ":windows_msvc_wrapper_files",
-            },
-        )
-        wrapper_defines = {
-            "%{cpu_compiler}": str(cc),
-            "%{cuda_version}": cuda_config.cuda_version,
-            "%{nvcc_path}": nvcc_path,
-            "%{gcc_host_compiler_path}": str(cc),
-            "%{cuda_compute_capabilities}": ", ".join(
-                ["\"%s\"" % c for c in cuda_config.compute_capabilities],
-            ),
-            "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx),
-        }
-        _tpl(
-            repository_ctx,
-            "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
-            wrapper_defines,
-        )
-        _tpl(
-            repository_ctx,
-            "crosstool:windows/msvc_wrapper_for_nvcc.py",
-            wrapper_defines,
-        )
-        _tpl(
-            repository_ctx,
-            "crosstool:windows/msvc_wrapper_for_nvcc.bat",
-            {
-                "%{python_binary}": _get_python_bin(repository_ctx),
-            },
-        )
-
+    cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
+    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ""
+    _tpl(repository_ctx, "crosstool:BUILD", {
+        "%{linker_files}": ":empty",
+        "%{win_linker_files}": ":empty"
+    })
+    repository_ctx.file(
+        "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "")
+    repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "")
+    repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.bat", "")
+  else:
+    cuda_defines[
+        "%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
+    cuda_defines["%{host_compiler_warnings}"] = ""
+
+    # nvcc has the system include paths built in and will automatically
+    # search them; we cannot work around that, so we add the relevant cuda
+    # system paths to the allowed compiler specific include paths.
+    cuda_defines["%{host_compiler_includes}"] = (
+        host_compiler_includes + "\n" + _cuda_include_path(
+            repository_ctx, cuda_config) +
+        "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
+        "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir)
+
+    # For gcc, do not canonicalize system header paths; some versions of gcc
+    # pick the shortest possible path for system includes when creating the
+    # .d file - given that includes that are prefixed with "../" multiple
+    # time quickly grow longer than the root of the tree, this can lead to
+    # bazel's header check failing.
+    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = (
+        "flag: \"-fno-canonical-system-headers\"")
+    nvcc_path = str(
+        repository_ctx.path("%s/bin/nvcc%s" % (
+            cuda_config.cuda_toolkit_path,
+            ".exe" if _is_windows(repository_ctx) else "",
+        )))
     _tpl(
         repository_ctx,
-        "crosstool:CROSSTOOL",
-        cuda_defines + _get_win_cuda_defines(repository_ctx),
-        out = "crosstool/CROSSTOOL",
+        "crosstool:BUILD",
+        {
+            "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc",
+            "%{win_linker_files}": ":windows_msvc_wrapper_files",
+        },
     )
-
-    # Set up cuda_config.h, which is used by
-    # tensorflow/stream_executor/dso_loader.cc.
+    wrapper_defines = {
+        "%{cpu_compiler}":
+            str(cc),
+        "%{cuda_version}":
+            cuda_config.cuda_version,
+        "%{nvcc_path}":
+            nvcc_path,
+        "%{gcc_host_compiler_path}":
+            str(cc),
+        "%{cuda_compute_capabilities}":
+            ", ".join(
+                ["\"%s\"" % c for c in cuda_config.compute_capabilities],),
+        "%{nvcc_tmp_dir}":
+            _get_nvcc_tmp_dir_for_windows(repository_ctx),
+    }
     _tpl(
         repository_ctx,
-        "cuda:cuda_config.h",
-        {
-            "%{cuda_version}": cuda_config.cuda_version,
-            "%{cudnn_version}": cuda_config.cudnn_version,
-            "%{cuda_compute_capabilities}": ",".join(
-                [
-                    "CudaVersion(\"%s\")" % c
-                    for c in cuda_config.compute_capabilities
-                ],
-            ),
-            "%{cuda_toolkit_path}": cuda_config.cuda_toolkit_path,
-        },
-        "cuda/cuda/cuda_config.h",
+        "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
+        wrapper_defines,
     )
-
-def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
-    """Creates pointers to a remotely configured repo set up to build with CUDA."""
     _tpl(
         repository_ctx,
-        "cuda:build_defs.bzl",
-        {
-            "%{cuda_is_configured}": "True",
-            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
-                repository_ctx,
-                _compute_capabilities(repository_ctx),
-            ),
-        },
+        "crosstool:windows/msvc_wrapper_for_nvcc.py",
+        wrapper_defines,
     )
     _tpl(
         repository_ctx,
-        "cuda:remote.BUILD",
+        "crosstool:windows/msvc_wrapper_for_nvcc.bat",
         {
-            "%{remote_cuda_repo}": remote_config_repo,
+            "%{python_binary}": _get_python_bin(repository_ctx),
         },
-        "cuda/BUILD",
     )
-    _tpl(repository_ctx, "crosstool:remote.BUILD", {
-        "%{remote_cuda_repo}": remote_config_repo,
-    }, "crosstool/BUILD")
+
+  _tpl(
+      repository_ctx,
+      "crosstool:CROSSTOOL",
+      cuda_defines + _get_win_cuda_defines(repository_ctx),
+      out="crosstool/CROSSTOOL",
+  )
+
+  # Set up cuda_config.h, which is used by
+  # tensorflow/stream_executor/dso_loader.cc.
+  _tpl(
+      repository_ctx,
+      "cuda:cuda_config.h",
+      {
+          "%{cuda_version}":
+              cuda_config.cuda_version,
+          "%{cudnn_version}":
+              cuda_config.cudnn_version,
+          "%{cuda_compute_capabilities}":
+              ",".join([
+                  "CudaVersion(\"%s\")" % c
+                  for c in cuda_config.compute_capabilities
+              ],),
+          "%{cuda_toolkit_path}":
+              cuda_config.cuda_toolkit_path,
+      },
+      "cuda/cuda/cuda_config.h",
+  )
+
+
+def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
+  """Creates pointers to a remotely configured repo set up to build with CUDA."""
+  _tpl(
+      repository_ctx,
+      "cuda:build_defs.bzl",
+      {
+          "%{cuda_is_configured}":
+              "True",
+          "%{cuda_extra_copts}":
+              _compute_cuda_extra_copts(
+                  repository_ctx,
+                  compute_capabilities(repository_ctx),
+              ),
+      },
+  )
+  _tpl(
+      repository_ctx,
+      "cuda:remote.BUILD",
+      {
+          "%{remote_cuda_repo}": remote_config_repo,
+      },
+      "cuda/BUILD",
+  )
+  _tpl(repository_ctx, "crosstool:remote.BUILD", {
+      "%{remote_cuda_repo}": remote_config_repo,
+  }, "crosstool/BUILD")
+
 
 def _cuda_autoconf_impl(repository_ctx):
-    """Implementation of the cuda_autoconf repository rule."""
-    if not _enable_cuda(repository_ctx):
-        _create_dummy_repository(repository_ctx)
-    elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
-        _create_remote_cuda_repository(
-            repository_ctx,
-            repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO],
-        )
-    else:
-        _create_local_cuda_repository(repository_ctx)
+  """Implementation of the cuda_autoconf repository rule."""
+  if not _enable_cuda(repository_ctx):
+    _create_dummy_repository(repository_ctx)
+  elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
+    _create_remote_cuda_repository(
+        repository_ctx,
+        repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO],
+    )
+  else:
+    _create_local_cuda_repository(repository_ctx)
+
 
 cuda_configure = repository_rule(
     implementation = _cuda_autoconf_impl,
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 9108639b0bf74ab4b14468d77a0570ff8913f107..6df6799bd7696d5dbcc70345bf7b5e19f709b8d4 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -105,7 +105,7 @@ def get_cxx_inc_directories(repository_ctx, cc):
     return includes_cpp + [
         inc
         for inc in includes_c
-        if inc not in includes_cpp_set
+        if inc not in includes_cpp_set.to_list()
     ]
 
 def auto_configure_fail(msg):
diff --git a/third_party/highwayhash.BUILD b/third_party/highwayhash.BUILD
deleted file mode 100644
index 08cb84ea2c8f20689cbf61ca719c092976439241..0000000000000000000000000000000000000000
--- a/third_party/highwayhash.BUILD
+++ /dev/null
@@ -1,33 +0,0 @@
-# Description:
-#   SipHash and HighwayHash: cryptographically-strong pseudorandom functions
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-cc_library(
-    name = "sip_hash",
-    srcs = ["highwayhash/sip_hash.cc"],
-    hdrs = [
-        "highwayhash/sip_hash.h",
-        "highwayhash/endianess.h",
-        "highwayhash/state_helpers.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":arch_specific",
-        ":compiler_specific",
-    ],
-)
-
-cc_library(
-    name = "arch_specific",
-    srcs = ["highwayhash/arch_specific.cc"],
-    hdrs = ["highwayhash/arch_specific.h"],
-    deps = [":compiler_specific"],
-)
-
-cc_library(
-    name = "compiler_specific",
-    hdrs = ["highwayhash/compiler_specific.h"],
-)
diff --git a/third_party/highwayhash/BUILD b/third_party/highwayhash/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2f5d02becb930602574c4df02c51cec7662bc25d
--- /dev/null
+++ b/third_party/highwayhash/BUILD
@@ -0,0 +1 @@
+# Dummy BUILD file to make this directory a package.
diff --git a/third_party/highwayhash/BUILD.bazel b/third_party/highwayhash/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..39b148bc000cdccad7098c0b4b72a39d2ceb797a
--- /dev/null
+++ b/third_party/highwayhash/BUILD.bazel
@@ -0,0 +1,33 @@
+# Description:
+#   SipHash and HighwayHash: cryptographically-strong pseudorandom functions
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "sip_hash",
+    srcs = ["highwayhash/sip_hash.cc"],
+    hdrs = [
+        "highwayhash/endianess.h",
+        "highwayhash/sip_hash.h",
+        "highwayhash/state_helpers.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":arch_specific",
+        ":compiler_specific",
+    ],
+)
+
+cc_library(
+    name = "arch_specific",
+    srcs = ["highwayhash/arch_specific.cc"],
+    hdrs = ["highwayhash/arch_specific.h"],
+    deps = [":compiler_specific"],
+)
+
+cc_library(
+    name = "compiler_specific",
+    hdrs = ["highwayhash/compiler_specific.h"],
+)
diff --git a/third_party/highwayhash/workspace.bzl b/third_party/highwayhash/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..793297b9ba0979d70f4e79db4df85e90cec6a2f2
--- /dev/null
+++ b/third_party/highwayhash/workspace.bzl
@@ -0,0 +1,15 @@
+"""loads the highwayhash library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "highwayhash",
+        urls = [
+            "http://mirror.bazel.build/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
+            "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz",
+        ],
+        sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37",
+        strip_prefix = "highwayhash-fd3d9af80465e4383162e4a7c5e2f406e82dd968",
+        build_file = "//third_party/highwayhash:BUILD.bazel",
+    )
diff --git a/third_party/icu/BUILD.system b/third_party/icu/BUILD.system
new file mode 100644
index 0000000000000000000000000000000000000000..328e412a8c29f6f7c2f5ecc5b6e8bbec7613972c
--- /dev/null
+++ b/third_party/icu/BUILD.system
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "icu4c/LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+filegroup(
+    name = "icu4j/main/shared/licenses/LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "common",
+    deps = [
+        ":icuuc",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "icuuc",
+    linkopts = ["-licuuc"],
+    visibility = ["//visibility:private"],
+)
diff --git a/third_party/icu/data/BUILD.bazel b/third_party/icu/data/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..7db21566e4e65960d59caa9584c944ef8375bd7e
--- /dev/null
+++ b/third_party/icu/data/BUILD.bazel
@@ -0,0 +1,46 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Data for core MIME/Unix/Windows encodings:
+# ISO 8859-2..9, 15; Windows-125x; EUC-CN; GBK (Windows cp936); GB 18030;
+# Big5 (Windows cp950); SJIS (Windows cp932); EUC-JP; EUC-KR, KS C 5601;
+# Windows cp949. Data is pre-processed for little-endian platforms. To replicate
+# this pre-processing (if you want additional encodings, for example), do the
+# following:
+#
+# First, download, build, and install ICU. This installs tools such as makeconv.
+# Then, run the following from your icu4c/source directory:
+#   $ cd data/mappings
+#   $ rm *.cnv  # there shouldn't be any .cnv files here to begin with
+#   $ grep \.ucm ucmcore.mk | \
+#     sed 's/\(UCM_SOURCE_CORE=\)\?\([^ ]\+\.ucm\)\\\?/\2/g' | \
+#     tr '\n' ' ' | xargs makeconv
+#   $ ls *.cnv > filelist.lst
+#   $ pkgdata -m common -p ucmcore filelist.lst
+#   $ genccode -f custom_conversion_data ucmcore.dat
+# This creates custom_conversion_data.c. You will need to change the target
+# :conversion_data to depend on your custom source instead of :conversion_data.c
+filegroup(
+    name = "conversion_files",
+    srcs = glob(["icu_conversion_data.c.gz.*"]),
+)
+
+# Data files are compressed and split to work around git performance degradation
+# around large files.
+genrule(
+    name = "merge_conversion_data",
+    srcs = [":conversion_files"],
+    outs = ["conversion_data.c"],
+    cmd = "cat $(locations :conversion_files) | gunzip > $@",
+)
+
+cc_library(
+    name = "conversion_data",
+    srcs = [":conversion_data.c"],
+    deps = ["@icu//:headers"],
+)
diff --git a/third_party/icu/data/LICENSE b/third_party/icu/data/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..25b6eb9d3415e62e99af6a349362349c091bc6c7
--- /dev/null
+++ b/third_party/icu/data/LICENSE
@@ -0,0 +1,414 @@
+COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
+
+Copyright © 1991-2018 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+
+---------------------
+
+Third-Party Software Licenses
+
+This section contains third-party software notices and/or additional
+terms for licensed third-party software components included within ICU
+libraries.
+
+1. ICU License - ICU 1.8.1 to ICU 57.1
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2016 International Business Machines Corporation and others
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies of
+the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
+SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale, use
+or other dealings in this Software without prior written authorization
+of the copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the
+property of their respective owners.
+
+2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
+
+ #     The Google Chrome software developed by Google is licensed under
+ # the BSD license. Other software included in this distribution is
+ # provided under other licenses, as set forth below.
+ #
+ #  The BSD License
+ #  http://opensource.org/licenses/bsd-license.php
+ #  Copyright (C) 2006-2008, Google Inc.
+ #
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ #  Redistributions of source code must retain the above copyright notice,
+ # this list of conditions and the following disclaimer.
+ #  Redistributions in binary form must reproduce the above
+ # copyright notice, this list of conditions and the following
+ # disclaimer in the documentation and/or other materials provided with
+ # the distribution.
+ #  Neither the name of  Google Inc. nor the names of its
+ # contributors may be used to endorse or promote products derived from
+ # this software without specific prior written permission.
+ #
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ #
+ #  The word list in cjdict.txt are generated by combining three word lists
+ # listed below with further processing for compound word breaking. The
+ # frequency is generated with an iterative training against Google web
+ # corpora.
+ #
+ #  * Libtabe (Chinese)
+ #    - https://sourceforge.net/project/?group_id=1519
+ #    - Its license terms and conditions are shown below.
+ #
+ #  * IPADIC (Japanese)
+ #    - http://chasen.aist-nara.ac.jp/chasen/distribution.html
+ #    - Its license terms and conditions are shown below.
+ #
+ #  ---------COPYING.libtabe ---- BEGIN--------------------
+ #
+ #  /*
+ #   * Copyright (c) 1999 TaBE Project.
+ #   * Copyright (c) 1999 Pai-Hsiang Hsiao.
+ #   * All rights reserved.
+ #   *
+ #   * Redistribution and use in source and binary forms, with or without
+ #   * modification, are permitted provided that the following conditions
+ #   * are met:
+ #   *
+ #   * . Redistributions of source code must retain the above copyright
+ #   *   notice, this list of conditions and the following disclaimer.
+ #   * . Redistributions in binary form must reproduce the above copyright
+ #   *   notice, this list of conditions and the following disclaimer in
+ #   *   the documentation and/or other materials provided with the
+ #   *   distribution.
+ #   * . Neither the name of the TaBE Project nor the names of its
+ #   *   contributors may be used to endorse or promote products derived
+ #   *   from this software without specific prior written permission.
+ #   *
+ #   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ #   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ #   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ #   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ #   * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ #   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ #   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ #   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ #   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ #   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ #   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ #   * OF THE POSSIBILITY OF SUCH DAMAGE.
+ #   */
+ #
+ #  /*
+ #   * Copyright (c) 1999 Computer Systems and Communication Lab,
+ #   *                    Institute of Information Science, Academia
+ #       *                    Sinica. All rights reserved.
+ #   *
+ #   * Redistribution and use in source and binary forms, with or without
+ #   * modification, are permitted provided that the following conditions
+ #   * are met:
+ #   *
+ #   * . Redistributions of source code must retain the above copyright
+ #   *   notice, this list of conditions and the following disclaimer.
+ #   * . Redistributions in binary form must reproduce the above copyright
+ #   *   notice, this list of conditions and the following disclaimer in
+ #   *   the documentation and/or other materials provided with the
+ #   *   distribution.
+ #   * . Neither the name of the Computer Systems and Communication Lab
+ #   *   nor the names of its contributors may be used to endorse or
+ #   *   promote products derived from this software without specific
+ #   *   prior written permission.
+ #   *
+ #   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ #   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ #   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ #   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ #   * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ #   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ #   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ #   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ #   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ #   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ #   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ #   * OF THE POSSIBILITY OF SUCH DAMAGE.
+ #   */
+ #
+ #  Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
+ #      University of Illinois
+ #  c-tsai4@uiuc.edu  http://casper.beckman.uiuc.edu/~c-tsai4
+ #
+ #  ---------------COPYING.libtabe-----END--------------------------------
+ #
+ #
+ #  ---------------COPYING.ipadic-----BEGIN-------------------------------
+ #
+ #  Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
+ #  and Technology.  All Rights Reserved.
+ #
+ #  Use, reproduction, and distribution of this software is permitted.
+ #  Any copy of this software, whether in its original form or modified,
+ #  must include both the above copyright notice and the following
+ #  paragraphs.
+ #
+ #  Nara Institute of Science and Technology (NAIST),
+ #  the copyright holders, disclaims all warranties with regard to this
+ #  software, including all implied warranties of merchantability and
+ #  fitness, in no event shall NAIST be liable for
+ #  any special, indirect or consequential damages or any damages
+ #  whatsoever resulting from loss of use, data or profits, whether in an
+ #  action of contract, negligence or other tortuous action, arising out
+ #  of or in connection with the use or performance of this software.
+ #
+ #  A large portion of the dictionary entries
+ #  originate from ICOT Free Software.  The following conditions for ICOT
+ #  Free Software applies to the current dictionary as well.
+ #
+ #  Each User may also freely distribute the Program, whether in its
+ #  original form or modified, to any third party or parties, PROVIDED
+ #  that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+ #  on, or be attached to, the Program, which is distributed substantially
+ #  in the same form as set out herein and that such intended
+ #  distribution, if actually made, will neither violate or otherwise
+ #  contravene any of the laws and regulations of the countries having
+ #  jurisdiction over the User or the intended distribution itself.
+ #
+ #  NO WARRANTY
+ #
+ #  The program was produced on an experimental basis in the course of the
+ #  research and development conducted during the project and is provided
+ #  to users as so produced on an experimental basis.  Accordingly, the
+ #  program is provided without any warranty whatsoever, whether express,
+ #  implied, statutory or otherwise.  The term "warranty" used herein
+ #  includes, but is not limited to, any warranty of the quality,
+ #  performance, merchantability and fitness for a particular purpose of
+ #  the program and the nonexistence of any infringement or violation of
+ #  any right of any third party.
+ #
+ #  Each user of the program will agree and understand, and be deemed to
+ #  have agreed and understood, that there is no warranty whatsoever for
+ #  the program and, accordingly, the entire risk arising from or
+ #  otherwise connected with the program is assumed by the user.
+ #
+ #  Therefore, neither ICOT, the copyright holder, or any other
+ #  organization that participated in or was otherwise related to the
+ #  development of the program and their respective officials, directors,
+ #  officers and other employees shall be held liable for any and all
+ #  damages, including, without limitation, general, special, incidental
+ #  and consequential damages, arising out of or otherwise in connection
+ #  with the use or inability to use the program or any product, material
+ #  or result produced or otherwise obtained by using the program,
+ #  regardless of whether they have been advised of, or otherwise had
+ #  knowledge of, the possibility of such damages at any time during the
+ #  project or thereafter.  Each user will be deemed to have agreed to the
+ #  foregoing by his or her commencement of use of the program.  The term
+ #  "use" as used herein includes, but is not limited to, the use,
+ #  modification, copying and distribution of the program and the
+ #  production of secondary products from the program.
+ #
+ #  In the case where the program, whether in its original form or
+ #  modified, was distributed or delivered to or received by a user from
+ #  any person, organization or entity other than ICOT, unless it makes or
+ #  grants independently of ICOT any specific warranty to the user in
+ #  writing, such person, organization or entity, will also be exempted
+ #  from and not be held liable to the user for any such damages as noted
+ #  above as far as the program is concerned.
+ #
+ #  ---------------COPYING.ipadic-----END----------------------------------
+
+3. Lao Word Break Dictionary Data (laodict.txt)
+
+ #  Copyright (c) 2013 International Business Machines Corporation
+ #  and others. All Rights Reserved.
+ #
+ # Project: http://code.google.com/p/lao-dictionary/
+ # Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
+ # License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
+ #              (copied below)
+ #
+ #  This file is derived from the above dictionary, with slight
+ #  modifications.
+ #  ----------------------------------------------------------------------
+ #  Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification,
+ #  are permitted provided that the following conditions are met:
+ #
+ #
+ # Redistributions of source code must retain the above copyright notice, this
+ #  list of conditions and the following disclaimer. Redistributions in
+ #  binary form must reproduce the above copyright notice, this list of
+ #  conditions and the following disclaimer in the documentation and/or
+ #  other materials provided with the distribution.
+ #
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # OF THE POSSIBILITY OF SUCH DAMAGE.
+ #  --------------------------------------------------------------------------
+
+4. Burmese Word Break Dictionary Data (burmesedict.txt)
+
+ #  Copyright (c) 2014 International Business Machines Corporation
+ #  and others. All Rights Reserved.
+ #
+ #  This list is part of a project hosted at:
+ #    github.com/kanyawtech/myanmar-karen-word-lists
+ #
+ #  --------------------------------------------------------------------------
+ #  Copyright (c) 2013, LeRoy Benjamin Sharon
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification, are permitted provided that the following conditions
+ #  are met: Redistributions of source code must retain the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer.  Redistributions in binary form must reproduce the
+ #  above copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #    Neither the name Myanmar Karen Word Lists, nor the names of its
+ #    contributors may be used to endorse or promote products derived
+ #    from this software without specific prior written permission.
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ #  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ #  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ #  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ #  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ #  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ #  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ #  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ #  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ #  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ #  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ #  THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ #  SUCH DAMAGE.
+ #  --------------------------------------------------------------------------
+
+5. Time Zone Database
+
+  ICU uses the public domain data and code derived from Time Zone
+Database for its time zone support. The ownership of the TZ database
+is explained in BCP 175: Procedure for Maintaining the Time Zone
+Database section 7.
+
+ # 7.  Database Ownership
+ #
+ #    The TZ database itself is not an IETF Contribution or an IETF
+ #    document.  Rather it is a pre-existing and regularly updated work
+ #    that is in the public domain, and is intended to remain in the
+ #    public domain.  Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
+ #    not apply to the TZ Database or contributions that individuals make
+ #    to it.  Should any claims be made and substantiated against the TZ
+ #    Database, the organization that is providing the IANA
+ #    Considerations defined in this RFC, under the memorandum of
+ #    understanding with the IETF, currently ICANN, may act in accordance
+ #    with all competent court orders.  No ownership claims will be made
+ #    by ICANN or the IETF Trust on the database or the code.  Any person
+ #    making a contribution to the database or code waives all rights to
+ #    future claims in that contribution or in the TZ Database.
+
+6. Google double-conversion
+
+Copyright 2006-2011, the V8 project authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+    * Neither the name of Google Inc. nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.aa b/third_party/icu/data/icu_conversion_data.c.gz.aa
new file mode 100644
index 0000000000000000000000000000000000000000..b68a2c6516f8183e805c509a9139cf63d1ee3fa5
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.aa differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ab b/third_party/icu/data/icu_conversion_data.c.gz.ab
new file mode 100644
index 0000000000000000000000000000000000000000..d60aa92d675c85f95e811221bffc012d65e6c29e
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ab differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ac b/third_party/icu/data/icu_conversion_data.c.gz.ac
new file mode 100644
index 0000000000000000000000000000000000000000..de9b69ff9474e0c9ccc799d40d092d2ab2ad98bb
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ac differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ad b/third_party/icu/data/icu_conversion_data.c.gz.ad
new file mode 100644
index 0000000000000000000000000000000000000000..d5abb06b8ca21e1e6116ef1732c661c815b1489a
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ad differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ae b/third_party/icu/data/icu_conversion_data.c.gz.ae
new file mode 100644
index 0000000000000000000000000000000000000000..0e54fdb9eaffd814477460f71bc194104c1b247d
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ae differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.af b/third_party/icu/data/icu_conversion_data.c.gz.af
new file mode 100644
index 0000000000000000000000000000000000000000..cfbeb165ad3428555276a463a90a1ed2e34740f0
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.af differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ag b/third_party/icu/data/icu_conversion_data.c.gz.ag
new file mode 100644
index 0000000000000000000000000000000000000000..bde20b6da6253d866f87fcadc7e6c3571bd64d44
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ag differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ah b/third_party/icu/data/icu_conversion_data.c.gz.ah
new file mode 100644
index 0000000000000000000000000000000000000000..ae31dffbe2afc8ad59ae1dc323447d8cf9d61032
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ah differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ai b/third_party/icu/data/icu_conversion_data.c.gz.ai
new file mode 100644
index 0000000000000000000000000000000000000000..981b869561a615f21639482929b89d2b2e5ca360
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ai differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.aj b/third_party/icu/data/icu_conversion_data.c.gz.aj
new file mode 100644
index 0000000000000000000000000000000000000000..1ae6bce382a05570b46217e1a031414515439a42
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.aj differ
diff --git a/third_party/icu/udata.patch b/third_party/icu/udata.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d6d59100e48b8346fcaa54f0cbdebdc5e4658f92
--- /dev/null
+++ b/third_party/icu/udata.patch
@@ -0,0 +1,53 @@
+--- /icu4c/source/common/udata.cpp.old	2018-06-19 22:34:56.000000000 -0700
++++ /icu4c/source/common/udata.cpp	2018-10-19 14:26:09.778950855 -0700
+@@ -18,15 +18,15 @@
+ 
+ #include "unicode/utypes.h"  /* U_PLATFORM etc. */
+ 
+-#ifdef __GNUC__
+-/* if gcc
+-#define ATTRIBUTE_WEAK __attribute__ ((weak))
+-might have to #include some other header
+-*/
++#if defined(__GNUC__) || defined(__SUNPRO_CC)
++#  define ATTRIBUTE_WEAK __attribute__ ((weak))
++#else
++#  define ATTRIBUTE_WEAK
+ #endif
+ 
+ #include "unicode/putil.h"
+ #include "unicode/udata.h"
++#include "unicode/umachine.h"
+ #include "unicode/uversion.h"
+ #include "charstr.h"
+ #include "cmemory.h"
+@@ -641,10 +641,11 @@
+  * partial-data-library access functions where each returns a pointer
+  * to its data package, if it is linked in.
+  */
+-/*
+-extern const void *uprv_getICUData_collation(void) ATTRIBUTE_WEAK;
+-extern const void *uprv_getICUData_conversion(void) ATTRIBUTE_WEAK;
+-*/
++
++//extern "C" const void *uprv_getICUData_collation(void);
++U_CDECL_BEGIN
++const void *uprv_getICUData_conversion(void) ATTRIBUTE_WEAK;
++U_CDECL_END
+ 
+ /*----------------------------------------------------------------------*
+  *                                                                      *
+@@ -702,10 +703,11 @@
+         if (uprv_getICUData_collation) {
+             setCommonICUDataPointer(uprv_getICUData_collation(), FALSE, pErrorCode);
+         }
++        */
+         if (uprv_getICUData_conversion) {
+-            setCommonICUDataPointer(uprv_getICUData_conversion(), FALSE, pErrorCode);
++          setCommonICUDataPointer(uprv_getICUData_conversion(), FALSE, pErrorCode);
+         }
+-        */
++
+ #if U_PLATFORM_HAS_WINUWP_API == 0 // Windows UWP Platform does not support dll icu data at this time
+         setCommonICUDataPointer(&U_ICUDATA_ENTRY_POINT, FALSE, pErrorCode);
+         {
diff --git a/third_party/icu/workspace.bzl b/third_party/icu/workspace.bzl
index bfebf4219b1604c3bc16a4478412e23904298b35..f100836b4101efa0a20e09e7d430b0b44953e89a 100644
--- a/third_party/icu/workspace.bzl
+++ b/third_party/icu/workspace.bzl
@@ -2,6 +2,11 @@
 
 load("//third_party:repo.bzl", "third_party_http_archive")
 
+# Sanitize a dependency so that it works correctly from code that includes
+# TensorFlow as a submodule.
+def clean_dep(dep):
+    return str(Label(dep))
+
 def repo():
     third_party_http_archive(
         name = "icu",
@@ -12,4 +17,6 @@ def repo():
             "https://github.com/unicode-org/icu/archive/release-62-1.tar.gz",
         ],
         build_file = "//third_party/icu:BUILD.bazel",
+        system_build_file = "//third_party/icu:BUILD.system",
+        patch_file = clean_dep("//third_party/icu:udata.patch"),
     )
diff --git a/third_party/jpeg/BUILD b/third_party/jpeg/BUILD
index 5b01f6e3e4cfd195327e08ff6a957acce4e21c71..e3aec1fce9377ff53d9eeb0effa84dd0fcca2f49 100644
--- a/third_party/jpeg/BUILD
+++ b/third_party/jpeg/BUILD
@@ -1 +1 @@
-licenses(["notice"])
+# Needed to make this a package.
diff --git a/third_party/jpeg/BUILD.bazel b/third_party/jpeg/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..5243e995a3d0e93ffea25254b440d9532f27e58a
--- /dev/null
+++ b/third_party/jpeg/BUILD.bazel
@@ -0,0 +1,752 @@
+# Description:
+#   libjpeg-turbo is a drop in replacement for jpeglib optimized with SIMD.
+
+licenses(["notice"])  # custom notice-style license, see LICENSE.md
+
+exports_files(["LICENSE.md"])
+
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
+
+libjpegturbo_nocopts = "-[W]error"
+
+WIN_COPTS = [
+    "/Ox",
+    "-DWITH_SIMD",
+    "-wd4996",
+]
+
+libjpegturbo_copts = select({
+    ":android": [
+        "-O2",
+        "-fPIE",
+        "-w",
+    ],
+    ":windows": WIN_COPTS,
+    "//conditions:default": [
+        "-O3",
+        "-w",
+    ],
+}) + select({
+    ":armeabi-v7a": [
+        "-D__ARM_NEON__",
+        "-march=armv7-a",
+        "-mfloat-abi=softfp",
+        "-fprefetch-loop-arrays",
+    ],
+    ":linux_ppc64le": [
+        "-mcpu=power8",
+        "-mtune=power8",
+    ],
+    "//conditions:default": [],
+})
+
+cc_library(
+    name = "jpeg",
+    srcs = [
+        "jaricom.c",
+        "jcapimin.c",
+        "jcapistd.c",
+        "jcarith.c",
+        "jccoefct.c",
+        "jccolor.c",
+        "jcdctmgr.c",
+        "jchuff.c",
+        "jchuff.h",
+        "jcinit.c",
+        "jcmainct.c",
+        "jcmarker.c",
+        "jcmaster.c",
+        "jcomapi.c",
+        "jconfig.h",
+        "jconfigint.h",
+        "jcparam.c",
+        "jcphuff.c",
+        "jcprepct.c",
+        "jcsample.c",
+        "jctrans.c",
+        "jdapimin.c",
+        "jdapistd.c",
+        "jdarith.c",
+        "jdatadst.c",
+        "jdatasrc.c",
+        "jdcoefct.c",
+        "jdcoefct.h",
+        "jdcolor.c",
+        "jdct.h",
+        "jddctmgr.c",
+        "jdhuff.c",
+        "jdhuff.h",
+        "jdinput.c",
+        "jdmainct.c",
+        "jdmainct.h",
+        "jdmarker.c",
+        "jdmaster.c",
+        "jdmaster.h",
+        "jdmerge.c",
+        "jdphuff.c",
+        "jdpostct.c",
+        "jdsample.c",
+        "jdsample.h",
+        "jdtrans.c",
+        "jerror.c",
+        "jfdctflt.c",
+        "jfdctfst.c",
+        "jfdctint.c",
+        "jidctflt.c",
+        "jidctfst.c",
+        "jidctint.c",
+        "jidctred.c",
+        "jinclude.h",
+        "jmemmgr.c",
+        "jmemnobs.c",
+        "jmemsys.h",
+        "jpeg_nbits_table.h",
+        "jpegcomp.h",
+        "jquant1.c",
+        "jquant2.c",
+        "jutils.c",
+        "jversion.h",
+    ],
+    hdrs = [
+        "jccolext.c",  # should have been named .inc
+        "jdcol565.c",  # should have been named .inc
+        "jdcolext.c",  # should have been named .inc
+        "jdmrg565.c",  # should have been named .inc
+        "jdmrgext.c",  # should have been named .inc
+        "jerror.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jstdhuff.c",  # should have been named .inc
+    ],
+    copts = libjpegturbo_copts,
+    nocopts = libjpegturbo_nocopts,
+    visibility = ["//visibility:public"],
+    deps = select({
+        ":k8": [":simd_x86_64"],
+        ":armeabi-v7a": [":simd_armv7a"],
+        ":arm64-v8a": [":simd_armv8a"],
+        ":linux_ppc64le": [":simd_altivec"],
+        ":windows": [":simd_win_x86_64"],
+        "//conditions:default": [":simd_none"],
+    }),
+)
+
+cc_library(
+    name = "simd_altivec",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimddct.h",
+        "simd/jsimd.h",
+        "simd/powerpc/jccolor-altivec.c",
+        "simd/powerpc/jcgray-altivec.c",
+        "simd/powerpc/jcsample-altivec.c",
+        "simd/powerpc/jdcolor-altivec.c",
+        "simd/powerpc/jdmerge-altivec.c",
+        "simd/powerpc/jdsample-altivec.c",
+        "simd/powerpc/jfdctfst-altivec.c",
+        "simd/powerpc/jfdctint-altivec.c",
+        "simd/powerpc/jidctfst-altivec.c",
+        "simd/powerpc/jidctint-altivec.c",
+        "simd/powerpc/jquanti-altivec.c",
+        "simd/powerpc/jsimd.c",
+    ],
+    hdrs = [
+        "simd/powerpc/jccolext-altivec.c",
+        "simd/powerpc/jcgryext-altivec.c",
+        "simd/powerpc/jcsample.h",
+        "simd/powerpc/jdcolext-altivec.c",
+        "simd/powerpc/jdmrgext-altivec.c",
+        "simd/powerpc/jsimd_altivec.h",
+    ],
+    copts = libjpegturbo_copts,
+    nocopts = libjpegturbo_nocopts,
+)
+
+cc_library(
+    name = "simd_x86_64",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jconfigint.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimddct.h",
+        "simd/jsimd.h",
+        "simd/x86_64/jccolor-avx2.o",
+        "simd/x86_64/jccolor-sse2.o",
+        "simd/x86_64/jcgray-avx2.o",
+        "simd/x86_64/jcgray-sse2.o",
+        "simd/x86_64/jchuff-sse2.o",
+        "simd/x86_64/jcphuff-sse2.o",
+        "simd/x86_64/jcsample-avx2.o",
+        "simd/x86_64/jcsample-sse2.o",
+        "simd/x86_64/jdcolor-avx2.o",
+        "simd/x86_64/jdcolor-sse2.o",
+        "simd/x86_64/jdmerge-avx2.o",
+        "simd/x86_64/jdmerge-sse2.o",
+        "simd/x86_64/jdsample-avx2.o",
+        "simd/x86_64/jdsample-sse2.o",
+        "simd/x86_64/jfdctflt-sse.o",
+        "simd/x86_64/jfdctfst-sse2.o",
+        "simd/x86_64/jfdctint-avx2.o",
+        "simd/x86_64/jfdctint-sse2.o",
+        "simd/x86_64/jidctflt-sse2.o",
+        "simd/x86_64/jidctfst-sse2.o",
+        "simd/x86_64/jidctint-avx2.o",
+        "simd/x86_64/jidctint-sse2.o",
+        "simd/x86_64/jidctred-sse2.o",
+        "simd/x86_64/jquantf-sse2.o",
+        "simd/x86_64/jquanti-avx2.o",
+        "simd/x86_64/jquanti-sse2.o",
+        "simd/x86_64/jsimd.c",
+        "simd/x86_64/jsimdcpu.o",
+    ],
+    copts = libjpegturbo_copts,
+    linkstatic = 1,
+    nocopts = libjpegturbo_nocopts,
+)
+
+genrule(
+    name = "simd_x86_64_assemblage23",
+    srcs = [
+        "jconfig.h",
+        "jconfigint.h",
+        "simd/x86_64/jccolext-avx2.asm",
+        "simd/x86_64/jccolext-sse2.asm",
+        "simd/x86_64/jccolor-avx2.asm",
+        "simd/x86_64/jccolor-sse2.asm",
+        "simd/x86_64/jcgray-avx2.asm",
+        "simd/x86_64/jcgray-sse2.asm",
+        "simd/x86_64/jcgryext-avx2.asm",
+        "simd/x86_64/jcgryext-sse2.asm",
+        "simd/x86_64/jchuff-sse2.asm",
+        "simd/x86_64/jcphuff-sse2.asm",
+        "simd/x86_64/jcsample-avx2.asm",
+        "simd/x86_64/jcsample-sse2.asm",
+        "simd/x86_64/jdcolext-avx2.asm",
+        "simd/x86_64/jdcolext-sse2.asm",
+        "simd/x86_64/jdcolor-avx2.asm",
+        "simd/x86_64/jdcolor-sse2.asm",
+        "simd/x86_64/jdmerge-avx2.asm",
+        "simd/x86_64/jdmerge-sse2.asm",
+        "simd/x86_64/jdmrgext-avx2.asm",
+        "simd/x86_64/jdmrgext-sse2.asm",
+        "simd/x86_64/jdsample-avx2.asm",
+        "simd/x86_64/jdsample-sse2.asm",
+        "simd/x86_64/jfdctflt-sse.asm",
+        "simd/x86_64/jfdctfst-sse2.asm",
+        "simd/x86_64/jfdctint-avx2.asm",
+        "simd/x86_64/jfdctint-sse2.asm",
+        "simd/x86_64/jidctflt-sse2.asm",
+        "simd/x86_64/jidctfst-sse2.asm",
+        "simd/x86_64/jidctint-avx2.asm",
+        "simd/x86_64/jidctint-sse2.asm",
+        "simd/x86_64/jidctred-sse2.asm",
+        "simd/x86_64/jquantf-sse2.asm",
+        "simd/x86_64/jquanti-avx2.asm",
+        "simd/x86_64/jquanti-sse2.asm",
+        "simd/x86_64/jsimdcpu.asm",
+        "simd/nasm/jcolsamp.inc",
+        "simd/nasm/jdct.inc",
+        "simd/nasm/jpeg_nbits_table.inc",
+        "simd/nasm/jsimdcfg.inc",
+        "simd/nasm/jsimdcfg.inc.h",
+        "simd/nasm/jsimdext.inc",
+    ],
+    outs = [
+        "simd/x86_64/jccolor-avx2.o",
+        "simd/x86_64/jccolor-sse2.o",
+        "simd/x86_64/jcgray-avx2.o",
+        "simd/x86_64/jcgray-sse2.o",
+        "simd/x86_64/jchuff-sse2.o",
+        "simd/x86_64/jcphuff-sse2.o",
+        "simd/x86_64/jcsample-avx2.o",
+        "simd/x86_64/jcsample-sse2.o",
+        "simd/x86_64/jdcolor-avx2.o",
+        "simd/x86_64/jdcolor-sse2.o",
+        "simd/x86_64/jdmerge-avx2.o",
+        "simd/x86_64/jdmerge-sse2.o",
+        "simd/x86_64/jdsample-avx2.o",
+        "simd/x86_64/jdsample-sse2.o",
+        "simd/x86_64/jfdctflt-sse.o",
+        "simd/x86_64/jfdctfst-sse2.o",
+        "simd/x86_64/jfdctint-avx2.o",
+        "simd/x86_64/jfdctint-sse2.o",
+        "simd/x86_64/jidctflt-sse2.o",
+        "simd/x86_64/jidctfst-sse2.o",
+        "simd/x86_64/jidctint-avx2.o",
+        "simd/x86_64/jidctint-sse2.o",
+        "simd/x86_64/jidctred-sse2.o",
+        "simd/x86_64/jquantf-sse2.o",
+        "simd/x86_64/jquanti-avx2.o",
+        "simd/x86_64/jquanti-sse2.o",
+        "simd/x86_64/jsimdcpu.o",
+    ],
+    cmd = "for out in $(OUTS); do\n" +
+          "  $(location @nasm//:nasm) -f elf64" +
+          "    -DELF -DPIC -D__x86_64__" +
+          "    -I $$(dirname $(location jconfig.h))/" +
+          "    -I $$(dirname $(location jconfigint.h))/" +
+          "    -I $$(dirname $(location simd/nasm/jsimdcfg.inc.h))/" +
+          "    -I $$(dirname $(location simd/x86_64/jccolext-sse2.asm))/" +
+          "    -o $$out" +
+          "    $$(dirname $(location simd/x86_64/jccolext-sse2.asm))/$$(basename $${out%.o}.asm)\n" +
+          "done",
+    tools = ["@nasm"],
+)
+
+cc_library(
+    name = "simd_armv7a",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimddct.h",
+        "simd/arm/jsimd.c",
+        "simd/arm/jsimd_neon.S",
+        "simd/jsimd.h",
+    ],
+    copts = libjpegturbo_copts,
+    nocopts = libjpegturbo_nocopts,
+)
+
+cc_library(
+    name = "simd_armv8a",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimddct.h",
+        "simd/arm64/jsimd.c",
+        "simd/arm64/jsimd_neon.S",
+        "simd/jsimd.h",
+    ],
+    copts = libjpegturbo_copts,
+    nocopts = libjpegturbo_nocopts,
+)
+
+cc_library(
+    name = "simd_win_x86_64",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jconfigint.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimddct.h",
+        "simd/jsimd.h",
+        "simd/x86_64/jccolor-avx2.obj",
+        "simd/x86_64/jccolor-sse2.obj",
+        "simd/x86_64/jcgray-avx2.obj",
+        "simd/x86_64/jcgray-sse2.obj",
+        "simd/x86_64/jchuff-sse2.obj",
+        "simd/x86_64/jcphuff-sse2.obj",
+        "simd/x86_64/jcsample-avx2.obj",
+        "simd/x86_64/jcsample-sse2.obj",
+        "simd/x86_64/jdcolor-avx2.obj",
+        "simd/x86_64/jdcolor-sse2.obj",
+        "simd/x86_64/jdmerge-avx2.obj",
+        "simd/x86_64/jdmerge-sse2.obj",
+        "simd/x86_64/jdsample-avx2.obj",
+        "simd/x86_64/jdsample-sse2.obj",
+        "simd/x86_64/jfdctflt-sse.obj",
+        "simd/x86_64/jfdctfst-sse2.obj",
+        "simd/x86_64/jfdctint-avx2.obj",
+        "simd/x86_64/jfdctint-sse2.obj",
+        "simd/x86_64/jidctflt-sse2.obj",
+        "simd/x86_64/jidctfst-sse2.obj",
+        "simd/x86_64/jidctint-avx2.obj",
+        "simd/x86_64/jidctint-sse2.obj",
+        "simd/x86_64/jidctred-sse2.obj",
+        "simd/x86_64/jquantf-sse2.obj",
+        "simd/x86_64/jquanti-avx2.obj",
+        "simd/x86_64/jquanti-sse2.obj",
+        "simd/x86_64/jsimd.c",
+        "simd/x86_64/jsimdcpu.obj",
+    ],
+    copts = libjpegturbo_copts,
+)
+
+genrule(
+    name = "simd_win_x86_64_assemble",
+    srcs = [
+        "jconfig.h",
+        "jconfigint.h",
+        "simd/x86_64/jccolext-avx2.asm",
+        "simd/x86_64/jccolext-sse2.asm",
+        "simd/x86_64/jccolor-avx2.asm",
+        "simd/x86_64/jccolor-sse2.asm",
+        "simd/x86_64/jcgray-avx2.asm",
+        "simd/x86_64/jcgray-sse2.asm",
+        "simd/x86_64/jcgryext-avx2.asm",
+        "simd/x86_64/jcgryext-sse2.asm",
+        "simd/x86_64/jchuff-sse2.asm",
+        "simd/x86_64/jcphuff-sse2.asm",
+        "simd/x86_64/jcsample-avx2.asm",
+        "simd/x86_64/jcsample-sse2.asm",
+        "simd/x86_64/jdcolext-avx2.asm",
+        "simd/x86_64/jdcolext-sse2.asm",
+        "simd/x86_64/jdcolor-avx2.asm",
+        "simd/x86_64/jdcolor-sse2.asm",
+        "simd/x86_64/jdmerge-avx2.asm",
+        "simd/x86_64/jdmerge-sse2.asm",
+        "simd/x86_64/jdmrgext-avx2.asm",
+        "simd/x86_64/jdmrgext-sse2.asm",
+        "simd/x86_64/jdsample-avx2.asm",
+        "simd/x86_64/jdsample-sse2.asm",
+        "simd/x86_64/jfdctflt-sse.asm",
+        "simd/x86_64/jfdctfst-sse2.asm",
+        "simd/x86_64/jfdctint-avx2.asm",
+        "simd/x86_64/jfdctint-sse2.asm",
+        "simd/x86_64/jidctflt-sse2.asm",
+        "simd/x86_64/jidctfst-sse2.asm",
+        "simd/x86_64/jidctint-avx2.asm",
+        "simd/x86_64/jidctint-sse2.asm",
+        "simd/x86_64/jidctred-sse2.asm",
+        "simd/x86_64/jquantf-sse2.asm",
+        "simd/x86_64/jquanti-avx2.asm",
+        "simd/x86_64/jquanti-sse2.asm",
+        "simd/x86_64/jsimdcpu.asm",
+        "simd/nasm/jcolsamp.inc",
+        "simd/nasm/jdct.inc",
+        "simd/nasm/jpeg_nbits_table.inc",
+        "simd/nasm/jsimdcfg.inc",
+        "simd/nasm/jsimdcfg.inc.h",
+        "simd/nasm/jsimdext.inc",
+    ],
+    outs = [
+        "simd/x86_64/jccolor-avx2.obj",
+        "simd/x86_64/jccolor-sse2.obj",
+        "simd/x86_64/jcgray-avx2.obj",
+        "simd/x86_64/jcgray-sse2.obj",
+        "simd/x86_64/jchuff-sse2.obj",
+        "simd/x86_64/jcphuff-sse2.obj",
+        "simd/x86_64/jcsample-avx2.obj",
+        "simd/x86_64/jcsample-sse2.obj",
+        "simd/x86_64/jdcolor-avx2.obj",
+        "simd/x86_64/jdcolor-sse2.obj",
+        "simd/x86_64/jdmerge-avx2.obj",
+        "simd/x86_64/jdmerge-sse2.obj",
+        "simd/x86_64/jdsample-avx2.obj",
+        "simd/x86_64/jdsample-sse2.obj",
+        "simd/x86_64/jfdctflt-sse.obj",
+        "simd/x86_64/jfdctfst-sse2.obj",
+        "simd/x86_64/jfdctint-avx2.obj",
+        "simd/x86_64/jfdctint-sse2.obj",
+        "simd/x86_64/jidctflt-sse2.obj",
+        "simd/x86_64/jidctfst-sse2.obj",
+        "simd/x86_64/jidctint-avx2.obj",
+        "simd/x86_64/jidctint-sse2.obj",
+        "simd/x86_64/jidctred-sse2.obj",
+        "simd/x86_64/jquantf-sse2.obj",
+        "simd/x86_64/jquanti-avx2.obj",
+        "simd/x86_64/jquanti-sse2.obj",
+        "simd/x86_64/jsimdcpu.obj",
+    ],
+    cmd = "for out in $(OUTS); do\n" +
+          "  $(location @nasm//:nasm) -fwin64 -DWIN64 -D__x86_64__" +
+          "    -I $$(dirname $(location simd/x86_64/jccolext-sse2.asm))/" +
+          "    -I $$(dirname $(location simd/nasm/jdct.inc))/" +
+          "    -I $$(dirname $(location simd/nasm/jdct.inc))/../../win/" +
+          "    -o $$out" +
+          "    $$(dirname $(location simd/x86_64/jccolext-sse2.asm))/$$(basename $${out%.obj}.asm)\n" +
+          "done",
+    tools = ["@nasm"],
+)
+
+cc_library(
+    name = "simd_none",
+    srcs = [
+        "jchuff.h",
+        "jconfig.h",
+        "jdct.h",
+        "jerror.h",
+        "jinclude.h",
+        "jmorecfg.h",
+        "jpegint.h",
+        "jpeglib.h",
+        "jsimd.h",
+        "jsimd_none.c",
+        "jsimddct.h",
+    ],
+    copts = libjpegturbo_copts,
+    nocopts = libjpegturbo_nocopts,
+)
+
+template_rule(
+    name = "jconfig_win",
+    src = "win/jconfig.h.in",
+    out = "jconfig_win.h",
+    substitutions = {
+        "@JPEG_LIB_VERSION@": "62",
+        "@VERSION@": "2.0.0",
+        "@LIBJPEG_TURBO_VERSION_NUMBER@": "2000000",
+        "@BITS_IN_JSAMPLE@": "8",
+        "#cmakedefine C_ARITH_CODING_SUPPORTED": "#define C_ARITH_CODING_SUPPORTED",
+        "#cmakedefine D_ARITH_CODING_SUPPORTED": "#define D_ARITH_CODING_SUPPORTED",
+        "#cmakedefine MEM_SRCDST_SUPPORTED": "#define MEM_SRCDST_SUPPORTED",
+        "#cmakedefine WITH_SIMD": "",
+    },
+)
+
+JCONFIG_NOWIN_COMMON_SUBSTITUTIONS = {
+    "@JPEG_LIB_VERSION@": "62",
+    "@VERSION@": "2.0.0",
+    "@LIBJPEG_TURBO_VERSION_NUMBER@": "2000000",
+    "#cmakedefine C_ARITH_CODING_SUPPORTED": "#define C_ARITH_CODING_SUPPORTED",
+    "#cmakedefine D_ARITH_CODING_SUPPORTED": "#define D_ARITH_CODING_SUPPORTED",
+    "#cmakedefine MEM_SRCDST_SUPPORTED": "#define MEM_SRCDST_SUPPORTED",
+    "@BITS_IN_JSAMPLE@": "8",
+    "#cmakedefine HAVE_LOCALE_H": "#define HAVE_LOCALE_H 1",
+    "#cmakedefine HAVE_STDDEF_H": "#define HAVE_STDDEF_H 1",
+    "#cmakedefine HAVE_STDLIB_H": "#define HAVE_STDLIB_H 1",
+    "#cmakedefine NEED_SYS_TYPES_H": "#define NEED_SYS_TYPES_H",
+    "#cmakedefine NEED_BSD_STRINGS": "",
+    "#cmakedefine HAVE_UNSIGNED_CHAR": "#define HAVE_UNSIGNED_CHAR 1",
+    "#cmakedefine HAVE_UNSIGNED_SHORT": "#define HAVE_UNSIGNED_SHORT 1",
+    "#cmakedefine INCOMPLETE_TYPES_BROKEN": "",
+    "#cmakedefine RIGHT_SHIFT_IS_UNSIGNED": "",
+    "#cmakedefine __CHAR_UNSIGNED__": "",
+    "#undef const": "",
+    "#undef size_t": "",
+}
+
+JCONFIG_NOWIN_SIMD_SUBSTITUTIONS = {
+    "#cmakedefine WITH_SIMD": "#define WITH_SIMD",
+}
+
+JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS = {
+    "#cmakedefine WITH_SIMD": "",
+}
+
+JCONFIG_NOWIN_SIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
+
+JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
+
+template_rule(
+    name = "jconfig_nowin_nosimd",
+    src = "jconfig.h.in",
+    out = "jconfig_nowin_nosimd.h",
+    substitutions = JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS,
+)
+
+template_rule(
+    name = "jconfig_nowin_simd",
+    src = "jconfig.h.in",
+    out = "jconfig_nowin_simd.h",
+    substitutions = JCONFIG_NOWIN_SIMD_SUBSTITUTIONS,
+)
+
+JCONFIGINT_COMMON_SUBSTITUTIONS = {
+    "@BUILD@": "20180831",
+    "@VERSION@": "2.0.0",
+    "@CMAKE_PROJECT_NAME@": "libjpeg-turbo",
+    "#undef inline": "",
+    "#cmakedefine HAVE_INTRIN_H": "",
+}
+
+JCONFIGINT_NOWIN_SUBSTITUTIONS = {
+    "#cmakedefine HAVE_BUILTIN_CTZL": "#define HAVE_BUILTIN_CTZL",
+    "@INLINE@": "inline __attribute__((always_inline))",
+    "#define SIZEOF_SIZE_T  @SIZE_T@": "#if (__WORDSIZE==64 && !defined(__native_client__))\n" +
+                                       "#define SIZEOF_SIZE_T 8\n" +
+                                       "#else\n" +
+                                       "#define SIZEOF_SIZE_T 4\n" +
+                                       "#endif\n",
+}
+
+JCONFIGINT_WIN_SUBSTITUTIONS = {
+    "#cmakedefine HAVE_BUILTIN_CTZL": "",
+    "#define INLINE  @INLINE@": "#if defined(__GNUC__)\n" +
+                                "#define INLINE inline __attribute__((always_inline))\n" +
+                                "#elif defined(_MSC_VER)\n" +
+                                "#define INLINE __forceinline\n" +
+                                "#else\n" +
+                                "#define INLINE\n" +
+                                "#endif\n",
+    "#define SIZEOF_SIZE_T  @SIZE_T@": "#if (__WORDSIZE==64)\n" +
+                                       "#define SIZEOF_SIZE_T 8\n" +
+                                       "#else\n" +
+                                       "#define SIZEOF_SIZE_T 4\n" +
+                                       "#endif\n",
+}
+
+JCONFIGINT_NOWIN_SUBSTITUTIONS.update(JCONFIGINT_COMMON_SUBSTITUTIONS)
+
+JCONFIGINT_WIN_SUBSTITUTIONS.update(JCONFIGINT_COMMON_SUBSTITUTIONS)
+
+template_rule(
+    name = "jconfigint_nowin",
+    src = "jconfigint.h.in",
+    out = "jconfigint_nowin.h",
+    substitutions = JCONFIGINT_NOWIN_SUBSTITUTIONS,
+)
+
+template_rule(
+    name = "jconfigint_win",
+    src = "jconfigint.h.in",
+    out = "jconfigint_win.h",
+    substitutions = JCONFIGINT_WIN_SUBSTITUTIONS,
+)
+
+genrule(
+    name = "configure",
+    srcs = [
+        "jconfig_win.h",
+        "jconfig_nowin_nosimd.h",
+        "jconfig_nowin_simd.h",
+    ],
+    outs = ["jconfig.h"],
+    cmd = select({
+        ":windows": "cp $(location jconfig_win.h) $@",
+        ":k8": "cp $(location jconfig_nowin_simd.h) $@",
+        ":armeabi-v7a": "cp $(location jconfig_nowin_simd.h) $@",
+        ":arm64-v8a": "cp $(location jconfig_nowin_simd.h) $@",
+        ":linux_ppc64le": "cp $(location jconfig_nowin_simd.h) $@",
+        "//conditions:default": "cp $(location jconfig_nowin_nosimd.h) $@",
+    }),
+)
+
+genrule(
+    name = "configure_internal",
+    srcs = [
+        "jconfigint_win.h",
+        "jconfigint_nowin.h",
+    ],
+    outs = ["jconfigint.h"],
+    cmd = select({
+        ":windows": "cp $(location jconfigint_win.h) $@",
+        "//conditions:default": "cp $(location jconfigint_nowin.h) $@",
+    }),
+)
+
+# jiminy cricket the way this file is generated is completely outrageous
+genrule(
+    name = "configure_simd",
+    outs = ["simd/jsimdcfg.inc"],
+    cmd = "cat <<'EOF' >$@\n" +
+          "%define DCTSIZE 8\n" +
+          "%define DCTSIZE2 64\n" +
+          "%define RGB_RED 0\n" +
+          "%define RGB_GREEN 1\n" +
+          "%define RGB_BLUE 2\n" +
+          "%define RGB_PIXELSIZE 3\n" +
+          "%define EXT_RGB_RED 0\n" +
+          "%define EXT_RGB_GREEN 1\n" +
+          "%define EXT_RGB_BLUE 2\n" +
+          "%define EXT_RGB_PIXELSIZE 3\n" +
+          "%define EXT_RGBX_RED 0\n" +
+          "%define EXT_RGBX_GREEN 1\n" +
+          "%define EXT_RGBX_BLUE 2\n" +
+          "%define EXT_RGBX_PIXELSIZE 4\n" +
+          "%define EXT_BGR_RED 2\n" +
+          "%define EXT_BGR_GREEN 1\n" +
+          "%define EXT_BGR_BLUE 0\n" +
+          "%define EXT_BGR_PIXELSIZE 3\n" +
+          "%define EXT_BGRX_RED 2\n" +
+          "%define EXT_BGRX_GREEN 1\n" +
+          "%define EXT_BGRX_BLUE 0\n" +
+          "%define EXT_BGRX_PIXELSIZE 4\n" +
+          "%define EXT_XBGR_RED 3\n" +
+          "%define EXT_XBGR_GREEN 2\n" +
+          "%define EXT_XBGR_BLUE 1\n" +
+          "%define EXT_XBGR_PIXELSIZE 4\n" +
+          "%define EXT_XRGB_RED 1\n" +
+          "%define EXT_XRGB_GREEN 2\n" +
+          "%define EXT_XRGB_BLUE 3\n" +
+          "%define EXT_XRGB_PIXELSIZE 4\n" +
+          "%define RGBX_FILLER_0XFF 1\n" +
+          "%define JSAMPLE byte ; unsigned char\n" +
+          "%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)\n" +
+          "%define CENTERJSAMPLE 128\n" +
+          "%define JCOEF word ; short\n" +
+          "%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)\n" +
+          "%define JDIMENSION dword ; unsigned int\n" +
+          "%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)\n" +
+          "%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)\n" +
+          "%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)\n" +
+          "%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)\n" +
+          "%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)\n" +
+          "%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)\n" +
+          "%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)\n" +
+          "%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)\n" +
+          "%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)\n" +
+          "%define DCTELEM word ; short\n" +
+          "%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)\n" +
+          "%define float FP32 ; float\n" +
+          "%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)\n" +
+          "%define ISLOW_MULT_TYPE word ; must be short\n" +
+          "%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)\n" +
+          "%define IFAST_MULT_TYPE word ; must be short\n" +
+          "%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)\n" +
+          "%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors\n" +
+          "%define FLOAT_MULT_TYPE FP32 ; must be float\n" +
+          "%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)\n" +
+          "%define JSIMD_NONE 0x00\n" +
+          "%define JSIMD_MMX 0x01\n" +
+          "%define JSIMD_3DNOW 0x02\n" +
+          "%define JSIMD_SSE 0x04\n" +
+          "%define JSIMD_SSE2 0x08\n" +
+          "EOF",
+)
+
+config_setting(
+    name = "k8",
+    values = {"cpu": "k8"},
+)
+
+config_setting(
+    name = "android",
+    values = {"crosstool_top": "//external:android/crosstool"},
+)
+
+config_setting(
+    name = "armeabi-v7a",
+    values = {"cpu": "armeabi-v7a"},
+)
+
+config_setting(
+    name = "arm64-v8a",
+    values = {"cpu": "arm64-v8a"},
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+)
+
+config_setting(
+    name = "linux_ppc64le",
+    values = {"cpu": "ppc"},
+)
diff --git a/third_party/systemlibs/jpeg.BUILD b/third_party/jpeg/BUILD.system
similarity index 100%
rename from third_party/systemlibs/jpeg.BUILD
rename to third_party/jpeg/BUILD.system
diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD
deleted file mode 100644
index 1b9b9bf2f502f4927432ef7e50d07b2082b40bc3..0000000000000000000000000000000000000000
--- a/third_party/jpeg/jpeg.BUILD
+++ /dev/null
@@ -1,751 +0,0 @@
-# Description:
-#   libjpeg-turbo is a drop in replacement for jpeglib optimized with SIMD.
-
-licenses(["notice"])  # custom notice-style license, see LICENSE.md
-
-exports_files(["LICENSE.md"])
-
-load("@org_tensorflow//third_party:common.bzl", "template_rule")
-
-libjpegturbo_nocopts = "-[W]error"
-
-WIN_COPTS = [
-    "/Ox",
-    "-DWITH_SIMD",
-    "-wd4996",
-]
-
-libjpegturbo_copts = select({
-    ":android": [
-        "-O2",
-        "-fPIE",
-        "-w",
-    ],
-    ":windows": WIN_COPTS,
-    "//conditions:default": [
-        "-O3",
-        "-w",
-    ],
-}) + select({
-    ":armeabi-v7a": [
-        "-D__ARM_NEON__",
-        "-march=armv7-a",
-        "-mfloat-abi=softfp",
-        "-fprefetch-loop-arrays",
-    ],
-    ":linux_ppc64le": [
-        "-mcpu=power8",
-        "-mtune=power8",
-    ],
-    "//conditions:default": [],
-})
-
-cc_library(
-    name = "jpeg",
-    srcs = [
-        "jaricom.c",
-        "jcapimin.c",
-        "jcapistd.c",
-        "jcarith.c",
-        "jccoefct.c",
-        "jccolor.c",
-        "jcdctmgr.c",
-        "jchuff.c",
-        "jchuff.h",
-        "jcinit.c",
-        "jcmainct.c",
-        "jcmarker.c",
-        "jcmaster.c",
-        "jcomapi.c",
-        "jconfig.h",
-        "jconfigint.h",
-        "jcparam.c",
-        "jcphuff.c",
-        "jcprepct.c",
-        "jcsample.c",
-        "jctrans.c",
-        "jdapimin.c",
-        "jdapistd.c",
-        "jdarith.c",
-        "jdatadst.c",
-        "jdatasrc.c",
-        "jdcoefct.c",
-        "jdcoefct.h",
-        "jdcolor.c",
-        "jdct.h",
-        "jddctmgr.c",
-        "jdhuff.c",
-        "jdhuff.h",
-        "jdinput.c",
-        "jdmainct.c",
-        "jdmainct.h",
-        "jdmarker.c",
-        "jdmaster.c",
-        "jdmaster.h",
-        "jdmerge.c",
-        "jdphuff.c",
-        "jdpostct.c",
-        "jdsample.c",
-        "jdsample.h",
-        "jdtrans.c",
-        "jerror.c",
-        "jfdctflt.c",
-        "jfdctfst.c",
-        "jfdctint.c",
-        "jidctflt.c",
-        "jidctfst.c",
-        "jidctint.c",
-        "jidctred.c",
-        "jinclude.h",
-        "jmemmgr.c",
-        "jmemnobs.c",
-        "jmemsys.h",
-        "jpeg_nbits_table.h",
-        "jpegcomp.h",
-        "jquant1.c",
-        "jquant2.c",
-        "jutils.c",
-        "jversion.h",
-    ],
-    hdrs = [
-        "jccolext.c",  # should have been named .inc
-        "jdcol565.c",  # should have been named .inc
-        "jdcolext.c",  # should have been named .inc
-        "jdmrg565.c",  # should have been named .inc
-        "jdmrgext.c",  # should have been named .inc
-        "jerror.h",
-        "jmorecfg.h",
-        "jpegint.h",
-        "jpeglib.h",
-        "jstdhuff.c",  # should have been named .inc
-    ],
-    copts = libjpegturbo_copts,
-    nocopts = libjpegturbo_nocopts,
-    visibility = ["//visibility:public"],
-    deps = select({
-        ":k8": [":simd_x86_64"],
-        ":armeabi-v7a": [":simd_armv7a"],
-        ":arm64-v8a": [":simd_armv8a"],
-        ":linux_ppc64le": [":simd_altivec"],
-        ":windows": [":simd_win_x86_64"],
-        "//conditions:default": [":simd_none"],
-    }),
-)
-
-cc_library(
-    name = "simd_altivec",
-    srcs = [
-        "jchuff.h",
-        "jconfig.h",
-        "jdct.h",
-        "jerror.h",
-        "jinclude.h",
-        "jmorecfg.h",
-        "jpegint.h",
-        "jpeglib.h",
-        "jsimd.h",
-        "jsimddct.h",
-        "simd/jsimd.h",
-        "simd/powerpc/jccolor-altivec.c",
-        "simd/powerpc/jcgray-altivec.c",
-        "simd/powerpc/jcsample-altivec.c",
-        "simd/powerpc/jdcolor-altivec.c",
-        "simd/powerpc/jdmerge-altivec.c",
-        "simd/powerpc/jdsample-altivec.c",
-        "simd/powerpc/jfdctfst-altivec.c",
-        "simd/powerpc/jfdctint-altivec.c",
-        "simd/powerpc/jidctfst-altivec.c",
-        "simd/powerpc/jidctint-altivec.c",
-        "simd/powerpc/jquanti-altivec.c",
-        "simd/powerpc/jsimd.c",
-    ],
-    hdrs = [
-        "simd/powerpc/jccolext-altivec.c",
-        "simd/powerpc/jcgryext-altivec.c",
-        "simd/powerpc/jdcolext-altivec.c",
-        "simd/powerpc/jdmrgext-altivec.c",
-        "simd/powerpc/jcsample.h",
-        "simd/powerpc/jsimd_altivec.h",
-    ],
-    copts = libjpegturbo_copts,
-    nocopts = libjpegturbo_nocopts,
-)
-
-cc_library(
-    name = "simd_x86_64",
-    srcs = [
-        "jchuff.h",
-        "jconfig.h",
-        "jconfigint.h",
-        "jdct.h",
-        "jerror.h",
-        "jinclude.h",
-        "jmorecfg.h",
-        "jpegint.h",
-        "jpeglib.h",
-        "jsimd.h",
-        "jsimddct.h",
-        "simd/jsimd.h",
-        "simd/x86_64/jsimd.c",
-        "simd/x86_64/jccolor-avx2.o",
-        "simd/x86_64/jccolor-sse2.o",
-        "simd/x86_64/jcgray-avx2.o",
-        "simd/x86_64/jcgray-sse2.o",
-        "simd/x86_64/jchuff-sse2.o",
-        "simd/x86_64/jcphuff-sse2.o",
-        "simd/x86_64/jcsample-avx2.o",
-        "simd/x86_64/jcsample-sse2.o",
-        "simd/x86_64/jdcolor-avx2.o",
-        "simd/x86_64/jdcolor-sse2.o",
-        "simd/x86_64/jdmerge-avx2.o",
-        "simd/x86_64/jdmerge-sse2.o",
-        "simd/x86_64/jdsample-avx2.o",
-        "simd/x86_64/jdsample-sse2.o",
-        "simd/x86_64/jfdctflt-sse.o",
-        "simd/x86_64/jfdctfst-sse2.o",
-        "simd/x86_64/jfdctint-avx2.o",
-        "simd/x86_64/jfdctint-sse2.o",
-        "simd/x86_64/jidctflt-sse2.o",
-        "simd/x86_64/jidctfst-sse2.o",
-        "simd/x86_64/jidctint-avx2.o",
-        "simd/x86_64/jidctint-sse2.o",
-        "simd/x86_64/jidctred-sse2.o",
-        "simd/x86_64/jquantf-sse2.o",
-        "simd/x86_64/jquanti-avx2.o",
-        "simd/x86_64/jquanti-sse2.o",
-        "simd/x86_64/jsimdcpu.o",
-    ],
-    copts = libjpegturbo_copts,
-    linkstatic = 1,
-    nocopts = libjpegturbo_nocopts,
-)
-
-genrule(
-    name = "simd_x86_64_assemblage23",
-    srcs = [
-        "jconfig.h",
-        "jconfigint.h",
-        "simd/x86_64/jccolext-avx2.asm",
-        "simd/x86_64/jccolext-sse2.asm",
-        "simd/x86_64/jccolor-avx2.asm",
-        "simd/x86_64/jccolor-sse2.asm",
-        "simd/x86_64/jcgray-avx2.asm",
-        "simd/x86_64/jcgray-sse2.asm",
-        "simd/x86_64/jcgryext-avx2.asm",
-        "simd/x86_64/jcgryext-sse2.asm",
-        "simd/x86_64/jchuff-sse2.asm",
-        "simd/x86_64/jcphuff-sse2.asm",
-        "simd/x86_64/jcsample-avx2.asm",
-        "simd/x86_64/jcsample-sse2.asm",
-        "simd/x86_64/jdcolext-avx2.asm",
-        "simd/x86_64/jdcolext-sse2.asm",
-        "simd/x86_64/jdcolor-avx2.asm",
-        "simd/x86_64/jdcolor-sse2.asm",
-        "simd/x86_64/jdmerge-avx2.asm",
-        "simd/x86_64/jdmerge-sse2.asm",
-        "simd/x86_64/jdmrgext-avx2.asm",
-        "simd/x86_64/jdmrgext-sse2.asm",
-        "simd/x86_64/jdsample-avx2.asm",
-        "simd/x86_64/jdsample-sse2.asm",
-        "simd/x86_64/jfdctflt-sse.asm",
-        "simd/x86_64/jfdctfst-sse2.asm",
-        "simd/x86_64/jfdctint-avx2.asm",
-        "simd/x86_64/jfdctint-sse2.asm",
-        "simd/x86_64/jidctflt-sse2.asm",
-        "simd/x86_64/jidctfst-sse2.asm",
-        "simd/x86_64/jidctint-avx2.asm",
-        "simd/x86_64/jidctint-sse2.asm",
-        "simd/x86_64/jidctred-sse2.asm",
-        "simd/x86_64/jquantf-sse2.asm",
-        "simd/x86_64/jquanti-avx2.asm",
-        "simd/x86_64/jquanti-sse2.asm",
-        "simd/x86_64/jsimdcpu.asm",
-        "simd/nasm/jcolsamp.inc",
-        "simd/nasm/jdct.inc",
-        "simd/nasm/jpeg_nbits_table.inc",
-        "simd/nasm/jsimdcfg.inc",
-        "simd/nasm/jsimdcfg.inc.h",
-        "simd/nasm/jsimdext.inc",
-    ],
-    outs = [
-        "simd/x86_64/jccolor-avx2.o",
-        "simd/x86_64/jccolor-sse2.o",
-        "simd/x86_64/jcgray-avx2.o",
-        "simd/x86_64/jcgray-sse2.o",
-        "simd/x86_64/jchuff-sse2.o",
-        "simd/x86_64/jcphuff-sse2.o",
-        "simd/x86_64/jcsample-avx2.o",
-        "simd/x86_64/jcsample-sse2.o",
-        "simd/x86_64/jdcolor-avx2.o",
-        "simd/x86_64/jdcolor-sse2.o",
-        "simd/x86_64/jdmerge-avx2.o",
-        "simd/x86_64/jdmerge-sse2.o",
-        "simd/x86_64/jdsample-avx2.o",
-        "simd/x86_64/jdsample-sse2.o",
-        "simd/x86_64/jfdctflt-sse.o",
-        "simd/x86_64/jfdctfst-sse2.o",
-        "simd/x86_64/jfdctint-avx2.o",
-        "simd/x86_64/jfdctint-sse2.o",
-        "simd/x86_64/jidctflt-sse2.o",
-        "simd/x86_64/jidctfst-sse2.o",
-        "simd/x86_64/jidctint-avx2.o",
-        "simd/x86_64/jidctint-sse2.o",
-        "simd/x86_64/jidctred-sse2.o",
-        "simd/x86_64/jquantf-sse2.o",
-        "simd/x86_64/jquanti-avx2.o",
-        "simd/x86_64/jquanti-sse2.o",
-        "simd/x86_64/jsimdcpu.o",
-    ],
-    cmd = "for out in $(OUTS); do\n" +
-          "  $(location @nasm//:nasm) -f elf64" +
-          "    -DELF -DPIC -D__x86_64__" +
-          "    -I $$(dirname $(location jconfig.h))/" +
-          "    -I $$(dirname $(location jconfigint.h))/" +
-          "    -I $$(dirname $(location simd/nasm/jsimdcfg.inc.h))/" +
-          "    -I $$(dirname $(location simd/x86_64/jccolext-sse2.asm))/" +
-          "    -o $$out" +
-          "    $$(dirname $(location simd/x86_64/jccolext-sse2.asm))/$$(basename $${out%.o}.asm)\n" +
-          "done",
-    tools = ["@nasm"],
-)
-
-cc_library(
-    name = "simd_armv7a",
-    srcs = [
-        "jchuff.h",
-        "jconfig.h",
-        "jdct.h",
-        "jerror.h",
-        "jinclude.h",
-        "jmorecfg.h",
-        "jpegint.h",
-        "jpeglib.h",
-        "jsimd.h",
-        "jsimddct.h",
-        "simd/jsimd.h",
-        "simd/arm/jsimd.c",
-        "simd/arm/jsimd_neon.S",
-    ],
-    copts = libjpegturbo_copts,
-    nocopts = libjpegturbo_nocopts,
-)
-
-cc_library(
-    name = "simd_armv8a",
-    srcs = [
-        "jchuff.h",
-        "jconfig.h",
-        "jdct.h",
-        "jerror.h",
-        "jinclude.h",
-        "jmorecfg.h",
-        "jpegint.h",
-        "jpeglib.h",
-        "jsimd.h",
-        "jsimddct.h",
-        "simd/jsimd.h",
-        "simd/arm64/jsimd.c",
-        "simd/arm64/jsimd_neon.S",
-    ],
-    copts = libjpegturbo_copts,
-    nocopts = libjpegturbo_nocopts,
-)
-
-cc_library(
-    name = "simd_win_x86_64",
-    srcs = [
-        "jchuff.h",
-        "jconfig.h",
-        "jconfigint.h",
-        "jdct.h",
-        "jerror.h",
-        "jinclude.h",
-        "jmorecfg.h",
-        "jpegint.h",
-        "jpeglib.h",
-        "jsimd.h",
-        "jsimddct.h",
-        "simd/jsimd.h",
-        "simd/x86_64/jsimd.c",
-        "simd/x86_64/jccolor-avx2.obj",
-        "simd/x86_64/jccolor-sse2.obj",
-        "simd/x86_64/jcgray-avx2.obj",
-        "simd/x86_64/jcgray-sse2.obj",
-        "simd/x86_64/jchuff-sse2.obj",
-        "simd/x86_64/jcphuff-sse2.obj",
-        "simd/x86_64/jcsample-avx2.obj",
-        "simd/x86_64/jcsample-sse2.obj",
-        "simd/x86_64/jdcolor-avx2.obj",
-        "simd/x86_64/jdcolor-sse2.obj",
-        "simd/x86_64/jdmerge-avx2.obj",
-        "simd/x86_64/jdmerge-sse2.obj",
-        "simd/x86_64/jdsample-avx2.obj",
-        "simd/x86_64/jdsample-sse2.obj",
-        "simd/x86_64/jfdctflt-sse.obj",
-        "simd/x86_64/jfdctfst-sse2.obj",
-        "simd/x86_64/jfdctint-avx2.obj",
-        "simd/x86_64/jfdctint-sse2.obj",
-        "simd/x86_64/jidctflt-sse2.obj",
-        "simd/x86_64/jidctfst-sse2.obj",
-        "simd/x86_64/jidctint-avx2.obj",
-        "simd/x86_64/jidctint-sse2.obj",
-        "simd/x86_64/jidctred-sse2.obj",
-        "simd/x86_64/jquantf-sse2.obj",
-        "simd/x86_64/jquanti-avx2.obj",
-        "simd/x86_64/jquanti-sse2.obj",
-        "simd/x86_64/jsimdcpu.obj",
-    ],
-    copts = libjpegturbo_copts,
-)
-
-genrule(
-    name = "simd_win_x86_64_assemble",
-    srcs = [
-        "jconfig.h",
-        "jconfigint.h",
-        "simd/x86_64/jccolext-avx2.asm",
-        "simd/x86_64/jccolext-sse2.asm",
-        "simd/x86_64/jccolor-avx2.asm",
-        "simd/x86_64/jccolor-sse2.asm",
-        "simd/x86_64/jcgray-avx2.asm",
-        "simd/x86_64/jcgray-sse2.asm",
-        "simd/x86_64/jcgryext-avx2.asm",
-        "simd/x86_64/jcgryext-sse2.asm",
-        "simd/x86_64/jchuff-sse2.asm",
-        "simd/x86_64/jcphuff-sse2.asm",
-        "simd/x86_64/jcsample-avx2.asm",
-        "simd/x86_64/jcsample-sse2.asm",
-        "simd/x86_64/jdcolext-avx2.asm",
-        "simd/x86_64/jdcolext-sse2.asm",
-        "simd/x86_64/jdcolor-avx2.asm",
-        "simd/x86_64/jdcolor-sse2.asm",
-        "simd/x86_64/jdmerge-avx2.asm",
-        "simd/x86_64/jdmerge-sse2.asm",
-        "simd/x86_64/jdmrgext-avx2.asm",
-        "simd/x86_64/jdmrgext-sse2.asm",
-        "simd/x86_64/jdsample-avx2.asm",
-        "simd/x86_64/jdsample-sse2.asm",
-        "simd/x86_64/jfdctflt-sse.asm",
-        "simd/x86_64/jfdctfst-sse2.asm",
-        "simd/x86_64/jfdctint-avx2.asm",
-        "simd/x86_64/jfdctint-sse2.asm",
-        "simd/x86_64/jidctflt-sse2.asm",
-        "simd/x86_64/jidctfst-sse2.asm",
-        "simd/x86_64/jidctint-avx2.asm",
-        "simd/x86_64/jidctint-sse2.asm",
-        "simd/x86_64/jidctred-sse2.asm",
-        "simd/x86_64/jquantf-sse2.asm",
-        "simd/x86_64/jquanti-avx2.asm",
-        "simd/x86_64/jquanti-sse2.asm",
-        "simd/x86_64/jsimdcpu.asm",
-        "simd/nasm/jcolsamp.inc",
-        "simd/nasm/jdct.inc",
-        "simd/nasm/jpeg_nbits_table.inc",
-        "simd/nasm/jsimdcfg.inc",
-        "simd/nasm/jsimdcfg.inc.h",
-        "simd/nasm/jsimdext.inc",
-    ],
-    outs = [
-        "simd/x86_64/jccolor-avx2.obj",
-        "simd/x86_64/jccolor-sse2.obj",
-        "simd/x86_64/jcgray-avx2.obj",
-        "simd/x86_64/jcgray-sse2.obj",
-        "simd/x86_64/jchuff-sse2.obj",
-        "simd/x86_64/jcphuff-sse2.obj",
-        "simd/x86_64/jcsample-avx2.obj",
-        "simd/x86_64/jcsample-sse2.obj",
-        "simd/x86_64/jdcolor-avx2.obj",
-        "simd/x86_64/jdcolor-sse2.obj",
-        "simd/x86_64/jdmerge-avx2.obj",
-        "simd/x86_64/jdmerge-sse2.obj",
-        "simd/x86_64/jdsample-avx2.obj",
-        "simd/x86_64/jdsample-sse2.obj",
-        "simd/x86_64/jfdctflt-sse.obj",
-        "simd/x86_64/jfdctfst-sse2.obj",
-        "simd/x86_64/jfdctint-avx2.obj",
-        "simd/x86_64/jfdctint-sse2.obj",
-        "simd/x86_64/jidctflt-sse2.obj",
-        "simd/x86_64/jidctfst-sse2.obj",
-        "simd/x86_64/jidctint-avx2.obj",
-        "simd/x86_64/jidctint-sse2.obj",
-        "simd/x86_64/jidctred-sse2.obj",
-        "simd/x86_64/jquantf-sse2.obj",
-        "simd/x86_64/jquanti-avx2.obj",
-        "simd/x86_64/jquanti-sse2.obj",
-        "simd/x86_64/jsimdcpu.obj",
-    ],
-    cmd = "for out in $(OUTS); do\n" +
-          "  $(location @nasm//:nasm) -fwin64 -DWIN64 -D__x86_64__" +
-          "    -I $$(dirname $(location simd/x86_64/jccolext-sse2.asm))/" +
-          "    -I $$(dirname $(location simd/nasm/jdct.inc))/" +
-          "    -I $$(dirname $(location simd/nasm/jdct.inc))/../../win/" +
-          "    -o $$out" +
-          "    $$(dirname $(location simd/x86_64/jccolext-sse2.asm))/$$(basename $${out%.obj}.asm)\n" +
-          "done",
-    tools = ["@nasm"],
-)
-
-cc_library(
-    name = "simd_none",
-    srcs = [
-        "jchuff.h",
-        "jconfig.h",
-        "jdct.h",
-        "jerror.h",
-        "jinclude.h",
-        "jmorecfg.h",
-        "jpegint.h",
-        "jpeglib.h",
-        "jsimd.h",
-        "jsimd_none.c",
-        "jsimddct.h",
-    ],
-    copts = libjpegturbo_copts,
-    nocopts = libjpegturbo_nocopts,
-)
-
-template_rule(
-    name = "jconfig_win",
-    src = "win/jconfig.h.in",
-    out = "jconfig_win.h",
-    substitutions = {
-        "@JPEG_LIB_VERSION@": "62",
-        "@VERSION@": "2.0.0",
-        "@LIBJPEG_TURBO_VERSION_NUMBER@": "2000000",
-        "@BITS_IN_JSAMPLE@": "8",
-        "#cmakedefine C_ARITH_CODING_SUPPORTED": "#define C_ARITH_CODING_SUPPORTED",
-        "#cmakedefine D_ARITH_CODING_SUPPORTED": "#define D_ARITH_CODING_SUPPORTED",
-        "#cmakedefine MEM_SRCDST_SUPPORTED": "#define MEM_SRCDST_SUPPORTED",
-        "#cmakedefine WITH_SIMD": "",
-    },
-)
-
-JCONFIG_NOWIN_COMMON_SUBSTITUTIONS = {
-    "@JPEG_LIB_VERSION@": "62",
-    "@VERSION@": "2.0.0",
-    "@LIBJPEG_TURBO_VERSION_NUMBER@": "2000000",
-    "#cmakedefine C_ARITH_CODING_SUPPORTED": "#define C_ARITH_CODING_SUPPORTED",
-    "#cmakedefine D_ARITH_CODING_SUPPORTED": "#define D_ARITH_CODING_SUPPORTED",
-    "#cmakedefine MEM_SRCDST_SUPPORTED": "#define MEM_SRCDST_SUPPORTED",
-    "@BITS_IN_JSAMPLE@": "8",
-    "#cmakedefine HAVE_LOCALE_H": "#define HAVE_LOCALE_H 1",
-    "#cmakedefine HAVE_STDDEF_H": "#define HAVE_STDDEF_H 1",
-    "#cmakedefine HAVE_STDLIB_H": "#define HAVE_STDLIB_H 1",
-    "#cmakedefine NEED_SYS_TYPES_H": "#define NEED_SYS_TYPES_H",
-    "#cmakedefine NEED_BSD_STRINGS": "",
-    "#cmakedefine HAVE_UNSIGNED_CHAR": "#define HAVE_UNSIGNED_CHAR 1",
-    "#cmakedefine HAVE_UNSIGNED_SHORT": "#define HAVE_UNSIGNED_SHORT 1",
-    "#cmakedefine INCOMPLETE_TYPES_BROKEN": "",
-    "#cmakedefine RIGHT_SHIFT_IS_UNSIGNED": "",
-    "#cmakedefine __CHAR_UNSIGNED__": "",
-    "#undef const": "",
-    "#undef size_t": "",
-}
-
-JCONFIG_NOWIN_SIMD_SUBSTITUTIONS = {
-    "#cmakedefine WITH_SIMD": "#define WITH_SIMD",
-}
-
-JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS = {
-    "#cmakedefine WITH_SIMD": "",
-}
-
-JCONFIG_NOWIN_SIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
-
-JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
-
-template_rule(
-    name = "jconfig_nowin_nosimd",
-    src = "jconfig.h.in",
-    out = "jconfig_nowin_nosimd.h",
-    substitutions = JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS,
-)
-
-template_rule(
-    name = "jconfig_nowin_simd",
-    src = "jconfig.h.in",
-    out = "jconfig_nowin_simd.h",
-    substitutions = JCONFIG_NOWIN_SIMD_SUBSTITUTIONS,
-)
-
-JCONFIGINT_COMMON_SUBSTITUTIONS = {
-    "@BUILD@": "20180831",
-    "@VERSION@": "2.0.0",
-    "@CMAKE_PROJECT_NAME@": "libjpeg-turbo",
-    "#undef inline": "",
-    "#cmakedefine HAVE_INTRIN_H": "",
-}
-
-JCONFIGINT_NOWIN_SUBSTITUTIONS = {
-    "#cmakedefine HAVE_BUILTIN_CTZL": "#define HAVE_BUILTIN_CTZL",
-    "@INLINE@": "inline __attribute__((always_inline))",
-    "#define SIZEOF_SIZE_T  @SIZE_T@": "#if (__WORDSIZE==64 && !defined(__native_client__))\n" +
-                                       "#define SIZEOF_SIZE_T 8\n" +
-                                       "#else\n" +
-                                       "#define SIZEOF_SIZE_T 4\n" +
-                                       "#endif\n",
-}
-
-JCONFIGINT_WIN_SUBSTITUTIONS = {
-    "#cmakedefine HAVE_BUILTIN_CTZL": "",
-    "#define INLINE  @INLINE@": "#if defined(__GNUC__)\n" +
-                                "#define INLINE inline __attribute__((always_inline))\n" +
-                                "#elif defined(_MSC_VER)\n" +
-                                "#define INLINE __forceinline\n" +
-                                "#else\n" +
-                                "#define INLINE\n" +
-                                "#endif\n",
-    "#define SIZEOF_SIZE_T  @SIZE_T@": "#if (__WORDSIZE==64)\n" +
-                                       "#define SIZEOF_SIZE_T 8\n" +
-                                       "#else\n" +
-                                       "#define SIZEOF_SIZE_T 4\n" +
-                                       "#endif\n",
-}
-
-JCONFIGINT_NOWIN_SUBSTITUTIONS.update(JCONFIGINT_COMMON_SUBSTITUTIONS)
-JCONFIGINT_WIN_SUBSTITUTIONS.update(JCONFIGINT_COMMON_SUBSTITUTIONS)
-
-template_rule(
-    name = "jconfigint_nowin",
-    src = "jconfigint.h.in",
-    out = "jconfigint_nowin.h",
-    substitutions = JCONFIGINT_NOWIN_SUBSTITUTIONS,
-)
-
-template_rule(
-    name = "jconfigint_win",
-    src = "jconfigint.h.in",
-    out = "jconfigint_win.h",
-    substitutions = JCONFIGINT_WIN_SUBSTITUTIONS,
-)
-
-genrule(
-    name = "configure",
-    srcs = [
-        "jconfig_win.h",
-        "jconfig_nowin_nosimd.h",
-        "jconfig_nowin_simd.h",
-    ],
-    outs = ["jconfig.h"],
-    cmd = select({
-        ":windows": "cp $(location jconfig_win.h) $@",
-        ":k8": "cp $(location jconfig_nowin_simd.h) $@",
-        ":armeabi-v7a": "cp $(location jconfig_nowin_simd.h) $@",
-        ":arm64-v8a": "cp $(location jconfig_nowin_simd.h) $@",
-        ":linux_ppc64le": "cp $(location jconfig_nowin_simd.h) $@",
-        "//conditions:default": "cp $(location jconfig_nowin_nosimd.h) $@",
-    }),
-)
-
-genrule(
-    name = "configure_internal",
-    srcs = [
-        "jconfigint_win.h",
-        "jconfigint_nowin.h",
-    ],
-    outs = ["jconfigint.h"],
-    cmd = select({
-        ":windows": "cp $(location jconfigint_win.h) $@",
-        "//conditions:default": "cp $(location jconfigint_nowin.h) $@",
-    }),
-)
-
-# jiminy cricket the way this file is generated is completely outrageous
-genrule(
-    name = "configure_simd",
-    outs = ["simd/jsimdcfg.inc"],
-    cmd = "cat <<'EOF' >$@\n" +
-          "%define DCTSIZE 8\n" +
-          "%define DCTSIZE2 64\n" +
-          "%define RGB_RED 0\n" +
-          "%define RGB_GREEN 1\n" +
-          "%define RGB_BLUE 2\n" +
-          "%define RGB_PIXELSIZE 3\n" +
-          "%define EXT_RGB_RED 0\n" +
-          "%define EXT_RGB_GREEN 1\n" +
-          "%define EXT_RGB_BLUE 2\n" +
-          "%define EXT_RGB_PIXELSIZE 3\n" +
-          "%define EXT_RGBX_RED 0\n" +
-          "%define EXT_RGBX_GREEN 1\n" +
-          "%define EXT_RGBX_BLUE 2\n" +
-          "%define EXT_RGBX_PIXELSIZE 4\n" +
-          "%define EXT_BGR_RED 2\n" +
-          "%define EXT_BGR_GREEN 1\n" +
-          "%define EXT_BGR_BLUE 0\n" +
-          "%define EXT_BGR_PIXELSIZE 3\n" +
-          "%define EXT_BGRX_RED 2\n" +
-          "%define EXT_BGRX_GREEN 1\n" +
-          "%define EXT_BGRX_BLUE 0\n" +
-          "%define EXT_BGRX_PIXELSIZE 4\n" +
-          "%define EXT_XBGR_RED 3\n" +
-          "%define EXT_XBGR_GREEN 2\n" +
-          "%define EXT_XBGR_BLUE 1\n" +
-          "%define EXT_XBGR_PIXELSIZE 4\n" +
-          "%define EXT_XRGB_RED 1\n" +
-          "%define EXT_XRGB_GREEN 2\n" +
-          "%define EXT_XRGB_BLUE 3\n" +
-          "%define EXT_XRGB_PIXELSIZE 4\n" +
-          "%define RGBX_FILLER_0XFF 1\n" +
-          "%define JSAMPLE byte ; unsigned char\n" +
-          "%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)\n" +
-          "%define CENTERJSAMPLE 128\n" +
-          "%define JCOEF word ; short\n" +
-          "%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF)\n" +
-          "%define JDIMENSION dword ; unsigned int\n" +
-          "%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION)\n" +
-          "%define JSAMPROW POINTER ; JSAMPLE * (jpeglib.h)\n" +
-          "%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h)\n" +
-          "%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h)\n" +
-          "%define JCOEFPTR POINTER ; JCOEF * (jpeglib.h)\n" +
-          "%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW)\n" +
-          "%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY)\n" +
-          "%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE)\n" +
-          "%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR)\n" +
-          "%define DCTELEM word ; short\n" +
-          "%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM)\n" +
-          "%define float FP32 ; float\n" +
-          "%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(float)\n" +
-          "%define ISLOW_MULT_TYPE word ; must be short\n" +
-          "%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE)\n" +
-          "%define IFAST_MULT_TYPE word ; must be short\n" +
-          "%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE)\n" +
-          "%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors\n" +
-          "%define FLOAT_MULT_TYPE FP32 ; must be float\n" +
-          "%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE)\n" +
-          "%define JSIMD_NONE 0x00\n" +
-          "%define JSIMD_MMX 0x01\n" +
-          "%define JSIMD_3DNOW 0x02\n" +
-          "%define JSIMD_SSE 0x04\n" +
-          "%define JSIMD_SSE2 0x08\n" +
-          "EOF",
-)
-
-config_setting(
-    name = "k8",
-    values = {"cpu": "k8"},
-)
-
-config_setting(
-    name = "android",
-    values = {"crosstool_top": "//external:android/crosstool"},
-)
-
-config_setting(
-    name = "armeabi-v7a",
-    values = {"cpu": "armeabi-v7a"},
-)
-
-config_setting(
-    name = "arm64-v8a",
-    values = {"cpu": "arm64-v8a"},
-)
-
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows"},
-)
-
-config_setting(
-    name = "linux_ppc64le",
-    values = {"cpu": "ppc"},
-)
diff --git a/third_party/jpeg/jpeg_helpers.BUILD.bazel b/third_party/jpeg/jpeg_helpers.BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..5b01f6e3e4cfd195327e08ff6a957acce4e21c71
--- /dev/null
+++ b/third_party/jpeg/jpeg_helpers.BUILD.bazel
@@ -0,0 +1 @@
+licenses(["notice"])
diff --git a/third_party/jpeg/workspace.bzl b/third_party/jpeg/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..2bb7dacd322d397d9cdd4c15da01960c861d5aa4
--- /dev/null
+++ b/third_party/jpeg/workspace.bzl
@@ -0,0 +1,16 @@
+"""loads the jpeg library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "jpeg",
+        urls = [
+            "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
+            "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz",
+        ],
+        sha256 = "f892fff427ab3adffc289363eac26d197ce3ccacefe5f5822377348a8166069b",
+        strip_prefix = "libjpeg-turbo-2.0.0",
+        build_file = "//third_party/jpeg:BUILD.bazel",
+        system_build_file = "//third_party/jpeg:BUILD.system",
+    )
diff --git a/third_party/keras_applications_archive/BUILD b/third_party/keras_applications_archive/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..82bab3ffd9646371869aafa09115ef0bb46d2862
--- /dev/null
+++ b/third_party/keras_applications_archive/BUILD
@@ -0,0 +1 @@
+# This empty BUILD file is required to make Bazel treat this directory as a package.
diff --git a/third_party/keras_applications_archive/BUILD.bazel b/third_party/keras_applications_archive/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..57c8f597c7f64cc81b6df8e3fc1e1a55311e30e1
--- /dev/null
+++ b/third_party/keras_applications_archive/BUILD.bazel
@@ -0,0 +1,31 @@
+# Description: Keras Applications: set of pre-trained deep learning models.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # MIT
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "keras_applications",
+    srcs = [
+        "keras_applications/__init__.py",
+        "keras_applications/densenet.py",
+        "keras_applications/imagenet_utils.py",
+        "keras_applications/inception_resnet_v2.py",
+        "keras_applications/inception_v3.py",
+        "keras_applications/mobilenet.py",
+        "keras_applications/mobilenet_v2.py",
+        "keras_applications/nasnet.py",
+        "keras_applications/resnet50.py",
+        "keras_applications/vgg16.py",
+        "keras_applications/vgg19.py",
+        "keras_applications/xception.py",
+    ],
+    deps = [
+        "@org_tensorflow//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
diff --git a/third_party/keras_applications_archive/workspace.bzl b/third_party/keras_applications_archive/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..e90630fa974fb97f4c7d5a72c045a44c237a6ace
--- /dev/null
+++ b/third_party/keras_applications_archive/workspace.bzl
@@ -0,0 +1,15 @@
+"""Loads Keras-applications python package."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "keras_applications_archive",
+        strip_prefix = "keras-applications-1.0.6",
+        sha256 = "2cb412c97153160ec267b238e958d281ac3532b139cab42045c2d7086a157c21",
+        urls = [
+            "http://mirror.bazel.build/github.com/keras-team/keras-applications/archive/1.0.6.tar.gz",
+            "https://github.com/keras-team/keras-applications/archive/1.0.6.tar.gz",
+        ],
+        build_file = "//third_party/keras_applications_archive:BUILD.bazel",
+    )
diff --git a/third_party/kissfft/BUILD b/third_party/kissfft/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..82bab3ffd9646371869aafa09115ef0bb46d2862
--- /dev/null
+++ b/third_party/kissfft/BUILD
@@ -0,0 +1 @@
+# This empty BUILD file is required to make Bazel treat this directory as a package.
diff --git a/third_party/kissfft/BUILD.bazel b/third_party/kissfft/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..a57cb6ebda4b9025d4b1abdcc351154a6ac43192
--- /dev/null
+++ b/third_party/kissfft/BUILD.bazel
@@ -0,0 +1,23 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "kiss_fftr_16",
+    srcs = [
+        "kiss_fft.c",
+        "tools/kiss_fftr.c",
+    ],
+    hdrs = [
+        "_kiss_fft_guts.h",
+        "kiss_fft.h",
+        "tools/kiss_fftr.h",
+    ],
+    copts = [
+        "-DFIXED_POINT=16",
+    ],
+)
diff --git a/third_party/kissfft/workspace.bzl b/third_party/kissfft/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..1754eb1a90f592cb1457e654074785cf662a3cf9
--- /dev/null
+++ b/third_party/kissfft/workspace.bzl
@@ -0,0 +1,15 @@
+"""Loads the kissfft library, used by TF Lite."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "kissfft",
+        strip_prefix = "kissfft-cddf3833fdf24fa84b79be37efdcd348cae0e39c",
+        sha256 = "7ba83a3da1636350472e501e3e6c3418df72466990530ea273c05fa7e3dd8635",
+        urls = [
+            "https://mirror.bazel.build/github.com/mborgerding/kissfft/archive/cddf3833fdf24fa84b79be37efdcd348cae0e39c.tar.gz",
+            "https://github.com/mborgerding/kissfft/archive/cddf3833fdf24fa84b79be37efdcd348cae0e39c.tar.gz",
+        ],
+        build_file = "//third_party/kissfft:BUILD.bazel",
+    )
diff --git a/third_party/libxsmm.BUILD b/third_party/libxsmm.BUILD
index ee49d281abcd54b566edde119f4a5b3e6b07d2a3..dc7dcc9517092e05999c067f9d2e04b4f36bb37a 100644
--- a/third_party/libxsmm.BUILD
+++ b/third_party/libxsmm.BUILD
@@ -38,8 +38,8 @@ genrule(
         ":libxsmm_interface",
     ],
     visibility = [
-        "//third_party/eigen3:__pkg__",
         "//tensorflow/core/kernels:__pkg__",
+        "//third_party/eigen3:__pkg__",
     ],
 )
 
diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD
index 776935739ace7e2b8e337eff28e73e3a8a5b7f47..eb468aa65fce9c014bc7b53f1fb69729eb2a3718 100644
--- a/third_party/llvm/llvm.autogenerated.BUILD
+++ b/third_party/llvm/llvm.autogenerated.BUILD
@@ -823,6 +823,7 @@ cc_library(
     ]),
     copts = llvm_copts + ["-Iexternal/llvm/lib/Target/ARM"],
     deps = [
+        ":arm_asm_printer",
         ":arm_desc",
         ":arm_info",
         ":arm_utils",
@@ -2141,6 +2142,7 @@ cc_library(
         ":core",
         ":global_i_sel",
         ":mc",
+        ":profile_data",
         ":selection_dag",
         ":support",
         ":target",
diff --git a/third_party/llvm/llvm.bzl b/third_party/llvm/llvm.bzl
index 54ca86f3272cb6c91541e20d9ba5326d2cf726a0..5a977f82c417a9ae3e3022fa43534affe727cae2 100644
--- a/third_party/llvm/llvm.bzl
+++ b/third_party/llvm/llvm.bzl
@@ -250,6 +250,7 @@ linux_cmake_vars = {
 # CMake variables specific to the Darwin (Mac OS X) platform.
 darwin_cmake_vars = {
     "HAVE_MALLOC_MALLOC_H": 1,
+    "HAVE_MALLOC_ZONE_STATISTICS": 1,
 }
 
 # CMake variables specific to the Windows platform.
diff --git a/third_party/mkl_dnn/mkldnn.BUILD b/third_party/mkl_dnn/mkldnn.BUILD
index 597ac69e2ffed73210733fab98bed3d1227b0d23..d80c7135d6fd47f45a00b35bb29ceae0c0d1d003 100644
--- a/third_party/mkl_dnn/mkldnn.BUILD
+++ b/third_party/mkl_dnn/mkldnn.BUILD
@@ -42,8 +42,8 @@ cc_library(
         "src",
         "src/common",
         "src/cpu",
-        "src/cpu/xbyak",
         "src/cpu/gemm",
+        "src/cpu/xbyak",
     ],
     nocopts = "-fno-exceptions",
     visibility = ["//visibility:public"],
@@ -63,3 +63,27 @@ cc_library(
         "//conditions:default": [],
     }),
 )
+
+cc_library(
+    name = "mkldnn_single_threaded",
+    srcs = glob([
+        "src/common/*.cpp",
+        "src/cpu/*.cpp",
+        "src/cpu/gemm/*.cpp",
+    ]),
+    hdrs = glob(["include/*"]),
+    copts = [
+        "-fexceptions",
+        "-DMKLDNN_THR=MKLDNN_THR_SEQ",  # Disables threading.
+    ],
+    includes = [
+        "include",
+        "src",
+        "src/common",
+        "src/cpu",
+        "src/cpu/gemm",
+        "src/cpu/xbyak",
+    ],
+    nocopts = "-fno-exceptions",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/mpi/mpi.bzl b/third_party/mpi/mpi.bzl
index 38ce91c4d069fc311d5e7f17a49ff7904c9c67eb..3a483351d1f982eba09d6522db9842dd4f7eca84 100644
--- a/third_party/mpi/mpi.bzl
+++ b/third_party/mpi/mpi.bzl
@@ -2,16 +2,16 @@
 #based on the configuration options return one or the other
 
 def mpi_hdr():
-    MPI_LIB_IS_OPENMPI=True
-    hdrs = []    
+    MPI_LIB_IS_OPENMPI = True
+    hdrs = []
     if MPI_LIB_IS_OPENMPI:
-        hdrs = ["mpi.h", "mpi_portable_platform.h"]   #When using OpenMPI
+        hdrs = ["mpi.h", "mpi_portable_platform.h"]  #When using OpenMPI
     else:
-        hdrs = ["mpi.h",  "mpio.h", "mpicxx.h"]        #When using MVAPICH
+        hdrs = ["mpi.h", "mpio.h", "mpicxx.h"]  #When using MVAPICH
     return hdrs
 
 def if_mpi(if_true, if_false = []):
     return select({
         "//tensorflow:with_mpi_support": if_true,
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
diff --git a/third_party/nasm.BUILD b/third_party/nasm.BUILD
deleted file mode 100644
index d746a65e7e5e22bf15eb181a39bde82eb4796cff..0000000000000000000000000000000000000000
--- a/third_party/nasm.BUILD
+++ /dev/null
@@ -1,168 +0,0 @@
-# Description:
-#   NASM is a portable assembler in the Intel/Microsoft tradition.
-
-licenses(["notice"])  # BSD 2-clause
-
-exports_files(["LICENSE"])
-
-cc_binary(
-    name = "nasm",
-    srcs = [
-        "asm/assemble.c",
-        "asm/assemble.h",
-        "asm/directbl.c",
-        "asm/directiv.c",
-        "asm/directiv.h",
-        "asm/error.c",
-        "asm/eval.c",
-        "asm/eval.h",
-        "asm/exprdump.c",
-        "asm/exprlib.c",
-        "asm/float.c",
-        "asm/float.h",
-        "asm/labels.c",
-        "asm/listing.c",
-        "asm/listing.h",
-        "asm/nasm.c",
-        "asm/parser.c",
-        "asm/parser.h",
-        "asm/pptok.c",
-        "asm/pptok.h",
-        "asm/pragma.c",
-        "asm/preproc.c",
-        "asm/preproc.h",
-        "asm/preproc-nop.c",
-        "asm/quote.c",
-        "asm/quote.h",
-        "asm/rdstrnum.c",
-        "asm/segalloc.c",
-        "asm/stdscan.c",
-        "asm/stdscan.h",
-        "asm/strfunc.c",
-        "asm/tokens.h",
-        "asm/tokhash.c",
-        "common/common.c",
-        "config/unknown.h",
-        "disasm/disasm.c",
-        "disasm/disasm.h",
-        "disasm/sync.c",
-        "disasm/sync.h",
-        "include/compiler.h",
-        "include/disp8.h",
-        "include/error.h",
-        "include/hashtbl.h",
-        "include/iflag.h",
-        "include/insns.h",
-        "include/labels.h",
-        "include/md5.h",
-        "include/nasm.h",
-        "include/nasmint.h",
-        "include/nasmlib.h",
-        "include/opflags.h",
-        "include/perfhash.h",
-        "include/raa.h",
-        "include/rbtree.h",
-        "include/rdoff.h",
-        "include/saa.h",
-        "include/strlist.h",
-        "include/tables.h",
-        "include/ver.h",
-        "macros/macros.c",
-        "nasmlib/badenum.c",
-        "nasmlib/bsi.c",
-        "nasmlib/crc64.c",
-        "nasmlib/file.c",
-        "nasmlib/file.h",
-        "nasmlib/filename.c",
-        "nasmlib/hashtbl.c",
-        "nasmlib/ilog2.c",
-        "nasmlib/malloc.c",
-        "nasmlib/md5c.c",
-        "nasmlib/mmap.c",
-        "nasmlib/path.c",
-        "nasmlib/perfhash.c",
-        "nasmlib/raa.c",
-        "nasmlib/rbtree.c",
-        "nasmlib/readnum.c",
-        "nasmlib/realpath.c",
-        "nasmlib/saa.c",
-        "nasmlib/srcfile.c",
-        "nasmlib/string.c",
-        "nasmlib/strlist.c",
-        "nasmlib/ver.c",
-        "nasmlib/zerobuf.c",
-        "output/codeview.c",
-        "output/dwarf.h",
-        "output/elf.h",
-        "output/legacy.c",
-        "output/nulldbg.c",
-        "output/nullout.c",
-        "output/outaout.c",
-        "output/outas86.c",
-        "output/outbin.c",
-        "output/outcoff.c",
-        "output/outdbg.c",
-        "output/outelf.c",
-        "output/outelf.h",
-        "output/outform.c",
-        "output/outform.h",
-        "output/outieee.c",
-        "output/outlib.c",
-        "output/outlib.h",
-        "output/outmacho.c",
-        "output/outobj.c",
-        "output/outrdf2.c",
-        "output/pecoff.h",
-        "output/stabs.h",
-        "stdlib/snprintf.c",
-        "stdlib/strlcpy.c",
-        "stdlib/strnlen.c",
-        "stdlib/vsnprintf.c",
-        "version.h",
-        "x86/disp8.c",
-        "x86/iflag.c",
-        "x86/iflaggen.h",
-        "x86/insnsa.c",
-        "x86/insnsb.c",
-        "x86/insnsd.c",
-        "x86/insnsi.h",
-        "x86/insnsn.c",
-        "x86/regdis.c",
-        "x86/regdis.h",
-        "x86/regflags.c",
-        "x86/regs.c",
-        "x86/regs.h",
-        "x86/regvals.c",
-    ] + select({
-        ":windows": ["config/msvc.h"],
-        "//conditions:default": [],
-    }),
-    includes = [
-        "asm",
-        "include",
-        "output",
-        "x86",
-    ],
-    copts = select({
-        ":windows": [],
-        "//conditions:default": [
-            "-w",
-            "-std=c99",
-        ],
-    }),
-    defines = select({
-        ":windows": [],
-        "//conditions:default": [
-            "HAVE_SNPRINTF",
-            "HAVE_SYS_TYPES_H",
-        ],
-    }),
-    visibility = ["@jpeg//:__pkg__"],
-)
-
-config_setting(
-    name = "windows",
-    values = {
-        "cpu": "x64_windows",
-    },
-)
diff --git a/third_party/nasm/BUILD b/third_party/nasm/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e3aec1fce9377ff53d9eeb0effa84dd0fcca2f49
--- /dev/null
+++ b/third_party/nasm/BUILD
@@ -0,0 +1 @@
+# Needed to make this a package.
diff --git a/third_party/nasm/BUILD.bazel b/third_party/nasm/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..c68d713946410eae91052219906820bec30b1b66
--- /dev/null
+++ b/third_party/nasm/BUILD.bazel
@@ -0,0 +1,168 @@
+# Description:
+#   NASM is a portable assembler in the Intel/Microsoft tradition.
+
+licenses(["notice"])  # BSD 2-clause
+
+exports_files(["LICENSE"])
+
+cc_binary(
+    name = "nasm",
+    srcs = [
+        "asm/assemble.c",
+        "asm/assemble.h",
+        "asm/directbl.c",
+        "asm/directiv.c",
+        "asm/directiv.h",
+        "asm/error.c",
+        "asm/eval.c",
+        "asm/eval.h",
+        "asm/exprdump.c",
+        "asm/exprlib.c",
+        "asm/float.c",
+        "asm/float.h",
+        "asm/labels.c",
+        "asm/listing.c",
+        "asm/listing.h",
+        "asm/nasm.c",
+        "asm/parser.c",
+        "asm/parser.h",
+        "asm/pptok.c",
+        "asm/pptok.h",
+        "asm/pragma.c",
+        "asm/preproc.c",
+        "asm/preproc.h",
+        "asm/preproc-nop.c",
+        "asm/quote.c",
+        "asm/quote.h",
+        "asm/rdstrnum.c",
+        "asm/segalloc.c",
+        "asm/stdscan.c",
+        "asm/stdscan.h",
+        "asm/strfunc.c",
+        "asm/tokens.h",
+        "asm/tokhash.c",
+        "common/common.c",
+        "config/unknown.h",
+        "disasm/disasm.c",
+        "disasm/disasm.h",
+        "disasm/sync.c",
+        "disasm/sync.h",
+        "include/compiler.h",
+        "include/disp8.h",
+        "include/error.h",
+        "include/hashtbl.h",
+        "include/iflag.h",
+        "include/insns.h",
+        "include/labels.h",
+        "include/md5.h",
+        "include/nasm.h",
+        "include/nasmint.h",
+        "include/nasmlib.h",
+        "include/opflags.h",
+        "include/perfhash.h",
+        "include/raa.h",
+        "include/rbtree.h",
+        "include/rdoff.h",
+        "include/saa.h",
+        "include/strlist.h",
+        "include/tables.h",
+        "include/ver.h",
+        "macros/macros.c",
+        "nasmlib/badenum.c",
+        "nasmlib/bsi.c",
+        "nasmlib/crc64.c",
+        "nasmlib/file.c",
+        "nasmlib/file.h",
+        "nasmlib/filename.c",
+        "nasmlib/hashtbl.c",
+        "nasmlib/ilog2.c",
+        "nasmlib/malloc.c",
+        "nasmlib/md5c.c",
+        "nasmlib/mmap.c",
+        "nasmlib/path.c",
+        "nasmlib/perfhash.c",
+        "nasmlib/raa.c",
+        "nasmlib/rbtree.c",
+        "nasmlib/readnum.c",
+        "nasmlib/realpath.c",
+        "nasmlib/saa.c",
+        "nasmlib/srcfile.c",
+        "nasmlib/string.c",
+        "nasmlib/strlist.c",
+        "nasmlib/ver.c",
+        "nasmlib/zerobuf.c",
+        "output/codeview.c",
+        "output/dwarf.h",
+        "output/elf.h",
+        "output/legacy.c",
+        "output/nulldbg.c",
+        "output/nullout.c",
+        "output/outaout.c",
+        "output/outas86.c",
+        "output/outbin.c",
+        "output/outcoff.c",
+        "output/outdbg.c",
+        "output/outelf.c",
+        "output/outelf.h",
+        "output/outform.c",
+        "output/outform.h",
+        "output/outieee.c",
+        "output/outlib.c",
+        "output/outlib.h",
+        "output/outmacho.c",
+        "output/outobj.c",
+        "output/outrdf2.c",
+        "output/pecoff.h",
+        "output/stabs.h",
+        "stdlib/snprintf.c",
+        "stdlib/strlcpy.c",
+        "stdlib/strnlen.c",
+        "stdlib/vsnprintf.c",
+        "version.h",
+        "x86/disp8.c",
+        "x86/iflag.c",
+        "x86/iflaggen.h",
+        "x86/insnsa.c",
+        "x86/insnsb.c",
+        "x86/insnsd.c",
+        "x86/insnsi.h",
+        "x86/insnsn.c",
+        "x86/regdis.c",
+        "x86/regdis.h",
+        "x86/regflags.c",
+        "x86/regs.c",
+        "x86/regs.h",
+        "x86/regvals.c",
+    ] + select({
+        ":windows": ["config/msvc.h"],
+        "//conditions:default": [],
+    }),
+    copts = select({
+        ":windows": [],
+        "//conditions:default": [
+            "-w",
+            "-std=c99",
+        ],
+    }),
+    defines = select({
+        ":windows": [],
+        "//conditions:default": [
+            "HAVE_SNPRINTF",
+            "HAVE_SYS_TYPES_H",
+        ],
+    }),
+    includes = [
+        "asm",
+        "include",
+        "output",
+        "x86",
+    ],
+    visibility = ["@jpeg//:__pkg__"],
+)
+
+config_setting(
+    name = "windows",
+    values = {
+        "cpu": "x64_windows",
+    },
+)
diff --git a/third_party/systemlibs/nasm.BUILD b/third_party/nasm/BUILD.system
similarity index 100%
rename from third_party/systemlibs/nasm.BUILD
rename to third_party/nasm/BUILD.system
diff --git a/third_party/nasm/workspace.bzl b/third_party/nasm/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..6d50f6fcada3008535d25270ef05126aa233f004
--- /dev/null
+++ b/third_party/nasm/workspace.bzl
@@ -0,0 +1,17 @@
+"""loads the nasm library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "nasm",
+        urls = [
+            "https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
+            "http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.13.03.tar.bz2/sha512/d7a6b4cee8dfd603d8d4c976e5287b5cc542fa0b466ff989b743276a6e28114e64289bf02a7819eca63142a5278aa6eed57773007e5f589e15768e6456a8919d/nasm-2.13.03.tar.bz2",
+            "http://www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2",
+        ],
+        sha256 = "63ec86477ad3f0f6292325fd89e1d93aea2e2fd490070863f17d48f7cd387011",
+        strip_prefix = "nasm-2.13.03",
+        build_file = "//third_party/nasm:BUILD.bazel",
+        system_build_file = "//third_party/nasm:BUILD.system",
+    )
diff --git a/third_party/nccl/LICENSE b/third_party/nccl/LICENSE
index 146d9b765c5db44c2f5bea8fa5010eef5ec0c68f..b9585181860564989797281af502beb3ed3d2706 100644
--- a/third_party/nccl/LICENSE
+++ b/third_party/nccl/LICENSE
@@ -1,203 +1,30 @@
-Copyright 2018 The TensorFlow Authors.  All rights reserved.
 
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2018, The TensorFlow Authors.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
+ Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
+    Laboratory, the U.S. Department of Energy, nor the names of their
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ The U.S. Department of Energy funded the development of this software
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7a08f97ef328a7a731d7c76de8bda70c8d004dac
--- /dev/null
+++ b/third_party/nccl/archive.BUILD
@@ -0,0 +1,174 @@
+# NVIDIA NCCL 2
+# A package of optimized primitives for collective multi-GPU communication.
+
+licenses(["restricted"])
+
+exports_files(["LICENSE.txt"])
+
+load(
+    "@local_config_nccl//:build_defs.bzl",
+    "gen_nccl_h",
+    "nccl_library",
+    "rdc_copts",
+    "rdc_library",
+)
+load(
+    "@local_config_cuda//cuda:build_defs.bzl",
+    "cuda_default_copts",
+)
+
+# Generate the nccl.h header file.
+gen_nccl_h(
+    name = "nccl_h",
+    output = "src/nccl.h",
+    template = "src/nccl.h.in",
+)
+
+nccl_library(
+    name = "src_hdrs",
+    hdrs = [
+        "src/nccl.h",
+        # src/include/common_coll.h #includes "collectives/collectives.h".
+        # All other #includes of collectives.h are patched in process_srcs.
+        "src/collectives/collectives.h",
+    ],
+    strip_include_prefix = "src",
+)
+
+nccl_library(
+    name = "include_hdrs",
+    hdrs = glob(["src/include/*.h"]),
+    strip_include_prefix = "src/include",
+)
+
+filegroup(
+    name = "device_hdrs",
+    srcs = glob(["src/collectives/device/*.h"]),
+)
+
+filegroup(
+    name = "device_srcs",
+    srcs = [
+        "src/collectives/device/all_gather.cu",
+        "src/collectives/device/all_reduce.cu",
+        "src/collectives/device/broadcast.cu",
+        "src/collectives/device/reduce.cu",
+        "src/collectives/device/reduce_scatter.cu",
+    ],
+)
+
+nccl_library(
+    name = "sum",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=0"] + rdc_copts(),
+    linkstatic = True,
+    prefix = "sum_",
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+nccl_library(
+    name = "prod",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=1"] + rdc_copts(),
+    linkstatic = True,
+    prefix = "_prod",
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+nccl_library(
+    name = "min",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=2"] + rdc_copts(),
+    linkstatic = True,
+    prefix = "min_",
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+nccl_library(
+    name = "max",
+    srcs = [
+        ":device_hdrs",
+        ":device_srcs",
+    ],
+    copts = ["-DNCCL_OP=3"] + rdc_copts(),
+    linkstatic = True,
+    prefix = "max_",
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+nccl_library(
+    name = "functions",
+    srcs = [
+        "src/collectives/device/functions.cu",
+        ":device_hdrs",
+    ],
+    copts = rdc_copts(),
+    linkstatic = True,
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+rdc_library(
+    name = "device_code",
+    deps = [
+        ":functions",
+        ":max",
+        ":min",
+        ":prod",
+        ":sum",
+    ],
+)
+
+# Primary NCCL target.
+nccl_library(
+    name = "nccl",
+    srcs = glob(
+        include = ["src/**/*.cu"],
+        # Exclude device-library code.
+        exclude = ["src/collectives/device/**"],
+    ) + [
+        # Required for header inclusion checking (see
+        # http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
+        # Files in src/ which #include "nccl.h" load it from there rather than
+        # from the virtual includes directory.
+        "src/nccl.h",
+    ],
+    hdrs = ["src/nccl.h"],
+    copts = cuda_default_copts(),
+    include_prefix = "third_party/nccl",
+    strip_include_prefix = "src",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":device_code",
+        ":include_hdrs",
+        ":src_hdrs",
+    ],
+)
diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..42de79c411c844d48982c47753337102b915aefd
--- /dev/null
+++ b/third_party/nccl/build_defs.bzl.tpl
@@ -0,0 +1,252 @@
+"""Repository rule for NCCL."""
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+
+def _gen_nccl_h_impl(ctx):
+    """Creates nccl.h from a template."""
+    ctx.actions.expand_template(
+        output = ctx.outputs.output,
+        template = ctx.file.template,
+        substitutions = {
+            "${nccl:Major}": "2",
+            "${nccl:Minor}": "3",
+            "${nccl:Patch}": "5",
+            "${nccl:Suffix}": "",
+            "${nccl:Version}": "2305",
+        },
+    )
+
+gen_nccl_h = rule(
+    implementation = _gen_nccl_h_impl,
+    attrs = {
+        "template": attr.label(allow_single_file = True),
+        "output": attr.output(),
+    },
+)
+"""Creates the NCCL header file."""
+
+def _process_srcs_impl(ctx):
+    """Appends .cc to .cu files, patches include directives."""
+    files = []
+    for src in ctx.files.srcs:
+        if not src.is_source:
+            # Process only once, specifically "src/nccl.h".
+            files.append(src)
+            continue
+        name = src.basename
+        if src.extension == "cu":
+            name = ctx.attr.prefix + name + ".cc"
+        file = ctx.actions.declare_file(name, sibling = src)
+        ctx.actions.expand_template(
+            output = file,
+            template = src,
+            substitutions = {
+                "\"collectives.h": "\"collectives/collectives.h",
+                "\"../collectives.h": "\"collectives/collectives.h",
+                "#if __CUDACC_VER_MAJOR__": "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
+                # Substitutions are applied in order.
+                "std::nullptr_t": "nullptr_t",
+                "nullptr_t": "std::nullptr_t",
+            },
+        )
+        files.append(file)
+    return [DefaultInfo(files = depset(files))]
+
+_process_srcs = rule(
+    implementation = _process_srcs_impl,
+    attrs = {
+        "srcs": attr.label_list(allow_files = True),
+        "prefix": attr.string(default = ""),
+    },
+)
+"""Processes the NCCL srcs so they can be compiled with bazel and clang."""
+
+def nccl_library(name, srcs = None, hdrs = None, prefix = None, **kwargs):
+    """Processes the srcs and hdrs and creates a cc_library."""
+
+    _process_srcs(
+        name = name + "_srcs",
+        srcs = srcs,
+        prefix = prefix,
+    )
+    _process_srcs(
+        name = name + "_hdrs",
+        srcs = hdrs,
+    )
+
+    native.cc_library(
+        name = name,
+        srcs = [name + "_srcs"] if srcs else [],
+        hdrs = [name + "_hdrs"] if hdrs else [],
+        **kwargs
+    )
+
+def rdc_copts():
+    """Returns copts for compiling relocatable device code."""
+
+    # The global functions can not have a lower register count than the
+    # device functions. This is enforced by setting a fixed register count.
+    # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
+    maxrregcount = "-maxrregcount=96"
+
+    return cuda_default_copts() + select({
+        "@local_config_cuda//cuda:using_nvcc": [
+            "-nvcc_options",
+            "relocatable-device-code=true",
+            "-nvcc_options",
+            "ptxas-options=" + maxrregcount,
+        ],
+        "@local_config_cuda//cuda:using_clang": [
+            "-fcuda-rdc",
+            "-Xcuda-ptxas",
+            maxrregcount,
+        ],
+        "//conditions:default": [],
+    }) + ["-fvisibility=hidden"]
+
+def _filter_impl(ctx):
+    suffix = ctx.attr.suffix
+    files = [src for src in ctx.files.srcs if src.path.endswith(suffix)]
+    return [DefaultInfo(files = depset(files))]
+
+_filter = rule(
+    implementation = _filter_impl,
+    attrs = {
+        "srcs": attr.label_list(allow_files = True),
+        "suffix": attr.string(),
+    },
+)
+"""Filters the srcs to the ones ending with suffix."""
+
+def _gen_link_src_impl(ctx):
+    ctx.actions.expand_template(
+        output = ctx.outputs.output,
+        template = ctx.file.template,
+        substitutions = {
+            "REGISTERLINKBINARYFILE": '"%s"' % ctx.file.register_hdr.short_path,
+            "FATBINFILE": '"%s"' % ctx.file.fatbin_hdr.short_path,
+        },
+    )
+
+_gen_link_src = rule(
+    implementation = _gen_link_src_impl,
+    attrs = {
+        "register_hdr": attr.label(allow_single_file = True),
+        "fatbin_hdr": attr.label(allow_single_file = True),
+        "template": attr.label(allow_single_file = True),
+        "output": attr.output(),
+    },
+)
+"""Patches the include directives for the link.stub file."""
+
+def rdc_library(name, deps):
+    """Produces a cc_library from deps containing relocatable device code."""
+
+    # From .a and .pic.a archives, just use the latter. Otherwise we get
+    # multiply defined symbols.
+    # TODO(csigg): C++ Sandwich once available should allow passing this target
+    # to a cc_library dependency, which would avoid the linking order issue.
+    _filter(
+        name = name + "_deps_a",
+        srcs = deps,
+        suffix = ".pic.a",
+    )
+
+    # Device-link to cubins for each architecture.
+    images = []
+    cubins = []
+    for arch in %{gpu_architectures}:
+        cubin = "%s_%s.cubin" % (name, arch)
+        register_hdr = "%s_%s.h" % (name, arch)
+        nvlink = "@local_config_nccl//:nvlink"
+        cmd = ("$(location %s) " % nvlink +
+               select({
+                   # NCCL is only supported on Linux.
+                   "@org_tensorflow//tensorflow:linux_x86_64": "--cpu-arch=X86_64 ",
+                   "@org_tensorflow//tensorflow:linux_ppc64le": "--cpu-arch=PPC64LE ",
+                   "//conditions:default": "",
+               }) +
+               "--arch=%s $(SRCS) " % arch +
+               "--register-link-binaries=$(location %s) " % register_hdr +
+               "--output-file=$(location %s)" % cubin)
+        native.genrule(
+            name = "%s_%s" % (name, arch),
+            outs = [register_hdr, cubin],
+            srcs = [name + "_deps_a"],
+            cmd = cmd,
+            tools = [nvlink],
+        )
+        images.append("--image=profile=%s,file=$(location %s)" % (arch, cubin))
+        cubins.append(cubin)
+
+    # Generate fatbin header from all cubins.
+    fatbin_hdr = name + ".fatbin.h"
+    fatbinary = "@local_config_nccl//:cuda/bin/fatbinary"
+    bin2c = "@local_config_nccl//:cuda/bin/bin2c"
+    cmd = ("$(location %s) -64 --cmdline=--compile-only " % fatbinary +
+           "--link --bin2c-path $$(dirname $(location %s)) " % bin2c +
+           "--compress-all %s --create=%%{name}.fatbin " % " ".join(images) +
+           "--embedded-fatbin=$@")
+    native.genrule(
+        name = name + "_fatbin_h",
+        outs = [fatbin_hdr],
+        srcs = cubins,
+        cmd = cmd,
+        tools = [fatbinary, bin2c],
+    )
+
+    # Generate the source file #including the headers generated above.
+    _gen_link_src(
+        name = name + "_dlink_src",
+        # Include just the last one, they are equivalent.
+        register_hdr = register_hdr,
+        fatbin_hdr = fatbin_hdr,
+        template = "@local_config_nccl//:cuda/bin/crt/link.stub",
+        output = name + ".cc",
+    )
+
+    # Compile the source file into the cc_library.
+    native.cc_library(
+        name = name + "_dlink_a",
+        srcs = [
+            name + "_dlink_src",
+        ],
+        textual_hdrs = [register_hdr, fatbin_hdr],
+        deps = [
+            "@local_config_cuda//cuda:cuda_headers",
+        ],
+        defines = [
+            # Silence warning about including internal header.
+            "__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__",
+            # Macros that need to be defined starting with CUDA 10.
+            "__NV_EXTRA_INITIALIZATION=",
+            "__NV_EXTRA_FINALIZATION=",
+        ],
+        linkstatic = True,
+    )
+
+    # Repackage deps into a single archive. This avoid unresolved symbols when
+    # the archives happen to be linked in the wrong order. For more details, see
+    # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
+    native.genrule(
+        name = name + "_a",
+        srcs = [
+            name + "_deps_a",
+            name + "_dlink_a",
+        ],
+        outs = [name + ".a"],
+        # See https://stackoverflow.com/a/23621751
+        cmd = """
+addlibs=$$(echo $(SRCS) | sed "s/[^ ]* */\\naddlib &/g")
+printf "create $@$${addlibs}\\nsave\\nend" | $(AR) -M
+""",
+    )
+
+    native.cc_library(
+        name = name,
+        srcs = [name + "_a"],
+        deps = [
+            "@local_config_cuda//cuda:cudart_static",
+        ],
+        linkstatic = True,
+    )
diff --git a/third_party/nccl/nccl_archive.BUILD b/third_party/nccl/nccl_archive.BUILD
deleted file mode 100644
index a05899e38d531c066c774302e4ffd75ce7e482e7..0000000000000000000000000000000000000000
--- a/third_party/nccl/nccl_archive.BUILD
+++ /dev/null
@@ -1,68 +0,0 @@
-# NVIDIA nccl
-# A package of optimized primitives for collective multi-GPU communication.
-
-licenses(["notice"])  # BSD
-
-exports_files(["LICENSE.txt"])
-
-load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda")
-
-SRCS = [
-    "src/all_gather.cu",
-    "src/all_reduce.cu",
-    "src/broadcast.cu",
-    "src/core.cu",
-    "src/libwrap.cu",
-    "src/reduce.cu",
-    "src/reduce_scatter.cu",
-]
-
-# Copy .cu to .cu.cc so they can be in srcs of cc_library.
-[
-    genrule(
-        name = "gen_" + src,
-        srcs = [src],
-        outs = [src + ".cc"],
-        cmd = "cp $(location " + src + ") $(location " + src + ".cc)",
-    )
-    for src in SRCS
-]
-
-SRCS_CU_CC = [src + ".cc" for src in SRCS]
-
-cc_library(
-    name = "nccl",
-    srcs = if_cuda(SRCS_CU_CC + glob(["src/*.h"])),
-    hdrs = if_cuda(["src/nccl.h"]),
-    copts = [
-        "-DCUDA_MAJOR=0",
-        "-DCUDA_MINOR=0",
-        "-DNCCL_MAJOR=0",
-        "-DNCCL_MINOR=0",
-        "-DNCCL_PATCH=0",
-        "-Iexternal/nccl_archive/src",
-        "-O3",
-    ] + cuda_default_copts(),
-    include_prefix = "third_party/nccl",
-    linkopts = select({
-        "@org_tensorflow//tensorflow:android": [
-            "-pie",
-        ],
-        "@org_tensorflow//tensorflow:darwin": [
-            "-Wl,-framework",
-            "-Wl,CoreFoundation",
-            "-Wl,-framework",
-            "-Wl,Security",
-        ],
-        "@org_tensorflow//tensorflow:ios": [],
-        "@org_tensorflow//tensorflow:windows": [
-            "-DEFAULTLIB:ws2_32.lib",
-        ],
-        "//conditions:default": [
-            "-lrt",
-        ],
-    }),
-    strip_include_prefix = "src",
-    visibility = ["//visibility:public"],
-    deps = ["@local_config_cuda//cuda:cuda_headers"],
-)
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index d78fe8f3aab7eefe0c7b38aa36b606e380728351..1e6422b49ef4d7ce97b3b38f3b3964281a158b7c 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -3,7 +3,7 @@
 
 `nccl_configure` depends on the following environment variables:
 
-  * `TF_NCCL_VERSION`: The NCCL version.
+  * `TF_NCCL_VERSION`: Installed NCCL version or empty to build from source.
   * `NCCL_INSTALL_PATH`: The installation path of the NCCL library.
   * `NCCL_HDR_PATH`: The installation path of the NCCL header files.
 """
@@ -11,12 +11,16 @@
 load(
     "//third_party/gpus:cuda_configure.bzl",
     "auto_configure_fail",
+    "compute_capabilities",
+    "cuda_toolkit_path",
     "find_cuda_define",
     "matches_version",
 )
 
-_NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
+_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH"
 _NCCL_HDR_PATH = "NCCL_HDR_PATH"
+_NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
+_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES"
 _TF_NCCL_VERSION = "TF_NCCL_VERSION"
 _TF_NCCL_CONFIG_REPO = "TF_NCCL_CONFIG_REPO"
 
@@ -37,6 +41,13 @@ cc_library(
 """
 
 _NCCL_ARCHIVE_BUILD_CONTENT = """
+exports_files([
+    "cuda/bin/crt/link.stub",
+    "cuda/bin/fatbinary",
+    "cuda/bin/bin2c",
+    "nvlink",
+])
+
 filegroup(
   name = "LICENSE",
   data = ["@nccl_archive//:LICENSE.txt"],
@@ -50,113 +61,125 @@ alias(
 )
 """
 
-# Local build results in dynamic link and the license should not be included.
-_NCCL_REMOTE_BUILD_TEMPLATE = Label("//third_party/nccl:remote.BUILD.tpl")
-_NCCL_LOCAL_BUILD_TEMPLATE = Label("//third_party/nccl:system.BUILD.tpl")
+def _label(file):
+    return Label("//third_party/nccl:{}".format(file))
 
 def _find_nccl_header(repository_ctx, nccl_install_path):
-  """Finds the NCCL header on the system.
-
-  Args:
-    repository_ctx: The repository context.
-    nccl_install_path: The NCCL library install directory.
+    """Finds the NCCL header on the system.
 
-  Returns:
-    The path to the NCCL header.
-  """
-  header_path = repository_ctx.path("%s/include/nccl.h" % nccl_install_path)
-  if not header_path.exists:
-    auto_configure_fail("Cannot find %s" % str(header_path))
-  return header_path
+    Args:
+      repository_ctx: The repository context.
+      nccl_install_path: The NCCL library install directory.
 
+    Returns:
+      The path to the NCCL header.
+    """
+    header_path = repository_ctx.path("%s/include/nccl.h" % nccl_install_path)
+    if not header_path.exists:
+        auto_configure_fail("Cannot find %s" % str(header_path))
+    return header_path
 
 def _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_version):
-  """Checks whether the header file matches the specified version of NCCL.
-
-  Args:
-    repository_ctx: The repository context.
-    nccl_install_path: The NCCL library install directory.
-    nccl_version: The expected NCCL version.
-
-  Returns:
-    A string containing the library version of NCCL.
-  """
-  header_path = repository_ctx.path("%s/nccl.h" % nccl_hdr_path)
-  if not header_path.exists:
-    header_path = _find_nccl_header(repository_ctx, nccl_install_path)
-  header_dir = str(header_path.realpath.dirname)
-  major_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
-                                   _DEFINE_NCCL_MAJOR)
-  minor_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
-                                   _DEFINE_NCCL_MINOR)
-  patch_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
-                                   _DEFINE_NCCL_PATCH)
-  header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
-  if not matches_version(nccl_version, header_version):
-    auto_configure_fail(
-        ("NCCL library version detected from %s/nccl.h (%s) does not match " +
-         "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
-        (header_dir, header_version, nccl_version))
-
-
-def _find_nccl_lib(repository_ctx, nccl_install_path, nccl_version):
-  """Finds the given NCCL library on the system.
-
-  Args:
-    repository_ctx: The repository context.
-    nccl_install_path: The NCCL library installation directory.
-    nccl_version: The version of NCCL library files as returned
-      by _nccl_version.
-
-  Returns:
-    The path to the NCCL library.
-  """
-  lib_path = repository_ctx.path("%s/lib/libnccl.so.%s" % (nccl_install_path,
-                                                           nccl_version))
-  if not lib_path.exists:
-    auto_configure_fail("Cannot find NCCL library %s" % str(lib_path))
-  return lib_path
-
+    """Checks whether the header file matches the specified version of NCCL.
+
+    Args:
+      repository_ctx: The repository context.
+      nccl_install_path: The NCCL library install directory.
+      nccl_hdr_path: The NCCL header path.
+      nccl_version: The expected NCCL version.
+
+    Returns:
+      A string containing the library version of NCCL.
+    """
+    header_path = repository_ctx.path("%s/nccl.h" % nccl_hdr_path)
+    if not header_path.exists:
+        header_path = _find_nccl_header(repository_ctx, nccl_install_path)
+    header_dir = str(header_path.realpath.dirname)
+    major_version = find_cuda_define(
+        repository_ctx,
+        header_dir,
+        "nccl.h",
+        _DEFINE_NCCL_MAJOR,
+    )
+    minor_version = find_cuda_define(
+        repository_ctx,
+        header_dir,
+        "nccl.h",
+        _DEFINE_NCCL_MINOR,
+    )
+    patch_version = find_cuda_define(
+        repository_ctx,
+        header_dir,
+        "nccl.h",
+        _DEFINE_NCCL_PATCH,
+    )
+    header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+    if not matches_version(nccl_version, header_version):
+        auto_configure_fail(
+            ("NCCL library version detected from %s/nccl.h (%s) does not match " +
+             "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
+            (header_dir, header_version, nccl_version),
+        )
 
 def _nccl_configure_impl(repository_ctx):
-  """Implementation of the nccl_configure repository rule."""
-  if _TF_NCCL_VERSION not in repository_ctx.os.environ:
-    # Add a dummy build file to make bazel query happy.
-    repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
-    return
-
-  if _TF_NCCL_CONFIG_REPO in repository_ctx.os.environ:
-    # Forward to the pre-configured remote repository.
-    repository_ctx.template("BUILD", _NCCL_REMOTE_BUILD_TEMPLATE, {
-        "%{target}": repository_ctx.os.environ[_TF_NCCL_CONFIG_REPO],
-    })
-    return
-
-  nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
-  if matches_version("1", nccl_version):
-    # Alias to GitHub target from @nccl_archive.
-    if not matches_version(nccl_version, "1.3"):
-      auto_configure_fail(
-          "NCCL from GitHub must use version 1.3 (got %s)" % nccl_version)
-    repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
-  else:
-    # Create target for locally installed NCCL.
-    nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
-    nccl_hdr_path = repository_ctx.os.environ[_NCCL_HDR_PATH].strip()
-    _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_version)
-    repository_ctx.template("BUILD", _NCCL_LOCAL_BUILD_TEMPLATE, {
-        "%{version}": nccl_version,
-        "%{install_path}": nccl_install_path,
-        "%{hdr_path}": nccl_hdr_path,
-    })
-
+    """Implementation of the nccl_configure repository rule."""
+    if _TF_NCCL_VERSION not in repository_ctx.os.environ:
+        # Add a dummy build file to make bazel query happy.
+        repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
+        return
+
+    if _TF_NCCL_CONFIG_REPO in repository_ctx.os.environ:
+        # Forward to the pre-configured remote repository.
+        repository_ctx.template("BUILD", _label("remote.BUILD.tpl"), {
+            "%{target}": repository_ctx.os.environ[_TF_NCCL_CONFIG_REPO],
+        })
+        return
+
+    nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
+    if nccl_version == "":
+        # Alias to open source build from @nccl_archive.
+        repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+
+        # TODO(csigg): implement and reuse in cuda_configure.bzl.
+        gpu_architectures = [
+            "sm_" + capability.replace(".", "")
+            for capability in compute_capabilities(repository_ctx)
+        ]
+
+        # Round-about way to make the list unique.
+        gpu_architectures = dict(zip(gpu_architectures, gpu_architectures)).keys()
+        repository_ctx.template("build_defs.bzl", _label("build_defs.bzl.tpl"), {
+            "%{gpu_architectures}": str(gpu_architectures),
+        })
+
+        repository_ctx.symlink(cuda_toolkit_path(repository_ctx), "cuda")
+
+        # Temporary work-around for setups which symlink ptxas to a newer
+        # version. The versions of nvlink and ptxas need to agree, so we find
+        # nvlink next to the real location of ptxas. This is only temporary and
+        # will be removed again soon.
+        nvlink_dir = repository_ctx.path("cuda/bin/ptxas").realpath.dirname
+        repository_ctx.symlink(nvlink_dir.get_child("nvlink"), "nvlink")
+    else:
+        # Create target for locally installed NCCL.
+        nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
+        nccl_hdr_path = repository_ctx.os.environ[_NCCL_HDR_PATH].strip()
+        _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_version)
+        repository_ctx.template("BUILD", _label("system.BUILD.tpl"), {
+            "%{version}": nccl_version,
+            "%{install_path}": nccl_install_path,
+            "%{hdr_path}": nccl_hdr_path,
+        })
 
 nccl_configure = repository_rule(
-    implementation=_nccl_configure_impl,
-    environ=[
-        _NCCL_INSTALL_PATH,
+    implementation = _nccl_configure_impl,
+    environ = [
+        _CUDA_TOOLKIT_PATH,
         _NCCL_HDR_PATH,
+        _NCCL_INSTALL_PATH,
         _TF_NCCL_VERSION,
+        _TF_CUDA_COMPUTE_CAPABILITIES,
+        _TF_NCCL_CONFIG_REPO,
     ],
 )
 """Detects and configures the NCCL configuration.
diff --git a/third_party/ngraph/ngraph.BUILD b/third_party/ngraph/ngraph.BUILD
index 6602a480afbf29ed9777aa0ab3e10b2860b84d66..63e9548c53262461cfc9c3fd160f4f17430319c7 100644
--- a/third_party/ngraph/ngraph.BUILD
+++ b/third_party/ngraph/ngraph.BUILD
@@ -34,8 +34,9 @@ cc_library(
         "src/ngraph/runtime/cpu/builder/one_hot.cpp",
         "src/ngraph/runtime/cpu/builder/pad.cpp",
         "src/ngraph/runtime/cpu/builder/product.cpp",
-        "src/ngraph/runtime/cpu/builder/quantize.cpp",
+        "src/ngraph/runtime/cpu/builder/quantization.cpp",
         "src/ngraph/runtime/cpu/builder/quantized_avg_pool.cpp",
+        "src/ngraph/runtime/cpu/builder/quantized_conv.cpp",
         "src/ngraph/runtime/cpu/builder/quantized_max_pool.cpp",
         "src/ngraph/runtime/cpu/builder/reduce_function.cpp",
         "src/ngraph/runtime/cpu/builder/reduce_function_window.cpp",
@@ -61,6 +62,7 @@ cc_library(
         "src/ngraph/runtime/cpu/cpu_tensor_view.cpp",
         "src/ngraph/runtime/cpu/cpu_tensor_view_wrapper.cpp",
         "src/ngraph/runtime/cpu/cpu_tracing.cpp",
+        "src/ngraph/runtime/cpu/cpu_visualize_tree.cpp",
         "src/ngraph/runtime/cpu/kernel/eigen_thread_pool.cpp",
         "src/ngraph/runtime/cpu/kernel/pad.cpp",
         "src/ngraph/runtime/cpu/kernel/reduce_max.cpp",
@@ -76,15 +78,11 @@ cc_library(
         "src/ngraph/runtime/cpu/op/conv_bias.cpp",
         "src/ngraph/runtime/cpu/op/conv_relu.cpp",
         "src/ngraph/runtime/cpu/op/convert_layout.cpp",
-        "src/ngraph/runtime/cpu/op/dequantize.cpp",
         "src/ngraph/runtime/cpu/op/group_conv.cpp",
         "src/ngraph/runtime/cpu/op/loop_kernel.cpp",
         "src/ngraph/runtime/cpu/op/lstm.cpp",
         "src/ngraph/runtime/cpu/op/matmul_bias.cpp",
         "src/ngraph/runtime/cpu/op/max_pool_with_indices.cpp",
-        "src/ngraph/runtime/cpu/op/quantize.cpp",
-        "src/ngraph/runtime/cpu/op/quantized_avg_pool.cpp",
-        "src/ngraph/runtime/cpu/op/quantized_max_pool.cpp",
         "src/ngraph/runtime/cpu/op/rnn.cpp",
         "src/ngraph/runtime/cpu/op/sigmoid_mul.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_assignment.cpp",
@@ -99,21 +97,22 @@ cc_library(
         "src/ngraph/runtime/cpu/pass/cpu_workspace_insertion.cpp",
     ],
     hdrs = glob(["src/ngraph/runtime/cpu/**/*.hpp"]) + glob([]),
-    deps = [
-        ":ngraph_headers",
-        "@eigen_archive//:eigen",
-        "@nlohmann_json_lib",
-        "@tbb",
-        "@mkl_dnn//:mkl_dnn",
-    ],
     copts = [
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.8.1\\"',
+        '-D NGRAPH_VERSION=\\"0.9.1\\"',
         "-D NGRAPH_DEX_ONLY",
+        '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
+    deps = [
+        ":ngraph_headers",
+        "@eigen_archive//:eigen",
+        "@mkl_dnn",
+        "@nlohmann_json_lib",
+        "@tbb",
+    ],
     alwayslink = 1,
 )
 
@@ -125,6 +124,11 @@ cc_library(
         "src/ngraph/builder/*.cpp",
         "src/ngraph/descriptor/*.cpp",
         "src/ngraph/descriptor/layout/*.cpp",
+        "src/ngraph/op/experimental/quantized_avg_pool.cpp",
+        "src/ngraph/op/experimental/quantized_conv_bias.cpp",
+        "src/ngraph/op/experimental/quantized_conv_relu.cpp",
+        "src/ngraph/op/experimental/quantized_conv.cpp",
+        "src/ngraph/op/experimental/quantized_max_pool.cpp",
         "src/ngraph/op/*.cpp",
         "src/ngraph/op/util/*.cpp",
         "src/ngraph/pattern/*.cpp",
@@ -134,18 +138,19 @@ cc_library(
         "src/ngraph/runtime/*.cpp",
         "src/ngraph/type/*.cpp",
     ]),
-    deps = [
-        ":ngraph_headers",
-        ":ngraph_cpu_backend",
-        "@eigen_archive//:eigen",
-        "@nlohmann_json_lib",
-    ],
     copts = [
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.8.1\\"',
+        '-D NGRAPH_VERSION=\\"0.9.1\\"',
+        '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
+    deps = [
+        ":ngraph_cpu_backend",
+        ":ngraph_headers",
+        "@eigen_archive//:eigen",
+        "@nlohmann_json_lib",
+    ],
     alwayslink = 1,
 )
diff --git a/third_party/ngraph/ngraph_tf.BUILD b/third_party/ngraph/ngraph_tf.BUILD
index dbedca0a03c68d3099233e45102dd9401ea4359d..db9a66f9b5bcdaa29ec55175f1a8c76ac5f6f22a 100644
--- a/third_party/ngraph/ngraph_tf.BUILD
+++ b/third_party/ngraph/ngraph_tf.BUILD
@@ -10,6 +10,10 @@ load(
 cc_library(
     name = "ngraph_tf",
     srcs = [
+        "logging/ngraph_log.cc",
+        "logging/ngraph_log.h",
+        "logging/tf_graph_writer.cc",
+        "logging/tf_graph_writer.h",
         "src/ngraph_api.cc",
         "src/ngraph_api.h",
         "src/ngraph_assign_clusters.cc",
@@ -41,24 +45,23 @@ cc_library(
         "src/tf_deadness_analysis.h",
         "src/tf_graphcycles.cc",
         "src/tf_graphcycles.h",
-        "logging/ngraph_log.h",
-        "logging/ngraph_log.cc",
-        "logging/tf_graph_writer.h",
-        "logging/tf_graph_writer.cc",
-    ],
-    deps = [
-        "@org_tensorflow//tensorflow/core:protos_all_proto_text",
-        "@org_tensorflow//tensorflow/core:framework_headers_lib",
-        "@org_tensorflow//tensorflow/core:core_cpu_headers_lib",
-        "@ngraph//:ngraph_core",
     ],
     copts = [
         "-I external/ngraph_tf/src",
         "-I external/ngraph_tf/logging",
         "-I external/ngraph/src",
     ],
-    alwayslink = 1,
     visibility = ["//visibility:public"],
+    deps = [
+        "@com_google_absl//absl/container:container_memory",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/types:variant",
+        "@ngraph//:ngraph_core",
+        "@org_tensorflow//tensorflow/core:core_cpu_headers_lib",
+        "@org_tensorflow//tensorflow/core:framework_headers_lib",
+        "@org_tensorflow//tensorflow/core:protos_all_proto_text",
+    ],
+    alwayslink = 1,
 )
 
 tf_cc_test(
@@ -79,6 +82,12 @@ tf_cc_test(
         "test/test_utilities.h",
         "test/tf_exec.cpp",
     ],
+    extra_copts = [
+        "-fexceptions ",
+        "-I external/ngraph_tf/src",
+        "-I external/ngraph_tf/logging",
+        "-I external/ngraph/src",
+    ],
     deps = [
         ":ngraph_tf",
         "@com_google_googletest//:gtest",
@@ -86,10 +95,4 @@ tf_cc_test(
         "@org_tensorflow//tensorflow/cc:client_session",
         "@org_tensorflow//tensorflow/core:tensorflow",
     ],
-    extra_copts = [
-        "-fexceptions ",
-        "-I external/ngraph_tf/src",
-        "-I external/ngraph_tf/logging",
-        "-I external/ngraph/src",
-    ],
 )
diff --git a/third_party/ngraph/tbb.BUILD b/third_party/ngraph/tbb.BUILD
index 04e6544ffb579a94db2ffeed123068a64afbfcb7..c78a2d79ddfff53ddede0a70427dac89d08fbdcc 100644
--- a/third_party/ngraph/tbb.BUILD
+++ b/third_party/ngraph/tbb.BUILD
@@ -14,6 +14,10 @@ genrule(
     srcs = glob(["**"]) + [
         "@local_config_cc//:toolchain",
     ],
+    outs = [
+        "libtbb.a",
+        "libtbbmalloc.a",
+    ],
     cmd = """
 	    set -e
 	    WORK_DIR=$$PWD
@@ -45,19 +49,15 @@ genrule(
         cp build/build_{release,debug}/*.a $$DEST_DIR
 		cd $$WORK_DIR
 	""",
-    outs = [
-        "libtbb.a",
-        "libtbbmalloc.a",
-    ],
 )
 
 cc_library(
     name = "tbb",
+    srcs = ["libtbb.a"],
     hdrs = glob([
         "include/serial/**",
         "include/tbb/**/**",
     ]),
-    srcs = ["libtbb.a"],
     includes = ["include"],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/ortools/BUILD b/third_party/ortools/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2f5d02becb930602574c4df02c51cec7662bc25d
--- /dev/null
+++ b/third_party/ortools/BUILD
@@ -0,0 +1 @@
+# Dummy BUILD file to make this directory a package.
diff --git a/third_party/ortools.BUILD b/third_party/ortools/BUILD.bazel
similarity index 100%
rename from third_party/ortools.BUILD
rename to third_party/ortools/BUILD.bazel
diff --git a/third_party/ortools/workspace.bzl b/third_party/ortools/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..e1c64b4390c8ca52211ac921c6d31791b8e8fef1
--- /dev/null
+++ b/third_party/ortools/workspace.bzl
@@ -0,0 +1,15 @@
+"""loads the aws library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "ortools_archive",
+        urls = [
+            "https://mirror.bazel.build/github.com/google/or-tools/archive/v6.7.2.tar.gz",
+            "https://github.com/google/or-tools/archive/v6.7.2.tar.gz",
+        ],
+        sha256 = "d025a95f78b5fc5eaa4da5f395f23d11c23cf7dbd5069f1f627f002de87b86b9",
+        strip_prefix = "or-tools-6.7.2/src",
+        build_file = "//third_party/ortools:BUILD.bazel",
+    )
diff --git a/third_party/png.BUILD b/third_party/png.BUILD
index c26a2897176e57220b42b7d2cc5b61d114ecfc5f..e82948648e42e14e97238726e7db5a932bbea946 100644
--- a/third_party/png.BUILD
+++ b/third_party/png.BUILD
@@ -44,11 +44,11 @@ cc_library(
         "png.h",
         "pngconf.h",
     ],
-    includes = ["."],
     copts = select({
         ":windows": ["-DPNG_INTEL_SSE_OPT=1"],
         "//conditions:default": [],
     }),
+    includes = ["."],
     linkopts = select({
         ":windows": [],
         "//conditions:default": ["-lm"],
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
index 6e30618d39e5d4cedde61c080d987cef963f7d55..bad6d20a08c0ee27345bf16a5a4f7c9e4d67a05f 100644
--- a/third_party/repo.bzl
+++ b/third_party/repo.bzl
@@ -26,7 +26,7 @@ def _wrap_bash_cmd(ctx, cmd):
         bazel_sh = _get_env_var(ctx, "BAZEL_SH")
         if not bazel_sh:
             fail("BAZEL_SH environment variable is not set")
-        cmd = [bazel_sh, "-l", "-c", " ".join(cmd)]
+        cmd = [bazel_sh, "-l", "-c", " ".join(["\"%s\"" % s for s in cmd])]
     return cmd
 
 def _get_env_var(ctx, name):
@@ -47,7 +47,7 @@ def _use_system_lib(ctx, name):
 # Executes specified command with arguments and calls 'fail' if it exited with
 # non-zero code
 def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
-    result = repo_ctx.execute(cmd_and_args, timeout = 10)
+    result = repo_ctx.execute(cmd_and_args, timeout = 60)
     if result.return_code != 0:
         fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n" +
               "Stderr: {3}").format(
@@ -84,7 +84,7 @@ def _apply_delete(ctx, paths):
 def _tf_http_archive(ctx):
     if ("mirror.bazel.build" not in ctx.attr.urls[0] and
         (len(ctx.attr.urls) < 2 and
-         ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+         ctx.attr.name not in _SINGLE_URL_WHITELIST.to_list())):
         fail("tf_http_archive(urls) must have redundant URLs. The " +
              "mirror.bazel.build URL must be present and it must come first. " +
              "Even if you don't have permission to mirror the file, please " +
@@ -150,7 +150,7 @@ ensure best practices are followed.
 def _third_party_http_archive(ctx):
     if ("mirror.bazel.build" not in ctx.attr.urls[0] and
         (len(ctx.attr.urls) < 2 and
-         ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+         ctx.attr.name not in _SINGLE_URL_WHITELIST.to_list())):
         fail("tf_http_archive(urls) must have redundant URLs. The " +
              "mirror.bazel.build URL must be present and it must come first. " +
              "Even if you don't have permission to mirror the file, please " +
diff --git a/third_party/systemlibs/protobuf.BUILD b/third_party/systemlibs/protobuf.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4b1cf396b9b7abef8feaa653c7c71e9e8a9e304e
--- /dev/null
+++ b/third_party/systemlibs/protobuf.BUILD
@@ -0,0 +1,104 @@
+load(
+    "@protobuf_archive//:protobuf.bzl",
+    "proto_gen",
+    "py_proto_library",
+    "cc_proto_library",
+)
+
+licenses(["notice"])
+
+filegroup(
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+HEADERS = [
+    "google/protobuf/any.pb.h",
+    "google/protobuf/any.proto",
+    "google/protobuf/arena.h",
+    "google/protobuf/compiler/importer.h",
+    "google/protobuf/descriptor.h",
+    "google/protobuf/descriptor.pb.h",
+    "google/protobuf/descriptor.proto",
+    "google/protobuf/duration.pb.h",
+    "google/protobuf/duration.proto",
+    "google/protobuf/dynamic_message.h",
+    "google/protobuf/empty.pb.h",
+    "google/protobuf/empty.proto",
+    "google/protobuf/field_mask.pb.h",
+    "google/protobuf/field_mask.proto",
+    "google/protobuf/io/coded_stream.h",
+    "google/protobuf/io/zero_copy_stream.h",
+    "google/protobuf/io/zero_copy_stream_impl_lite.h",
+    "google/protobuf/map.h",
+    "google/protobuf/repeated_field.h",
+    "google/protobuf/text_format.h",
+    "google/protobuf/timestamp.pb.h",
+    "google/protobuf/timestamp.proto",
+    "google/protobuf/util/json_util.h",
+    "google/protobuf/util/type_resolver_util.h",
+    "google/protobuf/wrappers.pb.h",
+    "google/protobuf/wrappers.proto",
+]
+
+genrule(
+    name = "link_headers",
+    outs = HEADERS,
+    cmd = """
+      for i in $(OUTS); do
+        f=$${i#$(@D)/}
+        mkdir -p $(@D)/$${f%/*}
+        ln -sf $(INCLUDEDIR)/$$f $(@D)/$$f
+      done
+    """,
+)
+
+cc_library(
+    name = "protobuf",
+    hdrs = HEADERS,
+    linkopts = ["-lprotobuf"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "protobuf_headers",
+    hdrs = HEADERS,
+    linkopts = ["-lprotobuf"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "protoc_lib",
+    linkopts = ["-lprotoc"],
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "protoc",
+    outs = ["protoc.bin"],
+    cmd = "ln -s $$(which protoc) $@",
+    executable = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_proto_library(
+    name = "cc_wkt_protos",
+    hdrs = HEADERS,
+    internal_bootstrap_hack = 1,
+    protoc = ":protoc",
+    visibility = ["//visibility:public"],
+)
+
+proto_gen(
+    name = "protobuf_python_genproto",
+    includes = ["."],
+    protoc = "@protobuf_archive//:protoc",
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "protobuf_python",
+    data = [":link_headers"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/systemlibs/protobuf.bzl b/third_party/systemlibs/protobuf.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..2aa75610a9313d12daeb7406ea0107e53231e814
--- /dev/null
+++ b/third_party/systemlibs/protobuf.bzl
@@ -0,0 +1,425 @@
+def _GetPath(ctx, path):
+    if ctx.label.workspace_root:
+        return ctx.label.workspace_root + "/" + path
+    else:
+        return path
+
+def _IsNewExternal(ctx):
+    # Bazel 0.4.4 and older have genfiles paths that look like:
+    #   bazel-out/local-fastbuild/genfiles/external/repo/foo
+    # After the exec root rearrangement, they look like:
+    #   ../repo/bazel-out/local-fastbuild/genfiles/foo
+    return ctx.label.workspace_root.startswith("../")
+
+def _GenDir(ctx):
+    if _IsNewExternal(ctx):
+        # We are using the fact that Bazel 0.4.4+ provides repository-relative paths
+        # for ctx.genfiles_dir.
+        return ctx.genfiles_dir.path + (
+            "/" + ctx.attr.includes[0] if ctx.attr.includes and ctx.attr.includes[0] else ""
+        )
+
+    # This means that we're either in the old version OR the new version in the local repo.
+    # Either way, appending the source path to the genfiles dir works.
+    return ctx.var["GENDIR"] + "/" + _SourceDir(ctx)
+
+def _SourceDir(ctx):
+    if not ctx.attr.includes:
+        return ctx.label.workspace_root
+    if not ctx.attr.includes[0]:
+        return _GetPath(ctx, ctx.label.package)
+    if not ctx.label.package:
+        return _GetPath(ctx, ctx.attr.includes[0])
+    return _GetPath(ctx, ctx.label.package + "/" + ctx.attr.includes[0])
+
+def _CcHdrs(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + ".pb.h" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + ".grpc.pb.h" for s in srcs]
+    return ret
+
+def _CcSrcs(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + ".pb.cc" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
+    return ret
+
+def _CcOuts(srcs, use_grpc_plugin = False):
+    return _CcHdrs(srcs, use_grpc_plugin) + _CcSrcs(srcs, use_grpc_plugin)
+
+def _PyOuts(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
+    return ret
+
+def _RelativeOutputPath(path, include, dest = ""):
+    if include == None:
+        return path
+
+    if not path.startswith(include):
+        fail("Include path %s isn't part of the path %s." % (include, path))
+
+    if include and include[-1] != "/":
+        include = include + "/"
+    if dest and dest[-1] != "/":
+        dest = dest + "/"
+
+    path = path[len(include):]
+    return dest + path
+
+def _proto_gen_impl(ctx):
+    """General implementation for generating protos"""
+    srcs = ctx.files.srcs
+    deps = []
+    deps += ctx.files.srcs
+    source_dir = _SourceDir(ctx)
+    gen_dir = _GenDir(ctx)
+    if source_dir:
+        import_flags = ["-I" + source_dir, "-I" + gen_dir]
+    else:
+        import_flags = ["-I."]
+
+    for dep in ctx.attr.deps:
+        import_flags += dep.proto.import_flags
+        deps += dep.proto.deps
+
+    args = []
+    if ctx.attr.gen_cc:
+        args += ["--cpp_out=" + gen_dir]
+    if ctx.attr.gen_py:
+        args += ["--python_out=" + gen_dir]
+
+    inputs = srcs + deps
+    if ctx.executable.plugin:
+        plugin = ctx.executable.plugin
+        lang = ctx.attr.plugin_language
+        if not lang and plugin.basename.startswith("protoc-gen-"):
+            lang = plugin.basename[len("protoc-gen-"):]
+        if not lang:
+            fail("cannot infer the target language of plugin", "plugin_language")
+
+        outdir = gen_dir
+        if ctx.attr.plugin_options:
+            outdir = ",".join(ctx.attr.plugin_options) + ":" + outdir
+        args += ["--plugin=protoc-gen-%s=%s" % (lang, plugin.path)]
+        args += ["--%s_out=%s" % (lang, outdir)]
+        inputs += [plugin]
+
+    if args:
+        ctx.action(
+            inputs = inputs,
+            outputs = ctx.outputs.outs,
+            arguments = args + import_flags + [s.path for s in srcs],
+            executable = ctx.executable.protoc,
+            mnemonic = "ProtoCompile",
+            use_default_shell_env = True,
+        )
+
+    return struct(
+        proto = struct(
+            srcs = srcs,
+            import_flags = import_flags,
+            deps = deps,
+        ),
+    )
+
+proto_gen = rule(
+    attrs = {
+        "srcs": attr.label_list(allow_files = True),
+        "deps": attr.label_list(providers = ["proto"]),
+        "includes": attr.string_list(),
+        "protoc": attr.label(
+            cfg = "host",
+            executable = True,
+            single_file = True,
+            mandatory = True,
+        ),
+        "plugin": attr.label(
+            cfg = "host",
+            allow_files = True,
+            executable = True,
+        ),
+        "plugin_language": attr.string(),
+        "plugin_options": attr.string_list(),
+        "gen_cc": attr.bool(),
+        "gen_py": attr.bool(),
+        "outs": attr.output_list(),
+    },
+    output_to_genfiles = True,
+    implementation = _proto_gen_impl,
+)
+"""Generates codes from Protocol Buffers definitions.
+
+This rule helps you to implement Skylark macros specific to the target
+language. You should prefer more specific `cc_proto_library `,
+`py_proto_library` and others unless you are adding such wrapper macros.
+
+Args:
+  srcs: Protocol Buffers definition files (.proto) to run the protocol compiler
+    against.
+  deps: a list of dependency labels; must be other proto libraries.
+  includes: a list of include paths to .proto files.
+  protoc: the label of the protocol compiler to generate the sources.
+  plugin: the label of the protocol compiler plugin to be passed to the protocol
+    compiler.
+  plugin_language: the language of the generated sources
+  plugin_options: a list of options to be passed to the plugin
+  gen_cc: generates C++ sources in addition to the ones from the plugin.
+  gen_py: generates Python sources in addition to the ones from the plugin.
+  outs: a list of labels of the expected outputs from the protocol compiler.
+"""
+
+def cc_proto_library(
+        name,
+        srcs = [],
+        deps = [],
+        cc_libs = [],
+        include = None,
+        protoc = "@com_google_protobuf//:protoc",
+        internal_bootstrap_hack = False,
+        use_grpc_plugin = False,
+        default_runtime = "@com_google_protobuf//:protobuf",
+        **kargs):
+    """Bazel rule to create a C++ protobuf library from proto source files
+
+    NOTE: the rule is only an internal workaround to generate protos. The
+    interface may change and the rule may be removed when bazel has introduced
+    the native rule.
+
+    Args:
+      name: the name of the cc_proto_library.
+      srcs: the .proto files of the cc_proto_library.
+      deps: a list of dependency labels; must be cc_proto_library.
+      cc_libs: a list of other cc_library targets depended by the generated
+          cc_library.
+      include: a string indicating the include path of the .proto files.
+      protoc: the label of the protocol compiler to generate the sources.
+      internal_bootstrap_hack: a flag indicate the cc_proto_library is used only
+          for bootstraping. When it is set to True, no files will be generated.
+          The rule will simply be a provider for .proto files, so that other
+          cc_proto_library can depend on it.
+      use_grpc_plugin: a flag to indicate whether to call the grpc C++ plugin
+          when processing the proto files.
+      default_runtime: the implicitly default runtime which will be depended on by
+          the generated cc_library target.
+      **kargs: other keyword arguments that are passed to cc_library.
+
+    """
+
+    includes = []
+    if include != None:
+        includes = [include]
+
+    if internal_bootstrap_hack:
+        # For pre-checked-in generated files, we add the internal_bootstrap_hack
+        # which will skip the codegen action.
+        proto_gen(
+            name = name + "_genproto",
+            srcs = srcs,
+            deps = [s + "_genproto" for s in deps],
+            includes = includes,
+            protoc = protoc,
+            visibility = ["//visibility:public"],
+        )
+
+        # An empty cc_library to make rule dependency consistent.
+        native.cc_library(
+            name = name,
+            **kargs
+        )
+        return
+
+    grpc_cpp_plugin = None
+    if use_grpc_plugin:
+        grpc_cpp_plugin = "//external:grpc_cpp_plugin"
+
+    gen_srcs = _CcSrcs(srcs, use_grpc_plugin)
+    gen_hdrs = _CcHdrs(srcs, use_grpc_plugin)
+    outs = gen_srcs + gen_hdrs
+
+    proto_gen(
+        name = name + "_genproto",
+        srcs = srcs,
+        deps = [s + "_genproto" for s in deps],
+        includes = includes,
+        protoc = protoc,
+        plugin = grpc_cpp_plugin,
+        plugin_language = "grpc",
+        gen_cc = 1,
+        outs = outs,
+        visibility = ["//visibility:public"],
+    )
+
+    if default_runtime and not default_runtime in cc_libs:
+        cc_libs = cc_libs + [default_runtime]
+    if use_grpc_plugin:
+        cc_libs = cc_libs + ["//external:grpc_lib"]
+
+    native.cc_library(
+        name = name,
+        srcs = gen_srcs,
+        hdrs = gen_hdrs,
+        deps = cc_libs + deps,
+        includes = includes,
+        **kargs
+    )
+
+def internal_gen_well_known_protos_java(srcs):
+    """Bazel rule to generate the gen_well_known_protos_java genrule
+
+    Args:
+      srcs: the well known protos
+    """
+    root = Label("%s//protobuf_java" % (REPOSITORY_NAME)).workspace_root
+    pkg = PACKAGE_NAME + "/" if PACKAGE_NAME else ""
+    if root == "":
+        include = " -I%ssrc " % pkg
+    else:
+        include = " -I%s/%ssrc " % (root, pkg)
+    native.genrule(
+        name = "gen_well_known_protos_java",
+        srcs = srcs,
+        outs = [
+            "wellknown.srcjar",
+        ],
+        cmd = "$(location :protoc) --java_out=$(@D)/wellknown.jar" +
+              " %s $(SRCS) " % include +
+              " && mv $(@D)/wellknown.jar $(@D)/wellknown.srcjar",
+        tools = [":protoc"],
+    )
+
+def internal_copied_filegroup(name, srcs, strip_prefix, dest, **kwargs):
+    """Macro to copy files to a different directory and then create a filegroup.
+
+    This is used by the //:protobuf_python py_proto_library target to work around
+    an issue caused by Python source files that are part of the same Python
+    package being in separate directories.
+
+    Args:
+      srcs: The source files to copy and add to the filegroup.
+      strip_prefix: Path to the root of the files to copy.
+      dest: The directory to copy the source files into.
+      **kwargs: extra arguments that will be passesd to the filegroup.
+    """
+    outs = [_RelativeOutputPath(s, strip_prefix, dest) for s in srcs]
+
+    native.genrule(
+        name = name + "_genrule",
+        srcs = srcs,
+        outs = outs,
+        cmd = " && ".join(
+            ["cp $(location %s) $(location %s)" %
+             (s, _RelativeOutputPath(s, strip_prefix, dest)) for s in srcs],
+        ),
+    )
+
+    native.filegroup(
+        name = name,
+        srcs = outs,
+        **kwargs
+    )
+
+def py_proto_library(
+        name,
+        srcs = [],
+        deps = [],
+        py_libs = [],
+        py_extra_srcs = [],
+        include = None,
+        default_runtime = "@com_google_protobuf//:protobuf_python",
+        protoc = "@com_google_protobuf//:protoc",
+        use_grpc_plugin = False,
+        **kargs):
+    """Bazel rule to create a Python protobuf library from proto source files
+
+    NOTE: the rule is only an internal workaround to generate protos. The
+    interface may change and the rule may be removed when bazel has introduced
+    the native rule.
+
+    Args:
+      name: the name of the py_proto_library.
+      srcs: the .proto files of the py_proto_library.
+      deps: a list of dependency labels; must be py_proto_library.
+      py_libs: a list of other py_library targets depended by the generated
+          py_library.
+      py_extra_srcs: extra source files that will be added to the output
+          py_library. This attribute is used for internal bootstrapping.
+      include: a string indicating the include path of the .proto files.
+      default_runtime: the implicitly default runtime which will be depended on by
+          the generated py_library target.
+      protoc: the label of the protocol compiler to generate the sources.
+      use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
+          when processing the proto files.
+      **kargs: other keyword arguments that are passed to cc_library.
+
+    """
+    outs = _PyOuts(srcs, use_grpc_plugin)
+
+    includes = []
+    if include != None:
+        includes = [include]
+
+    grpc_python_plugin = None
+    if use_grpc_plugin:
+        grpc_python_plugin = "//external:grpc_python_plugin"
+        # Note: Generated grpc code depends on Python grpc module. This dependency
+        # is not explicitly listed in py_libs. Instead, host system is assumed to
+        # have grpc installed.
+
+    proto_gen(
+        name = name + "_genproto",
+        srcs = srcs,
+        deps = [s + "_genproto" for s in deps],
+        includes = includes,
+        protoc = protoc,
+        gen_py = 1,
+        outs = outs,
+        visibility = ["//visibility:public"],
+        plugin = grpc_python_plugin,
+        plugin_language = "grpc",
+    )
+
+    if default_runtime and not default_runtime in py_libs + deps:
+        py_libs = py_libs + [default_runtime]
+
+    native.py_library(
+        name = name,
+        srcs = outs + py_extra_srcs,
+        deps = py_libs + deps,
+        imports = includes,
+        **kargs
+    )
+
+def internal_protobuf_py_tests(
+        name,
+        modules = [],
+        **kargs):
+    """Bazel rules to create batch tests for protobuf internal.
+
+    Args:
+      name: the name of the rule.
+      modules: a list of modules for tests. The macro will create a py_test for
+          each of the parameter with the source "google/protobuf/%s.py"
+      kargs: extra parameters that will be passed into the py_test.
+
+    """
+    for m in modules:
+        s = "python/google/protobuf/internal/%s.py" % m
+        native.py_test(
+            name = "py_%s" % m,
+            srcs = [s],
+            main = s,
+            **kargs
+        )
+
+def check_protobuf_required_bazel_version():
+    """For WORKSPACE files, to check the installed version of bazel.
+
+    This ensures bazel supports our approach to proto_library() depending on a
+    copied filegroup. (Fixed in bazel 0.5.4)
+    """
+    expected = apple_common.dotted_version("0.5.4")
+    current = apple_common.dotted_version(native.bazel_version)
+    if current.compare_to(expected) < 0:
+        fail("Bazel must be newer than 0.5.4")
diff --git a/third_party/systemlibs/syslibs_configure.bzl b/third_party/systemlibs/syslibs_configure.bzl
index b03d3380d797a344c9b9f1aa3133de6aeebedb17..645d242c96c02a6a90b84334af1ac2fd11e437da 100644
--- a/third_party/systemlibs/syslibs_configure.bzl
+++ b/third_party/systemlibs/syslibs_configure.bzl
@@ -15,6 +15,8 @@ VALID_LIBS = [
     "boringssl",
     "com_github_googleapis_googleapis",
     "com_github_googlecloudplatform_google_cloud_cpp",
+    "com_google_protobuf",
+    "com_google_protobuf_cc",
     "com_googlesource_code_re2",
     "curl",
     "cython",
@@ -23,6 +25,7 @@ VALID_LIBS = [
     "gast_archive",
     "gif_archive",
     "grpc",
+    "icu",
     "jpeg",
     "jsoncpp_git",
     "lmdb",
@@ -31,6 +34,7 @@ VALID_LIBS = [
     "org_sqlite",
     "pcre",
     "png_archive",
+    "protobuf_archive",
     "six_archive",
     "snappy",
     "swig",
diff --git a/third_party/tflite_mobilenet_float.BUILD b/third_party/tflite_mobilenet_float.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de47ed61f9db9ad980468aa325e3c770e0aae4f1
--- /dev/null
+++ b/third_party/tflite_mobilenet_float.BUILD
@@ -0,0 +1,12 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/third_party/tflite_mobilenet_quant.BUILD b/third_party/tflite_mobilenet_quant.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de47ed61f9db9ad980468aa325e3c770e0aae4f1
--- /dev/null
+++ b/third_party/tflite_mobilenet_quant.BUILD
@@ -0,0 +1,12 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD
index bcbc4dda11aceed9aa47ff8906f45d9e6180b3c8..9da417fd5fe18619de6dc51032b8e3cde21b6ffb 100644
--- a/third_party/toolchains/BUILD
+++ b/third_party/toolchains/BUILD
@@ -2,6 +2,8 @@ licenses(["restricted"])
 
 package(default_visibility = ["//visibility:public"])
 
+load("//third_party/toolchains/preconfig/generate:containers.bzl", "container_digests")
+
 # Platform for use with remote execution with
 # custom container based off RBE Ubuntu16_04
 # http://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04
@@ -17,7 +19,7 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-ubuntu16_04-tf@sha256:495a025ed5e273cfa5d53357ef93ac20500c008994e0be106c509f51555fb93c"
+            value:"docker://gcr.io/asci-toolchain/nosla-ubuntu16_04-tf@sha256:63a0e981a4e7ce5da2a851cf063e430f72947fd999d9336b7e54e2eebe8e0bf5"
         }""",
 )
 
@@ -30,6 +32,19 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@sha256:e5099ff15650986e268a43ee99e2d2b7ffe2459b8b6935385078d1d3b2ed4d02"
-        }""",
+            value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@%s"
+        }""" % container_digests["cuda9.0-cudnn7-ubuntu14.04"],
+)
+
+platform(
+    name = "rbe_cuda10.0-cudnn7-ubuntu14.04",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+    ],
+    remote_execution_properties = """
+        properties: {
+            name: "container-image"
+            value:"docker://gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
+        }""" % container_digests["cuda10.0-cudnn7-ubuntu14.04"],
 )
diff --git a/third_party/toolchains/gpus/cuda/BUILD b/third_party/toolchains/gpus/cuda/BUILD
index f59e025019caffa333a1570b572dd7f0d9913923..f63a0ea81925783085b1b551aab778d41ba1fb2c 100644
--- a/third_party/toolchains/gpus/cuda/BUILD
+++ b/third_party/toolchains/gpus/cuda/BUILD
@@ -1258,7 +1258,7 @@ genrule(
         "cuda/lib/libcupti.so.9.0",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
    """,
 )
 
diff --git a/third_party/toolchains/preconfig/generate/BUILD b/third_party/toolchains/preconfig/generate/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7e3e93d6004894029135f3151a282bcc43b8938f
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/BUILD
@@ -0,0 +1,35 @@
+licenses(["restricted"])
+
+load(":generate.bzl", "tensorflow_rbe_config")
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-gcc-cuda9.0-cudnn7-nccl2",
+    compiler = "gcc",
+    cuda_version = "9.0",
+    cudnn_version = "7",
+    python_version = "3",
+)
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-clang-cuda9.0-cudnn7-nccl2",
+    compiler = "clang",
+    cuda_version = "9.0",
+    cudnn_version = "7",
+    python_version = "3",
+)
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-gcc-cuda10.0-cudnn7-nccl2",
+    compiler = "gcc",
+    cuda_version = "10.0",
+    cudnn_version = "7",
+    python_version = "3",
+)
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-clang-cuda10.0-cudnn7-nccl2",
+    compiler = "clang",
+    cuda_version = "10.0",
+    cudnn_version = "7",
+    python_version = "3",
+)
diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..7099b9bf3e4715706cbe725373add4cc98d304b8
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/containers.bzl
@@ -0,0 +1,4 @@
+container_digests = {
+    "cuda9.0-cudnn7-ubuntu14.04": "sha256:c26138f4c38c754da2bad44a8a068523abf7fbd71d58a57ce92e5342c5431bf5",
+    "cuda10.0-cudnn7-ubuntu14.04": "sha256:66e7d592c8149291d5562a0f3093655a15b09c22e0eb30a87b3b6469b7a30ffc",
+}
diff --git a/third_party/toolchains/preconfig/generate/generate.bzl b/third_party/toolchains/preconfig/generate/generate.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..2fb3a94cdca7430b522939266a4b2b398a65df8d
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/generate.bzl
@@ -0,0 +1,46 @@
+load(
+    "@bazel_toolchains//rules:docker_config.bzl",
+    "docker_toolchain_autoconfig",
+)
+
+def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, compiler):
+    docker_toolchain_autoconfig(
+        name = name,
+        base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version),
+        bazel_version = "0.16.1",
+        config_repos = [
+            "local_config_cuda",
+            "local_config_python",
+            "local_config_nccl",
+        ],
+        env = {
+            "ABI_VERSION": "gcc",
+            "ABI_LIBC_VERSION": "glibc_2.19",
+            "BAZEL_COMPILER": compiler,
+            "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
+            "BAZEL_TARGET_LIBC": "glibc_2.19",
+            "BAZEL_TARGET_CPU": "k8",
+            "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
+            "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
+            "CC": compiler,
+            "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version,
+            "TF_NEED_CUDA": "1",
+            "TF_CUDA_CLANG": "1" if compiler == "clang" else "0",
+            "CLEAR_CACHE": "1",
+            "TF_CUDA_COMPUTE_CAPABILITIES": "3.0",
+            "TF_ENABLE_XLA": "1",
+            "TF_CUDNN_VERSION": cudnn_version,
+            "TF_CUDA_VERSION": cuda_version,
+            "NCCL_INSTALL_PATH": "/usr/lib",
+            "NCCL_HDR_PATH": "/usr/include",
+            "TF_NCCL_VERSION": "2",
+            "CUDNN_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
+        },
+        # TODO(klimek): We should use the sources that we currently work on, not
+        # just the latest snapshot of tensorflow that is checked in.
+        git_repo = "https://github.com/tensorflow/tensorflow",
+        tags = ["manual"],
+        incompatible_changes_off = True,
+    )
+
+tensorflow_rbe_config = _tensorflow_rbe_config
diff --git a/third_party/toolchains/preconfig/generate/generate.sh b/third_party/toolchains/preconfig/generate/generate.sh
new file mode 100755
index 0000000000000000000000000000000000000000..37c5211278abf243ab388d83688e6c8c7888cea3
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/generate.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+TARGET="$1"
+OUTPUT="$2"
+
+if [[ -z "${TARGET}" || -z "${OUTPUT}" ]]; then
+  echo "Usage:"
+  echo "$0 <target> <output>"
+  exit 1
+fi
+
+TEMPDIR="$(mktemp -d)"
+ROOT="${PWD}"
+PKG="third_party/toolchains/preconfig"
+IFS='-' read -ra PLATFORM <<< "${TARGET}"
+OS="${PLATFORM[0]}"
+PY_VERSION="${PLATFORM[1]}"
+COMPILER="${PLATFORM[2]}"
+CUDA_VERSION="${PLATFORM[3]}"
+CUDNN_VERSION="${PLATFORM[4]}"
+NCCL_VERSION="${PLATFORM[5]}"
+
+if [[ "${COMPILER}" == "gcc" ]]; then
+  COMPILER="gcc-nvcc-${CUDA_VERSION}"
+fi
+
+echo "OS: ${OS}"
+echo "Python: ${PY_VERSION}"
+echo "Compiler: ${COMPILER}"
+echo "CUDA: ${CUDA_VERSION}"
+echo "CUDNN: ${CUDNN_VERSION}"
+echo "NCCL: ${NCCL_VERSION}"
+
+bazel build "${PKG}/generate:${TARGET}"
+cd "${TEMPDIR}"
+tar xvf "${ROOT}/bazel-bin/${PKG}/generate/${TARGET}_outputs.tar"
+
+# Delete all empty files: configurations leave empty files around when they are
+# unnecessary.
+find . -empty -delete
+
+# We build up the following directory structure with preconfigured packages:
+# <OS>/
+#   <CUDA>-<CUDNN>/
+#   <COMPILER>/
+#   <NCCL>/
+#   <PYTHON>/
+
+# Create our toplevel output directory for the OS.
+mkdir "${OS}"
+
+# Python:
+mv local_config_python "${OS}/${PY_VERSION}"
+
+# NCCL:
+mv local_config_nccl "${OS}/${NCCL_VERSION}"
+
+# Compiler:
+mv local_config_cuda/crosstool "${OS}/${COMPILER}"
+
+# CUDA:
+mv local_config_cuda "${OS}/${CUDA_VERSION}-${CUDNN_VERSION}"
+
+# Cleanup for copybara.
+find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs buildifier
+find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs -I {} mv {} {}.oss
+
+# Tar it up:
+tar cvf "${OUTPUT}" "${OS}"
+
diff --git a/third_party/toolchains/preconfig/generate/workspace.bzl b/third_party/toolchains/preconfig/generate/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..f30c2f1ae6318c645e174617a74b8fdadac1598e
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/workspace.bzl
@@ -0,0 +1,25 @@
+load(
+    "@io_bazel_rules_docker//container:container.bzl",
+    "container_pull",
+    container_repositories = "repositories",
+)
+load(":containers.bzl", "container_digests")
+
+def _remote_config_workspace():
+    container_repositories()
+
+    container_pull(
+        name = "cuda9.0-cudnn7-ubuntu14.04",
+        registry = "gcr.io",
+        repository = "asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04",
+        digest = container_digests["cuda9.0-cudnn7-ubuntu14.04"],
+    )
+
+    container_pull(
+        name = "cuda10.0-cudnn7-ubuntu14.04",
+        registry = "gcr.io",
+        repository = "asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04",
+        digest = container_digests["cuda10.0-cudnn7-ubuntu14.04"],
+    )
+
+remote_config_workspace = _remote_config_workspace
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/WORKSPACE b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..b61f572d6d2e4155a1b8c889407f9e0cb54b7674
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for cuda_configure rule
+workspace(name = "local_config_cuda")
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..c813efccf9b82578984b33d04fd513030c83e0b1
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
@@ -0,0 +1,1275 @@
+licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "using_nvcc",
+    values = {
+        "define": "using_cuda_nvcc=true",
+    },
+)
+
+config_setting(
+    name = "using_clang",
+    values = {
+        "define": "using_cuda_clang=true",
+    },
+)
+
+# Equivalent to using_clang && -c opt.
+config_setting(
+    name = "using_clang_opt",
+    values = {
+        "define": "using_cuda_clang=true",
+        "compilation_mode": "opt",
+    },
+)
+
+config_setting(
+    name = "darwin",
+    values = {"cpu": "darwin"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "freebsd",
+    values = {"cpu": "freebsd"},
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cuda_headers",
+    hdrs = [
+        "cuda/cuda_config.h",
+        ":cuda-include",
+        ":cudnn-include",
+    ],
+    includes = [
+        ".",
+        "cuda/include",
+        "cuda/include/crt",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudart_static",
+    srcs = ["cuda/lib/libcudart_static.a"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkopts = select({
+        ":freebsd": [],
+        "//conditions:default": ["-ldl"],
+    }) + [
+        "-lpthread",
+        "-lrt",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cuda_driver",
+    srcs = ["cuda/lib/libcuda.so"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudart",
+    srcs = ["cuda/lib/libcudart.so.10.0"],
+    data = ["cuda/lib/libcudart.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cublas",
+    srcs = ["cuda/lib/libcublas.so.10.0"],
+    data = ["cuda/lib/libcublas.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cusolver",
+    srcs = ["cuda/lib/libcusolver.so.10.0"],
+    data = ["cuda/lib/libcusolver.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkopts = ["-lgomp"],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudnn",
+    srcs = ["cuda/lib/libcudnn.so.7"],
+    data = ["cuda/lib/libcudnn.so.7"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudnn_header",
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cufft",
+    srcs = ["cuda/lib/libcufft.so.10.0"],
+    data = ["cuda/lib/libcufft.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "curand",
+    srcs = ["cuda/lib/libcurand.so.10.0"],
+    data = ["cuda/lib/libcurand.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cuda",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cublas",
+        ":cuda_headers",
+        ":cudart",
+        ":cudnn",
+        ":cufft",
+        ":curand",
+    ],
+)
+
+cc_library(
+    name = "cupti_headers",
+    hdrs = [
+        "cuda/cuda_config.h",
+        ":cuda-extras",
+    ],
+    includes = [
+        ".",
+        "cuda/extras/CUPTI/include/",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cupti_dsos",
+    data = ["cuda/lib/libcupti.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "libdevice_root",
+    data = [":cuda-nvvm"],
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "cuda-include",
+    outs = [
+        "cuda/include/CL/cl.h",
+        "cuda/include/CL/cl.hpp",
+        "cuda/include/CL/cl_egl.h",
+        "cuda/include/CL/cl_ext.h",
+        "cuda/include/CL/cl_gl.h",
+        "cuda/include/CL/cl_gl_ext.h",
+        "cuda/include/CL/cl_platform.h",
+        "cuda/include/CL/opencl.h",
+        "cuda/include/builtin_types.h",
+        "cuda/include/channel_descriptor.h",
+        "cuda/include/common_functions.h",
+        "cuda/include/cooperative_groups.h",
+        "cuda/include/cooperative_groups_helpers.h",
+        "cuda/include/crt/common_functions.h",
+        "cuda/include/crt/device_double_functions.h",
+        "cuda/include/crt/device_double_functions.hpp",
+        "cuda/include/crt/device_functions.h",
+        "cuda/include/crt/device_functions.hpp",
+        "cuda/include/crt/func_macro.h",
+        "cuda/include/crt/host_config.h",
+        "cuda/include/crt/host_defines.h",
+        "cuda/include/crt/host_runtime.h",
+        "cuda/include/crt/math_functions.h",
+        "cuda/include/crt/math_functions.hpp",
+        "cuda/include/crt/mma.h",
+        "cuda/include/crt/mma.hpp",
+        "cuda/include/crt/nvfunctional",
+        "cuda/include/crt/sm_70_rt.h",
+        "cuda/include/crt/sm_70_rt.hpp",
+        "cuda/include/crt/storage_class.h",
+        "cuda/include/cuComplex.h",
+        "cuda/include/cublas.h",
+        "cuda/include/cublasXt.h",
+        "cuda/include/cublas_api.h",
+        "cuda/include/cublas_v2.h",
+        "cuda/include/cuda.h",
+        "cuda/include/cudaEGL.h",
+        "cuda/include/cudaGL.h",
+        "cuda/include/cudaProfiler.h",
+        "cuda/include/cudaVDPAU.h",
+        "cuda/include/cuda_device_runtime_api.h",
+        "cuda/include/cuda_egl_interop.h",
+        "cuda/include/cuda_fp16.h",
+        "cuda/include/cuda_fp16.hpp",
+        "cuda/include/cuda_gl_interop.h",
+        "cuda/include/cuda_occupancy.h",
+        "cuda/include/cuda_profiler_api.h",
+        "cuda/include/cuda_runtime.h",
+        "cuda/include/cuda_runtime_api.h",
+        "cuda/include/cuda_surface_types.h",
+        "cuda/include/cuda_texture_types.h",
+        "cuda/include/cuda_vdpau_interop.h",
+        "cuda/include/cudalibxt.h",
+        "cuda/include/cudart_platform.h",
+        "cuda/include/cufft.h",
+        "cuda/include/cufftXt.h",
+        "cuda/include/cufftw.h",
+        "cuda/include/curand.h",
+        "cuda/include/curand_discrete.h",
+        "cuda/include/curand_discrete2.h",
+        "cuda/include/curand_globals.h",
+        "cuda/include/curand_kernel.h",
+        "cuda/include/curand_lognormal.h",
+        "cuda/include/curand_mrg32k3a.h",
+        "cuda/include/curand_mtgp32.h",
+        "cuda/include/curand_mtgp32_host.h",
+        "cuda/include/curand_mtgp32_kernel.h",
+        "cuda/include/curand_mtgp32dc_p_11213.h",
+        "cuda/include/curand_normal.h",
+        "cuda/include/curand_normal_static.h",
+        "cuda/include/curand_philox4x32_x.h",
+        "cuda/include/curand_poisson.h",
+        "cuda/include/curand_precalc.h",
+        "cuda/include/curand_uniform.h",
+        "cuda/include/cusolverDn.h",
+        "cuda/include/cusolverRf.h",
+        "cuda/include/cusolverSp.h",
+        "cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h",
+        "cuda/include/cusolver_common.h",
+        "cuda/include/cusparse.h",
+        "cuda/include/cusparse_v2.h",
+        "cuda/include/device_atomic_functions.h",
+        "cuda/include/device_atomic_functions.hpp",
+        "cuda/include/device_double_functions.h",
+        "cuda/include/device_functions.h",
+        "cuda/include/device_launch_parameters.h",
+        "cuda/include/device_types.h",
+        "cuda/include/driver_functions.h",
+        "cuda/include/driver_types.h",
+        "cuda/include/fatBinaryCtl.h",
+        "cuda/include/fatbinary.h",
+        "cuda/include/host_config.h",
+        "cuda/include/host_defines.h",
+        "cuda/include/library_types.h",
+        "cuda/include/math_constants.h",
+        "cuda/include/math_functions.h",
+        "cuda/include/mma.h",
+        "cuda/include/npp.h",
+        "cuda/include/nppcore.h",
+        "cuda/include/nppdefs.h",
+        "cuda/include/nppi.h",
+        "cuda/include/nppi_arithmetic_and_logical_operations.h",
+        "cuda/include/nppi_color_conversion.h",
+        "cuda/include/nppi_compression_functions.h",
+        "cuda/include/nppi_computer_vision.h",
+        "cuda/include/nppi_data_exchange_and_initialization.h",
+        "cuda/include/nppi_filtering_functions.h",
+        "cuda/include/nppi_geometry_transforms.h",
+        "cuda/include/nppi_linear_transforms.h",
+        "cuda/include/nppi_morphological_operations.h",
+        "cuda/include/nppi_statistics_functions.h",
+        "cuda/include/nppi_support_functions.h",
+        "cuda/include/nppi_threshold_and_compare_operations.h",
+        "cuda/include/npps.h",
+        "cuda/include/npps_arithmetic_and_logical_operations.h",
+        "cuda/include/npps_conversion_functions.h",
+        "cuda/include/npps_filtering_functions.h",
+        "cuda/include/npps_initialization.h",
+        "cuda/include/npps_statistics_functions.h",
+        "cuda/include/npps_support_functions.h",
+        "cuda/include/nppversion.h",
+        "cuda/include/nvToolsExt.h",
+        "cuda/include/nvToolsExtCuda.h",
+        "cuda/include/nvToolsExtCudaRt.h",
+        "cuda/include/nvToolsExtMeta.h",
+        "cuda/include/nvToolsExtSync.h",
+        "cuda/include/nvblas.h",
+        "cuda/include/nvfunctional",
+        "cuda/include/nvgraph.h",
+        "cuda/include/nvjpeg.h",
+        "cuda/include/nvml.h",
+        "cuda/include/nvrtc.h",
+        "cuda/include/nvtx3/nvToolsExt.h",
+        "cuda/include/nvtx3/nvToolsExtCuda.h",
+        "cuda/include/nvtx3/nvToolsExtCudaRt.h",
+        "cuda/include/nvtx3/nvToolsExtOpenCL.h",
+        "cuda/include/nvtx3/nvToolsExtSync.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImpl.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxInit.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxTypes.h",
+        "cuda/include/sm_20_atomic_functions.h",
+        "cuda/include/sm_20_atomic_functions.hpp",
+        "cuda/include/sm_20_intrinsics.h",
+        "cuda/include/sm_20_intrinsics.hpp",
+        "cuda/include/sm_30_intrinsics.h",
+        "cuda/include/sm_30_intrinsics.hpp",
+        "cuda/include/sm_32_atomic_functions.h",
+        "cuda/include/sm_32_atomic_functions.hpp",
+        "cuda/include/sm_32_intrinsics.h",
+        "cuda/include/sm_32_intrinsics.hpp",
+        "cuda/include/sm_35_atomic_functions.h",
+        "cuda/include/sm_35_intrinsics.h",
+        "cuda/include/sm_60_atomic_functions.h",
+        "cuda/include/sm_60_atomic_functions.hpp",
+        "cuda/include/sm_61_intrinsics.h",
+        "cuda/include/sm_61_intrinsics.hpp",
+        "cuda/include/sobol_direction_vectors.h",
+        "cuda/include/surface_functions.h",
+        "cuda/include/surface_functions.hpp",
+        "cuda/include/surface_indirect_functions.h",
+        "cuda/include/surface_indirect_functions.hpp",
+        "cuda/include/surface_types.h",
+        "cuda/include/texture_fetch_functions.h",
+        "cuda/include/texture_fetch_functions.hpp",
+        "cuda/include/texture_indirect_functions.h",
+        "cuda/include/texture_indirect_functions.hpp",
+        "cuda/include/texture_types.h",
+        "cuda/include/thrust/adjacent_difference.h",
+        "cuda/include/thrust/advance.h",
+        "cuda/include/thrust/binary_search.h",
+        "cuda/include/thrust/complex.h",
+        "cuda/include/thrust/copy.h",
+        "cuda/include/thrust/count.h",
+        "cuda/include/thrust/detail/adjacent_difference.inl",
+        "cuda/include/thrust/detail/advance.inl",
+        "cuda/include/thrust/detail/alignment.h",
+        "cuda/include/thrust/detail/allocator/allocator_traits.h",
+        "cuda/include/thrust/detail/allocator/allocator_traits.inl",
+        "cuda/include/thrust/detail/allocator/copy_construct_range.h",
+        "cuda/include/thrust/detail/allocator/copy_construct_range.inl",
+        "cuda/include/thrust/detail/allocator/default_construct_range.h",
+        "cuda/include/thrust/detail/allocator/default_construct_range.inl",
+        "cuda/include/thrust/detail/allocator/destroy_range.h",
+        "cuda/include/thrust/detail/allocator/destroy_range.inl",
+        "cuda/include/thrust/detail/allocator/fill_construct_range.h",
+        "cuda/include/thrust/detail/allocator/fill_construct_range.inl",
+        "cuda/include/thrust/detail/allocator/malloc_allocator.h",
+        "cuda/include/thrust/detail/allocator/malloc_allocator.inl",
+        "cuda/include/thrust/detail/allocator/no_throw_allocator.h",
+        "cuda/include/thrust/detail/allocator/tagged_allocator.h",
+        "cuda/include/thrust/detail/allocator/tagged_allocator.inl",
+        "cuda/include/thrust/detail/allocator/temporary_allocator.h",
+        "cuda/include/thrust/detail/allocator/temporary_allocator.inl",
+        "cuda/include/thrust/detail/binary_search.inl",
+        "cuda/include/thrust/detail/complex/arithmetic.h",
+        "cuda/include/thrust/detail/complex/c99math.h",
+        "cuda/include/thrust/detail/complex/catrig.h",
+        "cuda/include/thrust/detail/complex/catrigf.h",
+        "cuda/include/thrust/detail/complex/ccosh.h",
+        "cuda/include/thrust/detail/complex/ccoshf.h",
+        "cuda/include/thrust/detail/complex/cexp.h",
+        "cuda/include/thrust/detail/complex/cexpf.h",
+        "cuda/include/thrust/detail/complex/clog.h",
+        "cuda/include/thrust/detail/complex/clogf.h",
+        "cuda/include/thrust/detail/complex/complex.inl",
+        "cuda/include/thrust/detail/complex/cpow.h",
+        "cuda/include/thrust/detail/complex/cproj.h",
+        "cuda/include/thrust/detail/complex/csinh.h",
+        "cuda/include/thrust/detail/complex/csinhf.h",
+        "cuda/include/thrust/detail/complex/csqrt.h",
+        "cuda/include/thrust/detail/complex/csqrtf.h",
+        "cuda/include/thrust/detail/complex/ctanh.h",
+        "cuda/include/thrust/detail/complex/ctanhf.h",
+        "cuda/include/thrust/detail/complex/math_private.h",
+        "cuda/include/thrust/detail/complex/stream.h",
+        "cuda/include/thrust/detail/config.h",
+        "cuda/include/thrust/detail/config/compiler.h",
+        "cuda/include/thrust/detail/config/compiler_fence.h",
+        "cuda/include/thrust/detail/config/config.h",
+        "cuda/include/thrust/detail/config/debug.h",
+        "cuda/include/thrust/detail/config/device_system.h",
+        "cuda/include/thrust/detail/config/exec_check_disable.h",
+        "cuda/include/thrust/detail/config/forceinline.h",
+        "cuda/include/thrust/detail/config/global_workarounds.h",
+        "cuda/include/thrust/detail/config/host_device.h",
+        "cuda/include/thrust/detail/config/host_system.h",
+        "cuda/include/thrust/detail/config/simple_defines.h",
+        "cuda/include/thrust/detail/contiguous_storage.h",
+        "cuda/include/thrust/detail/contiguous_storage.inl",
+        "cuda/include/thrust/detail/copy.h",
+        "cuda/include/thrust/detail/copy.inl",
+        "cuda/include/thrust/detail/copy_if.h",
+        "cuda/include/thrust/detail/copy_if.inl",
+        "cuda/include/thrust/detail/count.inl",
+        "cuda/include/thrust/detail/cstdint.h",
+        "cuda/include/thrust/detail/device_delete.inl",
+        "cuda/include/thrust/detail/device_free.inl",
+        "cuda/include/thrust/detail/device_malloc.inl",
+        "cuda/include/thrust/detail/device_new.inl",
+        "cuda/include/thrust/detail/device_ptr.inl",
+        "cuda/include/thrust/detail/device_reference.inl",
+        "cuda/include/thrust/detail/device_vector.inl",
+        "cuda/include/thrust/detail/dispatch/is_trivial_copy.h",
+        "cuda/include/thrust/detail/distance.inl",
+        "cuda/include/thrust/detail/equal.inl",
+        "cuda/include/thrust/detail/execute_with_allocator.h",
+        "cuda/include/thrust/detail/execution_policy.h",
+        "cuda/include/thrust/detail/extrema.inl",
+        "cuda/include/thrust/detail/fill.inl",
+        "cuda/include/thrust/detail/find.inl",
+        "cuda/include/thrust/detail/for_each.inl",
+        "cuda/include/thrust/detail/function.h",
+        "cuda/include/thrust/detail/functional.inl",
+        "cuda/include/thrust/detail/functional/actor.h",
+        "cuda/include/thrust/detail/functional/actor.inl",
+        "cuda/include/thrust/detail/functional/argument.h",
+        "cuda/include/thrust/detail/functional/composite.h",
+        "cuda/include/thrust/detail/functional/operators.h",
+        "cuda/include/thrust/detail/functional/operators/arithmetic_operators.h",
+        "cuda/include/thrust/detail/functional/operators/assignment_operator.h",
+        "cuda/include/thrust/detail/functional/operators/bitwise_operators.h",
+        "cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h",
+        "cuda/include/thrust/detail/functional/operators/logical_operators.h",
+        "cuda/include/thrust/detail/functional/operators/operator_adaptors.h",
+        "cuda/include/thrust/detail/functional/operators/relational_operators.h",
+        "cuda/include/thrust/detail/functional/placeholder.h",
+        "cuda/include/thrust/detail/functional/value.h",
+        "cuda/include/thrust/detail/gather.inl",
+        "cuda/include/thrust/detail/generate.inl",
+        "cuda/include/thrust/detail/get_iterator_value.h",
+        "cuda/include/thrust/detail/host_vector.inl",
+        "cuda/include/thrust/detail/inner_product.inl",
+        "cuda/include/thrust/detail/integer_math.h",
+        "cuda/include/thrust/detail/integer_traits.h",
+        "cuda/include/thrust/detail/internal_functional.h",
+        "cuda/include/thrust/detail/logical.inl",
+        "cuda/include/thrust/detail/malloc_and_free.h",
+        "cuda/include/thrust/detail/merge.inl",
+        "cuda/include/thrust/detail/minmax.h",
+        "cuda/include/thrust/detail/mismatch.inl",
+        "cuda/include/thrust/detail/mpl/math.h",
+        "cuda/include/thrust/detail/numeric_traits.h",
+        "cuda/include/thrust/detail/overlapped_copy.h",
+        "cuda/include/thrust/detail/pair.inl",
+        "cuda/include/thrust/detail/partition.inl",
+        "cuda/include/thrust/detail/pointer.h",
+        "cuda/include/thrust/detail/pointer.inl",
+        "cuda/include/thrust/detail/preprocessor.h",
+        "cuda/include/thrust/detail/range/head_flags.h",
+        "cuda/include/thrust/detail/range/tail_flags.h",
+        "cuda/include/thrust/detail/raw_pointer_cast.h",
+        "cuda/include/thrust/detail/raw_reference_cast.h",
+        "cuda/include/thrust/detail/reduce.inl",
+        "cuda/include/thrust/detail/reference.h",
+        "cuda/include/thrust/detail/reference.inl",
+        "cuda/include/thrust/detail/reference_forward_declaration.h",
+        "cuda/include/thrust/detail/remove.inl",
+        "cuda/include/thrust/detail/replace.inl",
+        "cuda/include/thrust/detail/reverse.inl",
+        "cuda/include/thrust/detail/scan.inl",
+        "cuda/include/thrust/detail/scatter.inl",
+        "cuda/include/thrust/detail/seq.h",
+        "cuda/include/thrust/detail/sequence.inl",
+        "cuda/include/thrust/detail/set_operations.inl",
+        "cuda/include/thrust/detail/sort.inl",
+        "cuda/include/thrust/detail/static_assert.h",
+        "cuda/include/thrust/detail/static_map.h",
+        "cuda/include/thrust/detail/swap.h",
+        "cuda/include/thrust/detail/swap.inl",
+        "cuda/include/thrust/detail/swap_ranges.inl",
+        "cuda/include/thrust/detail/tabulate.inl",
+        "cuda/include/thrust/detail/temporary_array.h",
+        "cuda/include/thrust/detail/temporary_array.inl",
+        "cuda/include/thrust/detail/temporary_buffer.h",
+        "cuda/include/thrust/detail/transform.inl",
+        "cuda/include/thrust/detail/transform_reduce.inl",
+        "cuda/include/thrust/detail/transform_scan.inl",
+        "cuda/include/thrust/detail/trivial_sequence.h",
+        "cuda/include/thrust/detail/tuple.inl",
+        "cuda/include/thrust/detail/tuple_meta_transform.h",
+        "cuda/include/thrust/detail/tuple_transform.h",
+        "cuda/include/thrust/detail/type_traits.h",
+        "cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h",
+        "cuda/include/thrust/detail/type_traits/function_traits.h",
+        "cuda/include/thrust/detail/type_traits/has_member_function.h",
+        "cuda/include/thrust/detail/type_traits/has_nested_type.h",
+        "cuda/include/thrust/detail/type_traits/has_trivial_assign.h",
+        "cuda/include/thrust/detail/type_traits/is_call_possible.h",
+        "cuda/include/thrust/detail/type_traits/is_metafunction_defined.h",
+        "cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h",
+        "cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h",
+        "cuda/include/thrust/detail/type_traits/minimum_type.h",
+        "cuda/include/thrust/detail/type_traits/pointer_traits.h",
+        "cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h",
+        "cuda/include/thrust/detail/uninitialized_copy.inl",
+        "cuda/include/thrust/detail/uninitialized_fill.inl",
+        "cuda/include/thrust/detail/unique.inl",
+        "cuda/include/thrust/detail/use_default.h",
+        "cuda/include/thrust/detail/util/align.h",
+        "cuda/include/thrust/detail/util/blocking.h",
+        "cuda/include/thrust/detail/vector_base.h",
+        "cuda/include/thrust/detail/vector_base.inl",
+        "cuda/include/thrust/device_allocator.h",
+        "cuda/include/thrust/device_delete.h",
+        "cuda/include/thrust/device_free.h",
+        "cuda/include/thrust/device_malloc.h",
+        "cuda/include/thrust/device_malloc_allocator.h",
+        "cuda/include/thrust/device_new.h",
+        "cuda/include/thrust/device_new_allocator.h",
+        "cuda/include/thrust/device_ptr.h",
+        "cuda/include/thrust/device_reference.h",
+        "cuda/include/thrust/device_vector.h",
+        "cuda/include/thrust/distance.h",
+        "cuda/include/thrust/equal.h",
+        "cuda/include/thrust/execution_policy.h",
+        "cuda/include/thrust/extrema.h",
+        "cuda/include/thrust/fill.h",
+        "cuda/include/thrust/find.h",
+        "cuda/include/thrust/for_each.h",
+        "cuda/include/thrust/functional.h",
+        "cuda/include/thrust/gather.h",
+        "cuda/include/thrust/generate.h",
+        "cuda/include/thrust/host_vector.h",
+        "cuda/include/thrust/inner_product.h",
+        "cuda/include/thrust/iterator/constant_iterator.h",
+        "cuda/include/thrust/iterator/counting_iterator.h",
+        "cuda/include/thrust/iterator/detail/any_assign.h",
+        "cuda/include/thrust/iterator/detail/any_system_tag.h",
+        "cuda/include/thrust/iterator/detail/constant_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/counting_iterator.inl",
+        "cuda/include/thrust/iterator/detail/device_system_tag.h",
+        "cuda/include/thrust/iterator/detail/discard_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/distance_from_result.h",
+        "cuda/include/thrust/iterator/detail/host_system_tag.h",
+        "cuda/include/thrust/iterator/detail/is_iterator_category.h",
+        "cuda/include/thrust/iterator/detail/is_trivial_iterator.h",
+        "cuda/include/thrust/iterator/detail/iterator_adaptor_base.h",
+        "cuda/include/thrust/iterator/detail/iterator_category_to_system.h",
+        "cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h",
+        "cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h",
+        "cuda/include/thrust/iterator/detail/iterator_facade_category.h",
+        "cuda/include/thrust/iterator/detail/iterator_traits.inl",
+        "cuda/include/thrust/iterator/detail/iterator_traversal_tags.h",
+        "cuda/include/thrust/iterator/detail/join_iterator.h",
+        "cuda/include/thrust/iterator/detail/minimum_category.h",
+        "cuda/include/thrust/iterator/detail/minimum_system.h",
+        "cuda/include/thrust/iterator/detail/normal_iterator.h",
+        "cuda/include/thrust/iterator/detail/permutation_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/retag.h",
+        "cuda/include/thrust/iterator/detail/reverse_iterator.inl",
+        "cuda/include/thrust/iterator/detail/reverse_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/tagged_iterator.h",
+        "cuda/include/thrust/iterator/detail/transform_iterator.inl",
+        "cuda/include/thrust/iterator/detail/transform_output_iterator.inl",
+        "cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h",
+        "cuda/include/thrust/iterator/detail/universal_categories.h",
+        "cuda/include/thrust/iterator/detail/zip_iterator.inl",
+        "cuda/include/thrust/iterator/detail/zip_iterator_base.h",
+        "cuda/include/thrust/iterator/discard_iterator.h",
+        "cuda/include/thrust/iterator/iterator_adaptor.h",
+        "cuda/include/thrust/iterator/iterator_categories.h",
+        "cuda/include/thrust/iterator/iterator_facade.h",
+        "cuda/include/thrust/iterator/iterator_traits.h",
+        "cuda/include/thrust/iterator/permutation_iterator.h",
+        "cuda/include/thrust/iterator/retag.h",
+        "cuda/include/thrust/iterator/reverse_iterator.h",
+        "cuda/include/thrust/iterator/transform_iterator.h",
+        "cuda/include/thrust/iterator/transform_output_iterator.h",
+        "cuda/include/thrust/iterator/zip_iterator.h",
+        "cuda/include/thrust/logical.h",
+        "cuda/include/thrust/memory.h",
+        "cuda/include/thrust/merge.h",
+        "cuda/include/thrust/mismatch.h",
+        "cuda/include/thrust/pair.h",
+        "cuda/include/thrust/partition.h",
+        "cuda/include/thrust/random.h",
+        "cuda/include/thrust/random/detail/discard_block_engine.inl",
+        "cuda/include/thrust/random/detail/linear_congruential_engine.inl",
+        "cuda/include/thrust/random/detail/linear_congruential_engine_discard.h",
+        "cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl",
+        "cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h",
+        "cuda/include/thrust/random/detail/mod.h",
+        "cuda/include/thrust/random/detail/normal_distribution.inl",
+        "cuda/include/thrust/random/detail/normal_distribution_base.h",
+        "cuda/include/thrust/random/detail/random_core_access.h",
+        "cuda/include/thrust/random/detail/subtract_with_carry_engine.inl",
+        "cuda/include/thrust/random/detail/uniform_int_distribution.inl",
+        "cuda/include/thrust/random/detail/uniform_real_distribution.inl",
+        "cuda/include/thrust/random/detail/xor_combine_engine.inl",
+        "cuda/include/thrust/random/detail/xor_combine_engine_max.h",
+        "cuda/include/thrust/random/discard_block_engine.h",
+        "cuda/include/thrust/random/linear_congruential_engine.h",
+        "cuda/include/thrust/random/linear_feedback_shift_engine.h",
+        "cuda/include/thrust/random/normal_distribution.h",
+        "cuda/include/thrust/random/subtract_with_carry_engine.h",
+        "cuda/include/thrust/random/uniform_int_distribution.h",
+        "cuda/include/thrust/random/uniform_real_distribution.h",
+        "cuda/include/thrust/random/xor_combine_engine.h",
+        "cuda/include/thrust/reduce.h",
+        "cuda/include/thrust/remove.h",
+        "cuda/include/thrust/replace.h",
+        "cuda/include/thrust/reverse.h",
+        "cuda/include/thrust/scan.h",
+        "cuda/include/thrust/scatter.h",
+        "cuda/include/thrust/sequence.h",
+        "cuda/include/thrust/set_operations.h",
+        "cuda/include/thrust/sort.h",
+        "cuda/include/thrust/swap.h",
+        "cuda/include/thrust/system/cpp/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/cpp/detail/assign_value.h",
+        "cuda/include/thrust/system/cpp/detail/binary_search.h",
+        "cuda/include/thrust/system/cpp/detail/copy.h",
+        "cuda/include/thrust/system/cpp/detail/copy_if.h",
+        "cuda/include/thrust/system/cpp/detail/count.h",
+        "cuda/include/thrust/system/cpp/detail/equal.h",
+        "cuda/include/thrust/system/cpp/detail/execution_policy.h",
+        "cuda/include/thrust/system/cpp/detail/extrema.h",
+        "cuda/include/thrust/system/cpp/detail/fill.h",
+        "cuda/include/thrust/system/cpp/detail/find.h",
+        "cuda/include/thrust/system/cpp/detail/for_each.h",
+        "cuda/include/thrust/system/cpp/detail/gather.h",
+        "cuda/include/thrust/system/cpp/detail/generate.h",
+        "cuda/include/thrust/system/cpp/detail/get_value.h",
+        "cuda/include/thrust/system/cpp/detail/inner_product.h",
+        "cuda/include/thrust/system/cpp/detail/iter_swap.h",
+        "cuda/include/thrust/system/cpp/detail/logical.h",
+        "cuda/include/thrust/system/cpp/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/cpp/detail/memory.inl",
+        "cuda/include/thrust/system/cpp/detail/merge.h",
+        "cuda/include/thrust/system/cpp/detail/mismatch.h",
+        "cuda/include/thrust/system/cpp/detail/par.h",
+        "cuda/include/thrust/system/cpp/detail/partition.h",
+        "cuda/include/thrust/system/cpp/detail/reduce.h",
+        "cuda/include/thrust/system/cpp/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/cpp/detail/remove.h",
+        "cuda/include/thrust/system/cpp/detail/replace.h",
+        "cuda/include/thrust/system/cpp/detail/reverse.h",
+        "cuda/include/thrust/system/cpp/detail/scan.h",
+        "cuda/include/thrust/system/cpp/detail/scan_by_key.h",
+        "cuda/include/thrust/system/cpp/detail/scatter.h",
+        "cuda/include/thrust/system/cpp/detail/sequence.h",
+        "cuda/include/thrust/system/cpp/detail/set_operations.h",
+        "cuda/include/thrust/system/cpp/detail/sort.h",
+        "cuda/include/thrust/system/cpp/detail/swap_ranges.h",
+        "cuda/include/thrust/system/cpp/detail/tabulate.h",
+        "cuda/include/thrust/system/cpp/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/cpp/detail/transform.h",
+        "cuda/include/thrust/system/cpp/detail/transform_reduce.h",
+        "cuda/include/thrust/system/cpp/detail/transform_scan.h",
+        "cuda/include/thrust/system/cpp/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/cpp/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/cpp/detail/unique.h",
+        "cuda/include/thrust/system/cpp/detail/unique_by_key.h",
+        "cuda/include/thrust/system/cpp/detail/vector.inl",
+        "cuda/include/thrust/system/cpp/execution_policy.h",
+        "cuda/include/thrust/system/cpp/memory.h",
+        "cuda/include/thrust/system/cpp/vector.h",
+        "cuda/include/thrust/system/cuda/config.h",
+        "cuda/include/thrust/system/cuda/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/cuda/detail/assign_value.h",
+        "cuda/include/thrust/system/cuda/detail/binary_search.h",
+        "cuda/include/thrust/system/cuda/detail/copy.h",
+        "cuda/include/thrust/system/cuda/detail/copy_if.h",
+        "cuda/include/thrust/system/cuda/detail/core/agent_launcher.h",
+        "cuda/include/thrust/system/cuda/detail/core/alignment.h",
+        "cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h",
+        "cuda/include/thrust/system/cuda/detail/core/util.h",
+        "cuda/include/thrust/system/cuda/detail/count.h",
+        "cuda/include/thrust/system/cuda/detail/cross_system.h",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/cub.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_device.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_type.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/equal.h",
+        "cuda/include/thrust/system/cuda/detail/error.inl",
+        "cuda/include/thrust/system/cuda/detail/execution_policy.h",
+        "cuda/include/thrust/system/cuda/detail/extrema.h",
+        "cuda/include/thrust/system/cuda/detail/fill.h",
+        "cuda/include/thrust/system/cuda/detail/find.h",
+        "cuda/include/thrust/system/cuda/detail/for_each.h",
+        "cuda/include/thrust/system/cuda/detail/gather.h",
+        "cuda/include/thrust/system/cuda/detail/generate.h",
+        "cuda/include/thrust/system/cuda/detail/get_value.h",
+        "cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h",
+        "cuda/include/thrust/system/cuda/detail/guarded_driver_types.h",
+        "cuda/include/thrust/system/cuda/detail/inner_product.h",
+        "cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h",
+        "cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h",
+        "cuda/include/thrust/system/cuda/detail/iter_swap.h",
+        "cuda/include/thrust/system/cuda/detail/logical.h",
+        "cuda/include/thrust/system/cuda/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/cuda/detail/memory.inl",
+        "cuda/include/thrust/system/cuda/detail/merge.h",
+        "cuda/include/thrust/system/cuda/detail/mismatch.h",
+        "cuda/include/thrust/system/cuda/detail/par.h",
+        "cuda/include/thrust/system/cuda/detail/par_to_seq.h",
+        "cuda/include/thrust/system/cuda/detail/parallel_for.h",
+        "cuda/include/thrust/system/cuda/detail/partition.h",
+        "cuda/include/thrust/system/cuda/detail/reduce.h",
+        "cuda/include/thrust/system/cuda/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/cuda/detail/remove.h",
+        "cuda/include/thrust/system/cuda/detail/replace.h",
+        "cuda/include/thrust/system/cuda/detail/reverse.h",
+        "cuda/include/thrust/system/cuda/detail/scan.h",
+        "cuda/include/thrust/system/cuda/detail/scan_by_key.h",
+        "cuda/include/thrust/system/cuda/detail/scatter.h",
+        "cuda/include/thrust/system/cuda/detail/sequence.h",
+        "cuda/include/thrust/system/cuda/detail/set_operations.h",
+        "cuda/include/thrust/system/cuda/detail/sort.h",
+        "cuda/include/thrust/system/cuda/detail/swap_ranges.h",
+        "cuda/include/thrust/system/cuda/detail/tabulate.h",
+        "cuda/include/thrust/system/cuda/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/cuda/detail/terminate.h",
+        "cuda/include/thrust/system/cuda/detail/transform.h",
+        "cuda/include/thrust/system/cuda/detail/transform_reduce.h",
+        "cuda/include/thrust/system/cuda/detail/transform_scan.h",
+        "cuda/include/thrust/system/cuda/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/cuda/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/cuda/detail/unique.h",
+        "cuda/include/thrust/system/cuda/detail/unique_by_key.h",
+        "cuda/include/thrust/system/cuda/detail/util.h",
+        "cuda/include/thrust/system/cuda/detail/vector.inl",
+        "cuda/include/thrust/system/cuda/error.h",
+        "cuda/include/thrust/system/cuda/execution_policy.h",
+        "cuda/include/thrust/system/cuda/experimental/pinned_allocator.h",
+        "cuda/include/thrust/system/cuda/memory.h",
+        "cuda/include/thrust/system/cuda/vector.h",
+        "cuda/include/thrust/system/detail/adl/adjacent_difference.h",
+        "cuda/include/thrust/system/detail/adl/assign_value.h",
+        "cuda/include/thrust/system/detail/adl/binary_search.h",
+        "cuda/include/thrust/system/detail/adl/copy.h",
+        "cuda/include/thrust/system/detail/adl/copy_if.h",
+        "cuda/include/thrust/system/detail/adl/count.h",
+        "cuda/include/thrust/system/detail/adl/equal.h",
+        "cuda/include/thrust/system/detail/adl/extrema.h",
+        "cuda/include/thrust/system/detail/adl/fill.h",
+        "cuda/include/thrust/system/detail/adl/find.h",
+        "cuda/include/thrust/system/detail/adl/for_each.h",
+        "cuda/include/thrust/system/detail/adl/gather.h",
+        "cuda/include/thrust/system/detail/adl/generate.h",
+        "cuda/include/thrust/system/detail/adl/get_value.h",
+        "cuda/include/thrust/system/detail/adl/inner_product.h",
+        "cuda/include/thrust/system/detail/adl/iter_swap.h",
+        "cuda/include/thrust/system/detail/adl/logical.h",
+        "cuda/include/thrust/system/detail/adl/malloc_and_free.h",
+        "cuda/include/thrust/system/detail/adl/merge.h",
+        "cuda/include/thrust/system/detail/adl/mismatch.h",
+        "cuda/include/thrust/system/detail/adl/partition.h",
+        "cuda/include/thrust/system/detail/adl/reduce.h",
+        "cuda/include/thrust/system/detail/adl/reduce_by_key.h",
+        "cuda/include/thrust/system/detail/adl/remove.h",
+        "cuda/include/thrust/system/detail/adl/replace.h",
+        "cuda/include/thrust/system/detail/adl/reverse.h",
+        "cuda/include/thrust/system/detail/adl/scan.h",
+        "cuda/include/thrust/system/detail/adl/scan_by_key.h",
+        "cuda/include/thrust/system/detail/adl/scatter.h",
+        "cuda/include/thrust/system/detail/adl/sequence.h",
+        "cuda/include/thrust/system/detail/adl/set_operations.h",
+        "cuda/include/thrust/system/detail/adl/sort.h",
+        "cuda/include/thrust/system/detail/adl/swap_ranges.h",
+        "cuda/include/thrust/system/detail/adl/tabulate.h",
+        "cuda/include/thrust/system/detail/adl/temporary_buffer.h",
+        "cuda/include/thrust/system/detail/adl/transform.h",
+        "cuda/include/thrust/system/detail/adl/transform_reduce.h",
+        "cuda/include/thrust/system/detail/adl/transform_scan.h",
+        "cuda/include/thrust/system/detail/adl/uninitialized_copy.h",
+        "cuda/include/thrust/system/detail/adl/uninitialized_fill.h",
+        "cuda/include/thrust/system/detail/adl/unique.h",
+        "cuda/include/thrust/system/detail/adl/unique_by_key.h",
+        "cuda/include/thrust/system/detail/bad_alloc.h",
+        "cuda/include/thrust/system/detail/errno.h",
+        "cuda/include/thrust/system/detail/error_category.inl",
+        "cuda/include/thrust/system/detail/error_code.inl",
+        "cuda/include/thrust/system/detail/error_condition.inl",
+        "cuda/include/thrust/system/detail/generic/adjacent_difference.h",
+        "cuda/include/thrust/system/detail/generic/adjacent_difference.inl",
+        "cuda/include/thrust/system/detail/generic/advance.h",
+        "cuda/include/thrust/system/detail/generic/advance.inl",
+        "cuda/include/thrust/system/detail/generic/binary_search.h",
+        "cuda/include/thrust/system/detail/generic/binary_search.inl",
+        "cuda/include/thrust/system/detail/generic/copy.h",
+        "cuda/include/thrust/system/detail/generic/copy.inl",
+        "cuda/include/thrust/system/detail/generic/copy_if.h",
+        "cuda/include/thrust/system/detail/generic/copy_if.inl",
+        "cuda/include/thrust/system/detail/generic/count.h",
+        "cuda/include/thrust/system/detail/generic/count.inl",
+        "cuda/include/thrust/system/detail/generic/distance.h",
+        "cuda/include/thrust/system/detail/generic/distance.inl",
+        "cuda/include/thrust/system/detail/generic/equal.h",
+        "cuda/include/thrust/system/detail/generic/equal.inl",
+        "cuda/include/thrust/system/detail/generic/extrema.h",
+        "cuda/include/thrust/system/detail/generic/extrema.inl",
+        "cuda/include/thrust/system/detail/generic/fill.h",
+        "cuda/include/thrust/system/detail/generic/find.h",
+        "cuda/include/thrust/system/detail/generic/find.inl",
+        "cuda/include/thrust/system/detail/generic/for_each.h",
+        "cuda/include/thrust/system/detail/generic/gather.h",
+        "cuda/include/thrust/system/detail/generic/gather.inl",
+        "cuda/include/thrust/system/detail/generic/generate.h",
+        "cuda/include/thrust/system/detail/generic/generate.inl",
+        "cuda/include/thrust/system/detail/generic/inner_product.h",
+        "cuda/include/thrust/system/detail/generic/inner_product.inl",
+        "cuda/include/thrust/system/detail/generic/logical.h",
+        "cuda/include/thrust/system/detail/generic/memory.h",
+        "cuda/include/thrust/system/detail/generic/memory.inl",
+        "cuda/include/thrust/system/detail/generic/merge.h",
+        "cuda/include/thrust/system/detail/generic/merge.inl",
+        "cuda/include/thrust/system/detail/generic/mismatch.h",
+        "cuda/include/thrust/system/detail/generic/mismatch.inl",
+        "cuda/include/thrust/system/detail/generic/partition.h",
+        "cuda/include/thrust/system/detail/generic/partition.inl",
+        "cuda/include/thrust/system/detail/generic/reduce.h",
+        "cuda/include/thrust/system/detail/generic/reduce.inl",
+        "cuda/include/thrust/system/detail/generic/reduce_by_key.h",
+        "cuda/include/thrust/system/detail/generic/reduce_by_key.inl",
+        "cuda/include/thrust/system/detail/generic/remove.h",
+        "cuda/include/thrust/system/detail/generic/remove.inl",
+        "cuda/include/thrust/system/detail/generic/replace.h",
+        "cuda/include/thrust/system/detail/generic/replace.inl",
+        "cuda/include/thrust/system/detail/generic/reverse.h",
+        "cuda/include/thrust/system/detail/generic/reverse.inl",
+        "cuda/include/thrust/system/detail/generic/scalar/binary_search.h",
+        "cuda/include/thrust/system/detail/generic/scalar/binary_search.inl",
+        "cuda/include/thrust/system/detail/generic/scan.h",
+        "cuda/include/thrust/system/detail/generic/scan.inl",
+        "cuda/include/thrust/system/detail/generic/scan_by_key.h",
+        "cuda/include/thrust/system/detail/generic/scan_by_key.inl",
+        "cuda/include/thrust/system/detail/generic/scatter.h",
+        "cuda/include/thrust/system/detail/generic/scatter.inl",
+        "cuda/include/thrust/system/detail/generic/select_system.h",
+        "cuda/include/thrust/system/detail/generic/sequence.h",
+        "cuda/include/thrust/system/detail/generic/sequence.inl",
+        "cuda/include/thrust/system/detail/generic/set_operations.h",
+        "cuda/include/thrust/system/detail/generic/set_operations.inl",
+        "cuda/include/thrust/system/detail/generic/sort.h",
+        "cuda/include/thrust/system/detail/generic/sort.inl",
+        "cuda/include/thrust/system/detail/generic/swap_ranges.h",
+        "cuda/include/thrust/system/detail/generic/swap_ranges.inl",
+        "cuda/include/thrust/system/detail/generic/tabulate.h",
+        "cuda/include/thrust/system/detail/generic/tabulate.inl",
+        "cuda/include/thrust/system/detail/generic/tag.h",
+        "cuda/include/thrust/system/detail/generic/temporary_buffer.h",
+        "cuda/include/thrust/system/detail/generic/temporary_buffer.inl",
+        "cuda/include/thrust/system/detail/generic/transform.h",
+        "cuda/include/thrust/system/detail/generic/transform.inl",
+        "cuda/include/thrust/system/detail/generic/transform_reduce.h",
+        "cuda/include/thrust/system/detail/generic/transform_reduce.inl",
+        "cuda/include/thrust/system/detail/generic/transform_scan.h",
+        "cuda/include/thrust/system/detail/generic/transform_scan.inl",
+        "cuda/include/thrust/system/detail/generic/type_traits.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_copy.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_copy.inl",
+        "cuda/include/thrust/system/detail/generic/uninitialized_fill.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_fill.inl",
+        "cuda/include/thrust/system/detail/generic/unique.h",
+        "cuda/include/thrust/system/detail/generic/unique.inl",
+        "cuda/include/thrust/system/detail/generic/unique_by_key.h",
+        "cuda/include/thrust/system/detail/generic/unique_by_key.inl",
+        "cuda/include/thrust/system/detail/internal/decompose.h",
+        "cuda/include/thrust/system/detail/sequential/adjacent_difference.h",
+        "cuda/include/thrust/system/detail/sequential/assign_value.h",
+        "cuda/include/thrust/system/detail/sequential/binary_search.h",
+        "cuda/include/thrust/system/detail/sequential/copy.h",
+        "cuda/include/thrust/system/detail/sequential/copy.inl",
+        "cuda/include/thrust/system/detail/sequential/copy_backward.h",
+        "cuda/include/thrust/system/detail/sequential/copy_if.h",
+        "cuda/include/thrust/system/detail/sequential/count.h",
+        "cuda/include/thrust/system/detail/sequential/equal.h",
+        "cuda/include/thrust/system/detail/sequential/execution_policy.h",
+        "cuda/include/thrust/system/detail/sequential/extrema.h",
+        "cuda/include/thrust/system/detail/sequential/fill.h",
+        "cuda/include/thrust/system/detail/sequential/find.h",
+        "cuda/include/thrust/system/detail/sequential/for_each.h",
+        "cuda/include/thrust/system/detail/sequential/gather.h",
+        "cuda/include/thrust/system/detail/sequential/general_copy.h",
+        "cuda/include/thrust/system/detail/sequential/generate.h",
+        "cuda/include/thrust/system/detail/sequential/get_value.h",
+        "cuda/include/thrust/system/detail/sequential/inner_product.h",
+        "cuda/include/thrust/system/detail/sequential/insertion_sort.h",
+        "cuda/include/thrust/system/detail/sequential/iter_swap.h",
+        "cuda/include/thrust/system/detail/sequential/logical.h",
+        "cuda/include/thrust/system/detail/sequential/malloc_and_free.h",
+        "cuda/include/thrust/system/detail/sequential/merge.h",
+        "cuda/include/thrust/system/detail/sequential/merge.inl",
+        "cuda/include/thrust/system/detail/sequential/mismatch.h",
+        "cuda/include/thrust/system/detail/sequential/partition.h",
+        "cuda/include/thrust/system/detail/sequential/reduce.h",
+        "cuda/include/thrust/system/detail/sequential/reduce_by_key.h",
+        "cuda/include/thrust/system/detail/sequential/remove.h",
+        "cuda/include/thrust/system/detail/sequential/replace.h",
+        "cuda/include/thrust/system/detail/sequential/reverse.h",
+        "cuda/include/thrust/system/detail/sequential/scan.h",
+        "cuda/include/thrust/system/detail/sequential/scan_by_key.h",
+        "cuda/include/thrust/system/detail/sequential/scatter.h",
+        "cuda/include/thrust/system/detail/sequential/sequence.h",
+        "cuda/include/thrust/system/detail/sequential/set_operations.h",
+        "cuda/include/thrust/system/detail/sequential/sort.h",
+        "cuda/include/thrust/system/detail/sequential/sort.inl",
+        "cuda/include/thrust/system/detail/sequential/stable_merge_sort.h",
+        "cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl",
+        "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h",
+        "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl",
+        "cuda/include/thrust/system/detail/sequential/stable_radix_sort.h",
+        "cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl",
+        "cuda/include/thrust/system/detail/sequential/swap_ranges.h",
+        "cuda/include/thrust/system/detail/sequential/tabulate.h",
+        "cuda/include/thrust/system/detail/sequential/temporary_buffer.h",
+        "cuda/include/thrust/system/detail/sequential/transform.h",
+        "cuda/include/thrust/system/detail/sequential/transform_reduce.h",
+        "cuda/include/thrust/system/detail/sequential/transform_scan.h",
+        "cuda/include/thrust/system/detail/sequential/trivial_copy.h",
+        "cuda/include/thrust/system/detail/sequential/uninitialized_copy.h",
+        "cuda/include/thrust/system/detail/sequential/uninitialized_fill.h",
+        "cuda/include/thrust/system/detail/sequential/unique.h",
+        "cuda/include/thrust/system/detail/sequential/unique_by_key.h",
+        "cuda/include/thrust/system/detail/system_error.inl",
+        "cuda/include/thrust/system/error_code.h",
+        "cuda/include/thrust/system/omp/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/omp/detail/assign_value.h",
+        "cuda/include/thrust/system/omp/detail/binary_search.h",
+        "cuda/include/thrust/system/omp/detail/copy.h",
+        "cuda/include/thrust/system/omp/detail/copy.inl",
+        "cuda/include/thrust/system/omp/detail/copy_if.h",
+        "cuda/include/thrust/system/omp/detail/copy_if.inl",
+        "cuda/include/thrust/system/omp/detail/count.h",
+        "cuda/include/thrust/system/omp/detail/default_decomposition.h",
+        "cuda/include/thrust/system/omp/detail/default_decomposition.inl",
+        "cuda/include/thrust/system/omp/detail/equal.h",
+        "cuda/include/thrust/system/omp/detail/execution_policy.h",
+        "cuda/include/thrust/system/omp/detail/extrema.h",
+        "cuda/include/thrust/system/omp/detail/fill.h",
+        "cuda/include/thrust/system/omp/detail/find.h",
+        "cuda/include/thrust/system/omp/detail/for_each.h",
+        "cuda/include/thrust/system/omp/detail/for_each.inl",
+        "cuda/include/thrust/system/omp/detail/gather.h",
+        "cuda/include/thrust/system/omp/detail/generate.h",
+        "cuda/include/thrust/system/omp/detail/get_value.h",
+        "cuda/include/thrust/system/omp/detail/inner_product.h",
+        "cuda/include/thrust/system/omp/detail/iter_swap.h",
+        "cuda/include/thrust/system/omp/detail/logical.h",
+        "cuda/include/thrust/system/omp/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/omp/detail/memory.inl",
+        "cuda/include/thrust/system/omp/detail/merge.h",
+        "cuda/include/thrust/system/omp/detail/mismatch.h",
+        "cuda/include/thrust/system/omp/detail/par.h",
+        "cuda/include/thrust/system/omp/detail/partition.h",
+        "cuda/include/thrust/system/omp/detail/partition.inl",
+        "cuda/include/thrust/system/omp/detail/reduce.h",
+        "cuda/include/thrust/system/omp/detail/reduce.inl",
+        "cuda/include/thrust/system/omp/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/omp/detail/reduce_by_key.inl",
+        "cuda/include/thrust/system/omp/detail/reduce_intervals.h",
+        "cuda/include/thrust/system/omp/detail/reduce_intervals.inl",
+        "cuda/include/thrust/system/omp/detail/remove.h",
+        "cuda/include/thrust/system/omp/detail/remove.inl",
+        "cuda/include/thrust/system/omp/detail/replace.h",
+        "cuda/include/thrust/system/omp/detail/reverse.h",
+        "cuda/include/thrust/system/omp/detail/scan.h",
+        "cuda/include/thrust/system/omp/detail/scan_by_key.h",
+        "cuda/include/thrust/system/omp/detail/scatter.h",
+        "cuda/include/thrust/system/omp/detail/sequence.h",
+        "cuda/include/thrust/system/omp/detail/set_operations.h",
+        "cuda/include/thrust/system/omp/detail/sort.h",
+        "cuda/include/thrust/system/omp/detail/sort.inl",
+        "cuda/include/thrust/system/omp/detail/swap_ranges.h",
+        "cuda/include/thrust/system/omp/detail/tabulate.h",
+        "cuda/include/thrust/system/omp/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/omp/detail/transform.h",
+        "cuda/include/thrust/system/omp/detail/transform_reduce.h",
+        "cuda/include/thrust/system/omp/detail/transform_scan.h",
+        "cuda/include/thrust/system/omp/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/omp/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/omp/detail/unique.h",
+        "cuda/include/thrust/system/omp/detail/unique.inl",
+        "cuda/include/thrust/system/omp/detail/unique_by_key.h",
+        "cuda/include/thrust/system/omp/detail/unique_by_key.inl",
+        "cuda/include/thrust/system/omp/detail/vector.inl",
+        "cuda/include/thrust/system/omp/execution_policy.h",
+        "cuda/include/thrust/system/omp/memory.h",
+        "cuda/include/thrust/system/omp/vector.h",
+        "cuda/include/thrust/system/system_error.h",
+        "cuda/include/thrust/system/tbb/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/tbb/detail/assign_value.h",
+        "cuda/include/thrust/system/tbb/detail/binary_search.h",
+        "cuda/include/thrust/system/tbb/detail/copy.h",
+        "cuda/include/thrust/system/tbb/detail/copy.inl",
+        "cuda/include/thrust/system/tbb/detail/copy_if.h",
+        "cuda/include/thrust/system/tbb/detail/copy_if.inl",
+        "cuda/include/thrust/system/tbb/detail/count.h",
+        "cuda/include/thrust/system/tbb/detail/equal.h",
+        "cuda/include/thrust/system/tbb/detail/execution_policy.h",
+        "cuda/include/thrust/system/tbb/detail/extrema.h",
+        "cuda/include/thrust/system/tbb/detail/fill.h",
+        "cuda/include/thrust/system/tbb/detail/find.h",
+        "cuda/include/thrust/system/tbb/detail/for_each.h",
+        "cuda/include/thrust/system/tbb/detail/for_each.inl",
+        "cuda/include/thrust/system/tbb/detail/gather.h",
+        "cuda/include/thrust/system/tbb/detail/generate.h",
+        "cuda/include/thrust/system/tbb/detail/get_value.h",
+        "cuda/include/thrust/system/tbb/detail/inner_product.h",
+        "cuda/include/thrust/system/tbb/detail/iter_swap.h",
+        "cuda/include/thrust/system/tbb/detail/logical.h",
+        "cuda/include/thrust/system/tbb/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/tbb/detail/memory.inl",
+        "cuda/include/thrust/system/tbb/detail/merge.h",
+        "cuda/include/thrust/system/tbb/detail/merge.inl",
+        "cuda/include/thrust/system/tbb/detail/mismatch.h",
+        "cuda/include/thrust/system/tbb/detail/par.h",
+        "cuda/include/thrust/system/tbb/detail/partition.h",
+        "cuda/include/thrust/system/tbb/detail/partition.inl",
+        "cuda/include/thrust/system/tbb/detail/reduce.h",
+        "cuda/include/thrust/system/tbb/detail/reduce.inl",
+        "cuda/include/thrust/system/tbb/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/tbb/detail/reduce_by_key.inl",
+        "cuda/include/thrust/system/tbb/detail/reduce_intervals.h",
+        "cuda/include/thrust/system/tbb/detail/remove.h",
+        "cuda/include/thrust/system/tbb/detail/remove.inl",
+        "cuda/include/thrust/system/tbb/detail/replace.h",
+        "cuda/include/thrust/system/tbb/detail/reverse.h",
+        "cuda/include/thrust/system/tbb/detail/scan.h",
+        "cuda/include/thrust/system/tbb/detail/scan.inl",
+        "cuda/include/thrust/system/tbb/detail/scan_by_key.h",
+        "cuda/include/thrust/system/tbb/detail/scatter.h",
+        "cuda/include/thrust/system/tbb/detail/sequence.h",
+        "cuda/include/thrust/system/tbb/detail/set_operations.h",
+        "cuda/include/thrust/system/tbb/detail/sort.h",
+        "cuda/include/thrust/system/tbb/detail/sort.inl",
+        "cuda/include/thrust/system/tbb/detail/swap_ranges.h",
+        "cuda/include/thrust/system/tbb/detail/tabulate.h",
+        "cuda/include/thrust/system/tbb/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/tbb/detail/transform.h",
+        "cuda/include/thrust/system/tbb/detail/transform_reduce.h",
+        "cuda/include/thrust/system/tbb/detail/transform_scan.h",
+        "cuda/include/thrust/system/tbb/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/tbb/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/tbb/detail/unique.h",
+        "cuda/include/thrust/system/tbb/detail/unique.inl",
+        "cuda/include/thrust/system/tbb/detail/unique_by_key.h",
+        "cuda/include/thrust/system/tbb/detail/unique_by_key.inl",
+        "cuda/include/thrust/system/tbb/detail/vector.inl",
+        "cuda/include/thrust/system/tbb/execution_policy.h",
+        "cuda/include/thrust/system/tbb/memory.h",
+        "cuda/include/thrust/system/tbb/vector.h",
+        "cuda/include/thrust/system_error.h",
+        "cuda/include/thrust/tabulate.h",
+        "cuda/include/thrust/transform.h",
+        "cuda/include/thrust/transform_reduce.h",
+        "cuda/include/thrust/transform_scan.h",
+        "cuda/include/thrust/tuple.h",
+        "cuda/include/thrust/uninitialized_copy.h",
+        "cuda/include/thrust/uninitialized_fill.h",
+        "cuda/include/thrust/unique.h",
+        "cuda/include/thrust/version.h",
+        "cuda/include/vector_functions.h",
+        "cuda/include/vector_functions.hpp",
+        "cuda/include/vector_types.h",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp -f "/usr/local/cuda-10.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp -f "/usr/local/cuda-10.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp -f "/usr/local/cuda-10.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp -f "/usr/local/cuda-10.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp -f "/usr/local/cuda-10.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp -f "/usr/local/cuda-10.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp -f "/usr/local/cuda-10.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp -f "/usr/local/cuda-10.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp -f "/usr/local/cuda-10.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp -f "/usr/local/cuda-10.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp -f "/usr/local/cuda-10.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp -f "/usr/local/cuda-10.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp -f "/usr/local/cuda-10.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp -f "/usr/local/cuda-10.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp -f "/usr/local/cuda-10.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp -f "/usr/local/cuda-10.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp -f "/usr/local/cuda-10.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp -f "/usr/local/cuda-10.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp -f "/usr/local/cuda-10.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp -f "/usr/local/cuda-10.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp -f "/usr/local/cuda-10.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp -f "/usr/local/cuda-10.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp -f "/usr/local/cuda-10.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_egl_interop.h" "$(@D)/cuda/include/cuda_egl_interop.h" && cp -f "/usr/local/cuda-10.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp -f "/usr/local/cuda-10.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp -f "/usr/local/cuda-10.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp -f "/usr/local/cuda-10.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp -f "/usr/local/cuda-10.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp -f "/usr/local/cuda-10.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp -f "/usr/local/cuda-10.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp -f "/usr/local/cuda-10.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp -f "/usr/local/cuda-10.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp -f "/usr/local/cuda-10.0/include/cudart_platform.h" "$(@D)/cuda/include/cudart_platform.h" && cp -f "/usr/local/cuda-10.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp -f "/usr/local/cuda-10.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp -f "/usr/local/cuda-10.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp -f "/usr/local/cuda-10.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp -f "/usr/local/cuda-10.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp -f "/usr/local/cuda-10.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp -f "/usr/local/cuda-10.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp -f "/usr/local/cuda-10.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp -f "/usr/local/cuda-10.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp -f "/usr/local/cuda-10.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp -f "/usr/local/cuda-10.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp -f "/usr/local/cuda-10.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp -f "/usr/local/cuda-10.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp -f "/usr/local/cuda-10.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp -f "/usr/local/cuda-10.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp -f "/usr/local/cuda-10.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp -f "/usr/local/cuda-10.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp -f "/usr/local/cuda-10.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp -f "/usr/local/cuda-10.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp -f "/usr/local/cuda-10.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp -f "/usr/local/cuda-10.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp -f "/usr/local/cuda-10.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp -f "/usr/local/cuda-10.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp -f "/usr/local/cuda-10.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp -f "/usr/local/cuda-10.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp -f "/usr/local/cuda-10.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp -f "/usr/local/cuda-10.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp -f "/usr/local/cuda-10.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp -f "/usr/local/cuda-10.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp -f "/usr/local/cuda-10.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp -f "/usr/local/cuda-10.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp -f "/usr/local/cuda-10.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp -f "/usr/local/cuda-10.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp -f "/usr/local/cuda-10.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp -f "/usr/local/cuda-10.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp -f "/usr/local/cuda-10.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp -f "/usr/local/cuda-10.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp -f "/usr/local/cuda-10.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp -f "/usr/local/cuda-10.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp -f "/usr/local/cuda-10.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-10.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp -f "/usr/local/cuda-10.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp -f "/usr/local/cuda-10.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp -f "/usr/local/cuda-10.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp -f "/usr/local/cuda-10.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp -f "/usr/local/cuda-10.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp -f "/usr/local/cuda-10.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp -f "/usr/local/cuda-10.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp -f "/usr/local/cuda-10.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-10.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp -f "/usr/local/cuda-10.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp -f "/usr/local/cuda-10.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp -f "/usr/local/cuda-10.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp -f "/usr/local/cuda-10.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp -f "/usr/local/cuda-10.0/include/nvjpeg.h" "$(@D)/cuda/include/nvjpeg.h" && cp -f "/usr/local/cuda-10.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp -f "/usr/local/cuda-10.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExt.h" "$(@D)/cuda/include/nvtx3/nvToolsExt.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtCuda.h" "$(@D)/cuda/include/nvtx3/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvtx3/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtOpenCL.h" "$(@D)/cuda/include/nvtx3/nvToolsExtOpenCL.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtSync.h" "$(@D)/cuda/include/nvtx3/nvToolsExtSync.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImpl.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImpl.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCore.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInit.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInit.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInitDecls.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInitDefs.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxLinkOnce.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxTypes.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxTypes.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp -f "/usr/local/cuda-10.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp -f "/usr/local/cuda-10.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp -f "/usr/local/cuda-10.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp -f "/usr/local/cuda-10.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp -f "/usr/local/cuda-10.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp -f "/usr/local/cuda-10.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp -f "/usr/local/cuda-10.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp -f "/usr/local/cuda-10.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/alignment.h" "$(@D)/cuda/include/thrust/detail/alignment.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/preprocessor.h" "$(@D)/cuda/include/thrust/detail/preprocessor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp -f "/usr/local/cuda-10.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp -f "/usr/local/cuda-10.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp -f "/usr/local/cuda-10.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp -f "/usr/local/cuda-10.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp -f "/usr/local/cuda-10.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
+   """,
+)
+
+genrule(
+    name = "cuda-nvvm",
+    outs = [
+        "cuda/nvvm/libdevice/libdevice.10.bc",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
+   """,
+)
+
+genrule(
+    name = "cuda-extras",
+    outs = [
+        "cuda/extras/CUPTI/include/GL/gl.h",
+        "cuda/extras/CUPTI/include/GL/glew.h",
+        "cuda/extras/CUPTI/include/GL/glext.h",
+        "cuda/extras/CUPTI/include/GL/glu.h",
+        "cuda/extras/CUPTI/include/GL/glut.h",
+        "cuda/extras/CUPTI/include/GL/glx.h",
+        "cuda/extras/CUPTI/include/GL/glxext.h",
+        "cuda/extras/CUPTI/include/GL/wglew.h",
+        "cuda/extras/CUPTI/include/GL/wglext.h",
+        "cuda/extras/CUPTI/include/cuda_stdint.h",
+        "cuda/extras/CUPTI/include/cupti.h",
+        "cuda/extras/CUPTI/include/cupti_activity.h",
+        "cuda/extras/CUPTI/include/cupti_callbacks.h",
+        "cuda/extras/CUPTI/include/cupti_driver_cbid.h",
+        "cuda/extras/CUPTI/include/cupti_events.h",
+        "cuda/extras/CUPTI/include/cupti_metrics.h",
+        "cuda/extras/CUPTI/include/cupti_nvtx_cbid.h",
+        "cuda/extras/CUPTI/include/cupti_result.h",
+        "cuda/extras/CUPTI/include/cupti_runtime_cbid.h",
+        "cuda/extras/CUPTI/include/cupti_version.h",
+        "cuda/extras/CUPTI/include/generated_cudaGL_meta.h",
+        "cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h",
+        "cuda/extras/CUPTI/include/generated_nvtx_meta.h",
+        "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
+        "cuda/extras/CUPTI/include/openmp/cupti_openmp.h",
+        "cuda/extras/CUPTI/include/openmp/ompt.h",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openmp/cupti_openmp.h" "$(@D)/cuda/extras/CUPTI/include/openmp/cupti_openmp.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openmp/ompt.h" "$(@D)/cuda/extras/CUPTI/include/openmp/ompt.h"
+   """,
+)
+
+genrule(
+    name = "cuda-lib",
+    outs = [
+        "cuda/lib/libcuda.so",
+        "cuda/lib/libcudart.so.10.0",
+        "cuda/lib/libcudart_static.a",
+        "cuda/lib/libcublas.so.10.0",
+        "cuda/lib/libcusolver.so.10.0",
+        "cuda/lib/libcurand.so.10.0",
+        "cuda/lib/libcufft.so.10.0",
+        "cuda/lib/libcudnn.so.7",
+        "cuda/lib/libcupti.so.10.0",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0.130" "$(@D)/cuda/lib/libcudart.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcublas.so.10.0.130" "$(@D)/cuda/lib/libcublas.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcusolver.so.10.0.130" "$(@D)/cuda/lib/libcusolver.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcurand.so.10.0.130" "$(@D)/cuda/lib/libcurand.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcufft.so.10.0.145" "$(@D)/cuda/lib/libcufft.so.10.0" && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.3.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0.130" "$(@D)/cuda/lib/libcupti.so.10.0"
+   """,
+)
+
+genrule(
+    name = "cudnn-include",
+    outs = [
+        "cuda/include/cudnn.h",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/cudnn.h" "$(@D)/cudnn.h"
+   """,
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl
new file mode 100755
index 0000000000000000000000000000000000000000..a53c891d8bba1b80a880ddd9c16091db27861a8d
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl
@@ -0,0 +1,31 @@
+# Macros for building CUDA code.
+def if_cuda(if_true, if_false = []):
+    """Shorthand for select()'ing on whether we're building with CUDA.
+
+    Returns a select statement which evaluates to if_true if we're building
+    with CUDA enabled.  Otherwise, the select statement evaluates to if_false.
+
+    """
+    return select({
+        "@local_config_cuda//cuda:using_nvcc": if_true,
+        "@local_config_cuda//cuda:using_clang": if_true,
+        "//conditions:default": if_false,
+    })
+
+def cuda_default_copts():
+    """Default options for all CUDA compilations."""
+    return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + [])
+
+def cuda_is_configured():
+    """Returns true if CUDA was enabled during the configure process."""
+    return True
+
+def if_cuda_is_configured(x):
+    """Tests if the CUDA was enabled during the configure process.
+
+    Unlike if_cuda(), this does not require that we are building with
+    --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries.
+    """
+    if cuda_is_configured():
+        return x
+    return []
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
new file mode 100755
index 0000000000000000000000000000000000000000..0934618e0b538ab0db2a969870c85aa9c4053130
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
@@ -0,0 +1,26 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef CUDA_CUDA_CONFIG_H_
+#define CUDA_CUDA_CONFIG_H_
+
+#define TF_CUDA_CAPABILITIES CudaVersion("3.0")
+
+#define TF_CUDA_VERSION "10.0"
+#define TF_CUDNN_VERSION "7"
+
+#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-10.0"
+
+#endif  // CUDA_CUDA_CONFIG_H_
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
index 05abcb56d84789844616f1c884021ca9ea9eca10..c6930904b564bf2cce70b484a0e7b0759f13b7c9 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
@@ -1188,7 +1188,7 @@ genrule(
         "cuda/include/vector_types.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp -f "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp -f "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp -f "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp -f "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp -f "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp -f "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp -f "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp -f "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp -f "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp -f "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp -f "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp -f "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp -f "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp -f "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp -f "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp -f "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp -f "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp -f "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp -f "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp -f "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp -f "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp -f "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp -f "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp -f "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp -f "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp -f "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp -f "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp -f "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp -f "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp -f "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp -f "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp -f "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp -f "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp -f "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp -f "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp -f "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp -f "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp -f "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp -f "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp -f "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp -f "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp -f "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp -f "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp -f "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp -f "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp -f "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp -f "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp -f "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp -f "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp -f "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp -f "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp -f "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp -f "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp -f "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp -f "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp -f "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp -f "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp -f "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp -f "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp -f "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp -f "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp -f "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
    """,
 )
 
@@ -1198,7 +1198,7 @@ genrule(
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
    """,
 )
 
@@ -1235,7 +1235,7 @@ genrule(
         "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
    """,
 )
 
@@ -1253,7 +1253,7 @@ genrule(
         "cuda/lib/libcupti.so.9.0",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.1.4" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.1.4" "$(@D)/cuda/lib/libcudnn.so.7" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
    """,
 )
 
@@ -1263,6 +1263,6 @@ genrule(
         "cuda/include/cudnn.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/include/cudnn.h" "$(@D)/cudnn.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/cudnn.h" "$(@D)/cudnn.h"
    """,
 )
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
index 5c6703aab4fbdaf92c5b63a5c0f2600ad699c0cf..a53c891d8bba1b80a880ddd9c16091db27861a8d 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
@@ -9,15 +9,13 @@ def if_cuda(if_true, if_false = []):
     return select({
         "@local_config_cuda//cuda:using_nvcc": if_true,
         "@local_config_cuda//cuda:using_clang": if_true,
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
 
-
 def cuda_default_copts():
     """Default options for all CUDA compilations."""
     return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + [])
 
-
 def cuda_is_configured():
     """Returns true if CUDA was enabled during the configure process."""
     return True
@@ -29,5 +27,5 @@ def if_cuda_is_configured(x):
     --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries.
     """
     if cuda_is_configured():
-      return x
+        return x
     return []
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..6442e7628a416e3298cfd2579cee275459780145
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
@@ -0,0 +1,87 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+toolchain(
+    name = "toolchain-linux-x86_64",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    toolchain = ":cc-compiler-local",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+        "darwin|compiler": ":cc-compiler-darwin",
+        "x64_windows|msvc-cl": ":cc-compiler-windows",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    # To support linker flags that need to go to the start of command line
+    # we need the toolchain to support parameter files. Parameter files are
+    # last on the command line and contain all shared libraries to link, so all
+    # regular options will be left of them.
+    supports_param_files = 1,
+)
+
+cc_toolchain(
+    name = "cc-compiler-darwin",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "darwin",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+)
+
+cc_toolchain(
+    name = "cc-compiler-windows",
+    all_files = ":windows_msvc_wrapper_files",
+    compiler_files = ":empty",
+    cpu = "x64_windows",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":windows_msvc_wrapper_files",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "crosstool_wrapper_driver_is_not_gcc",
+    srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"],
+)
+
+filegroup(
+    name = "windows_msvc_wrapper_files",
+    srcs = glob(["windows/msvc_*"]),
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..1c2e8bcae63ebc9b1ee22b5d677c185589b547f8
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
@@ -0,0 +1,1431 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "piii"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "arm"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "darwin"
+  toolchain_identifier: "local_darwin"
+}
+default_toolchain {
+  cpu: "ppc"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "x64_windows"
+  toolchain_identifier: "local_windows"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lstdc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: "alwayslink"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,-no-as-needed"
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "build-id"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        # Stamp the binary with a unique identifier.
+        flag: "-Wl,--build-id=md5"
+        flag: "-Wl,--hash-style=gnu"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-no-canonical-prefixes"
+        flag: "-fno-canonical-system-headers"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "alwayslink"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "build-id"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,--gc-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "macosx"
+  target_cpu: "darwin"
+  target_system_name: "local"
+  toolchain_identifier: "local_darwin"
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag:"-no-canonical-prefixes"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "undefined-dynamic"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-undefined"
+        flag: "dynamic_lookup"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+    implies: "undefined-dynamic"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/libtool" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  toolchain_identifier: "local_windows"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_interface_shared_objects: true
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+  compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Use unsigned char by default.
+  compiler_flag: "/J"
+  # Use function level linking.
+  compiler_flag: "/Gy"
+  # Use string pooling.
+  compiler_flag: "/GF"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FULL"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FASTLINK"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2"
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
new file mode 100755
index 0000000000000000000000000000000000000000..7ae59e9967adf9b1a980a8085e203459ba8a7c7b
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs.
+
+SYNOPSIS:
+  crosstool_wrapper_is_not_gcc [options passed in by cc_library()
+                                or cc_binary() rule]
+
+DESCRIPTION:
+  This script is expected to be called by the cc_library() or cc_binary() bazel
+  rules. When the option "-x cuda" is present in the list of arguments passed
+  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed
+  as is as a string to --compiler-options of nvcc. When "-x cuda" is not
+  present, this wrapper invokes hybrid_driver_is_not_gcc with the input
+  arguments as is.
+
+NOTES:
+  Changes to the contents of this file must be propagated from
+  //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to
+  //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+__author__ = 'keveman@google.com (Manjunath Kudlur)'
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
+PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+NVCC_VERSION = '10.0'
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from the argv list.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    option: The option whose value to extract, without the leading '-'.
+
+  Returns:
+    A list of values, either directly following the option,
+    (eg., -opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., -opt val1 -opt val2).
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-' + option, nargs='*', action='append')
+  args, _ = parser.parse_known_args(argv)
+  if not args or not vars(args)[option]:
+    return []
+  else:
+    return sum(vars(args)[option], [])
+
+
+def GetHostCompilerOptions(argv):
+  """Collect the -isystem, -iquote, and --sysroot option values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be used as the --compiler-options to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-isystem', nargs='*', action='append')
+  parser.add_argument('-iquote', nargs='*', action='append')
+  parser.add_argument('--sysroot', nargs=1)
+  parser.add_argument('-g', nargs='*', action='append')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
+
+  args, _ = parser.parse_known_args(argv)
+
+  opts = ''
+
+  if args.isystem:
+    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
+  if args.iquote:
+    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
+  if args.g:
+    opts += ' -g' + ' -g'.join(sum(args.g, []))
+  if args.fno_canonical_system_headers:
+    opts += ' -fno-canonical-system-headers'
+  if args.sysroot:
+    opts += ' --sysroot ' + args.sysroot[0]
+
+  return opts
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be passed directly to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, _ = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return ' '.join(['--'+a for a in options])
+  return ''
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  host_compiler_options = GetHostCompilerOptions(argv)
+  nvcc_compiler_options = GetNvccOptions(argv)
+  opt_option = GetOptionValue(argv, 'O')
+  m_options = GetOptionValue(argv, 'm')
+  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
+  include_options = GetOptionValue(argv, 'I')
+  out_file = GetOptionValue(argv, 'o')
+  depfiles = GetOptionValue(argv, 'MF')
+  defines = GetOptionValue(argv, 'D')
+  defines = ''.join([' -D' + define for define in defines])
+  undefines = GetOptionValue(argv, 'U')
+  undefines = ''.join([' -U' + define for define in undefines])
+  std_options = GetOptionValue(argv, 'std')
+  # currently only c++11 is supported by Cuda 7.0 std argument
+  nvcc_allowed_std_options = ["c++11"]
+  std_options = ''.join([' -std=' + define
+      for define in std_options if define in nvcc_allowed_std_options])
+
+  # The list of source files get passed after the -c option. I don't know of
+  # any other reliable way to just get the list of source files to be compiled.
+  src_files = GetOptionValue(argv, 'c')
+
+  # Pass -w through from host to nvcc, but don't do anything fancier with
+  # warnings-related flags, since they're not necessarily the same across
+  # compilers.
+  warning_options = ' -w' if '-w' in argv else ''
+
+  if len(src_files) == 0:
+    return 1
+  if len(out_file) != 1:
+    return 1
+
+  opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0)
+         else ' -g -G')
+
+  includes = (' -I ' + ' -I '.join(include_options)
+              if len(include_options) > 0
+              else '')
+
+  # Unfortunately, there are other options that have -c prefix too.
+  # So allowing only those look like C/C++ files.
+  src_files = [f for f in src_files if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  srcs = ' '.join(src_files)
+  out = ' -o ' + out_file[0]
+
+  supported_cuda_compute_capabilities = [ "3.0" ]
+  nvccopts = '-D_FORCE_INLINES '
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % (
+        capability, capability, capability)
+  nvccopts += ' ' + nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += std_options
+  nvccopts += m_options
+  nvccopts += warning_options
+
+  if depfiles:
+    # Generate the dependency file
+    depfile = depfiles[0]
+    cmd = (NVCC_PATH + ' ' + nvccopts +
+           ' --compiler-options "' + host_compiler_options + '"' +
+           ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+           ' -I .' +
+           ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile)
+    if log: Log(cmd)
+    exit_status = os.system(cmd)
+    if exit_status != 0:
+      return exit_status
+
+  cmd = (NVCC_PATH + ' ' + nvccopts +
+         ' --compiler-options "' + host_compiler_options + ' -fPIC"' +
+         ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+         ' -I .' +
+         ' -x cu ' + opt + includes + ' -c ' + srcs + out)
+
+  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
+  # Need to investigate and fix.
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
+  if log: Log(cmd)
+  return os.system(cmd)
+
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.bat b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.bat
new file mode 100755
index 0000000000000000000000000000000000000000..e896e654fd7ecd578c80d102895f51ce18bbd4eb
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.bat
@@ -0,0 +1,20 @@
+:: Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+::     http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+:: =============================================================================
+
+:: Invoke msvc_wrapper_for_nvcc.py, which is located in the same directory.
+@echo OFF
+set arg0=%~0
+for %%F in ("%arg0%") do set DRIVER_BIN=%%~dpF
+"/usr/bin/python3" -B "%DRIVER_BIN%\msvc_wrapper_for_nvcc.py" %*
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
new file mode 100755
index 0000000000000000000000000000000000000000..00483951af966e0085e6f2b1d74290d9ee872963
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows.
+
+DESCRIPTION:
+  This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
+NVCC_VERSION = '10.0'
+NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
+supported_cuda_compute_capabilities = [ "3.0" ]
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from options.
+
+  Args:
+    option: The option whose value to extract, without the leading '/'.
+
+  Returns:
+    1. A list of values, either directly following the option,
+    (eg., /opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., /opt val1 /opt val2).
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser(prefix_chars='/')
+  parser.add_argument('/' + option, nargs='*', action='append')
+  args, leftover = parser.parse_known_args(argv)
+  if args and vars(args)[option]:
+    return (sum(vars(args)[option], []), leftover)
+  return ([], leftover)
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    1. The string that can be passed directly to nvcc.
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, leftover = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return (['--' + a for a in options], leftover)
+  return ([], leftover)
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  src_files = [f for f in argv if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  if len(src_files) == 0:
+    raise Error('No source files found for cuda compilation.')
+
+  out_file = [ f for f in argv if f.startswith('/Fo') ]
+  if len(out_file) != 1:
+    raise Error('Please sepecify exactly one output file for cuda compilation.')
+  out = ['-o', out_file[0][len('/Fo'):]]
+
+  nvcc_compiler_options, argv = GetNvccOptions(argv)
+
+  opt_option, argv = GetOptionValue(argv, 'O')
+  opt = ['-g', '-G']
+  if (len(opt_option) > 0 and opt_option[0] != 'd'):
+    opt = ['-O2']
+
+  include_options, argv = GetOptionValue(argv, 'I')
+  includes = ["-I " + include for include in include_options]
+
+  defines, argv = GetOptionValue(argv, 'D')
+  defines = ['-D' + define for define in defines]
+
+  undefines, argv = GetOptionValue(argv, 'U')
+  undefines = ['-U' + define for define in undefines]
+
+  # The rest of the unrecongized options should be passed to host compiler
+  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]
+
+  m_options = ["-m64"]
+
+  nvccopts = ['-D_FORCE_INLINES']
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % (
+        capability, capability, capability)]
+  nvccopts += nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += m_options
+  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
+  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
+  # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP
+  # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check
+  # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver
+  # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists.
+  if os.path.isfile(NVCC_TEMP_DIR):
+    os.remove(NVCC_TEMP_DIR)
+  if not os.path.exists(NVCC_TEMP_DIR):
+    os.makedirs(NVCC_TEMP_DIR)
+  nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR]
+  cmd = [NVCC_PATH] + nvccopts
+  if log:
+    Log(cmd)
+  proc = subprocess.Popen(cmd,
+                          stdout=sys.stdout,
+                          stderr=sys.stderr,
+                          env=os.environ.copy(),
+                          shell=True)
+  proc.wait()
+  return proc.returncode
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))
+                             and not flag.startswith(('-nvcc_options'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..6442e7628a416e3298cfd2579cee275459780145
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
@@ -0,0 +1,87 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+toolchain(
+    name = "toolchain-linux-x86_64",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    toolchain = ":cc-compiler-local",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+        "darwin|compiler": ":cc-compiler-darwin",
+        "x64_windows|msvc-cl": ":cc-compiler-windows",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    # To support linker flags that need to go to the start of command line
+    # we need the toolchain to support parameter files. Parameter files are
+    # last on the command line and contain all shared libraries to link, so all
+    # regular options will be left of them.
+    supports_param_files = 1,
+)
+
+cc_toolchain(
+    name = "cc-compiler-darwin",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "darwin",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+)
+
+cc_toolchain(
+    name = "cc-compiler-windows",
+    all_files = ":windows_msvc_wrapper_files",
+    compiler_files = ":empty",
+    cpu = "x64_windows",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":windows_msvc_wrapper_files",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "crosstool_wrapper_driver_is_not_gcc",
+    srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"],
+)
+
+filegroup(
+    name = "windows_msvc_wrapper_files",
+    srcs = glob(["windows/msvc_*"]),
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..0d89a539b8d70788eb0f6924636824fba778a058
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/CROSSTOOL
@@ -0,0 +1,1431 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "piii"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "arm"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "darwin"
+  toolchain_identifier: "local_darwin"
+}
+default_toolchain {
+  cpu: "ppc"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "x64_windows"
+  toolchain_identifier: "local_windows"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lstdc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: "alwayslink"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,-no-as-needed"
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "build-id"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        # Stamp the binary with a unique identifier.
+        flag: "-Wl,--build-id=md5"
+        flag: "-Wl,--hash-style=gnu"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-no-canonical-prefixes"
+        flag: "-fno-canonical-system-headers"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "alwayslink"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "build-id"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,--gc-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "macosx"
+  target_cpu: "darwin"
+  target_system_name: "local"
+  toolchain_identifier: "local_darwin"
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag:"-no-canonical-prefixes"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "undefined-dynamic"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-undefined"
+        flag: "dynamic_lookup"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+    implies: "undefined-dynamic"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/libtool" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  toolchain_identifier: "local_windows"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_interface_shared_objects: true
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+  compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Use unsigned char by default.
+  compiler_flag: "/J"
+  # Use function level linking.
+  compiler_flag: "/Gy"
+  # Use string pooling.
+  compiler_flag: "/GF"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FULL"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FASTLINK"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2"
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
new file mode 100755
index 0000000000000000000000000000000000000000..63893d3722f6b43579758e5f747076b1f1e73ed7
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs.
+
+SYNOPSIS:
+  crosstool_wrapper_is_not_gcc [options passed in by cc_library()
+                                or cc_binary() rule]
+
+DESCRIPTION:
+  This script is expected to be called by the cc_library() or cc_binary() bazel
+  rules. When the option "-x cuda" is present in the list of arguments passed
+  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed
+  as is as a string to --compiler-options of nvcc. When "-x cuda" is not
+  present, this wrapper invokes hybrid_driver_is_not_gcc with the input
+  arguments as is.
+
+NOTES:
+  Changes to the contents of this file must be propagated from
+  //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to
+  //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+__author__ = 'keveman@google.com (Manjunath Kudlur)'
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-9.0/bin/nvcc'
+PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+NVCC_VERSION = '9.0'
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from the argv list.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    option: The option whose value to extract, without the leading '-'.
+
+  Returns:
+    A list of values, either directly following the option,
+    (eg., -opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., -opt val1 -opt val2).
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-' + option, nargs='*', action='append')
+  args, _ = parser.parse_known_args(argv)
+  if not args or not vars(args)[option]:
+    return []
+  else:
+    return sum(vars(args)[option], [])
+
+
+def GetHostCompilerOptions(argv):
+  """Collect the -isystem, -iquote, and --sysroot option values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be used as the --compiler-options to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-isystem', nargs='*', action='append')
+  parser.add_argument('-iquote', nargs='*', action='append')
+  parser.add_argument('--sysroot', nargs=1)
+  parser.add_argument('-g', nargs='*', action='append')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
+
+  args, _ = parser.parse_known_args(argv)
+
+  opts = ''
+
+  if args.isystem:
+    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
+  if args.iquote:
+    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
+  if args.g:
+    opts += ' -g' + ' -g'.join(sum(args.g, []))
+  if args.fno_canonical_system_headers:
+    opts += ' -fno-canonical-system-headers'
+  if args.sysroot:
+    opts += ' --sysroot ' + args.sysroot[0]
+
+  return opts
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be passed directly to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, _ = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return ' '.join(['--'+a for a in options])
+  return ''
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  host_compiler_options = GetHostCompilerOptions(argv)
+  nvcc_compiler_options = GetNvccOptions(argv)
+  opt_option = GetOptionValue(argv, 'O')
+  m_options = GetOptionValue(argv, 'm')
+  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
+  include_options = GetOptionValue(argv, 'I')
+  out_file = GetOptionValue(argv, 'o')
+  depfiles = GetOptionValue(argv, 'MF')
+  defines = GetOptionValue(argv, 'D')
+  defines = ''.join([' -D' + define for define in defines])
+  undefines = GetOptionValue(argv, 'U')
+  undefines = ''.join([' -U' + define for define in undefines])
+  std_options = GetOptionValue(argv, 'std')
+  # currently only c++11 is supported by Cuda 7.0 std argument
+  nvcc_allowed_std_options = ["c++11"]
+  std_options = ''.join([' -std=' + define
+      for define in std_options if define in nvcc_allowed_std_options])
+
+  # The list of source files get passed after the -c option. I don't know of
+  # any other reliable way to just get the list of source files to be compiled.
+  src_files = GetOptionValue(argv, 'c')
+
+  # Pass -w through from host to nvcc, but don't do anything fancier with
+  # warnings-related flags, since they're not necessarily the same across
+  # compilers.
+  warning_options = ' -w' if '-w' in argv else ''
+
+  if len(src_files) == 0:
+    return 1
+  if len(out_file) != 1:
+    return 1
+
+  opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0)
+         else ' -g -G')
+
+  includes = (' -I ' + ' -I '.join(include_options)
+              if len(include_options) > 0
+              else '')
+
+  # Unfortunately, there are other options that have -c prefix too.
+  # So allowing only those look like C/C++ files.
+  src_files = [f for f in src_files if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  srcs = ' '.join(src_files)
+  out = ' -o ' + out_file[0]
+
+  supported_cuda_compute_capabilities = [ "3.0" ]
+  nvccopts = '-D_FORCE_INLINES '
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % (
+        capability, capability, capability)
+  nvccopts += ' ' + nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += std_options
+  nvccopts += m_options
+  nvccopts += warning_options
+
+  if depfiles:
+    # Generate the dependency file
+    depfile = depfiles[0]
+    cmd = (NVCC_PATH + ' ' + nvccopts +
+           ' --compiler-options "' + host_compiler_options + '"' +
+           ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+           ' -I .' +
+           ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile)
+    if log: Log(cmd)
+    exit_status = os.system(cmd)
+    if exit_status != 0:
+      return exit_status
+
+  cmd = (NVCC_PATH + ' ' + nvccopts +
+         ' --compiler-options "' + host_compiler_options + ' -fPIC"' +
+         ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+         ' -I .' +
+         ' -x cu ' + opt + includes + ' -c ' + srcs + out)
+
+  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
+  # Need to investigate and fix.
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
+  if log: Log(cmd)
+  return os.system(cmd)
+
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.bat b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.bat
new file mode 100755
index 0000000000000000000000000000000000000000..e896e654fd7ecd578c80d102895f51ce18bbd4eb
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.bat
@@ -0,0 +1,20 @@
+:: Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+::     http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+:: =============================================================================
+
+:: Invoke msvc_wrapper_for_nvcc.py, which is located in the same directory.
+@echo OFF
+set arg0=%~0
+for %%F in ("%arg0%") do set DRIVER_BIN=%%~dpF
+"/usr/bin/python3" -B "%DRIVER_BIN%\msvc_wrapper_for_nvcc.py" %*
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
new file mode 100755
index 0000000000000000000000000000000000000000..859b3196d5dba9afadeae56f34be04247b00fe09
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows.
+
+DESCRIPTION:
+  This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-9.0/bin/nvcc'
+NVCC_VERSION = '9.0'
+NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
+supported_cuda_compute_capabilities = [ "3.0" ]
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from options.
+
+  Args:
+    option: The option whose value to extract, without the leading '/'.
+
+  Returns:
+    1. A list of values, either directly following the option,
+    (eg., /opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., /opt val1 /opt val2).
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser(prefix_chars='/')
+  parser.add_argument('/' + option, nargs='*', action='append')
+  args, leftover = parser.parse_known_args(argv)
+  if args and vars(args)[option]:
+    return (sum(vars(args)[option], []), leftover)
+  return ([], leftover)
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    1. The string that can be passed directly to nvcc.
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, leftover = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return (['--' + a for a in options], leftover)
+  return ([], leftover)
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  src_files = [f for f in argv if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  if len(src_files) == 0:
+    raise Error('No source files found for cuda compilation.')
+
+  out_file = [ f for f in argv if f.startswith('/Fo') ]
+  if len(out_file) != 1:
+    raise Error('Please sepecify exactly one output file for cuda compilation.')
+  out = ['-o', out_file[0][len('/Fo'):]]
+
+  nvcc_compiler_options, argv = GetNvccOptions(argv)
+
+  opt_option, argv = GetOptionValue(argv, 'O')
+  opt = ['-g', '-G']
+  if (len(opt_option) > 0 and opt_option[0] != 'd'):
+    opt = ['-O2']
+
+  include_options, argv = GetOptionValue(argv, 'I')
+  includes = ["-I " + include for include in include_options]
+
+  defines, argv = GetOptionValue(argv, 'D')
+  defines = ['-D' + define for define in defines]
+
+  undefines, argv = GetOptionValue(argv, 'U')
+  undefines = ['-U' + define for define in undefines]
+
+  # The rest of the unrecongized options should be passed to host compiler
+  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]
+
+  m_options = ["-m64"]
+
+  nvccopts = ['-D_FORCE_INLINES']
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % (
+        capability, capability, capability)]
+  nvccopts += nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += m_options
+  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
+  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
+  # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP
+  # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check
+  # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver
+  # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists.
+  if os.path.isfile(NVCC_TEMP_DIR):
+    os.remove(NVCC_TEMP_DIR)
+  if not os.path.exists(NVCC_TEMP_DIR):
+    os.makedirs(NVCC_TEMP_DIR)
+  nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR]
+  cmd = [NVCC_PATH] + nvccopts
+  if log:
+    Log(cmd)
+  proc = subprocess.Popen(cmd,
+                          stdout=sys.stdout,
+                          stderr=sys.stderr,
+                          env=os.environ.copy(),
+                          shell=True)
+  proc.wait()
+  return proc.returncode
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))
+                             and not flag.startswith(('-nvcc_options'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
index e021df9e1e3066b597dddc5dc78da3121ddd2430..460c879d32f1381454b6d043bded61e66b02f41d 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
@@ -136,7 +136,7 @@ genrule(
         "python_include/weakrefobject.h",
     ],
     cmd = """
-cp "/usr/include/python3.4m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp "/usr/include/python3.4m/Python.h" "$(@D)/python_include/Python.h" && cp "/usr/include/python3.4m/abstract.h" "$(@D)/python_include/abstract.h" && cp "/usr/include/python3.4m/accu.h" "$(@D)/python_include/accu.h" && cp "/usr/include/python3.4m/asdl.h" "$(@D)/python_include/asdl.h" && cp "/usr/include/python3.4m/ast.h" "$(@D)/python_include/ast.h" && cp "/usr/include/python3.4m/bitset.h" "$(@D)/python_include/bitset.h" && cp "/usr/include/python3.4m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp "/usr/include/python3.4m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp "/usr/include/python3.4m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp "/usr/include/python3.4m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp "/usr/include/python3.4m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp "/usr/include/python3.4m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp "/usr/include/python3.4m/ceval.h" "$(@D)/python_include/ceval.h" && cp "/usr/include/python3.4m/classobject.h" "$(@D)/python_include/classobject.h" && cp "/usr/include/python3.4m/code.h" "$(@D)/python_include/code.h" && cp "/usr/include/python3.4m/codecs.h" "$(@D)/python_include/codecs.h" && cp "/usr/include/python3.4m/compile.h" "$(@D)/python_include/compile.h" && cp "/usr/include/python3.4m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp "/usr/include/python3.4m/datetime.h" "$(@D)/python_include/datetime.h" && cp "/usr/include/python3.4m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp "/usr/include/python3.4m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp "/usr/include/python3.4m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp "/usr/include/python3.4m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp "/usr/include/python3.4m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp "/usr/include/python3.4m/errcode.h" "$(@D)/python_include/errcode.h" && cp "/usr/include/python3.4m/eval.h" "$(@D)/python_include/eval.h" && cp "/usr/include/python3.4m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp "/usr/include/python3.4m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp "/usr/include/python3.4m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp "/usr/include/python3.4m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp "/usr/include/python3.4m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp "/usr/include/python3.4m/genobject.h" "$(@D)/python_include/genobject.h" && cp "/usr/include/python3.4m/graminit.h" "$(@D)/python_include/graminit.h" && cp "/usr/include/python3.4m/grammar.h" "$(@D)/python_include/grammar.h" && cp "/usr/include/python3.4m/import.h" "$(@D)/python_include/import.h" && cp "/usr/include/python3.4m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp "/usr/include/python3.4m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp "/usr/include/python3.4m/listobject.h" "$(@D)/python_include/listobject.h" && cp "/usr/include/python3.4m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp "/usr/include/python3.4m/longobject.h" "$(@D)/python_include/longobject.h" && cp "/usr/include/python3.4m/marshal.h" "$(@D)/python_include/marshal.h" && cp "/usr/include/python3.4m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp "/usr/include/python3.4m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp "/usr/include/python3.4m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp "/usr/include/python3.4m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp "/usr/include/python3.4m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp "/usr/include/python3.4m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp "/usr/include/python3.4m/node.h" "$(@D)/python_include/node.h" && cp "/usr/include/python3.4m/object.h" "$(@D)/python_include/object.h" && cp "/usr/include/python3.4m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp "/usr/include/python3.4m/opcode.h" "$(@D)/python_include/opcode.h" && cp "/usr/include/python3.4m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp "/usr/include/python3.4m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp "/usr/include/python3.4m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp "/usr/include/python3.4m/pgen.h" "$(@D)/python_include/pgen.h" && cp "/usr/include/python3.4m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp "/usr/include/python3.4m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp "/usr/include/python3.4m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp "/usr/include/python3.4m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp "/usr/include/python3.4m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp "/usr/include/python3.4m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp "/usr/include/python3.4m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp "/usr/include/python3.4m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp "/usr/include/python3.4m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp "/usr/include/python3.4m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp "/usr/include/python3.4m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp "/usr/include/python3.4m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp "/usr/include/python3.4m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp "/usr/include/python3.4m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp "/usr/include/python3.4m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp "/usr/include/python3.4m/pymath.h" "$(@D)/python_include/pymath.h" && cp "/usr/include/python3.4m/pymem.h" "$(@D)/python_include/pymem.h" && cp "/usr/include/python3.4m/pyport.h" "$(@D)/python_include/pyport.h" && cp "/usr/include/python3.4m/pystate.h" "$(@D)/python_include/pystate.h" && cp "/usr/include/python3.4m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp "/usr/include/python3.4m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp "/usr/include/python3.4m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp "/usr/include/python3.4m/pythread.h" "$(@D)/python_include/pythread.h" && cp "/usr/include/python3.4m/pytime.h" "$(@D)/python_include/pytime.h" && cp "/usr/include/python3.4m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp "/usr/include/python3.4m/setobject.h" "$(@D)/python_include/setobject.h" && cp "/usr/include/python3.4m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp "/usr/include/python3.4m/structmember.h" "$(@D)/python_include/structmember.h" && cp "/usr/include/python3.4m/structseq.h" "$(@D)/python_include/structseq.h" && cp "/usr/include/python3.4m/symtable.h" "$(@D)/python_include/symtable.h" && cp "/usr/include/python3.4m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp "/usr/include/python3.4m/token.h" "$(@D)/python_include/token.h" && cp "/usr/include/python3.4m/traceback.h" "$(@D)/python_include/traceback.h" && cp "/usr/include/python3.4m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp "/usr/include/python3.4m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp "/usr/include/python3.4m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp "/usr/include/python3.4m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp "/usr/include/python3.4m/warnings.h" "$(@D)/python_include/warnings.h" && cp "/usr/include/python3.4m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
+cp -f "/usr/include/python3.4m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/usr/include/python3.4m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/usr/include/python3.4m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/usr/include/python3.4m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/usr/include/python3.4m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/usr/include/python3.4m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/usr/include/python3.4m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/usr/include/python3.4m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/usr/include/python3.4m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/usr/include/python3.4m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/usr/include/python3.4m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/usr/include/python3.4m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/usr/include/python3.4m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/usr/include/python3.4m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/usr/include/python3.4m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/usr/include/python3.4m/code.h" "$(@D)/python_include/code.h" && cp -f "/usr/include/python3.4m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/usr/include/python3.4m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/usr/include/python3.4m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/usr/include/python3.4m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/usr/include/python3.4m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/usr/include/python3.4m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/usr/include/python3.4m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/usr/include/python3.4m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/usr/include/python3.4m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/usr/include/python3.4m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/usr/include/python3.4m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/usr/include/python3.4m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/usr/include/python3.4m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/usr/include/python3.4m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/usr/include/python3.4m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/usr/include/python3.4m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/usr/include/python3.4m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/usr/include/python3.4m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/usr/include/python3.4m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/usr/include/python3.4m/import.h" "$(@D)/python_include/import.h" && cp -f "/usr/include/python3.4m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/usr/include/python3.4m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/usr/include/python3.4m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/usr/include/python3.4m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/usr/include/python3.4m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/usr/include/python3.4m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/usr/include/python3.4m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/usr/include/python3.4m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/usr/include/python3.4m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/usr/include/python3.4m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/usr/include/python3.4m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/usr/include/python3.4m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/usr/include/python3.4m/node.h" "$(@D)/python_include/node.h" && cp -f "/usr/include/python3.4m/object.h" "$(@D)/python_include/object.h" && cp -f "/usr/include/python3.4m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/usr/include/python3.4m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/usr/include/python3.4m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/usr/include/python3.4m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/usr/include/python3.4m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/usr/include/python3.4m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/usr/include/python3.4m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/usr/include/python3.4m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/usr/include/python3.4m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/usr/include/python3.4m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "/usr/include/python3.4m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/usr/include/python3.4m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/usr/include/python3.4m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/usr/include/python3.4m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/usr/include/python3.4m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/usr/include/python3.4m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/usr/include/python3.4m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/usr/include/python3.4m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/usr/include/python3.4m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/usr/include/python3.4m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/usr/include/python3.4m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/usr/include/python3.4m/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/usr/include/python3.4m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/usr/include/python3.4m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/usr/include/python3.4m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/usr/include/python3.4m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/usr/include/python3.4m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/usr/include/python3.4m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/usr/include/python3.4m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/usr/include/python3.4m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/usr/include/python3.4m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/usr/include/python3.4m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/usr/include/python3.4m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/usr/include/python3.4m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/usr/include/python3.4m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/usr/include/python3.4m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/usr/include/python3.4m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/usr/include/python3.4m/token.h" "$(@D)/python_include/token.h" && cp -f "/usr/include/python3.4m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/usr/include/python3.4m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/usr/include/python3.4m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/usr/include/python3.4m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/usr/include/python3.4m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/usr/include/python3.4m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/usr/include/python3.4m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
    """,
 )
 
@@ -171,6 +171,6 @@ genrule(
         "numpy_include/numpy/utils.h",
     ],
     cmd = """
-cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
+cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
    """,
 )
diff --git a/third_party/toolchains/preconfig/win_1803/BUILD b/third_party/toolchains/preconfig/win_1803/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ac599bc2f3d758ad7094fd9f9b748929d2c8ef7a
--- /dev/null
+++ b/third_party/toolchains/preconfig/win_1803/BUILD
@@ -0,0 +1,26 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+java_runtime(
+    name = "windows_jdk8",
+    srcs = [],
+    java_home = "C:/openjdk",
+)
+
+platform(
+    name = "rbe_windows_1803",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:windows",
+    ],
+    remote_execution_properties = """
+        properties:{
+          name:"container-image"
+          value:"docker://gcr.io/tensorflow-testing/tf-rbe-win@sha256:fbc5713566011cc27fc3651183a6e7c2fd56fc6f006618c53f8fc71e742feebd"
+        }
+        properties:{
+          name: "OSFamily" value: "Windows"
+        }
+        """,
+)
diff --git a/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD b/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..edd958364811d2e063b10f3c2e3a347b601794b5
--- /dev/null
+++ b/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
@@ -0,0 +1,162 @@
+# Copyright 2018 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This becomes the BUILD file for @local_config_cc// under Windows.
+
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "malloc",
+)
+
+cc_library(
+    name = "stl",
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+# Hardcoded toolchain, legacy behaviour.
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "armeabi-v7a|compiler": ":cc-compiler-armeabi-v7a",
+        "x64_windows|msvc-cl": ":cc-compiler-x64_windows",
+        "x64_windows|msys-gcc": ":cc-compiler-x64_windows_msys",
+        "x64_windows|mingw-gcc": ":cc-compiler-x64_windows_mingw",
+        "x64_windows_msys": ":cc-compiler-x64_windows_msys",
+        "x64_windows": ":cc-compiler-x64_windows",
+        "armeabi-v7a": ":cc-compiler-armeabi-v7a",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-x64_windows_msys",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "msys_x64",
+)
+
+toolchain(
+    name = "cc-toolchain-x64_windows_msys",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:windows",
+        "@bazel_tools//tools/cpp:msys",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:windows",
+    ],
+    toolchain = ":cc-compiler-x64_windows_msys",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain(
+    name = "cc-compiler-x64_windows_mingw",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "x64_windows",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+    toolchain_identifier = "msys_x64_mingw",
+)
+
+toolchain(
+    name = "cc-toolchain-x64_windows_mingw",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:windows",
+        "@bazel_tools//tools/cpp:mingw",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:windows",
+    ],
+    toolchain = ":cc-compiler-x64_windows_mingw",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain(
+    name = "cc-compiler-x64_windows",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "x64_windows",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "msvc_x64",
+)
+
+toolchain(
+    name = "cc-toolchain-x64_windows",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:windows",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:windows",
+    ],
+    toolchain = ":cc-compiler-x64_windows",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain(
+    name = "cc-compiler-armeabi-v7a",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "stub_armeabi-v7a",
+)
+
+toolchain(
+    name = "cc-toolchain-armeabi-v7a",
+    exec_compatible_with = [
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:arm",
+        "@bazel_tools//platforms:android",
+    ],
+    toolchain = ":cc-compiler-armeabi-v7a",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
diff --git a/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL b/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL
new file mode 100644
index 0000000000000000000000000000000000000000..38a80c22da32de50a98b78da6e157db936d03040
--- /dev/null
+++ b/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL
@@ -0,0 +1,1176 @@
+# Copyright 2016 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+major_version: "local"
+minor_version: ""
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+toolchain {
+  abi_version: "armeabi-v7a"
+  abi_libc_version: "armeabi-v7a"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "armeabi-v7a"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  target_libc: "armeabi-v7a"
+  target_cpu: "armeabi-v7a"
+  target_system_name: "armeabi-v7a"
+  toolchain_identifier: "stub_armeabi-v7a"
+
+  tool_path { name: "ar" path: "/bin/false" }
+  tool_path { name: "compat-ld" path: "/bin/false" }
+  tool_path { name: "cpp" path: "/bin/false" }
+  tool_path { name: "dwp" path: "/bin/false" }
+  tool_path { name: "gcc" path: "/bin/false" }
+  tool_path { name: "gcov" path: "/bin/false" }
+  tool_path { name: "ld" path: "/bin/false" }
+
+  tool_path { name: "nm" path: "/bin/false" }
+  tool_path { name: "objcopy" path: "/bin/false" }
+  tool_path { name: "objdump" path: "/bin/false" }
+  tool_path { name: "strip" path: "/bin/false" }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "msys_x64"
+   abi_version: "local"
+   abi_libc_version: "local"
+   builtin_sysroot: ""
+   compiler: "msys-gcc"
+   host_system_name: "local"
+   needsPic: false
+   target_libc: "msys"
+   target_cpu: "x64_windows"
+   target_system_name: "local"
+   tool_path { name: "ar" path: "c:/tools/msys64/usr/bin/ar" }
+   tool_path { name: "compat-ld" path: "c:/tools/msys64/usr/bin/ld" }
+   tool_path { name: "cpp" path: "c:/tools/msys64/usr/bin/cpp" }
+   tool_path { name: "dwp" path: "c:/tools/msys64/usr/bin/dwp" }
+   tool_path { name: "gcc" path: "c:/tools/msys64/usr/bin/gcc" }
+   artifact_name_pattern { category_name: "executable" prefix: "" extension: ".exe"}
+   cxx_flag: "-std=gnu++0x"
+   linker_flag: "-lstdc++"
+   cxx_builtin_include_directory: "c:/tools/msys64/usr/"
+   tool_path { name: "gcov" path: "c:/tools/msys64/usr/bin/gcov" }
+   tool_path { name: "ld" path: "c:/tools/msys64/usr/bin/ld" }
+   tool_path { name: "nm" path: "c:/tools/msys64/usr/bin/nm" }
+   tool_path { name: "objcopy" path: "c:/tools/msys64/usr/bin/objcopy" }
+   objcopy_embed_flag: "-I"
+   objcopy_embed_flag: "binary"
+   tool_path { name: "objdump" path: "c:/tools/msys64/usr/bin/objdump" }
+   tool_path { name: "strip" path: "c:/tools/msys64/usr/bin/strip" }   feature { name: "targets_windows" implies: "copy_dynamic_libraries_to_binary" enabled: true }   feature { name: "copy_dynamic_libraries_to_binary" }
+
+  compilation_mode_flags {
+    mode: DBG
+
+  }
+  compilation_mode_flags {
+    mode: OPT
+
+  }
+  linking_mode_flags { mode: DYNAMIC }
+
+
+
+  feature {
+    name: 'fdo_optimize'
+    provides: 'profile'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      expand_if_all_available: 'fdo_profile_path'
+      flag_group {
+        flag: '-fprofile-use=%{fdo_profile_path}'
+        flag: '-fprofile-correction',
+      }
+    }
+  }
+}
+
+toolchain {
+  toolchain_identifier: "msys_x64_mingw"
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "mingw-gcc"
+  host_system_name: "local"
+  needsPic: false
+  target_libc: "mingw"
+  target_cpu: "x64_windows"
+  target_system_name: "local"
+
+  artifact_name_pattern {
+     category_name: 'executable'
+     prefix: ''
+     extension: '.exe'
+  }
+
+   tool_path { name: "ar" path: "c:/tools/msys64/mingw64/bin/ar" }
+   tool_path { name: "compat-ld" path: "c:/tools/msys64/mingw64/bin/ld" }
+   tool_path { name: "cpp" path: "c:/tools/msys64/mingw64/bin/cpp" }
+   tool_path { name: "dwp" path: "c:/tools/msys64/mingw64/bin/dwp" }
+   tool_path { name: "gcc" path: "c:/tools/msys64/mingw64/bin/gcc" }
+   artifact_name_pattern { category_name: "executable" prefix: "" extension: ".exe"}
+   cxx_flag: "-std=gnu++0x"
+   linker_flag: "-lstdc++"
+   cxx_builtin_include_directory: "c:/tools/msys64/mingw64/"
+   tool_path { name: "gcov" path: "c:/tools/msys64/mingw64/bin/gcov" }
+   tool_path { name: "ld" path: "c:/tools/msys64/mingw64/bin/ld" }
+   tool_path { name: "nm" path: "c:/tools/msys64/mingw64/bin/nm" }
+   tool_path { name: "objcopy" path: "c:/tools/msys64/mingw64/bin/objcopy" }
+   objcopy_embed_flag: "-I"
+   objcopy_embed_flag: "binary"
+   tool_path { name: "objdump" path: "c:/tools/msys64/mingw64/bin/objdump" }
+   tool_path { name: "strip" path: "c:/tools/msys64/mingw64/bin/strip" }   feature { name: "targets_windows" implies: "copy_dynamic_libraries_to_binary" enabled: true }   feature { name: "copy_dynamic_libraries_to_binary" }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "msvc_x64"
+  # This is a workaround for https://github.com/bazelbuild/bazel/issues/5087.
+  cxx_builtin_include_directory: "C:\\botcode\\w"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+  default_python_version: "python2.7"
+
+cxx_builtin_include_directory: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\INCLUDE"
+cxx_builtin_include_directory: "C:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.10240.0\\ucrt"
+cxx_builtin_include_directory: "C:\\Program Files (x86)\\Windows Kits\\8.1\\include\\shared"
+cxx_builtin_include_directory: "C:\\Program Files (x86)\\Windows Kits\\8.1\\include\\um"
+cxx_builtin_include_directory: "C:\\Program Files (x86)\\Windows Kits\\8.1\\include\\winrt"
+
+  tool_path {
+    name: "ar"
+    path: "C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/lib.exe"
+  }
+  tool_path {
+    name: "ml"
+    path: "C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/ml64.exe"
+  }
+  tool_path {
+    name: "cpp"
+    path: "C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/cl.exe"
+  }
+  tool_path {
+    name: "gcc"
+    path: "C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/cl.exe"
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: "C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/link.exe"
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_gold_linker: false
+  supports_start_end_lib: false
+  supports_interface_shared_objects: true
+  supports_incremental_linker: false
+  supports_normalizing_ar: true
+  needsPic: false
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  artifact_name_pattern {
+     category_name: 'object_file'
+     prefix: ''
+     extension: '.obj'
+  }
+
+  artifact_name_pattern {
+     category_name: 'static_library'
+     prefix: ''
+     extension: '.lib'
+  }
+
+  artifact_name_pattern {
+     category_name: 'alwayslink_static_library'
+     prefix: ''
+     extension: '.lo.lib'
+  }
+
+  artifact_name_pattern {
+     category_name: 'executable'
+     prefix: ''
+     extension: '.exe'
+  }
+
+  artifact_name_pattern {
+     category_name: 'dynamic_library'
+     prefix: ''
+     extension: '.dll'
+  }
+
+  artifact_name_pattern {
+     category_name: 'interface_library'
+     prefix: ''
+     extension: '.if.lib'
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/ml64.exe'
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/ml64.exe'
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/cl.exe'
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/cl.exe'
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/link.exe'
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/link.exe'
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/link.exe'
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/VC/bin/amd64/lib.exe'
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\BIN\\amd64;C:\\Windows\\Microsoft.NET\\Framework64\\v4.0.30319;C:\\Windows\\Microsoft.NET\\Framework64\\;C:\\Program Files (x86)\\Windows Kits\\8.1\\bin\\x64;C:\\Program Files (x86)\\Windows Kits\\8.1\\bin\\x86;;C:\\Windows\\system32"
+      }
+      env_entry {
+        key: "TMP"
+        value: "C:\\Users\\ContainerAdministrator\\AppData\\Local\\Temp"
+      }
+      env_entry {
+        key: "TEMP"
+        value: "C:\\Users\\ContainerAdministrator\\AppData\\Local\\Temp"
+      }
+    }
+    implies: 'msvc_compile_env'
+    implies: 'msvc_link_env'
+  }
+
+  feature {
+    name: "msvc_compile_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      env_entry {
+        key: "INCLUDE"
+        value: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\INCLUDE;C:\\Program Files (x86)\\Windows Kits\\10\\include\\10.0.10240.0\\ucrt;C:\\Program Files (x86)\\Windows Kits\\8.1\\include\\shared;C:\\Program Files (x86)\\Windows Kits\\8.1\\include\\um;C:\\Program Files (x86)\\Windows Kits\\8.1\\include\\winrt;"
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_link_env"
+    env_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "LIB"
+        value: "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\LIB\\amd64;C:\\Program Files (x86)\\Windows Kits\\10\\lib\\10.0.10240.0\\ucrt\\x64;C:\\Program Files (x86)\\Windows Kits\\8.1\\lib\\winv6.3\\um\\x64;"
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FULL"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FASTLINK"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2" # Implies /Og /Oi /Ot /Oy /Ob2 /Gs /GF /Gy
+      }
+    }
+    implies: 'frame_pointer'
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  # Must come after /O1, /O2 and /Ox.
+  feature {
+    name: "frame_pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "/Oy-"
+      }
+    }
+  }
+
+  # Remove assert/DCHECKs in opt mode.
+  # You can have them back with --features=-disable_assertions.
+  feature {
+    name: 'disable_assertions'
+    enabled: true
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        # TODO: detect clang on Windows and use "-Wno-builtin-macro-redefined"
+        flag: "/wd4117" # Trying to define or undefine a predefined macro
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: 'treat_warnings_as_errors'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/WX"
+      }
+    }
+  }
+
+  # Trade slower build time for smaller binary
+  feature {
+    name: 'smaller_binary'
+    enabled: true
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: "/Gy" # Enable function-level linking (-ffunction-sections)
+        flag: "/Gw" # Optimize global data (-fdata-sections)
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library',
+      action: 'c++-link-nodeps-dynamic-library'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: '/OPT:ICF' # Fold identical functions
+        flag: '/OPT:REF' # Eliminate unreferenced functions and data
+      }
+    }
+  }
+
+  # Suppress warnings that most users do not care
+  feature {
+    name: 'ignore_noisy_warnings'
+    enabled: true
+    flag_set {
+      action: 'c++-link-static-library'
+      flag_group {
+        # Suppress 'object file does not define any public symbols' warning
+        flag: '/ignore:4221'
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
+
diff --git a/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl b/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..45c0285d232806672e93cb6d9b860b2693e75d3d
--- /dev/null
+++ b/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
@@ -0,0 +1,23 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2017 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Skylark rule that stubs a toolchain."""
+
+def _dummy_toolchain_impl(ctx):
+    ctx = ctx  # unused argument
+    toolchain = platform_common.ToolchainInfo()
+    return [toolchain]
+
+dummy_toolchain = rule(_dummy_toolchain_impl, attrs = {})
diff --git a/third_party/toolchains/preconfig/win_1803/py36/BUILD b/third_party/toolchains/preconfig/win_1803/py36/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..7b2e84bb38e11173ba20807cc072810e7b86b678
--- /dev/null
+++ b/third_party/toolchains/preconfig/win_1803/py36/BUILD
@@ -0,0 +1,191 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+# See https://docs.python.org/3/extending/windows.html
+cc_import(
+    name = "python_lib",
+    interface_library = select({
+        ":windows": ":python_import_lib",
+        # A placeholder for Unix platforms which makes --no_build happy.
+        "//conditions:default": "not-existing.lib",
+    }),
+    system_provided = 1,
+)
+
+cc_library(
+    name = "python_headers",
+    hdrs = [":python_include"],
+    includes = ["python_include"],
+    deps = select({
+        ":windows": [":python_lib"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "numpy_headers",
+    hdrs = [":numpy_include"],
+    includes = ["numpy_include"],
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "python_include",
+    outs = [
+        "python_include/Python-ast.h",
+        "python_include/Python.h",
+        "python_include/abstract.h",
+        "python_include/accu.h",
+        "python_include/asdl.h",
+        "python_include/ast.h",
+        "python_include/bitset.h",
+        "python_include/bltinmodule.h",
+        "python_include/boolobject.h",
+        "python_include/bytearrayobject.h",
+        "python_include/bytes_methods.h",
+        "python_include/bytesobject.h",
+        "python_include/cellobject.h",
+        "python_include/ceval.h",
+        "python_include/classobject.h",
+        "python_include/code.h",
+        "python_include/codecs.h",
+        "python_include/compile.h",
+        "python_include/complexobject.h",
+        "python_include/datetime.h",
+        "python_include/descrobject.h",
+        "python_include/dictobject.h",
+        "python_include/dtoa.h",
+        "python_include/dynamic_annotations.h",
+        "python_include/enumobject.h",
+        "python_include/errcode.h",
+        "python_include/eval.h",
+        "python_include/fileobject.h",
+        "python_include/fileutils.h",
+        "python_include/floatobject.h",
+        "python_include/frameobject.h",
+        "python_include/funcobject.h",
+        "python_include/genobject.h",
+        "python_include/graminit.h",
+        "python_include/grammar.h",
+        "python_include/import.h",
+        "python_include/intrcheck.h",
+        "python_include/iterobject.h",
+        "python_include/listobject.h",
+        "python_include/longintrepr.h",
+        "python_include/longobject.h",
+        "python_include/marshal.h",
+        "python_include/memoryobject.h",
+        "python_include/metagrammar.h",
+        "python_include/methodobject.h",
+        "python_include/modsupport.h",
+        "python_include/moduleobject.h",
+        "python_include/namespaceobject.h",
+        "python_include/node.h",
+        "python_include/object.h",
+        "python_include/objimpl.h",
+        "python_include/odictobject.h",
+        "python_include/opcode.h",
+        "python_include/osdefs.h",
+        "python_include/osmodule.h",
+        "python_include/parsetok.h",
+        "python_include/patchlevel.h",
+        "python_include/pgen.h",
+        "python_include/pgenheaders.h",
+        "python_include/py_curses.h",
+        "python_include/pyarena.h",
+        "python_include/pyatomic.h",
+        "python_include/pycapsule.h",
+        "python_include/pyconfig.h",
+        "python_include/pyctype.h",
+        "python_include/pydebug.h",
+        "python_include/pydtrace.h",
+        "python_include/pyerrors.h",
+        "python_include/pyexpat.h",
+        "python_include/pyfpe.h",
+        "python_include/pygetopt.h",
+        "python_include/pyhash.h",
+        "python_include/pylifecycle.h",
+        "python_include/pymacconfig.h",
+        "python_include/pymacro.h",
+        "python_include/pymath.h",
+        "python_include/pymem.h",
+        "python_include/pyport.h",
+        "python_include/pystate.h",
+        "python_include/pystrcmp.h",
+        "python_include/pystrhex.h",
+        "python_include/pystrtod.h",
+        "python_include/pythonrun.h",
+        "python_include/pythread.h",
+        "python_include/pytime.h",
+        "python_include/rangeobject.h",
+        "python_include/setobject.h",
+        "python_include/sliceobject.h",
+        "python_include/structmember.h",
+        "python_include/structseq.h",
+        "python_include/symtable.h",
+        "python_include/sysmodule.h",
+        "python_include/token.h",
+        "python_include/traceback.h",
+        "python_include/tupleobject.h",
+        "python_include/typeslots.h",
+        "python_include/ucnhash.h",
+        "python_include/unicodeobject.h",
+        "python_include/warnings.h",
+        "python_include/weakrefobject.h",
+    ],
+    cmd = """
+cp -f "C:/Python36/include/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "C:/Python36/include/Python.h" "$(@D)/python_include/Python.h" && cp -f "C:/Python36/include/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "C:/Python36/include/accu.h" "$(@D)/python_include/accu.h" && cp -f "C:/Python36/include/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "C:/Python36/include/ast.h" "$(@D)/python_include/ast.h" && cp -f "C:/Python36/include/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "C:/Python36/include/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "C:/Python36/include/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "C:/Python36/include/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "C:/Python36/include/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "C:/Python36/include/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "C:/Python36/include/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "C:/Python36/include/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "C:/Python36/include/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "C:/Python36/include/code.h" "$(@D)/python_include/code.h" && cp -f "C:/Python36/include/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "C:/Python36/include/compile.h" "$(@D)/python_include/compile.h" && cp -f "C:/Python36/include/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "C:/Python36/include/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "C:/Python36/include/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "C:/Python36/include/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "C:/Python36/include/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "C:/Python36/include/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "C:/Python36/include/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "C:/Python36/include/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "C:/Python36/include/eval.h" "$(@D)/python_include/eval.h" && cp -f "C:/Python36/include/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "C:/Python36/include/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "C:/Python36/include/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "C:/Python36/include/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "C:/Python36/include/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "C:/Python36/include/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "C:/Python36/include/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "C:/Python36/include/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "C:/Python36/include/import.h" "$(@D)/python_include/import.h" && cp -f "C:/Python36/include/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "C:/Python36/include/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "C:/Python36/include/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "C:/Python36/include/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "C:/Python36/include/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "C:/Python36/include/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "C:/Python36/include/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "C:/Python36/include/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "C:/Python36/include/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "C:/Python36/include/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "C:/Python36/include/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "C:/Python36/include/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "C:/Python36/include/node.h" "$(@D)/python_include/node.h" && cp -f "C:/Python36/include/object.h" "$(@D)/python_include/object.h" && cp -f "C:/Python36/include/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "C:/Python36/include/odictobject.h" "$(@D)/python_include/odictobject.h" && cp -f "C:/Python36/include/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "C:/Python36/include/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "C:/Python36/include/osmodule.h" "$(@D)/python_include/osmodule.h" && cp -f "C:/Python36/include/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "C:/Python36/include/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "C:/Python36/include/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "C:/Python36/include/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "C:/Python36/include/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "C:/Python36/include/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "C:/Python36/include/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "C:/Python36/include/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "C:/Python36/include/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "C:/Python36/include/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "C:/Python36/include/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "C:/Python36/include/pydtrace.h" "$(@D)/python_include/pydtrace.h" && cp -f "C:/Python36/include/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "C:/Python36/include/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "C:/Python36/include/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "C:/Python36/include/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "C:/Python36/include/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "C:/Python36/include/pylifecycle.h" "$(@D)/python_include/pylifecycle.h" && cp -f "C:/Python36/include/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "C:/Python36/include/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "C:/Python36/include/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "C:/Python36/include/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "C:/Python36/include/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "C:/Python36/include/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "C:/Python36/include/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "C:/Python36/include/pystrhex.h" "$(@D)/python_include/pystrhex.h" && cp -f "C:/Python36/include/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "C:/Python36/include/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "C:/Python36/include/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "C:/Python36/include/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "C:/Python36/include/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "C:/Python36/include/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "C:/Python36/include/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "C:/Python36/include/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "C:/Python36/include/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "C:/Python36/include/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "C:/Python36/include/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "C:/Python36/include/token.h" "$(@D)/python_include/token.h" && cp -f "C:/Python36/include/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "C:/Python36/include/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "C:/Python36/include/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "C:/Python36/include/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "C:/Python36/include/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "C:/Python36/include/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "C:/Python36/include/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
+   """,
+)
+
+genrule(
+    name = "numpy_include",
+    outs = [
+        "numpy_include/numpy/__multiarray_api.h",
+        "numpy_include/numpy/__ufunc_api.h",
+        "numpy_include/numpy/_neighborhood_iterator_imp.h",
+        "numpy_include/numpy/_numpyconfig.h",
+        "numpy_include/numpy/arrayobject.h",
+        "numpy_include/numpy/arrayscalars.h",
+        "numpy_include/numpy/halffloat.h",
+        "numpy_include/numpy/multiarray_api.txt",
+        "numpy_include/numpy/ndarrayobject.h",
+        "numpy_include/numpy/ndarraytypes.h",
+        "numpy_include/numpy/noprefix.h",
+        "numpy_include/numpy/npy_1_7_deprecated_api.h",
+        "numpy_include/numpy/npy_3kcompat.h",
+        "numpy_include/numpy/npy_common.h",
+        "numpy_include/numpy/npy_cpu.h",
+        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/npy_interrupt.h",
+        "numpy_include/numpy/npy_math.h",
+        "numpy_include/numpy/npy_no_deprecated_api.h",
+        "numpy_include/numpy/npy_os.h",
+        "numpy_include/numpy/numpyconfig.h",
+        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/oldnumeric.h",
+        "numpy_include/numpy/ufunc_api.txt",
+        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/utils.h",
+    ],
+    cmd = """
+cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "C:/Python36/lib/site-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
+   """,
+)
+
+genrule(
+    name = "python_import_lib",
+    outs = [
+        "python36.lib",
+    ],
+    cmd = """
+cp -f "C:/Python36/libs/python36.lib" "$(@D)/python36.lib"
+   """,
+)
diff --git a/tools/bazel.rc b/tools/bazel.rc
new file mode 100644
index 0000000000000000000000000000000000000000..1fdf51f53e29c7111cf89c016400b710051cf9c6
--- /dev/null
+++ b/tools/bazel.rc
@@ -0,0 +1,95 @@
+# Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
+# target CPU to build transient dependencies correctly. See
+# https://docs.bazel.build/versions/master/user-manual.html#flag--fat_apk_cpu
+build:android --crosstool_top=//external:android/crosstool
+build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
+build:android_arm --config=android
+build:android_arm --cpu=armeabi-v7a
+build:android_arm --fat_apk_cpu=armeabi-v7a
+build:android_arm64 --config=android
+build:android_arm64 --cpu=arm64-v8a
+build:android_arm64 --fat_apk_cpu=arm64-v8a
+
+# Config to use a mostly-static build and disable modular op registration
+# support (this will revert to loading TensorFlow with RTLD_GLOBAL in Python).
+# By default, TensorFlow will build with a dependence on
+# //tensorflow:libtensorflow_framework.so.
+build:monolithic --define framework_shared_object=false
+
+# For projects which use TensorFlow as part of a Bazel build process, putting
+# nothing in a bazelrc will default to a monolithic build. The following line
+# opts in to modular op registration support by default.
+build --define framework_shared_object=true
+
+# Please note that MKL on MacOS or windows is still not supported.
+# If you would like to use a local MKL instead of downloading, please set the
+# environment variable "TF_MKL_ROOT" every time before build.
+build:mkl --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl -c opt
+
+# This config option is used to enable MKL-DNN open source library only,
+# without depending on MKL binary version.
+build:mkl_open_source_only --define=build_with_mkl_dnn_only=true
+build:mkl_open_source_only --define=build_with_mkl=true --define=enable_mkl=true
+
+build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
+build:download_clang --define=using_clang=true
+# Instruct clang to use LLD for linking.
+# This only works with GPU builds currently, since Bazel sets -B/usr/bin in
+# auto-generated CPU crosstool, forcing /usr/bin/ld.lld to be preferred over
+# the downloaded one.
+build:download_clang_use_lld --linkopt='-fuse-ld=lld'
+
+build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
+build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true
+
+build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain
+build:rocm --define=using_rocm=true --define=using_rocm_hipcc=true
+
+build:cuda_clang --crosstool_top=@local_config_cuda//crosstool:toolchain
+build:cuda_clang --define=using_cuda=true --define=using_cuda_clang=true --define=using_clang=true
+
+build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
+build:sycl --define=using_sycl=true --define=using_trisycl=false
+
+build:sycl_nodouble --crosstool_top=@local_config_sycl//crosstool:toolchain
+build:sycl_nodouble --define=using_sycl=true --cxxopt -DTENSORFLOW_SYCL_NO_DOUBLE
+
+build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
+build:sycl_asan --define=using_sycl=true --define=using_trisycl=false --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
+
+build:sycl_trisycl --crosstool_top=@local_config_sycl//crosstool:toolchain
+build:sycl_trisycl --define=using_sycl=true --define=using_trisycl=true
+
+# Options extracted from configure script
+build:gdr --define=with_gdr_support=true
+build:ngraph --define=with_ngraph_support=true
+build:verbs --define=with_verbs_support=true
+
+# Options to disable default on features
+build:noaws --define=no_aws_support=true
+build:nogcp --define=no_gcp_support=true
+build:nohdfs --define=no_hdfs_support=true
+build:nokafka --define=no_kafka_support=true
+build:noignite --define=no_ignite_support=true
+build:nonccl --define=no_nccl_support=true
+
+build --define=use_fast_cpp_protos=true
+build --define=allow_oversize_protos=true
+build --define=grpc_no_ares=true
+
+build --spawn_strategy=standalone
+build --genrule_strategy=standalone
+build -c opt
+
+# Other build flags.
+build --define=grpc_no_ares=true
+
+# Modular TF build options
+build:dynamic_kernels --define=dynamic_loaded_kernels=true
+build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS
+
+# Default paths for TF_SYSTEM_LIBS
+build --define=PREFIX=/usr
+build --define=LIBDIR=$(PREFIX)/lib
+build --define=INCLUDEDIR=$(PREFIX)/include